diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 0000000..e69de29
diff --git a/cache.json b/cache.json
new file mode 100644
index 0000000..35695ae
--- /dev/null
+++ b/cache.json
@@ -0,0 +1 @@
+{"2024-03-18T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.11708v1","updated":"2024-03-18T12:12:45Z","published":"2024-03-18T12:12:45Z","title":"Implicit Discriminative Knowledge Learning for Visible-Infrared Person\n Re-Identification","summary":" Visible-Infrared Person Re-identification (VI-ReID) is a challenging\ncross-modal pedestrian retrieval task, due to significant intra-class\nvariations and cross-modal discrepancies among different cameras. Existing\nworks mainly focus on embedding images of different modalities into a unified\nspace to mine modality-shared features. They only seek distinctive information\nwithin these shared features, while ignoring the identity-aware useful\ninformation that is implicit in the modality-specific features. To address this\nissue, we propose a novel Implicit Discriminative Knowledge Learning (IDKL)\nnetwork to uncover and leverage the implicit discriminative information\ncontained within the modality-specific features. First, we extract modality-specific and\nmodality-shared features using a novel dual-stream network. Then, the\nmodality-specific features undergo purification to reduce their modality style\ndiscrepancies while preserving identity-aware discriminative knowledge.\nSubsequently, this kind of implicit knowledge is distilled into the\nmodality-shared feature to enhance its distinctiveness. Finally, an alignment\nloss is proposed to minimize modality discrepancy on enhanced modality-shared\nfeatures. Extensive experiments on multiple public datasets demonstrate the\nsuperiority of IDKL network over the state-of-the-art methods. Code is\navailable at https://github.com/1KK077/IDKL.\n","authors":["Kaijie Ren","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11708v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.09401v2","updated":"2024-03-18T12:08:01Z","published":"2024-03-14T13:52:03Z","title":"Unsupervised Modality-Transferable Video Highlight Detection with\n Representation Activation Sequence Learning","summary":" Identifying highlight moments of raw video materials is crucial for improving\nthe efficiency of editing videos that are pervasive on internet platforms.\nHowever, the extensive work of manually labeling footage has created obstacles\nto applying supervised methods to videos of unseen categories. The absence of\nan audio modality that contains valuable cues for highlight detection in many\nvideos also makes it difficult to use multimodal strategies. In this paper, we\npropose a novel model with cross-modal perception for unsupervised highlight\ndetection. The proposed model learns representations with visual-audio level\nsemantics from image-audio pair data via a self-reconstruction task. To achieve\nunsupervised highlight detection, we investigate the latent representations of\nthe network and propose the representation activation sequence learning (RASL)\nmodule with k-point contrastive learning to learn significant representation\nactivations. To connect the visual modality with the audio modality, we use the\nsymmetric contrastive learning (SCL) module to learn the paired visual and\naudio representations. Furthermore, an auxiliary task of masked feature vector\nsequence (FVS) reconstruction is simultaneously conducted during pretraining\nfor representation enhancement. 
During inference, the cross-modal pretrained\nmodel can generate representations with paired visual-audio semantics given\nonly the visual modality. The RASL module is used to output the highlight\nscores. The experimental results show that the proposed framework achieves\nsuperior performance compared to other state-of-the-art approaches.\n","authors":["Tingtian Li","Zixun Sun","Xinyu Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.09401v2.pdf","comment":"Accepted by IEEE Transactions on Image Processing, 2024"},{"id":"http://arxiv.org/abs/2403.11703v1","updated":"2024-03-18T12:04:11Z","published":"2024-03-18T12:04:11Z","title":"LLaVA-UHD: an LMM Perceiving Any Aspect Ratio and High-Resolution Images","summary":" Visual encoding constitutes the basis of large multimodal models (LMMs) in\nunderstanding the visual world. Conventional LMMs process images in fixed sizes\nand limited resolutions, while recent explorations in this direction are\nlimited in adaptivity, efficiency, and even correctness. In this work, we first\ntake GPT-4V and LLaVA-1.5 as representative examples and expose systematic\nflaws rooted in their visual encoding strategy. To address the challenges, we\npresent LLaVA-UHD, a large multimodal model that can efficiently perceive\nimages in any aspect ratio and high resolution. LLaVA-UHD includes three key\ncomponents: (1) An image modularization strategy that divides native-resolution\nimages into smaller variable-sized slices for efficient and extensible\nencoding, (2) a compression module that further condenses image tokens from\nvisual encoders, and (3) a spatial schema to organize slice tokens for LLMs.\nComprehensive experiments show that LLaVA-UHD outperforms established LMMs\ntrained with 2-3 orders of magnitude more data on 9 benchmarks. Notably, our\nmodel built on LLaVA-1.5 336x336 supports 6 times larger (i.e., 672x1088)\nresolution images using only 94% inference computation, and achieves 6.4\naccuracy improvement on TextVQA. Moreover, the model can be efficiently trained\nin academic settings, within 23 hours on 8 A100 GPUs (vs. 26 hours of\nLLaVA-1.5). We make the data and code publicly available at\nhttps://github.com/thunlp/LLaVA-UHD.\n","authors":["Ruyi Xu","Yuan Yao","Zonghao Guo","Junbo Cui","Zanlin Ni","Chunjiang Ge","Tat-Seng Chua","Zhiyuan Liu","Maosong Sun","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2403.11703v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2403.11699v1","updated":"2024-03-18T11:56:32Z","published":"2024-03-18T11:56:32Z","title":"A Spatial-Temporal Progressive Fusion Network for Breast Lesion\n Segmentation in Ultrasound Videos","summary":" Ultrasound video-based breast lesion segmentation provides a valuable\nassistance in early breast lesion detection and treatment. However, existing\nworks mainly focus on lesion segmentation based on ultrasound breast images\nwhich usually can not be adapted well to obtain desirable results on ultrasound\nvideos. The main challenge for ultrasound video-based breast lesion\nsegmentation is how to exploit the lesion cues of both intra-frame and\ninter-frame simultaneously. To address this problem, we propose a novel\nSpatial-Temporal Progressive Fusion Network (STPFNet) for video based breast\nlesion segmentation problem. The main aspects of the proposed STPFNet are\nthreefold. 
First, we propose to adopt a unified network architecture to capture\nboth spatial dependences within each ultrasound frame and temporal correlations\nbetween different frames together for ultrasound data representation. Second,\nwe propose a new fusion module, termed Multi-Scale Feature Fusion (MSFF), to\nfuse spatial and temporal cues together for lesion detection. MSFF can help to\ndetermine the boundary contour of lesion region to overcome the issue of lesion\nboundary blurring. Third, we propose to exploit the segmentation result of\nprevious frame as the prior knowledge to suppress the noisy background and\nlearn more robust representation. In particular, we introduce a new publicly\navailable ultrasound video breast lesion segmentation dataset, termed UVBLS200,\nwhich is specifically dedicated to breast lesion segmentation. It contains 200\nvideos, including 80 videos of benign lesions and 120 videos of malignant\nlesions. Experiments on the proposed dataset demonstrate that the proposed\nSTPFNet achieves better breast lesion detection performance than\nstate-of-the-art methods.\n","authors":["Zhengzheng Tu","Zigang Zhu","Yayang Duan","Bo Jiang","Qishun Wang","Chaoxue Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11697v1","updated":"2024-03-18T11:54:35Z","published":"2024-03-18T11:54:35Z","title":"Urban Scene Diffusion through Semantic Occupancy Map","summary":" Generating unbounded 3D scenes is crucial for large-scale scene understanding\nand simulation. Urban scenes, unlike natural landscapes, consist of various\ncomplex man-made objects and structures such as roads, traffic signs, vehicles,\nand buildings. To create a realistic and detailed urban scene, it is crucial to\naccurately represent the geometry and semantics of the underlying objects,\ngoing beyond their visual appearance. In this work, we propose UrbanDiffusion,\na 3D diffusion model that is conditioned on a Bird's-Eye View (BEV) map and\ngenerates an urban scene with geometry and semantics in the form of semantic\noccupancy map. Our model introduces a novel paradigm that learns the data\ndistribution of scene-level structures within a latent space and further\nenables the expansion of the synthesized scene into an arbitrary scale. After\ntraining on real-world driving datasets, our model can generate a wide range of\ndiverse urban scenes given the BEV maps from the held-out set and also\ngeneralize to the synthesized maps from a driving simulator. We further\ndemonstrate its application to scene image synthesis with a pretrained image\ngenerator as a prior.\n","authors":["Junge Zhang","Qihang Zhang","Li Zhang","Ramana Rao Kompella","Gaowen Liu","Bolei Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.11697v1.pdf","comment":"The project website is https://metadriverse.github.io/urbandiff/"},{"id":"http://arxiv.org/abs/2403.11695v1","updated":"2024-03-18T11:48:41Z","published":"2024-03-18T11:48:41Z","title":"TrajectoryNAS: A Neural Architecture Search for Trajectory Prediction","summary":" Autonomous driving systems are a rapidly evolving technology that enables\ndriverless car production. Trajectory prediction is a critical component of\nautonomous driving systems, enabling cars to anticipate the movements of\nsurrounding objects for safe navigation. Trajectory prediction using Lidar\npoint-cloud data performs better than 2D images due to providing 3D\ninformation. However, processing point-cloud data is more complicated and\ntime-consuming than 2D images. 
Hence, state-of-the-art 3D trajectory\npredictions using point-cloud data suffer from slow and erroneous predictions.\nThis paper introduces TrajectoryNAS, a pioneering method that focuses on\nutilizing point cloud data for trajectory prediction. By leveraging Neural\nArchitecture Search (NAS), TrajectoryNAS automates the design of trajectory\nprediction models, encompassing object detection, tracking, and forecasting in\na cohesive manner. This approach not only addresses the complex\ninterdependencies among these tasks but also emphasizes the importance of\naccuracy and efficiency in trajectory modeling. Through empirical studies,\nTrajectoryNAS demonstrates its effectiveness in enhancing the performance of\nautonomous driving systems, marking a significant advancement in the\nfield. Experimental results reveal that TrajectoryNAS yields a minimum of 4.8\nhigher accuracy and 1.1* lower latency over competing methods on the NuScenes\ndataset.\n","authors":["Ali Asghar Sharifi","Ali Zoljodi","Masoud Daneshtalab"],"pdf_url":"https://arxiv.org/pdf/2403.11695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11694v1","updated":"2024-03-18T11:48:20Z","published":"2024-03-18T11:48:20Z","title":"Object Segmentation-Assisted Inter Prediction for Versatile Video Coding","summary":" In modern video coding standards, block-based inter prediction is widely\nadopted, which brings high compression efficiency. However, in natural videos,\nthere are usually multiple moving objects of arbitrary shapes, resulting in\ncomplex motion fields that are difficult to compactly represent. This problem\nhas been tackled by more flexible block partitioning methods in the Versatile\nVideo Coding (VVC) standard, but the more flexible partitions require more\noverhead bits to signal and still cannot be made arbitrary shaped. To address\nthis limitation, we propose an object segmentation-assisted inter prediction\nmethod (SAIP), where objects in the reference frames are segmented by some\nadvanced technologies. With a proper indication, the object segmentation mask\nis translated from the reference frame to the current frame as the\narbitrary-shaped partition of different regions without any extra signal. Using\nthe segmentation mask, motion compensation is separately performed for\ndifferent regions, achieving higher prediction accuracy. The segmentation mask\nis further used to code the motion vectors of different regions more\nefficiently. Moreover, segmentation mask is considered in the joint\nrate-distortion optimization for motion estimation and partition estimation to\nderive the motion vector of different regions and partition more accurately.\nThe proposed method is implemented into the VVC reference software, VTM version\n12.0. Experimental results show that the proposed method achieves up to 1.98%,\n1.14%, 0.79%, and on average 0.82%, 0.49%, 0.37% BD-rate reduction for common\ntest sequences, under the Low-delay P, Low-delay B, and Random Access\nconfigurations, respectively.\n","authors":["Zhuoyuan Li","Zikun Yuan","Li Li","Dong Liu","Xiaohu Tang","Feng Wu"],"pdf_url":"https://arxiv.org/pdf/2403.11694v1.pdf","comment":"22 pages, 15 figures"},{"id":"http://arxiv.org/abs/2403.11691v1","updated":"2024-03-18T11:41:55Z","published":"2024-03-18T11:41:55Z","title":"TTT-KD: Test-Time Training for 3D Semantic Segmentation through\n Knowledge Distillation from Foundation Models","summary":" Test-Time Training (TTT) proposes to adapt a pre-trained network to changing\ndata distributions on-the-fly. 
In this work, we propose the first TTT method\nfor 3D semantic segmentation, TTT-KD, which models Knowledge Distillation (KD)\nfrom foundation models (e.g. DINOv2) as a self-supervised objective for\nadaptation to distribution shifts at test-time. Given access to paired\nimage-pointcloud (2D-3D) data, we first optimize a 3D segmentation backbone for\nthe main task of semantic segmentation using the pointclouds and the task of 2D\n$\to$ 3D KD by using an off-the-shelf 2D pre-trained foundation model. At\ntest-time, our TTT-KD updates the 3D segmentation backbone for each test\nsample, by using the self-supervised task of knowledge distillation, before\nperforming the final prediction. Extensive evaluations on multiple indoor and\noutdoor 3D segmentation benchmarks show the utility of TTT-KD, as it improves\nperformance for both in-distribution (ID) and out-of-distribution (OOD) test\ndatasets. We achieve a gain of up to 13% mIoU (7% on average) when the train\nand test distributions are similar and up to 45% (20% on average) when adapting\nto OOD test samples.\n","authors":["Lisa Weijler","Muhammad Jehanzeb Mirza","Leon Sick","Can Ekkazan","Pedro Hermosilla"],"pdf_url":"https://arxiv.org/pdf/2403.11691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11689v1","updated":"2024-03-18T11:38:47Z","published":"2024-03-18T11:38:47Z","title":"MoreStyle: Relax Low-frequency Constraint of Fourier-based Image\n Reconstruction in Generalizable Medical Image Segmentation","summary":" The task of single-source domain generalization (SDG) in medical image\nsegmentation is crucial due to frequent domain shifts in clinical image\ndatasets. To address the challenge of poor generalization across different\ndomains, we introduce a Plug-and-Play module for data augmentation called\nMoreStyle. MoreStyle diversifies image styles by relaxing low-frequency\nconstraints in Fourier space, guiding the image reconstruction network. With\nthe help of adversarial learning, MoreStyle further expands the style range and\npinpoints the most intricate style combinations within latent features. To\nhandle significant style variations, we introduce an uncertainty-weighted loss.\nThis loss emphasizes hard-to-classify pixels resulting only from style shifts\nwhile mitigating true hard-to-classify pixels in both MoreStyle-generated and\noriginal images. Extensive experiments on two widely used benchmarks\ndemonstrate that the proposed MoreStyle effectively helps to achieve good\ndomain generalization ability, and has the potential to further boost the\nperformance of some state-of-the-art SDG methods.\n","authors":["Haoyu Zhao","Wenhui Dong","Rui Yu","Zhou Zhao","Du Bo","Yongchao Xu"],"pdf_url":"https://arxiv.org/pdf/2403.11689v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.11681v1","updated":"2024-03-18T11:35:18Z","published":"2024-03-18T11:35:18Z","title":"MASSTAR: A Multi-Modal and Large-Scale Scene Dataset with a Versatile\n Toolchain for Surface Prediction and Completion","summary":" Surface prediction and completion have been widely studied in various\napplications. Recently, research in surface completion has evolved from small\nobjects to complex large-scale scenes. As a result, researchers have begun\nincreasing the volume of data and leveraging a greater variety of data\nmodalities including rendered RGB images, descriptive texts, depth images, etc,\nto enhance algorithm performance. 
However, existing datasets suffer from a\ndeficiency in the amounts of scene-level models along with the corresponding\nmulti-modal information. Therefore, a method to scale the datasets and generate\nmulti-modal information in them efficiently is essential. To bridge this\nresearch gap, we propose MASSTAR: a Multi-modal lArge-scale Scene dataset with\na verSatile Toolchain for surfAce pRediction and completion. We develop a\nversatile and efficient toolchain for processing the raw 3D data from the\nenvironments. It screens out a set of fine-grained scene models and generates\nthe corresponding multi-modal data. Utilizing the toolchain, we then generate\nan example dataset composed of over a thousand scene-level models with partial\nreal-world data added. We compare MASSTAR with the existing datasets, which\nvalidates its superiority: the ability to efficiently extract high-quality\nmodels from complex scenarios to expand the dataset. Additionally, several\nrepresentative surface completion algorithms are benchmarked on MASSTAR, which\nreveals that existing algorithms can hardly deal with scene-level completion.\nWe will release the source code of our toolchain and the dataset. For more\ndetails, please see our project page at https://sysu-star.github.io/MASSTAR.\n","authors":["Guiyong Zheng","Jinqi Jiang","Chen Feng","Shaojie Shen","Boyu Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.11681v1.pdf","comment":"Submitted to IROS2024. Code: https://github.com/SYSU-STAR/MASSTAR.\n Project Page: https://github.com/SYSU-STAR/MASSTAR"},{"id":"http://arxiv.org/abs/2403.11679v1","updated":"2024-03-18T11:31:03Z","published":"2024-03-18T11:31:03Z","title":"NEDS-SLAM: A Novel Neural Explicit Dense Semantic SLAM Framework using\n 3D Gaussian Splatting","summary":" We propose NEDS-SLAM, an Explicit Dense semantic SLAM system based on 3D\nGaussian representation, that enables robust 3D semantic mapping, accurate\ncamera tracking, and high-quality rendering in real-time. In the system, we\npropose a Spatially Consistent Feature Fusion model to reduce the effect of\nerroneous estimates from pre-trained segmentation head on semantic\nreconstruction, achieving robust 3D semantic Gaussian mapping. Additionally, we\nemploy a lightweight encoder-decoder to compress the high-dimensional semantic\nfeatures into a compact 3D Gaussian representation, mitigating the burden of\nexcessive memory consumption. Furthermore, we leverage the advantage of 3D\nGaussian splatting, which enables efficient and differentiable novel view\nrendering, and propose a Virtual Camera View Pruning method to eliminate\noutlier GS points, thereby effectively enhancing the quality of scene\nrepresentations. Our NEDS-SLAM method demonstrates competitive performance over\nexisting dense semantic SLAM methods in terms of mapping and tracking accuracy\non Replica and ScanNet datasets, while also showing excellent capabilities in\n3D dense semantic mapping.\n","authors":["Yiming Ji","Yang Liu","Guanghu Xie","Boyu Ma","Zongwu Xie"],"pdf_url":"https://arxiv.org/pdf/2403.11679v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11678v1","updated":"2024-03-18T11:29:43Z","published":"2024-03-18T11:29:43Z","title":"Exploring 3D-aware Latent Spaces for Efficiently Learning Numerous\n Scenes","summary":" We present a method enabling the scaling of NeRFs to learn a large number of\nsemantically-similar scenes. We combine two techniques to improve the required\ntraining time and memory cost per scene. 
First, we learn a 3D-aware latent\nspace in which we train Tri-Plane scene representations, hence reducing the\nresolution at which scenes are learned. Moreover, we present a way to share\ncommon information across scenes, hence allowing for a reduction of model\ncomplexity to learn a particular scene. Our method reduces effective per-scene\nmemory costs by 44% and per-scene time costs by 86% when training 1000 scenes.\nOur project page can be found at https://3da-ae.github.io .\n","authors":["Antoine Schnepf","Karim Kassab","Jean-Yves Franceschi","Laurent Caraffa","Flavian Vasile","Jeremie Mary","Andrew Comport","Valérie Gouet-Brunet"],"pdf_url":"https://arxiv.org/pdf/2403.11678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11675v1","updated":"2024-03-18T11:23:02Z","published":"2024-03-18T11:23:02Z","title":"Better (pseudo-)labels for semi-supervised instance segmentation","summary":" Despite the availability of large datasets for tasks like image\nclassification and image-text alignment, labeled data for more complex\nrecognition tasks, such as detection and segmentation, is less abundant. In\nparticular, for instance segmentation annotations are time-consuming to\nproduce, and the distribution of instances is often highly skewed across\nclasses. While semi-supervised teacher-student distillation methods show\npromise in leveraging vast amounts of unlabeled data, they suffer from\nmiscalibration, resulting in overconfidence in frequently represented classes\nand underconfidence in rarer ones. Additionally, these methods encounter\ndifficulties in efficiently learning from a limited set of examples. We\nintroduce a dual-strategy to enhance the teacher model's training process,\nsubstantially improving the performance on few-shot learning. Secondly, we\npropose a calibration correction mechanism that enables the student model\nto correct the teacher's calibration errors. Using our approach, we observed\nmarked improvements over a state-of-the-art supervised baseline performance on\nthe LVIS dataset, with an increase of 2.8% in average precision (AP) and 10.3%\ngain in AP for rare classes.\n","authors":["François Porcher","Camille Couprie","Marc Szafraniec","Jakob Verbeek"],"pdf_url":"https://arxiv.org/pdf/2403.11675v1.pdf","comment":"Appeared at the Practical ML for Low Resource Settings workshop at\n ICLR 2024"},{"id":"http://arxiv.org/abs/2403.11674v1","updated":"2024-03-18T11:21:52Z","published":"2024-03-18T11:21:52Z","title":"Towards Generalizing to Unseen Domains with Few Labels","summary":" We approach the challenge of addressing semi-supervised domain generalization\n(SSDG). Specifically, our aim is to obtain a model that learns\ndomain-generalizable features by leveraging a limited subset of labelled data\nalongside a substantially larger pool of unlabeled data. Existing domain\ngeneralization (DG) methods which are unable to exploit unlabeled data perform\npoorly compared to semi-supervised learning (SSL) methods under SSDG setting.\nNevertheless, SSL methods have considerable room for performance improvement\nwhen compared to fully-supervised DG training. To tackle this underexplored,\nyet highly practical problem of SSDG, we make the following core contributions.\nFirst, we propose a feature-based conformity technique that matches the\nposterior distributions from the feature space with the pseudo-label from the\nmodel's output space. 
Second, we develop a semantics alignment loss to learn\nsemantically-compatible representations by regularizing the semantic structure\nin the feature space. Our method is plug-and-play and can be readily integrated\nwith different SSL-based SSDG baselines without introducing any additional\nparameters. Extensive experimental results across five challenging DG\nbenchmarks with four strong SSL baselines suggest that our method provides\nconsistent and notable gains in two different SSDG settings.\n","authors":["Chamuditha Jayanga Galappaththige","Sanoojan Baliah","Malitha Gunawardhana","Muhammad Haris Khan"],"pdf_url":"https://arxiv.org/pdf/2403.11674v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11672v1","updated":"2024-03-18T11:20:11Z","published":"2024-03-18T11:20:11Z","title":"WIA-LD2ND: Wavelet-based Image Alignment for Self-supervised Low-Dose CT\n Denoising","summary":" In clinical examinations and diagnoses, low-dose computed tomography (LDCT)\nis crucial for minimizing health risks compared with normal-dose computed\ntomography (NDCT). However, reducing the radiation dose compromises the\nsignal-to-noise ratio, leading to degraded quality of CT images. To address\nthis, we analyze LDCT denoising task based on experimental results from the\nfrequency perspective, and then introduce a novel self-supervised CT image\ndenoising method called WIA-LD2ND, only using NDCT data. The proposed WIA-LD2ND\ncomprises two modules: Wavelet-based Image Alignment (WIA) and Frequency-Aware\nMulti-scale Loss (FAM). First, WIA is introduced to align NDCT with LDCT by\nmainly adding noise to the high-frequency components, which is the main\ndifference between LDCT and NDCT. Second, to better capture high-frequency\ncomponents and detailed information, Frequency-Aware Multi-scale Loss (FAM) is\nproposed by effectively utilizing multi-scale feature space. Extensive\nexperiments on two public LDCT denoising datasets demonstrate that our\nWIA-LD2ND, only uses NDCT, outperforms existing several state-of-the-art\nweakly-supervised and self-supervised methods.\n","authors":["Haoyu Zhao","Guyu Liang","Zhou Zhao","Bo Du","Yongchao Xu","Rui Yu"],"pdf_url":"https://arxiv.org/pdf/2403.11672v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.11667v1","updated":"2024-03-18T11:15:03Z","published":"2024-03-18T11:15:03Z","title":"Binary Noise for Binary Tasks: Masked Bernoulli Diffusion for\n Unsupervised Anomaly Detection","summary":" The high performance of denoising diffusion models for image generation has\npaved the way for their application in unsupervised medical anomaly detection.\nAs diffusion-based methods require a lot of GPU memory and have long sampling\ntimes, we present a novel and fast unsupervised anomaly detection approach\nbased on latent Bernoulli diffusion models. We first apply an autoencoder to\ncompress the input images into a binary latent representation. Next, a\ndiffusion model that follows a Bernoulli noise schedule is employed to this\nlatent space and trained to restore binary latent representations from\nperturbed ones. The binary nature of this diffusion model allows us to identify\nentries in the latent space that have a high probability of flipping their\nbinary code during the denoising process, which indicates out-of-distribution\ndata. We propose a masking algorithm based on these probabilities, which\nimproves the anomaly detection scores. 
We achieve state-of-the-art performance\ncompared to other diffusion-based unsupervised anomaly detection algorithms\nwhile significantly reducing sampling time and memory consumption. The code is\navailable at https://github.com/JuliaWolleb/Anomaly_berdiff.\n","authors":["Julia Wolleb","Florentin Bieder","Paul Friedrich","Peter Zhang","Alicia Durrer","Philippe C. Cattin"],"pdf_url":"https://arxiv.org/pdf/2403.11667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11665v1","updated":"2024-03-18T11:12:39Z","published":"2024-03-18T11:12:39Z","title":"Normalized Validity Scores for DNNs in Regression based Eye Feature\n Extraction","summary":" We propose an improvement to the landmark validity loss. Landmark detection\nis widely used in head pose estimation, eyelid shape extraction, as well as\npupil and iris segmentation. There are numerous additional applications where\nlandmark detection is used to estimate the shape of complex objects. One part\nof this process is the accurate and fine-grained detection of the shape. The\nother part is the validity or inaccuracy per landmark, which can be used to\ndetect unreliable areas, where the shape possibly does not fit, and to improve\nthe accuracy of the entire shape extraction by excluding inaccurate landmarks.\nWe propose a normalization in the loss formulation, which improves the accuracy\nof the entire approach due to the numerical balance of the normalized\ninaccuracy. In addition, we propose a margin for the inaccuracy to reduce the\nimpact of gradients, which are produced by negligible errors close to the\nground truth.\n","authors":["Wolfgang Fuhl"],"pdf_url":"https://arxiv.org/pdf/2403.11665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10040v2","updated":"2024-03-18T11:02:11Z","published":"2024-03-15T06:20:09Z","title":"Histo-Genomic Knowledge Distillation For Cancer Prognosis From\n Histopathology Whole Slide Images","summary":" Histo-genomic multi-modal methods have recently emerged as a powerful\nparadigm, demonstrating significant potential for improving cancer prognosis.\nHowever, genome sequencing, unlike histopathology imaging, is still not widely\naccessible in underdeveloped regions, limiting the application of these\nmulti-modal approaches in clinical settings. To address this, we propose a\nnovel Genome-informed Hyper-Attention Network, termed G-HANet, which is capable\nof effectively distilling the histo-genomic knowledge during training to\nelevate uni-modal whole slide image (WSI)-based inference for the first time.\nCompared with traditional knowledge distillation methods (i.e., teacher-student\narchitecture) in other tasks, our end-to-end model is superior in terms of\ntraining efficiency and learning cross-modal interactions. Specifically, the\nnetwork comprises the cross-modal associating branch (CAB) and hyper-attention\nsurvival branch (HSB). Through the genomic data reconstruction from WSIs, CAB\neffectively distills the associations between functional genotypes and\nmorphological phenotypes and offers insights into the gene expression profiles\nin the feature space. Subsequently, HSB leverages the distilled histo-genomic\nassociations as well as the generated morphology-based weights to achieve the\nhyper-attention modeling of the patients from both histopathology and genomic\nperspectives to improve cancer prognosis. 
Extensive experiments are conducted\non five TCGA benchmarking datasets and the results demonstrate that G-HANet\nsignificantly outperforms the state-of-the-art WSI-based methods and achieves\ncompetitive performance with genome-based and multi-modal methods. G-HANet is\nexpected to be explored as a useful tool by the research community to address\nthe current bottleneck of insufficient histo-genomic data pairing in the\ncontext of cancer prognosis and precision oncology.\n","authors":["Zhikang Wang","Yumeng Zhang","Yingxue Xu","Seiya Imoto","Hao Chen","Jiangning Song"],"pdf_url":"https://arxiv.org/pdf/2403.10040v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14899v2","updated":"2024-03-18T10:55:36Z","published":"2024-02-22T17:36:34Z","title":"Stop Reasoning! When Multimodal LLMs with Chain-of-Thought Reasoning\n Meets Adversarial Images","summary":" Recently, Multimodal LLMs (MLLMs) have shown a great ability to understand\nimages. However, like traditional vision models, they are still vulnerable to\nadversarial images. Meanwhile, Chain-of-Thought (CoT) reasoning has been widely\nexplored on MLLMs, which not only improves model's performance, but also\nenhances model's explainability by giving intermediate reasoning steps.\nNevertheless, there is still a lack of study regarding MLLMs' adversarial\nrobustness with CoT and an understanding of what the rationale looks like when\nMLLMs infer wrong answers with adversarial images. Our research evaluates the\nadversarial robustness of MLLMs when employing CoT reasoning, finding that CoT\nmarginally improves adversarial robustness against existing attack methods.\nMoreover, we introduce a novel stop-reasoning attack technique that effectively\nbypasses the CoT-induced robustness enhancements. Finally, we demonstrate the\nalterations in CoT reasoning when MLLMs confront adversarial images, shedding\nlight on their reasoning process under adversarial attacks.\n","authors":["Zefeng Wang","Zhen Han","Shuo Chen","Fan Xue","Zifeng Ding","Xun Xiao","Volker Tresp","Philip Torr","Jindong Gu"],"pdf_url":"https://arxiv.org/pdf/2402.14899v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13316v2","updated":"2024-03-18T10:54:03Z","published":"2023-12-20T11:00:54Z","title":"ECAMP: Entity-centered Context-aware Medical Vision Language\n Pre-training","summary":" Despite significant advancements in medical vision-language pre-training,\nexisting methods have largely overlooked the inherent entity-specific context\nwithin radiology reports and the complex cross-modality contextual\nrelationships between text and images. To close this gap, we propose a novel\nEntity-centered Context-aware Medical Vision-language Pre-training (ECAMP)\nframework, which is designed to enable a more entity-centered and\ncontext-sensitive interpretation of medical data. Utilizing the recent powerful\nlarge language model, we distill entity-centered context from medical reports,\nwhich enables ECAMP to gain more effective supervision from the text modality.\nBy further pre-training our model with carefully designed entity-aware,\ncontext-enhanced masked language modeling and context-guided super-resolution\ntasks, ECAMP significantly refines the interplay between text and image\nmodalities, leading to an enhanced ability to extract entity-centered\ncontextual features. 
Besides, our proposed multi-scale context fusion design\nalso improves the semantic integration of both coarse and fine-level image\nrepresentations, prompting better performance for multi-scale downstream\napplications. Combining these components leads to significant performance leaps\nover current state-of-the-art methods and establishes a new standard for\ncross-modality learning in medical imaging, whose effectiveness is demonstrated\nby our extensive experiments on various tasks including classification,\nsegmentation, and detection across several public datasets. Code and models are\navailable at https://github.com/ToniChopp/ECAMP.\n","authors":["Rongsheng Wang","Qingsong Yao","Haoran Lai","Zhiyang He","Xiaodong Tao","Zihang Jiang","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.13316v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11656v1","updated":"2024-03-18T10:53:00Z","published":"2024-03-18T10:53:00Z","title":"LocalStyleFool: Regional Video Style Transfer Attack Using Segment\n Anything Model","summary":" Previous work has shown that well-crafted adversarial perturbations can\nthreaten the security of video recognition systems. Attackers can invade such\nmodels with a low query budget when the perturbations are semantic-invariant,\nsuch as StyleFool. Despite the query efficiency, the naturalness of the minutia\nareas still requires amelioration, since StyleFool leverages style transfer to\nall pixels in each frame. To close the gap, we propose LocalStyleFool, an\nimproved black-box video adversarial attack that superimposes regional\nstyle-transfer-based perturbations on videos. Benefiting from the popularity\nand scalable usability of Segment Anything Model (SAM), we first extract\ndifferent regions according to semantic information and then track them through\nthe video stream to maintain the temporal consistency. Then, we add\nstyle-transfer-based perturbations to several regions selected based on the\nassociative criterion of transfer-based gradient information and regional area.\nPerturbation fine adjustment is followed to make stylized videos adversarial.\nWe demonstrate that LocalStyleFool can improve both intra-frame and inter-frame\nnaturalness through a human-assessed survey, while maintaining competitive\nfooling rate and query efficiency. Successful experiments on the\nhigh-resolution dataset also showcase that scrupulous segmentation of SAM helps\nto improve the scalability of adversarial attacks under high-resolution data.\n","authors":["Yuxin Cao","Jinghao Li","Xi Xiao","Derui Wang","Minhui Xue","Hao Ge","Wei Liu","Guangwu Hu"],"pdf_url":"https://arxiv.org/pdf/2403.11656v1.pdf","comment":"Accepted to 2024 IEEE Security and Privacy Workshops (SPW)"},{"id":"http://arxiv.org/abs/2403.11650v1","updated":"2024-03-18T10:45:50Z","published":"2024-03-18T10:45:50Z","title":"Prioritized Semantic Learning for Zero-shot Instance Navigation","summary":" We study zero-shot instance navigation, in which the agent navigates to a\nspecific object without using object annotations for training. Previous object\nnavigation approaches apply the image-goal navigation (ImageNav) task (go to\nthe location of an image) for pretraining, and transfer the agent to achieve\nobject goals using a vision-language model. However, these approaches lead to\nissues of semantic neglect, where the model fails to learn meaningful semantic\nalignments. 
In this paper, we propose a Prioritized Semantic Learning (PSL)\nmethod to improve the semantic understanding ability of navigation agents.\nSpecifically, a semantic-enhanced PSL agent is proposed and a prioritized\nsemantic training strategy is introduced to select goal images that exhibit\nclear semantic supervision and relax the reward function from strict exact view\nmatching. At inference time, a semantic expansion inference scheme is designed\nto preserve the same granularity level of the goal-semantic as training.\nFurthermore, for the popular HM3D environment, we present an Instance\nNavigation (InstanceNav) task that requires going to a specific object instance\nwith detailed descriptions, as opposed to the Object Navigation (ObjectNav)\ntask where the goal is defined merely by the object category. Our PSL agent\noutperforms the previous state-of-the-art by 66% on zero-shot ObjectNav in\nterms of success rate and is also superior on the new InstanceNav task. Code\nwill be released at https://anonymous.4open.science/r/PSL/.\n","authors":["Xander Sun","Louis Lau","Hoyard Zhi","Ronghe Qiu","Junwei Liang"],"pdf_url":"https://arxiv.org/pdf/2403.11650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11649v1","updated":"2024-03-18T10:45:27Z","published":"2024-03-18T10:45:27Z","title":"Gridless 2D Recovery of Lines using the Sliding Frank-Wolfe Algorithm","summary":" We present a new approach leveraging the Sliding Frank--Wolfe algorithm to\naddress the challenge of line recovery in degraded images. Building upon\nadvances in conditional gradient methods for sparse inverse problems with\ndifferentiable measurement models, we propose two distinct models tailored for\nline detection tasks within the realm of blurred line deconvolution and ridge\ndetection of linear chirps in spectrogram images.\n","authors":["Kévin Polisano","Basile Dubois-Bonnaire","Sylvain Meignen"],"pdf_url":"https://arxiv.org/pdf/2403.11649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11646v1","updated":"2024-03-18T10:42:24Z","published":"2024-03-18T10:42:24Z","title":"MedMerge: Merging Models for Effective Transfer Learning to Medical\n Imaging Tasks","summary":" Transfer learning has become a powerful tool to initialize deep learning\nmodels to achieve faster convergence and higher performance. This is especially\nuseful in the medical imaging analysis domain, where data scarcity limits\npossible performance gains for deep learning models. Some advancements have\nbeen made in boosting the transfer learning performance gain by merging models\nstarting from the same initialization. However, in the medical imaging analysis\ndomain, there is an opportunity in merging models starting from different\ninitialisations, thus combining the features learnt from different tasks. In\nthis work, we propose MedMerge, a method whereby the weights of different\nmodels can be merged, and their features can be effectively utilized to boost\nperformance on a new task. With MedMerge, we learn kernel-level weights that\ncan later be used to merge the models into a single model, even when starting\nfrom different initializations. Testing on various medical imaging analysis\ntasks, we show that our merged model can achieve significant performance gains,\nwith up to 3% improvement on the F1 score. 
The code implementation of this work\nwill be available at www.github.com/BioMedIA-MBZUAI/MedMerge.\n","authors":["Ibrahim Almakky","Santosh Sanjeev","Anees Ur Rehman Hashmi","Mohammad Areeb Qazi","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.11646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11643v1","updated":"2024-03-18T10:35:15Z","published":"2024-03-18T10:35:15Z","title":"Diffusion-Based Environment-Aware Trajectory Prediction","summary":" The ability to predict the future trajectories of traffic participants is\ncrucial for the safe and efficient operation of autonomous vehicles. In this\npaper, a diffusion-based generative model for multi-agent trajectory prediction\nis proposed. The model is capable of capturing the complex interactions between\ntraffic participants and the environment, accurately learning the multimodal\nnature of the data. The effectiveness of the approach is assessed on\nlarge-scale datasets of real-world traffic scenarios, showing that our model\noutperforms several well-established methods in terms of prediction accuracy.\nBy the incorporation of differential motion constraints on the model output, we\nillustrate that our model is capable of generating a diverse set of realistic\nfuture trajectories. Through the use of an interaction-aware guidance signal,\nwe further demonstrate that the model can be adapted to predict the behavior of\nless cooperative agents, emphasizing its practical applicability under\nuncertain traffic conditions.\n","authors":["Theodor Westny","Björn Olofsson","Erik Frisk"],"pdf_url":"https://arxiv.org/pdf/2403.11643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11641v1","updated":"2024-03-18T10:32:51Z","published":"2024-03-18T10:32:51Z","title":"Arc2Face: A Foundation Model of Human Faces","summary":" This paper presents Arc2Face, an identity-conditioned face foundation model,\nwhich, given the ArcFace embedding of a person, can generate diverse\nphoto-realistic images with an unparalleled degree of face similarity than\nexisting models. Despite previous attempts to decode face recognition features\ninto detailed images, we find that common high-resolution datasets (e.g. FFHQ)\nlack sufficient identities to reconstruct any subject. To that end, we\nmeticulously upsample a significant portion of the WebFace42M database, the\nlargest public dataset for face recognition (FR). Arc2Face builds upon a\npretrained Stable Diffusion model, yet adapts it to the task of ID-to-face\ngeneration, conditioned solely on ID vectors. Deviating from recent works that\ncombine ID with text embeddings for zero-shot personalization of text-to-image\nmodels, we emphasize on the compactness of FR features, which can fully capture\nthe essence of the human face, as opposed to hand-crafted prompts. Crucially,\ntext-augmented models struggle to decouple identity and text, usually\nnecessitating some description of the given face to achieve satisfactory\nsimilarity. Arc2Face, however, only needs the discriminative features of\nArcFace to guide the generation, offering a robust prior for a plethora of\ntasks where ID consistency is of paramount importance. As an example, we train\na FR model on synthetic images from our model and achieve superior performance\nto existing synthetic datasets.\n","authors":["Foivos Paraperas Papantoniou","Alexandros Lattas","Stylianos Moschoglou","Jiankang Deng","Bernhard Kainz","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2403.11641v1.pdf","comment":"29 pages, 20 figures. 
Project page: https://arc2face.github.io/"},{"id":"http://arxiv.org/abs/2403.11639v1","updated":"2024-03-18T10:21:05Z","published":"2024-03-18T10:21:05Z","title":"An Accurate and Real-time Relative Pose Estimation from Triple\n Point-line Images by Decoupling Rotation and Translation","summary":" Line features are valid complements for point features in man-made\nenvironments. 3D-2D constraints provided by line features have been widely used\nin Visual Odometry (VO) and Structure-from-Motion (SfM) systems. However, how\nto accurately solve three-view relative motion only with 2D observations of\npoints and lines in real time has not been fully explored. In this paper, we\npropose a novel three-view pose solver based on rotation-translation decoupled\nestimation. First, a high-precision rotation estimation method based on normal\nvector coplanarity constraints that consider the uncertainty of observations is\nproposed, which can be solved by Levenberg-Marquardt (LM) algorithm\nefficiently. Second, a robust linear translation constraint that minimizes the\ndegree of the rotation components and feature observation components in\nequations is elaborately designed for estimating translations accurately.\nExperiments on synthetic data and real-world data show that the proposed\napproach improves both rotation and translation accuracy compared to the\nclassical trifocal-tensor-based method and the state-of-the-art two-view\nalgorithm in outdoor and indoor environments.\n","authors":["Zewen Xu","Yijia He","Hao Wei","Bo Xu","BinJian Xie","Yihong Wu"],"pdf_url":"https://arxiv.org/pdf/2403.11639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11634v1","updated":"2024-03-18T10:13:53Z","published":"2024-03-18T10:13:53Z","title":"Personalized 3D Human Pose and Shape Refinement","summary":" Recently, regression-based methods have dominated the field of 3D human pose\nand shape estimation. Despite their promising results, a common issue is the\nmisalignment between predictions and image observations, often caused by minor\njoint rotation errors that accumulate along the kinematic chain. To address\nthis issue, we propose to construct dense correspondences between initial human\nmodel estimates and the corresponding images that can be used to refine the\ninitial predictions. To this end, we utilize renderings of the 3D models to\npredict per-pixel 2D displacements between the synthetic renderings and the RGB\nimages. This allows us to effectively integrate and exploit appearance\ninformation of the persons. Our per-pixel displacements can be efficiently\ntransformed to per-visible-vertex displacements and then used for 3D model\nrefinement by minimizing a reprojection loss. To demonstrate the effectiveness\nof our approach, we refine the initial 3D human mesh predictions of multiple\nmodels using different refinement procedures on 3DPW and RICH. 
We show that our\napproach not only consistently leads to better image-model alignment, but also\nto improved 3D accuracy.\n","authors":["Tom Wehrbein","Bodo Rosenhahn","Iain Matthews","Carsten Stoll"],"pdf_url":"https://arxiv.org/pdf/2403.11634v1.pdf","comment":"Accepted to 2023 IEEE/CVF International Conference on Computer Vision\n Workshops (ICCVW)"},{"id":"http://arxiv.org/abs/2403.11631v1","updated":"2024-03-18T10:09:28Z","published":"2024-03-18T10:09:28Z","title":"Compositional Kronecker Context Optimization for Vision-Language Models","summary":" Context Optimization (CoOp) has emerged as a simple yet effective technique\nfor adapting CLIP-like vision-language models to downstream image recognition\ntasks. Nevertheless, learning compact context with satisfactory base-to-new,\ndomain and cross-task generalization ability while adapting to new tasks is\nstill a challenge. To tackle such a challenge, we propose a lightweight yet\ngeneralizable approach termed Compositional Kronecker Context Optimization\n(CK-CoOp). Technically, the prompt's context words in CK-CoOp are learnable\nvectors, which are crafted by linearly combining base vectors sourced from a\ndictionary. These base vectors consist of a non-learnable component obtained by\nquantizing the weights in the token embedding layer, and a learnable component\nconstructed by applying Kronecker product on several learnable tiny matrices.\nIntuitively, the compositional structure mitigates the risk of overfitting on\ntraining data by remembering more pre-trained knowledge. Meantime, the\nKronecker product breaks the non-learnable restrictions of the dictionary,\nthereby enhancing representation ability with minimal additional parameters.\nExtensive experiments confirm that CK-CoOp achieves state-of-the-art\nperformance under base-to-new, domain and cross-task generalization evaluation,\nbut also has the metrics of fewer learnable parameters and efficient training\nand inference speed.\n","authors":["Kun Ding","Xiaohui Li","Qiang Yu","Ying Wang","Haojian Zhang","Shiming Xiang"],"pdf_url":"https://arxiv.org/pdf/2403.11631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11627v1","updated":"2024-03-18T09:58:52Z","published":"2024-03-18T09:58:52Z","title":"LoRA-Composer: Leveraging Low-Rank Adaptation for Multi-Concept\n Customization in Training-Free Diffusion Models","summary":" Customization generation techniques have significantly advanced the synthesis\nof specific concepts across varied contexts. Multi-concept customization\nemerges as the challenging task within this domain. Existing approaches often\nrely on training a Low-Rank Adaptations (LoRA) fusion matrix of multiple LoRA\nto merge various concepts into a single image. However, we identify this\nstraightforward method faces two major challenges: 1) concept confusion, which\noccurs when the model cannot preserve distinct individual characteristics, and\n2) concept vanishing, where the model fails to generate the intended subjects.\nTo address these issues, we introduce LoRA-Composer, a training-free framework\ndesigned for seamlessly integrating multiple LoRAs, thereby enhancing the\nharmony among different concepts within generated images. LoRA-Composer\naddresses concept vanishing through Concept Injection Constraints, enhancing\nconcept visibility via an expanded cross-attention mechanism. To combat concept\nconfusion, Concept Isolation Constraints are introduced, refining the\nself-attention computation. 
Furthermore, Latent Re-initialization is proposed\nto effectively stimulate concept-specific latent within designated regions. Our\nextensive testing showcases a notable enhancement in LoRA-Composer's\nperformance compared to standard baselines, especially when eliminating the\nimage-based conditions like canny edge or pose estimations. Code is released at\nhttps://github.com/Young98CN/LoRA_Composer.\n","authors":["Yang Yang","Wen Wang","Liang Peng","Chaotian Song","Yao Chen","Hengjia Li","Xiaolong Yang","Qinglin Lu","Deng Cai","Boxi Wu","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2403.11627v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11626v1","updated":"2024-03-18T09:58:43Z","published":"2024-03-18T09:58:43Z","title":"QEAN: Quaternion-Enhanced Attention Network for Visual Dance Generation","summary":" The study of music-generated dance is a novel and challenging Image\ngeneration task. It aims to input a piece of music and seed motions, then\ngenerate natural dance movements for the subsequent music. Transformer-based\nmethods face challenges in time series prediction tasks related to human\nmovements and music due to their struggle in capturing the nonlinear\nrelationship and temporal aspects. This can lead to issues like joint\ndeformation, role deviation, floating, and inconsistencies in dance movements\ngenerated in response to the music. In this paper, we propose a\nQuaternion-Enhanced Attention Network (QEAN) for visual dance synthesis from a\nquaternion perspective, which consists of a Spin Position Embedding (SPE)\nmodule and a Quaternion Rotary Attention (QRA) module. First, SPE embeds\nposition information into self-attention in a rotational manner, leading to\nbetter learning of features of movement sequences and audio sequences, and\nimproved understanding of the connection between music and dance. Second, QRA\nrepresents and fuses 3D motion features and audio features in the form of a\nseries of quaternions, enabling the model to better learn the temporal\ncoordination of music and dance under the complex temporal cycle conditions of\ndance generation. Finally, we conducted experiments on the dataset AIST++, and\nthe results show that our approach achieves better and more robust performance\nin generating accurate, high-quality dance movements. Our source code and\ndataset can be available from https://github.com/MarasyZZ/QEAN and\nhttps://google.github.io/aistplusplus_dataset respectively.\n","authors":["Zhizhen Zhou","Yejing Huo","Guoheng Huang","An Zeng","Xuhang Chen","Lian Huang","Zinuo Li"],"pdf_url":"https://arxiv.org/pdf/2403.11626v1.pdf","comment":"Accepted by The Visual Computer Journal"},{"id":"http://arxiv.org/abs/2403.11625v1","updated":"2024-03-18T09:56:48Z","published":"2024-03-18T09:56:48Z","title":"GaussNav: Gaussian Splatting for Visual Navigation","summary":" In embodied vision, Instance ImageGoal Navigation (IIN) requires an agent to\nlocate a specific object depicted in a goal image within an unexplored\nenvironment. 
The primary difficulty of IIN stems from the necessity of\nrecognizing the target object across varying viewpoints and rejecting potential\ndistractors.\n Existing map-based navigation methods largely adopt the representation form\nof Bird's Eye View (BEV) maps, which, however, lack the representation of\ndetailed textures in a scene.\n To address the above issues, we propose a new Gaussian Splatting Navigation\n(abbreviated as GaussNav) framework for IIN task, which constructs a novel map\nrepresentation based on 3D Gaussian Splatting (3DGS).\n The proposed framework enables the agent to not only memorize the geometry\nand semantic information of the scene, but also retain the textural features of\nobjects.\n Our GaussNav framework demonstrates a significant leap in performance,\nevidenced by an increase in Success weighted by Path Length (SPL) from 0.252 to\n0.578 on the challenging Habitat-Matterport 3D (HM3D) dataset.\n Our code will be made publicly available.\n","authors":["Xiaohan Lei","Min Wang","Wengang Zhou","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2403.11625v1.pdf","comment":"conference"},{"id":"http://arxiv.org/abs/2403.08227v2","updated":"2024-03-18T09:55:51Z","published":"2024-03-13T04:11:38Z","title":"Matching Non-Identical Objects","summary":" Not identical but similar objects are everywhere in the world. Examples\ninclude four-legged animals such as dogs and cats, cars of different models,\nakin flowers in various colors, and countless others. In this study, we address\na novel task of matching such non-identical objects. We propose a simple\nweighting scheme of descriptors that enhances various sparse image matching\nmethods, which were originally designed for matching identical objects captured\nfrom different perspectives, and achieve semantically robust matching. The\nexperiments show successful matching between non-identical objects in various\ncases including domain shift. Further, we present a first evaluation of the\nrobustness of the image matching methods under common corruptions, which is a\nsort of domain shift, and the proposed method improves the matching in this\ncase as well.\n","authors":["Yusuke Marumo","Kazuhiko Kawamoto","Hiroshi Kera"],"pdf_url":"https://arxiv.org/pdf/2403.08227v2.pdf","comment":"10+7 pages, 10 figures, 4 tables"},{"id":"http://arxiv.org/abs/2402.13602v3","updated":"2024-03-18T09:50:00Z","published":"2024-02-21T08:09:05Z","title":"Hybrid Reasoning Based on Large Language Models for Autonomous Car\n Driving","summary":" Large Language Models (LLMs) have garnered significant attention for their\nability to understand text and images, generate human-like text, and perform\ncomplex reasoning tasks. However, their ability to generalize this advanced\nreasoning with a combination of natural language text for decision-making in\ndynamic situations requires further exploration. In this study, we investigate\nhow well LLMs can adapt and apply a combination of arithmetic and common-sense\nreasoning, particularly in autonomous driving scenarios. We hypothesize that\nLLMs hybrid reasoning abilities can improve autonomous driving by enabling them\nto analyze detected object and sensor data, understand driving regulations and\nphysical laws, and offer additional context. This addresses complex scenarios,\nlike decisions in low visibility (due to weather conditions), where traditional\nmethods might fall short. 
We evaluated Large Language Models (LLMs) based on\naccuracy by comparing their answers with human-generated ground truth inside\nCARLA. The results showed that when a combination of images (detected objects)\nand sensor data is fed into the LLM, it can offer precise information for brake\nand throttle control in autonomous vehicles across various weather conditions.\nThis formulation and answers can assist in decision-making for auto-pilot\nsystems.\n","authors":["Mehdi Azarafza","Mojtaba Nayyeri","Charles Steinmetz","Steffen Staab","Achim Rettberg"],"pdf_url":"https://arxiv.org/pdf/2402.13602v3.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.11616v1","updated":"2024-03-18T09:47:41Z","published":"2024-03-18T09:47:41Z","title":"Multi-View Video-Based Learning: Leveraging Weak Labels for Frame-Level\n Perception","summary":" For training a video-based action recognition model that accepts multi-view\nvideo, annotating frame-level labels is tedious and difficult. However, it is\nrelatively easy to annotate sequence-level labels. This kind of coarse\nannotations are called as weak labels. However, training a multi-view\nvideo-based action recognition model with weak labels for frame-level\nperception is challenging. In this paper, we propose a novel learning\nframework, where the weak labels are first used to train a multi-view\nvideo-based base model, which is subsequently used for downstream frame-level\nperception tasks. The base model is trained to obtain individual latent\nembeddings for each view in the multi-view input. For training the model using\nthe weak labels, we propose a novel latent loss function. We also propose a\nmodel that uses the view-specific latent embeddings for downstream frame-level\naction recognition and detection tasks. The proposed framework is evaluated\nusing the MM Office dataset by comparing several baseline algorithms. The\nresults show that the proposed base model is effectively trained using weak\nlabels and the latent embeddings help the downstream models improve accuracy.\n","authors":["Vijay John","Yasutomo Kawanishi"],"pdf_url":"https://arxiv.org/pdf/2403.11616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03411v2","updated":"2024-03-18T09:47:24Z","published":"2024-01-07T08:03:06Z","title":"GRAM: Global Reasoning for Multi-Page VQA","summary":" The increasing use of transformer-based large language models brings forward\nthe challenge of processing long sequences. In document visual question\nanswering (DocVQA), leading methods focus on the single-page setting, while\ndocuments can span hundreds of pages. We present GRAM, a method that seamlessly\nextends pre-trained single-page models to the multi-page setting, without\nrequiring computationally-heavy pretraining. To do so, we leverage a\nsingle-page encoder for local page-level understanding, and enhance it with\ndocument-level designated layers and learnable tokens, facilitating the flow of\ninformation across pages for global reasoning. To enforce our model to utilize\nthe newly introduced document tokens, we propose a tailored bias adaptation\nmethod. For additional computational savings during decoding, we introduce an\noptional compression stage using our compression-transformer\n(C-Former),reducing the encoded sequence length, thereby allowing a tradeoff\nbetween quality and latency. 
Extensive experiments showcase GRAM's\nstate-of-the-art performance on the benchmarks for multi-page DocVQA,\ndemonstrating the effectiveness of our approach.\n","authors":["Tsachi Blau","Sharon Fogel","Roi Ronen","Alona Golts","Roy Ganz","Elad Ben Avraham","Aviad Aberdam","Shahar Tsiper","Ron Litman"],"pdf_url":"https://arxiv.org/pdf/2401.03411v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11614v1","updated":"2024-03-18T09:44:44Z","published":"2024-03-18T09:44:44Z","title":"CRS-Diff: Controllable Generative Remote Sensing Foundation Model","summary":" The emergence of diffusion models has revolutionized the field of image\ngeneration, providing new methods for creating high-quality, high-resolution\nimages across various applications. However, the potential of these models for\ngenerating domain-specific images, particularly remote sensing (RS) images,\nremains largely untapped. RS images that are notable for their high resolution,\nextensive coverage, and rich information content, bring new challenges that\ngeneral diffusion models may not adequately address. This paper proposes\nCRS-Diff, a pioneering diffusion modeling framework specifically tailored for\ngenerating remote sensing imagery, leveraging the inherent advantages of\ndiffusion models while integrating advanced control mechanisms to ensure that\nthe imagery is not only visually clear but also enriched with geographic and\ntemporal information. The model integrates global and local control inputs,\nenabling precise combinations of generation conditions to refine the generation\nprocess. A comprehensive evaluation of CRS-Diff has demonstrated its superior\ncapability to generate RS imagery both in a single condition and multiple\nconditions compared with previous methods in terms of image quality and\ndiversity.\n","authors":["Datao Tang","Xiangyong Cao","Xingsong Hou","Zhongyuan Jiang","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2403.11614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07800v2","updated":"2024-03-18T09:42:20Z","published":"2024-03-12T16:36:27Z","title":"BraSyn 2023 challenge: Missing MRI synthesis and the effect of different\n learning objectives","summary":" This work addresses the Brain Magnetic Resonance Image Synthesis for Tumor\nSegmentation (BraSyn) challenge, which was hosted as part of the Brain Tumor\nSegmentation (BraTS) challenge in 2023. In this challenge, researchers are\ninvited to synthesize a missing magnetic resonance image sequence, given other\navailable sequences, to facilitate tumor segmentation pipelines trained on\ncomplete sets of image sequences. This problem can be tackled using deep\nlearning within the framework of paired image-to-image translation. In this\nstudy, we propose investigating the effectiveness of a commonly used deep\nlearning framework, such as Pix2Pix, trained under the supervision of different\nimage-quality loss functions. Our results indicate that the use of different\nloss functions significantly affects the synthesis quality. We systematically\nstudy the impact of various loss functions in the multi-sequence MR image\nsynthesis setting of the BraSyn challenge. Furthermore, we demonstrate how\nimage synthesis performance can be optimized by combining different learning\nobjectives beneficially.\n","authors":["Ivo M. 
Baltruschat","Parvaneh Janbakhshi","Matthias Lenga"],"pdf_url":"https://arxiv.org/pdf/2403.07800v2.pdf","comment":"minor changes, to be published as part of the 9th BrainLes:\n International MICCAI Brain Lesion Workshop"},{"id":"http://arxiv.org/abs/2402.16086v2","updated":"2024-03-18T09:33:47Z","published":"2024-02-25T13:22:17Z","title":"Deep Homography Estimation for Visual Place Recognition","summary":" Visual place recognition (VPR) is a fundamental task for many applications\nsuch as robot localization and augmented reality. Recently, the hierarchical\nVPR methods have received considerable attention due to the trade-off between\naccuracy and efficiency. They usually first use global features to retrieve the\ncandidate images, then verify the spatial consistency of matched local features\nfor re-ranking. However, the latter typically relies on the RANSAC algorithm\nfor fitting homography, which is time-consuming and non-differentiable. This\nmakes existing methods compromise to train the network only in global feature\nextraction. Here, we propose a transformer-based deep homography estimation\n(DHE) network that takes the dense feature map extracted by a backbone network\nas input and fits homography for fast and learnable geometric verification.\nMoreover, we design a re-projection error of inliers loss to train the DHE\nnetwork without additional homography labels, which can also be jointly trained\nwith the backbone network to help it extract the features that are more\nsuitable for local matching. Extensive experiments on benchmark datasets show\nthat our method can outperform several state-of-the-art methods. And it is more\nthan one order of magnitude faster than the mainstream hierarchical VPR methods\nusing RANSAC. The code is released at https://github.com/Lu-Feng/DHE-VPR.\n","authors":["Feng Lu","Shuting Dong","Lijun Zhang","Bingxi Liu","Xiangyuan Lan","Dongmei Jiang","Chun Yuan"],"pdf_url":"https://arxiv.org/pdf/2402.16086v2.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2403.06645v2","updated":"2024-03-18T09:22:01Z","published":"2024-03-11T12:07:33Z","title":"Ricci flow-based brain surface covariance descriptors for diagnosing\n Alzheimer's disease","summary":" Automated feature extraction from MRI brain scans and diagnosis of\nAlzheimer's disease are ongoing challenges. With advances in 3D imaging\ntechnology, 3D data acquisition is becoming more viable and efficient than its\n2D counterpart. Rather than using feature-based vectors, in this paper, for the\nfirst time, we suggest a pipeline to extract novel covariance-based descriptors\nfrom the cortical surface using the Ricci energy optimization. The covariance\ndescriptors are components of the nonlinear manifold of symmetric\npositive-definite matrices, thus we focus on using the Gaussian radial basis\nfunction to apply manifold-based classification to the 3D shape problem.\nApplying this novel signature to the analysis of abnormal cortical brain\nmorphometry allows for diagnosing Alzheimer's disease. 
Experimental studies\nperformed on about two hundred 3D MRI brain models, gathered from Alzheimer's\nDisease Neuroimaging Initiative (ADNI) dataset demonstrate the effectiveness of\nour descriptors in achieving remarkable classification accuracy.\n","authors":["Fatemeh Ahmadi","Mohamad Ebrahim Shiri","Behroz Bidabad","Maral Sedaghat","Pooran Memari"],"pdf_url":"https://arxiv.org/pdf/2403.06645v2.pdf","comment":"Accepted for publication in Biomedical Signal Processing and Control\n journal"},{"id":"http://arxiv.org/abs/2403.11593v1","updated":"2024-03-18T09:12:16Z","published":"2024-03-18T09:12:16Z","title":"End-to-end multi-modal product matching in fashion e-commerce","summary":" Product matching, the task of identifying different representations of the\nsame product for better discoverability, curation, and pricing, is a key\ncapability for online marketplace and e-commerce companies. We present a robust\nmulti-modal product matching system in an industry setting, where large\ndatasets, data distribution shifts and unseen domains pose challenges. We\ncompare different approaches and conclude that a relatively straightforward\nprojection of pretrained image and text encoders, trained through contrastive\nlearning, yields state-of-the-art results, while balancing cost and\nperformance. Our solution outperforms single modality matching systems and\nlarge pretrained models, such as CLIP. Furthermore we show how a\nhuman-in-the-loop process can be combined with model-based predictions to\nachieve near perfect precision in a production system.\n","authors":["Sándor Tóth","Stephen Wilson","Alexia Tsoukara","Enric Moreu","Anton Masalovich","Lars Roemheld"],"pdf_url":"https://arxiv.org/pdf/2403.11593v1.pdf","comment":"9 pages, submitted to SIGKDD"},{"id":"http://arxiv.org/abs/2403.11590v1","updated":"2024-03-18T09:08:41Z","published":"2024-03-18T09:08:41Z","title":"HSEmotion Team at the 6th ABAW Competition: Facial Expressions,\n Valence-Arousal and Emotion Intensity Prediction","summary":" This article presents our results for the sixth Affective Behavior Analysis\nin-the-wild (ABAW) competition. To improve the trustworthiness of facial\nanalysis, we study the possibility of using pre-trained deep models that\nextract reliable emotional features without the need to fine-tune the neural\nnetworks for a downstream task. In particular, we introduce several lightweight\nmodels based on MobileViT, MobileFaceNet, EfficientNet, and DDAMFN\narchitectures trained in multi-task scenarios to recognize facial expressions,\nvalence, and arousal on static photos. These neural networks extract\nframe-level features fed into a simple classifier, e.g., linear feed-forward\nneural network, to predict emotion intensity, compound expressions, action\nunits, facial expressions, and valence/arousal. Experimental results for five\ntasks from the sixth ABAW challenge demonstrate that our approach lets us\nsignificantly improve quality metrics on validation sets compared to existing\nnon-ensemble techniques.\n","authors":["Andrey V. 
Savchenko"],"pdf_url":"https://arxiv.org/pdf/2403.11590v1.pdf","comment":"10 pages, 1 figure, 8 tables"},{"id":"http://arxiv.org/abs/2310.16640v2","updated":"2024-03-18T09:07:03Z","published":"2023-10-25T13:43:36Z","title":"EmoCLIP: A Vision-Language Method for Zero-Shot Video Facial Expression\n Recognition","summary":" Facial Expression Recognition (FER) is a crucial task in affective computing,\nbut its conventional focus on the seven basic emotions limits its applicability\nto the complex and expanding emotional spectrum. To address the issue of new\nand unseen emotions present in dynamic in-the-wild FER, we propose a novel\nvision-language model that utilises sample-level text descriptions (i.e.\ncaptions of the context, expressions or emotional cues) as natural language\nsupervision, aiming to enhance the learning of rich latent representations, for\nzero-shot classification. To test this, we evaluate using zero-shot\nclassification of the model trained on sample-level descriptions on four\npopular dynamic FER datasets. Our findings show that this approach yields\nsignificant improvements when compared to baseline methods. Specifically, for\nzero-shot video FER, we outperform CLIP by over 10\\% in terms of Weighted\nAverage Recall and 5\\% in terms of Unweighted Average Recall on several\ndatasets. Furthermore, we evaluate the representations obtained from the\nnetwork trained using sample-level descriptions on the downstream task of\nmental health symptom estimation, achieving performance comparable or superior\nto state-of-the-art methods and strong agreement with human experts. Namely, we\nachieve a Pearson's Correlation Coefficient of up to 0.85 on schizophrenia\nsymptom severity estimation, which is comparable to human experts' agreement.\nThe code is publicly available at: https://github.com/NickyFot/EmoCLIP.\n","authors":["Niki Maria Foteinopoulou","Ioannis Patras"],"pdf_url":"https://arxiv.org/pdf/2310.16640v2.pdf","comment":"Accepted at FG'2024"},{"id":"http://arxiv.org/abs/2403.11589v1","updated":"2024-03-18T09:03:56Z","published":"2024-03-18T09:03:56Z","title":"UV Gaussians: Joint Learning of Mesh Deformation and Gaussian Textures\n for Human Avatar Modeling","summary":" Reconstructing photo-realistic drivable human avatars from multi-view image\nsequences has been a popular and challenging topic in the field of computer\nvision and graphics. While existing NeRF-based methods can achieve high-quality\nnovel view rendering of human models, both training and inference processes are\ntime-consuming. Recent approaches have utilized 3D Gaussians to represent the\nhuman body, enabling faster training and rendering. However, they undermine the\nimportance of the mesh guidance and directly predict Gaussians in 3D space with\ncoarse mesh guidance. This hinders the learning procedure of the Gaussians and\ntends to produce blurry textures. Therefore, we propose UV Gaussians, which\nmodels the 3D human body by jointly learning mesh deformations and 2D UV-space\nGaussian textures. We utilize the embedding of UV map to learn Gaussian\ntextures in 2D space, leveraging the capabilities of powerful 2D networks to\nextract features. Additionally, through an independent Mesh network, we\noptimize pose-dependent geometric deformations, thereby guiding Gaussian\nrendering and significantly enhancing rendering quality. 
We collect and process\na new dataset of human motion, which includes multi-view images, scanned\nmodels, parametric model registration, and corresponding texture maps.\nExperimental results demonstrate that our method achieves state-of-the-art\nsynthesis of novel view and novel pose. The code and data will be made\navailable on the homepage https://alex-jyj.github.io/UV-Gaussians/ once the\npaper is accepted.\n","authors":["Yujiao Jiang","Qingmin Liao","Xiaoyu Li","Li Ma","Qi Zhang","Chaopeng Zhang","Zongqing Lu","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2403.11589v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18605v3","updated":"2024-03-18T09:03:05Z","published":"2023-11-30T15:02:13Z","title":"Learning Triangular Distribution in Visual World","summary":" Convolution neural network is successful in pervasive vision tasks, including\nlabel distribution learning, which usually takes the form of learning an\ninjection from the non-linear visual features to the well-defined labels.\nHowever, how the discrepancy between features is mapped to the label\ndiscrepancy is ambient, and its correctness is not guaranteed.To address these\nproblems, we study the mathematical connection between feature and its label,\npresenting a general and simple framework for label distribution learning. We\npropose a so-called Triangular Distribution Transform (TDT) to build an\ninjective function between feature and label, guaranteeing that any symmetric\nfeature discrepancy linearly reflects the difference between labels. The\nproposed TDT can be used as a plug-in in mainstream backbone networks to\naddress different label distribution learning tasks. Experiments on Facial Age\nRecognition, Illumination Chromaticity Estimation, and Aesthetics assessment\nshow that TDT achieves on-par or better results than the prior arts.\n","authors":["Ping Chen","Xingpeng Zhang","Chengtao Zhou","Dichao Fan","Peng Tu","Le Zhang","Yanlin Qian"],"pdf_url":"https://arxiv.org/pdf/2311.18605v3.pdf","comment":"Accepet by CVPR 2024 (11 pages, 5 figures)"},{"id":"http://arxiv.org/abs/2312.12833v2","updated":"2024-03-18T09:02:20Z","published":"2023-12-20T08:30:07Z","title":"Learning Exhaustive Correlation for Spectral Super-Resolution: Where\n Spatial-Spectral Attention Meets Linear Dependence","summary":" Spectral super-resolution that aims to recover hyperspectral image (HSI) from\neasily obtainable RGB image has drawn increasing interest in the field of\ncomputational photography. The crucial aspect of spectral super-resolution lies\nin exploiting the correlation within HSIs. However, two types of bottlenecks in\nexisting Transformers limit performance improvement and practical applications.\nFirst, existing Transformers often separately emphasize either spatial-wise or\nspectral-wise correlation, disrupting the 3D features of HSI and hindering the\nexploitation of unified spatial-spectral correlation. Second, existing\nself-attention mechanism always establishes full-rank correlation matrix by\nlearning the correlation between pairs of tokens, leading to its inability to\ndescribe linear dependence widely existing in HSI among multiple tokens. To\naddress these issues, we propose a novel Exhaustive Correlation Transformer\n(ECT) for spectral super-resolution. First, we propose a Spectral-wise\nDiscontinuous 3D (SD3D) splitting strategy, which models unified\nspatial-spectral correlation by integrating spatial-wise continuous splitting\nstrategy and spectral-wise discontinuous splitting strategy. 
Second, we propose\na Dynamic Low-Rank Mapping (DLRM) model, which captures linear dependence among\nmultiple tokens through a dynamically calculated low-rank dependence map. By\nintegrating unified spatial-spectral attention and linear dependence, our ECT\ncan model exhaustive correlation within HSI. The experimental results on both\nsimulated and real data indicate that our method achieves state-of-the-art\nperformance. Codes and pretrained models will be available later.\n","authors":["Hongyuan Wang","Lizhi Wang","Jiang Xu","Chang Chen","Xue Hu","Fenglong Song","Youliang Yan"],"pdf_url":"https://arxiv.org/pdf/2312.12833v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11944v2","updated":"2024-03-18T09:02:03Z","published":"2024-01-22T13:34:34Z","title":"CMMMU: A Chinese Massive Multi-discipline Multimodal Understanding\n Benchmark","summary":" As the capabilities of large multimodal models (LMMs) continue to advance,\nevaluating the performance of LMMs emerges as an increasing need. Additionally,\nthere is an even larger gap in evaluating the advanced knowledge and reasoning\nabilities of LMMs in non-English contexts such as Chinese. We introduce CMMMU,\na new Chinese Massive Multi-discipline Multimodal Understanding benchmark\ndesigned to evaluate LMMs on tasks demanding college-level subject knowledge\nand deliberate reasoning in a Chinese context. CMMMU is inspired by and\nstrictly follows the annotation and analysis pattern of MMMU.\n CMMMU includes 12k manually collected multimodal questions from college\nexams, quizzes, and textbooks, covering six core disciplines: Art & Design,\nBusiness, Science, Health & Medicine, Humanities & Social Science, and Tech &\nEngineering, like its companion, MMMU. These questions span 30 subjects and\ncomprise 39 highly heterogeneous image types, such as charts, diagrams, maps,\ntables, music sheets, and chemical structures.\n CMMMU focuses on complex perception and reasoning with domain-specific\nknowledge in the Chinese context. We evaluate 11 open-source LLMs and one\nproprietary GPT-4V(ision). Even GPT-4V only achieves accuracies of 42%,\nindicating a large space for improvement. CMMMU will boost the community to\nbuild the next-generation LMMs towards expert artificial intelligence and\npromote the democratization of LMMs by providing diverse language contexts.\n","authors":["Ge Zhang","Xinrun Du","Bei Chen","Yiming Liang","Tongxu Luo","Tianyu Zheng","Kang Zhu","Yuyang Cheng","Chunpu Xu","Shuyue Guo","Haoran Zhang","Xingwei Qu","Junjie Wang","Ruibin Yuan","Yizhi Li","Zekun Wang","Yudong Liu","Yu-Hsuan Tsai","Fengji Zhang","Chenghua Lin","Wenhao Huang","Wenhu Chen","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2401.11944v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11586v1","updated":"2024-03-18T08:58:48Z","published":"2024-03-18T08:58:48Z","title":"DynoSurf: Neural Deformation-based Temporally Consistent Dynamic Surface\n Reconstruction","summary":" This paper explores the problem of reconstructing temporally consistent\nsurfaces from a 3D point cloud sequence without correspondence. To address this\nchallenging task, we propose DynoSurf, an unsupervised learning framework\nintegrating a template surface representation with a learnable deformation\nfield. 
Specifically, we design a coarse-to-fine strategy for learning the\ntemplate surface based on the deformable tetrahedron representation.\nFurthermore, we propose a learnable deformation representation based on the\nlearnable control points and blending weights, which can deform the template\nsurface non-rigidly while maintaining the consistency of the local shape.\nExperimental results demonstrate the significant superiority of DynoSurf over\ncurrent state-of-the-art approaches, showcasing its potential as a powerful\ntool for dynamic mesh reconstruction. The code is publicly available at\nhttps://github.com/yaoyx689/DynoSurf.\n","authors":["Yuxin Yao","Siyu Ren","Junhui Hou","Zhi Deng","Juyong Zhang","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11582v1","updated":"2024-03-18T08:55:48Z","published":"2024-03-18T08:55:48Z","title":"OurDB: Ouroboric Domain Bridging for Multi-Target Domain Adaptive\n Semantic Segmentation","summary":" Multi-target domain adaptation (MTDA) for semantic segmentation poses a\nsignificant challenge, as it involves multiple target domains with varying\ndistributions. The goal of MTDA is to minimize the domain discrepancies among a\nsingle source and multi-target domains, aiming to train a single model that\nexcels across all target domains. Previous MTDA approaches typically employ\nmultiple teacher architectures, where each teacher specializes in one target\ndomain to simplify the task. However, these architectures hinder the student\nmodel from fully assimilating comprehensive knowledge from all target-specific\nteachers and escalate training costs with increasing target domains. In this\npaper, we propose an ouroboric domain bridging (OurDB) framework, offering an\nefficient solution to the MTDA problem using a single teacher architecture.\nThis framework dynamically cycles through multiple target domains, aligning\neach domain individually to restrain the biased alignment problem, and utilizes\nFisher information to minimize the forgetting of knowledge from previous target\ndomains. We also propose a context-guided class-wise mixup (CGMix) that\nleverages contextual information tailored to diverse target contexts in MTDA.\nExperimental evaluations conducted on four urban driving datasets (i.e., GTA5,\nCityscapes, IDD, and Mapillary) demonstrate the superiority of our method over\nexisting state-of-the-art approaches.\n","authors":["Seungbeom Woo","Geonwoo Baek","Taehoon Kim","Jaemin Na","Joong-won Hwang","Wonjun Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.11582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16701v2","updated":"2024-03-18T08:55:36Z","published":"2023-08-15T17:38:55Z","title":"Is it Really Negative? Evaluating Natural Language Video Localization\n Performance on Multiple Reliable Videos Pool","summary":" With the explosion of multimedia content in recent years, Video Corpus Moment\nRetrieval (VCMR), which aims to detect a video moment that matches a given\nnatural language query from multiple videos, has become a critical problem.\nHowever, existing VCMR studies have a significant limitation since they have\nregarded all videos not paired with a specific query as negative, neglecting\nthe possibility of including false negatives when constructing the negative\nvideo set. 
In this paper, we propose an MVMR (Massive Videos Moment Retrieval)\ntask that aims to localize video frames within a massive video set, mitigating\nthe possibility of falsely distinguishing positive and negative videos. For\nthis task, we suggest an automatic dataset construction framework by employing\ntextual and visual semantic matching evaluation methods on the existing video\nmoment search datasets and introduce three MVMR datasets. To solve MVMR task,\nwe further propose a strong method, CroCs, which employs cross-directional\ncontrastive learning that selectively identifies the reliable and informative\nnegatives, enhancing the robustness of a model on MVMR task. Experimental\nresults on the introduced datasets reveal that existing video moment search\nmodels are easily distracted by negative video frames, whereas our model shows\nsignificant performance.\n","authors":["Nakyeong Yang","Minsung Kim","Seunghyun Yoon","Joongbo Shin","Kyomin Jung"],"pdf_url":"https://arxiv.org/pdf/2309.16701v2.pdf","comment":"15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.11577v1","updated":"2024-03-18T08:53:03Z","published":"2024-03-18T08:53:03Z","title":"3DGS-Calib: 3D Gaussian Splatting for Multimodal SpatioTemporal\n Calibration","summary":" Reliable multimodal sensor fusion algorithms re- quire accurate\nspatiotemporal calibration. Recently, targetless calibration techniques based\non implicit neural representations have proven to provide precise and robust\nresults. Nevertheless, such methods are inherently slow to train given the high\ncompu- tational overhead caused by the large number of sampled points required\nfor volume rendering. With the recent introduction of 3D Gaussian Splatting as\na faster alternative to implicit representation methods, we propose to leverage\nthis new ren- dering approach to achieve faster multi-sensor calibration. We\nintroduce 3DGS-Calib, a new calibration method that relies on the speed and\nrendering accuracy of 3D Gaussian Splatting to achieve multimodal\nspatiotemporal calibration that is accurate, robust, and with a substantial\nspeed-up compared to methods relying on implicit neural representations. We\ndemonstrate the superiority of our proposal with experimental results on\nsequences from KITTI-360, a widely used driving dataset.\n","authors":["Quentin Herau","Moussab Bennehar","Arthur Moreau","Nathan Piasco","Luis Roldao","Dzmitry Tsishkou","Cyrille Migniot","Pascal Vasseur","Cédric Demonceaux"],"pdf_url":"https://arxiv.org/pdf/2403.11577v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2403.11576v1","updated":"2024-03-18T08:52:23Z","published":"2024-03-18T08:52:23Z","title":"MISS: Memory-efficient Instance Segmentation Framework By Visual\n Inductive Priors Flow Propagation","summary":" Instance segmentation, a cornerstone task in computer vision, has\nwide-ranging applications in diverse industries. The advent of deep learning\nand artificial intelligence has underscored the criticality of training\neffective models, particularly in data-scarce scenarios - a concern that\nresonates in both academic and industrial circles. A significant impediment in\nthis domain is the resource-intensive nature of procuring high-quality,\nannotated data for instance segmentation, a hurdle that amplifies the challenge\nof developing robust models under resource constraints. 
In this context, the\nstrategic integration of a visual prior into the training dataset emerges as a\npotential solution to enhance congruity with the testing data distribution,\nconsequently reducing the dependency on computational resources and the need\nfor highly complex models. However, effectively embedding a visual prior into\nthe learning process remains a complex endeavor. Addressing this challenge, we\nintroduce the MISS (Memory-efficient Instance Segmentation System) framework.\nMISS leverages visual inductive prior flow propagation, integrating intrinsic\nprior knowledge from the Synergy-basketball dataset at various stages: data\npreprocessing, augmentation, training, and inference. Our empirical evaluations\nunderscore the efficacy of MISS, demonstrating commendable performance in\nscenarios characterized by limited data availability and memory constraints.\n","authors":["Chih-Chung Hsu","Chia-Ming Lee"],"pdf_url":"https://arxiv.org/pdf/2403.11576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00766v2","updated":"2024-03-18T08:51:58Z","published":"2024-01-01T14:14:35Z","title":"Exposure Bracketing is All You Need for Unifying Image Restoration and\n Enhancement Tasks","summary":" It is highly desired but challenging to acquire high-quality photos with\nclear content in low-light environments. Although multi-image processing\nmethods (using burst, dual-exposure, or multi-exposure images) have made\nsignificant progress in addressing this issue, they typically focus on specific\nrestoration or enhancement problems, being insufficient in exploiting\nmulti-image. Motivated by that multi-exposure images are complementary in\ndenoising, deblurring, high dynamic range imaging, and super-resolution, we\npropose to utilize exposure bracketing photography to unify restoration and\nenhancement tasks in this work. Due to the difficulty in collecting real-world\npairs, we suggest a solution that first pre-trains the model with synthetic\npaired data and then adapts it to real-world unlabeled images. In particular, a\ntemporally modulated recurrent network (TMRNet) and self-supervised adaptation\nmethod are proposed. Moreover, we construct a data simulation pipeline to\nsynthesize pairs and collect real-world images from 200 nighttime scenarios.\nExperiments on both datasets show that our method performs favorably against\nthe state-of-the-art multi-image processing ones. The dataset, code, and\npre-trained models are available at https://github.com/cszhilu1998/BracketIRE.\n","authors":["Zhilu Zhang","Shuohao Zhang","Renlong Wu","Zifei Yan","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2401.00766v2.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2403.11573v1","updated":"2024-03-18T08:50:04Z","published":"2024-03-18T08:50:04Z","title":"Just Add $100 More: Augmenting NeRF-based Pseudo-LiDAR Point Cloud for\n Resolving Class-imbalance Problem","summary":" Typical LiDAR-based 3D object detection models are trained in a supervised\nmanner with real-world data collection, which is often imbalanced over classes\n(or long-tailed). To deal with it, augmenting minority-class examples by\nsampling ground truth (GT) LiDAR points from a database and pasting them into a\nscene of interest is often used, but challenges still remain: inflexibility in\nlocating GT samples and limited sample diversity. 
In this work, we propose to\nleverage pseudo-LiDAR point clouds generated (at a low cost) from videos\ncapturing a surround view of miniatures or real-world objects of minor classes.\nOur method, called Pseudo Ground Truth Augmentation (PGT-Aug), consists of\nthree main steps: (i) volumetric 3D instance reconstruction using a 2D-to-3D\nview synthesis model, (ii) object-level domain alignment with LiDAR intensity\nestimation and (iii) a hybrid context-aware placement method from ground and\nmap information. We demonstrate the superiority and generality of our method\nthrough performance improvements in extensive experiments conducted on three\npopular benchmarks, i.e., nuScenes, KITTI, and Lyft, especially for the\ndatasets with large domain gaps captured by different LiDAR configurations. Our\ncode and data will be publicly available upon publication.\n","authors":["Mincheol Chang","Siyeong Lee","Jinkyu Kim","Namil Kim"],"pdf_url":"https://arxiv.org/pdf/2403.11573v1.pdf","comment":"28 pages, 12 figures, 11 tables; Submitted to ECCV 2024"},{"id":"http://arxiv.org/abs/2311.10529v3","updated":"2024-03-18T08:47:03Z","published":"2023-11-17T13:49:00Z","title":"Enhancing the Reliability of Segment Anything Model for Auto-Prompting\n Medical Image Segmentation with Uncertainty Rectification","summary":" The Segment Anything Model (SAM) has recently emerged as a groundbreaking\nfoundation model for prompt-driven image segmentation tasks. However, both the\noriginal SAM and its medical variants require slice-by-slice manual prompting\nof target structures, which directly increase the burden for applications.\nDespite attempts of auto-prompting to turn SAM into a fully automatic manner,\nit still exhibits subpar performance and lacks of reliability especially in the\nfield of medical imaging. In this paper, we propose UR-SAM, an uncertainty\nrectified SAM framework to enhance the reliability for auto-prompting medical\nimage segmentation. Building upon a localization framework for automatic prompt\ngeneration, our method incorporates a prompt augmentation module to obtain a\nseries of input prompts for SAM for uncertainty estimation and an\nuncertainty-based rectification module to further utilize the distribution of\nestimated uncertainty to improve the segmentation performance. Extensive\nexperiments on two public 3D medical datasets covering the segmentation of 35\norgans demonstrate that without supplementary training or fine-tuning, our\nmethod further improves the segmentation performance with up to 10.7 % and 13.8\n% in dice similarity coefficient, demonstrating efficiency and broad\ncapabilities for medical image segmentation without manual prompting.\n","authors":["Yichi Zhang","Shiyao Hu","Sijie Ren","Chen Jiang","Yuan Cheng","Yuan Qi"],"pdf_url":"https://arxiv.org/pdf/2311.10529v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14405v2","updated":"2024-03-18T08:45:52Z","published":"2024-01-25T18:59:58Z","title":"Multimodal Pathway: Improve Transformers with Irrelevant Data from Other\n Modalities","summary":" We propose to improve transformers of a specific modality with irrelevant\ndata from other modalities, e.g., improve an ImageNet model with audio or point\ncloud datasets. We would like to highlight that the data samples of the target\nmodality are irrelevant to the other modalities, which distinguishes our method\nfrom other works utilizing paired (e.g., CLIP) or interleaved data of different\nmodalities. 
We propose a methodology named Multimodal Pathway - given a target\nmodality and a transformer designed for it, we use an auxiliary transformer\ntrained with data of another modality and construct pathways to connect\ncomponents of the two models so that data of the target modality can be\nprocessed by both models. In this way, we utilize the universal\nsequence-to-sequence modeling abilities of transformers obtained from two\nmodalities. As a concrete implementation, we use a modality-specific tokenizer\nand task-specific head as usual but utilize the transformer blocks of the\nauxiliary model via a proposed method named Cross-Modal Re-parameterization,\nwhich exploits the auxiliary weights without any inference costs. On the image,\npoint cloud, video, and audio recognition tasks, we observe significant and\nconsistent performance improvements with irrelevant data from other modalities.\nThe code and models are available at https://github.com/AILab-CVC/M2PT.\n","authors":["Yiyuan Zhang","Xiaohan Ding","Kaixiong Gong","Yixiao Ge","Ying Shan","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2401.14405v2.pdf","comment":"CVPR 2024. Code and models are available at\n https://github.com/AILab-CVC/M2PT"},{"id":"http://arxiv.org/abs/2403.11572v1","updated":"2024-03-18T08:44:40Z","published":"2024-03-18T08:44:40Z","title":"Augment Before Copy-Paste: Data and Memory Efficiency-Oriented Instance\n Segmentation Framework for Sport-scenes","summary":" Instance segmentation is a fundamental task in computer vision with broad\napplications across various industries. In recent years, with the proliferation\nof deep learning and artificial intelligence applications, how to train\neffective models with limited data has become a pressing issue for both\nacademia and industry. In the Visual Inductive Priors challenge (VIPriors2023),\nparticipants must train a model capable of precisely locating individuals on a\nbasketball court, all while working with limited data and without the use of\ntransfer learning or pre-trained models. We propose Memory effIciency inStance\nSegmentation framework based on visual inductive prior flow propagation that\neffectively incorporates inherent prior information from the dataset into both\nthe data preprocessing and data augmentation stages, as well as the inference\nphase. Our team (ACVLAB) experiments demonstrate that our model achieves\npromising performance (0.509 AP@0.50:0.95) even under limited data and memory\nconstraints.\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Ming-Shyen Wu"],"pdf_url":"https://arxiv.org/pdf/2403.11572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11570v1","updated":"2024-03-18T08:43:42Z","published":"2024-03-18T08:43:42Z","title":"LogicalDefender: Discovering, Extracting, and Utilizing Common-Sense\n Knowledge","summary":" Large text-to-image models have achieved astonishing performance in\nsynthesizing diverse and high-quality images guided by texts. With\ndetail-oriented conditioning control, even finer-grained spatial control can be\nachieved. However, some generated images still appear unreasonable, even with\nplentiful object features and a harmonious style. In this paper, we delve into\nthe underlying causes and find that deep-level logical information, serving as\ncommon-sense knowledge, plays a significant role in understanding and\nprocessing images. 
Nonetheless, almost all models have neglected the importance\nof logical relations in images, resulting in poor performance in this aspect.\nFollowing this observation, we propose LogicalDefender, which combines images\nwith the logical knowledge already summarized by humans in text. This\nencourages models to learn logical knowledge faster and better, and\nconcurrently, extracts the widely applicable logical knowledge from both images\nand human knowledge. Experiments show that our model has achieved better\nlogical performance, and the extracted logical knowledge can be effectively\napplied to other scenarios.\n","authors":["Yuhe Liu","Mengxue Kang","Zengchang Qin","Xiangxiang Chu"],"pdf_url":"https://arxiv.org/pdf/2403.11570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11568v1","updated":"2024-03-18T08:42:08Z","published":"2024-03-18T08:42:08Z","title":"EffiVED:Efficient Video Editing via Text-instruction Diffusion Models","summary":" Large-scale text-to-video models have shown remarkable abilities, but their\ndirect application in video editing remains challenging due to limited\navailable datasets. Current video editing methods commonly require per-video\nfine-tuning of diffusion models or specific inversion optimization to ensure\nhigh-fidelity edits. In this paper, we introduce EffiVED, an efficient\ndiffusion-based model that directly supports instruction-guided video editing.\nTo achieve this, we present two efficient workflows to gather video editing\npairs, utilizing augmentation and fundamental vision-language techniques. These\nworkflows transform vast image editing datasets and open-world videos into a\nhigh-quality dataset for training EffiVED. Experimental results reveal that\nEffiVED not only generates high-quality editing videos but also executes\nrapidly. Finally, we demonstrate that our data collection method significantly\nimproves editing performance and can potentially tackle the scarcity of video\nediting data. The datasets will be made publicly available upon publication.\n","authors":["Zhenghao Zhang","Zuozhuo Dai","Long Qin","Weizhi Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04164v2","updated":"2024-03-18T08:40:48Z","published":"2024-03-07T02:48:42Z","title":"ProMISe: Promptable Medical Image Segmentation using SAM","summary":" With the proposal of the Segment Anything Model (SAM), fine-tuning SAM for\nmedical image segmentation (MIS) has become popular. However, due to the large\nsize of the SAM model and the significant domain gap between natural and\nmedical images, fine-tuning-based strategies are costly with potential risk of\ninstability, feature damage and catastrophic forgetting. Furthermore, some\nmethods of transferring SAM to a domain-specific MIS through fine-tuning\nstrategies disable the model's prompting capability, severely limiting its\nutilization scenarios. In this paper, we propose an Auto-Prompting Module\n(APM), which provides SAM-based foundation model with Euclidean adaptive\nprompts in the target domain. Our experiments demonstrate that such adaptive\nprompts significantly improve SAM's non-fine-tuned performance in MIS. In\naddition, we propose a novel non-invasive method called Incremental Pattern\nShifting (IPS) to adapt SAM to specific medical domains. Experimental results\nshow that the IPS enables SAM to achieve state-of-the-art or competitive\nperformance in MIS without the need for fine-tuning. 
By coupling these two\nmethods, we propose ProMISe, an end-to-end non-fine-tuned framework for\nPromptable Medical Image Segmentation. Our experiments demonstrate that both\nusing our methods individually or in combination achieves satisfactory\nperformance in low-cost pattern shifting, with all of SAM's parameters frozen.\n","authors":["Jinfeng Wang","Sifan Song","Xinkun Wang","Yiyi Wang","Yiyi Miao","Jionglong Su","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.04164v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15599v2","updated":"2024-03-18T08:37:24Z","published":"2023-11-27T07:48:50Z","title":"UniRepLKNet: A Universal Perception Large-Kernel ConvNet for Audio,\n Video, Point Cloud, Time-Series and Image Recognition","summary":" Large-kernel convolutional neural networks (ConvNets) have recently received\nextensive research attention, but two unresolved and critical issues demand\nfurther investigation. 1) The architectures of existing large-kernel ConvNets\nlargely follow the design principles of conventional ConvNets or transformers,\nwhile the architectural design for large-kernel ConvNets remains\nunder-addressed. 2) As transformers have dominated multiple modalities, it\nremains to be investigated whether ConvNets also have a strong universal\nperception ability in domains beyond vision. In this paper, we contribute from\ntwo aspects. 1) We propose four architectural guidelines for designing\nlarge-kernel ConvNets, the core of which is to exploit the essential\ncharacteristics of large kernels that distinguish them from small kernels -\nthey can see wide without going deep. Following such guidelines, our proposed\nlarge-kernel ConvNet shows leading performance in image recognition (ImageNet\naccuracy of 88.0%, ADE20K mIoU of 55.6%, and COCO box AP of 56.4%),\ndemonstrating better performance and higher speed than the recent powerful\ncompetitors. 2) We discover large kernels are the key to unlocking the\nexceptional performance of ConvNets in domains where they were originally not\nproficient. With certain modality-related preprocessing approaches, the\nproposed model achieves state-of-the-art performance on time-series forecasting\nand audio recognition tasks even without modality-specific customization to the\narchitecture. All the code and models are publicly available on GitHub and\nHuggingface.\n","authors":["Xiaohan Ding","Yiyuan Zhang","Yixiao Ge","Sijie Zhao","Lin Song","Xiangyu Yue","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2311.15599v2.pdf","comment":"CVPR 2024. Code, all the models, reproducible training scripts at\n https://github.com/AILab-CVC/UniRepLKNet"},{"id":"http://arxiv.org/abs/2310.12149v2","updated":"2024-03-18T08:36:02Z","published":"2023-10-18T17:59:02Z","title":"Object-aware Inversion and Reassembly for Image Editing","summary":" By comparing the original and target prompts, we can obtain numerous editing\npairs, each comprising an object and its corresponding editing target. To allow\neditability while maintaining fidelity to the input image, existing editing\nmethods typically involve a fixed number of inversion steps that project the\nwhole input image to its noisier latent representation, followed by a denoising\nprocess guided by the target prompt. 
However, we find that the optimal number\nof inversion steps for achieving ideal editing results varies significantly\namong different editing pairs, owing to varying editing difficulties.\nTherefore, the current literature, which relies on a fixed number of inversion\nsteps, produces sub-optimal generation quality, especially when handling\nmultiple editing pairs in a natural image. To this end, we propose a new image\nediting paradigm, dubbed Object-aware Inversion and Reassembly (OIR), to enable\nobject-level fine-grained editing. Specifically, we design a new search metric,\nwhich determines the optimal inversion steps for each editing pair, by jointly\nconsidering the editability of the target and the fidelity of the non-editing\nregion. We use our search metric to find the optimal inversion step for each\nediting pair when editing an image. We then edit these editing pairs separately\nto avoid concept mismatch. Subsequently, we propose an additional reassembly\nstep to seamlessly integrate the respective editing results and the non-editing\nregion to obtain the final edited image. To systematically evaluate the\neffectiveness of our method, we collect two datasets called OIRBench for\nbenchmarking single- and multi-object editing, respectively. Experiments\ndemonstrate that our method achieves superior performance in editing object\nshapes, colors, materials, categories, etc., especially in multi-object editing\nscenarios.\n","authors":["Zhen Yang","Ganggui Ding","Wen Wang","Hao Chen","Bohan Zhuang","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2310.12149v2.pdf","comment":"Project Page: https://aim-uofa.github.io/OIR-Diffusion/"},{"id":"http://arxiv.org/abs/2403.11561v1","updated":"2024-03-18T08:29:47Z","published":"2024-03-18T08:29:47Z","title":"Learning Unified Reference Representation for Unsupervised Multi-class\n Anomaly Detection","summary":" In the field of multi-class anomaly detection, reconstruction-based methods\nderived from single-class anomaly detection face the well-known challenge of\n``learning shortcuts'', wherein the model fails to learn the patterns of normal\nsamples as it should, opting instead for shortcuts such as identity mapping or\nartificial noise elimination. Consequently, the model becomes unable to\nreconstruct genuine anomalies as normal instances, resulting in a failure of\nanomaly detection. To counter this issue, we present a novel unified feature\nreconstruction-based anomaly detection framework termed RLR (Reconstruct\nfeatures from a Learnable Reference representation). Unlike previous methods,\nRLR utilizes learnable reference representations to compel the model to learn\nnormal feature patterns explicitly, thereby prevents the model from succumbing\nto the ``learning shortcuts'' issue. Additionally, RLR incorporates locality\nconstraints into the learnable reference to facilitate more effective normal\npattern capture and utilizes a masked learnable key attention mechanism to\nenhance robustness. Evaluation of RLR on the 15-category MVTec-AD dataset and\nthe 12-category VisA dataset shows superior performance compared to\nstate-of-the-art methods under the unified setting. 
The code of RLR will be\npublicly available.\n","authors":["Liren He","Zhengkai Jiang","Jinlong Peng","Liang Liu","Qiangang Du","Xiaobin Hu","Wenbing Zhu","Mingmin Chi","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14272v2","updated":"2024-03-18T08:15:48Z","published":"2023-11-24T04:16:32Z","title":"CRISP: Hybrid Structured Sparsity for Class-aware Model Pruning","summary":" Machine learning pipelines for classification tasks often train a universal\nmodel to achieve accuracy across a broad range of classes. However, a typical\nuser encounters only a limited selection of classes regularly. This disparity\nprovides an opportunity to enhance computational efficiency by tailoring models\nto focus on user-specific classes. Existing works rely on unstructured pruning,\nwhich introduces randomly distributed non-zero values in the model, making it\nunsuitable for hardware acceleration. Alternatively, some approaches employ\nstructured pruning, such as channel pruning, but these tend to provide only\nminimal compression and may lead to reduced model accuracy. In this work, we\npropose CRISP, a novel pruning framework leveraging a hybrid structured\nsparsity pattern that combines both fine-grained N:M structured sparsity and\ncoarse-grained block sparsity. Our pruning strategy is guided by a\ngradient-based class-aware saliency score, allowing us to retain weights\ncrucial for user-specific classes. CRISP achieves high accuracy with minimal\nmemory consumption for popular models like ResNet-50, VGG-16, and MobileNetV2\non ImageNet and CIFAR-100 datasets. Moreover, CRISP delivers up to 14$\\times$\nreduction in latency and energy consumption compared to existing pruning\nmethods while maintaining comparable accuracy. Our code is available at\nhttps://github.com/shivmgg/CRISP/.\n","authors":["Shivam Aggarwal","Kuluhan Binici","Tulika Mitra"],"pdf_url":"https://arxiv.org/pdf/2311.14272v2.pdf","comment":"6 pages, accepted in Design, Automation & Test in Europe Conference &\n Exhibition (DATE) 2024"},{"id":"http://arxiv.org/abs/2403.11556v1","updated":"2024-03-18T08:13:26Z","published":"2024-03-18T08:13:26Z","title":"Hierarchical Frequency-based Upsampling and Refining for Compressed\n Video Quality Enhancement","summary":" Video compression artifacts arise due to the quantization operation in the\nfrequency domain. The goal of video quality enhancement is to reduce\ncompression artifacts and reconstruct a visually-pleasant result. In this work,\nwe propose a hierarchical frequency-based upsampling and refining neural\nnetwork (HFUR) for compressed video quality enhancement. HFUR consists of two\nmodules: implicit frequency upsampling module (ImpFreqUp) and hierarchical and\niterative refinement module (HIR). ImpFreqUp exploits DCT-domain prior derived\nthrough implicit DCT transform, and accurately reconstructs the DCT-domain loss\nvia a coarse-to-fine transfer. Consequently, HIR is introduced to facilitate\ncross-collaboration and information compensation between the scales, thus\nfurther refine the feature maps and promote the visual quality of the final\noutput. We demonstrate the effectiveness of the proposed modules via ablation\nexperiments and visualized results. 
Extensive experiments on public benchmarks\nshow that HFUR achieves state-of-the-art performance for both constant bit rate\nand constant QP modes.\n","authors":["Qianyu Zhang","Bolun Zheng","Xinying Chen","Quan Chen","Zhunjie Zhu","Canjin Wang","Zongpeng Li","Chengang Yan"],"pdf_url":"https://arxiv.org/pdf/2403.11556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11550v1","updated":"2024-03-18T08:01:23Z","published":"2024-03-18T08:01:23Z","title":"TARN-VIST: Topic Aware Reinforcement Network for Visual Storytelling","summary":" As a cross-modal task, visual storytelling aims to generate a story for an\nordered image sequence automatically. Different from the image captioning task,\nvisual storytelling requires not only modeling the relationships between\nobjects in the image but also mining the connections between adjacent images.\nRecent approaches primarily utilize either end-to-end frameworks or multi-stage\nframeworks to generate relevant stories, but they usually overlook latent topic\ninformation. In this paper, in order to generate a more coherent and relevant\nstory, we propose a novel method, Topic Aware Reinforcement Network for VIsual\nStoryTelling (TARN-VIST). In particular, we pre-extracted the topic information\nof stories from both visual and linguistic perspectives. Then we apply two\ntopic-consistent reinforcement learning rewards to identify the discrepancy\nbetween the generated story and the human-labeled story so as to refine the\nwhole generation process. Extensive experimental results on the VIST dataset\nand human evaluation demonstrate that our proposed model outperforms most of\nthe competitive models across multiple evaluation metrics.\n","authors":["Weiran Chen","Xin Li","Jiaqi Su","Guiqian Zhu","Ying Li","Yi Ji","Chunping Liu"],"pdf_url":"https://arxiv.org/pdf/2403.11550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11549v1","updated":"2024-03-18T08:00:23Z","published":"2024-03-18T08:00:23Z","title":"Boosting Continual Learning of Vision-Language Models via\n Mixture-of-Experts Adapters","summary":" Continual learning can empower vision-language models to continuously acquire\nnew knowledge, without the need for access to the entire historical dataset.\nHowever, mitigating the performance degradation in large-scale models is\nnon-trivial due to (i) parameter shifts throughout lifelong learning and (ii)\nsignificant computational burdens associated with full-model tuning. In this\nwork, we present a parameter-efficient continual learning framework to\nalleviate long-term forgetting in incremental learning with vision-language\nmodels. Our approach involves the dynamic expansion of a pre-trained CLIP\nmodel, through the integration of Mixture-of-Experts (MoE) adapters in response\nto new tasks. To preserve the zero-shot recognition capability of\nvision-language models, we further introduce a Distribution Discriminative\nAuto-Selector (DDAS) that automatically routes in-distribution and\nout-of-distribution inputs to the MoE Adapter and the original CLIP,\nrespectively. Through extensive experiments across various settings, our\nproposed method consistently outperforms previous state-of-the-art approaches\nwhile concurrently reducing parameter training burdens by 60%. Our code locates\nat https://github.com/JiazuoYu/MoE-Adapters4CL\n","authors":["Jiazuo Yu","Yunzhi Zhuge","Lu Zhang","Dong Wang","Huchuan Lu","You He"],"pdf_url":"https://arxiv.org/pdf/2403.11549v1.pdf","comment":"This work is accepted by CVPR2024. 
More modifications may be\n performed"},{"id":"http://arxiv.org/abs/2301.00371v2","updated":"2024-03-18T07:57:19Z","published":"2023-01-01T08:38:07Z","title":"Robust Domain Adaptive Object Detection with Unified Multi-Granularity\n Alignment","summary":" Domain adaptive detection aims to improve the generalization of detectors on\ntarget domain. To reduce discrepancy in feature distributions between two\ndomains, recent approaches achieve domain adaption through feature alignment in\ndifferent granularities via adversarial learning. However, they neglect the\nrelationship between multiple granularities and different features in\nalignment, degrading detection. Addressing this, we introduce a unified\nmulti-granularity alignment (MGA)-based detection framework for\ndomain-invariant feature learning. The key is to encode the dependencies across\ndifferent granularities including pixel-, instance-, and category-levels\nsimultaneously to align two domains. Specifically, based on pixel-level\nfeatures, we first develop an omni-scale gated fusion (OSGF) module to\naggregate discriminative representations of instances with scale-aware\nconvolutions, leading to robust multi-scale detection. Besides, we introduce\nmulti-granularity discriminators to identify where, either source or target\ndomains, different granularities of samples come from. Note that, MGA not only\nleverages instance discriminability in different categories but also exploits\ncategory consistency between two domains for detection. Furthermore, we present\nan adaptive exponential moving average (AEMA) strategy that explores model\nassessments for model update to improve pseudo labels and alleviate local\nmisalignment problem, boosting detection robustness. Extensive experiments on\nmultiple domain adaption scenarios validate the superiority of MGA over other\napproaches on FCOS and Faster R-CNN detectors. Code will be released at\nhttps://github.com/tiankongzhang/MGA.\n","authors":["Libo Zhang","Wenzhang Zhou","Heng Fan","Tiejian Luo","Haibin Ling"],"pdf_url":"https://arxiv.org/pdf/2301.00371v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11541v1","updated":"2024-03-18T07:51:22Z","published":"2024-03-18T07:51:22Z","title":"Hierarchical Spatial Proximity Reasoning for Vision-and-Language\n Navigation","summary":" Most Vision-and-Language Navigation (VLN) algorithms tend to make decision\nerrors, primarily due to a lack of visual common sense and insufficient\nreasoning capabilities. To address this issue, this paper proposes a\nHierarchical Spatial Proximity Reasoning (HSPR) model. Firstly, we design a\nScene Understanding Auxiliary Task (SUAT) to assist the agent in constructing a\nknowledge base of hierarchical spatial proximity for reasoning navigation.\nSpecifically, this task utilizes panoramic views and object features to\nidentify regions in the navigation environment and uncover the adjacency\nrelationships between regions, objects, and region-object pairs. Secondly, we\ndynamically construct a semantic topological map through agent-environment\ninteractions and propose a Multi-step Reasoning Navigation Algorithm (MRNA)\nbased on the map. This algorithm continuously plans various feasible paths from\none region to another, utilizing the constructed proximity knowledge base,\nenabling more efficient exploration. Additionally, we introduce a Proximity\nAdaptive Attention Module (PAAM) and Residual Fusion Method (RFM) to enable the\nmodel to obtain more accurate navigation decision confidence. 
Finally, we\nconduct experiments on publicly available datasets including REVERIE, SOON,\nR2R, and R4R to validate the effectiveness of the proposed approach.\n","authors":["Ming Xu","Zilong Xie"],"pdf_url":"https://arxiv.org/pdf/2403.11541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11537v1","updated":"2024-03-18T07:43:14Z","published":"2024-03-18T07:43:14Z","title":"Semantic Prompting with Image-Token for Continual Learning","summary":" Continual learning aims to refine model parameters for new tasks while\nretaining knowledge from previous tasks. Recently, prompt-based learning has\nemerged to leverage pre-trained models to be prompted to learn subsequent tasks\nwithout the reliance on the rehearsal buffer. Although this approach has\ndemonstrated outstanding results, existing methods depend on preceding\ntask-selection process to choose appropriate prompts. However, imperfectness in\ntask-selection may lead to negative impacts on the performance particularly in\nthe scenarios where the number of tasks is large or task distributions are\nimbalanced. To address this issue, we introduce I-Prompt, a task-agnostic\napproach focuses on the visual semantic information of image tokens to\neliminate task prediction. Our method consists of semantic prompt matching,\nwhich determines prompts based on similarities between tokens, and image\ntoken-level prompting, which applies prompts directly to image tokens in the\nintermediate layers. Consequently, our method achieves competitive performance\non four benchmarks while significantly reducing training time compared to\nstate-of-the-art methods. Moreover, we demonstrate the superiority of our\nmethod across various scenarios through extensive experiments.\n","authors":["Jisu Han","Jaemin Na","Wonjun Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.11537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11536v1","updated":"2024-03-18T07:41:39Z","published":"2024-03-18T07:41:39Z","title":"OCR is All you need: Importing Multi-Modality into Image-based Defect\n Detection System","summary":" Automatic optical inspection (AOI) plays a pivotal role in the manufacturing\nprocess, predominantly leveraging high-resolution imaging instruments for\nscanning purposes. It detects anomalies by analyzing image textures or\npatterns, making it an essential tool in industrial manufacturing and quality\ncontrol. Despite its importance, the deployment of models for AOI often faces\nchallenges. These include limited sample sizes, which hinder effective feature\nlearning, variations among source domains, and sensitivities to changes in\nlighting and camera positions during imaging. These factors collectively\ncompromise the accuracy of model predictions. Traditional AOI often fails to\ncapitalize on the rich mechanism-parameter information from machines or inside\nimages, including statistical parameters, which typically benefit AOI\nclassification. To address this, we introduce an external modality-guided data\nmining framework, primarily rooted in optical character recognition (OCR), to\nextract statistical features from images as a second modality to enhance\nperformance, termed OANet (Ocr-Aoi-Net). A key aspect of our approach is the\nalignment of external modality features, extracted using a single\nmodality-aware model, with image features encoded by a convolutional neural\nnetwork. This synergy enables a more refined fusion of semantic representations\nfrom different modalities. 
We further introduce feature refinement and a gating\nfunction in our OANet to optimize the combination of these features, enhancing\ninference and decision-making capabilities. Experimental outcomes show that our\nmethodology considerably boosts the recall rate of the defect detection model\nand maintains high robustness even in challenging scenarios.\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Chun-Hung Sun","Kuang-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2403.11536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11535v1","updated":"2024-03-18T07:41:19Z","published":"2024-03-18T07:41:19Z","title":"EchoReel: Enhancing Action Generation of Existing Video Diffusion Models","summary":" Recent large-scale video datasets have facilitated the generation of diverse\nopen-domain videos of Video Diffusion Models (VDMs). Nonetheless, the efficacy\nof VDMs in assimilating complex knowledge from these datasets remains\nconstrained by their inherent scale, leading to suboptimal comprehension and\nsynthesis of numerous actions. In this paper, we introduce EchoReel, a novel\napproach to augment the capability of VDMs in generating intricate actions by\nemulating motions from pre-existing videos, which are readily accessible from\ndatabases or online repositories. EchoReel seamlessly integrates with existing\nVDMs, enhancing their ability to produce realistic motions without compromising\ntheir fundamental capabilities. Specifically, the Action Prism (AP), is\nintroduced to distill motion information from reference videos, which requires\ntraining on only a small dataset. Leveraging the knowledge from pre-trained\nVDMs, EchoReel incorporates new action features into VDMs through the\nadditional layers, eliminating the need for any further fine-tuning of\nuntrained actions. Extensive experiments demonstrate that EchoReel is not\nmerely replicating the whole content from references, and it significantly\nimproves the generation of realistic actions, even in situations where existing\nVDMs might directly fail.\n","authors":["Jianzhi liu","Junchen Zhu","Lianli Gao","Jingkuan Song"],"pdf_url":"https://arxiv.org/pdf/2403.11535v1.pdf","comment":"22 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.11532v1","updated":"2024-03-18T07:35:25Z","published":"2024-03-18T07:35:25Z","title":"Out-of-Distribution Detection Should Use Conformal Prediction (and\n Vice-versa?)","summary":" Research on Out-Of-Distribution (OOD) detection focuses mainly on building\nscores that efficiently distinguish OOD data from In Distribution (ID) data. On\nthe other hand, Conformal Prediction (CP) uses non-conformity scores to\nconstruct prediction sets with probabilistic coverage guarantees. In this work,\nwe propose to use CP to better assess the efficiency of OOD scores.\nSpecifically, we emphasize that in standard OOD benchmark settings, evaluation\nmetrics can be overly optimistic due to the finite sample size of the test\ndataset. Based on the work of (Bates et al., 2022), we define new conformal\nAUROC and conformal FRP@TPR95 metrics, which are corrections that provide\nprobabilistic conservativeness guarantees on the variability of these metrics.\nWe show the effect of these corrections on two reference OOD and anomaly\ndetection benchmarks, OpenOOD (Yang et al., 2022) and ADBench (Han et al.,\n2022). We also show that the benefits of using OOD together with CP apply the\nother way around by using OOD scores as non-conformity scores, which results in\nimproving upon current CP methods. 
One of the key messages of these\ncontributions is that since OOD is concerned with designing scores and CP with\ninterpreting these scores, the two fields may be inherently intertwined.\n","authors":["Paul Novello","Joseba Dalmau","Léo Andeol"],"pdf_url":"https://arxiv.org/pdf/2403.11532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11530v1","updated":"2024-03-18T07:33:56Z","published":"2024-03-18T07:33:56Z","title":"Continual Forgetting for Pre-trained Vision Models","summary":" For privacy and security concerns, the need to erase unwanted information\nfrom pre-trained vision models is becoming evident nowadays. In real-world\nscenarios, erasure requests originate at any time from both users and model\nowners. These requests usually form a sequence. Therefore, under such a\nsetting, selective information is expected to be continuously removed from a\npre-trained model while maintaining the rest. We define this problem as\ncontinual forgetting and identify two key challenges. (i) For unwanted\nknowledge, efficient and effective deleting is crucial. (ii) For remaining\nknowledge, the impact brought by the forgetting procedure should be minimal. To\naddress them, we propose Group Sparse LoRA (GS-LoRA). Specifically, towards\n(i), we use LoRA modules to fine-tune the FFN layers in Transformer blocks for\neach forgetting task independently, and towards (ii), a simple group sparse\nregularization is adopted, enabling automatic selection of specific LoRA groups\nand zeroing out the others. GS-LoRA is effective, parameter-efficient,\ndata-efficient, and easy to implement. We conduct extensive experiments on face\nrecognition, object detection and image classification and demonstrate that\nGS-LoRA manages to forget specific classes with minimal impact on other\nclasses. Codes will be released on \\url{https://github.com/bjzhb666/GS-LoRA}.\n","authors":["Hongbo Zhao","Bolin Ni","Haochen Wang","Junsong Fan","Fei Zhu","Yuxi Wang","Yuntao Chen","Gaofeng Meng","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11530v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11529v1","updated":"2024-03-18T07:31:39Z","published":"2024-03-18T07:31:39Z","title":"Video Object Segmentation with Dynamic Query Modulation","summary":" Storing intermediate frame segmentations as memory for long-range context\nmodeling, spatial-temporal memory-based methods have recently showcased\nimpressive results in semi-supervised video object segmentation (SVOS).\nHowever, these methods face two key limitations: 1) relying on non-local\npixel-level matching to read memory, resulting in noisy retrieved features for\nsegmentation; 2) segmenting each object independently without interaction.\nThese shortcomings make the memory-based methods struggle in similar object and\nmulti-object segmentation. To address these issues, we propose a query\nmodulation method, termed QMVOS. This method summarizes object features into\ndynamic queries and then treats them as dynamic filters for mask prediction,\nthereby providing high-level descriptions and object-level perception for the\nmodel. Efficient and effective multi-object interactions are realized through\ninter-query attention. Extensive experiments demonstrate that our method can\nbring significant improvements to the memory-based SVOS method and achieve\ncompetitive performance on standard SVOS benchmarks. 
The code is available at\nhttps://github.com/zht8506/QMVOS.\n","authors":["Hantao Zhou","Runze Hu","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2403.11529v1.pdf","comment":"Accepted by ICME2024"},{"id":"http://arxiv.org/abs/2311.00318v2","updated":"2024-03-18T07:21:27Z","published":"2023-11-01T06:02:59Z","title":"Flooding Regularization for Stable Training of Generative Adversarial\n Networks","summary":" Generative Adversarial Networks (GANs) have shown remarkable performance in\nimage generation. However, GAN training suffers from the problem of\ninstability. One of the main approaches to address this problem is to modify\nthe loss function, often using regularization terms in addition to changing the\ntype of adversarial losses. This paper focuses on directly regularizing the\nadversarial loss function. We propose a method that applies flooding, an\noverfitting suppression method in supervised learning, to GANs to directly\nprevent the discriminator's loss from becoming excessively low. Flooding\nrequires tuning the flood level, but when applied to GANs, we propose that the\nappropriate range of flood level settings is determined by the adversarial loss\nfunction, supported by theoretical analysis of GANs using the binary cross\nentropy loss. We experimentally verify that flooding stabilizes GAN training\nand can be combined with other stabilization techniques. We also show that by\nrestricting the discriminator's loss to be no less than the flood level, the\ntraining proceeds stably even when the flood level is somewhat high.\n","authors":["Iu Yahiro","Takashi Ishida","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2311.00318v2.pdf","comment":"25 pages, 9 figures, 18 tables"},{"id":"http://arxiv.org/abs/2403.04306v2","updated":"2024-03-18T07:21:01Z","published":"2024-03-07T08:25:27Z","title":"Effectiveness Assessment of Recent Large Vision-Language Models","summary":" The advent of large vision-language models (LVLMs) represents a noteworthy\nadvancement towards the pursuit of artificial general intelligence. However,\nthe extent of their efficacy across both specialized and general tasks warrants\nfurther investigation. This article endeavors to evaluate the competency of\npopular LVLMs in specialized and general tasks, respectively, aiming to offer a\ncomprehensive comprehension of these innovative methodologies. To gauge their\nefficacy in specialized tasks, we tailor a comprehensive testbed comprising\nthree distinct scenarios: natural, healthcare, and industrial, encompassing six\nchallenging tasks. These tasks include salient, camouflaged, and transparent\nobject detection, as well as polyp and skin lesion detection, alongside\nindustrial anomaly detection. We examine the performance of three recent\nopen-source LVLMs -- MiniGPT-v2, LLaVA-1.5, and Shikra -- in the realm of\nvisual recognition and localization. Moreover, we conduct empirical\ninvestigations utilizing the aforementioned models alongside GPT-4V, assessing\ntheir multi-modal understanding capacities in general tasks such as object\ncounting, absurd question answering, affordance reasoning, attribute\nrecognition, and spatial relation reasoning. Our investigations reveal that\nthese models demonstrate limited proficiency not only in specialized tasks but\nalso in general tasks. We delve deeper into this inadequacy and suggest several\npotential factors, including limited cognition in specialized tasks, object\nhallucination, text-to-image interference, and decreased robustness in complex\nproblems. 
We hope this study would provide valuable insights for the future\ndevelopment of LVLMs, augmenting their power in coping with both general and\nspecialized applications.\n","authors":["Yao Jiang","Xinyu Yan","Ge-Peng Ji","Keren Fu","Meijun Sun","Huan Xiong","Deng-Ping Fan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.04306v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16838v2","updated":"2024-03-18T07:20:41Z","published":"2023-10-25T17:59:41Z","title":"SparseDFF: Sparse-View Feature Distillation for One-Shot Dexterous\n Manipulation","summary":" Humans demonstrate remarkable skill in transferring manipulation abilities\nacross objects of varying shapes, poses, and appearances, a capability rooted\nin their understanding of semantic correspondences between different instances.\nTo equip robots with a similar high-level comprehension, we present SparseDFF,\na novel DFF for 3D scenes utilizing large 2D vision models to extract semantic\nfeatures from sparse RGBD images, a domain where research is limited despite\nits relevance to many tasks with fixed-camera setups. SparseDFF generates\nview-consistent 3D DFFs, enabling efficient one-shot learning of dexterous\nmanipulations by mapping image features to a 3D point cloud. Central to\nSparseDFF is a feature refinement network, optimized with a contrastive loss\nbetween views and a point-pruning mechanism for feature continuity. This\nfacilitates the minimization of feature discrepancies w.r.t. end-effector\nparameters, bridging demonstrations and target manipulations. Validated in\nreal-world scenarios with a dexterous hand, SparseDFF proves effective in\nmanipulating both rigid and deformable objects, demonstrating significant\ngeneralization capabilities across object and scene variations.\n","authors":["Qianxu Wang","Haotong Zhang","Congyue Deng","Yang You","Hao Dong","Yixin Zhu","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2310.16838v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11515v1","updated":"2024-03-18T07:01:21Z","published":"2024-03-18T07:01:21Z","title":"SSAP: A Shape-Sensitive Adversarial Patch for Comprehensive Disruption\n of Monocular Depth Estimation in Autonomous Navigation Applications","summary":" Monocular depth estimation (MDE) has advanced significantly, primarily\nthrough the integration of convolutional neural networks (CNNs) and more\nrecently, Transformers. However, concerns about their susceptibility to\nadversarial attacks have emerged, especially in safety-critical domains like\nautonomous driving and robotic navigation. Existing approaches for assessing\nCNN-based depth prediction methods have fallen short in inducing comprehensive\ndisruptions to the vision system, often limited to specific local areas. In\nthis paper, we introduce SSAP (Shape-Sensitive Adversarial Patch), a novel\napproach designed to comprehensively disrupt monocular depth estimation (MDE)\nin autonomous navigation applications. Our patch is crafted to selectively\nundermine MDE in two distinct ways: by distorting estimated distances or by\ncreating the illusion of an object disappearing from the system's perspective.\nNotably, our patch is shape-sensitive, meaning it considers the specific shape\nand scale of the target object, thereby extending its influence beyond\nimmediate proximity. Furthermore, our patch is trained to effectively address\ndifferent scales and distances from the camera. 
Experimental results\ndemonstrate that our approach induces a mean depth estimation error surpassing\n0.5, impacting up to 99% of the targeted region for CNN-based MDE models.\nAdditionally, we investigate the vulnerability of Transformer-based MDE models\nto patch-based attacks, revealing that SSAP yields a significant error of 0.59\nand exerts substantial influence over 99% of the target region on these models.\n","authors":["Amira Guesmi","Muhammad Abdullah Hanif","Ihsen Alouani","Bassem Ouni","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2403.11515v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11511v1","updated":"2024-03-18T06:42:38Z","published":"2024-03-18T06:42:38Z","title":"Sim-to-Real Grasp Detection with Global-to-Local RGB-D Adaptation","summary":" This paper focuses on the sim-to-real issue of RGB-D grasp detection and\nformulates it as a domain adaptation problem. In this case, we present a\nglobal-to-local method to address hybrid domain gaps in RGB and depth data and\ninsufficient multi-modal feature alignment. First, a self-supervised rotation\npre-training strategy is adopted to deliver robust initialization for RGB and\ndepth networks. We then propose a global-to-local alignment pipeline with\nindividual global domain classifiers for scene features of RGB and depth images\nas well as a local one specifically working for grasp features in the two\nmodalities. In particular, we propose a grasp prototype adaptation module,\nwhich aims to facilitate fine-grained local feature alignment by dynamically\nupdating and matching the grasp prototypes from the simulation and real-world\nscenarios throughout the training process. Due to such designs, the proposed\nmethod substantially reduces the domain shift and thus leads to consistent\nperformance improvements. Extensive experiments are conducted on the\nGraspNet-Planar benchmark and physical environment, and superior results are\nachieved which demonstrate the effectiveness of our method.\n","authors":["Haoxiang Ma","Ran Qin","Modi shi","Boyang Gao","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2403.11511v1.pdf","comment":"Accepted at ICRA 2024"},{"id":"http://arxiv.org/abs/2305.12681v2","updated":"2024-03-18T06:34:01Z","published":"2023-05-22T03:38:59Z","title":"Phased Data Augmentation for Training a Likelihood-Based Generative\n Model with Limited Data","summary":" Generative models excel in creating realistic images, yet their dependency on\nextensive datasets for training presents significant challenges, especially in\ndomains where data collection is costly or challenging. Current data-efficient\nmethods largely focus on GAN architectures, leaving a gap in training other\ntypes of generative models. Our study introduces \"phased data augmentation\" as\na novel technique that addresses this gap by optimizing training in limited\ndata scenarios without altering the inherent data distribution. By limiting the\naugmentation intensity throughout the learning phases, our method enhances the\nmodel's ability to learn from limited data, thus maintaining fidelity. Applied\nto a model integrating PixelCNNs with VQ-VAE-2, our approach demonstrates\nsuperior performance in both quantitative and qualitative evaluations across\ndiverse datasets. 
This represents an important step forward in the efficient\ntraining of likelihood-based models, extending the usefulness of data\naugmentation techniques beyond just GANs.\n","authors":["Yuta Mimura"],"pdf_url":"https://arxiv.org/pdf/2305.12681v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11510v1","updated":"2024-03-18T06:32:23Z","published":"2024-03-18T06:32:23Z","title":"GenFlow: Generalizable Recurrent Flow for 6D Pose Refinement of Novel\n Objects","summary":" Despite the progress of learning-based methods for 6D object pose estimation,\nthe trade-off between accuracy and scalability for novel objects still exists.\nSpecifically, previous methods for novel objects do not make good use of the\ntarget object's 3D shape information since they focus on generalization by\nprocessing the shape indirectly, making them less effective. We present\nGenFlow, an approach that enables both accuracy and generalization to novel\nobjects with the guidance of the target object's shape. Our method predicts\noptical flow between the rendered image and the observed image and refines the\n6D pose iteratively. It boosts the performance by a constraint of the 3D shape\nand the generalizable geometric knowledge learned from an end-to-end\ndifferentiable system. We further improve our model by designing a cascade\nnetwork architecture to exploit the multi-scale correlations and coarse-to-fine\nrefinement. GenFlow ranked first on the unseen object pose estimation\nbenchmarks in both the RGB and RGB-D cases. It also achieves performance\ncompetitive with existing state-of-the-art methods for the seen object pose\nestimation without any fine-tuning.\n","authors":["Sungphill Moon","Hyeontae Son","Dongcheol Hur","Sangwook Kim"],"pdf_url":"https://arxiv.org/pdf/2403.11510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11507v1","updated":"2024-03-18T06:25:41Z","published":"2024-03-18T06:25:41Z","title":"Circle Representation for Medical Instance Object Segmentation","summary":" Recently, circle representation has been introduced for medical imaging,\ndesigned specifically to enhance the detection of instance objects that are\nspherically shaped (e.g., cells, glomeruli, and nuclei). Given its outstanding\neffectiveness in instance detection, it is compelling to consider the\napplication of circle representation for segmenting instance medical objects.\nIn this study, we introduce CircleSnake, a simple end-to-end segmentation\napproach that utilizes circle contour deformation for segmenting ball-shaped\nmedical objects at the instance level. The innovation of CircleSnake lies in\nthese three areas: (1) It substitutes the complex bounding box-to-octagon\ncontour transformation with a more consistent and rotation-invariant bounding\ncircle-to-circle contour adaptation. This adaptation specifically targets\nball-shaped medical objects. (2) The circle representation employed in\nCircleSnake significantly reduces the degrees of freedom to two, compared to\neight in the octagon representation. This reduction enhances both the\nrobustness of the segmentation performance and the rotational consistency of\nthe method. (3) CircleSnake is the first end-to-end deep instance segmentation\npipeline to incorporate circle representation, encompassing consistent circle\ndetection, circle contour proposal, and circular convolution in a unified\nframework. This integration is achieved through the novel application of\ncircular graph convolution within the context of circle detection and instance\nsegmentation. 
In practical applications, such as the detection of glomeruli,\nnuclei, and eosinophils in pathological images, CircleSnake has demonstrated\nsuperior performance and greater rotation invariance when compared to\nbenchmarks. The code has been made publicly available:\nhttps://github.com/hrlblab/CircleSnake.\n","authors":["Juming Xiong","Ethan H. Nguyen","Yilin Liu","Ruining Deng","Regina N Tyree","Hernan Correa","Girish Hiremath","Yaohong Wang","Haichun Yang","Agnes B. Fogo","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2403.11507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11506v1","updated":"2024-03-18T06:24:46Z","published":"2024-03-18T06:24:46Z","title":"End-To-End Underwater Video Enhancement: Dataset and Model","summary":" Underwater video enhancement (UVE) aims to improve the visibility and frame\nquality of underwater videos, which has significant implications for marine\nresearch and exploration. However, existing methods primarily focus on\ndeveloping image enhancement algorithms to enhance each frame independently.\nThere is a lack of supervised datasets and models specifically tailored for UVE\ntasks. To fill this gap, we construct the Synthetic Underwater Video\nEnhancement (SUVE) dataset, comprising 840 diverse underwater-style videos\npaired with ground-truth reference videos. Based on this dataset, we train a\nnovel underwater video enhancement model, UVENet, which utilizes inter-frame\nrelationships to achieve better enhancement performance. Through extensive\nexperiments on both synthetic and real underwater videos, we demonstrate the\neffectiveness of our approach. This study represents the first comprehensive\nexploration of UVE to our knowledge. The code is available at\nhttps://anonymous.4open.science/r/UVENet.\n","authors":["Dazhao Du","Enhan Li","Lingyu Si","Fanjiang Xu","Jianwei Niu"],"pdf_url":"https://arxiv.org/pdf/2403.11506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11505v1","updated":"2024-03-18T06:20:49Z","published":"2024-03-18T06:20:49Z","title":"Covid-19 detection from CT scans using EfficientNet and Attention\n mechanism","summary":" Manual diagnosis and analysis of COVID-19 through the examination of lung\nComputed Tomography (CT) scan images by physicians tends to result in\ninefficiency, especially with high patient volumes and numerous images per\npatient. We address the need for automation by developing a deep learning\nmodel-based pipeline for COVID-19 detection from CT scan images of the lungs.\nThe Domain adaptation, Explainability, and Fairness in AI for Medical Image\nAnalysis Workshop and COVID-19 Diagnosis Competition (DEF-AI-MIA COV19D)\nprovides an opportunity to assess our designed pipeline for COVID-19 detection\nfrom CT scan images. The proposed pipeline incorporates EfficientNet with an\nAttention mechanism with a pre-processing step. Our pipeline outperforms last\nyear's teams on the validation set of the competition dataset.\n","authors":["Ramy Farag","Parth Upadhyay","Guilhermen DeSouza"],"pdf_url":"https://arxiv.org/pdf/2403.11505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11504v1","updated":"2024-03-18T06:19:37Z","published":"2024-03-18T06:19:37Z","title":"MLVICX: Multi-Level Variance-Covariance Exploration for Chest X-ray\n Self-Supervised Representation Learning","summary":" Self-supervised learning (SSL) is potentially useful in reducing the need for\nmanual annotation and making deep learning models accessible for medical image\nanalysis tasks. 
By leveraging the representations learned from unlabeled data,\nself-supervised models perform well on tasks that require little to no\nfine-tuning. However, for medical images, like chest X-rays, which are\ncharacterized by complex anatomical structures and diverse clinical conditions,\nthere arises a need for representation learning techniques that can encode\nfine-grained details while preserving the broader contextual information. In\nthis context, we introduce MLVICX (Multi-Level Variance-Covariance Exploration\nfor Chest X-ray Self-Supervised Representation Learning), an approach to\ncapture rich representations in the form of embeddings from chest X-ray images.\nCentral to our approach is a novel multi-level variance and covariance\nexploration strategy that empowers the model to detect diagnostically\nmeaningful patterns while reducing redundancy effectively. By enhancing the\nvariance and covariance of the learned embeddings, MLVICX promotes the\nretention of critical medical insights by adapting both global and local\ncontextual details. We demonstrate the performance of MLVICX in advancing\nself-supervised chest X-ray representation learning through comprehensive\nexperiments. The performance enhancements we observe across various downstream\ntasks highlight the significance of the proposed approach in enhancing the\nutility of chest X-ray embeddings for precision medical diagnosis and\ncomprehensive image analysis. For pretraining, we used the NIH-Chest X-ray\ndataset, while for downstream tasks, we utilized NIH-Chest X-ray, Vinbig-CXR,\nRSNA pneumonia, and SIIM-ACR Pneumothorax datasets. Overall, we observe more\nthan 3% performance gains over SOTA SSL approaches in various downstream tasks.\n","authors":["Azad Singh","Vandan Gorade","Deepak Mishra"],"pdf_url":"https://arxiv.org/pdf/2403.11504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11503v1","updated":"2024-03-18T06:18:59Z","published":"2024-03-18T06:18:59Z","title":"Diffusion Models are Geometry Critics: Single Image 3D Editing Using\n Pre-Trained Diffusion Priors","summary":" We propose a novel image editing technique that enables 3D manipulations on\nsingle images, such as object rotation and translation. Existing 3D-aware image\nediting approaches typically rely on synthetic multi-view datasets for training\nspecialized models, thus constraining their effectiveness on open-domain images\nfeaturing significantly more varied layouts and styles. In contrast, our method\ndirectly leverages powerful image diffusion models trained on a broad spectrum\nof text-image pairs and thus retains their exceptional generalization abilities.\nThis objective is realized through the development of an iterative novel view\nsynthesis and geometry alignment algorithm. The algorithm harnesses diffusion\nmodels for dual purposes: they provide an appearance prior by predicting novel\nviews of the selected object using estimated depth maps, and they act as a\ngeometry critic by correcting misalignments in 3D shapes across the sampled\nviews. 
Our method can generate high-quality 3D-aware image edits with large\nviewpoint transformations and high appearance and shape consistency with the\ninput image, pushing the boundaries of what is possible with single-image\n3D-aware editing.\n","authors":["Ruicheng Wang","Jianfeng Xiang","Jiaolong Yang","Xin Tong"],"pdf_url":"https://arxiv.org/pdf/2403.11503v1.pdf","comment":"Project page: https://wangrc.site/DiffCriticEdit/"},{"id":"http://arxiv.org/abs/2403.11498v1","updated":"2024-03-18T06:07:45Z","published":"2024-03-18T06:07:45Z","title":"Domain Adaptation Using Pseudo Labels for COVID-19 Detection","summary":" In response to the need for rapid and accurate COVID-19 diagnosis during the\nglobal pandemic, we present a two-stage framework that leverages pseudo labels\nfor domain adaptation to enhance the detection of COVID-19 from CT scans. By\nutilizing annotated data from one domain and non-annotated data from another,\nthe model overcomes the challenge of data scarcity and variability, common in\nemergent health crises. The innovative approach of generating pseudo labels\nenables the model to iteratively refine its learning process, thereby improving\nits accuracy and adaptability across different hospitals and medical centres.\nExperimental results on COV19-CT-DB database showcase the model's potential to\nachieve high diagnostic precision, significantly contributing to efficient\npatient management and alleviating the strain on healthcare systems. Our method\nachieves 0.92 Macro F1 Score on the validation set of Covid-19 domain\nadaptation challenge.\n","authors":["Runtian Yuan","Qingqiu Li","Junlin Hou","Jilan Xu","Yuejie Zhang","Rui Feng","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2403.11498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11497v1","updated":"2024-03-18T06:04:02Z","published":"2024-03-18T06:04:02Z","title":"Do CLIPs Always Generalize Better than ImageNet Models?","summary":" Large vision language models, such as CLIPs, have revolutionized modern\nmachine learning. CLIPs have demonstrated great generalizability under\ndistribution shifts, supported by an increasing body of literature. However,\nthe evaluation datasets for CLIPs are variations primarily designed for\nImageNet benchmarks, which may not fully reflect the extent to which CLIPs,\ne.g., pre-trained on LAION, robust to spurious correlations. To bridge the gap,\nwe collect a real-world dataset called CounterAnimal that contains realistic\nspurious features found in animal photos. CounterAnimal consists of a) the\ncommon group: comprising animals on common backgrounds, and b) the counter\ngroup: including animals on unusual backgrounds. The performance drops from the\ncommon to counter groups quantify the reliance of models on spurious features\n(i.e., backgrounds) to predict the animals. We find that CLIPs trained on\neither LAION or the OpenAI data exhibit notable performance drops on the\ncounter group. Surprisingly, we observe that single-modal models trained on\nImageNet are more robust than CLIPs. We provide both theoretical and empirical\nexplanations for why CLIPs still learn spurious features. 
Our findings suggest\nthat distribution shifts remain an open problem for CLIPs, and one needs to be\ncautious about test setups when evaluating foundation models pre-trained on a\nsignificantly different scale and distribution.\n","authors":["Qizhou Wang","Yong Lin","Yongqiang Chen","Ludwig Schmidt","Bo Han","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11497v1.pdf","comment":"Qizhou Wang, Yong Lin, and Yongqiang Chen contributed equally.\n Project page: https://counteranimal.github.io"},{"id":"http://arxiv.org/abs/2403.11494v1","updated":"2024-03-18T05:58:13Z","published":"2024-03-18T05:58:13Z","title":"CCC++: Optimized Color Classified Colorization with Segment Anything\n Model (SAM) Empowered Object Selective Color Harmonization","summary":" In this paper, we formulate the colorization problem into a multinomial\nclassification problem and then apply a weighted function to classes. We\npropose a set of formulas to transform color values into color classes and vice\nversa. To optimize the classes, we experiment with different bin sizes for\ncolor class transformation. Observing class appearance, standard deviation, and\nmodel parameters on various extremely large-scale real-time images in practice,\nwe propose 532 color classes for our classification task. During training, we\npropose a class-weighted function based on true class appearance in each batch\nto ensure proper saturation of individual objects. We adjust the weights of the\nmajor classes, which are more frequently observed, by lowering them, while\nescalating the weights of the minor classes, which are less commonly observed.\nIn our class re-weight formula, we propose a hyper-parameter for finding the\noptimal trade-off between the major and minor appeared classes. As we apply\nregularization to enhance the stability of the minor class, occasional minor\nnoise may appear at the object's edges. We propose a novel object-selective\ncolor harmonization method empowered by the Segment Anything Model (SAM) to\nrefine and enhance these edges. We propose two new color image evaluation\nmetrics, the Color Class Activation Ratio (CCAR), and the True Activation Ratio\n(TAR), to quantify the richness of color components. We compare our proposed\nmodel with state-of-the-art models using six different datasets: Place, ADE,\nCeleba, COCO, Oxford 102 Flower, and ImageNet, in qualitative and quantitative\napproaches. The experimental results show that our proposed model outstrips\nother models in visualization, CNR and in our proposed CCAR and TAR measurement\ncriteria while maintaining satisfactory performance in regression (MSE, PSNR),\nsimilarity (SSIM, LPIPS, UIUI), and generative criteria (FID).\n","authors":["Mrityunjoy Gain","Avi Deb Raha","Rameswar Debnath"],"pdf_url":"https://arxiv.org/pdf/2403.11494v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2403.01476"},{"id":"http://arxiv.org/abs/2210.15563v3","updated":"2024-03-18T05:55:34Z","published":"2022-10-27T15:53:38Z","title":"Multimodal Transformer Distillation for Audio-Visual Synchronization","summary":" Audio-visual synchronization aims to determine whether the mouth movements\nand speech in the video are synchronized. VocaLiST reaches state-of-the-art\nperformance by incorporating multimodal Transformers to model audio-visual\ninteraction information. However, it requires high computing resources, making it\nimpractical for real-world applications. 
This paper proposed an MTDVocaLiST\nmodel, which is trained by our proposed multimodal Transformer distillation\n(MTD) loss. MTD loss enables MTDVocaLiST model to deeply mimic the\ncross-attention distribution and value-relation in the Transformer of VocaLiST.\nAdditionally, we harness uncertainty weighting to fully exploit the interaction\ninformation across all layers. Our proposed method is effective in two aspects:\nFrom the distillation method perspective, MTD loss outperforms other strong\ndistillation baselines. From the distilled model's performance perspective: 1)\nMTDVocaLiST outperforms similar-size SOTA models, SyncNet, and Perfect Match\nmodels by 15.65% and 3.35%; 2) MTDVocaLiST reduces the model size of VocaLiST\nby 83.52%, yet still maintaining similar performance.\n","authors":["Xuanjun Chen","Haibin Wu","Chung-Che Wang","Hung-yi Lee","Jyh-Shing Roger Jang"],"pdf_url":"https://arxiv.org/pdf/2210.15563v3.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2403.11492v1","updated":"2024-03-18T05:53:20Z","published":"2024-03-18T05:53:20Z","title":"SmartRefine: An Scenario-Adaptive Refinement Framework for Efficient\n Motion Prediction","summary":" Predicting the future motion of surrounding agents is essential for\nautonomous vehicles (AVs) to operate safely in dynamic, human-robot-mixed\nenvironments. Context information, such as road maps and surrounding agents'\nstates, provides crucial geometric and semantic information for motion behavior\nprediction. To this end, recent works explore two-stage prediction frameworks\nwhere coarse trajectories are first proposed, and then used to select critical\ncontext information for trajectory refinement. However, they either incur a\nlarge amount of computation or bring limited improvement, if not both. In this\npaper, we introduce a novel scenario-adaptive refinement strategy, named\nSmartRefine, to refine prediction with minimal additional computation.\nSpecifically, SmartRefine can comprehensively adapt refinement configurations\nbased on each scenario's properties, and smartly chooses the number of\nrefinement iterations by introducing a quality score to measure the prediction\nquality and remaining refinement potential of each scenario. SmartRefine is\ndesigned as a generic and flexible approach that can be seamlessly integrated\ninto most state-of-the-art motion prediction models. Experiments on Argoverse\n(1 & 2) show that our method consistently improves the prediction accuracy of\nmultiple state-of-the-art prediction models. Specifically, by adding\nSmartRefine to QCNet, we outperform all published ensemble-free works on the\nArgoverse 2 leaderboard (single agent track) at submission. Comprehensive\nstudies are also conducted to ablate design choices and explore the mechanism\nbehind multi-iteration refinement. Codes are available at\nhttps://github.com/opendilab/SmartRefine/\n","authors":["Yang Zhou","Hao Shao","Letian Wang","Steven L. Waslander","Hongsheng Li","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2403.11492v1.pdf","comment":"Camera-ready version for CVPR 2024"},{"id":"http://arxiv.org/abs/2401.13937v2","updated":"2024-03-18T05:45:07Z","published":"2024-01-25T04:39:48Z","title":"Self-supervised Video Object Segmentation with Distillation Learning of\n Deformable Attention","summary":" Video object segmentation is a fundamental research problem in computer\nvision. Recent techniques have often applied attention mechanism to object\nrepresentation learning from video sequences. 
However, due to temporal changes\nin the video data, attention maps may not well align with the objects of\ninterest across video frames, causing accumulated errors in long-term video\nprocessing. In addition, existing techniques have utilised complex\narchitectures, requiring highly computational complexity and hence limiting the\nability to integrate video object segmentation into low-powered devices. To\naddress these issues, we propose a new method for self-supervised video object\nsegmentation based on distillation learning of deformable attention.\nSpecifically, we devise a lightweight architecture for video object\nsegmentation that is effectively adapted to temporal changes. This is enabled\nby deformable attention mechanism, where the keys and values capturing the\nmemory of a video sequence in the attention module have flexible locations\nupdated across frames. The learnt object representations are thus adaptive to\nboth the spatial and temporal dimensions. We train the proposed architecture in\na self-supervised fashion through a new knowledge distillation paradigm where\ndeformable attention maps are integrated into the distillation loss. We\nqualitatively and quantitatively evaluate our method and compare it with\nexisting methods on benchmark datasets including DAVIS 2016/2017 and\nYouTube-VOS 2018/2019. Experimental results verify the superiority of our\nmethod via its achieved state-of-the-art performance and optimal memory usage.\n","authors":["Quang-Trung Truong","Duc Thanh Nguyen","Binh-Son Hua","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2401.13937v2.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2403.11481v1","updated":"2024-03-18T05:07:59Z","published":"2024-03-18T05:07:59Z","title":"VideoAgent: A Memory-augmented Multimodal Agent for Video Understanding","summary":" We explore how reconciling several foundation models (large language models\nand vision-language models) with a novel unified memory mechanism could tackle\nthe challenging video understanding problem, especially capturing the long-term\ntemporal relations in lengthy videos. In particular, the proposed multimodal\nagent VideoAgent: 1) constructs a structured memory to store both the generic\ntemporal event descriptions and object-centric tracking states of the video; 2)\ngiven an input task query, it employs tools including video segment\nlocalization and object memory querying along with other visual foundation\nmodels to interactively solve the task, utilizing the zero-shot tool-use\nability of LLMs. VideoAgent demonstrates impressive performances on several\nlong-horizon video understanding benchmarks, an average increase of 6.6% on\nNExT-QA and 26.0% on EgoSchema over baselines, closing the gap between\nopen-sourced models and private counterparts including Gemini 1.5 Pro.\n","authors":["Yue Fan","Xiaojian Ma","Rujie Wu","Yuntao Du","Jiaqi Li","Zhi Gao","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2403.11481v1.pdf","comment":"Project page: videoagent.github.io; First two authors contributed\n equally"},{"id":"http://arxiv.org/abs/2403.08282v2","updated":"2024-03-18T05:03:53Z","published":"2024-03-13T06:22:17Z","title":"Hierarchical Auto-Organizing System for Open-Ended Multi-Agent\n Navigation","summary":" Due to the dynamic and unpredictable open-world setting, navigating complex\nenvironments in Minecraft poses significant challenges for multi-agent systems.\nAgents must interact with the environment and coordinate their actions with\nother agents to achieve common objectives. 
However, traditional approaches\noften struggle to efficiently manage inter-agent communication and task\ndistribution, crucial for effective multi-agent navigation. Furthermore,\nprocessing and integrating multi-modal information (such as visual, textual,\nand auditory data) is essential for agents to comprehend their goals and\nnavigate the environment successfully and fully. To address this issue, we\ndesign the HAS framework to auto-organize groups of LLM-based agents to\ncomplete navigation tasks. In our approach, we devise a hierarchical\nauto-organizing navigation system, which is characterized by 1) a hierarchical\nsystem for multi-agent organization, ensuring centralized planning and\ndecentralized execution; 2) an auto-organizing and intra-communication\nmechanism, enabling dynamic group adjustment under subtasks; 3) a multi-modal\ninformation platform, facilitating multi-modal perception to perform the three\nnavigation tasks with one system. To assess organizational behavior, we design\na series of navigation tasks in the Minecraft environment, which includes\nsearching and exploring. We aim to develop embodied organizations that push the\nboundaries of embodied AI, moving it towards a more human-like organizational\nstructure.\n","authors":["Zhonghan Zhao","Kewei Chen","Dongxu Guo","Wenhao Chai","Tian Ye","Yanting Zhang","Gaoang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.08282v2.pdf","comment":"ICLR 2024 Workshop on LLM Agents"},{"id":"http://arxiv.org/abs/2403.11480v1","updated":"2024-03-18T05:03:07Z","published":"2024-03-18T05:03:07Z","title":"Towards understanding the nature of direct functional connectivity in\n visual brain network","summary":" Recent advances in neuroimaging have enabled studies in functional\nconnectivity (FC) of human brain, alongside investigation of the neuronal basis\nof cognition. One important FC study is the representation of vision in human\nbrain. The release of publicly available dataset BOLD5000 has made it possible\nto study the brain dynamics during visual tasks in greater detail. In this\npaper, a comprehensive analysis of fMRI time series (TS) has been performed to\nexplore different types of visual brain networks (VBN). The novelty of this\nwork lies in (1) constructing VBN with consistently significant direct\nconnectivity using both marginal and partial correlation, which is further\nanalyzed using graph theoretic measures, (2) classification of VBNs as formed\nby image complexity-specific TS, using graphical features. In image\ncomplexity-specific VBN classification, XGBoost yields average accuracy in the\nrange of 86.5% to 91.5% for positively correlated VBN, which is 2% greater than\nthat using negative correlation. This result not only reflects the\ndistinguishing graphical characteristics of each image complexity-specific VBN,\nbut also highlights the importance of studying both positively correlated and\nnegatively correlated VBN to understand how differently the brain functions\nwhile viewing different complexities of real-world images.\n","authors":["Debanjali Bhattacharya","Neelam Sinha"],"pdf_url":"https://arxiv.org/pdf/2403.11480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11469v1","updated":"2024-03-18T04:41:59Z","published":"2024-03-18T04:41:59Z","title":"Generative Motion Stylization within Canonical Motion Space","summary":" Stylized motion breathes life into characters. 
However, the fixed skeleton\nstructure and style representation hinder existing data-driven motion synthesis\nmethods from generating stylized motion for various characters. In this work,\nwe propose a generative motion stylization pipeline, named MotionS, for\nsynthesizing diverse and stylized motion on cross-structure characters using\ncross-modality style prompts. Our key insight is to embed motion style into a\ncross-modality latent space and perceive the cross-structure skeleton\ntopologies, allowing for motion stylization within a canonical motion space.\nSpecifically, the large-scale Contrastive-Language-Image-Pre-training (CLIP)\nmodel is leveraged to construct the cross-modality latent space, enabling\nflexible style representation within this space. Additionally, two\ntopology-encoded tokens are learned to capture the canonical and specific\nskeleton topologies, facilitating cross-structure topology shifting.\nSubsequently, the topology-shifted stylization diffusion is designed to\ngenerate motion content for the specific skeleton and stylize it in the shifted\ncanonical motion space using multi-modality style descriptions. Through an\nextensive set of examples, we demonstrate the flexibility and generalizability\nof our pipeline across various characters and style descriptions. Qualitative\nand quantitative experiments underscore the superiority of our pipeline over\nstate-of-the-art methods, consistently delivering high-quality stylized motion\nacross a broad spectrum of skeletal structures.\n","authors":["Jiaxu Zhang","Xin Chen","Gang Yu","Zhigang Tu"],"pdf_url":"https://arxiv.org/pdf/2403.11469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11468v1","updated":"2024-03-18T04:41:38Z","published":"2024-03-18T04:41:38Z","title":"Collage Prompting: Budget-Friendly Visual Recognition with GPT-4V","summary":" Recent advancements in generative AI have suggested that by taking visual\nprompt, GPT-4V can demonstrate significant proficiency in image recognition\ntask. Despite its impressive capabilities, the financial cost associated with\nGPT-4V's inference presents a substantial barrier for its wide use. To address\nthis challenge, our work introduces Collage Prompting, a budget-friendly\nprompting approach that concatenates multiple images into a single visual\ninput. With collage prompt, GPT-4V is able to perform image recognition on\nseveral images simultaneously. Based on the observation that the accuracy of\nGPT-4V's image recognition varies significantly with the order of images within\nthe collage prompt, our method further learns to optimize the arrangement of\nimages for maximum recognition accuracy. A graph predictor is trained to\nindicate the accuracy of each collage prompt, then we propose an optimization\nmethod to navigate the search space of possible image arrangements. Experiment\nresults across various datasets demonstrate the cost-efficiency score of\ncollage prompt is much larger than standard prompt. 
Additionally, collage\nprompt with learned arrangement achieves clearly better accuracy than collage\nprompt with random arrangement in GPT-4V's visual recognition.\n","authors":["Siyu Xu","Yunke Wang","Daochang Liu","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.11468v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17555v3","updated":"2024-03-18T04:34:27Z","published":"2023-05-27T19:10:19Z","title":"Diffeomorphic Mesh Deformation via Efficient Optimal Transport for\n Cortical Surface Reconstruction","summary":" Mesh deformation plays a pivotal role in many 3D vision tasks including\ndynamic simulations, rendering, and reconstruction. However, defining an\nefficient discrepancy between predicted and target meshes remains an open\nproblem. A prevalent approach in current deep learning is the set-based\napproach which measures the discrepancy between two surfaces by comparing two\nrandomly sampled point-clouds from the two meshes with Chamfer pseudo-distance.\nNevertheless, the set-based approach still has limitations such as lacking a\ntheoretical guarantee for choosing the number of points in sampled\npoint-clouds, and the pseudo-metricity and the quadratic complexity of the\nChamfer divergence. To address these issues, we propose a novel metric for\nlearning mesh deformation. The metric is defined by sliced Wasserstein distance\non meshes represented as probability measures that generalize the set-based\napproach. By leveraging probability measure space, we gain flexibility in\nencoding meshes using diverse forms of probability measures, such as\ncontinuous, empirical, and discrete measures via varifold representation. After\nhaving encoded probability measures, we can compare meshes by using the sliced\nWasserstein distance which is an effective optimal transport distance with\nlinear computational complexity and can provide a fast statistical rate for\napproximating the surface of meshes. To this end, we employ a neural ordinary\ndifferential equation (ODE) to deform the input surface into the target shape\nby modeling the trajectories of the points on the surface. Our experiments on\ncortical surface reconstruction demonstrate that our approach surpasses other\ncompeting methods in multiple datasets and metrics.\n","authors":["Tung Le","Khai Nguyen","Shanlin Sun","Kun Han","Nhat Ho","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2305.17555v3.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2403.11463v1","updated":"2024-03-18T04:30:31Z","published":"2024-03-18T04:30:31Z","title":"Siamese Learning with Joint Alignment and Regression for\n Weakly-Supervised Video Paragraph Grounding","summary":" Video Paragraph Grounding (VPG) is an emerging task in video-language\nunderstanding, which aims at localizing multiple sentences with semantic\nrelations and temporal order from an untrimmed video. However, existing VPG\napproaches are heavily reliant on a considerable number of temporal labels that\nare laborious and time-consuming to acquire. In this work, we introduce and\nexplore Weakly-Supervised Video Paragraph Grounding (WSVPG) to eliminate the\nneed of temporal annotations. Different from previous weakly-supervised\ngrounding frameworks based on multiple instance learning or reconstruction\nlearning for two-stage candidate ranking, we propose a novel siamese learning\nframework that jointly learns the cross-modal feature alignment and temporal\ncoordinate regression without timestamp labels to achieve concise one-stage\nlocalization for WSVPG. 
Specifically, we devise a Siamese Grounding TRansformer\n(SiamGTR) consisting of two weight-sharing branches for learning complementary\nsupervision. An Augmentation Branch is utilized for directly regressing the\ntemporal boundaries of a complete paragraph within a pseudo video, and an\nInference Branch is designed to capture the order-guided feature correspondence\nfor localizing multiple sentences in a normal video. We demonstrate by\nextensive experiments that our paradigm has superior practicability and\nflexibility to achieve efficient weakly-supervised or semi-supervised learning,\noutperforming state-of-the-art methods trained with the same or stronger\nsupervision.\n","authors":["Chaolei Tan","Jianhuang Lai","Wei-Shi Zheng","Jian-Fang Hu"],"pdf_url":"https://arxiv.org/pdf/2403.11463v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11460v1","updated":"2024-03-18T04:26:18Z","published":"2024-03-18T04:26:18Z","title":"Fed3DGS: Scalable 3D Gaussian Splatting with Federated Learning","summary":" In this work, we present Fed3DGS, a scalable 3D reconstruction framework\nbased on 3D Gaussian splatting (3DGS) with federated learning. Existing\ncity-scale reconstruction methods typically adopt a centralized approach, which\ngathers all data in a central server and reconstructs scenes. The approach\nhampers scalability because it places a heavy load on the server and demands\nextensive data storage when reconstructing scenes on a scale beyond city-scale.\nIn pursuit of a more scalable 3D reconstruction, we propose a federated\nlearning framework with 3DGS, which is a decentralized framework and can\npotentially use distributed computational resources across millions of clients.\nWe tailor a distillation-based model update scheme for 3DGS and introduce\nappearance modeling for handling non-IID data in the scenario of 3D\nreconstruction with federated learning. We simulate our method on several\nlarge-scale benchmarks, and our method demonstrates rendered image quality\ncomparable to centralized approaches. In addition, we also simulate our method\nwith data collected in different seasons, demonstrating that our framework can\nreflect changes in the scenes and our appearance modeling captures changes due\nto seasonal variations.\n","authors":["Teppei Suzuki"],"pdf_url":"https://arxiv.org/pdf/2403.11460v1.pdf","comment":"Code: https://github.com/DensoITLab/Fed3DGS"},{"id":"http://arxiv.org/abs/2310.08044v2","updated":"2024-03-18T04:25:43Z","published":"2023-10-12T05:34:45Z","title":"EC-Depth: Exploring the consistency of self-supervised monocular depth\n estimation in challenging scenes","summary":" Self-supervised monocular depth estimation holds significant importance in\nthe fields of autonomous driving and robotics. However, existing methods are\ntypically trained and tested on standard datasets, overlooking the impact of\nvarious adverse conditions prevalent in real-world applications, such as rainy\ndays. As a result, it is commonly observed that these methods struggle to\nhandle these challenging scenarios. To address this issue, we present EC-Depth,\na novel self-supervised two-stage framework to achieve a robust depth\nestimation. In the first stage, we propose depth consistency regularization to\npropagate reliable supervision from standard to challenging scenes. 
In the\nsecond stage, we adopt the Mean Teacher paradigm and propose a novel\nconsistency-based pseudo-label filtering strategy to improve the quality of\npseudo-labels, further improving both the accuracy and robustness of our model.\nExtensive experiments demonstrate that our method achieves accurate and\nconsistent depth predictions in both standard and challenging scenarios,\nsurpassing existing state-of-the-art methods on KITTI, KITTI-C, DrivingStereo,\nand NuScenes-Night benchmarks.\n","authors":["Ziyang Song","Ruijie Zhu","Chuxin Wang","Jiacheng Deng","Jianfeng He","Tianzhu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08044v2.pdf","comment":"Project page: https://ruijiezhu94.github.io/ECDepth_page"},{"id":"http://arxiv.org/abs/2306.10900v2","updated":"2024-03-18T04:14:50Z","published":"2023-06-19T12:58:17Z","title":"MotionGPT: Finetuned LLMs Are General-Purpose Motion Generators","summary":" Generating realistic human motion from given action descriptions has\nexperienced significant advancements because of the emerging requirement of\ndigital humans. While recent works have achieved impressive results in\ngenerating motion directly from textual action descriptions, they often support\nonly a single modality of the control signal, which limits their application in\nthe real digital human industry. This paper presents a Motion General-Purpose\ngeneraTor (MotionGPT) that can use multimodal control signals, e.g., text and\nsingle-frame poses, for generating consecutive human motions by treating\nmultimodal signals as special input tokens in large language models (LLMs).\nSpecifically, we first quantize multimodal control signals into discrete codes\nand then formulate them in a unified prompt instruction to ask the LLMs to\ngenerate the motion answer. Our MotionGPT demonstrates a unified human motion\ngeneration model with multimodal control signals by tuning a mere 0.4% of LLM\nparameters. To the best of our knowledge, MotionGPT is the first method to\ngenerate human motion by multimodal control signals, which we hope can shed\nlight on this new direction. Visit our webpage at\nhttps://qiqiapink.github.io/MotionGPT/.\n","authors":["Yaqi Zhang","Di Huang","Bin Liu","Shixiang Tang","Yan Lu","Lu Chen","Lei Bai","Qi Chu","Nenghai Yu","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2306.10900v2.pdf","comment":"18 pages, 8 figures, accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2312.14134v2","updated":"2024-03-18T04:08:54Z","published":"2023-12-21T18:55:05Z","title":"Diffusion Reward: Learning Rewards via Conditional Video Diffusion","summary":" Learning rewards from expert videos offers an affordable and effective\nsolution to specify the intended behaviors for reinforcement learning tasks. In\nthis work, we propose Diffusion Reward, a novel framework that learns rewards\nfrom expert videos via conditional video diffusion models for solving complex\nvisual RL problems. Our key insight is that lower generative diversity is\nobserved when conditioned on expert trajectories. Diffusion Reward is\naccordingly formalized by the negative of conditional entropy that encourages\nproductive exploration of expert-like behaviors. We show the efficacy of our\nmethod over 10 robotic manipulation tasks from MetaWorld and Adroit with visual\ninput and sparse reward. 
Moreover, Diffusion Reward could even solve unseen\ntasks successfully and effectively, largely surpassing baseline methods.\nProject page and code: https://diffusion-reward.github.io/.\n","authors":["Tao Huang","Guangqi Jiang","Yanjie Ze","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2312.14134v2.pdf","comment":"Project page and code: https://diffusion-reward.github.io/"},{"id":"http://arxiv.org/abs/2311.10319v3","updated":"2024-03-18T04:01:53Z","published":"2023-11-17T04:04:29Z","title":"Shifting to Machine Supervision: Annotation-Efficient Semi and\n Self-Supervised Learning for Automatic Medical Image Segmentation and\n Classification","summary":" Advancements in clinical treatment are increasingly constrained by the\nlimitations of supervised learning techniques, which depend heavily on large\nvolumes of annotated data. The annotation process is not only costly but also\ndemands substantial time from clinical specialists. Addressing this issue, we\nintroduce the S4MI (Self-Supervision and Semi-Supervision for Medical Imaging)\npipeline, a novel approach that leverages the advancements in self-supervised\nand semi-supervised learning. These techniques engage in auxiliary tasks that\ndo not require labeling, thus simplifying the scaling of machine supervision\ncompared to fully-supervised methods. Our study benchmarks these techniques on\nthree distinct medical imaging datasets to evaluate their effectiveness in\nclassification and segmentation tasks. Notably, we observed that\nself-supervised learning significantly surpassed the performance of supervised\nmethods in the classification of all evaluated datasets. Remarkably, the\nsemi-supervised approach demonstrated superior outcomes in segmentation,\noutperforming fully-supervised methods while using 50% fewer labels across all\ndatasets. In line with our commitment to contributing to the scientific\ncommunity, we have made the S4MI code openly accessible, allowing for broader\napplication and further development of these methods.\n","authors":["Pranav Singh","Raviteja Chukkapalli","Shravan Chaudhari","Luoyao Chen","Mei Chen","Jinqian Pan","Craig Smuda","Jacopo Cirrone"],"pdf_url":"https://arxiv.org/pdf/2311.10319v3.pdf","comment":"Seventeen pages (incl. references), five figures, and one table.\n (Under Review)"},{"id":"http://arxiv.org/abs/2403.11453v1","updated":"2024-03-18T04:01:26Z","published":"2024-03-18T04:01:26Z","title":"Bridging 3D Gaussian and Mesh for Freeview Video Rendering","summary":" This is only a preview version of GauMesh. Recently, primitive-based\nrendering has been proven to achieve convincing results in solving the problem\nof modeling and rendering the 3D dynamic scene from 2D images. Despite this, in\nthe context of novel view synthesis, each type of primitive has its inherent\ndefects in terms of representation ability. It is difficult to exploit the mesh\nto depict the fuzzy geometry. Meanwhile, the point-based splatting (e.g. the 3D\nGaussian Splatting) method usually produces artifacts or blurry pixels in the\narea with smooth geometry and sharp textures. As a result, it is difficult,\neven not impossible, to represent the complex and dynamic scene with a single\ntype of primitive. To this end, we propose a novel approach, GauMesh, to bridge\nthe 3D Gaussian and Mesh for modeling and rendering the dynamic scenes. Given a\nsequence of tracked mesh as initialization, our goal is to simultaneously\noptimize the mesh geometry, color texture, opacity maps, a set of 3D Gaussians,\nand the deformation field. 
At a specific time, we perform $\\alpha$-blending on\nthe RGB and opacity values based on the merged and re-ordered z-buffers from\nmesh and 3D Gaussian rasterizations. This produces the final rendering, which\nis supervised by the ground-truth image. Experiments demonstrate that our\napproach adapts the appropriate type of primitives to represent the different\nparts of the dynamic scene and outperforms all the baseline methods in both\nquantitative and qualitative comparisons without losing render speed.\n","authors":["Yuting Xiao","Xuan Wang","Jiafei Li","Hongrui Cai","Yanbo Fan","Nan Xue","Minghui Yang","Yujun Shen","Shenghua Gao"],"pdf_url":"https://arxiv.org/pdf/2403.11453v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2403.11451v1","updated":"2024-03-18T03:59:43Z","published":"2024-03-18T03:59:43Z","title":"CasSR: Activating Image Power for Real-World Image Super-Resolution","summary":" The objective of image super-resolution is to generate clean and\nhigh-resolution images from degraded versions. Recent advancements in diffusion\nmodeling have led to the emergence of various image super-resolution techniques\nthat leverage pretrained text-to-image (T2I) models. Nevertheless, due to the\nprevalent severe degradation in low-resolution images and the inherent\ncharacteristics of diffusion models, achieving high-fidelity image restoration\nremains challenging. Existing methods often exhibit issues including semantic\nloss, artifacts, and the introduction of spurious content not present in the\noriginal image. To tackle this challenge, we propose Cascaded diffusion for\nSuper-Resolution, CasSR , a novel method designed to produce highly detailed\nand realistic images. In particular, we develop a cascaded controllable\ndiffusion model that aims to optimize the extraction of information from\nlow-resolution images. This model generates a preliminary reference image to\nfacilitate initial information extraction and degradation mitigation.\nFurthermore, we propose a multi-attention mechanism to enhance the T2I model's\ncapability in maximizing the restoration of the original image content. Through\na comprehensive blend of qualitative and quantitative analyses, we substantiate\nthe efficacy and superiority of our approach.\n","authors":["Haolan Chen","Jinhua Hao","Kai Zhao","Kun Yuan","Ming Sun","Chao Zhou","Wei Hu"],"pdf_url":"https://arxiv.org/pdf/2403.11451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11450v1","updated":"2024-03-18T03:59:24Z","published":"2024-03-18T03:59:24Z","title":"Zero-shot Compound Expression Recognition with Visual Language Model at\n the 6th ABAW Challenge","summary":" Conventional approaches to facial expression recognition primarily focus on\nthe classification of six basic facial expressions. Nevertheless, real-world\nsituations present a wider range of complex compound expressions that consist\nof combinations of these basics ones due to limited availability of\ncomprehensive training datasets. The 6th Workshop and Competition on Affective\nBehavior Analysis in-the-wild (ABAW) offered unlabeled datasets containing\ncompound expressions. 
In this study, we propose a zero-shot approach for\nrecognizing compound expressions by leveraging a pretrained visual language\nmodel integrated with some traditional CNN networks.\n","authors":["Jiahe Wang","Jiale Huang","Bingzhao Cai","Yifan Cao","Xin Yun","Shangfei Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11450v1.pdf","comment":"USTC-AC's paper for Compound Expression (CE) Recognition Challenge in\n 6th Workshop and Competition on Affective Behavior Analysis in-the-wild\n (ABAW)"},{"id":"http://arxiv.org/abs/2403.11448v1","updated":"2024-03-18T03:54:01Z","published":"2024-03-18T03:54:01Z","title":"Robust Overfitting Does Matter: Test-Time Adversarial Purification With\n FGSM","summary":" Numerous studies have demonstrated the susceptibility of deep neural networks\n(DNNs) to subtle adversarial perturbations, prompting the development of many\nadvanced adversarial defense methods aimed at mitigating adversarial attacks.\nCurrent defense strategies usually train DNNs for a specific adversarial attack\nmethod and can achieve good robustness in defense against this type of\nadversarial attack. Nevertheless, when subjected to evaluations involving\nunfamiliar attack modalities, empirical evidence reveals a pronounced\ndeterioration in the robustness of DNNs. Meanwhile, there is a trade-off\nbetween the classification accuracy of clean examples and adversarial examples.\nMost defense methods often sacrifice the accuracy of clean examples in order to\nimprove the adversarial robustness of DNNs. To alleviate these problems and\nenhance the overall robust generalization of DNNs, we propose the Test-Time\nPixel-Level Adversarial Purification (TPAP) method. This approach is based on\nthe robust overfitting characteristic of DNNs to the fast gradient sign method\n(FGSM) on training and test datasets. It utilizes FGSM for adversarial\npurification, to process images for purifying unknown adversarial perturbations\nfrom pixels at testing time in a \"counter changes with changelessness\" manner,\nthereby enhancing the defense capability of DNNs against various unknown\nadversarial attacks. Extensive experimental results show that our method can\neffectively improve both overall robust generalization of DNNs, notably over\nprevious methods.\n","authors":["Linyu Tang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11448v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11447v1","updated":"2024-03-18T03:46:26Z","published":"2024-03-18T03:46:26Z","title":"Motion-aware 3D Gaussian Splatting for Efficient Dynamic Scene\n Reconstruction","summary":" 3D Gaussian Splatting (3DGS) has become an emerging tool for dynamic scene\nreconstruction. However, existing methods focus mainly on extending static 3DGS\ninto a time-variant representation, while overlooking the rich motion\ninformation carried by 2D observations, thus suffering from performance\ndegradation and model redundancy. To address the above problem, we propose a\nnovel motion-aware enhancement framework for dynamic scene reconstruction,\nwhich mines useful motion cues from optical flow to improve different paradigms\nof dynamic 3DGS. Specifically, we first establish a correspondence between 3D\nGaussian movements and pixel-level flow. Then a novel flow augmentation method\nis introduced with additional insights into uncertainty and loss collaboration.\nMoreover, for the prevalent deformation-based paradigm that presents a harder\noptimization problem, a transient-aware deformation auxiliary module is\nproposed. 
We conduct extensive experiments on both multi-view and monocular\nscenes to verify the merits of our work. Compared with the baselines, our\nmethod shows significant superiority in both rendering quality and efficiency.\n","authors":["Zhiyang Guo","Wengang Zhou","Li Li","Min Wang","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2403.11447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11490v5","updated":"2024-03-18T03:41:09Z","published":"2023-05-19T07:44:39Z","title":"LLM-CXR: Instruction-Finetuned LLM for CXR Image Understanding and\n Generation","summary":" Following the impressive development of LLMs, vision-language alignment in\nLLMs is actively being researched to enable multimodal reasoning and visual IO.\nThis direction of research is particularly relevant to medical imaging because\nmedical image analysis and generation consist of reasoning based on a\ncombination of visual features and prior knowledge. Many recent works have\nfocused on training adapter networks that serve as an information bridge\nbetween image processing networks and LLMs; but presumably, in order to achieve\nmaximum reasoning potential of LLMs on visual information as well, visual and\nlanguage features should be allowed to interact more freely. This is especially\nimportant in the medical domain because understanding and generating medical\nimages such as chest X-rays (CXR) require not only accurate visual and\nlanguage-based reasoning but also a more intimate mapping between the two\nmodalities. Thus, taking inspiration from previous work on the transformer and\nVQ-GAN combination for bidirectional image and text generation, we build upon\nthis approach and develop a method for instruction-tuning an LLM pre-trained\nonly on text to gain vision-language capabilities for medical images.\nSpecifically, we leverage a pretrained LLM's existing question-answering and\ninstruction-following abilities to teach it to understand visual inputs by\ninstructing it to answer questions about image inputs and, symmetrically,\noutput both text and image responses appropriate to a given query by tuning the\nLLM with diverse tasks that encompass image-based text-generation and\ntext-based image-generation. We show that our model, LLM-CXR, trained in this\napproach shows better image-text alignment in both CXR understanding and\ngeneration tasks while being smaller in size compared to previously developed\nmodels that perform a narrower range of tasks. The code is at\nhttps://github.com/hyn2028/llm-cxr.\n","authors":["Suhyeon Lee","Won Jun Kim","Jinho Chang","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2305.11490v5.pdf","comment":"21 pages, 8 figures; ICLR 2024 (poster)"},{"id":"http://arxiv.org/abs/2403.09065v2","updated":"2024-03-18T03:37:54Z","published":"2024-03-14T03:12:02Z","title":"When Semantic Segmentation Meets Frequency Aliasing","summary":" Despite recent advancements in semantic segmentation, where and what pixels\nare hard to segment remains largely unexplored. Existing research only\nseparates an image into easy and hard regions and empirically observes the\nlatter are associated with object boundaries. In this paper, we conduct a\ncomprehensive analysis of hard pixel errors, categorizing them into three\ntypes: false responses, merging mistakes, and displacements. Our findings\nreveal a quantitative association between hard pixels and aliasing, which is\ndistortion caused by the overlapping of frequency components in the Fourier\ndomain during downsampling. 
To identify the frequencies responsible for\naliasing, we propose using the equivalent sampling rate to calculate the\nNyquist frequency, which marks the threshold for aliasing. Then, we introduce\nthe aliasing score as a metric to quantify the extent of aliasing. While\npositively correlated with the proposed aliasing score, three types of hard\npixels exhibit different patterns. Here, we propose two novel de-aliasing\nfilter (DAF) and frequency mixing (FreqMix) modules to alleviate aliasing\ndegradation by accurately removing or adjusting frequencies higher than the\nNyquist frequency. The DAF precisely removes the frequencies responsible for\naliasing before downsampling, while the FreqMix dynamically selects\nhigh-frequency components within the encoder block. Experimental results\ndemonstrate consistent improvements in semantic segmentation and low-light\ninstance segmentation tasks. The code is available at:\nhttps://github.com/Linwei-Chen/Seg-Aliasing.\n","authors":["Linwei Chen","Lin Gu","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2403.09065v2.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2310.18917v4","updated":"2024-03-18T03:37:31Z","published":"2023-10-29T06:10:46Z","title":"TivNe-SLAM: Dynamic Mapping and Tracking via Time-Varying Neural\n Radiance Fields","summary":" Previous attempts to integrate Neural Radiance Fields (NeRF) into the\nSimultaneous Localization and Mapping (SLAM) framework either rely on the\nassumption of static scenes or require the ground truth camera poses, which\nimpedes their application in real-world scenarios. In this paper, we propose a\ntime-varying representation to track and reconstruct the dynamic scenes.\nFirstly, two processes, tracking process and mapping process, are\nsimultaneously maintained in our framework. For the tracking process, all input\nimages are uniformly sampled, then progressively trained in a self-supervised\nparadigm. For the mapping process, we leverage motion masks to distinguish\ndynamic objects from static background, and sample more pixels from dynamic\nareas. Secondly, the parameter optimization for both processes consists of two\nstages: the first stage associates time with 3D positions to convert the\ndeformation field to the canonical field. And the second stage associates time\nwith the embeddings of canonical field to obtain colors and Signed Distance\nFunction (SDF). Lastly, we propose a novel keyframe selection strategy based on\nthe overlapping rate. We evaluate our approach on two synthetic datasets and\none real-world dataset. And the experiments validate that our method achieves\ncompetitive results in both tracking and mapping when compared to existing\nstate-of-the-art NeRF-based methods.\n","authors":["Chengyao Duan","Zhiliu Yang"],"pdf_url":"https://arxiv.org/pdf/2310.18917v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11440v1","updated":"2024-03-18T03:28:01Z","published":"2024-03-18T03:28:01Z","title":"Boosting Continuous Emotion Recognition with Self-Pretraining using\n Masked Autoencoders, Temporal Convolutional Networks, and Transformers","summary":" Human emotion recognition holds a pivotal role in facilitating seamless\nhuman-computer interaction. This paper delineates our methodology in tackling\nthe Valence-Arousal (VA) Estimation Challenge, Expression (Expr) Classification\nChallenge, and Action Unit (AU) Detection Challenge within the ambit of the 6th\nWorkshop and Competition on Affective Behavior Analysis in-the-wild (ABAW). 
Our\nstudy advocates a novel approach aimed at refining continuous emotion\nrecognition. We achieve this by initially harnessing pre-training with Masked\nAutoencoders (MAE) on facial datasets, followed by fine-tuning on the aff-wild2\ndataset annotated with expression (Expr) labels. The pre-trained model serves\nas an adept visual feature extractor, thereby enhancing the model's robustness.\nFurthermore, we bolster the performance of continuous emotion recognition by\nintegrating Temporal Convolutional Network (TCN) modules and Transformer\nEncoder modules into our framework.\n","authors":["Weiwei Zhou","Jiada Lu","Chenkun Ling","Weifeng Wang","Shaowei Liu"],"pdf_url":"https://arxiv.org/pdf/2403.11440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19160v2","updated":"2024-03-18T03:26:02Z","published":"2024-02-29T13:44:19Z","title":"Effective Message Hiding with Order-Preserving Mechanisms","summary":" Message hiding, a technique that conceals secret message bits within a cover\nimage, aims to achieve an optimal balance among message capacity, recovery\naccuracy, and imperceptibility. While convolutional neural networks have\nnotably improved message capacity and imperceptibility, achieving high recovery\naccuracy remains challenging. This challenge arises because convolutional\noperations struggle to preserve the sequential order of message bits and\neffectively address the discrepancy between these two modalities. To address\nthis, we propose StegaFormer, an innovative MLP-based framework designed to\npreserve bit order and enable global fusion between modalities. Specifically,\nStegaFormer incorporates three crucial components: Order-Preserving Message\nEncoder (OPME), Decoder (OPMD) and Global Message-Image Fusion (GMIF). OPME and\nOPMD aim to preserve the order of message bits by segmenting the entire\nsequence into equal-length segments and incorporating sequential information\nduring encoding and decoding. Meanwhile, GMIF employs a cross-modality fusion\nmechanism to effectively fuse the features from the two uncorrelated\nmodalities. Experimental results on the COCO and DIV2K datasets demonstrate\nthat StegaFormer surpasses existing state-of-the-art methods in terms of\nrecovery accuracy, message capacity, and imperceptibility. We will make our\ncode publicly available.\n","authors":["Gao Yu","Qiu Xuchong","Ye Zihan"],"pdf_url":"https://arxiv.org/pdf/2402.19160v2.pdf","comment":"7 Pages"},{"id":"http://arxiv.org/abs/2402.16594v3","updated":"2024-03-18T03:21:31Z","published":"2024-02-26T14:18:12Z","title":"CURSOR: Scalable Mixed-Order Hypergraph Matching with CUR Decomposition","summary":" To achieve greater accuracy, hypergraph matching algorithms require\nexponential increases in computational resources. Recent kd-tree-based\napproximate nearest neighbor (ANN) methods, despite the sparsity of their\ncompatibility tensor, still require exhaustive calculations for large-scale\ngraph matching. This work utilizes CUR tensor decomposition and introduces a\nnovel cascaded second and third-order hypergraph matching framework (CURSOR)\nfor efficient hypergraph matching. A CUR-based second-order graph matching\nalgorithm is used to provide a rough match, and then the core of CURSOR, a\nfiber-CUR-based tensor generation method, directly calculates entries of the\ncompatibility tensor by leveraging the initial second-order match result. This\nsignificantly decreases the time complexity and tensor density. 
A probability\nrelaxation labeling (PRL)-based matching algorithm, especially suitable for\nsparse tensors, is developed. Experiment results on large-scale synthetic\ndatasets and widely-adopted benchmark sets demonstrate the superiority of\nCURSOR over existing methods. The tensor generation method in CURSOR can be\nintegrated seamlessly into existing hypergraph matching methods to improve\ntheir performance and lower their computational costs.\n","authors":["Qixuan Zheng","Ming Zhang","Hong Yan"],"pdf_url":"https://arxiv.org/pdf/2402.16594v3.pdf","comment":"Accepted to CVPR 2024. The final camera-ready version. 15 pages with\n supplementary materials and 11 figures. Minor grammarly and syntax errors\n fixed. Irrelavant hyperrefs removed. Authorship information amended"},{"id":"http://arxiv.org/abs/2403.09136v2","updated":"2024-03-18T03:18:37Z","published":"2024-03-14T07:21:46Z","title":"Biophysics Informed Pathological Regularisation for Brain Tumour\n Segmentation","summary":" Recent advancements in deep learning have significantly improved brain tumour\nsegmentation techniques; however, the results still lack confidence and\nrobustness as they solely consider image data without biophysical priors or\npathological information. Integrating biophysics-informed regularisation is one\neffective way to change this situation, as it provides an prior regularisation\nfor automated end-to-end learning. In this paper, we propose a novel approach\nthat designs brain tumour growth Partial Differential Equation (PDE) models as\na regularisation with deep learning, operational with any network model. Our\nmethod introduces tumour growth PDE models directly into the segmentation\nprocess, improving accuracy and robustness, especially in data-scarce\nscenarios. This system estimates tumour cell density using a periodic\nactivation function. By effectively integrating this estimation with\nbiophysical models, we achieve a better capture of tumour characteristics. This\napproach not only aligns the segmentation closer to actual biological behaviour\nbut also strengthens the model's performance under limited data conditions. We\ndemonstrate the effectiveness of our framework through extensive experiments on\nthe BraTS 2023 dataset, showcasing significant improvements in both precision\nand reliability of tumour segmentation.\n","authors":["Lipei Zhang","Yanqi Cheng","Lihao Liu","Carola-Bibiane Schönlieb","Angelica I Aviles-Rivero"],"pdf_url":"https://arxiv.org/pdf/2403.09136v2.pdf","comment":"11 pages, 4 figures and 1 table"},{"id":"http://arxiv.org/abs/2401.08178v2","updated":"2024-03-18T03:15:39Z","published":"2024-01-16T07:51:00Z","title":"Key-point Guided Deformable Image Manipulation Using Diffusion Model","summary":" In this paper, we introduce a Key-point-guided Diffusion probabilistic Model\n(KDM) that gains precise control over images by manipulating the object's\nkey-point. We propose a two-stage generative model incorporating an optical\nflow map as an intermediate output. By doing so, a dense pixel-wise\nunderstanding of the semantic relation between the image and sparse key point\nis configured, leading to more realistic image generation. Additionally, the\nintegration of optical flow helps regulate the inter-frame variance of\nsequential images, demonstrating an authentic sequential image generation. 
The\nKDM is evaluated with diverse key-point conditioned image synthesis tasks,\nincluding facial image generation, human pose synthesis, and echocardiography\nvideo prediction, demonstrating the KDM is proving consistency enhanced and\nphoto-realistic images compared with state-of-the-art models.\n","authors":["Seok-Hwan Oh","Guil Jung","Myeong-Gee Kim","Sang-Yun Kim","Young-Min Kim","Hyeon-Jik Lee","Hyuk-Sool Kwon","Hyeon-Min Bae"],"pdf_url":"https://arxiv.org/pdf/2401.08178v2.pdf","comment":"1. The ideas and approaches for the existing network have undergone\n significant revisions, along with changes in the dataset, resulting in an\n overall overhaul. I am planning to upload the newly written paper. 2. All\n authors have agreed to these decisions"},{"id":"http://arxiv.org/abs/2401.14846v2","updated":"2024-03-18T03:09:57Z","published":"2024-01-26T13:27:15Z","title":"Understanding Domain Generalization: A Noise Robustness Perspective","summary":" Despite the rapid development of machine learning algorithms for domain\ngeneralization (DG), there is no clear empirical evidence that the existing DG\nalgorithms outperform the classic empirical risk minimization (ERM) across\nstandard benchmarks. To better understand this phenomenon, we investigate\nwhether there are benefits of DG algorithms over ERM through the lens of label\nnoise. Specifically, our finite-sample analysis reveals that label noise\nexacerbates the effect of spurious correlations for ERM, undermining\ngeneralization. Conversely, we illustrate that DG algorithms exhibit implicit\nlabel-noise robustness during finite-sample training even when spurious\ncorrelation is present. Such desirable property helps mitigate spurious\ncorrelations and improve generalization in synthetic experiments. However,\nadditional comprehensive experiments on real-world benchmark datasets indicate\nthat label-noise robustness does not necessarily translate to better\nperformance compared to ERM. We conjecture that the failure mode of ERM arising\nfrom spurious correlations may be less pronounced in practice.\n","authors":["Rui Qiao","Bryan Kian Hsiang Low"],"pdf_url":"https://arxiv.org/pdf/2401.14846v2.pdf","comment":"Accepted to the 12th International Conference on Learning\n Representations (ICLR 2024). Code is available at\n https://github.com/qiaoruiyt/NoiseRobustDG"},{"id":"http://arxiv.org/abs/2403.06467v2","updated":"2024-03-18T02:56:26Z","published":"2024-03-11T07:07:39Z","title":"Point Mamba: A Novel Point Cloud Backbone Based on State Space Model\n with Octree-Based Ordering Strategy","summary":" Recently, state space model (SSM) has gained great attention due to its\npromising performance, linear complexity, and long sequence modeling ability in\nboth language and image domains. However, it is non-trivial to extend SSM to\nthe point cloud field, because of the causality requirement of SSM and the\ndisorder and irregularity nature of point clouds. In this paper, we propose a\nnovel SSM-based point cloud processing backbone, named Point Mamba, with a\ncausality-aware ordering mechanism. To construct the causal dependency\nrelationship, we design an octree-based ordering strategy on raw irregular\npoints, globally sorting points in a z-order sequence and also retaining their\nspatial proximity. Our method achieves state-of-the-art performance compared\nwith transformer-based counterparts, with 93.4% accuracy and 75.7 mIOU\nrespectively on the ModelNet40 classification dataset and ScanNet semantic\nsegmentation dataset. 
Furthermore, our Point Mamba has linear complexity, which\nis more efficient than transformer-based methods. Our method demonstrates the\ngreat potential that SSM can serve as a generic backbone in point cloud\nunderstanding. Codes are released at https://github.com/IRMVLab/Point-Mamba.\n","authors":["Jiuming Liu","Ruiji Yu","Yian Wang","Yu Zheng","Tianchen Deng","Weicai Ye","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.06467v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07030v2","updated":"2024-03-18T02:45:04Z","published":"2024-03-11T03:34:14Z","title":"AuG-KD: Anchor-Based Mixup Generation for Out-of-Domain Knowledge\n Distillation","summary":" Due to privacy or patent concerns, a growing number of large models are\nreleased without granting access to their training data, making transferring\ntheir knowledge inefficient and problematic. In response, Data-Free Knowledge\nDistillation (DFKD) methods have emerged as direct solutions. However, simply\nadopting models derived from DFKD for real-world applications suffers\nsignificant performance degradation, due to the discrepancy between teachers'\ntraining data and real-world scenarios (student domain). The degradation stems\nfrom the portions of teachers' knowledge that are not applicable to the student\ndomain. They are specific to the teacher domain and would undermine students'\nperformance. Hence, selectively transferring teachers' appropriate knowledge\nbecomes the primary challenge in DFKD. In this work, we propose a simple but\neffective method AuG-KD. It utilizes an uncertainty-guided and sample-specific\nanchor to align student-domain data with the teacher domain and leverages a\ngenerative method to progressively trade off the learning process between OOD\nknowledge distillation and domain-specific information learning via mixup\nlearning. Extensive experiments in 3 datasets and 8 settings demonstrate the\nstability and superiority of our approach. Code available at\nhttps://github.com/IshiKura-a/AuG-KD .\n","authors":["Zihao Tang","Zheqi Lv","Shengyu Zhang","Yifan Zhou","Xinyu Duan","Fei Wu","Kun Kuang"],"pdf_url":"https://arxiv.org/pdf/2403.07030v2.pdf","comment":"Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2310.04152v2","updated":"2024-03-18T02:44:59Z","published":"2023-10-06T10:55:34Z","title":"Improving Neural Radiance Field using Near-Surface Sampling with Point\n Cloud Generation","summary":" Neural radiance field (NeRF) is an emerging view synthesis method that\nsamples points in a three-dimensional (3D) space and estimates their existence\nand color probabilities. The disadvantage of NeRF is that it requires a long\ntraining time since it samples many 3D points. In addition, if one samples\npoints from occluded regions or in the space where an object is unlikely to\nexist, the rendering quality of NeRF can be degraded. These issues can be\nsolved by estimating the geometry of 3D scene. This paper proposes a\nnear-surface sampling framework to improve the rendering quality of NeRF. To\nthis end, the proposed method estimates the surface of a 3D object using depth\nimages of the training set and sampling is performed around there only. To\nobtain depth information on a novel view, the paper proposes a 3D point cloud\ngeneration method and a simple refining method for projected depth from a point\ncloud. 
Experimental results show that the proposed near-surface sampling NeRF\nframework can significantly improve the rendering quality, compared to the\noriginal NeRF and three different state-of-the-art NeRF. In addition, one can\nsignificantly accelerate the training time of a NeRF model with the proposed\nnear-surface sampling framework.\n","authors":["Hye Bin Yoo","Hyun Min Han","Sung Soo Hwang","Il Yong Chun"],"pdf_url":"https://arxiv.org/pdf/2310.04152v2.pdf","comment":"14 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.11427v1","updated":"2024-03-18T02:44:46Z","published":"2024-03-18T02:44:46Z","title":"BAGS: Building Animatable Gaussian Splatting from a Monocular Video with\n Diffusion Priors","summary":" Animatable 3D reconstruction has significant applications across various\nfields, primarily relying on artists' handcraft creation. Recently, some\nstudies have successfully constructed animatable 3D models from monocular\nvideos. However, these approaches require sufficient view coverage of the\nobject within the input video and typically necessitate significant time and\ncomputational costs for training and rendering. This limitation restricts the\npractical applications. In this work, we propose a method to build animatable\n3D Gaussian Splatting from monocular video with diffusion priors. The 3D\nGaussian representations significantly accelerate the training and rendering\nprocess, and the diffusion priors allow the method to learn 3D models with\nlimited viewpoints. We also present the rigid regularization to enhance the\nutilization of the priors. We perform an extensive evaluation across various\nreal-world videos, demonstrating its superior performance compared to the\ncurrent state-of-the-art methods.\n","authors":["Tingyang Zhang","Qingzhe Gao","Weiyu Li","Libin Liu","Baoquan Chen"],"pdf_url":"https://arxiv.org/pdf/2403.11427v1.pdf","comment":"https://talegqz.github.io/BAGS/"},{"id":"http://arxiv.org/abs/2310.14566v4","updated":"2024-03-18T02:42:10Z","published":"2023-10-23T04:49:09Z","title":"HallusionBench: An Advanced Diagnostic Suite for Entangled Language\n Hallucination and Visual Illusion in Large Vision-Language Models","summary":" We introduce HallusionBench, a comprehensive benchmark designed for the\nevaluation of image-context reasoning. This benchmark presents significant\nchallenges to advanced large visual-language models (LVLMs), such as\nGPT-4V(Vision), Gemini Pro Vision, Claude 3, and LLaVA-1.5, by emphasizing\nnuanced understanding and interpretation of visual data. The benchmark\ncomprises 346 images paired with 1129 questions, all meticulously crafted by\nhuman experts. We introduce a novel structure for these visual questions\ndesigned to establish control groups. This structure enables us to conduct a\nquantitative analysis of the models' response tendencies, logical consistency,\nand various failure modes. In our evaluation on HallusionBench, we benchmarked\n15 different models, highlighting a 31.42% question-pair accuracy achieved by\nthe state-of-the-art GPT-4V. Notably, all other evaluated models achieve\naccuracy below 16%. Moreover, our analysis not only highlights the observed\nfailure modes, including language hallucination and visual illusion, but also\ndeepens an understanding of these pitfalls. Our comprehensive case studies\nwithin HallusionBench shed light on the challenges of hallucination and\nillusion in LVLMs. Based on these insights, we suggest potential pathways for\ntheir future improvement. 
The benchmark and codebase can be accessed at\nhttps://github.com/tianyi-lab/HallusionBench.\n","authors":["Tianrui Guan","Fuxiao Liu","Xiyang Wu","Ruiqi Xian","Zongxia Li","Xiaoyu Liu","Xijun Wang","Lichang Chen","Furong Huang","Yaser Yacoob","Dinesh Manocha","Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.14566v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11424v1","updated":"2024-03-18T02:39:21Z","published":"2024-03-18T02:39:21Z","title":"Benchmarking the Robustness of UAV Tracking Against Common Corruptions","summary":" The robustness of unmanned aerial vehicle (UAV) tracking is crucial in many\ntasks like surveillance and robotics. Despite its importance, little attention\nis paid to the performance of UAV trackers under common corruptions due to lack\nof a dedicated platform. Addressing this, we propose UAV-C, a large-scale\nbenchmark for assessing robustness of UAV trackers under common corruptions.\nSpecifically, UAV-C is built upon two popular UAV datasets by introducing 18\ncommon corruptions from 4 representative categories including adversarial,\nsensor, blur, and composite corruptions in different levels. Finally, UAV-C\ncontains more than 10K sequences. To understand the robustness of existing UAV\ntrackers against corruptions, we extensively evaluate 12 representative\nalgorithms on UAV-C. Our study reveals several key findings: 1) Current\ntrackers are vulnerable to corruptions, indicating more attention needed in\nenhancing the robustness of UAV trackers; 2) When accompanying together,\ncomposite corruptions result in more severe degradation to trackers; and 3)\nWhile each tracker has its unique performance profile, some trackers may be\nmore sensitive to specific corruptions. By releasing UAV-C, we hope it, along\nwith comprehensive analysis, serves as a valuable resource for advancing the\nrobustness of UAV tracking against corruption. Our UAV-C will be available at\nhttps://github.com/Xiaoqiong-Liu/UAV-C.\n","authors":["Xiaoqiong Liu","Yunhe Feng","Shu Hu","Xiaohui Yuan","Heng Fan"],"pdf_url":"https://arxiv.org/pdf/2403.11424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11423v1","updated":"2024-03-18T02:38:55Z","published":"2024-03-18T02:38:55Z","title":"VmambaIR: Visual State Space Model for Image Restoration","summary":" Image restoration is a critical task in low-level computer vision, aiming to\nrestore high-quality images from degraded inputs. Various models, such as\nconvolutional neural networks (CNNs), generative adversarial networks (GANs),\ntransformers, and diffusion models (DMs), have been employed to address this\nproblem with significant impact. However, CNNs have limitations in capturing\nlong-range dependencies. DMs require large prior models and computationally\nintensive denoising steps. Transformers have powerful modeling capabilities but\nface challenges due to quadratic complexity with input image size. To address\nthese challenges, we propose VmambaIR, which introduces State Space Models\n(SSMs) with linear complexity into comprehensive image restoration tasks. We\nutilize a Unet architecture to stack our proposed Omni Selective Scan (OSS)\nblocks, consisting of an OSS module and an Efficient Feed-Forward Network\n(EFFN). Our proposed omni selective scan mechanism overcomes the unidirectional\nmodeling limitation of SSMs by efficiently modeling image information flows in\nall six directions. 
Furthermore, we conducted a comprehensive evaluation of our\nVmambaIR across multiple image restoration tasks, including image deraining,\nsingle image super-resolution, and real-world image super-resolution. Extensive\nexperimental results demonstrate that our proposed VmambaIR achieves\nstate-of-the-art (SOTA) performance with much fewer computational resources and\nparameters. Our research highlights the potential of state space models as\npromising alternatives to the transformer and CNN architectures in serving as\nfoundational frameworks for next-generation low-level visual tasks.\n","authors":["Yuan Shi","Bin Xia","Xiaoyu Jin","Xing Wang","Tianyu Zhao","Xin Xia","Xuefeng Xiao","Wenming Yang"],"pdf_url":"https://arxiv.org/pdf/2403.11423v1.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2403.09195v2","updated":"2024-03-18T02:30:17Z","published":"2024-03-14T09:07:34Z","title":"SAM-Lightening: A Lightweight Segment Anything Model with Dilated Flash\n Attention to Achieve 30 times Acceleration","summary":" Segment Anything Model (SAM) has garnered significant attention in\nsegmentation tasks due to their zero-shot generalization ability. However, a\nbroader application of SAMs to real-world practice has been restricted by their\nlow inference speed and high computational memory demands, which mainly stem\nfrom the attention mechanism. Existing work concentrated on optimizing the\nencoder, yet has not adequately addressed the inefficiency of the attention\nmechanism itself, even when distilled to a smaller model, which thus leaves\nspace for further improvement. In response, we introduce SAM-Lightening, a\nvariant of SAM, that features a re-engineered attention mechanism, termed\nDilated Flash Attention. It not only facilitates higher parallelism, enhancing\nprocessing efficiency but also retains compatibility with the existing\nFlashAttention. Correspondingly, we propose a progressive distillation to\nenable an efficient knowledge transfer from the vanilla SAM without costly\ntraining from scratch. Experiments on COCO and LVIS reveal that SAM-Lightening\nsignificantly outperforms the state-of-the-art methods in both run-time\nefficiency and segmentation accuracy. Specifically, it can achieve an inference\nspeed of 7 milliseconds (ms) per image, for images of size 1024*1024 pixels,\nwhich is 30.1 times faster than the vanilla SAM and 2.1 times than the\nstate-of-the-art. Moreover, it takes only 244MB memory, which is 3.5\\% of the\nvanilla SAM. The code and weights are available at\nhttps://anonymous.4open.science/r/SAM-LIGHTENING-BC25/.\n","authors":["Yanfei Song","Bangzheng Pu","Peng Wang","Hongxu Jiang","Dong Dong","Yongxiang Cao","Yiqing Shen"],"pdf_url":"https://arxiv.org/pdf/2403.09195v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11171v2","updated":"2024-03-18T02:12:44Z","published":"2023-11-18T21:27:04Z","title":"LOSTU: Fast, Scalable, and Uncertainty-Aware Triangulation","summary":" This work proposes a non-iterative, scalable, and statistically optimal way\nto triangulate called \\texttt{LOSTU}. Unlike triangulation algorithms that\nminimize the reprojection ($L_2$) error, LOSTU will still provide the maximum\nlikelihood estimate when there are errors in camera pose or parameters. This\ngeneric framework is used to contextualize other triangulation methods like the\ndirect linear transform (DLT) or the midpoint. 
Synthetic experiments show that\nLOSTU can be substantially faster than using uncertainty-aware\nLevenberg-Marquardt (or similar) optimization schemes, while providing results\nof comparable precision. Finally, LOSTU is implemented in sequential\nreconstruction in conjunction with uncertainty-aware pose estimation, where it\nyields better reconstruction metrics.\n","authors":["Sébastien Henry","John A. Christian"],"pdf_url":"https://arxiv.org/pdf/2311.11171v2.pdf","comment":"19 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.11415v1","updated":"2024-03-18T02:08:58Z","published":"2024-03-18T02:08:58Z","title":"DreamSampler: Unifying Diffusion Sampling and Score Distillation for\n Image Manipulation","summary":" Reverse sampling and score-distillation have emerged as main workhorses in\nrecent years for image manipulation using latent diffusion models (LDMs). While\nreverse diffusion sampling often requires adjustments of LDM architecture or\nfeature engineering, score distillation offers a simple yet powerful\nmodel-agnostic approach, but it is often prone to mode-collapsing. To address\nthese limitations and leverage the strengths of both approaches, here we\nintroduce a novel framework called {\\em DreamSampler}, which seamlessly\nintegrates these two distinct approaches through the lens of regularized latent\noptimization. Similar to score-distillation, DreamSampler is a model-agnostic\napproach applicable to any LDM architecture, but it allows both distillation\nand reverse sampling with additional guidance for image editing and\nreconstruction. Through experiments involving image editing, SVG reconstruction\nand etc, we demonstrate the competitive performance of DreamSampler compared to\nexisting approaches, while providing new applications.\n","authors":["Jeongsol Kim","Geon Yeong Park","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2403.11415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10022v2","updated":"2024-03-18T01:57:08Z","published":"2024-03-15T05:08:59Z","title":"Lifelong Person Re-Identification with Backward-Compatibility","summary":" Lifelong person re-identification (LReID) assumes a practical scenario where\nthe model is sequentially trained on continuously incoming datasets while\nalleviating the catastrophic forgetting in the old datasets. However, not only\nthe training datasets but also the gallery images are incrementally\naccumulated, that requires a huge amount of computational complexity and\nstorage space to extract the features at the inference phase. In this paper, we\naddress the above mentioned problem by incorporating the backward-compatibility\nto LReID for the first time. We train the model using the continuously incoming\ndatasets while maintaining the model's compatibility toward the previously\ntrained old models without re-computing the features of the old gallery images.\nTo this end, we devise the cross-model compatibility loss based on the\ncontrastive learning with respect to the replay features across all the old\ndatasets. Moreover, we also develop the knowledge consolidation method based on\nthe part classification to learn the shared representation across different\ndatasets for the backward-compatibility. We suggest a more practical\nmethodology for performance evaluation as well where all the gallery and query\nimages are considered together. Experimental results demonstrate that the\nproposed method achieves a significantly higher performance of the\nbackward-compatibility compared with the existing methods. 
It is a promising\ntool for more practical scenarios of LReID.\n","authors":["Minyoung Oh","Jae-Young Sim"],"pdf_url":"https://arxiv.org/pdf/2403.10022v2.pdf","comment":"17 pages, 5 figures, 7 tables"},{"id":"http://arxiv.org/abs/2403.05369v3","updated":"2024-03-18T01:53:42Z","published":"2024-03-08T15:00:44Z","title":"Frequency-Adaptive Dilated Convolution for Semantic Segmentation","summary":" Dilated convolution, which expands the receptive field by inserting gaps\nbetween its consecutive elements, is widely employed in computer vision. In\nthis study, we propose three strategies to improve individual phases of dilated\nconvolution from the view of spectrum analysis. Departing from the conventional\npractice of fixing a global dilation rate as a hyperparameter, we introduce\nFrequency-Adaptive Dilated Convolution (FADC), which dynamically adjusts\ndilation rates spatially based on local frequency components. Subsequently, we\ndesign two plug-in modules to directly enhance effective bandwidth and\nreceptive field size. The Adaptive Kernel (AdaKern) module decomposes\nconvolution weights into low-frequency and high-frequency components,\ndynamically adjusting the ratio between these components on a per-channel\nbasis. By increasing the high-frequency part of convolution weights, AdaKern\ncaptures more high-frequency components, thereby improving effective bandwidth.\nThe Frequency Selection (FreqSelect) module optimally balances high- and\nlow-frequency components in feature representations through spatially variant\nreweighting. It suppresses high frequencies in the background to encourage FADC\nto learn a larger dilation, thereby increasing the receptive field for an\nexpanded scope. Extensive experiments on segmentation and object detection\nconsistently validate the efficacy of our approach. The code is publicly\navailable at \\url{https://github.com/Linwei-Chen/FADC}.\n","authors":["Linwei Chen","Lin Gu","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2403.05369v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04293v3","updated":"2024-03-18T01:50:08Z","published":"2023-12-07T13:27:37Z","title":"GPT-4V with Emotion: A Zero-shot Benchmark for Generalized Emotion\n Recognition","summary":" Recently, GPT-4 with Vision (GPT-4V) has demonstrated remarkable visual\ncapabilities across various tasks, but its performance in emotion recognition\nhas not been fully evaluated. To bridge this gap, we present the quantitative\nevaluation results of GPT-4V on 21 benchmark datasets covering 6 tasks: visual\nsentiment analysis, tweet sentiment analysis, micro-expression recognition,\nfacial emotion recognition, dynamic facial emotion recognition, and multimodal\nemotion recognition. This paper collectively refers to these tasks as\n``Generalized Emotion Recognition (GER)''. Through experimental analysis, we\nobserve that GPT-4V exhibits strong visual understanding capabilities in GER\ntasks. Meanwhile, GPT-4V shows the ability to integrate multimodal clues and\nexploit temporal information, which is also critical for emotion recognition.\nHowever, it's worth noting that GPT-4V is primarily designed for general\ndomains and cannot recognize micro-expressions that require specialized\nknowledge. To the best of our knowledge, this paper provides the first\nquantitative assessment of GPT-4V for GER tasks. We have open-sourced the code\nand encourage subsequent researchers to broaden the evaluation scope by\nincluding more tasks and datasets. 
Our code and evaluation results are\navailable at: https://github.com/zeroQiaoba/gpt4v-emotion.\n","authors":["Zheng Lian","Licai Sun","Haiyang Sun","Kang Chen","Zhuofan Wen","Hao Gu","Bin Liu","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2312.04293v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08927v2","updated":"2024-03-18T01:22:04Z","published":"2023-09-16T08:46:59Z","title":"DynaMoN: Motion-Aware Fast and Robust Camera Localization for Dynamic\n Neural Radiance Fields","summary":" The accurate reconstruction of dynamic scenes with neural radiance fields is\nsignificantly dependent on the estimation of camera poses. Widely used\nstructure-from-motion pipelines encounter difficulties in accurately tracking\nthe camera trajectory when faced with separate dynamics of the scene content\nand the camera movement. To address this challenge, we propose DynaMoN. DynaMoN\nutilizes semantic segmentation and generic motion masks to handle dynamic\ncontent for initial camera pose estimation and statics-focused ray sampling for\nfast and accurate novel-view synthesis. Our novel iterative learning scheme\nswitches between training the NeRF and updating the pose parameters for an\nimproved reconstruction and trajectory estimation quality. The proposed\npipeline shows significant acceleration of the training process. We extensively\nevaluate our approach on two real-world dynamic datasets, the TUM RGB-D and the\nBONN RGB-D Dynamic dataset. DynaMoN improves over the state-of-the-art both in\nterms of reconstruction quality and trajectory accuracy. We plan to make our\ncode public to enhance research in this area.\n","authors":["Nicolas Schischka","Hannah Schieber","Mert Asim Karaoglu","Melih Görgülü","Florian Grötzner","Alexander Ladikos","Daniel Roth","Nassir Navab","Benjamin Busam"],"pdf_url":"https://arxiv.org/pdf/2309.08927v2.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.11401v1","updated":"2024-03-18T01:18:48Z","published":"2024-03-18T01:18:48Z","title":"Scene-LLM: Extending Language Model for 3D Visual Understanding and\n Reasoning","summary":" This paper introduces Scene-LLM, a 3D-visual-language model that enhances\nembodied agents' abilities in interactive 3D indoor environments by integrating\nthe reasoning strengths of Large Language Models (LLMs). Scene-LLM adopts a\nhybrid 3D visual feature representation, that incorporates dense spatial\ninformation and supports scene state updates. The model employs a projection\nlayer to efficiently project these features in the pre-trained textual\nembedding space, enabling effective interpretation of 3D visual information.\nUnique to our approach is the integration of both scene-level and ego-centric\n3D information. This combination is pivotal for interactive planning, where\nscene-level data supports global planning and ego-centric data is important for\nlocalization. Notably, we use ego-centric 3D frame features for feature\nalignment, an efficient technique that enhances the model's ability to align\nfeatures of small objects within the scene. Our experiments with Scene-LLM\ndemonstrate its strong capabilities in dense captioning, question answering,\nand interactive planning. 
We believe Scene-LLM advances the field of 3D visual\nunderstanding and reasoning, offering new possibilities for sophisticated agent\ninteractions in indoor settings.\n","authors":["Rao Fu","Jingyu Liu","Xilun Chen","Yixin Nie","Wenhan Xiong"],"pdf_url":"https://arxiv.org/pdf/2403.11401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10075v2","updated":"2024-03-18T01:16:04Z","published":"2024-03-15T07:34:08Z","title":"A survey of synthetic data augmentation methods in computer vision","summary":" The standard approach to tackling computer vision problems is to train deep\nconvolutional neural network (CNN) models using large-scale image datasets\nwhich are representative of the target task. However, in many scenarios, it is\noften challenging to obtain sufficient image data for the target task. Data\naugmentation is a way to mitigate this challenge. A common practice is to\nexplicitly transform existing images in desired ways so as to create the\nrequired volume and variability of training data necessary to achieve good\ngeneralization performance. In situations where data for the target domain is\nnot accessible, a viable workaround is to synthesize training data from\nscratch--i.e., synthetic data augmentation. This paper presents an extensive\nreview of synthetic data augmentation techniques. It covers data synthesis\napproaches based on realistic 3D graphics modeling, neural style transfer\n(NST), differential neural rendering, and generative artificial intelligence\n(AI) techniques such as generative adversarial networks (GANs) and variational\nautoencoders (VAEs). For each of these classes of methods, we focus on the\nimportant data generation and augmentation techniques, general scope of\napplication and specific use-cases, as well as existing limitations and\npossible workarounds. Additionally, we provide a summary of common synthetic\ndatasets for training computer vision models, highlighting the main features,\napplication domains and supported tasks. Finally, we discuss the effectiveness\nof synthetic data augmentation methods. Since this is the first paper to\nexplore synthetic data augmentation methods in great detail, we are hoping to\nequip readers with the necessary background information and in-depth knowledge\nof existing methods and their attendant issues.\n","authors":["Alhassan Mumuni","Fuseini Mumuni","Nana Kobina Gerrar"],"pdf_url":"https://arxiv.org/pdf/2403.10075v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11397v1","updated":"2024-03-18T01:11:53Z","published":"2024-03-18T01:11:53Z","title":"Defense Against Adversarial Attacks on No-Reference Image Quality Models\n with Gradient Norm Regularization","summary":" The task of No-Reference Image Quality Assessment (NR-IQA) is to estimate the\nquality score of an input image without additional information. NR-IQA models\nplay a crucial role in the media industry, aiding in performance evaluation and\noptimization guidance. However, these models are found to be vulnerable to\nadversarial attacks, which introduce imperceptible perturbations to input\nimages, resulting in significant changes in predicted scores. In this paper, we\npropose a defense method to improve the stability in predicted scores when\nattacked by small perturbations, thus enhancing the adversarial robustness of\nNR-IQA models. To be specific, we present theoretical evidence showing that the\nmagnitude of score changes is related to the $\\ell_1$ norm of the model's\ngradient with respect to the input image. 
Building upon this theoretical\nfoundation, we propose a norm regularization training strategy aimed at\nreducing the $\\ell_1$ norm of the gradient, thereby boosting the robustness of\nNR-IQA models. Experiments conducted on four NR-IQA baseline models demonstrate\nthe effectiveness of our strategy in reducing score changes in the presence of\nadversarial attacks. To the best of our knowledge, this work marks the first\nattempt to defend against adversarial attacks on NR-IQA models. Our study\noffers valuable insights into the adversarial robustness of NR-IQA models and\nprovides a foundation for future research in this area.\n","authors":["Yujia Liu","Chenxi Yang","Dingquan Li","Jianhao Ding","Tingting Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.11397v1.pdf","comment":"accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11391v1","updated":"2024-03-18T00:48:58Z","published":"2024-03-18T00:48:58Z","title":"Investigating the Benefits of Projection Head for Representation\n Learning","summary":" An effective technique for obtaining high-quality representations is adding a\nprojection head on top of the encoder during training, then discarding it and\nusing the pre-projection representations. Despite its proven practical\neffectiveness, the reason behind the success of this technique is poorly\nunderstood. The pre-projection representations are not directly optimized by\nthe loss function, raising the question: what makes them better? In this work,\nwe provide a rigorous theoretical answer to this question. We start by\nexamining linear models trained with self-supervised contrastive loss. We\nreveal that the implicit bias of training algorithms leads to layer-wise\nprogressive feature weighting, where features become increasingly unequal as we\ngo deeper into the layers. Consequently, lower layers tend to have more\nnormalized and less specialized representations. We theoretically characterize\nscenarios where such representations are more beneficial, highlighting the\nintricate interplay between data augmentation and input features. Additionally,\nwe demonstrate that introducing non-linearity into the network allows lower\nlayers to learn features that are completely absent in higher layers. Finally,\nwe show how this mechanism improves the robustness in supervised contrastive\nlearning and supervised learning. We empirically validate our results through\nvarious experiments on CIFAR-10/100, UrbanCars and shifted versions of\nImageNet. We also introduce a potential alternative to projection head, which\noffers a more interpretable and controllable design.\n","authors":["Yihao Xue","Eric Gan","Jiayi Ni","Siddharth Joshi","Baharan Mirzasoleiman"],"pdf_url":"https://arxiv.org/pdf/2403.11391v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.08747v3","updated":"2024-03-18T00:45:45Z","published":"2022-09-19T03:46:13Z","title":"On Robust Cross-View Consistency in Self-Supervised Monocular Depth\n Estimation","summary":" Remarkable progress has been made in self-supervised monocular depth\nestimation (SS-MDE) by exploring cross-view consistency, e.g., photometric\nconsistency and 3D point cloud consistency. However, they are very vulnerable\nto illumination variance, occlusions, texture-less regions, as well as moving\nobjects, making them not robust enough to deal with various scenes. To address\nthis challenge, we study two kinds of robust cross-view consistency in this\npaper. 
Firstly, the spatial offset field between adjacent frames is obtained by\nreconstructing the reference frame from its neighbors via deformable alignment,\nwhich is used to align the temporal depth features via a Depth Feature\nAlignment (DFA) loss. Secondly, the 3D point clouds of each reference frame and\nits nearby frames are calculated and transformed into voxel space, where the\npoint density in each voxel is calculated and aligned via a Voxel Density\nAlignment (VDA) loss. In this way, we exploit the temporal coherence in both\ndepth feature space and 3D voxel space for SS-MDE, shifting the\n\"point-to-point\" alignment paradigm to the \"region-to-region\" one. Compared\nwith the photometric consistency loss as well as the rigid point cloud\nalignment loss, the proposed DFA and VDA losses are more robust owing to the\nstrong representation power of deep features as well as the high tolerance of\nvoxel density to the aforementioned challenges. Experimental results on several\noutdoor benchmarks show that our method outperforms current state-of-the-art\ntechniques. Extensive ablation study and analysis validate the effectiveness of\nthe proposed losses, especially in challenging scenes. The code and models are\navailable at https://github.com/sunnyHelen/RCVC-depth.\n","authors":["Haimei Zhao","Jing Zhang","Zhuo Chen","Bo Yuan","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2209.08747v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11380v1","updated":"2024-03-18T00:13:41Z","published":"2024-03-18T00:13:41Z","title":"Boosting Order-Preserving and Transferability for Neural Architecture\n Search: a Joint Architecture Refined Search and Fine-tuning Approach","summary":" Supernet is a core component in many recent Neural Architecture Search (NAS)\nmethods. It not only helps embody the search space but also provides a\n(relative) estimation of the final performance of candidate architectures.\nThus, it is critical that the top architectures ranked by a supernet should be\nconsistent with those ranked by true performance, which is known as the\norder-preserving ability. In this work, we analyze the order-preserving ability\non the whole search space (global) and a sub-space of top architectures\n(local), and empirically show that the local order-preserving for current\ntwo-stage NAS methods still need to be improved. To rectify this, we propose a\nnovel concept of Supernet Shifting, a refined search strategy combining\narchitecture searching with supernet fine-tuning. Specifically, apart from\nevaluating, the training loss is also accumulated in searching and the supernet\nis updated every iteration. Since superior architectures are sampled more\nfrequently in evolutionary searching, the supernet is encouraged to focus on\ntop architectures, thus improving local order-preserving. Besides, a\npre-trained supernet is often un-reusable for one-shot methods. We show that\nSupernet Shifting can fulfill transferring supernet to a new dataset.\nSpecifically, the last classifier layer will be unset and trained through\nevolutionary searching. 
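Referring back to the Voxel Density Alignment (VDA) loss in the cross-view consistency entry above, a rough sketch of per-voxel density alignment; the voxel size, dictionary-based counting and L1 comparison are assumptions for illustration, not the released RCVC-depth code.

```python
import torch

def voxel_densities(points, voxel_size=0.5):
    # points: (N, 3) tensor. Count how many points fall in each occupied voxel.
    idx = torch.floor(points / voxel_size).to(torch.int64)
    voxels, counts = torch.unique(idx, dim=0, return_counts=True)
    return {tuple(v.tolist()): int(c) for v, c in zip(voxels, counts)}

def voxel_density_alignment_loss(pc_ref, pc_src, voxel_size=0.5):
    # Compare per-voxel point densities of two clouds with an L1 penalty
    # over the union of occupied voxels (illustrative reading of the VDA idea).
    d_ref = voxel_densities(pc_ref, voxel_size)
    d_src = voxel_densities(pc_src, voxel_size)
    voxels = set(d_ref) | set(d_src)
    if not voxels:
        return 0.0
    return sum(abs(d_ref.get(v, 0) - d_src.get(v, 0)) for v in voxels) / len(voxels)
```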
Comprehensive experiments show that our method has\nbetter order-preserving ability and can find a dominating architecture.\nMoreover, the pre-trained supernet can be easily transferred into a new dataset\nwith no loss of performance.\n","authors":["Beichen Zhang","Xiaoxing Wang","Xiaohan Qin","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2403.11380v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.11376v1","updated":"2024-03-18T00:03:48Z","published":"2024-03-18T00:03:48Z","title":"ShapeFormer: Shape Prior Visible-to-Amodal Transformer-based Amodal\n Instance Segmentation","summary":" Amodal Instance Segmentation (AIS) presents a challenging task as it involves\npredicting both visible and occluded parts of objects within images. Existing\nAIS methods rely on a bidirectional approach, encompassing both the transition\nfrom amodal features to visible features (amodal-to-visible) and from visible\nfeatures to amodal features (visible-to-amodal). Our observation shows that the\nutilization of amodal features through the amodal-to-visible can confuse the\nvisible features due to the extra information of occluded/hidden segments not\npresented in visible display. Consequently, this compromised quality of visible\nfeatures during the subsequent visible-to-amodal transition. To tackle this\nissue, we introduce ShapeFormer, a decoupled Transformer-based model with a\nvisible-to-amodal transition. It facilitates the explicit relationship between\noutput segmentations and avoids the need for amodal-to-visible transitions.\nShapeFormer comprises three key modules: (i) Visible-Occluding Mask Head for\npredicting visible segmentation with occlusion awareness, (ii) Shape-Prior\nAmodal Mask Head for predicting amodal and occluded masks, and (iii)\nCategory-Specific Shape Prior Retriever aims to provide shape prior knowledge.\nComprehensive experiments and extensive ablation studies across various AIS\nbenchmarks demonstrate the effectiveness of our ShapeFormer. The code is\navailable at: https://github.com/UARK-AICV/ShapeFormer\n","authors":["Minh Tran","Winston Bounsavy","Khoa Vo","Anh Nguyen","Tri Nguyen","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2403.11376v1.pdf","comment":"Accepted to IJCNN 2024"},{"id":"http://arxiv.org/abs/2403.11375v1","updated":"2024-03-18T00:02:48Z","published":"2024-03-18T00:02:48Z","title":"Path-GPTOmic: A Balanced Multi-modal Learning Framework for Survival\n Outcome Prediction","summary":" For predicting cancer survival outcomes, standard approaches in clinical\nresearch are often based on two main modalities: pathology images for observing\ncell morphology features, and genomic (e.g., bulk RNA-seq) for quantifying gene\nexpressions. However, existing pathology-genomic multi-modal algorithms face\nsignificant challenges: (1) Valuable biological insights regarding genes and\ngene-gene interactions are frequently overlooked; (2) one modality often\ndominates the optimization process, causing inadequate training for the other\nmodality. In this paper, we introduce a new multi-modal ``Path-GPTOmic\"\nframework for cancer survival outcome prediction. First, to extract valuable\nbiological insights, we regulate the embedding space of a foundation model,\nscGPT, initially trained on single-cell RNA-seq data, making it adaptable for\nbulk RNA-seq data. Second, to address the imbalance-between-modalities problem,\nwe propose a gradient modulation mechanism tailored to the Cox partial\nlikelihood loss for survival prediction. 
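For the Cox partial likelihood term named at the end of the Path-GPTOmic sentence above, the standard negative partial log-likelihood can be sketched as follows; tensor shapes and censoring handling follow the textbook form, not the paper's gradient-modulated variant.

```python
import torch

def cox_partial_likelihood_loss(risk, time, event):
    """Negative Cox partial log-likelihood (textbook form).

    risk:  (N,) predicted log-risk scores
    time:  (N,) follow-up times
    event: (N,) 1.0 if the event was observed, 0.0 if censored
    """
    order = torch.argsort(time, descending=True)        # risk set becomes a prefix
    risk, event = risk[order], event[order].float()
    log_risk_set = torch.logcumsumexp(risk, dim=0)      # log sum_{j: t_j >= t_i} exp(risk_j)
    return -((risk - log_risk_set) * event).sum() / event.sum().clamp(min=1.0)
```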
The contributions of the modalities\nare dynamically monitored and adjusted during the training process, encouraging\nthat both modalities are sufficiently trained. Evaluated on two TCGA(The Cancer\nGenome Atlas) datasets, our model achieves substantially improved survival\nprediction accuracy.\n","authors":["Hongxiao Wang","Yang Yang","Zhuo Zhao","Pengfei Gu","Nishchal Sapkota","Danny Z. Chen"],"pdf_url":"https://arxiv.org/pdf/2403.11375v1.pdf","comment":"Accepted by IEEE International Symposium on Biomedical Imaging (ISBI\n 2024)"},{"id":"http://arxiv.org/abs/2403.10054v2","updated":"2024-03-18T17:42:45Z","published":"2024-03-15T06:50:19Z","title":"Control and Automation for Industrial Production Storage Zone:\n Generation of Optimal Route Using Image Processing","summary":" Digital image processing (DIP) is of great importance in validating and\nguaranteeing parameters that ensure the quality of mass-produced products.\nTherefore, this article focused on developing an industrial automation method\nfor a zone of a production line model using the DIP. The neo-cascade\nmethodology employed allowed for defining each of the stages in an adequate\nway, ensuring the inclusion of the relevant methods for its development, which\nfinally incurred in the modeling, design, implementation, and testing of an\noptimal route generation system for a warehouse area, using DIP with\noptimization guidelines, in conjunction with an embedded platform and the\nconnection to programmable logic controllers (PLCs) for its execution. The\nsystem was based on the OpenCV library; tool focused on artificial vision,\nwhich was implemented on an object-oriented programming (OOP) platform based on\nJava language. It generated the optimal route for the automation of processes\nin a scale warehouse area, using the segmentation of objects and the\noptimization of flow in networks as pillars, ending with the connection to PLCs\nas a method of action, which in case of implementation would eliminate\nconstraints such as process inefficiency, the use of manpower to perform these\ntasks, inadequate use of resources, among others\n","authors":["Bejamin A. Huerfano","Fernando Jimenez"],"pdf_url":"https://arxiv.org/pdf/2403.10054v2.pdf","comment":"17 figures, 17 tables, from a thesis (2017)"},{"id":"http://arxiv.org/abs/2403.05050v3","updated":"2024-03-18T17:39:34Z","published":"2024-03-08T04:53:53Z","title":"DyRoNet: Dynamic Routing and Low-Rank Adapters for Autonomous Driving\n Streaming Perception","summary":" The advancement of autonomous driving systems hinges on the ability to\nachieve low-latency and high-accuracy perception. To address this critical\nneed, this paper introduces Dynamic Routering Network (DyRoNet), a low-rank\nenhanced dynamic routing framework designed for streaming perception in\nautonomous driving systems. DyRoNet integrates a suite of pre-trained branch\nnetworks, each meticulously fine-tuned to function under distinct environmental\nconditions. At its core, the framework offers a speed router module, developed\nto assess and route input data to the most suitable branch for processing. This\napproach not only addresses the inherent limitations of conventional models in\nadapting to diverse driving conditions but also ensures the balance between\nperformance and efficiency. Extensive experimental evaluations demonstrating\nthe adaptability of DyRoNet to diverse branch selection strategies, resulting\nin significant performance enhancements across different scenarios. 
This work\nnot only establishes a new benchmark for streaming perception but also provides\nvaluable engineering insights for future work.\n","authors":["Xiang Huang","Zhi-Qi Cheng","Jun-Yan He","Chenyang Li","Wangmeng Xiang","Baigui Sun","Xiao Wu"],"pdf_url":"https://arxiv.org/pdf/2403.05050v3.pdf","comment":"Project: https://tastevision.github.io/DyRoNet/"},{"id":"http://arxiv.org/abs/2403.12327v1","updated":"2024-03-18T23:45:18Z","published":"2024-03-18T23:45:18Z","title":"GT-Rain Single Image Deraining Challenge Report","summary":" This report reviews the results of the GT-Rain challenge on single image\nderaining at the UG2+ workshop at CVPR 2023. The aim of this competition is to\nstudy the rainy weather phenomenon in real world scenarios, provide a novel\nreal world rainy image dataset, and to spark innovative ideas that will further\nthe development of single image deraining methods on real images. Submissions\nwere trained on the GT-Rain dataset and evaluated on an extension of the\ndataset consisting of 15 additional scenes. Scenes in GT-Rain are comprised of\nreal rainy image and ground truth image captured moments after the rain had\nstopped. 275 participants were registered in the challenge and 55 competed in\nthe final testing phase.\n","authors":["Howard Zhang","Yunhao Ba","Ethan Yang","Rishi Upadhyay","Alex Wong","Achuta Kadambi","Yun Guo","Xueyao Xiao","Xiaoxiong Wang","Yi Li","Yi Chang","Luxin Yan","Chaochao Zheng","Luping Wang","Bin Liu","Sunder Ali Khowaja","Jiseok Yoon","Ik-Hyun Lee","Zhao Zhang","Yanyan Wei","Jiahuan Ren","Suiyi Zhao","Huan Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.12327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12326v1","updated":"2024-03-18T23:42:04Z","published":"2024-03-18T23:42:04Z","title":"Removing Undesirable Concepts in Text-to-Image Generative Models with\n Learnable Prompts","summary":" Generative models have demonstrated remarkable potential in generating\nvisually impressive content from textual descriptions. However, training these\nmodels on unfiltered internet data poses the risk of learning and subsequently\npropagating undesirable concepts, such as copyrighted or unethical content. In\nthis paper, we propose a novel method to remove undesirable concepts from\ntext-to-image generative models by incorporating a learnable prompt into the\ncross-attention module. This learnable prompt acts as additional memory to\ntransfer the knowledge of undesirable concepts into it and reduce the\ndependency of these concepts on the model parameters and corresponding textual\ninputs. Because of this knowledge transfer into the prompt, erasing these\nundesirable concepts is more stable and has minimal negative impact on other\nconcepts. We demonstrate the effectiveness of our method on the Stable\nDiffusion model, showcasing its superiority over state-of-the-art erasure\nmethods in terms of removing undesirable content while preserving other\nunrelated elements.\n","authors":["Anh Bui","Khanh Doan","Trung Le","Paul Montague","Tamas Abraham","Dinh Phung"],"pdf_url":"https://arxiv.org/pdf/2403.12326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00935v2","updated":"2024-03-18T23:41:41Z","published":"2024-01-01T19:00:55Z","title":"Boundary Attention: Learning to Localize Boundaries under High Noise","summary":" We present a differentiable model that infers explicit boundaries, including\ncurves, corners and junctions, using a mechanism that we call boundary\nattention. 
Boundary attention is a boundary-aware local attention operation\nthat, when applied densely and repeatedly, progressively refines a field of\nvariables that specify an unrasterized description of the local boundary\nstructure in every overlapping patch within an image. It operates in a\nbottom-up fashion, similar to classical methods for sub-pixel edge localization\nand edge-linking, but with a higher-dimensional description of local boundary\nstructure, a notion of spatial consistency that is learned instead of designed,\nand a sequence of operations that is end-to-end differentiable. We train our\nmodel using simple synthetic data and then evaluate it using photographs that\nwere captured under low-light conditions with variable amounts of noise. We\nfind that our method generalizes to natural images corrupted by real sensor\nnoise, and predicts consistent boundaries under increasingly noisy conditions\nwhere other state-of-the-art methods fail.\n","authors":["Mia Gaia Polansky","Charles Herrmann","Junhwa Hur","Deqing Sun","Dor Verbin","Todd Zickler"],"pdf_url":"https://arxiv.org/pdf/2401.00935v2.pdf","comment":"Project website at boundaryattention.github.io:\n http://boundaryattention.github.io"},{"id":"http://arxiv.org/abs/2403.12317v1","updated":"2024-03-18T23:22:37Z","published":"2024-03-18T23:22:37Z","title":"EffiPerception: an Efficient Framework for Various Perception Tasks","summary":" The accuracy-speed-memory trade-off is always the priority to consider for\nseveral computer vision perception tasks.\n Previous methods mainly focus on a single or small couple of these tasks,\nsuch as creating effective data augmentation, feature extractor, learning\nstrategies, etc. These approaches, however, could be inherently task-specific:\ntheir proposed model's performance may depend on a specific perception task or\na dataset.\n Targeting to explore common learning patterns and increasing the module\nrobustness, we propose the EffiPerception framework.\n It could achieve great accuracy-speed performance with relatively low memory\ncost under several perception tasks: 2D Object Detection, 3D Object Detection,\n2D Instance Segmentation, and 3D Point Cloud Segmentation.\n Overall, the framework consists of three parts:\n (1) Efficient Feature Extractors, which extract the input features for each\nmodality. (2) Efficient Layers, plug-in plug-out layers that further process\nthe feature representation, aggregating core learned information while pruning\nnoisy proposals. (3) The EffiOptim, an 8-bit optimizer to further cut down the\ncomputational cost and facilitate performance stability.\n Extensive experiments on the KITTI, semantic-KITTI, and COCO datasets\nrevealed that EffiPerception could show great accuracy-speed-memory overall\nperformance increase within the four detection and segmentation tasks, in\ncomparison to earlier, well-respected methods.\n","authors":["Xinhao Xiang","Simon Dräger","Jiawei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.12317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12310v1","updated":"2024-03-18T23:18:40Z","published":"2024-03-18T23:18:40Z","title":"Prototipo de un Contador Bidireccional Automático de Personas basado\n en sensores de visión 3D","summary":" 3D sensors, also known as RGB-D sensors, utilize depth images where each\npixel measures the distance from the camera to objects, using principles like\nstructured light or time-of-flight. 
Advances in artificial vision have led to\naffordable 3D cameras capable of real-time object detection without object\nmovement, surpassing 2D cameras in information depth. These cameras can\nidentify objects of varying colors and reflectivities and are less affected by\nlighting changes. The described prototype uses RGB-D sensors for bidirectional\npeople counting in venues, aiding security and surveillance in spaces like\nstadiums or airports. It determines real-time occupancy and checks against\nmaximum capacity, crucial during emergencies. The system includes a RealSense\nD415 depth camera and a mini-computer running object detection algorithms to\ncount people and a 2D camera for identity verification. The system supports\nstatistical analysis and uses C++, Python, and PHP with OpenCV for image\nprocessing, demonstrating a comprehensive approach to monitoring venue\noccupancy.\n","authors":["Benjamín Ojeda-Magaña","Rubén Ruelas","José Guadalupe Robledo-Hernández","Víctor Manuel Rangel-Cobián","Fernando López Aguilar-Hernández"],"pdf_url":"https://arxiv.org/pdf/2403.12310v1.pdf","comment":"8 pages, in Spanish language, 8 figures"},{"id":"http://arxiv.org/abs/2403.10357v2","updated":"2024-03-18T23:00:57Z","published":"2024-03-15T14:45:38Z","title":"ANIM: Accurate Neural Implicit Model for Human Reconstruction from a\n single RGB-D image","summary":" Recent progress in human shape learning, shows that neural implicit models\nare effective in generating 3D human surfaces from limited number of views, and\neven from a single RGB image. However, existing monocular approaches still\nstruggle to recover fine geometric details such as face, hands or cloth\nwrinkles. They are also easily prone to depth ambiguities that result in\ndistorted geometries along the camera optical axis. In this paper, we explore\nthe benefits of incorporating depth observations in the reconstruction process\nby introducing ANIM, a novel method that reconstructs arbitrary 3D human shapes\nfrom single-view RGB-D images with an unprecedented level of accuracy. Our\nmodel learns geometric details from both multi-resolution pixel-aligned and\nvoxel-aligned features to leverage depth information and enable spatial\nrelationships, mitigating depth ambiguities. We further enhance the quality of\nthe reconstructed shape by introducing a depth-supervision strategy, which\nimproves the accuracy of the signed distance field estimation of points that\nlie on the reconstructed surface. Experiments demonstrate that ANIM outperforms\nstate-of-the-art works that use RGB, surface normals, point cloud or RGB-D data\nas input. 
In addition, we introduce ANIM-Real, a new multi-modal dataset\ncomprising high-quality scans paired with consumer-grade RGB-D camera, and our\nprotocol to fine-tune ANIM, enabling high-quality reconstruction from\nreal-world human capture.\n","authors":["Marco Pesavento","Yuanlu Xu","Nikolaos Sarafianos","Robert Maier","Ziyan Wang","Chun-Han Yao","Marco Volino","Edmond Boyer","Adrian Hilton","Tony Tung"],"pdf_url":"https://arxiv.org/pdf/2403.10357v2.pdf","comment":"Accepted to CVPR24; Project page:\n https://marcopesavento.github.io/ANIM/"},{"id":"http://arxiv.org/abs/2403.12301v1","updated":"2024-03-18T22:46:14Z","published":"2024-03-18T22:46:14Z","title":"R3DS: Reality-linked 3D Scenes for Panoramic Scene Understanding","summary":" We introduce the Reality-linked 3D Scenes (R3DS) dataset of synthetic 3D\nscenes mirroring the real-world scene arrangements from Matterport3D panoramas.\nCompared to prior work, R3DS has more complete and densely populated scenes\nwith objects linked to real-world observations in panoramas. R3DS also provides\nan object support hierarchy, and matching object sets (e.g., same chairs around\na dining table) for each scene. Overall, R3DS contains 19K objects represented\nby 3,784 distinct CAD models from over 100 object categories. We demonstrate\nthe effectiveness of R3DS on the Panoramic Scene Understanding task. We find\nthat: 1) training on R3DS enables better generalization; 2) support relation\nprediction trained with R3DS improves performance compared to heuristically\ncalculated support; and 3) R3DS offers a challenging benchmark for future work\non panoramic scene understanding.\n","authors":["Qirui Wu","Sonia Raychaudhuri","Daniel Ritchie","Manolis Savva","Angel X Chang"],"pdf_url":"https://arxiv.org/pdf/2403.12301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12290v1","updated":"2024-03-18T22:26:19Z","published":"2024-03-18T22:26:19Z","title":"Estimation and Analysis of Slice Propagation Uncertainty in 3D Anatomy\n Segmentation","summary":" Supervised methods for 3D anatomy segmentation demonstrate superior\nperformance but are often limited by the availability of annotated data. This\nlimitation has led to a growing interest in self-supervised approaches in\ntandem with the abundance of available un-annotated data. Slice propagation has\nemerged as an self-supervised approach that leverages slice registration as a\nself-supervised task to achieve full anatomy segmentation with minimal\nsupervision. This approach significantly reduces the need for domain expertise,\ntime, and the cost associated with building fully annotated datasets required\nfor training segmentation networks. However, this shift toward reduced\nsupervision via deterministic networks raises concerns about the\ntrustworthiness and reliability of predictions, especially when compared with\nmore accurate supervised approaches. To address this concern, we propose the\nintegration of calibrated uncertainty quantification (UQ) into slice\npropagation methods, providing insights into the model's predictive reliability\nand confidence levels. Incorporating uncertainty measures enhances user\nconfidence in self-supervised approaches, thereby improving their practical\napplicability. We conducted experiments on three datasets for 3D abdominal\nsegmentation using five UQ methods. 
The results illustrate that incorporating\nUQ improves not only model trustworthiness, but also segmentation accuracy.\nFurthermore, our analysis reveals various failure modes of slice propagation\nmethods that might not be immediately apparent to end-users. This study opens\nup new research avenues to improve the accuracy and trustworthiness of slice\npropagation methods.\n","authors":["Rachaell Nihalaani","Tushar Kataria","Jadie Adams","Shireen Y. Elhabian"],"pdf_url":"https://arxiv.org/pdf/2403.12290v1.pdf","comment":"13 pages including Supplementary, 4 figures"},{"id":"http://arxiv.org/abs/2307.03132v2","updated":"2024-03-18T22:18:02Z","published":"2023-07-06T16:59:52Z","title":"T-MARS: Improving Visual Representations by Circumventing Text Feature\n Learning","summary":" Large web-sourced multimodal datasets have powered a slew of new methods for\nlearning general-purpose visual representations, advancing the state of the art\nin computer vision and revolutionizing zero- and few-shot recognition. One\ncrucial decision facing practitioners is how, if at all, to curate these\never-larger datasets. For example, the creators of the LAION-5B dataset chose\nto retain only image-caption pairs whose CLIP similarity score exceeded a\ndesignated threshold. In this paper, we propose a new state-of-the-art data\nfiltering approach motivated by our observation that nearly 40% of LAION's\nimages contain text that overlaps significantly with the caption. Intuitively,\nsuch data could be wasteful as it incentivizes models to perform optical\ncharacter recognition rather than learning visual features. However, naively\nremoving all such data could also be wasteful, as it throws away images that\ncontain visual features (in addition to overlapping text). Our simple and\nscalable approach, T-MARS (Text Masking and Re-Scoring), filters out only those\npairs where the text dominates the remaining visual features -- by first\nmasking out the text and then filtering out those with a low CLIP similarity\nscore of the masked image. Experimentally, T-MARS outperforms the top-ranked\nmethod on the \"medium scale\" of DataComp (a data filtering benchmark) by a\nmargin of 6.5% on ImageNet and 4.7% on VTAB. Additionally, our systematic\nevaluation on various data pool sizes from 2M to 64M shows that the accuracy\ngains enjoyed by T-MARS linearly increase as data and compute are scaled\nexponentially. Code is available at https://github.com/locuslab/T-MARS.\n","authors":["Pratyush Maini","Sachin Goyal","Zachary C. Lipton","J. Zico Kolter","Aditi Raghunathan"],"pdf_url":"https://arxiv.org/pdf/2307.03132v2.pdf","comment":"Accepted to ICLR 2024. Oral at ICCV Datacomp 2023"},{"id":"http://arxiv.org/abs/2311.16974v2","updated":"2024-03-18T21:43:20Z","published":"2023-11-28T17:22:17Z","title":"COLE: A Hierarchical Generation Framework for Multi-Layered and Editable\n Graphic Design","summary":" Graphic design, which has been evolving since the 15th century, plays a\ncrucial role in advertising. The creation of high-quality designs demands\ndesign-oriented planning, reasoning, and layer-wise generation. Unlike the\nrecent CanvaGPT, which integrates GPT-4 with existing design templates to build\na custom GPT, this paper introduces the COLE system - a hierarchical generation\nframework designed to comprehensively address these challenges. 
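Looking back at the T-MARS entry above, the mask-then-re-score filtering step can be illustrated as below; `detect_text_boxes` and `clip_similarity` are assumed callables supplied by the caller (e.g. an OCR detector and a CLIP model), and the threshold is arbitrary, so this shows only the control flow rather than the released T-MARS code.

```python
from typing import Callable, List, Tuple
from PIL import Image, ImageDraw

Box = Tuple[int, int, int, int]  # (left, top, right, bottom)

def t_mars_style_filter(
    pairs: List[Tuple[Image.Image, str]],
    detect_text_boxes: Callable[[Image.Image], List[Box]],
    clip_similarity: Callable[[Image.Image, str], float],
    threshold: float = 0.28,
) -> List[Tuple[Image.Image, str]]:
    """Keep image-caption pairs whose text-masked image still matches the caption."""
    kept = []
    for image, caption in pairs:
        masked = image.convert("RGB")
        draw = ImageDraw.Draw(masked)
        for box in detect_text_boxes(image):     # cover detected text with gray patches
            draw.rectangle(box, fill=(128, 128, 128))
        if clip_similarity(masked, caption) >= threshold:   # re-score after masking
            kept.append((image, caption))
    return kept
```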
This COLE\nsystem can transform a vague intention prompt into a high-quality multi-layered\ngraphic design, while also supporting flexible editing based on user input.\nExamples of such input might include directives like ``design a poster for\nHisaishi's concert.'' The key insight is to dissect the complex task of\ntext-to-design generation into a hierarchy of simpler sub-tasks, each addressed\nby specialized models working collaboratively. The results from these models\nare then consolidated to produce a cohesive final output. Our hierarchical task\ndecomposition can streamline the complex process and significantly enhance\ngeneration reliability. Our COLE system comprises multiple fine-tuned Large\nLanguage Models (LLMs), Large Multimodal Models (LMMs), and Diffusion Models\n(DMs), each specifically tailored for design-aware layer-wise captioning,\nlayout planning, reasoning, and the task of generating images and text.\nFurthermore, we construct the DESIGNINTENTION benchmark to demonstrate the\nsuperiority of our COLE system over existing methods in generating high-quality\ngraphic designs from user intent. Last, we present a Canva-like multi-layered\nimage editing tool to support flexible editing of the generated multi-layered\ngraphic design images. We perceive our COLE system as an important step towards\naddressing more complex and multi-layered graphic design generation tasks in\nthe future.\n","authors":["Peidong Jia","Chenxuan Li","Yuhui Yuan","Zeyu Liu","Yichao Shen","Bohan Chen","Xingru Chen","Yinglin Zheng","Dong Chen","Ji Li","Xiaodong Xie","Shanghang Zhang","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2311.16974v2.pdf","comment":"Technical report. Project page:\n https://graphic-design-generation-github-io.vercel.app/"},{"id":"http://arxiv.org/abs/2403.12267v1","updated":"2024-03-18T21:32:58Z","published":"2024-03-18T21:32:58Z","title":"Data-Efficient Contrastive Language-Image Pretraining: Prioritizing Data\n Quality over Quantity","summary":" Contrastive Language-Image Pre-training (CLIP) on large-scale image-caption\ndatasets learns representations that can achieve remarkable zero-shot\ngeneralization. However, such models require a massive amount of pre-training\ndata. Improving the quality of the pre-training data has been shown to be much\nmore effective in improving CLIP's performance than increasing its volume.\nNevertheless, finding small subsets of training data that provably generalize\nthe best has remained an open question. In this work, we propose the first\ntheoretically rigorous data selection method for CLIP. We show that subsets\nthat closely preserve the cross-covariance of the images and captions of the\nfull data provably achieve a superior generalization performance. Our extensive\nexperiments on ConceptualCaptions3M and ConceptualCaptions12M demonstrate that\nsubsets found by \\method\\ achieve over 2.7x and 1.4x the accuracy of the next\nbest baseline on ImageNet and its shifted versions. Moreover, we show that our\nsubsets obtain 1.5x the average accuracy across 11 downstream datasets, of the\nnext best baseline. 
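One toy reading of the cross-covariance-preserving selection criterion in the data-efficient CLIP entry above; the greedy loop, Frobenius objective and embedding shapes are guesses for illustration, not the paper's algorithm (the authors' code is linked in the line that follows).

```python
import numpy as np

def greedy_crosscov_subset(img_emb, txt_emb, k):
    """Pick k paired examples whose empirical cross-covariance stays close
    to that of the full data (Frobenius distance, naive O(k*N) greedy loop).

    img_emb, txt_emb: (N, d) arrays of paired, already-normalized embeddings.
    """
    n = img_emb.shape[0]
    k = min(k, n)
    target = img_emb.T @ txt_emb / n          # full-data cross-covariance, (d, d)
    chosen, current = [], np.zeros_like(target)
    for _ in range(k):
        best_i, best_err = -1, np.inf
        for i in range(n):
            if i in chosen:
                continue
            cand = (current * len(chosen) + np.outer(img_emb[i], txt_emb[i])) / (len(chosen) + 1)
            err = np.linalg.norm(cand - target)
            if err < best_err:
                best_i, best_err = i, err
        chosen.append(best_i)
        current = (current * (len(chosen) - 1) + np.outer(img_emb[best_i], txt_emb[best_i])) / len(chosen)
    return chosen
```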
The code is available at:\nhttps://github.com/BigML-CS-UCLA/clipcov-data-efficient-clip.\n","authors":["Siddharth Joshi","Arnav Jain","Ali Payani","Baharan Mirzasoleiman"],"pdf_url":"https://arxiv.org/pdf/2403.12267v1.pdf","comment":"AISTATS 2024, Code:\n https://github.com/BigML-CS-UCLA/clipcov-data-efficient-clip"},{"id":"http://arxiv.org/abs/2310.01755v2","updated":"2024-03-18T21:31:29Z","published":"2023-10-03T02:37:57Z","title":"ImageNet-OOD: Deciphering Modern Out-of-Distribution Detection\n Algorithms","summary":" The task of out-of-distribution (OOD) detection is notoriously ill-defined.\nEarlier works focused on new-class detection, aiming to identify label-altering\ndata distribution shifts, also known as \"semantic shift.\" However, recent works\nargue for a focus on failure detection, expanding the OOD evaluation framework\nto account for label-preserving data distribution shifts, also known as\n\"covariate shift.\" Intriguingly, under this new framework, complex OOD\ndetectors that were previously considered state-of-the-art now perform\nsimilarly to, or even worse than the simple maximum softmax probability\nbaseline. This raises the question: what are the latest OOD detectors actually\ndetecting? Deciphering the behavior of OOD detection algorithms requires\nevaluation datasets that decouples semantic shift and covariate shift. To aid\nour investigations, we present ImageNet-OOD, a clean semantic shift dataset\nthat minimizes the interference of covariate shift. Through comprehensive\nexperiments, we show that OOD detectors are more sensitive to covariate shift\nthan to semantic shift, and the benefits of recent OOD detection algorithms on\nsemantic shift detection is minimal. Our dataset and analyses provide important\ninsights for guiding the design of future OOD detectors.\n","authors":["William Yang","Byron Zhang","Olga Russakovsky"],"pdf_url":"https://arxiv.org/pdf/2310.01755v2.pdf","comment":"ICLR 2024. Code and dataset at\n https://github.com/princetonvisualai/imagenetood"},{"id":"http://arxiv.org/abs/2401.08876v5","updated":"2024-03-18T20:43:07Z","published":"2024-01-16T23:19:30Z","title":"Evaluating the Utility of Conformal Prediction Sets for AI-Advised Image\n Labeling","summary":" As deep neural networks are more commonly deployed in high-stakes domains,\ntheir black-box nature makes uncertainty quantification challenging. We\ninvestigate the effects of presenting conformal prediction sets--a\ndistribution-free class of methods for generating prediction sets with\nspecified coverage--to express uncertainty in AI-advised decision-making.\nThrough a large online experiment, we compare the utility of conformal\nprediction sets to displays of Top-1 and Top-k predictions for AI-advised image\nlabeling. In a pre-registered analysis, we find that the utility of prediction\nsets for accuracy varies with the difficulty of the task: while they result in\naccuracy on par with or less than Top-1 and Top-k displays for easy images,\nprediction sets excel at assisting humans in labeling out-of-distribution (OOD)\nimages, especially when the set size is small. Our results empirically pinpoint\npractical challenges of conformal prediction sets and provide implications on\nhow to incorporate them for real-world decision-making.\n","authors":["Dongping Zhang","Angelos Chatzimparmpas","Negar Kamali","Jessica Hullman"],"pdf_url":"https://arxiv.org/pdf/2401.08876v5.pdf","comment":"19 pages, 11 figures, 10 tables. 
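For the conformal prediction sets studied in the AI-advised image labeling entry above, the textbook split-conformal recipe for a classifier looks roughly like this; the nonconformity score and coverage correction are the standard choices, not necessarily the exact configuration used in that study.

```python
import numpy as np

def split_conformal_sets(cal_probs, cal_labels, test_probs, alpha=0.1):
    """Split-conformal prediction sets for a classifier.

    cal_probs:  (n, K) softmax outputs on a held-out calibration split
    cal_labels: (n,)   true labels for that split
    test_probs: (m, K) softmax outputs on test images
    Returns one array of candidate class indices per test image,
    targeting roughly (1 - alpha) coverage.
    """
    n = len(cal_labels)
    # Nonconformity score: one minus the probability given to the true class.
    scores = 1.0 - cal_probs[np.arange(n), cal_labels]
    # Finite-sample-corrected ("higher") quantile of the calibration scores.
    rank = min(int(np.ceil((n + 1) * (1 - alpha))), n) - 1
    qhat = np.sort(scores)[rank]
    return [np.where(1.0 - p <= qhat)[0] for p in test_probs]
```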
Accepted by ACM CHI 2024"},{"id":"http://arxiv.org/abs/2403.12236v1","updated":"2024-03-18T20:33:44Z","published":"2024-03-18T20:33:44Z","title":"Improving Generalization via Meta-Learning on Hard Samples","summary":" Learned reweighting (LRW) approaches to supervised learning use an\noptimization criterion to assign weights for training instances, in order to\nmaximize performance on a representative validation dataset. We pose and\nformalize the problem of optimized selection of the validation set used in LRW\ntraining, to improve classifier generalization. In particular, we show that\nusing hard-to-classify instances in the validation set has both a theoretical\nconnection to, and strong empirical evidence of generalization. We provide an\nefficient algorithm for training this meta-optimized model, as well as a simple\ntrain-twice heuristic for careful comparative study. We demonstrate that LRW\nwith easy validation data performs consistently worse than LRW with hard\nvalidation data, establishing the validity of our meta-optimization problem.\nOur proposed algorithm outperforms a wide range of baselines on a range of\ndatasets and domain shift challenges (Imagenet-1K, CIFAR-100, Clothing-1M,\nCAMELYON, WILDS, etc.), with ~1% gains using VIT-B on Imagenet. We also show\nthat using naturally hard examples for validation (Imagenet-R / Imagenet-A) in\nLRW training for Imagenet improves performance on both clean and naturally hard\ntest instances by 1-2%. Secondary analyses show that using hard validation data\nin an LRW framework improves margins on test data, hinting at the mechanism\nunderlying our empirical gains. We believe this work opens up new research\ndirections for the meta-optimization of meta-learning in a supervised learning\ncontext.\n","authors":["Nishant Jain","Arun S. Suggala","Pradeep Shenoy"],"pdf_url":"https://arxiv.org/pdf/2403.12236v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12229v1","updated":"2024-03-18T20:20:13Z","published":"2024-03-18T20:20:13Z","title":"Fusion Transformer with Object Mask Guidance for Image Forgery Analysis","summary":" In this work, we introduce OMG-Fuser, a fusion transformer-based network\ndesigned to extract information from various forensic signals to enable robust\nimage forgery detection and localization. Our approach can operate with an\narbitrary number of forensic signals and leverages object information for their\nanalysis -- unlike previous methods that rely on fusion schemes with few\nsignals and often disregard image semantics. To this end, we design a forensic\nsignal stream composed of a transformer guided by an object attention\nmechanism, associating patches that depict the same objects. In that way, we\nincorporate object-level information from the image. Each forensic signal is\nprocessed by a different stream that adapts to its peculiarities. Subsequently,\na token fusion transformer efficiently aggregates the outputs of an arbitrary\nnumber of network streams and generates a fused representation for each image\npatch. These representations are finally processed by a long-range dependencies\ntransformer that captures the intrinsic relations between the image patches. We\nassess two fusion variants on top of the proposed approach: (i) score-level\nfusion that fuses the outputs of multiple image forensics algorithms and (ii)\nfeature-level fusion that fuses low-level forensic traces directly. 
Both\nvariants exceed state-of-the-art performance on seven datasets for image\nforgery detection and localization, with a relative average improvement of\n12.1% and 20.4% in terms of F1. Our network demonstrates robustness against\ntraditional and novel forgery attacks and can be expanded with new signals\nwithout training from scratch.\n","authors":["Dimitrios Karageorgiou","Giorgos Kordopatis-Zilos","Symeon Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2403.12229v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12226v1","updated":"2024-03-18T20:18:32Z","published":"2024-03-18T20:18:32Z","title":"Large-scale flood modeling and forecasting with FloodCast","summary":" Large-scale hydrodynamic models generally rely on fixed-resolution spatial\ngrids and model parameters as well as incurring a high computational cost. This\nlimits their ability to accurately forecast flood crests and issue\ntime-critical hazard warnings. In this work, we build a fast, stable, accurate,\nresolution-invariant, and geometry-adaptative flood modeling and forecasting\nframework that can perform at large scales, namely FloodCast. The framework\ncomprises two main modules: multi-satellite observation and hydrodynamic\nmodeling. In the multi-satellite observation module, a real-time unsupervised\nchange detection method and a rainfall processing and analysis tool are\nproposed to harness the full potential of multi-satellite observations in\nlarge-scale flood prediction. In the hydrodynamic modeling module, a\ngeometry-adaptive physics-informed neural solver (GeoPINS) is introduced,\nbenefiting from the absence of a requirement for training data in\nphysics-informed neural networks and featuring a fast, accurate, and\nresolution-invariant architecture with Fourier neural operators. GeoPINS\ndemonstrates impressive performance on popular PDEs across regular and\nirregular domains. Building upon GeoPINS, we propose a sequence-to-sequence\nGeoPINS model to handle long-term temporal series and extensive spatial domains\nin large-scale flood modeling. Next, we establish a benchmark dataset in the\n2022 Pakistan flood to assess various flood prediction methods. Finally, we\nvalidate the model in three dimensions - flood inundation range, depth, and\ntransferability of spatiotemporal downscaling. Traditional hydrodynamics and\nsequence-to-sequence GeoPINS exhibit exceptional agreement during high water\nlevels, while comparative assessments with SAR-based flood depth data show that\nsequence-to-sequence GeoPINS outperforms traditional hydrodynamics, with\nsmaller prediction errors.\n","authors":["Qingsong Xu","Yilei Shi","Jonathan Bamber","Chaojun Ouyang","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.12226v1.pdf","comment":"40 pages, 16 figures, under review"},{"id":"http://arxiv.org/abs/2403.12211v1","updated":"2024-03-18T19:51:55Z","published":"2024-03-18T19:51:55Z","title":"A Unified Model for Longitudinal Multi-Modal Multi-View Prediction with\n Missingness","summary":" Medical records often consist of different modalities, such as images, text,\nand tabular information. Integrating all modalities offers a holistic view of a\npatient's condition, while analyzing them longitudinally provides a better\nunderstanding of disease progression. However, real-world longitudinal medical\nrecords present challenges: 1) patients may lack some or all of the data for a\nspecific timepoint, and 2) certain modalities or views might be absent for all\npatients during a particular period. 
In this work, we introduce a unified model\nfor longitudinal multi-modal multi-view (MMMV) prediction with missingness. Our\nmethod allows as many timepoints as desired for input, and aims to leverage all\navailable data, regardless of their availability. We conduct extensive\nexperiments on the knee osteoarthritis dataset from the Osteoarthritis\nInitiative (OAI) for pain and Kellgren-Lawrence grade (KLG) prediction at a\nfuture timepoint. We demonstrate the effectiveness of our method by comparing\nresults from our unified model to specific models that use the same modality\nand view combinations during training and evaluation. We also show the benefit\nof having extended temporal data and provide post-hoc analysis for a deeper\nunderstanding of each modality/view's importance for different tasks.\n","authors":["Boqi Chen","Junier Oliva","Marc Niethammer"],"pdf_url":"https://arxiv.org/pdf/2403.12211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12207v1","updated":"2024-03-18T19:44:30Z","published":"2024-03-18T19:44:30Z","title":"Synthetic Image Generation in Cyber Influence Operations: An Emergent\n Threat?","summary":" The evolution of artificial intelligence (AI) has catalyzed a transformation\nin digital content generation, with profound implications for cyber influence\noperations. This report delves into the potential and limitations of generative\ndeep learning models, such as diffusion models, in fabricating convincing\nsynthetic images. We critically assess the accessibility, practicality, and\noutput quality of these tools and their implications in threat scenarios of\ndeception, influence, and subversion. Notably, the report generates content for\nseveral hypothetical cyber influence operations to demonstrate the current\ncapabilities and limitations of these AI-driven methods for threat actors.\nWhile generative models excel at producing illustrations and non-realistic\nimagery, creating convincing photo-realistic content remains a significant\nchallenge, limited by computational resources and the necessity for\nhuman-guided refinement. Our exploration underscores the delicate balance\nbetween technological advancement and its potential for misuse, prompting\nrecommendations for ongoing research, defense mechanisms, multi-disciplinary\ncollaboration, and policy development. These recommendations aim to leverage\nAI's potential for positive impact while safeguarding against its risks to the\nintegrity of information, especially in the context of cyber influence.\n","authors":["Melanie Mathys","Marco Willi","Michael Graber","Raphael Meier"],"pdf_url":"https://arxiv.org/pdf/2403.12207v1.pdf","comment":"44 pages, 56 figures"},{"id":"http://arxiv.org/abs/2403.12203v1","updated":"2024-03-18T19:25:57Z","published":"2024-03-18T19:25:57Z","title":"Bootstrapping Reinforcement Learning with Imitation for Vision-Based\n Agile Flight","summary":" We combine the effectiveness of Reinforcement Learning (RL) and the\nefficiency of Imitation Learning (IL) in the context of vision-based,\nautonomous drone racing. We focus on directly processing visual input without\nexplicit state estimation. While RL offers a general framework for learning\ncomplex controllers through trial and error, it faces challenges regarding\nsample efficiency and computational demands due to the high dimensionality of\nvisual inputs. Conversely, IL demonstrates efficiency in learning from visual\ndemonstrations but is limited by the quality of those demonstrations and faces\nissues like covariate shift. 
To overcome these limitations, we propose a novel\ntraining framework combining RL and IL's advantages. Our framework involves\nthree stages: initial training of a teacher policy using privileged state\ninformation, distilling this policy into a student policy using IL, and\nperformance-constrained adaptive RL fine-tuning. Our experiments in both\nsimulated and real-world environments demonstrate that our approach achieves\nsuperior performance and robustness than IL or RL alone in navigating a\nquadrotor through a racing course using only visual information without\nexplicit state estimation.\n","authors":["Jiaxu Xing","Angel Romero","Leonard Bauersfeld","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2403.12203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12202v1","updated":"2024-03-18T19:22:55Z","published":"2024-03-18T19:22:55Z","title":"DeCoTR: Enhancing Depth Completion with 2D and 3D Attentions","summary":" In this paper, we introduce a novel approach that harnesses both 2D and 3D\nattentions to enable highly accurate depth completion without requiring\niterative spatial propagations. Specifically, we first enhance a baseline\nconvolutional depth completion model by applying attention to 2D features in\nthe bottleneck and skip connections. This effectively improves the performance\nof this simple network and sets it on par with the latest, complex\ntransformer-based models. Leveraging the initial depths and features from this\nnetwork, we uplift the 2D features to form a 3D point cloud and construct a 3D\npoint transformer to process it, allowing the model to explicitly learn and\nexploit 3D geometric features. In addition, we propose normalization techniques\nto process the point cloud, which improves learning and leads to better\naccuracy than directly using point transformers off the shelf. Furthermore, we\nincorporate global attention on downsampled point cloud features, which enables\nlong-range context while still being computationally feasible. We evaluate our\nmethod, DeCoTR, on established depth completion benchmarks, including NYU Depth\nV2 and KITTI, showcasing that it sets new state-of-the-art performance. We\nfurther conduct zero-shot evaluations on ScanNet and DDAD benchmarks and\ndemonstrate that DeCoTR has superior generalizability compared to existing\napproaches.\n","authors":["Yunxiao Shi","Manish Kumar Singh","Hong Cai","Fatih Porikli"],"pdf_url":"https://arxiv.org/pdf/2403.12202v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12198v1","updated":"2024-03-18T19:13:02Z","published":"2024-03-18T19:13:02Z","title":"FLex: Joint Pose and Dynamic Radiance Fields Optimization for Stereo\n Endoscopic Videos","summary":" Reconstruction of endoscopic scenes is an important asset for various medical\napplications, from post-surgery analysis to educational training. Neural\nrendering has recently shown promising results in endoscopic reconstruction\nwith deforming tissue. However, the setup has been restricted to a static\nendoscope, limited deformation, or required an external tracking device to\nretrieve camera pose information of the endoscopic camera. With FLex we adress\nthe challenging setup of a moving endoscope within a highly dynamic environment\nof deforming tissue. 
We propose an implicit scene separation into multiple\noverlapping 4D neural radiance fields (NeRFs) and a progressive optimization\nscheme jointly optimizing for reconstruction and camera poses from scratch.\nThis improves the ease-of-use and allows to scale reconstruction capabilities\nin time to process surgical videos of 5,000 frames and more; an improvement of\nmore than ten times compared to the state of the art while being agnostic to\nexternal tracking information. Extensive evaluations on the StereoMIS dataset\nshow that FLex significantly improves the quality of novel view synthesis while\nmaintaining competitive pose accuracy.\n","authors":["Florian Philipp Stilz","Mert Asim Karaoglu","Felix Tristram","Nassir Navab","Benjamin Busam","Alexander Ladikos"],"pdf_url":"https://arxiv.org/pdf/2403.12198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12197v1","updated":"2024-03-18T19:11:34Z","published":"2024-03-18T19:11:34Z","title":"E2F-Net: Eyes-to-Face Inpainting via StyleGAN Latent Space","summary":" Face inpainting, the technique of restoring missing or damaged regions in\nfacial images, is pivotal for applications like face recognition in occluded\nscenarios and image analysis with poor-quality captures. This process not only\nneeds to produce realistic visuals but also preserve individual identity\ncharacteristics. The aim of this paper is to inpaint a face given periocular\nregion (eyes-to-face) through a proposed new Generative Adversarial Network\n(GAN)-based model called Eyes-to-Face Network (E2F-Net). The proposed approach\nextracts identity and non-identity features from the periocular region using\ntwo dedicated encoders have been used. The extracted features are then mapped\nto the latent space of a pre-trained StyleGAN generator to benefit from its\nstate-of-the-art performance and its rich, diverse and expressive latent space\nwithout any additional training. We further improve the StyleGAN output to find\nthe optimal code in the latent space using a new optimization for GAN inversion\ntechnique. Our E2F-Net requires a minimum training process reducing the\ncomputational complexity as a secondary benefit. Through extensive experiments,\nwe show that our method successfully reconstructs the whole face with high\nquality, surpassing current techniques, despite significantly less training and\nsupervision efforts. We have generated seven eyes-to-face datasets based on\nwell-known public face datasets for training and verifying our proposed\nmethods. The code and datasets are publicly available.\n","authors":["Ahmad Hassanpour","Fatemeh Jamalbafrani","Bian Yang","Kiran Raja","Raymond Veldhuis","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2403.12197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12194v1","updated":"2024-03-18T19:07:42Z","published":"2024-03-18T19:07:42Z","title":"The POLAR Traverse Dataset: A Dataset of Stereo Camera Images Simulating\n Traverses across Lunar Polar Terrain under Extreme Lighting Conditions","summary":" We present the POLAR Traverse Dataset: a dataset of high-fidelity stereo pair\nimages of lunar-like terrain under polar lighting conditions designed to\nsimulate a straight-line traverse. Images from individual traverses with\ndifferent camera heights and pitches were recorded at 1 m intervals by moving a\nsuspended stereo bar across a test bed filled with regolith simulant and shaped\nto mimic lunar south polar terrain. Ground truth geometry and camera position\ninformation was also recorded. 
This dataset is intended for developing and\ntesting software algorithms that rely on stereo or monocular camera images,\nsuch as visual odometry, for use in the lunar polar environment, as well as to\nprovide insight into the expected lighting conditions in lunar polar regions.\n","authors":["Margaret Hansen","Uland Wong","Terrence Fong"],"pdf_url":"https://arxiv.org/pdf/2403.12194v1.pdf","comment":"6 pages, 5 figures, 3 tables. Associated dataset can be found at\n https://ti.arc.nasa.gov/dataset/PolarTrav/"},{"id":"http://arxiv.org/abs/2403.09875v2","updated":"2024-03-18T18:46:13Z","published":"2024-03-14T21:09:59Z","title":"Touch-GS: Visual-Tactile Supervised 3D Gaussian Splatting","summary":" In this work, we propose a novel method to supervise 3D Gaussian Splatting\n(3DGS) scenes using optical tactile sensors. Optical tactile sensors have\nbecome widespread in their use in robotics for manipulation and object\nrepresentation; however, raw optical tactile sensor data is unsuitable to\ndirectly supervise a 3DGS scene. Our representation leverages a Gaussian\nProcess Implicit Surface to implicitly represent the object, combining many\ntouches into a unified representation with uncertainty. We merge this model\nwith a monocular depth estimation network, which is aligned in a two stage\nprocess, coarsely aligning with a depth camera and then finely adjusting to\nmatch our touch data. For every training image, our method produces a\ncorresponding fused depth and uncertainty map. Utilizing this additional\ninformation, we propose a new loss function, variance weighted depth supervised\nloss, for training the 3DGS scene model. We leverage the DenseTact optical\ntactile sensor and RealSense RGB-D camera to show that combining touch and\nvision in this manner leads to quantitatively and qualitatively better results\nthan vision or touch alone in a few-view scene syntheses on opaque as well as\non reflective and transparent objects. Please see our project page at\nhttp://armlabstanford.github.io/touch-gs\n","authors":["Aiden Swann","Matthew Strong","Won Kyung Do","Gadiel Sznaier Camps","Mac Schwager","Monroe Kennedy III"],"pdf_url":"https://arxiv.org/pdf/2403.09875v2.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.12172v1","updated":"2024-03-18T18:42:32Z","published":"2024-03-18T18:42:32Z","title":"Graph-Jigsaw Conditioned Diffusion Model for Skeleton-based Video\n Anomaly Detection","summary":" Skeleton-based video anomaly detection (SVAD) is a crucial task in computer\nvision. Accurately identifying abnormal patterns or events enables operators to\npromptly detect suspicious activities, thereby enhancing safety. Achieving this\ndemands a comprehensive understanding of human motions, both at body and region\nlevels, while also accounting for the wide variations of performing a single\naction. However, existing studies fail to simultaneously address these crucial\nproperties. 
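On the variance-weighted depth supervised loss mentioned in the Touch-GS entry above, one plausible form is an inverse-variance weighting of a per-pixel depth error; the abstract does not spell out the exact weighting, so the following is an assumption for illustration only.

```python
import torch

def variance_weighted_depth_loss(pred_depth, fused_depth, variance, eps=1e-6):
    """Inverse-variance weighted L1 depth loss (assumed form).

    pred_depth, fused_depth, variance: (H, W) tensors holding the rendered
    depth, the fused touch/vision depth map and its per-pixel uncertainty.
    Pixels the fused model is unsure about (large variance) contribute less.
    """
    weights = 1.0 / (variance + eps)
    return (weights * (pred_depth - fused_depth).abs()).sum() / weights.sum()
```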
This paper introduces a novel, practical and lightweight framework,\nnamely Graph-Jigsaw Conditioned Diffusion Model for Skeleton-based Video\nAnomaly Detection (GiCiSAD) to overcome the challenges associated with SVAD.\nGiCiSAD consists of three novel modules: the Graph Attention-based Forecasting\nmodule to capture the spatio-temporal dependencies inherent in the data, the\nGraph-level Jigsaw Puzzle Maker module to distinguish subtle region-level\ndiscrepancies between normal and abnormal motions, and the Graph-based\nConditional Diffusion model to generate a wide spectrum of human motions.\nExtensive experiments on four widely used skeleton-based video datasets show\nthat GiCiSAD outperforms existing methods with significantly fewer training\nparameters, establishing it as the new state-of-the-art.\n","authors":["Ali Karami","Thi Kieu Khanh Ho","Narges Armanfard"],"pdf_url":"https://arxiv.org/pdf/2403.12172v1.pdf","comment":"18 pages, 2 figures, 6 tables"},{"id":"http://arxiv.org/abs/2403.12167v1","updated":"2024-03-18T18:35:32Z","published":"2024-03-18T18:35:32Z","title":"Generalizing deep learning models for medical image classification","summary":" Numerous Deep Learning (DL) models have been developed for a large spectrum\nof medical image analysis applications, which promises to reshape various\nfacets of medical practice. Despite early advances in DL model validation and\nimplementation, which encourage healthcare institutions to adopt them, some\nfundamental questions remain: are the DL models capable of generalizing? What\ncauses a drop in DL model performances? How to overcome the DL model\nperformance drop? Medical data are dynamic and prone to domain shift, due to\nmultiple factors such as updates to medical equipment, new imaging workflow,\nand shifts in patient demographics or populations can induce this drift over\ntime. In this paper, we review recent developments in generalization methods\nfor DL-based classification models. We also discuss future challenges,\nincluding the need for improved evaluation protocols and benchmarks, and\nenvisioned future developments to achieve robust, generalized models for\nmedical image classification.\n","authors":["Matta Sarah","Lamard Mathieu","Zhang Philippe","Alexandre Le Guilcher","Laurent Borderie","Béatrice Cochener","Gwenolé Quellec"],"pdf_url":"https://arxiv.org/pdf/2403.12167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12154v1","updated":"2024-03-18T18:10:34Z","published":"2024-03-18T18:10:34Z","title":"ThermoNeRF: Multimodal Neural Radiance Fields for Thermal Novel View\n Synthesis","summary":" Thermal scene reconstruction exhibit great potential for applications across\na broad spectrum of fields, including building energy consumption analysis and\nnon-destructive testing. However, existing methods typically require dense\nscene measurements and often rely on RGB images for 3D geometry reconstruction,\nwith thermal information being projected post-reconstruction. This two-step\nstrategy, adopted due to the lack of texture in thermal images, can lead to\ndisparities between the geometry and temperatures of the reconstructed objects\nand those of the actual scene. To address this challenge, we propose\nThermoNeRF, a novel multimodal approach based on Neural Radiance Fields,\ncapable of rendering new RGB and thermal views of a scene jointly. 
To overcome\nthe lack of texture in thermal images, we use paired RGB and thermal images to\nlearn scene density, while distinct networks estimate color and temperature\ninformation. Furthermore, we introduce ThermoScenes, a new dataset to palliate\nthe lack of available RGB+thermal datasets for scene reconstruction.\nExperimental results validate that ThermoNeRF achieves accurate thermal image\nsynthesis, with an average mean absolute error of 1.5$^\\circ$C, an improvement\nof over 50% compared to using concatenated RGB+thermal data with Nerfacto, a\nstate-of-the-art NeRF method.\n","authors":["Mariam Hassan","Florent Forest","Olga Fink","Malcolm Mielle"],"pdf_url":"https://arxiv.org/pdf/2403.12154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03000v3","updated":"2024-03-18T18:09:58Z","published":"2023-06-05T16:10:21Z","title":"BeyondPixels: A Comprehensive Review of the Evolution of Neural Radiance\n Fields","summary":" Neural rendering combines ideas from classical computer graphics and machine\nlearning to synthesize images from real-world observations. NeRF, short for\nNeural Radiance Fields, is a recent innovation that uses AI algorithms to\ncreate 3D objects from 2D images. By leveraging an interpolation approach, NeRF\ncan produce new 3D reconstructed views of complicated scenes. Rather than\ndirectly restoring the whole 3D scene geometry, NeRF generates a volumetric\nrepresentation called a ``radiance field,'' which is capable of creating color\nand density for every point within the relevant 3D space. The broad appeal and\nnotoriety of NeRF make it imperative to examine the existing research on the\ntopic comprehensively. While previous surveys on 3D rendering have primarily\nfocused on traditional computer vision-based or deep learning-based approaches,\nonly a handful of them discuss the potential of NeRF. However, such surveys\nhave predominantly focused on NeRF's early contributions and have not explored\nits full potential. NeRF is a relatively new technique continuously being\ninvestigated for its capabilities and limitations. This survey reviews recent\nadvances in NeRF and categorizes them according to their architectural designs,\nespecially in the field of novel view synthesis.\n","authors":["AKM Shahariar Azad Rabby","Chengcui Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.03000v3.pdf","comment":"33 page, 7 figure, 5 table"},{"id":"http://arxiv.org/abs/2403.12152v1","updated":"2024-03-18T18:09:22Z","published":"2024-03-18T18:09:22Z","title":"Development of Automated Neural Network Prediction for Echocardiographic\n Left ventricular Ejection Fraction","summary":" The echocardiographic measurement of left ventricular ejection fraction\n(LVEF) is fundamental to the diagnosis and classification of patients with\nheart failure (HF). In order to quantify LVEF automatically and accurately,\nthis paper proposes a new pipeline method based on deep neural networks and\nensemble learning. Within the pipeline, an Atrous Convolutional Neural Network\n(ACNN) was first trained to segment the left ventricle (LV), before employing\nthe area-length formulation based on the ellipsoid single-plane model to\ncalculate LVEF values. This formulation required inputs of LV area, derived\nfrom segmentation using an improved Jeffrey's method, as well as LV length,\nderived from a novel ensemble learning model. 
To further improve the pipeline's\naccuracy, an automated peak detection algorithm was used to identify\nend-diastolic and end-systolic frames, avoiding issues with human error.\nSubsequently, single-beat LVEF values were averaged across all cardiac cycles\nto obtain the final LVEF. This method was developed and internally validated in\nan open-source dataset containing 10,030 echocardiograms. The Pearson's\ncorrelation coefficient was 0.83 for LVEF prediction compared to expert human\nanalysis (p<0.001), with a subsequent area under the receiver operator curve\n(AUROC) of 0.98 (95% confidence interval 0.97 to 0.99) for categorisation of HF\nwith reduced ejection (HFrEF; LVEF<40%). In an external dataset with 200\nechocardiograms, this method achieved an AUC of 0.90 (95% confidence interval\n0.88 to 0.91) for HFrEF assessment. This study demonstrates that an automated\nneural network-based calculation of LVEF is comparable to expert clinicians\nperforming time-consuming, frame-by-frame manual evaluation of cardiac systolic\nfunction.\n","authors":["Yuting Zhang","Boyang Liu","Karina V. Bunting","David Brind","Alexander Thorley","Andreas Karwath","Wenqi Lu","Diwei Zhou","Xiaoxia Wang","Alastair R. Mobley","Otilia Tica","Georgios Gkoutos","Dipak Kotecha","Jinming Duan"],"pdf_url":"https://arxiv.org/pdf/2403.12152v1.pdf","comment":"Accepted to Frontiers in Medicine"},{"id":"http://arxiv.org/abs/2403.12151v1","updated":"2024-03-18T18:08:44Z","published":"2024-03-18T18:08:44Z","title":"Fusing Domain-Specific Content from Large Language Models into Knowledge\n Graphs for Enhanced Zero Shot Object State Classification","summary":" Domain-specific knowledge can significantly contribute to addressing a wide\nvariety of vision tasks. However, the generation of such knowledge entails\nconsiderable human labor and time costs. This study investigates the potential\nof Large Language Models (LLMs) in generating and providing domain-specific\ninformation through semantic embeddings. To achieve this, an LLM is integrated\ninto a pipeline that utilizes Knowledge Graphs and pre-trained semantic vectors\nin the context of the Vision-based Zero-shot Object State Classification task.\nWe thoroughly examine the behavior of the LLM through an extensive ablation\nstudy. Our findings reveal that the integration of LLM-based embeddings, in\ncombination with general-purpose pre-trained embeddings, leads to substantial\nperformance improvements. Drawing insights from this ablation study, we conduct\na comparative analysis against competing models, thereby highlighting the\nstate-of-the-art performance achieved by the proposed approach.\n","authors":["Filippos Gouidis","Katerina Papantoniou","Konstantinos Papoutsakis Theodore Patkos","Antonis Argyros","Dimitris Plexousakis"],"pdf_url":"https://arxiv.org/pdf/2403.12151v1.pdf","comment":"Accepted at the AAAI-MAKE 24"},{"id":"http://arxiv.org/abs/2403.09920v2","updated":"2024-03-18T18:04:39Z","published":"2024-03-14T23:41:00Z","title":"Predicting Generalization of AI Colonoscopy Models to Unseen Data","summary":" Background: Generalizability of AI colonoscopy algorithms is important for\nwider adoption in clinical practice. However, current techniques for evaluating\nperformance on unseen data require expensive and time-intensive labels.\n Methods: We use a \"Masked Siamese Network\" (MSN) to identify novel phenomena\nin unseen data and predict polyp detector performance. MSN is trained to\npredict masked out regions of polyp images, without any labels. 
We test MSN's\nability to be trained on data only from Israel and detect unseen techniques,\nnarrow-band imaging (NBI) and chromoendoscopy (CE), on colonoscopes from Japan\n(354 videos, 128 hours). We also test MSN's ability to predict performance of\nComputer Aided Detection (CADe) of polyps on colonoscopies from both countries,\neven though MSN is not trained on data from Japan.\n Results: MSN correctly identifies NBI and CE as less similar to Israel\nwhitelight than Japan whitelight (bootstrapped z-test, |z| > 496, p < 10^-8 for\nboth) using the label-free Frechet distance. MSN detects NBI with 99% accuracy,\npredicts CE better than our heuristic (90% vs 79% accuracy) despite being\ntrained only on whitelight, and is the only method that is robust to noisy\nlabels. MSN predicts CADe polyp detector performance on in-domain Israel and\nout-of-domain Japan colonoscopies (r=0.79, 0.37 respectively). With few\nexamples of Japan detector performance to train on, MSN prediction of Japan\nperformance improves (r=0.56).\n Conclusion: Our technique can identify distribution shifts in clinical data\nand can predict CADe detector performance on unseen data, without labels. Our\nself-supervised approach can aid in detecting when data in practice is\ndifferent from training, such as between hospitals or when data has\nmeaningfully shifted from training. MSN has potential for application to\nmedical image domains beyond colonoscopy.\n","authors":["Joel Shor","Carson McNeil","Yotam Intrator","Joseph R Ledsam","Hiro-o Yamano","Daisuke Tsurumaru","Hiroki Kayama","Atsushi Hamabe","Koji Ando","Mitsuhiko Ota","Haruei Ogino","Hiroshi Nakase","Kaho Kobayashi","Masaaki Miyo","Eiji Oki","Ichiro Takemasa","Ehud Rivlin","Roman Goldenberg"],"pdf_url":"https://arxiv.org/pdf/2403.09920v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12042v1","updated":"2024-03-18T17:59:58Z","published":"2024-03-18T17:59:58Z","title":"Exploring Pre-trained Text-to-Video Diffusion Models for Referring Video\n Object Segmentation","summary":" In this paper, we explore the visual representations produced from a\npre-trained text-to-video (T2V) diffusion model for video understanding tasks.\nWe hypothesize that the latent representation learned from a pretrained\ngenerative T2V model encapsulates rich semantics and coherent temporal\ncorrespondences, thereby naturally facilitating video understanding. Our\nhypothesis is validated through the classic referring video object segmentation\n(R-VOS) task. We introduce a novel framework, termed ``VD-IT'', tailored with\ndedicatedly designed components built upon a fixed pretrained T2V model.\nSpecifically, VD-IT uses textual information as a conditional input, ensuring\nsemantic consistency across time for precise temporal instance matching. It\nfurther incorporates image tokens as supplementary textual inputs, enriching\nthe feature set to generate detailed and nuanced masks. Besides, instead of\nusing the standard Gaussian noise, we propose to predict the video-specific\nnoise with an extra noise prediction module, which can help preserve the\nfeature fidelity and elevate segmentation quality. Through extensive\nexperiments, we surprisingly observe that fixed generative T2V diffusion\nmodels, unlike commonly used video backbones (e.g., Video Swin Transformer)\npretrained with discriminative image/video pre-tasks, exhibit better potential\nto maintain semantic alignment and temporal consistency. 
On existing standard\nbenchmarks, our VD-IT achieves highly competitive results, surpassing many\nexisting state-of-the-art methods. The code will be available at\n\\url{https://github.com/buxiangzhiren/VD-IT}\n","authors":["Zixin Zhu","Xuelu Feng","Dongdong Chen","Junsong Yuan","Chunming Qiao","Gang Hua"],"pdf_url":"https://arxiv.org/pdf/2403.12042v1.pdf","comment":"The code will be available at\n \\url{https://github.com/buxiangzhiren/VD-IT}"},{"id":"http://arxiv.org/abs/2403.12040v1","updated":"2024-03-18T17:59:49Z","published":"2024-03-18T17:59:49Z","title":"Distilling Datasets Into Less Than One Image","summary":" Dataset distillation aims to compress a dataset into a much smaller one so\nthat a model trained on the distilled dataset achieves high accuracy. Current\nmethods frame this as maximizing the distilled classification accuracy for a\nbudget of K distilled images-per-class, where K is a positive integer. In this\npaper, we push the boundaries of dataset distillation, compressing the dataset\ninto less than an image-per-class. It is important to realize that the\nmeaningful quantity is not the number of distilled images-per-class but the\nnumber of distilled pixels-per-dataset. We therefore, propose Poster Dataset\nDistillation (PoDD), a new approach that distills the entire original dataset\ninto a single poster. The poster approach motivates new technical solutions for\ncreating training images and learnable labels. Our method can achieve\ncomparable or better performance with less than an image-per-class compared to\nexisting methods that use one image-per-class. Specifically, our method\nestablishes a new state-of-the-art performance on CIFAR-10, CIFAR-100, and\nCUB200 using as little as 0.3 images-per-class.\n","authors":["Asaf Shul","Eliahu Horwitz","Yedid Hoshen"],"pdf_url":"https://arxiv.org/pdf/2403.12040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12038v1","updated":"2024-03-18T17:59:47Z","published":"2024-03-18T17:59:47Z","title":"Zero-Shot Image Feature Consensus with Deep Functional Maps","summary":" Correspondences emerge from large-scale vision models trained for generative\nand discriminative tasks. This has been revealed and benchmarked by computing\ncorrespondence maps between pairs of images, using nearest neighbors on the\nfeature grids. Existing work has attempted to improve the quality of these\ncorrespondence maps by carefully mixing features from different sources, such\nas by combining the features of different layers or networks. We point out that\na better correspondence strategy is available, which directly imposes structure\non the correspondence field: the functional map. Wielding this simple\nmathematical tool, we lift the correspondence problem from the pixel space to\nthe function space and directly optimize for mappings that are globally\ncoherent. 
We demonstrate that our technique yields correspondences that are not\nonly smoother but also more accurate, with the possibility of better reflecting\nthe knowledge embedded in the large-scale vision models that we are studying.\nOur approach sets a new state-of-the-art on various dense correspondence tasks.\nWe also demonstrate our effectiveness in keypoint correspondence and affordance\nmap transfer.\n","authors":["Xinle Cheng","Congyue Deng","Adam Harley","Yixin Zhu","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2403.12038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12036v1","updated":"2024-03-18T17:59:40Z","published":"2024-03-18T17:59:40Z","title":"One-Step Image Translation with Text-to-Image Models","summary":" In this work, we address two limitations of existing conditional diffusion\nmodels: their slow inference speed due to the iterative denoising process and\ntheir reliance on paired data for model fine-tuning. To tackle these issues, we\nintroduce a general method for adapting a single-step diffusion model to new\ntasks and domains through adversarial learning objectives. Specifically, we\nconsolidate various modules of the vanilla latent diffusion model into a single\nend-to-end generator network with small trainable weights, enhancing its\nability to preserve the input image structure while reducing overfitting. We\ndemonstrate that, for unpaired settings, our model CycleGAN-Turbo outperforms\nexisting GAN-based and diffusion-based methods for various scene translation\ntasks, such as day-to-night conversion and adding/removing weather effects like\nfog, snow, and rain. We extend our method to paired settings, where our model\npix2pix-Turbo is on par with recent works like Control-Net for Sketch2Photo and\nEdge2Image, but with a single-step inference. This work suggests that\nsingle-step diffusion models can serve as strong backbones for a range of GAN\nlearning objectives. Our code and models are available at\nhttps://github.com/GaParmar/img2img-turbo.\n","authors":["Gaurav Parmar","Taesung Park","Srinivasa Narasimhan","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.12036v1.pdf","comment":"Github: https://github.com/GaParmar/img2img-turbo"},{"id":"http://arxiv.org/abs/2103.17271v2","updated":"2024-03-18T17:59:33Z","published":"2021-03-31T17:59:31Z","title":"DCVNet: Dilated Cost Volume Networks for Fast Optical Flow","summary":" The cost volume, capturing the similarity of possible correspondences across\ntwo input images, is a key ingredient in state-of-the-art optical flow\napproaches. When sampling correspondences to build the cost volume, a large\nneighborhood radius is required to deal with large displacements, introducing a\nsignificant computational burden. To address this, coarse-to-fine or recurrent\nprocessing of the cost volume is usually adopted, where correspondence sampling\nin a local neighborhood with a small radius suffices. In this paper, we propose\nan alternative by constructing cost volumes with different dilation factors to\ncapture small and large displacements simultaneously. A U-Net with skip\nconnections is employed to convert the dilated cost volumes into interpolation\nweights between all possible captured displacements to get the optical flow.\nOur proposed model DCVNet only needs to process the cost volume once in a\nsimple feedforward manner and does not rely on the sequential processing\nstrategy. DCVNet obtains comparable accuracy to existing approaches and\nachieves real-time inference (30 fps on a mid-end 1080ti GPU). 
The code and\nmodel weights are available at https://github.com/neu-vi/ezflow.\n","authors":["Huaizu Jiang","Erik Learned-Miller"],"pdf_url":"https://arxiv.org/pdf/2103.17271v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12035v1","updated":"2024-03-18T17:59:27Z","published":"2024-03-18T17:59:27Z","title":"CoCoCo: Improving Text-Guided Video Inpainting for Better Consistency,\n Controllability and Compatibility","summary":" Recent advancements in video generation have been remarkable, yet many\nexisting methods struggle with issues of consistency and poor text-video\nalignment. Moreover, the field lacks effective techniques for text-guided video\ninpainting, a stark contrast to the well-explored domain of text-guided image\ninpainting. To this end, this paper proposes a novel text-guided video\ninpainting model that achieves better consistency, controllability and\ncompatibility. Specifically, we introduce a simple but efficient motion capture\nmodule to preserve motion consistency, and design an instance-aware region\nselection instead of a random region selection to obtain better textual\ncontrollability, and utilize a novel strategy to inject some personalized\nmodels into our CoCoCo model and thus obtain better model compatibility.\nExtensive experiments show that our model can generate high-quality video\nclips. Meanwhile, our model shows better motion consistency, textual\ncontrollability and model compatibility. More details are shown in\n[cococozibojia.github.io](cococozibojia.github.io).\n","authors":["Bojia Zi","Shihao Zhao","Xianbiao Qi","Jianan Wang","Yukai Shi","Qianyu Chen","Bin Liang","Kam-Fai Wong","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.12035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12034v1","updated":"2024-03-18T17:59:12Z","published":"2024-03-18T17:59:12Z","title":"VFusion3D: Learning Scalable 3D Generative Models from Video Diffusion\n Models","summary":" This paper presents a novel paradigm for building scalable 3D generative\nmodels utilizing pre-trained video diffusion models. The primary obstacle in\ndeveloping foundation 3D generative models is the limited availability of 3D\ndata. Unlike images, texts, or videos, 3D data are not readily accessible and\nare difficult to acquire. This results in a significant disparity in scale\ncompared to the vast quantities of other types of data. To address this issue,\nwe propose using a video diffusion model, trained with extensive volumes of\ntext, images, and videos, as a knowledge source for 3D data. By unlocking its\nmulti-view generative capabilities through fine-tuning, we generate a\nlarge-scale synthetic multi-view dataset to train a feed-forward 3D generative\nmodel. 
The proposed model, VFusion3D, trained on nearly 3M synthetic multi-view\ndata, can generate a 3D asset from a single image in seconds and achieves\nsuperior performance when compared to current SOTA feed-forward 3D generative\nmodels, with users preferring our results over 70% of the time.\n","authors":["Junlin Han","Filippos Kokkinos","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2403.12034v1.pdf","comment":"Project page: https://junlinhan.github.io/projects/vfusion3d.html"},{"id":"http://arxiv.org/abs/2403.12033v1","updated":"2024-03-18T17:59:10Z","published":"2024-03-18T17:59:10Z","title":"HiKER-SGG: Hierarchical Knowledge Enhanced Robust Scene Graph Generation","summary":" Being able to understand visual scenes is a precursor for many downstream\ntasks, including autonomous driving, robotics, and other vision-based\napproaches. A common approach enabling the ability to reason over visual data\nis Scene Graph Generation (SGG); however, many existing approaches assume\nundisturbed vision, i.e., the absence of real-world corruptions such as fog,\nsnow, smoke, as well as non-uniform perturbations like sun glare or water\ndrops. In this work, we propose a novel SGG benchmark containing procedurally\ngenerated weather corruptions and other transformations over the Visual Genome\ndataset. Further, we introduce a corresponding approach, Hierarchical Knowledge\nEnhanced Robust Scene Graph Generation (HiKER-SGG), providing a strong baseline\nfor scene graph generation under such challenging setting. At its core,\nHiKER-SGG utilizes a hierarchical knowledge graph in order to refine its\npredictions from coarse initial estimates to detailed predictions. In our\nextensive experiments, we show that HiKER-SGG does not only demonstrate\nsuperior performance on corrupted images in a zero-shot manner, but also\noutperforms current state-of-the-art methods on uncorrupted SGG tasks. Code is\navailable at https://github.com/zhangce01/HiKER-SGG.\n","authors":["Ce Zhang","Simon Stepputtis","Joseph Campbell","Katia Sycara","Yaqi Xie"],"pdf_url":"https://arxiv.org/pdf/2403.12033v1.pdf","comment":"Accepted by CVPR 2024. Project page:\n https://zhangce01.github.io/HiKER-SGG"},{"id":"http://arxiv.org/abs/2403.12030v1","updated":"2024-03-18T17:58:13Z","published":"2024-03-18T17:58:13Z","title":"Expandable Subspace Ensemble for Pre-Trained Model-Based\n Class-Incremental Learning","summary":" Class-Incremental Learning (CIL) requires a learning system to continually\nlearn new classes without forgetting. Despite the strong performance of\nPre-Trained Models (PTMs) in CIL, a critical issue persists: learning new\nclasses often results in the overwriting of old ones. Excessive modification of\nthe network causes forgetting, while minimal adjustments lead to an inadequate\nfit for new classes. As a result, it is desired to figure out a way of\nefficient model updating without harming former knowledge. In this paper, we\npropose ExpAndable Subspace Ensemble (EASE) for PTM-based CIL. To enable model\nupdating without conflict, we train a distinct lightweight adapter module for\neach new task, aiming to create task-specific subspaces. These adapters span a\nhigh-dimensional feature space, enabling joint decision-making across multiple\nsubspaces. As data evolves, the expanding subspaces render the old class\nclassifiers incompatible with new-stage spaces. Correspondingly, we design a\nsemantic-guided prototype complement strategy that synthesizes old classes' new\nfeatures without using any old class instance. 
Extensive experiments on seven\nbenchmark datasets verify EASE's state-of-the-art performance. Code is\navailable at: https://github.com/sun-hailong/CVPR24-Ease\n","authors":["Da-Wei Zhou","Hai-Long Sun","Han-Jia Ye","De-Chuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2403.12030v1.pdf","comment":"Accepted to CVPR 2024. Code is available at:\n https://github.com/sun-hailong/CVPR24-Ease"},{"id":"http://arxiv.org/abs/2402.12712v2","updated":"2024-03-18T17:58:05Z","published":"2024-02-20T04:25:57Z","title":"MVDiffusion++: A Dense High-resolution Multi-view Diffusion Model for\n Single or Sparse-view 3D Object Reconstruction","summary":" This paper presents a neural architecture MVDiffusion++ for 3D object\nreconstruction that synthesizes dense and high-resolution views of an object\ngiven one or a few images without camera poses. MVDiffusion++ achieves superior\nflexibility and scalability with two surprisingly simple ideas: 1) A\n``pose-free architecture'' where standard self-attention among 2D latent\nfeatures learns 3D consistency across an arbitrary number of conditional and\ngeneration views without explicitly using camera pose information; and 2) A\n``view dropout strategy'' that discards a substantial number of output views\nduring training, which reduces the training-time memory footprint and enables\ndense and high-resolution view synthesis at test time. We use the Objaverse for\ntraining and the Google Scanned Objects for evaluation with standard novel view\nsynthesis and 3D reconstruction metrics, where MVDiffusion++ significantly\noutperforms the current state of the arts. We also demonstrate a text-to-3D\napplication example by combining MVDiffusion++ with a text-to-image generative\nmodel. The project page is at https://mvdiffusion-plusplus.github.io.\n","authors":["Shitao Tang","Jiacheng Chen","Dilin Wang","Chengzhou Tang","Fuyang Zhang","Yuchen Fan","Vikas Chandra","Yasutaka Furukawa","Rakesh Ranjan"],"pdf_url":"https://arxiv.org/pdf/2402.12712v2.pdf","comment":"3D generation, project page: https://mvdiffusion-plusplus.github.io/"},{"id":"http://arxiv.org/abs/2403.12029v1","updated":"2024-03-18T17:58:02Z","published":"2024-03-18T17:58:02Z","title":"Align and Distill: Unifying and Improving Domain Adaptive Object\n Detection","summary":" Object detectors often perform poorly on data that differs from their\ntraining set. Domain adaptive object detection (DAOD) methods have recently\ndemonstrated strong results on addressing this challenge. Unfortunately, we\nidentify systemic benchmarking pitfalls that call past results into question\nand hamper further progress: (a) Overestimation of performance due to\nunderpowered baselines, (b) Inconsistent implementation practices preventing\ntransparent comparisons of methods, and (c) Lack of generality due to outdated\nbackbones and lack of diversity in benchmarks. We address these problems by\nintroducing: (1) A unified benchmarking and implementation framework, Align and\nDistill (ALDI), enabling comparison of DAOD methods and supporting future\ndevelopment, (2) A fair and modern training and evaluation protocol for DAOD\nthat addresses benchmarking pitfalls, (3) A new DAOD benchmark dataset,\nCFC-DAOD, enabling evaluation on diverse real-world data, and (4) A new method,\nALDI++, that achieves state-of-the-art results by a large margin. 
ALDI++\noutperforms the previous state-of-the-art by +3.5 AP50 on Cityscapes to Foggy\nCityscapes, +5.7 AP50 on Sim10k to Cityscapes (where ours is the only method to\noutperform a fair baseline), and +2.0 AP50 on CFC Kenai to Channel. Our\nframework, dataset, and state-of-the-art method offer a critical reset for DAOD\nand provide a strong foundation for future research. Code and data are\navailable: https://github.com/justinkay/aldi and\nhttps://github.com/visipedia/caltech-fish-counting.\n","authors":["Justin Kay","Timm Haucke","Suzanne Stathatos","Siqi Deng","Erik Young","Pietro Perona","Sara Beery","Grant Van Horn"],"pdf_url":"https://arxiv.org/pdf/2403.12029v1.pdf","comment":"30 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.12028v1","updated":"2024-03-18T17:57:30Z","published":"2024-03-18T17:57:30Z","title":"Ultraman: Single Image 3D Human Reconstruction with Ultra Speed and\n Detail","summary":" 3D human body reconstruction has been a challenge in the field of computer\nvision. Previous methods are often time-consuming and difficult to capture the\ndetailed appearance of the human body. In this paper, we propose a new method\ncalled \\emph{Ultraman} for fast reconstruction of textured 3D human models from\na single image. Compared to existing techniques, \\emph{Ultraman} greatly\nimproves the reconstruction speed and accuracy while preserving high-quality\ntexture details. We present a set of new frameworks for human reconstruction\nconsisting of three parts, geometric reconstruction, texture generation and\ntexture mapping. Firstly, a mesh reconstruction framework is used, which\naccurately extracts 3D human shapes from a single image. At the same time, we\npropose a method to generate a multi-view consistent image of the human body\nbased on a single image. This is finally combined with a novel texture mapping\nmethod to optimize texture details and ensure color consistency during\nreconstruction. Through extensive experiments and evaluations, we demonstrate\nthe superior performance of \\emph{Ultraman} on various standard datasets. In\naddition, \\emph{Ultraman} outperforms state-of-the-art methods in terms of\nhuman rendering quality and speed. Upon acceptance of the article, we will make\nthe code and data publicly available.\n","authors":["Mingjin Chen","Junhao Chen","Xiaojun Ye","Huan-ang Gao","Xiaoxue Chen","Zhaoxin Fan","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.12028v1.pdf","comment":"Project Page: https://air-discover.github.io/Ultraman/"},{"id":"http://arxiv.org/abs/2403.12027v1","updated":"2024-03-18T17:57:09Z","published":"2024-03-18T17:57:09Z","title":"From Pixels to Insights: A Survey on Automatic Chart Understanding in\n the Era of Large Foundation Models","summary":" Data visualization in the form of charts plays a pivotal role in data\nanalysis, offering critical insights and aiding in informed decision-making.\nAutomatic chart understanding has witnessed significant advancements with the\nrise of large foundation models in recent years. Foundation models, such as\nlarge language models (LLMs), have revolutionized various natural language\nprocessing (NLP) tasks and are increasingly being applied to chart\nunderstanding tasks. This survey paper provides a comprehensive overview of the\nrecent developments, challenges, and future directions in chart understanding\nwithin the context of these foundation models. 
The paper begins by defining\nchart understanding, outlining problem formulations, and discussing fundamental\nbuilding blocks crucial for studying chart understanding tasks. In the section\non tasks and datasets, we explore various tasks within chart understanding and\ndiscuss their evaluation metrics and sources of both charts and textual inputs.\nModeling strategies are then examined, encompassing both classification-based\nand generation-based approaches, along with tool augmentation techniques that\nenhance chart understanding performance. Furthermore, we discuss the\nstate-of-the-art performance of each task and discuss how we can improve the\nperformance. Challenges and future directions are addressed in a dedicated\nsection, highlighting issues such as domain-specific charts, lack of efforts in\nevaluation, and agent-oriented settings. This survey paper serves to provide\nvaluable insights and directions for future research in chart understanding\nleveraging large foundation models. The studies mentioned in this paper, along\nwith emerging new research, will be continually updated at:\nhttps://github.com/khuangaf/Awesome-Chart-Understanding.\n","authors":["Kung-Hsiang Huang","Hou Pong Chan","Yi R. Fung","Haoyi Qiu","Mingyang Zhou","Shafiq Joty","Shih-Fu Chang","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2403.12027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12026v1","updated":"2024-03-18T17:57:02Z","published":"2024-03-18T17:57:02Z","title":"FlexCap: Generating Rich, Localized, and Flexible Captions in Images","summary":" We introduce a versatile $\\textit{flexible-captioning}$ vision-language model\n(VLM) capable of generating region-specific descriptions of varying lengths.\nThe model, FlexCap, is trained to produce length-conditioned captions for input\nbounding boxes, and this allows control over the information density of its\noutput, with descriptions ranging from concise object labels to detailed\ncaptions. To achieve this we create large-scale training datasets of image\nregion descriptions of varying length, starting from captioned images. This\nflexible-captioning capability has several valuable applications.\n First, FlexCap demonstrates superior performance in dense captioning tasks on\nthe Visual Genome dataset. Second, a visual question answering (VQA) system can\nbe built by employing FlexCap to generate localized descriptions as inputs to a\nlarge language model. The resulting system achieves state-of-the-art zero-shot\nperformance on a number of VQA datasets. We also demonstrate a\n$\\textit{localize-then-describe}$ approach with FlexCap can be better at\nopen-ended object detection than a $\\textit{describe-then-localize}$ approach\nwith other VLMs. 
We highlight a novel characteristic of FlexCap, which is its\nability to extract diverse visual information through prefix conditioning.\nFinally, we qualitatively demonstrate FlexCap's broad applicability in tasks\nsuch as image labeling, object attribute recognition, and visual dialog.\nProject webpage: https://flex-cap.github.io .\n","authors":["Debidatta Dwibedi","Vidhi Jain","Jonathan Tompson","Andrew Zisserman","Yusuf Aytar"],"pdf_url":"https://arxiv.org/pdf/2403.12026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12019v1","updated":"2024-03-18T17:54:34Z","published":"2024-03-18T17:54:34Z","title":"LN3Diff: Scalable Latent Neural Fields Diffusion for Speedy 3D\n Generation","summary":" The field of neural rendering has witnessed significant progress with\nadvancements in generative models and differentiable rendering techniques.\nThough 2D diffusion has achieved success, a unified 3D diffusion pipeline\nremains unsettled. This paper introduces a novel framework called LN3Diff to\naddress this gap and enable fast, high-quality, and generic conditional 3D\ngeneration. Our approach harnesses a 3D-aware architecture and variational\nautoencoder (VAE) to encode the input image into a structured, compact, and 3D\nlatent space. The latent is decoded by a transformer-based decoder into a\nhigh-capacity 3D neural field. Through training a diffusion model on this\n3D-aware latent space, our method achieves state-of-the-art performance on\nShapeNet for 3D generation and demonstrates superior performance in monocular\n3D reconstruction and conditional 3D generation across various datasets.\nMoreover, it surpasses existing 3D diffusion methods in terms of inference\nspeed, requiring no per-instance optimization. Our proposed LN3Diff presents a\nsignificant advancement in 3D generative modeling and holds promise for various\napplications in 3D vision and graphics tasks.\n","authors":["Yushi Lan","Fangzhou Hong","Shuai Yang","Shangchen Zhou","Xuyi Meng","Bo Dai","Xingang Pan","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2403.12019v1.pdf","comment":"project webpage: https://nirvanalan.github.io/projects/ln3diff/"},{"id":"http://arxiv.org/abs/2403.12015v1","updated":"2024-03-18T17:51:43Z","published":"2024-03-18T17:51:43Z","title":"Fast High-Resolution Image Synthesis with Latent Adversarial Diffusion\n Distillation","summary":" Diffusion models are the main driver of progress in image and video\nsynthesis, but suffer from slow inference speed. Distillation methods, like the\nrecently introduced adversarial diffusion distillation (ADD) aim to shift the\nmodel from many-shot to single-step inference, albeit at the cost of expensive\nand difficult optimization due to its reliance on a fixed pretrained DINOv2\ndiscriminator. We introduce Latent Adversarial Diffusion Distillation (LADD), a\nnovel distillation approach overcoming the limitations of ADD. In contrast to\npixel-based ADD, LADD utilizes generative features from pretrained latent\ndiffusion models. This approach simplifies training and enhances performance,\nenabling high-resolution multi-aspect ratio image synthesis. We apply LADD to\nStable Diffusion 3 (8B) to obtain SD3-Turbo, a fast model that matches the\nperformance of state-of-the-art text-to-image generators using only four\nunguided sampling steps. 
Moreover, we systematically investigate its scaling\nbehavior and demonstrate LADD's effectiveness in various applications such as\nimage editing and inpainting.\n","authors":["Axel Sauer","Frederic Boesel","Tim Dockhorn","Andreas Blattmann","Patrick Esser","Robin Rombach"],"pdf_url":"https://arxiv.org/pdf/2403.12015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12013v1","updated":"2024-03-18T17:50:41Z","published":"2024-03-18T17:50:41Z","title":"GeoWizard: Unleashing the Diffusion Priors for 3D Geometry Estimation\n from a Single Image","summary":" We introduce GeoWizard, a new generative foundation model designed for\nestimating geometric attributes, e.g., depth and normals, from single images.\nWhile significant research has already been conducted in this area, the\nprogress has been substantially limited by the low diversity and poor quality\nof publicly available datasets. As a result, the prior works either are\nconstrained to limited scenarios or suffer from the inability to capture\ngeometric details. In this paper, we demonstrate that generative models, as\nopposed to traditional discriminative models (e.g., CNNs and Transformers), can\neffectively address the inherently ill-posed problem. We further show that\nleveraging diffusion priors can markedly improve generalization, detail\npreservation, and efficiency in resource usage. Specifically, we extend the\noriginal stable diffusion model to jointly predict depth and normal, allowing\nmutual information exchange and high consistency between the two\nrepresentations. More importantly, we propose a simple yet effective strategy\nto segregate the complex data distribution of various scenes into distinct\nsub-distributions. This strategy enables our model to recognize different scene\nlayouts, capturing 3D geometry with remarkable fidelity. GeoWizard sets new\nbenchmarks for zero-shot depth and normal prediction, significantly enhancing\nmany downstream applications such as 3D reconstruction, 2D content creation,\nand novel viewpoint synthesis.\n","authors":["Xiao Fu","Wei Yin","Mu Hu","Kaixuan Wang","Yuexin Ma","Ping Tan","Shaojie Shen","Dahua Lin","Xiaoxiao Long"],"pdf_url":"https://arxiv.org/pdf/2403.12013v1.pdf","comment":"Project page: https://fuxiao0719.github.io/projects/geowizard/"},{"id":"http://arxiv.org/abs/2403.12011v1","updated":"2024-03-18T17:48:31Z","published":"2024-03-18T17:48:31Z","title":"HOIDiffusion: Generating Realistic 3D Hand-Object Interaction Data","summary":" 3D hand-object interaction data is scarce due to the hardware constraints in\nscaling up the data collection process. In this paper, we propose HOIDiffusion\nfor generating realistic and diverse 3D hand-object interaction data. Our model\nis a conditional diffusion model that takes both the 3D hand-object geometric\nstructure and text description as inputs for image synthesis. This offers a\nmore controllable and realistic synthesis as we can specify the structure and\nstyle inputs in a disentangled manner. HOIDiffusion is trained by leveraging a\ndiffusion model pre-trained on large-scale natural images and a few 3D human\ndemonstrations. Beyond controllable image synthesis, we adopt the generated 3D\ndata for learning 6D object pose estimation and show its effectiveness in\nimproving perception systems. 
Project page:\nhttps://mq-zhang1.github.io/HOIDiffusion\n","authors":["Mengqi Zhang","Yang Fu","Zheng Ding","Sifei Liu","Zhuowen Tu","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12011v1.pdf","comment":"Project page: https://mq-zhang1.github.io/HOIDiffusion"},{"id":"http://arxiv.org/abs/2403.12010v1","updated":"2024-03-18T17:48:15Z","published":"2024-03-18T17:48:15Z","title":"VideoMV: Consistent Multi-View Generation Based on Large Video\n Generative Model","summary":" Generating multi-view images based on text or single-image prompts is a\ncritical capability for the creation of 3D content. Two fundamental questions\non this topic are what data we use for training and how to ensure multi-view\nconsistency. This paper introduces a novel framework that makes fundamental\ncontributions to both questions. Unlike leveraging images from 2D diffusion\nmodels for training, we propose a dense consistent multi-view generation model\nthat is fine-tuned from off-the-shelf video generative models. Images from\nvideo generative models are more suitable for multi-view generation because the\nunderlying network architecture that generates them employs a temporal module\nto enforce frame consistency. Moreover, the video data sets used to train these\nmodels are abundant and diverse, leading to a reduced train-finetuning domain\ngap. To enhance multi-view consistency, we introduce a 3D-Aware Denoising\nSampling, which first employs a feed-forward reconstruction module to get an\nexplicit global 3D model, and then adopts a sampling strategy that effectively\ninvolves images rendered from the global 3D model into the denoising sampling\nloop to improve the multi-view consistency of the final images. As a\nby-product, this module also provides a fast way to create 3D assets\nrepresented by 3D Gaussians within a few seconds. Our approach can generate 24\ndense views and converges much faster in training than state-of-the-art\napproaches (4 GPU hours versus many thousand GPU hours) with comparable visual\nquality and consistency. By further fine-tuning, our approach outperforms\nexisting state-of-the-art methods in both quantitative metrics and visual\neffects. Our project page is aigc3d.github.io/VideoMV.\n","authors":["Qi Zuo","Xiaodong Gu","Lingteng Qiu","Yuan Dong","Zhengyi Zhao","Weihao Yuan","Rui Peng","Siyu Zhu","Zilong Dong","Liefeng Bo","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2403.12010v1.pdf","comment":"Project page: aigc3d.github.io/VideoMV/"},{"id":"http://arxiv.org/abs/2403.12008v1","updated":"2024-03-18T17:46:06Z","published":"2024-03-18T17:46:06Z","title":"SV3D: Novel Multi-view Synthesis and 3D Generation from a Single Image\n using Latent Video Diffusion","summary":" We present Stable Video 3D (SV3D) -- a latent video diffusion model for\nhigh-resolution, image-to-multi-view generation of orbital videos around a 3D\nobject. Recent work on 3D generation propose techniques to adapt 2D generative\nmodels for novel view synthesis (NVS) and 3D optimization. However, these\nmethods have several disadvantages due to either limited views or inconsistent\nNVS, thereby affecting the performance of 3D object generation. In this work,\nwe propose SV3D that adapts image-to-video diffusion model for novel multi-view\nsynthesis and 3D generation, thereby leveraging the generalization and\nmulti-view consistency of the video models, while further adding explicit\ncamera control for NVS. 
We also propose improved 3D optimization techniques to\nuse SV3D and its NVS outputs for image-to-3D generation. Extensive experimental\nresults on multiple datasets with 2D and 3D metrics as well as user study\ndemonstrate SV3D's state-of-the-art performance on NVS as well as 3D\nreconstruction compared to prior works.\n","authors":["Vikram Voleti","Chun-Han Yao","Mark Boss","Adam Letts","David Pankratz","Dmitry Tochilkin","Christian Laforte","Robin Rombach","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2403.12008v1.pdf","comment":"Project page: https://sv3d.github.io/"},{"id":"http://arxiv.org/abs/2403.12003v1","updated":"2024-03-18T17:41:26Z","published":"2024-03-18T17:41:26Z","title":"GenView: Enhancing View Quality with Pretrained Generative Model for\n Self-Supervised Learning","summary":" Self-supervised learning has achieved remarkable success in acquiring\nhigh-quality representations from unlabeled data. The widely adopted\ncontrastive learning framework aims to learn invariant representations by\nminimizing the distance between positive views originating from the same image.\nHowever, existing techniques to construct positive views highly rely on manual\ntransformations, resulting in limited diversity and potentially false positive\npairs. To tackle these challenges, we present GenView, a controllable framework\nthat augments the diversity of positive views leveraging the power of\npretrained generative models while preserving semantics. We develop an adaptive\nview generation method that dynamically adjusts the noise level in sampling to\nensure the preservation of essential semantic meaning while introducing\nvariability. Additionally, we introduce a quality-driven contrastive loss,\nwhich assesses the quality of positive pairs by considering both foreground\nsimilarity and background diversity. This loss prioritizes the high-quality\npositive pairs we construct while reducing the influence of low-quality pairs,\nthereby mitigating potential semantic inconsistencies introduced by generative\nmodels and aggressive data augmentation. Thanks to the improved positive view\nquality and the quality-driven contrastive loss, GenView significantly improves\nself-supervised learning across various tasks. For instance, GenView improves\nMoCov2 performance by 2.5%/2.2% on ImageNet linear/semi-supervised\nclassification. Moreover, GenView even performs much better than naively\naugmenting the ImageNet dataset with Laion400M or ImageNet21K. Code is\navailable at https://github.com/xiaojieli0903/genview.\n","authors":["Xiaojie Li","Yibo Yang","Xiangtai Li","Jianlong Wu","Yue Yu","Bernard Ghanem","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.12003v1.pdf","comment":"Code: https://github.com/xiaojieli0903/genview"},{"id":"http://arxiv.org/abs/2403.12002v1","updated":"2024-03-18T17:38:53Z","published":"2024-03-18T17:38:53Z","title":"DreamMotion: Space-Time Self-Similarity Score Distillation for Zero-Shot\n Video Editing","summary":" Text-driven diffusion-based video editing presents a unique challenge not\nencountered in image editing literature: establishing real-world motion. Unlike\nexisting video editing approaches, here we focus on score distillation sampling\nto circumvent the standard reverse diffusion process and initiate optimization\nfrom videos that already exhibit natural motion. 
Our analysis reveals that\nwhile video score distillation can effectively introduce new content indicated\nby target text, it can also cause significant structure and motion deviation.\nTo counteract this, we propose to match space-time self-similarities of the\noriginal video and the edited video during the score distillation. Thanks to\nthe use of score distillation, our approach is model-agnostic, which can be\napplied to both cascaded and non-cascaded video diffusion frameworks. Through\nextensive comparisons with leading methods, our approach demonstrates its\nsuperiority in altering appearances while accurately preserving the original\nstructure and motion.\n","authors":["Hyeonho Jeong","Jinho Chang","Geon Yeong Park","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2403.12002v1.pdf","comment":"Project page: https://hyeonho99.github.io/dreammotion/"},{"id":"http://arxiv.org/abs/2403.11999v1","updated":"2024-03-18T17:34:29Z","published":"2024-03-18T17:34:29Z","title":"HIRI-ViT: Scaling Vision Transformer with High Resolution Inputs","summary":" The hybrid deep models of Vision Transformer (ViT) and Convolution Neural\nNetwork (CNN) have emerged as a powerful class of backbones for vision tasks.\nScaling up the input resolution of such hybrid backbones naturally strengthens\nmodel capacity, but inevitably suffers from heavy computational cost that\nscales quadratically. Instead, we present a new hybrid backbone with\nHIgh-Resolution Inputs (namely HIRI-ViT), that upgrades prevalent four-stage\nViT to five-stage ViT tailored for high-resolution inputs. HIRI-ViT is built\nupon the seminal idea of decomposing the typical CNN operations into two\nparallel CNN branches in a cost-efficient manner. One high-resolution branch\ndirectly takes primary high-resolution features as inputs, but uses fewer\nconvolution operations. The other low-resolution branch first performs\ndown-sampling and then utilizes more convolution operations over such\nlow-resolution features. Experiments on both recognition task (ImageNet-1K\ndataset) and dense prediction tasks (COCO and ADE20K datasets) demonstrate the\nsuperiority of HIRI-ViT. More remarkably, under comparable computational cost\n($\\sim$5.0 GFLOPs), HIRI-ViT achieves to-date the best published Top-1 accuracy\nof 84.3% on ImageNet with 448$\\times$448 inputs, an absolute improvement of\n0.9% over the 83.4% of iFormer-S with 224$\\times$224 inputs.\n","authors":["Ting Yao","Yehao Li","Yingwei Pan","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2403.11999v1.pdf","comment":"IEEE Transactions on Pattern Analysis and Machine Intelligence\n (TPAMI)"},{"id":"http://arxiv.org/abs/2403.11990v1","updated":"2024-03-18T17:25:36Z","published":"2024-03-18T17:25:36Z","title":"GetMesh: A Controllable Model for High-quality Mesh Generation and\n Manipulation","summary":" Mesh is a fundamental representation of 3D assets in various industrial\napplications, and is widely supported by professional software. However, due\nto its irregular structure, mesh creation and manipulation are often\ntime-consuming and labor-intensive. In this paper, we propose a highly\ncontrollable generative model, GetMesh, for mesh generation and manipulation\nacross different categories. By taking a varying number of points as the latent\nrepresentation, and re-organizing them as triplane representation, GetMesh\ngenerates meshes with rich and sharp details, outperforming both\nsingle-category and multi-category counterparts. 
Moreover, it also enables\nfine-grained control over the generation process that previous mesh generative\nmodels cannot achieve, where changing global/local mesh topologies,\nadding/removing mesh parts, and combining mesh parts across categories can be\nintuitively, efficiently, and robustly accomplished by adjusting the number,\npositions or features of latent points. Project page is\nhttps://getmesh.github.io.\n","authors":["Zhaoyang Lyu","Ben Fei","Jinyi Wang","Xudong Xu","Ya Zhang","Weidong Yang","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2403.11990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11981v1","updated":"2024-03-18T17:17:07Z","published":"2024-03-18T17:17:07Z","title":"Diffusion Denoising as a Certified Defense against Clean-label Poisoning","summary":" We present a certified defense to clean-label poisoning attacks. These\nattacks work by injecting a small number of poisoning samples (e.g., 1%) that\ncontain $p$-norm bounded adversarial perturbations into the training data to\ninduce a targeted misclassification of a test-time input. Inspired by the\nadversarial robustness achieved by $denoised$ $smoothing$, we show how an\noff-the-shelf diffusion model can sanitize the tampered training data. We\nextensively test our defense against seven clean-label poisoning attacks and\nreduce their attack success to 0-16% with only a negligible drop in the test\ntime accuracy. We compare our defense with existing countermeasures against\nclean-label poisoning, showing that the defense reduces the attack success the\nmost and offers the best model utility. Our results highlight the need for\nfuture work on developing stronger clean-label attacks and using our certified\nyet practical defense as a strong baseline to evaluate these attacks.\n","authors":["Sanghyun Hong","Nicholas Carlini","Alexey Kurakin"],"pdf_url":"https://arxiv.org/pdf/2403.11981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11978v1","updated":"2024-03-18T17:13:18Z","published":"2024-03-18T17:13:18Z","title":"Pedestrian Tracking with Monocular Camera using Unconstrained 3D Motion\n Model","summary":" A first-principle single-object model is proposed for pedestrian tracking. It\nis assumed that the extent of the moving object can be described via known\nstatistics in 3D, such as pedestrian height. The proposed model thus need not\nconstrain the object motion in 3D to a common ground plane, which is usual in\n3D visual tracking applications. A nonlinear filter for this model is\nimplemented using the unscented Kalman filter (UKF) and tested using the\npublicly available MOT-17 dataset. The proposed solution yields promising\nresults in 3D while maintaining perfect results when projected into the 2D\nimage. Moreover, the estimation error covariance matches the true one. Unlike\nconventional methods, the introduced model parameters have convenient meaning\nand can readily be adjusted for a problem.\n","authors":["Jan Krejčí","Oliver Kost","Ondřej Straka","Jindřich Duník"],"pdf_url":"https://arxiv.org/pdf/2403.11978v1.pdf","comment":"Submitted to FUSION2024 conference"},{"id":"http://arxiv.org/abs/2403.11974v1","updated":"2024-03-18T17:12:00Z","published":"2024-03-18T17:12:00Z","title":"OUCopula: Bi-Channel Multi-Label Copula-Enhanced Adapter-Based CNN for\n Myopia Screening Based on OU-UWF Images","summary":" Myopia screening using cutting-edge ultra-widefield (UWF) fundus imaging is\npotentially significant for ophthalmic outcomes. 
Current multidisciplinary\nresearch between ophthalmology and deep learning (DL) concentrates primarily on\ndisease classification and diagnosis using single-eye images, largely ignoring\njoint modeling and prediction for Oculus Uterque (OU, both eyes). Inspired by\nthe complex relationships between OU and the high correlation between the\n(continuous) outcome labels (Spherical Equivalent and Axial Length), we propose\na framework of copula-enhanced adapter convolutional neural network (CNN)\nlearning with OU UWF fundus images (OUCopula) for joint prediction of multiple\nclinical scores. We design a novel bi-channel multi-label CNN that can (1) take\nbi-channel image inputs subject to both high correlation and heterogeneity (by\nsharing the same backbone network and employing adapters to parameterize the\nchannel-wise discrepancy), and (2) incorporate correlation information between\ncontinuous output labels (using a copula). Solid experiments show that OUCopula\nachieves satisfactory performance in myopia score prediction compared to\nbackbone models. Moreover, OUCopula can far exceed the performance of models\nconstructed for single-eye inputs. Importantly, our study also hints at the\npotential extension of the bi-channel model to a multi-channel paradigm and the\ngeneralizability of OUCopula across various backbone CNNs.\n","authors":["Yang Li","Qiuyi Huang","Chong Zhong","Danjuan Yang","Meiyan Li","A. H. Welsh","Aiyi Liu","Bo Fu","Catherien C. Liu","Xingtao Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.11974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11961v1","updated":"2024-03-18T16:58:23Z","published":"2024-03-18T16:58:23Z","title":"Enhanced Event-Based Video Reconstruction with Motion Compensation","summary":" Deep neural networks for event-based video reconstruction often suffer from a\nlack of interpretability and have high memory demands. A lightweight network\ncalled CISTA-LSTC has recently been introduced showing that high-quality\nreconstruction can be achieved through the systematic design of its\narchitecture. However, its modelling assumption that input signals and output\nreconstructed frame share the same sparse representation neglects the\ndisplacement caused by motion. To address this, we propose warping the input\nintensity frames and sparse codes to enhance reconstruction quality. A\nCISTA-Flow network is constructed by integrating a flow network with CISTA-LSTC\nfor motion compensation. The system relies solely on events, in which predicted\nflow aids in reconstruction and then reconstructed frames are used to\nfacilitate flow estimation. We also introduce an iterative training framework\nfor this combined system. Results demonstrate that our approach achieves\nstate-of-the-art reconstruction accuracy and simultaneously provides reliable\ndense flow estimation. Furthermore, our model exhibits flexibility in that it\ncan integrate different flow networks, suggesting its potential for further\nperformance enhancement.\n","authors":["Siying Liu","Pier Luigi Dragotti"],"pdf_url":"https://arxiv.org/pdf/2403.11961v1.pdf","comment":"22 pages, 8 figures (supplementary material included)"},{"id":"http://arxiv.org/abs/2403.11959v1","updated":"2024-03-18T16:56:47Z","published":"2024-03-18T16:56:47Z","title":"IVAC-P2L: Enhancing Video Action Counting through Irregular Repetition\n Priors","summary":" Video Action Counting (VAC) is crucial in analyzing sports, fitness, and\neveryday activities by quantifying repetitive actions in videos. 
However,\ntraditional VAC methods have overlooked the complexity of action repetitions,\nsuch as interruptions and the variability in cycle duration. Our research\naddresses the shortfall by introducing a novel approach to VAC, called\nIrregular Video Action Counting (IVAC). IVAC prioritizes modeling irregular\nrepetition patterns in videos, which we define through two primary aspects:\nInter-cycle Consistency and Cycle-interval Inconsistency. Inter-cycle\nConsistency ensures homogeneity in the spatial-temporal representations of\ncycle segments, signifying action uniformity within cycles. Cycle-interval\ninconsistency highlights the importance of distinguishing between cycle\nsegments and intervals based on their inherent content differences. To\nencapsulate these principles, we propose a new methodology that includes\nconsistency and inconsistency modules, supported by a unique pull-push loss\n(P2L) mechanism. The IVAC-P2L model applies a pull loss to promote coherence\namong cycle segment features and a push loss to clearly distinguish features of\ncycle segments from interval segments. Empirical evaluations conducted on the\nRepCount dataset demonstrate that the IVAC-P2L model sets a new benchmark in\nVAC task performance. Furthermore, the model demonstrates exceptional\nadaptability and generalization across various video contents, outperforming\nexisting models on two additional datasets, UCFRep and Countix, without the\nneed for dataset-specific optimization. These results confirm the efficacy of\nour approach in addressing irregular repetitions in videos and pave the way for\nfurther advancements in video analysis and understanding.\n","authors":["Hang Wang","Zhi-Qi Cheng","Youtian Du","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11959v1.pdf","comment":"Under continuous updates. Modified for arXiv"},{"id":"http://arxiv.org/abs/2403.11953v1","updated":"2024-03-18T16:50:13Z","published":"2024-03-18T16:50:13Z","title":"Advancing COVID-19 Detection in 3D CT Scans","summary":" To make a more accurate diagnosis of COVID-19, we propose a straightforward\nyet effective model. Firstly, we analyse the characteristics of 3D CT scans and\nremove the non-lung parts, facilitating the model to focus on lesion-related\nareas and reducing computational cost. We use ResNeSt50 as the strong feature\nextractor, initializing it with pretrained weights which have COVID-19-specific\nprior knowledge. Our model achieves a Macro F1 Score of 0.94 on the validation\nset of the 4th COV19D Competition Challenge $\\mathrm{I}$, surpassing the\nbaseline by 16%. This indicates its effectiveness in distinguishing between\nCOVID-19 and non-COVID-19 cases, making it a robust method for COVID-19\ndetection.\n","authors":["Qingqiu Li","Runtian Yuan","Junlin Hou","Jilan Xu","Yuejie Zhang","Rui Feng","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2403.11953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14091v3","updated":"2024-03-18T16:48:13Z","published":"2023-12-21T18:09:30Z","title":"HD-Painter: High-Resolution and Prompt-Faithful Text-Guided Image\n Inpainting with Diffusion Models","summary":" Recent progress in text-guided image inpainting, based on the unprecedented\nsuccess of text-to-image diffusion models, has led to exceptionally realistic\nand visually plausible results. However, there is still significant potential\nfor improvement in current text-to-image inpainting models, particularly in\nbetter aligning the inpainted area with user prompts and performing\nhigh-resolution inpainting. 
Therefore, we introduce HD-Painter, a training free\napproach that accurately follows prompts and coherently scales to high\nresolution image inpainting. To this end, we design the Prompt-Aware\nIntroverted Attention (PAIntA) layer enhancing self-attention scores by prompt\ninformation resulting in better text aligned generations. To further improve\nthe prompt coherence we introduce the Reweighting Attention Score Guidance\n(RASG) mechanism seamlessly integrating a post-hoc sampling strategy into the\ngeneral form of DDIM to prevent out-of-distribution latent shifts. Moreover,\nHD-Painter allows extension to larger scales by introducing a specialized\nsuper-resolution technique customized for inpainting, enabling the completion\nof missing regions in images of up to 2K resolution. Our experiments\ndemonstrate that HD-Painter surpasses existing state-of-the-art approaches\nquantitatively and qualitatively across multiple metrics and a user study. Code\nis publicly available at: https://github.com/Picsart-AI-Research/HD-Painter\n","authors":["Hayk Manukyan","Andranik Sargsyan","Barsegh Atanyan","Zhangyang Wang","Shant Navasardyan","Humphrey Shi"],"pdf_url":"https://arxiv.org/pdf/2312.14091v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03959v2","updated":"2024-03-18T16:40:17Z","published":"2023-04-08T09:01:37Z","title":"StillFast: An End-to-End Approach for Short-Term Object Interaction\n Anticipation","summary":" Anticipation problem has been studied considering different aspects such as\npredicting humans' locations, predicting hands and objects trajectories, and\nforecasting actions and human-object interactions. In this paper, we studied\nthe short-term object interaction anticipation problem from the egocentric\npoint of view, proposing a new end-to-end architecture named StillFast. Our\napproach simultaneously processes a still image and a video detecting and\nlocalizing next-active objects, predicting the verb which describes the future\ninteraction and determining when the interaction will start. Experiments on the\nlarge-scale egocentric dataset EGO4D show that our method outperformed\nstate-of-the-art approaches on the considered task. Our method is ranked first\nin the public leaderboard of the EGO4D short term object interaction\nanticipation challenge 2022. Please see the project web page for code and\nadditional details: https://iplab.dmi.unict.it/stillfast/.\n","authors":["Francesco Ragusa","Giovanni Maria Farinella","Antonino Furnari"],"pdf_url":"https://arxiv.org/pdf/2304.03959v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11936v1","updated":"2024-03-18T16:34:38Z","published":"2024-03-18T16:34:38Z","title":"AI-Assisted Cervical Cancer Screening","summary":" Visual Inspection with Acetic Acid (VIA) remains the most feasible cervical\ncancer screening test in resource-constrained settings of low- and\nmiddle-income countries (LMICs), which are often performed screening camps or\nprimary/community health centers by nurses instead of the preferred but\nunavailable expert Gynecologist. To address the highly subjective nature of the\ntest, various handheld devices integrating cameras or smartphones have been\nrecently explored to capture cervical images during VIA and aid decision-making\nvia telemedicine or AI models. 
Most studies proposing AI models retrospectively\nuse a relatively small number of already collected images from specific\ndevices, digital cameras, or smartphones; the challenges and protocol for\nquality image acquisition during VIA in resource-constrained camp settings,\nchallenges in getting gold standard, data imbalance, etc. are often overlooked.\nWe present a novel approach and describe the end-to-end design process to build\na robust smartphone-based AI-assisted system that does not require buying a\nseparate integrated device: the proposed protocol for quality image acquisition\nin resource-constrained settings, dataset collected from 1,430 women during VIA\nperformed by nurses in screening camps, preprocessing pipeline, and training\nand evaluation of a deep-learning-based classification model aimed to identify\n(pre)cancerous lesions. Our work shows that the readily available smartphones\nand a suitable protocol can capture the cervix images with the required details\nfor the VIA test well; the deep-learning-based classification model provides\npromising results to assist nurses in VIA screening; and provides a direction\nfor large-scale data collection and validation in resource-constrained\nsettings.\n","authors":["Kanchan Poudel","Lisasha Poudel","Prabin Raj Shakya","Atit Poudel","Archana Shrestha","Bishesh Khanal"],"pdf_url":"https://arxiv.org/pdf/2403.11936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11935v1","updated":"2024-03-18T16:33:43Z","published":"2024-03-18T16:33:43Z","title":"HyperColorization: Propagating spatially sparse noisy spectral clues for\n reconstructing hyperspectral images","summary":" Hyperspectral cameras face challenging spatial-spectral resolution trade-offs\nand are more affected by shot noise than RGB photos taken over the same total\nexposure time. Here, we present a colorization algorithm to reconstruct\nhyperspectral images from a grayscale guide image and spatially sparse spectral\nclues. We demonstrate that our algorithm generalizes to varying spectral\ndimensions for hyperspectral images, and show that colorizing in a low-rank\nspace reduces compute time and the impact of shot noise. To enhance robustness,\nwe incorporate guided sampling, edge-aware filtering, and dimensionality\nestimation techniques. Our method surpasses previous algorithms in various\nperformance metrics, including SSIM, PSNR, GFC, and EMD, which we analyze as\nmetrics for characterizing hyperspectral image quality. Collectively, these\nfindings provide a promising avenue for overcoming the time-space-wavelength\nresolution trade-off by reconstructing a dense hyperspectral image from samples\nobtained by whisk or push broom scanners, as well as hybrid spatial-spectral\ncomputational imaging systems.\n","authors":["M. Kerem Aydin","Qi Guo","Emma Alexander"],"pdf_url":"https://arxiv.org/pdf/2403.11935v1.pdf","comment":"16 Pages, 13 Figures, 3 Tables, for more information:\n https://mehmetkeremaydin.github.io/hypercolorization/"},{"id":"http://arxiv.org/abs/2403.11934v1","updated":"2024-03-18T16:33:29Z","published":"2024-03-18T16:33:29Z","title":"High-energy physics image classification: A Survey of Jet Applications","summary":" In recent times, the fields of high-energy physics (HEP) experimentation and\nphenomenological studies have seen the integration of machine learning (ML) and\nits specialized branch, deep learning (DL). This survey offers a comprehensive\nassessment of these applications within the realm of various DL approaches. 
The\ninitial segment of the paper introduces the fundamentals encompassing diverse\nparticle physics types and establishes criteria for evaluating particle physics\nin tandem with learning models. Following this, a comprehensive taxonomy is\npresented for representing HEP images, encompassing accessible datasets,\nintricate details of preprocessing techniques, and methods of feature\nextraction and selection. Subsequently, the focus shifts to an exploration of\navailable artificial intelligence (AI) models tailored to HEP images, along\nwith a concentrated examination of HEP image classification pertaining to Jet\nparticles. Within this review, a profound investigation is undertaken into\ndistinct ML and DL proposed state-of-the art (SOTA) techniques, underscoring\ntheir implications for HEP inquiries. The discussion delves into specific\napplications in substantial detail, including Jet tagging, Jet tracking,\nparticle classification, and more. The survey culminates with an analysis\nconcerning the present status of HEP grounded in DL methodologies, encompassing\ninherent challenges and prospective avenues for future research endeavors.\n","authors":["Hamza Kheddar","Yassine Himeur","Abbes Amira","Rachik Soualah"],"pdf_url":"https://arxiv.org/pdf/2403.11934v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11929v1","updated":"2024-03-18T16:28:28Z","published":"2024-03-18T16:28:28Z","title":"LayerDiff: Exploring Text-guided Multi-layered Composable Image\n Synthesis via Layer-Collaborative Diffusion Model","summary":" Despite the success of generating high-quality images given any text prompts\nby diffusion-based generative models, prior works directly generate the entire\nimages, but cannot provide object-wise manipulation capability. To support\nwider real applications like professional graphic design and digital artistry,\nimages are frequently created and manipulated in multiple layers to offer\ngreater flexibility and control. Therefore in this paper, we propose a\nlayer-collaborative diffusion model, named LayerDiff, specifically designed for\ntext-guided, multi-layered, composable image synthesis. The composable image\nconsists of a background layer, a set of foreground layers, and associated mask\nlayers for each foreground element. To enable this, LayerDiff introduces a\nlayer-based generation paradigm incorporating multiple layer-collaborative\nattention modules to capture inter-layer patterns. Specifically, an inter-layer\nattention module is designed to encourage information exchange and learning\nbetween layers, while a text-guided intra-layer attention module incorporates\nlayer-specific prompts to direct the specific-content generation for each\nlayer. A layer-specific prompt-enhanced module better captures detailed textual\ncues from the global prompt. Additionally, a self-mask guidance sampling\nstrategy further unleashes the model's ability to generate multi-layered\nimages. We also present a pipeline that integrates existing perceptual and\ngenerative models to produce a large dataset of high-quality, text-prompted,\nmulti-layered images. Extensive experiments demonstrate that our LayerDiff\nmodel can generate high-quality multi-layered images with performance\ncomparable to conventional whole-image generation methods. 
Moreover, LayerDiff\nenables a broader range of controllable generative applications, including\nlayer-specific image editing and style transfer.\n","authors":["Runhui Huang","Kaixin Cai","Jianhua Han","Xiaodan Liang","Renjing Pei","Guansong Lu","Songcen Xu","Wei Zhang","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.11929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11909v1","updated":"2024-03-18T16:11:42Z","published":"2024-03-18T16:11:42Z","title":"RoGUENeRF: A Robust Geometry-Consistent Universal Enhancer for NeRF","summary":" Recent advances in neural rendering have enabled highly photorealistic 3D\nscene reconstruction and novel view synthesis. Despite this progress, current\nstate-of-the-art methods struggle to reconstruct high frequency detail, due to\nfactors such as a low-frequency bias of radiance fields and inaccurate camera\ncalibration. One approach to mitigate this issue is to enhance images\npost-rendering. 2D enhancers can be pre-trained to recover some detail but are\nagnostic to scene geometry and do not easily generalize to new distributions of\nimage degradation. Conversely, existing 3D enhancers are able to transfer\ndetail from nearby training images in a generalizable manner, but suffer from\ninaccurate camera calibration and can propagate errors from the geometry into\nrendered images. We propose a neural rendering enhancer, RoGUENeRF, which\nexploits the best of both paradigms. Our method is pre-trained to learn a\ngeneral enhancer while also leveraging information from nearby training images\nvia robust 3D alignment and geometry-aware fusion. Our approach restores\nhigh-frequency textures while maintaining geometric consistency and is also\nrobust to inaccurate camera calibration. We show that RoGUENeRF substantially\nenhances the rendering quality of a wide range of neural rendering baselines,\ne.g. improving the PSNR of MipNeRF360 by 0.63dB and Nerfacto by 1.34dB on the\nreal world 360v2 dataset.\n","authors":["Sibi Catley-Chandar","Richard Shaw","Gregory Slabaugh","Eduardo Perez-Pellitero"],"pdf_url":"https://arxiv.org/pdf/2403.11909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11905v1","updated":"2024-03-18T16:06:30Z","published":"2024-03-18T16:06:30Z","title":"Tur[k]ingBench: A Challenge Benchmark for Web Agents","summary":" Recent chatbots have demonstrated impressive ability to understand and\ncommunicate in raw-text form. However, there is more to the world than raw\ntext. For example, humans spend long hours of their time on web pages, where\ntext is intertwined with other modalities and tasks are accomplished in the\nform of various complex interactions. Can state-of-the-art multi-modal models\ngeneralize to such complex domains?\n To address this question, we introduce TurkingBench, a benchmark of tasks\nformulated as web pages containing textual instructions with multi-modal\ncontext. Unlike existing work which employs artificially synthesized web pages,\nhere we use natural HTML pages that were originally designed for crowdsourcing\nworkers for various annotation purposes. The HTML instructions of each task are\nalso instantiated with various values (obtained from the crowdsourcing tasks)\nto form new instances of the task. This benchmark contains 32.2K instances\ndistributed across 158 tasks.\n Additionally, to facilitate the evaluation on TurkingBench, we develop an\nevaluation framework that connects the responses of chatbots to modifications\non web pages (modifying a text box, checking a radio, etc.). 
We evaluate the\nperformance of state-of-the-art models, including language-only, vision-only,\nand layout-only models, and their combinations, on this benchmark. Our findings\nreveal that these models perform significantly better than random chance, yet\nconsiderable room exists for improvement. We hope this benchmark will help\nfacilitate the evaluation and development of web-based agents.\n","authors":["Kevin Xu","Yeganeh Kordi","Kate Sanders","Yizhong Wang","Adam Byerly","Jack Zhang","Benjamin Van Durme","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2403.11905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01313v3","updated":"2024-03-18T16:02:10Z","published":"2023-08-02T17:57:25Z","title":"PerceptionCLIP: Visual Classification by Inferring and Conditioning on\n Contexts","summary":" Vision-language models like CLIP are widely used in zero-shot image\nclassification due to their ability to understand various visual concepts and\nnatural language descriptions. However, how to fully leverage CLIP's\nunprecedented human-like understanding capabilities to achieve better\nperformance is still an open question. This paper draws inspiration from the\nhuman visual perception process: when classifying an object, humans first infer\ncontextual attributes (e.g., background and orientation) which help separate\nthe foreground object from the background, and then classify the object based\non this information. Inspired by it, we observe that providing CLIP with\ncontextual attributes improves zero-shot image classification and mitigates\nreliance on spurious features. We also observe that CLIP itself can reasonably\ninfer the attributes from an image. With these observations, we propose a\ntraining-free, two-step zero-shot classification method PerceptionCLIP. Given\nan image, it first infers contextual attributes (e.g., background) and then\nperforms object classification conditioning on them. Our experiments show that\nPerceptionCLIP achieves better generalization, group robustness, and\ninteroperability. Our code is available at\nhttps://github.com/umd-huang-lab/perceptionCLIP\n","authors":["Bang An","Sicheng Zhu","Michael-Andrei Panaitescu-Liess","Chaithanya Kumar Mummadi","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2308.01313v3.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2403.11899v1","updated":"2024-03-18T15:58:03Z","published":"2024-03-18T15:58:03Z","title":"GNeRP: Gaussian-guided Neural Reconstruction of Reflective Objects with\n Noisy Polarization Priors","summary":" Learning surfaces from neural radiance field (NeRF) became a rising topic in\nMulti-View Stereo (MVS). Recent Signed Distance Function (SDF)-based methods\ndemonstrated their ability to reconstruct accurate 3D shapes of Lambertian\nscenes. However, their results on reflective scenes are unsatisfactory due to\nthe entanglement of specular radiance and complicated geometry. To address the\nchallenges, we propose a Gaussian-based representation of normals in SDF\nfields. Supervised by polarization priors, this representation guides the\nlearning of geometry behind the specular reflection and captures more details\nthan existing methods. Moreover, we propose a reweighting strategy in the\noptimization process to alleviate the noise issue of polarization priors. To\nvalidate the effectiveness of our design, we capture polarimetric information,\nand ground truth meshes in additional reflective scenes with various geometry.\nWe also evaluated our framework on the PANDORA dataset. 
Comparisons prove our\nmethod outperforms existing neural 3D reconstruction methods in reflective\nscenes by a large margin.\n","authors":["LI Yang","WU Ruizheng","LI Jiyong","CHEN Ying-cong"],"pdf_url":"https://arxiv.org/pdf/2403.11899v1.pdf","comment":"Accepted to ICLR 2024 Poster. For the Appendix, please see\n http://yukiumi13.github.io/gnerp_page"},{"id":"http://arxiv.org/abs/2403.12747v1","updated":"2024-03-18T15:48:24Z","published":"2024-03-18T15:48:24Z","title":"N-Modal Contrastive Losses with Applications to Social Media Data in\n Trimodal Space","summary":" The social media landscape of conflict dynamics has grown increasingly\nmulti-modal. Recent advancements in model architectures such as CLIP have\nenabled researchers to begin studying the interplay between the modalities of\ntext and images in a shared latent space. However, CLIP models fail to handle\nsituations on social media when modalities present in a post expand above two.\nSocial media dynamics often require understanding the interplay between not\nonly text and images, but video as well. In this paper we explore an extension\nof the contrastive loss function to allow for any number of modalities, and\ndemonstrate its usefulness in trimodal spaces on social media. By extending\nCLIP into three dimensions we can further aide understanding social media\nlandscapes where all three modalities are present (an increasingly common\nsituation). We use a newly collected public data set of Telegram posts\ncontaining all three modalities to train, and then demonstrate the usefulness\nof, a trimodal model in two OSINT scenarios: classifying a social media\nartifact post as either pro-Russian or pro-Ukrainian and identifying which\naccount a given artifact originated from. While trimodal CLIP models have been\nexplored before (though not on social media data), we also display a novel\nquadmodal CLIP model. This model can learn the interplay between text, image,\nvideo, and audio. We demonstrate new state-of-the-art baseline results on\nretrieval for quadmodel models moving forward.\n","authors":["William Theisen","Walter Scheirer"],"pdf_url":"https://arxiv.org/pdf/2403.12747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12115v1","updated":"2024-03-18T15:43:45Z","published":"2024-03-18T15:43:45Z","title":"Deep learning automates Cobb angle measurement compared with\n multi-expert observers","summary":" Scoliosis, a prevalent condition characterized by abnormal spinal curvature\nleading to deformity, requires precise assessment methods for effective\ndiagnosis and management. The Cobb angle is a widely used scoliosis\nquantification method that measures the degree of curvature between the tilted\nvertebrae. Yet, manual measuring of Cobb angles is time-consuming and\nlabor-intensive, fraught with significant interobserver and intraobserver\nvariability. To address these challenges and the lack of interpretability found\nin certain existing automated methods, we have created fully automated software\nthat not only precisely measures the Cobb angle but also provides clear\nvisualizations of these measurements. This software integrates deep neural\nnetwork-based spine region detection and segmentation, spine centerline\nidentification, pinpointing the most significantly tilted vertebrae, and direct\nvisualization of Cobb angles on the original images. 
Upon comparison with the\nassessments of 7 expert readers, our algorithm exhibited a mean deviation in\nCobb angle measurements of 4.17 degrees, notably surpassing the manual\napproach's average intra-reader discrepancy of 5.16 degrees. The algorithm also\nachieved intra-class correlation coefficients (ICC) exceeding 0.96 and Pearson\ncorrelation coefficients above 0.944, reflecting robust agreement with expert\nassessments and superior measurement reliability. Through the comprehensive\nreader study and statistical analysis, we believe this algorithm not only\nensures a higher consensus with expert readers but also enhances\ninterpretability and reproducibility during assessments. It holds significant\npromise for clinical application, potentially aiding physicians in more\naccurate scoliosis assessment and diagnosis, thereby improving patient care.\n","authors":["Keyu Li","Hanxue Gu","Roy Colglazier","Robert Lark","Elizabeth Hubbard","Robert French","Denise Smith","Jikai Zhang","Erin McCrum","Anthony Catanzano","Joseph Cao","Leah Waldman","Maciej A. Mazurowski","Benjamin Alman"],"pdf_url":"https://arxiv.org/pdf/2403.12115v1.pdf","comment":"17 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.11887v1","updated":"2024-03-18T15:40:36Z","published":"2024-03-18T15:40:36Z","title":"SuperLoRA: Parameter-Efficient Unified Adaptation of Multi-Layer\n Attention Modules","summary":" Low-rank adaptation (LoRA) and its variants are widely employed in\nfine-tuning large models, including large language models for natural language\nprocessing and diffusion models for computer vision. This paper proposes a\ngeneralized framework called SuperLoRA that unifies and extends different LoRA\nvariants, which can be realized under different hyper-parameter settings.\nIntroducing grouping, folding, shuffling, projecting, and tensor factoring,\nSuperLoRA offers high flexibility compared with other LoRA variants and\ndemonstrates superior performance for transfer learning tasks especially in the\nextremely few-parameter regimes.\n","authors":["Xiangyu Chen","Jing Liu","Ye Wang","Pu Perry Wang","Matthew Brand","Guanghui Wang","Toshiaki Koike-Akino"],"pdf_url":"https://arxiv.org/pdf/2403.11887v1.pdf","comment":"33 pages, 29 figures"},{"id":"http://arxiv.org/abs/2403.11882v1","updated":"2024-03-18T15:33:06Z","published":"2024-03-18T15:33:06Z","title":"ReGenNet: Towards Human Action-Reaction Synthesis","summary":" Humans constantly interact with their surrounding environments. Current\nhuman-centric generative models mainly focus on synthesizing humans plausibly\ninteracting with static scenes and objects, while the dynamic human\naction-reaction synthesis for ubiquitous causal human-human interactions is\nless explored. Human-human interactions can be regarded as asymmetric with\nactors and reactors in atomic interaction periods. In this paper, we\ncomprehensively analyze the asymmetric, dynamic, synchronous, and detailed\nnature of human-human interactions and propose the first multi-setting human\naction-reaction synthesis benchmark to generate human reactions conditioned on\ngiven human actions. To begin with, we propose to annotate the actor-reactor\norder of the interaction sequences for the NTU120, InterHuman, and Chi3D\ndatasets. 
Based on them, a diffusion-based generative model with a Transformer\ndecoder architecture called ReGenNet together with an explicit distance-based\ninteraction loss is proposed to predict human reactions in an online manner,\nwhere the future states of actors are unavailable to reactors. Quantitative and\nqualitative results show that our method can generate instant and plausible\nhuman reactions compared to the baselines, and can generalize to unseen actor\nmotions and viewpoint changes.\n","authors":["Liang Xu","Yizhou Zhou","Yichao Yan","Xin Jin","Wenhan Zhu","Fengyun Rao","Xiaokang Yang","Wenjun Zeng"],"pdf_url":"https://arxiv.org/pdf/2403.11882v1.pdf","comment":"Accepted by CVPR 2024, Project Page:\n https://liangxuy.github.io/ReGenNet/"},{"id":"http://arxiv.org/abs/2308.06198v3","updated":"2024-03-18T15:31:57Z","published":"2023-08-11T15:43:37Z","title":"DIG In: Evaluating Disparities in Image Generations with Indicators for\n Geographic Diversity","summary":" The unprecedented photorealistic results achieved by recent text-to-image\ngenerative systems and their increasing use as plug-and-play content creation\nsolutions make it crucial to understand their potential biases. In this work,\nwe introduce three indicators to evaluate the realism, diversity and\nprompt-generation consistency of text-to-image generative systems when prompted\nto generate objects from across the world. Our indicators complement\nqualitative analysis of the broader impact of such systems by enabling\nautomatic and efficient benchmarking of geographic disparities, an important\nstep towards building responsible visual content creation systems. We use our\nproposed indicators to analyze potential geographic biases in state-of-the-art\nvisual content creation systems and find that: (1) models have less realism and\ndiversity of generations when prompting for Africa and West Asia than Europe,\n(2) prompting with geographic information comes at a cost to prompt-consistency\nand diversity of generated images, and (3) models exhibit more region-level\ndisparities for some objects than others. Perhaps most interestingly, our\nindicators suggest that progress in image generation quality has come at the\ncost of real-world geographic representation. Our comprehensive evaluation\nconstitutes a crucial step towards ensuring a positive experience of visual\ncontent creation for everyone.\n","authors":["Melissa Hall","Candace Ross","Adina Williams","Nicolas Carion","Michal Drozdzal","Adriana Romero Soriano"],"pdf_url":"https://arxiv.org/pdf/2308.06198v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11878v1","updated":"2024-03-18T15:31:57Z","published":"2024-03-18T15:31:57Z","title":"InTeX: Interactive Text-to-texture Synthesis via Unified Depth-aware\n Inpainting","summary":" Text-to-texture synthesis has become a new frontier in 3D content creation\nthanks to the recent advances in text-to-image models. Existing methods\nprimarily adopt a combination of pretrained depth-aware diffusion and\ninpainting models, yet they exhibit shortcomings such as 3D inconsistency and\nlimited controllability. To address these challenges, we introduce InteX, a\nnovel framework for interactive text-to-texture synthesis. 1) InteX includes a\nuser-friendly interface that facilitates interaction and control throughout the\nsynthesis process, enabling region-specific repainting and precise texture\nediting. 
2) Additionally, we develop a unified depth-aware inpainting model\nthat integrates depth information with inpainting cues, effectively mitigating\n3D inconsistencies and improving generation speed. Through extensive\nexperiments, our framework has proven to be both practical and effective in\ntext-to-texture synthesis, paving the way for high-quality 3D content creation.\n","authors":["Jiaxiang Tang","Ruijie Lu","Xiaokang Chen","Xiang Wen","Gang Zeng","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2403.11878v1.pdf","comment":"Project Page: https://me.kiui.moe/intex/"},{"id":"http://arxiv.org/abs/2403.11876v1","updated":"2024-03-18T15:28:35Z","published":"2024-03-18T15:28:35Z","title":"Deep Bayesian Future Fusion for Self-Supervised, High-Resolution,\n Off-Road Mapping","summary":" The limited sensing resolution of resource-constrained off-road vehicles\nposes significant challenges towards reliable off-road autonomy. To overcome\nthis limitation, we propose a general framework based on fusing the future\ninformation (i.e. future fusion) for self-supervision. Recent approaches\nexploit this future information alongside the hand-crafted heuristics to\ndirectly supervise the targeted downstream tasks (e.g. traversability\nestimation). However, in this paper, we opt for a more general line of\ndevelopment - time-efficient completion of the highest resolution (i.e. 2cm per\npixel) BEV map in a self-supervised manner via future fusion, which can be used\nfor any downstream tasks for better longer range prediction. To this end,\nfirst, we create a high-resolution future-fusion dataset containing pairs of\n(RGB / height) raw sparse and noisy inputs and map-based dense labels. Next, to\naccommodate the noise and sparsity of the sensory information, especially in\nthe distal regions, we design an efficient realization of the Bayes filter onto\nthe vanilla convolutional network via the recurrent mechanism. Equipped with\nthe ideas from SOTA generative models, our Bayesian structure effectively\npredicts high-quality BEV maps in the distal regions. Extensive evaluation on\nboth the quality of completion and downstream task on our future-fusion dataset\ndemonstrates the potential of our approach.\n","authors":["Shubhra Aich","Wenshan Wang","Parv Maheshwari","Matthew Sivaprakasam","Samuel Triest","Cherie Ho","Jason M. Gregory","John G. Rogers III","Sebastian Scherer"],"pdf_url":"https://arxiv.org/pdf/2403.11876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11875v1","updated":"2024-03-18T15:27:58Z","published":"2024-03-18T15:27:58Z","title":"Towards Real-Time Fast Unmanned Aerial Vehicle Detection Using Dynamic\n Vision Sensors","summary":" Unmanned Aerial Vehicles (UAVs) are gaining popularity in civil and military\napplications. However, uncontrolled access to restricted areas threatens\nprivacy and security. Thus, prevention and detection of UAVs are pivotal to\nguarantee confidentiality and safety. Although active scanning, mainly based on\nradars, is one of the most accurate technologies, it can be expensive and less\nversatile than passive inspections, e.g., object recognition. Dynamic vision\nsensors (DVS) are bio-inspired event-based vision models that leverage\ntimestamped pixel-level brightness changes in fast-moving scenes that adapt\nwell to low-latency object detection. This paper presents F-UAV-D (Fast\nUnmanned Aerial Vehicle Detector), an embedded system that enables fast-moving\ndrone detection. 
In particular, we propose a setup to exploit DVS as an\nalternative to RGB cameras in a real-time and low-power configuration. Our\napproach leverages the high-dynamic range (HDR) and background suppression of\nDVS and, when trained with various fast-moving drones, outperforms RGB input in\nsuboptimal ambient conditions such as low illumination and fast-moving scenes.\nOur results show that F-UAV-D can (i) detect drones by using less than <15 W on\naverage and (ii) perform real-time inference (i.e., <50 ms) by leveraging the\nCPU and GPU nodes of our edge computer.\n","authors":["Jakub Mandula","Jonas Kühne","Luca Pascarella","Michele Magno"],"pdf_url":"https://arxiv.org/pdf/2403.11875v1.pdf","comment":"Accepted at 2024 IEEE International Instrumentation and Measurement\n Technology Conference (I2MTC)"},{"id":"http://arxiv.org/abs/2403.11870v1","updated":"2024-03-18T15:23:48Z","published":"2024-03-18T15:23:48Z","title":"IDF-CR: Iterative Diffusion Process for Divide-and-Conquer Cloud Removal\n in Remote-sensing Images","summary":" Deep learning technologies have demonstrated their effectiveness in removing\ncloud cover from optical remote-sensing images. Convolutional Neural Networks\n(CNNs) exert dominance in the cloud removal tasks. However, constrained by the\ninherent limitations of convolutional operations, CNNs can address only a\nmodest fraction of cloud occlusion. In recent years, diffusion models have\nachieved state-of-the-art (SOTA) proficiency in image generation and\nreconstruction due to their formidable generative capabilities. Inspired by the\nrapid development of diffusion models, we first present an iterative diffusion\nprocess for cloud removal (IDF-CR), which exhibits a strong generative\ncapabilities to achieve component divide-and-conquer cloud removal. IDF-CR\nconsists of a pixel space cloud removal module (Pixel-CR) and a latent space\niterative noise diffusion network (IND). Specifically, IDF-CR is divided into\ntwo-stage models that address pixel space and latent space. The two-stage model\nfacilitates a strategic transition from preliminary cloud reduction to\nmeticulous detail refinement. In the pixel space stage, Pixel-CR initiates the\nprocessing of cloudy images, yielding a suboptimal cloud removal prior to\nproviding the diffusion model with prior cloud removal knowledge. In the latent\nspace stage, the diffusion model transforms low-quality cloud removal into\nhigh-quality clean output. We refine the Stable Diffusion by implementing\nControlNet. In addition, an unsupervised iterative noise refinement (INR)\nmodule is introduced for diffusion model to optimize the distribution of the\npredicted noise, thereby enhancing advanced detail recovery. Our model performs\nbest with other SOTA methods, including image reconstruction and optical\nremote-sensing cloud removal on the optical remote-sensing datasets.\n","authors":["Meilin Wang","Yexing Song","Pengxu Wei","Xiaoyu Xian","Yukai Shi","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2403.11870v1.pdf","comment":"Accepted by IEEE TGRS, we first present an iterative diffusion\n process for cloud removal, the code is available at:\n https://github.com/SongYxing/IDF-CR"},{"id":"http://arxiv.org/abs/2403.11868v1","updated":"2024-03-18T15:22:09Z","published":"2024-03-18T15:22:09Z","title":"View-Consistent 3D Editing with Gaussian Splatting","summary":" The advent of 3D Gaussian Splatting (3DGS) has revolutionized 3D editing,\noffering efficient, high-fidelity rendering and enabling precise local\nmanipulations. 
Currently, diffusion-based 2D editing models are harnessed to\nmodify multi-view rendered images, which then guide the editing of 3DGS models.\nHowever, this approach faces a critical issue of multi-view inconsistency,\nwhere the guidance images exhibit significant discrepancies across views,\nleading to mode collapse and visual artifacts of 3DGS. To this end, we\nintroduce View-consistent Editing (VcEdit), a novel framework that seamlessly\nincorporates 3DGS into image editing processes, ensuring multi-view consistency\nin edited guidance images and effectively mitigating mode collapse issues.\nVcEdit employs two innovative consistency modules: the Cross-attention\nConsistency Module and the Editing Consistency Module, both designed to reduce\ninconsistencies in edited images. By incorporating these consistency modules\ninto an iterative pattern, VcEdit proficiently resolves the issue of multi-view\ninconsistency, facilitating high-quality 3DGS editing across a diverse range of\nscenes.\n","authors":["Yuxuan Wang","Xuanyu Yi","Zike Wu","Na Zhao","Long Chen","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11865v1","updated":"2024-03-18T15:18:55Z","published":"2024-03-18T15:18:55Z","title":"Exploring Multi-modal Neural Scene Representations With Applications on\n Thermal Imaging","summary":" Neural Radiance Fields (NeRFs) quickly evolved as the new de-facto standard\nfor the task of novel view synthesis when trained on a set of RGB images. In\nthis paper, we conduct a comprehensive evaluation of neural scene\nrepresentations, such as NeRFs, in the context of multi-modal learning.\nSpecifically, we present four different strategies of how to incorporate a\nsecond modality, other than RGB, into NeRFs: (1) training from scratch\nindependently on both modalities; (2) pre-training on RGB and fine-tuning on\nthe second modality; (3) adding a second branch; and (4) adding a separate\ncomponent to predict (color) values of the additional modality. We chose\nthermal imaging as second modality since it strongly differs from RGB in terms\nof radiosity, making it challenging to integrate into neural scene\nrepresentations. For the evaluation of the proposed strategies, we captured a\nnew publicly available multi-view dataset, ThermalMix, consisting of six common\nobjects and about 360 RGB and thermal images in total. We employ cross-modality\ncalibration prior to data capturing, leading to high-quality alignments between\nRGB and thermal images. Our findings reveal that adding a second branch to NeRF\nperforms best for novel view synthesis on thermal images while also yielding\ncompelling results on RGB. Finally, we also show that our analysis generalizes\nto other modalities, including near-infrared images and depth maps. Project\npage: https://mert-o.github.io/ThermalNeRF/.\n","authors":["Mert Özer","Maximilian Weiherer","Martin Hundhausen","Bernhard Egger"],"pdf_url":"https://arxiv.org/pdf/2403.11865v1.pdf","comment":"24 pages, 14 figures"},{"id":"http://arxiv.org/abs/2304.10535v2","updated":"2024-03-18T15:09:33Z","published":"2023-04-20T17:59:34Z","title":"Farm3D: Learning Articulated 3D Animals by Distilling 2D Diffusion","summary":" We present Farm3D, a method for learning category-specific 3D reconstructors\nfor articulated objects, relying solely on \"free\" virtual supervision from a\npre-trained 2D diffusion-based image generator. 
Recent approaches can learn a\nmonocular network that predicts the 3D shape, albedo, illumination, and\nviewpoint of any object occurrence, given a collection of single-view images of\nan object category. However, these approaches heavily rely on manually curated\nclean training data, which are expensive to obtain. We propose a framework that\nuses an image generator, such as Stable Diffusion, to generate synthetic\ntraining data that are sufficiently clean and do not require further manual\ncuration, enabling the learning of such a reconstruction network from scratch.\nAdditionally, we incorporate the diffusion model as a score to enhance the\nlearning process. The idea involves randomizing certain aspects of the\nreconstruction, such as viewpoint and illumination, generating virtual views of\nthe reconstructed 3D object, and allowing the 2D network to assess the quality\nof the resulting image, thus providing feedback to the reconstructor. Unlike\nwork based on distillation, which produces a single 3D asset for each textual\nprompt, our approach yields a monocular reconstruction network capable of\noutputting a controllable 3D asset from any given image, whether real or\ngenerated, in a single forward pass in a matter of seconds. Our network can be\nused for analysis, including monocular reconstruction, or for synthesis,\ngenerating articulated assets for real-time applications such as video games.\n","authors":["Tomas Jakab","Ruining Li","Shangzhe Wu","Christian Rupprecht","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2304.10535v2.pdf","comment":"In 3DV 2024, Project page: http://farm3d.github.io"},{"id":"http://arxiv.org/abs/2403.11854v1","updated":"2024-03-18T15:03:56Z","published":"2024-03-18T15:03:56Z","title":"denoiSplit: a method for joint image splitting and unsupervised\n denoising","summary":" In this work we present denoiSplit, a method to tackle a new analysis task,\ni.e. the challenge of joint semantic image splitting and unsupervised\ndenoising. This dual approach has important applications in fluorescence\nmicroscopy, where semantic image splitting has important applications but noise\ndoes generally hinder the downstream analysis of image content. Image splitting\ninvolves dissecting an image into its distinguishable semantic structures. We\nshow that the current state-of-the-art method for this task struggles in the\npresence of image noise, inadvertently also distributing the noise across the\npredicted outputs. The method we present here can deal with image noise by\nintegrating an unsupervised denoising sub-task. This integration results in\nimproved semantic image unmixing, even in the presence of notable and realistic\nlevels of imaging noise. A key innovation in denoiSplit is the use of\nspecifically formulated noise models and the suitable adjustment of\nKL-divergence loss for the high-dimensional hierarchical latent space we are\ntraining. We showcase the performance of denoiSplit across 4 tasks on\nreal-world microscopy images. 
Additionally, we perform qualitative and\nquantitative evaluations and compare results to existing benchmarks,\ndemonstrating the effectiveness of using denoiSplit: a single Variational\nSplitting Encoder-Decoder (VSE) Network using two suitable noise models to\njointly perform semantic splitting and denoising.\n","authors":["Ashesh Ashesh","Florian Jug"],"pdf_url":"https://arxiv.org/pdf/2403.11854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11848v1","updated":"2024-03-18T15:00:38Z","published":"2024-03-18T15:00:38Z","title":"GraphBEV: Towards Robust BEV Feature Alignment for Multi-Modal 3D Object\n Detection","summary":" Integrating LiDAR and camera information into Bird's-Eye-View (BEV)\nrepresentation has emerged as a crucial aspect of 3D object detection in\nautonomous driving. However, existing methods are susceptible to the inaccurate\ncalibration relationship between LiDAR and the camera sensor. Such inaccuracies\nresult in errors in depth estimation for the camera branch, ultimately causing\nmisalignment between LiDAR and camera BEV features. In this work, we propose a\nrobust fusion framework called Graph BEV. Addressing errors caused by\ninaccurate point cloud projection, we introduce a Local Align module that\nemploys neighbor-aware depth features via Graph matching. Additionally, we\npropose a Global Align module to rectify the misalignment between LiDAR and\ncamera BEV features. Our Graph BEV framework achieves state-of-the-art\nperformance, with an mAP of 70.1\\%, surpassing BEV Fusion by 1.6\\% on the\nnuscenes validation set. Importantly, our Graph BEV outperforms BEV Fusion by\n8.3\\% under conditions with misalignment noise.\n","authors":["Ziying Song","Lei Yang","Shaoqing Xu","Lin Liu","Dongyang Xu","Caiyan Jia","Feiyang Jia","Li Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11835v1","updated":"2024-03-18T14:47:03Z","published":"2024-03-18T14:47:03Z","title":"Agent3D-Zero: An Agent for Zero-shot 3D Understanding","summary":" The ability to understand and reason the 3D real world is a crucial milestone\ntowards artificial general intelligence. The current common practice is to\nfinetune Large Language Models (LLMs) with 3D data and texts to enable 3D\nunderstanding. Despite their effectiveness, these approaches are inherently\nlimited by the scale and diversity of the available 3D data. Alternatively, in\nthis work, we introduce Agent3D-Zero, an innovative 3D-aware agent framework\naddressing the 3D scene understanding in a zero-shot manner. The essence of our\napproach centers on reconceptualizing the challenge of 3D scene perception as a\nprocess of understanding and synthesizing insights from multiple images,\ninspired by how our human beings attempt to understand 3D scenes. By\nconsolidating this idea, we propose a novel way to make use of a Large Visual\nLanguage Model (VLM) via actively selecting and analyzing a series of\nviewpoints for 3D understanding. Specifically, given an input 3D scene,\nAgent3D-Zero first processes a bird's-eye view image with custom-designed\nvisual prompts, then iteratively chooses the next viewpoints to observe and\nsummarize the underlying knowledge. A distinctive advantage of Agent3D-Zero is\nthe introduction of novel visual prompts, which significantly unleash the VLMs'\nability to identify the most informative viewpoints and thus facilitate\nobserving 3D scenes. 
Extensive experiments demonstrate the effectiveness of the\nproposed framework in understanding diverse and previously unseen 3D\nenvironments.\n","authors":["Sha Zhang","Di Huang","Jiajun Deng","Shixiang Tang","Wanli Ouyang","Tong He","Yanyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11835v1.pdf","comment":"project page: https://zhangsha1024.github.io/Agent3D-Zero/"},{"id":"http://arxiv.org/abs/2203.01587v3","updated":"2024-03-18T14:32:54Z","published":"2022-03-03T09:30:55Z","title":"Multi-Tailed Vision Transformer for Efficient Inference","summary":" Recently, Vision Transformer (ViT) has achieved promising performance in\nimage recognition and gradually serves as a powerful backbone in various vision\ntasks. To satisfy the sequential input of Transformer, the tail of ViT first\nsplits each image into a sequence of visual tokens with a fixed length. Then\nthe following self-attention layers constructs the global relationship between\ntokens to produce useful representation for the downstream tasks. Empirically,\nrepresenting the image with more tokens leads to better performance, yet the\nquadratic computational complexity of self-attention layer to the number of\ntokens could seriously influence the efficiency of ViT's inference. For\ncomputational reduction, a few pruning methods progressively prune\nuninformative tokens in the Transformer encoder, while leaving the number of\ntokens before the Transformer untouched. In fact, fewer tokens as the input for\nthe Transformer encoder can directly reduce the following computational cost.\nIn this spirit, we propose a Multi-Tailed Vision Transformer (MT-ViT) in the\npaper. MT-ViT adopts multiple tails to produce visual sequences of different\nlengths for the following Transformer encoder. A tail predictor is introduced\nto decide which tail is the most efficient for the image to produce accurate\nprediction. Both modules are optimized in an end-to-end fashion, with the\nGumbel-Softmax trick. Experiments on ImageNet-1K demonstrate that MT-ViT can\nachieve a significant reduction on FLOPs with no degradation of the accuracy\nand outperform other compared methods in both accuracy and FLOPs.\n","authors":["Yunke Wang","Bo Du","Wenyuan Wang","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2203.01587v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00464v2","updated":"2024-03-18T14:24:34Z","published":"2023-09-01T14:02:44Z","title":"A Theoretical and Practical Framework for Evaluating Uncertainty\n Calibration in Object Detection","summary":" The proliferation of Deep Neural Networks has resulted in machine learning\nsystems becoming increasingly more present in various real-world applications.\nConsequently, there is a growing demand for highly reliable models in many\ndomains, making the problem of uncertainty calibration pivotal when considering\nthe future of deep learning. This is especially true when considering object\ndetection systems, that are commonly present in safety-critical applications\nsuch as autonomous driving, robotics and medical diagnosis. For this reason,\nthis work presents a novel theoretical and practical framework to evaluate\nobject detection systems in the context of uncertainty calibration. This\nencompasses a new comprehensive formulation of this concept through distinct\nformal definitions, and also three novel evaluation metrics derived from such\ntheoretical foundation. 
The robustness of the proposed uncertainty calibration\nmetrics is shown through a series of representative experiments.\n","authors":["Pedro Conde","Rui L. Lopes","Cristiano Premebida"],"pdf_url":"https://arxiv.org/pdf/2309.00464v2.pdf","comment":"Pre-print"},{"id":"http://arxiv.org/abs/2403.11821v1","updated":"2024-03-18T14:24:20Z","published":"2024-03-18T14:24:20Z","title":"Evaluating Text to Image Synthesis: Survey and Taxonomy of Image Quality\n Metrics","summary":" Recent advances in text-to-image synthesis have been enabled by exploiting a\ncombination of language and vision through foundation models. These models are\npre-trained on tremendous amounts of text-image pairs sourced from the World\nWide Web or other large-scale databases. As the demand for high-quality image\ngeneration shifts towards ensuring content alignment between text and image,\nnovel evaluation metrics have been developed with the aim of mimicking human\njudgments. Thus, researchers have started to collect datasets with increasingly\ncomplex annotations to study the compositionality of vision-language models and\ntheir incorporation as a quality measure of compositional alignment between\ntext and image contents. In this work, we provide a comprehensive overview of\nexisting text-to-image evaluation metrics and propose a new taxonomy for\ncategorizing these metrics. We also review frequently adopted text-image\nbenchmark datasets before discussing techniques to optimize text-to-image\nsynthesis models towards quality and human preferences. Ultimately, we derive\nguidelines for improving text-to-image evaluation and discuss the open\nchallenges and current limitations.\n","authors":["Sebastian Hartwig","Dominik Engel","Leon Sick","Hannah Kniesel","Tristan Payer"," Poonam","Timo Ropinski"],"pdf_url":"https://arxiv.org/pdf/2403.11821v1.pdf","comment":"preprint, 18 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2403.11818v1","updated":"2024-03-18T14:20:17Z","published":"2024-03-18T14:20:17Z","title":"TCNet: Continuous Sign Language Recognition from Trajectories and\n Correlated Regions","summary":" A key challenge in continuous sign language recognition (CSLR) is to\nefficiently capture long-range spatial interactions over time from the video\ninput. To address this challenge, we propose TCNet, a hybrid network that\neffectively models spatio-temporal information from Trajectories and Correlated\nregions. TCNet's trajectory module transforms frames into aligned trajectories\ncomposed of continuous visual tokens. In addition, for a query token,\nself-attention is learned along the trajectory. As such, our network can also\nfocus on fine-grained spatio-temporal patterns, such as finger movements, of a\nspecific region in motion. TCNet's correlation module uses a novel dynamic\nattention mechanism that filters out irrelevant frame regions. Additionally, it\nassigns dynamic key-value tokens from correlated regions to each query. Both\ninnovations significantly reduce the computation cost and memory. We perform\nexperiments on four large-scale datasets: PHOENIX14, PHOENIX14-T, CSL, and\nCSL-Daily, respectively. Our results demonstrate that TCNet consistently\nachieves state-of-the-art performance. 
For example, we improve over the\nprevious state-of-the-art by 1.5% and 1.0% word error rate on PHOENIX14 and\nPHOENIX14-T, respectively.\n","authors":["Hui Lu","Albert Ali Salah","Ronald Poppe"],"pdf_url":"https://arxiv.org/pdf/2403.11818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11817v1","updated":"2024-03-18T14:18:08Z","published":"2024-03-18T14:18:08Z","title":"HVDistill: Transferring Knowledge from Images to Point Clouds via\n Unsupervised Hybrid-View Distillation","summary":" We present a hybrid-view-based knowledge distillation framework, termed\nHVDistill, to guide the feature learning of a point cloud neural network with a\npre-trained image network in an unsupervised manner. By exploiting the\ngeometric relationship between RGB cameras and LiDAR sensors, the\ncorrespondence between the two modalities based on both image-plane view and\nbird-eye view can be established, which facilitates representation learning.\nSpecifically, the image-plane correspondences can be simply obtained by\nprojecting the point clouds, while the bird-eye-view correspondences can be\nachieved by lifting pixels to the 3D space with the predicted depths under the\nsupervision of projected point clouds. The image teacher networks provide rich\nsemantics from the image-plane view and meanwhile acquire geometric information\nfrom the bird-eye view. Indeed, image features from the two views naturally\ncomplement each other and together can ameliorate the learned feature\nrepresentation of the point cloud student networks. Moreover, with a\nself-supervised pre-trained 2D network, HVDistill requires neither 2D nor 3D\nannotations. We pre-train our model on nuScenes dataset and transfer it to\nseveral downstream tasks on nuScenes, SemanticKITTI, and KITTI datasets for\nevaluation. Extensive experimental results show that our method achieves\nconsistent improvements over the baseline trained from scratch and\nsignificantly outperforms the existing schemes. Codes are available at\ngit@github.com:zhangsha1024/HVDistill.git.\n","authors":["Sha Zhang","Jiajun Deng","Lei Bai","Houqiang Li","Wanli Ouyang","Yanyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02544v3","updated":"2024-03-18T14:16:29Z","published":"2024-02-04T15:46:43Z","title":"LHRS-Bot: Empowering Remote Sensing with VGI-Enhanced Large Multimodal\n Language Model","summary":" The revolutionary capabilities of large language models (LLMs) have paved the\nway for multimodal large language models (MLLMs) and fostered diverse\napplications across various specialized domains. In the remote sensing (RS)\nfield, however, the diverse geographical landscapes and varied objects in RS\nimagery are not adequately considered in recent MLLM endeavors. To bridge this\ngap, we construct a large-scale RS image-text dataset, LHRS-Align, and an\ninformative RS-specific instruction dataset, LHRS-Instruct, leveraging the\nextensive volunteered geographic information (VGI) and globally available RS\nimages. Building on this foundation, we introduce LHRS-Bot, an MLLM tailored\nfor RS image understanding through a novel multi-level vision-language\nalignment strategy and a curriculum learning method. Additionally, we introduce\nLHRS-Bench, a benchmark for thoroughly evaluating MLLMs' abilities in RS image\nunderstanding. 
Comprehensive experiments demonstrate that LHRS-Bot exhibits a\nprofound understanding of RS images and the ability to perform nuanced\nreasoning within the RS domain.\n","authors":["Dilxat Muhtar","Zhenshi Li","Feng Gu","Xueliang Zhang","Pengfeng Xiao"],"pdf_url":"https://arxiv.org/pdf/2402.02544v3.pdf","comment":"36 pages, 10 figures. Github https://github.com/NJU-LHRS/LHRS-Bot"},{"id":"http://arxiv.org/abs/2403.11812v1","updated":"2024-03-18T14:15:39Z","published":"2024-03-18T14:15:39Z","title":"Aerial Lifting: Neural Urban Semantic and Building Instance Lifting from\n Aerial Imagery","summary":" We present a neural radiance field method for urban-scale semantic and\nbuilding-level instance segmentation from aerial images by lifting noisy 2D\nlabels to 3D. This is a challenging problem due to two primary reasons.\nFirstly, objects in urban aerial images exhibit substantial variations in size,\nincluding buildings, cars, and roads, which pose a significant challenge for\naccurate 2D segmentation. Secondly, the 2D labels generated by existing\nsegmentation methods suffer from the multi-view inconsistency problem,\nespecially in the case of aerial images, where each image captures only a small\nportion of the entire scene. To overcome these limitations, we first introduce\na scale-adaptive semantic label fusion strategy that enhances the segmentation\nof objects of varying sizes by combining labels predicted from different\naltitudes, harnessing the novel-view synthesis capabilities of NeRF. We then\nintroduce a novel cross-view instance label grouping strategy based on the 3D\nscene representation to mitigate the multi-view inconsistency problem in the 2D\ninstance labels. Furthermore, we exploit multi-view reconstructed depth priors\nto improve the geometric quality of the reconstructed radiance field, resulting\nin enhanced segmentation results. Experiments on multiple real-world\nurban-scale datasets demonstrate that our approach outperforms existing\nmethods, highlighting its effectiveness.\n","authors":["Yuqi Zhang","Guanying Chen","Jiaxing Chen","Shuguang Cui"],"pdf_url":"https://arxiv.org/pdf/2403.11812v1.pdf","comment":"CVPR 2024: https://zyqz97.github.io/Aerial_Lifting/"},{"id":"http://arxiv.org/abs/2403.11808v1","updated":"2024-03-18T14:05:52Z","published":"2024-03-18T14:05:52Z","title":"Dynamic Tuning Towards Parameter and Inference Efficiency for ViT\n Adaptation","summary":" Existing parameter-efficient fine-tuning (PEFT) methods have achieved\nsignificant success on vision transformers (ViTs) adaptation by improving\nparameter efficiency. However, the exploration of enhancing inference\nefficiency during adaptation remains underexplored. This limits the broader\napplication of pre-trained ViT models, especially when the model is\ncomputationally extensive. In this paper, we propose Dynamic Tuning (DyT), a\nnovel approach to improve both parameter and inference efficiency for ViT\nadaptation. Specifically, besides using the lightweight adapter modules, we\npropose a token dispatcher to distinguish informative tokens from less\nimportant ones, allowing the latter to dynamically skip the original block,\nthereby reducing the redundant computation during inference. Additionally, we\nexplore multiple design variants to find the best practice of DyT. Finally,\ninspired by the mixture-of-experts (MoE) mechanism, we introduce an enhanced\nadapter to further boost the adaptation performance. 
We validate DyT across\nvarious tasks, including image/video recognition and semantic segmentation. For\ninstance, DyT achieves comparable or even superior performance compared to\nexisting PEFT methods while evoking only 71%-85% of their FLOPs on the VTAB-1K\nbenchmark.\n","authors":["Wangbo Zhao","Jiasheng Tang","Yizeng Han","Yibing Song","Kai Wang","Gao Huang","Fan Wang","Yang You"],"pdf_url":"https://arxiv.org/pdf/2403.11808v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11803v1","updated":"2024-03-18T14:02:53Z","published":"2024-03-18T14:02:53Z","title":"Federated Modality-specific Encoders and Multimodal Anchors for\n Personalized Brain Tumor Segmentation","summary":" Most existing federated learning (FL) methods for medical image analysis only\nconsidered intramodal heterogeneity, limiting their applicability to multimodal\nimaging applications. In practice, it is not uncommon that some FL participants\nonly possess a subset of the complete imaging modalities, posing inter-modal\nheterogeneity as a challenge to effectively training a global model on all\nparticipants' data. In addition, each participant would expect to obtain a\npersonalized model tailored for its local data characteristics from the FL in\nsuch a scenario. In this work, we propose a new FL framework with federated\nmodality-specific encoders and multimodal anchors (FedMEMA) to simultaneously\naddress the two concurrent issues. Above all, FedMEMA employs an exclusive\nencoder for each modality to account for the inter-modal heterogeneity in the\nfirst place. In the meantime, while the encoders are shared by the\nparticipants, the decoders are personalized to meet individual needs.\nSpecifically, a server with full-modal data employs a fusion decoder to\naggregate and fuse representations from all modality-specific encoders, thus\nbridging the modalities to optimize the encoders via backpropagation reversely.\nMeanwhile, multiple anchors are extracted from the fused multimodal\nrepresentations and distributed to the clients in addition to the encoder\nparameters. On the other end, the clients with incomplete modalities calibrate\ntheir missing-modal representations toward the global full-modal anchors via\nscaled dot-product cross-attention, making up the information loss due to\nabsent modalities while adapting the representations of present ones. FedMEMA\nis validated on the BraTS 2020 benchmark for multimodal brain tumor\nsegmentation. Results show that it outperforms various up-to-date methods for\nmultimodal and personalized FL and that its novel designs are effective. Our\ncode is available.\n","authors":["Qian Dai","Dong Wei","Hong Liu","Jinghan Sun","Liansheng Wang","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.11803v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2403.11796v1","updated":"2024-03-18T13:53:48Z","published":"2024-03-18T13:53:48Z","title":"OpenOcc: Open Vocabulary 3D Scene Reconstruction via Occupancy\n Representation","summary":" 3D reconstruction has been widely used in autonomous navigation fields of\nmobile robotics. However, the former research can only provide the basic\ngeometry structure without the capability of open-world scene understanding,\nlimiting advanced tasks like human interaction and visual navigation. Moreover,\ntraditional 3D scene understanding approaches rely on expensive labeled 3D\ndatasets to train a model for a single task with supervision. Thus, geometric\nreconstruction with zero-shot scene understanding i.e. 
Open vocabulary 3D\nUnderstanding and Reconstruction, is crucial for the future development of\nmobile robots. In this paper, we propose OpenOcc, a novel framework unifying\nthe 3D scene reconstruction and open vocabulary understanding with neural\nradiance fields. We model the geometric structure of the scene with occupancy\nrepresentation and distill the pre-trained open vocabulary model into a 3D\nlanguage field via volume rendering for zero-shot inference. Furthermore, a\nnovel semantic-aware confidence propagation (SCP) method has been proposed to\nrelieve the issue of language field representation degeneracy caused by\ninconsistent measurements in distilled features. Experimental results show that\nour approach achieves competitive performance in 3D scene understanding tasks,\nespecially for small and long-tail objects.\n","authors":["Haochen Jiang","Yueming Xu","Yihan Zeng","Hang Xu","Wei Zhang","Jianfeng Feng","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11792v1","updated":"2024-03-18T13:50:35Z","published":"2024-03-18T13:50:35Z","title":"SETA: Semantic-Aware Token Augmentation for Domain Generalization","summary":" Domain generalization (DG) aims to enhance the model robustness against\ndomain shifts without accessing target domains. A prevalent category of methods\nfor DG is data augmentation, which focuses on generating virtual samples to\nsimulate domain shifts. However, existing augmentation techniques in DG are\nmainly tailored for convolutional neural networks (CNNs), with limited\nexploration in token-based architectures, i.e., vision transformer (ViT) and\nmulti-layer perceptrons (MLP) models. In this paper, we study the impact of\nprior CNN-based augmentation methods on token-based models, revealing their\nperformance is suboptimal due to the lack of incentivizing the model to learn\nholistic shape information. To tackle the issue, we propose the SEmantic-aware\nToken Augmentation (SETA) method. SETA transforms token features by perturbing\nlocal edge cues while preserving global shape features, thereby enhancing the\nmodel learning of shape information. To further enhance the generalization\nability of the model, we introduce two stylized variants of our method combined\nwith two state-of-the-art style augmentation methods in DG. We provide a\ntheoretical insight into our method, demonstrating its effectiveness in\nreducing the generalization risk bound. Comprehensive experiments on five\nbenchmarks prove that our method achieves SOTA performances across various ViT\nand MLP architectures. Our code is available at\nhttps://github.com/lingeringlight/SETA.\n","authors":["Jintao Guo","Lei Qi","Yinghuan Shi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2403.11792v1.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.11791v1","updated":"2024-03-18T13:49:30Z","published":"2024-03-18T13:49:30Z","title":"PAON: A New Neuron Model using Padé Approximants","summary":" Convolutional neural networks (CNN) are built upon the classical\nMcCulloch-Pitts neuron model, which is essentially a linear model, where the\nnonlinearity is provided by a separate activation function. Several researchers\nhave proposed enhanced neuron models, including quadratic neurons, generalized\noperational neurons, generative neurons, and super neurons, with stronger\nnonlinearity than that provided by the pointwise activation function. 
There has\nalso been a proposal to use Pade approximation as a generalized activation\nfunction. In this paper, we introduce a brand new neuron model called Pade\nneurons (Paons), inspired by the Pade approximants, which is the best\nmathematical approximation of a transcendental function as a ratio of\npolynomials with different orders. We show that Paons are a super set of all\nother proposed neuron models. Hence, the basic neuron in any known CNN model\ncan be replaced by Paons. In this paper, we extend the well-known ResNet to\nPadeNet (built by Paons) to demonstrate the concept. Our experiments on the\nsingle-image super-resolution task show that PadeNets can obtain better results\nthan competing architectures.\n","authors":["Onur Keleş","A. Murat Tekalp"],"pdf_url":"https://arxiv.org/pdf/2403.11791v1.pdf","comment":"Submitted to IEEE ICIP 2024"},{"id":"http://arxiv.org/abs/2403.11790v1","updated":"2024-03-18T13:47:18Z","published":"2024-03-18T13:47:18Z","title":"Deep Medial Voxels: Learned Medial Axis Approximations for Anatomical\n Shape Modeling","summary":" Shape reconstruction from imaging volumes is a recurring need in medical\nimage analysis. Common workflows start with a segmentation step, followed by\ncareful post-processing and, finally, ad hoc meshing algorithms. As this\nsequence can be time-consuming, neural networks are trained to reconstruct\nshapes through template deformation. These networks deliver state-of-the-art\nresults without manual intervention, but, so far, they have primarily been\nevaluated on anatomical shapes with little topological variety between\nindividuals. In contrast, other works favor learning implicit shape models,\nwhich have multiple benefits for meshing and visualization. Our work follows\nthis direction by introducing deep medial voxels, a semi-implicit\nrepresentation that faithfully approximates the topological skeleton from\nimaging volumes and eventually leads to shape reconstruction via convolution\nsurfaces. Our reconstruction technique shows potential for both visualization\nand computer simulations.\n","authors":["Antonio Pepe","Richard Schussnig","Jianning Li","Christina Gsaxner","Dieter Schmalstieg","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2403.11790v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2403.11789v1","updated":"2024-03-18T13:46:52Z","published":"2024-03-18T13:46:52Z","title":"EMIE-MAP: Large-Scale Road Surface Reconstruction Based on Explicit Mesh\n and Implicit Encoding","summary":" Road surface reconstruction plays a vital role in autonomous driving systems,\nenabling road lane perception and high-precision mapping. Recently, neural\nimplicit encoding has achieved remarkable results in scene representation,\nparticularly in the realistic rendering of scene textures. However, it faces\nchallenges in directly representing geometric information for large-scale\nscenes. To address this, we propose EMIE-MAP, a novel method for large-scale\nroad surface reconstruction based on explicit mesh and implicit encoding. The\nroad geometry is represented using explicit mesh, where each vertex stores\nimplicit encoding representing the color and semantic information. To overcome\nthe difficulty in optimizing road elevation, we introduce a trajectory-based\nelevation initialization and an elevation residual learning method based on\nMulti-Layer Perceptron (MLP). 
Additionally, by employing implicit encoding and\nmulti-camera color MLPs decoding, we achieve separate modeling of scene\nphysical properties and camera characteristics, allowing surround-view\nreconstruction compatible with different camera models. Our method achieves\nremarkable road surface reconstruction performance in a variety of real-world\nchallenging scenarios.\n","authors":["Wenhua Wu","Qi Wang","Guangming Wang","Junping Wang","Tiankun Zhao","Yang Liu","Dongchao Gao","Zhe Liu","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11781v1","updated":"2024-03-18T13:39:53Z","published":"2024-03-18T13:39:53Z","title":"Infinite-ID: Identity-preserved Personalization via ID-semantics\n Decoupling Paradigm","summary":" Drawing on recent advancements in diffusion models for text-to-image\ngeneration, identity-preserved personalization has made significant progress in\naccurately capturing specific identities with just a single reference image.\nHowever, existing methods primarily integrate reference images within the text\nembedding space, leading to a complex entanglement of image and text\ninformation, which poses challenges for preserving both identity fidelity and\nsemantic consistency. To tackle this challenge, we propose Infinite-ID, an\nID-semantics decoupling paradigm for identity-preserved personalization.\nSpecifically, we introduce identity-enhanced training, incorporating an\nadditional image cross-attention module to capture sufficient ID information\nwhile deactivating the original text cross-attention module of the diffusion\nmodel. This ensures that the image stream faithfully represents the identity\nprovided by the reference image while mitigating interference from textual\ninput. Additionally, we introduce a feature interaction mechanism that combines\na mixed attention module with an AdaIN-mean operation to seamlessly merge the\ntwo streams. This mechanism not only enhances the fidelity of identity and\nsemantic consistency but also enables convenient control over the styles of the\ngenerated images. Extensive experimental results on both raw photo generation\nand style image generation demonstrate the superior performance of our proposed\nmethod.\n","authors":["Yi Wu","Ziqiang Li","Heliang Zheng","Chaoyue Wang","Bin Li"],"pdf_url":"https://arxiv.org/pdf/2403.11781v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05284v3","updated":"2024-03-18T13:38:04Z","published":"2023-12-08T12:48:53Z","title":"SlimSAM: 0.1% Data Makes Segment Anything Slim","summary":" Current approaches for compressing the Segment Anything Model (SAM) yield\ncommendable results, yet necessitate extensive data to train a new network from\nscratch. Employing conventional pruning techniques can remarkably reduce data\nrequirements but would suffer from a degradation in performance. To address\nthis challenging trade-off, we introduce SlimSAM, a novel data-efficient SAM\ncompression method that achieves superior performance with extremely less\ntraining data. The essence of SlimSAM is encapsulated in the alternate slimming\nframework which effectively enhances knowledge inheritance under severely\nlimited training data availability and exceptional pruning ratio. 
Diverging\nfrom prior techniques, our framework progressively compresses the model by\nalternately pruning and distilling distinct, decoupled sub-structures.\nDisturbed Taylor pruning is also proposed to address the misalignment between\nthe pruning objective and training target, thereby boosting the\npost-distillation after pruning. SlimSAM yields significant performance\nimprovements while demanding over 10 times less training data than any other\nexisting compression methods. Even when compared to the original SAM, SlimSAM\nachieves approaching performance while reducing parameter counts to merely 1.4%\n(9.1M), MACs to 0.8% (23G), and requiring only 0.1% (10k) of the SAM training\ndata. The code is available at http://github.com/czg1225/SlimSAM.\n","authors":["Zigeng Chen","Gongfan Fang","Xinyin Ma","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2312.05284v3.pdf","comment":"Work in progress. Code reposity: http://github.com/czg1225/SlimSAM"},{"id":"http://arxiv.org/abs/2303.05656v2","updated":"2024-03-18T13:34:44Z","published":"2023-03-10T02:15:58Z","title":"EHRDiff: Exploring Realistic EHR Synthesis with Diffusion Models","summary":" Electronic health records (EHR) contain a wealth of biomedical information,\nserving as valuable resources for the development of precision medicine\nsystems. However, privacy concerns have resulted in limited access to\nhigh-quality and large-scale EHR data for researchers, impeding progress in\nmethodological development. Recent research has delved into synthesizing\nrealistic EHR data through generative modeling techniques, where a majority of\nproposed methods relied on generative adversarial networks (GAN) and their\nvariants for EHR synthesis. Despite GAN-based methods attaining\nstate-of-the-art performance in generating EHR data, these approaches are\ndifficult to train and prone to mode collapse. Recently introduced in\ngenerative modeling, diffusion models have established cutting-edge performance\nin image generation, but their efficacy in EHR data synthesis remains largely\nunexplored. In this study, we investigate the potential of diffusion models for\nEHR data synthesis and introduce a novel method, EHRDiff. Through extensive\nexperiments, EHRDiff establishes new state-of-the-art quality for synthetic EHR\ndata, protecting private information in the meanwhile.\n","authors":["Hongyi Yuan","Songchi Zhou","Sheng Yu"],"pdf_url":"https://arxiv.org/pdf/2303.05656v2.pdf","comment":"Accepted by TMLR, preprint of camera-ready version"},{"id":"http://arxiv.org/abs/2403.11776v1","updated":"2024-03-18T13:34:22Z","published":"2024-03-18T13:34:22Z","title":"DVN-SLAM: Dynamic Visual Neural SLAM Based on Local-Global Encoding","summary":" Recent research on Simultaneous Localization and Mapping (SLAM) based on\nimplicit representation has shown promising results in indoor environments.\nHowever, there are still some challenges: the limited scene representation\ncapability of implicit encodings, the uncertainty in the rendering process from\nimplicit representations, and the disruption of consistency by dynamic objects.\nTo address these challenges, we propose a real-time dynamic visual SLAM system\nbased on local-global fusion neural implicit representation, named DVN-SLAM. To\nimprove the scene representation capability, we introduce a local-global fusion\nneural implicit representation that enables the construction of an implicit map\nwhile considering both global structure and local details. 
To tackle\nuncertainties arising from the rendering process, we design an information\nconcentration loss for optimization, aiming to concentrate scene information on\nobject surfaces. The proposed DVN-SLAM achieves competitive performance in\nlocalization and mapping across multiple datasets. More importantly, DVN-SLAM\ndemonstrates robustness in dynamic scenes, a trait that sets it apart from\nother NeRF-based methods.\n","authors":["Wenhua Wu","Guangming Wang","Ting Deng","Sebastian Aegidius","Stuart Shanks","Valerio Modugno","Dimitrios Kanoulas","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19957v2","updated":"2024-03-18T13:30:03Z","published":"2023-05-31T15:44:00Z","title":"DeepSolo++: Let Transformer Decoder with Explicit Points Solo for\n Multilingual Text Spotting","summary":" End-to-end text spotting aims to integrate scene text detection and\nrecognition into a unified framework. Dealing with the relationship between the\ntwo sub-tasks plays a pivotal role in designing effective spotters. Although\nTransformer-based methods eliminate the heuristic post-processing, they still\nsuffer from the synergy issue between the sub-tasks and low training\nefficiency. Besides, they overlook the exploring on multilingual text spotting\nwhich requires an extra script identification task. In this paper, we present\nDeepSolo++, a simple DETR-like baseline that lets a single decoder with\nexplicit points solo for text detection, recognition, and script identification\nsimultaneously. Technically, for each text instance, we represent the character\nsequence as ordered points and model them with learnable explicit point\nqueries. After passing a single decoder, the point queries have encoded\nrequisite text semantics and locations, thus can be further decoded to the\ncenter line, boundary, script, and confidence of text via very simple\nprediction heads in parallel. Furthermore, we show the surprisingly good\nextensibility of our method, in terms of character class, language type, and\ntask. On the one hand, our method not only performs well in English scenes but\nalso masters the transcription with complex font structure and a thousand-level\ncharacter classes, such as Chinese. On the other hand, our DeepSolo++ achieves\nbetter performance on the additionally introduced script identification task\nwith a simpler training pipeline compared with previous methods. In addition,\nour models are also compatible with line annotations, which require much less\nannotation cost than polygons. The code is available at\n\\url{https://github.com/ViTAE-Transformer/DeepSolo}.\n","authors":["Maoyuan Ye","Jing Zhang","Shanshan Zhao","Juhua Liu","Tongliang Liu","Bo Du","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2305.19957v2.pdf","comment":"The extension of the CVPR 2023 paper (DeepSolo: Let Transformer\n Decoder with Explicit Points Solo for Text Spotting). arXiv admin note:\n substantial text overlap with arXiv:2211.10772"},{"id":"http://arxiv.org/abs/2403.11771v1","updated":"2024-03-18T13:30:03Z","published":"2024-03-18T13:30:03Z","title":"Modality-Agnostic fMRI Decoding of Vision and Language","summary":" Previous studies have shown that it is possible to map brain activation data\nof subjects viewing images onto the feature representation space of not only\nvision models (modality-specific decoding) but also language models\n(cross-modal decoding). 
In this work, we introduce and use a new large-scale\nfMRI dataset (~8,500 trials per subject) of people watching both images and\ntext descriptions of such images. This novel dataset enables the development of\nmodality-agnostic decoders: a single decoder that can predict which stimulus a\nsubject is seeing, irrespective of the modality (image or text) in which the\nstimulus is presented. We train and evaluate such decoders to map brain signals\nonto stimulus representations from a large range of publicly available vision,\nlanguage and multimodal (vision+language) models. Our findings reveal that (1)\nmodality-agnostic decoders perform as well as (and sometimes even better than)\nmodality-specific decoders (2) modality-agnostic decoders mapping brain data\nonto representations from unimodal models perform as well as decoders relying\non multimodal representations (3) while language and low-level visual\n(occipital) brain regions are best at decoding text and image stimuli,\nrespectively, high-level visual (temporal) regions perform well on both\nstimulus types.\n","authors":["Mitja Nikolaus","Milad Mozafari","Nicholas Asher","Leila Reddy","Rufin VanRullen"],"pdf_url":"https://arxiv.org/pdf/2403.11771v1.pdf","comment":"To appear at ICLR 2024 workshop on Representational Alignment\n (Re-Align)"},{"id":"http://arxiv.org/abs/2310.15778v3","updated":"2024-03-18T13:27:01Z","published":"2023-10-24T12:25:37Z","title":"Privacy Protection in MRI Scans Using 3D Masked Autoencoders","summary":" MRI scans provide valuable medical information, however they also contain\nsensitive and personally identifiable information that needs to be protected.\nWhereas MRI metadata is easily sanitized, MRI image data is a privacy risk\nbecause it contains information to render highly-realistic 3D visualizations of\na patient's head, enabling malicious actors to possibly identify the subject by\ncross-referencing a database. Data anonymization and de-identification is\nconcerned with ensuring the privacy and confidentiality of individuals'\npersonal information. Traditional MRI de-identification methods remove\nprivacy-sensitive parts (e.g. eyes, nose etc.) from a given scan. This comes at\nthe expense of introducing a domain shift that can throw off downstream\nanalyses. In this work, we propose CP-MAE, a model that de-identifies the face\nby remodeling it (e.g. changing the face) rather than by removing parts using\nmasked autoencoders. CP-MAE outperforms all previous approaches in terms of\ndownstream task performance as well as de-identification. With our method we\nare able to synthesize high-fidelity scans of resolution up to $256^3$ --\ncompared to $128^3$ with previous approaches -- which constitutes an eight-fold\nincrease in the number of voxels.\n","authors":["Lennart Alexander Van der Goten","Kevin Smith"],"pdf_url":"https://arxiv.org/pdf/2310.15778v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05713v2","updated":"2024-03-18T13:19:33Z","published":"2024-02-08T14:40:32Z","title":"Hidden in Plain Sight: Undetectable Adversarial Bias Attacks on\n Vulnerable Patient Populations","summary":" The proliferation of artificial intelligence (AI) in radiology has shed light\non the risk of deep learning (DL) models exacerbating clinical biases towards\nvulnerable patient populations. 
While prior literature has focused on\nquantifying biases exhibited by trained DL models, demographically targeted\nadversarial bias attacks on DL models and its implication in the clinical\nenvironment remains an underexplored field of research in medical imaging. In\nthis work, we demonstrate that demographically targeted label poisoning attacks\ncan introduce undetectable underdiagnosis bias in DL models. Our results across\nmultiple performance metrics and demographic groups like sex, age, and their\nintersectional subgroups show that adversarial bias attacks demonstrate\nhigh-selectivity for bias in the targeted group by degrading group model\nperformance without impacting overall model performance. Furthermore, our\nresults indicate that adversarial bias attacks result in biased DL models that\npropagate prediction bias even when evaluated with external datasets.\n","authors":["Pranav Kulkarni","Andrew Chan","Nithya Navarathna","Skylar Chan","Paul H. Yi","Vishwa S. Parekh"],"pdf_url":"https://arxiv.org/pdf/2402.05713v2.pdf","comment":"29 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.11761v1","updated":"2024-03-18T13:14:46Z","published":"2024-03-18T13:14:46Z","title":"BEVCar: Camera-Radar Fusion for BEV Map and Object Segmentation","summary":" Semantic scene segmentation from a bird's-eye-view (BEV) perspective plays a\ncrucial role in facilitating planning and decision-making for mobile robots.\nAlthough recent vision-only methods have demonstrated notable advancements in\nperformance, they often struggle under adverse illumination conditions such as\nrain or nighttime. While active sensors offer a solution to this challenge, the\nprohibitively high cost of LiDARs remains a limiting factor. Fusing camera data\nwith automotive radars poses a more inexpensive alternative but has received\nless attention in prior research. In this work, we aim to advance this\npromising avenue by introducing BEVCar, a novel approach for joint BEV object\nand map segmentation. The core novelty of our approach lies in first learning a\npoint-based encoding of raw radar data, which is then leveraged to efficiently\ninitialize the lifting of image features into the BEV space. We perform\nextensive experiments on the nuScenes dataset and demonstrate that BEVCar\noutperforms the current state of the art. Moreover, we show that incorporating\nradar information significantly enhances robustness in challenging\nenvironmental conditions and improves segmentation performance for distant\nobjects. To foster future research, we provide the weather split of the\nnuScenes dataset used in our experiments, along with our code and trained\nmodels at http://bevcar.cs.uni-freiburg.de.\n","authors":["Jonas Schramm","Niclas Vödisch","Kürsat Petek","B Ravi Kiran","Senthil Yogamani","Wolfram Burgard","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2403.11761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11760v1","updated":"2024-03-18T13:14:30Z","published":"2024-03-18T13:14:30Z","title":"3R-INN: How to be climate friendly while consuming/delivering videos?","summary":" The consumption of a video requires a considerable amount of energy during\nthe various stages of its life-cycle. 
With a billion hours of video consumed\ndaily, this contributes significantly to the greenhouse gas emission.\nTherefore, reducing the end-to-end carbon footprint of the video chain, while\npreserving the quality of experience at the user side, is of high importance.\nTo contribute in an impactful manner, we propose 3R-INN, a single light\ninvertible network that does three tasks at once: given a high-resolution\ngrainy image, it Rescales it to a lower resolution, Removes film grain and\nReduces its power consumption when displayed. Providing such a minimum viable\nquality content contributes to reducing the energy consumption during encoding,\ntransmission, decoding and display. 3R-INN also offers the possibility to\nrestore either the high-resolution grainy original image or a grain-free\nversion, thanks to its invertibility and the disentanglement of the high\nfrequency, and without transmitting auxiliary data. Experiments show that,\nwhile enabling significant energy savings for encoding (78%), decoding (77%)\nand rendering (5% to 20%), 3R-INN outperforms state-of-the-art film grain\nsynthesis and energy-aware methods and achieves state-of-the-art performance on\nthe rescaling task on different test-sets.\n","authors":["Zoubida Ameur","Claire-Hélène Demarty","Daniel Menard","Olivier Le Meur"],"pdf_url":"https://arxiv.org/pdf/2403.11760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11751v1","updated":"2024-03-18T12:59:35Z","published":"2024-03-18T12:59:35Z","title":"Relational Representation Learning Network for Cross-Spectral Image\n Patch Matching","summary":" Recently, feature relation learning has drawn widespread attention in\ncross-spectral image patch matching. However, existing related research focuses\non extracting diverse relations between image patch features and ignores\nsufficient intrinsic feature representations of individual image patches.\nTherefore, an innovative relational representation learning idea is proposed\nfor the first time, which simultaneously focuses on sufficiently mining the\nintrinsic features of individual image patches and the relations between image\npatch features. Based on this, we construct a lightweight Relational\nRepresentation Learning Network (RRL-Net). Specifically, we innovatively\nconstruct an autoencoder to fully characterize the individual intrinsic\nfeatures, and introduce a Feature Interaction Learning (FIL) module to extract\ndeep-level feature relations. To further fully mine individual intrinsic\nfeatures, a lightweight Multi-dimensional Global-to-Local Attention (MGLA)\nmodule is constructed to enhance the global feature extraction of individual\nimage patches and capture local dependencies within global features. By\ncombining the MGLA module, we further explore the feature extraction network\nand construct an Attention-based Lightweight Feature Extraction (ALFE) network.\nIn addition, we propose a Multi-Loss Post-Pruning (MLPP) optimization strategy,\nwhich greatly promotes network optimization while avoiding increases in\nparameters and inference time. Extensive experiments demonstrate that our\nRRL-Net achieves state-of-the-art (SOTA) performance on multiple public\ndatasets. 
Our code will be made public later.\n","authors":["Chuang Yu","Yunpeng Liu","Jinmiao Zhao","Dou Quan","Zelin Shi"],"pdf_url":"https://arxiv.org/pdf/2403.11751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11735v1","updated":"2024-03-18T12:43:38Z","published":"2024-03-18T12:43:38Z","title":"LSKNet: A Foundation Lightweight Backbone for Remote Sensing","summary":" Remote sensing images pose distinct challenges for downstream tasks due to\ntheir inherent complexity. While a considerable amount of research has been\ndedicated to remote sensing classification, object detection and semantic\nsegmentation, most of these studies have overlooked the valuable prior\nknowledge embedded within remote sensing scenarios. Such prior knowledge can be\nuseful because remote sensing objects may be mistakenly recognized without\nreferencing a sufficiently long-range context, which can vary for different\nobjects. This paper considers these priors and proposes a lightweight Large\nSelective Kernel Network (LSKNet) backbone. LSKNet can dynamically adjust its\nlarge spatial receptive field to better model the ranging context of various\nobjects in remote sensing scenarios. To our knowledge, large and selective\nkernel mechanisms have not been previously explored in remote sensing images.\nWithout bells and whistles, our lightweight LSKNet sets new state-of-the-art\nscores on standard remote sensing classification, object detection and semantic\nsegmentation benchmarks. Our comprehensive analysis further validated the\nsignificance of the identified priors and the effectiveness of LSKNet. The code\nis available at https://github.com/zcablii/LSKNet.\n","authors":["Yuxuan Li","Xiang Li","Yimain Dai","Qibin Hou","Li Liu","Yongxiang Liu","Ming-Ming Cheng","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2403.11735v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.09030"},{"id":"http://arxiv.org/abs/2302.07661v4","updated":"2024-03-18T12:41:45Z","published":"2023-02-15T13:48:10Z","title":"Depth- and Semantics-aware Multi-modal Domain Translation: Generating 3D\n Panoramic Color Images from LiDAR Point Clouds","summary":" This work presents a new depth- and semantics-aware conditional generative\nmodel, named TITAN-Next, for cross-domain image-to-image translation in a\nmulti-modal setup between LiDAR and camera sensors. The proposed model\nleverages scene semantics as a mid-level representation and is able to\ntranslate raw LiDAR point clouds to RGB-D camera images by solely relying on\nsemantic scene segments. We claim that this is the first framework of its kind\nand it has practical applications in autonomous vehicles such as providing a\nfail-safe mechanism and augmenting available data in the target image domain.\nThe proposed model is evaluated on the large-scale and challenging\nSemantic-KITTI dataset, and experimental findings show that it considerably\noutperforms the original TITAN-Net and other strong baselines by 23.7$\\%$\nmargin in terms of IoU.\n","authors":["Tiago Cortinhal","Eren Erdal Aksoy"],"pdf_url":"https://arxiv.org/pdf/2302.07661v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05657v3","updated":"2024-03-18T12:34:30Z","published":"2023-03-10T02:16:35Z","title":"Tag2Text: Guiding Vision-Language Model via Image Tagging","summary":" This paper presents Tag2Text, a vision language pre-training (VLP) framework,\nwhich introduces image tagging into vision-language models to guide the\nlearning of visual-linguistic features. 
In contrast to prior works which\nutilize object tags either manually labeled or automatically detected with an\noff-the-shelf detector with limited performance, our approach explicitly learns\nan image tagger using tags parsed from image-paired text and thus provides a\nstrong semantic guidance to vision-language models. In this way, Tag2Text can\nutilize large-scale annotation-free image tags in accordance with image-text\npairs, and provides more diverse tag categories beyond objects. As a result,\nTag2Text demonstrates the ability of a foundational image tagging model, with\nsuperior zero-shot performance even comparable to fully supervised models.\nMoreover, by leveraging the tagging guidance, Tag2Text effectively enhances the\nperformance of vision-language models on both generation-based and\nalignment-based tasks. Across a wide range of downstream benchmarks, Tag2Text\nachieves state-of-the-art results with similar model sizes and data scales,\ndemonstrating the efficacy of the proposed tagging guidance. Code, demo and\npre-trained models are available at\nhttps://github.com/xinyu1205/recognize-anything.\n","authors":["Xinyu Huang","Youcai Zhang","Jinyu Ma","Weiwei Tian","Rui Feng","Yuejie Zhang","Yaqian Li","Yandong Guo","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.05657v3.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2402.14505v2","updated":"2024-03-18T12:28:31Z","published":"2024-02-22T12:55:01Z","title":"Towards Seamless Adaptation of Pre-trained Models for Visual Place\n Recognition","summary":" Recent studies show that vision models pre-trained in generic visual learning\ntasks with large-scale data can provide useful feature representations for a\nwide range of visual perception problems. However, few attempts have been made\nto exploit pre-trained foundation models in visual place recognition (VPR). Due\nto the inherent difference in training objectives and data between the tasks of\nmodel pre-training and VPR, how to bridge the gap and fully unleash the\ncapability of pre-trained models for VPR is still a key issue to address. To\nthis end, we propose a novel method to realize seamless adaptation of\npre-trained models for VPR. Specifically, to obtain both global and local\nfeatures that focus on salient landmarks for discriminating places, we design a\nhybrid adaptation method to achieve both global and local adaptation\nefficiently, in which only lightweight adapters are tuned without adjusting the\npre-trained model. Besides, to guide effective adaptation, we propose a mutual\nnearest neighbor local feature loss, which ensures proper dense local features\nare produced for local matching and avoids time-consuming spatial verification\nin re-ranking. Experimental results show that our method outperforms the\nstate-of-the-art methods with less training data and training time, and uses\nabout only 3% retrieval runtime of the two-stage VPR methods with RANSAC-based\nspatial verification. It ranks 1st on the MSLS challenge leaderboard (at the\ntime of submission). 
The code is released at\nhttps://github.com/Lu-Feng/SelaVPR.\n","authors":["Feng Lu","Lijun Zhang","Xiangyuan Lan","Shuting Dong","Yaowei Wang","Chun Yuan"],"pdf_url":"https://arxiv.org/pdf/2402.14505v2.pdf","comment":"ICLR2024"},{"id":"http://arxiv.org/abs/2312.15736v2","updated":"2024-03-18T12:23:48Z","published":"2023-12-25T14:16:24Z","title":"Towards Real-World Blind Face Restoration with Generative Diffusion\n Prior","summary":" Blind face restoration is an important task in computer vision and has gained\nsignificant attention due to its wide-range applications. Previous works mainly\nexploit facial priors to restore face images and have demonstrated high-quality\nresults. However, generating faithful facial details remains a challenging\nproblem due to the limited prior knowledge obtained from finite data. In this\nwork, we delve into the potential of leveraging the pretrained Stable Diffusion\nfor blind face restoration. We propose BFRffusion which is thoughtfully\ndesigned to effectively extract features from low-quality face images and could\nrestore realistic and faithful facial details with the generative prior of the\npretrained Stable Diffusion. In addition, we build a privacy-preserving face\ndataset called PFHQ with balanced attributes like race, gender, and age. This\ndataset can serve as a viable alternative for training blind face restoration\nnetworks, effectively addressing privacy and bias concerns usually associated\nwith the real face datasets. Through an extensive series of experiments, we\ndemonstrate that our BFRffusion achieves state-of-the-art performance on both\nsynthetic and real-world public testing datasets for blind face restoration and\nour PFHQ dataset is an available resource for training blind face restoration\nnetworks. The codes, pretrained models, and dataset are released at\nhttps://github.com/chenxx89/BFRffusion.\n","authors":["Xiaoxu Chen","Jingfan Tan","Tao Wang","Kaihao Zhang","Wenhan Luo","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2312.15736v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12067v3","updated":"2024-03-18T12:23:25Z","published":"2023-11-19T06:43:11Z","title":"Quality and Quantity: Unveiling a Million High-Quality Images for\n Text-to-Image Synthesis in Fashion Design","summary":" The fusion of AI and fashion design has emerged as a promising research area.\nHowever, the lack of extensive, interrelated data on clothing and try-on stages\nhas hindered the full potential of AI in this domain. Addressing this, we\npresent the Fashion-Diffusion dataset, a product of multiple years' rigorous\neffort. This dataset, the first of its kind, comprises over a million\nhigh-quality fashion images, paired with detailed text descriptions. Sourced\nfrom a diverse range of geographical locations and cultural backgrounds, the\ndataset encapsulates global fashion trends. The images have been meticulously\nannotated with fine-grained attributes related to clothing and humans,\nsimplifying the fashion design process into a Text-to-Image (T2I) task. The\nFashion-Diffusion dataset not only provides high-quality text-image pairs and\ndiverse human-garment pairs but also serves as a large-scale resource about\nhumans, thereby facilitating research in T2I generation. Moreover, to foster\nstandardization in the T2I-based fashion design field, we propose a new\nbenchmark comprising multiple datasets for evaluating the performance of\nfashion design models. 
This work represents a significant leap forward in the\nrealm of AI-driven fashion design, setting a new standard for future research\nin this field.\n","authors":["Jia Yu","Lichao Zhang","Zijie Chen","Fayu Pan","MiaoMiao Wen","Yuming Yan","Fangsheng Weng","Shuai Zhang","Lili Pan","Zhenzhong Lan"],"pdf_url":"https://arxiv.org/pdf/2311.12067v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05104v2","updated":"2024-03-18T11:52:58Z","published":"2023-04-11T10:01:39Z","title":"Approaching Test Time Augmentation in the Context of Uncertainty\n Calibration for Deep Neural Networks","summary":" With the rise of Deep Neural Networks, machine learning systems are nowadays\nubiquitous in a number of real-world applications, which bears the need for\nhighly reliable models. This requires a thorough look not only at the accuracy\nof such systems, but also at their predictive uncertainty. Hence, we propose a\nnovel technique (with two different variations, named M-ATTA and V-ATTA) based\non test time augmentation, to improve the uncertainty calibration of deep\nmodels for image classification. By leveraging an adaptive weighting system,\nM/V-ATTA improves uncertainty calibration without affecting the model's\naccuracy. The performance of these techniques is evaluated by considering\ndiverse metrics related to uncertainty calibration, demonstrating their\nrobustness. Empirical results, obtained on CIFAR-10, CIFAR-100, Aerial Image\nDataset, as well as in two different scenarios under distribution-shift,\nindicate that the proposed methods outperform several state-of-the-art post-hoc\ncalibration techniques. Furthermore, the methods proposed also show\nimprovements in terms of predictive entropy on out-of-distribution samples.\nCode for M/V-ATTA available at: https://github.com/pedrormconde/MV-ATTA\n","authors":["Pedro Conde","Tiago Barros","Rui L. Lopes","Cristiano Premebida","Urbano J. Nunes"],"pdf_url":"https://arxiv.org/pdf/2304.05104v2.pdf","comment":"Submitted to IEEE Transactions on Pattern Analysis and Machine\n Intelligence"},{"id":"http://arxiv.org/abs/2310.05773v2","updated":"2024-03-18T11:44:06Z","published":"2023-10-09T14:57:41Z","title":"Towards Lossless Dataset Distillation via Difficulty-Aligned Trajectory\n Matching","summary":" The ultimate goal of Dataset Distillation is to synthesize a small synthetic\ndataset such that a model trained on this synthetic set will perform equally\nwell as a model trained on the full, real dataset. Until now, no method of\nDataset Distillation has reached this completely lossless goal, in part due to\nthe fact that previous methods only remain effective when the total number of\nsynthetic samples is extremely small. Since only so much information can be\ncontained in such a small number of samples, it seems that to achieve truly\nlossless dataset distillation, we must develop a distillation method that remains\neffective as the size of the synthetic dataset grows. In this work, we present\nsuch an algorithm and elucidate why existing methods fail to generate larger,\nhigh-quality synthetic sets. Current state-of-the-art methods rely on\ntrajectory-matching, or optimizing the synthetic data to induce similar\nlong-term training dynamics as the real data. We empirically find that the\ntraining stage of the trajectories we choose to match (i.e., early or late)\ngreatly affects the effectiveness of the distilled dataset. 
Specifically, early\ntrajectories (where the teacher network learns easy patterns) work well for a\nlow-cardinality synthetic set since there are fewer examples wherein to\ndistribute the necessary information. Conversely, late trajectories (where the\nteacher network learns hard patterns) provide better signals for larger\nsynthetic sets since there are now enough samples to represent the necessary\ncomplex patterns. Based on our findings, we propose to align the difficulty of\nthe generated patterns with the size of the synthetic dataset. In doing so, we\nsuccessfully scale trajectory matching-based methods to larger synthetic\ndatasets, achieving lossless dataset distillation for the very first time. Code\nand distilled datasets are available at https://gzyaftermath.github.io/DATM.\n","authors":["Ziyao Guo","Kai Wang","George Cazenavette","Hui Li","Kaipeng Zhang","Yang You"],"pdf_url":"https://arxiv.org/pdf/2310.05773v2.pdf","comment":"First lossless dataset distillation method, accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2311.14189v2","updated":"2024-03-18T11:43:13Z","published":"2023-11-23T20:14:50Z","title":"D-SCo: Dual-Stream Conditional Diffusion for Monocular Hand-Held Object\n Reconstruction","summary":" Reconstructing hand-held objects from a single RGB image is a challenging\ntask in computer vision. In contrast to prior works that utilize deterministic\nmodeling paradigms, we employ a point cloud denoising diffusion model to\naccount for the probabilistic nature of this problem. In the core, we introduce\ncentroid-fixed dual-stream conditional diffusion for monocular hand-held object\nreconstruction (D-SCo), tackling two predominant challenges. First, to avoid\nthe object centroid from deviating, we utilize a novel hand-constrained\ncentroid fixing paradigm, enhancing the stability of diffusion and reverse\nprocesses and the precision of feature projection. Second, we introduce a\ndual-stream denoiser to semantically and geometrically model hand-object\ninteractions with a novel unified hand-object semantic embedding, enhancing the\nreconstruction performance of the hand-occluded region of the object.\nExperiments on the synthetic ObMan dataset and three real-world datasets HO3D,\nMOW and DexYCB demonstrate that our approach can surpass all other\nstate-of-the-art methods. Codes will be released.\n","authors":["Bowen Fu","Gu Wang","Chenyangguang Zhang","Yan Di","Ziqin Huang","Zhiying Leng","Fabian Manhardt","Xiangyang Ji","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2311.14189v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11577v1","updated":"2024-03-18T08:53:03Z","published":"2024-03-18T08:53:03Z","title":"3DGS-Calib: 3D Gaussian Splatting for Multimodal SpatioTemporal\n Calibration","summary":" Reliable multimodal sensor fusion algorithms require accurate spatiotemporal\ncalibration. Recently, targetless calibration techniques based on implicit\nneural representations have proven to provide precise and robust results.\nNevertheless, such methods are inherently slow to train given the high\ncomputational overhead caused by the large number of sampled points required\nfor volume rendering. With the recent introduction of 3D Gaussian Splatting as\na faster alternative to implicit representation methods, we propose to leverage\nthis new rendering approach to achieve faster multi-sensor calibration. 
We\nintroduce 3DGS-Calib, a new calibration method that relies on the speed and\nrendering accuracy of 3D Gaussian Splatting to achieve multimodal\nspatiotemporal calibration that is accurate, robust, and with a substantial\nspeed-up compared to methods relying on implicit neural representations. We\ndemonstrate the superiority of our proposal with experimental results on\nsequences from KITTI-360, a widely used driving dataset.\n","authors":["Quentin Herau","Moussab Bennehar","Arthur Moreau","Nathan Piasco","Luis Roldao","Dzmitry Tsishkou","Cyrille Migniot","Pascal Vasseur","Cédric Demonceaux"],"pdf_url":"https://arxiv.org/pdf/2403.11577v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2205.02830v4","updated":"2024-03-18T07:11:53Z","published":"2022-05-05T17:58:06Z","title":"Interaction Replica: Tracking Human-Object Interaction and Scene Changes\n From Human Motion","summary":" Our world is not static and humans naturally cause changes in their\nenvironments through interactions, e.g., opening doors or moving furniture.\nModeling changes caused by humans is essential for building digital twins,\ne.g., in the context of shared physical-virtual spaces (metaverses) and\nrobotics. In order for widespread adoption of such emerging applications, the\nsensor setup used to capture the interactions needs to be inexpensive and\neasy-to-use for non-expert users. I.e., interactions should be captured and\nmodeled by simple ego-centric sensors such as a combination of cameras and IMU\nsensors, not relying on any external cameras or object trackers. Yet, to the\nbest of our knowledge, no work tackling the challenging problem of modeling\nhuman-scene interactions via such an ego-centric sensor setup exists. This\npaper closes this gap in the literature by developing a novel approach that\ncombines visual localization of humans in the scene with contact-based\nreasoning about human-scene interactions from IMU data. Interestingly, we can\nshow that even without visual observations of the interactions, human-scene\ncontacts and interactions can be realistically predicted from human pose\nsequences. Our method, iReplica (Interaction Replica), is an essential first\nstep towards the egocentric capture of human interactions and modeling of\ndynamic scenes, which is required for future AR/VR applications in immersive\nvirtual universes and for training machines to behave like humans. Our code,\ndata and model are available on our project page at\nhttp://virtualhumans.mpi-inf.mpg.de/ireplica/\n","authors":["Vladimir Guzov","Julian Chibane","Riccardo Marin","Yannan He","Yunus Saracoglu","Torsten Sattler","Gerard Pons-Moll"],"pdf_url":"https://arxiv.org/pdf/2205.02830v4.pdf","comment":"International Conference on 3D Vision 2024 (3DV'24)"},{"id":"http://arxiv.org/abs/2403.09669v2","updated":"2024-03-18T07:02:44Z","published":"2024-01-30T08:18:20Z","title":"STREAM: Spatio-TempoRal Evaluation and Analysis Metric for Video\n Generative Models","summary":" Image generative models have made significant progress in generating\nrealistic and diverse images, supported by comprehensive guidance from various\nevaluation metrics. However, current video generative models struggle to\ngenerate even short video clips, with limited tools that provide insights for\nimprovements. Current video evaluation metrics are simple adaptations of image\nmetrics by switching the embeddings with video embedding networks, which may\nunderestimate the unique characteristics of video. 
Our analysis reveals that\nthe widely used Frechet Video Distance (FVD) has a stronger emphasis on the\nspatial aspect than the temporal naturalness of video and is inherently\nconstrained by the input size of the embedding networks used, limiting it to 16\nframes. Additionally, it demonstrates considerable instability and diverges\nfrom human evaluations. To address the limitations, we propose STREAM, a new\nvideo evaluation metric uniquely designed to independently evaluate spatial and\ntemporal aspects. This feature allows comprehensive analysis and evaluation of\nvideo generative models from various perspectives, unconstrained by video\nlength. We provide analytical and experimental evidence demonstrating that\nSTREAM provides an effective evaluation tool for both visual and temporal\nquality of videos, offering insights into areas of improvement for video\ngenerative models. To the best of our knowledge, STREAM is the first evaluation\nmetric that can separately assess the temporal and spatial aspects of videos.\nOur code is available at https://github.com/pro2nit/STREAM.\n","authors":["Pum Jun Kim","Seojun Kim","Jaejun Yoo"],"pdf_url":"https://arxiv.org/pdf/2403.09669v2.pdf","comment":"Our work is accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2403.12109v1","updated":"2024-03-18T03:39:54Z","published":"2024-03-18T03:39:54Z","title":"GCAM: Gaussian and causal-attention model of food fine-grained\n recognition","summary":" Currently, most food recognition relies on deep learning for category\nclassification. However, these approaches struggle to effectively distinguish\nbetween visually similar food samples, highlighting the pressing need to\naddress fine-grained issues in food recognition. To mitigate these challenges,\nwe propose the adoption of a Gaussian and causal-attention model for\nfine-grained object recognition. In particular, we train to obtain Gaussian\nfeatures over target regions, followed by the extraction of fine-grained\nfeatures from the objects, thereby enhancing the feature mapping capabilities\nof the target regions. To counteract data drift resulting from uneven data\ndistributions, we employ a counterfactual reasoning approach. By using\ncounterfactual interventions, we analyze the impact of the learned image\nattention mechanism on network predictions, enabling the network to acquire\nmore useful attention weights for fine-grained image recognition. Finally, we\ndesign a learnable loss strategy to balance training stability across various\nmodules, ultimately improving the accuracy of the final target recognition. We\nvalidate our approach on four relevant datasets, demonstrating its excellent\nperformance across these four datasets. We experimentally show that GCAM\nsurpasses state-of-the-art methods on the ETH-FOOD101, UECFOOD256, and\nVireo-FOOD172 datasets. 
Furthermore, our approach also achieves\nstate-of-the-art performance on the CUB-200 dataset.\n","authors":["Guohang Zhuang","Yue Hu","Tianxing Yan","JiaZhan Gao"],"pdf_url":"https://arxiv.org/pdf/2403.12109v1.pdf","comment":"23 pages, 11 figures"},{"id":"http://arxiv.org/abs/2204.04236v3","updated":"2024-03-18T15:22:51Z","published":"2022-04-08T18:03:39Z","title":"ChildCI Framework: Analysis of Motor and Cognitive Development in\n Children-Computer Interaction for Age Detection","summary":" This article presents a comprehensive analysis of the different tests\nproposed in the recent ChildCI framework, proving its potential for generating\na better understanding of children's neuromotor and cognitive development along\ntime, as well as their possible application in other research areas such as\ne-Health and e-Learning. In particular, we propose a set of over 100 global\nfeatures related to motor and cognitive aspects of the children interaction\nwith mobile devices, some of them collected and adapted from the literature.\n Furthermore, we analyse the robustness and discriminative power of the\nproposed feature set including experimental results for the task of children\nage group detection based on their motor and cognitive behaviours. Two\ndifferent scenarios are considered in this study: i) single-test scenario, and\nii) multiple-test scenario. Results over 93% accuracy are achieved using the\npublicly available ChildCIdb_v1 database (over 400 children from 18 months to 8\nyears old), proving the high correlation of children's age with the way they\ninteract with mobile devices.\n","authors":["Juan Carlos Ruiz-Garcia","Ruben Tolosana","Ruben Vera-Rodriguez","Julian Fierrez","Jaime Herreros-Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2204.04236v3.pdf","comment":"12 pages, 3 figures, 7 tables"},{"id":"http://arxiv.org/abs/2403.11549v1","updated":"2024-03-18T08:00:23Z","published":"2024-03-18T08:00:23Z","title":"Boosting Continual Learning of Vision-Language Models via\n Mixture-of-Experts Adapters","summary":" Continual learning can empower vision-language models to continuously acquire\nnew knowledge, without the need for access to the entire historical dataset.\nHowever, mitigating the performance degradation in large-scale models is\nnon-trivial due to (i) parameter shifts throughout lifelong learning and (ii)\nsignificant computational burdens associated with full-model tuning. In this\nwork, we present a parameter-efficient continual learning framework to\nalleviate long-term forgetting in incremental learning with vision-language\nmodels. Our approach involves the dynamic expansion of a pre-trained CLIP\nmodel, through the integration of Mixture-of-Experts (MoE) adapters in response\nto new tasks. To preserve the zero-shot recognition capability of\nvision-language models, we further introduce a Distribution Discriminative\nAuto-Selector (DDAS) that automatically routes in-distribution and\nout-of-distribution inputs to the MoE Adapter and the original CLIP,\nrespectively. Through extensive experiments across various settings, our\nproposed method consistently outperforms previous state-of-the-art approaches\nwhile concurrently reducing parameter training burdens by 60%. Our code locates\nat https://github.com/JiazuoYu/MoE-Adapters4CL\n","authors":["Jiazuo Yu","Yunzhi Zhuge","Lu Zhang","Ping Hu","Dong Wang","Huchuan Lu","You He"],"pdf_url":"https://arxiv.org/pdf/2403.11549v1.pdf","comment":"This work is accepted by CVPR2024. 
More modifications may be\n performed"},{"id":"http://arxiv.org/abs/1812.00352v3","updated":"2024-03-18T08:09:10Z","published":"2018-12-02T08:09:55Z","title":"MDU-Net: Multi-scale Densely Connected U-Net for biomedical image\n segmentation","summary":" Biomedical image segmentation plays a central role in quantitative analysis,\nclinical diagnosis, and medical intervention. In the light of the fully\nconvolutional networks (FCN) and U-Net, deep convolutional networks (DNNs) have\nmade significant contributions to biomedical image segmentation applications.\nIn this paper, we propose three different multi-scale dense connections (MDC)\nfor the encoder, the decoder of U-shaped architectures, and across them. Based\non three dense connections, we propose a multi-scale densely connected U-Net\n(MDU-Net) for biomedical image segmentation. MDU-Net directly fuses the\nneighboring feature maps with different scales from both higher layers and\nlower layers to strengthen feature propagation in the current layer.\nMulti-scale dense connections, which contain shorter connections between layers\nclose to the input and output, also make a much deeper U-Net possible. Besides,\nwe introduce quantization to alleviate the potential overfitting in dense\nconnections, and further improve the segmentation performance. We evaluate our\nproposed model on the MICCAI 2015 Gland Segmentation (GlaS) dataset. The three\nMDC improve U-Net performance by up to 1.8% on test A and 3.5% on test B in the\nMICCAI Gland dataset. Meanwhile, the MDU-Net with quantization obviously\nimproves the segmentation performance of original U-Net.\n","authors":["Jiawei Zhang","Yuzhen Jin","Jilan Xu","Xiaowei Xu","Yanchun Zhang"],"pdf_url":"https://arxiv.org/pdf/1812.00352v3.pdf","comment":"10 pages, 5 figures, 6 tables, published in the Health Information\n Science and Systems journal"}]},"2024-03-17T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2312.08459v2","updated":"2024-03-17T23:45:01Z","published":"2023-12-13T19:01:07Z","title":"FaceTalk: Audio-Driven Motion Diffusion for Neural Parametric Head\n Models","summary":" We introduce FaceTalk, a novel generative approach designed for synthesizing\nhigh-fidelity 3D motion sequences of talking human heads from input audio\nsignal. To capture the expressive, detailed nature of human heads, including\nhair, ears, and finer-scale eye movements, we propose to couple speech signal\nwith the latent space of neural parametric head models to create high-fidelity,\ntemporally coherent motion sequences. We propose a new latent diffusion model\nfor this task, operating in the expression space of neural parametric head\nmodels, to synthesize audio-driven realistic head sequences. In the absence of\na dataset with corresponding NPHM expressions to audio, we optimize for these\ncorrespondences to produce a dataset of temporally-optimized NPHM expressions\nfit to audio-video recordings of people talking. To the best of our knowledge,\nthis is the first work to propose a generative approach for realistic and\nhigh-quality motion synthesis of volumetric human heads, representing a\nsignificant advancement in the field of audio-driven 3D animation. Notably, our\napproach stands out in its ability to generate plausible motion sequences that\ncan produce high-fidelity head animation coupled with the NPHM shape space. 
Our\nexperimental results substantiate the effectiveness of FaceTalk, consistently\nachieving superior and visually natural motion, encompassing diverse facial\nexpressions and styles, outperforming existing methods by 75% in perceptual\nuser study evaluation.\n","authors":["Shivangi Aneja","Justus Thies","Angela Dai","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2312.08459v2.pdf","comment":"Paper Video: https://youtu.be/7Jf0kawrA3Q Project Page:\n https://shivangi-aneja.github.io/projects/facetalk/"},{"id":"http://arxiv.org/abs/2403.11373v1","updated":"2024-03-17T23:44:20Z","published":"2024-03-17T23:44:20Z","title":"Reconstruct before Query: Continual Missing Modality Learning with\n Decomposed Prompt Collaboration","summary":" Pre-trained large multi-modal models (LMMs) exploit fine-tuning to adapt\ndiverse user applications. Nevertheless, fine-tuning may face challenges due to\ndeactivated sensors (e.g., cameras turned off for privacy or technical issues),\nyielding modality-incomplete data and leading to inconsistency in training data\nand the data for inference. Additionally, continuous training leads to\ncatastrophic forgetting, diluting the knowledge in pre-trained LMMs. To\novercome these challenges, we introduce a novel task, Continual Missing\nModality Learning (CMML), to investigate how models can generalize when data of\ncertain modalities is missing during continual fine-tuning. Our preliminary\nbenchmarks reveal that existing methods suffer from a significant performance\ndrop in CMML, even with the aid of advanced continual learning techniques.\nTherefore, we devise a framework termed Reconstruct before Query (RebQ). It\ndecomposes prompts into modality-specific ones and breaks them into components\nstored in pools accessible via a key-query mechanism, which facilitates\nParameterEfficient Fine-Tuning and enhances knowledge transferability for\nsubsequent tasks. Meanwhile, our RebQ leverages extensive multi-modal knowledge\nfrom pre-trained LMMs to reconstruct the data of missing modality.\nComprehensive experiments demonstrate that RebQ effectively reconstructs the\nmissing modality information and retains pre-trained knowledge. Specifically,\ncompared with the baseline, RebQ improves average precision from 20.00 to 50.92\nand decreases average forgetting from 75.95 to 8.56. Code and datasets are\navailable on https://github.com/Tree-Shu-Zhao/RebQ.pytorch\n","authors":["Shu Zhao","Xiaohan Zou","Tan Yu","Huijuan Xu"],"pdf_url":"https://arxiv.org/pdf/2403.11373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11371v1","updated":"2024-03-17T23:29:41Z","published":"2024-03-17T23:29:41Z","title":"V2X-DGW: Domain Generalization for Multi-agent Perception under Adverse\n Weather Conditions","summary":" Current LiDAR-based Vehicle-to-Everything (V2X) multi-agent perception\nsystems have shown the significant success on 3D object detection. While these\nmodels perform well in the trained clean weather, they struggle in unseen\nadverse weather conditions with the real-world domain gap. In this paper, we\npropose a domain generalization approach, named V2X-DGW, for LiDAR-based 3D\nobject detection on multi-agent perception system under adverse weather\nconditions. Not only in the clean weather does our research aim to ensure\nfavorable multi-agent performance, but also in the unseen adverse weather\nconditions by learning only on the clean weather data. 
To advance research in\nthis area, we have simulated the impact of three prevalent adverse weather\nconditions on two widely-used multi-agent datasets, resulting in the creation\nof two novel benchmark datasets: OPV2V-w and V2XSet-w.\n To this end, we first introduce the Adaptive Weather Augmentation (AWA) to\nmimic the unseen adverse weather conditions, and then propose two alignments\nfor generalizable representation learning: Trust-region Weather-invariant\nAlignment (TWA) and Agent-aware Contrastive Alignment (ACA). Extensive\nexperimental results demonstrate that our V2X-DGW achieved improvements in the\nunseen adverse weather conditions.\n","authors":["Baolu Li","Jinlong Li","Xinyu Liu","Runsheng Xu","Zhengzhong Tu","Jiacheng Guo","Xiaopeng Li","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2403.11371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11370v1","updated":"2024-03-17T23:23:40Z","published":"2024-03-17T23:23:40Z","title":"DynamicGlue: Epipolar and Time-Informed Data Association in Dynamic\n Environments using Graph Neural Networks","summary":" The assumption of a static environment is common in many geometric computer\nvision tasks like SLAM but limits their applicability in highly dynamic scenes.\nSince these tasks rely on identifying point correspondences between input\nimages within the static part of the environment, we propose a graph neural\nnetwork-based sparse feature matching network designed to perform robust\nmatching under challenging conditions while excluding keypoints on moving\nobjects. We employ a similar scheme of attentional aggregation over graph edges\nto enhance keypoint representations as state-of-the-art feature-matching\nnetworks but augment the graph with epipolar and temporal information and\nvastly reduce the number of graph edges. Furthermore, we introduce a\nself-supervised training scheme to extract pseudo labels for image pairs in\ndynamic environments from exclusively unprocessed visual-inertial data. A\nseries of experiments show the superior performance of our network as it\nexcludes keypoints on moving objects compared to state-of-the-art feature\nmatching networks while still achieving similar results regarding conventional\nmatching metrics. When integrated into a SLAM system, our network significantly\nimproves performance, especially in highly dynamic scenes.\n","authors":["Theresa Huber","Simon Schaefer","Stefan Leutenegger"],"pdf_url":"https://arxiv.org/pdf/2403.11370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.00385v2","updated":"2024-03-17T23:23:22Z","published":"2023-04-30T04:40:32Z","title":"Cross-Shaped Windows Transformer with Self-supervised Pretraining for\n Clinically Significant Prostate Cancer Detection in Bi-parametric MRI","summary":" Biparametric magnetic resonance imaging (bpMRI) has demonstrated promising\nresults in prostate cancer (PCa) detection using convolutional neural networks\n(CNNs). Recently, transformers have achieved competitive performance compared\nto CNNs in computer vision. Large scale transformers need abundant annotated\ndata for training, which are difficult to obtain in medical imaging.\nSelf-supervised learning (SSL) utilizes unlabeled data to generate meaningful\nsemantic representations without the need for costly annotations, enhancing\nmodel performance on tasks with limited labeled data. 
We introduce a novel\nend-to-end Cross-Shaped windows (CSwin) transformer UNet model, CSwin UNet, to\ndetect clinically significant prostate cancer (csPCa) in prostate bi-parametric\nMR imaging (bpMRI) and demonstrate the effectiveness of our proposed\nself-supervised pre-training framework. Using a large prostate bpMRI dataset\nwith 1500 patients, we first pretrain CSwin transformer using multi-task\nself-supervised learning to improve data-efficiency and network\ngeneralizability. We then finetune using lesion annotations to perform csPCa\ndetection. Five-fold cross validation shows that self-supervised CSwin UNet\nachieves 0.888 AUC and 0.545 Average Precision (AP), significantly\noutperforming four comparable models (Swin UNETR, DynUNet, Attention UNet,\nUNet). Using a separate bpMRI dataset with 158 patients, we evaluate our method\nrobustness to external hold-out data. Self-supervised CSwin UNet achieves 0.79\nAUC and 0.45 AP, still outperforming all other comparable methods and\ndemonstrating good generalization to external data.\n","authors":["Yuheng Li","Jacob Wynne","Jing Wang","Richard L. J. Qiu","Justin Roper","Shaoyan Pan","Ashesh B. Jani","Tian Liu","Pretesh R. Patel","Hui Mao","Xiaofeng Yang"],"pdf_url":"https://arxiv.org/pdf/2305.00385v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11367v1","updated":"2024-03-17T23:06:12Z","published":"2024-03-17T23:06:12Z","title":"3DGS-ReLoc: 3D Gaussian Splatting for Map Representation and Visual\n ReLocalization","summary":" This paper presents a novel system designed for 3D mapping and visual\nrelocalization using 3D Gaussian Splatting. Our proposed method uses LiDAR and\ncamera data to create accurate and visually plausible representations of the\nenvironment. By leveraging LiDAR data to initiate the training of the 3D\nGaussian Splatting map, our system constructs maps that are both detailed and\ngeometrically accurate. To mitigate excessive GPU memory usage and facilitate\nrapid spatial queries, we employ a combination of a 2D voxel map and a KD-tree.\nThis preparation makes our method well-suited for visual localization tasks,\nenabling efficient identification of correspondences between the query image\nand the rendered image from the Gaussian Splatting map via normalized\ncross-correlation (NCC). Additionally, we refine the camera pose of the query\nimage using feature-based matching and the Perspective-n-Point (PnP) technique.\nThe effectiveness, adaptability, and precision of our system are demonstrated\nthrough extensive evaluation on the KITTI360 dataset.\n","authors":["Peng Jiang","Gaurav Pandey","Srikanth Saripalli"],"pdf_url":"https://arxiv.org/pdf/2403.11367v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2210.06323v4","updated":"2024-03-17T22:58:03Z","published":"2022-10-12T15:42:40Z","title":"AISFormer: Amodal Instance Segmentation with Transformer","summary":" Amodal Instance Segmentation (AIS) aims to segment the region of both visible\nand possible occluded parts of an object instance. While Mask R-CNN-based AIS\napproaches have shown promising results, they are unable to model high-level\nfeatures coherence due to the limited receptive field. The most recent\ntransformer-based models show impressive performance on vision tasks, even\nbetter than Convolution Neural Networks (CNN). In this work, we present\nAISFormer, an AIS framework, with a Transformer-based mask head. 
AISFormer\nexplicitly models the complex coherence between occluder, visible, amodal, and\ninvisible masks within an object's regions of interest by treating them as\nlearnable queries. Specifically, AISFormer contains four modules: (i) feature\nencoding: extract ROI and learn both short-range and long-range visual\nfeatures. (ii) mask transformer decoding: generate the occluder, visible, and\namodal mask query embeddings by a transformer decoder (iii) invisible mask\nembedding: model the coherence between the amodal and visible masks, and (iv)\nmask predicting: estimate output masks including occluder, visible, amodal and\ninvisible. We conduct extensive experiments and ablation studies on three\nchallenging benchmarks i.e. KINS, D2SA, and COCOA-cls to evaluate the\neffectiveness of AISFormer. The code is available at:\nhttps://github.com/UARK-AICV/AISFormer\n","authors":["Minh Tran","Khoa Vo","Kashu Yamazaki","Arthur Fernandes","Michael Kidd","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2210.06323v4.pdf","comment":"Accepted to BMVC2022"},{"id":"http://arxiv.org/abs/2403.11364v1","updated":"2024-03-17T22:49:07Z","published":"2024-03-17T22:49:07Z","title":"Creating Seamless 3D Maps Using Radiance Fields","summary":" It is desirable to create 3D object models and 3D maps from 2D input images\nfor applications such as navigation, virtual tourism, and urban planning. The\ntraditional methods of creating 3D maps, (such as photogrammetry), require a\nlarge number of images and odometry. Additionally, traditional methods have\ndifficulty with reflective surfaces and specular reflections; windows and\nchrome in the scene can be problematic. Google Road View is a familiar\napplication, which uses traditional methods to fuse a collection of 2D input\nimages into the illusion of a 3D map. However, Google Road View does not create\nan actual 3D object model, only a collection of views. The objective of this\nwork is to create an actual 3D object model using updated techniques. Neural\nRadiance Fields (NeRF[1]) has emerged as a potential solution, offering the\ncapability to produce more precise and intricate 3D maps. Gaussian Splatting[4]\nis another contemporary technique. This investigation compares Neural Radiance\nFields to Gaussian Splatting, and describes some of their inner workings. Our\nprimary contribution is a method for improving the results of the 3D\nreconstructed models. Our results indicate that Gaussian Splatting was superior\nto the NeRF technique.\n","authors":["Sai Tarun Sathyan","Thomas B. Kinsman"],"pdf_url":"https://arxiv.org/pdf/2403.11364v1.pdf","comment":"10 pages with figures"},{"id":"http://arxiv.org/abs/2403.11340v1","updated":"2024-03-17T20:47:52Z","published":"2024-03-17T20:47:52Z","title":"StainDiffuser: MultiTask Dual Diffusion Model for Virtual Staining","summary":" Hematoxylin and Eosin (H&E) staining is the most commonly used for disease\ndiagnosis and tumor recurrence tracking. Hematoxylin excels at highlighting\nnuclei, whereas eosin stains the cytoplasm. However, H&E stain lacks details\nfor differentiating different types of cells relevant to identifying the grade\nof the disease or response to specific treatment variations. Pathologists\nrequire special immunohistochemical (IHC) stains that highlight different cell\ntypes. These stains help in accurately identifying different regions of disease\ngrowth and their interactions with the cell's microenvironment. 
The advent of\ndeep learning models has made Image-to-Image (I2I) translation a key research\narea, reducing the need for expensive physical staining processes. Pix2Pix and\nCycleGAN are still the most commonly used methods for virtual staining\napplications. However, both suffer from hallucinations or staining\nirregularities when H&E stain has less discriminate information about the\nunderlying cells IHC needs to highlight (e.g.,CD3 lymphocytes). Diffusion\nmodels are currently the state-of-the-art models for image generation and\nconditional generation tasks. However, they require extensive and diverse\ndatasets (millions of samples) to converge, which is less feasible for virtual\nstaining applications.Inspired by the success of multitask deep learning models\nfor limited dataset size, we propose StainDiffuser, a novel multitask dual\ndiffusion architecture for virtual staining that converges under a limited\ntraining budget. StainDiffuser trains two diffusion processes simultaneously:\n(a) generation of cell-specific IHC stain from H&E and (b) H&E-based cell\nsegmentation using coarse segmentation only during training. Our results show\nthat StainDiffuser produces high-quality results for easier (CK8/18,epithelial\nmarker) and difficult stains(CD3, Lymphocytes).\n","authors":["Tushar Kataria","Beatrice Knudsen","Shireen Y. Elhabian"],"pdf_url":"https://arxiv.org/pdf/2403.11340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11338v1","updated":"2024-03-17T20:44:38Z","published":"2024-03-17T20:44:38Z","title":"Ensembling and Test Augmentation for Covid-19 Detection and Covid-19\n Domain Adaptation from 3D CT-Scans","summary":" Since the emergence of Covid-19 in late 2019, medical image analysis using\nartificial intelligence (AI) has emerged as a crucial research area,\nparticularly with the utility of CT-scan imaging for disease diagnosis. This\npaper contributes to the 4th COV19D competition, focusing on Covid-19 Detection\nand Covid-19 Domain Adaptation Challenges. Our approach centers on lung\nsegmentation and Covid-19 infection segmentation employing the recent CNN-based\nsegmentation architecture PDAtt-Unet, which simultaneously segments lung\nregions and infections. Departing from traditional methods, we concatenate the\ninput slice (grayscale) with segmented lung and infection, generating three\ninput channels akin to color channels. Additionally, we employ three 3D CNN\nbackbones Customized Hybrid-DeCoVNet, along with pretrained 3D-Resnet-18 and\n3D-Resnet-50 models to train Covid-19 recognition for both challenges.\nFurthermore, we explore ensemble approaches and testing augmentation to enhance\nperformance. Comparison with baseline results underscores the substantial\nefficiency of our approach, with a significant margin in terms of F1-score (14\n%). 
This study advances the field by presenting a comprehensive methodology for\naccurate Covid-19 detection and adaptation, leveraging cutting-edge AI\ntechniques in medical image analysis.\n","authors":["Fares Bougourzi","Feryal Windal Moula","Halim Benhabiles","Fadi Dornaika","Abdelmalik Taleb-Ahmed"],"pdf_url":"https://arxiv.org/pdf/2403.11338v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11337v1","updated":"2024-03-17T20:36:43Z","published":"2024-03-17T20:36:43Z","title":"Enhancing Bandwidth Efficiency for Video Motion Transfer Applications\n using Deep Learning Based Keypoint Prediction","summary":" We propose a deep learning based novel prediction framework for enhanced\nbandwidth reduction in motion transfer enabled video applications such as video\nconferencing, virtual reality gaming and privacy preservation for patient\nhealth monitoring. To model complex motion, we use the First Order Motion Model\n(FOMM) that represents dynamic objects using learned keypoints along with their\nlocal affine transformations. Keypoints are extracted by a self-supervised\nkeypoint detector and organized in a time series corresponding to the video\nframes. Prediction of keypoints, to enable transmission using lower frames per\nsecond on the source device, is performed using a Variational Recurrent Neural\nNetwork (VRNN). The predicted keypoints are then synthesized to video frames\nusing an optical flow estimator and a generator network. This efficacy of\nleveraging keypoint based representations in conjunction with VRNN based\nprediction for both video animation and reconstruction is demonstrated on three\ndiverse datasets. For real-time applications, our results show the\neffectiveness of our proposed architecture by enabling up to 2x additional\nbandwidth reduction over existing keypoint based video motion transfer\nframeworks without significantly compromising video quality.\n","authors":["Xue Bai","Tasmiah Haque","Sumit Mohan","Yuliang Cai","Byungheon Jeong","Adam Halasz","Srinjoy Das"],"pdf_url":"https://arxiv.org/pdf/2403.11337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15851v2","updated":"2024-03-17T20:26:48Z","published":"2023-11-27T14:17:41Z","title":"Single-Model and Any-Modality for Video Object Tracking","summary":" In the realm of video object tracking, auxiliary modalities such as depth,\nthermal, or event data have emerged as valuable assets to complement the RGB\ntrackers. In practice, most existing RGB trackers learn a single set of\nparameters to use them across datasets and applications. However, a similar\nsingle-model unification for multi-modality tracking presents several\nchallenges. These challenges stem from the inherent heterogeneity of inputs --\neach with modality-specific representations, the scarcity of multi-modal\ndatasets, and the absence of all the modalities at all times. In this work, we\nintroduce Un-Track, a Unified Tracker of a single set of parameters for any\nmodality. To handle any modality, our method learns their common latent space\nthrough low-rank factorization and reconstruction techniques. More importantly,\nwe use only the RGB-X pairs to learn the common latent space. This unique\nshared representation seamlessly binds all modalities together, enabling\neffective unification and accommodating any missing modality, all within a\nsingle transformer-based architecture. 
Our Un-Track achieves +8.1 absolute\nF-score gain, on the DepthTrack dataset, by introducing only +2.14 (over 21.50)\nGFLOPs with +6.6M (over 93M) parameters, through a simple yet efficient\nprompting strategy. Extensive comparisons on five benchmark datasets with\ndifferent modalities show that Un-Track surpasses both SOTA unified trackers\nand modality-specific counterparts, validating our effectiveness and\npracticality. The source code is publicly available at\nhttps://github.com/Zongwei97/UnTrack.\n","authors":["Zongwei Wu","Jilai Zheng","Xiangxuan Ren","Florin-Alexandru Vasluianu","Chao Ma","Danda Pani Paudel","Luc Van Gool","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2311.15851v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2312.06630v3","updated":"2024-03-17T20:15:45Z","published":"2023-12-11T18:50:09Z","title":"TMT-VIS: Taxonomy-aware Multi-dataset Joint Training for Video Instance\n Segmentation","summary":" Training on large-scale datasets can boost the performance of video instance\nsegmentation while the annotated datasets for VIS are hard to scale up due to\nthe high labor cost. What we possess are numerous isolated filed-specific\ndatasets, thus, it is appealing to jointly train models across the aggregation\nof datasets to enhance data volume and diversity. However, due to the\nheterogeneity in category space, as mask precision increases with the data\nvolume, simply utilizing multiple datasets will dilute the attention of models\non different taxonomies. Thus, increasing the data scale and enriching taxonomy\nspace while improving classification precision is important. In this work, we\nanalyze that providing extra taxonomy information can help models concentrate\non specific taxonomy, and propose our model named Taxonomy-aware Multi-dataset\nJoint Training for Video Instance Segmentation (TMT-VIS) to address this vital\nchallenge. Specifically, we design a two-stage taxonomy aggregation module that\nfirst compiles taxonomy information from input videos and then aggregates these\ntaxonomy priors into instance queries before the transformer decoder. We\nconduct extensive experimental evaluations on four popular and challenging\nbenchmarks, including YouTube-VIS 2019, YouTube-VIS 2021, OVIS, and UVO. Our\nmodel shows significant improvement over the baseline solutions, and sets new\nstate-of-the-art records on all benchmarks. These appealing and encouraging\nresults demonstrate the effectiveness and generality of our approach. The code\nis available at https://github.com/rkzheng99/TMT-VIS .\n","authors":["Rongkun Zheng","Lu Qi","Xi Chen","Yi Wang","Kun Wang","Yu Qiao","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.06630v3.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2403.11328v1","updated":"2024-03-17T20:14:57Z","published":"2024-03-17T20:14:57Z","title":"Domain-Guided Masked Autoencoders for Unique Player Identification","summary":" Unique player identification is a fundamental module in vision-driven sports\nanalytics. Identifying players from broadcast videos can aid with various\ndownstream tasks such as player assessment, in-game analysis, and broadcast\nproduction. However, automatic detection of jersey numbers using deep features\nis challenging primarily due to: a) motion blur, b) low resolution video feed,\nand c) occlusions. With their recent success in various vision tasks, masked\nautoencoders (MAEs) have emerged as a superior alternative to conventional\nfeature extractors. 
However, most MAEs simply zero-out image patches either\nrandomly or focus on where to mask rather than how to mask. Motivated by human\nvision, we devise a novel domain-guided masking policy for MAEs termed d-MAE to\nfacilitate robust feature extraction in the presence of motion blur for player\nidentification. We further introduce a new spatio-temporal network leveraging\nour novel d-MAE for unique player identification. We conduct experiments on\nthree large-scale sports datasets, including a curated baseball dataset, the\nSoccerNet dataset, and an in-house ice hockey dataset. We preprocess the\ndatasets using an upgraded keyframe identification (KfID) module by focusing on\nframes containing jersey numbers. Additionally, we propose a keyframe-fusion\ntechnique to augment keyframes, preserving spatial and temporal context. Our\nspatio-temporal network showcases significant improvements, surpassing the\ncurrent state-of-the-art by 8.58%, 4.29%, and 1.20% in the test set accuracies,\nrespectively. Rigorous ablations highlight the effectiveness of our\ndomain-guided masking approach and the refined KfID module, resulting in\nperformance enhancements of 1.48% and 1.84% respectively, compared to original\narchitectures.\n","authors":["Bavesh Balaji","Jerrin Bright","Sirisha Rambhatla","Yuhao Chen","Alexander Wong","John Zelek","David A Clausi"],"pdf_url":"https://arxiv.org/pdf/2403.11328v1.pdf","comment":"Submitted to 21st International Conference on Robots and Vision\n (CRV'24), Guelph, Ontario, Canada"},{"id":"http://arxiv.org/abs/2403.11324v1","updated":"2024-03-17T20:06:41Z","published":"2024-03-17T20:06:41Z","title":"GeoGaussian: Geometry-aware Gaussian Splatting for Scene Rendering","summary":" During the Gaussian Splatting optimization process, the scene's geometry can\ngradually deteriorate if its structure is not deliberately preserved,\nespecially in non-textured regions such as walls, ceilings, and furniture\nsurfaces. This degradation significantly affects the rendering quality of novel\nviews that deviate significantly from the viewpoints in the training data. To\nmitigate this issue, we propose a novel approach called GeoGaussian. Based on\nthe smoothly connected areas observed from point clouds, this method introduces\na novel pipeline to initialize thin Gaussians aligned with the surfaces, where\nthe characteristic can be transferred to new generations through a carefully\ndesigned densification strategy. Finally, the pipeline ensures that the scene's\ngeometry and texture are maintained through constrained optimization processes\nwith explicit geometry constraints. Benefiting from the proposed architecture,\nthe generative ability of 3D Gaussians is enhanced, especially in structured\nregions. Our proposed pipeline achieves state-of-the-art performance in novel\nview synthesis and geometric reconstruction, as evaluated qualitatively and\nquantitatively on public datasets.\n","authors":["Yanyan Li","Chenyu Lyu","Yan Di","Guangyao Zhai","Gim Hee Lee","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2403.11324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11317v1","updated":"2024-03-17T19:44:05Z","published":"2024-03-17T19:44:05Z","title":"Few-Shot VQA with Frozen LLMs: A Tale of Two Approaches","summary":" Two approaches have emerged to input images into large language models\n(LLMs). The first is to caption images into natural language. The second is to\nmap image feature embeddings into the domain of the LLM and pass the mapped\nembeddings directly to the LLM. 
The majority of recent few-shot multimodal work\nreports performance using architectures that employ variations of one of these\ntwo approaches. But they overlook an important comparison between them. We\ndesign a controlled and focused experiment to compare these two approaches to\nfew-shot visual question answering (VQA) with LLMs. Our findings indicate that\nfor Flan-T5 XL, a 3B parameter LLM, connecting visual embeddings directly to\nthe LLM embedding space does not guarantee improved performance over using\nimage captions. In the zero-shot regime, we find using textual image captions\nis better. In the few-shot regimes, how the in-context examples are selected\ndetermines which is better.\n","authors":["Igor Sterner","Weizhe Lin","Jinghong Chen","Bill Byrne"],"pdf_url":"https://arxiv.org/pdf/2403.11317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11310v1","updated":"2024-03-17T19:10:07Z","published":"2024-03-17T19:10:07Z","title":"A Dual-Augmentor Framework for Domain Generalization in 3D Human Pose\n Estimation","summary":" 3D human pose data collected in controlled laboratory settings present\nchallenges for pose estimators that generalize across diverse scenarios. To\naddress this, domain generalization is employed. Current methodologies in\ndomain generalization for 3D human pose estimation typically utilize\nadversarial training to generate synthetic poses for training. Nonetheless,\nthese approaches exhibit several limitations. First, the lack of prior\ninformation about the target domain complicates the application of suitable\naugmentation through a single pose augmentor, affecting generalization on\ntarget domains. Moreover, adversarial training's discriminator tends to enforce\nsimilarity between source and synthesized poses, impeding the exploration of\nout-of-source distributions. Furthermore, the pose estimator's optimization is\nnot exposed to domain shifts, limiting its overall generalization ability.\n To address these limitations, we propose a novel framework featuring two pose\naugmentors: the weak and the strong augmentors. Our framework employs\ndifferential strategies for generation and discrimination processes,\nfacilitating the preservation of knowledge related to source poses and the\nexploration of out-of-source distributions without prior information about\ntarget poses. Besides, we leverage meta-optimization to simulate domain shifts\nin the optimization process of the pose estimator, thereby improving its\ngeneralization ability. Our proposed approach significantly outperforms\nexisting methods, as demonstrated through comprehensive experiments on various\nbenchmark datasets.\n","authors":["Qucheng Peng","Ce Zheng","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2403.11310v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11299v1","updated":"2024-03-17T18:42:38Z","published":"2024-03-17T18:42:38Z","title":"SQ-LLaVA: Self-Questioning for Large Vision-Language Assistant","summary":" Recent advancements in the vision-language model have shown notable\ngeneralization in vision-language tasks after visual instruction tuning.\nHowever, bridging the gap between the pre-trained vision encoder and the large\nlanguage models becomes the whole network's bottleneck. To improve\ncross-modality alignment, existing works usually consider more visual\ninstruction data covering a broader range of vision tasks to fine-tune the\nmodel for question-answering, which are costly to obtain. 
However, the image\ncontains rich contextual information that has been largely under-explored. This\npaper first attempts to harness this overlooked context within visual\ninstruction data, training the model to self-supervised `learning' how to ask\nhigh-quality questions. In this way, we introduce a novel framework named\nSQ-LLaVA: Self-Questioning for Large Vision-Language Assistant. SQ-LLaVA\nexhibits proficiency in generating flexible and meaningful image-related\nquestions while analyzing the visual clue and prior language knowledge,\nsignifying an advanced level of generalized visual understanding. Moreover,\nfine-tuning SQ-LLaVA on higher-quality instruction data shows a consistent\nperformance improvement compared with traditional visual-instruction tuning\nmethods. This improvement highlights the efficacy of self-questioning\ntechniques in achieving a deeper and more nuanced comprehension of visual\ncontent across various contexts.\n","authors":["Guohao Sun","Can Qin","Jiamian Wang","Zeyuan Chen","Ran Xu","Zhiqiang Tao"],"pdf_url":"https://arxiv.org/pdf/2403.11299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06668v4","updated":"2024-03-17T18:29:53Z","published":"2023-08-13T02:59:36Z","title":"Large Language Models and Foundation Models in Smart Agriculture:\n Basics, Opportunities, and Challenges","summary":" The past decade has witnessed the rapid development and adoption of ML & DL\nmethodologies in agricultural systems, showcased by great successes in\nagricultural applications. However, these conventional ML/DL models have\ncertain limitations: they heavily rely on large, costly-to-acquire labeled\ndatasets for training, require specialized expertise for development and\nmaintenance, and are mostly tailored for specific tasks, thus lacking\ngeneralizability. Recently, large pre-trained models, also known as FMs, have\ndemonstrated remarkable successes in language, vision, and decision-making\ntasks across various domains. These models are trained on a large amount of\ndata from multiple domains and modalities. Once trained, they can accomplish\nversatile tasks with just minor fine-tuning and minimal task-specific labeled\ndata. Despite their proven effectiveness and huge potential, there has been\nlittle exploration of applying FMs to agriculture AI. Thus, this study aims to\nexplore the potential of FMs in the field of smart agriculture. In particular,\nconceptual tools and technical background are presented to help the\nunderstanding of the problem space and uncover new research directions. To this\nend, recent FMs in the general CS domain are reviewed, and the models are\ncategorized into four categories: language FMs, vision FMs, multimodal FMs, and\nreinforcement learning FMs. Then, the steps of developing agriculture FMs\n(AFMs) are outlined and potential applications in smart agriculture are\ndiscussed. Moreover, challenges and risks associated with developing AFMs are\ndiscussed, including model training, validation, and deployment. 
In summary,\nthe advancement of AI in agriculture is explored by introducing AFMs as a\npromising paradigm that can significantly mitigate the reliance on extensive\nlabeled datasets and enhance the efficiency, effectiveness, and generalization\nof agricultural AI systems.\n","authors":["Jiajia Li","Mingle Xu","Lirong Xiang","Dong Chen","Weichao Zhuang","Xunyuan Yin","Zhaojian Li"],"pdf_url":"https://arxiv.org/pdf/2308.06668v4.pdf","comment":"18 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.11295v1","updated":"2024-03-17T18:28:24Z","published":"2024-03-17T18:28:24Z","title":"Order-One Rolling Shutter Cameras","summary":" Rolling shutter (RS) cameras dominate consumer and smartphone markets.\nSeveral methods for computing the absolute pose of RS cameras have appeared in\nthe last 20 years, but the relative pose problem has not been fully solved yet.\nWe provide a unified theory for the important class of order-one rolling\nshutter (RS$_1$) cameras. These cameras generalize the perspective projection\nto RS cameras, projecting a generic space point to exactly one image point via\na rational map. We introduce a new back-projection RS camera model,\ncharacterize RS$_1$ cameras, construct explicit parameterizations of such\ncameras, and determine the image of a space line. We classify all minimal\nproblems for solving the relative camera pose problem with linear RS$_1$\ncameras and discover new practical cases. Finally, we show how the theory can\nbe used to explain RS models previously used for absolute pose computation.\n","authors":["Marvin Anas Hahn","Kathlén Kohn","Orlando Marigliano","Tomas Pajdla"],"pdf_url":"https://arxiv.org/pdf/2403.11295v1.pdf","comment":"36 pages, 6 figures, 3 ancillary files"},{"id":"http://arxiv.org/abs/2403.11291v1","updated":"2024-03-17T18:06:06Z","published":"2024-03-17T18:06:06Z","title":"Advanced Knowledge Extraction of Physical Design Drawings, Translation\n and conversion to CAD formats using Deep Learning","summary":" The maintenance, archiving and usage of the design drawings is cumbersome in\nphysical form in different industries for longer period. It is hard to extract\ninformation by simple scanning of drawing sheets. Converting them to their\ndigital formats such as Computer-Aided Design (CAD), with needed knowledge\nextraction can solve this problem. The conversion of these machine drawings to\nits digital form is a crucial challenge which requires advanced techniques.\nThis research proposes an innovative methodology utilizing Deep Learning\nmethods. The approach employs object detection model, such as Yolov7, Faster\nR-CNN, to detect physical drawing objects present in the images followed by,\nedge detection algorithms such as canny filter to extract and refine the\nidentified lines from the drawing region and curve detection techniques to\ndetect circle. Also ornaments (complex shapes) within the drawings are\nextracted. To ensure comprehensive conversion, an Optical Character Recognition\n(OCR) tool is integrated to identify and extract the text elements from the\ndrawings. The extracted data which includes the lines, shapes and text is\nconsolidated and stored in a structured comma separated values(.csv) file\nformat. The accuracy and the efficiency of conversion is evaluated. Through\nthis, conversion can be automated to help organizations enhance their\nproductivity, facilitate seamless collaborations and preserve valuable design\ninformation in a digital format easily accessible. 
Overall, this study\ncontributes to the advancement of CAD conversions, providing accurate results\nfrom the translating process. Future research can focus on handling diverse\ndrawing types, enhanced accuracy in shape and line detection and extraction.\n","authors":["Jesher Joshua M","Ragav V","Syed Ibrahim S P"],"pdf_url":"https://arxiv.org/pdf/2403.11291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11284v1","updated":"2024-03-17T17:42:02Z","published":"2024-03-17T17:42:02Z","title":"Fast Personalized Text-to-Image Syntheses With Attention Injection","summary":" Currently, personalized image generation methods mostly require considerable\ntime to finetune and often overfit the concept resulting in generated images\nthat are similar to custom concepts but difficult to edit by prompts. We\npropose an effective and fast approach that could balance the text-image\nconsistency and identity consistency of the generated image and reference\nimage. Our method can generate personalized images without any fine-tuning\nwhile maintaining the inherent text-to-image generation ability of diffusion\nmodels. Given a prompt and a reference image, we merge the custom concept into\ngenerated images by manipulating cross-attention and self-attention layers of\nthe original diffusion model to generate personalized images that match the\ntext description. Comprehensive experiments highlight the superiority of our\nmethod.\n","authors":["Yuxuan Zhang","Yiren Song","Jinpeng Yu","Han Pan","Zhongliang Jing"],"pdf_url":"https://arxiv.org/pdf/2403.11284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11273v1","updated":"2024-03-17T17:04:45Z","published":"2024-03-17T17:04:45Z","title":"BrightDreamer: Generic 3D Gaussian Generative Framework for Fast\n Text-to-3D Synthesis","summary":" Text-to-3D synthesis has recently seen intriguing advances by combining the\ntext-to-image models with 3D representation methods, e.g., Gaussian Splatting\n(GS), via Score Distillation Sampling (SDS). However, a hurdle of existing\nmethods is the low efficiency, per-prompt optimization for a single 3D object.\nTherefore, it is imperative for a paradigm shift from per-prompt optimization\nto one-stage generation for any unseen text prompts, which yet remains\nchallenging. A hurdle is how to directly generate a set of millions of 3D\nGaussians to represent a 3D object. This paper presents BrightDreamer, an\nend-to-end single-stage approach that can achieve generalizable and fast (77\nms) text-to-3D generation. Our key idea is to formulate the generation process\nas estimating the 3D deformation from an anchor shape with predefined\npositions. For this, we first propose a Text-guided Shape Deformation (TSD)\nnetwork to predict the deformed shape and its new positions, used as the\ncenters (one attribute) of 3D Gaussians. To estimate the other four attributes\n(i.e., scaling, rotation, opacity, and SH coefficient), we then design a novel\nText-guided Triplane Generator (TTG) to generate a triplane representation for\na 3D object. The center of each Gaussian enables us to transform the triplane\nfeature into the four attributes. The generated 3D Gaussians can be finally\nrendered at 705 frames per second. Extensive experiments demonstrate the\nsuperiority of our method over existing methods. Also, BrightDreamer possesses\na strong semantic understanding capability even for complex text prompts. 
The\nproject code is available at https://vlislab22.github.io/BrightDreamer.\n","authors":["Lutao Jiang","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04692v2","updated":"2024-03-17T16:59:25Z","published":"2024-03-07T17:41:37Z","title":"PixArt-Σ: Weak-to-Strong Training of Diffusion Transformer for 4K\n Text-to-Image Generation","summary":" In this paper, we introduce PixArt-\\Sigma, a Diffusion Transformer\nmodel~(DiT) capable of directly generating images at 4K resolution.\nPixArt-\\Sigma represents a significant advancement over its predecessor,\nPixArt-\\alpha, offering images of markedly higher fidelity and improved\nalignment with text prompts. A key feature of PixArt-\\Sigma is its training\nefficiency. Leveraging the foundational pre-training of PixArt-\\alpha, it\nevolves from the `weaker' baseline to a `stronger' model via incorporating\nhigher quality data, a process we term \"weak-to-strong training\". The\nadvancements in PixArt-\\Sigma are twofold: (1) High-Quality Training Data:\nPixArt-\\Sigma incorporates superior-quality image data, paired with more\nprecise and detailed image captions. (2) Efficient Token Compression: we\npropose a novel attention module within the DiT framework that compresses both\nkeys and values, significantly improving efficiency and facilitating\nultra-high-resolution image generation. Thanks to these improvements,\nPixArt-\\Sigma achieves superior image quality and user prompt adherence\ncapabilities with significantly smaller model size (0.6B parameters) than\nexisting text-to-image diffusion models, such as SDXL (2.6B parameters) and SD\nCascade (5.1B parameters). Moreover, PixArt-\\Sigma's capability to generate 4K\nimages supports the creation of high-resolution posters and wallpapers,\nefficiently bolstering the production of high-quality visual content in\nindustries such as film and gaming.\n","authors":["Junsong Chen","Chongjian Ge","Enze Xie","Yue Wu","Lewei Yao","Xiaozhe Ren","Zhongdao Wang","Ping Luo","Huchuan Lu","Zhenguo Li"],"pdf_url":"https://arxiv.org/pdf/2403.04692v2.pdf","comment":"Project Page: https://pixart-alpha.github.io/PixArt-sigma-project/"},{"id":"http://arxiv.org/abs/2309.10556v2","updated":"2024-03-17T16:55:07Z","published":"2023-09-19T12:05:26Z","title":"Forgedit: Text Guided Image Editing via Learning and Forgetting","summary":" Text-guided image editing on real or synthetic images, given only the\noriginal image itself and the target text prompt as inputs, is a very general\nand challenging task. It requires an editing model to estimate by itself which\npart of the image should be edited, and then perform either rigid or non-rigid\nediting while preserving the characteristics of original image. In this paper,\nwe design a novel text-guided image editing method, named as Forgedit. First,\nwe propose a vision-language joint optimization framework capable of\nreconstructing the original image in 30 seconds, much faster than previous SOTA\nand much less overfitting. Then we propose a novel vector projection mechanism\nin text embedding space of Diffusion Models, which is capable to control the\nidentity similarity and editing strength seperately. Finally, we discovered a\ngeneral property of UNet in Diffusion Models, i.e., Unet encoder learns space\nand structure, Unet decoder learns appearance and identity. 
With such a\nproperty, we design forgetting mechanisms to successfully tackle the fatal and\ninevitable overfitting issues when fine-tuning Diffusion Models on one image,\nthus significantly boosting the editing capability of Diffusion Models. Our\nmethod, Forgedit, built on Stable Diffusion, achieves new state-of-the-art\nresults on the challenging text-guided image editing benchmark: TEdBench,\nsurpassing the previous SOTA methods such as Imagic with Imagen, in terms of\nboth CLIP score and LPIPS score. Codes are available at\nhttps://github.com/witcherofresearch/Forgedit\n","authors":["Shiwen Zhang","Shuai Xiao","Weilin Huang"],"pdf_url":"https://arxiv.org/pdf/2309.10556v2.pdf","comment":"Codes are available at https://github.com/witcherofresearch/Forgedit"},{"id":"http://arxiv.org/abs/2403.11270v1","updated":"2024-03-17T16:48:46Z","published":"2024-03-17T16:48:46Z","title":"Bilateral Propagation Network for Depth Completion","summary":" Depth completion aims to derive a dense depth map from sparse depth\nmeasurements with a synchronized color image. Current state-of-the-art (SOTA)\nmethods are predominantly propagation-based, which work as an iterative\nrefinement on the initial estimated dense depth. However, the initial depth\nestimations mostly result from direct applications of convolutional layers on\nthe sparse depth map. In this paper, we present a Bilateral Propagation Network\n(BP-Net), that propagates depth at the earliest stage to avoid directly\nconvolving on sparse data. Specifically, our approach propagates the target\ndepth from nearby depth measurements via a non-linear model, whose coefficients\nare generated through a multi-layer perceptron conditioned on both\n\\emph{radiometric difference} and \\emph{spatial distance}. By integrating\nbilateral propagation with multi-modal fusion and depth refinement in a\nmulti-scale framework, our BP-Net demonstrates outstanding performance on both\nindoor and outdoor scenes. It achieves SOTA on the NYUv2 dataset and ranks 1st\non the KITTI depth completion benchmark at the time of submission. Experimental\nresults not only show the effectiveness of bilateral propagation but also\nemphasize the significance of early-stage propagation in contrast to the\nrefinement stage. Our code and trained models will be available on the project\npage.\n","authors":["Jie Tang","Fei-Peng Tian","Boshi An","Jian Li","Ping Tan"],"pdf_url":"https://arxiv.org/pdf/2403.11270v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2305.12854v2","updated":"2024-03-17T16:34:31Z","published":"2023-05-22T09:27:17Z","title":"RDA-INR: Riemannian Diffeomorphic Autoencoding via Implicit Neural\n Representations","summary":" Diffeomorphic registration frameworks such as Large Deformation Diffeomorphic\nMetric Mapping (LDDMM) are used in computer graphics and the medical domain for\natlas building, statistical latent modeling, and pairwise and groupwise\nregistration. In recent years, researchers have developed neural network-based\napproaches regarding diffeomorphic registration to improve the accuracy and\ncomputational efficiency of traditional methods. In this work, we focus on a\nlimitation of neural network-based atlas building and statistical latent\nmodeling methods, namely that they either are (i) resolution dependent or (ii)\ndisregard any data/problem-specific geometry needed for proper mean-variance\nanalysis. 
In particular, we overcome this limitation by designing a novel\nencoder based on resolution-independent implicit neural representations. The\nencoder achieves resolution invariance for LDDMM-based statistical latent\nmodeling. Additionally, the encoder adds LDDMM Riemannian geometry to\nresolution-independent deep learning models for statistical latent modeling. We\nshowcase that the Riemannian geometry aspect improves latent modeling and is\nrequired for a proper mean-variance analysis. Furthermore, to showcase the\nbenefit of resolution independence for LDDMM-based data variability modeling,\nwe show that our approach outperforms another neural network-based LDDMM latent\ncode model. Our work paves a way to more research into how Riemannian geometry,\nshape/image analysis, and deep learning can be combined.\n","authors":["Sven Dummer","Nicola Strisciuglio","Christoph Brune"],"pdf_url":"https://arxiv.org/pdf/2305.12854v2.pdf","comment":"34 pages, 27 figures (including subfigures)"},{"id":"http://arxiv.org/abs/2403.11263v1","updated":"2024-03-17T16:25:25Z","published":"2024-03-17T16:25:25Z","title":"Stylized Face Sketch Extraction via Generative Prior with Limited Data","summary":" Facial sketches are both a concise way of showing the identity of a person\nand a means to express artistic intention. While a few techniques have recently\nemerged that allow sketches to be extracted in different styles, they typically\nrely on a large amount of data that is difficult to obtain. Here, we propose\nStyleSketch, a method for extracting high-resolution stylized sketches from a\nface image. Using the rich semantics of the deep features from a pretrained\nStyleGAN, we are able to train a sketch generator with 16 pairs of face and the\ncorresponding sketch images. The sketch generator utilizes part-based losses\nwith two-stage learning for fast convergence during training for high-quality\nsketch extraction. Through a set of comparisons, we show that StyleSketch\noutperforms existing state-of-the-art sketch extraction methods and few-shot\nimage adaptation methods for the task of extracting high-resolution abstract\nface sketches. We further demonstrate the versatility of StyleSketch by\nextending its use to other domains and explore the possibility of semantic\nediting. The project page can be found in\nhttps://kwanyun.github.io/stylesketch_project.\n","authors":["Kwan Yun","Kwanggyoon Seo","Chang Wook Seo","Soyeon Yoon","Seongcheol Kim","Soohyun Ji","Amirsaman Ashtari","Junyong Noh"],"pdf_url":"https://arxiv.org/pdf/2403.11263v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2403.11256v1","updated":"2024-03-17T16:19:40Z","published":"2024-03-17T16:19:40Z","title":"Uncertainty-Aware Pseudo-Label Filtering for Source-Free Unsupervised\n Domain Adaptation","summary":" Source-free unsupervised domain adaptation (SFUDA) aims to enable the\nutilization of a pre-trained source model in an unlabeled target domain without\naccess to source data. Self-training is a way to solve SFUDA, where confident\ntarget samples are iteratively selected as pseudo-labeled samples to guide\ntarget model learning. However, prior heuristic noisy pseudo-label filtering\nmethods all involve introducing extra models, which are sensitive to model\nassumptions and may introduce additional errors or mislabeling. In this work,\nwe propose a method called Uncertainty-aware Pseudo-label-filtering Adaptation\n(UPA) to efficiently address this issue in a coarse-to-fine manner. 
Specially,\nwe first introduce a sample selection module named Adaptive Pseudo-label\nSelection (APS), which is responsible for filtering noisy pseudo labels. The\nAPS utilizes a simple sample uncertainty estimation method by aggregating\nknowledge from neighboring samples and confident samples are selected as clean\npseudo-labeled. Additionally, we incorporate Class-Aware Contrastive Learning\n(CACL) to mitigate the memorization of pseudo-label noise by learning robust\npair-wise representation supervised by pseudo labels. Through extensive\nexperiments conducted on three widely used benchmarks, we demonstrate that our\nproposed method achieves competitive performance on par with state-of-the-art\nSFUDA methods. Code is available at https://github.com/chenxi52/UPA.\n","authors":["Xi Chen","Haosen Yang","Huicong Zhang","Hongxun Yao","Xiatian Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.11256v1.pdf","comment":"Neurocomputing 2024"},{"id":"http://arxiv.org/abs/2403.07773v3","updated":"2024-03-17T16:18:14Z","published":"2024-03-12T15:59:08Z","title":"SemCity: Semantic Scene Generation with Triplane Diffusion","summary":" We present \"SemCity,\" a 3D diffusion model for semantic scene generation in\nreal-world outdoor environments. Most 3D diffusion models focus on generating a\nsingle object, synthetic indoor scenes, or synthetic outdoor scenes, while the\ngeneration of real-world outdoor scenes is rarely addressed. In this paper, we\nconcentrate on generating a real-outdoor scene through learning a diffusion\nmodel on a real-world outdoor dataset. In contrast to synthetic data,\nreal-outdoor datasets often contain more empty spaces due to sensor\nlimitations, causing challenges in learning real-outdoor distributions. To\naddress this issue, we exploit a triplane representation as a proxy form of\nscene distributions to be learned by our diffusion model. Furthermore, we\npropose a triplane manipulation that integrates seamlessly with our triplane\ndiffusion model. The manipulation improves our diffusion model's applicability\nin a variety of downstream tasks related to outdoor scene generation such as\nscene inpainting, scene outpainting, and semantic scene completion refinements.\nIn experimental results, we demonstrate that our triplane diffusion model shows\nmeaningful generation results compared with existing work in a real-outdoor\ndataset, SemanticKITTI. We also show our triplane manipulation facilitates\nseamlessly adding, removing, or modifying objects within a scene. Further, it\nalso enables the expansion of scenes toward a city-level scale. Finally, we\nevaluate our method on semantic scene completion refinements where our\ndiffusion model enhances predictions of semantic scene completion networks by\nlearning scene distribution. Our code is available at\nhttps://github.com/zoomin-lee/SemCity.\n","authors":["Jumin Lee","Sebin Lee","Changho Jo","Woobin Im","Juhyeong Seon","Sung-Eui Yoon"],"pdf_url":"https://arxiv.org/pdf/2403.07773v3.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2311.06572v2","updated":"2024-03-17T15:58:32Z","published":"2023-11-11T13:52:59Z","title":"Swin UNETR++: Advancing Transformer-Based Dense Dose Prediction Towards\n Fully Automated Radiation Oncology Treatments","summary":" The field of Radiation Oncology is uniquely positioned to benefit from the\nuse of artificial intelligence to fully automate the creation of radiation\ntreatment plans for cancer therapy. 
This time-consuming and specialized task\ncombines patient imaging with organ and tumor segmentation to generate a 3D\nradiation dose distribution to meet clinical treatment goals, similar to\nvoxel-level dense prediction. In this work, we propose Swin UNETR++, that\ncontains a lightweight 3D Dual Cross-Attention (DCA) module to capture the\nintra and inter-volume relationships of each patient's unique anatomy, which\nfully convolutional neural networks lack. Our model was trained, validated, and\ntested on the Open Knowledge-Based Planning dataset. In addition to metrics of\nDose Score $\\overline{S_{\\text{Dose}}}$ and DVH Score\n$\\overline{S_{\\text{DVH}}}$ that quantitatively measure the difference between\nthe predicted and ground-truth 3D radiation dose distribution, we propose the\nqualitative metrics of average volume-wise acceptance rate\n$\\overline{R_{\\text{VA}}}$ and average patient-wise clinical acceptance rate\n$\\overline{R_{\\text{PA}}}$ to assess the clinical reliability of the\npredictions. Swin UNETR++ demonstrates near-state-of-the-art performance on\nvalidation and test dataset (validation: $\\overline{S_{\\text{DVH}}}$=1.492 Gy,\n$\\overline{S_{\\text{Dose}}}$=2.649 Gy, $\\overline{R_{\\text{VA}}}$=88.58%,\n$\\overline{R_{\\text{PA}}}$=100.0%; test: $\\overline{S_{\\text{DVH}}}$=1.634 Gy,\n$\\overline{S_{\\text{Dose}}}$=2.757 Gy, $\\overline{R_{\\text{VA}}}$=90.50%,\n$\\overline{R_{\\text{PA}}}$=98.0%), establishing a basis for future studies to\ntranslate 3D dose predictions into a deliverable treatment plan, facilitating\nfull automation.\n","authors":["Kuancheng Wang","Hai Siong Tan","Rafe Mcbeth"],"pdf_url":"https://arxiv.org/pdf/2311.06572v2.pdf","comment":"Extended Abstract presented at Machine Learning for Health (ML4H)\n symposium 2023, December 10th, 2023, New Orleans, United States, 16 pages"},{"id":"http://arxiv.org/abs/2302.01089v3","updated":"2024-03-17T15:53:18Z","published":"2023-02-02T13:22:18Z","title":"Curriculum Learning for ab initio Deep Learned Refractive Optics","summary":" Deep optical optimization has recently emerged as a new paradigm for\ndesigning computational imaging systems using only the output image as the\nobjective. However, it has been limited to either simple optical systems\nconsisting of a single element such as a diffractive optical element (DOE) or\nmetalens, or the fine-tuning of compound lenses from good initial designs. Here\nwe present a DeepLens design method based on curriculum learning, which is able\nto learn optical designs of compound lenses ab initio from randomly initialized\nsurfaces without human intervention, therefore overcoming the need for a good\ninitial design. 
We demonstrate the effectiveness of our approach by fully\nautomatically designing both classical imaging lenses and a large field-of-view\nextended depth-of-field computational lens in a cellphone-style form factor,\nwith highly aspheric surfaces and a short back focal length.\n","authors":["Xinge Yang","Qiang Fu","Wolfgang Heidrich"],"pdf_url":"https://arxiv.org/pdf/2302.01089v3.pdf","comment":"Automatically design computational lenses from scratch with\n differentiable ray tracing"},{"id":"http://arxiv.org/abs/2403.11251v1","updated":"2024-03-17T15:51:21Z","published":"2024-03-17T15:51:21Z","title":"NeoNeXt: Novel neural network operator and architecture based on the\n patch-wise matrix multiplications","summary":" Most of the computer vision architectures nowadays are built upon the\nwell-known foundation operations: fully-connected layers, convolutions and\nmulti-head self-attention blocks. In this paper we propose a novel foundation\noperation - NeoCell - which learns matrix patterns and performs patchwise\nmatrix multiplications with the input data. The main advantages of the proposed\noperator are (1) simple implementation without need in operations like im2col,\n(2) low computational complexity (especially for large matrices) and (3) simple\nand flexible implementation of up-/down-sampling. We validate NeoNeXt family of\nmodels based on this operation on ImageNet-1K classification task and show that\nthey achieve competitive quality.\n","authors":["Vladimir Korviakov","Denis Koposov"],"pdf_url":"https://arxiv.org/pdf/2403.11251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11249v1","updated":"2024-03-17T15:47:54Z","published":"2024-03-17T15:47:54Z","title":"YOLOv9 for Fracture Detection in Pediatric Wrist Trauma X-ray Images","summary":" The introduction of YOLOv9, the latest version of the You Only Look Once\n(YOLO) series, has led to its widespread adoption across various scenarios.\nThis paper is the first to apply the YOLOv9 algorithm model to the fracture\ndetection task as computer-assisted diagnosis (CAD) to help radiologists and\nsurgeons to interpret X-ray images. Specifically, this paper trained the model\non the GRAZPEDWRI-DX dataset and extended the training set using data\naugmentation techniques to improve the model performance. Experimental results\ndemonstrate that compared to the mAP 50-95 of the current state-of-the-art\n(SOTA) model, the YOLOv9 model increased the value from 42.16% to 43.73%, with\nan improvement of 3.7%. The implementation code is publicly available at\nhttps://github.com/RuiyangJu/YOLOv9-Fracture-Detection.\n","authors":["Chun-Tse Chien","Rui-Yang Ju","Kuang-Yi Chou","Jen-Shiun Chiang"],"pdf_url":"https://arxiv.org/pdf/2403.11249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11247v1","updated":"2024-03-17T15:41:35Z","published":"2024-03-17T15:41:35Z","title":"Compact 3D Gaussian Splatting For Dense Visual SLAM","summary":" Recent work has shown that 3D Gaussian-based SLAM enables high-quality\nreconstruction, accurate pose estimation, and real-time rendering of scenes.\nHowever, these approaches are built on a tremendous number of redundant 3D\nGaussian ellipsoids, leading to high memory and storage costs, and slow\ntraining speed. To address the limitation, we propose a compact 3D Gaussian\nSplatting SLAM system that reduces the number and the parameter size of\nGaussian ellipsoids. A sliding window-based masking strategy is first proposed\nto reduce the redundant ellipsoids. 
Then we observe that the covariance matrix\n(geometry) of most 3D Gaussian ellipsoids are extremely similar, which\nmotivates a novel geometry codebook to compress 3D Gaussian geometric\nattributes, i.e., the parameters. Robust and accurate pose estimation is\nachieved by a global bundle adjustment method with reprojection loss. Extensive\nexperiments demonstrate that our method achieves faster training and rendering\nspeed while maintaining the state-of-the-art (SOTA) quality of the scene\nrepresentation.\n","authors":["Tianchen Deng","Yaohui Chen","Leyan Zhang","Jianfei Yang","Shenghai Yuan","Danwei Wang","Weidong Chen"],"pdf_url":"https://arxiv.org/pdf/2403.11247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11237v1","updated":"2024-03-17T14:52:05Z","published":"2024-03-17T14:52:05Z","title":"FORCE: Dataset and Method for Intuitive Physics Guided Human-object\n Interaction","summary":" Interactions between human and objects are influenced not only by the\nobject's pose and shape, but also by physical attributes such as object mass\nand surface friction. They introduce important motion nuances that are\nessential for diversity and realism. Despite advancements in recent\nkinematics-based methods, this aspect has been overlooked. Generating nuanced\nhuman motion presents two challenges. First, it is non-trivial to learn from\nmulti-modal human and object information derived from both the physical and\nnon-physical attributes. Second, there exists no dataset capturing nuanced\nhuman interactions with objects of varying physical properties, hampering model\ndevelopment. This work addresses the gap by introducing the FORCE model, a\nkinematic approach for synthesizing diverse, nuanced human-object interactions\nby modeling physical attributes. Our key insight is that human motion is\ndictated by the interrelation between the force exerted by the human and the\nperceived resistance. Guided by a novel intuitive physics encoding, the model\ncaptures the interplay between human force and resistance. Experiments also\ndemonstrate incorporating human force facilitates learning multi-class motion.\nAccompanying our model, we contribute the FORCE dataset. It features diverse,\ndifferent-styled motion through interactions with varying resistances.\n","authors":["Xiaohan Zhang","Bharat Lal Bhatnagar","Sebastian Starke","Ilya Petrov","Vladimir Guzov","Helisa Dhamo","Eduardo Pérez-Pellitero","Gerard Pons-Moll"],"pdf_url":"https://arxiv.org/pdf/2403.11237v1.pdf","comment":"24 pages, 9 figures"},{"id":"http://arxiv.org/abs/2108.02759v2","updated":"2024-03-17T14:46:31Z","published":"2021-08-05T17:51:32Z","title":"Unifying Global-Local Representations in Salient Object Detection with\n Transformer","summary":" The fully convolutional network (FCN) has dominated salient object detection\nfor a long period. However, the locality of CNN requires the model deep enough\nto have a global receptive field and such a deep model always leads to the loss\nof local details. In this paper, we introduce a new attention-based encoder,\nvision transformer, into salient object detection to ensure the globalization\nof the representations from shallow to deep layers. With the global view in\nvery shallow layers, the transformer encoder preserves more local\nrepresentations to recover the spatial details in final saliency maps. 
Besides,\nas each layer can capture a global view of its previous layer, adjacent layers\ncan implicitly maximize the representation differences and minimize the\nredundant features, making that every output feature of transformer layers\ncontributes uniquely for final prediction. To decode features from the\ntransformer, we propose a simple yet effective deeply-transformed decoder. The\ndecoder densely decodes and upsamples the transformer features, generating the\nfinal saliency map with less noise injection. Experimental results demonstrate\nthat our method significantly outperforms other FCN-based and transformer-based\nmethods in five benchmarks by a large margin, with an average of 12.17%\nimprovement in terms of Mean Absolute Error (MAE). Code will be available at\nhttps://github.com/OliverRensu/GLSTR.\n","authors":["Sucheng Ren","Qiang Wen","Nanxuan Zhao","Guoqiang Han","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2108.02759v2.pdf","comment":"accepted by IEEE TETCI"},{"id":"http://arxiv.org/abs/2403.11234v1","updated":"2024-03-17T14:43:47Z","published":"2024-03-17T14:43:47Z","title":"Universal Semi-Supervised Domain Adaptation by Mitigating Common-Class\n Bias","summary":" Domain adaptation is a critical task in machine learning that aims to improve\nmodel performance on a target domain by leveraging knowledge from a related\nsource domain. In this work, we introduce Universal Semi-Supervised Domain\nAdaptation (UniSSDA), a practical yet challenging setting where the target\ndomain is partially labeled, and the source and target label space may not\nstrictly match. UniSSDA is at the intersection of Universal Domain Adaptation\n(UniDA) and Semi-Supervised Domain Adaptation (SSDA): the UniDA setting does\nnot allow for fine-grained categorization of target private classes not\nrepresented in the source domain, while SSDA focuses on the restricted\nclosed-set setting where source and target label spaces match exactly. Existing\nUniDA and SSDA methods are susceptible to common-class bias in UniSSDA\nsettings, where models overfit to data distributions of classes common to both\ndomains at the expense of private classes. We propose a new prior-guided\npseudo-label refinement strategy to reduce the reinforcement of common-class\nbias due to pseudo-labeling, a common label propagation strategy in domain\nadaptation. We demonstrate the effectiveness of the proposed strategy on\nbenchmark datasets Office-Home, DomainNet, and VisDA. The proposed strategy\nattains the best performance across UniSSDA adaptation settings and establishes\na new baseline for UniSSDA.\n","authors":["Wenyu Zhang","Qingmu Liu","Felix Ong Wei Cong","Mohamed Ragab","Chuan-Sheng Foo"],"pdf_url":"https://arxiv.org/pdf/2403.11234v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11233v1","updated":"2024-03-17T14:42:05Z","published":"2024-03-17T14:42:05Z","title":"STAIR: Semantic-Targeted Active Implicit Reconstruction","summary":" Many autonomous robotic applications require object-level understanding when\ndeployed. Actively reconstructing objects of interest, i.e. objects with\nspecific semantic meanings, is therefore relevant for a robot to perform\ndownstream tasks in an initially unknown environment. In this work, we propose\na novel framework for semantic-targeted active reconstruction using posed RGB-D\nmeasurements and 2D semantic labels as input. 
The key components of our\nframework are a semantic implicit neural representation and a compatible\nplanning utility function based on semantic rendering and uncertainty\nestimation, enabling adaptive view planning to target objects of interest. Our\nplanning approach achieves better reconstruction performance in terms of mesh\nand novel view rendering quality compared to implicit reconstruction baselines\nthat do not consider semantics for view planning. Our framework further\noutperforms a state-of-the-art semantic-targeted active reconstruction pipeline\nbased on explicit maps, justifying our choice of utilising implicit neural\nrepresentations to tackle semantic-targeted active reconstruction problems.\n","authors":["Liren Jin","Haofei Kuang","Yue Pan","Cyrill Stachniss","Marija Popović"],"pdf_url":"https://arxiv.org/pdf/2403.11233v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11230v1","updated":"2024-03-17T14:34:51Z","published":"2024-03-17T14:34:51Z","title":"Simple 2D Convolutional Neural Network-based Approach for COVID-19\n Detection","summary":" This study explores the use of deep learning techniques for analyzing lung\nComputed Tomography (CT) images. Classic deep learning approaches face\nchallenges with varying slice counts and resolutions in CT images, a diversity\narising from the utilization of assorted scanning equipment. Typically,\npredictions are made on single slices which are then combined for a\ncomprehensive outcome. Yet, this method does not incorporate learning features\nspecific to each slice, leading to a compromise in effectiveness. To address\nthese challenges, we propose an advanced Spatial-Slice Feature Learning\n(SSFL++) framework specifically tailored for CT scans. It aims to filter out\nout-of-distribution (OOD) data within the entire CT scan, allowing us to select\nessential spatial-slice features for analysis by reducing data redundancy by\n70\\%. Additionally, we introduce a Kernel-Density-based slice Sampling (KDS)\nmethod to enhance stability during training and inference phases, thereby\naccelerating convergence and enhancing overall performance. Remarkably, our\nexperiments reveal that our model achieves promising results with a simple\nEfficientNet-2D (E2D) model. The effectiveness of our approach is confirmed on\nthe COVID-19-CT-DB datasets provided by the DEF-AI-MIA workshop.\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Yang Fan Chiang","Yi-Shiuan Chou","Chih-Yu Jiang","Shen-Chieh Tai","Chi-Han Tsai"],"pdf_url":"https://arxiv.org/pdf/2403.11230v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11229v1","updated":"2024-03-17T14:30:56Z","published":"2024-03-17T14:30:56Z","title":"Concatenate, Fine-tuning, Re-training: A SAM-enabled Framework for\n Semi-supervised 3D Medical Image Segmentation","summary":" Segment Anything Model (SAM) fine-tuning has shown remarkable performance in\nmedical image segmentation in a fully supervised manner, but requires precise\nannotations. To reduce the annotation cost and maintain satisfactory\nperformance, in this work, we leverage the capabilities of SAM for establishing\nsemi-supervised medical image segmentation models. Rethinking the requirements\nof effectiveness, efficiency, and compatibility, we propose a three-stage\nframework, i.e., Concatenate, Fine-tuning, and Re-training (CFR). The current\nfine-tuning approaches mostly involve 2D slice-wise fine-tuning that disregards\nthe contextual information between adjacent slices. 
Our concatenation strategy\nmitigates the mismatch between natural and 3D medical images. The concatenated\nimages are then used for fine-tuning SAM, providing robust initialization\npseudo-labels. Afterwards, we train a 3D semi-supervised segmentation model\nwhile maintaining the same parameter size as the conventional segmenter such as\nV-Net. Our CFR framework is plug-and-play, and easily compatible with various\npopular semi-supervised methods. Extensive experiments validate that our CFR\nachieves significant improvements in both moderate annotation and scarce\nannotation across four datasets. In particular, CFR framework improves the Dice\nscore of Mean Teacher from 29.68% to 74.40% with only one labeled data of LA\ndataset.\n","authors":["Shumeng Li","Lei Qi","Qian Yu","Jing Huo","Yinghuan Shi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2403.11229v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09257v2","updated":"2024-03-17T14:14:28Z","published":"2024-03-14T10:30:43Z","title":"WSI-SAM: Multi-resolution Segment Anything Model (SAM) for\n histopathology whole-slide images","summary":" The Segment Anything Model (SAM) marks a significant advancement in\nsegmentation models, offering robust zero-shot abilities and dynamic prompting.\nHowever, existing medical SAMs are not suitable for the multi-scale nature of\nwhole-slide images (WSIs), restricting their effectiveness. To resolve this\ndrawback, we present WSI-SAM, enhancing SAM with precise object segmentation\ncapabilities for histopathology images using multi-resolution patches, while\npreserving its efficient, prompt-driven design, and zero-shot abilities. To\nfully exploit pretrained knowledge while minimizing training overhead, we keep\nSAM frozen, introducing only minimal extra parameters and computational\noverhead. In particular, we introduce High-Resolution (HR) token,\nLow-Resolution (LR) token and dual mask decoder. This decoder integrates the\noriginal SAM mask decoder with a lightweight fusion module that integrates\nfeatures at multiple scales. Instead of predicting a mask independently, we\nintegrate HR and LR token at intermediate layer to jointly learn features of\nthe same object across multiple resolutions. Experiments show that our WSI-SAM\noutperforms state-of-the-art SAM and its variants. In particular, our model\noutperforms SAM by 4.1 and 2.5 percent points on a ductal carcinoma in situ\n(DCIS) segmentation tasks and breast cancer metastasis segmentation task\n(CAMELYON16 dataset). The code will be available at\nhttps://github.com/HongLiuuuuu/WSI-SAM.\n","authors":["Hong Liu","Haosen Yang","Paul J. van Diest","Josien P. W. Pluim","Mitko Veta"],"pdf_url":"https://arxiv.org/pdf/2403.09257v2.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.06378v3","updated":"2024-03-17T14:10:15Z","published":"2023-08-11T20:32:39Z","title":"DCNFIS: Deep Convolutional Neuro-Fuzzy Inference System","summary":" A key challenge in eXplainable Artificial Intelligence is the well-known\ntradeoff between the transparency of an algorithm (i.e., how easily a human can\ndirectly understand the algorithm, as opposed to receiving a post-hoc\nexplanation), and its accuracy. We report on the design of a new deep network\nthat achieves improved transparency without sacrificing accuracy. 
We design a\ndeep convolutional neuro-fuzzy inference system (DCNFIS) by hybridizing fuzzy\nlogic and deep learning models and show that DCNFIS performs as accurately as\nexisting convolutional neural networks on four well-known datasets and 3 famous\narchitectures. Our performance comparison with available fuzzy methods show\nthat DCNFIS is now state-of-the-art fuzzy system and outperforms other shallow\nand deep fuzzy methods to the best of our knowledge. At the end, we exploit the\ntransparency of fuzzy logic by deriving explanations, in the form of saliency\nmaps, from the fuzzy rules encoded in the network to take benefit of fuzzy\nlogic upon regular deep learning methods. We investigate the properties of\nthese explanations in greater depth using the Fashion-MNIST dataset.\n","authors":["Mojtaba Yeganejou","Kimia Honari","Ryan Kluzinski","Scott Dick","Michael Lipsett","James Miller"],"pdf_url":"https://arxiv.org/pdf/2308.06378v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09034v2","updated":"2024-03-17T13:59:38Z","published":"2024-03-14T02:11:16Z","title":"rFaceNet: An End-to-End Network for Enhanced Physiological Signal\n Extraction through Identity-Specific Facial Contours","summary":" Remote photoplethysmography (rPPG) technique extracts blood volume pulse\n(BVP) signals from subtle pixel changes in video frames. This study introduces\nrFaceNet, an advanced rPPG method that enhances the extraction of facial BVP\nsignals with a focus on facial contours. rFaceNet integrates identity-specific\nfacial contour information and eliminates redundant data. It efficiently\nextracts facial contours from temporally normalized frame inputs through a\nTemporal Compressor Unit (TCU) and steers the model focus to relevant facial\nregions by using the Cross-Task Feature Combiner (CTFC). Through elaborate\ntraining, the quality and interpretability of facial physiological signals\nextracted by rFaceNet are greatly improved compared to previous methods.\nMoreover, our novel approach demonstrates superior performance than SOTA\nmethods in various heart rate estimation benchmarks.\n","authors":["Dali Zhu","Wenli Zhang","Hualin Zeng","Xiaohao Liu","Long Yang","Jiaqi Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.09034v2.pdf","comment":"under-review"},{"id":"http://arxiv.org/abs/2403.11222v1","updated":"2024-03-17T13:51:25Z","published":"2024-03-17T13:51:25Z","title":"SpikeNeRF: Learning Neural Radiance Fields from Continuous Spike Stream","summary":" Spike cameras, leveraging spike-based integration sampling and high temporal\nresolution, offer distinct advantages over standard cameras. However, existing\napproaches reliant on spike cameras often assume optimal illumination, a\ncondition frequently unmet in real-world scenarios. To address this, we\nintroduce SpikeNeRF, the first work that derives a NeRF-based volumetric scene\nrepresentation from spike camera data. Our approach leverages NeRF's multi-view\nconsistency to establish robust self-supervision, effectively eliminating\nerroneous measurements and uncovering coherent structures within exceedingly\nnoisy input amidst diverse real-world illumination scenarios. The framework\ncomprises two core elements: a spike generation model incorporating an\nintegrate-and-fire neuron layer and parameters accounting for non-idealities,\nsuch as threshold variation, and a spike rendering loss capable of generalizing\nacross varying illumination conditions. 
We describe how to effectively optimize\nneural radiance fields to render photorealistic novel views from the novel\ncontinuous spike stream, demonstrating advantages over other vision sensors in\ncertain scenes. Empirical evaluations conducted on both real and novel\nrealistically simulated sequences affirm the efficacy of our methodology. The\ndataset and source code are released at\nhttps://github.com/BIT-Vision/SpikeNeRF.\n","authors":["Lin Zhu","Kangmin Jia","Yifan Zhao","Yunshan Qi","Lizhi Wang","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2403.11222v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.03452v3","updated":"2024-03-17T13:47:00Z","published":"2024-03-06T04:36:43Z","title":"D4C glove-train: solving the RPM and Bongard-logo problem by\n distributing and Circumscribing concepts","summary":" This paper achieves significant progress in the field of abstract reasoning,\nparticularly in addressing Raven's Progressive Matrices (RPM) and Bongard-Logo\nproblems. We propose the D2C approach, which redefines conceptual boundaries in\nthese domains and bridges the gap between high-level concepts and their\nlow-dimensional representations. Based on this, we further introduce the D3C\nmethod that handles Bongard-Logo problems and significantly improves reasoning\naccuracy by estimating the distribution of image representations and measuring\ntheir Sinkhorn distance. To enhance computational efficiency, we introduce the\nD3C-cos variant, which provides an efficient and accurate solution for RPM\nproblems by constraining distribution distances. Additionally, we present\nLico-Net, a network that combines D3C and D3C-cos to achieve state-of-the-art\nperformance in both problem-solving and interpretability. Finally, we extend\nour approach to D4C, employing adversarial strategies to further refine\nconceptual boundaries and demonstrate notable improvements for both RPM and\nBongard-Logo problems. Overall, our contributions offer a new perspective and\npractical solutions to the field of abstract reasoning.\n","authors":["Ruizhuo Song","Beiming Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.03452v3.pdf","comment":"16 pages, 14 figures, 6 tables"},{"id":"http://arxiv.org/abs/2403.11220v1","updated":"2024-03-17T13:43:10Z","published":"2024-03-17T13:43:10Z","title":"CPA-Enhancer: Chain-of-Thought Prompted Adaptive Enhancer for Object\n Detection under Unknown Degradations","summary":" Object detection methods under known single degradations have been\nextensively investigated. However, existing approaches require prior knowledge\nof the degradation type and train a separate model for each, limiting their\npractical applications in unpredictable environments. To address this\nchallenge, we propose a chain-of-thought (CoT) prompted adaptive enhancer,\nCPA-Enhancer, for object detection under unknown degradations. Specifically,\nCPA-Enhancer progressively adapts its enhancement strategy under the\nstep-by-step guidance of CoT prompts, that encode degradation-related\ninformation. To the best of our knowledge, it's the first work that exploits\nCoT prompting for object detection tasks. Overall, CPA-Enhancer is a\nplug-and-play enhancement model that can be integrated into any generic\ndetectors to achieve substantial gains on degraded images, without knowing the\ndegradation type priorly. 
Experimental results demonstrate that CPA-Enhancer\nnot only sets the new state of the art for object detection but also boosts the\nperformance of other downstream vision tasks under unknown degradations.\n","authors":["Yuwei Zhang","Yan Wu","Yanming Liu","Xinyue Peng"],"pdf_url":"https://arxiv.org/pdf/2403.11220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03379v2","updated":"2024-03-17T13:37:37Z","published":"2024-01-07T03:35:04Z","title":"Towards Effective Multiple-in-One Image Restoration: A Sequential and\n Prompt Learning Strategy","summary":" While single task image restoration (IR) has achieved significant successes,\nit remains a challenging issue to train a single model which can tackle\nmultiple IR tasks. In this work, we investigate in-depth the multiple-in-one\n(MiO) IR problem, which comprises seven popular IR tasks. We point out that MiO\nIR faces two pivotal challenges: the optimization of diverse objectives and the\nadaptation to multiple tasks. To tackle these challenges, we present two simple\nyet effective strategies. The first strategy, referred to as sequential\nlearning, attempts to address how to optimize the diverse objectives, which\nguides the network to incrementally learn individual IR tasks in a sequential\nmanner rather than mixing them together. The second strategy, i.e., prompt\nlearning, attempts to address how to adapt to the different IR tasks, which\nassists the network to understand the specific task and improves the\ngeneralization ability. By evaluating on 19 test sets, we demonstrate that the\nsequential and prompt learning strategies can significantly enhance the MiO\nperformance of commonly used CNN and Transformer backbones. Our experiments\nalso reveal that the two strategies can supplement each other to learn better\ndegradation representations and enhance the model robustness. It is expected\nthat our proposed MiO IR formulation and strategies could facilitate the\nresearch on how to train IR models with higher generalization capabilities.\n","authors":["Xiangtao Kong","Chao Dong","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.03379v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09194v2","updated":"2024-03-17T13:28:32Z","published":"2024-03-14T09:07:31Z","title":"Intention-driven Ego-to-Exo Video Generation","summary":" Ego-to-exo video generation refers to generating the corresponding exocentric\nvideo according to the egocentric video, providing valuable applications in\nAR/VR and embodied AI. Benefiting from advancements in diffusion model\ntechniques, notable progress has been achieved in video generation. However,\nexisting methods build upon the spatiotemporal consistency assumptions between\nadjacent frames, which cannot be satisfied in the ego-to-exo scenarios due to\ndrastic changes in views. To this end, this paper proposes an Intention-Driven\nEgo-to-exo video generation framework (IDE) that leverages action intention\nconsisting of human movement and action description as view-independent\nrepresentation to guide video generation, preserving the consistency of content\nand motion. Specifically, the egocentric head trajectory is first estimated\nthrough multi-view stereo matching. Then, cross-view feature perception module\nis introduced to establish correspondences between exo- and ego- views, guiding\nthe trajectory transformation module to infer human full-body movement from the\nhead trajectory. 
Meanwhile, we present an action description unit that maps the\naction semantics into the feature space consistent with the exocentric image.\nFinally, the inferred human movement and high-level action descriptions jointly\nguide the generation of exocentric motion and interaction content (i.e.,\ncorresponding optical flow and occlusion maps) in the backward process of the\ndiffusion model, ultimately warping them into the corresponding exocentric\nvideo. We conduct extensive experiments on the relevant dataset with diverse\nexo-ego video pairs, and our IDE outperforms state-of-the-art models in both\nsubjective and objective assessments, demonstrating its efficacy in ego-to-exo\nvideo generation.\n","authors":["Hongchen Luo","Kai Zhu","Wei Zhai","Yang Cao"],"pdf_url":"https://arxiv.org/pdf/2403.09194v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11211v1","updated":"2024-03-17T13:23:25Z","published":"2024-03-17T13:23:25Z","title":"RCdpia: A Renal Carcinoma Digital Pathology Image Annotation dataset\n based on pathologists","summary":" The annotation of digital pathological slide data for renal cell carcinoma is\nof paramount importance for correct diagnosis of artificial intelligence models\ndue to the heterogeneous nature of the tumor. This process not only facilitates\na deeper understanding of renal cell cancer heterogeneity but also aims to\nminimize noise in the data for more accurate studies. To enhance the\napplicability of the data, two pathologists were enlisted to meticulously\ncurate, screen, and label a kidney cancer pathology image dataset from The\nCancer Genome Atlas Program (TCGA) database. Subsequently, a Resnet model was\ndeveloped to validate the annotated dataset against an additional dataset from\nthe First Affiliated Hospital of Zhejiang University. Based on these results,\nwe have meticulously compiled the TCGA digital pathological dataset with\nindependent labeling of tumor regions and adjacent areas (RCdpia), which\nincludes 109 cases of kidney chromophobe cell carcinoma, 486 cases of kidney\nclear cell carcinoma, and 292 cases of kidney papillary cell carcinoma. This\ndataset is now publicly accessible at http://39.171.241.18:8888/RCdpia/.\nFurthermore, model analysis has revealed significant discrepancies in\npredictive outcomes when applying the same model to datasets from different\ncenters. Leveraging the RCdpia, we can now develop more precise digital\npathology artificial intelligence models for tasks such as normalization,\nclassification, and segmentation. These advancements underscore the potential\nfor more nuanced and accurate AI applications in the field of digital\npathology.\n","authors":["Qingrong Sun","Weixiang Zhong","Jie Zhou","Chong Lai","Xiaodong Teng","Maode Lai"],"pdf_url":"https://arxiv.org/pdf/2403.11211v1.pdf","comment":"8 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2403.11208v1","updated":"2024-03-17T13:17:25Z","published":"2024-03-17T13:17:25Z","title":"THOR: Text to Human-Object Interaction Diffusion via Relation\n Intervention","summary":" This paper addresses new methodologies to deal with the challenging task of\ngenerating dynamic Human-Object Interactions from textual descriptions\n(Text2HOI). While most existing works assume interactions with limited body\nparts or static objects, our task involves addressing the variation in human\nmotion, the diversity of object shapes, and the semantic vagueness of object\nmotion simultaneously. 
To tackle this, we propose a novel Text-guided\nHuman-Object Interaction diffusion model with Relation Intervention (THOR).\nTHOR is a cohesive diffusion model equipped with a relation intervention\nmechanism. In each diffusion step, we initiate text-guided human and object\nmotion and then leverage human-object relations to intervene in object motion.\nThis intervention enhances the spatial-temporal relations between humans and\nobjects, with human-centric interaction representation providing additional\nguidance for synthesizing consistent motion from text. To achieve more\nreasonable and realistic results, interaction losses is introduced at different\nlevels of motion granularity. Moreover, we construct Text-BEHAVE, a Text2HOI\ndataset that seamlessly integrates textual descriptions with the currently\nlargest publicly available 3D HOI dataset. Both quantitative and qualitative\nexperiments demonstrate the effectiveness of our proposed model.\n","authors":["Qianyang Wu","Ye Shi","Xiaoshui Huang","Jingyi Yu","Lan Xu","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11207v1","updated":"2024-03-17T13:15:22Z","published":"2024-03-17T13:15:22Z","title":"MindEye2: Shared-Subject Models Enable fMRI-To-Image With 1 Hour of Data","summary":" Reconstructions of visual perception from brain activity have improved\ntremendously, but the practical utility of such methods has been limited. This\nis because such models are trained independently per subject where each subject\nrequires dozens of hours of expensive fMRI training data to attain high-quality\nresults. The present work showcases high-quality reconstructions using only 1\nhour of fMRI training data. We pretrain our model across 7 subjects and then\nfine-tune on minimal data from a new subject. Our novel functional alignment\nprocedure linearly maps all brain data to a shared-subject latent space,\nfollowed by a shared non-linear mapping to CLIP image space. We then map from\nCLIP space to pixel space by fine-tuning Stable Diffusion XL to accept CLIP\nlatents as inputs instead of text. This approach improves out-of-subject\ngeneralization with limited training data and also attains state-of-the-art\nimage retrieval and reconstruction metrics compared to single-subject\napproaches. MindEye2 demonstrates how accurate reconstructions of perception\nare possible from a single visit to the MRI facility. All code is available on\nGitHub.\n","authors":["Paul S. Scotti","Mihir Tripathy","Cesar Kadir Torrico Villanueva","Reese Kneeland","Tong Chen","Ashutosh Narang","Charan Santhirasegaran","Jonathan Xu","Thomas Naselaris","Kenneth A. Norman","Tanishq Mathew Abraham"],"pdf_url":"https://arxiv.org/pdf/2403.11207v1.pdf","comment":"Code at https://github.com/MedARC-AI/MindEyeV2/tree/main"},{"id":"http://arxiv.org/abs/2403.11197v1","updated":"2024-03-17T12:49:02Z","published":"2024-03-17T12:49:02Z","title":"TAG: Guidance-free Open-Vocabulary Semantic Segmentation","summary":" Semantic segmentation is a crucial task in computer vision, where each pixel\nin an image is classified into a category. However, traditional methods face\nsignificant challenges, including the need for pixel-level annotations and\nextensive training. Furthermore, because supervised learning uses a limited set\nof predefined categories, models typically struggle with rare classes and\ncannot recognize new ones. 
Unsupervised and open-vocabulary segmentation,\nproposed to tackle these issues, faces challenges, including the inability to\nassign specific class labels to clusters and the necessity of user-provided\ntext queries for guidance. In this context, we propose a novel approach, TAG\nwhich achieves Training, Annotation, and Guidance-free open-vocabulary semantic\nsegmentation. TAG utilizes pre-trained models such as CLIP and DINO to segment\nimages into meaningful categories without additional training or dense\nannotations. It retrieves class labels from an external database, providing\nflexibility to adapt to new scenarios. Our TAG achieves state-of-the-art\nresults on PascalVOC, PascalContext and ADE20K for open-vocabulary segmentation\nwithout given class names, i.e. improvement of +15.3 mIoU on PascalVOC. All\ncode and data will be released at https://github.com/Valkyrja3607/TAG.\n","authors":["Yasufumi Kawano","Yoshimitsu Aoki"],"pdf_url":"https://arxiv.org/pdf/2403.11197v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2403.11194v1","updated":"2024-03-17T12:40:49Z","published":"2024-03-17T12:40:49Z","title":"MaskDiffusion: Exploiting Pre-trained Diffusion Models for Semantic\n Segmentation","summary":" Semantic segmentation is essential in computer vision for various\napplications, yet traditional approaches face significant challenges, including\nthe high cost of annotation and extensive training for supervised learning.\nAdditionally, due to the limited predefined categories in supervised learning,\nmodels typically struggle with infrequent classes and are unable to predict\nnovel classes. To address these limitations, we propose MaskDiffusion, an\ninnovative approach that leverages pretrained frozen Stable Diffusion to\nachieve open-vocabulary semantic segmentation without the need for additional\ntraining or annotation, leading to improved performance compared to similar\nmethods. We also demonstrate the superior performance of MaskDiffusion in\nhandling open vocabularies, including fine-grained and proper noun-based\ncategories, thus expanding the scope of segmentation applications. Overall, our\nMaskDiffusion shows significant qualitative and quantitative improvements in\ncontrast to other comparable unsupervised segmentation methods, i.e. on the\nPotsdam dataset (+10.5 mIoU compared to GEM) and COCO-Stuff (+14.8 mIoU\ncompared to DiffSeg). All code and data will be released at\nhttps://github.com/Valkyrja3607/MaskDiffusion.\n","authors":["Yasufumi Kawano","Yoshimitsu Aoki"],"pdf_url":"https://arxiv.org/pdf/2403.11194v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2403.11193v1","updated":"2024-03-17T12:40:46Z","published":"2024-03-17T12:40:46Z","title":"Neural Markov Random Field for Stereo Matching","summary":" Stereo matching is a core task for many computer vision and robotics\napplications. Despite their dominance in traditional stereo methods, the\nhand-crafted Markov Random Field (MRF) models lack sufficient modeling accuracy\ncompared to end-to-end deep models. 
While deep learning representations have\ngreatly improved the unary terms of the MRF models, the overall accuracy is\nstill severely limited by the hand-crafted pairwise terms and message passing.\nTo address these issues, we propose a neural MRF model, where both potential\nfunctions and message passing are designed using data-driven neural networks.\nOur fully data-driven model is built on the foundation of variational inference\ntheory, to prevent convergence issues and retain stereo MRF's graph inductive\nbias. To make the inference tractable and scale well to high-resolution images,\nwe also propose a Disparity Proposal Network (DPN) to adaptively prune the\nsearch space of disparity. The proposed approach ranks $1^{st}$ on both KITTI\n2012 and 2015 leaderboards among all published methods while running faster\nthan 100 ms. This approach significantly outperforms prior global methods,\ne.g., lowering D1 metric by more than 50% on KITTI 2015. In addition, our\nmethod exhibits strong cross-domain generalization and can recover sharp edges.\nThe codes at https://github.com/aeolusguan/NMRF .\n","authors":["Tongfan Guan","Chen Wang","Yun-Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2403.11193v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11192v1","updated":"2024-03-17T12:38:58Z","published":"2024-03-17T12:38:58Z","title":"Self-Supervised Video Desmoking for Laparoscopic Surgery","summary":" Due to the difficulty of collecting real paired data, most existing desmoking\nmethods train the models by synthesizing smoke, generalizing poorly to real\nsurgical scenarios. Although a few works have explored single-image real-world\ndesmoking in unpaired learning manners, they still encounter challenges in\nhandling dense smoke. In this work, we address these issues together by\nintroducing the self-supervised surgery video desmoking (SelfSVD). On the one\nhand, we observe that the frame captured before the activation of high-energy\ndevices is generally clear (named pre-smoke frame, PS frame), thus it can serve\nas supervision for other smoky frames, making real-world self-supervised video\ndesmoking practically feasible. On the other hand, in order to enhance the\ndesmoking performance, we further feed the valuable information from PS frame\ninto models, where a masking strategy and a regularization term are presented\nto avoid trivial solutions. In addition, we construct a real surgery video\ndataset for desmoking, which covers a variety of smoky scenes. Extensive\nexperiments on the dataset show that our SelfSVD can remove smoke more\neffectively and efficiently while recovering more photo-realistic details than\nthe state-of-the-art methods. The dataset, codes, and pre-trained models are\navailable at \\url{https://github.com/ZcsrenlongZ/SelfSVD}.\n","authors":["Renlong Wu","Zhilu Zhang","Shuohao Zhang","Longfei Gou","Haobin Chen","Lei Zhang","Hao Chen","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2403.11192v1.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2403.11189v1","updated":"2024-03-17T12:26:23Z","published":"2024-03-17T12:26:23Z","title":"Boosting Semi-Supervised Temporal Action Localization by Learning from\n Non-Target Classes","summary":" The crux of semi-supervised temporal action localization (SS-TAL) lies in\nexcavating valuable information from abundant unlabeled videos. 
However,\ncurrent approaches predominantly focus on building models that are robust to\nthe error-prone target class (i.e, the predicted class with the highest\nconfidence) while ignoring informative semantics within non-target classes.\nThis paper approaches SS-TAL from a novel perspective by advocating for\nlearning from non-target classes, transcending the conventional focus solely on\nthe target class. The proposed approach involves partitioning the label space\nof the predicted class distribution into distinct subspaces: target class,\npositive classes, negative classes, and ambiguous classes, aiming to mine both\npositive and negative semantics that are absent in the target class, while\nexcluding ambiguous classes. To this end, we first devise innovative strategies\nto adaptively select high-quality positive and negative classes from the label\nspace, by modeling both the confidence and rank of a class in relation to those\nof the target class. Then, we introduce novel positive and negative losses\ndesigned to guide the learning process, pushing predictions closer to positive\nclasses and away from negative classes. Finally, the positive and negative\nprocesses are integrated into a hybrid positive-negative learning framework,\nfacilitating the utilization of non-target classes in both labeled and\nunlabeled videos. Experimental results on THUMOS14 and ActivityNet v1.3\ndemonstrate the superiority of the proposed method over prior state-of-the-art\napproaches.\n","authors":["Kun Xia","Le Wang","Sanping Zhou","Gang Hua","Wei Tang"],"pdf_url":"https://arxiv.org/pdf/2403.11189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16861v2","updated":"2024-03-17T12:15:34Z","published":"2024-01-30T10:04:49Z","title":"Repositioning the Subject within Image","summary":" Current image manipulation primarily centers on static manipulation, such as\nreplacing specific regions within an image or altering its overall style. In\nthis paper, we introduce an innovative dynamic manipulation task, subject\nrepositioning. This task involves relocating a user-specified subject to a\ndesired position while preserving the image's fidelity. Our research reveals\nthat the fundamental sub-tasks of subject repositioning, which include filling\nthe void left by the repositioned subject, reconstructing obscured portions of\nthe subject and blending the subject to be consistent with surrounding areas,\ncan be effectively reformulated as a unified, prompt-guided inpainting task.\nConsequently, we can employ a single diffusion generative model to address\nthese sub-tasks using various task prompts learned through our proposed task\ninversion technique. Additionally, we integrate pre-processing and\npost-processing techniques to further enhance the quality of subject\nrepositioning. These elements together form our SEgment-gEnerate-and-bLEnd\n(SEELE) framework. To assess SEELE's effectiveness in subject repositioning, we\nassemble a real-world subject repositioning dataset called ReS. Results of\nSEELE on ReS demonstrate its efficacy.\n","authors":["Yikai Wang","Chenjie Cao","Ke Fan","Qiaole Dong","Yifan Li","Xiangyang Xue","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2401.16861v2.pdf","comment":"Project page: https://yikai-wang.github.io/seele/. Dataset:\n https://github.com/Yikai-Wang/ReS. Arxiv version uses small size images for\n fast preview. 
Full size PDF is available at project page"},{"id":"http://arxiv.org/abs/2403.11186v1","updated":"2024-03-17T12:15:02Z","published":"2024-03-17T12:15:02Z","title":"NetTrack: Tracking Highly Dynamic Objects with a Net","summary":" The complex dynamicity of open-world objects presents non-negligible\nchallenges for multi-object tracking (MOT), often manifested as severe\ndeformations, fast motion, and occlusions. Most methods that solely depend on\ncoarse-grained object cues, such as boxes and the overall appearance of the\nobject, are susceptible to degradation due to distorted internal relationships\nof dynamic objects. To address this problem, this work proposes NetTrack, an\nefficient, generic, and affordable tracking framework to introduce fine-grained\nlearning that is robust to dynamicity. Specifically, NetTrack constructs a\ndynamicity-aware association with a fine-grained Net, leveraging point-level\nvisual cues. Correspondingly, a fine-grained sampler and matching method have\nbeen incorporated. Furthermore, NetTrack learns object-text correspondence for\nfine-grained localization. To evaluate MOT in extremely dynamic open-world\nscenarios, a bird flock tracking (BFT) dataset is constructed, which exhibits\nhigh dynamicity with diverse species and open-world scenarios. Comprehensive\nevaluation on BFT validates the effectiveness of fine-grained learning on\nobject dynamicity, and thorough transfer experiments on challenging open-world\nbenchmarks, i.e., TAO, TAO-OW, AnimalTrack, and GMOT-40, validate the strong\ngeneralization ability of NetTrack even without finetuning. Project page:\nhttps://george-zhuang.github.io/nettrack/.\n","authors":["Guangze Zheng","Shijie Lin","Haobo Zuo","Changhong Fu","Jia Pan"],"pdf_url":"https://arxiv.org/pdf/2403.11186v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11184v1","updated":"2024-03-17T12:14:34Z","published":"2024-03-17T12:14:34Z","title":"DuPL: Dual Student with Trustworthy Progressive Learning for Robust\n Weakly Supervised Semantic Segmentation","summary":" Recently, One-stage Weakly Supervised Semantic Segmentation (WSSS) with\nimage-level labels has gained increasing interest due to simplification over\nits cumbersome multi-stage counterpart. Limited by the inherent ambiguity of\nClass Activation Map (CAM), we observe that one-stage pipelines often encounter\nconfirmation bias caused by incorrect CAM pseudo-labels, impairing their final\nsegmentation performance. Although recent works discard many unreliable\npseudo-labels to implicitly alleviate this issue, they fail to exploit\nsufficient supervision for their models. To this end, we propose a dual student\nframework with trustworthy progressive learning (DuPL). Specifically, we\npropose a dual student network with a discrepancy loss to yield diverse CAMs\nfor each sub-net. The two sub-nets generate supervision for each other,\nmitigating the confirmation bias caused by learning their own incorrect\npseudo-labels. In this process, we progressively introduce more trustworthy\npseudo-labels to be involved in the supervision through dynamic threshold\nadjustment with an adaptive noise filtering strategy. Moreover, we believe that\nevery pixel, even discarded from supervision due to its unreliability, is\nimportant for WSSS. Thus, we develop consistency regularization on these\ndiscarded regions, providing supervision of every pixel. 
Experiment results\ndemonstrate the superiority of the proposed DuPL over the recent\nstate-of-the-art alternatives on PASCAL VOC 2012 and MS COCO datasets. Code is\navailable at https://github.com/Wu0409/DuPL.\n","authors":["Yuanchen Wu","Xichen Ye","Kequan Yang","Jide Li","Xiaoqiang Li"],"pdf_url":"https://arxiv.org/pdf/2403.11184v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11176v1","updated":"2024-03-17T11:32:18Z","published":"2024-03-17T11:32:18Z","title":"Quality-Aware Image-Text Alignment for Real-World Image Quality\n Assessment","summary":" No-Reference Image Quality Assessment (NR-IQA) focuses on designing methods\nto measure image quality in alignment with human perception when a high-quality\nreference image is unavailable. The reliance on annotated Mean Opinion Scores\n(MOS) in the majority of state-of-the-art NR-IQA approaches limits their\nscalability and broader applicability to real-world scenarios. To overcome this\nlimitation, we propose QualiCLIP (Quality-aware CLIP), a CLIP-based\nself-supervised opinion-unaware method that does not require labeled MOS. In\nparticular, we introduce a quality-aware image-text alignment strategy to make\nCLIP generate representations that correlate with the inherent quality of the\nimages. Starting from pristine images, we synthetically degrade them with\nincreasing levels of intensity. Then, we train CLIP to rank these degraded\nimages based on their similarity to quality-related antonym text prompts, while\nguaranteeing consistent representations for images with comparable quality. Our\nmethod achieves state-of-the-art performance on several datasets with authentic\ndistortions. Moreover, despite not requiring MOS, QualiCLIP outperforms\nsupervised methods when their training dataset differs from the testing one,\nthus proving to be more suitable for real-world scenarios. Furthermore, our\napproach demonstrates greater robustness and improved explainability than\ncompeting methods. The code and the model are publicly available at\nhttps://github.com/miccunifi/QualiCLIP.\n","authors":["Lorenzo Agnolucci","Leonardo Galteri","Marco Bertini"],"pdf_url":"https://arxiv.org/pdf/2403.11176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11172v1","updated":"2024-03-17T11:17:06Z","published":"2024-03-17T11:17:06Z","title":"Artifact Feature Purification for Cross-domain Detection of AI-generated\n Images","summary":" In the era of AIGC, the fast development of visual content generation\ntechnologies, such as diffusion models, bring potential security risks to our\nsociety. Existing generated image detection methods suffer from performance\ndrop when faced with out-of-domain generators and image scenes. To relieve this\nproblem, we propose Artifact Purification Network (APN) to facilitate the\nartifact extraction from generated images through the explicit and implicit\npurification processes. For the explicit one, a suspicious frequency-band\nproposal method and a spatial feature decomposition method are proposed to\nextract artifact-related features. For the implicit one, a training strategy\nbased on mutual information estimation is proposed to further purify the\nartifact-related features. Experiments show that for cross-generator detection,\nthe average accuracy of APN is 5.6% ~ 16.4% higher than the previous 10 methods\non GenImage dataset and 1.7% ~ 50.1% on DiffusionForensics dataset. For\ncross-scene detection, APN maintains its high performance. 
Via visualization\nanalysis, we find that the proposed method extracts flexible forgery patterns\nand condenses the forgery information diluted in irrelevant features. We also\nfind that the artifact features APN focuses on across generators and scenes are\nglobal and diverse. The code will be available on GitHub.\n","authors":["Zheling Meng","Bo Peng","Jing Dong","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2403.11172v1.pdf","comment":"This work is under consideration at Computer Vision and Image\n Understanding"},{"id":"http://arxiv.org/abs/2310.02044v3","updated":"2024-03-17T10:37:08Z","published":"2023-10-03T13:35:49Z","title":"How Physics and Background Attributes Impact Video Transformers in\n Robotic Manipulation: A Case Study on Planar Pushing","summary":" As model and dataset sizes continue to scale in robot learning, the need to\nunderstand what is the specific factor in the dataset that affects model\nperformance becomes increasingly urgent to ensure cost-effective data\ncollection and model performance. In this work, we empirically investigate how\nphysics attributes (color, friction coefficient, shape) and scene background\ncharacteristics, such as the complexity and dynamics of interactions with\nbackground objects, influence the performance of Video Transformers in\npredicting planar pushing trajectories. We aim to investigate three primary\nquestions: How do physics attributes and background scene characteristics\ninfluence model performance? What kind of changes in attributes are most\ndetrimental to model generalization? What proportion of fine-tuning data is\nrequired to adapt models to novel scenarios? To facilitate this research, we\npresent CloudGripper-Push-1K, a large real-world vision-based robot pushing\ndataset comprising 1278 hours and 460,000 videos of planar pushing interactions\nwith objects with different physics and background attributes. We also propose\nVideo Occlusion Transformer (VOT), a generic modular video-transformer-based\ntrajectory prediction framework which features 3 choices of 2D-spatial encoders\nas the subject of our case study. Dataset and codes will be available at\nhttps://cloudgripper.org.\n","authors":["Shutong Jin","Ruiyu Wang","Muhammad Zahid","Florian T. Pokorny"],"pdf_url":"https://arxiv.org/pdf/2310.02044v3.pdf","comment":"Under review at IEEE/RSJ IROS 2024"},{"id":"http://arxiv.org/abs/2403.11162v1","updated":"2024-03-17T10:06:38Z","published":"2024-03-17T10:06:38Z","title":"CGI-DM: Digital Copyright Authentication for Diffusion Models via\n Contrasting Gradient Inversion","summary":" Diffusion Models (DMs) have evolved into advanced image generation tools,\nespecially for few-shot generation where a pretrained model is fine-tuned on a\nsmall set of images to capture a specific style or object. Despite their\nsuccess, concerns exist about potential copyright violations stemming from the\nuse of unauthorized data in this process. In response, we present Contrasting\nGradient Inversion for Diffusion Models (CGI-DM), a novel method featuring\nvivid visual representations for digital copyright authentication. Our approach\ninvolves removing partial information of an image and recovering missing\ndetails by exploiting conceptual differences between the pretrained and\nfine-tuned models. 
We formulate the differences as KL divergence between latent\nvariables of the two models when given the same input image, which can be\nmaximized through Monte Carlo sampling and Projected Gradient Descent (PGD).\nThe similarity between original and recovered images serves as a strong\nindicator of potential infringements. Extensive experiments on the WikiArt and\nDreambooth datasets demonstrate the high accuracy of CGI-DM in digital\ncopyright authentication, surpassing alternative validation techniques. Code\nimplementation is available at https://github.com/Nicholas0228/Revelio.\n","authors":["Xiaoyu Wu","Yang Hua","Chumeng Liang","Jiaru Zhang","Hao Wang","Tao Song","Haibing Guan"],"pdf_url":"https://arxiv.org/pdf/2403.11162v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11157v1","updated":"2024-03-17T09:41:20Z","published":"2024-03-17T09:41:20Z","title":"Selective Hourglass Mapping for Universal Image Restoration Based on\n Diffusion Model","summary":" Universal image restoration is a practical and potential computer vision task\nfor real-world applications. The main challenge of this task is handling the\ndifferent degradation distributions at once. Existing methods mainly utilize\ntask-specific conditions (e.g., prompt) to guide the model to learn different\ndistributions separately, named multi-partite mapping. However, it is not\nsuitable for universal model learning as it ignores the shared information\nbetween different tasks. In this work, we propose an advanced selective\nhourglass mapping strategy based on diffusion model, termed DiffUIR. Two novel\nconsiderations make our DiffUIR non-trivial. Firstly, we equip the model with\nstrong condition guidance to obtain accurate generation direction of diffusion\nmodel (selective). More importantly, DiffUIR integrates a flexible shared\ndistribution term (SDT) into the diffusion algorithm elegantly and naturally,\nwhich gradually maps different distributions into a shared one. In the reverse\nprocess, combined with SDT and strong condition guidance, DiffUIR iteratively\nguides the shared distribution to the task-specific distribution with high\nimage quality (hourglass). Without bells and whistles, by only modifying the\nmapping strategy, we achieve state-of-the-art performance on five image\nrestoration tasks, 22 benchmarks in the universal setting and zero-shot\ngeneralization setting. Surprisingly, by only using a lightweight model (only\n0.89M), we could achieve outstanding performance. The source code and\npre-trained models are available at https://github.com/iSEE-Laboratory/DiffUIR\n","authors":["Dian Zheng","Xiao-Ming Wu","Shuzhou Yang","Jian Zhang","Jian-Fang Hu","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.11157v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2312.16476v3","updated":"2024-03-17T09:12:58Z","published":"2023-12-27T08:50:01Z","title":"SVGDreamer: Text Guided SVG Generation with Diffusion Model","summary":" Recently, text-guided scalable vector graphics (SVGs) synthesis has shown\npromise in domains such as iconography and sketch. However, existing\ntext-to-SVG generation methods lack editability and struggle with visual\nquality and result diversity. To address these limitations, we propose a novel\ntext-guided vector graphics synthesis method called SVGDreamer. 
SVGDreamer\nincorporates a semantic-driven image vectorization (SIVE) process that enables\nthe decomposition of synthesis into foreground objects and background, thereby\nenhancing editability. Specifically, the SIVE process introduce attention-based\nprimitive control and an attention-mask loss function for effective control and\nmanipulation of individual elements. Additionally, we propose a Vectorized\nParticle-based Score Distillation (VPSD) approach to tackle the challenges of\ncolor over-saturation, vector primitives over-smoothing, and limited result\ndiversity in existing text-to-SVG generation methods. Furthermore, on the basis\nof VPSD, we introduce Reward Feedback Learning (ReFL) to accelerate VPSD\nconvergence and improve aesthetic appeal. Extensive experiments have been\nconducted to validate the effectiveness of SVGDreamer, demonstrating its\nsuperiority over baseline methods in terms of editability, visual quality, and\ndiversity. The code and demo of SVGDreamer can be found at\nhttps://ximinng.github.io/SVGDreamer-project/\n","authors":["Ximing Xing","Haitao Zhou","Chuang Wang","Jing Zhang","Dong Xu","Qian Yu"],"pdf_url":"https://arxiv.org/pdf/2312.16476v3.pdf","comment":"19 pages, 16 figures, project link:\n https://ximinng.github.io/SVGDreamer-project/"},{"id":"http://arxiv.org/abs/2312.17225v2","updated":"2024-03-17T09:07:35Z","published":"2023-12-28T18:53:39Z","title":"4DGen: Grounded 4D Content Generation with Spatial-temporal Consistency","summary":" Aided by text-to-image and text-to-video diffusion models, existing 4D\ncontent creation pipelines utilize score distillation sampling to optimize the\nentire dynamic 3D scene. However, as these pipelines generate 4D content from\ntext or image inputs, they incur significant time and effort in prompt\nengineering through trial and error. This work introduces 4DGen, a novel,\nholistic framework for grounded 4D content creation that decomposes the 4D\ngeneration task into multiple stages. We identify static 3D assets and\nmonocular video sequences as key components in constructing the 4D content. Our\npipeline facilitates conditional 4D generation, enabling users to specify\ngeometry (3D assets) and motion (monocular videos), thus offering superior\ncontrol over content creation. Furthermore, we construct our 4D representation\nusing dynamic 3D Gaussians, which permits efficient, high-resolution\nsupervision through rendering during training, thereby facilitating\nhigh-quality 4D generation. Additionally, we employ spatial-temporal pseudo\nlabels on anchor frames, along with seamless consistency priors implemented\nthrough 3D-aware score distillation sampling and smoothness regularizations.\nCompared to existing baselines, our approach yields competitive results in\nfaithfully reconstructing input signals and realistically inferring renderings\nfrom novel viewpoints and timesteps. Most importantly, our method supports\ngrounded generation, offering users enhanced control, a feature difficult to\nachieve with previous methods. 
Project page:\nhttps://vita-group.github.io/4DGen/\n","authors":["Yuyang Yin","Dejia Xu","Zhangyang Wang","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2312.17225v2.pdf","comment":"Project page: https://vita-group.github.io/4DGen/"},{"id":"http://arxiv.org/abs/2403.11150v1","updated":"2024-03-17T09:01:02Z","published":"2024-03-17T09:01:02Z","title":"Training A Small Emotional Vision Language Model for Visual Art\n Comprehension","summary":" This paper develops small vision language models to understand visual art,\nwhich, given an art work, aims to identify its emotion category and explain\nthis prediction with natural language. While small models are computationally\nefficient, their capacity is much limited compared with large models. To break\nthis trade-off, this paper builds a small emotional vision language model\n(SEVLM) by emotion modeling and input-output feature alignment. On the one\nhand, based on valence-arousal-dominance (VAD) knowledge annotated by\npsychology experts, we introduce and fuse emotional features derived through\nVAD dictionary and a VAD head to align VAD vectors of predicted emotion\nexplanation and the ground truth. This allows the vision language model to\nbetter understand and generate emotional texts, compared with using traditional\ntext embeddings alone. On the other hand, we design a contrastive head to pull\nclose embeddings of the image, its emotion class, and explanation, which aligns\nmodel outputs and inputs. On two public affective explanation datasets, we show\nthat the proposed techniques consistently improve the visual art understanding\nperformance of baseline SEVLMs. Importantly, the proposed model can be trained\nand evaluated on a single RTX 2080 Ti while exhibiting very strong performance:\nit not only outperforms the state-of-the-art small models but is also\ncompetitive compared with LLaVA 7B after fine-tuning and GPT4(V).\n","authors":["Jing Zhang","Liang Zheng","Dan Guo","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02044v2","updated":"2024-03-17T08:51:50Z","published":"2024-01-04T03:09:39Z","title":"Multi-modal vision-language model for generalizable annotation-free\n pathological lesions localization","summary":" Defining pathologies automatically from medical images aids the understanding\nof the emergence and progression of diseases, and such an ability is crucial in\nclinical diagnostics. However, existing deep learning models heavily rely on\nexpert annotations and lack generalization capabilities in open clinical\nenvironments. In this study, we present a generalizable vision-language\npre-training model for Annotation-Free pathological lesions Localization\n(AFLoc). The core strength of AFLoc lies in its extensive multi-level semantic\nstructure-based contrastive learning, which comprehensively aligns\nmulti-granularity medical concepts from reports with abundant image features,\nto adapt to the diverse expressions of pathologies and unseen pathologies\nwithout the reliance on image annotations from experts. We demonstrate the\nproof of concept on CXR images, with extensive experimental validation across 4\ndistinct external datasets, encompassing 11 types of chest pathologies. The\nresults demonstrate that AFLoc surpasses 6 state-of-the-art methods and even\noutperforms the human benchmark in locating 5 different pathologies. We further\nverify its generalization ability in retinal fundus image pathological lesions\nlocalization. 
Our approach showcases AFLoc versatilities and underscores its\nsuitability in complex clinical environments.\n","authors":["Hao Yang","Hong-Yu Zhou","Zhihuan Li","Yuanxu Gao","Cheng Li","Weijian Huang","Jiarun Liu","Hairong Zheng","Kang Zhang","Shanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2401.02044v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01524v2","updated":"2024-03-17T08:42:19Z","published":"2024-01-03T03:33:48Z","title":"Multimodal self-supervised learning for lesion localization","summary":" Multimodal deep learning utilizing imaging and diagnostic reports has made\nimpressive progress in the field of medical imaging diagnostics, demonstrating\na particularly strong capability for auxiliary diagnosis in cases where\nsufficient annotation information is lacking. Nonetheless, localizing diseases\naccurately without detailed positional annotations remains a challenge.\nAlthough existing methods have attempted to utilize local information to\nachieve fine-grained semantic alignment, their capability in extracting the\nfine-grained semantics of the comprehensive context within reports is limited.\nTo address this problem, a new method is introduced that takes full sentences\nfrom textual reports as the basic units for local semantic alignment. This\napproach combines chest X-ray images with their corresponding textual reports,\nperforming contrastive learning at both global and local levels. The leading\nresults obtained by this method on multiple datasets confirm its efficacy in\nthe task of lesion localization.\n","authors":["Hao Yang","Hong-Yu Zhou","Cheng Li","Weijian Huang","Jiarun Liu","Yong Liang","Shanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2401.01524v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15321v2","updated":"2024-03-17T08:41:49Z","published":"2024-02-23T13:39:59Z","title":"OpenSUN3D: 1st Workshop Challenge on Open-Vocabulary 3D Scene\n Understanding","summary":" This report provides an overview of the challenge hosted at the OpenSUN3D\nWorkshop on Open-Vocabulary 3D Scene Understanding held in conjunction with\nICCV 2023. The goal of this workshop series is to provide a platform for\nexploration and discussion of open-vocabulary 3D scene understanding tasks,\nincluding but not limited to segmentation, detection and mapping. We provide an\noverview of the challenge hosted at the workshop, present the challenge\ndataset, the evaluation methodology, and brief descriptions of the winning\nmethods. 
For additional details, please see\nhttps://opensun3d.github.io/index_iccv23.html.\n","authors":["Francis Engelmann","Ayca Takmaz","Jonas Schult","Elisabetta Fedele","Johanna Wald","Songyou Peng","Xi Wang","Or Litany","Siyu Tang","Federico Tombari","Marc Pollefeys","Leonidas Guibas","Hongbo Tian","Chunjie Wang","Xiaosheng Yan","Bingwen Wang","Xuanyang Zhang","Xiao Liu","Phuc Nguyen","Khoi Nguyen","Anh Tran","Cuong Pham","Zhening Huang","Xiaoyang Wu","Xi Chen","Hengshuang Zhao","Lei Zhu","Joan Lasenby"],"pdf_url":"https://arxiv.org/pdf/2402.15321v2.pdf","comment":"Our OpenSUN3D workshop website for ICCV 2023:\n https://opensun3d.github.io/index_iccv23.html"},{"id":"http://arxiv.org/abs/2303.07605v2","updated":"2024-03-17T08:36:52Z","published":"2023-03-14T02:58:27Z","title":"Modeling Continuous Motion for 3D Point Cloud Object Tracking","summary":" The task of 3D single object tracking (SOT) with LiDAR point clouds is\ncrucial for various applications, such as autonomous driving and robotics.\nHowever, existing approaches have primarily relied on appearance matching or\nmotion modeling within only two successive frames, thereby overlooking the\nlong-range continuous motion property of objects in 3D space. To address this\nissue, this paper presents a novel approach that views each tracklet as a\ncontinuous stream: at each timestamp, only the current frame is fed into the\nnetwork to interact with multi-frame historical features stored in a memory\nbank, enabling efficient exploitation of sequential information. To achieve\neffective cross-frame message passing, a hybrid attention mechanism is designed\nto account for both long-range relation modeling and local geometric feature\nextraction. Furthermore, to enhance the utilization of multi-frame features for\nrobust tracking, a contrastive sequence enhancement strategy is proposed, which\nuses ground truth tracklets to augment training sequences and promote\ndiscrimination against false positives in a contrastive manner. Extensive\nexperiments demonstrate that the proposed method outperforms the\nstate-of-the-art method by significant margins on multiple benchmarks.\n","authors":["Zhipeng Luo","Gongjie Zhang","Changqing Zhou","Zhonghua Wu","Qingyi Tao","Lewei Lu","Shijian Lu"],"pdf_url":"https://arxiv.org/pdf/2303.07605v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11135v1","updated":"2024-03-17T08:09:48Z","published":"2024-03-17T08:09:48Z","title":"A lightweight deep learning pipeline with DRDA-Net and MobileNet for\n breast cancer classification","summary":" Accurate and early detection of breast cancer is essential for successful\ntreatment. This paper introduces a novel deep-learning approach for improved\nbreast cancer classification in histopathological images, a crucial step in\ndiagnosis. Our method hinges on the Dense Residual Dual-Shuffle Attention\nNetwork (DRDA-Net), inspired by ShuffleNet's efficient architecture. DRDA-Net\nachieves exceptional accuracy across various magnification levels on the\nBreaKHis dataset, a breast cancer histopathology analysis benchmark. However,\nfor real-world deployment, computational efficiency is paramount. We integrate\na pre-trained MobileNet model renowned for its lightweight design to address\ncomputational efficiency. MobileNet ensures fast execution even on devices with limited\nresources without sacrificing performance. 
This combined approach offers a\npromising solution for accurate breast cancer diagnosis, paving the way for\nfaster and more accessible screening procedures.\n","authors":["Mahdie Ahmadi","Nader Karimi","Shadrokh Samavi"],"pdf_url":"https://arxiv.org/pdf/2403.11135v1.pdf","comment":"4 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.02148v3","updated":"2024-03-17T07:58:23Z","published":"2024-03-04T15:57:29Z","title":"MiM-ISTD: Mamba-in-Mamba for Efficient Infrared Small Target Detection","summary":" Recently, infrared small target detection (ISTD) has made significant\nprogress, thanks to the development of basic models. Specifically, the\nstructures combining convolutional networks with transformers can successfully\nextract both local and global features. However, the disadvantage of the\ntransformer is also inherited, i.e., the quadratic computational complexity to\nthe length of the sequence. Inspired by the recent basic model with linear\ncomplexity for long-distance modeling, called Mamba, we explore the potential\nof this state space model for ISTD task in terms of effectiveness and\nefficiency in the paper. However, directly applying Mamba achieves poor\nperformance since local features, which are critical to detecting small\ntargets, cannot be fully exploited. Instead, we tailor a Mamba-in-Mamba\n(MiM-ISTD) structure for efficient ISTD. Specifically, we treat the local\npatches as \"visual sentences\" and use the Outer Mamba to explore the global\ninformation. We then decompose each visual sentence into sub-patches as \"visual\nwords\" and use the Inner Mamba to further explore the local information among\nwords in the visual sentence with negligible computational costs. By\naggregating the word and sentence features, the MiM-ISTD can effectively\nexplore both global and local information. Experiments on NUAA-SIRST and\nIRSTD-1k show the superior accuracy and efficiency of our method. Specifically,\nMiM-ISTD is $10 \\times$ faster than the SOTA method and reduces GPU memory\nusage by 73.4$\\%$ when testing on $2048 \\times 2048$ image, overcoming the\ncomputation and memory constraints on high-resolution infrared images. Source\ncode is available at https://github.com/txchen-USTC/MiM-ISTD.\n","authors":["Tianxiang Chen","Zhentao Tan","Tao Gong","Qi Chu","Yue Wu","Bin Liu","Jieping Ye","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2403.02148v3.pdf","comment":"The first Mamba-based model for infrared small target detection"},{"id":"http://arxiv.org/abs/2403.11134v1","updated":"2024-03-17T07:57:08Z","published":"2024-03-17T07:57:08Z","title":"Recent Advances in 3D Gaussian Splatting","summary":" The emergence of 3D Gaussian Splatting (3DGS) has greatly accelerated the\nrendering speed of novel view synthesis. Unlike neural implicit representations\nlike Neural Radiance Fields (NeRF) that represent a 3D scene with position and\nviewpoint-conditioned neural networks, 3D Gaussian Splatting utilizes a set of\nGaussian ellipsoids to model the scene so that efficient rendering can be\naccomplished by rasterizing Gaussian ellipsoids into images. Apart from the\nfast rendering speed, the explicit representation of 3D Gaussian Splatting\nfacilitates editing tasks like dynamic reconstruction, geometry editing, and\nphysical simulation. 
Considering the rapid change and growing number of works\nin this field, we present a literature review of recent 3D Gaussian Splatting\nmethods, which can be roughly classified into 3D reconstruction, 3D editing,\nand other downstream applications by functionality. Traditional point-based\nrendering methods and the rendering formulation of 3D Gaussian Splatting are\nalso illustrated for a better understanding of this technique. This survey aims\nto help beginners get into this field quickly and provide experienced\nresearchers with a comprehensive overview, which can stimulate the future\ndevelopment of the 3D Gaussian Splatting representation.\n","authors":["Tong Wu","Yu-Jie Yuan","Ling-Xiao Zhang","Jie Yang","Yan-Pei Cao","Ling-Qi Yan","Lin Gao"],"pdf_url":"https://arxiv.org/pdf/2403.11134v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00476v2","updated":"2024-03-17T07:50:04Z","published":"2024-03-01T12:02:19Z","title":"TempCompass: Do Video LLMs Really Understand Videos?","summary":" Recently, there is a surge in interest surrounding video large language\nmodels (Video LLMs). However, existing benchmarks fail to provide a\ncomprehensive feedback on the temporal perception ability of Video LLMs. On the\none hand, most of them are unable to distinguish between different temporal\naspects (e.g., speed, direction) and thus cannot reflect the nuanced\nperformance on these specific aspects. On the other hand, they are limited in\nthe diversity of task formats (e.g., only multi-choice QA), which hinders the\nunderstanding of how temporal perception performance may vary across different\ntypes of tasks. Motivated by these two problems, we propose the\n\\textbf{TempCompass} benchmark, which introduces a diversity of temporal\naspects and task formats. To collect high-quality test data, we devise two\nnovel strategies: (1) In video collection, we construct conflicting videos that\nshare the same static content but differ in a specific temporal aspect, which\nprevents Video LLMs from leveraging single-frame bias or language priors. (2)\nTo collect the task instructions, we propose a paradigm where humans first\nannotate meta-information for a video and then an LLM generates the\ninstruction. We also design an LLM-based approach to automatically and\naccurately evaluate the responses from Video LLMs. Based on TempCompass, we\ncomprehensively evaluate 8 state-of-the-art (SOTA) Video LLMs and 3 Image LLMs,\nand reveal the discerning fact that these models exhibit notably poor temporal\nperception ability. 
The data and evaluation code are available at\nhttps://github.com/llyx97/TempCompass.\n","authors":["Yuanxin Liu","Shicheng Li","Yi Liu","Yuxiang Wang","Shuhuai Ren","Lei Li","Sishuo Chen","Xu Sun","Lu Hou"],"pdf_url":"https://arxiv.org/pdf/2403.00476v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11131v1","updated":"2024-03-17T07:47:26Z","published":"2024-03-17T07:47:26Z","title":"Omni-Recon: Towards General-Purpose Neural Radiance Fields for Versatile\n 3D Applications","summary":" Recent breakthroughs in Neural Radiance Fields (NeRFs) have sparked\nsignificant demand for their integration into real-world 3D applications.\nHowever, the varied functionalities required by different 3D applications often\nnecessitate diverse NeRF models with various pipelines, leading to tedious NeRF\ntraining for each target task and cumbersome trial-and-error experiments.\nDrawing inspiration from the generalization capability and adaptability of\nemerging foundation models, our work aims to develop one general-purpose NeRF\nfor handling diverse 3D tasks. We achieve this by proposing a framework called\nOmni-Recon, which is capable of (1) generalizable 3D reconstruction and\nzero-shot multitask scene understanding, and (2) adaptability to diverse\ndownstream 3D applications such as real-time rendering and scene editing. Our\nkey insight is that an image-based rendering pipeline, with accurate geometry\nand appearance estimation, can lift 2D image features into their 3D\ncounterparts, thus extending widely explored 2D tasks to the 3D world in a\ngeneralizable manner. Specifically, our Omni-Recon features a general-purpose\nNeRF model using image-based rendering with two decoupled branches: one complex\ntransformer-based branch that progressively fuses geometry and appearance\nfeatures for accurate geometry estimation, and one lightweight branch for\npredicting blending weights of source views. This design achieves\nstate-of-the-art (SOTA) generalizable 3D surface reconstruction quality with\nblending weights reusable across diverse tasks for zero-shot multitask scene\nunderstanding. In addition, it can enable real-time rendering after baking the\ncomplex geometry branch into meshes, swift adaptation to achieve SOTA\ngeneralizable 3D understanding performance, and seamless integration with 2D\ndiffusion models for text-guided 3D editing.\n","authors":["Yonggan Fu","Huaizhi Qu","Zhifan Ye","Chaojian Li","Kevin Zhao","Yingyan Lin"],"pdf_url":"https://arxiv.org/pdf/2403.11131v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11127v1","updated":"2024-03-17T07:29:32Z","published":"2024-03-17T07:29:32Z","title":"GRA: Detecting Oriented Objects through Group-wise Rotating and\n Attention","summary":" Oriented object detection, an emerging task in recent years, aims to identify\nand locate objects across varied orientations. This requires the detector to\naccurately capture the orientation information, which varies significantly\nwithin and across images. Despite the existing substantial efforts,\nsimultaneously ensuring model effectiveness and parameter efficiency remains\nchallenging in this scenario. In this paper, we propose a lightweight yet\neffective \\textbf{G}roup-wise \\textbf{R}otating and \\textbf{A}ttention (GRA)\nmodule to replace the convolution operations in backbone networks for oriented\nobject detection. GRA can adaptively capture fine-grained features of objects\nwith diverse orientations, comprising two key components: Group-wise Rotating\nand Group-wise Attention. 
Group-wise Rotating first divides the convolution\nkernel into groups, where each group extracts different object features by\nrotating at a specific angle according to the object orientation. Subsequently,\nGroup-wise Attention is employed to adaptively enhance the object-related\nregions in the feature. The collaborative effort of these components enables\nGRA to effectively capture the various orientation information while\nmaintaining parameter efficiency. Extensive experimental results demonstrate\nthe superiority of our method. For example, GRA achieves a new state-of-the-art\n(SOTA) on the DOTA-v2.0 benchmark, while saving the parameters by nearly 50\\%\ncompared to the previous SOTA method. Code will be released.\n","authors":["Jiangshan Wang","Yifan Pu","Yizeng Han","Jiayi Guo","Yiru Wang","Xiu Li","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2403.11127v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11122v1","updated":"2024-03-17T07:07:12Z","published":"2024-03-17T07:07:12Z","title":"LERENet: Eliminating Intra-class Differences for Metal Surface Defect\n Few-shot Semantic Segmentation","summary":" Few-shot segmentation models excel in metal defect detection due to their\nrapid generalization ability to new classes and pixel-level segmentation,\nrendering them ideal for addressing data scarcity issues and achieving refined\nobject delineation in industrial applications. Existing works neglect the\n\\textit{Intra-Class Differences}, inherent in metal surface defect data, which\nhinders the model from learning sufficient knowledge from the support set to\nguide the query set segmentation. Specifically, it can be categorized into two\ntypes: the \\textit{Semantic Difference} induced by internal factors in metal\nsamples and the \\textit{Distortion Difference} caused by external factors of\nsurroundings. To address these differences, we introduce a \\textbf{L}ocal\nd\\textbf{E}scriptor based \\textbf{R}easoning and \\textbf{E}xcitation\n\\textbf{Net}work (\\textbf{LERENet}) to learn the two-view guidance, i.e., local\nand global information from the graph and feature space, and fuse them to\nsegment precisely. Since the relation structure of local features embedded in\ngraph space will help to eliminate \\textit{Semantic Difference}, we employ\nMulti-Prototype Reasoning (MPR) module, extracting local descriptors based\nprototypes and analyzing local-view feature relevance in support-query pairs.\nBesides, due to the global information that will assist in countering the\n\\textit{Distortion Difference} in observations, we utilize Multi-Prototype\nExcitation (MPE) module to capture the global-view relations in support-query\npairs. Finally, we employ an Information Fusion Module (IFM) to fuse learned\nprototypes in local and global views to generate pixel-level masks. Our\ncomprehensive experiments on defect datasets demonstrate that it outperforms\nexisting benchmarks, establishing a new state-of-the-art.\n","authors":["Hanze Ding","Zhangkai Wu","Jiyan Zhang","Ming Ping","Yanfang Liu"],"pdf_url":"https://arxiv.org/pdf/2403.11122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11121v1","updated":"2024-03-17T07:04:09Z","published":"2024-03-17T07:04:09Z","title":"A Versatile Framework for Multi-scene Person Re-identification","summary":" Person Re-identification (ReID) has been extensively developed for a decade\nin order to learn the association of images of the same person across\nnon-overlapping camera views. 
To overcome significant variations between images\nacross camera views, mountains of variants of ReID models were developed for\nsolving a number of challenges, such as resolution change, clothing change,\nocclusion, modality change, and so on. Despite the impressive performance of\nmany ReID variants, these variants typically function distinctly and cannot be\napplied to other challenges. To our best knowledge, there is no versatile ReID\nmodel that can handle various ReID challenges at the same time. This work\ncontributes to the first attempt at learning a versatile ReID model to solve\nsuch a problem. Our main idea is to form a two-stage prompt-based twin modeling\nframework called VersReID. Our VersReID firstly leverages the scene label to\ntrain a ReID Bank that contains abundant knowledge for handling various scenes,\nwhere several groups of scene-specific prompts are used to encode different\nscene-specific knowledge. In the second stage, we distill a V-Branch model with\nversatile prompts from the ReID Bank for adaptively solving the ReID of\ndifferent scenes, eliminating the demand for scene labels during the inference\nstage. To facilitate training VersReID, we further introduce the multi-scene\nproperties into self-supervised learning of ReID via a multi-scene prioris data\naugmentation (MPDA) strategy. Through extensive experiments, we demonstrate the\nsuccess of learning an effective and versatile ReID model for handling ReID\ntasks under multi-scene conditions without manual assignment of scene labels in\nthe inference stage, including general, low-resolution, clothing change,\nocclusion, and cross-modality scenes. Codes and models are available at\nhttps://github.com/iSEE-Laboratory/VersReID.\n","authors":["Wei-Shi Zheng","Junkai Yan","Yi-Xing Peng"],"pdf_url":"https://arxiv.org/pdf/2403.11121v1.pdf","comment":"To appear in TPAMI"},{"id":"http://arxiv.org/abs/2403.11120v1","updated":"2024-03-17T07:02:55Z","published":"2024-03-17T07:02:55Z","title":"Unifying Feature and Cost Aggregation with Transformers for Semantic and\n Visual Correspondence","summary":" This paper introduces a Transformer-based integrative feature and cost\naggregation network designed for dense matching tasks. In the context of dense\nmatching, many works benefit from one of two forms of aggregation: feature\naggregation, which pertains to the alignment of similar features, or cost\naggregation, a procedure aimed at instilling coherence in the flow estimates\nacross neighboring pixels. In this work, we first show that feature aggregation\nand cost aggregation exhibit distinct characteristics and reveal the potential\nfor substantial benefits stemming from the judicious use of both aggregation\nprocesses. We then introduce a simple yet effective architecture that harnesses\nself- and cross-attention mechanisms to show that our approach unifies feature\naggregation and cost aggregation and effectively harnesses the strengths of\nboth techniques. Within the proposed attention layers, the features and cost\nvolume both complement each other, and the attention layers are interleaved\nthrough a coarse-to-fine design to further promote accurate correspondence\nestimation. Finally at inference, our network produces multi-scale predictions,\ncomputes their confidence scores, and selects the most confident flow for final\nprediction. 
Our framework is evaluated on standard benchmarks for semantic\nmatching, and also applied to geometric matching, where we show that our\napproach achieves significant improvements compared to existing methods.\n","authors":["Sunghwan Hong","Seokju Cho","Seungryong Kim","Stephen Lin"],"pdf_url":"https://arxiv.org/pdf/2403.11120v1.pdf","comment":"Accepted by ICLR'24 (camera ready version) Code and weights can be\n found here: https://github.com/KU-CVLAB/UFC"},{"id":"http://arxiv.org/abs/2403.11116v1","updated":"2024-03-17T06:53:44Z","published":"2024-03-17T06:53:44Z","title":"PhD: A Prompted Visual Hallucination Evaluation Dataset","summary":" The rapid growth of Large Language Models (LLMs) has driven the development\nof Large Vision-Language Models (LVLMs). The challenge of hallucination,\nprevalent in LLMs, also emerges in LVLMs. However, most existing efforts mainly\nfocus on object hallucination in LVLM, ignoring diverse types of LVLM\nhallucinations. In this study, we delve into the Intrinsic Vision-Language\nHallucination (IVL-Hallu) issue, thoroughly analyzing different types of\nIVL-Hallu on their causes and reflections. Specifically, we propose several\nnovel IVL-Hallu tasks and categorize them into four types: (a) object\nhallucination, which arises from the misidentification of objects, (b)\nattribute hallucination, which is caused by the misidentification of\nattributes, (c) multi-modal conflicting hallucination, which derives from the\ncontradictions between textual and visual information, and (d)\ncounter-common-sense hallucination, which owes to the contradictions between\nthe LVLM knowledge and actual images. Based on these taxonomies, we propose a\nmore challenging benchmark named PhD to evaluate and explore IVL-Hallu. An\nautomated pipeline is proposed for generating different types of IVL-Hallu\ndata. Extensive experiments on five SOTA LVLMs reveal their inability to\neffectively tackle our proposed IVL-Hallu tasks, with detailed analyses and\ninsights on the origins and possible solutions of these new challenging\nIVL-Hallu tasks, facilitating future researches on IVL-Hallu and LVLM. The\nbenchmark can be accessed at\n\\href{https://github.com/jiazhen-code/IntrinsicHallu}{this https URL}.\n","authors":["Jiazhen Liu","Yuhan Fu","Ruobing Xie","Runquan Xie","Xingwu Sun","Fengzong Lian","Zhanhui Kang","Xirong Li"],"pdf_url":"https://arxiv.org/pdf/2403.11116v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07952v3","updated":"2024-03-17T06:49:19Z","published":"2023-06-13T17:51:18Z","title":"MOFI: Learning Image Representations from Noisy Entity Annotated Images","summary":" We present MOFI, Manifold OF Images, a new vision foundation model designed\nto learn image representations from noisy entity annotated images. MOFI differs\nfrom previous work in two key aspects: (i) pre-training data, and (ii) training\nrecipe. Regarding data, we introduce a new approach to automatically assign\nentity labels to images from noisy image-text pairs. Our approach involves\nemploying a named entity recognition model to extract entities from the\nalt-text, and then using a CLIP model to select the correct entities as labels\nof the paired image. It's a simple, cost-effective method that can scale to\nhandle billions of web-mined image-text pairs. Through this method, we have\ncreated Image-to-Entities (I2E), a new dataset with 1 billion images and 2\nmillion distinct entities, covering rich visual concepts in the wild. 
Building\nupon the I2E dataset, we study different training recipes like supervised\npre-training, contrastive pre-training, and multi-task learning. For\ncontrastive pre-training, we treat entity names as free-form text, and further\nenrich them with entity descriptions. Experiments show that supervised\npre-training with large-scale fine-grained entity labels is highly effective\nfor image retrieval tasks, and multi-task training further improves the\nperformance. The final MOFI model achieves 86.66% mAP on the challenging\nGPR1200 dataset, surpassing the previous state-of-the-art performance of 72.19%\nfrom OpenAI's CLIP model. Further experiments on zero-shot and linear probe\nimage classification also show that MOFI outperforms a CLIP model trained on\nthe original image-text data, demonstrating the effectiveness of the I2E\ndataset in learning strong image representations. We release our code and model\nweights at https://github.com/apple/ml-mofi.\n","authors":["Wentao Wu","Aleksei Timofeev","Chen Chen","Bowen Zhang","Kun Duan","Shuangning Liu","Yantao Zheng","Jonathon Shlens","Xianzhi Du","Zhe Gan","Yinfei Yang"],"pdf_url":"https://arxiv.org/pdf/2306.07952v3.pdf","comment":"Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2312.04831v2","updated":"2024-03-17T06:44:17Z","published":"2023-12-08T05:08:06Z","title":"Towards Context-Stable and Visual-Consistent Image Inpainting","summary":" Recent progress in inpainting increasingly relies on generative models,\nleveraging their strong generation capabilities for addressing large irregular\nmasks. However, this enhanced generation often introduces context-instability,\nleading to arbitrary object generation within masked regions. This paper\nproposes a balanced solution, emphasizing the importance of unmasked regions in\nguiding inpainting while preserving generation capacity. Our approach, Aligned\nStable Inpainting with UnKnown Areas Prior (ASUKA), employs a Masked\nAuto-Encoder (MAE) to produce reconstruction-based prior. Aligned with the\npowerful Stable Diffusion inpainting model (SD), ASUKA significantly improves\ncontext stability. ASUKA further adopts an inpainting-specialized decoder,\nhighly reducing the color inconsistency issue of SD and thus ensuring more\nvisual-consistent inpainting. We validate effectiveness of inpainting\nalgorithms on benchmark dataset Places 2 and a collection of several existing\ndatasets, dubbed MISATO, across diverse domains and masking scenarios. Results\non these benchmark datasets confirm ASUKA's efficacy in both context-stability\nand visual-consistency compared to SD and other inpainting algorithms.\n","authors":["Yikai Wang","Chenjie Cao","Ke Fan Xiangyang Xue Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2312.04831v2.pdf","comment":"Project page: https://yikai-wang.github.io/asuka/ where full-size PDF\n with appendix is available. Dataset:\n https://github.com/Yikai-Wang/asuka-misato. Yikai Wang and Chenjie Cao\n contribute equally"},{"id":"http://arxiv.org/abs/2403.11113v1","updated":"2024-03-17T06:40:50Z","published":"2024-03-17T06:40:50Z","title":"Local-consistent Transformation Learning for Rotation-invariant Point\n Cloud Analysis","summary":" Rotation invariance is an important requirement for point shape analysis. To\nachieve this, current state-of-the-art methods attempt to construct the local\nrotation-invariant representation through learning or defining the local\nreference frame (LRF). 
Although efficient, these LRF-based methods suffer from\nperturbation of local geometric relations, resulting in suboptimal local\nrotation invariance. To alleviate this issue, we propose a Local-consistent\nTransformation (LocoTrans) learning strategy. Specifically, we first construct\nthe local-consistent reference frame (LCRF) by considering the symmetry of the\ntwo axes in LRF. In comparison with previous LRFs, our LCRF is able to preserve\nlocal geometric relationships better through performing local-consistent\ntransformation. However, as the consistency only exists in local regions, the\nrelative pose information is still lost in the intermediate layers of the\nnetwork. We mitigate such a relative pose issue by developing a relative pose\nrecovery (RPR) module. RPR aims to restore the relative pose between adjacent\ntransformed patches. Equipped with LCRF and RPR, our LocoTrans is capable of\nlearning local-consistent transformation and preserving local geometry, which\nbenefits rotation invariance learning. Competitive performance under arbitrary\nrotations on both shape classification and part segmentation tasks and\nablations can demonstrate the effectiveness of our method. Code will be\navailable publicly at https://github.com/wdttt/LocoTrans.\n","authors":["Yiyang Chen","Lunhao Duan","Shanshan Zhao","Changxing Ding","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.11113v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11111v1","updated":"2024-03-17T06:31:16Z","published":"2024-03-17T06:31:16Z","title":"3D Human Reconstruction in the Wild with Synthetic Data Using Generative\n Models","summary":" In this work, we show that synthetic data created by generative models is\ncomplementary to computer graphics (CG) rendered data for achieving remarkable\ngeneralization performance on diverse real-world scenes for 3D human pose and\nshape estimation (HPS). Specifically, we propose an effective approach based on\nrecent diffusion models, termed HumanWild, which can effortlessly generate\nhuman images and corresponding 3D mesh annotations. We first collect a\nlarge-scale human-centric dataset with comprehensive annotations, e.g., text\ncaptions and surface normal images. Then, we train a customized ControlNet\nmodel upon this dataset to generate diverse human images and initial\nground-truth labels. At the core of this step is that we can easily obtain\nnumerous surface normal images from a 3D human parametric model, e.g., SMPL-X,\nby rendering the 3D mesh onto the image plane. As there exists inevitable noise\nin the initial labels, we then apply an off-the-shelf foundation segmentation\nmodel, i.e., SAM, to filter negative data samples. Our data generation pipeline\nis flexible and customizable to facilitate different real-world tasks, e.g.,\nego-centric scenes and perspective-distortion scenes. The generated dataset\ncomprises 0.79M images with corresponding 3D annotations, covering versatile\nviewpoints, scenes, and human identities. We train various HPS regressors on\ntop of the generated data and evaluate them on a wide range of benchmarks\n(3DPW, RICH, EgoBody, AGORA, SSP-3D) to verify the effectiveness of the\ngenerated data. 
By exclusively employing generative models, we generate\nlarge-scale in-the-wild human images and high-quality annotations, eliminating\nthe need for real-world data collection.\n","authors":["Yongtao Ge","Wenjia Wang","Yongfan Chen","Hao Chen","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2403.11111v1.pdf","comment":"project page: https://yongtaoge.github.io/projects/humanwild"},{"id":"http://arxiv.org/abs/2403.11107v1","updated":"2024-03-17T06:21:21Z","published":"2024-03-17T06:21:21Z","title":"Self-supervised co-salient object detection via feature correspondence\n at multiple scales","summary":" Our paper introduces a novel two-stage self-supervised approach for detecting\nco-occurring salient objects (CoSOD) in image groups without requiring\nsegmentation annotations. Unlike existing unsupervised methods that rely solely\non patch-level information (e.g. clustering patch descriptors) or on\ncomputation heavy off-the-shelf components for CoSOD, our lightweight model\nleverages feature correspondences at both patch and region levels,\nsignificantly improving prediction performance. In the first stage, we train a\nself-supervised network that detects co-salient regions by computing local\npatch-level feature correspondences across images. We obtain the segmentation\npredictions using confidence-based adaptive thresholding. In the next stage, we\nrefine these intermediate segmentations by eliminating the detected regions\n(within each image) whose averaged feature representations are dissimilar to\nthe foreground feature representation averaged across all the cross-attention\nmaps (from the previous stage). Extensive experiments on three CoSOD benchmark\ndatasets show that our self-supervised model outperforms the corresponding\nstate-of-the-art models by a huge margin (e.g. on the CoCA dataset, our model\nhas a 13.7% F-measure gain over the SOTA unsupervised CoSOD model). Notably,\nour self-supervised model also outperforms several recent fully supervised\nCoSOD models on the three test datasets (e.g., on the CoCA dataset, our model\nhas a 4.6% F-measure gain over a recent supervised CoSOD model).\n","authors":["Souradeep Chakraborty","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2403.11107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11106v1","updated":"2024-03-17T06:20:28Z","published":"2024-03-17T06:20:28Z","title":"Self-Supervised Quantization-Aware Knowledge Distillation","summary":" Quantization-aware training (QAT) and Knowledge Distillation (KD) are\ncombined to achieve competitive performance in creating low-bit deep learning\nmodels. However, existing works applying KD to QAT require tedious\nhyper-parameter tuning to balance the weights of different loss terms, assume\nthe availability of labeled training data, and require complex, computationally\nintensive training procedures for good performance. To address these\nlimitations, this paper proposes a novel Self-Supervised Quantization-Aware\nKnowledge Distillation (SQAKD) framework. SQAKD first unifies the forward and\nbackward dynamics of various quantization functions, making it flexible for\nincorporating various QAT works. Then it formulates QAT as a co-optimization\nproblem that simultaneously minimizes the KL-Loss between the full-precision\nand low-bit models for KD and the discretization error for quantization,\nwithout supervision from labels. A comprehensive evaluation shows that SQAKD\nsubstantially outperforms the state-of-the-art QAT and KD works for a variety\nof model architectures. 
Our code is at: https://github.com/kaiqi123/SQAKD.git.\n","authors":["Kaiqi Zhao","Ming Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.11106v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11105v1","updated":"2024-03-17T06:19:30Z","published":"2024-03-17T06:19:30Z","title":"Source Prompt Disentangled Inversion for Boosting Image Editability with\n Diffusion Models","summary":" Text-driven diffusion models have significantly advanced the image editing\nperformance by using text prompts as inputs. One crucial step in text-driven\nimage editing is to invert the original image into a latent noise code\nconditioned on the source prompt. While previous methods have achieved\npromising results by refactoring the image synthesizing process, the inverted\nlatent noise code is tightly coupled with the source prompt, limiting the image\neditability by target text prompts. To address this issue, we propose a novel\nmethod called Source Prompt Disentangled Inversion (SPDInv), which aims at\nreducing the impact of source prompt, thereby enhancing the text-driven image\nediting performance by employing diffusion models. To make the inverted noise\ncode be independent of the given source prompt as much as possible, we indicate\nthat the iterative inversion process should satisfy a fixed-point constraint.\nConsequently, we transform the inversion problem into a searching problem to\nfind the fixed-point solution, and utilize the pre-trained diffusion models to\nfacilitate the searching process. The experimental results show that our\nproposed SPDInv method can effectively mitigate the conflicts between the\ntarget editing prompt and the source prompt, leading to a significant decrease\nin editing artifacts. In addition to text-driven image editing, with SPDInv we\ncan easily adapt customized image generation models to localized editing tasks\nand produce promising performance. The source code are available at\nhttps://github.com/leeruibin/SPDInv.\n","authors":["Ruibin Li","Ruihuang Li","Song Guo","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12790v3","updated":"2024-03-17T06:10:25Z","published":"2023-10-19T14:47:11Z","title":"Anomaly Heterogeneity Learning for Open-set Supervised Anomaly Detection","summary":" Open-set supervised anomaly detection (OSAD) - a recently emerging anomaly\ndetection area - aims at utilizing a few samples of anomaly classes seen during\ntraining to detect unseen anomalies (i.e., samples from open-set anomaly\nclasses), while effectively identifying the seen anomalies. Benefiting from the\nprior knowledge illustrated by the seen anomalies, current OSAD methods can\noften largely reduce false positive errors. However, these methods are trained\nin a closed-set setting and treat the anomaly examples as from a homogeneous\ndistribution, rendering them less effective in generalizing to unseen anomalies\nthat can be drawn from any distribution. This paper proposes to learn\nheterogeneous anomaly distributions using the limited anomaly examples to\naddress this issue. To this end, we introduce a novel approach, namely Anomaly\nHeterogeneity Learning (AHL), that simulates a diverse set of heterogeneous\nanomaly distributions and then utilizes them to learn a unified heterogeneous\nabnormality model in surrogate open-set environments. Further, AHL is a generic\nframework that existing OSAD models can plug and play for enhancing their\nabnormality modeling. 
Extensive experiments on nine real-world anomaly\ndetection datasets show that AHL can 1) substantially enhance different\nstate-of-the-art OSAD models in detecting seen and unseen anomalies, and 2)\neffectively generalize to unseen anomalies in new domains. Code is available at\nhttps://github.com/mala-lab/AHL.\n","authors":["Jiawen Zhu","Choubo Ding","Yu Tian","Guansong Pang"],"pdf_url":"https://arxiv.org/pdf/2310.12790v3.pdf","comment":"Accepted by CVPR2024; 15 pages; 4 figures"},{"id":"http://arxiv.org/abs/2403.11101v1","updated":"2024-03-17T06:09:27Z","published":"2024-03-17T06:09:27Z","title":"Hierarchical Generative Network for Face Morphing Attacks","summary":" Face morphing attacks circumvent face recognition systems (FRSs) by creating\na morphed image that contains multiple identities. However, existing face\nmorphing attack methods either sacrifice image quality or compromise the\nidentity preservation capability. Consequently, these attacks fail to bypass\nFRSs verification well while still managing to deceive human observers. These\nmethods typically rely on global information from contributing images, ignoring\nthe detailed information from effective facial regions. To address the above\nissues, we propose a novel morphing attack method to improve the quality of\nmorphed images and better preserve the contributing identities. Our proposed\nmethod leverages the hierarchical generative network to capture both local\ndetailed and global consistency information. Additionally, a mask-guided image\nblending module is dedicated to removing artifacts from areas outside the face\nto improve the image's visual quality. The proposed attack method is compared\nto state-of-the-art methods on three public datasets in terms of FRSs'\nvulnerability, attack detectability, and image quality. The results show our\nmethod's potential threat of deceiving FRSs while being capable of passing\nmultiple morphing attack detection (MAD) scenarios.\n","authors":["Zuyuan He","Zongyong Deng","Qiaoyun He","Qijun Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.11101v1.pdf","comment":"Accepted by FG2024"},{"id":"http://arxiv.org/abs/2403.11100v1","updated":"2024-03-17T06:08:08Z","published":"2024-03-17T06:08:08Z","title":"Graph Expansion in Pruned Recurrent Neural Network Layers Preserve\n Performance","summary":" Expansion property of a graph refers to its strong connectivity as well as\nsparseness. It has been reported that deep neural networks can be pruned to a\nhigh degree of sparsity while maintaining their performance. Such pruning is\nessential for performing real time sequence learning tasks using recurrent\nneural networks in resource constrained platforms. We prune recurrent networks\nsuch as RNNs and LSTMs, maintaining a large spectral gap of the underlying\ngraphs and ensuring their layerwise expansion properties. We also study the\ntime unfolded recurrent network graphs in terms of the properties of their\nbipartite layers. 
Experimental results for the benchmark sequence MNIST,\nCIFAR-10, and Google speech command data show that expander graph properties\nare key to preserving classification accuracy of RNN and LSTM.\n","authors":["Suryam Arnav Kalra","Arindam Biswas","Pabitra Mitra","Biswajit Basu"],"pdf_url":"https://arxiv.org/pdf/2403.11100v1.pdf","comment":"Accepted as tiny paper in ICLR 2024"},{"id":"http://arxiv.org/abs/2310.05556v2","updated":"2024-03-17T05:53:44Z","published":"2023-10-09T09:26:27Z","title":"WeatherDepth: Curriculum Contrastive Learning for Self-Supervised Depth\n Estimation under Adverse Weather Conditions","summary":" Depth estimation models have shown promising performance on clear scenes but\nfail to generalize to adverse weather conditions due to illumination\nvariations, weather particles, etc. In this paper, we propose WeatherDepth, a\nself-supervised robust depth estimation model with curriculum contrastive\nlearning, to tackle performance degradation in complex weather conditions.\nConcretely, we first present a progressive curriculum learning scheme with\nthree simple-to-complex curricula to gradually adapt the model from clear to\nrelative adverse, and then to adverse weather scenes. It encourages the model\nto gradually grasp beneficial depth cues against the weather effect, yielding\nsmoother and better domain adaption. Meanwhile, to prevent the model from\nforgetting previous curricula, we integrate contrastive learning into different\ncurricula. By drawing reference knowledge from the previous course, our\nstrategy establishes a depth consistency constraint between different courses\ntoward robust depth estimation in diverse weather. Besides, to reduce manual\nintervention and better adapt to different models, we designed an adaptive\ncurriculum scheduler to automatically search for the best timing for course\nswitching. In the experiment, the proposed solution is proven to be easily\nincorporated into various architectures and demonstrates state-of-the-art\n(SoTA) performance on both synthetic and real weather datasets. Source code and\ndata are available at \\url{https://github.com/wangjiyuan9/WeatherDepth}.\n","authors":["Jiyuan Wang","Chunyu Lin","Lang Nie","Shujun Huang","Yao Zhao","Xing Pan","Rui Ai"],"pdf_url":"https://arxiv.org/pdf/2310.05556v2.pdf","comment":"6 pages, accept by ICRA 2024"},{"id":"http://arxiv.org/abs/2311.15241v2","updated":"2024-03-17T05:30:40Z","published":"2023-11-26T08:59:30Z","title":"CalibFormer: A Transformer-based Automatic LiDAR-Camera Calibration\n Network","summary":" The fusion of LiDARs and cameras has been increasingly adopted in autonomous\ndriving for perception tasks. The performance of such fusion-based algorithms\nlargely depends on the accuracy of sensor calibration, which is challenging due\nto the difficulty of identifying common features across different data\nmodalities. Previously, many calibration methods involved specific targets\nand/or manual intervention, which has proven to be cumbersome and costly.\nLearning-based online calibration methods have been proposed, but their\nperformance is barely satisfactory in most cases. These methods usually suffer\nfrom issues such as sparse feature maps, unreliable cross-modality association,\ninaccurate calibration parameter regression, etc. In this paper, to address\nthese issues, we propose CalibFormer, an end-to-end network for automatic\nLiDAR-camera calibration. We aggregate multiple layers of camera and LiDAR\nimage features to achieve high-resolution representations. 
A multi-head\ncorrelation module is utilized to identify correlations between features more\naccurately. Lastly, we employ transformer architectures to estimate accurate\ncalibration parameters from the correlation information. Our method achieved a\nmean translation error of $0.8751 \mathrm{cm}$ and a mean rotation error of\n$0.0562 ^{\circ}$ on the KITTI dataset, surpassing existing state-of-the-art\nmethods and demonstrating strong robustness, accuracy, and generalization\ncapabilities.\n","authors":["Yuxuan Xiao","Yao Li","Chengzhen Meng","Xingchen Li","Jianmin Ji","Yanyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15241v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.09207v3","updated":"2024-03-17T05:10:23Z","published":"2022-01-23T08:02:49Z","title":"Visual Object Tracking on Multi-modal RGB-D Videos: A Review","summary":" The development of visual object tracking has continued for decades. In recent\nyears, with the wide accessibility of low-cost RGBD sensors, the task of\nvisual object tracking on RGB-D videos has drawn much attention. Compared to\nconventional RGB-only tracking, RGB-D videos can provide more information\nthat facilitates object tracking in some complicated scenarios. The goal of\nthis review is to summarize the relevant knowledge of the research field of\nRGB-D tracking. To be specific, we will review the related RGB-D tracking\nbenchmark datasets as well as the corresponding performance measurements.\nBesides, the existing RGB-D tracking methods are summarized in the paper.\nMoreover, we discuss the possible future directions in the field of RGB-D\ntracking.\n","authors":["Xue-Feng Zhu","Tianyang Xu","Xiao-Jun Wu"],"pdf_url":"https://arxiv.org/pdf/2201.09207v3.pdf","comment":"I prefer not to present this paper due to its subpar quality"},{"id":"http://arxiv.org/abs/2403.11092v1","updated":"2024-03-17T05:05:11Z","published":"2024-03-17T05:05:11Z","title":"Lost in Translation? Translation Errors and Challenges for Fair\n Assessment of Text-to-Image Models on Multilingual Concepts","summary":" Benchmarks of the multilingual capabilities of text-to-image (T2I) models\ncompare generated images prompted in a test language to an expected image\ndistribution over a concept set. One such benchmark, \"Conceptual Coverage\nAcross Languages\" (CoCo-CroLa), assesses the tangible noun inventory of T2I\nmodels by prompting them to generate pictures from a concept list translated to\nseven languages and comparing the output image populations. Unfortunately, we\nfind that this benchmark contains translation errors of varying severity in\nSpanish, Japanese, and Chinese. We provide corrections for these errors and\nanalyze how impactful they are on the utility and validity of CoCo-CroLa as a\nbenchmark. We reassess multiple baseline T2I models with the revisions, compare\nthe outputs elicited under the new translations to those conditioned on the\nold, and show that a correction's impactfulness on the image-domain benchmark\nresults can be predicted in the text domain with similarity scores. 
Our\nfindings will guide the future development of T2I multilinguality metrics by\nproviding analytical tools for practical translation decisions.\n","authors":["Michael Saxon","Yiran Luo","Sharon Levy","Chitta Baral","Yezhou Yang","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11092v1.pdf","comment":"NAACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2403.11091v1","updated":"2024-03-17T05:00:40Z","published":"2024-03-17T05:00:40Z","title":"Multitask frame-level learning for few-shot sound event detection","summary":" This paper focuses on few-shot Sound Event Detection (SED), which aims to\nautomatically recognize and classify sound events with limited samples.\nHowever, prevailing methods methods in few-shot SED predominantly rely on\nsegment-level predictions, which often providing detailed, fine-grained\npredictions, particularly for events of brief duration. Although frame-level\nprediction strategies have been proposed to overcome these limitations, these\nstrategies commonly face difficulties with prediction truncation caused by\nbackground noise. To alleviate this issue, we introduces an innovative\nmultitask frame-level SED framework. In addition, we introduce TimeFilterAug, a\nlinear timing mask for data augmentation, to increase the model's robustness\nand adaptability to diverse acoustic environments. The proposed method achieves\na F-score of 63.8%, securing the 1st rank in the few-shot bioacoustic event\ndetection category of the Detection and Classification of Acoustic Scenes and\nEvents Challenge 2023.\n","authors":["Liang Zou","Genwei Yan","Ruoyu Wang","Jun Du","Meng Lei","Tian Gao","Xin Fang"],"pdf_url":"https://arxiv.org/pdf/2403.11091v1.pdf","comment":"6 pages, 4 figures, conference"},{"id":"http://arxiv.org/abs/2304.12461v3","updated":"2024-03-17T04:57:18Z","published":"2023-04-24T21:39:13Z","title":"TensoIR: Tensorial Inverse Rendering","summary":" We propose TensoIR, a novel inverse rendering approach based on tensor\nfactorization and neural fields. Unlike previous works that use purely\nMLP-based neural fields, thus suffering from low capacity and high computation\ncosts, we extend TensoRF, a state-of-the-art approach for radiance field\nmodeling, to estimate scene geometry, surface reflectance, and environment\nillumination from multi-view images captured under unknown lighting conditions.\nOur approach jointly achieves radiance field reconstruction and\nphysically-based model estimation, leading to photo-realistic novel view\nsynthesis and relighting results. Benefiting from the efficiency and\nextensibility of the TensoRF-based representation, our method can accurately\nmodel secondary shading effects (like shadows and indirect lighting) and\ngenerally support input images captured under single or multiple unknown\nlighting conditions. The low-rank tensor representation allows us to not only\nachieve fast and compact reconstruction but also better exploit shared\ninformation under an arbitrary number of capturing lighting conditions. 
We\ndemonstrate the superiority of our method to baseline methods qualitatively and\nquantitatively on various challenging synthetic and real-world scenes.\n","authors":["Haian Jin","Isabella Liu","Peijia Xu","Xiaoshuai Zhang","Songfang Han","Sai Bi","Xiaowei Zhou","Zexiang Xu","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2304.12461v3.pdf","comment":"Project page: https://haian-jin.github.io/TensoIR"},{"id":"http://arxiv.org/abs/2311.11013v3","updated":"2024-03-17T04:54:59Z","published":"2023-11-18T08:48:58Z","title":"Implicit Event-RGBD Neural SLAM","summary":" Implicit neural SLAM has achieved remarkable progress recently. Nevertheless,\nexisting methods face significant challenges in non-ideal scenarios, such as\nmotion blur or lighting variation, which often leads to issues like convergence\nfailures, localization drifts, and distorted mapping. To address these\nchallenges, we propose EN-SLAM, the first event-RGBD implicit neural SLAM\nframework, which effectively leverages the high rate and high dynamic range\nadvantages of event data for tracking and mapping. Specifically, EN-SLAM\nproposes a differentiable CRF (Camera Response Function) rendering technique to\ngenerate distinct RGB and event camera data via a shared radiance field, which\nis optimized by learning a unified implicit representation with the captured\nevent and RGBD supervision. Moreover, based on the temporal difference property\nof events, we propose a temporal aggregating optimization strategy for the\nevent joint tracking and global bundle adjustment, capitalizing on the\nconsecutive difference constraints of events, significantly enhancing tracking\naccuracy and robustness. Finally, we construct the simulated dataset\nDEV-Indoors and real captured dataset DEV-Reals containing 6 scenes, 17\nsequences with practical motion blur and lighting changes for evaluations.\nExperimental results show that our method outperforms the SOTA methods in both\ntracking ATE and mapping ACC with a real-time 17 FPS in various challenging\nenvironments. Project page: https://delinqu.github.io/EN-SLAM.\n","authors":["Delin Qu","Chi Yan","Dong Wang","Jie Yin","Dan Xu","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2311.11013v3.pdf","comment":"Accept at CVPR 2024"},{"id":"http://arxiv.org/abs/2310.17645v2","updated":"2024-03-17T04:40:48Z","published":"2023-10-26T17:58:08Z","title":"PubDef: Defending Against Transfer Attacks From Public Models","summary":" Adversarial attacks have been a looming and unaddressed threat in the\nindustry. However, through a decade-long history of the robustness evaluation\nliterature, we have learned that mounting a strong or optimal attack is\nchallenging. It requires both machine learning and domain expertise. In other\nwords, the white-box threat model, religiously assumed by a large majority of\nthe past literature, is unrealistic. In this paper, we propose a new practical\nthreat model where the adversary relies on transfer attacks through publicly\navailable surrogate models. We argue that this setting will become the most\nprevalent for security-sensitive applications in the future. We evaluate the\ntransfer attacks in this setting and propose a specialized defense method based\non a game-theoretic perspective. The defenses are evaluated under 24 public\nmodels and 11 attack algorithms across three datasets (CIFAR-10, CIFAR-100, and\nImageNet). 
Under this threat model, our defense, PubDef, outperforms the\nstate-of-the-art white-box adversarial training by a large margin with almost\nno loss in the normal accuracy. For instance, on ImageNet, our defense achieves\n62% accuracy under the strongest transfer attack vs only 36% of the best\nadversarially trained model. Its accuracy when not under attack is only 2%\nlower than that of an undefended model (78% vs 80%). We release our code at\nhttps://github.com/wagner-group/pubdef.\n","authors":["Chawin Sitawarin","Jaewon Chang","David Huang","Wesson Altoyan","David Wagner"],"pdf_url":"https://arxiv.org/pdf/2310.17645v2.pdf","comment":"ICLR 2024. Code available at https://github.com/wagner-group/pubdef"},{"id":"http://arxiv.org/abs/2403.11085v1","updated":"2024-03-17T04:36:18Z","published":"2024-03-17T04:36:18Z","title":"m&m's: A Benchmark to Evaluate Tool-Use for multi-step multi-modal Tasks","summary":" Real-world multi-modal problems are rarely solved by a single machine\nlearning model, and often require multi-step computational plans that involve\nstitching several models. Tool-augmented LLMs hold tremendous promise for\nautomating the generation of such computational plans. However, the lack of\nstandardized benchmarks for evaluating LLMs as planners for multi-step\nmulti-modal tasks has prevented a systematic study of planner design decisions.\nShould LLMs generate a full plan in a single shot or step-by-step? Should they\ninvoke tools directly with Python code or through structured data formats like\nJSON? Does feedback improve planning? To answer these questions and more, we\nintroduce m&m's: a benchmark containing 4K+ multi-step multi-modal tasks\ninvolving 33 tools that include multi-modal models, (free) public APIs, and\nimage processing modules. For each of these task queries, we provide\nautomatically generated plans using this realistic toolset. We further provide\na high-quality subset of 1,565 task plans that are human-verified and correctly\nexecutable. With m&m's, we evaluate 6 popular LLMs with 2 planning strategies\n(multi-step vs. step-by-step planning), 2 plan formats (JSON vs. code), and 3\ntypes of feedback (parsing/verification/execution). Finally, we summarize\ntakeaways from our extensive experiments. Our dataset and code are available on\nHuggingFace (https://huggingface.co/datasets/zixianma/mnms) and Github\n(https://github.com/RAIVNLab/mnms).\n","authors":["Zixian Ma","Weikai Huang","Jieyu Zhang","Tanmay Gupta","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2403.11085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11083v1","updated":"2024-03-17T04:30:57Z","published":"2024-03-17T04:30:57Z","title":"Customizing Visual-Language Foundation Models for Multi-modal Anomaly\n Detection and Reasoning","summary":" Anomaly detection is vital in various industrial scenarios, including the\nidentification of unusual patterns in production lines and the detection of\nmanufacturing defects for quality control. Existing techniques tend to be\nspecialized in individual scenarios and lack generalization capacities. In this\nstudy, we aim to develop a generic anomaly detection model applicable across\nmultiple scenarios. To achieve this, we customize generic visual-language\nfoundation models that possess extensive knowledge and robust reasoning\nabilities into anomaly detectors and reasoners. Specifically, we introduce a\nmulti-modal prompting strategy that incorporates domain knowledge from experts\nas conditions to guide the models. 
Our approach considers multi-modal prompt\ntypes, including task descriptions, class context, normality rules, and\nreference images. In addition, we unify the input representation of\nmulti-modality into a 2D image format, enabling multi-modal anomaly detection\nand reasoning. Our preliminary studies demonstrate that combining visual and\nlanguage prompts as conditions for customizing the models enhances anomaly\ndetection performance. The customized models showcase the ability to detect\nanomalies across different data modalities such as images and point clouds.\nQualitative case studies further highlight the anomaly detection and reasoning\ncapabilities, particularly for multi-object scenes and temporal data. Our code\nis available at https://github.com/Xiaohao-Xu/Customizable-VLM.\n","authors":["Xiaohao Xu","Yunkang Cao","Yongqi Chen","Weiming Shen","Xiaonan Huang"],"pdf_url":"https://arxiv.org/pdf/2403.11083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11078v1","updated":"2024-03-17T04:08:58Z","published":"2024-03-17T04:08:58Z","title":"Adaptive Semantic-Enhanced Denoising Diffusion Probabilistic Model for\n Remote Sensing Image Super-Resolution","summary":" Remote sensing image super-resolution (SR) is a crucial task to restore\nhigh-resolution (HR) images from low-resolution (LR) observations. Recently,\nthe Denoising Diffusion Probabilistic Model (DDPM) has shown promising\nperformance in image reconstructions by overcoming problems inherent in\ngenerative models, such as over-smoothing and mode collapse. However, the\nhigh-frequency details generated by DDPM often suffer from misalignment with HR\nimages due to the model's tendency to overlook long-range semantic contexts.\nThis is attributed to the widely used U-Net decoder in the conditional noise\npredictor, which tends to overemphasize local information, leading to the\ngeneration of noises with significant variances during the prediction process.\nTo address these issues, an adaptive semantic-enhanced DDPM (ASDDPM) is\nproposed to enhance the detail-preserving capability of the DDPM by\nincorporating low-frequency semantic information provided by the Transformer.\nSpecifically, a novel adaptive diffusion Transformer decoder (ADTD) is\ndeveloped to bridge the semantic gap between the encoder and decoder through\nregulating the noise prediction with the global contextual relationships and\nlong-range dependencies in the diffusion process. Additionally, a residual\nfeature fusion strategy establishes information exchange between the two\ndecoders at multiple levels. As a result, the predicted noise generated by our\napproach closely approximates that of the real noise distribution.Extensive\nexperiments on two SR and two semantic segmentation datasets confirm the\nsuperior performance of the proposed ASDDPM in both SR and the subsequent\ndownstream applications. 
The source code will be available at\nhttps://github.com/littlebeen/ASDDPM-Adaptive-Semantic-Enhanced-DDPM.\n","authors":["Jialu Sui","Xianping Ma","Xiaokang Zhang","Man-On Pun"],"pdf_url":"https://arxiv.org/pdf/2403.11078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11077v1","updated":"2024-03-17T04:02:39Z","published":"2024-03-17T04:02:39Z","title":"Zippo: Zipping Color and Transparency Distributions into a Single\n Diffusion Model","summary":" Beyond the superiority of the text-to-image diffusion model in generating\nhigh-quality images, recent studies have attempted to uncover its potential for\nadapting the learned semantic knowledge to visual perception tasks. In this\nwork, instead of translating a generative diffusion model into a visual\nperception model, we explore to retain the generative ability with the\nperceptive adaptation. To accomplish this, we present Zippo, a unified\nframework for zipping the color and transparency distributions into a single\ndiffusion model by expanding the diffusion latent into a joint representation\nof RGB images and alpha mattes. By alternatively selecting one modality as the\ncondition and then applying the diffusion process to the counterpart modality,\nZippo is capable of generating RGB images from alpha mattes and predicting\ntransparency from input images. In addition to single-modality prediction, we\npropose a modality-aware noise reassignment strategy to further empower Zippo\nwith jointly generating RGB images and its corresponding alpha mattes under the\ntext guidance. Our experiments showcase Zippo's ability of efficient\ntext-conditioned transparent image generation and present plausible results of\nMatte-to-RGB and RGB-to-Matte translation.\n","authors":["Kangyang Xie","Binbin Yang","Hao Chen","Meng Wang","Cheng Zou","Hui Xue","Ming Yang","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2403.11077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17950v3","updated":"2024-03-17T03:56:13Z","published":"2023-11-29T06:25:59Z","title":"Generalized Large-Scale Data Condensation via Various Backbone and\n Statistical Matching","summary":" The lightweight \"local-match-global\" matching introduced by SRe2L\nsuccessfully creates a distilled dataset with comprehensive information on the\nfull 224x224 ImageNet-1k. However, this one-sided approach is limited to a\nparticular backbone, layer, and statistics, which limits the improvement of the\ngeneralization of a distilled dataset. We suggest that sufficient and various\n\"local-match-global\" matching are more precise and effective than a single one\nand has the ability to create a distilled dataset with richer information and\nbetter generalization. We call this perspective \"generalized matching\" and\npropose Generalized Various Backbone and Statistical Matching (G-VBSM) in this\nwork, which aims to create a synthetic dataset with densities, ensuring\nconsistency with the complete dataset across various backbones, layers, and\nstatistics. As experimentally demonstrated, G-VBSM is the first algorithm to\nobtain strong performance across both small-scale and large-scale datasets.\nSpecifically, G-VBSM achieves a performance of 38.7% on CIFAR-100 with\n128-width ConvNet, 47.6% on Tiny-ImageNet with ResNet18, and 31.4% on the full\n224x224 ImageNet-1k with ResNet18, under images per class (IPC) 10, 50, and 10,\nrespectively. 
These results surpass all SOTA methods by margins of 3.9%, 6.5%,\nand 10.1%, respectively.\n","authors":["Shitong Shao","Zeyuan Yin","Muxin Zhou","Xindong Zhang","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2311.17950v3.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2306.13394v4","updated":"2024-03-17T03:55:52Z","published":"2023-06-23T09:22:36Z","title":"MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language\n Models","summary":" Multimodal Large Language Model (MLLM) relies on the powerful LLM to perform\nmultimodal tasks, showing amazing emergent abilities in recent studies, such as\nwriting poems based on an image. However, it is difficult for these case\nstudies to fully reflect the performance of MLLM, lacking a comprehensive\nevaluation. In this paper, we fill in this blank, presenting the first\ncomprehensive MLLM Evaluation benchmark MME. It measures both perception and\ncognition abilities on a total of 14 subtasks. In order to avoid data leakage\nthat may arise from direct use of public datasets for evaluation, the\nannotations of instruction-answer pairs are all manually designed. The concise\ninstruction design allows us to fairly compare MLLMs, instead of struggling in\nprompt engineering. Besides, with such an instruction, we can also easily carry\nout quantitative statistics. A total of 30 advanced MLLMs are comprehensively\nevaluated on our MME, which not only suggests that existing MLLMs still have a\nlarge room for improvement, but also reveals the potential directions for the\nsubsequent model optimization. The data application manner and online\nleaderboards are released at\nhttps://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation.\n","authors":["Chaoyou Fu","Peixian Chen","Yunhang Shen","Yulei Qin","Mengdan Zhang","Xu Lin","Jinrui Yang","Xiawu Zheng","Ke Li","Xing Sun","Yunsheng Wu","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2306.13394v4.pdf","comment":"Project page:\n https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models"},{"id":"http://arxiv.org/abs/2403.11074v1","updated":"2024-03-17T03:45:14Z","published":"2024-03-17T03:45:14Z","title":"Audio-Visual Segmentation via Unlabeled Frame Exploitation","summary":" Audio-visual segmentation (AVS) aims to segment the sounding objects in video\nframes. Although great progress has been witnessed, we experimentally reveal\nthat current methods reach marginal performance gain within the use of the\nunlabeled frames, leading to the underutilization issue. To fully explore the\npotential of the unlabeled frames for AVS, we explicitly divide them into two\ncategories based on their temporal characteristics, i.e., neighboring frame\n(NF) and distant frame (DF). NFs, temporally adjacent to the labeled frame,\noften contain rich motion information that assists in the accurate localization\nof sounding objects. Contrary to NFs, DFs have long temporal distances from the\nlabeled frame, which share semantic-similar objects with appearance variations.\nConsidering their unique characteristics, we propose a versatile framework that\neffectively leverages them to tackle AVS. Specifically, for NFs, we exploit the\nmotion cues as the dynamic guidance to improve the objectness localization.\nBesides, we exploit the semantic cues in DFs by treating them as valid\naugmentations to the labeled frames, which are then used to enrich data\ndiversity in a self-training manner. 
Extensive experimental results demonstrate\nthe versatility and superiority of our method, unleashing the power of the\nabundant unlabeled frames.\n","authors":["Jinxiang Liu","Yikun Liu","Fei Zhang","Chen Ju","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11074v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11073v1","updated":"2024-03-17T03:38:50Z","published":"2024-03-17T03:38:50Z","title":"Tokensome: Towards a Genetic Vision-Language GPT for Explainable and\n Cognitive Karyotyping","summary":" Automatic karyotype analysis is often defined as a visual perception task\nfocused solely on chromosomal object-level modeling. This definition has led\nmost existing methods to overlook componential and holistic information,\nsignificantly constraining model performance. Moreover, the lack of\ninterpretability in current technologies hinders clinical adoption. In this\npaper, we introduce Tokensome, a novel vision-language model based on\nchromosome tokenization for explainable and cognitive karyotyping. Tokensome\nelevates the method from the conventional visual perception layer to the\ncognitive decision-making layer. This elevation enables the integration of\ndomain knowledge and cognitive reasoning via knowledge graphs and LLMs,\nmarkedly enhancing model's explainability and facilitating abnormality\ndetection.\n","authors":["Haoxi Zhang","Xinxu Zhang","Yuanxin Lin","Maiqi Wang","Yi Lai","Yu Wang","Linfeng Yu","Yufeng Xu","Ran Cheng","Edward Szczerbicki"],"pdf_url":"https://arxiv.org/pdf/2403.11073v1.pdf","comment":"Preprint. Work in progress"},{"id":"http://arxiv.org/abs/2312.07061v2","updated":"2024-03-17T03:17:47Z","published":"2023-12-12T08:28:29Z","title":"MaxQ: Multi-Axis Query for N:M Sparsity Network","summary":" N:M sparsity has received increasing attention due to its remarkable\nperformance and latency trade-off compared with structured and unstructured\nsparsity. However, existing N:M sparsity methods do not differentiate the\nrelative importance of weights among blocks and leave important weights\nunderappreciated. Besides, they directly apply N:M sparsity to the whole\nnetwork, which will cause severe information loss. Thus, they are still\nsub-optimal. In this paper, we propose an efficient and effective Multi-Axis\nQuery methodology, dubbed as MaxQ, to rectify these problems. During the\ntraining, MaxQ employs a dynamic approach to generate soft N:M masks,\nconsidering the weight importance across multiple axes. This method enhances\nthe weights with more importance and ensures more effective updates. Meanwhile,\na sparsity strategy that gradually increases the percentage of N:M weight\nblocks is applied, which allows the network to heal from the pruning-induced\ndamage progressively. During the runtime, the N:M soft masks can be precomputed\nas constants and folded into weights without causing any distortion to the\nsparse pattern and incurring additional computational overhead. Comprehensive\nexperiments demonstrate that MaxQ achieves consistent improvements across\ndiverse CNN architectures in various computer vision tasks, including image\nclassification, object detection and instance segmentation. For ResNet50 with\n1:16 sparse pattern, MaxQ can achieve 74.6\\% top-1 accuracy on ImageNet and\nimprove by over 2.8\\% over the state-of-the-art. 
Codes and checkpoints are\navailable at \\url{https://github.com/JingyangXiang/MaxQ}.\n","authors":["Jingyang Xiang","Siqi Li","Junhao Chen","Zhuangzhi Chen","Tianxin Huang","Linpeng Peng","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2312.07061v2.pdf","comment":"Accepted by the IEEE/CVF Conference on Computer Vision and Pattern\n Recognition 2024 (CVPR2024)"},{"id":"http://arxiv.org/abs/2403.11070v1","updated":"2024-03-17T03:16:59Z","published":"2024-03-17T03:16:59Z","title":"Controllable Relation Disentanglement for Few-Shot Class-Incremental\n Learning","summary":" In this paper, we propose to tackle Few-Shot Class-Incremental Learning\n(FSCIL) from a new perspective, i.e., relation disentanglement, which means\nenhancing FSCIL via disentangling spurious relation between categories. The\nchallenge of disentangling spurious correlations lies in the poor\ncontrollability of FSCIL. On one hand, an FSCIL model is required to be trained\nin an incremental manner and thus it is very hard to directly control\nrelationships between categories of different sessions. On the other hand,\ntraining samples per novel category are only in the few-shot setting, which\nincreases the difficulty of alleviating spurious relation issues as well. To\novercome this challenge, in this paper, we propose a new simple-yet-effective\nmethod, called ConTrollable Relation-disentangLed Few-Shot Class-Incremental\nLearning (CTRL-FSCIL). Specifically, during the base session, we propose to\nanchor base category embeddings in feature space and construct disentanglement\nproxies to bridge gaps between the learning for category representations in\ndifferent sessions, thereby making category relation controllable. During\nincremental learning, the parameters of the backbone network are frozen in\norder to relieve the negative impact of data scarcity. Moreover, a\ndisentanglement loss is designed to effectively guide a relation\ndisentanglement controller to disentangle spurious correlations between the\nembeddings encoded by the backbone. In this way, the spurious correlation issue\nin FSCIL can be suppressed. Extensive experiments on CIFAR-100, mini-ImageNet,\nand CUB-200 datasets demonstrate the effectiveness of our CTRL-FSCIL method.\n","authors":["Yuan Zhou","Richang Hong","Yanrong Guo","Lin Liu","Shijie Hao","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11060v1","updated":"2024-03-17T02:15:15Z","published":"2024-03-17T02:15:15Z","title":"Intelligent Railroad Grade Crossing: Leveraging Semantic Segmentation\n and Object Detection for Enhanced Safety","summary":" Crashes and delays at Railroad Highway Grade Crossings (RHGC), where highways\nand railroads intersect, pose significant safety concerns for the U.S. Federal\nRailroad Administration (FRA). Despite the critical importance of addressing\naccidents and traffic delays at highway-railroad intersections, there is a\nnotable dearth of research on practical solutions for managing these issues. In\nresponse to this gap in the literature, our study introduces an intelligent\nsystem that leverages machine learning and computer vision techniques to\nenhance safety at Railroad Highway Grade crossings (RHGC). 
This research\nproposed a Non-Maximum Suppression (NMS)- based ensemble model that integrates\na variety of YOLO variants, specifically YOLOv5S, YOLOv5M, and YOLOv5L, for\ngrade-crossing object detection, utilizes segmentation techniques from the UNet\narchitecture for detecting approaching rail at a grade crossing. Both methods\nare implemented on a Raspberry Pi. Moreover, the strategy employs\nhigh-definition cameras installed at the RHGC. This framework enables the\nsystem to monitor objects within the Region of Interest (ROI) at crossings,\ndetect the approach of trains, and clear the crossing area before a train\narrives. Regarding accuracy, precision, recall, and Intersection over Union\n(IoU), the proposed state-of-the-art NMS-based object detection ensemble model\nachieved 96% precision. In addition, the UNet segmentation model obtained a 98%\nIoU value. This automated railroad grade crossing system powered by artificial\nintelligence represents a promising solution for enhancing safety at\nhighway-railroad intersections.\n","authors":["Al Amin","Deo Chimba","Kamrul Hasan","Emmanuel Samson"],"pdf_url":"https://arxiv.org/pdf/2403.11060v1.pdf","comment":"11 pages, 11 figures, conference"},{"id":"http://arxiv.org/abs/2403.11057v1","updated":"2024-03-17T02:06:49Z","published":"2024-03-17T02:06:49Z","title":"Large Language Models Powered Context-aware Motion Prediction","summary":" Motion prediction is among the most fundamental tasks in autonomous driving.\nTraditional methods of motion forecasting primarily encode vector information\nof maps and historical trajectory data of traffic participants, lacking a\ncomprehensive understanding of overall traffic semantics, which in turn affects\nthe performance of prediction tasks. In this paper, we utilized Large Language\nModels (LLMs) to enhance the global traffic context understanding for motion\nprediction tasks. We first conducted systematic prompt engineering, visualizing\ncomplex traffic environments and historical trajectory information of traffic\nparticipants into image prompts -- Transportation Context Map (TC-Map),\naccompanied by corresponding text prompts. Through this approach, we obtained\nrich traffic context information from the LLM. By integrating this information\ninto the motion prediction model, we demonstrate that such context can enhance\nthe accuracy of motion predictions. Furthermore, considering the cost\nassociated with LLMs, we propose a cost-effective deployment strategy:\nenhancing the accuracy of motion prediction tasks at scale with 0.7\\%\nLLM-augmented datasets. Our research offers valuable insights into enhancing\nthe understanding of traffic scenes of LLMs and the motion prediction\nperformance of autonomous driving.\n","authors":["Xiaoji Zheng","Lixiu Wu","Zhijie Yan","Yuanrong Tang","Hao Zhao","Chen Zhong","Bokui Chen","Jiangtao Gong"],"pdf_url":"https://arxiv.org/pdf/2403.11057v1.pdf","comment":"6 pages,4 figures"},{"id":"http://arxiv.org/abs/2403.11056v1","updated":"2024-03-17T02:06:03Z","published":"2024-03-17T02:06:03Z","title":"Analytic-Splatting: Anti-Aliased 3D Gaussian Splatting via Analytic\n Integration","summary":" The 3D Gaussian Splatting (3DGS) gained its popularity recently by combining\nthe advantages of both primitive-based and volumetric 3D representations,\nresulting in improved quality and efficiency for 3D scene rendering. However,\n3DGS is not alias-free, and its rendering at varying resolutions could produce\nsevere blurring or jaggies. 
This is because 3DGS treats each pixel as an\nisolated, single point rather than as an area, causing insensitivity to changes\nin the footprints of pixels. Consequently, this discrete sampling scheme\ninevitably results in aliasing, owing to the restricted sampling bandwidth. In\nthis paper, we derive an analytical solution to address this issue. More\nspecifically, we use a conditioned logistic function as the analytic\napproximation of the cumulative distribution function (CDF) in a\none-dimensional Gaussian signal and calculate the Gaussian integral by\nsubtracting the CDFs. We then introduce this approximation in the\ntwo-dimensional pixel shading, and present Analytic-Splatting, which\nanalytically approximates the Gaussian integral within the 2D-pixel window area\nto better capture the intensity response of each pixel. Moreover, we use the\napproximated response of the pixel window integral area to participate in the\ntransmittance calculation of volume rendering, making Analytic-Splatting\nsensitive to the changes in pixel footprint at different resolutions.\nExperiments on various datasets validate that our approach has better\nanti-aliasing capability that gives more details and better fidelity.\n","authors":["Zhihao Liang","Qi Zhang","Wenbo Hu","Ying Feng","Lei Zhu","Kui Jia"],"pdf_url":"https://arxiv.org/pdf/2403.11056v1.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2403.11053v1","updated":"2024-03-17T01:42:48Z","published":"2024-03-17T01:42:48Z","title":"OSTAF: A One-Shot Tuning Method for Improved Attribute-Focused T2I\n Personalization","summary":" Personalized text-to-image (T2I) models not only produce lifelike and varied\nvisuals but also allow users to tailor the images to fit their personal taste.\nThese personalization techniques can grasp the essence of a concept through a\ncollection of images, or adjust a pre-trained text-to-image model with a\nspecific image input for subject-driven or attribute-aware guidance. Yet,\naccurately capturing the distinct visual attributes of an individual image\nposes a challenge for these methods. To address this issue, we introduce OSTAF,\na novel parameter-efficient one-shot fine-tuning method which only utilizes one\nreference image for T2I personalization. A novel hypernetwork-powered\nattribute-focused fine-tuning mechanism is employed to achieve the precise\nlearning of various attribute features (e.g., appearance, shape or drawing\nstyle) from the reference image. Comparing to existing image customization\nmethods, our method shows significant superiority in attribute identification\nand application, as well as achieves a good balance between efficiency and\noutput quality.\n","authors":["Ye Wang","Zili Yi","Rui Ma"],"pdf_url":"https://arxiv.org/pdf/2403.11053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11052v1","updated":"2024-03-17T01:27:00Z","published":"2024-03-17T01:27:00Z","title":"Unveiling and Mitigating Memorization in Text-to-image Diffusion Models\n through Cross Attention","summary":" Recent advancements in text-to-image diffusion models have demonstrated their\nremarkable capability to generate high-quality images from textual prompts.\nHowever, increasing research indicates that these models memorize and replicate\nimages from their training data, raising tremendous concerns about potential\ncopyright infringement and privacy risks. In our study, we provide a novel\nperspective to understand this memorization phenomenon by examining its\nrelationship with cross-attention mechanisms. 
We reveal that during\nmemorization, the cross-attention tends to focus disproportionately on the\nembeddings of specific tokens. The diffusion model is overfitted to these token\nembeddings, memorizing corresponding training images. To elucidate this\nphenomenon, we further identify and discuss various intrinsic findings of\ncross-attention that contribute to memorization. Building on these insights, we\nintroduce an innovative approach to detect and mitigate memorization in\ndiffusion models. The advantage of our proposed method is that it will not\ncompromise the speed of either the training or the inference processes in these\nmodels while preserving the quality of generated images. Our code is available\nat https://github.com/renjie3/MemAttn .\n","authors":["Jie Ren","Yaxin Li","Shenglai Zen","Han Xu","Lingjuan Lyu","Yue Xing","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2403.11052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11050v1","updated":"2024-03-17T00:51:59Z","published":"2024-03-17T00:51:59Z","title":"Endora: Video Generation Models as Endoscopy Simulators","summary":" Generative models hold promise for revolutionizing medical education,\nrobot-assisted surgery, and data augmentation for machine learning. Despite\nprogress in generating 2D medical images, the complex domain of clinical video\ngeneration has largely remained untapped.This paper introduces \\model, an\ninnovative approach to generate medical videos that simulate clinical endoscopy\nscenes. We present a novel generative model design that integrates a\nmeticulously crafted spatial-temporal video transformer with advanced 2D vision\nfoundation model priors, explicitly modeling spatial-temporal dynamics during\nvideo generation. We also pioneer the first public benchmark for endoscopy\nsimulation with video generation models, adapting existing state-of-the-art\nmethods for this endeavor.Endora demonstrates exceptional visual quality in\ngenerating endoscopy videos, surpassing state-of-the-art methods in extensive\ntesting. Moreover, we explore how this endoscopy simulator can empower\ndownstream video analysis tasks and even generate 3D medical scenes with\nmulti-view consistency. In a nutshell, Endora marks a notable breakthrough in\nthe deployment of generative AI for clinical endoscopy research, setting a\nsubstantial stage for further advances in medical content generation. For more\ndetails, please visit our project page: https://endora-medvidgen.github.io/.\n","authors":["Chenxin Li","Hengyu Liu","Yifan Liu","Brandon Y. Feng","Wuyang Li","Xinyu Liu","Zhen Chen","Jing Shao","Yixuan Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.11050v1.pdf","comment":"Project page: https://endora-medvidgen.github.io/"},{"id":"http://arxiv.org/abs/2304.00436v2","updated":"2024-03-17T00:39:40Z","published":"2023-04-02T03:03:21Z","title":"Instance-Level Trojan Attacks on Visual Question Answering via\n Adversarial Learning in Neuron Activation Space","summary":" Trojan attacks embed perturbations in input data leading to malicious\nbehavior in neural network models. A combination of various Trojans in\ndifferent modalities enables an adversary to mount a sophisticated attack on\nmultimodal learning such as Visual Question Answering (VQA). However,\nmultimodal Trojans in conventional methods are susceptible to parameter\nadjustment during processes such as fine-tuning. 
To this end, we propose an\ninstance-level multimodal Trojan attack on VQA that efficiently adapts to\nfine-tuned models through a dual-modality adversarial learning method. This\nmethod compromises two specific neurons in a specific perturbation layer in the\npretrained model to produce overly large neuron activations. Then, a malicious\ncorrelation between these overactive neurons and the malicious output of a\nfine-tuned model is established through adversarial learning. Extensive\nexperiments are conducted using the VQA-v2 dataset, based on a wide range of\nmetrics including sample efficiency, stealthiness, and robustness. The proposed\nattack demonstrates enhanced performance with diverse vision and text Trojans\ntailored for each sample. We demonstrate that the proposed attack can be\nefficiently adapted to different fine-tuned models, by injecting only a few\nshots of Trojan samples. Moreover, we investigate the attack performance under\nconventional defenses, where the defenses cannot effectively mitigate the\nattack.\n","authors":["Yuwei Sun","Hideya Ochiai","Jun Sakuma"],"pdf_url":"https://arxiv.org/pdf/2304.00436v2.pdf","comment":"Accepted for IJCNN 2024"},{"id":"http://arxiv.org/abs/2403.11047v1","updated":"2024-03-17T00:14:29Z","published":"2024-03-17T00:14:29Z","title":"From Pixels to Predictions: Spectrogram and Vision Transformer for\n Better Time Series Forecasting","summary":" Time series forecasting plays a crucial role in decision-making across\nvarious domains, but it presents significant challenges. Recent studies have\nexplored image-driven approaches using computer vision models to address these\nchallenges, often employing lineplots as the visual representation of time\nseries data. In this paper, we propose a novel approach that uses\ntime-frequency spectrograms as the visual representation of time series data.\nWe introduce the use of a vision transformer for multimodal learning,\nshowcasing the advantages of our approach across diverse datasets from\ndifferent domains. To evaluate its effectiveness, we compare our method against\nstatistical baselines (EMA and ARIMA), a state-of-the-art deep learning-based\napproach (DeepAR), other visual representations of time series data (lineplot\nimages), and an ablation study on using only the time series as input. Our\nexperiments demonstrate the benefits of utilizing spectrograms as a visual\nrepresentation for time series data, along with the advantages of employing a\nvision transformer for simultaneous learning in both the time and frequency\ndomains.\n","authors":["Zhen Zeng","Rachneet Kaur","Suchetha Siddagangappa","Tucker Balch","Manuela Veloso"],"pdf_url":"https://arxiv.org/pdf/2403.11047v1.pdf","comment":"Published at ACM ICAIF 2023"},{"id":"http://arxiv.org/abs/2403.11116v1","updated":"2024-03-17T06:53:44Z","published":"2024-03-17T06:53:44Z","title":"PhD: A Prompted Visual Hallucination Evaluation Dataset","summary":" The rapid growth of Large Language Models (LLMs) has driven the development\nof Large Vision-Language Models (LVLMs). The challenge of hallucination,\nprevalent in LLMs, also emerges in LVLMs. However, most existing efforts mainly\nfocus on object hallucination in LVLM, ignoring diverse types of LVLM\nhallucinations. In this study, we delve into the Intrinsic Vision-Language\nHallucination (IVL-Hallu) issue, thoroughly analyzing different types of\nIVL-Hallu on their causes and reflections. 
Specifically, we propose several\nnovel IVL-Hallu tasks and categorize them into four types: (a) object\nhallucination, which arises from the misidentification of objects, (b)\nattribute hallucination, which is caused by the misidentification of\nattributes, (c) multi-modal conflicting hallucination, which derives from the\ncontradictions between textual and visual information, and (d)\ncounter-common-sense hallucination, which owes to the contradictions between\nthe LVLM knowledge and actual images. Based on these taxonomies, we propose a\nmore challenging benchmark named PhD to evaluate and explore IVL-Hallu. An\nautomated pipeline is proposed for generating different types of IVL-Hallu\ndata. Extensive experiments on five SOTA LVLMs reveal their inability to\neffectively tackle our proposed IVL-Hallu tasks, with detailed analyses and\ninsights on the origins and possible solutions of these new challenging\nIVL-Hallu tasks, facilitating future researches on IVL-Hallu and LVLM. The\nbenchmark can be accessed at https://github.com/jiazhen-code/IntrinsicHallu\n","authors":["Jiazhen Liu","Yuhan Fu","Ruobing Xie","Runquan Xie","Xingwu Sun","Fengzong Lian","Zhanhui Kang","Xirong Li"],"pdf_url":"https://arxiv.org/pdf/2403.11116v1.pdf","comment":null}]},"2024-03-16T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.11038v1","updated":"2024-03-16T23:01:51Z","published":"2024-03-16T23:01:51Z","title":"Texture Edge detection by Patch consensus (TEP)","summary":" We propose Texture Edge detection using Patch consensus (TEP) which is a\ntraining-free method to detect the boundary of texture. We propose a new simple\nway to identify the texture edge location, using the consensus of segmented\nlocal patch information. While on the boundary, even using local patch\ninformation, the distinction between textures are typically not clear, but\nusing neighbor consensus give a clear idea of the boundary. We utilize local\npatch, and its response against neighboring regions, to emphasize the\nsimilarities and the differences across different textures. The step of\nsegmentation of response further emphasizes the edge location, and the\nneighborhood voting gives consensus and stabilize the edge detection. We\nanalyze texture as a stationary process to give insight into the patch width\nparameter verses the quality of edge detection. We derive the necessary\ncondition for textures to be distinguished, and analyze the patch width with\nrespect to the scale of textures. Various experiments are presented to validate\nthe proposed model.\n","authors":["Guangyu Cui","Sung Ha Kang"],"pdf_url":"https://arxiv.org/pdf/2403.11038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11035v1","updated":"2024-03-16T22:49:47Z","published":"2024-03-16T22:49:47Z","title":"Multiplane Quantitative Phase Imaging Using a Wavelength-Multiplexed\n Diffractive Optical Processor","summary":" Quantitative phase imaging (QPI) is a label-free technique that provides\noptical path length information for transparent specimens, finding utility in\nbiology, materials science, and engineering. Here, we present quantitative\nphase imaging of a 3D stack of phase-only objects using a\nwavelength-multiplexed diffractive optical processor. 
Utilizing multiple\nspatially engineered diffractive layers trained through deep learning, this\ndiffractive processor can transform the phase distributions of multiple 2D\nobjects at various axial positions into intensity patterns, each encoded at a\nunique wavelength channel. These wavelength-multiplexed patterns are projected\nonto a single field-of-view (FOV) at the output plane of the diffractive\nprocessor, enabling the capture of quantitative phase distributions of input\nobjects located at different axial planes using an intensity-only image sensor.\nBased on numerical simulations, we show that our diffractive processor could\nsimultaneously achieve all-optical quantitative phase imaging across several\ndistinct axial planes at the input by scanning the illumination wavelength. A\nproof-of-concept experiment with a 3D-fabricated diffractive processor further\nvalidated our approach, showcasing successful imaging of two distinct phase\nobjects at different axial positions by scanning the illumination wavelength in\nthe terahertz spectrum. Diffractive network-based multiplane QPI designs can\nopen up new avenues for compact on-chip phase imaging and sensing devices.\n","authors":["Che-Yung Shen","Jingxi Li","Tianyi Gan","Yuhang Li","Langxing Bai","Mona Jarrahi","Aydogan Ozcan"],"pdf_url":"https://arxiv.org/pdf/2403.11035v1.pdf","comment":"27 Pages, 9 Figures"},{"id":"http://arxiv.org/abs/2403.11032v1","updated":"2024-03-16T22:35:21Z","published":"2024-03-16T22:35:21Z","title":"FH-TabNet: Multi-Class Familial Hypercholesterolemia Detection via a\n Multi-Stage Tabular Deep Learning","summary":" Familial Hypercholesterolemia (FH) is a genetic disorder characterized by\nelevated levels of Low-Density Lipoprotein (LDL) cholesterol or its associated\ngenes. Early-stage and accurate categorization of FH is of significance\nallowing for timely interventions to mitigate the risk of life-threatening\nconditions. Conventional diagnosis approach, however, is complex, costly, and a\nchallenging interpretation task even for experienced clinicians resulting in\nhigh underdiagnosis rates. Although there has been a recent surge of interest\nin using Machine Learning (ML) models for early FH detection, existing\nsolutions only consider a binary classification task solely using classical ML\nmodels. Despite its significance, application of Deep Learning (DL) for FH\ndetection is in its infancy, possibly, due to categorical nature of the\nunderlying clinical data. The paper addresses this gap by introducing the\nFH-TabNet, which is a multi-stage tabular DL network for multi-class (Definite,\nProbable, Possible, and Unlikely) FH detection. The FH-TabNet initially\ninvolves applying a deep tabular data learning architecture (TabNet) for\nprimary categorization into healthy (Possible/Unlikely) and patient\n(Probable/Definite) classes. Subsequently, independent TabNet classifiers are\napplied to each subgroup, enabling refined classification. 
The model's\nperformance is evaluated through 5-fold cross-validation illustrating superior\nperformance in categorizing FH patients, particularly in the challenging\nlow-prevalence subcategories.\n","authors":["Sadaf Khademi","Zohreh Hajiakhondi","Golnaz Vaseghi","Nizal Sarrafzadegan","Arash Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2403.11032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11027v1","updated":"2024-03-16T22:14:56Z","published":"2024-03-16T22:14:56Z","title":"Reward Guided Latent Consistency Distillation","summary":" Latent Consistency Distillation (LCD) has emerged as a promising paradigm for\nefficient text-to-image synthesis. By distilling a latent consistency model\n(LCM) from a pre-trained teacher latent diffusion model (LDM), LCD facilitates\nthe generation of high-fidelity images within merely 2 to 4 inference steps.\nHowever, the LCM's efficient inference is obtained at the cost of the sample\nquality. In this paper, we propose compensating the quality loss by aligning\nLCM's output with human preference during training. Specifically, we introduce\nReward Guided LCD (RG-LCD), which integrates feedback from a reward model (RM)\ninto the LCD process by augmenting the original LCD loss with the objective of\nmaximizing the reward associated with LCM's single-step generation. As\nvalidated through human evaluation, when trained with the feedback of a good\nRM, the 2-step generations from our RG-LCM are favored by humans over the\n50-step DDIM samples from the teacher LDM, representing a 25 times inference\nacceleration without quality loss.\n As directly optimizing towards differentiable RMs can suffer from\nover-optimization, we overcome this difficulty by proposing the use of a latent\nproxy RM (LRM). This novel component serves as an intermediary, connecting our\nLCM with the RM. Empirically, we demonstrate that incorporating the LRM into\nour RG-LCD successfully avoids high-frequency noise in the generated images,\ncontributing to both improved FID on MS-COCO and a higher HPSv2.1 score on\nHPSv2's test set, surpassing those achieved by the baseline LCM.\n","authors":["Jiachen Li","Weixi Feng","Wenhu Chen","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11027v1.pdf","comment":"Project page: https://rg-lcd.github.io/"},{"id":"http://arxiv.org/abs/2403.11026v1","updated":"2024-03-16T22:01:55Z","published":"2024-03-16T22:01:55Z","title":"EfficientMorph: Parameter-Efficient Transformer-Based Architecture for\n 3D Image Registration","summary":" Transformers have emerged as the state-of-the-art architecture in medical\nimage registration, outperforming convolutional neural networks (CNNs) by\naddressing their limited receptive fields and overcoming gradient instability\nin deeper models. Despite their success, transformer-based models require\nsubstantial resources for training, including data, memory, and computational\npower, which may restrict their applicability for end users with limited\nresources. In particular, existing transformer-based 3D image registration\narchitectures face three critical gaps that challenge their efficiency and\neffectiveness. Firstly, while mitigating the quadratic complexity of full\nattention by focusing on local regions, window-based attention mechanisms often\nfail to adequately integrate local and global information. 
Secondly, feature\nsimilarities across attention heads that were recently found in multi-head\nattention architectures indicate a significant computational redundancy,\nsuggesting that the capacity of the network could be better utilized to enhance\nperformance. Lastly, the granularity of tokenization, a key factor in\nregistration accuracy, presents a trade-off; smaller tokens improve detail\ncapture at the cost of higher computational complexity, increased memory\ndemands, and a risk of overfitting. Here, we propose EfficientMorph, a\ntransformer-based architecture for unsupervised 3D image registration. It\noptimizes the balance between local and global attention through a plane-based\nattention mechanism, reduces computational redundancy via cascaded group\nattention, and captures fine details without compromising computational\nefficiency, thanks to a Hi-Res tokenization strategy complemented by merging\noperations. Notably, EfficientMorph sets a new benchmark for performance on the\nOASIS dataset with 16-27x fewer parameters.\n","authors":["Abu Zahid Bin Aziz","Mokshagna Sai Teja Karanam","Tushar Kataria","Shireen Y. Elhabian"],"pdf_url":"https://arxiv.org/pdf/2403.11026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11024v1","updated":"2024-03-16T22:00:16Z","published":"2024-03-16T22:00:16Z","title":"Fast Sparse View Guided NeRF Update for Object Reconfigurations","summary":" Neural Radiance Field (NeRF), as an implicit 3D scene representation, lacks\ninherent ability to accommodate changes made to the initial static scene. If\nobjects are reconfigured, it is difficult to update the NeRF to reflect the new\nstate of the scene without time-consuming data re-capturing and NeRF\nre-training. To address this limitation, we develop the first update method for\nNeRFs to physical changes. Our method takes only sparse new images (e.g. 4) of\nthe altered scene as extra inputs and update the pre-trained NeRF in around 1\nto 2 minutes. Particularly, we develop a pipeline to identify scene changes and\nupdate the NeRF accordingly. Our core idea is the use of a second helper NeRF\nto learn the local geometry and appearance changes, which sidesteps the\noptimization difficulties in direct NeRF fine-tuning. The interpolation power\nof the helper NeRF is the key to accurately reconstruct the un-occluded objects\nregions under sparse view supervision. Our method imposes no constraints on\nNeRF pre-training, and requires no extra user input or explicit semantic\npriors. It is an order of magnitude faster than re-training NeRF from scratch\nwhile maintaining on-par and even superior performance.\n","authors":["Ziqi Lu","Jianbo Ye","Xiaohan Fei","Xiaolong Li","Jiawei Mo","Ashwin Swaminathan","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2403.11024v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00761v3","updated":"2024-03-16T21:59:21Z","published":"2023-07-03T05:38:28Z","title":"Learning Degradation-Independent Representations for Camera ISP\n Pipelines","summary":" Image signal processing (ISP) pipeline plays a fundamental role in digital\ncameras, which converts raw Bayer sensor data to RGB images. However,\nISP-generated images usually suffer from imperfections due to the compounded\ndegradations that stem from sensor noises, demosaicing noises, compression\nartifacts, and possibly adverse effects of erroneous ISP hyperparameter\nsettings such as ISO and gamma values. In a general sense, these ISP\nimperfections can be considered as degradations. 
The highly complex mechanisms\nof ISP degradations, some of which are even unknown, pose great challenges to\nthe generalization capability of deep neural networks (DNN) for image\nrestoration and to their adaptability to downstream tasks. To tackle the\nissues, we propose a novel DNN approach to learn degradation-independent\nrepresentations (DiR) through the refinement of a self-supervised learned\nbaseline representation. The proposed DiR learning technique has remarkable\ndomain generalization capability and consequently, it outperforms\nstate-of-the-art methods across various downstream tasks, including blind image\nrestoration, object detection, and instance segmentation, as verified in our\nexperiments.\n","authors":["Yanhui Guo","Fangzhou Luo","Xiaolin Wu"],"pdf_url":"https://arxiv.org/pdf/2307.00761v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11021v1","updated":"2024-03-16T21:40:27Z","published":"2024-03-16T21:40:27Z","title":"Neuro-Symbolic Video Search","summary":" The unprecedented surge in video data production in recent years necessitates\nefficient tools to extract meaningful frames from videos for downstream tasks.\nLong-term temporal reasoning is a key desideratum for frame retrieval systems.\nWhile state-of-the-art foundation models, like VideoLLaMA and ViCLIP, are\nproficient in short-term semantic understanding, they surprisingly fail at\nlong-term reasoning across frames. A key reason for this failure is that they\nintertwine per-frame perception and temporal reasoning into a single deep\nnetwork. Hence, decoupling but co-designing semantic understanding and temporal\nreasoning is essential for efficient scene identification. We propose a system\nthat leverages vision-language models for semantic understanding of individual\nframes but effectively reasons about the long-term evolution of events using\nstate machines and temporal logic (TL) formulae that inherently capture memory.\nOur TL-based reasoning improves the F1 score of complex event identification by\n9-15% compared to benchmarks that use GPT4 for reasoning on state-of-the-art\nself-driving datasets such as Waymo and NuScenes.\n","authors":["Minkyu Choi","Harsh Goel","Mohammad Omama","Yunhao Yang","Sahil Shah","Sandeep Chinchali"],"pdf_url":"https://arxiv.org/pdf/2403.11021v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11008v1","updated":"2024-03-16T20:16:37Z","published":"2024-03-16T20:16:37Z","title":"MASSM: An End-to-End Deep Learning Framework for Multi-Anatomy\n Statistical Shape Modeling Directly From Images","summary":" Statistical Shape Modeling (SSM) is an effective method for quantitatively\nanalyzing anatomical variations within populations. However, its utility is\nlimited by the need for manual segmentations of anatomies, a task that relies\non the scarce expertise of medical professionals. Recent advances in deep\nlearning have provided a promising approach that automatically generates\nstatistical representations from unsegmented images. Once trained, these deep\nlearning-based models eliminate the need for manual segmentation for new\nsubjects. Nonetheless, most current methods still require manual pre-alignment\nof image volumes and specifying a bounding box around the target anatomy prior\nfor inference, resulting in a partially manual inference process. Recent\napproaches facilitate anatomy localization but only estimate statistical\nrepresentations at the population level. 
However, they cannot delineate anatomy\ndirectly in images and are limited to modeling a single anatomy. Here, we\nintroduce MASSM, a novel end-to-end deep learning framework that simultaneously\nlocalizes multiple anatomies in an image, estimates population-level\nstatistical representations, and delineates each anatomy. Our findings\nemphasize the crucial role of local correspondences, showcasing their\nindispensability in providing superior shape information for medical imaging\ntasks.\n","authors":["Janmesh Ukey","Tushar Kataria","Shireen Y. Elhabian"],"pdf_url":"https://arxiv.org/pdf/2403.11008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00754v2","updated":"2024-03-16T19:28:08Z","published":"2023-10-01T18:10:53Z","title":"Analyzing and Mitigating Object Hallucination in Large Vision-Language\n Models","summary":" Large vision-language models (LVLMs) have shown remarkable abilities in\nunderstanding visual information with human languages. However, LVLMs still\nsuffer from object hallucination, which is the problem of generating\ndescriptions that include objects that do not actually exist in the images.\nThis can negatively impact many vision-language tasks, such as visual\nsummarization and reasoning. To address this issue, we propose a simple yet\npowerful algorithm, LVLM Hallucination Revisor (LURE), to post-hoc rectify\nobject hallucination in LVLMs by reconstructing less hallucinatory\ndescriptions. LURE is grounded in a rigorous statistical analysis of the key\nfactors underlying object hallucination, including co-occurrence (the frequent\nappearance of certain objects alongside others in images), uncertainty (objects\nwith higher uncertainty during LVLM decoding), and object position\n(hallucination often appears in the later part of the generated text). LURE can\nalso be seamlessly integrated with any LVLMs. We evaluate LURE on six\nopen-source LVLMs, achieving a 23% improvement in general object hallucination\nevaluation metrics over the previous best approach. In both GPT and human\nevaluations, LURE consistently ranks at the top. Our data and code are\navailable at https://github.com/YiyangZhou/LURE.\n","authors":["Yiyang Zhou","Chenhang Cui","Jaehong Yoon","Linjun Zhang","Zhun Deng","Chelsea Finn","Mohit Bansal","Huaxiu Yao"],"pdf_url":"https://arxiv.org/pdf/2310.00754v2.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2403.11001v1","updated":"2024-03-16T19:11:57Z","published":"2024-03-16T19:11:57Z","title":"Topologically faithful multi-class segmentation in medical images","summary":" Topological accuracy in medical image segmentation is a highly important\nproperty for downstream applications such as network analysis and flow modeling\nin vessels or cell counting. Recently, significant methodological advancements\nhave brought well-founded concepts from algebraic topology to binary\nsegmentation. However, these approaches have been underexplored in multi-class\nsegmentation scenarios, where topological errors are common. We propose a\ngeneral loss function for topologically faithful multi-class segmentation\nextending the recent Betti matching concept, which is based on induced\nmatchings of persistence barcodes. We project the N-class segmentation problem\nto N single-class segmentation tasks, which allows us to use 1-parameter\npersistent homology making training of neural networks computationally\nfeasible. We validate our method on a comprehensive set of four medical\ndatasets with highly variant topological characteristics. 
Our loss formulation\nsignificantly enhances topological correctness in cardiac, cell, artery-vein,\nand Circle of Willis segmentation.\n","authors":["Alexander H. Berger","Nico Stucki","Laurin Lux","Vincent Buergin","Suprosanna Shit","Anna Banaszak","Daniel Rueckert","Ulrich Bauer","Johannes C. Paetzold"],"pdf_url":"https://arxiv.org/pdf/2403.11001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10997v1","updated":"2024-03-16T18:50:44Z","published":"2024-03-16T18:50:44Z","title":"N2F2: Hierarchical Scene Understanding with Nested Neural Feature Fields","summary":" Understanding complex scenes at multiple levels of abstraction remains a\nformidable challenge in computer vision. To address this, we introduce Nested\nNeural Feature Fields (N2F2), a novel approach that employs hierarchical\nsupervision to learn a single feature field, wherein different dimensions\nwithin the same high-dimensional feature encode scene properties at varying\ngranularities. Our method allows for a flexible definition of hierarchies,\ntailored to either the physical dimensions or semantics or both, thereby\nenabling a comprehensive and nuanced understanding of scenes. We leverage a 2D\nclass-agnostic segmentation model to provide semantically meaningful pixel\ngroupings at arbitrary scales in the image space, and query the CLIP\nvision-encoder to obtain language-aligned embeddings for each of these\nsegments. Our proposed hierarchical supervision method then assigns different\nnested dimensions of the feature field to distill the CLIP embeddings using\ndeferred volumetric rendering at varying physical scales, creating a\ncoarse-to-fine representation. Extensive experiments show that our approach\noutperforms the state-of-the-art feature field distillation methods on tasks\nsuch as open-vocabulary 3D segmentation and localization, demonstrating the\neffectiveness of the learned nested feature field.\n","authors":["Yash Bhalgat","Iro Laina","João F. Henriques","Andrew Zisserman","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2403.10997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08789v3","updated":"2024-03-16T18:38:18Z","published":"2023-07-17T19:17:10Z","title":"Creating Image Datasets in Agricultural Environments using DALL.E:\n Generative AI-Powered Large Language Model","summary":" This research investigated the role of artificial intelligence (AI),\nspecifically the DALL.E model by OpenAI, in advancing data generation and\nvisualization techniques in agriculture. DALL.E, an advanced AI image\ngenerator, works alongside ChatGPT's language processing to transform text\ndescriptions and image clues into realistic visual representations of the\ncontent. The study used both approaches of image generation: text-to-image and\nimage-to image (variation). Six types of datasets depicting fruit crop\nenvironment were generated. These AI-generated images were then compared\nagainst ground truth images captured by sensors in real agricultural fields.\nThe comparison was based on Peak Signal-to-Noise Ratio (PSNR) and Feature\nSimilarity Index (FSIM) metrics. The image-to-image generation exhibited a\n5.78% increase in average PSNR over text-to-image methods, signifying superior\nimage clarity and quality. However, this method also resulted in a 10.23%\ndecrease in average FSIM, indicating a diminished structural and textural\nsimilarity to the original images. 
Similar to these measures, human evaluation\nalso showed that images generated using image-to-image-based method were more\nrealistic compared to those generated with text-to-image approach. The results\nhighlighted DALL.E's potential in generating realistic agricultural image\ndatasets and thus accelerating the development and adoption of imaging-based\nprecision agricultural solutions.\n","authors":["Ranjan Sapkota","Dawood Ahmed","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2307.08789v3.pdf","comment":"9 Figures, 1 table, 17 pages"},{"id":"http://arxiv.org/abs/2211.14769v4","updated":"2024-03-16T18:35:52Z","published":"2022-11-27T09:01:31Z","title":"Navigation as Attackers Wish? Towards Building Robust Embodied Agents\n under Federated Learning","summary":" Federated embodied agent learning protects the data privacy of individual\nvisual environments by keeping data locally at each client (the individual\nenvironment) during training. However, since the local data is inaccessible to\nthe server under federated learning, attackers may easily poison the training\ndata of the local client to build a backdoor in the agent without notice.\nDeploying such an agent raises the risk of potential harm to humans, as the\nattackers may easily navigate and control the agent as they wish via the\nbackdoor. Towards Byzantine-robust federated embodied agent learning, in this\npaper, we study the attack and defense for the task of vision-and-language\nnavigation (VLN), where the agent is required to follow natural language\ninstructions to navigate indoor environments. First, we introduce a simple but\neffective attack strategy, Navigation as Wish (NAW), in which the malicious\nclient manipulates local trajectory data to implant a backdoor into the global\nmodel. Results on two VLN datasets (R2R and RxR) show that NAW can easily\nnavigate the deployed VLN agent regardless of the language instruction, without\naffecting its performance on normal test sets. Then, we propose a new\nPrompt-Based Aggregation (PBA) to defend against the NAW attack in federated\nVLN, which provides the server with a ''prompt'' of the vision-and-language\nalignment variance between the benign and malicious clients so that they can be\ndistinguished during training. We validate the effectiveness of the PBA method\non protecting the global model from the NAW attack, which outperforms other\nstate-of-the-art defense methods by a large margin in the defense metrics on\nR2R and RxR.\n","authors":["Yunchao Zhang","Zonglin Di","Kaiwen Zhou","Cihang Xie","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2211.14769v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10988v1","updated":"2024-03-16T18:04:12Z","published":"2024-03-16T18:04:12Z","title":"Boosting Flow-based Generative Super-Resolution Models via Learned Prior","summary":" Flow-based super-resolution (SR) models have demonstrated astonishing\ncapabilities in generating high-quality images. However, these methods\nencounter several challenges during image generation, such as grid artifacts,\nexploding inverses, and suboptimal results due to a fixed sampling temperature.\nTo overcome these issues, this work introduces a conditional learned prior to\nthe inference phase of a flow-based SR model. This prior is a latent code\npredicted by our proposed latent module conditioned on the low-resolution\nimage, which is then transformed by the flow model into an SR image. 
Our\nframework is designed to seamlessly integrate with any contemporary flow-based\nSR model without modifying its architecture or pre-trained weights. We evaluate\nthe effectiveness of our proposed framework through extensive experiments and\nablation analyses. The proposed framework successfully addresses all the\ninherent issues in flow-based SR models and enhances their performance in\nvarious SR scenarios. Our code is available at:\nhttps://github.com/liyuantsao/FlowSR-LP\n","authors":["Li-Yuan Tsao","Yi-Chen Lo","Chia-Che Chang","Hao-Wei Chen","Roy Tseng","Chien Feng","Chun-Yi Lee"],"pdf_url":"https://arxiv.org/pdf/2403.10988v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2403.06495v3","updated":"2024-03-16T17:56:53Z","published":"2024-03-11T08:07:46Z","title":"Toward Generalist Anomaly Detection via In-context Residual Learning\n with Few-shot Sample Prompts","summary":" This paper explores the problem of Generalist Anomaly Detection (GAD), aiming\nto train one single detection model that can generalize to detect anomalies in\ndiverse datasets from different application domains without any further\ntraining on the target data. Some recent studies have shown that large\npre-trained Visual-Language Models (VLMs) like CLIP have strong generalization\ncapabilities on detecting industrial defects from various datasets, but their\nmethods rely heavily on handcrafted text prompts about defects, making them\ndifficult to generalize to anomalies in other applications, e.g., medical image\nanomalies or semantic anomalies in natural images. In this work, we propose to\ntrain a GAD model with few-shot normal images as sample prompts for AD on\ndiverse datasets on the fly. To this end, we introduce a novel approach that\nlearns an in-context residual learning model for GAD, termed InCTRL. It is\ntrained on an auxiliary dataset to discriminate anomalies from normal samples\nbased on a holistic evaluation of the residuals between query images and\nfew-shot normal sample prompts. Regardless of the datasets, per definition of\nanomaly, larger residuals are expected for anomalies than normal samples,\nthereby enabling InCTRL to generalize across different domains without further\ntraining. Comprehensive experiments on nine AD datasets are performed to\nestablish a GAD benchmark that encapsulate the detection of industrial defect\nanomalies, medical anomalies, and semantic anomalies in both one-vs-all and\nmulti-class setting, on which InCTRL is the best performer and significantly\noutperforms state-of-the-art competing methods. Code is available at\nhttps://github.com/mala-lab/InCTRL.\n","authors":["Jiawen Zhu","Guansong Pang"],"pdf_url":"https://arxiv.org/pdf/2403.06495v3.pdf","comment":"Accepted to CVPR 2024; 17 pages; 5 figures"},{"id":"http://arxiv.org/abs/2311.14265v2","updated":"2024-03-16T17:38:53Z","published":"2023-11-24T03:43:59Z","title":"Adaptive Calibration: A Unified Conversion Framework of Spiking Neural\n Networks","summary":" Spiking Neural Networks (SNNs) have emerged as a promising energy-efficient\nalternative to traditional Artificial Neural Networks (ANNs). Despite this,\nbridging the performance gap with ANNs in practical scenarios remains a\nsignificant challenge. This paper focuses on addressing the dual objectives of\nenhancing the performance and efficiency of SNNs through the established SNN\nCalibration conversion framework. 
Inspired by the biological nervous system, we\npropose a novel Adaptive-Firing Neuron Model (AdaFire) that dynamically adjusts\nfiring patterns across different layers, substantially reducing conversion\nerrors within limited timesteps. Moreover, to meet our efficiency objectives,\nwe propose two novel strategies: an Sensitivity Spike Compression (SSC)\ntechnique and an Input-aware Adaptive Timesteps (IAT) technique. These\ntechniques synergistically reduce both energy consumption and latency during\nthe conversion process, thereby enhancing the overall efficiency of SNNs.\nExtensive experiments demonstrate our approach outperforms state-of-the-art\nSNNs methods, showcasing superior performance and efficiency in 2D, 3D, and\nevent-driven classification, as well as object detection and segmentation\ntasks.\n","authors":["Ziqing Wang","Yuetong Fang","Jiahang Cao","Renjing Xu"],"pdf_url":"https://arxiv.org/pdf/2311.14265v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2403.10983v1","updated":"2024-03-16T17:30:15Z","published":"2024-03-16T17:30:15Z","title":"OMG: Occlusion-friendly Personalized Multi-concept Generation in\n Diffusion Models","summary":" Personalization is an important topic in text-to-image generation, especially\nthe challenging multi-concept personalization. Current multi-concept methods\nare struggling with identity preservation, occlusion, and the harmony between\nforeground and background. In this work, we propose OMG, an occlusion-friendly\npersonalized generation framework designed to seamlessly integrate multiple\nconcepts within a single image. We propose a novel two-stage sampling solution.\nThe first stage takes charge of layout generation and visual comprehension\ninformation collection for handling occlusions. The second one utilizes the\nacquired visual comprehension information and the designed noise blending to\nintegrate multiple concepts while considering occlusions. We also observe that\nthe initiation denoising timestep for noise blending is the key to identity\npreservation and layout. Moreover, our method can be combined with various\nsingle-concept models, such as LoRA and InstantID without additional tuning.\nEspecially, LoRA models on civitai.com can be exploited directly. Extensive\nexperiments demonstrate that OMG exhibits superior performance in multi-concept\npersonalization.\n","authors":["Zhe Kong","Yong Zhang","Tianyu Yang","Tao Wang","Kaihao Zhang","Bizhu Wu","Guanying Chen","Wei Liu","Wenhan Luo"],"pdf_url":"https://arxiv.org/pdf/2403.10983v1.pdf","comment":"Homepage: https://kongzhecn.github.io/omg-project/ Github:\n https://github.com/kongzhecn/OMG/"},{"id":"http://arxiv.org/abs/2403.10981v1","updated":"2024-03-16T17:24:46Z","published":"2024-03-16T17:24:46Z","title":"Automatic Spatial Calibration of Near-Field MIMO Radar With Respect to\n Optical Sensors","summary":" Despite an emerging interest in MIMO radar, the utilization of its\ncomplementary strengths in combination with optical sensors has so far been\nlimited to far-field applications, due to the challenges that arise from mutual\nsensor calibration in the near field. In fact, most related approaches in the\nautonomous industry propose target-based calibration methods using corner\nreflectors that have proven to be unsuitable for the near field. In contrast,\nwe propose a novel, joint calibration approach for optical RGB-D sensors and\nMIMO radars that is designed to operate in the radar's near-field range, within\ndecimeters from the sensors. 
Our pipeline consists of a bespoke calibration\ntarget, allowing for automatic target detection and localization, followed by\nthe spatial calibration of the two sensor coordinate systems through target\nregistration. We validate our approach using two different depth sensing\ntechnologies from the optical domain. The experiments show the efficiency and\naccuracy of our calibration for various target displacements, as well as its\nrobustness of our localization in terms of signal ambiguities.\n","authors":["Vanessa Wirth","Johanna Bräunig","Danti Khouri","Florian Gutsche","Martin Vossiek","Tim Weyrich","Marc Stamminger"],"pdf_url":"https://arxiv.org/pdf/2403.10981v1.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2403.10971v1","updated":"2024-03-16T17:02:50Z","published":"2024-03-16T17:02:50Z","title":"Task-Aware Low-Rank Adaptation of Segment Anything Model","summary":" The Segment Anything Model (SAM), with its remarkable zero-shot capability,\nhas been proven to be a powerful foundation model for image segmentation tasks,\nwhich is an important task in computer vision. However, the transfer of its\nrich semantic information to multiple different downstream tasks remains\nunexplored. In this paper, we propose the Task-Aware Low-Rank Adaptation\n(TA-LoRA) method, which enables SAM to work as a foundation model for\nmulti-task learning. Specifically, TA-LoRA injects an update parameter tensor\ninto each layer of the encoder in SAM and leverages a low-rank tensor\ndecomposition method to incorporate both task-shared and task-specific\ninformation. Furthermore, we introduce modified SAM (mSAM) for multi-task\nlearning where we remove the prompt encoder of SAM and use task-specific no\nmask embeddings and mask decoder for each task. Extensive experiments conducted\non benchmark datasets substantiate the efficacy of TA-LoRA in enhancing the\nperformance of mSAM across multiple downstream tasks.\n","authors":["Xuehao Wang","Feiyang Ye","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.10971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03253v2","updated":"2024-03-16T16:39:07Z","published":"2024-01-06T16:33:39Z","title":"VLLaVO: Mitigating Visual Gap through LLMs","summary":" Recent advances achieved by deep learning models rely on the independent and\nidentically distributed assumption, hindering their applications in real-world\nscenarios with domain shifts. To tackle this issue, cross-domain learning aims\nat extracting domain-invariant knowledge to reduce the domain shift between\ntraining and testing data. However, in visual cross-domain learning,\ntraditional methods concentrate solely on the image modality, disregarding the\npotential benefits of incorporating the text modality. In this work, we propose\nVLLaVO, combining Vision language models and Large Language models as Visual\ncross-dOmain learners. VLLaVO uses vision-language models to convert images\ninto detailed textual descriptions. A large language model is then finetuned on\ntextual descriptions of the source/target domain generated by a designed\ninstruction template. 
Extensive experimental results under domain\ngeneralization and unsupervised domain adaptation settings demonstrate the\neffectiveness of the proposed method.\n","authors":["Shuhao Chen","Yulong Zhang","Weisen Jiang","Jiangang Lu","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.03253v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10962v1","updated":"2024-03-16T16:17:44Z","published":"2024-03-16T16:17:44Z","title":"Exploiting Topological Prior for Boosting Point Cloud Generation","summary":" This paper presents an innovative enhancement to the Sphere as Prior\nGenerative Adversarial Network (SP-GAN) model, a state-of-the-art GAN designed\nfor point cloud generation. A novel method is introduced for point cloud\ngeneration that elevates the structural integrity and overall quality of the\ngenerated point clouds by incorporating topological priors into the training\nprocess of the generator. Specifically, this work utilizes the K-means\nalgorithm to segment a point cloud from the repository into clusters and\nextract centroids, which are then used as priors in the generation process of\nthe SP-GAN. Furthermore, the discriminator component of the SP-GAN utilizes the\nidentical point cloud that contributed the centroids, ensuring a coherent and\nconsistent learning environment. This strategic use of centroids as intuitive\nguides not only boosts the efficiency of global feature learning but also\nsubstantially improves the structural coherence and fidelity of the generated\npoint clouds. By applying the K-means algorithm to generate centroids as the\nprior, the work intuitively and experimentally demonstrates that such a prior\nenhances the quality of generated point clouds.\n","authors":["Baiyuan Chen"],"pdf_url":"https://arxiv.org/pdf/2403.10962v1.pdf","comment":"7 pages, 3 figures, AIDML 2024"},{"id":"http://arxiv.org/abs/2312.02520v2","updated":"2024-03-16T16:07:33Z","published":"2023-12-05T06:02:21Z","title":"Towards More Unified In-context Visual Understanding","summary":" The rapid advancement of large language models (LLMs) has accelerated the\nemergence of in-context learning (ICL) as a cutting-edge approach in the\nnatural language processing domain. Recently, ICL has been employed in visual\nunderstanding tasks, such as semantic segmentation and image captioning,\nyielding promising results. However, existing visual ICL framework can not\nenable producing content across multiple modalities, which limits their\npotential usage scenarios. To address this issue, we present a new ICL\nframework for visual understanding with multi-modal output enabled. First, we\nquantize and embed both text and visual prompt into a unified representational\nspace, structured as interleaved in-context sequences. Then a decoder-only\nsparse transformer architecture is employed to perform generative modeling on\nthem, facilitating in-context learning. Thanks to this design, the model is\ncapable of handling in-context vision understanding tasks with multimodal\noutput in a unified pipeline.Experimental results demonstrate that our model\nachieves competitive performance compared with specialized models and previous\nICL baselines. 
Overall, our research takes a further step toward unified\nmultimodal in-context learning.\n","authors":["Dianmo Sheng","Dongdong Chen","Zhentao Tan","Qiankun Liu","Qi Chu","Jianmin Bao","Tao Gong","Bin Liu","Shengwei Xu","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2312.02520v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.10953v1","updated":"2024-03-16T15:39:23Z","published":"2024-03-16T15:39:23Z","title":"Ctrl123: Consistent Novel View Synthesis via Closed-Loop Transcription","summary":" Large image diffusion models have demonstrated zero-shot capability in novel\nview synthesis (NVS). However, existing diffusion-based NVS methods struggle to\ngenerate novel views that are accurately consistent with the corresponding\nground truth poses and appearances, even on the training set. This consequently\nlimits the performance of downstream tasks, such as image-to-multiview\ngeneration and 3D reconstruction. We realize that such inconsistency is largely\ndue to the fact that it is difficult to enforce accurate pose and appearance\nalignment directly in the diffusion training, as mostly done by existing\nmethods such as Zero123. To remedy this problem, we propose Ctrl123, a\nclosed-loop transcription-based NVS diffusion method that enforces alignment\nbetween the generated view and ground truth in a pose-sensitive feature space.\nOur extensive experiments demonstrate the effectiveness of Ctrl123 on the tasks\nof NVS and 3D reconstruction, achieving significant improvements in both\nmultiview-consistency and pose-consistency over existing methods.\n","authors":["Hongxiang Zhao","Xili Dai","Jianan Wang","Shengbang Tong","Jingyuan Zhang","Weida Wang","Lei Zhang","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2403.10953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06725v2","updated":"2024-03-16T15:16:48Z","published":"2023-12-11T05:20:52Z","title":"EpiDiff: Enhancing Multi-View Synthesis via Localized\n Epipolar-Constrained Diffusion","summary":" Generating multiview images from a single view facilitates the rapid\ngeneration of a 3D mesh conditioned on a single image. Recent methods that\nintroduce 3D global representation into diffusion models have shown the\npotential to generate consistent multiviews, but they have reduced generation\nspeed and face challenges in maintaining generalizability and quality. To\naddress this issue, we propose EpiDiff, a localized interactive multiview\ndiffusion model. At the core of the proposed approach is to insert a\nlightweight epipolar attention block into the frozen diffusion model,\nleveraging epipolar constraints to enable cross-view interaction among feature\nmaps of neighboring views. The newly initialized 3D modeling module preserves\nthe original feature distribution of the diffusion model, exhibiting\ncompatibility with a variety of base diffusion models. Experiments show that\nEpiDiff generates 16 multiview images in just 12 seconds, and it surpasses\nprevious methods in quality evaluation metrics, including PSNR, SSIM and LPIPS.\nAdditionally, EpiDiff can generate a more diverse distribution of views,\nimproving the reconstruction quality from generated multiviews. 
Please see our\nproject page at https://huanngzh.github.io/EpiDiff/.\n","authors":["Zehuan Huang","Hao Wen","Junting Dong","Yaohui Wang","Yangguang Li","Xinyuan Chen","Yan-Pei Cao","Ding Liang","Yu Qiao","Bo Dai","Lu Sheng"],"pdf_url":"https://arxiv.org/pdf/2312.06725v2.pdf","comment":"Project page: https://huanngzh.github.io/EpiDiff/"},{"id":"http://arxiv.org/abs/2403.10942v1","updated":"2024-03-16T14:58:58Z","published":"2024-03-16T14:58:58Z","title":"ScanTalk: 3D Talking Heads from Unregistered Scans","summary":" Speech-driven 3D talking heads generation has emerged as a significant area\nof interest among researchers, presenting numerous challenges. Existing methods\nare constrained by animating faces with fixed topologies, wherein point-wise\ncorrespondence is established, and the number and order of points remains\nconsistent across all identities the model can animate. In this work, we\npresent ScanTalk, a novel framework capable of animating 3D faces in arbitrary\ntopologies including scanned data. Our approach relies on the DiffusionNet\narchitecture to overcome the fixed topology constraint, offering promising\navenues for more flexible and realistic 3D animations. By leveraging the power\nof DiffusionNet, ScanTalk not only adapts to diverse facial structures but also\nmaintains fidelity when dealing with scanned data, thereby enhancing the\nauthenticity and versatility of generated 3D talking heads. Through\ncomprehensive comparisons with state-of-the-art methods, we validate the\nefficacy of our approach, demonstrating its capacity to generate realistic\ntalking heads comparable to existing techniques. While our primary objective is\nto develop a generic method free from topological constraints, all\nstate-of-the-art methodologies are bound by such limitations. Code for\nreproducing our results, and the pre-trained model will be made available.\n","authors":["Federico Nocentini","Thomas Besnier","Claudio Ferrari","Sylvain Arguillere","Stefano Berretti","Mohamed Daoudi"],"pdf_url":"https://arxiv.org/pdf/2403.10942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10936v1","updated":"2024-03-16T14:30:25Z","published":"2024-03-16T14:30:25Z","title":"Channel-wise Feature Decorrelation for Enhanced Learned Image\n Compression","summary":" The emerging Learned Compression (LC) replaces the traditional codec modules\nwith Deep Neural Networks (DNN), which are trained end-to-end for\nrate-distortion performance. This approach is considered as the future of\nimage/video compression, and major efforts have been dedicated to improving its\ncompression efficiency. However, most proposed works target compression\nefficiency by employing more complex DNNS, which contributes to higher\ncomputational complexity. Alternatively, this paper proposes to improve\ncompression by fully exploiting the existing DNN capacity. To do so, the latent\nfeatures are guided to learn a richer and more diverse set of features, which\ncorresponds to better reconstruction. A channel-wise feature decorrelation loss\nis designed and is integrated into the LC optimization. Three strategies are\nproposed and evaluated, which optimize (1) the transformation network, (2) the\ncontext model, and (3) both networks. Experimental results on two established\nLC methods show that the proposed method improves the compression with a\nBD-Rate of up to 8.06%, with no added complexity. 
The proposed solution can be\napplied as a plug-and-play solution to optimize any similar LC method.\n","authors":["Farhad Pakdaman","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2403.10936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10935v1","updated":"2024-03-16T14:23:17Z","published":"2024-03-16T14:23:17Z","title":"Understanding Robustness of Visual State Space Models for Image\n Classification","summary":" Visual State Space Model (VMamba) has recently emerged as a promising\narchitecture, exhibiting remarkable performance in various computer vision\ntasks. However, its robustness has not yet been thoroughly studied. In this\npaper, we delve into the robustness of this architecture through comprehensive\ninvestigations from multiple perspectives. Firstly, we investigate its\nrobustness to adversarial attacks, employing both whole-image and\npatch-specific adversarial attacks. Results demonstrate superior adversarial\nrobustness compared to Transformer architectures while revealing scalability\nweaknesses. Secondly, the general robustness of VMamba is assessed against\ndiverse scenarios, including natural adversarial examples, out-of-distribution\ndata, and common corruptions. VMamba exhibits exceptional generalizability with\nout-of-distribution data but shows scalability weaknesses against natural\nadversarial examples and common corruptions. Additionally, we explore VMamba's\ngradients and back-propagation during white-box attacks, uncovering unique\nvulnerabilities and defensive capabilities of its novel components. Lastly, the\nsensitivity of VMamba to image structure variations is examined, highlighting\nvulnerabilities associated with the distribution of disturbance areas and\nspatial information, with increased susceptibility closer to the image center.\nThrough these comprehensive studies, we contribute to a deeper understanding of\nVMamba's robustness, providing valuable insights for refining and advancing the\ncapabilities of deep neural networks in computer vision applications.\n","authors":["Chengbin Du","Yanxi Li","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.10935v1.pdf","comment":"27 pages"},{"id":"http://arxiv.org/abs/2403.10931v1","updated":"2024-03-16T14:11:54Z","published":"2024-03-16T14:11:54Z","title":"Uncertainty-Aware Adapter: Adapting Segment Anything Model (SAM) for\n Ambiguous Medical Image Segmentation","summary":" The Segment Anything Model (SAM) gained significant success in natural image\nsegmentation, and many methods have tried to fine-tune it to medical image\nsegmentation. An efficient way to do so is by using Adapters, specialized\nmodules that learn just a few parameters to tailor SAM specifically for medical\nimages. However, unlike natural images, many tissues and lesions in medical\nimages have blurry boundaries and may be ambiguous. Previous efforts to adapt\nSAM ignore this challenge and can only predict distinct segmentations. It may\nmislead clinicians or cause misdiagnosis, especially when encountering rare\nvariants or situations with low model confidence. In this work, we propose a\nnovel module called the Uncertainty-aware Adapter, which efficiently\nfine-tunes SAM for uncertainty-aware medical image segmentation. Utilizing a\nconditional variational autoencoder, we encoded stochastic samples to\neffectively represent the inherent uncertainty in medical imaging. We designed\na new module on a standard adapter that utilizes a condition-based strategy to\ninteract with samples to help SAM integrate uncertainty. 
We evaluated our\nmethod on two multi-annotated datasets with different modalities: LIDC-IDRI\n(lung abnormalities segmentation) and REFUGE2 (optic-cup segmentation). The\nexperimental results show that the proposed model outperforms all the previous\nmethods and achieves the new state-of-the-art (SOTA) on both benchmarks. We\nalso demonstrated that our method can generate diverse segmentation hypotheses\nthat are more realistic as well as heterogeneous.\n","authors":["Mingzhou Jiang","Jiaying Zhou","Junde Wu","Tianyang Wang","Yueming Jin","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2403.10931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15236v2","updated":"2024-03-16T14:11:11Z","published":"2023-08-29T11:51:27Z","title":"Rotation Augmented Distillation for Exemplar-Free Class Incremental\n Learning with Detailed Analysis","summary":" Class incremental learning (CIL) aims to recognize both the old and new\nclasses along the increment tasks. Deep neural networks in CIL suffer from\ncatastrophic forgetting and some approaches rely on saving exemplars from\nprevious tasks, known as the exemplar-based setting, to alleviate this problem.\nOn the contrary, this paper focuses on the Exemplar-Free setting with no old\nclass sample preserved. Balancing the plasticity and stability in deep feature\nlearning with only supervision from new classes is more challenging. Most\nexisting Exemplar-Free CIL methods report the overall performance only and lack\nfurther analysis. In this work, different methods are examined with\ncomplementary metrics in greater detail. Moreover, we propose a simple CIL\nmethod, Rotation Augmented Distillation (RAD), which achieves one of the\ntop-tier performances under the Exemplar-Free setting. Detailed analysis shows\nour RAD benefits from the superior balance between plasticity and stability.\nFinally, more challenging exemplar-free settings with fewer initial classes are\nundertaken for further demonstrations and comparisons among the\nstate-of-the-art methods.\n","authors":["Xiuwei Chen","Xiaobin Chang"],"pdf_url":"https://arxiv.org/pdf/2308.15236v2.pdf","comment":"Accepted by PRCV2023"},{"id":"http://arxiv.org/abs/2309.00359v4","updated":"2024-03-16T14:02:45Z","published":"2023-09-01T09:34:49Z","title":"Large Content And Behavior Models To Understand, Simulate, And Optimize\n Content And Behavior","summary":" Shannon and Weaver's seminal information theory divides communication into\nthree levels: technical, semantic, and effectiveness. While the technical level\ndeals with the accurate reconstruction of transmitted symbols, the semantic and\neffectiveness levels deal with the inferred meaning and its effect on the\nreceiver. Large Language Models (LLMs), with their wide generalizability, make\nsome progress towards the second level. However, LLMs and other communication\nmodels are not conventionally designed for predicting and optimizing\ncommunication for desired receiver behaviors and intents. As a result, the\neffectiveness level remains largely untouched by modern communication systems.\nIn this paper, we introduce the receivers' \"behavior tokens,\" such as shares,\nlikes, clicks, purchases, and retweets, in the LLM's training corpora to\noptimize content for the receivers and predict their behaviors. 
Other than\nshowing similar performance to LLMs on content understanding tasks, our trained\nmodels show generalization capabilities on the behavior dimension for behavior\nsimulation, content simulation, behavior understanding, and behavior domain\nadaptation. We show results on all these capabilities using a wide range of\ntasks on three corpora. We call these models Large Content and Behavior Models\n(LCBMs). Further, to spur more research on LCBMs, we release our new Content\nBehavior Corpus (CBC), a repository containing communicator, message, and\ncorresponding receiver behavior (https://behavior-in-the-wild.github.io/LCBM).\n","authors":["Ashmit Khandelwal","Aditya Agrawal","Aanisha Bhattacharyya","Yaman K Singla","Somesh Singh","Uttaran Bhattacharya","Ishita Dasgupta","Stefano Petrangeli","Rajiv Ratn Shah","Changyou Chen","Balaji Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2309.00359v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02069v2","updated":"2024-03-16T13:52:03Z","published":"2024-03-04T14:17:30Z","title":"HyperPredict: Estimating Hyperparameter Effects for Instance-Specific\n Regularization in Deformable Image Registration","summary":" Methods for medical image registration infer geometric transformations that\nalign pairs/groups of images by maximising an image similarity metric. This\nproblem is ill-posed as several solutions may have equivalent likelihoods, also\noptimising purely for image similarity can yield implausible transformations.\nFor these reasons regularization terms are essential to obtain meaningful\nregistration results. However, this requires the introduction of at least one\nhyperparameter often termed $\\lambda$, that serves as a tradeoff between loss\nterms. In some situations, the quality of the estimated transformation greatly\ndepends on hyperparameter choice, and different choices may be required\ndepending on the characteristics of the data. Analyzing the effect of these\nhyperparameters requires labelled data, which is not commonly available at\ntest-time. In this paper, we propose a method for evaluating the influence of\nhyperparameters and subsequently selecting an optimal value for given image\npairs. Our approach which we call HyperPredict, implements a Multi-Layer\nPerceptron that learns the effect of selecting particular hyperparameters for\nregistering an image pair by predicting the resulting segmentation overlap and\nmeasure of deformation smoothness. This approach enables us to select optimal\nhyperparameters at test time without requiring labelled data, removing the need\nfor a one-size-fits-all cross-validation approach. Furthermore, the criteria\nused to define optimal hyperparameter is flexible post-training, allowing us to\nefficiently choose specific properties. We evaluate our proposed method on the\nOASIS brain MR dataset using a recent deep learning approach(cLapIRN) and an\nalgorithmic method(Niftyreg). Our results demonstrate good performance in\npredicting the effects of regularization hyperparameters and highlight the\nbenefits of our image-pair specific approach to hyperparameter selection.\n","authors":["Aisha L. Shuaibu","Ivor J. A. 
Simpson"],"pdf_url":"https://arxiv.org/pdf/2403.02069v2.pdf","comment":"Accepted for publication at the Journal of Machine Learning for\n Biomedical Imaging (MELBA) https://melba-journal.org/2024:005"},{"id":"http://arxiv.org/abs/2403.10925v1","updated":"2024-03-16T13:44:42Z","published":"2024-03-16T13:44:42Z","title":"Learning Dual-Level Deformable Implicit Representation for Real-World\n Scale Arbitrary Super-Resolution","summary":" Scale arbitrary super-resolution based on implicit image function gains\nincreasing popularity since it can better represent the visual world in a\ncontinuous manner. However, existing scale arbitrary works are trained and\nevaluated on simulated datasets, where low-resolution images are generated from\ntheir ground truths by the simplest bicubic downsampling. These models exhibit\nlimited generalization to real-world scenarios due to the greater complexity of\nreal-world degradations. To address this issue, we build a RealArbiSR dataset,\na new real-world super-resolution benchmark with both integer and non-integer\nscaling factors for the training and evaluation of real-world scale arbitrary\nsuper-resolution. Moreover, we propose a Dual-level Deformable Implicit\nRepresentation (DDIR) to solve real-world scale arbitrary super-resolution.\nSpecifically, we design the appearance embedding and deformation field to\nhandle both image-level and pixel-level deformations caused by real-world\ndegradations. The appearance embedding models the characteristics of\nlow-resolution inputs to deal with photometric variations at different scales,\nand the pixel-based deformation field learns RGB differences which result from\nthe deviations between the real-world and simulated degradations at arbitrary\ncoordinates. Extensive experiments show our trained model achieves\nstate-of-the-art performance on the RealArbiSR and RealSR benchmarks for\nreal-world scale arbitrary super-resolution. Our dataset as well as source code\nwill be publicly available.\n","authors":["Zhiheng Li","Muheng Li","Jixuan Fan","Lei Chen","Yansong Tang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2403.10925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18152v3","updated":"2024-03-16T13:23:58Z","published":"2024-02-28T08:32:19Z","title":"Boosting Neural Representations for Videos with a Conditional Decoder","summary":" Implicit neural representations (INRs) have emerged as a promising approach\nfor video storage and processing, showing remarkable versatility across various\nvideo tasks. However, existing methods often fail to fully leverage their\nrepresentation capabilities, primarily due to inadequate alignment of\nintermediate features during target frame decoding. This paper introduces a\nuniversal boosting framework for current implicit video representation\napproaches. Specifically, we utilize a conditional decoder with a\ntemporal-aware affine transform module, which uses the frame index as a prior\ncondition to effectively align intermediate features with target frames.\nBesides, we introduce a sinusoidal NeRV-like block to generate diverse\nintermediate features and achieve a more balanced parameter distribution,\nthereby enhancing the model's capacity. 
With a high-frequency\ninformation-preserving reconstruction loss, our approach successfully boosts\nmultiple baseline INRs in the reconstruction quality and convergence speed for\nvideo regression, and exhibits superior inpainting and interpolation results.\nFurther, we integrate a consistent entropy minimization technique and develop\nvideo codecs based on these boosted INRs. Experiments on the UVG dataset\nconfirm that our enhanced codecs significantly outperform baseline INRs and\noffer competitive rate-distortion performance compared to traditional and\nlearning-based codecs. Code is available at\nhttps://github.com/Xinjie-Q/Boosting-NeRV.\n","authors":["Xinjie Zhang","Ren Yang","Dailan He","Xingtong Ge","Tongda Xu","Yan Wang","Hongwei Qin","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.18152v3.pdf","comment":"Accept by CVPR 2024"},{"id":"http://arxiv.org/abs/2302.01622v5","updated":"2024-03-16T12:52:18Z","published":"2023-02-03T09:49:13Z","title":"Private, fair and accurate: Training large-scale, privacy-preserving AI\n models in medical imaging","summary":" Artificial intelligence (AI) models are increasingly used in the medical\ndomain. However, as medical data is highly sensitive, special precautions to\nensure its protection are required. The gold standard for privacy preservation\nis the introduction of differential privacy (DP) to model training. Prior work\nindicates that DP has negative implications on model accuracy and fairness,\nwhich are unacceptable in medicine and represent a main barrier to the\nwidespread use of privacy-preserving techniques. In this work, we evaluated the\neffect of privacy-preserving training of AI models regarding accuracy and\nfairness compared to non-private training. For this, we used two datasets: (1)\nA large dataset (N=193,311) of high quality clinical chest radiographs, and (2)\na dataset (N=1,625) of 3D abdominal computed tomography (CT) images, with the\ntask of classifying the presence of pancreatic ductal adenocarcinoma (PDAC).\nBoth were retrospectively collected and manually labeled by experienced\nradiologists. We then compared non-private deep convolutional neural networks\n(CNNs) and privacy-preserving (DP) models with respect to privacy-utility\ntrade-offs measured as area under the receiver-operator-characteristic curve\n(AUROC), and privacy-fairness trade-offs, measured as Pearson's r or\nStatistical Parity Difference. We found that, while the privacy-preserving\ntrainings yielded lower accuracy, they did largely not amplify discrimination\nagainst age, sex or co-morbidity. Our study shows that -- under the challenging\nrealistic circumstances of a real-life clinical dataset -- the\nprivacy-preserving training of diagnostic deep learning models is possible with\nexcellent diagnostic accuracy and fairness.\n","authors":["Soroosh Tayebi Arasteh","Alexander Ziller","Christiane Kuhl","Marcus Makowski","Sven Nebelung","Rickmer Braren","Daniel Rueckert","Daniel Truhn","Georgios Kaissis"],"pdf_url":"https://arxiv.org/pdf/2302.01622v5.pdf","comment":"Published in Communications Medicine. Nature Portfolio"},{"id":"http://arxiv.org/abs/2401.06637v5","updated":"2024-03-16T12:45:42Z","published":"2024-01-12T15:29:21Z","title":"Adversarial Examples are Misaligned in Diffusion Model Manifolds","summary":" In recent years, diffusion models (DMs) have drawn significant attention for\ntheir success in approximating data distributions, yielding state-of-the-art\ngenerative results. 
Nevertheless, the versatility of these models extends\nbeyond their generative capabilities to encompass various vision applications,\nsuch as image inpainting, segmentation, adversarial robustness, among others.\nThis study is dedicated to the investigation of adversarial attacks through the\nlens of diffusion models. However, our objective does not involve enhancing the\nadversarial robustness of image classifiers. Instead, our focus lies in\nutilizing the diffusion model to detect and analyze the anomalies introduced by\nthese attacks on images. To that end, we systematically examine the alignment\nof the distributions of adversarial examples when subjected to the process of\ntransformation using diffusion models. The efficacy of this approach is\nassessed across CIFAR-10 and ImageNet datasets, including varying image sizes\nin the latter. The results demonstrate a notable capacity to discriminate\neffectively between benign and attacked images, providing compelling evidence\nthat adversarial instances do not align with the learned manifold of the DMs.\n","authors":["Peter Lorenz","Ricard Durall","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2401.06637v5.pdf","comment":"accepted at IJCNN"},{"id":"http://arxiv.org/abs/2403.10916v1","updated":"2024-03-16T12:44:08Z","published":"2024-03-16T12:44:08Z","title":"FishNet: Deep Neural Networks for Low-Cost Fish Stock Estimation","summary":" Fish stock assessment often involves manual fish counting by taxonomy\nspecialists, which is both time-consuming and costly. We propose an automated\ncomputer vision system that performs both taxonomic classification and fish\nsize estimation from images taken with a low-cost digital camera. The system\nfirst performs object detection and segmentation using a Mask R-CNN to identify\nindividual fish from images containing multiple fish, possibly consisting of\ndifferent species. Then the species of each fish is classified and its\nlength is predicted using separate machine learning models. These models are trained on a\ndataset of 50,000 hand-annotated images containing 163 different fish species,\nranging in length from 10cm to 250cm. Evaluated on held-out test data, our\nsystem achieves a $92\\%$ intersection over union on the fish segmentation task,\nan $89\\%$ top-1 classification accuracy on single fish species classification,\nand a $2.3$~cm mean error on the fish length estimation task.\n","authors":["Moseli Mots'oehli","Anton Nikolaev","Wawan B. IGede","John Lynham","Peter J. Mous","Peter Sadowski"],"pdf_url":"https://arxiv.org/pdf/2403.10916v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2403.10912v1","updated":"2024-03-16T12:25:30Z","published":"2024-03-16T12:25:30Z","title":"Automatic location detection based on deep learning","summary":" The proliferation of digital images and the advancements in deep learning\nhave paved the way for innovative solutions in various domains, especially in\nthe field of image classification. Our project presents an in-depth study and\nimplementation of an image classification system specifically tailored to\nidentify and classify images of Indian cities. Drawing from an extensive\ndataset, our model classifies images into five major Indian cities: Ahmedabad,\nDelhi, Kerala, Kolkata, and Mumbai to recognize the distinct features and\ncharacteristics of each city/state. To achieve high precision and recall rates,\nwe adopted two approaches. 
The first, a vanilla Convolutional Neural Network\n(CNN) and then we explored the power of transfer learning by leveraging the\nVGG16 model. The vanilla CNN achieved commendable accuracy and the VGG16 model\nachieved a test accuracy of 63.6%. Evaluations highlighted the strengths and\npotential areas of improvement, positioning our model as not only competitive\nbut also scalable for broader applications. With an emphasis on open-source\nethos, our work aims to contribute to the community, encouraging further\ndevelopment and diverse applications. Our findings demonstrate the potential\napplications in tourism, urban planning, and even real-time location\nidentification systems, among others.\n","authors":["Anjali Karangiya","Anirudh Sharma","Divax Shah","Kartavya Badgujar","Dr. Chintan Thacker","Dainik Dave"],"pdf_url":"https://arxiv.org/pdf/2403.10912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10911v1","updated":"2024-03-16T12:18:20Z","published":"2024-03-16T12:18:20Z","title":"Efficient Diffusion-Driven Corruption Editor for Test-Time Adaptation","summary":" Test-time adaptation (TTA) addresses the unforeseen distribution shifts\noccurring during test time. In TTA, both performance and, memory and time\nconsumption serve as crucial considerations. A recent diffusion-based TTA\napproach for restoring corrupted images involves image-level updates. However,\nusing pixel space diffusion significantly increases resource requirements\ncompared to conventional model updating TTA approaches, revealing limitations\nas a TTA method. To address this, we propose a novel TTA method by leveraging a\nlatent diffusion model (LDM) based image editing model and fine-tuning it with\nour newly introduced corruption modeling scheme. This scheme enhances the\nrobustness of the diffusion model against distribution shifts by creating\n(clean, corrupted) image pairs and fine-tuning the model to edit corrupted\nimages into clean ones. Moreover, we introduce a distilled variant to\naccelerate the model for corruption editing using only 4 network function\nevaluations (NFEs). We extensively validated our method across various\narchitectures and datasets including image and video domains. Our model\nachieves the best performance with a 100 times faster runtime than that of a\ndiffusion-based baseline. Furthermore, it outpaces the speed of the model\nupdating TTA method based on data augmentation threefold, rendering an\nimage-level updating approach more practical.\n","authors":["Yeongtak Oh","Jonghyun Lee","Jooyoung Choi","Dahuin Jung","Uiwon Hwang","Sungroh Yoon"],"pdf_url":"https://arxiv.org/pdf/2403.10911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06529v2","updated":"2024-03-16T12:18:02Z","published":"2024-03-11T09:12:24Z","title":"Confidence-Aware RGB-D Face Recognition via Virtual Depth Synthesis","summary":" 2D face recognition encounters challenges in unconstrained environments due\nto varying illumination, occlusion, and pose. Recent studies focus on RGB-D\nface recognition to improve robustness by incorporating depth information.\nHowever, collecting sufficient paired RGB-D training data is expensive and\ntime-consuming, hindering wide deployment. In this work, we first construct a\ndiverse depth dataset generated by 3D Morphable Models for depth model\npre-training. 
Then, we propose a domain-independent pre-training framework that\nutilizes readily available pre-trained RGB and depth models to separately\nperform face recognition without needing additional paired data for retraining.\nTo seamlessly integrate the two distinct networks and harness the complementary\nbenefits of RGB and depth information for improved accuracy, we propose an\ninnovative Adaptive Confidence Weighting (ACW). This mechanism is designed to\nlearn confidence estimates for each modality to achieve modality fusion at the\nscore level. Our method is simple and lightweight, only requiring ACW training\nbeyond the backbone models. Experiments on multiple public RGB-D face\nrecognition benchmarks demonstrate state-of-the-art performance surpassing\nprevious methods based on depth estimation and feature fusion, validating the\nefficacy of our approach.\n","authors":["Zijian Chen","Mei Wang","Weihong Deng","Hongzhi Shi","Dongchao Wen","Yingjie Zhang","Xingchen Cui","Jian Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.06529v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.10906v1","updated":"2024-03-16T11:44:55Z","published":"2024-03-16T11:44:55Z","title":"HourglassNeRF: Casting an Hourglass as a Bundle of Rays for Few-shot\n Neural Rendering","summary":" Recent advancements in the Neural Radiance Field (NeRF) have bolstered its\ncapabilities for novel view synthesis, yet its reliance on dense multi-view\ntraining images poses a practical challenge. Addressing this, we propose\nHourglassNeRF, an effective regularization-based approach with a novel\nhourglass casting strategy. Our proposed hourglass is conceptualized as a\nbundle of additional rays within the area between the original input ray and\nits corresponding reflection ray, by featurizing the conical frustum via\nIntegrated Positional Encoding (IPE). This design expands the coverage of\nunseen views and enables an adaptive high-frequency regularization based on\ntarget pixel photo-consistency. Furthermore, we propose luminance consistency\nregularization based on the Lambertian assumption, which is known to be\neffective for training a set of augmented rays under the few-shot setting.\nLeveraging the inherent property of a Lambertian surface, which retains\nconsistent luminance irrespective of the viewing angle, we assume our proposed\nhourglass as a collection of flipped diffuse reflection rays and enhance the\nluminance consistency between the original input ray and its corresponding\nhourglass, resulting in more physically grounded training framework and\nperformance improvement. Our HourglassNeRF outperforms its baseline and\nachieves competitive results on multiple benchmarks with sharply rendered fine\ndetails. The code will be available.\n","authors":["Seunghyeon Seo","Yeonjin Chang","Jayeon Yoo","Seungwoo Lee","Hojun Lee","Nojun Kwak"],"pdf_url":"https://arxiv.org/pdf/2403.10906v1.pdf","comment":"21 pages, 11 figures"},{"id":"http://arxiv.org/abs/2403.10904v1","updated":"2024-03-16T11:38:58Z","published":"2024-03-16T11:38:58Z","title":"Urban Sound Propagation: a Benchmark for 1-Step Generative Modeling of\n Complex Physical Systems","summary":" Data-driven modeling of complex physical systems is receiving a growing\namount of attention in the simulation and machine learning communities. 
Since\nmost physical simulations are based on compute-intensive, iterative\nimplementations of differential equation systems, a (partial) replacement with\nlearned, 1-step inference models has the potential for significant speedups in\na wide range of application areas. In this context, we present a novel\nbenchmark for the evaluation of 1-step generative learning models in terms of\nspeed and physical correctness. Our Urban Sound Propagation benchmark is based\non the physically complex and practically relevant, yet intuitively easy to\ngrasp task of modeling the 2d propagation of waves from a sound source in an\nurban environment. We provide a dataset with 100k samples, where each sample\nconsists of pairs of real 2d building maps drawn from OpenStreetmap, a\nparameterized sound source, and a simulated ground truth sound propagation for\nthe given scene. The dataset provides four different simulation tasks with\nincreasing complexity regarding reflection, diffraction and source variance. A\nfirst baseline evaluation of common generative U-Net, GAN and Diffusion models\nshows, that while these models are very well capable of modeling sound\npropagations in simple cases, the approximation of sub-systems represented by\nhigher order equations systematically fails. Information about the dataset,\ndownload instructions and source codes are provided on our anonymous website:\nhttps://www.urban-sound-data.org.\n","authors":["Martin Spitznagel","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2403.10904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10897v1","updated":"2024-03-16T11:21:24Z","published":"2024-03-16T11:21:24Z","title":"Rethinking Multi-view Representation Learning via Distilled\n Disentangling","summary":" Multi-view representation learning aims to derive robust representations that\nare both view-consistent and view-specific from diverse data sources. This\npaper presents an in-depth analysis of existing approaches in this domain,\nhighlighting a commonly overlooked aspect: the redundancy between\nview-consistent and view-specific representations. To this end, we propose an\ninnovative framework for multi-view representation learning, which incorporates\na technique we term 'distilled disentangling'. Our method introduces the\nconcept of masked cross-view prediction, enabling the extraction of compact,\nhigh-quality view-consistent representations from various sources without\nincurring extra computational overhead. Additionally, we develop a distilled\ndisentangling module that efficiently filters out consistency-related\ninformation from multi-view representations, resulting in purer view-specific\nrepresentations. This approach significantly reduces redundancy between\nview-consistent and view-specific representations, enhancing the overall\nefficiency of the learning process. Our empirical evaluations reveal that\nhigher mask ratios substantially improve the quality of view-consistent\nrepresentations. 
Moreover, we find that reducing the dimensionality of\nview-consistent representations relative to that of view-specific\nrepresentations further refines the quality of the combined representations.\nOur code is accessible at: https://github.com/Guanzhou-Ke/MRDD.\n","authors":["Guanzhou Ke","Bo Wang","Xiaoli Wang","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2403.10897v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.10887v1","updated":"2024-03-16T10:46:14Z","published":"2024-03-16T10:46:14Z","title":"LuoJiaHOG: A Hierarchy Oriented Geo-aware Image Caption Dataset for\n Remote Sensing Image-Text Retrival","summary":" Image-text retrieval (ITR) plays a significant role in making informed\ndecisions for various remote sensing (RS) applications. Nonetheless, creating\nITR datasets containing vision and language modalities not only requires\nsignificant geo-spatial sampling area but also varing categories and detailed\ndescriptions. To this end, we introduce an image caption dataset LuojiaHOG,\nwhich is geospatial-aware, label-extension-friendly and\ncomprehensive-captioned. LuojiaHOG involves the hierarchical spatial sampling,\nextensible classification system to Open Geospatial Consortium (OGC) standards,\nand detailed caption generation. In addition, we propose a CLIP-based Image\nSemantic Enhancement Network (CISEN) to promote sophisticated ITR. CISEN\nconsists of two components, namely dual-path knowledge transfer and progressive\ncross-modal feature fusion. Comprehensive statistics on LuojiaHOG reveal the\nrichness in sampling diversity, labels quantity and descriptions granularity.\nThe evaluation on LuojiaHOG is conducted across various state-of-the-art ITR\nmodels, including ALBEF, ALIGN, CLIP, FILIP, Wukong, GeoRSCLIP and CISEN. We\nuse second- and third-level labels to evaluate these vision-language models\nthrough adapter-tuning and CISEN demonstrates superior performance. For\ninstance, it achieves the highest scores with WMAP@5 of 88.47\\% and 87.28\\% on\nthird-level ITR tasks, respectively. In particular, CISEN exhibits an\nimprovement of approximately 1.3\\% and 0.9\\% in terms of WMAP@5 compared to its\nbaseline. These findings highlight CISEN advancements accurately retrieving\npertinent information across image and text. LuojiaHOG and CISEN can serve as a\nfoundational resource for future RS image-text alignment research, facilitating\na wide range of vision-language applications.\n","authors":["Yuanxin Zhao","Mi Zhang","Bingnan Yang","Zhan Zhang","Jiaju Kang","Jianya Gong"],"pdf_url":"https://arxiv.org/pdf/2403.10887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10885v1","updated":"2024-03-16T10:43:12Z","published":"2024-03-16T10:43:12Z","title":"Could We Generate Cytology Images from Histopathology Images? An\n Empirical Study","summary":" Automation in medical imaging is quite challenging due to the unavailability\nof annotated datasets and the scarcity of domain experts. In recent years, deep\nlearning techniques have solved some complex medical imaging tasks like disease\nclassification, important object localization, segmentation, etc. However, most\nof the task requires a large amount of annotated data for their successful\nimplementation. To mitigate the shortage of data, different generative models\nare proposed for data augmentation purposes which can boost the classification\nperformances. For this, different synthetic medical image data generation\nmodels are developed to increase the dataset. 
Unpaired image-to-image\ntranslation models here shift the source domain to the target domain. In the\nbreast malignancy identification domain, FNAC is one of the low-cost\nlow-invasive modalities normally used by medical practitioners. But\navailability of public datasets in this domain is very poor. Whereas, for\nautomation of cytology images, we need a large amount of annotated data.\nTherefore synthetic cytology images are generated by translating breast\nhistopathology samples which are publicly available. In this study, we have\nexplored traditional image-to-image transfer models like CycleGAN, and Neural\nStyle Transfer. Further, it is observed that the generated cytology images are\nquite similar to real breast cytology samples by measuring FID and KID scores.\n","authors":["Soumyajyoti Dey","Sukanta Chakraborty","Utso Guha Roy","Nibaran Das"],"pdf_url":"https://arxiv.org/pdf/2403.10885v1.pdf","comment":"Accept at International Conference on Advanced Computing and\n Applications(ICACA-2024)"},{"id":"http://arxiv.org/abs/2310.11784v2","updated":"2024-03-16T10:33:40Z","published":"2023-10-18T08:23:14Z","title":"Progressive3D: Progressively Local Editing for Text-to-3D Content\n Creation with Complex Semantic Prompts","summary":" Recent text-to-3D generation methods achieve impressive 3D content creation\ncapacity thanks to the advances in image diffusion models and optimizing\nstrategies. However, current methods struggle to generate correct 3D content\nfor a complex prompt in semantics, i.e., a prompt describing multiple\ninteracted objects binding with different attributes. In this work, we propose\na general framework named Progressive3D, which decomposes the entire generation\ninto a series of locally progressive editing steps to create precise 3D content\nfor complex prompts, and we constrain the content change to only occur in\nregions determined by user-defined region prompts in each editing step.\nFurthermore, we propose an overlapped semantic component suppression technique\nto encourage the optimization process to focus more on the semantic differences\nbetween prompts. Extensive experiments demonstrate that the proposed\nProgressive3D framework generates precise 3D content for prompts with complex\nsemantics and is general for various text-to-3D methods driven by different 3D\nrepresentations.\n","authors":["Xinhua Cheng","Tianyu Yang","Jianan Wang","Yu Li","Lei Zhang","Jian Zhang","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2310.11784v2.pdf","comment":"Accept by ICLR2024. Project Page:\n https://cxh0519.github.io/projects/Progressive3D/"},{"id":"http://arxiv.org/abs/2403.10884v1","updated":"2024-03-16T10:33:02Z","published":"2024-03-16T10:33:02Z","title":"Fuzzy Rank-based Late Fusion Technique for Cytology image Segmentation","summary":" Cytology image segmentation is quite challenging due to its complex cellular\nstructure and multiple overlapping regions. On the other hand, for supervised\nmachine learning techniques, we need a large amount of annotated data, which is\ncostly. In recent years, late fusion techniques have given some promising\nperformances in the field of image classification. In this paper, we have\nexplored a fuzzy-based late fusion techniques for cytology image segmentation.\nThis fusion rule integrates three traditional semantic segmentation models\nUNet, SegNet, and PSPNet. The technique is applied on two cytology image\ndatasets, i.e., cervical cytology(HErlev) and breast cytology(JUCYT-v1) image\ndatasets. 
We have achieved maximum MeanIoU score 84.27% and 83.79% on the\nHErlev dataset and JUCYT-v1 dataset after the proposed late fusion technique,\nrespectively which are better than that of the traditional fusion rules such as\naverage probability, geometric mean, Borda Count, etc. The codes of the\nproposed model are available on GitHub.\n","authors":["Soumyajyoti Dey","Sukanta Chakraborty","Utso Guha Roy","Nibaran Das"],"pdf_url":"https://arxiv.org/pdf/2403.10884v1.pdf","comment":"Accept at International Conference on Data, Electronics and Computing\n (ICDEC-2023)"},{"id":"http://arxiv.org/abs/2403.10883v1","updated":"2024-03-16T10:32:24Z","published":"2024-03-16T10:32:24Z","title":"Improving Adversarial Transferability of Visual-Language Pre-training\n Models through Collaborative Multimodal Interaction","summary":" Despite the substantial advancements in Vision-Language Pre-training (VLP)\nmodels, their susceptibility to adversarial attacks poses a significant\nchallenge. Existing work rarely studies the transferability of attacks on VLP\nmodels, resulting in a substantial performance gap from white-box attacks. We\nobserve that prior work overlooks the interaction mechanisms between\nmodalities, which plays a crucial role in understanding the intricacies of VLP\nmodels. In response, we propose a novel attack, called Collaborative Multimodal\nInteraction Attack (CMI-Attack), leveraging modality interaction through\nembedding guidance and interaction enhancement. Specifically, attacking text at\nthe embedding level while preserving semantics, as well as utilizing\ninteraction image gradients to enhance constraints on perturbations of texts\nand images. Significantly, in the image-text retrieval task on Flickr30K\ndataset, CMI-Attack raises the transfer success rates from ALBEF to TCL,\n$\\text{CLIP}_{\\text{ViT}}$ and $\\text{CLIP}_{\\text{CNN}}$ by 8.11%-16.75% over\nstate-of-the-art methods. Moreover, CMI-Attack also demonstrates superior\nperformance in cross-task generalization scenarios. Our work addresses the\nunderexplored realm of transfer attacks on VLP models, shedding light on the\nimportance of modality interaction for enhanced adversarial robustness.\n","authors":["Jiyuan Fu","Zhaoyu Chen","Kaixun Jiang","Haijing Guo","Jiafeng Wang","Shuyong Gao","Wenqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.10883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10881v1","updated":"2024-03-16T10:25:49Z","published":"2024-03-16T10:25:49Z","title":"Regularizing CNNs using Confusion Penalty Based Label Smoothing for\n Histopathology Images","summary":" Deep Learning, particularly Convolutional Neural Networks (CNN), has been\nsuccessful in computer vision tasks and medical image analysis. However, modern\nCNNs can be overconfident, making them difficult to deploy in real-world\nscenarios. Researchers propose regularizing techniques, such as Label Smoothing\n(LS), which introduces soft labels for training data, making the classifier\nmore regularized. LS captures disagreements or lack of confidence in the\ntraining phase, making the classifier more regularized. Although LS is quite\nsimple and effective, traditional LS techniques utilize a weighted average\nbetween target distribution and a uniform distribution across the classes,\nwhich limits the objective of LS as well as the performance. This paper\nintroduces a novel LS technique based on the confusion penalty, which treats\nmodel confusion for each class with more importance than others. 
We have\nperformed extensive experiments with well-known CNN architectures with this\ntechnique on publicly available Colorectal Histology datasets and got\nsatisfactory results. Also, we have compared our findings with the\nState-of-the-art and shown our method's efficacy with Reliability diagrams and\nt-distributed Stochastic Neighbor Embedding (t-SNE) plots of feature space.\n","authors":["Somenath Kuiry","Alaka Das","Mita Nasipuri","Nibaran Das"],"pdf_url":"https://arxiv.org/pdf/2403.10881v1.pdf","comment":"Accepted at CICBA 2024 : 6th International Conference on\n Computational Intelligence in Communications, and Business Analytics"},{"id":"http://arxiv.org/abs/2403.10880v1","updated":"2024-03-16T10:25:07Z","published":"2024-03-16T10:25:07Z","title":"COVID-CT-H-UNet: a novel COVID-19 CT segmentation network based on\n attention mechanism and Bi-category Hybrid loss","summary":" Since 2019, the global COVID-19 outbreak has emerged as a crucial focus in\nhealthcare research. Although RT-PCR stands as the primary method for COVID-19\ndetection, its extended detection time poses a significant challenge.\nConsequently, supplementing RT-PCR with the pathological study of COVID-19\nthrough CT imaging has become imperative. The current segmentation approach\nbased on TVLoss enhances the connectivity of afflicted areas. Nevertheless, it\ntends to misclassify normal pixels between certain adjacent diseased regions as\ndiseased pixels. The typical Binary cross entropy(BCE) based U-shaped network\nonly concentrates on the entire CT images without emphasizing on the affected\nregions, which results in hazy borders and low contrast in the projected\noutput. In addition, the fraction of infected pixels in CT images is much less,\nwhich makes it a challenge for segmentation models to make accurate\npredictions. In this paper, we propose COVID-CT-H-UNet, a COVID-19 CT\nsegmentation network to solve these problems. To recognize the unaffected\npixels between neighbouring diseased regions, extra visual layer information is\ncaptured by combining the attention module on the skip connections with the\nproposed composite function Bi-category Hybrid Loss. The issue of hazy\nboundaries and poor contrast brought on by the BCE Loss in conventional\ntechniques is resolved by utilizing the composite function Bi-category Hybrid\nLoss that concentrates on the pixels in the diseased area. The experiment shows\nwhen compared to the previous COVID-19 segmentation networks, the proposed\nCOVID-CT-H-UNet's segmentation impact has greatly improved, and it may be used\nto identify and study clinical COVID-19.\n","authors":["Anay Panja","Somenath Kuiry","Alaka Das","Mita Nasipuri","Nibaran Das"],"pdf_url":"https://arxiv.org/pdf/2403.10880v1.pdf","comment":"Accepted at CICBA 2024 : 6th International Conference on\n Computational Intelligence in Communications, and Business Analytics"},{"id":"http://arxiv.org/abs/2403.10860v1","updated":"2024-03-16T08:57:00Z","published":"2024-03-16T08:57:00Z","title":"Efficient Domain Adaptation for Endoscopic Visual Odometry","summary":" Visual odometry plays a crucial role in endoscopic imaging, yet the scarcity\nof realistic images with ground truth poses poses a significant challenge.\nTherefore, domain adaptation offers a promising approach to bridge the\npre-operative planning domain with the intra-operative real domain for learning\nodometry information. However, existing methodologies suffer from\ninefficiencies in the training time. 
In this work, an efficient neural style\ntransfer framework for endoscopic visual odometry is proposed, which compresses\nthe time from pre-operative planning to testing phase to less than five\nminutes. For efficient traing, this work focuses on training modules with only\na limited number of real images and we exploit pre-operative prior information\nto dramatically reduce training duration. Moreover, during the testing phase,\nwe propose a novel Test Time Adaptation (TTA) method to mitigate the gap in\nlighting conditions between training and testing datasets. Experimental\nevaluations conducted on two public endoscope datasets showcase that our method\nachieves state-of-the-art accuracy in visual odometry tasks while boasting the\nfastest training speeds. These results demonstrate significant promise for\nintra-operative surgery applications.\n","authors":["Junyang Wu","Yun Gu","Guang-Zhong Yang"],"pdf_url":"https://arxiv.org/pdf/2403.10860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10858v1","updated":"2024-03-16T08:50:47Z","published":"2024-03-16T08:50:47Z","title":"RetMIL: Retentive Multiple Instance Learning for Histopathological Whole\n Slide Image Classification","summary":" Histopathological whole slide image (WSI) analysis with deep learning has\nbecome a research focus in computational pathology. The current paradigm is\nmainly based on multiple instance learning (MIL), in which approaches with\nTransformer as the backbone are well discussed. These methods convert WSI tasks\ninto sequence tasks by representing patches as tokens in the WSI sequence.\nHowever, the feature complexity brought by high heterogeneity and the\nultra-long sequences brought by gigapixel size makes Transformer-based MIL\nsuffer from the challenges of high memory consumption, slow inference speed,\nand lack of performance. To this end, we propose a retentive MIL method called\nRetMIL, which processes WSI sequences through hierarchical feature propagation\nstructure. At the local level, the WSI sequence is divided into multiple\nsubsequences. Tokens of each subsequence are updated through a parallel linear\nretention mechanism and aggregated utilizing an attention layer. At the global\nlevel, subsequences are fused into a global sequence, then updated through a\nserial retention mechanism, and finally the slide-level representation is\nobtained through a global attention pooling. We conduct experiments on two\npublic CAMELYON and BRACS datasets and an public-internal LUNG dataset,\nconfirming that RetMIL not only achieves state-of-the-art performance but also\nsignificantly reduces computational overhead. Our code will be accessed\nshortly.\n","authors":["Hongbo Chu","Qiehe Sun","Jiawen Li","Yuxuan Chen","Lizhong Zhang","Tian Guan","Anjia Han","Yonghong He"],"pdf_url":"https://arxiv.org/pdf/2403.10858v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2308.03276v3","updated":"2024-03-16T08:39:17Z","published":"2023-08-07T03:35:47Z","title":"Spatialyze: A Geospatial Video Analytics System with Spatial-Aware\n Optimizations","summary":" Videos that are shot using commodity hardware such as phones and surveillance\ncameras record various metadata such as time and location. We encounter such\ngeospatial videos on a daily basis and such videos have been growing in volume\nsignificantly. Yet, we do not have data management systems that allow users to\ninteract with such data effectively.\n In this paper, we describe Spatialyze, a new framework for end-to-end\nquerying of geospatial videos. 
Spatialyze comes with a domain-specific language\nwhere users can construct geospatial video analytic workflows using a 3-step,\ndeclarative, build-filter-observe paradigm. Internally, Spatialyze leverages\nthe declarative nature of such workflows, the temporal-spatial metadata stored\nwith videos, and physical behavior of real-world objects to optimize the\nexecution of workflows. Our results using real-world videos and workflows show\nthat Spatialyze can reduce execution time by up to 5.3x, while maintaining up\nto 97.1% accuracy compared to unoptimized execution.\n","authors":["Chanwut Kittivorawong","Yongming Ge","Yousef Helal","Alvin Cheung"],"pdf_url":"https://arxiv.org/pdf/2308.03276v3.pdf","comment":"GitHub Repository: https://github.com/apperception-db/spatialyze"},{"id":"http://arxiv.org/abs/2403.10854v1","updated":"2024-03-16T08:30:45Z","published":"2024-03-16T08:30:45Z","title":"A Comprehensive Study of Multimodal Large Language Models for Image\n Quality Assessment","summary":" While Multimodal Large Language Models (MLLMs) have experienced significant\nadvancement on visual understanding and reasoning, their potentials to serve as\npowerful, flexible, interpretable, and text-driven models for Image Quality\nAssessment (IQA) remains largely unexplored. In this paper, we conduct a\ncomprehensive and systematic study of prompting MLLMs for IQA. Specifically, we\nfirst investigate nine prompting systems for MLLMs as the combinations of three\nstandardized testing procedures in psychophysics (i.e., the single-stimulus,\ndouble-stimulus, and multiple-stimulus methods) and three popular prompting\nstrategies in natural language processing (i.e., the standard, in-context, and\nchain-of-thought prompting). We then present a difficult sample selection\nprocedure, taking into account sample diversity and uncertainty, to further\nchallenge MLLMs equipped with the respective optimal prompting systems. We\nassess three open-source and one close-source MLLMs on several visual\nattributes of image quality (e.g., structural and textural distortions, color\ndifferences, and geometric transformations) in both full-reference and\nno-reference scenarios. Experimental results show that only the close-source\nGPT-4V provides a reasonable account for human perception of image quality, but\nis weak at discriminating fine-grained quality variations (e.g., color\ndifferences) and at comparing visual quality of multiple images, tasks humans\ncan perform effortlessly.\n","authors":["Tianhe Wu","Kede Ma","Jie Liang","Yujiu Yang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.10854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10853v1","updated":"2024-03-16T08:28:42Z","published":"2024-03-16T08:28:42Z","title":"Just Say the Name: Online Continual Learning with Category Names Only\n via Data Generation","summary":" In real-world scenarios, extensive manual annotation for continual learning\nis impractical due to prohibitive costs. Although prior arts, influenced by\nlarge-scale webly supervised training, suggest leveraging web-scraped data in\ncontinual learning, this poses challenges such as data imbalance, usage\nrestrictions, and privacy concerns. Addressing the risks of continual webly\nsupervised training, we present an online continual learning framework -\nGenerative Name only Continual Learning (G-NoCL). The proposed G-NoCL uses a\nset of generators G along with the learner. 
When encountering new concepts\n(i.e., classes), G-NoCL employs the novel sample complexity-guided data\nensembling technique DIverSity and COmplexity enhancing ensemBlER (DISCOBER) to\noptimally sample training data from generated data. Through extensive\nexperimentation, we demonstrate superior performance of DISCOBER in G-NoCL\nonline CL benchmarks, covering both In-Distribution (ID) and\nOut-of-Distribution (OOD) generalization evaluations, compared to naive\ngenerator-ensembling, web-supervised, and manually annotated data.\n","authors":["Minhyuk Seo","Diganta Misra","Seongwon Cho","Minjae Lee","Jonghyun Choi"],"pdf_url":"https://arxiv.org/pdf/2403.10853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17525v2","updated":"2024-03-16T07:56:13Z","published":"2024-02-27T14:07:09Z","title":"Diffusion Model-Based Image Editing: A Survey","summary":" Denoising diffusion models have emerged as a powerful tool for various image\ngeneration and editing tasks, facilitating the synthesis of visual content in\nan unconditional or input-conditional manner. The core idea behind them is\nlearning to reverse the process of gradually adding noise to images, allowing\nthem to generate high-quality samples from a complex distribution. In this\nsurvey, we provide an exhaustive overview of existing methods using diffusion\nmodels for image editing, covering both theoretical and practical aspects in\nthe field. We delve into a thorough analysis and categorization of these works\nfrom multiple perspectives, including learning strategies, user-input\nconditions, and the array of specific editing tasks that can be accomplished.\nIn addition, we pay special attention to image inpainting and outpainting, and\nexplore both earlier traditional context-driven and current multimodal\nconditional methods, offering a comprehensive analysis of their methodologies.\nTo further evaluate the performance of text-guided image editing algorithms, we\npropose a systematic benchmark, EditEval, featuring an innovative metric, LMM\nScore. Finally, we address current limitations and envision some potential\ndirections for future research. The accompanying repository is released at\nhttps://github.com/SiatMMLab/Awesome-Diffusion-Model-Based-Image-Editing-Methods.\n","authors":["Yi Huang","Jiancheng Huang","Yifan Liu","Mingfu Yan","Jiaxi Lv","Jianzhuang Liu","Wei Xiong","He Zhang","Shifeng Chen","Liangliang Cao"],"pdf_url":"https://arxiv.org/pdf/2402.17525v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06104v2","updated":"2024-03-16T07:34:05Z","published":"2024-03-10T06:15:42Z","title":"Universal Debiased Editing on Foundation Models for Fair Medical Image\n Classification","summary":" In the era of Foundation Models' (FMs) rising prominence in AI, our study\naddresses the challenge of biases in medical images while using FM API,\nparticularly spurious correlations between pixels and sensitive attributes.\nTraditional methods for bias mitigation face limitations due to the restricted\naccess to web-hosted FMs and difficulties in addressing the underlying bias\nencoded within the FM API. We propose an U(niversal) D(ebiased) E(diting)\nstrategy, termed UDE, which generates UDE noise to mask such spurious\ncorrelation. UDE is capable of mitigating bias both within the FM API embedding\nand the images themselves. Furthermore, UDE is suitable for both white-box and\nblack-box FM APIs, where we introduced G(reedy) (Z)eroth-O(rder) (GeZO)\noptimization for it when the gradient is inaccessible in black-box APIs. 
Our\nwhole pipeline enables fairness-aware image editing that can be applied across\nvarious medical contexts without requiring direct model manipulation or\nsignificant computational resources. Our empirical results demonstrate the\nmethod's effectiveness in maintaining fairness and utility across different\npatient groups and diseases. In the era of AI-driven medicine, this work\ncontributes to making healthcare diagnostics more equitable, showcasing a\npractical solution for bias mitigation in pre-trained image FMs.\n","authors":["Ruinan Jin","Wenlong Deng","Minghui Chen","Xiaoxiao Li"],"pdf_url":"https://arxiv.org/pdf/2403.06104v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10840v1","updated":"2024-03-16T07:26:50Z","published":"2024-03-16T07:26:50Z","title":"MSI-NeRF: Linking Omni-Depth with View Synthesis through Multi-Sphere\n Image aided Generalizable Neural Radiance Field","summary":" Panoramic observation using fisheye cameras is significant in robot\nperception, reconstruction, and remote operation. However, panoramic images\nsynthesized by traditional methods lack depth information and can only provide\nthree degrees-of-freedom (3DoF) rotation rendering in virtual reality\napplications. To fully preserve and exploit the parallax information within the\noriginal fisheye cameras, we introduce MSI-NeRF, which combines deep learning\nomnidirectional depth estimation and novel view rendering. We first construct a\nmulti-sphere image as a cost volume through feature extraction and warping of\nthe input images. It is then processed by geometry and appearance decoders,\nrespectively. Unlike methods that regress depth maps directly, we further build\nan implicit radiance field using spatial points and interpolated 3D feature\nvectors as input. In this way, we can simultaneously realize omnidirectional\ndepth estimation and 6DoF view synthesis. Our method is trained in a\nsemi-self-supervised manner. It does not require target view images and only\nuses depth data for supervision. Our network has the generalization ability to\nreconstruct unknown scenes efficiently using only four images. Experimental\nresults show that our method outperforms existing methods in depth estimation\nand novel view synthesis tasks.\n","authors":["Dongyu Yan","Guanyu Huang","Fengyu Quan","Haoyao Chen"],"pdf_url":"https://arxiv.org/pdf/2403.10840v1.pdf","comment":"8 pages, 7 figures, Submitted to IEEE/RSJ International Conference on\n Intelligent Robots and Systems 2024"},{"id":"http://arxiv.org/abs/2310.00390v3","updated":"2024-03-16T07:21:34Z","published":"2023-09-30T14:26:43Z","title":"InstructCV: Instruction-Tuned Text-to-Image Diffusion Models as Vision\n Generalists","summary":" Recent advances in generative diffusion models have enabled text-controlled\nsynthesis of realistic and diverse images with impressive quality. Despite\nthese remarkable advances, the application of text-to-image generative models\nin computer vision for standard visual recognition tasks remains limited. The\ncurrent de facto approach for these tasks is to design model architectures and\nloss functions that are tailored to the task at hand. In this paper, we develop\na unified language interface for computer vision tasks that abstracts away\ntask-specific design choices and enables task execution by following natural\nlanguage instructions. Our approach involves casting multiple computer vision\ntasks as text-to-image generation problems. 
Here, the text represents an\ninstruction describing the task, and the resulting image is a visually-encoded\ntask output. To train our model, we pool commonly-used computer vision datasets\ncovering a range of tasks, including segmentation, object detection, depth\nestimation, and classification. We then use a large language model to\nparaphrase prompt templates that convey the specific tasks to be conducted on\neach image, and through this process, we create a multi-modal and multi-task\ntraining dataset comprising input and output images along with annotated\ninstructions. Following the InstructPix2Pix architecture, we apply\ninstruction-tuning to a text-to-image diffusion model using our constructed\ndataset, steering its functionality from a generative model to an\ninstruction-guided multi-task vision learner. Experiments demonstrate that our\nmodel, dubbed InstructCV, performs competitively compared to other generalist\nand task-specific vision models. Moreover, it exhibits compelling\ngeneralization capabilities to unseen data, categories, and user instructions.\n","authors":["Yulu Gan","Sungwoo Park","Alexander Schubert","Anthony Philippakis","Ahmed M. Alaa"],"pdf_url":"https://arxiv.org/pdf/2310.00390v3.pdf","comment":"ICLR 2024; Code is available at https://github.com/AlaaLab/InstructCV"},{"id":"http://arxiv.org/abs/2403.10834v1","updated":"2024-03-16T07:05:47Z","published":"2024-03-16T07:05:47Z","title":"SF(DA)$^2$: Source-free Domain Adaptation Through the Lens of Data\n Augmentation","summary":" In the face of the deep learning model's vulnerability to domain shift,\nsource-free domain adaptation (SFDA) methods have been proposed to adapt models\nto new, unseen target domains without requiring access to source domain data.\nAlthough the potential benefits of applying data augmentation to SFDA are\nattractive, several challenges arise such as the dependence on prior knowledge\nof class-preserving transformations and the increase in memory and\ncomputational requirements. In this paper, we propose Source-free Domain\nAdaptation Through the Lens of Data Augmentation (SF(DA)$^2$), a novel approach\nthat leverages the benefits of data augmentation without suffering from these\nchallenges. We construct an augmentation graph in the feature space of the\npretrained model using the neighbor relationships between target features and\npropose spectral neighborhood clustering to identify partitions in the\nprediction space. Furthermore, we propose implicit feature augmentation and\nfeature disentanglement as regularization loss functions that effectively\nutilize class semantic information within the feature space. These regularizers\nsimulate the inclusion of an unlimited number of augmented target features into\nthe augmentation graph while minimizing computational and memory demands. Our\nmethod shows superior adaptation performance in SFDA scenarios, including 2D\nimage and 3D point cloud datasets and a highly imbalanced dataset.\n","authors":["Uiwon Hwang","Jonghyun Lee","Juhyeon Shin","Sungroh Yoon"],"pdf_url":"https://arxiv.org/pdf/2403.10834v1.pdf","comment":"ICLR 2024. 
Code: https://github.com/shinyflight/SFDA2"},{"id":"http://arxiv.org/abs/2403.10831v1","updated":"2024-03-16T06:49:32Z","published":"2024-03-16T06:49:32Z","title":"DUE: Dynamic Uncertainty-Aware Explanation Supervision via 3D Imputation","summary":" Explanation supervision aims to enhance deep learning models by integrating\nadditional signals to guide the generation of model explanations, showcasing\nnotable improvements in both the predictability and explainability of the\nmodel. However, the application of explanation supervision to\nhigher-dimensional data, such as 3D medical images, remains an under-explored\ndomain. Challenges associated with supervising visual explanations in the\npresence of an additional dimension include: 1) spatial correlation changed, 2)\nlack of direct 3D annotations, and 3) uncertainty varies across different parts\nof the explanation. To address these challenges, we propose a Dynamic\nUncertainty-aware Explanation supervision (DUE) framework for 3D explanation\nsupervision that ensures uncertainty-aware explanation guidance when dealing\nwith sparsely annotated 3D data with diffusion-based 3D interpolation. Our\nproposed framework is validated through comprehensive experiments on diverse\nreal-world medical imaging datasets. The results demonstrate the effectiveness\nof our framework in enhancing the predictability and explainability of deep\nlearning models in the context of medical imaging diagnosis applications.\n","authors":["Qilong Zhao","Yifei Zhang","Mengdan Zhu","Siyi Gu","Yuyang Gao","Xiaofeng Yang","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.10831v1.pdf","comment":"9 pages,6 figures"},{"id":"http://arxiv.org/abs/2403.10830v1","updated":"2024-03-16T06:48:33Z","published":"2024-03-16T06:48:33Z","title":"View-Centric Multi-Object Tracking with Homographic Matching in Moving\n UAV","summary":" In this paper, we address the challenge of multi-object tracking (MOT) in\nmoving Unmanned Aerial Vehicle (UAV) scenarios, where irregular flight\ntrajectories, such as hovering, turning left/right, and moving up/down, lead to\nsignificantly greater complexity compared to fixed-camera MOT. Specifically,\nchanges in the scene background not only render traditional frame-to-frame\nobject IOU association methods ineffective but also introduce significant view\nshifts in the objects, which complicates tracking. To overcome these issues, we\npropose a novel universal HomView-MOT framework, which for the first time,\nharnesses the view Homography inherent in changing scenes to solve MOT\nchallenges in moving environments, incorporating Homographic Matching and\nView-Centric concepts. We introduce a Fast Homography Estimation (FHE)\nalgorithm for rapid computation of Homography matrices between video frames,\nenabling object View-Centric ID Learning (VCIL) and leveraging multi-view\nHomography to learn cross-view ID features. Concurrently, our Homographic\nMatching Filter (HMF) maps object bounding boxes from different frames onto a\ncommon view plane for a more realistic physical IOU association. 
Extensive\nexperiments have proven that these innovations allow HomView-MOT to achieve\nstate-of-the-art performance on prominent UAV MOT datasets VisDrone and UAVDT.\n","authors":["Deyi Ji","Siqi Gao","Lanyun Zhu","Yiru Zhao","Peng Xu","Hongtao Lu","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.10830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10826v1","updated":"2024-03-16T06:26:52Z","published":"2024-03-16T06:26:52Z","title":"Exploring Learning-based Motion Models in Multi-Object Tracking","summary":" In the field of multi-object tracking (MOT), traditional methods often rely\non the Kalman Filter for motion prediction, leveraging its strengths in linear\nmotion scenarios. However, the inherent limitations of these methods become\nevident when confronted with complex, nonlinear motions and occlusions\nprevalent in dynamic environments like sports and dance. This paper explores\nthe possibilities of replacing the Kalman Filter with various learning-based\nmotion model that effectively enhances tracking accuracy and adaptability\nbeyond the constraints of Kalman Filter-based systems. In this paper, we\nproposed MambaTrack, an online motion-based tracker that outperforms all\nexisting motion-based trackers on the challenging DanceTrack and SportsMOT\ndatasets. Moreover, we further exploit the potential of the state-space-model\nin trajectory feature extraction to boost the tracking performance and proposed\nMambaTrack+, which achieves the state-of-the-art performance on DanceTrack\ndataset with 56.1 HOTA and 54.9 IDF1.\n","authors":["Hsiang-Wei Huang","Cheng-Yen Yang","Wenhao Chai","Zhongyu Jiang","Jenq-Neng Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.10826v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10825v1","updated":"2024-03-16T06:26:43Z","published":"2024-03-16T06:26:43Z","title":"Affective Behaviour Analysis via Integrating Multi-Modal Knowledge","summary":" Affective Behavior Analysis aims to facilitate technology emotionally smart,\ncreating a world where devices can understand and react to our emotions as\nhumans do. To comprehensively evaluate the authenticity and applicability of\nemotional behavior analysis techniques in natural environments, the 6th\ncompetition on Affective Behavior Analysis in-the-wild (ABAW) utilizes the\nAff-Wild2, Hume-Vidmimic2, and C-EXPR-DB datasets to set up five competitive\ntracks, i.e., Valence-Arousal (VA) Estimation, Expression (EXPR) Recognition,\nAction Unit (AU) Detection, Compound Expression (CE) Recognition, and Emotional\nMimicry Intensity (EMI) Estimation. In this paper, we present our method\ndesigns for the five tasks. Specifically, our design mainly includes three\naspects: 1) Utilizing a transformer-based feature fusion module to fully\nintegrate emotional information provided by audio signals, visual images, and\ntranscripts, offering high-quality expression features for the downstream\ntasks. 2) To achieve high-quality facial feature representations, we employ\nMasked-Auto Encoder as the visual features extraction model and fine-tune it\nwith our facial dataset. 3) Considering the complexity of the video collection\nscenes, we conduct a more detailed dataset division based on scene\ncharacteristics and train the classifier for each scene. 
Extensive experiments\ndemonstrate the superiority of our designs.\n","authors":["Wei Zhang","Feng Qiu","Chen Liu","Lincheng Li","Heming Du","Tiancheng Guo","Xin Yu"],"pdf_url":"https://arxiv.org/pdf/2403.10825v1.pdf","comment":"11 pages, 1 figure"},{"id":"http://arxiv.org/abs/2403.10823v1","updated":"2024-03-16T06:21:19Z","published":"2024-03-16T06:21:19Z","title":"VisionCLIP: An Med-AIGC based Ethical Language-Image Foundation Model\n for Generalizable Retina Image Analysis","summary":" Generalist foundation model has ushered in newfound capabilities in medical\ndomain. However, the contradiction between the growing demand for high-quality\nannotated data with patient privacy continues to intensify. The utilization of\nmedical artificial intelligence generated content (Med-AIGC) as an\ninexhaustible resource repository arises as a potential solution to address the\naforementioned challenge. Here we harness 1 million open-source synthetic\nfundus images paired with natural language descriptions, to curate an ethical\nlanguage-image foundation model for retina image analysis named VisionCLIP.\nVisionCLIP achieves competitive performance on three external datasets compared\nwith the existing method pre-trained on real-world data in a zero-shot fashion.\nThe employment of artificially synthetic images alongside corresponding textual\ndata for training enables the medical foundation model to successfully\nassimilate knowledge of disease symptomatology, thereby circumventing potential\nbreaches of patient confidentiality.\n","authors":["Hao Wei","Bowen Liu","Minqing Zhang","Peilun Shi","Wu Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.10823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10820v1","updated":"2024-03-16T06:10:22Z","published":"2024-03-16T06:10:22Z","title":"Active Label Correction for Semantic Segmentation with Foundation Models","summary":" Training and validating models for semantic segmentation require datasets\nwith pixel-wise annotations, which are notoriously labor-intensive. Although\nuseful priors such as foundation models or crowdsourced datasets are available,\nthey are error-prone. We hence propose an effective framework of active label\ncorrection (ALC) based on a design of correction query to rectify pseudo labels\nof pixels, which in turn is more annotator-friendly than the standard one\ninquiring to classify a pixel directly according to our theoretical analysis\nand user study. Specifically, leveraging foundation models providing useful\nzero-shot predictions on pseudo labels and superpixels, our method comprises\ntwo key techniques: (i) an annotator-friendly design of correction query with\nthe pseudo labels, and (ii) an acquisition function looking ahead label\nexpansions based on the superpixels. Experimental results on PASCAL,\nCityscapes, and Kvasir-SEG datasets demonstrate the effectiveness of our ALC\nframework, outperforming prior methods for active semantic segmentation and\nlabel correction. 
Notably, utilizing our method, we obtained a revised dataset\nof PASCAL by rectifying errors in 2.6 million pixels in PASCAL dataset.\n","authors":["Hoyoung Kim","Sehyun Hwang","Suha Kwak","Jungseul Ok"],"pdf_url":"https://arxiv.org/pdf/2403.10820v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08247v6","updated":"2024-03-16T05:45:59Z","published":"2023-06-14T05:25:06Z","title":"Diffusion in Diffusion: Cyclic One-Way Diffusion for\n Text-Vision-Conditioned Generation","summary":" Originating from the diffusion phenomenon in physics that describes particle\nmovement, the diffusion generative models inherit the characteristics of\nstochastic random walk in the data space along the denoising trajectory.\nHowever, the intrinsic mutual interference among image regions contradicts the\nneed for practical downstream application scenarios where the preservation of\nlow-level pixel information from given conditioning is desired (e.g.,\ncustomization tasks like personalized generation and inpainting based on a\nuser-provided single image). In this work, we investigate the diffusion\n(physics) in diffusion (machine learning) properties and propose our Cyclic\nOne-Way Diffusion (COW) method to control the direction of diffusion phenomenon\ngiven a pre-trained frozen diffusion model for versatile customization\napplication scenarios, where the low-level pixel information from the\nconditioning needs to be preserved. Notably, unlike most current methods that\nincorporate additional conditions by fine-tuning the base text-to-image\ndiffusion model or learning auxiliary networks, our method provides a novel\nperspective to understand the task needs and is applicable to a wider range of\ncustomization scenarios in a learning-free manner. Extensive experiment results\nshow that our proposed COW can achieve more flexible customization based on\nstrict visual conditions in different application settings. Project page:\nhttps://wangruoyu02.github.io/cow.github.io/.\n","authors":["Ruoyu Wang","Yongqi Yang","Zhihao Qian","Ye Zhu","Yu Wu"],"pdf_url":"https://arxiv.org/pdf/2306.08247v6.pdf","comment":"Accepted by ICLR2024, 21 pages with 18 figures"},{"id":"http://arxiv.org/abs/2311.06792v2","updated":"2024-03-16T05:33:09Z","published":"2023-11-12T10:03:32Z","title":"IMPUS: Image Morphing with Perceptually-Uniform Sampling Using Diffusion\n Models","summary":" We present a diffusion-based image morphing approach with\nperceptually-uniform sampling (IMPUS) that produces smooth, direct and\nrealistic interpolations given an image pair. The embeddings of two images may\nlie on distinct conditioned distributions of a latent diffusion model,\nespecially when they have significant semantic difference. To bridge this gap,\nwe interpolate in the locally linear and continuous text embedding space and\nGaussian latent space. We first optimize the endpoint text embeddings and then\nmap the images to the latent space using a probability flow ODE. Unlike\nexisting work that takes an indirect morphing path, we show that the model\nadaptation yields a direct path and suppresses ghosting artifacts in the\ninterpolated images. To achieve this, we propose a heuristic bottleneck\nconstraint based on a novel relative perceptual path diversity score that\nautomatically controls the bottleneck size and balances the diversity along the\npath with its directness. 
We also propose a perceptually-uniform sampling\ntechnique that enables visually smooth changes between the interpolated images.\nExtensive experiments validate that our IMPUS can achieve smooth, direct, and\nrealistic image morphing and is adaptable to several other generative tasks.\n","authors":["Zhaoyuan Yang","Zhengyang Yu","Zhiwei Xu","Jaskirat Singh","Jing Zhang","Dylan Campbell","Peter Tu","Richard Hartley"],"pdf_url":"https://arxiv.org/pdf/2311.06792v2.pdf","comment":"Published as a conference paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2403.10815v1","updated":"2024-03-16T05:22:16Z","published":"2024-03-16T05:22:16Z","title":"MicroDiffusion: Implicit Representation-Guided Diffusion for 3D\n Reconstruction from Limited 2D Microscopy Projections","summary":" Volumetric optical microscopy using non-diffracting beams enables rapid\nimaging of 3D volumes by projecting them axially to 2D images but lacks crucial\ndepth information. Addressing this, we introduce MicroDiffusion, a pioneering\ntool facilitating high-quality, depth-resolved 3D volume reconstruction from\nlimited 2D projections. While existing Implicit Neural Representation (INR)\nmodels often yield incomplete outputs and Denoising Diffusion Probabilistic\nModels (DDPM) excel at capturing details, our method integrates INR's\nstructural coherence with DDPM's fine-detail enhancement capabilities. We\npretrain an INR model to transform 2D axially-projected images into a\npreliminary 3D volume. This pretrained INR acts as a global prior guiding\nDDPM's generative process through a linear interpolation between INR outputs\nand noise inputs. This strategy enriches the diffusion process with structured\n3D information, enhancing detail and reducing noise in localized 2D images. By\nconditioning the diffusion model on the closest 2D projection, MicroDiffusion\nsubstantially enhances fidelity in resulting 3D reconstructions, surpassing INR\nand standard DDPM outputs with unparalleled image quality and structural\nfidelity. Our code and dataset are available at\nhttps://github.com/UCSC-VLAA/MicroDiffusion.\n","authors":["Mude Hui","Zihao Wei","Hongru Zhu","Fei Xia","Yuyin Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.10815v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.10814v1","updated":"2024-03-16T05:21:42Z","published":"2024-03-16T05:21:42Z","title":"DarkGS: Learning Neural Illumination and 3D Gaussians Relighting for\n Robotic Exploration in the Dark","summary":" Humans have the remarkable ability to construct consistent mental models of\nan environment, even under limited or varying levels of illumination. We wish\nto endow robots with this same capability. In this paper, we tackle the\nchallenge of constructing a photorealistic scene representation under poorly\nilluminated conditions and with a moving light source. We approach the task of\nmodeling illumination as a learning problem, and utilize the developed\nillumination model to aid in scene reconstruction. We introduce an innovative\nframework that uses a data-driven approach, Neural Light Simulators (NeLiS), to\nmodel and calibrate the camera-light system. Furthermore, we present DarkGS, a\nmethod that applies NeLiS to create a relightable 3D Gaussian scene model\ncapable of real-time, photorealistic rendering from novel viewpoints. 
We show\nthe applicability and robustness of our proposed simulator and system in a\nvariety of real-world environments.\n","authors":["Tianyi Zhang","Kaining Huang","Weiming Zhi","Matthew Johnson-Roberson"],"pdf_url":"https://arxiv.org/pdf/2403.10814v1.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2403.10805v1","updated":"2024-03-16T04:40:10Z","published":"2024-03-16T04:40:10Z","title":"Speech-driven Personalized Gesture Synthetics: Harnessing Automatic\n Fuzzy Feature Inference","summary":" Speech-driven gesture generation is an emerging field within virtual human\ncreation. However, a significant challenge lies in accurately determining and\nprocessing the multitude of input features (such as acoustic, semantic,\nemotional, personality, and even subtle unknown features). Traditional\napproaches, reliant on various explicit feature inputs and complex multimodal\nprocessing, constrain the expressiveness of resulting gestures and limit their\napplicability. To address these challenges, we present Persona-Gestor, a novel\nend-to-end generative model designed to generate highly personalized 3D\nfull-body gestures solely relying on raw speech audio. The model combines a\nfuzzy feature extractor and a non-autoregressive Adaptive Layer Normalization\n(AdaLN) transformer diffusion architecture. The fuzzy feature extractor\nharnesses a fuzzy inference strategy that automatically infers implicit,\ncontinuous fuzzy features. These fuzzy features, represented as a unified\nlatent feature, are fed into the AdaLN transformer. The AdaLN transformer\nintroduces a conditional mechanism that applies a uniform function across all\ntokens, thereby effectively modeling the correlation between the fuzzy features\nand the gesture sequence. This module ensures a high level of gesture-speech\nsynchronization while preserving naturalness. Finally, we employ the diffusion\nmodel to train and infer various gestures. Extensive subjective and objective\nevaluations on the Trinity, ZEGGS, and BEAT datasets confirm our model's\nsuperior performance to the current state-of-the-art approaches. Persona-Gestor\nimproves the system's usability and generalization capabilities, setting a new\nbenchmark in speech-driven gesture synthesis and broadening the horizon for\nvirtual human technology. Supplementary videos and code can be accessed at\nhttps://zf223669.github.io/Diffmotion-v2-website/\n","authors":["Fan Zhang","Zhaohan Wang","Xin Lyu","Siyuan Zhao","Mengjian Li","Weidong Geng","Naye Ji","Hui Du","Fuxing Gao","Hao Wu","Shunman Li"],"pdf_url":"https://arxiv.org/pdf/2403.10805v1.pdf","comment":"12 pages,"},{"id":"http://arxiv.org/abs/2403.10803v1","updated":"2024-03-16T04:35:04Z","published":"2024-03-16T04:35:04Z","title":"Enhancing Out-of-Distribution Detection with Multitesting-based\n Layer-wise Feature Fusion","summary":" Deploying machine learning in open environments presents the challenge of\nencountering diverse test inputs that differ significantly from the training\ndata. These out-of-distribution samples may exhibit shifts in local or global\nfeatures compared to the training distribution. The machine learning (ML)\ncommunity has responded with a number of methods aimed at distinguishing\nanomalous inputs from original training data. However, the majority of previous\nstudies have primarily focused on the output layer or penultimate layer of\npre-trained deep neural networks. 
In this paper, we propose a novel framework,\nMultitesting-based Layer-wise Out-of-Distribution (OOD) Detection (MLOD), to\nidentify distributional shifts in test samples at different levels of features\nthrough rigorous multiple testing procedure. Our approach distinguishes itself\nfrom existing methods as it does not require modifying the structure or\nfine-tuning of the pre-trained classifier. Through extensive experiments, we\ndemonstrate that our proposed framework can seamlessly integrate with any\nexisting distance-based inspection method while efficiently utilizing feature\nextractors of varying depths. Our scheme effectively enhances the performance\nof out-of-distribution detection when compared to baseline methods. In\nparticular, MLOD-Fisher achieves superior performance in general. When trained\nusing KNN on CIFAR10, MLOD-Fisher significantly lowers the false positive rate\n(FPR) from 24.09% to 7.47% on average compared to merely utilizing the features\nof the last layer.\n","authors":["Jiawei Li","Sitong Li","Shanshan Wang","Yicheng Zeng","Falong Tan","Chuanlong Xie"],"pdf_url":"https://arxiv.org/pdf/2403.10803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10801v1","updated":"2024-03-16T04:23:46Z","published":"2024-03-16T04:23:46Z","title":"Securely Fine-tuning Pre-trained Encoders Against Adversarial Examples","summary":" With the evolution of self-supervised learning, the pre-training paradigm has\nemerged as a predominant solution within the deep learning landscape. Model\nproviders furnish pre-trained encoders designed to function as versatile\nfeature extractors, enabling downstream users to harness the benefits of\nexpansive models with minimal effort through fine-tuning. Nevertheless, recent\nworks have exposed a vulnerability in pre-trained encoders, highlighting their\nsusceptibility to downstream-agnostic adversarial examples (DAEs) meticulously\ncrafted by attackers. The lingering question pertains to the feasibility of\nfortifying the robustness of downstream models against DAEs, particularly in\nscenarios where the pre-trained encoders are publicly accessible to the\nattackers.\n In this paper, we initially delve into existing defensive mechanisms against\nadversarial examples within the pre-training paradigm. Our findings reveal that\nthe failure of current defenses stems from the domain shift between\npre-training data and downstream tasks, as well as the sensitivity of encoder\nparameters. In response to these challenges, we propose Genetic\nEvolution-Nurtured Adversarial Fine-tuning (Gen-AF), a two-stage adversarial\nfine-tuning approach aimed at enhancing the robustness of downstream models.\nOur extensive experiments, conducted across ten self-supervised training\nmethods and six datasets, demonstrate that Gen-AF attains high testing accuracy\nand robust testing accuracy against state-of-the-art DAEs.\n","authors":["Ziqi Zhou","Minghui Li","Wei Liu","Shengshan Hu","Yechao Zhang","Wei Wan","Lulu Xue","Leo Yu Zhang","Dezhong Yang","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2403.10801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15698v3","updated":"2024-03-16T04:21:44Z","published":"2023-03-28T03:00:28Z","title":"TFS-ViT: Token-Level Feature Stylization for Domain Generalization","summary":" Standard deep learning models such as convolutional neural networks (CNNs)\nlack the ability of generalizing to domains which have not been seen during\ntraining. 
This problem is mainly due to the common but often wrong assumption\nof such models that the source and target data come from the same i.i.d.\ndistribution. Recently, Vision Transformers (ViTs) have shown outstanding\nperformance for a broad range of computer vision tasks. However, very few\nstudies have investigated their ability to generalize to new domains. This\npaper presents a first Token-level Feature Stylization (TFS-ViT) approach for\ndomain generalization, which improves the performance of ViTs to unseen data by\nsynthesizing new domains. Our approach transforms token features by mixing the\nnormalization statistics of images from different domains. We further improve\nthis approach with a novel strategy for attention-aware stylization, which uses\nthe attention maps of class (CLS) tokens to compute and mix normalization\nstatistics of tokens corresponding to different image regions. The proposed\nmethod is flexible to the choice of backbone model and can be easily applied to\nany ViT-based architecture with a negligible increase in computational\ncomplexity. Comprehensive experiments show that our approach is able to achieve\nstate-of-the-art performance on five challenging benchmarks for domain\ngeneralization, and demonstrate its ability to deal with different types of\ndomain shifts. The implementation is available at:\nhttps://github.com/Mehrdad-Noori/TFS-ViT_Token-level_Feature_Stylization.\n","authors":["Mehrdad Noori","Milad Cheraghalikhani","Ali Bahri","Gustavo A. Vargas Hakim","David Osowiechi","Ismail Ben Ayed","Christian Desrosiers"],"pdf_url":"https://arxiv.org/pdf/2303.15698v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10798v1","updated":"2024-03-16T04:01:50Z","published":"2024-03-16T04:01:50Z","title":"Unsupervised Collaborative Metric Learning with Mixed-Scale Groups for\n General Object Retrieval","summary":" The task of searching for visual objects in a large image dataset is\ndifficult because it requires efficient matching and accurate localization of\nobjects that can vary in size. Although the segment anything model (SAM) offers\na potential solution for extracting object spatial context, learning embeddings\nfor local objects remains a challenging problem. This paper presents a novel\nunsupervised deep metric learning approach, termed unsupervised collaborative\nmetric learning with mixed-scale groups (MS-UGCML), devised to learn embeddings\nfor objects of varying scales. Following this, a benchmark of challenges is\nassembled by utilizing COCO 2017 and VOC 2007 datasets to facilitate the\ntraining and evaluation of general object retrieval models. Finally, we conduct\ncomprehensive ablation studies and discuss the complexities faced within the\ndomain of general object retrieval. Our object retrieval evaluations span a\nrange of datasets, including BelgaLogos, Visual Genome, LVIS, in addition to a\nchallenging evaluation set that we have individually assembled for\nopen-vocabulary evaluation. 
These comprehensive evaluations effectively\nhighlight the robustness of our unsupervised MS-UGCML approach, with an object\nlevel and image level mAPs improvement of up to 6.69% and 10.03%, respectively.\nThe code is publicly available at https://github.com/dengyuhai/MS-UGCML.\n","authors":["Shichao Kan","Yuhai Deng","Yixiong Liang","Lihui Cen","Zhe Qu","Yigang Cen","Zhihai He"],"pdf_url":"https://arxiv.org/pdf/2403.10798v1.pdf","comment":"13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2401.16694v2","updated":"2024-03-16T03:55:50Z","published":"2024-01-30T02:41:05Z","title":"EdgeOL: Efficient in-situ Online Learning on Edge Devices","summary":" Emerging applications, such as robot-assisted eldercare and object\nrecognition, generally employ deep learning neural networks (DNNs) and\nnaturally require: i) handling streaming-in inference requests and ii) adapting\nto possible deployment scenario changes. Online model fine-tuning is widely\nadopted to satisfy these needs. However, an inappropriate fine-tuning scheme\ncould involve significant energy consumption, making it challenging to deploy\non edge devices. In this paper, we propose EdgeOL, an edge online learning\nframework that optimizes inference accuracy, fine-tuning execution time, and\nenergy efficiency through both inter-tuning and intra-tuning optimizations.\nExperimental results show that, on average, EdgeOL reduces overall fine-tuning\nexecution time by 64%, energy consumption by 52%, and improves average\ninference accuracy by 1.75% over the immediate online learning strategy.\n","authors":["Sheng Li","Geng Yuan","Yawen Wu","Yue Dai","Chao Wu","Alex K. Jones","Jingtong Hu","Yanzhi Wang","Xulong Tang"],"pdf_url":"https://arxiv.org/pdf/2401.16694v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10786v1","updated":"2024-03-16T03:33:52Z","published":"2024-03-16T03:33:52Z","title":"ContourDiff: Unpaired Image Translation with Contour-Guided Diffusion\n Models","summary":" Accurately translating medical images across different modalities (e.g., CT\nto MRI) has numerous downstream clinical and machine learning applications.\nWhile several methods have been proposed to achieve this, they often prioritize\nperceptual quality with respect to output domain features over preserving\nanatomical fidelity. However, maintaining anatomy during translation is\nessential for many tasks, e.g., when leveraging masks from the input domain to\ndevelop a segmentation model with images translated to the output domain. To\naddress these challenges, we propose ContourDiff, a novel framework that\nleverages domain-invariant anatomical contour representations of images. These\nrepresentations are simple to extract from images, yet form precise spatial\nconstraints on their anatomical content. We introduce a diffusion model that\nconverts contour representations of images from arbitrary input domains into\nimages in the output domain of interest. By applying the contour as a\nconstraint at every diffusion sampling step, we ensure the preservation of\nanatomical content. We evaluate our method by training a segmentation model on\nimages translated from CT to MRI with their original CT masks and testing its\nperformance on real MRIs. Our method outperforms other unpaired image\ntranslation methods by a significant margin, furthermore without the need to\naccess any input domain information during training.\n","authors":["Yuwen Chen","Nicholas Konz","Hanxue Gu","Haoyu Dong","Yaqian Chen","Lin Li","Jisoo Lee","Maciej A. 
Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2403.10786v1.pdf","comment":"Code will be released on GitHub"},{"id":"http://arxiv.org/abs/2403.10783v1","updated":"2024-03-16T03:05:07Z","published":"2024-03-16T03:05:07Z","title":"StableGarment: Garment-Centric Generation via Stable Diffusion","summary":" In this paper, we introduce StableGarment, a unified framework to tackle\ngarment-centric(GC) generation tasks, including GC text-to-image, controllable\nGC text-to-image, stylized GC text-to-image, and robust virtual try-on. The\nmain challenge lies in retaining the intricate textures of the garment while\nmaintaining the flexibility of pre-trained Stable Diffusion. Our solution\ninvolves the development of a garment encoder, a trainable copy of the\ndenoising UNet equipped with additive self-attention (ASA) layers. These ASA\nlayers are specifically devised to transfer detailed garment textures, also\nfacilitating the integration of stylized base models for the creation of\nstylized images. Furthermore, the incorporation of a dedicated try-on\nControlNet enables StableGarment to execute virtual try-on tasks with\nprecision. We also build a novel data engine that produces high-quality\nsynthesized data to preserve the model's ability to follow prompts. Extensive\nexperiments demonstrate that our approach delivers state-of-the-art (SOTA)\nresults among existing virtual try-on methods and exhibits high flexibility\nwith broad potential applications in various garment-centric image generation.\n","authors":["Rui Wang","Hailong Guo","Jiaming Liu","Huaxia Li","Haibo Zhao","Xu Tang","Yao Hu","Hao Tang","Peipei Li"],"pdf_url":"https://arxiv.org/pdf/2403.10783v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10782v1","updated":"2024-03-16T03:03:27Z","published":"2024-03-16T03:03:27Z","title":"Bidirectional Multi-Step Domain Generalization for Visible-Infrared\n Person Re-Identification","summary":" A key challenge in visible-infrared person re-identification (V-I ReID) is\ntraining a backbone model capable of effectively addressing the significant\ndiscrepancies across modalities. State-of-the-art methods that generate a\nsingle intermediate bridging domain are often less effective, as this generated\ndomain may not adequately capture sufficient common discriminant information.\nThis paper introduces the Bidirectional Multi-step Domain Generalization\n(BMDG), a novel approach for unifying feature representations across diverse\nmodalities. BMDG creates multiple virtual intermediate domains by finding and\naligning body part features extracted from both I and V modalities. Indeed,\nBMDG aims to reduce the modality gaps in two steps. First, it aligns modalities\nin feature space by learning shared and modality-invariant body part prototypes\nfrom V and I images. Then, it generalizes the feature representation by\napplying bidirectional multi-step learning, which progressively refines feature\nrepresentations in each step and incorporates more prototypes from both\nmodalities. In particular, our method minimizes the cross-modal gap by\nidentifying and aligning shared prototypes that capture key discriminative\nfeatures across modalities, then uses multiple bridging steps based on this\ninformation to enhance the feature representation. 
Experiments conducted on\nchallenging V-I ReID datasets indicate that our BMDG approach outperforms\nstate-of-the-art part-based models or methods that generate an intermediate\ndomain from V-I person ReID.\n","authors":["Mahdi Alehdaghi","Pourya Shamsolmoali","Rafael M. O. Cruz","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2403.10782v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07589v2","updated":"2024-03-16T02:55:14Z","published":"2024-03-12T12:19:05Z","title":"PeLK: Parameter-efficient Large Kernel ConvNets with Peripheral\n Convolution","summary":" Recently, some large kernel convnets strike back with appealing performance\nand efficiency. However, given the square complexity of convolution, scaling up\nkernels can bring about an enormous amount of parameters and the proliferated\nparameters can induce severe optimization problem. Due to these issues, current\nCNNs compromise to scale up to 51x51 in the form of stripe convolution (i.e.,\n51x5 + 5x51) and start to saturate as the kernel size continues growing. In\nthis paper, we delve into addressing these vital issues and explore whether we\ncan continue scaling up kernels for more performance gains. Inspired by human\nvision, we propose a human-like peripheral convolution that efficiently reduces\nover 90% parameter count of dense grid convolution through parameter sharing,\nand manage to scale up kernel size to extremely large. Our peripheral\nconvolution behaves highly similar to human, reducing the complexity of\nconvolution from O(K^2) to O(logK) without backfiring performance. Built on\nthis, we propose Parameter-efficient Large Kernel Network (PeLK). Our PeLK\noutperforms modern vision Transformers and ConvNet architectures like Swin,\nConvNeXt, RepLKNet and SLaK on various vision tasks including ImageNet\nclassification, semantic segmentation on ADE20K and object detection on MS\nCOCO. For the first time, we successfully scale up the kernel size of CNNs to\nan unprecedented 101x101 and demonstrate consistent improvements.\n","authors":["Honghao Chen","Xiangxiang Chu","Yongjian Ren","Xin Zhao","Kaiqi Huang"],"pdf_url":"https://arxiv.org/pdf/2403.07589v2.pdf","comment":"CVPR 2024; Modification for Fig.1(b); Add Acknowledgements"},{"id":"http://arxiv.org/abs/2403.10780v1","updated":"2024-03-16T02:54:49Z","published":"2024-03-16T02:54:49Z","title":"Segment Any Object Model (SAOM): Real-to-Simulation Fine-Tuning Strategy\n for Multi-Class Multi-Instance Segmentation","summary":" Multi-class multi-instance segmentation is the task of identifying masks for\nmultiple object classes and multiple instances of the same class within an\nimage. The foundational Segment Anything Model (SAM) is designed for promptable\nmulti-class multi-instance segmentation but tends to output part or sub-part\nmasks in the \"everything\" mode for various real-world applications. Whole\nobject segmentation masks play a crucial role for indoor scene understanding,\nespecially in robotics applications. We propose a new domain invariant\nReal-to-Simulation (Real-Sim) fine-tuning strategy for SAM. We use object\nimages and ground truth data collected from Ai2Thor simulator during\nfine-tuning (real-to-sim). To allow our Segment Any Object Model (SAOM) to work\nin the \"everything\" mode, we propose the novel nearest neighbour assignment\nmethod, updating point embeddings for each ground-truth mask. SAOM is evaluated\non our own dataset collected from Ai2Thor simulator. 
SAOM significantly\nimproves on SAM, with a 28% increase in mIoU and a 25% increase in mAcc for 54\nfrequently-seen indoor object classes. Moreover, our Real-to-Simulation\nfine-tuning strategy demonstrates promising generalization performance in real\nenvironments without being trained on the real-world data (sim-to-real). The\ndataset and the code will be released after publication.\n","authors":["Mariia Khan","Yue Qiu","Yuren Cong","Jumana Abu-Khalaf","David Suter","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2403.10780v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10778v1","updated":"2024-03-16T02:45:42Z","published":"2024-03-16T02:45:42Z","title":"HCF-Net: Hierarchical Context Fusion Network for Infrared Small Object\n Detection","summary":" Infrared small object detection is an important computer vision task\ninvolving the recognition and localization of tiny objects in infrared images,\nwhich usually contain only a few pixels. However, it encounters difficulties\ndue to the diminutive size of the objects and the generally complex backgrounds\nin infrared images. In this paper, we propose a deep learning method, HCF-Net,\nthat significantly improves infrared small object detection performance through\nmultiple practical modules. Specifically, it includes the parallelized\npatch-aware attention (PPA) module, dimension-aware selective integration\n(DASI) module, and multi-dilated channel refiner (MDCR) module. The PPA module\nuses a multi-branch feature extraction strategy to capture feature information\nat different scales and levels. The DASI module enables adaptive channel\nselection and fusion. The MDCR module captures spatial features of different\nreceptive field ranges through multiple depth-separable convolutional layers.\nExtensive experimental results on the SIRST infrared single-frame image dataset\nshow that the proposed HCF-Net performs well, surpassing other traditional and\ndeep learning models. Code is available at\nhttps://github.com/zhengshuchen/HCFNet.\n","authors":["Shibiao Xu","ShuChen Zheng","Wenhao Xu","Rongtao Xu","Changwei Wang","Jiguang Zhang","Xiaoqiang Teng","Ao Li","Li Guo"],"pdf_url":"https://arxiv.org/pdf/2403.10778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04054v2","updated":"2024-03-16T02:33:26Z","published":"2023-07-08T22:21:23Z","title":"Deep Unsupervised Learning Using Spike-Timing-Dependent Plasticity","summary":" Spike-Timing-Dependent Plasticity (STDP) is an unsupervised learning\nmechanism for Spiking Neural Networks (SNNs) that has received significant\nattention from the neuromorphic hardware community. However, scaling such local\nlearning techniques to deeper networks and large-scale tasks has remained\nelusive. In this work, we investigate a Deep-STDP framework where a rate-based\nconvolutional network, that can be deployed in a neuromorphic setting, is\ntrained in tandem with pseudo-labels generated by the STDP clustering process\non the network outputs. 
We achieve $24.56\\%$ higher accuracy and $3.5\\times$\nfaster convergence speed at iso-accuracy on a 10-class subset of the Tiny\nImageNet dataset in contrast to a $k$-means clustering approach.\n","authors":["Sen Lu","Abhronil Sengupta"],"pdf_url":"https://arxiv.org/pdf/2307.04054v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10773v1","updated":"2024-03-16T02:22:10Z","published":"2024-03-16T02:22:10Z","title":"DPPE: Dense Pose Estimation in a Plenoxels Environment using Gradient\n Approximation","summary":" We present DPPE, a dense pose estimation algorithm that functions over a\nPlenoxels environment. Recent advances in neural radiance field techniques have\nshown that it is a powerful tool for environment representation. More recent\nneural rendering algorithms have significantly improved both training duration\nand rendering speed. Plenoxels introduced a fully-differentiable radiance field\ntechnique that uses Plenoptic volume elements contained in voxels for\nrendering, offering reduced training times and better rendering accuracy, while\nalso eliminating the neural net component. In this work, we introduce a 6-DoF\nmonocular RGB-only pose estimation procedure for Plenoxels, which seeks to\nrecover the ground truth camera pose after a perturbation. We employ a\nvariation on classical template matching techniques, using stochastic gradient\ndescent to optimize the pose by minimizing errors in re-rendering. In\nparticular, we examine an approach that takes advantage of the rapid rendering\nspeed of Plenoxels to numerically approximate part of the pose gradient, using\na central differencing technique. We show that such methods are effective in\npose estimation. Finally, we perform ablations over key components of the\nproblem space, with a particular focus on image subsampling and Plenoxel grid\nresolution. Project website: https://sites.google.com/view/dppe\n","authors":["Christopher Kolios","Yeganeh Bahoo","Sajad Saeedi"],"pdf_url":"https://arxiv.org/pdf/2403.10773v1.pdf","comment":"8 pages, 4 figures, conference"},{"id":"http://arxiv.org/abs/2403.10755v1","updated":"2024-03-16T01:38:28Z","published":"2024-03-16T01:38:28Z","title":"Match-Stereo-Videos: Bidirectional Alignment for Consistent Dynamic\n Stereo Matching","summary":" Dynamic stereo matching is the task of estimating consistent disparities from\nstereo videos with dynamic objects. Recent learning-based methods prioritize\noptimal performance on a single stereo pair, resulting in temporal\ninconsistencies. Existing video methods apply per-frame matching and\nwindow-based cost aggregation across the time dimension, leading to\nlow-frequency oscillations at the scale of the window size. Towards this\nchallenge, we develop a bidirectional alignment mechanism for adjacent frames\nas a fundamental operation. We further propose a novel framework, BiDAStereo,\nthat achieves consistent dynamic stereo matching. Unlike the existing methods,\nwe model this task as local matching and global aggregation. Locally, we\nconsider correlation in a triple-frame manner to pool information from adjacent\nframes and improve the temporal consistency. Globally, to exploit the entire\nsequence's consistency and extract dynamic scene cues for aggregation, we\ndevelop a motion-propagation recurrent unit. 
Extensive experiments demonstrate\nthe performance of our method, showcasing improvements in prediction quality\nand achieving state-of-the-art results on various commonly used benchmarks.\n","authors":["Junpeng Jing","Ye Mao","Krystian Mikolajczyk"],"pdf_url":"https://arxiv.org/pdf/2403.10755v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16512v3","updated":"2024-03-16T01:10:16Z","published":"2023-08-31T07:49:06Z","title":"MVDream: Multi-view Diffusion for 3D Generation","summary":" We introduce MVDream, a diffusion model that is able to generate consistent\nmulti-view images from a given text prompt. Learning from both 2D and 3D data,\na multi-view diffusion model can achieve the generalizability of 2D diffusion\nmodels and the consistency of 3D renderings. We demonstrate that such a\nmulti-view diffusion model is implicitly a generalizable 3D prior agnostic to\n3D representations. It can be applied to 3D generation via Score Distillation\nSampling, significantly enhancing the consistency and stability of existing\n2D-lifting methods. It can also learn new concepts from a few 2D examples, akin\nto DreamBooth, but for 3D generation.\n","authors":["Yichun Shi","Peng Wang","Jianglong Ye","Mai Long","Kejie Li","Xiao Yang"],"pdf_url":"https://arxiv.org/pdf/2308.16512v3.pdf","comment":"Camera-ready version; Our project page is https://MV-Dream.github.io"},{"id":"http://arxiv.org/abs/2312.07460v2","updated":"2024-03-16T00:51:16Z","published":"2023-12-12T17:37:16Z","title":"Empirical Validation of Conformal Prediction for Trustworthy Skin\n Lesions Classification","summary":" Background and objective: Uncertainty quantification is a pivotal field that\ncontributes to realizing reliable and robust systems. It becomes instrumental\nin fortifying safe decisions by providing complementary information,\nparticularly within high-risk applications. existing studies have explored\nvarious methods that often operate under specific assumptions or necessitate\nsubstantial modifications to the network architecture to effectively account\nfor uncertainties. The objective of this paper is to study Conformal\nPrediction, an emerging distribution-free uncertainty quantification technique,\nand provide a comprehensive understanding of the advantages and limitations\ninherent in various methods within the medical imaging field.\n Methods: In this study, we developed Conformal Prediction, Monte Carlo\nDropout, and Evidential Deep Learning approaches to assess uncertainty\nquantification in deep neural networks. The effectiveness of these methods is\nevaluated using three public medical imaging datasets focused on detecting\npigmented skin lesions and blood cell types.\n Results: The experimental results demonstrate a significant enhancement in\nuncertainty quantification with the utilization of the Conformal Prediction\nmethod, surpassing the performance of the other two methods. Furthermore, the\nresults present insights into the effectiveness of each uncertainty method in\nhandling Out-of-Distribution samples from domain-shifted datasets. Our code is\navailable at:\n Conclusions: Our conclusion highlights a robust and consistent performance of\nconformal prediction across diverse testing conditions. 
This positions it as\nthe preferred choice for decision-making in safety-critical applications.\n","authors":["Jamil Fayyad","Shadi Alijani","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2312.07460v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10746v1","updated":"2024-03-16T00:34:25Z","published":"2024-03-16T00:34:25Z","title":"Vector search with small radiuses","summary":" In recent years, the dominant accuracy metric for vector search is the recall\nof a result list of fixed size (top-k retrieval), considering as ground truth\nthe exact vector retrieval results. Although convenient to compute, this metric\nis distantly related to the end-to-end accuracy of a full system that\nintegrates vector search. In this paper we focus on the common case where a\nhard decision needs to be taken depending on the vector retrieval results, for\nexample, deciding whether a query image matches a database image or not. We\nsolve this as a range search task, where all vectors within a certain radius\nfrom the query are returned.\n We show that the value of a range search result can be modeled rigorously\nbased on the query-to-vector distance. This yields a metric for range search,\nRSM, that is both principled and easy to compute without running an end-to-end\nevaluation. We apply this metric to the case of image retrieval. We show that\nindexing methods that are adapted for top-k retrieval do not necessarily\nmaximize the RSM. In particular, for inverted file based indexes, we show that\nvisiting a limited set of clusters and encoding vectors compactly yields near\noptimal results.\n","authors":["Gergely Szilvasy","Pierre-Emmanuel Mazaré","Matthijs Douze"],"pdf_url":"https://arxiv.org/pdf/2403.10746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12098v1","updated":"2024-03-16T01:32:00Z","published":"2024-03-16T01:32:00Z","title":"Deep Generative Design for Mass Production","summary":" Generative Design (GD) has evolved as a transformative design approach,\nemploying advanced algorithms and AI to create diverse and innovative solutions\nbeyond traditional constraints. Despite its success, GD faces significant\nchallenges regarding the manufacturability of complex designs, often\nnecessitating extensive manual modifications due to limitations in standard\nmanufacturing processes and the reliance on additive manufacturing, which is\nnot ideal for mass production. Our research introduces an innovative framework\naddressing these manufacturability concerns by integrating constraints\npertinent to die casting and injection molding into GD, through the utilization\nof 2D depth images. This method simplifies intricate 3D geometries into\nmanufacturable profiles, removing unfeasible features such as\nnon-manufacturable overhangs and allowing for the direct consideration of\nessential manufacturing aspects like thickness and rib design. Consequently,\ndesigns previously unsuitable for mass production are transformed into viable\nsolutions. We further enhance this approach by adopting an advanced 2D\ngenerative model, which offer a more efficient alternative to traditional 3D\nshape generation methods. Our results substantiate the efficacy of this\nframework, demonstrating the production of innovative, and, importantly,\nmanufacturable designs. This shift towards integrating practical manufacturing\nconsiderations into GD represents a pivotal advancement, transitioning from\npurely inspirational concepts to actionable, production-ready solutions. 
Our\nfindings underscore usefulness and potential of GD for broader industry\nadoption, marking a significant step forward in aligning GD with the demands of\nmanufacturing challenges.\n","authors":["Jihoon Kim","Yongmin Kwon","Namwoo Kang"],"pdf_url":"https://arxiv.org/pdf/2403.12098v1.pdf","comment":null}]},"2024-03-15T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.10737v1","updated":"2024-03-15T23:50:18Z","published":"2024-03-15T23:50:18Z","title":"Leveraging Synthetic Data for Generalizable and Fair Facial Action Unit\n Detection","summary":" Facial action unit (AU) detection is a fundamental block for objective facial\nexpression analysis. Supervised learning approaches require a large amount of\nmanual labeling which is costly. The limited labeled data are also not diverse\nin terms of gender which can affect model fairness. In this paper, we propose\nto use synthetically generated data and multi-source domain adaptation (MSDA)\nto address the problems of the scarcity of labeled data and the diversity of\nsubjects. Specifically, we propose to generate a diverse dataset through\nsynthetic facial expression re-targeting by transferring the expressions from\nreal faces to synthetic avatars. Then, we use MSDA to transfer the AU detection\nknowledge from a real dataset and the synthetic dataset to a target dataset.\nInstead of aligning the overall distributions of different domains, we propose\nPaired Moment Matching (PM2) to align the features of the paired real and\nsynthetic data with the same facial expression. To further improve gender\nfairness, PM2 matches the features of the real data with a female and a male\nsynthetic image. Our results indicate that synthetic data and the proposed\nmodel improve both AU detection performance and fairness across genders,\ndemonstrating its potential to solve AU detection in-the-wild.\n","authors":["Liupei Lu","Yufeng Yin","Yuming Gu","Yizhen Wu","Pratusha Prasad","Yajie Zhao","Mohammad Soleymani"],"pdf_url":"https://arxiv.org/pdf/2403.10737v1.pdf","comment":"The work was done in 2021"},{"id":"http://arxiv.org/abs/2403.10731v1","updated":"2024-03-15T23:31:41Z","published":"2024-03-15T23:31:41Z","title":"Giving a Hand to Diffusion Models: a Two-Stage Approach to Improving\n Conditional Human Image Generation","summary":" Recent years have seen significant progress in human image generation,\nparticularly with the advancements in diffusion models. However, existing\ndiffusion methods encounter challenges when producing consistent hand anatomy\nand the generated images often lack precise control over the hand pose. To\naddress this limitation, we introduce a novel approach to pose-conditioned\nhuman image generation, dividing the process into two stages: hand generation\nand subsequent body out-painting around the hands. We propose training the hand\ngenerator in a multi-task setting to produce both hand images and their\ncorresponding segmentation masks, and employ the trained model in the first\nstage of generation. An adapted ControlNet model is then used in the second\nstage to outpaint the body around the generated hands, producing the final\nresult. A novel blending technique is introduced to preserve the hand details\nduring the second stage that combines the results of both stages in a coherent\nway. This involves sequential expansion of the out-painted region while fusing\nthe latent representations, to ensure a seamless and cohesive synthesis of the\nfinal image. 
Experimental evaluations demonstrate the superiority of our\nproposed method over state-of-the-art techniques, in both pose accuracy and\nimage quality, as validated on the HaGRID dataset. Our approach not only\nenhances the quality of the generated hands but also offers improved control\nover hand pose, advancing the capabilities of pose-conditioned human image\ngeneration. The source code of the proposed approach is available at\nhttps://github.com/apelykh/hand-to-diffusion.\n","authors":["Anton Pelykh","Ozge Mercanoglu Sincan","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2403.10731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08755v2","updated":"2024-03-15T23:04:33Z","published":"2023-10-12T22:45:03Z","title":"PU-Ray: Domain-Independent Point Cloud Upsampling via Ray Marching on\n Neural Implicit Surface","summary":" While recent advancements in deep-learning point cloud upsampling methods\nhave improved the input to intelligent transportation systems, they still\nsuffer from issues of domain dependency between synthetic and real-scanned\npoint clouds. This paper addresses the above issues by proposing a new\nray-based upsampling approach with an arbitrary rate, where a depth prediction\nis made for each query ray and its corresponding patch. Our novel method\nsimulates the sphere-tracing ray marching algorithm on the neural implicit\nsurface defined with an unsigned distance function (UDF) to achieve more\nprecise and stable ray-depth predictions by training a point-transformer-based\nnetwork. The rule-based mid-point query sampling method generates more evenly\ndistributed points without requiring an end-to-end model trained using a\nnearest-neighbor-based reconstruction loss function, which may be biased\ntowards the training dataset. Self-supervised learning becomes possible with\naccurate ground truths within the input point cloud. The results demonstrate\nthe method's versatility across domains and training scenarios with limited\ncomputational resources and training data. Comprehensive analyses of synthetic\nand real-scanned applications provide empirical evidence for the significance\nof the upsampling task across the computer vision and graphics domains to\nreal-world applications of ITS.\n","authors":["Sangwon Lim","Karim El-Basyouny","Yee Hong Yang"],"pdf_url":"https://arxiv.org/pdf/2310.08755v2.pdf","comment":"17 pages (11 main + 6 supplement), 21 figures (8 main + 13\n supplement), 8 tables"},{"id":"http://arxiv.org/abs/2403.10722v1","updated":"2024-03-15T22:49:47Z","published":"2024-03-15T22:49:47Z","title":"Cannabis Seed Variant Detection using Faster R-CNN","summary":" Analyzing and detecting cannabis seed variants is crucial for the agriculture\nindustry. It enables precision breeding, allowing cultivators to selectively\nenhance desirable traits. Accurate identification of seed variants also ensures\nregulatory compliance, facilitating the cultivation of specific cannabis\nstrains with defined characteristics, ultimately improving agricultural\nproductivity and meeting diverse market demands. This paper presents a study on\ncannabis seed variant detection by employing a state-of-the-art object\ndetection model Faster R-CNN. This study implemented the model on a locally\nsourced cannabis seed dataset in Thailand, comprising 17 distinct classes. We\nevaluate six Faster R-CNN models by comparing performance on various metrics\nand achieving a mAP score of 94.08\\% and an F1 score of 95.66\\%. 
This paper\npresents the first known application of deep neural network object detection\nmodels to the novel task of visually identifying cannabis seed types.\n","authors":["Toqi Tahamid Sarker","Taminul Islam","Khaled R Ahmed"],"pdf_url":"https://arxiv.org/pdf/2403.10722v1.pdf","comment":"6 pages, 2 figures, this has been submitted and accepted for\n publication at IEEE - ICACCS 2024"},{"id":"http://arxiv.org/abs/2311.18531v2","updated":"2024-03-15T22:14:40Z","published":"2023-11-30T13:15:28Z","title":"Dataset Distillation via the Wasserstein Metric","summary":" Dataset Distillation (DD) emerges as a powerful strategy to encapsulate the\nexpansive information of large datasets into significantly smaller, synthetic\nequivalents, thereby preserving model performance with reduced computational\noverhead. Pursuing this objective, we introduce the Wasserstein distance, a\nmetric grounded in optimal transport theory, to enhance distribution matching\nin DD. Our approach employs the Wasserstein barycenter to provide a\ngeometrically meaningful method for quantifying distribution differences and\ncapturing the centroid of distribution sets efficiently. By embedding synthetic\ndata in the feature spaces of pretrained classification models, we facilitate\neffective distribution matching that leverages prior knowledge inherent in\nthese models. Our method not only maintains the computational advantages of\ndistribution matching-based techniques but also achieves new state-of-the-art\nperformance across a range of high-resolution datasets. Extensive testing\ndemonstrates the effectiveness and adaptability of our method, underscoring the\nuntapped potential of Wasserstein metrics in dataset distillation.\n","authors":["Haoyang Liu","Yijiang Li","Tiancheng Xing","Vibhu Dalal","Luwei Li","Jingrui He","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.18531v2.pdf","comment":"21 pages, 8 figures"},{"id":"http://arxiv.org/abs/2310.02239v3","updated":"2024-03-15T21:54:08Z","published":"2023-10-03T17:49:04Z","title":"MiniGPT-5: Interleaved Vision-and-Language Generation via Generative\n Vokens","summary":" The effectiveness of Multimodal Large Language Models (MLLMs) demonstrates a\nprofound capability in multimodal understanding. However, the simultaneous\ngeneration of images with coherent texts is still underdeveloped. Addressing\nthis, we introduce a novel interleaved vision-and-language generation method,\ncentered around the concept of ``generative vokens\". These vokens serve as\npivotal elements contributing to coherent image-text outputs. Our method is\nmarked by a unique two-stage training strategy for description-free multimodal\ngeneration, which does not necessitate extensive descriptions of images. We\nintegrate classifier-free guidance to enhance the alignment of generated images\nand texts, ensuring more seamless and contextually relevant multimodal\ninteractions. 
Our model, MiniGPT-5, exhibits substantial improvement over the\nbaseline models on multimodal generation datasets, including MMDialog and VIST.\nThe human evaluation shows MiniGPT-5 is better than the baseline model on more\nthan 56\\% cases for multimodal generation, highlighting its efficacy across\ndiverse benchmarks.\n","authors":["Kaizhi Zheng","Xuehai He","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2310.02239v3.pdf","comment":"23 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.10706v1","updated":"2024-03-15T21:49:13Z","published":"2024-03-15T21:49:13Z","title":"PyHySCO: GPU-Enabled Susceptibility Artifact Distortion Correction in\n Seconds","summary":" Over the past decade, reversed Gradient Polarity (RGP) methods have become a\npopular approach for correcting susceptibility artifacts in Echo-Planar Imaging\n(EPI). Although several post-processing tools for RGP are available, their\nimplementations do not fully leverage recent hardware, algorithmic, and\ncomputational advances, leading to correction times of several minutes per\nimage volume. To enable 3D RGP correction in seconds, we introduce PyHySCO, a\nuser-friendly EPI distortion correction tool implemented in PyTorch that\nenables multi-threading and efficient use of graphics processing units (GPUs).\nPyHySCO uses a time-tested physical distortion model and mathematical\nformulation and is, therefore, reliable without training. An algorithmic\nimprovement in PyHySCO is its novel initialization scheme that uses 1D optimal\ntransport. PyHySCO is published under the GNU public license and can be used\nfrom the command line or its Python interface. Our extensive numerical\nvalidation using 3T and 7T data from the Human Connectome Project suggests that\nPyHySCO achieves accuracy comparable to that of leading RGP tools at a fraction\nof the cost. We also validate the new initialization scheme, compare different\noptimization algorithms, and test the algorithm on different hardware and\narithmetic precision.\n","authors":["Abigail Julian","Lars Ruthotto"],"pdf_url":"https://arxiv.org/pdf/2403.10706v1.pdf","comment":"20 pages, 9 figures"},{"id":"http://arxiv.org/abs/2403.10701v1","updated":"2024-03-15T21:37:04Z","published":"2024-03-15T21:37:04Z","title":"IMPRINT: Generative Object Compositing by Learning Identity-Preserving\n Representation","summary":" Generative object compositing emerges as a promising new avenue for\ncompositional image editing. However, the requirement of object identity\npreservation poses a significant challenge, limiting practical usage of most\nexisting methods. In response, this paper introduces IMPRINT, a novel\ndiffusion-based generative model trained with a two-stage learning framework\nthat decouples learning of identity preservation from that of compositing. The\nfirst stage is targeted for context-agnostic, identity-preserving pretraining\nof the object encoder, enabling the encoder to learn an embedding that is both\nview-invariant and conducive to enhanced detail preservation. The subsequent\nstage leverages this representation to learn seamless harmonization of the\nobject composited to the background. In addition, IMPRINT incorporates a\nshape-guidance mechanism offering user-directed control over the compositing\nprocess. 
Extensive experiments demonstrate that IMPRINT significantly\noutperforms existing methods and various baselines on identity preservation and\ncomposition quality.\n","authors":["Yizhi Song","Zhifei Zhang","Zhe Lin","Scott Cohen","Brian Price","Jianming Zhang","Soo Ye Kim","He Zhang","Wei Xiong","Daniel Aliaga"],"pdf_url":"https://arxiv.org/pdf/2403.10701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10698v1","updated":"2024-03-15T21:30:25Z","published":"2024-03-15T21:30:25Z","title":"Robust Influence-based Training Methods for Noisy Brain MRI","summary":" Correctly classifying brain tumors is imperative to the prompt and accurate\ntreatment of a patient. While several classification algorithms based on\nclassical image processing or deep learning methods have been proposed to\nrapidly classify tumors in MR images, most assume the unrealistic setting of\nnoise-free training data. In this work, we study a difficult but realistic\nsetting of training a deep learning model on noisy MR images to classify brain\ntumors. We propose two training methods that are robust to noisy MRI training\ndata, Influence-based Sample Reweighing (ISR) and Influence-based Sample\nPerturbation (ISP), which are based on influence functions from robust\nstatistics. Using the influence functions, in ISR, we adaptively reweigh\ntraining examples according to how helpful/harmful they are to the training\nprocess, while in ISP, we craft and inject helpful perturbation proportional to\nthe influence score. Both ISR and ISP harden the classification model against\nnoisy training data without significantly affecting the generalization ability\nof the model on test data. We conduct empirical evaluations over a common brain\ntumor dataset and compare ISR and ISP to three baselines. Our empirical results\nshow that ISR and ISP can efficiently train deep learning models robust against\nnoisy training data.\n","authors":["Minh-Hao Van","Alycia N. Carey","Xintao Wu"],"pdf_url":"https://arxiv.org/pdf/2403.10698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10696v1","updated":"2024-03-15T21:29:33Z","published":"2024-03-15T21:29:33Z","title":"On the low-shot transferability of [V]-Mamba","summary":" The strength of modern large-scale neural networks lies in their ability to\nefficiently adapt to new tasks with few examples. Although extensive research\nhas investigated the transferability of Vision Transformers (ViTs) to various\ndownstream tasks under diverse constraints, this study shifts focus to explore\nthe transfer learning potential of [V]-Mamba. We compare its performance with\nViTs across different few-shot data budgets and efficient transfer methods. 
Our\nanalysis yields three key insights into [V]-Mamba's few-shot transfer\nperformance: (a) [V]-Mamba demonstrates superior or equivalent few-shot\nlearning capabilities compared to ViTs when utilizing linear probing (LP) for\ntransfer, (b) Conversely, [V]-Mamba exhibits weaker or similar few-shot\nlearning performance compared to ViTs when employing visual prompting (VP) as\nthe transfer method, and (c) We observe a weak positive correlation between the\nperformance gap in transfer via LP and VP and the scale of the [V]-Mamba model.\nThis preliminary analysis lays the foundation for more comprehensive studies\naimed at furthering our understanding of the capabilities of [V]-Mamba variants\nand their distinctions from ViTs.\n","authors":["Diganta Misra","Jay Gala","Antonio Orvieto"],"pdf_url":"https://arxiv.org/pdf/2403.10696v1.pdf","comment":"Preprint (Work in progress)"},{"id":"http://arxiv.org/abs/2305.14392v2","updated":"2024-03-15T21:28:59Z","published":"2023-05-22T22:59:05Z","title":"FEDORA: Flying Event Dataset fOr Reactive behAvior","summary":" The ability of resource-constrained biological systems such as fruitflies to\nperform complex and high-speed maneuvers in cluttered environments has been one\nof the prime sources of inspiration for developing vision-based autonomous\nsystems. To emulate this capability, the perception pipeline of such systems\nmust integrate information cues from tasks including optical flow and depth\nestimation, object detection and tracking, and segmentation, among others.\nHowever, the conventional approach of employing slow, synchronous inputs from\nstandard frame-based cameras constrains these perception capabilities,\nparticularly during high-speed maneuvers. Recently, event-based sensors have\nemerged as low latency and low energy alternatives to standard frame-based\ncameras for capturing high-speed motion, effectively speeding up perception and\nhence navigation. For coherence, all the perception tasks must be trained on\nthe same input data. However, present-day datasets are curated mainly for a\nsingle or a handful of tasks and are limited in the rate of the provided ground\ntruths. To address these limitations, we present Flying Event Dataset fOr\nReactive behAviour (FEDORA) - a fully synthetic dataset for perception tasks,\nwith raw data from frame-based cameras, event-based cameras, and Inertial\nMeasurement Units (IMU), along with ground truths for depth, pose, and optical\nflow at a rate much higher than existing datasets.\n","authors":["Amogh Joshi","Adarsh Kosta","Wachirawit Ponghiran","Manish Nagaraj","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2305.14392v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10695v1","updated":"2024-03-15T21:28:06Z","published":"2024-03-15T21:28:06Z","title":"EAGLE: An Edge-Aware Gradient Localization Enhanced Loss for CT Image\n Reconstruction","summary":" Computed Tomography (CT) image reconstruction is crucial for accurate\ndiagnosis and deep learning approaches have demonstrated significant potential\nin improving reconstruction quality. However, the choice of loss function\nprofoundly affects the reconstructed images. Traditional mean squared error\nloss often produces blurry images lacking fine details, while alternatives\ndesigned to improve may introduce structural artifacts or other undesirable\neffects. 
To address these limitations, we propose Eagle-Loss, a novel loss\nfunction designed to enhance the visual quality of CT image reconstructions.\nEagle-Loss applies spectral analysis of localized features within gradient\nchanges to enhance sharpness and well-defined edges. We evaluated Eagle-Loss on\ntwo public datasets across low-dose CT reconstruction and CT field-of-view\nextension tasks. Our results show that Eagle-Loss consistently improves the\nvisual quality of reconstructed images, surpassing state-of-the-art methods\nacross various network architectures. Code and data are available at\n\\url{https://github.com/sypsyp97/Eagle_Loss}.\n","authors":["Yipeng Sun","Yixing Huang","Linda-Sophie Schneider","Mareike Thies","Mingxuan Gu","Siyuan Mei","Siming Bayer","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2403.10695v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2403.10689v1","updated":"2024-03-15T21:18:14Z","published":"2024-03-15T21:18:14Z","title":"Latent Object Characteristics Recognition with Visual to Haptic-Audio\n Cross-modal Transfer Learning","summary":" Recognising the characteristics of objects while a robot handles them is\ncrucial for adjusting motions that ensure stable and efficient interactions\nwith containers. Ahead of realising stable and efficient robot motions for\nhandling/transferring the containers, this work aims to recognise the latent\nunobservable object characteristics. While vision is commonly used for object\nrecognition by robots, it is ineffective for detecting hidden objects. However,\nrecognising objects indirectly using other sensors is a challenging task. To\naddress this challenge, we propose a cross-modal transfer learning approach\nfrom vision to haptic-audio. We initially train the model with vision, directly\nobserving the target object. Subsequently, we transfer the latent space learned\nfrom vision to a second module, trained only with haptic-audio and motor data.\nThis transfer learning framework facilitates the representation of object\ncharacteristics using indirect sensor data, thereby improving recognition\naccuracy. For evaluating the recognition accuracy of our proposed learning\nframework we selected shape, position, and orientation as the object\ncharacteristics. Finally, we demonstrate online recognition of both trained and\nuntrained objects using the humanoid robot Nextage Open.\n","authors":["Namiko Saito","Joao Moura","Hiroki Uchida","Sethu Vijayakumar"],"pdf_url":"https://arxiv.org/pdf/2403.10689v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2403.10683v1","updated":"2024-03-15T21:06:14Z","published":"2024-03-15T21:06:14Z","title":"GS-Pose: Cascaded Framework for Generalizable Segmentation-based 6D\n Object Pose Estimation","summary":" This paper introduces GS-Pose, an end-to-end framework for locating and\nestimating the 6D pose of objects. GS-Pose begins with a set of posed RGB\nimages of a previously unseen object and builds three distinct representations\nstored in a database. At inference, GS-Pose operates sequentially by locating\nthe object in the input image, estimating its initial 6D pose using a retrieval\napproach, and refining the pose with a render-and-compare method. The key\ninsight is the application of the appropriate object representation at each\nstage of the process. In particular, for the refinement step, we utilize 3D\nGaussian splatting, a novel differentiable rendering technique that offers high\nrendering speed and relatively low optimization time. 
Off-the-shelf toolchains\nand commodity hardware, such as mobile phones, can be used to capture new\nobjects to be added to the database. Extensive evaluations on the LINEMOD and\nOnePose-LowTexture datasets demonstrate excellent performance, establishing the\nnew state-of-the-art. Project page: https://dingdingcai.github.io/gs-pose.\n","authors":["Dingding Cai","Janne Heikkilä","Esa Rahtu"],"pdf_url":"https://arxiv.org/pdf/2403.10683v1.pdf","comment":"Project Page: https://dingdingcai.github.io/gs-pose"},{"id":"http://arxiv.org/abs/2308.14969v3","updated":"2024-03-15T21:04:31Z","published":"2023-08-29T01:47:49Z","title":"Uncovering the Hidden Cost of Model Compression","summary":" In an age dominated by resource-intensive foundation models, the ability to\nefficiently adapt to downstream tasks is crucial. Visual Prompting (VP),\ndrawing inspiration from the prompting techniques employed in Large Language\nModels (LLMs), has emerged as a pivotal method for transfer learning in the\nrealm of computer vision. As the importance of efficiency continues to rise,\nresearch into model compression has become indispensable in alleviating the\ncomputational burdens associated with training and deploying over-parameterized\nneural networks. A primary objective in model compression is to develop sparse\nand/or quantized models capable of matching or even surpassing the performance\nof their over-parameterized, full-precision counterparts. Although previous\nstudies have explored the effects of model compression on transfer learning,\nits impact on visual prompting-based transfer remains unclear. This study aims\nto bridge this gap, shedding light on the fact that model compression\ndetrimentally impacts the performance of visual prompting-based transfer,\nparticularly evident in scenarios with low data volume. Furthermore, our\nfindings underscore the adverse influence of sparsity on the calibration of\ndownstream visual-prompted models. However, intriguingly, we also illustrate\nthat such negative effects on calibration are not present when models are\ncompressed via quantization. This empirical investigation underscores the need\nfor a nuanced understanding beyond mere accuracy in sparse and quantized\nsettings, thereby paving the way for further exploration in Visual Prompting\ntechniques tailored for sparse and quantized models.\n","authors":["Diganta Misra","Muawiz Chaudhary","Agam Goyal","Bharat Runwal","Pin Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2308.14969v3.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2403.10677v1","updated":"2024-03-15T20:53:10Z","published":"2024-03-15T20:53:10Z","title":"Spiking Neural Networks for Fast-Moving Object Detection on Neuromorphic\n Hardware Devices Using an Event-Based Camera","summary":" Table tennis is a fast-paced and exhilarating sport that demands agility,\nprecision, and fast reflexes. In recent years, robotic table tennis has become\na popular research challenge for robot perception algorithms. Fast and accurate\nball detection is crucial for enabling a robotic arm to rally the ball back\nsuccessfully. Previous approaches have employed conventional frame-based\ncameras with Convolutional Neural Networks (CNNs) or traditional computer\nvision methods. In this paper, we propose a novel solution that combines an\nevent-based camera with Spiking Neural Networks (SNNs) for ball detection. We\nuse multiple state-of-the-art SNN frameworks and develop a SNN architecture for\neach of them, complying with their corresponding constraints. 
Additionally, we\nimplement the SNN solution across multiple neuromorphic edge devices,\nconducting comparisons of their accuracies and run-times. This furnishes\nrobotics researchers with a benchmark illustrating the capabilities achievable\nwith each SNN framework and a corresponding neuromorphic edge device. Next to\nthis comparison of SNN solutions for robots, we also show that an SNN on a\nneuromorphic edge device is able to run in real-time in a closed loop robotic\nsystem, a table tennis robot in our use case.\n","authors":["Andreas Ziegler","Karl Vetter","Thomas Gossard","Jonas Tebbe","Andreas Zell"],"pdf_url":"https://arxiv.org/pdf/2403.10677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10674v1","updated":"2024-03-15T20:49:43Z","published":"2024-03-15T20:49:43Z","title":"D-Net: Dynamic Large Kernel with Dynamic Feature Fusion for Volumetric\n Medical Image Segmentation","summary":" Hierarchical transformers have achieved significant success in medical image\nsegmentation due to their large receptive field and capabilities of effectively\nleveraging global long-range contextual information. Convolutional neural\nnetworks (CNNs) can also deliver a large receptive field by using large\nkernels, enabling them to achieve competitive performance with fewer model\nparameters. However, CNNs incorporated with large convolutional kernels remain\nconstrained in adaptively capturing multi-scale features from organs with large\nvariations in shape and size due to the employment of fixed-sized kernels.\nAdditionally, they are unable to utilize global contextual information\nefficiently. To address these limitations, we propose Dynamic Large Kernel\n(DLK) and Dynamic Feature Fusion (DFF) modules. The DLK module employs multiple\nlarge kernels with varying kernel sizes and dilation rates to capture\nmulti-scale features. Subsequently, a dynamic selection mechanism is utilized\nto adaptively highlight the most important spatial features based on global\ninformation. Additionally, the DFF module is proposed to adaptively fuse\nmulti-scale local feature maps based on their global information. We integrate\nDLK and DFF in a hierarchical transformer architecture to develop a novel\narchitecture, termed D-Net. D-Net is able to effectively utilize a multi-scale\nlarge receptive field and adaptively harness global contextual information.\nExtensive experimental results demonstrate that D-Net outperforms other\nstate-of-the-art models in the two volumetric segmentation tasks, including\nabdominal multi-organ segmentation and multi-modality brain tumor segmentation.\nOur code is available at https://github.com/sotiraslab/DLK.\n","authors":["Jin Yang","Peijie Qiu","Yichi Zhang","Daniel S. Marcus","Aristeidis Sotiras"],"pdf_url":"https://arxiv.org/pdf/2403.10674v1.pdf","comment":"12 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2306.17010v6","updated":"2024-03-15T20:26:20Z","published":"2023-06-29T15:06:21Z","title":"milliFlow: Scene Flow Estimation on mmWave Radar Point Cloud for Human\n Motion Sensing","summary":" Human motion sensing plays a crucial role in smart systems for\ndecision-making, user interaction, and personalized services. Extensive\nresearch that has been conducted is predominantly based on cameras, whose\nintrusive nature limits their use in smart home applications. To address this,\nmmWave radars have gained popularity due to their privacy-friendly features. 
In\nthis work, we propose milliFlow, a novel deep learning approach to estimate\nscene flow as complementary motion information for mmWave point cloud, serving\nas an intermediate level of features and directly benefiting downstream human\nmotion sensing tasks. Experimental results demonstrate the superior performance\nof our method when compared with the competing approaches. Furthermore, by\nincorporating scene flow information, we achieve remarkable improvements in\nhuman activity recognition and human parsing and support human body part\ntracking. To foster further research in this area, we will provide our codebase\nand dataset for open access.\n","authors":["Fangqiang Ding","Zhen Luo","Peijun Zhao","Chris Xiaoxuan Lu"],"pdf_url":"https://arxiv.org/pdf/2306.17010v6.pdf","comment":"27 pages, 8 figures, 8 tables"},{"id":"http://arxiv.org/abs/2403.10663v1","updated":"2024-03-15T20:12:41Z","published":"2024-03-15T20:12:41Z","title":"Not Just Change the Labels, Learn the Features: Watermarking Deep Neural\n Networks with Multi-View Data","summary":" With the increasing prevalence of Machine Learning as a Service (MLaaS)\nplatforms, there is a growing focus on deep neural network (DNN) watermarking\ntechniques. These methods are used to facilitate the verification of ownership\nfor a target DNN model to protect intellectual property. One of the most widely\nemployed watermarking techniques involves embedding a trigger set into the\nsource model. Unfortunately, existing methodologies based on trigger sets are\nstill susceptible to functionality-stealing attacks, potentially enabling\nadversaries to steal the functionality of the source model without a reliable\nmeans of verifying ownership. In this paper, we first introduce a novel\nperspective on trigger set-based watermarking methods from a feature learning\nperspective. Specifically, we demonstrate that by selecting data exhibiting\nmultiple features, also referred to as $\\textit{multi-view data}$, it becomes\nfeasible to effectively defend functionality stealing attacks. Based on this\nperspective, we introduce a novel watermarking technique based on Multi-view\ndATa, called MAT, for efficiently embedding watermarks within DNNs. This\napproach involves constructing a trigger set with multi-view data and\nincorporating a simple feature-based regularization method for training the\nsource model. We validate our method across various benchmarks and demonstrate\nits efficacy in defending against model extraction attacks, surpassing relevant\nbaselines by a significant margin.\n","authors":["Yuxuan Li","Sarthak Kumar Maharana","Yunhui Guo"],"pdf_url":"https://arxiv.org/pdf/2403.10663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10662v1","updated":"2024-03-15T20:04:27Z","published":"2024-03-15T20:04:27Z","title":"SwinMTL: A Shared Architecture for Simultaneous Depth Estimation and\n Semantic Segmentation from Monocular Camera Images","summary":" This research paper presents an innovative multi-task learning framework that\nallows concurrent depth estimation and semantic segmentation using a single\ncamera. The proposed approach is based on a shared encoder-decoder\narchitecture, which integrates various techniques to improve the accuracy of\nthe depth estimation and semantic segmentation task without compromising\ncomputational efficiency. Additionally, the paper incorporates an adversarial\ntraining component, employing a Wasserstein GAN framework with a critic\nnetwork, to refine model's predictions. 
The framework is thoroughly evaluated\non two datasets - the outdoor Cityscapes dataset and the indoor NYU Depth V2\ndataset - and it outperforms existing state-of-the-art methods in both\nsegmentation and depth estimation tasks. We also conducted ablation studies to\nanalyze the contributions of different components, including pre-training\nstrategies, the inclusion of critics, the use of logarithmic depth scaling, and\nadvanced image augmentations, to provide a better understanding of the proposed\nframework. The accompanying source code is accessible at\n\\url{https://github.com/PardisTaghavi/SwinMTL}.\n","authors":["Pardis Taghavi","Reza Langari","Gaurav Pandey"],"pdf_url":"https://arxiv.org/pdf/2403.10662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10658v1","updated":"2024-03-15T19:54:10Z","published":"2024-03-15T19:54:10Z","title":"InterLUDE: Interactions between Labeled and Unlabeled Data to Enhance\n Semi-Supervised Learning","summary":" Semi-supervised learning (SSL) seeks to enhance task performance by training\non both labeled and unlabeled data. Mainstream SSL image classification methods\nmostly optimize a loss that additively combines a supervised classification\nobjective with a regularization term derived solely from unlabeled data. This\nformulation neglects the potential for interaction between labeled and\nunlabeled images. In this paper, we introduce InterLUDE, a new approach to\nenhance SSL made of two parts that each benefit from labeled-unlabeled\ninteraction. The first part, embedding fusion, interpolates between labeled and\nunlabeled embeddings to improve representation learning. The second part is a\nnew loss, grounded in the principle of consistency regularization, that aims to\nminimize discrepancies in the model's predictions between labeled versus\nunlabeled inputs. Experiments on standard closed-set SSL benchmarks and a\nmedical SSL task with an uncurated unlabeled set show clear benefits to our\napproach. On the STL-10 dataset with only 40 labels, InterLUDE achieves 3.2%\nerror rate, while the best previous method reports 14.9%.\n","authors":["Zhe Huang","Xiaowei Yu","Dajiang Zhu","Michael C. Hughes"],"pdf_url":"https://arxiv.org/pdf/2403.10658v1.pdf","comment":"Semi-supervised Learning; Vision Transformers"},{"id":"http://arxiv.org/abs/2403.10650v1","updated":"2024-03-15T19:35:10Z","published":"2024-03-15T19:35:10Z","title":"PALM: Pushing Adaptive Learning Rate Mechanisms for Continual Test-Time\n Adaptation","summary":" Real-world vision models in dynamic environments face rapid shifts in domain\ndistributions, leading to decreased recognition performance. Continual\ntest-time adaptation (CTTA) directly adjusts a pre-trained source\ndiscriminative model to these changing domains using test data. A highly\neffective CTTA method involves applying layer-wise adaptive learning rates, and\nselectively adapting pre-trained layers. However, it suffers from the poor\nestimation of domain shift and the inaccuracies arising from the pseudo-labels.\nIn this work, we aim to overcome these limitations by identifying layers\nthrough the quantification of model prediction uncertainty without relying on\npseudo-labels. We utilize the magnitude of gradients as a metric, calculated by\nbackpropagating the KL divergence between the softmax output and a uniform\ndistribution, to select layers for further adaptation. 
Subsequently, for the\nparameters exclusively belonging to these selected layers, with the remaining\nones frozen, we evaluate their sensitivity in order to approximate the domain\nshift, followed by adjusting their learning rates accordingly. Overall, this\napproach leads to a more robust and stable optimization than prior approaches.\nWe conduct extensive image classification experiments on CIFAR-10C, CIFAR-100C,\nand ImageNet-C and demonstrate the efficacy of our method against standard\nbenchmarks and prior methods.\n","authors":["Sarthak Kumar Maharana","Baoming Zhang","Yunhui Guo"],"pdf_url":"https://arxiv.org/pdf/2403.10650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.11499v2","updated":"2024-03-15T19:19:28Z","published":"2023-09-20T17:58:05Z","title":"DreamLLM: Synergistic Multimodal Comprehension and Creation","summary":" This paper presents DreamLLM, a learning framework that first achieves\nversatile Multimodal Large Language Models (MLLMs) empowered with frequently\noverlooked synergy between multimodal comprehension and creation. DreamLLM\noperates on two fundamental principles. The first focuses on the generative\nmodeling of both language and image posteriors by direct sampling in the raw\nmultimodal space. This approach circumvents the limitations and information\nloss inherent to external feature extractors like CLIP, and a more thorough\nmultimodal understanding is obtained. Second, DreamLLM fosters the generation\nof raw, interleaved documents, modeling both text and image contents, along\nwith unstructured layouts. This allows DreamLLM to learn all conditional,\nmarginal, and joint multimodal distributions effectively. As a result, DreamLLM\nis the first MLLM capable of generating free-form interleaved content.\nComprehensive experiments highlight DreamLLM's superior performance as a\nzero-shot multimodal generalist, reaping from the enhanced learning synergy.\nProject page: https://dreamllm.github.io.\n","authors":["Runpei Dong","Chunrui Han","Yuang Peng","Zekun Qi","Zheng Ge","Jinrong Yang","Liang Zhao","Jianjian Sun","Hongyu Zhou","Haoran Wei","Xiangwen Kong","Xiangyu Zhang","Kaisheng Ma","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2309.11499v2.pdf","comment":"ICLR 2024 (Spotlight)"},{"id":"http://arxiv.org/abs/2403.10635v1","updated":"2024-03-15T19:03:29Z","published":"2024-03-15T19:03:29Z","title":"MeDSLIP: Medical Dual-Stream Language-Image Pre-training for\n Fine-grained Alignment","summary":" Vision-language pre-training (VLP) models have shown significant advancements\nin the medical domain. Yet, most VLP models align raw reports to images at a\nvery coarse level, without modeling fine-grained relationships between\nanatomical and pathological concepts outlined in reports and the corresponding\nsemantic counterparts in images. To address this problem, we propose a Medical\nDual-Stream Language-Image Pre-training (MeDSLIP) framework. Specifically,\nMeDSLIP establishes vision-language fine-grained alignments via disentangling\nvisual and textual representations into anatomy-relevant and pathology-relevant\nstreams. Moreover, a novel vision-language Prototypical Contr-astive Learning\n(ProtoCL) method is adopted in MeDSLIP to enhance the alignment within the\nanatomical and pathological streams. MeDSLIP further employs cross-stream\nIntra-image Contrastive Learning (ICL) to ensure the consistent coexistence of\npaired anatomical and pathological concepts within the same image. 
Such a\ncross-stream regularization encourages the model to exploit the synchrony\nbetween two streams for a more comprehensive representation learning. MeDSLIP\nis evaluated under zero-shot and supervised fine-tuning settings on three\npublic datasets: NIH CXR14, RSNA Pneumonia, and SIIM-ACR Pneumothorax. Under\nthese settings, MeDSLIP outperforms six leading CNN-based models on\nclassification, grounding, and segmentation tasks.\n","authors":["Wenrui Fan","Mohammod Naimul Islam Suvon","Shuo Zhou","Xianyuan Liu","Samer Alabed","Venet Osmani","Andrew Swift","Chen Chen","Haiping Lu"],"pdf_url":"https://arxiv.org/pdf/2403.10635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11759v2","updated":"2024-03-15T18:44:51Z","published":"2023-10-18T07:31:47Z","title":"Perceptual Scales Predicted by Fisher Information Metrics","summary":" Perception is often viewed as a process that transforms physical variables,\nexternal to an observer, into internal psychological variables. Such a process\ncan be modeled by a function coined perceptual scale. The perceptual scale can\nbe deduced from psychophysical measurements that consist in comparing the\nrelative differences between stimuli (i.e. difference scaling experiments).\nHowever, this approach is often overlooked by the modeling and experimentation\ncommunities. Here, we demonstrate the value of measuring the perceptual scale\nof classical (spatial frequency, orientation) and less classical physical\nvariables (interpolation between textures) by embedding it in recent\nprobabilistic modeling of perception. First, we show that the assumption that\nan observer has an internal representation of univariate parameters such as\nspatial frequency or orientation while stimuli are high-dimensional does not\nlead to contradictory predictions when following the theoretical framework.\nSecond, we show that the measured perceptual scale corresponds to the\ntransduction function hypothesized in this framework. In particular, we\ndemonstrate that it is related to the Fisher information of the generative\nmodel that underlies perception and we test the predictions given by the\ngenerative model of different stimuli in a set a of difference scaling\nexperiments. Our main conclusion is that the perceptual scale is mostly driven\nby the stimulus power spectrum. Finally, we propose that this measure of\nperceptual scale is a way to push further the notion of perceptual distances by\nestimating the perceptual geometry of images i.e. the path between images\ninstead of simply the distance between those.\n","authors":["Jonathan Vacher","Pascal Mamassian"],"pdf_url":"https://arxiv.org/pdf/2310.11759v2.pdf","comment":"15 pages, 6 figures, 7 appendix"},{"id":"http://arxiv.org/abs/2401.04791v2","updated":"2024-03-15T18:40:19Z","published":"2024-01-09T19:34:47Z","title":"SOS-Match: Segmentation for Open-Set Robust Correspondence Search and\n Robot Localization in Unstructured Environments","summary":" We present SOS-Match, a novel framework for detecting and matching objects in\nunstructured environments. Our system consists of 1) a front-end mapping\npipeline using a zero-shot segmentation model to extract object masks from\nimages and track them across frames and 2) a frame alignment pipeline that uses\nthe geometric consistency of object relationships to efficiently localize\nacross a variety of conditions. 
We evaluate SOS-Match on the Batvik seasonal\ndataset which includes drone flights collected over a coastal plot of southern\nFinland during different seasons and lighting conditions. Results show that our\napproach is more robust to changes in lighting and appearance than classical\nimage feature-based approaches or global descriptor methods, and it provides\nmore viewpoint invariance than learning-based feature detection and description\napproaches. SOS-Match localizes within a reference map up to 46x faster than\nother feature-based approaches and has a map size less than 0.5% the size of\nthe most compact other maps. SOS-Match is a promising new approach for landmark\ndetection and correspondence search in unstructured environments that is robust\nto changes in lighting and appearance and is more computationally efficient\nthan other approaches, suggesting that the geometric arrangement of segments is\na valuable localization cue in unstructured environments. We release our\ndatasets at https://acl.mit.edu/SOS-Match/.\n","authors":["Annika Thomas","Jouko Kinnari","Parker Lusk","Kota Kondo","Jonathan P. How"],"pdf_url":"https://arxiv.org/pdf/2401.04791v2.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2310.04566v2","updated":"2024-03-15T18:37:34Z","published":"2023-10-06T20:13:07Z","title":"Knolling Bot: Learning Robotic Object Arrangement from Tidy\n Demonstrations","summary":" Addressing the challenge of organizing scattered items in domestic spaces is\ncomplicated by the diversity and subjective nature of tidiness. Just as the\ncomplexity of human language allows for multiple expressions of the same idea,\nhousehold tidiness preferences and organizational patterns vary widely, so\npresetting object locations would limit the adaptability to new objects and\nenvironments. Inspired by advancements in natural language processing (NLP),\nthis paper introduces a self-supervised learning framework that allows robots\nto understand and replicate the concept of tidiness from demonstrations of\nwell-organized layouts, akin to using conversational datasets to train Large\nLanguage Models(LLM). We leverage a transformer neural network to predict the\nplacement of subsequent objects. We demonstrate a ``knolling'' system with a\nrobotic arm and an RGB camera to organize items of varying sizes and quantities\non a table. Our method not only trains a generalizable concept of tidiness,\nenabling the model to provide diverse solutions and adapt to different numbers\nof objects, but it can also incorporate human preferences to generate\ncustomized tidy tables without explicit target positions for each object.\n","authors":["Yuhang Hu","Zhizhuo Zhang","Xinyue Zhu","Ruibo Liu","Philippe Wyder","Hod Lipson"],"pdf_url":"https://arxiv.org/pdf/2310.04566v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2403.10624v1","updated":"2024-03-15T18:37:15Z","published":"2024-03-15T18:37:15Z","title":"Leveraging CLIP for Inferring Sensitive Information and Improving Model\n Fairness","summary":" Performance disparities across sub-populations are known to exist in deep\nlearning-based vision recognition models, but previous work has largely\naddressed such fairness concerns assuming knowledge of sensitive attribute\nlabels. 
To overcome this reliance, previous strategies have involved separate\nlearning structures to expose and adjust for disparities. In this work, we\nexplore a new paradigm that does not require sensitive attribute labels, and\nevades the need for extra training by leveraging the vision-language model,\nCLIP, as a rich knowledge source to infer sensitive information. We present\nsample clustering based on similarity derived from image and\nattribute-specified language embeddings and assess their correspondence to true\nattribute distribution. We train a target model by re-sampling and augmenting\nunder-performed clusters. Extensive experiments on multiple benchmark bias\ndatasets show clear fairness gains of the model over existing baselines, which\nindicate that CLIP can extract discriminative sensitive information prompted by\nlanguage, and used to promote model fairness.\n","authors":["Miao Zhang","Rumi Chunara"],"pdf_url":"https://arxiv.org/pdf/2403.10624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10622v1","updated":"2024-03-15T18:35:14Z","published":"2024-03-15T18:35:14Z","title":"NeuralOCT: Airway OCT Analysis via Neural Fields","summary":" Optical coherence tomography (OCT) is a popular modality in ophthalmology and\nis also used intravascularly. Our interest in this work is OCT in the context\nof airway abnormalities in infants and children where the high resolution of\nOCT and the fact that it is radiation-free is important. The goal of airway OCT\nis to provide accurate estimates of airway geometry (in 2D and 3D) to assess\nairway abnormalities such as subglottic stenosis. We propose\n$\\texttt{NeuralOCT}$, a learning-based approach to process airway OCT images.\nSpecifically, $\\texttt{NeuralOCT}$ extracts 3D geometries from OCT scans by\nrobustly bridging two steps: point cloud extraction via 2D segmentation and 3D\nreconstruction from point clouds via neural fields. Our experiments show that\n$\\texttt{NeuralOCT}$ produces accurate and robust 3D airway reconstructions\nwith an average A-line error smaller than 70 micrometers. Our code will be\navailable on GitHub.\n","authors":["Yining Jiao","Amy Oldenburg","Yinghan Xu","Srikamal Soundararajan","Carlton Zdanski","Julia Kimbell","Marc Niethammer"],"pdf_url":"https://arxiv.org/pdf/2403.10622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10615v1","updated":"2024-03-15T18:26:33Z","published":"2024-03-15T18:26:33Z","title":"LightIt: Illumination Modeling and Control for Diffusion Models","summary":" We introduce LightIt, a method for explicit illumination control for image\ngeneration. Recent generative methods lack lighting control, which is crucial\nto numerous artistic aspects of image generation such as setting the overall\nmood or cinematic appearance. To overcome these limitations, we propose to\ncondition the generation on shading and normal maps. We model the lighting with\nsingle bounce shading, which includes cast shadows. We first train a shading\nestimation module to generate a dataset of real-world images and shading pairs.\nThen, we train a control network using the estimated shading and normals as\ninput. Our method demonstrates high-quality image generation and lighting\ncontrol in numerous scenes. Additionally, we use our generated dataset to train\nan identity-preserving relighting model, conditioned on an image and a target\nshading. 
Our method is the first that enables the generation of images with\ncontrollable, consistent lighting and performs on par with specialized\nrelighting state-of-the-art methods.\n","authors":["Peter Kocsis","Julien Philip","Kalyan Sunkavalli","Matthias Nießner","Yannick Hold-Geoffroy"],"pdf_url":"https://arxiv.org/pdf/2403.10615v1.pdf","comment":"Project page: https://peter-kocsis.github.io/LightIt/ Video:\n https://youtu.be/cCfSBD5aPLI"},{"id":"http://arxiv.org/abs/2403.07487v2","updated":"2024-03-15T18:24:45Z","published":"2024-03-12T10:25:29Z","title":"Motion Mamba: Efficient and Long Sequence Motion Generation with\n Hierarchical and Bidirectional Selective SSM","summary":" Human motion generation stands as a significant pursuit in generative\ncomputer vision, while achieving long-sequence and efficient motion generation\nremains challenging. Recent advancements in state space models (SSMs), notably\nMamba, have showcased considerable promise in long sequence modeling with an\nefficient hardware-aware design, which appears to be a promising direction to\nbuild motion generation model upon it. Nevertheless, adapting SSMs to motion\ngeneration faces hurdles since the lack of a specialized design architecture to\nmodel motion sequence. To address these challenges, we propose Motion Mamba, a\nsimple and efficient approach that presents the pioneering motion generation\nmodel utilized SSMs. Specifically, we design a Hierarchical Temporal Mamba\n(HTM) block to process temporal data by ensemble varying numbers of isolated\nSSM modules across a symmetric U-Net architecture aimed at preserving motion\nconsistency between frames. We also design a Bidirectional Spatial Mamba (BSM)\nblock to bidirectionally process latent poses, to enhance accurate motion\ngeneration within a temporal frame. Our proposed method achieves up to 50% FID\nimprovement and up to 4 times faster on the HumanML3D and KIT-ML datasets\ncompared to the previous best diffusion-based method, which demonstrates strong\ncapabilities of high-quality long sequence motion modeling and real-time human\nmotion generation. See project website\nhttps://steve-zeyu-zhang.github.io/MotionMamba/\n","authors":["Zeyu Zhang","Akide Liu","Ian Reid","Richard Hartley","Bohan Zhuang","Hao Tang"],"pdf_url":"https://arxiv.org/pdf/2403.07487v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11927v2","updated":"2024-03-15T18:23:16Z","published":"2023-05-19T14:43:00Z","title":"Evaluating how interactive visualizations can assist in finding samples\n where and how computer vision models make mistakes","summary":" Creating Computer Vision (CV) models remains a complex practice, despite\ntheir ubiquity. Access to data, the requirement for ML expertise, and model\nopacity are just a few points of complexity that limit the ability of end-users\nto build, inspect, and improve these models. Interactive ML perspectives have\nhelped address some of these issues by considering a teacher in the loop where\nplanning, teaching, and evaluating tasks take place. We present and evaluate\ntwo interactive visualizations in the context of Sprite, a system for creating\nCV classification and detection models for images originating from videos. We\nstudy how these visualizations help Sprite's users identify (evaluate) and\nselect (plan) images where a model is struggling and can lead to improved\nperformance, compared to a baseline condition where users used a query\nlanguage. 
We found that users who had used the visualizations found more images\nacross a wider set of potential types of model errors.\n","authors":["Hayeong Song","Gonzalo Ramos","Peter Bodik"],"pdf_url":"https://arxiv.org/pdf/2305.11927v2.pdf","comment":"Hayeong Song, Gonzalo Ramos, and Peter Bodik. \"Evaluating how\n interactive visualizations can assist in finding samples where and how\n computer vision models make mistakes\" 2024 IEEE Pacific Visualization\n Symposium (PacificVis). Ieee, 2024"},{"id":"http://arxiv.org/abs/2310.02557v2","updated":"2024-03-15T18:21:48Z","published":"2023-10-04T03:30:32Z","title":"Generalization in diffusion models arises from geometry-adaptive\n harmonic representations","summary":" Deep neural networks (DNNs) trained for image denoising are able to generate\nhigh-quality samples with score-based reverse diffusion algorithms. These\nimpressive capabilities seem to imply an escape from the curse of\ndimensionality, but recent reports of memorization of the training set raise\nthe question of whether these networks are learning the \"true\" continuous\ndensity of the data. Here, we show that two DNNs trained on non-overlapping\nsubsets of a dataset learn nearly the same score function, and thus the same\ndensity, when the number of training images is large enough. In this regime of\nstrong generalization, diffusion-generated images are distinct from the\ntraining set, and are of high visual quality, suggesting that the inductive\nbiases of the DNNs are well-aligned with the data density. We analyze the\nlearned denoising functions and show that the inductive biases give rise to a\nshrinkage operation in a basis adapted to the underlying image. Examination of\nthese bases reveals oscillating harmonic structures along contours and in\nhomogeneous regions. We demonstrate that trained denoisers are inductively\nbiased towards these geometry-adaptive harmonic bases since they arise not only\nwhen the network is trained on photographic images, but also when it is trained\non image classes supported on low-dimensional manifolds for which the harmonic\nbasis is suboptimal. Finally, we show that when trained on regular image\nclasses for which the optimal basis is known to be geometry-adaptive and\nharmonic, the denoising performance of the networks is near-optimal.\n","authors":["Zahra Kadkhodaie","Florentin Guth","Eero P. Simoncelli","Stéphane Mallat"],"pdf_url":"https://arxiv.org/pdf/2310.02557v2.pdf","comment":"Accepted for oral presentation at ICLR, Vienna, May 2024"},{"id":"http://arxiv.org/abs/2402.13195v2","updated":"2024-03-15T18:15:18Z","published":"2024-02-20T18:06:00Z","title":"Design and Flight Demonstration of a Quadrotor for Urban Mapping and\n Target Tracking Research","summary":" This paper describes the hardware design and flight demonstration of a small\nquadrotor with imaging sensors for urban mapping, hazard avoidance, and target\ntracking research. The vehicle is equipped with five cameras, including two\npairs of fisheye stereo cameras that enable a nearly omnidirectional view and a\ntwo-axis gimbaled camera. An onboard NVIDIA Jetson Orin Nano computer running\nthe Robot Operating System software is used for data collection. An autonomous\ntracking behavior was implemented to coordinate the motion of the quadrotor and\ngimbaled camera to track a moving GPS coordinate. The data collection system\nwas demonstrated through a flight test that tracked a moving GPS-tagged vehicle\nthrough a series of roads and parking lots. 
A map of the environment was\nreconstructed from the collected images using the Direct Sparse Odometry (DSO)\nalgorithm. The performance of the quadrotor was also characterized by acoustic\nnoise, communication range, battery voltage in hover, and maximum speed tests.\n","authors":["Collin Hague","Nick Kakavitsas","Jincheng Zhang","Chris Beam","Andrew Willis","Artur Wolek"],"pdf_url":"https://arxiv.org/pdf/2402.13195v2.pdf","comment":"7 pages, 10 figures, To be presented at IEEE SoutheastCon 2024"},{"id":"http://arxiv.org/abs/2403.10603v1","updated":"2024-03-15T18:00:11Z","published":"2024-03-15T18:00:11Z","title":"SurvRNC: Learning Ordered Representations for Survival Prediction using\n Rank-N-Contrast","summary":" Predicting the likelihood of survival is of paramount importance for\nindividuals diagnosed with cancer as it provides invaluable information\nregarding prognosis at an early stage. This knowledge enables the formulation\nof effective treatment plans that lead to improved patient outcomes. In the\npast few years, deep learning models have provided a feasible solution for\nassessing medical images, electronic health records, and genomic data to\nestimate cancer risk scores. However, these models often fall short of their\npotential because they struggle to learn regression-aware feature\nrepresentations. In this study, we propose Survival Rank-N Contrast (SurvRNC)\nmethod, which introduces a loss function as a regularizer to obtain an ordered\nrepresentation based on the survival times. This function can handle censored\ndata and can be incorporated into any survival model to ensure that the learned\nrepresentation is ordinal. The model was extensively evaluated on a HEad \\&\nNeCK TumOR (HECKTOR) segmentation and the outcome-prediction task dataset. We\ndemonstrate that using the SurvRNC method for training can achieve higher\nperformance on different deep survival models. Additionally, it outperforms\nstate-of-the-art methods by 3.6% on the concordance index. The code is publicly\navailable on https://github.com/numanai/SurvRNC\n","authors":["Numan Saeed","Muhammad Ridzuan","Fadillah Adamsyah Maani","Hussain Alasmawi","Karthik Nandakumar","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.10603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10521v1","updated":"2024-03-15T17:59:53Z","published":"2024-03-15T17:59:53Z","title":"P-MapNet: Far-seeing Map Generator Enhanced by both SDMap and HDMap\n Priors","summary":" Autonomous vehicles are gradually entering city roads today, with the help of\nhigh-definition maps (HDMaps). However, the reliance on HDMaps prevents\nautonomous vehicles from stepping into regions without this expensive digital\ninfrastructure. This fact drives many researchers to study online HDMap\ngeneration algorithms, but the performance of these algorithms at far regions\nis still unsatisfying. We present P-MapNet, in which the letter P highlights\nthe fact that we focus on incorporating map priors to improve model\nperformance. Specifically, we exploit priors in both SDMap and HDMap. On one\nhand, we extract weakly aligned SDMap from OpenStreetMap, and encode it as an\nadditional conditioning branch. Despite the misalignment challenge, our\nattention-based architecture adaptively attends to relevant SDMap skeletons and\nsignificantly improves performance. On the other hand, we exploit a masked\nautoencoder to capture the prior distribution of HDMap, which can serve as a\nrefinement module to mitigate occlusions and artifacts. 
We benchmark on the\nnuScenes and Argoverse2 datasets. Through comprehensive experiments, we show\nthat: (1) our SDMap prior can improve online map generation performance, using\nboth rasterized (by up to $+18.73$ $\\rm mIoU$) and vectorized (by up to $+8.50$\n$\\rm mAP$) output representations. (2) our HDMap prior can improve map\nperceptual metrics by up to $6.34\\%$. (3) P-MapNet can be switched into\ndifferent inference modes that covers different regions of the\naccuracy-efficiency trade-off landscape. (4) P-MapNet is a far-seeing solution\nthat brings larger improvements on longer ranges. Codes and models are publicly\navailable at https://jike5.github.io/P-MapNet.\n","authors":["Zhou Jiang","Zhenxin Zhu","Pengfei Li","Huan-ang Gao","Tianyuan Yuan","Yongliang Shi","Hang Zhao","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.10521v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10520v1","updated":"2024-03-15T17:59:44Z","published":"2024-03-15T17:59:44Z","title":"Strong and Controllable Blind Image Decomposition","summary":" Blind image decomposition aims to decompose all components present in an\nimage, typically used to restore a multi-degraded input image. While fully\nrecovering the clean image is appealing, in some scenarios, users might want to\nretain certain degradations, such as watermarks, for copyright protection. To\naddress this need, we add controllability to the blind image decomposition\nprocess, allowing users to enter which types of degradation to remove or\nretain. We design an architecture named controllable blind image decomposition\nnetwork. Inserted in the middle of U-Net structure, our method first decomposes\nthe input feature maps and then recombines them according to user instructions.\nAdvantageously, this functionality is implemented at minimal computational\ncost: decomposition and recombination are all parameter-free. Experimentally,\nour system excels in blind image decomposition tasks and can outputs partially\nor fully restored images that well reflect user intentions. Furthermore, we\nevaluate and configure different options for the network structure and loss\nfunctions. This, combined with the proposed decomposition-and-recombination\nmethod, yields an efficient and competitive system for blind image\ndecomposition, compared with current state-of-the-art methods.\n","authors":["Zeyu Zhang","Junlin Han","Chenhui Gou","Hongdong Li","Liang Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.10520v1.pdf","comment":"Code: https://github.com/Zhangzeyu97/CBD.git"},{"id":"http://arxiv.org/abs/2403.10519v1","updated":"2024-03-15T17:59:40Z","published":"2024-03-15T17:59:40Z","title":"Frozen Feature Augmentation for Few-Shot Image Classification","summary":" Training a linear classifier or lightweight model on top of pretrained vision\nmodel outputs, so-called 'frozen features', leads to impressive performance on\na number of downstream few-shot tasks. Currently, frozen features are not\nmodified during training. On the other hand, when networks are trained directly\non images, data augmentation is a standard recipe that improves performance\nwith no substantial overhead. In this paper, we conduct an extensive pilot\nstudy on few-shot image classification that explores applying data\naugmentations in the frozen feature space, dubbed 'frozen feature augmentation\n(FroFA)', covering twenty augmentations in total. 
Our study demonstrates that\nadopting a deceptively simple pointwise FroFA, such as brightness, can improve\nfew-shot performance consistently across three network architectures, three\nlarge pretraining datasets, and eight transfer datasets.\n","authors":["Andreas Bär","Neil Houlsby","Mostafa Dehghani","Manoj Kumar"],"pdf_url":"https://arxiv.org/pdf/2403.10519v1.pdf","comment":"CVPR 2024 (18 pages, main paper + supplementary material)"},{"id":"http://arxiv.org/abs/2403.10518v1","updated":"2024-03-15T17:59:33Z","published":"2024-03-15T17:59:33Z","title":"Lodge: A Coarse to Fine Diffusion Network for Long Dance Generation\n Guided by the Characteristic Dance Primitives","summary":" We propose Lodge, a network capable of generating extremely long dance\nsequences conditioned on given music. We design Lodge as a two-stage coarse to\nfine diffusion architecture, and propose the characteristic dance primitives\nthat possess significant expressiveness as intermediate representations between\ntwo diffusion models. The first stage is global diffusion, which focuses on\ncomprehending the coarse-level music-dance correlation and production\ncharacteristic dance primitives. In contrast, the second-stage is the local\ndiffusion, which parallelly generates detailed motion sequences under the\nguidance of the dance primitives and choreographic rules. In addition, we\npropose a Foot Refine Block to optimize the contact between the feet and the\nground, enhancing the physical realism of the motion. Our approach can\nparallelly generate dance sequences of extremely long length, striking a\nbalance between global choreographic patterns and local motion quality and\nexpressiveness. Extensive experiments validate the efficacy of our method.\n","authors":["Ronghui Li","YuXiang Zhang","Yachao Zhang","Hongwen Zhang","Jie Guo","Yan Zhang","Yebin Liu","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2403.10518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10517v1","updated":"2024-03-15T17:57:52Z","published":"2024-03-15T17:57:52Z","title":"VideoAgent: Long-form Video Understanding with Large Language Model as\n Agent","summary":" Long-form video understanding represents a significant challenge within\ncomputer vision, demanding a model capable of reasoning over long multi-modal\nsequences. Motivated by the human cognitive process for long-form video\nunderstanding, we emphasize interactive reasoning and planning over the ability\nto process lengthy visual inputs. We introduce a novel agent-based system,\nVideoAgent, that employs a large language model as a central agent to\niteratively identify and compile crucial information to answer a question, with\nvision-language foundation models serving as tools to translate and retrieve\nvisual information. Evaluated on the challenging EgoSchema and NExT-QA\nbenchmarks, VideoAgent achieves 54.1% and 71.3% zero-shot accuracy with only\n8.4 and 8.2 frames used on average. 
These results demonstrate superior\neffectiveness and efficiency of our method over the current state-of-the-art\nmethods, highlighting the potential of agent-based approaches in advancing\nlong-form video understanding.\n","authors":["Xiaohan Wang","Yuhui Zhang","Orr Zohar","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2403.10517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10516v1","updated":"2024-03-15T17:57:06Z","published":"2024-03-15T17:57:06Z","title":"FeatUp: A Model-Agnostic Framework for Features at Any Resolution","summary":" Deep features are a cornerstone of computer vision research, capturing image\nsemantics and enabling the community to solve downstream tasks even in the\nzero- or few-shot regime. However, these features often lack the spatial\nresolution to directly perform dense prediction tasks like segmentation and\ndepth prediction because models aggressively pool information over large areas.\nIn this work, we introduce FeatUp, a task- and model-agnostic framework to\nrestore lost spatial information in deep features. We introduce two variants of\nFeatUp: one that guides features with high-resolution signal in a single\nforward pass, and one that fits an implicit model to a single image to\nreconstruct features at any resolution. Both approaches use a multi-view\nconsistency loss with deep analogies to NeRFs. Our features retain their\noriginal semantics and can be swapped into existing applications to yield\nresolution and performance gains even without re-training. We show that FeatUp\nsignificantly outperforms other feature upsampling and image super-resolution\napproaches in class activation map generation, transfer learning for\nsegmentation and depth prediction, and end-to-end training for semantic\nsegmentation.\n","authors":["Stephanie Fu","Mark Hamilton","Laura Brandt","Axel Feldman","Zhoutong Zhang","William T. Freeman"],"pdf_url":"https://arxiv.org/pdf/2403.10516v1.pdf","comment":"Accepted to the International Conference on Learning Representations\n (ICLR) 2024"},{"id":"http://arxiv.org/abs/2403.10511v1","updated":"2024-03-15T17:50:45Z","published":"2024-03-15T17:50:45Z","title":"A Novel Framework for Multi-Person Temporal Gaze Following and Social\n Gaze Prediction","summary":" Gaze following and social gaze prediction are fundamental tasks providing\ninsights into human communication behaviors, intent, and social interactions.\nMost previous approaches addressed these tasks separately, either by designing\nhighly specialized social gaze models that do not generalize to other social\ngaze tasks or by considering social gaze inference as an ad-hoc post-processing\nof the gaze following task. Furthermore, the vast majority of gaze following\napproaches have proposed static models that can handle only one person at a\ntime, therefore failing to take advantage of social interactions and temporal\ndynamics. In this paper, we address these limitations and introduce a novel\nframework to jointly predict the gaze target and social gaze label for all\npeople in the scene. The framework comprises of: (i) a temporal,\ntransformer-based architecture that, in addition to image tokens, handles\nperson-specific tokens capturing the gaze information related to each\nindividual; (ii) a new dataset, VSGaze, that unifies annotation types across\nmultiple gaze following and social gaze datasets. 
We show that our model\ntrained on VSGaze can address all tasks jointly, and achieves state-of-the-art\nresults for multi-person gaze following and social gaze prediction.\n","authors":["Anshul Gupta","Samy Tafasca","Arya Farkhondeh","Pierre Vuillecard","Jean-Marc Odobez"],"pdf_url":"https://arxiv.org/pdf/2403.10511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06199v3","updated":"2024-03-15T17:48:05Z","published":"2024-03-10T12:43:27Z","title":"Mipha: A Comprehensive Overhaul of Multimodal Assistant with Small\n Language Models","summary":" Multimodal Large Language Models (MLLMs) have showcased impressive skills in\ntasks related to visual understanding and reasoning. Yet, their widespread\napplication faces obstacles due to the high computational demands during both\nthe training and inference phases, restricting their use to a limited audience\nwithin the research and user communities. In this paper, we investigate the\ndesign aspects of Multimodal Small Language Models (MSLMs) and propose an\nefficient multimodal assistant named Mipha, which is designed to create synergy\namong various aspects: visual representation, language models, and optimization\nstrategies. We show that without increasing the volume of training data, our\nMipha-3B outperforms the state-of-the-art large MLLMs, especially\nLLaVA-1.5-13B, on multiple benchmarks. Through detailed discussion, we provide\ninsights and guidelines for developing strong MSLMs that rival the capabilities\nof MLLMs. Our code is available at https://github.com/zhuyiche/Mipha.\n","authors":["Minjie Zhu","Yichen Zhu","Xin Liu","Ning Liu","Zhiyuan Xu","Chaomin Shen","Yaxin Peng","Zhicai Ou","Feifei Feng","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2403.06199v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10499v1","updated":"2024-03-15T17:33:49Z","published":"2024-03-15T17:33:49Z","title":"Benchmarking Zero-Shot Robustness of Multimodal Foundation Models: A\n Pilot Study","summary":" Pre-training image representations from the raw text about images enables\nzero-shot vision transfer to downstream tasks. Through pre-training on millions\nof samples collected from the internet, multimodal foundation models, such as\nCLIP, produce state-of-the-art zero-shot results that often reach\ncompetitiveness with fully supervised methods without the need for\ntask-specific training. Besides the encouraging performance on classification\naccuracy, it is reported that these models close the robustness gap by matching\nthe performance of supervised models trained on ImageNet under natural\ndistribution shift. Because robustness is critical to real-world applications,\nespecially safety-critical ones, in this paper, we present a comprehensive\nevaluation based on a large-scale robustness benchmark covering 7 natural, 3\nsynthetic distribution shifts, and 11 adversarial attacks. We use CLIP as a\npilot study. We show that CLIP leads to a significant robustness drop compared\nto supervised ImageNet models on our benchmark, especially under synthetic\ndistribution shift and adversarial attacks. Furthermore, data overlap analysis\nsuggests that the observed robustness under natural distribution shifts could\nbe attributed, at least in part, to data overlap. 
In summary, our evaluation\nshows a comprehensive evaluation of robustness is necessary; and there is a\nsignificant need to improve the robustness of zero-shot multimodal models.\n","authors":["Chenguang Wang","Ruoxi Jia","Xin Liu","Dawn Song"],"pdf_url":"https://arxiv.org/pdf/2403.10499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10589v1","updated":"2024-03-15T17:29:16Z","published":"2024-03-15T17:29:16Z","title":"A General Method to Incorporate Spatial Information into Loss Functions\n for GAN-based Super-resolution Models","summary":" Generative Adversarial Networks (GANs) have shown great performance on\nsuper-resolution problems since they can generate more visually realistic\nimages and video frames. However, these models often introduce side effects\ninto the outputs, such as unexpected artifacts and noises. To reduce these\nartifacts and enhance the perceptual quality of the results, in this paper, we\npropose a general method that can be effectively used in most GAN-based\nsuper-resolution (SR) models by introducing essential spatial information into\nthe training process. We extract spatial information from the input data and\nincorporate it into the training loss, making the corresponding loss a\nspatially adaptive (SA) one. After that, we utilize it to guide the training\nprocess. We will show that the proposed approach is independent of the methods\nused to extract the spatial information and independent of the SR tasks and\nmodels. This method consistently guides the training process towards generating\nvisually pleasing SR images and video frames, substantially mitigating\nartifacts and noise, ultimately leading to enhanced perceptual quality.\n","authors":["Xijun Wang","Santiago López-Tapia","Alice Lucas","Xinyi Wu","Rafael Molina","Aggelos K. Katsaggelos"],"pdf_url":"https://arxiv.org/pdf/2403.10589v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10492v1","updated":"2024-03-15T17:27:12Z","published":"2024-03-15T17:27:12Z","title":"Mitigating Dialogue Hallucination for Large Multi-modal Models via\n Adversarial Instruction Tuning","summary":" Mitigating hallucinations of Large Multi-modal Models(LMMs) is crucial to\nenhance their reliability for general-purpose assistants. This paper shows that\nsuch hallucinations of LMMs can be significantly exacerbated by preceding\nuser-system dialogues. To precisely measure this, we first present an\nevaluation benchmark by extending popular multi-modal benchmark datasets with\nprepended hallucinatory dialogues generated by our novel Adversarial Question\nGenerator, which can automatically generate image-related yet adversarial\ndialogues by adopting adversarial attacks on LMMs. On our benchmark, the\nzero-shot performance of state-of-the-art LMMs dropped significantly for both\nthe VQA and Captioning tasks. Next, we further reveal this hallucination is\nmainly due to the prediction bias toward preceding dialogues rather than visual\ncontent. To reduce this bias, we propose Adversarial Instruction Tuning that\nrobustly fine-tunes LMMs on augmented multi-modal instruction-following\ndatasets with hallucinatory dialogues. 
Extensive experiments show that our\nproposed approach successfully reduces dialogue hallucination while maintaining\nor even improving performance.\n","authors":["Dongmin Park","Zhaofang Qian","Guangxing Han","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2403.10492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06553v2","updated":"2024-03-15T17:24:30Z","published":"2023-12-11T17:41:17Z","title":"HOI-Diff: Text-Driven Synthesis of 3D Human-Object Interactions using\n Diffusion Models","summary":" We address the problem of generating realistic 3D human-object interactions\n(HOIs) driven by textual prompts. To this end, we take a modular design and\ndecompose the complex task into simpler sub-tasks. We first develop a\ndual-branch diffusion model (HOI-DM) to generate both human and object motions\nconditioned on the input text, and encourage coherent motions by a\ncross-attention communication module between the human and object motion\ngeneration branches. We also develop an affordance prediction diffusion model\n(APDM) to predict the contacting area between the human and object during the\ninteractions driven by the textual prompt. The APDM is independent of the\nresults by the HOI-DM and thus can correct potential errors by the latter.\nMoreover, it stochastically generates the contacting points to diversify the\ngenerated motions. Finally, we incorporate the estimated contacting points into\nthe classifier-guidance to achieve accurate and close contact between humans\nand objects. To train and evaluate our approach, we annotate BEHAVE dataset\nwith text descriptions. Experimental results on BEHAVE and OMOMO demonstrate\nthat our approach produces realistic HOIs with various interactions and\ndifferent types of objects.\n","authors":["Xiaogang Peng","Yiming Xie","Zizhao Wu","Varun Jampani","Deqing Sun","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2312.06553v2.pdf","comment":"Project Page: https://neu-vi.github.io/HOI-Diff/"},{"id":"http://arxiv.org/abs/2403.10488v1","updated":"2024-03-15T17:23:38Z","published":"2024-03-15T17:23:38Z","title":"Joint Multimodal Transformer for Dimensional Emotional Recognition in\n the Wild","summary":" Audiovisual emotion recognition (ER) in videos has immense potential over\nunimodal performance. It effectively leverages the inter- and intra-modal\ndependencies between visual and auditory modalities. This work proposes a novel\naudio-visual emotion recognition system utilizing a joint multimodal\ntransformer architecture with key-based cross-attention. This framework aims to\nexploit the complementary nature of audio and visual cues (facial expressions\nand vocal patterns) in videos, leading to superior performance compared to\nsolely relying on a single modality. The proposed model leverages separate\nbackbones for capturing intra-modal temporal dependencies within each modality\n(audio and visual). Subsequently, a joint multimodal transformer architecture\nintegrates the individual modality embeddings, enabling the model to\neffectively capture inter-modal (between audio and visual) and intra-modal\n(within each modality) relationships. 
Extensive evaluations on the challenging\nAffwild2 dataset demonstrate that the proposed model significantly outperforms\nbaseline and state-of-the-art methods in ER tasks.\n","authors":["Paul Waligora","Osama Zeeshan","Haseeb Aslam","Soufiane Belharbi","Alessandro Lameiras Koerich","Marco Pedersoli","Simon Bacon","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2403.10488v1.pdf","comment":"5 pages, 1 figure"},{"id":"http://arxiv.org/abs/2403.08586v2","updated":"2024-03-15T17:23:21Z","published":"2024-03-13T14:42:55Z","title":"PRAGO: Differentiable Multi-View Pose Optimization From Objectness\n Detections","summary":" Robustly estimating camera poses from a set of images is a fundamental task\nwhich remains challenging for differentiable methods, especially in the case of\nsmall and sparse camera pose graphs. To overcome this challenge, we propose\nPose-refined Rotation Averaging Graph Optimization (PRAGO). From a set of\nobjectness detections on unordered images, our method reconstructs the\nrotational pose, and in turn, the absolute pose, in a differentiable manner\nbenefiting from the optimization of a sequence of geometrical tasks. We show\nhow our objectness pose-refinement module in PRAGO is able to refine the\ninherent ambiguities in pairwise relative pose estimation without removing\nedges and avoiding making early decisions on the viability of graph edges.\nPRAGO then refines the absolute rotations through iterative graph construction,\nreweighting the graph edges to compute the final rotational pose, which can be\nconverted into absolute poses using translation averaging. We show that PRAGO\nis able to outperform non-differentiable solvers on small and sparse scenes\nextracted from 7-Scenes achieving a relative improvement of 21% for rotations\nwhile achieving similar translation estimates.\n","authors":["Matteo Taiana","Matteo Toso","Stuart James","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2403.08586v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07652v3","updated":"2024-03-15T17:20:14Z","published":"2023-08-15T09:00:21Z","title":"Geometry of the Visual Cortex with Applications to Image Inpainting and\n Enhancement","summary":" Equipping the rototranslation group $SE(2)$ with a sub-Riemannian structure\ninspired by the visual cortex V1, we propose algorithms for image inpainting\nand enhancement based on hypoelliptic diffusion. We innovate on previous\nimplementations of the methods by Citti, Sarti, and Boscain et al., by\nproposing an alternative that prevents fading and is capable of producing\nsharper results in a procedure that we call WaxOn-WaxOff. We also exploit the\nsub-Riemannian structure to define a completely new unsharp filter using\n$SE(2)$, analogous to the classical unsharp filter for 2D image processing. We\ndemonstrate our method on blood vessels enhancement in retinal scans.\n","authors":["Francesco Ballerin","Erlend Grong"],"pdf_url":"https://arxiv.org/pdf/2308.07652v3.pdf","comment":"Associated python package available at\n https://github.com/ballerin/v1diffusion"},{"id":"http://arxiv.org/abs/2403.08019v2","updated":"2024-03-15T17:07:55Z","published":"2024-03-12T18:36:59Z","title":"MRC-Net: 6-DoF Pose Estimation with MultiScale Residual Correlation","summary":" We propose a single-shot approach to determining 6-DoF pose of an object with\navailable 3D computer-aided design (CAD) model from a single RGB image. Our\nmethod, dubbed MRC-Net, comprises two stages. 
The first performs pose\nclassification and renders the 3D object in the classified pose. The second\nstage performs regression to predict fine-grained residual pose within class.\nConnecting the two stages is a novel multi-scale residual correlation (MRC)\nlayer that captures high-and-low level correspondences between the input image\nand rendering from first stage. MRC-Net employs a Siamese network with shared\nweights between both stages to learn embeddings for input and rendered images.\nTo mitigate ambiguity when predicting discrete pose class labels on symmetric\nobjects, we use soft probabilistic labels to define pose class in the first\nstage. We demonstrate state-of-the-art accuracy, outperforming all competing\nRGB-based methods on four challenging BOP benchmark datasets: T-LESS, LM-O,\nYCB-V, and ITODD. Our method is non-iterative and requires no complex\npost-processing.\n","authors":["Yuelong Li","Yafei Mao","Raja Bala","Sunil Hadap"],"pdf_url":"https://arxiv.org/pdf/2403.08019v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.10476v1","updated":"2024-03-15T17:07:39Z","published":"2024-03-15T17:07:39Z","title":"Approximate Nullspace Augmented Finetuning for Robust Vision\n Transformers","summary":" Enhancing the robustness of deep learning models, particularly in the realm\nof vision transformers (ViTs), is crucial for their real-world deployment. In\nthis work, we provide a finetuning approach to enhance the robustness of vision\ntransformers inspired by the concept of nullspace from linear algebra. Our\ninvestigation centers on whether a vision transformer can exhibit resilience to\ninput variations akin to the nullspace property in linear mappings, implying\nthat perturbations sampled from this nullspace do not influence the model's\noutput when added to the input. Firstly, we show that for many pretrained ViTs,\na non-trivial nullspace exists due to the presence of the patch embedding\nlayer. Secondly, as nullspace is a concept associated with linear algebra, we\ndemonstrate that it is possible to synthesize approximate nullspace elements\nfor the non-linear blocks of ViTs employing an optimisation strategy. Finally,\nwe propose a fine-tuning strategy for ViTs wherein we augment the training data\nwith synthesized approximate nullspace noise. After finetuning, we find that\nthe model demonstrates robustness to adversarial and natural image perturbations\nalike.\n","authors":["Haoyang Liu","Aditya Singh","Yijiang Li","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2403.10476v1.pdf","comment":"21 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.15551v2","updated":"2024-03-15T17:01:24Z","published":"2023-09-27T10:20:45Z","title":"DeepRepViz: Identifying Confounders in Deep Learning Model Predictions","summary":" Deep Learning (DL) models have gained popularity in neuroimaging studies for\npredicting psychological behaviors, cognitive traits, and brain pathologies.\nHowever, these models can be biased by confounders such as age, sex, or imaging\nartifacts from the acquisition process. To address this, we introduce\n'DeepRepViz', a two-part framework designed to identify confounders in DL model\npredictions. The first component is a visualization tool that can be used to\nqualitatively examine the final latent representation of the DL model. The\nsecond component is a metric called 'Con-score' that quantifies the confounder\nrisk associated with a variable, using the final latent representation of the\nDL model. 
We demonstrate the effectiveness of the Con-score using a simple\nsimulated setup by iteratively altering the strength of a simulated confounder\nand observing the corresponding change in the Con-score. Next, we validate the\nDeepRepViz framework on a large-scale neuroimaging dataset (n=12000) by\nperforming three MRI-phenotype prediction tasks that include (a) predicting\nchronic alcohol users, (b) classifying participant sex, and (c) predicting\nperformance speed on a cognitive task called 'trail making'. DeepRepViz\nidentifies sex as a significant confounder in the DL model predicting chronic\nalcohol users (Con-score=0.35) and age as a confounder in the model predicting\ncognitive task performance (Con-score=0.3). In conclusion, the DeepRepViz\nframework provides a systematic approach to test for potential confounders such\nas age, sex, and imaging artifacts and improves the transparency of DL models\nfor neuroimaging studies.\n","authors":["Roshan Prakash Rane","JiHoon Kim","Arjun Umesha","Didem Stark","Marc-André Schulz","Kerstin Ritter"],"pdf_url":"https://arxiv.org/pdf/2309.15551v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10459v1","updated":"2024-03-15T16:51:24Z","published":"2024-03-15T16:51:24Z","title":"Understanding the Double Descent Phenomenon in Deep Learning","summary":" Combining empirical risk minimization with capacity control is a classical\nstrategy in machine learning when trying to control the generalization gap and\navoid overfitting, as the model class capacity gets larger. Yet, in modern deep\nlearning practice, very large over-parameterized models (e.g. neural networks)\nare optimized to fit perfectly the training data and still obtain great\ngeneralization performance. Past the interpolation point, increasing model\ncomplexity seems to actually lower the test error.\n In this tutorial, we explain the concept of double descent and its\nmechanisms. The first section sets the classical statistical learning framework\nand introduces the double descent phenomenon. By looking at a number of\nexamples, section 2 introduces inductive biases that appear to have a key role\nin double descent by selecting, among the multiple interpolating solutions, a\nsmooth empirical risk minimizer. Finally, section 3 explores the double descent\nwith two linear models, and gives other points of view from recent related\nworks.\n","authors":["Marc Lafon","Alexandre Thomas"],"pdf_url":"https://arxiv.org/pdf/2403.10459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05920v3","updated":"2024-03-15T16:47:19Z","published":"2023-10-09T17:59:26Z","title":"SimPLR: A Simple and Plain Transformer for Scaling-Efficient Object\n Detection and Segmentation","summary":" The ability to detect objects in images at varying scales has played a\npivotal role in the design of modern object detectors. Despite considerable\nprogress in removing hand-crafted components and simplifying the architecture\nwith transformers, multi-scale feature maps and/or pyramid design remain a key\nfactor for their empirical success. In this paper, we show that this reliance\non either feature pyramids or an hierarchical backbone is unnecessary and a\ntransformer-based detector with scale-aware attention enables the plain\ndetector `SimPLR' whose backbone and detection head are both non-hierarchical\nand operate on single-scale features. We find through our experiments that\nSimPLR with scale-aware attention is plain and simple, yet competitive with\nmulti-scale vision transformer alternatives. 
Compared to the multi-scale and\nsingle-scale state-of-the-art, our model scales much better with bigger\ncapacity (self-supervised) models and more pre-training data, allowing us to\nreport a consistently better accuracy and faster runtime for object detection,\ninstance segmentation as well as panoptic segmentation. Code will be released.\n","authors":["Duy-Kien Nguyen","Martin R. Oswald","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2310.05920v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10585v1","updated":"2024-03-15T16:38:47Z","published":"2024-03-15T16:38:47Z","title":"Solving General Noisy Inverse Problem via Posterior Sampling: A Policy\n Gradient Viewpoint","summary":" Solving image inverse problems (e.g., super-resolution and inpainting)\nrequires generating a high fidelity image that matches the given input (the\nlow-resolution image or the masked image). By using the input image as\nguidance, we can leverage a pretrained diffusion generative model to solve a\nwide range of image inverse tasks without task specific model fine-tuning. To\nprecisely estimate the guidance score function of the input image, we propose\nDiffusion Policy Gradient (DPG), a tractable computation method by viewing the\nintermediate noisy images as policies and the target image as the states\nselected by the policy. Experiments show that our method is robust to both\nGaussian and Poisson noise degradation on multiple linear and non-linear\ninverse tasks, resulting into a higher image restoration quality on FFHQ,\nImageNet and LSUN datasets.\n","authors":["Haoyue Tang","Tian Xie","Aosong Feng","Hanyu Wang","Chenyang Zhang","Yang Bai"],"pdf_url":"https://arxiv.org/pdf/2403.10585v1.pdf","comment":"Accepted and to Appear, AISTATS 2024"},{"id":"http://arxiv.org/abs/2403.10452v1","updated":"2024-03-15T16:37:43Z","published":"2024-03-15T16:37:43Z","title":"Robust Shape Fitting for 3D Scene Abstraction","summary":" Humans perceive and construct the world as an arrangement of simple\nparametric models. In particular, we can often describe man-made environments\nusing volumetric primitives such as cuboids or cylinders. Inferring these\nprimitives is important for attaining high-level, abstract scene descriptions.\nPrevious approaches for primitive-based abstraction estimate shape parameters\ndirectly and are only able to reproduce simple objects. In contrast, we propose\na robust estimator for primitive fitting, which meaningfully abstracts complex\nreal-world environments using cuboids. A RANSAC estimator guided by a neural\nnetwork fits these primitives to a depth map. We condition the network on\npreviously detected parts of the scene, parsing it one-by-one. To obtain\ncuboids from single RGB images, we additionally optimise a depth estimation CNN\nend-to-end. Naively minimising point-to-primitive distances leads to large or\nspurious cuboids occluding parts of the scene. We thus propose an improved\nocclusion-aware distance metric correctly handling opaque scenes. Furthermore,\nwe present a neural network based cuboid solver which provides more\nparsimonious scene abstractions while also reducing inference time. The\nproposed algorithm does not require labour-intensive labels, such as cuboid\nannotations, for training. 
Results on the NYU Depth v2 dataset demonstrate that\nthe proposed algorithm successfully abstracts cluttered real-world 3D scene\nlayouts.\n","authors":["Florian Kluger","Eric Brachmann","Michael Ying Yang","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2403.10452v1.pdf","comment":"Accepted for publication in Transactions on Pattern Analysis and\n Machine Intelligence (PAMI). arXiv admin note: substantial text overlap with\n arXiv:2105.02047"},{"id":"http://arxiv.org/abs/2306.03204v2","updated":"2024-03-15T16:15:51Z","published":"2023-06-05T19:26:21Z","title":"ChatGPT as a mapping assistant: A novel method to enrich maps with\n generative AI and content derived from street-level photographs","summary":" This paper explores the concept of leveraging generative AI as a mapping\nassistant for enhancing the efficiency of collaborative mapping. We present\nresults of an experiment that combines multiple sources of volunteered\ngeographic information (VGI) and large language models (LLMs). Three analysts\ndescribed the content of crowdsourced Mapillary street-level photographs taken\nalong roads in a small test area in Miami, Florida. GPT-3.5-turbo was\ninstructed to suggest the most appropriate tagging for each road in\nOpenStreetMap (OSM). The study also explores the utilization of BLIP-2, a\nstate-of-the-art multimodal pre-training method as an artificial analyst of\nstreet-level photographs in addition to human analysts. Results demonstrate two\nways to effectively increase the accuracy of mapping suggestions without\nmodifying the underlying AI models: by (1) providing a more detailed\ndescription of source photographs, and (2) combining prompt engineering with\nadditional context (e.g. location and objects detected along a road). The first\napproach increases the suggestion accuracy by up to 29%, and the second one by\nup to 20%.\n","authors":["Levente Juhász","Peter Mooney","Hartwig H. Hochmair","Boyuan Guan"],"pdf_url":"https://arxiv.org/pdf/2306.03204v2.pdf","comment":"Submitted to The Fourth Spatial Data Science Symposium"},{"id":"http://arxiv.org/abs/2311.15206v2","updated":"2024-03-15T16:15:00Z","published":"2023-11-26T06:17:29Z","title":"Insect-Foundation: A Foundation Model and Large-scale 1M Dataset for\n Visual Insect Understanding","summary":" In precision agriculture, the detection and recognition of insects play an\nessential role in the ability of crops to grow healthy and produce a\nhigh-quality yield. The current machine vision model requires a large volume of\ndata to achieve high performance. However, there are approximately 5.5 million\ndifferent insect species in the world. None of the existing insect datasets can\ncover even a fraction of them due to varying geographic locations and\nacquisition costs. In this paper, we introduce a novel \"Insect-1M\" dataset, a\ngame-changing resource poised to revolutionize insect-related foundation model\ntraining. Covering a vast spectrum of insect species, our dataset, including 1\nmillion images with dense identification labels of taxonomy hierarchy and\ninsect descriptions, offers a panoramic view of entomology, enabling foundation\nmodels to comprehend visual and semantic information about insects like never\nbefore. Then, to efficiently establish an Insect Foundation Model, we develop a\nmicro-feature self-supervised learning method with a Patch-wise Relevant\nAttention mechanism capable of discerning the subtle differences among insect\nimages. 
In addition, we introduce Description Consistency loss to improve\nmicro-feature modeling via insect descriptions. Through our experiments, we\nillustrate the effectiveness of our proposed approach in insect modeling and\nachieve State-of-the-Art performance on standard benchmarks of insect-related\ntasks. Our Insect Foundation Model and Dataset promise to empower the next\ngeneration of insect-related vision models, bringing them closer to the\nultimate goal of precision agriculture.\n","authors":["Hoang-Quan Nguyen","Thanh-Dat Truong","Xuan Bac Nguyen","Ashley Dowling","Xin Li","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2311.15206v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10434v1","updated":"2024-03-15T16:14:34Z","published":"2024-03-15T16:14:34Z","title":"Using an LLM to Turn Sign Spottings into Spoken Language Sentences","summary":" Sign Language Translation (SLT) is a challenging task that aims to generate\nspoken language sentences from sign language videos. In this paper, we\nintroduce a hybrid SLT approach, Spotter+GPT, that utilizes a sign spotter and\na pretrained large language model to improve SLT performance. Our method builds\nupon the strengths of both components. The videos are first processed by the\nspotter, which is trained on a linguistic sign language dataset, to identify\nindividual signs. These spotted signs are then passed to the powerful language\nmodel, which transforms them into coherent and contextually appropriate spoken\nlanguage sentences.\n","authors":["Ozge Mercanoglu Sincan","Necati Cihan Camgoz","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2403.10434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10427v1","updated":"2024-03-15T16:00:04Z","published":"2024-03-15T16:00:04Z","title":"SWAG: Splatting in the Wild images with Appearance-conditioned Gaussians","summary":" Implicit neural representation methods have shown impressive advancements in\nlearning 3D scenes from unstructured in-the-wild photo collections but are\nstill limited by the large computational cost of volumetric rendering. More\nrecently, 3D Gaussian Splatting emerged as a much faster alternative with\nsuperior rendering quality and training efficiency, especially for small-scale\nand object-centric scenarios. Nevertheless, this technique suffers from poor\nperformance on unstructured in-the-wild data. To tackle this, we extend over 3D\nGaussian Splatting to handle unstructured image collections. We achieve this by\nmodeling appearance to seize photometric variations in the rendered images.\nAdditionally, we introduce a new mechanism to train transient Gaussians to\nhandle the presence of scene occluders in an unsupervised manner. Experiments\non diverse photo collection scenes and multi-pass acquisition of outdoor\nlandmarks show the effectiveness of our method over prior works achieving\nstate-of-the-art results with improved efficiency.\n","authors":["Hiba Dahmani","Moussab Bennehar","Nathan Piasco","Luis Roldao","Dzmitry Tsishkou"],"pdf_url":"https://arxiv.org/pdf/2403.10427v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10425v1","updated":"2024-03-15T15:58:51Z","published":"2024-03-15T15:58:51Z","title":"NeuFlow: Real-time, High-accuracy Optical Flow Estimation on Robots\n Using Edge Devices","summary":" Real-time high-accuracy optical flow estimation is a crucial component in\nvarious applications, including localization and mapping in robotics, object\ntracking, and activity recognition in computer vision. 
While recent\nlearning-based optical flow methods have achieved high accuracy, they often\ncome with heavy computation costs. In this paper, we propose a highly efficient\noptical flow architecture, called NeuFlow, that addresses both high accuracy\nand computational cost concerns. The architecture follows a global-to-local\nscheme. Given the features of the input images extracted at different spatial\nresolutions, global matching is employed to estimate an initial optical flow on\nthe 1/16 resolution, capturing large displacement, which is then refined on the\n1/8 resolution with lightweight CNN layers for better accuracy. We evaluate our\napproach on Jetson Orin Nano and RTX 2080 to demonstrate efficiency\nimprovements across different computing platforms. We achieve a notable 10x-80x\nspeedup compared to several state-of-the-art methods, while maintaining\ncomparable accuracy. Our approach achieves around 30 FPS on edge computing\nplatforms, which represents a significant breakthrough in deploying complex\ncomputer vision tasks such as SLAM on small robots like drones. The full\ntraining and evaluation code is available at\nhttps://github.com/neufieldrobotics/NeuFlow.\n","authors":["Zhiyong Zhang","Huaizu Jiang","Hanumant Singh"],"pdf_url":"https://arxiv.org/pdf/2403.10425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10413v1","updated":"2024-03-15T15:47:54Z","published":"2024-03-15T15:47:54Z","title":"Real-Time Image Segmentation via Hybrid Convolutional-Transformer\n Architecture Search","summary":" Image segmentation is one of the most fundamental problems in computer vision\nand has drawn a lot of attention due to its vast applications in image\nunderstanding and autonomous driving. However, designing effective and\nefficient segmentation neural architectures is a labor-intensive process that\nmay require lots of trials by human experts. In this paper, we address the\nchallenge of integrating multi-head self-attention into high resolution\nrepresentation CNNs efficiently, by leveraging architecture search. Manually\nreplacing convolution layers with multi-head self-attention is non-trivial due\nto the costly overhead in memory to maintain high resolution. By contrast, we\ndevelop a multi-target multi-branch supernet method, which not only fully\nutilizes the advantages of high-resolution features, but also finds the proper\nlocation for placing the multi-head self-attention module. Our search algorithm is\noptimized towards multiple objectives (e.g., latency and mIoU) and is capable of\nfinding architectures on the Pareto frontier with an arbitrary number of branches in a\nsingle search. We further present a series of models via the Hybrid\nConvolutional-Transformer Architecture Search (HyCTAS) method that searches for\nthe best hybrid combination of light-weight convolution layers and\nmemory-efficient self-attention layers between branches from different\nresolutions and fuses them to high resolution for both efficiency and effectiveness.\nExtensive experiments demonstrate that HyCTAS outperforms previous methods on\nthe semantic segmentation task. 
Code and models are available at\n\\url{https://github.com/MarvinYu1995/HyCTAS}.\n","authors":["Hongyuan Yu","Cheng Wan","Mengchen Liu","Dongdong Chen","Bin Xiao","Xiyang Dai"],"pdf_url":"https://arxiv.org/pdf/2403.10413v1.pdf","comment":"8 pages, 3 figures, submitted to IROS 2024"},{"id":"http://arxiv.org/abs/2403.02769v2","updated":"2024-03-15T15:46:54Z","published":"2024-03-05T08:37:05Z","title":"HUNTER: Unsupervised Human-centric 3D Detection via Transferring\n Knowledge from Synthetic Instances to Real Scenes","summary":" Human-centric 3D scene understanding has recently drawn increasing attention,\ndriven by its critical impact on robotics. However, human-centric real-life\nscenarios are extremely diverse and complicated, and humans have intricate\nmotions and interactions. With limited labeled data, supervised methods are\ndifficult to generalize to general scenarios, hindering real-life applications.\nMimicking human intelligence, we propose an unsupervised 3D detection method\nfor human-centric scenarios by transferring the knowledge from synthetic human\ninstances to real scenes. To bridge the gap between the distinct data\nrepresentations and feature distributions of synthetic models and real point\nclouds, we introduce novel modules for effective instance-to-scene\nrepresentation transfer and synthetic-to-real feature alignment. Remarkably,\nour method exhibits superior performance compared to current state-of-the-art\ntechniques, achieving 87.8% improvement in mAP and closely approaching the\nperformance of fully supervised methods (62.15 mAP vs. 69.02 mAP) on HuCenLife\nDataset.\n","authors":["Yichen Yao","Zimo Jiang","Yujing Sun","Zhencai Zhu","Xinge Zhu","Runnan Chen","Yuexin Ma"],"pdf_url":"https://arxiv.org/pdf/2403.02769v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.07706v2","updated":"2024-03-15T15:46:31Z","published":"2024-03-12T14:51:23Z","title":"Fast and Simple Explainability for Point Cloud Networks","summary":" We propose a fast and simple explainable AI (XAI) method for point cloud\ndata. It computes pointwise importance with respect to a trained network\ndownstream task. This allows better understanding of the network properties,\nwhich is imperative for safety-critical applications. In addition to debugging\nand visualization, our low computational complexity facilitates online feedback\nto the network at inference. This can be used to reduce uncertainty and to\nincrease robustness. In this work, we introduce \\emph{Feature Based\nInterpretability} (FBI), where we compute the features' norm, per point, before\nthe bottleneck. We analyze the use of gradients and post- and pre-bottleneck\nstrategies, showing pre-bottleneck is preferred, in terms of smoothness and\nranking. We obtain at least three orders of magnitude speedup, compared to\ncurrent XAI methods, thus, scalable for big point clouds or large-scale\narchitectures. Our approach achieves SOTA results, in terms of classification\nexplainability. 
We demonstrate how the proposed measure is helpful in analyzing\nand characterizing various aspects of 3D learning, such as rotation invariance,\nrobustness to out-of-distribution (OOD) outliers or domain shift and dataset\nbias.\n","authors":["Meir Yossef Levi","Guy Gilboa"],"pdf_url":"https://arxiv.org/pdf/2403.07706v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10404v1","updated":"2024-03-15T15:37:19Z","published":"2024-03-15T15:37:19Z","title":"A comparative study on machine learning approaches for rock mass\n classification using drilling data","summary":" Current rock engineering design in drill and blast tunnelling primarily\nrelies on engineers' observational assessments. Measure While Drilling (MWD)\ndata, a high-resolution sensor dataset collected during tunnel excavation, is\nunderutilised, mainly serving for geological visualisation. This study aims to\nautomate the translation of MWD data into actionable metrics for rock\nengineering. It seeks to link data to specific engineering actions, thus\nproviding critical decision support for geological challenges ahead of the\ntunnel face. Leveraging a large and geologically diverse dataset of 500,000\ndrillholes from 15 tunnels, the research introduces models for accurate rock\nmass quality classification in a real-world tunnelling context. Both\nconventional machine learning and image-based deep learning are explored to\nclassify MWD data into Q-classes and Q-values, examples of metrics describing\nthe stability of the rock mass, using both tabular and image data. The results\nindicate that the K-nearest neighbours algorithm in an ensemble with tree-based\nmodels using tabular data effectively classifies rock mass quality. It\nachieves a cross-validated balanced accuracy of 0.86 in classifying rock mass\ninto the Q-classes A, B, C, D, E1, E2, and 0.95 for a binary classification\nwith E versus the rest. Classification using a CNN with MWD-images for each\nblasting round resulted in a balanced accuracy of 0.82 for binary\nclassification. Regressing the Q-value from tabular MWD-data achieved\ncross-validated R2 and MSE scores of 0.80 and 0.18 for a similar ensemble model\nas in classification. High performance in regression and classification boosts\nconfidence in automated rock mass assessment. Applying advanced modelling on a\nunique dataset demonstrates MWD data's value in improving rock mass\nclassification accuracy and advancing data-driven rock engineering design,\nreducing manual intervention.\n","authors":["Tom F. Hansen","Georg H. Erharter","Zhongqiang Liu","Jim Torresen"],"pdf_url":"https://arxiv.org/pdf/2403.10404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10403v1","updated":"2024-03-15T15:37:04Z","published":"2024-03-15T15:37:04Z","title":"Energy Correction Model in the Feature Space for Out-of-Distribution\n Detection","summary":" In this work, we study the out-of-distribution (OOD) detection problem\nthrough the use of the feature space of a pre-trained deep classifier. We show\nthat learning the density of in-distribution (ID) features with an energy-based\nmodel (EBM) leads to competitive detection results. However, we found that the\nnon-mixing of MCMC sampling during the EBM's training undermines its detection\nperformance. To overcome this, we propose an energy-based correction of a mixture of\nclass-conditional Gaussian distributions. 
We obtain favorable results when\ncompared to a strong baseline like the KNN detector on the CIFAR-10/CIFAR-100\nOOD detection benchmarks.\n","authors":["Marc Lafon","Clément Rambour","Nicolas Thome"],"pdf_url":"https://arxiv.org/pdf/2403.10403v1.pdf","comment":"NeurIPS ML Safety Workshop (2022)"},{"id":"http://arxiv.org/abs/2403.10395v1","updated":"2024-03-15T15:27:58Z","published":"2024-03-15T15:27:58Z","title":"Isotropic3D: Image-to-3D Generation Based on a Single CLIP Embedding","summary":" Encouraged by the growing availability of pre-trained 2D diffusion models,\nimage-to-3D generation by leveraging Score Distillation Sampling (SDS) is\nmaking remarkable progress. Most existing methods combine novel-view lifting\nfrom 2D diffusion models which usually take the reference image as a condition\nwhile applying hard L2 image supervision at the reference view. Yet heavily\nadhering to the image is prone to corrupting the inductive knowledge of the 2D\ndiffusion model leading to flat or distorted 3D generation frequently. In this\nwork, we reexamine image-to-3D from a novel perspective and present Isotropic3D,\nan image-to-3D generation pipeline that takes only an image CLIP embedding as\ninput. Isotropic3D allows the optimization to be isotropic w.r.t. the azimuth\nangle by solely resting on the SDS loss. The core of our framework lies in a\ntwo-stage diffusion model fine-tuning. Firstly, we fine-tune a text-to-3D\ndiffusion model by substituting its text encoder with an image encoder, by\nwhich the model preliminarily acquires image-to-image capabilities. Secondly,\nwe perform fine-tuning using our Explicit Multi-view Attention (EMA) which\ncombines noisy multi-view images with the noise-free reference image as an\nexplicit condition. CLIP embedding is sent to the diffusion model throughout\nthe whole process while reference images are discarded once after fine-tuning.\nAs a result, with a single image CLIP embedding, Isotropic3D is capable of\ngenerating multi-view mutually consistent images and also a 3D model with more\nsymmetrical and neat content, well-proportioned geometry, rich colored texture,\nand less distortion compared with existing image-to-3D methods while still\npreserving the similarity to the reference image to a large extent. The project\npage is available at https://isotropic3d.github.io/. The code and models are\navailable at https://github.com/pkunliu/Isotropic3D.\n","authors":["Pengkun Liu","Yikai Wang","Fuchun Sun","Jiafang Li","Hang Xiao","Hongxiang Xue","Xinzhou Wang"],"pdf_url":"https://arxiv.org/pdf/2403.10395v1.pdf","comment":"Project page: https://isotropic3d.github.io/ Source code:\n https://github.com/pkunliu/Isotropic3D"},{"id":"http://arxiv.org/abs/2403.10391v1","updated":"2024-03-15T15:22:13Z","published":"2024-03-15T15:22:13Z","title":"CDMAD: Class-Distribution-Mismatch-Aware Debiasing for Class-Imbalanced\n Semi-Supervised Learning","summary":" Pseudo-label-based semi-supervised learning (SSL) algorithms trained on a\nclass-imbalanced set face two cascading challenges: 1) Classifiers tend to be\nbiased towards majority classes, and 2) Biased pseudo-labels are used for\ntraining. It is difficult to appropriately re-balance the classifiers in SSL\nbecause the class distribution of an unlabeled set is often unknown and could\nbe mismatched with that of a labeled set. We propose a novel class-imbalanced\nSSL algorithm called class-distribution-mismatch-aware debiasing (CDMAD). 
For\neach iteration of training, CDMAD first assesses the classifier's biased degree\ntowards each class by calculating the logits on an image without any patterns\n(e.g., solid color image), which can be considered irrelevant to the training\nset. CDMAD then refines biased pseudo-labels of the base SSL algorithm by\nensuring the classifier's neutrality. CDMAD uses these refined pseudo-labels\nduring the training of the base SSL algorithm to improve the quality of the\nrepresentations. In the test phase, CDMAD similarly refines biased class\npredictions on test samples. CDMAD can be seen as an extension of post-hoc\nlogit adjustment to address a challenge of incorporating the unknown class\ndistribution of the unlabeled set for re-balancing the biased classifier under\nclass distribution mismatch. CDMAD ensures Fisher consistency for the balanced\nerror. Extensive experiments verify the effectiveness of CDMAD.\n","authors":["Hyuck Lee","Heeyoung Kim"],"pdf_url":"https://arxiv.org/pdf/2403.10391v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.10390v1","updated":"2024-03-15T15:21:04Z","published":"2024-03-15T15:21:04Z","title":"Evaluating Perceptual Distances by Fitting Binomial Distributions to\n Two-Alternative Forced Choice Data","summary":" The two-alternative forced choice (2AFC) experimental setup is popular in the\nvisual perception literature, where practitioners aim to understand how human\nobservers perceive distances within triplets that consist of a reference image\nand two distorted versions of that image. In the past, this had been conducted\nin controlled environments, with a tournament-style algorithm dictating which\nimages are shown to each participant to rank the distorted images. Recently,\ncrowd-sourced perceptual datasets have emerged, with no images shared between\ntriplets, making ranking impossible. Evaluating perceptual distances using this\ndata is non-trivial, relying on reducing the collection of judgements on a\ntriplet to a binary decision -- which is suboptimal and prone to misleading\nconclusions. Instead, we statistically model the underlying decision-making\nprocess during 2AFC experiments using a binomial distribution. We use maximum\nlikelihood estimation to fit a distribution to the perceptual judgements,\nconditioned on the perceptual distance to test and impose consistency and\nsmoothness between our empirical estimates of the density. This way, we can\nevaluate a different number of judgements per triplet, and can calculate\nmetrics such as likelihoods of judgements according to a set of distances --\nkey ingredients that neural network counterparts lack.\n","authors":["Alexander Hepburn","Raul Santos-Rodriguez","Javier Portilla"],"pdf_url":"https://arxiv.org/pdf/2403.10390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10378v1","updated":"2024-03-15T15:08:39Z","published":"2024-03-15T15:08:39Z","title":"EXAMS-V: A Multi-Discipline Multilingual Multimodal Exam Benchmark for\n Evaluating Vision Language Models","summary":" We introduce EXAMS-V, a new challenging multi-discipline multimodal\nmultilingual exam benchmark for evaluating vision language models. It consists\nof 20,932 multiple-choice questions across 20 school disciplines covering\nnatural science, social science, and other miscellaneous studies, e.g.,\nreligion, fine arts, business, etc. EXAMS-V includes a variety of multimodal\nfeatures such as text, images, tables, figures, diagrams, maps, scientific\nsymbols, and equations. 
The questions come in 11 languages from 7 language\nfamilies. Unlike existing benchmarks, EXAMS-V is uniquely curated by gathering\nschool exam questions from various countries, with a variety of education\nsystems. This distinctive approach calls for intricate reasoning across diverse\nlanguages and relies on region-specific knowledge. Solving the problems in the\ndataset requires advanced perception and joint reasoning over the text and the\nvisual content of the image. Our evaluation results demonstrate that this is a\nchallenging dataset, which is difficult even for advanced vision-text models\nsuch as GPT-4V and Gemini; this underscores the inherent complexity of the\ndataset and its significance as a future benchmark.\n","authors":["Rocktim Jyoti Das","Simeon Emilov Hristov","Haonan Li","Dimitar Iliyanov Dimitrov","Ivan Koychev","Preslav Nakov"],"pdf_url":"https://arxiv.org/pdf/2403.10378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14155v2","updated":"2024-03-15T15:05:31Z","published":"2023-11-23T18:55:03Z","title":"GigaPose: Fast and Robust Novel Object Pose Estimation via One\n Correspondence","summary":" We present GigaPose, a fast, robust, and accurate method for CAD-based novel\nobject pose estimation in RGB images. GigaPose first leverages discriminative\n\"templates\", rendered images of the CAD models, to recover the out-of-plane\nrotation and then uses patch correspondences to estimate the four remaining\nparameters. Our approach samples templates in only a two-degrees-of-freedom\nspace instead of the usual three and matches the input image to the templates\nusing fast nearest-neighbor search in feature space, resulting in a speedup\nfactor of 35x compared to the state of the art. Moreover, GigaPose is\nsignificantly more robust to segmentation errors. Our extensive evaluation on\nthe seven core datasets of the BOP challenge demonstrates that it achieves\nstate-of-the-art accuracy and can be seamlessly integrated with existing\nrefinement methods. Additionally, we show the potential of GigaPose with 3D\nmodels predicted by recent work on 3D reconstruction from a single image,\nrelaxing the need for CAD models and making 6D object pose estimation much more\nconvenient. Our source code and trained models are publicly available at\nhttps://github.com/nv-nguyen/gigaPose\n","authors":["Van Nguyen Nguyen","Thibault Groueix","Mathieu Salzmann","Vincent Lepetit"],"pdf_url":"https://arxiv.org/pdf/2311.14155v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.10376v1","updated":"2024-03-15T15:05:29Z","published":"2024-03-15T15:05:29Z","title":"PASTA: Towards Flexible and Efficient HDR Imaging Via Progressively\n Aggregated Spatio-Temporal Alignment","summary":" Leveraging Transformer attention has led to great advancements in HDR\ndeghosting. However, the intricate nature of self-attention introduces\npractical challenges, as existing state-of-the-art methods often demand\nhigh-end GPUs or exhibit slow inference speeds, especially for high-resolution\nimages like 2K. Striking an optimal balance between performance and latency\nremains a critical concern. In response, this work presents PASTA, a novel\nProgressively Aggregated Spatio-Temporal Alignment framework for HDR\ndeghosting. Our approach achieves effectiveness and efficiency by harnessing\nhierarchical representation during feature disentanglement. 
Through the\nutilization of diverse granularities within the hierarchical structure, our\nmethod substantially boosts computational speed and optimizes the HDR imaging\nworkflow. In addition, we explore within-scale feature modeling with local and\nglobal attention, gradually merging and refining them in a coarse-to-fine\nfashion. Experimental results showcase PASTA's superiority over current SOTA\nmethods in both visual quality and performance metrics, accompanied by a\nsubstantial 3-fold (x3) increase in inference speed.\n","authors":["Xiaoning Liu","Ao Li","Zongwei Wu","Yapeng Du","Le Zhang","Yulun Zhang","Radu Timofte","Ce Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.10376v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10374v1","updated":"2024-03-15T15:04:30Z","published":"2024-03-15T15:04:30Z","title":"Overcoming Distribution Shifts in Plug-and-Play Methods with Test-Time\n Training","summary":" Plug-and-Play Priors (PnP) is a well-known class of methods for solving\ninverse problems in computational imaging. PnP methods combine physical forward\nmodels with learned prior models specified as image denoisers. A common issue\nwith the learned models is that of a performance drop when there is a\ndistribution shift between the training and testing data. Test-time training\n(TTT) was recently proposed as a general strategy for improving the performance\nof learned models when training and testing data come from different\ndistributions. In this paper, we propose PnP-TTT as a new method for overcoming\ndistribution shifts in PnP. PnP-TTT uses deep equilibrium learning (DEQ) for\noptimizing a self-supervised loss at the fixed points of PnP iterations.\nPnP-TTT can be directly applied on a single test sample to improve the\ngeneralization of PnP. We show through simulations that given a sufficient\nnumber of measurements, PnP-TTT enables the use of image priors trained on\nnatural images for image reconstruction in magnetic resonance imaging (MRI).\n","authors":["Edward P. Chandler","Shirin Shoushtari","Jiaming Liu","M. Salman Asif","Ulugbek S. Kamilov"],"pdf_url":"https://arxiv.org/pdf/2403.10374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07516v3","updated":"2024-03-15T15:03:40Z","published":"2023-06-30T17:05:11Z","title":"Voting-based Multimodal Automatic Deception Detection","summary":" Automatic Deception Detection has been a hot research topic for a long time,\nusing machine learning and deep learning to automatically detect deception,\nbrings new light to this old field. In this paper, we proposed a voting-based\nmethod for automatic deception detection from videos using audio, visual and\nlexical features. Experiments were done on two datasets, the Real-life trial\ndataset by Michigan University and the Miami University deception detection\ndataset. Video samples were split into frames of images, audio, and\nmanuscripts. Our Voting-based Multimodal proposed solution consists of three\nmodels. The first model is CNN for detecting deception from images, the second\nmodel is Support Vector Machine (SVM) on Mel spectrograms for detecting\ndeception from audio and the third model is Word2Vec on Support Vector Machine\n(SVM) for detecting deception from manuscripts. Our proposed solution\noutperforms state of the art. 
Best results achieved on images, audio and text\nwere 97%, 96%, 92% respectively on Real-Life Trial Dataset, and 97%, 82%, 73%\non video, audio and text respectively on Miami University Deception Detection.\n","authors":["Lana Touma","Mohammad Al Horani","Manar Tailouni","Anas Dahabiah","Khloud Al Jallad"],"pdf_url":"https://arxiv.org/pdf/2307.07516v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10369v1","updated":"2024-03-15T15:00:42Z","published":"2024-03-15T15:00:42Z","title":"Open Stamped Parts Dataset","summary":" We present the Open Stamped Parts Dataset (OSPD), featuring synthetic and\nreal images of stamped metal sheets for auto manufacturing. The real part\nimages, captured from 7 cameras, consist of 7,980 unlabeled images and 1,680\nlabeled images. In addition, we have compiled a defect dataset by overlaying\nsynthetically generated masks on 10% of the holes. The synthetic dataset\nreplicates the real manufacturing environment in terms of lighting and part\nplacement relative to the cameras. The synthetic data includes 7,980 training\nimages, 1,680 validation images and 1,680 test images, each with bounding box\nand segmentation mask annotations around all holes. 10% of the holes in the\nsynthetic data mimic defects generated in the real image dataset. We trained a\nhole-detection model on the synthetic-OSPD, achieving a modified recall score\nof 67.2% and a precision of 94.4% . We anticipate researchers in the auto\nmanufacturing and broader machine learning and computer vision communities\nusing OSPD to advance the state of the art in defect detection of stamped holes\nin the metalsheet stamping process. The dataset is available for download at:\nhttps://tinyurl.com/hm6xatd7\n","authors":["Sara Antiles","Sachin S. Talathi"],"pdf_url":"https://arxiv.org/pdf/2403.10369v1.pdf","comment":"6 pages, 7 figures, 2 tables"},{"id":"http://arxiv.org/abs/2403.10367v1","updated":"2024-03-15T14:59:21Z","published":"2024-03-15T14:59:21Z","title":"Testing MediaPipe Holistic for Linguistic Analysis of Nonmanual Markers\n in Sign Languages","summary":" Advances in Deep Learning have made possible reliable landmark tracking of\nhuman bodies and faces that can be used for a variety of tasks. We test a\nrecent Computer Vision solution, MediaPipe Holistic (MPH), to find out if its\ntracking of the facial features is reliable enough for a linguistic analysis of\ndata from sign languages, and compare it to an older solution (OpenFace, OF).\nWe use an existing data set of sentences in Kazakh-Russian Sign Language and a\nnewly created small data set of videos with head tilts and eyebrow movements.\nWe find that MPH does not perform well enough for linguistic analysis of\neyebrow movement -- but in a different way from OF, which is also performing\npoorly without correction. We reiterate a previous proposal to train additional\ncorrection models to overcome these limitations.\n","authors":["Anna Kuznetsova","Vadim Kimmelman"],"pdf_url":"https://arxiv.org/pdf/2403.10367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10362v1","updated":"2024-03-15T14:53:31Z","published":"2024-03-15T14:53:31Z","title":"CPGA: Coding Priors-Guided Aggregation Network for Compressed Video\n Quality Enhancement","summary":" Recently, numerous approaches have achieved notable success in compressed\nvideo quality enhancement (VQE). 
However, these methods usually ignore the\nutilization of valuable coding priors inherently embedded in compressed videos,\nsuch as motion vectors and residual frames, which carry abundant temporal and\nspatial information. To remedy this problem, we propose the Coding\nPriors-Guided Aggregation (CPGA) network to utilize temporal and spatial\ninformation from coding priors. The CPGA mainly consists of an inter-frame\ntemporal aggregation (ITA) module and a multi-scale non-local aggregation (MNA)\nmodule. Specifically, the ITA module aggregates temporal information from\nconsecutive frames and coding priors, while the MNA module globally captures\nspatial information guided by residual frames. In addition, to facilitate\nresearch in VQE task, we newly construct the Video Coding Priors (VCP) dataset,\ncomprising 300 videos with various coding priors extracted from corresponding\nbitstreams. It remedies the shortage of previous datasets on the lack of coding\ninformation. Experimental results demonstrate the superiority of our method\ncompared to existing state-of-the-art methods. The code and dataset will be\nreleased at https://github.com/CPGA/CPGA.git.\n","authors":["Qiang Zhu","Jinhua Hao","Yukang Ding","Yu Liu","Qiao Mo","Ming Sun","Chao Zhou","Shuyuan Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.10362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10357v1","updated":"2024-03-15T14:45:38Z","published":"2024-03-15T14:45:38Z","title":"ANIM: Accurate Neural Implicit Model for Human Reconstruction from a\n single RGB-D image","summary":" Recent progress in human shape learning, shows that neural implicit models\nare effective in generating 3D human surfaces from limited number of views, and\neven from a single RGB image. However, existing monocular approaches still\nstruggle to recover fine geometric details such as face, hands or cloth\nwrinkles. They are also easily prone to depth ambiguities that result in\ndistorted geometries along the camera optical axis. In this paper, we explore\nthe benefits of incorporating depth observations in the reconstruction process\nby introducing ANIM, a novel method that reconstructs arbitrary 3D human shapes\nfrom single-view RGB-D images with an unprecedented level of accuracy. Our\nmodel learns geometric details from both multi-resolution pixel-aligned and\nvoxel-aligned features to leverage depth information and enable spatial\nrelationships, mitigating depth ambiguities. We further enhance the quality of\nthe reconstructed shape by introducing a depth-supervision strategy, which\nimproves the accuracy of the signed distance field estimation of points that\nlie on the reconstructed surface. Experiments demonstrate that ANIM outperforms\nstate-of-the-art works that use RGB, surface normals, point cloud or RGB-D data\nas input. 
In addition, we introduce ANIM-Real, a new multi-modal dataset\ncomprising high-quality scans paired with consumer-grade RGB-D camera, and our\nprotocol to fine-tune ANIM, enabling high-quality reconstruction from\nreal-world human capture.\n","authors":["Marco Pesavento","Yuanlu Xu","Nikolaos Sarafianos","Robert Maier","Ziyan Wang","Chun-Han Yao","Marco Volino","Edmond Boyer","Adrian Hilton","Tony Tung"],"pdf_url":"https://arxiv.org/pdf/2403.10357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04175v2","updated":"2024-03-15T14:44:55Z","published":"2023-06-07T05:59:20Z","title":"ScoreCL: Augmentation-Adaptive Contrastive Learning via Score-Matching\n Function","summary":" Self-supervised contrastive learning (CL) has achieved state-of-the-art\nperformance in representation learning by minimizing the distance between\npositive pairs while maximizing that of negative ones. Recently, it has been\nverified that the model learns better representation with diversely augmented\npositive pairs because they enable the model to be more view-invariant.\nHowever, only a few studies on CL have considered the difference between\naugmented views, and have not gone beyond the hand-crafted findings. In this\npaper, we first observe that the score-matching function can measure how much\ndata has changed from the original through augmentation. With the observed\nproperty, every pair in CL can be weighted adaptively by the difference of\nscore values, resulting in boosting the performance of the existing CL method.\nWe show the generality of our method, referred to as ScoreCL, by consistently\nimproving various CL methods, SimCLR, SimSiam, W-MSE, and VICReg, up to 3%p in\nk-NN evaluation on CIFAR-10, CIFAR-100, and ImageNet-100. Moreover, we have\nconducted exhaustive experiments and ablations, including results on diverse\ndownstream tasks, comparison with possible baselines, and improvement when used\nwith other proposed augmentation methods. We hope our exploration will inspire\nmore research in exploiting the score matching for CL.\n","authors":["Jin-Young Kim","Soonwoo Kwon","Hyojun Go","Yunsung Lee","Seungtaek Choi"],"pdf_url":"https://arxiv.org/pdf/2306.04175v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10353v1","updated":"2024-03-15T14:39:39Z","published":"2024-03-15T14:39:39Z","title":"SimPB: A Single Model for 2D and 3D Object Detection from Multiple\n Cameras","summary":" The field of autonomous driving has attracted considerable interest in\napproaches that directly infer 3D objects in the Bird's Eye View (BEV) from\nmultiple cameras. Some attempts have also explored utilizing 2D detectors from\nsingle images to enhance the performance of 3D detection. However, these\napproaches rely on a two-stage process with separate detectors, where the 2D\ndetection results are utilized only once for token selection or query\ninitialization. In this paper, we present a single model termed SimPB, which\nsimultaneously detects 2D objects in the perspective view and 3D objects in the\nBEV space from multiple cameras. To achieve this, we introduce a hybrid decoder\nconsisting of several multi-view 2D decoder layers and several 3D decoder\nlayers, specifically designed for their respective detection tasks. A Dynamic\nQuery Allocation module and an Adaptive Query Aggregation module are proposed\nto continuously update and refine the interaction between 2D and 3D results, in\na cyclic 3D-2D-3D manner. 
Additionally, Query-group Attention is utilized to\nstrengthen the interaction among 2D queries within each camera group. In the\nexperiments, we evaluate our method on the nuScenes dataset and demonstrate\npromising results for both 2D and 3D detection tasks. Our code is available at:\nhttps://github.com/nullmax-vision/SimPB.\n","authors":["Yingqi Tang","Zhaotie Meng","Guoliang Chen","Erkang Cheng"],"pdf_url":"https://arxiv.org/pdf/2403.10353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10349v1","updated":"2024-03-15T14:35:05Z","published":"2024-03-15T14:35:05Z","title":"ParaPoint: Learning Global Free-Boundary Surface Parameterization of 3D\n Point Clouds","summary":" Surface parameterization is a fundamental geometry processing problem with\nrich downstream applications. Traditional approaches are designed to operate on\nwell-behaved mesh models with high-quality triangulations that are laboriously\nproduced by specialized 3D modelers, and thus unable to meet the processing\ndemand for the current explosion of ordinary 3D data. In this paper, we seek to\nperform UV unwrapping on unstructured 3D point clouds. Technically, we propose\nParaPoint, an unsupervised neural learning pipeline for achieving global\nfree-boundary surface parameterization by building point-wise mappings between\ngiven 3D points and 2D UV coordinates with adaptively deformed boundaries. We\ningeniously construct several geometrically meaningful sub-networks with\nspecific functionalities, and assemble them into a bi-directional cycle mapping\nframework. We also design effective loss functions and auxiliary differential\ngeometric constraints for the optimization of the neural mapping process. To\nthe best of our knowledge, this work makes the first attempt to investigate\nneural point cloud parameterization that pursues both global mappings and free\nboundaries. Experiments demonstrate the effectiveness and inspiring potential\nof our proposed learning paradigm. The code will be publicly available.\n","authors":["Qijian Zhang","Junhui Hou","Ying He"],"pdf_url":"https://arxiv.org/pdf/2403.10349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10348v1","updated":"2024-03-15T14:34:34Z","published":"2024-03-15T14:34:34Z","title":"Denoising Task Difficulty-based Curriculum for Training Diffusion Models","summary":" Diffusion-based generative models have emerged as powerful tools in the realm\nof generative modeling. Despite extensive research on denoising across various\ntimesteps and noise levels, a conflict persists regarding the relative\ndifficulties of the denoising tasks. While various studies argue that lower\ntimesteps present more challenging tasks, others contend that higher timesteps\nare more difficult. To address this conflict, our study undertakes a\ncomprehensive examination of task difficulties, focusing on convergence\nbehavior and changes in relative entropy between consecutive probability\ndistributions across timesteps. Our observational study reveals that denoising\nat earlier timesteps poses challenges characterized by slower convergence and\nhigher relative entropy, indicating increased task difficulty at these lower\ntimesteps. Building on these observations, we introduce an easy-to-hard\nlearning scheme, drawing from curriculum learning, to enhance the training\nprocess of diffusion models. 
By organizing timesteps or noise levels into\nclusters and training models with descending orders of difficulty, we\nfacilitate an order-aware training regime, progressing from easier to harder\ndenoising tasks, thereby deviating from the conventional approach of training\ndiffusion models simultaneously across all timesteps. Our approach leads to\nimproved performance and faster convergence by leveraging the benefits of\ncurriculum learning, while maintaining orthogonality with existing improvements\nin diffusion training techniques. We validate these advantages through\ncomprehensive experiments in image generation tasks, including unconditional,\nclass-conditional, and text-to-image generation.\n","authors":["Jin-Young Kim","Hyojun Go","Soonwoo Kwon","Hyun-Gyoon Kim"],"pdf_url":"https://arxiv.org/pdf/2403.10348v1.pdf","comment":"22 pages, 8 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.10346v1","updated":"2024-03-15T14:31:35Z","published":"2024-03-15T14:31:35Z","title":"End-to-end Adaptive Dynamic Subsampling and Reconstruction for Cardiac\n MRI","summary":" Accelerating dynamic MRI is essential for enhancing clinical applications,\nsuch as adaptive radiotherapy, and improving patient comfort. Traditional deep\nlearning (DL) approaches for accelerated dynamic MRI reconstruction typically\nrely on predefined or random subsampling patterns, applied uniformly across all\ntemporal phases. This standard practice overlooks the potential benefits of\nleveraging temporal correlations and lacks the adaptability required for\ncase-specific subsampling optimization, which holds the potential for\nmaximizing reconstruction quality. Addressing this gap, we present a novel\nend-to-end framework for adaptive dynamic MRI subsampling and reconstruction.\nOur pipeline integrates a DL-based adaptive sampler, generating case-specific\ndynamic subsampling patterns, trained end-to-end with a state-of-the-art 2D\ndynamic reconstruction network, namely vSHARP, which effectively reconstructs\nthe adaptive dynamic subsampled data into a moving image. Our method is\nassessed using dynamic cine cardiac MRI data, comparing its performance against\nvSHARP models that employ common subsampling trajectories, and pipelines\ntrained to optimize dataset-specific sampling schemes alongside vSHARP\nreconstruction. Our results indicate superior reconstruction quality,\nparticularly at high accelerations.\n","authors":["George Yiasemis","Jan-Jakob Sonke","Jonas Teuwen"],"pdf_url":"https://arxiv.org/pdf/2403.10346v1.pdf","comment":"14 pages, 5 figures, 8 tables"},{"id":"http://arxiv.org/abs/2303.07033v3","updated":"2024-03-15T14:31:21Z","published":"2023-03-13T11:47:24Z","title":"SelfPromer: Self-Prompt Dehazing Transformers with Depth-Consistency","summary":" This work presents an effective depth-consistency self-prompt Transformer for\nimage dehazing. It is motivated by an observation that the estimated depths of\nan image with haze residuals and its clear counterpart vary. Enforcing the\ndepth consistency of dehazed images with clear ones, therefore, is essential\nfor dehazing. For this purpose, we develop a prompt based on the features of\ndepth differences between the hazy input images and corresponding clear\ncounterparts that can guide dehazing models for better restoration.\nSpecifically, we first apply deep features extracted from the input images to\nthe depth difference features for generating the prompt that contains the haze\nresidual information in the input. 
Then we propose a prompt embedding module\nthat is designed to perceive the haze residuals, by linearly adding the prompt\nto the deep features. Further, we develop an effective prompt attention module\nto pay more attention to haze residuals for better removal. By incorporating\nthe prompt, prompt embedding, and prompt attention into an encoder-decoder\nnetwork based on VQGAN, we can achieve better perception quality. As the depths\nof clear images are not available at inference, and the dehazed images with\none-time feed-forward execution may still contain a portion of haze residuals,\nwe propose a new continuous self-prompt inference that can iteratively correct\nthe dehazing model towards better haze-free image generation. Extensive\nexperiments show that our method performs favorably against the\nstate-of-the-art approaches on both synthetic and real-world datasets in terms\nof perception metrics including NIQE, PI, and PIQE.\n","authors":["Cong Wang","Jinshan Pan","Wanyu Lin","Jiangxin Dong","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2303.07033v3.pdf","comment":"Accepted by AAAI24. Source codes will be made available at:\n https://github.com/supersupercong/SelfPromer"},{"id":"http://arxiv.org/abs/2403.10344v1","updated":"2024-03-15T14:31:17Z","published":"2024-03-15T14:31:17Z","title":"SCILLA: SurfaCe Implicit Learning for Large Urban Area, a volumetric\n hybrid solution","summary":" Neural implicit surface representation methods have recently shown impressive\n3D reconstruction results. However, existing solutions struggle to reconstruct\nurban outdoor scenes due to their large, unbounded, and highly detailed nature.\nHence, to achieve accurate reconstructions, additional supervision data such as\nLiDAR, strong geometric priors, and long training times are required. To tackle\nsuch issues, we present SCILLA, a new hybrid implicit surface learning method\nto reconstruct large driving scenes from 2D images. SCILLA's hybrid\narchitecture models two separate implicit fields: one for the volumetric\ndensity and another for the signed distance to the surface. To accurately\nrepresent urban outdoor scenarios, we introduce a novel volume-rendering\nstrategy that relies on self-supervised probabilistic density estimation to\nsample points near the surface and transition progressively from volumetric to\nsurface representation. Our solution permits a proper and fast initialization\nof the signed distance field without relying on any geometric prior on the\nscene, compared to concurrent methods. By conducting extensive experiments on\nfour outdoor driving datasets, we show that SCILLA can learn an accurate and\ndetailed 3D surface scene representation in various urban scenarios while being\ntwo times faster to train compared to previous state-of-the-art solutions.\n","authors":["Hala Djeghim","Nathan Piasco","Moussab Bennehar","Luis Roldão","Dzmitry Tsishkou","Désiré Sidibé"],"pdf_url":"https://arxiv.org/pdf/2403.10344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10340v1","updated":"2024-03-15T14:27:15Z","published":"2024-03-15T14:27:15Z","title":"Thermal-NeRF: Neural Radiance Fields from an Infrared Camera","summary":" In recent years, Neural Radiance Fields (NeRFs) have demonstrated significant\npotential in encoding highly-detailed 3D geometry and environmental appearance,\npositioning themselves as a promising alternative to traditional explicit\nrepresentation for 3D scene reconstruction. 
However, the predominant reliance\non RGB imaging presupposes ideal lighting conditions: a premise frequently\nunmet in robotic applications plagued by poor lighting or visual obstructions.\nThis limitation overlooks the capabilities of infrared (IR) cameras, which\nexcel in low-light detection and present a robust alternative under such\nadverse scenarios. To tackle these issues, we introduce Thermal-NeRF, the first\nmethod that estimates a volumetric scene representation in the form of a NeRF\nsolely from IR imaging. By leveraging a thermal mapping and structural thermal\nconstraint derived from the thermal characteristics of IR imaging, our method\nshowcases unparalleled proficiency in recovering NeRFs in visually degraded\nscenes where RGB-based methods fall short. We conduct extensive experiments to\ndemonstrate that Thermal-NeRF can achieve superior quality compared to existing\nmethods. Furthermore, we contribute a dataset for IR-based NeRF applications,\npaving the way for future research in IR NeRF reconstruction.\n","authors":["Tianxiang Ye","Qi Wu","Junyuan Deng","Guoqing Liu","Liu Liu","Songpengcheng Xia","Liang Pang","Wenxian Yu","Ling Pei"],"pdf_url":"https://arxiv.org/pdf/2403.10340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01357v2","updated":"2024-03-15T14:27:00Z","published":"2023-11-02T16:04:32Z","title":"Robust Identity Perceptual Watermark Against Deepfake Face Swapping","summary":" Notwithstanding offering convenience and entertainment to society, Deepfake\nface swapping has caused critical privacy issues with the rapid development of\ndeep generative models. Due to imperceptible artifacts in high-quality\nsynthetic images, passive detection models against face swapping in recent\nyears usually suffer performance damping regarding the generalizability issue.\nTherefore, several studies have attempted to proactively protect the\noriginal images against malicious manipulations by inserting invisible signals\nin advance. However, the existing proactive defense approaches demonstrate\nunsatisfactory results with respect to visual quality, detection accuracy, and\nsource tracing ability. In this study, to fulfill the research gap, we propose\nthe first robust identity perceptual watermarking framework that concurrently\nperforms detection and source tracing against Deepfake face swapping\nproactively. We assign identity semantics regarding the image contents to the\nwatermarks and devise an unpredictable and nonreversible chaotic encryption\nsystem to ensure watermark confidentiality. The watermarks are encoded and\nrecovered by jointly training an encoder-decoder framework along with\nadversarial image manipulations. Falsification and source tracing are\naccomplished by justifying the consistency between the content-matched identity\nperceptual watermark and the recovered robust watermark from the image.\nExtensive experiments demonstrate state-of-the-art detection performance on\nDeepfake face swapping under both cross-dataset and cross-manipulation\nsettings.\n","authors":["Tianyi Wang","Mengxiao Huang","Harry Cheng","Bin Ma","Yinglong Wang"],"pdf_url":"https://arxiv.org/pdf/2311.01357v2.pdf","comment":"In peer review"},{"id":"http://arxiv.org/abs/2403.10336v1","updated":"2024-03-15T14:23:12Z","published":"2024-03-15T14:23:12Z","title":"How Powerful Potential of Attention on Image Restoration?","summary":" Transformers have demonstrated their effectiveness in image restoration\ntasks. 
Existing Transformer architectures typically comprise two essential\ncomponents: multi-head self-attention and feed-forward network (FFN). The\nformer captures long-range pixel dependencies, while the latter enables the\nmodel to learn complex patterns and relationships in the data. Previous studies\nhave demonstrated that FFNs are key-value memories \\cite{geva2020transformer},\nwhich are vital in modern Transformer architectures. In this paper, we conduct\nan empirical study to explore the potential of attention mechanisms without\nusing FFN and provide novel structures to demonstrate that removing FFN is\nflexible for image restoration. Specifically, we propose Continuous Scaling\nAttention (\\textbf{CSAttn}), a method that computes attention continuously in\nthree stages without using FFN. To achieve competitive performance, we propose\na series of key components within the attention. Our designs provide a closer\nlook at the attention mechanism and reveal that some simple operations can\nsignificantly affect the model performance. We apply our \\textbf{CSAttn} to\nseveral image restoration tasks and show that our model can outperform\nCNN-based and Transformer-based image restoration approaches.\n","authors":["Cong Wang","Jinshan Pan","Yeying Jin","Liyan Wang","Wei Wang","Gang Fu","Wenqi Ren","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2403.10336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10335v1","updated":"2024-03-15T14:23:06Z","published":"2024-03-15T14:23:06Z","title":"NECA: Neural Customizable Human Avatar","summary":" Human avatar has become a novel type of 3D asset with various applications.\nIdeally, a human avatar should be fully customizable to accommodate different\nsettings and environments. In this work, we introduce NECA, an approach capable\nof learning versatile human representation from monocular or sparse-view\nvideos, enabling granular customization across aspects such as pose, shadow,\nshape, lighting and texture. The core of our approach is to represent humans in\ncomplementary dual spaces and predict disentangled neural fields of geometry,\nalbedo, shadow, as well as an external lighting, from which we are able to\nderive realistic rendering with high-frequency details via volumetric\nrendering. Extensive experiments demonstrate the advantage of our method over\nthe state-of-the-art methods in photorealistic rendering, as well as various\nediting tasks such as novel pose synthesis and relighting. The code is\navailable at https://github.com/iSEE-Laboratory/NECA.\n","authors":["Junjin Xiao","Qing Zhang","Zhan Xu","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.10335v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2206.08751v3","updated":"2024-03-15T14:16:40Z","published":"2022-06-13T02:22:57Z","title":"Perceptual Quality Assessment of Virtual Reality Videos in the Wild","summary":" Investigating how people perceive virtual reality (VR) videos in the wild\n(i.e., those captured by everyday users) is a crucial and challenging task in\nVR-related applications due to complex authentic distortions localized in space\nand time. Existing panoramic video databases only consider synthetic\ndistortions, assume fixed viewing conditions, and are limited in size. To\novercome these shortcomings, we construct the VR Video Quality in the Wild\n(VRVQW) database, containing $502$ user-generated videos with diverse content\nand distortion characteristics. 
Based on VRVQW, we conduct a formal\npsychophysical experiment to record the scanpaths and perceived quality scores\nfrom $139$ participants under two different viewing conditions. We provide a\nthorough statistical analysis of the recorded data, observing significant\nimpact of viewing conditions on both human scanpaths and perceived quality.\nMoreover, we develop an objective quality assessment model for VR videos based\non pseudocylindrical representation and convolution. Results on the proposed\nVRVQW show that our method is superior to existing video quality assessment\nmodels. We have made the database and code available at\nhttps://github.com/limuhit/VR-Video-Quality-in-the-Wild.\n","authors":["Wen Wen","Mu Li","Yiru Yao","Xiangjie Sui","Yabin Zhang","Long Lan","Yuming Fang","Kede Ma"],"pdf_url":"https://arxiv.org/pdf/2206.08751v3.pdf","comment":"Accepted by IEEE Transactions on Circuits and Systems for Video\n Technology"},{"id":"http://arxiv.org/abs/2310.19351v2","updated":"2024-03-15T14:10:50Z","published":"2023-10-30T08:46:26Z","title":"Seeking Flat Minima with Mean Teacher on Semi- and Weakly-Supervised\n Domain Generalization for Object Detection","summary":" Object detectors do not work well when domains largely differ between\ntraining and testing data. To overcome this domain gap in object detection\nwithout requiring expensive annotations, we consider two problem settings:\nsemi-supervised domain generalizable object detection (SS-DGOD) and\nweakly-supervised DGOD (WS-DGOD). In contrast to the conventional domain\ngeneralization for object detection that requires labeled data from multiple\ndomains, SS-DGOD and WS-DGOD require labeled data only from one domain and\nunlabeled or weakly-labeled data from multiple domains for training. In this\npaper, we show that object detectors can be effectively trained on the two\nsettings with the same Mean Teacher learning framework, where a student network\nis trained with pseudo-labels output from a teacher on the unlabeled or\nweakly-labeled data. We provide novel interpretations of why the Mean Teacher\nlearning framework works well on the two settings in terms of the relationships\nbetween the generalization gap and flat minima in parameter space. On the basis\nof the interpretations, we also propose incorporating a simple regularization\nmethod into the Mean Teacher learning framework to find flatter minima. The\nexperimental results demonstrate that the regularization leads to flatter\nminima and boosts the performance of the detectors trained with the Mean\nTeacher learning framework on the two settings. They also indicate that those\ndetectors significantly outperform the state-of-the-art methods.\n","authors":["Ryosuke Furuta","Yoichi Sato"],"pdf_url":"https://arxiv.org/pdf/2310.19351v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06020v2","updated":"2024-03-15T13:53:19Z","published":"2023-10-09T18:00:01Z","title":"DyST: Towards Dynamic Neural Scene Representations on Real-World Videos","summary":" Visual understanding of the world goes beyond the semantics and flat\nstructure of individual images. In this work, we aim to capture both the 3D\nstructure and dynamics of real-world scenes from monocular real-world videos.\nOur Dynamic Scene Transformer (DyST) model leverages recent work in neural\nscene representation to learn a latent decomposition of monocular real-world\nvideos into scene content, per-view scene dynamics, and camera pose. 
This\nseparation is achieved through a novel co-training scheme on monocular videos\nand our new synthetic dataset DySO. DyST learns tangible latent representations\nfor dynamic scenes that enable view generation with separate control over the\ncamera and the content of the scene.\n","authors":["Maximilian Seitzer","Sjoerd van Steenkiste","Thomas Kipf","Klaus Greff","Mehdi S. M. Sajjadi"],"pdf_url":"https://arxiv.org/pdf/2310.06020v2.pdf","comment":"ICLR 2024 spotlight. Project website: https://dyst-paper.github.io/"},{"id":"http://arxiv.org/abs/2403.10301v1","updated":"2024-03-15T13:43:47Z","published":"2024-03-15T13:43:47Z","title":"Uni-SMART: Universal Science Multimodal Analysis and Research\n Transformer","summary":" In scientific research and its application, scientific literature analysis is\ncrucial as it allows researchers to build on the work of others. However, the\nfast growth of scientific knowledge has led to a massive increase in scholarly\narticles, making in-depth literature analysis increasingly challenging and\ntime-consuming. The emergence of Large Language Models (LLMs) has offered a new\nway to address this challenge. Known for their strong abilities in summarizing\ntexts, LLMs are seen as a potential tool to improve the analysis of scientific\nliterature. However, existing LLMs have their own limits. Scientific literature\noften includes a wide range of multimodal elements, such as molecular\nstructure, tables, and charts, which are hard for text-focused LLMs to\nunderstand and analyze. This issue points to the urgent need for new solutions\nthat can fully understand and analyze multimodal content in scientific\nliterature. To answer this demand, we present Uni-SMART (Universal Science\nMultimodal Analysis and Research Transformer), an innovative model designed for\nin-depth understanding of multimodal scientific literature. Through rigorous\nquantitative evaluation across several domains, Uni-SMART demonstrates superior\nperformance over leading text-focused LLMs. Furthermore, our exploration\nextends to practical applications, including patent infringement detection and\nnuanced analysis of charts. These applications not only highlight Uni-SMART's\nadaptability but also its potential to revolutionize how we interact with\nscientific literature.\n","authors":["Hengxing Cai","Xiaochen Cai","Shuwen Yang","Jiankun Wang","Lin Yao","Zhifeng Gao","Junhan Chang","Sihang Li","Mingjun Xu","Changxin Wang","Hongshuai Wang","Yongge Li","Mujie Lin","Yaqi Li","Yuqi Yin","Linfeng Zhang","Guolin Ke"],"pdf_url":"https://arxiv.org/pdf/2403.10301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10298v1","updated":"2024-03-15T13:40:44Z","published":"2024-03-15T13:40:44Z","title":"Context-Semantic Quality Awareness Network for Fine-Grained Visual\n Categorization","summary":" Exploring and mining subtle yet distinctive features between sub-categories\nwith similar appearances is crucial for fine-grained visual categorization\n(FGVC). However, less effort has been devoted to assessing the quality of\nextracted visual representations. Intuitively, the network may struggle to\ncapture discriminative features from low-quality samples, which leads to a\nsignificant decline in FGVC performance. To tackle this challenge, we propose a\nweakly supervised Context-Semantic Quality Awareness Network (CSQA-Net) for\nFGVC. 
In this network, to model the spatial contextual relationship between\nrich part descriptors and global semantics for capturing more discriminative\ndetails within the object, we design a novel multi-part and multi-scale\ncross-attention (MPMSCA) module. Before feeding to the MPMSCA module, the part\nnavigator is developed to address the scale confusion problems and accurately\nidentify the local distinctive regions. Furthermore, we propose a generic\nmulti-level semantic quality evaluation module (MLSQE) to progressively\nsupervise and enhance hierarchical semantics from different levels of the\nbackbone network. Finally, context-aware features from MPMSCA and semantically\nenhanced features from MLSQE are fed into the corresponding quality probing\nclassifiers to evaluate their quality in real-time, thus boosting the\ndiscriminability of feature representations. Comprehensive experiments on four\npopular and highly competitive FGVC datasets demonstrate the superiority of the\nproposed CSQA-Net in comparison with the state-of-the-art methods.\n","authors":["Qin Xu","Sitong Li","Jiahui Wang","Bo Jiang","Jinhui Tang"],"pdf_url":"https://arxiv.org/pdf/2403.10298v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10297v1","updated":"2024-03-15T13:40:37Z","published":"2024-03-15T13:40:37Z","title":"Leveraging Neural Radiance Field in Descriptor Synthesis for Keypoints\n Scene Coordinate Regression","summary":" Classical structural-based visual localization methods offer high accuracy\nbut face trade-offs in terms of storage, speed, and privacy. A recent\ninnovation, keypoint scene coordinate regression (KSCR) named D2S addresses\nthese issues by leveraging graph attention networks to enhance keypoint\nrelationships and predict their 3D coordinates using a simple multilayer\nperceptron (MLP). Camera pose is then determined via PnP+RANSAC, using\nestablished 2D-3D correspondences. While KSCR achieves competitive results,\nrivaling state-of-the-art image-retrieval methods like HLoc across multiple\nbenchmarks, its performance is hindered when data samples are limited due to\nthe deep learning model's reliance on extensive data. This paper proposes a\nsolution to this challenge by introducing a pipeline for keypoint descriptor\nsynthesis using Neural Radiance Field (NeRF). By generating novel poses and\nfeeding them into a trained NeRF model to create new views, our approach\nenhances the KSCR's generalization capabilities in data-scarce environments.\nThe proposed system could significantly improve localization accuracy by up to\n50\\% and cost only a fraction of time for data synthesis. Furthermore, its\nmodular design allows for the integration of multiple NeRFs, offering a\nversatile and efficient solution for visual localization. The implementation is\npublicly available at: https://github.com/ais-lab/DescriptorSynthesis4Feat2Map.\n","authors":["Huy-Hoang Bui","Bach-Thuan Bui","Dinh-Tuan Tran","Joo-Ho Lee"],"pdf_url":"https://arxiv.org/pdf/2403.10297v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10291v1","updated":"2024-03-15T13:31:33Z","published":"2024-03-15T13:31:33Z","title":"Deep Learning for Multi-Level Detection and Localization of Myocardial\n Scars Based on Regional Strain Validated on Virtual Patients","summary":" How well the heart is functioning can be quantified through measurements of\nmyocardial deformation via echocardiography. 
Clinical assessment of cardiac\nfunction is generally focused on global indices of relative shortening,\nhowever, territorial, and segmental strain indices have shown to be abnormal in\nregions of myocardial disease, such as scar. In this work, we propose a single\nframework to predict myocardial disease substrates at global, territorial, and\nsegmental levels using regional myocardial strain traces as input to a\nconvolutional neural network (CNN)-based classification algorithm. An\nanatomically meaningful representation of the input data from the clinically\nstandard bullseye representation to a multi-channel 2D image is proposed, to\nformulate the task as an image classification problem, thus enabling the use of\nstate-of-the-art neural network configurations. A Fully Convolutional Network\n(FCN) is trained to detect and localize myocardial scar from regional left\nventricular (LV) strain patterns. Simulated regional strain data from a\ncontrolled dataset of virtual patients with varying degrees and locations of\nmyocardial scar is used for training and validation. The proposed method\nsuccessfully detects and localizes the scars on 98% of the 5490 left ventricle\n(LV) segments of the 305 patients in the test set using strain traces only. Due\nto the sparse existence of scar, only 10% of the LV segments in the virtual\npatient cohort have scar. Taking the imbalance into account, the class balanced\naccuracy is calculated as 95%. The performance is reported on global,\nterritorial, and segmental levels. The proposed method proves successful on the\nstrain traces of the virtual cohort and offers the potential to solve the\nregional myocardial scar detection problem on the strain traces of the real\npatient cohorts.\n","authors":["Müjde Akdeniz","Claudia Alessandra Manetti","Tijmen Koopsen","Hani Nozari Mirar","Sten Roar Snare","Svein Arne Aase","Joost Lumens","Jurica Šprem","Kristin Sarah McLeod"],"pdf_url":"https://arxiv.org/pdf/2403.10291v1.pdf","comment":"11 pages, 9 figures and 1 table. Preliminary results of the method\n was presented as poster in IEEE conference International Ultrasonics\n Symposium 2022 in Venice, Italy"},{"id":"http://arxiv.org/abs/2403.10287v1","updated":"2024-03-15T13:29:41Z","published":"2024-03-15T13:29:41Z","title":"Few-Shot Image Classification and Segmentation as Visual Question\n Answering Using Vision-Language Models","summary":" The task of few-shot image classification and segmentation (FS-CS) involves\nclassifying and segmenting target objects in a query image, given only a few\nexamples of the target classes. We introduce the Vision-Instructed Segmentation\nand Evaluation (VISE) method that transforms the FS-CS problem into the Visual\nQuestion Answering (VQA) problem, utilising Vision-Language Models (VLMs), and\naddresses it in a training-free manner. By enabling a VLM to interact with\noff-the-shelf vision models as tools, the proposed method is capable of\nclassifying and segmenting target objects using only image-level labels.\nSpecifically, chain-of-thought prompting and in-context learning guide the VLM\nto answer multiple-choice questions like a human; vision models such as YOLO\nand Segment Anything Model (SAM) assist the VLM in completing the task. The\nmodular framework of the proposed method makes it easily extendable. 
Our\napproach achieves state-of-the-art performance on the Pascal-5i and COCO-20i\ndatasets.\n","authors":["Tian Meng","Yang Tao","Ruilin Lyu","Wuliang Yin"],"pdf_url":"https://arxiv.org/pdf/2403.10287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10283v1","updated":"2024-03-15T13:26:39Z","published":"2024-03-15T13:26:39Z","title":"Local positional graphs and attentive local features for a data and\n runtime-efficient hierarchical place recognition pipeline","summary":" Large-scale applications of Visual Place Recognition (VPR) require\ncomputationally efficient approaches. Further, a well-balanced combination of\ndata-based and training-free approaches can decrease the required amount of\ntraining data and effort and can reduce the influence of distribution shifts\nbetween the training and application phases. This paper proposes a runtime and\ndata-efficient hierarchical VPR pipeline that extends existing approaches and\npresents novel ideas. There are three main contributions: First, we propose\nLocal Positional Graphs (LPG), a training-free and runtime-efficient approach\nto encode spatial context information of local image features. LPG can be\ncombined with existing local feature detectors and descriptors and considerably\nimproves the image-matching quality compared to existing techniques in our\nexperiments. Second, we present Attentive Local SPED (ATLAS), an extension of\nour previous local features approach with an attention module that improves the\nfeature quality while maintaining high data efficiency. The influence of the\nproposed modifications is evaluated in an extensive ablation study. Third, we\npresent a hierarchical pipeline that exploits hyperdimensional computing to use\nthe same local features as holistic HDC-descriptors for fast candidate\nselection and for candidate reranking. We combine all contributions in a\nruntime and data-efficient VPR pipeline that shows benefits over the\nstate-of-the-art method Patch-NetVLAD on a large collection of standard place\nrecognition datasets with 15$\\%$ better performance in VPR accuracy, 54$\\times$\nfaster feature comparison speed, and 55$\\times$ less descriptor storage\noccupancy, making our method promising for real-world high-performance\nlarge-scale VPR in changing environments. Code will be made available with\npublication of this paper.\n","authors":["Fangming Yuan","Stefan Schubert","Peter Protzel","Peer Neubert"],"pdf_url":"https://arxiv.org/pdf/2403.10283v1.pdf","comment":"IEEE Robotics and Automation Letters (RA-L)"},{"id":"http://arxiv.org/abs/2309.13596v3","updated":"2024-03-15T13:08:32Z","published":"2023-09-24T09:58:49Z","title":"Advancements in 3D Lane Detection Using LiDAR Point Clouds: From Data\n Collection to Model Development","summary":" Advanced Driver-Assistance Systems (ADAS) have successfully integrated\nlearning-based techniques into vehicle perception and decision-making. However,\ntheir application in 3D lane detection for effective driving environment\nperception is hindered by the lack of comprehensive LiDAR datasets. The sparse\nnature of LiDAR point cloud data prevents an efficient manual annotation\nprocess. To solve this problem, we present LiSV-3DLane, a large-scale 3D lane\ndataset that comprises 20k frames of surround-view LiDAR point clouds with\nenriched semantic annotation. 
Unlike existing datasets confined to a frontal\nperspective, LiSV-3DLane provides a full 360-degree spatial panorama around the\nego vehicle, capturing complex lane patterns in both urban and highway\nenvironments. We leverage the geometric traits of lane lines and the intrinsic\nspatial attributes of LiDAR data to design a simple yet effective automatic\nannotation pipeline for generating finer lane labels. To propel future\nresearch, we propose a novel LiDAR-based 3D lane detection model, LiLaDet,\nincorporating the spatial geometry learning of the LiDAR point cloud into\nBird's Eye View (BEV) based lane identification. Experimental results indicate\nthat LiLaDet outperforms existing camera- and LiDAR-based approaches in the 3D\nlane detection task on the K-Lane dataset and our LiSV-3DLane.\n","authors":["Runkai Zhao","Yuwen Heng","Heng Wang","Yuanda Gao","Shilei Liu","Changhao Yao","Jiawen Chen","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2309.13596v3.pdf","comment":"Accepted by ICRA2024"},{"id":"http://arxiv.org/abs/2403.10261v1","updated":"2024-03-15T12:48:44Z","published":"2024-03-15T12:48:44Z","title":"Towards Generalizable Deepfake Video Detection with Thumbnail Layout and\n Graph Reasoning","summary":" The deepfake threats to society and cybersecurity have provoked significant\npublic apprehension, driving intensified efforts within the realm of deepfake\nvideo detection. Current video-level methods are mostly based on {3D CNNs}\nresulting in high computational demands, although have achieved good\nperformance. This paper introduces an elegantly simple yet effective strategy\nnamed Thumbnail Layout (TALL), which transforms a video clip into a pre-defined\nlayout to realize the preservation of spatial and temporal dependencies. This\ntransformation process involves sequentially masking frames at the same\npositions within each frame. These frames are then resized into sub-frames and\nreorganized into the predetermined layout, forming thumbnails. TALL is\nmodel-agnostic and has remarkable simplicity, necessitating only minimal code\nmodifications. Furthermore, we introduce a graph reasoning block (GRB) and\nsemantic consistency (SC) loss to strengthen TALL, culminating in TALL++. GRB\nenhances interactions between different semantic regions to capture\nsemantic-level inconsistency clues. The semantic consistency loss imposes\nconsistency constraints on semantic features to improve model generalization\nability. Extensive experiments on intra-dataset, cross-dataset,\ndiffusion-generated image detection, and deepfake generation method recognition\nshow that TALL++ achieves results surpassing or comparable to the\nstate-of-the-art methods, demonstrating the effectiveness of our approaches for\nvarious deepfake detection problems. The code is available at\nhttps://github.com/rainy-xu/TALL4Deepfake.\n","authors":["Yuting Xu","Jian Liang","Lijun Sheng","Xiao-Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.10261v1.pdf","comment":"Accepted by IJCV"},{"id":"http://arxiv.org/abs/2402.00038v2","updated":"2024-03-15T12:47:51Z","published":"2024-01-10T13:06:52Z","title":"Detecting Brain Tumors through Multimodal Neural Networks","summary":" Tumors can manifest in various forms and in different areas of the human\nbody. Brain tumors are specifically hard to diagnose and treat because of the\ncomplexity of the organ in which they develop. Detecting them in time can lower\nthe chances of death and facilitate the therapy process for patients. 
The use\nof Artificial Intelligence (AI) and, more specifically, deep learning, has the\npotential to significantly reduce costs in terms of time and resources for the\ndiscovery and identification of tumors from images obtained through imaging\ntechniques. This research work aims to assess the performance of a multimodal\nmodel for the classification of Magnetic Resonance Imaging (MRI) scans\nprocessed as grayscale images. The results are promising, and in line with\nsimilar works, as the model reaches an accuracy of around 98\\%. We also\nhighlight the need for explainability and transparency to ensure human control\nand safety.\n","authors":["Antonio Curci","Andrea Esposito"],"pdf_url":"https://arxiv.org/pdf/2402.00038v2.pdf","comment":"Presented at NeroPRAI 2024 (co-located with ICPRAM 2024). This\n version did not undergo peer review: refer to the open access version of\n record (see DOI)"},{"id":"http://arxiv.org/abs/2403.10255v1","updated":"2024-03-15T12:45:40Z","published":"2024-03-15T12:45:40Z","title":"Arbitrary-Scale Image Generation and Upsampling using Latent Diffusion\n Model and Implicit Neural Decoder","summary":" Super-resolution (SR) and image generation are important tasks in computer\nvision and are widely adopted in real-world applications. Most existing\nmethods, however, generate images only at fixed-scale magnification and suffer\nfrom over-smoothing and artifacts. Additionally, they do not offer enough\ndiversity of output images nor image consistency at different scales. Most\nrelevant work applied Implicit Neural Representation (INR) to the denoising\ndiffusion model to obtain continuous-resolution yet diverse and high-quality SR\nresults. Since this model operates in the image space, the larger the\nresolution of image is produced, the more memory and inference time is\nrequired, and it also does not maintain scale-specific consistency. We propose\na novel pipeline that can super-resolve an input image or generate from a\nrandom noise a novel image at arbitrary scales. The method consists of a\npretrained auto-encoder, a latent diffusion model, and an implicit neural\ndecoder, and their learning strategies. The proposed method adopts diffusion\nprocesses in a latent space, thus efficient, yet aligned with output image\nspace decoded by MLPs at arbitrary scales. More specifically, our\narbitrary-scale decoder is designed by the symmetric decoder w/o up-scaling\nfrom the pretrained auto-encoder, and Local Implicit Image Function (LIIF) in\nseries. The latent diffusion process is learnt by the denoising and the\nalignment losses jointly. Errors in output images are backpropagated via the\nfixed decoder, improving the quality of output images. In the extensive\nexperiments using multiple public benchmarks on the two tasks i.e. image\nsuper-resolution and novel image generation at arbitrary scales, the proposed\nmethod outperforms relevant methods in metrics of image quality, diversity and\nscale consistency. It is significantly better than the relevant prior-art in\nthe inference speed and memory usage.\n","authors":["Jinseok Kim","Tae-Kyun Kim"],"pdf_url":"https://arxiv.org/pdf/2403.10255v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.10254v1","updated":"2024-03-15T12:44:35Z","published":"2024-03-15T12:44:35Z","title":"Magic Tokens: Select Diverse Tokens for Multi-modal Object\n Re-Identification","summary":" Single-modal object re-identification (ReID) faces great challenges in\nmaintaining robustness within complex visual scenarios. 
In contrast,\nmulti-modal object ReID utilizes complementary information from diverse\nmodalities, showing great potentials for practical applications. However,\nprevious methods may be easily affected by irrelevant backgrounds and usually\nignore the modality gaps. To address above issues, we propose a novel learning\nframework named \\textbf{EDITOR} to select diverse tokens from vision\nTransformers for multi-modal object ReID. We begin with a shared vision\nTransformer to extract tokenized features from different input modalities.\nThen, we introduce a Spatial-Frequency Token Selection (SFTS) module to\nadaptively select object-centric tokens with both spatial and frequency\ninformation. Afterwards, we employ a Hierarchical Masked Aggregation (HMA)\nmodule to facilitate feature interactions within and across modalities.\nFinally, to further reduce the effect of backgrounds, we propose a Background\nConsistency Constraint (BCC) and an Object-Centric Feature Refinement (OCFR).\nThey are formulated as two new loss functions, which improve the feature\ndiscrimination with background suppression. As a result, our framework can\ngenerate more discriminative features for multi-modal object ReID. Extensive\nexperiments on three multi-modal ReID benchmarks verify the effectiveness of\nour methods. The code is available at https://github.com/924973292/EDITOR.\n","authors":["Pingping Zhang","Yuhao Wang","Yang Liu","Zhengzheng Tu","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2403.10254v1.pdf","comment":"This work is accepted by CVPR2024. More modifications may be\n performed"},{"id":"http://arxiv.org/abs/2311.10329v4","updated":"2024-03-15T12:43:41Z","published":"2023-11-17T05:03:53Z","title":"High-fidelity Person-centric Subject-to-Image Synthesis","summary":" Current subject-driven image generation methods encounter significant\nchallenges in person-centric image generation. The reason is that they learn\nthe semantic scene and person generation by fine-tuning a common pre-trained\ndiffusion, which involves an irreconcilable training imbalance. Precisely, to\ngenerate realistic persons, they need to sufficiently tune the pre-trained\nmodel, which inevitably causes the model to forget the rich semantic scene\nprior and makes scene generation over-fit to the training data. Moreover, even\nwith sufficient fine-tuning, these methods can still not generate high-fidelity\npersons since joint learning of the scene and person generation also lead to\nquality compromise. In this paper, we propose Face-diffuser, an effective\ncollaborative generation pipeline to eliminate the above training imbalance and\nquality compromise. Specifically, we first develop two specialized pre-trained\ndiffusion models, i.e., Text-driven Diffusion Model (TDM) and Subject-augmented\nDiffusion Model (SDM), for scene and person generation, respectively. The\nsampling process is divided into three sequential stages, i.e., semantic scene\nconstruction, subject-scene fusion, and subject enhancement. The first and last\nstages are performed by TDM and SDM respectively. The subject-scene fusion\nstage, that is the collaboration achieved through a novel and highly effective\nmechanism, Saliency-adaptive Noise Fusion (SNF). Specifically, it is based on\nour key observation that there exists a robust link between classifier-free\nguidance responses and the saliency of generated images. 
In each time step, SNF\nleverages the unique strengths of each model and allows for the spatial\nblending of predicted noises from both models automatically in a saliency-aware\nmanner. Extensive experiments confirm the impressive effectiveness and\nrobustness of the Face-diffuser.\n","authors":["Yibin Wang","Weizhong Zhang","Jianwei Zheng","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2311.10329v4.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.10252v1","updated":"2024-03-15T12:41:30Z","published":"2024-03-15T12:41:30Z","title":"Region-aware Distribution Contrast: A Novel Approach to Multi-Task\n Partially Supervised Learning","summary":" In this study, we address the intricate challenge of multi-task dense\nprediction, encompassing tasks such as semantic segmentation, depth estimation,\nand surface normal estimation, particularly when dealing with partially\nannotated data (MTPSL). The complexity arises from the absence of complete task\nlabels for each training image. Given the inter-related nature of these\npixel-wise dense tasks, our focus is on mining and capturing cross-task\nrelationships. Existing solutions typically rely on learning global image\nrepresentations for global cross-task image matching, imposing constraints\nthat, unfortunately, sacrifice the finer structures within the images.\nAttempting local matching as a remedy faces hurdles due to the lack of precise\nregion supervision, making local alignment a challenging endeavor. The\nintroduction of Segment Anything Model (SAM) sheds light on addressing local\nalignment challenges by providing free and high-quality solutions for region\ndetection. Leveraging SAM-detected regions, the subsequent challenge lies in\naligning the representations within these regions. Diverging from conventional\nmethods that directly learn a monolithic image representation, our proposal\ninvolves modeling region-wise representations using Gaussian Distributions.\nAligning these distributions between corresponding regions from different tasks\nimparts higher flexibility and capacity to capture intra-region structures,\naccommodating a broader range of tasks. This innovative approach significantly\nenhances our ability to effectively capture cross-task relationships, resulting\nin improved overall performance in partially supervised multi-task dense\nprediction scenarios. Extensive experiments conducted on two widely used\nbenchmarks underscore the superior effectiveness of our proposed method,\nshowcasing state-of-the-art performance even when compared to fully supervised\nmethods.\n","authors":["Meixuan Li","Tianyu Li","Guoqing Wang","Peng Wang","Yang Yang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2403.10252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01435v2","updated":"2024-03-15T12:39:24Z","published":"2023-12-03T15:56:09Z","title":"Automatic Report Generation for Histopathology images using pre-trained\n Vision Transformers and BERT","summary":" Deep learning for histopathology has been successfully used for disease\nclassification, image segmentation and more. However, combining image and text\nmodalities using current state-of-the-art (SOTA) methods has been a challenge\ndue to the high resolution of histopathology images. Automatic report\ngeneration for histopathology images is one such challenge. 
In this work, we\nshow that using an existing pre-trained Vision Transformer (ViT) to encode\n4096x4096 sized patches of the Whole Slide Image (WSI) and a pre-trained\nBidirectional Encoder Representations from Transformers (BERT) model for\nlanguage modeling-based decoder for report generation, we can build a\nperformant and portable report generation mechanism that takes into account the\nwhole high resolution image. Our method allows us to not only generate and\nevaluate captions that describe the image, but also helps us classify the image\ninto tissue types and the gender of the patient as well. Our best performing\nmodel achieves a 89.52% accuracy in Tissue Type classification with a BLEU-4\nscore of 0.12 in our caption generation task.\n","authors":["Saurav Sengupta","Donald E. Brown"],"pdf_url":"https://arxiv.org/pdf/2312.01435v2.pdf","comment":"Accepted at IEEE ISBI 2024. arXiv admin note: substantial text\n overlap with arXiv:2311.06176"},{"id":"http://arxiv.org/abs/2310.11890v2","updated":"2024-03-15T12:31:26Z","published":"2023-10-18T11:19:32Z","title":"IRAD: Implicit Representation-driven Image Resampling against\n Adversarial Attacks","summary":" We introduce a novel approach to counter adversarial attacks, namely, image\nresampling. Image resampling transforms a discrete image into a new one,\nsimulating the process of scene recapturing or rerendering as specified by a\ngeometrical transformation. The underlying rationale behind our idea is that\nimage resampling can alleviate the influence of adversarial perturbations while\npreserving essential semantic information, thereby conferring an inherent\nadvantage in defending against adversarial attacks. To validate this concept,\nwe present a comprehensive study on leveraging image resampling to defend\nagainst adversarial attacks. We have developed basic resampling methods that\nemploy interpolation strategies and coordinate shifting magnitudes. Our\nanalysis reveals that these basic methods can partially mitigate adversarial\nattacks. However, they come with apparent limitations: the accuracy of clean\nimages noticeably decreases, while the improvement in accuracy on adversarial\nexamples is not substantial. We propose implicit representation-driven image\nresampling (IRAD) to overcome these limitations. First, we construct an\nimplicit continuous representation that enables us to represent any input image\nwithin a continuous coordinate space. Second, we introduce SampleNet, which\nautomatically generates pixel-wise shifts for resampling in response to\ndifferent inputs. Furthermore, we can extend our approach to the\nstate-of-the-art diffusion-based method, accelerating it with fewer time steps\nwhile preserving its defense capability. 
Extensive experiments demonstrate that\nour method significantly enhances the adversarial robustness of diverse deep\nmodels against various attacks while maintaining high accuracy on clean images.\n","authors":["Yue Cao","Tianlin Li","Xiaofeng Cao","Ivor Tsang","Yang Liu","Qing Guo"],"pdf_url":"https://arxiv.org/pdf/2310.11890v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10245v1","updated":"2024-03-15T12:28:21Z","published":"2024-03-15T12:28:21Z","title":"CoLeCLIP: Open-Domain Continual Learning via Joint Task Prompt and\n Vocabulary Learning","summary":" This paper explores the problem of continual learning (CL) of vision-language\nmodels (VLMs) in open domains, where the models need to perform continual\nupdating and inference on a streaming of datasets from diverse seen and unseen\ndomains with novel classes. Such a capability is crucial for various\napplications in open environments, e.g., AI assistants, autonomous driving\nsystems, and robotics. Current CL studies mostly focus on closed-set scenarios\nin a single domain with known classes. Large pre-trained VLMs like CLIP have\ndemonstrated superior zero-shot recognition ability, and a number of recent\nstudies leverage this ability to mitigate catastrophic forgetting in CL, but\nthey focus on closed-set CL in a single domain dataset. Open-domain CL of large\nVLMs is significantly more challenging due to 1) large class correlations and\ndomain gaps across the datasets and 2) the forgetting of zero-shot knowledge in\nthe pre-trained VLMs in addition to the knowledge learned from the newly\nadapted datasets. In this work we introduce a novel approach, termed CoLeCLIP,\nthat learns an open-domain CL model based on CLIP. It addresses these\nchallenges by a joint learning of a set of task prompts and a cross-domain\nclass vocabulary. Extensive experiments on 11 domain datasets show that\nCoLeCLIP outperforms state-of-the-art methods for open-domain CL under both\ntask- and class-incremental learning settings.\n","authors":["Yukun Li","Guansong Pang","Wei Suo","Chenchen Jing","Yuling Xi","Lingqiao Liu","Hao Chen","Guoqiang Liang","Peng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.10245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10242v1","updated":"2024-03-15T12:24:36Z","published":"2024-03-15T12:24:36Z","title":"FDGaussian: Fast Gaussian Splatting from Single Image via\n Geometric-aware Diffusion Model","summary":" Reconstructing detailed 3D objects from single-view images remains a\nchallenging task due to the limited information available. In this paper, we\nintroduce FDGaussian, a novel two-stage framework for single-image 3D\nreconstruction. Recent methods typically utilize pre-trained 2D diffusion\nmodels to generate plausible novel views from the input image, yet they\nencounter issues with either multi-view inconsistency or lack of geometric\nfidelity. To overcome these challenges, we propose an orthogonal plane\ndecomposition mechanism to extract 3D geometric features from the 2D input,\nenabling the generation of consistent multi-view images. Moreover, we further\naccelerate the state-of-the-art Gaussian Splatting incorporating epipolar\nattention to fuse images from different viewpoints. 
We demonstrate that\nFDGaussian generates images with high consistency across different views and\nreconstructs high-quality 3D objects, both qualitatively and quantitatively.\nMore examples can be found at our website https://qjfeng.net/FDGaussian/.\n","authors":["Qijun Feng","Zhen Xing","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.10242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16352v3","updated":"2024-03-15T12:19:09Z","published":"2024-01-29T17:56:42Z","title":"Adversarial Training on Purification (AToP): Advancing Both Robustness\n and Generalization","summary":" The deep neural networks are known to be vulnerable to well-designed\nadversarial attacks. The most successful defense technique based on adversarial\ntraining (AT) can achieve optimal robustness against particular attacks but\ncannot generalize well to unseen attacks. Another effective defense technique\nbased on adversarial purification (AP) can enhance generalization but cannot\nachieve optimal robustness. Meanwhile, both methods share one common limitation\non the degraded standard accuracy. To mitigate these issues, we propose a novel\npipeline to acquire the robust purifier model, named Adversarial Training on\nPurification (AToP), which comprises two components: perturbation destruction\nby random transforms (RT) and purifier model fine-tuned (FT) by adversarial\nloss. RT is essential to avoid overlearning to known attacks, resulting in the\nrobustness generalization to unseen attacks, and FT is essential for the\nimprovement of robustness. To evaluate our method in an efficient and scalable\nway, we conduct extensive experiments on CIFAR-10, CIFAR-100, and ImageNette to\ndemonstrate that our method achieves optimal robustness and exhibits\ngeneralization ability against unseen attacks.\n","authors":["Guang Lin","Chao Li","Jianhai Zhang","Toshihisa Tanaka","Qibin Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.16352v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10236v1","updated":"2024-03-15T12:05:44Z","published":"2024-03-15T12:05:44Z","title":"A Fixed-Point Approach to Unified Prompt-Based Counting","summary":" Existing class-agnostic counting models typically rely on a single type of\nprompt, e.g., box annotations. This paper aims to establish a comprehensive\nprompt-based counting framework capable of generating density maps for\nconcerned objects indicated by various prompt types, such as box, point, and\ntext. To achieve this goal, we begin by converting prompts from different\nmodalities into prompt masks without requiring training. These masks are then\nintegrated into a class-agnostic counting methodology for predicting density\nmaps. Furthermore, we introduce a fixed-point inference along with an\nassociated loss function to improve counting accuracy, all without introducing\nnew parameters. The effectiveness of this method is substantiated both\ntheoretically and experimentally. Additionally, a contrastive training scheme\nis implemented to mitigate dataset bias inherent in current class-agnostic\ncounting datasets, a strategy whose effectiveness is confirmed by our ablation\nstudy. Our model excels in prominent class-agnostic datasets and exhibits\nsuperior performance in cross-dataset adaptation tasks.\n","authors":["Wei Lin","Antoni B. 
Chan"],"pdf_url":"https://arxiv.org/pdf/2403.10236v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2311.00500v2","updated":"2024-03-15T12:05:14Z","published":"2023-11-01T13:00:46Z","title":"Intriguing Properties of Data Attribution on Diffusion Models","summary":" Data attribution seeks to trace model outputs back to training data. With the\nrecent development of diffusion models, data attribution has become a desired\nmodule to properly assign valuations for high-quality or copyrighted training\nsamples, ensuring that data contributors are fairly compensated or credited.\nSeveral theoretically motivated methods have been proposed to implement data\nattribution, in an effort to improve the trade-off between computational\nscalability and effectiveness. In this work, we conduct extensive experiments\nand ablation studies on attributing diffusion models, specifically focusing on\nDDPMs trained on CIFAR-10 and CelebA, as well as a Stable Diffusion model\nLoRA-finetuned on ArtBench. Intriguingly, we report counter-intuitive\nobservations that theoretically unjustified design choices for attribution\nempirically outperform previous baselines by a large margin, in terms of both\nlinear datamodeling score and counterfactual evaluation. Our work presents a\nsignificantly more efficient approach for attributing diffusion models, while\nthe unexpected findings suggest that at least in non-convex settings,\nconstructions guided by theoretical assumptions may lead to inferior\nattribution performance. The code is available at\nhttps://github.com/sail-sg/D-TRAK.\n","authors":["Xiaosen Zheng","Tianyu Pang","Chao Du","Jing Jiang","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2311.00500v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.10228v1","updated":"2024-03-15T11:58:18Z","published":"2024-03-15T11:58:18Z","title":"HawkEye: Training Video-Text LLMs for Grounding Text in Videos","summary":" Video-text Large Language Models (video-text LLMs) have shown remarkable\nperformance in answering questions and holding conversations on simple videos.\nHowever, they perform almost the same as random on grounding text queries in\nlong and complicated videos, having little ability to understand and reason\nabout temporal information, which is the most fundamental difference between\nvideos and images. In this paper, we propose HawkEye, one of the first\nvideo-text LLMs that can perform temporal video grounding in a fully\ntext-to-text manner. To collect training data that is applicable for temporal\nvideo grounding, we construct InternVid-G, a large-scale video-text corpus with\nsegment-level captions and negative spans, with which we introduce two new\ntime-aware training objectives to video-text LLMs. We also propose a\ncoarse-grained method of representing segments in videos, which is more robust\nand easier for LLMs to learn and follow than other alternatives. 
Extensive\nexperiments show that HawkEye is better at temporal video grounding and\ncomparable on other video-text tasks with existing video-text LLMs, which\nverifies its superior video-text multi-modal understanding abilities.\n","authors":["Yueqian Wang","Xiaojun Meng","Jianxin Liang","Yuxuan Wang","Qun Liu","Dongyan Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.10228v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04701v2","updated":"2024-03-15T11:43:21Z","published":"2024-03-07T17:48:48Z","title":"ObjectCompose: Evaluating Resilience of Vision-Based Models on\n Object-to-Background Compositional Changes","summary":" Given the large-scale multi-modal training of recent vision-based models and\ntheir generalization capabilities, understanding the extent of their robustness\nis critical for their real-world deployment. In this work, we evaluate the\nresilience of current vision-based models against diverse object-to-background\ncontext variations. The majority of robustness evaluation methods have\nintroduced synthetic datasets to induce changes to object characteristics\n(viewpoints, scale, color) or utilized image transformation techniques\n(adversarial changes, common corruptions) on real images to simulate shifts in\ndistributions. Recent works have explored leveraging large language models and\ndiffusion models to generate changes in the background. However, these methods\neither lack in offering control over the changes to be made or distort the\nobject semantics, making them unsuitable for the task. Our method, on the other\nhand, can induce diverse object-to-background changes while preserving the\noriginal semantics and appearance of the object. To achieve this goal, we\nharness the generative capabilities of text-to-image, image-to-text, and\nimage-to-segment models to automatically generate a broad spectrum of\nobject-to-background changes. We induce both natural and adversarial background\nchanges by either modifying the textual prompts or optimizing the latents and\ntextual embedding of text-to-image models. This allows us to quantify the role\nof background context in understanding the robustness and generalization of\ndeep neural networks. We produce various versions of standard vision datasets\n(ImageNet, COCO), incorporating either diverse and realistic backgrounds into\nthe images or introducing color, texture, and adversarial changes in the\nbackground. We conduct extensive experiment to analyze the robustness of\nvision-based models against object-to-background context variations across\ndiverse tasks.\n","authors":["Hashmat Shadab Malik","Muhammad Huzaifa","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.04701v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10216v1","updated":"2024-03-15T11:36:26Z","published":"2024-03-15T11:36:26Z","title":"Exploring Optical Flow Inclusion into nnU-Net Framework for Surgical\n Instrument Segmentation","summary":" Surgical instrument segmentation in laparoscopy is essential for\ncomputer-assisted surgical systems. Despite the Deep Learning progress in\nrecent years, the dynamic setting of laparoscopic surgery still presents\nchallenges for precise segmentation. The nnU-Net framework excelled in semantic\nsegmentation analyzing single frames without temporal information. The\nframework's ease of use, including its ability to be automatically configured,\nand its low expertise requirements, have made it a popular base framework for\ncomparisons. 
Optical flow (OF) is a tool commonly used in video tasks to\nestimate motion and represent it in a single frame, containing temporal\ninformation. This work seeks to employ OF maps as an additional input to the\nnnU-Net architecture to improve its performance in the surgical instrument\nsegmentation task, taking advantage of the fact that instruments are the main\nmoving objects in the surgical field. With this new input, the temporal\ncomponent would be indirectly added without modifying the architecture. Using\nCholecSeg8k dataset, three different representations of movement were estimated\nand used as new inputs, comparing them with a baseline model. Results showed\nthat the use of OF maps improves the detection of classes with high movement,\neven when these are scarce in the dataset. To further improve performance,\nfuture work may focus on implementing other OF-preserving augmentations.\n","authors":["Marcos Fernández-Rodríguez","Bruno Silva","Sandro Queirós","Helena R. Torres","Bruno Oliveira","Pedro Morais","Lukas R. Buschle","Jorge Correia-Pinto","Estevão Lima","João L. Vilaça"],"pdf_url":"https://arxiv.org/pdf/2403.10216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10211v1","updated":"2024-03-15T11:21:34Z","published":"2024-03-15T11:21:34Z","title":"BlindDiff: Empowering Degradation Modelling in Diffusion Models for\n Blind Image Super-Resolution","summary":" Diffusion models (DM) have achieved remarkable promise in image\nsuper-resolution (SR). However, most of them are tailored to solving non-blind\ninverse problems with fixed known degradation settings, limiting their\nadaptability to real-world applications that involve complex unknown\ndegradations. In this work, we propose BlindDiff, a DM-based blind SR method to\ntackle the blind degradation settings in SISR. BlindDiff seamlessly integrates\nthe MAP-based optimization into DMs, which constructs a joint distribution of\nthe low-resolution (LR) observation, high-resolution (HR) data, and degradation\nkernels for the data and kernel priors, and solves the blind SR problem by\nunfolding MAP approach along with the reverse process. Unlike most DMs,\nBlindDiff firstly presents a modulated conditional transformer (MCFormer) that\nis pre-trained with noise and kernel constraints, further serving as a\nposterior sampler to provide both priors simultaneously. Then, we plug a simple\nyet effective kernel-aware gradient term between adjacent sampling iterations\nthat guides the diffusion model to learn degradation consistency knowledge.\nThis also enables to joint refine the degradation model as well as HR images by\nobserving the previous denoised sample. With the MAP-based reverse diffusion\nprocess, we show that BlindDiff advocates alternate optimization for blur\nkernel estimation and HR image restoration in a mutual reinforcing manner.\nExperiments on both synthetic and real-world datasets show that BlindDiff\nachieves the state-of-the-art performance with significant model complexity\nreduction compared to recent DM-based methods. 
Code will be available at\n\\url{https://github.com/lifengcs/BlindDiff}\n","authors":["Feng Li","Yixuan Wu","Zichao Liang","Runmin Cong","Huihui Bai","Yao Zhao","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.10211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13607v2","updated":"2024-03-15T11:19:30Z","published":"2024-02-21T08:21:12Z","title":"CODIS: Benchmarking Context-Dependent Visual Comprehension for\n Multimodal Large Language Models","summary":" Multimodal large language models (MLLMs) have demonstrated promising results\nin a variety of tasks that combine vision and language. As these models become\nmore integral to research and applications, conducting comprehensive\nevaluations of their capabilities has grown increasingly important. However,\nmost existing benchmarks fail to consider that, in certain situations, images\nneed to be interpreted within a broader context. In this work, we introduce a\nnew benchmark, named as CODIS, designed to assess the ability of models to use\ncontext provided in free-form text to enhance visual comprehension. Our\nfindings indicate that MLLMs consistently fall short of human performance on\nthis benchmark. Further analysis confirms that these models struggle to\neffectively extract and utilize contextual information to improve their\nunderstanding of images. This underscores the pressing need to enhance the\nability of MLLMs to comprehend visuals in a context-dependent manner. View our\nproject website at https://thunlp-mt.github.io/CODIS.\n","authors":["Fuwen Luo","Chi Chen","Zihao Wan","Zhaolu Kang","Qidong Yan","Yingjie Li","Xiaolong Wang","Siyu Wang","Ziyue Wang","Xiaoyue Mi","Peng Li","Ning Ma","Maosong Sun","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.13607v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10206v1","updated":"2024-03-15T11:15:06Z","published":"2024-03-15T11:15:06Z","title":"A Data-Driven Approach for Mitigating Dark Current Noise and Bad Pixels\n in Complementary Metal Oxide Semiconductor Cameras for Space-based Telescopes","summary":" In recent years, there has been a gradual increase in the performance of\nComplementary Metal Oxide Semiconductor (CMOS) cameras. These cameras have\ngained popularity as a viable alternative to charge-coupled device (CCD)\ncameras in a wide range of applications. One particular application is the CMOS\ncamera installed in small space telescopes. However, the limited power and\nspatial resources available on satellites present challenges in maintaining\nideal observation conditions, including temperature and radiation environment.\nConsequently, images captured by CMOS cameras are susceptible to issues such as\ndark current noise and defective pixels. In this paper, we introduce a\ndata-driven framework for mitigating dark current noise and bad pixels for CMOS\ncameras. Our approach involves two key steps: pixel clustering and function\nfitting. During pixel clustering step, we identify and group pixels exhibiting\nsimilar dark current noise properties. Subsequently, in the function fitting\nstep, we formulate functions that capture the relationship between dark current\nand temperature, as dictated by the Arrhenius law. Our framework leverages\nground-based test data to establish distinct temperature-dark current relations\nfor pixels within different clusters. The cluster results could then be\nutilized to estimate the dark current noise level and detect bad pixels from\nreal observational data. 
To assess the effectiveness of our approach, we have\nconducted tests using real observation data obtained from the Yangwang-1\nsatellite, equipped with a near-ultraviolet telescope and an optical telescope.\nThe results show a considerable improvement in the detection efficiency of\nspace-based telescopes.\n","authors":["Peng Jia","Chao Lv","Yushan Li","Yongyang Sun","Shu Niu","Zhuoxiao Wang"],"pdf_url":"https://arxiv.org/pdf/2403.10206v1.pdf","comment":"Accepted by the AJ, comments are welcome. The complete code could be\n downloaded from: DOI: 10.12149/101387"},{"id":"http://arxiv.org/abs/2403.10202v1","updated":"2024-03-15T11:07:38Z","published":"2024-03-15T11:07:38Z","title":"Learning on JPEG-LDPC Compressed Images: Classifying with Syndromes","summary":" In goal-oriented communications, the objective of the receiver is often to\napply a Deep-Learning model, rather than reconstructing the original data. In\nthis context, direct learning over compressed data, without any prior decoding,\nholds promise for enhancing the time-efficient execution of inference models at\nthe receiver. However, conventional entropic-coding methods like Huffman and\nArithmetic break data structure, rendering them unsuitable for learning without\ndecoding. In this paper, we propose an alternative approach in which entropic\ncoding is realized with Low-Density Parity Check (LDPC) codes. We hypothesize\nthat Deep Learning models can more effectively exploit the internal code\nstructure of LDPC codes. At the receiver, we leverage a specific class of\nRecurrent Neural Networks (RNNs), specifically Gated Recurrent Unit (GRU),\ntrained for image classification. Our numerical results indicate that\nclassification based on LDPC-coded bit-planes surpasses Huffman and Arithmetic\ncoding, while necessitating a significantly smaller learning model. This\ndemonstrates the efficiency of classification directly from LDPC-coded data,\neliminating the need for any form of decompression, even partial, prior to\napplying the learning model.\n","authors":["Ahcen Aliouat","Elsa Dupraz"],"pdf_url":"https://arxiv.org/pdf/2403.10202v1.pdf","comment":"5 pages, 3 figures, conference paper, submitted to the EUSIPCO 2024\n Conference"},{"id":"http://arxiv.org/abs/2403.07786v2","updated":"2024-03-15T10:56:22Z","published":"2024-03-12T16:20:27Z","title":"Generative deep learning-enabled ultra-large field-of-view lens-free\n imaging","summary":" Advancements in high-throughput biomedical applications necessitate\nreal-time, large field-of-view (FOV) imaging capabilities. Conventional\nlens-free imaging (LFI) systems, while addressing the limitations of physical\nlenses, have been constrained by dynamic, hard-to-model optical fields,\nresulting in a limited one-shot FOV of approximately 20 $mm^2$. This\nrestriction has been a major bottleneck in applications like live-cell imaging\nand automation of microfluidic systems for biomedical research. Here, we\npresent a deep-learning(DL)-based imaging framework - GenLFI - leveraging\ngenerative artificial intelligence (AI) for holographic image reconstruction.\nWe demonstrate that GenLFI can achieve a real-time FOV over 550 $mm^2$,\nsurpassing the current LFI system by more than 20-fold, and even larger than\nthe world's largest confocal microscope by 1.76 times. 
The resolution is at the\nsub-pixel level of 5.52 $\\mu m$, without the need for a shifting light source.\nThe unsupervised learning-based reconstruction does not require optical field\nmodeling, making imaging dynamic 3D samples (e.g., droplet-based microfluidics\nand 3D cell models) in complex optical fields possible. This GenLFI framework\nunlocks the potential of LFI systems, offering a robust tool to tackle new\nfrontiers in high-throughput biomedical applications such as drug discovery.\n","authors":["Ronald B. Liu","Zhe Liu","Max G. A. Wolf","Krishna P. Purohit","Gregor Fritz","Yi Feng","Carsten G. Hansen","Pierre O. Bagnaninchi","Xavier Casadevall i Solvas","Yunjie Yang"],"pdf_url":"https://arxiv.org/pdf/2403.07786v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08553v3","updated":"2024-03-15T10:53:11Z","published":"2023-05-15T11:30:28Z","title":"Distilling Knowledge for Short-to-Long Term Trajectory Prediction","summary":" Long-term trajectory forecasting is an important and challenging problem in\nthe fields of computer vision, machine learning, and robotics. One fundamental\ndifficulty stands in the evolution of the trajectory that becomes more and more\nuncertain and unpredictable as the time horizon grows, subsequently increasing\nthe complexity of the problem. To overcome this issue, in this paper, we\npropose Di-Long, a new method that employs the distillation of a short-term\ntrajectory model forecaster that guides a student network for long-term\ntrajectory prediction during the training process. Given a total sequence\nlength that comprehends the allowed observation for the student network and the\ncomplementary target sequence, we let the student and the teacher solve two\ndifferent related tasks defined over the same full trajectory: the student\nobserves a short sequence and predicts a long trajectory, whereas the teacher\nobserves a longer sequence and predicts the remaining short target trajectory.\nThe teacher's task is less uncertain, and we use its accurate predictions to\nguide the student through our knowledge distillation framework, reducing\nlong-term future uncertainty. Our experiments show that our proposed Di-Long\nmethod is effective for long-term forecasting and achieves state-of-the-art\nperformance on the Intersection Drone Dataset (inD) and the Stanford Drone\nDataset (SDD).\n","authors":["Sourav Das","Guglielmo Camporese","Shaokang Cheng","Lamberto Ballan"],"pdf_url":"https://arxiv.org/pdf/2305.08553v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10191v1","updated":"2024-03-15T10:52:39Z","published":"2024-03-15T10:52:39Z","title":"Generative Region-Language Pretraining for Open-Ended Object Detection","summary":" In recent research, significant attention has been devoted to the\nopen-vocabulary object detection task, aiming to generalize beyond the limited\nnumber of classes labeled during training and detect objects described by\narbitrary category names at inference. Compared with conventional object\ndetection, open vocabulary object detection largely extends the object\ndetection categories. However, it relies on calculating the similarity between\nimage regions and a set of arbitrary category names with a pretrained\nvision-and-language model. This implies that, despite its open-set nature, the\ntask still needs the predefined object categories during the inference stage.\nThis raises the question: What if we do not have exact knowledge of object\ncategories during inference? 
In this paper, we call such a new setting as\ngenerative open-ended object detection, which is a more general and practical\nproblem. To address it, we formulate object detection as a generative problem\nand propose a simple framework named GenerateU, which can detect dense objects\nand generate their names in a free-form way. Particularly, we employ Deformable\nDETR as a region proposal generator with a language model translating visual\nregions to object names. To assess the free-form object detection task, we\nintroduce an evaluation method designed to quantitatively measure the\nperformance of generative outcomes. Extensive experiments demonstrate strong\nzero-shot detection performance of our GenerateU. For example, on the LVIS\ndataset, our GenerateU achieves comparable results to the open-vocabulary\nobject detection method GLIP, even though the category names are not seen by\nGenerateU during inference. Code is available at: https://\ngithub.com/FoundationVision/GenerateU .\n","authors":["Chuang Lin","Yi Jiang","Lizhen Qu","Zehuan Yuan","Jianfei Cai"],"pdf_url":"https://arxiv.org/pdf/2403.10191v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.10190v1","updated":"2024-03-15T10:52:18Z","published":"2024-03-15T10:52:18Z","title":"Perceptual Quality-based Model Training under Annotator Label\n Uncertainty","summary":" Annotators exhibit disagreement during data labeling, which can be termed as\nannotator label uncertainty. Annotator label uncertainty manifests in\nvariations of labeling quality. Training with a single low-quality annotation\nper sample induces model reliability degradations. In this work, we first\nexamine the effects of annotator label uncertainty in terms of the model's\ngeneralizability and prediction uncertainty. We observe that the model's\ngeneralizability and prediction uncertainty degrade with the presence of\nlow-quality noisy labels. Meanwhile, our evaluation of existing uncertainty\nestimation algorithms indicates their incapability in response to annotator\nlabel uncertainty. To mitigate performance degradation, prior methods show that\ntraining models with labels collected from multiple independent annotators can\nenhance generalizability. However, they require massive annotations. Hence, we\nintroduce a novel perceptual quality-based model training framework to\nobjectively generate multiple labels for model training to enhance reliability,\nwhile avoiding massive annotations. Specifically, we first select a subset of\nsamples with low perceptual quality scores ranked by statistical regularities\nof visual signals. We then assign de-aggregated labels to each sample in this\nsubset to obtain a training set with multiple labels. Our experiments and\nanalysis demonstrate that training with the proposed framework alleviates the\ndegradation of generalizability and prediction uncertainty caused by annotator\nlabel uncertainty.\n","authors":["Chen Zhou","Mohit Prabhushankar","Ghassan AlRegib"],"pdf_url":"https://arxiv.org/pdf/2403.10190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10179v1","updated":"2024-03-15T10:36:24Z","published":"2024-03-15T10:36:24Z","title":"Animate Your Motion: Turning Still Images into Dynamic Videos","summary":" In recent years, diffusion models have made remarkable strides in\ntext-to-video generation, sparking a quest for enhanced control over video\noutputs to more accurately reflect user intentions. 
Traditional efforts\npredominantly focus on employing either semantic cues, like images or depth\nmaps, or motion-based conditions, like moving sketches or object bounding\nboxes. Semantic inputs offer a rich scene context but lack detailed motion\nspecificity; conversely, motion inputs provide precise trajectory information\nbut miss the broader semantic narrative. For the first time, we integrate both\nsemantic and motion cues within a diffusion model for video generation, as\ndemonstrated in Fig 1. To this end, we introduce the Scene and Motion\nConditional Diffusion (SMCD), a novel methodology for managing multimodal\ninputs. It incorporates a recognized motion conditioning module and\ninvestigates various approaches to integrate scene conditions, promoting\nsynergy between different modalities. For model training, we separate the\nconditions for the two modalities, introducing a two-stage training pipeline.\nExperimental results demonstrate that our design significantly enhances video\nquality, motion precision, and semantic coherence.\n","authors":["Mingxiao Li","Bo Wan","Marie-Francine Moens","Tinne Tuytelaars"],"pdf_url":"https://arxiv.org/pdf/2403.10179v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2403.10173v1","updated":"2024-03-15T10:28:31Z","published":"2024-03-15T10:28:31Z","title":"A Hybrid SNN-ANN Network for Event-based Object Detection with Spatial\n and Temporal Attention","summary":" Event cameras offer high temporal resolution and dynamic range with minimal\nmotion blur, making them promising for object detection tasks. While Spiking\nNeural Networks (SNNs) are a natural match for event-based sensory data and\nenable ultra-energy efficient and low latency inference on neuromorphic\nhardware, Artificial Neural Networks (ANNs) tend to display more stable\ntraining dynamics and faster convergence resulting in greater task performance.\nHybrid SNN-ANN approaches are a promising alternative, enabling to leverage the\nstrengths of both SNN and ANN architectures. In this work, we introduce the\nfirst Hybrid Attention-based SNN-ANN backbone for object detection using event\ncameras. We propose a novel Attention-based SNN-ANN bridge module to capture\nsparse spatial and temporal relations from the SNN layer and convert them into\ndense feature maps for the ANN part of the backbone. Experimental results\ndemonstrate that our proposed method surpasses baseline hybrid and SNN-based\napproaches by significant margins, with results comparable to existing\nANN-based methods. Extensive ablation studies confirm the effectiveness of our\nproposed modules and architectural choices. These results pave the way toward a\nhybrid SNN-ANN architecture that achieves ANN like performance at a drastically\nreduced parameter budget. We implemented the SNN blocks on digital neuromorphic\nhardware to investigate latency and power consumption and demonstrate the\nfeasibility of our approach.\n","authors":["Soikat Hasan Ahmed","Jan Finkbeiner","Emre Neftci"],"pdf_url":"https://arxiv.org/pdf/2403.10173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10171v1","updated":"2024-03-15T10:27:17Z","published":"2024-03-15T10:27:17Z","title":"AUTONODE: A Neuro-Graphic Self-Learnable Engine for Cognitive GUI\n Automation","summary":" In recent advancements within the domain of Large Language Models (LLMs),\nthere has been a notable emergence of agents capable of addressing Robotic\nProcess Automation (RPA) challenges through enhanced cognitive capabilities and\nsophisticated reasoning. 
This development heralds a new era of scalability and\nhuman-like adaptability in goal attainment. In this context, we introduce\nAUTONODE (Autonomous User-interface Transformation through Online Neuro-graphic\nOperations and Deep Exploration). AUTONODE employs advanced neuro-graphical\ntechniques to facilitate autonomous navigation and task execution on web\ninterfaces, thereby obviating the necessity for predefined scripts or manual\nintervention. Our engine empowers agents to comprehend and implement complex\nworkflows, adapting to dynamic web environments with unparalleled efficiency.\nOur methodology synergizes cognitive functionalities with robotic automation,\nendowing AUTONODE with the ability to learn from experience. We have integrated\nan exploratory module, DoRA (Discovery and mapping Operation for graph\nRetrieval Agent), which is instrumental in constructing a knowledge graph that\nthe engine utilizes to optimize its actions and achieve objectives with minimal\nsupervision. The versatility and efficacy of AUTONODE are demonstrated through\na series of experiments, highlighting its proficiency in managing a diverse\narray of web-based tasks, ranging from data extraction to transaction\nprocessing.\n","authors":["Arkajit Datta","Tushar Verma","Rajat Chawla"],"pdf_url":"https://arxiv.org/pdf/2403.10171v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10170v1","updated":"2024-03-15T10:26:52Z","published":"2024-03-15T10:26:52Z","title":"Computer User Interface Understanding. A New Dataset and a Learning\n Framework","summary":" User Interface (UI) understanding has been an increasingly popular topic over\nthe last few years. So far, there has been a vast focus solely on web and\nmobile applications. In this paper, we introduce the harder task of computer UI\nunderstanding. With the goal of enabling research in this field, we have\ngenerated a dataset with a set of videos where a user is performing a sequence\nof actions and each image shows the desktop contents at that time point. We\nalso present a framework that is composed of a synthetic sample generation\npipeline to augment the dataset with relevant characteristics, and a\ncontrastive learning method to classify images in the videos. We take advantage\nof the natural conditional, tree-like, relationship of the images'\ncharacteristics to regularize the learning of the representations by dealing\nwith multiple partial tasks simultaneously. Experimental results show that the\nproposed framework outperforms previously proposed hierarchical multi-label\ncontrastive losses in fine-grain UI classification.\n","authors":["Andrés Muñoz","Daniel Borrajo"],"pdf_url":"https://arxiv.org/pdf/2403.10170v1.pdf","comment":"14 pages main paper, 6 pages appendix"},{"id":"http://arxiv.org/abs/2403.10166v1","updated":"2024-03-15T10:18:56Z","published":"2024-03-15T10:18:56Z","title":"SemanticHuman-HD: High-Resolution Semantic Disentangled 3D Human\n Generation","summary":" With the development of neural radiance fields and generative models,\nnumerous methods have been proposed for learning 3D human generation from 2D\nimages. These methods allow control over the pose of the generated 3D human and\nenable rendering from different viewpoints. However, none of these methods\nexplore semantic disentanglement in human image synthesis, i.e., they can not\ndisentangle the generation of different semantic parts, such as the body, tops,\nand bottoms. 
Furthermore, existing methods are limited to synthesize images at\n$512^2$ resolution due to the high computational cost of neural radiance\nfields. To address these limitations, we introduce SemanticHuman-HD, the first\nmethod to achieve semantic disentangled human image synthesis. Notably,\nSemanticHuman-HD is also the first method to achieve 3D-aware image synthesis\nat $1024^2$ resolution, benefiting from our proposed 3D-aware super-resolution\nmodule. By leveraging the depth maps and semantic masks as guidance for the\n3D-aware super-resolution, we significantly reduce the number of sampling\npoints during volume rendering, thereby reducing the computational cost. Our\ncomparative experiments demonstrate the superiority of our method. The\neffectiveness of each proposed component is also verified through ablation\nstudies. Moreover, our method opens up exciting possibilities for various\napplications, including 3D garment generation, semantic-aware image synthesis,\ncontrollable image synthesis, and out-of-domain image synthesis.\n","authors":["Peng Zheng","Tao Liu","Zili Yi","Rui Ma"],"pdf_url":"https://arxiv.org/pdf/2403.10166v1.pdf","comment":"26 pages, 14 figures"},{"id":"http://arxiv.org/abs/2403.10164v1","updated":"2024-03-15T10:18:06Z","published":"2024-03-15T10:18:06Z","title":"CoReEcho: Continuous Representation Learning for 2D+time\n Echocardiography Analysis","summary":" Deep learning (DL) models have been advancing automatic medical image\nanalysis on various modalities, including echocardiography, by offering a\ncomprehensive end-to-end training pipeline. This approach enables DL models to\nregress ejection fraction (EF) directly from 2D+time echocardiograms, resulting\nin superior performance. However, the end-to-end training pipeline makes the\nlearned representations less explainable. The representations may also fail to\ncapture the continuous relation among echocardiogram clips, indicating the\nexistence of spurious correlations, which can negatively affect the\ngeneralization. To mitigate this issue, we propose CoReEcho, a novel training\nframework emphasizing continuous representations tailored for direct EF\nregression. Our extensive experiments demonstrate that CoReEcho: 1) outperforms\nthe current state-of-the-art (SOTA) on the largest echocardiography dataset\n(EchoNet-Dynamic) with MAE of 3.90 & R2 of 82.44, and 2) provides robust and\ngeneralizable features that transfer more effectively in related downstream\ntasks. The code is publicly available at https://github.com/fadamsyah/CoReEcho.\n","authors":["Fadillah Adamsyah Maani","Numan Saeed","Aleksandr Matsun","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.10164v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01753v3","updated":"2024-03-15T10:12:48Z","published":"2024-03-04T06:19:27Z","title":"Training-Free Pretrained Model Merging","summary":" Recently, model merging techniques have surfaced as a solution to combine\nmultiple single-talent models into a single multi-talent model. However,\nprevious endeavors in this field have either necessitated additional training\nor fine-tuning processes, or require that the models possess the same\npre-trained initialization. In this work, we identify a common drawback in\nprior works w.r.t. the inconsistency of unit similarity in the weight space and\nthe activation space. To address this inconsistency, we propose an innovative\nmodel merging framework, coined as merging under dual-space constraints\n(MuDSC). 
Specifically, instead of solely maximizing the objective of a single\nspace, we advocate for the exploration of permutation matrices situated in a\nregion with a unified high similarity in the dual space, achieved through the\nlinear combination of activation and weight similarity matrices. In order to\nenhance usability, we have also incorporated adaptations for group structure,\nincluding Multi-Head Attention and Group Normalization. Comprehensive\nexperimental comparisons demonstrate that MuDSC can significantly boost the\nperformance of merged models with various task combinations and architectures.\nFurthermore, the visualization of the merged model within the multi-task loss\nlandscape reveals that MuDSC enables the merged model to reside in the\noverlapping segment, featuring a unified lower loss for each task. Our code is\npublicly available at https://github.com/zju-vipa/training_free_model_merging.\n","authors":["Zhengqi Xu","Ke Yuan","Huiqiong Wang","Yong Wang","Mingli Song","Jie Song"],"pdf_url":"https://arxiv.org/pdf/2403.01753v3.pdf","comment":"CVPR2024 accepted"},{"id":"http://arxiv.org/abs/2306.15612v2","updated":"2024-03-15T10:04:38Z","published":"2023-06-27T16:53:35Z","title":"Adaptive Multi-Modal Cross-Entropy Loss for Stereo Matching","summary":" Despite the great success of deep learning in stereo matching, recovering\naccurate disparity maps is still challenging. Currently, L1 and cross-entropy\nare the two most widely used losses for stereo network training. Compared with\nthe former, the latter usually performs better thanks to its probability\nmodeling and direct supervision to the cost volume. However, how to accurately\nmodel the stereo ground-truth for cross-entropy loss remains largely\nunder-explored. Existing works simply assume that the ground-truth\ndistributions are uni-modal, which ignores the fact that most of the edge\npixels can be multi-modal. In this paper, a novel adaptive multi-modal\ncross-entropy loss (ADL) is proposed to guide the networks to learn different\ndistribution patterns for each pixel. Moreover, we optimize the disparity\nestimator to further alleviate the bleeding or misalignment artifacts in\ninference. Extensive experimental results show that our method is generic and\ncan help classic stereo networks regain state-of-the-art performance. In\nparticular, GANet with our method ranks $1^{st}$ on both the KITTI 2015 and\n2012 benchmarks among the published methods. Meanwhile, excellent\nsynthetic-to-realistic generalization performance can be achieved by simply\nreplacing the traditional loss with ours.\n","authors":["Peng Xu","Zhiyu Xiang","Chenyu Qiao","Jingyun Fu","Tianyu Pu"],"pdf_url":"https://arxiv.org/pdf/2306.15612v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08506v2","updated":"2024-03-15T10:04:16Z","published":"2024-02-13T15:02:46Z","title":"P-Mamba: Marrying Perona Malik Diffusion with Mamba for Efficient\n Pediatric Echocardiographic Left Ventricular Segmentation","summary":" In pediatric cardiology, the accurate and immediate assessment of cardiac\nfunction through echocardiography is important since it can determine whether\nurgent intervention is required in many emergencies. However, echocardiography\nis characterized by ambiguity and heavy background noise interference, bringing\nmore difficulty to accurate segmentation. Present methods lack efficiency and\nare also prone to mistakenly segmenting some background noise areas as the left\nventricular area due to noise disturbance. 
To relieve the two issues, we\nintroduce P-Mamba for efficient pediatric echocardiographic left ventricular\nsegmentation. Specifically, we turn to the recently proposed vision mamba\nlayers in our vision mamba encoder branch to improve the computing and memory\nefficiency of our model while modeling global dependencies. In the other\nDWT-based PMD encoder branch, we devise DWT-based Perona-Malik Diffusion (PMD)\nBlocks that utilize PMD for noise suppression, while simultaneously preserving\nthe local shape cues of the left ventricle. Leveraging the strengths of both\nthe two encoder branches, P-Mamba achieves superior accuracy and efficiency to\nestablished models, such as vision transformers with quadratic and linear\ncomputational complexity. This innovative approach promises significant\nadvancements in pediatric cardiac imaging and beyond.\n","authors":["Zi Ye","Tianxiang Chen","Fangyijie Wang","Hanwei Zhang","Guanxi Li","Lijun Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.08506v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10156v1","updated":"2024-03-15T09:58:49Z","published":"2024-03-15T09:58:49Z","title":"Cardiac valve event timing in echocardiography using deep learning and\n triplane recordings","summary":" Cardiac valve event timing plays a crucial role when conducting clinical\nmeasurements using echocardiography. However, established automated approaches\nare limited by the need of external electrocardiogram sensors, and manual\nmeasurements often rely on timing from different cardiac cycles. Recent methods\nhave applied deep learning to cardiac timing, but they have mainly been\nrestricted to only detecting two key time points, namely end-diastole (ED) and\nend-systole (ES). In this work, we propose a deep learning approach that\nleverages triplane recordings to enhance detection of valve events in\nechocardiography. Our method demonstrates improved performance detecting six\ndifferent events, including valve events conventionally associated with ED and\nES. Of all events, we achieve an average absolute frame difference (aFD) of\nmaximum 1.4 frames (29 ms) for start of diastasis, down to 0.6 frames (12 ms)\nfor mitral valve opening when performing a ten-fold cross-validation with test\nsplits on triplane data from 240 patients. On an external independent test\nconsisting of apical long-axis data from 180 other patients, the worst\nperforming event detection had an aFD of 1.8 (30 ms). The proposed approach has\nthe potential to significantly impact clinical practice by enabling more\naccurate, rapid and comprehensive event detection, leading to improved clinical\nmeasurements.\n","authors":["Benjamin Strandli Fermann","John Nyberg","Espen W. Remme","Jahn Frederik Grue","Helén Grue","Roger Håland","Lasse Lovstakken","Håvard Dalen","Bjørnar Grenne","Svein Arne Aase","Sten Roar Snar","Andreas Østvik"],"pdf_url":"https://arxiv.org/pdf/2403.10156v1.pdf","comment":"To be published in IEEE Journal of Biomedical and Health Informatics.\n 10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.10153v1","updated":"2024-03-15T09:54:04Z","published":"2024-03-15T09:54:04Z","title":"Improving Medical Multi-modal Contrastive Learning with Expert\n Annotations","summary":" We introduce eCLIP, an enhanced version of the CLIP model that integrates\nexpert annotations in the form of radiologist eye-gaze heatmaps. 
It tackles key\nchallenges in contrastive multi-modal medical imaging analysis, notably data\nscarcity and the \"modality gap\" -- a significant disparity between image and\ntext embeddings that diminishes the quality of representations and hampers\ncross-modal interoperability. eCLIP integrates a heatmap processor and\nleverages mixup augmentation to efficiently utilize the scarce expert\nannotations, thus boosting the model's learning effectiveness. eCLIP is\ndesigned to be generally applicable to any variant of CLIP without requiring\nany modifications of the core architecture. Through detailed evaluations across\nseveral tasks, including zero-shot inference, linear probing, cross-modal\nretrieval, and Retrieval Augmented Generation (RAG) of radiology reports using\na frozen Large Language Model, eCLIP showcases consistent improvements in\nembedding quality. The outcomes reveal enhanced alignment and uniformity,\naffirming eCLIP's capability to harness high-quality annotations for enriched\nmulti-modal analysis in the medical imaging domain.\n","authors":["Yogesh Kumar","Pekka Marttinen"],"pdf_url":"https://arxiv.org/pdf/2403.10153v1.pdf","comment":"Under review at a conference"},{"id":"http://arxiv.org/abs/2403.10147v1","updated":"2024-03-15T09:47:35Z","published":"2024-03-15T09:47:35Z","title":"GGRt: Towards Generalizable 3D Gaussians without Pose Priors in\n Real-Time","summary":" This paper presents GGRt, a novel approach to generalizable novel view\nsynthesis that alleviates the need for real camera poses, complexity in\nprocessing high-resolution images, and lengthy optimization processes, thus\nfacilitating stronger applicability of 3D Gaussian Splatting (3D-GS) in\nreal-world scenarios. Specifically, we design a novel joint learning framework\nthat consists of an Iterative Pose Optimization Network (IPO-Net) and a\nGeneralizable 3D-Gaussians (G-3DG) model. With the joint learning mechanism,\nthe proposed framework can inherently estimate robust relative pose information\nfrom the image observations and thus primarily alleviate the requirement of\nreal camera poses. Moreover, we implement a deferred back-propagation mechanism\nthat enables high-resolution training and inference, overcoming the resolution\nconstraints of previous methods. To enhance the speed and efficiency, we\nfurther introduce a progressive Gaussian cache module that dynamically adjusts\nduring training and inference. As the first pose-free generalizable 3D-GS\nframework, GGRt achieves inference at $\\ge$ 5 FPS and real-time rendering at\n$\\ge$ 100 FPS. Through extensive experimentation, we demonstrate that our\nmethod outperforms existing NeRF-based pose-free techniques in terms of\ninference speed and effectiveness. It can also approach the real pose-based\n3D-GS methods. 
Our contributions provide a significant leap forward for the\nintegration of computer vision and computer graphics into practical\napplications, offering state-of-the-art results on LLFF, KITTI, and Waymo Open\ndatasets and enabling real-time rendering for immersive experiences.\n","authors":["Hao Li","Yuanyuan Gao","Dingwen Zhang","Chenming Wu","Yalun Dai","Chen Zhao","Haocheng Feng","Errui Ding","Jingdong Wang","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2403.10147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10145v1","updated":"2024-03-15T09:44:02Z","published":"2024-03-15T09:44:02Z","title":"RCooper: A Real-world Large-scale Dataset for Roadside Cooperative\n Perception","summary":" The value of roadside perception, which could extend the boundaries of\nautonomous driving and traffic management, has gradually become more prominent\nand acknowledged in recent years. However, existing roadside perception\napproaches only focus on the single-infrastructure sensor system, which cannot\nrealize a comprehensive understanding of a traffic area because of the limited\nsensing range and blind spots. Orienting high-quality roadside perception, we\nneed Roadside Cooperative Perception (RCooper) to achieve practical\narea-coverage roadside perception for restricted traffic areas. Rcooper has its\nown domain-specific challenges, but further exploration is hindered due to the\nlack of datasets. We hence release the first real-world, large-scale RCooper\ndataset to bloom the research on practical roadside cooperative perception,\nincluding detection and tracking. The manually annotated dataset comprises 50k\nimages and 30k point clouds, including two representative traffic scenes (i.e.,\nintersection and corridor). The constructed benchmarks prove the effectiveness\nof roadside cooperation perception and demonstrate the direction of further\nresearch. Codes and dataset can be accessed at:\nhttps://github.com/AIR-THU/DAIR-RCooper.\n","authors":["Ruiyang Hao","Siqi Fan","Yingru Dai","Zhenlin Zhang","Chenxi Li","Yuntian Wang","Haibao Yu","Wenxian Yang","Jirui Yuan","Zaiqing Nie"],"pdf_url":"https://arxiv.org/pdf/2403.10145v1.pdf","comment":"Accepted by CVPR2024. 10 pages with 6 figures"},{"id":"http://arxiv.org/abs/2311.03008v2","updated":"2024-03-15T09:35:10Z","published":"2023-11-06T10:25:26Z","title":"Exploring the Capability of Text-to-Image Diffusion Models with\n Structural Edge Guidance for Multi-Spectral Satellite Image Inpainting","summary":" The letter investigates the utility of text-to-image inpainting models for\nsatellite image data. Two technical challenges of injecting structural guiding\nsignals into the generative process as well as translating the inpainted RGB\npixels to a wider set of MSI bands are addressed by introducing a novel\ninpainting framework based on StableDiffusion and ControlNet as well as a novel\nmethod for RGB-to-MSI translation. 
The results on a wider set of data suggest\nthat the inpainting synthesized via StableDiffusion suffers from undesired\nartifacts and that a simple alternative of self-supervised internal inpainting\nachieves a higher quality of synthesis.\n","authors":["Mikolaj Czerkawski","Christos Tachtatzis"],"pdf_url":"https://arxiv.org/pdf/2311.03008v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07392v2","updated":"2024-03-15T09:30:14Z","published":"2024-03-12T07:59:41Z","title":"ViT-CoMer: Vision Transformer with Convolutional Multi-scale Feature\n Interaction for Dense Predictions","summary":" Although Vision Transformer (ViT) has achieved significant success in\ncomputer vision, it does not perform well in dense prediction tasks due to the\nlack of inner-patch information interaction and the limited diversity of\nfeature scale. Most existing studies are devoted to designing vision-specific\ntransformers to solve the above problems, which introduce additional\npre-training costs. Therefore, we present a plain, pre-training-free, and\nfeature-enhanced ViT backbone with Convolutional Multi-scale feature\ninteraction, named ViT-CoMer, which facilitates bidirectional interaction\nbetween CNN and transformer. Compared to the state-of-the-art, ViT-CoMer has\nthe following advantages: (1) We inject spatial pyramid multi-receptive field\nconvolutional features into the ViT architecture, which effectively alleviates\nthe problems of limited local information interaction and single-feature\nrepresentation in ViT. (2) We propose a simple and efficient CNN-Transformer\nbidirectional fusion interaction module that performs multi-scale fusion across\nhierarchical features, which is beneficial for handling dense prediction tasks.\n(3) We evaluate the performance of ViT-CoMer across various dense prediction\ntasks, different frameworks, and multiple advanced pre-training. Notably, our\nViT-CoMer-L achieves 64.3% AP on COCO val2017 without extra training data, and\n62.1% mIoU on ADE20K val, both of which are comparable to state-of-the-art\nmethods. We hope ViT-CoMer can serve as a new backbone for dense prediction\ntasks to facilitate future research. The code will be released at\nhttps://github.com/Traffic-X/ViT-CoMer.\n","authors":["Chunlong Xia","Xinliang Wang","Feng Lv","Xin Hao","Yifeng Shi"],"pdf_url":"https://arxiv.org/pdf/2403.07392v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2403.10133v1","updated":"2024-03-15T09:26:48Z","published":"2024-03-15T09:26:48Z","title":"E4C: Enhance Editability for Text-Based Image Editing by Harnessing\n Efficient CLIP Guidance","summary":" Diffusion-based image editing is a composite process of preserving the source\nimage content and generating new content or applying modifications. While\ncurrent editing approaches have made improvements under text guidance, most of\nthem have only focused on preserving the information of the input image,\ndisregarding the importance of editability and alignment to the target prompt.\nIn this paper, we prioritize the editability by proposing a zero-shot image\nediting method, named \\textbf{E}nhance \\textbf{E}ditability for text-based\nimage \\textbf{E}diting via \\textbf{E}fficient \\textbf{C}LIP guidance\n(\\textbf{E4C}), which only requires inference-stage optimization to explicitly\nenhance the edibility and text alignment. 
Specifically, we develop a unified\ndual-branch feature-sharing pipeline that enables the preservation of the\nstructure or texture of the source image while allowing the other to be adapted\nbased on the editing task. We further integrate CLIP guidance into our pipeline\nby utilizing our novel random-gateway optimization mechanism to efficiently\nenhance the semantic alignment with the target prompt. Comprehensive\nquantitative and qualitative experiments demonstrate that our method\neffectively resolves the text alignment issues prevalent in existing methods\nwhile maintaining the fidelity to the source image, and performs well across a\nwide range of editing tasks.\n","authors":["Tianrui Huang","Pu Cao","Lu Yang","Chun Liu","Mengjie Hu","Zhiwei Liu","Qing Song"],"pdf_url":"https://arxiv.org/pdf/2403.10133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10127v1","updated":"2024-03-15T09:18:53Z","published":"2024-03-15T09:18:53Z","title":"TransLandSeg: A Transfer Learning Approach for Landslide Semantic\n Segmentation Based on Vision Foundation Model","summary":" Landslides are one of the most destructive natural disasters in the world,\nposing a serious threat to human life and safety. The development of foundation\nmodels has provided a new research paradigm for large-scale landslide\ndetection. The Segment Anything Model (SAM) has garnered widespread attention\nin the field of image segmentation. However, our experiment found that SAM\nperformed poorly in the task of landslide segmentation. We propose\nTransLandSeg, which is a transfer learning approach for landslide semantic\nsegmentation based on a vision foundation model (VFM). TransLandSeg outperforms\ntraditional semantic segmentation models on both the Landslide4Sense dataset\nand the Bijie landslide dataset. Our proposed adaptive transfer learning (ATL)\narchitecture enables the powerful segmentation capability of SAM to be\ntransferred to landslide detection by training only 1.3% of the number of the\nparameters of SAM, which greatly improves the training efficiency of the model.\nFinally, we also conducted ablation experiments on models with different ATL\nstructures and concluded that the deployment location and residual connection of\nATL play an important role in TransLandSeg accuracy improvement.\n","authors":["Changhong Hou","Junchuan Yu","Daqing Ge","Liu Yang","Laidian Xi","Yunxuan Pang","Yi Wen"],"pdf_url":"https://arxiv.org/pdf/2403.10127v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10124v1","updated":"2024-03-15T09:15:57Z","published":"2024-03-15T09:15:57Z","title":"Depth-induced Saliency Comparison Network for Diagnosis of Alzheimer's\n Disease via Jointly Analysis of Visual Stimuli and Eye Movements","summary":" Early diagnosis of Alzheimer's Disease (AD) is very important for following\nmedical treatments, and eye movements under special visual stimuli may serve as\na potential non-invasive biomarker for detecting cognitive abnormalities of AD\npatients. In this paper, we propose a Depth-induced saliency comparison\nnetwork (DISCN) for eye movement analysis, which may be used for diagnosing\nAlzheimer's disease. In DISCN, a salient attention module fuses normal eye\nmovements with RGB and depth maps of visual stimuli using hierarchical salient\nattention (SAA) to evaluate comprehensive saliency maps, which contain\ninformation from both visual stimuli and normal eye movement behaviors. 
In\naddition, we introduce a serial attention module (SEA) to emphasize the most\nabnormal eye movement behaviors to reduce personal bias for a more robust\nresult. According to our experiments, the DISCN achieves consistent validity in\nclassifying the eye movements between the AD patients and normal controls.\n","authors":["Yu Liu","Wenlin Zhang","Shaochu Wang","Fangyu Zuo","Peiguang Jing","Yong Ji"],"pdf_url":"https://arxiv.org/pdf/2403.10124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10119v1","updated":"2024-03-15T09:08:27Z","published":"2024-03-15T09:08:27Z","title":"URS-NeRF: Unordered Rolling Shutter Bundle Adjustment for Neural\n Radiance Fields","summary":" We propose a novel rolling shutter bundle adjustment method for neural\nradiance fields (NeRF), which utilizes the unordered rolling shutter (RS)\nimages to obtain the implicit 3D representation. Existing NeRF methods suffer\nfrom low-quality images and inaccurate initial camera poses due to the RS\neffect in the image, whereas the previous method that incorporates the RS into\nNeRF requires strict sequential data input, limiting its widespread\napplicability. In contrast, our method recovers the physical formation of RS\nimages by estimating camera poses and velocities, thereby removing the input\nconstraints on sequential data. Moreover, we adopt a coarse-to-fine training\nstrategy, in which the RS epipolar constraints of the pairwise frames in the\nscene graph are used to detect the camera poses that fall into local minima.\nThe poses detected as outliers are corrected by the interpolation method with\nneighboring poses. The experimental results validate the effectiveness of our\nmethod over state-of-the-art works and demonstrate that the reconstruction of\n3D representations is not constrained by the requirement of video sequence\ninput.\n","authors":["Bo Xu","Ziao Liu","Mengqi Guo","Jiancheng Li","Gim Hee Li"],"pdf_url":"https://arxiv.org/pdf/2403.10119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07593v2","updated":"2024-03-15T08:58:05Z","published":"2023-11-10T05:24:07Z","title":"Follow-Up Differential Descriptions: Language Models Resolve Ambiguities\n for Image Classification","summary":" A promising approach for improving the performance of vision-language models\nlike CLIP for image classification is to extend the class descriptions (i.e.,\nprompts) with related attributes, e.g., using brown sparrow instead of sparrow.\nHowever, current zero-shot methods select a subset of attributes regardless of\ncommonalities between the target classes, potentially providing no useful\ninformation that would have helped to distinguish between them. For instance,\nthey may use color instead of bill shape to distinguish between sparrows and\nwrens, which are both brown. We propose Follow-up Differential Descriptions\n(FuDD), a zero-shot approach that tailors the class descriptions to each\ndataset and leads to additional attributes that better differentiate the target\nclasses. FuDD first identifies the ambiguous classes for each image, and then\nuses a Large Language Model (LLM) to generate new class descriptions that\ndifferentiate between them. The new class descriptions resolve the initial\nambiguity and help predict the correct label. In our experiments, FuDD\nconsistently outperforms generic description ensembles and naive LLM-generated\ndescriptions on 12 datasets. 
We show that differential descriptions are an\neffective tool to resolve class ambiguities, which otherwise significantly\ndegrade the performance. We also show that high quality natural language class\ndescriptions produced by FuDD result in comparable performance to few-shot\nadaptation methods.\n","authors":["Reza Esfandiarpoor","Stephen H. Bach"],"pdf_url":"https://arxiv.org/pdf/2311.07593v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.10107v1","updated":"2024-03-15T08:51:15Z","published":"2024-03-15T08:51:15Z","title":"Enhancing Human-Centered Dynamic Scene Understanding via Multiple LLMs\n Collaborated Reasoning","summary":" Human-centered dynamic scene understanding plays a pivotal role in enhancing\nthe capability of robotic and autonomous systems, in which Video-based\nHuman-Object Interaction (V-HOI) detection is a crucial task in semantic scene\nunderstanding, aimed at comprehensively understanding HOI relationships within\na video to benefit the behavioral decisions of mobile robots and autonomous\ndriving systems. Although previous V-HOI detection models have made significant\nstrides in accurate detection on specific datasets, they still lack the general\nreasoning ability like human beings to effectively induce HOI relationships. In\nthis study, we propose V-HOI Multi-LLMs Collaborated Reasoning (V-HOI MLCR), a\nnovel framework consisting of a series of plug-and-play modules that could\nfacilitate the performance of current V-HOI detection models by leveraging the\nstrong reasoning ability of different off-the-shelf pre-trained large language\nmodels (LLMs). We design a two-stage collaboration system of different LLMs for\nthe V-HOI task. Specifically, in the first stage, we design a Cross-Agents\nReasoning scheme to leverage the LLM conduct reasoning from different aspects.\nIn the second stage, we perform Multi-LLMs Debate to get the final reasoning\nanswer based on the different knowledge in different LLMs. Additionally, we\ndevise an auxiliary training strategy that utilizes CLIP, a large\nvision-language model to enhance the base V-HOI models' discriminative ability\nto better cooperate with LLMs. We validate the superiority of our design by\ndemonstrating its effectiveness in improving the prediction accuracy of the\nbase V-HOI model via reasoning from multiple perspectives.\n","authors":["Hang Zhang","Wenxiao Zhang","Haoxuan Qu","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2403.10107v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2403.10104v1","updated":"2024-03-15T08:49:33Z","published":"2024-03-15T08:49:33Z","title":"CSDNet: Detect Salient Object in Depth-Thermal via A Lightweight Cross\n Shallow and Deep Perception Network","summary":" While we enjoy the richness and informativeness of multimodal data, it also\nintroduces interference and redundancy of information. To achieve optimal\ndomain interpretation with limited resources, we propose CSDNet, a lightweight\n\\textbf{C}ross \\textbf{S}hallow and \\textbf{D}eep Perception \\textbf{Net}work\ndesigned to integrate two modalities with less coherence, thereby discarding\nredundant information or even modality. We implement our CSDNet for Salient\nObject Detection (SOD) task in robotic perception. 
The proposed method\ncapitalises on spatial information prescreening and implicit coherence\nnavigation across shallow and deep layers of the depth-thermal (D-T) modality,\nprioritising integration over fusion to maximise the scene interpretation. To\nfurther refine the descriptive capabilities of the encoder for the less-known\nD-T modalities, we also propose SAMAEP to guide an effective feature mapping to\nthe generalised feature space. Our approach is tested on the VDT-2048 dataset;\nleveraging the D-T modality, it outperforms SOTA methods using RGB-T or\nRGB-D modalities for the first time, and achieves performance comparable to the\nRGB-D-T triple-modality benchmark method while running 5.97 times faster and\nrequiring only 0.0036 times the FLOPs. This demonstrates that the proposed CSDNet\neffectively integrates the information from the D-T modality. The code will be\nreleased upon acceptance.\n","authors":["Xiaotong Yu","Ruihan Xie","Zhihe Zhao","Chang-Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2403.10104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10103v1","updated":"2024-03-15T08:48:37Z","published":"2024-03-15T08:48:37Z","title":"DyBluRF: Dynamic Neural Radiance Fields from Blurry Monocular Video","summary":" Recent advancements in dynamic neural radiance field methods have yielded\nremarkable outcomes. However, these approaches rely on the assumption of sharp\ninput images. When faced with motion blur, existing dynamic NeRF methods often\nstruggle to generate high-quality novel views. In this paper, we propose\nDyBluRF, a dynamic radiance field approach that synthesizes sharp novel views\nfrom a monocular video affected by motion blur. To account for motion blur in\ninput images, we simultaneously capture the camera trajectory and object\nDiscrete Cosine Transform (DCT) trajectories within the scene. Additionally, we\nemploy a global cross-time rendering approach to ensure consistent temporal\ncoherence across the entire scene. We curate a dataset comprising diverse\ndynamic scenes that are specifically tailored for our task. Experimental\nresults on our dataset demonstrate that our method outperforms existing\napproaches in generating sharp novel views from motion-blurred inputs while\nmaintaining spatial-temporal consistency of the scene.\n","authors":["Huiqiang Sun","Xingyi Li","Liao Shen","Xinyi Ye","Ke Xian","Zhiguo Cao"],"pdf_url":"https://arxiv.org/pdf/2403.10103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.10922v2","updated":"2024-03-15T08:48:04Z","published":"2022-08-23T12:49:01Z","title":"StyleTalker: One-shot Style-based Audio-driven Talking Head Video\n Generation","summary":" We propose StyleTalker, a novel audio-driven talking head generation model\nthat can synthesize a video of a talking person from a single reference image\nwith accurately audio-synced lip shapes, realistic head poses, and eye blinks.\nSpecifically, by leveraging a pretrained image generator and an image encoder,\nwe estimate the latent codes of the talking head video that faithfully reflects\nthe given audio. This is made possible with several newly devised components:\n1) A contrastive lip-sync discriminator for accurate lip synchronization, 2) A\nconditional sequential variational autoencoder that learns the latent motion\nspace disentangled from the lip movements, such that we can independently\nmanipulate the motions and lip movements while preserving the identity. 
3) An\nauto-regressive prior augmented with normalizing flow to learn a complex\naudio-to-motion multi-modal latent space. Equipped with these components,\nStyleTalker can generate talking head videos not only in a motion-controllable\nway when another motion source video is given but also in a completely\naudio-driven manner by inferring realistic motions from the input audio.\nThrough extensive experiments and user studies, we show that our model is able\nto synthesize talking head videos with impressive perceptual quality which are\naccurately lip-synced with the input audios, largely outperforming\nstate-of-the-art baselines.\n","authors":["Dongchan Min","Minyoung Song","Eunji Ko","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2208.10922v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10099v1","updated":"2024-03-15T08:44:56Z","published":"2024-03-15T08:44:56Z","title":"KP-RED: Exploiting Semantic Keypoints for Joint 3D Shape Retrieval and\n Deformation","summary":" In this paper, we present KP-RED, a unified KeyPoint-driven REtrieval and\nDeformation framework that takes object scans as input and jointly retrieves\nand deforms the most geometrically similar CAD models from a pre-processed\ndatabase to tightly match the target. Unlike existing dense matching based\nmethods that typically struggle with noisy partial scans, we propose to\nleverage category-consistent sparse keypoints to naturally handle both full and\npartial object scans. Specifically, we first employ a lightweight retrieval\nmodule to establish a keypoint-based embedding space, measuring the similarity\namong objects by dynamically aggregating deformation-aware local-global\nfeatures around extracted keypoints. Objects that are close in the embedding\nspace are considered similar in geometry. Then we introduce the neural\ncage-based deformation module that estimates the influence vector of each\nkeypoint upon cage vertices inside its local support region to control the\ndeformation of the retrieved shape. Extensive experiments on the synthetic\ndataset PartNet and the real-world dataset Scan2CAD demonstrate that KP-RED\nsurpasses existing state-of-the-art approaches by a large margin. Codes and\ntrained models will be released in https://github.com/lolrudy/KP-RED.\n","authors":["Ruida Zhang","Chenyangguang Zhang","Yan Di","Fabian Manhardt","Xingyu Liu","Federico Tombari","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2403.10099v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2402.02498v2","updated":"2024-03-15T08:44:40Z","published":"2024-02-04T14:12:51Z","title":"Fully Differentiable Correlation-driven 2D/3D Registration for X-ray to\n CT Image Fusion","summary":" Image-based rigid 2D/3D registration is a critical technique for fluoroscopic\nguided surgical interventions. In recent years, some learning-based fully\ndifferentiable methods have produced beneficial outcomes while the process of\nfeature extraction and gradient flow transmission still lack controllability\nand interpretability. To alleviate these problems, in this work, we propose a\nnovel fully differentiable correlation-driven network using a dual-branch\nCNN-transformer encoder which enables the network to extract and separate\nlow-frequency global features from high-frequency local features. A\ncorrelation-driven loss is further proposed for low-frequency feature and\nhigh-frequency feature decomposition based on embedded information. 
Besides, a\ntraining strategy that learns to approximate a convex-shape similarity function\nis applied in our work. We test our approach on an in-house dataset and show that\nit outperforms both existing fully differentiable learning-based registration\napproaches and the conventional optimization-based baseline.\n","authors":["Minheng Chen","Zhirun Zhang","Shuheng Gu","Zhangyang Ge","Youyong Kong"],"pdf_url":"https://arxiv.org/pdf/2402.02498v2.pdf","comment":"ISBI 2024"},{"id":"http://arxiv.org/abs/2403.10098v1","updated":"2024-03-15T08:44:15Z","published":"2024-03-15T08:44:15Z","title":"DiffMAC: Diffusion Manifold Hallucination Correction for High\n Generalization Blind Face Restoration","summary":" Blind face restoration (BFR) is a highly challenging problem due to the\nuncertainty of degradation patterns. Current methods have low generalization\nacross photorealistic and heterogeneous domains. In this paper, we propose a\nDiffusion-Information-Diffusion (DID) framework to tackle diffusion manifold\nhallucination correction (DiffMAC), which achieves high-generalization face\nrestoration in diverse degraded scenes and heterogeneous domains. Specifically,\nthe first diffusion stage aligns the restored face with spatial feature\nembedding of the low-quality face based on AdaIN, which synthesizes\ndegradation-removal results but with uncontrollable artifacts for some hard\ncases. Based on Stage I, Stage II considers information compression using\nmanifold information bottleneck (MIB) and finetunes the first diffusion model\nto improve facial fidelity. DiffMAC effectively fights against blind\ndegradation patterns and synthesizes high-quality faces with attribute and\nidentity consistencies. Experimental results demonstrate the superiority of\nDiffMAC over state-of-the-art methods, with a high degree of generalization in\nreal-world and heterogeneous settings. The source code and models will be\npublic.\n","authors":["Nan Gao","Jia Li","Huaibo Huang","Zhi Zeng","Ke Shang","Shuwu Zhang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2403.10098v1.pdf","comment":"15 pages, 12 figures"},{"id":"http://arxiv.org/abs/2311.07604v2","updated":"2024-03-15T08:42:39Z","published":"2023-11-11T05:40:54Z","title":"Finetuning Text-to-Image Diffusion Models for Fairness","summary":" The rapid adoption of text-to-image diffusion models in society underscores\nan urgent need to address their biases. Without interventions, these biases\ncould propagate a skewed worldview and restrict opportunities for minority\ngroups. In this work, we frame fairness as a distributional alignment problem.\nOur solution consists of two main technical contributions: (1) a distributional\nalignment loss that steers specific characteristics of the generated images\ntowards a user-defined target distribution, and (2) adjusted direct finetuning\nof diffusion model's sampling process (adjusted DFT), which leverages an\nadjusted gradient to directly optimize losses defined on the generated images.\nEmpirically, our method markedly reduces gender, racial, and their\nintersectional biases for occupational prompts. Gender bias is significantly\nreduced even when finetuning just five soft tokens. Crucially, our method\nsupports diverse perspectives of fairness beyond absolute equality, which is\ndemonstrated by controlling age to a $75\\%$ young and $25\\%$ old distribution\nwhile simultaneously debiasing gender and race. 
Finally, our method is\nscalable: it can debias multiple concepts at once by simply including these\nprompts in the finetuning data. We share code and various fair diffusion model\nadaptors at https://sail-sg.github.io/finetune-fair-diffusion/.\n","authors":["Xudong Shen","Chao Du","Tianyu Pang","Min Lin","Yongkang Wong","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2311.07604v2.pdf","comment":"ICLR 2024 oral presentation"},{"id":"http://arxiv.org/abs/2311.18496v2","updated":"2024-03-15T08:38:04Z","published":"2023-11-30T12:17:16Z","title":"Accurate Segmentation of Optic Disc And Cup from Multiple Pseudo-labels\n by Noise-aware Learning","summary":" Optic disc and cup segmentation plays a crucial role in automating the\nscreening and diagnosis of optic glaucoma. While data-driven convolutional\nneural networks (CNNs) show promise in this area, the inherent ambiguity of\nsegmenting objects and background boundaries in the task of optic disc and cup\nsegmentation leads to noisy annotations that impact model performance. To\naddress this, we propose an innovative label-denoising method of Multiple\nPseudo-labels Noise-aware Network (MPNN) for accurate optic disc and cup\nsegmentation. Specifically, the Multiple Pseudo-labels Generation and Guided\nDenoising (MPGGD) module generates pseudo-labels by multiple different\ninitialization networks trained on true labels, and the pixel-level consensus\ninformation extracted from these pseudo-labels guides to differentiate clean\npixels from noisy pixels. The training framework of the MPNN is constructed by\na teacher-student architecture to learn segmentation from clean pixels and\nnoisy pixels. Particularly, such a framework adeptly leverages (i) reliable and\nfundamental insight from clean pixels and (ii) the supplementary knowledge\nwithin noisy pixels via multiple perturbation-based unsupervised consistency.\nCompared to other label-denoising methods, comprehensive experimental results\non the RIGA dataset demonstrate our method's excellent performance. The code is\navailable at https://github.com/wwwtttjjj/MPNN\n","authors":["Tengjin Weng","Yang Shen","Zhidong Zhao","Zhiming Cheng","Shuai Wang"],"pdf_url":"https://arxiv.org/pdf/2311.18496v2.pdf","comment":"CSCWD 2024"},{"id":"http://arxiv.org/abs/2311.16096v2","updated":"2024-03-15T08:32:46Z","published":"2023-11-27T18:59:04Z","title":"Animatable Gaussians: Learning Pose-dependent Gaussian Maps for\n High-fidelity Human Avatar Modeling","summary":" Modeling animatable human avatars from RGB videos is a long-standing and\nchallenging problem. Recent works usually adopt MLP-based neural radiance\nfields (NeRF) to represent 3D humans, but it remains difficult for pure MLPs to\nregress pose-dependent garment details. To this end, we introduce Animatable\nGaussians, a new avatar representation that leverages powerful 2D CNNs and 3D\nGaussian splatting to create high-fidelity avatars. To associate 3D Gaussians\nwith the animatable avatar, we learn a parametric template from the input\nvideos, and then parameterize the template on two front \\& back canonical\nGaussian maps where each pixel represents a 3D Gaussian. The learned template\nis adaptive to the wearing garments for modeling looser clothes like dresses.\nSuch template-guided 2D parameterization enables us to employ a powerful\nStyleGAN-based CNN to learn the pose-dependent Gaussian maps for modeling\ndetailed dynamic appearances. Furthermore, we introduce a pose projection\nstrategy for better generalization given novel poses. 
Overall, our method can\ncreate lifelike avatars with dynamic, realistic and generalized appearances.\nExperiments show that our method outperforms other state-of-the-art approaches.\nCode: https://github.com/lizhe00/AnimatableGaussians\n","authors":["Zhe Li","Zerong Zheng","Lizhen Wang","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16096v2.pdf","comment":"Accepted by CVPR 2024, Projectpage:\n https://animatable-gaussians.github.io/, Code:\n https://github.com/lizhe00/AnimatableGaussians"},{"id":"http://arxiv.org/abs/2403.10097v1","updated":"2024-03-15T08:26:59Z","published":"2024-03-15T08:26:59Z","title":"Adaptive Random Feature Regularization on Fine-tuning Deep Neural\n Networks","summary":" While fine-tuning is a de facto standard method for training deep neural\nnetworks, it still suffers from overfitting when using small target datasets.\nPrevious methods improve fine-tuning performance by maintaining knowledge of\nthe source datasets or introducing regularization terms such as contrastive\nloss. However, these methods require auxiliary source information (e.g., source\nlabels or datasets) or heavy additional computations. In this paper, we propose\na simple method called adaptive random feature regularization (AdaRand).\nAdaRand helps the feature extractors of training models to adaptively change\nthe distribution of feature vectors for downstream classification tasks without\nauxiliary source information and with reasonable computation costs. To this\nend, AdaRand minimizes the gap between feature vectors and random reference\nvectors that are sampled from class conditional Gaussian distributions.\nFurthermore, AdaRand dynamically updates the conditional distribution to follow\nthe currently updated feature extractors and balance the distance between\nclasses in feature spaces. Our experiments show that AdaRand outperforms the\nother fine-tuning regularization, which requires auxiliary source information\nand heavy computation costs.\n","authors":["Shin'ya Yamaguchi","Sekitoshi Kanai","Kazuki Adachi","Daiki Chijiwa"],"pdf_url":"https://arxiv.org/pdf/2403.10097v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2209.07105v3","updated":"2024-03-15T08:21:04Z","published":"2022-09-15T07:35:12Z","title":"Bridging Implicit and Explicit Geometric Transformation for Single-Image\n View Synthesis","summary":" Creating novel views from a single image has achieved tremendous strides with\nadvanced autoregressive models, as unseen regions have to be inferred from the\nvisible scene contents. Although recent methods generate high-quality novel\nviews, synthesizing with only one explicit or implicit 3D geometry has a\ntrade-off between two objectives that we call the \"seesaw\" problem: 1)\npreserving reprojected contents and 2) completing realistic out-of-view\nregions. Also, autoregressive models require a considerable computational cost.\nIn this paper, we propose a single-image view synthesis framework for\nmitigating the seesaw problem while utilizing an efficient non-autoregressive\nmodel. Motivated by the characteristics that explicit methods well preserve\nreprojected pixels and implicit methods complete realistic out-of-view regions,\nwe introduce a loss function to complement two renderers. Our loss function\npromotes that explicit features improve the reprojected area of implicit\nfeatures and implicit features improve the out-of-view area of explicit\nfeatures. 
With the proposed architecture and loss function, we can alleviate\nthe seesaw problem, outperforming autoregressive-based state-of-the-art methods\nand generating an image $\\approx$100 times faster. We validate the efficiency\nand effectiveness of our method with experiments on RealEstate10K and ACID\ndatasets.\n","authors":["Byeongjun Park","Hyojun Go","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2209.07105v3.pdf","comment":"TPAMI 2024"},{"id":"http://arxiv.org/abs/2403.10094v1","updated":"2024-03-15T08:19:57Z","published":"2024-03-15T08:19:57Z","title":"RangeLDM: Fast Realistic LiDAR Point Cloud Generation","summary":" Autonomous driving demands high-quality LiDAR data, yet the cost of physical\nLiDAR sensors presents a significant scaling-up challenge. While recent efforts\nhave explored deep generative models to address this issue, they often consume\nsubstantial computational resources with slow generation speeds while suffering\nfrom a lack of realism. To address these limitations, we introduce RangeLDM, a\nnovel approach for rapidly generating high-quality range-view LiDAR point\nclouds via latent diffusion models. We achieve this by correcting range-view\ndata distribution for accurate projection from point clouds to range images via\nHough voting, which has a critical impact on generative learning. We then\ncompress the range images into a latent space with a variational autoencoder,\nand leverage a diffusion model to enhance expressivity. Additionally, we\ninstruct the model to preserve 3D structural fidelity by devising a\nrange-guided discriminator. Experimental results on KITTI-360 and nuScenes\ndatasets demonstrate both the robust expressiveness and fast speed of our LiDAR\npoint cloud generation.\n","authors":["Qianjiang Hu","Zhimin Zhang","Wei Hu"],"pdf_url":"https://arxiv.org/pdf/2403.10094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10091v1","updated":"2024-03-15T08:08:24Z","published":"2024-03-15T08:08:24Z","title":"PQDynamicISP: Dynamically Controlled Image Signal Processor for Any\n Image Sensors Pursuing Perceptual Quality","summary":" Full DNN-based image signal processors (ISPs) have been actively studied and\nhave achieved superior image quality compared to conventional ISPs. In contrast\nto this trend, we propose a lightweight ISP that consists of simple\nconventional ISP functions but achieves high image quality by increasing\nexpressiveness. Specifically, instead of tuning the parameters of the ISP, we\npropose to control them dynamically for each environment and even locally. As a\nresult, state-of-the-art accuracy is achieved on various datasets, including\nother tasks like tone mapping and image enhancement, even though ours is\nlighter than DNN-based ISPs. Additionally, our method can process different\nimage sensors with a single ISP through dynamic control, whereas conventional\nmethods require training for each sensor.\n","authors":["Masakazu Yoshimura","Junji Otsuka","Takeshi Ohashi"],"pdf_url":"https://arxiv.org/pdf/2403.10091v1.pdf","comment":"Keywords: image signal processor, ISP, image enhancement, tone\n mapping"},{"id":"http://arxiv.org/abs/2403.10089v1","updated":"2024-03-15T08:05:16Z","published":"2024-03-15T08:05:16Z","title":"Approximation and bounding techniques for the Fisher-Rao distances","summary":" The Fisher-Rao distance between two probability distributions of a\nstatistical model is defined as the Riemannian geodesic distance induced by the\nFisher information metric. 
In order to calculate the Fisher-Rao distance in\nclosed-form, we need (1) to elicit a formula for the Fisher-Rao geodesics, and\n(2) to integrate the Fisher length element along those geodesics. We consider\nseveral numerically robust approximation and bounding techniques for the\nFisher-Rao distances: First, we report generic upper bounds on Fisher-Rao\ndistances based on closed-form 1D Fisher-Rao distances of submodels. Second, we\ndescribe several generic approximation schemes depending on whether the\nFisher-Rao geodesics or pregeodesics are available in closed-form or not. In\nparticular, we obtain a generic method to guarantee an arbitrarily small\nadditive error on the approximation provided that Fisher-Rao pregeodesics and\ntight lower and upper bounds are available. Third, we consider the case of\nFisher metrics being Hessian metrics, and report generic tight upper bounds on\nthe Fisher-Rao distances using techniques of information geometry.\nUniparametric and biparametric statistical models always have Fisher Hessian\nmetrics, and in general a simple test allows to check whether the Fisher\ninformation matrix yields a Hessian metric or not. Fourth, we consider\nelliptical distribution families and show how to apply the above techniques to\nthese models. We also propose two new distances based either on the Fisher-Rao\nlengths of curves serving as proxies of Fisher-Rao geodesics, or based on the\nBirkhoff/Hilbert projective cone distance. Last, we consider an alternative\ngroup-theoretic approach for statistical transformation models based on the\nnotion of maximal invariant which yields insights on the structures of the\nFisher-Rao distance formula which may be used fruitfully in applications.\n","authors":["Frank Nielsen"],"pdf_url":"https://arxiv.org/pdf/2403.10089v1.pdf","comment":"38 pages"},{"id":"http://arxiv.org/abs/2403.10087v1","updated":"2024-03-15T08:01:44Z","published":"2024-03-15T08:01:44Z","title":"Monkeypox disease recognition model based on improved SE-InceptionV3","summary":" In the wake of the global spread of monkeypox, accurate disease recognition\nhas become crucial. This study introduces an improved SE-InceptionV3 model,\nembedding the SENet module and incorporating L2 regularization into the\nInceptionV3 framework to enhance monkeypox disease detection. Utilizing the\nKaggle monkeypox dataset, which includes images of monkeypox and similar skin\nconditions, our model demonstrates a noteworthy accuracy of 96.71% on the test\nset, outperforming conventional methods and deep learning models. The SENet\nmodules channel attention mechanism significantly elevates feature\nrepresentation, while L2 regularization ensures robust generalization.\nExtensive experiments validate the models superiority in precision, recall, and\nF1 score, highlighting its effectiveness in differentiating monkeypox lesions\nin diverse and complex cases. The study not only provides insights into the\napplication of advanced CNN architectures in medical diagnostics but also opens\navenues for further research in model optimization and hyperparameter tuning\nfor enhanced disease recognition. 
https://github.com/jzc777/SE-inceptionV3-L2\n","authors":["Junzhuo Chen","Zonghan Lu","Shitong Kang"],"pdf_url":"https://arxiv.org/pdf/2403.10087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10085v1","updated":"2024-03-15T08:00:29Z","published":"2024-03-15T08:00:29Z","title":"VRHCF: Cross-Source Point Cloud Registration via Voxel Representation\n and Hierarchical Correspondence Filtering","summary":" Addressing the challenges posed by the substantial gap in point cloud data\ncollected from diverse sensors, achieving robust cross-source point cloud\nregistration becomes a formidable task. In response, we present a novel\nframework for point cloud registration with broad applicability, suitable for\nboth homologous and cross-source registration scenarios. To tackle the issues\narising from different densities and distributions in cross-source point cloud\ndata, we introduce a feature representation based on spherical voxels.\nFurthermore, addressing the challenge of numerous outliers and mismatches in\ncross-source registration, we propose a hierarchical correspondence filtering\napproach. This method progressively filters out mismatches, yielding a set of\nhigh-quality correspondences. Our method exhibits versatile applicability and\nexcels in both traditional homologous registration and challenging cross-source\nregistration scenarios. Specifically, in homologous registration using the\n3DMatch dataset, we achieve the highest registration recall of 95.1% and an\ninlier ratio of 87.8%. In cross-source point cloud registration, our method\nattains the best RR on the 3DCSR dataset, demonstrating a 9.3 percentage points\nimprovement. The code is available at https://github.com/GuiyuZhao/VRHCF.\n","authors":["Guiyu Zhao","Zewen Du","Zhentao Guo","Hongbin Ma"],"pdf_url":"https://arxiv.org/pdf/2403.10085v1.pdf","comment":"Accepted by IEEE International Conference on Multimedia and Expo\n (ICME), 2024"},{"id":"http://arxiv.org/abs/2312.09063v2","updated":"2024-03-15T07:55:25Z","published":"2023-12-14T16:00:28Z","title":"Image Demoireing in RAW and sRGB Domains","summary":" Moire patterns frequently appear when capturing screens with smartphones or\ncameras, potentially compromising image quality. Previous studies suggest that\nmoire pattern elimination in the RAW domain offers greater effectiveness\ncompared to demoireing in the sRGB domain. Nevertheless, relying solely on RAW\ndata for image demoireing is insufficient in mitigating the color cast due to\nthe absence of essential information required for the color correction by the\nimage signal processor (ISP). In this paper, we propose to jointly utilize both\nRAW and sRGB data for image demoireing (RRID), which are readily accessible in\nmodern smartphones and DSLR cameras. We develop Skip-Connection-based\nDemoireing Module (SCDM) with Gated Feedback Module (GFM) and Frequency\nSelection Module (FSM) embedded in skip-connections for the efficient and\neffective demoireing of RAW and sRGB features, respectively. Subsequently, we\ndesign a RGB Guided ISP (RGISP) to learn a device-dependent ISP, assisting the\nprocess of color recovery. 
Extensive experiments demonstrate that our RRID\noutperforms state-of-the-art approaches, in terms of the performance in moire\npattern removal and color cast correction by 0.62dB in PSNR and 0.003 in SSIM.\n","authors":["Shuning Xu","Binbin Song","Xiangyu Chen","Xina Liu","Jiantao Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.09063v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10082v1","updated":"2024-03-15T07:51:35Z","published":"2024-03-15T07:51:35Z","title":"CrossGLG: LLM Guides One-shot Skeleton-based 3D Action Recognition in a\n Cross-level Manner","summary":" Most existing one-shot skeleton-based action recognition focuses on raw\nlow-level information (e.g., joint location), and may suffer from local\ninformation loss and low generalization ability. To alleviate these, we propose\nto leverage text description generated from large language models (LLM) that\ncontain high-level human knowledge, to guide feature learning, in a\nglobal-local-global way. Particularly, during training, we design $2$ prompts\nto gain global and local text descriptions of each action from an LLM. We first\nutilize the global text description to guide the skeleton encoder focus on\ninformative joints (i.e.,global-to-local). Then we build non-local interaction\nbetween local text and joint features, to form the final global representation\n(i.e., local-to-global). To mitigate the asymmetry issue between the training\nand inference phases, we further design a dual-branch architecture that allows\nthe model to perform novel class inference without any text input, also making\nthe additional inference cost neglectable compared with the base skeleton\nencoder. Extensive experiments on three different benchmarks show that CrossGLG\nconsistently outperforms the existing SOTA methods with large margins, and the\ninference cost (model size) is only $2.8$\\% than the previous SOTA. CrossGLG\ncan also serve as a plug-and-play module that can substantially enhance the\nperformance of different SOTA skeleton encoders with a neglectable cost during\ninference. The source code will be released soon.\n","authors":["Tingbing Yan","Wenzheng Zeng","Yang Xiao","Xingyu Tong","Bo Tan","Zhiwen Fang","Zhiguo Cao","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.10082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10079v1","updated":"2024-03-15T07:45:25Z","published":"2024-03-15T07:45:25Z","title":"Learning Physical Dynamics for Object-centric Visual Prediction","summary":" The ability to model the underlying dynamics of visual scenes and reason\nabout the future is central to human intelligence. Many attempts have been made\nto empower intelligent systems with such physical understanding and prediction\nabilities. However, most existing methods focus on pixel-to-pixel prediction,\nwhich suffers from heavy computational costs while lacking a deep understanding\nof the physical dynamics behind videos. Recently, object-centric prediction\nmethods have emerged and attracted increasing interest. Inspired by it, this\npaper proposes an unsupervised object-centric prediction model that makes\nfuture predictions by learning visual dynamics between objects. Our model\nconsists of two modules, perceptual, and dynamic module. The perceptual module\nis utilized to decompose images into several objects and synthesize images with\na set of object-centric representations. 
The dynamic module fuses contextual\ninformation, takes environment-object and object-object interaction into\naccount, and predicts the future trajectory of objects. Extensive experiments\nare conducted to validate the effectiveness of the proposed method. Both\nquantitative and qualitative experimental results demonstrate that our model\ngenerates higher visual quality and more physically reliable predictions\ncompared to the state-of-the-art methods.\n","authors":["Huilin Xu","Tao Chen","Feng Xu"],"pdf_url":"https://arxiv.org/pdf/2403.10079v1.pdf","comment":"13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2401.12596v2","updated":"2024-03-15T07:44:00Z","published":"2024-01-23T09:49:24Z","title":"UniHDA: A Unified and Versatile Framework for Multi-Modal Hybrid Domain\n Adaptation","summary":" Recently, generative domain adaptation has achieved remarkable progress,\nenabling us to adapt a pre-trained generator to a new target domain. However,\nexisting methods simply adapt the generator to a single target domain and are\nlimited to a single modality, either text-driven or image-driven. Moreover,\nthey cannot maintain well consistency with the source domain, which impedes the\ninheritance of the diversity. In this paper, we propose UniHDA, a\n\\textbf{unified} and \\textbf{versatile} framework for generative hybrid domain\nadaptation with multi-modal references from multiple domains. We use CLIP\nencoder to project multi-modal references into a unified embedding space and\nthen linearly interpolate the direction vectors from multiple target domains to\nachieve hybrid domain adaptation. To ensure \\textbf{consistency} with the\nsource domain, we propose a novel cross-domain spatial structure (CSS) loss\nthat maintains detailed spatial structure information between source and target\ngenerator. Experiments show that the adapted generator can synthesise realistic\nimages with various attribute compositions. Additionally, our framework is\ngenerator-agnostic and versatile to multiple generators, e.g., StyleGAN, EG3D,\nand Diffusion Models.\n","authors":["Hengjia Li","Yang Liu","Yuqi Lin","Zhanwei Zhang","Yibo Zhao","weihang Pan","Tu Zheng","Zheng Yang","Yuchun Jiang","Boxi Wu","Deng Cai"],"pdf_url":"https://arxiv.org/pdf/2401.12596v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10076v1","updated":"2024-03-15T07:43:42Z","published":"2024-03-15T07:43:42Z","title":"Benchmarking Adversarial Robustness of Image Shadow Removal with\n Shadow-adaptive Attacks","summary":" Shadow removal is a task aimed at erasing regional shadows present in images\nand reinstating visually pleasing natural scenes with consistent illumination.\nWhile recent deep learning techniques have demonstrated impressive performance\nin image shadow removal, their robustness against adversarial attacks remains\nlargely unexplored. Furthermore, many existing attack frameworks typically\nallocate a uniform budget for perturbations across the entire input image,\nwhich may not be suitable for attacking shadow images. This is primarily due to\nthe unique characteristic of spatially varying illumination within shadow\nimages. In this paper, we propose a novel approach, called shadow-adaptive\nadversarial attack. Different from standard adversarial attacks, our attack\nbudget is adjusted based on the pixel intensity in different regions of shadow\nimages. 
Consequently, the optimized adversarial noise in the shadowed regions\nbecomes visually less perceptible while permitting a greater tolerance for\nperturbations in non-shadow regions. The proposed shadow-adaptive attacks\nnaturally align with the varying illumination distribution in shadow images,\nresulting in perturbations that are less conspicuous. Building on this, we\nconduct a comprehensive empirical evaluation of existing shadow removal\nmethods, subjecting them to various levels of attack on publicly available\ndatasets.\n","authors":["Chong Wang","Yi Yu","Lanqing Guo","Bihan Wen"],"pdf_url":"https://arxiv.org/pdf/2403.10076v1.pdf","comment":"Accepted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2403.10073v1","updated":"2024-03-15T07:29:41Z","published":"2024-03-15T07:29:41Z","title":"Revisiting Adversarial Training under Long-Tailed Distributions","summary":" Deep neural networks are vulnerable to adversarial attacks, often leading to\nerroneous outputs. Adversarial training has been recognized as one of the most\neffective methods to counter such attacks. However, existing adversarial\ntraining techniques have predominantly been tested on balanced datasets,\nwhereas real-world data often exhibit a long-tailed distribution, casting doubt\non the efficacy of these methods in practical scenarios.\n In this paper, we delve into adversarial training under long-tailed\ndistributions. Through an analysis of the previous work \"RoBal\", we discover\nthat utilizing Balanced Softmax Loss alone can achieve performance comparable\nto the complete RoBal approach while significantly reducing training overheads.\nAdditionally, we reveal that, similar to uniform distributions, adversarial\ntraining under long-tailed distributions also suffers from robust overfitting.\nTo address this, we explore data augmentation as a solution and unexpectedly\ndiscover that, unlike results obtained with balanced data, data augmentation\nnot only effectively alleviates robust overfitting but also significantly\nimproves robustness. We further investigate the reasons behind the improvement\nof robustness through data augmentation and identify that it is attributable to\nthe increased diversity of examples. Extensive experiments further corroborate\nthat data augmentation alone can significantly improve robustness. Finally,\nbuilding on these findings, we demonstrate that compared to RoBal, the\ncombination of BSL and data augmentation leads to a +6.66% improvement in model\nrobustness under AutoAttack on CIFAR-10-LT. Our code is available at\nhttps://github.com/NISPLab/AT-BSL .\n","authors":["Xinli Yue","Ningping Mou","Qian Wang","Lingchen Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.10073v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.01683v2","updated":"2024-03-15T07:25:48Z","published":"2024-03-04T02:29:02Z","title":"DD-VNB: A Depth-based Dual-Loop Framework for Real-time Visually\n Navigated Bronchoscopy","summary":" Real-time 6 DOF localization of bronchoscopes is crucial for enhancing\nintervention quality. However, current vision-based technologies struggle to\nbalance between generalization to unseen data and computational speed. In this\nstudy, we propose a Depth-based Dual-Loop framework for real-time Visually\nNavigated Bronchoscopy (DD-VNB) that can generalize across patient cases\nwithout the need of re-training. The DD-VNB framework integrates two key\nmodules: depth estimation and dual-loop localization. 
To address the domain gap\namong patients, we propose a knowledge-embedded depth estimation network that\nmaps endoscope frames to depth, ensuring generalization by eliminating\npatient-specific textures. The network embeds view synthesis knowledge into a\ncycle adversarial architecture for scale-constrained monocular depth\nestimation. For real-time performance, our localization module embeds a fast\nego-motion estimation network into the loop of depth registration. The\nego-motion inference network estimates the pose change of the bronchoscope in\nhigh frequency while depth registration against the pre-operative 3D model\nprovides absolute pose periodically. Specifically, the relative pose changes\nare fed into the registration process as the initial guess to boost its\naccuracy and speed. Experiments on phantom and in-vivo data from patients\ndemonstrate the effectiveness of our framework: 1) monocular depth estimation\noutperforms SOTA, 2) localization achieves an accuracy of Absolute Tracking\nError (ATE) of 4.7 $\\pm$ 3.17 mm in phantom and 6.49 $\\pm$ 3.88 mm in patient\ndata, 3) with a frame-rate approaching video capture speed, 4) without the\nnecessity of case-wise network retraining. The framework's superior speed and\naccuracy demonstrate its promising clinical potential for real-time\nbronchoscopic navigation.\n","authors":["Qingyao Tian","Huai Liao","Xinyan Huang","Jian Chen","Zihui Zhang","Bingyu Yang","Sebastien Ourselin","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2403.01683v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10071v1","updated":"2024-03-15T07:24:13Z","published":"2024-03-15T07:24:13Z","title":"Codebook Transfer with Part-of-Speech for Vector-Quantized Image\n Modeling","summary":" Vector-Quantized Image Modeling (VQIM) is a fundamental research problem in\nimage synthesis, which aims to represent an image with a discrete token\nsequence. Existing studies effectively address this problem by learning a\ndiscrete codebook from scratch and in a code-independent manner to quantize\ncontinuous representations into discrete tokens. However, learning a codebook\nfrom scratch and in a code-independent manner is highly challenging, which may\nbe a key reason causing codebook collapse, i.e., some code vectors can rarely\nbe optimized without regard to the relationship between codes and good codebook\npriors such that die off finally. In this paper, inspired by pretrained\nlanguage models, we find that these language models have actually pretrained a\nsuperior codebook via a large number of text corpus, but such information is\nrarely exploited in VQIM. To this end, we propose a novel codebook transfer\nframework with part-of-speech, called VQCT, which aims to transfer a\nwell-trained codebook from pretrained language models to VQIM for robust\ncodebook learning. Specifically, we first introduce a pretrained codebook from\nlanguage models and part-of-speech knowledge as priors. Then, we construct a\nvision-related codebook with these priors for achieving codebook transfer.\nFinally, a novel codebook transfer network is designed to exploit abundant\nsemantic relationships between codes contained in pretrained codebooks for\nrobust VQIM codebook learning. 
Experimental results on four datasets show that\nour VQCT method achieves superior VQIM performance over previous\nstate-of-the-art methods.\n","authors":["Baoquan Zhang","Huaibin Wang","Luo Chuyao","Xutao Li","Liang Guotao","Yunming Ye","Xiaochen Qi","Yao He"],"pdf_url":"https://arxiv.org/pdf/2403.10071v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2402.19289v2","updated":"2024-03-15T07:22:27Z","published":"2024-02-29T15:52:59Z","title":"CAMixerSR: Only Details Need More \"Attention\"","summary":" To satisfy the rapidly increasing demands on the large image (2K-8K)\nsuper-resolution (SR), prevailing methods follow two independent tracks: 1)\naccelerate existing networks by content-aware routing, and 2) design better\nsuper-resolution networks via token mixer refining. Despite directness, they\nencounter unavoidable defects (e.g., inflexible route or non-discriminative\nprocessing) limiting further improvements of quality-complexity trade-off. To\nerase the drawbacks, we integrate these schemes by proposing a content-aware\nmixer (CAMixer), which assigns convolution for simple contexts and additional\ndeformable window-attention for sparse textures. Specifically, the CAMixer uses\na learnable predictor to generate multiple bootstraps, including offsets for\nwindows warping, a mask for classifying windows, and convolutional attentions\nfor endowing convolution with the dynamic property, which modulates attention\nto include more useful textures self-adaptively and improves the representation\ncapability of convolution. We further introduce a global classification loss to\nimprove the accuracy of predictors. By simply stacking CAMixers, we obtain\nCAMixerSR which achieves superior performance on large-image SR, lightweight\nSR, and omnidirectional-image SR.\n","authors":["Yan Wang","Yi Liu","Shijie Zhao","Junlin Li","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.19289v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2201.01034v2","updated":"2024-03-15T07:22:02Z","published":"2022-01-04T08:30:09Z","title":"Uncovering the Over-smoothing Challenge in Image Super-Resolution:\n Entropy-based Quantification and Contrastive Optimization","summary":" PSNR-oriented models are a critical class of super-resolution models with\napplications across various fields. However, these models tend to generate\nover-smoothed images, a problem that has been analyzed previously from the\nperspectives of models or loss functions, but without taking into account the\nimpact of data properties. In this paper, we present a novel phenomenon that we\nterm the center-oriented optimization (COO) problem, where a model's output\nconverges towards the center point of similar high-resolution images, rather\nthan towards the ground truth. We demonstrate that the strength of this problem\nis related to the uncertainty of data, which we quantify using entropy. We\nprove that as the entropy of high-resolution images increases, their center\npoint will move further away from the clean image distribution, and the model\nwill generate over-smoothed images. Implicitly optimizing the COO problem,\nperceptual-driven approaches such as perceptual loss, model structure\noptimization, or GAN-based methods can be viewed. 
We propose an explicit\nsolution to the COO problem, called Detail Enhanced Contrastive Loss (DECLoss).\nDECLoss utilizes the clustering property of contrastive learning to directly\nreduce the variance of the potential high-resolution distribution and thereby\ndecrease the entropy. We evaluate DECLoss on multiple super-resolution\nbenchmarks and demonstrate that it improves the perceptual quality of\nPSNR-oriented models. Moreover, when applied to GAN-based methods, such as\nRaGAN, DECLoss helps to achieve state-of-the-art performance, such as 0.093\nLPIPS with 24.51 PSNR on 4x downsampled Urban100, validating the effectiveness\nand generalization of our approach.\n","authors":["Tianshuo Xu","Lijiang Li","Peng Mi","Xiawu Zheng","Fei Chao","Rongrong Ji","Yonghong Tian","Qiang Shen"],"pdf_url":"https://arxiv.org/pdf/2201.01034v2.pdf","comment":"Accepted in IEEE Transactions on Pattern Analysis and Machine\n Intelligence"},{"id":"http://arxiv.org/abs/2403.10069v1","updated":"2024-03-15T07:19:15Z","published":"2024-03-15T07:19:15Z","title":"Boundary Matters: A Bi-Level Active Finetuning Framework","summary":" The pretraining-finetuning paradigm has gained widespread adoption in vision\ntasks and other fields, yet it faces the significant challenge of high sample\nannotation costs. To mitigate this, the concept of active finetuning has\nemerged, aiming to select the most appropriate samples for model finetuning\nwithin a limited budget. Traditional active learning methods often struggle in\nthis setting due to their inherent bias in batch selection. Furthermore, the\nrecent active finetuning approach has primarily concentrated on aligning the\ndistribution of selected subsets with the overall data pool, focusing solely on\ndiversity. In this paper, we propose a Bi-Level Active Finetuning framework to\nselect the samples for annotation in one shot, which includes two stages: core\nsample selection for diversity, and boundary sample selection for uncertainty.\nThe process begins with the identification of pseudo-class centers, followed by\nan innovative denoising method and an iterative strategy for boundary sample\nselection in the high-dimensional feature space, all without relying on\nground-truth labels. Our comprehensive experiments provide both qualitative and\nquantitative evidence of our method's efficacy, outperforming all the existing\nbaselines.\n","authors":["Han Lu","Yichen Xie","Xiaokang Yang","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2403.10069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10068v1","updated":"2024-03-15T07:18:55Z","published":"2024-03-15T07:18:55Z","title":"What Makes Good Collaborative Views? Contrastive Mutual Information\n Maximization for Multi-Agent Perception","summary":" Multi-agent perception (MAP) allows autonomous systems to understand complex\nenvironments by interpreting data from multiple sources. This paper\ninvestigates intermediate collaboration for MAP with a specific focus on\nexploring \"good\" properties of collaborative view (i.e., post-collaboration\nfeature) and its underlying relationship to individual views (i.e.,\npre-collaboration features), which were treated as an opaque procedure by most\nexisting works. We propose a novel framework named CMiMC (Contrastive Mutual\nInformation Maximization for Collaborative Perception) for intermediate\ncollaboration. 
The core philosophy of CMiMC is to preserve discriminative\ninformation of individual views in the collaborative view by maximizing mutual\ninformation between pre- and post-collaboration features while enhancing the\nefficacy of collaborative views by minimizing the loss function of downstream\ntasks. In particular, we define multi-view mutual information (MVMI) for\nintermediate collaboration that evaluates correlations between collaborative\nviews and individual views on both global and local scales. We establish\nCMiMNet based on multi-view contrastive learning to realize estimation and\nmaximization of MVMI, which assists the training of a collaboration encoder for\nvoxel-level feature fusion. We evaluate CMiMC on V2X-Sim 1.0, and it improves\nthe SOTA average precision by 3.08% and 4.44% at 0.5 and 0.7 IoU\n(Intersection-over-Union) thresholds, respectively. In addition, CMiMC can\nreduce communication volume to 1/32 while achieving performance comparable to\nSOTA. Code and Appendix are released at https://github.com/77SWF/CMiMC.\n","authors":["Wanfang Su","Lixing Chen","Yang Bai","Xi Lin","Gaolei Li","Zhe Qu","Pan Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.10068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10067v1","updated":"2024-03-15T07:18:43Z","published":"2024-03-15T07:18:43Z","title":"Hybrid Convolutional and Attention Network for Hyperspectral Image\n Denoising","summary":" Hyperspectral image (HSI) denoising is critical for the effective analysis\nand interpretation of hyperspectral data. However, simultaneously modeling\nglobal and local features is rarely explored to enhance HSI denoising. In this\nletter, we propose a hybrid convolution and attention network (HCANet), which\nleverages both the strengths of convolution neural networks (CNNs) and\nTransformers. To enhance the modeling of both global and local features, we\nhave devised a convolution and attention fusion module aimed at capturing\nlong-range dependencies and neighborhood spectral correlations. Furthermore, to\nimprove multi-scale information aggregation, we design a multi-scale\nfeed-forward network to enhance denoising performance by extracting features at\ndifferent scales. Experimental results on mainstream HSI datasets demonstrate\nthe rationality and effectiveness of the proposed HCANet. The proposed model is\neffective in removing various types of complex noise. Our codes are available\nat \\url{https://github.com/summitgao/HCANet}.\n","authors":["Shuai Hu","Feng Gao","Xiaowei Zhou","Junyu Dong","Qian Du"],"pdf_url":"https://arxiv.org/pdf/2403.10067v1.pdf","comment":"IEEE GRSL 2024"},{"id":"http://arxiv.org/abs/2403.10066v1","updated":"2024-03-15T07:16:07Z","published":"2024-03-15T07:16:07Z","title":"Contrastive Pre-Training with Multi-View Fusion for No-Reference Point\n Cloud Quality Assessment","summary":" No-reference point cloud quality assessment (NR-PCQA) aims to automatically\nevaluate the perceptual quality of distorted point clouds without available\nreference, which have achieved tremendous improvements due to the utilization\nof deep neural networks. However, learning-based NR-PCQA methods suffer from\nthe scarcity of labeled data and usually perform suboptimally in terms of\ngeneralization. To solve the problem, we propose a novel contrastive\npre-training framework tailored for PCQA (CoPA), which enables the pre-trained\nmodel to learn quality-aware representations from unlabeled data. 
To obtain\nanchors in the representation space, we project point clouds with different\ndistortions into images and randomly mix their local patches to form mixed\nimages with multiple distortions. Utilizing the generated anchors, we constrain\nthe pre-training process via a quality-aware contrastive loss following the\nphilosophy that perceptual quality is closely related to both content and\ndistortion. Furthermore, in the model fine-tuning stage, we propose a\nsemantic-guided multi-view fusion module to effectively integrate the features\nof projected images from multiple perspectives. Extensive experiments show that\nour method outperforms the state-of-the-art PCQA methods on popular benchmarks.\nFurther investigations demonstrate that CoPA can also benefit existing\nlearning-based PCQA models.\n","authors":["Ziyu Shan","Yujie Zhang","Qi Yang","Haichen Yang","Yiling Xu","Jenq-Neng Hwang","Xiaozhong Xu","Shan Liu"],"pdf_url":"https://arxiv.org/pdf/2403.10066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10064v1","updated":"2024-03-15T07:14:01Z","published":"2024-03-15T07:14:01Z","title":"Progressive Divide-and-Conquer via Subsampling Decomposition for\n Accelerated MRI","summary":" Deep unfolding networks (DUN) have emerged as a popular iterative framework\nfor accelerated magnetic resonance imaging (MRI) reconstruction. However,\nconventional DUN aims to reconstruct all the missing information within the\nentire null space in each iteration. Thus it could be challenging when dealing\nwith highly ill-posed degradation, usually leading to unsatisfactory\nreconstruction. In this work, we propose a Progressive Divide-And-Conquer\n(PDAC) strategy, aiming to break down the subsampling process in the actual\nsevere degradation and thus perform reconstruction sequentially. Starting from\ndecomposing the original maximum-a-posteriori problem of accelerated MRI, we\npresent a rigorous derivation of the proposed PDAC framework, which could be\nfurther unfolded into an end-to-end trainable network. Specifically, each\niterative stage in PDAC focuses on recovering a distinct moderate degradation\naccording to the decomposition. Furthermore, as part of the PDAC iteration,\nsuch decomposition is adaptively learned as an auxiliary task through a\ndegradation predictor which provides an estimation of the decomposed sampling\nmask. Following this prediction, the sampling mask is further integrated via a\nseverity conditioning module to ensure awareness of the degradation severity at\neach stage. Extensive experiments demonstrate that our proposed method achieves\nsuperior performance on the publicly available fastMRI and Stanford2D FSE\ndatasets in both multi-coil and single-coil settings.\n","authors":["Chong Wang","Lanqing Guo","Yufei Wang","Hao Cheng","Yi Yu","Bihan Wen"],"pdf_url":"https://arxiv.org/pdf/2403.10064v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.06461v2","updated":"2024-03-15T07:07:20Z","published":"2024-03-11T06:56:08Z","title":"Reliable Spatial-Temporal Voxels For Multi-Modal Test-Time Adaptation","summary":" Multi-modal test-time adaptation (MM-TTA) is proposed to adapt models to an\nunlabeled target domain by leveraging the complementary multi-modal inputs in\nan online manner. Previous MM-TTA methods rely on predictions of cross-modal\ninformation in each input frame, while they ignore the fact that predictions of\ngeometric neighborhoods within consecutive frames are highly correlated,\nleading to unstable predictions across time. 
To fulfill this gap, we propose\nReLiable Spatial-temporal Voxels (Latte), an MM-TTA method that leverages\nreliable cross-modal spatial-temporal correspondences for multi-modal 3D\nsegmentation. Motivated by the fact that reliable predictions should be\nconsistent with their spatial-temporal correspondences, Latte aggregates\nconsecutive frames in a slide window manner and constructs ST voxel to capture\ntemporally local prediction consistency for each modality. After filtering out\nST voxels with high ST entropy, Latte conducts cross-modal learning for each\npoint and pixel by attending to those with reliable and consistent predictions\namong both spatial and temporal neighborhoods. Experimental results show that\nLatte achieves state-of-the-art performance on three different MM-TTA\nbenchmarks compared to previous MM-TTA or TTA methods.\n","authors":["Haozhi Cao","Yuecong Xu","Jianfei Yang","Pengyu Yin","Xingyu Ji","Shenghai Yuan","Lihua Xie"],"pdf_url":"https://arxiv.org/pdf/2403.06461v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10061v1","updated":"2024-03-15T07:01:33Z","published":"2024-03-15T07:01:33Z","title":"PAME: Self-Supervised Masked Autoencoder for No-Reference Point Cloud\n Quality Assessment","summary":" No-reference point cloud quality assessment (NR-PCQA) aims to automatically\npredict the perceptual quality of point clouds without reference, which has\nachieved remarkable performance due to the utilization of deep learning-based\nmodels. However, these data-driven models suffer from the scarcity of labeled\ndata and perform unsatisfactorily in cross-dataset evaluations. To address this\nproblem, we propose a self-supervised pre-training framework using masked\nautoencoders (PAME) to help the model learn useful representations without\nlabels. Specifically, after projecting point clouds into images, our PAME\nemploys dual-branch autoencoders, reconstructing masked patches from distorted\nimages into the original patches within reference and distorted images. In this\nmanner, the two branches can separately learn content-aware features and\ndistortion-aware features from the projected images. Furthermore, in the model\nfine-tuning stage, the learned content-aware features serve as a guide to fuse\nthe point cloud quality features extracted from different perspectives.\nExtensive experiments show that our method outperforms the state-of-the-art\nNR-PCQA methods on popular benchmarks in terms of prediction accuracy and\ngeneralizability.\n","authors":["Ziyu Shan","Yujie Zhang","Qi Yang","Haichen Yang","Yiling Xu","Shan Liu"],"pdf_url":"https://arxiv.org/pdf/2403.10061v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10058v1","updated":"2024-03-15T06:59:21Z","published":"2024-03-15T06:59:21Z","title":"RID-TWIN: An end-to-end pipeline for automatic face de-identification in\n videos","summary":" Face de-identification in videos is a challenging task in the domain of\ncomputer vision, primarily used in privacy-preserving applications. Despite the\nconsiderable progress achieved through generative vision models, there remain\nmultiple challenges in the latest approaches. They lack a comprehensive\ndiscussion and evaluation of aspects such as realism, temporal coherence, and\npreservation of non-identifiable features. In our work, we propose RID-Twin: a\nnovel pipeline that leverages the state-of-the-art generative models, and\ndecouples identity from motion to perform automatic face de-identification in\nvideos. 
We investigate the task from a holistic point of view and discuss how\nour approach addresses the pertinent existing challenges in this domain. We\nevaluate the performance of our methodology on the widely employed VoxCeleb2\ndataset, and also a custom dataset designed to accommodate the limitations of\ncertain behavioral variations absent in the VoxCeleb2 dataset. We discuss the\nimplications and advantages of our work and suggest directions for future\nresearch.\n","authors":["Anirban Mukherjee","Monjoy Narayan Choudhury","Dinesh Babu Jayagopi"],"pdf_url":"https://arxiv.org/pdf/2403.10058v1.pdf","comment":"This work has been submitted to IEEE ICIP 2024"},{"id":"http://arxiv.org/abs/2403.04473v2","updated":"2024-03-15T06:51:30Z","published":"2024-03-07T13:16:24Z","title":"TextMonkey: An OCR-Free Large Multimodal Model for Understanding\n Document","summary":" We present TextMonkey, a large multimodal model (LMM) tailored for\ntext-centric tasks. Our approach introduces enhancement across several\ndimensions: By adopting Shifted Window Attention with zero-initialization, we\nachieve cross-window connectivity at higher input resolutions and stabilize\nearly training; We hypothesize that images may contain redundant tokens, and by\nusing similarity to filter out significant tokens, we can not only streamline\nthe token length but also enhance the model's performance. Moreover, by\nexpanding our model's capabilities to encompass text spotting and grounding,\nand incorporating positional information into responses, we enhance\ninterpretability. It also learns to perform screenshot tasks through\nfinetuning. Evaluation on 12 benchmarks shows notable improvements: 5.2% in\nScene Text-Centric tasks (including STVQA, TextVQA, and OCRVQA), 6.9% in\nDocument-Oriented tasks (such as DocVQA, InfoVQA, ChartVQA, DeepForm, Kleister\nCharity, and WikiTableQuestions), and 2.8% in Key Information Extraction tasks\n(comprising FUNSD, SROIE, and POIE). It outperforms in scene text spotting with\na 10.9\\% increase and sets a new standard on OCRBench, a comprehensive\nbenchmark consisting of 29 OCR-related assessments, with a score of 561,\nsurpassing previous open-sourced large multimodal models for document\nunderstanding. Code will be released at https://github.com/Yuliang-Liu/Monkey.\n","authors":["Yuliang Liu","Biao Yang","Qiang Liu","Zhang Li","Zhiyin Ma","Shuo Zhang","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2403.04473v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12689v3","updated":"2024-03-15T06:51:28Z","published":"2024-01-23T11:54:09Z","title":"Energy-based Automated Model Evaluation","summary":" The conventional evaluation protocols on machine learning models rely heavily\non a labeled, i.i.d-assumed testing dataset, which is not often present in real\nworld applications. The Automated Model Evaluation (AutoEval) shows an\nalternative to this traditional workflow, by forming a proximal prediction\npipeline of the testing performance without the presence of ground-truth\nlabels. Despite its recent successes, the AutoEval frameworks still suffer from\nan overconfidence issue, substantial storage and computational cost. In that\nregard, we propose a novel measure -- Meta-Distribution Energy (MDE) -- that\nallows the AutoEval framework to be both more efficient and effective. The core\nof the MDE is to establish a meta-distribution statistic, on the information\n(energy) associated with individual samples, then offer a smoother\nrepresentation enabled by energy-based learning. 
We further provide our\ntheoretical insights by connecting the MDE with the classification loss. We\nprovide extensive experiments across modalities, datasets and different\narchitectural backbones to validate MDE's validity, together with its\nsuperiority compared with prior approaches. We also prove MDE's versatility by\nshowing its seamless integration with large-scale models, and easy adaption to\nlearning scenarios with noisy- or imbalanced- labels. Code and data are\navailable: https://github.com/pengr/Energy_AutoEval\n","authors":["Ru Peng","Heming Zou","Haobo Wang","Yawen Zeng","Zenan Huang","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.12689v3.pdf","comment":"ICLR2024 poster paper"},{"id":"http://arxiv.org/abs/2403.10053v1","updated":"2024-03-15T06:48:44Z","published":"2024-03-15T06:48:44Z","title":"Group-Mix SAM: Lightweight Solution for Industrial Assembly Line\n Applications","summary":" Since the advent of the Segment Anything Model(SAM) approximately one year\nago, it has engendered significant academic interest and has spawned a large\nnumber of investigations and publications from various perspectives. However,\nthe deployment of SAM in practical assembly line scenarios has yet to\nmaterialize due to its large image encoder, which weighs in at an imposing\n632M. In this study, we have replaced the heavyweight image encoder with a\nlightweight one, thereby enabling the deployment of SAM in practical assembly\nline scenarios. Specifically, we have employed decoupled distillation to train\nthe encoder of MobileSAM in a resource-limited setting. The entire knowledge\ndistillation experiment can be completed in a single day on a single RTX 4090.\nThe resulting lightweight SAM, referred to as Group-Mix SAM, had 37.63% (2.16M)\nfewer parameters and 42.5% (15614.7M) fewer floating-point operations compared\nto MobileSAM. However, on our constructed industrial dataset, MALSD, its mIoU\nwas only marginally lower than that of MobileSAM, at 0.615. Finally, we\nconducted a comprehensive comparative experiment to demonstrate the superiority\nof Group-Mix SAM in the industrial domain. With its exceptional performance,\nour Group-Mix SAM is more suitable for practical assembly line applications.\n","authors":["Wu Liang","X. -G. Ma"],"pdf_url":"https://arxiv.org/pdf/2403.10053v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.10052v1","updated":"2024-03-15T06:47:14Z","published":"2024-03-15T06:47:14Z","title":"T4P: Test-Time Training of Trajectory Prediction via Masked Autoencoder\n and Actor-specific Token Memory","summary":" Trajectory prediction is a challenging problem that requires considering\ninteractions among multiple actors and the surrounding environment. While\ndata-driven approaches have been used to address this complex problem, they\nsuffer from unreliable predictions under distribution shifts during test time.\nAccordingly, several online learning methods have been proposed using\nregression loss from the ground truth of observed data leveraging the\nauto-labeling nature of trajectory prediction task. We mainly tackle the\nfollowing two issues. First, previous works underfit and overfit as they only\noptimize the last layer of the motion decoder. 
To this end, we employ the\nmasked autoencoder (MAE) for representation learning to encourage complex\ninteraction modeling in shifted test distribution for updating deeper layers.\nSecond, utilizing the sequential nature of driving data, we propose an\nactor-specific token memory that enables the test-time learning of actor-wise\nmotion characteristics. Our proposed method has been validated across various\nchallenging cross-dataset distribution shift scenarios including nuScenes,\nLyft, Waymo, and Interaction. Our method surpasses the performance of existing\nstate-of-the-art online learning methods in terms of both prediction accuracy\nand computational efficiency. The code is available at\nhttps://github.com/daeheepark/T4P.\n","authors":["Daehee Park","Jaeseok Jeong","Sung-Hoon Yoon","Jaewoo Jeong","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2403.10052v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.10050v1","updated":"2024-03-15T06:42:55Z","published":"2024-03-15T06:42:55Z","title":"Texture-GS: Disentangling the Geometry and Texture for 3D Gaussian\n Splatting Editing","summary":" 3D Gaussian splatting, emerging as a groundbreaking approach, has drawn\nincreasing attention for its capabilities of high-fidelity reconstruction and\nreal-time rendering. However, it couples the appearance and geometry of the\nscene within the Gaussian attributes, which hinders the flexibility of editing\noperations, such as texture swapping. To address this issue, we propose a novel\napproach, namely Texture-GS, to disentangle the appearance from the geometry by\nrepresenting it as a 2D texture mapped onto the 3D surface, thereby\nfacilitating appearance editing. Technically, the disentanglement is achieved\nby our proposed texture mapping module, which consists of a UV mapping MLP to\nlearn the UV coordinates for the 3D Gaussian centers, a local Taylor expansion\nof the MLP to efficiently approximate the UV coordinates for the ray-Gaussian\nintersections, and a learnable texture to capture the fine-grained appearance.\nExtensive experiments on the DTU dataset demonstrate that our method not only\nfacilitates high-fidelity appearance editing but also achieves real-time\nrendering on consumer-level devices, e.g. a single RTX 2080 Ti GPU.\n","authors":["Tian-Xing Xu","Wenbo Hu","Yu-Kun Lai","Ying Shan","Song-Hai Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.10050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10047v1","updated":"2024-03-15T06:38:25Z","published":"2024-03-15T06:38:25Z","title":"TextBlockV2: Towards Precise-Detection-Free Scene Text Spotting with\n Pre-trained Language Model","summary":" Existing scene text spotters are designed to locate and transcribe texts from\nimages. However, it is challenging for a spotter to achieve precise detection\nand recognition of scene texts simultaneously. Inspired by the glimpse-focus\nspotting pipeline of human beings and impressive performances of Pre-trained\nLanguage Models (PLMs) on visual tasks, we ask: 1) \"Can machines spot texts\nwithout precise detection just like human beings?\", and if yes, 2) \"Is text\nblock another alternative for scene text spotting other than word or\ncharacter?\" To this end, our proposed scene text spotter leverages advanced\nPLMs to enhance performance without fine-grained detection. Specifically, we\nfirst use a simple detector for block-level text detection to obtain rough\npositional information. Then, we finetune a PLM using a large-scale OCR dataset\nto achieve accurate recognition. 
Benefiting from the comprehensive language\nknowledge gained during the pre-training phase, the PLM-based recognition\nmodule effectively handles complex scenarios, including multi-line, reversed,\noccluded, and incomplete-detection texts. Taking advantage of the fine-tuned\nlanguage model on scene recognition benchmarks and the paradigm of text block\ndetection, extensive experiments demonstrate the superior performance of our\nscene text spotter across multiple public benchmarks. Additionally, we attempt\nto spot texts directly from an entire scene image to demonstrate the potential\nof PLMs, even Large Language Models (LLMs).\n","authors":["Jiahao Lyu","Jin Wei","Gangyan Zeng","Zeng Li","Enze Xie","Wei Wang","Yu Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.10047v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2403.08504v2","updated":"2024-03-15T06:31:45Z","published":"2024-03-13T13:12:42Z","title":"OccFiner: Offboard Occupancy Refinement with Hybrid Propagation","summary":" Vision-based occupancy prediction, also known as 3D Semantic Scene Completion\n(SSC), presents a significant challenge in computer vision. Previous methods,\nconfined to onboard processing, struggle with simultaneous geometric and\nsemantic estimation, continuity across varying viewpoints, and single-view\nocclusion. Our paper introduces OccFiner, a novel offboard framework designed\nto enhance the accuracy of vision-based occupancy predictions. OccFiner\noperates in two hybrid phases: 1) a multi-to-multi local propagation network\nthat implicitly aligns and processes multiple local frames for correcting\nonboard model errors and consistently enhancing occupancy accuracy across all\ndistances. 2) the region-centric global propagation, focuses on refining labels\nusing explicit multi-view geometry and integrating sensor bias, especially to\nincrease the accuracy of distant occupied voxels. Extensive experiments\ndemonstrate that OccFiner improves both geometric and semantic accuracy across\nvarious types of coarse occupancy, setting a new state-of-the-art performance\non the SemanticKITTI dataset. Notably, OccFiner elevates vision-based SSC\nmodels to a level even surpassing that of LiDAR-based onboard SSC models.\n","authors":["Hao Shi","Song Wang","Jiaming Zhang","Xiaoting Yin","Zhongdao Wang","Zhijian Zhao","Guangming Wang","Jianke Zhu","Kailun Yang","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2403.08504v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10045v1","updated":"2024-03-15T06:31:03Z","published":"2024-03-15T06:31:03Z","title":"Towards Adversarially Robust Dataset Distillation by Curvature\n Regularization","summary":" Dataset distillation (DD) allows datasets to be distilled to fractions of\ntheir original size while preserving the rich distributional information so\nthat models trained on the distilled datasets can achieve a comparable accuracy\nwhile saving significant computational loads. Recent research in this area has\nbeen focusing on improving the accuracy of models trained on distilled\ndatasets. In this paper, we aim to explore a new perspective of DD. We study\nhow to embed adversarial robustness in distilled datasets, so that models\ntrained on these datasets maintain the high accuracy and meanwhile acquire\nbetter adversarial robustness. We propose a new method that achieves this goal\nby incorporating curvature regularization into the distillation process with\nmuch less computational overhead than standard adversarial training. 
Extensive\nempirical experiments suggest that our method not only outperforms standard\nadversarial training on both accuracy and robustness with less computation\noverhead but is also capable of generating robust distilled datasets that can\nwithstand various adversarial attacks.\n","authors":["Eric Xue","Yijiang Li","Haoyang Liu","Yifan Shen","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2403.10045v1.pdf","comment":"17 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.10044v1","updated":"2024-03-15T06:26:46Z","published":"2024-03-15T06:26:46Z","title":"SphereDiffusion: Spherical Geometry-Aware Distortion Resilient Diffusion\n Model","summary":" Controllable spherical panoramic image generation holds substantial\napplicative potential across a variety of domains.However, it remains a\nchallenging task due to the inherent spherical distortion and geometry\ncharacteristics, resulting in low-quality content generation.In this paper, we\nintroduce a novel framework of SphereDiffusion to address these unique\nchallenges, for better generating high-quality and precisely controllable\nspherical panoramic images.For the spherical distortion characteristic, we\nembed the semantics of the distorted object with text encoding, then explicitly\nconstruct the relationship with text-object correspondence to better use the\npre-trained knowledge of the planar images.Meanwhile, we employ a deformable\ntechnique to mitigate the semantic deviation in latent space caused by\nspherical distortion.For the spherical geometry characteristic, in virtue of\nspherical rotation invariance, we improve the data diversity and optimization\nobjectives in the training process, enabling the model to better learn the\nspherical geometry characteristic.Furthermore, we enhance the denoising process\nof the diffusion model, enabling it to effectively use the learned geometric\ncharacteristic to ensure the boundary continuity of the generated images.With\nthese specific techniques, experiments on Structured3D dataset show that\nSphereDiffusion significantly improves the quality of controllable spherical\nimage generation and relatively reduces around 35% FID on average.\n","authors":["Tao Wu","Xuewei Li","Zhongang Qi","Di Hu","Xintao Wang","Ying Shan","Xi Li"],"pdf_url":"https://arxiv.org/pdf/2403.10044v1.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2403.10039v1","updated":"2024-03-15T06:19:02Z","published":"2024-03-15T06:19:02Z","title":"Rethinking Low-quality Optical Flow in Unsupervised Surgical Instrument\n Segmentation","summary":" Video-based surgical instrument segmentation plays an important role in\nrobot-assisted surgeries. Unlike supervised settings, unsupervised segmentation\nrelies heavily on motion cues, which are challenging to discern due to the\ntypically lower quality of optical flow in surgical footage compared to natural\nscenes. This presents a considerable burden for the advancement of unsupervised\nsegmentation techniques. In our work, we address the challenge of enhancing\nmodel performance despite the inherent limitations of low-quality optical flow.\nOur methodology employs a three-pronged approach: extracting boundaries\ndirectly from the optical flow, selectively discarding frames with inferior\nflow quality, and employing a fine-tuning process with variable frame rates. 
We\nthoroughly evaluate our strategy on the EndoVis2017 VOS dataset and Endovis2017\nChallenge dataset, where our model demonstrates promising results, achieving a\nmean Intersection-over-Union (mIoU) of 0.75 and 0.72, respectively. Our\nfindings suggest that our approach can greatly decrease the need for manual\nannotations in clinical environments and may facilitate the annotation process\nfor new datasets. The code is available at\nhttps://github.com/wpr1018001/Rethinking-Low-quality-Optical-Flow.git\n","authors":["Peiran Wu","Yang Liu","Jiayu Huo","Gongyu Zhang","Christos Bergeles","Rachel Sparks","Prokar Dasgupta","Alejandro Granados","Sebastien Ourselin"],"pdf_url":"https://arxiv.org/pdf/2403.10039v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11317v5","updated":"2024-03-15T06:08:30Z","published":"2023-11-19T13:07:06Z","title":"Discrete approximations of Gaussian smoothing and Gaussian derivatives","summary":" This paper develops an in-depth treatment concerning the problem of\napproximating the Gaussian smoothing and Gaussian derivative computations in\nscale-space theory for application on discrete data. With close connections to\nprevious axiomatic treatments of continuous and discrete scale-space theory, we\nconsider three main ways discretizing these scale-space operations in terms of\nexplicit discrete convolutions, based on either (i) sampling the Gaussian\nkernels and the Gaussian derivative kernels, (ii) locally integrating the\nGaussian kernels and the Gaussian derivative kernels over each pixel support\nregion and (iii) basing the scale-space analysis on the discrete analogue of\nthe Gaussian kernel, and then computing derivative approximations by applying\nsmall-support central difference operators to the spatially smoothed image\ndata.\n We study the properties of these three main discretization methods both\ntheoretically and experimentally, and characterize their performance by\nquantitative measures, including the results they give rise to with respect to\nthe task of scale selection, investigated for four different use cases, and\nwith emphasis on the behaviour at fine scales. The results show that the\nsampled Gaussian kernels and derivatives as well as the integrated Gaussian\nkernels and derivatives perform very poorly at very fine scales. At very fine\nscales, the discrete analogue of the Gaussian kernel with its corresponding\ndiscrete derivative approximations performs substantially better. The sampled\nGaussian kernel and the sampled Gaussian derivatives do, on the other hand,\nlead to numerically very good approximations of the corresponding continuous\nresults, when the scale parameter is sufficiently large, in the experiments\npresented in the paper, when the scale parameter is greater than a value of\nabout 1, in units of the grid spacing.\n","authors":["Tony Lindeberg"],"pdf_url":"https://arxiv.org/pdf/2311.11317v5.pdf","comment":"40 pages, 21 figures"},{"id":"http://arxiv.org/abs/2403.10037v1","updated":"2024-03-15T06:06:06Z","published":"2024-03-15T06:06:06Z","title":"Knowledge Condensation and Reasoning for Knowledge-based VQA","summary":" Knowledge-based visual question answering (KB-VQA) is a challenging task,\nwhich requires the model to leverage external knowledge for comprehending and\nanswering questions grounded in visual content. Recent studies retrieve the\nknowledge passages from external knowledge bases and then use them to answer\nquestions. 
However, these retrieved knowledge passages often contain irrelevant\nor noisy information, which limits the performance of the model. To address the\nchallenge, we propose two synergistic models: a Knowledge Condensation model and\na Knowledge Reasoning model. We condense the retrieved knowledge passages from\ntwo perspectives. First, we leverage the multimodal perception and reasoning\nability of the visual-language models to distill concise knowledge concepts\nfrom retrieved lengthy passages, ensuring relevance to both the visual content\nand the question. Second, we leverage the text comprehension ability of the\nlarge language models to summarize and condense the passages into the knowledge\nessence that helps answer the question. These two types of condensed knowledge\nare then seamlessly integrated into our Knowledge Reasoning model, which\njudiciously navigates through the amalgamated information to arrive at the\nconclusive answer. Extensive experiments validate the superiority of the\nproposed method. Compared to previous methods, our method achieves\nstate-of-the-art performance on knowledge-based VQA datasets (65.1% on OK-VQA\nand 60.1% on A-OKVQA) without resorting to the knowledge produced by GPT-3\n(175B).\n","authors":["Dongze Hao","Jian Jia","Longteng Guo","Qunbo Wang","Te Yang","Yan Li","Yanhua Cheng","Bo Wang","Quan Chen","Han Li","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2403.10037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10036v1","updated":"2024-03-15T05:59:10Z","published":"2024-03-15T05:59:10Z","title":"SparseFusion: Efficient Sparse Multi-Modal Fusion Framework for\n Long-Range 3D Perception","summary":" Multi-modal 3D object detection has exhibited significant progress in recent\nyears. However, most existing methods can hardly scale to long-range scenarios\ndue to their reliance on dense 3D features, which substantially escalate\ncomputational demands and memory usage. In this paper, we introduce\nSparseFusion, a novel multi-modal fusion framework fully built upon sparse 3D\nfeatures to facilitate efficient long-range perception. The core of our method\nis the Sparse View Transformer module, which selectively lifts regions of\ninterest in 2D image space into the unified 3D space. The proposed module\nintroduces sparsity from both semantic and geometric aspects, only filling\ngrids that foreground objects potentially reside in. Comprehensive experiments\nhave verified the efficiency and effectiveness of our framework in long-range\n3D perception. Remarkably, on the long-range Argoverse2 dataset, SparseFusion\nreduces memory footprint and accelerates the inference by about two times\ncompared to dense detectors. It also achieves state-of-the-art performance with\nmAP of 41.2% and CDS of 32.1%. The versatility of SparseFusion is also\nvalidated in the temporal object detection task and 3D lane detection task.\nCodes will be released upon acceptance.\n","authors":["Yiheng Li","Hongyang Li","Zehao Huang","Hong Chang","Naiyan Wang"],"pdf_url":"https://arxiv.org/pdf/2403.10036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07420v3","updated":"2024-03-15T05:53:11Z","published":"2024-03-12T08:57:29Z","title":"DragAnything: Motion Control for Anything using Entity Representation","summary":" We introduce DragAnything, which utilizes an entity representation to achieve\nmotion control for any object in controllable video generation. 
Compared to\nexisting motion control methods, DragAnything offers several advantages.\nFirstly, trajectory-based interaction is more user-friendly, since acquiring\nother guidance signals (e.g., masks, depth maps) is labor-intensive. Users only\nneed to draw a line (trajectory) during interaction. Secondly, our entity\nrepresentation serves as an open-domain embedding capable of representing any\nobject, enabling the control of motion for diverse entities, including the\nbackground. Lastly, our entity representation allows simultaneous and distinct\nmotion control for multiple objects. Extensive experiments demonstrate that our\nDragAnything achieves state-of-the-art performance for FVD, FID, and User\nStudy, particularly in terms of object motion control, where our method\nsurpasses the previous methods (e.g., DragNUWA) by 26% in human voting.\n","authors":["Weijia Wu","Zhuang Li","Yuchao Gu","Rui Zhao","Yefei He","David Junhao Zhang","Mike Zheng Shou","Yan Li","Tingting Gao","Di Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.07420v3.pdf","comment":"The project website is at:\n https://weijiawu.github.io/draganything_page/ . The code is at:\n https://github.com/showlab/DragAnything"},{"id":"http://arxiv.org/abs/2403.10030v1","updated":"2024-03-15T05:30:29Z","published":"2024-03-15T05:30:29Z","title":"Multi-criteria Token Fusion with One-step-ahead Attention for Efficient\n Vision Transformers","summary":" Vision Transformer (ViT) has emerged as a prominent backbone for computer\nvision. For more efficient ViTs, recent works lessen the quadratic cost of the\nself-attention layer by pruning or fusing the redundant tokens. However, these\nworks face a speed-accuracy trade-off caused by the loss of information.\nHere, we argue that token fusion needs to consider diverse relations between\ntokens to minimize information loss. In this paper, we propose Multi-criteria\nToken Fusion (MCTF), which gradually fuses the tokens based on multiple criteria\n(e.g., similarity, informativeness, and size of fused tokens). Further, we\nutilize one-step-ahead attention, an improved approach to capture\nthe informativeness of the tokens. By training the model equipped with MCTF\nusing a token reduction consistency, we achieve the best speed-accuracy\ntrade-off in image classification (ImageNet1K). Experimental results prove\nthat MCTF consistently surpasses the previous reduction methods with and\nwithout training. Specifically, DeiT-T and DeiT-S with MCTF reduce FLOPs by\nabout 44% while improving the performance (+0.5% and +0.3%) over the base\nmodel, respectively. We also demonstrate the applicability of MCTF in various\nVision Transformers (e.g., T2T-ViT, LV-ViT), achieving at least 31% speedup\nwithout performance degradation. Code is available at\nhttps://github.com/mlvlab/MCTF.\n","authors":["Sanghyeok Lee","Joonmyung Choi","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2403.10030v1.pdf","comment":"Conference on Computer Vision and Pattern Recognition (CVPR), 2024"},{"id":"http://arxiv.org/abs/2311.14024v3","updated":"2024-03-15T05:27:42Z","published":"2023-11-23T14:28:28Z","title":"Creating and Leveraging a Synthetic Dataset of Cloud Optical Thickness\n Measures for Cloud Detection in MSI","summary":" Cloud formations often obscure optical satellite-based monitoring of the\nEarth's surface, thus limiting Earth observation (EO) activities such as land\ncover mapping, ocean color analysis, and cropland monitoring. 
The integration\nof machine learning (ML) methods within the remote sensing domain has\nsignificantly improved performance on a wide range of EO tasks, including cloud\ndetection and filtering, but there is still much room for improvement. A key\nbottleneck is that ML methods typically depend on large amounts of annotated\ndata for training, which is often difficult to come by in EO contexts. This is\nespecially true when it comes to cloud optical thickness (COT) estimation. A\nreliable estimation of COT enables more fine-grained and application-dependent\ncontrol compared to using pre-specified cloud categories, as is commonly done\nin practice. To alleviate the COT data scarcity problem, in this work we\npropose a novel synthetic dataset for COT estimation, that we subsequently\nleverage for obtaining reliable and versatile cloud masks on real data. In our\ndataset, top-of-atmosphere radiances have been simulated for 12 of the spectral\nbands of the Multispectral Imagery (MSI) sensor onboard Sentinel-2 platforms.\nThese data points have been simulated under consideration of different cloud\ntypes, COTs, and ground surface and atmospheric profiles. Extensive\nexperimentation of training several ML models to predict COT from the measured\nreflectivity of the spectral bands demonstrates the usefulness of our proposed\ndataset. In particular, by thresholding COT estimates from our ML models, we\nshow on two satellite image datasets (one that is publicly available, and one\nwhich we have collected and annotated) that reliable cloud masks can be\nobtained. The synthetic data, the collected real dataset, code and models have\nbeen made publicly available at\nhttps://github.com/aleksispi/ml-cloud-opt-thick.\n","authors":["Aleksis Pirinen","Nosheen Abid","Nuria Agues Paszkowsky","Thomas Ohlson Timoudas","Ronald Scheirer","Chiara Ceccobello","György Kovács","Anders Persson"],"pdf_url":"https://arxiv.org/pdf/2311.14024v3.pdf","comment":"Published in the journal Remote Sensing (2024). Code, data and models\n available at https://github.com/aleksispi/ml-cloud-opt-thick"},{"id":"http://arxiv.org/abs/2403.06090v2","updated":"2024-03-15T04:43:21Z","published":"2024-03-10T04:23:24Z","title":"Diffusion Models Trained with Large Data Are Transferable Visual Models","summary":" We show that, simply initializing image understanding models using a\npre-trained UNet (or transformer) of diffusion models, it is possible to\nachieve remarkable transferable performance on fundamental vision perception\ntasks using a moderate amount of target data (even synthetic data only),\nincluding monocular depth, surface normal, image segmentation, matting, human\npose estimation, among virtually many others. Previous works have adapted\ndiffusion models for various perception tasks, often reformulating these tasks\nas generation processes to align with the diffusion process. In sharp contrast,\nwe demonstrate that fine-tuning these models with minimal adjustments can be a\nmore effective alternative, offering the advantages of being embarrassingly\nsimple and significantly faster. As the backbone network of Stable Diffusion\nmodels is trained on giant datasets comprising billions of images, we observe\nvery robust generalization capabilities of the diffusion backbone. 
Experimental\nresults showcase the remarkable transferability of the backbone of diffusion\nmodels across diverse tasks and real-world datasets.\n","authors":["Guangkai Xu","Yongtao Ge","Mingyu Liu","Chengxiang Fan","Kangyang Xie","Zhiyue Zhao","Hao Chen","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2403.06090v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10015v1","updated":"2024-03-15T04:39:27Z","published":"2024-03-15T04:39:27Z","title":"Linear optimal transport subspaces for point set classification","summary":" Learning from point sets is an essential component in many computer vision\nand machine learning applications. Native, unordered, and permutation invariant\nset structure space is challenging to model, particularly for point set\nclassification under spatial deformations. Here we propose a framework for\nclassifying point sets experiencing certain types of spatial deformations, with\na particular emphasis on datasets featuring affine deformations. Our approach\nemploys the Linear Optimal Transport (LOT) transform to obtain a linear\nembedding of set-structured data. Utilizing the mathematical properties of the\nLOT transform, we demonstrate its capacity to accommodate variations in point\nsets by constructing a convex data space, effectively simplifying point set\nclassification problems. Our method, which employs a nearest-subspace algorithm\nin the LOT space, demonstrates label efficiency, non-iterative behavior, and\nrequires no hyper-parameter tuning. It achieves competitive accuracies compared\nto state-of-the-art methods across various point set classification tasks.\nFurthermore, our approach exhibits robustness in out-of-distribution scenarios\nwhere training and test distributions vary in terms of deformation magnitudes.\n","authors":["Mohammad Shifat E Rabbi","Naqib Sad Pathan","Shiying Li","Yan Zhuang","Abu Hasnat Mohammad Rubaiyat","Gustavo K Rohde"],"pdf_url":"https://arxiv.org/pdf/2403.10015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02992v2","updated":"2024-03-15T04:38:21Z","published":"2023-10-04T17:28:44Z","title":"Kosmos-G: Generating Images in Context with Multimodal Large Language\n Models","summary":" Recent advancements in subject-driven image generation have made significant\nstrides. However, current methods still fall short in diverse application\nscenarios, as they require test-time tuning and cannot accept interleaved\nmulti-image and text input. These limitations keep them far from the ultimate\ngoal of \"image as a foreign language in image generation.\" This paper presents\nKosmos-G, a model that leverages the advanced multimodal perception\ncapabilities of Multimodal Large Language Models (MLLMs) to tackle the\naforementioned challenge. Our approach aligns the output space of MLLM with\nCLIP using the textual modality as an anchor and performs compositional\ninstruction tuning on curated data. Kosmos-G demonstrates an impressive\ncapability of zero-shot subject-driven generation with interleaved multi-image\nand text input. Notably, the score distillation instruction tuning requires no\nmodifications to the image decoder. This allows for a seamless substitution of\nCLIP and effortless integration with a myriad of U-Net techniques ranging from\nfine-grained controls to personalized image decoder variants. 
We posit Kosmos-G\nas an initial attempt towards the goal of \"image as a foreign language in image\ngeneration.\" The code can be found at https://aka.ms/Kosmos-G\n","authors":["Xichen Pan","Li Dong","Shaohan Huang","Zhiliang Peng","Wenhu Chen","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2310.02992v2.pdf","comment":"Code: https://aka.ms/Kosmos-G Project Page:\n https://xichenpan.github.io/kosmosg"},{"id":"http://arxiv.org/abs/2312.02521v2","updated":"2024-03-15T04:37:32Z","published":"2023-12-05T06:04:16Z","title":"Retrieving Conditions from Reference Images for Diffusion Models","summary":" Newly developed diffusion-based techniques have showcased phenomenal\nabilities in producing a wide range of high-quality images, sparking\nconsiderable interest in various applications. A prevalent scenario is to\ngenerate new images based on a subject from reference images. This subject\ncould be face identity for styled avatars, body and clothing for virtual try-on\nand so on. Satisfying this requirement is evolving into a field called\nSubject-Driven Generation. In this paper, we consider Subject-Driven Generation\nas a unified retrieval problem with diffusion models. We introduce a novel\ndiffusion model architecture, named RetriNet, designed to address and solve\nthese problems by retrieving subject attributes from reference images\nprecisely, and filtering out irrelevant information. RetriNet demonstrates\nimpressive performance when compared to existing state-of-the-art approaches in\nface generation. We further propose a research- and iteration-friendly dataset,\nRetriBooru, to study a more difficult problem, concept composition. Finally, to\nbetter evaluate alignment between similarity and diversity or measure diversity\nthat has been previously unaccounted for, we introduce a novel class of\nmetrics named Similarity Weighted Diversity (SWD).\n","authors":["Haoran Tang","Xin Zhou","Jieren Deng","Zhihong Pan","Hao Tian","Pratik Chaudhari"],"pdf_url":"https://arxiv.org/pdf/2312.02521v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10012v1","updated":"2024-03-15T04:35:25Z","published":"2024-03-15T04:35:25Z","title":"Real-World Computational Aberration Correction via Quantized\n Domain-Mixing Representation","summary":" Relying on paired synthetic data, existing learning-based Computational\nAberration Correction (CAC) methods are confronted with the intricate and\nmultifaceted synthetic-to-real domain gap, which leads to suboptimal\nperformance in real-world applications. In this paper, in contrast to improving\nthe simulation pipeline, we deliver a novel insight into real-world CAC from\nthe perspective of Unsupervised Domain Adaptation (UDA). By incorporating\nreadily accessible unpaired real-world data into training, we formalize the\nDomain Adaptive CAC (DACAC) task, and then introduce a comprehensive Real-world\naberrated images (Realab) dataset to benchmark it. The setup task presents a\nformidable challenge due to the intricacy of understanding the target\naberration domain. To this end, we propose a novel Quantized Domain-Mixing\nRepresentation (QDMR) framework as a potent solution to the issue. 
QDMR adapts\nthe CAC model to the target domain from three key aspects: (1) reconstructing\naberrated images of both domains by a VQGAN to learn a Domain-Mixing Codebook\n(DMC) which characterizes the degradation-aware priors; (2) modulating the deep\nfeatures in CAC model with DMC to transfer the target domain knowledge; and (3)\nleveraging the trained VQGAN to generate pseudo target aberrated images from\nthe source ones for convincing target domain supervision. Extensive experiments\non both synthetic and real-world benchmarks reveal that the models with QDMR\nconsistently surpass the competitive methods in mitigating the\nsynthetic-to-real gap, which produces visually pleasant real-world CAC results\nwith fewer artifacts. Codes and datasets will be made publicly available.\n","authors":["Qi Jiang","Zhonghua Yi","Shaohua Gao","Yao Gao","Xiaolong Qian","Hao Shi","Lei Sun","Zhijie Xu","Kailun Yang","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2403.10012v1.pdf","comment":"Codes and datasets will be made publicly available at\n https://github.com/zju-jiangqi/QDMR"},{"id":"http://arxiv.org/abs/2403.10009v1","updated":"2024-03-15T04:27:55Z","published":"2024-03-15T04:27:55Z","title":"Cardiac Magnetic Resonance 2D+T Short- and Long-axis Segmentation via\n Spatio-temporal SAM Adaptation","summary":" Accurate 2D+T myocardium segmentation in cine cardiac magnetic resonance\n(CMR) scans is essential to analyze LV motion throughout the cardiac cycle\ncomprehensively. The Segment Anything Model (SAM), known for its accurate\nsegmentation and zero-shot generalization, has not yet been tailored for CMR\n2D+T segmentation. We therefore introduce CMR2D+T-SAM, a novel approach to\nadapt SAM for CMR 2D+T segmentation using spatio-temporal adaption. This\napproach also incorporates a U-Net framework for multi-scale feature\nextraction, as well as text prompts for accurate segmentation on both\nshort-axis (SAX) and long-axis (LAX) views using a single model. CMR2D+T-SAM\noutperforms existing deep learning methods on the STACOM2011 dataset, achieving\na myocardium Dice score of 0.885 and a Hausdorff distance (HD) of 2.900 pixels.\nIt also demonstrates superior zero-shot generalization on the ACDC dataset with\na Dice score of 0.840 and a HD of 4.076 pixels.\n","authors":["Zhennong Chen","Sekeun Kim","Hui Ren","Quanzheng Li","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2403.10009v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2304.09783v3","updated":"2024-03-15T04:13:51Z","published":"2023-04-19T16:09:59Z","title":"Application of attention-based Siamese composite neural network in\n medical image recognition","summary":" Medical image recognition often faces the problem of insufficient data in\npractical applications. Image recognition and processing under few-shot\nconditions will produce overfitting, low recognition accuracy, low reliability\nand insufficient robustness. It is often the case that the difference of\ncharacteristics is subtle, and the recognition is affected by perspectives,\nbackground, occlusion and other factors, which increases the difficulty of\nrecognition. Furthermore, in fine-grained images, the few-shot problem leads to\ninsufficient useful feature information in the images. 
Considering the\ncharacteristics of few-shot and fine-grained image recognition, this study\nestablishes a recognition model based on attention and a Siamese neural network.\nAiming at the problem of few-shot samples, a Siamese neural network suitable\nfor the classification model is proposed. The attention-based neural network is\nused as the main network to improve the classification effect. Covid-19 lung\nsamples have been selected for testing the model. The results show that the\nfewer the image samples, the more obvious the advantage over\nan ordinary neural network.\n","authors":["Zihao Huang","Yue Wang","Weixing Xin","Xingtong Lin","Huizhen Li","Haowen Chen","Yizhen Lao","Xia Chen"],"pdf_url":"https://arxiv.org/pdf/2304.09783v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10004v1","updated":"2024-03-15T04:02:31Z","published":"2024-03-15T04:02:31Z","title":"ST-LDM: A Universal Framework for Text-Grounded Object Generation in\n Real Images","summary":" We present a novel image editing scenario termed Text-grounded Object\nGeneration (TOG), defined as generating a new object in the real image\nspatially conditioned by textual descriptions. Existing diffusion models\nexhibit limitations of spatial perception in complex real-world scenes, relying\non additional modalities to enforce constraints, and TOG imposes heightened\nchallenges on scene comprehension under the weak supervision of linguistic\ninformation. We propose a universal framework, ST-LDM, based on Swin-Transformer,\nwhich can be integrated into any latent diffusion model with training-free\nbackward guidance. ST-LDM encompasses a global-perceptual autoencoder with\nadaptable compression scales and hierarchical visual features, in parallel with a\ndeformable multimodal transformer to generate region-wise guidance for the\nsubsequent denoising process. We transcend the limitation of traditional\nattention mechanisms that only focus on existing visual features by introducing\ndeformable feature alignment to hierarchically refine spatial positioning fused\nwith multi-scale visual and linguistic information. Extensive experiments\ndemonstrate that our model enhances the localization of attention mechanisms\nwhile preserving the generative capabilities inherent to diffusion models.\n","authors":["Xiangtian Xue","Jiasong Wu","Youyong Kong","Lotfi Senhadji","Huazhong Shu"],"pdf_url":"https://arxiv.org/pdf/2403.10004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10001v1","updated":"2024-03-15T03:58:17Z","published":"2024-03-15T03:58:17Z","title":"Visual Foundation Models Boost Cross-Modal Unsupervised Domain\n Adaptation for 3D Semantic Segmentation","summary":" Unsupervised domain adaptation (UDA) is vital for alleviating the workload of\nlabeling 3D point cloud data and mitigating the absence of labels when facing a\nnewly defined domain. Various methods of utilizing images to enhance the\nperformance of cross-domain 3D segmentation have recently emerged. However, the\npseudo labels, which are generated from models trained on the source domain and\nprovide additional supervised signals for the unseen domain, are inadequate\nwhen utilized for 3D segmentation due to their inherent noisiness and\nconsequently restrict the accuracy of neural networks. With the advent of 2D\nvisual foundation models (VFMs) and their abundant knowledge prior, we propose\na novel pipeline VFMSeg to further enhance the cross-modal unsupervised domain\nadaptation framework by leveraging these models. 
In this work, we study how to\nharness the knowledge priors learned by VFMs to produce more accurate labels\nfor unlabeled target domains and improve overall performance. We first utilize\na multi-modal VFM, which is pre-trained on large scale image-text pairs, to\nprovide supervised labels (VFM-PL) for images and point clouds from the target\ndomain. Then, another VFM trained on fine-grained 2D masks is adopted to guide\nthe generation of semantically augmented images and point clouds to enhance the\nperformance of neural networks, which mix the data from source and target\ndomains like view frustums (FrustumMixing). Finally, we merge class-wise\nprediction across modalities to produce more accurate annotations for unlabeled\ntarget domains. Our method is evaluated on various autonomous driving datasets\nand the results demonstrate a significant improvement for 3D segmentation task.\n","authors":["Jingyi Xu","Weidong Yang","Lingdong Kong","Youquan Liu","Rui Zhang","Qingyuan Zhou","Ben Fei"],"pdf_url":"https://arxiv.org/pdf/2403.10001v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.09998v1","updated":"2024-03-15T03:45:10Z","published":"2024-03-15T03:45:10Z","title":"FBPT: A Fully Binary Point Transformer","summary":" This paper presents a novel Fully Binary Point Cloud Transformer (FBPT) model\nwhich has the potential to be widely applied and expanded in the fields of\nrobotics and mobile devices. By compressing the weights and activations of a\n32-bit full-precision network to 1-bit binary values, the proposed binary point\ncloud Transformer network significantly reduces the storage footprint and\ncomputational resource requirements of neural network models for point cloud\nprocessing tasks, compared to full-precision point cloud networks. However,\nachieving a fully binary point cloud Transformer network, where all parts\nexcept the modules specific to the task are binary, poses challenges and\nbottlenecks in quantizing the activations of Q, K, V and self-attention in the\nattention module, as they do not adhere to simple probability distributions and\ncan vary with input data. Furthermore, in our network, the binary attention\nmodule undergoes a degradation of the self-attention module due to the uniform\ndistribution that occurs after the softmax operation. The primary focus of this\npaper is on addressing the performance degradation issue caused by the use of\nbinary point cloud Transformer modules. We propose a novel binarization\nmechanism called dynamic-static hybridization. Specifically, our approach\ncombines static binarization of the overall network model with fine granularity\ndynamic binarization of data-sensitive components. Furthermore, we make use of\na novel hierarchical training scheme to obtain the optimal model and\nbinarization parameters. These above improvements allow the proposed\nbinarization method to outperform binarization methods applied to convolution\nneural networks when used in point cloud Transformer structures. To demonstrate\nthe superiority of our algorithm, we conducted experiments on two different\ntasks: point cloud classification and place recognition.\n","authors":["Zhixing Hou","Yuzhang Shang","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2403.09998v1.pdf","comment":"Accepted to ICRA 2024. 
arXiv admin note: substantial text overlap\n with arXiv:2303.01166"},{"id":"http://arxiv.org/abs/2403.09996v1","updated":"2024-03-15T03:42:38Z","published":"2024-03-15T03:42:38Z","title":"MEDPNet: Achieving High-Precision Adaptive Registration for Complex Die\n Castings","summary":" Due to their complex spatial structure and diverse geometric features,\nachieving high-precision and robust point cloud registration for complex die\ncastings has been a significant challenge in the die-casting industry. Existing\npoint cloud registration methods primarily optimize network models using\nwell-established high-quality datasets, often neglecting practical application\nin real scenarios. To address this gap, this paper proposes a high-precision\nadaptive registration method called Multiscale Efficient Deep Closest Point\n(MEDPNet) and introduces a die-casting point cloud dataset, DieCastCloud,\nspecifically designed to tackle the challenges of point cloud registration in\nthe die-casting industry. The MEDPNet method performs coarse die-casting point\ncloud data registration using the Efficient-DCP method, followed by precision\nregistration using the Multiscale feature fusion dual-channel registration\n(MDR) method. We enhance the modeling capability and computational efficiency\nof the model by replacing the attention mechanism of the Transformer in DCP\nwith Efficient Attention and implementing a collaborative scale mechanism\nthrough the combination of serial and parallel blocks. Additionally, we propose\nthe MDR method, which utilizes multilayer perceptrons (MLP), Normal\nDistributions Transform (NDT), and Iterative Closest Point (ICP) to achieve\nlearnable adaptive fusion, enabling high-precision, scalable, and\nnoise-resistant global point cloud registration. Our proposed method\ndemonstrates excellent performance compared to state-of-the-art geometric and\nlearning-based registration methods when applied to complex die-casting point\ncloud data.\n","authors":["Yu Du","Yu Song","Ce Guo","Xiaojing Tian","Dong Liu","Ming Cong"],"pdf_url":"https://arxiv.org/pdf/2403.09996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09993v1","updated":"2024-03-15T03:27:39Z","published":"2024-03-15T03:27:39Z","title":"TRG-Net: An Interpretable and Controllable Rain Generator","summary":" Exploring and modeling the rain generation mechanism is critical for augmenting\npaired data to ease training of rainy image processing models. For this\ntask, this study proposes a novel deep learning based rain generator, which\nfully takes the physical generation mechanism underlying rains into\nconsideration and well encodes the learning of the fundamental rain factors\n(i.e., shape, orientation, length, width and sparsity) explicitly into the deep\nnetwork. Its significance lies in that the generator not only elaborately\ndesigns essential elements of the rain to simulate expected rains, like\nconventional artificial strategies, but also finely adapts to complicated and\ndiverse practical rainy images, like deep learning methods. By rationally\nadopting the filter parameterization technique, we achieve, for the first time, a deep\nnetwork that is finely controllable with respect to rain factors and able to\nlearn the distribution of these factors purely from data. Our unpaired\ngeneration experiments demonstrate that the rain generated by the proposed rain\ngenerator is not only of higher quality, but also more effective for deraining\nand downstream tasks compared to current state-of-the-art rain generation\nmethods. 
Besides, the paired data augmentation experiments, including both\nin-distribution and out-of-distribution (OOD), further validate the diversity\nof samples generated by our model for in-distribution deraining and OOD\ngeneralization tasks.\n","authors":["Zhiqiang Pang","Hong Wang","Qi Xie","Deyu Meng","Zongben Xu"],"pdf_url":"https://arxiv.org/pdf/2403.09993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08557v2","updated":"2024-03-15T03:26:20Z","published":"2024-03-13T14:08:45Z","title":"Occluded Cloth-Changing Person Re-Identification","summary":" Cloth-changing person re-identification aims to retrieve and identify\nspecific pedestrians by using cloth-unrelated features in person\ncloth-changing scenarios. However, pedestrian images captured by surveillance\nprobes usually contain occlusions in real-world scenarios. The performance of\nexisting cloth-changing person re-identification methods is significantly\ndegraded due to the reduction of discriminative cloth-unrelated features caused\nby occlusion. We define cloth-changing person re-identification in occlusion\nscenarios as occluded cloth-changing person re-identification (Occ-CC-ReID),\nand to the best of our knowledge, we are the first to propose occluded\ncloth-changing person re-identification as a new task. We constructed two\noccluded cloth-changing person re-identification datasets: Occluded-PRCC and\nOccluded-LTCC. The datasets can be obtained from the following link:\nhttps://github.com/1024AILab/Occluded-Cloth-Changing-Person-Re-Identification.\n","authors":["Zhihao Chen","Yiyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2403.08557v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08247v2","updated":"2024-03-15T03:22:05Z","published":"2024-03-13T05:01:37Z","title":"A Dual-domain Regularization Method for Ring Artifact Removal of X-ray\n CT","summary":" Ring artifacts in computed tomography images, arising from the undesirable\nresponses of detector units, significantly degrade image quality and diagnostic\nreliability. To address this challenge, we propose a dual-domain regularization\nmodel to effectively remove ring artifacts, while maintaining the integrity of\nthe original CT image. The proposed model corrects the vertical stripe\nartifacts on the sinogram by innovatively updating the response inconsistency\ncompensation coefficients of detector units, which is achieved by employing the\ngroup sparse constraint and the projection-view direction sparse constraint on\nthe stripe artifacts. Simultaneously, we apply the sparse constraint on the\nreconstructed image to further rectify ring artifacts in the image domain.\nThe key advantage of the proposed method lies in considering the relationship\nbetween the response inconsistency compensation coefficients of the detector\nunits and the projection views, which enables a more accurate correction of the\nresponse of the detector units. An alternating minimization method is designed\nto solve the model. 
Comparative experiments on real photon counting detector\ndata demonstrate that the proposed method not only surpasses existing methods\nin removing ring artifacts but also excels in preserving structural details and\nimage fidelity.\n","authors":["Hongyang Zhu","Xin Lu","Yanwei Qin","Xinran Yu","Tianjiao Sun","Yunsong Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.08247v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08006v2","updated":"2024-03-15T03:16:30Z","published":"2023-09-14T19:33:11Z","title":"Facial Kinship Verification from remote photoplethysmography","summary":" Facial Kinship Verification (FKV) aims at automatically determining whether\ntwo subjects have a kinship relation based on human faces. It has potential\napplications in finding missing children and social media analysis. Traditional\nFKV faces challenges as it is vulnerable to spoof attacks and raises privacy\nissues. In this paper, we explore for the first time the FKV with vital\nbio-signals, focusing on remote Photoplethysmography (rPPG). rPPG signals are\nextracted from facial videos, resulting in a one-dimensional signal that\nmeasures the changes in visible light reflection emitted to and detected from\nthe skin caused by the heartbeat. Specifically, in this paper, we employed a\nstraightforward one-dimensional Convolutional Neural Network (1DCNN) with a\n1DCNN-Attention module and kinship contrastive loss to learn the kin similarity\nfrom rPPGs. The network takes multiple rPPG signals extracted from various\nfacial Regions of Interest (ROIs) as inputs. Additionally, the 1DCNN attention\nmodule is designed to learn and capture the discriminative kin features from\nfeature embeddings. Finally, we demonstrate the feasibility of rPPG to detect\nkinship with the experiment evaluation on the UvANEMO Smile Database from\ndifferent kin relations.\n","authors":["Xiaoting Wu","Xiaoyi Feng","Constantino Álvarez Casado","Lili Liu","Miguel Bordallo López"],"pdf_url":"https://arxiv.org/pdf/2309.08006v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13800v3","updated":"2024-03-15T03:15:26Z","published":"2023-03-24T04:45:45Z","title":"Aligning Step-by-Step Instructional Diagrams to Video Demonstrations","summary":" Multimodal alignment facilitates the retrieval of instances from one modality\nwhen queried using another. In this paper, we consider a novel setting where\nsuch an alignment is between (i) instruction steps that are depicted as\nassembly diagrams (commonly seen in Ikea assembly manuals) and (ii) video\nsegments from in-the-wild videos; these videos comprising an enactment of the\nassembly actions in the real world. To learn this alignment, we introduce a\nnovel supervised contrastive learning method that learns to align videos with\nthe subtle details in the assembly diagrams, guided by a set of novel losses.\nTo study this problem and demonstrate the effectiveness of our method, we\nintroduce a novel dataset: IAW for Ikea assembly in the wild consisting of 183\nhours of videos from diverse furniture assembly collections and nearly 8,300\nillustrations from their associated instruction manuals and annotated for their\nground truth alignments. We define two tasks on this dataset: First, nearest\nneighbor retrieval between video segments and illustrations, and, second,\nalignment of instruction steps and the segments for each video. 
Extensive\nexperiments on IAW demonstrate superior performances of our approach against\nalternatives.\n","authors":["Jiahao Zhang","Anoop Cherian","Yanbin Liu","Yizhak Ben-Shabat","Cristian Rodriguez","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2303.13800v3.pdf","comment":"Project website:\n https://academic.davidz.cn/en/publication/zhang-cvpr-2023/"},{"id":"http://arxiv.org/abs/2403.09108v2","updated":"2024-03-15T03:09:32Z","published":"2024-03-14T05:01:31Z","title":"CardioCaps: Attention-based Capsule Network for Class-Imbalanced\n Echocardiogram Classification","summary":" Capsule Neural Networks (CapsNets) is a novel architecture that utilizes\nvector-wise representations formed by multiple neurons. Specifically, the\nDynamic Routing CapsNets (DR-CapsNets) employ an affine matrix and dynamic\nrouting mechanism to train capsules and acquire translation-equivariance\nproperties, enhancing its robustness compared to traditional Convolutional\nNeural Networks (CNNs). Echocardiograms, which capture moving images of the\nheart, present unique challenges for traditional image classification methods.\nIn this paper, we explore the potential of DR-CapsNets and propose CardioCaps,\na novel attention-based DR-CapsNet architecture for class-imbalanced\nechocardiogram classification. CardioCaps comprises two key components: a\nweighted margin loss incorporating a regression auxiliary loss and an attention\nmechanism. First, the weighted margin loss prioritizes positive cases,\nsupplemented by an auxiliary loss function based on the Ejection Fraction (EF)\nregression task, a crucial measure of cardiac function. This approach enhances\nthe model's resilience in the face of class imbalance. Second, recognizing the\nquadratic complexity of dynamic routing leading to training inefficiencies, we\nadopt the attention mechanism as a more computationally efficient alternative.\nOur results demonstrate that CardioCaps surpasses traditional machine learning\nbaseline methods, including Logistic Regression, Random Forest, and XGBoost\nwith sampling methods and a class weight matrix. Furthermore, CardioCaps\noutperforms other deep learning baseline methods such as CNNs, ResNets, U-Nets,\nand ViTs, as well as advanced CapsNets methods such as EM-CapsNets and\nEfficient-CapsNets. Notably, our model demonstrates robustness to class\nimbalance, achieving high precision even in datasets with a substantial\nproportion of negative cases.\n","authors":["Hyunkyung Han","Jihyeon Seong","Jaesik Choi"],"pdf_url":"https://arxiv.org/pdf/2403.09108v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.09981v1","updated":"2024-03-15T02:57:20Z","published":"2024-03-15T02:57:20Z","title":"Controllable Text-to-3D Generation via Surface-Aligned Gaussian\n Splatting","summary":" While text-to-3D and image-to-3D generation tasks have received considerable\nattention, one important but under-explored field between them is controllable\ntext-to-3D generation, which we mainly focus on in this work. To address this\ntask, 1) we introduce Multi-view ControlNet (MVControl), a novel neural network\narchitecture designed to enhance existing pre-trained multi-view diffusion\nmodels by integrating additional input conditions, such as edge, depth, normal,\nand scribble maps. Our innovation lies in the introduction of a conditioning\nmodule that controls the base diffusion model using both local and global\nembeddings, which are computed from the input condition images and camera\nposes. 
Once trained, MVControl is able to offer 3D diffusion guidance for\noptimization-based 3D generation. And, 2) we propose an efficient multi-stage\n3D generation pipeline that leverages the benefits of recent large\nreconstruction models and the score distillation algorithm. Building upon our\nMVControl architecture, we employ a unique hybrid diffusion guidance method to\ndirect the optimization process. In pursuit of efficiency, we adopt 3D\nGaussians as our representation instead of the commonly used implicit\nrepresentations. We also pioneer the use of SuGaR, a hybrid representation that\nbinds Gaussians to mesh triangle faces. This approach alleviates the issue of\npoor geometry in 3D Gaussians and enables the direct sculpting of fine-grained\ngeometry on the mesh. Extensive experiments demonstrate that our method\nachieves robust generalization and enables the controllable generation of\nhigh-quality 3D content.\n","authors":["Zhiqi Li","Yiming Chen","Lingzhe Zhao","Peidong Liu"],"pdf_url":"https://arxiv.org/pdf/2403.09981v1.pdf","comment":"Project page: https://lizhiqi49.github.io/MVControl/"},{"id":"http://arxiv.org/abs/2403.09977v1","updated":"2024-03-15T02:48:47Z","published":"2024-03-15T02:48:47Z","title":"EfficientVMamba: Atrous Selective Scan for Light Weight Visual Mamba","summary":" Prior efforts in light-weight model development mainly centered on CNN and\nTransformer-based designs yet faced persistent challenges. CNNs, adept at local\nfeature extraction, compromise resolution, while Transformers offer global reach\nbut escalate computational demands to $\\mathcal{O}(N^2)$. This ongoing trade-off\nbetween accuracy and efficiency remains a significant hurdle. Recently, state\nspace models (SSMs), such as Mamba, have shown outstanding performance and\ncompetitiveness in various tasks such as language modeling and computer vision,\nwhile reducing the time complexity of global information extraction to\n$\\mathcal{O}(N)$. Inspired by this, this work proposes to explore the potential\nof visual state space models in light-weight model design and introduces a novel\nefficient model variant dubbed EfficientVMamba. Concretely, our EfficientVMamba\nintegrates an atrous-based selective scan approach by efficient skip sampling,\nconstituting building blocks designed to harness both global and local\nrepresentational features. Additionally, we investigate the integration between\nSSM blocks and convolutions, and introduce an efficient visual state space\nblock combined with an additional convolution branch, which further elevates the\nmodel performance. Experimental results show that EfficientVMamba scales down\nthe computational complexity while yielding competitive results across a variety\nof vision tasks. For example, our EfficientVMamba-S with $1.3$G FLOPs improves\nVim-Ti with $1.5$G FLOPs by a large margin of $5.6\\%$ accuracy on ImageNet.\nCode is available at: \\url{https://github.com/TerryPei/EfficientVMamba}.\n","authors":["Xiaohuan Pei","Tao Huang","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.09977v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09976v1","updated":"2024-03-15T02:46:19Z","published":"2024-03-15T02:46:19Z","title":"AD3: Implicit Action is the Key for World Models to Distinguish the\n Diverse Visual Distractors","summary":" Model-based methods have significantly contributed to distinguishing\ntask-irrelevant distractors for visual control. 
However, prior research has\nprimarily focused on heterogeneous distractors like noisy background videos,\nleaving homogeneous distractors that closely resemble controllable agents\nlargely unexplored, which poses significant challenges to existing methods. To\ntackle this problem, we propose Implicit Action Generator (IAG) to learn the\nimplicit actions of visual distractors, and present a new algorithm named\nimplicit Action-informed Diverse visual Distractors Distinguisher (AD3), that\nleverages the action inferred by IAG to train separated world models. Implicit\nactions effectively capture the behavior of background distractors, aiding in\ndistinguishing the task-irrelevant components, and the agent can optimize the\npolicy within the task-relevant state space. Our method achieves superior\nperformance on various visual control tasks featuring both heterogeneous and\nhomogeneous distractors. The indispensable role of implicit actions learned by\nIAG is also empirically validated.\n","authors":["Yucen Wang","Shenghua Wan","Le Gan","Shuai Feng","De-Chuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2403.09976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09975v1","updated":"2024-03-15T02:42:28Z","published":"2024-03-15T02:42:28Z","title":"Skeleton-Based Human Action Recognition with Noisy Labels","summary":" Understanding human actions from body poses is critical for assistive robots\nsharing space with humans in order to make informed and safe decisions about\nthe next interaction. However, precise temporal localization and annotation of\nactivity sequences is time-consuming and the resulting labels are often noisy.\nIf not effectively addressed, label noise negatively affects the model's\ntraining, resulting in lower recognition quality. Despite its importance,\naddressing label noise for skeleton-based action recognition has been\noverlooked so far. In this study, we bridge this gap by implementing a\nframework that augments well-established skeleton-based human action\nrecognition methods with label-denoising strategies from various research areas\nto serve as the initial benchmark. Observations reveal that these baselines\nyield only marginal performance when dealing with sparse skeleton data.\nConsequently, we introduce a novel methodology, NoiseEraSAR, which integrates\nglobal sample selection, co-teaching, and Cross-Modal Mixture-of-Experts\n(CM-MOE) strategies, aimed at mitigating the adverse impacts of label noise.\nOur proposed approach demonstrates better performance on the established\nbenchmark, setting new state-of-the-art standards. The source code for this\nstudy will be made accessible at https://github.com/xuyizdby/NoiseEraSAR.\n","authors":["Yi Xu","Kunyu Peng","Di Wen","Ruiping Liu","Junwei Zheng","Yufan Chen","Jiaming Zhang","Alina Roitberg","Kailun Yang","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2403.09975v1.pdf","comment":"The source code will be made accessible at\n https://github.com/xuyizdby/NoiseEraSAR"},{"id":"http://arxiv.org/abs/2403.09974v1","updated":"2024-03-15T02:40:13Z","published":"2024-03-15T02:40:13Z","title":"GET: Unlocking the Multi-modal Potential of CLIP for Generalized\n Category Discovery","summary":" Given unlabelled datasets containing both old and new categories, generalized\ncategory discovery (GCD) aims to accurately discover new classes while\ncorrectly classifying old classes, leveraging the class concepts learned from\nlabeled samples. 
Current GCD methods only use a single visual modality of\ninformation, resulting in poor classification of visually similar classes.\nThough certain classes are visually confused, their text information might be\ndistinct, motivating us to introduce text information into the GCD task.\nHowever, the lack of class names for unlabelled data makes it impractical to\nutilize text information. To tackle this challenging problem, in this paper, we\npropose a Text Embedding Synthesizer (TES) to generate pseudo text embeddings\nfor unlabelled samples. Specifically, our TES leverages the property that CLIP\ncan generate aligned vision-language features, converting visual embeddings\ninto tokens of CLIP's text encoder to generate pseudo text embeddings.\nBesides, we employ a dual-branch framework; through the joint learning and\ninstance consistency of different modality branches, visual and semantic\ninformation mutually enhance each other, promoting the interaction and fusion\nof visual and text embedding space. Our method unlocks the multi-modal\npotentials of CLIP and outperforms the baseline methods by a large margin on\nall GCD benchmarks, achieving a new state of the art. The code will be released\nat \\url{https://github.com/enguangW/GET}.\n","authors":["Enguang Wang","Zhimao Peng","Zhengyuan Xie","Xialei Liu","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2403.09974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09973v1","updated":"2024-03-15T02:39:44Z","published":"2024-03-15T02:39:44Z","title":"Den-SOFT: Dense Space-Oriented Light Field DataseT for 6-DOF Immersive\n Experience","summary":" We have built a custom mobile multi-camera large-space dense light field\ncapture system, which provides a series of high-quality and sufficiently dense\nlight field images for various scenarios. Our aim is to contribute to the\ndevelopment of popular 3D scene reconstruction algorithms such as IBRnet, NeRF,\nand 3D Gaussian splatting. More importantly, the collected dataset, which is\nmuch denser than existing datasets, may also inspire space-oriented light field\nreconstruction, which is potentially different from object-centric 3D\nreconstruction, for immersive VR/AR experiences. We utilized a total of 40\nGoPro 10 cameras, capturing images of 5k resolution. The number of photos\ncaptured for each scene is no less than 1000, and the average density (view\nnumber within a unit sphere) is 134.68. It is also worth noting that our system\nis capable of efficiently capturing large outdoor scenes. Addressing the\ncurrent lack of large-space and dense light field datasets, we made efforts to\ninclude elements such as sky, reflections, lights and shadows that are of\ninterest to researchers in the field of 3D reconstruction during the data\ncapture process. Finally, we validated the effectiveness of our provided\ndataset on three popular algorithms and also integrated the reconstructed 3DGS\nresults into the Unity engine, demonstrating the potential of utilizing our\ndatasets to enhance the realism of virtual reality (VR) and create feasible\ninteractive spaces. 
The dataset is available at our project website.\n","authors":["Xiaohang Yu","Zhengxian Yang","Shi Pan","Yuqi Han","Haoxiang Wang","Jun Zhang","Shi Yan","Borong Lin","Lei Yang","Tao Yu","Lu Fang"],"pdf_url":"https://arxiv.org/pdf/2403.09973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10574v1","updated":"2024-03-15T02:39:26Z","published":"2024-03-15T02:39:26Z","title":"Autoregressive Queries for Adaptive Tracking with\n Spatio-Temporal Transformers","summary":" Rich spatio-temporal information is crucial for capturing the complicated\ntarget appearance variations in visual tracking. However, most top-performing\ntracking algorithms rely on many hand-crafted components for spatio-temporal\ninformation aggregation. Consequently, the spatio-temporal information is far\nfrom being fully explored. To alleviate this issue, we propose an adaptive\ntracker with spatio-temporal transformers (named AQATrack), which adopts simple\nautoregressive queries to effectively learn spatio-temporal information without\nmany hand-designed components. Firstly, we introduce a set of learnable and\nautoregressive queries to capture the instantaneous target appearance changes\nin a sliding window fashion. Then, we design a novel attention mechanism for\nthe interaction of existing queries to generate a new query in the current frame.\nFinally, based on the initial target template and learnt autoregressive\nqueries, a spatio-temporal information fusion module (STM) is designed for\nspatio-temporal information aggregation to locate a target object. Benefiting from\nthe STM, we can effectively combine the static appearance and instantaneous\nchanges to guide robust tracking. Extensive experiments show that our method\nsignificantly improves the tracker's performance on six popular tracking\nbenchmarks: LaSOT, LaSOText, TrackingNet, GOT-10k, TNL2K, and UAV123.\n","authors":["Jinxia Xie","Bineng Zhong","Zhiyi Mo","Shengping Zhang","Liangtao Shi","Shuxiang Song","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2403.10574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10573v1","updated":"2024-03-15T02:35:36Z","published":"2024-03-15T02:35:36Z","title":"Medical Unlearnable Examples: Securing Medical Data from Unauthorized\n Training via Sparsity-Aware Local Masking","summary":" With the rapid growth of artificial intelligence (AI) in healthcare, there\nhas been a significant increase in the generation and storage of sensitive\nmedical data. This abundance of data, in turn, has propelled the advancement of\nmedical AI technologies. However, concerns about unauthorized data\nexploitation, such as training commercial AI models, often deter researchers\nfrom making their invaluable datasets publicly available. In response to the\nneed to protect this hard-to-collect data while still encouraging medical\ninstitutions to share it, one promising solution is to introduce imperceptible\nnoise into the data. This method aims to safeguard the data against\nunauthorized training by inducing degradation in model generalization. Although\nexisting methods have shown commendable data protection capabilities in general\ndomains, they tend to fall short when applied to biomedical data, mainly due to\ntheir failure to account for the sparse nature of medical images. To address\nthis problem, we propose the Sparsity-Aware Local Masking (SALM) method, a\nnovel approach that selectively perturbs significant pixel regions rather than\nthe entire image as previous strategies have done. 
This simple-yet-effective\napproach significantly reduces the perturbation search space by concentrating\non local regions, thereby improving both the efficiency and effectiveness of\ndata protection for biomedical datasets characterized by sparse features.\nBesides, we have demonstrated that SALM maintains the essential characteristics\nof the data, ensuring its clinical utility remains uncompromised. Our extensive\nexperiments across various datasets and model architectures demonstrate that\nSALM effectively prevents unauthorized training of deep-learning models and\noutperforms previous state-of-the-art data protection methods.\n","authors":["Weixiang Sun","Yixin Liu","Zhiling Yan","Kaidi Xu","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2403.10573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03050v3","updated":"2024-03-15T02:33:32Z","published":"2023-12-05T18:47:19Z","title":"HIG: Hierarchical Interlacement Graph Approach to Scene Graph Generation\n in Video Understanding","summary":" Visual interactivity understanding within visual scenes presents a\nsignificant challenge in computer vision. Existing methods focus on complex\ninteractivities while leveraging a simple relationship model. These methods,\nhowever, struggle with a diversity of appearance, situation, position,\ninteraction, and relation in videos. This limitation hinders the ability to\nfully comprehend the interplay within the complex visual dynamics of subjects.\nIn this paper, we delve into interactivities understanding within visual\ncontent by deriving scene graph representations from dense interactivities\namong humans and objects. To achieve this goal, we first present a new dataset\ncontaining Appearance-Situation-Position-Interaction-Relation predicates, named\nASPIRe, offering an extensive collection of videos marked by a wide range of\ninteractivities. Then, we propose a new approach named Hierarchical\nInterlacement Graph (HIG), which leverages a unified layer and graph within a\nhierarchical structure to provide deep insights into scene changes across five\ndistinct tasks. Our approach demonstrates superior performance to other methods\nthrough extensive experiments conducted in various scenarios.\n","authors":["Trong-Thuan Nguyen","Pha Nguyen","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2312.03050v3.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.01759v2","updated":"2024-03-15T02:22:07Z","published":"2024-03-04T06:25:26Z","title":"Open-world Machine Learning: A Review and New Outlooks","summary":" Machine learning has achieved remarkable success in many applications.\nHowever, existing studies are largely based on the closed-world assumption,\nwhich assumes that the environment is stationary, and the model is fixed once\ndeployed. In many real-world applications, this fundamental and rather naive\nassumption may not hold because an open environment is complex, dynamic, and\nfull of unknowns. In such cases, rejecting unknowns, discovering novelties, and\nthen incrementally learning them, could enable models to be safe and evolve\ncontinually as biological systems do. This paper provides a holistic view of\nopen-world machine learning by investigating unknown rejection, novel class\ndiscovery, and class-incremental learning in a unified paradigm. The\nchallenges, principles, and limitations of current methodologies are discussed\nin detail. Finally, we discuss several potential directions for future\nresearch. 
This paper aims to provide a comprehensive introduction to the\nemerging open-world machine learning paradigm, to help researchers build more\npowerful AI systems in their respective fields, and to promote the development\nof artificial general intelligence.\n","authors":["Fei Zhu","Shijie Ma","Zhen Cheng","Xu-Yao Zhang","Zhaoxiang Zhang","Cheng-Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2403.01759v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09964v1","updated":"2024-03-15T02:05:20Z","published":"2024-03-15T02:05:20Z","title":"Boundary Constraint-free Biomechanical Model-Based Surface Matching for\n Intraoperative Liver Deformation Correction","summary":" In image-guided liver surgery, 3D-3D non-rigid registration methods play a\ncrucial role in estimating the mapping between the preoperative model and the\nintraoperative surface represented as point clouds, addressing the challenge of\ntissue deformation. Typically, these methods incorporate a biomechanical model,\nrepresented as a finite element model (FEM), used to regularize a surface\nmatching term. This paper introduces a novel 3D-3D non-rigid registration\nmethod. In contrast to the preceding techniques, our method uniquely\nincorporates the FEM within the surface matching term itself, ensuring that the\nestimated deformation maintains geometric consistency throughout the\nregistration process. Additionally, we eliminate the need to determine\nzero-boundary conditions and applied force locations in the FEM. We achieve\nthis by integrating soft springs into the stiffness matrix and allowing forces\nto be distributed across the entire liver surface. To further improve\nrobustness, we introduce a regularization technique focused on the gradient of\nthe force magnitudes. This regularization imposes spatial smoothness and helps\nprevent the overfitting of irregular noise in intraoperative data. Optimization\nis achieved through an accelerated proximal gradient algorithm, further\nenhanced by our proposed method for determining the optimal step size. Our\nmethod is evaluated and compared to both a learning-based method and a\ntraditional method that features FEM regularization using data collected on our\ncustom-developed phantom, as well as two publicly available datasets. Our\nmethod consistently outperforms or is comparable to the baseline techniques.\nBoth the code and dataset will be made publicly available.\n","authors":["Zixin Yang","Richard Simon","Kelly Merrell","Cristian. A. Linte"],"pdf_url":"https://arxiv.org/pdf/2403.09964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08579v2","updated":"2024-03-15T02:02:21Z","published":"2023-10-12T17:59:34Z","title":"HyperHuman: Hyper-Realistic Human Generation with Latent Structural\n Diffusion","summary":" Despite significant advances in large-scale text-to-image models, achieving\nhyper-realistic human image generation remains a desirable yet unsolved task.\nExisting models like Stable Diffusion and DALL-E 2 tend to generate human\nimages with incoherent parts or unnatural poses. To tackle these challenges,\nour key insight is that human image is inherently structural over multiple\ngranularities, from the coarse-level body skeleton to fine-grained spatial\ngeometry. Therefore, capturing such correlations between the explicit\nappearance and latent structure in one model is essential to generate coherent\nand natural human images. To this end, we propose a unified framework,\nHyperHuman, that generates in-the-wild human images of high realism and diverse\nlayouts. 
Specifically, 1) we first build a large-scale human-centric dataset,\nnamed HumanVerse, which consists of 340M images with comprehensive annotations\nlike human pose, depth, and surface normal. 2) Next, we propose a Latent\nStructural Diffusion Model that simultaneously denoises the depth and surface\nnormal along with the synthesized RGB image. Our model enforces the joint\nlearning of image appearance, spatial relationship, and geometry in a unified\nnetwork, where each branch in the model complements to each other with both\nstructural awareness and textural richness. 3) Finally, to further boost the\nvisual quality, we propose a Structure-Guided Refiner to compose the predicted\nconditions for more detailed generation of higher resolution. Extensive\nexperiments demonstrate that our framework yields the state-of-the-art\nperformance, generating hyper-realistic human images under diverse scenarios.\nProject Page: https://snap-research.github.io/HyperHuman/\n","authors":["Xian Liu","Jian Ren","Aliaksandr Siarohin","Ivan Skorokhodov","Yanyu Li","Dahua Lin","Xihui Liu","Ziwei Liu","Sergey Tulyakov"],"pdf_url":"https://arxiv.org/pdf/2310.08579v2.pdf","comment":"Accepted by ICLR 2024, camera-ready version. Project Page:\n https://snap-research.github.io/HyperHuman/"},{"id":"http://arxiv.org/abs/2403.09962v1","updated":"2024-03-15T02:01:14Z","published":"2024-03-15T02:01:14Z","title":"ViTCN: Vision Transformer Contrastive Network For Reasoning","summary":" Machine learning models have achieved significant milestones in various\ndomains, for example, computer vision models have an exceptional result in\nobject recognition, and in natural language processing, where Large Language\nModels (LLM) like GPT can start a conversation with human-like proficiency.\nHowever, abstract reasoning remains a challenge for these models, Can AI really\nthinking like a human? still be a question yet to be answered. Raven\nProgressive Matrices (RPM) is a metric designed to assess human reasoning\ncapabilities. It presents a series of eight images as a problem set, where the\nparticipant should try to discover the underlying rules among these images and\nselect the most appropriate image from eight possible options that best\ncompletes the sequence. This task always be used to test human reasoning\nabilities and IQ. Zhang et al proposed a dataset called RAVEN which can be used\nto test Machine Learning model abstract reasoning ability. In this paper, we\npurposed Vision Transformer Contrastive Network which build on previous work\nwith the Contrastive Perceptual Inference network (CoPiNet), which set a new\nbenchmark for permutationinvariant models Raven Progressive Matrices by\nincorporating contrast effects from psychology, cognition, and education, and\nextends this foundation by leveraging the cutting-edge Vision Transformer\narchitecture. 
This integration aims to further refine the machine ability to\nprocess and reason about spatial-temporal information from pixel-level inputs\nand global wise features on RAVEN dataset.\n","authors":["Bo Song","Yuanhao Xu","Yichao Wu"],"pdf_url":"https://arxiv.org/pdf/2403.09962v1.pdf","comment":"5 pages, 2 figures , in proceeding of 5th International Seminar on\n Artificial Intelligence, Networking and Information Technology"},{"id":"http://arxiv.org/abs/2403.09303v2","updated":"2024-03-15T01:58:19Z","published":"2024-03-14T11:51:01Z","title":"Rethinking Autoencoders for Medical Anomaly Detection from A Theoretical\n Perspective","summary":" Medical anomaly detection aims to identify abnormal findings using only\nnormal training data, playing a crucial role in health screening and\nrecognizing rare diseases. Reconstruction-based methods, particularly those\nutilizing autoencoders (AEs), are dominant in this field. They work under the\nassumption that AEs trained on only normal data cannot reconstruct unseen\nabnormal regions well, thereby enabling the anomaly detection based on\nreconstruction errors. However, this assumption does not always hold due to the\nmismatch between the reconstruction training objective and the anomaly\ndetection task objective, rendering these methods theoretically unsound. This\nstudy focuses on providing a theoretical foundation for AE-based reconstruction\nmethods in anomaly detection. By leveraging information theory, we elucidate\nthe principles of these methods and reveal that the key to improving AE in\nanomaly detection lies in minimizing the information entropy of latent vectors.\nExperiments on four datasets with two image modalities validate the\neffectiveness of our theory. To the best of our knowledge, this is the first\neffort to theoretically clarify the principles and design philosophy of AE for\nanomaly detection. Code will be available upon acceptance.\n","authors":["Yu Cai","Hao Chen","Kwang-Ting Cheng"],"pdf_url":"https://arxiv.org/pdf/2403.09303v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08995v2","updated":"2024-03-15T01:40:23Z","published":"2024-03-13T23:27:31Z","title":"NTIRE 2023 Image Shadow Removal Challenge Technical Report: Team IIM_TTI","summary":" In this paper, we analyze and discuss ShadowFormer in preparation for the\nNTIRE2023 Shadow Removal Challenge [1], implementing five key improvements:\nimage alignment, the introduction of a perceptual quality loss function, the\nsemi-automatic annotation for shadow detection, joint learning of shadow\ndetection and removal, and the introduction of new data augmentation technique\n\"CutShadow\" for shadow removal. Our method achieved scores of 0.196 (3rd out of\n19) in LPIPS and 7.44 (4th out of 19) in the Mean Opinion Score (MOS).\n","authors":["Yuki Kondo","Riku Miyata","Fuma Yasue","Taito Naruki","Norimichi Ukita"],"pdf_url":"https://arxiv.org/pdf/2403.08995v2.pdf","comment":"This version is a brief technical report submitted to the organizers,\n and there are still some points to be added; please wait for updates until\n May 2024. 
The code can be found here\n (https://github.com/Yuki-11/NTIRE2023_ShadowRemoval_IIM_TTI)"},{"id":"http://arxiv.org/abs/2403.08947v2","updated":"2024-03-15T01:21:14Z","published":"2024-03-13T20:26:50Z","title":"Robust COVID-19 Detection in CT Images with CLIP","summary":" In the realm of medical imaging, particularly for COVID-19 detection, deep\nlearning models face substantial challenges such as the necessity for extensive\ncomputational resources, the paucity of well-annotated datasets, and a\nsignificant amount of unlabeled data. In this work, we introduce the first\nlightweight detector designed to overcome these obstacles, leveraging a frozen\nCLIP image encoder and a trainable multilayer perception (MLP). Enhanced with\nConditional Value at Risk (CVaR) for robustness and a loss landscape flattening\nstrategy for improved generalization, our model is tailored for high efficacy\nin COVID-19 detection. Furthermore, we integrate a teacher-student framework to\ncapitalize on the vast amounts of unlabeled data, enabling our model to achieve\nsuperior performance despite the inherent data limitations. Experimental\nresults on the COV19-CT-DB dataset demonstrate the effectiveness of our\napproach, surpassing baseline by up to 10.6% in `macro' F1 score in supervised\nlearning. The code is available at\nhttps://github.com/Purdue-M2/COVID-19_Detection_M2_PURDUE.\n","authors":["Li Lin","Yamini Sri Krubha","Zhenhuan Yang","Cheng Ren","Thuc Duy Le","Irene Amerini","Xin Wang","Shu Hu"],"pdf_url":"https://arxiv.org/pdf/2403.08947v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09948v1","updated":"2024-03-15T01:18:08Z","published":"2024-03-15T01:18:08Z","title":"RadCLIP: Enhancing Radiologic Image Analysis through Contrastive\n Language-Image Pre-training","summary":" The integration of artificial intelligence (AI) with radiology has marked a\ntransformative era in medical diagnostics. Vision foundation models have been\nadopted to enhance radiologic imaging analysis. However, the distinct\ncomplexities of radiological imaging, including the interpretation of 2D and 3D\nradiological data, pose unique challenges that existing models, trained on\ngeneral non-medical images, fail to address adequately. To bridge this gap and\ncapitalize on the diagnostic precision required in medical imaging, we\nintroduce RadCLIP: a pioneering cross-modal foundational model that harnesses\nContrastive Language-Image Pre-training (CLIP) to refine radiologic image\nanalysis. RadCLIP incorporates a novel 3D slice pooling mechanism tailored for\nvolumetric image analysis and is trained using a comprehensive and diverse\ndataset of radiologic image-text pairs. Our evaluations demonstrate that\nRadCLIP effectively aligns radiological images with their corresponding textual\nannotations, and in the meantime, offers a robust vision backbone for\nradiologic imagery with significant promise.\n","authors":["Zhixiu Lu","Hailong Li","Lili He"],"pdf_url":"https://arxiv.org/pdf/2403.09948v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08103v3","updated":"2024-03-15T01:16:58Z","published":"2023-06-13T19:48:56Z","title":"Generating Images with 3D Annotations Using Diffusion Models","summary":" Diffusion models have emerged as a powerful generative method, capable of\nproducing stunning photo-realistic images from natural language descriptions.\nHowever, these models lack explicit control over the 3D structure in the\ngenerated images. 
Consequently, this hinders our ability to obtain detailed 3D\nannotations for the generated images or to craft instances with specific poses\nand distances. In this paper, we propose 3D Diffusion Style Transfer (3D-DST),\nwhich incorporates 3D geometry control into diffusion models. Our method\nexploits ControlNet, which extends diffusion models by using visual prompts in\naddition to text prompts. We generate images of the 3D objects taken from 3D\nshape repositories~(e.g., ShapeNet and Objaverse), render them from a variety\nof poses and viewing directions, compute the edge maps of the rendered images,\nand use these edge maps as visual prompts to generate realistic images. With\nexplicit 3D geometry control, we can easily change the 3D structures of the\nobjects in the generated images and obtain ground-truth 3D annotations\nautomatically. This allows us to improve a wide range of vision tasks, e.g.,\nclassification and 3D pose estimation, in both in-distribution (ID) and\nout-of-distribution (OOD) settings. We demonstrate the effectiveness of our\nmethod through extensive experiments on ImageNet-100/200, ImageNet-R,\nPASCAL3D+, ObjectNet3D, and OOD-CV. The results show that our method\nsignificantly outperforms existing methods, e.g., 3.8 percentage points on\nImageNet-100 using DeiT-B.\n","authors":["Wufei Ma","Qihao Liu","Jiahao Wang","Angtian Wang","Xiaoding Yuan","Yi Zhang","Zihao Xiao","Guofeng Zhang","Beijia Lu","Ruxiao Duan","Yongrui Qi","Adam Kortylewski","Yaoyao Liu","Alan Yuille"],"pdf_url":"https://arxiv.org/pdf/2306.08103v3.pdf","comment":"ICLR 2024 Spotlight. Code: https://ccvl.jhu.edu/3D-DST/"},{"id":"http://arxiv.org/abs/2308.07212v2","updated":"2024-03-15T01:13:38Z","published":"2023-08-14T15:29:32Z","title":"Automated ensemble method for pediatric brain tumor segmentation","summary":" Brain tumors remain a critical global health challenge, necessitating\nadvancements in diagnostic techniques and treatment methodologies. A tumor or\nits recurrence often needs to be identified in imaging studies and\ndifferentiated from normal brain tissue. In response to the growing need for\nage-specific segmentation models, particularly for pediatric patients, this\nstudy explores the deployment of deep learning techniques using magnetic\nresonance imaging (MRI) modalities. By introducing a novel ensemble approach\nusing ONet and modified versions of UNet, coupled with innovative loss\nfunctions, this study achieves a precise segmentation model for the BraTS-PEDs\n2023 Challenge. Data augmentation, including both single and composite\ntransformations, ensures model robustness and accuracy across different\nscanning protocols. The ensemble strategy, integrating the ONet and UNet\nmodels, shows greater effectiveness in capturing specific features and modeling\ndiverse aspects of the MRI images which result in lesion wise Dice scores of\n0.52, 0.72 and 0.78 on unseen validation data and scores of 0.55, 0.70, 0.79 on\nfinal testing data for the \"enhancing tumor\", \"tumor core\" and \"whole tumor\"\nlabels respectively. Visual comparisons further confirm the superiority of the\nensemble method in accurate tumor region coverage. 
The results indicate that\nthis advanced ensemble approach, building upon the unique strengths of\nindividual models, offers promising prospects for enhanced diagnostic accuracy\nand effective treatment planning and monitoring for brain tumors in pediatric\nbrains.\n","authors":["Shashidhar Reddy Javaji","Sovesh Mohapatra","Advait Gosai","Gottfried Schlaug"],"pdf_url":"https://arxiv.org/pdf/2308.07212v2.pdf","comment":"Accepted at MICCAI BrainLes Workshop 2023"},{"id":"http://arxiv.org/abs/2403.09947v1","updated":"2024-03-15T01:09:58Z","published":"2024-03-15T01:09:58Z","title":"Shifting Focus: From Global Semantics to Local Prominent Features in\n Swin-Transformer for Knee Osteoarthritis Severity Assessment","summary":" Conventional imaging diagnostics frequently encounter bottlenecks due to\nmanual inspection, which can lead to delays and inconsistencies. Although deep\nlearning offers a pathway to automation and enhanced accuracy, foundational\nmodels in computer vision often emphasize global context at the expense of\nlocal details, which are vital for medical imaging diagnostics. To address\nthis, we harness the Swin Transformer's capacity to discern extended spatial\ndependencies within images through the hierarchical framework. Our novel\ncontribution lies in refining local feature representations, orienting them\nspecifically toward the final distribution of the classifier. This method\nensures that local features are not only preserved but are also enriched with\ntask-specific information, enhancing their relevance and detail at every\nhierarchical level. By implementing this strategy, our model demonstrates\nsignificant robustness and precision, as evidenced by extensive validation of\ntwo established benchmarks for Knee OsteoArthritis (KOA) grade classification.\nThese results highlight our approach's effectiveness and its promising\nimplications for the future of medical imaging diagnostics. Our implementation\nis available on https://github.com/mtliba/KOA_NLCS2024\n","authors":["Aymen Sekhri","Marouane Tliba","Mohamed Amine Kerkouri","Yassine Nasser","Aladine Chetouani","Alessandro Bruno","Rachid Jennane"],"pdf_url":"https://arxiv.org/pdf/2403.09947v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17420v2","updated":"2024-03-15T01:04:33Z","published":"2023-12-29T01:28:40Z","title":"Exact Consistency Tests for Gaussian Mixture Filters using Normalized\n Deviation Squared Statistics","summary":" We consider the problem of evaluating dynamic consistency in discrete time\nprobabilistic filters that approximate stochastic system state densities with\nGaussian mixtures. Dynamic consistency means that the estimated probability\ndistributions correctly describe the actual uncertainties. As such, the problem\nof consistency testing naturally arises in applications with regards to\nestimator tuning and validation. However, due to the general complexity of the\ndensity functions involved, straightforward approaches for consistency testing\nof mixture-based estimators have remained challenging to define and implement.\nThis paper derives a new exact result for Gaussian mixture consistency testing\nwithin the framework of normalized deviation squared (NDS) statistics. It is\nshown that NDS test statistics for generic multivariate Gaussian mixture models\nexactly follow mixtures of generalized chi-square distributions, for which\nefficient computational tools are available. 
The accuracy and utility of the\nresulting consistency tests are numerically demonstrated on static and dynamic\nmixture estimation examples.\n","authors":["Nisar Ahmed","Luke Burks","Kailah Cabral","Alyssa Bekai Rose"],"pdf_url":"https://arxiv.org/pdf/2312.17420v2.pdf","comment":"8 pages, 4 figures; final manuscript to be published 2024 American\n Control Conference (ACC 2024), corrected small typos and updated Fig. 1 for\n clarity"},{"id":"http://arxiv.org/abs/2207.03341v3","updated":"2024-03-15T00:52:40Z","published":"2022-07-05T03:08:27Z","title":"Softmax-free Linear Transformers","summary":" Vision transformers (ViTs) have pushed the state-of-the-art for visual\nperception tasks. The self-attention mechanism underpinning the strength of\nViTs has a quadratic complexity in both computation and memory usage. This\nmotivates the development of approximating the self-attention at linear\ncomplexity. However, an in-depth analysis in this work reveals that existing\nmethods are either theoretically flawed or empirically ineffective for visual\nrecognition. We identify that their limitations are rooted in the inheritance\nof softmax-based self-attention during approximations, that is, normalizing the\nscaled dot-product between token feature vectors using the softmax function. As\npreserving the softmax operation challenges any subsequent linearization\nefforts. By this insight, a family of Softmax-Free Transformers (SOFT) are\nproposed. Specifically, a Gaussian kernel function is adopted to replace the\ndot-product similarity, enabling a full self-attention matrix to be\napproximated under low-rank matrix decomposition. For computational robustness,\nwe estimate the Moore-Penrose inverse using an iterative Newton-Raphson method\nin the forward process only, while calculating its theoretical gradients only\nonce in the backward process. To further expand applicability (e.g., dense\nprediction tasks), an efficient symmetric normalization technique is\nintroduced. Extensive experiments on ImageNet, COCO, and ADE20K show that our\nSOFT significantly improves the computational efficiency of existing ViT\nvariants. With linear complexity, much longer token sequences are permitted by\nSOFT, resulting in superior trade-off between accuracy and complexity. Code and\nmodels are available at https://github.com/fudan-zvg/SOFT.\n","authors":["Jiachen Lu","Junge Zhang","Xiatian Zhu","Jianfeng Feng","Tao Xiang","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2207.03341v3.pdf","comment":"Accepted by IJCV. arXiv admin note: substantial text overlap with\n arXiv:2110.11945"},{"id":"http://arxiv.org/abs/2403.09942v1","updated":"2024-03-15T00:52:17Z","published":"2024-03-15T00:52:17Z","title":"Attention-Enhanced Hybrid Feature Aggregation Network for 3D Brain Tumor\n Segmentation","summary":" Glioblastoma is a highly aggressive and malignant brain tumor type that\nrequires early diagnosis and prompt intervention. Due to its heterogeneity in\nappearance, developing automated detection approaches is challenging. To\naddress this challenge, Artificial Intelligence (AI)-driven approaches in\nhealthcare have generated interest in efficiently diagnosing and evaluating\nbrain tumors. The Brain Tumor Segmentation Challenge (BraTS) is a platform for\ndeveloping and assessing automated techniques for tumor analysis using\nhigh-quality, clinically acquired MRI data. 
In our approach, we utilized a\nmulti-scale, attention-guided and hybrid U-Net-shaped model -- GLIMS -- to\nperform 3D brain tumor segmentation in three regions: Enhancing Tumor (ET),\nTumor Core (TC), and Whole Tumor (WT). The multi-scale feature extraction\nprovides better contextual feature aggregation in high resolutions and the Swin\nTransformer blocks improve the global feature extraction at deeper levels of\nthe model. The segmentation mask generation in the decoder branch is guided by\nthe attention-refined features gathered from the encoder branch to enhance the\nimportant attributes. Moreover, hierarchical supervision is used to train the\nmodel efficiently. Our model's performance on the validation set resulted in\n92.19, 87.75, and 83.18 Dice Scores and 89.09, 84.67, and 82.15 Lesion-wise\nDice Scores in WT, TC, and ET, respectively. The code is publicly available at\nhttps://github.com/yaziciz/GLIMS.\n","authors":["Ziya Ata Yazıcı","İlkay Öksüz","Hazım Kemal Ekenel"],"pdf_url":"https://arxiv.org/pdf/2403.09942v1.pdf","comment":"Accepted at 9th BrainLes Workshop (BraTS 2023 Challenge) @\n International Conference on Medical Image Computing and Computer Assisted\n Intervention (MICCAI) 2023"},{"id":"http://arxiv.org/abs/2403.09939v1","updated":"2024-03-15T00:43:03Z","published":"2024-03-15T00:43:03Z","title":"Quantization Effects on Neural Networks Perception: How would\n quantization change the perceptual field of vision models?","summary":" Neural network quantization is an essential technique for deploying models on\nresource-constrained devices. However, its impact on model perceptual fields,\nparticularly regarding class activation maps (CAMs), remains a significant area\nof investigation. In this study, we explore how quantization alters the spatial\nrecognition ability of the perceptual field of vision models, shedding light on\nthe alignment between CAMs and visual saliency maps across various\narchitectures. Leveraging a dataset of 10,000 images from ImageNet, we\nrigorously evaluate six diverse foundational CNNs: VGG16, ResNet50,\nEfficientNet, MobileNet, SqueezeNet, and DenseNet. We uncover nuanced changes\nin CAMs and their alignment with human visual saliency maps through systematic\nquantization techniques applied to these models. Our findings reveal the\nvarying sensitivities of different architectures to quantization and underscore\nits implications for real-world applications in terms of model performance and\ninterpretability. The primary contribution of this work revolves around\ndeepening our understanding of neural network quantization, providing insights\ncrucial for deploying efficient and interpretable models in practical settings.\n","authors":["Mohamed Amine Kerkouri","Marouane Tliba","Aladine Chetouani","Alessandro Bruno"],"pdf_url":"https://arxiv.org/pdf/2403.09939v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11755v4","updated":"2024-03-15T00:41:08Z","published":"2023-10-18T07:30:08Z","title":"RGM: A Robust Generalizable Matching Model","summary":" Finding corresponding pixels within a pair of images is a fundamental\ncomputer vision task with various applications. Due to the specific\nrequirements of different tasks like optical flow estimation and local feature\nmatching, previous works are primarily categorized into dense matching and\nsparse feature matching focusing on specialized architectures along with\ntask-specific datasets, which may somewhat hinder the generalization\nperformance of specialized models. 
In this paper, we propose a deep model for\nsparse and dense matching, termed RGM (Robust Generalist Matching). In\nparticular, we elaborately design a cascaded GRU module for refinement by\nexploring the geometric similarity iteratively at multiple scales following an\nadditional uncertainty estimation module for sparsification. To narrow the gap\nbetween synthetic training samples and real-world scenarios, we build a new,\nlarge-scale dataset with sparse correspondence ground truth by generating\noptical flow supervision with greater intervals. As such, we are able to mix up\nvarious dense and sparse matching datasets, significantly improving the\ntraining diversity. The generalization capacity of our proposed RGM is greatly\nimproved by learning the matching and uncertainty estimation in a two-stage\nmanner on the large, mixed data. Superior performance is achieved for zero-shot\nmatching and downstream geometry estimation across multiple datasets,\noutperforming the previous methods by a large margin.\n","authors":["Songyan Zhang","Xinyu Sun","Hao Chen","Bo Li","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2310.11755v4.pdf","comment":"Code is available at: https://github.com/aim-uofa/RGM"},{"id":"http://arxiv.org/abs/2310.10942v3","updated":"2024-03-15T00:18:30Z","published":"2023-10-17T02:38:09Z","title":"UNK-VQA: A Dataset and A Probe into Multi-modal Large Models' Abstention\n Ability","summary":" Teaching Visual Question Answering (VQA) models to refrain from answering\nunanswerable questions is necessary for building a trustworthy AI system.\nExisting studies, though have explored various aspects of VQA but somewhat\nignored this particular attribute. This paper aims to bridge the research gap\nby contributing a comprehensive dataset, called UNK-VQA. The dataset is\nspecifically designed to address the challenge of questions that models do not\nknow. To this end, we first augment the existing data via deliberate\nperturbations on either the image or question. In specific, we carefully ensure\nthat the question-image semantics remain close to the original unperturbed\ndistribution. By this means, the identification of unanswerable questions\nbecomes challenging, setting our dataset apart from others that involve mere\nimage replacement. We then extensively evaluate the zero- and few-shot\nperformance of several emerging multi-modal large models and discover their\nsignificant limitations when applied to our dataset. Additionally, we also\npropose a straightforward method to tackle these unanswerable questions. This\ndataset, we believe, will serve as a valuable benchmark for enhancing the\nabstention capability of VQA models, thereby leading to increased\ntrustworthiness of AI systems. We have made the dataset\n(https://github.com/guoyang9/UNK-VQA) available to facilitate further\nexploration in this area.\n","authors":["Yangyang Guo","Fangkai Jiao","Zhiqi Shen","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2310.10942v3.pdf","comment":null}]},"2024-03-19T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.12965v1","updated":"2024-03-19T17:59:52Z","published":"2024-03-19T17:59:52Z","title":"Wear-Any-Way: Manipulable Virtual Try-on via Sparse Correspondence\n Alignment","summary":" This paper introduces a novel framework for virtual try-on, termed\nWear-Any-Way. Different from previous methods, Wear-Any-Way is a customizable\nsolution. 
Besides generating high-fidelity results, our method supports users\nto precisely manipulate the wearing style. To achieve this goal, we first\nconstruct a strong pipeline for standard virtual try-on, supporting\nsingle/multiple garment try-on and model-to-model settings in complicated\nscenarios. To make it manipulable, we propose sparse correspondence alignment\nwhich involves point-based control to guide the generation for specific\nlocations. With this design, Wear-Any-Way gets state-of-the-art performance for\nthe standard setting and provides a novel interaction form for customizing the\nwearing style. For instance, it supports users to drag the sleeve to make it\nrolled up, drag the coat to make it open, and utilize clicks to control the\nstyle of tuck, etc. Wear-Any-Way enables more liberated and flexible\nexpressions of the attires, holding profound implications in the fashion\nindustry.\n","authors":["Mengting Chen","Xi Chen","Zhonghua Zhai","Chen Ju","Xuewen Hong","Jinsong Lan","Shuai Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.12965v1.pdf","comment":"Project Page: https://mengtingchen.github.io/wear-any-way-page/"},{"id":"http://arxiv.org/abs/2403.12966v1","updated":"2024-03-19T17:59:52Z","published":"2024-03-19T17:59:52Z","title":"Chain-of-Spot: Interactive Reasoning Improves Large Vision-Language\n Models","summary":" In the realm of vision-language understanding, the proficiency of models in\ninterpreting and reasoning over visual content has become a cornerstone for\nnumerous applications. However, it is challenging for the visual encoder in\nLarge Vision-Language Models (LVLMs) to extract useful features tailored to\nquestions that aid the language model's response. Furthermore, a common\npractice among existing LVLMs is to utilize lower-resolution images, which\nrestricts the ability for visual recognition. Our work introduces the\nChain-of-Spot (CoS) method, which we describe as Interactive Reasoning, a novel\napproach that enhances feature extraction by focusing on key regions of\ninterest (ROI) within the image, corresponding to the posed questions or\ninstructions. This technique allows LVLMs to access more detailed visual\ninformation without altering the original image resolution, thereby offering\nmulti-granularity image features. By integrating Chain-of-Spot with\ninstruct-following LLaVA-1.5 models, the process of image reasoning\nconsistently improves performance across a wide range of multimodal datasets\nand benchmarks without bells and whistles and achieves new state-of-the-art\nresults. Our empirical findings demonstrate a significant improvement in LVLMs'\nability to understand and reason about visual content, paving the way for more\nsophisticated visual instruction-following applications. Code and models are\navailable at https://github.com/dongyh20/Chain-of-Spot\n","authors":["Zuyan Liu","Yuhao Dong","Yongming Rao","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2403.12966v1.pdf","comment":"Project Page: https://sites.google.com/view/chain-of-spot/"},{"id":"http://arxiv.org/abs/2403.12964v1","updated":"2024-03-19T17:59:39Z","published":"2024-03-19T17:59:39Z","title":"Negative Yields Positive: Unified Dual-Path Adapter for Vision-Language\n Models","summary":" Recently, large-scale pre-trained Vision-Language Models (VLMs) have\ndemonstrated great potential in learning open-world visual representations, and\nexhibit remarkable performance across a wide range of downstream tasks through\nefficient fine-tuning. 
In this work, we innovatively introduce the concept of\ndual learning into fine-tuning VLMs, i.e., we not only learn what an image is,\nbut also what an image isn't. Building on this concept, we introduce a novel\nDualAdapter approach to enable dual-path adaptation of VLMs from both positive\nand negative perspectives with only limited annotated samples. In the inference\nstage, our DualAdapter performs unified predictions by simultaneously\nconducting complementary positive selection and negative exclusion across\ntarget classes, thereby enhancing the overall recognition accuracy of VLMs in\ndownstream tasks. Our extensive experimental results across 15 datasets\nvalidate that the proposed DualAdapter outperforms existing state-of-the-art\nmethods on both few-shot learning and domain generalization tasks while\nachieving competitive computational efficiency. Code is available at\nhttps://github.com/zhangce01/DualAdapter.\n","authors":["Ce Zhang","Simon Stepputtis","Katia Sycara","Yaqi Xie"],"pdf_url":"https://arxiv.org/pdf/2403.12964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12963v1","updated":"2024-03-19T17:59:33Z","published":"2024-03-19T17:59:33Z","title":"FouriScale: A Frequency Perspective on Training-Free High-Resolution\n Image Synthesis","summary":" In this study, we delve into the generation of high-resolution images from\npre-trained diffusion models, addressing persistent challenges, such as\nrepetitive patterns and structural distortions, that emerge when models are\napplied beyond their trained resolutions. To address this issue, we introduce\nan innovative, training-free approach FouriScale from the perspective of\nfrequency domain analysis. We replace the original convolutional layers in\npre-trained diffusion models by incorporating a dilation technique along with a\nlow-pass operation, intending to achieve structural consistency and scale\nconsistency across resolutions, respectively. Further enhanced by a\npadding-then-crop strategy, our method can flexibly handle text-to-image\ngeneration of various aspect ratios. By using the FouriScale as guidance, our\nmethod successfully balances the structural integrity and fidelity of generated\nimages, achieving an astonishing capacity of arbitrary-size, high-resolution,\nand high-quality generation. With its simplicity and compatibility, our method\ncan provide valuable insights for future explorations into the synthesis of\nultra-high-resolution images. The code will be released at\nhttps://github.com/LeonHLJ/FouriScale.\n","authors":["Linjiang Huang","Rongyao Fang","Aiping Zhang","Guanglu Song","Si Liu","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2403.12963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12962v1","updated":"2024-03-19T17:59:18Z","published":"2024-03-19T17:59:18Z","title":"FRESCO: Spatial-Temporal Correspondence for Zero-Shot Video Translation","summary":" The remarkable efficacy of text-to-image diffusion models has motivated\nextensive exploration of their potential application in video domains.\nZero-shot methods seek to extend image diffusion models to videos without\nnecessitating model training. Recent methods mainly focus on incorporating\ninter-frame correspondence into attention mechanisms. However, the soft\nconstraint imposed on determining where to attend to valid features can\nsometimes be insufficient, resulting in temporal inconsistency. 
In this paper,\nwe introduce FRESCO, intra-frame correspondence alongside inter-frame\ncorrespondence to establish a more robust spatial-temporal constraint. This\nenhancement ensures a more consistent transformation of semantically similar\ncontent across frames. Beyond mere attention guidance, our approach involves an\nexplicit update of features to achieve high spatial-temporal consistency with\nthe input video, significantly improving the visual coherence of the resulting\ntranslated videos. Extensive experiments demonstrate the effectiveness of our\nproposed framework in producing high-quality, coherent videos, marking a\nnotable improvement over existing zero-shot methods.\n","authors":["Shuai Yang","Yifan Zhou","Ziwei Liu","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2403.12962v1.pdf","comment":"CVPR 24, Code: https://github.com/williamyang1991/FRESCO, Project:\n https://www.mmlab-ntu.com/project/fresco/"},{"id":"http://arxiv.org/abs/2403.12961v1","updated":"2024-03-19T17:59:09Z","published":"2024-03-19T17:59:09Z","title":"TexTile: A Differentiable Metric for Texture Tileability","summary":" We introduce TexTile, a novel differentiable metric to quantify the degree\nupon which a texture image can be concatenated with itself without introducing\nrepeating artifacts (i.e., the tileability). Existing methods for tileable\ntexture synthesis focus on general texture quality, but lack explicit analysis\nof the intrinsic repeatability properties of a texture. In contrast, our\nTexTile metric effectively evaluates the tileable properties of a texture,\nopening the door to more informed synthesis and analysis of tileable textures.\nUnder the hood, TexTile is formulated as a binary classifier carefully built\nfrom a large dataset of textures of different styles, semantics, regularities,\nand human annotations.Key to our method is a set of architectural modifications\nto baseline pre-train image classifiers to overcome their shortcomings at\nmeasuring tileability, along with a custom data augmentation and training\nregime aimed at increasing robustness and accuracy. We demonstrate that TexTile\ncan be plugged into different state-of-the-art texture synthesis methods,\nincluding diffusion-based strategies, and generate tileable textures while\nkeeping or even improving the overall texture quality. Furthermore, we show\nthat TexTile can objectively evaluate any tileable texture synthesis method,\nwhereas the current mix of existing metrics produces uncorrelated scores which\nheavily hinders progress in the field.\n","authors":["Carlos Rodriguez-Pardo","Dan Casas","Elena Garces","Jorge Lopez-Moreno"],"pdf_url":"https://arxiv.org/pdf/2403.12961v1.pdf","comment":"CVPR 2024. Project page: https://mslab.es/projects/TexTile/"},{"id":"http://arxiv.org/abs/2403.12960v1","updated":"2024-03-19T17:58:04Z","published":"2024-03-19T17:58:04Z","title":"FaceXFormer: A Unified Transformer for Facial Analysis","summary":" In this work, we introduce FaceXformer, an end-to-end unified transformer\nmodel for a comprehensive range of facial analysis tasks such as face parsing,\nlandmark detection, head pose estimation, attributes recognition, and\nestimation of age, gender, race, and landmarks visibility. Conventional methods\nin face analysis have often relied on task-specific designs and preprocessing\ntechniques, which limit their approach to a unified architecture. 
Unlike these\nconventional methods, our FaceXformer leverages a transformer-based\nencoder-decoder architecture where each task is treated as a learnable token,\nenabling the integration of multiple tasks within a single framework. Moreover,\nwe propose a parameter-efficient decoder, FaceX, which jointly processes face\nand task tokens, thereby learning generalized and robust face representations\nacross different tasks. To the best of our knowledge, this is the first work to\npropose a single model capable of handling all these facial analysis tasks\nusing transformers. We conducted a comprehensive analysis of effective\nbackbones for unified face task processing and evaluated different task queries\nand the synergy between them. We conduct experiments against state-of-the-art\nspecialized models and previous multi-task models in both intra-dataset and\ncross-dataset evaluations across multiple benchmarks. Additionally, our model\neffectively handles images \"in-the-wild,\" demonstrating its robustness and\ngeneralizability across eight different tasks, all while maintaining the\nreal-time performance of 37 FPS.\n","authors":["Kartik Narayan","Vibashan VS","Rama Chellappa","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2403.12960v1.pdf","comment":"Project page: https://kartik-3004.github.io/facexformer_web/"},{"id":"http://arxiv.org/abs/2403.12959v1","updated":"2024-03-19T17:58:02Z","published":"2024-03-19T17:58:02Z","title":"WHAC: World-grounded Humans and Cameras","summary":" Estimating human and camera trajectories with accurate scale in the world\ncoordinate system from a monocular video is a highly desirable yet challenging\nand ill-posed problem. In this study, we aim to recover expressive parametric\nhuman models (i.e., SMPL-X) and corresponding camera poses jointly, by\nleveraging the synergy between three critical players: the world, the human,\nand the camera. Our approach is founded on two key observations. Firstly,\ncamera-frame SMPL-X estimation methods readily recover absolute human depth.\nSecondly, human motions inherently provide absolute spatial cues. By\nintegrating these insights, we introduce a novel framework, referred to as\nWHAC, to facilitate world-grounded expressive human pose and shape estimation\n(EHPS) alongside camera pose estimation, without relying on traditional\noptimization techniques. Additionally, we present a new synthetic dataset,\nWHAC-A-Mole, which includes accurately annotated humans and cameras, and\nfeatures diverse interactive human motions as well as realistic camera\ntrajectories. Extensive experiments on both standard and newly established\nbenchmarks highlight the superiority and efficacy of our framework. We will\nmake the code and dataset publicly available.\n","authors":["Wanqi Yin","Zhongang Cai","Ruisi Wang","Fanzhou Wang","Chen Wei","Haiyi Mei","Weiye Xiao","Zhitao Yang","Qingping Sun","Atsushi Yamashita","Ziwei Liu","Lei Yang"],"pdf_url":"https://arxiv.org/pdf/2403.12959v1.pdf","comment":"Homepage: https://wqyin.github.io/projects/WHAC/"},{"id":"http://arxiv.org/abs/2403.12957v1","updated":"2024-03-19T17:57:52Z","published":"2024-03-19T17:57:52Z","title":"GVGEN: Text-to-3D Generation with Volumetric Representation","summary":" In recent years, 3D Gaussian splatting has emerged as a powerful technique\nfor 3D reconstruction and generation, known for its fast and high-quality\nrendering capabilities. 
To address these shortcomings, this paper introduces a\nnovel diffusion-based framework, GVGEN, designed to efficiently generate 3D\nGaussian representations from text input. We propose two innovative\ntechniques:(1) Structured Volumetric Representation. We first arrange\ndisorganized 3D Gaussian points as a structured form GaussianVolume. This\ntransformation allows the capture of intricate texture details within a volume\ncomposed of a fixed number of Gaussians. To better optimize the representation\nof these details, we propose a unique pruning and densifying method named the\nCandidate Pool Strategy, enhancing detail fidelity through selective\noptimization. (2) Coarse-to-fine Generation Pipeline. To simplify the\ngeneration of GaussianVolume and empower the model to generate instances with\ndetailed 3D geometry, we propose a coarse-to-fine pipeline. It initially\nconstructs a basic geometric structure, followed by the prediction of complete\nGaussian attributes. Our framework, GVGEN, demonstrates superior performance in\nqualitative and quantitative assessments compared to existing 3D generation\nmethods. Simultaneously, it maintains a fast generation speed ($\\sim$7\nseconds), effectively striking a balance between quality and efficiency.\n","authors":["Xianglong He","Junyi Chen","Sida Peng","Di Huang","Yangguang Li","Xiaoshui Huang","Chun Yuan","Wanli Ouyang","Tong He"],"pdf_url":"https://arxiv.org/pdf/2403.12957v1.pdf","comment":"project page: https://gvgen.github.io/"},{"id":"http://arxiv.org/abs/2403.12953v1","updated":"2024-03-19T17:55:22Z","published":"2024-03-19T17:55:22Z","title":"FutureDepth: Learning to Predict the Future Improves Video Depth\n Estimation","summary":" In this paper, we propose a novel video depth estimation approach,\nFutureDepth, which enables the model to implicitly leverage multi-frame and\nmotion cues to improve depth estimation by making it learn to predict the\nfuture at training. More specifically, we propose a future prediction network,\nF-Net, which takes the features of multiple consecutive frames and is trained\nto predict multi-frame features one time step ahead iteratively. In this way,\nF-Net learns the underlying motion and correspondence information, and we\nincorporate its features into the depth decoding process. Additionally, to\nenrich the learning of multiframe correspondence cues, we further leverage a\nreconstruction network, R-Net, which is trained via adaptively masked\nauto-encoding of multiframe feature volumes. At inference time, both F-Net and\nR-Net are used to produce queries to work with the depth decoder, as well as a\nfinal refinement network. Through extensive experiments on several benchmarks,\ni.e., NYUDv2, KITTI, DDAD, and Sintel, which cover indoor, driving, and\nopen-domain scenarios, we show that FutureDepth significantly improves upon\nbaseline models, outperforms existing video depth estimation methods, and sets\nnew state-of-the-art (SOTA) accuracy. 
Furthermore, FutureDepth is more\nefficient than existing SOTA video depth estimation models and has similar\nlatencies when comparing to monocular models\n","authors":["Rajeev Yasarla","Manish Kumar Singh","Hong Cai","Yunxiao Shi","Jisoo Jeong","Yinhao Zhu","Shizhong Han","Risheek Garrepalli","Fatih Porikli"],"pdf_url":"https://arxiv.org/pdf/2403.12953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12952v1","updated":"2024-03-19T17:54:34Z","published":"2024-03-19T17:54:34Z","title":"Just Shift It: Test-Time Prototype Shifting for Zero-Shot Generalization\n with Vision-Language Models","summary":" Advancements in vision-language models (VLMs) have propelled the field of\ncomputer vision, particularly in the zero-shot learning setting. Despite their\npromise, the effectiveness of these models often diminishes due to domain\nshifts in test environments. To address this, we introduce the Test-Time\nPrototype Shifting (TPS) framework, a pioneering approach designed to adapt\nVLMs to test datasets using unlabeled test inputs. Our method is based on the\nnotion of modulating per-class prototypes in the shared embedding space. By\npre-computing and caching prototypes generated with the pre-trained text\nencoder, TPS not only facilitates optimization-free prototype reuse for\nsubsequent predictions but also enables seamless integration with current\nadvancements in prompt engineering. At test-time, TPS dynamically learns shift\nvectors for each prototype based solely on the given test sample, effectively\nbridging the domain gap and enhancing classification accuracy. A notable aspect\nof our framework is its significantly reduced memory and computational demands\nwhen compared to conventional text-prompt tuning methods. Extensive evaluations\nacross 15 datasets involving natural distribution shifts and cross-dataset\ngeneralization demonstrate TPS's superior performance, achieving\nstate-of-the-art results while reducing resource requirements.\n","authors":["Elaine Sui","Xiaohan Wang","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2403.12952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14918v2","updated":"2024-03-19T17:53:39Z","published":"2023-11-25T03:33:36Z","title":"Resolution- and Stimulus-agnostic Super-Resolution of Ultra-High-Field\n Functional MRI: Application to Visual Studies","summary":" High-resolution fMRI provides a window into the brain's mesoscale\norganization. Yet, higher spatial resolution increases scan times, to\ncompensate for the low signal and contrast-to-noise ratio. This work introduces\na deep learning-based 3D super-resolution (SR) method for fMRI. By\nincorporating a resolution-agnostic image augmentation framework, our method\nadapts to varying voxel sizes without retraining. We apply this innovative\ntechnique to localize fine-scale motion-selective sites in the early visual\nareas. Detection of these sites typically requires a resolution higher than 1\nmm isotropic, whereas here, we visualize them based on lower resolution (2-3mm\nisotropic) fMRI data. Remarkably, the super-resolved fMRI is able to recover\nhigh-frequency detail of the interdigitated organization of these sites\n(relative to the color-selective sites), even with training data sourced from\ndifferent subjects and experimental paradigms -- including non-visual\nresting-state fMRI, underscoring its robustness and versatility. 
Quantitative\nand qualitative results indicate that our method has the potential to enhance\nthe spatial resolution of fMRI, leading to a drastic reduction in acquisition\ntime.\n","authors":["Hongwei Bran Li","Matthew S. Rosen","Shahin Nasr","Juan Eugenio Iglesias"],"pdf_url":"https://arxiv.org/pdf/2311.14918v2.pdf","comment":"ISBI2024 final version"},{"id":"http://arxiv.org/abs/2311.10081v2","updated":"2024-03-19T17:51:45Z","published":"2023-11-16T18:37:29Z","title":"DRESS: Instructing Large Vision-Language Models to Align and Interact\n with Humans via Natural Language Feedback","summary":" We present DRESS, a large vision language model (LVLM) that innovatively\nexploits Natural Language feedback (NLF) from Large Language Models to enhance\nits alignment and interactions by addressing two key limitations in the\nstate-of-the-art LVLMs. First, prior LVLMs generally rely only on the\ninstruction finetuning stage to enhance alignment with human preferences.\nWithout incorporating extra feedback, they are still prone to generate\nunhelpful, hallucinated, or harmful responses. Second, while the visual\ninstruction tuning data is generally structured in a multi-turn dialogue\nformat, the connections and dependencies among consecutive conversational turns\nare weak. This reduces the capacity for effective multi-turn interactions. To\ntackle these, we propose a novel categorization of the NLF into two key types:\ncritique and refinement. The critique NLF identifies the strengths and\nweaknesses of the responses and is used to align the LVLMs with human\npreferences. The refinement NLF offers concrete suggestions for improvement and\nis adopted to improve the interaction ability of the LVLMs-- which focuses on\nLVLMs' ability to refine responses by incorporating feedback in multi-turn\ninteractions. To address the non-differentiable nature of NLF, we generalize\nconditional reinforcement learning for training. Our experimental results\ndemonstrate that DRESS can generate more helpful (9.76%), honest (11.52%), and\nharmless (21.03%) responses, and more effectively learn from feedback during\nmulti-turn interactions compared to SOTA LVMLs.\n","authors":["Yangyi Chen","Karan Sikka","Michael Cogswell","Heng Ji","Ajay Divakaran"],"pdf_url":"https://arxiv.org/pdf/2311.10081v2.pdf","comment":"CVPR 2024. The feedback datasets are released at:\n https://huggingface.co/datasets/YangyiYY/LVLM_NLF"},{"id":"http://arxiv.org/abs/2402.03908v2","updated":"2024-03-19T17:41:04Z","published":"2024-02-06T11:21:58Z","title":"EscherNet: A Generative Model for Scalable View Synthesis","summary":" We introduce EscherNet, a multi-view conditioned diffusion model for view\nsynthesis. EscherNet learns implicit and generative 3D representations coupled\nwith a specialised camera positional encoding, allowing precise and continuous\nrelative control of the camera transformation between an arbitrary number of\nreference and target views. EscherNet offers exceptional generality,\nflexibility, and scalability in view synthesis -- it can generate more than 100\nconsistent target views simultaneously on a single consumer-grade GPU, despite\nbeing trained with a fixed number of 3 reference views to 3 target views. As a\nresult, EscherNet not only addresses zero-shot novel view synthesis, but also\nnaturally unifies single- and multi-image 3D reconstruction, combining these\ndiverse tasks into a single, cohesive framework. 
Our extensive experiments\ndemonstrate that EscherNet achieves state-of-the-art performance in multiple\nbenchmarks, even when compared to methods specifically tailored for each\nindividual problem. This remarkable versatility opens up new directions for\ndesigning scalable neural architectures for 3D vision. Project page:\nhttps://kxhit.github.io/EscherNet.\n","authors":["Xin Kong","Shikun Liu","Xiaoyang Lyu","Marwan Taher","Xiaojuan Qi","Andrew J. Davison"],"pdf_url":"https://arxiv.org/pdf/2402.03908v2.pdf","comment":"CVPR2024 Project Page: https://kxhit.github.io/EscherNet"},{"id":"http://arxiv.org/abs/2403.12935v1","updated":"2024-03-19T17:37:18Z","published":"2024-03-19T17:37:18Z","title":"Segment Anything for comprehensive analysis of grapevine cluster\n architecture and berry properties","summary":" Grape cluster architecture and compactness are complex traits influencing\ndisease susceptibility, fruit quality, and yield. Evaluation methods for these\ntraits include visual scoring, manual methodologies, and computer vision, with\nthe latter being the most scalable approach. Most of the existing computer\nvision approaches for processing cluster images often rely on conventional\nsegmentation or machine learning with extensive training and limited\ngeneralization. The Segment Anything Model (SAM), a novel foundation model\ntrained on a massive image dataset, enables automated object segmentation\nwithout additional training. This study demonstrates out-of-the-box SAM's high\naccuracy in identifying individual berries in 2D cluster images. Using this\nmodel, we managed to segment approximately 3,500 cluster images, generating\nover 150,000 berry masks, each linked with spatial coordinates within their\nclusters. The correlation between human-identified berries and SAM predictions\nwas very strong (Pearson r2=0.96). Although the visible berry count in images\ntypically underestimates the actual cluster berry count due to visibility\nissues, we demonstrated that this discrepancy could be adjusted using a linear\nregression model (adjusted R2=0.87). We emphasized the critical importance of\nthe angle at which the cluster is imaged, noting its substantial effect on\nberry counts and architecture. We proposed different approaches in which berry\nlocation information facilitated the calculation of complex features related to\ncluster architecture and compactness. Finally, we discussed SAM's potential\nintegration into currently available pipelines for image generation and\nprocessing in vineyard conditions.\n","authors":["Efrain Torres-Lomas","Jimena Lado-Jimena","Guillermo Garcia-Zamora","Luis Diaz-Garcia"],"pdf_url":"https://arxiv.org/pdf/2403.12935v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12933v1","updated":"2024-03-19T17:36:28Z","published":"2024-03-19T17:36:28Z","title":"Zero-Reference Low-Light Enhancement via Physical Quadruple Priors","summary":" Understanding illumination and reducing the need for supervision pose a\nsignificant challenge in low-light enhancement. Current approaches are highly\nsensitive to data usage during training and illumination-specific\nhyper-parameters, limiting their ability to handle unseen scenarios. In this\npaper, we propose a new zero-reference low-light enhancement framework\ntrainable solely with normal light images. To accomplish this, we devise an\nillumination-invariant prior inspired by the theory of physical light transfer.\nThis prior serves as the bridge between normal and low-light images. 
Then, we\ndevelop a prior-to-image framework trained without low-light data. During\ntesting, this framework is able to restore our illumination-invariant prior\nback to images, automatically achieving low-light enhancement. Within this\nframework, we leverage a pretrained generative diffusion model for model\nability, introduce a bypass decoder to handle detail distortion, as well as\noffer a lightweight version for practicality. Extensive experiments demonstrate\nour framework's superiority in various scenarios as well as good\ninterpretability, robustness, and efficiency. Code is available on our project\nhomepage: http://daooshee.github.io/QuadPrior-Website/\n","authors":["Wenjing Wang","Huan Yang","Jianlong Fu","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2403.12933v1.pdf","comment":"Accepted by CVPR-2024"},{"id":"http://arxiv.org/abs/2306.02960v2","updated":"2024-03-19T17:35:51Z","published":"2023-06-05T15:26:02Z","title":"Best of Both Worlds: Hybrid SNN-ANN Architecture for Event-based Optical\n Flow Estimation","summary":" In the field of robotics, event-based cameras are emerging as a promising\nlow-power alternative to traditional frame-based cameras for capturing\nhigh-speed motion and high dynamic range scenes. This is due to their sparse\nand asynchronous event outputs. Spiking Neural Networks (SNNs) with their\nasynchronous event-driven compute, show great potential for extracting the\nspatio-temporal features from these event streams. In contrast, the standard\nAnalog Neural Networks (ANNs) fail to process event data effectively. However,\ntraining SNNs is difficult due to additional trainable parameters (thresholds\nand leaks), vanishing spikes at deeper layers, and a non-differentiable binary\nactivation function. Furthermore, an additional data structure, membrane\npotential, responsible for keeping track of temporal information, must be\nfetched and updated at every timestep in SNNs. To overcome these challenges, we\npropose a novel SNN-ANN hybrid architecture that combines the strengths of\nboth. Specifically, we leverage the asynchronous compute capabilities of SNN\nlayers to effectively extract the input temporal information. Concurrently, the\nANN layers facilitate training and efficient hardware deployment on traditional\nmachine learning hardware such as GPUs. We provide extensive experimental\nanalysis for assigning each layer to be spiking or analog, leading to a network\nconfiguration optimized for performance and ease of training. We evaluate our\nhybrid architecture for optical flow estimation on DSEC-flow and Multi-Vehicle\nStereo Event-Camera (MVSEC) datasets. On the DSEC-flow dataset, the hybrid\nSNN-ANN architecture achieves a 40% reduction in average endpoint error (AEE)\nwith 22% lower energy consumption compared to Full-SNN, and 48% lower AEE\ncompared to Full-ANN, while maintaining comparable energy usage.\n","authors":["Shubham Negi","Deepika Sharma","Adarsh Kumar Kosta","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2306.02960v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12931v1","updated":"2024-03-19T17:34:27Z","published":"2024-03-19T17:34:27Z","title":"You Only Sample Once: Taming One-Step Text-To-Image Synthesis by\n Self-Cooperative Diffusion GANs","summary":" We introduce YOSO, a novel generative model designed for rapid, scalable, and\nhigh-fidelity one-step image synthesis. This is achieved by integrating the\ndiffusion process with GANs. 
Specifically, we smooth the distribution by the\ndenoising generator itself, performing self-cooperative learning. We show that\nour method can serve as a one-step generation model training from scratch with\ncompetitive performance. Moreover, we show that our method can be extended to\nfinetune pre-trained text-to-image diffusion for high-quality one-step\ntext-to-image synthesis even with LoRA fine-tuning. In particular, we provide\nthe first diffusion transformer that can generate images in one step trained on\n512 resolution, with the capability of adapting to 1024 resolution without\nexplicit training. Our code is provided at https://github.com/Luo-Yihong/YOSO.\n","authors":["Yihong Luo","Xiaolong Chen","Jing Tang"],"pdf_url":"https://arxiv.org/pdf/2403.12931v1.pdf","comment":"Early version"},{"id":"http://arxiv.org/abs/2403.12922v1","updated":"2024-03-19T17:27:55Z","published":"2024-03-19T17:27:55Z","title":"Contextual AD Narration with Interleaved Multimodal Sequence","summary":" The Audio Description (AD) task aims to generate descriptions of visual\nelements for visually impaired individuals to help them access long-form video\ncontents, like movie. With video feature, text, character bank and context\ninformation as inputs, the generated ADs are able to correspond to the\ncharacters by name and provide reasonable, contextual descriptions to help\naudience understand the storyline of movie. To achieve this goal, we propose to\nleverage pre-trained foundation models through a simple and unified framework\nto generate ADs with interleaved multimodal sequence as input, termed as\nUni-AD. To enhance the alignment of features across various modalities with\nfiner granularity, we introduce a simple and lightweight module that maps video\nfeatures into the textual feature space. Moreover, we also propose a\ncharacter-refinement module to provide more precise information by identifying\nthe main characters who play more significant role in the video context. With\nthese unique designs, we further incorporate contextual information and a\ncontrastive loss into our architecture to generate more smooth and contextual\nADs. Experiments on the MAD-eval dataset show that Uni-AD can achieve\nstate-of-the-art performance on AD generation, which demonstrates the\neffectiveness of our approach. Code will be available at\nhttps://github.com/MCG-NJU/Uni-AD.\n","authors":["Hanlin Wang","Zhan Tong","Kecheng Zheng","Yujun Shen","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12920v1","updated":"2024-03-19T17:23:44Z","published":"2024-03-19T17:23:44Z","title":"Semantic Layering in Room Segmentation via LLMs","summary":" In this paper, we introduce Semantic Layering in Room Segmentation via LLMs\n(SeLRoS), an advanced method for semantic room segmentation by integrating\nLarge Language Models (LLMs) with traditional 2D map-based segmentation. Unlike\nprevious approaches that solely focus on the geometric segmentation of indoor\nenvironments, our work enriches segmented maps with semantic data, including\nobject identification and spatial relationships, to enhance robotic navigation.\nBy leveraging LLMs, we provide a novel framework that interprets and organizes\ncomplex information about each segmented area, thereby improving the accuracy\nand contextual relevance of room segmentation. 
Furthermore, SeLRoS overcomes\nthe limitations of existing algorithms by using a semantic evaluation method to\naccurately distinguish true room divisions from those erroneously generated by\nfurniture and segmentation inaccuracies. The effectiveness of SeLRoS is\nverified through its application across 30 different 3D environments. Source\ncode and experiment videos for this work are available at:\nhttps://sites.google.com/view/selros.\n","authors":["Taehyeon Kim","Byung-Cheol Min"],"pdf_url":"https://arxiv.org/pdf/2403.12920v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11942v2","updated":"2024-03-19T17:20:59Z","published":"2024-03-18T16:36:54Z","title":"Exploring Facial Expression Recognition through Semi-Supervised\n Pretraining and Temporal Modeling","summary":" Facial Expression Recognition (FER) plays a crucial role in computer vision\nand finds extensive applications across various fields. This paper aims to\npresent our approach for the upcoming 6th Affective Behavior Analysis\nin-the-Wild (ABAW) competition, scheduled to be held at CVPR2024. In the facial\nexpression recognition task, The limited size of the FER dataset poses a\nchallenge to the expression recognition model's generalization ability,\nresulting in subpar recognition performance. To address this problem, we employ\na semi-supervised learning technique to generate expression category\npseudo-labels for unlabeled face data. At the same time, we uniformly sampled\nthe labeled facial expression samples and implemented a debiased feedback\nlearning strategy to address the problem of category imbalance in the dataset\nand the possible data bias in semi-supervised learning. Moreover, to further\ncompensate for the limitation and bias of features obtained only from static\nimages, we introduced a Temporal Encoder to learn and capture temporal\nrelationships between neighbouring expression image features. In the 6th ABAW\ncompetition, our method achieved outstanding results on the official validation\nset, a result that fully confirms the effectiveness and competitiveness of our\nproposed method.\n","authors":["Jun Yu","Zhihong Wei","Zhongpeng Cai","Gongpeng Zhao","Zerui Zhang","Yongqi Wang","Guochen Xie","Jichao Zhu","Wangyuan Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.11942v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15619v2","updated":"2024-03-19T17:17:50Z","published":"2023-11-27T08:32:28Z","title":"Align before Adapt: Leveraging Entity-to-Region Alignments for\n Generalizable Video Action Recognition","summary":" Large-scale visual-language pre-trained models have achieved significant\nsuccess in various video tasks. However, most existing methods follow an \"adapt\nthen align\" paradigm, which adapts pre-trained image encoders to model\nvideo-level representations and utilizes one-hot or text embedding of the\naction labels for supervision. This paradigm overlooks the challenge of mapping\nfrom static images to complicated activity concepts. In this paper, we propose\na novel \"Align before Adapt\" (ALT) paradigm. Prior to adapting to video\nrepresentation learning, we exploit the entity-to-region alignments for each\nframe. The alignments are fulfilled by matching the region-aware image\nembeddings to an offline-constructed text corpus. With the aligned entities, we\nfeed their text embeddings to a transformer-based video adapter as the queries,\nwhich can help extract the semantics of the most important entities from a\nvideo to a vector. 
This paradigm reuses the visual-language alignment of VLP\nduring adaptation and tries to explain an action by the underlying entities.\nThis helps understand actions by bridging the gap with complex activity\nsemantics, particularly when facing unfamiliar or unseen categories. ALT\ndemonstrates competitive performance while maintaining remarkably low\ncomputational costs. In fully supervised experiments, it achieves 88.1% top-1\naccuracy on Kinetics-400 with only 4947 GFLOPs. Moreover, ALT outperforms the\nprevious state-of-the-art methods in both zero-shot and few-shot experiments,\nemphasizing its superior generalizability across various learning scenarios.\n","authors":["Yifei Chen","Dapeng Chen","Ruijin Liu","Sai Zhou","Wenyuan Xue","Wei Peng"],"pdf_url":"https://arxiv.org/pdf/2311.15619v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12915v1","updated":"2024-03-19T17:12:58Z","published":"2024-03-19T17:12:58Z","title":"Ultra-High-Resolution Image Synthesis with Pyramid Diffusion Model","summary":" We introduce the Pyramid Diffusion Model (PDM), a novel architecture designed\nfor ultra-high-resolution image synthesis. PDM utilizes a pyramid latent\nrepresentation, providing a broader design space that enables more flexible,\nstructured, and efficient perceptual compression which enable AutoEncoder and\nNetwork of Diffusion to equip branches and deeper layers. To enhance PDM's\ncapabilities for generative tasks, we propose the integration of\nSpatial-Channel Attention and Res-Skip Connection, along with the utilization\nof Spectral Norm and Decreasing Dropout Strategy for the Diffusion Network and\nAutoEncoder. In summary, PDM achieves the synthesis of images with a 2K\nresolution for the first time, demonstrated on two new datasets comprising\nimages of sizes 2048x2048 pixels and 2048x1024 pixels respectively. We believe\nthat this work offers an alternative approach to designing scalable image\ngenerative models, while also providing incremental reinforcement for existing\nframeworks.\n","authors":["Jiajie Yang"],"pdf_url":"https://arxiv.org/pdf/2403.12915v1.pdf","comment":"Preprint Version"},{"id":"http://arxiv.org/abs/2401.07931v2","updated":"2024-03-19T17:07:40Z","published":"2024-01-15T19:47:14Z","title":"Vertical Federated Image Segmentation","summary":" With the popularization of AI solutions for image based problems, there has\nbeen a growing concern for both data privacy and acquisition. In a large number\nof cases, information is located on separate data silos and it can be difficult\nfor a developer to consolidate all of it in a fashion that is appropriate for\nmachine learning model development. Alongside this, a portion of these\nlocalized data regions may not have access to a labelled ground truth. This\nindicates that they have the capacity to reach conclusions numerically, but are\nnot able to assign classifications amid a lack of pertinent information. Such a\ndetermination is often negligible, especially when attempting to develop image\nbased solutions that often necessitate this capability. With this being the\ncase, we propose an innovative vertical federated learning (VFL) model\narchitecture that can operate under this common set of conditions. This is the\nfirst (and currently the only) implementation of a system that can work under\nthe constraints of a VFL environment and perform image segmentation while\nmaintaining nominal accuracies. 
We achieved this by utilizing an FCN that\nboasts the ability to operate on federates that lack labelled data and\nprivately share the respective weights with a central server, that of which\nhosts the necessary features for classification. Tests were conducted on the\nCamVid dataset in order to determine the impact of heavy feature compression\nrequired for the transfer of information between federates, as well as to reach\nnominal conclusions about the overall performance metrics when working under\nsuch constraints.\n","authors":["Paul K. Mandal","Cole Leo"],"pdf_url":"https://arxiv.org/pdf/2401.07931v2.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.11232v2","updated":"2024-03-19T17:05:57Z","published":"2023-12-18T14:30:54Z","title":"Self-Supervised Learning for Image Super-Resolution and Deblurring","summary":" Self-supervised methods have recently proved to be nearly as effective as\nsupervised methods in various imaging inverse problems, paving the way for\nlearning-based methods in scientific and medical imaging applications where\nground truth data is hard or expensive to obtain. This is the case in magnetic\nresonance imaging and computed tomography. These methods critically rely on\ninvariance to translations and/or rotations of the image distribution to learn\nfrom incomplete measurement data alone. However, existing approaches fail to\nobtain competitive performances in the problems of image super-resolution and\ndeblurring, which play a key role in most imaging systems. In this work, we\nshow that invariance to translations and rotations is insufficient to learn\nfrom measurements that only contain low-frequency information. Instead, we\npropose a new self-supervised approach that leverages the fact that many image\ndistributions are approximately scale-invariant, and that enables recovering\nhigh-frequency information lost in the measurement process. We demonstrate\nthroughout a series of experiments on real datasets that the proposed method\noutperforms other self-supervised approaches, and obtains performances on par\nwith fully supervised learning.\n","authors":["Jérémy Scanvic","Mike Davies","Patrice Abry","Julián Tachella"],"pdf_url":"https://arxiv.org/pdf/2312.11232v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11492v2","updated":"2024-03-19T17:04:35Z","published":"2024-03-18T05:53:20Z","title":"SmartRefine: A Scenario-Adaptive Refinement Framework for Efficient\n Motion Prediction","summary":" Predicting the future motion of surrounding agents is essential for\nautonomous vehicles (AVs) to operate safely in dynamic, human-robot-mixed\nenvironments. Context information, such as road maps and surrounding agents'\nstates, provides crucial geometric and semantic information for motion behavior\nprediction. To this end, recent works explore two-stage prediction frameworks\nwhere coarse trajectories are first proposed, and then used to select critical\ncontext information for trajectory refinement. However, they either incur a\nlarge amount of computation or bring limited improvement, if not both. In this\npaper, we introduce a novel scenario-adaptive refinement strategy, named\nSmartRefine, to refine prediction with minimal additional computation.\nSpecifically, SmartRefine can comprehensively adapt refinement configurations\nbased on each scenario's properties, and smartly chooses the number of\nrefinement iterations by introducing a quality score to measure the prediction\nquality and remaining refinement potential of each scenario. 
SmartRefine is\ndesigned as a generic and flexible approach that can be seamlessly integrated\ninto most state-of-the-art motion prediction models. Experiments on Argoverse\n(1 & 2) show that our method consistently improves the prediction accuracy of\nmultiple state-of-the-art prediction models. Specifically, by adding\nSmartRefine to QCNet, we outperform all published ensemble-free works on the\nArgoverse 2 leaderboard (single agent track) at submission. Comprehensive\nstudies are also conducted to ablate design choices and explore the mechanism\nbehind multi-iteration refinement. Codes are available at\nhttps://github.com/opendilab/SmartRefine/\n","authors":["Yang Zhou","Hao Shao","Letian Wang","Steven L. Waslander","Hongsheng Li","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2403.11492v2.pdf","comment":"Camera-ready version for CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12906v1","updated":"2024-03-19T17:02:07Z","published":"2024-03-19T17:02:07Z","title":"TexDreamer: Towards Zero-Shot High-Fidelity 3D Human Texture Generation","summary":" Texturing 3D humans with semantic UV maps remains a challenge due to the\ndifficulty of acquiring reasonably unfolded UV. Despite recent text-to-3D\nadvancements in supervising multi-view renderings using large text-to-image\n(T2I) models, issues persist with generation speed, text consistency, and\ntexture quality, resulting in data scarcity among existing datasets. We present\nTexDreamer, the first zero-shot multimodal high-fidelity 3D human texture\ngeneration model. Utilizing an efficient texture adaptation finetuning\nstrategy, we adapt large T2I model to a semantic UV structure while preserving\nits original generalization capability. Leveraging a novel feature translator\nmodule, the trained model is capable of generating high-fidelity 3D human\ntextures from either text or image within seconds. Furthermore, we introduce\nArTicuLated humAn textureS (ATLAS), the largest high-resolution (1024 X 1024)\n3D human texture dataset which contains 50k high-fidelity textures with text\ndescriptions.\n","authors":["Yufei Liu","Junwei Zhu","Junshu Tang","Shijie Zhang","Jiangning Zhang","Weijian Cao","Chengjie Wang","Yunsheng Wu","Dongjin Huang"],"pdf_url":"https://arxiv.org/pdf/2403.12906v1.pdf","comment":"Project Page: https://ggxxii.github.io/texdreamer/"},{"id":"http://arxiv.org/abs/2401.00420v2","updated":"2024-03-19T16:56:53Z","published":"2023-12-31T08:06:53Z","title":"SynCDR : Training Cross Domain Retrieval Models with Synthetic Data","summary":" In cross-domain retrieval, a model is required to identify images from the\nsame semantic category across two visual domains. For instance, given a sketch\nof an object, a model needs to retrieve a real image of it from an online\nstore's catalog. A standard approach for such a problem is learning a feature\nspace of images where Euclidean distances reflect similarity. Even without\nhuman annotations, which may be expensive to acquire, prior methods function\nreasonably well using unlabeled images for training. Our problem constraint\ntakes this further to scenarios where the two domains do not necessarily share\nany common categories in training data. This can occur when the two domains in\nquestion come from different versions of some biometric sensor recording\nidentities of different people. We posit a simple solution, which is to\ngenerate synthetic data to fill in these missing category examples across\ndomains. 
This, we do via category preserving translation of images from one\nvisual domain to another. We compare approaches specifically trained for this\ntranslation for a pair of domains, as well as those that can use large-scale\npre-trained text-to-image diffusion models via prompts, and find that the\nlatter can generate better replacement synthetic data, leading to more accurate\ncross-domain retrieval models. Our best SynCDR model can outperform prior art\nby up to 15\\%. Code for our work is available at\nhttps://github.com/samarth4149/SynCDR .\n","authors":["Samarth Mishra","Carlos D. Castillo","Hongcheng Wang","Kate Saenko","Venkatesh Saligrama"],"pdf_url":"https://arxiv.org/pdf/2401.00420v2.pdf","comment":"Pre-print"},{"id":"http://arxiv.org/abs/2403.12895v1","updated":"2024-03-19T16:48:40Z","published":"2024-03-19T16:48:40Z","title":"mPLUG-DocOwl 1.5: Unified Structure Learning for OCR-free Document\n Understanding","summary":" Structure information is critical for understanding the semantics of\ntext-rich images, such as documents, tables, and charts. Existing Multimodal\nLarge Language Models (MLLMs) for Visual Document Understanding are equipped\nwith text recognition ability but lack general structure understanding\nabilities for text-rich document images. In this work, we emphasize the\nimportance of structure information in Visual Document Understanding and\npropose the Unified Structure Learning to boost the performance of MLLMs. Our\nUnified Structure Learning comprises structure-aware parsing tasks and\nmulti-grained text localization tasks across 5 domains: document, webpage,\ntable, chart, and natural image. To better encode structure information, we\ndesign a simple and effective vision-to-text module H-Reducer, which can not\nonly maintain the layout information but also reduce the length of visual\nfeatures by merging horizontal adjacent patches through convolution, enabling\nthe LLM to understand high-resolution images more efficiently. Furthermore, by\nconstructing structure-aware text sequences and multi-grained pairs of texts\nand bounding boxes for publicly available text-rich images, we build a\ncomprehensive training set DocStruct4M to support structure learning. Finally,\nwe construct a small but high-quality reasoning tuning dataset DocReason25K to\ntrigger the detailed explanation ability in the document domain. Our model\nDocOwl 1.5 achieves state-of-the-art performance on 10 visual document\nunderstanding benchmarks, improving the SOTA performance of MLLMs with a 7B LLM\nby more than 10 points in 5/10 benchmarks. Our codes, models, and datasets are\npublicly available at\nhttps://github.com/X-PLUG/mPLUG-DocOwl/tree/main/DocOwl1.5.\n","authors":["Anwen Hu","Haiyang Xu","Jiabo Ye","Ming Yan","Liang Zhang","Bo Zhang","Chen Li","Ji Zhang","Qin Jin","Fei Huang","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.12895v1.pdf","comment":"21 pages, 15 figures"},{"id":"http://arxiv.org/abs/2403.12894v1","updated":"2024-03-19T16:46:29Z","published":"2024-03-19T16:46:29Z","title":"MEDBind: Unifying Language and Multimodal Medical Data Embeddings","summary":" Medical vision-language pretraining models (VLPM) have achieved remarkable\nprogress in fusing chest X-rays (CXR) with clinical texts, introducing\nimage-text data binding approaches that enable zero-shot learning and\ndownstream clinical tasks. 
However, the current landscape lacks the holistic\nintegration of additional medical modalities, such as electrocardiograms (ECG).\nWe present MEDBind (Medical Electronic patient recorD), which learns joint\nembeddings across CXR, ECG, and medical text. Using text data as the central\nanchor, MEDBind features tri-modality binding, delivering competitive\nperformance in top-K retrieval, zero-shot, and few-shot benchmarks against\nestablished VLPM, and the ability for CXR-to-ECG zero-shot classification and\nretrieval. This seamless integration is achieved through combination of\ncontrastive loss on modality-text pairs with our proposed contrastive loss\nfunction, Edge-Modality Contrastive Loss, fostering a cohesive embedding space\nfor CXR, ECG, and text. Finally, we demonstrate that MEDBind can improve\ndownstream tasks by directly integrating CXR and ECG embeddings into a\nlarge-language model for multimodal prompt tuning.\n","authors":["Yuan Gao","Sangwook Kim","David E Austin","Chris McIntosh"],"pdf_url":"https://arxiv.org/pdf/2403.12894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12032v2","updated":"2024-03-19T16:45:22Z","published":"2024-03-18T17:59:09Z","title":"Generic 3D Diffusion Adapter Using Controlled Multi-View Editing","summary":" Open-domain 3D object synthesis has been lagging behind image synthesis due\nto limited data and higher computational complexity. To bridge this gap, recent\nworks have investigated multi-view diffusion but often fall short in either 3D\nconsistency, visual quality, or efficiency. This paper proposes MVEdit, which\nfunctions as a 3D counterpart of SDEdit, employing ancestral sampling to\njointly denoise multi-view images and output high-quality textured meshes.\nBuilt on off-the-shelf 2D diffusion models, MVEdit achieves 3D consistency\nthrough a training-free 3D Adapter, which lifts the 2D views of the last\ntimestep into a coherent 3D representation, then conditions the 2D views of the\nnext timestep using rendered views, without uncompromising visual quality. With\nan inference time of only 2-5 minutes, this framework achieves better trade-off\nbetween quality and speed than score distillation. MVEdit is highly versatile\nand extendable, with a wide range of applications including text/image-to-3D\ngeneration, 3D-to-3D editing, and high-quality texture synthesis. In\nparticular, evaluations demonstrate state-of-the-art performance in both\nimage-to-3D and text-guided texture generation tasks. Additionally, we\nintroduce a method for fine-tuning 2D latent diffusion models on small 3D\ndatasets with limited resources, enabling fast low-resolution text-to-3D\ninitialization.\n","authors":["Hansheng Chen","Ruoxi Shi","Yulin Liu","Bokui Shen","Jiayuan Gu","Gordon Wetzstein","Hao Su","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2403.12032v2.pdf","comment":"V2 note: Fix missing acknowledgements. Project page:\n https://lakonik.github.io/mvedit"},{"id":"http://arxiv.org/abs/2403.12891v1","updated":"2024-03-19T16:40:57Z","published":"2024-03-19T16:40:57Z","title":"Adaptive Visual Imitation Learning for Robotic Assisted Feeding Across\n Varied Bowl Configurations and Food Types","summary":" In this study, we introduce a novel visual imitation network with a spatial\nattention module for robotic assisted feeding (RAF). The goal is to acquire\n(i.e., scoop) food items from a bowl. However, achieving robust and adaptive\nfood manipulation is particularly challenging. 
To deal with this, we propose a\nframework that integrates visual perception with imitation learning to enable\nthe robot to handle diverse scenarios during scooping. Our approach, named AVIL\n(adaptive visual imitation learning), exhibits adaptability and robustness\nacross different bowl configurations in terms of material, size, and position,\nas well as diverse food types including granular, semi-solid, and liquid, even\nin the presence of distractors. We validate the effectiveness of our approach\nby conducting experiments on a real robot. We also compare its performance with\na baseline. The results demonstrate improvement over the baseline across all\nscenarios, with an enhancement of up to 2.5 times in terms of a success metric.\nNotably, our model, trained solely on data from a transparent glass bowl\ncontaining granular cereals, showcases generalization ability when tested\nzero-shot on other bowl configurations with different types of food.\n","authors":["Rui Liu","Amisha Bhaskar","Pratap Tokekar"],"pdf_url":"https://arxiv.org/pdf/2403.12891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01838v2","updated":"2024-03-19T16:40:25Z","published":"2023-04-04T14:44:06Z","title":"BugNIST - a Large Volumetric Dataset for Object Detection under Domain\n Shift","summary":" Domain shift significantly influences the performance of deep learning\nalgorithms, particularly for object detection within volumetric 3D images.\nAnnotated training data is essential for deep learning-based object detection.\nHowever, annotating densely packed objects is time-consuming and costly.\nInstead, we suggest training models on individually scanned objects, causing a\ndomain shift between training and detection data. To address this challenge, we\nintroduce the BugNIST dataset, comprising 9154 micro-CT volumes of 12 bug types\nand 388 volumes of tightly packed bug mixtures. This dataset is characterized\nby having objects with the same appearance in the source and target domain,\nwhich is uncommon for other benchmark datasets for domain shift. During\ntraining, individual bug volumes labeled by class are utilized, while testing\nemploys mixtures with center point annotations and bug type labels. Together\nwith the dataset, we provide a baseline detection analysis, aiming at advancing\nthe field of 3D object detection methods.\n","authors":["Patrick Møller Jensen","Vedrana Andersen Dahl","Carsten Gundlach","Rebecca Engberg","Hans Martin Kjer","Anders Bjorholm Dahl"],"pdf_url":"https://arxiv.org/pdf/2304.01838v2.pdf","comment":"20 pages, 6 figures, 2 tables"},{"id":"http://arxiv.org/abs/2403.09611v2","updated":"2024-03-19T16:37:13Z","published":"2024-03-14T17:51:32Z","title":"MM1: Methods, Analysis & Insights from Multimodal LLM Pre-training","summary":" In this work, we discuss building performant Multimodal Large Language Models\n(MLLMs). In particular, we study the importance of various architecture\ncomponents and data choices. Through careful and comprehensive ablations of the\nimage encoder, the vision language connector, and various pre-training data\nchoices, we identified several crucial design lessons. For example, we\ndemonstrate that for large-scale multimodal pre-training using a careful mix of\nimage-caption, interleaved image-text, and text-only data is crucial for\nachieving state-of-the-art (SOTA) few-shot results across multiple benchmarks,\ncompared to other published pre-training results. 
Further, we show that the\nimage encoder together with image resolution and the image token count has\nsubstantial impact, while the vision-language connector design is of\ncomparatively negligible importance. By scaling up the presented recipe, we\nbuild MM1, a family of multimodal models up to 30B parameters, including both\ndense models and mixture-of-experts (MoE) variants, that are SOTA in\npre-training metrics and achieve competitive performance after supervised\nfine-tuning on a range of established multimodal benchmarks. Thanks to\nlarge-scale pre-training, MM1 enjoys appealing properties such as enhanced\nin-context learning, and multi-image reasoning, enabling few-shot\nchain-of-thought prompting.\n","authors":["Brandon McKinzie","Zhe Gan","Jean-Philippe Fauconnier","Sam Dodge","Bowen Zhang","Philipp Dufter","Dhruti Shah","Xianzhi Du","Futang Peng","Floris Weers","Anton Belyi","Haotian Zhang","Karanjeet Singh","Doug Kang","Ankur Jain","Hongyu Hè","Max Schwarzer","Tom Gunter","Xiang Kong","Aonan Zhang","Jianyu Wang","Chong Wang","Nan Du","Tao Lei","Sam Wiseman","Mark Lee","Zirui Wang","Ruoming Pang","Peter Grasch","Alexander Toshev","Yinfei Yang"],"pdf_url":"https://arxiv.org/pdf/2403.09611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03094v2","updated":"2024-03-19T16:34:28Z","published":"2024-02-05T15:25:32Z","title":"Cross-Domain Few-Shot Object Detection via Enhanced Open-Set Object\n Detector","summary":" This paper studies the challenging cross-domain few-shot object detection\n(CD-FSOD), aiming to develop an accurate object detector for novel domains with\nminimal labeled examples. While transformer-based open-set detectors, such as\nDE-ViT, show promise in traditional few-shot object detection, their\ngeneralization to CD-FSOD remains unclear: 1) can such open-set detection\nmethods easily generalize to CD-FSOD? 2) If not, how can models be enhanced\nwhen facing huge domain gaps? To answer the first question, we employ measures\nincluding style, inter-class variance (ICV), and indefinable boundaries (IB) to\nunderstand the domain gap. Based on these measures, we establish a new\nbenchmark named CD-FSOD to evaluate object detection methods, revealing that\nmost of the current approaches fail to generalize across domains. Technically,\nwe observe that the performance decline is associated with our proposed\nmeasures: style, ICV, and IB. Consequently, we propose several novel modules to\naddress these issues. First, the learnable instance features align initial\nfixed instances with target categories, enhancing feature distinctiveness.\nSecond, the instance reweighting module assigns higher importance to\nhigh-quality instances with slight IB. Third, the domain prompter encourages\nfeatures resilient to different styles by synthesizing imaginary domains\nwithout altering semantic contents. These techniques collectively contribute to\nthe development of the Cross-Domain Vision Transformer for CD-FSOD (CD-ViTO),\nsignificantly improving upon the base DE-ViT. Experimental results validate the\nefficacy of our model. 
All datasets, codes, and models will be released to the\ncommunity.\n","authors":["Yuqian Fu","Yu Wang","Yixuan Pan","Lian Huai","Xingyu Qiu","Zeyu Shangguan","Tong Liu","Yanwei Fu","Luc Van Gool","Xingqun Jiang"],"pdf_url":"https://arxiv.org/pdf/2402.03094v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12886v1","updated":"2024-03-19T16:33:26Z","published":"2024-03-19T16:33:26Z","title":"EmoVOCA: Speech-Driven Emotional 3D Talking Heads","summary":" The domain of 3D talking head generation has witnessed significant progress\nin recent years. A notable challenge in this field consists in blending\nspeech-related motions with expression dynamics, which is primarily caused by\nthe lack of comprehensive 3D datasets that combine diversity in spoken\nsentences with a variety of facial expressions. Whereas literature works\nattempted to exploit 2D video data and parametric 3D models as a workaround,\nthese still show limitations when jointly modeling the two motions. In this\nwork, we address this problem from a different perspective, and propose an\ninnovative data-driven technique that we used for creating a synthetic dataset,\ncalled EmoVOCA, obtained by combining a collection of inexpressive 3D talking\nheads and a set of 3D expressive sequences. To demonstrate the advantages of\nthis approach, and the quality of the dataset, we then designed and trained an\nemotional 3D talking head generator that accepts a 3D face, an audio file, an\nemotion label, and an intensity value as inputs, and learns to animate the\naudio-synchronized lip movements with expressive traits of the face.\nComprehensive experiments, both quantitative and qualitative, using our data\nand generator evidence superior ability in synthesizing convincing animations,\nwhen compared with the best performing methods in the literature. Our code and\npre-trained model will be made available.\n","authors":["Federico Nocentini","Claudio Ferrari","Stefano Berretti"],"pdf_url":"https://arxiv.org/pdf/2403.12886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12884v1","updated":"2024-03-19T16:31:30Z","published":"2024-03-19T16:31:30Z","title":"HYDRA: A Hyper Agent for Dynamic Compositional Visual Reasoning","summary":" Recent advances in visual reasoning (VR), particularly with the aid of Large\nVision-Language Models (VLMs), show promise but require access to large-scale\ndatasets and face challenges such as high computational costs and limited\ngeneralization capabilities. Compositional visual reasoning approaches have\nemerged as effective strategies; however, they heavily rely on the commonsense\nknowledge encoded in Large Language Models (LLMs) to perform planning,\nreasoning, or both, without considering the effect of their decisions on the\nvisual reasoning process, which can lead to errors or failed procedures. To\naddress these challenges, we introduce HYDRA, a multi-stage dynamic\ncompositional visual reasoning framework designed for reliable and\nincrementally progressive general reasoning. HYDRA integrates three essential\nmodules: a planner, a Reinforcement Learning (RL) agent serving as a cognitive\ncontroller, and a reasoner. The planner and reasoner modules utilize an LLM to\ngenerate instruction samples and executable code from the selected instruction,\nrespectively, while the RL agent dynamically interacts with these modules,\nmaking high-level decisions on selection of the best instruction sample given\ninformation from the historical state stored through a feedback loop. 
This\nadaptable design enables HYDRA to adjust its actions based on previous feedback\nreceived during the reasoning process, leading to more reliable reasoning\noutputs and ultimately enhancing its overall effectiveness. Our framework\ndemonstrates state-of-the-art performance in various VR tasks on four different\nwidely-used datasets.\n","authors":["Fucai Ke","Zhixi Cai","Simindokht Jahangard","Weiqing Wang","Pari Delir Haghighi","Hamid Rezatofighi"],"pdf_url":"https://arxiv.org/pdf/2403.12884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12883v1","updated":"2024-03-19T16:29:59Z","published":"2024-03-19T16:29:59Z","title":"Confusing Pair Correction Based on Category Prototype for Domain\n Adaptation under Noisy Environments","summary":" In this paper, we address unsupervised domain adaptation under noisy\nenvironments, which is more challenging and practical than traditional domain\nadaptation. In this scenario, the model is prone to overfitting noisy labels,\nresulting in a more pronounced domain shift and a notable decline in the\noverall model performance. Previous methods employed prototype methods for\ndomain adaptation on robust feature spaces. However, these approaches struggle\nto effectively classify classes with similar features under noisy environments.\nTo address this issue, we propose a new method to detect and correct confusing\nclass pair. We first divide classes into easy and hard classes based on the\nsmall loss criterion. We then leverage the top-2 predictions for each sample\nafter aligning the source and target domain to find the confusing pair in the\nhard classes. We apply label correction to the noisy samples within the\nconfusing pair. With the proposed label correction method, we can train our\nmodel with more accurate labels. Extensive experiments confirm the\neffectiveness of our method and demonstrate its favorable performance compared\nwith existing state-of-the-art methods. Our codes are publicly available at\nhttps://github.com/Hehxcf/CPC/.\n","authors":["Churan Zhi","Junbao Zhuo","Shuhui Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12883v1.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2403.12870v1","updated":"2024-03-19T16:15:08Z","published":"2024-03-19T16:15:08Z","title":"PoNQ: a Neural QEM-based Mesh Representation","summary":" Although polygon meshes have been a standard representation in geometry\nprocessing, their irregular and combinatorial nature hinders their suitability\nfor learning-based applications. In this work, we introduce a novel learnable\nmesh representation through a set of local 3D sample Points and their\nassociated Normals and Quadric error metrics (QEM) w.r.t. the underlying shape,\nwhich we denote PoNQ. A global mesh is directly derived from PoNQ by\nefficiently leveraging the knowledge of the local quadric errors. Besides\nmarking the first use of QEM within a neural shape representation, our\ncontribution guarantees both topological and geometrical properties by ensuring\nthat a PoNQ mesh does not self-intersect and is always the boundary of a\nvolume. Notably, our representation does not rely on a regular grid, is\nsupervised directly by the target surface alone, and also handles open surfaces\nwith boundaries and/or sharp features. 
We demonstrate the efficacy of PoNQ\nthrough a learning-based mesh prediction from SDF grids and show that our\nmethod surpasses recent state-of-the-art techniques in terms of both surface\nand edge-based metrics.\n","authors":["Nissim Maruani","Maks Ovsjanikov","Pierre Alliez","Mathieu Desbrun"],"pdf_url":"https://arxiv.org/pdf/2403.12870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18451v2","updated":"2024-03-19T16:09:37Z","published":"2024-02-28T16:24:08Z","title":"MambaMIR: An Arbitrary-Masked Mamba for Joint Medical Image\n Reconstruction and Uncertainty Estimation","summary":" The recent Mamba model has shown remarkable adaptability for visual\nrepresentation learning, including in medical imaging tasks. This study\nintroduces MambaMIR, a Mamba-based model for medical image reconstruction, as\nwell as its Generative Adversarial Network-based variant, MambaMIR-GAN. Our\nproposed MambaMIR inherits several advantages, such as linear complexity,\nglobal receptive fields, and dynamic weights, from the original Mamba model.\nThe innovated arbitrary-mask mechanism effectively adapt Mamba to our image\nreconstruction task, providing randomness for subsequent Monte Carlo-based\nuncertainty estimation. Experiments conducted on various medical image\nreconstruction tasks, including fast MRI and SVCT, which cover anatomical\nregions such as the knee, chest, and abdomen, have demonstrated that MambaMIR\nand MambaMIR-GAN achieve comparable or superior reconstruction results relative\nto state-of-the-art methods. Additionally, the estimated uncertainty maps offer\nfurther insights into the reliability of the reconstruction quality. The code\nis publicly available at https://github.com/ayanglab/MambaMIR.\n","authors":["Jiahao Huang","Liutao Yang","Fanwen Wang","Yinzhe Wu","Yang Nan","Angelica I. Aviles-Rivero","Carola-Bibiane Schönlieb","Daoqiang Zhang","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2402.18451v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03799v2","updated":"2024-03-19T16:08:37Z","published":"2023-12-06T14:58:03Z","title":"Low-power, Continuous Remote Behavioral Localization with Event Cameras","summary":" Researchers in natural science need reliable methods for quantifying animal\nbehavior. Recently, numerous computer vision methods emerged to automate the\nprocess. However, observing wild species at remote locations remains a\nchallenging task due to difficult lighting conditions and constraints on power\nsupply and data storage. Event cameras offer unique advantages for\nbattery-dependent remote monitoring due to their low power consumption and high\ndynamic range capabilities. We use this novel sensor to quantify a behavior in\nChinstrap penguins called ecstatic display. We formulate the problem as a\ntemporal action detection task, determining the start and end times of the\nbehavior. For this purpose, we recorded a colony of breeding penguins in\nAntarctica for several weeks and labeled event data on 16 nests. The developed\nmethod consists of a generator of candidate time intervals (proposals) and a\nclassifier of the actions within them. The experiments show that the event\ncameras' natural response to motion is effective for continuous behavior\nmonitoring and detection, reaching a mean average precision (mAP) of 58% (which\nincreases to 63% in good weather conditions). The results also demonstrate the\nrobustness against various lighting conditions contained in the challenging\ndataset. 
The low-power capabilities of the event camera allow it to record\nsignificantly longer than with a conventional camera. This work pioneers the\nuse of event cameras for remote wildlife observation, opening new\ninterdisciplinary opportunities. https://tub-rip.github.io/eventpenguins/\n","authors":["Friedhelm Hamann","Suman Ghosh","Ignacio Juarez Martinez","Tom Hart","Alex Kacelnik","Guillermo Gallego"],"pdf_url":"https://arxiv.org/pdf/2312.03799v2.pdf","comment":"13 pages, 8 figures, 12 tables, Project page:\n https://tub-rip.github.io/eventpenguins/"},{"id":"http://arxiv.org/abs/2203.07738v4","updated":"2024-03-19T16:03:09Z","published":"2022-03-15T09:13:35Z","title":"GCT: Graph Co-Training for Semi-Supervised Few-Shot Learning","summary":" Few-shot learning (FSL), purposing to resolve the problem of data-scarce, has\nattracted considerable attention in recent years. A popular FSL framework\ncontains two phases: (i) the pre-train phase employs the base data to train a\nCNN-based feature extractor. (ii) the meta-test phase applies the frozen\nfeature extractor to novel data (novel data has different categories from base\ndata) and designs a classifier for recognition. To correct few-shot data\ndistribution, researchers propose Semi-Supervised Few-Shot Learning (SSFSL) by\nintroducing unlabeled data. Although SSFSL has been proved to achieve\noutstanding performances in the FSL community, there still exists a fundamental\nproblem: the pre-trained feature extractor can not adapt to the novel data\nflawlessly due to the cross-category setting. Usually, large amounts of noises\nare introduced to the novel feature. We dub it as Feature-Extractor-Maladaptive\n(FEM) problem. To tackle FEM, we make two efforts in this paper. First, we\npropose a novel label prediction method, Isolated Graph Learning (IGL). IGL\nintroduces the Laplacian operator to encode the raw data to graph space, which\nhelps reduce the dependence on features when classifying, and then project\ngraph representation to label space for prediction. The key point is that: IGL\ncan weaken the negative influence of noise from the feature representation\nperspective, and is also flexible to independently complete training and\ntesting procedures, which is suitable for SSFSL. Second, we propose Graph\nCo-Training (GCT) to tackle this challenge from a multi-modal fusion\nperspective by extending the proposed IGL to the co-training framework. GCT is\na semi-supervised method that exploits the unlabeled samples with two modal\nfeatures to crossly strengthen the IGL classifier.\n","authors":["Rui Xu","Lei Xing","Shuai Shao","Lifei Zhao","Baodi Liu","Weifeng Liu","Yicong Zhou"],"pdf_url":"https://arxiv.org/pdf/2203.07738v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12852v1","updated":"2024-03-19T15:57:04Z","published":"2024-03-19T15:57:04Z","title":"Generative Enhancement for 3D Medical Images","summary":" The limited availability of 3D medical image datasets, due to privacy\nconcerns and high collection or annotation costs, poses significant challenges\nin the field of medical imaging. While a promising alternative is the use of\nsynthesized medical data, there are few solutions for realistic 3D medical\nimage synthesis due to difficulties in backbone design and fewer 3D training\nsamples compared to 2D counterparts. In this paper, we propose GEM-3D, a novel\ngenerative approach to the synthesis of 3D medical images and the enhancement\nof existing datasets using conditional diffusion models. 
Our method begins with\na 2D slice, noted as the informed slice to serve the patient prior, and\npropagates the generation process using a 3D segmentation mask. By decomposing\nthe 3D medical images into masks and patient prior information, GEM-3D offers a\nflexible yet effective solution for generating versatile 3D images from\nexisting datasets. GEM-3D can enable dataset enhancement by combining informed\nslice selection and generation at random positions, along with editable mask\nvolumes to introduce large variations in diffusion sampling. Moreover, as the\ninformed slice contains patient-wise information, GEM-3D can also facilitate\ncounterfactual image synthesis and dataset-level de-enhancement with desired\ncontrol. Experiments on brain MRI and abdomen CT images demonstrate that GEM-3D\nis capable of synthesizing high-quality 3D medical images with volumetric\nconsistency, offering a straightforward solution for dataset enhancement during\ninference. The code is available at https://github.com/HKU-MedAI/GEM-3D.\n","authors":["Lingting Zhu","Noel Codella","Dongdong Chen","Zhenchao Jin","Lu Yuan","Lequan Yu"],"pdf_url":"https://arxiv.org/pdf/2403.12852v1.pdf","comment":"19 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.12848v1","updated":"2024-03-19T15:54:48Z","published":"2024-03-19T15:54:48Z","title":"Compositional 3D Scene Synthesis with Scene Graph Guided Layout-Shape\n Generation","summary":" Compositional 3D scene synthesis has diverse applications across a spectrum\nof industries such as robotics, films, and video games, as it closely mirrors\nthe complexity of real-world multi-object environments. Early works typically\nemploy shape retrieval based frameworks which naturally suffer from limited\nshape diversity. Recent progresses have been made in shape generation with\npowerful generative models, such as diffusion models, which increases the shape\nfidelity. However, these approaches separately treat 3D shape generation and\nlayout generation. The synthesized scenes are usually hampered by layout\ncollision, which implies that the scene-level fidelity is still under-explored.\nIn this paper, we aim at generating realistic and reasonable 3D scenes from\nscene graph. To enrich the representation capability of the given scene graph\ninputs, large language model is utilized to explicitly aggregate the global\ngraph features with local relationship features. With a unified graph\nconvolution network (GCN), graph features are extracted from scene graphs\nupdated via joint layout-shape distribution. During scene generation, an\nIoU-based regularization loss is introduced to constrain the predicted 3D\nlayouts. Benchmarked on the SG-FRONT dataset, our method achieves better 3D\nscene synthesis, especially in terms of scene-level fidelity. The source code\nwill be released after publication.\n","authors":["Yao Wei","Martin Renqiang Min","George Vosselman","Li Erran Li","Michael Ying Yang"],"pdf_url":"https://arxiv.org/pdf/2403.12848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02317v3","updated":"2024-03-19T15:48:17Z","published":"2024-01-04T15:34:44Z","title":"BA-SAM: Scalable Bias-Mode Attention Mask for Segment Anything Model","summary":" In this paper, we address the challenge of image resolution variation for the\nSegment Anything Model (SAM). SAM, known for its zero-shot generalizability,\nexhibits a performance degradation when faced with datasets with varying image\nsizes. 
Previous approaches tend to resize the image to a fixed size or adopt\nstructure modifications, hindering the preservation of SAM's rich prior\nknowledge. Besides, such task-specific tuning necessitates a complete\nretraining of the model, which is cost-expensive and unacceptable for\ndeployment in the downstream tasks. In this paper, we reformulate this issue as\na length extrapolation problem, where token sequence length varies while\nmaintaining a consistent patch size for images of different sizes. To this end,\nwe propose Scalable Bias-Mode Attention Mask (BA-SAM) to enhance SAM's\nadaptability to varying image resolutions while eliminating the need for\nstructure modifications. Firstly, we introduce a new scaling factor to ensure\nconsistent magnitude in the attention layer's dot product values when the token\nsequence length changes. Secondly, we present a bias-mode attention mask that\nallows each token to prioritize neighboring information, mitigating the impact\nof untrained distant information. Our BA-SAM demonstrates efficacy in two\nscenarios: zero-shot and fine-tuning. Extensive evaluation on diverse datasets,\nincluding DIS5K, DUTS, ISIC, COD10K, and COCO, reveals its ability to\nsignificantly mitigate performance degradation in the zero-shot setting and\nachieve state-of-the-art performance with minimal fine-tuning. Furthermore, we\npropose a generalized model and benchmark, showcasing BA-SAM's generalizability\nacross all four datasets simultaneously.\n","authors":["Yiran Song","Qianyu Zhou","Xiangtai Li","Deng-Ping Fan","Xuequan Lu","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2401.02317v3.pdf","comment":"Code:https://github.com/zongzi13545329/BA-SAM"},{"id":"http://arxiv.org/abs/2403.12839v1","updated":"2024-03-19T15:45:54Z","published":"2024-03-19T15:45:54Z","title":"Global-guided Focal Neural Radiance Field for Large-scale Scene\n Rendering","summary":" Neural radiance fields~(NeRF) have recently been applied to render\nlarge-scale scenes. However, their limited model capacity typically results in\nblurred rendering results. Existing large-scale NeRFs primarily address this\nlimitation by partitioning the scene into blocks, which are subsequently\nhandled by separate sub-NeRFs. These sub-NeRFs, trained from scratch and\nprocessed independently, lead to inconsistencies in geometry and appearance\nacross the scene. Consequently, the rendering quality fails to exhibit\nsignificant improvement despite the expansion of model capacity. In this work,\nwe present global-guided focal neural radiance field (GF-NeRF) that achieves\nhigh-fidelity rendering of large-scale scenes. Our proposed GF-NeRF utilizes a\ntwo-stage (Global and Focal) architecture and a global-guided training\nstrategy. The global stage obtains a continuous representation of the entire\nscene while the focal stage decomposes the scene into multiple blocks and\nfurther processes them with distinct sub-encoders. Leveraging this two-stage\narchitecture, sub-encoders only need fine-tuning based on the global encoder,\nthus reducing training complexity in the focal stage while maintaining\nscene-wide consistency. Spatial information and error information from the\nglobal stage also benefit the sub-encoders to focus on crucial areas and\neffectively capture more details of large-scale scenes. Notably, our approach\ndoes not rely on any prior knowledge about the target scene, attributing\nGF-NeRF adaptable to various large-scale scene types, including street-view and\naerial-view scenes. 
We demonstrate that our method achieves high-fidelity,\nnatural rendering results on various types of large-scale datasets. Our project\npage: https://shaomq2187.github.io/GF-NeRF/\n","authors":["Mingqi Shao","Feng Xiong","Hang Zhang","Shuang Yang","Mu Xu","Wei Bian","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12835v1","updated":"2024-03-19T15:41:39Z","published":"2024-03-19T15:41:39Z","title":"AnySkill: Learning Open-Vocabulary Physical Skill for Interactive Agents","summary":" Traditional approaches in physics-based motion generation, centered around\nimitation learning and reward shaping, often struggle to adapt to new\nscenarios. To tackle this limitation, we propose AnySkill, a novel hierarchical\nmethod that learns physically plausible interactions following open-vocabulary\ninstructions. Our approach begins by developing a set of atomic actions via a\nlow-level controller trained via imitation learning. Upon receiving an\nopen-vocabulary textual instruction, AnySkill employs a high-level policy that\nselects and integrates these atomic actions to maximize the CLIP similarity\nbetween the agent's rendered images and the text. An important feature of our\nmethod is the use of image-based rewards for the high-level policy, which\nallows the agent to learn interactions with objects without manual reward\nengineering. We demonstrate AnySkill's capability to generate realistic and\nnatural motion sequences in response to unseen instructions of varying lengths,\nmarking it the first method capable of open-vocabulary physical skill learning\nfor interactive humanoid agents.\n","authors":["Jieming Cui","Tengyu Liu","Nian Liu","Yaodong Yang","Yixin Zhu","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2403.12835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12834v1","updated":"2024-03-19T15:41:16Z","published":"2024-03-19T15:41:16Z","title":"Embarrassingly Simple Scribble Supervision for 3D Medical Segmentation","summary":" Traditionally, segmentation algorithms require dense annotations for\ntraining, demanding significant annotation efforts, particularly within the 3D\nmedical imaging field. Scribble-supervised learning emerges as a possible\nsolution to this challenge, promising a reduction in annotation efforts when\ncreating large-scale datasets. Recently, a plethora of methods for optimized\nlearning from scribbles have been proposed, but have so far failed to position\nscribble annotation as a beneficial alternative. We relate this shortcoming to\ntwo major issues: 1) the complex nature of many methods which deeply ties them\nto the underlying segmentation model, thus preventing a migration to more\npowerful state-of-the-art models as the field progresses and 2) the lack of a\nsystematic evaluation to validate consistent performance across the broader\nmedical domain, resulting in a lack of trust when applying these methods to new\nsegmentation problems. To address these issues, we propose a comprehensive\nscribble supervision benchmark consisting of seven datasets covering a diverse\nset of anatomies and pathologies imaged with varying modalities. We furthermore\npropose the systematic use of partial losses, i.e. losses that are only\ncomputed on annotated voxels. Contrary to most existing methods, these losses\ncan be seamlessly integrated into state-of-the-art segmentation methods,\nenabling them to learn from scribble annotations while preserving their\noriginal loss formulations. 
Our evaluation using nnU-Net reveals that while\nmost existing methods suffer from a lack of generalization, the proposed\napproach consistently delivers state-of-the-art performance. Thanks to its\nsimplicity, our approach presents an embarrassingly simple yet effective\nsolution to the challenges of scribble supervision. Source code as well as our\nextensive scribble benchmarking suite will be made publicly available upon\npublication.\n","authors":["Karol Gotkowski","Carsten Lüth","Paul F. Jäger","Sebastian Ziegler","Lars Krämer","Stefan Denner","Shuhan Xiao","Nico Disch","Klaus H. Maier-Hein","Fabian Isensee"],"pdf_url":"https://arxiv.org/pdf/2403.12834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02934v3","updated":"2024-03-19T15:33:17Z","published":"2023-12-05T18:05:14Z","title":"WoVoGen: World Volume-aware Diffusion for Controllable Multi-camera\n Driving Scene Generation","summary":" Generating multi-camera street-view videos is critical for augmenting\nautonomous driving datasets, addressing the urgent demand for extensive and\nvaried data. Due to the limitations in diversity and challenges in handling\nlighting conditions, traditional rendering-based methods are increasingly being\nsupplanted by diffusion-based methods. However, a significant challenge in\ndiffusion-based methods is ensuring that the generated sensor data preserve\nboth intra-world consistency and inter-sensor coherence. To address these\nchallenges, we combine an additional explicit world volume and propose the\nWorld Volume-aware Multi-camera Driving Scene Generator (WoVoGen). This system\nis specifically designed to leverage 4D world volume as a foundational element\nfor video generation. Our model operates in two distinct phases: (i)\nenvisioning the future 4D temporal world volume based on vehicle control\nsequences, and (ii) generating multi-camera videos, informed by this envisioned\n4D temporal world volume and sensor interconnectivity. The incorporation of the\n4D world volume empowers WoVoGen not only to generate high-quality street-view\nvideos in response to vehicle control inputs but also to facilitate scene\nediting tasks.\n","authors":["Jiachen Lu","Ze Huang","Zeyu Yang","Jiahui Zhang","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.02934v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12816v1","updated":"2024-03-19T15:15:19Z","published":"2024-03-19T15:15:19Z","title":"Re-identification from histopathology images","summary":" In numerous studies, deep learning algorithms have proven their potential for\nthe analysis of histopathology images, for example, for revealing the subtypes\nof tumors or the primary origin of metastases. These models require large\ndatasets for training, which must be anonymized to prevent possible patient\nidentity leaks. This study demonstrates that even relatively simple deep\nlearning algorithms can re-identify patients in large histopathology datasets\nwith substantial accuracy. We evaluated our algorithms on two TCIA datasets\nincluding lung squamous cell carcinoma (LSCC) and lung adenocarcinoma (LUAD).\nWe also demonstrate the algorithm's performance on an in-house dataset of\nmeningioma tissue. We predicted the source patient of a slide with F1 scores of\n50.16 % and 52.30 % on the LSCC and LUAD datasets, respectively, and with 62.31\n% on our meningioma dataset. 
Based on our findings, we formulated a risk\nassessment scheme to estimate the risk to the patient's privacy prior to\npublication.\n","authors":["Jonathan Ganz","Jonas Ammeling","Samir Jabari","Katharina Breininger","Marc Aubreville"],"pdf_url":"https://arxiv.org/pdf/2403.12816v1.pdf","comment":"20 pages, 7 figures, 2 tables"},{"id":"http://arxiv.org/abs/2403.12806v1","updated":"2024-03-19T15:07:08Z","published":"2024-03-19T15:07:08Z","title":"VisualCritic: Making LMMs Perceive Visual Quality Like Humans","summary":" At present, large multimodal models (LMMs) have exhibited impressive\ngeneralization capabilities in understanding and generating visual signals.\nHowever, they currently still lack sufficient capability to perceive low-level\nvisual quality akin to human perception. Can LMMs achieve this and show the\nsame degree of generalization in this regard? If so, not only could the\nversatility of LMMs be further enhanced, but also the challenge of poor\ncross-dataset performance in the field of visual quality assessment could be\naddressed. In this paper, we explore this question and provide the answer\n\"Yes!\". As the result of this initial exploration, we present VisualCritic, the\nfirst LMM for broad-spectrum image subjective quality assessment. VisualCritic\ncan be used across diverse data right out of box, without any requirements of\ndataset-specific adaptation operations like conventional specialist models. As\nan instruction-following LMM, VisualCritic enables new capabilities of (1)\nquantitatively measuring the perceptual quality of given images in terms of\ntheir Mean Opinion Score (MOS), noisiness, colorfulness, sharpness, and other\nnumerical indicators, (2) qualitatively evaluating visual quality and providing\nexplainable descriptions, (3) discerning whether a given image is AI-generated\nor photographic. Extensive experiments demonstrate the efficacy of VisualCritic\nby comparing it with other open-source LMMs and conventional specialist models\nover both AI-generated and photographic images.\n","authors":["Zhipeng Huang","Zhizheng Zhang","Yiting Lu","Zheng-Jun Zha","Zhibo Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2403.12806v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02778v2","updated":"2024-03-19T15:05:22Z","published":"2023-11-05T21:46:12Z","title":"MuSHRoom: Multi-Sensor Hybrid Room Dataset for Joint 3D Reconstruction\n and Novel View Synthesis","summary":" Metaverse technologies demand accurate, real-time, and immersive modeling on\nconsumer-grade hardware for both non-human perception (e.g.,\ndrone/robot/autonomous car navigation) and immersive technologies like AR/VR,\nrequiring both structural accuracy and photorealism. However, there exists a\nknowledge gap in how to apply geometric reconstruction and photorealism\nmodeling (novel view synthesis) in a unified framework. To address this gap and\npromote the development of robust and immersive modeling and rendering with\nconsumer-grade devices, we propose a real-world Multi-Sensor Hybrid Room\nDataset (MuSHRoom). Our dataset presents exciting challenges and requires\nstate-of-the-art methods to be cost-effective, robust to noisy data and\ndevices, and can jointly learn 3D reconstruction and novel view synthesis\ninstead of treating them as separate tasks, making them ideal for real-world\napplications. We benchmark several famous pipelines on our dataset for joint 3D\nmesh reconstruction and novel view synthesis. 
Our dataset and benchmark show\ngreat potential in promoting the improvements for fusing 3D reconstruction and\nhigh-quality rendering in a robust and computationally efficient end-to-end\nfashion. The dataset and code are available at the project website:\nhttps://xuqianren.github.io/publications/MuSHRoom/.\n","authors":["Xuqian Ren","Wenjia Wang","Dingding Cai","Tuuli Tuominen","Juho Kannala","Esa Rahtu"],"pdf_url":"https://arxiv.org/pdf/2311.02778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12803v1","updated":"2024-03-19T15:04:35Z","published":"2024-03-19T15:04:35Z","title":"DreamDA: Generative Data Augmentation with Diffusion Models","summary":" The acquisition of large-scale, high-quality data is a resource-intensive and\ntime-consuming endeavor. Compared to conventional Data Augmentation (DA)\ntechniques (e.g. cropping and rotation), exploiting prevailing diffusion models\nfor data generation has received scant attention in classification tasks.\nExisting generative DA methods either inadequately bridge the domain gap\nbetween real-world and synthesized images, or inherently suffer from a lack of\ndiversity. To solve these issues, this paper proposes a new\nclassification-oriented framework DreamDA, which enables data synthesis and\nlabel generation by way of diffusion models. DreamDA generates diverse samples\nthat adhere to the original data distribution by considering training images in\nthe original data as seeds and perturbing their reverse diffusion process. In\naddition, since the labels of the generated data may not align with the labels\nof their corresponding seed images, we introduce a self-training paradigm for\ngenerating pseudo labels and training classifiers using the synthesized data.\nExtensive experiments across four tasks and five datasets demonstrate\nconsistent improvements over strong baselines, revealing the efficacy of\nDreamDA in synthesizing high-quality and diverse images with accurate labels.\nOur code will be available at https://github.com/yunxiangfu2001/DreamDA.\n","authors":["Yunxiang Fu","Chaoqi Chen","Yu Qiao","Yizhou Yu"],"pdf_url":"https://arxiv.org/pdf/2403.12803v1.pdf","comment":"14 pages, 8 tables, 3 figures"},{"id":"http://arxiv.org/abs/2211.14823v3","updated":"2024-03-19T15:02:04Z","published":"2022-11-27T13:31:00Z","title":"3D Scene Creation and Rendering via Rough Meshes: A Lighting Transfer\n Avenue","summary":" This paper studies how to flexibly integrate reconstructed 3D models into\npractical 3D modeling pipelines such as 3D scene creation and rendering. Due to\nthe technical difficulty, one can only obtain rough 3D models (R3DMs) for most\nreal objects using existing 3D reconstruction techniques. As a result,\nphysically-based rendering (PBR) would render low-quality images or videos for\nscenes that are constructed by R3DMs. One promising solution would be\nrepresenting real-world objects as Neural Fields such as NeRFs, which are able\nto generate photo-realistic renderings of an object under desired viewpoints.\nHowever, a drawback is that the synthesized views through Neural Fields\nRendering (NFR) cannot reflect the simulated lighting details on R3DMs in PBR\npipelines, especially when object interactions in the 3D scene creation cause\nlocal shadows. 
To solve this dilemma, we propose a lighting transfer network\n(LighTNet) to bridge NFR and PBR, such that they can benefit from each other.\nLighTNet reasons about a simplified image composition model, remedies the\nuneven surface issue caused by R3DMs, and is empowered by several\nperceptual-motivated constraints and a new Lab angle loss which enhances the\ncontrast between lighting strength and colors. Comparisons demonstrate that\nLighTNet is superior in synthesizing impressive lighting, and is promising in\npushing NFR further in practical 3D modeling workflows.\n","authors":["Bowen Cai","Yujie Li","Yuqin Liang","Rongfei Jia","Binqiang Zhao","Mingming Gong","Huan Fu"],"pdf_url":"https://arxiv.org/pdf/2211.14823v3.pdf","comment":"Accepted by IEEE Transactions on Pattern Analysis and Machine\n Intelligence (T-PAMI), project page:\n http://3d-front-future.github.io/LighTNet"},{"id":"http://arxiv.org/abs/2403.12801v1","updated":"2024-03-19T15:01:19Z","published":"2024-03-19T15:01:19Z","title":"RelationVLM: Making Large Vision-Language Models Understand Visual\n Relations","summary":" The development of Large Vision-Language Models (LVLMs) is striving to catch\nup with the success of Large Language Models (LLMs), yet it faces more\nchallenges to be resolved. Very recent works enable LVLMs to localize\nobject-level visual contents and ground text to them. Nonetheless, current\nLVLMs still struggle to precisely understand visual relations due to the lack\nof relevant data. In this work, we present RelationVLM, a large vision-language\nmodel capable of comprehending various levels and types of relations whether\nacross multiple images or within a video. Specifically, we devise a multi-stage\nrelation-aware training scheme and a series of corresponding data configuration\nstrategies to bestow RelationVLM with the capabilities of understanding\nsemantic relations, temporal associations and geometric transforms. Extensive\ncase studies and quantitative evaluations show RelationVLM has strong\ncapability in understanding such relations and emerges impressive in-context\ncapability of reasoning from few-shot examples by comparison. This work fosters\nthe advancements of LVLMs by enabling them to support a wider range of\ndownstream applications toward artificial general intelligence.\n","authors":["Zhipeng Huang","Zhizheng Zhang","Zheng-Jun Zha","Yan Lu","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2403.12801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12800v1","updated":"2024-03-19T15:01:18Z","published":"2024-03-19T15:01:18Z","title":"Learning Neural Volumetric Pose Features for Camera Localization","summary":" We introduce a novel neural volumetric pose feature, termed PoseMap, designed\nto enhance camera localization by encapsulating the information between images\nand the associated camera poses. Our framework leverages an Absolute Pose\nRegression (APR) architecture, together with an augmented NeRF module. This\nintegration not only facilitates the generation of novel views to enrich the\ntraining dataset but also enables the learning of effective pose features.\nAdditionally, we extend our architecture for self-supervised online alignment,\nallowing our method to be used and fine-tuned for unlabelled images within a\nunified framework. 
Experiments demonstrate that our method achieves 14.28% and\n20.51% performance gain on average in indoor and outdoor benchmark scenes,\noutperforming existing APR methods with state-of-the-art accuracy.\n","authors":["Jingyu Lin","Jiaqi Gu","Bojian Wu","Lubin Fan","Renjie Chen","Ligang Liu","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2403.12800v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2403.10521v2","updated":"2024-03-19T14:54:14Z","published":"2024-03-15T17:59:53Z","title":"P-MapNet: Far-seeing Map Generator Enhanced by both SDMap and HDMap\n Priors","summary":" Autonomous vehicles are gradually entering city roads today, with the help of\nhigh-definition maps (HDMaps). However, the reliance on HDMaps prevents\nautonomous vehicles from stepping into regions without this expensive digital\ninfrastructure. This fact drives many researchers to study online HDMap\ngeneration algorithms, but the performance of these algorithms at far regions\nis still unsatisfying. We present P-MapNet, in which the letter P highlights\nthe fact that we focus on incorporating map priors to improve model\nperformance. Specifically, we exploit priors in both SDMap and HDMap. On one\nhand, we extract weakly aligned SDMap from OpenStreetMap, and encode it as an\nadditional conditioning branch. Despite the misalignment challenge, our\nattention-based architecture adaptively attends to relevant SDMap skeletons and\nsignificantly improves performance. On the other hand, we exploit a masked\nautoencoder to capture the prior distribution of HDMap, which can serve as a\nrefinement module to mitigate occlusions and artifacts. We benchmark on the\nnuScenes and Argoverse2 datasets. Through comprehensive experiments, we show\nthat: (1) our SDMap prior can improve online map generation performance, using\nboth rasterized (by up to $+18.73$ $\\rm mIoU$) and vectorized (by up to $+8.50$\n$\\rm mAP$) output representations. (2) our HDMap prior can improve map\nperceptual metrics by up to $6.34\\%$. (3) P-MapNet can be switched into\ndifferent inference modes that covers different regions of the\naccuracy-efficiency trade-off landscape. (4) P-MapNet is a far-seeing solution\nthat brings larger improvements on longer ranges. Codes and models are publicly\navailable at https://jike5.github.io/P-MapNet.\n","authors":["Zhou Jiang","Zhenxin Zhu","Pengfei Li","Huan-ang Gao","Tianyuan Yuan","Yongliang Shi","Hang Zhao","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.10521v2.pdf","comment":"Code: https://jike5.github.io/P-MapNet"},{"id":"http://arxiv.org/abs/2403.12037v2","updated":"2024-03-19T14:52:28Z","published":"2024-03-18T17:59:42Z","title":"MineDreamer: Learning to Follow Instructions via Chain-of-Imagination\n for Simulated-World Control","summary":" It is a long-lasting goal to design a generalist-embodied agent that can\nfollow diverse instructions in human-like ways. However, existing approaches\noften fail to steadily follow instructions due to difficulties in understanding\nabstract and sequential natural language instructions. 
To this end, we\nintroduce MineDreamer, an open-ended embodied agent built upon the challenging\nMinecraft simulator with an innovative paradigm that enhances\ninstruction-following ability in low-level control signal generation.\nSpecifically, MineDreamer is developed on top of recent advances in Multimodal\nLarge Language Models (MLLMs) and diffusion models, and we employ a\nChain-of-Imagination (CoI) mechanism to envision the step-by-step process of\nexecuting instructions and translating imaginations into more precise visual\nprompts tailored to the current state; subsequently, the agent generates\nkeyboard-and-mouse actions to efficiently achieve these imaginations, steadily\nfollowing the instructions at each step. Extensive experiments demonstrate that\nMineDreamer follows single and multi-step instructions steadily, significantly\noutperforming the best generalist agent baseline and nearly doubling its\nperformance. Moreover, qualitative analysis of the agent's imaginative ability\nreveals its generalization and comprehension of the open world.\n","authors":["Enshen Zhou","Yiran Qin","Zhenfei Yin","Yuzhou Huang","Ruimao Zhang","Lu Sheng","Yu Qiao","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2403.12037v2.pdf","comment":"Project page: https://sites.google.com/view/minedreamer/main"},{"id":"http://arxiv.org/abs/2403.12787v1","updated":"2024-03-19T14:51:01Z","published":"2024-03-19T14:51:01Z","title":"DDSB: An Unsupervised and Training-free Method for Phase Detection in\n Echocardiography","summary":" Accurate identification of End-Diastolic (ED) and End-Systolic (ES) frames is\nkey for cardiac function assessment through echocardiography. However,\ntraditional methods face several limitations: they require extensive amounts of\ndata, extensive annotations by medical experts, significant training resources,\nand often lack robustness. Addressing these challenges, we proposed an\nunsupervised and training-free method, our novel approach leverages\nunsupervised segmentation to enhance fault tolerance against segmentation\ninaccuracies. By identifying anchor points and analyzing directional\ndeformation, we effectively reduce dependence on the accuracy of initial\nsegmentation images and enhance fault tolerance, all while improving\nrobustness. Tested on Echo-dynamic and CAMUS datasets, our method achieves\ncomparable accuracy to learning-based models without their associated\ndrawbacks. The code is available at https://github.com/MRUIL/DDSB\n","authors":["Zhenyu Bu","Yang Liu","Jiayu Huo","Jingjing Peng","Kaini Wang","Guangquan Zhou","Rachel Sparks","Prokar Dasgupta","Alejandro Granados","Sebastien Ourselin"],"pdf_url":"https://arxiv.org/pdf/2403.12787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12784v1","updated":"2024-03-19T14:50:13Z","published":"2024-03-19T14:50:13Z","title":"Total Disentanglement of Font Images into Style and Character Class\n Features","summary":" In this paper, we demonstrate a total disentanglement of font images. Total\ndisentanglement is a neural network-based method for decomposing each font\nimage nonlinearly and completely into its style and content (i.e., character\nclass) features. It uses a simple but careful training procedure to extract the\ncommon style feature from all `A'-`Z' images in the same font and the common\ncontent feature from all `A' (or another class) images in different fonts.\nThese disentangled features guarantee the reconstruction of the original font\nimage. 
Various experiments have been conducted to understand the performance of\ntotal disentanglement. First, it is demonstrated that total disentanglement is\nachievable with very high accuracy; this is experimental proof of the\nlong-standing open question, ``Does `A'-ness exist?'' Hofstadter (1985).\nSecond, it is demonstrated that the disentangled features produced by total\ndisentanglement apply to a variety of tasks, including font recognition,\ncharacter recognition, and one-shot font image generation.\n","authors":["Daichi Haraguchi","Wataru Shimoda","Kota Yamaguchi","Seiichi Uchida"],"pdf_url":"https://arxiv.org/pdf/2403.12784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12778v1","updated":"2024-03-19T14:45:17Z","published":"2024-03-19T14:45:17Z","title":"ViTGaze: Gaze Following with Interaction Features in Vision Transformers","summary":" Gaze following aims to interpret human-scene interactions by predicting the\nperson's focal point of gaze. Prevailing approaches often use multi-modality\ninputs, most of which adopt a two-stage framework. Hence their performance\nhighly depends on the previous prediction accuracy. Others use a\nsingle-modality approach with complex decoders, increasing network\ncomputational load. Inspired by the remarkable success of pre-trained plain\nVision Transformers (ViTs), we introduce a novel single-modality gaze following\nframework, ViTGaze. In contrast to previous methods, ViTGaze creates a brand\nnew gaze following framework based mainly on powerful encoders (dec. param.\nless than 1%). Our principal insight lies in that the inter-token interactions\nwithin self-attention can be transferred to interactions between humans and\nscenes. Leveraging this presumption, we formulate a framework consisting of a\n4D interaction encoder and a 2D spatial guidance module to extract human-scene\ninteraction information from self-attention maps. Furthermore, our\ninvestigation reveals that ViT with self-supervised pre-training exhibits an\nenhanced ability to extract correlated information. A large number of\nexperiments have been conducted to demonstrate the performance of the proposed\nmethod. Our method achieves state-of-the-art (SOTA) performance among all\nsingle-modality methods (3.4% improvement on AUC, 5.1% improvement on AP) and\nvery comparable performance against multi-modality methods with 59% number of\nparameters less.\n","authors":["Yuehao Song","Xinggang Wang","Jingfeng Yao","Wenyu Liu","Jinglin Zhang","Xiangmin Xu"],"pdf_url":"https://arxiv.org/pdf/2403.12778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12777v1","updated":"2024-03-19T14:44:54Z","published":"2024-03-19T14:44:54Z","title":"Discover and Mitigate Multiple Biased Subgroups in Image Classifiers","summary":" Machine learning models can perform well on in-distribution data but often\nfail on biased subgroups that are underrepresented in the training data,\nhindering the robustness of models for reliable applications. Such subgroups\nare typically unknown due to the absence of subgroup labels. Discovering biased\nsubgroups is the key to understanding models' failure modes and further\nimproving models' robustness. 
Most previous works of subgroup discovery make an\nimplicit assumption that models only underperform on a single biased subgroup,\nwhich does not hold on in-the-wild data where multiple biased subgroups exist.\n In this work, we propose Decomposition, Interpretation, and Mitigation (DIM),\na novel method to address a more challenging but also more practical problem of\ndiscovering multiple biased subgroups in image classifiers. Our approach\ndecomposes the image features into multiple components that represent multiple\nsubgroups. This decomposition is achieved via a bilinear dimension reduction\nmethod, Partial Least Square (PLS), guided by useful supervision from the image\nclassifier. We further interpret the semantic meaning of each subgroup\ncomponent by generating natural language descriptions using vision-language\nfoundation models. Finally, DIM mitigates multiple biased subgroups\nsimultaneously via two strategies, including the data- and model-centric\nstrategies. Extensive experiments on CIFAR-100 and Breeds datasets demonstrate\nthe effectiveness of DIM in discovering and mitigating multiple biased\nsubgroups. Furthermore, DIM uncovers the failure modes of the classifier on\nHard ImageNet, showcasing its broader applicability to understanding model bias\nin image classifiers. The code is available at\nhttps://github.com/ZhangAIPI/DIM.\n","authors":["Zeliang Zhang","Mingqian Feng","Zhiheng Li","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.12777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12770v1","updated":"2024-03-19T14:34:44Z","published":"2024-03-19T14:34:44Z","title":"Multispectral Image Restoration by Generalized Opponent Transformation\n Total Variation","summary":" Multispectral images (MSI) contain light information in different wavelengths\nof objects, which convey spectral-spatial information and help improve the\nperformance of various image processing tasks. Numerous techniques have been\ncreated to extend the application of total variation regularization in\nrestoring multispectral images, for example, based on channel coupling and\nadaptive total variation regularization. The primary contribution of this paper\nis to propose and develop a new multispectral total variation regularization in\na generalized opponent transformation domain instead of the original\nmultispectral image domain. Here opponent transformations for multispectral\nimages are generalized from a well-known opponent transformation for color\nimages. We will explore the properties of generalized opponent transformation\ntotal variation (GOTTV) regularization and the corresponding optimization\nformula for multispectral image restoration. To evaluate the effectiveness of\nthe new GOTTV method, we provide numerical examples that showcase its superior\nperformance compared to existing multispectral image total variation methods,\nusing criteria such as MPSNR and MSSIM.\n","authors":["Zhantao Ma","Michael K. Ng"],"pdf_url":"https://arxiv.org/pdf/2403.12770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12767v1","updated":"2024-03-19T14:32:21Z","published":"2024-03-19T14:32:21Z","title":"Inter- and intra-uncertainty based feature aggregation model for\n semi-supervised histopathology image segmentation","summary":" Acquiring pixel-level annotations is often limited in applications such as\nhistology studies that require domain expertise. 
Various semi-supervised\nlearning approaches have been developed to work with limited ground truth\nannotations, such as the popular teacher-student models. However, hierarchical\nprediction uncertainty within the student model (intra-uncertainty) and image\nprediction uncertainty (inter-uncertainty) have not been fully utilized by\nexisting methods. To address these issues, we first propose a novel inter- and\nintra-uncertainty regularization method to measure and constrain both inter-\nand intra-inconsistencies in the teacher-student architecture. We also propose\na new two-stage network with pseudo-mask guided feature aggregation (PG-FANet)\nas the segmentation model. The two-stage structure complements with the\nuncertainty regularization strategy to avoid introducing extra modules in\nsolving uncertainties and the aggregation mechanisms enable multi-scale and\nmulti-stage feature integration. Comprehensive experimental results over the\nMoNuSeg and CRAG datasets show that our PG-FANet outperforms other\nstate-of-the-art methods and our semi-supervised learning framework yields\ncompetitive performance with a limited amount of labeled data.\n","authors":["Qiangguo Jin","Hui Cui","Changming Sun","Yang Song","Jiangbin Zheng","Leilei Cao","Leyi Wei","Ran Su"],"pdf_url":"https://arxiv.org/pdf/2403.12767v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12760v1","updated":"2024-03-19T14:27:24Z","published":"2024-03-19T14:27:24Z","title":"WaveFace: Authentic Face Restoration with Efficient Frequency Recovery","summary":" Although diffusion models are rising as a powerful solution for blind face\nrestoration, they are criticized for two problems: 1) slow training and\ninference speed, and 2) failure in preserving identity and recovering\nfine-grained facial details. In this work, we propose WaveFace to solve the\nproblems in the frequency domain, where low- and high-frequency components\ndecomposed by wavelet transformation are considered individually to maximize\nauthenticity as well as efficiency. The diffusion model is applied to recover\nthe low-frequency component only, which presents general information of the\noriginal image but 1/16 in size. To preserve the original identity, the\ngeneration is conditioned on the low-frequency component of low-quality images\nat each denoising step. Meanwhile, high-frequency components at multiple\ndecomposition levels are handled by a unified network, which recovers complex\nfacial details in a single step. Evaluations on four benchmark datasets show\nthat: 1) WaveFace outperforms state-of-the-art methods in authenticity,\nespecially in terms of identity preservation, and 2) authentic images are\nrestored with the efficiency 10x faster than existing diffusion model-based BFR\nmethods.\n","authors":["Yunqi Miao","Jiankang Deng","Jungong Han"],"pdf_url":"https://arxiv.org/pdf/2403.12760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16226v2","updated":"2024-03-19T14:17:54Z","published":"2023-10-24T22:41:14Z","title":"TiC-CLIP: Continual Training of CLIP Models","summary":" Keeping large foundation models up to date on latest data is inherently\nexpensive. To avoid the prohibitive costs of constantly retraining, it is\nimperative to \\emph{continually} train these models. This problem is\nexacerbated by the lack of any large scale continual learning benchmarks or\nbaselines. We introduce the first set of web-scale Time-Continual (TiC)\nbenchmarks for training vision-language models: TiC-DataComp, TiC-YFCC, and\nTiC-Redcaps. 
TiC-DataComp, our largest dataset, contains over 12.7B timestamped\nimage-text pairs spanning 9 years (2014--2022). We first use our benchmarks to\ncurate various \\emph{dynamic} evaluations to measure temporal robustness of\nexisting models. We show OpenAI's CLIP (trained on data up to 2020) loses\n$\\approx 8\\%$ zero-shot accuracy on our curated retrieval task from 2021--2022\ncompared with more recently trained models in OpenCLIP repository. We then\nstudy how to efficiently train models on time-continuous data. We demonstrate\nthat a simple rehearsal-based approach that continues training from the last\ncheckpoint and replays old data reduces compute by $2.5\\times$ when compared to\nthe standard practice of retraining from scratch. Code is available at\nhttps://github.com/apple/ml-tic-clip.\n","authors":["Saurabh Garg","Mehrdad Farajtabar","Hadi Pouransari","Raviteja Vemulapalli","Sachin Mehta","Oncel Tuzel","Vaishaal Shankar","Fartash Faghri"],"pdf_url":"https://arxiv.org/pdf/2310.16226v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.12748v1","updated":"2024-03-19T14:11:26Z","published":"2024-03-19T14:11:26Z","title":"Building Brain Tumor Segmentation Networks with User-Assisted Filter\n Estimation and Selection","summary":" Brain tumor image segmentation is a challenging research topic in which\ndeep-learning models have presented the best results. However, the traditional\nway of training those models from many pre-annotated images leaves several\nunanswered questions. Hence methodologies, such as Feature Learning from Image\nMarkers (FLIM), have involved an expert in the learning loop to reduce human\neffort in data annotation and build models sufficiently deep for a given\nproblem. FLIM has been successfully used to create encoders, estimating the\nfilters of all convolutional layers from patches centered at marker voxels. In\nthis work, we present Multi-Step (MS) FLIM - a user-assisted approach to\nestimating and selecting the most relevant filters from multiple FLIM\nexecutions. MS-FLIM is used only for the first convolutional layer, and the\nresults already indicate improvement over FLIM. For evaluation, we build a\nsimple U-shaped encoder-decoder network, named sU-Net, for glioblastoma\nsegmentation using T1Gd and FLAIR MRI scans, varying the encoder's training\nmethod, using FLIM, MS-FLIM, and backpropagation algorithm. Also, we compared\nthese sU-Nets with two State-Of-The-Art (SOTA) deep-learning models using two\ndatasets. The results show that the sU-Net based on MS-FLIM outperforms the\nother training methods and achieves effectiveness within the standard\ndeviations of the SOTA models.\n","authors":["Matheus A. Cerqueira","Flávia Sprenger","Bernardo C. A. Teixeira","Alexandre X. Falcão"],"pdf_url":"https://arxiv.org/pdf/2403.12748v1.pdf","comment":"10 pages, 5 figures, 2 tables, 24 references, manuscript of\n conference paper"},{"id":"http://arxiv.org/abs/2401.10191v3","updated":"2024-03-19T14:09:31Z","published":"2024-01-18T18:25:29Z","title":"Divide and not forget: Ensemble of selectively trained experts in\n Continual Learning","summary":" Class-incremental learning is becoming more popular as it helps models widen\ntheir applicability while not forgetting what they already know. A trend in\nthis area is to use a mixture-of-expert technique, where different models work\ntogether to solve the task. However, the experts are usually trained all at\nonce using whole task data, which makes them all prone to forgetting and\nincreasing computational burden. 
To address this limitation, we introduce a\nnovel approach named SEED. SEED selects only one, the most optimal expert for a\nconsidered task, and uses data from this task to fine-tune only this expert.\nFor this purpose, each expert represents each class with a Gaussian\ndistribution, and the optimal expert is selected based on the similarity of\nthose distributions. Consequently, SEED increases diversity and heterogeneity\nwithin the experts while maintaining the high stability of this ensemble\nmethod. The extensive experiments demonstrate that SEED achieves\nstate-of-the-art performance in exemplar-free settings across various\nscenarios, showing the potential of expert diversification through data in\ncontinual learning.\n","authors":["Grzegorz Rypeść","Sebastian Cygert","Valeriya Khan","Tomasz Trzciński","Bartosz Zieliński","Bartłomiej Twardowski"],"pdf_url":"https://arxiv.org/pdf/2401.10191v3.pdf","comment":"Accepted for ICLR 2024 (main track), code is available at:\n https://github.com/grypesc/SEED"},{"id":"http://arxiv.org/abs/2403.11956v2","updated":"2024-03-19T14:03:22Z","published":"2024-03-18T16:52:49Z","title":"Subjective-Aligned Dateset and Metric for Text-to-Video Quality\n Assessment","summary":" With the rapid development of generative models, Artificial\nIntelligence-Generated Contents (AIGC) have exponentially increased in daily\nlives. Among them, Text-to-Video (T2V) generation has received widespread\nattention. Though many T2V models have been released for generating high\nperceptual quality videos, there is still lack of a method to evaluate the\nquality of these videos quantitatively. To solve this issue, we establish the\nlargest-scale Text-to-Video Quality Assessment DataBase (T2VQA-DB) to date. The\ndataset is composed of 10,000 videos generated by 9 different T2V models. We\nalso conduct a subjective study to obtain each video's corresponding mean\nopinion score. Based on T2VQA-DB, we propose a novel transformer-based model\nfor subjective-aligned Text-to-Video Quality Assessment (T2VQA). The model\nextracts features from text-video alignment and video fidelity perspectives,\nthen it leverages the ability of a large language model to give the prediction\nscore. Experimental results show that T2VQA outperforms existing T2V metrics\nand SOTA video quality assessment models. Quantitative analysis indicates that\nT2VQA is capable of giving subjective-align predictions, validating its\neffectiveness. The dataset and code will be released at\nhttps://github.com/QMME/T2VQA.\n","authors":["Tengchuan Kou","Xiaohong Liu","Zicheng Zhang","Chunyi Li","Haoning Wu","Xiongkuo Min","Guangtao Zhai","Ning Liu"],"pdf_url":"https://arxiv.org/pdf/2403.11956v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12743v1","updated":"2024-03-19T14:02:13Z","published":"2024-03-19T14:02:13Z","title":"Towards Controllable Face Generation with Semantic Latent Diffusion\n Models","summary":" Semantic Image Synthesis (SIS) is among the most popular and effective\ntechniques in the field of face generation and editing, thanks to its good\ngeneration quality and the versatility is brings along. Recent works attempted\nto go beyond the standard GAN-based framework, and started to explore Diffusion\nModels (DMs) for this task as these stand out with respect to GANs in terms of\nboth quality and diversity. On the other hand, DMs lack in fine-grained\ncontrollability and reproducibility. 
To address that, in this paper we propose\na SIS framework based on a novel Latent Diffusion Model architecture for human\nface generation and editing that is both able to reproduce and manipulate a\nreal reference image and generate diversity-driven results. The proposed system\nutilizes both SPADE normalization and cross-attention layers to merge shape and\nstyle information and, by doing so, allows for a precise control over each of\nthe semantic parts of the human face. This was not possible with previous\nmethods in the state of the art. Finally, we performed an extensive set of\nexperiments to prove that our model surpasses current state of the art, both\nqualitatively and quantitatively.\n","authors":["Alex Ergasti","Claudio Ferrari","Tomaso Fontanini","Massimo Bertozzi","Andrea Prati"],"pdf_url":"https://arxiv.org/pdf/2403.12743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03094v3","updated":"2024-03-19T13:56:07Z","published":"2023-04-06T14:22:02Z","title":"PopulAtion Parameter Averaging (PAPA)","summary":" Ensemble methods combine the predictions of multiple models to improve\nperformance, but they require significantly higher computation costs at\ninference time. To avoid these costs, multiple neural networks can be combined\ninto one by averaging their weights. However, this usually performs\nsignificantly worse than ensembling. Weight averaging is only beneficial when\ndifferent enough to benefit from combining them, but similar enough to average\nwell. Based on this idea, we propose PopulAtion Parameter Averaging (PAPA): a\nmethod that combines the generality of ensembling with the efficiency of weight\naveraging. PAPA leverages a population of diverse models (trained on different\ndata orders, augmentations, and regularizations) while slowly pushing the\nweights of the networks toward the population average of the weights. We also\npropose PAPA variants (PAPA-all, and PAPA-2) that average weights rarely rather\nthan continuously; all methods increase generalization, but PAPA tends to\nperform best. PAPA reduces the performance gap between averaging and\nensembling, increasing the average accuracy of a population of models by up to\n0.8% on CIFAR-10, 1.9% on CIFAR-100, and 1.6% on ImageNet when compared to\ntraining independent (non-averaged) models.\n","authors":["Alexia Jolicoeur-Martineau","Emy Gervais","Kilian Fatras","Yan Zhang","Simon Lacoste-Julien"],"pdf_url":"https://arxiv.org/pdf/2304.03094v3.pdf","comment":"Blog post: https://ajolicoeur.wordpress.com/papa/, Code:\n https://github.com/SamsungSAILMontreal/PAPA, TMLR journal publication:\n https://openreview.net/forum?id=cPDVjsOytS"},{"id":"http://arxiv.org/abs/2403.12736v1","updated":"2024-03-19T13:53:37Z","published":"2024-03-19T13:53:37Z","title":"Towards Multimodal In-Context Learning for Vision & Language Models","summary":" Inspired by the emergence of Large Language Models (LLMs) that can truly\nunderstand human language, significant progress has been made in aligning\nother, non-language, modalities to be `understandable' by an LLM, primarily via\nconverting their samples into a sequence of embedded language-like tokens\ndirectly fed into the LLM (decoder) input stream. However, so far limited\nattention has been given to transferring (and evaluating) one of the core LLM\ncapabilities to the emerging VLMs, namely the In-Context Learning (ICL)\nability, or in other words to guide VLMs to desired target downstream tasks or\noutput structure using in-context image+text demonstrations. 
In this work, we\ndive deeper into analyzing the capabilities of some of the state-of-the-art\nVLMs to follow ICL instructions, discovering them to be somewhat lacking. We\ndiscover that even models that underwent large-scale mixed modality\npre-training and were implicitly guided to make use of interleaved image and\ntext information (intended to consume helpful context from multiple images)\nunder-perform when prompted with few-shot (ICL) demonstrations, likely due to\ntheir lack of `direct' ICL instruction tuning. To test this conjecture, we\npropose a simple, yet surprisingly effective, strategy of extending a common\nVLM alignment framework with ICL support, methodology, and curriculum. We\nexplore, analyze, and provide insights into effective data mixes, leading up to\na significant 21.03% (and 11.3% on average) ICL performance boost over the\nstrongest VLM baselines and a variety of ICL benchmarks. We also contribute new\nbenchmarks for ICL evaluation in VLMs and discuss their advantages over the\nprior art.\n","authors":["Sivan Doveh","Shaked Perek","M. Jehanzeb Mirza","Amit Alfassy","Assaf Arbelle","Shimon Ullman","Leonid Karlinsky"],"pdf_url":"https://arxiv.org/pdf/2403.12736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12728v1","updated":"2024-03-19T13:43:27Z","published":"2024-03-19T13:43:27Z","title":"Diffusion-Driven Self-Supervised Learning for Shape Reconstruction and\n Pose Estimation","summary":" Fully-supervised category-level pose estimation aims to determine the 6-DoF\nposes of unseen instances from known categories, requiring expensive mannual\nlabeling costs. Recently, various self-supervised category-level pose\nestimation methods have been proposed to reduce the requirement of the\nannotated datasets. However, most methods rely on synthetic data or 3D CAD\nmodel for self-supervised training, and they are typically limited to\naddressing single-object pose problems without considering multi-objective\ntasks or shape reconstruction. To overcome these challenges and limitations, we\nintroduce a diffusion-driven self-supervised network for multi-object shape\nreconstruction and categorical pose estimation, only leveraging the shape\npriors. Specifically, to capture the SE(3)-equivariant pose features and 3D\nscale-invariant shape information, we present a Prior-Aware Pyramid 3D Point\nTransformer in our network. This module adopts a point convolutional layer with\nradial-kernels for pose-aware learning and a 3D scale-invariant graph\nconvolution layer for object-level shape representation, respectively.\nFurthermore, we introduce a pretrain-to-refine self-supervised training\nparadigm to train our network. It enables proposed network to capture the\nassociations between shape priors and observations, addressing the challenge of\nintra-class shape variations by utilising the diffusion mechanism. 
Extensive\nexperiments conducted on four public datasets and a self-built dataset\ndemonstrate that our method significantly outperforms state-of-the-art\nself-supervised category-level baselines and even surpasses some\nfully-supervised instance-level and category-level methods.\n","authors":["Jingtao Sun","Yaonan Wang","Mingtao Feng","Chao Ding","Mike Zheng Shou","Ajmal Saeed Mian"],"pdf_url":"https://arxiv.org/pdf/2403.12728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12722v1","updated":"2024-03-19T13:39:05Z","published":"2024-03-19T13:39:05Z","title":"HUGS: Holistic Urban 3D Scene Understanding via Gaussian Splatting","summary":" Holistic understanding of urban scenes based on RGB images is a challenging\nyet important problem. It encompasses understanding both the geometry and\nappearance to enable novel view synthesis, parsing semantic labels, and\ntracking moving objects. Despite considerable progress, existing approaches\noften focus on specific aspects of this task and require additional inputs such\nas LiDAR scans or manually annotated 3D bounding boxes. In this paper, we\nintroduce a novel pipeline that utilizes 3D Gaussian Splatting for holistic\nurban scene understanding. Our main idea involves the joint optimization of\ngeometry, appearance, semantics, and motion using a combination of static and\ndynamic 3D Gaussians, where moving object poses are regularized via physical\nconstraints. Our approach offers the ability to render new viewpoints in\nreal-time, yielding 2D and 3D semantic information with high accuracy, and\nreconstruct dynamic scenes, even in scenarios where 3D bounding box detection\nare highly noisy. Experimental results on KITTI, KITTI-360, and Virtual KITTI 2\ndemonstrate the effectiveness of our approach.\n","authors":["Hongyu Zhou","Jiahao Shao","Lu Xu","Dongfeng Bai","Weichao Qiu","Bingbing Liu","Yue Wang","Andreas Geiger","Yiyi Liao"],"pdf_url":"https://arxiv.org/pdf/2403.12722v1.pdf","comment":"Our project page is at https://xdimlab.github.io/hugs_website"},{"id":"http://arxiv.org/abs/2307.00574v4","updated":"2024-03-19T13:35:37Z","published":"2023-07-02T13:57:45Z","title":"Bidirectional Temporal Diffusion Model for Temporally Consistent Human\n Animation","summary":" We introduce a method to generate temporally coherent human animation from a\nsingle image, a video, or a random noise. This problem has been formulated as\nmodeling of an auto-regressive generation, i.e., to regress past frames to\ndecode future frames. However, such unidirectional generation is highly prone\nto motion drifting over time, generating unrealistic human animation with\nsignificant artifacts such as appearance distortion. We claim that\nbidirectional temporal modeling enforces temporal coherence on a generative\nnetwork by largely suppressing the motion ambiguity of human appearance. To\nprove our claim, we design a novel human animation framework using a denoising\ndiffusion model: a neural network learns to generate the image of a person by\ndenoising temporal Gaussian noises whose intermediate results are\ncross-conditioned bidirectionally between consecutive frames. 
In the\nexperiments, our method demonstrates strong performance compared to existing\nunidirectional approaches with realistic temporal coherence\n","authors":["Tserendorj Adiya","Jae Shin Yoon","Jungeun Lee","Sanghun Kim","Hwasup Lim"],"pdf_url":"https://arxiv.org/pdf/2307.00574v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11755v2","updated":"2024-03-19T13:28:27Z","published":"2024-03-18T13:03:24Z","title":"Meta-Prompting for Automating Zero-shot Visual Recognition with LLMs","summary":" Prompt ensembling of Large Language Model (LLM) generated category-specific\nprompts has emerged as an effective method to enhance zero-shot recognition\nability of Vision-Language Models (VLMs). To obtain these category-specific\nprompts, the present methods rely on hand-crafting the prompts to the LLMs for\ngenerating VLM prompts for the downstream tasks. However, this requires\nmanually composing these task-specific prompts and still, they might not cover\nthe diverse set of visual concepts and task-specific styles associated with the\ncategories of interest. To effectively take humans out of the loop and\ncompletely automate the prompt generation process for zero-shot recognition, we\npropose Meta-Prompting for Visual Recognition (MPVR). Taking as input only\nminimal information about the target task, in the form of its short natural\nlanguage description, and a list of associated class labels, MPVR automatically\nproduces a diverse set of category-specific prompts resulting in a strong\nzero-shot classifier. MPVR generalizes effectively across various popular\nzero-shot image recognition benchmarks belonging to widely different domains\nwhen tested with multiple LLMs and VLMs. For example, MPVR obtains a zero-shot\nrecognition improvement over CLIP by up to 19.8% and 18.2% (5.0% and 4.5% on\naverage over 20 datasets) leveraging GPT and Mixtral LLMs, respectively\n","authors":["M. Jehanzeb Mirza","Leonid Karlinsky","Wei Lin","Sivan Doveh","Jakub Micorek","Mateusz Kozinski","Hilde Kuhene","Horst Possegger"],"pdf_url":"https://arxiv.org/pdf/2403.11755v2.pdf","comment":"Project Page (Code and Data):\n https://jmiemirza.github.io/Meta-Prompting/"},{"id":"http://arxiv.org/abs/2403.12712v1","updated":"2024-03-19T13:19:41Z","published":"2024-03-19T13:19:41Z","title":"Addressing Source Scale Bias via Image Warping for Domain Adaptation","summary":" In visual recognition, scale bias is a key challenge due to the imbalance of\nobject and image size distribution inherent in real scene datasets.\nConventional solutions involve injecting scale invariance priors, oversampling\nthe dataset at different scales during training, or adjusting scale at\ninference. While these strategies mitigate scale bias to some extent, their\nability to adapt across diverse datasets is limited. Besides, they increase\ncomputational load during training and latency during inference. In this work,\nwe use adaptive attentional processing -- oversampling salient object regions\nby warping images in-place during training. Discovering that shifting the\nsource scale distribution improves backbone features, we developed a\ninstance-level warping guidance aimed at object region sampling to mitigate\nsource scale bias in domain adaptation. 
Our approach improves adaptation across\ngeographies, lighting and weather conditions, is agnostic to the task, domain\nadaptation algorithm, saliency guidance, and underlying model architecture.\nHighlights include +6.1 mAP50 for BDD100K Clear $\\rightarrow$ DENSE Foggy, +3.7\nmAP50 for BDD100K Day $\\rightarrow$ Night, +3.0 mAP50 for BDD100K Clear\n$\\rightarrow$ Rainy, and +6.3 mIoU for Cityscapes $\\rightarrow$ ACDC. Our\napproach adds minimal memory during training and has no additional latency at\ninference time. Please see Appendix for more results and analysis.\n","authors":["Shen Zheng","Anurag Ghosh","Srinivasa G. Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2403.12712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12710v1","updated":"2024-03-19T13:17:26Z","published":"2024-03-19T13:17:26Z","title":"Selective, Interpretable, and Motion Consistent Privacy Attribute\n Obfuscation for Action Recognition","summary":" Concerns for the privacy of individuals captured in public imagery have led\nto privacy-preserving action recognition. Existing approaches often suffer from\nissues arising through obfuscation being applied globally and a lack of\ninterpretability. Global obfuscation hides privacy sensitive regions, but also\ncontextual regions important for action recognition. Lack of interpretability\nerodes trust in these new technologies. We highlight the limitations of current\nparadigms and propose a solution: Human selected privacy templates that yield\ninterpretability by design, an obfuscation scheme that selectively hides\nattributes and also induces temporal consistency, which is important in action\nrecognition. Our approach is architecture agnostic and directly modifies input\nimagery, while existing approaches generally require architecture training. Our\napproach offers more flexibility, as no retraining is required, and outperforms\nalternatives on three widely used datasets.\n","authors":["Filip Ilic","He Zhao","Thomas Pock","Richard P. Wildes"],"pdf_url":"https://arxiv.org/pdf/2403.12710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12707v1","updated":"2024-03-19T13:09:19Z","published":"2024-03-19T13:09:19Z","title":"Selective Domain-Invariant Feature for Generalizable Deepfake Detection","summary":" With diverse presentation forgery methods emerging continually, detecting the\nauthenticity of images has drawn growing attention. Although existing methods\nhave achieved impressive accuracy in training dataset detection, they still\nperform poorly in the unseen domain and suffer from forgery of irrelevant\ninformation such as background and identity, affecting generalizability. To\nsolve this problem, we proposed a novel framework Selective Domain-Invariant\nFeature (SDIF), which reduces the sensitivity to face forgery by fusing content\nfeatures and styles. Specifically, we first use a Farthest-Point Sampling (FPS)\ntraining strategy to construct a task-relevant style sample representation\nspace for fusing with content features. Then, we propose a dynamic feature\nextraction module to generate features with diverse styles to improve the\nperformance and effectiveness of the feature extractor. Finally, a domain\nseparation strategy is used to retain domain-related features to help\ndistinguish between real and fake faces. 
Both qualitative and quantitative\nresults in existing benchmarks and proposals demonstrate the effectiveness of\nour approach.\n","authors":["Yingxin Lai","Guoqing Yang Yifan He","Zhiming Luo","Shaozi Li"],"pdf_url":"https://arxiv.org/pdf/2403.12707v1.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2403.12706v1","updated":"2024-03-19T13:08:54Z","published":"2024-03-19T13:08:54Z","title":"AnimateDiff-Lightning: Cross-Model Diffusion Distillation","summary":" We present AnimateDiff-Lightning for lightning-fast video generation. Our\nmodel uses progressive adversarial diffusion distillation to achieve new\nstate-of-the-art in few-step video generation. We discuss our modifications to\nadapt it for the video modality. Furthermore, we propose to simultaneously\ndistill the probability flow of multiple base diffusion models, resulting in a\nsingle distilled motion module with broader style compatibility. We are pleased\nto release our distilled AnimateDiff-Lightning model for the community's use.\n","authors":["Shanchuan Lin","Xiao Yang"],"pdf_url":"https://arxiv.org/pdf/2403.12706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11220v2","updated":"2024-03-19T13:02:10Z","published":"2024-03-17T13:43:10Z","title":"CPA-Enhancer: Chain-of-Thought Prompted Adaptive Enhancer for Object\n Detection under Unknown Degradations","summary":" Object detection methods under known single degradations have been\nextensively investigated. However, existing approaches require prior knowledge\nof the degradation type and train a separate model for each, limiting their\npractical applications in unpredictable environments. To address this\nchallenge, we propose a chain-of-thought (CoT) prompted adaptive enhancer,\nCPA-Enhancer, for object detection under unknown degradations. Specifically,\nCPA-Enhancer progressively adapts its enhancement strategy under the\nstep-by-step guidance of CoT prompts, that encode degradation-related\ninformation. To the best of our knowledge, it's the first work that exploits\nCoT prompting for object detection tasks. Overall, CPA-Enhancer is a\nplug-and-play enhancement model that can be integrated into any generic\ndetectors to achieve substantial gains on degraded images, without knowing the\ndegradation type priorly. Experimental results demonstrate that CPA-Enhancer\nnot only sets the new state of the art for object detection but also boosts the\nperformance of other downstream vision tasks under unknown degradations.\n","authors":["Yuwei Zhang","Yan Wu","Yanming Liu","Xinyue Peng"],"pdf_url":"https://arxiv.org/pdf/2403.11220v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12702v1","updated":"2024-03-19T13:01:57Z","published":"2024-03-19T13:01:57Z","title":"Learning Cross-view Visual Geo-localization without Ground Truth","summary":" Cross-View Geo-Localization (CVGL) involves determining the geographical\nlocation of a query image by matching it with a corresponding GPS-tagged\nreference image. Current state-of-the-art methods predominantly rely on\ntraining models with labeled paired images, incurring substantial annotation\ncosts and training burdens. In this study, we investigate the adaptation of\nfrozen models for CVGL without requiring ground truth pair labels. We observe\nthat training on unlabeled cross-view images presents significant challenges,\nincluding the need to establish relationships within unlabeled data and\nreconcile view discrepancies between uncertain queries and references. 
To\naddress these challenges, we propose a self-supervised learning framework to\ntrain a learnable adapter for a frozen Foundation Model (FM). This adapter is\ndesigned to map feature distributions from diverse views into a uniform space\nusing unlabeled data exclusively. To establish relationships within unlabeled\ndata, we introduce an Expectation-Maximization-based Pseudo-labeling module,\nwhich iteratively estimates associations between cross-view features and\noptimizes the adapter. To maintain the robustness of the FM's representation,\nwe incorporate an information consistency module with a reconstruction loss,\nensuring that adapted features retain strong discriminative ability across\nviews. Experimental results demonstrate that our proposed method achieves\nsignificant improvements over vanilla FMs and competitive accuracy compared to\nsupervised methods, while necessitating fewer training parameters and relying\nsolely on unlabeled data. Evaluation of our adaptation for task-specific models\nfurther highlights its broad applicability.\n","authors":["Haoyuan Li","Chang Xu","Wen Yang","Huai Yu","Gui-Song Xia"],"pdf_url":"https://arxiv.org/pdf/2403.12702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12695v1","updated":"2024-03-19T12:52:38Z","published":"2024-03-19T12:52:38Z","title":"Federated Semi-supervised Learning for Medical Image Segmentation with\n intra-client and inter-client Consistency","summary":" Medical image segmentation plays a vital role in clinic disease diagnosis and\nmedical image analysis. However, labeling medical images for segmentation task\nis tough due to the indispensable domain expertise of radiologists.\nFurthermore, considering the privacy and sensitivity of medical images, it is\nimpractical to build a centralized segmentation dataset from different medical\ninstitutions. Federated learning aims to train a shared model of isolated\nclients without local data exchange which aligns well with the scarcity and\nprivacy characteristics of medical data. To solve the problem of labeling hard,\nmany advanced semi-supervised methods have been proposed in a centralized data\nsetting. As for federated learning, how to conduct semi-supervised learning\nunder this distributed scenario is worth investigating. In this work, we\npropose a novel federated semi-supervised learning framework for medical image\nsegmentation. The intra-client and inter-client consistency learning are\nintroduced to smooth predictions at the data level and avoid confirmation bias\nof local models. They are achieved with the assistance of a Variational\nAutoencoder (VAE) trained collaboratively by clients. The added VAE model plays\nthree roles: 1) extracting latent low-dimensional features of all labeled and\nunlabeled data; 2) performing a novel type of data augmentation in calculating\nintra-client consistency loss; 3) utilizing the generative ability of itself to\nconduct inter-client consistency distillation. The proposed framework is\ncompared with other federated semi-supervised or self-supervised learning\nmethods. 
The experimental results illustrate that our method outperforms the\nstate-of-the-art method while avoiding a lot of computation and communication\noverhead.\n","authors":["Yubin Zheng","Peng Tang","Tianjie Ju","Weidong Qiu","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2403.12695v1.pdf","comment":"Working in progress"},{"id":"http://arxiv.org/abs/2403.12693v1","updated":"2024-03-19T12:51:39Z","published":"2024-03-19T12:51:39Z","title":"As Firm As Their Foundations: Can open-sourced foundation models be used\n to create adversarial examples for downstream tasks?","summary":" Foundation models pre-trained on web-scale vision-language data, such as\nCLIP, are widely used as cornerstones of powerful machine learning systems.\nWhile pre-training offers clear advantages for downstream learning, it also\nendows downstream models with shared adversarial vulnerabilities that can be\neasily identified through the open-sourced foundation model. In this work, we\nexpose such vulnerabilities in CLIP's downstream models and show that\nfoundation models can serve as a basis for attacking their downstream systems.\nIn particular, we propose a simple yet effective adversarial attack strategy\ntermed Patch Representation Misalignment (PRM). Solely based on open-sourced\nCLIP vision encoders, this method produces adversaries that simultaneously fool\nmore than 20 downstream models spanning 4 common vision-language tasks\n(semantic segmentation, object detection, image captioning and visual\nquestion-answering). Our findings highlight the concerning safety risks\nintroduced by the extensive usage of public foundational models in the\ndevelopment of downstream systems, calling for extra caution in these\nscenarios.\n","authors":["Anjun Hu","Jindong Gu","Francesco Pinto","Konstantinos Kamnitsas","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2403.12693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12687v1","updated":"2024-03-19T12:45:52Z","published":"2024-03-19T12:45:52Z","title":"Audio-Visual Compound Expression Recognition Method based on Late\n Modality Fusion and Rule-based Decision","summary":" This paper presents the results of the SUN team for the Compound Expressions\nRecognition Challenge of the 6th ABAW Competition. We propose a novel\naudio-visual method for compound expression recognition. Our method relies on\nemotion recognition models that fuse modalities at the emotion probability\nlevel, while decisions regarding the prediction of compound expressions are\nbased on predefined rules. Notably, our method does not use any training data\nspecific to the target task. The method is evaluated in multi-corpus training\nand cross-corpus validation setups. Our findings from the challenge demonstrate\nthat the proposed method can potentially form a basis for development of\nintelligent tools for annotating audio-visual data in the context of human's\nbasic and compound emotions. The source code is publicly available.\n","authors":["Elena Ryumina","Maxim Markitantov","Dmitry Ryumin","Heysem Kaya","Alexey Karpov"],"pdf_url":"https://arxiv.org/pdf/2403.12687v1.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.12686v1","updated":"2024-03-19T12:45:18Z","published":"2024-03-19T12:45:18Z","title":"WaterVG: Waterway Visual Grounding based on Text-Guided Vision and\n mmWave Radar","summary":" The perception of waterways based on human intent holds significant\nimportance for autonomous navigation and operations of Unmanned Surface\nVehicles (USVs) in water environments. 
Inspired by visual grounding, in this\npaper, we introduce WaterVG, the first visual grounding dataset designed for\nUSV-based waterway perception based on human intention prompts. WaterVG\nencompasses prompts describing multiple targets, with annotations at the\ninstance level including bounding boxes and masks. Notably, WaterVG includes\n11,568 samples with 34,950 referred targets, integrating both visual and\nradar characteristics captured by a monocular camera and millimeter-wave (mmWave)\nradar, enabling a finer granularity of text prompts. Furthermore, we propose a\nnovel multi-modal visual grounding model, Potamoi, which is a multi-modal and\nmulti-task model based on the one-stage paradigm with a designed Phased\nHeterogeneous Modality Fusion (PHMF) structure, including Adaptive Radar\nWeighting (ARW) and Multi-Head Slim Cross Attention (MHSCA). Specifically, MHSCA\nis a low-cost and efficient fusion module with a remarkably small parameter\ncount and FLOPs, elegantly aligning and fusing scenario context information\ncaptured by two sensors with linguistic features, which can effectively address\ntasks of referring expression comprehension and segmentation based on\nfine-grained prompts. Comprehensive experiments and evaluations have been\nconducted on WaterVG, where our Potamoi achieves state-of-the-art performance\ncompared with its counterparts.\n","authors":["Runwei Guan","Liye Jia","Fengyufan Yang","Shanliang Yao","Erick Purwanto","Xiaohui Zhu","Eng Gee Lim","Jeremy Smith","Ka Lok Man","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2403.12686v1.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2403.12682v1","updated":"2024-03-19T12:36:51Z","published":"2024-03-19T12:36:51Z","title":"IFFNeRF: Initialisation Free and Fast 6DoF pose estimation from a single\n image and a NeRF model","summary":" We introduce IFFNeRF to estimate the six degrees-of-freedom (6DoF) camera\npose of a given image, building on the Neural Radiance Fields (NeRF)\nformulation. IFFNeRF is specifically designed to operate in real-time and\neliminates the need for an initial pose guess that is proximate to the sought\nsolution. IFFNeRF utilizes the Metropolis-Hastings algorithm to sample surface\npoints from within the NeRF model. From these sampled points, we cast rays and\ndeduce the color for each ray through pixel-level view synthesis. The camera\npose can then be estimated as the solution to a Least Squares problem by\nselecting correspondences between the query image and the resulting bundle. We\nfacilitate this process through a learned attention mechanism, bridging the\nquery image embedding with the embedding of parameterized rays, thereby\nmatching rays pertinent to the image. 
Through synthetic and real evaluation\nsettings, we show that our method can improve the angular and translation error\naccuracy by 80.1% and 67.3%, respectively, compared to iNeRF while performing\nat 34 fps on consumer hardware and not requiring the initial pose guess.\n","authors":["Matteo Bortolon","Theodore Tsesmelis","Stuart James","Fabio Poiesi","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2403.12682v1.pdf","comment":"Accepted ICRA 2024, Project page:\n https://mbortolon97.github.io/iffnerf/"},{"id":"http://arxiv.org/abs/2308.13812v2","updated":"2024-03-19T12:29:54Z","published":"2023-08-26T08:31:48Z","title":"Dysen-VDM: Empowering Dynamics-aware Text-to-Video Diffusion with LLMs","summary":" Text-to-video (T2V) synthesis has gained increasing attention in the\ncommunity, in which the recently emerged diffusion models (DMs) have\npromisingly shown stronger performance than the past approaches. While existing\nstate-of-the-art DMs are competent to achieve high-resolution video generation,\nthey may largely suffer from key limitations (e.g., action occurrence\ndisorders, crude video motions) with respect to the intricate temporal dynamics\nmodeling, one of the cruxes of video synthesis. In this work, we investigate\nstrengthening the awareness of video dynamics for DMs, for high-quality T2V\ngeneration. Inspired by human intuition, we design an innovative dynamic scene\nmanager (dubbed Dysen) module, which includes (step-1) extracting from input\ntext the key actions with proper time-order arrangement, (step-2) transforming\nthe action schedules into the dynamic scene graph (DSG) representations, and\n(step-3) enriching the scenes in the DSG with sufficient and reasonable\ndetails. Taking advantage of the existing powerful LLMs (e.g., ChatGPT) via\nin-context learning, Dysen realizes (nearly) human-level temporal dynamics\nunderstanding. Finally, the resulting video DSG with rich action scene details\nis encoded as fine-grained spatio-temporal features, integrated into the\nbackbone T2V DM for video generation. Experiments on popular T2V datasets\nsuggest that our Dysen-VDM consistently outperforms prior arts by significant\nmargins, especially in scenarios with complex actions. Code is available at\nhttps://haofei.vip/Dysen-VDM\n","authors":["Hao Fei","Shengqiong Wu","Wei Ji","Hanwang Zhang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2308.13812v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2307.01187v4","updated":"2024-03-19T12:27:37Z","published":"2023-07-03T17:52:44Z","title":"SAMAug: Point Prompt Augmentation for Segment Anything Model","summary":" This paper introduces SAMAug, a novel visual point augmentation method for\nthe Segment Anything Model (SAM) that enhances interactive image segmentation\nperformance. SAMAug generates augmented point prompts to provide more\ninformation about the user's intention to SAM. Starting with an initial point\nprompt, SAM produces an initial mask, which is then fed into our proposed\nSAMAug to generate augmented point prompts. By incorporating these extra\npoints, SAM can generate augmented segmentation masks based on both the\naugmented point prompts and the initial prompt, resulting in improved\nsegmentation performance. We conducted evaluations using four different point\naugmentation strategies: random sampling, sampling based on maximum difference\nentropy, maximum distance, and saliency. 
Experimental results on the COCO,\nFundus, COVID QUEx, and ISIC2018 datasets show that SAMAug can boost SAM's\nsegmentation results, especially when using the maximum distance and saliency strategies.\nSAMAug demonstrates the potential of visual prompt augmentation for computer\nvision. Code for SAMAug is available at github.com/yhydhx/SAMAug\n","authors":["Haixing Dai","Chong Ma","Zhiling Yan","Zhengliang Liu","Enze Shi","Yiwei Li","Peng Shu","Xiaozheng Wei","Lin Zhao","Zihao Wu","Fang Zeng","Dajiang Zhu","Wei Liu","Quanzheng Li","Lichao Sun","Shu Zhang","Tianming Liu","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2307.01187v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07269v2","updated":"2024-03-19T12:27:33Z","published":"2023-08-14T16:52:42Z","title":"EasyEdit: An Easy-to-use Knowledge Editing Framework for Large Language\n Models","summary":" Large Language Models (LLMs) usually suffer from knowledge cutoff or fallacy\nissues, which means they are unaware of unseen events or generate text with\nincorrect facts owing to outdated/noisy data. To this end, many knowledge\nediting approaches for LLMs have emerged -- aiming to subtly inject/edit\nupdated knowledge or adjust undesired behavior while minimizing the impact on\nunrelated inputs. Nevertheless, due to significant differences among various\nknowledge editing methods and the variations in task setups, there is no\nstandard implementation framework available for the community, which hinders\npractitioners from applying knowledge editing to applications. To address these\nissues, we propose EasyEdit, an easy-to-use knowledge editing framework for\nLLMs. It supports various cutting-edge knowledge editing approaches and can be\nreadily applied to many well-known LLMs such as T5, GPT-J, LlaMA, etc.\nEmpirically, we report the knowledge editing results on LlaMA-2 with EasyEdit,\ndemonstrating that knowledge editing surpasses traditional fine-tuning in terms\nof reliability and generalization. We have released the source code on GitHub,\nalong with Google Colab tutorials and comprehensive documentation for beginners\nto get started. Besides, we present an online system for real-time knowledge\nediting, and a demo video.\n","authors":["Peng Wang","Ningyu Zhang","Bozhong Tian","Zekun Xi","Yunzhi Yao","Ziwen Xu","Mengru Wang","Shengyu Mao","Xiaohan Wang","Siyuan Cheng","Kangwei Liu","Yuansheng Ni","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07269v2.pdf","comment":"Code: https://github.com/zjunlp/EasyEdit HF Demo:\n https://huggingface.co/spaces/zjunlp/EasyEdit Video:\n https://youtu.be/Gm6T0QaaskU Docs: https://zjunlp.gitbook.io/easyedit"},{"id":"http://arxiv.org/abs/2403.12670v1","updated":"2024-03-19T12:11:57Z","published":"2024-03-19T12:11:57Z","title":"Driving Animatronic Robot Facial Expression From Speech","summary":" Animatronic robots aim to enable natural human-robot interaction through\nlifelike facial expressions. However, generating realistic, speech-synchronized\nrobot expressions is challenging due to the complexities of facial biomechanics\nand responsive motion synthesis. This paper presents a principled,\nskinning-centric approach to drive animatronic robot facial expressions from\nspeech. The proposed approach employs linear blend skinning (LBS) as the core\nrepresentation to guide tightly integrated innovations in embodiment design and\nmotion synthesis. LBS informs the actuation topology, enables human expression\nretargeting, and allows speech-driven facial motion generation. 
The proposed\napproach is capable of generating highly realistic, real-time facial\nexpressions from speech on an animatronic face, significantly advancing robots'\nability to replicate nuanced human expressions for natural interaction.\n","authors":["Boren Li","Hang Li","Hangxin Liu"],"pdf_url":"https://arxiv.org/pdf/2403.12670v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2309.07439v2","updated":"2024-03-19T11:51:38Z","published":"2023-09-14T05:45:40Z","title":"DePT: Decoupled Prompt Tuning","summary":" This work breaks through the Base-New Tradeoff (BNT) dilemma in prompt tuning,\ni.e., the better the tuned model generalizes to the base (or target) task, the\nworse it generalizes to new tasks, and vice versa. Specifically, through an\nin-depth analysis of the learned features of the base and new tasks, we observe\nthat the BNT stems from a channel bias issue, i.e., the vast majority of\nfeature channels are occupied by base-specific knowledge, resulting in the\ncollapse of task-shared knowledge important to new tasks. To address this, we\npropose the Decoupled Prompt Tuning (DePT) framework, which decouples\nbase-specific knowledge from feature channels into an isolated feature space\nduring prompt tuning, so as to maximally preserve task-shared knowledge in the\noriginal feature space for achieving better zero-shot generalization on new\ntasks. Importantly, our DePT is orthogonal to existing prompt tuning methods,\nhence it can improve all of them. Extensive experiments on 11 datasets show the\nstrong flexibility and effectiveness of DePT. Our code and pretrained models\nare available at https://github.com/Koorye/DePT.\n","authors":["Ji Zhang","Shihan Wu","Lianli Gao","Heng Tao Shen","Jingkuan Song"],"pdf_url":"https://arxiv.org/pdf/2309.07439v2.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2403.12658v1","updated":"2024-03-19T11:48:35Z","published":"2024-03-19T11:48:35Z","title":"Tuning-Free Image Customization with Image and Text Guidance","summary":" Despite significant advancements in image customization with diffusion\nmodels, current methods still have several limitations: 1) unintended changes\nin non-target areas when regenerating the entire image; 2) guidance solely by a\nreference image or text descriptions; and 3) time-consuming fine-tuning, which\nlimits their practical application. In response, we introduce a tuning-free\nframework for simultaneous text-image-guided image customization, enabling\nprecise editing of specific image regions within seconds. Our approach\npreserves the semantic features of the reference image subject while allowing\nmodification of detailed attributes based on text descriptions. To achieve\nthis, we propose an innovative attention blending strategy that blends\nself-attention features in the UNet decoder during the denoising process. To\nour knowledge, this is the first tuning-free method that concurrently utilizes\ntext and image guidance for image customization in specific regions. 
Our\napproach outperforms previous methods in both human and quantitative\nevaluations, providing an efficient solution for various practical\napplications, such as image synthesis, design, and creative photography.\n","authors":["Pengzhi Li","Qiang Nie","Ying Chen","Xi Jiang","Kai Wu","Yuhuan Lin","Yong Liu","Jinlong Peng","Chengjie Wang","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.12658v1.pdf","comment":"17 pages, 8 figures"},{"id":"http://arxiv.org/abs/2403.10904v2","updated":"2024-03-19T11:37:28Z","published":"2024-03-16T11:38:58Z","title":"Urban Sound Propagation: a Benchmark for 1-Step Generative Modeling of\n Complex Physical Systems","summary":" Data-driven modeling of complex physical systems is receiving a growing\namount of attention in the simulation and machine learning communities. Since\nmost physical simulations are based on compute-intensive, iterative\nimplementations of differential equation systems, a (partial) replacement with\nlearned, 1-step inference models has the potential for significant speedups in\na wide range of application areas. In this context, we present a novel\nbenchmark for the evaluation of 1-step generative learning models in terms of\nspeed and physical correctness. Our Urban Sound Propagation benchmark is based\non the physically complex and practically relevant, yet intuitively easy to\ngrasp task of modeling the 2d propagation of waves from a sound source in an\nurban environment. We provide a dataset with 100k samples, where each sample\nconsists of pairs of real 2d building maps drawn from OpenStreetmap, a\nparameterized sound source, and a simulated ground truth sound propagation for\nthe given scene. The dataset provides four different simulation tasks with\nincreasing complexity regarding reflection, diffraction and source variance. A\nfirst baseline evaluation of common generative U-Net, GAN and Diffusion models\nshows, that while these models are very well capable of modeling sound\npropagations in simple cases, the approximation of sub-systems represented by\nhigher order equations systematically fails. Information about the dataset,\ndownload instructions and source codes are provided on our website:\nhttps://www.urban-sound-data.org.\n","authors":["Martin Spitznagel","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2403.10904v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11831v2","updated":"2024-03-19T11:31:44Z","published":"2024-03-18T14:43:04Z","title":"BAD-Gaussians: Bundle Adjusted Deblur Gaussian Splatting","summary":" While neural rendering has demonstrated impressive capabilities in 3D scene\nreconstruction and novel view synthesis, it heavily relies on high-quality\nsharp images and accurate camera poses. Numerous approaches have been proposed\nto train Neural Radiance Fields (NeRF) with motion-blurred images, commonly\nencountered in real-world scenarios such as low-light or long-exposure\nconditions. However, the implicit representation of NeRF struggles to\naccurately recover intricate details from severely motion-blurred images and\ncannot achieve real-time rendering. 
In contrast, recent advancements in 3D\nGaussian Splatting achieve high-quality 3D scene reconstruction and real-time\nrendering by explicitly optimizing point clouds as Gaussian spheres.\n In this paper, we introduce a novel approach, named BAD-Gaussians (Bundle\nAdjusted Deblur Gaussian Splatting), which leverages explicit Gaussian\nrepresentation and handles severe motion-blurred images with inaccurate camera\nposes to achieve high-quality scene reconstruction. Our method models the\nphysical image formation process of motion-blurred images and jointly learns\nthe parameters of Gaussians while recovering camera motion trajectories during\nexposure time.\n In our experiments, we demonstrate that BAD-Gaussians not only achieves\nsuperior rendering quality compared to previous state-of-the-art deblur neural\nrendering methods on both synthetic and real datasets but also enables\nreal-time rendering capabilities.\n Our project page and source code is available at\nhttps://lingzhezhao.github.io/BAD-Gaussians/\n","authors":["Lingzhe Zhao","Peng Wang","Peidong Liu"],"pdf_url":"https://arxiv.org/pdf/2403.11831v2.pdf","comment":"Project Page and Source Code:\n https://lingzhezhao.github.io/BAD-Gaussians/"},{"id":"http://arxiv.org/abs/2403.10942v2","updated":"2024-03-19T11:28:12Z","published":"2024-03-16T14:58:58Z","title":"ScanTalk: 3D Talking Heads from Unregistered Scans","summary":" Speech-driven 3D talking heads generation has emerged as a significant area\nof interest among researchers, presenting numerous challenges. Existing methods\nare constrained by animating faces with fixed topologies, wherein point-wise\ncorrespondence is established, and the number and order of points remains\nconsistent across all identities the model can animate. In this work, we\npresent ScanTalk, a novel framework capable of animating 3D faces in arbitrary\ntopologies including scanned data. Our approach relies on the DiffusionNet\narchitecture to overcome the fixed topology constraint, offering promising\navenues for more flexible and realistic 3D animations. By leveraging the power\nof DiffusionNet, ScanTalk not only adapts to diverse facial structures but also\nmaintains fidelity when dealing with scanned data, thereby enhancing the\nauthenticity and versatility of generated 3D talking heads. Through\ncomprehensive comparisons with state-of-the-art methods, we validate the\nefficacy of our approach, demonstrating its capacity to generate realistic\ntalking heads comparable to existing techniques. While our primary objective is\nto develop a generic method free from topological constraints, all\nstate-of-the-art methodologies are bound by such limitations. Code for\nreproducing our results, and the pre-trained model will be made available.\n","authors":["Federico Nocentini","Thomas Besnier","Claudio Ferrari","Sylvain Arguillere","Stefano Berretti","Mohamed Daoudi"],"pdf_url":"https://arxiv.org/pdf/2403.10942v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12547v2","updated":"2024-03-19T11:09:12Z","published":"2023-10-19T07:54:30Z","title":"PGA: Personalizing Grasping Agents with Single Human-Robot Interaction","summary":" Language-Conditioned Robotic Grasping (LCRG) aims to develop robots that\ncomprehend and grasp objects based on natural language instructions. While the\nability to understand personal objects like my wallet facilitates more natural\ninteraction with human users, current LCRG systems only allow generic language\ninstructions, e.g., the black-colored wallet next to the laptop. 
To this end,\nwe introduce a task scenario, GraspMine, alongside a novel dataset aimed at\npinpointing and grasping personal objects given personal indicators via\nlearning from a single human-robot interaction, rather than a large labeled\ndataset. Our proposed method, Personalized Grasping Agent (PGA), addresses\nGraspMine by leveraging the unlabeled image data of the user's environment,\ncalled Reminiscence. Specifically, PGA acquires personal object information by\na user presenting a personal object with its associated indicator, followed by\nPGA inspecting the object by rotating it. Based on the acquired information,\nPGA pseudo-labels objects in the Reminiscence by our proposed label propagation\nalgorithm. Harnessing the information acquired from the interactions and the\npseudo-labeled objects in the Reminiscence, PGA adapts the object grounding\nmodel to grasp personal objects. This results in significant efficiency gains, whereas\nprevious LCRG systems rely on resource-intensive human annotations --\nnecessitating hundreds of labeled samples to learn my wallet. Moreover, PGA\noutperforms baseline methods across all metrics and even shows performance\ncomparable to the fully-supervised method, which learns from 9k\nannotated data samples. We further validate PGA's real-world applicability by\nemploying a physical robot to execute GraspMine. Code and data are publicly\navailable at https://github.com/JHKim-snu/PGA.\n","authors":["Junghyun Kim","Gi-Cheon Kang","Jaein Kim","Seoyun Yang","Minjoon Jung","Byoung-Tak Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.12547v2.pdf","comment":"8 pages, under review"},{"id":"http://arxiv.org/abs/2312.13316v3","updated":"2024-03-19T11:01:35Z","published":"2023-12-20T11:00:54Z","title":"ECAMP: Entity-centered Context-aware Medical Vision Language\n Pre-training","summary":" Despite significant advancements in medical vision-language pre-training,\nexisting methods have largely overlooked the inherent entity-specific context\nwithin radiology reports and the complex cross-modality contextual\nrelationships between text and images. To close this gap, we propose a novel\nEntity-centered Context-aware Medical Vision-language Pre-training (ECAMP)\nframework, which is designed to enable a more entity-centered and\ncontext-sensitive interpretation of medical data. Utilizing the recent powerful\nlarge language model, we distill entity-centered context from medical reports,\nwhich enables ECAMP to gain more effective supervision from the text modality.\nBy further pre-training our model with carefully designed entity-aware,\ncontext-enhanced masked language modeling and context-guided super-resolution\ntasks, ECAMP significantly refines the interplay between text and image\nmodalities, leading to an enhanced ability to extract entity-centered\ncontextual features. Besides, our proposed multi-scale context fusion design\nalso improves the semantic integration of both coarse and fine-level image\nrepresentations, prompting better performance for multi-scale downstream\napplications. Combining these components leads to significant performance leaps\nover current state-of-the-art methods and establishes a new standard for\ncross-modality learning in medical imaging, whose effectiveness is demonstrated\nby our extensive experiments on various tasks including classification,\nsegmentation, and detection across several public datasets. 
Code and models are\navailable at https://github.com/ToniChopp/ECAMP.\n","authors":["Rongsheng Wang","Qingsong Yao","Haoran Lai","Zhiyang He","Xiaodong Tao","Zihang Jiang","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.13316v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.09749v2","updated":"2024-03-19T10:56:12Z","published":"2022-03-18T05:17:00Z","title":"Goal-conditioned dual-action imitation learning for dexterous dual-arm\n robot manipulation","summary":" Long-horizon dexterous robot manipulation of deformable objects, such as\nbanana peeling, is a problematic task because of the difficulties in object\nmodeling and a lack of knowledge about stable and dexterous manipulation\nskills. This paper presents a goal-conditioned dual-action (GC-DA) deep\nimitation learning (DIL) approach that can learn dexterous manipulation skills\nusing human demonstration data. Previous DIL methods map the current sensory\ninput and reactive action, which often fails because of compounding errors in\nimitation learning caused by the recurrent computation of actions. The method\npredicts reactive action only when the precise manipulation of the target\nobject is required (local action) and generates the entire trajectory when\nprecise manipulation is not required (global action). This dual-action\nformulation effectively prevents compounding error in the imitation learning\nusing the trajectory-based global action while responding to unexpected changes\nin the target object during the reactive local action. The proposed method was\ntested in a real dual-arm robot and successfully accomplished the\nbanana-peeling task.\n","authors":["Heecheol Kim","Yoshiyuki Ohmura","Yasuo Kuniyoshi"],"pdf_url":"https://arxiv.org/pdf/2203.09749v2.pdf","comment":"19 pages, published in Transactions on Robotics (T-RO)"},{"id":"http://arxiv.org/abs/2310.10325v2","updated":"2024-03-19T09:54:41Z","published":"2023-10-16T12:08:35Z","title":"Towards image compression with perfect realism at ultra-low bitrates","summary":" Image codecs are typically optimized to trade-off bitrate \\vs distortion\nmetrics. At low bitrates, this leads to compression artefacts which are easily\nperceptible, even when training with perceptual or adversarial losses. To\nimprove image quality and remove dependency on the bitrate, we propose to\ndecode with iterative diffusion models. We condition the decoding process on a\nvector-quantized image representation, as well as a global image description to\nprovide additional context. We dub our model PerCo for 'perceptual\ncompression', and compare it to state-of-the-art codecs at rates from 0.1 down\nto 0.003 bits per pixel. The latter rate is more than an order of magnitude\nsmaller than those considered in most prior work, compressing a 512x768 Kodak\nimage with less than 153 bytes. Despite this ultra-low bitrate, our approach\nmaintains the ability to reconstruct realistic images. We find that our model\nleads to reconstructions with state-of-the-art visual quality as measured by\nFID and KID. As predicted by rate-distortion-perception theory, visual quality\nis less dependent on the bitrate than previous methods.\n","authors":["Marlène Careil","Matthew J. 
Muckley","Jakob Verbeek","Stéphane Lathuilière"],"pdf_url":"https://arxiv.org/pdf/2310.10325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08255v2","updated":"2024-03-19T09:49:01Z","published":"2023-12-13T16:18:40Z","title":"OCTDL: Optical Coherence Tomography Dataset for Image-Based Deep\n Learning Methods","summary":" Optical coherence tomography (OCT) is a non-invasive imaging technique with\nextensive clinical applications in ophthalmology. OCT enables the visualization\nof the retinal layers, playing a vital role in the early detection and\nmonitoring of retinal diseases. OCT uses the principle of light wave\ninterference to create detailed images of the retinal microstructures, making\nit a valuable tool for diagnosing ocular conditions. This work presents an\nopen-access OCT dataset (OCTDL) comprising over 2000 OCT images labeled\naccording to disease group and retinal pathology. The dataset consists of OCT\nrecords of patients with Age-related Macular Degeneration (AMD), Diabetic\nMacular Edema (DME), Epiretinal Membrane (ERM), Retinal Artery Occlusion (RAO),\nRetinal Vein Occlusion (RVO), and Vitreomacular Interface Disease (VID). The\nimages were acquired with an Optovue Avanti RTVue XR using raster scanning\nprotocols with dynamic scan length and image resolution. Each retinal b-scan\nwas acquired by centering on the fovea and interpreted and cataloged by an\nexperienced retinal specialist. In this work, we applied Deep Learning\nclassification techniques to this new open-access dataset.\n","authors":["Mikhail Kulyabin","Aleksei Zhdanov","Anastasia Nikiforova","Andrey Stepichev","Anna Kuznetsova","Mikhail Ronkin","Vasilii Borisov","Alexander Bogachev","Sergey Korotkich","Paul A Constable","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2312.08255v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12585v1","updated":"2024-03-19T09:47:08Z","published":"2024-03-19T09:47:08Z","title":"LASPA: Latent Spatial Alignment for Fast Training-free Single Image\n Editing","summary":" We present a novel, training-free approach for textual editing of real images\nusing diffusion models. Unlike prior methods that rely on computationally\nexpensive finetuning, our approach leverages LAtent SPatial Alignment (LASPA)\nto efficiently preserve image details. We demonstrate how the diffusion process\nis amenable to spatial guidance using a reference image, leading to\nsemantically coherent edits. This eliminates the need for complex optimization\nand costly model finetuning, resulting in significantly faster editing compared\nto previous methods. Additionally, our method avoids the storage requirements\nassociated with large finetuned models. These advantages make our approach\nparticularly well-suited for editing on mobile devices and applications\ndemanding rapid response times. While simple and fast, our method achieves\n62-71\\% preference in a user-study and significantly better model-based editing\nstrength and image preservation scores.\n","authors":["Yazeed Alharbi","Peter Wonka"],"pdf_url":"https://arxiv.org/pdf/2403.12585v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12580v1","updated":"2024-03-19T09:44:41Z","published":"2024-03-19T09:44:41Z","title":"Real-IAD: A Real-World Multi-View Dataset for Benchmarking Versatile\n Industrial Anomaly Detection","summary":" Industrial anomaly detection (IAD) has garnered significant attention and\nexperienced rapid development. 
However, the recent development of IAD approaches\nhas encountered certain difficulties due to dataset limitations. On the one\nhand, most of the state-of-the-art methods have achieved saturation (over 99%\nin AUROC) on mainstream datasets such as MVTec, and the differences between methods\ncannot be well distinguished, leading to a significant gap between public\ndatasets and actual application scenarios. On the other hand, the research on\nvarious new practical anomaly detection settings is limited by the scale of the\ndataset, posing a risk of overfitting in evaluation results. Therefore, we\npropose a large-scale, Real-world, and multi-view Industrial Anomaly Detection\ndataset, named Real-IAD, which contains 150K high-resolution images of 30\ndifferent objects, an order of magnitude larger than existing datasets. It has\na larger range of defect areas and ratio proportions, making it more challenging\nthan previous datasets. To make the dataset closer to real application\nscenarios, we adopted a multi-view shooting method and proposed sample-level\nevaluation metrics. In addition, beyond the general unsupervised anomaly\ndetection setting, we propose a new setting for Fully Unsupervised Industrial\nAnomaly Detection (FUIAD) based on the observation that the yield rate in\nindustrial production is usually greater than 60%, which has more practical\napplication value. Finally, we report the results of popular IAD methods on the\nReal-IAD dataset, providing a highly challenging benchmark to promote the\ndevelopment of the IAD field.\n","authors":["Chengjie Wang","Wenbing Zhu","Bin-Bin Gao","Zhenye Gan","Jianning Zhang","Zhihao Gu","Shuguang Qian","Mingang Chen","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2403.12580v1.pdf","comment":"It is accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.12574v1","updated":"2024-03-19T09:34:11Z","published":"2024-03-19T09:34:11Z","title":"EAS-SNN: End-to-End Adaptive Sampling and Representation for Event-based\n Detection with Recurrent Spiking Neural Networks","summary":" Event cameras, with their high dynamic range and temporal resolution, are\nideally suited for object detection, especially under scenarios with motion\nblur and challenging lighting conditions. However, while most existing\napproaches prioritize optimizing spatiotemporal representations with advanced\ndetection backbones and early aggregation functions, the crucial issue of\nadaptive event sampling remains largely unaddressed. Spiking Neural Networks\n(SNNs), which operate on an event-driven paradigm through sparse spike\ncommunication, emerge as a natural fit for addressing this challenge. In this\nstudy, we discover that the neural dynamics of spiking neurons align closely\nwith the behavior of an ideal temporal event sampler. Motivated by this\ninsight, we propose a novel adaptive sampling module that leverages recurrent\nconvolutional SNNs enhanced with temporal memory, facilitating a fully\nend-to-end learnable framework for event-based detection. Additionally, we\nintroduce Residual Potential Dropout (RPD) and Spike-Aware Training (SAT) to\nregulate potential distribution and address performance degradation encountered\nin spike-based sampling modules. Through rigorous testing on neuromorphic\ndatasets for event-based detection, our approach demonstrably surpasses\nexisting state-of-the-art spike-based methods, achieving superior performance\nwith significantly fewer parameters and time steps. 
For instance, our method\nachieves a 4.4\\% mAP improvement on the Gen1 dataset, while requiring 38\\%\nfewer parameters and three time steps. Moreover, the applicability and\neffectiveness of our adaptive sampling methodology extend beyond SNNs, as\ndemonstrated through further validation on conventional non-spiking detection\nmodels.\n","authors":["Ziming Wang","Ziling Wang","Huaning Li","Lang Qin","Runhao Jiang","De Ma","Huajin Tang"],"pdf_url":"https://arxiv.org/pdf/2403.12574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12573v1","updated":"2024-03-19T09:33:07Z","published":"2024-03-19T09:33:07Z","title":"Lifting Multi-View Detection and Tracking to the Bird's Eye View","summary":" Taking advantage of multi-view aggregation presents a promising solution to\ntackle challenges such as occlusion and missed detection in multi-object\ntracking and detection. Recent advancements in multi-view detection and 3D\nobject recognition have significantly improved performance by strategically\nprojecting all views onto the ground plane and conducting detection analysis\nfrom a Bird's Eye View. In this paper, we compare modern lifting methods, both\nparameter-free and parameterized, to multi-view aggregation. Additionally, we\npresent an architecture that aggregates the features of multiple times steps to\nlearn robust detection and combines appearance- and motion-based cues for\ntracking. Most current tracking approaches either focus on pedestrians or\nvehicles. In our work, we combine both branches and add new challenges to\nmulti-view detection with cross-scene setups. Our method generalizes to three\npublic datasets across two domains: (1) pedestrian: Wildtrack and MultiviewX,\nand (2) roadside perception: Synthehicle, achieving state-of-the-art\nperformance in detection and tracking. https://github.com/tteepe/TrackTacular\n","authors":["Torben Teepe","Philipp Wolters","Johannes Gilg","Fabian Herzog","Gerhard Rigoll"],"pdf_url":"https://arxiv.org/pdf/2403.12573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12572v1","updated":"2024-03-19T09:30:56Z","published":"2024-03-19T09:30:56Z","title":"Compound Expression Recognition via Multi Model Ensemble","summary":" Compound Expression Recognition (CER) plays a crucial role in interpersonal\ninteractions. Due to the existence of Compound Expressions , human emotional\nexpressions are complex, requiring consideration of both local and global\nfacial expressions to make judgments. In this paper, to address this issue, we\npropose a solution based on ensemble learning methods for Compound Expression\nRecognition. Specifically, our task is classification, where we train three\nexpression classification models based on convolutional networks, Vision\nTransformers, and multi-scale local attention networks. Then, through model\nensemble using late fusion, we merge the outputs of multiple models to predict\nthe final result. 
Our method achieves high accuracy on RAF-DB and is able to\nrecognize expressions through zero-shot on certain portions of C-EXPR-DB.\n","authors":["Jun Yu","Jichao Zhu","Wangyuan Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.12572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12570v1","updated":"2024-03-19T09:28:19Z","published":"2024-03-19T09:28:19Z","title":"Adapting Visual-Language Models for Generalizable Anomaly Detection in\n Medical Images","summary":" Recent advancements in large-scale visual-language pre-trained models have\nled to significant progress in zero-/few-shot anomaly detection within natural\nimage domains. However, the substantial domain divergence between natural and\nmedical images limits the effectiveness of these methodologies in medical\nanomaly detection. This paper introduces a novel lightweight multi-level\nadaptation and comparison framework to repurpose the CLIP model for medical\nanomaly detection. Our approach integrates multiple residual adapters into the\npre-trained visual encoder, enabling a stepwise enhancement of visual features\nacross different levels. This multi-level adaptation is guided by multi-level,\npixel-wise visual-language feature alignment loss functions, which recalibrate\nthe model's focus from object semantics in natural imagery to anomaly\nidentification in medical images. The adapted features exhibit improved\ngeneralization across various medical data types, even in zero-shot scenarios\nwhere the model encounters unseen medical modalities and anatomical regions\nduring training. Our experiments on medical anomaly detection benchmarks\ndemonstrate that our method significantly surpasses current state-of-the-art\nmodels, with an average AUC improvement of 6.24% and 7.33% for anomaly\nclassification, 2.03% and 2.37% for anomaly segmentation, under the zero-shot\nand few-shot settings, respectively. Source code is available at:\nhttps://github.com/MediaBrain-SJTU/MVFA-AD\n","authors":["Chaoqin Huang","Aofan Jiang","Jinghao Feng","Ya Zhang","Xinchao Wang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12570v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12559v1","updated":"2024-03-19T09:14:52Z","published":"2024-03-19T09:14:52Z","title":"Confidence Self-Calibration for Multi-Label Class-Incremental Learning","summary":" The partial label challenge in Multi-Label Class-Incremental Learning (MLCIL)\narises when only the new classes are labeled during training, while past and\nfuture labels remain unavailable. This issue leads to a proliferation of\nfalse-positive errors due to erroneously high confidence multi-label\npredictions, exacerbating catastrophic forgetting within the disjoint label\nspace. In this paper, we aim to refine multi-label confidence calibration in\nMLCIL and propose a Confidence Self-Calibration (CSC) approach. Firstly, for\nlabel relationship calibration, we introduce a class-incremental graph\nconvolutional network that bridges the isolated label spaces by constructing\nlearnable, dynamically extended label relationship graph. Then, for confidence\ncalibration, we present a max-entropy regularization for each multi-label\nincrement, facilitating confidence self-calibration through the penalization of\nover-confident output distributions. 
Our approach attains new state-of-the-art\nresults in MLCIL tasks on both MS-COCO and PASCAL VOC datasets, with the\ncalibration of label confidences confirmed through our methodology.\n","authors":["Kaile Du","Yifan Zhou","Fan Lyu","Yuyang Li","Chen Lu","Guangcan Liu"],"pdf_url":"https://arxiv.org/pdf/2403.12559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06731v2","updated":"2024-03-19T09:13:22Z","published":"2023-12-11T09:44:41Z","title":"Genixer: Empowering Multimodal Large Language Models as a Powerful Data\n Generator","summary":" Instruction tuning data is essential for training the Multimodal Large\nLanguage Models (MLLMs). However, the creation of high-quality instruction\ntuning data presents significant challenges. Prior methods that depended on\nGPT-4 for data generation were not only costly but also lacked satisfactory\nperformance in complex tasks (i.e., grounding-based reasoning tasks). To\naddress these issues, we developed an innovative data generation pipeline,\nGenixer, to generate various high-quality instruction tuning data, including\nnine representative tasks, e.g., Common VQA, REC, REG, and PointQ.\nSpecifically, Genixer provides a unified solution with four key steps for\nalleviating the difficulty of data generation: (i) instruction data collection,\n(ii) instruction template design, (iii) empowering MLLM, and (iv) data\ngeneration and filtering. Subsequently, the superior qualitative results of our\nGenixer demonstrate that current MLLMs have a strong potential to evolve into\npowerful data generators. Additionally, to validate the efficacy of generated\ndata quantitatively, we add the instruction tuning data produced by Genixer\ninto the training of two representative MLLMs and observe the consistent\nimprovements on various VQA tasks and multimodal benchmarks.\n","authors":["Henry Hengyuan Zhao","Pan Zhou","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.06731v2.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2312.10103v2","updated":"2024-03-19T09:13:19Z","published":"2023-12-15T02:54:31Z","title":"GSVA: Generalized Segmentation via Multimodal Large Language Models","summary":" Generalized Referring Expression Segmentation (GRES) extends the scope of\nclassic RES to refer to multiple objects in one expression or identify the\nempty targets absent in the image. GRES poses challenges in modeling the\ncomplex spatial relationships of the instances in the image and identifying\nnon-existing referents. Multimodal Large Language Models (MLLMs) have recently\nshown tremendous progress in these complicated vision-language tasks.\nConnecting Large Language Models (LLMs) and vision models, MLLMs are proficient\nin understanding contexts with visual inputs. Among them, LISA, as a\nrepresentative, adopts a special [SEG] token to prompt a segmentation mask\ndecoder, e.g., SAM, to enable MLLMs in the RES task. However, existing\nsolutions to GRES remain unsatisfactory since current segmentation MLLMs cannot\ncorrectly handle the cases where users might reference multiple subjects in a\nsingular prompt or provide descriptions incongruent with any image target. In\nthis paper, we propose Generalized Segmentation Vision Assistant (GSVA) to\naddress this gap. Specifically, GSVA reuses the [SEG] token to prompt the\nsegmentation model towards supporting multiple mask references simultaneously\nand innovatively learns to generate a [REJ] token to reject the null targets\nexplicitly. 
Experiments validate GSVA's efficacy in resolving the GRES issue,\nmarking a notable enhancement and setting a new record on the GRES benchmark\ngRefCOCO dataset. GSVA also proves effective across various classic referring\nsegmentation and comprehension tasks.\n","authors":["Zhuofan Xia","Dongchen Han","Yizeng Han","Xuran Pan","Shiji Song","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2312.10103v2.pdf","comment":"Accepted by CVPR2024 (19 pages, 9 figures, 11 tables)"},{"id":"http://arxiv.org/abs/2403.11370v2","updated":"2024-03-19T09:12:00Z","published":"2024-03-17T23:23:40Z","title":"DynamicGlue: Epipolar and Time-Informed Data Association in Dynamic\n Environments using Graph Neural Networks","summary":" The assumption of a static environment is common in many geometric computer\nvision tasks like SLAM but limits their applicability in highly dynamic scenes.\nSince these tasks rely on identifying point correspondences between input\nimages within the static part of the environment, we propose a graph neural\nnetwork-based sparse feature matching network designed to perform robust\nmatching under challenging conditions while excluding keypoints on moving\nobjects. We employ a similar scheme of attentional aggregation over graph edges\nto enhance keypoint representations as state-of-the-art feature-matching\nnetworks but augment the graph with epipolar and temporal information and\nvastly reduce the number of graph edges. Furthermore, we introduce a\nself-supervised training scheme to extract pseudo labels for image pairs in\ndynamic environments from exclusively unprocessed visual-inertial data. A\nseries of experiments show the superior performance of our network as it\nexcludes keypoints on moving objects compared to state-of-the-art feature\nmatching networks while still achieving similar results regarding conventional\nmatching metrics. When integrated into a SLAM system, our network significantly\nimproves performance, especially in highly dynamic scenes.\n","authors":["Theresa Huber","Simon Schaefer","Stefan Leutenegger"],"pdf_url":"https://arxiv.org/pdf/2403.11370v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16607v2","updated":"2024-03-19T08:58:17Z","published":"2024-02-26T14:40:15Z","title":"GVA: Reconstructing Vivid 3D Gaussian Avatars from Monocular Videos","summary":" In this paper, we present a novel method that facilitates the creation of\nvivid 3D Gaussian avatars from monocular video inputs (GVA). Our innovation\nlies in addressing the intricate challenges of delivering high-fidelity human\nbody reconstructions and aligning 3D Gaussians with human skin surfaces\naccurately. The key contributions of this paper are twofold. Firstly, we\nintroduce a pose refinement technique to improve hand and foot pose accuracy by\naligning normal maps and silhouettes. Precise pose is crucial for correct shape\nand appearance reconstruction. Secondly, we address the problems of unbalanced\naggregation and initialization bias that previously diminished the quality of\n3D Gaussian avatars, through a novel surface-guided re-initialization method\nthat ensures accurate alignment of 3D Gaussian points with avatar surfaces.\nExperimental results demonstrate that our proposed method achieves\nhigh-fidelity and vivid 3D Gaussian avatar reconstruction. 
Extensive\nexperimental analyses validate the performance qualitatively and\nquantitatively, demonstrating that it achieves state-of-the-art performance in\nphoto-realistic novel view synthesis while offering fine-grained control over\nthe human body and hand pose. Project page: https://3d-aigc.github.io/GVA/.\n","authors":["Xinqi Liu","Chenming Wu","Jialun Liu","Xing Liu","Jinbo Wu","Chen Zhao","Haocheng Feng","Errui Ding","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2402.16607v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10103v2","updated":"2024-03-19T08:56:44Z","published":"2024-03-15T08:48:37Z","title":"DyBluRF: Dynamic Neural Radiance Fields from Blurry Monocular Video","summary":" Recent advancements in dynamic neural radiance field methods have yielded\nremarkable outcomes. However, these approaches rely on the assumption of sharp\ninput images. When faced with motion blur, existing dynamic NeRF methods often\nstruggle to generate high-quality novel views. In this paper, we propose\nDyBluRF, a dynamic radiance field approach that synthesizes sharp novel views\nfrom a monocular video affected by motion blur. To account for motion blur in\ninput images, we simultaneously capture the camera trajectory and object\nDiscrete Cosine Transform (DCT) trajectories within the scene. Additionally, we\nemploy a global cross-time rendering approach to ensure consistent temporal\ncoherence across the entire scene. We curate a dataset comprising diverse\ndynamic scenes that are specifically tailored for our task. Experimental\nresults on our dataset demonstrate that our method outperforms existing\napproaches in generating sharp novel views from motion-blurred inputs while\nmaintaining spatial-temporal consistency of the scene.\n","authors":["Huiqiang Sun","Xingyi Li","Liao Shen","Xinyi Ye","Ke Xian","Zhiguo Cao"],"pdf_url":"https://arxiv.org/pdf/2403.10103v2.pdf","comment":"Accepted by CVPR 2024. Project page:\n https://huiqiang-sun.github.io/dyblurf/"},{"id":"http://arxiv.org/abs/2403.12552v1","updated":"2024-03-19T08:54:52Z","published":"2024-03-19T08:54:52Z","title":"M2DA: Multi-Modal Fusion Transformer Incorporating Driver Attention for\n Autonomous Driving","summary":" End-to-end autonomous driving has witnessed remarkable progress. However, the\nextensive deployment of autonomous vehicles has yet to be realized, primarily\ndue to 1) inefficient multi-modal environment perception: how to integrate data\nfrom multi-modal sensors more efficiently; 2) non-human-like scene\nunderstanding: how to effectively locate and predict critical risky agents in\ntraffic scenarios like an experienced driver. To overcome these challenges, in\nthis paper, we propose a Multi-Modal fusion transformer incorporating Driver\nAttention (M2DA) for autonomous driving. To better fuse multi-modal data and\nachieve higher alignment between different modalities, a novel\nLidar-Vision-Attention-based Fusion (LVAFusion) module is proposed. By\nincorporating driver attention, we empower the human-like scene understanding\nability to autonomous vehicles to identify crucial areas within complex\nscenarios precisely and ensure safety. We conduct experiments on the CARLA\nsimulator and achieve state-of-the-art performance with less data in\nclosed-loop benchmarks. 
Source codes are available at\nhttps://anonymous.4open.science/r/M2DA-4772.\n","authors":["Dongyang Xu","Haokun Li","Qingfan Wang","Ziying Song","Lei Chen","Hanming Deng"],"pdf_url":"https://arxiv.org/pdf/2403.12552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10787v2","updated":"2024-03-19T08:51:51Z","published":"2023-09-19T17:35:16Z","title":"AV-SUPERB: A Multi-Task Evaluation Benchmark for Audio-Visual\n Representation Models","summary":" Audio-visual representation learning aims to develop systems with human-like\nperception by utilizing correlation between auditory and visual information.\nHowever, current models often focus on a limited set of tasks, and\ngeneralization abilities of learned representations are unclear. To this end,\nwe propose the AV-SUPERB benchmark that enables general-purpose evaluation of\nunimodal audio/visual and bimodal fusion representations on 7 datasets covering\n5 audio-visual tasks in speech and audio processing. We evaluate 5 recent\nself-supervised models and show that none of these models generalize to all\ntasks, emphasizing the need for future study on improving universal model\nperformance. In addition, we show that representations may be improved with\nintermediate-task fine-tuning and audio event classification with AudioSet\nserves as a strong intermediate task. We release our benchmark with evaluation\ncode and a model submission platform to encourage further research in\naudio-visual learning.\n","authors":["Yuan Tseng","Layne Berry","Yi-Ting Chen","I-Hsiang Chiu","Hsuan-Hao Lin","Max Liu","Puyuan Peng","Yi-Jen Shih","Hung-Yu Wang","Haibin Wu","Po-Yao Huang","Chun-Mao Lai","Shang-Wen Li","David Harwath","Yu Tsao","Shinji Watanabe","Abdelrahman Mohamed","Chi-Luen Feng","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2309.10787v2.pdf","comment":"Accepted to ICASSP 2024; Evaluation Code:\n https://github.com/roger-tseng/av-superb Submission Platform:\n https://av.superbbenchmark.org"},{"id":"http://arxiv.org/abs/2403.12550v1","updated":"2024-03-19T08:49:48Z","published":"2024-03-19T08:49:48Z","title":"RGBD GS-ICP SLAM","summary":" Simultaneous Localization and Mapping (SLAM) with dense representation plays\na key role in robotics, Virtual Reality (VR), and Augmented Reality (AR)\napplications. Recent advancements in dense representation SLAM have highlighted\nthe potential of leveraging neural scene representation and 3D Gaussian\nrepresentation for high-fidelity spatial representation. In this paper, we\npropose a novel dense representation SLAM approach with a fusion of Generalized\nIterative Closest Point (G-ICP) and 3D Gaussian Splatting (3DGS). In contrast\nto existing methods, we utilize a single Gaussian map for both tracking and\nmapping, resulting in mutual benefits. Through the exchange of covariances\nbetween tracking and mapping processes with scale alignment techniques, we\nminimize redundant computations and achieve an efficient system. Additionally,\nwe enhance tracking accuracy and mapping quality through our keyframe selection\nmethods. 
Experimental results demonstrate the effectiveness of our approach,\nshowing an incredibly fast speed up to 107 FPS (for the entire system) and\nsuperior quality of the reconstructed map.\n","authors":["Seongbo Ha","Jiung Yeon","Hyeonwoo Yu"],"pdf_url":"https://arxiv.org/pdf/2403.12550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18639v3","updated":"2024-03-19T08:45:53Z","published":"2023-10-28T08:48:44Z","title":"Towards Plastic and Stable Exemplar-Free Incremental Learning: A\n Dual-Learner Framework with Cumulative Parameter Averaging","summary":" The dilemma between plasticity and stability presents a significant challenge\nin Incremental Learning (IL), especially in the exemplar-free scenario where\naccessing old-task samples is strictly prohibited during the learning of a new\ntask. A straightforward solution to this issue is learning and storing an\nindependent model for each task, known as Single Task Learning (STL). Despite\nthe linear growth in model storage with the number of tasks in STL, we\nempirically discover that averaging these model parameters can potentially\npreserve knowledge across all tasks. Inspired by this observation, we propose a\nDual-Learner framework with Cumulative Parameter Averaging (DLCPA). DLCPA\nemploys a dual-learner design: a plastic learner focused on acquiring new-task\nknowledge and a stable learner responsible for accumulating all learned\nknowledge. The knowledge from the plastic learner is transferred to the stable\nlearner via cumulative parameter averaging. Additionally, several task-specific\nclassifiers work in cooperation with the stable learner to yield the final\nprediction. Specifically, when learning a new task, these modules are updated\nin a cyclic manner: i) the plastic learner is initially optimized using a\nself-supervised loss besides the supervised loss to enhance the feature\nextraction robustness; ii) the stable learner is then updated with respect to\nthe plastic learner in a cumulative parameter averaging manner to maintain its\ntask-wise generalization; iii) the task-specific classifier is accordingly\noptimized to align with the stable learner. Experimental results on CIFAR-100\nand Tiny-ImageNet show that DLCPA outperforms several state-of-the-art\nexemplar-free baselines in both Task-IL and Class-IL settings.\n","authors":["Wenju Sun","Qingyong Li","Wen Wang","Yangli-ao Geng"],"pdf_url":"https://arxiv.org/pdf/2310.18639v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17200v3","updated":"2024-03-19T08:41:00Z","published":"2024-02-27T04:37:04Z","title":"Enhancing Quality of Compressed Images by Mitigating Enhancement Bias\n Towards Compression Domain","summary":" Existing quality enhancement methods for compressed images focus on aligning\nthe enhancement domain with the raw domain to yield realistic images. However,\nthese methods exhibit a pervasive enhancement bias towards the compression\ndomain, inadvertently regarding it as more realistic than the raw domain. This\nbias makes enhanced images closely resemble their compressed counterparts, thus\ndegrading their perceptual quality. In this paper, we propose a simple yet\neffective method to mitigate this bias and enhance the quality of compressed\nimages. Our method employs a conditional discriminator with the compressed\nimage as a key condition, and then incorporates a domain-divergence\nregularization to actively distance the enhancement domain from the compression\ndomain. 
Through this dual strategy, our method enables the discrimination\nagainst the compression domain, and brings the enhancement domain closer to the\nraw domain. Comprehensive quality evaluations confirm the superiority of our\nmethod over other state-of-the-art methods without incurring inference\noverheads.\n","authors":["Qunliang Xing","Mai Xu","Shengxi Li","Xin Deng","Meisong Zheng","Huaida Liu","Ying Chen"],"pdf_url":"https://arxiv.org/pdf/2402.17200v3.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12543v1","updated":"2024-03-19T08:40:19Z","published":"2024-03-19T08:40:19Z","title":"HCPM: Hierarchical Candidates Pruning for Efficient Detector-Free\n Matching","summary":" Deep learning-based image matching methods play a crucial role in computer\nvision, yet they often suffer from substantial computational demands. To tackle\nthis challenge, we present HCPM, an efficient and detector-free local\nfeature-matching method that employs hierarchical pruning to optimize the\nmatching pipeline. In contrast to recent detector-free methods that depend on\nan exhaustive set of coarse-level candidates for matching, HCPM selectively\nconcentrates on a concise subset of informative candidates, resulting in fewer\ncomputational candidates and enhanced matching efficiency. The method comprises\na self-pruning stage for selecting reliable candidates and an\ninteractive-pruning stage that identifies correlated patches at the coarse\nlevel. Our results reveal that HCPM significantly surpasses existing methods in\nterms of speed while maintaining high accuracy. The source code will be made\navailable upon publication.\n","authors":["Ying Chen","Yong Liu","Kai Wu","Qiang Nie","Shang Xu","Huifang Ma","Bing Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04962v2","updated":"2024-03-19T08:28:46Z","published":"2023-04-11T04:12:31Z","title":"Mask-Based Modeling for Neural Radiance Fields","summary":" Most Neural Radiance Fields (NeRFs) exhibit limited generalization\ncapabilities, which restrict their applicability in representing multiple\nscenes using a single model. To address this problem, existing generalizable\nNeRF methods simply condition the model on image features. These methods still\nstruggle to learn precise global representations over diverse scenes since they\nlack an effective mechanism for interacting among different points and views.\nIn this work, we unveil that 3D implicit representation learning can be\nsignificantly improved by mask-based modeling. Specifically, we propose masked\nray and view modeling for generalizable NeRF (MRVM-NeRF), which is a\nself-supervised pretraining target to predict complete scene representations\nfrom partially masked features along each ray. With this pretraining target,\nMRVM-NeRF enables better use of correlations across different points and views\nas the geometry priors, which thereby strengthens the capability of capturing\nintricate details within the scenes and boosts the generalization capability\nacross different scenes. Extensive experiments demonstrate the effectiveness of\nour proposed MRVM-NeRF on both synthetic and real-world datasets, qualitatively\nand quantitatively. 
Besides, we also conduct experiments to show the\ncompatibility of our proposed method with various backbones and its superiority\nunder few-shot cases.\n","authors":["Ganlin Yang","Guoqiang Wei","Zhizheng Zhang","Yan Lu","Dong Liu"],"pdf_url":"https://arxiv.org/pdf/2304.04962v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12537v1","updated":"2024-03-19T08:23:12Z","published":"2024-03-19T08:23:12Z","title":"Prompt-Guided Adaptive Model Transformation for Whole Slide Image\n Classification","summary":" Multiple instance learning (MIL) has emerged as a popular method for\nclassifying histopathology whole slide images (WSIs). Existing approaches\ntypically rely on frozen pre-trained models to extract instance features,\nneglecting the substantial domain shift between pre-training natural and\nhistopathological images. To address this issue, we propose PAMT, a novel\nPrompt-guided Adaptive Model Transformation framework that enhances MIL\nclassification performance by seamlessly adapting pre-trained models to the\nspecific characteristics of histopathology data. To capture the intricate\nhistopathology distribution, we introduce Representative Patch Sampling (RPS)\nand Prototypical Visual Prompt (PVP) to reform the input data, building a\ncompact while informative representation. Furthermore, to narrow the domain\ngap, we introduce Adaptive Model Transformation (AMT) that integrates adapter\nblocks within the feature extraction pipeline, enabling the pre-trained models\nto learn domain-specific features. We rigorously evaluate our approach on two\npublicly available datasets, Camelyon16 and TCGA-NSCLC, showcasing substantial\nimprovements across various MIL models. Our findings affirm the potential of\nPAMT to set a new benchmark in WSI classification, underscoring the value of a\ntargeted reprogramming approach.\n","authors":["Yi Lin","Zhengjie Zhu","Kwang-Ting Cheng","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2403.12537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01166v2","updated":"2024-03-19T08:22:42Z","published":"2024-02-02T06:20:44Z","title":"A Comprehensive Survey on 3D Content Generation","summary":" Recent years have witnessed remarkable advances in artificial intelligence\ngenerated content(AIGC), with diverse input modalities, e.g., text, image,\nvideo, audio and 3D. The 3D is the most close visual modality to real-world 3D\nenvironment and carries enormous knowledge. The 3D content generation shows\nboth academic and practical values while also presenting formidable technical\nchallenges. This review aims to consolidate developments within the burgeoning\ndomain of 3D content generation. Specifically, a new taxonomy is proposed that\ncategorizes existing approaches into three types: 3D native generative methods,\n2D prior-based 3D generative methods, and hybrid 3D generative methods. The\nsurvey covers approximately 60 papers spanning the major techniques. Besides,\nwe discuss limitations of current 3D content generation techniques, and point\nout open challenges as well as promising directions for future work.\nAccompanied with this survey, we have established a project website where the\nresources on 3D content generation research are provided. 
The project page is\navailable at https://github.com/hitcslj/Awesome-AIGC-3D.\n","authors":["Jian Liu","Xiaoshui Huang","Tianyu Huang","Lu Chen","Yuenan Hou","Shixiang Tang","Ziwei Liu","Wanli Ouyang","Wangmeng Zuo","Junjun Jiang","Xianming Liu"],"pdf_url":"https://arxiv.org/pdf/2402.01166v2.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2403.12536v1","updated":"2024-03-19T08:21:54Z","published":"2024-03-19T08:21:54Z","title":"Vox-Fusion++: Voxel-based Neural Implicit Dense Tracking and Mapping\n with Multi-maps","summary":" In this paper, we introduce Vox-Fusion++, a multi-maps-based robust dense\ntracking and mapping system that seamlessly fuses neural implicit\nrepresentations with traditional volumetric fusion techniques. Building upon\nthe concept of implicit mapping and positioning systems, our approach extends\nits applicability to real-world scenarios. Our system employs a voxel-based\nneural implicit surface representation, enabling efficient encoding and\noptimization of the scene within each voxel. To handle diverse environments\nwithout prior knowledge, we incorporate an octree-based structure for scene\ndivision and dynamic expansion. To achieve real-time performance, we propose a\nhigh-performance multi-process framework. This ensures the system's suitability\nfor applications with stringent time constraints. Additionally, we adopt the\nidea of multi-maps to handle large-scale scenes, and leverage loop detection\nand hierarchical pose optimization strategies to reduce long-term pose drift\nand remove duplicate geometry. Through comprehensive evaluations, we\ndemonstrate that our method outperforms previous methods in terms of\nreconstruction quality and accuracy across various scenarios. We also show that\nour Vox-Fusion++ can be used in augmented reality and collaborative mapping\napplications. Our source code will be publicly available at\n\\url{https://github.com/zju3dv/Vox-Fusion_Plus_Plus}\n","authors":["Hongjia Zhai","Hai Li","Xingrui Yang","Gan Huang","Yuhang Ming","Hujun Bao","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.12536v1.pdf","comment":"14 pages. arXiv admin note: text overlap with arXiv:2210.15858"},{"id":"http://arxiv.org/abs/2403.11077v2","updated":"2024-03-19T08:21:02Z","published":"2024-03-17T04:02:39Z","title":"Zippo: Zipping Color and Transparency Distributions into a Single\n Diffusion Model","summary":" Beyond the superiority of the text-to-image diffusion model in generating\nhigh-quality images, recent studies have attempted to uncover its potential for\nadapting the learned semantic knowledge to visual perception tasks. In this\nwork, instead of translating a generative diffusion model into a visual\nperception model, we explore to retain the generative ability with the\nperceptive adaptation. To accomplish this, we present Zippo, a unified\nframework for zipping the color and transparency distributions into a single\ndiffusion model by expanding the diffusion latent into a joint representation\nof RGB images and alpha mattes. By alternatively selecting one modality as the\ncondition and then applying the diffusion process to the counterpart modality,\nZippo is capable of generating RGB images from alpha mattes and predicting\ntransparency from input images. In addition to single-modality prediction, we\npropose a modality-aware noise reassignment strategy to further empower Zippo\nwith jointly generating RGB images and its corresponding alpha mattes under the\ntext guidance. 
Our experiments showcase Zippo's ability of efficient\ntext-conditioned transparent image generation and present plausible results of\nMatte-to-RGB and RGB-to-Matte translation.\n","authors":["Kangyang Xie","Binbin Yang","Hao Chen","Meng Wang","Cheng Zou","Hui Xue","Ming Yang","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2403.11077v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12535v1","updated":"2024-03-19T08:19:53Z","published":"2024-03-19T08:19:53Z","title":"High-Fidelity SLAM Using Gaussian Splatting with Rendering-Guided\n Densification and Regularized Optimization","summary":" We propose a dense RGBD SLAM system based on 3D Gaussian Splatting that\nprovides metrically accurate pose tracking and visually realistic\nreconstruction. To this end, we first propose a Gaussian densification strategy\nbased on the rendering loss to map unobserved areas and refine reobserved\nareas. Second, we introduce extra regularization parameters to alleviate the\nforgetting problem in the continuous mapping problem, where parameters tend to\noverfit the latest frame and result in decreasing rendering quality for\nprevious frames. Both mapping and tracking are performed with Gaussian\nparameters by minimizing re-rendering loss in a differentiable way. Compared to\nrecent neural and concurrently developed gaussian splatting RGBD SLAM\nbaselines, our method achieves state-of-the-art results on the synthetic\ndataset Replica and competitive results on the real-world dataset TUM.\n","authors":["Shuo Sun","Malcolm Mielle","Achim J. Lilienthal","Martin Magnusson"],"pdf_url":"https://arxiv.org/pdf/2403.12535v1.pdf","comment":"submitted to IROS24"},{"id":"http://arxiv.org/abs/2403.12534v1","updated":"2024-03-19T08:15:53Z","published":"2024-03-19T08:15:53Z","title":"ExACT: Language-guided Conceptual Reasoning and Uncertainty Estimation\n for Event-based Action Recognition and More","summary":" Event cameras have recently been shown beneficial for practical vision tasks,\nsuch as action recognition, thanks to their high temporal resolution, power\nefficiency, and reduced privacy concerns. However, current research is hindered\nby 1) the difficulty in processing events because of their prolonged duration\nand dynamic actions with complex and ambiguous semantics and 2) the redundant\naction depiction of the event frame representation with fixed stacks. We find\nlanguage naturally conveys abundant semantic information, rendering it\nstunningly superior in reducing semantic uncertainty. In light of this, we\npropose ExACT, a novel approach that, for the first time, tackles event-based\naction recognition from a cross-modal conceptualizing perspective. Our ExACT\nbrings two technical contributions. Firstly, we propose an adaptive\nfine-grained event (AFE) representation to adaptively filter out the repeated\nevents for the stationary objects while preserving dynamic ones. This subtly\nenhances the performance of ExACT without extra computational cost. Then, we\npropose a conceptual reasoning-based uncertainty estimation module, which\nsimulates the recognition process to enrich the semantic representation. In\nparticular, conceptual reasoning builds the temporal relation based on the\naction semantics, and uncertainty estimation tackles the semantic uncertainty\nof actions based on the distributional representation. 
Experiments show that\nour ExACT achieves superior recognition accuracy of 94.83%(+2.23%),\n90.10%(+37.47%) and 67.24% on PAF, HARDVS and our SeAct datasets respectively.\n","authors":["Jiazhou Zhou","Xu Zheng","Yuanhuiyi Lyu","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12534v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.12532v1","updated":"2024-03-19T08:09:27Z","published":"2024-03-19T08:09:27Z","title":"UniBind: LLM-Augmented Unified and Balanced Representation Space to Bind\n Them All","summary":" We present UniBind, a flexible and efficient approach that learns a unified\nrepresentation space for seven diverse modalities -- images, text, audio, point\ncloud, thermal, video, and event data. Existing works, eg., ImageBind, treat\nthe image as the central modality and build an image-centered representation\nspace; however, the space may be sub-optimal as it leads to an unbalanced\nrepresentation space among all modalities. Moreover, the category names are\ndirectly used to extract text embeddings for the downstream tasks, making it\nhardly possible to represent the semantics of multi-modal data. The\n'out-of-the-box' insight of our UniBind is to make the alignment center\nmodality-agnostic and further learn a unified and balanced representation\nspace, empowered by the large language models (LLMs). UniBind is superior in\nits flexible application to all CLIP-style models and delivers remarkable\nperformance boosts. To make this possible, we 1) construct a knowledge base of\ntext embeddings with the help of LLMs and multi-modal LLMs; 2) adaptively build\nLLM-augmented class-wise embedding center on top of the knowledge base and\nencoded visual embeddings; 3) align all the embeddings to the LLM-augmented\nembedding center via contrastive learning to achieve a unified and balanced\nrepresentation space. UniBind shows strong zero-shot recognition performance\ngains over prior arts by an average of 6.36%. Finally, we achieve new\nstate-of-the-art performance, eg., a 6.75% gain on ImageNet, on the multi-modal\nfine-tuning setting while reducing 90% of the learnable parameters.\n","authors":["Yuanhuiyi Lyu","Xu Zheng","Jiazhou Zhou","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12532v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2403.12530v1","updated":"2024-03-19T08:08:12Z","published":"2024-03-19T08:08:12Z","title":"PCT: Perspective Cue Training Framework for Multi-Camera BEV\n Segmentation","summary":" Generating annotations for bird's-eye-view (BEV) segmentation presents\nsignificant challenges due to the scenes' complexity and the high manual\nannotation cost. In this work, we address these challenges by leveraging the\nabundance of unlabeled data available. We propose the Perspective Cue Training\n(PCT) framework, a novel training framework that utilizes pseudo-labels\ngenerated from unlabeled perspective images using publicly available semantic\nsegmentation models trained on large street-view datasets. PCT applies a\nperspective view task head to the image encoder shared with the BEV\nsegmentation head, effectively utilizing the unlabeled data to be trained with\nthe generated pseudo-labels. Since image encoders are present in nearly all\ncamera-based BEV segmentation architectures, PCT is flexible and applicable to\nvarious existing BEV architectures. PCT can be applied to various settings\nwhere unlabeled data is available. 
In this paper, we applied PCT for\nsemi-supervised learning (SSL) and unsupervised domain adaptation (UDA).\nAdditionally, we introduce strong input perturbation through Camera Dropout\n(CamDrop) and feature perturbation via BEV Feature Dropout (BFD), which are\ncrucial for enhancing SSL capabilities using our teacher-student framework. Our\ncomprehensive approach is simple and flexible but yields significant\nimprovements over various baselines for SSL and UDA, achieving competitive\nperformances even against the current state-of-the-art.\n","authors":["Haruya Ishikawa","Takumi Iida","Yoshinori Konishi","Yoshimitsu Aoki"],"pdf_url":"https://arxiv.org/pdf/2403.12530v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.03526v2","updated":"2024-03-19T08:03:07Z","published":"2023-12-06T14:40:05Z","title":"On the Diversity and Realism of Distilled Dataset: An Efficient Dataset\n Distillation Paradigm","summary":" Contemporary machine learning requires training large neural networks on\nmassive datasets and thus faces the challenges of high computational demands.\nDataset distillation, as a recent emerging strategy, aims to compress\nreal-world datasets for efficient training. However, this line of research\ncurrently struggle with large-scale and high-resolution datasets, hindering its\npracticality and feasibility. To this end, we re-examine the existing dataset\ndistillation methods and identify three properties required for large-scale\nreal-world applications, namely, realism, diversity, and efficiency. As a\nremedy, we propose RDED, a novel computationally-efficient yet effective data\ndistillation paradigm, to enable both diversity and realism of the distilled\ndata. Extensive empirical results over various neural architectures and\ndatasets demonstrate the advancement of RDED: we can distill the full\nImageNet-1K to a small dataset comprising 10 images per class within 7 minutes,\nachieving a notable 42% top-1 accuracy with ResNet-18 on a single RTX-4090 GPU\n(while the SOTA only achieves 21% but requires 6 hours).\n","authors":["Peng Sun","Bei Shi","Daiwei Yu","Tao Lin"],"pdf_url":"https://arxiv.org/pdf/2312.03526v2.pdf","comment":"17 pages, 20 figures"},{"id":"http://arxiv.org/abs/2305.08473v2","updated":"2024-03-19T07:59:52Z","published":"2023-05-15T09:24:48Z","title":"Shared and Private Information Learning in Multimodal Sentiment Analysis\n with Deep Modal Alignment and Self-supervised Multi-Task Learning","summary":" Designing an effective representation learning method for multimodal\nsentiment analysis tasks is a crucial research direction. The challenge lies in\nlearning both shared and private information in a complete modal\nrepresentation, which is difficult with uniform multimodal labels and a raw\nfeature fusion approach. In this work, we propose a deep modal shared\ninformation learning module based on the covariance matrix to capture the\nshared information between modalities. Additionally, we use a label generation\nmodule based on a self-supervised learning strategy to capture the private\ninformation of the modalities. Our module is plug-and-play in multimodal tasks,\nand by changing the parameterization, it can adjust the information exchange\nrelationship between the modes and learn the private or shared information\nbetween the specified modes. 
We also employ a multi-task learning strategy to\nhelp the model focus its attention on the modal differentiation training data.\nWe provide a detailed formulation derivation and feasibility proof for the\ndesign of the deep modal shared information learning module. We conduct\nextensive experiments on three common multimodal sentiment analysis baseline\ndatasets, and the experimental results validate the reliability of our model.\nFurthermore, we explore more combinatorial techniques for the use of the\nmodule. Our approach outperforms current state-of-the-art methods on most of\nthe metrics of the three public datasets.\n","authors":["Songning Lai","Jiakang Li","Guinan Guo","Xifeng Hu","Yulong Li","Yuan Tan","Zichen Song","Yutong Liu","Zhaoxia Ren","Chun Wan","Danmin Miao","Zhi Liu"],"pdf_url":"https://arxiv.org/pdf/2305.08473v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10089v2","updated":"2024-03-19T07:58:17Z","published":"2024-03-15T08:05:16Z","title":"Approximation and bounding techniques for the Fisher-Rao distances","summary":" The Fisher-Rao distance between two probability distributions of a\nstatistical model is defined as the Riemannian geodesic distance induced by the\nFisher information metric. In order to calculate the Fisher-Rao distance in\nclosed-form, we need (1) to elicit a formula for the Fisher-Rao geodesics, and\n(2) to integrate the Fisher length element along those geodesics. We consider\nseveral numerically robust approximation and bounding techniques for the\nFisher-Rao distances: First, we report generic upper bounds on Fisher-Rao\ndistances based on closed-form 1D Fisher-Rao distances of submodels. Second, we\ndescribe several generic approximation schemes depending on whether the\nFisher-Rao geodesics or pregeodesics are available in closed-form or not. In\nparticular, we obtain a generic method to guarantee an arbitrarily small\nadditive error on the approximation provided that Fisher-Rao pregeodesics and\ntight lower and upper bounds are available. Third, we consider the case of\nFisher metrics being Hessian metrics, and report generic tight upper bounds on\nthe Fisher-Rao distances using techniques of information geometry.\nUniparametric and biparametric statistical models always have Fisher Hessian\nmetrics, and in general a simple test allows to check whether the Fisher\ninformation matrix yields a Hessian metric or not. Fourth, we consider\nelliptical distribution families and show how to apply the above techniques to\nthese models. We also propose two new distances based either on the Fisher-Rao\nlengths of curves serving as proxies of Fisher-Rao geodesics, or based on the\nBirkhoff/Hilbert projective cone distance. Last, we consider an alternative\ngroup-theoretic approach for statistical transformation models based on the\nnotion of maximal invariant which yields insights on the structures of the\nFisher-Rao distance formula which may be used fruitfully in applications.\n","authors":["Frank Nielsen"],"pdf_url":"https://arxiv.org/pdf/2403.10089v2.pdf","comment":"43 pages"},{"id":"http://arxiv.org/abs/2403.12519v1","updated":"2024-03-19T07:42:57Z","published":"2024-03-19T07:42:57Z","title":"Dynamic Spatial-Temporal Aggregation for Skeleton-Aware Sign Language\n Recognition","summary":" Skeleton-aware sign language recognition (SLR) has gained popularity due to\nits ability to remain unaffected by background information and its lower\ncomputational requirements. 
Current methods utilize spatial graph modules and\ntemporal modules to capture spatial and temporal features, respectively.\nHowever, their spatial graph modules are typically built on fixed graph\nstructures such as graph convolutional networks or a single learnable graph,\nwhich only partially explore joint relationships. Additionally, a simple\ntemporal convolution kernel is used to capture temporal information, which may\nnot fully capture the complex movement patterns of different signers. To\novercome these limitations, we propose a new spatial architecture consisting of\ntwo concurrent branches, which build input-sensitive joint relationships and\nincorporate specific domain knowledge for recognition, respectively. These two\nbranches are followed by an aggregation process to distinguish important joint\nconnections. We then propose a new temporal module to model multi-scale\ntemporal information to capture complex human dynamics. Our method achieves\nstate-of-the-art accuracy compared to previous skeleton-aware methods on four\nlarge-scale SLR benchmarks. Moreover, our method demonstrates superior accuracy\ncompared to RGB-based methods in most cases while requiring much fewer\ncomputational resources, bringing a better accuracy-computation trade-off. Code\nis available at https://github.com/hulianyuyy/DSTA-SLR.\n","authors":["Lianyu Hu","Liqing Gao","Zekang Liu","Wei Feng"],"pdf_url":"https://arxiv.org/pdf/2403.12519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14817v2","updated":"2024-03-19T07:29:20Z","published":"2024-02-22T18:59:56Z","title":"Cameras as Rays: Pose Estimation via Ray Diffusion","summary":" Estimating camera poses is a fundamental task for 3D reconstruction and\nremains challenging given sparsely sampled views (<10). In contrast to existing\napproaches that pursue top-down prediction of global parametrizations of camera\nextrinsics, we propose a distributed representation of camera pose that treats\na camera as a bundle of rays. This representation allows for a tight coupling\nwith spatial image features improving pose precision. We observe that this\nrepresentation is naturally suited for set-level transformers and develop a\nregression-based approach that maps image patches to corresponding rays. To\ncapture the inherent uncertainties in sparse-view pose inference, we adapt this\napproach to learn a denoising diffusion model which allows us to sample\nplausible modes while improving performance. Our proposed methods, both\nregression- and diffusion-based, demonstrate state-of-the-art performance on\ncamera pose estimation on CO3D while generalizing to unseen object categories\nand in-the-wild captures.\n","authors":["Jason Y. Zhang","Amy Lin","Moneish Kumar","Tzu-Hsuan Yang","Deva Ramanan","Shubham Tulsiani"],"pdf_url":"https://arxiv.org/pdf/2402.14817v2.pdf","comment":"In ICLR 2024 (oral). v2: updated references. Project webpage:\n https://jasonyzhang.com/RayDiffusion"},{"id":"http://arxiv.org/abs/2403.12510v1","updated":"2024-03-19T07:24:54Z","published":"2024-03-19T07:24:54Z","title":"Generalized Consistency Trajectory Models for Image Manipulation","summary":" Diffusion-based generative models excel in unconditional generation, as well\nas on applied tasks such as image editing and restoration. The success of\ndiffusion models lies in the iterative nature of diffusion: diffusion breaks\ndown the complex process of mapping noise to data into a sequence of simple\ndenoising tasks. 
Moreover, we are able to exert fine-grained control over the\ngeneration process by injecting guidance terms into each denoising step.\nHowever, the iterative process is also computationally intensive, often taking\nfrom tens up to thousands of function evaluations. Although consistency\ntrajectory models (CTMs) enable traversal between any time points along the\nprobability flow ODE (PFODE) and score inference with a single function\nevaluation, CTMs only allow translation from Gaussian noise to data. Thus, this\nwork aims to unlock the full potential of CTMs by proposing generalized CTMs\n(GCTMs), which translate between arbitrary distributions via ODEs. We discuss\nthe design space of GCTMs and demonstrate their efficacy in various image\nmanipulation tasks such as image-to-image translation, restoration, and\nediting. Code: \\url{https://github.com/1202kbs/GCTM}\n","authors":["Beomsu Kim","Jaemin Kim","Jeongsol Kim","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2403.12510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12505v1","updated":"2024-03-19T07:11:53Z","published":"2024-03-19T07:11:53Z","title":"Semantics, Distortion, and Style Matter: Towards Source-free UDA for\n Panoramic Segmentation","summary":" This paper addresses an interesting yet challenging problem -- source-free\nunsupervised domain adaptation (SFUDA) for pinhole-to-panoramic semantic\nsegmentation -- given only a pinhole image-trained model (i.e., source) and\nunlabeled panoramic images (i.e., target). Tackling this problem is nontrivial\ndue to the semantic mismatches, style discrepancies, and inevitable distortion\nof panoramic images. To this end, we propose a novel method that utilizes\nTangent Projection (TP) as it has less distortion and meanwhile slits the\nequirectangular projection (ERP) with a fixed FoV to mimic the pinhole images.\nBoth projections are shown effective in extracting knowledge from the source\nmodel. However, the distinct projection discrepancies between source and target\ndomains impede the direct knowledge transfer; thus, we propose a panoramic\nprototype adaptation module (PPAM) to integrate panoramic prototypes from the\nextracted knowledge for adaptation. We then impose the loss constraints on both\npredictions and prototypes and propose a cross-dual attention module (CDAM) at\nthe feature level to better align the spatial and channel characteristics\nacross the domains and projections. Both knowledge extraction and transfer\nprocesses are synchronously updated to reach the best performance. Extensive\nexperiments on the synthetic and real-world benchmarks, including outdoor and\nindoor scenarios, demonstrate that our method achieves significantly better\nperformance than prior SFUDA methods for pinhole-to-panoramic adaptation.\n","authors":["Xu Zheng","Pengyuan Zhou","Athanasios Vasilakos","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12505v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12009v2","updated":"2024-03-19T07:11:28Z","published":"2024-03-18T17:47:39Z","title":"Leveraging Spatial and Semantic Feature Extraction for Skin Cancer\n Diagnosis with Capsule Networks and Graph Neural Networks","summary":" In the realm of skin lesion image classification, the intricate spatial and\nsemantic features pose significant challenges for conventional Convolutional\nNeural Network (CNN)-based methodologies. 
These challenges are compounded by\nthe imbalanced nature of skin lesion datasets, which hampers the ability of\nmodels to learn minority class features effectively. Despite augmentation\nstrategies, such as those using Generative Adversarial Networks (GANs),\nprevious attempts have not fully addressed these complexities. This study\nintroduces an innovative approach by integrating Graph Neural Networks (GNNs)\nwith Capsule Networks to enhance classification performance. GNNs, known for\ntheir proficiency in handling graph-structured data, offer an advanced\nmechanism for capturing complex patterns and relationships beyond the\ncapabilities of traditional CNNs. Capsule Networks further contribute by\nproviding superior recognition of spatial hierarchies within images. Our\nresearch focuses on evaluating and enhancing the Tiny Pyramid Vision GNN (Tiny\nPyramid ViG) architecture by incorporating it with a Capsule Network. This\nhybrid model was applied to the MNIST:HAM10000 dataset, a comprehensive skin\nlesion dataset designed for benchmarking classification models. After 75 epochs\nof training, our model achieved a significant accuracy improvement, reaching\n89.23% and 95.52%, surpassing established benchmarks such as GoogLeNet\n(83.94%), InceptionV3 (86.82%), MobileNet V3 (89.87%), EfficientNet-B7\n(92.07%), ResNet18 (92.22%), ResNet34 (91.90%), ViT-Base (73.70%), and IRv2-SA\n(93.47%) on the same dataset. This outcome underscores the potential of our\napproach in overcoming the inherent challenges of skin lesion classification,\ncontributing to the advancement of image-based diagnosis in dermatology.\n","authors":["K. P. Santoso","R. V. H. Ginardi","R. A. Sastrowardoyo","F. A. Madany"],"pdf_url":"https://arxiv.org/pdf/2403.12009v2.pdf","comment":"This is the first version of our paper, we gladly expect feedback and\n corrections if there is any mistake within our paper"},{"id":"http://arxiv.org/abs/2403.07487v3","updated":"2024-03-19T07:05:37Z","published":"2024-03-12T10:25:29Z","title":"Motion Mamba: Efficient and Long Sequence Motion Generation with\n Hierarchical and Bidirectional Selective SSM","summary":" Human motion generation stands as a significant pursuit in generative\ncomputer vision, while achieving long-sequence and efficient motion generation\nremains challenging. Recent advancements in state space models (SSMs), notably\nMamba, have showcased considerable promise in long sequence modeling with an\nefficient hardware-aware design, which appears to be a promising direction to\nbuild motion generation model upon it. Nevertheless, adapting SSMs to motion\ngeneration faces hurdles since the lack of a specialized design architecture to\nmodel motion sequence. To address these challenges, we propose Motion Mamba, a\nsimple and efficient approach that presents the pioneering motion generation\nmodel utilized SSMs. Specifically, we design a Hierarchical Temporal Mamba\n(HTM) block to process temporal data by ensemble varying numbers of isolated\nSSM modules across a symmetric U-Net architecture aimed at preserving motion\nconsistency between frames. We also design a Bidirectional Spatial Mamba (BSM)\nblock to bidirectionally process latent poses, to enhance accurate motion\ngeneration within a temporal frame. 
Our proposed method achieves up to 50% FID\nimprovement and up to 4 times faster on the HumanML3D and KIT-ML datasets\ncompared to the previous best diffusion-based method, which demonstrates strong\ncapabilities of high-quality long sequence motion modeling and real-time human\nmotion generation. See project website\nhttps://steve-zeyu-zhang.github.io/MotionMamba/\n","authors":["Zeyu Zhang","Akide Liu","Ian Reid","Richard Hartley","Bohan Zhuang","Hao Tang"],"pdf_url":"https://arxiv.org/pdf/2403.07487v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12494v1","updated":"2024-03-19T07:02:08Z","published":"2024-03-19T07:02:08Z","title":"Task-Customized Mixture of Adapters for General Image Fusion","summary":" General image fusion aims at integrating important information from\nmulti-source images. However, due to the significant cross-task gap, the\nrespective fusion mechanism varies considerably in practice, resulting in\nlimited performance across subtasks. To handle this problem, we propose a novel\ntask-customized mixture of adapters (TC-MoA) for general image fusion,\nadaptively prompting various fusion tasks in a unified model. We borrow the\ninsight from the mixture of experts (MoE), taking the experts as efficient\ntuning adapters to prompt a pre-trained foundation model. These adapters are\nshared across different tasks and constrained by mutual information\nregularization, ensuring compatibility with different tasks while\ncomplementarity for multi-source images. The task-specific routing networks\ncustomize these adapters to extract task-specific information from different\nsources with dynamic dominant intensity, performing adaptive visual feature\nprompt fusion. Notably, our TC-MoA controls the dominant intensity bias for\ndifferent fusion tasks, successfully unifying multiple fusion tasks in a single\nmodel. Extensive experiments show that TC-MoA outperforms the competing\napproaches in learning commonalities while retaining compatibility for general\nimage fusion (multi-modal, multi-exposure, and multi-focus), and also\ndemonstrating striking controllability on more generalization experiments. The\ncode is available at https://github.com/YangSun22/TC-MoA .\n","authors":["Pengfei Zhu","Yang Sun","Bing Cao","Qinghua Hu"],"pdf_url":"https://arxiv.org/pdf/2403.12494v1.pdf","comment":"19 pages, 17 figures, CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12493v1","updated":"2024-03-19T07:02:06Z","published":"2024-03-19T07:02:06Z","title":"A Trainable Feature Extractor Module for Deep Neural Networks and\n Scanpath Classification","summary":" Scanpath classification is an area in eye tracking research with possible\napplications in medicine, manufacturing as well as training systems for\nstudents in various domains. In this paper we propose a trainable feature\nextraction module for deep neural networks. The purpose of this module is to\ntransform a scanpath into a feature vector which is directly useable for the\ndeep neural network architecture. Based on the backpropagated error of the deep\nneural network, the feature extraction module adapts its parameters to improve\nthe classification performance. Therefore, our feature extraction module is\njointly trainable with the deep neural network. The motivation to this feature\nextraction module is based on classical histogram-based approaches which\nusually compute distributions over a scanpath. 
We evaluated our module on three\npublic datasets and compared it to the state of the art approaches.\n","authors":["Wolfgang Fuhl"],"pdf_url":"https://arxiv.org/pdf/2403.12493v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12488v1","updated":"2024-03-19T06:54:33Z","published":"2024-03-19T06:54:33Z","title":"DetToolChain: A New Prompting Paradigm to Unleash Detection Ability of\n MLLM","summary":" We present DetToolChain, a novel prompting paradigm, to unleash the zero-shot\nobject detection ability of multimodal large language models (MLLMs), such as\nGPT-4V and Gemini. Our approach consists of a detection prompting toolkit\ninspired by high-precision detection priors and a new Chain-of-Thought to\nimplement these prompts. Specifically, the prompts in the toolkit are designed\nto guide the MLLM to focus on regional information (e.g., zooming in), read\ncoordinates according to measure standards (e.g., overlaying rulers and\ncompasses), and infer from the contextual information (e.g., overlaying scene\ngraphs). Building upon these tools, the new detection chain-of-thought can\nautomatically decompose the task into simple subtasks, diagnose the\npredictions, and plan for progressive box refinements. The effectiveness of our\nframework is demonstrated across a spectrum of detection tasks, especially hard\ncases. Compared to existing state-of-the-art methods, GPT-4V with our\nDetToolChain improves state-of-the-art object detectors by +21.5% AP50 on MS\nCOCO Novel class set for open-vocabulary detection, +24.23% Acc on RefCOCO val\nset for zero-shot referring expression comprehension, +14.5% AP on D-cube\ndescribe object detection FULL setting.\n","authors":["Yixuan Wu","Yizhou Wang","Shixiang Tang","Wenhao Wu","Tong He","Wanli Ouyang","Jian Wu","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2403.12488v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08985v3","updated":"2024-03-19T06:50:17Z","published":"2023-12-14T14:31:40Z","title":"OMG: Towards Open-vocabulary Motion Generation via Mixture of\n Controllers","summary":" We have recently seen tremendous progress in realistic text-to-motion\ngeneration. Yet, the existing methods often fail or produce implausible motions\nwith unseen text inputs, which limits the applications. In this paper, we\npresent OMG, a novel framework, which enables compelling motion generation from\nzero-shot open-vocabulary text prompts. Our key idea is to carefully tailor the\npretrain-then-finetune paradigm into the text-to-motion generation. At the\npre-training stage, our model improves the generation ability by learning the\nrich out-of-domain inherent motion traits. To this end, we scale up a large\nunconditional diffusion model up to 1B parameters, so as to utilize the massive\nunlabeled motion data up to over 20M motion instances. At the subsequent\nfine-tuning stage, we introduce motion ControlNet, which incorporates text\nprompts as conditioning information, through a trainable copy of the\npre-trained model and the proposed novel Mixture-of-Controllers (MoC) block.\nMoC block adaptively recognizes various ranges of the sub-motions with a\ncross-attention mechanism and processes them separately with the\ntext-token-specific experts. Such a design effectively aligns the CLIP token\nembeddings of text prompts to various ranges of compact and expressive motion\nfeatures. Extensive experiments demonstrate that our OMG achieves significant\nimprovements over the state-of-the-art methods on zero-shot text-to-motion\ngeneration. 
Project page: https://tr3e.github.io/omg-page.\n","authors":["Han Liang","Jiacheng Bao","Ruichi Zhang","Sihan Ren","Yuecheng Xu","Sibei Yang","Xin Chen","Jingyi Yu","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.08985v3.pdf","comment":"accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12483v1","updated":"2024-03-19T06:40:06Z","published":"2024-03-19T06:40:06Z","title":"A Hybrid Transformer-Sequencer approach for Age and Gender\n classification from in-wild facial images","summary":" The advancements in computer vision and image processing techniques have led\nto emergence of new application in the domain of visual surveillance, targeted\nadvertisement, content-based searching, and human-computer interaction etc. Out\nof the various techniques in computer vision, face analysis, in particular, has\ngained much attention. Several previous studies have tried to explore different\napplications of facial feature processing for a variety of tasks, including age\nand gender classification. However, despite several previous studies having\nexplored the problem, the age and gender classification of in-wild human faces\nis still far from the achieving the desired levels of accuracy required for\nreal-world applications. This paper, therefore, attempts to bridge this gap by\nproposing a hybrid model that combines self-attention and BiLSTM approaches for\nage and gender classification problems. The proposed models performance is\ncompared with several state-of-the-art model proposed so far. An improvement of\napproximately 10percent and 6percent over the state-of-the-art implementations\nfor age and gender classification, respectively, are noted for the proposed\nmodel. The proposed model is thus found to achieve superior performance and is\nfound to provide a more generalized learning. The model can, therefore, be\napplied as a core classification component in various image processing and\ncomputer vision problems.\n","authors":["Aakash Singh","Vivek Kumar Singh"],"pdf_url":"https://arxiv.org/pdf/2403.12483v1.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2308.09591v3","updated":"2024-03-19T06:37:32Z","published":"2023-08-18T14:38:31Z","title":"O$^2$-Recon: Completing 3D Reconstruction of Occluded Objects in the\n Scene with a Pre-trained 2D Diffusion Model","summary":" Occlusion is a common issue in 3D reconstruction from RGB-D videos, often\nblocking the complete reconstruction of objects and presenting an ongoing\nproblem. In this paper, we propose a novel framework, empowered by a 2D\ndiffusion-based in-painting model, to reconstruct complete surfaces for the\nhidden parts of objects. Specifically, we utilize a pre-trained diffusion model\nto fill in the hidden areas of 2D images. Then we use these in-painted images\nto optimize a neural implicit surface representation for each instance for 3D\nreconstruction. Since creating the in-painting masks needed for this process is\ntricky, we adopt a human-in-the-loop strategy that involves very little human\nengagement to generate high-quality masks. Moreover, some parts of objects can\nbe totally hidden because the videos are usually shot from limited\nperspectives. To ensure recovering these invisible areas, we develop a cascaded\nnetwork architecture for predicting signed distance field, making use of\ndifferent frequency bands of positional encoding and maintaining overall\nsmoothness. 
Besides the commonly used rendering loss, Eikonal loss, and\nsilhouette loss, we adopt a CLIP-based semantic consistency loss to guide the\nsurface from unseen camera angles. Experiments on ScanNet scenes show that our\nproposed framework achieves state-of-the-art accuracy and completeness in\nobject-level reconstruction from scene-level RGB-D videos. Code:\nhttps://github.com/THU-LYJ-Lab/O2-Recon.\n","authors":["Yubin Hu","Sheng Ye","Wang Zhao","Matthieu Lin","Yuze He","Yu-Hui Wen","Ying He","Yong-Jin Liu"],"pdf_url":"https://arxiv.org/pdf/2308.09591v3.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2403.12481v1","updated":"2024-03-19T06:36:42Z","published":"2024-03-19T06:36:42Z","title":"TT-BLIP: Enhancing Fake News Detection Using BLIP and Tri-Transformer","summary":" Detecting fake news has received a lot of attention. Many previous methods\nconcatenate independently encoded unimodal data, ignoring the benefits of\nintegrated multimodal information. Also, the absence of specialized feature\nextraction for text and images further limits these methods. This paper\nintroduces an end-to-end model called TT-BLIP that applies the bootstrapping\nlanguage-image pretraining for unified vision-language understanding and\ngeneration (BLIP) for three types of information: BERT and\nBLIP\\textsubscript{Txt} for text, ResNet and BLIP\\textsubscript{Img} for\nimages, and bidirectional BLIP encoders for multimodal information. The\nMultimodal Tri-Transformer fuses tri-modal features using three types of\nmulti-head attention mechanisms, ensuring integrated modalities for enhanced\nrepresentations and improved multimodal data analysis. The experiments are\nperformed using two fake news datasets, Weibo and Gossipcop. The results\nindicate TT-BLIP outperforms the state-of-the-art models.\n","authors":["Eunjee Choi","Jong-Kook Kim"],"pdf_url":"https://arxiv.org/pdf/2403.12481v1.pdf","comment":"8 pages, submitted to conference"},{"id":"http://arxiv.org/abs/2311.16117v2","updated":"2024-03-19T06:27:18Z","published":"2023-10-03T15:45:50Z","title":"Predicated Diffusion: Predicate Logic-Based Attention Guidance for\n Text-to-Image Diffusion Models","summary":" Diffusion models have achieved remarkable results in generating high-quality,\ndiverse, and creative images. However, when it comes to text-based image\ngeneration, they often fail to capture the intended meaning presented in the\ntext. For instance, a specified object may not be generated, an unnecessary\nobject may be generated, and an adjective may alter objects it was not intended\nto modify. Moreover, we found that relationships indicating possession between\nobjects are often overlooked. While users' intentions in text are diverse,\nexisting methods tend to specialize in only some aspects of these. In this\npaper, we propose Predicated Diffusion, a unified framework to express users'\nintentions. We consider that the root of the above issues lies in the text\nencoder, which often focuses only on individual words and neglects the logical\nrelationships between them. The proposed method does not solely rely on the\ntext encoder, but instead, represents the intended meaning in the text as\npropositions using predicate logic and treats the pixels in the attention maps\nas the fuzzy predicates. This enables us to obtain a differentiable loss\nfunction that makes the image fulfill the proposition by minimizing it. 
When\ncompared to several existing methods, we demonstrated that Predicated Diffusion\ncan generate images that are more faithful to various text prompts, as verified\nby human evaluators and pretrained image-text models.\n","authors":["Kota Sueyoshi","Takashi Matsubara"],"pdf_url":"https://arxiv.org/pdf/2311.16117v2.pdf","comment":"20 pages, 16 figures, 6 tables, ~500 images, ~30MB"},{"id":"http://arxiv.org/abs/2403.12473v1","updated":"2024-03-19T06:18:25Z","published":"2024-03-19T06:18:25Z","title":"PostoMETRO: Pose Token Enhanced Mesh Transformer for Robust 3D Human\n Mesh Recovery","summary":" With the recent advancements in single-image-based human mesh recovery, there\nis a growing interest in enhancing its performance in certain extreme\nscenarios, such as occlusion, while maintaining overall model accuracy.\nAlthough obtaining accurately annotated 3D human poses under occlusion is\nchallenging, there is still a wealth of rich and precise 2D pose annotations\nthat can be leveraged. However, existing works mostly focus on directly\nleveraging 2D pose coordinates to estimate 3D pose and mesh. In this paper, we\npresent PostoMETRO($\\textbf{Pos}$e $\\textbf{to}$ken enhanced $\\textbf{ME}$sh\n$\\textbf{TR}$ansf$\\textbf{O}$rmer), which integrates occlusion-resilient 2D\npose representation into transformers in a token-wise manner. Utilizing a\nspecialized pose tokenizer, we efficiently condense 2D pose data to a compact\nsequence of pose tokens and feed them to the transformer together with the\nimage tokens. This process not only ensures a rich depiction of texture from\nthe image but also fosters a robust integration of pose and image information.\nSubsequently, these combined tokens are queried by vertex and joint tokens to\ndecode 3D coordinates of mesh vertices and human joints. Facilitated by the\nrobust pose token representation and the effective combination, we are able to\nproduce more precise 3D coordinates, even under extreme scenarios like\nocclusion. Experiments on both standard and occlusion-specific benchmarks\ndemonstrate the effectiveness of PostoMETRO. Qualitative results further\nillustrate the clarity of how 2D pose can help 3D reconstruction. Code will be\nmade available.\n","authors":["Wendi Yang","Zihang Jiang","Shang Zhao","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.12473v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12470v1","updated":"2024-03-19T06:01:11Z","published":"2024-03-19T06:01:11Z","title":"SC-Diff: 3D Shape Completion with Latent Diffusion Models","summary":" This paper introduces a 3D shape completion approach using a 3D latent\ndiffusion model optimized for completing shapes, represented as Truncated\nSigned Distance Functions (TSDFs), from partial 3D scans. Our method combines\nimage-based conditioning through cross-attention and spatial conditioning\nthrough the integration of 3D features from captured partial scans. This dual\nguidance enables high-fidelity, realistic shape completions at superior\nresolutions. At the core of our approach is the compression of 3D data into a\nlow-dimensional latent space using an auto-encoder inspired by 2D latent\ndiffusion models. This compression facilitates the processing of\nhigher-resolution shapes and allows us to apply our model across multiple\nobject classes, a significant improvement over other existing diffusion-based\nshape completion methods, which often require a separate diffusion model for\neach class. 
We validated our approach against two common benchmarks in the\nfield of shape completion, demonstrating competitive performance in terms of\naccuracy and realism and performing on par with state-of-the-art methods\ndespite operating at a higher resolution with a single model for all object\nclasses. We present a comprehensive evaluation of our model, showcasing its\nefficacy in handling diverse shape completion challenges, even on unseen object\nclasses. The code will be released upon acceptance.\n","authors":["Juan D. Galvis","Xingxing Zuo","Simon Schaefer","Stefan Leutengger"],"pdf_url":"https://arxiv.org/pdf/2403.12470v1.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2403.06025v3","updated":"2024-03-19T05:58:51Z","published":"2024-03-09T22:25:14Z","title":"CarbonNet: How Computer Vision Plays a Role in Climate Change?\n Application: Learning Geomechanics from Subsurface Geometry of CCS to\n Mitigate Global Warming","summary":" We introduce a new approach using computer vision to predict the land surface\ndisplacement from subsurface geometry images for Carbon Capture and\nSequestration (CCS). CCS has been proved to be a key component for a carbon\nneutral society. However, scientists see there are challenges along the way\nincluding the high computational cost due to the large model scale and\nlimitations to generalize a pre-trained model with complex physics. We tackle\nthose challenges by training models directly from the subsurface geometry\nimages. The goal is to understand the response of land surface displacement due\nto carbon injection and utilize our trained models to inform decision making in\nCCS projects.\n We implement multiple models (CNN, ResNet, and ResNetUNet) for static\nmechanics problem, which is an image prediction problem. Next, we use the LSTM\nand transformer for transient mechanics scenario, which is a video prediction\nproblem. It shows ResNetUNet outperforms the others thanks to its architecture\nin static mechanics problem, and LSTM shows comparable performance to\ntransformer in transient problem. This report proceeds by outlining our dataset\nin detail followed by model descriptions in method section. Result and\ndiscussion state the key learning, observations, and conclusion with future\nwork rounds out the paper.\n","authors":["Wei Chen","Yunan Li","Yuan Tian"],"pdf_url":"https://arxiv.org/pdf/2403.06025v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12466v1","updated":"2024-03-19T05:50:48Z","published":"2024-03-19T05:50:48Z","title":"Few-shot Object Localization","summary":" Existing few-shot object counting tasks primarily focus on quantifying the\nnumber of objects in an image, neglecting precise positional information. To\nbridge this research gap, this paper introduces the novel task of Few-Shot\nObject Localization (FSOL), which aims to provide accurate object positional\ninformation. This task achieves generalized object localization by leveraging a\nsmall number of labeled support samples to query the positional information of\nobjects within corresponding images. To advance this research field, we propose\nan innovative high-performance baseline model. 
Our model integrates a dual-path\nfeature augmentation module to enhance shape association and gradient\ndifferences between supports and query images, alongside a self-query module\ndesigned to explore the association between feature maps and query images.\nExperimental results demonstrate a significant performance improvement of our\napproach in the FSOL task, establishing an efficient benchmark for further\nresearch.\n","authors":["Yunhan Ren","Bo Li","Chengyang Zhang","Yong Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.12466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11616v2","updated":"2024-03-19T05:49:31Z","published":"2024-03-18T09:47:41Z","title":"Multi-View Video-Based Learning: Leveraging Weak Labels for Frame-Level\n Perception","summary":" For training a video-based action recognition model that accepts multi-view\nvideo, annotating frame-level labels is tedious and difficult. However, it is\nrelatively easy to annotate sequence-level labels. This kind of coarse\nannotations are called as weak labels. However, training a multi-view\nvideo-based action recognition model with weak labels for frame-level\nperception is challenging. In this paper, we propose a novel learning\nframework, where the weak labels are first used to train a multi-view\nvideo-based base model, which is subsequently used for downstream frame-level\nperception tasks. The base model is trained to obtain individual latent\nembeddings for each view in the multi-view input. For training the model using\nthe weak labels, we propose a novel latent loss function. We also propose a\nmodel that uses the view-specific latent embeddings for downstream frame-level\naction recognition and detection tasks. The proposed framework is evaluated\nusing the MM Office dataset by comparing several baseline algorithms. The\nresults show that the proposed base model is effectively trained using weak\nlabels and the latent embeddings help the downstream models improve accuracy.\n","authors":["Vijay John","Yasutomo Kawanishi"],"pdf_url":"https://arxiv.org/pdf/2403.11616v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03135v5","updated":"2024-03-19T05:30:50Z","published":"2023-08-06T15:05:42Z","title":"EventBind: Learning a Unified Representation to Bind Them All for\n Event-based Open-world Understanding","summary":" In this paper, we propose EventBind, a novel and effective framework that\nunleashes the potential of vision-language models (VLMs) for event-based\nrecognition to compensate for the lack of large-scale event-based datasets. In\nparticular, due to the distinct modality gap with the image-text data and the\nlack of large-scale datasets, learning a common representation space for\nimages, texts, and events is non-trivial. Intuitively, we need to address two\nkey challenges: 1) how to generalize CLIP's visual encoder to event data while\nfully leveraging events' unique properties, e.g., sparsity and high temporal\nresolution; 2) how to effectively align the multi-modal embeddings, i.e.,\nimage, text, and events. Accordingly, we first introduce a novel event encoder\nthat subtly models the temporal information from events and meanwhile,\ngenerates event prompts for modality bridging. 
We then design a text encoder\nthat generates content prompts and utilizes hybrid text prompts to enhance\nEventBind's generalization ability across diverse datasets.With the proposed\nevent encoder, text encoder, and image encoder, a novel Hierarchical Triple\nContrastive Alignment (HTCA) module is introduced to jointly optimize the\ncorrelation and enable efficient knowledge transfer among the three modalities.\nWe evaluate various settings, including fine-tuning and few-shot on three\nbenchmarks, and our EventBind achieves new state-of-the-art accuracy compared\nwith the previous methods, such as on N-Caltech101 (+5.34% and +1.70%) and\nN-Imagenet (+5.65% and +1.99%) with fine-tuning and 20-shot settings,\nrespectively. Moreover, our EventBind can be flexibly extended to the event\nretrieval task using text or image queries, showing plausible performance. Our\nproject code will be made publicly available.\n","authors":["Jiazhou Zhou","Xu Zheng","Yuanhuiyi Lyu","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03135v5.pdf","comment":"Conference version with supplementary"},{"id":"http://arxiv.org/abs/2403.12459v1","updated":"2024-03-19T05:30:50Z","published":"2024-03-19T05:30:50Z","title":"Non-negative Contrastive Learning","summary":" Deep representations have shown promising performance when transferred to\ndownstream tasks in a black-box manner. Yet, their inherent lack of\ninterpretability remains a significant challenge, as these features are often\nopaque to human understanding. In this paper, we propose Non-negative\nContrastive Learning (NCL), a renaissance of Non-negative Matrix Factorization\n(NMF) aimed at deriving interpretable features. The power of NCL lies in its\nenforcement of non-negativity constraints on features, reminiscent of NMF's\ncapability to extract features that align closely with sample clusters. NCL not\nonly aligns mathematically well with an NMF objective but also preserves NMF's\ninterpretability attributes, resulting in a more sparse and disentangled\nrepresentation compared to standard contrastive learning (CL). Theoretically,\nwe establish guarantees on the identifiability and downstream generalization of\nNCL. Empirically, we show that these advantages enable NCL to outperform CL\nsignificantly on feature disentanglement, feature selection, as well as\ndownstream classification tasks. At last, we show that NCL can be easily\nextended to other learning scenarios and benefit supervised learning as well.\nCode is available at https://github.com/PKU-ML/non_neg.\n","authors":["Yifei Wang","Qi Zhang","Yaoyu Guo","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12459v1.pdf","comment":"22 pages. Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2403.12457v1","updated":"2024-03-19T05:27:52Z","published":"2024-03-19T05:27:52Z","title":"Privacy-Preserving Face Recognition Using Trainable Feature Subtraction","summary":" The widespread adoption of face recognition has led to increasing privacy\nconcerns, as unauthorized access to face images can expose sensitive personal\ninformation. This paper explores face image protection against viewing and\nrecovery attacks. Inspired by image compression, we propose creating a visually\nuninformative face image through feature subtraction between an original face\nand its model-produced regeneration. Recognizable identity features within the\nimage are encouraged by co-training a recognition model on its high-dimensional\nfeature representation. 
To enhance privacy, the high-dimensional representation\nis crafted through random channel shuffling, resulting in randomized\nrecognizable images devoid of attacker-leverageable texture details. We distill\nour methodologies into a novel privacy-preserving face recognition method,\nMinusFace. Experiments demonstrate its high recognition accuracy and effective\nprivacy protection. Its code is available at https://github.com/Tencent/TFace.\n","authors":["Yuxi Mi","Zhizhou Zhong","Yuge Huang","Jiazhen Ji","Jianqing Xu","Jun Wang","Shaoming Wang","Shouhong Ding","Shuigeng Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.12457v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12455v1","updated":"2024-03-19T05:27:04Z","published":"2024-03-19T05:27:04Z","title":"CLIP-VIS: Adapting CLIP for Open-Vocabulary Video Instance Segmentation","summary":" Open-vocabulary video instance segmentation strives to segment and track\ninstances belonging to an open set of categories in a video. The\nvision-language model Contrastive Language-Image Pre-training (CLIP) has shown\nstrong zero-shot classification ability in image-level open-vocabulary task. In\nthis paper, we propose a simple encoder-decoder network, called CLIP-VIS, to\nadapt CLIP for open-vocabulary video instance segmentation. Our CLIP-VIS adopts\nfrozen CLIP image encoder and introduces three modules, including\nclass-agnostic mask generation, temporal topK-enhanced matching, and weighted\nopen-vocabulary classification. Given a set of initial queries, class-agnostic\nmask generation employs a transformer decoder to predict query masks and\ncorresponding object scores and mask IoU scores. Then, temporal topK-enhanced\nmatching performs query matching across frames by using K mostly matched\nframes. Finally, weighted open-vocabulary classification first generates query\nvisual features with mask pooling, and second performs weighted classification\nusing object scores and mask IoU scores. Our CLIP-VIS does not require the\nannotations of instance categories and identities. The experiments are\nperformed on various video instance segmentation datasets, which demonstrate\nthe effectiveness of our proposed method, especially on novel categories. When\nusing ConvNeXt-B as backbone, our CLIP-VIS achieves the AP and APn scores of\n32.1% and 40.3% on validation set of LV-VIS dataset, which outperforms OV2Seg\nby 11.0% and 24.0% respectively. We will release the source code and models at\nhttps://github.com/zwq456/CLIP-VIS.git.\n","authors":["Wenqi Zhu","Jiale Cao","Jin Xie","Shuangming Yang","Yanwei Pang"],"pdf_url":"https://arxiv.org/pdf/2403.12455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08460v2","updated":"2024-03-19T05:25:20Z","published":"2024-03-13T12:20:20Z","title":"Towards Dense and Accurate Radar Perception Via Efficient Cross-Modal\n Diffusion Model","summary":" Millimeter wave (mmWave) radars have attracted significant attention from\nboth academia and industry due to their capability to operate in extreme\nweather conditions. However, they face challenges in terms of sparsity and\nnoise interference, which hinder their application in the field of micro aerial\nvehicle (MAV) autonomous navigation. To this end, this paper proposes a novel\napproach to dense and accurate mmWave radar point cloud construction via\ncross-modal learning. Specifically, we introduce diffusion models, which\npossess state-of-the-art performance in generative modeling, to predict\nLiDAR-like point clouds from paired raw radar data. 
We also incorporate the\nmost recent diffusion model inference acceleration techniques to ensure that\nthe proposed method can be implemented on MAVs with limited computing\nresources. We validate the proposed method through extensive benchmark\ncomparisons and real-world experiments, demonstrating its superior performance\nand generalization ability. Code and pretrained models will be available at\nhttps://github.com/ZJU-FAST-Lab/Radar-Diffusion.\n","authors":["Ruibin Zhang","Donglai Xue","Yuhan Wang","Ruixu Geng","Fei Gao"],"pdf_url":"https://arxiv.org/pdf/2403.08460v2.pdf","comment":"8 pages, 6 figures, submitted to RA-L"},{"id":"http://arxiv.org/abs/2403.12450v1","updated":"2024-03-19T05:21:12Z","published":"2024-03-19T05:21:12Z","title":"Intention Action Anticipation Model with Guide-Feedback Loop Mechanism","summary":" Anticipating human intention from videos has broad applications, such as\nautomatic driving, robot assistive technology, and virtual reality. This study\naddresses the problem of intention action anticipation using egocentric video\nsequences to estimate actions that indicate human intention. We propose a\nHierarchical Complete-Recent (HCR) information fusion model that makes full use\nof the features of the entire video sequence (i.e., complete features) and the\nfeatures of the video tail sequence (i.e., recent features). The HCR model has\ntwo primary mechanisms. The Guide-Feedback Loop (GFL) mechanism is proposed to\nmodel the relation between one recent feature and one complete feature. Based\non GFL, the MultiComplete-Recent Feature Aggregation (MCRFA) module is proposed\nto model the relation of one recent feature with multiscale complete features.\nBased on GFL and MCRFA, the HCR model can hierarchically explore the rich\ninterrelationships between multiscale complete features and multiscale recent\nfeatures. Through comparative and ablation experiments, we validate the\neffectiveness of our model on two well-known public datasets: EPIC-Kitchens and\nEGTEA Gaze+.\n","authors":["Zongnan Ma","Fuchun Zhang","Zhixiong Nan","Yao Ge"],"pdf_url":"https://arxiv.org/pdf/2403.12450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12448v1","updated":"2024-03-19T05:17:47Z","published":"2024-03-19T05:17:47Z","title":"Do Generated Data Always Help Contrastive Learning?","summary":" Contrastive Learning (CL) has emerged as one of the most successful paradigms\nfor unsupervised visual representation learning, yet it often depends on\nintensive manual data augmentations. With the rise of generative models,\nespecially diffusion models, the ability to generate realistic images close to\nthe real data distribution has been well recognized. These generated\nhigh-quality images have been successfully applied to enhance contrastive\nrepresentation learning, a technique termed ``data inflation''. However, we\nfind that the generated data (even from a good diffusion model like DDPM) may\nsometimes even harm contrastive learning. We investigate the causes behind this\nfailure from the perspective of both data inflation and data augmentation. For\nthe first time, we reveal a complementary relationship: stronger data inflation\nshould be accompanied by weaker augmentations, and vice versa. We also provide\nrigorous theoretical explanations for these phenomena by deriving\ngeneralization bounds under data inflation. Drawing from these insights, we\npropose Adaptive Inflation (AdaInf), a purely data-centric strategy without\nintroducing any extra computation cost. 
On benchmark datasets, AdaInf can bring\nsignificant improvements for various contrastive learning methods. Notably,\nwithout using external data, AdaInf obtains 94.70% linear accuracy on CIFAR-10\nwith SimCLR, setting a new record that surpasses many sophisticated methods.\nCode is available at https://github.com/PKU-ML/adainf.\n","authors":["Yifei Wang","Jizhe Zhang","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12448v1.pdf","comment":"19 pages. Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2403.09981v2","updated":"2024-03-19T05:17:18Z","published":"2024-03-15T02:57:20Z","title":"Controllable Text-to-3D Generation via Surface-Aligned Gaussian\n Splatting","summary":" While text-to-3D and image-to-3D generation tasks have received considerable\nattention, one important but under-explored field between them is controllable\ntext-to-3D generation, which we mainly focus on in this work. To address this\ntask, 1) we introduce Multi-view ControlNet (MVControl), a novel neural network\narchitecture designed to enhance existing pre-trained multi-view diffusion\nmodels by integrating additional input conditions, such as edge, depth, normal,\nand scribble maps. Our innovation lies in the introduction of a conditioning\nmodule that controls the base diffusion model using both local and global\nembeddings, which are computed from the input condition images and camera\nposes. Once trained, MVControl is able to offer 3D diffusion guidance for\noptimization-based 3D generation. And, 2) we propose an efficient multi-stage\n3D generation pipeline that leverages the benefits of recent large\nreconstruction models and score distillation algorithm. Building upon our\nMVControl architecture, we employ a unique hybrid diffusion guidance method to\ndirect the optimization process. In pursuit of efficiency, we adopt 3D\nGaussians as our representation instead of the commonly used implicit\nrepresentations. We also pioneer the use of SuGaR, a hybrid representation that\nbinds Gaussians to mesh triangle faces. This approach alleviates the issue of\npoor geometry in 3D Gaussians and enables the direct sculpting of fine-grained\ngeometry on the mesh. Extensive experiments demonstrate that our method\nachieves robust generalization and enables the controllable generation of\nhigh-quality 3D content.\n","authors":["Zhiqi Li","Yiming Chen","Lingzhe Zhao","Peidong Liu"],"pdf_url":"https://arxiv.org/pdf/2403.09981v2.pdf","comment":"Project page: https://lizhiqi49.github.io/MVControl/"},{"id":"http://arxiv.org/abs/2403.12445v1","updated":"2024-03-19T05:10:10Z","published":"2024-03-19T05:10:10Z","title":"Boosting Transferability in Vision-Language Attacks via Diversification\n along the Intersection Region of Adversarial Trajectory","summary":" Vision-language pre-training (VLP) models exhibit remarkable capabilities in\ncomprehending both images and text, yet they remain susceptible to multimodal\nadversarial examples (AEs). Strengthening adversarial attacks and uncovering\nvulnerabilities, especially common issues in VLP models (e.g., high\ntransferable AEs), can stimulate further research on constructing reliable and\npractical VLP models. A recent work (i.e., Set-level guidance attack) indicates\nthat augmenting image-text pairs to increase AE diversity along the\noptimization path enhances the transferability of adversarial examples\nsignificantly. 
However, this approach predominantly emphasizes diversity around\nthe online adversarial examples (i.e., AEs in the optimization period), leading\nto the risk of overfitting the victim model and affecting the transferability.\nIn this study, we posit that the diversity of adversarial examples towards the\nclean input and online AEs are both pivotal for enhancing transferability\nacross VLP models. Consequently, we propose using diversification along the\nintersection region of adversarial trajectory to expand the diversity of AEs.\nTo fully leverage the interaction between modalities, we introduce text-guided\nadversarial example selection during optimization. Furthermore, to further\nmitigate the potential overfitting, we direct the adversarial text deviating\nfrom the last intersection region along the optimization path, rather than\nadversarial images as in existing methods. Extensive experiments affirm the\neffectiveness of our method in improving transferability across various VLP\nmodels and downstream vision-and-language tasks (e.g., Image-Text\nRetrieval(ITR), Visual Grounding(VG), Image Captioning(IC)).\n","authors":["Sensen Gao","Xiaojun Jia","Xuhong Ren","Ivor Tsang","Qing Guo"],"pdf_url":"https://arxiv.org/pdf/2403.12445v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03774v3","updated":"2024-03-19T05:03:18Z","published":"2023-12-06T02:52:54Z","title":"OctreeOcc: Efficient and Multi-Granularity Occupancy Prediction Using\n Octree Queries","summary":" Occupancy prediction has increasingly garnered attention in recent years for\nits fine-grained understanding of 3D scenes. Traditional approaches typically\nrely on dense, regular grid representations, which often leads to excessive\ncomputational demands and a loss of spatial details for small objects. This\npaper introduces OctreeOcc, an innovative 3D occupancy prediction framework\nthat leverages the octree representation to adaptively capture valuable\ninformation in 3D, offering variable granularity to accommodate object shapes\nand semantic regions of varying sizes and complexities. In particular, we\nincorporate image semantic information to improve the accuracy of initial\noctree structures and design an effective rectification mechanism to refine the\noctree structure iteratively. Our extensive evaluations show that OctreeOcc not\nonly surpasses state-of-the-art methods in occupancy prediction, but also\nachieves a 15%-24% reduction in computational overhead compared to\ndense-grid-based methods.\n","authors":["Yuhang Lu","Xinge Zhu","Tai Wang","Yuexin Ma"],"pdf_url":"https://arxiv.org/pdf/2312.03774v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12440v1","updated":"2024-03-19T04:54:59Z","published":"2024-03-19T04:54:59Z","title":"Self-learning Canonical Space for Multi-view 3D Human Pose Estimation","summary":" Multi-view 3D human pose estimation is naturally superior to single view one,\nbenefiting from more comprehensive information provided by images of multiple\nviews. The information includes camera poses, 2D/3D human poses, and 3D\ngeometry. However, the accurate annotation of these information is hard to\nobtain, making it challenging to predict accurate 3D human pose from multi-view\nimages. To deal with this issue, we propose a fully self-supervised framework,\nnamed cascaded multi-view aggregating network (CMANet), to construct a\ncanonical parameter space to holistically integrate and exploit multi-view\ninformation. 
In our framework, the multi-view information is grouped into two\ncategories: 1) intra-view information , 2) inter-view information. Accordingly,\nCMANet consists of two components: intra-view module (IRV) and inter-view\nmodule (IEV). IRV is used for extracting initial camera pose and 3D human pose\nof each view; IEV is to fuse complementary pose information and cross-view 3D\ngeometry for a final 3D human pose. To facilitate the aggregation of the intra-\nand inter-view, we define a canonical parameter space, depicted by per-view\ncamera pose and human pose and shape parameters ($\\theta$ and $\\beta$) of SMPL\nmodel, and propose a two-stage learning procedure. At first stage, IRV learns\nto estimate camera pose and view-dependent 3D human pose supervised by\nconfident output of an off-the-shelf 2D keypoint detector. At second stage, IRV\nis frozen and IEV further refines the camera pose and optimizes the 3D human\npose by implicitly encoding the cross-view complement and 3D geometry\nconstraint, achieved by jointly fitting predicted multi-view 2D keypoints. The\nproposed framework, modules, and learning strategy are demonstrated to be\neffective by comprehensive experiments and CMANet is superior to\nstate-of-the-art methods in extensive quantitative and qualitative analysis.\n","authors":["Xiaoben Li","Mancheng Meng","Ziyan Wu","Terrence Chen","Fan Yang","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2403.12440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12438v1","updated":"2024-03-19T04:51:38Z","published":"2024-03-19T04:51:38Z","title":"Precise-Physics Driven Text-to-3D Generation","summary":" Text-to-3D generation has shown great promise in generating novel 3D content\nbased on given text prompts. However, existing generative methods mostly focus\non geometric or visual plausibility while ignoring precise physics perception\nfor the generated 3D shapes. This greatly hinders the practicality of generated\n3D shapes in real-world applications. In this work, we propose Phy3DGen, a\nprecise-physics-driven text-to-3D generation method. By analyzing the solid\nmechanics of generated 3D shapes, we reveal that the 3D shapes generated by\nexisting text-to-3D generation methods are impractical for real-world\napplications as the generated 3D shapes do not conform to the laws of physics.\nTo this end, we leverage 3D diffusion models to provide 3D shape priors and\ndesign a data-driven differentiable physics layer to optimize 3D shape priors\nwith solid mechanics. This allows us to optimize geometry efficiently and learn\nprecise physics information about 3D shapes at the same time. Experimental\nresults demonstrate that our method can consider both geometric plausibility\nand precise physics perception, further bridging 3D virtual modeling and\nprecise physical worlds.\n","authors":["Qingshan Xu","Jiao Liu","Melvin Wong","Caishun Chen","Yew-Soon Ong"],"pdf_url":"https://arxiv.org/pdf/2403.12438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05804v2","updated":"2024-03-19T04:50:32Z","published":"2023-12-10T07:34:43Z","title":"Layered 3D Human Generation via Semantic-Aware Diffusion Model","summary":" The generation of 3D clothed humans has attracted increasing attention in\nrecent years. However, existing work cannot generate layered high-quality 3D\nhumans with consistent body structures. As a result, these methods are unable\nto arbitrarily and separately change and edit the body and clothing of the\nhuman. 
In this paper, we propose a text-driven layered 3D human generation\nframework based on a novel physically-decoupled semantic-aware diffusion model.\nTo keep the generated clothing consistent with the target text, we propose a\nsemantic-confidence strategy for clothing that can eliminate the non-clothing\ncontent generated by the model. To match the clothing with different body\nshapes, we propose a SMPL-driven implicit field deformation network that\nenables the free transfer and reuse of clothing. Besides, we introduce uniform\nshape priors based on the SMPL model for body and clothing, respectively, which\ngenerates more diverse 3D content without being constrained by specific\ntemplates. The experimental results demonstrate that the proposed method not\nonly generates 3D humans with consistent body structures but also allows free\nediting in a layered manner. The source code will be made public.\n","authors":["Yi Wang","Jian Ma","Ruizhi Shao","Qiao Feng","Yu-Kun Lai","Yebin Liu","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2312.05804v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05139v2","updated":"2024-03-19T04:49:40Z","published":"2024-03-08T08:12:18Z","title":"Improving Diffusion Models for Virtual Try-on","summary":" This paper considers image-based virtual try-on, which renders an image of a\nperson wearing a curated garment, given a pair of images depicting the person\nand the garment, respectively. Previous works adapt existing exemplar-based\ninpainting diffusion models for virtual try-on to improve the naturalness of\nthe generated visuals compared to other methods (e.g., GAN-based), but they\nfail to preserve the identity of the garments. To overcome this limitation, we\npropose a novel diffusion model that improves garment fidelity and generates\nauthentic virtual try-on images. Our method, coined IDM-VTON, uses two\ndifferent modules to encode the semantics of garment image; given the base UNet\nof the diffusion model, 1) the high-level semantics extracted from a visual\nencoder are fused to the cross-attention layer, and then 2) the low-level\nfeatures extracted from parallel UNet are fused to the self-attention layer. In\naddition, we provide detailed textual prompts for both garment and person\nimages to enhance the authenticity of the generated visuals. Finally, we\npresent a customization method using a pair of person-garment images, which\nsignificantly improves fidelity and authenticity. Our experimental results show\nthat our method outperforms previous approaches (both diffusion-based and\nGAN-based) in preserving garment details and generating authentic virtual\ntry-on images, both qualitatively and quantitatively. Furthermore, the proposed\ncustomization method demonstrates its effectiveness in a real-world scenario.\nMore visualizations are available in our project page:\nhttps://idm-vton.github.io\n","authors":["Yisol Choi","Sangkyung Kwak","Kyungmin Lee","Hyungwon Choi","Jinwoo Shin"],"pdf_url":"https://arxiv.org/pdf/2403.05139v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12434v1","updated":"2024-03-19T04:47:56Z","published":"2024-03-19T04:47:56Z","title":"Human Mesh Recovery from Arbitrary Multi-view Images","summary":" Human mesh recovery from arbitrary multi-view images involves two\ncharacteristics: the arbitrary camera poses and arbitrary number of camera\nviews. Because of the variability, designing a unified framework to tackle this\ntask is challenging. 
The challenges can be summarized as the dilemma of being\nable to simultaneously estimate arbitrary camera poses and recover human mesh\nfrom arbitrary multi-view images while maintaining flexibility. To solve this\ndilemma, we propose a divide and conquer framework for Unified Human Mesh\nRecovery (U-HMR) from arbitrary multi-view images. In particular, U-HMR\nconsists of a decoupled structure and two main components: camera and body\ndecoupling (CBD), camera pose estimation (CPE), and arbitrary view fusion\n(AVF). As camera poses and human body mesh are independent of each other, CBD\nsplits the estimation of them into two sub-tasks for two individual\nsub-networks (\\ie, CPE and AVF) to handle respectively, thus the two sub-tasks\nare disentangled. In CPE, since each camera pose is unrelated to the others, we\nadopt a shared MLP to process all views in a parallel way. In AVF, in order to\nfuse multi-view information and make the fusion operation independent of the\nnumber of views, we introduce a transformer decoder with a SMPL parameters\nquery token to extract cross-view features for mesh recovery. To demonstrate\nthe efficacy and flexibility of the proposed framework and effect of each\ncomponent, we conduct extensive experiments on three public datasets:\nHuman3.6M, MPI-INF-3DHP, and TotalCapture.\n","authors":["Xiaoben Li","Mancheng Meng","Ziyan Wu","Terrence Chen","Fan Yang","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2403.12434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06793v2","updated":"2024-03-19T04:46:42Z","published":"2024-03-11T15:11:57Z","title":"Boosting Image Restoration via Priors from Pre-trained Models","summary":" Pre-trained models with large-scale training data, such as CLIP and Stable\nDiffusion, have demonstrated remarkable performance in various high-level\ncomputer vision tasks such as image understanding and generation from language\ndescriptions. Yet, their potential for low-level tasks such as image\nrestoration remains relatively unexplored. In this paper, we explore such\nmodels to enhance image restoration. As off-the-shelf features (OSF) from\npre-trained models do not directly serve image restoration, we propose to learn\nan additional lightweight module called Pre-Train-Guided Refinement Module\n(PTG-RM) to refine restoration results of a target restoration network with\nOSF. PTG-RM consists of two components, Pre-Train-Guided Spatial-Varying\nEnhancement (PTG-SVE), and Pre-Train-Guided Channel-Spatial Attention\n(PTG-CSA). PTG-SVE enables optimal short- and long-range neural operations,\nwhile PTG-CSA enhances spatial-channel attention for restoration-related\nlearning. Extensive experiments demonstrate that PTG-RM, with its compact size\n($<$1M parameters), effectively enhances restoration performance of various\nmodels across different tasks, including low-light enhancement, deraining,\ndeblurring, and denoising.\n","authors":["Xiaogang Xu","Shu Kong","Tao Hu","Zhe Liu","Hujun Bao"],"pdf_url":"https://arxiv.org/pdf/2403.06793v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2401.04325v2","updated":"2024-03-19T04:45:47Z","published":"2024-01-09T02:40:03Z","title":"RadarCam-Depth: Radar-Camera Fusion for Depth Estimation with Learned\n Metric Scale","summary":" We present a novel approach for metric dense depth estimation based on the\nfusion of a single-view image and a sparse, noisy Radar point cloud. 
The direct\nfusion of heterogeneous Radar and image data, or their encodings, tends to\nyield dense depth maps with significant artifacts, blurred boundaries, and\nsuboptimal accuracy. To circumvent this issue, we learn to augment versatile\nand robust monocular depth prediction with the dense metric scale induced from\nsparse and noisy Radar data. We propose a Radar-Camera framework for highly\naccurate and fine-detailed dense depth estimation with four stages, including\nmonocular depth prediction, global scale alignment of monocular depth with\nsparse Radar points, quasi-dense scale estimation through learning the\nassociation between Radar points and image patches, and local scale refinement\nof dense depth using a scale map learner. Our proposed method significantly\noutperforms the state-of-the-art Radar-Camera depth estimation methods by\nreducing the mean absolute error (MAE) of depth estimation by 25.6% and 40.2%\non the challenging nuScenes dataset and our self-collected ZJU-4DRadarCam\ndataset, respectively. Our code and dataset will be released at\n\\url{https://github.com/MMOCKING/RadarCam-Depth}.\n","authors":["Han Li","Yukai Ma","Yaqing Gu","Kewei Hu","Yong Liu","Xingxing Zuo"],"pdf_url":"https://arxiv.org/pdf/2401.04325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03441v3","updated":"2024-03-19T04:45:07Z","published":"2023-12-06T11:50:14Z","title":"UFineBench: Towards Text-based Person Retrieval with Ultra-fine\n Granularity","summary":" Existing text-based person retrieval datasets often have relatively\ncoarse-grained text annotations. This hinders the model to comprehend the\nfine-grained semantics of query texts in real scenarios. To address this\nproblem, we contribute a new benchmark named \\textbf{UFineBench} for text-based\nperson retrieval with ultra-fine granularity.\n Firstly, we construct a new \\textbf{dataset} named UFine6926. We collect a\nlarge number of person images and manually annotate each image with two\ndetailed textual descriptions, averaging 80.8 words each. The average word\ncount is three to four times that of the previous datasets. In addition of\nstandard in-domain evaluation, we also propose a special \\textbf{evaluation\nparadigm} more representative of real scenarios. It contains a new evaluation\nset with cross domains, cross textual granularity and cross textual styles,\nnamed UFine3C, and a new evaluation metric for accurately measuring retrieval\nability, named mean Similarity Distribution (mSD). Moreover, we propose CFAM, a\nmore efficient \\textbf{algorithm} especially designed for text-based person\nretrieval with ultra fine-grained texts. It achieves fine granularity mining by\nadopting a shared cross-modal granularity decoder and hard negative match\nmechanism.\n With standard in-domain evaluation, CFAM establishes competitive performance\nacross various datasets, especially on our ultra fine-grained UFine6926.\nFurthermore, by evaluating on UFine3C, we demonstrate that training on our\nUFine6926 significantly improves generalization to real scenarios compared with\nother coarse-grained datasets. 
The dataset and code will be made publicly\navailable at \\url{https://github.com/Zplusdragon/UFineBench}.\n","authors":["Jialong Zuo","Hanyu Zhou","Ying Nie","Feng Zhang","Tianyu Guo","Nong Sang","Yunhe Wang","Changxin Gao"],"pdf_url":"https://arxiv.org/pdf/2312.03441v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12432v1","updated":"2024-03-19T04:44:09Z","published":"2024-03-19T04:44:09Z","title":"Prototipo de video juego activo basado en una cámara 3D para motivar\n la actividad física en niños y adultos mayores","summary":" This document describes the development of a video game prototype designed to\nencourage physical activity among children and older adults. The prototype\nconsists of a laptop, a camera with 3D sensors, and optionally requires an LCD\nscreen or a projector. The programming component of this prototype was\ndeveloped in Scratch, a programming language geared towards children, which\ngreatly facilitates the creation of a game tailored to the users' preferences.\nThe idea to create such a prototype originated from the desire to offer an\noption that promotes physical activity among children and adults, given that a\nlack of physical exercise is a predominant factor in the development of chronic\ndegenerative diseases such as diabetes and hypertension, to name the most\ncommon. As a result of this initiative, an active video game prototype was\nsuccessfully developed, based on a ping-pong game, which allows both children\nand adults to interact in a fun way while encouraging the performance of\nphysical activities that can positively impact the users' health.\n","authors":["Benjamín Ojeda Magaña","José Guadalupe Robledo Hernández","Leopoldo Gómez Barba","Victor Manuel Rangel Cobián"],"pdf_url":"https://arxiv.org/pdf/2403.12432v1.pdf","comment":"13 pages, in Spanish language, 11 figures"},{"id":"http://arxiv.org/abs/2403.12431v1","updated":"2024-03-19T04:41:09Z","published":"2024-03-19T04:41:09Z","title":"Geometric Constraints in Deep Learning Frameworks: A Survey","summary":" Stereophotogrammetry is an emerging technique of scene understanding. Its\norigins go back to at least the 1800s when people first started to investigate\nusing photographs to measure the physical properties of the world. Since then,\nthousands of approaches have been explored. The classic geometric techniques of\nShape from Stereo is built on using geometry to define constraints on scene and\ncamera geometry and then solving the non-linear systems of equations. More\nrecent work has taken an entirely different approach, using end-to-end deep\nlearning without any attempt to explicitly model the geometry. In this survey,\nwe explore the overlap for geometric-based and deep learning-based frameworks.\nWe compare and contrast geometry enforcing constraints integrated into a deep\nlearning framework for depth estimation or other closely related problems. We\npresent a new taxonomy for prevalent geometry enforcing constraints used in\nmodern deep learning frameworks. 
We also present insightful observations and\npotential future research directions.\n","authors":["Vibhas K Vats","David J Crandall"],"pdf_url":"https://arxiv.org/pdf/2403.12431v1.pdf","comment":"A preprint"},{"id":"http://arxiv.org/abs/2403.11127v2","updated":"2024-03-19T04:40:43Z","published":"2024-03-17T07:29:32Z","title":"GRA: Detecting Oriented Objects through Group-wise Rotating and\n Attention","summary":" Oriented object detection, an emerging task in recent years, aims to identify\nand locate objects across varied orientations. This requires the detector to\naccurately capture the orientation information, which varies significantly\nwithin and across images. Despite the existing substantial efforts,\nsimultaneously ensuring model effectiveness and parameter efficiency remains\nchallenging in this scenario. In this paper, we propose a lightweight yet\neffective Group-wise Rotating and Attention (GRA) module to replace the\nconvolution operations in backbone networks for oriented object detection. GRA\ncan adaptively capture fine-grained features of objects with diverse\norientations, comprising two key components: Group-wise Rotating and Group-wise\nAttention. Group-wise Rotating first divides the convolution kernel into\ngroups, where each group extracts different object features by rotating at a\nspecific angle according to the object orientation. Subsequently, Group-wise\nAttention is employed to adaptively enhance the object-related regions in the\nfeature. The collaborative effort of these components enables GRA to\neffectively capture the various orientation information while maintaining\nparameter efficiency. Extensive experimental results demonstrate the\nsuperiority of our method. For example, GRA achieves a new state-of-the-art\n(SOTA) on the DOTA-v2.0 benchmark, while saving the parameters by nearly 50%\ncompared to the previous SOTA method. Code will be released.\n","authors":["Jiangshan Wang","Yifan Pu","Yizeng Han","Jiayi Guo","Yiru Wang","Xiu Li","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2403.11127v2.pdf","comment":"tech report"},{"id":"http://arxiv.org/abs/2403.11697v2","updated":"2024-03-19T04:37:01Z","published":"2024-03-18T11:54:35Z","title":"Urban Scene Diffusion through Semantic Occupancy Map","summary":" Generating unbounded 3D scenes is crucial for large-scale scene understanding\nand simulation. Urban scenes, unlike natural landscapes, consist of various\ncomplex man-made objects and structures such as roads, traffic signs, vehicles,\nand buildings. To create a realistic and detailed urban scene, it is crucial to\naccurately represent the geometry and semantics of the underlying objects,\ngoing beyond their visual appearance. In this work, we propose UrbanDiffusion,\na 3D diffusion model that is conditioned on a Bird's-Eye View (BEV) map and\ngenerates an urban scene with geometry and semantics in the form of semantic\noccupancy map. Our model introduces a novel paradigm that learns the data\ndistribution of scene-level structures within a latent space and further\nenables the expansion of the synthesized scene into an arbitrary scale. After\ntraining on real-world driving datasets, our model can generate a wide range of\ndiverse urban scenes given the BEV maps from the held-out set and also\ngeneralize to the synthesized maps from a driving simulator. 
We further\ndemonstrate its application to scene image synthesis with a pretrained image\ngenerator as a prior.\n","authors":["Junge Zhang","Qihang Zhang","Li Zhang","Ramana Rao Kompella","Gaowen Liu","Bolei Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.11697v2.pdf","comment":"The project website is https://metadriverse.github.io/urbandiff/"},{"id":"http://arxiv.org/abs/2403.12429v1","updated":"2024-03-19T04:36:41Z","published":"2024-03-19T04:36:41Z","title":"TransformMix: Learning Transformation and Mixing Strategies from Data","summary":" Data augmentation improves the generalization power of deep learning models\nby synthesizing more training samples. Sample-mixing is a popular data\naugmentation approach that creates additional data by combining existing\nsamples. Recent sample-mixing methods, like Mixup and Cutmix, adopt simple\nmixing operations to blend multiple inputs. Although such a heuristic approach\nshows certain performance gains in some computer vision tasks, it mixes the\nimages blindly and does not adapt to different datasets automatically. A mixing\nstrategy that is effective for a particular dataset does not often generalize\nwell to other datasets. If not properly configured, the methods may create\nmisleading mixed images, which jeopardize the effectiveness of sample-mixing\naugmentations. In this work, we propose an automated approach, TransformMix, to\nlearn better transformation and mixing augmentation strategies from data. In\nparticular, TransformMix applies learned transformations and mixing masks to\ncreate compelling mixed images that contain correct and important information\nfor the target tasks. We demonstrate the effectiveness of TransformMix on\nmultiple datasets in transfer learning, classification, object detection, and\nknowledge distillation settings. Experimental results show that our method\nachieves better performance as well as efficiency when compared with strong\nsample-mixing baselines.\n","authors":["Tsz-Him Cheung","Dit-Yan Yeung"],"pdf_url":"https://arxiv.org/pdf/2403.12429v1.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2403.12425v1","updated":"2024-03-19T04:25:54Z","published":"2024-03-19T04:25:54Z","title":"Multimodal Fusion Method with Spatiotemporal Sequences and Relationship\n Learning for Valence-Arousal Estimation","summary":" This paper presents our approach for the VA (Valence-Arousal) estimation task\nin the ABAW6 competition. We devised a comprehensive model by preprocessing\nvideo frames and audio segments to extract visual and audio features. Through\nthe utilization of Temporal Convolutional Network (TCN) modules, we effectively\ncaptured the temporal and spatial correlations between these features.\nSubsequently, we employed a Transformer encoder structure to learn long-range\ndependencies, thereby enhancing the model's performance and generalization\nability. Our method leverages a multimodal data fusion approach, integrating\npre-trained audio and video backbones for feature extraction, followed by\nTCN-based spatiotemporal encoding and Transformer-based temporal information\ncapture. 
Experimental results demonstrate the effectiveness of our approach,\nachieving competitive performance in VA estimation on the AffWild2 dataset.\n","authors":["Jun Yu","Gongpeng Zhao","Yongqi Wan","Zhihong Wei","Yang Zheng","Zerui Zhang","Zhongpeng Cai","Guochen Xie","Jichao Zhu","Wangyuan Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.12425v1.pdf","comment":"6 pages,1 figures"},{"id":"http://arxiv.org/abs/2310.16387v2","updated":"2024-03-19T04:15:28Z","published":"2023-10-25T05:59:25Z","title":"FTIC: Frequency-Aware Transformer for Learned Image Compression","summary":" Learned image compression (LIC) has gained traction as an effective solution\nfor image storage and transmission in recent years. However, existing LIC\nmethods are redundant in latent representation due to limitations in capturing\nanisotropic frequency components and preserving directional details. To\novercome these challenges, we propose a novel frequency-aware transformer (FAT)\nblock that, for the first time, achieves multiscale directional analysis for\nLIC. The FAT block comprises frequency-decomposition window attention (FDWA)\nmodules to capture multiscale and directional frequency components of natural\nimages. Additionally, we introduce a frequency-modulation feed-forward network\n(FMFFN) to adaptively modulate different frequency components, improving\nrate-distortion performance. Furthermore, we present a transformer-based\nchannel-wise autoregressive (T-CA) model that effectively exploits channel\ndependencies. Experiments show that our method achieves state-of-the-art\nrate-distortion performance compared to existing LIC methods, and clearly\noutperforms the latest standardized codec VTM-12.1 by 14.5%, 15.1%, and 13.0% in\nBD-rate on the Kodak, Tecnick, and CLIC datasets.\n","authors":["Han Li","Shaohui Li","Wenrui Dai","Chenglin Li","Junni Zou","Hongkai Xiong"],"pdf_url":"https://arxiv.org/pdf/2310.16387v2.pdf","comment":"ICLR2024 poster"},{"id":"http://arxiv.org/abs/2301.10048v2","updated":"2024-03-19T04:02:28Z","published":"2023-01-24T14:44:44Z","title":"Exploiting Optical Flow Guidance for Transformer-Based Video Inpainting","summary":" Transformers have been widely used for video processing owing to the\nmulti-head self attention (MHSA) mechanism. However, the MHSA mechanism\nencounters an intrinsic difficulty for video inpainting, since the features\nassociated with the corrupted regions are degraded and incur inaccurate self\nattention. This problem, termed query degradation, may be mitigated by first\ncompleting optical flows and then using the flows to guide the self attention,\nwhich was verified in our previous work - flow-guided transformer (FGT). We\nfurther exploit the flow guidance and propose FGT++ to pursue more effective\nand efficient video inpainting. First, we design a lightweight flow completion\nnetwork by using local aggregation and edge loss. Second, to address the query\ndegradation, we propose a flow guidance feature integration module, which uses\nthe motion discrepancy to enhance the features, together with a flow-guided\nfeature propagation module that warps the features according to the flows.\nThird, we decouple the transformer along the temporal and spatial dimensions,\nwhere flows are used to select the tokens through a temporally deformable MHSA\nmechanism, and global tokens are combined with the inner-window local tokens\nthrough a dual perspective MHSA mechanism. 
Experiments show that FGT++\noutperforms existing video inpainting networks both qualitatively and\nquantitatively.\n","authors":["Kaidong Zhang","Jialun Peng","Jingjing Fu","Dong Liu"],"pdf_url":"https://arxiv.org/pdf/2301.10048v2.pdf","comment":"Accepted to TPAMI. This manuscript is a journal extension of our ECCV\n 2022 paper (arXiv:2208.06768)"},{"id":"http://arxiv.org/abs/2403.12416v1","updated":"2024-03-19T03:59:14Z","published":"2024-03-19T03:59:14Z","title":"Eye-gaze Guided Multi-modal Alignment Framework for Radiology","summary":" In multi-modal frameworks, the alignment of cross-modal features presents a\nsignificant challenge. The predominant approach in multi-modal pre-training\nemphasizes either global or local alignment between modalities, utilizing\nextensive datasets. This bottom-up driven method often suffers from a lack of\ninterpretability, a critical concern in radiology. Previous studies have\nintegrated high-level labels in medical images or text, but these still rely on\nmanual annotation, a costly and labor-intensive process. Our work introduces a\nnovel approach by using eye-gaze data, collected synchronously by radiologists\nduring diagnostic evaluations. This data, indicating radiologists' focus areas,\nnaturally links chest X-rays to diagnostic texts. We propose the Eye-gaze\nGuided Multi-modal Alignment (EGMA) framework to harness eye-gaze data for\nbetter alignment of image and text features, aiming to reduce reliance on\nmanual annotations and thus cut training costs. Our model demonstrates robust\nperformance, outperforming other state-of-the-art methods in zero-shot\nclassification and retrieval tasks. The incorporation of easily-obtained\neye-gaze data during routine radiological diagnoses signifies a step towards\nminimizing manual annotation dependency. Additionally, we explore the impact of\nvarying amounts of eye-gaze data on model performance, highlighting the\nfeasibility and utility of integrating this auxiliary data into multi-modal\npre-training.\n","authors":["Chong Ma","Hanqi Jiang","Wenting Chen","Zihao Wu","Xiaowei Yu","Fang Zeng","Lei Guo","Dajiang Zhu","Tuo Zhang","Dinggang Shen","Tianming Liu","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2403.12416v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.12415v1","updated":"2024-03-19T03:55:39Z","published":"2024-03-19T03:55:39Z","title":"VisionGPT: LLM-Assisted Real-Time Anomaly Detection for Safe Visual\n Navigation","summary":" This paper explores the potential of Large Language Models (LLMs) in zero-shot\nanomaly detection for safe visual navigation. With the assistance of the\nstate-of-the-art real-time open-world object detection model Yolo-World and\nspecialized prompts, the proposed framework can identify anomalies within\ncamera-captured frames that include any possible obstacles, then generate\nconcise, audio-delivered descriptions emphasizing the abnormalities, to assist safe\nvisual navigation in complex circumstances. Moreover, our proposed framework\nleverages the advantages of LLMs and the open-vocabulary object detection model\nto achieve dynamic scenario switching, which allows users to transition\nsmoothly from scene to scene and addresses the limitations of traditional\nvisual navigation. 
Furthermore, this paper explores the performance\ncontribution of different prompt components, offers a vision for future\nimprovements in visual accessibility, and paves the way for LLMs in video\nanomaly detection and vision-language understanding.\n","authors":["Hao Wang","Jiayou Qin","Ashish Bastola","Xiwen Chen","John Suchanek","Zihao Gong","Abolfazl Razi"],"pdf_url":"https://arxiv.org/pdf/2403.12415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15562v3","updated":"2024-03-19T03:50:36Z","published":"2023-11-27T06:19:00Z","title":"Fully Authentic Visual Question Answering Dataset from Online\n Communities","summary":" Visual Question Answering (VQA) entails answering questions about images. We\nintroduce the first VQA dataset in which all contents originate from an\nauthentic use case. Sourced from online question answering community forums, we\ncall it VQAonline. We characterize this dataset and how it relates to eight\nmainstream VQA datasets. Observing that answers in our dataset tend to be much\nlonger (i.e., a mean of 173 words) and so incompatible with standard VQA\nevaluation metrics, we instead utilize popular metrics for longer text\nevaluation to evaluate six state-of-the-art VQA models on VQAonline and\nreport where they struggle most. Finally, we analyze which evaluation metrics\nalign best with human judgments. To facilitate future extensions, we\npublicly share the dataset at: https://vqaonline.github.io/.\n","authors":["Chongyan Chen","Mengchen Liu","Noel Codella","Yunsheng Li","Lu Yuan","Danna Gurari"],"pdf_url":"https://arxiv.org/pdf/2311.15562v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08178v3","updated":"2024-03-19T03:47:39Z","published":"2024-01-16T07:51:00Z","title":"Key-point Guided Deformable Image Manipulation Using Diffusion Model","summary":" In this paper, we introduce a Key-point-guided Diffusion probabilistic Model\n(KDM) that gains precise control over images by manipulating the object's\nkey-point. We propose a two-stage generative model incorporating an optical\nflow map as an intermediate output. By doing so, a dense pixel-wise\nunderstanding of the semantic relation between the image and sparse key point\nis configured, leading to more realistic image generation. Additionally, the\nintegration of optical flow helps regulate the inter-frame variance of\nsequential images, demonstrating authentic sequential image generation. The\nKDM is evaluated with diverse key-point conditioned image synthesis tasks,\nincluding facial image generation, human pose synthesis, and echocardiography\nvideo prediction, demonstrating that KDM produces consistency-enhanced and\nphoto-realistic images compared with state-of-the-art models.\n","authors":["Seok-Hwan Oh","Guil Jung","Myeong-Gee Kim","Sang-Yun Kim","Young-Min Kim","Hyeon-Jik Lee","Hyuk-Sool Kwon","Hyeon-Min Bae"],"pdf_url":"https://arxiv.org/pdf/2401.08178v3.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2403.12409v1","updated":"2024-03-19T03:39:43Z","published":"2024-03-19T03:39:43Z","title":"ComboVerse: Compositional 3D Assets Creation Using Spatially-Aware\n Diffusion Guidance","summary":" Generating high-quality 3D assets from a given image is highly desirable in\nvarious applications such as AR/VR. Recent advances in single-image 3D\ngeneration explore feed-forward models that learn to infer the 3D model of an\nobject without optimization. 
Though promising results have been achieved in\nsingle object generation, these methods often struggle to model complex 3D\nassets that inherently contain multiple objects. In this work, we present\nComboVerse, a 3D generation framework that produces high-quality 3D assets with\ncomplex compositions by learning to combine multiple models. 1) We first\nperform an in-depth analysis of this ``multi-object gap'' from both model and\ndata perspectives. 2) Next, with reconstructed 3D models of different objects,\nwe seek to adjust their sizes, rotation angles, and locations to create a 3D\nasset that matches the given image. 3) To automate this process, we apply\nspatially-aware score distillation sampling (SSDS) from pretrained diffusion\nmodels to guide the positioning of objects. Our proposed framework emphasizes\nspatial alignment of objects, compared with standard score distillation\nsampling, and thus achieves more accurate results. Extensive experiments\nvalidate ComboVerse achieves clear improvements over existing methods in\ngenerating compositional 3D assets.\n","authors":["Yongwei Chen","Tengfei Wang","Tong Wu","Xingang Pan","Kui Jia","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2403.12409v1.pdf","comment":"https://cyw-3d.github.io/ComboVerse/"},{"id":"http://arxiv.org/abs/2403.11674v2","updated":"2024-03-19T03:33:22Z","published":"2024-03-18T11:21:52Z","title":"Towards Generalizing to Unseen Domains with Few Labels","summary":" We approach the challenge of addressing semi-supervised domain generalization\n(SSDG). Specifically, our aim is to obtain a model that learns\ndomain-generalizable features by leveraging a limited subset of labelled data\nalongside a substantially larger pool of unlabeled data. Existing domain\ngeneralization (DG) methods which are unable to exploit unlabeled data perform\npoorly compared to semi-supervised learning (SSL) methods under SSDG setting.\nNevertheless, SSL methods have considerable room for performance improvement\nwhen compared to fully-supervised DG training. To tackle this underexplored,\nyet highly practical problem of SSDG, we make the following core contributions.\nFirst, we propose a feature-based conformity technique that matches the\nposterior distributions from the feature space with the pseudo-label from the\nmodel's output space. Second, we develop a semantics alignment loss to learn\nsemantically-compatible representations by regularizing the semantic structure\nin the feature space. Our method is plug-and-play and can be readily integrated\nwith different SSL-based SSDG baselines without introducing any additional\nparameters. Extensive experimental results across five challenging DG\nbenchmarks with four strong SSL baselines suggest that our method provides\nconsistent and notable gains in two different SSDG settings.\n","authors":["Chamuditha Jayanga Galappaththige","Sanoojan Baliah","Malitha Gunawardhana","Muhammad Haris Khan"],"pdf_url":"https://arxiv.org/pdf/2403.11674v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12404v1","updated":"2024-03-19T03:27:01Z","published":"2024-03-19T03:27:01Z","title":"Understanding Training-free Diffusion Guidance: Mechanisms and\n Limitations","summary":" Adding additional control to pretrained diffusion models has become an\nincreasingly popular research area, with extensive applications in computer\nvision, reinforcement learning, and AI for science. 
Recently, several studies\nhave proposed training-free diffusion guidance by using off-the-shelf networks\npretrained on clean images. This approach enables zero-shot conditional\ngeneration for universal control formats, which appears to offer a free lunch\nin diffusion guidance. In this paper, we aim to develop a deeper understanding\nof the operational mechanisms and fundamental limitations of training-free\nguidance. We offer a theoretical analysis that supports training-free guidance\nfrom the perspective of optimization, distinguishing it from classifier-based\n(or classifier-free) guidance. To elucidate their drawbacks, we theoretically\ndemonstrate that training-free methods are more susceptible to adversarial\ngradients and exhibit slower convergence rates compared to classifier guidance.\nWe then introduce a collection of techniques designed to overcome the\nlimitations, accompanied by theoretical rationale and empirical evidence. Our\nexperiments in image and motion generation confirm the efficacy of these\ntechniques.\n","authors":["Yifei Shen","Xinyang Jiang","Yezhen Wang","Yifan Yang","Dongqi Han","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2403.12404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09297v2","updated":"2024-03-19T03:25:50Z","published":"2023-09-17T15:14:01Z","title":"Chasing Day and Night: Towards Robust and Efficient All-Day Object\n Detection Guided by an Event Camera","summary":" The ability to detect objects in all lighting (i.e., normal-, over-, and\nunder-exposed) conditions is crucial for real-world applications, such as\nself-driving.Traditional RGB-based detectors often fail under such varying\nlighting conditions.Therefore, recent works utilize novel event cameras to\nsupplement or guide the RGB modality; however, these methods typically adopt\nasymmetric network structures that rely predominantly on the RGB modality,\nresulting in limited robustness for all-day detection. In this paper, we\npropose EOLO, a novel object detection framework that achieves robust and\nefficient all-day detection by fusing both RGB and event modalities. Our EOLO\nframework is built based on a lightweight spiking neural network (SNN) to\nefficiently leverage the asynchronous property of events. Buttressed by it, we\nfirst introduce an Event Temporal Attention (ETA) module to learn the high\ntemporal information from events while preserving crucial edge information.\nSecondly, as different modalities exhibit varying levels of importance under\ndiverse lighting conditions, we propose a novel Symmetric RGB-Event Fusion\n(SREF) module to effectively fuse RGB-Event features without relying on a\nspecific modality, thus ensuring a balanced and adaptive fusion for all-day\ndetection. In addition, to compensate for the lack of paired RGB-Event datasets\nfor all-day training and evaluation, we propose an event synthesis approach\nbased on the randomized optical flow that allows for directly generating the\nevent frame from a single exposure image. 
We further build two new datasets,\nE-MSCOCO and E-VOC, based on the popular benchmarks MSCOCO and PASCAL VOC.\nExtensive experiments demonstrate that our EOLO outperforms the\nstate-of-the-art detectors, e.g., RENet, by a substantial margin (+3.74% mAP50) in\nall lighting conditions. Our code and datasets will be available at\nhttps://vlislab22.github.io/EOLO/\n","authors":["Jiahang Cao","Xu Zheng","Yuanhuiyi Lyu","Jiaxu Wang","Renjing Xu","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2309.09297v2.pdf","comment":"Accepted by ICRA 2024"},{"id":"http://arxiv.org/abs/2403.12401v1","updated":"2024-03-19T03:19:07Z","published":"2024-03-19T03:19:07Z","title":"VQ-NeRV: A Vector Quantized Neural Representation for Videos","summary":" Implicit neural representations (INR) excel in encoding videos within neural\nnetworks, showcasing promise in computer vision tasks like video compression\nand denoising. INR-based approaches reconstruct video frames from\ncontent-agnostic embeddings, which hampers their efficacy in video frame\nregression and restricts their generalization ability for video interpolation.\nTo address these deficiencies, Hybrid Neural Representation for Videos (HNeRV)\nwas introduced with content-adaptive embeddings. Nevertheless, HNeRV's\ncompression ratios remain relatively low, attributable to an oversight in\nleveraging the network's shallow features and inter-frame residual information.\nIn this work, we introduce an advanced U-shaped architecture, Vector\nQuantized-NeRV (VQ-NeRV), which integrates a novel component--the VQ-NeRV\nBlock. This block incorporates a codebook mechanism to discretize the network's\nshallow residual features and inter-frame residual information effectively.\nThis approach proves particularly advantageous in video compression, as it\nresults in smaller size compared to quantized features. Furthermore, we\nintroduce an original codebook optimization technique, termed shallow codebook\noptimization, designed to refine the utility and efficiency of the codebook.\nThe experimental evaluations indicate that VQ-NeRV outperforms HNeRV on video\nregression tasks, delivering superior reconstruction quality (with an increase\nof 1-2 dB in Peak Signal-to-Noise Ratio (PSNR)), better bit per pixel (bpp)\nefficiency, and improved video inpainting outcomes.\n","authors":["Yunjie Xu","Xiang Feng","Feiwei Qin","Ruiquan Ge","Yong Peng","Changmiao Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12401v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2403.12396v1","updated":"2024-03-19T03:09:24Z","published":"2024-03-19T03:09:24Z","title":"OV9D: Open-Vocabulary Category-Level 9D Object Pose and Size Estimation","summary":" This paper studies a new open-set problem, the open-vocabulary category-level\nobject pose and size estimation. Given human text descriptions of arbitrary\nnovel object categories, the robot agent seeks to predict the position,\norientation, and size of the target object in the observed scene image. To\nenable such generalizability, we first introduce OO3D-9D, a large-scale\nphotorealistic dataset for this task. Derived from OmniObject3D, OO3D-9D is the\nlargest and most diverse dataset in the field of category-level object pose and\nsize estimation. It includes additional annotations for the symmetry axis of\neach category, which help resolve symmetric ambiguity. Apart from the\nlarge-scale dataset, we find another key to enabling such generalizability is\nleveraging the strong prior knowledge in pre-trained visual-language foundation\nmodels. 
We then propose a framework built on pre-trained DinoV2 and\ntext-to-image stable diffusion models to infer the normalized object coordinate\nspace (NOCS) maps of the target instances. This framework fully leverages the\nvisual semantic prior from DinoV2 and the aligned visual and language knowledge\nwithin the text-to-image diffusion model, which enables generalization to\nvarious text descriptions of novel categories. Comprehensive quantitative and\nqualitative experiments demonstrate that the proposed open-vocabulary method,\ntrained on our large-scale synthesized data, significantly outperforms the\nbaseline and can effectively generalize to real-world images of unseen\ncategories. The project page is at https://ov9d.github.io.\n","authors":["Junhao Cai","Yisheng He","Weihao Yuan","Siyu Zhu","Zilong Dong","Liefeng Bo","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2403.12396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10147v2","updated":"2024-03-19T03:03:14Z","published":"2024-03-15T09:47:35Z","title":"GGRt: Towards Pose-free Generalizable 3D Gaussian Splatting in Real-time","summary":" This paper presents GGRt, a novel approach to generalizable novel view\nsynthesis that alleviates the need for real camera poses, complexity in\nprocessing high-resolution images, and lengthy optimization processes, thus\nfacilitating stronger applicability of 3D Gaussian Splatting (3D-GS) in\nreal-world scenarios. Specifically, we design a novel joint learning framework\nthat consists of an Iterative Pose Optimization Network (IPO-Net) and a\nGeneralizable 3D-Gaussians (G-3DG) model. With the joint learning mechanism,\nthe proposed framework can inherently estimate robust relative pose information\nfrom the image observations and thus primarily alleviate the requirement of\nreal camera poses. Moreover, we implement a deferred back-propagation mechanism\nthat enables high-resolution training and inference, overcoming the resolution\nconstraints of previous methods. To enhance the speed and efficiency, we\nfurther introduce a progressive Gaussian cache module that dynamically adjusts\nduring training and inference. As the first pose-free generalizable 3D-GS\nframework, GGRt achieves inference at $\\ge$ 5 FPS and real-time rendering at\n$\\ge$ 100 FPS. Through extensive experimentation, we demonstrate that our\nmethod outperforms existing NeRF-based pose-free techniques in terms of\ninference speed and effectiveness. It can also approach the real pose-based\n3D-GS methods. Our contributions provide a significant leap forward for the\nintegration of computer vision and computer graphics into practical\napplications, offering state-of-the-art results on LLFF, KITTI, and Waymo Open\ndatasets and enabling real-time rendering for immersive experiences.\n","authors":["Hao Li","Yuanyuan Gao","Chenming Wu","Dingwen Zhang","Yalun Dai","Chen Zhao","Haocheng Feng","Errui Ding","Jingdong Wang","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2403.10147v2.pdf","comment":"Project page:\n \\href{https://3d-aigc.github.io/GGRt}{https://3d-aigc.github.io/GGRt}"},{"id":"http://arxiv.org/abs/2311.06455v2","updated":"2024-03-19T02:59:03Z","published":"2023-11-11T01:56:35Z","title":"Aria-NeRF: Multimodal Egocentric View Synthesis","summary":" We seek to accelerate research in developing rich, multimodal scene models\ntrained from egocentric data, based on differentiable volumetric ray-tracing\ninspired by Neural Radiance Fields (NeRFs). 
The construction of a NeRF-like\nmodel from an egocentric image sequence plays a pivotal role in understanding\nhuman behavior and holds diverse applications within the realms of VR/AR. Such\negocentric NeRF-like models may be used as realistic simulations, contributing\nsignificantly to the advancement of intelligent agents capable of executing\ntasks in the real-world. The future of egocentric view synthesis may lead to\nnovel environment representations going beyond today's NeRFs by augmenting\nvisual data with multimodal sensors such as IMU for egomotion tracking, audio\nsensors to capture surface texture and human language context, and eye-gaze\ntrackers to infer human attention patterns in the scene. To support and\nfacilitate the development and evaluation of egocentric multimodal scene\nmodeling, we present a comprehensive multimodal egocentric video dataset. This\ndataset offers a comprehensive collection of sensory data, featuring RGB\nimages, eye-tracking camera footage, audio recordings from a microphone,\natmospheric pressure readings from a barometer, positional coordinates from\nGPS, connectivity details from Wi-Fi and Bluetooth, and information from\ndual-frequency IMU datasets (1kHz and 800Hz) paired with a magnetometer. The\ndataset was collected with the Meta Aria Glasses wearable device platform. The\ndiverse data modalities and the real-world context captured within this dataset\nserve as a robust foundation for furthering our understanding of human behavior\nand enabling more immersive and intelligent experiences in the realms of VR,\nAR, and robotics.\n","authors":["Jiankai Sun","Jianing Qiu","Chuanyang Zheng","John Tucker","Javier Yu","Mac Schwager"],"pdf_url":"https://arxiv.org/pdf/2311.06455v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12385v1","updated":"2024-03-19T02:52:06Z","published":"2024-03-19T02:52:06Z","title":"VideoBadminton: A Video Dataset for Badminton Action Recognition","summary":" In the dynamic and evolving field of computer vision, action recognition has\nbecome a key focus, especially with the advent of sophisticated methodologies\nlike Convolutional Neural Networks (CNNs), Convolutional 3D, Transformer, and\nspatial-temporal feature fusion. These technologies have shown promising\nresults on well-established benchmarks but face unique challenges in real-world\napplications, particularly in sports analysis, where the precise decomposition\nof activities and the distinction of subtly different actions are crucial.\nExisting datasets like UCF101, HMDB51, and Kinetics have offered a diverse\nrange of video data for various scenarios. However, there's an increasing need\nfor fine-grained video datasets that capture detailed categorizations and\nnuances within broader action categories. In this paper, we introduce the\nVideoBadminton dataset derived from high-quality badminton footage. 
Through an\nexhaustive evaluation of leading methodologies on this dataset, this study aims\nto advance the field of action recognition, particularly in badminton sports.\nThe introduction of VideoBadminton could not only serve for badminton action\nrecognition but also provide a dataset for recognizing fine-grained actions.\nThe insights gained from these evaluations are expected to catalyze further\nresearch in action comprehension, especially within sports contexts.\n","authors":["Qi Li","Tzu-Chen Chiu","Hsiang-Wei Huang","Min-Te Sun","Wei-Shinn Ku"],"pdf_url":"https://arxiv.org/pdf/2403.12385v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12382v1","updated":"2024-03-19T02:47:33Z","published":"2024-03-19T02:47:33Z","title":"Low-Trace Adaptation of Zero-shot Self-supervised Blind Image Denoising","summary":" Deep learning-based denoiser has been the focus of recent development on\nimage denoising. In the past few years, there has been increasing interest in\ndeveloping self-supervised denoising networks that only require noisy images,\nwithout the need for clean ground truth for training. However, a performance\ngap remains between current self-supervised methods and their supervised\ncounterparts. Additionally, these methods commonly depend on assumptions about\nnoise characteristics, thereby constraining their applicability in real-world\nscenarios. Inspired by the properties of the Frobenius norm expansion, we\ndiscover that incorporating a trace term reduces the optimization goal\ndisparity between self-supervised and supervised methods, thereby enhancing the\nperformance of self-supervised learning. To exploit this insight, we propose a\ntrace-constraint loss function and design the low-trace adaptation Noise2Noise\n(LoTA-N2N) model that bridges the gap between self-supervised and supervised\nlearning. Furthermore, we have discovered that several existing self-supervised\ndenoising frameworks naturally fall within the proposed trace-constraint loss\nas subcases. Extensive experiments conducted on natural and confocal image\ndatasets indicate that our method achieves state-of-the-art performance within\nthe realm of zero-shot self-supervised image denoising approaches, without\nrelying on any assumptions regarding the noise.\n","authors":["Jintong Hu","Bin Xia","Bingchen Li","Wenming Yang"],"pdf_url":"https://arxiv.org/pdf/2403.12382v1.pdf","comment":"11pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.10801v2","updated":"2024-03-19T02:45:48Z","published":"2024-03-16T04:23:46Z","title":"Securely Fine-tuning Pre-trained Encoders Against Adversarial Examples","summary":" With the evolution of self-supervised learning, the pre-training paradigm has\nemerged as a predominant solution within the deep learning landscape. Model\nproviders furnish pre-trained encoders designed to function as versatile\nfeature extractors, enabling downstream users to harness the benefits of\nexpansive models with minimal effort through fine-tuning. Nevertheless, recent\nworks have exposed a vulnerability in pre-trained encoders, highlighting their\nsusceptibility to downstream-agnostic adversarial examples (DAEs) meticulously\ncrafted by attackers. The lingering question pertains to the feasibility of\nfortifying the robustness of downstream models against DAEs, particularly in\nscenarios where the pre-trained encoders are publicly accessible to the\nattackers.\n In this paper, we initially delve into existing defensive mechanisms against\nadversarial examples within the pre-training paradigm. 
Our findings reveal that\nthe failure of current defenses stems from the domain shift between\npre-training data and downstream tasks, as well as the sensitivity of encoder\nparameters. In response to these challenges, we propose Genetic\nEvolution-Nurtured Adversarial Fine-tuning (Gen-AF), a two-stage adversarial\nfine-tuning approach aimed at enhancing the robustness of downstream models.\nOur extensive experiments, conducted across ten self-supervised training\nmethods and six datasets, demonstrate that Gen-AF attains high testing accuracy\nand robust testing accuracy against state-of-the-art DAEs.\n","authors":["Ziqi Zhou","Minghui Li","Wei Liu","Shengshan Hu","Yechao Zhang","Wei Wan","Lulu Xue","Leo Yu Zhang","Dezhong Yao","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2403.10801v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10523v2","updated":"2024-03-19T02:42:28Z","published":"2023-04-20T17:52:58Z","title":"GenCorres: Consistent Shape Matching via Coupled Implicit-Explicit Shape\n Generative Models","summary":" This paper introduces GenCorres, a novel unsupervised joint shape matching\n(JSM) approach. Our key idea is to learn a mesh generator to fit an unorganized\ndeformable shape collection while constraining deformations between adjacent\nsynthetic shapes to preserve geometric structures such as local rigidity and\nlocal conformality. GenCorres presents three appealing advantages over existing\nJSM techniques. First, GenCorres performs JSM among a synthetic shape\ncollection whose size is much bigger than the input shapes and fully leverages\nthe data-driven power of JSM. Second, GenCorres unifies consistent shape\nmatching and pairwise matching (i.e., by enforcing deformation priors between\nadjacent synthetic shapes). Third, the generator provides a concise encoding of\nconsistent shape correspondences. However, learning a mesh generator from an\nunorganized shape collection is challenging, requiring a good initialization.\nGenCorres addresses this issue by learning an implicit generator from the input\nshapes, which provides intermediate shapes between two arbitrary shapes. We\nintroduce a novel approach for computing correspondences between adjacent\nimplicit surfaces, which we use to regularize the implicit generator. Synthetic\nshapes of the implicit generator then guide initial fittings (i.e., via\ntemplate-based deformation) for learning the mesh generator. Experimental\nresults show that GenCorres considerably outperforms state-of-the-art JSM\ntechniques. The synthetic shapes of GenCorres also achieve salient performance\ngains against state-of-the-art deformable shape generators.\n","authors":["Haitao Yang","Xiangru Huang","Bo Sun","Chandrajit Bajaj","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2304.10523v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.12370v1","updated":"2024-03-19T02:29:34Z","published":"2024-03-19T02:29:34Z","title":"XPose: eXplainable Human Pose Estimation","summary":" Current approaches in pose estimation primarily concentrate on enhancing\nmodel architectures, often overlooking the importance of comprehensively\nunderstanding the rationale behind model decisions. In this paper, we propose\nXPose, a novel framework that incorporates Explainable AI (XAI) principles into\npose estimation. This integration aims to elucidate the individual contribution\nof each keypoint to the final prediction, thereby elevating the model's\ntransparency and interpretability. 
Conventional XAI techniques have\npredominantly addressed single-target tasks like classification.\nAdditionally, the application of Shapley value, a common measure in XAI, to\npose estimation has been hindered by prohibitive computational demands.\n To address these challenges, this work introduces an innovative concept\ncalled Group Shapley Value (GSV). This approach strategically organizes\nkeypoints into clusters based on their interdependencies. Within these\nclusters, GSV meticulously calculates Shapley value for keypoints, while for\ninter-cluster keypoints, it opts for a more holistic group-level valuation.\nThis dual-level computation framework meticulously assesses keypoint\ncontributions to the final outcome, optimizing computational efficiency.\nBuilding on the insights into keypoint interactions, we devise a novel data\naugmentation technique known as Group-based Keypoint Removal (GKR). This method\ningeniously removes individual keypoints during training phases, deliberately\npreserving those with strong mutual connections, thereby refining the model's\npredictive prowess for non-visible keypoints. The empirical validation of GKR\nacross a spectrum of standard approaches attests to its efficacy. GKR's success\ndemonstrates how using Explainable AI (XAI) can directly enhance pose\nestimation models.\n","authors":["Luyu Qiu","Jianing Li","Lei Wen","Chi Su","Fei Hao","Chen Jason Zhang","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2403.12370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10911v2","updated":"2024-03-19T02:26:55Z","published":"2024-03-16T12:18:20Z","title":"Efficient Diffusion-Driven Corruption Editor for Test-Time Adaptation","summary":" Test-time adaptation (TTA) addresses the unforeseen distribution shifts\noccurring during test time. In TTA, performance as well as memory and time\nconsumption serve as crucial considerations. A recent diffusion-based TTA\napproach for restoring corrupted images involves image-level updates. However,\nusing pixel space diffusion significantly increases resource requirements\ncompared to conventional model updating TTA approaches, revealing limitations\nas a TTA method. To address this, we propose a novel TTA method by leveraging a\nlatent diffusion model (LDM) based image editing model and fine-tuning it with\nour newly introduced corruption modeling scheme. This scheme enhances the\nrobustness of the diffusion model against distribution shifts by creating\n(clean, corrupted) image pairs and fine-tuning the model to edit corrupted\nimages into clean ones. Moreover, we introduce a distilled variant to\naccelerate the model for corruption editing using only 4 network function\nevaluations (NFEs). We extensively validated our method across various\narchitectures and datasets including image and video domains. Our model\nachieves the best performance with a 100 times faster runtime than that of a\ndiffusion-based baseline. 
Furthermore, it outpaces the speed of the model\nupdating TTA method based on data augmentation threefold, rendering an\nimage-level updating approach more practical.\n","authors":["Yeongtak Oh","Jonghyun Lee","Jooyoung Choi","Dahuin Jung","Uiwon Hwang","Sungroh Yoon"],"pdf_url":"https://arxiv.org/pdf/2403.10911v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12365v1","updated":"2024-03-19T02:22:21Z","published":"2024-03-19T02:22:21Z","title":"GaussianFlow: Splatting Gaussian Dynamics for 4D Content Creation","summary":" Creating 4D fields of Gaussian Splatting from images or videos is a\nchallenging task due to its under-constrained nature. While the optimization\ncan draw photometric reference from the input videos or be regulated by\ngenerative models, directly supervising Gaussian motions remains underexplored.\nIn this paper, we introduce a novel concept, Gaussian flow, which connects the\ndynamics of 3D Gaussians and pixel velocities between consecutive frames. The\nGaussian flow can be efficiently obtained by splatting Gaussian dynamics into\nthe image space. This differentiable process enables direct dynamic supervision\nfrom optical flow. Our method significantly benefits 4D dynamic content\ngeneration and 4D novel view synthesis with Gaussian Splatting, especially for\ncontents with rich motions that are hard for existing methods to handle. The\ncommon color drifting issue that happens in 4D generation is also resolved with\nimproved Gaussian dynamics. Superior visual quality on extensive experiments\ndemonstrates our method's effectiveness. Quantitative and qualitative\nevaluations show that our method achieves state-of-the-art results on both\ntasks of 4D generation and 4D novel view synthesis. Project page:\nhttps://zerg-overmind.github.io/GaussianFlow.github.io/\n","authors":["Quankai Gao","Qiangeng Xu","Zhe Cao","Ben Mildenhall","Wenchao Ma","Le Chen","Danhang Tang","Ulrich Neumann"],"pdf_url":"https://arxiv.org/pdf/2403.12365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12364v1","updated":"2024-03-19T02:19:57Z","published":"2024-03-19T02:19:57Z","title":"Class and Region-Adaptive Constraints for Network Calibration","summary":" In this work, we present a novel approach to calibrate segmentation networks\nthat considers the inherent challenges posed by different categories and object\nregions. In particular, we present a formulation that integrates class and\nregion-wise constraints into the learning objective, with multiple penalty\nweights to account for class and region differences. Finding the optimal\npenalty weights manually, however, might be unfeasible, and potentially hinder\nthe optimization process. To overcome this limitation, we propose an approach\nbased on Class and Region-Adaptive constraints (CRaC), which allows learning\nthe class and region-wise penalty weights during training. CRaC is based on a\ngeneral Augmented Lagrangian method, a well-established technique in\nconstrained optimization. Experimental results on two popular segmentation\nbenchmarks, and two well-known segmentation networks, demonstrate the\nsuperiority of CRaC compared to existing approaches. 
The code is available at:\nhttps://github.com/Bala93/CRac/\n","authors":["Balamurali Murugesan","Julio Silva-Rodriguez","Ismail Ben Ayed","Jose Dolz"],"pdf_url":"https://arxiv.org/pdf/2403.12364v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2403.02628v2","updated":"2024-03-19T02:19:52Z","published":"2024-03-05T03:37:28Z","title":"Interactive Continual Learning: Fast and Slow Thinking","summary":" Advanced life forms, sustained by the synergistic interaction of neural\ncognitive mechanisms, continually acquire and transfer knowledge throughout\ntheir lifespan. In contrast, contemporary machine learning paradigms exhibit\nlimitations in emulating the facets of continual learning (CL). Nonetheless,\nthe emergence of large language models (LLMs) presents promising avenues for\nrealizing CL via interactions with these models. Drawing on Complementary\nLearning System theory, this paper presents a novel Interactive Continual\nLearning (ICL) framework, enabled by collaborative interactions among models of\nvarious sizes. Specifically, we assign the ViT model as System1 and multimodal\nLLM as System2. To enable the memory module to deduce tasks from class\ninformation and enhance Set2Set retrieval, we propose the Class-Knowledge-Task\nMulti-Head Attention (CKT-MHA). Additionally, to improve memory retrieval in\nSystem1 through enhanced geometric representation, we introduce the CL-vMF\nmechanism, based on the von Mises-Fisher (vMF) distribution. Meanwhile, we\nintroduce the von Mises-Fisher Outlier Detection and Interaction (vMF-ODI)\nstrategy to identify hard examples, thus enhancing collaboration between\nSystem1 and System2 for complex reasoning realization. Comprehensive evaluation\nof our proposed ICL demonstrates significant resistance to forgetting and\nsuperior performance relative to existing methods. Code is available at\ngithub.com/ICL.\n","authors":["Biqing Qi","Xingquan Chen","Junqi Gao","Dong Li","Jianxing Liu","Ligang Wu","Bowen Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.02628v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12362v1","updated":"2024-03-19T02:16:32Z","published":"2024-03-19T02:16:32Z","title":"DMAD: Dual Memory Bank for Real-World Anomaly Detection","summary":" Training a unified model is considered to be more suitable for practical\nindustrial anomaly detection scenarios due to its generalization ability and\nstorage efficiency. However, this multi-class setting, which exclusively uses\nnormal data, overlooks the few but important accessible annotated anomalies in\nthe real world. To address the challenge of real-world anomaly detection, we\npropose a new framework named Dual Memory bank enhanced representation learning\nfor Anomaly Detection (DMAD). This framework handles both unsupervised and\nsemi-supervised scenarios in a unified (multi-class) setting. DMAD employs a\ndual memory bank to calculate feature distance and feature attention between\nnormal and abnormal patterns, thereby encapsulating knowledge about normal and\nabnormal instances. This knowledge is then used to construct an enhanced\nrepresentation for anomaly score learning. We evaluated DMAD on the MVTec-AD\nand VisA datasets. 
The results show that DMAD surpasses current\nstate-of-the-art methods, highlighting DMAD's capability in handling the\ncomplexities of real-world anomaly detection scenarios.\n","authors":["Jianlong Hu","Xu Chen","Zhenye Gan","Jinlong Peng","Shengchuan Zhang","Jiangning Zhang","Yabiao Wang","Chengjie Wang","Liujuan Cao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2403.12362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11689v2","updated":"2024-03-19T02:13:46Z","published":"2024-03-18T11:38:47Z","title":"MoreStyle: Relax Low-frequency Constraint of Fourier-based Image\n Reconstruction in Generalizable Medical Image Segmentation","summary":" The task of single-source domain generalization (SDG) in medical image\nsegmentation is crucial due to frequent domain shifts in clinical image\ndatasets. To address the challenge of poor generalization across different\ndomains, we introduce a Plug-and-Play module for data augmentation called\nMoreStyle. MoreStyle diversifies image styles by relaxing low-frequency\nconstraints in Fourier space, guiding the image reconstruction network. With\nthe help of adversarial learning, MoreStyle further expands the style range and\npinpoints the most intricate style combinations within latent features. To\nhandle significant style variations, we introduce an uncertainty-weighted loss.\nThis loss emphasizes hard-to-classify pixels resulting only from style shifts\nwhile mitigating true hard-to-classify pixels in both MoreStyle-generated and\noriginal images. Extensive experiments on two widely used benchmarks\ndemonstrate that the proposed MoreStyle effectively helps to achieve good\ndomain generalization ability, and has the potential to further boost the\nperformance of some state-of-the-art SDG methods.\n","authors":["Haoyu Zhao","Wenhui Dong","Rui Yu","Zhou Zhao","Du Bo","Yongchao Xu"],"pdf_url":"https://arxiv.org/pdf/2403.11689v2.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.11672v2","updated":"2024-03-19T02:07:11Z","published":"2024-03-18T11:20:11Z","title":"WIA-LD2ND: Wavelet-based Image Alignment for Self-supervised Low-Dose CT\n Denoising","summary":" In clinical examinations and diagnoses, low-dose computed tomography (LDCT)\nis crucial for minimizing health risks compared with normal-dose computed\ntomography (NDCT). However, reducing the radiation dose compromises the\nsignal-to-noise ratio, leading to degraded quality of CT images. To address\nthis, we analyze LDCT denoising task based on experimental results from the\nfrequency perspective, and then introduce a novel self-supervised CT image\ndenoising method called WIA-LD2ND, only using NDCT data. The proposed WIA-LD2ND\ncomprises two modules: Wavelet-based Image Alignment (WIA) and Frequency-Aware\nMulti-scale Loss (FAM). First, WIA is introduced to align NDCT with LDCT by\nmainly adding noise to the high-frequency components, which is the main\ndifference between LDCT and NDCT. Second, to better capture high-frequency\ncomponents and detailed information, Frequency-Aware Multi-scale Loss (FAM) is\nproposed by effectively utilizing multi-scale feature space. 
Extensive\nexperiments on two public LDCT denoising datasets demonstrate that our\nWIA-LD2ND, which only uses NDCT, outperforms several existing state-of-the-art\nweakly-supervised and self-supervised methods.\n","authors":["Haoyu Zhao","Yuliang Gu","Zhou Zhao","Bo Du","Yongchao Xu","Rui Yu"],"pdf_url":"https://arxiv.org/pdf/2403.11672v2.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.10931v2","updated":"2024-03-19T01:54:43Z","published":"2024-03-16T14:11:54Z","title":"Uncertainty-Aware Adapter: Adapting Segment Anything Model (SAM) for\n Ambiguous Medical Image Segmentation","summary":" The Segment Anything Model (SAM) gained significant success in natural image\nsegmentation, and many methods have tried to fine-tune it to medical image\nsegmentation. An efficient way to do so is by using Adapters, specialized\nmodules that learn just a few parameters to tailor SAM specifically for medical\nimages. However, unlike natural images, many tissues and lesions in medical\nimages have blurry boundaries and may be ambiguous. Previous efforts to adapt\nSAM ignore this challenge and can only predict distinct segmentation. It may\nmislead clinicians or cause misdiagnosis, especially when encountering rare\nvariants or situations with low model confidence. In this work, we propose a\nnovel module called the Uncertainty-aware Adapter, which efficiently\nfine-tunes SAM for uncertainty-aware medical image segmentation. Utilizing a\nconditional variational autoencoder, we encoded stochastic samples to\neffectively represent the inherent uncertainty in medical imaging. We designed\na new module on a standard adapter that utilizes a condition-based strategy to\ninteract with samples to help SAM integrate uncertainty. We evaluated our\nmethod on two multi-annotated datasets with different modalities: LIDC-IDRI\n(lung abnormalities segmentation) and REFUGE2 (optic-cup segmentation). The\nexperimental results show that the proposed model outperforms all the previous\nmethods and achieves the new state-of-the-art (SOTA) on both benchmarks. We\nalso demonstrated that our method can generate diverse segmentation hypotheses\nthat are more realistic as well as heterogeneous.\n","authors":["Mingzhou Jiang","Jiaying Zhou","Junde Wu","Tianyang Wang","Yueming Jin","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2403.10931v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12339v1","updated":"2024-03-19T01:07:53Z","published":"2024-03-19T01:07:53Z","title":"Entity6K: A Large Open-Domain Evaluation Dataset for Real-World Entity\n Recognition","summary":" Open-domain real-world entity recognition is essential yet challenging,\ninvolving identifying various entities in diverse environments. The lack of a\nsuitable evaluation dataset has been a major obstacle in this field due to the\nvast number of entities and the extensive human effort required for data\ncuration. We introduce Entity6K, a comprehensive dataset for real-world entity\nrecognition, featuring 5,700 entities across 26 categories, each supported by 5\nhuman-verified images with annotations. Entity6K offers a diverse range of\nentity names and categorizations, addressing a gap in existing datasets. 
We\nconducted benchmarks with existing models on tasks like image captioning,\nobject detection, zero-shot classification, and dense captioning to demonstrate\nEntity6K's effectiveness in evaluating models' entity recognition capabilities.\nWe believe Entity6K will be a valuable resource for advancing accurate entity\nrecognition in open-domain settings.\n","authors":["Jielin Qiu","William Han","Winfred Wang","Zhengyuan Yang","Linjie Li","Jianfeng Wang","Christos Faloutsos","Lei Li","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01582v4","updated":"2024-03-19T01:05:26Z","published":"2023-09-04T13:10:11Z","title":"Improving Visual Quality and Transferability of Adversarial Attacks on\n Face Recognition Simultaneously with Adversarial Restoration","summary":" Adversarial face examples possess two critical properties: Visual Quality and\nTransferability. However, existing approaches rarely address these properties\nsimultaneously, leading to subpar results. To address this issue, we propose a\nnovel adversarial attack technique known as Adversarial Restoration\n(AdvRestore), which enhances both visual quality and transferability of\nadversarial face examples by leveraging a face restoration prior. In our\napproach, we initially train a Restoration Latent Diffusion Model (RLDM)\ndesigned for face restoration. Subsequently, we employ the inference process of\nRLDM to generate adversarial face examples. The adversarial perturbations are\napplied to the intermediate features of RLDM. Additionally, by treating RLDM\nface restoration as a sibling task, the transferability of the generated\nadversarial face examples is further improved. Our experimental results\nvalidate the effectiveness of the proposed attack method.\n","authors":["Fengfan Zhou","Hefei Ling","Yuxuan Shi","Jiazhong Chen","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2309.01582v4.pdf","comment":"\\copyright 2023 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2403.12331v1","updated":"2024-03-19T00:07:48Z","published":"2024-03-19T00:07:48Z","title":"Deep Few-view High-resolution Photon-counting Extremity CT at Halved\n Dose for a Clinical Trial","summary":" The latest X-ray photon-counting computed tomography (PCCT) for extremity\nallows multi-energy high-resolution (HR) imaging for tissue characterization\nand material decomposition. However, both radiation dose and imaging speed need\nimprovement for contrast-enhanced and other studies. Despite the success of\ndeep learning methods for 2D few-view reconstruction, applying them to HR\nvolumetric reconstruction of extremity scans for clinical diagnosis has been\nlimited due to GPU memory constraints, training data scarcity, and domain gap\nissues. In this paper, we propose a deep learning-based approach for PCCT image\nreconstruction at halved dose and doubled speed in a New Zealand clinical\ntrial. Particularly, we present a patch-based volumetric refinement network to\nalleviate the GPU memory limitation, train network with synthetic data, and use\nmodel-based iterative refinement to bridge the gap between synthetic and\nreal-world data. 
The simulation and phantom experiments demonstrate\nconsistently improved results under different acquisition conditions on both\nin- and off-domain structures using a fixed network. The image quality of 8\npatients from the clinical trial is evaluated by three radiologists in\ncomparison with the standard image reconstruction with a full-view dataset. It\nis shown that our proposed approach is essentially identical to or better than\nthe clinical benchmark in terms of diagnostic image quality scores. Our\napproach has great potential to improve the safety and efficiency of PCCT\nwithout compromising image quality.\n","authors":["Mengzhou Li","Chuang Niu","Ge Wang","Maya R Amma","Krishna M Chapagain","Stefan Gabrielson","Andrew Li","Kevin Jonker","Niels de Ruiter","Jennifer A Clark","Phil Butler","Anthony Butler","Hengyong Yu"],"pdf_url":"https://arxiv.org/pdf/2403.12331v1.pdf","comment":"9 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.13206v1","updated":"2024-03-19T23:54:07Z","published":"2024-03-19T23:54:07Z","title":"Depth-guided NeRF Training via Earth Mover's Distance","summary":" Neural Radiance Fields (NeRFs) are trained to minimize the rendering loss of\npredicted viewpoints. However, the photometric loss often does not provide\nenough information to disambiguate between different possible geometries\nyielding the same image. Previous work has thus incorporated depth supervision\nduring NeRF training, leveraging dense predictions from pre-trained depth\nnetworks as pseudo-ground truth. While these depth priors are assumed to be\nperfect once filtered for noise, in practice, their accuracy is more\nchallenging to capture. This work proposes a novel approach to uncertainty in\ndepth priors for NeRF supervision. Instead of using custom-trained depth or\nuncertainty priors, we use off-the-shelf pretrained diffusion models to predict\ndepth and capture uncertainty during the denoising process. Because we know\nthat depth priors are prone to errors, we propose to supervise the ray\ntermination distance distribution with Earth Mover's Distance instead of\nenforcing the rendered depth to replicate the depth prior exactly through\nL2-loss. Our depth-guided NeRF outperforms all baselines on standard depth\nmetrics by a large margin while maintaining performance on photometric\nmeasures.\n","authors":["Anita Rau","Josiah Aklilu","F. Christopher Holsinger","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2403.13206v1.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2403.13204v1","updated":"2024-03-19T23:50:11Z","published":"2024-03-19T23:50:11Z","title":"Diversity-Aware Agnostic Ensemble of Sharpness Minimizers","summary":" There has long been plenty of theoretical and empirical evidence supporting\nthe success of ensemble learning. Deep ensembles in particular take advantage\nof training randomness and expressivity of individual neural networks to gain\nprediction diversity, ultimately leading to better generalization, robustness\nand uncertainty estimation. In respect of generalization, it is found that\npursuing wider local minima results in models being more robust to shifts\nbetween training and testing sets. A natural research question arises out of\nthese two approaches as to whether a boost in generalization ability can be\nachieved if ensemble learning and loss sharpness minimization are integrated.\nOur work investigates this connection and proposes DASH - a learning algorithm\nthat promotes diversity and flatness within deep ensembles. 
More concretely,\nDASH encourages base learners to move divergently towards low-loss regions of\nminimal sharpness. We provide a theoretical backbone for our method along with\nextensive empirical evidence demonstrating an improvement in ensemble\ngeneralizability.\n","authors":["Anh Bui","Vy Vo","Tung Pham","Dinh Phung","Trung Le"],"pdf_url":"https://arxiv.org/pdf/2403.13204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13199v1","updated":"2024-03-19T23:23:35Z","published":"2024-03-19T23:23:35Z","title":"DecentNeRFs: Decentralized Neural Radiance Fields from Crowdsourced\n Images","summary":" Neural radiance fields (NeRFs) show potential for transforming images\ncaptured worldwide into immersive 3D visual experiences. However, most of this\ncaptured visual data remains siloed in our camera rolls as these images contain\npersonal details. Even if made public, the problem of learning 3D\nrepresentations of billions of scenes captured daily in a centralized manner is\ncomputationally intractable. Our approach, DecentNeRF, is the first attempt at\ndecentralized, crowd-sourced NeRFs that require $\\sim 10^4\\times$ less server\ncomputing for a scene than a centralized approach. Instead of sending the raw\ndata, our approach requires users to send a 3D representation, distributing the\nhigh computation cost of training centralized NeRFs between the users. It\nlearns photorealistic scene representations by decomposing users' 3D views into\npersonal and global NeRFs and a novel optimally weighted aggregation of only\nthe latter. We validate the advantage of our approach to learn NeRFs with\nphotorealism and minimal server computation cost on structured synthetic and\nreal-world photo tourism datasets. We further analyze how secure aggregation of\nglobal NeRFs in DecentNeRF minimizes the undesired reconstruction of personal\ncontent by the server.\n","authors":["Zaid Tasneem","Akshat Dave","Abhishek Singh","Kushagra Tiwary","Praneeth Vepakomma","Ashok Veeraraghavan","Ramesh Raskar"],"pdf_url":"https://arxiv.org/pdf/2403.13199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13196v1","updated":"2024-03-19T23:13:40Z","published":"2024-03-19T23:13:40Z","title":"ADAPT to Robustify Prompt Tuning Vision Transformers","summary":" The performance of deep models, including Vision Transformers, is known to be\nvulnerable to adversarial attacks. Many existing defenses against these\nattacks, such as adversarial training, rely on full-model fine-tuning to induce\nrobustness in the models. These defenses require storing a copy of the entire\nmodel, that can have billions of parameters, for each task. At the same time,\nparameter-efficient prompt tuning is used to adapt large transformer-based\nmodels to downstream tasks without the need to save large copies. In this\npaper, we examine parameter-efficient prompt tuning of Vision Transformers for\ndownstream tasks under the lens of robustness. We show that previous\nadversarial defense methods, when applied to the prompt tuning paradigm, suffer\nfrom gradient obfuscation and are vulnerable to adaptive attacks. We introduce\nADAPT, a novel framework for performing adaptive adversarial training in the\nprompt tuning paradigm. Our method achieves competitive robust accuracy of ~40%\nw.r.t. 
SOTA robustness methods using full-model fine-tuning, by tuning only ~1%\nof the number of parameters.\n","authors":["Masih Eskandar","Tooba Imtiaz","Zifeng Wang","Jennifer Dy"],"pdf_url":"https://arxiv.org/pdf/2403.13196v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13195v1","updated":"2024-03-19T23:06:10Z","published":"2024-03-19T23:06:10Z","title":"Hermite coordinate interpolation kernels: application to image zooming","summary":" A number of basic image processing tasks, such as any geometric\ntransformation, require interpolation at subpixel image values. In this work we\nutilize the multidimensional coordinate Hermite spline interpolation defined on\nnon-equal spaced, rectilinear grids and apply it to a very common image\nprocessing task, image zooming. Since Hermite interpolation utilizes function\nvalues, as well as partial derivative values, it is natural to apply it to\nimage processing tasks as a special case of equi-spaced grid, using numerical\napproximations of the image partial derivatives at each pixel. Furthermore, the\ntask of image interpolation requires the calculation of image values at\npositions with non-zero fractional part. Thus, any spline interpolation can be\nwritten as convolution with an appropriate kernel. In this context we generate\nthe Hermite kernels according to the derived $n-$dimensional interpolant of\nTheorem 2 in [1]. We show that despite the increased complexity of the\ninterpolant, once the kernels are constructed, the Hermite spline interpolation\ncan be applied to images as efficiently as any other less complicated method.\nFinally, we perform illustrative numerical examples to showcase the\napplicability and high accuracy of the proposed Hermite kernels for image\nzooming, compared to other interpolation methods, both traditional\nconvolution-based, as well as employing deep learning, in terms of PSNR, as\nwell as SSIM error metrics. The proposed Hermite spline kernels outperform all\nother methods in the majority of the test images, in experiments using many\ncascaded repetitions of the zoom operation. Interesting conclusions can be\ndrawn considering all methods under comparison.\n","authors":["Konstantinos K. Delibasis","Iro Oikonomou","Aristides I. Kechriniotis","Georgios N. Tsigaridas"],"pdf_url":"https://arxiv.org/pdf/2403.13195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13016v4","updated":"2024-03-19T23:01:59Z","published":"2023-12-20T13:31:11Z","title":"DiffPortrait3D: Controllable Diffusion for Zero-Shot Portrait View\n Synthesis","summary":" We present DiffPortrait3D, a conditional diffusion model that is capable of\nsynthesizing 3D-consistent photo-realistic novel views from as few as a single\nin-the-wild portrait. Specifically, given a single RGB input, we aim to\nsynthesize plausible but consistent facial details rendered from novel camera\nviews while retaining both identity and facial expression. In lieu of\ntime-consuming optimization and fine-tuning, our zero-shot method generalizes\nwell to arbitrary face portraits with unposed camera views, extreme facial\nexpressions, and diverse artistic depictions. At its core, we leverage the\ngenerative prior of 2D diffusion models pre-trained on large-scale image\ndatasets as our rendering backbone, while the denoising is guided with\ndisentangled attentive control of appearance and camera pose. To achieve this,\nwe first inject the appearance context from the reference image into the\nself-attention layers of the frozen UNets. 
The rendering view is then\nmanipulated with a novel conditional control module that interprets the camera\npose by watching a condition image of a crossed subject from the same view.\nFurthermore, we insert a trainable cross-view attention module to enhance view\nconsistency, which is further strengthened with a novel 3D-aware noise\ngeneration process during inference. We demonstrate state-of-the-art results\nboth qualitatively and quantitatively on our challenging in-the-wild and\nmulti-view benchmarks.\n","authors":["Yuming Gu","You Xie","Hongyi Xu","Guoxian Song","Yichun Shi","Di Chang","Jing Yang","Linjie Luo"],"pdf_url":"https://arxiv.org/pdf/2312.13016v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13190v1","updated":"2024-03-19T23:01:14Z","published":"2024-03-19T23:01:14Z","title":"3D Semantic MapNet: Building Maps for Multi-Object Re-Identification in\n 3D","summary":" We study the task of 3D multi-object re-identification from embodied tours.\nSpecifically, an agent is given two tours of an environment (e.g. an apartment)\nunder two different layouts (e.g. arrangements of furniture). Its task is to\ndetect and re-identify objects in 3D - e.g. a \"sofa\" moved from location A to\nB, a new \"chair\" in the second layout at location C, or a \"lamp\" from location\nD in the first layout missing in the second. To support this task, we create an\nautomated infrastructure to generate paired egocentric tours of\ninitial/modified layouts in the Habitat simulator using Matterport3D scenes,\nYCB and Google-scanned objects. We present 3D Semantic MapNet (3D-SMNet) - a\ntwo-stage re-identification model consisting of (1) a 3D object detector that\noperates on RGB-D videos with known pose, and (2) a differentiable object\nmatching module that solves correspondence estimation between two sets of 3D\nbounding boxes. Overall, 3D-SMNet builds object-based maps of each layout and\nthen uses a differentiable matcher to re-identify objects across the tours.\nAfter training 3D-SMNet on our generated episodes, we demonstrate zero-shot\ntransfer to real-world rearrangement scenarios by instantiating our task in\nReplica, Active Vision, and RIO environments depicting rearrangements. On all\ndatasets, we find 3D-SMNet outperforms competitive baselines. Further, we show\njointly training on real and generated episodes can lead to significant\nimprovements over training on real data alone.\n","authors":["Vincent Cartillier","Neha Jain","Irfan Essa"],"pdf_url":"https://arxiv.org/pdf/2403.13190v1.pdf","comment":"8pages"},{"id":"http://arxiv.org/abs/2403.13188v1","updated":"2024-03-19T22:57:03Z","published":"2024-03-19T22:57:03Z","title":"Reflectivity Is All You Need!: Advancing LiDAR Semantic Segmentation","summary":" LiDAR semantic segmentation frameworks predominantly leverage geometry-based\nfeatures to differentiate objects within a scan. While these methods excel in\nscenarios with clear boundaries and distinct shapes, their performance declines\nin environments where boundaries are blurred, particularly in off-road\ncontexts. To address this, recent strides in 3D segmentation algorithms have\nfocused on harnessing raw LiDAR intensity measurements to improve prediction\naccuracy. 
Despite these efforts, current learning-based models struggle to\ncapture the intricate connections between raw intensity and factors such as\ndistance, incidence angle, material reflectivity, and atmospheric conditions.\nBuilding upon our prior work, this paper delves into the advantages of\nemploying calibrated intensity (also referred to as reflectivity) within\nlearning-based LiDAR semantic segmentation frameworks. We initially establish\nthat incorporating reflectivity as an input enhances the existing LiDAR\nsemantic segmentation model. Furthermore, we present findings that enabling the\nmodel to learn to calibrate intensity can boost its performance. Through\nextensive experimentation on the off-road dataset Rellis-3D, we demonstrate\nnotable improvements. Specifically, converting intensity to reflectivity\nresults in a 4% increase in mean Intersection over Union (mIoU) when compared\nto using raw intensity in off-road scenarios. Additionally, we investigate\nthe possible benefits of using calibrated intensity in semantic segmentation in\nurban environments (SemanticKITTI) and cross-sensor domain adaptation.\n","authors":["Kasi Viswanath","Peng Jiang","Srikanth Saripalli"],"pdf_url":"https://arxiv.org/pdf/2403.13188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14565v4","updated":"2024-03-19T22:53:25Z","published":"2023-06-26T10:26:33Z","title":"Mitigating Hallucination in Large Multi-Modal Models via Robust\n Instruction Tuning","summary":" Despite the promising progress in multi-modal tasks, current large\nmulti-modal models (LMMs) are prone to hallucinating inconsistent descriptions\nwith respect to the associated image and human instructions. This paper\naddresses this issue by introducing the first large and diverse visual\ninstruction tuning dataset, named Large-scale Robust Visual (LRV)-Instruction.\nOur dataset comprises 400k visual instructions generated by GPT4, covering 16\nvision-and-language tasks with open-ended instructions and answers. Unlike\nexisting studies that primarily focus on positive instruction samples, we\ndesign LRV-Instruction to include both positive and negative instructions for\nmore robust visual instruction tuning. Our negative instructions are designed\nat three semantic levels: (i) Nonexistent Object Manipulation, (ii) Existent\nObject Manipulation and (iii) Knowledge Manipulation. To efficiently measure\nthe hallucination generated by LMMs, we propose GPT4-Assisted Visual\nInstruction Evaluation (GAVIE), a stable approach to evaluate visual\ninstruction tuning like human experts. GAVIE does not require human-annotated\ngroundtruth answers and can adapt to diverse instruction formats. We conduct\ncomprehensive experiments to investigate the hallucination of LMMs. Our results\ndemonstrate existing LMMs exhibit significant hallucinations when presented\nwith our negative instructions, particularly Existent Object and Knowledge\nManipulation instructions. Moreover, we successfully mitigate hallucination by\nfinetuning MiniGPT4 and mPLUG-Owl on LRV-Instruction while improving\nperformance on several public datasets compared to state-of-the-art methods.\nAdditionally, we observed that a balanced ratio of positive and negative\ninstances in the training data leads to a more robust model. 
Code and data are\navailable at https://github.com/FuxiaoLiu/LRV-Instruction.\n","authors":["Fuxiao Liu","Kevin Lin","Linjie Li","Jianfeng Wang","Yaser Yacoob","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2306.14565v4.pdf","comment":"40 pages, 32 figures, ICLR 2024"},{"id":"http://arxiv.org/abs/2312.00210v2","updated":"2024-03-19T22:19:18Z","published":"2023-11-30T21:44:39Z","title":"DREAM: Diffusion Rectification and Estimation-Adaptive Models","summary":" We present DREAM, a novel training framework representing Diffusion\nRectification and Estimation Adaptive Models, requiring minimal code changes\n(just three lines) yet significantly enhancing the alignment of training with\nsampling in diffusion models. DREAM features two components: diffusion\nrectification, which adjusts training to reflect the sampling process, and\nestimation adaptation, which balances perception against distortion. When\napplied to image super-resolution (SR), DREAM adeptly navigates the tradeoff\nbetween minimizing distortion and preserving high image quality. Experiments\ndemonstrate DREAM's superiority over standard diffusion-based SR methods,\nshowing a $2$ to $3\\times $ faster training convergence and a $10$ to\n$20\\times$ reduction in sampling steps to achieve comparable results. We hope\nDREAM will inspire a rethinking of diffusion model training paradigms.\n","authors":["Jinxin Zhou","Tianyu Ding","Tianyi Chen","Jiachen Jiang","Ilya Zharkov","Zhihui Zhu","Luming Liang"],"pdf_url":"https://arxiv.org/pdf/2312.00210v2.pdf","comment":"16 pages, 22 figures, 5 tables; the first two authors contributed to\n this work equally"},{"id":"http://arxiv.org/abs/2403.13176v1","updated":"2024-03-19T22:05:32Z","published":"2024-03-19T22:05:32Z","title":"Castor: Competing shapelets for fast and accurate time series\n classification","summary":" Shapelets are discriminative subsequences, originally embedded in\nshapelet-based decision trees but have since been extended to shapelet-based\ntransformations. We propose Castor, a simple, efficient, and accurate time\nseries classification algorithm that utilizes shapelets to transform time\nseries. The transformation organizes shapelets into groups with varying\ndilation and allows the shapelets to compete over the time context to construct\na diverse feature representation. By organizing the shapelets into groups, we\nenable the transformation to transition between levels of competition,\nresulting in methods that more closely resemble distance-based transformations\nor dictionary-based transformations. We demonstrate, through an extensive\nempirical investigation, that Castor yields transformations that result in\nclassifiers that are significantly more accurate than several state-of-the-art\nclassifiers. In an extensive ablation study, we examine the effect of choosing\nhyperparameters and suggest accurate and efficient default values.\n","authors":["Isak Samsten","Zed Lee"],"pdf_url":"https://arxiv.org/pdf/2403.13176v1.pdf","comment":"Submitted to Data Mining and Knowledge Discovery Journal"},{"id":"http://arxiv.org/abs/2403.13858v1","updated":"2024-03-19T22:05:17Z","published":"2024-03-19T22:05:17Z","title":"A conditional latent autoregressive recurrent model for generation and\n forecasting of beam dynamics in particle accelerators","summary":" Particle accelerators are complex systems that focus, guide, and accelerate\nintense charged particle beams to high energy. 
Beam diagnostics present a\nchallenging problem due to limited non-destructive measurements,\ncomputationally demanding simulations, and inherent uncertainties in the\nsystem. We propose a two-step unsupervised deep learning framework named as\nConditional Latent Autoregressive Recurrent Model (CLARM) for learning the\nspatiotemporal dynamics of charged particles in accelerators. CLARM consists of\na Conditional Variational Autoencoder (CVAE) transforming six-dimensional phase\nspace into a lower-dimensional latent distribution and a Long Short-Term Memory\n(LSTM) network capturing temporal dynamics in an autoregressive manner. The\nCLARM can generate projections at various accelerator modules by sampling and\ndecoding the latent space representation. The model also forecasts future\nstates (downstream locations) of charged particles from past states (upstream\nlocations). The results demonstrate that the generative and forecasting ability\nof the proposed approach is promising when tested against a variety of\nevaluation metrics.\n","authors":["Mahindra Rautela","Alan Williams","Alexander Scheinker"],"pdf_url":"https://arxiv.org/pdf/2403.13858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02350v2","updated":"2024-03-19T21:54:22Z","published":"2023-12-04T21:29:31Z","title":"Instant Uncertainty Calibration of NeRFs Using a Meta-calibrator","summary":" Although Neural Radiance Fields (NeRFs) have markedly improved novel view\nsynthesis, accurate uncertainty quantification in their image predictions\nremains an open problem. The prevailing methods for estimating uncertainty,\nincluding the state-of-the-art Density-aware NeRF Ensembles (DANE) [29],\nquantify uncertainty without calibration. This frequently leads to over- or\nunder-confidence in image predictions, which can undermine their real-world\napplications. In this paper, we propose a method which, for the first time,\nachieves calibrated uncertainties for NeRFs. To accomplish this, we overcome a\nsignificant challenge in adapting existing calibration techniques to NeRFs: a\nneed to hold out ground truth images from the target scene, reducing the number\nof images left to train the NeRF. This issue is particularly problematic in\nsparse-view settings, where we can operate with as few as three images. To\naddress this, we introduce the concept of a meta-calibrator that performs\nuncertainty calibration for NeRFs with a single forward pass without the need\nfor holding out any images from the target scene. Our meta-calibrator is a\nneural network that takes as input the NeRF images and uncalibrated uncertainty\nmaps and outputs a scene-specific calibration curve that corrects the NeRF's\nuncalibrated uncertainties. We show that the meta-calibrator can generalize on\nunseen scenes and achieves well-calibrated and state-of-the-art uncertainty for\nNeRFs, significantly beating DANE and other approaches. 
This opens\nopportunities to improve applications that rely on accurate NeRF uncertainty\nestimates such as next-best view planning and potentially more trustworthy\nimage reconstruction for medical diagnosis.\n","authors":["Niki Amini-Naieni","Tomas Jakab","Andrea Vedaldi","Ronald Clark"],"pdf_url":"https://arxiv.org/pdf/2312.02350v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13171v1","updated":"2024-03-19T21:52:19Z","published":"2024-03-19T21:52:19Z","title":"LUWA Dataset: Learning Lithic Use-Wear Analysis on Microscopic Images","summary":" Lithic Use-Wear Analysis (LUWA) using microscopic images is an underexplored\nvision-for-science research area. It seeks to distinguish the worked material,\nwhich is critical for understanding archaeological artifacts, material\ninteractions, tool functionalities, and dental records. However, this\nchallenging task goes beyond the well-studied image classification problem for\ncommon objects. It is affected by many confounders owing to the complex wear\nmechanism and microscopic imaging, which makes it difficult even for human\nexperts to identify the worked material successfully. In this paper, we\ninvestigate the following three questions on this unique vision task for the\nfirst time:(i) How well can state-of-the-art pre-trained models (like DINOv2)\ngeneralize to the rarely seen domain? (ii) How can few-shot learning be\nexploited for scarce microscopic images? (iii) How do the ambiguous\nmagnification and sensing modality influence the classification accuracy? To\nstudy these, we collaborated with archaeologists and built the first\nopen-source and the largest LUWA dataset containing 23,130 microscopic images\nwith different magnifications and sensing modalities. Extensive experiments\nshow that existing pre-trained models notably outperform human experts but\nstill leave a large gap for improvements. Most importantly, the LUWA dataset\nprovides an underexplored opportunity for vision and learning communities and\ncomplements existing image classification problems on common objects.\n","authors":["Jing Zhang","Irving Fang","Juexiao Zhang","Hao Wu","Akshat Kaushik","Alice Rodriguez","Hanwen Zhao Zhuo Zheng","Radu Iovita","Chen Feng"],"pdf_url":"https://arxiv.org/pdf/2403.13171v1.pdf","comment":"CVPR"},{"id":"http://arxiv.org/abs/2309.04461v2","updated":"2024-03-19T21:48:59Z","published":"2023-09-08T17:49:44Z","title":"Measuring and Improving Chain-of-Thought Reasoning in Vision-Language\n Models","summary":" Vision-language models (VLMs) have recently demonstrated strong efficacy as\nvisual assistants that can parse natural queries about the visual content and\ngenerate human-like outputs. In this work, we explore the ability of these\nmodels to demonstrate human-like reasoning based on the perceived information.\nTo address a crucial concern regarding the extent to which their reasoning\ncapabilities are fully consistent and grounded, we also measure the reasoning\nconsistency of these models. We achieve this by proposing a chain-of-thought\n(CoT) based consistency measure. However, such an evaluation requires a\nbenchmark that encompasses both high-level inference and detailed reasoning\nchains, which is costly. We tackle this challenge by proposing a\nLLM-Human-in-the-Loop pipeline, which notably reduces cost while simultaneously\nensuring the generation of a high-quality dataset. 
Based on this pipeline and\nthe existing coarse-grained annotated dataset, we build the CURE benchmark to\nmeasure both the zero-shot reasoning performance and consistency of VLMs. We\nevaluate existing state-of-the-art VLMs, and find that even the best-performing\nmodel is unable to demonstrate strong visual reasoning capabilities and\nconsistency, indicating that substantial efforts are required to enable VLMs to\nperform visual reasoning as systematically and consistently as humans. As an\nearly step, we propose a two-stage training framework aimed at improving both\nthe reasoning performance and consistency of VLMs. The first stage involves\nemploying supervised fine-tuning of VLMs using step-by-step reasoning samples\nautomatically generated by LLMs. In the second stage, we further augment the\ntraining process by incorporating feedback provided by LLMs to produce\nreasoning chains that are highly consistent and grounded. We empirically\nhighlight the effectiveness of our framework in both reasoning performance and\nconsistency.\n","authors":["Yangyi Chen","Karan Sikka","Michael Cogswell","Heng Ji","Ajay Divakaran"],"pdf_url":"https://arxiv.org/pdf/2309.04461v2.pdf","comment":"NAACL 2024 Main Conference. The data is released at\n https://github.com/Yangyi-Chen/CoTConsistency"},{"id":"http://arxiv.org/abs/2403.13167v1","updated":"2024-03-19T21:40:20Z","published":"2024-03-19T21:40:20Z","title":"Improved EATFormer: A Vision Transformer for Medical Image\n Classification","summary":" The accurate analysis of medical images is vital for diagnosing and\npredicting medical conditions. Traditional approaches relying on radiologists\nand clinicians suffer from inconsistencies and missed diagnoses. Computer-aided\ndiagnosis systems can assist in achieving early, accurate, and efficient\ndiagnoses. This paper presents an improved Evolutionary Algorithm-based\nTransformer architecture for medical image classification using Vision\nTransformers. The proposed EATFormer architecture combines the strengths of\nConvolutional Neural Networks and Vision Transformers, leveraging their ability\nto identify patterns in data and adapt to specific characteristics. The\narchitecture incorporates novel components, including the Enhanced EA-based\nTransformer block with Feed-Forward Network, Global and Local Interaction , and\nMulti-Scale Region Aggregation modules. It also introduces the Modulated\nDeformable MSA module for dynamic modeling of irregular locations. The paper\ndiscusses the Vision Transformer (ViT) model's key features, such as\npatch-based processing, positional context incorporation, and Multi-Head\nAttention mechanism. It introduces the Multi-Scale Region Aggregation module,\nwhich aggregates information from different receptive fields to provide an\ninductive bias. The Global and Local Interaction module enhances the MSA-based\nglobal module by introducing a local path for extracting discriminative local\ninformation. 
Experimental results on the Chest X-ray and Kvasir datasets\ndemonstrate that the proposed EATFormer significantly improves prediction speed\nand accuracy compared to baseline models.\n","authors":["Yulong Shisu","Susano Mingwin","Yongshuai Wanwag","Zengqiang Chenso","Sunshin Huing"],"pdf_url":"https://arxiv.org/pdf/2403.13167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13163v1","updated":"2024-03-19T21:31:31Z","published":"2024-03-19T21:31:31Z","title":"DeblurDiNAT: A Lightweight and Effective Transformer for Image\n Deblurring","summary":" Blurry images may contain local and global non-uniform artifacts, which\ncomplicate the deblurring process and make it more challenging to achieve\nsatisfactory results. Recently, Transformers generate improved deblurring\noutcomes than existing CNN architectures. However, the large model size and\nlong inference time are still two bothersome issues which have not been fully\nexplored. To this end, we propose DeblurDiNAT, a compact encoder-decoder\nTransformer which efficiently restores clean images from real-world blurry\nones. We adopt an alternating dilation factor structure with the aim of\nglobal-local feature learning. Also, we observe that simply using\nself-attention layers in networks does not always produce good deblurred\nresults. To solve this problem, we propose a channel modulation self-attention\n(CMSA) block, where a cross-channel learner (CCL) is utilized to capture\nchannel relationships. In addition, we present a divide and multiply\nfeed-forward network (DMFN) allowing fast feature propagation. Moreover, we\ndesign a lightweight gated feature fusion (LGFF) module, which performs\ncontrolled feature merging. Comprehensive experimental results show that the\nproposed model, named DeblurDiNAT, provides a favorable performance boost\nwithout introducing noticeable computational costs over the baseline, and\nachieves state-of-the-art (SOTA) performance on several image deblurring\ndatasets. Compared to nearest competitors, our space-efficient and time-saving\nmethod demonstrates a stronger generalization ability with 3%-68% fewer\nparameters and produces deblurred images that are visually closer to the ground\ntruth.\n","authors":["Hanzhou Liu","Binghan Li","Chengkai Liu","Mi Lu"],"pdf_url":"https://arxiv.org/pdf/2403.13163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13148v1","updated":"2024-03-19T20:52:31Z","published":"2024-03-19T20:52:31Z","title":"SIFT-DBT: Self-supervised Initialization and Fine-Tuning for Imbalanced\n Digital Breast Tomosynthesis Image Classification","summary":" Digital Breast Tomosynthesis (DBT) is a widely used medical imaging modality\nfor breast cancer screening and diagnosis, offering higher spatial resolution\nand greater detail through its 3D-like breast volume imaging capability.\nHowever, the increased data volume also introduces pronounced data imbalance\nchallenges, where only a small fraction of the volume contains suspicious\ntissue. This further exacerbates the data imbalance due to the case-level\ndistribution in real-world data and leads to learning a trivial classification\nmodel that only predicts the majority class. To address this, we propose a\nnovel method using view-level contrastive Self-supervised Initialization and\nFine-Tuning for identifying abnormal DBT images, namely SIFT-DBT. We further\nintroduce a patch-level multi-instance learning method to preserve spatial\nresolution. 
The proposed method achieves 92.69% volume-wise AUC on an\nevaluation of 970 unique studies.\n","authors":["Yuexi Du","Regina J. Hooley","John Lewin","Nicha C. Dvornek"],"pdf_url":"https://arxiv.org/pdf/2403.13148v1.pdf","comment":"Accepted by IEEE ISBI 2024"},{"id":"http://arxiv.org/abs/2403.13135v1","updated":"2024-03-19T20:10:50Z","published":"2024-03-19T20:10:50Z","title":"A Parallel Workflow for Polar Sea-Ice Classification using Auto-labeling\n of Sentinel-2 Imagery","summary":" The observation of the advancing and retreating pattern of polar sea ice\ncover stands as a vital indicator of global warming. This research aims to\ndevelop a robust, effective, and scalable system for classifying polar sea ice\nas thick/snow-covered, young/thin, or open water using Sentinel-2 (S2) images.\nSince the S2 satellite is actively capturing high-resolution imagery over the\nearth's surface, there are lots of images that need to be classified. One major\nobstacle is the absence of labeled S2 training data (images) to act as the\nground truth. We demonstrate a scalable and accurate method for segmenting and\nautomatically labeling S2 images using carefully determined color thresholds.\nWe employ a parallel workflow using PySpark to scale and achieve 9-fold data\nloading and 16-fold map-reduce speedup on auto-labeling S2 images based on thin\ncloud and shadow-filtered color-based segmentation to generate label data. The\nauto-labeled data generated from this process are then employed to train a\nU-Net machine learning model, resulting in good classification accuracy. As\ntraining the U-Net classification model is computationally heavy and\ntime-consuming, we distribute the U-Net model training to scale it over 8 GPUs\nusing the Horovod framework over a DGX cluster with a 7.21x speedup without\naffecting the accuracy of the model. Using the Antarctic's Ross Sea region as\nan example, the U-Net model trained on auto-labeled data achieves a\nclassification accuracy of 98.97% for auto-labeled training datasets when the\nthin clouds and shadows from the S2 images are filtered out.\n","authors":["Jurdana Masuma Iqrah","Wei Wang","Hongjie Xie","Sushil Prasad"],"pdf_url":"https://arxiv.org/pdf/2403.13135v1.pdf","comment":"Accepted in the 25th IEEE International Workshop on Parallel and\n Distributed Scientific and Engineering Computing (PDSEC 2024), May 2024.\n arXiv admin note: substantial text overlap with arXiv:2303.12719"},{"id":"http://arxiv.org/abs/2403.13129v1","updated":"2024-03-19T19:58:54Z","published":"2024-03-19T19:58:54Z","title":"Better Call SAL: Towards Learning to Segment Anything in Lidar","summary":" We propose $\\texttt{SAL}$ ($\\texttt{S}$egment $\\texttt{A}$nything in\n$\\texttt{L}$idar) method consisting of a text-promptable zero-shot model for\nsegmenting and classifying any object in Lidar, and a pseudo-labeling engine\nthat facilitates model training without manual supervision. While the\nestablished paradigm for $\\textit{Lidar Panoptic Segmentation}$ (LPS) relies on\nmanual supervision for a handful of object classes defined a priori, we utilize\n2D vision foundation models to generate 3D supervision \"for free\". Our\npseudo-labels consist of instance masks and corresponding CLIP tokens, which we\nlift to Lidar using calibrated multi-modal data. By training our model on these\nlabels, we distill the 2D foundation models into our Lidar $\\texttt{SAL}$\nmodel. 
Even without manual labels, our model achieves $91\\%$ in terms of\nclass-agnostic segmentation and $44\\%$ in terms of zero-shot LPS of the fully\nsupervised state-of-the-art. Furthermore, we outperform several baselines that\ndo not distill but only lift image features to 3D. More importantly, we\ndemonstrate that $\\texttt{SAL}$ supports arbitrary class prompts, can be easily\nextended to new datasets, and shows significant potential to improve with\nincreasing amounts of self-labeled data.\n","authors":["Aljoša Ošep","Tim Meinhardt","Francesco Ferroni","Neehar Peri","Deva Ramanan","Laura Leal-Taixé"],"pdf_url":"https://arxiv.org/pdf/2403.13129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12621v3","updated":"2024-03-19T19:42:14Z","published":"2023-05-22T01:14:30Z","title":"DermSynth3D: Synthesis of in-the-wild Annotated Dermatology Images","summary":" In recent years, deep learning (DL) has shown great potential in the field of\ndermatological image analysis. However, existing datasets in this domain have\nsignificant limitations, including a small number of image samples, limited\ndisease conditions, insufficient annotations, and non-standardized image\nacquisitions. To address these shortcomings, we propose a novel framework\ncalled DermSynth3D. DermSynth3D blends skin disease patterns onto 3D textured\nmeshes of human subjects using a differentiable renderer and generates 2D\nimages from various camera viewpoints under chosen lighting conditions in\ndiverse background scenes. Our method adheres to top-down rules that constrain\nthe blending and rendering process to create 2D images with skin conditions\nthat mimic in-the-wild acquisitions, ensuring more meaningful results. The\nframework generates photo-realistic 2D dermoscopy images and the corresponding\ndense annotations for semantic segmentation of the skin, skin conditions, body\nparts, bounding boxes around lesions, depth maps, and other 3D scene\nparameters, such as camera position and lighting conditions. DermSynth3D allows\nfor the creation of custom datasets for various dermatology tasks. We\ndemonstrate the effectiveness of data generated using DermSynth3D by training\nDL models on synthetic data and evaluating them on various dermatology tasks\nusing real 2D dermatological images. We make our code publicly available at\nhttps://github.com/sfu-mial/DermSynth3D.\n","authors":["Ashish Sinha","Jeremy Kawahara","Arezou Pakzad","Kumar Abhishek","Matthieu Ruthven","Enjie Ghorbel","Anis Kacem","Djamila Aouada","Ghassan Hamarneh"],"pdf_url":"https://arxiv.org/pdf/2305.12621v3.pdf","comment":"Accepted to Medical Image Analysis (MedIA) 2024"},{"id":"http://arxiv.org/abs/2305.03001v2","updated":"2024-03-19T19:40:44Z","published":"2023-05-04T17:19:47Z","title":"OSDaR23: Open Sensor Data for Rail 2023","summary":" To achieve a driverless train operation on mainline railways, actual and\npotential obstacles for the train's driveway must be detected automatically by\nappropriate sensor systems. Machine learning algorithms have proven to be\npowerful tools for this task during the last years. However, these algorithms\nrequire large amounts of high-quality annotated data containing\nrailway-specific objects as training data. 
Unfortunately, all of the publicly\navailable datasets that tackle this requirement are restricted in some way.\nTherefore, this paper presents OSDaR23, a multi-sensor dataset of 45\nsubsequences acquired in Hamburg, Germany, in September 2021, that was created\nto foster driverless train operation on mainline railways. The sensor setup\nconsists of multiple calibrated and synchronized infrared (IR) and visual (RGB)\ncameras, lidars, a radar, and position and acceleration sensors mounted on the\nfront of a rail vehicle. In addition to the raw data, the dataset contains\n204091 polyline, polygonal, rectangle, and cuboid annotations in total for 20\ndifferent object classes. It is the first publicly available multi-sensor\ndataset annotated with a variety of object classes that are relevant for the\nrailway context. OSDaR23, available at data.fid-move.de/dataset/osdar23, can\nalso be used for tasks beyond collision prediction, which are listed in this\npaper.\n","authors":["Rustam Tagiew","Martin Köppel","Karsten Schwalbe","Patrick Denzler","Philipp Neumaier","Tobias Klockau","Martin Boekhoff","Pavel Klasek","Roman Tilly"],"pdf_url":"https://arxiv.org/pdf/2305.03001v2.pdf","comment":"7 pages, 11 images, 5 tables"},{"id":"http://arxiv.org/abs/2403.13113v1","updated":"2024-03-19T19:36:48Z","published":"2024-03-19T19:36:48Z","title":"Trustworthiness of Pretrained Transformers for Lung Cancer Segmentation","summary":" We assessed the trustworthiness of two self-supervision pretrained\ntransformer models, Swin UNETR and SMIT, for fine-tuned lung (LC) tumor\nsegmentation using 670 CT and MRI scans. We measured segmentation accuracy on\ntwo public 3D-CT datasets, robustness on CT scans of patients with COVID-19, CT\nscans of patients with ovarian cancer and T2-weighted MRI of men with prostate\ncancer, and zero-shot generalization of LC for T2-weighted MRIs. Both models\ndemonstrated high accuracy on in-distribution data (Dice 0.80 for SMIT and 0.78\nfor Swin UNETR). SMIT showed similar near-out-of-distribution performance on CT\nscans (AUROC 89.85% vs. 89.19%) but significantly better\nfar-out-of-distribution accuracy on CT (AUROC 97.2% vs. 87.1%) and MRI (92.15%\nvs. 73.8%). SMIT outperformed Swin UNETR in zero-shot segmentation on MRI (Dice\n0.78 vs. 0.69). We expect these findings to guide the safe development and\ndeployment of current and future pretrained models in routine clinical use.\n","authors":["Aneesh Rangnekar","Nishant Nadkarni","Jue Jiang","Harini Veeraraghavan"],"pdf_url":"https://arxiv.org/pdf/2403.13113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13106v1","updated":"2024-03-19T19:13:22Z","published":"2024-03-19T19:13:22Z","title":"Knowing Your Nonlinearities: Shapley Interactions Reveal the Underlying\n Structure of Data","summary":" Measuring nonlinear feature interaction is an established approach to\nunderstanding complex patterns of attribution in many models. In this paper, we\nuse Shapley Taylor interaction indices (STII) to analyze the impact of\nunderlying data structure on model representations in a variety of modalities,\ntasks, and architectures. Considering linguistic structure in masked and\nauto-regressive language models (MLMs and ALMs), we find that STII increases\nwithin idiomatic expressions and that MLMs scale STII with syntactic distance,\nrelying more on syntax in their nonlinear structure than ALMs do. 
Our speech\nmodel findings reflect the phonetic principal that the openness of the oral\ncavity determines how much a phoneme varies based on its context. Finally, we\nstudy image classifiers and illustrate that feature interactions intuitively\nreflect object boundaries. Our wide range of results illustrates the benefits\nof interdisciplinary work and domain expertise in interpretability research.\n","authors":["Divyansh Singhvi","Andrej Erkelens","Raghav Jain","Diganta Misra","Naomi Saphra"],"pdf_url":"https://arxiv.org/pdf/2403.13106v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07169v2","updated":"2024-03-19T19:12:26Z","published":"2023-12-12T11:13:17Z","title":"Semi-supervised Active Learning for Video Action Detection","summary":" In this work, we focus on label efficient learning for video action\ndetection. We develop a novel semi-supervised active learning approach which\nutilizes both labeled as well as unlabeled data along with informative sample\nselection for action detection. Video action detection requires spatio-temporal\nlocalization along with classification, which poses several challenges for both\nactive learning informative sample selection as well as semi-supervised\nlearning pseudo label generation. First, we propose NoiseAug, a simple\naugmentation strategy which effectively selects informative samples for video\naction detection. Next, we propose fft-attention, a novel technique based on\nhigh-pass filtering which enables effective utilization of pseudo label for SSL\nin video action detection by emphasizing on relevant activity region within a\nvideo. We evaluate the proposed approach on three different benchmark datasets,\nUCF-101-24, JHMDB-21, and Youtube-VOS. First, we demonstrate its effectiveness\non video action detection where the proposed approach outperforms prior works\nin semi-supervised and weakly-supervised learning along with several baseline\napproaches in both UCF101-24 and JHMDB-21. Next, we also show its effectiveness\non Youtube-VOS for video object segmentation demonstrating its generalization\ncapability for other dense prediction tasks in videos.\n","authors":["Ayush Singh","Aayush J Rana","Akash Kumar","Shruti Vyas","Yogesh Singh Rawat"],"pdf_url":"https://arxiv.org/pdf/2312.07169v2.pdf","comment":"AAAI Conference on Artificial Intelligence, Main Technical Track\n (AAAI), 2024"},{"id":"http://arxiv.org/abs/2403.13094v1","updated":"2024-03-19T18:46:32Z","published":"2024-03-19T18:46:32Z","title":"Train Ego-Path Detection on Railway Tracks Using End-to-End Deep\n Learning","summary":" This paper introduces the task of \"train ego-path detection\", a refined\napproach to railway track detection designed for intelligent onboard vision\nsystems. Whereas existing research lacks precision and often considers all\ntracks within the visual field uniformly, our proposed task specifically aims\nto identify the train's immediate path, or \"ego-path\", within potentially\ncomplex and dynamic railway environments. Building on this, we extend the\nRailSem19 dataset with ego-path annotations, facilitating further research in\nthis direction. At the heart of our study lies TEP-Net, an end-to-end deep\nlearning framework tailored for ego-path detection, featuring a configurable\nmodel architecture, a dynamic data augmentation strategy, and a domain-specific\nloss function. 
Leveraging a regression-based approach, TEP-Net outperforms\nSOTA: while addressing the track detection problem in a more nuanced way than\npreviously, our model achieves 97.5% IoU on the test set and is faster than all\nexisting methods. Further comparative analysis highlights the relevance of the\nconceptual choices behind TEP-Net, demonstrating its inherent propensity for\nrobustness across diverse environmental conditions and operational dynamics.\nThis work opens promising avenues for the development of intelligent driver\nassistance systems and autonomous train operations, paving the way toward safer\nand more efficient railway transportation.\n","authors":["Thomas Laurent"],"pdf_url":"https://arxiv.org/pdf/2403.13094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13078v1","updated":"2024-03-19T18:15:15Z","published":"2024-03-19T18:15:15Z","title":"HuLP: Human-in-the-Loop for Prognosis","summary":" This paper introduces HuLP, a Human-in-the-Loop for Prognosis model designed\nto enhance the reliability and interpretability of prognostic models in\nclinical contexts, especially when faced with the complexities of missing\ncovariates and outcomes. HuLP offers an innovative approach that enables human\nexpert intervention, empowering clinicians to interact with and correct models'\npredictions, thus fostering collaboration between humans and AI models to\nproduce more accurate prognosis. Additionally, HuLP addresses the challenges of\nmissing data by utilizing neural networks and providing a tailored methodology\nthat effectively handles missing data. Traditional methods often struggle to\ncapture the nuanced variations within patient populations, leading to\ncompromised prognostic predictions. HuLP imputes missing covariates based on\nimaging features, aligning more closely with clinician workflows and enhancing\nreliability. We conduct our experiments on two real-world, publicly available\nmedical datasets to demonstrate the superiority of HuLP.\n","authors":["Muhammad Ridzuan","Mai Kassem","Numan Saeed","Ikboljon Sobirov","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.13078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10369v2","updated":"2024-03-19T18:08:39Z","published":"2024-03-15T15:00:42Z","title":"Open Stamped Parts Dataset","summary":" We present the Open Stamped Parts Dataset (OSPD), featuring synthetic and\nreal images of stamped metal sheets for auto manufacturing. The real part\nimages, captured from 7 cameras, consist of 7,980 unlabeled images and 1,680\nlabeled images. In addition, we have compiled a defect dataset by overlaying\nsynthetically generated masks on 10% of the holes. The synthetic dataset\nreplicates the real manufacturing environment in terms of lighting and part\nplacement relative to the cameras. The synthetic data includes 7,980 training\nimages, 1,680 validation images and 1,680 test images, each with bounding box\nand segmentation mask annotations around all holes. 10% of the holes in the\nsynthetic data mimic defects generated in the real image dataset. We trained a\nhole-detection model on the synthetic-OSPD, achieving a modified recall score\nof 67.2% and a precision of 94.4% . We anticipate researchers in the auto\nmanufacturing and broader machine learning and computer vision communities\nusing OSPD to advance the state of the art in defect detection of stamped holes\nin the metalsheet stamping process. The dataset is available for download at:\nhttps://tinyurl.com/hm6xatd7\n","authors":["Sarah Antiles","Sachin S. 
Talathi"],"pdf_url":"https://arxiv.org/pdf/2403.10369v2.pdf","comment":"6 pages, 7 figures, 2 tables"},{"id":"http://arxiv.org/abs/2403.13064v1","updated":"2024-03-19T18:01:29Z","published":"2024-03-19T18:01:29Z","title":"SceneScript: Reconstructing Scenes With An Autoregressive Structured\n Language Model","summary":" We introduce SceneScript, a method that directly produces full scene models\nas a sequence of structured language commands using an autoregressive,\ntoken-based approach. Our proposed scene representation is inspired by recent\nsuccesses in transformers & LLMs, and departs from more traditional methods\nwhich commonly describe scenes as meshes, voxel grids, point clouds or radiance\nfields. Our method infers the set of structured language commands directly from\nencoded visual data using a scene language encoder-decoder architecture. To\ntrain SceneScript, we generate and release a large-scale synthetic dataset\ncalled Aria Synthetic Environments consisting of 100k high-quality in-door\nscenes, with photorealistic and ground-truth annotated renders of egocentric\nscene walkthroughs. Our method gives state-of-the art results in architectural\nlayout estimation, and competitive results in 3D object detection. Lastly, we\nexplore an advantage for SceneScript, which is the ability to readily adapt to\nnew commands via simple additions to the structured language, which we\nillustrate for tasks such as coarse 3D object part reconstruction.\n","authors":["Armen Avetisyan","Christopher Xie","Henry Howard-Jenkins","Tsun-Yi Yang","Samir Aroudj","Suvam Patra","Fuyang Zhang","Duncan Frost","Luke Holland","Campbell Orme","Jakob Engel","Edward Miller","Richard Newcombe","Vasileios Balntas"],"pdf_url":"https://arxiv.org/pdf/2403.13064v1.pdf","comment":"see project page, https://projectaria.com/scenescript"},{"id":"http://arxiv.org/abs/2403.13044v1","updated":"2024-03-19T17:59:58Z","published":"2024-03-19T17:59:58Z","title":"Magic Fixup: Streamlining Photo Editing by Watching Dynamic Videos","summary":" We propose a generative model that, given a coarsely edited image,\nsynthesizes a photorealistic output that follows the prescribed layout. Our\nmethod transfers fine details from the original image and preserves the\nidentity of its parts. Yet, it adapts it to the lighting and context defined by\nthe new layout. Our key insight is that videos are a powerful source of\nsupervision for this task: objects and camera motions provide many observations\nof how the world changes with viewpoint, lighting, and physical interactions.\nWe construct an image dataset in which each sample is a pair of source and\ntarget frames extracted from the same video at randomly chosen time intervals.\nWe warp the source frame toward the target using two motion models that mimic\nthe expected test-time user edits. We supervise our model to translate the\nwarped image into the ground truth, starting from a pretrained diffusion model.\nOur model design explicitly enables fine detail transfer from the source frame\nto the generated image, while closely following the user-specified layout. 
We\nshow that by using simple segmentations and coarse 2D manipulations, we can\nsynthesize a photorealistic edit faithful to the user's input while addressing\nsecond-order effects like harmonizing the lighting and physical interactions\nbetween edited objects.\n","authors":["Hadi Alzayer","Zhihao Xia","Xuaner Zhang","Eli Shechtman","Jia-Bin Huang","Michael Gharbi"],"pdf_url":"https://arxiv.org/pdf/2403.13044v1.pdf","comment":"Project page: https://magic-fixup.github.io/"},{"id":"http://arxiv.org/abs/2403.13043v1","updated":"2024-03-19T17:58:39Z","published":"2024-03-19T17:58:39Z","title":"When Do We Not Need Larger Vision Models?","summary":" Scaling up the size of vision models has been the de facto standard to obtain\nmore powerful visual representations. In this work, we discuss the point beyond\nwhich larger vision models are not necessary. First, we demonstrate the power\nof Scaling on Scales (S$^2$), whereby a pre-trained and frozen smaller vision\nmodel (e.g., ViT-B or ViT-L), run over multiple image scales, can outperform\nlarger models (e.g., ViT-H or ViT-G) on classification, segmentation, depth\nestimation, Multimodal LLM (MLLM) benchmarks, and robotic manipulation.\nNotably, S$^2$ achieves state-of-the-art performance in detailed understanding\nof MLLM on the V* benchmark, surpassing models such as GPT-4V. We examine the\nconditions under which S$^2$ is a preferred scaling approach compared to\nscaling on model size. While larger models have the advantage of better\ngeneralization on hard examples, we show that features of larger vision models\ncan be well approximated by those of multi-scale smaller models. This suggests\nmost, if not all, of the representations learned by current large pre-trained\nmodels can also be obtained from multi-scale smaller models. Our results show\nthat a multi-scale smaller model has comparable learning capacity to a larger\nmodel, and pre-training smaller models with S$^2$ can match or even exceed the\nadvantage of larger models. We release a Python package that can apply S$^2$ on\nany vision model with one line of code:\nhttps://github.com/bfshi/scaling_on_scales.\n","authors":["Baifeng Shi","Ziyang Wu","Maolin Mao","Xin Wang","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2403.13043v1.pdf","comment":"Code: https://github.com/bfshi/scaling_on_scales"},{"id":"http://arxiv.org/abs/2403.13042v1","updated":"2024-03-19T17:57:09Z","published":"2024-03-19T17:57:09Z","title":"TAPTR: Tracking Any Point with Transformers as Detection","summary":" In this paper, we propose a simple and strong framework for Tracking Any\nPoint with TRansformers (TAPTR). Based on the observation that point tracking\nbears a great resemblance to object detection and tracking, we borrow designs\nfrom DETR-like algorithms to address the task of TAP. In the proposed\nframework, in each video frame, each tracking point is represented as a point\nquery, which consists of a positional part and a content part. As in DETR, each\nquery (its position and content feature) is naturally updated layer by layer.\nIts visibility is predicted by its updated content feature. Queries belonging\nto the same tracking point can exchange information through self-attention\nalong the temporal dimension. As all such operations are well-designed in\nDETR-like algorithms, the model is conceptually very simple. 
We also adopt some\nuseful designs such as cost volume from optical flow models and develop simple\ndesigns to provide long temporal information while mitigating the feature\ndrifting issue. Our framework demonstrates strong performance with\nstate-of-the-art performance on various TAP datasets with faster inference\nspeed.\n","authors":["Hongyang Li","Hao Zhang","Shilong Liu","Zhaoyang Zeng","Tianhe Ren","Feng Li","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.13042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13040v1","updated":"2024-03-19T17:35:17Z","published":"2024-03-19T17:35:17Z","title":"Physics-Guided Neural Networks for Intraventricular Vector Flow Mapping","summary":" Intraventricular vector flow mapping (iVFM) seeks to enhance and quantify\ncolor Doppler in cardiac imaging. In this study, we propose novel alternatives\nto the traditional iVFM optimization scheme by utilizing physics-informed\nneural networks (PINNs) and a physics-guided nnU-Net-based supervised approach.\nThrough rigorous evaluation on simulated color Doppler images derived from a\npatient-specific computational fluid dynamics model and in vivo Doppler\nacquisitions, both approaches demonstrate comparable reconstruction performance\nto the original iVFM algorithm. The efficiency of PINNs is boosted through\ndual-stage optimization and pre-optimized weights. On the other hand, the\nnnU-Net method excels in generalizability and real time capabilities. Notably,\nnnU-Net shows superior robustness on sparse and truncated Doppler data while\nmaintaining independence from explicit boundary conditions. Overall, our\nresults highlight the effectiveness of these methods in reconstructing\nintraventricular vector blood flow. The study also suggests potential\napplications of PINNs in ultrafast color Doppler imaging and the incorporation\nof fluid dynamics equations to derive biomarkers for cardiovascular diseases\nbased on blood flow.\n","authors":["Hang Jung Ling","Salomé Bru","Julia Puig","Florian Vixège","Simon Mendez","Franck Nicoud","Pierre-Yves Courand","Olivier Bernard","Damien Garcia"],"pdf_url":"https://arxiv.org/pdf/2403.13040v1.pdf","comment":"11 pages, submitted to IEEE TUFFC"},{"id":"http://arxiv.org/abs/2403.13039v1","updated":"2024-03-19T16:21:47Z","published":"2024-03-19T16:21:47Z","title":"Emotic Masked Autoencoder with Attention Fusion for Facial Expression\n Recognition","summary":" Facial Expression Recognition (FER) is a critical task within computer vision\nwith diverse applications across various domains. Addressing the challenge of\nlimited FER datasets, which hampers the generalization capability of expression\nrecognition models, is imperative for enhancing performance. Our paper presents\nan innovative approach integrating the MAE-Face self-supervised learning (SSL)\nmethod and Fusion Attention mechanism for expression classification,\nparticularly showcased in the 6th Affective Behavior Analysis in-the-wild\n(ABAW) competition. 
Additionally, we propose preprocessing techniques to\nemphasize essential facial features, thereby enhancing model performance on\nboth training and validation sets, notably demonstrated on the Aff-wild2\ndataset.\n","authors":["Bach Nguyen-Xuan","Thien Nguyen-Hoang","Nhu Tai-Do"],"pdf_url":"https://arxiv.org/pdf/2403.13039v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13731v1","updated":"2024-03-19T12:26:53Z","published":"2024-03-19T12:26:53Z","title":"Emotion Recognition Using Transformers with Masked Learning","summary":" In recent years, deep learning has achieved innovative advancements in\nvarious fields, including the analysis of human emotions and behaviors.\nInitiatives such as the Affective Behavior Analysis in-the-wild (ABAW)\ncompetition have been particularly instrumental in driving research in this\narea by providing diverse and challenging datasets that enable precise\nevaluation of complex emotional states. This study leverages the Vision\nTransformer (ViT) and Transformer models to focus on the estimation of\nValence-Arousal (VA), which signifies the positivity and intensity of emotions,\nrecognition of various facial expressions, and detection of Action Units (AU)\nrepresenting fundamental muscle movements. This approach transcends traditional\nConvolutional Neural Networks (CNNs) and Long Short-Term Memory (LSTM) based\nmethods, proposing a new Transformer-based framework that maximizes the\nunderstanding of temporal and spatial features. The core contributions of this\nresearch include the introduction of a learning technique through random frame\nmasking and the application of Focal loss adapted for imbalanced data,\nenhancing the accuracy and applicability of emotion and behavior analysis in\nreal-world settings. This approach is expected to contribute to the advancement\nof emotional computing and deep learning methodologies.\n","authors":["Seongjae Min","Junseok Yang","Sangjun Lim","Junyong Lee","Sangwon Lee","Sejoon Lim"],"pdf_url":"https://arxiv.org/pdf/2403.13731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13030v1","updated":"2024-03-19T04:19:11Z","published":"2024-03-19T04:19:11Z","title":"Super-High-Fidelity Image Compression via Hierarchical-ROI and Adaptive\n Quantization","summary":" Learned Image Compression (LIC) has achieved dramatic progress regarding\nobjective and subjective metrics. MSE-based models aim to improve objective\nmetrics while generative models are leveraged to improve visual quality\nmeasured by subjective metrics. However, they all suffer from blurring or\ndeformation at low bit rates, especially at below $0.2bpp$. Besides,\ndeformation on human faces and text is unacceptable for visual quality\nassessment, and the problem becomes more prominent on small faces and text. To\nsolve this problem, we combine the advantage of MSE-based models and generative\nmodels by utilizing region of interest (ROI). We propose Hierarchical-ROI\n(H-ROI), to split images into several foreground regions and one background\nregion to improve the reconstruction of regions containing faces, text, and\ncomplex textures. Further, we propose adaptive quantization by non-linear\nmapping within the channel dimension to constrain the bit rate while\nmaintaining the visual quality. 
Exhaustive experiments demonstrate that our\nmethods achieve better visual quality on small faces and text with lower bit\nrates, e.g., $0.7X$ bits of HiFiC and $0.5X$ bits of BPG.\n","authors":["Jixiang Luo","Yan Wang","Hongwei Qin"],"pdf_url":"https://arxiv.org/pdf/2403.13030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14719v1","updated":"2024-03-19T17:54:39Z","published":"2024-03-19T17:54:39Z","title":"Bypassing LLM Watermarks with Color-Aware Substitutions","summary":" Watermarking approaches are proposed to identify if text being circulated is\nhuman or large language model (LLM) generated. The state-of-the-art\nwatermarking strategy of Kirchenbauer et al. (2023a) biases the LLM to generate\nspecific (``green'') tokens. However, determining the robustness of this\nwatermarking method is an open problem. Existing attack methods fail to evade\ndetection for longer text segments. We overcome this limitation, and propose\n{\\em Self Color Testing-based Substitution (SCTS)}, the first ``color-aware''\nattack. SCTS obtains color information by strategically prompting the\nwatermarked LLM and comparing output tokens frequencies. It uses this\ninformation to determine token colors, and substitutes green tokens with\nnon-green ones. In our experiments, SCTS successfully evades watermark\ndetection using fewer number of edits than related work. Additionally, we show\nboth theoretically and empirically that SCTS can remove the watermark for\narbitrarily long watermarked text.\n","authors":["Qilong Wu","Varun Chandrasekaran"],"pdf_url":"https://arxiv.org/pdf/2403.14719v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14715v1","updated":"2024-03-19T06:46:24Z","published":"2024-03-19T06:46:24Z","title":"Understanding Why Label Smoothing Degrades Selective Classification and\n How to Fix It","summary":" Label smoothing (LS) is a popular regularisation method for training deep\nneural network classifiers due to its effectiveness in improving test accuracy\nand its simplicity in implementation. \"Hard\" one-hot labels are \"smoothed\" by\nuniformly distributing probability mass to other classes, reducing overfitting.\nIn this work, we reveal that LS negatively affects selective classification\n(SC) - where the aim is to reject misclassifications using a model's predictive\nuncertainty. We first demonstrate empirically across a range of tasks and\narchitectures that LS leads to a consistent degradation in SC. We then explain\nthis by analysing logit-level gradients, showing that LS exacerbates\noverconfidence and underconfidence by regularising the max logit more when the\nprobability of error is low, and less when the probability of error is high.\nThis elucidates previously reported experimental results where strong\nclassifiers underperform in SC. We then demonstrate the empirical effectiveness\nof logit normalisation for recovering lost SC performance caused by LS.\nFurthermore, based on our gradient analysis, we explain why such normalisation\nis effective. 
We will release our code shortly.\n","authors":["Guoxuan Xia","Olivier Laurent","Gianni Franchi","Christos-Savvas Bouganis"],"pdf_url":"https://arxiv.org/pdf/2403.14715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15463v1","updated":"2024-03-19T15:53:57Z","published":"2024-03-19T15:53:57Z","title":"Unveiling the Anomalies in an Ever-Changing World: A Benchmark for\n Pixel-Level Anomaly Detection in Continual Learning","summary":" Anomaly Detection is a relevant problem in numerous real-world applications,\nespecially when dealing with images. However, little attention has been paid to\nthe issue of changes over time in the input data distribution, which may cause\na significant decrease in performance. In this study, we investigate the\nproblem of Pixel-Level Anomaly Detection in the Continual Learning setting,\nwhere new data arrives over time and the goal is to perform well on new and old\ndata. We implement several state-of-the-art techniques to solve the Anomaly\nDetection problem in the classic setting and adapt them to work in the\nContinual Learning setting. To validate the approaches, we use a real-world\ndataset of images with pixel-based anomalies to provide a reliable benchmark\nand serve as a foundation for further advancements in the field. We provide a\ncomprehensive analysis, discussing which Anomaly Detection methods and which\nfamilies of approaches seem more suitable for the Continual Learning setting.\n","authors":["Nikola Bugarin","Jovana Bugaric","Manuel Barusco","Davide Dalle Pezze","Gian Antonio Susto"],"pdf_url":"https://arxiv.org/pdf/2403.15463v1.pdf","comment":null}]},"2024-03-20T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.13808v1","updated":"2024-03-20T17:59:58Z","published":"2024-03-20T17:59:58Z","title":"On Pretraining Data Diversity for Self-Supervised Learning","summary":" We explore the impact of training with more diverse datasets, characterized\nby the number of unique samples, on the performance of self-supervised learning\n(SSL) under a fixed computational budget. Our findings consistently demonstrate\nthat increasing pretraining data diversity enhances SSL performance, albeit\nonly when the distribution distance to the downstream data is minimal. Notably,\neven with an exceptionally large pretraining data diversity achieved through\nmethods like web crawling or diffusion-generated data, among other ways, the\ndistribution shift remains a challenge. Our experiments are comprehensive with\nseven SSL methods using large-scale datasets such as ImageNet and YFCC100M\namounting to over 200 GPU days. Code and trained models will be available at\nhttps://github.com/hammoudhasan/DiversitySSL .\n","authors":["Hasan Abed Al Kader Hammoud","Tuhin Das","Fabio Pizzati","Philip Torr","Adel Bibi","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2403.13808v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2403.13807v1","updated":"2024-03-20T17:59:57Z","published":"2024-03-20T17:59:57Z","title":"Editing Massive Concepts in Text-to-Image Diffusion Models","summary":" Text-to-image diffusion models suffer from the risk of generating outdated,\ncopyrighted, incorrect, and biased content. While previous methods have\nmitigated the issues on a small scale, it is essential to handle them\nsimultaneously in larger-scale real-world scenarios. We propose a two-stage\nmethod, Editing Massive Concepts In Diffusion Models (EMCID). 
The first stage\nperforms memory optimization for each individual concept with dual\nself-distillation from text alignment loss and diffusion noise prediction loss.\nThe second stage conducts massive concept editing with multi-layer, closed form\nmodel editing. We further propose a comprehensive benchmark, named ImageNet\nConcept Editing Benchmark (ICEB), for evaluating massive concept editing for\nT2I models with two subtasks, free-form prompts, massive concept categories,\nand extensive evaluation metrics. Extensive experiments conducted on our\nproposed benchmark and previous benchmarks demonstrate the superior scalability\nof EMCID for editing up to 1,000 concepts, providing a practical approach for\nfast adjustment and re-deployment of T2I diffusion models in real-world\napplications.\n","authors":["Tianwei Xiong","Yue Wu","Enze Xie","Yue Wu","Zhenguo Li","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2403.13807v1.pdf","comment":"Project page: https://silentview.github.io/EMCID/ . Code:\n https://github.com/SilentView/EMCID"},{"id":"http://arxiv.org/abs/2403.13805v1","updated":"2024-03-20T17:59:55Z","published":"2024-03-20T17:59:55Z","title":"RAR: Retrieving And Ranking Augmented MLLMs for Visual Recognition","summary":" CLIP (Contrastive Language-Image Pre-training) uses contrastive learning from\nnoise image-text pairs to excel at recognizing a wide array of candidates, yet\nits focus on broad associations hinders the precision in distinguishing subtle\ndifferences among fine-grained items. Conversely, Multimodal Large Language\nModels (MLLMs) excel at classifying fine-grained categories, thanks to their\nsubstantial knowledge from pre-training on web-level corpora. However, the\nperformance of MLLMs declines with an increase in category numbers, primarily\ndue to growing complexity and constraints of limited context window size. To\nsynergize the strengths of both approaches and enhance the few-shot/zero-shot\nrecognition abilities for datasets characterized by extensive and fine-grained\nvocabularies, this paper introduces RAR, a Retrieving And Ranking augmented\nmethod for MLLMs. We initially establish a multi-modal retriever based on CLIP\nto create and store explicit memory for different categories beyond the\nimmediate context window. During inference, RAR retrieves the top-k similar\nresults from the memory and uses MLLMs to rank and make the final predictions.\nOur proposed approach not only addresses the inherent limitations in\nfine-grained recognition but also preserves the model's comprehensive knowledge\nbase, significantly boosting accuracy across a range of vision-language\nrecognition tasks. Notably, our approach demonstrates a significant improvement\nin performance on 5 fine-grained visual recognition benchmarks, 11 few-shot\nimage recognition datasets, and the 2 object detection datasets under the\nzero-shot recognition setting.\n","authors":["Ziyu Liu","Zeyi Sun","Yuhang Zang","Wei Li","Pan Zhang","Xiaoyi Dong","Yuanjun Xiong","Dahua Lin","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2403.13805v1.pdf","comment":"Project: https://github.com/Liuziyu77/RAR"},{"id":"http://arxiv.org/abs/2403.13806v1","updated":"2024-03-20T17:59:55Z","published":"2024-03-20T17:59:55Z","title":"RadSplat: Radiance Field-Informed Gaussian Splatting for Robust\n Real-Time Rendering with 900+ FPS","summary":" Recent advances in view synthesis and real-time rendering have achieved\nphotorealistic quality at impressive rendering speeds. 
While Radiance\nField-based methods achieve state-of-the-art quality in challenging scenarios\nsuch as in-the-wild captures and large-scale scenes, they often suffer from\nexcessively high compute requirements linked to volumetric rendering. Gaussian\nSplatting-based methods, on the other hand, rely on rasterization and naturally\nachieve real-time rendering but suffer from brittle optimization heuristics\nthat underperform on more challenging scenes. In this work, we present\nRadSplat, a lightweight method for robust real-time rendering of complex\nscenes. Our main contributions are threefold. First, we use radiance fields as\na prior and supervision signal for optimizing point-based scene\nrepresentations, leading to improved quality and more robust optimization.\nNext, we develop a novel pruning technique reducing the overall point count\nwhile maintaining high quality, leading to smaller and more compact scene\nrepresentations with faster inference speeds. Finally, we propose a novel\ntest-time filtering approach that further accelerates rendering and allows to\nscale to larger, house-sized scenes. We find that our method enables\nstate-of-the-art synthesis of complex captures at 900+ FPS.\n","authors":["Michael Niemeyer","Fabian Manhardt","Marie-Julie Rakotosaona","Michael Oechsle","Daniel Duckworth","Rama Gosula","Keisuke Tateno","John Bates","Dominik Kaeser","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2403.13806v1.pdf","comment":"Project page at https://m-niemeyer.github.io/radsplat/"},{"id":"http://arxiv.org/abs/2403.13804v1","updated":"2024-03-20T17:59:43Z","published":"2024-03-20T17:59:43Z","title":"Learning from Models and Data for Visual Grounding","summary":" We introduce SynGround, a novel framework that combines data-driven learning\nand knowledge transfer from various large-scale pretrained models to enhance\nthe visual grounding capabilities of a pretrained vision-and-language model.\nThe knowledge transfer from the models initiates the generation of image\ndescriptions through an image description generator. These descriptions serve\ndual purposes: they act as prompts for synthesizing images through a\ntext-to-image generator, and as queries for synthesizing text, from which\nphrases are extracted using a large language model. Finally, we leverage an\nopen-vocabulary object detector to generate synthetic bounding boxes for the\nsynthetic images and texts. We finetune a pretrained vision-and-language model\non this dataset by optimizing a mask-attention consistency objective that\naligns region annotations with gradient-based model explanations. The resulting\nmodel improves the grounding capabilities of an off-the-shelf\nvision-and-language model. Particularly, SynGround improves the pointing game\naccuracy of ALBEF on the Flickr30k dataset from 79.38% to 87.26%, and on\nRefCOCO+ Test A from 69.35% to 79.06% and on RefCOCO+ Test B from 53.77% to\n63.67%.\n","authors":["Ruozhen He","Paola Cascante-Bonilla","Ziyan Yang","Alexander C. Berg","Vicente Ordonez"],"pdf_url":"https://arxiv.org/pdf/2403.13804v1.pdf","comment":"Project Page: https://catherine-r-he.github.io/SynGround/"},{"id":"http://arxiv.org/abs/2403.13803v1","updated":"2024-03-20T17:59:16Z","published":"2024-03-20T17:59:16Z","title":"Bounding Box Stability against Feature Dropout Reflects Detector\n Generalization across Environments","summary":" Bounding boxes uniquely characterize object detection, where a good detector\ngives accurate bounding boxes of categories of interest. 
However, in the\nreal-world where test ground truths are not provided, it is non-trivial to find\nout whether bounding boxes are accurate, thus preventing us from assessing the\ndetector generalization ability. In this work, we find under feature map\ndropout, good detectors tend to output bounding boxes whose locations do not\nchange much, while bounding boxes of poor detectors will undergo noticeable\nposition changes. We compute the box stability score (BoS score) to reflect\nthis stability. Specifically, given an image, we compute a normal set of\nbounding boxes and a second set after feature map dropout. To obtain BoS score,\nwe use bipartite matching to find the corresponding boxes between the two sets\nand compute the average Intersection over Union (IoU) across the entire test\nset. We contribute to finding that BoS score has a strong, positive correlation\nwith detection accuracy measured by mean average precision (mAP) under various\ntest environments. This relationship allows us to predict the accuracy of\ndetectors on various real-world test sets without accessing test ground truths,\nverified on canonical detection tasks such as vehicle detection and pedestrian\ndetection. Code and data are available at https://github.com/YangYangGirl/BoS.\n","authors":["Yang Yang","Wenhai Wang","Zhe Chen","Jifeng Dai","Liang Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.13803v1.pdf","comment":"ICLR 2024 spotlight"},{"id":"http://arxiv.org/abs/2403.13802v1","updated":"2024-03-20T17:59:14Z","published":"2024-03-20T17:59:14Z","title":"ZigMa: Zigzag Mamba Diffusion Model","summary":" The diffusion model has long been plagued by scalability and quadratic\ncomplexity issues, especially within transformer-based structures. In this\nstudy, we aim to leverage the long sequence modeling capability of a\nState-Space Model called Mamba to extend its applicability to visual data\ngeneration. Firstly, we identify a critical oversight in most current\nMamba-based vision methods, namely the lack of consideration for spatial\ncontinuity in the scan scheme of Mamba. Secondly, building upon this insight,\nwe introduce a simple, plug-and-play, zero-parameter method named Zigzag Mamba,\nwhich outperforms Mamba-based baselines and demonstrates improved speed and\nmemory utilization compared to transformer-based baselines. Lastly, we\nintegrate Zigzag Mamba with the Stochastic Interpolant framework to investigate\nthe scalability of the model on large-resolution visual datasets, such as\nFacesHQ $1024\\times 1024$ and UCF101, MultiModal-CelebA-HQ, and MS COCO\n$256\\times 256$. Code will be released at https://taohu.me/zigma/\n","authors":["Vincent Tao Hu","Stefan Andreas Baumann","Ming Gui","Olga Grebenkova","Pingchuan Ma","Johannes Fischer","Bjorn Ommer"],"pdf_url":"https://arxiv.org/pdf/2403.13802v1.pdf","comment":"Project Page: https://taohu.me/zigma/"},{"id":"http://arxiv.org/abs/2312.06644v2","updated":"2024-03-20T17:58:05Z","published":"2023-12-11T18:56:37Z","title":"AnyHome: Open-Vocabulary Generation of Structured and Textured 3D Homes","summary":" Inspired by cognitive theories, we introduce AnyHome, a framework that\ntranslates any text into well-structured and textured indoor scenes at a\nhouse-scale. By prompting Large Language Models (LLMs) with designed templates,\nour approach converts provided textual narratives into amodal structured\nrepresentations. These representations guarantee consistent and realistic\nspatial layouts by directing the synthesis of a geometry mesh within defined\nconstraints. 
A Score Distillation Sampling process is then employed to refine\nthe geometry, followed by an egocentric inpainting process that adds lifelike\ntextures to it. AnyHome stands out with its editability, customizability,\ndiversity, and realism. The structured representations for scenes allow for\nextensive editing at varying levels of granularity. Capable of interpreting\ntexts ranging from simple labels to detailed narratives, AnyHome generates\ndetailed geometries and textures that outperform existing methods in both\nquantitative and qualitative measures.\n","authors":["Rao Fu","Zehao Wen","Zichen Liu","Srinath Sridhar"],"pdf_url":"https://arxiv.org/pdf/2312.06644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13800v1","updated":"2024-03-20T17:57:02Z","published":"2024-03-20T17:57:02Z","title":"TimeRewind: Rewinding Time with Image-and-Events Video Diffusion","summary":" This paper addresses the novel challenge of ``rewinding'' time from a single\ncaptured image to recover the fleeting moments missed just before the shutter\nbutton is pressed. This problem poses a significant challenge in computer\nvision and computational photography, as it requires predicting plausible\npre-capture motion from a single static frame, an inherently ill-posed task due\nto the high degree of freedom in potential pixel movements. We overcome this\nchallenge by leveraging the emerging technology of neuromorphic event cameras,\nwhich capture motion information with high temporal resolution, and integrating\nthis data with advanced image-to-video diffusion models. Our proposed framework\nintroduces an event motion adaptor conditioned on event camera data, guiding\nthe diffusion model to generate videos that are visually coherent and\nphysically grounded in the captured events. Through extensive experimentation,\nwe demonstrate the capability of our approach to synthesize high-quality videos\nthat effectively ``rewind'' time, showcasing the potential of combining event\ncamera technology with generative models. Our work opens new avenues for\nresearch at the intersection of computer vision, computational photography, and\ngenerative modeling, offering a forward-thinking solution to capturing missed\nmoments and enhancing future consumer cameras and smartphones. Please see the\nproject page at https://timerewind.github.io/ for video results and code\nrelease.\n","authors":["Jingxi Chen","Brandon Y. Feng","Haoming Cai","Mingyang Xie","Christopher Metzler","Cornelia Fermuller","Yiannis Aloimonos"],"pdf_url":"https://arxiv.org/pdf/2403.13800v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13798v1","updated":"2024-03-20T17:55:21Z","published":"2024-03-20T17:55:21Z","title":"Hierarchical NeuroSymbolic Approach for Action Quality Assessment","summary":" Action quality assessment (AQA) applies computer vision to quantitatively\nassess the performance or execution of a human action. Current AQA approaches\nare end-to-end neural models, which lack transparency and tend to be biased\nbecause they are trained on subjective human judgements as ground-truth. To\naddress these issues, we introduce a neuro-symbolic paradigm for AQA, which\nuses neural networks to abstract interpretable symbols from video data and\nmakes quality assessments by applying rules to those symbols. We take diving as\nthe case study. We found that domain experts prefer our system and find it more\ninformative than purely neural approaches to AQA in diving. 
Our system also\nachieves state-of-the-art action recognition and temporal segmentation, and\nautomatically generates a detailed report that breaks the dive down into its\nelements and provides objective scoring with visual evidence. As verified by a\ngroup of domain experts, this report may be used to assist judges in scoring,\nhelp train judges, and provide feedback to divers. We will open-source all of\nour annotated training data and code for ease of reproducibility.\n","authors":["Lauren Okamoto","Paritosh Parmar"],"pdf_url":"https://arxiv.org/pdf/2403.13798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13797v1","updated":"2024-03-20T17:54:58Z","published":"2024-03-20T17:54:58Z","title":"Bridge the Modality and Capacity Gaps in Vision-Language Model Selection","summary":" Vision Language Models (VLMs) excel in zero-shot image classification by\npairing images with textual category names. The expanding variety of\nPre-Trained VLMs enhances the likelihood of identifying a suitable VLM for\nspecific tasks. Thus, a promising zero-shot image classification strategy is\nselecting the most appropriate Pre-Trained VLM from the VLM Zoo, relying solely\non the text data of the target dataset without access to the dataset's images.\nIn this paper, we analyze two inherent challenges in assessing the ability of a\nVLM in this Language-Only VLM selection: the \"Modality Gap\" -- the disparity in\nVLM's embeddings across two different modalities, making text a less reliable\nsubstitute for images; and the \"Capability Gap\" -- the discrepancy between the\nVLM's overall ranking and its ranking for target dataset, hindering direct\nprediction of a model's dataset-specific performance from its general\nperformance. We propose VLM Selection With gAp Bridging (SWAB) to mitigate the\nnegative impact of these two gaps. SWAB first adopts optimal transport to\ncapture the relevance between open-source datasets and target dataset with a\ntransportation matrix. It then uses this matrix to transfer useful statistics\nof VLMs from open-source datasets to the target dataset for bridging those two\ngaps and enhancing the VLM's capacity estimation for VLM selection. Experiments\nacross various VLMs and image classification datasets validate SWAB's\neffectiveness.\n","authors":["Chao Yi","De-Chuan Zhan","Han-Jia Ye"],"pdf_url":"https://arxiv.org/pdf/2403.13797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13788v1","updated":"2024-03-20T17:51:53Z","published":"2024-03-20T17:51:53Z","title":"DepthFM: Fast Monocular Depth Estimation with Flow Matching","summary":" Monocular depth estimation is crucial for numerous downstream vision tasks\nand applications. Current discriminative approaches to this problem are limited\ndue to blurry artifacts, while state-of-the-art generative methods suffer from\nslow sampling due to their SDE nature. Rather than starting from noise, we seek\na direct mapping from input image to depth map. We observe that this can be\neffectively framed using flow matching, since its straight trajectories through\nsolution space offer efficiency and high quality. Our study demonstrates that a\npre-trained image diffusion model can serve as an adequate prior for a flow\nmatching depth model, allowing efficient training on only synthetic data to\ngeneralize to real images. We find that an auxiliary surface normals loss\nfurther improves the depth estimates. Due to the generative nature of our\napproach, our model reliably predicts the confidence of its depth estimates. 
On\nstandard benchmarks of complex natural scenes, our lightweight approach\nexhibits state-of-the-art performance at favorable low computational cost\ndespite only being trained on little synthetic data.\n","authors":["Ming Gui","Johannes S. Fischer","Ulrich Prestel","Pingchuan Ma","Dmytro Kotovenko","Olga Grebenkova","Stefan Andreas Baumann","Vincent Tao Hu","Björn Ommer"],"pdf_url":"https://arxiv.org/pdf/2403.13788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13778v1","updated":"2024-03-20T17:41:35Z","published":"2024-03-20T17:41:35Z","title":"Certified Human Trajectory Prediction","summary":" Trajectory prediction plays an essential role in autonomous vehicles. While\nnumerous strategies have been developed to enhance the robustness of trajectory\nprediction models, these methods are predominantly heuristic and do not offer\nguaranteed robustness against adversarial attacks and noisy observations. In\nthis work, we propose a certification approach tailored for the task of\ntrajectory prediction. To this end, we address the inherent challenges\nassociated with trajectory prediction, including unbounded outputs and\nmulti-modality, resulting in a model that provides guaranteed robustness.\nFurthermore, we integrate a denoiser into our method to further improve the\nperformance. Through comprehensive evaluations, we demonstrate the\neffectiveness of the proposed technique across various baselines and using\nstandard trajectory prediction datasets. The code will be made available\nonline: https://s-attack.github.io/\n","authors":["Mohammadhossein Bahari","Saeed Saadatnejad","Amirhossein Asgari Farsangi","Seyed-Mohsen Moosavi-Dezfooli","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2403.13778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09368v2","updated":"2024-03-20T17:36:35Z","published":"2024-02-14T18:13:51Z","title":"Magic-Me: Identity-Specific Video Customized Diffusion","summary":" Creating content with specified identities (ID) has attracted significant\ninterest in the field of generative models. In the field of text-to-image\ngeneration (T2I), subject-driven creation has achieved great progress with the\nidentity controlled via reference images. However, its extension to video\ngeneration is not well explored. In this work, we propose a simple yet\neffective subject identity controllable video generation framework, termed\nVideo Custom Diffusion (VCD). With a specified identity defined by a few\nimages, VCD reinforces the identity characteristics and injects frame-wise\ncorrelation at the initialization stage for stable video outputs. To achieve\nthis, we propose three novel components that are essential for high-quality\nidentity preservation and stable video generation: 1) a noise initialization\nmethod with 3D Gaussian Noise Prior for better inter-frame stability; 2) an ID\nmodule based on extended Textual Inversion trained with the cropped identity to\ndisentangle the ID information from the background; 3) Face VCD and Tiled VCD\nmodules to reinforce faces and upscale the video to higher resolution while\npreserving the identity's features. We conducted extensive experiments to\nverify that VCD is able to generate stable videos with better ID over the\nbaselines. Besides, with the transferability of the encoded identity in the ID\nmodule, VCD also works well with personalized text-to-image models\navailable publicly. 
The codes are available at\nhttps://github.com/Zhen-Dong/Magic-Me.\n","authors":["Ze Ma","Daquan Zhou","Chun-Hsiao Yeh","Xue-She Wang","Xiuyu Li","Huanrui Yang","Zhen Dong","Kurt Keutzer","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2402.09368v2.pdf","comment":"Project Page at https://magic-me-webpage.github.io"},{"id":"http://arxiv.org/abs/2403.11085v2","updated":"2024-03-20T17:35:15Z","published":"2024-03-17T04:36:18Z","title":"m&m's: A Benchmark to Evaluate Tool-Use for multi-step multi-modal Tasks","summary":" Real-world multi-modal problems are rarely solved by a single machine\nlearning model, and often require multi-step computational plans that involve\nstitching several models. Tool-augmented LLMs hold tremendous promise for\nautomating the generation of such computational plans. However, the lack of\nstandardized benchmarks for evaluating LLMs as planners for multi-step\nmulti-modal tasks has prevented a systematic study of planner design decisions.\nShould LLMs generate a full plan in a single shot or step-by-step? Should they\ninvoke tools directly with Python code or through structured data formats like\nJSON? Does feedback improve planning? To answer these questions and more, we\nintroduce m&m's: a benchmark containing 4K+ multi-step multi-modal tasks\ninvolving 33 tools that include multi-modal models, (free) public APIs, and\nimage processing modules. For each of these task queries, we provide\nautomatically generated plans using this realistic toolset. We further provide\na high-quality subset of 1,565 task plans that are human-verified and correctly\nexecutable. With m&m's, we evaluate 6 popular LLMs with 2 planning strategies\n(multi-step vs. step-by-step planning), 2 plan formats (JSON vs. code), and 3\ntypes of feedback (parsing/verification/execution). Finally, we summarize\ntakeaways from our extensive experiments. Our dataset and code are available on\nHuggingFace (https://huggingface.co/datasets/zixianma/mnms) and Github\n(https://github.com/RAIVNLab/mnms).\n","authors":["Zixian Ma","Weikai Huang","Jieyu Zhang","Tanmay Gupta","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2403.11085v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13771v1","updated":"2024-03-20T17:33:02Z","published":"2024-03-20T17:33:02Z","title":"Describe-and-Dissect: Interpreting Neurons in Vision Networks with\n Language Models","summary":" In this paper, we propose Describe-and-Dissect (DnD), a novel method to\ndescribe the roles of hidden neurons in vision networks. DnD utilizes recent\nadvancements in multimodal deep learning to produce complex natural language\ndescriptions, without the need for labeled training data or a predefined set of\nconcepts to choose from. Additionally, DnD is training-free, meaning we don't\ntrain any new models and can easily leverage more capable general purpose\nmodels in the future. We have conducted extensive qualitative and quantitative\nanalysis to show that DnD outperforms prior work by providing higher quality\nneuron descriptions. Specifically, our method on average provides the highest\nquality labels and is more than 2 times as likely to be selected as the best\nexplanation for a neuron than the best baseline.\n","authors":["Nicholas Bai","Rahul A. 
Iyer","Tuomas Oikarinen","Tsui-Wei Weng"],"pdf_url":"https://arxiv.org/pdf/2403.13771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13765v1","updated":"2024-03-20T17:28:17Z","published":"2024-03-20T17:28:17Z","title":"Towards Principled Representation Learning from Videos for Reinforcement\n Learning","summary":" We study pre-training representations for decision-making using video data,\nwhich is abundantly available for tasks such as game agents and software\ntesting. Even though significant empirical advances have been made on this\nproblem, a theoretical understanding remains absent. We initiate the\ntheoretical investigation into principled approaches for representation\nlearning and focus on learning the latent state representations of the\nunderlying MDP using video data. We study two types of settings: one where\nthere is iid noise in the observation, and a more challenging setting where\nthere is also the presence of exogenous noise, which is non-iid noise that is\ntemporally correlated, such as the motion of people or cars in the background.\nWe study three commonly used approaches: autoencoding, temporal contrastive\nlearning, and forward modeling. We prove upper bounds for temporal contrastive\nlearning and forward modeling in the presence of only iid noise. We show that\nthese approaches can learn the latent state and use it to do efficient\ndownstream RL with polynomial sample complexity. When exogenous noise is also\npresent, we establish a lower bound result showing that the sample complexity\nof learning from video data can be exponentially worse than learning from\naction-labeled trajectory data. This partially explains why reinforcement\nlearning with video pre-training is hard. We evaluate these representational\nlearning methods in two visual domains, yielding results that are consistent\nwith our theoretical findings.\n","authors":["Dipendra Misra","Akanksha Saran","Tengyang Xie","Alex Lamb","John Langford"],"pdf_url":"https://arxiv.org/pdf/2403.13765v1.pdf","comment":"ICLR 2024 Spotlight Conference Paper"},{"id":"http://arxiv.org/abs/2312.00651v2","updated":"2024-03-20T17:28:02Z","published":"2023-12-01T15:24:38Z","title":"TrackDiffusion: Tracklet-Conditioned Video Generation via Diffusion\n Models","summary":" Despite remarkable achievements in video synthesis, achieving granular\ncontrol over complex dynamics, such as nuanced movement among multiple\ninteracting objects, still presents a significant hurdle for dynamic world\nmodeling, compounded by the necessity to manage appearance and disappearance,\ndrastic scale changes, and ensure consistency for instances across frames.\nThese challenges hinder the development of video generation that can faithfully\nmimic real-world complexity, limiting utility for applications requiring\nhigh-level realism and controllability, including advanced scene simulation and\ntraining of perception systems. To address that, we propose TrackDiffusion, a\nnovel video generation framework affording fine-grained trajectory-conditioned\nmotion control via diffusion models, which facilitates the precise manipulation\nof the object trajectories and interactions, overcoming the prevalent\nlimitation of scale and continuity disruptions. A pivotal component of\nTrackDiffusion is the instance enhancer, which explicitly ensures inter-frame\nconsistency of multiple objects, a critical factor overlooked in the current\nliterature. 
Moreover, we demonstrate that generated video sequences by our\nTrackDiffusion can be used as training data for visual perception models. To\nthe best of our knowledge, this is the first work to apply video diffusion\nmodels with tracklet conditions and demonstrate that generated frames can be\nbeneficial for improving the performance of object trackers.\n","authors":["Pengxiang Li","Kai Chen","Zhili Liu","Ruiyuan Gao","Lanqing Hong","Guo Zhou","Hua Yao","Dit-Yan Yeung","Huchuan Lu","Xu Jia"],"pdf_url":"https://arxiv.org/pdf/2312.00651v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13763v1","updated":"2024-03-20T17:26:22Z","published":"2024-03-20T17:26:22Z","title":"Practical End-to-End Optical Music Recognition for Pianoform Music","summary":" The majority of recent progress in Optical Music Recognition (OMR) has been\nachieved with Deep Learning methods, especially models following the end-to-end\nparadigm, reading input images and producing a linear sequence of tokens.\nUnfortunately, many music scores, especially piano music, cannot be easily\nconverted to a linear sequence. This has led OMR researchers to use custom\nlinearized encodings, instead of broadly accepted structured formats for music\nnotation. Their diversity makes it difficult to compare the performance of OMR\nsystems directly. To bring recent OMR model progress closer to useful results:\n(a) We define a sequential format called Linearized MusicXML, allowing to train\nan end-to-end model directly and maintaining close cohesion and compatibility\nwith the industry-standard MusicXML format. (b) We create a dev and test set\nfor benchmarking typeset OMR with MusicXML ground truth based on the OpenScore\nLieder corpus. They contain 1,438 and 1,493 pianoform systems, each with an\nimage from IMSLP. (c) We train and fine-tune an end-to-end model to serve as a\nbaseline on the dataset and employ the TEDn metric to evaluate the model. We\nalso test our model against the recently published synthetic pianoform dataset\nGrandStaff and surpass the state-of-the-art results.\n","authors":["Jiří Mayer","Milan Straka","Jan Hajič jr.","Pavel Pecina"],"pdf_url":"https://arxiv.org/pdf/2403.13763v1.pdf","comment":"15+4 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.13761v1","updated":"2024-03-20T17:20:48Z","published":"2024-03-20T17:20:48Z","title":"HierCode: A Lightweight Hierarchical Codebook for Zero-shot Chinese Text\n Recognition","summary":" Text recognition, especially for complex scripts like Chinese, faces unique\nchallenges due to its intricate character structures and vast vocabulary.\nTraditional one-hot encoding methods struggle with the representation of\nhierarchical radicals, recognition of Out-Of-Vocabulary (OOV) characters, and\non-device deployment due to their computational intensity. To address these\nchallenges, we propose HierCode, a novel and lightweight codebook that exploits\nthe innate hierarchical nature of Chinese characters. HierCode employs a\nmulti-hot encoding strategy, leveraging hierarchical binary tree encoding and\nprototype learning to create distinctive, informative representations for each\ncharacter. This approach not only facilitates zero-shot recognition of OOV\ncharacters by utilizing shared radicals and structures but also excels in\nline-level recognition tasks by computing similarity with visual features, a\nnotable advantage over existing methods. 
Extensive experiments across diverse\nbenchmarks, including handwritten, scene, document, web, and ancient text, have\nshowcased HierCode's superiority for both conventional and zero-shot Chinese\ncharacter or text recognition, exhibiting state-of-the-art performance with\nsignificantly fewer parameters and fast inference speed.\n","authors":["Yuyi Zhang","Yuanzhi Zhu","Dezhi Peng","Peirong Zhang","Zhenhua Yang","Zhibo Yang","Cong Yao","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2403.13761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13762v1","updated":"2024-03-20T17:20:48Z","published":"2024-03-20T17:20:48Z","title":"When Cars meet Drones: Hyperbolic Federated Learning for Source-Free\n Domain Adaptation in Adverse Weather","summary":" In Federated Learning (FL), multiple clients collaboratively train a global\nmodel without sharing private data. In semantic segmentation, the Federated\nsource Free Domain Adaptation (FFreeDA) setting is of particular interest,\nwhere clients undergo unsupervised training after supervised pretraining at the\nserver side. While few recent works address FL for autonomous vehicles,\nintrinsic real-world challenges such as the presence of adverse weather\nconditions and the existence of different autonomous agents are still\nunexplored. To bridge this gap, we address both problems and introduce a new\nfederated semantic segmentation setting where both car and drone clients\nco-exist and collaborate. Specifically, we propose a novel approach for this\nsetting which exploits a batch-norm weather-aware strategy to dynamically adapt\nthe model to the different weather conditions, while hyperbolic space\nprototypes are used to align the heterogeneous client representations. Finally,\nwe introduce FLYAWARE, the first semantic segmentation dataset with adverse\nweather data for aerial vehicles.\n","authors":["Giulia Rizzoli","Matteo Caligiuri","Donald Shenaj","Francesco Barbato","Pietro Zanuttigh"],"pdf_url":"https://arxiv.org/pdf/2403.13762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16355v3","updated":"2024-03-20T17:13:53Z","published":"2024-01-29T17:59:19Z","title":"PathMMU: A Massive Multimodal Expert-Level Benchmark for Understanding\n and Reasoning in Pathology","summary":" The emergence of large multimodal models has unlocked remarkable potential in\nAI, particularly in pathology. However, the lack of specialized, high-quality\nbenchmark impeded their development and precise evaluation. To address this, we\nintroduce PathMMU, the largest and highest-quality expert-validated pathology\nbenchmark for Large Multimodal Models (LMMs). It comprises 33,428 multimodal\nmulti-choice questions and 24,067 images from various sources, each accompanied\nby an explanation for the correct answer. The construction of PathMMU harnesses\nGPT-4V's advanced capabilities, utilizing over 30,000 image-caption pairs to\nenrich captions and generate corresponding Q&As in a cascading process.\nSignificantly, to maximize PathMMU's authority, we invite seven pathologists to\nscrutinize each question under strict standards in PathMMU's validation and\ntest sets, while simultaneously setting an expert-level performance benchmark\nfor PathMMU. We conduct extensive evaluations, including zero-shot assessments\nof 14 open-sourced and 4 closed-sourced LMMs and their robustness to image\ncorruption. We also fine-tune representative LMMs to assess their adaptability\nto PathMMU. 
The empirical findings indicate that advanced LMMs struggle with\nthe challenging PathMMU benchmark, with the top-performing LMM, GPT-4V,\nachieving only a 49.8% zero-shot performance, significantly lower than the\n71.8% demonstrated by human pathologists. After fine-tuning, significantly\nsmaller open-sourced LMMs can outperform GPT-4V but still fall short of the\nexpertise shown by pathologists. We hope that the PathMMU will offer valuable\ninsights and foster the development of more specialized, next-generation LMMs\nfor pathology.\n","authors":["Yuxuan Sun","Hao Wu","Chenglu Zhu","Sunyi Zheng","Qizi Chen","Kai Zhang","Yunlong Zhang","Dan Wan","Xiaoxiao Lan","Mengyue Zheng","Jingxiong Li","Xinheng Lyu","Tao Lin","Lin Yang"],"pdf_url":"https://arxiv.org/pdf/2401.16355v3.pdf","comment":"27 pages, 12 figures"},{"id":"http://arxiv.org/abs/2403.13756v1","updated":"2024-03-20T17:03:38Z","published":"2024-03-20T17:03:38Z","title":"Enhancing Gait Video Analysis in Neurodegenerative Diseases by Knowledge\n Augmentation in Vision Language Model","summary":" We present a knowledge augmentation strategy for assessing the diagnostic\ngroups and gait impairment from monocular gait videos. Based on a large-scale\npre-trained Vision Language Model (VLM), our model learns and improves visual,\ntextual, and numerical representations of patient gait videos, through a\ncollective learning across three distinct modalities: gait videos,\nclass-specific descriptions, and numerical gait parameters. Our specific\ncontributions are two-fold: First, we adopt a knowledge-aware prompt tuning\nstrategy to utilize the class-specific medical description in guiding the text\nprompt learning. Second, we integrate the paired gait parameters in the form of\nnumerical texts to enhance the numeracy of the textual representation. Results\ndemonstrate that our model not only significantly outperforms state-of-the-art\n(SOTA) in video-based classification tasks but also adeptly decodes the learned\nclass-specific text features into natural language descriptions using the\nvocabulary of quantitative gait parameters. The code and the model will be made\navailable at our project page.\n","authors":["Diwei Wang","Kun Yuan","Candice Muller","Frédéric Blanc","Nicolas Padoy","Hyewon Seo"],"pdf_url":"https://arxiv.org/pdf/2403.13756v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13747v1","updated":"2024-03-20T16:54:55Z","published":"2024-03-20T16:54:55Z","title":"Leveraging High-Resolution Features for Improved Deep Hashing-based\n Image Retrieval","summary":" Deep hashing techniques have emerged as the predominant approach for\nefficient image retrieval. Traditionally, these methods utilize pre-trained\nconvolutional neural networks (CNNs) such as AlexNet and VGG-16 as feature\nextractors. However, the increasing complexity of datasets poses challenges for\nthese backbone architectures in capturing meaningful features essential for\neffective image retrieval. In this study, we explore the efficacy of employing\nhigh-resolution features learned through state-of-the-art techniques for image\nretrieval tasks. Specifically, we propose a novel methodology that utilizes\nHigh-Resolution Networks (HRNets) as the backbone for the deep hashing task,\ntermed High-Resolution Hashing Network (HHNet). Our approach demonstrates\nsuperior performance compared to existing methods across all tested benchmark\ndatasets, including CIFAR-10, NUS-WIDE, MS COCO, and ImageNet. 
This performance\nimprovement is more pronounced for complex datasets, which highlights the need\nto learn high-resolution features for intricate image retrieval tasks.\nFurthermore, we conduct a comprehensive analysis of different HRNet\nconfigurations and provide insights into the optimal architecture for the deep\nhashing task.\n","authors":["Aymene Berriche","Mehdi Adjal Zakaria","Riyadh Baghdadi"],"pdf_url":"https://arxiv.org/pdf/2403.13747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13745v1","updated":"2024-03-20T16:53:45Z","published":"2024-03-20T16:53:45Z","title":"Be-Your-Outpainter: Mastering Video Outpainting through Input-Specific\n Adaptation","summary":" Video outpainting is a challenging task, aiming at generating video content\noutside the viewport of the input video while maintaining inter-frame and\nintra-frame consistency. Existing methods fall short in either generation\nquality or flexibility. We introduce MOTIA (Mastering Video Outpainting Through\nInput-Specific Adaptation), a diffusion-based pipeline that leverages both the\nintrinsic data-specific patterns of the source video and the image/video\ngenerative prior for effective outpainting. MOTIA comprises two main phases:\ninput-specific adaptation and pattern-aware outpainting. The input-specific\nadaptation phase involves conducting efficient and effective pseudo outpainting\nlearning on the single-shot source video. This process encourages the model to\nidentify and learn patterns within the source video, as well as bridging the\ngap between standard generative processes and outpainting. The subsequent\nphase, pattern-aware outpainting, is dedicated to the generalization of these\nlearned patterns to generate outpainting outcomes. Additional strategies\nincluding spatial-aware insertion and noise travel are proposed to better\nleverage the diffusion model's generative prior and the acquired video patterns\nfrom source videos. Extensive evaluations underscore MOTIA's superiority,\noutperforming existing state-of-the-art methods in widely recognized\nbenchmarks. Notably, these advancements are achieved without necessitating\nextensive, task-specific tuning.\n","authors":["Fu-Yun Wang","Xiaoshi Wu","Zhaoyang Huang","Xiaoyu Shi","Dazhong Shen","Guanglu Song","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2403.13745v1.pdf","comment":"Code will be available at https://github.com/G-U-N/Be-Your-Outpainter"},{"id":"http://arxiv.org/abs/2302.05666v5","updated":"2024-03-20T16:50:25Z","published":"2023-02-11T11:56:06Z","title":"Jaccard Metric Losses: Optimizing the Jaccard Index with Soft Labels","summary":" Intersection over Union (IoU) losses are surrogates that directly optimize\nthe Jaccard index. Leveraging IoU losses as part of the loss function has\ndemonstrated superior performance in semantic segmentation tasks compared to\noptimizing pixel-wise losses such as the cross-entropy loss alone. However, we\nidentify a lack of flexibility in these losses to support vital training\ntechniques like label smoothing, knowledge distillation, and semi-supervised\nlearning, mainly due to their inability to process soft labels. To address\nthis, we introduce Jaccard Metric Losses (JMLs), which are identical to the\nsoft Jaccard loss in standard settings with hard labels but are fully\ncompatible with soft labels. 
We apply JMLs to three prominent use cases of soft\nlabels: label smoothing, knowledge distillation and semi-supervised learning,\nand demonstrate their potential to enhance model accuracy and calibration. Our\nexperiments show consistent improvements over the cross-entropy loss across 4\nsemantic segmentation datasets (Cityscapes, PASCAL VOC, ADE20K, DeepGlobe Land)\nand 13 architectures, including classic CNNs and recent vision transformers.\nRemarkably, our straightforward approach significantly outperforms\nstate-of-the-art knowledge distillation and semi-supervised learning methods.\nThe code is available at\n\\href{https://github.com/zifuwanggg/JDTLosses}{https://github.com/zifuwanggg/JDTLosses}.\n","authors":["Zifu Wang","Xuefei Ning","Matthew B. Blaschko"],"pdf_url":"https://arxiv.org/pdf/2302.05666v5.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.18561v2","updated":"2024-03-20T16:27:53Z","published":"2023-11-30T13:53:50Z","title":"Periodic Vibration Gaussian: Dynamic Urban Scene Reconstruction and\n Real-time Rendering","summary":" Modeling dynamic, large-scale urban scenes is challenging due to their highly\nintricate geometric structures and unconstrained dynamics in both space and\ntime. Prior methods often employ high-level architectural priors, separating\nstatic and dynamic elements, resulting in suboptimal capture of their\nsynergistic interactions. To address this challenge, we present a unified\nrepresentation model, called Periodic Vibration Gaussian (PVG). PVG builds upon\nthe efficient 3D Gaussian splatting technique, originally designed for static\nscene representation, by introducing periodic vibration-based temporal\ndynamics. This innovation enables PVG to elegantly and uniformly represent the\ncharacteristics of various objects and elements in dynamic urban scenes. To\nenhance temporally coherent and large scene representation learning with sparse\ntraining data, we introduce a novel temporal smoothing mechanism and a\nposition-aware adaptive control strategy respectively. Extensive experiments on\nWaymo Open Dataset and KITTI benchmarks demonstrate that PVG surpasses\nstate-of-the-art alternatives in both reconstruction and novel view synthesis\nfor both dynamic and static scenes. Notably, PVG achieves this without relying\non manually labeled object bounding boxes or expensive optical flow estimation.\nMoreover, PVG exhibits 900-fold acceleration in rendering over the best\nalternative.\n","authors":["Yurui Chen","Chun Gu","Junzhe Jiang","Xiatian Zhu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.18561v2.pdf","comment":"Project page: https://fudan-zvg.github.io/PVG/"},{"id":"http://arxiv.org/abs/2310.13805v2","updated":"2024-03-20T16:23:20Z","published":"2023-10-20T20:32:43Z","title":"Normalizing flow-based deep variational Bayesian network for seismic\n multi-hazards and impacts estimation from InSAR imagery","summary":" Onsite disasters like earthquakes can trigger cascading hazards and impacts,\nsuch as landslides and infrastructure damage, leading to catastrophic losses;\nthus, rapid and accurate estimates are crucial for timely and effective\npost-disaster responses. Interferometric Synthetic aperture radar (InSAR) data\nis important in providing high-resolution onsite information for rapid hazard\nestimation. 
Most recent methods using InSAR imagery signals predict a single\ntype of hazard and thus often suffer low accuracy due to noisy and complex\nsignals induced by co-located hazards, impacts, and irrelevant environmental\nchanges (e.g., vegetation changes, human activities). We introduce a novel\nstochastic variational inference with normalizing flows derived to jointly\napproximate posteriors of multiple unobserved hazards and impacts from noisy\nInSAR imagery.\n","authors":["Xuechun Li","Paula M. Burgi","Wei Ma","Hae Young Noh","David J. Wald","Susu Xu"],"pdf_url":"https://arxiv.org/pdf/2310.13805v2.pdf","comment":"This paper needs to be reviewed by the USGS"},{"id":"http://arxiv.org/abs/2303.17783v5","updated":"2024-03-20T16:21:33Z","published":"2023-03-31T03:14:44Z","title":"Uncertainty-Aware Source-Free Adaptive Image Super-Resolution with\n Wavelet Augmentation Transformer","summary":" Unsupervised Domain Adaptation (UDA) can effectively address domain gap\nissues in real-world image Super-Resolution (SR) by accessing both the source\nand target data. Considering privacy policies or transmission restrictions of\nsource data in practical scenarios, we propose a SOurce-free Domain Adaptation\nframework for image SR (SODA-SR) to address this issue, i.e., adapt a\nsource-trained model to a target domain with only unlabeled target data.\nSODA-SR leverages the source-trained model to generate refined pseudo-labels\nfor teacher-student learning. To better utilize pseudo-labels, we propose a\nnovel wavelet-based augmentation method, named Wavelet Augmentation Transformer\n(WAT), which can be flexibly incorporated with existing networks, to implicitly\nproduce useful augmented data. WAT learns low-frequency information of varying\nlevels across diverse samples, which is aggregated efficiently via deformable\nattention. Furthermore, an uncertainty-aware self-training mechanism is\nproposed to improve the accuracy of pseudo-labels, with inaccurate predictions\nbeing rectified by uncertainty estimation. To acquire better SR results and\navoid overfitting pseudo-labels, several regularization losses are proposed to\nconstrain target LR and SR images in the frequency domain. Experiments show\nthat without accessing source data, SODA-SR outperforms state-of-the-art UDA\nmethods in both synthetic$\\rightarrow$real and real$\\rightarrow$real adaptation\nsettings, and is not constrained by specific network architectures.\n","authors":["Yuang Ai","Xiaoqiang Zhou","Huaibo Huang","Lei Zhang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2303.17783v5.pdf","comment":"11 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.13714v1","updated":"2024-03-20T16:20:54Z","published":"2024-03-20T16:20:54Z","title":"DBA-Fusion: Tightly Integrating Deep Dense Visual Bundle Adjustment with\n Multiple Sensors for Large-Scale Localization and Mapping","summary":" Visual simultaneous localization and mapping (VSLAM) has broad applications,\nwith state-of-the-art methods leveraging deep neural networks for better\nrobustness and applicability. However, there is a lack of research in fusing\nthese learning-based methods with multi-sensor information, which could be\nindispensable to push related applications to large-scale and complex\nscenarios. In this paper, we tightly integrate the trainable deep dense bundle\nadjustment (DBA) with multi-sensor information through a factor graph. In the\nframework, recurrent optical flow and DBA are performed among sequential\nimages. 
The Hessian information derived from DBA is fed into a generic factor\ngraph for multi-sensor fusion, which employs a sliding window and supports\nprobabilistic marginalization. A pipeline for visual-inertial integration is\nfirstly developed, which provides the minimum ability of metric-scale\nlocalization and mapping. Furthermore, other sensors (e.g., global navigation\nsatellite system) are integrated for driftless and geo-referencing\nfunctionality. Extensive tests are conducted on both public datasets and\nself-collected datasets. The results validate the superior localization\nperformance of our approach, which enables real-time dense mapping in\nlarge-scale environments. The code has been made open-source\n(https://github.com/GREAT-WHU/DBA-Fusion).\n","authors":["Yuxuan Zhou","Xingxing Li","Shengyu Li","Xuanbin Wang","Shaoquan Feng","Yuxuan Tan"],"pdf_url":"https://arxiv.org/pdf/2403.13714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09988v2","updated":"2024-03-20T16:19:49Z","published":"2023-12-15T18:01:47Z","title":"Towards Architecture-Agnostic Untrained Network Priors for Image\n Reconstruction with Frequency Regularization","summary":" Untrained networks inspired by deep image prior have shown promising\ncapabilities in recovering a high-quality image from noisy or partial\nmeasurements, without requiring training data. Their success has been widely\nattributed to the spectral bias acting as an implicit regularization induced by\nsuitable network architectures. However, applications of such network-based\npriors often entail superfluous architectural decisions, overfitting risks, and\nslow optimization, all of which hinder their practicality. In this work, we\npropose efficient, architecture-agnostic methods for a more direct frequency\ncontrol over the network priors: 1) constraining the bandwidth of the\nwhite-noise input, 2) controlling the bandwidth of the interpolation-based\nupsamplers, and 3) regularizing the Lipschitz constants of the layers. We show\nthat even with just one extra line of code, the overfitting issues in\nunderperforming architectures can be alleviated such that their performance\ngaps with the high-performing counterparts can be largely closed despite their\ndistinct configurations, mitigating the need for architecture tuning. This then\nmakes it possible to employ a more compact model to achieve similar or superior\nperformance to larger models with greater efficiency. Our regularized network\npriors compare favorably with current supervised and self-supervised methods on\nMRI reconstruction and image inpainting tasks, serving as a stronger zero-shot\nbaseline reconstructor. Our code will be made publicly available.\n","authors":["Yilin Liu","Yunkui Pang","Jiang Li","Yong Chen","Pew-Thian Yap"],"pdf_url":"https://arxiv.org/pdf/2312.09988v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18649v2","updated":"2024-03-20T16:18:09Z","published":"2023-11-30T15:57:34Z","title":"Simple Semantic-Aided Few-Shot Learning","summary":" Learning from a limited amount of data, namely Few-Shot Learning, stands out\nas a challenging computer vision task. Several works exploit semantics and\ndesign complicated semantic fusion mechanisms to compensate for rare\nrepresentative features within restricted data. However, relying on naive\nsemantics such as class names introduces biases due to their brevity, while\nacquiring extensive semantics from external knowledge takes a huge time and\neffort. 
This limitation severely constrains the potential of semantics in\nfew-shot learning. In this paper, we design an automatic way called Semantic\nEvolution to generate high-quality semantics. The incorporation of high-quality\nsemantics alleviates the need for complex network structures and learning\nalgorithms used in previous works. Hence, we employ a simple two-layer network\ntermed Semantic Alignment Network to transform semantics and visual features\ninto robust class prototypes with rich discriminative features for few-shot\nclassification. The experimental results show our framework outperforms all\nprevious methods on six benchmarks, demonstrating a simple network with\nhigh-quality semantics can beat intricate multi-modal modules on few-shot\nclassification tasks. Code is available at\nhttps://github.com/zhangdoudou123/SemFew.\n","authors":["Hai Zhang","Junzhe Xu","Shanlin Jiang","Zhenan He"],"pdf_url":"https://arxiv.org/pdf/2311.18649v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2309.07915v3","updated":"2024-03-20T16:17:02Z","published":"2023-09-14T17:59:17Z","title":"MMICL: Empowering Vision-language Model with Multi-Modal In-Context\n Learning","summary":" Since the resurgence of deep learning, vision-language models (VLMs) enhanced\nby large language models (LLMs) have grown exponentially in popularity.\nHowever, while LLMs can utilize extensive background knowledge and task\ninformation with in-context learning, most VLMs still struggle with\nunderstanding complex multi-modal prompts with multiple images, making VLMs\nless effective in downstream vision-language tasks. In this paper, we address\nthe limitation above by 1) introducing vision-language Model with Multi-Modal\nIn-Context Learning(MMICL), a new approach to allow the VLM to deal with\nmulti-modal inputs efficiently; 2) proposing a novel context scheme to augment\nthe in-context learning ability of the VLM; 3) constructing the Multi-modal\nIn-Context Learning (MIC) dataset, designed to enhance the VLM's ability to\nunderstand complex multi-modal prompts. Our experiments confirm that MMICL\nachieves new state-of-the-art zero-shot performance on a wide range of general\nvision-language tasks, especially for complex benchmarks, including MME and\nMMBench. Our analysis demonstrates that MMICL effectively tackles the challenge\nof complex multi-modal prompt understanding and emerges the impressive ICL\nability. Furthermore, we observe that MMICL successfully alleviates language\nbias in VLMs, a common issue for VLMs that often leads to hallucination when\nfaced with extensive textual context. Our code, dataset, dataset tool, and\nmodel are available at https://github.com/PKUnlp-icler/MIC\n","authors":["Haozhe Zhao","Zefan Cai","Shuzheng Si","Xiaojian Ma","Kaikai An","Liang Chen","Zixuan Liu","Sheng Wang","Wenjuan Han","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2309.07915v3.pdf","comment":"Accepted by ICLR2024"},{"id":"http://arxiv.org/abs/2312.02918v2","updated":"2024-03-20T16:12:57Z","published":"2023-12-05T17:47:11Z","title":"Multimodal Prompt Perceiver: Empower Adaptiveness, Generalizability and\n Fidelity for All-in-One Image Restoration","summary":" Despite substantial progress, all-in-one image restoration (IR) grapples with\npersistent challenges in handling intricate real-world degradations. 
This paper\nintroduces MPerceiver: a novel multimodal prompt learning approach that\nharnesses Stable Diffusion (SD) priors to enhance adaptiveness,\ngeneralizability and fidelity for all-in-one image restoration. Specifically,\nwe develop a dual-branch module to master two types of SD prompts: textual for\nholistic representation and visual for multiscale detail representation. Both\nprompts are dynamically adjusted by degradation predictions from the CLIP image\nencoder, enabling adaptive responses to diverse unknown degradations. Moreover,\na plug-in detail refinement module improves restoration fidelity via direct\nencoder-to-decoder information transformation. To assess our method, MPerceiver\nis trained on 9 tasks for all-in-one IR and outperforms state-of-the-art\ntask-specific methods across most tasks. Post multitask pre-training,\nMPerceiver attains a generalized representation in low-level vision, exhibiting\nremarkable zero-shot and few-shot capabilities in unseen tasks. Extensive\nexperiments on 16 IR tasks underscore the superiority of MPerceiver in terms of\nadaptiveness, generalizability and fidelity.\n","authors":["Yuang Ai","Huaibo Huang","Xiaoqiang Zhou","Jiexiang Wang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2312.02918v2.pdf","comment":"13 pages, 8 figures, 9 tables"},{"id":"http://arxiv.org/abs/2312.04539v2","updated":"2024-03-20T16:11:22Z","published":"2023-12-07T18:55:52Z","title":"Auto-Vocabulary Semantic Segmentation","summary":" Open-ended image understanding tasks gained significant attention from the\nresearch community, particularly with the emergence of Vision-Language Models.\nOpen-Vocabulary Segmentation (OVS) methods are capable of performing semantic\nsegmentation without relying on a fixed vocabulary, and in some cases, they\noperate without the need for training or fine-tuning. However, OVS methods\ntypically require users to specify the vocabulary based on the task or dataset\nat hand. In this paper, we introduce \\textit{Auto-Vocabulary Semantic\nSegmentation (AVS)}, advancing open-ended image understanding by eliminating\nthe necessity to predefine object categories for segmentation. Our approach,\n\\ours, presents a framework that autonomously identifies relevant class names\nusing enhanced BLIP embeddings, which are utilized for segmentation afterwards.\nGiven that open-ended object category predictions cannot be directly compared\nwith a fixed ground truth, we develop a Large Language Model-based\nAuto-Vocabulary Evaluator (LAVE) to efficiently evaluate the automatically\ngenerated class names and their corresponding segments. Our method sets new\nbenchmarks on datasets such as PASCAL VOC and Context, ADE20K, and Cityscapes\nfor AVS and showcases competitive performance to OVS methods that require\nspecified class names.\n","authors":["Osman Ülger","Maksymilian Kulicki","Yuki Asano","Martin R. Oswald"],"pdf_url":"https://arxiv.org/pdf/2312.04539v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03320v3","updated":"2024-03-20T16:10:27Z","published":"2023-09-06T19:01:58Z","title":"CoNeS: Conditional neural fields with shift modulation for\n multi-sequence MRI translation","summary":" Multi-sequence magnetic resonance imaging (MRI) has found wide applications\nin both modern clinical studies and deep learning research. 
However, in\nclinical practice, it frequently occurs that one or more of the MRI sequences\nare missing due to different image acquisition protocols or contrast agent\ncontraindications of patients, limiting the utilization of deep learning models\ntrained on multi-sequence data. One promising approach is to leverage\ngenerative models to synthesize the missing sequences, which can serve as a\nsurrogate acquisition. State-of-the-art methods tackling this problem are based\non convolutional neural networks (CNN) which usually suffer from spectral\nbiases, resulting in poor reconstruction of high-frequency fine details. In\nthis paper, we propose Conditional Neural fields with Shift modulation (CoNeS),\na model that takes voxel coordinates as input and learns a representation of\nthe target images for multi-sequence MRI translation. The proposed model uses a\nmulti-layer perceptron (MLP) instead of a CNN as the decoder for pixel-to-pixel\nmapping. Hence, each target image is represented as a neural field that is\nconditioned on the source image via shift modulation with a learned latent\ncode. Experiments on BraTS 2018 and an in-house clinical dataset of vestibular\nschwannoma patients showed that the proposed method outperformed\nstate-of-the-art methods for multi-sequence MRI translation both visually and\nquantitatively. Moreover, we conducted spectral analysis, showing that CoNeS\nwas able to overcome the spectral bias issue common in conventional CNN models.\nTo further evaluate the usage of synthesized images in clinical downstream\ntasks, we tested a segmentation network using the synthesized images at\ninference.\n","authors":["Yunjie Chen","Marius Staring","Olaf M. Neve","Stephan R. Romeijn","Erik F. Hensen","Berit M. Verbist","Jelmer M. Wolterink","Qian Tao"],"pdf_url":"https://arxiv.org/pdf/2309.03320v3.pdf","comment":"Accepted for publication at the Journal of Machine Learning for\n Biomedical Imaging (MELBA) https://melba-journal.org/2024:004"},{"id":"http://arxiv.org/abs/2403.13703v1","updated":"2024-03-20T16:07:04Z","published":"2024-03-20T16:07:04Z","title":"Fostc3net:A Lightweight YOLOv5 Based On the Network Structure\n Optimization","summary":" Transmission line detection technology is crucial for automatic monitoring\nand ensuring the safety of electrical facilities. The YOLOv5 series is\ncurrently one of the most advanced and widely used methods for object\ndetection. However, it faces inherent challenges, such as high computational\nload on devices and insufficient detection accuracy. To address these concerns,\nthis paper presents an enhanced lightweight YOLOv5 technique customized for\nmobile devices, specifically intended for identifying objects associated with\ntransmission lines. The C3Ghost module is integrated into the convolutional\nnetwork of YOLOv5 to reduce floating point operations per second (FLOPs) in the\nfeature channel fusion process and improve feature expression performance. In\naddition, a FasterNet module is introduced to replace the C3 module in the\nYOLOv5 backbone. The FasterNet module uses Partial Convolutions to process only\na portion of the input channels, improving feature extraction efficiency and\nreducing computational overhead. To address the imbalance between simple and\nchallenging samples in the dataset and the diversity of aspect ratios of\nbounding boxes, the wIoU v3 loss is adopted as the loss function. To validate\nthe performance of the proposed approach, experiments are conducted on a custom\ndataset of transmission line poles. 
The results show that the proposed model\nachieves a 1% increase in detection accuracy, a 13% reduction in FLOPs, and a\n26% decrease in model parameters compared to the existing YOLOv5. In the\nablation experiment, it was also discovered that while the FasterNet module and\nthe C3Ghost module improved the precision of the original YOLOv5 baseline\nmodel, they caused a decrease in the mAP@.5-.95 metric. However, the\nimprovement of the wIoU v3 loss function significantly mitigated the decline of\nthe mAP@.5-.95 metric.\n","authors":["Danqing Ma","Shaojie Li","Bo Dang","Hengyi Zang","Xinqi Dong"],"pdf_url":"https://arxiv.org/pdf/2403.13703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13698v1","updated":"2024-03-20T16:03:01Z","published":"2024-03-20T16:03:01Z","title":"Insight Into the Collocation of Multi-Source Satellite Imagery for\n Multi-Scale Vessel Detection","summary":" Ship detection from satellite imagery using Deep Learning (DL) is an\nindispensable solution for maritime surveillance. However, applying DL models\ntrained on one dataset to others having differences in spatial resolution and\nradiometric features requires many adjustments. To overcome this issue, this\npaper focused on the DL models trained on datasets that consist of different\noptical images and a combination of radar and optical data. When dealing with a\nlimited number of training images, the performance of DL models via this\napproach was satisfactory. They could improve average precision by 5-20%,\ndepending on the optical images tested. Likewise, DL models trained on the\ncombined optical and radar dataset could be applied to both optical and radar\nimages. Our experiments showed that the models trained on an optical dataset\ncould be used for radar images, while those trained on a radar dataset offered\nvery poor scores when applied to optical images.\n","authors":["Tran-Vu La","Minh-Tan Pham","Marco Chini"],"pdf_url":"https://arxiv.org/pdf/2403.13698v1.pdf","comment":"5 pages, accepted to IGARSS 2024"},{"id":"http://arxiv.org/abs/2403.13690v1","updated":"2024-03-20T15:53:07Z","published":"2024-03-20T15:53:07Z","title":"MotorEase: Automated Detection of Motor Impairment Accessibility Issues\n in Mobile App UIs","summary":" Recent research has begun to examine the potential of automatically finding\nand fixing accessibility issues that manifest in software. However, while\nrecent work makes important progress, it has generally been skewed toward\nidentifying issues that affect users with certain disabilities, such as those\nwith visual or hearing impairments. However, there are other groups of users\nwith different types of disabilities that also need software tooling support to\nimprove their experience. As such, this paper aims to automatically identify\naccessibility issues that affect users with motor impairments.\n To move toward this goal, this paper introduces a novel approach, called\nMotorEase, capable of identifying accessibility issues in mobile app UIs that\nimpact motor-impaired users. Motor-impaired users often have limited ability to\ninteract with touch-based devices, and instead may make use of a switch or\nother assistive mechanism -- hence UIs must be designed to support both limited\ntouch gestures and the use of assistive devices. 
MotorEase adapts computer\nvision and text processing techniques to enable a semantic understanding of app\nUI screens, enabling the detection of violations related to four popular,\npreviously unexplored UI design guidelines that support motor-impaired users,\nincluding: (i) visual touch target size, (ii) expanding sections, (iii)\npersisting elements, and (iv) adjacent icon visual distance. We evaluate\nMotorEase on a newly derived benchmark, called MotorCheck, that contains 555\nmanually annotated examples of violations to the above accessibility\nguidelines, across 1599 screens collected from 70 applications via a mobile app\ntesting tool. Our experiments illustrate that MotorEase is able to identify\nviolations with an average accuracy of ~90%, and a false positive rate of less\nthan 9%, outperforming baseline techniques.\n","authors":["Arun Krishnavajjala","SM Hasan Mansur","Justin Jose","Kevin Moran"],"pdf_url":"https://arxiv.org/pdf/2403.13690v1.pdf","comment":"Accepted to ICSE 2024 Research Track, 13 pages"},{"id":"http://arxiv.org/abs/2303.16296v4","updated":"2024-03-20T15:52:49Z","published":"2023-03-28T20:35:38Z","title":"Dice Semimetric Losses: Optimizing the Dice Score with Soft Labels","summary":" The soft Dice loss (SDL) has taken a pivotal role in numerous automated\nsegmentation pipelines in the medical imaging community. Over the last years,\nsome reasons behind its superior functioning have been uncovered and further\noptimizations have been explored. However, there is currently no implementation\nthat supports its direct utilization in scenarios involving soft labels. Hence,\na synergy between the use of SDL and research leveraging the use of soft\nlabels, also in the context of model calibration, is still missing. In this\nwork, we introduce Dice semimetric losses (DMLs), which (i) are by design\nidentical to SDL in a standard setting with hard labels, but (ii) can be\nemployed in settings with soft labels. Our experiments on the public QUBIQ,\nLiTS and KiTS benchmarks confirm the potential synergy of DMLs with soft labels\n(e.g. averaging, label smoothing, and knowledge distillation) over hard labels\n(e.g. majority voting and random selection). As a result, we obtain superior\nDice scores and model calibration, which supports the wider adoption of DMLs in\npractice. The code is available at https://github.com/zifuwanggg/JDTLosses\n","authors":["Zifu Wang","Teodora Popordanoska","Jeroen Bertels","Robin Lemmens","Matthew B. Blaschko"],"pdf_url":"https://arxiv.org/pdf/2303.16296v4.pdf","comment":"MICCAI 2023"},{"id":"http://arxiv.org/abs/2403.13684v1","updated":"2024-03-20T15:41:39Z","published":"2024-03-20T15:41:39Z","title":"SPTNet: An Efficient Alternative Framework for Generalized Category\n Discovery with Spatial Prompt Tuning","summary":" Generalized Category Discovery (GCD) aims to classify unlabelled images from\nboth `seen' and `unseen' classes by transferring knowledge from a set of\nlabelled `seen' class images. A key theme in existing GCD approaches is\nadapting large-scale pre-trained models for the GCD task. An alternate\nperspective, however, is to adapt the data representation itself for better\nalignment with the pre-trained model. As such, in this paper, we introduce a\ntwo-stage adaptation approach termed SPTNet, which iteratively optimizes model\nparameters (i.e., model-finetuning) and data parameters (i.e., prompt\nlearning). 
Furthermore, we propose a novel spatial prompt tuning method (SPT)\nwhich considers the spatial property of image data, enabling the method to\nbetter focus on object parts, which can transfer between seen and unseen\nclasses. We thoroughly evaluate our SPTNet on standard benchmarks and\ndemonstrate that our method outperforms existing GCD methods. Notably, we find\nour method achieves an average accuracy of 61.4% on the SSB, surpassing prior\nstate-of-the-art methods by approximately 10%. The improvement is particularly\nremarkable as our method yields extra parameters amounting to only 0.117% of\nthose in the backbone architecture. Project page:\nhttps://visual-ai.github.io/sptnet.\n","authors":["Hongjun Wang","Sagar Vaze","Kai Han"],"pdf_url":"https://arxiv.org/pdf/2403.13684v1.pdf","comment":"Accepted as a conference paper at ICLR 2024; Project page:\n https://visual-ai.github.io/sptnet"},{"id":"http://arxiv.org/abs/2403.13683v1","updated":"2024-03-20T15:41:32Z","published":"2024-03-20T15:41:32Z","title":"DVMNet: Computing Relative Pose for Unseen Objects Beyond Hypotheses","summary":" Determining the relative pose of an object between two images is pivotal to\nthe success of generalizable object pose estimation. Existing approaches\ntypically approximate the continuous pose representation with a large number of\ndiscrete pose hypotheses, which incurs a computationally expensive process of\nscoring each hypothesis at test time. By contrast, we present a Deep Voxel\nMatching Network (DVMNet) that eliminates the need for pose hypotheses and\ncomputes the relative object pose in a single pass. To this end, we map the two\ninput RGB images, reference and query, to their respective voxelized 3D\nrepresentations. We then pass the resulting voxels through a pose estimation\nmodule, where the voxels are aligned and the pose is computed in an end-to-end\nfashion by solving a least-squares problem. To enhance robustness, we introduce\na weighted closest voxel algorithm capable of mitigating the impact of noisy\nvoxels. We conduct extensive experiments on the CO3D, LINEMOD, and Objaverse\ndatasets, demonstrating that our method delivers more accurate relative pose\nestimates for novel objects at a lower computational cost compared to\nstate-of-the-art methods. Our code is released at:\nhttps://github.com/sailor-z/DVMNet/.\n","authors":["Chen Zhao","Tong Zhang","Zheng Dang","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2403.13683v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.13680v1","updated":"2024-03-20T15:38:53Z","published":"2024-03-20T15:38:53Z","title":"Step-Calibrated Diffusion for Biomedical Optical Image Restoration","summary":" High-quality, high-resolution medical imaging is essential for clinical care.\nRaman-based biomedical optical imaging uses non-ionizing infrared radiation to\nevaluate human tissues in real time and is used for early cancer detection,\nbrain tumor diagnosis, and intraoperative tissue analysis. Unfortunately,\noptical imaging is vulnerable to image degradation due to laser scattering and\nabsorption, which can result in diagnostic errors and misguided treatment.\nRestoration of optical images is a challenging computer vision task because the\nsources of image degradation are multi-factorial, stochastic, and\ntissue-dependent, preventing a straightforward method to obtain paired\nlow-quality/high-quality data. 
Here, we present Restorative Step-Calibrated\nDiffusion (RSCD), an unpaired image restoration method that views the image\nrestoration problem as completing the finishing steps of a diffusion-based\nimage generation task. RSCD uses a step calibrator model to dynamically\ndetermine the severity of image degradation and the number of steps required to\ncomplete the reverse diffusion process for image restoration. RSCD outperforms\nother widely used unpaired image restoration methods on both image quality and\nperceptual evaluation metrics for restoring optical images. Medical imaging\nexperts consistently prefer images restored using RSCD in blinded comparison\nexperiments and report minimal to no hallucinations. Finally, we show that RSCD\nimproves performance on downstream clinical imaging tasks, including automated\nbrain tumor diagnosis and deep tissue imaging. Our code is available at\nhttps://github.com/MLNeurosurg/restorative_step-calibrated_diffusion.\n","authors":["Yiwei Lyu","Sung Jik Cha","Cheng Jiang","Asadur Chowdury","Xinhai Hou","Edward Harake","Akhil Kondepudi","Christian Freudiger","Honglak Lee","Todd C. Hollon"],"pdf_url":"https://arxiv.org/pdf/2403.13680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13678v1","updated":"2024-03-20T15:37:19Z","published":"2024-03-20T15:37:19Z","title":"AUD-TGN: Advancing Action Unit Detection with Temporal Convolution and\n GPT-2 in Wild Audiovisual Contexts","summary":" Leveraging the synergy of both audio data and visual data is essential for\nunderstanding human emotions and behaviors, especially in an in-the-wild setting.\nTraditional methods for integrating such multimodal information often stumble,\nleading to less-than-ideal outcomes in the task of facial action unit\ndetection. To overcome these shortcomings, we propose a novel approach\nutilizing audio-visual multimodal data. This method enhances audio feature\nextraction by leveraging Mel Frequency Cepstral Coefficients (MFCC) and Log-Mel\nspectrogram features alongside a pre-trained VGGish network. Moreover, this\npaper adaptively captures fusion features across modalities by modeling the\ntemporal relationships, and utilizes a pre-trained GPT-2 model for\nsophisticated context-aware fusion of multimodal information. Our method\nnotably improves the accuracy of AU detection by understanding the temporal and\ncontextual nuances of the data, showcasing significant advancements in the\ncomprehension of intricate scenarios. These findings underscore the potential\nof integrating temporal dynamics and contextual interpretation, paving the way\nfor future research endeavors.\n","authors":["Jun Yu","Zerui Zhang","Zhihong Wei","Gongpeng Zhao","Zhongpeng Cai","Yongqi Wang","Guochen Xie","Jichao Zhu","Wangyuan Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.13678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13677v1","updated":"2024-03-20T15:35:36Z","published":"2024-03-20T15:35:36Z","title":"Retina Vision Transformer (RetinaViT): Introducing Scaled Patches into\n Vision Transformers","summary":" Humans see low and high spatial frequency components at the same time, and\ncombine the information from both to form a visual scene. Drawing on this\nneuroscientific inspiration, we propose an altered Vision Transformer\narchitecture where patches from scaled down versions of the input image are\nadded to the input of the first Transformer Encoder layer. We name this model\nRetina Vision Transformer (RetinaViT) due to its inspiration from the human\nvisual system. 
Our experiments show that when trained on the ImageNet-1K\ndataset with a moderate configuration, RetinaViT achieves a 3.3% performance\nimprovement over the original ViT. We hypothesize that this improvement can be\nattributed to the inclusion of low spatial frequency components in the input,\nwhich improves the ability to capture structural features, and to select and\nforward important features to deeper layers. RetinaViT thereby opens doors to\nfurther investigations into vertical pathways and attention patterns.\n","authors":["Yuyang Shu","Michael E. Bain"],"pdf_url":"https://arxiv.org/pdf/2403.13677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06258v2","updated":"2024-03-20T15:31:28Z","published":"2024-03-10T16:56:44Z","title":"Poly Kernel Inception Network for Remote Sensing Detection","summary":" Object detection in remote sensing images (RSIs) often suffers from several\nincreasing challenges, including the large variation in object scales and the\ndiverse-ranging context. Prior methods tried to address these challenges by\nexpanding the spatial receptive field of the backbone, either through\nlarge-kernel convolution or dilated convolution. However, the former typically\nintroduces considerable background noise, while the latter risks generating\noverly sparse feature representations. In this paper, we introduce the Poly\nKernel Inception Network (PKINet) to handle the above challenges. PKINet\nemploys multi-scale convolution kernels without dilation to extract object\nfeatures of varying scales and capture local context. In addition, a Context\nAnchor Attention (CAA) module is introduced in parallel to capture long-range\ncontextual information. These two components work jointly to advance the\nperformance of PKINet on four challenging remote sensing detection benchmarks,\nnamely DOTA-v1.0, DOTA-v1.5, HRSC2016, and DIOR-R.\n","authors":["Xinhao Cai","Qiuxia Lai","Yuwei Wang","Wenguan Wang","Zeren Sun","Yazhou Yao"],"pdf_url":"https://arxiv.org/pdf/2403.06258v2.pdf","comment":"accepted by IEEE Conference on Computer Vision and Pattern\n Recognition, 2024"},{"id":"http://arxiv.org/abs/2308.03001v2","updated":"2024-03-20T15:29:56Z","published":"2023-08-06T03:28:08Z","title":"Weakly supervised segmentation of intracranial aneurysms using a novel\n 3D focal modulation UNet","summary":" Accurate identification and quantification of unruptured intracranial\naneurysms (UIAs) is crucial for the risk assessment and treatment of this\ncerebrovascular disorder. Current 2D manual assessment on 3D magnetic resonance\nangiography (MRA) is suboptimal and time-consuming. In addition, one major\nissue in medical image segmentation is the need for large well-annotated data,\nwhich can be expensive to obtain. Techniques that mitigate this requirement,\nsuch as weakly supervised learning with coarse labels are highly desirable. In\nthe paper, we propose FocalSegNet, a novel 3D focal modulation UNet, to detect\nan aneurysm and offer an initial, coarse segmentation of it from time-of-flight\nMRA image patches, which is further refined with a dense conditional random\nfield (CRF) post-processing layer to produce a final segmentation map. We\ntrained and evaluated our model on a public dataset, and in terms of UIA\ndetection, our model showed a low false-positive rate of 0.21 and a high\nsensitivity of 0.80. For voxel-wise aneurysm segmentation, we achieved a Dice\nscore of 0.68 and a 95% Hausdorff distance of ~0.95 mm, demonstrating its\nstrong performance. 
We evaluated our algorithms against the state-of-the-art 3D\nResidual-UNet and Swin-UNETR, and illustrated the superior performance of our\nproposed FocalSegNet, highlighting the advantages of employing focal modulation\nfor this task.\n","authors":["Amirhossein Rasoulian","Arash Harirpoush","Soorena Salari","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.03001v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13667v1","updated":"2024-03-20T15:24:57Z","published":"2024-03-20T15:24:57Z","title":"DanceCamera3D: 3D Camera Movement Synthesis with Music and Dance","summary":" Choreographers determine what the dances look like, while cameramen determine\nthe final presentation of dances. Recently, various methods and datasets have\nshowcased the feasibility of dance synthesis. However, camera movement\nsynthesis with music and dance remains an unsolved challenging problem due to\nthe scarcity of paired data. Thus, we present DCM, a new multi-modal 3D\ndataset, which for the first time combines camera movement with dance motion\nand music audio. This dataset encompasses 108 dance sequences (3.2 hours) of\npaired dance-camera-music data from the anime community, covering 4 music\ngenres. With this dataset, we uncover that dance camera movement is\nmultifaceted and human-centric, and possesses multiple influencing factors,\nmaking dance camera synthesis a more challenging task compared to camera or\ndance synthesis alone. To overcome these difficulties, we propose\nDanceCamera3D, a transformer-based diffusion model that incorporates a novel\nbody attention loss and a condition separation strategy. For evaluation, we\ndevise new metrics measuring camera movement quality, diversity, and dancer\nfidelity. Utilizing these metrics, we conduct extensive experiments on our DCM\ndataset, providing both quantitative and qualitative evidence showcasing the\neffectiveness of our DanceCamera3D model. Code and video demos are available at\nhttps://github.com/Carmenw1203/DanceCamera3D-Official.\n","authors":["Zixuan Wang","Jia Jia","Shikun Sun","Haozhe Wu","Rong Han","Zhenyu Li","Di Tang","Jiaqing Zhou","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2403.13667v1.pdf","comment":"Accept to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11868v2","updated":"2024-03-20T15:22:12Z","published":"2024-03-18T15:22:09Z","title":"View-Consistent 3D Editing with Gaussian Splatting","summary":" The advent of 3D Gaussian Splatting (3DGS) has revolutionized 3D editing,\noffering efficient, high-fidelity rendering and enabling precise local\nmanipulations. Currently, diffusion-based 2D editing models are harnessed to\nmodify multi-view rendered images, which then guide the editing of 3DGS models.\nHowever, this approach faces a critical issue of multi-view inconsistency,\nwhere the guidance images exhibit significant discrepancies across views,\nleading to mode collapse and visual artifacts of 3DGS. To this end, we\nintroduce View-consistent Editing (VcEdit), a novel framework that seamlessly\nincorporates 3DGS into image editing processes, ensuring multi-view consistency\nin edited guidance images and effectively mitigating mode collapse issues.\nVcEdit employs two innovative consistency modules: the Cross-attention\nConsistency Module and the Editing Consistency Module, both designed to reduce\ninconsistencies in edited images. 
By incorporating these consistency modules\ninto an iterative pattern, VcEdit proficiently resolves the issue of multi-view\ninconsistency, facilitating high-quality 3DGS editing across a diverse range of\nscenes.\n","authors":["Yuxuan Wang","Xuanyu Yi","Zike Wu","Na Zhao","Long Chen","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11868v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13663v1","updated":"2024-03-20T15:14:22Z","published":"2024-03-20T15:14:22Z","title":"T-Pixel2Mesh: Combining Global and Local Transformer for 3D Mesh\n Generation from a Single Image","summary":" Pixel2Mesh (P2M) is a classical approach for reconstructing 3D shapes from a\nsingle color image through coarse-to-fine mesh deformation. Although P2M is\ncapable of generating plausible global shapes, its Graph Convolution Network\n(GCN) often produces overly smooth results, causing the loss of fine-grained\ngeometry details. Moreover, P2M generates non-credible features for occluded\nregions and struggles with the domain gap from synthetic data to real-world\nimages, which is a common challenge for single-view 3D reconstruction methods.\nTo address these challenges, we propose a novel Transformer-boosted\narchitecture, named T-Pixel2Mesh, inspired by the coarse-to-fine approach of\nP2M. Specifically, we use a global Transformer to control the holistic shape\nand a local Transformer to progressively refine the local geometry details with\ngraph-based point upsampling. To enhance real-world reconstruction, we present\nthe simple yet effective Linear Scale Search (LSS), which serves as prompt\ntuning during the input preprocessing. Our experiments on ShapeNet demonstrate\nstate-of-the-art performance, while results on real-world data show the\ngeneralization capability.\n","authors":["Shijie Zhang","Boyan Jiang","Keke He","Junwei Zhu","Ying Tai","Chengjie Wang","Yinda Zhang","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2403.13663v1.pdf","comment":"Received by ICASSP 2024"},{"id":"http://arxiv.org/abs/2403.13660v1","updated":"2024-03-20T15:08:57Z","published":"2024-03-20T15:08:57Z","title":"ProMamba: Prompt-Mamba for polyp segmentation","summary":" Detecting polyps through colonoscopy is an important task in medical image\nsegmentation, which provides significant assistance and reference value for\nclinical surgery. However, accurate segmentation of polyps is a challenging\ntask due to two main reasons. Firstly, polyps exhibit various shapes and\ncolors. Secondly, the boundaries between polyps and their normal surroundings\nare often unclear. Additionally, significant differences between different\ndatasets lead to limited generalization capabilities of existing methods. To\naddress these issues, we propose a segmentation model based on Prompt-Mamba,\nwhich incorporates the latest Vision-Mamba and prompt technologies. Compared to\nprevious models trained on the same dataset, our model not only maintains high\nsegmentation accuracy on the validation part of the same dataset but also\ndemonstrates superior accuracy on unseen datasets, exhibiting excellent\ngeneralization capabilities. Notably, we are the first to apply the\nVision-Mamba architecture to polyp segmentation and the first to utilize prompt\ntechnology in a polyp segmentation model. Our model efficiently accomplishes\nsegmentation tasks, surpassing previous state-of-the-art methods by an average\nof 5% across six datasets. 
Furthermore, we have developed multiple versions of\nour model with scaled parameter counts, achieving better performance than\nprevious models even with fewer parameters. Our code and trained weights will\nbe released soon.\n","authors":["Jianhao Xie","Ruofan Liao","Ziang Zhang","Sida Yi","Yuesheng Zhu","Guibo Luo"],"pdf_url":"https://arxiv.org/pdf/2403.13660v1.pdf","comment":"10 pages, 2 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.13659v1","updated":"2024-03-20T15:08:43Z","published":"2024-03-20T15:08:43Z","title":"Recursive Cross-Modal Attention for Multimodal Fusion in Dimensional\n Emotion Recognition","summary":" Multi-modal emotion recognition has recently gained a lot of attention since\nit can leverage diverse and complementary relationships over multiple\nmodalities, such as audio, visual, and text. Most state-of-the-art methods for\nmultimodal fusion rely on recurrent networks or conventional attention\nmechanisms that do not effectively leverage the complementary nature of the\nmodalities. In this paper, we focus on dimensional emotion recognition based on\nthe fusion of facial, vocal, and text modalities extracted from videos.\nSpecifically, we propose a recursive cross-modal attention (RCMA) to\neffectively capture the complementary relationships across the modalities in a\nrecursive fashion. The proposed model is able to effectively capture the\ninter-modal relationships by computing the cross-attention weights across the\nindividual modalities and the joint representation of the other two modalities.\nTo further improve the inter-modal relationships, the obtained attended\nfeatures of the individual modalities are again fed as input to the cross-modal\nattention to refine the feature representations of the individual modalities.\nIn addition to that, we have used Temporal convolution networks (TCNs) to\ncapture the temporal modeling (intra-modal relationships) of the individual\nmodalities. By deploying the TCNs as well as cross-modal attention in a recursive\nfashion, we are able to effectively capture both intra- and inter-modal\nrelationships across the audio, visual, and text modalities. Experimental\nresults on validation-set videos from the AffWild2 dataset indicate that our\nproposed fusion model is able to achieve significant improvement over the\nbaseline for the sixth challenge of Affective Behavior Analysis in-the-Wild\n2024 (ABAW6) competition.\n","authors":["R. Gnana Praveen","Jahangir Alam"],"pdf_url":"https://arxiv.org/pdf/2403.13659v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2209.09068;\n text overlap with arXiv:2203.14779 by other authors"},{"id":"http://arxiv.org/abs/2403.13658v1","updated":"2024-03-20T15:06:49Z","published":"2024-03-20T15:06:49Z","title":"Multimodal Variational Autoencoder for Low-cost Cardiac Hemodynamics\n Instability Detection","summary":" Recent advancements in non-invasive detection of cardiac hemodynamic\ninstability (CHDI) primarily focus on applying machine learning techniques to a\nsingle data modality, e.g. cardiac magnetic resonance imaging (MRI). 
Despite\ntheir potential, these approaches often fall short especially when the size of\nlabeled patient data is limited, a common challenge in the medical domain.\nFurthermore, only a few studies have explored multimodal methods to study CHDI,\nwhich mostly rely on costly modalities such as cardiac MRI and echocardiogram.\nIn response to these limitations, we propose a novel multimodal variational\nautoencoder ($\\text{CardioVAE}_\\text{X,G}$) to integrate low-cost chest X-ray\n(CXR) and electrocardiogram (ECG) modalities with pre-training on a large\nunlabeled dataset. Specifically, $\\text{CardioVAE}_\\text{X,G}$ introduces a\nnovel tri-stream pre-training strategy to learn both shared and\nmodality-specific features, thus enabling fine-tuning with both unimodal and\nmultimodal datasets. We pre-train $\\text{CardioVAE}_\\text{X,G}$ on a large,\nunlabeled dataset of $50,982$ subjects from a subset of MIMIC database and then\nfine-tune the pre-trained model on a labeled dataset of $795$ subjects from the\nASPIRE registry. Comprehensive evaluations against existing methods show that\n$\\text{CardioVAE}_\\text{X,G}$ offers promising performance (AUROC $=0.79$ and\nAccuracy $=0.77$), representing a significant step forward in non-invasive\nprediction of CHDI. Our model also excels in producing fine interpretations of\npredictions directly associated with clinical features, thereby supporting\nclinical decision-making.\n","authors":["Mohammod N. I. Suvon","Prasun C. Tripathi","Wenrui Fan","Shuo Zhou","Xianyuan Liu","Samer Alabed","Venet Osmani","Andrew J. Swift","Chen Chen","Haiping Lu"],"pdf_url":"https://arxiv.org/pdf/2403.13658v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13653v1","updated":"2024-03-20T14:58:40Z","published":"2024-03-20T14:58:40Z","title":"Learning User Embeddings from Human Gaze for Personalised Saliency\n Prediction","summary":" Reusable embeddings of user behaviour have shown significant performance\nimprovements for the personalised saliency prediction task. However, prior\nworks require explicit user characteristics and preferences as input, which are\noften difficult to obtain. We present a novel method to extract user embeddings\nfrom pairs of natural images and corresponding saliency maps generated from a\nsmall amount of user-specific eye tracking data. At the core of our method is a\nSiamese convolutional neural encoder that learns the user embeddings by\ncontrasting the image and personal saliency map pairs of different users.\nEvaluations on two public saliency datasets show that the generated embeddings\nhave high discriminative power, are effective at refining universal saliency\nmaps to the individual users, and generalise well across users and images.\nFinally, based on our model's ability to encode individual user\ncharacteristics, our work points towards other applications that can benefit\nfrom reusable embeddings of gaze behaviour.\n","authors":["Florian Strohm","Mihai Bâce","Andreas Bulling"],"pdf_url":"https://arxiv.org/pdf/2403.13653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13652v1","updated":"2024-03-20T14:58:09Z","published":"2024-03-20T14:58:09Z","title":"ZoDi: Zero-Shot Domain Adaptation with Diffusion-Based Image Transfer","summary":" Deep learning models achieve high accuracy in segmentation tasks among\nothers, yet domain shift often degrades the models' performance, which can be\ncritical in real-world scenarios where no target images are available. 
This\npaper proposes a zero-shot domain adaptation method based on diffusion models,\ncalled ZoDi, which is two-fold by design: zero-shot image transfer and\nmodel adaptation. First, we utilize an off-the-shelf diffusion model to\nsynthesize target-like images by transferring the domain of source images to\nthe target domain. In this, we specifically try to maintain the layout and\ncontent by utilising layout-to-image diffusion models with stochastic\ninversion. Secondly, we train the model using both source images and\nsynthesized images with the original segmentation maps while maximizing the\nfeature similarity of images from the two domains to learn domain-robust\nrepresentations. Through experiments we show benefits of ZoDi in the task of\nimage segmentation over state-of-the-art methods. It is also more applicable\nthan existing CLIP-based methods because it assumes no specific backbone or\nmodels, and it enables estimating the model's performance without target\nimages by inspecting generated images. Our implementation will be publicly\navailable.\n","authors":["Hiroki Azuma","Yusuke Matsui","Atsuto Maki"],"pdf_url":"https://arxiv.org/pdf/2403.13652v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13647v1","updated":"2024-03-20T14:54:33Z","published":"2024-03-20T14:54:33Z","title":"Meta-Point Learning and Refining for Category-Agnostic Pose Estimation","summary":" Category-agnostic pose estimation (CAPE) aims to predict keypoints for\narbitrary classes given a few support images annotated with keypoints. Existing\nmethods only rely on the features extracted at support keypoints to predict or\nrefine the keypoints on query image, but a few support feature vectors are\nlocal and inadequate for CAPE. Considering that humans can quickly perceive\npotential keypoints of arbitrary objects, we propose a novel framework for CAPE\nbased on such potential keypoints (named as meta-points). Specifically, we\nmaintain learnable embeddings to capture inherent information of various\nkeypoints, which interact with image feature maps to produce meta-points\nwithout any support. The produced meta-points could serve as meaningful\npotential keypoints for CAPE. Due to the inevitable gap between inherency and\nannotation, we finally utilize the identities and details offered by support\nkeypoints to assign and refine meta-points to desired keypoints in query image.\nIn addition, we propose a progressive deformable point decoder and a slacked\nregression loss for better prediction and supervision. Our novel framework not\nonly reveals the inherency of keypoints but also outperforms existing methods\nof CAPE. Comprehensive experiments and in-depth studies on large-scale MP-100\ndataset demonstrate the effectiveness of our framework.\n","authors":["Junjie Chen","Jiebin Yan","Yuming Fang","Li Niu"],"pdf_url":"https://arxiv.org/pdf/2403.13647v1.pdf","comment":"Published in CVPR 2024"},{"id":"http://arxiv.org/abs/2403.02075v2","updated":"2024-03-20T14:52:27Z","published":"2024-03-04T14:21:51Z","title":"DiffMOT: A Real-time Diffusion-based Multiple Object Tracker with\n Non-linear Prediction","summary":" In Multiple Object Tracking, objects often exhibit non-linear motion of\nacceleration and deceleration, with irregular direction changes.\nTracking-by-detection (TBD) trackers with Kalman Filter motion prediction work\nwell in pedestrian-dominant scenarios but fall short in complex situations when\nmultiple objects perform non-linear and diverse motion simultaneously. 
To\ntackle the complex non-linear motion, we propose a real-time diffusion-based\nMOT approach named DiffMOT. Specifically, for the motion predictor component,\nwe propose a novel Decoupled Diffusion-based Motion Predictor (D$^2$MP). It\nmodels the entire distribution of various motion presented by the data as a\nwhole. It also predicts an individual object's motion conditioning on an\nindividual's historical motion information. Furthermore, it optimizes the\ndiffusion process with much fewer sampling steps. As a MOT tracker, the DiffMOT\nis real-time at 22.7FPS, and also outperforms the state-of-the-art on\nDanceTrack and SportsMOT datasets with $62.3\\%$ and $76.2\\%$ in HOTA metrics,\nrespectively. To the best of our knowledge, DiffMOT is the first to introduce a\ndiffusion probabilistic model into the MOT to tackle non-linear motion\nprediction.\n","authors":["Weiyi Lv","Yuhang Huang","Ning Zhang","Ruei-Sung Lin","Mei Han","Dan Zeng"],"pdf_url":"https://arxiv.org/pdf/2403.02075v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2403.13642v1","updated":"2024-03-20T14:49:52Z","published":"2024-03-20T14:49:52Z","title":"H-vmunet: High-order Vision Mamba UNet for Medical Image Segmentation","summary":" In the field of medical image segmentation, variant models based on\nConvolutional Neural Networks (CNNs) and Visual Transformers (ViTs) as the base\nmodules have been very widely developed and applied. However, CNNs are often\nlimited in their ability to deal with long sequences of information, while the\nlow sensitivity of ViTs to local feature information and the problem of\nsecondary computational complexity limit their development. Recently, the\nemergence of state-space models (SSMs), especially 2D-selective-scan (SS2D),\nhas had an impact on the longtime dominance of traditional CNNs and ViTs as the\nfoundational modules of visual neural networks. In this paper, we extend the\nadaptability of SS2D by proposing a High-order Vision Mamba UNet (H-vmunet) for\nmedical image segmentation. Among them, the proposed High-order\n2D-selective-scan (H-SS2D) progressively reduces the introduction of redundant\ninformation during SS2D operations through higher-order interactions. In\naddition, the proposed Local-SS2D module improves the learning ability of local\nfeatures of SS2D at each order of interaction. We conducted comparison and\nablation experiments on three publicly available medical image datasets\n(ISIC2017, Spleen, and CVC-ClinicDB), and the results all demonstrate the\nstrong competitiveness of H-vmunet in medical image segmentation tasks. The\ncode is available from https://github.com/wurenkai/H-vmunet .\n","authors":["Renkai Wu","Yinghao Liu","Pengchen Liang","Qing Chang"],"pdf_url":"https://arxiv.org/pdf/2403.13642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17128v3","updated":"2024-03-20T14:49:16Z","published":"2024-02-27T01:48:19Z","title":"OSCaR: Object State Captioning and State Change Representation","summary":" The capability of intelligent models to extrapolate and comprehend changes in\nobject states is a crucial yet demanding aspect of AI research, particularly\nthrough the lens of human interaction in real-world settings. This task\ninvolves describing complex visual environments, identifying active objects,\nand interpreting their changes as conveyed through language. Traditional\nmethods, which isolate object captioning and state change detection, offer a\nlimited view of dynamic environments. 
Moreover, relying on a small set of\nsymbolic words to represent changes has restricted the expressiveness of the\nlanguage. To address these challenges, in this paper, we introduce the Object\nState Captioning and State Change Representation (OSCaR) dataset and benchmark.\nOSCaR consists of 14,084 annotated video segments with nearly 1,000 unique\nobjects from various egocentric video collections. It sets a new testbed for\nevaluating multimodal large language models (MLLMs). Our experiments\ndemonstrate that while MLLMs show some skill, they lack a full understanding of\nobject state changes. The benchmark includes a fine-tuned model that, despite\ninitial capabilities, requires significant improvements in accuracy and\ngeneralization ability for effective understanding of these changes. Our code\nand dataset are available at https://github.com/nguyennm1024/OSCaR.\n","authors":["Nguyen Nguyen","Jing Bi","Ali Vosoughi","Yapeng Tian","Pooyan Fazli","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2402.17128v3.pdf","comment":"NAACL 2024"},{"id":"http://arxiv.org/abs/2208.08270v3","updated":"2024-03-20T14:13:44Z","published":"2022-08-17T13:02:17Z","title":"On the Privacy Effect of Data Enhancement via the Lens of Memorization","summary":" Machine learning poses severe privacy concerns as it has been shown that the\nlearned models can reveal sensitive information about their training data. Many\nworks have investigated the effect of widely adopted data augmentation and\nadversarial training techniques, termed data enhancement in the paper, on the\nprivacy leakage of machine learning models. Such privacy effects are often\nmeasured by membership inference attacks (MIAs), which aim to identify whether\na particular example belongs to the training set or not. We propose to\ninvestigate privacy from a new perspective called memorization. Through the\nlens of memorization, we find that previously deployed MIAs produce misleading\nresults as they are less likely to identify samples with higher privacy risks\nas members compared to samples with low privacy risks. To solve this problem,\nwe deploy a recent attack that can capture individual samples' memorization\ndegrees for evaluation. Through extensive experiments, we unveil several\nfindings about the connections between three essential properties of machine\nlearning models, including privacy, generalization gap, and adversarial\nrobustness. We demonstrate that the generalization gap and privacy leakage are\nless correlated than those of the previous results. Moreover, there is not\nnecessarily a trade-off between adversarial robustness and privacy as stronger\nadversarial robustness does not make the model more susceptible to privacy\nattacks.\n","authors":["Xiao Li","Qiongxiu Li","Zhanhao Hu","Xiaolin Hu"],"pdf_url":"https://arxiv.org/pdf/2208.08270v3.pdf","comment":"Accepted by IEEE TIFS, 17 pages"},{"id":"http://arxiv.org/abs/2403.12425v2","updated":"2024-03-20T13:56:56Z","published":"2024-03-19T04:25:54Z","title":"Multimodal Fusion Method with Spatiotemporal Sequences and Relationship\n Learning for Valence-Arousal Estimation","summary":" This paper presents our approach for the VA (Valence-Arousal) estimation task\nin the ABAW6 competition. We devised a comprehensive model by preprocessing\nvideo frames and audio segments to extract visual and audio features. 
Through\nthe utilization of Temporal Convolutional Network (TCN) modules, we effectively\ncaptured the temporal and spatial correlations between these features.\nSubsequently, we employed a Transformer encoder structure to learn long-range\ndependencies, thereby enhancing the model's performance and generalization\nability. Our method leverages a multimodal data fusion approach, integrating\npre-trained audio and video backbones for feature extraction, followed by\nTCN-based spatiotemporal encoding and Transformer-based temporal information\ncapture. Experimental results demonstrate the effectiveness of our approach,\nachieving competitive performance in VA estimation on the AffWild2 dataset.\n","authors":["Jun Yu","Gongpeng Zhao","Yongqi Wang","Zhihong Wei","Yang Zheng","Zerui Zhang","Zhongpeng Cai","Guochen Xie","Jichao Zhu","Wangyuan Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.12425v2.pdf","comment":"8 pages,3 figures"},{"id":"http://arxiv.org/abs/2403.13600v1","updated":"2024-03-20T13:48:50Z","published":"2024-03-20T13:48:50Z","title":"VL-Mamba: Exploring State Space Models for Multimodal Learning","summary":" Multimodal large language models (MLLMs) have attracted widespread interest\nand have rich applications. However, the inherent attention mechanism in its\nTransformer structure requires quadratic complexity and results in expensive\ncomputational overhead. Therefore, in this work, we propose VL-Mamba, a\nmultimodal large language model based on state space models, which have been\nshown to have great potential for long-sequence modeling with fast inference\nand linear scaling in sequence length. Specifically, we first replace the\ntransformer-based backbone language model such as LLama or Vicuna with the\npre-trained Mamba language model. Then, we empirically explore how to\neffectively apply the 2D vision selective scan mechanism for multimodal\nlearning and the combinations of different vision encoders and variants of\npretrained Mamba language models. The extensive experiments on diverse\nmultimodal benchmarks with competitive performance show the effectiveness of\nour proposed VL-Mamba and demonstrate the great potential of applying state\nspace models for multimodal learning tasks.\n","authors":["Yanyuan Qiao","Zheng Yu","Longteng Guo","Sihan Chen","Zijia Zhao","Mingzhen Sun","Qi Wu","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2403.13600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13589v1","updated":"2024-03-20T13:37:29Z","published":"2024-03-20T13:37:29Z","title":"ReGround: Improving Textual and Spatial Grounding at No Cost","summary":" When an image generation process is guided by both a text prompt and spatial\ncues, such as a set of bounding boxes, do these elements work in harmony, or\ndoes one dominate the other? Our analysis of a pretrained image diffusion model\nthat integrates gated self-attention into the U-Net reveals that spatial\ngrounding often outweighs textual grounding due to the sequential flow from\ngated self-attention to cross-attention. We demonstrate that such bias can be\nsignificantly mitigated without sacrificing accuracy in either grounding by\nsimply rewiring the network architecture, changing from sequential to parallel\nfor gated self-attention and cross-attention. This surprisingly simple yet\neffective solution does not require any fine-tuning of the network but\nsignificantly reduces the trade-off between the two groundings. 
Our experiments\ndemonstrate significant improvements from the original GLIGEN to the rewired\nversion in the trade-off between textual grounding and spatial grounding.\n","authors":["Yuseung Lee","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2403.13589v1.pdf","comment":"Project page: https://re-ground.github.io/"},{"id":"http://arxiv.org/abs/2403.13575v1","updated":"2024-03-20T13:20:05Z","published":"2024-03-20T13:20:05Z","title":"Leveraging feature communication in federated learning for remote\n sensing image classification","summary":" In the realm of Federated Learning (FL) applied to remote sensing image\nclassification, this study introduces and assesses several innovative\ncommunication strategies. Our exploration includes feature-centric\ncommunication, pseudo-weight amalgamation, and a combined method utilizing both\nweights and features. Experiments conducted on two public scene classification\ndatasets unveil the effectiveness of these strategies, showcasing accelerated\nconvergence, heightened privacy, and reduced network information exchange. This\nresearch provides valuable insights into the implications of feature-centric\ncommunication in FL, offering potential applications tailored for remote\nsensing scenarios.\n","authors":["Anh-Kiet Duong","Hoàng-Ân Lê","Minh-Tan Pham"],"pdf_url":"https://arxiv.org/pdf/2403.13575v1.pdf","comment":"5 pages, to appear in IGARSS 2024"},{"id":"http://arxiv.org/abs/2306.11335v4","updated":"2024-03-20T13:18:18Z","published":"2023-06-20T07:06:04Z","title":"Surfer: Progressive Reasoning with World Models for Robotic Manipulation","summary":" Considering how to make the model accurately understand and follow natural\nlanguage instructions and perform actions consistent with world knowledge is a\nkey challenge in robot manipulation. This mainly includes human fuzzy\ninstruction reasoning and the following of physical knowledge. Therefore, the\nembodied intelligence agent must have the ability to model world knowledge from\ntraining data. However, most existing vision and language robot manipulation\nmethods mainly operate in less realistic simulator and language settings and\nlack explicit modeling of world knowledge. To bridge this gap, we introduce a\nnovel and simple robot manipulation framework, called Surfer. It is based on\nthe world model, treats robot manipulation as a state transfer of the visual\nscene, and decouples it into two parts: action and scene. Then, the\ngeneralization ability of the model on new instructions and new scenes is\nenhanced by explicit modeling of the action and scene prediction in multi-modal\ninformation. In addition to the framework, we also built a robot manipulation\nsimulator that supports full physics execution based on the MuJoCo physics\nengine. It can automatically generate demonstration training data and test\ndata, effectively reducing labor costs. To conduct a comprehensive and\nsystematic evaluation of the robot manipulation model in terms of language\nunderstanding and physical execution, we also created a robotic manipulation\nbenchmark with progressive reasoning tasks, called SeaWave. It contains 4\nlevels of progressive reasoning tasks and can provide a standardized testing\nplatform for embedded AI agents in multi-modal environments. 
On average, Surfer\nachieved a success rate of 54.74% on the defined four levels of manipulation\ntasks, exceeding the best baseline performance of 47.64%.\n","authors":["Pengzhen Ren","Kaidong Zhang","Hetao Zheng","Zixuan Li","Yuhang Wen","Fengda Zhu","Mas Ma","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2306.11335v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10261v2","updated":"2024-03-20T13:15:28Z","published":"2024-03-15T12:48:44Z","title":"Learning Spatiotemporal Inconsistency via Thumbnail Layout for Face\n Deepfake Detection","summary":" The deepfake threats to society and cybersecurity have provoked significant\npublic apprehension, driving intensified efforts within the realm of deepfake\nvideo detection. Current video-level methods are mostly based on {3D CNNs}\nresulting in high computational demands, although have achieved good\nperformance. This paper introduces an elegantly simple yet effective strategy\nnamed Thumbnail Layout (TALL), which transforms a video clip into a pre-defined\nlayout to realize the preservation of spatial and temporal dependencies. This\ntransformation process involves sequentially masking frames at the same\npositions within each frame. These frames are then resized into sub-frames and\nreorganized into the predetermined layout, forming thumbnails. TALL is\nmodel-agnostic and has remarkable simplicity, necessitating only minimal code\nmodifications. Furthermore, we introduce a graph reasoning block (GRB) and\nsemantic consistency (SC) loss to strengthen TALL, culminating in TALL++. GRB\nenhances interactions between different semantic regions to capture\nsemantic-level inconsistency clues. The semantic consistency loss imposes\nconsistency constraints on semantic features to improve model generalization\nability. Extensive experiments on intra-dataset, cross-dataset,\ndiffusion-generated image detection, and deepfake generation method recognition\nshow that TALL++ achieves results surpassing or comparable to the\nstate-of-the-art methods, demonstrating the effectiveness of our approaches for\nvarious deepfake detection problems. The code is available at\nhttps://github.com/rainy-xu/TALL4Deepfake.\n","authors":["Yuting Xu","Jian Liang","Lijun Sheng","Xiao-Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.10261v2.pdf","comment":"Accepted by IJCV"},{"id":"http://arxiv.org/abs/2302.09389v2","updated":"2024-03-20T13:11:19Z","published":"2023-02-18T17:45:11Z","title":"Vulnerability analysis of captcha using Deep learning","summary":" Several websites improve their security and avoid dangerous Internet attacks\nby implementing CAPTCHAs (Completely Automated Public Turing test to tell\nComputers and Humans Apart), a type of verification to identify whether the\nend-user is human or a robot. The most prevalent type of CAPTCHA is text-based,\ndesigned to be easily recognized by humans while being unsolvable towards\nmachines or robots. However, as deep learning technology progresses,\ndevelopment of convolutional neural network (CNN) models that predict\ntext-based CAPTCHAs becomes easier. The purpose of this research is to\ninvestigate the flaws and vulnerabilities in the CAPTCHA generating systems in\norder to design more resilient CAPTCHAs. To achieve this, we created CapNet, a\nConvolutional Neural Network. 
The proposed platform can evaluate both numerical\nand alphanumerical CAPTCHAs\n","authors":["Jaskaran Singh Walia","Aryan Odugoudar"],"pdf_url":"https://arxiv.org/pdf/2302.09389v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13570v1","updated":"2024-03-20T13:09:54Z","published":"2024-03-20T13:09:54Z","title":"Portrait4D-v2: Pseudo Multi-View Data Creates Better 4D Head Synthesizer","summary":" In this paper, we propose a novel learning approach for feed-forward one-shot\n4D head avatar synthesis. Different from existing methods that often learn from\nreconstructing monocular videos guided by 3DMM, we employ pseudo multi-view\nvideos to learn a 4D head synthesizer in a data-driven manner, avoiding\nreliance on inaccurate 3DMM reconstruction that could be detrimental to the\nsynthesis performance. The key idea is to first learn a 3D head synthesizer\nusing synthetic multi-view images to convert monocular real videos into\nmulti-view ones, and then utilize the pseudo multi-view videos to learn a 4D\nhead synthesizer via cross-view self-reenactment. By leveraging a simple vision\ntransformer backbone with motion-aware cross-attentions, our method exhibits\nsuperior performance compared to previous methods in terms of reconstruction\nfidelity, geometry consistency, and motion control accuracy. We hope our method\noffers novel insights into integrating 3D priors with 2D supervisions for\nimproved 4D head avatar creation.\n","authors":["Yu Deng","Duomin Wang","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2403.13570v1.pdf","comment":"Project page: https://yudeng.github.io/Portrait4D-v2/"},{"id":"http://arxiv.org/abs/2312.02696v2","updated":"2024-03-20T12:58:14Z","published":"2023-12-05T11:55:47Z","title":"Analyzing and Improving the Training Dynamics of Diffusion Models","summary":" Diffusion models currently dominate the field of data-driven image synthesis\nwith their unparalleled scaling to large datasets. In this paper, we identify\nand rectify several causes for uneven and ineffective training in the popular\nADM diffusion model architecture, without altering its high-level structure.\nObserving uncontrolled magnitude changes and imbalances in both the network\nactivations and weights over the course of training, we redesign the network\nlayers to preserve activation, weight, and update magnitudes on expectation. We\nfind that systematic application of this philosophy eliminates the observed\ndrifts and imbalances, resulting in considerably better networks at equal\ncomputational complexity. Our modifications improve the previous record FID of\n2.41 in ImageNet-512 synthesis to 1.81, achieved using fast deterministic\nsampling.\n As an independent contribution, we present a method for setting the\nexponential moving average (EMA) parameters post-hoc, i.e., after completing\nthe training run. 
This allows precise tuning of EMA length without the cost of\nperforming several training runs, and reveals its surprising interactions with\nnetwork architecture, training time, and guidance.\n","authors":["Tero Karras","Miika Aittala","Jaakko Lehtinen","Janne Hellsten","Timo Aila","Samuli Laine"],"pdf_url":"https://arxiv.org/pdf/2312.02696v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13556v1","updated":"2024-03-20T12:51:30Z","published":"2024-03-20T12:51:30Z","title":"Find n' Propagate: Open-Vocabulary 3D Object Detection in Urban\n Environments","summary":" In this work, we tackle the limitations of current LiDAR-based 3D object\ndetection systems, which are hindered by a restricted class vocabulary and the\nhigh costs associated with annotating new object classes. Our exploration of\nopen-vocabulary (OV) learning in urban environments aims to capture novel\ninstances using pre-trained vision-language models (VLMs) with multi-sensor\ndata. We design and benchmark a set of four potential solutions as baselines,\ncategorizing them into either top-down or bottom-up approaches based on their\ninput data strategies. While effective, these methods exhibit certain\nlimitations, such as missing novel objects in 3D box estimation or applying\nrigorous priors, leading to biases towards objects near the camera or of\nrectangular geometries. To overcome these limitations, we introduce a universal\n\\textsc{Find n' Propagate} approach for 3D OV tasks, aimed at maximizing the\nrecall of novel objects and propagating this detection capability to more\ndistant areas thereby progressively capturing more. In particular, we utilize a\ngreedy box seeker to search against 3D novel boxes of varying orientations and\ndepth in each generated frustum and ensure the reliability of newly identified\nboxes by cross alignment and density ranker. Additionally, the inherent bias\ntowards camera-proximal objects is alleviated by the proposed remote simulator,\nwhich randomly diversifies pseudo-labeled novel instances in the self-training\nprocess, combined with the fusion of base samples in the memory bank. Extensive\nexperiments demonstrate a 53% improvement in novel recall across diverse OV\nsettings, VLMs, and 3D detectors. Notably, we achieve up to a 3.97-fold\nincrease in Average Precision (AP) for novel object classes. The source code is\nmade available in the supplementary material.\n","authors":["Djamahl Etchegaray","Zi Huang","Tatsuya Harada","Yadan Luo"],"pdf_url":"https://arxiv.org/pdf/2403.13556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13551v1","updated":"2024-03-20T12:40:32Z","published":"2024-03-20T12:40:32Z","title":"Ground-A-Score: Scaling Up the Score Distillation for Multi-Attribute\n Editing","summary":" Despite recent advancements in text-to-image diffusion models facilitating\nvarious image editing techniques, complex text prompts often lead to an\noversight of some requests due to a bottleneck in processing text information.\nTo tackle this challenge, we present Ground-A-Score, a simple yet powerful\nmodel-agnostic image editing method by incorporating grounding during score\ndistillation. This approach ensures a precise reflection of intricate prompt\nrequirements in the editing outcomes, taking into account the prior knowledge\nof the object locations within the image. Moreover, the selective application\nwith a new penalty coefficient and contrastive loss helps to precisely target\nediting areas while preserving the integrity of the objects in the source\nimage. 
Both qualitative assessments and quantitative analyses confirm that\nGround-A-Score successfully adheres to the intricate details of extended and\nmultifaceted prompts, ensuring high-quality outcomes that respect the original\nimage attributes.\n","authors":["Hangeol Chang","Jinho Chang","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2403.13551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09008v2","updated":"2024-03-20T12:39:52Z","published":"2023-12-11T09:53:12Z","title":"Style Injection in Diffusion: A Training-free Approach for Adapting\n Large-scale Diffusion Models for Style Transfer","summary":" Despite the impressive generative capabilities of diffusion models, existing\ndiffusion model-based style transfer methods require inference-stage\noptimization (e.g. fine-tuning or textual inversion of style) which is\ntime-consuming, or fails to leverage the generative ability of large-scale\ndiffusion models. To address these issues, we introduce a novel artistic style\ntransfer method based on a pre-trained large-scale diffusion model without any\noptimization. Specifically, we manipulate the features of self-attention layers\nas the way the cross-attention mechanism works; in the generation process,\nsubstituting the key and value of content with those of style image. This\napproach provides several desirable characteristics for style transfer\nincluding 1) preservation of content by transferring similar styles into\nsimilar image patches and 2) transfer of style based on similarity of local\ntexture (e.g. edge) between content and style images. Furthermore, we introduce\nquery preservation and attention temperature scaling to mitigate the issue of\ndisruption of original content, and initial latent Adaptive Instance\nNormalization (AdaIN) to deal with the disharmonious color (failure to transfer\nthe colors of style). Our experimental results demonstrate that our proposed\nmethod surpasses state-of-the-art methods in both conventional and\ndiffusion-based style transfer baselines.\n","authors":["Jiwoo Chung","Sangeek Hyun","Jae-Pil Heo"],"pdf_url":"https://arxiv.org/pdf/2312.09008v2.pdf","comment":"Accepted to CVPR 2024. Project page:\n https://jiwoogit.github.io/StyleID_site"},{"id":"http://arxiv.org/abs/2303.15263v4","updated":"2024-03-20T12:39:28Z","published":"2023-03-27T14:52:08Z","title":"Joint Person Identity, Gender and Age Estimation from Hand Images using\n Deep Multi-Task Representation Learning","summary":" In this paper, we propose a multi-task representation learning framework to\njointly estimate the identity, gender and age of individuals from their hand\nimages for the purpose of criminal investigations since the hand images are\noften the only available information in cases of serious crime such as sexual\nabuse. We investigate different up-to-date deep learning architectures and\ncompare their performance for joint estimation of identity, gender and age from\nhand images of perpetrators of serious crime. To simplify the age prediction,\nwe create age groups for the age estimation. We make extensive evaluations and\ncomparisons of both convolution-based and transformer-based deep learning\narchitectures on a publicly available 11k hands dataset. 
Our experimental\nanalysis shows that it is possible to efficiently estimate not only identity\nbut also other attributes such as gender and age of suspects jointly from hand\nimages for criminal investigations, which is crucial in assisting international\npolice forces in the court to identify and convict abusers.\n","authors":["Nathanael L. Baisa"],"pdf_url":"https://arxiv.org/pdf/2303.15263v4.pdf","comment":"arXiv admin note: text overlap with arXiv:2209.04821"},{"id":"http://arxiv.org/abs/2403.13548v1","updated":"2024-03-20T12:36:41Z","published":"2024-03-20T12:36:41Z","title":"Diversity-aware Channel Pruning for StyleGAN Compression","summary":" StyleGAN has shown remarkable performance in unconditional image generation.\nHowever, its high computational cost poses a significant challenge for\npractical applications. Although recent efforts have been made to compress\nStyleGAN while preserving its performance, existing compressed models still lag\nbehind the original model, particularly in terms of sample diversity. To\novercome this, we propose a novel channel pruning method that leverages varying\nsensitivities of channels to latent vectors, which is a key factor in sample\ndiversity. Specifically, by assessing channel importance based on their\nsensitivities to latent vector perturbations, our method enhances the diversity\nof samples in the compressed model. Since our method solely focuses on the\nchannel pruning stage, it has complementary benefits with prior training\nschemes without additional training cost. Extensive experiments demonstrate\nthat our method significantly enhances sample diversity across various\ndatasets. Moreover, in terms of FID scores, our method not only surpasses\nstate-of-the-art by a large margin but also achieves comparable scores with\nonly half training iterations.\n","authors":["Jiwoo Chung","Sangeek Hyun","Sang-Heon Shim","Jae-Pil Heo"],"pdf_url":"https://arxiv.org/pdf/2403.13548v1.pdf","comment":"Accepted to CVPR 2024. Project page:\n https://jiwoogit.github.io/DCP-GAN_site"},{"id":"http://arxiv.org/abs/2403.13545v1","updated":"2024-03-20T12:31:13Z","published":"2024-03-20T12:31:13Z","title":"Next day fire prediction via semantic segmentation","summary":" In this paper we present a deep learning pipeline for next day fire\nprediction. The next day fire prediction task consists in learning models that\nreceive as input the available information for an area up until a certain day,\nin order to predict the occurrence of fire for the next day. Starting from our\nprevious problem formulation as a binary classification task on instances\n(daily snapshots of each area) represented by tabular feature vectors, we\nreformulate the problem as a semantic segmentation task on images; there, each\npixel corresponds to a daily snapshot of an area, while its channels represent\nthe formerly tabular training features. 
We demonstrate that this problem\nformulation, built within a thorough pipeline achieves state of the art\nresults.\n","authors":["Konstantinos Alexis","Stella Girtsou","Alexis Apostolakis","Giorgos Giannopoulos","Charalampos Kontoes"],"pdf_url":"https://arxiv.org/pdf/2403.13545v1.pdf","comment":"Accepted in MACLEAN@ECML/PKDD 2023"},{"id":"http://arxiv.org/abs/2403.13537v1","updated":"2024-03-20T12:14:54Z","published":"2024-03-20T12:14:54Z","title":"What explains the success of cross-modal fine-tuning with ORCA?","summary":" ORCA (Shen et al., 2023) is a recent technique for cross-modal fine-tuning,\ni.e., applying pre-trained transformer models to modalities beyond their\ntraining data. The technique consists primarily of training an embedder and\nfine-tuning the embedder and model. Despite its high performance on a variety\nof downstream tasks, we do not understand precisely how each of these\ncomponents contribute to ORCA's success. Therefore, we run a series of\nablations and find that embedder training does not help 2D tasks at all,\ncontrary to what the original paper posits. In 1D tasks, some amount of\nembedder training is necessary but more is not better. In 4 out of 6 datasets\nwe experiment with, it is model fine-tuning that makes the biggest difference.\nThrough our ablations and baselines, we contribute a better understanding of\nthe individual components of ORCA.\n","authors":["Paloma García-de-Herreros","Vagrant Gautam","Philipp Slusallek","Dietrich Klakow","Marius Mosbach"],"pdf_url":"https://arxiv.org/pdf/2403.13537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13535v1","updated":"2024-03-20T12:13:04Z","published":"2024-03-20T12:13:04Z","title":"IDAdapter: Learning Mixed Features for Tuning-Free Personalization of\n Text-to-Image Models","summary":" Leveraging Stable Diffusion for the generation of personalized portraits has\nemerged as a powerful and noteworthy tool, enabling users to create\nhigh-fidelity, custom character avatars based on their specific prompts.\nHowever, existing personalization methods face challenges, including test-time\nfine-tuning, the requirement of multiple input images, low preservation of\nidentity, and limited diversity in generated outcomes. To overcome these\nchallenges, we introduce IDAdapter, a tuning-free approach that enhances the\ndiversity and identity preservation in personalized image generation from a\nsingle face image. IDAdapter integrates a personalized concept into the\ngeneration process through a combination of textual and visual injections and a\nface identity loss. During the training phase, we incorporate mixed features\nfrom multiple reference images of a specific identity to enrich\nidentity-related content details, guiding the model to generate images with\nmore diverse styles, expressions, and angles compared to previous works.\nExtensive evaluations demonstrate the effectiveness of our method, achieving\nboth diversity and identity fidelity in generated images.\n","authors":["Siying Cui","Jiankang Deng","Jia Guo","Xiang An","Yongle Zhao","Xinyu Wei","Ziyong Feng"],"pdf_url":"https://arxiv.org/pdf/2403.13535v1.pdf","comment":"14 pages, 15 figures"},{"id":"http://arxiv.org/abs/2312.09031v2","updated":"2024-03-20T12:00:59Z","published":"2023-12-14T15:31:33Z","title":"iComMa: Inverting 3D Gaussian Splatting for Camera Pose Estimation via\n Comparing and Matching","summary":" We present a method named iComMa to address the 6D camera pose estimation\nproblem in computer vision. 
Conventional pose estimation methods typically rely\non the target's CAD model or necessitate specific network training tailored to\nparticular object classes. Some existing methods have achieved promising\nresults in mesh-free object and scene pose estimation by inverting the Neural\nRadiance Fields (NeRF). However, they still struggle with adverse\ninitializations such as large rotations and translations. To address this\nissue, we propose an efficient method for accurate camera pose estimation by\ninverting 3D Gaussian Splatting (3DGS). Specifically, a gradient-based\ndifferentiable framework optimizes camera pose by minimizing the residual\nbetween the query image and the rendered image, requiring no training. An\nend-to-end matching module is designed to enhance the model's robustness\nagainst adverse initializations, while minimizing pixel-level comparing loss\naids in precise pose estimation. Experimental results on synthetic and complex\nreal-world data demonstrate the effectiveness of the proposed approach in\nchallenging conditions and the accuracy of camera pose estimation.\n","authors":["Yuan Sun","Xuan Wang","Yunfan Zhang","Jie Zhang","Caigui Jiang","Yu Guo","Fei Wang"],"pdf_url":"https://arxiv.org/pdf/2312.09031v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11959v2","updated":"2024-03-20T11:58:23Z","published":"2024-03-18T16:56:47Z","title":"IVAC-P2L: Leveraging Irregular Repetition Priors for Improving Video\n Action Counting","summary":" Video Action Counting (VAC) is crucial in analyzing sports, fitness, and\neveryday activities by quantifying repetitive actions in videos. However,\ntraditional VAC methods have overlooked the complexity of action repetitions,\nsuch as interruptions and the variability in cycle duration. Our research\naddresses the shortfall by introducing a novel approach to VAC, called\nIrregular Video Action Counting (IVAC). IVAC prioritizes modeling irregular\nrepetition patterns in videos, which we define through two primary aspects:\nInter-cycle Consistency and Cycle-interval Inconsistency. Inter-cycle\nConsistency ensures homogeneity in the spatial-temporal representations of\ncycle segments, signifying action uniformity within cycles. Cycle-interval\ninconsistency highlights the importance of distinguishing between cycle\nsegments and intervals based on their inherent content differences. To\nencapsulate these principles, we propose a new methodology that includes\nconsistency and inconsistency modules, supported by a unique pull-push loss\n(P2L) mechanism. The IVAC-P2L model applies a pull loss to promote coherence\namong cycle segment features and a push loss to clearly distinguish features of\ncycle segments from interval segments. Empirical evaluations conducted on the\nRepCount dataset demonstrate that the IVAC-P2L model sets a new benchmark in\nVAC task performance. Furthermore, the model demonstrates exceptional\nadaptability and generalization across various video contents, outperforming\nexisting models on two additional datasets, UCFRep and Countix, without the\nneed for dataset-specific optimization. 
These results confirm the efficacy of\nour approach in addressing irregular repetitions in videos and pave the way for\nfurther advancements in video analysis and understanding.\n","authors":["Hang Wang","Zhi-Qi Cheng","Youtian Du","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11959v2.pdf","comment":"Source code: https://github.com/hwang-cs-ime/IVAC-P2L"},{"id":"http://arxiv.org/abs/2403.13524v1","updated":"2024-03-20T11:51:04Z","published":"2024-03-20T11:51:04Z","title":"Compress3D: a Compressed Latent Space for 3D Generation from a Single\n Image","summary":" 3D generation has witnessed significant advancements, yet efficiently\nproducing high-quality 3D assets from a single image remains challenging. In\nthis paper, we present a triplane autoencoder, which encodes 3D models into a\ncompact triplane latent space to effectively compress both the 3D geometry and\ntexture information. Within the autoencoder framework, we introduce a 3D-aware\ncross-attention mechanism, which utilizes low-resolution latent representations\nto query features from a high-resolution 3D feature volume, thereby enhancing\nthe representation capacity of the latent space. Subsequently, we train a\ndiffusion model on this refined latent space. In contrast to solely relying on\nimage embedding for 3D generation, our proposed method advocates for the\nsimultaneous utilization of both image embedding and shape embedding as\nconditions. Specifically, the shape embedding is estimated via a diffusion\nprior model conditioned on the image embedding. Through comprehensive\nexperiments, we demonstrate that our method outperforms state-of-the-art\nalgorithms, achieving superior performance while requiring less training data\nand time. Our approach enables the generation of high-quality 3D assets in\nmerely 7 seconds on a single A100 GPU.\n","authors":["Bowen Zhang","Tianyu Yang","Yu Li","Lei Zhang","Xi Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.13524v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13522v1","updated":"2024-03-20T11:48:10Z","published":"2024-03-20T11:48:10Z","title":"REAL: Representation Enhanced Analytic Learning for Exemplar-free\n Class-incremental Learning","summary":" Exemplar-free class-incremental learning (EFCIL) aims to mitigate\ncatastrophic forgetting in class-incremental learning without available\nhistorical data. Compared with its counterpart (replay-based CIL) that stores\nhistorical samples, the EFCIL suffers more from forgetting issues under the\nexemplar-free constraint. In this paper, inspired by the recently developed\nanalytic learning (AL) based CIL, we propose a representation enhanced analytic\nlearning (REAL) for EFCIL. The REAL constructs a dual-stream base pretraining\n(DS-BPT) and a representation enhancing distillation (RED) process to enhance\nthe representation of the extractor. The DS-BPT pretrains the model in streams of\nboth supervised learning and self-supervised contrastive learning (SSCL) for\nbase knowledge extraction. The RED process distills the supervised knowledge to\nthe SSCL pretrained backbone and facilitates a subsequent AL-based CIL that\nconverts the CIL to a recursive least-square problem. Our method addresses the\nissue of insufficient discriminability in representations of unseen data caused\nby a frozen backbone in the existing AL-based CIL. 
Empirical results on various\ndatasets including CIFAR-100, ImageNet-100 and ImageNet-1k, demonstrate that\nour REAL outperforms the state-of-the-arts in EFCIL, and achieves comparable or\neven more superior performance compared with the replay-based methods.\n","authors":["Run He","Huiping Zhuang","Di Fang","Yizhu Chen","Kai Tong","Cen Chen"],"pdf_url":"https://arxiv.org/pdf/2403.13522v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13518v1","updated":"2024-03-20T11:38:30Z","published":"2024-03-20T11:38:30Z","title":"Motion Generation from Fine-grained Textual Descriptions","summary":" The task of text2motion is to generate motion sequences from given textual\ndescriptions, where a model should explore the interactions between natural\nlanguage instructions and human body movements. While most existing works are\nconfined to coarse-grained motion descriptions (e.g., \"A man squats.\"),\nfine-grained ones specifying movements of relevant body parts are barely\nexplored. Models trained with coarse texts may not be able to learn mappings\nfrom fine-grained motion-related words to motion primitives, resulting in the\nfailure in generating motions from unseen descriptions. In this paper, we build\na large-scale language-motion dataset with fine-grained textual descriptions,\nFineHumanML3D, by feeding GPT-3.5-turbo with delicate prompts. Accordingly, we\ndesign a new text2motion model, FineMotionDiffuse, which makes full use of\nfine-grained textual information. Our experiments show that FineMotionDiffuse\ntrained on FineHumanML3D acquires good results in quantitative evaluation. We\nalso find this model can better generate spatially/chronologically composite\nmotions by learning the implicit mappings from simple descriptions to the\ncorresponding basic motions.\n","authors":["Kunhang Li","Yansong Feng"],"pdf_url":"https://arxiv.org/pdf/2403.13518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13513v1","updated":"2024-03-20T11:27:20Z","published":"2024-03-20T11:27:20Z","title":"What if...?: Counterfactual Inception to Mitigate Hallucination Effects\n in Large Multimodal Models","summary":" This paper presents a way of enhancing the reliability of Large Multimodal\nModels (LMMs) in addressing hallucination effects, where models generate\nincorrect or unrelated responses. Without additional instruction tuning\nparadigm, we introduce Counterfactual Inception, a novel method that implants\ncounterfactual thoughts into LMMs using carefully chosen, misaligned\ncounterfactual keywords. This method is grounded in the concept of\ncounterfactual thinking, a cognitive process where humans consider alternative\nrealities and outcomes. By applying this human-like reasoning mechanism to\nLMMs, we aim to reduce hallucination effects and improve the models'\ntrustworthiness. We also propose Dual-modality Verification Process (DVP), a\nrigorous framework for selecting optimal counterfactual keywords to trigger\ncounterfactual thinking into LMMs, concurrently considering visual and\nlinguistic context. 
Our extensive experiments across various LMMs, including\nboth open-source and proprietary models, corroborate that our method\nsignificantly mitigates hallucination phenomena across different datasets.\n","authors":["Junho Kim","Yeon Ju Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2403.13513v1.pdf","comment":"under review, code available:\n https://github.com/IVY-LVLM/Counterfactual-Inception"},{"id":"http://arxiv.org/abs/2403.13512v1","updated":"2024-03-20T11:21:22Z","published":"2024-03-20T11:21:22Z","title":"Scale Decoupled Distillation","summary":" Logit knowledge distillation attracts increasing attention due to its\npracticality in recent studies. However, it often suffers inferior performance\ncompared to the feature knowledge distillation. In this paper, we argue that\nexisting logit-based methods may be sub-optimal since they only leverage the\nglobal logit output that couples multiple semantic knowledge. This may transfer\nambiguous knowledge to the student and mislead its learning. To this end, we\npropose a simple but effective method, i.e., Scale Decoupled Distillation\n(SDD), for logit knowledge distillation. SDD decouples the global logit output\ninto multiple local logit outputs and establishes distillation pipelines for\nthem. This helps the student to mine and inherit fine-grained and unambiguous\nlogit knowledge. Moreover, the decoupled knowledge can be further divided into\nconsistent and complementary logit knowledge that transfers the semantic\ninformation and sample ambiguity, respectively. By increasing the weight of\ncomplementary parts, SDD can guide the student to focus more on ambiguous\nsamples, improving its discrimination ability. Extensive experiments on several\nbenchmark datasets demonstrate the effectiveness of SDD for wide\nteacher-student pairs, especially in the fine-grained classification task. Code\nis available at: https://github.com/shicaiwei123/SDD-CVPR2024\n","authors":["Shicai Wei","Chunbo Luo","Yang Luo"],"pdf_url":"https://arxiv.org/pdf/2403.13512v1.pdf","comment":"Accepted to CVPR 2024, 10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.13509v1","updated":"2024-03-20T11:12:57Z","published":"2024-03-20T11:12:57Z","title":"High-confidence pseudo-labels for domain adaptation in COVID-19\n detection","summary":" This paper outlines our submission for the 4th COV19D competition as part of\nthe `Domain adaptation, Explainability, Fairness in AI for Medical Image\nAnalysis' (DEF-AI-MIA) workshop at the Computer Vision and Pattern Recognition\nConference (CVPR). The competition consists of two challenges. The first is to\ntrain a classifier to detect the presence of COVID-19 from over one thousand CT\nscans from the COV19-CT-DB database. The second challenge is to perform domain\nadaptation by taking the dataset from Challenge 1 and adding a small number of\nscans (some annotated and others not) from a different distribution. We\npreprocessed the CT scans to segment the lungs, and output volumes with the\nlungs individually and together. We then trained 3D ResNet and Swin Transformer\nmodels on these inputs. We annotated the unlabeled CT scans using an ensemble\nof these models and chose the high-confidence predictions as pseudo-labels for\nfine-tuning. 
This resulted in a best cross-validation mean F1 score of 93.39\\%\nfor Challenge 1 and a mean F1 score of 92.15 for Challenge 2.\n","authors":["Robert Turnbull","Simon Mutch"],"pdf_url":"https://arxiv.org/pdf/2403.13509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13507v1","updated":"2024-03-20T11:05:07Z","published":"2024-03-20T11:05:07Z","title":"FMM-Attack: A Flow-based Multi-modal Adversarial Attack on Video-based\n LLMs","summary":" Despite the remarkable performance of video-based large language models\n(LLMs), their adversarial threat remains unexplored. To fill this gap, we\npropose the first adversarial attack tailored for video-based LLMs by crafting\nflow-based multi-modal adversarial perturbations on a small fraction of frames\nwithin a video, dubbed FMM-Attack. Extensive experiments show that our attack\ncan effectively induce video-based LLMs to generate incorrect answers when\nvideos are added with imperceptible adversarial perturbations. Intriguingly,\nour FMM-Attack can also induce garbling in the model output, prompting\nvideo-based LLMs to hallucinate. Overall, our observations inspire a further\nunderstanding of multi-modal robustness and safety-related feature alignment\nacross different modalities, which is of great importance for various large\nmulti-modal models. Our code is available at\nhttps://github.com/THU-Kingmin/FMM-Attack.\n","authors":["Jinmin Li","Kuofeng Gao","Yang Bai","Jingyun Zhang","Shu-tao Xia","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2403.13507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13501v1","updated":"2024-03-20T10:58:58Z","published":"2024-03-20T10:58:58Z","title":"VSTAR: Generative Temporal Nursing for Longer Dynamic Video Synthesis","summary":" Despite tremendous progress in the field of text-to-video (T2V) synthesis,\nopen-sourced T2V diffusion models struggle to generate longer videos with\ndynamically varying and evolving content. They tend to synthesize quasi-static\nvideos, ignoring the necessary visual change-over-time implied in the text\nprompt. At the same time, scaling these models to enable longer, more dynamic\nvideo synthesis often remains computationally intractable. To address this\nchallenge, we introduce the concept of Generative Temporal Nursing (GTN), where\nwe aim to alter the generative process on the fly during inference to improve\ncontrol over the temporal dynamics and enable generation of longer videos. We\npropose a method for GTN, dubbed VSTAR, which consists of two key ingredients:\n1) Video Synopsis Prompting (VSP) - automatic generation of a video synopsis\nbased on the original single prompt leveraging LLMs, which gives accurate\ntextual guidance to different visual states of longer videos, and 2) Temporal\nAttention Regularization (TAR) - a regularization technique to refine the\ntemporal attention units of the pre-trained T2V diffusion models, which enables\ncontrol over the video dynamics. We experimentally showcase the superiority of\nthe proposed approach in generating longer, visually appealing videos over\nexisting open-sourced T2V models. 
We additionally analyze the temporal\nattention maps realized with and without VSTAR, demonstrating the importance of\napplying our method to mitigate neglect of the desired visual change over time.\n","authors":["Yumeng Li","William Beluch","Margret Keuper","Dan Zhang","Anna Khoreva"],"pdf_url":"https://arxiv.org/pdf/2403.13501v1.pdf","comment":"Project page: https://yumengli007.github.io/VSTAR"},{"id":"http://arxiv.org/abs/2403.13499v1","updated":"2024-03-20T10:57:17Z","published":"2024-03-20T10:57:17Z","title":"Improved Baselines for Data-efficient Perceptual Augmentation of LLMs","summary":" The abilities of large language models (LLMs) have recently progressed to\nunprecedented levels, paving the way to novel applications in a wide variety of\nareas. In computer vision, LLMs can be used to prime vision-language tasks such\nimage captioning and visual question answering when coupled with pre-trained\nvision backbones. While different approaches have been explored to interface\nLLMs with ``perceptual backbones'' that process, e.g., visual or audio data,\nthey are often explored for different tasks, different datasets, and using\ndifferent perceptual backbones and language models, hindering direct comparison\nof the interfacing mechanisms. To remedy this lack of comparability between\nmethods, we present an extensive experimental evaluation of different\ninterfacing mechanisms, across multiple tasks (including image, video, and\naudio captioning as well as visual question answering), datasets and backbones,\npaying special attention to low-data settings. We find improved performance\nusing existing mechanisms over state-of-the-art results, and identify a new\ninterfacing mechanism that yields (near) optimal results across different\ntasks, while obtaining a 4x reduction in training time.\n","authors":["Théophane Vallaeys","Mustafa Shukor","Matthieu Cord","Jakob Verbeek"],"pdf_url":"https://arxiv.org/pdf/2403.13499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13480v1","updated":"2024-03-20T10:34:40Z","published":"2024-03-20T10:34:40Z","title":"A Unified Optimal Transport Framework for Cross-Modal Retrieval with\n Noisy Labels","summary":" Cross-modal retrieval (CMR) aims to establish interaction between different\nmodalities, among which supervised CMR is emerging due to its flexibility in\nlearning semantic category discrimination. Despite the remarkable performance\nof previous supervised CMR methods, much of their success can be attributed to\nthe well-annotated data. However, even for unimodal data, precise annotation is\nexpensive and time-consuming, and it becomes more challenging with the\nmultimodal scenario. In practice, massive multimodal data are collected from\nthe Internet with coarse annotation, which inevitably introduces noisy labels.\nTraining with such misleading labels would bring two key challenges --\nenforcing the multimodal samples to \\emph{align incorrect semantics} and\n\\emph{widen the heterogeneous gap}, resulting in poor retrieval performance. To\ntackle these challenges, this work proposes UOT-RCL, a Unified framework based\non Optimal Transport (OT) for Robust Cross-modal Retrieval. First, we propose a\nsemantic alignment based on partial OT to progressively correct the noisy\nlabels, where a novel cross-modal consistent cost function is designed to blend\ndifferent modalities and provide precise transport cost. 
Second, to narrow the\ndiscrepancy in multi-modal data, an OT-based relation alignment is proposed to\ninfer the semantic-level cross-modal matching. Both of these two components\nleverage the inherent correlation among multi-modal data to facilitate\neffective cost function. The experiments on three widely-used cross-modal\nretrieval datasets demonstrate that our UOT-RCL surpasses the state-of-the-art\napproaches and significantly improves the robustness against noisy labels.\n","authors":["Haochen Han","Minnan Luo","Huan Liu","Fang Nan"],"pdf_url":"https://arxiv.org/pdf/2403.13480v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2403.13479v1","updated":"2024-03-20T10:33:10Z","published":"2024-03-20T10:33:10Z","title":"Deepfake Detection without Deepfakes: Generalization via Synthetic\n Frequency Patterns Injection","summary":" Deepfake detectors are typically trained on large sets of pristine and\ngenerated images, resulting in limited generalization capacity; they excel at\nidentifying deepfakes created through methods encountered during training but\nstruggle with those generated by unknown techniques. This paper introduces a\nlearning approach aimed at significantly enhancing the generalization\ncapabilities of deepfake detectors. Our method takes inspiration from the\nunique \"fingerprints\" that image generation processes consistently introduce\ninto the frequency domain. These fingerprints manifest as structured and\ndistinctly recognizable frequency patterns. We propose to train detectors using\nonly pristine images injecting in part of them crafted frequency patterns,\nsimulating the effects of various deepfake generation techniques without being\nspecific to any. These synthetic patterns are based on generic shapes, grids,\nor auras. We evaluated our approach using diverse architectures across 25\ndifferent generation methods. The models trained with our approach were able to\nperform state-of-the-art deepfake detection, demonstrating also superior\ngeneralization capabilities in comparison with previous methods. Indeed, they\nare untied to any specific generation technique and can effectively identify\ndeepfakes regardless of how they were made.\n","authors":["Davide Alessandro Coccomini","Roberto Caldelli","Claudio Gennaro","Giuseppe Fiameni","Giuseppe Amato","Fabrizio Falchi"],"pdf_url":"https://arxiv.org/pdf/2403.13479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.11427v2","updated":"2024-03-20T10:23:49Z","published":"2023-02-22T15:07:29Z","title":"Enhanced Face Authentication With Separate Loss Functions","summary":" The overall objective of the main project is to propose and develop a system\nof facial authentication in unlocking phones or applications in phones using\nfacial recognition. The system will include four separate architectures: face\ndetection, face recognition, face spoofing, and classification of closed eyes.\nIn which, we consider the problem of face recognition to be the most important,\ndetermining the true identity of the person standing in front of the screen\nwith absolute accuracy is what facial recognition systems need to achieve.\nAlong with the development of the face recognition problem, the problem of the\nanti-fake face is also gradually becoming popular and equally important. Our\ngoal is to propose and develop two loss functions: LMCot and Double Loss. 
Then\napply them to the face authentication process.\n","authors":["Anh-Kiet Duong","Hoang-Lan Nguyen","Toan-Thinh Truong"],"pdf_url":"https://arxiv.org/pdf/2302.11427v2.pdf","comment":"in Vietnamese language"},{"id":"http://arxiv.org/abs/2403.13470v1","updated":"2024-03-20T10:19:05Z","published":"2024-03-20T10:19:05Z","title":"Scaling Diffusion Models to Real-World 3D LiDAR Scene Completion","summary":" Computer vision techniques play a central role in the perception stack of\nautonomous vehicles. Such methods are employed to perceive the vehicle\nsurroundings given sensor data. 3D LiDAR sensors are commonly used to collect\nsparse 3D point clouds from the scene. However, compared to human perception,\nsuch systems struggle to deduce the unseen parts of the scene given those\nsparse point clouds. In this matter, the scene completion task aims at\npredicting the gaps in the LiDAR measurements to achieve a more complete scene\nrepresentation. Given the promising results of recent diffusion models as\ngenerative models for images, we propose extending them to achieve scene\ncompletion from a single 3D LiDAR scan. Previous works used diffusion models\nover range images extracted from LiDAR data, directly applying image-based\ndiffusion methods. Distinctly, we propose to directly operate on the points,\nreformulating the noising and denoising diffusion process such that it can\nefficiently work at scene scale. Together with our approach, we propose a\nregularization loss to stabilize the noise predicted during the denoising\nprocess. Our experimental evaluation shows that our method can complete the\nscene given a single LiDAR scan as input, producing a scene with more details\ncompared to state-of-the-art scene completion methods. We believe that our\nproposed diffusion process formulation can support further research in\ndiffusion models applied to scene-scale point cloud data.\n","authors":["Lucas Nunes","Rodrigo Marcuzzi","Benedikt Mersch","Jens Behley","Cyrill Stachniss"],"pdf_url":"https://arxiv.org/pdf/2403.13470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13469v1","updated":"2024-03-20T10:18:20Z","published":"2024-03-20T10:18:20Z","title":"Progressive trajectory matching for medical dataset distillation","summary":" It is essential but challenging to share medical image datasets due to\nprivacy issues, which prohibit building foundation models and knowledge\ntransfer. In this paper, we propose a novel dataset distillation method to\ncondense the original medical image datasets into a synthetic one that\npreserves useful information for building an analysis model without accessing\nthe original datasets. Existing methods tackle only natural images by randomly\nmatching parts of the training trajectories of the model parameters trained by\nthe whole real datasets. However, through extensive experiments on medical\nimage datasets, the training process is extremely unstable and achieves\ninferior distillation results. To solve these barriers, we propose to design a\nnovel progressive trajectory matching strategy to improve the training\nstability for medical image dataset distillation. Additionally, it is observed\nthat improved stability prevents the synthetic dataset diversity and final\nperformance improvements. Therefore, we propose a dynamic overlap mitigation\nmodule that improves the synthetic dataset diversity by dynamically eliminating\nthe overlap across different images and retraining parts of the synthetic\nimages for better convergence. 
Finally, we propose a new medical image dataset\ndistillation benchmark of various modalities and configurations to promote fair\nevaluations. It is validated that our proposed method achieves 8.33%\nimprovement over previous state-of-the-art methods on average, and 11.7%\nimprovement when ipc=2 (i.e., image per class is 2). Codes and benchmarks will\nbe released.\n","authors":["Zhen Yu","Yang Liu","Qingchao Chen"],"pdf_url":"https://arxiv.org/pdf/2403.13469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13467v1","updated":"2024-03-20T10:17:39Z","published":"2024-03-20T10:17:39Z","title":"CLIPSwarm: Generating Drone Shows from Text Prompts with Vision-Language\n Models","summary":" This paper introduces CLIPSwarm, a new algorithm designed to automate the\nmodeling of swarm drone formations based on natural language. The algorithm\nbegins by enriching a provided word, to compose a text prompt that serves as\ninput to an iterative approach to find the formation that best matches the\nprovided word. The algorithm iteratively refines formations of robots to align\nwith the textual description, employing different steps for \"exploration\" and\n\"exploitation\". Our framework is currently evaluated on simple formation\ntargets, limited to contour shapes. A formation is visually represented through\nalpha-shape contours and the most representative color is automatically found\nfor the input word. To measure the similarity between the description and the\nvisual representation of the formation, we use CLIP [1], encoding text and\nimages into vectors and assessing their similarity. Subsequently, the algorithm\nrearranges the formation to visually represent the word more effectively,\nwithin the given constraints of available drones. Control actions are then\nassigned to the drones, ensuring robotic behavior and collision-free movement.\nExperimental results demonstrate the system's efficacy in accurately modeling\nrobot formations from natural language descriptions. The algorithm's\nversatility is showcased through the execution of drone shows in photorealistic\nsimulation with varying shapes. We refer the reader to the supplementary video\nfor a visual reference of the results.\n","authors":["Pablo Pueyo","Eduardo Montijano","Ana C. Murillo","Mac Schwager"],"pdf_url":"https://arxiv.org/pdf/2403.13467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13466v1","updated":"2024-03-20T10:16:40Z","published":"2024-03-20T10:16:40Z","title":"An AI-Assisted Skincare Routine Recommendation System in XR","summary":" In recent years, there has been an increasing interest in the use of\nartificial intelligence (AI) and extended reality (XR) in the beauty industry.\nIn this paper, we present an AI-assisted skin care recommendation system\nintegrated into an XR platform. The system uses a convolutional neural network\n(CNN) to analyse an individual's skin type and recommend personalised skin care\nproducts in an immersive and interactive manner. Our methodology involves\ncollecting data from individuals through a questionnaire and conducting skin\nanalysis using a provided facial image in an immersive environment. This data\nis then used to train the CNN model, which recognises the skin type and\nexisting issues and allows the recommendation engine to suggest personalised\nskin care products. We evaluate our system in terms of the accuracy of the CNN\nmodel, which achieves an average score of 93% in correctly classifying existing\nskin issues. 
Being integrated into an XR system, this approach has the\npotential to significantly enhance the beauty industry by providing immersive\nand engaging experiences to users, leading to more efficient and consistent\nskincare routines.\n","authors":["Gowravi Malalur Rajegowda","Yannis Spyridis","Barbara Villarini","Vasileios Argyriou"],"pdf_url":"https://arxiv.org/pdf/2403.13466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09380v2","updated":"2024-03-20T10:09:42Z","published":"2024-03-14T13:31:56Z","title":"Impact of Synthetic Images on Morphing Attack Detection Using a Siamese\n Network","summary":" This paper evaluated the impact of synthetic images on Morphing Attack\nDetection (MAD) using a Siamese network with a semi-hard-loss function. Intra\nand cross-dataset evaluations were performed to measure synthetic image\ngeneralisation capabilities using a cross-dataset for evaluation. Three\ndifferent pre-trained networks were used as feature extractors from traditional\nMobileNetV2, MobileNetV3 and EfficientNetB0. Our results show that MAD trained\non EfficientNetB0 from FERET, FRGCv2, and FRLL can reach a lower error rate in\ncomparison with SOTA. Conversely, worse performances were reached when the\nsystem was trained only with synthetic images. A mixed approach (synthetic +\ndigital) database may help to improve MAD and reduce the error rate. This fact\nshows that we still need to keep going with our efforts to include synthetic\nimages in the training process.\n","authors":["Juan Tapia","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2403.09380v2.pdf","comment":"Arxiv version of CIARP2023 - fixed typo errors"},{"id":"http://arxiv.org/abs/2311.13261v2","updated":"2024-03-20T10:06:09Z","published":"2023-11-22T09:25:08Z","title":"Immunohistochemistry guided segmentation of benign epithelial cells, in\n situ lesions, and invasive epithelial cells in breast cancer slides","summary":" Digital pathology enables automatic analysis of histopathological sections\nusing artificial intelligence (AI). Automatic evaluation could improve\ndiagnostic efficiency and help find associations between morphological features\nand clinical outcome. For development of such prediction models, identifying\ninvasive epithelial cells, and separating these from benign epithelial cells\nand in situ lesions would be the first step. In this study, we aimed to develop\nan AI model for segmentation of epithelial cells in sections from breast\ncancer. We generated epithelial ground truth masks by restaining hematoxylin\nand eosin (HE) sections with cytokeratin (CK) AE1/AE3, and by pathologists'\nannotations. HE/CK image pairs were used to train a convolutional neural\nnetwork, and data augmentation was used to make the model more robust. Tissue\nmicroarrays (TMAs) from 839 patients, and whole slide images from two patients\nwere used for training and evaluation of the models. The sections were derived\nfrom four cohorts of breast cancer patients. TMAs from 21 patients from a fifth\ncohort was used as a second test set. In quantitative evaluation, a mean Dice\nscore of 0.70, 0.79, and 0.75 for invasive epithelial cells, benign epithelial\ncells, and in situ lesions, respectively, were achieved. In qualitative scoring\n(0-5) by pathologists, results were best for all epithelium and invasive\nepithelium, with scores of 4.7 and 4.4. Scores for benign epithelium and in\nsitu lesions were 3.7 and 2.0. 
The proposed model segmented epithelial cells in\nHE stained breast cancer slides well, but further work is needed for accurate\ndivision between the classes. Immunohistochemistry, together with pathologists'\nannotations, enabled the creation of accurate ground truths. The model is made\nfreely available in FastPathology and the code is available at\nhttps://github.com/AICAN-Research/breast-epithelium-segmentation\n","authors":["Maren Høibø","André Pedersen","Vibeke Grotnes Dale","Sissel Marie Berget","Borgny Ytterhus","Cecilia Lindskog","Elisabeth Wik","Lars A. Akslen","Ingerid Reinertsen","Erik Smistad","Marit Valla"],"pdf_url":"https://arxiv.org/pdf/2311.13261v2.pdf","comment":"19 pages, 6 figures. Submitted to a scientific journal"},{"id":"http://arxiv.org/abs/2403.06225v2","updated":"2024-03-20T10:05:02Z","published":"2024-03-10T14:11:25Z","title":"MoST: Motion Style Transformer between Diverse Action Contents","summary":" While existing motion style transfer methods are effective between two\nmotions with identical content, their performance significantly diminishes when\ntransferring style between motions with different contents. This challenge lies\nin the lack of clear separation between content and style of a motion. To\ntackle this challenge, we propose a novel motion style transformer that\neffectively disentangles style from content and generates a plausible motion\nwith transferred style from a source motion. Our distinctive approach to\nachieving the goal of disentanglement is twofold: (1) a new architecture for\nmotion style transformer with `part-attentive style modulator across body\nparts' and `Siamese encoders that encode style and content features\nseparately'; (2) style disentanglement loss. Our method outperforms existing\nmethods and demonstrates exceptionally high quality, particularly in motion\npairs with different contents, without the need for heuristic post-processing.\nCodes are available at https://github.com/Boeun-Kim/MoST.\n","authors":["Boeun Kim","Jungho Kim","Hyung Jin Chang","Jin Young Choi"],"pdf_url":"https://arxiv.org/pdf/2403.06225v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2305.11488v2","updated":"2024-03-20T09:44:50Z","published":"2023-05-19T07:39:17Z","title":"AttriCLIP: A Non-Incremental Learner for Incremental Knowledge Learning","summary":" Continual learning aims to enable a model to incrementally learn knowledge\nfrom sequentially arrived data. Previous works adopt the conventional\nclassification architecture, which consists of a feature extractor and a\nclassifier. The feature extractor is shared across sequentially arrived tasks\nor classes, but one specific group of weights of the classifier corresponding\nto one new class should be incrementally expanded. Consequently, the parameters\nof a continual learner gradually increase. Moreover, as the classifier contains\nall historical arrived classes, a certain size of the memory is usually\nrequired to store rehearsal data to mitigate classifier bias and catastrophic\nforgetting. In this paper, we propose a non-incremental learner, named\nAttriCLIP, to incrementally extract knowledge of new classes or tasks.\nSpecifically, AttriCLIP is built upon the pre-trained visual-language model\nCLIP. Its image encoder and text encoder are fixed to extract features from\nboth images and text. Text consists of a category name and a fixed number of\nlearnable parameters which are selected from our designed attribute word bank\nand serve as attributes. 
As we compute the visual and textual similarity for\nclassification, AttriCLIP is a non-incremental learner. The attribute prompts,\nwhich encode the common knowledge useful for classification, can effectively\nmitigate the catastrophic forgetting and avoid constructing a replay memory. We\nevaluate our AttriCLIP and compare it with CLIP-based and previous\nstate-of-the-art continual learning methods in realistic settings with\ndomain-shift and long-sequence learning. The results show that our method\nperforms favorably against previous state-of-the-arts. The implementation code\ncan be available at https://github.com/bhrqw/AttriCLIP.\n","authors":["Runqi Wang","Xiaoyue Duan","Guoliang Kang","Jianzhuang Liu","Shaohui Lin","Songcen Xu","Jinhu Lv","Baochang Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.11488v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13447v1","updated":"2024-03-20T09:42:43Z","published":"2024-03-20T09:42:43Z","title":"HyperLLaVA: Dynamic Visual and Language Expert Tuning for Multimodal\n Large Language Models","summary":" Recent advancements indicate that scaling up Multimodal Large Language Models\n(MLLMs) effectively enhances performance on downstream multimodal tasks. The\nprevailing MLLM paradigm, \\emph{e.g.}, LLaVA, transforms visual features into\ntext-like tokens using a \\emph{static} vision-language mapper, thereby enabling\n\\emph{static} LLMs to develop the capability to comprehend visual information\nthrough visual instruction tuning. Although promising, the \\emph{static} tuning\nstrategy~\\footnote{The static tuning refers to the trained model with static\nparameters.} that shares the same parameters may constrain performance across\ndifferent downstream multimodal tasks. In light of this, we introduce\nHyperLLaVA, which involves adaptive tuning of the projector and LLM parameters,\nin conjunction with a dynamic visual expert and language expert, respectively.\nThese experts are derived from HyperNetworks, which generates adaptive\nparameter shifts through visual and language guidance, enabling dynamic\nprojector and LLM modeling in two-stage training.\n Our experiments demonstrate that our solution significantly surpasses LLaVA\non existing MLLM benchmarks, including MME, MMBench, SEED-Bench, and\nLLaVA-Bench. ~\\footnote{Our project is available on the link\nhttps://github.com/DCDmllm/HyperLLaVA}.\n","authors":["Wenqiao Zhang","Tianwei Lin","Jiang Liu","Fangxun Shu","Haoyuan Li","Lei Zhang","He Wanggui","Hao Zhou","Zheqi Lv","Hao Jiang","Juncheng Li","Siliang Tang","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2403.13447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13444v1","updated":"2024-03-20T09:40:11Z","published":"2024-03-20T09:40:11Z","title":"MedCycle: Unpaired Medical Report Generation via Cycle-Consistency","summary":" Generating medical reports for X-ray images presents a significant challenge,\nparticularly in unpaired scenarios where access to paired image-report data for\ntraining is unavailable. Previous works have typically learned a joint\nembedding space for images and reports, necessitating a specific labeling\nschema for both. We introduce an innovative approach that eliminates the need\nfor consistent labeling schemas, thereby enhancing data accessibility and\nenabling the use of incompatible datasets. 
This approach is based on\ncycle-consistent mapping functions that transform image embeddings into report\nembeddings, coupled with report auto-encoding for medical report generation.\nOur model and objectives consider intricate local details and the overarching\nsemantic context within images and reports. This approach facilitates the\nlearning of effective mapping functions, resulting in the generation of\ncoherent reports. It outperforms state-of-the-art results in unpaired chest\nX-ray report generation, demonstrating improvements in both language and\nclinical metrics.\n","authors":["Elad Hirsch","Gefen Dawidowicz","Ayellet Tal"],"pdf_url":"https://arxiv.org/pdf/2403.13444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13443v1","updated":"2024-03-20T09:39:39Z","published":"2024-03-20T09:39:39Z","title":"Fast-Poly: A Fast Polyhedral Framework For 3D Multi-Object Tracking","summary":" 3D Multi-Object Tracking (MOT) captures stable and comprehensive motion\nstates of surrounding obstacles, essential for robotic perception. However,\ncurrent 3D trackers face issues with accuracy and latency consistency. In this\npaper, we propose Fast-Poly, a fast and effective filter-based method for 3D\nMOT. Building upon our previous work Poly-MOT, Fast-Poly addresses object\nrotational anisotropy in 3D space, enhances local computation densification,\nand leverages parallelization technique, improving inference speed and\nprecision. Fast-Poly is extensively tested on two large-scale tracking\nbenchmarks with Python implementation. On the nuScenes dataset, Fast-Poly\nachieves new state-of-the-art performance with 75.8% AMOTA among all methods\nand can run at 34.2 FPS on a personal CPU. On the Waymo dataset, Fast-Poly\nexhibits competitive accuracy with 63.6% MOTA and impressive inference speed\n(35.5 FPS). The source code is publicly available at\nhttps://github.com/lixiaoyu2000/FastPoly.\n","authors":["Xiaoyu Li","Dedong Liu","Lijun Zhao","Yitao Wu","Xian Wu","Jinghan Gao"],"pdf_url":"https://arxiv.org/pdf/2403.13443v1.pdf","comment":"1st on the NuScenes Tracking benchmark with 75.8 AMOTA and 34.2 FPS"},{"id":"http://arxiv.org/abs/2403.13439v1","updated":"2024-03-20T09:27:49Z","published":"2024-03-20T09:27:49Z","title":"Stochastic Geometry Models for Texture Synthesis of Machined Metallic\n Surfaces: Sandblasting and Milling","summary":" Training defect detection algorithms for visual surface inspection systems\nrequires a large and representative set of training data. Often there is not\nenough real data available which additionally cannot cover the variety of\npossible defects. Synthetic data generated by a synthetic visual surface\ninspection environment can overcome this problem. Therefore, a digital twin of\nthe object is needed, whose micro-scale surface topography is modeled by\ntexture synthesis models. We develop stochastic texture models for sandblasted\nand milled surfaces based on topography measurements of such surfaces. As the\nsurface patterns differ significantly, we use separate modeling approaches for\nthe two cases. Sandblasted surfaces are modeled by a combination of data-based\ntexture synthesis methods that rely entirely on the measurements. 
In contrast,\nthe model for milled surfaces is procedural and includes all process-related\nparameters known from the machine settings.\n","authors":["Natascha Jeziorski","Claudia Redenbach"],"pdf_url":"https://arxiv.org/pdf/2403.13439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13434v1","updated":"2024-03-20T09:22:22Z","published":"2024-03-20T09:22:22Z","title":"Advancing 6D Pose Estimation in Augmented Reality -- Overcoming\n Projection Ambiguity with Uncontrolled Imagery","summary":" This study addresses the challenge of accurate 6D pose estimation in\nAugmented Reality (AR), a critical component for seamlessly integrating virtual\nobjects into real-world environments. Our research primarily addresses the\ndifficulty of estimating 6D poses from uncontrolled RGB images, a common\nscenario in AR applications, which lacks metadata such as focal length. We\npropose a novel approach that strategically decomposes the estimation of z-axis\ntranslation and focal length, leveraging the neural-render and compare strategy\ninherent in the FocalPose architecture. This methodology not only streamlines\nthe 6D pose estimation process but also significantly enhances the accuracy of\n3D object overlaying in AR settings. Our experimental results demonstrate a\nmarked improvement in 6D pose estimation accuracy, with promising applications\nin manufacturing and robotics. Here, the precise overlay of AR visualizations\nand the advancement of robotic vision systems stand to benefit substantially\nfrom our findings.\n","authors":["Mayura Manawadu","Sieun Park","Soon-Yong Park"],"pdf_url":"https://arxiv.org/pdf/2403.13434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13430v1","updated":"2024-03-20T09:17:22Z","published":"2024-03-20T09:17:22Z","title":"MTP: Advancing Remote Sensing Foundation Model via Multi-Task\n Pretraining","summary":" Foundation models have reshaped the landscape of Remote Sensing (RS) by\nenhancing various image interpretation tasks. Pretraining is an active research\ntopic, encompassing supervised and self-supervised learning methods to\ninitialize model weights effectively. However, transferring the pretrained\nmodels to downstream tasks may encounter task discrepancy due to their\nformulation of pretraining as image classification or object discrimination\ntasks. In this study, we explore the Multi-Task Pretraining (MTP) paradigm for\nRS foundation models to address this issue. Using a shared encoder and\ntask-specific decoder architecture, we conduct multi-task supervised\npretraining on the SAMRS dataset, encompassing semantic segmentation, instance\nsegmentation, and rotated object detection. MTP supports both convolutional\nneural networks and vision transformer foundation models with over 300 million\nparameters. The pretrained models are finetuned on various RS downstream tasks,\nsuch as scene classification, horizontal and rotated object detection, semantic\nsegmentation, and change detection. 
Extensive experiments across 14 datasets\ndemonstrate the superiority of our models over existing ones of similar size\nand their competitive performance compared to larger state-of-the-art models,\nthus validating the effectiveness of MTP.\n","authors":["Di Wang","Jing Zhang","Minqiang Xu","Lin Liu","Dongsheng Wang","Erzhong Gao","Chengxi Han","Haonan Guo","Bo Du","Dacheng Tao","Liangpei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.13430v1.pdf","comment":"The codes and pretrained models will be released at\n https://github.com/ViTAE-Transformer/MTP"},{"id":"http://arxiv.org/abs/2312.04530v2","updated":"2024-03-20T09:12:21Z","published":"2023-12-07T18:50:01Z","title":"Camera Height Doesn't Change: Unsupervised Training for Metric Monocular\n Road-Scene Depth Estimation","summary":" In this paper, we introduce a novel training method for making any monocular\ndepth network learn absolute scale and estimate metric road-scene depth just\nfrom regular training data, i.e., driving videos. We refer to this training\nframework as StableCamH. The key idea is to leverage cars found on the road as\nsources of scale supervision but to incorporate them in the training robustly.\nStableCamH detects and estimates the sizes of cars in the frame and aggregates\nscale information extracted from them into a camera height estimate whose\nconsistency across the entire video sequence is enforced as scale supervision.\nThis realizes robust unsupervised training of any, otherwise scale-oblivious,\nmonocular depth network to become not only scale-aware but also metric-accurate\nwithout the need for auxiliary sensors and extra supervision. Extensive\nexperiments on the KITTI and Cityscapes datasets show the effectiveness of\nStableCamH and its state-of-the-art accuracy compared with related methods. We\nalso show that StableCamH enables training on mixed datasets of different\ncamera heights, which leads to larger-scale training and thus higher\ngeneralization. Metric depth reconstruction is essential in any road-scene\nvisual modeling, and StableCamH democratizes its deployment by establishing the\nmeans to train any model as a metric depth estimator.\n","authors":["Genki Kinoshita","Ko Nishino"],"pdf_url":"https://arxiv.org/pdf/2312.04530v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13417v1","updated":"2024-03-20T09:00:19Z","published":"2024-03-20T09:00:19Z","title":"Diversified and Personalized Multi-rater Medical Image Segmentation","summary":" Annotation ambiguity due to inherent data uncertainties such as blurred\nboundaries in medical scans and different observer expertise and preferences\nhas become a major obstacle for training deep-learning based medical image\nsegmentation models. To address it, the common practice is to gather multiple\nannotations from different experts, leading to the setting of multi-rater\nmedical image segmentation. Existing works aim to either merge different\nannotations into the \"groundtruth\" that is often unattainable in numerous\nmedical contexts, or generate diverse results, or produce personalized results\ncorresponding to individual expert raters. Here, we bring up a more ambitious\ngoal for multi-rater medical image segmentation, i.e., obtaining both\ndiversified and personalized results. Specifically, we propose a two-stage\nframework named D-Persona (first Diversification and then Personalization). In\nStage I, we exploit multiple given annotations to train a Probabilistic U-Net\nmodel, with a bound-constrained loss to improve the prediction diversity. 
In\nthis way, a common latent space is constructed in Stage I, where different\nlatent codes denote diversified expert opinions. Then, in Stage II, we design\nmultiple attention-based projection heads to adaptively query the corresponding\nexpert prompts from the shared latent space, and then perform the personalized\nmedical image segmentation. We evaluated the proposed model on our in-house\nNasopharyngeal Carcinoma dataset and the public lung nodule dataset (i.e.,\nLIDC-IDRI). Extensive experiments demonstrated our D-Persona can provide\ndiversified and personalized results at the same time, achieving new SOTA\nperformance for multi-rater medical image segmentation. Our code will be\nreleased at https://github.com/ycwu1997/D-Persona.\n","authors":["Yicheng Wu","Xiangde Luo","Zhe Xu","Xiaoqing Guo","Lie Ju","Zongyuan Ge","Wenjun Liao","Jianfei Cai"],"pdf_url":"https://arxiv.org/pdf/2403.13417v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.13412v1","updated":"2024-03-20T08:53:56Z","published":"2024-03-20T08:53:56Z","title":"Cell Tracking in C. elegans with Cell Position Heatmap-Based Alignment\n and Pairwise Detection","summary":" 3D cell tracking in a living organism has a crucial role in live cell image\nanalysis. Cell tracking in C. elegans has two difficulties. First, cell\nmigration in a consecutive frame is large since they move their head during\nscanning. Second, cell detection is often inconsistent in consecutive frames\ndue to touching cells and low-contrast images, and these inconsistent\ndetections affect the tracking performance worse. In this paper, we propose a\ncell tracking method to address these issues, which has two main contributions.\nFirst, we introduce cell position heatmap-based non-rigid alignment with\ntest-time fine-tuning, which can warp the detected points to near the positions\nat the next frame. Second, we propose a pairwise detection method, which uses\nthe information of detection results at the previous frame for detecting cells\nat the current frame. The experimental results demonstrate the effectiveness of\neach module, and the proposed method achieved the best performance in\ncomparison.\n","authors":["Kaito Shiku","Hiromitsu Shirai","Takeshi Ishihara","Ryoma Bise"],"pdf_url":"https://arxiv.org/pdf/2403.13412v1.pdf","comment":"4 pages, 5 figures, Accepted in EMBC 2023"},{"id":"http://arxiv.org/abs/2403.13408v1","updated":"2024-03-20T08:50:15Z","published":"2024-03-20T08:50:15Z","title":"S2DM: Sector-Shaped Diffusion Models for Video Generation","summary":" Diffusion models have achieved great success in image generation. However,\nwhen leveraging this idea for video generation, we face significant challenges\nin maintaining the consistency and continuity across video frames. This is\nmainly caused by the lack of an effective framework to align frames of videos\nwith desired temporal features while preserving consistent semantic and\nstochastic features. In this work, we propose a novel Sector-Shaped Diffusion\nModel (S2DM) whose sector-shaped diffusion region is formed by a set of\nray-shaped reverse diffusion processes starting at the same noise point. S2DM\ncan generate a group of intrinsically related data sharing the same semantic\nand stochastic features while varying on temporal features with appropriate\nguided conditions. We apply S2DM to video generation tasks, and explore the use\nof optical flow as temporal conditions. 
Our experimental results show that S2DM\noutperforms many existing methods in the task of video generation without any\ntemporal-feature modelling modules. For text-to-video generation tasks where\ntemporal conditions are not explicitly given, we propose a two-stage generation\nstrategy which can decouple the generation of temporal features from\nsemantic-content features. We show that, without additional training, our model\nintegrated with another temporal conditions generative model can still achieve\ncomparable performance with existing works. Our results can be viewd at\nhttps://s2dm.github.io/S2DM/.\n","authors":["Haoran Lang","Yuxuan Ge","Zheng Tian"],"pdf_url":"https://arxiv.org/pdf/2403.13408v1.pdf","comment":"17 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.13405v1","updated":"2024-03-20T08:47:51Z","published":"2024-03-20T08:47:51Z","title":"DOR3D-Net: Dense Ordinal Regression Network for 3D Hand Pose Estimation","summary":" Depth-based 3D hand pose estimation is an important but challenging research\ntask in human-machine interaction community. Recently, dense regression methods\nhave attracted increasing attention in 3D hand pose estimation task, which\nprovide a low computational burden and high accuracy regression way by densely\nregressing hand joint offset maps. However, large-scale regression offset\nvalues are often affected by noise and outliers, leading to a significant drop\nin accuracy. To tackle this, we re-formulate 3D hand pose estimation as a dense\nordinal regression problem and propose a novel Dense Ordinal Regression 3D Pose\nNetwork (DOR3D-Net). Specifically, we first decompose offset value regression\ninto sub-tasks of binary classifications with ordinal constraints. Then, each\nbinary classifier can predict the probability of a binary spatial relationship\nrelative to joint, which is easier to train and yield much lower level of\nnoise. The estimated hand joint positions are inferred by aggregating the\nordinal regression results at local positions with a weighted sum. Furthermore,\nboth joint regression loss and ordinal regression loss are used to train our\nDOR3D-Net in an end-to-end manner. Extensive experiments on public datasets\n(ICVL, MSRA, NYU and HANDS2017) show that our design provides significant\nimprovements over SOTA methods.\n","authors":["Yamin Mao","Zhihua Liu","Weiming Li","SoonYong Cho","Qiang Wang","Xiaoshuai Hao"],"pdf_url":"https://arxiv.org/pdf/2403.13405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13395v1","updated":"2024-03-20T08:35:57Z","published":"2024-03-20T08:35:57Z","title":"Unifying Local and Global Multimodal Features for Place Recognition in\n Aliased and Low-Texture Environments","summary":" Perceptual aliasing and weak textures pose significant challenges to the task\nof place recognition, hindering the performance of Simultaneous Localization\nand Mapping (SLAM) systems. This paper presents a novel model, called UMF\n(standing for Unifying Local and Global Multimodal Features) that 1) leverages\nmulti-modality by cross-attention blocks between vision and LiDAR features, and\n2) includes a re-ranking stage that re-orders based on local feature matching\nthe top-k candidates retrieved using a global representation. Our experiments,\nparticularly on sequences captured on a planetary-analogous environment, show\nthat UMF outperforms significantly previous baselines in those challenging\naliased environments. 
Since our work aims to enhance the reliability of SLAM in\nall situations, we also explore its performance on the widely used RobotCar\ndataset, for broader applicability. Code and models are available at\nhttps://github.com/DLR-RM/UMF\n","authors":["Alberto García-Hernández","Riccardo Giubilato","Klaus H. Strobl","Javier Civera","Rudolph Triebel"],"pdf_url":"https://arxiv.org/pdf/2403.13395v1.pdf","comment":"Accepted submission to International Conference on Robotics and\n Automation (ICRA), 2024"},{"id":"http://arxiv.org/abs/2309.09947v2","updated":"2024-03-20T08:35:08Z","published":"2023-09-18T17:12:43Z","title":"End-to-end Learned Visual Odometry with Events and Frames","summary":" Visual Odometry (VO) is crucial for autonomous robotic navigation, especially\nin GPS-denied environments like planetary terrains. To improve robustness,\nrecent model-based VO systems have begun combining standard and event-based\ncameras. Event cameras excel in low-light and high-speed motion, while standard\ncameras provide dense and easier-to-track features, even in low-textured areas.\nHowever, the field of image- and event-based VO still predominantly relies on\nmodel-based methods and is yet to fully integrate recent image-only\nadvancements leveraging end-to-end learning-based architectures. Seamlessly\nintegrating the two modalities remains challenging due to their different\nnature, one asynchronous, the other not, limiting the potential for a more\neffective image- and event-based VO. We introduce RAMP-VO, the first end-to-end\nlearned image- and event-based VO system. It leverages novel Recurrent,\nAsynchronous, and Massively Parallel (RAMP) encoders capable of fusing\nasynchronous events with image data, providing 8x faster inference and 33% more\naccurate predictions than existing solutions. Despite being trained only in\nsimulation, RAMP-VO outperforms image- and event-based methods by 46% and 60%,\nrespectively, on traditional, real-world benchmarks as well as newly introduced\nApollo and Malapert landing sequences, paving the way for robust and\nasynchronous VO in space.\n","authors":["Roberto Pellerito","Marco Cannici","Daniel Gehrig","Joris Belhadj","Olivier Dubois-Matra","Massimo Casasco","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2309.09947v2.pdf","comment":"8 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2403.13392v1","updated":"2024-03-20T08:33:40Z","published":"2024-03-20T08:33:40Z","title":"Robust image segmentation model based on binary level set","summary":" In order to improve the robustness of traditional image segmentation models\nto noise, this paper models the illumination term in intensity inhomogeneity\nimages. Additionally, to enhance the model's robustness to noisy images, we\nincorporate the binary level set model into the proposed model. Compared to the\ntraditional level set, the binary level set eliminates the need for continuous\nreinitialization. Moreover, by introducing the variational operator GL, our\nmodel demonstrates better capability in segmenting noisy images. 
Finally, we\nemploy the three-step splitting operator method for solving, and the\neffectiveness of the proposed model is demonstrated on various images.\n","authors":["Wenqi Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.13392v1.pdf","comment":"SCI"},{"id":"http://arxiv.org/abs/2403.13378v1","updated":"2024-03-20T08:21:00Z","published":"2024-03-20T08:21:00Z","title":"IIDM: Image-to-Image Diffusion Model for Semantic Image Synthesis","summary":" Semantic image synthesis aims to generate high-quality images given semantic\nconditions, i.e. segmentation masks and style reference images. Existing\nmethods widely adopt generative adversarial networks (GANs). GANs take all\nconditional inputs and directly synthesize images in a single forward step. In\nthis paper, semantic image synthesis is treated as an image denoising task and\nis handled with a novel image-to-image diffusion model (IIDM). Specifically,\nthe style reference is first contaminated with random noise and then\nprogressively denoised by IIDM, guided by segmentation masks. Moreover, three\ntechniques, refinement, color-transfer and model ensembles, are proposed to\nfurther boost the generation quality. They are plug-in inference modules and do\nnot require additional training. Extensive experiments show that our IIDM\noutperforms existing state-of-the-art methods by clear margins. Further\nanalysis is provided via detailed demonstrations. We have implemented IIDM\nbased on the Jittor framework; code is available at\nhttps://github.com/ader47/jittor-jieke-semantic_images_synthesis.\n","authors":["Feng Liu"," Xiaobin-Chang"],"pdf_url":"https://arxiv.org/pdf/2403.13378v1.pdf","comment":"6 pages, 7 figures, accetped by CVMJ 2024"},{"id":"http://arxiv.org/abs/2403.06866v3","updated":"2024-03-20T08:19:52Z","published":"2024-03-11T16:21:50Z","title":"QUASAR: QUality and Aesthetics Scoring with Advanced Representations","summary":" This paper introduces a new data-driven, non-parametric method for image\nquality and aesthetics assessment, surpassing existing approaches and requiring\nno prompt engineering or fine-tuning. We eliminate the need for expressive\ntextual embeddings by proposing efficient image anchors in the data. Through\nextensive evaluations of 7 state-of-the-art self-supervised models, our method\ndemonstrates superior performance and robustness across various datasets and\nbenchmarks. Notably, it achieves high agreement with human assessments even\nwith limited data and shows high robustness to the nature of data and their\npre-processing pipeline. Our contributions offer a streamlined solution for\nassessment of images while providing insights into the perception of visual\ninformation.\n","authors":["Sergey Kastryulin","Denis Prokopenko","Artem Babenko","Dmitry V. Dylov"],"pdf_url":"https://arxiv.org/pdf/2403.06866v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13376v1","updated":"2024-03-20T08:15:34Z","published":"2024-03-20T08:15:34Z","title":"Correlation Clustering of Organoid Images","summary":" In biological and medical research, scientists now routinely acquire\nmicroscopy images of hundreds of morphologically heterogeneous organoids and\nare then faced with the task of finding patterns in the image collection, i.e.,\nsubsets of organoids that appear similar and potentially represent the same\nmorphological class. 
We adopt models and algorithms for correlating organoid\nimages, i.e., for quantifying the similarity in appearance and geometry of the\norganoids they depict, and for clustering organoid images by consolidating\nconflicting correlations. For correlating organoid images, we adopt and compare\ntwo alternatives, a partial quadratic assignment problem and a twin network.\nFor clustering organoid images, we employ the correlation clustering problem.\nEmpirically, we learn the parameters of these models, infer a clustering of\norganoid images, and quantify the accuracy of the inferred clusters, with\nrespect to a training set and a test set we contribute of state-of-the-art\nlight microscopy images of organoids clustered manually by biologists.\n","authors":["Jannik Presberger","Rashmiparvathi Keshara","David Stein","Yung Hae Kim","Anne Grapin-Botton","Bjoern Andres"],"pdf_url":"https://arxiv.org/pdf/2403.13376v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2403.13375v1","updated":"2024-03-20T08:15:18Z","published":"2024-03-20T08:15:18Z","title":"Few-shot Oriented Object Detection with Memorable Contrastive Learning\n in Remote Sensing Images","summary":" Few-shot object detection (FSOD) has garnered significant research attention\nin the field of remote sensing due to its ability to reduce the dependency on\nlarge amounts of annotated data. However, two challenges persist in this area:\n(1) axis-aligned proposals, which can result in misalignment for arbitrarily\noriented objects, and (2) the scarcity of annotated data still limits the\nperformance for unseen object categories. To address these issues, we propose a\nnovel FSOD method for remote sensing images called Few-shot Oriented object\ndetection with Memorable Contrastive learning (FOMC). Specifically, we employ\noriented bounding boxes instead of traditional horizontal bounding boxes to\nlearn a better feature representation for arbitrary-oriented aerial objects,\nleading to enhanced detection performance. To the best of our knowledge, we are\nthe first to address oriented object detection in the few-shot setting for\nremote sensing images. To address the challenging issue of object\nmisclassification, we introduce a supervised contrastive learning module with a\ndynamically updated memory bank. This module enables the use of large batches\nof negative samples and enhances the model's capability to learn discriminative\nfeatures for unseen classes. We conduct comprehensive experiments on the DOTA\nand HRSC2016 datasets, and our model achieves state-of-the-art performance on\nthe few-shot oriented object detection task. Code and pretrained models will be\nreleased.\n","authors":["Jiawei Zhou","Wuzhou Li","Yi Cao","Hongtao Cai","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2403.13375v1.pdf","comment":"13 pages, 8 tables, 10 figures"},{"id":"http://arxiv.org/abs/2403.13370v1","updated":"2024-03-20T08:04:00Z","published":"2024-03-20T08:04:00Z","title":"Counting Network for Learning from Majority Label","summary":" The paper proposes a novel problem in multi-class Multiple-Instance Learning\n(MIL) called Learning from the Majority Label (LML). In LML, the majority class\nof instances in a bag is assigned as the bag's label. LML aims to classify\ninstances using bag-level majority classes. This problem is valuable in various\napplications. 
Existing MIL methods are unsuitable for LML due to aggregating\nconfidences, which may lead to inconsistency between the bag-level label and\nthe label obtained by counting the number of instances for each class. This may\nlead to incorrect instance-level classification. We propose a counting network\ntrained to produce the bag-level majority labels estimated by counting the\nnumber of instances for each class. This led to the consistency of the majority\nclass between the network outputs and one obtained by counting the number of\ninstances. Experimental results show that our counting network outperforms\nconventional MIL methods on four datasets The code is publicly available at\nhttps://github.com/Shiku-Kaito/Counting-Network-for-Learning-from-Majority-Label.\n","authors":["Kaito Shiku","Shinnosuke Matsuo","Daiki Suehiro","Ryoma Bise"],"pdf_url":"https://arxiv.org/pdf/2403.13370v1.pdf","comment":"5 pages, 4 figures, Accepted in ICASSP 2024"},{"id":"http://arxiv.org/abs/2403.12483v2","updated":"2024-03-20T07:56:29Z","published":"2024-03-19T06:40:06Z","title":"A Hybrid Transformer-Sequencer approach for Age and Gender\n classification from in-wild facial images","summary":" The advancements in computer vision and image processing techniques have led\nto emergence of new application in the domain of visual surveillance, targeted\nadvertisement, content-based searching, and human-computer interaction etc. Out\nof the various techniques in computer vision, face analysis, in particular, has\ngained much attention. Several previous studies have tried to explore different\napplications of facial feature processing for a variety of tasks, including age\nand gender classification. However, despite several previous studies having\nexplored the problem, the age and gender classification of in-wild human faces\nis still far from the achieving the desired levels of accuracy required for\nreal-world applications. This paper, therefore, attempts to bridge this gap by\nproposing a hybrid model that combines self-attention and BiLSTM approaches for\nage and gender classification problems. The proposed models performance is\ncompared with several state-of-the-art model proposed so far. An improvement of\napproximately 10percent and 6percent over the state-of-the-art implementations\nfor age and gender classification, respectively, are noted for the proposed\nmodel. The proposed model is thus found to achieve superior performance and is\nfound to provide a more generalized learning. The model can, therefore, be\napplied as a core classification component in various image processing and\ncomputer vision problems.\n","authors":["Aakash Singh","Vivek Kumar Singh"],"pdf_url":"https://arxiv.org/pdf/2403.12483v2.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2403.13365v1","updated":"2024-03-20T07:48:32Z","published":"2024-03-20T07:48:32Z","title":"ManiPose: A Comprehensive Benchmark for Pose-aware Object Manipulation\n in Robotics","summary":" Robotic manipulation in everyday scenarios, especially in unstructured\nenvironments, requires skills in pose-aware object manipulation (POM), which\nadapts robots' grasping and handling according to an object's 6D pose.\nRecognizing an object's position and orientation is crucial for effective\nmanipulation. For example, if a mug is lying on its side, it's more effective\nto grasp it by the rim rather than the handle. 
Despite its importance, research\nin POM skills remains limited, because learning manipulation skills requires\npose-varying simulation environments and datasets. This paper introduces\nManiPose, a pioneering benchmark designed to advance the study of pose-varying\nmanipulation tasks. ManiPose encompasses: 1) Simulation environments for POM\nfeature tasks ranging from 6D pose-specific pick-and-place of single objects to\ncluttered scenes, further including interactions with articulated objects. 2) A\ncomprehensive dataset featuring geometrically consistent and\nmanipulation-oriented 6D pose labels for 2936 real-world scanned rigid objects\nand 100 articulated objects across 59 categories. 3) A baseline for POM,\nleveraging the inferencing abilities of LLM (e.g., ChatGPT) to analyze the\nrelationship between 6D pose and task-specific requirements, offers enhanced\npose-aware grasp prediction and motion planning capabilities. Our benchmark\ndemonstrates notable advancements in pose estimation, pose-aware manipulation,\nand real-robot skill transfer, setting new standards for POM research. We will\nopen-source the ManiPose benchmark with the final version paper, inviting the\ncommunity to engage with our resources, available at our\nwebsite:https://sites.google.com/view/manipose.\n","authors":["Qiaojun Yu","Ce Hao","Junbo Wang","Wenhai Liu","Liu Liu","Yao Mu","Yang You","Hengxu Yan","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2403.13365v1.pdf","comment":"8 pages, 7 figures, submitted to 2024 IEEE/RSJ International\n Conference on Intelligent Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2312.07920v3","updated":"2024-03-20T07:36:27Z","published":"2023-12-13T06:30:51Z","title":"DrivingGaussian: Composite Gaussian Splatting for Surrounding Dynamic\n Autonomous Driving Scenes","summary":" We present DrivingGaussian, an efficient and effective framework for\nsurrounding dynamic autonomous driving scenes. For complex scenes with moving\nobjects, we first sequentially and progressively model the static background of\nthe entire scene with incremental static 3D Gaussians. We then leverage a\ncomposite dynamic Gaussian graph to handle multiple moving objects,\nindividually reconstructing each object and restoring their accurate positions\nand occlusion relationships within the scene. We further use a LiDAR prior for\nGaussian Splatting to reconstruct scenes with greater details and maintain\npanoramic consistency. DrivingGaussian outperforms existing methods in dynamic\ndriving scene reconstruction and enables photorealistic surround-view synthesis\nwith high-fidelity and multi-camera consistency. Our project page is at:\nhttps://github.com/VDIGPKU/DrivingGaussian.\n","authors":["Xiaoyu Zhou","Zhiwei Lin","Xiaojun Shan","Yongtao Wang","Deqing Sun","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2312.07920v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13352v1","updated":"2024-03-20T07:31:07Z","published":"2024-03-20T07:31:07Z","title":"AGFSync: Leveraging AI-Generated Feedback for Preference Optimization in\n Text-to-Image Generation","summary":" Text-to-Image (T2I) diffusion models have achieved remarkable success in\nimage generation. Despite their progress, challenges remain in both\nprompt-following ability, image quality and lack of high-quality datasets,\nwhich are essential for refining these models. 
As acquiring labeled data is\ncostly, we introduce AGFSync, a framework that enhances T2I diffusion models\nthrough Direct Preference Optimization (DPO) in a fully AI-driven approach.\nAGFSync utilizes Vision-Language Models (VLM) to assess image quality across\nstyle, coherence, and aesthetics, generating feedback data within an AI-driven\nloop. By applying AGFSync to leading T2I models such as SD v1.4, v1.5, and\nSDXL, our extensive experiments on the TIFA dataset demonstrate notable\nimprovements in VQA scores, aesthetic evaluations, and performance on the HPSv2\nbenchmark, consistently outperforming the base models. AGFSync's method of\nrefining T2I diffusion models paves the way for scalable alignment techniques.\n","authors":["Jingkun An","Yinghao Zhu","Zongjian Li","Haoran Feng","Bohua Chen","Yemin Shi","Chengwei Pan"],"pdf_url":"https://arxiv.org/pdf/2403.13352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13351v1","updated":"2024-03-20T07:25:24Z","published":"2024-03-20T07:25:24Z","title":"OrthCaps: An Orthogonal CapsNet with Sparse Attention Routing and\n Pruning","summary":" Redundancy is a persistent challenge in Capsule Networks (CapsNet),leading to\nhigh computational costs and parameter counts. Although previous works have\nintroduced pruning after the initial capsule layer, dynamic routing's fully\nconnected nature and non-orthogonal weight matrices reintroduce redundancy in\ndeeper layers. Besides, dynamic routing requires iterating to converge, further\nincreasing computational demands. In this paper, we propose an Orthogonal\nCapsule Network (OrthCaps) to reduce redundancy, improve routing performance\nand decrease parameter counts. Firstly, an efficient pruned capsule layer is\nintroduced to discard redundant capsules. Secondly, dynamic routing is replaced\nwith orthogonal sparse attention routing, eliminating the need for iterations\nand fully connected structures. Lastly, weight matrices during routing are\northogonalized to sustain low capsule similarity, which is the first approach\nto introduce orthogonality into CapsNet as far as we know. Our experiments on\nbaseline datasets affirm the efficiency and robustness of OrthCaps in\nclassification tasks, in which ablation studies validate the criticality of\neach component. Remarkably, OrthCaps-Shallow outperforms other Capsule Network\nbenchmarks on four datasets, utilizing only 110k parameters, which is a mere\n1.25% of a standard Capsule Network's total. To the best of our knowledge, it\nachieves the smallest parameter count among existing Capsule Networks.\nSimilarly, OrthCaps-Deep demonstrates competitive performance across four\ndatasets, utilizing only 1.2% of the parameters required by its counterparts.\n","authors":["Xinyu Geng","Jiaming Wang","Jiawei Gong","Yuerong Xue","Jun Xu","Fanglin Chen","Xiaolin Huang"],"pdf_url":"https://arxiv.org/pdf/2403.13351v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2403.13349v1","updated":"2024-03-20T07:21:37Z","published":"2024-03-20T07:21:37Z","title":"Hierarchical Gaussian Mixture Normalizing Flow Modeling for Unified\n Anomaly Detection","summary":" Unified anomaly detection (AD) is one of the most challenges for anomaly\ndetection, where one unified model is trained with normal samples from multiple\nclasses with the objective to detect anomalies in these classes. 
For such a\nchallenging task, popular normalizing flow (NF) based AD methods may fall into\na \"homogeneous mapping\" issue,where the NF-based AD models are biased to\ngenerate similar latent representations for both normal and abnormal features,\nand thereby lead to a high missing rate of anomalies. In this paper, we propose\na novel Hierarchical Gaussian mixture normalizing flow modeling method for\naccomplishing unified Anomaly Detection, which we call HGAD. Our HGAD consists\nof two key components: inter-class Gaussian mixture modeling and intra-class\nmixed class centers learning. Compared to the previous NF-based AD methods, the\nhierarchical Gaussian mixture modeling approach can bring stronger\nrepresentation capability to the latent space of normalizing flows, so that\neven complex multi-class distribution can be well represented and learned in\nthe latent space. In this way, we can avoid mapping different class\ndistributions into the same single Gaussian prior, thus effectively avoiding or\nmitigating the \"homogeneous mapping\" issue. We further indicate that the more\ndistinguishable different class centers, the more conducive to avoiding the\nbias issue. Thus, we further propose a mutual information maximization loss for\nbetter structuring the latent feature space. We evaluate our method on four\nreal-world AD benchmarks, where we can significantly improve the previous\nNF-based AD methods and also outperform the SOTA unified AD methods.\n","authors":["Xincheng Yao","Ruoqi Li","Zefeng Qian","Lu Wang","Chongyang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.13349v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2307.10711v3","updated":"2024-03-20T07:17:19Z","published":"2023-07-20T09:06:21Z","title":"AdjointDPM: Adjoint Sensitivity Method for Gradient Backpropagation of\n Diffusion Probabilistic Models","summary":" Existing customization methods require access to multiple reference examples\nto align pre-trained diffusion probabilistic models (DPMs) with user-provided\nconcepts. This paper aims to address the challenge of DPM customization when\nthe only available supervision is a differentiable metric defined on the\ngenerated contents. Since the sampling procedure of DPMs involves recursive\ncalls to the denoising UNet, na\\\"ive gradient backpropagation requires storing\nthe intermediate states of all iterations, resulting in extremely high memory\nconsumption. To overcome this issue, we propose a novel method AdjointDPM,\nwhich first generates new samples from diffusion models by solving the\ncorresponding probability-flow ODEs. It then uses the adjoint sensitivity\nmethod to backpropagate the gradients of the loss to the models' parameters\n(including conditioning signals, network weights, and initial noises) by\nsolving another augmented ODE. To reduce numerical errors in both the forward\ngeneration and gradient backpropagation processes, we further reparameterize\nthe probability-flow ODE and augmented ODE as simple non-stiff ODEs using\nexponential integration. Finally, we demonstrate the effectiveness of\nAdjointDPM on three interesting tasks: converting visual effects into\nidentification text embeddings, finetuning DPMs for specific types of\nstylization, and optimizing initial noise to generate adversarial samples for\nsecurity auditing.\n","authors":["Jiachun Pan","Jun Hao Liew","Vincent Y. F. 
Tan","Jiashi Feng","Hanshu Yan"],"pdf_url":"https://arxiv.org/pdf/2307.10711v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13347v1","updated":"2024-03-20T07:15:22Z","published":"2024-03-20T07:15:22Z","title":"vid-TLDR: Training Free Token merging for Light-weight Video Transformer","summary":" Video Transformers have become the prevalent solution for various video\ndownstream tasks with superior expressive power and flexibility. However, these\nvideo transformers suffer from heavy computational costs induced by the massive\nnumber of tokens across the entire video frames, which has been the major\nbarrier to training the model. Further, the patches irrelevant to the main\ncontents, e.g., backgrounds, degrade the generalization performance of models.\nTo tackle these issues, we propose training free token merging for lightweight\nvideo Transformer (vid-TLDR) that aims to enhance the efficiency of video\nTransformers by merging the background tokens without additional training. For\nvid-TLDR, we introduce a novel approach to capture the salient regions in\nvideos only with the attention map. Further, we introduce the saliency-aware\ntoken merging strategy by dropping the background tokens and sharpening the\nobject scores. Our experiments show that vid-TLDR significantly mitigates the\ncomputational complexity of video Transformers while achieving competitive\nperformance compared to the base model without vid-TLDR. Code is available at\nhttps://github.com/mlvlab/vid-TLDR.\n","authors":["Joonmyung Choi","Sanghyeok Lee","Jaewon Chu","Minhyuk Choi","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2403.13347v1.pdf","comment":"Conference on Computer Vision and Pattern Recognition (CVPR), 2024"},{"id":"http://arxiv.org/abs/2403.10099v2","updated":"2024-03-20T07:12:12Z","published":"2024-03-15T08:44:56Z","title":"KP-RED: Exploiting Semantic Keypoints for Joint 3D Shape Retrieval and\n Deformation","summary":" In this paper, we present KP-RED, a unified KeyPoint-driven REtrieval and\nDeformation framework that takes object scans as input and jointly retrieves\nand deforms the most geometrically similar CAD models from a pre-processed\ndatabase to tightly match the target. Unlike existing dense matching based\nmethods that typically struggle with noisy partial scans, we propose to\nleverage category-consistent sparse keypoints to naturally handle both full and\npartial object scans. Specifically, we first employ a lightweight retrieval\nmodule to establish a keypoint-based embedding space, measuring the similarity\namong objects by dynamically aggregating deformation-aware local-global\nfeatures around extracted keypoints. Objects that are close in the embedding\nspace are considered similar in geometry. Then we introduce the neural\ncage-based deformation module that estimates the influence vector of each\nkeypoint upon cage vertices inside its local support region to control the\ndeformation of the retrieved shape. Extensive experiments on the synthetic\ndataset PartNet and the real-world dataset Scan2CAD demonstrate that KP-RED\nsurpasses existing state-of-the-art approaches by a large margin. 
Codes and\ntrained models will be released in https://github.com/lolrudy/KP-RED.\n","authors":["Ruida Zhang","Chenyangguang Zhang","Yan Di","Fabian Manhardt","Xingyu Liu","Federico Tombari","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2403.10099v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2402.08359v2","updated":"2024-03-20T07:05:55Z","published":"2024-02-13T10:40:10Z","title":"Learning to Produce Semi-dense Correspondences for Visual Localization","summary":" This study addresses the challenge of performing visual localization in\ndemanding conditions such as night-time scenarios, adverse weather, and\nseasonal changes. While many prior studies have focused on improving\nimage-matching performance to facilitate reliable dense keypoint matching\nbetween images, existing methods often heavily rely on predefined feature\npoints on a reconstructed 3D model. Consequently, they tend to overlook\nunobserved keypoints during the matching process. Therefore, dense keypoint\nmatches are not fully exploited, leading to a notable reduction in accuracy,\nparticularly in noisy scenes. To tackle this issue, we propose a novel\nlocalization method that extracts reliable semi-dense 2D-3D matching points\nbased on dense keypoint matches. This approach involves regressing semi-dense\n2D keypoints into 3D scene coordinates using a point inference network. The\nnetwork utilizes both geometric and visual cues to effectively infer 3D\ncoordinates for unobserved keypoints from the observed ones. The abundance of\nmatching information significantly enhances the accuracy of camera pose\nestimation, even in scenarios involving noisy or sparse 3D models.\nComprehensive evaluations demonstrate that the proposed method outperforms\nother methods in challenging scenes and achieves competitive results in\nlarge-scale visual localization benchmarks. The code will be available.\n","authors":["Khang Truong Giang","Soohwan Song","Sungho Jo"],"pdf_url":"https://arxiv.org/pdf/2402.08359v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2312.06731v3","updated":"2024-03-20T07:00:39Z","published":"2023-12-11T09:44:41Z","title":"Genixer: Empowering Multimodal Large Language Models as a Powerful Data\n Generator","summary":" Instruction tuning data is essential for training the Multimodal Large\nLanguage Models (MLLMs). However, the creation of high-quality instruction\ntuning data presents significant challenges. Prior methods that depended on\nGPT-4 for data generation were not only costly but also lacked satisfactory\nperformance in complex tasks (i.e., grounding-based reasoning tasks). To\naddress these issues, we developed an innovative data generation pipeline,\nGenixer, to generate various high-quality instruction tuning data, including\nnine representative tasks, e.g., Common VQA, REC, REG, and PointQ.\nSpecifically, Genixer provides a unified solution with four key steps for\nalleviating the difficulty of data generation: (i) instruction data collection,\n(ii) instruction template design, (iii) empowering MLLM, and (iv) data\ngeneration and filtering. Subsequently, the superior qualitative results of our\nGenixer demonstrate that current MLLMs have a strong potential to evolve into\npowerful data generators. 
Additionally, to validate the efficacy of generated\ndata quantitatively, we add the instruction tuning data produced by Genixer\ninto the training of two representative MLLMs and observe the consistent\nimprovements on various VQA tasks and multimodal benchmarks.\n","authors":["Henry Hengyuan Zhao","Pan Zhou","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.06731v3.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2403.13343v1","updated":"2024-03-20T07:00:03Z","published":"2024-03-20T07:00:03Z","title":"TiBiX: Leveraging Temporal Information for Bidirectional X-ray and\n Report Generation","summary":" With the emergence of vision language models in the medical imaging domain,\nnumerous studies have focused on two dominant research activities: (1) report\ngeneration from Chest X-rays (CXR), and (2) synthetic scan generation from text\nor reports. Despite some research incorporating multi-view CXRs into the\ngenerative process, prior patient scans and reports have been generally\ndisregarded. This can inadvertently lead to the leaving out of important\nmedical information, thus affecting generation quality. To address this, we\npropose TiBiX: Leveraging Temporal information for Bidirectional X-ray and\nReport Generation. Considering previous scans, our approach facilitates\nbidirectional generation, primarily addressing two challenging problems: (1)\ngenerating the current image from the previous image and current report and (2)\ngenerating the current report based on both the previous and current images.\nMoreover, we extract and release a curated temporal benchmark dataset derived\nfrom the MIMIC-CXR dataset, which focuses on temporal data. Our comprehensive\nexperiments and ablation studies explore the merits of incorporating prior CXRs\nand achieve state-of-the-art (SOTA) results on the report generation task.\nFurthermore, we attain on-par performance with SOTA image generation efforts,\nthus serving as a new baseline in longitudinal bidirectional CXR-to-report\ngeneration. The code is available at https://github.com/BioMedIA-MBZUAI/TiBiX.\n","authors":["Santosh Sanjeev","Fadillah Adamsyah Maani","Arsen Abzhanov","Vijay Ram Papineni","Ibrahim Almakky","Bartłomiej W. Papież","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.13343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16244v3","updated":"2024-03-20T06:50:19Z","published":"2023-12-25T11:39:00Z","title":"Modality-missing RGBT Tracking: Invertible Prompt Learning and\n High-quality Benchmarks","summary":" Current RGBT tracking research relies on the complete multi-modal input, but\nmodal information might miss due to some factors such as thermal sensor\nself-calibration and data transmission error, called modality-missing challenge\nin this work. To address this challenge, we propose a novel invertible prompt\nlearning approach, which integrates the content-preserving prompts into a\nwell-trained tracking model to adapt to various modality-missing scenarios, for\nrobust RGBT tracking. Given one modality-missing scenario, we propose to\nutilize the available modality to generate the prompt of the missing modality\nto adapt to RGBT tracking model. However, the cross-modality gap between\navailable and missing modalities usually causes semantic distortion and\ninformation loss in prompt generation. To handle this issue, we design the\ninvertible prompter by incorporating the full reconstruction of the input\navailable modality from the generated prompt. 
To provide a comprehensive\nevaluation platform, we construct several high-quality benchmark datasets, in\nwhich various modality-missing scenarios are considered to simulate real-world\nchallenges. Extensive experiments on three modality-missing benchmark datasets\nshow that our method achieves significant performance improvements compared\nwith state-of-the-art methods. We have released the code and simulation\ndatasets at:\n\\href{https://github.com/Alexadlu/Modality-missing-RGBT-Tracking.git}{https://github.com/Alexadlu/Modality-missing-RGBT-Tracking.git}.\n","authors":["Andong Lu","Jiacong Zhao","Chenglong Li","Jin Tang","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2312.16244v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13341v1","updated":"2024-03-20T06:48:48Z","published":"2024-03-20T06:48:48Z","title":"FissionFusion: Fast Geometric Generation and Hierarchical Souping for\n Medical Image Analysis","summary":" The scarcity of well-annotated medical datasets requires leveraging transfer\nlearning from broader datasets like ImageNet or pre-trained models like CLIP.\nModel soups averages multiple fine-tuned models aiming to improve performance\non In-Domain (ID) tasks and enhance robustness against Out-of-Distribution\n(OOD) datasets. However, applying these methods to the medical imaging domain\nfaces challenges and results in suboptimal performance. This is primarily due\nto differences in error surface characteristics that stem from data\ncomplexities such as heterogeneity, domain shift, class imbalance, and\ndistributional shifts between training and testing phases. To address this\nissue, we propose a hierarchical merging approach that involves local and\nglobal aggregation of models at various levels based on models' hyperparameter\nconfigurations. Furthermore, to alleviate the need for training a large number\nof models in the hyperparameter search, we introduce a computationally\nefficient method using a cyclical learning rate scheduler to produce multiple\nmodels for aggregation in the weight space. Our method demonstrates significant\nimprovements over the model souping approach across multiple datasets (around\n6% gain in HAM10000 and CheXpert datasets) while maintaining low computational\ncosts for model generation and selection. Moreover, we achieve better results\non OOD datasets than model soups. The code is available at\nhttps://github.com/BioMedIA-MBZUAI/FissionFusion.\n","authors":["Santosh Sanjeev","Nuren Zhaksylyk","Ibrahim Almakky","Anees Ur Rehman Hashmi","Mohammad Areeb Qazi","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.13341v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13338v1","updated":"2024-03-20T06:46:01Z","published":"2024-03-20T06:46:01Z","title":"Adaptive Critical Subgraph Mining for Cognitive Impairment Conversion\n Prediction with T1-MRI-based Brain Network","summary":" Prediction the conversion to early-stage dementia is critical for mitigating\nits progression but remains challenging due to subtle cognitive impairments and\nstructural brain changes. Traditional T1-weighted magnetic resonance imaging\n(T1-MRI) research focus on identifying brain atrophy regions but often fails to\naddress the intricate connectivity between them. This limitation underscores\nthe necessity of focuing on inter-regional connectivity for a comprehensive\nunderstand of the brain's complex network. 
Moreover, there is a pressing demand\nfor methods that adaptively preserve and extract critical information,\nparticularly specialized subgraph mining techniques for brain networks. These\nare essential for developing high-quality feature representations that reveal\ncritical spatial impacts of structural brain changes and its topology. In this\npaper, we propose Brain-SubGNN, a novel graph representation network to mine\nand enhance critical subgraphs based on T1-MRI. This network provides a\nsubgraph-level interpretation, enhancing interpretability and insights for\ngraph analysis. The process begins by extracting node features and a\ncorrelation matrix between nodes to construct a task-oriented brain network.\nBrain-SubGNN then adaptively identifies and enhances critical subgraphs,\ncapturing both loop and neighbor subgraphs. This method reflects the loop\ntopology and local changes, indicative of long-range connections, and maintains\nlocal and global brain attributes. Extensive experiments validate the\neffectiveness and advantages of Brain-SubGNN, demonstrating its potential as a\npowerful tool for understanding and diagnosing early-stage dementia. Source\ncode is available at https://github.com/Leng-10/Brain-SubGNN.\n","authors":["Yilin Leng","Wenju Cui","Bai Chen","Xi Jiang","Shuangqing Chen","Jian Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.13338v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2403.13337v1","updated":"2024-03-20T06:44:26Z","published":"2024-03-20T06:44:26Z","title":"Learning Novel View Synthesis from Heterogeneous Low-light Captures","summary":" Neural radiance field has achieved fundamental success in novel view\nsynthesis from input views with the same brightness level captured under fixed\nnormal lighting. Unfortunately, synthesizing novel views remains to be a\nchallenge for input views with heterogeneous brightness level captured under\nlow-light condition. The condition is pretty common in the real world. It\ncauses low-contrast images where details are concealed in the darkness and\ncamera sensor noise significantly degrades the image quality. To tackle this\nproblem, we propose to learn to decompose illumination, reflectance, and noise\nfrom input views according to that reflectance remains invariant across\nheterogeneous views. To cope with heterogeneous brightness and noise levels\nacross multi-views, we learn an illumination embedding and optimize a noise map\nindividually for each view. To allow intuitive editing of the illumination, we\ndesign an illumination adjustment module to enable either brightening or\ndarkening of the illumination component. Comprehensive experiments demonstrate\nthat this approach enables effective intrinsic decomposition for low-light\nmulti-view noisy images and achieves superior visual quality and numerical\nperformance for synthesizing novel views compared to state-of-the-art methods.\n","authors":["Quan Zheng","Hao Sun","Huiyao Xu","Fanjiang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.13337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01482v2","updated":"2024-03-20T06:38:15Z","published":"2024-03-03T11:24:16Z","title":"EAGLE: Eigen Aggregation Learning for Object-Centric Unsupervised\n Semantic Segmentation","summary":" Semantic segmentation has innately relied on extensive pixel-level annotated\ndata, leading to the emergence of unsupervised methodologies. 
Among them,\nleveraging self-supervised Vision Transformers for unsupervised semantic\nsegmentation (USS) has been making steady progress with expressive deep\nfeatures. Yet, for semantically segmenting images with complex objects, a\npredominant challenge remains: the lack of explicit object-level semantic\nencoding in patch-level features. This technical limitation often leads to\ninadequate segmentation of complex objects with diverse structures. To address\nthis gap, we present a novel approach, EAGLE, which emphasizes object-centric\nrepresentation learning for unsupervised semantic segmentation. Specifically,\nwe introduce EiCue, a spectral technique providing semantic and structural cues\nthrough an eigenbasis derived from the semantic similarity matrix of deep image\nfeatures and color affinity from an image. Further, by incorporating our\nobject-centric contrastive loss with EiCue, we guide our model to learn\nobject-level representations with intra- and inter-image object-feature\nconsistency, thereby enhancing semantic accuracy. Extensive experiments on\nCOCO-Stuff, Cityscapes, and Potsdam-3 datasets demonstrate the state-of-the-art\nUSS results of EAGLE with accurate and consistent semantic segmentation across\ncomplex scenes.\n","authors":["Chanyoung Kim","Woojung Han","Dayun Ju","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.01482v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03379v3","updated":"2024-03-20T06:33:20Z","published":"2024-01-07T03:35:04Z","title":"Towards Effective Multiple-in-One Image Restoration: A Sequential and\n Prompt Learning Strategy","summary":" While single task image restoration (IR) has achieved significant successes,\nit remains a challenging issue to train a single model which can tackle\nmultiple IR tasks. In this work, we investigate in-depth the multiple-in-one\n(MiO) IR problem, which comprises seven popular IR tasks. We point out that MiO\nIR faces two pivotal challenges: the optimization of diverse objectives and the\nadaptation to multiple tasks. To tackle these challenges, we present two simple\nyet effective strategies. The first strategy, referred to as sequential\nlearning, attempts to address how to optimize the diverse objectives, which\nguides the network to incrementally learn individual IR tasks in a sequential\nmanner rather than mixing them together. The second strategy, i.e., prompt\nlearning, attempts to address how to adapt to the different IR tasks, which\nassists the network to understand the specific task and improves the\ngeneralization ability. By evaluating on 19 test sets, we demonstrate that the\nsequential and prompt learning strategies can significantly enhance the MiO\nperformance of commonly used CNN and Transformer backbones. Our experiments\nalso reveal that the two strategies can supplement each other to learn better\ndegradation representations and enhance the model robustness. 
It is expected\nthat our proposed MiO IR formulation and strategies could facilitate the\nresearch on how to train IR models with higher generalization capabilities.\n","authors":["Xiangtao Kong","Chao Dong","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.03379v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13331v1","updated":"2024-03-20T06:22:37Z","published":"2024-03-20T06:22:37Z","title":"AMP: Autoregressive Motion Prediction Revisited with Next Token\n Prediction for Autonomous Driving","summary":" As an essential task in autonomous driving (AD), motion prediction aims to\npredict the future states of surround objects for navigation. One natural\nsolution is to estimate the position of other agents in a step-by-step manner\nwhere each predicted time-step is conditioned on both observed time-steps and\npreviously predicted time-steps, i.e., autoregressive prediction. Pioneering\nworks like SocialLSTM and MFP design their decoders based on this intuition.\nHowever, almost all state-of-the-art works assume that all predicted time-steps\nare independent conditioned on observed time-steps, where they use a single\nlinear layer to generate positions of all time-steps simultaneously. They\ndominate most motion prediction leaderboards due to the simplicity of training\nMLPs compared to autoregressive networks.\n In this paper, we introduce the GPT style next token prediction into motion\nforecasting. In this way, the input and output could be represented in a\nunified space and thus the autoregressive prediction becomes more feasible.\nHowever, different from language data which is composed of homogeneous units\n-words, the elements in the driving scene could have complex spatial-temporal\nand semantic relations. To this end, we propose to adopt three factorized\nattention modules with different neighbors for information aggregation and\ndifferent position encoding styles to capture their relations, e.g., encoding\nthe transformation between coordinate systems for spatial relativity while\nadopting RoPE for temporal relativity. Empirically, by equipping with the\naforementioned tailored designs, the proposed method achieves state-of-the-art\nperformance in the Waymo Open Motion and Waymo Interaction datasets. Notably,\nAMP outperforms other recent autoregressive motion prediction methods: MotionLM\nand StateTransformer, which demonstrates the effectiveness of the proposed\ndesigns.\n","authors":["Xiaosong Jia","Shaoshuai Shi","Zijun Chen","Li Jiang","Wenlong Liao","Tao He","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2403.13331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13330v1","updated":"2024-03-20T06:20:54Z","published":"2024-03-20T06:20:54Z","title":"Efficient scene text image super-resolution with semantic guidance","summary":" Scene text image super-resolution has significantly improved the accuracy of\nscene text recognition. However, many existing methods emphasize performance\nover efficiency and ignore the practical need for lightweight solutions in\ndeployment scenarios. Faced with the issues, our work proposes an efficient\nframework called SGENet to facilitate deployment on resource-limited platforms.\nSGENet contains two branches: super-resolution branch and semantic guidance\nbranch. We apply a lightweight pre-trained recognizer as a semantic extractor\nto enhance the understanding of text information. 
Meanwhile, we design the\nvisual-semantic alignment module to achieve bidirectional alignment between\nimage features and semantics, resulting in the generation of highquality prior\nguidance. We conduct extensive experiments on benchmark dataset, and the\nproposed SGENet achieves excellent performance with fewer computational costs.\nCode is available at https://github.com/SijieLiu518/SGENet\n","authors":["LeoWu TomyEnrique","Xiangcheng Du","Kangliang Liu","Han Yuan","Zhao Zhou","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2403.13330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13327v1","updated":"2024-03-20T06:19:41Z","published":"2024-03-20T06:19:41Z","title":"Gaussian Splatting on the Move: Blur and Rolling Shutter Compensation\n for Natural Camera Motion","summary":" High-quality scene reconstruction and novel view synthesis based on Gaussian\nSplatting (3DGS) typically require steady, high-quality photographs, often\nimpractical to capture with handheld cameras. We present a method that adapts\nto camera motion and allows high-quality scene reconstruction with handheld\nvideo data suffering from motion blur and rolling shutter distortion. Our\napproach is based on detailed modelling of the physical image formation process\nand utilizes velocities estimated using visual-inertial odometry (VIO). Camera\nposes are considered non-static during the exposure time of a single image\nframe and camera poses are further optimized in the reconstruction process. We\nformulate a differentiable rendering pipeline that leverages screen space\napproximation to efficiently incorporate rolling-shutter and motion blur\neffects into the 3DGS framework. Our results with both synthetic and real data\ndemonstrate superior performance in mitigating camera motion over existing\nmethods, thereby advancing 3DGS in naturalistic settings.\n","authors":["Otto Seiskari","Jerry Ylilammi","Valtteri Kaatrasalo","Pekka Rantalankila","Matias Turkulainen","Juho Kannala","Esa Rahtu","Arno Solin"],"pdf_url":"https://arxiv.org/pdf/2403.13327v1.pdf","comment":"Source code available at https://github.com/SpectacularAI/3dgs-deblur"},{"id":"http://arxiv.org/abs/2403.13324v1","updated":"2024-03-20T06:04:05Z","published":"2024-03-20T06:04:05Z","title":"Out-of-Distribution Detection Using Peer-Class Generated by Large\n Language Model","summary":" Out-of-distribution (OOD) detection is a critical task to ensure the\nreliability and security of machine learning models deployed in real-world\napplications. Conventional methods for OOD detection that rely on single-modal\ninformation, often struggle to capture the rich variety of OOD instances. The\nprimary difficulty in OOD detection arises when an input image has numerous\nsimilarities to a particular class in the in-distribution (ID) dataset, e.g.,\nwolf to dog, causing the model to misclassify it. Nevertheless, it may be easy\nto distinguish these classes in the semantic domain. To this end, in this\npaper, a novel method called ODPC is proposed, in which specific prompts to\ngenerate OOD peer classes of ID semantics are designed by a large language\nmodel as an auxiliary modality to facilitate detection. Moreover, a contrastive\nloss based on OOD peer classes is devised to learn compact representations of\nID classes and improve the clarity of boundaries between different classes. 
The\nextensive experiments on five benchmark datasets show that the method we\npropose can yield state-of-the-art results.\n","authors":["K Huang","G Song","Hanwen Su","Jiyan Wang"],"pdf_url":"https://arxiv.org/pdf/2403.13324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13322v1","updated":"2024-03-20T06:00:53Z","published":"2024-03-20T06:00:53Z","title":"DD-RobustBench: An Adversarial Robustness Benchmark for Dataset\n Distillation","summary":" Dataset distillation is an advanced technique aimed at compressing datasets\ninto significantly smaller counterparts, while preserving formidable training\nperformance. Significant efforts have been devoted to promote evaluation\naccuracy under limited compression ratio while overlooked the robustness of\ndistilled dataset. In this work, we introduce a comprehensive benchmark that,\nto the best of our knowledge, is the most extensive to date for evaluating the\nadversarial robustness of distilled datasets in a unified way. Our benchmark\nsignificantly expands upon prior efforts by incorporating a wider range of\ndataset distillation methods, including the latest advancements such as TESLA\nand SRe2L, a diverse array of adversarial attack methods, and evaluations\nacross a broader and more extensive collection of datasets such as ImageNet-1K.\nMoreover, we assessed the robustness of these distilled datasets against\nrepresentative adversarial attack algorithms like PGD and AutoAttack, while\nexploring their resilience from a frequency perspective. We also discovered\nthat incorporating distilled data into the training batches of the original\ndataset can yield to improvement of robustness.\n","authors":["Yifan Wu","Jiawei Du","Ping Liu","Yuewei Lin","Wenqing Cheng","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2403.13322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13831v2","updated":"2024-03-20T05:55:56Z","published":"2023-11-23T07:25:31Z","title":"Posterior Distillation Sampling","summary":" We introduce Posterior Distillation Sampling (PDS), a novel optimization\nmethod for parametric image editing based on diffusion models. Existing\noptimization-based methods, which leverage the powerful 2D prior of diffusion\nmodels to handle various parametric images, have mainly focused on generation.\nUnlike generation, editing requires a balance between conforming to the target\nattribute and preserving the identity of the source content. Recent 2D image\nediting methods have achieved this balance by leveraging the stochastic latent\nencoded in the generative process of diffusion models. To extend the editing\ncapabilities of diffusion models shown in pixel space to parameter space, we\nreformulate the 2D image editing method into an optimization form named PDS.\nPDS matches the stochastic latents of the source and the target, enabling the\nsampling of targets in diverse parameter spaces that align with a desired\nattribute while maintaining the source's identity. We demonstrate that this\noptimization resembles running a generative process with the target attribute,\nbut aligning this process with the trajectory of the source's generative\nprocess. 
Extensive editing results in Neural Radiance Fields and Scalable\nVector Graphics representations demonstrate that PDS is capable of sampling\ntargets to fulfill the aforementioned balance across various parameter spaces.\n","authors":["Juil Koo","Chanho Park","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2311.13831v2.pdf","comment":"Project page: https://posterior-distillation-sampling.github.io/"},{"id":"http://arxiv.org/abs/2403.13319v1","updated":"2024-03-20T05:50:04Z","published":"2024-03-20T05:50:04Z","title":"HyperFusion: A Hypernetwork Approach to Multimodal Integration of\n Tabular and Medical Imaging Data for Predictive Modeling","summary":" The integration of diverse clinical modalities such as medical imaging and\nthe tabular data obtained by the patients' Electronic Health Records (EHRs) is\na crucial aspect of modern healthcare. The integrative analysis of multiple\nsources can provide a comprehensive understanding of a patient's condition and\ncan enhance diagnoses and treatment decisions. Deep Neural Networks (DNNs)\nconsistently showcase outstanding performance in a wide range of multimodal\ntasks in the medical domain. However, the complex endeavor of effectively\nmerging medical imaging with clinical, demographic and genetic information\nrepresented as numerical tabular data remains a highly active and ongoing\nresearch pursuit.\n We present a novel framework based on hypernetworks to fuse clinical imaging\nand tabular data by conditioning the image processing on the EHR's values and\nmeasurements. This approach aims to leverage the complementary information\npresent in these modalities to enhance the accuracy of various medical\napplications. We demonstrate the strength and the generality of our method on\ntwo different brain Magnetic Resonance Imaging (MRI) analysis tasks, namely,\nbrain age prediction conditioned by subject's sex, and multiclass Alzheimer's\nDisease (AD) classification conditioned by tabular data. We show that our\nframework outperforms both single-modality models and state-of-the-art\nMRI-tabular data fusion methods. The code, enclosed to this manuscript will be\nmade publicly available.\n","authors":["Daniel Duenias","Brennan Nichyporuk","Tal Arbel","Tammy Riklin Raviv"],"pdf_url":"https://arxiv.org/pdf/2403.13319v1.pdf","comment":"17 pages, 8 figures"},{"id":"http://arxiv.org/abs/2403.13315v1","updated":"2024-03-20T05:37:24Z","published":"2024-03-20T05:37:24Z","title":"PuzzleVQA: Diagnosing Multimodal Reasoning Challenges of Language Models\n with Abstract Visual Patterns","summary":" Large multimodal models extend the impressive capabilities of large language\nmodels by integrating multimodal understanding abilities. However, it is not\nclear how they can emulate the general intelligence and reasoning ability of\nhumans. As recognizing patterns and abstracting concepts are key to general\nintelligence, we introduce PuzzleVQA, a collection of puzzles based on abstract\npatterns. With this dataset, we evaluate large multimodal models with abstract\npatterns based on fundamental concepts, including colors, numbers, sizes, and\nshapes. Through our experiments on state-of-the-art large multimodal models, we\nfind that they are not able to generalize well to simple abstract patterns.\nNotably, even GPT-4V cannot solve more than half of the puzzles. 
To diagnose\nthe reasoning challenges in large multimodal models, we progressively guide the\nmodels with our ground truth reasoning explanations for visual perception,\ninductive reasoning, and deductive reasoning. Our systematic analysis finds\nthat the main bottlenecks of GPT-4V are weaker visual perception and inductive\nreasoning abilities. Through this work, we hope to shed light on the\nlimitations of large multimodal models and how they can better emulate human\ncognitive processes in the future (Our data and code will be released publicly\nat https://github.com/declare-lab/LLM-PuzzleTest).\n","authors":["Yew Ken Chia","Vernon Toh Yan Han","Deepanway Ghosal","Lidong Bing","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2403.13315v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13307v1","updated":"2024-03-20T05:11:10Z","published":"2024-03-20T05:11:10Z","title":"LaserHuman: Language-guided Scene-aware Human Motion Generation in Free\n Environment","summary":" Language-guided scene-aware human motion generation has great significance\nfor entertainment and robotics. In response to the limitations of existing\ndatasets, we introduce LaserHuman, a pioneering dataset engineered to\nrevolutionize Scene-Text-to-Motion research. LaserHuman stands out with its\ninclusion of genuine human motions within 3D environments, unbounded free-form\nnatural language descriptions, a blend of indoor and outdoor scenarios, and\ndynamic, ever-changing scenes. Diverse modalities of capture data and rich\nannotations present great opportunities for the research of conditional motion\ngeneration, and can also facilitate the development of real-life applications.\nMoreover, to generate semantically consistent and physically plausible human\nmotions, we propose a multi-conditional diffusion model, which is simple but\neffective, achieving state-of-the-art performance on existing datasets.\n","authors":["Peishan Cong","Ziyi WangZhiyang Dou","Yiming Ren","Wei Yin","Kai Cheng","Yujing Sun","Xiaoxiao Long","Xinge Zhu","Yuexin Ma"],"pdf_url":"https://arxiv.org/pdf/2403.13307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16759v4","updated":"2024-03-20T05:00:06Z","published":"2023-05-26T09:21:56Z","title":"StyleHumanCLIP: Text-guided Garment Manipulation for StyleGAN-Human","summary":" This paper tackles text-guided control of StyleGAN for editing garments in\nfull-body human images. Existing StyleGAN-based methods suffer from handling\nthe rich diversity of garments and body shapes and poses. We propose a\nframework for text-guided full-body human image synthesis via an\nattention-based latent code mapper, which enables more disentangled control of\nStyleGAN than existing mappers. Our latent code mapper adopts an attention\nmechanism that adaptively manipulates individual latent codes on different\nStyleGAN layers under text guidance. In addition, we introduce feature-space\nmasking at inference time to avoid unwanted changes caused by text inputs. 
Our\nquantitative and qualitative evaluations reveal that our method can control\ngenerated images more faithfully to given texts than existing methods.\n","authors":["Takato Yoshikawa","Yuki Endo","Yoshihiro Kanamori"],"pdf_url":"https://arxiv.org/pdf/2305.16759v4.pdf","comment":"VISIAPP 2024, project page:\n https://www.cgg.cs.tsukuba.ac.jp/~yoshikawa/pub/style_human_clip/"},{"id":"http://arxiv.org/abs/2403.13304v1","updated":"2024-03-20T04:58:03Z","published":"2024-03-20T04:58:03Z","title":"DetDiffusion: Synergizing Generative and Perceptive Models for Enhanced\n Data Generation and Perception","summary":" Current perceptive models heavily depend on resource-intensive datasets,\nprompting the need for innovative solutions. Leveraging recent advances in\ndiffusion models, synthetic data, by constructing image inputs from various\nannotations, proves beneficial for downstream tasks. While prior methods have\nseparately addressed generative and perceptive models, DetDiffusion, for the\nfirst time, harmonizes both, tackling the challenges in generating effective\ndata for perceptive models. To enhance image generation with perceptive models,\nwe introduce perception-aware loss (P.A. loss) through segmentation, improving\nboth quality and controllability. To boost the performance of specific\nperceptive models, our method customizes data augmentation by extracting and\nutilizing perception-aware attribute (P.A. Attr) during generation.\nExperimental results from the object detection task highlight DetDiffusion's\nsuperior performance, establishing a new state-of-the-art in layout-guided\ngeneration. Furthermore, image syntheses from DetDiffusion can effectively\naugment training data, significantly enhancing downstream detection\nperformance.\n","authors":["Yibo Wang","Ruiyuan Gao","Kai Chen","Kaiqiang Zhou","Yingjie Cai","Lanqing Hong","Zhenguo Li","Lihui Jiang","Dit-Yan Yeung","Qiang Xu","Kai Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.13304v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12894v2","updated":"2024-03-20T04:56:03Z","published":"2024-03-19T16:46:29Z","title":"MEDBind: Unifying Language and Multimodal Medical Data Embeddings","summary":" Medical vision-language pretraining models (VLPM) have achieved remarkable\nprogress in fusing chest X-rays (CXR) with clinical texts, introducing\nimage-text data binding approaches that enable zero-shot learning and\ndownstream clinical tasks. However, the current landscape lacks the holistic\nintegration of additional medical modalities, such as electrocardiograms (ECG).\nWe present MEDBind (Medical Electronic patient recorD), which learns joint\nembeddings across CXR, ECG, and medical text. Using text data as the central\nanchor, MEDBind features tri-modality binding, delivering competitive\nperformance in top-K retrieval, zero-shot, and few-shot benchmarks against\nestablished VLPM, and the ability for CXR-to-ECG zero-shot classification and\nretrieval. This seamless integration is achieved through combination of\ncontrastive loss on modality-text pairs with our proposed contrastive loss\nfunction, Edge-Modality Contrastive Loss, fostering a cohesive embedding space\nfor CXR, ECG, and text. 
Finally, we demonstrate that MEDBind can improve\ndownstream tasks by directly integrating CXR and ECG embeddings into a\nlarge-language model for multimodal prompt tuning.\n","authors":["Yuan Gao","Sangwook Kim","David E Austin","Chris McIntosh"],"pdf_url":"https://arxiv.org/pdf/2403.12894v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12236v2","updated":"2024-03-20T04:47:38Z","published":"2023-03-21T23:43:58Z","title":"SALAD: Part-Level Latent Diffusion for 3D Shape Generation and\n Manipulation","summary":" We present a cascaded diffusion model based on a part-level implicit 3D\nrepresentation. Our model achieves state-of-the-art generation quality and also\nenables part-level shape editing and manipulation without any additional\ntraining in conditional setup. Diffusion models have demonstrated impressive\ncapabilities in data generation as well as zero-shot completion and editing via\na guided reverse process. Recent research on 3D diffusion models has focused on\nimproving their generation capabilities with various data representations,\nwhile the absence of structural information has limited their capability in\ncompletion and editing tasks. We thus propose our novel diffusion model using a\npart-level implicit representation. To effectively learn diffusion with\nhigh-dimensional embedding vectors of parts, we propose a cascaded framework,\nlearning diffusion first on a low-dimensional subspace encoding extrinsic\nparameters of parts and then on the other high-dimensional subspace encoding\nintrinsic attributes. In the experiments, we demonstrate the outperformance of\nour method compared with the previous ones both in generation and part-level\ncompletion and manipulation tasks.\n","authors":["Juil Koo","Seungwoo Yoo","Minh Hieu Nguyen","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2303.12236v2.pdf","comment":"Project page: https://salad3d.github.io"},{"id":"http://arxiv.org/abs/2403.13298v1","updated":"2024-03-20T04:47:13Z","published":"2024-03-20T04:47:13Z","title":"Rotary Position Embedding for Vision Transformer","summary":" Rotary Position Embedding (RoPE) performs remarkably on language models,\nespecially for length extrapolation of Transformers. However, the impacts of\nRoPE on computer vision domains have been underexplored, even though RoPE\nappears capable of enhancing Vision Transformer (ViT) performance in a way\nsimilar to the language domain. This study provides a comprehensive analysis of\nRoPE when applied to ViTs, utilizing practical implementations of RoPE for 2D\nvision data. The analysis reveals that RoPE demonstrates impressive\nextrapolation performance, i.e., maintaining precision while increasing image\nresolution at inference. It eventually leads to performance improvement for\nImageNet-1k, COCO detection, and ADE-20k segmentation. We believe this study\nprovides thorough guidelines to apply RoPE into ViT, promising improved\nbackbone performance with minimal extra computational overhead. Our code and\npre-trained models are available at https://github.com/naver-ai/rope-vit\n","authors":["Byeongho Heo","Song Park","Dongyoon Han","Sangdoo Yun"],"pdf_url":"https://arxiv.org/pdf/2403.13298v1.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.04302v2","updated":"2024-03-20T23:32:08Z","published":"2023-12-07T13:53:29Z","title":"Prompt Highlighter: Interactive Control for Multi-Modal LLMs","summary":" This study targets a critical aspect of multi-modal LLMs' (LLMs&VLMs)\ninference: explicit controllable text generation. 
Multi-modal LLMs empower\nmulti-modality understanding with the capability of semantic generation yet\nbring less explainability and heavier reliance on prompt contents due to their\nautoregressive generative nature. While manipulating prompt formats could\nimprove outputs, designing specific and precise prompts per task can be\nchallenging and ineffective. To tackle this issue, we introduce a novel\ninference method, Prompt Highlighter, which enables users to highlight specific\nprompt spans to interactively control the focus during generation. Motivated by\nthe classifier-free diffusion guidance, we form regular and unconditional\ncontext pairs based on highlighted tokens, demonstrating that the\nautoregressive generation in models can be guided in a classifier-free way.\nNotably, we find that, during inference, guiding the models with highlighted\ntokens through the attention weights leads to more desired outputs. Our\napproach is compatible with current LLMs and VLMs, achieving impressive\ncustomized generation results without training. Experiments confirm its\neffectiveness in focusing on input contexts and generating reliable content.\nWithout tuning on LLaVA-v1.5, our method secured 70.7 in the MMBench test and\n1552.5 in MME-perception. The code is available at:\nhttps://github.com/dvlab-research/Prompt-Highlighter/\n","authors":["Yuechen Zhang","Shengju Qian","Bohao Peng","Shu Liu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2312.04302v2.pdf","comment":"CVPR 2024; Project Page:\n https://julianjuaner.github.io/projects/PromptHighlighter"},{"id":"http://arxiv.org/abs/2403.08002v2","updated":"2024-03-20T23:31:22Z","published":"2024-03-12T18:12:02Z","title":"Training Small Multimodal Models to Bridge Biomedical Competency Gap: A\n Case Study in Radiology Imaging","summary":" The scaling laws and extraordinary performance of large foundation models\nmotivate the development and utilization of such large models in biomedicine.\nHowever, despite early promising results on some biomedical benchmarks, there\nare still major challenges that need to be addressed before these models can be\nused in real-world applications. Frontier models such as GPT-4V still have\nmajor competency gaps in multimodal capabilities for biomedical applications.\nMoreover, pragmatic issues such as access, cost, latency, and compliance make\nit hard for clinicians to use privately-hosted state-of-the-art large models\ndirectly on private patient data. In this paper, we explore training\nopen-source small multimodal models (SMMs) to bridge biomedical competency gaps\nfor unmet clinical needs. To maximize data efficiency, we adopt a modular\napproach by incorporating state-of-the-art pre-trained models for image and\ntext modalities, and focusing on training a lightweight adapter to ground each\nmodality to the text embedding space. We conduct a comprehensive study of this\napproach on radiology imaging. For training, we assemble a large dataset with\nover 1 million image-text pairs. For evaluation, we propose a clinically driven\nnovel approach using GPT-4 and demonstrate its parity with expert evaluation.\nWe also study grounding qualitatively using attention. For best practice, we\nconduct a systematic ablation study on various choices in data engineering and\nmultimodal training. The resulting LLaVA-Rad (7B) model attains\nstate-of-the-art results on radiology tasks such as report generation and\ncross-modal retrieval, even outperforming much larger models such as GPT-4V and\nMed-PaLM M (84B). 
LLaVA-Rad is fast and can be run on a single V100 GPU in\nprivate settings, offering a promising state-of-the-art tool for real-world\nclinical applications.\n","authors":["Juan Manuel Zambrano Chaves","Shih-Cheng Huang","Yanbo Xu","Hanwen Xu","Naoto Usuyama","Sheng Zhang","Fei Wang","Yujia Xie","Mahmoud Khademi","Ziyi Yang","Hany Awadalla","Julia Gong","Houdong Hu","Jianwei Yang","Chunyuan Li","Jianfeng Gao","Yu Gu","Cliff Wong","Mu Wei","Tristan Naumann","Muhao Chen","Matthew P. Lungren","Serena Yeung-Levy","Curtis P. Langlotz","Sheng Wang","Hoifung Poon"],"pdf_url":"https://arxiv.org/pdf/2403.08002v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14027v1","updated":"2024-03-20T22:52:34Z","published":"2024-03-20T22:52:34Z","title":"EcoSense: Energy-Efficient Intelligent Sensing for In-Shore Ship\n Detection through Edge-Cloud Collaboration","summary":" Detecting marine objects inshore presents challenges owing to algorithmic\nintricacies and complexities in system deployment. We propose a\ndifficulty-aware edge-cloud collaborative sensing system that splits the task\ninto object localization and fine-grained classification. Objects are\nclassified either at the edge or within the cloud, based on their estimated\ndifficulty. The framework comprises a low-power device-tailored front-end model\nfor object localization, classification, and difficulty estimation, along with\na transformer-graph convolutional network-based back-end model for fine-grained\nclassification. Our system demonstrates superior performance (mAP@0.5 +4.3%)\non widely used marine object detection datasets, significantly reducing both\ndata transmission volume (by 95.43%) and energy consumption (by 72.7%) at the\nsystem level. We validate the proposed system across various embedded system\nplatforms and in real-world scenarios involving drone deployment.\n","authors":["Wenjun Huang","Hanning Chen","Yang Ni","Arghavan Rezvani","Sanggeon Yun","Sungheon Jeon","Eric Pedley","Mohsen Imani"],"pdf_url":"https://arxiv.org/pdf/2403.14027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14003v1","updated":"2024-03-20T22:05:18Z","published":"2024-03-20T22:05:18Z","title":"Multi-Modal Hallucination Control by Visual Information Grounding","summary":" Generative Vision-Language Models (VLMs) are prone to generate\nplausible-sounding textual answers that, however, are not always grounded in\nthe input image. We investigate this phenomenon, usually referred to as\n\"hallucination\" and show that it stems from an excessive reliance on the\nlanguage prior. In particular, we show that as more tokens are generated, the\nreliance on the visual prompt decreases, and this behavior strongly correlates\nwith the emergence of hallucinations. To reduce hallucinations, we introduce\nMulti-Modal Mutual-Information Decoding (M3ID), a new sampling method for\nprompt amplification. M3ID amplifies the influence of the reference image over\nthe language prior, hence favoring the generation of tokens with higher mutual\ninformation with the visual prompt. M3ID can be applied to any pre-trained\nautoregressive VLM at inference time without necessitating further training and\nwith minimal computational overhead. If training is an option, we show that\nM3ID can be paired with Direct Preference Optimization (DPO) to improve the\nmodel's reliance on the prompt image without requiring any labels. 
Our\nempirical findings show that our algorithms maintain the fluency and linguistic\ncapabilities of pre-trained VLMs while reducing hallucinations by mitigating\nvisually ungrounded answers. Specifically, for the LLaVA 13B model, M3ID and\nM3ID+DPO reduce the percentage of hallucinated objects in captioning tasks by\n25% and 28%, respectively, and improve the accuracy on VQA benchmarks such as\nPOPE by 21% and 24%.\n","authors":["Alessandro Favero","Luca Zancato","Matthew Trager","Siddharth Choudhary","Pramuditha Perera","Alessandro Achille","Ashwin Swaminathan","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2403.14003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14002v1","updated":"2024-03-20T22:03:40Z","published":"2024-03-20T22:03:40Z","title":"Uncertainty Driven Active Learning for Image Segmentation in Underwater\n Inspection","summary":" Active learning aims to select the minimum amount of data to train a model\nthat performs similarly to a model trained with the entire dataset. We study\nthe potential of active learning for image segmentation in underwater\ninfrastructure inspection tasks, where large amounts of data are typically\ncollected. The pipeline inspection images are usually semantically repetitive\nbut with great variations in quality. We use mutual information as the\nacquisition function, calculated using Monte Carlo dropout. To assess the\neffectiveness of the framework, DenseNet and HyperSeg are trained with the\nCamVid dataset using active learning. In addition, HyperSeg is trained with a\npipeline inspection dataset of over 50,000 images. For the pipeline dataset,\nHyperSeg with active learning achieved 67.5% meanIoU using 12.5% of the data,\nand 61.4% with the same amount of randomly selected images. This shows that\nusing active learning for segmentation models in underwater inspection tasks\ncan lower the cost significantly.\n","authors":["Luiza Ribeiro Marnet","Yury Brodskiy","Stella Grasshof","Andrzej Wasowski"],"pdf_url":"https://arxiv.org/pdf/2403.14002v1.pdf","comment":"16 pages, 8 figures, to be published in the Proceedings of the 4th\n International Conference on Robotics, Computer Vision and Intelligent\n Systems, Springer Nature, Feb 2024"},{"id":"http://arxiv.org/abs/2403.05435v3","updated":"2024-03-20T21:57:26Z","published":"2024-03-08T16:38:11Z","title":"OmniCount: Multi-label Object Counting with Semantic-Geometric Priors","summary":" Object counting is pivotal for understanding the composition of scenes.\nPreviously, this task was dominated by class-specific methods, which have\ngradually evolved into more adaptable class-agnostic strategies. However, these\nstrategies come with their own set of limitations, such as the need for manual\nexemplar input and multiple passes for multiple categories, resulting in\nsignificant inefficiencies. This paper introduces a new, more practical\napproach enabling simultaneous counting of multiple object categories using an\nopen vocabulary framework. Our solution, OmniCount, stands out by using\nsemantic and geometric insights from pre-trained models to count multiple\ncategories of objects as specified by users, all without additional training.\nOmniCount distinguishes itself by generating precise object masks and\nleveraging point prompts via the Segment Anything Model for efficient counting.\nTo evaluate OmniCount, we created the OmniCount-191 benchmark, a\nfirst-of-its-kind dataset with multi-label object counts, including points,\nbounding boxes, and VQA annotations. 
Our comprehensive evaluation in\nOmniCount-191, alongside other leading benchmarks, demonstrates OmniCount's\nexceptional performance, significantly outpacing existing solutions and\nheralding a new era in object counting technology.\n","authors":["Anindya Mondal","Sauradip Nag","Xiatian Zhu","Anjan Dutta"],"pdf_url":"https://arxiv.org/pdf/2403.05435v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13996v1","updated":"2024-03-20T21:52:02Z","published":"2024-03-20T21:52:02Z","title":"P-Count: Persistence-based Counting of White Matter Hyperintensities in\n Brain MRI","summary":" White matter hyperintensities (WMH) are a hallmark of cerebrovascular disease\nand multiple sclerosis. Automated WMH segmentation methods enable quantitative\nanalysis via estimation of total lesion load, spatial distribution of lesions,\nand number of lesions (i.e., number of connected components after\nthresholding), all of which are correlated with patient outcomes. While the two\nformer measures can generally be estimated robustly, the number of lesions is\nhighly sensitive to noise and segmentation mistakes -- even when small\nconnected components are eroded or disregarded. In this article, we present\nP-Count, an algebraic WMH counting tool based on persistent homology that\naccounts for the topological features of WM lesions in a robust manner. Using\ncomputational geometry, P-Count takes the persistence of connected components\ninto consideration, effectively filtering out the noisy WMH positives,\nresulting in a more accurate count of true lesions. We validated P-Count on the\nISBI2015 longitudinal lesion segmentation dataset, where it produces\nsignificantly more accurate results than direct thresholding.\n","authors":["Xiaoling Hu","Annabel Sorby-Adams","Frederik Barkhof","W Taylor Kimberly","Oula Puonti","Juan Eugenio Iglesias"],"pdf_url":"https://arxiv.org/pdf/2403.13996v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.13972v1","updated":"2024-03-20T20:47:53Z","published":"2024-03-20T20:47:53Z","title":"SeFFeC: Semantic Facial Feature Control for Fine-grained Face Editing","summary":" We propose Semantic Facial Feature Control (SeFFeC) - a novel method for\nfine-grained face shape editing. Our method enables the manipulation of\nhuman-understandable, semantic face features, such as nose length or mouth\nwidth, which are defined by different groups of facial landmarks. In contrast\nto existing methods, the use of facial landmarks enables precise measurement of\nthe facial features, which then enables training SeFFeC without any manually\nannotated labels. SeFFeC consists of a transformer-based encoder network that\ntakes a latent vector of a pre-trained generative model and a facial feature\nembedding as input, and learns to modify the latent vector to perform the\ndesired face edit operation. To ensure that the desired feature measurement is\nchanged towards the target value without altering uncorrelated features, we\nintroduced a novel semantic face feature loss. Qualitative and quantitative\nresults show that SeFFeC enables precise and fine-grained control of 23 facial\nfeatures, some of which could not previously be controlled by other methods,\nwithout requiring manual annotations. 
Unlike existing methods, SeFFeC also\nprovides deterministic control over the exact values of the facial features and\nmore localised and disentangled face edits.\n","authors":["Florian Strohm","Mihai Bâce","Markus Kaltenecker","Andreas Bulling"],"pdf_url":"https://arxiv.org/pdf/2403.13972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05594v2","updated":"2024-03-20T20:39:27Z","published":"2024-03-07T04:33:42Z","title":"An Image-based Typology for Visualization","summary":" We present and discuss the results of a qualitative analysis of visual\nrepresentations from images. We labeled each image's essential stimuli, the\nremoval of which would render a visualization uninterpretable. As a result, we\nderive a typology of 10 visualization types of defined groups. We describe the\ntypology derivation process in which we engaged. The resulting typology and\nimage analysis can serve a number of purposes: enabling researchers to study\nthe evolution of the community and its research output over time, facilitating\nthe categorization of visualization images for the purpose of research and\nteaching, allowing researchers and practitioners to identify visual design\nstyles to further align the quantification of any visual information processor,\nbe that a person or an algorithm observer, and it facilitates a discussion of\nstandardization in visualization. In addition to the visualization typology\nfrom images, we provide a dataset of 6,833 tagged images and an online tool\nthat can be used to explore and analyze the large set of labeled images. The\ntool and data set enable scholars to closely examine the diverse visual designs\nused and how they are published and communicated in our community. A\npre-registration, a free copy of this paper, and all supplemental materials are\navailable via osf.io/dxjwt.\n","authors":["Jian Chen","Petra Isenberg","Robert S. Laramee","Tobias Isenberg","Michael Sedlmair","Torsten Moeller","Rui Li"],"pdf_url":"https://arxiv.org/pdf/2403.05594v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2209.07533"},{"id":"http://arxiv.org/abs/2403.13965v1","updated":"2024-03-20T20:37:13Z","published":"2024-03-20T20:37:13Z","title":"ConGeo: Robust Cross-view Geo-localization across Ground View Variations","summary":" Cross-view geo-localization aims at localizing a ground-level query image by\nmatching it to its corresponding geo-referenced aerial view. In real-world\nscenarios, the task requires accommodating diverse ground images captured by\nusers with varying orientations and reduced field of views (FoVs). However,\nexisting learning pipelines are orientation-specific or FoV-specific, demanding\nseparate model training for different ground view variations. Such models\nheavily depend on the North-aligned spatial correspondence and predefined FoVs\nin the training data, compromising their robustness across different settings.\nTo tackle this challenge, we propose ConGeo, a single- and cross-modal\nContrastive method for Geo-localization: it enhances robustness and consistency\nin feature representations to improve a model's invariance to orientation and\nits resilience to FoV variations, by enforcing proximity between ground view\nvariations of the same location. 
As a generic learning objective for cross-view\ngeo-localization, when integrated into state-of-the-art pipelines, ConGeo\nsignificantly boosts the performance of three base models on four\ngeo-localization benchmarks for diverse ground view variations and outperforms\ncompeting methods that train separate models for each ground view variation.\n","authors":["Li Mi","Chang Xu","Javiera Castillo-Navarro","Syrielle Montariol","Wen Yang","Antoine Bosselut","Devis Tuia"],"pdf_url":"https://arxiv.org/pdf/2403.13965v1.pdf","comment":"Project page at https://chasel-tsui.github.io/ConGeo/"},{"id":"http://arxiv.org/abs/2312.03816v2","updated":"2024-03-20T20:20:01Z","published":"2023-12-06T18:56:14Z","title":"AVID: Any-Length Video Inpainting with Diffusion Model","summary":" Recent advances in diffusion models have successfully enabled text-guided\nimage inpainting. While it seems straightforward to extend such editing\ncapability into the video domain, there have been fewer works regarding\ntext-guided video inpainting. Given a video, a masked region at its initial\nframe, and an editing prompt, it requires a model to do infilling at each frame\nfollowing the editing guidance while keeping the out-of-mask region intact.\nThere are three main challenges in text-guided video inpainting: ($i$) temporal\nconsistency of the edited video, ($ii$) supporting different inpainting types\nat different structural fidelity levels, and ($iii$) dealing with variable\nvideo length. To address these challenges, we introduce Any-Length Video\nInpainting with Diffusion Model, dubbed as AVID. At its core, our model is\nequipped with effective motion modules and adjustable structure guidance, for\nfixed-length video inpainting. Building on top of that, we propose a novel\nTemporal MultiDiffusion sampling pipeline with a middle-frame attention\nguidance mechanism, facilitating the generation of videos with any desired\nduration. Our comprehensive experiments show our model can robustly deal with\nvarious inpainting types at different video duration ranges, with high quality.\nMore visualization results are made publicly available at\nhttps://zhang-zx.github.io/AVID/ .\n","authors":["Zhixing Zhang","Bichen Wu","Xiaoyan Wang","Yaqiao Luo","Luxin Zhang","Yinan Zhao","Peter Vajda","Dimitris Metaxas","Licheng Yu"],"pdf_url":"https://arxiv.org/pdf/2312.03816v2.pdf","comment":"Project website: https://zhang-zx.github.io/AVID/"},{"id":"http://arxiv.org/abs/2403.02302v2","updated":"2024-03-20T20:05:45Z","published":"2024-03-04T18:32:12Z","title":"Beyond Specialization: Assessing the Capabilities of MLLMs in Age and\n Gender Estimation","summary":" Multimodal Large Language Models (MLLMs) have recently gained immense\npopularity. Powerful commercial models like ChatGPT-4V and Gemini, as well as\nopen-source ones such as LLaVA, are essentially general-purpose models and are\napplied to solve a wide variety of tasks, including those in computer vision.\nThese neural networks possess such strong general knowledge and reasoning\nabilities that they have proven capable of working even on tasks for which they\nwere not specifically trained. We compared the capabilities of the most\npowerful MLLMs to date: ShareGPT4V, ChatGPT, LLaVA-Next in a specialized task\nof age and gender estimation with our state-of-the-art specialized model,\nMiVOLO. We also updated MiVOLO and provide details and new metrics in this\narticle. 
This comparison has yielded some interesting results and insights\nabout the strengths and weaknesses of the participating models. Furthermore, we\nattempted various ways to fine-tune the ShareGPT4V model for this specific\ntask, aiming to achieve state-of-the-art results in this particular challenge.\nAlthough such a model would not be practical in production, as it is incredibly\nexpensive compared to a specialized model like MiVOLO, it could be very useful\nin some tasks, like data annotation.\n","authors":["Maksim Kuprashevich","Grigorii Alekseenko","Irina Tolstykh"],"pdf_url":"https://arxiv.org/pdf/2403.02302v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13951v1","updated":"2024-03-20T19:45:06Z","published":"2024-03-20T19:45:06Z","title":"ACDG-VTON: Accurate and Contained Diffusion Generation for Virtual\n Try-On","summary":" Virtual Try-on (VTON) involves generating images of a person wearing selected\ngarments. Diffusion-based methods, in particular, can create high-quality\nimages, but they struggle to maintain the identities of the input garments. We\nidentified this problem stems from the specifics in the training formulation\nfor diffusion. To address this, we propose a unique training scheme that limits\nthe scope in which diffusion is trained. We use a control image that perfectly\naligns with the target image during training. In turn, this accurately\npreserves garment details during inference. We demonstrate our method not only\neffectively conserves garment details but also allows for layering, styling,\nand shoe try-on. Our method runs multi-garment try-on in a single inference\ncycle and can support high-quality zoomed-in generations without training in\nhigher resolutions. Finally, we show our method surpasses prior methods in\naccuracy and quality.\n","authors":["Jeffrey Zhang","Kedan Li","Shao-Yu Chang","David Forsyth"],"pdf_url":"https://arxiv.org/pdf/2403.13951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11061v2","updated":"2024-03-20T19:44:07Z","published":"2024-01-19T23:34:48Z","title":"PhotoBot: Reference-Guided Interactive Photography via Natural Language","summary":" We introduce PhotoBot, a framework for fully automated photo acquisition\nbased on an interplay between high-level human language guidance and a robot\nphotographer. We propose to communicate photography suggestions to the user via\nreference images that are selected from a curated gallery. We leverage a visual\nlanguage model (VLM) and an object detector to characterize the reference\nimages via textual descriptions and then use a large language model (LLM) to\nretrieve relevant reference images based on a user's language query through\ntext-based reasoning. To correspond the reference image and the observed scene,\nwe exploit pre-trained features from a vision transformer capable of capturing\nsemantic similarity across marked appearance variations. Using these features,\nwe compute pose adjustments for an RGB-D camera by solving a\nperspective-n-point (PnP) problem. We demonstrate our approach using a\nmanipulator equipped with a wrist camera. Our user studies show that photos\ntaken by PhotoBot are often more aesthetically pleasing than those taken by\nusers themselves, as measured by human feedback. 
We also show that PhotoBot can\ngeneralize to other reference sources such as paintings.\n","authors":["Oliver Limoyo","Jimmy Li","Dmitriy Rivkin","Jonathan Kelly","Gregory Dudek"],"pdf_url":"https://arxiv.org/pdf/2401.11061v2.pdf","comment":"Submitted to the IEEE/RSJ International Conference on Intelligent\n Robotics and Systems (IROS'24), Abu Dhabi, UAE, Oct 14-18, 2024"},{"id":"http://arxiv.org/abs/2403.08019v3","updated":"2024-03-20T19:38:56Z","published":"2024-03-12T18:36:59Z","title":"MRC-Net: 6-DoF Pose Estimation with MultiScale Residual Correlation","summary":" We propose a single-shot approach to determining 6-DoF pose of an object with\navailable 3D computer-aided design (CAD) model from a single RGB image. Our\nmethod, dubbed MRC-Net, comprises two stages. The first performs pose\nclassification and renders the 3D object in the classified pose. The second\nstage performs regression to predict fine-grained residual pose within class.\nConnecting the two stages is a novel multi-scale residual correlation (MRC)\nlayer that captures high-and-low level correspondences between the input image\nand rendering from first stage. MRC-Net employs a Siamese network with shared\nweights between both stages to learn embeddings for input and rendered images.\nTo mitigate ambiguity when predicting discrete pose class labels on symmetric\nobjects, we use soft probabilistic labels to define pose class in the first\nstage. We demonstrate state-of-the-art accuracy, outperforming all competing\nRGB-based methods on four challenging BOP benchmark datasets: T-LESS, LM-O,\nYCB-V, and ITODD. Our method is non-iterative and requires no complex\npost-processing.\n","authors":["Yuelong Li","Yafei Mao","Raja Bala","Sunil Hadap"],"pdf_url":"https://arxiv.org/pdf/2403.08019v3.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.07203v2","updated":"2024-03-20T19:31:15Z","published":"2024-03-11T23:08:29Z","title":"How to Handle Sketch-Abstraction in Sketch-Based Image Retrieval?","summary":" In this paper, we propose a novel abstraction-aware sketch-based image\nretrieval framework capable of handling sketch abstraction at varied levels.\nPrior works had mainly focused on tackling sub-factors such as drawing style\nand order, we instead attempt to model abstraction as a whole, and propose\nfeature-level and retrieval granularity-level designs so that the system builds\ninto its DNA the necessary means to interpret abstraction. On learning\nabstraction-aware features, we for the first-time harness the rich semantic\nembedding of pre-trained StyleGAN model, together with a novel\nabstraction-level mapper that deciphers the level of abstraction and\ndynamically selects appropriate dimensions in the feature matrix\ncorrespondingly, to construct a feature matrix embedding that can be freely\ntraversed to accommodate different levels of abstraction. For granularity-level\nabstraction understanding, we dictate that the retrieval model should not treat\nall abstraction-levels equally and introduce a differentiable surrogate Acc.@q\nloss to inject that understanding into the system. Different to the\ngold-standard triplet loss, our Acc.@q loss uniquely allows a sketch to\nnarrow/broaden its focus in terms of how stringent the evaluation should be -\nthe more abstract a sketch, the less stringent (higher q). 
Extensive\nexperiments depict our method to outperform existing state-of-the-arts in\nstandard SBIR tasks along with challenging scenarios like early retrieval,\nforensic sketch-photo matching, and style-invariant retrieval.\n","authors":["Subhadeep Koley","Ayan Kumar Bhunia","Aneeshan Sain","Pinaki Nath Chowdhury","Tao Xiang","Yi-Zhe Song"],"pdf_url":"https://arxiv.org/pdf/2403.07203v2.pdf","comment":"Accepted in CVPR 2024. Project page available at\n https://subhadeepkoley.github.io/AbstractAway"},{"id":"http://arxiv.org/abs/2403.07214v2","updated":"2024-03-20T19:27:27Z","published":"2024-03-12T00:02:03Z","title":"Text-to-Image Diffusion Models are Great Sketch-Photo Matchmakers","summary":" This paper, for the first time, explores text-to-image diffusion models for\nZero-Shot Sketch-based Image Retrieval (ZS-SBIR). We highlight a pivotal\ndiscovery: the capacity of text-to-image diffusion models to seamlessly bridge\nthe gap between sketches and photos. This proficiency is underpinned by their\nrobust cross-modal capabilities and shape bias, findings that are substantiated\nthrough our pilot studies. In order to harness pre-trained diffusion models\neffectively, we introduce a straightforward yet powerful strategy focused on\ntwo key aspects: selecting optimal feature layers and utilising visual and\ntextual prompts. For the former, we identify which layers are most enriched\nwith information and are best suited for the specific retrieval requirements\n(category-level or fine-grained). Then we employ visual and textual prompts to\nguide the model's feature extraction process, enabling it to generate more\ndiscriminative and contextually relevant cross-modal representations. Extensive\nexperiments on several benchmark datasets validate significant performance\nimprovements.\n","authors":["Subhadeep Koley","Ayan Kumar Bhunia","Aneeshan Sain","Pinaki Nath Chowdhury","Tao Xiang","Yi-Zhe Song"],"pdf_url":"https://arxiv.org/pdf/2403.07214v2.pdf","comment":"Accepted in CVPR 2024. Project page available at\n https://subhadeepkoley.github.io/DiffusionZSSBIR"},{"id":"http://arxiv.org/abs/2403.07222v2","updated":"2024-03-20T19:25:38Z","published":"2024-03-12T00:27:18Z","title":"You'll Never Walk Alone: A Sketch and Text Duet for Fine-Grained Image\n Retrieval","summary":" Two primary input modalities prevail in image retrieval: sketch and text.\nWhile text is widely used for inter-category retrieval tasks, sketches have\nbeen established as the sole preferred modality for fine-grained image\nretrieval due to their ability to capture intricate visual details. In this\npaper, we question the reliance on sketches alone for fine-grained image\nretrieval by simultaneously exploring the fine-grained representation\ncapabilities of both sketch and text, orchestrating a duet between the two. The\nend result enables precise retrievals previously unattainable, allowing users\nto pose ever-finer queries and incorporate attributes like colour and\ncontextual cues from text. For this purpose, we introduce a novel\ncompositionality framework, effectively combining sketches and text using\npre-trained CLIP models, while eliminating the need for extensive fine-grained\ntextual descriptions. 
Last but not least, our system extends to novel\napplications in composed image retrieval, domain attribute transfer, and\nfine-grained generation, providing solutions for various real-world scenarios.\n","authors":["Subhadeep Koley","Ayan Kumar Bhunia","Aneeshan Sain","Pinaki Nath Chowdhury","Tao Xiang","Yi-Zhe Song"],"pdf_url":"https://arxiv.org/pdf/2403.07222v2.pdf","comment":"Accepted in CVPR 2024. Project page available at\n https://subhadeepkoley.github.io/Sketch2Word"},{"id":"http://arxiv.org/abs/2403.07234v2","updated":"2024-03-20T19:23:17Z","published":"2024-03-12T01:05:25Z","title":"It's All About Your Sketch: Democratising Sketch Control in Diffusion\n Models","summary":" This paper unravels the potential of sketches for diffusion models,\naddressing the deceptive promise of direct sketch control in generative AI. We\nimportantly democratise the process, enabling amateur sketches to generate\nprecise images, living up to the commitment of \"what you sketch is what you\nget\". A pilot study underscores the necessity, revealing that deformities in\nexisting models stem from spatial-conditioning. To rectify this, we propose an\nabstraction-aware framework, utilising a sketch adapter, adaptive time-step\nsampling, and discriminative guidance from a pre-trained fine-grained\nsketch-based image retrieval model, working synergistically to reinforce\nfine-grained sketch-photo association. Our approach operates seamlessly during\ninference without the need for textual prompts; a simple, rough sketch akin to\nwhat you and I can create suffices! We welcome everyone to examine results\npresented in the paper and its supplementary. Contributions include\ndemocratising sketch control, introducing an abstraction-aware framework, and\nleveraging discriminative guidance, validated through extensive experiments.\n","authors":["Subhadeep Koley","Ayan Kumar Bhunia","Deeptanshu Sekhri","Aneeshan Sain","Pinaki Nath Chowdhury","Tao Xiang","Yi-Zhe Song"],"pdf_url":"https://arxiv.org/pdf/2403.07234v2.pdf","comment":"Accepted in CVPR 2024. Project page available at\n https://subhadeepkoley.github.io/StableSketching"},{"id":"http://arxiv.org/abs/2403.12777v2","updated":"2024-03-20T19:18:27Z","published":"2024-03-19T14:44:54Z","title":"Discover and Mitigate Multiple Biased Subgroups in Image Classifiers","summary":" Machine learning models can perform well on in-distribution data but often\nfail on biased subgroups that are underrepresented in the training data,\nhindering the robustness of models for reliable applications. Such subgroups\nare typically unknown due to the absence of subgroup labels. Discovering biased\nsubgroups is the key to understanding models' failure modes and further\nimproving models' robustness. Most previous works of subgroup discovery make an\nimplicit assumption that models only underperform on a single biased subgroup,\nwhich does not hold on in-the-wild data where multiple biased subgroups exist.\n In this work, we propose Decomposition, Interpretation, and Mitigation (DIM),\na novel method to address a more challenging but also more practical problem of\ndiscovering multiple biased subgroups in image classifiers. Our approach\ndecomposes the image features into multiple components that represent multiple\nsubgroups. This decomposition is achieved via a bilinear dimension reduction\nmethod, Partial Least Square (PLS), guided by useful supervision from the image\nclassifier. 
We further interpret the semantic meaning of each subgroup\ncomponent by generating natural language descriptions using vision-language\nfoundation models. Finally, DIM mitigates multiple biased subgroups\nsimultaneously via two strategies, including the data- and model-centric\nstrategies. Extensive experiments on CIFAR-100 and Breeds datasets demonstrate\nthe effectiveness of DIM in discovering and mitigating multiple biased\nsubgroups. Furthermore, DIM uncovers the failure modes of the classifier on\nHard ImageNet, showcasing its broader applicability to understanding model bias\nin image classifiers. The code is available at\nhttps://github.com/ZhangAIPI/DIM.\n","authors":["Zeliang Zhang","Mingqian Feng","Zhiheng Li","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.12777v2.pdf","comment":"CVPR 2024. Code is available at https://github.com/ZhangAIPI/DIM"},{"id":"http://arxiv.org/abs/2403.13916v1","updated":"2024-03-20T18:36:30Z","published":"2024-03-20T18:36:30Z","title":"Enhancing Fingerprint Image Synthesis with GANs, Diffusion Models, and\n Style Transfer Techniques","summary":" We present novel approaches involving generative adversarial networks and\ndiffusion models in order to synthesize high quality, live and spoof\nfingerprint images while preserving features such as uniqueness and diversity.\nWe generate live fingerprints from noise with a variety of methods, and we use\nimage translation techniques to translate live fingerprint images to spoof. To\ngenerate different types of spoof images based on limited training data we\nincorporate style transfer techniques through a cycle autoencoder equipped with\na Wasserstein metric along with Gradient Penalty (CycleWGAN-GP) in order to\navoid mode collapse and instability. We find that when the spoof training data\nincludes distinct spoof characteristics, it leads to improved live-to-spoof\ntranslation. We assess the diversity and realism of the generated live\nfingerprint images mainly through the Fr\\'echet Inception Distance (FID) and\nthe False Acceptance Rate (FAR). Our best diffusion model achieved a FID of\n15.78. The comparable WGAN-GP model achieved slightly higher FID while\nperforming better in the uniqueness assessment due to a slightly lower FAR when\nmatched against the training data, indicating better creativity. Moreover, we\ngive example images showing that a DDPM model clearly can generate realistic\nfingerprint images.\n","authors":["W. Tang","D. Figueroa","D. Liu","K. Johnsson","A. Sopasakis"],"pdf_url":"https://arxiv.org/pdf/2403.13916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15619v3","updated":"2024-03-20T18:27:25Z","published":"2023-11-27T08:32:28Z","title":"Align before Adapt: Leveraging Entity-to-Region Alignments for\n Generalizable Video Action Recognition","summary":" Large-scale visual-language pre-trained models have achieved significant\nsuccess in various video tasks. However, most existing methods follow an \"adapt\nthen align\" paradigm, which adapts pre-trained image encoders to model\nvideo-level representations and utilizes one-hot or text embedding of the\naction labels for supervision. This paradigm overlooks the challenge of mapping\nfrom static images to complicated activity concepts. In this paper, we propose\na novel \"Align before Adapt\" (ALT) paradigm. Prior to adapting to video\nrepresentation learning, we exploit the entity-to-region alignments for each\nframe. 
The alignments are fulfilled by matching the region-aware image\nembeddings to an offline-constructed text corpus. With the aligned entities, we\nfeed their text embeddings to a transformer-based video adapter as the queries,\nwhich can help extract the semantics of the most important entities from a\nvideo to a vector. This paradigm reuses the visual-language alignment of VLP\nduring adaptation and tries to explain an action by the underlying entities.\nThis helps understand actions by bridging the gap with complex activity\nsemantics, particularly when facing unfamiliar or unseen categories. ALT\ndemonstrates competitive performance while maintaining remarkably low\ncomputational costs. In fully supervised experiments, it achieves 88.1% top-1\naccuracy on Kinetics-400 with only 4947 GFLOPs. Moreover, ALT outperforms the\nprevious state-of-the-art methods in both zero-shot and few-shot experiments,\nemphasizing its superior generalizability across various learning scenarios.\n","authors":["Yifei Chen","Dapeng Chen","Ruijin Liu","Sai Zhou","Wenyuan Xue","Wei Peng"],"pdf_url":"https://arxiv.org/pdf/2311.15619v3.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.13900v1","updated":"2024-03-20T18:11:10Z","published":"2024-03-20T18:11:10Z","title":"CoMo: Controllable Motion Generation through Language Guided Pose Code\n Editing","summary":" Text-to-motion models excel at efficient human motion generation, but\nexisting approaches lack fine-grained controllability over the generation\nprocess. Consequently, modifying subtle postures within a motion or inserting\nnew actions at specific moments remains a challenge, limiting the applicability\nof these methods in diverse scenarios. In light of these challenges, we\nintroduce CoMo, a Controllable Motion generation model, adept at accurately\ngenerating and editing motions by leveraging the knowledge priors of large\nlanguage models (LLMs). Specifically, CoMo decomposes motions into discrete and\nsemantically meaningful pose codes, with each code encapsulating the semantics\nof a body part, representing elementary information such as \"left knee slightly\nbent\". Given textual inputs, CoMo autoregressively generates sequences of pose\ncodes, which are then decoded into 3D motions. Leveraging pose codes as\ninterpretable representations, an LLM can directly intervene in motion editing\nby adjusting the pose codes according to editing instructions. Experiments\ndemonstrate that CoMo achieves competitive performance in motion generation\ncompared to state-of-the-art models while, in human studies, CoMo substantially\nsurpasses previous work in motion editing abilities.\n","authors":["Yiming Huang","Weilin Wan","Yue Yang","Chris Callison-Burch","Mark Yatskar","Lingjie Liu"],"pdf_url":"https://arxiv.org/pdf/2403.13900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04334v3","updated":"2024-03-20T18:11:07Z","published":"2023-12-07T14:51:12Z","title":"Towards a Perceptual Evaluation Framework for Lighting Estimation","summary":" Progress in lighting estimation is tracked by computing existing image\nquality assessment (IQA) metrics on images from standard datasets. While this\nmay appear to be a reasonable approach, we demonstrate that doing so does not\ncorrelate to human preference when the estimated lighting is used to relight a\nvirtual scene into a real photograph. 
To study this, we design a controlled\npsychophysical experiment where human observers must choose their preference\namongst rendered scenes lit using a set of lighting estimation algorithms\nselected from the recent literature, and use it to analyse how these algorithms\nperform according to human perception. Then, we demonstrate that none of the\nmost popular IQA metrics from the literature, taken individually, correctly\nrepresent human perception. Finally, we show that by learning a combination of\nexisting IQA metrics, we can more accurately represent human preference. This\nprovides a new perceptual framework to help evaluate future lighting estimation\nalgorithms.\n","authors":["Justine Giroux","Mohammad Reza Karimi Dastjerdi","Yannick Hold-Geoffroy","Javier Vazquez-Corral","Jean-François Lalonde"],"pdf_url":"https://arxiv.org/pdf/2312.04334v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13890v1","updated":"2024-03-20T18:01:57Z","published":"2024-03-20T18:01:57Z","title":"Towards Learning Contrast Kinetics with Multi-Condition Latent Diffusion\n Models","summary":" Contrast agents in dynamic contrast enhanced magnetic resonance imaging allow\nto localize tumors and observe their contrast kinetics, which is essential for\ncancer characterization and respective treatment decision-making. However,\ncontrast agent administration is not only associated with adverse health risks,\nbut also restricted for patients during pregnancy, and for those with kidney\nmalfunction, or other adverse reactions. With contrast uptake as key biomarker\nfor lesion malignancy, cancer recurrence risk, and treatment response, it\nbecomes pivotal to reduce the dependency on intravenous contrast agent\nadministration. To this end, we propose a multi-conditional latent diffusion\nmodel capable of acquisition time-conditioned image synthesis of DCE-MRI\ntemporal sequences. To evaluate medical image synthesis, we additionally\npropose and validate the Fr\\'echet radiomics distance as an image quality\nmeasure based on biomarker variability between synthetic and real imaging data.\nOur results demonstrate our method's ability to generate realistic\nmulti-sequence fat-saturated breast DCE-MRI and uncover the emerging potential\nof deep learning based contrast kinetics simulation. We publicly share our\naccessible codebase at https://github.com/RichardObi/ccnet.\n","authors":["Richard Osuala","Daniel Lang","Preeti Verma","Smriti Joshi","Apostolia Tsirikoglou","Grzegorz Skorupko","Kaisar Kushibar","Lidia Garrucho","Walter H. L. Pinaya","Oliver Diaz","Julia Schnabel","Karim Lekadir"],"pdf_url":"https://arxiv.org/pdf/2403.13890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13870v1","updated":"2024-03-20T14:47:28Z","published":"2024-03-20T14:47:28Z","title":"ExMap: Leveraging Explainability Heatmaps for Unsupervised Group\n Robustness to Spurious Correlations","summary":" Group robustness strategies aim to mitigate learned biases in deep learning\nmodels that arise from spurious correlations present in their training\ndatasets. However, most existing methods rely on the access to the label\ndistribution of the groups, which is time-consuming and expensive to obtain. As\na result, unsupervised group robustness strategies are sought. Based on the\ninsight that a trained model's classification strategies can be inferred\naccurately based on explainability heatmaps, we introduce ExMap, an\nunsupervised two stage mechanism designed to enhance group robustness in\ntraditional classifiers. 
ExMap utilizes a clustering module to infer\npseudo-labels based on a model's explainability heatmaps, which are then used\nduring training in lieu of actual labels. Our empirical studies validate the\nefficacy of ExMap - We demonstrate that it bridges the performance gap with its\nsupervised counterparts and outperforms existing partially supervised and\nunsupervised methods. Additionally, ExMap can be seamlessly integrated with\nexisting group robustness learning strategies. Finally, we demonstrate its\npotential in tackling the emerging issue of multiple shortcut\nmitigation\\footnote{Code available at \\url{https://github.com/rwchakra/exmap}}.\n","authors":["Rwiddhi Chakraborty","Adrian Sletten","Michael Kampffmeyer"],"pdf_url":"https://arxiv.org/pdf/2403.13870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1610.00291v2","updated":"2024-03-20T13:41:19Z","published":"2016-10-02T15:48:36Z","title":"Deep Feature Consistent Variational Autoencoder","summary":" We present a novel method for constructing Variational Autoencoder (VAE).\nInstead of using pixel-by-pixel loss, we enforce deep feature consistency\nbetween the input and the output of a VAE, which ensures the VAE's output to\npreserve the spatial correlation characteristics of the input, thus leading the\noutput to have a more natural visual appearance and better perceptual quality.\nBased on recent deep learning works such as style transfer, we employ a\npre-trained deep convolutional neural network (CNN) and use its hidden features\nto define a feature perceptual loss for VAE training. Evaluated on the CelebA\nface dataset, we show that our model produces better results than other methods\nin the literature. We also show that our method can produce latent vectors that\ncan capture the semantic information of face expressions and can be used to\nachieve state-of-the-art performance in facial attribute prediction.\n","authors":["Xianxu Hou","Linlin Shen","Ke Sun","Guoping Qiu"],"pdf_url":"https://arxiv.org/pdf/1610.00291v2.pdf","comment":"WACV"},{"id":"http://arxiv.org/abs/2311.02313v2","updated":"2024-03-20T04:45:17Z","published":"2023-11-04T03:55:38Z","title":"LISNeRF Mapping: LiDAR-based Implicit Mapping via Semantic Neural Fields\n for Large-Scale 3D Scenes","summary":" Large-scale semantic mapping is crucial for outdoor autonomous agents to\nfulfill high-level tasks such as planning and navigation. This paper proposes a\nnovel method for large-scale 3D semantic reconstruction through implicit\nrepresentations from posed LiDAR measurements alone. We first leverage an\noctree-based and hierarchical structure to store implicit features, then these\nimplicit features are decoded to semantic information and signed distance value\nthrough shallow Multilayer Perceptrons (MLPs). We adopt off-the-shelf\nalgorithms to predict the semantic labels and instance IDs of point clouds. We\nthen jointly optimize the feature embeddings and MLPs parameters with a\nself-supervision paradigm for point cloud geometry and a pseudo-supervision\nparadigm for semantic and panoptic labels. Subsequently, categories and\ngeometric structures for novel points are regressed, and marching cubes are\nexploited to subdivide and visualize the scenes in the inferring stage. For\nscenarios with memory constraints, a map stitching strategy is also developed\nto merge sub-maps into a complete map. 
Experiments on two real-world datasets,\nSemanticKITTI and SemanticPOSS, demonstrate the superior segmentation\nefficiency and mapping effectiveness of our framework compared to current\nstate-of-the-art 3D LiDAR mapping methods.\n","authors":["Jianyuan Zhang","Zhiliu Yang","Meng Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.02313v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03177v6","updated":"2024-03-20T04:43:27Z","published":"2023-07-06T17:57:02Z","title":"PanoDiffusion: 360-degree Panorama Outpainting via Diffusion","summary":" Generating complete 360-degree panoramas from narrow field of view images is\nongoing research as omnidirectional RGB data is not readily available. Existing\nGAN-based approaches face some barriers to achieving higher quality output, and\nhave poor generalization performance over different mask types. In this paper,\nwe present our 360-degree indoor RGB-D panorama outpainting model using latent\ndiffusion models (LDM), called PanoDiffusion. We introduce a new bi-modal\nlatent diffusion structure that utilizes both RGB and depth panoramic data\nduring training, which works surprisingly well to outpaint depth-free RGB\nimages during inference. We further propose a novel technique of introducing\nprogressive camera rotations during each diffusion denoising step, which leads\nto substantial improvement in achieving panorama wraparound consistency.\nResults show that our PanoDiffusion not only significantly outperforms\nstate-of-the-art methods on RGB-D panorama outpainting by producing diverse\nwell-structured results for different types of masks, but can also synthesize\nhigh-quality depth panoramas to provide realistic 3D indoor models.\n","authors":["Tianhao Wu","Chuanxia Zheng","Tat-Jen Cham"],"pdf_url":"https://arxiv.org/pdf/2307.03177v6.pdf","comment":"Project Page: https://sm0kywu.github.io/panodiffusion/"},{"id":"http://arxiv.org/abs/2403.13293v1","updated":"2024-03-20T04:18:38Z","published":"2024-03-20T04:18:38Z","title":"Building Optimal Neural Architectures using Interpretable Knowledge","summary":" Neural Architecture Search is a costly practice. The fact that a search space\ncan span a vast number of design choices with each architecture evaluation\ntaking nontrivial overhead makes it hard for an algorithm to sufficiently\nexplore candidate networks. In this paper, we propose AutoBuild, a scheme which\nlearns to align the latent embeddings of operations and architecture modules\nwith the ground-truth performance of the architectures they appear in. By doing\nso, AutoBuild is capable of assigning interpretable importance scores to\narchitecture modules, such as individual operation features and larger macro\noperation sequences such that high-performance neural networks can be\nconstructed without any need for search. Through experiments performed on\nstate-of-the-art image classification, segmentation, and Stable Diffusion\nmodels, we show that by mining a relatively small set of evaluated\narchitectures, AutoBuild can learn to build high-quality architectures directly\nor help to reduce search space to focus on relevant areas, finding better\narchitectures that outperform both the original labeled ones and ones found by\nsearch baselines. Code available at\nhttps://github.com/Ascend-Research/AutoBuild\n","authors":["Keith G. Mills","Fred X. 
Han","Mohammad Salameh","Shengyao Lu","Chunhua Zhou","Jiao He","Fengyu Sun","Di Niu"],"pdf_url":"https://arxiv.org/pdf/2403.13293v1.pdf","comment":"CVPR'24; 18 Pages, 18 Figures, 3 Tables"},{"id":"http://arxiv.org/abs/2312.09570v2","updated":"2024-03-20T04:05:37Z","published":"2023-12-15T07:04:27Z","title":"CAGE: Controllable Articulation GEneration","summary":" We address the challenge of generating 3D articulated objects in a\ncontrollable fashion. Currently, modeling articulated 3D objects is either\nachieved through laborious manual authoring, or using methods from prior work\nthat are hard to scale and control directly. We leverage the interplay between\npart shape, connectivity, and motion using a denoising diffusion-based method\nwith attention modules designed to extract correlations between part\nattributes. Our method takes an object category label and a part connectivity\ngraph as input and generates an object's geometry and motion parameters. The\ngenerated objects conform to user-specified constraints on the object category,\npart shape, and part articulation. Our experiments show that our method\noutperforms the state-of-the-art in articulated object generation, producing\nmore realistic objects while conforming better to user constraints.\n Video Summary at: http://youtu.be/cH_rbKbyTpE\n","authors":["Jiayi Liu","Hou In Ivan Tam","Ali Mahdavi-Amiri","Manolis Savva"],"pdf_url":"https://arxiv.org/pdf/2312.09570v2.pdf","comment":"CVPR 2024. Project page: https://3dlg-hcvc.github.io/cage/"},{"id":"http://arxiv.org/abs/2403.13289v1","updated":"2024-03-20T04:03:44Z","published":"2024-03-20T04:03:44Z","title":"Text-to-3D Shape Generation","summary":" Recent years have seen an explosion of work and interest in text-to-3D shape\ngeneration. Much of the progress is driven by advances in 3D representations,\nlarge-scale pretraining and representation learning for text and image data\nenabling generative AI models, and differentiable rendering. Computational\nsystems that can perform text-to-3D shape generation have captivated the\npopular imagination as they enable non-expert users to easily create 3D content\ndirectly from text. However, there are still many limitations and challenges\nremaining in this problem space. In this state-of-the-art report, we provide a\nsurvey of the underlying technology and methods enabling text-to-3D shape\ngeneration to summarize the background literature. We then derive a systematic\ncategorization of recent work on text-to-3D shape generation based on the type\nof supervision data required. Finally, we discuss limitations of the existing\ncategories of methods, and delineate promising directions for future work.\n","authors":["Han-Hung Lee","Manolis Savva","Angel X. Chang"],"pdf_url":"https://arxiv.org/pdf/2403.13289v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02626v2","updated":"2024-03-20T03:56:57Z","published":"2024-03-05T03:34:11Z","title":"Modeling Collaborator: Enabling Subjective Vision Classification With\n Minimal Human Effort via LLM Tool-Use","summary":" From content moderation to wildlife conservation, the number of applications\nthat require models to recognize nuanced or subjective visual concepts is\ngrowing. Traditionally, developing classifiers for such concepts requires\nsubstantial manual effort measured in hours, days, or even months to identify\nand annotate data needed for training. 
Even with recently proposed Agile\nModeling techniques, which enable rapid bootstrapping of image classifiers,\nusers are still required to spend 30 minutes or more of monotonous, repetitive\ndata labeling just to train a single classifier. Drawing on Fiske's Cognitive\nMiser theory, we propose a new framework that alleviates manual effort by\nreplacing human labeling with natural language interactions, reducing the total\neffort required to define a concept by an order of magnitude: from labeling\n2,000 images to only 100 plus some natural language interactions. Our framework\nleverages recent advances in foundation models, both large language models and\nvision-language models, to carve out the concept space through conversation and\nby automatically labeling training data points. Most importantly, our framework\neliminates the need for crowd-sourced annotations. Moreover, our framework\nultimately produces lightweight classification models that are deployable in\ncost-sensitive scenarios. Across 15 subjective concepts and across 2 public\nimage classification datasets, our trained models outperform traditional Agile\nModeling as well as state-of-the-art zero-shot classification models like\nALIGN, CLIP, CuPL, and large visual question-answering models like PaLI-X.\n","authors":["Imad Eddine Toubal","Aditya Avinash","Neil Gordon Alldrin","Jan Dlabal","Wenlei Zhou","Enming Luo","Otilia Stretcu","Hao Xiong","Chun-Ta Lu","Howard Zhou","Ranjay Krishna","Ariel Fuxman","Tom Duerig"],"pdf_url":"https://arxiv.org/pdf/2403.02626v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13282v1","updated":"2024-03-20T03:47:53Z","published":"2024-03-20T03:47:53Z","title":"AdaViPro: Region-based Adaptive Visual Prompt for Large-Scale Models\n Adapting","summary":" Recently, prompt-based methods have emerged as a new alternative\n`parameter-efficient fine-tuning' paradigm, which only fine-tunes a small\nnumber of additional parameters while keeping the original model frozen.\nHowever, despite achieving notable results, existing prompt methods mainly\nfocus on `what to add', while overlooking the equally important aspect of\n`where to add', typically relying on the manually crafted placement. To this\nend, we propose a region-based Adaptive Visual Prompt, named AdaViPro, which\nintegrates the `where to add' optimization of the prompt into the learning\nprocess. Specifically, we reconceptualize the `where to add' optimization as a\nproblem of regional decision-making. During inference, AdaViPro generates a\nregionalized mask map for the whole image, which is composed of 0 and 1, to\ndesignate whether to apply or discard the prompt in each specific area.\nTherefore, we employ Gumbel-Softmax sampling to enable AdaViPro's end-to-end\nlearning through standard back-propagation. Extensive experiments demonstrate\nthat our AdaViPro yields new efficiency and accuracy trade-offs for adapting\npre-trained models.\n","authors":["Mengyu Yang","Ye Tian","Lanshan Zhang","Xiao Liang","Xuming Ran","Wendong Wang"],"pdf_url":"https://arxiv.org/pdf/2403.13282v1.pdf","comment":"Submitted to ICIP 2024"},{"id":"http://arxiv.org/abs/2403.12052v2","updated":"2024-03-20T03:45:13Z","published":"2024-01-04T11:14:01Z","title":"A Dataset and Benchmark for Copyright Protection from Text-to-Image\n Diffusion Models","summary":" Copyright is a legal right that grants creators the exclusive authority to\nreproduce, distribute, and profit from their creative works. 
However, the\nrecent advancements in text-to-image generation techniques have posed\nsignificant challenges to copyright protection, as these methods have\nfacilitated the learning of unauthorized content, artistic creations, and\nportraits, which are subsequently utilized to generate and disseminate\nuncontrolled content. Especially, the use of stable diffusion, an emerging\nmodel for text-to-image generation, poses an increased risk of unauthorized\ncopyright infringement and distribution. Currently, there is a lack of\nsystematic studies evaluating the potential correlation between content\ngenerated by stable diffusion and those under copyright protection. Conducting\nsuch studies faces several challenges, including i) the intrinsic ambiguity\nrelated to copyright infringement in text-to-image models, ii) the absence of a\ncomprehensive large-scale dataset, and iii) the lack of standardized metrics\nfor defining copyright infringement. This work provides the first large-scale\nstandardized dataset and benchmark on copyright protection. Specifically, we\npropose a pipeline to coordinate CLIP, ChatGPT, and diffusion models to\ngenerate a dataset that contains anchor images, corresponding prompts, and\nimages generated by text-to-image models, reflecting the potential abuses of\ncopyright. Furthermore, we explore a suite of evaluation metrics to judge the\neffectiveness of copyright protection methods. The proposed dataset, benchmark\nlibrary, and evaluation metrics will be open-sourced to facilitate future\nresearch and application. The website and dataset can be accessed website\ndataset.\n","authors":["Rui Ma","Qiang Zhou","Bangjun Xiao","Yizhu Jin","Daquan Zhou","Xiuyu Li","Aishani Singh","Yi Qu","Kurt Keutzer","Xiaodong Xie","Jingtong Hu","Zhen Dong","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.12052v2.pdf","comment":"Improve experimental content"},{"id":"http://arxiv.org/abs/2403.08381v2","updated":"2024-03-20T03:41:07Z","published":"2024-03-13T09:47:04Z","title":"Tackling the Singularities at the Endpoints of Time Intervals in\n Diffusion Models","summary":" Most diffusion models assume that the reverse process adheres to a Gaussian\ndistribution. However, this approximation has not been rigorously validated,\nespecially at singularities, where t=0 and t=1. Improperly dealing with such\nsingularities leads to an average brightness issue in applications, and limits\nthe generation of images with extreme brightness or darkness. We primarily\nfocus on tackling singularities from both theoretical and practical\nperspectives. Initially, we establish the error bounds for the reverse process\napproximation, and showcase its Gaussian characteristics at singularity time\nsteps. Based on this theoretical insight, we confirm the singularity at t=1 is\nconditionally removable while it at t=0 is an inherent property. 
Upon these\nsignificant conclusions, we propose a novel plug-and-play method SingDiffusion\nto address the initial singular time step sampling, which not only effectively\nresolves the average brightness issue for a wide range of diffusion models\nwithout extra training efforts, but also enhances their generation capability\nin achieving notable lower FID scores.\n","authors":["Pengze Zhang","Hubery Yin","Chen Li","Xiaohua Xie"],"pdf_url":"https://arxiv.org/pdf/2403.08381v2.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2205.00415v3","updated":"2024-03-20T03:23:11Z","published":"2022-05-01T07:51:22Z","title":"Don't Blame the Annotator: Bias Already Starts in the Annotation\n Instructions","summary":" In recent years, progress in NLU has been driven by benchmarks. These\nbenchmarks are typically collected by crowdsourcing, where annotators write\nexamples based on annotation instructions crafted by dataset creators. In this\nwork, we hypothesize that annotators pick up on patterns in the crowdsourcing\ninstructions, which bias them to write many similar examples that are then\nover-represented in the collected data. We study this form of bias, termed\ninstruction bias, in 14 recent NLU benchmarks, showing that instruction\nexamples often exhibit concrete patterns, which are propagated by crowdworkers\nto the collected data. This extends previous work (Geva et al., 2019) and\nraises a new concern of whether we are modeling the dataset creator's\ninstructions, rather than the task. Through a series of experiments, we show\nthat, indeed, instruction bias can lead to overestimation of model performance,\nand that models struggle to generalize beyond biases originating in the\ncrowdsourcing instructions. We further analyze the influence of instruction\nbias in terms of pattern frequency and model size, and derive concrete\nrecommendations for creating future NLU benchmarks.\n","authors":["Mihir Parmar","Swaroop Mishra","Mor Geva","Chitta Baral"],"pdf_url":"https://arxiv.org/pdf/2205.00415v3.pdf","comment":"EACL 2023 (Outstanding Paper Award)"},{"id":"http://arxiv.org/abs/2312.01027v3","updated":"2024-03-20T03:19:41Z","published":"2023-12-02T04:31:51Z","title":"LDM-ISP: Enhancing Neural ISP for Low Light with Latent Diffusion Models","summary":" Enhancing a low-light noisy RAW image into a well-exposed and clean sRGB\nimage is a significant challenge for modern digital cameras. Prior approaches\nhave difficulties in recovering fine-grained details and true colors of the\nscene under extremely low-light environments due to near-to-zero SNR.\nMeanwhile, diffusion models have shown significant progress towards general\ndomain image generation. In this paper, we propose to leverage the pre-trained\nlatent diffusion model to perform the neural ISP for enhancing extremely\nlow-light images. Specifically, to tailor the pre-trained latent diffusion\nmodel to operate on the RAW domain, we train a set of lightweight taming\nmodules to inject the RAW information into the diffusion denoising process via\nmodulating the intermediate features of UNet. We further observe different\nroles of UNet denoising and decoder reconstruction in the latent diffusion\nmodel, which inspires us to decompose the low-light image enhancement task into\nlatent-space low-frequency content generation and decoding-phase high-frequency\ndetail maintenance. 
Through extensive experiments on representative datasets,\nwe demonstrate our simple design not only achieves state-of-the-art performance\nin quantitative evaluations but also shows significant superiority in visual\ncomparisons over strong baselines, which highlight the effectiveness of\npowerful generative priors for neural ISP under extremely low-light\nenvironments. The project page is available at\nhttps://csqiangwen.github.io/projects/ldm-isp/\n","authors":["Qiang Wen","Yazhou Xing","Zhefan Rao","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2312.01027v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04834v2","updated":"2024-03-20T03:07:26Z","published":"2023-08-09T09:46:26Z","title":"View while Moving: Efficient Video Recognition in Long-untrimmed Videos","summary":" Recent adaptive methods for efficient video recognition mostly follow the\ntwo-stage paradigm of \"preview-then-recognition\" and have achieved great\nsuccess on multiple video benchmarks. However, this two-stage paradigm involves\ntwo visits of raw frames from coarse-grained to fine-grained during inference\n(cannot be parallelized), and the captured spatiotemporal features cannot be\nreused in the second stage (due to varying granularity), being not friendly to\nefficiency and computation optimization. To this end, inspired by human\ncognition, we propose a novel recognition paradigm of \"View while Moving\" for\nefficient long-untrimmed video recognition. In contrast to the two-stage\nparadigm, our paradigm only needs to access the raw frame once. The two phases\nof coarse-grained sampling and fine-grained recognition are combined into\nunified spatiotemporal modeling, showing great performance. Moreover, we\ninvestigate the properties of semantic units in video and propose a\nhierarchical mechanism to efficiently capture and reason about the unit-level\nand video-level temporal semantics in long-untrimmed videos respectively.\nExtensive experiments on both long-untrimmed and short-trimmed videos\ndemonstrate that our approach outperforms state-of-the-art methods in terms of\naccuracy as well as efficiency, yielding new efficiency and accuracy trade-offs\nfor video spatiotemporal modeling.\n","authors":["Ye Tian","Mengyu Yang","Lanshan Zhang","Zhizhen Zhang","Yang Liu","Xiaohui Xie","Xirong Que","Wendong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.04834v2.pdf","comment":"Published on ACM MM 2023"},{"id":"http://arxiv.org/abs/2403.13263v1","updated":"2024-03-20T03:00:21Z","published":"2024-03-20T03:00:21Z","title":"SC-Tune: Unleashing Self-Consistent Referential Comprehension in Large\n Vision Language Models","summary":" Recent trends in Large Vision Language Models (LVLMs) research have been\nincreasingly focusing on advancing beyond general image understanding towards\nmore nuanced, object-level referential comprehension. In this paper, we present\nand delve into the self-consistency capability of LVLMs, a crucial aspect that\nreflects the models' ability to both generate informative captions for specific\nobjects and subsequently utilize these captions to accurately re-identify the\nobjects in a closed-loop process. This capability significantly mirrors the\nprecision and reliability of fine-grained visual-language understanding. Our\nfindings reveal that the self-consistency level of existing LVLMs falls short\nof expectations, posing limitations on their practical applicability and\npotential. To address this gap, we introduce a novel fine-tuning paradigm named\nSelf-Consistency Tuning (SC-Tune). 
It features the synergistic learning of a\ncyclic describer-locator system. This paradigm is not only data-efficient but\nalso exhibits generalizability across multiple LVLMs. Through extensive\nexperiments, we demonstrate that SC-Tune significantly elevates performance\nacross a spectrum of object-level vision-language benchmarks and maintains\ncompetitive or improved performance on image-level vision-language benchmarks.\nBoth our model and code will be publicly available at\nhttps://github.com/ivattyue/SC-Tune.\n","authors":["Tongtian Yue","Jie Cheng","Longteng Guo","Xingyuan Dai","Zijia Zhao","Xingjian He","Gang Xiong","Yisheng Lv","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2403.13263v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.13258v1","updated":"2024-03-20T02:39:15Z","published":"2024-03-20T02:39:15Z","title":"SAMCT: Segment Any CT Allowing Labor-Free Task-Indicator Prompts","summary":" Segment anything model (SAM), a foundation model with superior versatility\nand generalization across diverse segmentation tasks, has attracted widespread\nattention in medical imaging. However, it has been proved that SAM would\nencounter severe performance degradation due to the lack of medical knowledge\nin training and local feature encoding. Though several SAM-based models have\nbeen proposed for tuning SAM in medical imaging, they still suffer from\ninsufficient feature extraction and highly rely on high-quality prompts. In\nthis paper, we construct a large CT dataset consisting of 1.1M CT images and 5M\nmasks from public datasets and propose a powerful foundation model SAMCT\nallowing labor-free prompts. Specifically, based on SAM, SAMCT is further\nequipped with a U-shaped CNN image encoder, a cross-branch interaction module,\nand a task-indicator prompt encoder. The U-shaped CNN image encoder works in\nparallel with the ViT image encoder in SAM to supplement local features.\nCross-branch interaction enhances the feature expression capability of the CNN\nimage encoder and the ViT image encoder by exchanging global perception and\nlocal features from one to the other. The task-indicator prompt encoder is a\nplug-and-play component to effortlessly encode task-related indicators into\nprompt embeddings. In this way, SAMCT can work in an automatic manner in\naddition to the semi-automatic interactive strategy in SAM. Extensive\nexperiments demonstrate the superiority of SAMCT against the state-of-the-art\ntask-specific and SAM-based medical foundation models on various tasks. The\ncode, data, and models are released at https://github.com/xianlin7/SAMCT.\n","authors":["Xian Lin","Yangyang Xiang","Zhehao Wang","Kwang-Ting Cheng","Zengqiang Yan","Li Yu"],"pdf_url":"https://arxiv.org/pdf/2403.13258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09233v2","updated":"2024-03-20T02:38:44Z","published":"2024-03-14T09:57:15Z","title":"D-YOLO a robust framework for object detection in adverse weather\n conditions","summary":" Adverse weather conditions including haze, snow and rain lead to decline in\nimage qualities, which often causes a decline in performance for deep-learning\nbased detection networks. Most existing approaches attempts to rectify hazy\nimages before performing object detection, which increases the complexity of\nthe network and may result in the loss in latent information. 
To better\nintegrate image restoration and object detection tasks, we designed a\ndouble-route network with an attention feature fusion module, taking both hazy\nand dehazed features into consideration. We also proposed a subnetwork to\nprovide haze-free features to the detection network. Specifically, our D-YOLO\nimproves the performance of the detection network by minimizing the distance\nbetween the clear feature extraction subnetwork and detection network.\nExperiments on RTTS and FoggyCityscapes datasets show that D-YOLO demonstrates\nbetter performance compared to the state-of-the-art methods. It is a robust\ndetection framework for bridging the gap between low-level dehazing and\nhigh-level detection.\n","authors":["Zihan Chu"],"pdf_url":"https://arxiv.org/pdf/2403.09233v2.pdf","comment":"Object detection in adverse weather conditions. arXiv admin note:\n text overlap with arXiv:2209.01373 by other authors"},{"id":"http://arxiv.org/abs/2403.08505v2","updated":"2024-03-20T02:35:57Z","published":"2024-03-13T13:12:57Z","title":"Content-aware Masked Image Modeling Transformer for Stereo Image\n Compression","summary":" Existing learning-based stereo image codec adopt sophisticated transformation\nwith simple entropy models derived from single image codecs to encode latent\nrepresentations. However, those entropy models struggle to effectively capture\nthe spatial-disparity characteristics inherent in stereo images, which leads to\nsuboptimal rate-distortion results. In this paper, we propose a stereo image\ncompression framework, named CAMSIC. CAMSIC independently transforms each image\nto latent representation and employs a powerful decoder-free Transformer\nentropy model to capture both spatial and disparity dependencies, by\nintroducing a novel content-aware masked image modeling (MIM) technique. Our\ncontent-aware MIM facilitates efficient bidirectional interaction between prior\ninformation and estimated tokens, which naturally obviates the need for an\nextra Transformer decoder. Experiments show that our stereo image codec\nachieves state-of-the-art rate-distortion performance on two stereo image\ndatasets Cityscapes and InStereo2K with fast encoding and decoding speed.\n","authors":["Xinjie Zhang","Shenyuan Gao","Zhening Liu","Jiawei Shao","Xingtong Ge","Dailan He","Tongda Xu","Yan Wang","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.08505v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10297v2","updated":"2024-03-20T02:34:27Z","published":"2024-03-15T13:40:37Z","title":"Leveraging Neural Radiance Field in Descriptor Synthesis for Keypoints\n Scene Coordinate Regression","summary":" Classical structural-based visual localization methods offer high accuracy\nbut face trade-offs in terms of storage, speed, and privacy. A recent\ninnovation, keypoint scene coordinate regression (KSCR) named D2S addresses\nthese issues by leveraging graph attention networks to enhance keypoint\nrelationships and predict their 3D coordinates using a simple multilayer\nperceptron (MLP). Camera pose is then determined via PnP+RANSAC, using\nestablished 2D-3D correspondences. While KSCR achieves competitive results,\nrivaling state-of-the-art image-retrieval methods like HLoc across multiple\nbenchmarks, its performance is hindered when data samples are limited due to\nthe deep learning model's reliance on extensive data. This paper proposes a\nsolution to this challenge by introducing a pipeline for keypoint descriptor\nsynthesis using Neural Radiance Field (NeRF). 
By generating novel poses and\nfeeding them into a trained NeRF model to create new views, our approach\nenhances the KSCR's generalization capabilities in data-scarce environments.\nThe proposed system could significantly improve localization accuracy by up to\n50% and cost only a fraction of time for data synthesis. Furthermore, its\nmodular design allows for the integration of multiple NeRFs, offering a\nversatile and efficient solution for visual localization. The implementation is\npublicly available at: https://github.com/ais-lab/DescriptorSynthesis4Feat2Map.\n","authors":["Huy-Hoang Bui","Bach-Thuan Bui","Dinh-Tuan Tran","Joo-Ho Lee"],"pdf_url":"https://arxiv.org/pdf/2403.10297v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20210v2","updated":"2024-03-20T02:34:21Z","published":"2023-10-31T06:19:09Z","title":"UWFormer: Underwater Image Enhancement via a Semi-Supervised Multi-Scale\n Transformer","summary":" Underwater images often exhibit poor quality, distorted color balance and low\ncontrast due to the complex and intricate interplay of light, water, and\nobjects. Despite the significant contributions of previous underwater\nenhancement techniques, there exist several problems that demand further\nimprovement: (i) The current deep learning methods rely on Convolutional Neural\nNetworks (CNNs) that lack the multi-scale enhancement, and global perception\nfield is also limited. (ii) The scarcity of paired real-world underwater\ndatasets poses a significant challenge, and the utilization of synthetic image\npairs could lead to overfitting. To address the aforementioned problems, this\npaper introduces a Multi-scale Transformer-based Network called UWFormer for\nenhancing images at multiple frequencies via semi-supervised learning, in which\nwe propose a Nonlinear Frequency-aware Attention mechanism and a Multi-Scale\nFusion Feed-forward Network for low-frequency enhancement. Besides, we\nintroduce a special underwater semi-supervised training strategy, where we\npropose a Subaqueous Perceptual Loss function to generate reliable pseudo\nlabels. Experiments using full-reference and non-reference underwater\nbenchmarks demonstrate that our method outperforms state-of-the-art methods in\nterms of both quantity and visual quality.\n","authors":["Yingtie Lei","Weiwen Chen","Shenghong Luo","Ziyang Zhou","Mingxian Li","Chi-Man Pun"],"pdf_url":"https://arxiv.org/pdf/2310.20210v2.pdf","comment":"Accepted by IJCNN 2024"},{"id":"http://arxiv.org/abs/2403.13249v1","updated":"2024-03-20T02:21:44Z","published":"2024-03-20T02:21:44Z","title":"A Unified and General Framework for Continual Learning","summary":" Continual Learning (CL) focuses on learning from dynamic and changing data\ndistributions while retaining previously acquired knowledge. Various methods\nhave been developed to address the challenge of catastrophic forgetting,\nincluding regularization-based, Bayesian-based, and memory-replay-based\ntechniques. However, these methods lack a unified framework and common\nterminology for describing their approaches. This research aims to bridge this\ngap by introducing a comprehensive and overarching framework that encompasses\nand reconciles these existing methodologies. Notably, this new framework is\ncapable of encompassing established CL approaches as special instances within a\nunified and general optimization objective. An intriguing finding is that\ndespite their diverse origins, these methods share common mathematical\nstructures. 
This observation highlights the compatibility of these seemingly\ndistinct techniques, revealing their interconnectedness through a shared\nunderlying optimization objective. Moreover, the proposed general framework\nintroduces an innovative concept called refresh learning, specifically designed\nto enhance the CL performance. This novel approach draws inspiration from\nneuroscience, where the human brain often sheds outdated information to improve\nthe retention of crucial knowledge and facilitate the acquisition of new\ninformation. In essence, refresh learning operates by initially unlearning\ncurrent data and subsequently relearning it. It serves as a versatile plug-in\nthat seamlessly integrates with existing CL methods, offering an adaptable and\neffective enhancement to the learning process. Extensive experiments on CL\nbenchmarks and theoretical analysis demonstrate the effectiveness of the\nproposed refresh learning. Code is available at\n\\url{https://github.com/joey-wang123/CL-refresh-learning}.\n","authors":["Zhenyi Wang","Yan Li","Li Shen","Heng Huang"],"pdf_url":"https://arxiv.org/pdf/2403.13249v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.13248v1","updated":"2024-03-20T02:19:21Z","published":"2024-03-20T02:19:21Z","title":"Mora: Enabling Generalist Video Generation via A Multi-Agent Framework","summary":" Sora is the first large-scale generalist video generation model that garnered\nsignificant attention across society. Since its launch by OpenAI in February\n2024, no other video generation models have paralleled {Sora}'s performance or\nits capacity to support a broad spectrum of video generation tasks.\nAdditionally, there are only a few fully published video generation models,\nwith the majority being closed-source. To address this gap, this paper proposes\na new multi-agent framework Mora, which incorporates several advanced visual AI\nagents to replicate generalist video generation demonstrated by Sora. In\nparticular, Mora can utilize multiple visual agents and successfully mimic\nSora's video generation capabilities in various tasks, such as (1)\ntext-to-video generation, (2) text-conditional image-to-video generation, (3)\nextend generated videos, (4) video-to-video editing, (5) connect videos and (6)\nsimulate digital worlds. Our extensive experimental results show that Mora\nachieves performance that is proximate to that of Sora in various tasks.\nHowever, there exists an obvious performance gap between our work and Sora when\nassessed holistically. In summary, we hope this project can guide the future\ntrajectory of video generation through collaborative AI agents.\n","authors":["Zhengqing Yuan","Ruoxi Chen","Zhaoxu Li","Haolong Jia","Lifang He","Chi Wang","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2403.13248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12434v2","updated":"2024-03-20T02:04:21Z","published":"2024-03-19T04:47:56Z","title":"Human Mesh Recovery from Arbitrary Multi-view Images","summary":" Human mesh recovery from arbitrary multi-view images involves two\ncharacteristics: the arbitrary camera poses and arbitrary number of camera\nviews. Because of the variability, designing a unified framework to tackle this\ntask is challenging. The challenges can be summarized as the dilemma of being\nable to simultaneously estimate arbitrary camera poses and recover human mesh\nfrom arbitrary multi-view images while maintaining flexibility. 
To solve this\ndilemma, we propose a divide and conquer framework for Unified Human Mesh\nRecovery (U-HMR) from arbitrary multi-view images. In particular, U-HMR\nconsists of a decoupled structure and two main components: camera and body\ndecoupling (CBD), camera pose estimation (CPE), and arbitrary view fusion\n(AVF). As camera poses and human body mesh are independent of each other, CBD\nsplits the estimation of them into two sub-tasks for two individual\nsub-networks (\\ie, CPE and AVF) to handle respectively, thus the two sub-tasks\nare disentangled. In CPE, since each camera pose is unrelated to the others, we\nadopt a shared MLP to process all views in a parallel way. In AVF, in order to\nfuse multi-view information and make the fusion operation independent of the\nnumber of views, we introduce a transformer decoder with a SMPL parameters\nquery token to extract cross-view features for mesh recovery. To demonstrate\nthe efficacy and flexibility of the proposed framework and effect of each\ncomponent, we conduct extensive experiments on three public datasets:\nHuman3.6M, MPI-INF-3DHP, and TotalCapture.\n","authors":["Xiaoben Li","Mancheng Meng","Ziyan Wu","Terrence Chen","Fan Yang","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2403.12434v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02317v4","updated":"2024-03-20T02:03:52Z","published":"2024-01-04T15:34:44Z","title":"BA-SAM: Scalable Bias-Mode Attention Mask for Segment Anything Model","summary":" In this paper, we address the challenge of image resolution variation for the\nSegment Anything Model (SAM). SAM, known for its zero-shot generalizability,\nexhibits a performance degradation when faced with datasets with varying image\nsizes. Previous approaches tend to resize the image to a fixed size or adopt\nstructure modifications, hindering the preservation of SAM's rich prior\nknowledge. Besides, such task-specific tuning necessitates a complete\nretraining of the model, which is cost-expensive and unacceptable for\ndeployment in the downstream tasks. In this paper, we reformulate this issue as\na length extrapolation problem, where token sequence length varies while\nmaintaining a consistent patch size for images of different sizes. To this end,\nwe propose Scalable Bias-Mode Attention Mask (BA-SAM) to enhance SAM's\nadaptability to varying image resolutions while eliminating the need for\nstructure modifications. Firstly, we introduce a new scaling factor to ensure\nconsistent magnitude in the attention layer's dot product values when the token\nsequence length changes. Secondly, we present a bias-mode attention mask that\nallows each token to prioritize neighboring information, mitigating the impact\nof untrained distant information. Our BA-SAM demonstrates efficacy in two\nscenarios: zero-shot and fine-tuning. Extensive evaluation on diverse datasets,\nincluding DIS5K, DUTS, ISIC, COD10K, and COCO, reveals its ability to\nsignificantly mitigate performance degradation in the zero-shot setting and\nachieve state-of-the-art performance with minimal fine-tuning. Furthermore, we\npropose a generalized model and benchmark, showcasing BA-SAM's generalizability\nacross all four datasets simultaneously. 
Code is available at\nhttps://github.com/zongzi13545329/BA-SAM\n","authors":["Yiran Song","Qianyu Zhou","Xiangtai Li","Deng-Ping Fan","Xuequan Lu","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2401.02317v4.pdf","comment":"Accepted to IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR), 2024"},{"id":"http://arxiv.org/abs/2403.11625v2","updated":"2024-03-20T02:01:58Z","published":"2024-03-18T09:56:48Z","title":"GaussNav: Gaussian Splatting for Visual Navigation","summary":" In embodied vision, Instance ImageGoal Navigation (IIN) requires an agent to\nlocate a specific object depicted in a goal image within an unexplored\nenvironment. The primary difficulty of IIN stems from the necessity of\nrecognizing the target object across varying viewpoints and rejecting potential\ndistractors.\n Existing map-based navigation methods largely adopt the representation form\nof Bird's Eye View (BEV) maps, which, however, lack the representation of\ndetailed textures in a scene.\n To address the above issues, we propose a new Gaussian Splatting Navigation\n(abbreviated as GaussNav) framework for IIN task, which constructs a novel map\nrepresentation based on 3D Gaussian Splatting (3DGS).\n The proposed framework enables the agent to not only memorize the geometry\nand semantic information of the scene, but also retain the textural features of\nobjects.\n Our GaussNav framework demonstrates a significant leap in performance,\nevidenced by an increase in Success weighted by Path Length (SPL) from 0.252 to\n0.578 on the challenging Habitat-Matterport 3D (HM3D) dataset.\n Our code will be made publicly available.\n","authors":["Xiaohan Lei","Min Wang","Wengang Zhou","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2403.11625v2.pdf","comment":"conference"},{"id":"http://arxiv.org/abs/2403.13238v1","updated":"2024-03-20T01:59:43Z","published":"2024-03-20T01:59:43Z","title":"Beyond Skeletons: Integrative Latent Mapping for Coherent 4D Sequence\n Generation","summary":" Directly learning to model 4D content, including shape, color and motion, is\nchallenging. Existing methods depend on skeleton-based motion control and offer\nlimited continuity in detail. To address this, we propose a novel framework\nthat generates coherent 4D sequences with animation of 3D shapes under given\nconditions with dynamic evolution of shape and color over time through\nintegrative latent mapping. We first employ an integrative latent unified\nrepresentation to encode shape and color information of each detailed 3D\ngeometry frame. The proposed skeleton-free latent 4D sequence joint\nrepresentation allows us to leverage diffusion models in a low-dimensional\nspace to control the generation of 4D sequences. Finally, temporally coherent\n4D sequences are generated conforming well to the input images and text\nprompts. Extensive experiments on the ShapeNet, 3DBiCar and DeformingThings4D\ndatasets for several tasks demonstrate that our method effectively learns to\ngenerate quality 3D shapes with color and 4D mesh animations, improving over\nthe current state-of-the-art. 
Source code will be released.\n","authors":["Qitong Yang","Mingtao Feng","Zijie Wu","Shijie Sun","Weisheng Dong","Yaonan Wang","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2403.13238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12267v2","updated":"2024-03-20T01:46:13Z","published":"2024-03-18T21:32:58Z","title":"Data-Efficient Contrastive Language-Image Pretraining: Prioritizing Data\n Quality over Quantity","summary":" Contrastive Language-Image Pre-training (CLIP) on large-scale image-caption\ndatasets learns representations that can achieve remarkable zero-shot\ngeneralization. However, such models require a massive amount of pre-training\ndata. Improving the quality of the pre-training data has been shown to be much\nmore effective in improving CLIP's performance than increasing its volume.\nNevertheless, finding small subsets of training data that provably generalize\nthe best has remained an open question. In this work, we propose the first\ntheoretically rigorous data selection method for CLIP. We show that subsets\nthat closely preserve the cross-covariance of the images and captions of the\nfull data provably achieve a superior generalization performance. Our extensive\nexperiments on ConceptualCaptions3M and ConceptualCaptions12M demonstrate that\nsubsets found by \\method\\ achieve over 2.7x and 1.4x the accuracy of the next\nbest baseline on ImageNet and its shifted versions. Moreover, we show that our\nsubsets obtain 1.5x the average accuracy across 11 downstream datasets, of the\nnext best baseline. The code is available at:\nhttps://github.com/BigML-CS-UCLA/clipcov-data-efficient-clip.\n","authors":["Siddharth Joshi","Arnav Jain","Ali Payani","Baharan Mirzasoleiman"],"pdf_url":"https://arxiv.org/pdf/2403.12267v2.pdf","comment":"AISTATS 2024, Code:\n https://github.com/BigML-CS-UCLA/clipcov-data-efficient-clip"},{"id":"http://arxiv.org/abs/2403.11310v2","updated":"2024-03-20T01:34:35Z","published":"2024-03-17T19:10:07Z","title":"A Dual-Augmentor Framework for Domain Generalization in 3D Human Pose\n Estimation","summary":" 3D human pose data collected in controlled laboratory settings present\nchallenges for pose estimators that generalize across diverse scenarios. To\naddress this, domain generalization is employed. Current methodologies in\ndomain generalization for 3D human pose estimation typically utilize\nadversarial training to generate synthetic poses for training. Nonetheless,\nthese approaches exhibit several limitations. First, the lack of prior\ninformation about the target domain complicates the application of suitable\naugmentation through a single pose augmentor, affecting generalization on\ntarget domains. Moreover, adversarial training's discriminator tends to enforce\nsimilarity between source and synthesized poses, impeding the exploration of\nout-of-source distributions. Furthermore, the pose estimator's optimization is\nnot exposed to domain shifts, limiting its overall generalization ability.\n To address these limitations, we propose a novel framework featuring two pose\naugmentors: the weak and the strong augmentors. Our framework employs\ndifferential strategies for generation and discrimination processes,\nfacilitating the preservation of knowledge related to source poses and the\nexploration of out-of-source distributions without prior information about\ntarget poses. Besides, we leverage meta-optimization to simulate domain shifts\nin the optimization process of the pose estimator, thereby improving its\ngeneralization ability. 
Our proposed approach significantly outperforms\nexisting methods, as demonstrated through comprehensive experiments on various\nbenchmark datasets.Our code will be released at\n\\url{https://github.com/davidpengucf/DAF-DG}.\n","authors":["Qucheng Peng","Ce Zheng","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2403.11310v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11573v2","updated":"2024-03-20T01:13:48Z","published":"2024-03-18T08:50:04Z","title":"Just Add $100 More: Augmenting NeRF-based Pseudo-LiDAR Point Cloud for\n Resolving Class-imbalance Problem","summary":" Typical LiDAR-based 3D object detection models are trained in a supervised\nmanner with real-world data collection, which is often imbalanced over classes\n(or long-tailed). To deal with it, augmenting minority-class examples by\nsampling ground truth (GT) LiDAR points from a database and pasting them into a\nscene of interest is often used, but challenges still remain: inflexibility in\nlocating GT samples and limited sample diversity. In this work, we propose to\nleverage pseudo-LiDAR point clouds generated (at a low cost) from videos\ncapturing a surround view of miniatures or real-world objects of minor classes.\nOur method, called Pseudo Ground Truth Augmentation (PGT-Aug), consists of\nthree main steps: (i) volumetric 3D instance reconstruction using a 2D-to-3D\nview synthesis model, (ii) object-level domain alignment with LiDAR intensity\nestimation and (iii) a hybrid context-aware placement method from ground and\nmap information. We demonstrate the superiority and generality of our method\nthrough performance improvements in extensive experiments conducted on three\npopular benchmarks, i.e., nuScenes, KITTI, and Lyft, especially for the\ndatasets with large domain gaps captured by different LiDAR configurations. Our\ncode and data will be publicly available upon publication.\n","authors":["Mincheol Chang","Siyeong Lee","Jinkyu Kim","Namil Kim"],"pdf_url":"https://arxiv.org/pdf/2403.11573v2.pdf","comment":"28 pages, 12 figures, 11 tables"},{"id":"http://arxiv.org/abs/2311.16581v2","updated":"2024-03-20T00:59:44Z","published":"2023-11-28T07:55:25Z","title":"GeoScaler: Geometry and Rendering-Aware Downsampling of 3D Mesh Textures","summary":" High-resolution texture maps are necessary for representing real-world\nobjects accurately with 3D meshes. The large sizes of textures can bottleneck\nthe real-time rendering of high-quality virtual 3D scenes on devices having low\ncomputational budgets and limited memory. Downsampling the texture maps\ndirectly addresses the issue, albeit at the cost of visual fidelity.\nTraditionally, downsampling of texture maps is performed using methods like\nbicubic interpolation and the Lanczos algorithm. These methods ignore the\ngeometric layout of the mesh and its UV parametrization and also do not account\nfor the rendering process used to obtain the final visualization that the users\nwill experience. Towards filling these gaps, we introduce GeoScaler, which is a\nmethod of downsampling texture maps of 3D meshes while incorporating geometric\ncues, and by maximizing the visual fidelity of the rendered views of the\ntextured meshes. 
We show that the textures generated by GeoScaler deliver\nsignificantly better quality rendered images compared to those generated by\ntraditional downsampling methods\n","authors":["Sai Karthikey Pentapati","Anshul Rai","Arkady Ten","Chaitanya Atluru","Alan Bovik"],"pdf_url":"https://arxiv.org/pdf/2311.16581v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05892v4","updated":"2024-03-20T00:58:15Z","published":"2024-02-08T18:30:50Z","title":"Mamba-ND: Selective State Space Modeling for Multi-Dimensional Data","summary":" In recent years, Transformers have become the de-facto architecture for\nsequence modeling on text and a variety of multi-dimensional data, such as\nimages and video. However, the use of self-attention layers in a Transformer\nincurs prohibitive compute and memory complexity that scales quadratically\nw.r.t. the sequence length. A recent architecture, Mamba, based on state space\nmodels has been shown to achieve comparable performance for modeling text\nsequences, while scaling linearly with the sequence length. In this work, we\npresent Mamba-ND, a generalized design extending the Mamba architecture to\narbitrary multi-dimensional data. Our design alternatively unravels the input\ndata across different dimensions following row-major orderings. We provide a\nsystematic comparison of Mamba-ND with several other alternatives, based on\nprior multi-dimensional extensions such as Bi-directional LSTMs and S4ND.\nEmpirically, we show that Mamba-ND demonstrates performance competitive with\nthe state-of-the-art on a variety of multi-dimensional benchmarks, including\nImageNet-1K classification, HMDB-51 action recognition, and ERA5 weather\nforecasting.\n","authors":["Shufan Li","Harkanwar Singh","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2402.05892v4.pdf","comment":"22 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.13218v1","updated":"2024-03-20T00:37:19Z","published":"2024-03-20T00:37:19Z","title":"Self-Attention Based Semantic Decomposition in Vector Symbolic\n Architectures","summary":" Vector Symbolic Architectures (VSAs) have emerged as a novel framework for\nenabling interpretable machine learning algorithms equipped with the ability to\nreason and explain their decision processes. The basic idea is to represent\ndiscrete information through high dimensional random vectors. Complex data\nstructures can be built up with operations over vectors such as the \"binding\"\noperation involving element-wise vector multiplication, which associates data\ntogether. The reverse task of decomposing the associated elements is a\ncombinatorially hard task, with an exponentially large search space. The main\nalgorithm for performing this search is the resonator network, inspired by\nHopfield network-based memory search operations.\n In this work, we introduce a new variant of the resonator network, based on\nself-attention based update rules in the iterative search problem. This update\nrule, based on the Hopfield network with log-sum-exp energy function and\nnorm-bounded states, is shown to substantially improve the performance and rate\nof convergence. As a result, our algorithm enables a larger capacity for\nassociative memory, enabling applications in many tasks like perception based\npattern recognition, scene decomposition, and object reasoning. 
We substantiate\nour algorithm with a thorough evaluation and comparisons to baselines.\n","authors":["Calvin Yeung","Prathyush Poduval","Mohsen Imani"],"pdf_url":"https://arxiv.org/pdf/2403.13218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13214v1","updated":"2024-03-20T00:23:42Z","published":"2024-03-20T00:23:42Z","title":"Nellie: Automated organelle segmentation, tracking, and hierarchical\n feature extraction in 2D/3D live-cell microscopy","summary":" The analysis of dynamic organelles remains a formidable challenge, though key\nto understanding biological processes. We introduce Nellie, an automated and\nunbiased pipeline for segmentation, tracking, and feature extraction of diverse\nintracellular structures. Nellie adapts to image metadata, eliminating user\ninput. Nellie's preprocessing pipeline enhances structural contrast on multiple\nintracellular scales allowing for robust hierarchical segmentation of\nsub-organellar regions. Internal motion capture markers are generated and\ntracked via a radius-adaptive pattern matching scheme, and used as guides for\nsub-voxel flow interpolation. Nellie extracts a plethora of features at\nmultiple hierarchical levels for deep and customizable analysis. Nellie\nfeatures a Napari-based GUI that allows for code-free operation and\nvisualization, while its modular open-source codebase invites customization by\nexperienced users. We demonstrate Nellie's wide variety of use cases with two\nexamples: unmixing multiple organelles from a single channel using\nfeature-based classification and training an unsupervised graph autoencoder on\nmitochondrial multi-mesh graphs to quantify latent space embedding changes\nfollowing ionomycin treatment.\n","authors":["Austin E. Y. T. Lefebvre","Gabriel Sturm","Ting-Yu Lin","Emily Stoops","Magdalena Preciado Lopez","Benjamin Kaufmann-Malaga","Kayley Hake"],"pdf_url":"https://arxiv.org/pdf/2403.13214v1.pdf","comment":"for associated code, see https://github.com/aelefebv/nellie; 82\n pages, 5 main figures, 11 extended figures"},{"id":"http://arxiv.org/abs/2312.14115v2","updated":"2024-03-20T00:23:39Z","published":"2023-12-21T18:40:34Z","title":"LingoQA: Video Question Answering for Autonomous Driving","summary":" Autonomous driving has long faced a challenge with public acceptance due to\nthe lack of explainability in the decision-making process. Video\nquestion-answering (QA) in natural language provides the opportunity for\nbridging this gap. Nonetheless, evaluating the performance of Video QA models\nhas proved particularly tough due to the absence of comprehensive benchmarks.\nTo fill this gap, we introduce LingoQA, a benchmark specifically for autonomous\ndriving Video QA. The LingoQA trainable metric demonstrates a 0.95 Spearman\ncorrelation coefficient with human evaluations. We introduce a Video QA dataset\nof central London consisting of 419k samples that we release with the paper. 
We\nestablish a baseline vision-language model and run extensive ablation studies\nto understand its performance.\n","authors":["Ana-Maria Marcu","Long Chen","Jan Hünermann","Alice Karnsund","Benoit Hanotte","Prajwal Chidananda","Saurabh Nair","Vijay Badrinarayanan","Alex Kendall","Jamie Shotton","Elahe Arani","Oleg Sinavski"],"pdf_url":"https://arxiv.org/pdf/2312.14115v2.pdf","comment":"Benchmark and dataset are available at\n https://github.com/wayveai/LingoQA/"},{"id":"http://arxiv.org/abs/2212.02477v3","updated":"2024-03-20T00:20:34Z","published":"2022-12-05T18:37:41Z","title":"Malaria Parasitic Detection using a New Deep Boosted and Ensemble\n Learning Framework","summary":" Malaria is a potentially fatal plasmodium parasite injected by female\nanopheles mosquitoes that infect red blood cells and millions worldwide yearly.\nHowever, specialists' manual screening in clinical practice is laborious and\nprone to error. Therefore, a novel Deep Boosted and Ensemble Learning (DBEL)\nframework, comprising the stacking of new Boosted-BR-STM convolutional neural\nnetworks (CNN) and the ensemble ML classifiers, is developed to screen malaria\nparasite images. The proposed Boosted-BR-STM is based on a new\ndilated-convolutional block-based split transform merge (STM) and feature-map\nSqueezing-Boosting (SB) ideas. Moreover, the new STM block uses regional and\nboundary operations to learn the malaria parasite's homogeneity, heterogeneity,\nand boundary with patterns. Furthermore, the diverse boosted channels are\nattained by employing Transfer Learning-based new feature-map SB in STM blocks\nat the abstract, medium, and conclusion levels to learn minute intensity and\ntexture variation of the parasitic pattern. The proposed DBEL framework\nimplicates the stacking of prominent and diverse boosted channels and provides\nthe generated discriminative features of the developed Boosted-BR-STM to the\nensemble of ML classifiers. The proposed framework improves the discrimination\nability and generalization of ensemble learning. Moreover, the deep feature\nspaces of the developed Boosted-BR-STM and customized CNNs are fed into ML\nclassifiers for comparative analysis. The proposed DBEL framework outperforms\nthe existing techniques on the NIH malaria dataset that are enhanced using\ndiscrete wavelet transform to enrich feature space. The proposed DBEL framework\nachieved Accuracy (98.50%), Sensitivity (0.9920), F-score (0.9850), and AUC\n(0.997), which suggest it to be utilized for malaria parasite screening.\n","authors":["Saddam Hussain Khan","Tahani Jaser Alahmadi"],"pdf_url":"https://arxiv.org/pdf/2212.02477v3.pdf","comment":"26 pages, 10 figures, 9 Tables"},{"id":"http://arxiv.org/abs/2312.06071v2","updated":"2024-03-20T00:12:22Z","published":"2023-12-11T02:38:07Z","title":"Precipitation Downscaling with Spatiotemporal Video Diffusion","summary":" In climate science and meteorology, high-resolution local precipitation (rain\nand snowfall) predictions are limited by the computational costs of\nsimulation-based methods. Statistical downscaling, or super-resolution, is a\ncommon workaround where a low-resolution prediction is improved using\nstatistical approaches. Unlike traditional computer vision tasks, weather and\nclimate applications require capturing the accurate conditional distribution of\nhigh-resolution given low-resolution patterns to assure reliable ensemble\naverages and unbiased estimates of extreme events, such as heavy rain. 
This\nwork extends recent video diffusion models to precipitation super-resolution,\nemploying a deterministic downscaler followed by a temporally-conditioned\ndiffusion model to capture noise characteristics and high-frequency patterns.\nWe test our approach on FV3GFS output, an established large-scale global\natmosphere model, and compare it against five state-of-the-art baselines. Our\nanalysis, capturing CRPS, MSE, precipitation distributions, and qualitative\naspects using California and the Himalayas as examples, establishes our method\nas a new standard for data-driven precipitation downscaling.\n","authors":["Prakhar Srivastava","Ruihan Yang","Gavin Kerrigan","Gideon Dresdner","Jeremy McGibbon","Christopher Bretherton","Stephan Mandt"],"pdf_url":"https://arxiv.org/pdf/2312.06071v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15476v1","updated":"2024-03-20T17:29:58Z","published":"2024-03-20T17:29:58Z","title":"Learning to Infer Generative Template Programs for Visual Concepts","summary":" People grasp flexible visual concepts from a few examples. We explore a\nneurosymbolic system that learns how to infer programs that capture visual\nconcepts in a domain-general fashion. We introduce Template Programs:\nprogrammatic expressions from a domain-specific language that specify\nstructural and parametric patterns common to an input concept. Our framework\nsupports multiple concept-related tasks, including few-shot generation and\nco-segmentation through parsing. We develop a learning paradigm that allows us\nto train networks that infer Template Programs directly from visual datasets\nthat contain concept groupings. We run experiments across multiple visual\ndomains: 2D layouts, Omniglot characters, and 3D shapes. We find that our\nmethod outperforms task-specific alternatives, and performs competitively\nagainst domain-specific approaches for the limited domains where they exist.\n","authors":["R. Kenny Jones","Siddhartha Chaudhuri","Daniel Ritchie"],"pdf_url":"https://arxiv.org/pdf/2403.15476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15474v1","updated":"2024-03-20T16:25:49Z","published":"2024-03-20T16:25:49Z","title":"EC-IoU: Orienting Safety for Object Detectors via Ego-Centric\n Intersection-over-Union","summary":" This paper presents safety-oriented object detection via a novel Ego-Centric\nIntersection-over-Union (EC-IoU) measure, addressing practical concerns when\napplying state-of-the-art learning-based perception models in safety-critical\ndomains such as autonomous driving. Concretely, we propose a weighting\nmechanism to refine the widely used IoU measure, allowing it to assign a higher\nscore to a prediction that covers closer points of a ground-truth object from\nthe ego agent's perspective. The proposed EC-IoU measure can be used in typical\nevaluation processes to select object detectors with higher safety-related\nperformance for downstream tasks. It can also be integrated into common loss\nfunctions for model fine-tuning. 
While geared towards safety, our experiment\nwith the KITTI dataset demonstrates the performance of a model trained on\nEC-IoU can be better than that of a variant trained on IoU in terms of mean\nAverage Precision as well.\n","authors":["Brian Hsuan-Cheng Liao","Chih-Hong Cheng","Hasan Esen","Alois Knoll"],"pdf_url":"https://arxiv.org/pdf/2403.15474v1.pdf","comment":"8 pages (IEEE double column format), 7 figures, 2 tables, submitted\n to IROS 2024"},{"id":"http://arxiv.org/abs/2403.15466v1","updated":"2024-03-20T03:42:15Z","published":"2024-03-20T03:42:15Z","title":"Using Super-Resolution Imaging for Recognition of Low-Resolution Blurred\n License Plates: A Comparative Study of Real-ESRGAN, A-ESRGAN, and StarSRGAN","summary":" With the robust development of technology, license plate recognition\ntechnology can now be properly applied in various scenarios, such as road\nmonitoring, tracking of stolen vehicles, detection at parking lot entrances and\nexits, and so on. However, the precondition for these applications to function\nnormally is that the license plate must be 'clear' enough to be recognized by\nthe system with the correct license plate number. If the license plate becomes\nblurred due to some external factors, then the accuracy of recognition will be\ngreatly reduced. Although there are many road surveillance cameras in Taiwan,\nthe quality of most cameras is not good, often leading to the inability to\nrecognize license plate numbers due to low photo resolution. Therefore, this\nstudy focuses on using super-resolution technology to process blurred license\nplates. This study will mainly fine-tune three super-resolution models:\nReal-ESRGAN, A-ESRGAN, and StarSRGAN, and compare their effectiveness in\nenhancing the resolution of license plate photos and enabling accurate license\nplate recognition. By comparing different super-resolution models, it is hoped\nto find the most suitable model for this task, providing valuable references\nfor future researchers.\n","authors":["Ching-Hsiang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.15466v1.pdf","comment":"Master's thesis"}]},"2024-03-21T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.14628v1","updated":"2024-03-21T17:59:59Z","published":"2024-03-21T17:59:59Z","title":"Zero-Shot Multi-Object Shape Completion","summary":" We present a 3D shape completion method that recovers the complete geometry\nof multiple objects in complex scenes from a single RGB-D image. Despite\nnotable advancements in single object 3D shape completion, high-quality\nreconstructions in highly cluttered real-world multi-object scenes remains a\nchallenge. To address this issue, we propose OctMAE, an architecture that\nleverages an Octree U-Net and a latent 3D MAE to achieve high-quality and near\nreal-time multi-object shape completion through both local and global geometric\nreasoning. Because a na\\\"ive 3D MAE can be computationally intractable and\nmemory intensive even in the latent space, we introduce a novel occlusion\nmasking strategy and adopt 3D rotary embeddings, which significantly improves\nthe runtime and shape completion quality. To generalize to a wide range of\nobjects in diverse scenes, we create a large-scale photorealistic dataset,\nfeaturing a diverse set of 12K 3D object models from the Objaverse dataset\nwhich are rendered in multi-object scenes with physics-based positioning. 
Our\nmethod outperforms the current state-of-the-art on both synthetic and\nreal-world datasets and demonstrates a strong zero-shot capability.\n","authors":["Shun Iwase","Katherine Liu","Vitor Guizilini","Adrien Gaidon","Kris Kitani","Rares Ambrus","Sergey Zakharov"],"pdf_url":"https://arxiv.org/pdf/2403.14628v1.pdf","comment":"21 pages, 8 figues"},{"id":"http://arxiv.org/abs/2403.14627v1","updated":"2024-03-21T17:59:58Z","published":"2024-03-21T17:59:58Z","title":"MVSplat: Efficient 3D Gaussian Splatting from Sparse Multi-View Images","summary":" We propose MVSplat, an efficient feed-forward 3D Gaussian Splatting model\nlearned from sparse multi-view images. To accurately localize the Gaussian\ncenters, we propose to build a cost volume representation via plane sweeping in\nthe 3D space, where the cross-view feature similarities stored in the cost\nvolume can provide valuable geometry cues to the estimation of depth. We learn\nthe Gaussian primitives' opacities, covariances, and spherical harmonics\ncoefficients jointly with the Gaussian centers while only relying on\nphotometric supervision. We demonstrate the importance of the cost volume\nrepresentation in learning feed-forward Gaussian Splatting models via extensive\nexperimental evaluations. On the large-scale RealEstate10K and ACID benchmarks,\nour model achieves state-of-the-art performance with the fastest feed-forward\ninference speed (22 fps). Compared to the latest state-of-the-art method\npixelSplat, our model uses $10\\times $ fewer parameters and infers more than\n$2\\times$ faster while providing higher appearance and geometry quality as well\nas better cross-dataset generalization.\n","authors":["Yuedong Chen","Haofei Xu","Chuanxia Zheng","Bohan Zhuang","Marc Pollefeys","Andreas Geiger","Tat-Jen Cham","Jianfei Cai"],"pdf_url":"https://arxiv.org/pdf/2403.14627v1.pdf","comment":"Project page: https://donydchen.github.io/mvsplat Code:\n https://github.com/donydchen/mvsplat"},{"id":"http://arxiv.org/abs/2403.14625v1","updated":"2024-03-21T17:59:55Z","published":"2024-03-21T17:59:55Z","title":"LiFT: A Surprisingly Simple Lightweight Feature Transform for Dense ViT\n Descriptors","summary":" We present a simple self-supervised method to enhance the performance of ViT\nfeatures for dense downstream tasks. Our Lightweight Feature Transform (LiFT)\nis a straightforward and compact postprocessing network that can be applied to\nenhance the features of any pre-trained ViT backbone. LiFT is fast and easy to\ntrain with a self-supervised objective, and it boosts the density of ViT\nfeatures for minimal extra inference cost. Furthermore, we demonstrate that\nLiFT can be applied with approaches that use additional task-specific\ndownstream modules, as we integrate LiFT with ViTDet for COCO detection and\nsegmentation. Despite the simplicity of LiFT, we find that it is not simply\nlearning a more complex version of bilinear interpolation. Instead, our LiFT\ntraining protocol leads to several desirable emergent properties that benefit\nViT features in dense downstream tasks. This includes greater scale invariance\nfor features, and better object boundary maps. By simply training LiFT for a\nfew epochs, we show improved performance on keypoint correspondence, detection,\nsegmentation, and object discovery tasks. Overall, LiFT provides an easy way to\nunlock the benefits of denser feature arrays for a fraction of the\ncomputational cost. 
For more details, refer to our project page at\nhttps://www.cs.umd.edu/~sakshams/LiFT/.\n","authors":["Saksham Suri","Matthew Walmer","Kamal Gupta","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2403.14625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14626v1","updated":"2024-03-21T17:59:55Z","published":"2024-03-21T17:59:55Z","title":"ODTFormer: Efficient Obstacle Detection and Tracking with Stereo Cameras\n Based on Transformer","summary":" Obstacle detection and tracking represent a critical component in robot\nautonomous navigation. In this paper, we propose ODTFormer, a Transformer-based\nmodel to address both obstacle detection and tracking problems. For the\ndetection task, our approach leverages deformable attention to construct a 3D\ncost volume, which is decoded progressively in the form of voxel occupancy\ngrids. We further track the obstacles by matching the voxels between\nconsecutive frames. The entire model can be optimized in an end-to-end manner.\nThrough extensive experiments on DrivingStereo and KITTI benchmarks, our model\nachieves state-of-the-art performance in the obstacle detection task. We also\nreport comparable accuracy to state-of-the-art obstacle tracking models while\nrequiring only a fraction of their computation cost, typically ten-fold to\ntwenty-fold less. The code and model weights will be publicly released.\n","authors":["Tianye Ding","Hongyu Li","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.14626v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2403.14624v1","updated":"2024-03-21T17:59:50Z","published":"2024-03-21T17:59:50Z","title":"MathVerse: Does Your Multi-modal LLM Truly See the Diagrams in Visual\n Math Problems?","summary":" The remarkable progress of Multi-modal Large Language Models (MLLMs) has\ngarnered unparalleled attention, due to their superior performance in visual\ncontexts. However, their capabilities in visual math problem-solving remain\ninsufficiently evaluated and understood. We investigate current benchmarks to\nincorporate excessive visual content within textual questions, which\npotentially assist MLLMs in deducing answers without truly interpreting the\ninput diagrams. To this end, we introduce MathVerse, an all-around visual math\nbenchmark designed for an equitable and in-depth evaluation of MLLMs. We\nmeticulously collect 2,612 high-quality, multi-subject math problems with\ndiagrams from publicly available sources. Each problem is then transformed by\nhuman annotators into six distinct versions, each offering varying degrees of\ninformation content in multi-modality, contributing to 15K test samples in\ntotal. This approach allows MathVerse to comprehensively assess whether and how\nmuch MLLMs can truly understand the visual diagrams for mathematical reasoning.\nIn addition, we propose a Chain-of-Thought (CoT) evaluation strategy for a\nfine-grained assessment of the output answers. Rather than naively judging True\nor False, we employ GPT-4(V) to adaptively extract crucial reasoning steps, and\nthen score each step with detailed error analysis, which can reveal the\nintermediate CoT reasoning quality by MLLMs. We hope the MathVerse benchmark\nmay provide unique insights to guide the future development of MLLMs. 
Project\npage: https://mathverse-cuhk.github.io\n","authors":["Renrui Zhang","Dongzhi Jiang","Yichi Zhang","Haokun Lin","Ziyu Guo","Pengshuo Qiu","Aojun Zhou","Pan Lu","Kai-Wei Chang","Peng Gao","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2403.14624v1.pdf","comment":"46 Pages, Work in Progress, Benchmark Project Page:\n https://mathverse-cuhk.github.io"},{"id":"http://arxiv.org/abs/2403.14623v1","updated":"2024-03-21T17:59:41Z","published":"2024-03-21T17:59:41Z","title":"Simplified Diffusion Schrödinger Bridge","summary":" This paper introduces a novel theoretical simplification of the Diffusion\nSchr\\\"odinger Bridge (DSB) that facilitates its unification with Score-based\nGenerative Models (SGMs), addressing the limitations of DSB in complex data\ngeneration and enabling faster convergence and enhanced performance. By\nemploying SGMs as an initial solution for DSB, our approach capitalizes on the\nstrengths of both frameworks, ensuring a more efficient training process and\nimproving the performance of SGM. We also propose a reparameterization\ntechnique that, despite theoretical approximations, practically improves the\nnetwork's fitting capabilities. Our extensive experimental evaluations confirm\nthe effectiveness of the simplified DSB, demonstrating its significant\nimprovements. We believe the contributions of this work pave the way for\nadvanced generative modeling. The code is available at\nhttps://github.com/tzco/Simplified-Diffusion-Schrodinger-Bridge.\n","authors":["Zhicong Tang","Tiankai Hang","Shuyang Gu","Dong Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2403.14623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14622v1","updated":"2024-03-21T17:59:35Z","published":"2024-03-21T17:59:35Z","title":"Language Repository for Long Video Understanding","summary":" Language has become a prominent modality in computer vision with the rise of\nmulti-modal LLMs. Despite supporting long context-lengths, their effectiveness\nin handling long-term information gradually declines with input length. This\nbecomes critical, especially in applications such as long-form video\nunderstanding. In this paper, we introduce a Language Repository (LangRepo) for\nLLMs, that maintains concise and structured information as an interpretable\n(i.e., all-textual) representation. Our repository is updated iteratively based\non multi-scale video chunks. We introduce write and read operations that focus\non pruning redundancies in text, and extracting information at various temporal\nscales. The proposed framework is evaluated on zero-shot visual\nquestion-answering benchmarks including EgoSchema, NExT-QA, IntentQA and\nNExT-GQA, showing state-of-the-art performance at its scale. Our code is\navailable at https://github.com/kkahatapitiya/LangRepo.\n","authors":["Kumara Kahatapitiya","Kanchana Ranasinghe","Jongwoo Park","Michael S. Ryoo"],"pdf_url":"https://arxiv.org/pdf/2403.14622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14621v1","updated":"2024-03-21T17:59:34Z","published":"2024-03-21T17:59:34Z","title":"GRM: Large Gaussian Reconstruction Model for Efficient 3D Reconstruction\n and Generation","summary":" We introduce GRM, a large-scale reconstructor capable of recovering a 3D\nasset from sparse-view images in around 0.1s. 
GRM is a feed-forward\ntransformer-based model that efficiently incorporates multi-view information to\ntranslate the input pixels into pixel-aligned Gaussians, which are unprojected\nto create a set of densely distributed 3D Gaussians representing a scene.\nTogether, our transformer architecture and the use of 3D Gaussians unlock a\nscalable and efficient reconstruction framework. Extensive experimental results\ndemonstrate the superiority of our method over alternatives regarding both\nreconstruction quality and efficiency. We also showcase the potential of GRM in\ngenerative tasks, i.e., text-to-3D and image-to-3D, by integrating it with\nexisting multi-view diffusion models. Our project website is at:\nhttps://justimyhxu.github.io/projects/grm/.\n","authors":["Yinghao Xu","Zifan Shi","Wang Yifan","Hansheng Chen","Ceyuan Yang","Sida Peng","Yujun Shen","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2403.14621v1.pdf","comment":"Project page: https://justimyhxu.github.io/projects/grm/ Code:\n https://github.com/justimyhxu/GRM"},{"id":"http://arxiv.org/abs/2403.14619v1","updated":"2024-03-21T17:59:16Z","published":"2024-03-21T17:59:16Z","title":"ClusteringSDF: Self-Organized Neural Implicit Surfaces for 3D\n Decomposition","summary":" 3D decomposition/segmentation still remains a challenge as large-scale 3D\nannotated data is not readily available. Contemporary approaches typically\nleverage 2D machine-generated segments, integrating them for 3D consistency.\nWhile the majority of these methods are based on NeRFs, they face a potential\nweakness that the instance/semantic embedding features derive from independent\nMLPs, thus preventing the segmentation network from learning the geometric\ndetails of the objects directly through radiance and density. In this paper, we\npropose ClusteringSDF, a novel approach to achieve both segmentation and\nreconstruction in 3D via the neural implicit surface representation,\nspecifically the Signed Distance Function (SDF), where the segmentation rendering\nis directly integrated with the volume rendering of neural implicit surfaces.\nAlthough based on ObjectSDF++, ClusteringSDF no longer requires the\nground-truth segments for supervision while maintaining the capability of\nreconstructing individual object surfaces, relying purely on the noisy and\ninconsistent labels from pre-trained models. As the core of ClusteringSDF, we\nintroduce a highly efficient clustering mechanism for lifting the 2D labels to 3D,\nand the experimental results on the challenging scenes from ScanNet and Replica\ndatasets show that ClusteringSDF can achieve competitive performance compared\nagainst the state-of-the-art with significantly reduced training time.\n","authors":["Tianhao Wu","Chuanxia Zheng","Tat-Jen Cham","Qianyi Wu"],"pdf_url":"https://arxiv.org/pdf/2403.14619v1.pdf","comment":"Project Page: https://sm0kywu.github.io/ClusteringSDF/"},{"id":"http://arxiv.org/abs/2403.14617v1","updated":"2024-03-21T17:59:03Z","published":"2024-03-21T17:59:03Z","title":"Videoshop: Localized Semantic Video Editing with Noise-Extrapolated\n Diffusion Inversion","summary":" We introduce Videoshop, a training-free video editing algorithm for localized\nsemantic edits. Videoshop allows users to use any editing software, including\nPhotoshop and generative inpainting, to modify the first frame; it\nautomatically propagates those changes, with semantic, spatial, and temporally\nconsistent motion, to the remaining frames. 
Unlike existing methods that enable\nedits only through imprecise textual instructions, Videoshop allows users to\nadd or remove objects, semantically change objects, insert stock photos into\nvideos, etc. with fine-grained control over locations and appearance. We\nachieve this through image-based video editing by inverting latents with noise\nextrapolation, from which we generate videos conditioned on the edited image.\nVideoshop produces higher quality edits against 6 baselines on 2 editing\nbenchmarks using 10 evaluation metrics.\n","authors":["Xiang Fan","Anand Bhattad","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2403.14617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14616v1","updated":"2024-03-21T17:58:56Z","published":"2024-03-21T17:58:56Z","title":"Hierarchical Text-to-Vision Self Supervised Alignment for Improved\n Histopathology Representation Learning","summary":" Self-supervised representation learning has been highly promising for\nhistopathology image analysis with numerous approaches leveraging their\npatient-slide-patch hierarchy to learn better representations. In this paper,\nwe explore how the combination of domain specific natural language information\nwith such hierarchical visual representations can benefit rich representation\nlearning for medical image tasks. Building on automated language description\ngeneration for features visible in histopathology images, we present a novel\nlanguage-tied self-supervised learning framework, Hierarchical Language-tied\nSelf-Supervision (HLSS) for histopathology images. We explore contrastive\nobjectives and granular language description based text alignment at multiple\nhierarchies to inject language modality information into the visual\nrepresentations. Our resulting model achieves state-of-the-art performance on\ntwo medical imaging benchmarks, OpenSRH and TCGA datasets. Our framework also\nprovides better interpretability with our language aligned representation\nspace. Code is available at https://github.com/Hasindri/HLSS.\n","authors":["Hasindri Watawana","Kanchana Ranasinghe","Tariq Mahmood","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.14616v1.pdf","comment":"13 pages and 5 figures"},{"id":"http://arxiv.org/abs/2403.14614v1","updated":"2024-03-21T17:58:14Z","published":"2024-03-21T17:58:14Z","title":"AdaIR: Adaptive All-in-One Image Restoration via Frequency Mining and\n Modulation","summary":" In the image acquisition process, various forms of degradation, including\nnoise, haze, and rain, are frequently introduced. These degradations typically\narise from the inherent limitations of cameras or unfavorable ambient\nconditions. To recover clean images from degraded versions, numerous\nspecialized restoration methods have been developed, each targeting a specific\ntype of degradation. Recently, all-in-one algorithms have garnered significant\nattention by addressing different types of degradations within a single model\nwithout requiring prior information of the input degradation type. However,\nthese methods purely operate in the spatial domain and do not delve into the\ndistinct frequency variations inherent to different degradation types. To\naddress this gap, we propose an adaptive all-in-one image restoration network\nbased on frequency mining and modulation. 
Our approach is motivated by the\nobservation that different degradation types impact the image content on\ndifferent frequency subbands, thereby requiring different treatments for each\nrestoration task. Specifically, we first mine low- and high-frequency\ninformation from the input features, guided by the adaptively decoupled spectra\nof the degraded image. The extracted features are then modulated by a\nbidirectional operator to facilitate interactions between different frequency\ncomponents. Finally, the modulated features are merged into the original input\nfor a progressively guided restoration. With this approach, the model achieves\nadaptive reconstruction by accentuating the informative frequency subbands\naccording to different input degradations. Extensive experiments demonstrate\nthat the proposed method achieves state-of-the-art performance on different\nimage restoration tasks, including denoising, dehazing, deraining, motion\ndeblurring, and low-light image enhancement. Our code is available at\nhttps://github.com/c-yn/AdaIR.\n","authors":["Yuning Cui","Syed Waqas Zamir","Salman Khan","Alois Knoll","Mubarak Shah","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.14614v1.pdf","comment":"28 pages,15 figures"},{"id":"http://arxiv.org/abs/2403.14613v1","updated":"2024-03-21T17:58:04Z","published":"2024-03-21T17:58:04Z","title":"DreamReward: Text-to-3D Generation with Human Preference","summary":" 3D content creation from text prompts has shown remarkable success recently.\nHowever, current text-to-3D methods often generate 3D results that do not align\nwell with human preferences. In this paper, we present a comprehensive\nframework, coined DreamReward, to learn and improve text-to-3D models from\nhuman preference feedback. To begin with, we collect 25k expert comparisons\nbased on a systematic annotation pipeline including rating and ranking. Then,\nwe build Reward3D -- the first general-purpose text-to-3D human preference\nreward model to effectively encode human preferences. Building upon the 3D\nreward model, we finally perform theoretical analysis and present the Reward3D\nFeedback Learning (DreamFL), a direct tuning algorithm to optimize the\nmulti-view diffusion models with a redefined scorer. Grounded by theoretical\nproof and extensive experiment comparisons, our DreamReward successfully\ngenerates high-fidelity and 3D consistent results with significant boosts in\nprompt alignment with human intention. Our results demonstrate the great\npotential for learning from human feedback to improve text-to-3D models.\n","authors":["Junliang Ye","Fangfu Liu","Qixiu Li","Zhengyi Wang","Yikai Wang","Xinzhou Wang","Yueqi Duan","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.14613v1.pdf","comment":"Project page: https://jamesyjl.github.io/DreamReward"},{"id":"http://arxiv.org/abs/2403.14611v1","updated":"2024-03-21T17:57:31Z","published":"2024-03-21T17:57:31Z","title":"Explorative Inbetweening of Time and Space","summary":" We introduce bounded generation as a generalized task to control video\ngeneration to synthesize arbitrary camera and subject motion based only on a\ngiven start and end frame. Our objective is to fully leverage the inherent\ngeneralization capability of an image-to-video model without additional\ntraining or fine-tuning of the original model. 
This is achieved through the\nproposed new sampling strategy, which we call Time Reversal Fusion, that fuses\nthe temporally forward and backward denoising paths conditioned on the start\nand end frame, respectively. The fused path results in a video that smoothly\nconnects the two frames, generating inbetweening of faithful subject motion,\nnovel views of static scenes, and seamless video looping when the two bounding\nframes are identical. We curate a diverse evaluation dataset of image pairs and\ncompare against the closest existing methods. We find that Time Reversal Fusion\noutperforms related work on all subtasks, exhibiting the ability to generate\ncomplex motions and 3D-consistent views guided by bounded frames. See project\npage at https://time-reversal.github.io.\n","authors":["Haiwen Feng","Zheng Ding","Zhihao Xia","Simon Niklaus","Victoria Abrevaya","Michael J. Black","Xuaner Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.14611v1.pdf","comment":"project page at https://time-reversal.github.io"},{"id":"http://arxiv.org/abs/2403.14610v1","updated":"2024-03-21T17:57:03Z","published":"2024-03-21T17:57:03Z","title":"T-Rex2: Towards Generic Object Detection via Text-Visual Prompt Synergy","summary":" We present T-Rex2, a highly practical model for open-set object detection.\nPrevious open-set object detection methods relying on text prompts effectively\nencapsulate the abstract concept of common objects, but struggle with rare or\ncomplex object representation due to data scarcity and descriptive limitations.\nConversely, visual prompts excel in depicting novel objects through concrete\nvisual examples, but fall short in conveying the abstract concept of objects as\neffectively as text prompts. Recognizing the complementary strengths and\nweaknesses of both text and visual prompts, we introduce T-Rex2 that synergizes\nboth prompts within a single model through contrastive learning. T-Rex2 accepts\ninputs in diverse formats, including text prompts, visual prompts, and the\ncombination of both, so that it can handle different scenarios by switching\nbetween the two prompt modalities. Comprehensive experiments demonstrate that\nT-Rex2 exhibits remarkable zero-shot object detection capabilities across a\nwide spectrum of scenarios. We show that text prompts and visual prompts can\nbenefit from each other within the synergy, which is essential to cover massive\nand complicated real-world scenarios and pave the way towards generic object\ndetection. Model API is now available at\n\\url{https://github.com/IDEA-Research/T-Rex}.\n","authors":["Qing Jiang","Feng Li","Zhaoyang Zeng","Tianhe Ren","Shilong Liu","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.14610v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2403.14602v1","updated":"2024-03-21T17:52:08Z","published":"2024-03-21T17:52:08Z","title":"ReNoise: Real Image Inversion Through Iterative Noising","summary":" Recent advancements in text-guided diffusion models have unlocked powerful\nimage manipulation capabilities. However, applying these methods to real images\nnecessitates the inversion of the images into the domain of the pretrained\ndiffusion model. Achieving faithful inversion remains a challenge, particularly\nfor more recent models trained to generate images with a small number of\ndenoising steps. In this work, we introduce an inversion method with a high\nquality-to-operation ratio, enhancing reconstruction accuracy without\nincreasing the number of operations. 
Building on reversing the diffusion\nsampling process, our method employs an iterative renoising mechanism at each\ninversion sampling step. This mechanism refines the approximation of a\npredicted point along the forward diffusion trajectory, by iteratively applying\nthe pretrained diffusion model, and averaging these predictions. We evaluate\nthe performance of our ReNoise technique using various sampling algorithms and\nmodels, including recent accelerated diffusion models. Through comprehensive\nevaluations and comparisons, we show its effectiveness in terms of both\naccuracy and speed. Furthermore, we confirm that our method preserves\neditability by demonstrating text-driven image editing on real images.\n","authors":["Daniel Garibi","Or Patashnik","Andrey Voynov","Hadar Averbuch-Elor","Daniel Cohen-Or"],"pdf_url":"https://arxiv.org/pdf/2403.14602v1.pdf","comment":"project page at: https://garibida.github.io/ReNoise-Inversion/"},{"id":"http://arxiv.org/abs/2403.14599v1","updated":"2024-03-21T17:51:01Z","published":"2024-03-21T17:51:01Z","title":"MyVLM: Personalizing VLMs for User-Specific Queries","summary":" Recent large-scale vision-language models (VLMs) have demonstrated remarkable\ncapabilities in understanding and generating textual descriptions for visual\ncontent. However, these models lack an understanding of user-specific concepts.\nIn this work, we take a first step toward the personalization of VLMs, enabling\nthem to learn and reason over user-provided concepts. For example, we explore\nwhether these models can learn to recognize you in an image and communicate\nwhat you are doing, tailoring the model to reflect your personal experiences\nand relationships. To effectively recognize a variety of user-specific\nconcepts, we augment the VLM with external concept heads that function as\ntoggles for the model, enabling the VLM to identify the presence of specific\ntarget concepts in a given image. Having recognized the concept, we learn a new\nconcept embedding in the intermediate feature space of the VLM. This embedding\nis tasked with guiding the language model to naturally integrate the target\nconcept in its generated response. We apply our technique to BLIP-2 and LLaVA\nfor personalized image captioning and further show its applicability for\npersonalized visual question-answering. Our experiments demonstrate our ability\nto generalize to unseen images of learned concepts while preserving the model\nbehavior on unrelated inputs.\n","authors":["Yuval Alaluf","Elad Richardson","Sergey Tulyakov","Kfir Aberman","Daniel Cohen-Or"],"pdf_url":"https://arxiv.org/pdf/2403.14599v1.pdf","comment":"Project page: https://snap-research.github.io/MyVLM/"},{"id":"http://arxiv.org/abs/2403.14598v1","updated":"2024-03-21T17:50:47Z","published":"2024-03-21T17:50:47Z","title":"PSALM: Pixelwise SegmentAtion with Large Multi-Modal Model","summary":" PSALM is a powerful extension of the Large Multi-modal Model (LMM) to address\nthe segmentation task challenges. To overcome the limitation of the LMM being\nlimited to textual output, PSALM incorporates a mask decoder and a\nwell-designed input schema to handle a variety of segmentation tasks. This\nschema includes images, task instructions, conditional prompts, and mask\ntokens, which enable the model to generate and classify segmentation masks\neffectively. The flexible design of PSALM supports joint training across\nmultiple datasets and tasks, leading to improved performance and task\ngeneralization. 
PSALM achieves superior results on several benchmarks, such as\nRefCOCO/RefCOCO+/RefCOCOg, COCO Panoptic Segmentation, and COCO-Interactive,\nand further exhibits zero-shot capabilities on unseen tasks, such as\nopen-vocabulary segmentation, generalized referring expression segmentation and\nvideo object segmentation, making a significant step towards a GPT moment in\ncomputer vision. Through extensive experiments, PSALM demonstrates its\npotential to transform the domain of image segmentation, leveraging the robust\nvisual understanding capabilities of LMMs as seen in natural language\nprocessing. Code and models are available at https://github.com/zamling/PSALM.\n","authors":["Zheng Zhang","Yeyao Ma","Enming Zhang","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2403.14598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14594v1","updated":"2024-03-21T17:49:26Z","published":"2024-03-21T17:49:26Z","title":"VXP: Voxel-Cross-Pixel Large-scale Image-LiDAR Place Recognition","summary":" Recent works on the global place recognition treat the task as a retrieval\nproblem, where an off-the-shelf global descriptor is commonly designed in\nimage-based and LiDAR-based modalities. However, it is non-trivial to perform\naccurate image-LiDAR global place recognition since extracting consistent and\nrobust global descriptors from different domains (2D images and 3D point\nclouds) is challenging. To address this issue, we propose a novel\nVoxel-Cross-Pixel (VXP) approach, which establishes voxel and pixel\ncorrespondences in a self-supervised manner and brings them into a shared\nfeature space. Specifically, VXP is trained in a two-stage manner that first\nexplicitly exploits local feature correspondences and enforces similarity of\nglobal descriptors. Extensive experiments on the three benchmarks (Oxford\nRobotCar, ViViD++ and KITTI) demonstrate our method surpasses the\nstate-of-the-art cross-modal retrieval by a large margin.\n","authors":["Yun-Jin Li","Mariia Gladkova","Yan Xia","Rui Wang","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2403.14594v1.pdf","comment":"Project page https://yunjinli.github.io/projects-vxp/"},{"id":"http://arxiv.org/abs/2402.19150v2","updated":"2024-03-21T17:26:47Z","published":"2024-02-29T13:31:56Z","title":"Unveiling Typographic Deceptions: Insights of the Typographic\n Vulnerability in Large Vision-Language Model","summary":" Large Vision-Language Models (LVLMs) rely on vision encoders and Large\nLanguage Models (LLMs) to exhibit remarkable capabilities on various\nmulti-modal tasks in the joint space of vision and language. However, the\nTypographic Attack, which disrupts vision-language models (VLMs) such as\nContrastive Language-Image Pretraining (CLIP), has also been expected to be a\nsecurity threat to LVLMs. Firstly, we verify typographic attacks on current\nwell-known commercial and open-source LVLMs and uncover the widespread\nexistence of this threat. Secondly, to better assess this vulnerability, we\npropose the most comprehensive and largest-scale Typographic Dataset to date.\nThe Typographic Dataset not only considers the evaluation of typographic\nattacks under various multi-modal tasks but also evaluates the effects of\ntypographic attacks, influenced by texts generated with diverse factors. Based\non the evaluation results, we investigate the causes why typographic attacks\nmay impact VLMs and LVLMs, leading to three highly insightful discoveries. 
By examining our discoveries and through experimental validation in the\nTypographic Dataset, we reduce the performance degradation from $42.07\\%$ to\n$13.90\\%$ when LVLMs confront typographic attacks.\n","authors":["Hao Cheng","Erjia Xiao","Jindong Gu","Le Yang","Jinhao Duan","Jize Zhang","Jiahang Cao","Kaidi Xu","Renjing Xu"],"pdf_url":"https://arxiv.org/pdf/2402.19150v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19474v2","updated":"2024-03-21T17:25:52Z","published":"2024-02-29T18:59:17Z","title":"The All-Seeing Project V2: Towards General Relation Comprehension of the\n Open World","summary":" We present the All-Seeing Project V2: a new model and dataset designed for\nunderstanding object relations in images. Specifically, we propose the\nAll-Seeing Model V2 (ASMv2) that integrates the formulation of text generation,\nobject localization, and relation comprehension into a relation conversation\n(ReC) task. Leveraging this unified task, our model excels not only in\nperceiving and recognizing all objects within the image but also in grasping\nthe intricate relation graph between them, diminishing the relation\nhallucination often encountered by Multi-modal Large Language Models (MLLMs).\nTo facilitate training and evaluation of MLLMs in relation understanding, we\ncreated the first high-quality ReC dataset (AS-V2), which is aligned with the\nformat of standard instruction tuning data. In addition, we design a new\nbenchmark, termed Circular-based Relation Probing Evaluation (CRPE), for\ncomprehensively evaluating the relation comprehension capabilities of MLLMs.\nNotably, our ASMv2 achieves an overall accuracy of 52.04 on this relation-aware\nbenchmark, surpassing the 43.14 of LLaVA-1.5 by a large margin. We hope that\nour work can inspire more future research and contribute to the evolution\ntowards artificial general intelligence. Our project is released at\nhttps://github.com/OpenGVLab/all-seeing.\n","authors":["Weiyun Wang","Yiming Ren","Haowen Luo","Tiantong Li","Chenxiang Yan","Zhe Chen","Wenhai Wang","Qingyun Li","Lewei Lu","Xizhou Zhu","Yu Qiao","Jifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2402.19474v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2403.11085v3","updated":"2024-03-21T17:25:23Z","published":"2024-03-17T04:36:18Z","title":"m&m's: A Benchmark to Evaluate Tool-Use for multi-step multi-modal Tasks","summary":" Real-world multi-modal problems are rarely solved by a single machine\nlearning model, and often require multi-step computational plans that involve\nstitching several models. Tool-augmented LLMs hold tremendous promise for\nautomating the generation of such computational plans. However, the lack of\nstandardized benchmarks for evaluating LLMs as planners for multi-step\nmulti-modal tasks has prevented a systematic study of planner design decisions.\nShould LLMs generate a full plan in a single shot or step-by-step? Should they\ninvoke tools directly with Python code or through structured data formats like\nJSON? Does feedback improve planning? To answer these questions and more, we\nintroduce m&m's: a benchmark containing 4K+ multi-step multi-modal tasks\ninvolving 33 tools that include multi-modal models, (free) public APIs, and\nimage processing modules. For each of these task queries, we provide\nautomatically generated plans using this realistic toolset. We further provide\na high-quality subset of 1,565 task plans that are human-verified and correctly\nexecutable. 
With m&m's, we evaluate 6 popular LLMs with 2 planning strategies\n(multi-step vs. step-by-step planning), 2 plan formats (JSON vs. code), and 3\ntypes of feedback (parsing/verification/execution). Finally, we summarize\ntakeaways from our extensive experiments. Our dataset and code are available on\nHuggingFace (https://huggingface.co/datasets/zixianma/mnms) and Github\n(https://github.com/RAIVNLab/mnms).\n","authors":["Zixian Ma","Weikai Huang","Jieyu Zhang","Tanmay Gupta","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2403.11085v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14572v1","updated":"2024-03-21T17:20:21Z","published":"2024-03-21T17:20:21Z","title":"Implicit Style-Content Separation using B-LoRA","summary":" Image stylization involves manipulating the visual appearance and texture\n(style) of an image while preserving its underlying objects, structures, and\nconcepts (content). The separation of style and content is essential for\nmanipulating the image's style independently from its content, ensuring a\nharmonious and visually pleasing result. Achieving this separation requires a\ndeep understanding of both the visual and semantic characteristics of images,\noften necessitating the training of specialized models or employing heavy\noptimization. In this paper, we introduce B-LoRA, a method that leverages LoRA\n(Low-Rank Adaptation) to implicitly separate the style and content components\nof a single image, facilitating various image stylization tasks. By analyzing\nthe architecture of SDXL combined with LoRA, we find that jointly learning the\nLoRA weights of two specific blocks (referred to as B-LoRAs) achieves\nstyle-content separation that cannot be achieved by training each B-LoRA\nindependently. Consolidating the training into only two blocks and separating\nstyle and content allows for significantly improving style manipulation and\novercoming overfitting issues often associated with model fine-tuning. Once\ntrained, the two B-LoRAs can be used as independent components to allow various\nimage stylization tasks, including image style transfer, text-based image\nstylization, consistent style generation, and style-content mixing.\n","authors":["Yarden Frenkel","Yael Vinker","Ariel Shamir","Daniel Cohen-Or"],"pdf_url":"https://arxiv.org/pdf/2403.14572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13444v2","updated":"2024-03-21T17:19:25Z","published":"2024-03-20T09:40:11Z","title":"MedCycle: Unpaired Medical Report Generation via Cycle-Consistency","summary":" Generating medical reports for X-ray images presents a significant challenge,\nparticularly in unpaired scenarios where access to paired image-report data for\ntraining is unavailable. Previous works have typically learned a joint\nembedding space for images and reports, necessitating a specific labeling\nschema for both. We introduce an innovative approach that eliminates the need\nfor consistent labeling schemas, thereby enhancing data accessibility and\nenabling the use of incompatible datasets. This approach is based on\ncycle-consistent mapping functions that transform image embeddings into report\nembeddings, coupled with report auto-encoding for medical report generation.\nOur model and objectives consider intricate local details and the overarching\nsemantic context within images and reports. This approach facilitates the\nlearning of effective mapping functions, resulting in the generation of\ncoherent reports. 
It outperforms state-of-the-art results in unpaired chest\nX-ray report generation, demonstrating improvements in both language and\nclinical metrics.\n","authors":["Elad Hirsch","Gefen Dawidowicz","Ayellet Tal"],"pdf_url":"https://arxiv.org/pdf/2403.13444v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06860v2","updated":"2024-03-21T17:06:49Z","published":"2024-03-11T16:13:58Z","title":"A Geospatial Approach to Predicting Desert Locust Breeding Grounds in\n Africa","summary":" Desert locust swarms present a major threat to agriculture and food security.\nAddressing this challenge, our study develops an operationally-ready model for\npredicting locust breeding grounds, which has the potential to enhance early\nwarning systems and targeted control measures. We curated a dataset from the\nUnited Nations Food and Agriculture Organization's (UN-FAO) locust observation\nrecords and analyzed it using two types of spatio-temporal input features:\nremotely-sensed environmental and climate data as well as multi-spectral earth\nobservation images. Our approach employed custom deep learning models\n(three-dimensional and LSTM-based recurrent convolutional networks), along with\nthe geospatial foundational model Prithvi recently released by Jakubik et al.,\n2023. These models notably outperformed existing baselines, with the\nPrithvi-based model, fine-tuned on multi-spectral images from NASA's Harmonized\nLandsat and Sentinel-2 (HLS) dataset, achieving the highest accuracy, F1 and\nROC-AUC scores (83.03%, 81.53% and 87.69%, respectively). A significant finding\nfrom our research is that multi-spectral earth observation images alone are\nsufficient for effective locust breeding ground prediction without the need to\nexplicitly incorporate climatic or environmental features.\n","authors":["Ibrahim Salihu Yusuf","Mukhtar Opeyemi Yusuf","Kobby Panford-Quainoo","Arnu Pretorius"],"pdf_url":"https://arxiv.org/pdf/2403.06860v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14559v1","updated":"2024-03-21T16:59:45Z","published":"2024-03-21T16:59:45Z","title":"Visibility-Aware Keypoint Localization for 6DoF Object Pose Estimation","summary":" Localizing predefined 3D keypoints in a 2D image is an effective way to\nestablish 3D-2D correspondences for 6DoF object pose estimation. However,\nunreliable localization results of invisible keypoints degrade the quality of\ncorrespondences. In this paper, we address this issue by localizing the\nimportant keypoints in terms of visibility. Since keypoint visibility\ninformation is currently missing in dataset collection process, we propose an\nefficient way to generate binary visibility labels from available object-level\nannotations, for keypoints of both asymmetric objects and symmetric objects. We\nfurther derive real-valued visibility-aware importance from binary labels based\non PageRank algorithm. Taking advantage of the flexibility of our\nvisibility-aware importance, we construct VAPO (Visibility-Aware POse\nestimator) by integrating the visibility-aware importance with a\nstate-of-the-art pose estimation algorithm, along with additional positional\nencoding. Extensive experiments are conducted on popular pose estimation\nbenchmarks including Linemod, Linemod-Occlusion, and YCB-V. 
The results show\nthat VAPO improves both the keypoint correspondences and final estimated\nposes, and clearly achieves state-of-the-art performance.\n","authors":["Ruyi Lian","Haibin Ling"],"pdf_url":"https://arxiv.org/pdf/2403.14559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16274v2","updated":"2024-03-21T16:58:06Z","published":"2023-12-26T15:00:35Z","title":"Towards Flexible, Scalable, and Adaptive Multi-Modal Conditioned Face\n Synthesis","summary":" Recent progress in multi-modal conditioned face synthesis has enabled the\ncreation of visually striking and accurately aligned facial images. Yet,\ncurrent methods still face issues with scalability, limited flexibility, and a\none-size-fits-all approach to control strength, not accounting for the\ndiffering levels of conditional entropy, a measure of unpredictability in data\ngiven some condition, across modalities. To address these challenges, we\nintroduce a novel uni-modal training approach with modal surrogates, coupled\nwith an entropy-aware modal-adaptive modulation, to support a flexible, scalable,\nand adaptive multi-modal conditioned face synthesis network. Our uni-modal\ntraining with modal surrogates leverages only uni-modal data, using the\nsurrogates to decorate conditions with modal-specific characteristics and to\nserve as linkers for inter-modal collaboration, so that the network fully\nlearns each modality's control in the face synthesis process as well as\ninter-modal collaboration. The entropy-aware\nmodal-adaptive modulation finely adjusts diffusion noise according to\nmodal-specific characteristics and given conditions, enabling well-informed\nsteps along the denoising trajectory and ultimately leading to synthesis results of\nhigh fidelity and quality. Our framework improves multi-modal face synthesis\nunder various conditions, surpassing current methods in image quality and\nfidelity, as demonstrated by our thorough experimental results.\n","authors":["Jingjing Ren","Cheng Xu","Haoyu Chen","Xinran Qin","Lei Zhu"],"pdf_url":"https://arxiv.org/pdf/2312.16274v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14554v1","updated":"2024-03-21T16:53:03Z","published":"2024-03-21T16:53:03Z","title":"Gaussian Frosting: Editable Complex Radiance Fields with Real-Time\n Rendering","summary":" We propose Gaussian Frosting, a novel mesh-based representation for\nhigh-quality rendering and editing of complex 3D effects in real-time. Our\napproach builds on the recent 3D Gaussian Splatting framework, which optimizes\na set of 3D Gaussians to approximate a radiance field from images. We propose\nfirst extracting a base mesh from Gaussians during optimization, then building\nand refining an adaptive layer of Gaussians with a variable thickness around\nthe mesh to better capture the fine details and volumetric effects near the\nsurface, such as hair or grass. We call this layer Gaussian Frosting, as it\nresembles a coating of frosting on a cake. The fuzzier the material, the\nthicker the frosting. We also introduce a parameterization of the Gaussians to\nenforce them to stay inside the frosting layer and automatically adjust their\nparameters when deforming, rescaling, editing or animating the mesh. Our\nrepresentation allows for efficient rendering using Gaussian splatting, as well\nas editing and animation by modifying the base mesh. We demonstrate the\neffectiveness of our method on various synthetic and real scenes, and show that\nit outperforms existing surface-based approaches. 
We will release our code and\na web-based viewer as additional contributions. Our project page is the\nfollowing: https://anttwo.github.io/frosting/\n","authors":["Antoine Guédon","Vincent Lepetit"],"pdf_url":"https://arxiv.org/pdf/2403.14554v1.pdf","comment":"Project Webpage: https://anttwo.github.io/frosting/"},{"id":"http://arxiv.org/abs/2403.14552v1","updated":"2024-03-21T16:52:27Z","published":"2024-03-21T16:52:27Z","title":"Token Transformation Matters: Towards Faithful Post-hoc Explanation for\n Vision Transformer","summary":" While Transformers have rapidly gained popularity in various computer vision\napplications, post-hoc explanations of their internal mechanisms remain largely\nunexplored. Vision Transformers extract visual information by representing\nimage regions as transformed tokens and integrating them via attention weights.\nHowever, existing post-hoc explanation methods merely consider these attention\nweights, neglecting crucial information from the transformed tokens, which\nfails to accurately illustrate the rationales behind the models' predictions.\nTo incorporate the influence of token transformation into interpretation, we\npropose TokenTM, a novel post-hoc explanation method that utilizes our\nintroduced measurement of token transformation effects. Specifically, we\nquantify token transformation effects by measuring changes in token lengths and\ncorrelations in their directions pre- and post-transformation. Moreover, we\ndevelop initialization and aggregation rules to integrate both attention\nweights and token transformation effects across all layers, capturing holistic\ntoken contributions throughout the model. Experimental results on segmentation\nand perturbation tests demonstrate the superiority of our proposed TokenTM\ncompared to state-of-the-art Vision Transformer explanation methods.\n","authors":["Junyi Wu","Bin Duan","Weitai Kang","Hao Tang","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2403.14552v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.03849v2","updated":"2024-03-21T16:49:20Z","published":"2024-03-06T16:49:33Z","title":"MedMamba: Vision Mamba for Medical Image Classification","summary":" Medical image classification is a very fundamental and crucial task in the\nfield of computer vision. These years, CNN-based and Transformer-based models\nhave been widely used to classify various medical images. Unfortunately, The\nlimitation of CNNs in long-range modeling capabilities prevents them from\neffectively extracting features in medical images, while Transformers are\nhampered by their quadratic computational complexity. Recent research has shown\nthat the state space model (SSM) represented by Mamba can efficiently model\nlong-range interactions while maintaining linear computational complexity.\nInspired by this, we propose Vision Mamba for medical image classification\n(MedMamba). More specifically, we introduce a novel Conv-SSM module. Conv-SSM\ncombines the local feature extraction ability of convolutional layers with the\nability of SSM to capture long-range dependency, thereby modeling medical\nimages with different modalities. To demonstrate the potential of MedMamba, we\nconducted extensive experiments using 14 publicly available medical datasets\nwith different imaging techniques and two private datasets built by ourselves.\nExtensive experimental results demonstrate that the proposed MedMamba performs\nwell in detecting lesions in various medical images. 
To the best of our\nknowledge, this is the first Vision Mamba tailored for medical image\nclassification. The purpose of this work is to establish a new baseline for\nmedical image classification tasks and provide valuable insights for the future\ndevelopment of more efficient and effective SSM-based artificial intelligence\nalgorithms and application systems in the medical domain. Source code is\navailable at https://github.com/YubiaoYue/MedMamba.\n","authors":["Yubiao Yue","Zhenzhang Li"],"pdf_url":"https://arxiv.org/pdf/2403.03849v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14548v1","updated":"2024-03-21T16:49:20Z","published":"2024-03-21T16:49:20Z","title":"DINO-Tracker: Taming DINO for Self-Supervised Point Tracking in a Single\n Video","summary":" We present DINO-Tracker -- a new framework for long-term dense tracking in\nvideo. The pillar of our approach is combining test-time training on a single\nvideo, with the powerful localized semantic features learned by a pre-trained\nDINO-ViT model. Specifically, our framework simultaneously adopts DINO's\nfeatures to fit to the motion observations of the test video, while training a\ntracker that directly leverages the refined features. The entire framework is\ntrained end-to-end using a combination of self-supervised losses, and\nregularization that allows us to retain and benefit from DINO's semantic prior.\nExtensive evaluation demonstrates that our method achieves state-of-the-art\nresults on known benchmarks. DINO-Tracker significantly outperforms\nself-supervised methods and is competitive with state-of-the-art supervised\ntrackers, while outperforming them in challenging cases of tracking under\nlong-term occlusions.\n","authors":["Narek Tumanyan","Assaf Singer","Shai Bagon","Tali Dekel"],"pdf_url":"https://arxiv.org/pdf/2403.14548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14547v1","updated":"2024-03-21T16:48:45Z","published":"2024-03-21T16:48:45Z","title":"Estimating Physical Information Consistency of Channel Data Augmentation\n for Remote Sensing Images","summary":" The application of data augmentation for deep learning (DL) methods plays an\nimportant role in achieving state-of-the-art results in supervised,\nsemi-supervised, and self-supervised image classification. In particular,\nchannel transformations (e.g., solarize, grayscale, brightness adjustments) are\nintegrated into data augmentation pipelines for remote sensing (RS) image\nclassification tasks. However, contradicting beliefs exist about their proper\napplications to RS images. A common point of critique is that the application\nof channel augmentation techniques may lead to physically inconsistent spectral\ndata (i.e., pixel signatures). To shed light on the open debate, we propose an\napproach to estimate whether a channel augmentation technique affects the\nphysical information of RS images. To this end, the proposed approach estimates\na score that measures the alignment of a pixel signature within a time series\nthat can be naturally subject to deviations caused by factors such as\nacquisition conditions or phenological states of vegetation. We compare the\nscores associated with original and augmented pixel signatures to evaluate the\nphysical consistency. 
Experimental results on a multi-label image\nclassification task show that channel augmentations yielding a score that\nexceeds the expected deviation of original pixel signatures can not improve the\nperformance of a baseline model trained without augmentation.\n","authors":["Tom Burgert","Begüm Demir"],"pdf_url":"https://arxiv.org/pdf/2403.14547v1.pdf","comment":"Accepted at the IEEE International Geoscience and Remote Sensing\n Symposium"},{"id":"http://arxiv.org/abs/2402.17587v2","updated":"2024-03-21T16:40:43Z","published":"2024-02-25T07:59:10Z","title":"Instance-aware Exploration-Verification-Exploitation for Instance\n ImageGoal Navigation","summary":" As a new embodied vision task, Instance ImageGoal Navigation (IIN) aims to\nnavigate to a specified object depicted by a goal image in an unexplored\nenvironment.\n The main challenge of this task lies in identifying the target object from\ndifferent viewpoints while rejecting similar distractors.\n Existing ImageGoal Navigation methods usually adopt the simple\nExploration-Exploitation framework and ignore the identification of specific\ninstance during navigation.\n In this work, we propose to imitate the human behaviour of ``getting closer\nto confirm\" when distinguishing objects from a distance.\n Specifically, we design a new modular navigation framework named\nInstance-aware Exploration-Verification-Exploitation (IEVE) for instance-level\nimage goal navigation.\n Our method allows for active switching among the exploration, verification,\nand exploitation actions, thereby facilitating the agent in making reasonable\ndecisions under different situations.\n On the challenging HabitatMatterport 3D semantic (HM3D-SEM) dataset, our\nmethod surpasses previous state-of-the-art work, with a classical segmentation\nmodel (0.684 vs. 0.561 success) or a robust model (0.702 vs. 0.561 success).\nOur code will be made publicly available at https://github.com/XiaohanLei/IEVE.\n","authors":["Xiaohan Lei","Min Wang","Wengang Zhou","Li Li","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2402.17587v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14539v1","updated":"2024-03-21T16:40:10Z","published":"2024-03-21T16:40:10Z","title":"Object-Centric Domain Randomization for 3D Shape Reconstruction in the\n Wild","summary":" One of the biggest challenges in single-view 3D shape reconstruction in the\nwild is the scarcity of <3D shape, 2D image>-paired data from real-world\nenvironments. Inspired by remarkable achievements via domain randomization, we\npropose ObjectDR which synthesizes such paired data via a random simulation of\nvisual variations in object appearances and backgrounds. Our data synthesis\nframework exploits a conditional generative model (e.g., ControlNet) to\ngenerate images conforming to spatial conditions such as 2.5D sketches, which\nare obtainable through a rendering process of 3D shapes from object collections\n(e.g., Objaverse-XL). To simulate diverse variations while preserving object\nsilhouettes embedded in spatial conditions, we also introduce a disentangled\nframework which leverages an initial object guidance. After synthesizing a wide\nrange of data, we pre-train a model on them so that it learns to capture a\ndomain-invariant geometry prior which is consistent across various domains. We\nvalidate its effectiveness by substantially improving 3D shape reconstruction\nmodels on a real-world benchmark. 
In a scale-up evaluation, our pre-training\nachieves 23.6% superior results compared with the pre-training on high-quality\ncomputer graphics renderings.\n","authors":["Junhyeong Cho","Kim Youwang","Hunmin Yang","Tae-Hyun Oh"],"pdf_url":"https://arxiv.org/pdf/2403.14539v1.pdf","comment":"Project Page: https://ObjectDR.github.io"},{"id":"http://arxiv.org/abs/2403.12167v2","updated":"2024-03-21T16:38:33Z","published":"2024-03-18T18:35:32Z","title":"Generalizing deep learning models for medical image classification","summary":" Numerous Deep Learning (DL) models have been developed for a large spectrum\nof medical image analysis applications, which promises to reshape various\nfacets of medical practice. Despite early advances in DL model validation and\nimplementation, which encourage healthcare institutions to adopt them, some\nfundamental questions remain: are the DL models capable of generalizing? What\ncauses a drop in DL model performances? How to overcome the DL model\nperformance drop? Medical data are dynamic and prone to domain shift, due to\nmultiple factors such as updates to medical equipment, new imaging workflow,\nand shifts in patient demographics or populations can induce this drift over\ntime. In this paper, we review recent developments in generalization methods\nfor DL-based classification models. We also discuss future challenges,\nincluding the need for improved evaluation protocols and benchmarks, and\nenvisioned future developments to achieve robust, generalized models for\nmedical image classification.\n","authors":["Matta Sarah","Lamard Mathieu","Zhang Philippe","Alexandre Le Guilcher","Laurent Borderie","Béatrice Cochener","Gwenolé Quellec"],"pdf_url":"https://arxiv.org/pdf/2403.12167v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14534v1","updated":"2024-03-21T16:36:40Z","published":"2024-03-21T16:36:40Z","title":"Transfer Learning for Cross-dataset Isolated Sign Language Recognition\n in Under-Resourced Datasets","summary":" Sign language recognition (SLR) has recently achieved a breakthrough in\nperformance thanks to deep neural networks trained on large annotated sign\ndatasets. Of the many different sign languages, these annotated datasets are\nonly available for a select few. Since acquiring gloss-level labels on sign\nlanguage videos is difficult, learning by transferring knowledge from existing\nannotated sources is useful for recognition in under-resourced sign languages.\nThis study provides a publicly available cross-dataset transfer learning\nbenchmark from two existing public Turkish SLR datasets. We use a temporal\ngraph convolution-based sign language recognition approach to evaluate five\nsupervised transfer learning approaches and experiment with closed-set and\npartial-set cross-dataset transfer learning. 
Experiments demonstrate that\nimprovement over finetuning based transfer learning is possible with\nspecialized supervised transfer learning methods.\n","authors":["Ahmet Alp Kindiroglu","Ozgur Kara","Ogulcan Ozdemir","Lale Akarun"],"pdf_url":"https://arxiv.org/pdf/2403.14534v1.pdf","comment":"Accepted to The 18th IEEE International Conference on Automatic Face\n and Gesture Recognition 2024, Code available in\n https://github.com/alpk/tid-supervised-transfer-learning-dataset"},{"id":"http://arxiv.org/abs/2403.14530v1","updated":"2024-03-21T16:28:58Z","published":"2024-03-21T16:28:58Z","title":"HAC: Hash-grid Assisted Context for 3D Gaussian Splatting Compression","summary":" 3D Gaussian Splatting (3DGS) has emerged as a promising framework for novel\nview synthesis, boasting rapid rendering speed with high fidelity. However, the\nsubstantial Gaussians and their associated attributes necessitate effective\ncompression techniques. Nevertheless, the sparse and unorganized nature of the\npoint cloud of Gaussians (or anchors in our paper) presents challenges for\ncompression. To address this, we make use of the relations between the\nunorganized anchors and the structured hash grid, leveraging their mutual\ninformation for context modeling, and propose a Hash-grid Assisted Context\n(HAC) framework for highly compact 3DGS representation. Our approach introduces\na binary hash grid to establish continuous spatial consistencies, allowing us\nto unveil the inherent spatial relations of anchors through a carefully\ndesigned context model. To facilitate entropy coding, we utilize Gaussian\ndistributions to accurately estimate the probability of each quantized\nattribute, where an adaptive quantization module is proposed to enable\nhigh-precision quantization of these attributes for improved fidelity\nrestoration. Additionally, we incorporate an adaptive masking strategy to\neliminate invalid Gaussians and anchors. Importantly, our work is the pioneer\nto explore context-based compression for 3DGS representation, resulting in a\nremarkable size reduction of over $75\\times$ compared to vanilla 3DGS, while\nsimultaneously improving fidelity, and achieving over $11\\times$ size reduction\nover SOTA 3DGS compression approach Scaffold-GS. Our code is available here:\nhttps://github.com/YihangChen-ee/HAC\n","authors":["Yihang Chen","Qianyi Wu","Jianfei Cai","Mehrtash Harandi","Weiyao Lin"],"pdf_url":"https://arxiv.org/pdf/2403.14530v1.pdf","comment":"Project Page: https://yihangchen-ee.github.io/project_hac/ Code:\n https://github.com/YihangChen-ee/HAC"},{"id":"http://arxiv.org/abs/2403.12966v2","updated":"2024-03-21T16:26:44Z","published":"2024-03-19T17:59:52Z","title":"Chain-of-Spot: Interactive Reasoning Improves Large Vision-Language\n Models","summary":" In the realm of vision-language understanding, the proficiency of models in\ninterpreting and reasoning over visual content has become a cornerstone for\nnumerous applications. However, it is challenging for the visual encoder in\nLarge Vision-Language Models (LVLMs) to extract useful features tailored to\nquestions that aid the language model's response. Furthermore, a common\npractice among existing LVLMs is to utilize lower-resolution images, which\nrestricts the ability for visual recognition. 
Our work introduces the\nChain-of-Spot (CoS) method, which we describe as Interactive Reasoning, a novel\napproach that enhances feature extraction by focusing on key regions of\ninterest (ROI) within the image, corresponding to the posed questions or\ninstructions. This technique allows LVLMs to access more detailed visual\ninformation without altering the original image resolution, thereby offering\nmulti-granularity image features. By integrating Chain-of-Spot with\ninstruct-following LLaVA-1.5 models, the process of image reasoning\nconsistently improves performance across a wide range of multimodal datasets\nand benchmarks without bells and whistles and achieves new state-of-the-art\nresults. Our empirical findings demonstrate a significant improvement in LVLMs'\nability to understand and reason about visual content, paving the way for more\nsophisticated visual instruction-following applications. Code and models are\navailable at https://github.com/dongyh20/Chain-of-Spot\n","authors":["Zuyan Liu","Yuhao Dong","Yongming Rao","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2403.12966v2.pdf","comment":"Project Page: https://sites.google.com/view/chain-of-spot/"},{"id":"http://arxiv.org/abs/2403.14526v1","updated":"2024-03-21T16:26:19Z","published":"2024-03-21T16:26:19Z","title":"Click to Grasp: Zero-Shot Precise Manipulation via Visual Diffusion\n Descriptors","summary":" Precise manipulation that is generalizable across scenes and objects remains\na persistent challenge in robotics. Current approaches for this task heavily\ndepend on having a significant number of training instances to handle objects\nwith pronounced visual and/or geometric part ambiguities. Our work explores the\ngrounding of fine-grained part descriptors for precise manipulation in a\nzero-shot setting by utilizing web-trained text-to-image diffusion-based\ngenerative models. We tackle the problem by framing it as a dense semantic part\ncorrespondence task. Our model returns a gripper pose for manipulating a\nspecific part, using as reference a user-defined click from a source image of a\nvisually different instance of the same object. We require no manual grasping\ndemonstrations as we leverage the intrinsic object geometry and features.\nPractical experiments in a real-world tabletop scenario validate the efficacy\nof our approach, demonstrating its potential for advancing semantic-aware\nrobotics manipulation. Web page: https://tsagkas.github.io/click2grasp\n","authors":["Nikolaos Tsagkas","Jack Rome","Subramanian Ramamoorthy","Oisin Mac Aodha","Chris Xiaoxuan Lu"],"pdf_url":"https://arxiv.org/pdf/2403.14526v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.14523v1","updated":"2024-03-21T16:23:25Z","published":"2024-03-21T16:23:25Z","title":"Invisible Needle Detection in Ultrasound: Leveraging Mechanism-Induced\n Vibration","summary":" In clinical applications that involve ultrasound-guided intervention, the\nvisibility of the needle can be severely impeded due to steep insertion and\nstrong distractors such as speckle noise and anatomical occlusion. To address\nthis challenge, we propose VibNet, a learning-based framework tailored to\nenhance the robustness and accuracy of needle detection in ultrasound images,\neven when the target becomes invisible to the naked eye. Inspired by Eulerian\nVideo Magnification techniques, we utilize an external step motor to induce\nlow-amplitude periodic motion on the needle. 
These subtle vibrations offer the\npotential to generate robust frequency features for detecting the motion\npatterns around the needle. To robustly and precisely detect the needle\nleveraging these vibrations, VibNet integrates learning-based\nShort-Time-Fourier-Transform and Hough-Transform modules to achieve successive\nsub-goals, including motion feature extraction in the spatiotemporal space,\nfrequency feature aggregation, and needle detection in the Hough space. Based\non the results obtained on distinct ex vivo porcine and bovine tissue samples,\nthe proposed algorithm exhibits superior detection performance with efficient\ncomputation and generalization capability.\n","authors":["Chenyang Li","Dianye Huang","Angelos Karlas","Nassir Navab","Zhongliang Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.14523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14520v1","updated":"2024-03-21T16:17:57Z","published":"2024-03-21T16:17:57Z","title":"Cobra: Extending Mamba to Multi-Modal Large Language Model for Efficient\n Inference","summary":" In recent years, the application of multimodal large language models (MLLM)\nin various fields has achieved remarkable success. However, as the foundation\nmodel for many downstream tasks, current MLLMs are composed of the well-known\nTransformer network, which has a less efficient quadratic computation\ncomplexity. To improve the efficiency of such basic models, we propose Cobra, a\nlinear computational complexity MLLM. Specifically, Cobra integrates the\nefficient Mamba language model into the visual modality. Moreover, we explore\nand study various modal fusion schemes to create an effective multi-modal\nMamba. Extensive experiments demonstrate that (1) Cobra achieves extremely\ncompetitive performance with current computationally efficient state-of-the-art\nmethods, \\textit{e.g.}, LLaVA-Phi, TinyLLaVA, and MobileVLM v2, and has faster\nspeed due to Cobra's linear sequential modeling. (2) Interestingly, the results\nof closed-set challenging prediction benchmarks show that Cobra performs well\nin overcoming visual illusions and spatial relationship judgments. (3) Notably,\nCobra even achieves comparable performance to LLaVA with about 43% of the\nnumber of parameters. We will make all codes of Cobra open-source and hope that\nthe proposed method can facilitate future research on complexity problems in\nMLLM. Our project page is available at: https://sites.google.com/view/cobravlm.\n","authors":["Han Zhao","Min Zhang","Wei Zhao","Pengxiang Ding","Siteng Huang","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.14520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17797v3","updated":"2024-03-21T16:11:23Z","published":"2024-02-26T22:00:59Z","title":"Neural Radiance Fields in Medical Imaging: Challenges and Next Steps","summary":" Neural Radiance Fields (NeRF), as a pioneering technique in computer vision,\noffer great potential to revolutionize medical imaging by synthesizing\nthree-dimensional representations from the projected two-dimensional image\ndata. However, they face unique challenges when applied to medical\napplications. This paper presents a comprehensive examination of applications\nof NeRFs in medical imaging, highlighting four imminent challenges, including\nfundamental imaging principles, inner structure requirement, object boundary\ndefinition, and color density significance. We discuss current methods on\ndifferent organs and discuss related limitations. 
We also review several\ndatasets and evaluation metrics and propose several promising directions for\nfuture research.\n","authors":["Xin Wang","Shu Hu","Heng Fan","Hongtu Zhu","Xin Li"],"pdf_url":"https://arxiv.org/pdf/2402.17797v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12157v2","updated":"2024-03-21T16:09:57Z","published":"2023-03-21T19:34:20Z","title":"Learning a Depth Covariance Function","summary":" We propose learning a depth covariance function with applications to\ngeometric vision tasks. Given RGB images as input, the covariance function can\nbe flexibly used to define priors over depth functions, predictive\ndistributions given observations, and methods for active point selection. We\nleverage these techniques for a selection of downstream tasks: depth\ncompletion, bundle adjustment, and monocular dense visual odometry.\n","authors":["Eric Dexheimer","Andrew J. Davison"],"pdf_url":"https://arxiv.org/pdf/2303.12157v2.pdf","comment":"CVPR 2023. Project page: https://edexheim.github.io/DepthCov/"},{"id":"http://arxiv.org/abs/2403.14513v1","updated":"2024-03-21T16:08:21Z","published":"2024-03-21T16:08:21Z","title":"View-decoupled Transformer for Person Re-identification under\n Aerial-ground Camera Network","summary":" Existing person re-identification methods have achieved remarkable advances\nin appearance-based identity association across homogeneous cameras, such as\nground-ground matching. However, as a more practical scenario, aerial-ground\nperson re-identification (AGPReID) among heterogeneous cameras has received\nminimal attention. To alleviate the disruption of discriminative identity\nrepresentation by dramatic view discrepancy as the most significant challenge\nin AGPReID, the view-decoupled transformer (VDT) is proposed as a simple yet\neffective framework. Two major components are designed in VDT to decouple\nview-related and view-unrelated features, namely hierarchical subtractive\nseparation and orthogonal loss, where the former separates these two features\ninside the VDT, and the latter constrains these two to be independent. In\naddition, we contribute a large-scale AGPReID dataset called CARGO, consisting\nof five/eight aerial/ground cameras, 5,000 identities, and 108,563 images.\nExperiments on two datasets show that VDT is a feasible and effective solution\nfor AGPReID, surpassing the previous method on mAP/Rank1 by up to 5.0%/2.7% on\nCARGO and 3.7%/5.2% on AG-ReID, keeping the same magnitude of computational\ncomplexity. Our project is available at https://github.com/LinlyAC/VDT-AGPReID\n","authors":["Quan Zhang","Lei Wang","Vishal M. Patel","Xiaohua Xie","Jianhuang Lai"],"pdf_url":"https://arxiv.org/pdf/2403.14513v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.10217v2","updated":"2024-03-21T16:06:17Z","published":"2023-12-15T21:30:49Z","title":"T-MAE: Temporal Masked Autoencoders for Point Cloud Representation\n Learning","summary":" The scarcity of annotated data in LiDAR point cloud understanding hinders\neffective representation learning. Consequently, scholars have been actively\ninvestigating efficacious self-supervised pre-training paradigms. Nevertheless,\ntemporal information, which is inherent in the LiDAR point cloud sequence, is\nconsistently disregarded. 
To better utilize this property, we propose an\neffective pre-training strategy, namely Temporal Masked Auto-Encoders (T-MAE),\nwhich takes as input temporally adjacent frames and learns temporal dependency.\nA SiamWCA backbone, containing a Siamese encoder and a windowed cross-attention\n(WCA) module, is established for the two-frame input. Considering that the\nmovement of an ego-vehicle alters the view of the same instance, temporal\nmodeling also serves as a robust and natural data augmentation, enhancing the\ncomprehension of target objects. SiamWCA is a powerful architecture but heavily\nrelies on annotated data. Our T-MAE pre-training strategy alleviates its demand\nfor annotated data. Comprehensive experiments demonstrate that T-MAE achieves\nthe best performance on both Waymo and ONCE datasets among competitive\nself-supervised approaches.\n","authors":["Weijie Wei","Fatemeh Karimi Nejadasl","Theo Gevers","Martin R. Oswald"],"pdf_url":"https://arxiv.org/pdf/2312.10217v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2312.09641v2","updated":"2024-03-21T15:57:28Z","published":"2023-12-15T09:30:47Z","title":"Ins-HOI: Instance Aware Human-Object Interactions Recovery","summary":" Accurately modeling detailed interactions between human/hand and object is an\nappealing yet challenging task. Current multi-view capture systems are only\ncapable of reconstructing multiple subjects into a single, unified mesh, which\nfails to model the states of each instance individually during interactions. To\naddress this, previous methods use template-based representations to track\nhuman/hand and object. However, the quality of the reconstructions is limited\nby the descriptive capabilities of the templates so that these methods are\ninherently struggle with geometry details, pressing deformations and invisible\ncontact surfaces. In this work, we propose an end-to-end Instance-aware\nHuman-Object Interactions recovery (Ins-HOI) framework by introducing an\ninstance-level occupancy field representation. However, the real-captured data\nis presented as a holistic mesh, unable to provide instance-level supervision.\nTo address this, we further propose a complementary training strategy that\nleverages synthetic data to introduce instance-level shape priors, enabling the\ndisentanglement of occupancy fields for different instances. Specifically,\nsynthetic data, created by randomly combining individual scans of humans/hands\nand objects, guides the network to learn a coarse prior of instances.\nMeanwhile, real-captured data helps in learning the overall geometry and\nrestricting interpenetration in contact areas. As demonstrated in experiments,\nour method Ins-HOI supports instance-level reconstruction and provides\nreasonable and realistic invisible contact surfaces even in cases of extremely\nclose interaction. To facilitate the research of this task, we collect a\nlarge-scale, high-fidelity 3D scan dataset, including 5.2k high-quality scans\nwith real-world human-chair and hand-object interactions. 
The code and data\nwill be public for research purposes.\n","authors":["Jiajun Zhang","Yuxiang Zhang","Hongwen Zhang","Xiao Zhou","Boyao Zhou","Ruizhi Shao","Zonghai Hu","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2312.09641v2.pdf","comment":"Project Page: https://jiajunzhang16.github.io/ins-hoi/ , Code and\n Dataset Page: https://github.com/jiajunzhang16/ins-hoi"},{"id":"http://arxiv.org/abs/2403.14499v1","updated":"2024-03-21T15:52:05Z","published":"2024-03-21T15:52:05Z","title":"Denoising Diffusion Models for 3D Healthy Brain Tissue Inpainting","summary":" Monitoring diseases that affect the brain's structural integrity requires\nautomated analysis of magnetic resonance (MR) images, e.g., for the evaluation\nof volumetric changes. However, many of the evaluation tools are optimized for\nanalyzing healthy tissue. To enable the evaluation of scans containing\npathological tissue, it is therefore required to restore healthy tissue in the\npathological areas. In this work, we explore and extend denoising diffusion\nmodels for consistent inpainting of healthy 3D brain tissue. We modify\nstate-of-the-art 2D, pseudo-3D, and 3D methods working in the image space, as\nwell as 3D latent and 3D wavelet diffusion models, and train them to synthesize\nhealthy brain tissue. Our evaluation shows that the pseudo-3D model performs\nbest regarding the structural-similarity index, peak signal-to-noise ratio, and\nmean squared error. To emphasize the clinical relevance, we fine-tune this\nmodel on data containing synthetic MS lesions and evaluate it on a downstream\nbrain tissue segmentation task, whereby it outperforms the established FMRIB\nSoftware Library (FSL) lesion-filling method.\n","authors":["Alicia Durrer","Julia Wolleb","Florentin Bieder","Paul Friedrich","Lester Melie-Garcia","Mario Ocampo-Pineda","Cosmin I. Bercea","Ibrahim E. Hamamci","Benedikt Wiestler","Marie Piraud","Özgür Yaldizli","Cristina Granziera","Bjoern H. Menze","Philippe C. Cattin","Florian Kofler"],"pdf_url":"https://arxiv.org/pdf/2403.14499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14497v1","updated":"2024-03-21T15:46:19Z","published":"2024-03-21T15:46:19Z","title":"MULDE: Multiscale Log-Density Estimation via Denoising Score Matching\n for Video Anomaly Detection","summary":" We propose a novel approach to video anomaly detection: we treat feature\nvectors extracted from videos as realizations of a random variable with a fixed\ndistribution and model this distribution with a neural network. This lets us\nestimate the likelihood of test videos and detect video anomalies by\nthresholding the likelihood estimates. We train our video anomaly detector\nusing a modification of denoising score matching, a method that injects\ntraining data with noise to facilitate modeling its distribution. To eliminate\nhyperparameter selection, we model the distribution of noisy video features\nacross a range of noise levels and introduce a regularizer that tends to align\nthe models for different levels of noise. At test time, we combine anomaly\nindications at multiple noise scales with a Gaussian mixture model. Running our\nvideo anomaly detector induces minimal delays as inference requires merely\nextracting the features and forward-propagating them through a shallow neural\nnetwork and a Gaussian mixture model. 
Our experiments on five popular video\nanomaly detection benchmarks demonstrate state-of-the-art performance, both in\nthe object-centric and in the frame-centric setup.\n","authors":["Jakub Micorek","Horst Possegger","Dominik Narnhofer","Horst Bischof","Mateusz Kozinski"],"pdf_url":"https://arxiv.org/pdf/2403.14497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02116v3","updated":"2024-03-21T15:45:29Z","published":"2023-12-04T18:48:02Z","title":"GIVT: Generative Infinite-Vocabulary Transformers","summary":" We introduce generative infinite-vocabulary transformers (GIVT) which\ngenerate vector sequences with real-valued entries, instead of discrete tokens\nfrom a finite vocabulary. To this end, we propose two surprisingly simple\nmodifications to decoder-only transformers: 1) at the input, we replace the\nfinite-vocabulary lookup table with a linear projection of the input vectors;\nand 2) at the output, we replace the logits prediction (usually mapped to a\ncategorical distribution) with the parameters of a multivariate Gaussian\nmixture model. Inspired by the image-generation paradigm of VQ-GAN and MaskGIT,\nwhere transformers are used to model the discrete latent sequences of a VQ-VAE,\nwe use GIVT to model the unquantized real-valued latent sequences of a\n$\\beta$-VAE. In class-conditional image generation GIVT outperforms VQ-GAN (and\nimproved variants thereof) as well as MaskGIT, and achieves performance\ncompetitive with recent latent diffusion models. Finally, we obtain strong\nresults outside of image generation when applying GIVT to panoptic segmentation\nand depth estimation with a VAE variant of the UViM framework\n","authors":["Michael Tschannen","Cian Eastwood","Fabian Mentzer"],"pdf_url":"https://arxiv.org/pdf/2312.02116v3.pdf","comment":"v2: add related NLP work, loss details. v3: Improved GMM formulation,\n added adapter module, larger models, better image generation results. Code\n and model checkpoints are available at:\n https://github.com/google-research/big_vision"},{"id":"http://arxiv.org/abs/2403.14494v1","updated":"2024-03-21T15:42:17Z","published":"2024-03-21T15:42:17Z","title":"Learning to Project for Cross-Task Knowledge Distillation","summary":" Traditional knowledge distillation (KD) relies on a proficient teacher\ntrained on the target task, which is not always available. In this setting,\ncross-task distillation can be used, enabling the use of any teacher model\ntrained on a different task. However, many KD methods prove ineffective when\napplied to this cross-task setting. To address this limitation, we propose a\nsimple modification: the use of an inverted projection. We show that this\ndrop-in replacement for a standard projector is effective by learning to\ndisregard any task-specific features which might degrade the student's\nperformance. We find that this simple modification is sufficient for extending\nmany KD methods to the cross-task setting, where the teacher and student tasks\ncan be very different. In doing so, we obtain up to a 1.9% improvement in the\ncross-task setting compared to the traditional projection, at no additional\ncost. Our method can obtain significant performance improvements (up to 7%)\nwhen using even a randomly-initialised teacher on various tasks such as depth\nestimation, image translation, and semantic segmentation, despite the lack of\nany learned knowledge to transfer. 
To provide conceptual and analytical\ninsights into this result, we show that using an inverted projection allows the\ndistillation loss to be decomposed into a knowledge transfer and a spectral\nregularisation component. Through this analysis we are additionally able to\npropose a novel regularisation loss that allows teacher-free distillation,\nenabling performance improvements of up to 8.57% on ImageNet with no additional\ntraining costs.\n","authors":["Dylan Auty","Roy Miles","Benedikt Kolbeinsson","Krystian Mikolajczyk"],"pdf_url":"https://arxiv.org/pdf/2403.14494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10132v2","updated":"2024-03-21T15:42:06Z","published":"2023-12-15T17:02:19Z","title":"Closing the Gap: Achieving Better Accuracy-Robustness Tradeoffs against\n Query-Based Attacks","summary":" Although promising, existing defenses against query-based attacks share a\ncommon limitation: they offer increased robustness against attacks at the price\nof a considerable accuracy drop on clean samples. In this work, we show how to\nefficiently establish, at test-time, a solid tradeoff between robustness and\naccuracy when mitigating query-based attacks. Given that these attacks\nnecessarily explore low-confidence regions, our insight is that activating\ndedicated defenses, such as random noise defense and random image\ntransformations, only for low-confidence inputs is sufficient to prevent them.\nOur approach is independent of training and supported by theory. We verify the\neffectiveness of our approach for various existing defenses by conducting\nextensive experiments on CIFAR-10, CIFAR-100, and ImageNet. Our results confirm\nthat our proposal can indeed enhance these defenses by providing better\ntradeoffs between robustness and accuracy when compared to state-of-the-art\napproaches while being completely training-free.\n","authors":["Pascal Zimmer","Sébastien Andreina","Giorgia Azzurra Marson","Ghassan Karame"],"pdf_url":"https://arxiv.org/pdf/2312.10132v2.pdf","comment":"To appear in the Proceedings of the AAAI Conference on Artificial\n Intelligence (AAAI) 2024"},{"id":"http://arxiv.org/abs/2403.13261v2","updated":"2024-03-21T15:40:16Z","published":"2024-03-20T02:58:45Z","title":"Self-Supervised Class-Agnostic Motion Prediction with Spatial and\n Temporal Consistency Regularizations","summary":" The perception of motion behavior in a dynamic environment holds significant\nimportance for autonomous driving systems, wherein class-agnostic motion\nprediction methods directly predict the motion of the entire point cloud. While\nmost existing methods rely on fully-supervised learning, the manual labeling of\npoint cloud data is laborious and time-consuming. Therefore, several\nannotation-efficient methods have been proposed to address this challenge.\nAlthough effective, these methods rely on weak annotations or additional\nmulti-modal data like images, and the potential benefits inherent in the point\ncloud sequence are still underexplored. To this end, we explore the feasibility\nof self-supervised motion prediction with only unlabeled LiDAR point clouds.\nInitially, we employ an optimal transport solver to establish coarse\ncorrespondences between current and future point clouds as the coarse pseudo\nmotion labels. Training models directly using such coarse labels leads to\nnoticeable spatial and temporal prediction inconsistencies. 
To mitigate these\nissues, we introduce three simple spatial and temporal regularization losses,\nwhich facilitate the self-supervised training process effectively. Experimental\nresults demonstrate the significant superiority of our approach over the\nstate-of-the-art self-supervised methods.\n","authors":["Kewei Wang","Yizheng Wu","Jun Cen","Zhiyu Pan","Xingyi Li","Zhe Wang","Zhiguo Cao","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2403.13261v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.14489v1","updated":"2024-03-21T15:37:37Z","published":"2024-03-21T15:37:37Z","title":"Adversary-Robust Graph-Based Learning of WSIs","summary":" Enhancing the robustness of deep learning models against adversarial attacks\nis crucial, especially in critical domains like healthcare where significant\nfinancial interests heighten the risk of such attacks. Whole slide images\n(WSIs) are high-resolution, digitized versions of tissue samples mounted on\nglass slides, scanned using sophisticated imaging equipment. The digital\nanalysis of WSIs presents unique challenges due to their gigapixel size and\nmulti-resolution storage format. In this work, we aim at improving the\nrobustness of cancer Gleason grading classification systems against adversarial\nattacks, addressing challenges at both the image and graph levels. As regards\nthe proposed algorithm, we develop a novel and innovative graph-based model\nwhich utilizes GNN to extract features from the graph representation of WSIs. A\ndenoising module, along with a pooling layer is incorporated to manage the\nimpact of adversarial attacks on the WSIs. The process concludes with a\ntransformer module that classifies various grades of prostate cancer based on\nthe processed data. To assess the effectiveness of the proposed method, we\nconducted a comparative analysis using two scenarios. Initially, we trained and\ntested the model without the denoiser using WSIs that had not been exposed to\nany attack. We then introduced a range of attacks at either the image or graph\nlevel and processed them through the proposed network. The performance of the\nmodel was evaluated in terms of accuracy and kappa scores. The results from\nthis comparison showed a significant improvement in cancer diagnosis accuracy,\nhighlighting the robustness and efficiency of the proposed method in handling\nadversarial challenges in the context of medical imaging.\n","authors":["Saba Heidari Gheshlaghi","Milan Aryal","Nasim Yahyasoltani","Masoud Ganji"],"pdf_url":"https://arxiv.org/pdf/2403.14489v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14487v1","updated":"2024-03-21T15:35:42Z","published":"2024-03-21T15:35:42Z","title":"DesignEdit: Multi-Layered Latent Decomposition and Fusion for Unified &\n Accurate Image Editing","summary":" Recently, how to achieve precise image editing has attracted increasing\nattention, especially given the remarkable success of text-to-image generation\nmodels. To unify various spatial-aware image editing abilities into one\nframework, we adopt the concept of layers from the design domain to manipulate\nobjects flexibly with various operations. The key insight is to transform the\nspatial-aware image editing task into a combination of two sub-tasks:\nmulti-layered latent decomposition and multi-layered latent fusion. First, we\nsegment the latent representations of the source images into multiple layers,\nwhich include several object layers and one incomplete background layer that\nnecessitates reliable inpainting. 
To avoid extra tuning, we further explore the\ninner inpainting ability within the self-attention mechanism. We introduce a\nkey-masking self-attention scheme that can propagate the surrounding context\ninformation into the masked region while mitigating its impact on the regions\noutside the mask. Second, we propose an instruction-guided latent fusion that\npastes the multi-layered latent representations onto a canvas latent. We also\nintroduce an artifact suppression scheme in the latent space to enhance the\ninpainting quality. Due to the inherent modular advantages of such\nmulti-layered representations, we can achieve accurate image editing, and we\ndemonstrate that our approach consistently surpasses the latest spatial editing\nmethods, including Self-Guidance and DiffEditor. Last, we show that our\napproach is a unified framework that supports various accurate image editing\ntasks on more than six different editing tasks.\n","authors":["Yueru Jia","Yuhui Yuan","Aosong Cheng","Chuke Wang","Ji Li","Huizhu Jia","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.14487v1.pdf","comment":"technical report, 15 pages, webpage: https://design-edit.github.io/"},{"id":"http://arxiv.org/abs/2312.02015v2","updated":"2024-03-21T15:32:35Z","published":"2023-12-04T16:38:16Z","title":"ColonNeRF: High-Fidelity Neural Reconstruction of Long Colonoscopy","summary":" Colonoscopy reconstruction is pivotal for diagnosing colorectal cancer.\nHowever, accurate long-sequence colonoscopy reconstruction faces three major\nchallenges: (1) dissimilarity among segments of the colon due to its meandering\nand convoluted shape; (2) co-existence of simple and intricately folded\ngeometry structures; (3) sparse viewpoints due to constrained camera\ntrajectories. To tackle these challenges, we introduce a new reconstruction\nframework based on neural radiance field (NeRF), named ColonNeRF, which\nleverages neural rendering for novel view synthesis of long-sequence\ncolonoscopy. Specifically, to reconstruct the entire colon in a piecewise\nmanner, our ColonNeRF introduces a region division and integration module,\neffectively reducing shape dissimilarity and ensuring geometric consistency in\neach segment. To learn both the simple and complex geometry in a unified\nframework, our ColonNeRF incorporates a multi-level fusion module that\nprogressively models the colon regions from easy to hard. Additionally, to\novercome the challenges from sparse views, we devise a DensiNet module for\ndensifying camera poses under the guidance of semantic consistency. We conduct\nextensive experiments on both synthetic and real-world datasets to evaluate our\nColonNeRF. Quantitatively, ColonNeRF exhibits a 67%-85% increase in LPIPS-ALEX\nscores. Qualitatively, our reconstruction visualizations show much clearer\ntextures and more accurate geometric details. These sufficiently demonstrate\nour superior performance over the state-of-the-art methods.\n","authors":["Yufei Shi","Beijia Lu","Jia-Wei Liu","Ming Li","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.02015v2.pdf","comment":"for Project Page, see https://showlab.github.io/ColonNeRF/"},{"id":"http://arxiv.org/abs/2403.14484v1","updated":"2024-03-21T15:31:28Z","published":"2024-03-21T15:31:28Z","title":"HyperGALE: ASD Classification via Hypergraph Gated Attention with\n Learnable Hyperedges","summary":" Autism Spectrum Disorder (ASD) is a neurodevelopmental condition\ncharacterized by varied social cognitive challenges and repetitive behavioral\npatterns. 
Identifying reliable brain imaging-based biomarkers for ASD has been\na persistent challenge due to the spectrum's diverse symptomatology. Existing\nbaselines in the field have made significant strides in this direction, yet\nthere remains room for improvement in both performance and interpretability. We\npropose \\emph{HyperGALE}, which builds upon the hypergraph by incorporating\nlearned hyperedges and gated attention mechanisms. This approach has led to\nsubstantial improvements in the model's ability to interpret complex brain\ngraph data, offering deeper insights into ASD biomarker characterization.\nEvaluated on the extensive ABIDE II dataset, \\emph{HyperGALE} not only improves\ninterpretability but also demonstrates statistically significant enhancements\nin key performance metrics compared to both previous baselines and the\nfoundational hypergraph model. The advancement \\emph{HyperGALE} brings to ASD\nresearch highlights the potential of sophisticated graph-based techniques in\nneurodevelopmental studies. The source code and implementation instructions are\navailable at GitHub:https://github.com/mehular0ra/HyperGALE.\n","authors":["Mehul Arora","Chirag Shantilal Jain","Lalith Bharadwaj Baru","Kamalaker Dadi","Bapi Raju Surampudi"],"pdf_url":"https://arxiv.org/pdf/2403.14484v1.pdf","comment":"Accepted to IJCNN 2024"},{"id":"http://arxiv.org/abs/2403.14472v1","updated":"2024-03-21T15:18:30Z","published":"2024-03-21T15:18:30Z","title":"Detoxifying Large Language Models via Knowledge Editing","summary":" This paper investigates using knowledge editing techniques to detoxify Large\nLanguage Models (LLMs). We construct a benchmark, SafeEdit, which covers nine\nunsafe categories with various powerful attack prompts and equips comprehensive\nmetrics for systematic evaluation. We conduct experiments to compare knowledge\nediting approaches with previous baselines, indicating that knowledge editing\nhas the potential to efficiently detoxify LLMs with limited impact on general\nperformance. Then, we propose a simple yet effective baseline, dubbed\nDetoxifying with Intraoperative Neural Monitoring (DINM), to diminish the\ntoxicity of LLMs within a few tuning steps via only one instance. We further\nprovide an in-depth analysis of the internal mechanism for various detoxify\napproaches, demonstrating that previous methods like SFT and DPO may merely\nsuppress the activations of toxic parameters, while DINM mitigates the toxicity\nof the toxic parameters to a certain extent, making permanent adjustments. We\nhope that these insights could shed light on future work of developing\ndetoxifying approaches and the underlying knowledge mechanisms of LLMs. Code\nand benchmark are available at https://github.com/zjunlp/EasyEdit.\n","authors":["Mengru Wang","Ningyu Zhang","Ziwen Xu","Zekun Xi","Shumin Deng","Yunzhi Yao","Qishen Zhang","Linyi Yang","Jindong Wang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2403.14472v1.pdf","comment":"Ongoing work. Project website:\n https://zjunlp.github.io/project/SafeEdit Benchmark:\n https://huggingface.co/datasets/zjunlp/SafeEdit Code:\n https://github.com/zjunlp/EasyEdit"},{"id":"http://arxiv.org/abs/2309.15627v2","updated":"2024-03-21T15:17:10Z","published":"2023-09-27T12:58:18Z","title":"Neuromorphic Imaging and Classification with Graph Learning","summary":" Bio-inspired neuromorphic cameras asynchronously record pixel brightness\nchanges and generate sparse event streams. 
They can capture dynamic scenes with\nlittle motion blur and more details in extreme illumination conditions. Due to\nthe multidimensional address-event structure, most existing vision algorithms\ncannot properly handle asynchronous event streams. While several event\nrepresentations and processing methods have been developed to address such an\nissue, they are typically driven by a large number of events, leading to\nsubstantial overheads in runtime and memory. In this paper, we propose a new\ngraph representation of the event data and couple it with a Graph Transformer\nto perform accurate neuromorphic classification. Extensive experiments show\nthat our approach leads to better results and excels at the challenging\nrealistic situations where only a small number of events and limited\ncomputational resources are available, paving the way for neuromorphic\napplications embedded into mobile facilities.\n","authors":["Pei Zhang","Chutian Wang","Edmund Y. Lam"],"pdf_url":"https://arxiv.org/pdf/2309.15627v2.pdf","comment":"15 pages, 4 figures, and 7 tables. Accepted by Elsevier\n Neurocomputing"},{"id":"http://arxiv.org/abs/2403.08262v2","updated":"2024-03-21T15:15:28Z","published":"2024-03-13T05:25:49Z","title":"BiTT: Bi-directional Texture Reconstruction of Interacting Two Hands\n from a Single Image","summary":" Creating personalized hand avatars is important to offer a realistic\nexperience to users on AR / VR platforms. While most prior studies focused on\nreconstructing 3D hand shapes, some recent work has tackled the reconstruction\nof hand textures on top of shapes. However, these methods are often limited to\ncapturing pixels on the visible side of a hand, requiring diverse views of the\nhand in a video or multiple images as input. In this paper, we propose a novel\nmethod, BiTT(Bi-directional Texture reconstruction of Two hands), which is the\nfirst end-to-end trainable method for relightable, pose-free texture\nreconstruction of two interacting hands taking only a single RGB image, by\nthree novel components: 1) bi-directional (left $\\leftrightarrow$ right)\ntexture reconstruction using the texture symmetry of left / right hands, 2)\nutilizing a texture parametric model for hand texture recovery, and 3) the\noverall coarse-to-fine stage pipeline for reconstructing personalized texture\nof two interacting hands. BiTT first estimates the scene light condition and\nalbedo image from an input image, then reconstructs the texture of both hands\nthrough the texture parametric model and bi-directional texture reconstructor.\nIn experiments using InterHand2.6M and RGB2Hands datasets, our method\nsignificantly outperforms state-of-the-art hand texture reconstruction methods\nquantitatively and qualitatively. The code is available at\nhttps://github.com/yunminjin2/BiTT\n","authors":["Minje Kim","Tae-Kyun Kim"],"pdf_url":"https://arxiv.org/pdf/2403.08262v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.14468v1","updated":"2024-03-21T15:15:00Z","published":"2024-03-21T15:15:00Z","title":"AnyV2V: A Plug-and-Play Framework For Any Video-to-Video Editing Tasks","summary":" Video-to-video editing involves editing a source video along with additional\ncontrol (such as text prompts, subjects, or styles) to generate a new video\nthat aligns with the source video and the provided control. Traditional methods\nhave been constrained to certain editing types, limiting their ability to meet\nthe wide range of user demands. 
In this paper, we introduce AnyV2V, a novel\ntraining-free framework designed to simplify video editing into two primary\nsteps: (1) employing an off-the-shelf image editing model (e.g.\nInstructPix2Pix, InstantID, etc) to modify the first frame, (2) utilizing an\nexisting image-to-video generation model (e.g. I2VGen-XL) for DDIM inversion\nand feature injection. In the first stage, AnyV2V can plug in any existing\nimage editing tools to support an extensive array of video editing tasks.\nBeyond the traditional prompt-based editing methods, AnyV2V also can support\nnovel video editing tasks, including reference-based style transfer,\nsubject-driven editing, and identity manipulation, which were unattainable by\nprevious methods. In the second stage, AnyV2V can plug in any existing\nimage-to-video models to perform DDIM inversion and intermediate feature\ninjection to maintain the appearance and motion consistency with the source\nvideo. On the prompt-based editing, we show that AnyV2V can outperform the\nprevious best approach by 35\\% on prompt alignment, and 25\\% on human\npreference. On the three novel tasks, we show that AnyV2V also achieves a high\nsuccess rate. We believe AnyV2V will continue to thrive due to its ability to\nseamlessly integrate the fast-evolving image editing methods. Such\ncompatibility can help AnyV2V to increase its versatility to cater to diverse\nuser demands.\n","authors":["Max Ku","Cong Wei","Weiming Ren","Huan Yang","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2403.14468v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2403.14465v1","updated":"2024-03-21T15:13:36Z","published":"2024-03-21T15:13:36Z","title":"CathFlow: Self-Supervised Segmentation of Catheters in Interventional\n Ultrasound Using Optical Flow and Transformers","summary":" In minimally invasive endovascular procedures, contrast-enhanced angiography\nremains the most robust imaging technique. However, it is at the expense of the\npatient and clinician's health due to prolonged radiation exposure. As an\nalternative, interventional ultrasound has notable benefits such as being\nradiation-free, fast to deploy, and having a small footprint in the operating\nroom. Yet, ultrasound is hard to interpret, and highly prone to artifacts and\nnoise. Additionally, interventional radiologists must undergo extensive\ntraining before they become qualified to diagnose and treat patients\neffectively, leading to a shortage of staff, and a lack of open-source\ndatasets. In this work, we seek to address both problems by introducing a\nself-supervised deep learning architecture to segment catheters in longitudinal\nultrasound images, without demanding any labeled data. The network architecture\nbuilds upon AiAReSeg, a segmentation transformer built with the Attention in\nAttention mechanism, and is capable of learning feature changes across time and\nspace. To facilitate training, we used synthetic ultrasound data based on\nphysics-driven catheter insertion simulations, and translated the data into a\nunique CT-Ultrasound common domain, CACTUSS, to improve the segmentation\nperformance. We generated ground truth segmentation masks by computing the\noptical flow between adjacent frames using FlowNet2, and performed thresholding\nto obtain a binary map estimate. 
Finally, we validated our model on a test\ndataset, consisting of unseen synthetic data and images collected from silicon\naorta phantoms, thus demonstrating its potential for applications to clinical\ndata in the future.\n","authors":["Alex Ranne","Liming Kuang","Yordanka Velikova","Nassir Navab","Ferdinando Rodriguez y Baena"],"pdf_url":"https://arxiv.org/pdf/2403.14465v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2309.00903v2","updated":"2024-03-21T15:12:36Z","published":"2023-09-02T10:46:05Z","title":"An explainable three dimension framework to uncover learning patterns: A\n unified look in variable sulci recognition","summary":" Explainable AI is crucial in medical imaging. In the challenging field of\nneuroscience, visual topics present a high level of complexity, particularly\nwithin three-dimensional space. The application of neuroscience, which involves\nidentifying brain sulcal features from MRI, faces significant hurdles due to\nvarying annotation protocols among experts and the intricate three-dimension\nfunctionality of the brain. Consequently, traditional explainability approaches\nfall short in effectively validating and evaluating these networks. To address\nthis, we first present a mathematical formulation delineating various\ncategories of explanation needs across diverse computer vision tasks,\ncategorized into self-explanatory, semi-explanatory, non-explanatory, and\nnew-pattern learning applications based on the reliability of the validation\nprotocol. With respect to this mathematical formulation, we propose a 3D\nexplainability framework aimed at validating the outputs of deep learning\nnetworks in detecting the paracingulate sulcus an essential brain anatomical\nfeature. The framework integrates local 3D explanations, global explanations\nthrough dimensionality reduction, concatenated global explanations, and\nstatistical shape features, unveiling new insights into pattern learning. We\ntrained and tested two advanced 3D deep learning networks on the challenging\nTOP-OSLO dataset, significantly improving sulcus detection accuracy,\nparticularly on the left hemisphere. During evaluation with diverse annotation\nprotocols for this dataset, we highlighted the crucial role of an unbiased\nannotation process in achieving precise predictions and effective pattern\nlearning within our proposed 3D framework. The proposed framework not only\nannotates the variable sulcus but also uncovers hidden AI knowledge, promising\nto advance our understanding of brain anatomy and function.\n","authors":["Michail Mamalakis","Heloise de Vareilles","Atheer AI-Manea","Samantha C. Mitchell","Ingrid Arartz","Lynn Egeland Morch-Johnsen","Jane Garrison","Jon Simons","Pietro Lio","John Suckling","Graham Murray"],"pdf_url":"https://arxiv.org/pdf/2309.00903v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02041v2","updated":"2024-03-21T14:59:13Z","published":"2024-03-04T13:47:30Z","title":"A Generative Approach for Wikipedia-Scale Visual Entity Recognition","summary":" In this paper, we address web-scale visual entity recognition, specifically\nthe task of mapping a given query image to one of the 6 million existing\nentities in Wikipedia. 
One way of approaching a problem of such scale is using\ndual-encoder models (eg CLIP), where all the entity names and query images are\nembedded into a unified space, paving the way for an approximate k-NN search.\nAlternatively, it is also possible to re-purpose a captioning model to directly\ngenerate the entity names for a given image. In contrast, we introduce a novel\nGenerative Entity Recognition (GER) framework, which given an input image\nlearns to auto-regressively decode a semantic and discriminative ``code''\nidentifying the target entity. Our experiments demonstrate the efficacy of this\nGER paradigm, showcasing state-of-the-art performance on the challenging OVEN\nbenchmark. GER surpasses strong captioning, dual-encoder, visual matching and\nhierarchical classification baselines, affirming its advantage in tackling the\ncomplexities of web-scale recognition.\n","authors":["Mathilde Caron","Ahmet Iscen","Alireza Fathi","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2403.02041v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2401.00463v2","updated":"2024-03-21T14:57:25Z","published":"2023-12-31T11:38:50Z","title":"Analyzing Local Representations of Self-supervised Vision Transformers","summary":" In this paper, we present a comparative analysis of various self-supervised\nVision Transformers (ViTs), focusing on their local representative power.\nInspired by large language models, we examine the abilities of ViTs to perform\nvarious computer vision tasks with little to no fine-tuning. We design\nevaluation framework to analyze the quality of local, i.e.\\ patch-level,\nrepresentations in the context of few-shot semantic segmentation, instance\nidentification, object retrieval and tracking. We discover that contrastive\nlearning based methods like DINO produce more universal patch representations\nthat can be immediately applied for downstream tasks with no parameter tuning,\ncompared to masked image modeling. The embeddings learned using the latter\napproach, e.g. in masked autoencoders, have high variance features that harm\ndistance-based algorithms, such as k-NN, and do not contain useful information\nfor most downstream tasks. Furthermore, we demonstrate that removing these\nhigh-variance features enhances k-NN for MAE, as well as for its recent\nextension Scale-MAE. Finally, we find an object instance retrieval setting\nwhere DINOv2, a model pretrained on two orders of magnitude more data, falls\nshort of its less compute intensive counterpart DINO.\n","authors":["Ani Vanyan","Alvard Barseghyan","Hakob Tamazyan","Vahan Huroyan","Hrant Khachatrian","Martin Danelljan"],"pdf_url":"https://arxiv.org/pdf/2401.00463v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14447v1","updated":"2024-03-21T14:53:50Z","published":"2024-03-21T14:53:50Z","title":"Exploring 3D Human Pose Estimation and Forecasting from the Robot's\n Perspective: The HARPER Dataset","summary":" We introduce HARPER, a novel dataset for 3D body pose estimation and forecast\nin dyadic interactions between users and \\spot, the quadruped robot\nmanufactured by Boston Dynamics. The key-novelty is the focus on the robot's\nperspective, i.e., on the data captured by the robot's sensors. These make 3D\nbody pose analysis challenging because being close to the ground captures\nhumans only partially. The scenario underlying HARPER includes 15 actions, of\nwhich 10 involve physical contact between the robot and users. 
The Corpus\ncontains not only the recordings of the built-in stereo cameras of Spot, but\nalso those of a 6-camera OptiTrack system (all recordings are synchronized).\nThis leads to ground-truth skeletal representations with a precision lower than\na millimeter. In addition, the Corpus includes reproducible benchmarks on 3D\nHuman Pose Estimation, Human Pose Forecasting, and Collision Prediction, all\nbased on publicly available baseline approaches. This enables future HARPER\nusers to rigorously compare their results with those we provide in this work.\n","authors":["Andrea Avogaro. Andrea Toaiari","Federico Cunico","Xiangmin Xu","Haralambos Dafas","Alessandro Vinciarelli","Emma Li","Marco Cristani"],"pdf_url":"https://arxiv.org/pdf/2403.14447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07208v2","updated":"2024-03-21T14:52:55Z","published":"2024-01-14T06:07:07Z","title":"Enhanced Few-Shot Class-Incremental Learning via Ensemble Models","summary":" Few-shot class-incremental learning (FSCIL) aims to continually fit new\nclasses with limited training data, while maintaining the performance of\npreviously learned classes. The main challenges are overfitting the rare new\ntraining samples and forgetting old classes. While catastrophic forgetting has\nbeen extensively studied, the overfitting problem has attracted less attention\nin FSCIL. To tackle overfitting challenge, we design a new ensemble model\nframework cooperated with data augmentation to boost generalization. In this\nway, the enhanced model works as a library storing abundant features to\nguarantee fast adaptation to downstream tasks. Specifically, the multi-input\nmulti-output ensemble structure is applied with a spatial-aware data\naugmentation strategy, aiming at diversifying the feature extractor and\nalleviating overfitting in incremental sessions. Moreover, self-supervised\nlearning is also integrated to further improve the model generalization.\nComprehensive experimental results show that the proposed method can indeed\nmitigate the overfitting problem in FSCIL, and outperform the state-of-the-art\nmethods.\n","authors":["Mingli Zhu","Zihao Zhu","Sihong Chen","Chen Chen","Baoyuan Wu"],"pdf_url":"https://arxiv.org/pdf/2401.07208v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14442v1","updated":"2024-03-21T14:47:12Z","published":"2024-03-21T14:47:12Z","title":"RoDLA: Benchmarking the Robustness of Document Layout Analysis Models","summary":" Before developing a Document Layout Analysis (DLA) model in real-world\napplications, conducting comprehensive robustness testing is essential.\nHowever, the robustness of DLA models remains underexplored in the literature.\nTo address this, we are the first to introduce a robustness benchmark for DLA\nmodels, which includes 450K document images of three datasets. To cover\nrealistic corruptions, we propose a perturbation taxonomy with 36 common\ndocument perturbations inspired by real-world document processing.\nAdditionally, to better understand document perturbation impacts, we propose\ntwo metrics, Mean Perturbation Effect (mPE) for perturbation assessment and\nMean Robustness Degradation (mRD) for robustness evaluation. 
Furthermore, we\nintroduce a self-titled model, i.e., Robust Document Layout Analyzer (RoDLA),\nwhich improves attention mechanisms to boost extraction of robust features.\nExperiments on the proposed benchmarks (PubLayNet-P, DocLayNet-P, and\nM$^6$Doc-P) demonstrate that RoDLA obtains state-of-the-art mRD scores of\n115.7, 135.4, and 150.4, respectively. Compared to previous methods, RoDLA\nachieves notable improvements in mAP of +3.8%, +7.1% and +12.1%, respectively.\n","authors":["Yufan Chen","Jiaming Zhang","Kunyu Peng","Junwei Zheng","Ruiping Liu","Philip Torr","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2403.14442v1.pdf","comment":"Accepted by CVPR 2024. Project page:\n https://yufanchen96.github.io/projects/RoDLA"},{"id":"http://arxiv.org/abs/2403.14440v1","updated":"2024-03-21T14:45:54Z","published":"2024-03-21T14:45:54Z","title":"Analysing Diffusion Segmentation for Medical Images","summary":" Denoising Diffusion Probabilistic models have become increasingly popular due\nto their ability to offer probabilistic modeling and generate diverse outputs.\nThis versatility inspired their adaptation for image segmentation, where\nmultiple predictions of the model can produce segmentation results that not\nonly achieve high quality but also capture the uncertainty inherent in the\nmodel. Here, powerful architectures were proposed for improving diffusion\nsegmentation performance. However, there is a notable lack of analysis and\ndiscussions on the differences between diffusion segmentation and image\ngeneration, and thorough evaluations are missing that distinguish the\nimprovements these architectures provide for segmentation in general from their\nbenefit for diffusion segmentation specifically. In this work, we critically\nanalyse and discuss how diffusion segmentation for medical images differs from\ndiffusion image generation, with a particular focus on the training behavior.\nFurthermore, we conduct an assessment how proposed diffusion segmentation\narchitectures perform when trained directly for segmentation. Lastly, we\nexplore how different medical segmentation tasks influence the diffusion\nsegmentation behavior and the diffusion process could be adapted accordingly.\nWith these analyses, we aim to provide in-depth insights into the behavior of\ndiffusion segmentation that allow for a better design and evaluation of\ndiffusion segmentation methods in the future.\n","authors":["Mathias Öttl","Siyuan Mei","Frauke Wilm","Jana Steenpass","Matthias Rübner","Arndt Hartmann","Matthias Beckmann","Peter Fasching","Andreas Maier","Ramona Erber","Katharina Breininger"],"pdf_url":"https://arxiv.org/pdf/2403.14440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14439v1","updated":"2024-03-21T14:45:41Z","published":"2024-03-21T14:45:41Z","title":"Raw Instinct: Trust Your Classifiers and Skip the Conversion","summary":" Using RAW-images in computer vision problems is surprisingly underexplored\nconsidering that converting from RAW to RGB does not introduce any new capture\ninformation. In this paper, we show that a sufficiently advanced classifier can\nyield equivalent results on RAW input compared to RGB and present a new public\ndataset consisting of RAW images and the corresponding converted RGB images.\nClassifying images directly from RAW is attractive, as it allows for skipping\nthe conversion to RGB, lowering computation time significantly. 
Two CNN\nclassifiers are used to classify the images in both formats, confirming that\nclassification performance can indeed be preserved. We furthermore show that\nthe total computation time from RAW image data to classification results for\nRAW images can be up to 8.46 times faster than RGB. These results contribute to\nthe evidence found in related works, that using RAW images as direct input to\ncomputer vision algorithms looks very promising.\n","authors":["Christos Kantas","Bjørk Antoniussen","Mathias V. Andersen","Rasmus Munksø","Shobhit Kotnala","Simon B. Jensen","Andreas Møgelmose","Lau Nørgaard","Thomas B. Moeslund"],"pdf_url":"https://arxiv.org/pdf/2403.14439v1.pdf","comment":"https://www.kaggle.com/datasets/mathiasviborg/raw-instinct"},{"id":"http://arxiv.org/abs/2403.14435v1","updated":"2024-03-21T14:41:58Z","published":"2024-03-21T14:41:58Z","title":"Biased Binary Attribute Classifiers Ignore the Majority Classes","summary":" To visualize the regions of interest that classifiers base their decisions\non, different Class Activation Mapping (CAM) methods have been developed.\nHowever, all of these techniques target categorical classifiers only, though\nmost real-world tasks are binary classification. In this paper, we extend\ngradient-based CAM techniques to work with binary classifiers and visualize the\nactive regions for binary facial attribute classifiers. When training an\nunbalanced binary classifier on an imbalanced dataset, it is well-known that\nthe majority class, i.e. the class with many training samples, is mostly\npredicted much better than minority class with few training instances. In our\nexperiments on the CelebA dataset, we verify these results, when training an\nunbalanced classifier to extract 40 facial attributes simultaneously. One would\nexpect that the biased classifier has learned to extract features mainly for\nthe majority classes and that the proportional energy of the activations mainly\nreside in certain specific regions of the image where the attribute is located.\nHowever, we find very little regular activation for samples of majority\nclasses, while the active regions for minority classes seem mostly reasonable\nand overlap with our expectations. These results suggest that biased\nclassifiers mainly rely on bias activation for majority classes. When training\na balanced classifier on the imbalanced data by employing attribute-specific\nclass weights, majority and minority classes are classified similarly well and\nshow expected activations for almost all attributes\n","authors":["Xinyi Zhang","Johanna Sophie Bieri","Manuel Günther"],"pdf_url":"https://arxiv.org/pdf/2403.14435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14430v1","updated":"2024-03-21T14:37:50Z","published":"2024-03-21T14:37:50Z","title":"Ranking Distillation for Open-Ended Video Question Answering with\n Insufficient Labels","summary":" This paper focuses on open-ended video question answering, which aims to find\nthe correct answers from a large answer set in response to a video-related\nquestion. This is essentially a multi-label classification task, since a\nquestion may have multiple answers. However, due to annotation costs, the\nlabels in existing benchmarks are always extremely insufficient, typically one\nanswer per question. As a result, existing works tend to directly treat all the\nunlabeled answers as negative labels, leading to limited ability for\ngeneralization. 
In this work, we introduce a simple yet effective ranking\ndistillation framework (RADI) to mitigate this problem without additional\nmanual annotation. RADI employs a teacher model trained with incomplete labels\nto generate rankings for potential answers, which contain rich knowledge about\nlabel priority as well as label-associated visual cues, thereby enriching the\ninsufficient labeling information. To avoid overconfidence in the imperfect\nteacher model, we further present two robust and parameter-free ranking\ndistillation approaches: a pairwise approach which introduces adaptive soft\nmargins to dynamically refine the optimization constraints on various pairwise\nrankings, and a listwise approach which adopts sampling-based partial listwise\nlearning to resist the bias in teacher ranking. Extensive experiments on five\npopular benchmarks consistently show that both our pairwise and listwise RADIs\noutperform state-of-the-art methods. Further analysis demonstrates the\neffectiveness of our methods on the insufficient labeling problem.\n","authors":["Tianming Liang","Chaolei Tan","Beihao Xia","Wei-Shi Zheng","Jian-Fang Hu"],"pdf_url":"https://arxiv.org/pdf/2403.14430v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.14429v1","updated":"2024-03-21T14:36:59Z","published":"2024-03-21T14:36:59Z","title":"Style-Extracting Diffusion Models for Semi-Supervised Histopathology\n Segmentation","summary":" Deep learning-based image generation has seen significant advancements with\ndiffusion models, notably improving the quality of generated images. Despite\nthese developments, generating images with unseen characteristics beneficial\nfor downstream tasks has received limited attention. To bridge this gap, we\npropose Style-Extracting Diffusion Models, featuring two conditioning\nmechanisms. Specifically, we utilize 1) a style conditioning mechanism which\nallows to inject style information of previously unseen images during image\ngeneration and 2) a content conditioning which can be targeted to a downstream\ntask, e.g., layout for segmentation. We introduce a trainable style encoder to\nextract style information from images, and an aggregation block that merges\nstyle information from multiple style inputs. This architecture enables the\ngeneration of images with unseen styles in a zero-shot manner, by leveraging\nstyles from unseen images, resulting in more diverse generations. In this work,\nwe use the image layout as target condition and first show the capability of\nour method on a natural image dataset as a proof-of-concept. We further\ndemonstrate its versatility in histopathology, where we combine prior knowledge\nabout tissue composition and unannotated data to create diverse synthetic\nimages with known layouts. This allows us to generate additional synthetic data\nto train a segmentation network in a semi-supervised fashion. We verify the\nadded value of the generated images by showing improved segmentation results\nand lower performance variability between patients when synthetic images are\nincluded during segmentation training. 
Our code will be made publicly available\nat [LINK].\n","authors":["Mathias Öttl","Frauke Wilm","Jana Steenpass","Jingna Qiu","Matthias Rübner","Arndt Hartmann","Matthias Beckmann","Peter Fasching","Andreas Maier","Ramona Erber","Bernhard Kainz","Katharina Breininger"],"pdf_url":"https://arxiv.org/pdf/2403.14429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18467v3","updated":"2024-03-21T14:33:33Z","published":"2024-02-28T16:43:27Z","title":"Separate and Conquer: Decoupling Co-occurrence via Decomposition and\n Representation for Weakly Supervised Semantic Segmentation","summary":" Weakly supervised semantic segmentation (WSSS) with image-level labels aims\nto achieve segmentation tasks without dense annotations. However, attributed to\nthe frequent coupling of co-occurring objects and the limited supervision from\nimage-level labels, the challenging co-occurrence problem is widely present and\nleads to false activation of objects in WSSS. In this work, we devise a\n'Separate and Conquer' scheme SeCo to tackle this issue from dimensions of\nimage space and feature space. In the image space, we propose to 'separate' the\nco-occurring objects with image decomposition by subdividing images into\npatches. Importantly, we assign each patch a category tag from Class Activation\nMaps (CAMs), which spatially helps remove the co-context bias and guide the\nsubsequent representation. In the feature space, we propose to 'conquer' the\nfalse activation by enhancing semantic representation with multi-granularity\nknowledge contrast. To this end, a dual-teacher-single-student architecture is\ndesigned and tag-guided contrast is conducted, which guarantee the correctness\nof knowledge and further facilitate the discrepancy among co-contexts. We\nstreamline the multi-staged WSSS pipeline end-to-end and tackle this issue\nwithout external supervision. Extensive experiments are conducted, validating\nthe efficiency of our method and the superiority over previous single-staged\nand even multi-staged competitors on PASCAL VOC and MS COCO. Code is available\nat https://github.com/zwyang6/SeCo.git.\n","authors":["Zhiwei Yang","Kexue Fu","Minghong Duan","Linhao Qu","Shuo Wang","Zhijian Song"],"pdf_url":"https://arxiv.org/pdf/2402.18467v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2308.08325v2","updated":"2024-03-21T14:31:56Z","published":"2023-08-16T12:39:39Z","title":"Visually-Aware Context Modeling for News Image Captioning","summary":" News Image Captioning aims to create captions from news articles and images,\nemphasizing the connection between textual context and visual elements.\nRecognizing the significance of human faces in news images and the face-name\nco-occurrence pattern in existing datasets, we propose a face-naming module for\nlearning better name embeddings. Apart from names, which can be directly linked\nto an image area (faces), news image captions mostly contain context\ninformation that can only be found in the article. We design a retrieval\nstrategy using CLIP to retrieve sentences that are semantically close to the\nimage, mimicking human thought process of linking articles to images.\nFurthermore, to tackle the problem of the imbalanced proportion of article\ncontext and image context in captions, we introduce a simple yet effective\nmethod Contrasting with Language Model backbone (CoLaM) to the training\npipeline. We conduct extensive experiments to demonstrate the efficacy of our\nframework. 
We out-perform the previous state-of-the-art (without external data)\nby 7.97/5.80 CIDEr scores on GoodNews/NYTimes800k. Our code is available at\nhttps://github.com/tingyu215/VACNIC.\n","authors":["Tingyu Qu","Tinne Tuytelaars","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2308.08325v2.pdf","comment":"Accepted at NAACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2403.07570v2","updated":"2024-03-21T14:28:15Z","published":"2024-03-12T11:58:37Z","title":"An Active Contour Model Driven By the Hybrid Signed Pressure Function","summary":" Due to the influence of imaging equipment and complex imaging environments,\nmost images in daily life have features of intensity inhomogeneity and noise.\nTherefore, many scholars have designed many image segmentation algorithms to\naddress these issues. Among them, the active contour model is one of the most\neffective image segmentation algorithms.This paper proposes an active contour\nmodel driven by the hybrid signed pressure function that combines global and\nlocal information construction. Firstly, a new global region-based signed\npressure function is introduced by combining the average intensity of the inner\nand outer regions of the curve with the median intensity of the inner region of\nthe evolution curve. Then, the paper uses the energy differences between the\ninner and outer regions of the curve in the local region to design the signed\npressure function of the local term. Combine the two SPF function to obtain a\nnew signed pressure function and get the evolution equation of the new model.\nFinally, experiments and numerical analysis show that the model has excellent\nsegmentation performance for both intensity inhomogeneous images and noisy\nimages.\n","authors":["Jing Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.07570v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15980v2","updated":"2024-03-21T14:21:58Z","published":"2023-11-27T16:26:54Z","title":"Direct2.5: Diverse Text-to-3D Generation via Multi-view 2.5D Diffusion","summary":" Recent advances in generative AI have unveiled significant potential for the\ncreation of 3D content. However, current methods either apply a pre-trained 2D\ndiffusion model with the time-consuming score distillation sampling (SDS), or a\ndirect 3D diffusion model trained on limited 3D data losing generation\ndiversity. In this work, we approach the problem by employing a multi-view 2.5D\ndiffusion fine-tuned from a pre-trained 2D diffusion model. The multi-view 2.5D\ndiffusion directly models the structural distribution of 3D data, while still\nmaintaining the strong generalization ability of the original 2D diffusion\nmodel, filling the gap between 2D diffusion-based and direct 3D diffusion-based\nmethods for 3D content generation. During inference, multi-view normal maps are\ngenerated using the 2.5D diffusion, and a novel differentiable rasterization\nscheme is introduced to fuse the almost consistent multi-view normal maps into\na consistent 3D model. We further design a normal-conditioned multi-view image\ngeneration module for fast appearance generation given the 3D geometry. Our\nmethod is a one-pass diffusion process and does not require any SDS\noptimization as post-processing. We demonstrate through extensive experiments\nthat, our direct 2.5D generation with the specially-designed fusion scheme can\nachieve diverse, mode-seeking-free, and high-fidelity 3D content generation in\nonly 10 seconds. 
Project page: https://nju-3dv.github.io/projects/direct25.\n","authors":["Yuanxun Lu","Jingyang Zhang","Shiwei Li","Tian Fang","David McKinnon","Yanghai Tsin","Long Quan","Xun Cao","Yao Yao"],"pdf_url":"https://arxiv.org/pdf/2311.15980v2.pdf","comment":"CVPR 2024 camera ready, including more evaluations and discussions.\n Project webpage: https://nju-3dv.github.io/projects/direct25"},{"id":"http://arxiv.org/abs/2403.14421v1","updated":"2024-03-21T14:17:28Z","published":"2024-03-21T14:17:28Z","title":"DP-RDM: Adapting Diffusion Models to Private Domains Without Fine-Tuning","summary":" Text-to-image diffusion models have been shown to suffer from sample-level\nmemorization, possibly reproducing near-perfect replica of images that they are\ntrained on, which may be undesirable. To remedy this issue, we develop the\nfirst differentially private (DP) retrieval-augmented generation algorithm that\nis capable of generating high-quality image samples while providing provable\nprivacy guarantees. Specifically, we assume access to a text-to-image diffusion\nmodel trained on a small amount of public data, and design a DP retrieval\nmechanism to augment the text prompt with samples retrieved from a private\nretrieval dataset. Our \\emph{differentially private retrieval-augmented\ndiffusion model} (DP-RDM) requires no fine-tuning on the retrieval dataset to\nadapt to another domain, and can use state-of-the-art generative models to\ngenerate high-quality image samples while satisfying rigorous DP guarantees.\nFor instance, when evaluated on MS-COCO, our DP-RDM can generate samples with a\nprivacy budget of $\\epsilon=10$, while providing a $3.5$ point improvement in\nFID compared to public-only retrieval for up to $10,000$ queries.\n","authors":["Jonathan Lebensold","Maziar Sanjabi","Pietro Astolfi","Adriana Romero-Soriano","Kamalika Chaudhuri","Mike Rabbat","Chuan Guo"],"pdf_url":"https://arxiv.org/pdf/2403.14421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15583v7","updated":"2024-03-21T14:16:39Z","published":"2023-05-24T21:39:27Z","title":"Alleviating Exposure Bias in Diffusion Models through Sampling with\n Shifted Time Steps","summary":" Diffusion Probabilistic Models (DPM) have shown remarkable efficacy in the\nsynthesis of high-quality images. However, their inference process\ncharacteristically requires numerous, potentially hundreds, of iterative steps,\nwhich could exaggerate the problem of exposure bias due to the training and\ninference discrepancy. Previous work has attempted to mitigate this issue by\nperturbing inputs during training, which consequently mandates the retraining\nof the DPM. In this work, we conduct a systematic study of exposure bias in DPM\nand, intriguingly, we find that the exposure bias could be alleviated with a\nnovel sampling method that we propose, without retraining the model. We\nempirically and theoretically show that, during inference, for each backward\ntime step $t$ and corresponding state $\\hat{x}_t$, there might exist another\ntime step $t_s$ which exhibits superior coupling with $\\hat{x}_t$. Based on\nthis finding, we introduce a sampling method named Time-Shift Sampler. Our\nframework can be seamlessly integrated to existing sampling algorithms, such as\nDDPM, DDIM and other high-order solvers, inducing merely minimal additional\ncomputations. Experimental results show our method brings significant and\nconsistent improvements in FID scores on different datasets and sampling\nmethods. 
For example, integrating Time-Shift Sampler to F-PNDM yields a\nFID=3.88, achieving 44.49\\% improvements as compared to F-PNDM, on CIFAR-10\nwith 10 sampling steps, which is more performant than the vanilla DDIM with 100\nsampling steps. Our code is available at https://github.com/Mingxiao-Li/TS-DPM.\n","authors":["Mingxiao Li","Tingyu Qu","Ruicong Yao","Wei Sun","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2305.15583v7.pdf","comment":"Accepted at International Conference on Learning Representations\n (ICLR2024)"},{"id":"http://arxiv.org/abs/2403.14418v1","updated":"2024-03-21T14:06:38Z","published":"2024-03-21T14:06:38Z","title":"OA-CNNs: Omni-Adaptive Sparse CNNs for 3D Semantic Segmentation","summary":" The booming of 3D recognition in the 2020s began with the introduction of\npoint cloud transformers. They quickly overwhelmed sparse CNNs and became\nstate-of-the-art models, especially in 3D semantic segmentation. However,\nsparse CNNs are still valuable networks, due to their efficiency treasure, and\nease of application. In this work, we reexamine the design distinctions and\ntest the limits of what a sparse CNN can achieve. We discover that the key\ncredit to the performance difference is adaptivity. Specifically, we propose\ntwo key components, i.e., adaptive receptive fields (spatially) and adaptive\nrelation, to bridge the gap. This exploration led to the creation of\nOmni-Adaptive 3D CNNs (OA-CNNs), a family of networks that integrates a\nlightweight module to greatly enhance the adaptivity of sparse CNNs at minimal\ncomputational cost. Without any self-attention modules, OA-CNNs favorably\nsurpass point transformers in terms of accuracy in both indoor and outdoor\nscenes, with much less latency and memory cost. Notably, it achieves 76.1%,\n78.9%, and 70.6% mIoU on ScanNet v2, nuScenes, and SemanticKITTI validation\nbenchmarks respectively, while maintaining at most 5x better speed than\ntransformer counterparts. This revelation highlights the potential of pure\nsparse CNNs to outperform transformer-related networks.\n","authors":["Bohao Peng","Xiaoyang Wu","Li Jiang","Yukang Chen","Hengshuang Zhao","Zhuotao Tian","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2403.14418v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2402.11504v2","updated":"2024-03-21T14:02:48Z","published":"2024-02-18T08:26:22Z","title":"To use or not to use proprietary street view images in (health and\n place) research? That is the question","summary":" Computer vision-based analysis of street view imagery has transformative\nimpacts on environmental assessments. Interactive web services, particularly\nGoogle Street View, play an ever-important role in making imagery data\nubiquitous. Despite the technical ease of harnessing millions of Google Street\nView images, this article questions the current practices in using this\nproprietary data source from a European viewpoint. Our concern lies with\nGoogle's terms of service, which restrict bulk image downloads and the\ngeneration of street view image-based indices. 
To reconcile the challenge of\nadvancing society through groundbreaking research while maintaining data\nlicense agreements and legal integrity, we believe it is crucial to 1) include\nan author's statement on using proprietary street view data and the directives\nit entails, 2) negotiate academic-specific license to democratize Google Street\nView data access, and 3) adhere to open data principles and utilize open image\nsources for future research.\n","authors":["Marco Helbich","Matthew Danish","SM Labib","Britta Ricker"],"pdf_url":"https://arxiv.org/pdf/2402.11504v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14412v1","updated":"2024-03-21T13:59:00Z","published":"2024-03-21T13:59:00Z","title":"CombiNeRF: A Combination of Regularization Techniques for Few-Shot\n Neural Radiance Field View Synthesis","summary":" Neural Radiance Fields (NeRFs) have shown impressive results for novel view\nsynthesis when a sufficiently large amount of views are available. When dealing\nwith few-shot settings, i.e. with a small set of input views, the training\ncould overfit those views, leading to artifacts and geometric and chromatic\ninconsistencies in the resulting rendering. Regularization is a valid solution\nthat helps NeRF generalization. On the other hand, each of the most recent NeRF\nregularization techniques aim to mitigate a specific rendering problem.\nStarting from this observation, in this paper we propose CombiNeRF, a framework\nthat synergically combines several regularization techniques, some of them\nnovel, in order to unify the benefits of each. In particular, we regularize\nsingle and neighboring rays distributions and we add a smoothness term to\nregularize near geometries. After these geometric approaches, we propose to\nexploit Lipschitz regularization to both NeRF density and color networks and to\nuse encoding masks for input features regularization. We show that CombiNeRF\noutperforms the state-of-the-art methods with few-shot settings in several\npublicly available datasets. We also present an ablation study on the LLFF and\nNeRF-Synthetic datasets that support the choices made. We release with this\npaper the open-source implementation of our framework.\n","authors":["Matteo Bonotto","Luigi Sarrocco","Daniele Evangelista","Marco Imperoli","Alberto Pretto"],"pdf_url":"https://arxiv.org/pdf/2403.14412v1.pdf","comment":"This paper has been accepted for publication at the 2024\n International Conference on 3D Vision (3DV)"},{"id":"http://arxiv.org/abs/2403.14410v1","updated":"2024-03-21T13:57:45Z","published":"2024-03-21T13:57:45Z","title":"GLC++: Source-Free Universal Domain Adaptation through Global-Local\n Clustering and Contrastive Affinity Learning","summary":" Deep neural networks often exhibit sub-optimal performance under covariate\nand category shifts. Source-Free Domain Adaptation (SFDA) presents a promising\nsolution to this dilemma, yet most SFDA approaches are restricted to closed-set\nscenarios. In this paper, we explore Source-Free Universal Domain Adaptation\n(SF-UniDA) aiming to accurately classify \"known\" data belonging to common\ncategories and segregate them from target-private \"unknown\" data. 
We propose a\nnovel Global and Local Clustering (GLC) technique, which comprises an adaptive\none-vs-all global clustering algorithm to discern between target classes,\ncomplemented by a local k-NN clustering strategy to mitigate negative transfer.\nDespite the effectiveness, the inherent closed-set source architecture leads to\nuniform treatment of \"unknown\" data, impeding the identification of distinct\n\"unknown\" categories. To address this, we evolve GLC to GLC++, integrating a\ncontrastive affinity learning strategy. We examine the superiority of GLC and\nGLC++ across multiple benchmarks and category shift scenarios. Remarkably, in\nthe most challenging open-partial-set scenarios, GLC and GLC++ surpass GATE by\n16.7% and 18.6% in H-score on VisDA, respectively. GLC++ enhances the novel\ncategory clustering accuracy of GLC by 4.3% in open-set scenarios on\nOffice-Home. Furthermore, the introduced contrastive learning strategy not only\nenhances GLC but also significantly facilitates existing methodologies.\n","authors":["Sanqing Qu","Tianpei Zou","Florian Röhrbein","Cewu Lu","Guang Chen","Dacheng Tao","Changjun Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.14410v1.pdf","comment":"This is a substantial extension of the CVPR 2023 paper \"Upcycling\n Models under Domain and Category Shift\""},{"id":"http://arxiv.org/abs/2312.02914v3","updated":"2024-03-21T13:53:48Z","published":"2023-12-05T17:39:19Z","title":"Unsupervised Video Domain Adaptation with Masked Pre-Training and\n Collaborative Self-Training","summary":" In this work, we tackle the problem of unsupervised domain adaptation (UDA)\nfor video action recognition. Our approach, which we call UNITE, uses an image\nteacher model to adapt a video student model to the target domain. UNITE first\nemploys self-supervised pre-training to promote discriminative feature learning\non target domain videos using a teacher-guided masked distillation objective.\nWe then perform self-training on masked target data, using the video student\nmodel and image teacher model together to generate improved pseudolabels for\nunlabeled target videos. Our self-training process successfully leverages the\nstrengths of both models to achieve strong transfer performance across domains.\nWe evaluate our approach on multiple video domain adaptation benchmarks and\nobserve significant improvements upon previously reported results.\n","authors":["Arun Reddy","William Paul","Corban Rivera","Ketul Shah","Celso M. de Melo","Rama Chellappa"],"pdf_url":"https://arxiv.org/pdf/2312.02914v3.pdf","comment":"Accepted at CVPR 2024. 13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2211.10938v2","updated":"2024-03-21T13:51:10Z","published":"2022-11-20T10:30:58Z","title":"AI-KD: Adversarial learning and Implicit regularization for\n self-Knowledge Distillation","summary":" We present a novel adversarial penalized self-knowledge distillation method,\nnamed adversarial learning and implicit regularization for self-knowledge\ndistillation (AI-KD), which regularizes the training procedure by adversarial\nlearning and implicit distillations. Our model not only distills the\ndeterministic and progressive knowledge which are from the pre-trained and\nprevious epoch predictive probabilities but also transfers the knowledge of the\ndeterministic predictive distributions using adversarial learning. The\nmotivation is that the self-knowledge distillation methods regularize the\npredictive probabilities with soft targets, but the exact distributions may be\nhard to predict. 
Our method deploys a discriminator to distinguish the\ndistributions between the pre-trained and student models while the student\nmodel is trained to fool the discriminator in the trained procedure. Thus, the\nstudent model not only can learn the pre-trained model's predictive\nprobabilities but also align the distributions between the pre-trained and\nstudent models. We demonstrate the effectiveness of the proposed method with\nnetwork architectures on multiple datasets and show the proposed method\nachieves better performance than state-of-the-art methods.\n","authors":["Hyungmin Kim","Sungho Suh","Sunghyun Baek","Daehwan Kim","Daun Jeong","Hansang Cho","Junmo Kim"],"pdf_url":"https://arxiv.org/pdf/2211.10938v2.pdf","comment":"Accepted to KBS"},{"id":"http://arxiv.org/abs/2403.14401v1","updated":"2024-03-21T13:49:42Z","published":"2024-03-21T13:49:42Z","title":"Pensieve: Retrospect-then-Compare Mitigates Visual Hallucination","summary":" Multi-modal Large Language Models (MLLMs) demonstrate remarkable success\nacross various vision-language tasks. However, they suffer from visual\nhallucination, where the generated responses diverge from the provided image.\nAre MLLMs completely oblivious to accurate visual cues when they hallucinate?\nOur investigation reveals that the visual branch may simultaneously advocate\nboth accurate and non-existent content. To address this issue, we propose\nPensieve, a training-free method inspired by our observation that analogous\nvisual hallucinations can arise among images sharing common semantic and\nappearance characteristics. During inference, Pensieve enables MLLMs to\nretrospect relevant images as references and compare them with the test image.\nThis paradigm assists MLLMs in downgrading hallucinatory content mistakenly\nsupported by the visual input. Experiments on Whoops, MME, POPE, and LLaVA\nBench demonstrate the efficacy of Pensieve in mitigating visual hallucination,\nsurpassing other advanced decoding strategies. Additionally, Pensieve aids\nMLLMs in identifying details in the image and enhancing the specificity of\nimage descriptions.\n","authors":["Dingchen Yang","Bowen Cao","Guang Chen","Changjun Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.14401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14392v1","updated":"2024-03-21T13:33:00Z","published":"2024-03-21T13:33:00Z","title":"A Bag of Tricks for Few-Shot Class-Incremental Learning","summary":" We present a bag of tricks framework for few-shot class-incremental learning\n(FSCIL), which is a challenging form of continual learning that involves\ncontinuous adaptation to new tasks with limited samples. FSCIL requires both\nstability and adaptability, i.e., preserving proficiency in previously learned\ntasks while learning new ones. Our proposed bag of tricks brings together eight\nkey and highly influential techniques that improve stability, adaptability, and\noverall performance under a unified framework for FSCIL. We organize these\ntricks into three categories: stability tricks, adaptability tricks, and\ntraining tricks. Stability tricks aim to mitigate the forgetting of previously\nlearned classes by enhancing the separation between the embeddings of learned\nclasses and minimizing interference when learning new ones. On the other hand,\nadaptability tricks focus on the effective learning of new classes. Finally,\ntraining tricks improve the overall performance without compromising stability\nor adaptability. 
We perform extensive experiments on three benchmark datasets,\nCIFAR-100, CUB-200, and miniIMageNet, to evaluate the impact of our proposed\nframework. Our detailed analysis shows that our approach substantially improves\nboth stability and adaptability, establishing a new state-of-the-art by\noutperforming prior works in the area. We believe our method provides a go-to\nsolution and establishes a robust baseline for future research in this area.\n","authors":["Shuvendu Roy","Chunjong Park","Aldi Fahrezi","Ali Etemad"],"pdf_url":"https://arxiv.org/pdf/2403.14392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12648v3","updated":"2024-03-21T13:23:44Z","published":"2024-01-23T10:56:01Z","title":"Consistency Enhancement-Based Deep Multiview Clustering via Contrastive\n Learning","summary":" Multiview clustering (MVC) segregates data samples into meaningful clusters\nby synthesizing information across multiple views. Moreover, deep\nlearning-based methods have demonstrated their strong feature learning\ncapabilities in MVC scenarios. However, effectively generalizing feature\nrepresentations while maintaining consistency is still an intractable problem.\nIn addition, most existing deep clustering methods based on contrastive\nlearning overlook the consistency of the clustering representations during the\nclustering process. In this paper, we show how the above problems can be\novercome and propose a consistent enhancement-based deep MVC method via\ncontrastive learning (CCEC). Specifically, semantic connection blocks are\nincorporated into a feature representation to preserve the consistent\ninformation among multiple views. Furthermore, the representation process for\nclustering is enhanced through spectral clustering, and the consistency across\nmultiple views is improved. Experiments conducted on five datasets demonstrate\nthe effectiveness and superiority of our method in comparison with the\nstate-of-the-art (SOTA) methods. The code for this method can be accessed at\nhttps://anonymous.4open.science/r/CCEC-E84E/.\n","authors":["Hao Yang","Hua Mao","Wai Lok Woo","Jie Chen","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2401.12648v3.pdf","comment":"There are multiple errors that need to be corrected, including some\n formulas and concept descriptions. We will re upload the paper after the\n modifications are completed"},{"id":"http://arxiv.org/abs/2403.14379v1","updated":"2024-03-21T13:12:33Z","published":"2024-03-21T13:12:33Z","title":"Tensor network compressibility of convolutional models","summary":" Convolutional neural networks (CNNs) represent one of the most widely used\nneural network architectures, showcasing state-of-the-art performance in\ncomputer vision tasks. Although larger CNNs generally exhibit higher accuracy,\ntheir size can be effectively reduced by \"tensorization\" while maintaining\naccuracy. Tensorization consists of replacing the convolution kernels with\ncompact decompositions such as Tucker, Canonical Polyadic decompositions, or\nquantum-inspired decompositions such as matrix product states, and directly\ntraining the factors in the decompositions to bias the learning towards\nlow-rank decompositions. But why doesn't tensorization seem to impact the\naccuracy adversely? We explore this by assessing how truncating the convolution\nkernels of dense (untensorized) CNNs impact their accuracy. Specifically, we\ntruncated the kernels of (i) a vanilla four-layer CNN and (ii) ResNet-50\npre-trained for image classification on CIFAR-10 and CIFAR-100 datasets. 
We\nfound that kernels (especially those inside deeper layers) could often be\ntruncated along several cuts resulting in significant loss in kernel norm but\nnot in classification accuracy. This suggests that such ``correlation\ncompression'' (underlying tensorization) is an intrinsic feature of how\ninformation is encoded in dense CNNs. We also found that aggressively truncated\nmodels could often recover the pre-truncation accuracy after only a few epochs\nof re-training, suggesting that compressing the internal correlations of\nconvolution layers does not often transport the model to a worse minimum. Our\nresults can be applied to tensorize and compress CNN models more effectively.\n","authors":["Sukhbinder Singh","Saeed S. Jahromi","Roman Orus"],"pdf_url":"https://arxiv.org/pdf/2403.14379v1.pdf","comment":"20 pages, 21 images"},{"id":"http://arxiv.org/abs/2403.14376v1","updated":"2024-03-21T13:06:57Z","published":"2024-03-21T13:06:57Z","title":"InfNeRF: Towards Infinite Scale NeRF Rendering with O(log n) Space\n Complexity","summary":" The conventional mesh-based Level of Detail (LoD) technique, exemplified by\napplications such as Google Earth and many game engines, exhibits the\ncapability to holistically represent a large scene even the Earth, and achieves\nrendering with a space complexity of O(log n). This constrained data\nrequirement not only enhances rendering efficiency but also facilitates dynamic\ndata fetching, thereby enabling a seamless 3D navigation experience for users.\nIn this work, we extend this proven LoD technique to Neural Radiance Fields\n(NeRF) by introducing an octree structure to represent the scenes in different\nscales. This innovative approach provides a mathematically simple and elegant\nrepresentation with a rendering space complexity of O(log n), aligned with the\nefficiency of mesh-based LoD techniques. We also present a novel training\nstrategy that maintains a complexity of O(n). This strategy allows for parallel\ntraining with minimal overhead, ensuring the scalability and efficiency of our\nproposed method. Our contribution is not only in extending the capabilities of\nexisting techniques but also in establishing a foundation for scalable and\nefficient large-scale scene representation using NeRF and octree structures.\n","authors":["Jiabin Liang","Lanqing Zhang","Zhuoran Zhao","Xiangyu Xu"],"pdf_url":"https://arxiv.org/pdf/2403.14376v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13307v2","updated":"2024-03-21T13:06:49Z","published":"2024-03-20T05:11:10Z","title":"LaserHuman: Language-guided Scene-aware Human Motion Generation in Free\n Environment","summary":" Language-guided scene-aware human motion generation has great significance\nfor entertainment and robotics. In response to the limitations of existing\ndatasets, we introduce LaserHuman, a pioneering dataset engineered to\nrevolutionize Scene-Text-to-Motion research. LaserHuman stands out with its\ninclusion of genuine human motions within 3D environments, unbounded free-form\nnatural language descriptions, a blend of indoor and outdoor scenarios, and\ndynamic, ever-changing scenes. 
Diverse modalities of capture data and rich\nannotations present great opportunities for the research of conditional motion\ngeneration, and can also facilitate the development of real-life applications.\nMoreover, to generate semantically consistent and physically plausible human\nmotions, we propose a multi-conditional diffusion model, which is simple but\neffective, achieving state-of-the-art performance on existing datasets.\n","authors":["Peishan Cong","Ziyi Wang","Zhiyang Dou","Yiming Ren","Wei Yin","Kai Cheng","Yujing Sun","Xiaoxiao Long","Xinge Zhu","Yuexin Ma"],"pdf_url":"https://arxiv.org/pdf/2403.13307v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10404v6","updated":"2024-03-21T12:59:04Z","published":"2023-10-16T13:49:46Z","title":"LLM4SGG: Large Language Model for Weakly Supervised Scene Graph\n Generation","summary":" Weakly-Supervised Scene Graph Generation (WSSGG) research has recently\nemerged as an alternative to the fully-supervised approach that heavily relies\non costly annotations. In this regard, studies on WSSGG have utilized image\ncaptions to obtain unlocalized triplets while primarily focusing on grounding\nthe unlocalized triplets over image regions. However, they have overlooked the\ntwo issues involved in the triplet formation process from the captions: 1)\nSemantic over-simplification issue arises when extracting triplets from\ncaptions, where fine-grained predicates in captions are undesirably converted\ninto coarse-grained predicates, resulting in a long-tailed predicate\ndistribution, and 2) Low-density scene graph issue arises when aligning the\ntriplets in the caption with entity/predicate classes of interest, where many\ntriplets are discarded and not used in training, leading to insufficient\nsupervision. To tackle the two issues, we propose a new approach, i.e., Large\nLanguage Model for weakly-supervised SGG (LLM4SGG), where we mitigate the two\nissues by leveraging the LLM's in-depth understanding of language and reasoning\nability during the extraction of triplets from captions and alignment of\nentity/predicate classes with target data. To further engage the LLM in these\nprocesses, we adopt the idea of Chain-of-Thought and the in-context few-shot\nlearning strategy. To validate the effectiveness of LLM4SGG, we conduct\nextensive experiments on Visual Genome and GQA datasets, showing significant\nimprovements in both Recall@K and mean Recall@K compared to the\nstate-of-the-art WSSGG methods. A further appeal is that LLM4SGG is\ndata-efficient, enabling effective model training with a small amount of\ntraining images.\n","authors":["Kibum Kim","Kanghoon Yoon","Jaehyeong Jeon","Yeonjun In","Jinyoung Moon","Donghyun Kim","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2310.10404v6.pdf","comment":"8 pages; CVPR 2024"},{"id":"http://arxiv.org/abs/2403.14370v1","updated":"2024-03-21T12:57:30Z","published":"2024-03-21T12:57:30Z","title":"SyncTweedies: A General Generative Framework Based on Synchronized\n Diffusions","summary":" We introduce a general framework for generating diverse visual content,\nincluding ambiguous images, panorama images, mesh textures, and Gaussian splat\ntextures, by synchronizing multiple diffusion processes. We present exhaustive\ninvestigation into all possible scenarios for synchronizing multiple diffusion\nprocesses through a canonical space and analyze their characteristics across\napplications. 
In doing so, we reveal a previously unexplored case: averaging\nthe outputs of Tweedie's formula while conducting denoising in multiple\ninstance spaces. This case also provides the best quality with the widest\napplicability to downstream tasks. We name this case SyncTweedies. In our\nexperiments generating visual content aforementioned, we demonstrate the\nsuperior quality of generation by SyncTweedies compared to other\nsynchronization methods, optimization-based and iterative-update-based methods.\n","authors":["Jaihoon Kim","Juil Koo","Kyeongmin Yeo","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2403.14370v1.pdf","comment":"Project page: https://synctweedies.github.io/"},{"id":"http://arxiv.org/abs/2312.12274v2","updated":"2024-03-21T12:51:31Z","published":"2023-12-19T15:56:19Z","title":"Intrinsic Image Diffusion for Indoor Single-view Material Estimation","summary":" We present Intrinsic Image Diffusion, a generative model for appearance\ndecomposition of indoor scenes. Given a single input view, we sample multiple\npossible material explanations represented as albedo, roughness, and metallic\nmaps. Appearance decomposition poses a considerable challenge in computer\nvision due to the inherent ambiguity between lighting and material properties\nand the lack of real datasets. To address this issue, we advocate for a\nprobabilistic formulation, where instead of attempting to directly predict the\ntrue material properties, we employ a conditional generative model to sample\nfrom the solution space. Furthermore, we show that utilizing the strong learned\nprior of recent diffusion models trained on large-scale real-world images can\nbe adapted to material estimation and highly improves the generalization to\nreal images. Our method produces significantly sharper, more consistent, and\nmore detailed materials, outperforming state-of-the-art methods by $1.5dB$ on\nPSNR and by $45\\%$ better FID score on albedo prediction. We demonstrate the\neffectiveness of our approach through experiments on both synthetic and\nreal-world datasets.\n","authors":["Peter Kocsis","Vincent Sitzmann","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2312.12274v2.pdf","comment":"Project page: https://peter-kocsis.github.io/IntrinsicImageDiffusion/\n Video: https://youtu.be/lz0meJlj5cA"},{"id":"http://arxiv.org/abs/2403.14368v1","updated":"2024-03-21T12:50:15Z","published":"2024-03-21T12:50:15Z","title":"Enabling Visual Composition and Animation in Unsupervised Video\n Generation","summary":" In this work we propose a novel method for unsupervised controllable video\ngeneration. Once trained on a dataset of unannotated videos, at inference our\nmodel is capable of both composing scenes of predefined object parts and\nanimating them in a plausible and controlled way. This is achieved by\nconditioning video generation on a randomly selected subset of local\npre-trained self-supervised features during training. We call our model CAGE\nfor visual Composition and Animation for video GEneration. We conduct a series\nof experiments to demonstrate capabilities of CAGE in various settings. 
Project\nwebsite: https://araachie.github.io/cage.\n","authors":["Aram Davtyan","Sepehr Sameni","Björn Ommer","Paolo Favaro"],"pdf_url":"https://arxiv.org/pdf/2403.14368v1.pdf","comment":"Project website: https://araachie.github.io/cage"},{"id":"http://arxiv.org/abs/2403.14366v1","updated":"2024-03-21T12:49:32Z","published":"2024-03-21T12:49:32Z","title":"SurroundSDF: Implicit 3D Scene Understanding Based on Signed Distance\n Field","summary":" Vision-centric 3D environment understanding is both vital and challenging for\nautonomous driving systems. Recently, object-free methods have attracted\nconsiderable attention. Such methods perceive the world by predicting the\nsemantics of discrete voxel grids but fail to construct continuous and accurate\nobstacle surfaces. To this end, in this paper, we propose SurroundSDF to\nimplicitly predict the signed distance field (SDF) and semantic field for the\ncontinuous perception from surround images. Specifically, we introduce a\nquery-based approach and utilize SDF constrained by the Eikonal formulation to\naccurately describe the surfaces of obstacles. Furthermore, considering the\nabsence of precise SDF ground truth, we propose a novel weakly supervised\nparadigm for SDF, referred to as the Sandwich Eikonal formulation, which\nemphasizes applying correct and dense constraints on both sides of the surface,\nthereby enhancing the perceptual accuracy of the surface. Experiments suggest\nthat our method achieves SOTA for both occupancy prediction and 3D scene\nreconstruction tasks on the nuScenes dataset.\n","authors":["Lizhe Liu","Bohua Wang","Hongwei Xie","Daqi Liu","Li Liu","Zhiqiang Tian","Kuiyuan Yang","Bing Wang"],"pdf_url":"https://arxiv.org/pdf/2403.14366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14362v1","updated":"2024-03-21T12:45:01Z","published":"2024-03-21T12:45:01Z","title":"Less but Better: Enabling Generalized Zero-shot Learning Towards Unseen\n Domains by Intrinsic Learning from Redundant LLM Semantics","summary":" Generalized zero-shot learning (GZSL) focuses on recognizing seen and unseen\nclasses against domain shift problem (DSP) where data of unseen classes may be\nmisclassified as seen classes. However, existing GZSL is still limited to seen\ndomains. In the current work, we pioneer cross-domain GZSL (CDGZSL) which\naddresses GZSL towards unseen domains. Different from existing GZSL methods\nwhich alleviate DSP by generating features of unseen classes with semantics,\nCDGZSL needs to construct a common feature space across domains and acquire the\ncorresponding intrinsic semantics shared among domains to transfer from seen to\nunseen domains. Considering the information asymmetry problem caused by\nredundant class semantics annotated with large language models (LLMs), we\npresent Meta Domain Alignment Semantic Refinement (MDASR). Technically, MDASR\nconsists of two parts: Inter-class Similarity Alignment (ISA), which eliminates\nthe non-intrinsic semantics not shared across all domains under the guidance of\ninter-class feature relationships, and Unseen-class Meta Generation (UMG),\nwhich preserves intrinsic semantics to maintain connectivity between seen and\nunseen classes by simulating feature generation. MDASR effectively aligns the\nredundant semantic space with the common feature space, mitigating the\ninformation asymmetry in CDGZSL. 
The effectiveness of MDASR is demonstrated on\nthe Office-Home and Mini-DomainNet, and we have shared the LLM-based semantics\nfor these datasets as the benchmark.\n","authors":["Jiaqi Yue","Jiancheng Zhao","Chunhui Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.14362v1.pdf","comment":"This work is submitted to IEEE TNNLS and is subject to IEEE copyright"},{"id":"http://arxiv.org/abs/2311.14758v2","updated":"2024-03-21T12:43:32Z","published":"2023-11-23T15:57:41Z","title":"Point2RBox: Combine Knowledge from Synthetic Visual Patterns for\n End-to-end Oriented Object Detection with Single Point Supervision","summary":" With the rapidly increasing demand for oriented object detection (OOD),\nrecent research involving weakly-supervised detectors for learning rotated box\n(RBox) from the horizontal box (HBox) has attracted more and more attention. In\nthis paper, we explore a more challenging yet label-efficient setting, namely\nsingle point-supervised OOD, and present our approach called Point2RBox.\nSpecifically, we propose to leverage two principles: 1) Synthetic pattern\nknowledge combination: By sampling around each labeled point on the image, we\nspread the object feature to synthetic visual patterns with known boxes to\nprovide the knowledge for box regression. 2) Transform self-supervision: With a\ntransformed input image (e.g. scaled/rotated), the output RBoxes are trained to\nfollow the same transformation so that the network can perceive the relative\nsize/rotation between objects. The detector is further enhanced by a few\ndevised techniques to cope with peripheral issues, e.g. the anchor/layer\nassignment as the size of the object is not available in our point supervision\nsetting. To our best knowledge, Point2RBox is the first end-to-end solution for\npoint-supervised OOD. In particular, our method uses a lightweight paradigm,\nyet it achieves a competitive performance among point-supervised alternatives,\n41.05%/27.62%/80.01% on DOTA/DIOR/HRSC datasets.\n","authors":["Yi Yu","Xue Yang","Qingyun Li","Feipeng Da","Jifeng Dai","Yu Qiao","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2311.14758v2.pdf","comment":"10 pages, 3 figures, 5 tables, code:\n https://github.com/yuyi1005/point2rbox-mmrotate"},{"id":"http://arxiv.org/abs/2403.14359v1","updated":"2024-03-21T12:40:41Z","published":"2024-03-21T12:40:41Z","title":"Varroa destructor detection on honey bees using hyperspectral imagery","summary":" Hyperspectral (HS) imagery in agriculture is becoming increasingly common.\nThese images have the advantage of higher spectral resolution. Advanced\nspectral processing techniques are required to unlock the information potential\nin these HS images. The present paper introduces a method rooted in\nmultivariate statistics designed to detect parasitic Varroa destructor mites on\nthe body of western honey bee Apis mellifera, enabling easier and continuous\nmonitoring of the bee hives. The methodology explores unsupervised (K-means++)\nand recently developed supervised (Kernel Flows - Partial Least-Squares,\nKF-PLS) methods for parasitic identification. 
Additionally, in light of the\nemergence of custom-band multispectral cameras, the present research outlines a\nstrategy for identifying the specific wavelengths necessary for effective\nbee-mite separation, suitable for implementation in a custom-band camera.\nIllustrated with a real-case dataset, our findings demonstrate that as few as\nfour spectral bands are sufficient for accurate parasite identification.\n","authors":["Zina-Sabrina Duma","Tomas Zemcik","Simon Bilik","Tuomas Sihvonen","Peter Honec","Satu-Pia Reinikainen","Karel Horak"],"pdf_url":"https://arxiv.org/pdf/2403.14359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14354v1","updated":"2024-03-21T12:29:26Z","published":"2024-03-21T12:29:26Z","title":"LDTR: Transformer-based Lane Detection with Anchor-chain Representation","summary":" Despite recent advances in lane detection methods, scenarios with limited- or\nno-visual-clue of lanes due to factors such as lighting conditions and\nocclusion remain challenging and crucial for automated driving. Moreover,\ncurrent lane representations require complex post-processing and struggle with\nspecific instances. Inspired by the DETR architecture, we propose LDTR, a\ntransformer-based model to address these issues. Lanes are modeled with a novel\nanchor-chain, regarding a lane as a whole from the beginning, which enables\nLDTR to handle special lanes inherently. To enhance lane instance perception,\nLDTR incorporates a novel multi-referenced deformable attention module to\ndistribute attention around the object. Additionally, LDTR incorporates two\nline IoU algorithms to improve convergence efficiency and employs a Gaussian\nheatmap auxiliary branch to enhance model representation capability during\ntraining. To evaluate lane detection models, we rely on Frechet distance,\nparameterized F1-score, and additional synthetic metrics. Experimental results\ndemonstrate that LDTR achieves state-of-the-art performance on well-known\ndatasets.\n","authors":["Zhongyu Yang","Chen Shen","Wei Shao","Tengfei Xing","Runbo Hu","Pengfei Xu","Hua Chai","Ruini Xue"],"pdf_url":"https://arxiv.org/pdf/2403.14354v1.pdf","comment":"Accepted by CVM 2024 and CVMJ. 16 pages, 14 figures"},{"id":"http://arxiv.org/abs/2403.14350v1","updated":"2024-03-21T12:25:17Z","published":"2024-03-21T12:25:17Z","title":"Annotation-Efficient Polyp Segmentation via Active Learning","summary":" Deep learning-based techniques have proven effective in polyp segmentation\ntasks when provided with sufficient pixel-wise labeled data. However, the high\ncost of manual annotation has created a bottleneck for model generalization. To\nminimize annotation costs, we propose a deep active learning framework for\nannotation-efficient polyp segmentation. In practice, we measure the\nuncertainty of each sample by examining the similarity between features masked\nby the prediction map of the polyp and the background area. Since the\nsegmentation model tends to perform weak in samples with indistinguishable\nfeatures of foreground and background areas, uncertainty sampling facilitates\nthe fitting of under-learning data. Furthermore, clustering image-level\nfeatures weighted by uncertainty identify samples that are both uncertain and\nrepresentative. To enhance the selectivity of the active selection strategy, we\npropose a novel unsupervised feature discrepancy learning mechanism. The\nselection strategy and feature optimization work in tandem to achieve optimal\nperformance with a limited annotation budget. 
Extensive experimental results\nhave demonstrated that our proposed method achieved state-of-the-art\nperformance compared to other competitors on both a public dataset and a\nlarge-scale in-house dataset.\n","authors":["Duojun Huang","Xinyu Xiong","De-Jun Fan","Feng Gao","Xiao-Jian Wu","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2403.14350v1.pdf","comment":"2024 IEEE 21th International Symposium on Biomedical Imaging (ISBI)"},{"id":"http://arxiv.org/abs/2403.14349v1","updated":"2024-03-21T12:24:53Z","published":"2024-03-21T12:24:53Z","title":"On the Concept Trustworthiness in Concept Bottleneck Models","summary":" Concept Bottleneck Models (CBMs), which break down the reasoning process into\nthe input-to-concept mapping and the concept-to-label prediction, have garnered\nsignificant attention due to their remarkable interpretability achieved by the\ninterpretable concept bottleneck. However, despite the transparency of the\nconcept-to-label prediction, the mapping from the input to the intermediate\nconcept remains a black box, giving rise to concerns about the trustworthiness\nof the learned concepts (i.e., these concepts may be predicted based on\nspurious cues). The issue of concept untrustworthiness greatly hampers the\ninterpretability of CBMs, thereby hindering their further advancement. To\nconduct a comprehensive analysis on this issue, in this study we establish a\nbenchmark to assess the trustworthiness of concepts in CBMs. A pioneering\nmetric, referred to as concept trustworthiness score, is proposed to gauge\nwhether the concepts are derived from relevant regions. Additionally, an\nenhanced CBM is introduced, enabling concept predictions to be made\nspecifically from distinct parts of the feature map, thereby facilitating the\nexploration of their related regions. Besides, we introduce three modules,\nnamely the cross-layer alignment (CLA) module, the cross-image alignment (CIA)\nmodule, and the prediction alignment (PA) module, to further enhance the\nconcept trustworthiness within the elaborated CBM. The experiments on five\ndatasets across ten architectures demonstrate that without using any concept\nlocalization annotations during training, our model improves the concept\ntrustworthiness by a large margin, meanwhile achieving superior accuracy to the\nstate-of-the-arts. Our code is available at https://github.com/hqhQAQ/ProtoCBM.\n","authors":["Qihan Huang","Jie Song","Jingwen Hu","Haofei Zhang","Yong Wang","Mingli Song"],"pdf_url":"https://arxiv.org/pdf/2403.14349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14346v1","updated":"2024-03-21T12:23:29Z","published":"2024-03-21T12:23:29Z","title":"Towards Efficient Information Fusion: Concentric Dual Fusion Attention\n Based Multiple Instance Learning for Whole Slide Images","summary":" In the realm of digital pathology, multi-magnification Multiple Instance\nLearning (multi-mag MIL) has proven effective in leveraging the hierarchical\nstructure of Whole Slide Images (WSIs) to reduce information loss and redundant\ndata. However, current methods fall short in bridging the domain gap between\npretrained models and medical imaging, and often fail to account for spatial\nrelationships across different magnifications. Addressing these challenges, we\nintroduce the Concentric Dual Fusion Attention-MIL (CDFA-MIL) framework,which\ninnovatively combines point-to-area feature-colum attention and point-to-point\nconcentric-row attention using concentric patch. 
This approach is designed to\neffectively fuse correlated information, enhancing feature representation and\nproviding stronger correlation guidance for WSI analysis. CDFA-MIL\ndistinguishes itself by offering a robust fusion strategy that leads to\nsuperior WSI recognition. Its application has demonstrated exceptional\nperformance, significantly surpassing existing MIL methods in accuracy and F1\nscores on prominent datasets like Camelyon16 and TCGA-NSCLC. Specifically,\nCDFA-MIL achieved an average accuracy and F1-score of 93.7\\% and 94.1\\%\nrespectively on these datasets, marking a notable advancement over traditional\nMIL approaches.\n","authors":["Yujian Liu","Ruoxuan Wu","Xinjie Shen","Zihuang Lu","Lingyu Liang","Haiyu Zhou","Shipu Xu","Shaoai Cai","Shidang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.14346v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.12670v2","updated":"2024-03-21T12:14:14Z","published":"2024-03-19T12:11:57Z","title":"Driving Animatronic Robot Facial Expression From Speech","summary":" Animatronic robots aim to enable natural human-robot interaction through\nlifelike facial expressions. However, generating realistic, speech-synchronized\nrobot expressions is challenging due to the complexities of facial biomechanics\nand responsive motion synthesis. This paper presents a principled,\nskinning-centric approach to drive animatronic robot facial expressions from\nspeech. The proposed approach employs linear blend skinning (LBS) as the core\nrepresentation to guide tightly integrated innovations in embodiment design and\nmotion synthesis. LBS informs the actuation topology, enables human expression\nretargeting, and allows speech-driven facial motion generation. The proposed\napproach is capable of generating highly realistic, real-time facial\nexpressions from speech on an animatronic face, significantly advancing robots'\nability to replicate nuanced human expressions for natural interaction.\n","authors":["Boren Li","Hang Li","Hangxin Liu"],"pdf_url":"https://arxiv.org/pdf/2403.12670v2.pdf","comment":"Under review. For associated project page, see\n https://library87.github.io/animatronic-face-iros24"},{"id":"http://arxiv.org/abs/2403.14339v1","updated":"2024-03-21T12:11:26Z","published":"2024-03-21T12:11:26Z","title":"$\\nabla τ$: Gradient-based and Task-Agnostic machine Unlearning","summary":" Machine Unlearning, the process of selectively eliminating the influence of\ncertain data examples used during a model's training, has gained significant\nattention as a means for practitioners to comply with recent data protection\nregulations. However, existing unlearning methods face critical drawbacks,\nincluding their prohibitively high cost, often associated with a large number\nof hyperparameters, and the limitation of forgetting only relatively small data\nportions. This often makes retraining the model from scratch a quicker and more\neffective solution. In this study, we introduce Gradient-based and\nTask-Agnostic machine Unlearning ($\\nabla \\tau$), an optimization framework\ndesigned to remove the influence of a subset of training data efficiently. It\napplies adaptive gradient ascent to the data to be forgotten while using\nstandard gradient descent for the remaining data. $\\nabla \\tau$ offers multiple\nbenefits over existing approaches. It enables the unlearning of large sections\nof the training dataset (up to 30%). 
It is versatile, supporting various\nunlearning tasks (such as subset forgetting or class removal) and applicable\nacross different domains (images, text, etc.). Importantly, $\\nabla \\tau$\nrequires no hyperparameter adjustments, making it a more appealing option than\nretraining the model from scratch. We evaluate our framework's effectiveness\nusing a set of well-established Membership Inference Attack metrics,\ndemonstrating up to 10% enhancements in performance compared to\nstate-of-the-art methods without compromising the original model's accuracy.\n","authors":["Daniel Trippa","Cesare Campagnano","Maria Sofia Bucarelli","Gabriele Tolomei","Fabrizio Silvestri"],"pdf_url":"https://arxiv.org/pdf/2403.14339v1.pdf","comment":"14 pages, 2 figures"},{"id":"http://arxiv.org/abs/2303.09780v3","updated":"2024-03-21T12:05:47Z","published":"2023-03-17T05:27:16Z","title":"Mpox-AISM: AI-Mediated Super Monitoring for Mpox and Like-Mpox","summary":" The key to preventing the spread of mpox (monkeypox) lies in timely,\nconvenient, and accurate diagnosis for earlier-stage infected individuals.\nUnfortunately, the resemblances between common skin diseases and mpox and the\nneed for professional diagnosis inevitably deteriorated the diagnosis of\nearlier-stage patients with Mpox and contributed to its widespread outbreak in\ncrowded areas. Here, we proposed a real-time visualization strategy called\n\"Super Monitoring\" using artificial intelligence and Internet technology,\nthereby performing a low-cost, convenient, timely, and unspecialized diagnosis\nfor earlier-stage mpox. Specifically, such AI-mediated \"super monitoring\"\n(Mpox-AISM) invokes a framework assembled by deep learning models, data\naugmentation, self-supervised learning, and cloud services. Verified by\npublicly available datasets, the Precision, Recall, Specificity, and F1-score\nof Mpox-AISM in diagnosing mpox achieved 99.3%, 94.1%, 99.9%, and 96.6%,\nrespectively. Furthermore, Mpox-AISM's overall accuracy reaches 94.51% in\ndiagnosing mpox, six like-mpox skin diseases, and normal skin. We also employed\ngradient-weighted class activation mapping to explain the decision-making\nprocess of Mpox-AISM, thus handily understanding the specific characteristics\nthat may indicate the mpox's onset and improving its reliability. With the help\nof the Internet and communication terminal, Mpox-AISM can perform a real-time,\nlow-cost, and convenient diagnosis for earlier-stage mpox in various real-world\nsettings, thereby effectively curbing the spread of mpox virus.\n","authors":["Yubiao Yue","Minghua Jiang","Xinyue Zhang","Jialong Xu","Huacong Ye","Fan Zhang","Zhenzhang Li","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2303.09780v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14335v1","updated":"2024-03-21T12:01:54Z","published":"2024-03-21T12:01:54Z","title":"FFT-based Selection and Optimization of Statistics for Robust\n Recognition of Severely Corrupted Images","summary":" Improving model robustness in case of corrupted images is among the key\nchallenges to enable robust vision systems on smart devices, such as robotic\nagents. Particularly, robust test-time performance is imperative for most of\nthe applications. This paper presents a novel approach to improve robustness of\nany classification model, especially on severely corrupted images. Our method\n(FROST) employs high-frequency features to detect input image corruption type,\nand select layer-wise feature normalization statistics. 
FROST provides the\nstate-of-the-art results for different models and datasets, outperforming\ncompetitors on ImageNet-C by up to 37.1% relative gain, improving baseline of\n40.9% mCE on severe corruptions.\n","authors":["Elena Camuffo","Umberto Michieli","Jijoong Moon","Daehyun Kim","Mete Ozay"],"pdf_url":"https://arxiv.org/pdf/2403.14335v1.pdf","comment":"ICASSP 2024. Copyright 2024 IEEE. Personal use of this material is\n permitted. Permission from IEEE must be obtained for all other uses, in any\n current or future media, including reprinting/republishing this material for\n advertising or promotional purposes, creating new collective works, for\n resale or redistribution to servers or lists, or reuse of any copyrighted\n component of this work in other"},{"id":"http://arxiv.org/abs/2403.14333v1","updated":"2024-03-21T11:58:50Z","published":"2024-03-21T11:58:50Z","title":"CFPL-FAS: Class Free Prompt Learning for Generalizable Face\n Anti-spoofing","summary":" Domain generalization (DG) based Face Anti-Spoofing (FAS) aims to improve the\nmodel's performance on unseen domains. Existing methods either rely on domain\nlabels to align domain-invariant feature spaces, or disentangle generalizable\nfeatures from the whole sample, which inevitably lead to the distortion of\nsemantic feature structures and achieve limited generalization. In this work,\nwe make use of large-scale VLMs like CLIP and leverage the textual feature to\ndynamically adjust the classifier's weights for exploring generalizable visual\nfeatures. Specifically, we propose a novel Class Free Prompt Learning (CFPL)\nparadigm for DG FAS, which utilizes two lightweight transformers, namely\nContent Q-Former (CQF) and Style Q-Former (SQF), to learn the different\nsemantic prompts conditioned on content and style features by using a set of\nlearnable query vectors, respectively. Thus, the generalizable prompt can be\nlearned by two improvements: (1) A Prompt-Text Matched (PTM) supervision is\nintroduced to ensure CQF learns visual representation that is most informative\nof the content description. (2) A Diversified Style Prompt (DSP) technology is\nproposed to diversify the learning of style prompts by mixing feature\nstatistics between instance-specific styles. Finally, the learned text features\nmodulate visual features to generalization through the designed Prompt\nModulation (PM). Extensive experiments show that the CFPL is effective and\noutperforms the state-of-the-art methods on several cross-domain datasets.\n","authors":["Ajian Liu","Shuai Xue","Jianwen Gan","Jun Wan","Yanyan Liang","Jiankang Deng","Sergio Escalera","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2403.14333v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.14324v1","updated":"2024-03-21T11:44:25Z","published":"2024-03-21T11:44:25Z","title":"Neural Network-Based Processing and Reconstruction of Compromised\n Biophotonic Image Data","summary":" The integration of deep learning techniques with biophotonic setups has\nopened new horizons in bioimaging. A compelling trend in this field involves\ndeliberately compromising certain measurement metrics to engineer better\nbioimaging tools in terms of cost, speed, and form-factor, followed by\ncompensating for the resulting defects through the utilization of deep learning\nmodels trained on a large amount of ideal, superior or alternative data. This\nstrategic approach has found increasing popularity due to its potential to\nenhance various aspects of biophotonic imaging. 
One of the primary motivations\nfor employing this strategy is the pursuit of higher temporal resolution or\nincreased imaging speed, critical for capturing fine dynamic biological\nprocesses. This approach also offers the prospect of simplifying hardware\nrequirements/complexities, thereby making advanced imaging standards more\naccessible in terms of cost and/or size. This article provides an in-depth\nreview of the diverse measurement aspects that researchers intentionally impair\nin their biophotonic setups, including the point spread function,\nsignal-to-noise ratio, sampling density, and pixel resolution. By deliberately\ncompromising these metrics, researchers aim to not only recuperate them through\nthe application of deep learning networks, but also bolster in return other\ncrucial parameters, such as the field-of-view, depth-of-field, and\nspace-bandwidth product. Here, we discuss various biophotonic methods that have\nsuccessfully employed this strategic approach. These techniques span broad\napplications and showcase the versatility and effectiveness of deep learning in\nthe context of compromised biophotonic data. Finally, by offering our\nperspectives on the future possibilities of this rapidly evolving concept, we\nhope to motivate our readers to explore novel ways of balancing hardware\ncompromises with compensation via AI.\n","authors":["Michael John Fanous","Paloma Casteleiro Costa","Cagatay Isil","Luzhe Huang","Aydogan Ozcan"],"pdf_url":"https://arxiv.org/pdf/2403.14324v1.pdf","comment":"17 Pages, 4 Figures, 1 Table"},{"id":"http://arxiv.org/abs/2403.14320v1","updated":"2024-03-21T11:41:39Z","published":"2024-03-21T11:41:39Z","title":"Exosense: A Vision-Centric Scene Understanding System For Safe\n Exoskeleton Navigation","summary":" Exoskeletons for daily use by those with mobility impairments are being\ndeveloped. They will require accurate and robust scene understanding systems.\nCurrent research has used vision to identify immediate terrain and geometric\nobstacles, however these approaches are constrained to detections directly in\nfront of the user and are limited to classifying a finite range of terrain\ntypes (e.g., stairs, ramps and level-ground). This paper presents Exosense, a\nvision-centric scene understanding system which is capable of generating rich,\nglobally-consistent elevation maps, incorporating both semantic and terrain\ntraversability information. It features an elastic Atlas mapping framework\nassociated with a visual SLAM pose graph, embedded with open-vocabulary room\nlabels from a Vision-Language Model (VLM). The device's design includes a wide\nfield-of-view (FoV) fisheye multi-camera system to mitigate the challenges\nintroduced by the exoskeleton walking pattern. We demonstrate the system's\nrobustness to the challenges of typical periodic walking gaits, and its ability\nto construct accurate semantically-rich maps in indoor settings. 
Additionally,\nwe showcase its potential for motion planning -- providing a step towards safe\nnavigation for exoskeletons.\n","authors":["Jianeng Wang","Matias Mattamala","Christina Kassab","Lintong Zhang","Maurice Fallon"],"pdf_url":"https://arxiv.org/pdf/2403.14320v1.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.14318v1","updated":"2024-03-21T11:40:51Z","published":"2024-03-21T11:40:51Z","title":"A Lightweight Attention-based Deep Network via Multi-Scale Feature\n Fusion for Multi-View Facial Expression Recognition","summary":" Convolutional neural networks (CNNs) and their variations have shown\neffectiveness in facial expression recognition (FER). However, they face\nchallenges when dealing with high computational complexity and multi-view head\nposes in real-world scenarios. We introduce a lightweight attentional network\nincorporating multi-scale feature fusion (LANMSFF) to tackle these issues. For\nthe first challenge, we have carefully designed a lightweight fully\nconvolutional network (FCN). We address the second challenge by presenting two\nnovel components, namely mass attention (MassAtt) and point wise feature\nselection (PWFS) blocks. The MassAtt block simultaneously generates channel and\nspatial attention maps to recalibrate feature maps by emphasizing important\nfeatures while suppressing irrelevant ones. On the other hand, the PWFS block\nemploys a feature selection mechanism that discards less meaningful features\nprior to the fusion process. This mechanism distinguishes it from previous\nmethods that directly fuse multi-scale features. Our proposed approach achieved\nresults comparable to state-of-the-art methods in terms of parameter counts and\nrobustness to pose variation, with accuracy rates of 90.77% on KDEF, 70.44% on\nFER-2013, and 86.96% on FERPlus datasets. The code for LANMSFF is available at\nhttps://github.com/AE-1129/LANMSFF.\n","authors":["Ali Ezati","Mohammadreza Dezyani","Rajib Rana","Roozbeh Rajabi","Ahmad Ayatollahi"],"pdf_url":"https://arxiv.org/pdf/2403.14318v1.pdf","comment":"9 pages, two-column, submitted to journal"},{"id":"http://arxiv.org/abs/2403.14302v1","updated":"2024-03-21T11:16:42Z","published":"2024-03-21T11:16:42Z","title":"SpikingResformer: Bridging ResNet and Vision Transformer in Spiking\n Neural Networks","summary":" The remarkable success of Vision Transformers in Artificial Neural Networks\n(ANNs) has led to a growing interest in incorporating the self-attention\nmechanism and transformer-based architecture into Spiking Neural Networks\n(SNNs). While existing methods propose spiking self-attention mechanisms that\nare compatible with SNNs, they lack reasonable scaling methods, and the overall\narchitectures proposed by these methods suffer from a bottleneck in effectively\nextracting local features. To address these challenges, we propose a novel\nspiking self-attention mechanism named Dual Spike Self-Attention (DSSA) with a\nreasonable scaling method. Based on DSSA, we propose a novel spiking Vision\nTransformer architecture called SpikingResformer, which combines the\nResNet-based multi-stage architecture with our proposed DSSA to improve both\nperformance and energy efficiency while reducing parameters. Experimental\nresults show that SpikingResformer achieves higher accuracy with fewer\nparameters and lower energy consumption than other spiking Vision Transformer\ncounterparts. 
Notably, our SpikingResformer-L achieves 79.40% top-1 accuracy on\nImageNet with 4 time-steps, which is the state-of-the-art result in the SNN\nfield.\n","authors":["Xinyu Shi","Zecheng Hao","Zhaofei Yu"],"pdf_url":"https://arxiv.org/pdf/2403.14302v1.pdf","comment":"To be published in the 2024 IEEE/CVF Conference on Computer Vision\n and Pattern Recognition (CVPR)"},{"id":"http://arxiv.org/abs/2403.14297v1","updated":"2024-03-21T11:03:56Z","published":"2024-03-21T11:03:56Z","title":"Impact Assessment of Missing Data in Model Predictions for Earth\n Observation Applications","summary":" Earth observation (EO) applications involving complex and heterogeneous data\nsources are commonly approached with machine learning models. However, there is\na common assumption that data sources will be persistently available. Different\nsituations could affect the availability of EO sources, like noise, clouds, or\nsatellite mission failures. In this work, we assess the impact of missing\ntemporal and static EO sources in trained models across four datasets with\nclassification and regression tasks. We compare the predictive quality of\ndifferent methods and find that some are naturally more robust to missing data.\nThe Ensemble strategy, in particular, achieves a prediction robustness up to\n100%. We evidence that missing scenarios are significantly more challenging in\nregression than classification tasks. Finally, we find that the optical view is\nthe most critical view when it is missing individually.\n","authors":["Francisco Mena","Diego Arenas","Marcela Charfuelan","Marlon Nuske","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2403.14297v1.pdf","comment":"Accepted at IEEE International Geoscience and Remote Sensing\n Symposium 2024"},{"id":"http://arxiv.org/abs/2403.14292v1","updated":"2024-03-21T10:59:44Z","published":"2024-03-21T10:59:44Z","title":"HySim: An Efficient Hybrid Similarity Measure for Patch Matching in\n Image Inpainting","summary":" Inpainting, for filling missing image regions, is a crucial task in various\napplications, such as medical imaging and remote sensing. Trending data-driven\napproaches efficiency, for image inpainting, often requires extensive data\npreprocessing. In this sense, there is still a need for model-driven approaches\nin case of application constrained with data availability and quality,\nespecially for those related for time series forecasting using image inpainting\ntechniques. This paper proposes an improved modeldriven approach relying on\npatch-based techniques. Our approach deviates from the standard Sum of Squared\nDifferences (SSD) similarity measure by introducing a Hybrid Similarity\n(HySim), which combines both strengths of Chebychev and Minkowski distances.\nThis hybridization enhances patch selection, leading to high-quality inpainting\nresults with reduced mismatch errors. Experimental results proved the\neffectiveness of our approach against other model-driven techniques, such as\ndiffusion or patch-based approaches, showcasing its effectiveness in achieving\nvisually pleasing restorations.\n","authors":["Saad Noufel","Nadir Maaroufi","Mehdi Najib","Mohamed Bakhouya"],"pdf_url":"https://arxiv.org/pdf/2403.14292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14291v1","updated":"2024-03-21T10:56:12Z","published":"2024-03-21T10:56:12Z","title":"Open-Vocabulary Attention Maps with Token Optimization for Semantic\n Segmentation in Diffusion Models","summary":" Diffusion models represent a new paradigm in text-to-image generation. 
Beyond\ngenerating high-quality images from text prompts, models such as Stable\nDiffusion have been successfully extended to the joint generation of semantic\nsegmentation pseudo-masks. However, current extensions primarily rely on\nextracting attentions linked to prompt words used for image synthesis. This\napproach limits the generation of segmentation masks derived from word tokens\nnot contained in the text prompt. In this work, we introduce Open-Vocabulary\nAttention Maps (OVAM)-a training-free method for text-to-image diffusion models\nthat enables the generation of attention maps for any word. In addition, we\npropose a lightweight optimization process based on OVAM for finding tokens\nthat generate accurate attention maps for an object class with a single\nannotation. We evaluate these tokens within existing state-of-the-art Stable\nDiffusion extensions. The best-performing model improves its mIoU from 52.1 to\n86.6 for the synthetic images' pseudo-masks, demonstrating that our optimized\ntokens are an efficient way to improve the performance of existing methods\nwithout architectural changes or retraining.\n","authors":["Pablo Marcos-Manchón","Roberto Alcover-Couso","Juan C. SanMiguel","Jose M. Martínez"],"pdf_url":"https://arxiv.org/pdf/2403.14291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14290v1","updated":"2024-03-21T10:54:21Z","published":"2024-03-21T10:54:21Z","title":"Exploring Green AI for Audio Deepfake Detection","summary":" The state-of-the-art audio deepfake detectors leveraging deep neural networks\nexhibit impressive recognition performance. Nonetheless, this advantage is\naccompanied by a significant carbon footprint. This is mainly due to the use of\nhigh-performance computing with accelerators and high training time. Studies\nshow that average deep NLP model produces around 626k lbs of\nCO\\textsubscript{2} which is equivalent to five times of average US car\nemission at its lifetime. This is certainly a massive threat to the\nenvironment. To tackle this challenge, this study presents a novel framework\nfor audio deepfake detection that can be seamlessly trained using standard CPU\nresources. Our proposed framework utilizes off-the-shelve self-supervised\nlearning (SSL) based models which are pre-trained and available in public\nrepositories. In contrast to existing methods that fine-tune SSL models and\nemploy additional deep neural networks for downstream tasks, we exploit\nclassical machine learning algorithms such as logistic regression and shallow\nneural networks using the SSL embeddings extracted using the pre-trained model.\nOur approach shows competitive results compared to the commonly used\nhigh-carbon footprint approaches. In experiments with the ASVspoof 2019 LA\ndataset, we achieve a 0.90\\% equal error rate (EER) with less than 1k trainable\nmodel parameters. To encourage further research in this direction and support\nreproducible results, the Python code will be made publicly accessible\nfollowing acceptance. 
Github: https://github.com/sahasubhajit/Speech-Spoofing-\n","authors":["Subhajit Saha","Md Sahidullah","Swagatam Das"],"pdf_url":"https://arxiv.org/pdf/2403.14290v1.pdf","comment":"This manuscript is under review in a conference"},{"id":"http://arxiv.org/abs/2403.14287v1","updated":"2024-03-21T10:51:19Z","published":"2024-03-21T10:51:19Z","title":"Enhancing Historical Image Retrieval with Compositional Cues","summary":" In analyzing vast amounts of digitally stored historical image data, existing\ncontent-based retrieval methods often overlook significant non-semantic\ninformation, limiting their effectiveness for flexible exploration across\nvaried themes. To broaden the applicability of image retrieval methods for\ndiverse purposes and uncover more general patterns, we innovatively introduce a\ncrucial factor from computational aesthetics, namely image composition, into\nthis topic. By explicitly integrating composition-related information extracted\nby CNN into the designed retrieval model, our method considers both the image's\ncomposition rules and semantic information. Qualitative and quantitative\nexperiments demonstrate that the image retrieval network guided by composition\ninformation outperforms those relying solely on content information,\nfacilitating the identification of images in databases closer to the target\nimage in human perception. Please visit https://github.com/linty5/CCBIR to try\nour codes.\n","authors":["Tingyu Lin","Robert Sablatnig"],"pdf_url":"https://arxiv.org/pdf/2403.14287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14286v1","updated":"2024-03-21T10:49:54Z","published":"2024-03-21T10:49:54Z","title":"Assessing the Robustness of Spectral Clustering for Deep Speaker\n Diarization","summary":" Clustering speaker embeddings is crucial in speaker diarization but hasn't\nreceived as much focus as other components. Moreover, the robustness of speaker\ndiarization across various datasets hasn't been explored when the development\nand evaluation data are from different domains. To bridge this gap, this study\nthoroughly examines spectral clustering for both same-domain and cross-domain\nspeaker diarization. Our extensive experiments on two widely used corpora, AMI\nand DIHARD, reveal the performance trend of speaker diarization in the presence\nof domain mismatch. We observe that the performance difference between two\ndifferent domain conditions can be attributed to the role of spectral\nclustering. In particular, keeping other modules unchanged, we show that\ndifferences in optimal tuning parameters as well as speaker count estimation\noriginates due to the mismatch. This study opens several future directions for\nspeaker diarization research.\n","authors":["Nikhil Raghav","Md Sahidullah"],"pdf_url":"https://arxiv.org/pdf/2403.14286v1.pdf","comment":"Manuscript Under Review"},{"id":"http://arxiv.org/abs/2403.14279v1","updated":"2024-03-21T10:38:18Z","published":"2024-03-21T10:38:18Z","title":"Zero123-6D: Zero-shot Novel View Synthesis for RGB Category-level 6D\n Pose Estimation","summary":" Estimating the pose of objects through vision is essential to make robotic\nplatforms interact with the environment. Yet, it presents many challenges,\noften related to the lack of flexibility and generalizability of\nstate-of-the-art solutions. Diffusion models are a cutting-edge neural\narchitecture transforming 2D and 3D computer vision, outlining remarkable\nperformances in zero-shot novel-view synthesis. 
Such a use case is particularly\nintriguing for reconstructing 3D objects. However, localizing objects in\nunstructured environments is rather unexplored. To this end, this work presents\nZero123-6D to demonstrate the utility of Diffusion Model-based\nnovel-view-synthesizers in enhancing RGB 6D pose estimation at category-level\nby integrating them with feature extraction techniques. The outlined method\nexploits such a novel view synthesizer to expand a sparse set of RGB-only\nreference views for the zero-shot 6D pose estimation task. Experiments are\nquantitatively analyzed on the CO3D dataset, showcasing increased performance\nover baselines, a substantial reduction in data requirements, and the removal\nof the necessity of depth information.\n","authors":["Francesco Di Felice","Alberto Remus","Stefano Gasperini","Benjamin Busam","Lionel Ott","Federico Tombari","Roland Siegwart","Carlo Alberto Avizzano"],"pdf_url":"https://arxiv.org/pdf/2403.14279v1.pdf","comment":"6 pages, 2 reference pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.14270v1","updated":"2024-03-21T10:15:57Z","published":"2024-03-21T10:15:57Z","title":"Scene-Graph ViT: End-to-End Open-Vocabulary Visual Relationship\n Detection","summary":" Visual relationship detection aims to identify objects and their\nrelationships in images. Prior methods approach this task by adding separate\nrelationship modules or decoders to existing object detection architectures.\nThis separation increases complexity and hinders end-to-end training, which\nlimits performance. We propose a simple and highly efficient decoder-free\narchitecture for open-vocabulary visual relationship detection. Our model\nconsists of a Transformer-based image encoder that represents objects as tokens\nand models their relationships implicitly. To extract relationship information,\nwe introduce an attention mechanism that selects object pairs likely to form a\nrelationship. We provide a single-stage recipe to train this model on a mixture\nof object and relationship detection data. Our approach achieves\nstate-of-the-art relationship detection performance on Visual Genome and on the\nlarge-vocabulary GQA benchmark at real-time inference speeds. We provide\nanalyses of zero-shot performance, ablations, and real-world qualitative\nexamples.\n","authors":["Tim Salzmann","Markus Ryll","Alex Bewley","Matthias Minderer"],"pdf_url":"https://arxiv.org/pdf/2403.14270v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14264v1","updated":"2024-03-21T09:59:53Z","published":"2024-03-21T09:59:53Z","title":"A Framework for Portrait Stylization with Skin-Tone Awareness and Nudity\n Identification","summary":" Portrait stylization is a challenging task involving the transformation of an\ninput portrait image into a specific style while preserving its inherent\ncharacteristics. The recent introduction of Stable Diffusion (SD) has\nsignificantly improved the quality of outcomes in this field. However, a\npractical stylization framework that can effectively filter harmful input\ncontent and preserve the distinct characteristics of an input, such as\nskin-tone, while maintaining the quality of stylization remains lacking. These\nchallenges have hindered the wide deployment of such a framework. To address\nthese issues, this study proposes a portrait stylization framework that\nincorporates a nudity content identification module (NCIM) and a\nskin-tone-aware portrait stylization module (STAPSM). 
In experiments, NCIM\nshowed good performance in enhancing explicit content filtering, and STAPSM\naccurately represented a diverse range of skin tones. Our proposed framework\nhas been successfully deployed in practice, and it has effectively satisfied\ncritical requirements of real-world applications.\n","authors":["Seungkwon Kim","Sangyeon Kim","Seung-Hun Nam"],"pdf_url":"https://arxiv.org/pdf/2403.14264v1.pdf","comment":"Accepted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2403.14262v1","updated":"2024-03-21T09:50:39Z","published":"2024-03-21T09:50:39Z","title":"Diffusion Models with Ensembled Structure-Based Anomaly Scoring for\n Unsupervised Anomaly Detection","summary":" Supervised deep learning techniques show promise in medical image analysis.\nHowever, they require comprehensive annotated data sets, which poses\nchallenges, particularly for rare diseases. Consequently, unsupervised anomaly\ndetection (UAD) emerges as a viable alternative for pathology segmentation, as\nonly healthy data is required for training. However, recent UAD anomaly scoring\nfunctions often focus on intensity only and neglect structural differences,\nwhich impedes the segmentation performance. This work investigates the\npotential of Structural Similarity (SSIM) to bridge this gap. SSIM captures\nboth intensity and structural disparities and can be advantageous over the\nclassical $l1$ error. However, we show that there is more than one optimal\nkernel size for the SSIM calculation for different pathologies. Therefore, we\ninvestigate an adaptive ensembling strategy for various kernel sizes to offer a\nmore pathology-agnostic scoring mechanism. We demonstrate that this ensembling\nstrategy can enhance the performance of DMs and mitigate the sensitivity to\ndifferent kernel sizes across varying pathologies, highlighting its promise for\nbrain MRI anomaly detection.\n","authors":["Finn Behrendt","Debayan Bhattacharya","Lennart Maack","Julia Krüger","Roland Opfer","Robin Mieling","Alexander Schlaefer"],"pdf_url":"https://arxiv.org/pdf/2403.14262v1.pdf","comment":"Accepted at IEEE ISBI 2024"},{"id":"http://arxiv.org/abs/2311.11241v2","updated":"2024-03-21T09:49:50Z","published":"2023-11-19T06:00:39Z","title":"Open-Vocabulary Camouflaged Object Segmentation","summary":" Recently, the emergence of the large-scale vision-language model (VLM), such\nas CLIP, has opened the way towards open-world object perception. Many works\nhave explored the utilization of pre-trained VLM for the challenging\nopen-vocabulary dense prediction task that requires perceiving diverse objects\nwith novel classes at inference time. Existing methods construct experiments\nbased on the public datasets of related tasks, which are not tailored for open\nvocabulary and rarely involve imperceptible objects camouflaged in complex\nscenes due to data collection bias and annotation costs. To fill in the gaps,\nwe introduce a new task, open-vocabulary camouflaged object segmentation\n(OVCOS), and construct a large-scale complex scene dataset (\\textbf{OVCamo})\ncontaining 11,483 hand-selected images with fine annotations and corresponding\nobject classes. Further, we build a strong single-stage open-vocabulary\n\\underline{c}amouflaged \\underline{o}bject \\underline{s}egmentation\ntransform\\underline{er} baseline \\textbf{OVCoser} attached to the\nparameter-fixed CLIP with iterative semantic guidance and structure\nenhancement. 
By integrating the guidance of class semantic knowledge and the\nsupplement of visual structure cues from the edge and depth information, the\nproposed method can efficiently capture camouflaged objects. Moreover, this\neffective framework also surpasses previous state-of-the-arts of\nopen-vocabulary semantic image segmentation by a large margin on our OVCamo\ndataset. With the proposed dataset and baseline, we hope that this new task\nwith more practical value can further expand the research on open-vocabulary\ndense prediction tasks. The code and data will be available in the future.\n","authors":["Youwei Pang","Xiaoqi Zhao","Jiaming Zuo","Lihe Zhang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2311.11241v2.pdf","comment":"Update the style and add details"},{"id":"http://arxiv.org/abs/2308.09951v2","updated":"2024-03-21T09:28:34Z","published":"2023-08-19T09:12:13Z","title":"Semantics Meets Temporal Correspondence: Self-supervised Object-centric\n Learning in Videos","summary":" Self-supervised methods have shown remarkable progress in learning high-level\nsemantics and low-level temporal correspondence. Building on these results, we\ntake one step further and explore the possibility of integrating these two\nfeatures to enhance object-centric representations. Our preliminary experiments\nindicate that query slot attention can extract different semantic components\nfrom the RGB feature map, while random sampling based slot attention can\nexploit temporal correspondence cues between frames to assist instance\nidentification. Motivated by this, we propose a novel semantic-aware masked\nslot attention on top of the fused semantic features and correspondence maps.\nIt comprises two slot attention stages with a set of shared learnable Gaussian\ndistributions. In the first stage, we use the mean vectors as slot\ninitialization to decompose potential semantics and generate semantic\nsegmentation masks through iterative attention. In the second stage, for each\nsemantics, we randomly sample slots from the corresponding Gaussian\ndistribution and perform masked feature aggregation within the semantic area to\nexploit temporal correspondence patterns for instance identification. We adopt\nsemantic- and instance-level temporal consistency as self-supervision to\nencourage temporally coherent object-centric representations. Our model\neffectively identifies multiple object instances with semantic structure,\nreaching promising results on unsupervised video object discovery. Furthermore,\nwe achieve state-of-the-art performance on dense label propagation tasks,\ndemonstrating the potential for object-centric analysis. The code is released\nat https://github.com/shvdiwnkozbw/SMTC.\n","authors":["Rui Qian","Shuangrui Ding","Xian Liu","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2308.09951v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2403.14252v1","updated":"2024-03-21T09:25:24Z","published":"2024-03-21T09:25:24Z","title":"LayoutLLM: Large Language Model Instruction Tuning for Visually Rich\n Document Understanding","summary":" This paper proposes LayoutLLM, a more flexible document analysis method for\nunderstanding imaged documents. Visually Rich Document Understanding tasks,\nsuch as document image classification and information extraction, have gained\nsignificant attention due to their importance. Existing methods have been\ndeveloped to enhance document comprehension by incorporating pre-training\nawareness of images, text, and layout structure. 
However, these methods require\nfine-tuning for each task and dataset, and the models are expensive to train\nand operate. To overcome this limitation, we propose a new LayoutLLM that\nintegrates these with large-scale language models (LLMs). By leveraging the\nstrengths of existing research in document image understanding and LLMs'\nsuperior language understanding capabilities, the proposed model, fine-tuned\nwith multimodal instruction datasets, performs an understanding of document\nimages in a single model. Our experiments demonstrate improvement over the\nbaseline model in various document analysis tasks.\n","authors":["Masato Fujitake"],"pdf_url":"https://arxiv.org/pdf/2403.14252v1.pdf","comment":"LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2403.14250v1","updated":"2024-03-21T09:22:23Z","published":"2024-03-21T09:22:23Z","title":"Safeguarding Medical Image Segmentation Datasets against Unauthorized\n Training via Contour- and Texture-Aware Perturbations","summary":" The widespread availability of publicly accessible medical images has\nsignificantly propelled advancements in various research and clinical fields.\nNonetheless, concerns regarding unauthorized training of AI systems for\ncommercial purposes and the duties of patient privacy protection have led\nnumerous institutions to hesitate to share their images. This is particularly\ntrue for medical image segmentation (MIS) datasets, where the processes of\ncollection and fine-grained annotation are time-intensive and laborious.\nRecently, Unlearnable Examples (UEs) methods have shown the potential to\nprotect images by adding invisible shortcuts. These shortcuts can prevent\nunauthorized deep neural networks from generalizing. However, existing UEs are\ndesigned for natural image classification and fail to protect MIS datasets\nimperceptibly as their protective perturbations are less learnable than\nimportant prior knowledge in MIS, e.g., contour and texture features. To this\nend, we propose an Unlearnable Medical image generation method, termed UMed.\nUMed integrates the prior knowledge of MIS by injecting contour- and\ntexture-aware perturbations to protect images. Given that our target is to only\npoison features critical to MIS, UMed requires only minimal perturbations\nwithin the ROI and its contour to achieve greater imperceptibility (average\nPSNR is 50.03) and protective performance (clean average DSC degrades from\n82.18% to 6.80%).\n","authors":["Xun Lin","Yi Yu","Song Xia","Jue Jiang","Haoran Wang","Zitong Yu","Yizhong Liu","Ying Fu","Shuai Wang","Wenzhong Tang","Alex Kot"],"pdf_url":"https://arxiv.org/pdf/2403.14250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17351v2","updated":"2024-03-21T09:20:54Z","published":"2024-02-27T09:41:59Z","title":"ICP-Flow: LiDAR Scene Flow Estimation with ICP","summary":" Scene flow characterizes the 3D motion between two LiDAR scans captured by an\nautonomous vehicle at nearby timesteps. Prevalent methods consider scene flow\nas point-wise unconstrained flow vectors that can be learned by either\nlarge-scale training beforehand or time-consuming optimization at inference.\nHowever, these methods do not take into account that objects in autonomous\ndriving often move rigidly. We incorporate this rigid-motion assumption into\nour design, where the goal is to associate objects over scans and then estimate\nthe locally rigid transformations. We propose ICP-Flow, a learning-free flow\nestimator. 
The core of our design is the conventional Iterative Closest Point\n(ICP) algorithm, which aligns the objects over time and outputs the\ncorresponding rigid transformations. Crucially, to aid ICP, we propose a\nhistogram-based initialization that discovers the most likely translation, thus\nproviding a good starting point for ICP. The complete scene flow is then\nrecovered from the rigid transformations. We outperform state-of-the-art\nbaselines, including supervised models, on the Waymo dataset and perform\ncompetitively on Argoverse-v2 and nuScenes. Further, we train a feedforward\nneural network, supervised by the pseudo labels from our model, and achieve top\nperformance among all models capable of real-time inference. We validate the\nadvantage of our model on scene flow estimation with longer temporal gaps, up\nto 0.4 seconds where other models fail to deliver meaningful results.\n","authors":["Yancong Lin","Holger Caesar"],"pdf_url":"https://arxiv.org/pdf/2402.17351v2.pdf","comment":"CVPR 2024, camera-ready. Code: https://github.com/yanconglin/ICP-Flow"},{"id":"http://arxiv.org/abs/2312.10103v3","updated":"2024-03-21T09:20:49Z","published":"2023-12-15T02:54:31Z","title":"GSVA: Generalized Segmentation via Multimodal Large Language Models","summary":" Generalized Referring Expression Segmentation (GRES) extends the scope of\nclassic RES to refer to multiple objects in one expression or identify the\nempty targets absent in the image. GRES poses challenges in modeling the\ncomplex spatial relationships of the instances in the image and identifying\nnon-existing referents. Multimodal Large Language Models (MLLMs) have recently\nshown tremendous progress in these complicated vision-language tasks.\nConnecting Large Language Models (LLMs) and vision models, MLLMs are proficient\nin understanding contexts with visual inputs. Among them, LISA, as a\nrepresentative, adopts a special [SEG] token to prompt a segmentation mask\ndecoder, e.g., SAM, to enable MLLMs in the RES task. However, existing\nsolutions to GRES remain unsatisfactory since current segmentation MLLMs cannot\ncorrectly handle the cases where users might reference multiple subjects in a\nsingular prompt or provide descriptions incongruent with any image target. In\nthis paper, we propose Generalized Segmentation Vision Assistant (GSVA) to\naddress this gap. Specifically, GSVA reuses the [SEG] token to prompt the\nsegmentation model towards supporting multiple mask references simultaneously\nand innovatively learns to generate a [REJ] token to reject the null targets\nexplicitly. Experiments validate GSVA's efficacy in resolving the GRES issue,\nmarking a notable enhancement and setting a new record on the GRES benchmark\ngRefCOCO dataset. GSVA also proves effective across various classic referring\nsegmentation and comprehension tasks.\n","authors":["Zhuofan Xia","Dongchen Han","Yizeng Han","Xuran Pan","Shiji Song","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2312.10103v3.pdf","comment":"Accepted by CVPR2024 (19 pages, 9 figures, 11 tables)"},{"id":"http://arxiv.org/abs/2309.17189v4","updated":"2024-03-21T09:19:18Z","published":"2023-09-29T12:38:00Z","title":"RTFS-Net: Recurrent Time-Frequency Modelling for Efficient Audio-Visual\n Speech Separation","summary":" Audio-visual speech separation methods aim to integrate different modalities\nto generate high-quality separated speech, thereby enhancing the performance of\ndownstream tasks such as speech recognition. 
Most existing state-of-the-art\n(SOTA) models operate in the time domain. However, their overly simplistic\napproach to modeling acoustic features often necessitates larger and more\ncomputationally intensive models in order to achieve SOTA performance. In this\npaper, we present a novel time-frequency domain audio-visual speech separation\nmethod: Recurrent Time-Frequency Separation Network (RTFS-Net), which applies\nits algorithms on the complex time-frequency bins yielded by the Short-Time\nFourier Transform. We model and capture the time and frequency dimensions of\nthe audio independently using a multi-layered RNN along each dimension.\nFurthermore, we introduce a unique attention-based fusion technique for the\nefficient integration of audio and visual information, and a new mask\nseparation approach that takes advantage of the intrinsic spectral nature of\nthe acoustic features for a clearer separation. RTFS-Net outperforms the prior\nSOTA method in both inference speed and separation quality while reducing the\nnumber of parameters by 90% and MACs by 83%. This is the first time-frequency\ndomain audio-visual speech separation method to outperform all contemporary\ntime-domain counterparts.\n","authors":["Samuel Pegg","Kai Li","Xiaolin Hu"],"pdf_url":"https://arxiv.org/pdf/2309.17189v4.pdf","comment":"Accepted by The Twelfth International Conference on Learning\n Representations (ICLR) 2024, see https://openreview.net/forum?id=PEuDO2EiDr"},{"id":"http://arxiv.org/abs/2312.08007v2","updated":"2024-03-21T09:09:52Z","published":"2023-12-13T09:29:45Z","title":"Unveiling Parts Beyond Objects:Towards Finer-Granularity Referring\n Expression Segmentation","summary":" Referring expression segmentation (RES) aims at segmenting the foreground\nmasks of the entities that match the descriptive natural language expression.\nPrevious datasets and methods for classic RES task heavily rely on the prior\nassumption that one expression must refer to object-level targets. In this\npaper, we take a step further to finer-grained part-level RES task. To promote\nthe object-level RES task towards finer-grained vision-language understanding,\nwe put forward a new multi-granularity referring expression segmentation (MRES)\ntask and construct an evaluation benchmark called RefCOCOm by manual\nannotations. By employing our automatic model-assisted data engine, we build\nthe largest visual grounding dataset namely MRES-32M, which comprises over\n32.2M high-quality masks and captions on the provided 1M images. Besides, a\nsimple yet strong model named UniRES is designed to accomplish the unified\nobject-level and part-level grounding task. Extensive experiments on our\nRefCOCOm for MRES and three datasets (i.e., RefCOCO(+/g) for classic RES task\ndemonstrate the superiority of our method over previous state-of-the-art\nmethods. 
To foster future research into fine-grained visual grounding, our\nbenchmark RefCOCOm, the MRES-32M dataset and model UniRES will be publicly\navailable at https://github.com/Rubics-Xuan/MRES\n","authors":["Wenxuan Wang","Tongtian Yue","Yisi Zhang","Longteng Guo","Xingjian He","Xinlong Wang","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2312.08007v2.pdf","comment":"This work is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.14248v1","updated":"2024-03-21T09:07:28Z","published":"2024-03-21T09:07:28Z","title":"ResNet101 and DAE for Enhance Quality and Classification Accuracy in\n Skin Cancer Imaging","summary":" Skin cancer is a crucial health issue that requires timely detection for\nhigher survival rates. Traditional computer vision techniques face challenges\nin addressing the advanced variability of skin lesion features, a gap partially\nbridged by convolutional neural networks (CNNs). To overcome the existing\nissues, we introduce an innovative convolutional ensemble network approach\nnamed deep autoencoder (DAE) with ResNet101. This method utilizes\nconvolution-based deep neural networks for the detection of skin cancer. The\nISIC-2018 public data taken from the source is used for experimental results,\nwhich demonstrate remarkable performance with the different in terms of\nperformance metrics. The methods result in 96.03% of accuracy, 95.40 % of\nprecision, 96.05% of recall, 0.9576 of F-measure, 0.98 of AUC.\n","authors":["Sibasish Dhibar"],"pdf_url":"https://arxiv.org/pdf/2403.14248v1.pdf","comment":"6 Pages; 14 figures; 3 tables"},{"id":"http://arxiv.org/abs/2403.14244v1","updated":"2024-03-21T09:02:31Z","published":"2024-03-21T09:02:31Z","title":"Isotropic Gaussian Splatting for Real-Time Radiance Field Rendering","summary":" The 3D Gaussian splatting method has drawn a lot of attention, thanks to its\nhigh performance in training and high quality of the rendered image. However,\nit uses anisotropic Gaussian kernels to represent the scene. Although such\nanisotropic kernels have advantages in representing the geometry, they lead to\ndifficulties in terms of computation, such as splitting or merging two kernels.\nIn this paper, we propose to use isotropic Gaussian kernels to avoid such\ndifficulties in the computation, leading to a higher performance method. The\nexperiments confirm that the proposed method is about {\\bf 100X} faster without\nlosing the geometry representation accuracy. The proposed method can be applied\nin a large range applications where the radiance field is needed, such as 3D\nreconstruction, view synthesis, and dynamic object modeling.\n","authors":["Yuanhao Gong","Lantao Yu","Guanghui Yue"],"pdf_url":"https://arxiv.org/pdf/2403.14244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14243v1","updated":"2024-03-21T09:02:17Z","published":"2024-03-21T09:02:17Z","title":"Dermacen Analytica: A Novel Methodology Integrating Multi-Modal Large\n Language Models with Machine Learning in tele-dermatology","summary":" The rise of Artificial Intelligence creates great promise in the field of\nmedical discovery, diagnostics and patient management. However, the vast\ncomplexity of all medical domains require a more complex approach that combines\nmachine learning algorithms, classifiers, segmentation algorithms and, lately,\nlarge language models. 
In this paper, we describe, implement and assess an\nArtificial Intelligence-empowered system and methodology aimed at assisting the\ndiagnosis process of skin lesions and other skin conditions within the field of\ndermatology that aims to holistically address the diagnostic process in this\ndomain. The workflow integrates large language, transformer-based vision models\nand sophisticated machine learning tools. This holistic approach achieves a\nnuanced interpretation of dermatological conditions that simulates and\nfacilitates a dermatologist's workflow. We assess our proposed methodology\nthrough a thorough cross-model validation technique embedded in an evaluation\npipeline that utilizes publicly available medical case studies of skin\nconditions and relevant images. To quantitatively score the system performance,\nadvanced machine learning and natural language processing tools are employed\nwhich focus on similarity comparison and natural language inference.\nAdditionally, we incorporate a human expert evaluation process based on a\nstructured checklist to further validate our results. We implemented the\nproposed methodology in a system which achieved approximate (weighted) scores\nof 0.87 for both contextual understanding and diagnostic accuracy,\ndemonstrating the efficacy of our approach in enhancing dermatological\nanalysis. The proposed methodology is expected to prove useful in the\ndevelopment of next-generation tele-dermatology applications, enhancing remote\nconsultation capabilities and access to care, especially in underserved areas.\n","authors":["Dimitrios P. Panagoulias","Evridiki Tsoureli-Nikita","Maria Virvou","George A. Tsihrintzis"],"pdf_url":"https://arxiv.org/pdf/2403.14243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14240v1","updated":"2024-03-21T09:01:21Z","published":"2024-03-21T09:01:21Z","title":"Weak Supervision with Arbitrary Single Frame for Micro- and\n Macro-expression Spotting","summary":" Frame-level micro- and macro-expression spotting methods require\ntime-consuming frame-by-frame observation during annotation. Meanwhile,\nvideo-level spotting lacks sufficient information about the location and number\nof expressions during training, resulting in significantly inferior performance\ncompared with fully-supervised spotting. To bridge this gap, we propose a\npoint-level weakly-supervised expression spotting (PWES) framework, where each\nexpression requires to be annotated with only one random frame (i.e., a point).\nTo mitigate the issue of sparse label distribution, the prevailing solution is\npseudo-label mining, which, however, introduces new problems: localizing\ncontextual background snippets results in inaccurate boundaries and discarding\nforeground snippets leads to fragmentary predictions. Therefore, we design the\nstrategies of multi-refined pseudo label generation (MPLG) and\ndistribution-guided feature contrastive learning (DFCL) to address these\nproblems. Specifically, MPLG generates more reliable pseudo labels by merging\nclass-specific probabilities, attention scores, fused features, and point-level\nlabels. DFCL is utilized to enhance feature similarity for the same categories\nand feature variability for different categories while capturing global\nrepresentations across the entire datasets. 
Extensive experiments on the\nCAS(ME)^2, CAS(ME)^3, and SAMM-LV datasets demonstrate PWES achieves promising\nperformance comparable to that of recent fully-supervised methods.\n","authors":["Wang-Wang Yu","Xian-Shi Zhang","Fu-Ya Luo","Yijun Cao","Kai-Fu Yang","Hong-Mei Yan","Yong-Jie Li"],"pdf_url":"https://arxiv.org/pdf/2403.14240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13507v2","updated":"2024-03-21T08:54:27Z","published":"2024-03-20T11:05:07Z","title":"FMM-Attack: A Flow-based Multi-modal Adversarial Attack on Video-based\n LLMs","summary":" Despite the remarkable performance of video-based large language models\n(LLMs), their adversarial threat remains unexplored. To fill this gap, we\npropose the first adversarial attack tailored for video-based LLMs by crafting\nflow-based multi-modal adversarial perturbations on a small fraction of frames\nwithin a video, dubbed FMM-Attack. Extensive experiments show that our attack\ncan effectively induce video-based LLMs to generate incorrect answers when\nvideos are added with imperceptible adversarial perturbations. Intriguingly,\nour FMM-Attack can also induce garbling in the model output, prompting\nvideo-based LLMs to hallucinate. Overall, our observations inspire a further\nunderstanding of multi-modal robustness and safety-related feature alignment\nacross different modalities, which is of great importance for various large\nmulti-modal models. Our code is available at\nhttps://github.com/THU-Kingmin/FMM-Attack.\n","authors":["Jinmin Li","Kuofeng Gao","Yang Bai","Jingyun Zhang","Shu-tao Xia","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2403.13507v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14235v1","updated":"2024-03-21T08:52:39Z","published":"2024-03-21T08:52:39Z","title":"RG-CAT: Detection Pipeline and Catalogue of Radio Galaxies in the EMU\n Pilot Survey","summary":" We present source detection and catalogue construction pipelines to build the\nfirst catalogue of radio galaxies from the 270 $\\rm deg^2$ pilot survey of the\nEvolutionary Map of the Universe (EMU-PS) conducted with the Australian Square\nKilometre Array Pathfinder (ASKAP) telescope. The detection pipeline uses\nGal-DINO computer-vision networks (Gupta et al., 2024) to predict the\ncategories of radio morphology and bounding boxes for radio sources, as well as\ntheir potential infrared host positions. The Gal-DINO network is trained and\nevaluated on approximately 5,000 visually inspected radio galaxies and their\ninfrared hosts, encompassing both compact and extended radio morphologies. We\nfind that the Intersection over Union (IoU) for the predicted and ground truth\nbounding boxes is larger than 0.5 for 99% of the radio sources, and 98% of\npredicted host positions are within $3^{\\prime \\prime}$ of the ground truth\ninfrared host in the evaluation set. The catalogue construction pipeline uses\nthe predictions of the trained network on the radio and infrared image cutouts\nbased on the catalogue of radio components identified using the Selavy source\nfinder algorithm. Confidence scores of the predictions are then used to\nprioritize Selavy components with higher scores and incorporate them first into\nthe catalogue. This results in identifications for a total of 211,625 radio\nsources, with 201,211 classified as compact and unresolved. 
The remaining\n10,414 are categorized as extended radio morphologies, including 582 FR-I,\n5,602 FR-II, 1,494 FR-x (uncertain whether FR-I or FR-II), 2,375 R (single-peak\nresolved) radio galaxies, and 361 with peculiar and other rare morphologies. We\ncross-match the radio sources in the catalogue with the infrared and optical\ncatalogues, finding infrared cross-matches for 73% and photometric redshifts\nfor 36% of the radio galaxies.\n","authors":["Nikhel Gupta","Ray P. Norris","Zeeshan Hayder","Minh Huynh","Lars Petersson","X. Rosalind Wang","Andrew M. Hopkins","Heinz Andernach","Yjan Gordon","Simone Riggi","Miranda Yew","Evan J. Crawford","Bärbel Koribalski","Miroslav D. Filipović","Anna D. Kapinśka","Stanislav Shabala","Tessa Vernstrom","Joshua R. Marvil"],"pdf_url":"https://arxiv.org/pdf/2403.14235v1.pdf","comment":"Accepted for publication in PASA. The paper has 22 pages, 12 figures\n and 5 tables"},{"id":"http://arxiv.org/abs/2212.02340v3","updated":"2024-03-21T08:50:54Z","published":"2022-12-05T15:15:27Z","title":"CBNet: A Plug-and-Play Network for Segmentation-Based Scene Text\n Detection","summary":" Recently, segmentation-based methods are quite popular in scene text\ndetection, which mainly contain two steps: text kernel segmentation and\nexpansion. However, the segmentation process only considers each pixel\nindependently, and the expansion process is difficult to achieve a favorable\naccuracy-speed trade-off. In this paper, we propose a Context-aware and\nBoundary-guided Network (CBN) to tackle these problems. In CBN, a basic text\ndetector is firstly used to predict initial segmentation results. Then, we\npropose a context-aware module to enhance text kernel feature representations,\nwhich considers both global and local contexts. Finally, we introduce a\nboundary-guided module to expand enhanced text kernels adaptively with only the\npixels on the contours, which not only obtains accurate text boundaries but\nalso keeps high speed, especially on high-resolution output maps. In\nparticular, with a lightweight backbone, the basic detector equipped with our\nproposed CBN achieves state-of-the-art results on several popular benchmarks,\nand our proposed CBN can be plugged into several segmentation-based methods.\nCode is available at https://github.com/XiiZhao/cbn.pytorch.\n","authors":["Xi Zhao","Wei Feng","Zheng Zhang","Jingjing Lv","Xin Zhu","Zhangang Lin","Jinghe Hu","Jingping Shao"],"pdf_url":"https://arxiv.org/pdf/2212.02340v3.pdf","comment":"Accepted by IJCV 2024. Code is available at this https URL:\n https://github.com/XiiZhao/cbn.pytorch"},{"id":"http://arxiv.org/abs/2403.14233v1","updated":"2024-03-21T08:49:34Z","published":"2024-03-21T08:49:34Z","title":"SoftPatch: Unsupervised Anomaly Detection with Noisy Data","summary":" Although mainstream unsupervised anomaly detection (AD) algorithms perform\nwell in academic datasets, their performance is limited in practical\napplication due to the ideal experimental setting of clean training data.\nTraining with noisy data is an inevitable problem in real-world anomaly\ndetection but is seldom discussed. This paper considers label-level noise in\nimage sensory anomaly detection for the first time. To solve this problem, we\nproposed a memory-based unsupervised AD method, SoftPatch, which efficiently\ndenoises the data at the patch level. Noise discriminators are utilized to\ngenerate outlier scores for patch-level noise elimination before coreset\nconstruction. 
The scores are then stored in the memory bank to soften the\nanomaly detection boundary. Compared with existing methods, SoftPatch maintains\na strong modeling ability of normal data and alleviates the overconfidence\nproblem in coreset. Comprehensive experiments in various noise scenes\ndemonstrate that SoftPatch outperforms the state-of-the-art AD methods on the\nMVTecAD and BTAD benchmarks and is comparable to those methods under the\nsetting without noise.\n","authors":["Xi Jiang","Ying Chen","Qiang Nie","Yong Liu","Jianlin Liu","Bin-Bin Gao","Jun Liu","Chengjie Wang","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.14233v1.pdf","comment":"36th Conference on Neural Information Processing Systems"},{"id":"http://arxiv.org/abs/2402.03631v2","updated":"2024-03-21T08:36:15Z","published":"2024-02-06T02:00:18Z","title":"Conditional Tuning Network for Few-Shot Adaptation of Segmentation\n Anything Model","summary":" The recent Segment Anything Model (SAM) has demonstrated remarkable zero-shot\ncapability and flexible geometric prompting in general image segmentation.\nHowever, SAM often struggles when handling various unconventional images, such\nas aerial, medical, and non-RGB images. This paper presents CAT-SAM, a\nConditionAl Tuning network that adapts SAM toward various unconventional target\ntasks with just few-shot target samples. CAT-SAM freezes the entire SAM and\nadapts its mask decoder and image encoder simultaneously with a small number of\nlearnable parameters. The core design is a prompt bridge structure that enables\ndecoder-conditioned joint tuning of the heavyweight image encoder and the\nlightweight mask decoder. The bridging maps the prompt token of the mask\ndecoder to the image encoder, fostering synergic adaptation of the encoder and\nthe decoder with mutual benefits. We develop two representative tuning\nstrategies for the image encoder which leads to two CAT-SAM variants: one\ninjecting learnable prompt tokens in the input space and the other inserting\nlightweight adapter networks. Extensive experiments over 11 unconventional\ntasks show that both CAT-SAM variants achieve superior target segmentation\nperformance consistently even under the very challenging one-shot adaptation\nsetup. Project page: https://xiaoaoran.github.io/projects/CAT-SAM\n","authors":["Aoran Xiao","Weihao Xuan","Heli Qi","Yun Xing","Ruijie Ren","Xiaoqin Zhang","Ling Shao","Shijian Lu"],"pdf_url":"https://arxiv.org/pdf/2402.03631v2.pdf","comment":"Project page: https://xiaoaoran.github.io/projects/CAT-SAM"},{"id":"http://arxiv.org/abs/2309.06670v4","updated":"2024-03-21T08:30:54Z","published":"2023-09-13T02:15:29Z","title":"ShaDocFormer: A Shadow-Attentive Threshold Detector With Cascaded Fusion\n Refiner for Document Shadow Removal","summary":" Document shadow is a common issue that arises when capturing documents using\nmobile devices, which significantly impacts readability. Current methods\nencounter various challenges, including inaccurate detection of shadow masks\nand estimation of illumination. In this paper, we propose ShaDocFormer, a\nTransformer-based architecture that integrates traditional methodologies and\ndeep learning techniques to tackle the problem of document shadow removal. The\nShaDocFormer architecture comprises two components: the Shadow-attentive\nThreshold Detector (STD) and the Cascaded Fusion Refiner (CFR). 
The STD module\nemploys a traditional thresholding technique and leverages the attention\nmechanism of the Transformer to gather global information, thereby enabling\nprecise detection of shadow masks. The cascaded and aggregative structure of\nthe CFR module facilitates a coarse-to-fine restoration process for the entire\nimage. As a result, ShaDocFormer excels in accurately detecting and capturing\nvariations in both shadow and illumination, thereby enabling effective removal\nof shadows. Extensive experiments demonstrate that ShaDocFormer outperforms\ncurrent state-of-the-art methods in both qualitative and quantitative\nmeasurements.\n","authors":["Weiwen Chen","Yingtie Lei","Shenghong Luo","Ziyang Zhou","Mingxian Li","Chi-Man Pun"],"pdf_url":"https://arxiv.org/pdf/2309.06670v4.pdf","comment":"Accepted by IJCNN 2024"},{"id":"http://arxiv.org/abs/2312.07485v2","updated":"2024-03-21T08:19:05Z","published":"2023-12-12T18:21:36Z","title":"MinD-3D: Reconstruct High-quality 3D objects in Human Brain","summary":" In this paper, we introduce Recon3DMind, an innovative task aimed at\nreconstructing 3D visuals from Functional Magnetic Resonance Imaging (fMRI)\nsignals, marking a significant advancement in the fields of cognitive\nneuroscience and computer vision. To support this pioneering task, we present\nthe fMRI-Shape dataset, which includes data from 14 participants and features\n360-degree videos of 3D objects to enable comprehensive fMRI signal capture\nacross various settings, thereby laying a foundation for future research.\nFurthermore, we propose MinD-3D, a novel and effective three-stage framework\nspecifically designed to decode the brain's 3D visual information from fMRI\nsignals, demonstrating the feasibility of this challenging task. The framework\nbegins by extracting and aggregating features from fMRI frames through a\nneuro-fusion encoder, subsequently employs a feature bridge diffusion model to\ngenerate visual features, and ultimately recovers the 3D object via a\ngenerative transformer decoder. We assess the performance of MinD-3D using a\nsuite of semantic and structural metrics and analyze the correlation between\nthe features extracted by our model and the visual regions of interest (ROIs)\nin fMRI signals. Our findings indicate that MinD-3D not only reconstructs 3D\nobjects with high semantic relevance and spatial similarity but also\nsignificantly enhances our understanding of the human brain's capabilities in\nprocessing 3D visual information. Project page at:\nhttps://jianxgao.github.io/MinD-3D.\n","authors":["Jianxiong Gao","Yuqian Fu","Yun Wang","Xuelin Qian","Jianfeng Feng","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2312.07485v2.pdf","comment":"26 pages, 13 figures"},{"id":"http://arxiv.org/abs/2403.14213v1","updated":"2024-03-21T08:08:31Z","published":"2024-03-21T08:08:31Z","title":"Toward Multi-class Anomaly Detection: Exploring Class-aware Unified\n Model against Inter-class Interference","summary":" In the context of high usability in single-class anomaly detection models,\nrecent academic research has become concerned about the more complex\nmulti-class anomaly detection. Although several papers have designed unified\nmodels for this task, they often overlook the utility of class labels, a potent\ntool for mitigating inter-class interference. To address this issue, we\nintroduce a Multi-class Implicit Neural representation Transformer for unified\nAnomaly Detection (MINT-AD), which leverages the fine-grained category\ninformation in the training stage. 
By learning the multi-class distributions,\nthe model generates class-aware query embeddings for the transformer decoder,\nmitigating inter-class interference within the reconstruction model. Utilizing\nsuch an implicit neural representation network, MINT-AD can project category\nand position information into a feature embedding space, further supervised by\nclassification and prior probability loss functions. Experimental results on\nmultiple datasets demonstrate that MINT-AD outperforms existing unified\ntraining models.\n","authors":["Xi Jiang","Ying Chen","Qiang Nie","Jianlin Liu","Yong Liu","Chengjie Wang","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.14213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14203v1","updated":"2024-03-21T07:56:09Z","published":"2024-03-21T07:56:09Z","title":"Unsupervised Audio-Visual Segmentation with Modality Alignment","summary":" Audio-Visual Segmentation (AVS) aims to identify, at the pixel level, the\nobject in a visual scene that produces a given sound. Current AVS methods rely\non costly fine-grained annotations of mask-audio pairs, making them impractical\nfor scalability. To address this, we introduce unsupervised AVS, eliminating\nthe need for such expensive annotation. To tackle this more challenging\nproblem, we propose an unsupervised learning method, named Modality\nCorrespondence Alignment (MoCA), which seamlessly integrates off-the-shelf\nfoundation models like DINO, SAM, and ImageBind. This approach leverages their\nknowledge complementarity and optimizes their joint usage for multi-modality\nassociation. Initially, we estimate positive and negative image pairs in the\nfeature space. For pixel-level association, we introduce an audio-visual\nadapter and a novel pixel matching aggregation strategy within the image-level\ncontrastive learning framework. This allows for a flexible connection between\nobject appearance and audio signal at the pixel level, with tolerance to\nimaging variations such as translation and rotation. Extensive experiments on\nthe AVSBench (single and multi-object splits) and AVSS datasets demonstrate\nthat our MoCA outperforms strongly designed baseline methods and approaches\nsupervised counterparts, particularly in complex scenarios with multiple\nauditory objects. Notably when comparing mIoU, MoCA achieves a substantial\nimprovement over baselines in both the AVSBench (S4: +17.24%; MS3: +67.64%) and\nAVSS (+19.23%) audio-visual segmentation challenges.\n","authors":["Swapnil Bhosale","Haosen Yang","Diptesh Kanojia","Jiangkang Deng","Xiatian Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.14203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11193v2","updated":"2024-03-21T07:53:23Z","published":"2024-03-17T12:40:46Z","title":"Neural Markov Random Field for Stereo Matching","summary":" Stereo matching is a core task for many computer vision and robotics\napplications. Despite their dominance in traditional stereo methods, the\nhand-crafted Markov Random Field (MRF) models lack sufficient modeling accuracy\ncompared to end-to-end deep models. 
While deep learning representations have\ngreatly improved the unary terms of the MRF models, the overall accuracy is\nstill severely limited by the hand-crafted pairwise terms and message passing.\nTo address these issues, we propose a neural MRF model, where both potential\nfunctions and message passing are designed using data-driven neural networks.\nOur fully data-driven model is built on the foundation of variational inference\ntheory, to prevent convergence issues and retain stereo MRF's graph inductive\nbias. To make the inference tractable and scale well to high-resolution images,\nwe also propose a Disparity Proposal Network (DPN) to adaptively prune the\nsearch space of disparity. The proposed approach ranks $1^{st}$ on both KITTI\n2012 and 2015 leaderboards among all published methods while running faster\nthan 100 ms. This approach significantly outperforms prior global methods,\ne.g., lowering D1 metric by more than 50% on KITTI 2015. In addition, our\nmethod exhibits strong cross-domain generalization and can recover sharp edges.\nThe codes at https://github.com/aeolusguan/NMRF\n","authors":["Tongfan Guan","Chen Wang","Yun-Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2403.11193v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2311.11178v3","updated":"2024-03-21T07:52:03Z","published":"2023-11-18T22:42:16Z","title":"Active Prompt Learning in Vision Language Models","summary":" Pre-trained Vision Language Models (VLMs) have demonstrated notable progress\nin various zero-shot tasks, such as classification and retrieval. Despite their\nperformance, because improving performance on new tasks requires task-specific\nknowledge, their adaptation is essential. While labels are needed for the\nadaptation, acquiring them is typically expensive. To overcome this challenge,\nactive learning, a method of achieving a high performance by obtaining labels\nfor a small number of samples from experts, has been studied. Active learning\nprimarily focuses on selecting unlabeled samples for labeling and leveraging\nthem to train models. In this study, we pose the question, \"how can the\npre-trained VLMs be adapted under the active learning framework?\" In response\nto this inquiry, we observe that (1) simply applying a conventional active\nlearning framework to pre-trained VLMs even may degrade performance compared to\nrandom selection because of the class imbalance in labeling candidates, and (2)\nthe knowledge of VLMs can provide hints for achieving the balance before\nlabeling. Based on these observations, we devise a novel active learning\nframework for VLMs, denoted as PCB. To assess the effectiveness of our\napproach, we conduct experiments on seven different real-world datasets, and\nthe results demonstrate that PCB surpasses conventional active learning and\nrandom sampling methods. Code will be available in\nhttps://github.com/kaist-dmlab/pcb .\n","authors":["Jihwan Bang","Sumyeong Ahn","Jae-Gil Lee"],"pdf_url":"https://arxiv.org/pdf/2311.11178v3.pdf","comment":"accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.14200v1","updated":"2024-03-21T07:50:45Z","published":"2024-03-21T07:50:45Z","title":"Debiasing surgeon: fantastic weights and how to find them","summary":" Nowadays an ever-growing concerning phenomenon, the emergence of algorithmic\nbiases that can lead to unfair models, emerges. 
Several debiasing approaches\nhave been proposed in the realm of deep learning, employing more or less\nsophisticated approaches to discourage these models from massively employing\nthese biases. However, a question emerges: is this extra complexity really\nnecessary? Is a vanilla-trained model already embodying some ``unbiased\nsub-networks'' that can be used in isolation and propose a solution without\nrelying on the algorithmic biases? In this work, we show that such a\nsub-network typically exists, and can be extracted from a vanilla-trained model\nwithout requiring additional training. We further validate that such specific\narchitecture is incapable of learning a specific bias, suggesting that there\nare possible architectural countermeasures to the problem of biases in deep\nneural networks.\n","authors":["Rémi Nahon","Ivan Luiz De Moura Matos","Van-Tam Nguyen","Enzo Tartaglione"],"pdf_url":"https://arxiv.org/pdf/2403.14200v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14198v1","updated":"2024-03-21T07:48:35Z","published":"2024-03-21T07:48:35Z","title":"Unleashing Unlabeled Data: A Paradigm for Cross-View Geo-Localization","summary":" This paper investigates the effective utilization of unlabeled data for\nlarge-area cross-view geo-localization (CVGL), encompassing both unsupervised\nand semi-supervised settings. Common approaches to CVGL rely on\nground-satellite image pairs and employ label-driven supervised training.\nHowever, the cost of collecting precise cross-view image pairs hinders the\ndeployment of CVGL in real-life scenarios. Without the pairs, CVGL will be more\nchallenging to handle the significant imaging and spatial gaps between ground\nand satellite images. To this end, we propose an unsupervised framework\nincluding a cross-view projection to guide the model for retrieving initial\npseudo-labels and a fast re-ranking mechanism to refine the pseudo-labels by\nleveraging the fact that ``the perfectly paired ground-satellite image is\nlocated in a unique and identical scene\". The framework exhibits competitive\nperformance compared with supervised works on three open-source benchmarks. Our\ncode and models will be released on https://github.com/liguopeng0923/UCVGL.\n","authors":["Guopeng Li","Ming Qian","Gui-Song Xia"],"pdf_url":"https://arxiv.org/pdf/2403.14198v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2311.15876v2","updated":"2024-03-21T07:38:51Z","published":"2023-11-27T14:49:06Z","title":"LMM-Assisted Breast Cancer Treatment Target Segmentation with\n Consistency Embedding","summary":" Recent advancements in Artificial Intelligence (AI) have profoundly\ninfluenced medical fields, by providing tools to reduce clinical workloads.\nHowever, most AI models are constrained to execute unimodal tasks, in stark\ncontrast to the comprehensive approaches utilized by medical professionals. To\naddress this, here we present RO-LMM, a multi-purpose large multimodal model\n(LMM) tailored for the field of radiation oncology. This model covers series of\ntasks within clinical workflow, adept at clinical report summarization,\nradiation treatment plan suggestion, and plan-guided target volume\nsegmentation. 
In particular, to perform consecutive clinical tasks, we further\npresent a novel Consistency Embedding Fine-Tuning (CEFTune) technique, which\nboosts LMM's robustness to noisy inputs while preserving the capability of\nhandling clean inputs, and transform this concept into LMM-driven segmentation\nframework as Consistency Embedding Segmentation~(CESEG). Experimental results\non multi-centre cohorts demonstrate our RO-LMM's promising performance for\nmultiple clinical tasks with generalization capabilities.\n","authors":["Kwanyoung Kim","Yujin Oh","Sangjoon Park","Hwa Kyung Byun","Jin Sung Kim","Yong Bae Kim","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.15876v2.pdf","comment":"30 pages, 16 table, 5 figures"},{"id":"http://arxiv.org/abs/2403.14191v1","updated":"2024-03-21T07:34:31Z","published":"2024-03-21T07:34:31Z","title":"PECI-Net: Bolus segmentation from video fluoroscopic swallowing study\n images using preprocessing ensemble and cascaded inference","summary":" Bolus segmentation is crucial for the automated detection of swallowing\ndisorders in videofluoroscopic swallowing studies (VFSS). However, it is\ndifficult for the model to accurately segment a bolus region in a VFSS image\nbecause VFSS images are translucent, have low contrast and unclear region\nboundaries, and lack color information. To overcome these challenges, we\npropose PECI-Net, a network architecture for VFSS image analysis that combines\ntwo novel techniques: the preprocessing ensemble network (PEN) and the cascaded\ninference network (CIN). PEN enhances the sharpness and contrast of the VFSS\nimage by combining multiple preprocessing algorithms in a learnable way. CIN\nreduces ambiguity in bolus segmentation by using context from other regions\nthrough cascaded inference. Moreover, CIN prevents undesirable side effects\nfrom unreliably segmented regions by referring to the context in an asymmetric\nway. In experiments, PECI-Net exhibited higher performance than four recently\ndeveloped baseline models, outperforming TernausNet, the best among the\nbaseline models, by 4.54\\% and the widely used UNet by 10.83\\%. The results of\nthe ablation studies confirm that CIN and PEN are effective in improving bolus\nsegmentation performance.\n","authors":["Dougho Park","Younghun Kim","Harim Kang","Junmyeoung Lee","Jinyoung Choi","Taeyeon Kim","Sangeok Lee","Seokil Son","Minsol Kim","Injung Kim"],"pdf_url":"https://arxiv.org/pdf/2403.14191v1.pdf","comment":"20 pages, 8 figures,"},{"id":"http://arxiv.org/abs/2402.17159v2","updated":"2024-03-21T07:29:34Z","published":"2024-02-27T02:47:09Z","title":"NocPlace: Nocturnal Visual Place Recognition via Generative and\n Inherited Knowledge Transfer","summary":" Visual Place Recognition (VPR) is crucial in computer vision, aiming to\nretrieve database images similar to a query image from an extensive collection\nof known images. However, like many vision tasks, VPR always degrades at night\ndue to the scarcity of nighttime images. Moreover, VPR needs to address the\ncross-domain problem of night-to-day rather than just the issue of a single\nnighttime domain. In response to these issues, we present NocPlace, which\nleverages generative and inherited knowledge transfer to embed resilience\nagainst dazzling lights and extreme darkness in the global descriptor. First,\nwe establish a day-night urban scene dataset called NightCities, capturing\ndiverse lighting variations and dark scenarios across 60 cities globally. 
Then,\nan image generation network is trained on this dataset and processes a\nlarge-scale VPR dataset, obtaining its nighttime version. Finally, VPR models\nare fine-tuned using descriptors inherited from themselves and night-style\nimages, which builds explicit cross-domain contrastive relationships.\nComprehensive experiments on various datasets demonstrate our contributions and\nthe superiority of NocPlace. Without adding any real-time computing resources,\nNocPlace improves the performance of Eigenplaces by 7.6% on Tokyo 24/7 Night\nand 16.8% on SVOX Night.\n","authors":["Bingxi Liu","Yiqun Wang","Huaqi Tao","Tingjun Huang","Fulin Tang","Yihong Wu","Jinqiang Cui","Hong Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.17159v2.pdf","comment":"28 pages,9 figures"},{"id":"http://arxiv.org/abs/2403.14186v1","updated":"2024-03-21T07:21:51Z","published":"2024-03-21T07:21:51Z","title":"StyleCineGAN: Landscape Cinemagraph Generation using a Pre-trained\n StyleGAN","summary":" We propose a method that can generate cinemagraphs automatically from a still\nlandscape image using a pre-trained StyleGAN. Inspired by the success of recent\nunconditional video generation, we leverage a powerful pre-trained image\ngenerator to synthesize high-quality cinemagraphs. Unlike previous approaches\nthat mainly utilize the latent space of a pre-trained StyleGAN, our approach\nutilizes its deep feature space for both GAN inversion and cinemagraph\ngeneration. Specifically, we propose multi-scale deep feature warping (MSDFW),\nwhich warps the intermediate features of a pre-trained StyleGAN at different\nresolutions. By using MSDFW, the generated cinemagraphs are of high resolution\nand exhibit plausible looping animation. We demonstrate the superiority of our\nmethod through user studies and quantitative comparisons with state-of-the-art\ncinemagraph generation methods and a video generation method that uses a\npre-trained StyleGAN.\n","authors":["Jongwoo Choi","Kwanggyoon Seo","Amirsaman Ashtari","Junyong Noh"],"pdf_url":"https://arxiv.org/pdf/2403.14186v1.pdf","comment":"Project website: https://jeolpyeoni.github.io/stylecinegan_project/"},{"id":"http://arxiv.org/abs/2310.02712v2","updated":"2024-03-21T07:20:35Z","published":"2023-10-04T10:28:38Z","title":"ED-NeRF: Efficient Text-Guided Editing of 3D Scene with Latent Space\n NeRF","summary":" Recently, there has been a significant advancement in text-to-image diffusion\nmodels, leading to groundbreaking performance in 2D image generation. These\nadvancements have been extended to 3D models, enabling the generation of novel\n3D objects from textual descriptions. This has evolved into NeRF editing\nmethods, which allow the manipulation of existing 3D objects through textual\nconditioning. However, existing NeRF editing techniques have faced limitations\nin their performance due to slow training speeds and the use of loss functions\nthat do not adequately consider editing. To address this, here we present a\nnovel 3D NeRF editing approach dubbed ED-NeRF by successfully embedding\nreal-world scenes into the latent space of the latent diffusion model (LDM)\nthrough a unique refinement layer. This approach enables us to obtain a NeRF\nbackbone that is not only faster but also more amenable to editing compared to\ntraditional image space NeRF editing. Furthermore, we propose an improved loss\nfunction tailored for editing by migrating the delta denoising score (DDS)\ndistillation loss, originally used in 2D image editing to the three-dimensional\ndomain. 
This novel loss function surpasses the well-known score distillation\nsampling (SDS) loss in terms of suitability for editing purposes. Our\nexperimental results demonstrate that ED-NeRF achieves faster editing speed\nwhile producing improved output quality compared to state-of-the-art 3D editing\nmodels.\n","authors":["Jangho Park","Gihyun Kwon","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2310.02712v2.pdf","comment":"ICLR 2024; Project Page: https://jhq1234.github.io/ed-nerf.github.io/"},{"id":"http://arxiv.org/abs/2403.14183v1","updated":"2024-03-21T07:15:37Z","published":"2024-03-21T07:15:37Z","title":"OTSeg: Multi-prompt Sinkhorn Attention for Zero-Shot Semantic\n Segmentation","summary":" The recent success of CLIP has demonstrated promising results in zero-shot\nsemantic segmentation by transferring muiltimodal knowledge to pixel-level\nclassification. However, leveraging pre-trained CLIP knowledge to closely align\ntext embeddings with pixel embeddings still has limitations in existing\napproaches. To address this issue, we propose OTSeg, a novel multimodal\nattention mechanism aimed at enhancing the potential of multiple text prompts\nfor matching associated pixel embeddings. We first propose Multi-Prompts\nSinkhorn (MPS) based on the Optimal Transport (OT) algorithm, which leads\nmultiple text prompts to selectively focus on various semantic features within\nimage pixels. Moreover, inspired by the success of Sinkformers in unimodal\nsettings, we introduce the extension of MPS, called Multi-Prompts Sinkhorn\nAttention (MPSA), which effectively replaces cross-attention mechanisms within\nTransformer framework in multimodal settings. Through extensive experiments, we\ndemonstrate that OTSeg achieves state-of-the-art (SOTA) performance with\nsignificant gains on Zero-Shot Semantic Segmentation (ZS3) tasks across three\nbenchmark datasets.\n","authors":["Kwanyoung Kim","Yujin Oh","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2403.14183v1.pdf","comment":"22 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.14174v1","updated":"2024-03-21T06:53:40Z","published":"2024-03-21T06:53:40Z","title":"Unified Static and Dynamic Network: Efficient Temporal Filtering for\n Video Grounding","summary":" Inspired by the activity-silent and persistent activity mechanisms in human\nvisual perception biology, we design a Unified Static and Dynamic Network\n(UniSDNet), to learn the semantic association between the video and text/audio\nqueries in a cross-modal environment for efficient video grounding. For static\nmodeling, we devise a novel residual structure (ResMLP) to boost the global\ncomprehensive interaction between the video segments and queries, achieving\nmore effective semantic enhancement/supplement. For dynamic modeling, we\neffectively exploit three characteristics of the persistent activity mechanism\nin our network design for a better video context comprehension. Specifically,\nwe construct a diffusely connected video clip graph on the basis of 2D sparse\ntemporal masking to reflect the \"short-term effect\" relationship. We\ninnovatively consider the temporal distance and relevance as the joint\n\"auxiliary evidence clues\" and design a multi-kernel Temporal Gaussian Filter\nto expand the context clue into high-dimensional space, simulating the \"complex\nvisual perception\", and then conduct element level filtering convolution\noperations on neighbour clip nodes in message passing stage for finally\ngenerating and ranking the candidate proposals. 
Our UniSDNet is applicable to\nboth Natural Language Video Grounding (NLVG) and Spoken Language Video\nGrounding (SLVG) tasks. Our UniSDNet achieves SOTA performance on three widely\nused datasets for NLVG, as well as three datasets for SLVG, e.g., reporting new\nrecords at 38.88% R@1,IoU@0.7 on ActivityNet Captions and 40.26% R@1,IoU@0.5 on\nTACoS. To facilitate this field, we collect two new datasets (Charades-STA\nSpeech and TACoS Speech) for SLVG task. Meanwhile, the inference speed of our\nUniSDNet is 1.56$\\times$ faster than the strong multi-query benchmark. Code is\navailable at: https://github.com/xian-sh/UniSDNet.\n","authors":["Jingjing Hu","Dan Guo","Kun Li","Zhan Si","Xun Yang","Xiaojun Chang","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.14174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09559v2","updated":"2024-03-21T06:51:16Z","published":"2024-03-14T16:47:25Z","title":"Less is More: Data Value Estimation for Visual Instruction Tuning","summary":" Visual instruction tuning is the key to building multimodal large language\nmodels (MLLMs), which greatly improves the reasoning capabilities of large\nlanguage models (LLMs) in vision scenario. However, existing MLLMs mostly rely\non a mixture of multiple highly diverse visual instruction datasets for\ntraining (even more than a million instructions), which may introduce data\nredundancy. To investigate this issue, we conduct a series of empirical\nstudies, which reveal a significant redundancy within the visual instruction\ndatasets, and show that greatly reducing the amount of several instruction\ndataset even do not affect the performance. Based on the findings, we propose a\nnew data selection approach TIVE, to eliminate redundancy within visual\ninstruction data. TIVE first estimates the task-level and instance-level value\nof the visual instructions based on computed gradients. Then, according to the\nestimated values, TIVE determines the task proportion within the visual\ninstructions, and selects representative instances to compose a smaller visual\ninstruction subset for training. Experiments on LLaVA-1.5 show that our\napproach using only about 7.5% data can achieve comparable performance as the\nfull-data fine-tuned model across seven benchmarks, even surpassing it on four\nof the benchmarks. Our code and data will be publicly released.\n","authors":["Zikang Liu","Kun Zhou","Wayne Xin Zhao","Dawei Gao","Yaliang Li","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2403.09559v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07798v2","updated":"2024-03-21T06:45:53Z","published":"2024-03-12T16:35:32Z","title":"A Fourier Transform Framework for Domain Adaptation","summary":" By using unsupervised domain adaptation (UDA), knowledge can be transferred\nfrom a label-rich source domain to a target domain that contains relevant\ninformation but lacks labels. Many existing UDA algorithms suffer from directly\nusing raw images as input, resulting in models that overly focus on redundant\ninformation and exhibit poor generalization capability. To address this issue,\nwe attempt to improve the performance of unsupervised domain adaptation by\nemploying the Fourier method (FTF).Specifically, FTF is inspired by the\namplitude of Fourier spectra, which primarily preserves low-level statistical\ninformation. In FTF, we effectively incorporate low-level information from the\ntarget domain into the source domain by fusing the amplitudes of both domains\nin the Fourier domain. 
Additionally, we observe that extracting features from\nbatches of images can eliminate redundant information while retaining\nclass-specific features relevant to the task. Building upon this observation,\nwe apply the Fourier Transform at the data stream level for the first time. To\nfurther align multiple sources of data, we introduce the concept of correlation\nalignment. To evaluate the effectiveness of our FTF method, we conducted\nevaluations on four benchmark datasets for domain adaptation, including\nOffice-31, Office-Home, ImageCLEF-DA, and Office-Caltech. Our results\ndemonstrate superior performance.\n","authors":["Le Luo","Bingrong Xu","Qingyong Zhang","Cheng Lian","Jie Luo"],"pdf_url":"https://arxiv.org/pdf/2403.07798v2.pdf","comment":"The paper contains significant errors and the experimental\n methodology is not rigorous. The experimental section and methodology need to\n be rewritten"},{"id":"http://arxiv.org/abs/2308.13223v2","updated":"2024-03-21T06:45:32Z","published":"2023-08-25T07:39:26Z","title":"EfficientDreamer: High-Fidelity and Robust 3D Creation via\n Orthogonal-view Diffusion Prior","summary":" While image diffusion models have made significant progress in text-driven 3D\ncontent creation, they often fail to accurately capture the intended meaning of\ntext prompts, especially for view information. This limitation leads to the\nJanus problem, where multi-faced 3D models are generated under the guidance of\nsuch diffusion models. In this paper, we propose a robust high-quality 3D\ncontent generation pipeline by exploiting orthogonal-view image guidance.\nFirst, we introduce a novel 2D diffusion model that generates an image\nconsisting of four orthogonal-view sub-images based on the given text prompt.\nThen, the 3D content is created using this diffusion model. Notably, the\ngenerated orthogonal-view image provides strong geometric structure priors and\nthus improves 3D consistency. As a result, it effectively resolves the Janus\nproblem and significantly enhances the quality of 3D content creation.\nAdditionally, we present a 3D synthesis fusion network that can further improve\nthe details of the generated 3D contents. Both quantitative and qualitative\nevaluations demonstrate that our method surpasses previous text-to-3D\ntechniques. Project page: https://efficientdreamer.github.io.\n","authors":["Zhipeng Hu","Minda Zhao","Chaoyi Zhao","Xinyue Liang","Lincheng Li","Zeng Zhao","Changjie Fan","Xiaowei Zhou","Xin Yu"],"pdf_url":"https://arxiv.org/pdf/2308.13223v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14166v1","updated":"2024-03-21T06:34:46Z","published":"2024-03-21T06:34:46Z","title":"Mini-Splatting: Representing Scenes with a Constrained Number of\n Gaussians","summary":" In this study, we explore the challenge of efficiently representing scenes\nwith a constrained number of Gaussians. Our analysis shifts from traditional\ngraphics and 2D computer vision to the perspective of point clouds,\nhighlighting the inefficient spatial distribution of Gaussian representation as\na key limitation in model performance. To address this, we introduce strategies\nfor densification including blur split and depth reinitialization, and\nsimplification through Gaussian binarization and sampling. These techniques\nreorganize the spatial positions of the Gaussians, resulting in significant\nimprovements across various datasets and benchmarks in terms of rendering\nquality, resource consumption, and storage compression. 
Our proposed\nMini-Splatting method integrates seamlessly with the original rasterization\npipeline, providing a strong baseline for future research in\nGaussian-Splatting-based works.\n","authors":["Guangchi Fang","Bing Wang"],"pdf_url":"https://arxiv.org/pdf/2403.14166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14163v1","updated":"2024-03-21T06:32:36Z","published":"2024-03-21T06:32:36Z","title":"Leveraging Large Language Model-based Room-Object Relationships\n Knowledge for Enhancing Multimodal-Input Object Goal Navigation","summary":" Object-goal navigation is a crucial engineering task for the community of\nembodied navigation; it involves navigating to an instance of a specified\nobject category within unseen environments. Although extensive investigations\nhave been conducted on both end-to-end and modular-based, data-driven\napproaches, fully enabling an agent to comprehend the environment through\nperceptual knowledge and perform object-goal navigation as efficiently as\nhumans remains a significant challenge. Recently, large language models have\nshown potential in this task, thanks to their powerful capabilities for\nknowledge extraction and integration. In this study, we propose a data-driven,\nmodular-based approach, trained on a dataset that incorporates common-sense\nknowledge of object-to-room relationships extracted from a large language\nmodel. We utilize the multi-channel Swin-Unet architecture to conduct\nmulti-task learning incorporating with multimodal inputs. The results in the\nHabitat simulator demonstrate that our framework outperforms the baseline by an\naverage of 10.6% in the efficiency metric, Success weighted by Path Length\n(SPL). The real-world demonstration shows that the proposed approach can\nefficiently conduct this task by traversing several rooms. For more details and\nreal-world demonstrations, please check our project webpage\n(https://sunleyuan.github.io/ObjectNav).\n","authors":["Leyuan Sun","Asako Kanezaki","Guillaume Caron","Yusuke Yoshiyasu"],"pdf_url":"https://arxiv.org/pdf/2403.14163v1.pdf","comment":"will soon submit to the Elsevier journal, Advanced Engineering\n Informatics"},{"id":"http://arxiv.org/abs/2403.14158v1","updated":"2024-03-21T06:14:46Z","published":"2024-03-21T06:14:46Z","title":"Volumetric Environment Representation for Vision-Language Navigation","summary":" Vision-language navigation (VLN) requires an agent to navigate through an 3D\nenvironment based on visual observations and natural language instructions. It\nis clear that the pivotal factor for successful navigation lies in the\ncomprehensive scene understanding. Previous VLN agents employ monocular\nframeworks to extract 2D features of perspective views directly. Though\nstraightforward, they struggle for capturing 3D geometry and semantics, leading\nto a partial and incomplete environment representation. To achieve a\ncomprehensive 3D representation with fine-grained details, we introduce a\nVolumetric Environment Representation (VER), which voxelizes the physical world\ninto structured 3D cells. For each cell, VER aggregates multi-view 2D features\ninto such a unified 3D space via 2D-3D sampling. Through coarse-to-fine feature\nextraction and multi-task learning for VER, our agent predicts 3D occupancy, 3D\nroom layout, and 3D bounding boxes jointly. Based on online collected VERs, our\nagent performs volume state estimation and builds episodic memory for\npredicting the next step. 
Experimental results show our environment\nrepresentations from multi-task learning lead to evident performance gains on\nVLN. Our model achieves state-of-the-art performance across VLN benchmarks\n(R2R, REVERIE, and R4R).\n","authors":["Rui Liu","Wenguan Wang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2403.14158v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.14155v1","updated":"2024-03-21T06:03:51Z","published":"2024-03-21T06:03:51Z","title":"Harmonizing Visual and Textual Embeddings for Zero-Shot Text-to-Image\n Customization","summary":" In a surge of text-to-image (T2I) models and their customization methods that\ngenerate new images of a user-provided subject, current works focus on\nalleviating the costs incurred by a lengthy per-subject optimization. These\nzero-shot customization methods encode the image of a specified subject into a\nvisual embedding which is then utilized alongside the textual embedding for\ndiffusion guidance. The visual embedding incorporates intrinsic information\nabout the subject, while the textual embedding provides a new, transient\ncontext. However, the existing methods often 1) are significantly affected by\nthe input images, eg., generating images with the same pose, and 2) exhibit\ndeterioration in the subject's identity. We first pin down the problem and show\nthat redundant pose information in the visual embedding interferes with the\ntextual embedding containing the desired pose information. To address this\nissue, we propose orthogonal visual embedding which effectively harmonizes with\nthe given textual embedding. We also adopt the visual-only embedding and inject\nthe subject's clear features utilizing a self-attention swap. Our results\ndemonstrate the effectiveness and robustness of our method, which offers highly\nflexible zero-shot generation while effectively maintaining the subject's\nidentity.\n","authors":["Yeji Song","Jimyeong Kim","Wonhark Park","Wonsik Shin","Wonjong Rhee","Nojun Kwak"],"pdf_url":"https://arxiv.org/pdf/2403.14155v1.pdf","comment":"Project page: https://ldynx.github.io/harmony-zero-t2i/"},{"id":"http://arxiv.org/abs/2307.12429v2","updated":"2024-03-21T05:59:17Z","published":"2023-07-23T20:55:11Z","title":"SwIPE: Efficient and Robust Medical Image Segmentation with Implicit\n Patch Embeddings","summary":" Modern medical image segmentation methods primarily use discrete\nrepresentations in the form of rasterized masks to learn features and generate\npredictions. Although effective, this paradigm is spatially inflexible, scales\npoorly to higher-resolution images, and lacks direct understanding of object\nshapes. To address these limitations, some recent works utilized implicit\nneural representations (INRs) to learn continuous representations for\nsegmentation. However, these methods often directly adopted components designed\nfor 3D shape reconstruction. More importantly, these formulations were also\nconstrained to either point-based or global contexts, lacking contextual\nunderstanding or local fine-grained details, respectively--both critical for\naccurate segmentation. To remedy this, we propose a novel approach, SwIPE\n(Segmentation with Implicit Patch Embeddings), that leverages the advantages of\nINRs and predicts shapes at the patch level--rather than at the point level or\nimage level--to enable both accurate local boundary delineation and global\nshape coherence. 
Extensive evaluations on two tasks (2D polyp segmentation and\n3D abdominal organ segmentation) show that SwIPE significantly improves over\nrecent implicit approaches and outperforms state-of-the-art discrete methods\nwith over 10x fewer parameters. Our method also demonstrates superior data\nefficiency and improved robustness to data shifts across image resolutions and\ndatasets. Code is available on Github\n(https://github.com/charzharr/miccai23-swipe-implicit-segmentation).\n","authors":["Yejia Zhang","Pengfei Gu","Nishchal Sapkota","Danny Z. Chen"],"pdf_url":"https://arxiv.org/pdf/2307.12429v2.pdf","comment":"Accepted to the 2023 International Conference on Medical Image\n Computing and Computer Assisted Intervention (MICCAI'23)"},{"id":"http://arxiv.org/abs/2403.14148v1","updated":"2024-03-21T05:48:48Z","published":"2024-03-21T05:48:48Z","title":"Efficient Video Diffusion Models via Content-Frame Motion-Latent\n Decomposition","summary":" Video diffusion models have recently made great progress in generation\nquality, but are still limited by the high memory and computational\nrequirements. This is because current video diffusion models often attempt to\nprocess high-dimensional videos directly. To tackle this issue, we propose\ncontent-motion latent diffusion model (CMD), a novel efficient extension of\npretrained image diffusion models for video generation. Specifically, we\npropose an autoencoder that succinctly encodes a video as a combination of a\ncontent frame (like an image) and a low-dimensional motion latent\nrepresentation. The former represents the common content, and the latter\nrepresents the underlying motion in the video, respectively. We generate the\ncontent frame by fine-tuning a pretrained image diffusion model, and we\ngenerate the motion latent representation by training a new lightweight\ndiffusion model. A key innovation here is the design of a compact latent space\nthat can directly utilizes a pretrained image diffusion model, which has not\nbeen done in previous latent video diffusion models. This leads to considerably\nbetter quality generation and reduced computational costs. For instance, CMD\ncan sample a video 7.7$\\times$ faster than prior approaches by generating a\nvideo of 512$\\times$1024 resolution and length 16 in 3.1 seconds. Moreover, CMD\nachieves an FVD score of 212.7 on WebVid-10M, 27.3% better than the previous\nstate-of-the-art of 292.4.\n","authors":["Sihyun Yu","Weili Nie","De-An Huang","Boyi Li","Jinwoo Shin","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2403.14148v1.pdf","comment":"ICLR 2024. Project page: https://sihyun.me/CMD"},{"id":"http://arxiv.org/abs/2301.12831v3","updated":"2024-03-21T05:39:44Z","published":"2023-01-30T12:37:04Z","title":"M3FAS: An Accurate and Robust MultiModal Mobile Face Anti-Spoofing\n System","summary":" Face presentation attacks (FPA), also known as face spoofing, have brought\nincreasing concerns to the public through various malicious applications, such\nas financial fraud and privacy leakage. Therefore, safeguarding face\nrecognition systems against FPA is of utmost importance. Although existing\nlearning-based face anti-spoofing (FAS) models can achieve outstanding\ndetection performance, they lack generalization capability and suffer\nsignificant performance drops in unforeseen environments. Many methodologies\nseek to use auxiliary modality data (e.g., depth and infrared maps) during the\npresentation attack detection (PAD) to address this limitation. 
However, these\nmethods can be limited since (1) they require specific sensors such as depth\nand infrared cameras for data capture, which are rarely available on commodity\nmobile devices, and (2) they cannot work properly in practical scenarios when\neither modality is missing or of poor quality. In this paper, we devise an\naccurate and robust MultiModal Mobile Face Anti-Spoofing system named M3FAS to\novercome the issues above. The primary innovation of this work lies in the\nfollowing aspects: (1) To achieve robust PAD, our system combines visual and\nauditory modalities using three commonly available sensors: camera, speaker,\nand microphone; (2) We design a novel two-branch neural network with three\nhierarchical feature aggregation modules to perform cross-modal feature fusion;\n(3). We propose a multi-head training strategy, allowing the model to output\npredictions from the vision, acoustic, and fusion heads, resulting in a more\nflexible PAD. Extensive experiments have demonstrated the accuracy, robustness,\nand flexibility of M3FAS under various challenging experimental settings. The\nsource code and dataset are available at: https://github.com/ChenqiKONG/M3FAS/\n","authors":["Chenqi Kong","Kexin Zheng","Yibing Liu","Shiqi Wang","Anderson Rocha","Haoliang Li"],"pdf_url":"https://arxiv.org/pdf/2301.12831v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14141v1","updated":"2024-03-21T05:36:25Z","published":"2024-03-21T05:36:25Z","title":"Empowering Segmentation Ability to Multi-modal Large Language Models","summary":" Multi-modal large language models (MLLMs) can understand image-language\nprompts and demonstrate impressive reasoning ability. In this paper, we extend\nMLLMs' output by empowering MLLMs with the segmentation ability. The extended\nMLLMs can both output language responses to the image-language prompts and\nsegment the regions that the complex question or query in the language prompts\nfocuses on. To this end, the existing work, LISA, enlarges the original word\nembeddings with an additional segment token and fine-tunes dialogue generation\nand query-focused segmentation together, where the feature of the segment token\nis used to prompt the segment-anything model. Although they achieve superior\nsegmentation performance, we observe that the dialogue ability decreases by a\nlarge margin compared to the original MLLMs. To maintain the original MLLMs'\ndialogue ability, we propose a novel MLLMs framework, coined as LLaVASeg, which\nleverages a chain-of-thought prompting strategy to instruct the MLLMs to\nsegment the target region queried by the user. The MLLMs are first prompted to\nreason about the simple description of the target region from the complicated\nuser query, then extract the visual attributes of the target region according\nto the understanding of MLLMs to the image. These visual attributes, such as\ncolor and relative locations, are utilized to prompt the downstream\nsegmentation model. Experiments show that the proposed method keeps the\noriginal dialogue ability and equips the MLLMs' model with strong reasoning\nsegmentation ability. 
The code is available at\nhttps://github.com/YuqiYang213/LLaVASeg.\n","authors":["Yuqi Yang","Peng-Tao Jiang","Jing Wang","Hao Zhang","Kai Zhao","Jinwei Chen","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2403.14141v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.13438v2","updated":"2024-03-21T05:35:24Z","published":"2024-03-18T17:38:29Z","title":"See, Imagine, Plan: Discovering and Hallucinating Tasks from a Single\n Image","summary":" Humans can not only recognize and understand the world in its current state\nbut also envision future scenarios that extend beyond immediate perception. To\nresemble this profound human capacity, we introduce zero-shot task\nhallucination -- given a single RGB image of any scene comprising unknown\nenvironments and objects, our model can identify potential tasks and imagine\ntheir execution in a vivid narrative, realized as a video. We develop a modular\npipeline that progressively enhances scene decomposition, comprehension, and\nreconstruction, incorporating VLM for dynamic interaction and 3D motion\nplanning for object trajectories. Our model can discover diverse tasks, with\nthe generated task videos demonstrating realistic and compelling visual\noutcomes that are understandable by both machines and humans. Project Page:\nhttps://dannymcy.github.io/zeroshot_task_hallucination/\n","authors":["Chenyang Ma","Kai Lu","Ta-Ying Cheng","Niki Trigoni","Andrew Markham"],"pdf_url":"https://arxiv.org/pdf/2403.13438v2.pdf","comment":"Project Page: https://dannymcy.github.io/zeroshot_task_hallucination/"},{"id":"http://arxiv.org/abs/2403.14140v1","updated":"2024-03-21T05:33:49Z","published":"2024-03-21T05:33:49Z","title":"Learning Decomposable and Debiased Representations via Attribute-Centric\n Information Bottlenecks","summary":" Biased attributes, spuriously correlated with target labels in a dataset, can\nproblematically lead to neural networks that learn improper shortcuts for\nclassifications and limit their capabilities for out-of-distribution (OOD)\ngeneralization. Although many debiasing approaches have been proposed to ensure\ncorrect predictions from biased datasets, few studies have considered learning\nlatent embedding consisting of intrinsic and biased attributes that contribute\nto improved performance and explain how the model pays attention to attributes.\nIn this paper, we propose a novel debiasing framework, Debiasing Global\nWorkspace, introducing attention-based information bottlenecks for learning\ncompositional representations of attributes without defining specific bias\ntypes. Based on our observation that learning shape-centric representation\nhelps robust performance on OOD datasets, we adopt those abilities to learn\nrobust and generalizable representations of decomposable latent embeddings\ncorresponding to intrinsic and biasing attributes. We conduct comprehensive\nevaluations on biased datasets, along with both quantitative and qualitative\nanalyses, to showcase our approach's efficacy in attribute-centric\nrepresentation learning and its ability to differentiate between intrinsic and\nbias-related features.\n","authors":["Jinyung Hong","Eun Som Jeon","Changhoon Kim","Keun Hee Park","Utkarsh Nath","Yezhou Yang","Pavan Turaga","Theodore P. 
Pavlic"],"pdf_url":"https://arxiv.org/pdf/2403.14140v1.pdf","comment":"24 pages, 16 figures, 3 tables"},{"id":"http://arxiv.org/abs/2309.06030v4","updated":"2024-03-21T05:32:37Z","published":"2023-09-12T08:04:56Z","title":"Federated Learning for Large-Scale Scene Modeling with Neural Radiance\n Fields","summary":" We envision a system to continuously build and maintain a map based on\nearth-scale neural radiance fields (NeRF) using data collected from vehicles\nand drones in a lifelong learning manner. However, existing large-scale\nmodeling by NeRF has problems in terms of scalability and maintainability when\nmodeling earth-scale environments. Therefore, to address these problems, we\npropose a federated learning pipeline for large-scale modeling with NeRF. We\ntailor the model aggregation pipeline in federated learning for NeRF, thereby\nallowing local updates of NeRF. In the aggregation step, the accuracy of the\nclients' global pose is critical. Thus, we also propose global pose alignment\nto align the noisy global pose of clients before the aggregation step. In\nexperiments, we show the effectiveness of the proposed pose alignment and the\nfederated learning pipeline on the large-scale scene dataset, Mill19.\n","authors":["Teppei Suzuki"],"pdf_url":"https://arxiv.org/pdf/2309.06030v4.pdf","comment":"Our subsequent work is available at arXiv:2403.11460"},{"id":"http://arxiv.org/abs/2311.08046v2","updated":"2024-03-21T05:28:06Z","published":"2023-11-14T10:11:36Z","title":"Chat-UniVi: Unified Visual Representation Empowers Large Language Models\n with Image and Video Understanding","summary":" Large language models have demonstrated impressive universal capabilities\nacross a wide range of open-ended tasks and have extended their utility to\nencompass multimodal conversations. However, existing methods encounter\nchallenges in effectively handling both image and video understanding,\nparticularly with limited visual tokens. In this work, we introduce Chat-UniVi,\na Unified Vision-language model capable of comprehending and engaging in\nconversations involving images and videos through a unified visual\nrepresentation. Specifically, we employ a set of dynamic visual tokens to\nuniformly represent images and videos. This representation framework empowers\nthe model to efficiently utilize a limited number of visual tokens to\nsimultaneously capture the spatial details necessary for images and the\ncomprehensive temporal relationship required for videos. Moreover, we leverage\na multi-scale representation, enabling the model to perceive both high-level\nsemantic concepts and low-level visual details. Notably, Chat-UniVi is trained\non a mixed dataset containing both images and videos, allowing direct\napplication to tasks involving both mediums without requiring any\nmodifications. Extensive experimental results demonstrate that Chat-UniVi\nconsistently outperforms even existing methods exclusively designed for either\nimages or videos. 
Code is available at\nhttps://github.com/PKU-YuanGroup/Chat-UniVi.\n","authors":["Peng Jin","Ryuichi Takanobu","Wancai Zhang","Xiaochun Cao","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2311.08046v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.14138v1","updated":"2024-03-21T05:13:34Z","published":"2024-03-21T05:13:34Z","title":"Evidential Semantic Mapping in Off-road Environments with\n Uncertainty-aware Bayesian Kernel Inference","summary":" Robotic mapping with Bayesian Kernel Inference (BKI) has shown promise in\ncreating semantic maps by effectively leveraging local spatial information.\nHowever, existing semantic mapping methods face challenges in constructing\nreliable maps in unstructured outdoor scenarios due to unreliable semantic\npredictions. To address this issue, we propose an evidential semantic mapping,\nwhich can enhance reliability in perceptually challenging off-road\nenvironments. We integrate Evidential Deep Learning into the semantic\nsegmentation network to obtain the uncertainty estimate of semantic prediction.\nSubsequently, this semantic uncertainty is incorporated into an\nuncertainty-aware BKI, tailored to prioritize more confident semantic\npredictions when accumulating semantic information. By adaptively handling\nsemantic uncertainties, the proposed framework constructs robust\nrepresentations of the surroundings even in previously unseen environments.\nComprehensive experiments across various off-road datasets demonstrate that our\nframework enhances accuracy and robustness, consistently outperforming existing\nmethods in scenes with high perceptual uncertainties.\n","authors":["Junyoung Kim","Junwon Seo","Jihong Min"],"pdf_url":"https://arxiv.org/pdf/2403.14138v1.pdf","comment":"Our project website can be found at\n https://kjyoung.github.io/Homepage/#/Projects/Evidential-Semantic-Mapping"},{"id":"http://arxiv.org/abs/2403.14137v1","updated":"2024-03-21T05:13:12Z","published":"2024-03-21T05:13:12Z","title":"Improving Image Classification Accuracy through Complementary\n Intra-Class and Inter-Class Mixup","summary":" MixUp and its variants, such as Manifold MixUp, have two key limitations in\nimage classification tasks. First, they often neglect mixing within the same\nclass (intra-class mixup), leading to an underutilization of the relationships\namong samples within the same class. Second, although these methods effectively\nenhance inter-class separability by mixing between different classes\n(inter-class mixup), they fall short in improving intra-class cohesion through\ntheir mixing operations, limiting their classification performance. To tackle\nthese issues, we propose a novel mixup method and a comprehensive integrated\nsolution.Our mixup approach specifically targets intra-class mixup, an aspect\ncommonly overlooked, to strengthen intra-class cohesion-a feature not provided\nby current mixup techniques.For each mini-batch, our method utilizes feature\nrepresentations of unaugmented original images from each class within the\nmini-batch to generate a single synthesized feature representation through\nrandom linear interpolation. All synthesized representations for this\nmini-batch are then fed into the classification and loss layers to calculate an\naverage classification loss that can markedly enhance intra-class cohesion.\nMoreover, our integrated solution seamlessly combines our intra-class mixup\nmethod with an existing mixup approach such as MixUp or Manifold MixUp. 
This\ncomprehensive solution incorporates inter- and intra-class mixup in a balanced\nmanner while concurrently improving intra-class cohesion and inter-class\nseparability. Experimental results on six public datasets demonstrate that our\nintegrated solution achieves a 0.1% to 3.43% higher accuracy than the best of\neither MixUp or our intra-class mixup method, averaging a 1.16% gain. It also\noutperforms the better performer of either Manifold MixUp or our intra-class\nmixup method by 0.12% to 5.16%, with an average gain of 1.11%.\n","authors":["Ye Xu","Ya Gao","Xiaorong Qiu","Yang Chen","Ying Ji"],"pdf_url":"https://arxiv.org/pdf/2403.14137v1.pdf","comment":"25 pages,12 figures"},{"id":"http://arxiv.org/abs/2303.03757v3","updated":"2024-03-21T05:11:08Z","published":"2023-03-07T09:33:49Z","title":"Deep Learning for Inertial Positioning: A Survey","summary":" Inertial sensors are widely utilized in smartphones, drones, robots, and IoT\ndevices, playing a crucial role in enabling ubiquitous and reliable\nlocalization. Inertial sensor-based positioning is essential in various\napplications, including personal navigation, location-based security, and\nhuman-device interaction. However, low-cost MEMS inertial sensors' measurements\nare inevitably corrupted by various error sources, leading to unbounded drifts\nwhen integrated doubly in traditional inertial navigation algorithms,\nsubjecting inertial positioning to the problem of error drifts. In recent\nyears, with the rapid increase in sensor data and computational power, deep\nlearning techniques have been developed, sparking significant research into\naddressing the problem of inertial positioning. Relevant literature in this\nfield spans across mobile computing, robotics, and machine learning. In this\narticle, we provide a comprehensive review of deep learning-based inertial\npositioning and its applications in tracking pedestrians, drones, vehicles, and\nrobots. We connect efforts from different fields and discuss how deep learning\ncan be applied to address issues such as sensor calibration, positioning error\ndrift reduction, and multi-sensor fusion. This article aims to attract readers\nfrom various backgrounds, including researchers and practitioners interested in\nthe potential of deep learning-based techniques to solve inertial positioning\nproblems. Our review demonstrates the exciting possibilities that deep learning\nbrings to the table and provides a roadmap for future research in this field.\n","authors":["Changhao Chen","Xianfei Pan"],"pdf_url":"https://arxiv.org/pdf/2303.03757v3.pdf","comment":"Accepted by IEEE Transactions on Intelligent Transportation Systems"},{"id":"http://arxiv.org/abs/2403.14135v1","updated":"2024-03-21T05:10:26Z","published":"2024-03-21T05:10:26Z","title":"Powerful Lossy Compression for Noisy Images","summary":" Image compression and denoising represent fundamental challenges in image\nprocessing with many real-world applications. To address practical demands,\ncurrent solutions can be categorized into two main strategies: 1) sequential\nmethod; and 2) joint method. However, sequential methods have the disadvantage\nof error accumulation as there is information loss between multiple individual\nmodels. Recently, the academic community began to make some attempts to tackle\nthis problem through end-to-end joint methods. Most of them ignore that\ndifferent regions of noisy images have different characteristics. 
To solve\nthese problems, in this paper, our proposed signal-to-noise ratio~(SNR) aware\njoint solution exploits local and non-local features for image compression and\ndenoising simultaneously. We design an end-to-end trainable network, which\nincludes the main encoder branch, the guidance branch, and the signal-to-noise\nratio~(SNR) aware branch. We conducted extensive experiments on both synthetic\nand real-world datasets, demonstrating that our joint solution outperforms\nexisting state-of-the-art methods.\n","authors":["Shilv Cai","Xiaoguo Liang","Shuning Cao","Luxin Yan","Sheng Zhong","Liqun Chen","Xu Zou"],"pdf_url":"https://arxiv.org/pdf/2403.14135v1.pdf","comment":"Accepted by ICME 2024"},{"id":"http://arxiv.org/abs/2403.14133v1","updated":"2024-03-21T05:04:52Z","published":"2024-03-21T05:04:52Z","title":"3D Object Detection from Point Cloud via Voting Step Diffusion","summary":" 3D object detection is a fundamental task in scene understanding. Numerous\nresearch efforts have been dedicated to better incorporating Hough voting into\nthe 3D object detection pipeline. However, due to the noisy, cluttered, and\npartial nature of real 3D scans, existing voting-based methods often receive\nvotes from the partial surfaces of individual objects together with severe\nnoises, leading to sub-optimal detection performance. In this work, we focus on\nthe distributional properties of point clouds and formulate the voting process\nas generating new points in the high-density region of the distribution of\nobject centers. To achieve this, we propose a new method to move random 3D\npoints toward the high-density region of the distribution by estimating the\nscore function of the distribution with a noise conditioned score network.\nSpecifically, we first generate a set of object center proposals to coarsely\nidentify the high-density region of the object center distribution. To estimate\nthe score function, we perturb the generated object center proposals by adding\nnormalized Gaussian noise, and then jointly estimate the score function of all\nperturbed distributions. Finally, we generate new votes by moving random 3D\npoints to the high-density region of the object center distribution according\nto the estimated score function. Extensive experiments on two large scale\nindoor 3D scene datasets, SUN RGB-D and ScanNet V2, demonstrate the superiority\nof our proposed method. The code will be released at\nhttps://github.com/HHrEtvP/DiffVote.\n","authors":["Haoran Hou","Mingtao Feng","Zijie Wu","Weisheng Dong","Qing Zhu","Yaonan Wang","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2403.14133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16387v3","updated":"2024-03-21T04:52:57Z","published":"2023-10-25T05:59:25Z","title":"Frequency-Aware Transformer for Learned Image Compression","summary":" Learned image compression (LIC) has gained traction as an effective solution\nfor image storage and transmission in recent years. However, existing LIC\nmethods are redundant in latent representation due to limitations in capturing\nanisotropic frequency components and preserving directional details. To\novercome these challenges, we propose a novel frequency-aware transformer (FAT)\nblock that for the first time achieves multiscale directional analysis for\nLIC. The FAT block comprises frequency-decomposition window attention (FDWA)\nmodules to capture multiscale and directional frequency components of natural\nimages. 
Additionally, we introduce frequency-modulation feed-forward network\n(FMFFN) to adaptively modulate different frequency components, improving\nrate-distortion performance. Furthermore, we present a transformer-based\nchannel-wise autoregressive (T-CA) model that effectively exploits channel\ndependencies. Experiments show that our method achieves state-of-the-art\nrate-distortion performance compared to existing LIC methods, and evidently\noutperforms latest standardized codec VTM-12.1 by 14.5%, 15.1%, 13.0% in\nBD-rate on the Kodak, Tecnick, and CLIC datasets.\n","authors":["Han Li","Shaohui Li","Wenrui Dai","Chenglin Li","Junni Zou","Hongkai Xiong"],"pdf_url":"https://arxiv.org/pdf/2310.16387v3.pdf","comment":"ICLR2024 poster"},{"id":"http://arxiv.org/abs/2310.16226v3","updated":"2024-03-21T04:47:27Z","published":"2023-10-24T22:41:14Z","title":"TiC-CLIP: Continual Training of CLIP Models","summary":" Keeping large foundation models up to date on latest data is inherently\nexpensive. To avoid the prohibitive costs of constantly retraining, it is\nimperative to continually train these models. This problem is exacerbated by\nthe lack of any large scale continual learning benchmarks or baselines. We\nintroduce the first set of web-scale Time-Continual (TiC) benchmarks for\ntraining vision-language models: TiC-DataComp, TiC-YFCC, and TiC-Redcaps.\nTiC-DataComp, our largest dataset, contains over 12.7B timestamped image-text\npairs spanning 9 years (2014-2022). We first use our benchmarks to curate\nvarious dynamic evaluations to measure temporal robustness of existing models.\nWe show OpenAI's CLIP (trained on data up to 2020) loses $\\approx 8\\%$\nzero-shot accuracy on our curated retrieval task from 2021-2022 compared with\nmore recently trained models in OpenCLIP repository. We then study how to\nefficiently train models on time-continuous data. We demonstrate that a simple\nrehearsal-based approach that continues training from the last checkpoint and\nreplays old data reduces compute by $2.5\\times$ when compared to the standard\npractice of retraining from scratch. Code is available at\nhttps://github.com/apple/ml-tic-clip.\n","authors":["Saurabh Garg","Mehrdad Farajtabar","Hadi Pouransari","Raviteja Vemulapalli","Sachin Mehta","Oncel Tuzel","Vaishaal Shankar","Fartash Faghri"],"pdf_url":"https://arxiv.org/pdf/2310.16226v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.14124v1","updated":"2024-03-21T04:34:24Z","published":"2024-03-21T04:34:24Z","title":"Soft Masked Transformer for Point Cloud Processing with Skip\n Attention-Based Upsampling","summary":" Point cloud processing methods leverage local and global point features %at\nthe feature level to cater to downstream tasks, yet they often overlook the\ntask-level context inherent in point clouds during the encoding stage. We argue\nthat integrating task-level information into the encoding stage significantly\nenhances performance. To that end, we propose SMTransformer which incorporates\ntask-level information into a vector-based transformer by utilizing a soft mask\ngenerated from task-level queries and keys to learn the attention weights.\nAdditionally, to facilitate effective communication between features from the\nencoding and decoding layers in high-level tasks such as segmentation, we\nintroduce a skip-attention-based up-sampling block. This block dynamically\nfuses features from various resolution points across the encoding and decoding\nlayers. 
To mitigate the increase in network parameters and training time\nresulting from the complexity of the aforementioned blocks, we propose a novel\nshared position encoding strategy. This strategy allows various transformer\nblocks to share the same position information over the same resolution points,\nthereby reducing network parameters and training time without compromising\naccuracy. Experimental comparisons with existing methods on multiple datasets\ndemonstrate the efficacy of SMTransformer and skip-attention-based up-sampling\nfor point cloud processing tasks, including semantic segmentation and\nclassification. In particular, we achieve state-of-the-art semantic\nsegmentation results of 73.4% mIoU on S3DIS Area 5 and 62.4% mIoU on the SWAN\ndataset.\n","authors":["Yong He","Hongshan Yu","Muhammad Ibrahim","Xiaoyan Liu","Tongjia Chen","Anwaar Ulhaq","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2403.14124v1.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2403.14121v1","updated":"2024-03-21T04:24:49Z","published":"2024-03-21T04:24:49Z","title":"External Knowledge Enhanced 3D Scene Generation from Sketch","summary":" Generating realistic 3D scenes is challenging due to the complexity of room\nlayouts and object geometries. We propose a sketch based knowledge enhanced\ndiffusion architecture (SEK) for generating customized, diverse, and plausible\n3D scenes. SEK conditions the denoising process with a hand-drawn sketch of the\ntarget scene and cues from an object relationship knowledge base. We first\nconstruct an external knowledge base containing object relationships and then\nleverage knowledge enhanced graph reasoning to assist our model in\nunderstanding hand-drawn sketches. A scene is represented as a combination of\n3D objects and their relationships, and then incrementally diffused to reach a\nGaussian distribution. We propose a 3D denoising scene transformer that learns\nto reverse the diffusion process, conditioned by a hand-drawn sketch along with\nknowledge cues, to regressively generate the scene including the 3D object\ninstances as well as their layout. Experiments on the 3D-FRONT dataset show\nthat our model improves FID, CKL by 17.41%, 37.18% in 3D scene generation and\nFID, KID by 19.12%, 20.06% in 3D scene completion compared to the nearest\ncompetitor DiffuScene.\n","authors":["Zijie Wu","Mingtao Feng","Yaonan Wang","He Xie","Weisheng Dong","Bo Miao","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2403.14121v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10159v4","updated":"2024-03-21T04:17:26Z","published":"2023-06-16T20:02:51Z","title":"Vision-Language Models can Identify Distracted Driver Behavior from\n Naturalistic Videos","summary":" Recognizing the activities causing distraction in real-world driving\nscenarios is critical for ensuring the safety and reliability of both drivers\nand pedestrians on the roadways. Conventional computer vision techniques are\ntypically data-intensive and require a large volume of annotated training data\nto detect and classify various distracted driving behaviors, thereby limiting\ntheir efficiency and scalability. We aim to develop a generalized framework\nthat showcases robust performance with access to limited or no annotated\ntraining data. Recently, vision-language models have offered large-scale\nvisual-textual pretraining that can be adapted to task-specific learning like\ndistracted driving activity recognition. 
Vision-language pretraining models,\nsuch as CLIP, have shown significant promise in learning natural\nlanguage-guided visual representations. This paper proposes a CLIP-based driver\nactivity recognition approach that identifies driver distraction from\nnaturalistic driving images and videos. CLIP's vision embedding offers\nzero-shot transfer and task-based finetuning, which can classify distracted\nactivities from driving video data. Our results show that this framework offers\nstate-of-the-art performance on zero-shot transfer and video-based CLIP for\npredicting the driver's state on two public datasets. We propose both\nframe-based and video-based frameworks developed on top of the CLIP's visual\nrepresentation for distracted driving detection and classification tasks and\nreport the results.\n","authors":["Md Zahid Hasan","Jiajing Chen","Jiyang Wang","Mohammed Shaiqur Rahman","Ameya Joshi","Senem Velipasalar","Chinmay Hegde","Anuj Sharma","Soumik Sarkar"],"pdf_url":"https://arxiv.org/pdf/2306.10159v4.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.14119v1","updated":"2024-03-21T04:08:29Z","published":"2024-03-21T04:08:29Z","title":"C-TPT: Calibrated Test-Time Prompt Tuning for Vision-Language Models via\n Text Feature Dispersion","summary":" In deep learning, test-time adaptation has gained attention as a method for\nmodel fine-tuning without the need for labeled data. A prime exemplification is\nthe recently proposed test-time prompt tuning for large-scale vision-language\nmodels such as CLIP. Unfortunately, these prompts have been mainly developed to\nimprove accuracy, overlooking the importance of calibration-a crucial aspect\nfor quantifying prediction uncertainty. However, traditional calibration\nmethods rely on substantial amounts of labeled data, making them impractical\nfor test-time scenarios. To this end, this paper explores calibration during\ntest-time prompt tuning by leveraging the inherent properties of CLIP. Through\na series of observations, we find that the prompt choice significantly affects\nthe calibration in CLIP, where the prompts leading to higher text feature\ndispersion result in better-calibrated predictions. Introducing the Average\nText Feature Dispersion (ATFD), we establish its relationship with calibration\nerror and present a novel method, Calibrated Test-time Prompt Tuning (C-TPT),\nfor optimizing prompts during test-time with enhanced calibration. Through\nextensive experiments on different CLIP architectures and datasets, we show\nthat C-TPT can effectively improve the calibration of test-time prompt tuning\nwithout needing labeled data.\n","authors":["Hee Suk Yoon","Eunseop Yoon","Joshua Tian Jin Tee","Mark Hasegawa-Johnson","Yingzhen Li","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2403.14119v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2312.08977v2","updated":"2024-03-21T04:04:25Z","published":"2023-12-14T14:26:57Z","title":"Weighted Ensemble Models Are Strong Continual Learners","summary":" In this work, we study the problem of continual learning (CL) where the goal\nis to learn a model on a sequence of tasks, such that the data from the\nprevious tasks becomes unavailable while learning on the current task data. CL\nis essentially a balancing act between being able to learn on the new task\n(i.e., plasticity) and maintaining the performance on the previously learned\nconcepts (i.e., stability). 
Intending to address the stability-plasticity\ntrade-off, we propose to perform weight-ensembling of the model parameters of\nthe previous and current tasks. This weighted-ensembled model, which we call\nContinual Model Averaging (or CoMA), attains high accuracy on the current task\nby leveraging plasticity, while not deviating too far from the previous weight\nconfiguration, ensuring stability. We also propose an improved variant of CoMA,\nnamed Continual Fisher-weighted Model Averaging (or CoFiMA), that selectively\nweighs each parameter in the weights ensemble by leveraging the Fisher\ninformation of the weights of the model. Both variants are conceptually simple,\neasy to implement, and effective in attaining state-of-the-art performance on\nseveral standard CL benchmarks. Code is available at:\nhttps://github.com/IemProg/CoFiMA.\n","authors":["Imad Eddine Marouf","Subhankar Roy","Enzo Tartaglione","Stéphane Lathuilière"],"pdf_url":"https://arxiv.org/pdf/2312.08977v2.pdf","comment":"Code: https://github.com/IemProg/CoFiMA"},{"id":"http://arxiv.org/abs/2403.14115v1","updated":"2024-03-21T04:01:26Z","published":"2024-03-21T04:01:26Z","title":"Training point-based deep learning networks for forest segmentation with\n synthetic data","summary":" Remote sensing through unmanned aerial systems (UAS) has been increasing in\nforestry in recent years, along with using machine learning for data\nprocessing. Deep learning architectures, extensively applied in natural\nlanguage and image processing, have recently been extended to the point cloud\ndomain. However, the availability of point cloud datasets for training and\ntesting remains limited. Creating forested environment point cloud datasets is\nexpensive, requires high-precision sensors, and is time-consuming as manual\npoint classification is required. Moreover, forest areas could be inaccessible\nor dangerous for humans, further complicating data collection. Then, a question\narises whether it is possible to use synthetic data to train deep learning\nnetworks without the need to rely on large volumes of real forest data. To\nanswer this question, we developed a realistic simulator that procedurally\ngenerates synthetic forest scenes. Thanks to this, we have conducted a\ncomparative study of different state-of-the-art point-based deep learning\nnetworks for forest segmentation. Using created datasets, we determined the\nfeasibility of using synthetic data to train deep learning networks to classify\npoint clouds from real forest datasets. Both the simulator and the datasets are\nreleased as part of this work.\n","authors":["Francisco Raverta Capua","Juan Schandin","Pablo De Cristóforis"],"pdf_url":"https://arxiv.org/pdf/2403.14115v1.pdf","comment":"15 pages, 4 figures. Submitted to the International Conference on\n Pattern Recognition (ICPR) 2024"},{"id":"http://arxiv.org/abs/2403.13331v2","updated":"2024-03-21T04:01:10Z","published":"2024-03-20T06:22:37Z","title":"AMP: Autoregressive Motion Prediction Revisited with Next Token\n Prediction for Autonomous Driving","summary":" As an essential task in autonomous driving (AD), motion prediction aims to\npredict the future states of surround objects for navigation. One natural\nsolution is to estimate the position of other agents in a step-by-step manner\nwhere each predicted time-step is conditioned on both observed time-steps and\npreviously predicted time-steps, i.e., autoregressive prediction. 
Pioneering\nworks like SocialLSTM and MFP design their decoders based on this intuition.\nHowever, almost all state-of-the-art works assume that all predicted time-steps\nare independent conditioned on observed time-steps, where they use a single\nlinear layer to generate positions of all time-steps simultaneously. They\ndominate most motion prediction leaderboards due to the simplicity of training\nMLPs compared to autoregressive networks.\n In this paper, we introduce the GPT style next token prediction into motion\nforecasting. In this way, the input and output could be represented in a\nunified space and thus the autoregressive prediction becomes more feasible.\nHowever, different from language data which is composed of homogeneous units\n-words, the elements in the driving scene could have complex spatial-temporal\nand semantic relations. To this end, we propose to adopt three factorized\nattention modules with different neighbors for information aggregation and\ndifferent position encoding styles to capture their relations, e.g., encoding\nthe transformation between coordinate systems for spatial relativity while\nadopting RoPE for temporal relativity. Empirically, by equipping with the\naforementioned tailored designs, the proposed method achieves state-of-the-art\nperformance in the Waymo Open Motion and Waymo Interaction datasets. Notably,\nAMP outperforms other recent autoregressive motion prediction methods: MotionLM\nand StateTransformer, which demonstrates the effectiveness of the proposed\ndesigns.\n","authors":["Xiaosong Jia","Shaoshuai Shi","Zijun Chen","Li Jiang","Wenlong Liao","Tao He","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2403.13331v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14114v1","updated":"2024-03-21T03:58:27Z","published":"2024-03-21T03:58:27Z","title":"Test-time Similarity Modification for Person Re-identification toward\n Temporal Distribution Shift","summary":" Person re-identification (re-id), which aims to retrieve images of the same\nperson in a given image from a database, is one of the most practical image\nrecognition applications. In the real world, however, the environments that the\nimages are taken from change over time. This causes a distribution shift\nbetween training and testing and degrades the performance of re-id. To maintain\nre-id performance, models should continue adapting to the test environment's\ntemporal changes. Test-time adaptation (TTA), which aims to adapt models to the\ntest environment with only unlabeled test data, is a promising way to handle\nthis problem because TTA can adapt models instantly in the test environment.\nHowever, the previous TTA methods are designed for classification and cannot be\ndirectly applied to re-id. This is because the set of people's identities in\nthe dataset differs between training and testing in re-id, whereas the set of\nclasses is fixed in the current TTA methods designed for classification. To\nimprove re-id performance in changing test environments, we propose TEst-time\nsimilarity Modification for Person re-identification (TEMP), a novel TTA method\nfor re-id. TEMP is the first fully TTA method for re-id, which does not require\nany modification to pre-training. Inspired by TTA methods that refine the\nprediction uncertainty in classification, we aim to refine the uncertainty in\nre-id. However, the uncertainty cannot be computed in the same way as\nclassification in re-id since it is an open-set task, which does not share\nperson labels between training and testing. 
Hence, we propose re-id entropy, an\nalternative uncertainty measure for re-id computed based on the similarity\nbetween the feature vectors. Experiments show that the re-id entropy can\nmeasure the uncertainty on re-id and TEMP improves the performance of re-id in\nonline settings where the distribution changes over time.\n","authors":["Kazuki Adachi","Shohei Enomoto","Taku Sasaki","Shin'ya Yamaguchi"],"pdf_url":"https://arxiv.org/pdf/2403.14114v1.pdf","comment":"Accepted to IJCNN2024"},{"id":"http://arxiv.org/abs/2403.14113v1","updated":"2024-03-21T03:56:24Z","published":"2024-03-21T03:56:24Z","title":"Spatio-Temporal Proximity-Aware Dual-Path Model for Panoramic Activity\n Recognition","summary":" Panoramic Activity Recognition (PAR) seeks to identify diverse human\nactivities across different scales, from individual actions to social group and\nglobal activities in crowded panoramic scenes. PAR presents two major\nchallenges: 1) recognizing the nuanced interactions among numerous individuals\nand 2) understanding multi-granular human activities. To address these, we\npropose Social Proximity-aware Dual-Path Network (SPDP-Net) based on two key\ndesign principles. First, while previous works often focus on spatial distance\namong individuals within an image, we argue to consider the spatio-temporal\nproximity. It is crucial for individual relation encoding to correctly\nunderstand social dynamics. Secondly, deviating from existing hierarchical\napproaches (individual-to-social-to-global activity), we introduce a dual-path\narchitecture for multi-granular activity recognition. This architecture\ncomprises individual-to-global and individual-to-social paths, mutually\nreinforcing each other's task with global-local context through multiple\nlayers. Through extensive experiments, we validate the effectiveness of the\nspatio-temporal proximity among individuals and the dual-path architecture in\nPAR. Furthermore, SPDP-Net achieves new state-of-the-art performance with\n46.5\\% of overall F1 score on JRDB-PAR dataset.\n","authors":["Sumin Lee","Yooseung Wang","Sangmin Woo","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2403.14113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14104v1","updated":"2024-03-21T03:34:18Z","published":"2024-03-21T03:34:18Z","title":"Existence Is Chaos: Enhancing 3D Human Motion Prediction with\n Uncertainty Consideration","summary":" Human motion prediction is consisting in forecasting future body poses from\nhistorically observed sequences. It is a longstanding challenge due to motion's\ncomplex dynamics and uncertainty. Existing methods focus on building up\ncomplicated neural networks to model the motion dynamics. The predicted results\nare required to be strictly similar to the training samples with L2 loss in\ncurrent training pipeline. However, little attention has been paid to the\nuncertainty property which is crucial to the prediction task. We argue that the\nrecorded motion in training data could be an observation of possible future,\nrather than a predetermined result. In addition, existing works calculate the\npredicted error on each future frame equally during training, while recent work\nindicated that different frames could play different roles. In this work, a\nnovel computationally efficient encoder-decoder model with uncertainty\nconsideration is proposed, which could learn proper characteristics for future\nframes by a dynamic function. 
Experimental results on benchmark datasets\ndemonstrate that our uncertainty consideration approach has obvious advantages\nboth in quantity and quality. Moreover, the proposed method could produce\nmotion sequences with much better quality that avoids the intractable shaking\nartefacts. We believe our work could provide a novel perspective to consider\nthe uncertainty quality for the general motion prediction task and encourage\nthe studies in this field. The code will be available in\nhttps://github.com/Motionpre/Adaptive-Salient-Loss-SAGGB.\n","authors":["Zhihao Wang","Yulin Zhou","Ningyu Zhang","Xiaosong Yang","Jun Xiao","Zhao Wang"],"pdf_url":"https://arxiv.org/pdf/2403.14104v1.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2403.14103v1","updated":"2024-03-21T03:28:24Z","published":"2024-03-21T03:28:24Z","title":"MaskSAM: Towards Auto-prompt SAM with Mask Classification for Medical\n Image Segmentation","summary":" Segment Anything Model~(SAM), a prompt-driven foundation model for natural\nimage segmentation, has demonstrated impressive zero-shot performance. However,\nSAM does not work when directly applied to medical image segmentation tasks,\nsince SAM lacks the functionality to predict semantic labels for predicted\nmasks and needs to provide extra prompts, such as points or boxes, to segment\ntarget regions. Meanwhile, there is a huge gap between 2D natural images and 3D\nmedical images, so the performance of SAM is imperfect for medical image\nsegmentation tasks. Following the above issues, we propose MaskSAM, a novel\nmask classification prompt-free SAM adaptation framework for medical image\nsegmentation. We design a prompt generator combined with the image encoder in\nSAM to generate a set of auxiliary classifier tokens, auxiliary binary masks,\nand auxiliary bounding boxes. Each pair of auxiliary mask and box prompts,\nwhich can solve the requirements of extra prompts, is associated with class\nlabel predictions by the sum of the auxiliary classifier token and the\nlearnable global classifier tokens in the mask decoder of SAM to solve the\npredictions of semantic labels. Meanwhile, we design a 3D depth-convolution\nadapter for image embeddings and a 3D depth-MLP adapter for prompt embeddings.\nWe inject one of them into each transformer block in the image encoder and mask\ndecoder to enable pre-trained 2D SAM models to extract 3D information and adapt\nto 3D medical images. Our method achieves state-of-the-art performance on\nAMOS2022, 90.52% Dice, which improved by 2.7% compared to nnUNet. Our method\nsurpasses nnUNet by 1.7% on ACDC and 1.0% on Synapse datasets.\n","authors":["Bin Xie","Hao Tang","Bin Duan","Dawen Cai","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2403.14103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14101v1","updated":"2024-03-21T03:24:01Z","published":"2024-03-21T03:24:01Z","title":"Text-Enhanced Data-free Approach for Federated Class-Incremental\n Learning","summary":" Federated Class-Incremental Learning (FCIL) is an underexplored yet pivotal\nissue, involving the dynamic addition of new classes in the context of\nfederated learning. In this field, Data-Free Knowledge Transfer (DFKT) plays a\ncrucial role in addressing catastrophic forgetting and data privacy problems.\nHowever, prior approaches lack the crucial synergy between DFKT and the model\ntraining phases, causing DFKT to encounter difficulties in generating\nhigh-quality data from a non-anchored latent space of the old task model. 
In\nthis paper, we introduce LANDER (Label Text Centered Data-Free Knowledge\nTransfer) to address this issue by utilizing label text embeddings (LTE)\nproduced by pretrained language models. Specifically, during the model training\nphase, our approach treats LTE as anchor points and constrains the feature\nembeddings of corresponding training samples around them, enriching the\nsurrounding area with more meaningful information. In the DFKT phase, by using\nthese LTE anchors, LANDER can synthesize more meaningful samples, thereby\neffectively addressing the forgetting problem. Additionally, instead of tightly\nconstraining embeddings toward the anchor, the Bounding Loss is introduced to\nencourage sample embeddings to remain flexible within a defined radius. This\napproach preserves the natural differences in sample embeddings and mitigates\nthe embedding overlap caused by heterogeneous federated settings. Extensive\nexperiments conducted on CIFAR100, Tiny-ImageNet, and ImageNet demonstrate that\nLANDER significantly outperforms previous methods and achieves state-of-the-art\nperformance in FCIL. The code is available at\nhttps://github.com/tmtuan1307/lander.\n","authors":["Minh-Tuan Tran","Trung Le","Xuan-May Le","Mehrtash Harandi","Dinh Phung"],"pdf_url":"https://arxiv.org/pdf/2403.14101v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2104.02857v2","updated":"2024-03-21T03:21:34Z","published":"2021-04-07T02:04:12Z","title":"Soft-Label Anonymous Gastric X-ray Image Distillation","summary":" This paper presents a soft-label anonymous gastric X-ray image distillation\nmethod based on a gradient descent approach. The sharing of medical data is\ndemanded to construct high-accuracy computer-aided diagnosis (CAD) systems.\nHowever, the large size of the medical dataset and privacy protection are\nremaining problems in medical data sharing, which hindered the research of CAD\nsystems. The idea of our distillation method is to extract the valid\ninformation of the medical dataset and generate a tiny distilled dataset that\nhas a different data distribution. Different from model distillation, our\nmethod aims to find the optimal distilled images, distilled labels and the\noptimized learning rate. Experimental results show that the proposed method can\nnot only effectively compress the medical dataset but also anonymize medical\nimages to protect the patient's private information. The proposed approach can\nimprove the efficiency and security of medical data sharing.\n","authors":["Guang Li","Ren Togo","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2104.02857v2.pdf","comment":"The first paper to explore real-world dataset distillation; Work was\n done in 2019 and published as a conference paper at ICIP 2020"},{"id":"http://arxiv.org/abs/2309.06255v3","updated":"2024-03-21T03:21:24Z","published":"2023-09-12T14:16:34Z","title":"Enhancing Multimodal Cooperation via Fine-grained Modality Valuation","summary":" One primary topic of multimodal learning is to jointly incorporate\nheterogeneous information from different modalities. However, most models often\nsuffer from unsatisfactory multimodal cooperation, which cannot jointly utilize\nall modalities well. 
Some methods are proposed to identify and enhance the\nworse learnt modality, but they are often hard to provide the fine-grained\nobservation of multimodal cooperation at sample-level with theoretical support.\nHence, it is essential to reasonably observe and improve the fine-grained\ncooperation between modalities, especially when facing realistic scenarios\nwhere the modality discrepancy could vary across different samples. To this\nend, we introduce a sample-level modality valuation metric to evaluate the\ncontribution of each modality for each sample. Via modality valuation, we\nobserve that modality discrepancy indeed could be different at sample-level,\nbeyond the global contribution discrepancy at dataset-level. We further analyze\nthis issue and improve cooperation between modalities at sample-level by\nenhancing the discriminative ability of low-contributing modalities in a\ntargeted manner. Overall, our methods reasonably observe the fine-grained\nuni-modal contribution and achieve considerable improvement. The source code\nand dataset are available at\n\\url{https://github.com/GeWu-Lab/Valuate-and-Enhance-Multimodal-Cooperation}.\n","authors":["Yake Wei","Ruoxuan Feng","Zihe Wang","Di Hu"],"pdf_url":"https://arxiv.org/pdf/2309.06255v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2402.19286v2","updated":"2024-03-21T03:14:59Z","published":"2024-02-29T15:51:14Z","title":"PrPSeg: Universal Proposition Learning for Panoramic Renal Pathology\n Segmentation","summary":" Understanding the anatomy of renal pathology is crucial for advancing disease\ndiagnostics, treatment evaluation, and clinical research. The complex kidney\nsystem comprises various components across multiple levels, including regions\n(cortex, medulla), functional units (glomeruli, tubules), and cells (podocytes,\nmesangial cells in glomerulus). Prior studies have predominantly overlooked the\nintricate spatial interrelations among objects from clinical knowledge. 
In this\nresearch, we introduce a novel universal proposition learning approach, called\npanoramic renal pathology segmentation (PrPSeg), designed to segment\ncomprehensively panoramic structures within kidney by integrating extensive\nknowledge of kidney anatomy.\n In this paper, we propose (1) the design of a comprehensive universal\nproposition matrix for renal pathology, facilitating the incorporation of\nclassification and spatial relationships into the segmentation process; (2) a\ntoken-based dynamic head single network architecture, with the improvement of\nthe partial label image segmentation and capability for future data\nenlargement; and (3) an anatomy loss function, quantifying the inter-object\nrelationships across the kidney.\n","authors":["Ruining Deng","Quan Liu","Can Cui","Tianyuan Yao","Jialin Yue","Juming Xiong","Lining Yu","Yifei Wu","Mengmeng Yin","Yu Wang","Shilin Zhao","Yucheng Tang","Haichun Yang","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2402.19286v2.pdf","comment":"IEEE / CVF Computer Vision and Pattern Recognition Conference 2024"},{"id":"http://arxiv.org/abs/2403.14093v1","updated":"2024-03-21T03:01:25Z","published":"2024-03-21T03:01:25Z","title":"Science based AI model certification for untrained operational\n environments with application in traffic state estimation","summary":" The expanding role of Artificial Intelligence (AI) in diverse engineering\ndomains highlights the challenges associated with deploying AI models in new\noperational environments, involving substantial investments in data collection\nand model training. Rapid application of AI necessitates evaluating the\nfeasibility of utilizing pre-trained models in unobserved operational settings\nwith minimal or no additional data. However, interpreting the opaque nature of\nAI's black-box models remains a persistent challenge. Addressing this issue,\nthis paper proposes a science-based certification methodology to assess the\nviability of employing pre-trained data-driven models in untrained operational\nenvironments. The methodology advocates a profound integration of domain\nknowledge, leveraging theoretical and analytical models from physics and\nrelated disciplines, with data-driven AI models. This novel approach introduces\ntools to facilitate the development of secure engineering systems, providing\ndecision-makers with confidence in the trustworthiness and safety of AI-based\nmodels across diverse environments characterized by limited training data and\ndynamic, uncertain conditions. The paper demonstrates the efficacy of this\nmethodology in real-world safety-critical scenarios, particularly in the\ncontext of traffic state estimation. Through simulation results, the study\nillustrates how the proposed methodology efficiently quantifies physical\ninconsistencies exhibited by pre-trained AI models. By utilizing analytical\nmodels, the methodology offers a means to gauge the applicability of\npre-trained AI models in new operational environments. 
This research\ncontributes to advancing the understanding and deployment of AI models,\noffering a robust certification framework that enhances confidence in their\nreliability and safety across a spectrum of operational conditions.\n","authors":["Daryl Mupupuni","Anupama Guntu","Liang Hong","Kamrul Hasan","Leehyun Keel"],"pdf_url":"https://arxiv.org/pdf/2403.14093v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.11708v2","updated":"2024-03-21T02:48:22Z","published":"2024-03-18T12:12:45Z","title":"Implicit Discriminative Knowledge Learning for Visible-Infrared Person\n Re-Identification","summary":" Visible-Infrared Person Re-identification (VI-ReID) is a challenging\ncross-modal pedestrian retrieval task, due to significant intra-class\nvariations and cross-modal discrepancies among different cameras. Existing\nworks mainly focus on embedding images of different modalities into a unified\nspace to mine modality-shared features. They only seek distinctive information\nwithin these shared features, while ignoring the identity-aware useful\ninformation that is implicit in the modality-specific features. To address this\nissue, we propose a novel Implicit Discriminative Knowledge Learning (IDKL)\nnetwork to uncover and leverage the implicit discriminative information\ncontained within the modality-specific. First, we extract modality-specific and\nmodality-shared features using a novel dual-stream network. Then, the\nmodality-specific features undergo purification to reduce their modality style\ndiscrepancies while preserving identity-aware discriminative knowledge.\nSubsequently, this kind of implicit knowledge is distilled into the\nmodality-shared feature to enhance its distinctiveness. Finally, an alignment\nloss is proposed to minimize modality discrepancy on enhanced modality-shared\nfeatures. Extensive experiments on multiple public datasets demonstrate the\nsuperiority of IDKL network over the state-of-the-art methods. Code is\navailable at https://github.com/1KK077/IDKL.\n","authors":["Kaijie Ren","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11708v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.14089v1","updated":"2024-03-21T02:45:16Z","published":"2024-03-21T02:45:16Z","title":"Unsupervised Intrinsic Image Decomposition with LiDAR Intensity Enhanced\n Training","summary":" Unsupervised intrinsic image decomposition (IID) is the process of separating\na natural image into albedo and shade without these ground truths. A recent\nmodel employing light detection and ranging (LiDAR) intensity demonstrated\nimpressive performance, though the necessity of LiDAR intensity during\ninference restricts its practicality. Thus, IID models employing only a single\nimage during inference while keeping as high IID quality as the one with an\nimage plus LiDAR intensity are highly desired. To address this challenge, we\npropose a novel approach that utilizes only an image during inference while\nutilizing an image and LiDAR intensity during training. Specifically, we\nintroduce a partially-shared model that accepts an image and LiDAR intensity\nindividually using a different specific encoder but processes them together in\nspecific components to learn shared representations. In addition, to enhance\nIID quality, we propose albedo-alignment loss and image-LiDAR conversion (ILC)\npaths. 
Albedo-alignment loss aligns the gray-scale albedo from an image to that\ninferred from LiDAR intensity, thereby reducing cast shadows in albedo from an\nimage due to the absence of cast shadows in LiDAR intensity. Furthermore, to\ntranslate the input image into albedo and shade style while keeping the image\ncontents, the input image is separated into style code and content code by\nencoders. The ILC path mutually translates the image and LiDAR intensity, which\nshare content but differ in style, contributing to the distinct differentiation\nof style from content. Consequently, LIET achieves comparable IID quality to\nthe existing model with LiDAR intensity, while utilizing only an image without\nLiDAR intensity during inference.\n","authors":["Shogo Sato","Takuhiro Kaneko","Kazuhiko Murasaki","Taiga Yoshida","Ryuichi Tanida","Akisato Kimura"],"pdf_url":"https://arxiv.org/pdf/2403.14089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13535v2","updated":"2024-03-21T02:31:58Z","published":"2024-03-20T12:13:04Z","title":"IDAdapter: Learning Mixed Features for Tuning-Free Personalization of\n Text-to-Image Models","summary":" Leveraging Stable Diffusion for the generation of personalized portraits has\nemerged as a powerful and noteworthy tool, enabling users to create\nhigh-fidelity, custom character avatars based on their specific prompts.\nHowever, existing personalization methods face challenges, including test-time\nfine-tuning, the requirement of multiple input images, low preservation of\nidentity, and limited diversity in generated outcomes. To overcome these\nchallenges, we introduce IDAdapter, a tuning-free approach that enhances the\ndiversity and identity preservation in personalized image generation from a\nsingle face image. IDAdapter integrates a personalized concept into the\ngeneration process through a combination of textual and visual injections and a\nface identity loss. During the training phase, we incorporate mixed features\nfrom multiple reference images of a specific identity to enrich\nidentity-related content details, guiding the model to generate images with\nmore diverse styles, expressions, and angles compared to previous works.\nExtensive evaluations demonstrate the effectiveness of our method, achieving\nboth diversity and identity fidelity in generated images.\n","authors":["Siying Cui","Jia Guo","Xiang An","Jiankang Deng","Yongle Zhao","Xinyu Wei","Ziyong Feng"],"pdf_url":"https://arxiv.org/pdf/2403.13535v2.pdf","comment":"14 pages, 15 figures"},{"id":"http://arxiv.org/abs/2303.13800v4","updated":"2024-03-21T02:31:39Z","published":"2023-03-24T04:45:45Z","title":"Aligning Step-by-Step Instructional Diagrams to Video Demonstrations","summary":" Multimodal alignment facilitates the retrieval of instances from one modality\nwhen queried using another. In this paper, we consider a novel setting where\nsuch an alignment is between (i) instruction steps that are depicted as\nassembly diagrams (commonly seen in Ikea assembly manuals) and (ii) video\nsegments from in-the-wild videos; these videos comprising an enactment of the\nassembly actions in the real world. 
To learn this alignment, we introduce a\nnovel supervised contrastive learning method that learns to align videos with\nthe subtle details in the assembly diagrams, guided by a set of novel losses.\nTo study this problem and demonstrate the effectiveness of our method, we\nintroduce a novel dataset: IAW for Ikea assembly in the wild consisting of 183\nhours of videos from diverse furniture assembly collections and nearly 8,300\nillustrations from their associated instruction manuals and annotated for their\nground truth alignments. We define two tasks on this dataset: First, nearest\nneighbor retrieval between video segments and illustrations, and, second,\nalignment of instruction steps and the segments for each video. Extensive\nexperiments on IAW demonstrate superior performances of our approach against\nalternatives.\n","authors":["Jiahao Zhang","Anoop Cherian","Yanbin Liu","Yizhak Ben-Shabat","Cristian Rodriguez","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2303.13800v4.pdf","comment":"Project website:\n https://academic.davidz.cn/en/publication/zhang-cvpr-2023/"},{"id":"http://arxiv.org/abs/2403.14085v1","updated":"2024-03-21T02:31:17Z","published":"2024-03-21T02:31:17Z","title":"Surface Reconstruction from Point Clouds via Grid-based Intersection\n Prediction","summary":" Surface reconstruction from point clouds is a crucial task in the fields of\ncomputer vision and computer graphics. SDF-based methods excel at\nreconstructing smooth meshes with minimal error and artifacts but struggle with\nrepresenting open surfaces. On the other hand, UDF-based methods can\neffectively represent open surfaces but often introduce noise near the surface,\nleading to artifacts in the mesh. In this work, we propose a novel approach\nthat directly predicts the intersection points between sampled line segments of\npoint pairs and implicit surfaces. This method not only preserves the ability\nto represent open surfaces but also eliminates artifacts in the mesh. Our\napproach demonstrates state-of-the-art performance on three datasets: ShapeNet,\nMGN, and ScanNet. The code will be made available upon acceptance.\n","authors":["Hui Tian","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2403.14085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14082v1","updated":"2024-03-21T02:19:54Z","published":"2024-03-21T02:19:54Z","title":"EventDance: Unsupervised Source-free Cross-modal Adaptation for\n Event-based Object Recognition","summary":" In this paper, we make the first attempt at achieving the cross-modal (i.e.,\nimage-to-events) adaptation for event-based object recognition without\naccessing any labeled source image data owning to privacy and commercial\nissues. Tackling this novel problem is non-trivial due to the novelty of event\ncameras and the distinct modality gap between images and events. In particular,\nas only the source model is available, a hurdle is how to extract the knowledge\nfrom the source model by only using the unlabeled target event data while\nachieving knowledge transfer. To this end, we propose a novel framework, dubbed\nEventDance for this unsupervised source-free cross-modal adaptation problem.\nImportantly, inspired by event-to-video reconstruction methods, we propose a\nreconstruction-based modality bridging (RMB) module, which reconstructs\nintensity frames from events in a self-supervised manner. This makes it\npossible to build up the surrogate images to extract the knowledge (i.e.,\nlabels) from the source model. 
We then propose a multi-representation knowledge\nadaptation (MKA) module that transfers the knowledge to target models learning\nevents with multiple representation types for fully exploring the\nspatiotemporal information of events. The two modules connecting the source and\ntarget models are mutually updated so as to achieve the best performance.\nExperiments on three benchmark datasets with two adaption settings show that\nEventDance is on par with prior methods utilizing the source data.\n","authors":["Xu Zheng","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.14082v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2312.05826v2","updated":"2024-03-21T02:08:18Z","published":"2023-12-10T08:59:43Z","title":"R2Human: Real-Time 3D Human Appearance Rendering from a Single Image","summary":" Rendering 3D human appearance in different views is crucial for achieving\nholographic communication and immersive VR/AR. Existing methods either rely on\nmulti-camera setups or have low-quality rendered images from a single image. In\nthis paper, we propose R2Human, the first approach for real-time inference and\nrendering of photorealistic 3D human appearance from a single image. The core\nof our approach is to combine the strengths of implicit texture fields and\nexplicit neural rendering with our novel representation, namely Z-map. Based on\nthis, we present an end-to-end network that performs high-fidelity color\nreconstruction of visible areas and provides reliable color inference for\noccluded regions. To further enhance the 3D perception ability of our network,\nwe leverage the Fourier occupancy field as a prior for generating the texture\nfield and providing a sampling surface in the rendering stage. We also propose\na consistency loss and a spatio-temporal fusion strategy to ensure the\nmulti-view coherence. Experimental results show that our method outperforms the\nstate-of-the-art methods on both synthetic data and challenging real-world\nimages, in real time.\n","authors":["Yuanwang Yang","Qiao Feng","Yu-Kun Lai","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2312.05826v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14070v1","updated":"2024-03-21T01:37:50Z","published":"2024-03-21T01:37:50Z","title":"QSMDiff: Unsupervised 3D Diffusion Models for Quantitative\n Susceptibility Mapping","summary":" Quantitative Susceptibility Mapping (QSM) dipole inversion is an ill-posed\ninverse problem for quantifying magnetic susceptibility distributions from MRI\ntissue phases. While supervised deep learning methods have shown success in\nspecific QSM tasks, their generalizability across different acquisition\nscenarios remains constrained. Recent developments in diffusion models have\ndemonstrated potential for solving 2D medical imaging inverse problems.\nHowever, their application to 3D modalities, such as QSM, remains challenging\ndue to high computational demands. In this work, we developed a 3D image\npatch-based diffusion model, namely QSMDiff, for robust QSM reconstruction\nacross different scan parameters, alongside simultaneous super-resolution and\nimage-denoising tasks. QSMDiff adopts unsupervised 3D image patch training and\nfull-size measurement guidance during inference for controlled image\ngeneration. Evaluation on simulated and in-vivo human brains, using\ngradient-echo and echo-planar imaging sequences across different acquisition\nparameters, demonstrates superior performance. 
The method proposed in QSMDiff\nalso holds promise for impacting other 3D medical imaging applications beyond\nQSM.\n","authors":["Zhuang Xiong","Wei Jiang","Yang Gao","Feng Liu","Hongfu Sun"],"pdf_url":"https://arxiv.org/pdf/2403.14070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14066v1","updated":"2024-03-21T01:25:39Z","published":"2024-03-21T01:25:39Z","title":"LeFusion: Synthesizing Myocardial Pathology on Cardiac MRI via\n Lesion-Focus Diffusion Models","summary":" Data generated in clinical practice often exhibits biases, such as long-tail\nimbalance and algorithmic unfairness. This study aims to mitigate these\nchallenges through data synthesis. Previous efforts in medical imaging\nsynthesis have struggled with separating lesion information from background\ncontext, leading to difficulties in generating high-quality backgrounds and\nlimited control over the synthetic output. Inspired by diffusion-based image\ninpainting, we propose LeFusion, lesion-focused diffusion models. By\nredesigning the diffusion learning objectives to concentrate on lesion areas,\nit simplifies the model learning process and enhance the controllability of the\nsynthetic output, while preserving background by integrating forward-diffused\nbackground contexts into the reverse diffusion process. Furthermore, we\ngeneralize it to jointly handle multi-class lesions, and further introduce a\ngenerative model for lesion masks to increase synthesis diversity. Validated on\nthe DE-MRI cardiac lesion segmentation dataset (Emidec), our methodology\nemploys the popular nnUNet to demonstrate that the synthetic data make it\npossible to effectively enhance a state-of-the-art model. Code and model are\navailable at https://github.com/M3DV/LeFusion.\n","authors":["Hantao Zhang","Jiancheng Yang","Shouhong Wan","Pascal Fua"],"pdf_url":"https://arxiv.org/pdf/2403.14066v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2403.14056v1","updated":"2024-03-21T00:59:35Z","published":"2024-03-21T00:59:35Z","title":"Semantics from Space: Satellite-Guided Thermal Semantic Segmentation\n Annotation for Aerial Field Robots","summary":" We present a new method to automatically generate semantic segmentation\nannotations for thermal imagery captured from an aerial vehicle by utilizing\nsatellite-derived data products alongside onboard global positioning and\nattitude estimates. This new capability overcomes the challenge of developing\nthermal semantic perception algorithms for field robots due to the lack of\nannotated thermal field datasets and the time and costs of manual annotation,\nenabling precise and rapid annotation of thermal data from field collection\nefforts at a massively-parallelizable scale. By incorporating a\nthermal-conditioned refinement step with visual foundation models, our approach\ncan produce highly-precise semantic segmentation labels using low-resolution\nsatellite land cover data for little-to-no cost. It achieves 98.5% of the\nperformance from using costly high-resolution options and demonstrates between\n70-160% improvement over popular zero-shot semantic segmentation methods based\non large vision-language models currently used for generating annotations for\nRGB imagery. Code will be available at:\nhttps://github.com/connorlee77/aerial-auto-segment.\n","authors":["Connor Lee","Saraswati Soedarmadji","Matthew Anderson","Anthony J. 
Clark","Soon-Jo Chung"],"pdf_url":"https://arxiv.org/pdf/2403.14056v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11371v3","updated":"2024-03-21T00:55:04Z","published":"2024-03-17T23:29:41Z","title":"V2X-DGW: Domain Generalization for Multi-agent Perception under Adverse\n Weather Conditions","summary":" Current LiDAR-based Vehicle-to-Everything (V2X) multi-agent perception\nsystems have shown the significant success on 3D object detection. While these\nmodels perform well in the trained clean weather, they struggle in unseen\nadverse weather conditions with the real-world domain gap. In this paper, we\npropose a domain generalization approach, named V2X-DGW, for LiDAR-based 3D\nobject detection on multi-agent perception system under adverse weather\nconditions. Not only in the clean weather does our research aim to ensure\nfavorable multi-agent performance, but also in the unseen adverse weather\nconditions by learning only on the clean weather data. To advance research in\nthis area, we have simulated the impact of three prevalent adverse weather\nconditions on two widely-used multi-agent datasets, resulting in the creation\nof two novel benchmark datasets: OPV2V-w and V2XSet-w.\n To this end, we first introduce the Adaptive Weather Augmentation (AWA) to\nmimic the unseen adverse weather conditions, and then propose two alignments\nfor generalizable representation learning: Trust-region Weather-invariant\nAlignment (TWA) and Agent-aware Contrastive Alignment (ACA). Extensive\nexperimental results demonstrate that our V2X-DGW achieved improvements in the\nunseen adverse weather conditions.\n","authors":["Baolu Li","Jinlong Li","Xinyu Liu","Runsheng Xu","Zhengzhong Tu","Jiacheng Guo","Xiaopeng Li","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2403.11371v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.13854v4","updated":"2024-03-21T00:53:19Z","published":"2022-11-25T01:37:48Z","title":"ComCLIP: Training-Free Compositional Image and Text Matching","summary":" Contrastive Language-Image Pretraining (CLIP) has demonstrated great\nzero-shot performance for matching images and text. However, it is still\nchallenging to adapt vision-lanaguage pretrained models like CLIP to\ncompositional image and text matching -- a more challenging image and text\nmatching task requiring the model understanding of compositional word concepts\nand visual components. Towards better compositional generalization in zero-shot\nimage and text matching, in this paper, we study the problem from a causal\nperspective: the erroneous semantics of individual entities are essentially\nconfounders that cause the matching failure. Therefore, we propose a novel\n\\textbf{\\textit{training-free}} compositional CLIP model (ComCLIP). ComCLIP\ndisentangles input images into subjects, objects, and action sub-images and\ncomposes CLIP's vision encoder and text encoder to perform evolving matching\nover compositional text embedding and sub-image embeddings. In this way,\nComCLIP can mitigate spurious correlations introduced by the pretrained CLIP\nmodels and dynamically evaluate the importance of each component. Experiments\non four compositional image-text matching datasets: SVO, ComVG, Winoground, and\nVL-checklist, and two general image-text retrieval datasets: Flick30K, and\nMSCOCO demonstrate the effectiveness of our plug-and-play method, which boosts\nthe \\textbf{\\textit{zero-shot}} inference ability of CLIP, SLIP, and BLIP2 even\nwithout further training or fine-tuning. 
Our codes can be found at\nhttps://github.com/eric-ai-lab/ComCLIP.\n","authors":["Kenan Jiang","Xuehai He","Ruize Xu","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2211.13854v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05584v2","updated":"2024-03-21T00:42:39Z","published":"2024-01-10T23:30:48Z","title":"FourCastNeXt: Optimizing FourCastNet Training for Limited Compute","summary":" FourCastNeXt is an optimization of FourCastNet - a global machine learning\nweather forecasting model - that performs with a comparable level of accuracy\nand can be trained using around 5% of the original FourCastNet computational\nrequirements. This technical report presents strategies for model optimization\nthat maintain similar performance as measured by the root-mean-square error\n(RMSE) of the modelled variables. By providing a model with very low\ncomparative training costs, FourCastNeXt makes Neural Earth System Modelling\nmuch more accessible to researchers looking to conduct training experiments and\nablation studies. FourCastNeXt training and inference code are available at\nhttps://github.com/nci/FourCastNeXt\n","authors":["Edison Guo","Maruf Ahmed","Yue Sun","Rui Yang","Harrison Cook","Tennessee Leeuwenburg","Ben Evans"],"pdf_url":"https://arxiv.org/pdf/2401.05584v2.pdf","comment":"Major revision. All prior content (text, figures, table) has been\n updated. Additionally, new text, tables and figures have been added. Updated\n title. Updated author list"},{"id":"http://arxiv.org/abs/2403.14053v1","updated":"2024-03-21T00:35:31Z","published":"2024-03-21T00:35:31Z","title":"Leveraging Thermal Modality to Enhance Reconstruction in Low-Light\n Conditions","summary":" Neural Radiance Fields (NeRF) accomplishes photo-realistic novel view\nsynthesis by learning the implicit volumetric representation of a scene from\nmulti-view images, which faithfully convey the colorimetric information.\nHowever, sensor noises will contaminate low-value pixel signals, and the lossy\ncamera image signal processor will further remove near-zero intensities in\nextremely dark situations, deteriorating the synthesis performance. Existing\napproaches reconstruct low-light scenes from raw images but struggle to recover\ntexture and boundary details in dark regions. Additionally, they are unsuitable\nfor high-speed models relying on explicit representations. To address these\nissues, we present Thermal-NeRF, which takes thermal and visible raw images as\ninputs, considering the thermal camera is robust to the illumination variation\nand raw images preserve any possible clues in the dark, to accomplish visible\nand thermal view synthesis simultaneously. Also, the first multi-view thermal\nand visible dataset (MVTV) is established to support the research on multimodal\nNeRF. Thermal-NeRF achieves the best trade-off between detail preservation and\nnoise smoothing and provides better synthesis performance than previous work.\nFinally, we demonstrate that both modalities are beneficial to each other in 3D\nreconstruction.\n","authors":["Jiacong Xu","Mingqian Liao","K Ram Prabhakar","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2403.14053v1.pdf","comment":"25 pages, 13 figures"},{"id":"http://arxiv.org/abs/2403.14047v1","updated":"2024-03-21T00:09:04Z","published":"2024-03-21T00:09:04Z","title":"Accelerating ViT Inference on FPGA through Static and Dynamic Pruning","summary":" Vision Transformers (ViTs) have achieved state-of-the-art accuracy on various\ncomputer vision tasks. 
However, their high computational complexity prevents\nthem from being applied to many real-world applications. Weight and token\npruning are two well-known methods for reducing complexity: weight pruning\nreduces the model size and associated computational demands, while token\npruning further dynamically reduces the computation based on the input.\nCombining these two techniques should significantly reduce computation\ncomplexity and model size; however, naively integrating them results in\nirregular computation patterns, leading to significant accuracy drops and\ndifficulties in hardware acceleration.\n Addressing the above challenges, we propose a comprehensive\nalgorithm-hardware codesign for accelerating ViT on FPGA through simultaneous\npruning -combining static weight pruning and dynamic token pruning. For\nalgorithm design, we systematically combine a hardware-aware structured\nblock-pruning method for pruning model parameters and a dynamic token pruning\nmethod for removing unimportant token vectors. Moreover, we design a novel\ntraining algorithm to recover the model's accuracy. For hardware design, we\ndevelop a novel hardware accelerator for executing the pruned model. The\nproposed hardware design employs multi-level parallelism with load balancing\nstrategy to efficiently deal with the irregular computation pattern led by the\ntwo pruning approaches. Moreover, we develop an efficient hardware mechanism\nfor efficiently executing the on-the-fly token pruning.\n","authors":["Dhruv Parikh","Shouyi Li","Bingyi Zhang","Rajgopal Kannan","Carl Busart","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/2403.14047v1.pdf","comment":"FCCM 2024"},{"id":"http://arxiv.org/abs/2311.01623v2","updated":"2024-03-21T00:05:23Z","published":"2023-11-03T16:58:10Z","title":"VQPy: An Object-Oriented Approach to Modern Video Analytics","summary":" Video analytics is widely used in contemporary systems and services. At the\nforefront of video analytics are video queries that users develop to find\nobjects of particular interest. Building upon the insight that video objects\n(e.g., human, animals, cars, etc.), the center of video analytics, are similar\nin spirit to objects modeled by traditional object-oriented languages, we\npropose to develop an object-oriented approach to video analytics. This\napproach, named VQPy, consists of a frontend$\\unicode{x2015}$a Python variant\nwith constructs that make it easy for users to express video objects and their\ninteractions$\\unicode{x2015}$as well as an extensible backend that can\nautomatically construct and optimize pipelines based on video objects. We have\nimplemented and open-sourced VQPy, which has been productized in Cisco as part\nof its DeepVision framework.\n","authors":["Shan Yu","Zhenting Zhu","Yu Chen","Hanchen Xu","Pengzhan Zhao","Yang Wang","Arthi Padmanabhan","Hugo Latapie","Harry Xu"],"pdf_url":"https://arxiv.org/pdf/2311.01623v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16828v2","updated":"2024-03-21T17:56:19Z","published":"2023-10-25T17:57:07Z","title":"TD-MPC2: Scalable, Robust World Models for Continuous Control","summary":" TD-MPC is a model-based reinforcement learning (RL) algorithm that performs\nlocal trajectory optimization in the latent space of a learned implicit\n(decoder-free) world model. In this work, we present TD-MPC2: a series of\nimprovements upon the TD-MPC algorithm. 
We demonstrate that TD-MPC2 improves\nsignificantly over baselines across 104 online RL tasks spanning 4 diverse task\ndomains, achieving consistently strong results with a single set of\nhyperparameters. We further show that agent capabilities increase with model\nand data size, and successfully train a single 317M parameter agent to perform\n80 tasks across multiple task domains, embodiments, and action spaces. We\nconclude with an account of lessons, opportunities, and risks associated with\nlarge TD-MPC2 agents. Explore videos, models, data, code, and more at\nhttps://tdmpc2.com\n","authors":["Nicklas Hansen","Hao Su","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.16828v2.pdf","comment":"ICLR 2024. Explore videos, models, data, code, and more at\n https://tdmpc2.com"},{"id":"http://arxiv.org/abs/2009.09213v6","updated":"2024-03-21T16:24:05Z","published":"2020-09-19T11:26:01Z","title":"Dodging DeepFake Detection via Implicit Spatial-Domain Notch Filtering","summary":" The current high-fidelity generation and high-precision detection of DeepFake\nimages are at an arms race. We believe that producing DeepFakes that are highly\nrealistic and 'detection evasive' can serve the ultimate goal of improving\nfuture generation DeepFake detection capabilities. In this paper, we propose a\nsimple yet powerful pipeline to reduce the artifact patterns of fake images\nwithout hurting image quality by performing implicit spatial-domain notch\nfiltering. We first demonstrate that frequency-domain notch filtering, although\nfamously shown to be effective in removing periodic noise in the spatial\ndomain, is infeasible for our task at hand due to the manual designs required\nfor the notch filters. We, therefore, resort to a learning-based approach to\nreproduce the notch filtering effects, but solely in the spatial domain. We\nadopt a combination of adding overwhelming spatial noise for breaking the\nperiodic noise pattern and deep image filtering to reconstruct the noise-free\nfake images, and we name our method DeepNotch. Deep image filtering provides a\nspecialized filter for each pixel in the noisy image, producing filtered images\nwith high fidelity compared to their DeepFake counterparts. Moreover, we also\nuse the semantic information of the image to generate an adversarial guidance\nmap to add noise intelligently. Our large-scale evaluation on 3 representative\nstate-of-the-art DeepFake detection methods (tested on 16 types of DeepFakes)\nhas demonstrated that our technique significantly reduces the accuracy of these\n3 fake image detection methods, 36.79% on average and up to 97.02% in the best\ncase.\n","authors":["Yihao Huang","Felix Juefei-Xu","Qing Guo","Yang Liu","Geguang Pu"],"pdf_url":"https://arxiv.org/pdf/2009.09213v6.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2403.14886v1","updated":"2024-03-21T23:43:30Z","published":"2024-03-21T23:43:30Z","title":"DSGG: Dense Relation Transformer for an End-to-end Scene Graph\n Generation","summary":" Scene graph generation aims to capture detailed spatial and semantic\nrelationships between objects in an image, which is challenging due to\nincomplete labelling, long-tailed relationship categories, and relational\nsemantic overlap. Existing Transformer-based methods either employ distinct\nqueries for objects and predicates or utilize holistic queries for relation\ntriplets and hence often suffer from limited capacity in learning low-frequency\nrelationships. 
In this paper, we present a new Transformer-based method, called\nDSGG, that views scene graph detection as a direct graph prediction problem\nbased on a unique set of graph-aware queries. In particular, each graph-aware\nquery encodes a compact representation of both the node and all of its\nrelations in the graph, acquired through the utilization of a relaxed sub-graph\nmatching during the training process. Moreover, to address the problem of\nrelational semantic overlap, we utilize a strategy for relation distillation,\naiming to efficiently learn multiple instances of semantic relationships.\nExtensive experiments on the VG and the PSG datasets show that our model\nachieves state-of-the-art results, showing a significant improvement of 3.5\\%\nand 6.7\\% in mR@50 and mR@100 for the scene-graph generation task and achieves\nan even more substantial improvement of 8.5\\% and 10.3\\% in mR@50 and mR@100\nfor the panoptic scene graph generation task. Code is available at\n\\url{https://github.com/zeeshanhayder/DSGG}.\n","authors":["Zeeshan Hayder","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2403.14886v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.14874v1","updated":"2024-03-21T22:46:27Z","published":"2024-03-21T22:46:27Z","title":"WeatherProof: Leveraging Language Guidance for Semantic Segmentation in\n Adverse Weather","summary":" We propose a method to infer semantic segmentation maps from images captured\nunder adverse weather conditions. We begin by examining existing models on\nimages degraded by weather conditions such as rain, fog, or snow, and found\nthat they exhibit a large performance drop as compared to those captured under\nclear weather. To control for changes in scene structures, we propose\nWeatherProof, the first semantic segmentation dataset with accurate clear and\nadverse weather image pairs that share an underlying scene. Through this\ndataset, we analyze the error modes in existing models and found that they were\nsensitive to the highly complex combination of different weather effects\ninduced on the image during capture. To improve robustness, we propose a way to\nuse language as guidance by identifying contributions of adverse weather\nconditions and injecting that as \"side information\". Models trained using our\nlanguage guidance exhibit performance gains by up to 10.2% in mIoU on\nWeatherProof, up to 8.44% in mIoU on the widely used ACDC dataset compared to\nstandard training techniques, and up to 6.21% in mIoU on the ACDC dataset as\ncompared to previous SOTA methods.\n","authors":["Blake Gella","Howard Zhang","Rishi Upadhyay","Tiffany Chang","Nathan Wei","Matthew Waliman","Yunhao Bao","Celso de Melo","Alex Wong","Achuta Kadambi"],"pdf_url":"https://arxiv.org/pdf/2403.14874v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2312.09534"},{"id":"http://arxiv.org/abs/2403.14870v1","updated":"2024-03-21T22:36:24Z","published":"2024-03-21T22:36:24Z","title":"VidLA: Video-Language Alignment at Scale","summary":" In this paper, we propose VidLA, an approach for video-language alignment at\nscale. There are two major limitations of previous video-language alignment\napproaches. First, they do not capture both short-range and long-range temporal\ndependencies and typically employ complex hierarchical deep network\narchitectures that are hard to integrate with existing pretrained image-text\nfoundation models. 
To effectively address this limitation, we instead keep the\nnetwork architecture simple and use a set of data tokens that operate at\ndifferent temporal resolutions in a hierarchical manner, accounting for the\ntemporally hierarchical nature of videos. By employing a simple two-tower\narchitecture, we are able to initialize our video-language model with\npretrained image-text foundation models, thereby boosting the final\nperformance. Second, existing video-language alignment works struggle due to\nthe lack of semantically aligned large-scale training data. To overcome it, we\nleverage recent LLMs to curate the largest video-language dataset to date with\nbetter visual grounding. Furthermore, unlike existing video-text datasets which\nonly contain short clips, our dataset is enriched with video clips of varying\ndurations to aid our temporally hierarchical data tokens in extracting better\nrepresentations at varying temporal scales. Overall, empirical results show\nthat our proposed approach surpasses state-of-the-art methods on multiple\nretrieval benchmarks, especially on longer videos, and performs competitively\non classification benchmarks.\n","authors":["Mamshad Nayeem Rizve","Fan Fei","Jayakrishnan Unnikrishnan","Son Tran","Benjamin Z. Yao","Belinda Zeng","Mubarak Shah","Trishul Chilimbi"],"pdf_url":"https://arxiv.org/pdf/2403.14870v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.14863v1","updated":"2024-03-21T22:18:25Z","published":"2024-03-21T22:18:25Z","title":"Distribution-informed and wavelength-flexible data-driven photoacoustic\n oximetry","summary":" Significance: Photoacoustic imaging (PAI) promises to measure\nspatially-resolved blood oxygen saturation, but suffers from a lack of accurate\nand robust spectral unmixing methods to deliver on this promise. Accurate blood\noxygenation estimation could have important clinical applications, from cancer\ndetection to quantifying inflammation.\n Aim: This study addresses the inflexibility of existing data-driven methods\nfor estimating blood oxygenation in PAI by introducing a recurrent neural\nnetwork architecture.\n Approach: We created 25 simulated training dataset variations to assess\nneural network performance. We used a long short-term memory network to\nimplement a wavelength-flexible network architecture and proposed the\nJensen-Shannon divergence to predict the most suitable training dataset.\n Results: The network architecture can handle arbitrary input wavelengths and\noutperforms linear unmixing and the previously proposed learned spectral\ndecolouring method. Small changes in the training data significantly affect the\naccuracy of our method, but we find that the Jensen-Shannon divergence\ncorrelates with the estimation error and is thus suitable for predicting the\nmost appropriate training datasets for any given application.\n Conclusions: A flexible data-driven network architecture combined with the\nJensen-Shannon Divergence to predict the best training data set provides a\npromising direction that might enable robust data-driven photoacoustic oximetry\nfor clinical use cases.\n","authors":["Janek Gröhl","Kylie Yeung","Kevin Gu","Thomas R. Else","Monika Golinska","Ellie V. Bunce","Lina Hacker","Sarah E. 
Bohndiek"],"pdf_url":"https://arxiv.org/pdf/2403.14863v1.pdf","comment":"37 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.06729v2","updated":"2024-03-21T22:11:31Z","published":"2023-12-11T09:12:35Z","title":"RGNet: A Unified Clip Retrieval and Grounding Network for Long Videos","summary":" Locating specific moments within long videos (20-120 minutes) presents a\nsignificant challenge, akin to finding a needle in a haystack. Adapting\nexisting short video (5-30 seconds) grounding methods to this problem yields\npoor performance. Since most real life videos, such as those on YouTube and\nAR/VR, are lengthy, addressing this issue is crucial. Existing methods\ntypically operate in two stages: clip retrieval and grounding. However, this\ndisjoint process limits the retrieval module's fine-grained event\nunderstanding, crucial for specific moment detection. We propose RGNet which\ndeeply integrates clip retrieval and grounding into a single network capable of\nprocessing long videos into multiple granular levels, e.g., clips and frames.\nIts core component is a novel transformer encoder, RG-Encoder, that unifies the\ntwo stages through shared features and mutual optimization. The encoder\nincorporates a sparse attention mechanism and an attention loss to model both\ngranularity jointly. Moreover, we introduce a contrastive clip sampling\ntechnique to mimic the long video paradigm closely during training. RGNet\nsurpasses prior methods, showcasing state-of-the-art performance on long video\ntemporal grounding (LVTG) datasets MAD and Ego4D.\n","authors":["Tanveer Hannan","Md Mohaiminul Islam","Thomas Seidl","Gedas Bertasius"],"pdf_url":"https://arxiv.org/pdf/2312.06729v2.pdf","comment":"The code is released at https://github.com/Tanveer81/RGNet"},{"id":"http://arxiv.org/abs/2403.11905v2","updated":"2024-03-21T21:57:13Z","published":"2024-03-18T16:06:30Z","title":"Tur[k]ingBench: A Challenge Benchmark for Web Agents","summary":" Recent chatbots have demonstrated impressive ability to understand and\ncommunicate in raw-text form. However, there is more to the world than raw\ntext. For example, humans spend long hours of their time on web pages, where\ntext is intertwined with other modalities and tasks are accomplished in the\nform of various complex interactions. Can state-of-the-art multi-modal models\ngeneralize to such complex domains?\n To address this question, we introduce TurkingBench, a benchmark of tasks\nformulated as web pages containing textual instructions with multi-modal\ncontext. Unlike existing work which employs artificially synthesized web pages,\nhere we use natural HTML pages that were originally designed for crowdsourcing\nworkers for various annotation purposes. The HTML instructions of each task are\nalso instantiated with various values (obtained from the crowdsourcing tasks)\nto form new instances of the task. This benchmark contains 32.2K instances\ndistributed across 158 tasks.\n Additionally, to facilitate the evaluation on TurkingBench, we develop an\nevaluation framework that connects the responses of chatbots to modifications\non web pages (modifying a text box, checking a radio, etc.). We evaluate the\nperformance of state-of-the-art models, including language-only, vision-only,\nand layout-only models, and their combinations, on this benchmark. Our findings\nreveal that these models perform significantly better than random chance, yet\nconsiderable room exists for improvement. 
We hope this benchmark will help\nfacilitate the evaluation and development of web-based agents.\n","authors":["Kevin Xu","Yeganeh Kordi","Kate Sanders","Yizhong Wang","Adam Byerly","Jack Zhang","Benjamin Van Durme","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2403.11905v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14852v1","updated":"2024-03-21T21:56:09Z","published":"2024-03-21T21:56:09Z","title":"KeyPoint Relative Position Encoding for Face Recognition","summary":" In this paper, we address the challenge of making ViT models more robust to\nunseen affine transformations. Such robustness becomes useful in various\nrecognition tasks such as face recognition when image alignment failures occur.\nWe propose a novel method called KP-RPE, which leverages key points\n(e.g.~facial landmarks) to make ViT more resilient to scale, translation, and\npose variations. We begin with the observation that Relative Position Encoding\n(RPE) is a good way to bring affine transform generalization to ViTs. RPE,\nhowever, can only inject the model with prior knowledge that nearby pixels are\nmore important than far pixels. Keypoint RPE (KP-RPE) is an extension of this\nprinciple, where the significance of pixels is not solely dictated by their\nproximity but also by their relative positions to specific keypoints within the\nimage. By anchoring the significance of pixels around keypoints, the model can\nmore effectively retain spatial relationships, even when those relationships\nare disrupted by affine transformations. We show the merit of KP-RPE in face\nand gait recognition. The experimental results demonstrate the effectiveness in\nimproving face recognition performance from low-quality images, particularly\nwhere alignment is prone to failure. Code and pre-trained models are available.\n","authors":["Minchul Kim","Yiyang Su","Feng Liu","Anil Jain","Xiaoming Liu"],"pdf_url":"https://arxiv.org/pdf/2403.14852v1.pdf","comment":"To appear in CVPR2024"},{"id":"http://arxiv.org/abs/2402.10045v3","updated":"2024-03-21T21:37:50Z","published":"2024-01-11T03:36:47Z","title":"Short-Form Videos and Mental Health: A Knowledge-Guided Neural Topic\n Model","summary":" While short-form videos head to reshape the entire social media landscape,\nexperts are exceedingly worried about their depressive impacts on viewers, as\nevidenced by medical studies. To prevent widespread consequences, platforms are\neager to predict these videos' impact on viewers' mental health. Subsequently,\nthey can take intervention measures, such as revising recommendation algorithms\nand displaying viewer discretion. Nevertheless, applicable predictive methods\nlack relevance to well-established medical knowledge, which outlines clinically\nproven external and environmental factors of depression. To account for such\nmedical knowledge, we resort to an emergent methodological discipline, seeded\nNeural Topic Models (NTMs). However, existing seeded NTMs suffer from the\nlimitations of single-origin topics, unknown topic sources, unclear seed\nsupervision, and suboptimal convergence. To address those challenges, we\ndevelop a novel Knowledge-guided Multimodal NTM to predict a short-form video's\ndepressive impact on viewers. Extensive empirical analyses using TikTok and\nDouyin datasets prove that our method outperforms state-of-the-art benchmarks.\nOur method also discovers medically relevant topics from videos that are linked\nto depressive impact. 
We contribute to IS with a novel video analytics method\nthat is generalizable to other video classification problems. Practically, our\nmethod can help platforms understand videos' mental impacts, thus adjusting\nrecommendations and video topic disclosure.\n","authors":["Jiaheng Xie","Ruicheng Liang","Yidong Chai","Yang Liu","Daniel Zeng"],"pdf_url":"https://arxiv.org/pdf/2402.10045v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02362v2","updated":"2024-03-21T21:28:37Z","published":"2023-12-04T21:43:00Z","title":"PointNeRF++: A multi-scale, point-based Neural Radiance Field","summary":" Point clouds offer an attractive source of information to complement images\nin neural scene representations, especially when few images are available.\nNeural rendering methods based on point clouds do exist, but they do not\nperform well when the point cloud quality is low -- e.g., sparse or incomplete,\nwhich is often the case with real-world data. We overcome these problems with a\nsimple representation that aggregates point clouds at multiple scale levels\nwith sparse voxel grids at different resolutions. To deal with point cloud\nsparsity, we average across multiple scale levels -- but only among those that\nare valid, i.e., that have enough neighboring points in proximity to the ray of\na pixel. To help model areas without points, we add a global voxel at the\ncoarsest scale, thus unifying ``classical'' and point-based NeRF formulations.\nWe validate our method on the NeRF Synthetic, ScanNet, and KITTI-360 datasets,\noutperforming the state of the art, with a significant gap compared to other\nNeRF-based methods, especially on more challenging scenes.\n","authors":["Weiwei Sun","Eduard Trulls","Yang-Che Tseng","Sneha Sambandam","Gopal Sharma","Andrea Tagliasacchi","Kwang Moo Yi"],"pdf_url":"https://arxiv.org/pdf/2312.02362v2.pdf","comment":"Project website: https://pointnerfpp.github.io/"},{"id":"http://arxiv.org/abs/2403.14839v1","updated":"2024-03-21T21:18:08Z","published":"2024-03-21T21:18:08Z","title":"Hyperspectral Neural Radiance Fields","summary":" Hyperspectral Imagery (HSI) has been used in many applications to\nnon-destructively determine the material and/or chemical compositions of\nsamples. There is growing interest in creating 3D hyperspectral\nreconstructions, which could provide both spatial and spectral information\nwhile also mitigating common HSI challenges such as non-Lambertian surfaces and\ntranslucent objects. However, traditional 3D reconstruction with HSI is\ndifficult due to technological limitations of hyperspectral cameras. In recent\nyears, Neural Radiance Fields (NeRFs) have seen widespread success in creating\nhigh quality volumetric 3D representations of scenes captured by a variety of\ncamera models. Leveraging recent advances in NeRFs, we propose computing a\nhyperspectral 3D reconstruction in which every point in space and view\ndirection is characterized by wavelength-dependent radiance and transmittance\nspectra. To evaluate our approach, a dataset containing nearly 2000\nhyperspectral images across 8 scenes and 2 cameras was collected. We perform\ncomparisons against traditional RGB NeRF baselines and apply ablation testing\nwith alternative spectra representations. Finally, we demonstrate the potential\nof hyperspectral NeRFs for hyperspectral super-resolution and imaging sensor\nsimulation. 
We show that our hyperspectral NeRF approach enables creating fast,\naccurate volumetric 3D hyperspectral scenes and enables several new\napplications and areas for future study.\n","authors":["Gerry Chen","Sunil Kumar Narayanan","Thomas Gautier Ottou","Benjamin Missaoui","Harsh Muriki","Cédric Pradalier","Yongsheng Chen"],"pdf_url":"https://arxiv.org/pdf/2403.14839v1.pdf","comment":"Main paper: 15 pages + 2 pages references. Supplemental/Appendix: 6\n pages"},{"id":"http://arxiv.org/abs/2403.14837v1","updated":"2024-03-21T21:13:53Z","published":"2024-03-21T21:13:53Z","title":"Osmosis: RGBD Diffusion Prior for Underwater Image Restoration","summary":" Underwater image restoration is a challenging task because of strong water\neffects that increase dramatically with distance. This is worsened by lack of\nground truth data of clean scenes without water. Diffusion priors have emerged\nas strong image restoration priors. However, they are often trained with a\ndataset of the desired restored output, which is not available in our case. To\novercome this critical issue, we show how to leverage in-air images to train\ndiffusion priors for underwater restoration. We also observe that only color\ndata is insufficient, and augment the prior with a depth channel. We train an\nunconditional diffusion model prior on the joint space of color and depth,\nusing standard RGBD datasets of natural outdoor scenes in air. Using this prior\ntogether with a novel guidance method based on the underwater image formation\nmodel, we generate posterior samples of clean images, removing the water\neffects. Even though our prior did not see any underwater images during\ntraining, our method outperforms state-of-the-art baselines for image\nrestoration on very challenging scenes. Data, models and code are published in\nthe project page.\n","authors":["Opher Bar Nathan","Deborah Levy","Tali Treibitz","Dan Rosenbaum"],"pdf_url":"https://arxiv.org/pdf/2403.14837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14836v1","updated":"2024-03-21T21:11:23Z","published":"2024-03-21T21:11:23Z","title":"Evaluating Panoramic 3D Estimation in Indoor Lighting Analysis","summary":" This paper presents the use of panoramic 3D estimation in lighting\nsimulation. Conventional lighting simulation necessitates detailed modeling as\ninput, resulting in significant labor effort and time cost. The 3D layout\nestimation method directly takes a single panorama as input and generates a\nlighting simulation model with room geometry and window aperture. We evaluate\nthe simulation results by comparing the luminance errors between on-site High\nDynamic Range (HDR) photographs, 3D estimation model, and detailed model in\npanoramic representation and fisheye perspective. Given the selected scene, the\nresults demonstrate the estimated room layout is reliable for lighting\nsimulation.\n","authors":["Zining Cheng","Guanzhou Ji"],"pdf_url":"https://arxiv.org/pdf/2403.14836v1.pdf","comment":"Annual Modeling and Simulation Conference (ANNSIM), May 20-23, 2024,\n Washington D.C., USA"},{"id":"http://arxiv.org/abs/2403.14828v1","updated":"2024-03-21T20:43:10Z","published":"2024-03-21T20:43:10Z","title":"Multimodal-Conditioned Latent Diffusion Models for Fashion Image Editing","summary":" Fashion illustration is a crucial medium for designers to convey their\ncreative vision and transform design concepts into tangible representations\nthat showcase the interplay between clothing and the human body. 
In the context\nof fashion design, computer vision techniques have the potential to enhance and\nstreamline the design process. Departing from prior research primarily focused\non virtual try-on, this paper tackles the task of multimodal-conditioned\nfashion image editing. Our approach aims to generate human-centric fashion\nimages guided by multimodal prompts, including text, human body poses, garment\nsketches, and fabric textures. To address this problem, we propose extending\nlatent diffusion models to incorporate these multiple modalities and modifying\nthe structure of the denoising network, taking multimodal prompts as input. To\ncondition the proposed architecture on fabric textures, we employ textual\ninversion techniques and let diverse cross-attention layers of the denoising\nnetwork attend to textual and texture information, thus incorporating different\ngranularity conditioning details. Given the lack of datasets for the task, we\nextend two existing fashion datasets, Dress Code and VITON-HD, with multimodal\nannotations. Experimental evaluations demonstrate the effectiveness of our\nproposed approach in terms of realism and coherence concerning the provided\nmultimodal inputs.\n","authors":["Alberto Baldrati","Davide Morelli","Marcella Cornia","Marco Bertini","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2403.14828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03077v3","updated":"2024-03-21T20:37:54Z","published":"2024-03-05T16:01:55Z","title":"MiKASA: Multi-Key-Anchor & Scene-Aware Transformer for 3D Visual\n Grounding","summary":" 3D visual grounding involves matching natural language descriptions with\ntheir corresponding objects in 3D spaces. Existing methods often face\nchallenges with accuracy in object recognition and struggle in interpreting\ncomplex linguistic queries, particularly with descriptions that involve\nmultiple anchors or are view-dependent. In response, we present the MiKASA\n(Multi-Key-Anchor Scene-Aware) Transformer. Our novel end-to-end trained model\nintegrates a self-attention-based scene-aware object encoder and an original\nmulti-key-anchor technique, enhancing object recognition accuracy and the\nunderstanding of spatial relationships. Furthermore, MiKASA improves the\nexplainability of decision-making, facilitating error diagnosis. Our model\nachieves the highest overall accuracy in the Referit3D challenge for both the\nSr3D and Nr3D datasets, particularly excelling by a large margin in categories\nthat require viewpoint-dependent descriptions.\n","authors":["Chun-Peng Chang","Shaoxiang Wang","Alain Pagani","Didier Stricker"],"pdf_url":"https://arxiv.org/pdf/2403.03077v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14821v1","updated":"2024-03-21T20:28:22Z","published":"2024-03-21T20:28:22Z","title":"Learning Gaussian Representation for Eye Fixation Prediction","summary":" Existing eye fixation prediction methods perform the mapping from input\nimages to the corresponding dense fixation maps generated from raw fixation\npoints. However, due to the stochastic nature of human fixation, the generated\ndense fixation maps may be a less-than-ideal representation of human fixation.\nTo provide a robust fixation model, we introduce Gaussian Representation for\neye fixation modeling. Specifically, we propose to model the eye fixation map\nas a mixture of probability distributions, namely a Gaussian Mixture Model. 
In\nthis new representation, we use several Gaussian distribution components as an\nalternative to the provided fixation map, which makes the model more robust to\nthe randomness of fixation. Meanwhile, we design our framework upon some\nlightweight backbones to achieve real-time fixation prediction. Experimental\nresults on three public fixation prediction datasets (SALICON, MIT1003,\nTORONTO) demonstrate that our method is fast and effective.\n","authors":["Peipei Song","Jing Zhang","Piotr Koniusz","Nick Barnes"],"pdf_url":"https://arxiv.org/pdf/2403.14821v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.14800v1","updated":"2024-03-21T19:28:17Z","published":"2024-03-21T19:28:17Z","title":"Deep Active Learning: A Reality Check","summary":" We conduct a comprehensive evaluation of state-of-the-art deep active\nlearning methods. Surprisingly, under general settings, no single-model method\ndecisively outperforms entropy-based active learning, and some even fall short\nof random sampling. We delve into overlooked aspects like starting budget,\nbudget step, and pretraining's impact, revealing their significance in\nachieving superior results. Additionally, we extend our evaluation to other\ntasks, exploring the active learning effectiveness in combination with\nsemi-supervised learning, and object detection. Our experiments provide\nvaluable insights and concrete recommendations for future active learning\nstudies. By uncovering the limitations of current methods and understanding the\nimpact of different experimental settings, we aim to inspire more efficient\ntraining of deep learning models in real-world scenarios with limited\nannotation budgets. This work contributes to advancing active learning's\nefficacy in deep learning and empowers researchers to make informed decisions\nwhen applying active learning to their tasks.\n","authors":["Edrina Gashi","Jiankang Deng","Ismail Elezi"],"pdf_url":"https://arxiv.org/pdf/2403.14800v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14797v1","updated":"2024-03-21T19:20:29Z","published":"2024-03-21T19:20:29Z","title":"Preventing Catastrophic Forgetting through Memory Networks in Continuous\n Detection","summary":" Modern pre-trained architectures struggle to retain previous information\nwhile undergoing continuous fine-tuning on new tasks. Despite notable progress\nin continual classification, systems designed for complex vision tasks such as\ndetection or segmentation still struggle to attain satisfactory performance. In\nthis work, we introduce a memory-based detection transformer architecture to\nadapt a pre-trained DETR-style detector to new tasks while preserving knowledge\nfrom previous tasks. We propose a novel localized query function for efficient\ninformation retrieval from memory units, aiming to minimize forgetting.\nFurthermore, we identify a fundamental challenge in continual detection\nreferred to as background relegation. This arises when object categories from\nearlier tasks reappear in future tasks, potentially without labels, leading\nthem to be implicitly treated as background. This is an inevitable issue in\ncontinual detection or segmentation. The introduced continual optimization\ntechnique effectively tackles this challenge. 
Finally, we assess the\nperformance of our proposed system on continual detection benchmarks and\ndemonstrate that our approach surpasses the performance of existing\nstate-of-the-art methods, resulting in 5-7% improvements on MS-COCO and\nPASCAL-VOC on the task of continual detection.\n","authors":["Gaurav Bhatt","James Ross","Leonid Sigal"],"pdf_url":"https://arxiv.org/pdf/2403.14797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03187v2","updated":"2024-03-21T19:14:04Z","published":"2023-12-05T23:33:49Z","title":"FERGI: Automatic Annotation of User Preferences for Text-to-Image\n Generation from Spontaneous Facial Expression Reaction","summary":" Researchers have proposed to use data of human preference feedback to\nfine-tune text-to-image generative models. However, the scalability of human\nfeedback collection has been limited by its reliance on manual annotation.\nTherefore, we develop and test a method to automatically annotate user\npreferences from their spontaneous facial expression reaction to the generated\nimages. We collect a dataset of Facial Expression Reaction to Generated Images\n(FERGI) and show that the activations of multiple facial action units (AUs) are\nhighly correlated with user evaluations of the generated images. Specifically,\nAU4 (brow lowerer) is reflective of negative evaluations of the generated image\nwhereas AU12 (lip corner puller) is reflective of positive evaluations. These\ncan be useful in two ways. Firstly, we can automatically annotate user\npreferences between image pairs with a substantial difference in these AU\nresponses with an accuracy significantly outperforming state-of-the-art scoring\nmodels. Secondly, directly integrating the AU responses with the scoring models\nimproves their consistency with human preferences. Finally, this method of\nautomatic annotation with facial expression analysis can be potentially\ngeneralized to other generation tasks. The code is available at\nhttps://github.com/ShuangquanFeng/FERGI, and the dataset is also available at\nthe same link for research purposes.\n","authors":["Shuangquan Feng","Junhua Ma","Virginia R. de Sa"],"pdf_url":"https://arxiv.org/pdf/2312.03187v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14790v1","updated":"2024-03-21T19:09:21Z","published":"2024-03-21T19:09:21Z","title":"Latent Diffusion Models for Attribute-Preserving Image Anonymization","summary":" Generative techniques for image anonymization have great potential to\ngenerate datasets that protect the privacy of those depicted in the images,\nwhile achieving high data fidelity and utility. Existing methods have focused\nextensively on preserving facial attributes, but failed to embrace a more\ncomprehensive perspective that incorporates the scene and background into the\nanonymization process. This paper presents, to the best of our knowledge, the\nfirst approach to image anonymization based on Latent Diffusion Models (LDMs).\nEvery element of a scene is maintained to convey the same meaning, yet\nmanipulated in a way that makes re-identification difficult. We propose two\nLDMs for this purpose: CAMOUFLaGE-Base exploits a combination of pre-trained\nControlNets, and a new controlling mechanism designed to increase the distance\nbetween the real and anonymized images. CAMOUFLaGE-Light is based on the\nAdapter technique, coupled with an encoding designed to efficiently represent\nthe attributes of different persons in a scene. 
The former solution achieves\nsuperior performance on most metrics and benchmarks, while the latter cuts the\ninference time in half at the cost of fine-tuning a lightweight module. We show\nthrough extensive experimental comparison that the proposed method is\ncompetitive with the state-of-the-art concerning identity obfuscation whilst\nbetter preserving the original content of the image and tackling unresolved\nchallenges that current solutions fail to address.\n","authors":["Luca Piano","Pietro Basci","Fabrizio Lamberti","Lia Morra"],"pdf_url":"https://arxiv.org/pdf/2403.14790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14789v1","updated":"2024-03-21T19:05:31Z","published":"2024-03-21T19:05:31Z","title":"On the exploitation of DCT statistics for cropping detectors","summary":" {The study of frequency components derived from Discrete Cosine Transform\n(DCT) has been widely used in image analysis. In recent years it has been\nobserved that significant information can be extrapolated from them about the\nlifecycle of the image, but no study has focused on the analysis between them\nand the source resolution of the image. In this work, we investigated a novel\nimage resolution classifier that employs DCT statistics with the goal to detect\nthe original resolution of images; in particular the insight was exploited to\naddress the challenge of identifying cropped images. Training a Machine\nLearning (ML) classifier on entire images (not cropped), the generated model\ncan leverage this information to detect cropping. The results demonstrate the\nclassifier's reliability in distinguishing between cropped and not cropped\nimages, providing a dependable estimation of their original resolution. This\nadvancement has significant implications for image processing applications,\nincluding digital security, authenticity verification, and visual quality\nanalysis, by offering a new tool for detecting image manipulations and\nenhancing qualitative image assessment. This work opens new perspectives in the\nfield, with potential to transform image analysis and usage across multiple\ndomains.}\n","authors":["Claudio Vittorio Ragaglia","Francesco Guarnera","Sebastiano Battiato"],"pdf_url":"https://arxiv.org/pdf/2403.14789v1.pdf","comment":"8 pages, 3 figures, conference"},{"id":"http://arxiv.org/abs/2401.10224v2","updated":"2024-03-21T18:59:50Z","published":"2024-01-18T18:59:09Z","title":"The Manga Whisperer: Automatically Generating Transcriptions for Comics","summary":" In the past few decades, Japanese comics, commonly referred to as Manga, have\ntranscended both cultural and linguistic boundaries to become a true worldwide\nsensation. Yet, the inherent reliance on visual cues and illustration within\nmanga renders it largely inaccessible to individuals with visual impairments.\nIn this work, we seek to address this substantial barrier, with the aim of\nensuring that manga can be appreciated and actively engaged by everyone.\nSpecifically, we tackle the problem of diarisation i.e. 
generating a\ntranscription of who said what and when, in a fully automatic way.\n To this end, we make the following contributions: (1) we present a unified\nmodel, Magi, that is able to (a) detect panels, text boxes and character boxes,\n(b) cluster characters by identity (without knowing the number of clusters\na priori), and (c) associate dialogues to their speakers; (2) we propose a novel\napproach that is able to sort the detected text boxes in their reading order\nand generate a dialogue transcript; (3) we annotate an evaluation benchmark for\nthis task using publicly available [English] manga pages. The code, evaluation\ndatasets and the pre-trained model can be found at:\nhttps://github.com/ragavsachdeva/magi.\n","authors":["Ragav Sachdeva","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2401.10224v2.pdf","comment":"Accepted at CVPR'24"},{"id":"http://arxiv.org/abs/2403.14783v1","updated":"2024-03-21T18:57:25Z","published":"2024-03-21T18:57:25Z","title":"Multi-Agent VQA: Exploring Multi-Agent Foundation Models in Zero-Shot\n Visual Question Answering","summary":" This work explores the zero-shot capabilities of foundation models in Visual\nQuestion Answering (VQA) tasks. We propose an adaptive multi-agent system,\nnamed Multi-Agent VQA, to overcome the limitations of foundation models in\nobject detection and counting by using specialized agents as tools. Unlike\nexisting approaches, our study focuses on the system's performance without\nfine-tuning it on specific VQA datasets, making it more practical and robust in\nthe open world. We present preliminary experimental results under zero-shot\nscenarios and highlight some failure cases, offering new directions for future\nresearch.\n","authors":["Bowen Jiang","Zhijun Zhuang","Shreyas S. Shivakumar","Dan Roth","Camillo J. Taylor"],"pdf_url":"https://arxiv.org/pdf/2403.14783v1.pdf","comment":"A full version of the paper will be released soon. The codes are\n available at https://github.com/bowen-upenn/Multi-Agent-VQA"},{"id":"http://arxiv.org/abs/2403.14781v1","updated":"2024-03-21T18:52:58Z","published":"2024-03-21T18:52:58Z","title":"Champ: Controllable and Consistent Human Image Animation with 3D\n Parametric Guidance","summary":" In this study, we introduce a methodology for human image animation by\nleveraging a 3D human parametric model within a latent diffusion framework to\nenhance shape alignment and motion guidance in current human generative\ntechniques. The methodology utilizes the SMPL (Skinned Multi-Person Linear)\nmodel as the 3D human parametric model to establish a unified representation of\nbody shape and pose. This facilitates the accurate capture of intricate human\ngeometry and motion characteristics from source videos. Specifically, we\nincorporate rendered depth images, normal maps, and semantic maps obtained from\nSMPL sequences, alongside skeleton-based motion guidance, to enrich the\nconditions to the latent diffusion model with comprehensive 3D shape and\ndetailed pose attributes. A multi-layer motion fusion module, integrating\nself-attention mechanisms, is employed to fuse the shape and motion latent\nrepresentations in the spatial domain. 
By representing the 3D human parametric\nmodel as the motion guidance, we can perform parametric shape alignment of the\nhuman body between the reference image and the source video motion.\nExperimental evaluations conducted on benchmark datasets demonstrate the\nmethodology's superior ability to generate high-quality human animations that\naccurately capture both pose and shape variations. Furthermore, our approach\nalso exhibits superior generalization capabilities on the proposed wild\ndataset. Project page: https://fudan-generative-vision.github.io/champ.\n","authors":["Shenhao Zhu","Junming Leo Chen","Zuozhuo Dai","Yinghui Xu","Xun Cao","Yao Yao","Hao Zhu","Siyu Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.14781v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14778v1","updated":"2024-03-21T18:49:20Z","published":"2024-03-21T18:49:20Z","title":"Diffusion Attack: Leveraging Stable Diffusion for Naturalistic Image\n Attacking","summary":" In Virtual Reality (VR), adversarial attacks remain a significant security\nthreat. Most deep learning-based methods for physical and digital adversarial\nattacks focus on enhancing attack performance by crafting adversarial examples\nthat contain large printable distortions that are easy for human observers to\nidentify. However, attackers rarely impose limitations on the naturalness and\ncomfort of the appearance of the generated attack image, resulting in a\nnoticeable and unnatural attack. To address this challenge, we propose a\nframework to incorporate style transfer to craft adversarial inputs of natural\nstyles that exhibit minimal detectability and maximum natural appearance, while\nmaintaining superior attack capabilities.\n","authors":["Qianyu Guo","Jiaming Fu","Yawen Lu","Dongming Gan"],"pdf_url":"https://arxiv.org/pdf/2403.14778v1.pdf","comment":"Accepted to IEEE VRW"},{"id":"http://arxiv.org/abs/2403.14774v1","updated":"2024-03-21T18:28:43Z","published":"2024-03-21T18:28:43Z","title":"Few-Shot Adversarial Prompt Learning on Vision-Language Models","summary":" The vulnerability of deep neural networks to imperceptible adversarial\nperturbations has attracted widespread attention. Inspired by the success of\nvision-language foundation models, previous efforts achieved zero-shot\nadversarial robustness by aligning adversarial visual features with text\nsupervision. However, in practice, they are still unsatisfactory due to several\nissues, including heavy adaptation cost, suboptimal text supervision, and\nuncontrolled natural generalization capacity. In this paper, to address these\nissues, we propose a few-shot adversarial prompt framework where adapting input\nsequences with limited data yields significant adversarial robustness\nimprovements. Specifically, we achieve this by providing adversarially\ncorrelated text supervision that is end-to-end learned from adversarial\nexamples. We also propose a novel training objective that enhances the\nconsistency of multi-modal features while encouraging differentiated uni-modal\nfeatures between natural and adversarial examples. 
The proposed framework enables\nlearning adversarial text supervision, which provides superior\ncross-modal adversarial alignment and matches state-of-the-art zero-shot\nadversarial robustness with only 1% training data.\n","authors":["Yiwei Zhou","Xiaobo Xia","Zhiwei Lin","Bo Han","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2403.14774v1.pdf","comment":"25 pages, 13 tables, 8 figures"},{"id":"http://arxiv.org/abs/2403.14773v1","updated":"2024-03-21T18:27:29Z","published":"2024-03-21T18:27:29Z","title":"StreamingT2V: Consistent, Dynamic, and Extendable Long Video Generation\n from Text","summary":" Text-to-video diffusion models enable the generation of high-quality videos\nthat follow text instructions, making it easy to create diverse and individual\ncontent. However, existing approaches mostly focus on high-quality short video\ngeneration (typically 16 or 24 frames), ending up with hard-cuts when naively\nextended to the case of long video synthesis. To overcome these limitations, we\nintroduce StreamingT2V, an autoregressive approach for long video generation of\n80, 240, 600, 1200 or more frames with smooth transitions. The key components\nare: (i) a short-term memory block called conditional attention module (CAM),\nwhich conditions the current generation on the features extracted from the\nprevious chunk via an attentional mechanism, leading to consistent chunk\ntransitions, (ii) a long-term memory block called appearance preservation\nmodule, which extracts high-level scene and object features from the first\nvideo chunk to prevent the model from forgetting the initial scene, and (iii) a\nrandomized blending approach that enables applying a video enhancer\nautoregressively for infinitely long videos without inconsistencies between\nchunks. Experiments show that StreamingT2V generates a high amount of motion. In\ncontrast, all competing image-to-video methods are prone to video stagnation\nwhen applied naively in an autoregressive manner. Thus, with StreamingT2V we\npropose a high-quality seamless text-to-long video generator that\noutperforms competitors in consistency and motion. Our code will be available\nat: https://github.com/Picsart-AI-Research/StreamingT2V\n","authors":["Roberto Henschel","Levon Khachatryan","Daniil Hayrapetyan","Hayk Poghosyan","Vahram Tadevosyan","Zhangyang Wang","Shant Navasardyan","Humphrey Shi"],"pdf_url":"https://arxiv.org/pdf/2403.14773v1.pdf","comment":"https://github.com/Picsart-AI-Research/StreamingT2V"},{"id":"http://arxiv.org/abs/2403.14772v1","updated":"2024-03-21T18:26:23Z","published":"2024-03-21T18:26:23Z","title":"Improving Robustness to Model Inversion Attacks via Sparse Coding\n Architectures","summary":" Recent model inversion attack algorithms permit adversaries to reconstruct a\nneural network's private training data just by repeatedly querying the network\nand inspecting its outputs. In this work, we develop a novel network\narchitecture that leverages sparse-coding layers to obtain superior robustness\nto this class of attacks. 
Three decades of computer science research has\nstudied sparse coding in the context of image denoising, object recognition,\nand adversarial misclassification settings, but to the best of our knowledge,\nits connection to state-of-the-art privacy vulnerabilities remains unstudied.\nHowever, sparse coding architectures suggest an advantageous means to defend\nagainst model inversion attacks because they allow us to control the amount of\nirrelevant private information encoded in a network's intermediate\nrepresentations in a manner that can be computed efficiently during training\nand that is known to have little effect on classification accuracy.\nSpecifically, compared to networks trained with a variety of state-of-the-art\ndefenses, our sparse-coding architectures maintain comparable or higher\nclassification accuracy while degrading state-of-the-art training data\nreconstructions by factors of 1.1 to 18.3 across a variety of reconstruction\nquality metrics (PSNR, SSIM, FID). This performance advantage holds across 5\ndatasets ranging from CelebA faces to medical images and CIFAR-10, and across\nvarious state-of-the-art SGD-based and GAN-based inversion attacks, including\nPlug-&-Play attacks. We provide a cluster-ready PyTorch codebase to promote\nresearch and standardize defense evaluations.\n","authors":["Sayanton V. Dibbo","Adam Breuer","Juston Moore","Michael Teti"],"pdf_url":"https://arxiv.org/pdf/2403.14772v1.pdf","comment":"32 pages, 15 Tables, and 9 Figures"},{"id":"http://arxiv.org/abs/2403.14760v1","updated":"2024-03-21T18:02:20Z","published":"2024-03-21T18:02:20Z","title":"Can 3D Vision-Language Models Truly Understand Natural Language?","summary":" Rapid advancements in 3D vision-language (3D-VL) tasks have opened up new\navenues for human interaction with embodied agents or robots using natural\nlanguage. Despite this progress, we find a notable limitation: existing 3D-VL\nmodels exhibit sensitivity to the styles of language input, struggling to\nunderstand sentences with the same semantic meaning but written in different\nvariants. This observation raises a critical question: Can 3D vision-language\nmodels truly understand natural language? To test the language\nunderstandability of 3D-VL models, we first propose a language robustness task\nfor systematically assessing 3D-VL models across various tasks, benchmarking\ntheir performance when presented with different language style variants.\nImportantly, these variants are commonly encountered in applications requiring\ndirect interaction with humans, such as embodied robotics, given the diversity\nand unpredictability of human language. We propose a 3D Language Robustness\nDataset, designed based on the characteristics of human language, to facilitate\nthe systematic study of robustness. Our comprehensive evaluation uncovers a\nsignificant drop in the performance of all existing models across various 3D-VL\ntasks. Even the state-of-the-art 3D-LLM fails to understand some variants of\nthe same sentences. Further in-depth analysis suggests that the existing models\nhave a fragile and biased fusion module, which stems from the low diversity of\nthe existing dataset. Finally, we propose a training-free module driven by LLM,\nwhich improves language robustness. 
Datasets and code will be available at\ngithub.\n","authors":["Weipeng Deng","Runyu Ding","Jihan Yang","Jiahui Liu","Yijiang Li","Xiaojuan Qi","Edith Ngai"],"pdf_url":"https://arxiv.org/pdf/2403.14760v1.pdf","comment":"https://github.com/VincentDENGP/3D-LR"},{"id":"http://arxiv.org/abs/2403.14743v1","updated":"2024-03-21T18:00:00Z","published":"2024-03-21T18:00:00Z","title":"VURF: A General-purpose Reasoning and Self-refinement Framework for\n Video Understanding","summary":" Recent studies have demonstrated the effectiveness of Large Language Models\n(LLMs) as reasoning modules that can deconstruct complex tasks into more\nmanageable sub-tasks, particularly when applied to visual reasoning tasks for\nimages. In contrast, this paper introduces a Video Understanding and Reasoning\nFramework (VURF) based on the reasoning power of LLMs. Ours is a novel approach\nto extend the utility of LLMs in the context of video tasks, leveraging their\ncapacity to generalize from minimal input and output demonstrations within a\ncontextual framework. By presenting LLMs with pairs of instructions and their\ncorresponding high-level programs, we harness their contextual learning\ncapabilities to generate executable visual programs for video understanding. To\nenhance program's accuracy and robustness, we implement two important\nstrategies. Firstly, we employ a feedback-generation approach, powered by\nGPT-3.5, to rectify errors in programs utilizing unsupported functions.\nSecondly, taking motivation from recent works on self refinement of LLM\noutputs, we introduce an iterative procedure for improving the quality of the\nin-context examples by aligning the initial outputs to the outputs that would\nhave been generated had the LLM not been bound by the structure of the\nin-context examples. Our results on several video-specific tasks, including\nvisual QA, video anticipation, pose estimation and multi-video QA illustrate\nthe efficacy of these enhancements in improving the performance of visual\nprogramming approaches for video tasks. Our Codes and data will be publicly\nreleased.\n","authors":["Ahmad Mahmood","Ashmal Vayani","Muzammal Naseer","Salman Khan","Fahad Khan"],"pdf_url":"https://arxiv.org/pdf/2403.14743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14729v1","updated":"2024-03-21T02:33:37Z","published":"2024-03-21T02:33:37Z","title":"Auto-Train-Once: Controller Network Guided Automatic Network Pruning\n from Scratch","summary":" Current techniques for deep neural network (DNN) pruning often involve\nintricate multi-step processes that require domain-specific expertise, making\ntheir widespread adoption challenging. To address the limitation, the\nOnly-Train-Once (OTO) and OTOv2 are proposed to eliminate the need for\nadditional fine-tuning steps by directly training and compressing a general DNN\nfrom scratch. Nevertheless, the static design of optimizers (in OTO) can lead\nto convergence issues of local optima. In this paper, we proposed the\nAuto-Train-Once (ATO), an innovative network pruning algorithm designed to\nautomatically reduce the computational and storage costs of DNNs. During the\nmodel training phase, our approach not only trains the target model but also\nleverages a controller network as an architecture generator to guide the\nlearning of target model weights. Furthermore, we developed a novel stochastic\ngradient algorithm that enhances the coordination between model training and\ncontroller network training, thereby improving pruning performance. 
We provide\na comprehensive convergence analysis as well as extensive experiments, and the\nresults show that our approach achieves state-of-the-art performance across\nvarious model architectures (including ResNet18, ResNet34, ResNet50, ResNet56,\nand MobileNetv2) on standard benchmark datasets (CIFAR-10, CIFAR-100, and\nImageNet).\n","authors":["Xidong Wu","Shangqian Gao","Zeyu Zhang","Zhenzhen Li","Runxue Bao","Yanfu Zhang","Xiaoqian Wang","Heng Huang"],"pdf_url":"https://arxiv.org/pdf/2403.14729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15497v1","updated":"2024-03-21T18:31:47Z","published":"2024-03-21T18:31:47Z","title":"On the Detection of Anomalous or Out-Of-Distribution Data in Vision\n Models Using Statistical Techniques","summary":" Out-of-distribution data and anomalous inputs are vulnerabilities of machine\nlearning systems today, often causing systems to make incorrect predictions.\nThe diverse range of data on which these models are used makes detecting\natypical inputs a difficult and important task. We assess a tool, Benford's\nlaw, as a method used to quantify the difference between real and corrupted\ninputs. We believe that in many settings, it could function as a filter for\nanomalous data points and for signalling out-of-distribution data. We hope to\nopen a discussion on these applications and further areas where this technique\nis underexplored.\n","authors":["Laura O'Mahony","David JP O'Sullivan","Nikola S. Nikolov"],"pdf_url":"https://arxiv.org/pdf/2403.15497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12167v2","updated":"2024-03-21T16:38:33Z","published":"2024-03-18T18:35:32Z","title":"Generalizing deep learning models for medical image classification","summary":" Numerous Deep Learning (DL) models have been developed for a large spectrum\nof medical image analysis applications, which promises to reshape various\nfacets of medical practice. Despite early advances in DL model validation and\nimplementation, which encourage healthcare institutions to adopt them, some\nfundamental questions remain: are the DL models capable of generalizing? What\ncauses a drop in DL model performances? How to overcome the DL model\nperformance drop? Medical data are dynamic and prone to domain shift, due to\nmultiple factors such as updates to medical equipment, new imaging workflow,\nand shifts in patient demographics or populations can induce this drift over\ntime. In this paper, we review recent developments in generalization methods\nfor DL-based classification models. We also discuss future challenges,\nincluding the need for improved evaluation protocols and benchmarks, and\nenvisioned future developments to achieve robust, generalized models for\nmedical image classification.\n","authors":["Sarah Matta","Mathieu Lamard","Philippe Zhang","Alexandre Le Guilcher","Laurent Borderie","Béatrice Cochener","Gwenolé Quellec"],"pdf_url":"https://arxiv.org/pdf/2403.12167v2.pdf","comment":null}]},"2024-03-22T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.15389v1","updated":"2024-03-22T17:59:58Z","published":"2024-03-22T17:59:58Z","title":"DiffusionMTL: Learning Multi-Task Denoising Diffusion Model from\n Partially Annotated Data","summary":" Recently, there has been an increased interest in the practical problem of\nlearning multiple dense scene understanding tasks from partially annotated\ndata, where each training sample is only labeled for a subset of the tasks. 
The\nmissing of task labels in training leads to low-quality and noisy predictions,\nas can be observed from state-of-the-art methods. To tackle this issue, we\nreformulate the partially-labeled multi-task dense prediction as a pixel-level\ndenoising problem, and propose a novel multi-task denoising diffusion framework\ncoined as DiffusionMTL. It designs a joint diffusion and denoising paradigm to\nmodel a potential noisy distribution in the task prediction or feature maps and\ngenerate rectified outputs for different tasks. To exploit multi-task\nconsistency in denoising, we further introduce a Multi-Task Conditioning\nstrategy, which can implicitly utilize the complementary nature of the tasks to\nhelp learn the unlabeled tasks, leading to an improvement in the denoising\nperformance of the different tasks. Extensive quantitative and qualitative\nexperiments demonstrate that the proposed multi-task denoising diffusion model\ncan significantly improve multi-task prediction maps, and outperform the\nstate-of-the-art methods on three challenging multi-task benchmarks, under two\ndifferent partial-labeling evaluation settings. The code is available at\nhttps://prismformore.github.io/diffusionmtl/.\n","authors":["Hanrong Ye","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2403.15389v1.pdf","comment":"The paper is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.15388v1","updated":"2024-03-22T17:59:52Z","published":"2024-03-22T17:59:52Z","title":"LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal\n Models","summary":" Large Multimodal Models (LMMs) have shown significant reasoning capabilities\nby connecting a visual encoder and a large language model. LMMs typically use a\nfixed amount of visual tokens, such as the penultimate layer features in the\nCLIP visual encoder, as the prefix content. Recent LMMs incorporate more\ncomplex visual inputs, such as high-resolution images and videos, which\nincrease the number of visual tokens significantly. However, due to the design\nof the Transformer architecture, computational costs associated with these\nmodels tend to increase quadratically with the number of input tokens. To\ntackle this problem, we explore a token reduction mechanism and find, similar\nto prior work, that many visual tokens are spatially redundant. Based on this,\nwe propose PruMerge, a novel adaptive visual token reduction approach, which\nlargely reduces the number of visual tokens while maintaining comparable model\nperformance. We first select the unpruned visual tokens based on their\nsimilarity to class tokens and spatial tokens. We then cluster the pruned\ntokens based on key similarity and merge the clustered tokens with the unpruned\ntokens to supplement their information. Empirically, when applied to LLaVA-1.5,\nour approach can compress the visual tokens by 14.4 times on average, and\nachieve comparable performance across diverse visual question-answering and\nreasoning tasks. 
Code and checkpoints are at https://llava-prumerge.github.io/.\n","authors":["Yuzhang Shang","Mu Cai","Bingxin Xu","Yong Jae Lee","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2403.15388v1.pdf","comment":"Project page: https://llava-prumerge.github.io/"},{"id":"http://arxiv.org/abs/2403.15385v1","updated":"2024-03-22T17:59:37Z","published":"2024-03-22T17:59:37Z","title":"LATTE3D: Large-scale Amortized Text-To-Enhanced3D Synthesis","summary":" Recent text-to-3D generation approaches produce impressive 3D results but\nrequire time-consuming optimization that can take up to an hour per prompt.\nAmortized methods like ATT3D optimize multiple prompts simultaneously to\nimprove efficiency, enabling fast text-to-3D synthesis. However, they cannot\ncapture high-frequency geometry and texture details and struggle to scale to\nlarge prompt sets, so they generalize poorly. We introduce LATTE3D, addressing\nthese limitations to achieve fast, high-quality generation on a significantly\nlarger prompt set. Key to our method is 1) building a scalable architecture and\n2) leveraging 3D data during optimization through 3D-aware diffusion priors,\nshape regularization, and model initialization to achieve robustness to diverse\nand complex training prompts. LATTE3D amortizes both neural field and textured\nsurface generation to produce highly detailed textured meshes in a single\nforward pass. LATTE3D generates 3D objects in 400ms, and can be further\nenhanced with fast test-time optimization.\n","authors":["Kevin Xie","Jonathan Lorraine","Tianshi Cao","Jun Gao","James Lucas","Antonio Torralba","Sanja Fidler","Xiaohui Zeng"],"pdf_url":"https://arxiv.org/pdf/2403.15385v1.pdf","comment":"See the project website at\n https://research.nvidia.com/labs/toronto-ai/LATTE3D/"},{"id":"http://arxiv.org/abs/2312.10070v2","updated":"2024-03-22T17:59:09Z","published":"2023-12-06T10:47:53Z","title":"Gaussian-SLAM: Photo-realistic Dense SLAM with Gaussian Splatting","summary":" We present a dense simultaneous localization and mapping (SLAM) method that\nuses 3D Gaussians as a scene representation. Our approach enables\ninteractive-time reconstruction and photo-realistic rendering from real-world\nsingle-camera RGBD videos. To this end, we propose a novel effective strategy\nfor seeding new Gaussians for newly explored areas and their effective online\noptimization that is independent of the scene size and thus scalable to larger\nscenes. This is achieved by organizing the scene into sub-maps which are\nindependently optimized and do not need to be kept in memory. We further\naccomplish frame-to-model camera tracking by minimizing photometric and\ngeometric losses between the input and rendered frames. The Gaussian\nrepresentation allows for high-quality photo-realistic real-time rendering of\nreal-world scenes. Evaluation on synthetic and real-world datasets demonstrates\ncompetitive or superior performance in mapping, tracking, and rendering\ncompared to existing neural dense SLAM methods.\n","authors":["Vladimir Yugay","Yue Li","Theo Gevers","Martin R. Oswald"],"pdf_url":"https://arxiv.org/pdf/2312.10070v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15383v1","updated":"2024-03-22T17:59:01Z","published":"2024-03-22T17:59:01Z","title":"ThemeStation: Generating Theme-Aware 3D Assets from Few Exemplars","summary":" Real-world applications often require a large gallery of 3D assets that share\na consistent theme. 
While remarkable advances have been made in general 3D\ncontent creation from text or image, synthesizing customized 3D assets\nfollowing the shared theme of input 3D exemplars remains an open and\nchallenging problem. In this work, we present ThemeStation, a novel approach\nfor theme-aware 3D-to-3D generation. ThemeStation synthesizes customized 3D\nassets based on given few exemplars with two goals: 1) unity for generating 3D\nassets that thematically align with the given exemplars and 2) diversity for\ngenerating 3D assets with a high degree of variations. To this end, we design a\ntwo-stage framework that draws a concept image first, followed by a\nreference-informed 3D modeling stage. We propose a novel dual score\ndistillation (DSD) loss to jointly leverage priors from both the input\nexemplars and the synthesized concept image. Extensive experiments and user\nstudies confirm that ThemeStation surpasses prior works in producing diverse\ntheme-aware 3D models with impressive quality. ThemeStation also enables\nvarious applications such as controllable 3D-to-3D generation.\n","authors":["Zhenwei Wang","Tengfei Wang","Gerhard Hancke","Ziwei Liu","Rynson W. H. Lau"],"pdf_url":"https://arxiv.org/pdf/2403.15383v1.pdf","comment":"Project page: https://3dthemestation.github.io/"},{"id":"http://arxiv.org/abs/2403.15382v1","updated":"2024-03-22T17:58:59Z","published":"2024-03-22T17:58:59Z","title":"DragAPart: Learning a Part-Level Motion Prior for Articulated Objects","summary":" We introduce DragAPart, a method that, given an image and a set of drags as\ninput, can generate a new image of the same object in a new state, compatible\nwith the action of the drags. Differently from prior works that focused on\nrepositioning objects, DragAPart predicts part-level interactions, such as\nopening and closing a drawer. We study this problem as a proxy for learning a\ngeneralist motion model, not restricted to a specific kinematic structure or\nobject category. To this end, we start from a pre-trained image generator and\nfine-tune it on a new synthetic dataset, Drag-a-Move, which we introduce.\nCombined with a new encoding for the drags and dataset randomization, the new\nmodel generalizes well to real images and different categories. Compared to\nprior motion-controlled generators, we demonstrate much better part-level\nmotion understanding.\n","authors":["Ruining Li","Chuanxia Zheng","Christian Rupprecht","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2403.15382v1.pdf","comment":"Project page: https://dragapart.github.io/"},{"id":"http://arxiv.org/abs/2403.15378v1","updated":"2024-03-22T17:58:16Z","published":"2024-03-22T17:58:16Z","title":"Long-CLIP: Unlocking the Long-Text Capability of CLIP","summary":" Contrastive Language-Image Pre-training (CLIP) has been the cornerstone for\nzero-shot classification, text-image retrieval, and text-image generation by\naligning image and text modalities. Despite its widespread adoption, a\nsignificant limitation of CLIP lies in the inadequate length of text input. The\nlength of the text token is restricted to 77, and an empirical study shows the\nactual effective length is even less than 20. This prevents CLIP from handling\ndetailed descriptions, limiting its applications for image retrieval and\ntext-to-image generation with extensive prerequisites. 
To this end, we propose\nLong-CLIP as a plug-and-play alternative to CLIP that supports long-text input,\nretains or even surpasses its zero-shot generalizability, and aligns the CLIP\nlatent space, making it readily replace CLIP without any further adaptation in\ndownstream frameworks. Nevertheless, achieving this goal is far from\nstraightforward, as simplistic fine-tuning can result in a significant\ndegradation of CLIP's performance. Moreover, substituting the text encoder with\na language model supporting longer contexts necessitates pretraining with vast\namounts of data, incurring significant expenses. Accordingly, Long-CLIP\nintroduces an efficient fine-tuning solution on CLIP with two novel strategies\ndesigned to maintain the original capabilities, including (1) a\nknowledge-preserved stretching of positional embedding and (2) a primary\ncomponent matching of CLIP features. With leveraging just one million extra\nlong text-image pairs, Long-CLIP has shown the superiority to CLIP for about\n20% in long caption text-image retrieval and 6% in traditional text-image\nretrieval tasks, e.g., COCO and Flickr30k. Furthermore, Long-CLIP offers\nenhanced capabilities for generating images from detailed text descriptions by\nreplacing CLIP in a plug-and-play manner.\n","authors":["Beichen Zhang","Pan Zhang","Xiaoyi Dong","Yuhang Zang","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2403.15378v1.pdf","comment":"All codes and models are publicly available at\n https://github.com/beichenzbc/Long-CLIP"},{"id":"http://arxiv.org/abs/2403.15377v1","updated":"2024-03-22T17:57:42Z","published":"2024-03-22T17:57:42Z","title":"InternVideo2: Scaling Video Foundation Models for Multimodal Video\n Understanding","summary":" We introduce InternVideo2, a new video foundation model (ViFM) that achieves\nthe state-of-the-art performance in action recognition, video-text tasks, and\nvideo-centric dialogue. Our approach employs a progressive training paradigm\nthat unifies the different self- or weakly-supervised learning frameworks of\nmasked video token reconstruction, cross-modal contrastive learning, and next\ntoken prediction. Different training stages would guide our model to capture\ndifferent levels of structure and semantic information through different\npretext tasks. At the data level, we prioritize the spatiotemporal consistency\nby semantically segmenting videos and generating video-audio-speech captions.\nThis improves the alignment between video and text. We scale both data and\nmodel size for our InternVideo2. Through extensive experiments, we validate our\ndesigns and demonstrate the state-of-the-art performance on over 60 video and\naudio tasks. Notably, our model outperforms others on various video-related\ncaptioning, dialogue, and long video understanding benchmarks, highlighting its\nability to reason and comprehend long temporal contexts. 
Code and models are\navailable at https://github.com/OpenGVLab/InternVideo2/.\n","authors":["Yi Wang","Kunchang Li","Xinhao Li","Jiashuo Yu","Yinan He","Guo Chen","Baoqi Pei","Rongkun Zheng","Jilan Xu","Zun Wang","Yansong Shi","Tianxiang Jiang","Songze Li","Hongjie Zhang","Yifei Huang","Yu Qiao","Yali Wang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.15377v1.pdf","comment":"a technical report about video understanding"},{"id":"http://arxiv.org/abs/2403.15370v1","updated":"2024-03-22T17:49:11Z","published":"2024-03-22T17:49:11Z","title":"Augmented Reality based Simulated Data (ARSim) with multi-view\n consistency for AV perception networks","summary":" Detecting a diverse range of objects under various driving scenarios is\nessential for the effectiveness of autonomous driving systems. However, the\nreal-world data collected often lacks the necessary diversity presenting a\nlong-tail distribution. Although synthetic data has been utilized to overcome\nthis issue by generating virtual scenes, it faces hurdles such as a significant\ndomain gap and the substantial efforts required from 3D artists to create\nrealistic environments. To overcome these challenges, we present ARSim, a fully\nautomated, comprehensive, modular framework designed to enhance real multi-view\nimage data with 3D synthetic objects of interest. The proposed method\nintegrates domain adaptation and randomization strategies to address covariate\nshift between real and simulated data by inferring essential domain attributes\nfrom real data and employing simulation-based randomization for other\nattributes. We construct a simplified virtual scene using real data and\nstrategically place 3D synthetic assets within it. Illumination is achieved by\nestimating light distribution from multiple images capturing the surroundings\nof the vehicle. Camera parameters from real data are employed to render\nsynthetic assets in each frame. The resulting augmented multi-view consistent\ndataset is used to train a multi-camera perception network for autonomous\nvehicles. Experimental results on various AV perception tasks demonstrate the\nsuperior performance of networks trained on the augmented dataset.\n","authors":["Aqeel Anwar","Tae Eun Choe","Zian Wang","Sanja Fidler","Minwoo Park"],"pdf_url":"https://arxiv.org/pdf/2403.15370v1.pdf","comment":"17 pages, 15 figures, 7 tables"},{"id":"http://arxiv.org/abs/2403.14617v2","updated":"2024-03-22T17:45:52Z","published":"2024-03-21T17:59:03Z","title":"Videoshop: Localized Semantic Video Editing with Noise-Extrapolated\n Diffusion Inversion","summary":" We introduce Videoshop, a training-free video editing algorithm for localized\nsemantic edits. Videoshop allows users to use any editing software, including\nPhotoshop and generative inpainting, to modify the first frame; it\nautomatically propagates those changes, with semantic, spatial, and temporally\nconsistent motion, to the remaining frames. Unlike existing methods that enable\nedits only through imprecise textual instructions, Videoshop allows users to\nadd or remove objects, semantically change objects, insert stock photos into\nvideos, etc. with fine-grained control over locations and appearance. 
We\nachieve this through image-based video editing by inverting latents with noise\nextrapolation, from which we generate videos conditioned on the edited image.\nVideoshop produces higher quality edits against 6 baselines on 2 editing\nbenchmarks using 10 evaluation metrics.\n","authors":["Xiang Fan","Anand Bhattad","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2403.14617v2.pdf","comment":"Project page at https://videoshop-editing.github.io/"},{"id":"http://arxiv.org/abs/2403.15361v1","updated":"2024-03-22T17:23:37Z","published":"2024-03-22T17:23:37Z","title":"Learning Topological Representations for Deep Image Understanding","summary":" In many scenarios, especially biomedical applications, the correct\ndelineation of complex fine-scaled structures such as neurons, tissues, and\nvessels is critical for downstream analysis. Despite the strong predictive\npower of deep learning methods, they do not provide a satisfactory\nrepresentation of these structures, thus creating significant barriers in\nscalable annotation and downstream analysis. In this dissertation, we tackle\nsuch challenges by proposing novel representations of these topological\nstructures in a deep learning framework. We leverage the mathematical tools\nfrom topological data analysis, i.e., persistent homology and discrete Morse\ntheory, to develop principled methods for better segmentation and uncertainty\nestimation, which will become powerful tools for scalable annotation.\n","authors":["Xiaoling Hu"],"pdf_url":"https://arxiv.org/pdf/2403.15361v1.pdf","comment":"Ph.D. thesis from Stony Brook University. This thesis includes works\n arXiv:1906.05404, arXiv:2110.08335, arXiv:2112.07812, arXiv:2103.09992,\n arXiv:2206.01742"},{"id":"http://arxiv.org/abs/2403.15360v1","updated":"2024-03-22T17:22:56Z","published":"2024-03-22T17:22:56Z","title":"SiMBA: Simplified Mamba-Based Architecture for Vision and Multivariate\n Time series","summary":" Transformers have widely adopted attention networks for sequence mixing and\nMLPs for channel mixing, playing a pivotal role in achieving breakthroughs\nacross domains. However, recent literature highlights issues with attention\nnetworks, including low inductive bias and quadratic complexity concerning\ninput sequence length. State Space Models (SSMs) like S4 and others (Hippo,\nGlobal Convolutions, liquid S4, LRU, Mega, and Mamba), have emerged to address\nthe above issues to help handle longer sequence lengths. Mamba, while being the\nstate-of-the-art SSM, has a stability issue when scaled to large networks for\ncomputer vision datasets. We propose SiMBA, a new architecture that introduces\nEinstein FFT (EinFFT) for channel modeling by specific eigenvalue computations\nand uses the Mamba block for sequence modeling. Extensive performance studies\nacross image and time-series benchmarks demonstrate that SiMBA outperforms\nexisting SSMs, bridging the performance gap with state-of-the-art transformers.\nNotably, SiMBA establishes itself as the new state-of-the-art SSM on ImageNet\nand transfer learning benchmarks such as Stanford Car and Flower as well as\ntask learning benchmarks as well as seven time series benchmark datasets. The\nproject page is available on this website\n~\\url{https://github.com/badripatro/Simba}.\n","authors":["Badri N. Patro","Vijay S. 
Agneeswaran"],"pdf_url":"https://arxiv.org/pdf/2403.15360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15356v1","updated":"2024-03-22T17:11:47Z","published":"2024-03-22T17:11:47Z","title":"Neural Plasticity-Inspired Foundation Model for Observing the Earth\n Crossing Modalities","summary":" The development of foundation models has revolutionized our ability to\ninterpret the Earth's surface using satellite observational data. Traditional\nmodels have been siloed, tailored to specific sensors or data types like\noptical, radar, and hyperspectral, each with its own unique characteristics.\nThis specialization hinders the potential for a holistic analysis that could\nbenefit from the combined strengths of these diverse data sources. Our novel\napproach introduces the Dynamic One-For-All (DOFA) model, leveraging the\nconcept of neural plasticity in brain science to integrate various data\nmodalities into a single framework adaptively. This dynamic hypernetwork,\nadjusting to different wavelengths, enables a single versatile Transformer\njointly trained on data from five sensors to excel across 12 distinct Earth\nobservation tasks, including sensors never seen during pretraining. DOFA's\ninnovative design offers a promising leap towards more accurate, efficient, and\nunified Earth observation analysis, showcasing remarkable adaptability and\nperformance in harnessing the potential of multimodal Earth observation data.\n","authors":["Zhitong Xiong","Yi Wang","Fahong Zhang","Adam J. Stewart","Joëlle Hanna","Damian Borth","Ioannis Papoutsis","Bertrand Le Saux","Gustau Camps-Valls","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.15356v1.pdf","comment":"33 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.15353v1","updated":"2024-03-22T17:08:03Z","published":"2024-03-22T17:08:03Z","title":"Fully automated workflow for the design of patient-specific orthopaedic\n implants: application to total knee arthroplasty","summary":" Arthroplasty is commonly performed to treat joint osteoarthritis, reducing\npain and improving mobility. While arthroplasty has known several technical\nimprovements, a significant share of patients are still unsatisfied with their\nsurgery. Personalised arthroplasty improves surgical outcomes however current\nsolutions require delays, making it difficult to integrate in clinical routine.\nWe propose a fully automated workflow to design patient-specific implants,\npresented for total knee arthroplasty, the most widely performed arthroplasty\nin the world nowadays.\n The proposed pipeline first uses artificial neural networks to segment the\nproximal and distal extremities of the femur and tibia. Then the full bones are\nreconstructed using augmented statistical shape models, combining shape and\nlandmarks information. Finally, 77 morphological parameters are computed to\ndesign patient-specific implants. The developed workflow has been trained using\n91 CT scans of lower limb and evaluated on 41 CT scans manually segmented, in\nterms of accuracy and execution time.\n The workflow accuracy was $0.4\\pm0.2mm$ for the segmentation, $1.2\\pm0.4mm$\nfor the full bones reconstruction, and $2.8\\pm2.2mm$ for the anatomical\nlandmarks determination. The custom implants fitted the patients' anatomy with\n$0.6\\pm0.2mm$ accuracy. 
The whole process from segmentation to implants' design\nlasted about 5 minutes.\n The proposed workflow allows for a fast and reliable personalisation of knee\nimplants, directly from the patient CT image without requiring any manual\nintervention. It establishes a patient-specific pre-operative planning for TKA\nin a very short time making it easily available for all patients. Combined with\nefficient implant manufacturing techniques, this solution could help answer the\ngrowing number of arthroplasties while reducing complications and improving the\npatients' satisfaction.\n","authors":["Aziliz Guezou-Philippe","Arnaud Clavé","Ehouarn Maguet","Ludivine Maintier","Charles Garraud","Jean-Rassaire Fouefack","Valérie Burdin","Eric Stindel","Guillaume Dardenne"],"pdf_url":"https://arxiv.org/pdf/2403.15353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14125v3","updated":"2024-03-22T17:06:53Z","published":"2023-12-21T18:46:41Z","title":"VideoPoet: A Large Language Model for Zero-Shot Video Generation","summary":" We present VideoPoet, a language model capable of synthesizing high-quality\nvideo, with matching audio, from a large variety of conditioning signals.\nVideoPoet employs a decoder-only transformer architecture that processes\nmultimodal inputs -- including images, videos, text, and audio. The training\nprotocol follows that of Large Language Models (LLMs), consisting of two\nstages: pretraining and task-specific adaptation. During pretraining, VideoPoet\nincorporates a mixture of multimodal generative objectives within an\nautoregressive Transformer framework. The pretrained LLM serves as a foundation\nthat can be adapted for a range of video generation tasks. We present empirical\nresults demonstrating the model's state-of-the-art capabilities in zero-shot\nvideo generation, specifically highlighting VideoPoet's ability to generate\nhigh-fidelity motions. Project page: http://sites.research.google/videopoet/\n","authors":["Dan Kondratyuk","Lijun Yu","Xiuye Gu","José Lezama","Jonathan Huang","Grant Schindler","Rachel Hornung","Vighnesh Birodkar","Jimmy Yan","Ming-Chang Chiu","Krishna Somandepalli","Hassan Akbari","Yair Alon","Yong Cheng","Josh Dillon","Agrim Gupta","Meera Hahn","Anja Hauth","David Hendon","Alonso Martinez","David Minnen","Mikhail Sirotenko","Kihyuk Sohn","Xuan Yang","Hartwig Adam","Ming-Hsuan Yang","Irfan Essa","Huisheng Wang","David A. Ross","Bryan Seybold","Lu Jiang"],"pdf_url":"https://arxiv.org/pdf/2312.14125v3.pdf","comment":"Project page: http://sites.research.google/videopoet/"},{"id":"http://arxiv.org/abs/2403.09611v3","updated":"2024-03-22T17:03:16Z","published":"2024-03-14T17:51:32Z","title":"MM1: Methods, Analysis & Insights from Multimodal LLM Pre-training","summary":" In this work, we discuss building performant Multimodal Large Language Models\n(MLLMs). In particular, we study the importance of various architecture\ncomponents and data choices. Through careful and comprehensive ablations of the\nimage encoder, the vision language connector, and various pre-training data\nchoices, we identified several crucial design lessons. For example, we\ndemonstrate that for large-scale multimodal pre-training using a careful mix of\nimage-caption, interleaved image-text, and text-only data is crucial for\nachieving state-of-the-art (SOTA) few-shot results across multiple benchmarks,\ncompared to other published pre-training results. 
Further, we show that the\nimage encoder together with image resolution and the image token count has\nsubstantial impact, while the vision-language connector design is of\ncomparatively negligible importance. By scaling up the presented recipe, we\nbuild MM1, a family of multimodal models up to 30B parameters, including both\ndense models and mixture-of-experts (MoE) variants, that are SOTA in\npre-training metrics and achieve competitive performance after supervised\nfine-tuning on a range of established multimodal benchmarks. Thanks to\nlarge-scale pre-training, MM1 enjoys appealing properties such as enhanced\nin-context learning, and multi-image reasoning, enabling few-shot\nchain-of-thought prompting.\n","authors":["Brandon McKinzie","Zhe Gan","Jean-Philippe Fauconnier","Sam Dodge","Bowen Zhang","Philipp Dufter","Dhruti Shah","Xianzhi Du","Futang Peng","Floris Weers","Anton Belyi","Haotian Zhang","Karanjeet Singh","Doug Kang","Ankur Jain","Hongyu Hè","Max Schwarzer","Tom Gunter","Xiang Kong","Aonan Zhang","Jianyu Wang","Chong Wang","Nan Du","Tao Lei","Sam Wiseman","Guoli Yin","Mark Lee","Zirui Wang","Ruoming Pang","Peter Grasch","Alexander Toshev","Yinfei Yang"],"pdf_url":"https://arxiv.org/pdf/2403.09611v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10115v2","updated":"2024-03-22T16:46:36Z","published":"2023-12-15T09:57:21Z","title":"SkySense: A Multi-Modal Remote Sensing Foundation Model Towards\n Universal Interpretation for Earth Observation Imagery","summary":" Prior studies on Remote Sensing Foundation Model (RSFM) reveal immense\npotential towards a generic model for Earth Observation. Nevertheless, these\nworks primarily focus on a single modality without temporal and geo-context\nmodeling, hampering their capabilities for diverse tasks. In this study, we\npresent SkySense, a generic billion-scale model, pre-trained on a curated\nmulti-modal Remote Sensing Imagery (RSI) dataset with 21.5 million temporal\nsequences. SkySense incorporates a factorized multi-modal spatiotemporal\nencoder taking temporal sequences of optical and Synthetic Aperture Radar (SAR)\ndata as input. This encoder is pre-trained by our proposed Multi-Granularity\nContrastive Learning to learn representations across different modal and\nspatial granularities. To further enhance the RSI representations by the\ngeo-context clue, we introduce Geo-Context Prototype Learning to learn\nregion-aware prototypes upon RSI's multi-modal spatiotemporal features. To our\nbest knowledge, SkySense is the largest Multi-Modal RSFM to date, whose modules\ncan be flexibly combined or used individually to accommodate various tasks. It\ndemonstrates remarkable generalization capabilities on a thorough evaluation\nencompassing 16 datasets over 7 tasks, from single- to multi-modal, static to\ntemporal, and classification to localization. SkySense surpasses 18 recent\nRSFMs in all test scenarios. Specifically, it outperforms the latest models\nsuch as GFM, SatLas and Scale-MAE by a large margin, i.e., 2.76%, 3.67% and\n3.61% on average respectively. 
We will release the pre-trained weights to\nfacilitate future research and Earth Observation applications.\n","authors":["Xin Guo","Jiangwei Lao","Bo Dang","Yingying Zhang","Lei Yu","Lixiang Ru","Liheng Zhong","Ziyuan Huang","Kang Wu","Dingxiang Hu","Huimei He","Jian Wang","Jingdong Chen","Ming Yang","Yongjun Zhang","Yansheng Li"],"pdf_url":"https://arxiv.org/pdf/2312.10115v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2312.00094v2","updated":"2024-03-22T16:38:34Z","published":"2023-11-30T13:07:19Z","title":"Fast ODE-based Sampling for Diffusion Models in Around 5 Steps","summary":" Sampling from diffusion models can be treated as solving the corresponding\nordinary differential equations (ODEs), with the aim of obtaining an accurate\nsolution with as few number of function evaluations (NFE) as possible.\nRecently, various fast samplers utilizing higher-order ODE solvers have emerged\nand achieved better performance than the initial first-order one. However,\nthese numerical methods inherently result in certain approximation errors,\nwhich significantly degrades sample quality with extremely small NFE (e.g.,\naround 5). In contrast, based on the geometric observation that each sampling\ntrajectory almost lies in a two-dimensional subspace embedded in the ambient\nspace, we propose Approximate MEan-Direction Solver (AMED-Solver) that\neliminates truncation errors by directly learning the mean direction for fast\ndiffusion sampling. Besides, our method can be easily used as a plugin to\nfurther improve existing ODE-based samplers. Extensive experiments on image\nsynthesis with the resolution ranging from 32 to 512 demonstrate the\neffectiveness of our method. With only 5 NFE, we achieve 6.61 FID on CIFAR-10,\n10.74 FID on ImageNet 64$\\times$64, and 13.20 FID on LSUN Bedroom. Our code is\navailable at https://github.com/zju-pi/diff-sampler.\n","authors":["Zhenyu Zhou","Defang Chen","Can Wang","Chun Chen"],"pdf_url":"https://arxiv.org/pdf/2312.00094v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.14520v2","updated":"2024-03-22T16:35:49Z","published":"2024-03-21T16:17:57Z","title":"Cobra: Extending Mamba to Multi-Modal Large Language Model for Efficient\n Inference","summary":" In recent years, the application of multimodal large language models (MLLM)\nin various fields has achieved remarkable success. However, as the foundation\nmodel for many downstream tasks, current MLLMs are composed of the well-known\nTransformer network, which has a less efficient quadratic computation\ncomplexity. To improve the efficiency of such basic models, we propose Cobra, a\nlinear computational complexity MLLM. Specifically, Cobra integrates the\nefficient Mamba language model into the visual modality. Moreover, we explore\nand study various modal fusion schemes to create an effective multi-modal\nMamba. Extensive experiments demonstrate that (1) Cobra achieves extremely\ncompetitive performance with current computationally efficient state-of-the-art\nmethods, e.g., LLaVA-Phi, TinyLLaVA, and MobileVLM v2, and has faster speed due\nto Cobra's linear sequential modeling. (2) Interestingly, the results of\nclosed-set challenging prediction benchmarks show that Cobra performs well in\novercoming visual illusions and spatial relationship judgments. (3) Notably,\nCobra even achieves comparable performance to LLaVA with about 43% of the\nnumber of parameters. 
We will make all codes of Cobra open-source and hope that\nthe proposed method can facilitate future research on complexity problems in\nMLLM. Our project page is available at: https://sites.google.com/view/cobravlm.\n","authors":["Han Zhao","Min Zhang","Wei Zhao","Pengxiang Ding","Siteng Huang","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.14520v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15330v1","updated":"2024-03-22T16:35:38Z","published":"2024-03-22T16:35:38Z","title":"Selectively Informative Description can Reduce Undesired Embedding\n Entanglements in Text-to-Image Personalization","summary":" In text-to-image personalization, a timely and crucial challenge is the\ntendency of generated images overfitting to the biases present in the reference\nimages. We initiate our study with a comprehensive categorization of the biases\ninto background, nearby-object, tied-object, substance (in style\nre-contextualization), and pose biases. These biases manifest in the generated\nimages due to their entanglement into the subject embedding. This undesired\nembedding entanglement not only results in the reflection of biases from the\nreference images into the generated images but also notably diminishes the\nalignment of the generated images with the given generation prompt. To address\nthis challenge, we propose SID~(Selectively Informative Description), a text\ndescription strategy that deviates from the prevalent approach of only\ncharacterizing the subject's class identification. SID is generated utilizing\nmultimodal GPT-4 and can be seamlessly integrated into optimization-based\nmodels. We present comprehensive experimental results along with analyses of\ncross-attention maps, subject-alignment, non-subject-disentanglement, and\ntext-alignment.\n","authors":["Jimyeong Kim","Jungwon Park","Wonjong Rhee"],"pdf_url":"https://arxiv.org/pdf/2403.15330v1.pdf","comment":"Published at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.04690v2","updated":"2024-03-22T16:26:40Z","published":"2024-03-07T17:35:58Z","title":"Faster Neighborhood Attention: Reducing the O(n^2) Cost of Self\n Attention at the Threadblock Level","summary":" Neighborhood attention reduces the cost of self attention by restricting each\ntoken's attention span to its nearest neighbors. This restriction,\nparameterized by a window size and dilation factor, draws a spectrum of\npossible attention patterns between linear projection and self attention.\nNeighborhood attention, and more generally sliding window attention patterns,\nhave long been bounded by infrastructure, particularly in higher-rank spaces\n(2-D and 3-D), calling for the development of custom kernels, which have been\nlimited in either functionality, or performance, if not both. In this work, we\nfirst show that neighborhood attention can be represented as a batched GEMM\nproblem, similar to standard attention, and implement it for 1-D and 2-D\nneighborhood attention. These kernels on average provide 895% and 272%\nimprovement in full precision latency compared to existing naive kernels for\n1-D and 2-D neighborhood attention respectively. We find certain inherent\ninefficiencies in all unfused neighborhood attention kernels that bound their\nperformance and lower-precision scalability. 
We also developed fused\nneighborhood attention; an adaptation of fused dot-product attention kernels\nthat allow fine-grained control over attention across different spatial axes.\nKnown for reducing the quadratic time complexity of self attention to a linear\ncomplexity, neighborhood attention can now enjoy a reduced and constant memory\nfootprint, and record-breaking half precision latency. We observe that our\nfused kernels successfully circumvent some of the unavoidable inefficiencies in\nunfused implementations. While our unfused GEMM-based kernels only improve half\nprecision performance compared to naive kernels by an average of 496% and 113%\nin 1-D and 2-D problems respectively, our fused kernels improve naive kernels\nby an average of 1607% and 581% in 1-D and 2-D problems respectively.\n","authors":["Ali Hassani","Wen-Mei Hwu","Humphrey Shi"],"pdf_url":"https://arxiv.org/pdf/2403.04690v2.pdf","comment":"Project page: https://github.com/SHI-Labs/NATTEN"},{"id":"http://arxiv.org/abs/2403.15317v1","updated":"2024-03-22T16:11:29Z","published":"2024-03-22T16:11:29Z","title":"Point-DETR3D: Leveraging Imagery Data with Spatial Point Prior for\n Weakly Semi-supervised 3D Object Detection","summary":" Training high-accuracy 3D detectors necessitates massive labeled 3D\nannotations with 7 degree-of-freedom, which is laborious and time-consuming.\nTherefore, the form of point annotations is proposed to offer significant\nprospects for practical applications in 3D detection, which is not only more\naccessible and less expensive but also provides strong spatial information for\nobject localization.In this paper, we empirically discover that it is\nnon-trivial to merely adapt Point-DETR to its 3D form, encountering two main\nbottlenecks: 1) it fails to encode strong 3D prior into the model, and 2) it\ngenerates low-quality pseudo labels in distant regions due to the extreme\nsparsity of LiDAR points. To overcome these challenges, we introduce\nPoint-DETR3D, a teacher-student framework for weakly semi-supervised 3D\ndetection, designed to fully capitalize on point-wise supervision within a\nconstrained instance-wise annotation budget.Different from Point-DETR which\nencodes 3D positional information solely through a point encoder, we propose an\nexplicit positional query initialization strategy to enhance the positional\nprior. 
Considering the low quality of pseudo labels at distant regions produced\nby the teacher model, we enhance the detector's perception by incorporating\ndense imagery data through a novel Cross-Modal Deformable RoI Fusion\n(D-RoI).Moreover, an innovative point-guided self-supervised learning technique\nis proposed to allow for fully exploiting point priors, even in student\nmodels.Extensive experiments on representative nuScenes dataset demonstrate our\nPoint-DETR3D obtains significant improvements compared to previous works.\nNotably, with only 5% of labeled data, Point-DETR3D achieves over 90%\nperformance of its fully supervised counterpart.\n","authors":["Hongzhi Gao","Zheng Chen","Zehui Chen","Lin Chen","Jiaming Liu","Shanghang Zhang","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.15317v1.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2403.15316v1","updated":"2024-03-22T16:10:38Z","published":"2024-03-22T16:10:38Z","title":"Ultrasound Imaging based on the Variance of a Diffusion Restoration\n Model","summary":" Despite today's prevalence of ultrasound imaging in medicine, ultrasound\nsignal-to-noise ratio is still affected by several sources of noise and\nartefacts. Moreover, enhancing ultrasound image quality involves balancing\nconcurrent factors like contrast, resolution, and speckle preservation.\nRecently, there has been progress in both model-based and learning-based\napproaches addressing the problem of ultrasound image reconstruction. Bringing\nthe best from both worlds, we propose a hybrid reconstruction method combining\nan ultrasound linear direct model with a learning-based prior coming from a\ngenerative Denoising Diffusion model. More specifically, we rely on the\nunsupervised fine-tuning of a pre-trained Denoising Diffusion Restoration Model\n(DDRM). Given the nature of multiplicative noise inherent to ultrasound, this\npaper proposes an empirical model to characterize the stochasticity of\ndiffusion reconstruction of ultrasound images, and shows the interest of its\nvariance as an echogenicity map estimator. We conduct experiments on synthetic,\nin-vitro, and in-vivo data, demonstrating the efficacy of our variance imaging\napproach in achieving high-quality image reconstructions from single plane-wave\nacquisitions and in comparison to state-of-the-art methods.\n","authors":["Yuxin Zhang","Clément Huneau","Jérôme Idier","Diana Mateus"],"pdf_url":"https://arxiv.org/pdf/2403.15316v1.pdf","comment":"5 pages; submitted to EUSIPCO 2024. arXiv admin note: text overlap\n with arXiv:2310.20618"},{"id":"http://arxiv.org/abs/2403.15314v1","updated":"2024-03-22T16:06:43Z","published":"2024-03-22T16:06:43Z","title":"Global Control for Local SO(3)-Equivariant Scale-Invariant Vessel\n Segmentation","summary":" Personalized 3D vascular models can aid in a range of diagnostic, prognostic,\nand treatment-planning tasks relevant to cardiovascular disease management.\nDeep learning provides a means to automatically obtain such models. Ideally, a\nuser should have control over the exact region of interest (ROI) to be included\nin a vascular model, and the model should be watertight and highly accurate. To\nthis end, we propose a combination of a global controller leveraging voxel mask\nsegmentations to provide boundary conditions for vessels of interest to a\nlocal, iterative vessel segmentation model. 
We introduce the preservation of\nscale- and rotational symmetries in the local segmentation model, leading to\ngeneralisation to vessels of unseen sizes and orientations. Combined with the\nglobal controller, this enables flexible 3D vascular model building, without\nadditional retraining. We demonstrate the potential of our method on a dataset\ncontaining abdominal aortic aneurysms (AAAs). Our method performs on par with a\nstate-of-the-art segmentation model in the segmentation of AAAs, iliac arteries\nand renal arteries, while providing a watertight, smooth surface segmentation.\nMoreover, we demonstrate that by adapting the global controller, we can easily\nextend vessel sections in the 3D model.\n","authors":["Patryk Rygiel","Dieuwertje Alblas","Christoph Brune","Kak Khee Yeung","Jelmer M. Wolterink"],"pdf_url":"https://arxiv.org/pdf/2403.15314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15313v1","updated":"2024-03-22T16:06:05Z","published":"2024-03-22T16:06:05Z","title":"CR3DT: Camera-RADAR Fusion for 3D Detection and Tracking","summary":" Accurate detection and tracking of surrounding objects is essential to enable\nself-driving vehicles. While Light Detection and Ranging (LiDAR) sensors have\nset the benchmark for high performance, the appeal of camera-only solutions\nlies in their cost-effectiveness. Notably, despite the prevalent use of Radio\nDetection and Ranging (RADAR) sensors in automotive systems, their potential in\n3D detection and tracking has been largely disregarded due to data sparsity and\nmeasurement noise. As a recent development, the combination of RADARs and\ncameras is emerging as a promising solution. This paper presents Camera-RADAR\n3D Detection and Tracking (CR3DT), a camera-RADAR fusion model for 3D object\ndetection, and Multi-Object Tracking (MOT). Building upon the foundations of\nthe State-of-the-Art (SotA) camera-only BEVDet architecture, CR3DT demonstrates\nsubstantial improvements in both detection and tracking capabilities, by\nincorporating the spatial and velocity information of the RADAR sensor.\nExperimental results demonstrate an absolute improvement in detection\nperformance of 5.3% in mean Average Precision (mAP) and a 14.9% increase in\nAverage Multi-Object Tracking Accuracy (AMOTA) on the nuScenes dataset when\nleveraging both modalities. CR3DT bridges the gap between high-performance and\ncost-effective perception systems in autonomous driving, by capitalizing on the\nubiquitous presence of RADAR in automotive applications.\n","authors":["Nicolas Baumann","Michael Baumgartner","Edoardo Ghignone","Jonas Kühne","Tobias Fischer","Yung-Hsu Yang","Marc Pollefeys","Michele Magno"],"pdf_url":"https://arxiv.org/pdf/2403.15313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15309v1","updated":"2024-03-22T15:59:24Z","published":"2024-03-22T15:59:24Z","title":"Controlled Training Data Generation with Diffusion Models","summary":" In this work, we present a method to control a text-to-image generative model\nto produce training data specifically \"useful\" for supervised learning. Unlike\nprevious works that employ an open-loop approach and pre-define prompts to\ngenerate new data using either a language model or human expertise, we develop\nan automated closed-loop system which involves two feedback mechanisms. The\nfirst mechanism uses feedback from a given supervised model and finds\nadversarial prompts that result in image generations that maximize the model\nloss. 
While these adversarial prompts result in diverse data informed by the\nmodel, they are not informed of the target distribution, which can be\ninefficient. Therefore, we introduce the second feedback mechanism that guides\nthe generation process towards a certain target distribution. We call the\nmethod combining these two mechanisms Guided Adversarial Prompts. We perform\nour evaluations on different tasks, datasets and architectures, with different\ntypes of distribution shifts (spuriously correlated data, unseen domains) and\ndemonstrate the efficiency of the proposed feedback mechanisms compared to\nopen-loop approaches.\n","authors":["Teresa Yeo","Andrei Atanov","Harold Benoit","Aleksandr Alekseev","Ruchira Ray","Pooya Esmaeil Akhoondi","Amir Zamir"],"pdf_url":"https://arxiv.org/pdf/2403.15309v1.pdf","comment":"Project page at https://adversarial-prompts.epfl.ch/"},{"id":"http://arxiv.org/abs/2403.12505v2","updated":"2024-03-22T15:41:20Z","published":"2024-03-19T07:11:53Z","title":"Semantics, Distortion, and Style Matter: Towards Source-free UDA for\n Panoramic Segmentation","summary":" This paper addresses an interesting yet challenging problem -- source-free\nunsupervised domain adaptation (SFUDA) for pinhole-to-panoramic semantic\nsegmentation -- given only a pinhole image-trained model (i.e., source) and\nunlabeled panoramic images (i.e., target). Tackling this problem is nontrivial\ndue to the semantic mismatches, style discrepancies, and inevitable distortion\nof panoramic images. To this end, we propose a novel method that utilizes\nTangent Projection (TP) as it has less distortion and meanwhile slits the\nequirectangular projection (ERP) with a fixed FoV to mimic the pinhole images.\nBoth projections are shown effective in extracting knowledge from the source\nmodel. However, the distinct projection discrepancies between source and target\ndomains impede the direct knowledge transfer; thus, we propose a panoramic\nprototype adaptation module (PPAM) to integrate panoramic prototypes from the\nextracted knowledge for adaptation. We then impose the loss constraints on both\npredictions and prototypes and propose a cross-dual attention module (CDAM) at\nthe feature level to better align the spatial and channel characteristics\nacross the domains and projections. Both knowledge extraction and transfer\nprocesses are synchronously updated to reach the best performance. Extensive\nexperiments on the synthetic and real-world benchmarks, including outdoor and\nindoor scenarios, demonstrate that our method achieves significantly better\nperformance than prior SFUDA methods for pinhole-to-panoramic adaptation.\n","authors":["Xu Zheng","Pengyuan Zhou","Athanasios V. Vasilakos","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12505v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2310.00632v2","updated":"2024-03-22T15:38:53Z","published":"2023-10-01T10:06:01Z","title":"Win-Win: Training High-Resolution Vision Transformers from Two Windows","summary":" Transformers have become the standard in state-of-the-art vision\narchitectures, achieving impressive performance on both image-level and dense\npixelwise tasks. However, training vision transformers for high-resolution\npixelwise tasks has a prohibitive cost. Typical solutions boil down to\nhierarchical architectures, fast and approximate attention, or training on\nlow-resolution crops. 
This latter solution does not constrain architectural\nchoices, but it leads to a clear performance drop when testing at resolutions\nsignificantly higher than that used for training, thus requiring ad-hoc and\nslow post-processing schemes. In this paper, we propose a novel strategy for\nefficient training and inference of high-resolution vision transformers. The\nkey principle is to mask out most of the high-resolution inputs during\ntraining, keeping only N random windows. This allows the model to learn local\ninteractions between tokens inside each window, and global interactions between\ntokens from different windows. As a result, the model can directly process the\nhigh-resolution input at test time without any special trick. We show that this\nstrategy is effective when using relative positional embedding such as rotary\nembeddings. It is 4 times faster to train than a full-resolution network, and\nit is straightforward to use at test time compared to existing approaches. We\napply this strategy to three dense prediction tasks with high-resolution data.\nFirst, we show on the task of semantic segmentation that a simple setting with\n2 windows performs best, hence the name of our method: Win-Win. Second, we\nconfirm this result on the task of monocular depth prediction. Third, we\nfurther extend it to the binocular task of optical flow, reaching\nstate-of-the-art performance on the Spring benchmark that contains Full-HD\nimages with an order of magnitude faster inference than the best competitor.\n","authors":["Vincent Leroy","Jerome Revaud","Thomas Lucas","Philippe Weinzaepfel"],"pdf_url":"https://arxiv.org/pdf/2310.00632v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2401.11170v2","updated":"2024-03-22T15:31:39Z","published":"2024-01-20T08:46:06Z","title":"Inducing High Energy-Latency of Large Vision-Language Models with\n Verbose Images","summary":" Large vision-language models (VLMs) such as GPT-4 have achieved exceptional\nperformance across various multi-modal tasks. However, the deployment of VLMs\nnecessitates substantial energy consumption and computational resources. Once\nattackers maliciously induce high energy consumption and latency time\n(energy-latency cost) during inference of VLMs, it will exhaust computational\nresources. In this paper, we explore this attack surface about availability of\nVLMs and aim to induce high energy-latency cost during inference of VLMs. We\nfind that high energy-latency cost during inference of VLMs can be manipulated\nby maximizing the length of generated sequences. To this end, we propose\nverbose images, with the goal of crafting an imperceptible perturbation to\ninduce VLMs to generate long sentences during inference. Concretely, we design\nthree loss objectives. First, a loss is proposed to delay the occurrence of\nend-of-sequence (EOS) token, where EOS token is a signal for VLMs to stop\ngenerating further tokens. Moreover, an uncertainty loss and a token diversity\nloss are proposed to increase the uncertainty over each generated token and the\ndiversity among all tokens of the whole generated sequence, respectively, which\ncan break output dependency at token-level and sequence-level. Furthermore, a\ntemporal weight adjustment algorithm is proposed, which can effectively balance\nthese losses. 
Extensive experiments demonstrate that our verbose images can\nincrease the length of generated sequences by 7.87 times and 8.56 times\ncompared to original images on MS-COCO and ImageNet datasets, which presents\npotential challenges for various applications. Our code is available at\nhttps://github.com/KuofengGao/Verbose_Images.\n","authors":["Kuofeng Gao","Yang Bai","Jindong Gu","Shu-Tao Xia","Philip Torr","Zhifeng Li","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2401.11170v2.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2308.13712v3","updated":"2024-03-22T15:30:57Z","published":"2023-08-25T23:54:15Z","title":"Residual Denoising Diffusion Models","summary":" We propose residual denoising diffusion models (RDDM), a novel dual diffusion\nprocess that decouples the traditional single denoising diffusion process into\nresidual diffusion and noise diffusion. This dual diffusion framework expands\nthe denoising-based diffusion models, initially uninterpretable for image\nrestoration, into a unified and interpretable model for both image generation\nand restoration by introducing residuals. Specifically, our residual diffusion\nrepresents directional diffusion from the target image to the degraded input\nimage and explicitly guides the reverse generation process for image\nrestoration, while noise diffusion represents random perturbations in the\ndiffusion process. The residual prioritizes certainty, while the noise\nemphasizes diversity, enabling RDDM to effectively unify tasks with varying\ncertainty or diversity requirements, such as image generation and restoration.\nWe demonstrate that our sampling process is consistent with that of DDPM and\nDDIM through coefficient transformation, and propose a partially\npath-independent generation process to better understand the reverse process.\nNotably, our RDDM enables a generic UNet, trained with only an L1 loss and a\nbatch size of 1, to compete with state-of-the-art image restoration methods. We\nprovide code and pre-trained models to encourage further exploration,\napplication, and development of our innovative framework\n(https://github.com/nachifur/RDDM).\n","authors":["Jiawei Liu","Qiang Wang","Huijie Fan","Yinong Wang","Yandong Tang","Liangqiong Qu"],"pdf_url":"https://arxiv.org/pdf/2308.13712v3.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2403.09530v2","updated":"2024-03-22T15:26:05Z","published":"2024-03-14T16:13:00Z","title":"VisionGPT-3D: A Generalized Multimodal Agent for Enhanced 3D Vision\n Understanding","summary":" The evolution of text to visual components facilitates people's daily lives,\nsuch as generating image, videos from text and identifying the desired elements\nwithin the images. Computer vision models involving the multimodal abilities in\nthe previous days are focused on image detection, classification based on\nwell-defined objects. Large language models (LLMs) introduces the\ntransformation from nature language to visual objects, which present the visual\nlayout for text contexts. OpenAI GPT-4 has emerged as the pinnacle in LLMs,\nwhile the computer vision (CV) domain boasts a plethora of state-of-the-art\n(SOTA) models and algorithms to convert 2D images to their 3D representations.\nHowever, the mismatching between the algorithms with the problem could lead to\nundesired results. In response to this challenge, we propose an unified\nVisionGPT-3D framework to consolidate the state-of-the-art vision models,\nthereby facilitating the development of vision-oriented AI. 
VisionGPT-3D\nprovides a versatile multimodal framework building upon the strengths of\nmultimodal foundation models. It seamlessly integrates various SOTA vision\nmodels, automates the selection of suitable SOTA vision models, identifies the\n3D mesh creation algorithms appropriate for 2D depth map analysis, and\ngenerates optimal results based on diverse multimodal inputs such as text\nprompts.\n Keywords: VisionGPT-3D, 3D vision understanding, Multimodal agent\n","authors":["Chris Kelly","Luhui Hu","Jiayin Hu","Yu Tian","Deshun Yang","Bang Yang","Cindy Yang","Zihao Li","Zaoshan Huang","Yuexian Zou"],"pdf_url":"https://arxiv.org/pdf/2403.09530v2.pdf","comment":"12 pages, 7 figures, pending conference"},{"id":"http://arxiv.org/abs/2403.15272v1","updated":"2024-03-22T15:15:44Z","published":"2024-03-22T15:15:44Z","title":"WSCLoc: Weakly-Supervised Sparse-View Camera Relocalization","summary":" Despite the advancements in deep learning for camera relocalization tasks,\nobtaining ground truth pose labels required for the training process remains a\ncostly endeavor. While current weakly supervised methods excel in lightweight\nlabel generation, their performance notably declines in scenarios with sparse\nviews. In response to this challenge, we introduce WSCLoc, a system capable of\nbeing customized to various deep learning-based relocalization models to\nenhance their performance under weakly-supervised and sparse view conditions.\nThis is realized in two stages. In the initial stage, WSCLoc employs a\nmultilayer perceptron-based structure called WFT-NeRF to co-optimize image\nreconstruction quality and initial pose information. To ensure a stable\nlearning process, we incorporate temporal information as input. Furthermore,\ninstead of optimizing SE(3), we opt for $\\mathfrak{sim}(3)$ optimization to\nexplicitly enforce a scale constraint. In the second stage, we co-optimize the\npre-trained WFT-NeRF and WFT-Pose. This optimization is enhanced by\nTime-Encoding based Random View Synthesis and supervised by inter-frame\ngeometric constraints that consider pose, depth, and RGB information. We\nvalidate our approaches on two publicly available datasets, one outdoor and one\nindoor. Our experimental results demonstrate that our weakly-supervised\nrelocalization solutions achieve superior pose estimation accuracy in\nsparse-view scenarios, comparable to state-of-the-art camera relocalization\nmethods. We will make our code publicly available.\n","authors":["Jialu Wang","Kaichen Zhou","Andrew Markham","Niki Trigoni"],"pdf_url":"https://arxiv.org/pdf/2403.15272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15260v1","updated":"2024-03-22T15:00:29Z","published":"2024-03-22T15:00:29Z","title":"Hyperbolic Metric Learning for Visual Outlier Detection","summary":" Out-Of-Distribution (OOD) detection is critical for deploying deep learning\nmodels in safety-critical applications. However, the inherent hierarchical\nconcept structure of visual data, which is instrumental to OOD detection, is\noften poorly captured by conventional methods based on Euclidean geometry. This\nwork proposes a metric framework that leverages the strengths of Hyperbolic\ngeometry for OOD detection. Inspired by previous works that refine the decision\nboundary for OOD data with synthetic outliers, we extend this method to\nHyperbolic space. Interestingly, we find that synthetic outliers do not benefit\nOOD detection in Hyperbolic space as they do in Euclidean space. 
Furthermore we\nexplore the relationship between OOD detection performance and Hyperbolic\nembedding dimension, addressing practical concerns in resource-constrained\nenvironments. Extensive experiments show that our framework improves the FPR95\nfor OOD detection from 22\\% to 15\\% and from 49% to 28% on CIFAR-10 and\nCIFAR-100 respectively compared to Euclidean methods.\n","authors":["Alvaro Gonzalez-Jimenez","Simone Lionetti","Dena Bazazian","Philippe Gottfrois","Fabian Gröger","Marc Pouly","Alexander Navarini"],"pdf_url":"https://arxiv.org/pdf/2403.15260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15249v1","updated":"2024-03-22T14:47:18Z","published":"2024-03-22T14:47:18Z","title":"Spectral Motion Alignment for Video Motion Transfer using Diffusion\n Models","summary":" The evolution of diffusion models has greatly impacted video generation and\nunderstanding. Particularly, text-to-video diffusion models (VDMs) have\nsignificantly facilitated the customization of input video with target\nappearance, motion, etc. Despite these advances, challenges persist in\naccurately distilling motion information from video frames. While existing\nworks leverage the consecutive frame residual as the target motion vector, they\ninherently lack global motion context and are vulnerable to frame-wise\ndistortions. To address this, we present Spectral Motion Alignment (SMA), a\nnovel framework that refines and aligns motion vectors using Fourier and\nwavelet transforms. SMA learns motion patterns by incorporating\nfrequency-domain regularization, facilitating the learning of whole-frame\nglobal motion dynamics, and mitigating spatial artifacts. Extensive experiments\ndemonstrate SMA's efficacy in improving motion transfer while maintaining\ncomputational efficiency and compatibility across various video customization\nframeworks.\n","authors":["Geon Yeong Park","Hyeonho Jeong","Sang Wan Lee","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2403.15249v1.pdf","comment":"Project page:\n https://geonyeong-park.github.io/spectral-motion-alignment/"},{"id":"http://arxiv.org/abs/2403.15248v1","updated":"2024-03-22T14:46:51Z","published":"2024-03-22T14:46:51Z","title":"Self-Supervised Backbone Framework for Diverse Agricultural Vision Tasks","summary":" Computer vision in agriculture is game-changing with its ability to transform\nfarming into a data-driven, precise, and sustainable industry. Deep learning\nhas empowered agriculture vision to analyze vast, complex visual data, but\nheavily rely on the availability of large annotated datasets. This remains a\nbottleneck as manual labeling is error-prone, time-consuming, and expensive.\nThe lack of efficient labeling approaches inspired us to consider\nself-supervised learning as a paradigm shift, learning meaningful feature\nrepresentations from raw agricultural image data. In this work, we explore how\nself-supervised representation learning unlocks the potential applicability to\ndiverse agriculture vision tasks by eliminating the need for large-scale\nannotated datasets. We propose a lightweight framework utilizing SimCLR, a\ncontrastive learning approach, to pre-train a ResNet-50 backbone on a large,\nunannotated dataset of real-world agriculture field images. 
Our experimental\nanalysis and results indicate that the model learns robust features applicable\nto a broad range of downstream agriculture tasks discussed in the paper.\nAdditionally, the reduced reliance on annotated data makes our approach more\ncost-effective and accessible, paving the way for broader adoption of computer\nvision in agriculture.\n","authors":["Sudhir Sornapudi","Rajhans Singh"],"pdf_url":"https://arxiv.org/pdf/2403.15248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08863v2","updated":"2024-03-22T14:46:05Z","published":"2023-11-15T10:49:15Z","title":"Toulouse Hyperspectral Data Set: a benchmark data set to assess\n semi-supervised spectral representation learning and pixel-wise\n classification techniques","summary":" Airborne hyperspectral images can be used to map the land cover in large\nurban areas, thanks to their very high spatial and spectral resolutions on a\nwide spectral domain. While the spectral dimension of hyperspectral images is\nhighly informative of the chemical composition of the land surface, the use of\nstate-of-the-art machine learning algorithms to map the land cover has been\ndramatically limited by the availability of training data. To cope with the\nscarcity of annotations, semi-supervised and self-supervised techniques have\nlately raised a lot of interest in the community. Yet, the publicly available\nhyperspectral data sets commonly used to benchmark machine learning models are\nnot totally suited to evaluate their generalization performances due to one or\nseveral of the following properties: a limited geographical coverage (which\ndoes not reflect the spectral diversity in metropolitan areas), a small number\nof land cover classes and a lack of appropriate standard train / test splits\nfor semi-supervised and self-supervised learning. Therefore, we release in this\npaper the Toulouse Hyperspectral Data Set that stands out from other data sets\nin the above-mentioned respects in order to meet key issues in spectral\nrepresentation learning and classification over large-scale hyperspectral\nimages with very few labeled pixels. Besides, we discuss and experiment\nself-supervised techniques for spectral representation learning, including the\nMasked Autoencoder, and establish a baseline for pixel-wise classification\nachieving 85% overall accuracy and 77% F1 score. The Toulouse Hyperspectral\nData Set and our code are publicly available at\nhttps://www.toulouse-hyperspectral-data-set.com and\nhttps://www.github.com/Romain3Ch216/tlse-experiments, respectively.\n","authors":["Romain Thoreau","Laurent Risser","Véronique Achard","Béatrice Berthelot","Xavier Briottet"],"pdf_url":"https://arxiv.org/pdf/2311.08863v2.pdf","comment":"17 pages, 13 figures"},{"id":"http://arxiv.org/abs/2403.15245v1","updated":"2024-03-22T14:41:55Z","published":"2024-03-22T14:41:55Z","title":"Reasoning-Enhanced Object-Centric Learning for Videos","summary":" Object-centric learning aims to break down complex visual scenes into more\nmanageable object representations, enhancing the understanding and reasoning\nabilities of machine learning systems toward the physical world. Recently,\nslot-based video models have demonstrated remarkable proficiency in segmenting\nand tracking objects, but they overlook the importance of the effective\nreasoning module. In the real world, reasoning and predictive abilities play a\ncrucial role in human perception and object tracking; in particular, these\nabilities are closely related to human intuitive physics. 
Inspired by this, we\ndesigned a novel reasoning module called the Slot-based Time-Space Transformer\nwith Memory buffer (STATM) to enhance the model's perception ability in complex\nscenes. The memory buffer primarily serves as storage for slot information from\nupstream modules, the Slot-based Time-Space Transformer makes predictions\nthrough slot-based spatiotemporal attention computations and fusion. Our\nexperiment results on various datasets show that STATM can significantly\nenhance object-centric learning capabilities of slot-based video models.\n","authors":["Jian Li","Pu Ren","Yang Liu","Hao Sun"],"pdf_url":"https://arxiv.org/pdf/2403.15245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15241v1","updated":"2024-03-22T14:34:17Z","published":"2024-03-22T14:34:17Z","title":"IS-Fusion: Instance-Scene Collaborative Fusion for Multimodal 3D Object\n Detection","summary":" Bird's eye view (BEV) representation has emerged as a dominant solution for\ndescribing 3D space in autonomous driving scenarios. However, objects in the\nBEV representation typically exhibit small sizes, and the associated point\ncloud context is inherently sparse, which leads to great challenges for\nreliable 3D perception. In this paper, we propose IS-Fusion, an innovative\nmultimodal fusion framework that jointly captures the Instance- and Scene-level\ncontextual information. IS-Fusion essentially differs from existing approaches\nthat only focus on the BEV scene-level fusion by explicitly incorporating\ninstance-level multimodal information, thus facilitating the instance-centric\ntasks like 3D object detection. It comprises a Hierarchical Scene Fusion (HSF)\nmodule and an Instance-Guided Fusion (IGF) module. HSF applies Point-to-Grid\nand Grid-to-Region transformers to capture the multimodal scene context at\ndifferent granularities. IGF mines instance candidates, explores their\nrelationships, and aggregates the local multimodal context for each instance.\nThese instances then serve as guidance to enhance the scene feature and yield\nan instance-aware BEV representation. On the challenging nuScenes benchmark,\nIS-Fusion outperforms all the published multimodal works to date. Code is\navailable at: https://github.com/yinjunbo/IS-Fusion.\n","authors":["Junbo Yin","Jianbing Shen","Runnan Chen","Wei Li","Ruigang Yang","Pascal Frossard","Wenguan Wang"],"pdf_url":"https://arxiv.org/pdf/2403.15241v1.pdf","comment":"Accepted to CVPR 2024; Code: https://github.com/yinjunbo/IS-Fusion"},{"id":"http://arxiv.org/abs/2403.15238v1","updated":"2024-03-22T14:32:02Z","published":"2024-03-22T14:32:02Z","title":"WEEP: A method for spatial interpretation of weakly supervised CNN\n models in computational pathology","summary":" Deep learning enables the modelling of high-resolution histopathology\nwhole-slide images (WSI). Weakly supervised learning of tile-level data is\ntypically applied for tasks where labels only exist on the patient or WSI level\n(e.g. patient outcomes or histological grading). In this context, there is a\nneed for improved spatial interpretability of predictions from such models. We\npropose a novel method, Wsi rEgion sElection aPproach (WEEP), for model\ninterpretation. It provides a principled yet straightforward way to establish\nthe spatial area of WSI required for assigning a particular prediction label.\nWe demonstrate WEEP on a binary classification task in the area of breast\ncancer computational pathology. 
WEEP is easy to implement, is directly\nconnected to the model-based decision process, and offers information relevant\nto both research and diagnostic applications.\n","authors":["Abhinav Sharma","Bojing Liu","Mattias Rantalainen"],"pdf_url":"https://arxiv.org/pdf/2403.15238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15234v1","updated":"2024-03-22T14:27:58Z","published":"2024-03-22T14:27:58Z","title":"Shadow Generation for Composite Image Using Diffusion model","summary":" In the realm of image composition, generating realistic shadows for the\ninserted foreground remains a formidable challenge. Previous works have\ndeveloped image-to-image translation models which are trained on paired\ntraining data. However, they struggle to generate shadows with accurate\nshapes and intensities, hindered by data scarcity and inherent task complexity.\nIn this paper, we resort to a foundation model with rich prior knowledge of\nnatural shadow images. Specifically, we first adapt ControlNet to our task and\nthen propose intensity modulation modules to improve the shadow intensity.\nMoreover, we extend the small-scale DESOBA dataset to DESOBAv2 using a novel\ndata acquisition pipeline. Experimental results on both DESOBA and DESOBAv2\ndatasets as well as real composite images demonstrate the superior capability\nof our model for the shadow generation task. The dataset, code, and model are\nreleased at https://github.com/bcmi/Object-Shadow-Generation-Dataset-DESOBAv2.\n","authors":["Qingyang Liu","Junqi You","Jianting Wang","Xinhao Tao","Bo Zhang","Li Niu"],"pdf_url":"https://arxiv.org/pdf/2403.15234v1.pdf","comment":"accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.11376v2","updated":"2024-03-22T14:25:14Z","published":"2024-03-18T00:03:48Z","title":"ShapeFormer: Shape Prior Visible-to-Amodal Transformer-based Amodal\n Instance Segmentation","summary":" Amodal Instance Segmentation (AIS) presents a challenging task as it involves\npredicting both visible and occluded parts of objects within images. Existing\nAIS methods rely on a bidirectional approach, encompassing both the transition\nfrom amodal features to visible features (amodal-to-visible) and from visible\nfeatures to amodal features (visible-to-amodal). Our observation shows that the\nutilization of amodal features through the amodal-to-visible transition can\nconfuse the visible features due to the extra information of occluded/hidden\nsegments not present in the visible display. Consequently, this compromises the\nquality of visible features during the subsequent visible-to-amodal transition.\nTo tackle this issue, we introduce ShapeFormer, a decoupled Transformer-based\nmodel with a visible-to-amodal transition. It facilitates the explicit\nrelationship between output segmentations and avoids the need for\namodal-to-visible transitions. ShapeFormer comprises three key modules: (i)\nVisible-Occluding Mask Head for predicting visible segmentation with occlusion\nawareness, (ii) Shape-Prior Amodal Mask Head for predicting amodal and occluded\nmasks, and (iii) Category-Specific Shape Prior Retriever for providing shape\nprior knowledge. Comprehensive experiments and extensive ablation studies across\nvarious AIS benchmarks demonstrate the effectiveness of our ShapeFormer. 
The code is\navailable at: https://github.com/UARK-AICV/ShapeFormer\n","authors":["Minh Tran","Winston Bounsavy","Khoa Vo","Anh Nguyen","Tri Nguyen","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2403.11376v2.pdf","comment":"Accepted to IJCNN 2024"},{"id":"http://arxiv.org/abs/2403.15227v1","updated":"2024-03-22T14:20:54Z","published":"2024-03-22T14:20:54Z","title":"LeGO: Leveraging a Surface Deformation Network for Animatable Stylized\n Face Generation with One Example","summary":" Recent advances in 3D face stylization have made significant strides in few\nto zero-shot settings. However, the degree of stylization achieved by existing\nmethods is often not sufficient for practical applications because they are\nmostly based on statistical 3D Morphable Models (3DMM) with limited variations.\nTo this end, we propose a method that can produce a highly stylized 3D face\nmodel with desired topology. Our methods train a surface deformation network\nwith 3DMM and translate its domain to the target style using a paired exemplar.\nThe network achieves stylization of the 3D face mesh by mimicking the style of\nthe target using a differentiable renderer and directional CLIP losses.\nAdditionally, during the inference process, we utilize a Mesh Agnostic Encoder\n(MAGE) that takes deformation target, a mesh of diverse topologies as input to\nthe stylization process and encodes its shape into our latent space. The\nresulting stylized face model can be animated by commonly used 3DMM blend\nshapes. A set of quantitative and qualitative evaluations demonstrate that our\nmethod can produce highly stylized face meshes according to a given style and\noutput them in a desired topology. We also demonstrate example applications of\nour method including image-based stylized avatar generation, linear\ninterpolation of geometric styles, and facial animation of stylized avatars.\n","authors":["Soyeon Yoon","Kwan Yun","Kwanggyoon Seo","Sihun Cha","Jung Eun Yoo","Junyong Noh"],"pdf_url":"https://arxiv.org/pdf/2403.15227v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2403.15218v1","updated":"2024-03-22T14:07:07Z","published":"2024-03-22T14:07:07Z","title":"Anytime, Anywhere, Anyone: Investigating the Feasibility of Segment\n Anything Model for Crowd-Sourcing Medical Image Annotations","summary":" Curating annotations for medical image segmentation is a labor-intensive and\ntime-consuming task that requires domain expertise, resulting in \"narrowly\"\nfocused deep learning (DL) models with limited translational utility. Recently,\nfoundation models like the Segment Anything Model (SAM) have revolutionized\nsemantic segmentation with exceptional zero-shot generalizability across\nvarious domains, including medical imaging, and hold a lot of promise for\nstreamlining the annotation process. However, SAM has yet to be evaluated in a\ncrowd-sourced setting to curate annotations for training 3D DL segmentation\nmodels. In this work, we explore the potential of SAM for crowd-sourcing\n\"sparse\" annotations from non-experts to generate \"dense\" segmentation masks\nfor training 3D nnU-Net models, a state-of-the-art DL segmentation model. 
Our\nresults indicate that while SAM-generated annotations exhibit high mean Dice\nscores compared to ground-truth annotations, nnU-Net models trained on\nSAM-generated annotations perform significantly worse than nnU-Net models\ntrained on ground-truth annotations ($p<0.001$, all).\n","authors":["Pranav Kulkarni","Adway Kanhere","Dharmam Savani","Andrew Chan","Devina Chatterjee","Paul H. Yi","Vishwa S. Parekh"],"pdf_url":"https://arxiv.org/pdf/2403.15218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06205v3","updated":"2024-03-22T14:05:33Z","published":"2024-03-10T13:04:01Z","title":"S-DyRF: Reference-Based Stylized Radiance Fields for Dynamic Scenes","summary":" Current 3D stylization methods often assume static scenes, which violates the\ndynamic nature of our real world. To address this limitation, we present\nS-DyRF, a reference-based spatio-temporal stylization method for dynamic neural\nradiance fields. However, stylizing dynamic 3D scenes is inherently challenging\ndue to the limited availability of stylized reference images along the temporal\naxis. Our key insight lies in introducing additional temporal cues besides the\nprovided reference. To this end, we generate temporal pseudo-references from\nthe given stylized reference. These pseudo-references facilitate the\npropagation of style information from the reference to the entire dynamic 3D\nscene. For coarse style transfer, we enforce novel views and times to mimic the\nstyle details present in pseudo-references at the feature level. To preserve\nhigh-frequency details, we create a collection of stylized temporal pseudo-rays\nfrom temporal pseudo-references. These pseudo-rays serve as detailed and\nexplicit stylization guidance for achieving fine style transfer. Experiments on\nboth synthetic and real-world datasets demonstrate that our method yields\nplausible stylized results of space-time view synthesis on dynamic 3D scenes.\n","authors":["Xingyi Li","Zhiguo Cao","Yizheng Wu","Kewei Wang","Ke Xian","Zhe Wang","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2403.06205v3.pdf","comment":"Accepted by CVPR 2024. Project page:\n https://xingyi-li.github.io/s-dyrf/"},{"id":"http://arxiv.org/abs/2402.00631v2","updated":"2024-03-22T14:00:37Z","published":"2024-01-31T11:52:33Z","title":"Beyond Inserting: Learning Identity Embedding for Semantic-Fidelity\n Personalized Diffusion Generation","summary":" Advanced diffusion-based Text-to-Image (T2I) models, such as the Stable\nDiffusion Model, have made significant progress in generating diverse and\nhigh-quality images using text prompts alone. However, when non-famous users\nrequire personalized image generation for their identities (IDs), the T2I\nmodels fail to accurately generate their ID-related images. The main problem is\nthat pre-trained T2I models do not learn the mapping between the new ID prompts\nand their corresponding visual content. The previous methods either failed to\naccurately fit the face region or lost the interactive generative ability with\nother existing concepts in T2I models. In other words, they are unable to\ngenerate T2I-aligned and semantic-fidelity images for the given prompts with\nother concepts such as scenes (``Eiffel Tower''), actions (``holding a\nbasketball''), and facial attributes (``eyes closed''). In this paper, we focus\non inserting accurate and interactive ID embedding into the Stable Diffusion\nModel for semantic-fidelity personalized generation. 
We address this challenge\nfrom two perspectives: face-wise region fitting and semantic-fidelity token\noptimization. Specifically, we first visualize the attention overfit problem\nand propose a face-wise attention loss to fit the face region instead of\nentangling ID-unrelated information, such as face layout and background. This\nkey trick significantly enhances the ID accuracy and interactive generative\nability with other existing concepts. Then, we optimize one ID representation\nas multiple per-stage tokens where each token contains two disentangled\nfeatures. This expansion of the textual conditioning space improves\nsemantic-fidelity control. Extensive experiments validate that our results\nexhibit superior ID accuracy, text-based manipulation ability, and\ngeneralization compared to previous methods.\n","authors":["Yang Li","Songlin Yang","Wei Wang","Jing Dong"],"pdf_url":"https://arxiv.org/pdf/2402.00631v2.pdf","comment":"14 pages, 16 figures"},{"id":"http://arxiv.org/abs/2403.15212v1","updated":"2024-03-22T13:55:52Z","published":"2024-03-22T13:55:52Z","title":"GCN-DevLSTM: Path Development for Skeleton-Based Action Recognition","summary":" Skeleton-based action recognition (SAR) in videos is an important but\nchallenging task in computer vision. The recent state-of-the-art models for SAR\nare primarily based on graph convolutional neural networks (GCNs), which are\npowerful in extracting the spatial information of skeleton data. However, it is\nnot yet clear whether such GCN-based models can effectively capture the temporal\ndynamics of human action sequences. To this end, we propose the DevLSTM module,\nwhich exploits the path development -- a principled and parsimonious\nrepresentation for sequential data by leveraging the Lie group structure. The\npath development, originating from rough path theory, can effectively capture\nthe order of events in high-dimensional stream data with massive dimension\nreduction and consequently enhance the LSTM module substantially. Our proposed\nG-DevLSTM module can be conveniently plugged into the temporal graph,\ncomplementing existing advanced GCN-based models. Our empirical studies on the\nNTU60, NTU120 and Chalearn2013 datasets demonstrate that our proposed hybrid\nmodel significantly outperforms the current best-performing methods in SAR\ntasks. The code is available at https://github.com/DeepIntoStreams/GCN-DevLSTM.\n","authors":["Lei Jiang","Weixin Yang","Xin Zhang","Hao Ni"],"pdf_url":"https://arxiv.org/pdf/2403.15212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06020v2","updated":"2024-03-22T13:51:55Z","published":"2024-03-09T21:45:31Z","title":"Multi-conditioned Graph Diffusion for Neural Architecture Search","summary":" Neural architecture search automates the design of neural network\narchitectures usually by exploring a large and thus complex architecture search\nspace. To advance the architecture search, we present a graph diffusion-based\nNAS approach that uses discrete conditional graph diffusion processes to\ngenerate high-performing neural network architectures. We then propose a\nmulti-conditioned classifier-free guidance approach applied to graph diffusion\nnetworks to jointly impose constraints such as high accuracy and low hardware\nlatency. Unlike the related work, our method is completely differentiable and\nrequires only a single model training. In our evaluations, we show promising\nresults on six standard benchmarks, yielding novel and unique architectures at\na fast speed, i.e. 
less than 0.2 seconds per architecture. Furthermore, we\ndemonstrate the generalisability and efficiency of our method through\nexperiments on the ImageNet dataset.\n","authors":["Rohan Asthana","Joschua Conrad","Youssef Dawoud","Maurits Ortmanns","Vasileios Belagiannis"],"pdf_url":"https://arxiv.org/pdf/2403.06020v2.pdf","comment":"Accepted at Transactions on Machine Learning Research (TMLR)"},{"id":"http://arxiv.org/abs/2403.15209v1","updated":"2024-03-22T13:50:27Z","published":"2024-03-22T13:50:27Z","title":"MSCoTDet: Language-driven Multi-modal Fusion for Improved Multispectral\n Pedestrian Detection","summary":" Multispectral pedestrian detection is attractive for around-the-clock\napplications due to the complementary information between RGB and thermal\nmodalities. However, current models often fail to detect pedestrians in obvious\ncases, especially due to the modality bias learned from statistically biased\ndatasets. From these problems, we anticipate that understanding the\ncomplementary information itself may be difficult to achieve with vision-only\nmodels. Accordingly, we propose a novel Multispectral Chain-of-Thought\nDetection (MSCoTDet) framework, which incorporates Large Language Models (LLMs)\nto understand the complementary information at the semantic level and further\nenhance the fusion process. Specifically, we generate text descriptions of the\npedestrian in each RGB and thermal modality and design a Multispectral\nChain-of-Thought (MSCoT) prompting, which models a step-by-step process to\nfacilitate cross-modal reasoning at the semantic level and perform accurate\ndetection. Moreover, we design a Language-driven Multi-modal Fusion (LMF)\nstrategy that enables fusing vision-driven and language-driven detections.\nExtensive experiments validate that MSCoTDet improves multispectral pedestrian\ndetection.\n","authors":["Taeheon Kim","Sangyun Chung","Damin Yeom","Youngjoon Yu","Hak Gu Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2403.15209v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15203v1","updated":"2024-03-22T13:46:51Z","published":"2024-03-22T13:46:51Z","title":"DITTO: Demonstration Imitation by Trajectory Transformation","summary":" Teaching robots new skills quickly and conveniently is crucial for the\nbroader adoption of robotic systems. In this work, we address the problem of\none-shot imitation from a single human demonstration, given by an RGB-D video\nrecording, through a two-stage process. In the first stage, which is offline,\nwe extract the trajectory of the demonstration. This entails segmenting\nmanipulated objects and determining their relative motion in relation to\nsecondary objects such as containers. Subsequently, in the live online\ntrajectory generation stage, we first \\mbox{re-detect} all objects, then we\nwarp the demonstration trajectory to the current scene, and finally, we trace\nthe trajectory with the robot. To complete these steps, our method\nleverages several ancillary models, including those for segmentation, relative\nobject pose estimation, and grasp prediction. We systematically evaluate\ndifferent combinations of correspondence and re-detection methods to validate\nour design decisions across a diverse range of tasks. Specifically, we collect\ndemonstrations of ten different tasks including pick-and-place tasks as well as\narticulated object manipulation. Finally, we perform extensive evaluations on a\nreal robot system to demonstrate the effectiveness and utility of our approach\nin real-world scenarios. 
We make the code publicly available at\nhttp://ditto.cs.uni-freiburg.de.\n","authors":["Nick Heppert","Max Argus","Tim Welschehold","Thomas Brox","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2403.15203v1.pdf","comment":"8 pages, 4 figures, 3 tables, submitted to IROS 2024"},{"id":"http://arxiv.org/abs/2403.15194v1","updated":"2024-03-22T13:27:57Z","published":"2024-03-22T13:27:57Z","title":"Your Image is My Video: Reshaping the Receptive Field via Image-To-Video\n Differentiable AutoAugmentation and Fusion","summary":" The landscape of deep learning research is moving towards innovative\nstrategies to harness the true potential of data. Traditionally, emphasis has\nbeen on scaling model architectures, resulting in large and complex neural\nnetworks, which can be difficult to train with limited computational resources.\nHowever, independently of the model size, data quality (i.e. amount and\nvariability) is still a major factor that affects model generalization. In this\nwork, we propose a novel technique to exploit available data through the use of\nautomatic data augmentation for the tasks of image classification and semantic\nsegmentation. We introduce the first Differentiable Augmentation Search method\n(DAS) to generate variations of images that can be processed as videos.\nCompared to previous approaches, DAS is extremely fast and flexible, allowing\nthe search on very large search spaces in less than a GPU day. Our intuition is\nthat the increased receptive field in the temporal dimension provided by DAS\ncould lead to benefits also to the spatial receptive field. More specifically,\nwe leverage DAS to guide the reshaping of the spatial receptive field by\nselecting task-dependant transformations. As a result, compared to standard\naugmentation alternatives, we improve in terms of accuracy on ImageNet,\nCifar10, Cifar100, Tiny-ImageNet, Pascal-VOC-2012 and CityScapes datasets when\nplugging-in our DAS over different light-weight video backbones.\n","authors":["Sofia Casarin","Cynthia I. Ugwu","Sergio Escalera","Oswald Lanz"],"pdf_url":"https://arxiv.org/pdf/2403.15194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13964v2","updated":"2024-03-22T13:25:53Z","published":"2023-12-21T15:51:12Z","title":"PIA: Your Personalized Image Animator via Plug-and-Play Modules in\n Text-to-Image Models","summary":" Recent advancements in personalized text-to-image (T2I) models have\nrevolutionized content creation, empowering non-experts to generate stunning\nimages with unique styles. While promising, adding realistic motions into these\npersonalized images by text poses significant challenges in preserving distinct\nstyles, high-fidelity details, and achieving motion controllability by text. In\nthis paper, we present PIA, a Personalized Image Animator that excels in\naligning with condition images, achieving motion controllability by text, and\nthe compatibility with various personalized T2I models without specific tuning.\nTo achieve these goals, PIA builds upon a base T2I model with well-trained\ntemporal alignment layers, allowing for the seamless transformation of any\npersonalized T2I model into an image animation model. A key component of PIA is\nthe introduction of the condition module, which utilizes the condition frame\nand inter-frame affinity as input to transfer appearance information guided by\nthe affinity hint for individual frame synthesis in the latent space. 
This\ndesign mitigates the challenges of appearance-related image alignment within\nand allows for a stronger focus on aligning with motion-related guidance.\n","authors":["Yiming Zhang","Zhening Xing","Yanhong Zeng","Youqing Fang","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2312.13964v2.pdf","comment":"Project page: https://pi-animator.github.io/"},{"id":"http://arxiv.org/abs/2403.15192v1","updated":"2024-03-22T13:24:50Z","published":"2024-03-22T13:24:50Z","title":"SFOD: Spiking Fusion Object Detector","summary":" Event cameras, characterized by high temporal resolution, high dynamic range,\nlow power consumption, and high pixel bandwidth, offer unique capabilities for\nobject detection in specialized contexts. Despite these advantages, the\ninherent sparsity and asynchrony of event data pose challenges to existing\nobject detection algorithms. Spiking Neural Networks (SNNs), inspired by the\nway the human brain codes and processes information, offer a potential solution\nto these difficulties. However, their performance in object detection using\nevent cameras is limited in current implementations. In this paper, we propose\nthe Spiking Fusion Object Detector (SFOD), a simple and efficient approach to\nSNN-based object detection. Specifically, we design a Spiking Fusion Module,\nachieving the first-time fusion of feature maps from different scales in SNNs\napplied to event cameras. Additionally, through integrating our analysis and\nexperiments conducted during the pretraining of the backbone network on the\nNCAR dataset, we delve deeply into the impact of spiking decoding strategies\nand loss functions on model performance. Thereby, we establish state-of-the-art\nclassification results based on SNNs, achieving 93.7\\% accuracy on the NCAR\ndataset. Experimental results on the GEN1 detection dataset demonstrate that\nthe SFOD achieves a state-of-the-art mAP of 32.1\\%, outperforming existing\nSNN-based approaches. Our research not only underscores the potential of SNNs\nin object detection with event cameras but also propels the advancement of\nSNNs. Code is available at https://github.com/yimeng-fan/SFOD.\n","authors":["Yimeng Fan","Wei Zhang","Changsong Liu","Mingyang Li","Wenrui Lu"],"pdf_url":"https://arxiv.org/pdf/2403.15192v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2306.14899v2","updated":"2024-03-22T13:24:35Z","published":"2023-06-26T17:59:55Z","title":"FunQA: Towards Surprising Video Comprehension","summary":" Surprising videos, such as funny clips, creative performances, or visual\nillusions, attract significant attention. Enjoyment of these videos is not\nsimply a response to visual stimuli; rather, it hinges on the human capacity to\nunderstand (and appreciate) commonsense violations depicted in these videos. We\nintroduce FunQA, a challenging video question-answering (QA) dataset\nspecifically designed to evaluate and enhance the depth of video reasoning\nbased on counter-intuitive and fun videos. Unlike most video QA benchmarks\nwhich focus on less surprising contexts, e.g., cooking or instructional videos,\nFunQA covers three previously unexplored types of surprising videos: 1)\nHumorQA, 2) CreativeQA, and 3) MagicQA. For each subset, we establish rigorous\nQA tasks designed to assess the model's capability in counter-intuitive\ntimestamp localization, detailed video description, and reasoning around\ncounter-intuitiveness. 
We also pose higher-level tasks, such as attributing a\nfitting and vivid title to the video and scoring the video creativity. In\ntotal, the FunQA benchmark consists of 312K free-text QA pairs derived from\n4.3K video clips, spanning a total of 24 video hours. Moreover, we propose\nFunMentor, an agent designed for Vision-Language Models (VLMs) that uses\nmulti-turn dialogues to enhance models' understanding of counter-intuitiveness.\nExtensive experiments with existing VLMs demonstrate the effectiveness of\nFunMentor and reveal significant performance gaps for the FunQA videos across\nspatial-temporal reasoning, visual-centered reasoning, and free-text\ngeneration.\n","authors":["Binzhu Xie","Sicheng Zhang","Zitang Zhou","Bo Li","Yuanhan Zhang","Jack Hessel","Jingkang Yang","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2306.14899v2.pdf","comment":"Project Page: https://funqa-benchmark.github.io/ Codebase:\n https://github.com/Jingkang50/FunQA"},{"id":"http://arxiv.org/abs/2403.15182v1","updated":"2024-03-22T13:11:26Z","published":"2024-03-22T13:11:26Z","title":"PDE-CNNs: Axiomatic Derivations and Applications","summary":" PDE-based Group Convolutional Neural Networks (PDE-G-CNNs) utilize solvers of\ngeometrically meaningful evolution PDEs as substitutes for the conventional\ncomponents in G-CNNs. PDE-G-CNNs offer several key benefits all at once: fewer\nparameters, inherent equivariance, better performance, data efficiency, and\ngeometric interpretability. In this article we focus on Euclidean equivariant\nPDE-G-CNNs where the feature maps are two dimensional throughout. We call this\nvariant of the framework a PDE-CNN. We list several practically desirable\naxioms and derive from these which PDEs should be used in a PDE-CNN. Here our\napproach to geometric learning via PDEs is inspired by the axioms of classical\nlinear and morphological scale-space theory, which we generalize by introducing\nsemifield-valued signals. Furthermore, we experimentally confirm for small\nnetworks that PDE-CNNs offer fewer parameters, better performance, and data\nefficiency in comparison to CNNs. We also investigate what effect the use of\ndifferent semifields has on the performance of the models.\n","authors":["Gijs Bellaard","Sei Sakata","Bart M. N. Smets","Remco Duits"],"pdf_url":"https://arxiv.org/pdf/2403.15182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.08709v2","updated":"2024-03-22T12:55:14Z","published":"2023-04-18T02:45:18Z","title":"You Only Need Two Detectors to Achieve Multi-Modal 3D Multi-Object\n Tracking","summary":" In the classical tracking-by-detection (TBD) paradigm, detection and tracking\nare separately and sequentially conducted, and data association must be\nproperly performed to achieve satisfactory tracking performance. In this paper,\na new end-to-end multi-object tracking framework is proposed, which integrates\nobject detection and multi-object tracking into a single model. The proposed\ntracking framework eliminates the complex data association process in the\nclassical TBD paradigm, and requires no additional training. Secondly, the\nregression confidence of historical trajectories is investigated, and the\npossible states of a trajectory (weak object or strong object) in the current\nframe are predicted. Then, a confidence fusion module is designed to guide\nnon-maximum suppression for trajectories and detections to achieve ordered and\nrobust tracking. 
Thirdly, by integrating historical trajectory features, the\nregression performance of the detector is enhanced, which better reflects the\nocclusion and disappearance patterns of objects in real world. Lastly,\nextensive experiments are conducted on the commonly used KITTI and Waymo\ndatasets. The results show that the proposed framework can achieve robust\ntracking by using only a 2D detector and a 3D detector, and it is proven more\naccurate than many of the state-of-the-art TBD-based multi-modal tracking\nmethods. The source codes of the proposed method are available at\nhttps://github.com/wangxiyang2022/YONTD-MOT.\n","authors":["Xiyang Wang","Chunyun Fu","Jiawei He","Mingguang Huang","Ting Meng","Siyu Zhang","Hangning Zhou","Ziyao Xu","Chi Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.08709v2.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.15173v1","updated":"2024-03-22T12:54:33Z","published":"2024-03-22T12:54:33Z","title":"LSK3DNet: Towards Effective and Efficient 3D Perception with Large\n Sparse Kernels","summary":" Autonomous systems need to process large-scale, sparse, and irregular point\nclouds with limited compute resources. Consequently, it is essential to develop\nLiDAR perception methods that are both efficient and effective. Although\nnaively enlarging 3D kernel size can enhance performance, it will also lead to\na cubically-increasing overhead. Therefore, it is crucial to develop\nstreamlined 3D large kernel designs that eliminate redundant weights and work\neffectively with larger kernels. In this paper, we propose an efficient and\neffective Large Sparse Kernel 3D Neural Network (LSK3DNet) that leverages\ndynamic pruning to amplify the 3D kernel size. Our method comprises two core\ncomponents: Spatial-wise Dynamic Sparsity (SDS) and Channel-wise Weight\nSelection (CWS). SDS dynamically prunes and regrows volumetric weights from the\nbeginning to learn a large sparse 3D kernel. It not only boosts performance but\nalso significantly reduces model size and computational cost. Moreover, CWS\nselects the most important channels for 3D convolution during training and\nsubsequently prunes the redundant channels to accelerate inference for 3D\nvision tasks. We demonstrate the effectiveness of LSK3DNet on three benchmark\ndatasets and five tracks compared with classical models and large kernel\ndesigns. Notably, LSK3DNet achieves the state-of-the-art performance on\nSemanticKITTI (i.e., 75.6% on single-scan and 63.4% on multi-scan), with\nroughly 40% model size reduction and 60% computing operations reduction\ncompared to the naive large 3D kernel model.\n","authors":["Tuo Feng","Wenguan Wang","Fan Ma","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2403.15173v1.pdf","comment":"Accepted at CVPR 2024; Project page:\n https://github.com/FengZicai/LSK3DNet"},{"id":"http://arxiv.org/abs/2403.13248v2","updated":"2024-03-22T12:43:56Z","published":"2024-03-20T02:19:21Z","title":"Mora: Enabling Generalist Video Generation via A Multi-Agent Framework","summary":" Sora is the first large-scale generalist video generation model that garnered\nsignificant attention across society. Since its launch by OpenAI in February\n2024, no other video generation models have paralleled {Sora}'s performance or\nits capacity to support a broad spectrum of video generation tasks.\nAdditionally, there are only a few fully published video generation models,\nwith the majority being closed-source. 
To address this gap, this paper proposes\na new multi-agent framework, Mora, which incorporates several advanced visual AI\nagents to replicate the generalist video generation demonstrated by Sora. In\nparticular, Mora can utilize multiple visual agents and successfully mimic\nSora's video generation capabilities in various tasks, such as (1)\ntext-to-video generation, (2) text-conditional image-to-video generation, (3)\nextending generated videos, (4) video-to-video editing, (5) connecting videos,\nand (6) simulating digital worlds. Our extensive experimental results show that\nMora achieves performance close to that of Sora in various tasks.\nHowever, there exists an obvious performance gap between our work and Sora when\nassessed holistically. In summary, we hope this project can guide the future\ntrajectory of video generation through collaborative AI agents.\n","authors":["Zhengqing Yuan","Ruoxi Chen","Zhaoxu Li","Haolong Jia","Lifang He","Chi Wang","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2403.13248v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07846v3","updated":"2024-03-22T12:41:50Z","published":"2023-09-14T16:40:44Z","title":"MC-NeRF: Multi-Camera Neural Radiance Fields for Multi-Camera Image\n Acquisition Systems","summary":" Neural Radiance Fields (NeRF) use multi-view images for 3D scene\nrepresentation, demonstrating remarkable performance. As one of the primary\nsources of multi-view images, multi-camera systems encounter challenges such as\nvarying intrinsic parameters and frequent pose changes. Most previous\nNeRF-based methods assume a unique camera and rarely consider multi-camera\nscenarios. Besides, some NeRF methods that can optimize intrinsic and extrinsic\nparameters still remain susceptible to suboptimal solutions when these\nparameters are poorly initialized. In this paper, we propose MC-NeRF, a method\nthat enables joint optimization of both intrinsic and extrinsic parameters\nalongside NeRF. The method also allows each image to correspond to\nindependent camera parameters. First, we tackle the coupling issue and the\ndegenerate case that arise from the joint optimization between intrinsic and\nextrinsic parameters. Second, based on the proposed solutions, we introduce an\nefficient calibration image acquisition scheme for multi-camera systems,\nincluding the design of the calibration object. Finally, we present an end-to-end\nnetwork with a training sequence that enables the estimation of intrinsic and\nextrinsic parameters, along with the rendering network. Furthermore,\nrecognizing that most existing datasets are designed for a unique camera, we\nconstruct a real multi-camera image acquisition system and create a\ncorresponding new dataset, which includes both simulated data and real-world\ncaptured images. Experiments confirm the effectiveness of our method when each\nimage corresponds to different camera parameters. 
Specifically, we use\nmulti-cameras, each with different intrinsic and extrinsic parameters in\nreal-world system, to achieve 3D scene representation without providing initial\nposes.\n","authors":["Yu Gao","Lutong Su","Hao Liang","Yufeng Yue","Yi Yang","Mengyin Fu"],"pdf_url":"https://arxiv.org/pdf/2309.07846v3.pdf","comment":"This manuscript is currently under review"},{"id":"http://arxiv.org/abs/2312.04964v2","updated":"2024-03-22T12:34:13Z","published":"2023-12-07T12:09:56Z","title":"ZePT: Zero-Shot Pan-Tumor Segmentation via Query-Disentangling and\n Self-Prompting","summary":" The long-tailed distribution problem in medical image analysis reflects a\nhigh prevalence of common conditions and a low prevalence of rare ones, which\nposes a significant challenge in developing a unified model capable of\nidentifying rare or novel tumor categories not encountered during training. In\nthis paper, we propose a new zero-shot pan-tumor segmentation framework (ZePT)\nbased on query-disentangling and self-prompting to segment unseen tumor\ncategories beyond the training set. ZePT disentangles the object queries into\ntwo subsets and trains them in two stages. Initially, it learns a set of\nfundamental queries for organ segmentation through an object-aware feature\ngrouping strategy, which gathers organ-level visual features. Subsequently, it\nrefines the other set of advanced queries that focus on the auto-generated\nvisual prompts for unseen tumor segmentation. Moreover, we introduce\nquery-knowledge alignment at the feature level to enhance each query's\ndiscriminative representation and generalizability. Extensive experiments on\nvarious tumor segmentation tasks demonstrate the performance superiority of\nZePT, which surpasses the previous counterparts and evidence the promising\nability for zero-shot tumor segmentation in real-world settings.\n","authors":["Yankai Jiang","Zhongzhen Huang","Rongzhao Zhang","Xiaofan Zhang","Shaoting Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.04964v2.pdf","comment":"This paper has been accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.07359v3","updated":"2024-03-22T12:33:51Z","published":"2024-03-12T06:45:34Z","title":"FSC: Few-point Shape Completion","summary":" While previous studies have demonstrated successful 3D object shape\ncompletion with a sufficient number of points, they often fail in scenarios\nwhen a few points, e.g. tens of points, are observed. Surprisingly, via entropy\nanalysis, we find that even a few points, e.g. 64 points, could retain\nsubstantial information to help recover the 3D shape of the object. To address\nthe challenge of shape completion with very sparse point clouds, we then\npropose Few-point Shape Completion (FSC) model, which contains a novel\ndual-branch feature extractor for handling extremely sparse inputs, coupled\nwith an extensive branch for maximal point utilization with a saliency branch\nfor dynamic importance assignment. This model is further bolstered by a\ntwo-stage revision network that refines both the extracted features and the\ndecoder output, enhancing the detail and authenticity of the completed point\ncloud. Our experiments demonstrate the feasibility of recovering 3D shapes from\na few points. 
The proposed Few-point Shape Completion (FSC) model outperforms\nprevious methods on both few-point inputs and many-point inputs, and shows good\ngeneralizability to different object categories.\n","authors":["Xianzu Wu","Xianfeng Wu","Tianyu Luan","Yajing Bai","Zhongyuan Lai","Junsong Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.07359v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.15161v1","updated":"2024-03-22T12:20:23Z","published":"2024-03-22T12:20:23Z","title":"FastCAD: Real-Time CAD Retrieval and Alignment from Scans and Videos","summary":" Digitising the 3D world into a clean, CAD model-based representation has\nimportant applications for augmented reality and robotics. Current\nstate-of-the-art methods are computationally intensive as they individually\nencode each detected object and optimise CAD alignments in a second stage. In\nthis work, we propose FastCAD, a real-time method that simultaneously retrieves\nand aligns CAD models for all objects in a given scene. In contrast to previous\nworks, we directly predict alignment parameters and shape embeddings. We\nachieve high-quality shape retrievals by learning CAD embeddings in a\ncontrastive learning framework and distilling those into FastCAD. Our\nsingle-stage method accelerates the inference time by a factor of 50 compared\nto other methods operating on RGB-D scans while outperforming them on the\nchallenging Scan2CAD alignment benchmark. Further, our approach collaborates\nseamlessly with online 3D reconstruction techniques. This enables the real-time\ngeneration of precise CAD model-based reconstructions from videos at 10 FPS.\nDoing so, we significantly improve the Scan2CAD alignment accuracy in the video\nsetting from 43.0% to 48.2% and the reconstruction accuracy from 22.9% to\n29.6%.\n","authors":["Florian Langer","Jihong Ju","Georgi Dikov","Gerhard Reitmayr","Mohsen Ghafoorian"],"pdf_url":"https://arxiv.org/pdf/2403.15161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15156v1","updated":"2024-03-22T12:11:06Z","published":"2024-03-22T12:11:06Z","title":"Infrastructure-Assisted Collaborative Perception in Automated Valet\n Parking: A Safety Perspective","summary":" Environmental perception in Automated Valet Parking (AVP) has been a\nchallenging task due to severe occlusions in parking garages. Although\nCollaborative Perception (CP) can be applied to broaden the field of view of\nconnected vehicles, the limited bandwidth of vehicular communications restricts\nits application. In this work, we propose a BEV feature-based CP network\narchitecture for infrastructure-assisted AVP systems. The model takes the\nroadside camera and LiDAR as optional inputs and adaptively fuses them with\nonboard sensors in a unified BEV representation. Autoencoder and downsampling\nare applied for channel-wise and spatial-wise dimension reduction, while\nsparsification and quantization further compress the feature map with little\nloss in data precision. Combining these techniques, the size of a BEV feature\nmap is effectively compressed to fit in the feasible data rate of the NR-V2X\nnetwork. With the synthetic AVP dataset, we observe that CP can effectively\nincrease perception performance, especially for pedestrians. 
Moreover, the\nadvantage of infrastructure-assisted CP is demonstrated in two typical\nsafety-critical scenarios in the AVP setting, increasing the maximum safe\ncruising speed by up to 3m/s in both scenarios.\n","authors":["Yukuan Jia","Jiawen Zhang","Shimeng Lu","Baokang Fan","Ruiqing Mao","Sheng Zhou","Zhisheng Niu"],"pdf_url":"https://arxiv.org/pdf/2403.15156v1.pdf","comment":"7 pages, 7 figures, 4 tables, accepted by IEEE VTC2024-Spring"},{"id":"http://arxiv.org/abs/2403.15152v1","updated":"2024-03-22T12:08:16Z","published":"2024-03-22T12:08:16Z","title":"A Multimodal Approach for Cross-Domain Image Retrieval","summary":" Image generators are gaining vast amount of popularity and have rapidly\nchanged how digital content is created. With the latest AI technology, millions\nof high quality images are being generated by the public, which are constantly\nmotivating the research community to push the limits of generative models to\ncreate more complex and realistic images. This paper focuses on Cross-Domain\nImage Retrieval (CDIR) which can be used as an additional tool to inspect\ncollections of generated images by determining the level of similarity between\nimages in a dataset. An ideal retrieval system would be able to generalize to\nunseen complex images from multiple domains (e.g., photos, drawings and\npaintings). To address this goal, we propose a novel caption-matching approach\nthat leverages multimodal language-vision architectures pre-trained on large\ndatasets. The method is tested on DomainNet and Office-Home datasets and\nconsistently achieves state-of-the-art performance over the latest approaches\nin the literature for cross-domain image retrieval. In order to verify the\neffectiveness with AI-generated images, the method was also put to test with a\ndatabase composed by samples collected from Midjourney, which is a widely used\ngenerative platform for content creation.\n","authors":["Lucas Iijima","Tania Stathaki"],"pdf_url":"https://arxiv.org/pdf/2403.15152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15150v1","updated":"2024-03-22T12:06:40Z","published":"2024-03-22T12:06:40Z","title":"An In-Depth Analysis of Data Reduction Methods for Sustainable Deep\n Learning","summary":" In recent years, Deep Learning has gained popularity for its ability to solve\ncomplex classification tasks, increasingly delivering better results thanks to\nthe development of more accurate models, the availability of huge volumes of\ndata and the improved computational capabilities of modern computers. However,\nthese improvements in performance also bring efficiency problems, related to\nthe storage of datasets and models, and to the waste of energy and time\ninvolved in both the training and inference processes. In this context, data\nreduction can help reduce energy consumption when training a deep learning\nmodel. In this paper, we present up to eight different methods to reduce the\nsize of a tabular training dataset, and we develop a Python package to apply\nthem. We also introduce a representativeness metric based on topology to\nmeasure how similar are the reduced datasets and the full training dataset.\nAdditionally, we develop a methodology to apply these data reduction methods to\nimage datasets for object detection tasks. 
Finally, we experimentally compare\nhow these data reduction methods affect the representativeness of the reduced\ndataset, the energy consumption and the predictive performance of the model.\n","authors":["Víctor Toscano-Durán","Javier Perera-Lago","Eduardo Paluzo-Hidalgo","Rocío Gonzalez-Diaz","Miguel Ángel Gutierrez-Naranjo","Matteo Rucco"],"pdf_url":"https://arxiv.org/pdf/2403.15150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12550v2","updated":"2024-03-22T12:05:53Z","published":"2024-03-19T08:49:48Z","title":"RGBD GS-ICP SLAM","summary":" Simultaneous Localization and Mapping (SLAM) with dense representation plays\na key role in robotics, Virtual Reality (VR), and Augmented Reality (AR)\napplications. Recent advancements in dense representation SLAM have highlighted\nthe potential of leveraging neural scene representation and 3D Gaussian\nrepresentation for high-fidelity spatial representation. In this paper, we\npropose a novel dense representation SLAM approach with a fusion of Generalized\nIterative Closest Point (G-ICP) and 3D Gaussian Splatting (3DGS). In contrast\nto existing methods, we utilize a single Gaussian map for both tracking and\nmapping, resulting in mutual benefits. Through the exchange of covariances\nbetween tracking and mapping processes with scale alignment techniques, we\nminimize redundant computations and achieve an efficient system. Additionally,\nwe enhance tracking accuracy and mapping quality through our keyframe selection\nmethods. Experimental results demonstrate the effectiveness of our approach,\nshowing an incredibly fast speed up to 107 FPS (for the entire system) and\nsuperior quality of the reconstructed map.\n","authors":["Seongbo Ha","Jiung Yeon","Hyeonwoo Yu"],"pdf_url":"https://arxiv.org/pdf/2403.12550v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15143v1","updated":"2024-03-22T11:53:03Z","published":"2024-03-22T11:53:03Z","title":"Modular Deep Active Learning Framework for Image Annotation: A Technical\n Report for the Ophthalmo-AI Project","summary":" Image annotation is one of the most essential tasks for guaranteeing proper\ntreatment for patients and tracking progress over the course of therapy in the\nfield of medical imaging and disease diagnosis. However, manually annotating a\nlot of 2D and 3D imaging data can be extremely tedious. Deep Learning (DL)\nbased segmentation algorithms have completely transformed this process and made\nit possible to automate image segmentation. By accurately segmenting medical\nimages, these algorithms can greatly minimize the time and effort necessary for\nmanual annotation. Additionally, by incorporating Active Learning (AL) methods,\nthese segmentation algorithms can perform far more effectively with a smaller\namount of ground truth data. We introduce MedDeepCyleAL, an end-to-end\nframework implementing the complete AL cycle. It provides researchers with the\nflexibility to choose the type of deep learning model they wish to employ and\nincludes an annotation tool that supports the classification and segmentation\nof medical images. The user-friendly interface allows for easy alteration of\nthe AL and DL model settings through a configuration file, requiring no prior\nprogramming experience. 
While MedDeepCyleAL can be applied to any kind of image\ndata, we have specifically applied it to ophthalmology data in this project.\n","authors":["Md Abdul Kadir","Hasan Md Tusfiqur Alam","Pascale Maul","Hans-Jürgen Profitlich","Moritz Wolf","Daniel Sonntag"],"pdf_url":"https://arxiv.org/pdf/2403.15143v1.pdf","comment":"DFKI Technical Report"},{"id":"http://arxiv.org/abs/2403.15139v1","updated":"2024-03-22T11:48:09Z","published":"2024-03-22T11:48:09Z","title":"Deep Generative Model based Rate-Distortion for Image Downscaling\n Assessment","summary":" In this paper, we propose Image Downscaling Assessment by Rate-Distortion\n(IDA-RD), a novel measure to quantitatively evaluate image downscaling\nalgorithms. In contrast to image-based methods that measure the quality of\ndownscaled images, ours is process-based that draws ideas from rate-distortion\ntheory to measure the distortion incurred during downscaling. Our main idea is\nthat downscaling and super-resolution (SR) can be viewed as the encoding and\ndecoding processes in the rate-distortion model, respectively, and that a\ndownscaling algorithm that preserves more details in the resulting\nlow-resolution (LR) images should lead to less distorted high-resolution (HR)\nimages in SR. In other words, the distortion should increase as the downscaling\nalgorithm deteriorates. However, it is non-trivial to measure this distortion\nas it requires the SR algorithm to be blind and stochastic. Our key insight is\nthat such requirements can be met by recent SR algorithms based on deep\ngenerative models that can find all matching HR images for a given LR image on\ntheir learned image manifolds. Extensive experimental results show the\neffectiveness of our IDA-RD measure.\n","authors":["Yuanbang Liang","Bhavesh Garg","Paul L Rosin","Yipeng Qin"],"pdf_url":"https://arxiv.org/pdf/2403.15139v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11220v3","updated":"2024-03-22T11:42:40Z","published":"2024-03-17T13:43:10Z","title":"CPA-Enhancer: Chain-of-Thought Prompted Adaptive Enhancer for Object\n Detection under Unknown Degradations","summary":" Object detection methods under known single degradations have been\nextensively investigated. However, existing approaches require prior knowledge\nof the degradation type and train a separate model for each, limiting their\npractical applications in unpredictable environments. To address this\nchallenge, we propose a chain-of-thought (CoT) prompted adaptive enhancer,\nCPA-Enhancer, for object detection under unknown degradations. Specifically,\nCPA-Enhancer progressively adapts its enhancement strategy under the\nstep-by-step guidance of CoT prompts, that encode degradation-related\ninformation. To the best of our knowledge, it's the first work that exploits\nCoT prompting for object detection tasks. Overall, CPA-Enhancer is a\nplug-and-play enhancement model that can be integrated into any generic\ndetectors to achieve substantial gains on degraded images, without knowing the\ndegradation type priorly. 
Experimental results demonstrate that CPA-Enhancer\nnot only sets the new state of the art for object detection but also boosts the\nperformance of other downstream vision tasks under unknown degradations.\n","authors":["Yuwei Zhang","Yan Wu","Yanming Liu","Xinyue Peng"],"pdf_url":"https://arxiv.org/pdf/2403.11220v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13408v2","updated":"2024-03-22T11:41:38Z","published":"2024-03-20T08:50:15Z","title":"S2DM: Sector-Shaped Diffusion Models for Video Generation","summary":" Diffusion models have achieved great success in image generation. However,\nwhen leveraging this idea for video generation, we face significant challenges\nin maintaining the consistency and continuity across video frames. This is\nmainly caused by the lack of an effective framework to align frames of videos\nwith desired temporal features while preserving consistent semantic and\nstochastic features. In this work, we propose a novel Sector-Shaped Diffusion\nModel (S2DM) whose sector-shaped diffusion region is formed by a set of\nray-shaped reverse diffusion processes starting at the same noise point. S2DM\ncan generate a group of intrinsically related data sharing the same semantic\nand stochastic features while varying on temporal features with appropriate\nguided conditions. We apply S2DM to video generation tasks, and explore the use\nof optical flow as temporal conditions. Our experimental results show that S2DM\noutperforms many existing methods in the task of video generation without any\ntemporal-feature modelling modules. For text-to-video generation tasks where\ntemporal conditions are not explicitly given, we propose a two-stage generation\nstrategy which can decouple the generation of temporal features from\nsemantic-content features. We show that, without additional training, our model\nintegrated with another temporal conditions generative model can still achieve\ncomparable performance with existing works. Our results can be viewd at\nhttps://s2dm.github.io/S2DM/.\n","authors":["Haoran Lang","Yuxuan Ge","Zheng Tian"],"pdf_url":"https://arxiv.org/pdf/2403.13408v2.pdf","comment":"17 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.15132v1","updated":"2024-03-22T11:33:04Z","published":"2024-03-22T11:33:04Z","title":"Transfer CLIP for Generalizable Image Denoising","summary":" Image denoising is a fundamental task in computer vision. While prevailing\ndeep learning-based supervised and self-supervised methods have excelled in\neliminating in-distribution noise, their susceptibility to out-of-distribution\n(OOD) noise remains a significant challenge. The recent emergence of\ncontrastive language-image pre-training (CLIP) model has showcased exceptional\ncapabilities in open-world image recognition and segmentation. Yet, the\npotential for leveraging CLIP to enhance the robustness of low-level tasks\nremains largely unexplored. This paper uncovers that certain dense features\nextracted from the frozen ResNet image encoder of CLIP exhibit\ndistortion-invariant and content-related properties, which are highly desirable\nfor generalizable denoising. Leveraging these properties, we devise an\nasymmetrical encoder-decoder denoising network, which incorporates dense\nfeatures including the noisy image and its multi-scale features from the frozen\nResNet encoder of CLIP into a learnable image decoder to achieve generalizable\ndenoising. 
The progressive feature augmentation strategy is further proposed to\nmitigate feature overfitting and improve the robustness of the learnable\ndecoder. Extensive experiments and comparisons conducted across diverse OOD\nnoises, including synthetic noise, real-world sRGB noise, and low-dose CT image\nnoise, demonstrate the superior generalization ability of our method.\n","authors":["Jun Cheng","Dong Liang","Shan Tan"],"pdf_url":"https://arxiv.org/pdf/2403.15132v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.15127v1","updated":"2024-03-22T11:30:10Z","published":"2024-03-22T11:30:10Z","title":"Gradient-based Sampling for Class Imbalanced Semi-supervised Object\n Detection","summary":" Current semi-supervised object detection (SSOD) algorithms typically assume\nclass balanced datasets (PASCAL VOC etc.) or slightly class imbalanced datasets\n(MS-COCO, etc). This assumption can be easily violated since real world\ndatasets can be extremely class imbalanced in nature, thus making the\nperformance of semi-supervised object detectors far from satisfactory. Besides,\nthe research for this problem in SSOD is severely under-explored. To bridge\nthis research gap, we comprehensively study the class imbalance problem for\nSSOD under more challenging scenarios, thus forming the first experimental\nsetting for class imbalanced SSOD (CI-SSOD). Moreover, we propose a simple yet\neffective gradient-based sampling framework that tackles the class imbalance\nproblem from the perspective of two types of confirmation biases. To tackle\nconfirmation bias towards majority classes, the gradient-based reweighting and\ngradient-based thresholding modules leverage the gradients from each class to\nfully balance the influence of the majority and minority classes. To tackle the\nconfirmation bias from incorrect pseudo labels of minority classes, the\nclass-rebalancing sampling module resamples unlabeled data following the\nguidance of the gradient-based reweighting module. Experiments on three\nproposed sub-tasks, namely MS-COCO, MS-COCO to Object365 and LVIS, suggest that\nour method outperforms current class imbalanced object detectors by clear\nmargins, serving as a baseline for future research in CI-SSOD. Code will be\navailable at https://github.com/nightkeepers/CI-SSOD.\n","authors":["Jiaming Li","Xiangru Lin","Wei Zhang","Xiao Tan","Yingying Li","Junyu Han","Errui Ding","Jingdong Wang","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2403.15127v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2403.15124v1","updated":"2024-03-22T11:27:43Z","published":"2024-03-22T11:27:43Z","title":"EndoGSLAM: Real-Time Dense Reconstruction and Tracking in Endoscopic\n Surgeries using Gaussian Splatting","summary":" Precise camera tracking, high-fidelity 3D tissue reconstruction, and\nreal-time online visualization are critical for intrabody medical imaging\ndevices such as endoscopes and capsule robots. However, existing SLAM\n(Simultaneous Localization and Mapping) methods often struggle to achieve both\ncomplete high-quality surgical field reconstruction and efficient computation,\nrestricting their intraoperative applications among endoscopic surgeries. In\nthis paper, we introduce EndoGSLAM, an efficient SLAM approach for endoscopic\nsurgeries, which integrates streamlined Gaussian representation and\ndifferentiable rasterization to facilitate over 100 fps rendering speed during\nonline camera tracking and tissue reconstructing. 
Extensive experiments show\nthat EndoGSLAM achieves a better trade-off between intraoperative availability\nand reconstruction quality than traditional or neural SLAM approaches, showing\ntremendous potential for endoscopic surgeries. The project page is at\nhttps://EndoGSLAM.loping151.com\n","authors":["Kailing Wang","Chen Yang","Yuehao Wang","Sikuang Li","Yan Wang","Qi Dou","Xiaokang Yang","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2403.15124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15121v1","updated":"2024-03-22T11:24:31Z","published":"2024-03-22T11:24:31Z","title":"SYNCS: Synthetic Data and Contrastive Self-Supervised Training for\n Central Sulcus Segmentation","summary":" Bipolar disorder (BD) and schizophrenia (SZ) are severe mental disorders with\nprofound societal impact. Identifying risk markers early is crucial for\nunderstanding disease progression and enabling preventive measures. The Danish\nHigh Risk and Resilience Study (VIA) focuses on understanding early disease\nprocesses, particularly in children with familial high risk (FHR).\nUnderstanding structural brain changes associated with these diseases during\nearly stages is essential for effective interventions. The central sulcus (CS)\nis a prominent brain landmark related to brain regions involved in motor and\nsensory processing. Analyzing CS morphology can provide valuable insights into\nneurodevelopmental abnormalities in the FHR group. However, segmenting the\ncentral sulcus (CS) presents challenges due to its variability, especially in\nadolescents. This study introduces two novel approaches to improve CS\nsegmentation: synthetic data generation to model CS variability and\nself-supervised pre-training with multi-task learning to adapt models to new\ncohorts. These methods aim to enhance segmentation performance across diverse\npopulations, eliminating the need for extensive preprocessing.\n","authors":["Vladyslav Zalevskyi","Kristoffer Hougaard Madsen"],"pdf_url":"https://arxiv.org/pdf/2403.15121v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15119v1","updated":"2024-03-22T11:21:51Z","published":"2024-03-22T11:21:51Z","title":"An Open-World, Diverse, Cross-Spatial-Temporal Benchmark for Dynamic\n Wild Person Re-Identification","summary":" Person re-identification (ReID) has made great strides thanks to the\ndata-driven deep learning techniques. However, the existing benchmark datasets\nlack diversity, and models trained on these data cannot generalize well to\ndynamic wild scenarios. To meet the goal of improving the explicit\ngeneralization of ReID models, we develop a new Open-World, Diverse,\nCross-Spatial-Temporal dataset named OWD with several distinct features. 1)\nDiverse collection scenes: multiple independent open-world and highly dynamic\ncollecting scenes, including streets, intersections, shopping malls, etc. 2)\nDiverse lighting variations: long time spans from daytime to nighttime with\nabundant illumination changes. 3) Diverse person status: multiple camera\nnetworks in all seasons with normal/adverse weather conditions and diverse\npedestrian appearances (e.g., clothes, personal belongings, poses, etc.). 4)\nProtected privacy: invisible faces for privacy critical applications. 
To\nimprove the implicit generalization of ReID, we further propose a Latent Domain\nExpansion (LDE) method to develop the potential of source data, which decouples\ndiscriminative identity-relevant and trustworthy domain-relevant features and\nimplicitly enforces domain-randomized identity feature space expansion with\nricher domain diversity to facilitate domain invariant representations. Our\ncomprehensive evaluations with most benchmark datasets in the community are\ncrucial for progress, although this work is far from the grand goal toward\nopen-world and dynamic wild applications.\n","authors":["Lei Zhang","Xiaowei Fu","Fuxiang Huang","Yi Yang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2403.15119v1.pdf","comment":"Accepted by IJCV in 2024"},{"id":"http://arxiv.org/abs/2403.15107v1","updated":"2024-03-22T10:51:31Z","published":"2024-03-22T10:51:31Z","title":"PseudoTouch: Efficiently Imaging the Surface Feel of Objects for Robotic\n Manipulation","summary":" Humans seemingly incorporate potential touch signals in their perception. Our\ngoal is to equip robots with a similar capability, which we term PseudoTouch.\nPseudoTouch aims to predict the expected touch signal based on a visual patch\nrepresenting the touched area. We frame this problem as the task of learning a\nlow-dimensional visual-tactile embedding, wherein we encode a depth patch from\nwhich we decode the tactile signal. To accomplish this task, we employ ReSkin,\nan inexpensive and replaceable magnetic-based tactile sensor. Using ReSkin, we\ncollect and train PseudoTouch on a dataset comprising aligned tactile and\nvisual data pairs obtained through random touching of eight basic geometric\nshapes. We demonstrate the efficacy of PseudoTouch through its application to\ntwo downstream tasks: object recognition and grasp stability prediction. In the\nobject recognition task, we evaluate the learned embedding's performance on a\nset of five basic geometric shapes and five household objects. Using\nPseudoTouch, we achieve an object recognition accuracy of 84% after just ten\ntouches, surpassing a proprioception baseline. For the grasp stability task, we\nuse ACRONYM labels to train and evaluate a grasp success predictor using\nPseudoTouch's predictions derived from virtual depth information. Our approach\nyields an impressive 32% absolute improvement in accuracy compared to the\nbaseline relying on partial point cloud data. We make the data, code, and\ntrained models publicly available at http://pseudotouch.cs.uni-freiburg.de.\n","authors":["Adrian Röfer","Nick Heppert","Abdallah Ayman","Eugenio Chisari","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2403.15107v1.pdf","comment":"8 pages, 7 figures, 2 tables, submitted to IROS2024"},{"id":"http://arxiv.org/abs/2403.15103v1","updated":"2024-03-22T10:42:25Z","published":"2024-03-22T10:42:25Z","title":"Improving cross-domain brain tissue segmentation in fetal MRI with\n synthetic data","summary":" Segmentation of fetal brain tissue from magnetic resonance imaging (MRI)\nplays a crucial role in the study of in utero neurodevelopment. However,\nautomated tools face substantial domain shift challenges as they must be robust\nto highly heterogeneous clinical data, often limited in numbers and lacking\nannotations. Indeed, high variability of the fetal brain morphology, MRI\nacquisition parameters, and superresolution reconstruction (SR) algorithms\nadversely affect the model's performance when evaluated out-of-domain. 
In this\nwork, we introduce FetalSynthSeg, a domain randomization method to segment\nfetal brain MRI, inspired by SynthSeg. Our results show that models trained\nsolely on synthetic data outperform models trained on real data in out-of-domain\nsettings, validated on a 120-subject cross-domain dataset. Furthermore, we\nextend our evaluation to 40 subjects acquired using low-field (0.55T) MRI and\nreconstructed with novel SR models, showcasing robustness across different\nmagnetic field strengths and SR algorithms. Leveraging a generative synthetic\napproach, we tackle the domain shift problem in fetal brain MRI and offer\ncompelling prospects for applications in fields with limited and highly\nheterogeneous data.\n","authors":["Vladyslav Zalevskyi","Thomas Sanchez","Margaux Roulet","Jordina Aviles Verddera","Jana Hutter","Hamza Kebiri","Meritxell Bach Cuadra"],"pdf_url":"https://arxiv.org/pdf/2403.15103v1.pdf","comment":"10 pages, 5 figures, 1 table"},{"id":"http://arxiv.org/abs/2403.15098v1","updated":"2024-03-22T10:36:50Z","published":"2024-03-22T10:36:50Z","title":"UniTraj: A Unified Framework for Scalable Vehicle Trajectory Prediction","summary":" Vehicle trajectory prediction has increasingly relied on data-driven\nsolutions, but their ability to scale to different data domains and the impact\nof larger dataset sizes on their generalization remain under-explored. While\nthese questions can be studied by employing multiple datasets, it is\nchallenging due to several discrepancies, \textit{e.g.,} in data formats, map\nresolution, and semantic annotation types. To address these challenges, we\nintroduce UniTraj, a comprehensive framework that unifies various datasets,\nmodels, and evaluation criteria, presenting new opportunities for the vehicle\ntrajectory prediction field. In particular, using UniTraj, we conduct extensive\nexperiments and find that model performance significantly drops when\ntransferred to other datasets. However, enlarging data size and diversity can\nsubstantially improve performance, leading to a new state-of-the-art result for\nthe nuScenes dataset. We provide insights into dataset characteristics to\nexplain these findings. The code can be found here:\n\hyperlink{https://github.com/vita-epfl/UniTraj}{https://github.com/vita-epfl/UniTraj}.\n","authors":["Lan Feng","Mohammadhossein Bahari","Kaouther Messaoud Ben Amor","Éloi Zablocki","Matthieu Cord","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2403.15098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00354v3","updated":"2024-03-22T10:36:47Z","published":"2023-09-30T12:17:36Z","title":"AI-Dentify: Deep learning for proximal caries detection on bitewing\n x-ray -- HUNT4 Oral Health Study","summary":" Background: Dental caries diagnosis requires the manual inspection of\ndiagnostic bitewing images of the patient, followed by a visual inspection and\nprobing of the identified dental pieces with potential lesions. Yet the use of\nartificial intelligence, and in particular deep-learning, has the potential to\naid in the diagnosis by providing a quick and informative analysis of the\nbitewing images.\n Methods: A dataset of 13,887 bitewings from the HUNT4 Oral Health Study were\nannotated individually by six different experts, and used to train three\ndifferent object detection deep-learning architectures: RetinaNet (ResNet50),\nYOLOv5 (M size), and EfficientDet (D0 and D1 sizes). A consensus dataset of 197\nimages, annotated jointly by the same six dentists, was used for evaluation. 
A\nfive-fold cross validation scheme was used to evaluate the performance of the\nAI models.\n Results: The trained models show an increase in average precision and\nF1-score, and a decrease in false negative rate, with respect to the dental\nclinicians. When compared against the dental clinicians, the YOLOv5 model shows\nthe largest improvement, reporting 0.647 mean average precision, 0.548 mean\nF1-score, and 0.149 mean false negative rate. Whereas the best annotators on\neach of these metrics reported 0.299, 0.495, and 0.164 respectively.\n Conclusion: Deep-learning models have shown the potential to assist dental\nprofessionals in the diagnosis of caries. Yet, the task remains challenging due\nto the artifacts natural to the bitewing images.\n","authors":["Javier Pérez de Frutos","Ragnhild Holden Helland","Shreya Desai","Line Cathrine Nymoen","Thomas Langø","Theodor Remman","Abhijit Sen"],"pdf_url":"https://arxiv.org/pdf/2310.00354v3.pdf","comment":"24 pages, 5 figure, 7 tables"},{"id":"http://arxiv.org/abs/2304.09793v2","updated":"2024-03-22T10:36:32Z","published":"2023-04-19T16:21:14Z","title":"Event-based Simultaneous Localization and Mapping: A Comprehensive\n Survey","summary":" In recent decades, visual simultaneous localization and mapping (vSLAM) has\ngained significant interest in both academia and industry. It estimates camera\nmotion and reconstructs the environment concurrently using visual sensors on a\nmoving robot. However, conventional cameras are limited by hardware, including\nmotion blur and low dynamic range, which can negatively impact performance in\nchallenging scenarios like high-speed motion and high dynamic range\nillumination. Recent studies have demonstrated that event cameras, a new type\nof bio-inspired visual sensor, offer advantages such as high temporal\nresolution, dynamic range, low power consumption, and low latency. This paper\npresents a timely and comprehensive review of event-based vSLAM algorithms that\nexploit the benefits of asynchronous and irregular event streams for\nlocalization and mapping tasks. The review covers the working principle of\nevent cameras and various event representations for preprocessing event data.\nIt also categorizes event-based vSLAM methods into four main categories:\nfeature-based, direct, motion-compensation, and deep learning methods, with\ndetailed discussions and practical guidance for each approach. Furthermore, the\npaper evaluates the state-of-the-art methods on various benchmarks,\nhighlighting current challenges and future opportunities in this emerging\nresearch area. A public repository will be maintained to keep track of the\nrapid developments in this field at\n{\url{https://github.com/kun150kun/ESLAM-survey}}.\n","authors":["Kunping Huang","Sen Zhang","Jing Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2304.09793v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14370v2","updated":"2024-03-22T10:26:33Z","published":"2024-03-21T12:57:30Z","title":"SyncTweedies: A General Generative Framework Based on Synchronized\n Diffusions","summary":" We introduce a general framework for generating diverse visual content,\nincluding ambiguous images, panorama images, mesh textures, and Gaussian splat\ntextures, by synchronizing multiple diffusion processes. We present exhaustive\ninvestigation into all possible scenarios for synchronizing multiple diffusion\nprocesses through a canonical space and analyze their characteristics across\napplications. 
In doing so, we reveal a previously unexplored case: averaging\nthe outputs of Tweedie's formula while conducting denoising in multiple\ninstance spaces. This case also provides the best quality with the widest\napplicability to downstream tasks. We name this case SyncTweedies. In our\nexperiments generating visual content aforementioned, we demonstrate the\nsuperior quality of generation by SyncTweedies compared to other\nsynchronization methods, optimization-based and iterative-update-based methods.\n","authors":["Jaihoon Kim","Juil Koo","Kyeongmin Yeo","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2403.14370v2.pdf","comment":"Project page: https://synctweedies.github.io/"},{"id":"http://arxiv.org/abs/2402.15756v2","updated":"2024-03-22T10:19:06Z","published":"2024-02-24T08:07:48Z","title":"Detection Is Tracking: Point Cloud Multi-Sweep Deep Learning Models\n Revisited","summary":" Conventional tracking paradigm takes in instantaneous measurements such as\nrange and bearing, and produces object tracks across time. In applications such\nas autonomous driving, lidar measurements in the form of point clouds are\nusually passed through a \"virtual sensor\" realized by a deep learning model, to\nproduce \"measurements\" such as bounding boxes, which are in turn ingested by a\ntracking module to produce object tracks. Very often multiple lidar sweeps are\naccumulated in a buffer to merge and become the input to the virtual sensor. We\nargue in this paper that such an input already contains temporal information,\nand therefore the virtual sensor output should also contain temporal\ninformation, not just instantaneous values for the time corresponding to the\nend of the buffer. In particular, we present the deep learning model called\nMULti-Sweep PAired Detector (MULSPAD) that produces, for each detected object,\na pair of bounding boxes at both the end time and the beginning time of the\ninput buffer. This is achieved with fairly straightforward changes in commonly\nused lidar detection models, and with only marginal extra processing, but the\nresulting symmetry is satisfying. Such paired detections make it possible not\nonly to construct rudimentary trackers fairly easily, but also to construct\nmore sophisticated trackers that can exploit the extra information conveyed by\nthe pair and be robust to choices of motion models and object birth/death\nmodels. We have conducted preliminary training and experimentation using Waymo\nOpen Dataset, which shows the efficacy of our proposed method.\n","authors":["Lingji Chen"],"pdf_url":"https://arxiv.org/pdf/2402.15756v2.pdf","comment":"My previous employer Motional is requiring a review and approval\n process before I can publish this paper"},{"id":"http://arxiv.org/abs/2403.15089v1","updated":"2024-03-22T10:15:53Z","published":"2024-03-22T10:15:53Z","title":"IFSENet : Harnessing Sparse Iterations for Interactive Few-shot\n Segmentation Excellence","summary":" Training a computer vision system to segment a novel class typically requires\ncollecting and painstakingly annotating lots of images with objects from that\nclass. Few-shot segmentation techniques reduce the required number of images to\nlearn to segment a new class, but careful annotations of object boundaries are\nstill required. On the other hand, interactive segmentation techniques only\nfocus on incrementally improving the segmentation of one object at a time\n(typically, using clicks given by an expert) in a class-agnostic manner. 
We\ncombine the two concepts to drastically reduce the effort required to train\nsegmentation models for novel classes. Instead of trivially feeding interactive\nsegmentation masks as ground truth to a few-shot segmentation model, we propose\nIFSENet, which can accept sparse supervision on a single or few support images\nin the form of clicks to generate masks on support (training, at least clicked\nupon once) as well as query (test, never clicked upon) images. To trade-off\neffort for accuracy flexibly, the number of images and clicks can be\nincrementally added to the support set to further improve the segmentation of\nsupport as well as query images. The proposed model approaches the accuracy of\nprevious state-of-the-art few-shot segmentation models with considerably lower\nannotation effort (clicks instead of maps), when tested on Pascal and SBD\ndatasets on query images. It also works well as an interactive segmentation\nmethod on support images.\n","authors":["Shreyas Chandgothia","Ardhendu Sekhar","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2403.15089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15082v1","updated":"2024-03-22T10:06:31Z","published":"2024-03-22T10:06:31Z","title":"Cell Variational Information Bottleneck Network","summary":" In this work, we propose Cell Variational Information Bottleneck Network\n(cellVIB), a convolutional neural network using information bottleneck\nmechanism, which can be combined with the latest feedforward network\narchitecture in an end-to-end training method. Our Cell Variational Information\nBottleneck Network is constructed by stacking VIB cells, which generate feature\nmaps with uncertainty. As layers going deeper, the regularization effect will\ngradually increase, instead of directly adding excessive regular constraints to\nthe output layer of the model as in Deep VIB. Under each VIB cell, the\nfeedforward process learns an independent mean term and an standard deviation\nterm, and predicts the Gaussian distribution based on them. The feedback\nprocess is based on reparameterization trick for effective training. This work\nperforms an extensive analysis on MNIST dataset to verify the effectiveness of\neach VIB cells, and provides an insightful analysis on how the VIB cells affect\nmutual information. Experiments conducted on CIFAR-10 also prove that our\ncellVIB is robust against noisy labels during training and against corrupted\nimages during testing. Then, we validate our method on PACS dataset, whose\nresults show that the VIB cells can significantly improve the generalization\nperformance of the basic model. Finally, in a more complex representation\nlearning task, face recognition, our network structure has also achieved very\ncompetitive results.\n","authors":["Zhonghua Zhai","Chen Ju","Jinsong Lan","Shuai Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.15082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15068v1","updated":"2024-03-22T09:48:50Z","published":"2024-03-22T09:48:50Z","title":"Integrating multiscale topology in digital pathology with pyramidal\n graph convolutional networks","summary":" Graph convolutional networks (GCNs) have emerged as a powerful alternative to\nmultiple instance learning with convolutional neural networks in digital\npathology, offering superior handling of structural information across various\nspatial ranges - a crucial aspect of learning from gigapixel H&E-stained whole\nslide images (WSI). 
However, graph message-passing algorithms often suffer from\noversmoothing when aggregating a large neighborhood. Hence, effective modeling\nof multi-range interactions relies on the careful construction of the graph.\nOur proposed multi-scale GCN (MS-GCN) tackles this issue by leveraging\ninformation across multiple magnification levels in WSIs. MS-GCN enables the\nsimultaneous modeling of long-range structural dependencies at lower\nmagnifications and high-resolution cellular details at higher magnifications,\nakin to analysis pipelines usually conducted by pathologists. The\narchitecture's unique configuration allows for the concurrent modeling of\nstructural patterns at lower magnifications and detailed cellular features at\nhigher ones, while also quantifying the contribution of each magnification\nlevel to the prediction. Through testing on different datasets, MS-GCN\ndemonstrates superior performance over existing single-magnification GCN\nmethods. The enhancement in performance and interpretability afforded by our\nmethod holds promise for advancing computational pathology models, especially\nin tasks requiring extensive spatial context.\n","authors":["Victor Ibañez","Przemyslaw Szostak","Quincy Wong","Konstanty Korski","Samaneh Abbasi-Sureshjani","Alvaro Gomariz"],"pdf_url":"https://arxiv.org/pdf/2403.15068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15064v1","updated":"2024-03-22T09:46:11Z","published":"2024-03-22T09:46:11Z","title":"Recent Trends in 3D Reconstruction of General Non-Rigid Scenes","summary":" Reconstructing models of the real world, including 3D geometry, appearance,\nand motion of real scenes, is essential for computer graphics and computer\nvision. It enables the synthesizing of photorealistic novel views, useful for\nthe movie industry and AR/VR applications. It also facilitates the content\ncreation necessary in computer games and AR/VR by avoiding laborious manual\ndesign processes. Further, such models are fundamental for intelligent\ncomputing systems that need to interpret real-world scenes and actions to act\nand interact safely with the human world. Notably, the world surrounding us is\ndynamic, and reconstructing models of dynamic, non-rigidly moving scenes is a\nseverely underconstrained and challenging problem. This state-of-the-art report\n(STAR) offers the reader a comprehensive summary of state-of-the-art techniques\nwith monocular and multi-view inputs such as data from RGB and RGB-D sensors,\namong others, conveying an understanding of different approaches, their\npotential applications, and promising further research directions. The report\ncovers 3D reconstruction of general non-rigid scenes and further addresses the\ntechniques for scene decomposition, editing and controlling, and generalizable\nand generative modeling. More specifically, we first review the common and\nfundamental concepts necessary to understand and navigate the field and then\ndiscuss the state-of-the-art techniques by reviewing recent approaches that use\ntraditional and machine-learning-based neural representations, including a\ndiscussion on the newly enabled applications. 
The STAR is concluded with a\ndiscussion of the remaining limitations and open challenges.\n","authors":["Raza Yunus","Jan Eric Lenssen","Michael Niemeyer","Yiyi Liao","Christian Rupprecht","Christian Theobalt","Gerard Pons-Moll","Jia-Bin Huang","Vladislav Golyanik","Eddy Ilg"],"pdf_url":"https://arxiv.org/pdf/2403.15064v1.pdf","comment":"42 pages, 18 figures, 5 tables; State-of-the-Art Report at\n EUROGRAPHICS 2024"},{"id":"http://arxiv.org/abs/2403.15063v1","updated":"2024-03-22T09:40:52Z","published":"2024-03-22T09:40:52Z","title":"Towards a Comprehensive, Efficient and Promptable Anatomic Structure\n Segmentation Model using 3D Whole-body CT Scans","summary":" Segment anything model (SAM) demonstrates strong generalization ability on\nnatural image segmentation. However, its direct adaption in medical image\nsegmentation tasks shows significant performance drops with inferior accuracy\nand unstable results. It may also require an excessive number of prompt points\nto obtain a reasonable accuracy. For segmenting 3D radiological CT or MRI\nscans, a 2D SAM model has to separately handle hundreds of 2D slices. Although\nquite a few studies explore adapting SAM into medical image volumes, the\nefficiency of 2D adaption methods is unsatisfactory and 3D adaptation methods are\nonly capable of segmenting specific organs/tumors. In this work, we propose a\ncomprehensive and scalable 3D SAM model for whole-body CT segmentation, named\nCT-SAM3D. Instead of adapting SAM, we propose a 3D promptable segmentation\nmodel using a (nearly) fully labeled CT dataset. To train CT-SAM3D effectively,\nensuring the model's accurate responses to higher-dimensional spatial prompts\nis crucial, and 3D patch-wise training is required due to GPU memory\nconstraints. For this purpose, we propose two key technical developments: 1) a\nprogressively and spatially aligned prompt encoding method to effectively\nencode click prompts in local 3D space; and 2) a cross-patch prompt learning\nscheme to capture more 3D spatial context, which is beneficial for reducing the\nediting workloads when interactively prompting on large organs. CT-SAM3D is\ntrained and validated using a curated dataset of 1204 CT scans containing 107\nwhole-body anatomies, reporting significantly better quantitative performance\nagainst all previous SAM-derived models by a large margin with much fewer click\nprompts. Our model can handle segmenting unseen organs as well. Code, data, and\nour 3D interactive segmentation tool with quasi-real-time responses will be\nmade publicly available.\n","authors":["Heng Guo","Jianfeng Zhang","Jiaxing Huang","Tony C. W. Mok","Dazhou Guo","Ke Yan","Le Lu","Dakai Jin","Minfeng Xu"],"pdf_url":"https://arxiv.org/pdf/2403.15063v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15061v1","updated":"2024-03-22T09:38:16Z","published":"2024-03-22T09:38:16Z","title":"Subjective Quality Assessment of Compressed Tone-Mapped High Dynamic\n Range Videos","summary":" High Dynamic Range (HDR) videos are able to represent wider ranges of\ncontrasts and colors than Standard Dynamic Range (SDR) videos, giving more\nvivid experiences. Due to this, HDR videos are expected to grow into the\ndominant video modality of the future. However, HDR videos are incompatible\nwith existing SDR displays, which form the majority of affordable consumer\ndisplays on the market. Because of this, HDR videos must be processed by\ntone-mapping them to reduced bit-depths to service a broad swath of SDR-limited\nvideo consumers. 
Here, we analyze the impact of tone-mapping operators on the\nvisual quality of streaming HDR videos. To this end, we built the first\nlarge-scale subjectively annotated open-source database of compressed\ntone-mapped HDR videos, containing 15,000 tone-mapped sequences derived from 40\nunique HDR source contents. The videos in the database were labeled with more\nthan 750,000 subjective quality annotations, collected from more than 1,600\nunique human observers. We demonstrate the usefulness of the new subjective\ndatabase by benchmarking objective models of visual quality on it. We envision\nthat the new LIVE Tone-Mapped HDR (LIVE-TMHDR) database will enable significant\nprogress on HDR video tone mapping and quality assessment in the future. To\nthis end, we make the database freely available to the community at\nhttps://live.ece.utexas.edu/research/LIVE_TMHDR/index.html\n","authors":["Abhinau K. Venkataramanan","Alan C. Bovik"],"pdf_url":"https://arxiv.org/pdf/2403.15061v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12379v4","updated":"2024-03-22T09:36:53Z","published":"2023-12-19T18:11:19Z","title":"Mixture of Cluster-conditional LoRA Experts for Vision-language\n Instruction Tuning","summary":" Instruction tuning of Large Vision-language Models (LVLMs) has revolutionized\nthe development of versatile models with zero-shot generalization across a wide\nrange of downstream vision-language tasks. However, the diversity of training\ntasks of different sources and formats would lead to inevitable task conflicts,\nwhere different tasks conflict for the same set of model parameters, resulting\nin sub-optimal instruction-following abilities. To address that, we propose the\nMixture of Cluster-conditional LoRA Experts (MoCLE), a novel Mixture of Experts\n(MoE) architecture designed to activate the task-customized model parameters\nbased on the instruction clusters. A separate universal expert is further\nincorporated to improve generalization capabilities of MoCLE for novel\ninstructions. Extensive experiments on 11 zero-shot tasks demonstrate the\neffectiveness of MoCLE.\n","authors":["Yunhao Gou","Zhili Liu","Kai Chen","Lanqing Hong","Hang Xu","Aoxue Li","Dit-Yan Yeung","James T. Kwok","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.12379v4.pdf","comment":"Project website: https://gyhdog99.github.io/projects/mocle/"},{"id":"http://arxiv.org/abs/2403.15059v1","updated":"2024-03-22T09:32:31Z","published":"2024-03-22T09:32:31Z","title":"MM-Diff: High-Fidelity Image Personalization via Multi-Modal Condition\n Integration","summary":" Recent advances in tuning-free personalized image generation based on\ndiffusion models are impressive. However, to improve subject fidelity, existing\nmethods either retrain the diffusion model or infuse it with dense visual\nembeddings, both of which suffer from poor generalization and efficiency. Also,\nthese methods falter in multi-subject image generation due to the unconstrained\ncross-attention mechanism. In this paper, we propose MM-Diff, a unified and\ntuning-free image personalization framework capable of generating high-fidelity\nimages of both single and multiple subjects in seconds. Specifically, to\nsimultaneously enhance text consistency and subject fidelity, MM-Diff employs a\nvision encoder to transform the input image into CLS and patch embeddings. 
CLS\nembeddings are used on the one hand to augment the text embeddings, and on the\nother hand together with patch embeddings to derive a small number of\ndetail-rich subject embeddings, both of which are efficiently integrated into\nthe diffusion model through the well-designed multimodal cross-attention\nmechanism. Additionally, MM-Diff introduces cross-attention map constraints\nduring the training phase, ensuring flexible multi-subject image sampling\nduring inference without any predefined inputs (e.g., layout). Extensive\nexperiments demonstrate the superior performance of MM-Diff over other leading\nmethods.\n","authors":["Zhichao Wei","Qingkun Su","Long Qin","Weizhi Wang"],"pdf_url":"https://arxiv.org/pdf/2403.15059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02733v3","updated":"2024-03-22T09:17:24Z","published":"2024-02-05T05:25:33Z","title":"ToonAging: Face Re-Aging upon Artistic Portrait Style Transfer","summary":" Face re-aging is a prominent field in computer vision and graphics, with\nsignificant applications in photorealistic domains such as movies, advertising,\nand live streaming. Recently, the need to apply face re-aging to\nnon-photorealistic images, like comics, illustrations, and animations, has\nemerged as an extension in various entertainment sectors. However, the lack of\na network that can seamlessly edit the apparent age in NPR images has limited\nthese tasks to a naive, sequential approach. This often results in unpleasant\nartifacts and a loss of facial attributes due to domain discrepancies. In this\npaper, we introduce a novel one-stage method for face re-aging combined with\nportrait style transfer, executed in a single generative step. We leverage\nexisting face re-aging and style transfer networks, both trained within the\nsame PR domain. Our method uniquely fuses distinct latent vectors, each\nresponsible for managing aging-related attributes and NPR appearance. By\nadopting an exemplar-based approach, our method offers greater flexibility\ncompared to domain-level fine-tuning approaches, which typically require\nseparate training or fine-tuning for each domain. This effectively addresses\nthe limitation of requiring paired datasets for re-aging and domain-level,\ndata-driven approaches for stylization. Our experiments show that our model can\neffortlessly generate re-aged images while simultaneously transferring the\nstyle of examples, maintaining both natural appearance and controllability.\n","authors":["Bumsoo Kim","Abdul Muqeet","Kyuchul Lee","Sanghyun Seo"],"pdf_url":"https://arxiv.org/pdf/2402.02733v3.pdf","comment":"14 pages, 15 figures, 1 table"},{"id":"http://arxiv.org/abs/2403.15049v1","updated":"2024-03-22T09:15:36Z","published":"2024-03-22T09:15:36Z","title":"Continual Vision-and-Language Navigation","summary":" Vision-and-Language Navigation (VLN) agents navigate to a destination using\nnatural language instructions and the visual information they observe. Existing\nmethods for training VLN agents presuppose fixed datasets, leading to a\nsignificant limitation: the introduction of new environments necessitates\nretraining with previously encountered environments to preserve their\nknowledge. This makes it difficult to train VLN agents that operate in the\never-changing real world. To address this limitation, we present the Continual\nVision-and-Language Navigation (CVLN) paradigm, designed to evaluate agents\ntrained through a continual learning process. 
For the training and evaluation\nof CVLN agents, we re-arrange existing VLN datasets to propose two datasets:\nCVLN-I, focused on navigation via initial-instruction interpretation, and\nCVLN-D, aimed at navigation through dialogue with other agents. Furthermore, we\npropose two novel rehearsal-based methods for CVLN, Perplexity Replay (PerpR)\nand Episodic Self-Replay (ESR). PerpR prioritizes replaying challenging\nepisodes based on action perplexity, while ESR replays previously predicted\naction logits to preserve learned behaviors. We demonstrate the effectiveness\nof the proposed methods on CVLN through extensive experiments.\n","authors":["Seongjun Jeong","Gi-Cheon Kang","Seongho Choi","Joochan Kim","Byoung-Tak Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.15049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15048v1","updated":"2024-03-22T09:13:09Z","published":"2024-03-22T09:13:09Z","title":"Cartoon Hallucinations Detection: Pose-aware In Context Visual Learning","summary":" Large-scale Text-to-Image (TTI) models have become a common approach for\ngenerating training data in various generative fields. However, visual\nhallucinations, which contain perceptually critical defects, remain a concern,\nespecially in non-photorealistic styles like cartoon characters. We propose a\nnovel visual hallucination detection system for cartoon character images\ngenerated by TTI models. Our approach leverages pose-aware in-context visual\nlearning (PA-ICVL) with Vision-Language Models (VLMs), utilizing both RGB\nimages and pose information. By incorporating pose guidance from a fine-tuned\npose estimator, we enable VLMs to make more accurate decisions. Experimental\nresults demonstrate significant improvements in identifying visual\nhallucinations compared to baseline methods relying solely on RGB images. This\nresearch advances TTI models by mitigating visual hallucinations, expanding\ntheir potential in non-photorealistic domains.\n","authors":["Bumsoo Kim","Wonseop Shin","Kyuchul Lee","Sanghyun Seo"],"pdf_url":"https://arxiv.org/pdf/2403.15048v1.pdf","comment":"11 pages, 12 figures, 1 table, Project page:\n https://gh-bumsookim.github.io/Cartoon-Hallucinations-Detection/"},{"id":"http://arxiv.org/abs/2403.09572v2","updated":"2024-03-22T09:07:06Z","published":"2024-03-14T17:03:04Z","title":"Eyes Closed, Safety On: Protecting Multimodal LLMs via Image-to-Text\n Transformation","summary":" Multimodal large language models (MLLMs) have shown impressive reasoning\nabilities, which, however, are also more vulnerable to jailbreak attacks than\ntheir LLM predecessors. Although still capable of detecting unsafe responses,\nwe observe that safety mechanisms of the pre-aligned LLMs in MLLMs can be\neasily bypassed due to the introduction of image features. 
To construct robust\nMLLMs, we propose ECSO(Eyes Closed, Safety On), a novel training-free\nprotecting approach that exploits the inherent safety awareness of MLLMs, and\ngenerates safer responses via adaptively transforming unsafe images into texts\nto activate intrinsic safety mechanism of pre-aligned LLMs in MLLMs.\nExperiments on five state-of-the-art (SoTA) MLLMs demonstrate that our ECSO\nenhances model safety significantly (e.g., a 37.6% improvement on the\nMM-SafetyBench (SD+OCR), and 71.3% on VLSafe for the LLaVA-1.5-7B), while\nconsistently maintaining utility results on common MLLM benchmarks.\nFurthermore, we show that ECSO can be used as a data engine to generate\nsupervised-finetuning (SFT) data for MLLM alignment without extra human\nintervention.\n","authors":["Yunhao Gou","Kai Chen","Zhili Liu","Lanqing Hong","Hang Xu","Zhenguo Li","Dit-Yan Yeung","James T. Kwok","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.09572v2.pdf","comment":"Project Page: https://gyhdog99.github.io/projects/ecso/"},{"id":"http://arxiv.org/abs/2403.15044v1","updated":"2024-03-22T09:00:24Z","published":"2024-03-22T09:00:24Z","title":"Multimodal Fusion with Pre-Trained Model Features in Affective Behaviour\n Analysis In-the-wild","summary":" Multimodal fusion is a significant method for most multimodal tasks. With the\nrecent surge in the number of large pre-trained models, combining both\nmultimodal fusion methods and pre-trained model features can achieve\noutstanding performance in many multimodal tasks. In this paper, we present our\napproach, which leverages both advantages for addressing the task of Expression\n(Expr) Recognition and Valence-Arousal (VA) Estimation. We evaluate the\nAff-Wild2 database using pre-trained models, then extract the final hidden\nlayers of the models as features. Following preprocessing and interpolation or\nconvolution to align the extracted features, different models are employed for\nmodal fusion. Our code is available at GitHub - FulgenceWen/ABAW6th.\n","authors":["Zhuofan Wen","Fengyu Zhang","Siyuan Zhang","Haiyang Sun","Mingyu Xu","Licai Sun","Zheng Lian","Bin Liu","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2403.15044v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08856v3","updated":"2024-03-22T08:51:55Z","published":"2023-08-17T08:29:54Z","title":"MV-ROPE: Multi-view Constraints for Robust Category-level Object Pose\n and Size Estimation","summary":" Recently there has been a growing interest in category-level object pose and\nsize estimation, and prevailing methods commonly rely on single view RGB-D\nimages. However, one disadvantage of such methods is that they require accurate\ndepth maps which cannot be produced by consumer-grade sensors. Furthermore,\nmany practical real-world situations involve a moving camera that continuously\nobserves its surroundings, and the temporal information of the input video\nstreams is simply overlooked by single-view methods. We propose a novel\nsolution that makes use of RGB video streams. Our framework consists of three\nmodules: a scale-aware monocular dense SLAM solution, a lightweight object pose\npredictor, and an object-level pose graph optimizer. The SLAM module utilizes a\nvideo stream and additional scale-sensitive readings to estimate camera poses\nand metric depth. The object pose predictor then generates canonical object\nrepresentations from RGB images. The object pose is estimated through geometric\nregistration of these canonical object representations with estimated object\ndepth points. 
All per-view estimates finally undergo optimization within a pose\ngraph, culminating in the output of robust and accurate canonical object poses.\nOur experimental results demonstrate that when utilizing public dataset\nsequences with high-quality depth information, the proposed method exhibits\ncomparable performance to state-of-the-art RGB-D methods. We also collect and\nevaluate on new datasets containing depth maps of varying quality to further\nquantitatively benchmark the proposed method alongside previous RGB-D based\nmethods. We demonstrate a significant advantage in scenarios where depth input\nis absent or the quality of depth sensing is limited.\n","authors":["Jiaqi Yang","Yucong Chen","Xiangting Meng","Chenxin Yan","Min Li","Ran Cheng","Lige Liu","Tao Sun","Laurent Kneip"],"pdf_url":"https://arxiv.org/pdf/2308.08856v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14189v3","updated":"2024-03-22T08:45:52Z","published":"2023-11-23T20:14:50Z","title":"D-SCo: Dual-Stream Conditional Diffusion for Monocular Hand-Held Object\n Reconstruction","summary":" Reconstructing hand-held objects from a single RGB image is a challenging\ntask in computer vision. In contrast to prior works that utilize deterministic\nmodeling paradigms, we employ a point cloud denoising diffusion model to\naccount for the probabilistic nature of this problem. In the core, we introduce\ncentroid-fixed dual-stream conditional diffusion for monocular hand-held object\nreconstruction (D-SCo), tackling two predominant challenges. First, to avoid\nthe object centroid from deviating, we utilize a novel hand-constrained\ncentroid fixing paradigm, enhancing the stability of diffusion and reverse\nprocesses and the precision of feature projection. Second, we introduce a\ndual-stream denoiser to semantically and geometrically model hand-object\ninteractions with a novel unified hand-object semantic embedding, enhancing the\nreconstruction performance of the hand-occluded region of the object.\nExperiments on the synthetic ObMan dataset and three real-world datasets HO3D,\nMOW and DexYCB demonstrate that our approach can surpass all other\nstate-of-the-art methods. Codes will be released.\n","authors":["Bowen Fu","Gu Wang","Chenyangguang Zhang","Yan Di","Ziqin Huang","Zhiying Leng","Fabian Manhardt","Xiangyang Ji","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2311.14189v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15033v1","updated":"2024-03-22T08:32:30Z","published":"2024-03-22T08:32:30Z","title":"Toward Tiny and High-quality Facial Makeup with Data Amplify Learning","summary":" Contemporary makeup approaches primarily hinge on unpaired learning\nparadigms, yet they grapple with the challenges of inaccurate supervision\n(e.g., face misalignment) and sophisticated facial prompts (including face\nparsing, and landmark detection). These challenges prohibit low-cost deployment\nof facial makeup models, especially on mobile devices. To solve above problems,\nwe propose a brand-new learning paradigm, termed \"Data Amplify Learning (DAL),\"\nalongside a compact makeup model named \"TinyBeauty.\" The core idea of DAL lies\nin employing a Diffusion-based Data Amplifier (DDA) to \"amplify\" limited images\nfor the model training, thereby enabling accurate pixel-to-pixel supervision\nwith merely a handful of annotations. 
Two pivotal innovations in DDA facilitate\nthe above training approach: (1) A Residual Diffusion Model (RDM) is designed\nto generate high-fidelity detail and circumvent the detail vanishing problem in\nthe vanilla diffusion models; (2) A Fine-Grained Makeup Module (FGMM) is\nproposed to achieve precise makeup control and combination while retaining face\nidentity. Coupled with DAL, TinyBeauty necessitates merely 80K parameters to\nachieve a state-of-the-art performance without intricate face prompts.\nMeanwhile, TinyBeauty achieves a remarkable inference speed of up to 460 fps on\nthe iPhone 13. Extensive experiments show that DAL can produce highly\ncompetitive makeup models using only 5 image pairs.\n","authors":["Qiaoqiao Jin","Xuanhong Chen","Meiguang Jin","Ying Cheng","Rui Shi","Yucheng Zheng","Yupeng Zhu","Bingbing Ni"],"pdf_url":"https://arxiv.org/pdf/2403.15033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15032v1","updated":"2024-03-22T08:27:25Z","published":"2024-03-22T08:27:25Z","title":"An Integrated Neighborhood and Scale Information Network for Open-Pit\n Mine Change Detection in High-Resolution Remote Sensing Images","summary":" Open-pit mine change detection (CD) in high-resolution (HR) remote sensing\nimages plays a crucial role in mineral development and environmental\nprotection. Significant progress has been made in this field in recent years,\nlargely due to the advancement of deep learning techniques. However, existing\ndeep-learning-based CD methods encounter challenges in effectively integrating\nneighborhood and scale information, resulting in suboptimal performance.\nTherefore, by exploring the influence patterns of neighborhood and scale\ninformation, this paper proposes an Integrated Neighborhood and Scale\nInformation Network (INSINet) for open-pit mine CD in HR remote sensing images.\nSpecifically, INSINet introduces 8-neighborhood-image information to acquire a\nlarger receptive field, improving the recognition of center image boundary\nregions. Drawing on techniques of skip connection, deep supervision, and\nattention mechanism, the multi-path deep supervised attention (MDSA) module is\ndesigned to enhance multi-scale information fusion and change feature\nextraction. Experimental analysis reveals that incorporating neighborhood and\nscale information enhances the F1 score of INSINet by 6.40%, with improvements\nof 3.08% and 3.32% respectively. INSINet outperforms existing methods with an\nOverall Accuracy of 97.69%, Intersection over Union of 71.26%, and F1 score of\n83.22%. INSINet shows significance for open-pit mine CD in HR remote sensing\nimages.\n","authors":["Zilin Xie","Kangning Li","Jinbao Jiang","Jinzhong Yang","Xiaojun Qiao","Deshuai Yuan","Cheng Nie"],"pdf_url":"https://arxiv.org/pdf/2403.15032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15031v1","updated":"2024-03-22T08:26:31Z","published":"2024-03-22T08:26:31Z","title":"Image Classification with Rotation-Invariant Variational Quantum\n Circuits","summary":" Variational quantum algorithms are gaining attention as an early application\nof Noisy Intermediate-Scale Quantum (NISQ) devices. One of the main problems of\nvariational methods lies in the phenomenon of Barren Plateaus, present in the\noptimization of variational parameters. Adding geometric inductive bias to the\nquantum models has been proposed as a potential solution to mitigate this\nproblem, leading to a new field called Geometric Quantum Machine Learning. 
In\nthis work, an equivariant architecture for variational quantum classifiers is\nintroduced to create a label-invariant model for image classification with\n$C_4$ rotational label symmetry. The equivariant circuit is benchmarked against\ntwo different architectures, and it is experimentally observed that the\ngeometric approach boosts the model's performance. Finally, a classical\nequivariant convolution operation is proposed to extend the quantum model for\nthe processing of larger images, employing the resources available in NISQ\ndevices.\n","authors":["Paul San Sebastian","Mikel Cañizo","Román Orús"],"pdf_url":"https://arxiv.org/pdf/2403.15031v1.pdf","comment":"9 pages, 9 figures"},{"id":"http://arxiv.org/abs/2403.15026v1","updated":"2024-03-22T08:16:59Z","published":"2024-03-22T08:16:59Z","title":"VRSO: Visual-Centric Reconstruction for Static Object Annotation","summary":" As a part of the perception results of intelligent driving systems, static\nobject detection (SOD) in 3D space provides crucial cues for driving\nenvironment understanding. With the rapid deployment of deep neural networks\nfor SOD tasks, the demand for high-quality training samples soars. The\ntraditional, also reliable, way is manual labeling over the dense LiDAR point\nclouds and reference images. Though most public driving datasets adopt this\nstrategy to provide SOD ground truth (GT), it is still expensive (requires\nLiDAR scanners) and low-efficient (time-consuming and unscalable) in practice.\nThis paper introduces VRSO, a visual-centric approach for static object\nannotation. VRSO is distinguished in low cost, high efficiency, and high\nquality: (1) It recovers static objects in 3D space with only camera images as\ninput, and (2) manual labeling is barely involved since GT for SOD tasks is\ngenerated based on an automatic reconstruction and annotation pipeline. (3)\nExperiments on the Waymo Open Dataset show that the mean reprojection error\nfrom VRSO annotation is only 2.6 pixels, around four times lower than the Waymo\nlabeling (10.6 pixels). Source code is available at:\nhttps://github.com/CaiYingFeng/VRSO.\n","authors":["Chenyao Yu","Yingfeng Cai","Jiaxin Zhang","Hui Kong","Wei Sui","Cong Yang"],"pdf_url":"https://arxiv.org/pdf/2403.15026v1.pdf","comment":"submitted to iros 2024"},{"id":"http://arxiv.org/abs/2305.03907v3","updated":"2024-03-22T08:10:07Z","published":"2023-05-06T02:53:13Z","title":"Listen to Look into the Future: Audio-Visual Egocentric Gaze\n Anticipation","summary":" Egocentric gaze anticipation serves as a key building block for the emerging\ncapability of Augmented Reality. Notably, gaze behavior is driven by both\nvisual cues and audio signals during daily activities. Motivated by this\nobservation, we introduce the first model that leverages both the video and\naudio modalities for egocentric gaze anticipation. Specifically, we propose a\nContrastive Spatial-Temporal Separable (CSTS) fusion approach that adopts two\nmodules to separately capture audio-visual correlations in spatial and temporal\ndimensions, and applies a contrastive loss on the re-weighted audio-visual\nfeatures from fusion modules for representation learning. We conduct extensive\nablation studies and thorough analysis using two egocentric video datasets:\nEgo4D and Aria, to validate our model design. We demonstrate the audio improves\nthe performance by +2.5% and +2.4% on the two datasets. 
Our model also\noutperforms the prior state-of-the-art methods by at least +1.9% and +1.6%.\nMoreover, we provide visualizations to show the gaze anticipation results and\nprovide additional insights into audio-visual representation learning. The code\nand data split are available on our website\n(https://bolinlai.github.io/CSTS-EgoGazeAnticipation/).\n","authors":["Bolin Lai","Fiona Ryan","Wenqi Jia","Miao Liu","James M. Rehg"],"pdf_url":"https://arxiv.org/pdf/2305.03907v3.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2403.15019v1","updated":"2024-03-22T08:05:30Z","published":"2024-03-22T08:05:30Z","title":"BSNet: Box-Supervised Simulation-assisted Mean Teacher for 3D Instance\n Segmentation","summary":" 3D instance segmentation (3DIS) is a crucial task, but point-level\nannotations are tedious in fully supervised settings. Thus, using bounding\nboxes (bboxes) as annotations has shown great potential. The current mainstream\napproach is a two-step process, involving the generation of pseudo-labels from\nbox annotations and the training of a 3DIS network with the pseudo-labels.\nHowever, due to the presence of intersections among bboxes, not every point has\na determined instance label, especially in overlapping areas. To generate\nhigher quality pseudo-labels and achieve more precise weakly supervised 3DIS\nresults, we propose the Box-Supervised Simulation-assisted Mean Teacher for 3D\nInstance Segmentation (BSNet), which devises a novel pseudo-labeler called\nSimulation-assisted Transformer. The labeler consists of two main components.\nThe first is Simulation-assisted Mean Teacher, which introduces Mean Teacher\nfor the first time in this task and constructs simulated samples to assist the\nlabeler in acquiring prior knowledge about overlapping areas. To better model\nlocal-global structure, we also propose Local-Global Aware Attention as the\ndecoder for teacher and student labelers. Extensive experiments conducted on\nthe ScanNetV2 and S3DIS datasets verify the superiority of our designs. Code is\navailable at\n\\href{https://github.com/peoplelu/BSNet}{https://github.com/peoplelu/BSNet}.\n","authors":["Jiahao Lu","Jiacheng Deng","Tianzhu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.15019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15017v1","updated":"2024-03-22T08:03:10Z","published":"2024-03-22T08:03:10Z","title":"Vehicle Detection Performance in Nordic Region","summary":" This paper addresses the critical challenge of vehicle detection in the harsh\nwinter conditions in the Nordic regions, characterized by heavy snowfall,\nreduced visibility, and low lighting. Due to their susceptibility to\nenvironmental distortions and occlusions, traditional vehicle detection methods\nhave struggled in these adverse conditions. The advanced proposed deep learning\narchitectures brought promise, yet the unique difficulties of detecting\nvehicles in Nordic winters remain inadequately addressed. This study uses the\nNordic Vehicle Dataset (NVD), which has UAV images from northern Sweden, to\nevaluate the performance of state-of-the-art vehicle detection algorithms under\nchallenging weather conditions. Our methodology includes a comprehensive\nevaluation of single-stage, two-stage, and transformer-based detectors against\nthe NVD. We propose a series of enhancements tailored to each detection\nframework, including data augmentation, hyperparameter tuning, transfer\nlearning, and novel strategies designed explicitly for the DETR model. 
Our\nfindings not only highlight the limitations of current detection systems in the\nNordic environment but also offer promising directions for enhancing these\nalgorithms for improved robustness and accuracy in vehicle detection amidst the\ncomplexities of winter landscapes. The code and the dataset are available at\nhttps://nvd.ltu-ai.dev\n","authors":["Hamam Mokayed","Rajkumar Saini","Oluwatosin Adewumi","Lama Alkhaled","Bjorn Backe","Palaiahnakote Shivakumara","Olle Hagner","Yan Chai Hum"],"pdf_url":"https://arxiv.org/pdf/2403.15017v1.pdf","comment":"submitted to ICPR2024"},{"id":"http://arxiv.org/abs/2403.15013v1","updated":"2024-03-22T07:57:27Z","published":"2024-03-22T07:57:27Z","title":"Extracting Human Attention through Crowdsourced Patch Labeling","summary":" In image classification, a significant problem arises from bias in the\ndatasets. When it contains only specific types of images, the classifier begins\nto rely on shortcuts - simplistic and erroneous rules for decision-making. This\nleads to high performance on the training dataset but inferior results on new,\nvaried images, as the classifier's generalization capability is reduced. For\nexample, if the images labeled as mustache consist solely of male figures, the\nmodel may inadvertently learn to classify images by gender rather than the\npresence of a mustache. One approach to mitigate such biases is to direct the\nmodel's attention toward the target object's location, usually marked using\nbounding boxes or polygons for annotation. However, collecting such annotations\nrequires substantial time and human effort. Therefore, we propose a novel\npatch-labeling method that integrates AI assistance with crowdsourcing to\ncapture human attention from images, which can be a viable solution for\nmitigating bias. Our method consists of two steps. First, we extract the\napproximate location of a target using a pre-trained saliency detection model\nsupplemented by human verification for accuracy. Then, we determine the\nhuman-attentive area in the image by iteratively dividing the image into\nsmaller patches and employing crowdsourcing to ascertain whether each patch can\nbe classified as the target object. We demonstrated the effectiveness of our\nmethod in mitigating bias through improved classification accuracy and the\nrefined focus of the model. Also, crowdsourced experiments validate that our\nmethod collects human annotation up to 3.4 times faster than annotating object\nlocations with polygons, significantly reducing the need for human resources.\nWe conclude the paper by discussing the advantages of our method in a\ncrowdsourcing context, mainly focusing on aspects of human errors and\naccessibility.\n","authors":["Minsuk Chang","Seokhyeon Park","Hyeon Jeon","Aeri Cho","Soohyun Lee","Jinwook Seo"],"pdf_url":"https://arxiv.org/pdf/2403.15013v1.pdf","comment":"21 pages, 11 figures"},{"id":"http://arxiv.org/abs/2401.00029v3","updated":"2024-03-22T07:52:28Z","published":"2023-12-29T05:28:35Z","title":"6D-Diff: A Keypoint Diffusion Framework for 6D Object Pose Estimation","summary":" Estimating the 6D object pose from a single RGB image often involves noise\nand indeterminacy due to challenges such as occlusions and cluttered\nbackgrounds. Meanwhile, diffusion models have shown appealing performance in\ngenerating high-quality images from random noise with high indeterminacy\nthrough step-by-step denoising. 
Inspired by their denoising capability, we\npropose a novel diffusion-based framework (6D-Diff) to handle the noise and\nindeterminacy in object pose estimation for better performance. In our\nframework, to establish accurate 2D-3D correspondence, we formulate 2D\nkeypoints detection as a reverse diffusion (denoising) process. To facilitate\nsuch a denoising process, we design a Mixture-of-Cauchy-based forward diffusion\nprocess and condition the reverse process on the object features. Extensive\nexperiments on the LM-O and YCB-V datasets demonstrate the effectiveness of our\nframework.\n","authors":["Li Xu","Haoxuan Qu","Yujun Cai","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2401.00029v3.pdf","comment":"CVPR 2024 CAMERA-READY"},{"id":"http://arxiv.org/abs/2403.15011v1","updated":"2024-03-22T07:49:55Z","published":"2024-03-22T07:49:55Z","title":"Cell Tracking according to Biological Needs -- Strong Mitosis-aware\n Random-finite Sets Tracker with Aleatoric Uncertainty","summary":" Cell tracking and segmentation assist biologists in extracting insights from\nlarge-scale microscopy time-lapse data. Driven by local accuracy metrics,\ncurrent tracking approaches often suffer from a lack of long-term consistency.\nTo address this issue, we introduce an uncertainty estimation technique for\nneural tracking-by-regression frameworks and incorporate it into our novel\nextended Poisson multi-Bernoulli mixture tracker. Our uncertainty estimation\nidentifies uncertain associations within high-performing tracking-by-regression\nmethods using problem-specific test-time augmentations. Leveraging this\nuncertainty, along with a novel mitosis-aware assignment problem formulation,\nour tracker resolves false associations and mitosis detections stemming from\nlong-term conflicts. We evaluate our approach on nine competitive datasets and\ndemonstrate that it outperforms the current state-of-the-art on biologically\nrelevant metrics substantially, achieving improvements by a factor of\napproximately $5.75$. Furthermore, we uncover new insights into the behavior of\ntracking-by-regression uncertainty.\n","authors":["Timo Kaiser","Maximilian Schier","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2403.15011v1.pdf","comment":"23 pages, 10 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.15010v1","updated":"2024-03-22T07:47:13Z","published":"2024-03-22T07:47:13Z","title":"Clean-image Backdoor Attacks","summary":" To gather a significant quantity of annotated training data for\nhigh-performance image classification models, numerous companies opt to enlist\nthird-party providers to label their unlabeled data. This practice is widely\nregarded as secure, even in cases where some annotated errors occur, as the\nimpact of these minor inaccuracies on the final performance of the models is\nnegligible and existing backdoor attacks require attacker's ability to poison\nthe training images. Nevertheless, in this paper, we propose clean-image\nbackdoor attacks which uncover that backdoors can still be injected via a\nfraction of incorrect labels without modifying the training images.\nSpecifically, in our attacks, the attacker first seeks a trigger feature to\ndivide the training images into two parts: those with the feature and those\nwithout it. Subsequently, the attacker falsifies the labels of the former part\nto a backdoor class. The backdoor will be finally implanted into the target\nmodel after it is trained on the poisoned data. 
During the inference phase, the\nattacker can activate the backdoor in two ways: slightly modifying the input\nimage to obtain the trigger feature, or taking an image that naturally has the\ntrigger feature as input. We conduct extensive experiments to demonstrate the\neffectiveness and practicality of our attacks. According to the experimental\nresults, we conclude that our attacks seriously jeopardize the fairness and\nrobustness of image classification models, and it is necessary to be vigilant\nabout the incorrect labels in outsourced labeling.\n","authors":["Dazhong Rong","Shuheng Shen","Xinyi Fu","Peng Qian","Jianhai Chen","Qinming He","Xing Fu","Weiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.15010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15009v1","updated":"2024-03-22T07:45:51Z","published":"2024-03-22T07:45:51Z","title":"TexRO: Generating Delicate Textures of 3D Models by Recursive\n Optimization","summary":" This paper presents TexRO, a novel method for generating delicate textures of\na known 3D mesh by optimizing its UV texture. The key contributions are\ntwo-fold. We propose an optimal viewpoint selection strategy, that finds the\nmost miniature set of viewpoints covering all the faces of a mesh. Our\nviewpoint selection strategy guarantees the completeness of a generated result.\nWe propose a recursive optimization pipeline that optimizes a UV texture at\nincreasing resolutions, with an adaptive denoising method that re-uses existing\ntextures for new texture generation. Through extensive experimentation, we\ndemonstrate the superior performance of TexRO in terms of texture quality,\ndetail preservation, visual consistency, and, notably runtime speed,\noutperforming other current methods. The broad applicability of TexRO is\nfurther confirmed through its successful use on diverse 3D models.\n","authors":["Jinbo Wu","Xing Liu","Chenming Wu","Xiaobo Gao","Jialun Liu","Xinqi Liu","Chen Zhao","Haocheng Feng","Errui Ding","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2403.15009v1.pdf","comment":"Technical report. Project page:\n \\href{https://3d-aigc.github.io/TexRO}{https://3d-aigc.github.io/TexRO}"},{"id":"http://arxiv.org/abs/2403.15008v1","updated":"2024-03-22T07:45:50Z","published":"2024-03-22T07:45:50Z","title":"Tri-Perspective View Decomposition for Geometry-Aware Depth Completion","summary":" Depth completion is a vital task for autonomous driving, as it involves\nreconstructing the precise 3D geometry of a scene from sparse and noisy depth\nmeasurements. However, most existing methods either rely only on 2D depth\nrepresentations or directly incorporate raw 3D point clouds for compensation,\nwhich are still insufficient to capture the fine-grained 3D geometry of the\nscene. To address this challenge, we introduce Tri-Perspective view\nDecomposition (TPVD), a novel framework that can explicitly model 3D geometry.\nIn particular, (1) TPVD ingeniously decomposes the original point cloud into\nthree 2D views, one of which corresponds to the sparse depth input. (2) We\ndesign TPV Fusion to update the 2D TPV features through recurrent 2D-3D-2D\naggregation, where a Distance-Aware Spherical Convolution (DASC) is applied.\n(3) By adaptively choosing TPV affinitive neighbors, the newly proposed\nGeometric Spatial Propagation Network (GSPN) further improves the geometric\nconsistency. As a result, our TPVD outperforms existing methods on KITTI,\nNYUv2, and SUN RGBD. 
Furthermore, we build a novel depth completion dataset\nnamed TOFDC, which is acquired by the time-of-flight (TOF) sensor and the color\ncamera on smartphones. Project page:\nhttps://yanzq95.github.io/projectpage/TOFDC/index.html\n","authors":["Zhiqiang Yan","Yuankai Lin","Kun Wang","Yupeng Zheng","Yufei Wang","Zhenyu Zhang","Jun Li","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2403.15008v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.15004v1","updated":"2024-03-22T07:32:21Z","published":"2024-03-22T07:32:21Z","title":"ParFormer: Vision Transformer Baseline with Parallel Local Global Token\n Mixer and Convolution Attention Patch Embedding","summary":" This work presents ParFormer as an enhanced transformer architecture that\nallows the incorporation of different token mixers into a single stage, hence\nimproving feature extraction capabilities. Integrating both local and global\ndata allows for precise representation of short- and long-range spatial\nrelationships without the need for computationally intensive methods such as\nshifting windows. Along with the parallel token mixer encoder, We offer the\nConvolutional Attention Patch Embedding (CAPE) as an enhancement of standard\npatch embedding to improve token mixer extraction with a convolutional\nattention module. Our comprehensive evaluation demonstrates that our ParFormer\noutperforms CNN-based and state-of-the-art transformer-based architectures in\nimage classification and several complex tasks such as object recognition. The\nproposed CAPE has been demonstrated to benefit the overall MetaFormer\narchitecture, even while utilizing the Identity Mapping Token Mixer, resulting\nin a 0.5\\% increase in accuracy. The ParFormer models outperformed ConvNeXt and\nSwin Transformer for the pure convolution and transformer model in accuracy.\nFurthermore, our model surpasses the current leading hybrid transformer by\nreaching competitive Top-1 scores in the ImageNet-1K classification test.\nSpecifically, our model variants with 11M, 23M, and 34M parameters achieve\nscores of 80.4\\%, 82.1\\%, and 83.1\\%, respectively. Code:\nhttps://github.com/novendrastywn/ParFormer-CAPE-2024\n","authors":["Novendra Setyawan","Ghufron Wahyu Kurniawan","Chi-Chia Sun","Jun-Wei Hsieh","Hui-Kai Su","Wen-Kai Kuo"],"pdf_url":"https://arxiv.org/pdf/2403.15004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00248v2","updated":"2024-03-22T07:25:03Z","published":"2023-12-30T14:24:33Z","title":"Promoting Segment Anything Model towards Highly Accurate Dichotomous\n Image Segmentation","summary":" The Segment Anything Model (SAM) represents a significant breakthrough into\nfoundation models for computer vision, providing a large-scale image\nsegmentation model. However, despite SAM's zero-shot performance, its\nsegmentation masks lack fine-grained details, particularly in accurately\ndelineating object boundaries. We have high expectations regarding whether SAM,\nas a foundation model, can be improved towards highly accurate object\nsegmentation, which is known as dichotomous image segmentation (DIS). To\naddress this issue, we propose DIS-SAM, which advances SAM towards DIS with\nextremely accurate details. DIS-SAM is a framework specifically tailored for\nhighly accurate segmentation, maintaining SAM's promptable design. DIS-SAM\nemploys a two-stage approach, integrating SAM with a modified IS-Net dedicated\nto DIS. 
Despite its simplicity, DIS-SAM demonstrates significantly enhanced\nsegmentation accuracy compared to SAM and HQ-SAM.\n","authors":["Xianjie Liu","Keren Fu","Qijun Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.00248v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17587v3","updated":"2024-03-22T07:23:51Z","published":"2024-02-25T07:59:10Z","title":"Instance-aware Exploration-Verification-Exploitation for Instance\n ImageGoal Navigation","summary":" As a new embodied vision task, Instance ImageGoal Navigation (IIN) aims to\nnavigate to a specified object depicted by a goal image in an unexplored\nenvironment.\n The main challenge of this task lies in identifying the target object from\ndifferent viewpoints while rejecting similar distractors.\n Existing ImageGoal Navigation methods usually adopt the simple\nExploration-Exploitation framework and ignore the identification of specific\ninstance during navigation.\n In this work, we propose to imitate the human behaviour of ``getting closer\nto confirm\" when distinguishing objects from a distance.\n Specifically, we design a new modular navigation framework named\nInstance-aware Exploration-Verification-Exploitation (IEVE) for instance-level\nimage goal navigation.\n Our method allows for active switching among the exploration, verification,\nand exploitation actions, thereby facilitating the agent in making reasonable\ndecisions under different situations.\n On the challenging HabitatMatterport 3D semantic (HM3D-SEM) dataset, our\nmethod surpasses previous state-of-the-art work, with a classical segmentation\nmodel (0.684 vs. 0.561 success) or a robust model (0.702 vs. 0.561 success)\n","authors":["Xiaohan Lei","Min Wang","Wengang Zhou","Li Li","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2402.17587v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.14178v2","updated":"2024-03-22T07:23:22Z","published":"2023-04-27T13:27:01Z","title":"mPLUG-Owl: Modularization Empowers Large Language Models with\n Multimodality","summary":" Large language models (LLMs) have demonstrated impressive zero-shot abilities\non a variety of open-ended tasks, while recent research has also explored the\nuse of LLMs for multi-modal generation. In this study, we introduce mPLUG-Owl,\na novel training paradigm that equips LLMs with multi-modal abilities through\nmodularized learning of foundation LLM, a visual knowledge module, and a visual\nabstractor module. This approach can support multiple modalities and facilitate\ndiverse unimodal and multimodal abilities through modality collaboration. The\ntraining paradigm of mPLUG-Owl involves a two-stage method for aligning image\nand text, which learns visual knowledge with the assistance of LLM while\nmaintaining and even improving the generation abilities of LLM. In the first\nstage, the visual knowledge module and abstractor module are trained with a\nfrozen LLM module to align the image and text. In the second stage,\nlanguage-only and multi-modal supervised datasets are used to jointly fine-tune\na low-rank adaption (LoRA) module on LLM and the abstractor module by freezing\nthe visual knowledge module. We carefully build a visually-related instruction\nevaluation set OwlEval. Experimental results show that our model outperforms\nexisting multi-modal models, demonstrating mPLUG-Owl's impressive instruction\nand visual understanding ability, multi-turn conversation ability, and\nknowledge reasoning ability. 
Besides, we observe some unexpected and exciting\nabilities such as multi-image correlation and scene text understanding, which\nmakes it possible to leverage it for harder real scenarios, such as vision-only\ndocument comprehension. Our code, pre-trained model, instruction-tuned models,\nand evaluation set are available at https://github.com/X-PLUG/mPLUG-Owl. The\nonline demo is available at https://www.modelscope.cn/studios/damo/mPLUG-Owl.\n","authors":["Qinghao Ye","Haiyang Xu","Guohai Xu","Jiabo Ye","Ming Yan","Yiyang Zhou","Junyang Wang","Anwen Hu","Pengcheng Shi","Yaya Shi","Chenliang Li","Yuanhong Xu","Hehong Chen","Junfeng Tian","Qi Qian","Ji Zhang","Fei Huang","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2304.14178v2.pdf","comment":"Working in Process"},{"id":"http://arxiv.org/abs/2403.14999v1","updated":"2024-03-22T07:21:09Z","published":"2024-03-22T07:21:09Z","title":"Magic for the Age of Quantized DNNs","summary":" Recently, the number of parameters in DNNs has explosively increased, as\nexemplified by LLMs (Large Language Models), making inference on small-scale\ncomputers more difficult. Model compression technology is, therefore, essential\nfor integration into products. In this paper, we propose a method of\nquantization-aware training. We introduce a novel normalization (Layer-Batch\nNormalization) that is independent of the mini-batch size and does not require\nany additional computation cost during inference. Then, we quantize the weights\nby the scaled round-clip function with the weight standardization. We also\nquantize activation functions using the same function and apply surrogate\ngradients to train the model with both quantized weights and the quantized\nactivation functions. We call this method Magic for the age of Quantised DNNs\n(MaQD). Experimental results show that our quantization method can be achieved\nwith minimal accuracy degradation.\n","authors":["Yoshihide Sawada","Ryuji Saiin","Kazuma Suetake"],"pdf_url":"https://arxiv.org/pdf/2403.14999v1.pdf","comment":"14 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2403.14995v1","updated":"2024-03-22T07:12:48Z","published":"2024-03-22T07:12:48Z","title":"Improve Cross-domain Mixed Sampling with Guidance Training for Adaptive\n Segmentation","summary":" Unsupervised Domain Adaptation (UDA) endeavors to adjust models trained on a\nsource domain to perform well on a target domain without requiring additional\nannotations. In the context of domain adaptive semantic segmentation, which\ntackles UDA for dense prediction, the goal is to circumvent the need for costly\npixel-level annotations. Typically, various prevailing methods baseline rely on\nconstructing intermediate domains via cross-domain mixed sampling techniques to\nmitigate the performance decline caused by domain gaps. However, such\napproaches generate synthetic data that diverge from real-world distributions,\npotentially leading the model astray from the true target distribution. To\naddress this challenge, we propose a novel auxiliary task called Guidance\nTraining. 
This task facilitates the effective utilization of cross-domain mixed\nsampling techniques while mitigating distribution shifts from the real world.\nSpecifically, Guidance Training guides the model to extract and reconstruct the\ntarget-domain feature distribution from mixed data, followed by decoding the\nreconstructed target-domain features to make pseudo-label predictions.\nImportantly, integrating Guidance Training incurs minimal training overhead and\nimposes no additional inference burden. We demonstrate the efficacy of our\napproach by integrating it with existing methods, consistently improving\nperformance. The implementation will be available at\nhttps://github.com/Wenlve-Zhou/Guidance-Training.\n","authors":["Wenlve Zhou","Zhiheng Zhou","Tianlei Wang","Delu Zeng"],"pdf_url":"https://arxiv.org/pdf/2403.14995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15243v3","updated":"2024-03-22T07:05:58Z","published":"2023-11-26T09:06:40Z","title":"ID-like Prompt Learning for Few-Shot Out-of-Distribution Detection","summary":" Out-of-distribution (OOD) detection methods often exploit auxiliary outliers\nto train model identifying OOD samples, especially discovering challenging\noutliers from auxiliary outliers dataset to improve OOD detection. However,\nthey may still face limitations in effectively distinguishing between the most\nchallenging OOD samples that are much like in-distribution (ID) data, i.e.,\n\\idlike samples. To this end, we propose a novel OOD detection framework that\ndiscovers \\idlike outliers using CLIP \\cite{DBLP:conf/icml/RadfordKHRGASAM21}\nfrom the vicinity space of the ID samples, thus helping to identify these most\nchallenging OOD samples. Then a prompt learning framework is proposed that\nutilizes the identified \\idlike outliers to further leverage the capabilities\nof CLIP for OOD detection. Benefiting from the powerful CLIP, we only need a\nsmall number of ID samples to learn the prompts of the model without exposing\nother auxiliary outlier datasets. By focusing on the most challenging \\idlike\nOOD samples and elegantly exploiting the capabilities of CLIP, our method\nachieves superior few-shot learning performance on various real-world image\ndatasets (e.g., in 4-shot OOD detection on the ImageNet-1k dataset, our method\nreduces the average FPR95 by 12.16\\% and improves the average AUROC by 2.76\\%,\ncompared to state-of-the-art methods). Code is available at\nhttps://github.com/ycfate/ID-like.\n","authors":["Yichen Bai","Zongbo Han","Changqing Zhang","Bing Cao","Xiaoheng Jiang","Qinghua Hu"],"pdf_url":"https://arxiv.org/pdf/2311.15243v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19122v2","updated":"2024-03-22T07:03:54Z","published":"2024-02-29T13:00:22Z","title":"BigGait: Learning Gait Representation You Want by Large Vision Models","summary":" Gait recognition stands as one of the most pivotal remote identification\ntechnologies and progressively expands across research and industry\ncommunities. However, existing gait recognition methods heavily rely on\ntask-specific upstream driven by supervised learning to provide explicit gait\nrepresentations like silhouette sequences, which inevitably introduce expensive\nannotation costs and potential error accumulation. Escaping from this trend,\nthis work explores effective gait representations based on the all-purpose\nknowledge produced by task-agnostic Large Vision Models (LVMs) and proposes a\nsimple yet efficient gait framework, termed BigGait. 
Specifically, the Gait\nRepresentation Extractor (GRE) within BigGait draws upon design principles from\nestablished gait representations, effectively transforming all-purpose\nknowledge into implicit gait representations without requiring third-party\nsupervision signals. Experiments on CCPG, CAISA-B* and SUSTech1K indicate that\nBigGait significantly outperforms the previous methods in both within-domain\nand cross-domain tasks in most cases, and provides a more practical paradigm\nfor learning the next-generation gait representation. Finally, we delve into\nprospective challenges and promising directions in LVMs-based gait recognition,\naiming to inspire future work in this emerging topic. The source code is\navailable at https://github.com/ShiqiYu/OpenGait.\n","authors":["Dingqiang Ye","Chao Fan","Jingzhe Ma","Xiaoming Liu","Shiqi Yu"],"pdf_url":"https://arxiv.org/pdf/2402.19122v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10413v2","updated":"2024-03-22T07:03:50Z","published":"2023-10-16T13:56:56Z","title":"Image super-resolution via dynamic network","summary":" Convolutional neural networks (CNNs) depend on deep network architectures to\nextract accurate information for image super-resolution. However, obtained\ninformation of these CNNs cannot completely express predicted high-quality\nimages for complex scenes. In this paper, we present a dynamic network for\nimage super-resolution (DSRNet), which contains a residual enhancement block,\nwide enhancement block, feature refinement block and construction block. The\nresidual enhancement block is composed of a residual enhanced architecture to\nfacilitate hierarchical features for image super-resolution. To enhance\nrobustness of obtained super-resolution model for complex scenes, a wide\nenhancement block achieves a dynamic architecture to learn more robust\ninformation to enhance applicability of an obtained super-resolution model for\nvarying scenes. To prevent interference of components in a wide enhancement\nblock, a refinement block utilizes a stacked architecture to accurately learn\nobtained features. Also, a residual learning operation is embedded in the\nrefinement block to prevent long-term dependency problem. Finally, a\nconstruction block is responsible for reconstructing high-quality images.\nDesigned heterogeneous architecture can not only facilitate richer structural\ninformation, but also be lightweight, which is suitable for mobile digital\ndevices. Experimental results shows that our method is more competitive in\nterms of performance and recovering time of image super-resolution and\ncomplexity. The code of DSRNet can be obtained at\nhttps://github.com/hellloxiaotian/DSRNet.\n","authors":["Chunwei Tian","Xuanyu Zhang","Qi Zhang","Mingming Yang","Zhaojie Ju"],"pdf_url":"https://arxiv.org/pdf/2310.10413v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14987v1","updated":"2024-03-22T06:45:45Z","published":"2024-03-22T06:45:45Z","title":"Generative Active Learning for Image Synthesis Personalization","summary":" This paper presents a pilot study that explores the application of active\nlearning, traditionally studied in the context of discriminative models, to\ngenerative models. We specifically focus on image synthesis personalization\ntasks. The primary challenge in conducting active learning on generative models\nlies in the open-ended nature of querying, which differs from the closed form\nof querying in discriminative models that typically target a single concept. 
We\nintroduce the concept of anchor directions to transform the querying process\ninto a semi-open problem. We propose a direction-based uncertainty sampling\nstrategy to enable generative active learning and tackle the\nexploitation-exploration dilemma. Extensive experiments are conducted to\nvalidate the effectiveness of our approach, demonstrating that an open-source\nmodel can achieve superior performance compared to closed-source models\ndeveloped by large companies, such as Google's StyleDrop. The source code is\navailable at https://github.com/zhangxulu1996/GAL4Personalization.\n","authors":["Xulu Zhang","Wengyu Zhang","Xiao-Yong Wei","Jinlin Wu","Zhaoxiang Zhang","Zhen Lei","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2403.14987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03408v4","updated":"2024-03-22T06:45:41Z","published":"2023-12-06T10:46:53Z","title":"Open-sourced Data Ecosystem in Autonomous Driving: the Present and\n Future","summary":" With the continuous maturation and application of autonomous driving\ntechnology, a systematic examination of open-source autonomous driving datasets\nbecomes instrumental in fostering the robust evolution of the industry\necosystem. Current autonomous driving datasets can broadly be categorized into\ntwo generations. The first-generation autonomous driving datasets are\ncharacterized by relatively simpler sensor modalities, smaller data scale, and\nis limited to perception-level tasks. KITTI, introduced in 2012, serves as a\nprominent representative of this initial wave. In contrast, the\nsecond-generation datasets exhibit heightened complexity in sensor modalities,\ngreater data scale and diversity, and an expansion of tasks from perception to\nencompass prediction and control. Leading examples of the second generation\ninclude nuScenes and Waymo, introduced around 2019. This comprehensive review,\nconducted in collaboration with esteemed colleagues from both academia and\nindustry, systematically assesses over seventy open-source autonomous driving\ndatasets from domestic and international sources. It offers insights into\nvarious aspects, such as the principles underlying the creation of high-quality\ndatasets, the pivotal role of data engine systems, and the utilization of\ngenerative foundation models to facilitate scalable data generation.\nFurthermore, this review undertakes an exhaustive analysis and discourse\nregarding the characteristics and data scales that future third-generation\nautonomous driving datasets should possess. It also delves into the scientific\nand technical challenges that warrant resolution. These endeavors are pivotal\nin advancing autonomous innovation and fostering technological enhancement in\ncritical domains. For further details, please refer to\nhttps://github.com/OpenDriveLab/DriveAGI.\n","authors":["Hongyang Li","Yang Li","Huijie Wang","Jia Zeng","Huilin Xu","Pinlong Cai","Li Chen","Junchi Yan","Feng Xu","Lu Xiong","Jingdong Wang","Futang Zhu","Chunjing Xu","Tiancai Wang","Fei Xia","Beipeng Mu","Zhihui Peng","Dahua Lin","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2312.03408v4.pdf","comment":"This article is a simplified English translation of corresponding\n Chinese article. 
Please refer to Chinese version for the complete content"},{"id":"http://arxiv.org/abs/2403.11735v2","updated":"2024-03-22T06:39:20Z","published":"2024-03-18T12:43:38Z","title":"LSKNet: A Foundation Lightweight Backbone for Remote Sensing","summary":" Remote sensing images pose distinct challenges for downstream tasks due to\ntheir inherent complexity. While a considerable amount of research has been\ndedicated to remote sensing classification, object detection and semantic\nsegmentation, most of these studies have overlooked the valuable prior\nknowledge embedded within remote sensing scenarios. Such prior knowledge can be\nuseful because remote sensing objects may be mistakenly recognized without\nreferencing a sufficiently long-range context, which can vary for different\nobjects. This paper considers these priors and proposes a lightweight Large\nSelective Kernel Network (LSKNet) backbone. LSKNet can dynamically adjust its\nlarge spatial receptive field to better model the ranging context of various\nobjects in remote sensing scenarios. To our knowledge, large and selective\nkernel mechanisms have not been previously explored in remote sensing images.\nWithout bells and whistles, our lightweight LSKNet sets new state-of-the-art\nscores on standard remote sensing classification, object detection and semantic\nsegmentation benchmarks. Our comprehensive analysis further validated the\nsignificance of the identified priors and the effectiveness of LSKNet. The code\nis available at https://github.com/zcablii/LSKNet.\n","authors":["Yuxuan Li","Xiang Li","Yimain Dai","Qibin Hou","Li Liu","Yongxiang Liu","Ming-Ming Cheng","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2403.11735v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.09030"},{"id":"http://arxiv.org/abs/2403.14977v1","updated":"2024-03-22T06:22:20Z","published":"2024-03-22T06:22:20Z","title":"Piecewise-Linear Manifolds for Deep Metric Learning","summary":" Unsupervised deep metric learning (UDML) focuses on learning a semantic\nrepresentation space using only unlabeled data. This challenging problem\nrequires accurately estimating the similarity between data points, which is\nused to supervise a deep network. For this purpose, we propose to model the\nhigh-dimensional data manifold using a piecewise-linear approximation, with\neach low-dimensional linear piece approximating the data manifold in a small\nneighborhood of a point. These neighborhoods are used to estimate similarity\nbetween data points. We empirically show that this similarity estimate\ncorrelates better with the ground truth than the similarity estimates of\ncurrent state-of-the-art techniques. We also show that proxies, commonly used\nin supervised metric learning, can be used to model the piecewise-linear\nmanifold in an unsupervised setting, helping improve performance. 
Our method\noutperforms existing unsupervised metric learning approaches on standard\nzero-shot image retrieval benchmarks.\n","authors":["Shubhang Bhatnagar","Narendra Ahuja"],"pdf_url":"https://arxiv.org/pdf/2403.14977v1.pdf","comment":"Accepted at CPAL 2024 (Oral)"},{"id":"http://arxiv.org/abs/2403.14974v1","updated":"2024-03-22T06:04:37Z","published":"2024-03-22T06:04:37Z","title":"AVT2-DWF: Improving Deepfake Detection with Audio-Visual Fusion and\n Dynamic Weighting Strategies","summary":" With the continuous improvements of deepfake methods, forgery messages have\ntransitioned from single-modality to multi-modal fusion, posing new challenges\nfor existing forgery detection algorithms. In this paper, we propose AVT2-DWF,\nthe Audio-Visual dual Transformers grounded in Dynamic Weight Fusion, which\naims to amplify both intra- and cross-modal forgery cues, thereby enhancing\ndetection capabilities. AVT2-DWF adopts a dual-stage approach to capture both\nspatial characteristics and temporal dynamics of facial expressions. This is\nachieved through a face transformer with an n-frame-wise tokenization strategy\nencoder and an audio transformer encoder. Subsequently, it uses multi-modal\nconversion with dynamic weight fusion to address the challenge of heterogeneous\ninformation fusion between audio and visual modalities. Experiments on\nDeepfakeTIMIT, FakeAVCeleb, and DFDC datasets indicate that AVT2-DWF achieves\nstate-of-the-art performance intra- and cross-dataset Deepfake detection. Code\nis available at https://github.com/raining-dev/AVT2-DWF.\n","authors":["Rui Wang","Dengpan Ye","Long Tang","Yunming Zhang","Jiacheng Deng"],"pdf_url":"https://arxiv.org/pdf/2403.14974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14973v1","updated":"2024-03-22T06:04:11Z","published":"2024-03-22T06:04:11Z","title":"Trajectory Regularization Enhances Self-Supervised Geometric\n Representation","summary":" Self-supervised learning (SSL) has proven effective in learning high-quality\nrepresentations for various downstream tasks, with a primary focus on semantic\ntasks. However, its application in geometric tasks remains underexplored,\npartially due to the absence of a standardized evaluation method for geometric\nrepresentations. To address this gap, we introduce a new pose-estimation\nbenchmark for assessing SSL geometric representations, which demands training\nwithout semantic or pose labels and achieving proficiency in both semantic and\ngeometric downstream tasks. On this benchmark, we study enhancing SSL geometric\nrepresentations without sacrificing semantic classification accuracy. We find\nthat leveraging mid-layer representations improves pose-estimation performance\nby 10-20%. Further, we introduce an unsupervised trajectory-regularization\nloss, which improves performance by an additional 4% and improves\ngeneralization ability on out-of-distribution data. We hope the proposed\nbenchmark and methods offer new insights and improvements in self-supervised\ngeometric representation learning.\n","authors":["Jiayun Wang","Stella X. 
Yu","Yubei Chen"],"pdf_url":"https://arxiv.org/pdf/2403.14973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04669v3","updated":"2024-03-22T05:41:55Z","published":"2023-09-09T03:01:38Z","title":"Unified Language-Vision Pretraining in LLM with Dynamic Discrete Visual\n Tokenization","summary":" Recently, the remarkable advance of the Large Language Model (LLM) has\ninspired researchers to transfer its extraordinary reasoning capability to both\nvision and language data. However, the prevailing approaches primarily regard\nthe visual input as a prompt and focus exclusively on optimizing the text\ngeneration process conditioned upon vision content by a frozen LLM. Such an\ninequitable treatment of vision and language heavily constrains the model's\npotential. In this paper, we break through this limitation by representing both\nvision and language in a unified form. Specifically, we introduce a\nwell-designed visual tokenizer to translate the non-linguistic image into a\nsequence of discrete tokens like a foreign language that LLM can read. The\nresulting visual tokens encompass high-level semantics worthy of a word and\nalso support dynamic sequence length varying from the image. Coped with this\ntokenizer, the presented foundation model called LaVIT can handle both image\nand text indiscriminately under the same generative learning paradigm. This\nunification empowers LaVIT to serve as an impressive generalist interface to\nunderstand and generate multi-modal content simultaneously. Extensive\nexperiments further showcase that it outperforms the existing models by a large\nmargin on massive vision-language tasks. Our code and models are available at\nhttps://github.com/jy0205/LaVIT.\n","authors":["Yang Jin","Kun Xu","Kun Xu","Liwei Chen","Chao Liao","Jianchao Tan","Quzhe Huang","Bin Chen","Chenyi Lei","An Liu","Chengru Song","Xiaoqiang Lei","Di Zhang","Wenwu Ou","Kun Gai","Yadong Mu"],"pdf_url":"https://arxiv.org/pdf/2309.04669v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.14966v1","updated":"2024-03-22T05:38:15Z","published":"2024-03-22T05:38:15Z","title":"DreamFlow: High-Quality Text-to-3D Generation by Approximating\n Probability Flow","summary":" Recent progress in text-to-3D generation has been achieved through the\nutilization of score distillation methods: they make use of the pre-trained\ntext-to-image (T2I) diffusion models by distilling via the diffusion model\ntraining objective. However, such an approach inevitably results in the use of\nrandom timesteps at each update, which increases the variance of the gradient\nand ultimately prolongs the optimization process. In this paper, we propose to\nenhance the text-to-3D optimization by leveraging the T2I diffusion prior in\nthe generative sampling process with a predetermined timestep schedule. To this\nend, we interpret text-to3D optimization as a multi-view image-to-image\ntranslation problem, and propose a solution by approximating the probability\nflow. By leveraging the proposed novel optimization algorithm, we design\nDreamFlow, a practical three-stage coarseto-fine text-to-3D optimization\nframework that enables fast generation of highquality and high-resolution\n(i.e., 1024x1024) 3D contents. For example, we demonstrate that DreamFlow is 5\ntimes faster than the existing state-of-the-art text-to-3D method, while\nproducing more photorealistic 3D contents. 
Visit our project page\n(https://kyungmnlee.github.io/dreamflow.github.io/) for visualizations.\n","authors":["Kyungmin Lee","Kihyuk Sohn","Jinwoo Shin"],"pdf_url":"https://arxiv.org/pdf/2403.14966v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2312.03849v2","updated":"2024-03-22T05:03:34Z","published":"2023-12-06T19:02:40Z","title":"LEGO: Learning EGOcentric Action Frame Generation via Visual Instruction\n Tuning","summary":" Generating instructional images of human daily actions from an egocentric\nviewpoint serves as a key step towards efficient skill transfer. In this paper,\nwe introduce a novel problem -- egocentric action frame generation. The goal is\nto synthesize an image depicting an action in the user's context (i.e., action\nframe) by conditioning on a user prompt and an input egocentric image. Notably,\nexisting egocentric action datasets lack the detailed annotations that describe\nthe execution of actions. Additionally, existing diffusion-based image\nmanipulation models are sub-optimal in controlling the state transition of an\naction in egocentric image pixel space because of the domain gap. To this end,\nwe propose to Learn EGOcentric (LEGO) action frame generation via visual\ninstruction tuning. First, we introduce a prompt enhancement scheme to generate\nenriched action descriptions from a visual large language model (VLLM) by\nvisual instruction tuning. Then we propose a novel method to leverage image and\ntext embeddings from the VLLM as additional conditioning to improve the\nperformance of a diffusion model. We validate our model on two egocentric\ndatasets -- Ego4D and Epic-Kitchens. Our experiments show substantial\nimprovement over prior image manipulation models in both quantitative and\nqualitative evaluation. We also conduct detailed ablation studies and analysis\nto provide insights in our method. More details of the dataset and code are\navailable on the website (https://bolinlai.github.io/Lego_EgoActGen/).\n","authors":["Bolin Lai","Xiaoliang Dai","Lawrence Chen","Guan Pang","James M. Rehg","Miao Liu"],"pdf_url":"https://arxiv.org/pdf/2312.03849v2.pdf","comment":"34 pages"},{"id":"http://arxiv.org/abs/2403.09920v3","updated":"2024-03-22T04:53:29Z","published":"2024-03-14T23:41:00Z","title":"Predicting Generalization of AI Colonoscopy Models to Unseen Data","summary":" $\\textbf{Background}$: Generalizability of AI colonoscopy algorithms is\nimportant for wider adoption in clinical practice. However, current techniques\nfor evaluating performance on unseen data require expensive and time-intensive\nlabels.\n $\\textbf{Methods}$: We use a \"Masked Siamese Network\" (MSN) to identify novel\nphenomena in unseen data and predict polyp detector performance. MSN is trained\nto predict masked out regions of polyp images, without any labels. We test\nMSN's ability to be trained on data only from Israel and detect unseen\ntechniques, narrow-band imaging (NBI) and chromendoscoy (CE), on colonoscopes\nfrom Japan (354 videos, 128 hours). We also test MSN's ability to predict\nperformance of Computer Aided Detection (CADe) of polyps on colonoscopies from\nboth countries, even though MSN is not trained on data from Japan.\n $\\textbf{Results}$: MSN correctly identifies NBI and CE as less similar to\nIsrael whitelight than Japan whitelight (bootstrapped z-test, |z| > 496, p <\n10^-8 for both) using the label-free Frechet distance. 
MSN detects NBI with 99%\naccuracy, predicts CE better than our heuristic (90% vs 79% accuracy) despite\nbeing trained only on whitelight, and is the only method that is robust to\nnoisy labels. MSN predicts CADe polyp detector performance on in-domain Israel\nand out-of-domain Japan colonoscopies (r=0.79, 0.37 respectively). With few\nexamples of Japan detector performance to train on, MSN prediction of Japan\nperformance improves (r=0.56).\n $\\textbf{Conclusion}$: Our technique can identify distribution shifts in\nclinical data and can predict CADe detector performance on unseen data, without\nlabels. Our self-supervised approach can aid in detecting when data in practice\nis different from training, such as between hospitals or data has meaningfully\nshifted from training. MSN has potential for application to medical image\ndomains beyond colonoscopy.\n","authors":["Joel Shor","Carson McNeil","Yotam Intrator","Joseph R Ledsam","Hiro-o Yamano","Daisuke Tsurumaru","Hiroki Kayama","Atsushi Hamabe","Koji Ando","Mitsuhiko Ota","Haruei Ogino","Hiroshi Nakase","Kaho Kobayashi","Masaaki Miyo","Eiji Oki","Ichiro Takemasa","Ehud Rivlin","Roman Goldenberg"],"pdf_url":"https://arxiv.org/pdf/2403.09920v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13602v3","updated":"2024-03-22T04:46:55Z","published":"2023-11-22T18:59:53Z","title":"Retrieval-Augmented Layout Transformer for Content-Aware Layout\n Generation","summary":" Content-aware graphic layout generation aims to automatically arrange visual\nelements along with a given content, such as an e-commerce product image. In\nthis paper, we argue that the current layout generation approaches suffer from\nthe limited training data for the high-dimensional layout structure. We show\nthat a simple retrieval augmentation can significantly improve the generation\nquality. Our model, which is named Retrieval-Augmented Layout Transformer\n(RALF), retrieves nearest neighbor layout examples based on an input image and\nfeeds these results into an autoregressive generator. Our model can apply\nretrieval augmentation to various controllable generation tasks and yield\nhigh-quality layouts within a unified architecture. Our extensive experiments\nshow that RALF successfully generates content-aware layouts in both constrained\nand unconstrained settings and significantly outperforms the baselines.\n","authors":["Daichi Horita","Naoto Inoue","Kotaro Kikuchi","Kota Yamaguchi","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2311.13602v3.pdf","comment":"Accepted to CVPR 2024, Project website:\n https://udonda.github.io/RALF/"},{"id":"http://arxiv.org/abs/2403.14947v1","updated":"2024-03-22T04:39:15Z","published":"2024-03-22T04:39:15Z","title":"GPT-Connect: Interaction between Text-Driven Human Motion Generator and\n 3D Scenes in a Training-free Manner","summary":" Recently, while text-driven human motion generation has received massive\nresearch attention, most existing text-driven motion generators are generally\nonly designed to generate motion sequences in a blank background. While this is\nthe case, in practice, human beings naturally perform their motions in 3D\nscenes, rather than in a blank background. Considering this, we here aim to\nperform scene-aware text-drive motion generation instead. Yet, intuitively\ntraining a separate scene-aware motion generator in a supervised way can\nrequire a large amount of motion samples to be troublesomely collected and\nannotated in a large scale of different 3D scenes. 
To handle this task rather\nin a relatively convenient manner, in this paper, we propose a novel\nGPT-connect framework. In GPT-connect, we enable scene-aware motion sequences\nto be generated directly utilizing the existing blank-background human motion\ngenerator, via leveraging ChatGPT to connect the existing motion generator with\nthe 3D scene in a totally training-free manner. Extensive experiments\ndemonstrate the efficacy and generalizability of our proposed framework.\n","authors":["Haoxuan Qu","Ziyan Guo","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2403.14947v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14944v1","updated":"2024-03-22T04:34:59Z","published":"2024-03-22T04:34:59Z","title":"CLIP-VQDiffusion : Langauge Free Training of Text To Image generation\n using CLIP and vector quantized diffusion model","summary":" There has been a significant progress in text conditional image generation\nmodels. Recent advancements in this field depend not only on improvements in\nmodel structures, but also vast quantities of text-image paired datasets.\nHowever, creating these kinds of datasets is very costly and requires a\nsubstantial amount of labor. Famous face datasets don't have corresponding text\ncaptions, making it difficult to develop text conditional image generation\nmodels on these datasets. Some research has focused on developing text to image\ngeneration models using only images without text captions. Here, we propose\nCLIP-VQDiffusion, which leverage the pretrained CLIP model to provide\nmultimodal text-image representations and strong image generation capabilities.\nOn the FFHQ dataset, our model outperformed previous state-of-the-art methods\nby 4.4% in clipscore and generated very realistic images even when the text was\nboth in and out of distribution. The pretrained models and codes will soon be\navailable at https://github.com/INFINIQ-AI1/CLIPVQDiffusion\n","authors":["Seungdae Han","Joohee Kim"],"pdf_url":"https://arxiv.org/pdf/2403.14944v1.pdf","comment":"15 pages, 9 figures"},{"id":"http://arxiv.org/abs/2403.14939v1","updated":"2024-03-22T04:16:33Z","published":"2024-03-22T04:16:33Z","title":"STAG4D: Spatial-Temporal Anchored Generative 4D Gaussians","summary":" Recent progress in pre-trained diffusion models and 3D generation have\nspurred interest in 4D content creation. However, achieving high-fidelity 4D\ngeneration with spatial-temporal consistency remains a challenge. In this work,\nwe propose STAG4D, a novel framework that combines pre-trained diffusion models\nwith dynamic 3D Gaussian splatting for high-fidelity 4D generation. Drawing\ninspiration from 3D generation techniques, we utilize a multi-view diffusion\nmodel to initialize multi-view images anchoring on the input video frames,\nwhere the video can be either real-world captured or generated by a video\ndiffusion model. To ensure the temporal consistency of the multi-view sequence\ninitialization, we introduce a simple yet effective fusion strategy to leverage\nthe first frame as a temporal anchor in the self-attention computation. With\nthe almost consistent multi-view sequences, we then apply the score\ndistillation sampling to optimize the 4D Gaussian point cloud. The 4D Gaussian\nspatting is specially crafted for the generation task, where an adaptive\ndensification strategy is proposed to mitigate the unstable Gaussian gradient\nfor robust optimization. 
Notably, the proposed pipeline does not require any\npre-training or fine-tuning of diffusion networks, offering a more accessible\nand practical solution for the 4D generation task. Extensive experiments\ndemonstrate that our method outperforms prior 4D generation works in rendering\nquality, spatial-temporal consistency, and generation robustness, setting a new\nstate-of-the-art for 4D generation from diverse inputs, including text, image,\nand video.\n","authors":["Yifei Zeng","Yanqin Jiang","Siyu Zhu","Yuanxun Lu","Youtian Lin","Hao Zhu","Weiming Hu","Xun Cao","Yao Yao"],"pdf_url":"https://arxiv.org/pdf/2403.14939v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08262v3","updated":"2024-03-22T04:06:31Z","published":"2024-03-13T05:25:49Z","title":"BiTT: Bi-directional Texture Reconstruction of Interacting Two Hands\n from a Single Image","summary":" Creating personalized hand avatars is important to offer a realistic\nexperience to users on AR / VR platforms. While most prior studies focused on\nreconstructing 3D hand shapes, some recent work has tackled the reconstruction\nof hand textures on top of shapes. However, these methods are often limited to\ncapturing pixels on the visible side of a hand, requiring diverse views of the\nhand in a video or multiple images as input. In this paper, we propose a novel\nmethod, BiTT(Bi-directional Texture reconstruction of Two hands), which is the\nfirst end-to-end trainable method for relightable, pose-free texture\nreconstruction of two interacting hands taking only a single RGB image, by\nthree novel components: 1) bi-directional (left $\\leftrightarrow$ right)\ntexture reconstruction using the texture symmetry of left / right hands, 2)\nutilizing a texture parametric model for hand texture recovery, and 3) the\noverall coarse-to-fine stage pipeline for reconstructing personalized texture\nof two interacting hands. BiTT first estimates the scene light condition and\nalbedo image from an input image, then reconstructs the texture of both hands\nthrough the texture parametric model and bi-directional texture reconstructor.\nIn experiments using InterHand2.6M and RGB2Hands datasets, our method\nsignificantly outperforms state-of-the-art hand texture reconstruction methods\nquantitatively and qualitatively. The code is available at\nhttps://github.com/yunminjin2/BiTT\n","authors":["Minje Kim","Tae-Kyun Kim"],"pdf_url":"https://arxiv.org/pdf/2403.08262v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.14937v1","updated":"2024-03-22T03:47:02Z","published":"2024-03-22T03:47:02Z","title":"Survey on Modeling of Articulated Objects","summary":" 3D modeling of articulated objects is a research problem within computer\nvision, graphics, and robotics. Its objective is to understand the shape and\nmotion of the articulated components, represent the geometry and mobility of\nobject parts, and create realistic models that reflect articulated objects in\nthe real world. This survey provides a comprehensive overview of the current\nstate-of-the-art in 3D modeling of articulated objects, with a specific focus\non the task of articulated part perception and articulated object creation\n(reconstruction and generation). We systematically review and discuss the\nrelevant literature from two perspectives: geometry processing and articulation\nmodeling. Through this survey, we highlight the substantial progress made in\nthese areas, outline the ongoing challenges, and identify gaps for future\nresearch. 
Our survey aims to serve as a foundational reference for researchers\nand practitioners in computer vision and graphics, offering insights into the\ncomplexities of articulated object modeling.\n","authors":["Jiayi Liu","Manolis Savva","Ali Mahdavi-Amiri"],"pdf_url":"https://arxiv.org/pdf/2403.14937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08699v3","updated":"2024-03-22T03:31:22Z","published":"2024-01-14T12:38:49Z","title":"On Image Search in Histopathology","summary":" Pathology images of histopathology can be acquired from camera-mounted\nmicroscopes or whole slide scanners. Utilizing similarity calculations to match\npatients based on these images holds significant potential in research and\nclinical contexts. Recent advancements in search technologies allow for\nimplicit quantification of tissue morphology across diverse primary sites,\nfacilitating comparisons and enabling inferences about diagnosis, and\npotentially prognosis, and predictions for new patients when compared against a\ncurated database of diagnosed and treated cases. In this paper, we\ncomprehensively review the latest developments in image search technologies for\nhistopathology, offering a concise overview tailored for computational\npathology researchers seeking effective, fast and efficient image search\nmethods in their work.\n","authors":["H. R. Tizhoosh","Liron Pantanowitz"],"pdf_url":"https://arxiv.org/pdf/2401.08699v3.pdf","comment":"A chapter in the Book \"Artificial INtelligence in Digital Pathology\"\n by Cohen and Chauhan, 2024"},{"id":"http://arxiv.org/abs/2401.06312v3","updated":"2024-03-22T03:14:53Z","published":"2024-01-12T00:49:49Z","title":"Video Super-Resolution Transformer with Masked Inter&Intra-Frame\n Attention","summary":" Recently, Vision Transformer has achieved great success in recovering missing\ndetails in low-resolution sequences, i.e., the video super-resolution (VSR)\ntask. Despite its superiority in VSR accuracy, the heavy computational burden\nas well as the large memory footprint hinder the deployment of\nTransformer-based VSR models on constrained devices. In this paper, we address\nthe above issue by proposing a novel feature-level masked processing framework:\nVSR with Masked Intra and inter frame Attention (MIA-VSR). The core of MIA-VSR\nis leveraging feature-level temporal continuity between adjacent frames to\nreduce redundant computations and make more rational use of previously enhanced\nSR features. Concretely, we propose an intra-frame and inter-frame attention\nblock which takes the respective roles of past features and input features into\nconsideration and only exploits previously enhanced features to provide\nsupplementary information. In addition, an adaptive block-wise mask prediction\nmodule is developed to skip unimportant computations according to feature\nsimilarity between adjacent frames. We conduct detailed ablation studies to\nvalidate our contributions and compare the proposed method with recent\nstate-of-the-art VSR approaches. The experimental results demonstrate that\nMIA-VSR improves the memory and computation efficiency over state-of-the-art\nmethods, without trading off PSNR accuracy. 
The code is available at\nhttps://github.com/LabShuHangGU/MIA-VSR.\n","authors":["Xingyu Zhou","Leheng Zhang","Xiaorui Zhao","Keze Wang","Leida Li","Shuhang Gu"],"pdf_url":"https://arxiv.org/pdf/2401.06312v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2312.05239v2","updated":"2024-03-22T03:13:23Z","published":"2023-12-08T18:44:09Z","title":"SwiftBrush: One-Step Text-to-Image Diffusion Model with Variational\n Score Distillation","summary":" Despite their ability to generate high-resolution and diverse images from\ntext prompts, text-to-image diffusion models often suffer from slow iterative\nsampling processes. Model distillation is one of the most effective directions\nto accelerate these models. However, previous distillation methods fail to\nretain the generation quality while requiring a significant amount of images\nfor training, either from real data or synthetically generated by the teacher\nmodel. In response to this limitation, we present a novel image-free\ndistillation scheme named $\\textbf{SwiftBrush}$. Drawing inspiration from\ntext-to-3D synthesis, in which a 3D neural radiance field that aligns with the\ninput prompt can be obtained from a 2D text-to-image diffusion prior via a\nspecialized loss without the use of any 3D data ground-truth, our approach\nre-purposes that same loss for distilling a pretrained multi-step text-to-image\nmodel to a student network that can generate high-fidelity images with just a\nsingle inference step. In spite of its simplicity, our model stands as one of\nthe first one-step text-to-image generators that can produce images of\ncomparable quality to Stable Diffusion without reliance on any training image\ndata. Remarkably, SwiftBrush achieves an FID score of $\\textbf{16.67}$ and a\nCLIP score of $\\textbf{0.29}$ on the COCO-30K benchmark, achieving competitive\nresults or even substantially surpassing existing state-of-the-art distillation\ntechniques.\n","authors":["Thuan Hoang Nguyen","Anh Tran"],"pdf_url":"https://arxiv.org/pdf/2312.05239v2.pdf","comment":"Accepted to CVPR 2024; Project Page:\n https://thuanz123.github.io/swiftbrush/"},{"id":"http://arxiv.org/abs/2311.10278v2","updated":"2024-03-22T03:09:25Z","published":"2023-11-17T01:55:15Z","title":"Physics-Enhanced Multi-fidelity Learning for Optical Surface Imprint","summary":" Human fingerprints serve as one unique and powerful characteristic for each\nperson, from which policemen can recognize the identity. Similar to humans,\nmany natural bodies and intrinsic mechanical qualities can also be uniquely\nidentified from surface characteristics. To measure the elasto-plastic\nproperties of one material, one formally sharp indenter is pushed into the\nmeasured body under constant force and retracted, leaving a unique residual\nimprint of the minute size from several micrometers to nanometers. However, one\ngreat challenge is how to map the optical image of this residual imprint into\nthe real wanted mechanical properties, \\ie, the tensile force curve. In this\npaper, we propose a novel method to use multi-fidelity neural networks (MFNN)\nto solve this inverse problem. We first build up the NN model via pure\nsimulation data, and then bridge the sim-to-real gap via transfer learning.\nConsidering the difficulty of collecting real experimental data, we use NN to\ndig out the unknown physics and also implant the known physics into the\ntransfer learning framework, thus highly improving the model stability and\ndecreasing the data requirement. 
The final constructed model only needs\nthree-shot calibration of real materials. We tested the final model across 20\nreal materials and achieved satisfying accuracy. This work serves as one great\nexample of applying machine learning into scientific research, especially under\nthe constraints of data limitation and fidelity variance.\n","authors":["Yongchao Chen"],"pdf_url":"https://arxiv.org/pdf/2311.10278v2.pdf","comment":"15 pages, 11 figure"},{"id":"http://arxiv.org/abs/2305.10061v2","updated":"2024-03-22T03:07:25Z","published":"2023-05-17T09:04:22Z","title":"Rethinking Boundary Discontinuity Problem for Oriented Object Detection","summary":" Oriented object detection has been developed rapidly in the past few years,\nwhere rotation equivariance is crucial for detectors to predict rotated boxes.\nIt is expected that the prediction can maintain the corresponding rotation when\nobjects rotate, but severe mutation in angular prediction is sometimes observed\nwhen objects rotate near the boundary angle, which is well-known boundary\ndiscontinuity problem. The problem has been long believed to be caused by the\nsharp loss increase at the angular boundary, and widely used joint-optim\nIoU-like methods deal with this problem by loss-smoothing. However, we\nexperimentally find that even state-of-the-art IoU-like methods actually fail\nto solve the problem. On further analysis, we find that the key to solution\nlies in encoding mode of the smoothing function rather than in joint or\nindependent optimization. In existing IoU-like methods, the model essentially\nattempts to fit the angular relationship between box and object, where the\nbreak point at angular boundary makes the predictions highly unstable.To deal\nwith this issue, we propose a dual-optimization paradigm for angles. We\ndecouple reversibility and joint-optim from single smoothing function into two\ndistinct entities, which for the first time achieves the objectives of both\ncorrecting angular boundary and blending angle with other parameters.Extensive\nexperiments on multiple datasets show that boundary discontinuity problem is\nwell-addressed. Moreover, typical IoU-like methods are improved to the same\nlevel without obvious performance gap. The code is available at\nhttps://github.com/hangxu-cv/cvpr24acm.\n","authors":["Hang Xu","Xinyuan Liu","Haonan Xu","Yike Ma","Zunjie Zhu","Chenggang Yan","Feng Dai"],"pdf_url":"https://arxiv.org/pdf/2305.10061v2.pdf","comment":"cvpr 2024"},{"id":"http://arxiv.org/abs/2311.16194v2","updated":"2024-03-22T02:53:59Z","published":"2023-11-26T14:24:13Z","title":"BadCLIP: Trigger-Aware Prompt Learning for Backdoor Attacks on CLIP","summary":" Contrastive Vision-Language Pre-training, known as CLIP, has shown promising\neffectiveness in addressing downstream image recognition tasks. However, recent\nworks revealed that the CLIP model can be implanted with a downstream-oriented\nbackdoor. On downstream tasks, one victim model performs well on clean samples\nbut predicts a specific target class whenever a specific trigger is present.\nFor injecting a backdoor, existing attacks depend on a large amount of\nadditional data to maliciously fine-tune the entire pre-trained CLIP model,\nwhich makes them inapplicable to data-limited scenarios. In this work,\nmotivated by the recent success of learnable prompts, we address this problem\nby injecting a backdoor into the CLIP model in the prompt learning stage. 
Our\nmethod named BadCLIP is built on a novel and effective mechanism in backdoor\nattacks on CLIP, i.e., influencing both the image and text encoders with the\ntrigger. It consists of a learnable trigger applied to images and a\ntrigger-aware context generator, such that the trigger can change text features\nvia trigger-aware prompts, resulting in a powerful and generalizable attack.\nExtensive experiments conducted on 11 datasets verify that the clean accuracy\nof BadCLIP is similar to those of advanced prompt learning methods and the\nattack success rate is higher than 99% in most cases. BadCLIP is also\ngeneralizable to unseen classes, and shows a strong generalization capability\nunder cross-dataset and cross-domain settings.\n","authors":["Jiawang Bai","Kuofeng Gao","Shaobo Min","Shu-Tao Xia","Zhifeng Li","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16194v2.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.01697v4","updated":"2024-03-22T02:47:00Z","published":"2023-12-04T07:36:04Z","title":"Hulk: A Universal Knowledge Translator for Human-Centric Tasks","summary":" Human-centric perception tasks, e.g., pedestrian detection, skeleton-based\naction recognition, and pose estimation, have wide industrial applications,\nsuch as metaverse and sports analysis. There is a recent surge to develop\nhuman-centric foundation models that can benefit a broad range of human-centric\nperception tasks. While many human-centric foundation models have achieved\nsuccess, they did not explore 3D and vision-language tasks for human-centric\nand required task-specific finetuning. These limitations restrict their\napplication to more downstream tasks and situations. To tackle these problems,\nwe present Hulk, the first multimodal human-centric generalist model, capable\nof addressing 2D vision, 3D vision, skeleton-based, and vision-language tasks\nwithout task-specific finetuning. The key to achieving this is condensing\nvarious task-specific heads into two general heads, one for discrete\nrepresentations, e.g., languages, and the other for continuous representations,\ne.g., location coordinates. The outputs of two heads can be further stacked\ninto four distinct input and output modalities. This uniform representation\nenables Hulk to treat diverse human-centric tasks as modality translation,\nintegrating knowledge across a wide range of tasks. Comprehensive evaluations\nof Hulk on 12 benchmarks covering 8 human-centric tasks demonstrate the\nsuperiority of our proposed method, achieving state-of-the-art performance in\n11 benchmarks. The code is available on https://github.com/OpenGVLab/Hulk.\n","authors":["Yizhou Wang","Yixuan Wu","Shixiang Tang","Weizhen He","Xun Guo","Feng Zhu","Lei Bai","Rui Zhao","Jian Wu","Tong He","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2312.01697v4.pdf","comment":"24 pages, 5 figures"},{"id":"http://arxiv.org/abs/2212.02340v4","updated":"2024-03-22T02:33:39Z","published":"2022-12-05T15:15:27Z","title":"CBNet: A Plug-and-Play Network for Segmentation-Based Scene Text\n Detection","summary":" Recently, segmentation-based methods are quite popular in scene text\ndetection, which mainly contain two steps: text kernel segmentation and\nexpansion. However, the segmentation process only considers each pixel\nindependently, and the expansion process is difficult to achieve a favorable\naccuracy-speed trade-off. In this paper, we propose a Context-aware and\nBoundary-guided Network (CBN) to tackle these problems. 
In CBN, a basic text\ndetector is firstly used to predict initial segmentation results. Then, we\npropose a context-aware module to enhance text kernel feature representations,\nwhich considers both global and local contexts. Finally, we introduce a\nboundary-guided module to expand enhanced text kernels adaptively with only the\npixels on the contours, which not only obtains accurate text boundaries but\nalso keeps high speed, especially on high-resolution output maps. In\nparticular, with a lightweight backbone, the basic detector equipped with our\nproposed CBN achieves state-of-the-art results on several popular benchmarks,\nand our proposed CBN can be plugged into several segmentation-based methods.\nCode is available at https://github.com/XiiZhao/cbn.pytorch.\n","authors":["Xi Zhao","Wei Feng","Zheng Zhang","Jingjing Lv","Xin Zhu","Zhangang Lin","Jinghe Hu","Jingping Shao"],"pdf_url":"https://arxiv.org/pdf/2212.02340v4.pdf","comment":"Accepted by IJCV 2024. Code is available at\n https://github.com/XiiZhao/cbn.pytorch"},{"id":"http://arxiv.org/abs/2308.12038v3","updated":"2024-03-22T02:24:57Z","published":"2023-08-23T09:55:41Z","title":"Large Multilingual Models Pivot Zero-Shot Multimodal Learning across\n Languages","summary":" Recently there has been a significant surge in multimodal learning in terms\nof both image-to-text and text-to-image generation. However, the success is\ntypically limited to English, leaving other languages largely behind. Building\na competitive counterpart in other languages is highly challenging due to the\nlow-resource nature of non-English multimodal data (i.e., lack of large-scale,\nhigh-quality image-text data). In this work, we propose MPM, an effective\ntraining paradigm for training large multimodal models in non-English\nlanguages. MPM demonstrates that Multilingual language models can Pivot\nzero-shot Multimodal learning across languages. Specifically, based on a strong\nmultilingual large language model, multimodal models pretrained on English-only\nimage-text data can well generalize to other languages in a (quasi)-zero-shot\nmanner, even surpassing models trained on image-text data in native languages.\nTaking Chinese as a practice of MPM, we build large multimodal models VisCPM in\nimage-to-text and text-to-image generation, which achieve state-of-the-art\n(open-source) performance in Chinese. To facilitate future research, we\nopen-source codes and model weights at https://github.com/OpenBMB/VisCPM.git.\n","authors":["Jinyi Hu","Yuan Yao","Chongyi Wang","Shan Wang","Yinxu Pan","Qianyu Chen","Tianyu Yu","Hanghao Wu","Yue Zhao","Haoye Zhang","Xu Han","Yankai Lin","Jiao Xue","Dahai Li","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12038v3.pdf","comment":"https://github.com/OpenBMB/VisCPM.git"},{"id":"http://arxiv.org/abs/2307.00574v5","updated":"2024-03-22T02:18:11Z","published":"2023-07-02T13:57:45Z","title":"Bidirectional Temporal Diffusion Model for Temporally Consistent Human\n Animation","summary":" We introduce a method to generate temporally coherent human animation from a\nsingle image, a video, or a random noise. This problem has been formulated as\nmodeling of an auto-regressive generation, i.e., to regress past frames to\ndecode future frames. However, such unidirectional generation is highly prone\nto motion drifting over time, generating unrealistic human animation with\nsignificant artifacts such as appearance distortion. 
We claim that\nbidirectional temporal modeling enforces temporal coherence on a generative\nnetwork by largely suppressing the motion ambiguity of human appearance. To\nprove our claim, we design a novel human animation framework using a denoising\ndiffusion model: a neural network learns to generate the image of a person by\ndenoising temporal Gaussian noises whose intermediate results are\ncross-conditioned bidirectionally between consecutive frames. In the\nexperiments, our method demonstrates strong performance compared to existing\nunidirectional approaches with realistic temporal coherence.\n","authors":["Tserendorj Adiya","Jae Shin Yoon","Jungeun Lee","Sanghun Kim","Hwasup Lim"],"pdf_url":"https://arxiv.org/pdf/2307.00574v5.pdf","comment":"Project page: see https://typest.github.io/btdm"},{"id":"http://arxiv.org/abs/2403.14468v2","updated":"2024-03-22T02:16:40Z","published":"2024-03-21T15:15:00Z","title":"AnyV2V: A Plug-and-Play Framework For Any Video-to-Video Editing Tasks","summary":" Video-to-video editing involves editing a source video along with additional\ncontrol (such as text prompts, subjects, or styles) to generate a new video\nthat aligns with the source video and the provided control. Traditional methods\nhave been constrained to certain editing types, limiting their ability to meet\nthe wide range of user demands. In this paper, we introduce AnyV2V, a novel\ntraining-free framework designed to simplify video editing into two primary\nsteps: (1) employing an off-the-shelf image editing model (e.g.\nInstructPix2Pix, InstantID, etc) to modify the first frame, (2) utilizing an\nexisting image-to-video generation model (e.g. I2VGen-XL) for DDIM inversion\nand feature injection. In the first stage, AnyV2V can plug in any existing\nimage editing tools to support an extensive array of video editing tasks.\nBeyond the traditional prompt-based editing methods, AnyV2V also can support\nnovel video editing tasks, including reference-based style transfer,\nsubject-driven editing, and identity manipulation, which were unattainable by\nprevious methods. In the second stage, AnyV2V can plug in any existing\nimage-to-video models to perform DDIM inversion and intermediate feature\ninjection to maintain the appearance and motion consistency with the source\nvideo. On the prompt-based editing, we show that AnyV2V can outperform the\nprevious best approach by 35\\% on prompt alignment, and 25\\% on human\npreference. On the three novel tasks, we show that AnyV2V also achieves a high\nsuccess rate. We believe AnyV2V will continue to thrive due to its ability to\nseamlessly integrate the fast-evolving image editing methods. Such\ncompatibility can help AnyV2V to increase its versatility to cater to diverse\nuser demands.\n","authors":["Max Ku","Cong Wei","Weiming Ren","Harry Yang","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2403.14468v2.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2306.07894v5","updated":"2024-03-22T02:10:49Z","published":"2023-06-13T16:39:39Z","title":"iSLAM: Imperative SLAM","summary":" Simultaneous Localization and Mapping (SLAM) stands as one of the critical\nchallenges in robot navigation. A SLAM system often consists of a front-end\ncomponent for motion estimation and a back-end system for eliminating\nestimation drifts. Recent advancements suggest that data-driven methods are\nhighly effective for front-end tasks, while geometry-based methods continue to\nbe essential in the back-end processes. 
However, such a decoupled paradigm\nbetween the data-driven front-end and geometry-based back-end can lead to\nsub-optimal performance, consequently reducing the system's capabilities and\ngeneralization potential. To solve this problem, we proposed a novel\nself-supervised imperative learning framework, named imperative SLAM (iSLAM),\nwhich fosters reciprocal correction between the front-end and back-end, thus\nenhancing performance without necessitating any external supervision.\nSpecifically, we formulate the SLAM problem as a bilevel optimization so that\nthe front-end and back-end are bidirectionally connected. As a result, the\nfront-end model can learn global geometric knowledge obtained through pose\ngraph optimization by back-propagating the residuals from the back-end\ncomponent. We showcase the effectiveness of this new framework through an\napplication of stereo-inertial SLAM. The experiments show that the iSLAM\ntraining strategy achieves an accuracy improvement of 22% on average over a\nbaseline model. To the best of our knowledge, iSLAM is the first SLAM system\nshowing that the front-end and back-end components can mutually correct each\nother in a self-supervised manner.\n","authors":["Taimeng Fu","Shaoshu Su","Yiren Lu","Chen Wang"],"pdf_url":"https://arxiv.org/pdf/2306.07894v5.pdf","comment":"The paper has been accepted by IEEE Robotics and Automation Letters\n (RA-L)"},{"id":"http://arxiv.org/abs/2403.14910v1","updated":"2024-03-22T02:06:44Z","published":"2024-03-22T02:06:44Z","title":"Defying Imbalanced Forgetting in Class Incremental Learning","summary":" We observe a high level of imbalance in the accuracy of different classes in\nthe same old task for the first time. This intriguing phenomenon, discovered in\nreplay-based Class Incremental Learning (CIL), highlights the imbalanced\nforgetting of learned classes, as their accuracy is similar before the\noccurrence of catastrophic forgetting. This discovery remains previously\nunidentified due to the reliance on average incremental accuracy as the\nmeasurement for CIL, which assumes that the accuracy of classes within the same\ntask is similar. However, this assumption is invalid in the face of\ncatastrophic forgetting. Further empirical studies indicate that this\nimbalanced forgetting is caused by conflicts in representation between\nsemantically similar old and new classes. These conflicts are rooted in the\ndata imbalance present in replay-based CIL methods. Building on these insights,\nwe propose CLass-Aware Disentanglement (CLAD) to predict the old classes that\nare more likely to be forgotten and enhance their accuracy. Importantly, CLAD\ncan be seamlessly integrated into existing CIL methods. Extensive experiments\ndemonstrate that CLAD consistently improves current replay-based methods,\nresulting in performance gains of up to 2.56%.\n","authors":["Shixiong Xu","Gaofeng Meng","Xing Nie","Bolin Ni","Bin Fan","Shiming Xiang"],"pdf_url":"https://arxiv.org/pdf/2403.14910v1.pdf","comment":"AAAI2024"},{"id":"http://arxiv.org/abs/2310.00258v2","updated":"2024-03-22T01:46:44Z","published":"2023-09-30T05:19:10Z","title":"NAYER: Noisy Layer Data Generation for Efficient and Effective Data-free\n Knowledge Distillation","summary":" Data-Free Knowledge Distillation (DFKD) has made significant recent strides\nby transferring knowledge from a teacher neural network to a student neural\nnetwork without accessing the original data. 
Nonetheless, existing approaches\nencounter a significant challenge when attempting to generate samples from\nrandom noise inputs, which inherently lack meaningful information.\nConsequently, these models struggle to effectively map this noise to the\nground-truth sample distribution, resulting in prolonging training times and\nlow-quality outputs. In this paper, we propose a novel Noisy Layer Generation\nmethod (NAYER) which relocates the random source from the input to a noisy\nlayer and utilizes the meaningful constant label-text embedding (LTE) as the\ninput. LTE is generated by using the language model once, and then it is stored\nin memory for all subsequent training processes. The significance of LTE lies\nin its ability to contain substantial meaningful inter-class information,\nenabling the generation of high-quality samples with only a few training steps.\nSimultaneously, the noisy layer plays a key role in addressing the issue of\ndiversity in sample generation by preventing the model from overemphasizing the\nconstrained label information. By reinitializing the noisy layer in each\niteration, we aim to facilitate the generation of diverse samples while still\nretaining the method's efficiency, thanks to the ease of learning provided by\nLTE. Experiments carried out on multiple datasets demonstrate that our NAYER\nnot only outperforms the state-of-the-art methods but also achieves speeds 5 to\n15 times faster than previous approaches. The code is available at\nhttps://github.com/tmtuan1307/nayer.\n","authors":["Minh-Tuan Tran","Trung Le","Xuan-May Le","Mehrtash Harandi","Quan Hung Tran","Dinh Phung"],"pdf_url":"https://arxiv.org/pdf/2310.00258v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2312.11038v2","updated":"2024-03-22T01:33:14Z","published":"2023-12-18T09:16:48Z","title":"UniChest: Conquer-and-Divide Pre-training for Multi-Source Chest X-Ray\n Classification","summary":" Vision-Language Pre-training (VLP) that utilizes the multi-modal information\nto promote the training efficiency and effectiveness, has achieved great\nsuccess in vision recognition of natural domains and shown promise in medical\nimaging diagnosis for the Chest X-Rays (CXRs). However, current works mainly\npay attention to the exploration on single dataset of CXRs, which locks the\npotential of this powerful paradigm on larger hybrid of multi-source CXRs\ndatasets. We identify that although blending samples from the diverse sources\noffers the advantages to improve the model generalization, it is still\nchallenging to maintain the consistent superiority for the task of each source\ndue to the existing heterogeneity among sources. To handle this dilemma, we\ndesign a Conquer-and-Divide pre-training framework, termed as UniChest, aiming\nto make full use of the collaboration benefit of multiple sources of CXRs while\nreducing the negative influence of the source heterogeneity. Specially, the\n``Conquer\" stage in UniChest encourages the model to sufficiently capture\nmulti-source common patterns, and the ``Divide\" stage helps squeeze\npersonalized patterns into different small experts (query networks). 
We conduct\nthorough experiments on many benchmarks, e.g., ChestX-ray14, CheXpert,\nVindr-CXR, Shenzhen, Open-I and SIIM-ACR Pneumothorax, verifying the\neffectiveness of UniChest over a range of baselines, and release our codes and\npre-training models at https://github.com/Elfenreigen/UniChest.\n","authors":["Tianjie Dai","Ruipeng Zhang","Feng Hong","Jiangchao Yao","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2312.11038v2.pdf","comment":"Accepted at IEEE Transactions on Medical Imaging"},{"id":"http://arxiv.org/abs/2311.18773v2","updated":"2024-03-22T01:21:14Z","published":"2023-11-30T18:19:23Z","title":"Spacewalk-18: A Benchmark for Multimodal and Long-form Procedural Video\n Understanding","summary":" Learning from videos is an emerging research area that enables robots to\nacquire skills from human demonstrations, such as procedural videos. To do\nthis, video-language models must be able to obtain structured understandings,\nsuch as the temporal segmentation of a demonstration into sequences of actions\nand skills, and to generalize the understandings to novel domains. In pursuit\nof this goal, we introduce Spacewalk-18, a benchmark containing two tasks: (1)\nstep recognition and (2) intra-video retrieval over a dataset of temporally\nsegmented and labeled tasks in International Space Station spacewalk\nrecordings. In tandem, the two tasks quantify a model's ability to make use of:\n(1) out-of-domain visual information; (2) a high temporal context window; and\n(3) multimodal (e.g. visual and speech) domains. This departs from existing\nbenchmarks for procedural video understanding, which typically deal with short\ncontext lengths and can be solved with a single modality. Spacewalk-18, with\nits inherent multimodal and long-form complexity, exposes the high difficulty\nof task recognition and segmentation. We find that state-of-the-art methods\nperform poorly on our benchmark, but improvements can be obtained by\nincorporating information from longer-range temporal context across different\nmodalities. Our experiments underscore the need to develop new approaches to\nthese tasks. Data, model, and code will be released at\nhttps://brown-palm.github.io/Spacewalk-18/.\n","authors":["Rohan Myer Krishnan","Zitian Tang","Zhiqiu Yu","Chen Sun"],"pdf_url":"https://arxiv.org/pdf/2311.18773v2.pdf","comment":"Under submission. Code and models will be released at\n https://brown-palm.github.io/Spacewalk-18/"},{"id":"http://arxiv.org/abs/2305.14521v2","updated":"2024-03-22T01:20:41Z","published":"2023-05-23T20:49:45Z","title":"Few-shot Adaption to Distribution Shifts By Mixing Source and Target\n Embeddings","summary":" Pretrained machine learning models need to be adapted to distribution shifts\nwhen deployed in new target environments. When obtaining labeled data from the\ntarget distribution is expensive, few-shot adaptation with only a few examples\nfrom the target distribution becomes essential. In this work, we propose\nMixPro, a lightweight and highly data-efficient approach for few-shot\nadaptation. MixPro first generates a relatively large dataset by mixing\n(linearly combining) pre-trained embeddings of large source data with those of\nthe few target examples. This process preserves important features of both\nsource and target distributions, while mitigating the specific noise in the\nsmall target data. Then, it trains a linear classifier on the mixed embeddings\nto effectively adapts the model to the target distribution without overfitting\nthe small target data. 
Theoretically, we demonstrate the advantages of MixPro\nover previous methods. Our experiments, conducted across various model\narchitectures on 8 datasets featuring different types of distribution shifts,\nreveal that MixPro can outperform baselines by up to 7\\%, with only 2-4 target\nexamples.\n","authors":["Yihao Xue","Ali Payani","Yu Yang","Baharan Mirzasoleiman"],"pdf_url":"https://arxiv.org/pdf/2305.14521v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13802v3","updated":"2024-03-22T01:17:25Z","published":"2023-05-23T08:15:02Z","title":"Online Open-set Semi-supervised Object Detection with Dual Competing\n Head","summary":" Open-set semi-supervised object detection (OSSOD) task leverages practical\nopen-set unlabeled datasets that comprise both in-distribution (ID) and\nout-of-distribution (OOD) instances for conducting semi-supervised object\ndetection (SSOD). The main challenge in OSSOD is distinguishing and filtering\nthe OOD instances (i.e., outliers) during pseudo-labeling since OODs will\naffect the performance. The only OSSOD work employs an additional offline OOD\ndetection network trained solely with labeled data to solve this problem.\nHowever, the limited labeled data restricts the potential for improvement.\nMeanwhile, the offline strategy results in low efficiency. To alleviate these\nissues, this paper proposes an end-to-end online OSSOD framework that improves\nperformance and efficiency: 1) We propose a semi-supervised outlier filtering\nmethod that more effectively filters the OOD instances using both labeled and\nunlabeled data. 2) We propose a threshold-free Dual Competing OOD head that\nfurther improves the performance by suppressing the error accumulation during\nsemi-supervised outlier filtering. 3) Our proposed method is an online\nend-to-end trainable OSSOD framework. Experimental results show that our method\nachieves state-of-the-art performance on several OSSOD benchmarks compared to\nexisting methods. Moreover, additional experiments show that our method is more\nefficient and can be easily applied to different SSOD frameworks to boost their\nperformance.\n","authors":["Zerun Wang","Ling Xiao","Liuyu Xiang","Zhaotian Weng","Toshihiko Yamasaki"],"pdf_url":"https://arxiv.org/pdf/2305.13802v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14898v1","updated":"2024-03-22T01:04:51Z","published":"2024-03-22T01:04:51Z","title":"Web-based Melanoma Detection","summary":" Melanoma is the most aggressive form of skin cancer, and early detection can\nsignificantly increase survival rates and prevent cancer spread. However,\ndeveloping reliable automated detection techniques is difficult due to the lack\nof standardized datasets and evaluation methods. This study introduces a\nunified melanoma classification approach that supports 54 combinations of 11\ndatasets and 24 state-of-the-art deep learning architectures. It enables a fair\ncomparison of 1,296 experiments and results in a lightweight model deployable\nto the web-based MeshNet architecture named Mela-D. This approach can run up to\n33x faster by reducing parameters 24x to yield an analogous 88.8\\% accuracy\ncomparable with ResNet50 on previously unseen images. 
This allows efficient and\naccurate melanoma detection in real-world settings that can run on\nconsumer-level hardware.\n","authors":["SangHyuk Kim","Edward Gaibor","Daniel Haehn"],"pdf_url":"https://arxiv.org/pdf/2403.14898v1.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2403.14897v1","updated":"2024-03-22T01:02:09Z","published":"2024-03-22T01:02:09Z","title":"Geometric Generative Models based on Morphological Equivariant PDEs and\n GANs","summary":" Content and image generation consist in creating or generating data from\nnoisy information by extracting specific features such as texture, edges, and\nother thin image structures. We are interested here in generative models, and\ntwo main problems are addressed. Firstly, the improvements of specific feature\nextraction while accounting at multiscale levels intrinsic geometric features;\nand secondly, the equivariance of the network to reduce its complexity and\nprovide a geometric interpretability. To proceed, we propose a geometric\ngenerative model based on an equivariant partial differential equation (PDE)\nfor group convolution neural networks (G-CNNs), so called PDE-G-CNNs, built on\nmorphology operators and generative adversarial networks (GANs). Equivariant\nmorphological PDE layers are composed of multiscale dilations and erosions\nformulated in Riemannian manifolds, while group symmetries are defined on a Lie\ngroup. We take advantage of the Lie group structure to properly integrate the\nequivariance in layers, and are able to use the Riemannian metric to solve the\nmultiscale morphological operations. Each point of the Lie group is associated\nwith a unique point in the manifold, which helps us derive a metric on the\nRiemannian manifold from a tensor field invariant under the Lie group so that\nthe induced metric has the same symmetries. The proposed geometric\nmorphological GAN (GM-GAN) is obtained by using the proposed morphological\nequivariant convolutions in PDE-G-CNNs to bring nonlinearity in classical CNNs.\nGM-GAN is evaluated on MNIST data and compared with GANs. Preliminary results\nshow that GM-GAN model outperforms classical GAN.\n","authors":["El Hadji S. Diop","Thierno Fall","Alioune Mbengue","Mohamed Daoudi"],"pdf_url":"https://arxiv.org/pdf/2403.14897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11125v3","updated":"2024-03-22T00:36:02Z","published":"2023-11-18T17:14:07Z","title":"SecondPose: SE(3)-Consistent Dual-Stream Feature Fusion for\n Category-Level Pose Estimation","summary":" Category-level object pose estimation, aiming to predict the 6D pose and 3D\nsize of objects from known categories, typically struggles with large\nintra-class shape variation. Existing works utilizing mean shapes often fall\nshort of capturing this variation. To address this issue, we present\nSecondPose, a novel approach integrating object-specific geometric features\nwith semantic category priors from DINOv2. Leveraging the advantage of DINOv2\nin providing SE(3)-consistent semantic features, we hierarchically extract two\ntypes of SE(3)-invariant geometric features to further encapsulate\nlocal-to-global object-specific information. These geometric features are then\npoint-aligned with DINOv2 features to establish a consistent object\nrepresentation under SE(3) transformations, facilitating the mapping from\ncamera space to the pre-defined canonical space, thus further enhancing pose\nestimation. 
Extensive experiments on NOCS-REAL275 demonstrate that SecondPose\nachieves a 12.4% leap forward over the state-of-the-art. Moreover, on a more\ncomplex dataset HouseCat6D which provides photometrically challenging objects,\nSecondPose still surpasses other competitors by a large margin.\n","authors":["Yamei Chen","Yan Di","Guangyao Zhai","Fabian Manhardt","Chenyangguang Zhang","Ruida Zhang","Federico Tombari","Nassir Navab","Benjamin Busam"],"pdf_url":"https://arxiv.org/pdf/2311.11125v3.pdf","comment":"CVPR 2024 accepted. Code is available at:\n https://github.com/NOrangeeroli/SecondPose"},{"id":"http://arxiv.org/abs/2403.12211v2","updated":"2024-03-22T00:17:11Z","published":"2024-03-18T19:51:55Z","title":"A Unified Model for Longitudinal Multi-Modal Multi-View Prediction with\n Missingness","summary":" Medical records often consist of different modalities, such as images, text,\nand tabular information. Integrating all modalities offers a holistic view of a\npatient's condition, while analyzing them longitudinally provides a better\nunderstanding of disease progression. However, real-world longitudinal medical\nrecords present challenges: 1) patients may lack some or all of the data for a\nspecific timepoint, and 2) certain modalities or views might be absent for all\npatients during a particular period. In this work, we introduce a unified model\nfor longitudinal multi-modal multi-view prediction with missingness. Our method\nallows as many timepoints as desired for input, and aims to leverage all\navailable data, regardless of their availability. We conduct extensive\nexperiments on the knee osteoarthritis dataset from the Osteoarthritis\nInitiative for pain and Kellgren-Lawrence grade prediction at a future\ntimepoint. We demonstrate the effectiveness of our method by comparing results\nfrom our unified model to specific models that use the same modality and view\ncombinations during training and evaluation. We also show the benefit of having\nextended temporal data and provide post-hoc analysis for a deeper understanding\nof each modality/view's importance for different tasks.\n","authors":["Boqi Chen","Junier Oliva","Marc Niethammer"],"pdf_url":"https://arxiv.org/pdf/2403.12211v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15651v1","updated":"2024-03-22T23:47:19Z","published":"2024-03-22T23:47:19Z","title":"GaNI: Global and Near Field Illumination Aware Neural Inverse Rendering","summary":" In this paper, we present GaNI, a Global and Near-field Illumination-aware\nneural inverse rendering technique that can reconstruct geometry, albedo, and\nroughness parameters from images of a scene captured with co-located light and\ncamera. Existing inverse rendering techniques with co-located light-camera\nfocus on single objects only, without modeling global illumination and\nnear-field lighting more prominent in scenes with multiple objects. We\nintroduce a system that solves this problem in two stages; we first reconstruct\nthe geometry powered by neural volumetric rendering NeuS, followed by inverse\nneural radiosity that uses the previously predicted geometry to estimate albedo\nand roughness. However, such a naive combination fails and we propose multiple\ntechnical contributions that enable this two-stage approach. We observe that\nNeuS fails to handle near-field illumination and strong specular reflections\nfrom the flashlight in a scene. 
We propose to implicitly model the effects of\nnear-field illumination and introduce a surface angle loss function to handle\nspecular reflections. Similarly, we observe that invNeRad assumes constant\nillumination throughout the capture and cannot handle moving flashlights during\ncapture. We propose a light position-aware radiance cache network and\nadditional smoothness priors on roughness to reconstruct reflectance.\nExperimental evaluation on synthetic and real data shows that our method\noutperforms the existing co-located light-camera-based inverse rendering\ntechniques. Our approach produces significantly better reflectance and slightly\nbetter geometry than capture strategies that do not require a dark room.\n","authors":["Jiaye Wu","Saeed Hadadan","Geng Lin","Matthias Zwicker","David Jacobs","Roni Sengupta"],"pdf_url":"https://arxiv.org/pdf/2403.15651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12039v2","updated":"2024-03-22T23:39:54Z","published":"2023-05-19T23:52:48Z","title":"Learning for Transductive Threshold Calibration in Open-World\n Recognition","summary":" In deep metric learning for visual recognition, the calibration of distance\nthresholds is crucial for achieving desired model performance in the true\npositive rates (TPR) or true negative rates (TNR). However, calibrating this\nthreshold presents challenges in open-world scenarios, where the test classes\ncan be entirely disjoint from those encountered during training. We define the\nproblem of finding distance thresholds for a trained embedding model to achieve\ntarget performance metrics over unseen open-world test classes as open-world\nthreshold calibration. Existing posthoc threshold calibration methods, reliant\non inductive inference and requiring a calibration dataset with a similar\ndistance distribution as the test data, often prove ineffective in open-world\nscenarios. To address this, we introduce OpenGCN, a Graph Neural Network-based\ntransductive threshold calibration method with enhanced adaptability and\nrobustness. OpenGCN learns to predict pairwise connectivity for the unlabeled\ntest instances embedded in a graph to determine its TPR and TNR at various\ndistance thresholds, allowing for transductive inference of the distance\nthresholds which also incorporates test-time information. Extensive experiments\nacross open-world visual recognition benchmarks validate OpenGCN's superiority\nover existing posthoc calibration methods for open-world threshold calibration.\n","authors":["Qin Zhang","Dongsheng An","Tianjun Xiao","Tong He","Qingming Tang","Ying Nian Wu","Joseph Tighe","Yifan Xing","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2305.12039v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15647v1","updated":"2024-03-22T23:08:31Z","published":"2024-03-22T23:08:31Z","title":"RetiGen: A Framework for Generalized Retinal Diagnosis Using Multi-View\n Fundus Images","summary":" This study introduces a novel framework for enhancing domain generalization\nin medical imaging, specifically focusing on utilizing unlabelled multi-view\ncolour fundus photographs. Unlike traditional approaches that rely on\nsingle-view imaging data and face challenges in generalizing across diverse\nclinical settings, our method leverages the rich information in the unlabelled\nmulti-view imaging data to improve model robustness and accuracy. 
By\nincorporating a class balancing method, a test-time adaptation technique and a\nmulti-view optimization strategy, we address the critical issue of domain shift\nthat often hampers the performance of machine learning models in real-world\napplications. Experiments comparing various state-of-the-art domain\ngeneralization and test-time optimization methodologies show that our approach\nconsistently outperforms when combined with existing baseline and\nstate-of-the-art methods. We also show our online method improves all existing\ntechniques. Our framework demonstrates improvements in domain generalization\ncapabilities and offers a practical solution for real-world deployment by\nfacilitating online adaptation to new, unseen datasets. Our code is available\nat https://github.com/zgy600/RetiGen .\n","authors":["Ze Chen","Gongyu Zhang","Jiayu Huo","Joan Nunez do Rio","Charalampos Komninos","Yang Liu","Rachel Sparks","Sebastien Ourselin","Christos Bergeles","Timothy Jackson"],"pdf_url":"https://arxiv.org/pdf/2403.15647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07786v3","updated":"2024-03-22T22:42:47Z","published":"2024-03-12T16:20:27Z","title":"Generative deep learning-enabled ultra-large field-of-view lens-free\n imaging","summary":" Advancements in high-throughput biomedical applications necessitate\nreal-time, large field-of-view (FOV) imaging capabilities. Conventional\nlens-free imaging (LFI) systems, while addressing the limitations of physical\nlenses, have been constrained by dynamic, hard-to-model optical fields,\nresulting in a limited one-shot FOV of approximately 20 $mm^2$. This\nrestriction has been a major bottleneck in applications like live-cell imaging\nand automation of microfluidic systems for biomedical research. Here, we\npresent a deep-learning(DL)-based imaging framework - GenLFI - leveraging\ngenerative artificial intelligence (AI) for holographic image reconstruction.\nWe demonstrate that GenLFI can achieve a real-time FOV over 550 $mm^2$,\nsurpassing the current LFI system by more than 20-fold, and even larger than\nthe world's largest confocal microscope by 1.76 times. The resolution is at the\nsub-pixel level of 5.52 $\\mu m$, without the need for a shifting light source.\nThe unsupervised learning-based reconstruction does not require optical field\nmodeling, making imaging dynamic 3D samples (e.g., droplet-based microfluidics\nand 3D cell models) in complex optical fields possible. This GenLFI framework\nunlocks the potential of LFI systems, offering a robust tool to tackle new\nfrontiers in high-throughput biomedical applications such as drug discovery.\n","authors":["Ronald B. Liu","Zhe Liu","Max G. A. Wolf","Krishna P. Purohit","Gregor Fritz","Yi Feng","Carsten G. Hansen","Pierre O. Bagnaninchi","Xavier Casadevall i Solvas","Yunjie Yang"],"pdf_url":"https://arxiv.org/pdf/2403.07786v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04662v2","updated":"2024-03-22T22:35:18Z","published":"2023-10-07T03:00:33Z","title":"HalluciDet: Hallucinating RGB Modality for Person Detection Through\n Privileged Information","summary":" A powerful way to adapt a visual recognition model to a new domain is through\nimage translation. However, common image translation approaches only focus on\ngenerating data from the same distribution as the target domain. 
Given a\ncross-modal application, such as pedestrian detection from aerial images, with\na considerable shift in data distribution between infrared (IR) to visible\n(RGB) images, a translation focused on generation might lead to poor\nperformance as the loss focuses on irrelevant details for the task. In this\npaper, we propose HalluciDet, an IR-RGB image translation model for object\ndetection. Instead of focusing on reconstructing the original image on the IR\nmodality, it seeks to reduce the detection loss of an RGB detector, and\ntherefore avoids the need to access RGB data. This model produces a new image\nrepresentation that enhances objects of interest in the scene and greatly\nimproves detection performance. We empirically compare our approach against\nstate-of-the-art methods for image translation and for fine-tuning on IR, and\nshow that our HalluciDet improves detection accuracy in most cases by\nexploiting the privileged information encoded in a pre-trained RGB detector.\nCode: https://github.com/heitorrapela/HalluciDet\n","authors":["Heitor Rapela Medeiros","Fidel A. Guerrero Pena","Masih Aminbeidokhti","Thomas Dubail","Eric Granger","Marco Pedersoli"],"pdf_url":"https://arxiv.org/pdf/2310.04662v2.pdf","comment":"IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)\n 2024"},{"id":"http://arxiv.org/abs/2403.15624v1","updated":"2024-03-22T21:28:19Z","published":"2024-03-22T21:28:19Z","title":"Semantic Gaussians: Open-Vocabulary Scene Understanding with 3D Gaussian\n Splatting","summary":" Open-vocabulary 3D scene understanding presents a significant challenge in\ncomputer vision, with wide-ranging applications in embodied agents and augmented\nreality systems. Previous approaches have adopted Neural Radiance Fields (NeRFs)\nto analyze 3D scenes. In this paper, we introduce Semantic Gaussians, a novel\nopen-vocabulary scene understanding approach based on 3D Gaussian Splatting.\nOur key idea is distilling pre-trained 2D semantics into 3D Gaussians. We design\na versatile projection approach that maps various 2D semantic features from\npre-trained image encoders into a novel semantic component of 3D Gaussians,\nwithout the additional training required by NeRFs. We further build a 3D\nsemantic network that directly predicts the semantic component from raw 3D\nGaussians for fast inference. 
We explore several applications ofSemantic\nGaussians: semantic segmentation on ScanNet-20, where our approach attains a\n4.2% mIoU and 4.0%mAcc improvement over prior open-vocabulary scene\nunderstanding counterparts; object part segmentation,sceneediting, and\nspatial-temporal segmentation with better qualitative results over 2D and 3D\nbaselines,highlighting its versatility and effectiveness on supporting diverse\ndownstream tasks.\n","authors":["Jun Guo","Xiaojian Ma","Yue Fan","Huaping Liu","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2403.15624v1.pdf","comment":"Project page: see https://semantic-gaussians.github.io"},{"id":"http://arxiv.org/abs/2403.15612v1","updated":"2024-03-22T20:49:26Z","published":"2024-03-22T20:49:26Z","title":"InterFusion: Text-Driven Generation of 3D Human-Object Interaction","summary":" In this study, we tackle the complex task of generating 3D human-object\ninteractions (HOI) from textual descriptions in a zero-shot text-to-3D manner.\nWe identify and address two key challenges: the unsatisfactory outcomes of\ndirect text-to-3D methods in HOI, largely due to the lack of paired\ntext-interaction data, and the inherent difficulties in simultaneously\ngenerating multiple concepts with complex spatial relationships. To effectively\naddress these issues, we present InterFusion, a two-stage framework\nspecifically designed for HOI generation. InterFusion involves human pose\nestimations derived from text as geometric priors, which simplifies the\ntext-to-3D conversion process and introduces additional constraints for\naccurate object generation. At the first stage, InterFusion extracts 3D human\nposes from a synthesized image dataset depicting a wide range of interactions,\nsubsequently mapping these poses to interaction descriptions. The second stage\nof InterFusion capitalizes on the latest developments in text-to-3D generation,\nenabling the production of realistic and high-quality 3D HOI scenes. This is\nachieved through a local-global optimization process, where the generation of\nhuman body and object is optimized separately, and jointly refined with a\nglobal optimization of the entire scene, ensuring a seamless and contextually\ncoherent integration. Our experimental results affirm that InterFusion\nsignificantly outperforms existing state-of-the-art methods in 3D HOI\ngeneration.\n","authors":["Sisi Dai","Wenhao Li","Haowen Sun","Haibin Huang","Chongyang Ma","Hui Huang","Kai Xu","Ruizhen Hu"],"pdf_url":"https://arxiv.org/pdf/2403.15612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15609v1","updated":"2024-03-22T20:38:40Z","published":"2024-03-22T20:38:40Z","title":"Towards Automatic Abdominal MRI Organ Segmentation: Leveraging\n Synthesized Data Generated From CT Labels","summary":" Deep learning has shown great promise in the ability to automatically\nannotate organs in magnetic resonance imaging (MRI) scans, for example, of the\nbrain. However, despite advancements in the field, the ability to accurately\nsegment abdominal organs remains difficult across MR. In part, this may be\nexplained by the much greater variability in image appearance and severely\nlimited availability of training labels. The inherent nature of computed\ntomography (CT) scans makes it easier to annotate, resulting in a larger\navailability of expert annotations for the latter. 
We leverage a\nmodality-agnostic domain randomization approach, utilizing CT label maps to\ngenerate synthetic images on-the-fly during training, further used to train a\nU-Net segmentation network for abdominal organs segmentation. Our approach\nshows comparable results compared to fully-supervised segmentation methods\ntrained on MR data. Our method results in Dice scores of 0.90 (0.08) and 0.91\n(0.08) for the right and left kidney respectively, compared to a pretrained\nnnU-Net model yielding 0.87 (0.20) and 0.91 (0.03). We will make our code\npublicly available.\n","authors":["Cosmin Ciausu","Deepa Krishnaswamy","Benjamin Billot","Steve Pieper","Ron Kikinis","Andrey Fedorov"],"pdf_url":"https://arxiv.org/pdf/2403.15609v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2403.15605v1","updated":"2024-03-22T20:22:08Z","published":"2024-03-22T20:22:08Z","title":"Efficiently Assemble Normalization Layers and Regularization for\n Federated Domain Generalization","summary":" Domain shift is a formidable issue in Machine Learning that causes a model to\nsuffer from performance degradation when tested on unseen domains. Federated\nDomain Generalization (FedDG) attempts to train a global model using\ncollaborative clients in a privacy-preserving manner that can generalize well\nto unseen clients possibly with domain shift. However, most existing FedDG\nmethods either cause additional privacy risks of data leakage or induce\nsignificant costs in client communication and computation, which are major\nconcerns in the Federated Learning paradigm. To circumvent these challenges,\nhere we introduce a novel architectural method for FedDG, namely gPerXAN, which\nrelies on a normalization scheme working with a guiding regularizer. In\nparticular, we carefully design Personalized eXplicitly Assembled Normalization\nto enforce client models selectively filtering domain-specific features that\nare biased towards local data while retaining discrimination of those features.\nThen, we incorporate a simple yet effective regularizer to guide these models\nin directly capturing domain-invariant representations that the global model's\nclassifier can leverage. Extensive experimental results on two benchmark\ndatasets, i.e., PACS and Office-Home, and a real-world medical dataset,\nCamelyon17, indicate that our proposed method outperforms other existing\nmethods in addressing this particular problem.\n","authors":["Khiem Le","Long Ho","Cuong Do","Danh Le-Phuoc","Kok-Seng Wong"],"pdf_url":"https://arxiv.org/pdf/2403.15605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15603v1","updated":"2024-03-22T20:11:19Z","published":"2024-03-22T20:11:19Z","title":"Forward Learning for Gradient-based Black-box Saliency Map Generation","summary":" Gradient-based saliency maps are widely used to explain deep neural network\ndecisions. However, as models become deeper and more black-box, such as in\nclosed-source APIs like ChatGPT, computing gradients become challenging,\nhindering conventional explanation methods. In this work, we introduce a novel\nunified framework for estimating gradients in black-box settings and generating\nsaliency maps to interpret model decisions. We employ the likelihood ratio\nmethod to estimate output-to-input gradients and utilize them for saliency map\ngeneration. Additionally, we propose blockwise computation techniques to\nenhance estimation accuracy. 
Extensive experiments in black-box settings\nvalidate the effectiveness of our method, demonstrating accurate gradient\nestimation and explainability of generated saliency maps. Furthermore, we\nshowcase the scalability of our approach by applying it to explain GPT-Vision,\nrevealing the continued relevance of gradient-based explanation methods in the\nera of large, closed-source, and black-box models.\n","authors":["Zeliang Zhang","Mingqian Feng","Jinyang Jiang","Rongyi Zhu","Yijie Peng","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.15603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15593v1","updated":"2024-03-22T19:41:26Z","published":"2024-03-22T19:41:26Z","title":"FairerCLIP: Debiasing CLIP's Zero-Shot Predictions using Functions in\n RKHSs","summary":" Large pre-trained vision-language models such as CLIP provide compact and\ngeneral-purpose representations of text and images that are demonstrably\neffective across multiple downstream zero-shot prediction tasks. However, owing\nto the nature of their training process, these models have the potential to 1)\npropagate or amplify societal biases in the training data and 2) learn to rely\non spurious features. This paper proposes FairerCLIP, a general approach for\nmaking zero-shot predictions of CLIP more fair and robust to spurious\ncorrelations. We formulate the problem of jointly debiasing CLIP's image and\ntext representations in reproducing kernel Hilbert spaces (RKHSs), which\naffords multiple benefits: 1) Flexibility: Unlike existing approaches, which\nare specialized to either learn with or without ground-truth labels, FairerCLIP\nis adaptable to learning in both scenarios. 2) Ease of Optimization: FairerCLIP\nlends itself to an iterative optimization involving closed-form solvers, which\nleads to $4\\times$-$10\\times$ faster training than the existing methods. 3)\nSample Efficiency: Under sample-limited conditions, FairerCLIP significantly\noutperforms baselines when they fail entirely. And, 4) Performance:\nEmpirically, FairerCLIP achieves appreciable accuracy gains on benchmark\nfairness and spurious correlation datasets over their respective baselines.\n","authors":["Sepehr Dehdashtian","Lan Wang","Vishnu Naresh Boddeti"],"pdf_url":"https://arxiv.org/pdf/2403.15593v1.pdf","comment":"The Twelfth International Conference on Learning Representations\n (ICLR) 2024"},{"id":"http://arxiv.org/abs/2403.15585v1","updated":"2024-03-22T19:19:51Z","published":"2024-03-22T19:19:51Z","title":"MedPromptX: Grounded Multimodal Prompting for Chest X-ray Diagnosis","summary":" Chest X-ray images are commonly used for predicting acute and chronic\ncardiopulmonary conditions, but efforts to integrate them with structured\nclinical data face challenges due to incomplete electronic health records\n(EHR). This paper introduces \\textbf{MedPromptX}, the first model to integrate\nmultimodal large language models (MLLMs), few-shot prompting (FP) and visual\ngrounding (VG) to combine imagery with EHR data for chest X-ray diagnosis. A\npre-trained MLLM is utilized to complement the missing EHR information,\nproviding a comprehensive understanding of patients' medical history.\nAdditionally, FP reduces the necessity for extensive training of MLLMs while\neffectively tackling the issue of hallucination. 
Nevertheless, the process of\ndetermining the optimal number of few-shot examples and selecting high-quality\ncandidates can be burdensome, yet it profoundly influences model performance.\nHence, we propose a new technique that dynamically refines few-shot data for\nreal-time adjustment to new patient scenarios. Moreover, VG aids in focusing\nthe model's attention on relevant regions of interest in X-ray images,\nenhancing the identification of abnormalities. We release MedPromptX-VQA, a new\nin-context visual question answering dataset encompassing interleaved image and\nEHR data derived from MIMIC-IV and MIMIC-CXR databases. Results demonstrate the\nSOTA performance of MedPromptX, achieving an 11% improvement in F1-score\ncompared to the baselines. Code and data are available at\n\\url{https://github.com/BioMedIA-MBZUAI/MedPromptX}.\n","authors":["Mai A. Shaaban","Adnan Khan","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.15585v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06054v3","updated":"2024-03-22T19:14:59Z","published":"2024-03-10T00:47:05Z","title":"Decoupled Data Consistency with Diffusion Purification for Image\n Restoration","summary":" Diffusion models have recently gained traction as a powerful class of deep\ngenerative priors, excelling in a wide range of image restoration tasks due to\ntheir exceptional ability to model data distributions. To solve image\nrestoration problems, many existing techniques achieve data consistency by\nincorporating additional likelihood gradient steps into the reverse sampling\nprocess of diffusion models. However, the additional gradient steps pose a\nchallenge for real-world practical applications as they incur a large\ncomputational overhead, thereby increasing inference time. They also present\nadditional difficulties when using accelerated diffusion model samplers, as the\nnumber of data consistency steps is limited by the number of reverse sampling\nsteps. In this work, we propose a novel diffusion-based image restoration\nsolver that addresses these issues by decoupling the reverse process from the\ndata consistency steps. Our method involves alternating between a\nreconstruction phase to maintain data consistency and a refinement phase that\nenforces the prior via diffusion purification. Our approach demonstrates\nversatility, making it highly adaptable for efficient problem-solving in latent\nspace. Additionally, it reduces the necessity for numerous sampling steps\nthrough the integration of consistency models. The efficacy of our approach is\nvalidated through comprehensive experiments across various image restoration\ntasks, including image denoising, deblurring, inpainting, and super-resolution.\n","authors":["Xiang Li","Soo Min Kwon","Ismail R. Alkhouri","Saiprasad Ravishankar","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2403.06054v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15583v1","updated":"2024-03-22T19:14:28Z","published":"2024-03-22T19:14:28Z","title":"U-ARE-ME: Uncertainty-Aware Rotation Estimation in Manhattan\n Environments","summary":" Camera rotation estimation from a single image is a challenging task, often\nrequiring depth data and/or camera intrinsics, which are generally not\navailable for in-the-wild videos. Although external sensors such as inertial\nmeasurement units (IMUs) can help, they often suffer from drift and are not\napplicable in non-inertial reference frames. 
We present U-ARE-ME, an algorithm\nthat estimates camera rotation along with uncertainty from uncalibrated RGB\nimages. Using a Manhattan World assumption, our method leverages the per-pixel\ngeometric priors encoded in single-image surface normal predictions and\nperforms optimisation over the SO(3) manifold. Given a sequence of images, we\ncan use the per-frame rotation estimates and their uncertainty to perform\nmulti-frame optimisation, achieving robustness and temporal consistency. Our\nexperiments demonstrate that U-ARE-ME performs comparably to RGB-D methods and\nis more robust than sparse feature-based SLAM methods. We encourage the reader\nto view the accompanying video at https://callum-rhodes.github.io/U-ARE-ME for\na visual overview of our method.\n","authors":["Aalok Patwardhan","Callum Rhodes","Gwangbin Bae","Andrew J. Davison"],"pdf_url":"https://arxiv.org/pdf/2403.15583v1.pdf","comment":"For the project page and video see\n https://callum-rhodes.github.io/U-ARE-ME"},{"id":"http://arxiv.org/abs/2403.11401v2","updated":"2024-03-22T18:52:51Z","published":"2024-03-18T01:18:48Z","title":"Scene-LLM: Extending Language Model for 3D Visual Understanding and\n Reasoning","summary":" This paper introduces Scene-LLM, a 3D-visual-language model that enhances\nembodied agents' abilities in interactive 3D indoor environments by integrating\nthe reasoning strengths of Large Language Models (LLMs). Scene-LLM adopts a\nhybrid 3D visual feature representation, that incorporates dense spatial\ninformation and supports scene state updates. The model employs a projection\nlayer to efficiently project these features in the pre-trained textual\nembedding space, enabling effective interpretation of 3D visual information.\nUnique to our approach is the integration of both scene-level and ego-centric\n3D information. This combination is pivotal for interactive planning, where\nscene-level data supports global planning and ego-centric data is important for\nlocalization. Notably, we use ego-centric 3D frame features for feature\nalignment, an efficient technique that enhances the model's ability to align\nfeatures of small objects within the scene. Our experiments with Scene-LLM\ndemonstrate its strong capabilities in dense captioning, question answering,\nand interactive planning. We believe Scene-LLM advances the field of 3D visual\nunderstanding and reasoning, offering new possibilities for sophisticated agent\ninteractions in indoor settings.\n","authors":["Rao Fu","Jingyu Liu","Xilun Chen","Yixin Nie","Wenhan Xiong"],"pdf_url":"https://arxiv.org/pdf/2403.11401v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15571v1","updated":"2024-03-22T18:52:10Z","published":"2024-03-22T18:52:10Z","title":"Augmented Reality Warnings in Roadway Work Zones: Evaluating the Effect\n of Modality on Worker Reaction Times","summary":" Given the aging highway infrastructure requiring extensive rebuilding and\nenhancements, and the consequent rise in the number of work zones, there is an\nurgent need to develop advanced safety systems to protect workers. 
While\nAugmented Reality (AR) holds significant potential for delivering warnings to\nworkers, its integration into roadway work zones remains relatively unexplored.\nThe primary objective of this study is to improve safety measures within\nroadway work zones by conducting an extensive analysis of how different\ncombinations of multimodal AR warnings influence the reaction times of workers.\nThis paper addresses this gap through a series of experiments that aim to\nreplicate the distinctive conditions of roadway work zones, both in real-world\nand virtual reality environments. Our approach comprises three key components:\nan advanced AR system prototype, a VR simulation of AR functionality within the\nwork zone environment, and the Wizard of Oz technique to synchronize user\nexperiences across experiments. To assess reaction times, we leverage both the\nsimple reaction time (SRT) technique and an innovative vision-based metric that\nutilizes real-time pose estimation. By conducting five experiments in\ncontrolled outdoor work zones and indoor VR settings, our study provides\nvaluable information on how various multimodal AR warnings impact workers\nreaction times. Furthermore, our findings reveal the disparities in reaction\ntimes between VR simulations and real-world scenarios, thereby gauging VR's\ncapability to mirror the dynamics of roadway work zones. Furthermore, our\nresults substantiate the potential and reliability of vision-based reaction\ntime measurements. These insights resonate well with those derived using the\nSRT technique, underscoring the viability of this approach for tangible\nreal-world uses.\n","authors":["Sepehr Sabeti","Fatemeh Banani Ardacani","Omidreza Shoghli"],"pdf_url":"https://arxiv.org/pdf/2403.15571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15567v1","updated":"2024-03-22T18:43:46Z","published":"2024-03-22T18:43:46Z","title":"Do not trust what you trust: Miscalibration in Semi-supervised Learning","summary":" State-of-the-art semi-supervised learning (SSL) approaches rely on highly\nconfident predictions to serve as pseudo-labels that guide the training on\nunlabeled samples. An inherent drawback of this strategy stems from the quality\nof the uncertainty estimates, as pseudo-labels are filtered only based on their\ndegree of uncertainty, regardless of the correctness of their predictions.\nThus, assessing and enhancing the uncertainty of network predictions is of\nparamount importance in the pseudo-labeling process. In this work, we\nempirically demonstrate that SSL methods based on pseudo-labels are\nsignificantly miscalibrated, and formally demonstrate the minimization of the\nmin-entropy, a lower bound of the Shannon entropy, as a potential cause for\nmiscalibration. 
To alleviate this issue, we integrate a simple penalty term,\nwhich enforces the logit distances of the predictions on unlabeled samples to\nremain low, preventing the network predictions to become overconfident.\nComprehensive experiments on a variety of SSL image classification benchmarks\ndemonstrate that the proposed solution systematically improves the calibration\nperformance of relevant SSL models, while also enhancing their discriminative\npower, being an appealing addition to tackle SSL tasks.\n","authors":["Shambhavi Mishra","Balamurali Murugesan","Ismail Ben Ayed","Marco Pedersoli","Jose Dolz"],"pdf_url":"https://arxiv.org/pdf/2403.15567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05224v2","updated":"2024-03-22T18:39:41Z","published":"2024-01-10T15:51:39Z","title":"Do Vision and Language Encoders Represent the World Similarly?","summary":" Aligned text-image encoders such as CLIP have become the de facto model for\nvision-language tasks. Furthermore, modality-specific encoders achieve\nimpressive performances in their respective domains. This raises a central\nquestion: does an alignment exist between uni-modal vision and language\nencoders since they fundamentally represent the same physical world? Analyzing\nthe latent spaces structure of vision and language models on image-caption\nbenchmarks using the Centered Kernel Alignment (CKA), we find that the\nrepresentation spaces of unaligned and aligned encoders are semantically\nsimilar. In the absence of statistical similarity in aligned encoders like\nCLIP, we show that a possible matching of unaligned encoders exists without any\ntraining. We frame this as a seeded graph-matching problem exploiting the\nsemantic similarity between graphs and propose two methods - a Fast Quadratic\nAssignment Problem optimization, and a novel localized CKA metric-based\nmatching/retrieval. We demonstrate the effectiveness of this on several\ndownstream tasks including cross-lingual, cross-domain caption matching and\nimage classification. Code available at github.com/mayug/0-shot-llm-vision.\n","authors":["Mayug Maniparambil","Raiymbek Akshulakov","Yasser Abdelaziz Dahou Djilali","Sanath Narayan","Mohamed El Amine Seddik","Karttikeya Mangalam","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2401.05224v2.pdf","comment":"Accepted CVPR 2024"},{"id":"http://arxiv.org/abs/2403.15560v1","updated":"2024-03-22T18:31:24Z","published":"2024-03-22T18:31:24Z","title":"A2DMN: Anatomy-Aware Dilated Multiscale Network for Breast Ultrasound\n Semantic Segmentation","summary":" In recent years, convolutional neural networks for semantic segmentation of\nbreast ultrasound (BUS) images have shown great success; however, two major\nchallenges still exist. 1) Most current approaches inherently lack the ability\nto utilize tissue anatomy, resulting in misclassified image regions. 2) They\nstruggle to produce accurate boundaries due to the repeated down-sampling\noperations. To address these issues, we propose a novel breast anatomy-aware\nnetwork for capturing fine image details and a new smoothness term that encodes\nbreast anatomy. It incorporates context information across multiple spatial\nscales to generate more accurate semantic boundaries. Extensive experiments are\nconducted to compare the proposed method and eight state-of-the-art approaches\nusing a BUS dataset with 325 images. 
The results demonstrate the proposed\nmethod significantly improves the segmentation of the muscle, mammary, and\ntumor classes and produces more accurate fine details of tissue boundaries.\n","authors":["Kyle Lucke","Aleksandar Vakanski","Min Xian"],"pdf_url":"https://arxiv.org/pdf/2403.15560v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15559v1","updated":"2024-03-22T18:28:04Z","published":"2024-03-22T18:28:04Z","title":"An Optimization Framework to Enforce Multi-View Consistency for\n Texturing 3D Meshes Using Pre-Trained Text-to-Image Models","summary":" A fundamental problem in the texturing of 3D meshes using pre-trained\ntext-to-image models is to ensure multi-view consistency. State-of-the-art\napproaches typically use diffusion models to aggregate multi-view inputs, where\ncommon issues are the blurriness caused by the averaging operation in the\naggregation step or inconsistencies in local features. This paper introduces an\noptimization framework that proceeds in four stages to achieve multi-view\nconsistency. Specifically, the first stage generates an over-complete set of 2D\ntextures from a predefined set of viewpoints using an MV-consistent diffusion\nprocess. The second stage selects a subset of views that are mutually\nconsistent while covering the underlying 3D model. We show how to achieve this\ngoal by solving semi-definite programs. The third stage performs non-rigid\nalignment to align the selected views across overlapping regions. The fourth\nstage solves an MRF problem to associate each mesh face with a selected view.\nIn particular, the third and fourth stages are iterated, with the cuts obtained\nin the fourth stage encouraging non-rigid alignment in the third stage to focus\non regions close to the cuts. Experimental results show that our approach\nsignificantly outperforms baseline approaches both qualitatively and\nquantitatively.\n","authors":["Zhengyi Zhao","Chen Song","Xiaodong Gu","Yuan Dong","Qi Zuo","Weihao Yuan","Zilong Dong","Liefeng Bo","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2403.15559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15551v1","updated":"2024-03-22T18:05:33Z","published":"2024-03-22T18:05:33Z","title":"Language-Based Depth Hints for Monocular Depth Estimation","summary":" Monocular depth estimation (MDE) is inherently ambiguous, as a given image\nmay result from many different 3D scenes and vice versa. To resolve this\nambiguity, an MDE system must make assumptions about the most likely 3D scenes\nfor a given input. These assumptions can be either explicit or implicit. In\nthis work, we demonstrate the use of natural language as a source of an\nexplicit prior about the structure of the world. The assumption is made that\nhuman language encodes the likely distribution in depth-space of various\nobjects. We first show that a language model encodes this implicit bias during\ntraining, and that it can be extracted using a very simple learned approach. We\nthen show that this prediction can be provided as an explicit source of\nassumption to an MDE system, using an off-the-shelf instance segmentation model\nthat provides the labels used as the input to the language model. We\ndemonstrate the performance of our method on the NYUD2 dataset, showing\nimprovement compared to the baseline and to random controls.\n","authors":["Dylan Auty","Krystian Mikolajczyk"],"pdf_url":"https://arxiv.org/pdf/2403.15551v1.pdf","comment":"8 pages, 1 figure. 
Work originally done in June 2022"},{"id":"http://arxiv.org/abs/2403.15530v1","updated":"2024-03-22T17:59:21Z","published":"2024-03-22T17:59:21Z","title":"Pixel-GS: Density Control with Pixel-aware Gradient for 3D Gaussian\n Splatting","summary":" 3D Gaussian Splatting (3DGS) has demonstrated impressive novel view synthesis\nresults while advancing real-time rendering performance. However, it relies\nheavily on the quality of the initial point cloud, resulting in blurring and\nneedle-like artifacts in areas with insufficient initializing points. This is\nmainly attributed to the point cloud growth condition in 3DGS that only\nconsiders the average gradient magnitude of points from observable views,\nthereby failing to grow for large Gaussians that are observable for many\nviewpoints while many of them are only covered in the boundaries. To this end,\nwe propose a novel method, named Pixel-GS, to take into account the number of\npixels covered by the Gaussian in each view during the computation of the\ngrowth condition. We regard the covered pixel numbers as the weights to\ndynamically average the gradients from different views, such that the growth of\nlarge Gaussians can be prompted. As a result, points within the areas with\ninsufficient initializing points can be grown more effectively, leading to a\nmore accurate and detailed reconstruction. In addition, we propose a simple yet\neffective strategy to scale the gradient field according to the distance to the\ncamera, to suppress the growth of floaters near the camera. Extensive\nexperiments both qualitatively and quantitatively demonstrate that our method\nachieves state-of-the-art rendering quality while maintaining real-time\nrendering speed, on the challenging Mip-NeRF 360 and Tanks & Temples datasets.\n","authors":["Zheng Zhang","Wenbo Hu","Yixing Lao","Tong He","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.15530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15528v1","updated":"2024-03-22T17:27:18Z","published":"2024-03-22T17:27:18Z","title":"Evaluating GPT-4 with Vision on Detection of Radiological Findings on\n Chest Radiographs","summary":" The study examines the application of GPT-4V, a multi-modal large language\nmodel equipped with visual recognition, in detecting radiological findings from\na set of 100 chest radiographs and suggests that GPT-4V is currently not ready\nfor real-world diagnostic usage in interpreting chest radiographs.\n","authors":["Yiliang Zhou","Hanley Ong","Patrick Kennedy","Carol Wu","Jacob Kazam","Keith Hentel","Adam Flanders","George Shih","Yifan Peng"],"pdf_url":"https://arxiv.org/pdf/2403.15528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.10744v2","updated":"2024-03-22T14:33:35Z","published":"2022-12-21T03:28:30Z","title":"An Audio-Visual Speech Separation Model Inspired by\n Cortico-Thalamo-Cortical Circuits","summary":" Audio-visual approaches involving visual inputs have laid the foundation for\nrecent progress in speech separation. However, the optimization of the\nconcurrent usage of auditory and visual inputs is still an active research\narea. Inspired by the cortico-thalamo-cortical circuit, in which the sensory\nprocessing mechanisms of different modalities modulate one another via the\nnon-lemniscal sensory thalamus, we propose a novel cortico-thalamo-cortical\nneural network (CTCNet) for audio-visual speech separation (AVSS). 
First, the\nCTCNet learns hierarchical auditory and visual representations in a bottom-up\nmanner in separate auditory and visual subnetworks, mimicking the functions of\nthe auditory and visual cortical areas. Then, inspired by the large number of\nconnections between cortical regions and the thalamus, the model fuses the\nauditory and visual information in a thalamic subnetwork through top-down\nconnections. Finally, the model transmits this fused information back to the\nauditory and visual subnetworks, and the above process is repeated several\ntimes. The results of experiments on three speech separation benchmark datasets\nshow that CTCNet remarkably outperforms existing AVSS methods with considerably\nfewer parameters. These results suggest that mimicking the anatomical\nconnectome of the mammalian brain has great potential for advancing the\ndevelopment of deep neural networks. Project repo is\nhttps://github.com/JusperLee/CTCNet.\n","authors":["Kai Li","Fenghua Xie","Hang Chen","Kexin Yuan","Xiaolin Hu"],"pdf_url":"https://arxiv.org/pdf/2212.10744v2.pdf","comment":"Accepted by TPAMI 2024"},{"id":"http://arxiv.org/abs/2403.15522v1","updated":"2024-03-22T13:24:44Z","published":"2024-03-22T13:24:44Z","title":"Medical Image Data Provenance for Medical Cyber-Physical System","summary":" Continuous advancements in medical technology have led to the creation of\naffordable mobile imaging devices suitable for telemedicine and remote\nmonitoring. However, the rapid examination of large populations poses\nchallenges, including the risk of fraudulent practices by healthcare\nprofessionals and social workers exchanging unverified images via mobile\napplications. To mitigate these risks, this study proposes using watermarking\ntechniques to embed a device fingerprint (DFP) into captured images, ensuring\ndata provenance. The DFP, representing the unique attributes of the capturing\ndevice and raw image, is embedded into raw images before storage, thus enabling\nverification of image authenticity and source. Moreover, a robust remote\nvalidation method is introduced to authenticate images, enhancing the integrity\nof medical image data in interconnected healthcare systems. Through a case\nstudy on mobile fundus imaging, the effectiveness of the proposed framework is\nevaluated in terms of computational efficiency, image quality, security, and\ntrustworthiness. 
This approach is suitable for a range of applications,\nincluding telemedicine, the Internet of Medical Things (IoMT), eHealth, and\nMedical Cyber-Physical Systems (MCPS) applications, providing a reliable means\nto maintain data provenance in diagnostic settings utilizing medical images or\nvideos.\n","authors":["Vijay Kumar","Kolin Paul"],"pdf_url":"https://arxiv.org/pdf/2403.15522v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15517v1","updated":"2024-03-22T11:14:30Z","published":"2024-03-22T11:14:30Z","title":"Improving Forward Compatibility in Class Incremental Learning by\n Increasing Representation Rank and Feature Richness","summary":" Class Incremental Learning (CIL) constitutes a pivotal subfield within\ncontinual learning, aimed at enabling models to progressively learn new\nclassification tasks while retaining knowledge obtained from prior tasks.\nAlthough previous studies have predominantly focused on backward compatible\napproaches to mitigate catastrophic forgetting, recent investigations have\nintroduced forward compatible methods to enhance performance on novel tasks and\ncomplement existing backward compatible methods. In this study, we introduce an\neffective-Rank based Feature Richness enhancement (RFR) method, designed for\nimproving forward compatibility. Specifically, this method increases the\neffective rank of representations during the base session, thereby facilitating\nthe incorporation of more informative features pertinent to unseen novel tasks.\nConsequently, RFR achieves dual objectives in backward and forward\ncompatibility: minimizing feature extractor modifications and enhancing novel\ntask performance, respectively. To validate the efficacy of our approach, we\nestablish a theoretical connection between effective rank and the Shannon\nentropy of representations. Subsequently, we conduct comprehensive experiments\nby integrating RFR into eleven well-known CIL methods. Our results demonstrate\nthe effectiveness of our approach in enhancing novel-task performance while\nmitigating catastrophic forgetting. Furthermore, our method notably improves\nthe average incremental accuracy across all eleven cases examined.\n","authors":["Jaeill Kim","Wonseok Lee","Moonjung Eo","Wonjong Rhee"],"pdf_url":"https://arxiv.org/pdf/2403.15517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15576v1","updated":"2024-03-22T19:04:02Z","published":"2024-03-22T19:04:02Z","title":"Data-centric Prediction Explanation via Kernelized Stein Discrepancy","summary":" Existing example-based prediction explanation methods often bridge test and\ntraining data points through the model's parameters or latent representations.\nWhile these methods offer clues to the causes of model predictions, they often\nexhibit innate shortcomings, such as incurring significant computational\noverhead or producing coarse-grained explanations. This paper presents a\nHighly-precise and Data-centric Explanation (HD-Explain), a straightforward\nprediction explanation method exploiting properties of Kernelized Stein\nDiscrepancy (KSD). Specifically, the KSD uniquely defines a parameterized\nkernel function for a trained model that encodes model-dependent data\ncorrelation. 
By leveraging the kernel function, one can identify training\nsamples that provide the best predictive support to a test point efficiently.\nWe conducted thorough analyses and experiments across multiple classification\ndomains, where we show that HD-Explain outperforms existing methods from\nvarious aspects, including 1) preciseness (fine-grained explanation), 2)\nconsistency, and 3) computation efficiency, leading to a surprisingly simple,\neffective, and robust prediction explanation solution.\n","authors":["Mahtab Sarvmaili","Hassan Sajjad","Ga Wu"],"pdf_url":"https://arxiv.org/pdf/2403.15576v1.pdf","comment":null}]},"2024-03-25T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.08651v2","updated":"2024-03-25T14:41:07Z","published":"2024-03-13T16:06:07Z","title":"HAIFIT: Human-Centered AI for Fashion Image Translation","summary":" In the realm of fashion design, sketches serve as the canvas for expressing\nan artist's distinctive drawing style and creative vision, capturing intricate\ndetails like stroke variations and texture nuances. The advent of\nsketch-to-image cross-modal translation technology has notably aided designers.\nHowever, existing methods often compromise these sketch details during image\ngeneration, resulting in images that deviate from the designer's intended\nconcept. This limitation hampers the ability to offer designers a precise\npreview of the final output. To overcome this challenge, we introduce HAIFIT, a\nnovel approach that transforms sketches into high-fidelity, lifelike clothing\nimages by integrating multi-scale features and capturing extensive feature map\ndependencies from diverse perspectives. Through extensive qualitative and\nquantitative evaluations conducted on our self-collected dataset, our method\ndemonstrates superior performance compared to existing methods in generating\nphotorealistic clothing images. Our method excels in preserving the distinctive\nstyle and intricate details essential for fashion design applications.\n","authors":["Jianan Jiang","Xinglin Li","Weiren Yu","Di Wu"],"pdf_url":"https://arxiv.org/pdf/2403.08651v2.pdf","comment":"8 pages,8 figures"},{"id":"http://arxiv.org/abs/2402.19463v2","updated":"2024-03-25T14:27:03Z","published":"2024-02-29T18:54:53Z","title":"SeMoLi: What Moves Together Belongs Together","summary":" We tackle semi-supervised object detection based on motion cues. Recent\nresults suggest that heuristic-based clustering methods in conjunction with\nobject trackers can be used to pseudo-label instances of moving objects and use\nthese as supervisory signals to train 3D object detectors in Lidar data without\nmanual supervision. We re-think this approach and suggest that both, object\ndetection, as well as motion-inspired pseudo-labeling, can be tackled in a\ndata-driven manner. We leverage recent advances in scene flow estimation to\nobtain point trajectories from which we extract long-term, class-agnostic\nmotion patterns. Revisiting correlation clustering in the context of message\npassing networks, we learn to group those motion patterns to cluster points to\nobject instances. By estimating the full extent of the objects, we obtain\nper-scan 3D bounding boxes that we use to supervise a Lidar object detection\nnetwork. 
Our method not only outperforms prior heuristic-based approaches (57.5\nAP, +14 improvement over prior work), more importantly, we show we can\npseudo-label and train object detectors across datasets.\n","authors":["Jenny Seidenschwarz","Aljoša Ošep","Francesco Ferroni","Simon Lucey","Laura Leal-Taixé"],"pdf_url":"https://arxiv.org/pdf/2402.19463v2.pdf","comment":"Accepted to CVPR 2024!"},{"id":"http://arxiv.org/abs/2403.16803v1","updated":"2024-03-25T14:21:49Z","published":"2024-03-25T14:21:49Z","title":"Exploiting Priors from 3D Diffusion Models for RGB-Based One-Shot View\n Planning","summary":" Object reconstruction is relevant for many autonomous robotic tasks that\nrequire interaction with the environment. A key challenge in such scenarios is\nplanning view configurations to collect informative measurements for\nreconstructing an initially unknown object. One-shot view planning enables\nefficient data collection by predicting view configurations and planning the\nglobally shortest path connecting all views at once. However, geometric priors\nabout the object are required to conduct one-shot view planning. In this work,\nwe propose a novel one-shot view planning approach that utilizes the powerful\n3D generation capabilities of diffusion models as priors. By incorporating such\ngeometric priors into our pipeline, we achieve effective one-shot view planning\nstarting with only a single RGB image of the object to be reconstructed. Our\nplanning experiments in simulation and real-world setups indicate that our\napproach balances well between object reconstruction quality and movement cost.\n","authors":["Sicong Pan","Liren Jin","Xuying Huang","Cyrill Stachniss","Marija Popović","Maren Bennewitz"],"pdf_url":"https://arxiv.org/pdf/2403.16803v1.pdf","comment":"Sicong Pan and Liren Jin have equal contribution. Submitted to IROS\n 2024"},{"id":"http://arxiv.org/abs/2403.16794v1","updated":"2024-03-25T14:13:09Z","published":"2024-03-25T14:13:09Z","title":"CurbNet: Curb Detection Framework Based on LiDAR Point Cloud\n Segmentation","summary":" Curb detection is an important function in intelligent driving and can be\nused to determine drivable areas of the road. However, curbs are difficult to\ndetect due to the complex road environment. This paper introduces CurbNet, a\nnovel framework for curb detection, leveraging point cloud segmentation.\nAddressing the dearth of comprehensive curb datasets and the absence of 3D\nannotations, we have developed the 3D-Curb dataset, encompassing 7,100 frames,\nwhich represents the largest and most categorically diverse collection of curb\npoint clouds currently available. Recognizing that curbs are primarily\ncharacterized by height variations, our approach harnesses spatially-rich 3D\npoint clouds for training. To tackle the challenges presented by the uneven\ndistribution of curb features on the xy-plane and their reliance on z-axis\nhigh-frequency features, we introduce the multi-scale and channel attention\n(MSCA) module, a bespoke solution designed to optimize detection performance.\nMoreover, we propose an adaptive weighted loss function group, specifically\nformulated to counteract the imbalance in the distribution of curb point clouds\nrelative to other categories. Our extensive experimentation on 2 major datasets\nhas yielded results that surpass existing benchmarks set by leading curb\ndetection and point cloud segmentation models. 
By integrating multi-clustering\nand curve fitting techniques in our post-processing stage, we have\nsubstantially reduced noise in curb detection, thereby enhancing precision to\n0.8744. Notably, CurbNet has achieved an exceptional average metrics of over\n0.95 at a tolerance of just 0.15m, thereby establishing a new benchmark.\nFurthermore, corroborative real-world experiments and dataset analyzes mutually\nvalidate each other, solidifying CurbNet's superior detection proficiency and\nits robust generalizability.\n","authors":["Guoyang Zhao","Fulong Ma","Yuxuan Liu","Weiqing Qi","Ming Liu"],"pdf_url":"https://arxiv.org/pdf/2403.16794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16788v1","updated":"2024-03-25T14:02:33Z","published":"2024-03-25T14:02:33Z","title":"HPL-ESS: Hybrid Pseudo-Labeling for Unsupervised Event-based Semantic\n Segmentation","summary":" Event-based semantic segmentation has gained popularity due to its capability\nto deal with scenarios under high-speed motion and extreme lighting conditions,\nwhich cannot be addressed by conventional RGB cameras. Since it is hard to\nannotate event data, previous approaches rely on event-to-image reconstruction\nto obtain pseudo labels for training. However, this will inevitably introduce\nnoise, and learning from noisy pseudo labels, especially when generated from a\nsingle source, may reinforce the errors. This drawback is also called\nconfirmation bias in pseudo-labeling. In this paper, we propose a novel hybrid\npseudo-labeling framework for unsupervised event-based semantic segmentation,\nHPL-ESS, to alleviate the influence of noisy pseudo labels. In particular, we\nfirst employ a plain unsupervised domain adaptation framework as our baseline,\nwhich can generate a set of pseudo labels through self-training. Then, we\nincorporate offline event-to-image reconstruction into the framework, and\nobtain another set of pseudo labels by predicting segmentation maps on the\nreconstructed images. A noisy label learning strategy is designed to mix the\ntwo sets of pseudo labels and enhance the quality. Moreover, we propose a soft\nprototypical alignment module to further improve the consistency of target\ndomain features. Extensive experiments show that our proposed method\noutperforms existing state-of-the-art methods by a large margin on the\nDSEC-Semantic dataset (+5.88% accuracy, +10.32% mIoU), which even surpasses\nseveral supervised methods.\n","authors":["Linglin Jing","Yiming Ding","Yunpeng Gao","Zhigang Wang","Xu Yan","Dong Wang","Gerald Schaefer","Hui Fang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2403.16788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16782v1","updated":"2024-03-25T13:57:45Z","published":"2024-03-25T13:57:45Z","title":"The Anatomy of Adversarial Attacks: Concept-based XAI Dissection","summary":" Adversarial attacks (AAs) pose a significant threat to the reliability and\nrobustness of deep neural networks. While the impact of these attacks on model\npredictions has been extensively studied, their effect on the learned\nrepresentations and concepts within these models remains largely unexplored. In\nthis work, we perform an in-depth analysis of the influence of AAs on the\nconcepts learned by convolutional neural networks (CNNs) using eXplainable\nartificial intelligence (XAI) techniques. Through an extensive set of\nexperiments across various network architectures and targeted AA techniques, we\nunveil several key findings. 
First, AAs induce substantial alterations in the\nconcept composition within the feature space, introducing new concepts or\nmodifying existing ones. Second, the adversarial perturbation itself can be\nlinearly decomposed into a set of latent vector components, with a subset of\nthese being responsible for the attack's success. Notably, we discover that\nthese components are target-specific, i.e., are similar for a given target\nclass throughout different AA techniques and starting classes. Our findings\nprovide valuable insights into the nature of AAs and their impact on learned\nrepresentations, paving the way for the development of more robust and\ninterpretable deep learning models, as well as effective defenses against\nadversarial threats.\n","authors":["Georgii Mikriukov","Gesina Schwalbe","Franz Motzkus","Korinna Bade"],"pdf_url":"https://arxiv.org/pdf/2403.16782v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16776v1","updated":"2024-03-25T13:52:48Z","published":"2024-03-25T13:52:48Z","title":"Diff-Def: Diffusion-Generated Deformation Fields for Conditional Atlases","summary":" Anatomical atlases are widely used for population analysis. Conditional\natlases target a particular sub-population defined via certain conditions (e.g.\ndemographics or pathologies) and allow for the investigation of fine-grained\nanatomical differences - such as morphological changes correlated with age.\nExisting approaches use either registration-based methods that are unable to\nhandle large anatomical variations or generative models, which can suffer from\ntraining instabilities and hallucinations. To overcome these limitations, we\nuse latent diffusion models to generate deformation fields, which transform a\ngeneral population atlas into one representing a specific sub-population. By\ngenerating a deformation field and registering the conditional atlas to a\nneighbourhood of images, we ensure structural plausibility and avoid\nhallucinations, which can occur during direct image synthesis. We compare our\nmethod to several state-of-the-art atlas generation methods in experiments\nusing 5000 brain as well as whole-body MR images from UK Biobank. Our method\ngenerates highly realistic atlases with smooth transformations and high\nanatomical fidelity, outperforming the baselines.\n","authors":["Sophie Starck","Vasiliki Sideri-Lampretsa","Bernhard Kainz","Martin Menten","Tamara Mueller","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2403.16776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14897v2","updated":"2024-03-25T13:46:03Z","published":"2024-03-22T01:02:09Z","title":"Geometric Generative Models based on Morphological Equivariant PDEs and\n GANs","summary":" Content and image generation consist in creating or generating data from\nnoisy information by extracting specific features such as texture, edges, and\nother thin image structures. We are interested here in generative models, and\ntwo main problems are addressed. Firstly, the improvements of specific feature\nextraction while accounting at multiscale levels intrinsic geometric features;\nand secondly, the equivariance of the network to reduce its complexity and\nprovide a geometric interpretability. To proceed, we propose a geometric\ngenerative model based on an equivariant partial differential equation (PDE)\nfor group convolution neural networks (G-CNNs), so called PDE-G-CNNs, built on\nmorphology operators and generative adversarial networks (GANs). 
Equivariant\nmorphological PDE layers are composed of multiscale dilations and erosions\nformulated in Riemannian manifolds, while group symmetries are defined on a Lie\ngroup. We take advantage of the Lie group structure to properly integrate the\nequivariance in layers, and are able to use the Riemannian metric to solve the\nmultiscale morphological operations. Each point of the Lie group is associated\nwith a unique point in the manifold, which helps us derive a metric on the\nRiemannian manifold from a tensor field invariant under the Lie group so that\nthe induced metric has the same symmetries. The proposed geometric\nmorphological GAN (GM-GAN) is obtained by using the proposed morphological\nequivariant convolutions in PDE-G-CNNs to bring nonlinearity in classical CNNs.\nGM-GAN is evaluated on MNIST data and compared with GANs. Preliminary results\nshow that GM-GAN model outperforms classical GAN.\n","authors":["El Hadji S. Diop","Thierno Fall","Alioune Mbengue","Mohamed Daoudi"],"pdf_url":"https://arxiv.org/pdf/2403.14897v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16700v2","updated":"2024-03-25T13:33:51Z","published":"2024-01-30T03:00:25Z","title":"Towards Precise 3D Human Pose Estimation with Multi-Perspective\n Spatial-Temporal Relational Transformers","summary":" 3D human pose estimation captures the human joint points in three-dimensional\nspace while keeping the depth information and physical structure. That is\nessential for applications that require precise pose information, such as\nhuman-computer interaction, scene understanding, and rehabilitation training.\nDue to the challenges in data collection, mainstream datasets of 3D human pose\nestimation are primarily composed of multi-view video data collected in\nlaboratory environments, which contains rich spatial-temporal correlation\ninformation besides the image frame content. Given the remarkable\nself-attention mechanism of transformers, capable of capturing the\nspatial-temporal correlation from multi-view video datasets, we propose a\nmulti-stage framework for 3D sequence-to-sequence (seq2seq) human pose\ndetection. Firstly, the spatial module represents the human pose feature by\nintra-image content, while the frame-image relation module extracts temporal\nrelationships and 3D spatial positional relationship features between the\nmulti-perspective images. Secondly, the self-attention mechanism is adopted to\neliminate the interference from non-human body parts and reduce computing\nresources. Our method is evaluated on Human3.6M, a popular 3D human pose\ndetection dataset. Experimental results demonstrate that our approach achieves\nstate-of-the-art performance on this dataset. The source code will be available\nat https://github.com/WUJINHUAN/3D-human-pose.\n","authors":["Jianbin Jiao","Xina Cheng","Weijie Chen","Xiaoting Yin","Hao Shi","Kailun Yang"],"pdf_url":"https://arxiv.org/pdf/2401.16700v2.pdf","comment":"Accepted to IJCNN 2024. The source code will be available at\n https://github.com/WUJINHUAN/3D-human-pose"},{"id":"http://arxiv.org/abs/2402.04599v2","updated":"2024-03-25T13:30:37Z","published":"2024-02-07T05:47:31Z","title":"Meet JEANIE: a Similarity Measure for 3D Skeleton Sequences via\n Temporal-Viewpoint Alignment","summary":" Video sequences exhibit significant nuisance variations (undesired effects)\nof speed of actions, temporal locations, and subjects' poses, leading to\ntemporal-viewpoint misalignment when comparing two sets of frames or evaluating\nthe similarity of two sequences. 
Thus, we propose Joint tEmporal and cAmera\nviewpoiNt alIgnmEnt (JEANIE) for sequence pairs. In particular, we focus on 3D\nskeleton sequences whose camera and subjects' poses can be easily manipulated\nin 3D. We evaluate JEANIE on skeletal Few-shot Action Recognition (FSAR), where\nmatching well temporal blocks (temporal chunks that make up a sequence) of\nsupport-query sequence pairs (by factoring out nuisance variations) is\nessential due to limited samples of novel classes. Given a query sequence, we\ncreate its several views by simulating several camera locations. For a support\nsequence, we match it with view-simulated query sequences, as in the popular\nDynamic Time Warping (DTW). Specifically, each support temporal block can be\nmatched to the query temporal block with the same or adjacent (next) temporal\nindex, and adjacent camera views to achieve joint local temporal-viewpoint\nwarping. JEANIE selects the smallest distance among matching paths with\ndifferent temporal-viewpoint warping patterns, an advantage over DTW which only\nperforms temporal alignment. We also propose an unsupervised FSAR akin to\nclustering of sequences with JEANIE as a distance measure. JEANIE achieves\nstate-of-the-art results on NTU-60, NTU-120, Kinetics-skeleton and UWA3D\nMultiview Activity II on supervised and unsupervised FSAR, and their\nmeta-learning inspired fusion.\n","authors":["Lei Wang","Jun Liu","Liang Zheng","Tom Gedeon","Piotr Koniusz"],"pdf_url":"https://arxiv.org/pdf/2402.04599v2.pdf","comment":"Accepted by the International Journal of Computer Vision (IJCV). An\n extension of our ACCV'22 paper [arXiv:arXiv:2210.16820] which was\n distinguished by the Sang Uk Lee Best Student Paper Award"},{"id":"http://arxiv.org/abs/2403.06764v2","updated":"2024-03-25T13:29:30Z","published":"2024-03-11T14:35:32Z","title":"An Image is Worth 1/2 Tokens After Layer 2: Plug-and-Play Inference\n Acceleration for Large Vision-Language Models","summary":" In this study, we identify the inefficient attention phenomena in Large\nVision-Language Models (LVLMs), notably within prominent models like LLaVA-1.5,\nQwenVL-Chat and Video-LLaVA. We find out that the attention computation over\nvisual tokens is of extreme inefficiency in the deep layers of popular LVLMs,\nsuggesting a need for a sparser approach compared to textual data handling. To\nthis end, we introduce FastV, a versatile plug-and-play method designed to\noptimize computational efficiency by learning adaptive attention patterns in\nearly layers and pruning visual tokens in subsequent ones. Our evaluations\ndemonstrate FastV's ability to dramatically reduce computational costs (e.g., a\n45% reduction in FLOPs for LLaVA-1.5-13B) without sacrificing performance in a\nwide range of image and video understanding tasks. The computational efficiency\nand performance trade-off of FastV are highly customizable and\npareto-efficient. It can compress the FLOPs of a 13B-parameter model to achieve\na lower budget than that of a 7B-parameter model, while still maintaining\nsuperior performance. We believe FastV has practical values for deployment of\nLVLMs in edge devices and commercial models. 
Code is released at\nhttps://github.com/pkunlp-icler/FastV.\n","authors":["Liang Chen","Haozhe Zhao","Tianyu Liu","Shuai Bai","Junyang Lin","Chang Zhou","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2403.06764v2.pdf","comment":"21 papes, 8 figures, code is released at\n https://github.com/pkunlp-icler/FastV"},{"id":"http://arxiv.org/abs/2402.15648v2","updated":"2024-03-25T13:27:26Z","published":"2024-02-23T23:15:54Z","title":"MambaIR: A Simple Baseline for Image Restoration with State-Space Model","summary":" Recent years have seen significant advancements in image restoration, largely\nattributed to the development of modern deep neural networks, such as CNNs and\nTransformers. However, existing restoration backbones often face the dilemma\nbetween global receptive fields and efficient computation, hindering their\napplication in practice. Recently, the Selective Structured State Space Model,\nespecially the improved version Mamba, has shown great potential for long-range\ndependency modeling with linear complexity, which offers a way to resolve the\nabove dilemma. However, the standard Mamba still faces certain challenges in\nlow-level vision such as local pixel forgetting and channel redundancy. In this\nwork, we introduce a simple but effective baseline, named MambaIR, which\nintroduces both local enhancement and channel attention to improve the vanilla\nMamba. In this way, our MambaIR takes advantage of the local pixel similarity\nand reduces the channel redundancy. Extensive experiments demonstrate the\nsuperiority of our method, for example, MambaIR outperforms SwinIR by up to\n0.45dB on image SR, using similar computational cost but with a global\nreceptive field. Code is available at \\url{https://github.com/csguoh/MambaIR}.\n","authors":["Hang Guo","Jinmin Li","Tao Dai","Zhihao Ouyang","Xudong Ren","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2402.15648v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2312.11897v2","updated":"2024-03-25T13:15:22Z","published":"2023-12-19T06:42:47Z","title":"Text-Conditioned Resampler For Long Form Video Understanding","summary":" In this paper we present a text-conditioned video resampler (TCR) module that\nuses a pre-trained and frozen visual encoder and large language model (LLM) to\nprocess long video sequences for a task. TCR localises relevant visual features\nfrom the video given a text condition and provides them to a LLM to generate a\ntext response. Due to its lightweight design and use of cross-attention, TCR\ncan process more than 100 frames at a time with plain attention and without\noptimised implementations. 
We make the following contributions: (i) we design a\ntransformer-based sampling architecture that can process long videos\nconditioned on a task, together with a training method that enables it to\nbridge pre-trained visual and language models; (ii) we identify tasks that\ncould benefit from longer video perception; and (iii) we empirically validate\nits efficacy on a wide variety of evaluation tasks including NextQA, EgoSchema,\nand the EGO4D-LTA challenge.\n","authors":["Bruno Korbar","Yongqin Xian","Alessio Tonioni","Andrew Zisserman","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2312.11897v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16736v1","updated":"2024-03-25T13:09:40Z","published":"2024-03-25T13:09:40Z","title":"Creating a Digital Twin of Spinal Surgery: A Proof of Concept","summary":" Surgery digitalization is the process of creating a virtual replica of\nreal-world surgery, also referred to as a surgical digital twin (SDT). It has\nsignificant applications in various fields such as education and training,\nsurgical planning, and automation of surgical tasks. Given their detailed\nrepresentations of surgical procedures, SDTs are an ideal foundation for\nmachine learning methods, enabling automatic generation of training data. In\nrobotic surgery, SDTs can provide realistic virtual environments in which\nrobots may learn through trial and error. In this paper, we present a proof of\nconcept (PoC) for surgery digitalization that is applied to an ex-vivo spinal\nsurgery performed in realistic conditions. The proposed digitalization focuses\non the acquisition and modelling of the geometry and appearance of the entire\nsurgical scene. We employ five RGB-D cameras for dynamic 3D reconstruction of\nthe surgeon, a high-end camera for 3D reconstruction of the anatomy, an\ninfrared stereo camera for surgical instrument tracking, and a laser scanner\nfor 3D reconstruction of the operating room and data fusion. We justify the\nproposed methodology, discuss the challenges faced and further extensions of\nour prototype. While our PoC partially relies on manual data curation, its high\nquality and great potential motivate the development of automated methods for\nthe creation of SDTs. The quality of our SDT can be assessed in a rendered\nvideo available at https://youtu.be/LqVaWGgaTMY .\n","authors":["Jonas Hein","Frederic Giraud","Lilian Calvet","Alexander Schwarz","Nicola Alessandro Cavalcanti","Sergey Prokudin","Mazda Farshad","Siyu Tang","Marc Pollefeys","Fabio Carrillo","Philipp Fürnstahl"],"pdf_url":"https://arxiv.org/pdf/2403.16736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00374v3","updated":"2024-03-25T13:01:27Z","published":"2023-12-31T02:25:41Z","title":"EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via\n Expressive Masked Audio Gesture Modeling","summary":" We propose EMAGE, a framework to generate full-body human gestures from audio\nand masked gestures, encompassing facial, local body, hands, and global\nmovements. To achieve this, we first introduce BEAT2 (BEAT-SMPLX-FLAME), a new\nmesh-level holistic co-speech dataset. BEAT2 combines MoShed SMPLX body with\nFLAME head parameters and further refines the modeling of head, neck, and\nfinger movements, offering a community-standardized, high-quality 3D motion\ncaptured dataset. EMAGE leverages masked body gesture priors during training to\nboost inference performance. 
It involves a Masked Audio Gesture Transformer,\nfacilitating joint training on audio-to-gesture generation and masked gesture\nreconstruction to effectively encode audio and body gesture hints. Encoded body\nhints from masked gestures are then separately employed to generate facial and\nbody movements. Moreover, EMAGE adaptively merges speech features from the\naudio's rhythm and content and utilizes four compositional VQ-VAEs to enhance\nthe results' fidelity and diversity. Experiments demonstrate that EMAGE\ngenerates holistic gestures with state-of-the-art performance and is flexible\nin accepting predefined spatial-temporal gesture inputs, generating complete,\naudio-synchronized results. Our code and dataset are available at\nhttps://pantomatrix.github.io/EMAGE/\n","authors":["Haiyang Liu","Zihao Zhu","Giorgio Becherini","Yichen Peng","Mingyang Su","You Zhou","Xuefei Zhe","Naoya Iwamoto","Bo Zheng","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2401.00374v3.pdf","comment":"CVPR Camera Ready; Project Page: https://pantomatrix.github.io/EMAGE/"},{"id":"http://arxiv.org/abs/2402.07310v2","updated":"2024-03-25T12:58:45Z","published":"2024-02-11T21:16:42Z","title":"BioNeRF: Biologically Plausible Neural Radiance Fields for View\n Synthesis","summary":" This paper presents BioNeRF, a biologically plausible architecture that\nmodels scenes in a 3D representation and synthesizes new views through radiance\nfields. Since NeRF relies on the network weights to store the scene's\n3-dimensional representation, BioNeRF implements a cognitive-inspired mechanism\nthat fuses inputs from multiple sources into a memory-like structure, improving\nthe storing capacity and extracting more intrinsic and correlated information.\nBioNeRF also mimics a behavior observed in pyramidal cells concerning\ncontextual information, in which the memory is provided as the context and\ncombined with the inputs of two subsequent neural models, one responsible for\nproducing the volumetric densities and the other the colors used to render the\nscene. Experimental results show that BioNeRF outperforms state-of-the-art\nresults concerning a quality measure that encodes human perception in two\ndatasets: real-world images and synthetic data.\n","authors":["Leandro A. Passos","Douglas Rodrigues","Danilo Jodas","Kelton A. P. Costa","Ahsan Adeel","João Paulo Papa"],"pdf_url":"https://arxiv.org/pdf/2402.07310v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02969v2","updated":"2024-03-25T12:45:03Z","published":"2024-03-05T13:45:46Z","title":"Multi-modal Instruction Tuned LLMs with Fine-grained Visual Perception","summary":" Multimodal Large Language Model (MLLMs) leverages Large Language Models as a\ncognitive framework for diverse visual-language tasks. Recent efforts have been\nmade to equip MLLMs with visual perceiving and grounding capabilities. However,\nthere still remains a gap in providing fine-grained pixel-level perceptions and\nextending interactions beyond text-specific inputs. In this work, we propose\n{\\bf{AnyRef}}, a general MLLM model that can generate pixel-wise object\nperceptions and natural language descriptions from multi-modality references,\nsuch as texts, boxes, images, or audio. This innovation empowers users with\ngreater flexibility to engage with the model beyond textual and regional\nprompts, without modality-specific designs. 
Through our proposed refocusing\nmechanism, the generated grounding output is guided to better focus on the\nreferenced object, implicitly incorporating additional pixel-level supervision.\nThis simple modification utilizes attention scores generated during the\ninference of LLM, eliminating the need for extra computations while exhibiting\nperformance enhancements in both grounding masks and referring expressions.\nWith only publicly available training data, our model achieves state-of-the-art\nresults across multiple benchmarks, including diverse modality referring\nsegmentation and region-level referring expression generation.\n","authors":["Junwen He","Yifan Wang","Lijun Wang","Huchuan Lu","Jun-Yan He","Jin-Peng Lan","Bin Luo","Xuansong Xie"],"pdf_url":"https://arxiv.org/pdf/2403.02969v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16697v1","updated":"2024-03-25T12:31:01Z","published":"2024-03-25T12:31:01Z","title":"DPStyler: Dynamic PromptStyler for Source-Free Domain Generalization","summary":" Source-Free Domain Generalization (SFDG) aims to develop a model that works\nfor unseen target domains without relying on any source domain. Recent work,\nPromptStyler, employs text prompts to simulate different distribution shifts in\nthe joint vision-language space, allowing the model to generalize effectively\nto unseen domains without using any images. However, 1) PromptStyler's style\ngeneration strategy has limitations, as all style patterns are fixed after the\nfirst training phase. This leads to the training set in the second training\nphase being restricted to a limited set of styles. Additionally, 2) the frozen\ntext encoder in PromptStyler result in the encoder's output varying with the\nstyle of the input text prompts, making it difficult for the model to learn\ndomain-invariant features. In this paper, we introduce Dynamic PromptStyler\n(DPStyler), comprising Style Generation and Style Removal modules to address\nthese issues. The Style Generation module refreshes all styles at every\ntraining epoch, while the Style Removal module eliminates variations in the\nencoder's output features caused by input styles. Moreover, since the Style\nGeneration module, responsible for generating style word vectors using random\nsampling or style mixing, makes the model sensitive to input text prompts, we\nintroduce a model ensemble method to mitigate this sensitivity. Extensive\nexperiments demonstrate that our framework outperforms state-of-the-art methods\non benchmark datasets.\n","authors":["Yunlong Tang","Yuxuan Wan","Lei Qi","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2403.16697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16695v1","updated":"2024-03-25T12:26:32Z","published":"2024-03-25T12:26:32Z","title":"Assessing the Performance of Deep Learning for Automated Gleason Grading\n in Prostate Cancer","summary":" Prostate cancer is a dominant health concern calling for advanced diagnostic\ntools. Utilizing digital pathology and artificial intelligence, this study\nexplores the potential of 11 deep neural network architectures for automated\nGleason grading in prostate carcinoma focusing on comparing traditional and\nrecent architectures. A standardized image classification pipeline, based on\nthe AUCMEDI framework, facilitated robust evaluation using an in-house dataset\nconsisting of 34,264 annotated tissue tiles. The results indicated varying\nsensitivity across architectures, with ConvNeXt demonstrating the strongest\nperformance. 
Notably, newer architectures achieved superior performance, even\nthough with challenges in differentiating closely related Gleason grades. The\nConvNeXt model was capable of learning a balance between complexity and\ngeneralizability. Overall, this study lays the groundwork for enhanced Gleason\ngrading systems, potentially improving diagnostic efficiency for prostate\ncancer.\n","authors":["Dominik Müller","Philip Meyer","Lukas Rentschler","Robin Manz","Daniel Hieber","Jonas Bäcker","Samantha Cramer","Christoph Wengenmayr","Bruno Märkl","Ralf Huss","Frank Kramer","Iñaki Soto-Rey","Johannes Raffler"],"pdf_url":"https://arxiv.org/pdf/2403.16695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16689v1","updated":"2024-03-25T12:23:39Z","published":"2024-03-25T12:23:39Z","title":"Synapse: Learning Preferential Concepts from Visual Demonstrations","summary":" This paper addresses the problem of preference learning, which aims to learn\nuser-specific preferences (e.g., \"good parking spot\", \"convenient drop-off\nlocation\") from visual input. Despite its similarity to learning factual\nconcepts (e.g., \"red cube\"), preference learning is a fundamentally harder\nproblem due to its subjective nature and the paucity of person-specific\ntraining data. We address this problem using a new framework called Synapse,\nwhich is a neuro-symbolic approach designed to efficiently learn preferential\nconcepts from limited demonstrations. Synapse represents preferences as\nneuro-symbolic programs in a domain-specific language (DSL) that operates over\nimages, and leverages a novel combination of visual parsing, large language\nmodels, and program synthesis to learn programs representing individual\npreferences. We evaluate Synapse through extensive experimentation including a\nuser case study focusing on mobility-related concepts in mobile robotics and\nautonomous driving. Our evaluation demonstrates that Synapse significantly\noutperforms existing baselines as well as its own ablations. The code and other\ndetails can be found on the project website https://amrl.cs.utexas.edu/synapse .\n","authors":["Sadanand Modak","Noah Patton","Isil Dillig","Joydeep Biswas"],"pdf_url":"https://arxiv.org/pdf/2403.16689v1.pdf","comment":"23 pages, 7 figures; Preprint"},{"id":"http://arxiv.org/abs/2403.16678v1","updated":"2024-03-25T12:15:42Z","published":"2024-03-25T12:15:42Z","title":"DeepGleason: a System for Automated Gleason Grading of Prostate Cancer\n using Deep Neural Networks","summary":" Advances in digital pathology and artificial intelligence (AI) offer\npromising opportunities for clinical decision support and enhancing diagnostic\nworkflows. Previous studies already demonstrated AI's potential for automated\nGleason grading, but lack state-of-the-art methodology and model reusability.\nTo address this issue, we propose DeepGleason: an open-source deep neural\nnetwork based image classification system for automated Gleason grading using\nwhole-slide histopathology images from prostate tissue sections. Implemented\nwith the standardized AUCMEDI framework, our tool employs a tile-wise\nclassification approach utilizing fine-tuned image preprocessing techniques in\ncombination with a ConvNeXt architecture which was compared to various\nstate-of-the-art architectures. The neural network model was trained and\nvalidated on an in-house dataset of 34,264 annotated tiles from 369 prostate\ncarcinoma slides. 
We demonstrated that DeepGleason is capable of highly\naccurate and reliable Gleason grading with a macro-averaged F1-score of 0.806,\nAUC of 0.991, and Accuracy of 0.974. The internal architecture comparison\nrevealed that the ConvNeXt model was superior performance-wise on our dataset\nto established and other modern architectures like transformers. Furthermore,\nwe were able to outperform the current state-of-the-art in tile-wise\nfine-classification with a sensitivity and specificity of 0.94 and 0.98 for\nbenign vs malignant detection as well as of 0.91 and 0.75 for Gleason 3 vs\nGleason 4 & 5 classification, respectively. Our tool contributes to the wider\nadoption of AI-based Gleason grading within the research community and paves\nthe way for broader clinical application of deep learning models in digital\npathology. DeepGleason is open-source and publicly available for research\napplication in the following Git repository:\nhttps://github.com/frankkramer-lab/DeepGleason.\n","authors":["Dominik Müller","Philip Meyer","Lukas Rentschler","Robin Manz","Jonas Bäcker","Samantha Cramer","Christoph Wengenmayr","Bruno Märkl","Ralf Huss","Iñaki Soto-Rey","Johannes Raffler"],"pdf_url":"https://arxiv.org/pdf/2403.16678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16677v1","updated":"2024-03-25T12:14:48Z","published":"2024-03-25T12:14:48Z","title":"FOOL: Addressing the Downlink Bottleneck in Satellite Computing with\n Neural Feature Compression","summary":" Nanosatellite constellations equipped with sensors capturing large geographic\nregions provide unprecedented opportunities for Earth observation. As\nconstellation sizes increase, network contention poses a downlink bottleneck.\nOrbital Edge Computing (OEC) leverages limited onboard compute resources to\nreduce transfer costs by processing the raw captures at the source. However,\ncurrent solutions have limited practicability due to reliance on crude\nfiltering methods or over-prioritizing particular downstream tasks.\n This work presents FOOL, an OEC-native and task-agnostic feature compression\nmethod that preserves prediction performance. FOOL partitions high-resolution\nsatellite imagery to maximize throughput. Further, it embeds context and\nleverages inter-tile dependencies to lower transfer costs with negligible\noverhead. While FOOL is a feature compressor, it can recover images with\ncompetitive scores on perceptual quality measures at lower bitrates. We\nextensively evaluate transfer cost reduction by including the peculiarity of\nintermittently available network connections in low earth orbit. 
Lastly, we\ntest the feasibility of our system for standardized nanosatellite form factors.\nWe demonstrate that FOOL permits downlinking over 100x the data volume without\nrelying on prior information on the downstream tasks.\n","authors":["Alireza Furutanpey","Qiyang Zhang","Philipp Raith","Tobias Pfandzelter","Shangguang Wang","Schahram Dustdar"],"pdf_url":"https://arxiv.org/pdf/2403.16677v1.pdf","comment":"18 pages, double column, 19 figures, 7 tables, Initial Submission to\n IEEE Transactions on Mobile Computing"},{"id":"http://arxiv.org/abs/2403.16669v1","updated":"2024-03-25T12:07:24Z","published":"2024-03-25T12:07:24Z","title":"Domain Adaptive Detection of MAVs: A Benchmark and Noise Suppression\n Network","summary":" Visual detection of Micro Air Vehicles (MAVs) has attracted increasing\nattention in recent years due to its important application in various tasks.\nThe existing methods for MAV detection assume that the training set and testing\nset have the same distribution. As a result, when deployed in new domains, the\ndetectors would have a significant performance degradation due to domain\ndiscrepancy. In this paper, we study the problem of cross-domain MAV detection.\nThe contributions of this paper are threefold. 1) We propose a\nMulti-MAV-Multi-Domain (M3D) dataset consisting of both simulation and\nrealistic images. Compared to other existing datasets, the proposed one is more\ncomprehensive in the sense that it covers rich scenes, diverse MAV types, and\nvarious viewing angles. A new benchmark for cross-domain MAV detection is\nproposed based on the proposed dataset. 2) We propose a Noise Suppression\nNetwork (NSN) based on the framework of pseudo-labeling and a large-to-small\ntraining procedure. To reduce the challenging pseudo-label noises, two novel\nmodules are designed in this network. The first is a prior-based curriculum\nlearning module for allocating adaptive thresholds for pseudo labels with\ndifferent difficulties. The second is a masked copy-paste augmentation module\nfor pasting truly-labeled MAVs on unlabeled target images and thus decreasing\npseudo-label noises. 3) Extensive experimental results verify the superior\nperformance of the proposed method compared to the state-of-the-art ones. In\nparticular, it achieves mAP of 46.9%(+5.8%), 50.5%(+3.7%), and 61.5%(+11.3%) on\nthe tasks of simulation-to-real adaptation, cross-scene adaptation, and\ncross-camera adaptation, respectively.\n","authors":["Yin Zhang","Jinhong Deng","Peidong Liu","Wen Li","Shiyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.16669v1.pdf","comment":"17 pages, 11 figures. Accepted by IEEE Transactions on Automation\n Science and Engineering"},{"id":"http://arxiv.org/abs/2308.10299v3","updated":"2024-03-25T12:04:41Z","published":"2023-08-20T15:38:40Z","title":"Boosting Adversarial Transferability by Block Shuffle and Rotation","summary":" Adversarial examples mislead deep neural networks with imperceptible\nperturbations and have brought significant threats to deep learning. An\nimportant aspect is their transferability, which refers to their ability to\ndeceive other models, thus enabling attacks in the black-box setting. Though\nvarious methods have been proposed to boost transferability, the performance\nstill falls short compared with white-box attacks. In this work, we observe\nthat existing input transformation based attacks, one of the mainstream\ntransfer-based attacks, result in different attention heatmaps on various\nmodels, which might limit the transferability. 
We also find that breaking the\nintrinsic relation of the image can disrupt the attention heatmap of the\noriginal image. Based on this finding, we propose a novel input transformation\nbased attack called block shuffle and rotation (BSR). Specifically, BSR splits\nthe input image into several blocks, then randomly shuffles and rotates these\nblocks to construct a set of new images for gradient calculation. Empirical\nevaluations on the ImageNet dataset demonstrate that BSR could achieve\nsignificantly better transferability than the existing input transformation\nbased methods under single-model and ensemble-model settings. Combining BSR\nwith the current input transformation method can further improve the\ntransferability, which significantly outperforms the state-of-the-art methods.\nCode is available at https://github.com/Trustworthy-AI-Group/BSR\n","authors":["Kunyu Wang","Xuanran He","Wenxuan Wang","Xiaosen Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10299v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2311.16515v2","updated":"2024-03-25T12:01:59Z","published":"2023-11-25T14:24:49Z","title":"Word4Per: Zero-shot Composed Person Retrieval","summary":" Searching for specific person has great social benefits and security value,\nand it often involves a combination of visual and textual information.\nConventional person retrieval methods, whether image-based or text-based,\nusually fall short in effectively harnessing both types of information, leading\nto the loss of accuracy. In this paper, a whole new task called Composed Person\nRetrieval (CPR) is proposed to jointly utilize both image and text information\nfor target person retrieval. However, the supervised CPR requires very costly\nmanual annotation dataset, while there are currently no available resources. To\nmitigate this issue, we firstly introduce the Zero-shot Composed Person\nRetrieval (ZS-CPR), which leverages existing domain-related data to resolve the\nCPR problem without expensive annotations. Secondly, to learn ZS-CPR model, we\npropose a two-stage learning framework, Word4Per, where a lightweight Textual\nInversion Network (TINet) and a text-based person retrieval model based on\nfine-tuned Contrastive Language-Image Pre-training (CLIP) network are learned\nwithout utilizing any CPR data. Thirdly, a finely annotated Image-Text Composed\nPerson Retrieval (ITCPR) dataset is built as the benchmark to assess the\nperformance of the proposed Word4Per framework. Extensive experiments under\nboth Rank-1 and mAP demonstrate the effectiveness of Word4Per for the ZS-CPR\ntask, surpassing the comparative methods by over 10\\%. The code and ITCPR\ndataset will be publicly available at\nhttps://github.com/Delong-liu-bupt/Word4Per.\n","authors":["Delong Liu","Haiwen Li","Zhicheng Zhao","Fei Su","Yuan Dong"],"pdf_url":"https://arxiv.org/pdf/2311.16515v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05305v2","updated":"2024-03-25T11:48:27Z","published":"2024-02-07T22:50:47Z","title":"Knowledge Distillation for Road Detection based on cross-model\n Semi-Supervised Learning","summary":" The advancement of knowledge distillation has played a crucial role in\nenabling the transfer of knowledge from larger teacher models to smaller and\nmore efficient student models, and is particularly beneficial for online and\nresource-constrained applications. The effectiveness of the student model\nheavily relies on the quality of the distilled knowledge received from the\nteacher. 
Given the accessibility of unlabelled remote sensing data,\nsemi-supervised learning has become a prevalent strategy for enhancing model\nperformance. However, relying solely on semi-supervised learning with smaller\nmodels may be insufficient due to their limited capacity for feature\nextraction. This limitation restricts their ability to exploit training data.\nTo address this issue, we propose an integrated approach that combines\nknowledge distillation and semi-supervised learning methods. This hybrid\napproach leverages the robust capabilities of large models to effectively\nutilise large unlabelled data whilst subsequently providing the small student\nmodel with rich and informative features for enhancement. The proposed\nsemi-supervised learning-based knowledge distillation (SSLKD) approach\ndemonstrates a notable improvement in the performance of the student model, in\nthe application of road segmentation, surpassing the effectiveness of\ntraditional semi-supervised learning methods.\n","authors":["Wanli Ma","Oktay Karakus","Paul L. Rosin"],"pdf_url":"https://arxiv.org/pdf/2402.05305v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02935v2","updated":"2024-03-25T11:45:58Z","published":"2023-08-05T18:32:49Z","title":"Unveiling the Blind Spots: A Critical Examination of Fairness in\n Autonomous Driving Systems","summary":" Autonomous driving systems have extended the spectrum of Web of Things for\nintelligent vehicles and have become an important component of the Web\necosystem. Similar to traditional Web-based applications, fairness is an\nessential aspect for ensuring the high quality of autonomous driving systems,\nparticularly in the context of pedestrian detectors within them. However, there\nis an absence in the literature of a comprehensive assessment of the fairness\nof current Deep Learning (DL)-based pedestrian detectors. To fill the gap, we\nevaluate eight widely-explored DL-based pedestrian detectors across demographic\ngroups on large-scale real-world datasets. To enable a thorough fairness\nevaluation, we provide extensive annotations for the datasets, resulting in\n8,311 images with 16,070 gender labels, 20,115 age labels, and 3,513 skin tone\nlabels. Our findings reveal significant fairness issues related to age. The\nundetected proportions for adults are 20.14% lower compared to children.\nFurthermore, we explore how various driving scenarios affect the fairness of\npedestrian detectors. We find that the bias may exacerbate for children and\nfemales towards low brightness and low contrast.\n","authors":["Xinyue Li","Zhenpeng Chen","Jie M. Zhang","Federica Sarro","Ying Zhang","Xuanzhe Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02935v2.pdf","comment":"Update the models evaluated and the experimental results"},{"id":"http://arxiv.org/abs/2310.06744v2","updated":"2024-03-25T11:35:55Z","published":"2023-10-10T16:14:20Z","title":"HiFi-123: Towards High-fidelity One Image to 3D Content Generation","summary":" Recent advances in diffusion models have enabled 3D generation from a single\nimage. However, current methods often produce suboptimal results for novel\nviews, with blurred textures and deviations from the reference image, limiting\ntheir practical applications. In this paper, we introduce HiFi-123, a method\ndesigned for high-fidelity and multi-view consistent 3D generation. 
Our\ncontributions are twofold: First, we propose a Reference-Guided Novel View\nEnhancement (RGNV) technique that significantly improves the fidelity of\ndiffusion-based zero-shot novel view synthesis methods. Second, capitalizing on\nthe RGNV, we present a novel Reference-Guided State Distillation (RGSD) loss.\nWhen incorporated into the optimization-based image-to-3D pipeline, our method\nsignificantly improves 3D generation quality, achieving state-of-the-art\nperformance. Comprehensive evaluations demonstrate the effectiveness of our\napproach over existing methods, both qualitatively and quantitatively. Video\nresults are available on the project page.\n","authors":["Wangbo Yu","Li Yuan","Yan-Pei Cao","Xiangjun Gao","Xiaoyu Li","Wenbo Hu","Long Quan","Ying Shan","Yonghong Tian"],"pdf_url":"https://arxiv.org/pdf/2310.06744v2.pdf","comment":"Project Page: https://drexubery.github.io/HiFi-123/"},{"id":"http://arxiv.org/abs/2403.16646v1","updated":"2024-03-25T11:32:05Z","published":"2024-03-25T11:32:05Z","title":"Clustering Propagation for Universal Medical Image Segmentation","summary":" Prominent solutions for medical image segmentation are typically tailored for\nautomatic or interactive setups, posing challenges in facilitating progress\nachieved in one task to another.$_{\\!}$ This$_{\\!}$ also$_{\\!}$\nnecessitates$_{\\!}$ separate$_{\\!}$ models for each task, duplicating both\ntraining time and parameters.$_{\\!}$ To$_{\\!}$ address$_{\\!}$ above$_{\\!}$\nissues,$_{\\!}$ we$_{\\!}$ introduce$_{\\!}$ S2VNet,$_{\\!}$ a$_{\\!}$\nuniversal$_{\\!}$ framework$_{\\!}$ that$_{\\!}$ leverages$_{\\!}$\nSlice-to-Volume$_{\\!}$ propagation$_{\\!}$ to$_{\\!}$ unify automatic/interactive\nsegmentation within a single model and one training session. Inspired by\nclustering-based segmentation techniques, S2VNet makes full use of the\nslice-wise structure of volumetric data by initializing cluster centers from\nthe cluster$_{\\!}$ results$_{\\!}$ of$_{\\!}$ previous$_{\\!}$ slice.$_{\\!}$ This\nenables knowledge acquired from prior slices to assist in the segmentation of\nthe current slice, further efficiently bridging the communication between\nremote slices using mere 2D networks. Moreover, such a framework readily\naccommodates interactive segmentation with no architectural change, simply by\ninitializing centroids from user inputs. S2VNet distinguishes itself by swift\ninference speeds and reduced memory consumption compared to prevailing 3D\nsolutions. It can also handle multi-class interactions with each of them\nserving to initialize different centroids. Experiments on three benchmarks\ndemonstrate S2VNet surpasses task-specified solutions on both\nautomatic/interactive setups.\n","authors":["Yuhang Ding","Liulei Li","Wenguan Wang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2403.16646v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.16643v1","updated":"2024-03-25T11:29:19Z","published":"2024-03-25T11:29:19Z","title":"Self-Adaptive Reality-Guided Diffusion for Artifact-Free\n Super-Resolution","summary":" Artifact-free super-resolution (SR) aims to translate low-resolution images\ninto their high-resolution counterparts with a strict integrity of the original\ncontent, eliminating any distortions or synthetic details. While traditional\ndiffusion-based SR techniques have demonstrated remarkable abilities to enhance\nimage detail, they are prone to artifact introduction during iterative\nprocedures. 
Such artifacts, ranging from trivial noise to unauthentic textures,\ndeviate from the true structure of the source image, thus challenging the\nintegrity of the super-resolution process. In this work, we propose\nSelf-Adaptive Reality-Guided Diffusion (SARGD), a training-free method that\ndelves into the latent space to effectively identify and mitigate the\npropagation of artifacts. Our SARGD begins by using an artifact detector to\nidentify implausible pixels, creating a binary mask that highlights artifacts.\nFollowing this, the Reality Guidance Refinement (RGR) process refines artifacts\nby integrating this mask with realistic latent representations, improving\nalignment with the original image. Nonetheless, initial realistic-latent\nrepresentations from lower-quality images result in over-smoothing in the final\noutput. To address this, we introduce a Self-Adaptive Guidance (SAG) mechanism.\nIt dynamically computes a reality score, enhancing the sharpness of the\nrealistic latent. These alternating mechanisms collectively achieve\nartifact-free super-resolution. Extensive experiments demonstrate the\nsuperiority of our method, delivering detailed artifact-free high-resolution\nimages while reducing sampling steps by 2X. We release our code at\nhttps://github.com/ProAirVerse/Self-Adaptive-Guidance-Diffusion.git.\n","authors":["Qingping Zheng","Ling Zheng","Yuanfan Guo","Ying Li","Songcen Xu","Jiankang Deng","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.16643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16640v1","updated":"2024-03-25T11:28:52Z","published":"2024-03-25T11:28:52Z","title":"Multi-Scale Texture Loss for CT denoising with GANs","summary":" Generative Adversarial Networks (GANs) have proved as a powerful framework\nfor denoising applications in medical imaging. However, GAN-based denoising\nalgorithms still suffer from limitations in capturing complex relationships\nwithin the images. In this regard, the loss function plays a crucial role in\nguiding the image generation process, encompassing how much a synthetic image\ndiffers from a real image. To grasp highly complex and non-linear textural\nrelationships in the training process, this work presents a loss function that\nleverages the intrinsic multi-scale nature of the Gray-Level-Co-occurrence\nMatrix (GLCM). Although the recent advances in deep learning have demonstrated\nsuperior performance in classification and detection tasks, we hypothesize that\nits information content can be valuable when integrated into GANs' training. To\nthis end, we propose a differentiable implementation of the GLCM suited for\ngradient-based optimization. Our approach also introduces a self-attention\nlayer that dynamically aggregates the multi-scale texture information extracted\nfrom the images. We validate our approach by carrying out extensive experiments\nin the context of low-dose CT denoising, a challenging application that aims to\nenhance the quality of noisy CT scans. We utilize three publicly available\ndatasets, including one simulated and two real datasets. The results are\npromising as compared to other well-established loss functions, being also\nconsistent across three different GAN architectures. 
The code is available at:\nhttps://github.com/FrancescoDiFeola/DenoTextureLoss\n","authors":["Francesco Di Feola","Lorenzo Tronchin","Valerio Guarrasi","Paolo Soda"],"pdf_url":"https://arxiv.org/pdf/2403.16640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16638v1","updated":"2024-03-25T11:26:18Z","published":"2024-03-25T11:26:18Z","title":"AI-Generated Video Detection via Spatio-Temporal Anomaly Learning","summary":" The advancement of generation models has led to the emergence of highly\nrealistic artificial intelligence (AI)-generated videos. Malicious users can\neasily create non-existent videos to spread false information. This letter\nproposes an effective AI-generated video detection (AIGVDet) scheme by\ncapturing the forensic traces with a two-branch spatio-temporal convolutional\nneural network (CNN). Specifically, two ResNet sub-detectors are learned\nseparately for identifying the anomalies in spatical and optical flow domains,\nrespectively. Results of such sub-detectors are fused to further enhance the\ndiscrimination ability. A large-scale generated video dataset (GVD) is\nconstructed as a benchmark for model training and evaluation. Extensive\nexperimental results verify the high generalization and robustness of our\nAIGVDet scheme. Code and dataset will be available at\nhttps://github.com/multimediaFor/AIGVDet.\n","authors":["Jianfa Bai","Man Lin","Gang Cao"],"pdf_url":"https://arxiv.org/pdf/2403.16638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16476v4","updated":"2024-03-25T11:24:45Z","published":"2023-12-27T08:50:01Z","title":"SVGDreamer: Text Guided SVG Generation with Diffusion Model","summary":" Recently, text-guided scalable vector graphics (SVGs) synthesis has shown\npromise in domains such as iconography and sketch. However, existing\ntext-to-SVG generation methods lack editability and struggle with visual\nquality and result diversity. To address these limitations, we propose a novel\ntext-guided vector graphics synthesis method called SVGDreamer. SVGDreamer\nincorporates a semantic-driven image vectorization (SIVE) process that enables\nthe decomposition of synthesis into foreground objects and background, thereby\nenhancing editability. Specifically, the SIVE process introduce attention-based\nprimitive control and an attention-mask loss function for effective control and\nmanipulation of individual elements. Additionally, we propose a Vectorized\nParticle-based Score Distillation (VPSD) approach to tackle the challenges of\nshape over-smoothing, color over-saturation, limited diversity in results, and\nslow convergence in existing text-to-SVG generation methods. VPSD models SVGs\nas distributions of control points and colors to counteract over-smoothing and\nover-saturation. Furthermore, VPSD leverages a reward model to reweight vector\nparticles, which improves aesthetic appeal and accelerates convergence.\nExtensive experiments have been conducted to validate the effectiveness of\nSVGDreamer, demonstrating its superiority over baseline methods in terms of\neditability, visual quality, and diversity. The code and demo of SVGDreamer can\nbe found at https://ximinng.github.io/SVGDreamer-project/\n","authors":["Ximing Xing","Haitao Zhou","Chuang Wang","Jing Zhang","Dong Xu","Qian Yu"],"pdf_url":"https://arxiv.org/pdf/2312.16476v4.pdf","comment":"Accepted by CVPR 2024. 
project link:\n https://ximinng.github.io/SVGDreamer-project/"},{"id":"http://arxiv.org/abs/2403.16635v1","updated":"2024-03-25T11:24:02Z","published":"2024-03-25T11:24:02Z","title":"V2X-PC: Vehicle-to-everything Collaborative Perception via Point Cluster","summary":" The objective of the collaborative vehicle-to-everything perception task is\nto enhance the individual vehicle's perception capability through message\ncommunication among neighboring traffic agents. Previous methods focus on\nachieving optimal performance within bandwidth limitations and typically adopt\nBEV maps as the basic collaborative message units. However, we demonstrate that\ncollaboration with dense representations is plagued by object feature\ndestruction during message packing, inefficient message aggregation for\nlong-range collaboration, and implicit structure representation communication.\nTo tackle these issues, we introduce a brand new message unit, namely point\ncluster, designed to represent the scene sparsely with a combination of\nlow-level structure information and high-level semantic information. The point\ncluster inherently preserves object information while packing messages, with\nweak relevance to the collaboration range, and supports explicit structure\nmodeling. Building upon this representation, we propose a novel framework\nV2X-PC for collaborative perception. This framework includes a Point Cluster\nPacking (PCP) module to keep object feature and manage bandwidth through the\nmanipulation of cluster point numbers. As for effective message aggregation, we\npropose a Point Cluster Aggregation (PCA) module to match and merge point\nclusters associated with the same object. To further handle time latency and\npose errors encountered in real-world scenarios, we propose parameter-free\nsolutions that can adapt to different noisy levels without finetuning.\nExperiments on two widely recognized collaborative perception benchmarks\nshowcase the superior performance of our method compared to the previous\nstate-of-the-art approaches relying on BEV maps.\n","authors":["Si Liu","Zihan Ding","Jiahui Fu","Hongyu Li","Siheng Chen","Shifeng Zhang","Xu Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.16635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16627v1","updated":"2024-03-25T11:16:23Z","published":"2024-03-25T11:16:23Z","title":"SDXS: Real-Time One-Step Latent Diffusion Models with Image Conditions","summary":" Recent advancements in diffusion models have positioned them at the forefront\nof image generation. Despite their superior performance, diffusion models are\nnot without drawbacks; they are characterized by complex architectures and\nsubstantial computational demands, resulting in significant latency due to\ntheir iterative sampling process. To mitigate these limitations, we introduce a\ndual approach involving model miniaturization and a reduction in sampling\nsteps, aimed at significantly decreasing model latency. Our methodology\nleverages knowledge distillation to streamline the U-Net and image decoder\narchitectures, and introduces an innovative one-step DM training technique that\nutilizes feature matching and score distillation. We present two models,\nSDXS-512 and SDXS-1024, achieving inference speeds of approximately 100 FPS\n(30x faster than SD v1.5) and 30 FP (60x faster than SDXL) on a single GPU,\nrespectively. 
Moreover, our training approach offers promising applications in\nimage-conditioned control, facilitating efficient image-to-image translation.\n","authors":["Yuda Song","Zehao Sun","Xuanwu Yin"],"pdf_url":"https://arxiv.org/pdf/2403.16627v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17744v2","updated":"2024-03-25T11:04:17Z","published":"2023-11-29T15:49:31Z","title":"Variational Bayes image restoration with compressive autoencoders","summary":" Regularization of inverse problems is of paramount importance in\ncomputational imaging. The ability of neural networks to learn efficient image\nrepresentations has been recently exploited to design powerful data-driven\nregularizers. While state-of-the-art plug-and-play methods rely on an implicit\nregularization provided by neural denoisers, alternative Bayesian approaches\nconsider Maximum A Posteriori (MAP) estimation in the latent space of a\ngenerative model, thus with an explicit regularization. However,\nstate-of-the-art deep generative models require a huge amount of training data\ncompared to denoisers. Besides, their complexity hampers the optimization\ninvolved in latent MAP derivation. In this work, we first propose to use\ncompressive autoencoders instead. These networks, which can be seen as\nvariational autoencoders with a flexible latent prior, are smaller and easier\nto train than state-of-the-art generative models. As a second contribution, we\nintroduce the Variational Bayes Latent Estimation (VBLE) algorithm, which\nperforms latent estimation within the framework of variational inference.\nThanks to a simple yet efficient parameterization of the variational posterior,\nVBLE allows for fast and easy (approximate) posterior sampling. Experimental\nresults on image datasets BSD and FFHQ demonstrate that VBLE reaches similar\nperformance than state-of-the-art plug-and-play methods, while being able to\nquantify uncertainties faster than other existing posterior sampling\ntechniques.\n","authors":["Maud Biquard","Marie Chabert","Thomas Oberlin"],"pdf_url":"https://arxiv.org/pdf/2311.17744v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12198v2","updated":"2024-03-25T11:04:04Z","published":"2023-12-19T14:34:36Z","title":"Mask Grounding for Referring Image Segmentation","summary":" Referring Image Segmentation (RIS) is a challenging task that requires an\nalgorithm to segment objects referred by free-form language expressions.\nDespite significant progress in recent years, most state-of-the-art (SOTA)\nmethods still suffer from considerable language-image modality gap at the pixel\nand word level. These methods generally 1) rely on sentence-level language\nfeatures for language-image alignment and 2) lack explicit training supervision\nfor fine-grained visual grounding. Consequently, they exhibit weak object-level\ncorrespondence between visual and language features. Without well-grounded\nfeatures, prior methods struggle to understand complex expressions that require\nstrong reasoning over relationships among multiple objects, especially when\ndealing with rarely used or ambiguous clauses. To tackle this challenge, we\nintroduce a novel Mask Grounding auxiliary task that significantly improves\nvisual grounding within language features, by explicitly teaching the model to\nlearn fine-grained correspondence between masked textual tokens and their\nmatching visual objects. Mask Grounding can be directly used on prior RIS\nmethods and consistently bring improvements. 
Furthermore, to holistically\naddress the modality gap, we also design a cross-modal alignment loss and an\naccompanying alignment module. These additions work synergistically with Mask\nGrounding. With all these techniques, our comprehensive approach culminates in\nMagNet (Mask-grounded Network), an architecture that significantly outperforms\nprior arts on three key benchmarks (RefCOCO, RefCOCO+ and G-Ref), demonstrating\nour method's effectiveness in addressing current limitations of RIS algorithms.\nOur code and pre-trained weights will be released.\n","authors":["Yong Xien Chng","Henry Zheng","Yizeng Han","Xuchong Qiu","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2312.12198v2.pdf","comment":"Accepted by CVPR2024; Project page:\n https://yxchng.github.io/projects/mask-grounding"},{"id":"http://arxiv.org/abs/2403.16612v1","updated":"2024-03-25T10:42:48Z","published":"2024-03-25T10:42:48Z","title":"Calibrating Bayesian UNet++ for Sub-Seasonal Forecasting","summary":" Seasonal forecasting is a crucial task when it comes to detecting the extreme\nheat and colds that occur due to climate change. Confidence in the predictions\nshould be reliable since a small increase in the temperatures in a year has a\nbig impact on the world. Calibration of the neural networks provides a way to\nensure our confidence in the predictions. However, calibrating regression\nmodels is an under-researched topic, especially in forecasters. We calibrate a\nUNet++ based architecture, which was shown to outperform physics-based models\nin temperature anomalies. We show that with a slight trade-off between\nprediction error and calibration error, it is possible to get more reliable and\nsharper forecasts. We believe that calibration should be an important part of\nsafety-critical machine learning applications such as weather forecasters.\n","authors":["Busra Asan","Abdullah Akgul","Alper Unal","Melih Kandemir","Gozde Unal"],"pdf_url":"https://arxiv.org/pdf/2403.16612v1.pdf","comment":"Accepted as a workshop paper at \"ICLR 2024 Tackling Climate Change\n with Machine Learning\""},{"id":"http://arxiv.org/abs/2403.16607v1","updated":"2024-03-25T10:38:17Z","published":"2024-03-25T10:38:17Z","title":"Enhancing Industrial Transfer Learning with Style Filter: Cost Reduction\n and Defect-Focus","summary":" Addressing the challenge of data scarcity in industrial domains, transfer\nlearning emerges as a pivotal paradigm. This work introduces Style Filter, a\ntailored methodology for industrial contexts. By selectively filtering source\ndomain data before knowledge transfer, Style Filter reduces the quantity of\ndata while maintaining or even enhancing the performance of transfer learning\nstrategy. 
Offering label-free operation, minimal reliance on prior knowledge,\nindependence from specific models, and re-utilization, Style Filter is\nevaluated on authentic industrial datasets, highlighting its effectiveness when\nemployed before conventional transfer strategies in the deep learning domain.\nThe results underscore the effectiveness of Style Filter in real-world\nindustrial applications.\n","authors":["Chen Li","Ruijie Ma","Xiang Qian","Xiaohao Wang","Xinghui Li"],"pdf_url":"https://arxiv.org/pdf/2403.16607v1.pdf","comment":"17 pages, 11 figures,4 tables"},{"id":"http://arxiv.org/abs/2403.16605v1","updated":"2024-03-25T10:30:22Z","published":"2024-03-25T10:30:22Z","title":"SatSynth: Augmenting Image-Mask Pairs through Diffusion Models for\n Aerial Semantic Segmentation","summary":" In recent years, semantic segmentation has become a pivotal tool in\nprocessing and interpreting satellite imagery. Yet, a prevalent limitation of\nsupervised learning techniques remains the need for extensive manual\nannotations by experts. In this work, we explore the potential of generative\nimage diffusion to address the scarcity of annotated data in earth observation\ntasks. The main idea is to learn the joint data manifold of images and labels,\nleveraging recent advancements in denoising diffusion probabilistic models. To\nthe best of our knowledge, we are the first to generate both images and\ncorresponding masks for satellite segmentation. We find that the obtained pairs\nnot only display high quality in fine-scale features but also ensure a wide\nsampling diversity. Both aspects are crucial for earth observation data, where\nsemantic classes can vary severely in scale and occurrence frequency. We employ\nthe novel data instances for downstream segmentation, as a form of data\naugmentation. In our experiments, we provide comparisons to prior works based\non discriminative diffusion models or GANs. We demonstrate that integrating\ngenerated samples yields significant quantitative improvements for satellite\nsemantic segmentation -- both compared to baselines and when training only on\nthe original data.\n","authors":["Aysim Toker","Marvin Eisenberger","Daniel Cremers","Laura Leal-Taixé"],"pdf_url":"https://arxiv.org/pdf/2403.16605v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2403.16594v1","updated":"2024-03-25T10:13:52Z","published":"2024-03-25T10:13:52Z","title":"EDUE: Expert Disagreement-Guided One-Pass Uncertainty Estimation for\n Medical Image Segmentation","summary":" Deploying deep learning (DL) models in medical applications relies on\npredictive performance and other critical factors, such as conveying\ntrustworthy predictive uncertainty. Uncertainty estimation (UE) methods provide\npotential solutions for evaluating prediction reliability and improving the\nmodel confidence calibration. Despite increasing interest in UE, challenges\npersist, such as the need for explicit methods to capture aleatoric uncertainty\nand align uncertainty estimates with real-life disagreements among domain\nexperts. This paper proposes an Expert Disagreement-Guided Uncertainty\nEstimation (EDUE) for medical image segmentation. By leveraging variability in\nground-truth annotations from multiple raters, we guide the model during\ntraining and incorporate random sampling-based strategies to enhance\ncalibration confidence. 
Our method achieves 55% and 23% improvement in\ncorrelation on average with expert disagreements at the image and pixel levels,\nrespectively, better calibration, and competitive segmentation performance\ncompared to the state-of-the-art deep ensembles, requiring only a single\nforward pass.\n","authors":["Kudaibergen Abutalip","Numan Saeed","Ikboljon Sobirov","Vincent Andrearczyk","Adrien Depeursinge","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.16594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14828v2","updated":"2024-03-25T10:12:46Z","published":"2024-03-21T20:43:10Z","title":"Multimodal-Conditioned Latent Diffusion Models for Fashion Image Editing","summary":" Fashion illustration is a crucial medium for designers to convey their\ncreative vision and transform design concepts into tangible representations\nthat showcase the interplay between clothing and the human body. In the context\nof fashion design, computer vision techniques have the potential to enhance and\nstreamline the design process. Departing from prior research primarily focused\non virtual try-on, this paper tackles the task of multimodal-conditioned\nfashion image editing. Our approach aims to generate human-centric fashion\nimages guided by multimodal prompts, including text, human body poses, garment\nsketches, and fabric textures. To address this problem, we propose extending\nlatent diffusion models to incorporate these multiple modalities and modifying\nthe structure of the denoising network, taking multimodal prompts as input. To\ncondition the proposed architecture on fabric textures, we employ textual\ninversion techniques and let diverse cross-attention layers of the denoising\nnetwork attend to textual and texture information, thus incorporating different\ngranularity conditioning details. Given the lack of datasets for the task, we\nextend two existing fashion datasets, Dress Code and VITON-HD, with multimodal\nannotations. Experimental evaluations demonstrate the effectiveness of our\nproposed approach in terms of realism and coherence concerning the provided\nmultimodal inputs.\n","authors":["Alberto Baldrati","Davide Morelli","Marcella Cornia","Marco Bertini","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2403.14828v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16582v1","updated":"2024-03-25T09:49:42Z","published":"2024-03-25T09:49:42Z","title":"In the Search for Optimal Multi-view Learning Models for Crop\n Classification with Global Remote Sensing Data","summary":" Crop classification is of critical importance due to its role in studying\ncrop pattern changes, resource management, and carbon sequestration. When\nemploying data-driven techniques for its prediction, utilizing various temporal\ndata sources is necessary. Deep learning models have proven to be effective for\nthis task by mapping time series data to high-level representation for\nprediction. However, they face substantial challenges when dealing with\nmultiple input patterns. The literature offers limited guidance for Multi-View\nLearning (MVL) scenarios, as it has primarily focused on exploring fusion\nstrategies with specific encoders and validating them in local regions. In\ncontrast, we investigate the impact of simultaneous selection of the fusion\nstrategy and the encoder architecture evaluated on a global-scale cropland and\ncrop-type classifications. 
We use a range of five fusion strategies (Input,\nFeature, Decision, Ensemble, Hybrid) and five temporal encoder architectures\n(LSTM, GRU, TempCNN, TAE, L-TAE) as possible MVL model configurations. The\nvalidation is on the CropHarvest dataset that provides optical, radar, and\nweather time series, and topographic information as input data. We found that\nin scenarios with a limited number of labeled samples, a unique configuration\nis insufficient for all the cases. Instead, a specialized combination,\nincluding encoder and fusion strategy, should be meticulously sought. To\nstreamline this search process, we suggest initially identifying the optimal\nencoder architecture tailored for a particular fusion strategy, and then\ndetermining the most suitable fusion strategy for the classification task. We\nprovide a technical framework for researchers exploring crop classification or\nrelated tasks through a MVL approach.\n","authors":["Francisco Mena","Diego Arenas","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2403.16582v1.pdf","comment":"submitted to journal"},{"id":"http://arxiv.org/abs/2403.16578v1","updated":"2024-03-25T09:43:56Z","published":"2024-03-25T09:43:56Z","title":"SegICL: A Universal In-context Learning Framework for Enhanced\n Segmentation in Medical Imaging","summary":" Medical image segmentation models adapting to new tasks in a training-free\nmanner through in-context learning is an exciting advancement. Universal\nsegmentation models aim to generalize across the diverse modality of medical\nimages, yet their effectiveness often diminishes when applied to\nout-of-distribution (OOD) data modalities and tasks, requiring intricate\nfine-tuning of model for optimal performance. For addressing this challenge, we\nintroduce SegICL, a novel approach leveraging In-Context Learning (ICL) for\nimage segmentation. Unlike existing methods, SegICL has the capability to\nemploy text-guided segmentation and conduct in-context learning with a small\nset of image-mask pairs, eliminating the need for training the model from\nscratch or fine-tuning for OOD tasks (including OOD modality and dataset).\nExtensive experimental validation of SegICL demonstrates a positive correlation\nbetween the number of prompt samples and segmentation performance on OOD\nmodalities and tasks. This indicates that SegICL effectively address new\nsegmentation tasks based on contextual information. Additionally, SegICL also\nexhibits comparable segmentation performance to mainstream models on OOD and\nin-distribution tasks. Our code will be released soon.\n","authors":["Lingdong Shen","Fangxin Shang","Yehui Yang","Xiaoshuang Huang","Shining Xiang"],"pdf_url":"https://arxiv.org/pdf/2403.16578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10615v2","updated":"2024-03-25T09:42:13Z","published":"2024-03-15T18:26:33Z","title":"LightIt: Illumination Modeling and Control for Diffusion Models","summary":" We introduce LightIt, a method for explicit illumination control for image\ngeneration. Recent generative methods lack lighting control, which is crucial\nto numerous artistic aspects of image generation such as setting the overall\nmood or cinematic appearance. To overcome these limitations, we propose to\ncondition the generation on shading and normal maps. We model the lighting with\nsingle bounce shading, which includes cast shadows. 
We first train a shading\nestimation module to generate a dataset of real-world images and shading pairs.\nThen, we train a control network using the estimated shading and normals as\ninput. Our method demonstrates high-quality image generation and lighting\ncontrol in numerous scenes. Additionally, we use our generated dataset to train\nan identity-preserving relighting model, conditioned on an image and a target\nshading. Our method is the first that enables the generation of images with\ncontrollable, consistent lighting and performs on par with specialized\nrelighting state-of-the-art methods.\n","authors":["Peter Kocsis","Julien Philip","Kalyan Sunkavalli","Matthias Nießner","Yannick Hold-Geoffroy"],"pdf_url":"https://arxiv.org/pdf/2403.10615v2.pdf","comment":"Project page: https://peter-kocsis.github.io/LightIt/ Video:\n https://youtu.be/cCfSBD5aPLI"},{"id":"http://arxiv.org/abs/2403.15353v2","updated":"2024-03-25T09:36:42Z","published":"2024-03-22T17:08:03Z","title":"Fully automated workflow for the design of patient-specific orthopaedic\n implants: application to total knee arthroplasty","summary":" Arthroplasty is commonly performed to treat joint osteoarthritis, reducing\npain and improving mobility. While arthroplasty has known several technical\nimprovements, a significant share of patients are still unsatisfied with their\nsurgery. Personalised arthroplasty improves surgical outcomes however current\nsolutions require delays, making it difficult to integrate in clinical routine.\nWe propose a fully automated workflow to design patient-specific implants,\npresented for total knee arthroplasty, the most widely performed arthroplasty\nin the world nowadays.\n The proposed pipeline first uses artificial neural networks to segment the\nproximal and distal extremities of the femur and tibia. Then the full bones are\nreconstructed using augmented statistical shape models, combining shape and\nlandmarks information. Finally, 77 morphological parameters are computed to\ndesign patient-specific implants. The developed workflow has been trained using\n91 CT scans of lower limb and evaluated on 41 CT scans manually segmented, in\nterms of accuracy and execution time.\n The workflow accuracy was $0.4\\pm0.2mm$ for the segmentation, $1.2\\pm0.4mm$\nfor the full bones reconstruction, and $2.8\\pm2.2mm$ for the anatomical\nlandmarks determination. The custom implants fitted the patients' anatomy with\n$0.6\\pm0.2mm$ accuracy. The whole process from segmentation to implants' design\nlasted about 5 minutes.\n The proposed workflow allows for a fast and reliable personalisation of knee\nimplants, directly from the patient CT image without requiring any manual\nintervention. It establishes a patient-specific pre-operative planning for TKA\nin a very short time making it easily available for all patients. 
Combined with\nefficient implant manufacturing techniques, this solution could help meet the\ngrowing demand for arthroplasties while reducing complications and improving the\npatients' satisfaction.\n","authors":["Aziliz Guezou-Philippe","Arnaud Clavé","Ehouarn Maguet","Ludivine Maintier","Charles Garraud","Jean-Rassaire Fouefack","Valérie Burdin","Eric Stindel","Guillaume Dardenne"],"pdf_url":"https://arxiv.org/pdf/2403.15353v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16569v1","updated":"2024-03-25T09:36:10Z","published":"2024-03-25T09:36:10Z","title":"Revealing Vulnerabilities of Neural Networks in Parameter Learning and\n Defense Against Explanation-Aware Backdoors","summary":" Explainable Artificial Intelligence (XAI) strategies play a crucial part in\nincreasing the understanding and trustworthiness of neural networks.\nNonetheless, these techniques could potentially generate misleading\nexplanations. Blinding attacks can drastically alter a machine learning\nalgorithm's prediction and explanation, providing misleading information by\nadding visually unnoticeable artifacts into the input, while maintaining the\nmodel's accuracy. This poses a serious challenge to ensuring the reliability of\nXAI methods. To address this challenge, we leverage statistical analysis to\nhighlight the changes in CNN weights following blinding attacks. We introduce a\nmethod specifically designed to limit the effectiveness of such attacks during\nthe evaluation phase, avoiding the need for extra training. The method we\nsuggest defends against most modern explanation-aware adversarial attacks,\nachieving an approximate decrease of ~99\\% in the Attack Success Rate (ASR) and\na ~91\\% reduction in the Mean Square Error (MSE) between the original\nexplanation and the defended (post-attack) explanation across three unique\ntypes of attacks.\n","authors":["Md Abdul Kadir","GowthamKrishna Addluri","Daniel Sonntag"],"pdf_url":"https://arxiv.org/pdf/2403.16569v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16558v1","updated":"2024-03-25T09:17:15Z","published":"2024-03-25T09:17:15Z","title":"Elysium: Exploring Object-level Perception in Videos via MLLM","summary":" Multi-modal Large Language Models (MLLMs) have demonstrated their ability to\nperceive objects in still images, but their application in video-related tasks,\nsuch as object tracking, remains understudied. This lack of exploration is\nprimarily due to two key challenges. Firstly, extensive pretraining on\nlarge-scale video datasets is required to equip MLLMs with the capability to\nperceive objects across multiple frames and understand inter-frame\nrelationships. Secondly, processing a large number of frames within the context\nwindow of Large Language Models (LLMs) can impose a significant computational\nburden. To address the first challenge, we introduce ElysiumTrack-1M, a\nlarge-scale video dataset paired with novel tasks: Referring Single Object\nTracking (RSOT) and Video Referring Expression Generation (Video-REG).\nElysiumTrack-1M contains 1.27 million annotated video frames with corresponding\nobject boxes and descriptions. Leveraging this dataset, we conduct training of\nMLLMs and propose a token-compression model T-Selector to tackle the second\nchallenge. 
Our proposed approach, Elysium: Exploring Object-level Perception in\nVideos via MLLM, is an end-to-end trainable MLLM that makes the first attempt\nto conduct object-level tasks in videos without requiring any additional\nplug-in or expert models.\n","authors":["Han Wang","Yanjie Wang","Yongjie Ye","Yuxiang Nie","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2403.16558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16552v1","updated":"2024-03-25T08:57:27Z","published":"2024-03-25T08:57:27Z","title":"QKFormer: Hierarchical Spiking Transformer using Q-K Attention","summary":" Spiking Transformers, which integrate Spiking Neural Networks (SNNs) with\nTransformer architectures, have attracted significant attention due to their\npotential for energy efficiency and high performance. However, existing models\nin this domain still suffer from suboptimal performance. We introduce several\ninnovations to improve the performance: i) We propose a novel spike-form Q-K\nattention mechanism, tailored for SNNs, which efficiently models the importance\nof token or channel dimensions through binary vectors with linear complexity.\nii) We incorporate the hierarchical structure, which significantly benefits the\nperformance of both the brain and artificial neural networks, into spiking\ntransformers to obtain multi-scale spiking representation. iii) We design a\nversatile and powerful patch embedding module with a deformed shortcut\nspecifically for spiking transformers. Together, we develop QKFormer, a\nhierarchical spiking transformer based on Q-K attention with direct training.\nQKFormer shows significantly superior performance over existing\nstate-of-the-art SNN models on various mainstream datasets. Notably, with\ncomparable size to Spikformer (66.34 M, 74.81%), QKFormer (64.96 M) achieves a\ngroundbreaking top-1 accuracy of 85.65% on ImageNet-1k, substantially\noutperforming Spikformer by 10.84%. To the best of our knowledge, this is the\nfirst time that directly trained SNNs have exceeded 85% accuracy on\nImageNet-1K. The code and models are publicly available at\nhttps://github.com/zhouchenlin2096/QKFormer\n","authors":["Chenlin Zhou","Han Zhang","Zhaokun Zhou","Liutao Yu","Liwei Huang","Xiaopeng Fan","Li Yuan","Zhengyu Ma","Huihui Zhou","Yonghong Tian"],"pdf_url":"https://arxiv.org/pdf/2403.16552v1.pdf","comment":"10 pages, code: https://github.com/zhouchenlin2096/QKFormer"},{"id":"http://arxiv.org/abs/2403.11854v2","updated":"2024-03-25T08:54:33Z","published":"2024-03-18T15:03:56Z","title":"denoiSplit: a method for joint image splitting and unsupervised\n denoising","summary":" In this work we present denoiSplit, a method to tackle a new analysis task,\ni.e. the challenge of joint semantic image splitting and unsupervised\ndenoising. This dual approach has important applications in fluorescence\nmicroscopy, where semantic image splitting is highly valuable but noise\ngenerally hinders the downstream analysis of image content. Image splitting\ninvolves dissecting an image into its distinguishable semantic structures. We\nshow that the current state-of-the-art method for this task struggles in the\npresence of image noise, inadvertently also distributing the noise across the\npredicted outputs. The method we present here can deal with image noise by\nintegrating an unsupervised denoising sub-task. This integration results in\nimproved semantic image unmixing, even in the presence of notable and realistic\nlevels of imaging noise. 
A key innovation in denoiSplit is the use of\nspecifically formulated noise models and the suitable adjustment of\nKL-divergence loss for the high-dimensional hierarchical latent space we are\ntraining. We showcase the performance of denoiSplit across 4 tasks on\nreal-world microscopy images. Additionally, we perform qualitative and\nquantitative evaluations and compare results to existing benchmarks,\ndemonstrating the effectiveness of using denoiSplit: a single Variational\nSplitting Encoder-Decoder (VSE) Network using two suitable noise models to\njointly perform semantic splitting and denoising.\n","authors":["Ashesh Ashesh","Florian Jug"],"pdf_url":"https://arxiv.org/pdf/2403.11854v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02970v5","updated":"2024-03-25T08:50:42Z","published":"2023-04-06T09:54:06Z","title":"Unraveling Instance Associations: A Closer Look for Audio-Visual\n Segmentation","summary":" Audio-visual segmentation (AVS) is a challenging task that involves\naccurately segmenting sounding objects based on audio-visual cues. The\neffectiveness of audio-visual learning critically depends on achieving accurate\ncross-modal alignment between sound and visual objects. Successful audio-visual\nlearning requires two essential components: 1) a challenging dataset with\nhigh-quality pixel-level multi-class annotated images associated with audio\nfiles, and 2) a model that can establish strong links between audio information\nand its corresponding visual object. However, these requirements are only\npartially addressed by current methods, with training sets containing biased\naudio-visual data, and models that generalise poorly beyond this biased\ntraining set. In this work, we propose a new cost-effective strategy to build\nchallenging and relatively unbiased high-quality audio-visual segmentation\nbenchmarks. We also propose a new informative sample mining method for\naudio-visual supervised contrastive learning to leverage discriminative\ncontrastive samples to enforce cross-modal understanding. We show empirical\nresults that demonstrate the effectiveness of our benchmark. Furthermore,\nexperiments conducted on existing AVS datasets and on our new benchmark show\nthat our method achieves state-of-the-art (SOTA) segmentation accuracy.\n","authors":["Yuanhong Chen","Yuyuan Liu","Hu Wang","Fengbei Liu","Chong Wang","Helen Frazer","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2304.02970v5.pdf","comment":"Code is available at https://github.com/cyh-0/CAVP"},{"id":"http://arxiv.org/abs/2403.06904v2","updated":"2024-03-25T08:45:37Z","published":"2024-03-11T16:56:37Z","title":"FocusCLIP: Multimodal Subject-Level Guidance for Zero-Shot Transfer in\n Human-Centric Tasks","summary":" We propose FocusCLIP, integrating subject-level guidance--a specialized\nmechanism for target-specific supervision--into the CLIP framework for improved\nzero-shot transfer on human-centric tasks. Our novel contributions enhance CLIP\non both the vision and text sides. On the vision side, we incorporate ROI\nheatmaps emulating human visual attention mechanisms to emphasize\nsubject-relevant image regions. On the text side, we introduce human pose\ndescriptions to provide rich contextual information. For human-centric tasks,\nFocusCLIP is trained with images from the MPII Human Pose dataset. The proposed\napproach surpassed CLIP by an average of 8.61% across five previously unseen\ndatasets covering three human-centric tasks. 
FocusCLIP achieved an average\naccuracy of 33.65% compared to 25.04% by CLIP. We observed a 3.98% improvement\nin activity recognition, a 14.78% improvement in age classification, and a\n7.06% improvement in emotion recognition. Moreover, using our proposed\nsingle-shot LLM prompting strategy, we release a high-quality MPII Pose\nDescriptions dataset to encourage further research in multimodal learning for\nhuman-centric tasks. Furthermore, we also demonstrate the effectiveness of our\nsubject-level supervision on non-human-centric tasks. FocusCLIP shows a 2.47%\nimprovement over CLIP in zero-shot bird classification using the CUB dataset.\nOur findings emphasize the potential of integrating subject-level guidance with\ngeneral pretraining methods for enhanced downstream performance.\n","authors":["Muhammad Saif Ullah Khan","Muhammad Ferjad Naeem","Federico Tombari","Luc Van Gool","Didier Stricker","Muhammad Zeshan Afzal"],"pdf_url":"https://arxiv.org/pdf/2403.06904v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00247v4","updated":"2024-03-25T08:34:15Z","published":"2023-08-01T03:00:36Z","title":"Unleashing the Power of Self-Supervised Image Denoising: A Comprehensive\n Review","summary":" The advent of deep learning has brought a revolutionary transformation to\nimage denoising techniques. However, the persistent challenge of acquiring\nnoise-clean pairs for supervised methods in real-world scenarios remains\nformidable, necessitating the exploration of more practical self-supervised\nimage denoising. This paper focuses on self-supervised image denoising methods\nthat offer effective solutions to address this challenge. Our comprehensive\nreview thoroughly analyzes the latest advancements in self-supervised image\ndenoising approaches, categorizing them into three distinct classes: General\nmethods, Blind Spot Network (BSN)-based methods, and Transformer-based methods.\nFor each class, we provide a concise theoretical analysis along with their\npractical applications. To assess the effectiveness of these methods, we\npresent both quantitative and qualitative experimental results on various\ndatasets, utilizing classical algorithms as benchmarks. Additionally, we\ncritically discuss the current limitations of these methods and propose\npromising directions for future research. By offering a detailed overview of\nrecent developments in self-supervised image denoising, this review serves as\nan invaluable resource for researchers and practitioners in the field,\nfacilitating a deeper understanding of this emerging domain and inspiring\nfurther advancements.\n","authors":["Dan Zhang","Fangfang Zhou","Felix Albu","Yuanzhou Wei","Xiao Yang","Yuan Gu","Qiang Li"],"pdf_url":"https://arxiv.org/pdf/2308.00247v4.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2403.16539v1","updated":"2024-03-25T08:31:14Z","published":"2024-03-25T08:31:14Z","title":"DOrA: 3D Visual Grounding with Order-Aware Referring","summary":" 3D visual grounding aims to identify the target object within a 3D point\ncloud scene referred to by a natural language description. While previous works\nattempt to exploit the verbo-visual relation with proposed cross-modal\ntransformers, unstructured natural utterances and scattered objects might lead\nto undesirable performances. In this paper, we introduce DOrA, a novel 3D\nvisual grounding framework with Order-Aware referring. DOrA is designed to\nleverage Large Language Models (LLMs) to parse language description, suggesting\na referential order of anchor objects. 
Such ordered anchor objects allow DOrA\nto update visual features and locate the target object during the grounding\nprocess. Experimental results on the NR3D and ScanRefer datasets demonstrate\nour superiority in both low-resource and full-data scenarios. In particular,\nDOrA surpasses current state-of-the-art frameworks by 9.3% and 7.8% grounding\naccuracy under 1% data and 10% data settings, respectively.\n","authors":["Tung-Yu Wu","Sheng-Yu Huang","Yu-Chiang Frank Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08262v4","updated":"2024-03-25T08:29:52Z","published":"2024-03-13T05:25:49Z","title":"BiTT: Bi-directional Texture Reconstruction of Interacting Two Hands\n from a Single Image","summary":" Creating personalized hand avatars is important to offer a realistic\nexperience to users on AR / VR platforms. While most prior studies focused on\nreconstructing 3D hand shapes, some recent work has tackled the reconstruction\nof hand textures on top of shapes. However, these methods are often limited to\ncapturing pixels on the visible side of a hand, requiring diverse views of the\nhand in a video or multiple images as input. In this paper, we propose a novel\nmethod, BiTT(Bi-directional Texture reconstruction of Two hands), which is the\nfirst end-to-end trainable method for relightable, pose-free texture\nreconstruction of two interacting hands taking only a single RGB image, by\nthree novel components: 1) bi-directional (left $\\leftrightarrow$ right)\ntexture reconstruction using the texture symmetry of left / right hands, 2)\nutilizing a texture parametric model for hand texture recovery, and 3) the\noverall coarse-to-fine stage pipeline for reconstructing personalized texture\nof two interacting hands. BiTT first estimates the scene light condition and\nalbedo image from an input image, then reconstructs the texture of both hands\nthrough the texture parametric model and bi-directional texture reconstructor.\nIn experiments using InterHand2.6M and RGB2Hands datasets, our method\nsignificantly outperforms state-of-the-art hand texture reconstruction methods\nquantitatively and qualitatively. The code is available at\nhttps://github.com/yunminjin2/BiTT\n","authors":["Minje Kim","Tae-Kyun Kim"],"pdf_url":"https://arxiv.org/pdf/2403.08262v4.pdf","comment":"Accepted by CVPR 2024, Project Page:\n https://yunminjin2.github.io/projects/bitt/"},{"id":"http://arxiv.org/abs/2403.16536v1","updated":"2024-03-25T08:26:42Z","published":"2024-03-25T08:26:42Z","title":"VMRNN: Integrating Vision Mamba and LSTM for Efficient and Accurate\n Spatiotemporal Forecasting","summary":" Combining CNNs or ViTs, with RNNs for spatiotemporal forecasting, has yielded\nunparalleled results in predicting temporal and spatial dynamics. However,\nmodeling extensive global information remains a formidable challenge; CNNs are\nlimited by their narrow receptive fields, and ViTs struggle with the intensive\ncomputational demands of their attention mechanisms. The emergence of recent\nMamba-based architectures has been met with enthusiasm for their exceptional\nlong-sequence modeling capabilities, surpassing established vision models in\nefficiency and accuracy, which motivates us to develop an innovative\narchitecture tailored for spatiotemporal forecasting. In this paper, we propose\nthe VMRNN cell, a new recurrent unit that integrates the strengths of Vision\nMamba blocks with LSTM. 
We construct a network centered on VMRNN cells to\ntackle spatiotemporal prediction tasks effectively. Our extensive evaluations\nshow that our proposed approach secures competitive results on a variety of\ntasks while maintaining a smaller model size. Our code is available at\nhttps://github.com/yyyujintang/VMRNN-PyTorch.\n","authors":["Yujin Tang","Peijie Dong","Zhenheng Tang","Xiaowen Chu","Junwei Liang"],"pdf_url":"https://arxiv.org/pdf/2403.16536v1.pdf","comment":"11 pages, 7 figures. arXiv admin note: text overlap with\n arXiv:2308.09891 by other authors"},{"id":"http://arxiv.org/abs/2403.16530v1","updated":"2024-03-25T08:16:06Z","published":"2024-03-25T08:16:06Z","title":"An Intermediate Fusion ViT Enables Efficient Text-Image Alignment in\n Diffusion Models","summary":" Diffusion models have been widely used for conditional data cross-modal\ngeneration tasks such as text-to-image and text-to-video. However,\nstate-of-the-art models still fail to align the generated visual concepts with\nhigh-level semantics in a language such as object count, spatial relationship,\netc. We approach this problem from a multimodal data fusion perspective and\ninvestigate how different fusion strategies can affect vision-language\nalignment. We discover that compared to the widely used early fusion of\nconditioning text in a pretrained image feature space, a specially designed\nintermediate fusion can: (i) boost text-to-image alignment with improved\ngeneration quality and (ii) improve training and inference efficiency by\nreducing low-rank text-to-image attention calculations. We perform experiments\nusing a text-to-image generation task on the MS-COCO dataset. We compare our\nintermediate fusion mechanism with the classic early fusion mechanism on two\ncommon conditioning methods on a U-shaped ViT backbone. Our intermediate fusion\nmodel achieves a higher CLIP Score and lower FID, with 20% reduced FLOPs, and\n50% increased training speed compared to a strong U-ViT baseline with an early\nfusion.\n","authors":["Zizhao Hu","Shaochong Jia","Mohammad Rostami"],"pdf_url":"https://arxiv.org/pdf/2403.16530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16528v1","updated":"2024-03-25T08:14:22Z","published":"2024-03-25T08:14:22Z","title":"Open-Set Recognition in the Age of Vision-Language Models","summary":" Are vision-language models (VLMs) open-set models because they are trained on\ninternet-scale datasets? We answer this question with a clear no - VLMs\nintroduce closed-set assumptions via their finite query set, making them\nvulnerable to open-set conditions. We systematically evaluate VLMs for open-set\nrecognition and find they frequently misclassify objects not contained in their\nquery set, leading to alarmingly low precision when tuned for high recall and\nvice versa. We show that naively increasing the size of the query set to\ncontain more and more classes does not mitigate this problem, but instead\ncauses diminishing task performance and open-set performance. 
We establish a\nrevised definition of the open-set problem for the age of VLMs, define a new\nbenchmark and evaluation protocol to facilitate standardised evaluation and\nresearch in this important area, and evaluate promising baseline approaches\nbased on predictive uncertainty and dedicated negative embeddings on a range of\nVLM classifiers and object detectors.\n","authors":["Dimity Miller","Niko Sünderhauf","Alex Kenna","Keita Mason"],"pdf_url":"https://arxiv.org/pdf/2403.16528v1.pdf","comment":"31 pages, under review"},{"id":"http://arxiv.org/abs/2403.16526v1","updated":"2024-03-25T08:09:22Z","published":"2024-03-25T08:09:22Z","title":"ModeTv2: GPU-accelerated Motion Decomposition Transformer for Pairwise\n Optimization in Medical Image Registration","summary":" Deformable image registration plays a crucial role in medical imaging, aiding\nin disease diagnosis and image-guided interventions. Traditional iterative\nmethods are slow, while deep learning (DL) accelerates solutions but faces\nusability and precision challenges. This study introduces a pyramid network\nwith the enhanced motion decomposition Transformer (ModeTv2) operator,\nshowcasing superior pairwise optimization (PO) akin to traditional methods. We\nre-implement ModeT operator with CUDA extensions to enhance its computational\nefficiency. We further propose RegHead module which refines deformation fields,\nimproves the realism of deformation and reduces parameters. By adopting the PO,\nthe proposed network balances accuracy, efficiency, and generalizability.\nExtensive experiments on two public brain MRI datasets and one abdominal CT\ndataset demonstrate the network's suitability for PO, providing a DL model with\nenhanced usability and interpretability. The code is publicly available.\n","authors":["Haiqiao Wang","Zhuoyuan Wang","Dong Ni","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08863v3","updated":"2024-03-25T08:05:16Z","published":"2023-11-15T10:49:15Z","title":"Toulouse Hyperspectral Data Set: a benchmark data set to assess\n semi-supervised spectral representation learning and pixel-wise\n classification techniques","summary":" Airborne hyperspectral images can be used to map the land cover in large\nurban areas, thanks to their very high spatial and spectral resolutions on a\nwide spectral domain. While the spectral dimension of hyperspectral images is\nhighly informative of the chemical composition of the land surface, the use of\nstate-of-the-art machine learning algorithms to map the land cover has been\ndramatically limited by the availability of training data. To cope with the\nscarcity of annotations, semi-supervised and self-supervised techniques have\nlately raised a lot of interest in the community. Yet, the publicly available\nhyperspectral data sets commonly used to benchmark machine learning models are\nnot totally suited to evaluate their generalization performances due to one or\nseveral of the following properties: a limited geographical coverage (which\ndoes not reflect the spectral diversity in metropolitan areas), a small number\nof land cover classes and a lack of appropriate standard train / test splits\nfor semi-supervised and self-supervised learning. 
Therefore, we release in this\npaper the Toulouse Hyperspectral Data Set that stands out from other data sets\nin the above-mentioned respects in order to meet key issues in spectral\nrepresentation learning and classification over large-scale hyperspectral\nimages with very few labeled pixels. In addition, we discuss and experiment\nwith self-supervised techniques for spectral representation learning, including\nthe Masked Autoencoder, and establish a baseline for pixel-wise classification\nachieving 85% overall accuracy and 77% F1 score. The Toulouse Hyperspectral\nData Set and our code are publicly available at\nhttps://www.toulouse-hyperspectral-data-set.com and\nhttps://www.github.com/Romain3Ch216/tlse-experiments, respectively.\n","authors":["Romain Thoreau","Laurent Risser","Véronique Achard","Béatrice Berthelot","Xavier Briottet"],"pdf_url":"https://arxiv.org/pdf/2311.08863v3.pdf","comment":"17 pages, 13 figures"},{"id":"http://arxiv.org/abs/2403.16520v1","updated":"2024-03-25T08:02:41Z","published":"2024-03-25T08:02:41Z","title":"CMViM: Contrastive Masked Vim Autoencoder for 3D Multi-modal\n Representation Learning for AD classification","summary":" Alzheimer's disease (AD) is an incurable neurodegenerative condition leading\nto cognitive and functional deterioration. Given the lack of a cure, prompt and\nprecise AD diagnosis is vital, a complex process dependent on multiple factors\nand multi-modal data. While successful efforts have been made to integrate\nmulti-modal representation learning into medical datasets, scant attention has\nbeen given to 3D medical images. In this paper, we propose Contrastive Masked\nVim Autoencoder (CMViM), the first efficient representation learning method\ntailored for 3D multi-modal data. Our proposed framework is built on a masked\nVim autoencoder to learn a unified multi-modal representation and the\nlong-range dependencies contained in 3D medical images. We also introduce an\nintra-modal contrastive learning module to enhance the capability of the\nmulti-modal Vim encoder for modeling the discriminative features in the same\nmodality, and an inter-modal contrastive learning module to alleviate\nmisaligned representation among modalities. Our framework consists of two main\nsteps: 1) incorporate the Vision Mamba (Vim) into the masked autoencoder to\nreconstruct 3D masked multi-modal data efficiently; 2) align the multi-modal\nrepresentations with contrastive learning mechanisms from both intra-modal and\ninter-modal aspects. Our framework is pre-trained on the ADNI2 dataset\nand validated on the downstream task of AD classification. The proposed CMViM\nyields a 2.7\\% AUC performance improvement compared with other state-of-the-art\nmethods.\n","authors":["Guangqian Yang","Kangrui Du","Zhihan Yang","Ye Du","Yongping Zheng","Shujun Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16520v1.pdf","comment":"11 pages, 1 figure"},{"id":"http://arxiv.org/abs/2403.16516v1","updated":"2024-03-25T08:00:43Z","published":"2024-03-25T08:00:43Z","title":"Visually Guided Generative Text-Layout Pre-training for Document\n Intelligence","summary":" Prior studies show that pre-training techniques can boost the performance of\nvisual document understanding (VDU), which typically requires models to gain\nabilities to perceive and reason over both document texts and layouts (e.g.,\nlocations of texts and table-cells). To this end, we propose visually guided\ngenerative text-layout pre-training, named ViTLP. 
Given a document image, the\nmodel optimizes hierarchical language and layout modeling objectives to\ngenerate the interleaved text and layout sequence. In addition, to address the\nlimitation of Transformers in processing long documents, we introduce a\nstraightforward yet effective multi-segment generative pre-training scheme,\nenabling ViTLP to process word-intensive documents of any length. ViTLP can\nfunction as a native OCR model to localize and recognize texts in document\nimages. Besides, ViTLP can be effectively applied to various downstream VDU\ntasks. Extensive experiments show that ViTLP achieves competitive performance\ncompared with existing baselines on benchmark VDU tasks, including information\nextraction, document classification, and document question answering.\n","authors":["Zhiming Mao","Haoli Bai","Lu Hou","Jiansheng Wei","Xin Jiang","Qun Liu","Kam-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2403.16516v1.pdf","comment":"Accepted to NAACL 2024 main conference. The first version of this\n paper was submitted to OpenReview\n (https://openreview.net/forum?id=ARtBIBAmNR) in June 2023"},{"id":"http://arxiv.org/abs/2403.16513v1","updated":"2024-03-25T07:58:58Z","published":"2024-03-25T07:58:58Z","title":"Let Real Images be as a Judger, Spotting Fake Images Synthesized with\n Generative Models","summary":" In the last few years, generative models have shown their powerful\ncapabilities in synthesizing realistic images in both quality and diversity\n(e.g., facial images and natural subjects). Unfortunately, the artifact\npatterns in fake images synthesized by different generative models are\ninconsistent, leading to the failure of previous research that relied on\nspotting subtle differences between real and fake images. In our preliminary\nexperiments, we find that the artifacts in fake images always change with the\ndevelopment of the generative model, while natural images exhibit stable\nstatistical properties. In this paper, we employ natural traces shared only by\nreal images as an additional predictive target in the detector. Specifically,\nthe natural traces are learned from wild real images and we introduce\nextended supervised contrastive learning to bring them closer to real images\nand further away from fake ones. This motivates the detector to make decisions\nbased on the proximity of images to the natural traces. To conduct a\ncomprehensive experiment, we built a high-quality and diverse dataset that\ncovers 6 GAN and 6 diffusion models, to evaluate the effectiveness in\ngeneralizing to unknown forgery techniques and the robustness to different\ntransformations. Experimental results show that our proposed method achieves\n96.1% mAP, significantly outperforming the baselines. Extensive experiments\nconducted on the widely recognized platform Midjourney reveal that our proposed\nmethod achieves an accuracy exceeding 78.4%, underscoring its practicality for\nreal-world application deployment. 
The source\ncode and partial self-built dataset are available in supplementary material.\n","authors":["Ziyou Liang","Run Wang","Weifeng Liu","Yuyang Zhang","Wenyuan Yang","Lina Wang","Xingkai Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16510v1","updated":"2024-03-25T07:54:18Z","published":"2024-03-25T07:54:18Z","title":"Make-Your-Anchor: A Diffusion-based 2D Avatar Generation Framework","summary":" Despite the remarkable progress of talking-head-based avatar-creating\nsolutions, directly generating anchor-style videos with full-body motions\nremains challenging. In this study, we propose Make-Your-Anchor, a novel system\nnecessitating only a one-minute video clip of an individual for training,\nsubsequently enabling the automatic generation of anchor-style videos with\nprecise torso and hand movements. Specifically, we finetune a proposed\nstructure-guided diffusion model on input video to render 3D mesh conditions\ninto human appearances. We adopt a two-stage training strategy for the\ndiffusion model, effectively binding movements with specific appearances. To\nproduce arbitrarily long videos, we extend the 2D U-Net in the frame-wise\ndiffusion model to a 3D style without additional training cost, and a simple\nyet effective batch-overlapped temporal denoising module is proposed to bypass\nthe constraints on video length during inference. Finally, a novel\nidentity-specific face enhancement module is introduced to improve the visual\nquality of facial regions in the output videos. Comparative experiments\ndemonstrate the effectiveness and superiority of the system in terms of visual\nquality, temporal coherence, and identity preservation, outperforming SOTA\ndiffusion/non-diffusion methods. Project page:\n\\url{https://github.com/ICTMCG/Make-Your-Anchor}.\n","authors":["Ziyao Huang","Fan Tang","Yong Zhang","Xiaodong Cun","Juan Cao","Jintao Li","Tong-Yee Lee"],"pdf_url":"https://arxiv.org/pdf/2403.16510v1.pdf","comment":"accepted at CVPR2024"},{"id":"http://arxiv.org/abs/2305.01309v2","updated":"2024-03-25T07:53:54Z","published":"2023-05-02T10:35:20Z","title":"Geometric Prior Based Deep Human Point Cloud Geometry Compression","summary":" The emergence of digital avatars has driven an exponential increase in the\ndemand for human point clouds with realistic and intricate details. The\ncompression of such data becomes challenging with overwhelming data amounts\ncomprising millions of points. Herein, we leverage the human geometric prior to\nremove geometric redundancy from point clouds, greatly improving the compression\nperformance. More specifically, the prior provides topological constraints as\ngeometry initialization, allowing adaptive adjustments with a compact parameter\nset that could be represented with only a few bits. Therefore, we can envisage\nhigh-resolution human point clouds as a combination of geometric priors and\nstructural deviations. The priors could first be derived with an aligned point\ncloud, and subsequently the difference of features is compressed into a compact\nlatent code. The proposed framework can operate in a plug-and-play fashion with\nexisting learning-based point cloud compression methods. 
Extensive experimental\nresults show that our approach significantly improves the compression\nperformance without deteriorating the quality, demonstrating its promise in a\nvariety of applications.\n","authors":["Xinju Wu","Pingping Zhang","Meng Wang","Peilin Chen","Shiqi Wang","Sam Kwong"],"pdf_url":"https://arxiv.org/pdf/2305.01309v2.pdf","comment":"Accepted by TCSVT 2024"},{"id":"http://arxiv.org/abs/2311.17315v3","updated":"2024-03-25T07:51:14Z","published":"2023-11-29T02:10:31Z","title":"Explaining CLIP's performance disparities on data from blind/low vision\n users","summary":" Large multi-modal models (LMMs) hold the potential to usher in a new era of\nautomated visual assistance for people who are blind or low vision (BLV). Yet,\nthese models have not been systematically evaluated on data captured by BLV\nusers. We address this by empirically assessing CLIP, a widely-used LMM likely\nto underpin many assistive technologies. Testing 25 CLIP variants in a\nzero-shot classification task, we find that their accuracy is 15 percentage\npoints lower on average for images captured by BLV users than web-crawled\nimages. This disparity stems from CLIP's sensitivities to 1) image content\n(e.g. not recognizing disability objects as well as other objects); 2) image\nquality (e.g. not being robust to lighting variation); and 3) text content\n(e.g. not recognizing objects described by tactile adjectives as well as visual\nones). We delve deeper with a textual analysis of three common pre-training\ndatasets: LAION-400M, LAION-2B and DataComp-1B, showing that disability content\nis rarely mentioned. We then provide three examples that illustrate how the\nperformance disparities extend to three downstream models underpinned by CLIP:\nOWL-ViT, CLIPSeg and DALL-E2. We find that few-shot learning with as few as 5\nimages can mitigate CLIP's quality-of-service disparities for BLV users in some\nscenarios, which we discuss alongside a set of other possible mitigations.\n","authors":["Daniela Massiceti","Camilla Longden","Agnieszka Słowik","Samuel Wills","Martin Grayson","Cecily Morrison"],"pdf_url":"https://arxiv.org/pdf/2311.17315v3.pdf","comment":"Accepted at 2024 IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR)"},{"id":"http://arxiv.org/abs/2403.16502v1","updated":"2024-03-25T07:35:28Z","published":"2024-03-25T07:35:28Z","title":"Medical Image Registration and Its Application in Retinal Images: A\n Review","summary":" Medical image registration is vital for disease diagnosis and treatment with\nits ability to merge diverse information of images, which may be captured under\ndifferent times, angles, or modalities. Although several surveys have reviewed\nthe development of medical image registration, these surveys have not\nsystematically summarized methodologies of existing medical image registration\nmethods. To this end, we provide a comprehensive review of these methods from\ntraditional and deep learning-based directions, aiming to help audiences\nunderstand the development of medical image registration quickly. In\nparticular, we review recent advances in retinal image registration at the end\nof each section, which has not attracted much attention. 
Additionally, we also\ndiscuss the current challenges of retinal image registration and provide\ninsights and prospects for future research.\n","authors":["Qiushi Nie","Xiaoqing Zhang","Yan Hu","Mingdao Gong","Jiang Liu"],"pdf_url":"https://arxiv.org/pdf/2403.16502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16499v1","updated":"2024-03-25T07:34:06Z","published":"2024-03-25T07:34:06Z","title":"Self-Supervised Learning for Medical Image Data with Anatomy-Oriented\n Imaging Planes","summary":" Self-supervised learning has emerged as a powerful tool for pretraining deep\nnetworks on unlabeled data, prior to transfer learning of target tasks with\nlimited annotation. The relevance between the pretraining pretext and target\ntasks is crucial to the success of transfer learning. Various pretext tasks\nhave been proposed to utilize properties of medical image data (e.g., three\ndimensionality), which are more relevant to medical image analysis than generic\nones for natural images. However, previous work rarely paid attention to data\nwith anatomy-oriented imaging planes, e.g., standard cardiac magnetic resonance\nimaging views. As these imaging planes are defined according to the anatomy of\nthe imaged organ, pretext tasks effectively exploiting this information can\npretrain the networks to gain knowledge on the organ of interest. In this work,\nwe propose two complementary pretext tasks for this group of medical image data\nbased on the spatial relationship of the imaging planes. The first is to learn\nthe relative orientation between the imaging planes and implemented as\nregressing their intersecting lines. The second exploits parallel imaging\nplanes to regress their relative slice locations within a stack. Both pretext\ntasks are conceptually straightforward and easy to implement, and can be\ncombined in multitask learning for better representation learning. Thorough\nexperiments on two anatomical structures (heart and knee) and representative\ntarget tasks (semantic segmentation and classification) demonstrate that the\nproposed pretext tasks are effective in pretraining deep networks for\nremarkably boosted performance on the target tasks, and superior to other\nrecent approaches.\n","authors":["Tianwei Zhang","Dong Wei","Mengmeng Zhua","Shi Gu","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.16499v1.pdf","comment":"Medical Image Analysis"},{"id":"http://arxiv.org/abs/2403.16497v1","updated":"2024-03-25T07:29:18Z","published":"2024-03-25T07:29:18Z","title":"PathoTune: Adapting Visual Foundation Model to Pathological Specialists","summary":" As natural image understanding moves towards the pretrain-finetune era,\nresearch in pathology imaging is concurrently evolving. Despite the predominant\nfocus on pretraining pathological foundation models, how to adapt foundation\nmodels to downstream tasks is little explored. For downstream adaptation, we\npropose the existence of two domain gaps, i.e., the Foundation-Task Gap and the\nTask-Instance Gap. To mitigate these gaps, we introduce PathoTune, a framework\ndesigned to efficiently adapt pathological or even visual foundation models to\npathology-specific tasks via multi-modal prompt tuning. The proposed framework\nleverages Task-specific Visual Prompts and Task-specific Textual Prompts to\nidentify task-relevant features, along with Instance-specific Visual Prompts\nfor encoding single pathological image features. 
Results across multiple\ndatasets at both patch-level and WSI-level demonstrate its superior performance\nover single-modality prompt tuning approaches. Significantly, PathoTune\nfacilitates the direct adaptation of natural visual foundation models to\npathological tasks, drastically outperforming pathological foundation models\nwith simple linear probing. The code will be available upon acceptance.\n","authors":["Jiaxuan Lu","Fang Yan","Xiaofan Zhang","Yue Gao","Shaoting Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.16497v1.pdf","comment":"Submitted to MICCAI 2024"},{"id":"http://arxiv.org/abs/2403.16494v1","updated":"2024-03-25T07:22:22Z","published":"2024-03-25T07:22:22Z","title":"CT-Bound: Fast Boundary Estimation From Noisy Images Via Hybrid\n Convolution and Transformer Neural Networks","summary":" We present CT-Bound, a fast boundary estimation method for noisy images using\na hybrid Convolution and Transformer neural network. The proposed architecture\ndecomposes boundary estimation into two tasks: local detection and global\nregularization of image boundaries. It first estimates a parametric\nrepresentation of boundary structures only using the input image within a small\nreceptive field and then refines the boundary structure in the parameter domain\nwithout accessing the input image. Because of this, a part of the network can\nbe easily trained using naive, synthetic images and still generalized to real\nimages, and the entire architecture is computationally efficient as the\nboundary refinement is non-iterative and not in the image domain. Compared with\nthe previous highest accuracy methods, our experiment shows that CT-Bound is\n100 times faster, producing comparably accurate, high-quality boundary and\ncolor maps. We also demonstrate that CT-Bound can produce boundary and color\nmaps on real captured images without extra fine-tuning and real-time boundary\nmap and color map videos at ten frames per second.\n","authors":["Wei Xu","Junjie Luo","Qi Guo"],"pdf_url":"https://arxiv.org/pdf/2403.16494v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.16481v1","updated":"2024-03-25T07:07:50Z","published":"2024-03-25T07:07:50Z","title":"REFRAME: Reflective Surface Real-Time Rendering for Mobile Devices","summary":" This work tackles the challenging task of achieving real-time novel view\nsynthesis on various scenes, including highly reflective objects and unbounded\noutdoor scenes. Existing real-time rendering methods, especially those based on\nmeshes, often have subpar performance in modeling surfaces with rich\nview-dependent appearances. Our key idea lies in leveraging meshes for\nrendering acceleration while incorporating a novel approach to parameterize\nview-dependent information. We decompose the color into diffuse and specular,\nand model the specular color in the reflected direction based on a neural\nenvironment map. 
Our experiments demonstrate that our method achieves\ncomparable reconstruction quality for highly reflective surfaces compared to\nstate-of-the-art offline methods, while also efficiently enabling real-time\nrendering on edge devices such as smartphones.\n","authors":["Chaojie Ji","Yufeng Li","Yiyi Liao"],"pdf_url":"https://arxiv.org/pdf/2403.16481v1.pdf","comment":"Project Page:https://xdimlab.github.io/REFRAME/"},{"id":"http://arxiv.org/abs/2403.06606v2","updated":"2024-03-25T06:57:57Z","published":"2024-03-11T10:50:53Z","title":"Distributionally Generative Augmentation for Fair Facial Attribute\n Classification","summary":" Facial Attribute Classification (FAC) holds substantial promise in widespread\napplications. However, FAC models trained by traditional methodologies can be\nunfair by exhibiting accuracy inconsistencies across varied data\nsubpopulations. This unfairness is largely attributed to bias in data, where\nsome spurious attributes (e.g., Male) statistically correlate with the target\nattribute (e.g., Smiling). Most of existing fairness-aware methods rely on the\nlabels of spurious attributes, which may be unavailable in practice. This work\nproposes a novel, generation-based two-stage framework to train a fair FAC\nmodel on biased data without additional annotation. Initially, we identify the\npotential spurious attributes based on generative models. Notably, it enhances\ninterpretability by explicitly showing the spurious attributes in image space.\nFollowing this, for each image, we first edit the spurious attributes with a\nrandom degree sampled from a uniform distribution, while keeping target\nattribute unchanged. Then we train a fair FAC model by fostering model\ninvariance to these augmentation. Extensive experiments on three common\ndatasets demonstrate the effectiveness of our method in promoting fairness in\nFAC without compromising accuracy. Codes are in\nhttps://github.com/heqianpei/DiGA.\n","authors":["Fengda Zhang","Qianpei He","Kun Kuang","Jiashuo Liu","Long Chen","Chao Wu","Jun Xiao","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.06606v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.10066v2","updated":"2024-03-25T06:27:57Z","published":"2024-03-15T07:16:07Z","title":"Contrastive Pre-Training with Multi-View Fusion for No-Reference Point\n Cloud Quality Assessment","summary":" No-reference point cloud quality assessment (NR-PCQA) aims to automatically\nevaluate the perceptual quality of distorted point clouds without available\nreference, which have achieved tremendous improvements due to the utilization\nof deep neural networks. However, learning-based NR-PCQA methods suffer from\nthe scarcity of labeled data and usually perform suboptimally in terms of\ngeneralization. To solve the problem, we propose a novel contrastive\npre-training framework tailored for PCQA (CoPA), which enables the pre-trained\nmodel to learn quality-aware representations from unlabeled data. To obtain\nanchors in the representation space, we project point clouds with different\ndistortions into images and randomly mix their local patches to form mixed\nimages with multiple distortions. Utilizing the generated anchors, we constrain\nthe pre-training process via a quality-aware contrastive loss following the\nphilosophy that perceptual quality is closely related to both content and\ndistortion. 
Furthermore, in the model fine-tuning stage, we propose a\nsemantic-guided multi-view fusion module to effectively integrate the features\nof projected images from multiple perspectives. Extensive experiments show that\nour method outperforms the state-of-the-art PCQA methods on popular benchmarks.\nFurther investigations demonstrate that CoPA can also benefit existing\nlearning-based PCQA models.\n","authors":["Ziyu Shan","Yujie Zhang","Qi Yang","Haichen Yang","Yiling Xu","Jenq-Neng Hwang","Xiaozhong Xu","Shan Liu"],"pdf_url":"https://arxiv.org/pdf/2403.10066v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16450v1","updated":"2024-03-25T06:22:27Z","published":"2024-03-25T06:22:27Z","title":"Camera-aware Label Refinement for Unsupervised Person Re-identification","summary":" Unsupervised person re-identification aims to retrieve images of a specified\nperson without identity labels. Many recent unsupervised Re-ID approaches adopt\nclustering-based methods to measure cross-camera feature similarity to roughly\ndivide images into clusters. They ignore the feature distribution discrepancy\ninduced by camera domain gap, resulting in the unavoidable performance\ndegradation. Camera information is usually available, and the feature\ndistribution in the single camera usually focuses more on the appearance of the\nindividual and has less intra-identity variance. Inspired by the observation,\nwe introduce a \\textbf{C}amera-\\textbf{A}ware \\textbf{L}abel\n\\textbf{R}efinement~(CALR) framework that reduces camera discrepancy by\nclustering intra-camera similarity. Specifically, we employ intra-camera\ntraining to obtain reliable local pseudo labels within each camera, and then\nrefine global labels generated by inter-camera clustering and train the\ndiscriminative model using more reliable global pseudo labels in a self-paced\nmanner. Meanwhile, we develop a camera-alignment module to align feature\ndistributions under different cameras, which could help deal with the camera\nvariance further. Extensive experiments validate the superiority of our\nproposed method over state-of-the-art approaches. The code is accessible at\nhttps://github.com/leeBooMla/CALR.\n","authors":["Pengna Li","Kangyi Wu","Wenli Huang","Sanping Zhou","Jinjun Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16450v1.pdf","comment":"submitted to IEEE TMM"},{"id":"http://arxiv.org/abs/2312.02480v2","updated":"2024-03-25T06:22:09Z","published":"2023-12-05T04:13:31Z","title":"Differentiable Point-based Inverse Rendering","summary":" We present differentiable point-based inverse rendering, DPIR, an\nanalysis-by-synthesis method that processes images captured under diverse\nilluminations to estimate shape and spatially-varying BRDF. To this end, we\nadopt point-based rendering, eliminating the need for multiple samplings per\nray, typical of volumetric rendering, thus significantly enhancing the speed of\ninverse rendering. To realize this idea, we devise a hybrid point-volumetric\nrepresentation for geometry and a regularized basis-BRDF representation for\nreflectance. The hybrid geometric representation enables fast rendering through\npoint-based splatting while retaining the geometric details and stability\ninherent to SDF-based representations. The regularized basis-BRDF mitigates the\nill-posedness of inverse rendering stemming from limited light-view angular\nsamples. We also propose an efficient shadow detection method using point-based\nshadow map rendering. 
Our extensive evaluations demonstrate that DPIR\noutperforms prior works in terms of reconstruction accuracy, computational\nefficiency, and memory footprint. Furthermore, our explicit point-based\nrepresentation and rendering enables intuitive geometry and reflectance\nediting.\n","authors":["Hoon-Gyu Chung","Seokjun Choi","Seung-Hwan Baek"],"pdf_url":"https://arxiv.org/pdf/2312.02480v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16442v1","updated":"2024-03-25T06:05:50Z","published":"2024-03-25T06:05:50Z","title":"If CLIP Could Talk: Understanding Vision-Language Model Representations\n Through Their Preferred Concept Descriptions","summary":" Recent works often assume that Vision-Language Model (VLM) representations\nare based on visual attributes like shape. However, it is unclear to what\nextent VLMs prioritize this information to represent concepts. We propose\nExtract and Explore (EX2), a novel approach to characterize important textual\nfeatures for VLMs. EX2 uses reinforcement learning to align a large language\nmodel with VLM preferences and generates descriptions that incorporate the\nimportant features for the VLM. Then, we inspect the descriptions to identify\nthe features that contribute to VLM representations. We find that spurious\ndescriptions have a major role in VLM representations despite providing no\nhelpful information, e.g., Click to enlarge photo of CONCEPT. More importantly,\namong informative descriptions, VLMs rely significantly on non-visual\nattributes like habitat to represent visual concepts. Also, our analysis\nreveals that different VLMs prioritize different attributes in their\nrepresentations. Overall, we show that VLMs do not simply match images to scene\ndescriptions and that non-visual or even spurious descriptions significantly\ninfluence their representations.\n","authors":["Reza Esfandiarpoor","Cristina Menghini","Stephen H. Bach"],"pdf_url":"https://arxiv.org/pdf/2403.16442v1.pdf","comment":"Code: https://github.com/BatsResearch/ex2"},{"id":"http://arxiv.org/abs/2310.14566v5","updated":"2024-03-25T06:05:24Z","published":"2023-10-23T04:49:09Z","title":"HallusionBench: An Advanced Diagnostic Suite for Entangled Language\n Hallucination and Visual Illusion in Large Vision-Language Models","summary":" We introduce HallusionBench, a comprehensive benchmark designed for the\nevaluation of image-context reasoning. This benchmark presents significant\nchallenges to advanced large visual-language models (LVLMs), such as\nGPT-4V(Vision), Gemini Pro Vision, Claude 3, and LLaVA-1.5, by emphasizing\nnuanced understanding and interpretation of visual data. The benchmark\ncomprises 346 images paired with 1129 questions, all meticulously crafted by\nhuman experts. We introduce a novel structure for these visual questions\ndesigned to establish control groups. This structure enables us to conduct a\nquantitative analysis of the models' response tendencies, logical consistency,\nand various failure modes. In our evaluation on HallusionBench, we benchmarked\n15 different models, highlighting a 31.42% question-pair accuracy achieved by\nthe state-of-the-art GPT-4V. Notably, all other evaluated models achieve\naccuracy below 16%. Moreover, our analysis not only highlights the observed\nfailure modes, including language hallucination and visual illusion, but also\ndeepens an understanding of these pitfalls. Our comprehensive case studies\nwithin HallusionBench shed light on the challenges of hallucination and\nillusion in LVLMs. 
Based on these insights, we suggest potential pathways for\ntheir future improvement. The benchmark and codebase can be accessed at\nhttps://github.com/tianyi-lab/HallusionBench.\n","authors":["Tianrui Guan","Fuxiao Liu","Xiyang Wu","Ruiqi Xian","Zongxia Li","Xiaoyu Liu","Xijun Wang","Lichang Chen","Furong Huang","Yaser Yacoob","Dinesh Manocha","Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.14566v5.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16440v1","updated":"2024-03-25T06:02:05Z","published":"2024-03-25T06:02:05Z","title":"RCBEVDet: Radar-camera Fusion in Bird's Eye View for 3D Object Detection","summary":" Three-dimensional object detection is one of the key tasks in autonomous\ndriving. To reduce costs in practice, low-cost multi-view cameras for 3D object\ndetection are proposed to replace the expensive LiDAR sensors. However, it is\ndifficult to achieve highly accurate and robust 3D object detection relying\nsolely on cameras. An effective solution to this issue is combining multi-view\ncameras with the economical millimeter-wave radar sensor to achieve more\nreliable multi-modal 3D object detection. In this paper, we introduce RCBEVDet,\na radar-camera fusion 3D object detection method in the bird's eye view (BEV).\nSpecifically, we first design RadarBEVNet for radar BEV feature extraction.\nRadarBEVNet consists of a dual-stream radar backbone and a Radar Cross-Section\n(RCS) aware BEV encoder. In the dual-stream radar backbone, a point-based\nencoder and a transformer-based encoder are proposed to extract radar features,\nwith an injection and extraction module to facilitate communication between the\ntwo encoders. The RCS-aware BEV encoder uses the RCS as an object-size prior\nwhen scattering the point feature in BEV. Besides, we present the Cross-Attention\nMulti-layer Fusion module to automatically align the multi-modal BEV feature\nfrom radar and camera with the deformable attention mechanism, and then fuse\nthe feature with channel and spatial fusion layers. Experimental results show\nthat RCBEVDet achieves new state-of-the-art radar-camera fusion results on the\nnuScenes and view-of-delft (VoD) 3D object detection benchmarks. Furthermore,\nRCBEVDet achieves better 3D detection results than all real-time camera-only\nand radar-camera 3D object detectors with a faster inference speed of 21~28\nFPS. The source code will be released at https://github.com/VDIGPKU/RCBEVDet.\n","authors":["Zhiwei Lin","Zhe Liu","Zhongyu Xia","Xinhao Wang","Yongtao Wang","Shengxiang Qi","Yang Dong","Nan Dong","Le Zhang","Ce Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.16440v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.16439v1","updated":"2024-03-25T05:58:33Z","published":"2024-03-25T05:58:33Z","title":"Producing and Leveraging Online Map Uncertainty in Trajectory Prediction","summary":" High-definition (HD) maps have played an integral role in the development of\nmodern autonomous vehicle (AV) stacks, albeit with high associated labeling and\nmaintenance costs. As a result, many recent works have proposed methods for\nestimating HD maps online from sensor data, enabling AVs to operate outside of\npreviously-mapped regions. However, current online map estimation approaches\nare developed in isolation from their downstream tasks, complicating their\nintegration into AV stacks. In particular, they do not produce uncertainty or\nconfidence estimates. 
In this work, we extend multiple state-of-the-art online\nmap estimation methods to additionally estimate uncertainty and show how this\nenables more tightly integrating online mapping with trajectory forecasting. In\ndoing so, we find that incorporating uncertainty yields up to 50% faster\ntraining convergence and up to 15% better prediction performance on the\nreal-world nuScenes driving dataset.\n","authors":["Xunjiang Gu","Guanyu Song","Igor Gilitschenski","Marco Pavone","Boris Ivanovic"],"pdf_url":"https://arxiv.org/pdf/2403.16439v1.pdf","comment":"14 pages, 14 figures, 6 tables. CVPR 2024"},{"id":"http://arxiv.org/abs/2403.07371v2","updated":"2024-03-25T05:48:28Z","published":"2024-03-12T07:15:29Z","title":"Time-Efficient and Identity-Consistent Virtual Try-On Using A Variant of\n Altered Diffusion Models","summary":" This study discusses the critical issues of Virtual Try-On in contemporary\ne-commerce and the prospective metaverse, emphasizing the challenges of\npreserving intricate texture details and distinctive features of the target\nperson and the clothes in various scenarios, such as clothing texture and\nidentity characteristics like tattoos or accessories. In addition to the\nfidelity of the synthesized images, the efficiency of the synthesis process\npresents a significant hurdle. Various existing approaches are explored,\nhighlighting the limitations and unresolved aspects, e.g., identity information\nomission, uncontrollable artifacts, and low synthesis speed. It then proposes a\nnovel diffusion-based solution that addresses garment texture preservation and\nuser identity retention during virtual try-on. The proposed network comprises\ntwo primary modules - a warping module aligning clothing with individual\nfeatures and a try-on module refining the attire and generating missing parts\nintegrated with a mask-aware post-processing technique ensuring the integrity\nof the individual's identity. It demonstrates impressive results, surpassing\nthe state-of-the-art in speed by nearly 20 times during inference, with\nsuperior fidelity in qualitative assessments. Quantitative evaluations confirm\ncomparable performance with the recent SOTA method on the VITON-HD and\nDresscode datasets.\n","authors":["Phuong Dam","Jihoon Jeong","Anh Tran","Daeyoung Kim"],"pdf_url":"https://arxiv.org/pdf/2403.07371v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16438v1","updated":"2024-03-25T05:46:06Z","published":"2024-03-25T05:46:06Z","title":"Real-time Neuron Segmentation for Voltage Imaging","summary":" In voltage imaging, where the membrane potentials of individual neurons are\nrecorded at from hundreds to thousand frames per second using fluorescence\nmicroscopy, data processing presents a challenge. Even a fraction of a minute\nof recording with a limited image size yields gigabytes of video data\nconsisting of tens of thousands of frames, which can be time-consuming to\nprocess. Moreover, millisecond-level short exposures lead to noisy video\nframes, obscuring neuron footprints especially in deep-brain samples where\nnoisy signals are buried in background fluorescence. To address this challenge,\nwe propose a fast neuron segmentation method able to detect multiple,\npotentially overlapping, spiking neurons from noisy video frames, and implement\na data processing pipeline incorporating the proposed segmentation method along\nwith GPU-accelerated motion correction. 
By testing on existing datasets as well\nas on new datasets we introduce, we show that our pipeline extracts neuron\nfootprints that agree well with human annotation even from cluttered datasets,\nand demonstrate real-time processing of voltage imaging data on a single\ndesktop computer for the first time.\n","authors":["Yosuke Bando","Ramdas Pillai","Atsushi Kajita","Farhan Abdul Hakeem","Yves Quemener","Hua-an Tseng","Kiryl D. Piatkevich","Changyang Linghu","Xue Han","Edward S. Boyden"],"pdf_url":"https://arxiv.org/pdf/2403.16438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06199v4","updated":"2024-03-25T05:36:56Z","published":"2024-03-10T12:43:27Z","title":"Mipha: A Comprehensive Overhaul of Multimodal Assistant with Small\n Language Models","summary":" Multimodal Large Language Models (MLLMs) have showcased impressive skills in\ntasks related to visual understanding and reasoning. Yet, their widespread\napplication faces obstacles due to the high computational demands during both\nthe training and inference phases, restricting their use to a limited audience\nwithin the research and user communities. In this paper, we investigate the\ndesign aspects of Multimodal Small Language Models (MSLMs) and propose an\nefficient multimodal assistant named Mipha, which is designed to create synergy\namong various aspects: visual representation, language models, and optimization\nstrategies. We show that without increasing the volume of training data, our\nMipha-3B outperforms the state-of-the-art large MLLMs, especially\nLLaVA-1.5-13B, on multiple benchmarks. Through detailed discussion, we provide\ninsights and guidelines for developing strong MSLMs that rival the capabilities\nof MLLMs. Our code is available at https://github.com/zhuyiche/llava-phi.\n","authors":["Minjie Zhu","Yichen Zhu","Xin Liu","Ning Liu","Zhiyuan Xu","Chaomin Shen","Yaxin Peng","Zhicai Ou","Feifei Feng","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2403.06199v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18287v2","updated":"2024-03-25T05:34:58Z","published":"2023-11-30T06:45:52Z","title":"Dispersed Structured Light for Hyperspectral 3D Imaging","summary":" Hyperspectral 3D imaging aims to acquire both depth and spectral information\nof a scene. However, existing methods are either prohibitively expensive and\nbulky or compromise on spectral and depth accuracy. In this work, we present\nDispersed Structured Light (DSL), a cost-effective and compact method for\naccurate hyperspectral 3D imaging. DSL modifies a traditional projector-camera\nsystem by placing a sub-millimeter thick diffraction grating film front of the\nprojector. The grating disperses structured light based on light wavelength. To\nutilize the dispersed structured light, we devise a model for dispersive\nprojection image formation and a per-pixel hyperspectral 3D reconstruction\nmethod. We validate DSL by instantiating a compact experimental prototype. DSL\nachieves spectral accuracy of 18.8nm full-width half-maximum (FWHM) and depth\nerror of 1mm. We demonstrate that DSL outperforms prior work on practical\nhyperspectral 3D imaging. 
DSL promises accurate and practical hyperspectral 3D\nimaging for diverse application domains, including computer vision and\ngraphics, cultural heritage, geology, and biology.\n","authors":["Suhyun Shin","Seokjun Choi","Felix Heide","Seung-Hwan Baek"],"pdf_url":"https://arxiv.org/pdf/2311.18287v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16431v1","updated":"2024-03-25T05:22:34Z","published":"2024-03-25T05:22:34Z","title":"DOCTR: Disentangled Object-Centric Transformer for Point Scene\n Understanding","summary":" Point scene understanding is a challenging task to process real-world scene\npoint cloud, which aims at segmenting each object, estimating its pose, and\nreconstructing its mesh simultaneously. Recent state-of-the-art method first\nsegments each object and then processes them independently with multiple stages\nfor the different sub-tasks. This leads to a complex pipeline to optimize and\nmakes it hard to leverage the relationship constraints between multiple\nobjects. In this work, we propose a novel Disentangled Object-Centric\nTRansformer (DOCTR) that explores object-centric representation to facilitate\nlearning with multiple objects for the multiple sub-tasks in a unified manner.\nEach object is represented as a query, and a Transformer decoder is adapted to\niteratively optimize all the queries involving their relationship. In\nparticular, we introduce a semantic-geometry disentangled query (SGDQ) design\nthat enables the query features to attend separately to semantic information\nand geometric information relevant to the corresponding sub-tasks. A hybrid\nbipartite matching module is employed to well use the supervisions from all the\nsub-tasks during training. Qualitative and quantitative experimental results\ndemonstrate that our method achieves state-of-the-art performance on the\nchallenging ScanNet dataset. Code is available at\nhttps://github.com/SAITPublic/DOCTR.\n","authors":["Xiaoxuan Yu","Hao Wang","Weiming Li","Qiang Wang","Soonyong Cho","Younghun Sung"],"pdf_url":"https://arxiv.org/pdf/2403.16431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13964v3","updated":"2024-03-25T05:18:04Z","published":"2023-12-21T15:51:12Z","title":"PIA: Your Personalized Image Animator via Plug-and-Play Modules in\n Text-to-Image Models","summary":" Recent advancements in personalized text-to-image (T2I) models have\nrevolutionized content creation, empowering non-experts to generate stunning\nimages with unique styles. While promising, adding realistic motions into these\npersonalized images by text poses significant challenges in preserving distinct\nstyles, high-fidelity details, and achieving motion controllability by text. In\nthis paper, we present PIA, a Personalized Image Animator that excels in\naligning with condition images, achieving motion controllability by text, and\nthe compatibility with various personalized T2I models without specific tuning.\nTo achieve these goals, PIA builds upon a base T2I model with well-trained\ntemporal alignment layers, allowing for the seamless transformation of any\npersonalized T2I model into an image animation model. A key component of PIA is\nthe introduction of the condition module, which utilizes the condition frame\nand inter-frame affinity as input to transfer appearance information guided by\nthe affinity hint for individual frame synthesis in the latent space. 
This\ndesign mitigates the challenges of appearance-related image alignment within\nand allows for a stronger focus on aligning with motion-related guidance.\n","authors":["Yiming Zhang","Zhening Xing","Yanhong Zeng","Youqing Fang","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2312.13964v3.pdf","comment":"Project page: https://pi-animator.github.io/"},{"id":"http://arxiv.org/abs/2403.16428v1","updated":"2024-03-25T05:12:21Z","published":"2024-03-25T05:12:21Z","title":"Benchmarks and Challenges in Pose Estimation for Egocentric Hand\n Interactions with Objects","summary":" We interact with the world with our hands and see it through our own\n(egocentric) perspective. A holistic 3D understanding of such interactions from\negocentric views is important for tasks in robotics, AR/VR, action recognition\nand motion generation. Accurately reconstructing such interactions in 3D is\nchallenging due to heavy occlusion, viewpoint bias, camera distortion, and\nmotion blur from the head movement. To this end, we designed the HANDS23\nchallenge based on the AssemblyHands and ARCTIC datasets with carefully\ndesigned training and testing splits. Based on the results of the top submitted\nmethods and more recent baselines on the leaderboards, we perform a thorough\nanalysis on 3D hand(-object) reconstruction tasks. Our analysis demonstrates\nthe effectiveness of addressing distortion specific to egocentric cameras,\nadopting high-capacity transformers to learn complex hand-object interactions,\nand fusing predictions from different views. Our study further reveals\nchallenging scenarios intractable with state-of-the-art methods, such as fast\nhand motion, object reconstruction from narrow egocentric views, and close\ncontact between two hands and objects. Our efforts will enrich the community's\nknowledge foundation and facilitate future hand studies on egocentric\nhand-object interactions.\n","authors":["Zicong Fan","Takehiko Ohkawa","Linlin Yang","Nie Lin","Zhishan Zhou","Shihao Zhou","Jiajun Liang","Zhong Gao","Xuanyang Zhang","Xue Zhang","Fei Li","Liu Zheng","Feng Lu","Karim Abou Zeid","Bastian Leibe","Jeongwan On","Seungryul Baek","Aditya Prakash","Saurabh Gupta","Kun He","Yoichi Sato","Otmar Hilliges","Hyung Jin Chang","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2403.16428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16425v1","updated":"2024-03-25T05:10:34Z","published":"2024-03-25T05:10:34Z","title":"Enhancing Visual Place Recognition via Fast and Slow Adaptive Biasing in\n Event Cameras","summary":" Event cameras are increasingly popular in robotics due to their beneficial\nfeatures, such as low latency, energy efficiency, and high dynamic range.\nNevertheless, their downstream task performance is greatly influenced by the\noptimization of bias parameters. These parameters, for instance, regulate the\nnecessary change in light intensity to trigger an event, which in turn depends\non factors such as the environment lighting and camera motion. This paper\nintroduces feedback control algorithms that automatically tune the bias\nparameters through two interacting methods: 1) An immediate, on-the-fly fast\nadaptation of the refractory period, which sets the minimum interval between\nconsecutive events, and 2) if the event rate exceeds the specified bounds even\nafter changing the refractory period repeatedly, the controller adapts the\npixel bandwidth and event thresholds, which stabilizes after a short period of\nnoise events across all pixels (slow adaptation). 
Our evaluation focuses on the\nvisual place recognition task, where incoming query images are compared to a\ngiven reference database. We conducted comprehensive evaluations of our\nalgorithms' adaptive feedback control in real-time. To do so, we collected the\nQCR-Fast-and-Slow dataset that contains DAVIS346 event camera streams from 366\nrepeated traversals of a Scout Mini robot navigating through a 100 meter long\nindoor lab setting (totaling over 35km distance traveled) in varying brightness\nconditions with ground truth location information. Our proposed feedback\ncontrollers result in superior performance when compared to the standard bias\nsettings and prior feedback control methods. Our findings also detail the\nimpact of bias adjustments on task performance and feature ablation studies on\nthe fast and slow adaptation mechanisms.\n","authors":["Gokul B. Nair","Michael Milford","Tobias Fischer"],"pdf_url":"https://arxiv.org/pdf/2403.16425v1.pdf","comment":"8 pages, 9 figures, paper under review"},{"id":"http://arxiv.org/abs/2312.03009v2","updated":"2024-03-25T05:04:04Z","published":"2023-12-04T19:01:19Z","title":"I-PHYRE: Interactive Physical Reasoning","summary":" Current evaluation protocols predominantly assess physical reasoning in\nstationary scenes, creating a gap in evaluating agents' abilities to interact\nwith dynamic events. While contemporary methods allow agents to modify initial\nscene configurations and observe consequences, they lack the capability to\ninteract with events in real time. To address this, we introduce I-PHYRE, a\nframework that challenges agents to simultaneously exhibit intuitive physical\nreasoning, multi-step planning, and in-situ intervention. Here, intuitive\nphysical reasoning refers to a quick, approximate understanding of physics to\naddress complex problems; multi-step denotes the need for extensive sequence\nplanning in I-PHYRE, considering each intervention can significantly alter\nsubsequent choices; and in-situ implies the necessity for timely object\nmanipulation within a scene, where minor timing deviations can result in task\nfailure. We formulate four game splits to scrutinize agents' learning and\ngeneralization of essential principles of interactive physical reasoning,\nfostering learning through interaction with representative scenarios. Our\nexploration involves three planning strategies and examines several supervised\nand reinforcement agents' zero-shot generalization proficiency on I-PHYRE. The\noutcomes highlight a notable gap between existing learning algorithms and human\nperformance, emphasizing the imperative for more research in enhancing agents\nwith interactive physical reasoning capabilities. The environment and baselines\nwill be made publicly available.\n","authors":["Shiqian Li","Kewen Wu","Chi Zhang","Yixin Zhu"],"pdf_url":"https://arxiv.org/pdf/2312.03009v2.pdf","comment":"21 pages, ICLR 2024"},{"id":"http://arxiv.org/abs/2403.16422v1","updated":"2024-03-25T04:54:49Z","published":"2024-03-25T04:54:49Z","title":"Refining Text-to-Image Generation: Towards Accurate Training-Free\n Glyph-Enhanced Image Generation","summary":" Over the past few years, Text-to-Image (T2I) generation approaches based on\ndiffusion models have gained significant attention. However, vanilla diffusion\nmodels often suffer from spelling inaccuracies in the text displayed within the\ngenerated images. The capability to generate visual text is crucial, offering\nboth academic interest and a wide range of practical applications. 
To produce\naccurate visual text images, state-of-the-art techniques adopt a\nglyph-controlled image generation approach, consisting of a text layout\ngenerator followed by an image generator that is conditioned on the generated\ntext layout. Nevertheless, our study reveals that these models still face three\nprimary challenges, prompting us to develop a testbed to facilitate future\nresearch. We introduce a benchmark, LenCom-Eval, specifically designed for\ntesting models' capability in generating images with Lengthy and Complex visual\ntext. Subsequently, we introduce a training-free framework to enhance the\ntwo-stage generation approaches. We examine the effectiveness of our approach\non both LenCom-Eval and MARIO-Eval benchmarks and demonstrate notable\nimprovements across a range of evaluation metrics, including CLIPScore, OCR\nprecision, recall, F1 score, accuracy, and edit distance scores. For instance,\nour proposed framework improves the backbone model, TextDiffuser, by more than\n23\\% and 13.5\\% in terms of OCR word F1 on LenCom-Eval and MARIO-Eval,\nrespectively. Our work makes a unique contribution to the field by focusing on\ngenerating images with long and rare text sequences, a niche previously\nunexplored by existing literature\n","authors":["Sanyam Lakhanpal","Shivang Chopra","Vinija Jain","Aman Chadha","Man Luo"],"pdf_url":"https://arxiv.org/pdf/2403.16422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03173v3","updated":"2024-03-25T04:42:22Z","published":"2024-03-05T18:08:29Z","title":"Solving the bongard-logo problem by modeling a probabilistic model","summary":" Abstract reasoning problems challenge the perceptual and cognitive abilities\nof AI algorithms, demanding deeper pattern discernment and inductive reasoning\nbeyond explicit image features. This study introduces PMoC, a tailored\nprobability model for the Bongard-Logo problem, achieving high reasoning\naccuracy by constructing independent probability models. Additionally, we\npresent Pose-Transformer, an enhanced Transformer-Encoder designed for complex\nabstract reasoning tasks, including Bongard-Logo, RAVEN, I-RAVEN, and PGM.\nPose-Transformer incorporates positional information learning, inspired by\ncapsule networks' pose matrices, enhancing its focus on local positional\nrelationships in image data processing. When integrated with PMoC, it further\nimproves reasoning accuracy. Our approach effectively addresses reasoning\ndifficulties associated with abstract entities' positional changes,\noutperforming previous models on the OIG, D3$\\times$3 subsets of RAVEN, and PGM\ndatabases. This research contributes to advancing AI's capabilities in abstract\nreasoning and cognitive pattern recognition.\n","authors":["Ruizhuo Song","Beiming Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.03173v3.pdf","comment":"14 pages, 11 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.03190v4","updated":"2024-03-25T04:40:39Z","published":"2024-03-05T18:29:17Z","title":"Triple-CFN: Restructuring Conceptual Spaces for Enhancing Abstract\n Reasoning process","summary":" Abstract reasoning problems pose significant challenges to artificial\nintelligence algorithms, demanding cognitive capabilities beyond those required\nfor perception tasks. This study introduces the Triple-CFN approach to tackle\nthe Bongard-Logo problem, achieving notable reasoning accuracy by implicitly\nreorganizing the concept space of conflicting instances. 
Additionally, the\nTriple-CFN paradigm proves effective for the RPM problem with necessary\nmodifications, yielding competitive results. To further enhance performance on\nthe RPM issue, we develop the Meta Triple-CFN network, which explicitly\nstructures the problem space while maintaining interpretability on progressive\npatterns. The success of Meta Triple-CFN is attributed to its paradigm of\nmodeling the conceptual space, equivalent to normalizing reasoning information.\nBased on this ideology, we introduce the Re-space layer, enhancing the\nperformance of both Meta Triple-CFN and Triple-CFN. This paper aims to\ncontribute to advancements in machine intelligence by exploring innovative\nnetwork designs for addressing abstract reasoning problems, paving the way for\nfurther breakthroughs in this domain.\n","authors":["Ruizhuo Song","Beiming Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.03190v4.pdf","comment":"14 pages, 14 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.03452v4","updated":"2024-03-25T04:38:42Z","published":"2024-03-06T04:36:43Z","title":"D4C glove-train: solving the RPM and Bongard-logo problem by\n distributing and Circumscribing concepts","summary":" This paper achieves noteworthy progress in the realm of abstract reasoning,\nparticularly in addressing Raven's Progressive Matrices (RPM) and Bongard-Logo\nchallenges. Initially, we introduce Lico-Net, a novel baseline model that\nresolves RPM problems with remarkable accuracy. Leveraging this foundation, we\nadvance with the D3C approach, which advocates representing the underlying\nconcepts in abstract reasoning problems through distributions. This perspective\nenhances the performance of both Lico-Net and a baseline model excelling in\nBongard-Logo tasks. To bolster the computational efficiency of D3C, we present\nthe D3C-cos variant, offering a streamlined yet precise solution. Furthermore,\nwe propose the D2C method, redefining conceptual boundaries within these\ndomains and bridging the divide between high-level abstractions and their\nlower-dimensional counterparts. Finally, we extend our methodology to D4C,\nemploying adversarial techniques to refine conceptual boundaries further and\ndemonstrate substantial improvements in both RPM and Bongard-Logo challenges.\nOverall, our contributions present a fresh outlook and practical advancements\nin the field of abstract reasoning.\n","authors":["Ruizhuo Song","Beiming Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.03452v4.pdf","comment":"18 pages, 19 figures, 6 tables"},{"id":"http://arxiv.org/abs/2403.16412v1","updated":"2024-03-25T04:14:07Z","published":"2024-03-25T04:14:07Z","title":"Unsupervised Template-assisted Point Cloud Shape Correspondence Network","summary":" Unsupervised point cloud shape correspondence aims to establish point-wise\ncorrespondences between source and target point clouds. Existing methods obtain\ncorrespondences directly by computing point-wise feature similarity between\npoint clouds. However, non-rigid objects possess strong deformability and\nunusual shapes, making it a longstanding challenge to directly establish\ncorrespondences between point clouds with unconventional shapes. To address\nthis challenge, we propose an unsupervised Template-Assisted point cloud shape\ncorrespondence Network, termed TANet, including a template generation module\nand a template assistance module. The proposed TANet enjoys several merits.\nFirstly, the template generation module establishes a set of learnable\ntemplates with explicit structures. 
Secondly, we introduce a template\nassistance module that extensively leverages the generated templates to\nestablish more accurate shape correspondences from multiple perspectives.\nExtensive experiments on four human and animal datasets demonstrate that TANet\nachieves favorable performance against state-of-the-art methods.\n","authors":["Jiacheng Deng","Jiahao Lu","Tianzhu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.16412v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2403.16410v1","updated":"2024-03-25T04:05:23Z","published":"2024-03-25T04:05:23Z","title":"Spike-NeRF: Neural Radiance Field Based On Spike Camera","summary":" As a neuromorphic sensor with high temporal resolution, spike cameras offer\nnotable advantages over traditional cameras in high-speed vision applications\nsuch as high-speed optical estimation, depth estimation, and object tracking.\nInspired by the success of the spike camera, we proposed Spike-NeRF, the first\nNeural Radiance Field derived from spike data, to achieve 3D reconstruction and\nnovel viewpoint synthesis of high-speed scenes. Instead of the multi-view\nimages at the same time of NeRF, the inputs of Spike-NeRF are continuous spike\nstreams captured by a moving spike camera in a very short time. To reconstruct\na correct and stable 3D scene from high-frequency but unstable spike data, we\ndevised spike masks along with a distinctive loss function. We evaluate our\nmethod qualitatively and numerically on several challenging synthetic scenes\ngenerated by blender with the spike camera simulator. Our results demonstrate\nthat Spike-NeRF produces more visually appealing results than the existing\nmethods and the baseline we proposed in high-speed scenes. Our code and data\nwill be released soon.\n","authors":["Yijia Guo","Yuanxi Bai","Liwen Hu","Mianzhi Liu","Ziyi Guo","Lei Ma","Tiejun Huang"],"pdf_url":"https://arxiv.org/pdf/2403.16410v1.pdf","comment":"This paper is accepted by ICME2024"},{"id":"http://arxiv.org/abs/2403.16407v1","updated":"2024-03-25T03:47:53Z","published":"2024-03-25T03:47:53Z","title":"A Survey on Long Video Generation: Challenges, Methods, and Prospects","summary":" Video generation is a rapidly advancing research area, garnering significant\nattention due to its broad range of applications. One critical aspect of this\nfield is the generation of long-duration videos, which presents unique\nchallenges and opportunities. This paper presents the first survey of recent\nadvancements in long video generation and summarises them into two key\nparadigms: divide and conquer temporal autoregressive.\n We delve into the common models employed in each paradigm, including aspects\nof network design and conditioning techniques. Furthermore, we offer a\ncomprehensive overview and classification of the datasets and evaluation\nmetrics which are crucial for advancing long video generation research.\nConcluding with a summary of existing studies, we also discuss the emerging\nchallenges and future directions in this dynamic field. 
We hope that this\nsurvey will serve as an essential reference for researchers and practitioners\nin the realm of long video generation.\n","authors":["Chengxuan Li","Di Huang","Zeyu Lu","Yang Xiao","Qingqi Pei","Lei Bai"],"pdf_url":"https://arxiv.org/pdf/2403.16407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16405v1","updated":"2024-03-25T03:44:36Z","published":"2024-03-25T03:44:36Z","title":"Ensemble Adversarial Defense via Integration of Multiple Dispersed Low\n Curvature Models","summary":" The integration of an ensemble of deep learning models has been extensively\nexplored to enhance defense against adversarial attacks. The diversity among\nsub-models increases the attack cost required to deceive the majority of the\nensemble, thereby improving the adversarial robustness. While existing\napproaches mainly center on increasing diversity in feature representations or\ndispersion of first-order gradients with respect to input, the limited\ncorrelation between these diversity metrics and adversarial robustness\nconstrains the performance of ensemble adversarial defense. In this work, we\naim to enhance ensemble diversity by reducing attack transferability. We\nidentify second-order gradients, which depict the loss curvature, as a key\nfactor in adversarial robustness. Computing the Hessian matrix involved in\nsecond-order gradients is computationally expensive. To address this, we\napproximate the Hessian-vector product using differential approximation. Given\nthat low curvature provides better robustness, our ensemble model was designed\nto consider the influence of curvature among different sub-models. We introduce\na novel regularizer to train multiple more-diverse low-curvature network\nmodels. Extensive experiments across various datasets demonstrate that our\nensemble model exhibits superior robustness against a range of attacks,\nunderscoring the effectiveness of our approach.\n","authors":["Kaikang Zhao","Xi Chen","Wei Huang","Liuxin Ding","Xianglong Kong","Fan Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.16405v1.pdf","comment":"Accepted to The 2024 International Joint Conference on Neural\n Networks (IJCNN)"},{"id":"http://arxiv.org/abs/2304.06928v2","updated":"2024-03-25T03:40:19Z","published":"2023-04-14T05:25:52Z","title":"CiPR: An Efficient Framework with Cross-instance Positive Relations for\n Generalized Category Discovery","summary":" We tackle the issue of generalized category discovery (GCD). GCD considers\nthe open-world problem of automatically clustering a partially labelled\ndataset, in which the unlabelled data may contain instances from both novel\ncategories and labelled classes. In this paper, we address the GCD problem with\nan unknown category number for the unlabelled data. We propose a framework,\nnamed CiPR, to bootstrap the representation by exploiting Cross-instance\nPositive Relations in the partially labelled data for contrastive learning,\nwhich have been neglected in existing methods. To obtain reliable\ncross-instance relations to facilitate representation learning, we introduce a\nsemi-supervised hierarchical clustering algorithm, named selective neighbor\nclustering (SNC), which can produce a clustering hierarchy directly from the\nconnected components of a graph constructed from selective neighbors. 
We\nfurther present a method to estimate the unknown class number using SNC with a\njoint reference score that considers clustering indexes of both labelled and\nunlabelled data, and extend SNC to allow label assignment for the unlabelled\ninstances with a given class number. We thoroughly evaluate our framework on\npublic generic image recognition datasets and challenging fine-grained\ndatasets, and establish a new state-of-the-art. Code:\nhttps://github.com/haoosz/CiPR\n","authors":["Shaozhe Hao","Kai Han","Kwan-Yee K. Wong"],"pdf_url":"https://arxiv.org/pdf/2304.06928v2.pdf","comment":"Accepted to TMLR. Code: https://github.com/haoosz/CiPR"},{"id":"http://arxiv.org/abs/2311.13614v2","updated":"2024-03-25T03:39:45Z","published":"2023-11-22T04:52:58Z","title":"HalluciDoctor: Mitigating Hallucinatory Toxicity in Visual Instruction\n Data","summary":" Multi-modal Large Language Models (MLLMs) tuned on machine-generated\ninstruction-following data have demonstrated remarkable performance in various\nmulti-modal understanding and generation tasks. However, the hallucinations\ninherent in machine-generated data, which could lead to hallucinatory outputs\nin MLLMs, remain under-explored. This work aims to investigate various\nhallucinations (i.e., object, relation, attribute hallucinations) and mitigate\nthose hallucinatory toxicities in large-scale machine-generated visual\ninstruction datasets. Drawing on the human ability to identify factual errors,\nwe present a novel hallucination detection and elimination framework,\nHalluciDoctor, based on the cross-checking paradigm. We use our framework to\nidentify and eliminate hallucinations in the training data automatically.\nInterestingly, HalluciDoctor also indicates that spurious correlations arising\nfrom long-tail object co-occurrences contribute to hallucinations. Based on\nthat, we execute counterfactual visual instruction expansion to balance data\ndistribution, thereby enhancing MLLMs' resistance to hallucinations.\nComprehensive experiments on hallucination evaluation benchmarks show that our\nmethod successfully mitigates 44.6% hallucinations relatively and maintains\ncompetitive performance compared to LLaVA. The data and code for this paper are\npublicly available. \\url{https://github.com/Yuqifan1117/HalluciDoctor}.\n","authors":["Qifan Yu","Juncheng Li","Longhui Wei","Liang Pang","Wentao Ye","Bosheng Qin","Siliang Tang","Qi Tian","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2311.13614v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16400v1","updated":"2024-03-25T03:30:37Z","published":"2024-03-25T03:30:37Z","title":"ASDF: Assembly State Detection Utilizing Late Fusion by Integrating 6D\n Pose Estimation","summary":" In medical and industrial domains, providing guidance for assembly processes\nis critical to ensure efficiency and safety. Errors in assembly can lead to\nsignificant consequences such as extended surgery times, and prolonged\nmanufacturing or maintenance times in industry. Assembly scenarios can benefit\nfrom in-situ AR visualization to provide guidance, reduce assembly times and\nminimize errors. To enable in-situ visualization 6D pose estimation can be\nleveraged. Existing 6D pose estimation techniques primarily focus on individual\nobjects and static captures. However, assembly scenarios have various dynamics\nincluding occlusion during assembly and dynamics in the assembly objects\nappearance. 
Existing work, combining object detection/6D pose estimation and\nassembly state detection focuses either on pure deep learning-based approaches,\nor limit the assembly state detection to building blocks. To address the\nchallenges of 6D pose estimation in combination with assembly state detection,\nour approach ASDF builds upon the strengths of YOLOv8, a real-time capable\nobject detection framework. We extend this framework, refine the object pose\nand fuse pose knowledge with network-detected pose information. Utilizing our\nlate fusion in our Pose2State module results in refined 6D pose estimation and\nassembly state detection. By combining both pose and state information, our\nPose2State module predicts the final assembly state with precision. Our\nevaluation on our ASDF dataset shows that our Pose2State module leads to an\nimproved assembly state detection and that the improvement of the assembly\nstate further leads to a more robust 6D pose estimation. Moreover, on the GBOT\ndataset, we outperform the pure deep learning-based network, and even\noutperform the hybrid and pure tracking-based approaches.\n","authors":["Hannah Schieber","Shiyu Li","Niklas Corell","Philipp Beckerle","Julian Kreimeier","Daniel Roth"],"pdf_url":"https://arxiv.org/pdf/2403.16400v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17460v3","updated":"2024-03-25T03:21:39Z","published":"2023-11-29T09:02:07Z","title":"W-HMR: Human Mesh Recovery in World Space with Weak-supervised Camera\n Calibration and Orientation Correction","summary":" For a long time, in reconstructing 3D human bodies from monocular images,\nmost methods opted to simplify the task by minimizing the influence of the\ncamera. Using a coarse focal length setting results in the reconstructed bodies\nnot aligning well with distorted images. Ignoring camera rotation leads to an\nunrealistic reconstructed body pose in world space. Consequently, the\napplication scenarios of existing methods are confined to controlled\nenvironments. When confronted with complex and diverse in-the-wild images, they\nstruggle to achieve accurate and reasonable reconstruction in world space. To\naddress the above issues, we propose W-HMR, which decouples global body\nrecovery into camera calibration, local body recovery, and global body\norientation correction. We design the first weak-supervised camera calibration\nmethod for body distortion, eliminating dependence on focal length labels and\nachieving finer mesh-image alignment. We propose a novel orientation correction\nmodule to allow the reconstructed human body to remain normal in world space.\nDecoupling body orientation and body pose enables our model to consider the\naccuracy in camera coordinate and the reasonableness in world coordinate\nsimultaneously, expanding the range of applications. As a result, W-HMR\nachieves high-quality reconstruction in dual coordinate systems, particularly\nin challenging scenes. Codes and demos have been released on the project page\nhttps://yw0208.github.io/w-hmr/.\n","authors":["Wei Yao","Hongwen Zhang","Yunlian Sun","Jinhui Tang"],"pdf_url":"https://arxiv.org/pdf/2311.17460v3.pdf","comment":"Project Page: https://yw0208.github.io/w-hmr/"},{"id":"http://arxiv.org/abs/2403.16395v1","updated":"2024-03-25T03:18:58Z","published":"2024-03-25T03:18:58Z","title":"Multi-attention Associate Prediction Network for Visual Tracking","summary":" Classification-regression prediction networks have realized impressive\nsuccess in several modern deep trackers. 
However, there is an inherent\ndifference between classification and regression tasks, so they have diverse\neven opposite demands for feature matching. Existed models always ignore the\nkey issue and only employ a unified matching block in two task branches,\ndecaying the decision quality. Besides, these models also struggle with\ndecision misalignment situation. In this paper, we propose a multi-attention\nassociate prediction network (MAPNet) to tackle the above problems. Concretely,\ntwo novel matchers, i.e., category-aware matcher and spatial-aware matcher, are\nfirst designed for feature comparison by integrating self, cross, channel or\nspatial attentions organically. They are capable of fully capturing the\ncategory-related semantics for classification and the local spatial contexts\nfor regression, respectively. Then, we present a dual alignment module to\nenhance the correspondences between two branches, which is useful to find the\noptimal tracking solution. Finally, we describe a Siamese tracker built upon\nthe proposed prediction network, which achieves the leading performance on five\ntracking benchmarks, consisting of LaSOT, TrackingNet, GOT-10k, TNL2k and\nUAV123, and surpasses other state-of-the-art approaches.\n","authors":["Xinglong Sun","Haijiang Sun","Shan Jiang","Jiacheng Wang","Xilai Wei","Zhonghe Hu"],"pdf_url":"https://arxiv.org/pdf/2403.16395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16387v1","updated":"2024-03-25T03:06:45Z","published":"2024-03-25T03:06:45Z","title":"Text-IF: Leveraging Semantic Text Guidance for Degradation-Aware and\n Interactive Image Fusion","summary":" Image fusion aims to combine information from different source images to\ncreate a comprehensively representative image. Existing fusion methods are\ntypically helpless in dealing with degradations in low-quality source images\nand non-interactive to multiple subjective and objective needs. To solve them,\nwe introduce a novel approach that leverages semantic text guidance image\nfusion model for degradation-aware and interactive image fusion task, termed as\nText-IF. It innovatively extends the classical image fusion to the text guided\nimage fusion along with the ability to harmoniously address the degradation and\ninteraction issues during fusion. Through the text semantic encoder and\nsemantic interaction fusion decoder, Text-IF is accessible to the all-in-one\ninfrared and visible image degradation-aware processing and the interactive\nflexible fusion outcomes. In this way, Text-IF achieves not only multi-modal\nimage fusion, but also multi-modal information fusion. Extensive experiments\nprove that our proposed text guided image fusion strategy has obvious\nadvantages over SOTA methods in the image fusion performance and degradation\ntreatment. The code is available at https://github.com/XunpengYi/Text-IF.\n","authors":["Xunpeng Yi","Han Xu","Hao Zhang","Linfeng Tang","Jiayi Ma"],"pdf_url":"https://arxiv.org/pdf/2403.16387v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.09065v3","updated":"2024-03-25T03:04:44Z","published":"2024-03-14T03:12:02Z","title":"When Semantic Segmentation Meets Frequency Aliasing","summary":" Despite recent advancements in semantic segmentation, where and what pixels\nare hard to segment remains largely unexplored. Existing research only\nseparates an image into easy and hard regions and empirically observes the\nlatter are associated with object boundaries. 
In this paper, we conduct a\ncomprehensive analysis of hard pixel errors, categorizing them into three\ntypes: false responses, merging mistakes, and displacements. Our findings\nreveal a quantitative association between hard pixels and aliasing, which is\ndistortion caused by the overlapping of frequency components in the Fourier\ndomain during downsampling. To identify the frequencies responsible for\naliasing, we propose using the equivalent sampling rate to calculate the\nNyquist frequency, which marks the threshold for aliasing. Then, we introduce\nthe aliasing score as a metric to quantify the extent of aliasing. While\npositively correlated with the proposed aliasing score, three types of hard\npixels exhibit different patterns. Here, we propose two novel de-aliasing\nfilter (DAF) and frequency mixing (FreqMix) modules to alleviate aliasing\ndegradation by accurately removing or adjusting frequencies higher than the\nNyquist frequency. The DAF precisely removes the frequencies responsible for\naliasing before downsampling, while the FreqMix dynamically selects\nhigh-frequency components within the encoder block. Experimental results\ndemonstrate consistent improvements in semantic segmentation and low-light\ninstance segmentation tasks. The code is available at:\nhttps://github.com/Linwei-Chen/Seg-Aliasing.\n","authors":["Linwei Chen","Lin Gu","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2403.09065v3.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2403.16386v1","updated":"2024-03-25T03:02:51Z","published":"2024-03-25T03:02:51Z","title":"Dia-LLaMA: Towards Large Language Model-driven CT Report Generation","summary":" Medical report generation has achieved remarkable advancements yet has still\nbeen faced with several challenges. First, the inherent imbalance in the\ndistribution of normal and abnormal cases may lead models to exhibit a biased\nfocus on normal samples, resulting in unreliable diagnoses. Second, the\nfrequent occurrence of common template sentences in the reports may overwhelm\nthe critical abnormal information. Moreover, existing works focus on 2D chest\nX-rays, leaving CT report generation underexplored due to the high-dimensional\nnature of CT images and the limited availability of CT-report pairs. Recently,\nLLM has shown a great ability to generate reliable answers with appropriate\nprompts, which shed light on addressing the aforementioned challenges. In this\npaper, we propose Dia-LLaMA, a framework to adapt the LLaMA2-7B for CT report\ngeneration by incorporating diagnostic information as guidance prompts.\nConsidering the high dimension of CT, we leverage a pre-trained ViT3D with\nperceiver to extract the visual information. To tailor the LLM for report\ngeneration and emphasize abnormality, we extract additional diagnostic\ninformation by referring to a disease prototype memory bank, which is updated\nduring training to capture common disease representations. Furthermore, we\nintroduce disease-aware attention to enable the model to adjust attention for\ndifferent diseases. Experiments on the chest CT dataset demonstrated that our\nproposed method outperformed previous methods and achieved state-of-the-art on\nboth clinical efficacy performance and natural language generation metrics. 
The\ncode will be made publically available.\n","authors":["Zhixuan Chen","Luyang Luo","Yequan Bie","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2403.16386v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2403.16385v1","updated":"2024-03-25T03:02:27Z","published":"2024-03-25T03:02:27Z","title":"Synthesize Step-by-Step: Tools, Templates and LLMs as Data Generators\n for Reasoning-Based Chart VQA","summary":" Understanding data visualizations like charts and plots requires reasoning\nabout both visual elements and numerics. Although strong in extractive\nquestions, current chart visual question answering (chart VQA) models suffer on\ncomplex reasoning questions. In this work, we address the lack of reasoning\nability by data augmentation. We leverage Large Language Models (LLMs), which\nhave shown to have strong reasoning ability, as an automatic data annotator\nthat generates question-answer annotations for chart images. The key innovation\nin our method lies in the Synthesize Step-by-Step strategy: our LLM-based data\ngenerator learns to decompose the complex question into step-by-step\nsub-questions (rationales), which are then used to derive the final answer\nusing external tools, i.e. Python. This step-wise generation procedure is\ntrained on synthetic data generated using a template-based QA generation\npipeline. Experimental results highlight the significance of the proposed\nstep-by-step generation. By training with the LLM-augmented data (LAMENDA), we\nsignificantly enhance the chart VQA models, achieving the state-of-the-art\naccuracy on the ChartQA and PlotQA datasets. In particular, our approach\nimproves the accuracy of the previous state-of-the-art approach from 38% to 54%\non the human-written questions in the ChartQA dataset, which needs strong\nreasoning. We hope our work underscores the potential of synthetic data and\nencourages further exploration of data augmentation using LLMs for\nreasoning-heavy tasks.\n","authors":["Li Zhuowan","Jasani Bhavan","Tang Peng","Ghadar Shabnam"],"pdf_url":"https://arxiv.org/pdf/2403.16385v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16384v1","updated":"2024-03-25T03:01:53Z","published":"2024-03-25T03:01:53Z","title":"Residual Dense Swin Transformer for Continuous Depth-Independent\n Ultrasound Imaging","summary":" Ultrasound imaging is crucial for evaluating organ morphology and function,\nyet depth adjustment can degrade image quality and field-of-view, presenting a\ndepth-dependent dilemma. Traditional interpolation-based zoom-in techniques\noften sacrifice detail and introduce artifacts. Motivated by the potential of\narbitrary-scale super-resolution to naturally address these inherent\nchallenges, we present the Residual Dense Swin Transformer Network (RDSTN),\ndesigned to capture the non-local characteristics and long-range dependencies\nintrinsic to ultrasound images. It comprises a linear embedding module for\nfeature enhancement, an encoder with shifted-window attention for modeling\nnon-locality, and an MLP decoder for continuous detail reconstruction. This\nstrategy streamlines balancing image quality and field-of-view, which offers\nsuperior textures over traditional methods. Experimentally, RDSTN outperforms\nexisting approaches while requiring fewer parameters. 
In conclusion, RDSTN\nshows promising potential for ultrasound image enhancement by overcoming the\nlimitations of conventional interpolation-based methods and achieving\ndepth-independent imaging.\n","authors":["Jintong Hu","Hui Che","Zishuo Li","Wenming Yang"],"pdf_url":"https://arxiv.org/pdf/2403.16384v1.pdf","comment":"Accepted by ICASSP2024, https://ieeexplore.ieee.org/document/10447712"},{"id":"http://arxiv.org/abs/2403.16379v1","updated":"2024-03-25T02:53:32Z","published":"2024-03-25T02:53:32Z","title":"FlashEval: Towards Fast and Accurate Evaluation of Text-to-image\n Diffusion Generative Models","summary":" In recent years, there has been significant progress in the development of\ntext-to-image generative models. Evaluating the quality of the generative\nmodels is one essential step in the development process. Unfortunately, the\nevaluation process could consume a significant amount of computational\nresources, making the required periodic evaluation of model performance (e.g.,\nmonitoring training progress) impractical. Therefore, we seek to improve the\nevaluation efficiency by selecting the representative subset of the text-image\ndataset. We systematically investigate the design choices, including the\nselection criteria (textural features or image-based metrics) and the selection\ngranularity (prompt-level or set-level). We find that the insights from prior\nwork on subset selection for training data do not generalize to this problem,\nand we propose FlashEval, an iterative search algorithm tailored to evaluation\ndata selection. We demonstrate the effectiveness of FlashEval on ranking\ndiffusion models with various configurations, including architectures,\nquantization levels, and sampler schedules on COCO and DiffusionDB datasets.\nOur searched 50-item subset could achieve comparable evaluation quality to the\nrandomly sampled 500-item subset for COCO annotations on unseen models,\nachieving a 10x evaluation speedup. We release the condensed subset of these\ncommonly used datasets to help facilitate diffusion algorithm design and\nevaluation, and open-source FlashEval as a tool for condensing future datasets,\naccessible at https://github.com/thu-nics/FlashEval.\n","authors":["Lin Zhao","Tianchen Zhao","Zinan Lin","Xuefei Ning","Guohao Dai","Huazhong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16379v1.pdf","comment":"The paper is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.15082v2","updated":"2024-03-25T02:50:07Z","published":"2024-03-22T10:06:31Z","title":"Cell Variational Information Bottleneck Network","summary":" In this work, we propose Cell Variational Information Bottleneck Network\n(cellVIB), a convolutional neural network using information bottleneck\nmechanism, which can be combined with the latest feedforward network\narchitecture in an end-to-end training method. Our Cell Variational Information\nBottleneck Network is constructed by stacking VIB cells, which generate feature\nmaps with uncertainty. As layers going deeper, the regularization effect will\ngradually increase, instead of directly adding excessive regular constraints to\nthe output layer of the model as in Deep VIB. Under each VIB cell, the\nfeedforward process learns an independent mean term and an standard deviation\nterm, and predicts the Gaussian distribution based on them. The feedback\nprocess is based on reparameterization trick for effective training. 
This work\nperforms an extensive analysis on MNIST dataset to verify the effectiveness of\neach VIB cells, and provides an insightful analysis on how the VIB cells affect\nmutual information. Experiments conducted on CIFAR-10 also prove that our\ncellVIB is robust against noisy labels during training and against corrupted\nimages during testing. Then, we validate our method on PACS dataset, whose\nresults show that the VIB cells can significantly improve the generalization\nperformance of the basic model. Finally, in a more complex representation\nlearning task, face recognition, our network structure has also achieved very\ncompetitive results.\n","authors":["Zhonghua Zhai","Chen Ju","Jinsong Lan","Shuai Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.15082v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16376v1","updated":"2024-03-25T02:46:57Z","published":"2024-03-25T02:46:57Z","title":"Elite360D: Towards Efficient 360 Depth Estimation via Semantic- and\n Distance-Aware Bi-Projection Fusion","summary":" 360 depth estimation has recently received great attention for 3D\nreconstruction owing to its omnidirectional field of view (FoV). Recent\napproaches are predominantly focused on cross-projection fusion with\ngeometry-based re-projection: they fuse 360 images with equirectangular\nprojection (ERP) and another projection type, e.g., cubemap projection to\nestimate depth with the ERP format. However, these methods suffer from 1)\nlimited local receptive fields, making it hardly possible to capture large FoV\nscenes, and 2) prohibitive computational cost, caused by the complex\ncross-projection fusion module design. In this paper, we propose Elite360D, a\nnovel framework that inputs the ERP image and icosahedron projection (ICOSAP)\npoint set, which is undistorted and spatially continuous. Elite360D is superior\nin its capacity in learning a representation from a local-with-global\nperspective. With a flexible ERP image encoder, it includes an ICOSAP point\nencoder, and a Bi-projection Bi-attention Fusion (B2F) module (totally ~1M\nparameters). Specifically, the ERP image encoder can take various perspective\nimage-trained backbones (e.g., ResNet, Transformer) to extract local features.\nThe point encoder extracts the global features from the ICOSAP. Then, the B2F\nmodule captures the semantic- and distance-aware dependencies between each\npixel of the ERP feature and the entire ICOSAP feature set. Without specific\nbackbone design and obvious computational cost increase, Elite360D outperforms\nthe prior arts on several benchmark datasets.\n","authors":["Hao Ai","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16376v1.pdf","comment":"8 pages, accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.09506v2","updated":"2024-03-25T02:45:35Z","published":"2024-03-14T15:53:04Z","title":"Don't Judge by the Look: Towards Motion Coherent Video Representation","summary":" Current training pipelines in object recognition neglect Hue Jittering when\ndoing data augmentation as it not only brings appearance changes that are\ndetrimental to classification, but also the implementation is inefficient in\npractice. In this study, we investigate the effect of hue variance in the\ncontext of video understanding and find this variance to be beneficial since\nstatic appearances are less important in videos that contain motion\ninformation. 
Based on this observation, we propose a data augmentation method\nfor video understanding, named Motion Coherent Augmentation (MCA), that\nintroduces appearance variation in videos and implicitly encourages the model\nto prioritize motion patterns, rather than static appearances. Concretely, we\npropose an operation SwapMix to efficiently modify the appearance of video\nsamples, and introduce Variation Alignment (VA) to resolve the distribution\nshift caused by SwapMix, enforcing the model to learn appearance invariant\nrepresentations. Comprehensive empirical evaluation across various\narchitectures and different datasets solidly validates the effectiveness and\ngeneralization ability of MCA, and the application of VA in other augmentation\nmethods. Code is available at https://github.com/BeSpontaneous/MCA-pytorch.\n","authors":["Yitian Zhang","Yue Bai","Huan Wang","Yizhou Wang","Yun Fu"],"pdf_url":"https://arxiv.org/pdf/2403.09506v2.pdf","comment":"Accepted by ICLR2024"},{"id":"http://arxiv.org/abs/2403.16370v1","updated":"2024-03-25T02:30:32Z","published":"2024-03-25T02:30:32Z","title":"GoodSAM: Bridging Domain and Capacity Gaps via Segment Anything Model\n for Distortion-aware Panoramic Semantic Segmentation","summary":" This paper tackles a novel yet challenging problem: how to transfer knowledge\nfrom the emerging Segment Anything Model (SAM) -- which reveals impressive\nzero-shot instance segmentation capacity -- to learn a compact panoramic\nsemantic segmentation model, i.e., student, without requiring any labeled data.\nThis poses considerable challenges due to SAM's inability to provide semantic\nlabels and the large capacity gap between SAM and the student. To this end, we\npropose a novel framework, called GoodSAM, that introduces a teacher assistant\n(TA) to provide semantic information, integrated with SAM to generate ensemble\nlogits to achieve knowledge transfer. Specifically, we propose a\nDistortion-Aware Rectification (DAR) module that first addresses the distortion\nproblem of panoramic images by imposing prediction-level consistency and\nboundary enhancement. This subtly enhances TA's prediction capacity on\npanoramic images. DAR then incorporates a cross-task complementary fusion block\nto adaptively merge the predictions of SAM and TA to obtain more reliable\nensemble logits. Moreover, we introduce a Multi-level Knowledge Adaptation\n(MKA) module to efficiently transfer the multi-level feature knowledge from TA\nand ensemble logits to learn a compact student model. Extensive experiments on\ntwo benchmarks show that our GoodSAM achieves a remarkable +3.75\\% mIoU\nimprovement over the state-of-the-art (SOTA) domain adaptation methods. Also,\nour most lightweight model achieves comparable performance to the SOTA methods\nwith only 3.7M parameters.\n","authors":["Weiming Zhang","Yexin Liu","Xu Zheng","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16370v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16368v1","updated":"2024-03-25T02:17:20Z","published":"2024-03-25T02:17:20Z","title":"Distilling Semantic Priors from SAM to Efficient Image Restoration\n Models","summary":" In image restoration (IR), leveraging semantic priors from segmentation\nmodels has been a common approach to improve performance. The recent segment\nanything model (SAM) has emerged as a powerful tool for extracting advanced\nsemantic priors to enhance IR tasks. However, the computational cost of SAM is\nprohibitive for IR, compared to existing smaller IR models. 
The incorporation\nof SAM for extracting semantic priors considerably hampers the model inference\nefficiency. To address this issue, we propose a general framework to distill\nSAM's semantic knowledge to boost exiting IR models without interfering with\ntheir inference process. Specifically, our proposed framework consists of the\nsemantic priors fusion (SPF) scheme and the semantic priors distillation (SPD)\nscheme. SPF fuses two kinds of information between the restored image predicted\nby the original IR model and the semantic mask predicted by SAM for the refined\nrestored image. SPD leverages a self-distillation manner to distill the fused\nsemantic priors to boost the performance of original IR models. Additionally,\nwe design a semantic-guided relation (SGR) module for SPD, which ensures\nsemantic feature representation space consistency to fully distill the priors.\nWe demonstrate the effectiveness of our framework across multiple IR models and\ntasks, including deraining, deblurring, and denoising.\n","authors":["Quan Zhang","Xiaoyu Liu","Wei Li","Hanting Chen","Junchao Liu","Jie Hu","Zhiwei Xiong","Chun Yuan","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15048v2","updated":"2024-03-25T02:08:01Z","published":"2024-03-22T09:13:09Z","title":"Cartoon Hallucinations Detection: Pose-aware In Context Visual Learning","summary":" Large-scale Text-to-Image (TTI) models have become a common approach for\ngenerating training data in various generative fields. However, visual\nhallucinations, which contain perceptually critical defects, remain a concern,\nespecially in non-photorealistic styles like cartoon characters. We propose a\nnovel visual hallucination detection system for cartoon character images\ngenerated by TTI models. Our approach leverages pose-aware in-context visual\nlearning (PA-ICVL) with Vision-Language Models (VLMs), utilizing both RGB\nimages and pose information. By incorporating pose guidance from a fine-tuned\npose estimator, we enable VLMs to make more accurate decisions. Experimental\nresults demonstrate significant improvements in identifying visual\nhallucinations compared to baseline methods relying solely on RGB images. This\nresearch advances TTI models by mitigating visual hallucinations, expanding\ntheir potential in non-photorealistic domains.\n","authors":["Bumsoo Kim","Wonseop Shin","Kyuchul Lee","Sanghyun Seo"],"pdf_url":"https://arxiv.org/pdf/2403.15048v2.pdf","comment":"11 pages, 12 figures, 1 table, Project page:\n https://gh-bumsookim.github.io/Cartoon-Hallucinations-Detection/"},{"id":"http://arxiv.org/abs/2403.16365v1","updated":"2024-03-25T02:03:38Z","published":"2024-03-25T02:03:38Z","title":"Generating Potent Poisons and Backdoors from Scratch with Guided\n Diffusion","summary":" Modern neural networks are often trained on massive datasets that are web\nscraped with minimal human inspection. As a result of this insecure curation\npipeline, an adversary can poison or backdoor the resulting model by uploading\nmalicious data to the internet and waiting for a victim to scrape and train on\nit. Existing approaches for creating poisons and backdoors start with randomly\nsampled clean data, called base samples, and then modify those samples to craft\npoisons. However, some base samples may be significantly more amenable to\npoisoning than others. As a result, we may be able to craft more potent poisons\nby carefully choosing the base samples. 
In this work, we use guided diffusion\nto synthesize base samples from scratch that lead to significantly more potent\npoisons and backdoors than previous state-of-the-art attacks. Our Guided\nDiffusion Poisoning (GDP) base samples can be combined with any downstream\npoisoning or backdoor attack to boost its effectiveness. Our implementation\ncode is publicly available at: https://github.com/hsouri/GDP .\n","authors":["Hossein Souri","Arpit Bansal","Hamid Kazemi","Liam Fowl","Aniruddha Saha","Jonas Geiping","Andrew Gordon Wilson","Rama Chellappa","Tom Goldstein","Micah Goldblum"],"pdf_url":"https://arxiv.org/pdf/2403.16365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17516v3","updated":"2024-03-25T01:55:03Z","published":"2023-11-29T10:39:53Z","title":"MMA-Diffusion: MultiModal Attack on Diffusion Models","summary":" In recent years, Text-to-Image (T2I) models have seen remarkable\nadvancements, gaining widespread adoption. However, this progress has\ninadvertently opened avenues for potential misuse, particularly in generating\ninappropriate or Not-Safe-For-Work (NSFW) content. Our work introduces\nMMA-Diffusion, a framework that presents a significant and realistic threat to\nthe security of T2I models by effectively circumventing current defensive\nmeasures in both open-source models and commercial online services. Unlike\nprevious approaches, MMA-Diffusion leverages both textual and visual modalities\nto bypass safeguards like prompt filters and post-hoc safety checkers, thus\nexposing and highlighting the vulnerabilities in existing defense mechanisms.\n","authors":["Yijun Yang","Ruiyuan Gao","Xiaosen Wang","Tsung-Yi Ho","Nan Xu","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2311.17516v3.pdf","comment":"CVPR 2024. Code is available at\n https://github.com/yangyijune/MMA-Diffusion"},{"id":"http://arxiv.org/abs/2403.16361v1","updated":"2024-03-25T01:54:57Z","published":"2024-03-25T01:54:57Z","title":"RSTAR: Rotational Streak Artifact Reduction in 4D CBCT using Separable\n and Circular Convolutions","summary":" Four-dimensional cone-beam computed tomography (4D CBCT) provides\nrespiration-resolved images and can be used for image-guided radiation therapy.\nHowever, the ability to reveal respiratory motion comes at the cost of image\nartifacts. As raw projection data are sorted into multiple respiratory phases,\nthere is a limited number of cone-beam projections available for image\nreconstruction. Consequently, the 4D CBCT images are covered by severe streak\nartifacts. Although several deep learning-based methods have been proposed to\naddress this issue, most algorithms employ ordinary network models, neglecting\nthe intrinsic structural prior within 4D CBCT images. In this paper, we first\nexplore the origin and appearance of streak artifacts in 4D CBCT\nimages.Specifically, we find that streak artifacts exhibit a periodic\nrotational motion along with the patient's respiration. This unique motion\npattern inspires us to distinguish the artifacts from the desired anatomical\nstructures in the spatiotemporal domain. Thereafter, we propose a\nspatiotemporal neural network named RSTAR-Net with separable and circular\nconvolutions for Rotational Streak Artifact Reduction. The specially designed\nmodel effectively encodes dynamic image features, facilitating the recovery of\n4D CBCT images. Moreover, RSTAR-Net is also lightweight and computationally\nefficient. 
Extensive experiments substantiate the effectiveness of our proposed\nmethod, and RSTAR-Net shows superior performance to comparison methods.\n","authors":["Ziheng Deng","Hua Chen","Haibo Hu","Zhiyong Xu","Tianling Lyu","Yan Xi","Yang Chen","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.16361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09911v2","updated":"2024-03-25T01:54:41Z","published":"2023-08-19T05:34:13Z","title":"Noisy-Correspondence Learning for Text-to-Image Person Re-identification","summary":" Text-to-image person re-identification (TIReID) is a compelling topic in the\ncross-modal community, which aims to retrieve the target person based on a\ntextual query. Although numerous TIReID methods have been proposed and achieved\npromising performance, they implicitly assume the training image-text pairs are\ncorrectly aligned, which is not always the case in real-world scenarios. In\npractice, the image-text pairs inevitably exist under-correlated or even\nfalse-correlated, a.k.a noisy correspondence (NC), due to the low quality of\nthe images and annotation errors. To address this problem, we propose a novel\nRobust Dual Embedding method (RDE) that can learn robust visual-semantic\nassociations even with NC. Specifically, RDE consists of two main components:\n1) A Confident Consensus Division (CCD) module that leverages the dual-grained\ndecisions of dual embedding modules to obtain a consensus set of clean training\ndata, which enables the model to learn correct and reliable visual-semantic\nassociations. 2) A Triplet Alignment Loss (TAL) relaxes the conventional\nTriplet Ranking loss with the hardest negative samples to a log-exponential\nupper bound over all negative ones, thus preventing the model collapse under NC\nand can also focus on hard-negative samples for promising performance. We\nconduct extensive experiments on three public benchmarks, namely CUHK-PEDES,\nICFG-PEDES, and RSTPReID, to evaluate the performance and robustness of our\nRDE. Our method achieves state-of-the-art results both with and without\nsynthetic noisy correspondences on all three datasets. Code is available at\nhttps://github.com/QinYang79/RDE.\n","authors":["Yang Qin","Yingke Chen","Dezhong Peng","Xi Peng","Joey Tianyi Zhou","Peng Hu"],"pdf_url":"https://arxiv.org/pdf/2308.09911v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16358v1","updated":"2024-03-25T01:44:34Z","published":"2024-03-25T01:44:34Z","title":"ChebMixer: Efficient Graph Representation Learning with MLP Mixer","summary":" Graph neural networks have achieved remarkable success in learning graph\nrepresentations, especially graph Transformer, which has recently shown\nsuperior performance on various graph mining tasks. However, graph Transformer\ngenerally treats nodes as tokens, which results in quadratic complexity\nregarding the number of nodes during self-attention computation. The graph MLP\nMixer addresses this challenge by using the efficient MLP Mixer technique from\ncomputer vision. However, the time-consuming process of extracting graph tokens\nlimits its performance. In this paper, we present a novel architecture named\nChebMixer, a newly graph MLP Mixer that uses fast Chebyshev polynomials-based\nspectral filtering to extract a sequence of tokens. Firstly, we produce\nmultiscale representations of graph nodes via fast Chebyshev polynomial-based\nspectral filtering. 
Next, we consider each node's multiscale representations as\na sequence of tokens and refine the node representation with an effective MLP\nMixer. Finally, we aggregate the multiscale representations of nodes through\nChebyshev interpolation. Owing to the powerful representation capabilities and\nfast computational properties of MLP Mixer, we can quickly extract more\ninformative node representations to improve the performance of downstream\ntasks. The experimental results prove our significant improvements in a variety\nof scenarios ranging from graph node classification to medical image\nsegmentation.\n","authors":["Xiaoyan Kui","Haonan Yan","Qinsong Li","Liming Chen","Beiji Zou"],"pdf_url":"https://arxiv.org/pdf/2403.16358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11614v2","updated":"2024-03-25T01:23:07Z","published":"2024-03-18T09:44:44Z","title":"CRS-Diff: Controllable Generative Remote Sensing Foundation Model","summary":" The emergence of diffusion models has revolutionized the field of image\ngeneration, providing new methods for creating high-quality, high-resolution\nimages across various applications. However, the potential of these models for\ngenerating domain-specific images, particularly remote sensing (RS) images,\nremains largely untapped. RS images that are notable for their high resolution,\nextensive coverage, and rich information content, bring new challenges that\ngeneral diffusion models may not adequately address. This paper proposes\nCRS-Diff, a pioneering diffusion modeling framework specifically tailored for\ngenerating remote sensing imagery, leveraging the inherent advantages of\ndiffusion models while integrating advanced control mechanisms to ensure that\nthe imagery is not only visually clear but also enriched with geographic and\ntemporal information. The model integrates global and local control inputs,\nenabling precise combinations of generation conditions to refine the generation\nprocess. A comprehensive evaluation of CRS-Diff has demonstrated its superior\ncapability to generate RS imagery both in a single condition and multiple\nconditions compared with previous methods in terms of image quality and\ndiversity.\n","authors":["Datao Tang","Xiangyong Cao","Xingsong Hou","Zhongyuan Jiang","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2403.11614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17034v2","updated":"2024-03-25T01:21:18Z","published":"2023-11-28T18:45:13Z","title":"Telling Left from Right: Identifying Geometry-Aware Semantic\n Correspondence","summary":" While pre-trained large-scale vision models have shown significant promise\nfor semantic correspondence, their features often struggle to grasp the\ngeometry and orientation of instances. This paper identifies the importance of\nbeing geometry-aware for semantic correspondence and reveals a limitation of\nthe features of current foundation models under simple post-processing. We show\nthat incorporating this information can markedly enhance semantic\ncorrespondence performance with simple but effective solutions in both\nzero-shot and supervised settings. We also construct a new challenging\nbenchmark for semantic correspondence built from an existing animal pose\nestimation dataset, for both pre-training validating models. Our method\nachieves a PCK@0.10 score of 65.4 (zero-shot) and 85.6 (supervised) on the\nchallenging SPair-71k dataset, outperforming the state of the art by 5.5p and\n11.0p absolute gains, respectively. 
Our code and datasets are publicly\navailable at: https://telling-left-from-right.github.io/.\n","authors":["Junyi Zhang","Charles Herrmann","Junhwa Hur","Eric Chen","Varun Jampani","Deqing Sun","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2311.17034v2.pdf","comment":"Accepted by CVPR 24, project page:\n https://telling-left-from-right.github.io/"},{"id":"http://arxiv.org/abs/2403.14743v2","updated":"2024-03-25T01:18:37Z","published":"2024-03-21T18:00:00Z","title":"VURF: A General-purpose Reasoning and Self-refinement Framework for\n Video Understanding","summary":" Recent studies have demonstrated the effectiveness of Large Language Models\n(LLMs) as reasoning modules that can deconstruct complex tasks into more\nmanageable sub-tasks, particularly when applied to visual reasoning tasks for\nimages. In contrast, this paper introduces a Video Understanding and Reasoning\nFramework (VURF) based on the reasoning power of LLMs. Ours is a novel approach\nto extend the utility of LLMs in the context of video tasks, leveraging their\ncapacity to generalize from minimal input and output demonstrations within a\ncontextual framework. By presenting LLMs with pairs of instructions and their\ncorresponding high-level programs, we harness their contextual learning\ncapabilities to generate executable visual programs for video understanding. To\nenhance the program's accuracy and robustness, we implement two important\nstrategies. Firstly, we employ a feedback-generation approach, powered by\nGPT-3.5, to rectify errors in programs utilizing unsupported functions.\nSecondly, taking motivation from recent works on self refinement of LLM\noutputs, we introduce an iterative procedure for improving the quality of the\nin-context examples by aligning the initial outputs to the outputs that would\nhave been generated had the LLM not been bound by the structure of the\nin-context examples. Our results on several video-specific tasks, including\nvisual QA, video anticipation, pose estimation and multi-video QA illustrate\nthe efficacy of these enhancements in improving the performance of visual\nprogramming approaches for video tasks.\n","authors":["Ahmad Mahmood","Ashmal Vayani","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.14743v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10119v2","updated":"2024-03-25T01:08:14Z","published":"2024-03-15T09:08:27Z","title":"URS-NeRF: Unordered Rolling Shutter Bundle Adjustment for Neural\n Radiance Fields","summary":" We propose a novel rolling shutter bundle adjustment method for neural\nradiance fields (NeRF), which utilizes the unordered rolling shutter (RS)\nimages to obtain the implicit 3D representation. Existing NeRF methods suffer\nfrom low-quality images and inaccurate initial camera poses due to the RS\neffect in the image, whereas the previous method that incorporates the RS into\nNeRF requires strict sequential data input, limiting its widespread\napplicability. In contrast, our method recovers the physical formation of RS\nimages by estimating camera poses and velocities, thereby removing the input\nconstraints on sequential data. Moreover, we adopt a coarse-to-fine training\nstrategy, in which the RS epipolar constraints of the pairwise frames in the\nscene graph are used to detect the camera poses that fall into local minima.\nThe poses detected as outliers are corrected by the interpolation method with\nneighboring poses. 
The experimental results validate the effectiveness of our\nmethod over state-of-the-art works and demonstrate that the reconstruction of\n3D representations is not constrained by the requirement of video sequence\ninput.\n","authors":["Bo Xu","Ziao Liu","Mengqi Guo","Jiancheng Li","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2403.10119v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.05453v2","updated":"2024-03-25T01:00:38Z","published":"2021-06-10T01:45:32Z","title":"Improving White-box Robustness of Pre-processing Defenses via Joint\n Adversarial Training","summary":" Deep neural networks (DNNs) are vulnerable to adversarial noise. A range of\nadversarial defense techniques have been proposed to mitigate the interference\nof adversarial noise, among which the input pre-processing methods are scalable\nand show great potential to safeguard DNNs. However, pre-processing methods may\nsuffer from the robustness degradation effect, in which the defense reduces\nrather than improving the adversarial robustness of a target model in a\nwhite-box setting. A potential cause of this negative effect is that\nadversarial training examples are static and independent to the pre-processing\nmodel. To solve this problem, we investigate the influence of full adversarial\nexamples which are crafted against the full model, and find they indeed have a\npositive impact on the robustness of defenses. Furthermore, we find that simply\nchanging the adversarial training examples in pre-processing methods does not\ncompletely alleviate the robustness degradation effect. This is due to the\nadversarial risk of the pre-processed model being neglected, which is another\ncause of the robustness degradation effect. Motivated by above analyses, we\npropose a method called Joint Adversarial Training based Pre-processing (JATP)\ndefense. Specifically, we formulate a feature similarity based adversarial risk\nfor the pre-processing model by using full adversarial examples found in a\nfeature space. Unlike standard adversarial training, we only update the\npre-processing model, which prompts us to introduce a pixel-wise loss to\nimprove its cross-model transferability. We then conduct a joint adversarial\ntraining on the pre-processing model to minimize this overall risk. Empirical\nresults show that our method could effectively mitigate the robustness\ndegradation effect across different target models in comparison to previous\nstate-of-the-art approaches.\n","authors":["Dawei Zhou","Nannan Wang","Xinbo Gao","Bo Han","Jun Yu","Xiaoyu Wang","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2106.05453v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16350v1","updated":"2024-03-25T00:59:35Z","published":"2024-03-25T00:59:35Z","title":"3D-EffiViTCaps: 3D Efficient Vision Transformer with Capsule for Medical\n Image Segmentation","summary":" Medical image segmentation (MIS) aims to finely segment various organs. It\nrequires grasping global information from both parts and the entire image for\nbetter segmenting, and clinically there are often certain requirements for\nsegmentation efficiency. Convolutional neural networks (CNNs) have made\nconsiderable achievements in MIS. However, they are difficult to fully collect\nglobal context information and their pooling layer may cause information loss.\nCapsule networks, which combine the benefits of CNNs while taking into account\nadditional information such as relative location that CNNs do not, have lately\ndemonstrated some advantages in MIS. 
Vision Transformer (ViT) employs\ntransformers in visual tasks. The Transformer, based on the attention mechanism, has\nexcellent global inductive modeling capabilities and is expected to capture\nlong-range information. Moreover, there have been recent studies on making ViT\nmore lightweight to minimize model complexity and increase efficiency. In this\npaper, we propose a U-shaped 3D encoder-decoder network named 3D-EffiViTCaps,\nwhich combines 3D capsule blocks with 3D EfficientViT blocks for MIS. Our\nencoder uses capsule blocks and EfficientViT blocks to jointly capture local\nand global semantic information more effectively and efficiently with less\ninformation loss, while the decoder employs CNN blocks and EfficientViT blocks\nto catch finer details for segmentation. We conduct experiments on various\ndatasets, including iSeg-2017, Hippocampus and Cardiac to verify the\nperformance and efficiency of 3D-EffiViTCaps, which performs better than\nprevious 3D CNN-based, 3D Capsule-based and 3D Transformer-based models. We\nfurther implement a series of ablation experiments on the main blocks. Our code\nis available at: https://github.com/HidNeuron/3D-EffiViTCaps.\n","authors":["Dongwei Gan","Ming Chang","Juan Chen"],"pdf_url":"https://arxiv.org/pdf/2403.16350v1.pdf","comment":"15 pages, 4 figures, submitted to ICPR2024"},{"id":"http://arxiv.org/abs/2301.06626v2","updated":"2024-03-25T00:45:30Z","published":"2023-01-16T22:30:53Z","title":"Masked Vector Quantization","summary":" Generative models with discrete latent representations have recently\ndemonstrated an impressive ability to learn complex high-dimensional data\ndistributions. However, their performance relies on a long sequence of tokens\nper instance and a large number of codebook entries, resulting in long sampling\ntimes and considerable computation to fit the categorical posterior. To address\nthese issues, we propose the Masked Vector Quantization (MVQ) framework which\nincreases the representational capacity of each code vector by learning mask\nconfigurations via a stochastic winner-takes-all training regime called\nMultiple Hypothese Dropout (MH-Dropout). On ImageNet 64$\\times$64, MVQ reduces\nFID in existing vector quantization architectures by up to $68\\%$ at 2 tokens\nper instance and $57\\%$ at 5 tokens. These improvements widen as codebook\nentries are reduced and allow for $7\\textit{--}45\\times$ speed-up in token\nsampling during inference. As an additional benefit, we find that smaller\nlatent spaces lead to MVQ identifying transferable visual representations where\nmultiple can be smoothly combined.\n","authors":["David D. Nguyen","David Leibowitz","Surya Nepal","Salil S. Kanhere"],"pdf_url":"https://arxiv.org/pdf/2301.06626v2.pdf","comment":"A newer version of this manuscript was archived under 2312.11735"},{"id":"http://arxiv.org/abs/2403.16338v1","updated":"2024-03-25T00:24:10Z","published":"2024-03-25T00:24:10Z","title":"Impact of Video Compression Artifacts on Fisheye Camera Visual\n Perception Tasks","summary":" Autonomous driving systems require extensive data collection schemes to cover\nthe diverse scenarios needed for building a robust and safe system. The data\nvolumes are in the order of Exabytes and have to be stored for a long period of\ntime (i.e., more than 10 years of the vehicle's life cycle). Lossless\ncompression doesn't provide sufficient compression ratios, hence, lossy video\ncompression has been explored. 
It is essential to prove that lossy video\ncompression artifacts do not impact the performance of the perception\nalgorithms. However, there is limited work in this area to provide a solid\nconclusion. In particular, there is no such work for fisheye cameras, which\nhave high radial distortion and where compression may have higher artifacts.\nFisheye cameras are commonly used in automotive systems for 3D object detection\ntask. In this work, we provide the first analysis of the impact of standard\nvideo compression codecs on wide FOV fisheye camera images. We demonstrate that\nthe achievable compression with negligible impact depends on the dataset and\ntemporal prediction of the video codec. We propose a radial distortion-aware\nzonal metric to evaluate the performance of artifacts in fisheye images. In\naddition, we present a novel method for estimating affine mode parameters of\nthe latest VVC codec, and suggest some areas for improvement in video codecs\nfor the application to fisheye imagery.\n","authors":["Madhumitha Sakthi","Louis Kerofsky","Varun Ravi Kumar","Senthil Yogamani"],"pdf_url":"https://arxiv.org/pdf/2403.16338v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16335v1","updated":"2024-03-25T00:17:43Z","published":"2024-03-25T00:17:43Z","title":"MEDDAP: Medical Dataset Enhancement via Diversified Augmentation\n Pipeline","summary":" The effectiveness of Deep Neural Networks (DNNs) heavily relies on the\nabundance and accuracy of available training data. However, collecting and\nannotating data on a large scale is often both costly and time-intensive,\nparticularly in medical cases where practitioners are already occupied with\ntheir duties. Moreover, ensuring that the model remains robust across various\nscenarios of image capture is crucial in medical domains, especially when\ndealing with ultrasound images that vary based on the settings of different\ndevices and the manual operation of the transducer. To address this challenge,\nwe introduce a novel pipeline called MEDDAP, which leverages Stable Diffusion\n(SD) models to augment existing small datasets by automatically generating new\ninformative labeled samples. Pretrained checkpoints for SD are typically based\non natural images, and training them for medical images requires significant\nGPU resources due to their heavy parameters. To overcome this challenge, we\nintroduce USLoRA (Ultrasound Low-Rank Adaptation), a novel fine-tuning method\ntailored specifically for ultrasound applications. USLoRA allows for selective\nfine-tuning of weights within SD, requiring fewer than 0.1\\% of parameters\ncompared to fully fine-tuning only the UNet portion of SD. To enhance dataset\ndiversity, we incorporate different adjectives into the generation process\nprompts, thereby desensitizing the classifiers to intensity changes across\ndifferent images. This approach is inspired by clinicians' decision-making\nprocesses regarding breast tumors, where tumor shape often plays a more crucial\nrole than intensity. In conclusion, our pipeline not only outperforms\nclassifiers trained on the original dataset but also demonstrates superior\nperformance when encountering unseen datasets. 
The source code is available at\nhttps://github.com/yasamin-med/MEDDAP.\n","authors":["Yasamin Medghalchi","Niloufar Zakariaei","Arman Rahmim","Ilker Hacihaliloglu"],"pdf_url":"https://arxiv.org/pdf/2403.16335v1.pdf","comment":"Submitted to MICCAI 2024"},{"id":"http://arxiv.org/abs/2403.15388v2","updated":"2024-03-25T17:59:55Z","published":"2024-03-22T17:59:52Z","title":"LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal\n Models","summary":" Large Multimodal Models (LMMs) have shown significant reasoning capabilities\nby connecting a visual encoder and a large language model. LMMs typically use a\nfixed amount of visual tokens, such as the penultimate layer features in the\nCLIP visual encoder, as the prefix content. Recent LMMs incorporate more\ncomplex visual inputs, such as high-resolution images and videos, which\nincrease the number of visual tokens significantly. However, due to the design\nof the Transformer architecture, computational costs associated with these\nmodels tend to increase quadratically with the number of input tokens. To\ntackle this problem, we explore a token reduction mechanism and find, similar\nto prior work, that many visual tokens are spatially redundant. Based on this,\nwe propose PruMerge, a novel adaptive visual token reduction approach, which\nlargely reduces the number of visual tokens while maintaining comparable model\nperformance. We first select the unpruned visual tokens based on their\nsimilarity to class tokens and spatial tokens. We then cluster the pruned\ntokens based on key similarity and merge the clustered tokens with the unpruned\ntokens to supplement their information. Empirically, when applied to LLaVA-1.5,\nour approach can compress the visual tokens by 18 times on average, and achieve\ncomparable performance across diverse visual question-answering and reasoning\ntasks. Code and checkpoints are at https://llava-prumerge.github.io/.\n","authors":["Yuzhang Shang","Mu Cai","Bingxin Xu","Yong Jae Lee","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2403.15388v2.pdf","comment":"Project page: https://llava-prumerge.github.io/"},{"id":"http://arxiv.org/abs/2403.15317v2","updated":"2024-03-25T16:45:41Z","published":"2024-03-22T16:11:29Z","title":"Point-DETR3D: Leveraging Imagery Data with Spatial Point Prior for\n Weakly Semi-supervised 3D Object Detection","summary":" Training high-accuracy 3D detectors necessitates massive labeled 3D\nannotations with 7 degrees of freedom, which is laborious and time-consuming.\nTherefore, the form of point annotations is proposed to offer significant\nprospects for practical applications in 3D detection, which is not only more\naccessible and less expensive but also provides strong spatial information for\nobject localization. In this paper, we empirically discover that it is\nnon-trivial to merely adapt Point-DETR to its 3D form, encountering two main\nbottlenecks: 1) it fails to encode strong 3D prior into the model, and 2) it\ngenerates low-quality pseudo labels in distant regions due to the extreme\nsparsity of LiDAR points. 
To overcome these challenges, we introduce\nPoint-DETR3D, a teacher-student framework for weakly semi-supervised 3D\ndetection, designed to fully capitalize on point-wise supervision within a\nconstrained instance-wise annotation budget. Different from Point-DETR, which\nencodes 3D positional information solely through a point encoder, we propose an\nexplicit positional query initialization strategy to enhance the positional\nprior. Considering the low quality of pseudo labels at distant regions produced\nby the teacher model, we enhance the detector's perception by incorporating\ndense imagery data through a novel Cross-Modal Deformable RoI Fusion\n(D-RoI). Moreover, an innovative point-guided self-supervised learning technique\nis proposed to allow for fully exploiting point priors, even in student\nmodels. Extensive experiments on the representative nuScenes dataset demonstrate that our\nPoint-DETR3D obtains significant improvements compared to previous works.\nNotably, with only 5% of labeled data, Point-DETR3D achieves over 90%\nperformance of its fully supervised counterpart.\n","authors":["Hongzhi Gao","Zheng Chen","Zehui Chen","Lin Chen","Jiaming Liu","Shanghang Zhang","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.15317v2.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2403.15011v2","updated":"2024-03-25T14:50:47Z","published":"2024-03-22T07:49:55Z","title":"Cell Tracking according to Biological Needs -- Strong Mitosis-aware\n Random-finite Sets Tracker with Aleatoric Uncertainty","summary":" Cell tracking and segmentation assist biologists in extracting insights from\nlarge-scale microscopy time-lapse data. Driven by local accuracy metrics,\ncurrent tracking approaches often suffer from a lack of long-term consistency.\nTo address this issue, we introduce an uncertainty estimation technique for\nneural tracking-by-regression frameworks and incorporate it into our novel\nextended Poisson multi-Bernoulli mixture tracker. Our uncertainty estimation\nidentifies uncertain associations within high-performing tracking-by-regression\nmethods using problem-specific test-time augmentations. Leveraging this\nuncertainty, along with a novel mitosis-aware assignment problem formulation,\nour tracker resolves false associations and mitosis detections stemming from\nlong-term conflicts. We evaluate our approach on nine competitive datasets and\ndemonstrate that it outperforms the current state-of-the-art on biologically\nrelevant metrics substantially, achieving improvements by a factor of\napproximately $5.75$. Furthermore, we uncover new insights into the behavior of\ntracking-by-regression uncertainty.\n","authors":["Timo Kaiser","Maximilian Schier","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2403.15011v2.pdf","comment":"23 pages, 10 figures, 5 tables"},{"id":"http://arxiv.org/abs/2312.02365v2","updated":"2024-03-25T23:52:15Z","published":"2023-12-04T21:46:39Z","title":"MEDPSeg: Hierarchical polymorphic multitask learning for the\n segmentation of ground-glass opacities, consolidation, and pulmonary\n structures on computed tomography","summary":" The COVID-19 pandemic response highlighted the potential of deep learning\nmethods in facilitating the diagnosis, prognosis and understanding of lung\ndiseases through automated segmentation of pulmonary structures and lesions in\nchest computed tomography (CT). 
Automated separation of lung lesions into\nground-glass opacity (GGO) and consolidation is hindered due to the\nlabor-intensive and subjective nature of this task, resulting in scarce\navailability of ground truth for supervised learning. To tackle this problem,\nwe propose MEDPSeg. MEDPSeg learns from heterogeneous chest CT targets through\nhierarchical polymorphic multitask learning (HPML). HPML explores the\nhierarchical nature of GGO and consolidation, lung lesions, and the lungs, with\nfurther benefits achieved through multitasking airway and pulmonary artery\nsegmentation. Over 6000 volumetric CT scans from different partially labeled\nsources were used for training and testing. Experiments show PML enabling new\nstate-of-the-art performance for GGO and consolidation segmentation tasks. In\naddition, MEDPSeg simultaneously performs segmentation of the lung parenchyma,\nairways, pulmonary artery, and lung lesions, all in a single forward\nprediction, with performance comparable to state-of-the-art methods specialized\nin each of those targets. Finally, we provide an open-source implementation\nwith a graphical user interface at https://github.com/MICLab-Unicamp/medpseg.\n","authors":["Diedre S. Carmo","Jean A. Ribeiro","Alejandro P. Comellas","Joseph M. Reinhardt","Sarah E. Gerard","Letícia Rittner","Roberto A. Lotufo"],"pdf_url":"https://arxiv.org/pdf/2312.02365v2.pdf","comment":"This manuscript is under review and might change in the future"},{"id":"http://arxiv.org/abs/2312.00412v2","updated":"2024-03-25T23:40:29Z","published":"2023-12-01T08:22:34Z","title":"SCHEME: Scalable Channel Mixer for Vision Transformers","summary":" Vision Transformers have received significant attention due to their\nimpressive performance in many vision tasks. While the token mixer or attention\nblock has been studied in great detail, the channel mixer or feature mixing\nblock (FFN or MLP) has not been explored in depth, although it accounts for the bulk\nof the parameters and computation in a model. In this work, we study whether\nsparse feature mixing can replace the dense connections and confirm this with a\nblock diagonal MLP structure that improves the accuracy by supporting larger\nexpansion ratios. To improve the feature clusters formed by this structure and\nthereby further improve the accuracy, a lightweight, parameter-free, channel\ncovariance attention (CCA) mechanism is introduced as a parallel branch during\ntraining. This design of CCA enables gradual feature mixing across channel\ngroups during training, whose contribution decays to zero as the training\nprogresses to convergence. This allows the CCA block to be discarded during\ninference, thus enabling enhanced performance with no additional computational\ncost. The resulting $\\textit{Scalable CHannEl MixEr}$ (SCHEME) can be plugged\ninto any ViT architecture to obtain a gamut of models with different trade-offs\nbetween complexity and performance by controlling the block diagonal structure\nsize in the MLP. This is shown by the introduction of a new family of\nSCHEMEformer models that establish new Pareto frontiers for\naccuracy vs FLOPS, accuracy vs model size, and accuracy vs throughput,\nespecially for fast transformers of small model size. 
For example, the\nSCHEMEformer establishes a new SOTA of 79.7% accuracy for ViTs using pure\nattention mixers on ImageNet-1K at 1.77G FLOPs.\n","authors":["Deepak Sridhar","Yunsheng Li","Nuno Vasconcelos"],"pdf_url":"https://arxiv.org/pdf/2312.00412v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2310.10971v2","updated":"2024-03-25T23:14:28Z","published":"2023-10-17T03:35:27Z","title":"Context-Aware Meta-Learning","summary":" Large Language Models like ChatGPT demonstrate a remarkable capacity to learn\nnew concepts during inference without any fine-tuning. However, visual models\ntrained to detect new objects during inference have been unable to replicate\nthis ability, and instead either perform poorly or require meta-training and/or\nfine-tuning on similar objects. In this work, we propose a meta-learning\nalgorithm that emulates Large Language Models by learning new visual concepts\nduring inference without fine-tuning. Our approach leverages a frozen\npre-trained feature extractor, and analogous to in-context learning, recasts\nvisual meta-learning as sequence modeling over datapoints with known labels and\na test datapoint with an unknown label. On 8 out of 11 meta-learning\nbenchmarks, our approach -- without meta-training or fine-tuning -- exceeds or\nmatches the state-of-the-art algorithm, P>M>F, which is meta-trained on these\nbenchmarks. Our code is available at https://github.com/cfifty/CAML.\n","authors":["Christopher Fifty","Dennis Duan","Ronald G. Junkins","Ehsan Amid","Jure Leskovec","Christopher Re","Sebastian Thrun"],"pdf_url":"https://arxiv.org/pdf/2310.10971v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.17255v1","updated":"2024-03-25T23:03:51Z","published":"2024-03-25T23:03:51Z","title":"Decoding the visual attention of pathologists to reveal their level of\n expertise","summary":" We present a method for classifying the expertise of a pathologist based on\nhow they allocated their attention during a cancer reading. We engage this\ndecoding task by developing a novel method for predicting the attention of\npathologists as they read whole-slide Images (WSIs) of prostate and make cancer\ngrade classifications. Our ground truth measure of a pathologists' attention is\nthe x, y and z (magnification) movement of their viewport as they navigated\nthrough WSIs during readings, and to date we have the attention behavior of 43\npathologists reading 123 WSIs. These data revealed that specialists have higher\nagreement in both their attention and cancer grades compared to general\npathologists and residents, suggesting that sufficient information may exist in\ntheir attention behavior to classify their expertise level. To attempt this, we\ntrained a transformer-based model to predict the visual attention heatmaps of\nresident, general, and specialist (GU) pathologists during Gleason grading.\nBased solely on a pathologist's attention during a reading, our model was able\nto predict their level of expertise with 75.3%, 56.1%, and 77.2% accuracy,\nrespectively, better than chance and baseline models. Our model therefore\nenables a pathologist's expertise level to be easily and objectively evaluated,\nimportant for pathology training and competency assessment. Tools developed\nfrom our model could also be used to help pathology trainees learn how to read\nWSIs like an expert.\n","authors":["Souradeep Chakraborty","Dana Perez","Paul Friedman","Natallia Sheuka","Constantin Friedman","Oksana Yaskiv","Rajarsi Gupta","Gregory J. Zelinsky","Joel H. 
Saltz","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2403.17255v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17237v1","updated":"2024-03-25T22:34:05Z","published":"2024-03-25T22:34:05Z","title":"DreamPolisher: Towards High-Quality Text-to-3D Generation via Geometric\n Diffusion","summary":" We present DreamPolisher, a novel Gaussian Splatting based method with\ngeometric guidance, tailored to learn cross-view consistency and intricate\ndetail from textual descriptions. While recent progress on text-to-3D\ngeneration methods have been promising, prevailing methods often fail to ensure\nview-consistency and textural richness. This problem becomes particularly\nnoticeable for methods that work with text input alone. To address this, we\npropose a two-stage Gaussian Splatting based approach that enforces geometric\nconsistency among views. Initially, a coarse 3D generation undergoes refinement\nvia geometric optimization. Subsequently, we use a ControlNet driven refiner\ncoupled with the geometric consistency term to improve both texture fidelity\nand overall consistency of the generated 3D asset. Empirical evaluations across\ndiverse textual prompts spanning various object categories demonstrate the\nefficacy of DreamPolisher in generating consistent and realistic 3D objects,\naligning closely with the semantics of the textual instructions.\n","authors":["Yuanze Lin","Ronald Clark","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2403.17237v1.pdf","comment":"Project webpage: https://yuanze-lin.me/DreamPolisher_page/"},{"id":"http://arxiv.org/abs/2312.12735v2","updated":"2024-03-25T22:25:35Z","published":"2023-12-20T03:16:34Z","title":"MetaSegNet: Metadata-collaborative Vision-Language Representation\n Learning for Semantic Segmentation of Remote Sensing Images","summary":" Semantic segmentation of remote sensing images plays a vital role in a wide\nrange of Earth Observation (EO) applications, such as land use land cover\nmapping, environment monitoring, and sustainable development. Driven by rapid\ndevelopments in Artificial Intelligence (AI), deep learning (DL) has emerged as\nthe mainstream tool for semantic segmentation and has achieved many\nbreakthroughs in the field of remote sensing. However, the existing DL-based\nmethods mainly focus on unimodal visual data while ignoring the rich multimodal\ninformation involved in the real world, usually demonstrating weak reliability\nand generlization. Inspired by the success of Vision Transformers and large\nlanguage models, we propose a novel metadata-collaborative multimodal\nsegmentation network (MetaSegNet) that applies vision-language representation\nlearning for semantic segmentation of remote sensing images. Unlike the common\nmodel structure that only uses unimodal visual data, we extract the key\ncharacteristic (e.g. the climate zone) from freely available remote sensing\nimage metadata and transfer it into knowledge-based text prompts via the\ngeneric ChatGPT. Then, we construct an image encoder, a text encoder and a\ncross-modal attention fusion subnetwork to extract the image and text feature\nand apply image-text interaction. 
Benefiting from such a design, the proposed\nMetaSegNet demonstrates superior generalization and achieves competitive\naccuracy with the state-of-the-art semantic segmentation methods on the\nlarge-scale OpenEarthMap dataset (68.6% mIoU) and Potsdam dataset (93.3% mean\nF1 score) as well as LoveDA dataset (52.2% mIoU).\n","authors":["Libo Wang","Sijun Dong","Ying Chen","Xiaoliang Meng","Shenghui Fang","Ayman Habib","Songlin Fei"],"pdf_url":"https://arxiv.org/pdf/2312.12735v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15101v3","updated":"2024-03-25T22:13:44Z","published":"2023-12-22T22:46:48Z","title":"Fix-Con: Automatic Fault Localization and Repair of Deep Learning Model\n Conversions between Frameworks","summary":" Converting deep learning models between frameworks is a common step to\nmaximize model compatibility across devices and leverage optimization features\nthat may be exclusively provided in one deep learning framework. However, this\nconversion process may be riddled with bugs, making the converted models either\nundeployable or problematic, considerably degrading their prediction\ncorrectness.\n In this paper we propose an automated approach for fault localization and\nrepair, Fix-Con, during model conversion between deep learning frameworks.\nFix-Con is capable of detecting and fixing faults introduced in model input,\nparameters, hyperparameters, and the model graph during conversion.\n Fix-Con uses a set of fault types (mined from surveying conversion issues\nreported in code repositories and forums) to localize potential\nconversion faults in the converted target model and then repair them\nappropriately, e.g., replacing the parameters of the target model with those\nfrom the source model. This is done iteratively for every image in the dataset,\ncomparing output label differences between the source model and the converted\ntarget model until all differences are resolved. We evaluate the effectiveness\nof Fix-Con in fixing model conversion bugs of three widely used image\nrecognition models converted across four different deep learning frameworks.\nOverall, Fix-Con was able to fix $462$ out of $755$ detected conversion faults,\neither completely repairing or significantly improving the performance of $14$\nout of the $15$ erroneous conversion cases.\n","authors":["Nikolaos Louloudakis","Perry Gibson","José Cano","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2312.15101v3.pdf","comment":"12 pages, 4 figures, 3 tables, 1 algorithm"},{"id":"http://arxiv.org/abs/1905.10711v5","updated":"2024-03-25T22:10:45Z","published":"2019-05-26T01:58:28Z","title":"DISN: Deep Implicit Surface Network for High-quality Single-view 3D\n Reconstruction","summary":" Reconstructing 3D shapes from single-view images has been a long-standing\nresearch problem. In this paper, we present DISN, a Deep Implicit Surface\nNetwork which can generate a high-quality detail-rich 3D mesh from a 2D image\nby predicting the underlying signed distance fields. In addition to utilizing\nglobal image features, DISN predicts the projected location for each 3D point\non the 2D image, and extracts local features from the image feature maps.\nCombining global and local features significantly improves the accuracy of the\nsigned distance field prediction, especially for the detail-rich areas. To the\nbest of our knowledge, DISN is the first method that constantly captures\ndetails such as holes and thin structures present in 3D shapes from single-view\nimages. 
DISN achieves the state-of-the-art single-view reconstruction\nperformance on a variety of shape categories reconstructed from both synthetic\nand real images. Code is available at https://github.com/xharlie/DISN The\nsupplementary can be found at\nhttps://xharlie.github.io/images/neurips_2019_supp.pdf\n","authors":["Qiangeng Xu","Weiyue Wang","Duygu Ceylan","Radomir Mech","Ulrich Neumann"],"pdf_url":"https://arxiv.org/pdf/1905.10711v5.pdf","comment":"This project was in part supported by the gift funding to the\n University of Southern California from Adobe Research"},{"id":"http://arxiv.org/abs/2403.17223v1","updated":"2024-03-25T21:53:36Z","published":"2024-03-25T21:53:36Z","title":"Co-Occurring of Object Detection and Identification towards unlabeled\n object discovery","summary":" In this paper, we propose a novel deep learning based approach for\nidentifying co-occurring objects in conjunction with base objects in multilabel\nobject categories. Nowadays, with the advancement in computer vision based\ntechniques we need to know about co-occurring objects with respect to base\nobject for various purposes. The pipeline of the proposed work is composed of\ntwo stages: in the first stage of the proposed model we detect all the bounding\nboxes present in the image and their corresponding labels, then in the second\nstage we perform co-occurrence matrix analysis. In co-occurrence matrix\nanalysis, we set base classes based on the maximum occurrences of the labels\nand build association rules and generate frequent patterns. These frequent\npatterns will show base classes and their corresponding co-occurring classes.\nWe performed our experiments on two publicly available datasets: Pascal VOC and\nMS-COCO. The experimental results on public benchmark dataset is reported in\nSec 4. Further we extend this work by considering all frequently objects as\nunlabeled and what if they are occluded as well.\n","authors":["Binay Kumar Singh","Niels Da Vitoria Lobo"],"pdf_url":"https://arxiv.org/pdf/2403.17223v1.pdf","comment":"6 pages, 2 figures,"},{"id":"http://arxiv.org/abs/2403.17217v1","updated":"2024-03-25T21:46:53Z","published":"2024-03-25T21:46:53Z","title":"DiffusionAct: Controllable Diffusion Autoencoder for One-shot Face\n Reenactment","summary":" Video-driven neural face reenactment aims to synthesize realistic facial\nimages that successfully preserve the identity and appearance of a source face,\nwhile transferring the target head pose and facial expressions. Existing\nGAN-based methods suffer from either distortions and visual artifacts or poor\nreconstruction quality, i.e., the background and several important appearance\ndetails, such as hair style/color, glasses and accessories, are not faithfully\nreconstructed. Recent advances in Diffusion Probabilistic Models (DPMs) enable\nthe generation of high-quality realistic images. To this end, in this paper we\npresent DiffusionAct, a novel method that leverages the photo-realistic image\ngeneration of diffusion models to perform neural face reenactment.\nSpecifically, we propose to control the semantic space of a Diffusion\nAutoencoder (DiffAE), in order to edit the facial pose of the input images,\ndefined as the head pose orientation and the facial expressions. Our method\nallows one-shot, self, and cross-subject reenactment, without requiring\nsubject-specific fine-tuning. 
We compare against state-of-the-art GAN-,\nStyleGAN2-, and diffusion-based methods, showing better or on-par reenactment\nperformance.\n","authors":["Stella Bounareli","Christos Tzelepis","Vasileios Argyriou","Ioannis Patras","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2403.17217v1.pdf","comment":"Project page: https://stelabou.github.io/diffusionact/"},{"id":"http://arxiv.org/abs/2403.17213v1","updated":"2024-03-25T21:40:44Z","published":"2024-03-25T21:40:44Z","title":"AnimateMe: 4D Facial Expressions via Diffusion Models","summary":" The field of photorealistic 3D avatar reconstruction and generation has\ngarnered significant attention in recent years; however, animating such avatars\nremains challenging. Recent advances in diffusion models have notably enhanced\nthe capabilities of generative models in 2D animation. In this work, we\ndirectly utilize these models within the 3D domain to achieve controllable and\nhigh-fidelity 4D facial animation. By integrating the strengths of diffusion\nprocesses and geometric deep learning, we employ Graph Neural Networks (GNNs)\nas denoising diffusion models in a novel approach, formulating the diffusion\nprocess directly on the mesh space and enabling the generation of 3D facial\nexpressions. This facilitates the generation of facial deformations through a\nmesh-diffusion-based model. Additionally, to ensure temporal coherence in our\nanimations, we propose a consistent noise sampling method. Under a series of\nboth quantitative and qualitative experiments, we showcase that the proposed\nmethod outperforms prior work in 4D expression synthesis by generating\nhigh-fidelity extreme expressions. Furthermore, we applied our method to\ntextured 4D facial expression generation, implementing a straightforward\nextension that involves training on a large-scale textured 4D facial expression\ndatabase.\n","authors":["Dimitrios Gerogiannis","Foivos Paraperas Papantoniou","Rolandos Alexandros Potamias","Alexandros Lattas","Stylianos Moschoglou","Stylianos Ploumpis","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2403.17213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06157v5","updated":"2024-03-25T21:23:11Z","published":"2023-06-10T23:50:02Z","title":"Fault Localization for Buggy Deep Learning Framework Conversions in\n Image Recognition","summary":" When deploying Deep Neural Networks (DNNs), developers often convert models\nfrom one deep learning framework to another (e.g., TensorFlow to PyTorch).\nHowever, this process is error-prone and can impact target model accuracy. To\nidentify the extent of such impact, we perform and briefly present a\ndifferential analysis against three DNNs widely used for image recognition\n(MobileNetV2, ResNet101, and InceptionV3) converted across four well-known deep\nlearning frameworks (PyTorch, Keras, TensorFlow (TF), and TFLite), which\nrevealed numerous model crashes and output label discrepancies of up to 100%.\nTo mitigate such errors, we present a novel approach towards fault localization\nand repair of buggy deep learning framework conversions, focusing on\npre-trained image recognition models. Our technique consists of four stages of\nanalysis: 1) conversion tools, 2) model parameters, 3) model hyperparameters,\nand 4) graph representation. In addition, we propose various strategies towards\nfault repair of the faults detected. 
We implement our technique on top of the\nApache TVM deep learning compiler, and we test it by conducting a preliminary\nfault localization analysis for the conversion of InceptionV3 from TF to\nTFLite. Our approach detected a fault in a common DNN converter tool, which\nintroduced precision errors in weights, reducing model accuracy. After our\nfault localization, we repaired the issue, reducing our conversion error to\nzero.\n","authors":["Nikolaos Louloudakis","Perry Gibson","José Cano","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2306.06157v5.pdf","comment":"5 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2403.17192v1","updated":"2024-03-25T21:08:26Z","published":"2024-03-25T21:08:26Z","title":"Strategies to Improve Real-World Applicability of Laparoscopic Anatomy\n Segmentation Models","summary":" Accurate identification and localization of anatomical structures of varying\nsize and appearance in laparoscopic imaging are necessary to leverage the\npotential of computer vision techniques for surgical decision support.\nSegmentation performance of such models is traditionally reported using metrics\nof overlap such as IoU. However, imbalanced and unrealistic representation of\nclasses in the training data and suboptimal selection of reported metrics have\nthe potential to skew nominal segmentation performance and thereby ultimately\nlimit clinical translation. In this work, we systematically analyze the impact\nof class characteristics (i.e., organ size differences), training and test data\ncomposition (i.e., representation of positive and negative examples), and\nmodeling parameters (i.e., foreground-to-background class weight) on eight\nsegmentation metrics: accuracy, precision, recall, IoU, F1 score, specificity,\nHausdorff Distance, and Average Symmetric Surface Distance. Based on our\nfindings, we propose two simple yet effective strategies to improve real-world\napplicability of image segmentation models in laparoscopic surgical data: (1)\ninclusion of negative examples in the training process and (2) adaptation of\nforeground-background weights in segmentation models to maximize model\nperformance with respect to specific metrics of interest, depending on the\nclinical use case.\n","authors":["Fiona R. Kolbinger","Jiangpeng He","Jinge Ma","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.17192v1.pdf","comment":"13 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2306.06208v5","updated":"2024-03-25T21:08:25Z","published":"2023-06-05T23:07:01Z","title":"DeltaNN: Assessing the Impact of Computational Environment Parameters on\n the Performance of Image Recognition Models","summary":" Image recognition tasks typically use deep learning and require enormous\nprocessing power, thus relying on hardware accelerators like GPUs and TPUs for\nfast, timely processing. Failure in real-time image recognition tasks can occur\ndue to sub-optimal mapping on hardware accelerators during model deployment,\nwhich may lead to timing uncertainty and erroneous behavior. Mapping on\nhardware accelerators is done using multiple software components like deep\nlearning frameworks, compilers, and device libraries, that we refer to as the\ncomputational environment. 
Owing to the increased use of image recognition\ntasks in safety-critical applications like autonomous driving and medical\nimaging, it is imperative to assess their robustness to changes in the\ncomputational environment, as the impact of parameters like deep learning\nframeworks, compiler optimizations, and hardware devices on model performance\nand correctness is not yet well understood.\n In this paper we present a differential testing framework, DeltaNN, that\nallows us to assess the impact of different computational environment\nparameters on the performance of image recognition models during deployment,\npost training. DeltaNN generates different implementations of a given image\nrecognition model for variations in environment parameters, namely, deep\nlearning frameworks, compiler optimizations and hardware devices and analyzes\ndifferences in model performance as a result. Using DeltaNN, we conduct an\nempirical study of robustness analysis of three popular image recognition\nmodels using the ImageNet dataset. We report the impact in terms of\nmisclassifications and inference time differences across different settings. In\ntotal, we observed up to 100% output label differences across deep learning\nframeworks, and up to 81% unexpected performance degradation in terms of\ninference time, when applying compiler optimizations.\n","authors":["Nikolaos Louloudakis","Perry Gibson","José Cano","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2306.06208v5.pdf","comment":"11 pages, 10 figures, 2 tables"},{"id":"http://arxiv.org/abs/2403.17188v1","updated":"2024-03-25T21:01:29Z","published":"2024-03-25T21:01:29Z","title":"LOTUS: Evasive and Resilient Backdoor Attacks through Sub-Partitioning","summary":" Backdoor attack poses a significant security threat to Deep Learning\napplications. Existing attacks are often not evasive to established backdoor\ndetection techniques. This susceptibility primarily stems from the fact that\nthese attacks typically leverage a universal trigger pattern or transformation\nfunction, such that the trigger can cause misclassification for any input. In\nresponse to this, recent papers have introduced attacks using sample-specific\ninvisible triggers crafted through special transformation functions. While\nthese approaches manage to evade detection to some extent, they reveal\nvulnerability to existing backdoor mitigation techniques. To address and\nenhance both evasiveness and resilience, we introduce a novel backdoor attack\nLOTUS. Specifically, it leverages a secret function to separate samples in the\nvictim class into a set of partitions and applies unique triggers to different\npartitions. Furthermore, LOTUS incorporates an effective trigger focusing\nmechanism, ensuring only the trigger corresponding to the partition can induce\nthe backdoor behavior. Extensive experimental results show that LOTUS can\nachieve high attack success rate across 4 datasets and 7 model structures, and\neffectively evading 13 backdoor detection and mitigation techniques. 
The code\nis available at https://github.com/Megum1/LOTUS.\n","authors":["Siyuan Cheng","Guanhong Tao","Yingqi Liu","Guangyu Shen","Shengwei An","Shiwei Feng","Xiangzhe Xu","Kaiyuan Zhang","Shiqing Ma","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.17188v1.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR\n 2024)"},{"id":"http://arxiv.org/abs/2403.17177v1","updated":"2024-03-25T20:44:01Z","published":"2024-03-25T20:44:01Z","title":"Brain Stroke Segmentation Using Deep Learning Models: A Comparative\n Study","summary":" Stroke segmentation plays a crucial role in the diagnosis and treatment of\nstroke patients by providing spatial information about affected brain regions\nand the extent of damage. Segmenting stroke lesions accurately is a challenging\ntask, given that conventional manual techniques are time consuming and prone to\nerrors. Recently, advanced deep models have been introduced for general medical\nimage segmentation, demonstrating promising results that surpass many state of\nthe art networks when evaluated on specific datasets. With the advent of the\nvision Transformers, several models have been introduced based on them, while\nothers have aimed to design better modules based on traditional convolutional\nlayers to extract long-range dependencies like Transformers. The question of\nwhether such high-level designs are necessary for all segmentation cases to\nachieve the best results remains unanswered. In this study, we selected four\ntypes of deep models that were recently proposed and evaluated their\nperformance for stroke segmentation: a pure Transformer-based architecture\n(DAE-Former), two advanced CNN-based models (LKA and DLKA) with attention\nmechanisms in their design, an advanced hybrid model that incorporates CNNs\nwith Transformers (FCT), and the well-known self-adaptive nnUNet framework\nwith its configuration based on given data. We examined their performance on\ntwo publicly available datasets, and found that the nnUNet achieved the best\nresults with the simplest design among all. Revealing the robustness issue of\nTransformers to such variabilities serves as a potential reason for their\nweaker performance. Furthermore, nnUNet's success underscores the significant\nimpact of preprocessing and postprocessing techniques in enhancing segmentation\nresults, surpassing the focus solely on architectural designs.\n","authors":["Ahmed Soliman","Yousif Yousif","Ahmed Ibrahim","Yalda Zafari-Ghadim","Essam A. Rashed","Mohamed Mabrok"],"pdf_url":"https://arxiv.org/pdf/2403.17177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17176v1","updated":"2024-03-25T20:43:48Z","published":"2024-03-25T20:43:48Z","title":"Histogram Layers for Neural Engineered Features","summary":" In the computer vision literature, many effective histogram-based features\nhave been developed. These engineered features include local binary patterns\nand edge histogram descriptors among others and they have been shown to be\ninformative features for a variety of computer vision tasks. In this paper, we\nexplore whether these features can be learned through histogram layers embedded\nin a neural network and, therefore, be leveraged within deep learning\nframeworks. 
By using histogram features, local statistics of the feature maps\nfrom the convolution neural networks can be used to better represent the data.\nWe present neural versions of local binary pattern and edge histogram\ndescriptors that jointly improve the feature representation and perform image\nclassification. Experiments are presented on benchmark and real-world datasets.\n","authors":["Joshua Peeples","Salim Al Kharsa","Luke Saleh","Alina Zare"],"pdf_url":"https://arxiv.org/pdf/2403.17176v1.pdf","comment":"11 pages, 7 figures, submitted for review"},{"id":"http://arxiv.org/abs/2403.17175v1","updated":"2024-03-25T20:43:23Z","published":"2024-03-25T20:43:23Z","title":"Engagement Measurement Based on Facial Landmarks and Spatial-Temporal\n Graph Convolutional Networks","summary":" Engagement in virtual learning is crucial for a variety of factors including\nlearner satisfaction, performance, and compliance with learning programs, but\nmeasuring it is a challenging task. There is therefore considerable interest in\nutilizing artificial intelligence and affective computing to measure engagement\nin natural settings as well as on a large scale. This paper introduces a novel,\nprivacy-preserving method for engagement measurement from videos. It uses\nfacial landmarks, which carry no personally identifiable information, extracted\nfrom videos via the MediaPipe deep learning solution. The extracted facial\nlandmarks are fed to a Spatial-Temporal Graph Convolutional Network (ST-GCN) to\noutput the engagement level of the learner in the video. To integrate the\nordinal nature of the engagement variable into the training process, ST-GCNs\nundergo training in a novel ordinal learning framework based on transfer\nlearning. Experimental results on two video student engagement measurement\ndatasets show the superiority of the proposed method compared to previous\nmethods with improved state-of-the-art on the EngageNet dataset with a 3.1%\nimprovement in four-class engagement level classification accuracy and on the\nOnline Student Engagement dataset with a 1.5% improvement in binary engagement\nclassification accuracy. The relatively lightweight ST-GCN and its integration\nwith the real-time MediaPipe deep learning solution make the proposed approach\ncapable of being deployed on virtual learning platforms and measuring\nengagement in real time.\n","authors":["Ali Abedi","Shehroz S. Khan"],"pdf_url":"https://arxiv.org/pdf/2403.17175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17173v1","updated":"2024-03-25T20:39:58Z","published":"2024-03-25T20:39:58Z","title":"Task2Box: Box Embeddings for Modeling Asymmetric Task Relationships","summary":" Modeling and visualizing relationships between tasks or datasets is an\nimportant step towards solving various meta-tasks such as dataset discovery,\nmulti-tasking, and transfer learning. However, many relationships, such as\ncontainment and transferability, are naturally asymmetric and current\napproaches for representation and visualization (e.g., t-SNE) do not readily\nsupport this. We propose Task2Box, an approach to represent tasks using box\nembeddings -- axis-aligned hyperrectangles in low dimensional spaces -- that\ncan capture asymmetric relationships between them through volumetric overlaps.\nWe show that Task2Box accurately predicts unseen hierarchical relationships\nbetween nodes in ImageNet and iNaturalist datasets, as well as transferability\nbetween tasks in the Taskonomy benchmark. 
We also show that box embeddings\nestimated from task representations (e.g., CLIP, Task2Vec, or attribute based)\ncan be used to predict relationships between unseen tasks more accurately than\nclassifiers trained on the same representations, as well as handcrafted\nasymmetric distances (e.g., KL divergence). This suggests that low-dimensional\nbox embeddings can effectively capture these task relationships and have the\nadded advantage of being interpretable. We use the approach to visualize\nrelationships among publicly available image classification datasets on popular\ndataset hosting platform called Hugging Face.\n","authors":["Rangel Daroya","Aaron Sun","Subhransu Maji"],"pdf_url":"https://arxiv.org/pdf/2403.17173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09857v2","updated":"2024-03-25T20:08:07Z","published":"2024-03-14T20:34:53Z","title":"Few-Shot Class Incremental Learning with Attention-Aware Self-Adaptive\n Prompt","summary":" Few-Shot Class-Incremental Learning (FSCIL) models aim to incrementally learn\nnew classes with scarce samples while preserving knowledge of old ones.\nExisting FSCIL methods usually fine-tune the entire backbone, leading to\noverfitting and hindering the potential to learn new classes. On the other\nhand, recent prompt-based CIL approaches alleviate forgetting by training\nprompts with sufficient data in each task. In this work, we propose a novel\nframework named Attention-aware Self-adaptive Prompt (ASP). ASP encourages\ntask-invariant prompts to capture shared knowledge by reducing specific\ninformation from the attention aspect. Additionally, self-adaptive\ntask-specific prompts in ASP provide specific information and transfer\nknowledge from old classes to new classes with an Information Bottleneck\nlearning objective. In summary, ASP prevents overfitting on base task and does\nnot require enormous data in few-shot incremental tasks. Extensive experiments\non three benchmark datasets validate that ASP consistently outperforms\nstate-of-the-art FSCIL and prompt-based CIL methods in terms of both learning\nnew classes and mitigating forgetting.\n","authors":["Chenxi Liu","Zhenyi Wang","Tianyi Xiong","Ruibo Chen","Yihan Wu","Junfeng Guo","Heng Huang"],"pdf_url":"https://arxiv.org/pdf/2403.09857v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07021v2","updated":"2024-03-25T19:46:25Z","published":"2023-10-10T21:16:29Z","title":"Pre-Trained Masked Image Model for Mobile Robot Navigation","summary":" 2D top-down maps are commonly used for the navigation and exploration of\nmobile robots through unknown areas. Typically, the robot builds the navigation\nmaps incrementally from local observations using onboard sensors. Recent works\nhave shown that predicting the structural patterns in the environment through\nlearning-based approaches can greatly enhance task efficiency. While many such\nworks build task-specific networks using limited datasets, we show that the\nexisting foundational vision networks can accomplish the same without any\nfine-tuning. Specifically, we use Masked Autoencoders, pre-trained on street\nimages, to present novel applications for field-of-view expansion, single-agent\ntopological exploration, and multi-agent exploration for indoor mapping, across\ndifferent input modalities. Our work motivates the use of foundational vision\nmodels for generalized structure prediction-driven applications, especially in\nthe dearth of training data. 
For more qualitative results see\nhttps://raaslab.org/projects/MIM4Robots.\n","authors":["Vishnu Dutt Sharma","Anukriti Singh","Pratap Tokekar"],"pdf_url":"https://arxiv.org/pdf/2310.07021v2.pdf","comment":"Accepted at ICRA 2024"},{"id":"http://arxiv.org/abs/2403.17128v1","updated":"2024-03-25T19:13:12Z","published":"2024-03-25T19:13:12Z","title":"Benchmarking Video Frame Interpolation","summary":" Video frame interpolation, the task of synthesizing new frames in between two\nor more given ones, is becoming an increasingly popular research target.\nHowever, the current evaluation of frame interpolation techniques is not ideal.\nDue to the plethora of test datasets available and inconsistent computation of\nerror metrics, a coherent and fair comparison across papers is very\nchallenging. Furthermore, new test sets have been proposed as part of method\npapers so they are unable to provide the in-depth evaluation of a dedicated\nbenchmarking paper. Another severe downside is that these test sets violate the\nassumption of linearity when given two input frames, making it impossible to\nsolve without an oracle. We hence strongly believe that the community would\ngreatly benefit from a benchmarking paper, which is what we propose.\nSpecifically, we present a benchmark which establishes consistent error metrics\nby utilizing a submission website that computes them, provides insights by\nanalyzing the interpolation quality with respect to various per-pixel\nattributes such as the motion magnitude, contains a carefully designed test set\nadhering to the assumption of linearity by utilizing synthetic data, and\nevaluates the computational efficiency in a coherent manner.\n","authors":["Simon Kiefhaber","Simon Niklaus","Feng Liu","Simone Schaub-Meyer"],"pdf_url":"https://arxiv.org/pdf/2403.17128v1.pdf","comment":"http://sniklaus.com/vfibench"},{"id":"http://arxiv.org/abs/2311.17286v2","updated":"2024-03-25T19:05:04Z","published":"2023-11-29T00:09:45Z","title":"LEOD: Label-Efficient Object Detection for Event Cameras","summary":" Object detection with event cameras benefits from the sensor's low latency\nand high dynamic range. However, it is costly to fully label event streams for\nsupervised training due to their high temporal resolution. To reduce this cost,\nwe present LEOD, the first method for label-efficient event-based detection.\nOur approach unifies weakly- and semi-supervised object detection with a\nself-training mechanism. We first utilize a detector pre-trained on limited\nlabels to produce pseudo ground truth on unlabeled events. Then, the detector\nis re-trained with both real and generated labels. Leveraging the temporal\nconsistency of events, we run bi-directional inference and apply tracking-based\npost-processing to enhance the quality of pseudo labels. To stabilize training\nagainst label noise, we further design a soft anchor assignment strategy. We\nintroduce new experimental protocols to evaluate the task of label-efficient\nevent-based detection on Gen1 and 1Mpx datasets. LEOD consistently outperforms\nsupervised baselines across various labeling ratios. For example, on Gen1, it\nimproves mAP by 8.6% and 7.8% for RVT-S trained with 1% and 2% labels. On 1Mpx,\nRVT-S with 10% labels even surpasses its fully-supervised counterpart using\n100% labels. LEOD maintains its effectiveness even when all labeled data are\navailable, reaching new state-of-the-art results. Finally, we show that our\nmethod readily scales to improve larger detectors as well. 
Code is released at\nhttps://github.com/Wuziyi616/LEOD\n","authors":["Ziyi Wu","Mathias Gehrig","Qing Lyu","Xudong Liu","Igor Gilitschenski"],"pdf_url":"https://arxiv.org/pdf/2311.17286v2.pdf","comment":"CVPR 2024. Code: https://github.com/Wuziyi616/LEOD"},{"id":"http://arxiv.org/abs/2311.16682v2","updated":"2024-03-25T18:54:18Z","published":"2023-11-28T10:53:55Z","title":"ContextSeg: Sketch Semantic Segmentation by Querying the Context with\n Attention","summary":" Sketch semantic segmentation is a well-explored and pivotal problem in\ncomputer vision involving the assignment of pre-defined part labels to\nindividual strokes. This paper presents ContextSeg - a simple yet highly\neffective approach to tackling this problem with two stages. In the first\nstage, to better encode the shape and positional information of strokes, we\npropose to predict an extra dense distance field in an autoencoder network to\nreinforce structural information learning. In the second stage, we treat an\nentire stroke as a single entity and label a group of strokes within the same\nsemantic part using an auto-regressive Transformer with the default attention\nmechanism. By group-based labeling, our method can fully leverage the context\ninformation when making decisions for the remaining groups of strokes. Our\nmethod achieves the best segmentation accuracy compared with state-of-the-art\napproaches on two representative datasets and has been extensively evaluated\ndemonstrating its superior performance. Additionally, we offer insights into\nsolving part imbalance in training data and the preliminary experiment on\ncross-category training, which can inspire future research in this field.\n","authors":["Jiawei Wang","Changjian Li"],"pdf_url":"https://arxiv.org/pdf/2311.16682v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12151v2","updated":"2024-03-25T18:50:06Z","published":"2024-03-18T18:08:44Z","title":"Fusing Domain-Specific Content from Large Language Models into Knowledge\n Graphs for Enhanced Zero Shot Object State Classification","summary":" Domain-specific knowledge can significantly contribute to addressing a wide\nvariety of vision tasks. However, the generation of such knowledge entails\nconsiderable human labor and time costs. This study investigates the potential\nof Large Language Models (LLMs) in generating and providing domain-specific\ninformation through semantic embeddings. To achieve this, an LLM is integrated\ninto a pipeline that utilizes Knowledge Graphs and pre-trained semantic vectors\nin the context of the Vision-based Zero-shot Object State Classification task.\nWe thoroughly examine the behavior of the LLM through an extensive ablation\nstudy. Our findings reveal that the integration of LLM-based embeddings, in\ncombination with general-purpose pre-trained embeddings, leads to substantial\nperformance improvements. 
Drawing insights from this ablation study, we conduct\na comparative analysis against competing models, thereby highlighting the\nstate-of-the-art performance achieved by the proposed approach.\n","authors":["Filippos Gouidis","Katerina Papantoniou","Konstantinos Papoutsakis","Theodore Patkos","Antonis Argyros","Dimitris Plexousakis"],"pdf_url":"https://arxiv.org/pdf/2403.12151v2.pdf","comment":"Accepted at the AAAI-MAKE 24"},{"id":"http://arxiv.org/abs/2312.12730v2","updated":"2024-03-25T18:49:52Z","published":"2023-12-20T02:58:25Z","title":"A Closer Look at the Few-Shot Adaptation of Large Vision-Language Models","summary":" Efficient transfer learning (ETL) is receiving increasing attention to adapt\nlarge pre-trained language-vision models on downstream tasks with a few labeled\nsamples. While significant progress has been made, we reveal that\nstate-of-the-art ETL approaches exhibit strong performance only in\nnarrowly-defined experimental setups, and with a careful adjustment of\nhyperparameters based on a large corpus of labeled samples. In particular, we\nmake two interesting, and surprising empirical observations. First, to\noutperform a simple Linear Probing baseline, these methods require to optimize\ntheir hyper-parameters on each target task. And second, they typically\nunderperform -- sometimes dramatically -- standard zero-shot predictions in the\npresence of distributional drifts. Motivated by the unrealistic assumptions\nmade in the existing literature, i.e., access to a large validation set and\ncase-specific grid-search for optimal hyperparameters, we propose a novel\napproach that meets the requirements of real-world scenarios. More concretely,\nwe introduce a CLass-Adaptive linear Probe (CLAP) objective, whose balancing\nterm is optimized via an adaptation of the general Augmented Lagrangian method\ntailored to this context. We comprehensively evaluate CLAP on a broad span of\ndatasets and scenarios, demonstrating that it consistently outperforms SoTA\napproaches, while yet being a much more efficient alternative.\n","authors":["Julio Silva-Rodríguez","Sina Hajimiri","Ismail Ben Ayed","Jose Dolz"],"pdf_url":"https://arxiv.org/pdf/2312.12730v2.pdf","comment":"CVPR 2024. Code: https://github.com/jusiro/CLAP"},{"id":"http://arxiv.org/abs/2403.17103v1","updated":"2024-03-25T18:41:43Z","published":"2024-03-25T18:41:43Z","title":"Animal Avatars: Reconstructing Animatable 3D Animals from Casual Videos","summary":" We present a method to build animatable dog avatars from monocular videos.\nThis is challenging as animals display a range of (unpredictable) non-rigid\nmovements and have a variety of appearance details (e.g., fur, spots, tails).\nWe develop an approach that links the video frames via a 4D solution that\njointly solves for animal's pose variation, and its appearance (in a canonical\npose). To this end, we significantly improve the quality of template-based\nshape fitting by endowing the SMAL parametric model with Continuous Surface\nEmbeddings, which brings image-to-mesh reprojection constraints that are denser,\nand thus stronger, than the previously used sparse semantic keypoint\ncorrespondences. To model appearance, we propose an implicit duplex-mesh\ntexture that is defined in the canonical pose, but can be deformed using SMAL\npose coefficients and later rendered to enforce a photometric compatibility\nwith the input video frames. 
On the challenging CoP3D and APTv2 datasets, we\ndemonstrate superior results (both in terms of pose estimates and predicted\nappearance) to existing template-free (RAC) and template-based approaches\n(BARC, BITE).\n","authors":["Remy Sabathier","Niloy J. Mitra","David Novotny"],"pdf_url":"https://arxiv.org/pdf/2403.17103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10367v2","updated":"2024-03-25T18:33:01Z","published":"2024-03-15T14:59:21Z","title":"Testing MediaPipe Holistic for Linguistic Analysis of Nonmanual Markers\n in Sign Languages","summary":" Advances in Deep Learning have made possible reliable landmark tracking of\nhuman bodies and faces that can be used for a variety of tasks. We test a\nrecent Computer Vision solution, MediaPipe Holistic (MPH), to find out if its\ntracking of the facial features is reliable enough for a linguistic analysis of\ndata from sign languages, and compare it to an older solution (OpenFace, OF).\nWe use an existing data set of sentences in Kazakh-Russian Sign Language and a\nnewly created small data set of videos with head tilts and eyebrow movements.\nWe find that MPH does not perform well enough for linguistic analysis of\neyebrow movement - but in a different way from OF, which is also performing\npoorly without correction. We reiterate a previous proposal to train additional\ncorrection models to overcome these limitations.\n","authors":["Anna Kuznetsova","Vadim Kimmelman"],"pdf_url":"https://arxiv.org/pdf/2403.10367v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17094v1","updated":"2024-03-25T18:32:41Z","published":"2024-03-25T18:32:41Z","title":"SynFog: A Photo-realistic Synthetic Fog Dataset based on End-to-end\n Imaging Simulation for Advancing Real-World Defogging in Autonomous Driving","summary":" To advance research in learning-based defogging algorithms, various synthetic\nfog datasets have been developed. However, existing datasets created using the\nAtmospheric Scattering Model (ASM) or real-time rendering engines often\nstruggle to produce photo-realistic foggy images that accurately mimic the\nactual imaging process. This limitation hinders the effective generalization of\nmodels from synthetic to real data. In this paper, we introduce an end-to-end\nsimulation pipeline designed to generate photo-realistic foggy images. This\npipeline comprehensively considers the entire physically-based foggy scene\nimaging process, closely aligning with real-world image capture methods. Based\non this pipeline, we present a new synthetic fog dataset named SynFog, which\nfeatures both sky light and active lighting conditions, as well as three levels\nof fog density. Experimental results demonstrate that models trained on SynFog\nexhibit superior performance in visual perception and detection accuracy\ncompared to others when applied to real-world foggy images.\n","authors":["Yiming Xie","Henglu Wei","Zhenyi Liu","Xiaoyu Wang","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2403.17094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17084v1","updated":"2024-03-25T18:18:12Z","published":"2024-03-25T18:18:12Z","title":"A Comparative Analysis of Visual Odometry in Virtual and Real-World\n Railways Environments","summary":" Perception tasks play a crucial role in the development of automated\noperations and systems across multiple application fields. 
In the railway\ntransportation domain, these tasks can improve the safety, reliability, and\nefficiency of various operations, including train localization, signal\nrecognition, and track discrimination. However, collecting considerable and\nprecisely labeled datasets for testing such novel algorithms poses extreme\nchallenges in the railway environment due to the severe restrictions in\naccessing the infrastructures and the practical difficulties associated with\nproperly equipping trains with the required sensors, such as cameras and\nLiDARs. The remarkable innovations of graphic engine tools offer new solutions\nto craft realistic synthetic datasets. To illustrate the advantages of\nemploying graphic simulation for early-stage testing of perception tasks in the\nrailway domain, this paper presents a comparative analysis of the performance\nof a SLAM algorithm applied both in a virtual synthetic environment and a\nreal-world scenario. The analysis leverages virtual railway environments\ncreated with the latest version of Unreal Engine, facilitating data collection\nand allowing the examination of challenging scenarios, including\nlow-visibility, dangerous operational modes, and complex environments. The\nresults highlight the feasibility and potentiality of graphic simulation to\nadvance perception tasks in the railway domain.\n","authors":["Gianluca D'Amico","Mauro Marinoni","Giorgio Buttazzo"],"pdf_url":"https://arxiv.org/pdf/2403.17084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17083v1","updated":"2024-03-25T18:16:34Z","published":"2024-03-25T18:16:34Z","title":"A Study in Dataset Pruning for Image Super-Resolution","summary":" In image Super-Resolution (SR), relying on large datasets for training is a\ndouble-edged sword. While offering rich training material, they also demand\nsubstantial computational and storage resources. In this work, we analyze\ndataset pruning as a solution to these challenges. We introduce a novel\napproach that reduces a dataset to a core-set of training samples, selected\nbased on their loss values as determined by a simple pre-trained SR model. By\nfocusing the training on just 50% of the original dataset, specifically on the\nsamples characterized by the highest loss values, we achieve results comparable\nto or even surpassing those obtained from training on the entire dataset.\nInterestingly, our analysis reveals that the top 5% of samples with the highest\nloss values negatively affect the training process. Excluding these samples and\nadjusting the selection to favor easier samples further enhances training\noutcomes. Our work opens new perspectives to the untapped potential of dataset\npruning in image SR. It suggests that careful selection of training data based\non loss-value metrics can lead to better SR models, challenging the\nconventional wisdom that more data inevitably leads to better performance.\n","authors":["Brian B. Moser","Federico Raue","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2403.17083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16703v3","updated":"2024-03-25T18:03:41Z","published":"2023-11-28T11:27:48Z","title":"CADTalk: An Algorithm and Benchmark for Semantic Commenting of CAD\n Programs","summary":" CAD programs are a popular way to compactly encode shapes as a sequence of\noperations that are easy to parametrically modify. However, without sufficient\nsemantic comments and structure, such programs can be challenging to\nunderstand, let alone modify. 
We introduce the problem of semantic commenting\nCAD programs, wherein the goal is to segment the input program into code blocks\ncorresponding to semantically meaningful shape parts and assign a semantic\nlabel to each block. We solve the problem by combining program parsing with\nvisual-semantic analysis afforded by recent advances in foundational language\nand vision models. Specifically, by executing the input programs, we create\nshapes, which we use to generate conditional photorealistic images to make use\nof semantic annotators for such images. We then distill the information across\nthe images and link back to the original programs to semantically comment on\nthem. Additionally, we collected and annotated a benchmark dataset, CADTalk,\nconsisting of 5,288 machine-made programs and 45 human-made programs with\nground truth semantic comments. We extensively evaluated our approach, compared\nit to a GPT-based baseline, and an open-set shape segmentation baseline, and\nreported an 83.24% accuracy on the new CADTalk dataset. Code and data:\nhttps://enigma-li.github.io/CADTalk/.\n","authors":["Haocheng Yuan","Jing Xu","Hao Pan","Adrien Bousseau","Niloy J. Mitra","Changjian Li"],"pdf_url":"https://arxiv.org/pdf/2311.16703v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17064v1","updated":"2024-03-25T18:00:42Z","published":"2024-03-25T18:00:42Z","title":"Continuous, Subject-Specific Attribute Control in T2I Models by\n Identifying Semantic Directions","summary":" In recent years, advances in text-to-image (T2I) diffusion models have\nsubstantially elevated the quality of their generated images. However,\nachieving fine-grained control over attributes remains a challenge due to the\nlimitations of natural language prompts (such as no continuous set of\nintermediate descriptions existing between ``person'' and ``old person''). Even\nthough many methods were introduced that augment the model or generation\nprocess to enable such control, methods that do not require a fixed reference\nimage are limited to either enabling global fine-grained attribute expression\ncontrol or coarse attribute expression control localized to specific subjects,\nnot both simultaneously. We show that there exist directions in the commonly\nused token-level CLIP text embeddings that enable fine-grained subject-specific\ncontrol of high-level attributes in text-to-image models. Based on this\nobservation, we introduce one efficient optimization-free and one robust\noptimization-based method to identify these directions for specific attributes\nfrom contrastive text prompts. We demonstrate that these directions can be used\nto augment the prompt text input with fine-grained control over attributes of\nspecific subjects in a compositional manner (control over multiple attributes\nof a single subject) without having to adapt the diffusion model. Project page:\nhttps://compvis.github.io/attribute-control. 
Code is available at\nhttps://github.com/CompVis/attribute-control.\n","authors":["Stefan Andreas Baumann","Felix Krause","Michael Neumayr","Nick Stracke","Vincent Tao Hu","Björn Ommer"],"pdf_url":"https://arxiv.org/pdf/2403.17064v1.pdf","comment":"Project page: https://compvis.github.io/attribute-control"},{"id":"http://arxiv.org/abs/2403.17010v1","updated":"2024-03-25T17:59:59Z","published":"2024-03-25T17:59:59Z","title":"Calib3D: Calibrating Model Preferences for Reliable 3D Scene\n Understanding","summary":" Safety-critical 3D scene understanding tasks necessitate not only accurate\nbut also confident predictions from 3D perception models. This study introduces\nCalib3D, a pioneering effort to benchmark and scrutinize the reliability of 3D\nscene understanding models from an uncertainty estimation viewpoint. We\ncomprehensively evaluate 28 state-of-the-art models across 10 diverse 3D\ndatasets, uncovering insightful phenomena that cope with both the aleatoric and\nepistemic uncertainties in 3D scene understanding. We discover that despite\nachieving impressive levels of accuracy, existing models frequently fail to\nprovide reliable uncertainty estimates -- a pitfall that critically undermines\ntheir applicability in safety-sensitive contexts. Through extensive analysis of\nkey factors such as network capacity, LiDAR representations, rasterization\nresolutions, and 3D data augmentation techniques, we correlate these aspects\ndirectly with the model calibration efficacy. Furthermore, we introduce DeptS,\na novel depth-aware scaling approach aimed at enhancing 3D model calibration.\nExtensive experiments across a wide range of configurations validate the\nsuperiority of our method. We hope this work could serve as a cornerstone for\nfostering reliable 3D scene understanding. Code and benchmark toolkits are\npublicly available.\n","authors":["Lingdong Kong","Xiang Xu","Jun Cen","Wenwei Zhang","Liang Pan","Kai Chen","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2403.17010v1.pdf","comment":"Preprint; 37 pages, 8 figures, 11 tables; Code at\n https://github.com/ldkong1205/Calib3D"},{"id":"http://arxiv.org/abs/2403.17009v1","updated":"2024-03-25T17:59:58Z","published":"2024-03-25T17:59:58Z","title":"Optimizing LiDAR Placements for Robust Driving Perception in Adverse\n Conditions","summary":" The robustness of driving perception systems under unprecedented conditions\nis crucial for safety-critical usages. Latest advancements have prompted\nincreasing interests towards multi-LiDAR perception. However, prevailing\ndriving datasets predominantly utilize single-LiDAR systems and collect data\ndevoid of adverse conditions, failing to capture the complexities of real-world\nenvironments accurately. Addressing these gaps, we proposed Place3D, a\nfull-cycle pipeline that encompasses LiDAR placement optimization, data\ngeneration, and downstream evaluations. Our framework makes three appealing\ncontributions. 1) To identify the most effective configurations for multi-LiDAR\nsystems, we introduce a Surrogate Metric of the Semantic Occupancy Grids\n(M-SOG) to evaluate LiDAR placement quality. 2) Leveraging the M-SOG metric, we\npropose a novel optimization strategy to refine multi-LiDAR placements. 3)\nCentered around the theme of multi-condition multi-LiDAR perception, we collect\na 364,000-frame dataset from both clean and adverse conditions. Extensive\nexperiments demonstrate that LiDAR placements optimized using our approach\noutperform various baselines. 
We showcase exceptional robustness in both 3D\nobject detection and LiDAR semantic segmentation tasks, under diverse adverse\nweather and sensor failure conditions. Code and benchmark toolkit are publicly\navailable.\n","authors":["Ye Li","Lingdong Kong","Hanjiang Hu","Xiaohao Xu","Xiaonan Huang"],"pdf_url":"https://arxiv.org/pdf/2403.17009v1.pdf","comment":"Preprint; 40 pages, 11 figures, 15 tables; Code at\n https://github.com/ywyeli/Place3D"},{"id":"http://arxiv.org/abs/2403.17008v1","updated":"2024-03-25T17:59:57Z","published":"2024-03-25T17:59:57Z","title":"FlashFace: Human Image Personalization with High-fidelity Identity\n Preservation","summary":" This work presents FlashFace, a practical tool with which users can easily\npersonalize their own photos on the fly by providing one or a few reference\nface images and a text prompt. Our approach is distinguishable from existing\nhuman photo customization methods by higher-fidelity identity preservation and\nbetter instruction following, benefiting from two subtle designs. First, we\nencode the face identity into a series of feature maps instead of one image\ntoken as in prior arts, allowing the model to retain more details of the\nreference faces (e.g., scars, tattoos, and face shape ). Second, we introduce a\ndisentangled integration strategy to balance the text and image guidance during\nthe text-to-image generation process, alleviating the conflict between the\nreference faces and the text prompts (e.g., personalizing an adult into a\n\"child\" or an \"elder\"). Extensive experimental results demonstrate the\neffectiveness of our method on various applications, including human image\npersonalization, face swapping under language prompts, making virtual\ncharacters into real people, etc. Project Page:\nhttps://jshilong.github.io/flashface-page.\n","authors":["Shilong Zhang","Lianghua Huang","Xi Chen","Yifei Zhang","Zhi-Fan Wu","Yutong Feng","Wei Wang","Yujun Shen","Yu Liu","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2403.17008v1.pdf","comment":"Project Page:https://jshilong.github.io/flashface-page"},{"id":"http://arxiv.org/abs/2403.17007v1","updated":"2024-03-25T17:59:42Z","published":"2024-03-25T17:59:42Z","title":"DreamLIP: Language-Image Pre-training with Long Captions","summary":" Language-image pre-training largely relies on how precisely and thoroughly a\ntext describes its paired image. In practice, however, the contents of an image\ncan be so rich that well describing them requires lengthy captions (e.g., with\n10 sentences), which are usually missing in existing datasets. Consequently,\nthere are currently no clear evidences on whether and how language-image\npre-training could benefit from long captions. To figure this out, we first\nre-caption 30M images with detailed descriptions using a pre-trained\nMulti-modality Large Language Model (MLLM), and then study the usage of the\nresulting captions under a contrastive learning framework. We observe that,\neach sentence within a long caption is very likely to describe the image\npartially (e.g., an object). Motivated by this, we propose to dynamically\nsample sub-captions from the text label to construct multiple positive pairs,\nand introduce a grouping loss to match the embeddings of each sub-caption with\nits corresponding local image patches in a self-supervised manner. 
Experimental\nresults on a wide range of downstream tasks demonstrate the consistent\nsuperiority of our method, termed DreamLIP, over previous alternatives,\nhighlighting its fine-grained representational capacity. It is noteworthy that,\non the tasks of image-text retrieval and semantic segmentation, our model\ntrained with 30M image-text pairs achieves on par or even better performance\nthan CLIP trained with 400M pairs. Project page is available at\nhttps://zyf0619sjtu.github.io/dream-lip.\n","authors":["Kecheng Zheng","Yifei Zhang","Wei Wu","Fan Lu","Shuailei Ma","Xin Jin","Wei Chen","Yujun Shen"],"pdf_url":"https://arxiv.org/pdf/2403.17007v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17006v1","updated":"2024-03-25T17:59:41Z","published":"2024-03-25T17:59:41Z","title":"Invertible Diffusion Models for Compressed Sensing","summary":" While deep neural networks (NN) significantly advance image compressed\nsensing (CS) by improving reconstruction quality, the necessity of training\ncurrent CS NNs from scratch constrains their effectiveness and hampers rapid\ndeployment. Although recent methods utilize pre-trained diffusion models for\nimage reconstruction, they struggle with slow inference and restricted\nadaptability to CS. To tackle these challenges, this paper proposes Invertible\nDiffusion Models (IDM), a novel efficient, end-to-end diffusion-based CS\nmethod. IDM repurposes a large-scale diffusion sampling process as a\nreconstruction model, and finetunes it end-to-end to recover original images\ndirectly from CS measurements, moving beyond the traditional paradigm of\none-step noise estimation learning. To enable such memory-intensive end-to-end\nfinetuning, we propose a novel two-level invertible design to transform both\n(1) the multi-step sampling process and (2) the noise estimation U-Net in each\nstep into invertible networks. As a result, most intermediate features are\ncleared during training to reduce up to 93.8% GPU memory. In addition, we\ndevelop a set of lightweight modules to inject measurements into noise\nestimator to further facilitate reconstruction. Experiments demonstrate that\nIDM outperforms existing state-of-the-art CS networks by up to 2.64dB in PSNR.\nCompared to the recent diffusion model-based approach DDNM, our IDM achieves up\nto 10.09dB PSNR gain and 14.54 times faster inference.\n","authors":["Bin Chen","Zhenyu Zhang","Weiqi Li","Chen Zhao","Jiwen Yu","Shijie Zhao","Jie Chen","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.17006v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17005v1","updated":"2024-03-25T17:59:40Z","published":"2024-03-25T17:59:40Z","title":"TRIP: Temporal Residual Learning with Image Noise Prior for\n Image-to-Video Diffusion Models","summary":" Recent advances in text-to-video generation have demonstrated the utility of\npowerful diffusion models. Nevertheless, the problem is not trivial when\nshaping diffusion models to animate static image (i.e., image-to-video\ngeneration). The difficulty originates from the aspect that the diffusion\nprocess of subsequent animated frames should not only preserve the faithful\nalignment with the given image but also pursue temporal coherence among\nadjacent frames. To alleviate this, we present TRIP, a new recipe of\nimage-to-video diffusion paradigm that pivots on image noise prior derived from\nstatic image to jointly trigger inter-frame relational reasoning and ease the\ncoherent temporal modeling via temporal residual learning. 
Technically, the\nimage noise prior is first attained through one-step backward diffusion process\nbased on both static image and noised video latent codes. Next, TRIP executes a\nresidual-like dual-path scheme for noise prediction: 1) a shortcut path that\ndirectly takes image noise prior as the reference noise of each frame to\namplify the alignment between the first frame and subsequent frames; 2) a\nresidual path that employs 3D-UNet over noised video and static image latent\ncodes to enable inter-frame relational reasoning, thereby easing the learning\nof the residual noise for each frame. Furthermore, both reference and residual\nnoise of each frame are dynamically merged via attention mechanism for final\nvideo generation. Extensive experiments on WebVid-10M, DTDB and MSR-VTT\ndatasets demonstrate the effectiveness of our TRIP for image-to-video\ngeneration. Please see our project page at https://trip-i2v.github.io/TRIP/.\n","authors":["Zhongwei Zhang","Fuchen Long","Yingwei Pan","Zhaofan Qiu","Ting Yao","Yang Cao","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2403.17005v1.pdf","comment":"CVPR 2024; Project page: https://trip-i2v.github.io/TRIP/"},{"id":"http://arxiv.org/abs/2403.17004v1","updated":"2024-03-25T17:59:35Z","published":"2024-03-25T17:59:35Z","title":"SD-DiT: Unleashing the Power of Self-supervised Discrimination in\n Diffusion Transformer","summary":" Diffusion Transformer (DiT) has emerged as the new trend of generative\ndiffusion models on image generation. In view of extremely slow convergence in\ntypical DiT, recent breakthroughs have been driven by mask strategy that\nsignificantly improves the training efficiency of DiT with additional\nintra-image contextual learning. Despite this progress, mask strategy still\nsuffers from two inherent limitations: (a) training-inference discrepancy and\n(b) fuzzy relations between mask reconstruction & generative diffusion process,\nresulting in sub-optimal training of DiT. In this work, we address these\nlimitations by novelly unleashing the self-supervised discrimination knowledge\nto boost DiT training. Technically, we frame our DiT in a teacher-student\nmanner. The teacher-student discriminative pairs are built on the diffusion\nnoises along the same Probability Flow Ordinary Differential Equation (PF-ODE).\nInstead of applying mask reconstruction loss over both DiT encoder and decoder,\nwe decouple DiT encoder and decoder to separately tackle discriminative and\ngenerative objectives. In particular, by encoding discriminative pairs with\nstudent and teacher DiT encoders, a new discriminative loss is designed to\nencourage the inter-image alignment in the self-supervised embedding space.\nAfter that, student samples are fed into student DiT decoder to perform the\ntypical generative diffusion task. 
Extensive experiments are conducted on\nImageNet dataset, and our method achieves a competitive balance between\ntraining cost and generative capacity.\n","authors":["Rui Zhu","Yingwei Pan","Yehao Li","Ting Yao","Zhenglong Sun","Tao Mei","Chang Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2403.17004v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17001v1","updated":"2024-03-25T17:59:31Z","published":"2024-03-25T17:59:31Z","title":"VP3D: Unleashing 2D Visual Prompt for Text-to-3D Generation","summary":" Recent innovations on text-to-3D generation have featured Score Distillation\nSampling (SDS), which enables the zero-shot learning of implicit 3D models\n(NeRF) by directly distilling prior knowledge from 2D diffusion models.\nHowever, current SDS-based models still struggle with intricate text prompts\nand commonly result in distorted 3D models with unrealistic textures or\ncross-view inconsistency issues. In this work, we introduce a novel Visual\nPrompt-guided text-to-3D diffusion model (VP3D) that explicitly unleashes the\nvisual appearance knowledge in 2D visual prompt to boost text-to-3D generation.\nInstead of solely supervising SDS with text prompt, VP3D first capitalizes on\n2D diffusion model to generate a high-quality image from input text, which\nsubsequently acts as visual prompt to strengthen SDS optimization with explicit\nvisual appearance. Meanwhile, we couple the SDS optimization with additional\ndifferentiable reward function that encourages rendering images of 3D models to\nbetter visually align with 2D visual prompt and semantically match with text\nprompt. Through extensive experiments, we show that the 2D Visual Prompt in our\nVP3D significantly eases the learning of visual appearance of 3D models and\nthus leads to higher visual fidelity with more detailed textures. It is also\nappealing in view that when replacing the self-generating visual prompt with a\ngiven reference image, VP3D is able to trigger a new task of stylized\ntext-to-3D generation. Our project page is available at\nhttps://vp3d-cvpr24.github.io.\n","authors":["Yang Chen","Yingwei Pan","Haibo Yang","Ting Yao","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2403.17001v1.pdf","comment":"CVPR 2024; Project page: https://vp3d-cvpr24.github.io"},{"id":"http://arxiv.org/abs/2403.17000v1","updated":"2024-03-25T17:59:26Z","published":"2024-03-25T17:59:26Z","title":"Learning Spatial Adaptation and Temporal Coherence in Diffusion Models\n for Video Super-Resolution","summary":" Diffusion models are just at a tipping point for image super-resolution task.\nNevertheless, it is not trivial to capitalize on diffusion models for video\nsuper-resolution which necessitates not only the preservation of visual\nappearance from low-resolution to high-resolution videos, but also the temporal\nconsistency across video frames. In this paper, we propose a novel approach,\npursuing Spatial Adaptation and Temporal Coherence (SATeCo), for video\nsuper-resolution. SATeCo pivots on learning spatial-temporal guidance from\nlow-resolution videos to calibrate both latent-space high-resolution video\ndenoising and pixel-space video reconstruction. Technically, SATeCo freezes all\nthe parameters of the pre-trained UNet and VAE, and only optimizes two\ndeliberately-designed spatial feature adaptation (SFA) and temporal feature\nalignment (TFA) modules, in the decoder of UNet and VAE. 
SFA modulates frame\nfeatures via adaptively estimating affine parameters for each pixel,\nguaranteeing pixel-wise guidance for high-resolution frame synthesis. TFA\ndelves into feature interaction within a 3D local window (tubelet) through\nself-attention, and executes cross-attention between tubelet and its\nlow-resolution counterpart to guide temporal feature alignment. Extensive\nexperiments conducted on the REDS4 and Vid4 datasets demonstrate the\neffectiveness of our approach.\n","authors":["Zhikai Chen","Fuchen Long","Zhaofan Qiu","Ting Yao","Wengang Zhou","Jiebo Luo","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2403.17000v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16999v1","updated":"2024-03-25T17:59:23Z","published":"2024-03-25T17:59:23Z","title":"Visual CoT: Unleashing Chain-of-Thought Reasoning in Multi-Modal\n Language Models","summary":" This paper presents Visual CoT, a novel pipeline that leverages the reasoning\ncapabilities of multi-modal large language models (MLLMs) by incorporating\nvisual Chain-of-Thought (CoT) reasoning. While MLLMs have shown promise in\nvarious visual tasks, they often lack interpretability and struggle with\ncomplex visual inputs. To address these challenges, we propose a multi-turn\nprocessing pipeline that dynamically focuses on visual inputs and provides\ninterpretable thoughts. We collect and introduce the Visual CoT dataset\ncomprising 373k question-answer pairs, annotated with intermediate bounding\nboxes highlighting key regions essential for answering the questions.\nImportantly, the introduced benchmark is capable of evaluating MLLMs in\nscenarios requiring specific local region identification. Extensive experiments\ndemonstrate the effectiveness of our framework and shed light on better\ninference strategies. The Visual CoT dataset, benchmark, and pre-trained models\nare available to foster further research in this direction.\n","authors":["Hao Shao","Shengju Qian","Han Xiao","Guanglu Song","Zhuofan Zong","Letian Wang","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2403.16999v1.pdf","comment":"Code: https://github.com/deepcs233/Visual-CoT"},{"id":"http://arxiv.org/abs/2403.16998v1","updated":"2024-03-25T17:59:09Z","published":"2024-03-25T17:59:09Z","title":"Understanding Long Videos in One Multimodal Language Model Pass","summary":" Large Language Models (LLMs), known to contain a strong awareness of world\nknowledge, have allowed recent approaches to achieve excellent performance on\nLong-Video Understanding benchmarks, but at high inference costs. In this work,\nwe first propose Likelihood Selection, a simple technique that unlocks faster\ninference in autoregressive LLMs for multiple-choice tasks common in long-video\nbenchmarks. In addition to faster inference, we discover the resulting models\nto yield surprisingly good accuracy on long-video tasks, even with no video\nspecific information. Building on this, we inject video-specific object-centric\ninformation extracted from off-the-shelf pre-trained models and utilize natural\nlanguage as a medium for information fusion. Our resulting Multimodal Video\nUnderstanding (MVU) framework demonstrates state-of-the-art performance across\nlong-video and fine-grained action recognition benchmarks. Code available at:\nhttps://github.com/kahnchana/mvu\n","authors":["Kanchana Ranasinghe","Xiang Li","Kumara Kahatapitiya","Michael S. 
Ryoo"],"pdf_url":"https://arxiv.org/pdf/2403.16998v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2403.16997v1","updated":"2024-03-25T17:59:03Z","published":"2024-03-25T17:59:03Z","title":"Composed Video Retrieval via Enriched Context and Discriminative\n Embeddings","summary":" Composed video retrieval (CoVR) is a challenging problem in computer vision\nwhich has recently highlighted the integration of modification text with visual\nqueries for more sophisticated video search in large databases. Existing works\npredominantly rely on visual queries combined with modification text to\ndistinguish relevant videos. However, such a strategy struggles to fully\npreserve the rich query-specific context in retrieved target videos and only\nrepresents the target video using visual embedding. We introduce a novel CoVR\nframework that leverages detailed language descriptions to explicitly encode\nquery-specific contextual information and learns discriminative embeddings of\nvision only, text only and vision-text for better alignment to accurately\nretrieve matched target videos. Our proposed framework can be flexibly employed\nfor both composed video (CoVR) and image (CoIR) retrieval tasks. Experiments on\nthree datasets show that our approach obtains state-of-the-art performance for\nboth CovR and zero-shot CoIR tasks, achieving gains as high as around 7% in\nterms of recall@K=1 score. Our code, models, detailed language descriptions for\nWebViD-CoVR dataset are available at\n\\url{https://github.com/OmkarThawakar/composed-video-retrieval}\n","authors":["Omkar Thawakar","Muzammal Naseer","Rao Muhammad Anwer","Salman Khan","Michael Felsberg","Mubarak Shah","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.16997v1.pdf","comment":"CVPR-2024"},{"id":"http://arxiv.org/abs/2403.16996v1","updated":"2024-03-25T17:59:01Z","published":"2024-03-25T17:59:01Z","title":"DriveCoT: Integrating Chain-of-Thought Reasoning with End-to-End Driving","summary":" End-to-end driving has made significant progress in recent years,\ndemonstrating benefits such as system simplicity and competitive driving\nperformance under both open-loop and closed-loop settings. Nevertheless, the\nlack of interpretability and controllability in its driving decisions hinders\nreal-world deployment for end-to-end driving systems. In this paper, we collect\na comprehensive end-to-end driving dataset named DriveCoT, leveraging the CARLA\nsimulator. It contains sensor data, control decisions, and chain-of-thought\nlabels to indicate the reasoning process. We utilize the challenging driving\nscenarios from the CARLA leaderboard 2.0, which involve high-speed driving and\nlane-changing, and propose a rule-based expert policy to control the vehicle\nand generate ground truth labels for its reasoning process across different\ndriving aspects and the final decisions. This dataset can serve as an open-loop\nend-to-end driving benchmark, enabling the evaluation of accuracy in various\nchain-of-thought aspects and the final decision. In addition, we propose a\nbaseline model called DriveCoT-Agent, trained on our dataset, to generate\nchain-of-thought predictions and final decisions. 
The trained model exhibits\nstrong performance in both open-loop and closed-loop evaluations, demonstrating\nthe effectiveness of our proposed dataset.\n","authors":["Tianqi Wang","Enze Xie","Ruihang Chu","Zhenguo Li","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2403.16996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14456v6","updated":"2024-03-25T17:58:59Z","published":"2022-11-26T02:15:35Z","title":"TetraSphere: A Neural Descriptor for O(3)-Invariant Point Cloud Analysis","summary":" In many practical applications, 3D point cloud analysis requires rotation\ninvariance. In this paper, we present a learnable descriptor invariant under 3D\nrotations and reflections, i.e., the O(3) actions, utilizing the recently\nintroduced steerable 3D spherical neurons and vector neurons. Specifically, we\npropose an embedding of the 3D spherical neurons into 4D vector neurons, which\nleverages end-to-end training of the model. In our approach, we perform\nTetraTransform--an equivariant embedding of the 3D input into 4D, constructed\nfrom the steerable neurons--and extract deeper O(3)-equivariant features using\nvector neurons. This integration of the TetraTransform into the VN-DGCNN\nframework, termed TetraSphere, negligibly increases the number of parameters by\nless than 0.0002%. TetraSphere sets a new state-of-the-art performance\nclassifying randomly rotated real-world object scans of the challenging subsets\nof ScanObjectNN. Additionally, TetraSphere outperforms all equivariant methods\non randomly rotated synthetic data: classifying objects from ModelNet40 and\nsegmenting parts of the ShapeNet shapes. Thus, our results reveal the practical\nvalue of steerable 3D spherical neurons for learning in 3D Euclidean space. The\ncode is available at https://github.com/pavlo-melnyk/tetrasphere.\n","authors":["Pavlo Melnyk","Andreas Robinson","Michael Felsberg","Mårten Wadenbäck"],"pdf_url":"https://arxiv.org/pdf/2211.14456v6.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16994v1","updated":"2024-03-25T17:56:41Z","published":"2024-03-25T17:56:41Z","title":"Mapping Image Transformations Onto Pixel Processor Arrays","summary":" Pixel Processor Arrays (PPA) present a new vision sensor/processor\narchitecture consisting of a SIMD array of processor elements, each capable of\nlight capture, storage, processing and local communication. Such a device\nallows visual data to be efficiently stored and manipulated directly upon the\nfocal plane, but also demands the invention of new approaches and algorithms,\nsuitable for the massively-parallel fine-grain processor arrays. In this paper\nwe demonstrate how various image transformations, including shearing, rotation\nand scaling, can be performed directly upon a PPA. The implementation details\nare presented using the SCAMP-5 vision chip, that contains a 256x256\npixel-parallel array. Our approaches for performing the image transformations\nefficiently exploit the parallel computation in a cellular processor array,\nminimizing the number of SIMD instructions required. These fundamental image\ntransformations are vital building blocks for many visual tasks. 
This paper\naims to serve as a reference for future PPA research while demonstrating the\nflexibility of PPA architectures.\n","authors":["Laurie Bose","Piotr Dudek"],"pdf_url":"https://arxiv.org/pdf/2403.16994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16993v1","updated":"2024-03-25T17:55:52Z","published":"2024-03-25T17:55:52Z","title":"Comp4D: LLM-Guided Compositional 4D Scene Generation","summary":" Recent advancements in diffusion models for 2D and 3D content creation have\nsparked a surge of interest in generating 4D content. However, the scarcity of\n3D scene datasets constrains current methodologies to primarily object-centric\ngeneration. To overcome this limitation, we present Comp4D, a novel framework\nfor Compositional 4D Generation. Unlike conventional methods that generate a\nsingular 4D representation of the entire scene, Comp4D innovatively constructs\neach 4D object within the scene separately. Utilizing Large Language Models\n(LLMs), the framework begins by decomposing an input text prompt into distinct\nentities and maps out their trajectories. It then constructs the compositional\n4D scene by accurately positioning these objects along their designated paths.\nTo refine the scene, our method employs a compositional score distillation\ntechnique guided by the pre-defined trajectories, utilizing pre-trained\ndiffusion models across text-to-image, text-to-video, and text-to-3D domains.\nExtensive experiments demonstrate our outstanding 4D content creation\ncapability compared to prior arts, showcasing superior visual quality, motion\nfidelity, and enhanced object interactions.\n","authors":["Dejia Xu","Hanwen Liang","Neel P. Bhatt","Hezhen Hu","Hanxue Liang","Konstantinos N. Plataniotis","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16993v1.pdf","comment":"Project page: https://vita-group.github.io/Comp4D/"},{"id":"http://arxiv.org/abs/2403.16990v1","updated":"2024-03-25T17:52:07Z","published":"2024-03-25T17:52:07Z","title":"Be Yourself: Bounded Attention for Multi-Subject Text-to-Image\n Generation","summary":" Text-to-image diffusion models have an unprecedented ability to generate\ndiverse and high-quality images. However, they often struggle to faithfully\ncapture the intended semantics of complex input prompts that include multiple\nsubjects. Recently, numerous layout-to-image extensions have been introduced to\nimprove user control, aiming to localize subjects represented by specific\ntokens. Yet, these methods often produce semantically inaccurate images,\nespecially when dealing with multiple semantically or visually similar\nsubjects. In this work, we study and analyze the causes of these limitations.\nOur exploration reveals that the primary issue stems from inadvertent semantic\nleakage between subjects in the denoising process. This leakage is attributed\nto the diffusion model's attention layers, which tend to blend the visual\nfeatures of different subjects. To address these issues, we introduce Bounded\nAttention, a training-free method for bounding the information flow in the\nsampling process. Bounded Attention prevents detrimental leakage among subjects\nand enables guiding the generation to promote each subject's individuality,\neven with complex multi-subject conditioning. 
Through extensive\nexperimentation, we demonstrate that our method empowers the generation of\nmultiple subjects that better align with given prompts and layouts.\n","authors":["Omer Dahary","Or Patashnik","Kfir Aberman","Daniel Cohen-Or"],"pdf_url":"https://arxiv.org/pdf/2403.16990v1.pdf","comment":"Project page: https://omer11a.github.io/bounded-attention/"},{"id":"http://arxiv.org/abs/2311.15773v3","updated":"2024-03-25T17:41:23Z","published":"2023-11-27T12:48:33Z","title":"Check, Locate, Rectify: A Training-Free Layout Calibration System for\n Text-to-Image Generation","summary":" Diffusion models have recently achieved remarkable progress in generating\nrealistic images. However, challenges remain in accurately understanding and\nsynthesizing the layout requirements in the textual prompts. To align the\ngenerated image with layout instructions, we present a training-free layout\ncalibration system SimM that intervenes in the generative process on the fly\nduring inference time. Specifically, following a \"check-locate-rectify\"\npipeline, the system first analyses the prompt to generate the target layout\nand compares it with the intermediate outputs to automatically detect errors.\nThen, by moving the located activations and making intra- and inter-map\nadjustments, the rectification process can be performed with negligible\ncomputational overhead. To evaluate SimM over a range of layout requirements,\nwe present a benchmark SimMBench that compensates for the lack of superlative\nspatial relations in existing datasets. And both quantitative and qualitative\nresults demonstrate the effectiveness of the proposed SimM in calibrating the\nlayout inconsistencies. Our project page is at https://simm-t2i.github.io/SimM.\n","authors":["Biao Gong","Siteng Huang","Yutong Feng","Shiwei Zhang","Yuyuan Li","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15773v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16974v1","updated":"2024-03-25T17:40:32Z","published":"2024-03-25T17:40:32Z","title":"Self-STORM: Deep Unrolled Self-Supervised Learning for Super-Resolution\n Microscopy","summary":" The use of fluorescent molecules to create long sequences of low-density,\ndiffraction-limited images enables highly-precise molecule localization.\nHowever, this methodology requires lengthy imaging times, which limits the\nability to view dynamic interactions of live cells on short time scales. Many\ntechniques have been developed to reduce the number of frames needed for\nlocalization, from classic iterative optimization to deep neural networks.\nParticularly, deep algorithm unrolling utilizes both the structure of iterative\nsparse recovery algorithms and the performance gains of supervised deep\nlearning. However, the robustness of this approach is highly dependant on\nhaving sufficient training data. In this paper we introduce deep unrolled\nself-supervised learning, which alleviates the need for such data by training a\nsequence-specific, model-based autoencoder that learns only from given\nmeasurements. Our proposed method exceeds the performance of its supervised\ncounterparts, thus allowing for robust, dynamic imaging well below the\ndiffraction limit without any labeled training samples. Furthermore, the\nsuggested model-based autoencoder scheme can be utilized to enhance\ngeneralization in any sparse recovery framework, without the need for external\ntraining data.\n","authors":["Yair Ben Sahel","Yonina C. 
Eldar"],"pdf_url":"https://arxiv.org/pdf/2403.16974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12027v2","updated":"2024-03-25T17:39:10Z","published":"2024-03-18T17:57:09Z","title":"From Pixels to Insights: A Survey on Automatic Chart Understanding in\n the Era of Large Foundation Models","summary":" Data visualization in the form of charts plays a pivotal role in data\nanalysis, offering critical insights and aiding in informed decision-making.\nAutomatic chart understanding has witnessed significant advancements with the\nrise of large foundation models in recent years. Foundation models, such as\nlarge language models, have revolutionized various natural language processing\ntasks and are increasingly being applied to chart understanding tasks. This\nsurvey paper provides a comprehensive overview of the recent developments,\nchallenges, and future directions in chart understanding within the context of\nthese foundation models. We review fundamental building blocks crucial for\nstudying chart understanding tasks. Additionally, we explore various tasks and\ntheir evaluation metrics and sources of both charts and textual inputs. Various\nmodeling strategies are then examined, encompassing both classification-based\nand generation-based approaches, along with tool augmentation techniques that\nenhance chart understanding performance. Furthermore, we discuss the\nstate-of-the-art performance of each task and discuss how we can improve the\nperformance. Challenges and future directions are addressed, highlighting the\nimportance of several topics, such as domain-specific charts, lack of efforts\nin developing evaluation metrics, and agent-oriented settings. This survey\npaper serves as a comprehensive resource for researchers and practitioners in\nthe fields of natural language processing, computer vision, and data analysis,\nproviding valuable insights and directions for future research in chart\nunderstanding leveraging large foundation models. The studies mentioned in this\npaper, along with emerging new research, will be continually updated at:\nhttps://github.com/khuangaf/Awesome-Chart-Understanding.\n","authors":["Kung-Hsiang Huang","Hou Pong Chan","Yi R. Fung","Haoyi Qiu","Mingyang Zhou","Shafiq Joty","Shih-Fu Chang","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2403.12027v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16970v1","updated":"2024-03-25T17:31:12Z","published":"2024-03-25T17:31:12Z","title":"Joint chest X-ray diagnosis and clinical visual attention prediction\n with multi-stage cooperative learning: enhancing interpretability","summary":" As deep learning has become the state-of-the-art for computer-assisted\ndiagnosis, interpretability of the automatic decisions is crucial for clinical\ndeployment. While various methods were proposed in this domain, visual\nattention maps of clinicians during radiological screening offer a unique asset\nto provide important insights and can potentially enhance the quality of\ncomputer-assisted diagnosis. With this paper, we introduce a novel\ndeep-learning framework for joint disease diagnosis and prediction of\ncorresponding visual saliency maps for chest X-ray scans. Specifically, we\ndesigned a novel dual-encoder multi-task UNet, which leverages both a\nDenseNet201 backbone and a Residual and Squeeze-and-Excitation block-based\nencoder to extract diverse features for saliency map prediction, and a\nmulti-scale feature-fusion classifier to perform disease classification. 
To\ntackle the issue of asynchronous training schedules of individual tasks in\nmulti-task learning, we proposed a multi-stage cooperative learning strategy,\nwith contrastive learning for feature encoder pretraining to boost performance.\nExperiments show that our proposed method outperformed existing techniques for\nchest X-ray diagnosis and the quality of visual saliency map prediction.\n","authors":["Zirui Qiu","Hassan Rivaz","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.16970v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16967v1","updated":"2024-03-25T17:26:08Z","published":"2024-03-25T17:26:08Z","title":"Visual Whole-Body Control for Legged Loco-Manipulation","summary":" We study the problem of mobile manipulation using legged robots equipped with\nan arm, namely legged loco-manipulation. The robot legs, while usually utilized\nfor mobility, offer an opportunity to amplify the manipulation capabilities by\nconducting whole-body control. That is, the robot can control the legs and the\narm at the same time to extend its workspace. We propose a framework that can\nconduct the whole-body control autonomously with visual observations. Our\napproach, namely \\ourFull~(\\our), is composed of a low-level policy using all\ndegrees of freedom to track the end-effector manipulator position and a\nhigh-level policy proposing the end-effector position based on visual inputs.\nWe train both levels of policies in simulation and perform Sim2Real transfer\nfor real robot deployment. We perform extensive experiments and show\nsignificant improvements over baselines in picking up diverse objects in\ndifferent configurations (heights, locations, orientations) and environments.\nProject page: https://wholebody-b1.github.io\n","authors":["Minghuan Liu","Zixuan Chen","Xuxin Cheng","Yandong Ji","Ruihan Yang","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16967v1.pdf","comment":"The first two authors contribute equally. Project page:\n https://wholebody-b1.github.io"},{"id":"http://arxiv.org/abs/2403.16964v1","updated":"2024-03-25T17:22:11Z","published":"2024-03-25T17:22:11Z","title":"GSDF: 3DGS Meets SDF for Improved Rendering and Reconstruction","summary":" Presenting a 3D scene from multiview images remains a core and long-standing\nchallenge in computer vision and computer graphics. Two main requirements lie\nin rendering and reconstruction. Notably, SOTA rendering quality is usually\nachieved with neural volumetric rendering techniques, which rely on aggregated\npoint/primitive-wise color and neglect the underlying scene geometry. Learning\nof neural implicit surfaces is sparked from the success of neural rendering.\nCurrent works either constrain the distribution of density fields or the shape\nof primitives, resulting in degraded rendering quality and flaws on the learned\nscene surfaces. The efficacy of such methods is limited by the inherent\nconstraints of the chosen neural representation, which struggles to capture\nfine surface details, especially for larger, more intricate scenes. To address\nthese issues, we introduce GSDF, a novel dual-branch architecture that combines\nthe benefits of a flexible and efficient 3D Gaussian Splatting (3DGS)\nrepresentation with neural Signed Distance Fields (SDF). The core idea is to\nleverage and enhance the strengths of each branch while alleviating their\nlimitation through mutual guidance and joint supervision. 
We show on diverse\nscenes that our design unlocks the potential for more accurate and detailed\nsurface reconstructions, and at the meantime benefits 3DGS rendering with\nstructures that are more aligned with the underlying geometry.\n","authors":["Mulin Yu","Tao Lu","Linning Xu","Lihan Jiang","Yuanbo Xiangli","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2403.16964v1.pdf","comment":"Project page: https://city-super.github.io/GSDF"},{"id":"http://arxiv.org/abs/2403.16958v1","updated":"2024-03-25T17:17:45Z","published":"2024-03-25T17:17:45Z","title":"TwinLiteNetPlus: A Stronger Model for Real-time Drivable Area and Lane\n Segmentation","summary":" Semantic segmentation is crucial for autonomous driving, particularly for\nDrivable Area and Lane Segmentation, ensuring safety and navigation. To address\nthe high computational costs of current state-of-the-art (SOTA) models, this\npaper introduces TwinLiteNetPlus (TwinLiteNet$^+$), a model adept at balancing\nefficiency and accuracy. TwinLiteNet$^+$ incorporates standard and depth-wise\nseparable dilated convolutions, reducing complexity while maintaining high\naccuracy. It is available in four configurations, from the robust 1.94\nmillion-parameter TwinLiteNet$^+_{\\text{Large}}$ to the ultra-compact\n34K-parameter TwinLiteNet$^+_{\\text{Nano}}$. Notably,\nTwinLiteNet$^+_{\\text{Large}}$ attains a 92.9\\% mIoU for Drivable Area\nSegmentation and a 34.2\\% IoU for Lane Segmentation. These results notably\noutperform those of current SOTA models while requiring a computational cost\nthat is approximately 11 times lower in terms of Floating Point Operations\n(FLOPs) compared to the existing SOTA model. Extensively tested on various\nembedded devices, TwinLiteNet$^+$ demonstrates promising latency and power\nefficiency, underscoring its suitability for real-world autonomous vehicle\napplications.\n","authors":["Quang-Huy Che","Duc-Tri Le","Minh-Quan Pham","Vinh-Tiep Nguyen","Duc-Khai Lam"],"pdf_url":"https://arxiv.org/pdf/2403.16958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15841v3","updated":"2024-03-25T17:17:31Z","published":"2023-11-27T14:07:13Z","title":"Learning Disentangled Identifiers for Action-Customized Text-to-Image\n Generation","summary":" This study focuses on a novel task in text-to-image (T2I) generation, namely\naction customization. The objective of this task is to learn the co-existing\naction from limited data and generalize it to unseen humans or even animals.\nExperimental results show that existing subject-driven customization methods\nfail to learn the representative characteristics of actions and struggle in\ndecoupling actions from context features, including appearance. To overcome the\npreference for low-level features and the entanglement of high-level features,\nwe propose an inversion-based method Action-Disentangled Identifier (ADI) to\nlearn action-specific identifiers from the exemplar images. ADI first expands\nthe semantic conditioning space by introducing layer-wise identifier tokens,\nthereby increasing the representational richness while distributing the\ninversion across different features. Then, to block the inversion of\naction-agnostic features, ADI extracts the gradient invariance from the\nconstructed sample triples and masks the updates of irrelevant channels. To\ncomprehensively evaluate the task, we present an ActionBench that includes a\nvariety of actions, each accompanied by meticulously selected samples. 
Both\nquantitative and qualitative results show that our ADI outperforms existing\nbaselines in action-customized T2I generation. Our project page is at\nhttps://adi-t2i.github.io/ADI.\n","authors":["Siteng Huang","Biao Gong","Yutong Feng","Xi Chen","Yuqian Fu","Yu Liu","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15841v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16954v1","updated":"2024-03-25T17:16:27Z","published":"2024-03-25T17:16:27Z","title":"Isolated Diffusion: Optimizing Multi-Concept Text-to-Image Generation\n Training-Freely with Isolated Diffusion Guidance","summary":" Large-scale text-to-image diffusion models have achieved great success in\nsynthesizing high-quality and diverse images given target text prompts. Despite\nthe revolutionary image generation ability, current state-of-the-art models\nstill struggle to deal with multi-concept generation accurately in many cases.\nThis phenomenon is known as ``concept bleeding\" and displays as the unexpected\noverlapping or merging of various concepts. This paper presents a general\napproach for text-to-image diffusion models to address the mutual interference\nbetween different subjects and their attachments in complex scenes, pursuing\nbetter text-image consistency. The core idea is to isolate the synthesizing\nprocesses of different concepts. We propose to bind each attachment to\ncorresponding subjects separately with split text prompts. Besides, we\nintroduce a revision method to fix the concept bleeding problem in\nmulti-subject synthesis. We first depend on pre-trained object detection and\nsegmentation models to obtain the layouts of subjects. Then we isolate and\nresynthesize each subject individually with corresponding text prompts to avoid\nmutual interference. Overall, we achieve a training-free strategy, named\nIsolated Diffusion, to optimize multi-concept text-to-image synthesis. It is\ncompatible with the latest Stable Diffusion XL (SDXL) and prior Stable\nDiffusion (SD) models. We compare our approach with alternative methods using a\nvariety of multi-concept text prompts and demonstrate its effectiveness with\nclear advantages in text-image consistency and user study.\n","authors":["Jingyuan Zhu","Huimin Ma","Jiansheng Chen","Jian Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.16954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16937v1","updated":"2024-03-25T17:01:34Z","published":"2024-03-25T17:01:34Z","title":"Hyperspherical Classification with Dynamic Label-to-Prototype Assignment","summary":" Aiming to enhance the utilization of metric space by the parametric softmax\nclassifier, recent studies suggest replacing it with a non-parametric\nalternative. Although a non-parametric classifier may provide better metric\nspace utilization, it introduces the challenge of capturing inter-class\nrelationships. A shared characteristic among prior non-parametric classifiers\nis the static assignment of labels to prototypes during the training, ie, each\nprototype consistently represents a class throughout the training course.\nOrthogonal to previous works, we present a simple yet effective method to\noptimize the category assigned to each prototype (label-to-prototype\nassignment) during the training. To this aim, we formalize the problem as a\ntwo-step optimization objective over network parameters and label-to-prototype\nassignment mapping. We solve this optimization using a sequential combination\nof gradient descent and Bipartide matching. 
We demonstrate the benefits of the\nproposed approach by conducting experiments on balanced and long-tail\nclassification problems using different backbone network architectures. In\nparticular, our method outperforms its competitors by 1.22\\% accuracy on\nCIFAR-100, and 2.15\\% on ImageNet-200 using a metric space dimension half of\nthe size of its competitors. Code:\nhttps://github.com/msed-Ebrahimi/DL2PA_CVPR24\n","authors":["Mohammad Saeed Ebrahimi Saadabadi","Ali Dabouei","Sahar Rahimi Malakshan","Nasser M. Nasrabad"],"pdf_url":"https://arxiv.org/pdf/2403.16937v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2401.08399v2","updated":"2024-03-25T16:50:43Z","published":"2024-01-16T14:41:42Z","title":"TACO: Benchmarking Generalizable Bimanual Tool-ACtion-Object\n Understanding","summary":" Humans commonly work with multiple objects in daily life and can intuitively\ntransfer manipulation skills to novel objects by understanding object\nfunctional regularities. However, existing technical approaches for analyzing\nand synthesizing hand-object manipulation are mostly limited to handling a\nsingle hand and object due to the lack of data support. To address this, we\nconstruct TACO, an extensive bimanual hand-object-interaction dataset spanning\na large variety of tool-action-object compositions for daily human activities.\nTACO contains 2.5K motion sequences paired with third-person and egocentric\nviews, precise hand-object 3D meshes, and action labels. To rapidly expand the\ndata scale, we present a fully automatic data acquisition pipeline combining\nmulti-view sensing with an optical motion capture system. With the vast\nresearch fields provided by TACO, we benchmark three generalizable\nhand-object-interaction tasks: compositional action recognition, generalizable\nhand-object motion forecasting, and cooperative grasp synthesis. Extensive\nexperiments reveal new insights, challenges, and opportunities for advancing\nthe studies of generalizable hand-object motion analysis and synthesis. Our\ndata and code are available at https://taco2024.github.io.\n","authors":["Yun Liu","Haolin Yang","Xu Si","Ling Liu","Zipeng Li","Yuxiang Zhang","Yebin Liu","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2401.08399v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16921v1","updated":"2024-03-25T16:39:15Z","published":"2024-03-25T16:39:15Z","title":"PropTest: Automatic Property Testing for Improved Visual Programming","summary":" Visual Programming has emerged as an alternative to end-to-end black-box\nvisual reasoning models. This type of methods leverage Large Language Models\n(LLMs) to decompose a problem and generate the source code for an executable\ncomputer program. This strategy has the advantage of offering an interpretable\nreasoning path and does not require finetuning a model with task-specific data.\nWe propose PropTest, a general strategy that improves visual programming by\nfurther using an LLM to generate code that tests for visual properties in an\ninitial round of proposed solutions. Particularly, our method tests for\ndata-type consistency, as well as syntactic and semantic properties in the\ngenerated solutions. Our proposed solution outperforms baselines and achieves\ncomparable results to state-of-the-art methods while using smaller and publicly\navailable LLMs (CodeLlama-7B and WizardCoder-15B). 
This is demonstrated across\ndifferent benchmarks on visual question answering and referring expression\ncomprehension, showing the efficacy of our approach in enhancing the\nperformance and generalization of visual reasoning tasks. Specifically,\nPropTest improves ViperGPT by obtaining 48.66% accuracy (+8.3%) on the A-OKVQA\nbenchmark and 52.8% (+3.3%) on the RefCOCO+ benchmark using CodeLlama-7B.\n","authors":["Jaywon Koo","Ziyan Yang","Paola Cascante-Bonilla","Baishakhi Ray","Vicente Ordonez"],"pdf_url":"https://arxiv.org/pdf/2403.16921v1.pdf","comment":"Project Page: https://jaywonkoo17.github.io/PropTest/"},{"id":"http://arxiv.org/abs/2311.11138v2","updated":"2024-03-25T16:10:20Z","published":"2023-11-18T18:18:33Z","title":"Estimating Uncertainty in Landslide Segmentation Models","summary":" Landslides are a recurring, widespread hazard. Preparation and mitigation\nefforts can be aided by a high-quality, large-scale dataset that covers global\nat-risk areas. Such a dataset currently does not exist and is impossible to\nconstruct manually. Recent automated efforts focus on deep learning models for\nlandslide segmentation (pixel labeling) from satellite imagery. However, it is\nalso important to characterize the uncertainty or confidence levels of such\nsegmentations. Accurate and robust uncertainty estimates can enable low-cost\n(in terms of manual labor) oversight of auto-generated landslide databases to\nresolve errors, identify hard negative examples, and increase the size of\nlabeled training data. In this paper, we evaluate several methods for assessing\npixel-level uncertainty of the segmentation. Three methods that do not require\narchitectural changes were compared, including Pre-Threshold activations,\nMonte-Carlo Dropout and Test-Time Augmentation -- a method that measures the\nrobustness of predictions in the face of data augmentation. Experimentally, the\nquality of the latter method was consistently higher than the others across a\nvariety of models and metrics in our dataset.\n","authors":["Savinay Nagendra","Chaopeng Shen","Daniel Kifer"],"pdf_url":"https://arxiv.org/pdf/2311.11138v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16897v1","updated":"2024-03-25T16:08:04Z","published":"2024-03-25T16:08:04Z","title":"Make-It-Vivid: Dressing Your Animatable Biped Cartoon Characters from\n Text","summary":" Creating and animating 3D biped cartoon characters is crucial and valuable in\nvarious applications. Compared with geometry, the diverse texture design plays\nan important role in making 3D biped cartoon characters vivid and charming.\nTherefore, we focus on automatic texture design for cartoon characters based on\ninput instructions. This is challenging for domain-specific requirements and a\nlack of high-quality data. To address this challenge, we propose Make-It-Vivid,\nthe first attempt to enable high-quality texture generation from text in UV\nspace. We prepare a detailed text-texture paired data for 3D characters by\nusing vision-question-answering agents. Then we customize a pretrained\ntext-to-image model to generate texture map with template structure while\npreserving the natural 2D image knowledge. Furthermore, to enhance fine-grained\ndetails, we propose a novel adversarial learning scheme to shorten the domain\ngap between original dataset and realistic texture domain. Extensive\nexperiments show that our approach outperforms current texture generation\nmethods, resulting in efficient character texturing and faithful generation\nwith prompts. 
Besides, we showcase various applications such as out of domain\ngeneration and texture stylization. We also provide an efficient generation\nsystem for automatic text-guided textured character generation and animation.\n","authors":["Junshu Tang","Yanhong Zeng","Ke Fan","Xuheng Wang","Bo Dai","Kai Chen","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2403.16897v1.pdf","comment":"Project page: https://make-it-vivid.github.io/"},{"id":"http://arxiv.org/abs/2312.10035v2","updated":"2024-03-25T16:00:01Z","published":"2023-12-15T18:59:59Z","title":"Point Transformer V3: Simpler, Faster, Stronger","summary":" This paper is not motivated to seek innovation within the attention\nmechanism. Instead, it focuses on overcoming the existing trade-offs between\naccuracy and efficiency within the context of point cloud processing,\nleveraging the power of scale. Drawing inspiration from recent advances in 3D\nlarge-scale representation learning, we recognize that model performance is\nmore influenced by scale than by intricate design. Therefore, we present Point\nTransformer V3 (PTv3), which prioritizes simplicity and efficiency over the\naccuracy of certain mechanisms that are minor to the overall performance after\nscaling, such as replacing the precise neighbor search by KNN with an efficient\nserialized neighbor mapping of point clouds organized with specific patterns.\nThis principle enables significant scaling, expanding the receptive field from\n16 to 1024 points while remaining efficient (a 3x increase in processing speed\nand a 10x improvement in memory efficiency compared with its predecessor,\nPTv2). PTv3 attains state-of-the-art results on over 20 downstream tasks that\nspan both indoor and outdoor scenarios. Further enhanced with multi-dataset\njoint training, PTv3 pushes these results to a higher level.\n","authors":["Xiaoyang Wu","Li Jiang","Peng-Shuai Wang","Zhijian Liu","Xihui Liu","Yu Qiao","Wanli Ouyang","Tong He","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.10035v2.pdf","comment":"CVPR 2024, code available at Pointcept\n (https://github.com/Pointcept/PointTransformerV3)"},{"id":"http://arxiv.org/abs/2403.17042v1","updated":"2024-03-25T15:58:26Z","published":"2024-03-25T15:58:26Z","title":"Provably Robust Score-Based Diffusion Posterior Sampling for\n Plug-and-Play Image Reconstruction","summary":" In a great number of tasks in science and engineering, the goal is to infer\nan unknown image from a small number of measurements collected from a known\nforward model describing certain sensing or imaging modality. Due to resource\nconstraints, this task is often extremely ill-posed, which necessitates the\nadoption of expressive prior information to regularize the solution space.\nScore-based diffusion models, due to its impressive empirical success, have\nemerged as an appealing candidate of an expressive prior in image\nreconstruction. In order to accommodate diverse tasks at once, it is of great\ninterest to develop efficient, consistent and robust algorithms that\nincorporate {\\em unconditional} score functions of an image prior distribution\nin conjunction with flexible choices of forward models.\n This work develops an algorithmic framework for employing score-based\ndiffusion models as an expressive data prior in general nonlinear inverse\nproblems. 
Motivated by the plug-and-play framework in the imaging community, we\nintroduce a diffusion plug-and-play method (\\textsf{DPnP}) that alternatively\ncalls two samplers, a proximal consistency sampler based solely on the\nlikelihood function of the forward model, and a denoising diffusion sampler\nbased solely on the score functions of the image prior. The key insight is that\ndenoising under white Gaussian noise can be solved {\\em rigorously} via both\nstochastic (i.e., DDPM-type) and deterministic (i.e., DDIM-type) samplers using\nthe unconditional score functions. We establish both asymptotic and\nnon-asymptotic performance guarantees of \\textsf{DPnP}, and provide numerical\nexperiments to illustrate its promise in solving both linear and nonlinear\nimage reconstruction tasks. To the best of our knowledge, \\textsf{DPnP} is the\nfirst provably-robust posterior sampling method for nonlinear inverse problems\nusing unconditional diffusion priors.\n","authors":["Xingyu Xu","Yuejie Chi"],"pdf_url":"https://arxiv.org/pdf/2403.17042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16888v1","updated":"2024-03-25T15:56:51Z","published":"2024-03-25T15:56:51Z","title":"Towards Balanced RGB-TSDF Fusion for Consistent Semantic Scene\n Completion by 3D RGB Feature Completion and a Classwise Entropy Loss Function","summary":" Semantic Scene Completion (SSC) aims to jointly infer semantics and\noccupancies of 3D scenes. Truncated Signed Distance Function (TSDF), a 3D\nencoding of depth, has been a common input for SSC. Furthermore, RGB-TSDF\nfusion, seems promising since these two modalities provide color and geometry\ninformation, respectively. Nevertheless, RGB-TSDF fusion has been considered\nnontrivial and commonly-used naive addition will result in inconsistent\nresults. We argue that the inconsistency comes from the sparsity of RGB\nfeatures upon projecting into 3D space, while TSDF features are dense, leading\nto imbalanced feature maps when summed up. To address this RGB-TSDF\ndistribution difference, we propose a two-stage network with a 3D RGB feature\ncompletion module that completes RGB features with meaningful values for\noccluded areas. Moreover, we propose an effective classwise entropy loss\nfunction to punish inconsistency. Extensive experiments on public datasets\nverify that our method achieves state-of-the-art performance among methods that\ndo not adopt extra data.\n","authors":["Laiyan Ding","Panwen Hu","Jie Li","Rui Huang"],"pdf_url":"https://arxiv.org/pdf/2403.16888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16885v1","updated":"2024-03-25T15:56:17Z","published":"2024-03-25T15:56:17Z","title":"CVT-xRF: Contrastive In-Voxel Transformer for 3D Consistent Radiance\n Fields from Sparse Inputs","summary":" Neural Radiance Fields (NeRF) have shown impressive capabilities for\nphotorealistic novel view synthesis when trained on dense inputs. However, when\ntrained on sparse inputs, NeRF typically encounters issues of incorrect density\nor color predictions, mainly due to insufficient coverage of the scene causing\npartial and sparse supervision, thus leading to significant performance\ndegradation. While existing works mainly consider ray-level consistency to\nconstruct 2D learning regularization based on rendered color, depth, or\nsemantics on image planes, in this paper we propose a novel approach that\nmodels 3D spatial field consistency to improve NeRF's performance with sparse\ninputs. 
Specifically, we first adopt a voxel-based ray sampling strategy to\nensure that the sampled rays intersect with a certain voxel in 3D space. We\nthen randomly sample additional points within the voxel and apply a Transformer\nto infer the properties of other points on each ray, which are then\nincorporated into the volume rendering. By backpropagating through the\nrendering loss, we enhance the consistency among neighboring points.\nAdditionally, we propose to use a contrastive loss on the encoder output of the\nTransformer to further improve consistency within each voxel. Experiments\ndemonstrate that our method yields significant improvement over different\nradiance fields in the sparse inputs setting, and achieves comparable\nperformance with current works.\n","authors":["Yingji Zhong","Lanqing Hong","Zhenguo Li","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2403.16885v1.pdf","comment":"The paper is accepted by CVPR 2024. Project page is available at\n https://zhongyingji.github.io/CVT-xRF"},{"id":"http://arxiv.org/abs/2403.16862v1","updated":"2024-03-25T15:26:32Z","published":"2024-03-25T15:26:32Z","title":"INPC: Implicit Neural Point Clouds for Radiance Field Rendering","summary":" We introduce a new approach for reconstruction and novel-view synthesis of\nunbounded real-world scenes. In contrast to previous methods using either\nvolumetric fields, grid-based models, or discrete point cloud proxies, we\npropose a hybrid scene representation, which implicitly encodes a point cloud\nin a continuous octree-based probability field and a multi-resolution hash\ngrid. In doing so, we combine the benefits of both worlds by retaining\nfavorable behavior during optimization: Our novel implicit point cloud\nrepresentation and differentiable bilinear rasterizer enable fast rendering\nwhile preserving fine geometric detail without depending on initial priors like\nstructure-from-motion point clouds. Our method achieves state-of-the-art image\nquality on several common benchmark datasets. Furthermore, we achieve fast\ninference at interactive frame rates, and can extract explicit point clouds to\nfurther enhance performance.\n","authors":["Florian Hahlbohm","Linus Franke","Moritz Kappel","Susana Castillo","Marc Stamminger","Marcus Magnor"],"pdf_url":"https://arxiv.org/pdf/2403.16862v1.pdf","comment":"Project page: https://fhahlbohm.github.io/inpc/"},{"id":"http://arxiv.org/abs/2304.11959v2","updated":"2024-03-25T15:15:41Z","published":"2023-04-24T09:53:21Z","title":"A Forward and Backward Compatible Framework for Few-shot\n Class-incremental Pill Recognition","summary":" Automatic Pill Recognition (APR) systems are crucial for enhancing hospital\nefficiency, assisting visually impaired individuals, and preventing\ncross-infection. However, most existing deep learning-based pill recognition\nsystems can only perform classification on classes with sufficient training\ndata. In practice, the high cost of data annotation and the continuous increase\nin new pill classes necessitate the development of a few-shot class-incremental\npill recognition system. This paper introduces the first few-shot\nclass-incremental pill recognition framework, named Discriminative and\nBidirectional Compatible Few-Shot Class-Incremental Learning (DBC-FSCIL). It\nencompasses forward-compatible and backward-compatible learning components. In\nforward-compatible learning, we propose an innovative virtual class synthesis\nstrategy and a Center-Triplet (CT) loss to enhance discriminative feature\nlearning. 
These virtual classes serve as placeholders in the feature space for\nfuture class updates, providing diverse semantic knowledge for model training.\nFor backward-compatible learning, we develop a strategy to synthesize reliable\npseudo-features of old classes using uncertainty quantification, facilitating\nData Replay (DR) and Knowledge Distillation (KD). This approach allows for the\nflexible synthesis of features and effectively reduces additional storage\nrequirements for samples and models. Additionally, we construct a new pill\nimage dataset for FSCIL and assess various mainstream FSCIL methods,\nestablishing new benchmarks. Our experimental results demonstrate that our\nframework surpasses existing State-of-the-art (SOTA) methods. The code is\navailable at https://github.com/zhang-jinghua/DBC-FSCIL.\n","authors":["Jinghua Zhang","Li Liu","Kai Gao","Dewen Hu"],"pdf_url":"https://arxiv.org/pdf/2304.11959v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16848v1","updated":"2024-03-25T15:09:54Z","published":"2024-03-25T15:09:54Z","title":"Multiple Object Tracking as ID Prediction","summary":" In Multiple Object Tracking (MOT), tracking-by-detection methods have stood\nthe test for a long time, which split the process into two parts according to\nthe definition: object detection and association. They leverage robust\nsingle-frame detectors and treat object association as a post-processing step\nthrough hand-crafted heuristic algorithms and surrogate tasks. However, the\nnature of heuristic techniques prevents end-to-end exploitation of training\ndata, leading to increasingly cumbersome and challenging manual modification\nwhile facing complicated or novel scenarios. In this paper, we regard this\nobject association task as an End-to-End in-context ID prediction problem and\npropose a streamlined baseline called MOTIP. Specifically, we form the target\nembeddings into historical trajectory information while considering the\ncorresponding IDs as in-context prompts, then directly predict the ID labels\nfor the objects in the current frame. Thanks to this end-to-end process, MOTIP\ncan learn tracking capabilities straight from training data, freeing itself\nfrom burdensome hand-crafted algorithms. Without bells and whistles, our method\nachieves impressive state-of-the-art performance in complex scenarios like\nDanceTrack and SportsMOT, and it performs competitively with other\ntransformer-based methods on MOT17. We believe that MOTIP demonstrates\nremarkable potential and can serve as a starting point for future research. The\ncode is available at https://github.com/MCG-NJU/MOTIP.\n","authors":["Ruopeng Gao","Yijun Zhang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16848v1.pdf","comment":"71.4 HOTA on DanceTrack (with CrowdHuman), 67.5/70.0 HOTA on\n DanceTrack built upon Deformable DETR and DAB-Deformable DETR respectively\n (without additional data). The code repository will be created within several\n days"},{"id":"http://arxiv.org/abs/2403.16834v1","updated":"2024-03-25T14:57:29Z","published":"2024-03-25T14:57:29Z","title":"From Two Stream to One Stream: Efficient RGB-T Tracking via Mutual\n Prompt Learning and Knowledge Distillation","summary":" Due to the complementary nature of visible light and thermal in-frared\nmodalities, object tracking based on the fusion of visible light images and\nthermal images (referred to as RGB-T tracking) has received increasing\nattention from researchers in recent years. 
How to achieve more comprehensive\nfusion of information from the two modalities at a lower cost has been an issue\nthat re-searchers have been exploring. Inspired by visual prompt learn-ing, we\ndesigned a novel two-stream RGB-T tracking architecture based on cross-modal\nmutual prompt learning, and used this model as a teacher to guide a one-stream\nstudent model for rapid learning through knowledge distillation techniques.\nExtensive experiments have shown that, compared to similar RGB-T track-ers, our\ndesigned teacher model achieved the highest precision rate, while the student\nmodel, with comparable precision rate to the teacher model, realized an\ninference speed more than three times faster than the teacher model.(Codes will\nbe available if accepted.)\n","authors":["Yang Luo","Xiqing Guo","Hao Li"],"pdf_url":"https://arxiv.org/pdf/2403.16834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16831v1","updated":"2024-03-25T14:57:18Z","published":"2024-03-25T14:57:18Z","title":"UrbanVLP: A Multi-Granularity Vision-Language Pre-Trained Foundation\n Model for Urban Indicator Prediction","summary":" Urban indicator prediction aims to infer socio-economic metrics in diverse\nurban landscapes using data-driven methods. However, prevalent pre-trained\nmodels, particularly those reliant on satellite imagery, face dual challenges.\nFirstly, concentrating solely on macro-level patterns from satellite data may\nintroduce bias, lacking nuanced details at micro levels, such as architectural\ndetails at a place. Secondly, the lack of interpretability in pre-trained\nmodels limits their utility in providing transparent evidence for urban\nplanning. In response to these issues, we devise a novel Vision-Language\nPre-Trained Model (UrbanVLP) in this paper. Our UrbanVLP seamlessly integrates\nmulti-granularity information from both macro (satellite) and micro\n(street-view) levels, overcoming the limitations of prior pre-trained models.\nMoreover, it introduces automatic text generation and calibration, elevating\ninterpretability in downstream applications by producing high-quality text\ndescriptions of urban imagery. Rigorous experiments conducted across six\nsocio-economic tasks underscore UrbanVLP's superior performance. We also deploy\na web platform to verify its practicality.\n","authors":["Xixuan Hao","Wei Chen","Yibo Yan","Siru Zhong","Kun Wang","Qingsong Wen","Yuxuan Liang"],"pdf_url":"https://arxiv.org/pdf/2403.16831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15529v2","updated":"2024-03-25T14:52:44Z","published":"2023-11-27T04:22:48Z","title":"Efficient Dataset Distillation via Minimax Diffusion","summary":" Dataset distillation reduces the storage and computational consumption of\ntraining a network by generating a small surrogate dataset that encapsulates\nrich information of the original large-scale one. However, previous\ndistillation methods heavily rely on the sample-wise iterative optimization\nscheme. As the images-per-class (IPC) setting or image resolution grows larger,\nthe necessary computation will demand overwhelming time and resources. In this\nwork, we intend to incorporate generative diffusion techniques for computing\nthe surrogate dataset. Observing that key factors for constructing an effective\nsurrogate dataset are representativeness and diversity, we design additional\nminimax criteria in the generative training to enhance these facets for the\ngenerated images of diffusion models. 
We present a theoretical model of the\nprocess as hierarchical diffusion control demonstrating the flexibility of the\ndiffusion process to target these criteria without jeopardizing the\nfaithfulness of the sample to the desired distribution. The proposed method\nachieves state-of-the-art validation performance while demanding much less\ncomputational resources. Under the 100-IPC setting on ImageWoof, our method\nrequires less than one-twentieth the distillation time of previous methods, yet\nyields even better performance. Source code and generated data are available in\nhttps://github.com/vimar-gu/MinimaxDiffusion.\n","authors":["Jianyang Gu","Saeed Vahidian","Vyacheslav Kungurtsev","Haonan Wang","Wei Jiang","Yang You","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2311.15529v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2402.13848v2","updated":"2024-03-25T14:45:53Z","published":"2024-02-21T14:50:24Z","title":"Zero-BEV: Zero-shot Projection of Any First-Person Modality to BEV Maps","summary":" Bird's-eye view (BEV) maps are an important geometrically structured\nrepresentation widely used in robotics, in particular self-driving vehicles and\nterrestrial robots. Existing algorithms either require depth information for\nthe geometric projection, which is not always reliably available, or are\ntrained end-to-end in a fully supervised way to map visual first-person\nobservations to BEV representation, and are therefore restricted to the output\nmodality they have been trained for. In contrast, we propose a new model\ncapable of performing zero-shot projections of any modality available in a\nfirst person view to the corresponding BEV map. This is achieved by\ndisentangling the geometric inverse perspective projection from the modality\ntransformation, eg. RGB to occupancy. The method is general and we showcase\nexperiments projecting to BEV three different modalities: semantic\nsegmentation, motion vectors and object bounding boxes detected in first\nperson. We experimentally show that the model outperforms competing methods, in\nparticular the widely used baseline resorting to monocular depth estimation.\n","authors":["Gianluca Monaci","Leonid Antsfeld","Boris Chidlovskii","Christian Wolf"],"pdf_url":"https://arxiv.org/pdf/2402.13848v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09913v2","updated":"2024-03-25T14:09:09Z","published":"2023-12-15T16:23:42Z","title":"LAENeRF: Local Appearance Editing for Neural Radiance Fields","summary":" Due to the omnipresence of Neural Radiance Fields (NeRFs), the interest\ntowards editable implicit 3D representations has surged over the last years.\nHowever, editing implicit or hybrid representations as used for NeRFs is\ndifficult due to the entanglement of appearance and geometry encoded in the\nmodel parameters. Despite these challenges, recent research has shown first\npromising steps towards photorealistic and non-photorealistic appearance edits.\nThe main open issues of related work include limited interactivity, a lack of\nsupport for local edits and large memory requirements, rendering them less\nuseful in practice. We address these limitations with LAENeRF, a unified\nframework for photorealistic and non-photorealistic appearance editing of\nNeRFs. To tackle local editing, we leverage a voxel grid as starting point for\nregion selection. 
We learn a mapping from expected ray terminations to final\noutput color, which can optionally be supervised by a style loss, resulting in\na framework which can perform photorealistic and non-photorealistic appearance\nediting of selected regions. Relying on a single point per ray for our mapping,\nwe limit memory requirements and enable fast optimization. To guarantee\ninteractivity, we compose the output color using a set of learned, modifiable\nbase colors, composed with additive layer mixing. Compared to concurrent work,\nLAENeRF enables recoloring and stylization while keeping processing time low.\nFurthermore, we demonstrate that our approach surpasses baseline methods both\nquantitatively and qualitatively.\n","authors":["Lukas Radl","Michael Steiner","Andreas Kurz","Markus Steinberger"],"pdf_url":"https://arxiv.org/pdf/2312.09913v2.pdf","comment":"Accepted to CVPR 2024! Project website:\n https://r4dl.github.io/LAENeRF/"},{"id":"http://arxiv.org/abs/2307.04570v3","updated":"2024-03-25T13:31:33Z","published":"2023-07-10T14:02:31Z","title":"A Call to Reflect on Evaluation Practices for Age Estimation:\n Comparative Analysis of the State-of-the-Art and a Unified Benchmark","summary":" Comparing different age estimation methods poses a challenge due to the\nunreliability of published results stemming from inconsistencies in the\nbenchmarking process. Previous studies have reported continuous performance\nimprovements over the past decade using specialized methods; however, our\nfindings challenge these claims. This paper identifies two trivial, yet\npersistent issues with the currently used evaluation protocol and describes how\nto resolve them. We offer an extensive comparative analysis for\nstate-of-the-art facial age estimation methods. Surprisingly, we find that the\nperformance differences between the methods are negligible compared to the\neffect of other factors, such as facial alignment, facial coverage, image\nresolution, model architecture, or the amount of data used for pretraining. We\nuse the gained insights to propose using FaRL as the backbone model and\ndemonstrate its effectiveness on all public datasets. We make the source code\nand exact data splits public on GitHub.\n","authors":["Jakub Paplham","Vojtech Franc"],"pdf_url":"https://arxiv.org/pdf/2307.04570v3.pdf","comment":"CVPR 2024 Camera-Ready"},{"id":"http://arxiv.org/abs/2403.16707v1","updated":"2024-03-25T12:44:52Z","published":"2024-03-25T12:44:52Z","title":"One-Shot Domain Incremental Learning","summary":" Domain incremental learning (DIL) has been discussed in previous studies on\ndeep neural network models for classification. In DIL, we assume that samples\non new domains are observed over time. The models must classify inputs on all\ndomains. In practice, however, we may encounter a situation where we need to\nperform DIL under the constraint that the samples on the new domain are\nobserved only infrequently. Therefore, in this study, we consider the extreme\ncase where we have only one sample from the new domain, which we call one-shot\nDIL. We first empirically show that existing DIL methods do not work well in\none-shot DIL. We have analyzed the reason for this failure through various\ninvestigations. 
According to our analysis, we clarify that the difficulty of\none-shot DIL is caused by the statistics in the batch normalization layers.\nTherefore, we propose a technique regarding these statistics and demonstrate\nthe effectiveness of our technique through experiments on open datasets.\n","authors":["Yasushi Esaki","Satoshi Koide","Takuro Kutsuna"],"pdf_url":"https://arxiv.org/pdf/2403.16707v1.pdf","comment":"accepted at IEEE International Joint Conference on Neural Networks\n (IJCNN) 2024"},{"id":"http://arxiv.org/abs/2303.17245v4","updated":"2024-03-25T12:20:02Z","published":"2023-03-30T09:22:17Z","title":"Investigating and Mitigating the Side Effects of Noisy Views for\n Self-Supervised Clustering Algorithms in Practical Multi-View Scenarios","summary":" Multi-view clustering (MVC) aims at exploring category structures among\nmulti-view data in self-supervised manners. Multiple views provide more\ninformation than single views and thus existing MVC methods can achieve\nsatisfactory performance. However, their performance might seriously degenerate\nwhen the views are noisy in practical multi-view scenarios. In this paper, we\nformally investigate the drawback of noisy views and then propose a\ntheoretically grounded deep MVC method (namely MVCAN) to address this issue.\nSpecifically, we propose a novel MVC objective that enables un-shared\nparameters and inconsistent clustering predictions across multiple views to\nreduce the side effects of noisy views. Furthermore, a two-level multi-view\niterative optimization is designed to generate robust learning targets for\nrefining individual views' representation learning. Theoretical analysis\nreveals that MVCAN works by achieving the multi-view consistency,\ncomplementarity, and noise robustness. Finally, experiments on extensive public\ndatasets demonstrate that MVCAN outperforms state-of-the-art methods and is\nrobust against the existence of noisy views.\n","authors":["Jie Xu","Yazhou Ren","Xiaolong Wang","Lei Feng","Zheng Zhang","Gang Niu","Xiaofeng Zhu"],"pdf_url":"https://arxiv.org/pdf/2303.17245v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.14332v3","updated":"2024-03-25T09:53:30Z","published":"2022-05-28T04:45:07Z","title":"V4D: Voxel for 4D Novel View Synthesis","summary":" Neural radiance fields have made a remarkable breakthrough in the novel view\nsynthesis task at the 3D static scene. However, for the 4D circumstance (e.g.,\ndynamic scene), the performance of the existing method is still limited by the\ncapacity of the neural network, typically in a multilayer perceptron network\n(MLP). In this paper, we utilize 3D Voxel to model the 4D neural radiance\nfield, short as V4D, where the 3D voxel has two formats. The first one is to\nregularly model the 3D space and then use the sampled local 3D feature with the\ntime index to model the density field and the texture field by a tiny MLP. The\nsecond one is in look-up tables (LUTs) format that is for the pixel-level\nrefinement, where the pseudo-surface produced by the volume rendering is\nutilized as the guidance information to learn a 2D pixel-level refinement\nmapping. The proposed LUTs-based refinement module achieves the performance\ngain with little computational cost and could serve as the plug-and-play module\nin the novel view synthesis task. Moreover, we propose a more effective\nconditional positional encoding toward the 4D data that achieves performance\ngain with negligible computational burdens. 
Extensive experiments demonstrate\nthat the proposed method achieves state-of-the-art performance at a low\ncomputational cost.\n","authors":["Wanshui Gan","Hongbin Xu","Yi Huang","Shifeng Chen","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2205.14332v3.pdf","comment":"Code released. Accepted by IEEE TVCG 2023"},{"id":"http://arxiv.org/abs/2403.16469v1","updated":"2024-03-25T06:50:25Z","published":"2024-03-25T06:50:25Z","title":"Learning from Reduced Labels for Long-Tailed Data","summary":" Long-tailed data is prevalent in real-world classification tasks and heavily\nrelies on supervised information, which makes the annotation process\nexceptionally labor-intensive and time-consuming. Unfortunately, despite being\na common approach to mitigate labeling costs, existing weakly supervised\nlearning methods struggle to adequately preserve supervised information for\ntail samples, resulting in a decline in accuracy for the tail classes. To\nalleviate this problem, we introduce a novel weakly supervised labeling setting\ncalled Reduced Label. The proposed labeling setting not only avoids the decline\nof supervised information for the tail samples, but also decreases the labeling\ncosts associated with long-tailed data. Additionally, we propose an\nstraightforward and highly efficient unbiased framework with strong theoretical\nguarantees to learn from these Reduced Labels. Extensive experiments conducted\non benchmark datasets including ImageNet validate the effectiveness of our\napproach, surpassing the performance of state-of-the-art weakly supervised\nmethods.\n","authors":["Meng Wei","Zhongnian Li","Yong Zhou","Xinzheng Xu"],"pdf_url":"https://arxiv.org/pdf/2403.16469v1.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.17719v1","updated":"2024-03-25T05:21:26Z","published":"2024-03-25T05:21:26Z","title":"Resolution Limit of Single-Photon LiDAR","summary":" Single-photon Light Detection and Ranging (LiDAR) systems are often equipped\nwith an array of detectors for improved spatial resolution and sensing speed.\nHowever, given a fixed amount of flux produced by the laser transmitter across\nthe scene, the per-pixel Signal-to-Noise Ratio (SNR) will decrease when more\npixels are packed in a unit space. This presents a fundamental trade-off\nbetween the spatial resolution of the sensor array and the SNR received at each\npixel. Theoretical characterization of this fundamental limit is explored. By\nderiving the photon arrival statistics and introducing a series of new\napproximation techniques, the Mean Squared Error (MSE) of the\nmaximum-likelihood estimator of the time delay is derived. The theoretical\npredictions align well with simulations and real data.\n","authors":["Stanley H. Chan","Hashan K. Weerasooriya","Weijian Zhang","Pamela Abshire","Istvan Gyongy","Robert K. Henderson"],"pdf_url":"https://arxiv.org/pdf/2403.17719v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16374v1","updated":"2024-03-25T02:38:34Z","published":"2024-03-25T02:38:34Z","title":"ProIn: Learning to Predict Trajectory Based on Progressive Interactions\n for Autonomous Driving","summary":" Accurate motion prediction of pedestrians, cyclists, and other surrounding\nvehicles (all called agents) is very important for autonomous driving. Most\nexisting works capture map information through an one-stage interaction with\nmap by vector-based attention, to provide map constraints for social\ninteraction and multi-modal differentiation. 
However, these methods have to\nencode all required map rules into the focal agent's feature, so as to retain\nall possible intentions' paths while at the meantime to adapt to potential\nsocial interaction. In this work, a progressive interaction network is proposed\nto enable the agent's feature to progressively focus on relevant maps, in order\nto better learn agents' feature representation capturing the relevant map\nconstraints. The network progressively encode the complex influence of map\nconstraints into the agent's feature through graph convolutions at the\nfollowing three stages: after historical trajectory encoder, after social\ninteraction, and after multi-modal differentiation. In addition, a weight\nallocation mechanism is proposed for multi-modal training, so that each mode\ncan obtain learning opportunities from a single-mode ground truth. Experiments\nhave validated the superiority of progressive interactions to the existing\none-stage interaction, and demonstrate the effectiveness of each component.\nEncouraging results were obtained in the challenging benchmarks.\n","authors":["Yinke Dong","Haifeng Yuan","Hongkun Liu","Wei Jing","Fangzhen Li","Hongmin Liu","Bin Fan"],"pdf_url":"https://arxiv.org/pdf/2403.16374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17177v1","updated":"2024-03-25T20:44:01Z","published":"2024-03-25T20:44:01Z","title":"Brain Stroke Segmentation Using Deep Learning Models: A Comparative\n Study","summary":" Stroke segmentation plays a crucial role in the diagnosis and treatment of\nstroke patients by providing spatial information about affected brain regions\nand the extent of damage. Segmenting stroke lesions accurately is a challenging\ntask, given that conventional manual techniques are time consuming and prone to\nerrors. Recently, advanced deep models have been introduced for general medical\nimage segmentation, demonstrating promising results that surpass many state of\nthe art networks when evaluated on specific datasets. With the advent of the\nvision Transformers, several models have been introduced based on them, while\nothers have aimed to design better modules based on traditional convolutional\nlayers to extract long-range dependencies like Transformers. The question of\nwhether such high-level designs are necessary for all segmentation cases to\nachieve the best results remains unanswered. In this study, we selected four\ntypes of deep models that were recently proposed and evaluated their\nperformance for stroke segmentation: a pure Transformer-based architecture\n(DAE-Former), two advanced CNN-based models (LKA and DLKA) with attention\nmechanisms in their design, an advanced hybrid model that incorporates CNNs\nwith Transformers (FCT), and the well-known self-adaptive nnUNet framework with\nits configuration based on given data. We examined their performance on two\npublicly available datasets, and found that the nnUNet achieved the best\nresults with the simplest design among all. Revealing the robustness issue of\nTransformers to such variabilities serves as a potential reason for their\nweaker performance. Furthermore, nnUNet's success underscores the significant\nimpact of preprocessing and postprocessing techniques in enhancing segmentation\nresults, surpassing the focus solely on architectural designs\n","authors":["Ahmed Soliman","Yousif Yousif","Ahmed Ibrahim","Yalda Zafari-Ghadim","Essam A. 
Rashed","Mohamed Mabrok"],"pdf_url":"https://arxiv.org/pdf/2403.17177v1.pdf","comment":null}]},"2024-03-24T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2310.15168v3","updated":"2024-03-24T23:32:50Z","published":"2023-10-23T17:59:52Z","title":"Ghost on the Shell: An Expressive Representation of General 3D Shapes","summary":" The creation of photorealistic virtual worlds requires the accurate modeling\nof 3D surface geometry for a wide range of objects. For this, meshes are\nappealing since they 1) enable fast physics-based rendering with realistic\nmaterial and lighting, 2) support physical simulation, and 3) are\nmemory-efficient for modern graphics pipelines. Recent work on reconstructing\nand statistically modeling 3D shape, however, has critiqued meshes as being\ntopologically inflexible. To capture a wide range of object shapes, any 3D\nrepresentation must be able to model solid, watertight, shapes as well as thin,\nopen, surfaces. Recent work has focused on the former, and methods for\nreconstructing open surfaces do not support fast reconstruction with material\nand lighting or unconditional generative modelling. Inspired by the observation\nthat open surfaces can be seen as islands floating on watertight surfaces, we\nparameterize open surfaces by defining a manifold signed distance field on\nwatertight templates. With this parameterization, we further develop a\ngrid-based and differentiable representation that parameterizes both watertight\nand non-watertight meshes of arbitrary topology. Our new representation, called\nGhost-on-the-Shell (G-Shell), enables two important applications:\ndifferentiable rasterization-based reconstruction from multiview images and\ngenerative modelling of non-watertight meshes. We empirically demonstrate that\nG-Shell achieves state-of-the-art performance on non-watertight mesh\nreconstruction and generation tasks, while also performing effectively for\nwatertight meshes.\n","authors":["Zhen Liu","Yao Feng","Yuliang Xiu","Weiyang Liu","Liam Paull","Michael J. Black","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2310.15168v3.pdf","comment":"ICLR 2024 Oral (v3: 30 pages, 19 figures, Project Page:\n https://gshell3d.github.io/)"},{"id":"http://arxiv.org/abs/2311.01623v3","updated":"2024-03-24T23:13:06Z","published":"2023-11-03T16:58:10Z","title":"VQPy: An Object-Oriented Approach to Modern Video Analytics","summary":" Video analytics is widely used in contemporary systems and services. At the\nforefront of video analytics are video queries that users develop to find\nobjects of particular interest. Building upon the insight that video objects\n(e.g., human, animals, cars, etc.), the center of video analytics, are similar\nin spirit to objects modeled by traditional object-oriented languages, we\npropose to develop an object-oriented approach to video analytics. This\napproach, named VQPy, consists of a frontend$\\unicode{x2015}$a Python variant\nwith constructs that make it easy for users to express video objects and their\ninteractions$\\unicode{x2015}$as well as an extensible backend that can\nautomatically construct and optimize pipelines based on video objects. 
We have\nimplemented and open-sourced VQPy, which has been productized in Cisco as part\nof its DeepVision framework.\n","authors":["Shan Yu","Zhenting Zhu","Yu Chen","Hanchen Xu","Pengzhan Zhao","Yang Wang","Arthi Padmanabhan","Hugo Latapie","Harry Xu"],"pdf_url":"https://arxiv.org/pdf/2311.01623v3.pdf","comment":"MLSys'24"},{"id":"http://arxiv.org/abs/2403.16318v1","updated":"2024-03-24T22:53:16Z","published":"2024-03-24T22:53:16Z","title":"AutoInst: Automatic Instance-Based Segmentation of LiDAR 3D Scans","summary":" Recently, progress in acquisition equipment such as LiDAR sensors has enabled\nsensing increasingly spacious outdoor 3D environments. Making sense of such 3D\nacquisitions requires fine-grained scene understanding, such as constructing\ninstance-based 3D scene segmentations. Commonly, a neural network is trained\nfor this task; however, this requires access to a large, densely annotated\ndataset, which is widely known to be challenging to obtain. To address this\nissue, in this work we propose to predict instance segmentations for 3D scenes\nin an unsupervised way, without relying on ground-truth annotations. To this\nend, we construct a learning framework consisting of two components: (1) a\npseudo-annotation scheme for generating initial unsupervised pseudo-labels; and\n(2) a self-training algorithm for instance segmentation to fit robust, accurate\ninstances from initial noisy proposals. To enable generating 3D instance mask\nproposals, we construct a weighted proxy-graph by connecting 3D points with\nedges integrating multi-modal image- and point-based self-supervised features,\nand perform graph-cuts to isolate individual pseudo-instances. We then build on\na state-of-the-art point-based architecture and train a 3D instance\nsegmentation model, resulting in significant refinement of initial proposals.\nTo scale to arbitrary complexity 3D scenes, we design our algorithm to operate\non local 3D point chunks and construct a merging step to generate scene-level\ninstance segmentations. Experiments on the challenging SemanticKITTI benchmark\ndemonstrate the potential of our approach, where it attains 13.3% higher\nAverage Precision and 9.1% higher F1 score compared to the best-performing\nbaseline. The code will be made publicly available at\nhttps://github.com/artonson/autoinst.\n","authors":["Cedric Perauer","Laurenz Adrian Heidrich","Haifan Zhang","Matthias Nießner","Anastasiia Kornilova","Alexey Artemov"],"pdf_url":"https://arxiv.org/pdf/2403.16318v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.03881v2","updated":"2024-03-24T21:38:49Z","published":"2024-03-06T17:41:41Z","title":"Latent Dataset Distillation with Diffusion Models","summary":" The efficacy of machine learning has traditionally relied on the availability\nof increasingly larger datasets. However, large datasets pose storage\nchallenges and contain non-influential samples, which could be ignored during\ntraining without impacting the final accuracy of the model. In response to\nthese limitations, the concept of distilling the information on a dataset into\na condensed set of (synthetic) samples, namely a distilled dataset, emerged.\nOne crucial aspect is the selected architecture (usually ConvNet) for linking\nthe original and synthetic datasets. However, the final accuracy is lower if\nthe employed model architecture differs from the model used during\ndistillation. Another challenge is the generation of high-resolution images,\ne.g., 128x128 and higher. 
In this paper, we propose Latent Dataset Distillation\nwith Diffusion Models (LD3M) that combine diffusion in latent space with\ndataset distillation to tackle both challenges. LD3M incorporates a novel\ndiffusion process tailored for dataset distillation, which improves the\ngradient norms for learning synthetic images. By adjusting the number of\ndiffusion steps, LD3M also offers a straightforward way of controlling the\ntrade-off between speed and accuracy. We evaluate our approach in several\nImageNet subsets and for high-resolution images (128x128 and 256x256). As a\nresult, LD3M consistently outperforms state-of-the-art distillation techniques\nby up to 4.8 p.p. and 4.2 p.p. for 1 and 10 images per class, respectively.\n","authors":["Brian B. Moser","Federico Raue","Sebastian Palacio","Stanislav Frolov","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2403.03881v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16292v1","updated":"2024-03-24T20:48:36Z","published":"2024-03-24T20:48:36Z","title":"latentSplat: Autoencoding Variational Gaussians for Fast Generalizable\n 3D Reconstruction","summary":" We present latentSplat, a method to predict semantic Gaussians in a 3D latent\nspace that can be splatted and decoded by a light-weight generative 2D\narchitecture. Existing methods for generalizable 3D reconstruction either do\nnot enable fast inference of high resolution novel views due to slow volume\nrendering, or are limited to interpolation of close input views, even in\nsimpler settings with a single central object, where 360-degree generalization\nis possible. In this work, we combine a regression-based approach with a\ngenerative model, moving towards both of these capabilities within the same\nmethod, trained purely on readily available real video data. The core of our\nmethod are variational 3D Gaussians, a representation that efficiently encodes\nvarying uncertainty within a latent space consisting of 3D feature Gaussians.\nFrom these Gaussians, specific instances can be sampled and rendered via\nefficient Gaussian splatting and a fast, generative decoder network. We show\nthat latentSplat outperforms previous works in reconstruction quality and\ngeneralization, while being fast and scalable to high-resolution data.\n","authors":["Christopher Wewer","Kevin Raj","Eddy Ilg","Bernt Schiele","Jan Eric Lenssen"],"pdf_url":"https://arxiv.org/pdf/2403.16292v1.pdf","comment":"Project website: https://geometric-rl.mpi-inf.mpg.de/latentsplat/"},{"id":"http://arxiv.org/abs/2403.16286v1","updated":"2024-03-24T20:31:42Z","published":"2024-03-24T20:31:42Z","title":"HemoSet: The First Blood Segmentation Dataset for Automation of\n Hemostasis Management","summary":" Hemorrhaging occurs in surgeries of all types, forcing surgeons to quickly\nadapt to the visual interference that results from blood rapidly filling the\nsurgical field. Introducing automation into the crucial surgical task of\nhemostasis management would offload mental and physical tasks from the surgeon\nand surgical assistants while simultaneously increasing the efficiency and\nsafety of the operation. The first step in automation of hemostasis management\nis detection of blood in the surgical field. To propel the development of blood\ndetection algorithms in surgeries, we present HemoSet, the first blood\nsegmentation dataset based on bleeding during a live animal robotic surgery.\nOur dataset features vessel hemorrhage scenarios where turbulent flow leads to\nabnormal pooling geometries in surgical fields. 
These pools are formed in\nconditions endemic to surgical procedures -- uneven heterogeneous tissue, under\nglossy lighting conditions and rapid tool movement. We benchmark several\nstate-of-the-art segmentation models and provide insight into the difficulties\nspecific to blood detection. We intend for HemoSet to spur development of\nautonomous blood suction tools by providing a platform for training and\nrefining blood segmentation models, addressing the precision needed for such\nrobotics.\n","authors":["Albert J. Miao Shan Lin","Jingpei Lu","Florian Richter","Benjamin Ostrander","Emily K. Funk","Ryan K. Orosco","Michael C. Yip"],"pdf_url":"https://arxiv.org/pdf/2403.16286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04926v2","updated":"2024-03-24T20:25:03Z","published":"2024-03-07T22:21:08Z","title":"BAGS: Blur Agnostic Gaussian Splatting through Multi-Scale Kernel\n Modeling","summary":" Recent efforts in using 3D Gaussians for scene reconstruction and novel view\nsynthesis can achieve impressive results on curated benchmarks; however, images\ncaptured in real life are often blurry. In this work, we analyze the robustness\nof Gaussian-Splatting-based methods against various image blur, such as motion\nblur, defocus blur, downscaling blur, \\etc. Under these degradations,\nGaussian-Splatting-based methods tend to overfit and produce worse results than\nNeural-Radiance-Field-based methods. To address this issue, we propose Blur\nAgnostic Gaussian Splatting (BAGS). BAGS introduces additional 2D modeling\ncapacities such that a 3D-consistent and high quality scene can be\nreconstructed despite image-wise blur. Specifically, we model blur by\nestimating per-pixel convolution kernels from a Blur Proposal Network (BPN).\nBPN is designed to consider spatial, color, and depth variations of the scene\nto maximize modeling capacity. Additionally, BPN also proposes a\nquality-assessing mask, which indicates regions where blur occur. Finally, we\nintroduce a coarse-to-fine kernel optimization scheme; this optimization scheme\nis fast and avoids sub-optimal solutions due to a sparse point cloud\ninitialization, which often occurs when we apply Structure-from-Motion on\nblurry images. We demonstrate that BAGS achieves photorealistic renderings\nunder various challenging blur conditions and imaging geometry, while\nsignificantly improving upon existing approaches.\n","authors":["Cheng Peng","Yutao Tang","Yifan Zhou","Nengyu Wang","Xijun Liu","Deming Li","Rama Chellappa"],"pdf_url":"https://arxiv.org/pdf/2403.04926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16276v1","updated":"2024-03-24T19:50:49Z","published":"2024-03-24T19:50:49Z","title":"AVicuna: Audio-Visual LLM with Interleaver and Context-Boundary\n Alignment for Temporal Referential Dialogue","summary":" In everyday communication, humans frequently use speech and gestures to refer\nto specific areas or objects, a process known as Referential Dialogue (RD).\nWhile prior studies have investigated RD through Large Language Models (LLMs)\nor Large Multimodal Models (LMMs) in static contexts, the exploration of\nTemporal Referential Dialogue (TRD) within audio-visual media remains limited.\nTwo primary challenges hinder progress in this field: (1) the absence of\ncomprehensive, untrimmed audio-visual video datasets with precise temporal\nannotations, and (2) the need for methods to integrate complex temporal\nauditory and visual cues effectively. 
To address these challenges, we introduce\na novel framework to generate PU-VALOR, an extensive audio-visual dataset\ncomprising over 114,000 untrimmed videos with accurate temporal demarcations.\nWe also present AVicuna, featuring an Audio-Visual Tokens Interleaver (AVTI)\nthat ensures the temporal alignment of audio-visual information. Additionally,\nwe develop the A5-222K dataset, encompassing more than 200,000 audio-text\npairings, to facilitate the audio and text alignments. Our experiments\ndemonstrate that AVicuna can effectively handle TRD in audio-visual videos and\nachieve state-of-the-art performance on various audio-visual video\nunderstanding tasks, particularly in untrimmed videos. We further investigate\nthe optimal audio-interleaving rate for interleaved audio-visual inputs, which\nmaximizes performance on the Audio-Visual Event Dense Localization task.\n","authors":["Yunlong Tang","Daiki Shimada","Jing Bi","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.16276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.00915v3","updated":"2024-03-24T19:39:00Z","published":"2022-09-02T09:50:31Z","title":"Detection of diabetic retinopathy using longitudinal self-supervised\n learning","summary":" Longitudinal imaging is able to capture both static anatomical structures and\ndynamic changes in disease progression towards earlier and better\npatient-specific pathology management. However, conventional approaches for\ndetecting diabetic retinopathy (DR) rarely take advantage of longitudinal\ninformation to improve DR analysis. In this work, we investigate the benefit of\nexploiting self-supervised learning with a longitudinal nature for DR diagnosis\npurposes. We compare different longitudinal self-supervised learning (LSSL)\nmethods to model the disease progression from longitudinal retinal color fundus\nphotographs (CFP) to detect early DR severity changes using a pair of\nconsecutive exams. The experiments were conducted on a longitudinal DR\nscreening dataset with or without those trained encoders (LSSL) acting as a\nlongitudinal pretext task. Results achieve an AUC of 0.875 for the baseline\n(model trained from scratch) and an AUC of 0.96 (95% CI: 0.9593-0.9655 DeLong\ntest) with a p-value < 2.2e-16 on early fusion using a simple ResNet alike\narchitecture with frozen LSSL weights, suggesting that the LSSL latent space\nenables to encode the dynamic of DR progression.\n","authors":["Rachid Zeghlache","Pierre-Henri Conze","Mostafa El Habib Daho","Ramin Tadayoni","Pascal Massin","Béatrice Cochener","Gwenolé Quellec","Mathieu Lamard"],"pdf_url":"https://arxiv.org/pdf/2209.00915v3.pdf","comment":"Accepted preprint for presentation at MICCAI-OMIA"},{"id":"http://arxiv.org/abs/2403.16272v1","updated":"2024-03-24T19:34:33Z","published":"2024-03-24T19:34:33Z","title":"L-MAE: Longitudinal masked auto-encoder with time and severity-aware\n encoding for diabetic retinopathy progression prediction","summary":" Pre-training strategies based on self-supervised learning (SSL) have proven\nto be effective pretext tasks for many downstream tasks in computer vision. Due\nto the significant disparity between medical and natural images, the\napplication of typical SSL is not straightforward in medical imaging.\nAdditionally, those pretext tasks often lack context, which is critical for\ncomputer-aided clinical decision support. In this paper, we developed a\nlongitudinal masked auto-encoder (MAE) based on the well-known\nTransformer-based MAE. 
In particular, we explored the importance of time-aware\nposition embedding as well as disease progression-aware masking. Taking into\naccount the time between examinations instead of just scheduling them offers\nthe benefit of capturing temporal changes and trends. The masking strategy, for\nits part, evolves during follow-up to better capture pathological changes,\nensuring a more accurate assessment of disease progression. Using OPHDIAT, a\nlarge follow-up screening dataset targeting diabetic retinopathy (DR), we\nevaluated the pre-trained weights on a longitudinal task, which is to predict\nthe severity label of the next visit within 3 years based on the past time\nseries examinations. Our results demonstrated the relevancy of both time-aware\nposition embedding and masking strategies based on disease progression\nknowledge. Compared to popular baseline models and standard longitudinal\nTransformers, these simple yet effective extensions significantly enhance the\npredictive ability of deep classification models.\n","authors":["Rachid Zeghlache","Pierre-Henri Conze","Mostafa El Habib Daho","Yihao Li","Alireza Rezaei","Hugo Le Boité","Ramin Tadayoni","Pascal Massin","Béatrice Cochener","Ikram Brahim","Gwenolé Quellec","Mathieu Lamard"],"pdf_url":"https://arxiv.org/pdf/2403.16272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16271v1","updated":"2024-03-24T19:32:39Z","published":"2024-03-24T19:32:39Z","title":"Object Detectors in the Open Environment:Challenges, Solutions, and\n Outlook","summary":" With the emergence of foundation models, deep learning-based object detectors\nhave shown practical usability in closed set scenarios. However, for real-world\ntasks, object detectors often operate in open environments, where crucial\nfactors (\\eg, data distribution, objective) that influence model learning are\noften changing. The dynamic and intricate nature of the open environment poses\nnovel and formidable challenges to object detectors. Unfortunately, current\nresearch on object detectors in open environments lacks a comprehensive\nanalysis of their distinctive characteristics, challenges, and corresponding\nsolutions, which hinders their secure deployment in critical real-world\nscenarios. This paper aims to bridge this gap by conducting a comprehensive\nreview and analysis of object detectors in open environments. We initially\nidentified limitations of key structural components within the existing\ndetection pipeline and propose the open environment object detector challenge\nframework that includes four quadrants (\\ie, out-of-domain, out-of-category,\nrobust learning, and incremental learning) based on the dimensions of the data\n/ target changes. For each quadrant of challenges in the proposed framework, we\npresent a detailed description and systematic analysis of the overarching goals\nand core difficulties, systematically review the corresponding solutions, and\nbenchmark their performance over multiple widely adopted datasets. In addition,\nwe engage in a discussion of open problems and potential avenues for future\nresearch. 
This paper aims to provide a fresh, comprehensive, and systematic\nunderstanding of the challenges and solutions associated with open-environment\nobject detectors, thus catalyzing the development of more solid applications in\nreal-world scenarios.\n","authors":["Siyuan Liang","Wei Wang","Ruoyu Chen","Aishan Liu","Boxi Wu","Ee-Chien Chang","Xiaochun Cao","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.16271v1.pdf","comment":"32 pages, 17 figures"},{"id":"http://arxiv.org/abs/2403.16270v1","updated":"2024-03-24T19:22:15Z","published":"2024-03-24T19:22:15Z","title":"Constricting Normal Latent Space for Anomaly Detection with Normal-only\n Training Data","summary":" In order to devise an anomaly detection model using only normal training\ndata, an autoencoder (AE) is typically trained to reconstruct the data. As a\nresult, the AE can extract normal representations in its latent space. During\ntest time, since AE is not trained using real anomalies, it is expected to\npoorly reconstruct the anomalous data. However, several researchers have\nobserved that it is not the case. In this work, we propose to limit the\nreconstruction capability of AE by introducing a novel latent constriction\nloss, which is added to the existing reconstruction loss. By using our method,\nno extra computational cost is added to the AE during test time. Evaluations\nusing three video anomaly detection benchmark datasets, i.e., Ped2, Avenue, and\nShanghaiTech, demonstrate the effectiveness of our method in limiting the\nreconstruction capability of AE, which leads to a better anomaly detection\nmodel.\n","authors":["Marcella Astrid","Muhammad Zaigham Zaheer","Seung-Ik Lee"],"pdf_url":"https://arxiv.org/pdf/2403.16270v1.pdf","comment":"ICLR Workshop 2024 (PML4LRS)"},{"id":"http://arxiv.org/abs/2303.12054v3","updated":"2024-03-24T19:16:21Z","published":"2023-03-21T17:45:38Z","title":"Influencer Backdoor Attack on Semantic Segmentation","summary":" When a small number of poisoned samples are injected into the training\ndataset of a deep neural network, the network can be induced to exhibit\nmalicious behavior during inferences, which poses potential threats to\nreal-world applications. While they have been intensively studied in\nclassification, backdoor attacks on semantic segmentation have been largely\noverlooked. Unlike classification, semantic segmentation aims to classify every\npixel within a given image. In this work, we explore backdoor attacks on\nsegmentation models to misclassify all pixels of a victim class by injecting a\nspecific trigger on non-victim pixels during inferences, which is dubbed\nInfluencer Backdoor Attack (IBA). IBA is expected to maintain the\nclassification accuracy of non-victim pixels and mislead classifications of all\nvictim pixels in every single inference and could be easily applied to\nreal-world scenes. Based on the context aggregation ability of segmentation\nmodels, we proposed a simple, yet effective, Nearest-Neighbor trigger injection\nstrategy. We also introduce an innovative Pixel Random Labeling strategy which\nmaintains optimal performance even when the trigger is placed far from the\nvictim pixels. 
Our extensive experiments reveal that current segmentation\nmodels do suffer from backdoor attacks, demonstrate IBA real-world\napplicability, and show that our proposed techniques can further increase\nattack performance.\n","authors":["Haoheng Lan","Jindong Gu","Philip Torr","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2303.12054v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16263v1","updated":"2024-03-24T18:53:57Z","published":"2024-03-24T18:53:57Z","title":"Emotion Recognition from the perspective of Activity Recognition","summary":" Applications of an efficient emotion recognition system can be found in\nseveral domains such as medicine, driver fatigue surveillance, social robotics,\nand human-computer interaction. Appraising human emotional states, behaviors,\nand reactions displayed in real-world settings can be accomplished using latent\ncontinuous dimensions. Continuous dimensional models of human affect, such as\nthose based on valence and arousal are more accurate in describing a broad\nrange of spontaneous everyday emotions than more traditional models of discrete\nstereotypical emotion categories (e.g. happiness, surprise). Most of the prior\nwork on estimating valence and arousal considers laboratory settings and acted\ndata. But, for emotion recognition systems to be deployed and integrated into\nreal-world mobile and computing devices, we need to consider data collected in\nthe world. Action recognition is a domain of Computer Vision that involves\ncapturing complementary information on appearance from still frames and motion\nbetween frames. In this paper, we treat emotion recognition from the\nperspective of action recognition by exploring the application of deep learning\narchitectures specifically designed for action recognition, for continuous\naffect recognition. We propose a novel three-stream end-to-end deep learning\nregression pipeline with an attention mechanism, which is an ensemble design\nbased on sub-modules of multiple state-of-the-art action recognition systems.\nThe pipeline constitutes a novel data pre-processing approach with a spatial\nself-attention mechanism to extract keyframes. The optical flow of\nhigh-attention regions of the face is extracted to capture temporal context.\nAFEW-VA in-the-wild dataset has been used to conduct comparative experiments.\nQuantitative analysis shows that the proposed model outperforms multiple\nstandard baselines of both emotion recognition and action recognition models.\n","authors":["Savinay Nagendra","Prapti Panigrahi"],"pdf_url":"https://arxiv.org/pdf/2403.16263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16260v1","updated":"2024-03-24T18:43:04Z","published":"2024-03-24T18:43:04Z","title":"Out-of-Distribution Detection via Deep Multi-Comprehension Ensemble","summary":" Recent research underscores the pivotal role of the Out-of-Distribution (OOD)\nfeature representation field scale in determining the efficacy of models in OOD\ndetection. Consequently, the adoption of model ensembles has emerged as a\nprominent strategy to augment this feature representation field, capitalizing\non anticipated model diversity.\n However, our introduction of novel qualitative and quantitative model\nensemble evaluation methods, specifically Loss Basin/Barrier Visualization and\nthe Self-Coupling Index, reveals a critical drawback in existing ensemble\nmethods. 
We find that these methods incorporate weights that are\naffine-transformable, exhibiting limited variability and thus failing to\nachieve the desired diversity in feature representation.\n To address this limitation, we elevate the dimensions of traditional model\nensembles, incorporating various factors such as different weight\ninitializations, data holdout, etc., into distinct supervision tasks. This\ninnovative approach, termed Multi-Comprehension (MC) Ensemble, leverages\ndiverse training tasks to generate distinct comprehensions of the data and\nlabels, thereby extending the feature representation field.\n Our experimental results demonstrate the superior performance of the MC\nEnsemble strategy in OOD detection compared to both the naive Deep Ensemble\nmethod and a standalone model of comparable size. This underscores the\neffectiveness of our proposed approach in enhancing the model's capability to\ndetect instances outside its training distribution.\n","authors":["Chenhui Xu","Fuxun Yu","Zirui Xu","Nathan Inkawhich","Xiang Chen"],"pdf_url":"https://arxiv.org/pdf/2403.16260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16258v1","updated":"2024-03-24T18:33:16Z","published":"2024-03-24T18:33:16Z","title":"Laplacian-guided Entropy Model in Neural Codec with Blur-dissipated\n Synthesis","summary":" While replacing Gaussian decoders with a conditional diffusion model enhances\nthe perceptual quality of reconstructions in neural image compression, their\nlack of inductive bias for image data restricts their ability to achieve\nstate-of-the-art perceptual levels. To address this limitation, we adopt a\nnon-isotropic diffusion model at the decoder side. This model imposes an\ninductive bias aimed at distinguishing between frequency contents, thereby\nfacilitating the generation of high-quality images. Moreover, our framework is\nequipped with a novel entropy model that accurately models the probability\ndistribution of latent representation by exploiting spatio-channel correlations\nin latent space, while accelerating the entropy decoding step. This\nchannel-wise entropy model leverages both local and global spatial contexts\nwithin each channel chunk. The global spatial context is built upon the\nTransformer, which is specifically designed for image compression tasks. The\ndesigned Transformer employs a Laplacian-shaped positional encoding, the\nlearnable parameters of which are adaptively adjusted for each channel cluster.\nOur experiments demonstrate that our proposed framework yields better\nperceptual quality compared to cutting-edge generative-based codecs, and the\nproposed entropy model contributes to notable bitrate savings.\n","authors":["Atefeh Khoshkhahtinat","Ali Zafari","Piyush M. Mehta","Nasser M. Nasrabadi"],"pdf_url":"https://arxiv.org/pdf/2403.16258v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.16257v1","updated":"2024-03-24T18:33:15Z","published":"2024-03-24T18:33:15Z","title":"Unlearning Backdoor Threats: Enhancing Backdoor Defense in Multimodal\n Contrastive Learning via Local Token Unlearning","summary":" Multimodal contrastive learning has emerged as a powerful paradigm for\nbuilding high-quality features using the complementary strengths of various\ndata modalities. However, the open nature of such systems inadvertently\nincreases the possibility of backdoor attacks. 
These attacks subtly embed\nmalicious behaviors within the model during training, which can be activated by\nspecific triggers in the inference phase, posing significant security risks.\nDespite existing countermeasures through fine-tuning that reduce the adverse\nimpacts of such attacks, these defenses often degrade the clean accuracy and\nnecessitate the construction of extensive clean training pairs. In this paper,\nwe explore the possibility of a less-cost defense from the perspective of model\nunlearning, that is, whether the model can be made to quickly \\textbf{u}nlearn\n\\textbf{b}ackdoor \\textbf{t}hreats (UBT) by constructing a small set of\npoisoned samples. Specifically, we strengthen the backdoor shortcuts to\ndiscover suspicious samples through overfitting training prioritized by weak\nsimilarity samples. Building on the initial identification of suspicious\nsamples, we introduce an innovative token-based localized forgetting training\nregime. This technique specifically targets the poisoned aspects of the model,\napplying a focused effort to unlearn the backdoor associations and trying not\nto damage the integrity of the overall model. Experimental results show that\nour method not only ensures a minimal success rate for attacks, but also\npreserves the model's high clean accuracy.\n","authors":["Siyuan Liang","Kuanrong Liu","Jiajun Gong","Jiawei Liang","Yuan Xun","Ee-Chien Chang","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2403.16257v1.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2403.06912v3","updated":"2024-03-24T18:10:11Z","published":"2024-03-11T17:02:11Z","title":"DNGaussian: Optimizing Sparse-View 3D Gaussian Radiance Fields with\n Global-Local Depth Normalization","summary":" Radiance fields have demonstrated impressive performance in synthesizing\nnovel views from sparse input views, yet prevailing methods suffer from high\ntraining costs and slow inference speed. This paper introduces DNGaussian, a\ndepth-regularized framework based on 3D Gaussian radiance fields, offering\nreal-time and high-quality few-shot novel view synthesis at low costs. Our\nmotivation stems from the highly efficient representation and surprising\nquality of the recent 3D Gaussian Splatting, despite it will encounter a\ngeometry degradation when input views decrease. In the Gaussian radiance\nfields, we find this degradation in scene geometry primarily lined to the\npositioning of Gaussian primitives and can be mitigated by depth constraint.\nConsequently, we propose a Hard and Soft Depth Regularization to restore\naccurate scene geometry under coarse monocular depth supervision while\nmaintaining a fine-grained color appearance. To further refine detailed\ngeometry reshaping, we introduce Global-Local Depth Normalization, enhancing\nthe focus on small local depth changes. Extensive experiments on LLFF, DTU, and\nBlender datasets demonstrate that DNGaussian outperforms state-of-the-art\nmethods, achieving comparable or better results with significantly reduced\nmemory cost, a $25 \\times$ reduction in training time, and over $3000 \\times$\nfaster rendering speed.\n","authors":["Jiahe Li","Jiawei Zhang","Xiao Bai","Jin Zheng","Xin Ning","Jun Zhou","Lin Gu"],"pdf_url":"https://arxiv.org/pdf/2403.06912v3.pdf","comment":"Accepted at CVPR 2024. 
Project page:\n https://fictionarry.github.io/DNGaussian/"},{"id":"http://arxiv.org/abs/2306.12547v2","updated":"2024-03-24T18:00:57Z","published":"2023-06-21T20:21:15Z","title":"DGC-GNN: Leveraging Geometry and Color Cues for Visual Descriptor-Free\n 2D-3D Matching","summary":" Matching 2D keypoints in an image to a sparse 3D point cloud of the scene\nwithout requiring visual descriptors has garnered increased interest due to its\nlow memory requirements, inherent privacy preservation, and reduced need for\nexpensive 3D model maintenance compared to visual descriptor-based methods.\nHowever, existing algorithms often compromise on performance, resulting in a\nsignificant deterioration compared to their descriptor-based counterparts. In\nthis paper, we introduce DGC-GNN, a novel algorithm that employs a\nglobal-to-local Graph Neural Network (GNN) that progressively exploits\ngeometric and color cues to represent keypoints, thereby improving matching\naccuracy. Our procedure encodes both Euclidean and angular relations at a\ncoarse level, forming the geometric embedding to guide the point matching. We\nevaluate DGC-GNN on both indoor and outdoor datasets, demonstrating that it not\nonly doubles the accuracy of the state-of-the-art visual descriptor-free\nalgorithm but also substantially narrows the performance gap between\ndescriptor-based and descriptor-free methods.\n","authors":["Shuzhe Wang","Juho Kannala","Daniel Barath"],"pdf_url":"https://arxiv.org/pdf/2306.12547v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16246v1","updated":"2024-03-24T17:33:22Z","published":"2024-03-24T17:33:22Z","title":"Partially Blinded Unlearning: Class Unlearning for Deep Networks a\n Bayesian Perspective","summary":" In order to adhere to regulatory standards governing individual data privacy\nand safety, machine learning models must systematically eliminate information\nderived from specific subsets of a user's training data that can no longer be\nutilized. The emerging discipline of Machine Unlearning has arisen as a pivotal\narea of research, facilitating the process of selectively discarding\ninformation designated to specific sets or classes of data from a pre-trained\nmodel, thereby eliminating the necessity for extensive retraining from scratch.\nThe principal aim of this study is to formulate a methodology tailored for the\npurposeful elimination of information linked to a specific class of data from a\npre-trained classification network. This intentional removal is crafted to\ndegrade the model's performance specifically concerning the unlearned data\nclass while concurrently minimizing any detrimental impacts on the model's\nperformance in other classes. To achieve this goal, we frame the class\nunlearning problem from a Bayesian perspective, which yields a loss function\nthat minimizes the log-likelihood associated with the unlearned data with a\nstability regularization in parameter space. This stability regularization\nincorporates Mohalanobis distance with respect to the Fisher Information matrix\nand $l_2$ distance from the pre-trained model parameters. Our novel approach,\ntermed \\textbf{Partially-Blinded Unlearning (PBU)}, surpasses existing\nstate-of-the-art class unlearning methods, demonstrating superior\neffectiveness. Notably, PBU achieves this efficacy without requiring awareness\nof the entire training dataset but only to the unlearned data points, marking a\ndistinctive feature of its performance.\n","authors":["Subhodip Panda","Shashwat Sourav","Prathosh A. 
P"],"pdf_url":"https://arxiv.org/pdf/2403.16246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04364v2","updated":"2024-03-24T17:22:35Z","published":"2023-12-07T15:35:42Z","title":"DemoCaricature: Democratising Caricature Generation with a Rough Sketch","summary":" In this paper, we democratise caricature generation, empowering individuals\nto effortlessly craft personalised caricatures with just a photo and a\nconceptual sketch. Our objective is to strike a delicate balance between\nabstraction and identity, while preserving the creativity and subjectivity\ninherent in a sketch. To achieve this, we present Explicit Rank-1 Model Editing\nalongside single-image personalisation, selectively applying nuanced edits to\ncross-attention layers for a seamless merge of identity and style.\nAdditionally, we propose Random Mask Reconstruction to enhance robustness,\ndirecting the model to focus on distinctive identity and style features.\nCrucially, our aim is not to replace artists but to eliminate accessibility\nbarriers, allowing enthusiasts to engage in the artistry.\n","authors":["Dar-Yen Chen","Ayan Kumar Bhunia","Subhadeep Koley","Aneeshan Sain","Pinaki Nath Chowdhury","Yi-Zhe Song"],"pdf_url":"https://arxiv.org/pdf/2312.04364v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16244v1","updated":"2024-03-24T17:21:32Z","published":"2024-03-24T17:21:32Z","title":"On the Equivalency, Substitutability, and Flexibility of Synthetic Data","summary":" We study, from an empirical standpoint, the efficacy of synthetic data in\nreal-world scenarios. Leveraging synthetic data for training perception models\nhas become a key strategy embraced by the community due to its efficiency,\nscalability, perfect annotations, and low costs. Despite proven advantages, few\nstudies put their stress on how to efficiently generate synthetic datasets to\nsolve real-world problems and to what extent synthetic data can reduce the\neffort for real-world data collection. To answer the questions, we\nsystematically investigate several interesting properties of synthetic data --\nthe equivalency of synthetic data to real-world data, the substitutability of\nsynthetic data for real data, and the flexibility of synthetic data generators\nto close up domain gaps. Leveraging the M3Act synthetic data generator, we\nconduct experiments on DanceTrack and MOT17. Our results suggest that synthetic\ndata not only enhances model performance but also demonstrates substitutability\nfor real data, with 60% to 80% replacement without performance loss. In\naddition, our study of the impact of synthetic data distributions on downstream\nperformance reveals the importance of flexible data generators in narrowing\ndomain gaps for improved model adaptability.\n","authors":["Che-Jui Chang","Danrui Li","Seonghyeon Moon","Mubbasir Kapadia"],"pdf_url":"https://arxiv.org/pdf/2403.16244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12188v2","updated":"2024-03-24T17:19:14Z","published":"2023-09-21T15:54:33Z","title":"SG-Bot: Object Rearrangement via Coarse-to-Fine Robotic Imagination on\n Scene Graphs","summary":" Object rearrangement is pivotal in robotic-environment interactions,\nrepresenting a significant capability in embodied AI. In this paper, we present\nSG-Bot, a novel rearrangement framework that utilizes a coarse-to-fine scheme\nwith a scene graph as the scene representation. 
Unlike previous methods that\nrely on either known goal priors or zero-shot large models, SG-Bot exemplifies\nlightweight, real-time, and user-controllable characteristics, seamlessly\nblending the consideration of commonsense knowledge with automatic generation\ncapabilities. SG-Bot employs a three-fold procedure--observation, imagination,\nand execution--to adeptly address the task. Initially, objects are discerned\nand extracted from a cluttered scene during the observation. These objects are\nfirst coarsely organized and depicted within a scene graph, guided by either\ncommonsense or user-defined criteria. Then, this scene graph subsequently\ninforms a generative model, which forms a fine-grained goal scene considering\nthe shape information from the initial scene and object semantics. Finally, for\nexecution, the initial and envisioned goal scenes are matched to formulate\nrobotic action policies. Experimental results demonstrate that SG-Bot\noutperforms competitors by a large margin.\n","authors":["Guangyao Zhai","Xiaoni Cai","Dianye Huang","Yan Di","Fabian Manhardt","Federico Tombari","Nassir Navab","Benjamin Busam"],"pdf_url":"https://arxiv.org/pdf/2309.12188v2.pdf","comment":"ICRA 2024 accepted. Project website:\n https://sites.google.com/view/sg-bot"},{"id":"http://arxiv.org/abs/2403.14119v2","updated":"2024-03-24T17:16:53Z","published":"2024-03-21T04:08:29Z","title":"C-TPT: Calibrated Test-Time Prompt Tuning for Vision-Language Models via\n Text Feature Dispersion","summary":" In deep learning, test-time adaptation has gained attention as a method for\nmodel fine-tuning without the need for labeled data. A prime exemplification is\nthe recently proposed test-time prompt tuning for large-scale vision-language\nmodels such as CLIP. Unfortunately, these prompts have been mainly developed to\nimprove accuracy, overlooking the importance of calibration, which is a crucial\naspect for quantifying prediction uncertainty. However, traditional calibration\nmethods rely on substantial amounts of labeled data, making them impractical\nfor test-time scenarios. To this end, this paper explores calibration during\ntest-time prompt tuning by leveraging the inherent properties of CLIP. Through\na series of observations, we find that the prompt choice significantly affects\nthe calibration in CLIP, where the prompts leading to higher text feature\ndispersion result in better-calibrated predictions. Introducing the Average\nText Feature Dispersion (ATFD), we establish its relationship with calibration\nerror and present a novel method, Calibrated Test-time Prompt Tuning (C-TPT),\nfor optimizing prompts during test-time with enhanced calibration. Through\nextensive experiments on different CLIP architectures and datasets, we show\nthat C-TPT can effectively improve the calibration of test-time prompt tuning\nwithout needing labeled data. The code is publicly accessible at\nhttps://github.com/hee-suk-yoon/C-TPT.\n","authors":["Hee Suk Yoon","Eunseop Yoon","Joshua Tian Jin Tee","Mark Hasegawa-Johnson","Yingzhen Li","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2403.14119v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.16242v1","updated":"2024-03-24T17:13:46Z","published":"2024-03-24T17:13:46Z","title":"Adversarially Masked Video Consistency for Unsupervised Domain\n Adaptation","summary":" We study the problem of unsupervised domain adaptation for egocentric videos.\nWe propose a transformer-based model to learn class-discriminative and\ndomain-invariant feature representations. 
It consists of two novel designs. The\nfirst module is called Generative Adversarial Domain Alignment Network with the\naim of learning domain-invariant representations. It simultaneously learns a\nmask generator and a domain-invariant encoder in an adversarial way. The\ndomain-invariant encoder is trained to minimize the distance between the source\nand target domain. The masking generator, conversely, aims at producing\nchallenging masks by maximizing the domain distance. The second is a Masked\nConsistency Learning module to learn class-discriminative representations. It\nenforces the prediction consistency between the masked target videos and their\nfull forms. To better evaluate the effectiveness of domain adaptation methods,\nwe construct a more challenging benchmark for egocentric videos, U-Ego4D. Our\nmethod achieves state-of-the-art performance on the Epic-Kitchen and the\nproposed U-Ego4D benchmark.\n","authors":["Xiaoyu Zhu","Junwei Liang","Po-Yao Huang","Alex Hauptmann"],"pdf_url":"https://arxiv.org/pdf/2403.16242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16240v1","updated":"2024-03-24T17:12:13Z","published":"2024-03-24T17:12:13Z","title":"Low Rank Groupwise Deformations for Motion Tracking in Cardiac Cine MRI","summary":" Diffeomorphic image registration is a commonly used method to deform one\nimage to resemble another. While warping a single image to another is useful,\nit can be advantageous to warp multiple images simultaneously, such as in\ntracking the motion of the heart across a sequence of images. In this paper,\nour objective is to propose a novel method capable of registering a group or\nsequence of images to a target image, resulting in registered images that\nappear identical and therefore have a low rank. Moreover, we aim for these\nregistered images to closely resemble the target image. Through experimental\nevidence, we will demonstrate our method's superior efficacy in producing\nlow-rank groupwise deformations compared to other state-of-the-art approaches.\n","authors":["Sean Rendell","Jinming Duan"],"pdf_url":"https://arxiv.org/pdf/2403.16240v1.pdf","comment":"A thesis submitted to the University of Birmingham for MSc Degree"},{"id":"http://arxiv.org/abs/2403.16227v1","updated":"2024-03-24T16:41:50Z","published":"2024-03-24T16:41:50Z","title":"Dual-modal Prior Semantic Guided Infrared and Visible Image Fusion for\n Intelligent Transportation System","summary":" Infrared and visible image fusion (IVF) plays an important role in\nintelligent transportation system (ITS). The early works predominantly focus on\nboosting the visual appeal of the fused result, and only several recent\napproaches have tried to combine the high-level vision task with IVF. However,\nthey prioritize the design of cascaded structure to seek unified suitable\nfeatures and fit different tasks. Thus, they tend to typically bias toward to\nreconstructing raw pixels without considering the significance of semantic\nfeatures. Therefore, we propose a novel prior semantic guided image fusion\nmethod based on the dual-modality strategy, improving the performance of IVF in\nITS. Specifically, to explore the independent significant semantic of each\nmodality, we first design two parallel semantic segmentation branches with a\nrefined feature adaptive-modulation (RFaM) mechanism. RFaM can perceive the\nfeatures that are semantically distinct enough in each semantic segmentation\nbranch. 
Then, two pilot experiments based on the two branches are conducted to\ncapture the significant prior semantic of two images, which then is applied to\nguide the fusion task in the integration of semantic segmentation branches and\nfusion branches. In addition, to aggregate both high-level semantics and\nimpressive visual effects, we further investigate the frequency response of the\nprior semantics, and propose a multi-level representation-adaptive fusion\n(MRaF) module to explicitly integrate the low-frequent prior semantic with the\nhigh-frequent details. Extensive experiments on two public datasets demonstrate\nthe superiority of our method over the state-of-the-art image fusion\napproaches, in terms of either the visual appeal or the high-level semantics.\n","authors":["Jing Li","Lu Bai","Bin Yang","Chang Li","Lingfei Ma","Lixin Cui","Edwin R. Hancock"],"pdf_url":"https://arxiv.org/pdf/2403.16227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09457v2","updated":"2024-03-24T16:37:04Z","published":"2023-10-14T00:32:11Z","title":"UCM-Net: A Lightweight and Efficient Solution for Skin Lesion\n Segmentation using MLP and CNN","summary":" Skin cancer is a significant public health problem, and computer-aided\ndiagnosis can help to prevent and treat it. A crucial step for computer-aided\ndiagnosis is accurately segmenting skin lesions in images, which allows for\nlesion detection, classification, and analysis. However, this task is\nchallenging due to the diverse characteristics of lesions, such as appearance,\nshape, size, color, texture, and location, as well as image quality issues like\nnoise, artifacts, and occlusions. Deep learning models have recently been\napplied to skin lesion segmentation, but they have high parameter counts and\ncomputational demands, making them unsuitable for mobile health applications.\nTo address this challenge, we propose UCM-Net, a novel, efficient, and\nlightweight solution that integrates Multi-Layer Perceptions (MLP) and\nConvolutional Neural Networks (CNN). Unlike conventional UNet architectures,\nour UCMNet-Block reduces parameter overhead and enhances UCM-Net's learning\ncapabilities, leading to robust segmentation performance. We validate UCM-Net's\ncompetitiveness through extensive experiments on PH2, isic2017 and isic2018\ndatasets. Remarkably, UCM-Net has less than 50KB parameters and less than 0.05\nGiga-Operations Per Second (GLOPs), setting a new possible standard for\nefficiency in skin lesion segmentation. The source code will be publicly\navailable.\n","authors":["Chunyu Yuan","Dongfang Zhao","Sos S. Agaian"],"pdf_url":"https://arxiv.org/pdf/2310.09457v2.pdf","comment":"17 pages, under review"},{"id":"http://arxiv.org/abs/2403.16224v1","updated":"2024-03-24T16:34:47Z","published":"2024-03-24T16:34:47Z","title":"Inverse Rendering of Glossy Objects via the Neural Plenoptic Function\n and Radiance Fields","summary":" Inverse rendering aims at recovering both geometry and materials of objects.\nIt provides a more compatible reconstruction for conventional rendering\nengines, compared with the neural radiance fields (NeRFs). On the other hand,\nexisting NeRF-based inverse rendering methods cannot handle glossy objects with\nlocal light interactions well, as they typically oversimplify the illumination\nas a 2D environmental map, which assumes infinite lights only. 
Observing the\nsuperiority of NeRFs in recovering radiance fields, we propose a novel 5D\nNeural Plenoptic Function (NeP) based on NeRFs and ray tracing, such that more\naccurate lighting-object interactions can be formulated via the rendering\nequation. We also design a material-aware cone sampling strategy to efficiently\nintegrate lights inside the BRDF lobes with the help of pre-filtered radiance\nfields. Our method has two stages: the geometry of the target object and the\npre-filtered environmental radiance fields are reconstructed in the first\nstage, and materials of the target object are estimated in the second stage\nwith the proposed NeP and material-aware cone sampling strategy. Extensive\nexperiments on the proposed real-world and synthetic datasets demonstrate that\nour method can reconstruct high-fidelity geometry/materials of challenging\nglossy objects with complex lighting interactions from nearby objects. Project\nwebpage: https://whyy.site/paper/nep\n","authors":["Haoyuan Wang","Wenbo Hu","Lei Zhu","Rynson W. H. Lau"],"pdf_url":"https://arxiv.org/pdf/2403.16224v1.pdf","comment":"CVPR 2024 paper. Project webpage https://whyy.site/paper/nep"},{"id":"http://arxiv.org/abs/2403.16221v1","updated":"2024-03-24T16:29:50Z","published":"2024-03-24T16:29:50Z","title":"Exemplar-Free Class Incremental Learning via Incremental Representation","summary":" Exemplar-Free Class Incremental Learning (efCIL) aims to continuously\nincorporate the knowledge from new classes while retaining previously learned\ninformation, without storing any old-class exemplars (i.e., samples). For this\npurpose, various efCIL methods have been proposed over the past few years,\ngenerally with elaborately constructed old pseudo-features, increasing the\ndifficulty of model development and interpretation. In contrast, we propose a\n\\textbf{simple Incremental Representation (IR) framework} for efCIL without\nconstructing old pseudo-features. IR utilizes dataset augmentation to cover a\nsuitable feature space and prevents the model from forgetting by using a single\nL2 space maintenance loss. We discard the transient classifier trained on each\none of the sequence tasks and instead replace it with a 1-near-neighbor\nclassifier for inference, ensuring the representation is incrementally updated\nduring CIL. Extensive experiments demonstrate that our proposed IR achieves\ncomparable performance while significantly preventing the model from forgetting\non CIFAR100, TinyImageNet, and ImageNetSubset datasets.\n","authors":["Libo Huang","Zhulin An","Yan Zeng","Chuanguang Yang","Xinqiang Yu","Yongjun Xu"],"pdf_url":"https://arxiv.org/pdf/2403.16221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16212v1","updated":"2024-03-24T16:11:27Z","published":"2024-03-24T16:11:27Z","title":"Leveraging Deep Learning and Xception Architecture for High-Accuracy MRI\n Classification in Alzheimer Diagnosis","summary":" Exploring the application of deep learning technologies in the field of\nmedical diagnostics, Magnetic Resonance Imaging (MRI) provides a unique\nperspective for observing and diagnosing complex neurodegenerative diseases\nsuch as Alzheimer Disease (AD). With advancements in deep learning,\nparticularly in Convolutional Neural Networks (CNNs) and the Xception network\narchitecture, we are now able to analyze and classify vast amounts of MRI data\nwith unprecedented accuracy. 
The progress of this technology not only enhances\nour understanding of brain structural changes but also opens up new avenues for\nmonitoring disease progression through non-invasive means and potentially\nallows for precise diagnosis in the early stages of the disease.\n This study aims to classify MRI images using deep learning models to identify\ndifferent stages of Alzheimer Disease through a series of innovative data\nprocessing and model construction steps. Our experimental results show that the\ndeep learning framework based on the Xception model achieved a 99.6% accuracy\nrate in the multi-class MRI image classification task, demonstrating its\npotential application value in assistive diagnosis. Future research will focus\non expanding the dataset, improving model interpretability, and clinical\nvalidation to further promote the application of deep learning technology in\nthe medical field, with the hope of bringing earlier diagnosis and more\npersonalized treatment plans to Alzheimer Disease patients.\n","authors":["Shaojie Li","Haichen Qu","Xinqi Dong","Bo Dang","Hengyi Zang","Yulu Gong"],"pdf_url":"https://arxiv.org/pdf/2403.16212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16210v1","updated":"2024-03-24T16:09:21Z","published":"2024-03-24T16:09:21Z","title":"Frankenstein: Generating Semantic-Compositional 3D Scenes in One\n Tri-Plane","summary":" We present Frankenstein, a diffusion-based framework that can generate\nsemantic-compositional 3D scenes in a single pass. Unlike existing methods that\noutput a single, unified 3D shape, Frankenstein simultaneously generates\nmultiple separated shapes, each corresponding to a semantically meaningful\npart. The 3D scene information is encoded in one single tri-plane tensor, from\nwhich multiple Singed Distance Function (SDF) fields can be decoded to\nrepresent the compositional shapes. During training, an auto-encoder compresses\ntri-planes into a latent space, and then the denoising diffusion process is\nemployed to approximate the distribution of the compositional scenes.\nFrankenstein demonstrates promising results in generating room interiors as\nwell as human avatars with automatically separated parts. The generated scenes\nfacilitate many downstream applications, such as part-wise re-texturing, object\nrearrangement in the room or avatar cloth re-targeting.\n","authors":["Han Yan","Yang Li","Zhennan Wu","Shenzhou Chen","Weixuan Sun","Taizhang Shang","Weizhe Liu","Tian Chen","Xiaqiang Dai","Chao Ma","Hongdong Li","Pan Ji"],"pdf_url":"https://arxiv.org/pdf/2403.16210v1.pdf","comment":"Video: https://youtu.be/lRn-HqyCrLI"},{"id":"http://arxiv.org/abs/2403.16209v1","updated":"2024-03-24T16:08:10Z","published":"2024-03-24T16:08:10Z","title":"Image Captioning in news report scenario","summary":" Image captioning strives to generate pertinent captions for specified images,\nsituating itself at the crossroads of Computer Vision (CV) and Natural Language\nProcessing (NLP). This endeavor is of paramount importance with far-reaching\napplications in recommendation systems, news outlets, social media, and beyond.\nParticularly within the realm of news reporting, captions are expected to\nencompass detailed information, such as the identities of celebrities captured\nin the images. However, much of the existing body of work primarily centers\naround understanding scenes and actions. 
In this paper, we explore the realm of\nimage captioning specifically tailored for celebrity photographs, illustrating\nits broad potential for enhancing news industry practices. This exploration\naims to augment automated news content generation, thereby facilitating a more\nnuanced dissemination of information. Our endeavor shows a broader horizon,\nenriching the narrative in news reporting through a more intuitive image\ncaptioning framework.\n","authors":["Tianrui Liu","Qi Cai","Changxin Xu","Zhanxin Zhou","Jize Xiong","Yuxin Qiao","Tsungwei Yang"],"pdf_url":"https://arxiv.org/pdf/2403.16209v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.16207v1","updated":"2024-03-24T16:03:27Z","published":"2024-03-24T16:03:27Z","title":"Skull-to-Face: Anatomy-Guided 3D Facial Reconstruction and Editing","summary":" Deducing the 3D face from a skull is an essential but challenging task in\nforensic science and archaeology. Existing methods for automated facial\nreconstruction yield inaccurate results, suffering from the non-determinative\nnature of the problem that a skull with a sparse set of tissue depth cannot\nfully determine the skinned face. Additionally, their texture-less results\nrequire further post-processing stages to achieve a photo-realistic appearance.\nThis paper proposes an end-to-end 3D face reconstruction and exploration tool,\nproviding textured 3D faces for reference. With the help of state-of-the-art\ntext-to-image diffusion models and image-based facial reconstruction\ntechniques, we generate an initial reference 3D face, whose biological profile\naligns with the given skull. We then adapt these initial faces to meet the\nstatistical expectations of extruded anatomical landmarks on the skull through\nan optimization process. The joint statistical distribution of tissue depths is\nlearned on a small set of anatomical landmarks on the skull. To support further\nadjustment, we propose an efficient face adaptation tool to assist users in\ntuning tissue depths, either globally or at local regions, while observing\nplausible visual feedback. Experiments conducted on a real skull-face dataset\ndemonstrated the effectiveness of our proposed pipeline in terms of\nreconstruction accuracy, diversity, and stability.\n","authors":["Yongqing Liang","Congyi Zhang","Junli Zhao","Wenping Wang","Xin Li"],"pdf_url":"https://arxiv.org/pdf/2403.16207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16205v1","updated":"2024-03-24T15:58:48Z","published":"2024-03-24T15:58:48Z","title":"Blur2Blur: Blur Conversion for Unsupervised Image Deblurring on Unknown\n Domains","summary":" This paper presents an innovative framework designed to train an image\ndeblurring algorithm tailored to a specific camera device. This algorithm works\nby transforming a blurry input image, which is challenging to deblur, into\nanother blurry image that is more amenable to deblurring. The transformation\nprocess, from one blurry state to another, leverages unpaired data consisting\nof sharp and blurry images captured by the target camera device. Learning this\nblur-to-blur transformation is inherently simpler than direct blur-to-sharp\nconversion, as it primarily involves modifying blur patterns rather than the\nintricate task of reconstructing fine image details. The efficacy of the\nproposed approach has been demonstrated through comprehensive experiments on\nvarious benchmarks, where it significantly outperforms state-of-the-art methods\nboth quantitatively and qualitatively. 
Our code and data are available at\nhttps://zero1778.github.io/blur2blur/\n","authors":["Bang-Dang Pham","Phong Tran","Anh Tran","Cuong Pham","Rang Nguyen","Minh Hoai"],"pdf_url":"https://arxiv.org/pdf/2403.16205v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16202v1","updated":"2024-03-24T15:51:17Z","published":"2024-03-24T15:51:17Z","title":"FH-SSTNet: Forehead Creases based User Verification using Spatio-Spatial\n Temporal Network","summary":" Biometric authentication, which utilizes contactless features, such as\nforehead patterns, has become increasingly important for identity verification\nand access management. The proposed method is based on learning a 3D\nspatio-spatial temporal convolution to create detailed pictures of forehead\npatterns. We introduce a new CNN model called the Forehead Spatio-Spatial\nTemporal Network (FH-SSTNet), which utilizes a 3D CNN architecture with triplet\nloss to capture distinguishing features. We enhance the model's discrimination\ncapability using Arcloss in the network's head. Experimentation on the Forehead\nCreases version 1 (FH-V1) dataset, containing 247 unique subjects, demonstrates\nthe superior performance of FH-SSTNet compared to existing methods and\npre-trained CNNs like ResNet50, especially for forehead-based user\nverification. The results demonstrate the superior performance of FH-SSTNet for\nforehead-based user verification, confirming its effectiveness in identity\nauthentication.\n","authors":["Geetanjali Sharma","Gaurav Jaswal","Aditya Nigam","Raghavendra Ramachandra"],"pdf_url":"https://arxiv.org/pdf/2403.16202v1.pdf","comment":"6 pages, 5 Figure, IWBF conference"},{"id":"http://arxiv.org/abs/2403.16201v1","updated":"2024-03-24T15:48:29Z","published":"2024-03-24T15:48:29Z","title":"From Discrete to Continuous: Deep Fair Clustering With Transferable\n Representations","summary":" We consider the problem of deep fair clustering, which partitions data into\nclusters via the representations extracted by deep neural networks while hiding\nsensitive data attributes. To achieve fairness, existing methods present a\nvariety of fairness-related objective functions based on the group fairness\ncriterion. However, these works typically assume that the sensitive attributes\nare discrete and do not work for continuous sensitive variables, such as the\nproportion of the female population in an area. Besides, the potential of the\nrepresentations learned from clustering tasks to improve performance on other\ntasks is ignored by existing works. In light of these limitations, we propose a\nflexible deep fair clustering method that can handle discrete and continuous\nsensitive attributes simultaneously. Specifically, we design an information\nbottleneck style objective function to learn fair and clustering-friendly\nrepresentations. Furthermore, we explore for the first time the transferability\nof the extracted representations to other downstream tasks. Unlike existing\nworks, we impose fairness at the representation level, which could guarantee\nfairness for the transferred task regardless of clustering results. 
To verify\nthe effectiveness of the proposed method, we perform extensive experiments on\ndatasets with discrete and continuous sensitive attributes, demonstrating the\nadvantage of our method in comparison with state-of-the-art methods.\n","authors":["Xiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.16201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16198v1","updated":"2024-03-24T15:39:52Z","published":"2024-03-24T15:39:52Z","title":"Diffusion Model is a Good Pose Estimator from 3D RF-Vision","summary":" Human pose estimation (HPE) from Radio Frequency vision (RF-vision) performs\nhuman sensing using RF signals that penetrate obstacles without revealing\nprivacy (e.g., facial information). Recently, mmWave radar has emerged as a\npromising RF-vision sensor, providing radar point clouds by processing RF\nsignals. However, the mmWave radar has a limited resolution with severe noise,\nleading to inaccurate and inconsistent human pose estimation. This work\nproposes mmDiff, a novel diffusion-based pose estimator tailored for noisy\nradar data. Our approach aims to provide reliable guidance as conditions to\ndiffusion models. Two key challenges are addressed by mmDiff: (1)\nmiss-detection of parts of human bodies, which is addressed by a module that\nisolates feature extraction from different body parts, and (2) signal\ninconsistency due to environmental interference, which is tackled by\nincorporating prior knowledge of body structure and motion. Several modules are\ndesigned to achieve these goals, whose features work as the conditions for the\nsubsequent diffusion model, eliminating the miss-detection and instability of\nHPE based on RF-vision. Extensive experiments demonstrate that mmDiff\noutperforms existing methods significantly, achieving state-of-the-art\nperformances on public datasets.\n","authors":["Junqiao Fan","Jianfei Yang","Yuecong Xu","Lihua Xie"],"pdf_url":"https://arxiv.org/pdf/2403.16198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13356v2","updated":"2024-03-24T15:30:46Z","published":"2023-08-25T13:05:06Z","title":"CEIMVEN: An Approach of Cutting Edge Implementation of Modified Versions\n of EfficientNet (V1-V2) Architecture for Breast Cancer Detection and\n Classification from Ultrasound Images","summary":" Undoubtedly breast cancer identifies itself as one of the most widespread and\nterrifying cancers across the globe. Millions of women are getting affected\neach year from it. Breast cancer remains the major one for being the reason of\nlargest number of demise of women. In the recent time of research, Medical\nImage Computing and Processing has been playing a significant role for\ndetecting and classifying breast cancers from ultrasound images and mammograms,\nalong with the celestial touch of deep neural networks. In this research, we\nfocused mostly on our rigorous implementations and iterative result analysis of\ndifferent cutting-edge modified versions of EfficientNet architectures namely\nEfficientNet-V1 (b0-b7) and EfficientNet-V2 (b0-b3) with ultrasound image,\nnamed as CEIMVEN. We utilized transfer learning approach here for using the\npre-trained models of EfficientNet versions. We activated the hyper-parameter\ntuning procedures, added fully connected layers, discarded the unprecedented\noutliers and recorded the accuracy results from our custom modified\nEfficientNet architectures. 
Our deep learning model training approach was\nrelated to both identifying the cancer affected areas with region of interest\n(ROI) techniques and multiple classifications (benign, malignant and normal).\nThe approximate testing accuracies we got from the modified versions of\nEfficientNet-V1 (b0- 99.15%, b1- 98.58%, b2- 98.43%, b3- 98.01%, b4- 98.86%,\nb5- 97.72%, b6- 97.72%, b7- 98.72%) and EfficientNet-V2 (b0- 99.29%, b1-\n99.01%, b2- 98.72%, b3- 99.43%) are showing very bright future and strong\npotentials of deep learning approach for the successful detection and\nclassification of breast cancers from the ultrasound images at a very early\nstage. The code for this research is available here:\nhttps://github.com/ac005sheekar/CEIMVEN-Cutting-Edge-Implementation-of-Modified-EfficientNet-V1-V2-for-BreastCancer-Detection.\n","authors":["Sheekar Banerjee","Md. Kamrul Hasan Monir"],"pdf_url":"https://arxiv.org/pdf/2308.13356v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17417v2","updated":"2024-03-24T15:26:11Z","published":"2024-02-27T11:17:46Z","title":"CARZero: Cross-Attention Alignment for Radiology Zero-Shot\n Classification","summary":" The advancement of Zero-Shot Learning in the medical domain has been driven\nforward by using pre-trained models on large-scale image-text pairs, focusing\non image-text alignment. However, existing methods primarily rely on cosine\nsimilarity for alignment, which may not fully capture the complex relationship\nbetween medical images and reports. To address this gap, we introduce a novel\napproach called Cross-Attention Alignment for Radiology Zero-Shot\nClassification (CARZero). Our approach innovatively leverages cross-attention\nmechanisms to process image and report features, creating a Similarity\nRepresentation that more accurately reflects the intricate relationships in\nmedical semantics. This representation is then linearly projected to form an\nimage-text similarity matrix for cross-modality alignment. Additionally,\nrecognizing the pivotal role of prompt selection in zero-shot learning, CARZero\nincorporates a Large Language Model-based prompt alignment strategy. This\nstrategy standardizes diverse diagnostic expressions into a unified format for\nboth training and inference phases, overcoming the challenges of manual prompt\ndesign. Our approach is simple yet effective, demonstrating state-of-the-art\nperformance in zero-shot classification on five official chest radiograph\ndiagnostic test sets, including remarkable results on datasets with long-tail\ndistributions of rare diseases. This achievement is attributed to our new\nimage-text alignment strategy, which effectively addresses the complex\nrelationship between medical images and reports. Code and models are available\nat https://github.com/laihaoran/CARZero.\n","authors":["Haoran Lai","Qingsong Yao","Zihang Jiang","Rongsheng Wang","Zhiyang He","Xiaodong Tao","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.17417v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16194v1","updated":"2024-03-24T15:24:04Z","published":"2024-03-24T15:24:04Z","title":"Pose-Guided Self-Training with Two-Stage Clustering for Unsupervised\n Landmark Discovery","summary":" Unsupervised landmarks discovery (ULD) for an object category is a\nchallenging computer vision problem. In pursuit of developing a robust ULD\nframework, we explore the potential of a recent paradigm of self-supervised\nlearning algorithms, known as diffusion models. 
Some recent works have shown\nthat these models implicitly contain important correspondence cues. Towards\nharnessing the potential of diffusion models for the ULD task, we make the\nfollowing core contributions. First, we propose a ZeroShot ULD baseline based\non simple clustering of random pixel locations with nearest neighbour matching.\nIt delivers better results than existing ULD methods. Second, motivated by the\nZeroShot performance, we develop a ULD algorithm based on diffusion features\nusing self-training and clustering which also outperforms prior methods by\nnotable margins. Third, we introduce a new proxy task based on generating\nlatent pose codes and also propose a two-stage clustering mechanism to\nfacilitate effective pseudo-labeling, resulting in a significant performance\nimprovement. Overall, our approach consistently outperforms state-of-the-art\nmethods on four challenging benchmarks AFLW, MAFL, CatHeads and LS3D by\nsignificant margins.\n","authors":["Siddharth Tourani","Ahmed Alwheibi","Arif Mahmood","Muhammad Haris Khan"],"pdf_url":"https://arxiv.org/pdf/2403.16194v1.pdf","comment":"Accepted in CVPR 2024"},{"id":"http://arxiv.org/abs/2112.06502v2","updated":"2024-03-24T15:12:20Z","published":"2021-12-13T09:24:45Z","title":"DGL-GAN: Discriminator Guided Learning for GAN Compression","summary":" Generative Adversarial Networks (GANs) with high computation costs, e.g.,\nBigGAN and StyleGAN2, have achieved remarkable results in synthesizing\nhigh-resolution images from random noise. Reducing the computation cost of GANs\nwhile keeping generating photo-realistic images is a challenging field. In this\nwork, we propose a novel yet simple {\\bf D}iscriminator {\\bf G}uided {\\bf\nL}earning approach for compressing vanilla {\\bf GAN}, dubbed {\\bf DGL-GAN}.\nMotivated by the phenomenon that the teacher discriminator may contain some\nmeaningful information about both real images and fake images, we merely\ntransfer the knowledge from the teacher discriminator via the adversarial\ninteraction between the teacher discriminator and the student generator. We\napply DGL-GAN to compress the two most representative large-scale vanilla GANs,\ni.e., StyleGAN2 and BigGAN. Experiments show that DGL-GAN achieves\nstate-of-the-art (SOTA) results on both StyleGAN2 and BigGAN. Moreover, DGL-GAN\nis also effective in boosting the performance of original uncompressed GANs.\nOriginal uncompressed StyleGAN2 boosted with DGL-GAN achieves FID 2.65 on FFHQ,\nwhich achieves a new state-of-the-art performance. Code and models are\navailable at \\url{https://github.com/yuesongtian/DGL-GAN}\n","authors":["Yuesong Tian","Li Shen","Xiang Tian","Dacheng Tao","Zhifeng Li","Wei Liu","Yaowu Chen"],"pdf_url":"https://arxiv.org/pdf/2112.06502v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14729v3","updated":"2024-03-24T15:11:38Z","published":"2023-10-23T09:05:18Z","title":"MAS: Multi-view Ancestral Sampling for 3D motion generation using 2D\n diffusion","summary":" We introduce Multi-view Ancestral Sampling (MAS), a method for 3D motion\ngeneration, using 2D diffusion models that were trained on motions obtained\nfrom in-the-wild videos. As such, MAS opens opportunities to exciting and\ndiverse fields of motion previously under-explored as 3D data is scarce and\nhard to collect. MAS works by simultaneously denoising multiple 2D motion\nsequences representing different views of the same 3D motion. 
It ensures\nconsistency across all views at each diffusion step by combining the individual\ngenerations into a unified 3D sequence, and projecting it back to the original\nviews. We demonstrate MAS on 2D pose data acquired from videos depicting\nprofessional basketball maneuvers, rhythmic gymnastic performances featuring a\nball apparatus, and horse races. In each of these domains, 3D motion capture is\narduous, and yet, MAS generates diverse and realistic 3D sequences. Unlike the\nScore Distillation approach, which optimizes each sample by repeatedly applying\nsmall fixes, our method uses a sampling process that was constructed for the\ndiffusion framework. As we demonstrate, MAS avoids common issues such as\nout-of-domain sampling and mode-collapse. https://guytevet.github.io/mas-page/\n","authors":["Roy Kapon","Guy Tevet","Daniel Cohen-Or","Amit H. Bermano"],"pdf_url":"https://arxiv.org/pdf/2310.14729v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16188v1","updated":"2024-03-24T15:10:22Z","published":"2024-03-24T15:10:22Z","title":"Cross-domain Multi-modal Few-shot Object Detection via Rich Text","summary":" Cross-modal feature extraction and integration have led to steady performance\nimprovements in few-shot learning tasks due to generating richer features.\nHowever, existing multi-modal object detection (MM-OD) methods degrade when\nfacing significant domain-shift and are sample insufficient. We hypothesize\nthat rich text information could more effectively help the model to build a\nknowledge relationship between the vision instance and its language description\nand can help mitigate domain shift. Specifically, we study the Cross-Domain\nfew-shot generalization of MM-OD (CDMM-FSOD) and propose a meta-learning based\nmulti-modal few-shot object detection method that utilizes rich text semantic\ninformation as an auxiliary modality to achieve domain adaptation in the\ncontext of FSOD. Our proposed network contains (i) a multi-modal feature\naggregation module that aligns the vision and language support feature\nembeddings and (ii) a rich text semantic rectify module that utilizes\nbidirectional text feature generation to reinforce multi-modal feature\nalignment and thus to enhance the model's language understanding capability. We\nevaluate our model on common standard cross-domain object detection datasets\nand demonstrate that our approach considerably outperforms existing FSOD\nmethods.\n","authors":["Zeyu Shangguan","Daniel Seita","Mohammad Rostami"],"pdf_url":"https://arxiv.org/pdf/2403.16188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16184v1","updated":"2024-03-24T15:02:24Z","published":"2024-03-24T15:02:24Z","title":"Improving Scene Graph Generation with Relation Words' Debiasing in\n Vision-Language Models","summary":" Scene Graph Generation (SGG) provides basic language representation of visual\nscenes, requiring models to grasp complex and diverse semantics between various\nobjects. However, this complexity and diversity in SGG also leads to\nunderrepresentation, where part of test triplets are rare or even unseen during\ntraining, resulting in imprecise predictions. To tackle this, we propose using\nthe SGG models with pretrained vision-language models (VLMs) to enhance\nrepresentation. However, due to the gap between the pretraining and SGG,\ndirectly ensembling the pretrained VLMs leads to severe biases across relation\nwords. 
Thus, we introduce LM Estimation to approximate the word distribution\nunderlying the pretraining language sets, and then use the distribution for\ndebiasing. After that, we ensemble VLMs with SGG models to enhance\nrepresentation. Considering that each model may represent different\nsamples better, we use a certainty-aware indicator to score each sample and\ndynamically adjust the ensemble weights. Our method effectively addresses the\nword biases, enhances SGG's representation, and achieves remarkable performance\nenhancements. It is training-free and integrates well with existing SGG models.\n","authors":["Yuxuan Wang","Xiaoyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2403.16184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16182v1","updated":"2024-03-24T15:00:44Z","published":"2024-03-24T15:00:44Z","title":"EgoExoLearn: A Dataset for Bridging Asynchronous Ego- and Exo-centric\n View of Procedural Activities in Real World","summary":" Being able to map the activities of others into one's own point of view is\none fundamental human skill even from a very early age. Taking a step toward\nunderstanding this human ability, we introduce EgoExoLearn, a large-scale\ndataset that emulates the human demonstration following process, in which\nindividuals record egocentric videos as they execute tasks guided by\ndemonstration videos. Focusing on the potential applications in daily\nassistance and professional support, EgoExoLearn contains egocentric and\ndemonstration video data spanning 120 hours captured in daily life scenarios\nand specialized laboratories. Along with the videos we record high-quality gaze\ndata and provide detailed multimodal annotations, formulating a playground for\nmodeling the human ability to bridge asynchronous procedural actions from\ndifferent viewpoints. To this end, we present benchmarks such as cross-view\nassociation, cross-view action planning, and cross-view referenced skill\nassessment, along with detailed analysis. We expect EgoExoLearn can serve as an\nimportant resource for bridging the actions across views, thus paving the way\nfor creating AI agents capable of seamlessly learning by observing humans in\nthe real world. Code and data can be found at:\nhttps://github.com/OpenGVLab/EgoExoLearn\n","authors":["Yifei Huang","Guo Chen","Jilan Xu","Mingfang Zhang","Lijin Yang","Baoqi Pei","Hongjie Zhang","Lu Dong","Yali Wang","Limin Wang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2403.16182v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.01696v2","updated":"2024-03-24T14:50:42Z","published":"2023-12-04T07:35:02Z","title":"BEVNeXt: Reviving Dense BEV Frameworks for 3D Object Detection","summary":" Recently, the rise of query-based Transformer decoders is reshaping\ncamera-based 3D object detection. These query-based decoders are surpassing the\ntraditional dense BEV (Bird's Eye View)-based methods. However, we argue that\ndense BEV frameworks remain important due to their outstanding abilities in\ndepth estimation and object localization, depicting 3D scenes accurately and\ncomprehensively. This paper aims to address the drawbacks of the existing dense\nBEV-based 3D object detectors by introducing our proposed enhanced components,\nincluding a CRF-modulated depth estimation module enforcing object-level\nconsistencies, a long-term temporal aggregation module with extended receptive\nfields, and a two-stage object decoder combining perspective techniques with\nCRF-modulated depth embedding. 
These enhancements lead to a \"modernized\" dense\nBEV framework dubbed BEVNeXt. On the nuScenes benchmark, BEVNeXt outperforms\nboth BEV-based and query-based frameworks under various settings, achieving a\nstate-of-the-art result of 64.2 NDS on the nuScenes test set. Code will be\navailable at \\url{https://github.com/woxihuanjiangguo/BEVNeXt}.\n","authors":["Zhenxin Li","Shiyi Lan","Jose M. Alvarez","Zuxuan Wu"],"pdf_url":"https://arxiv.org/pdf/2312.01696v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16175v1","updated":"2024-03-24T14:35:06Z","published":"2024-03-24T14:35:06Z","title":"Enhancing MRI-Based Classification of Alzheimer's Disease with\n Explainable 3D Hybrid Compact Convolutional Transformers","summary":" Alzheimer's disease (AD), characterized by progressive cognitive decline and\nmemory loss, presents a formidable global health challenge, underscoring the\ncritical importance of early and precise diagnosis for timely interventions and\nenhanced patient outcomes. While MRI scans provide valuable insights into brain\nstructures, traditional analysis methods often struggle to discern intricate 3D\npatterns crucial for AD identification. Addressing this challenge, we introduce\nan alternative end-to-end deep learning model, the 3D Hybrid Compact\nConvolutional Transformers 3D (HCCT). By synergistically combining\nconvolutional neural networks (CNNs) and vision transformers (ViTs), the 3D\nHCCT adeptly captures both local features and long-range relationships within\n3D MRI scans. Extensive evaluations on prominent AD benchmark dataset, ADNI,\ndemonstrate the 3D HCCT's superior performance, surpassing state of the art CNN\nand transformer-based methods in classification accuracy. Its robust\ngeneralization capability and interpretability marks a significant stride in AD\nclassification from 3D MRI scans, promising more accurate and reliable\ndiagnoses for improved patient care and superior clinical outcomes.\n","authors":["Arindam Majee","Avisek Gupta","Sourav Raha","Swagatam Das"],"pdf_url":"https://arxiv.org/pdf/2403.16175v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.16172v1","updated":"2024-03-24T14:29:41Z","published":"2024-03-24T14:29:41Z","title":"Fusion of Minutia Cylinder Codes and Minutia Patch Embeddings for Latent\n Fingerprint Recognition","summary":" Latent fingerprints are one of the most widely used forensic evidence by law\nenforcement agencies. However, latent recognition performance is far from the\nexemplary performance of sensor fingerprint recognition due to deformations and\nartifacts within these images. In this study, we propose a fusion based local\nmatching approach towards latent fingerprint recognition. Recent latent\nrecognition studies typically relied on local descriptor generation methods, in\nwhich either handcrafted minutiae features or deep neural network features are\nextracted around a minutia of interest, in the latent recognition process.\nProposed approach would integrate these handcrafted features with a recently\nproposed deep neural network embedding features in a multi-stage fusion\napproach to significantly improve latent recognition results. 
The effectiveness of\nthe proposed approach has been shown on several public and private data sets.\nAs demonstrated in our experimental results, the proposed method improves rank-1\nidentification accuracy considerably for real-world datasets when compared\nto either the single usage of these features or existing state-of-the-art\nmethods in the literature.\n","authors":["Yusuf Artan","Bensu Alkan Semiz"],"pdf_url":"https://arxiv.org/pdf/2403.16172v1.pdf","comment":"9 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2403.16169v1","updated":"2024-03-24T14:24:13Z","published":"2024-03-24T14:24:13Z","title":"Gaze-guided Hand-Object Interaction Synthesis: Benchmark and Method","summary":" Gaze plays a crucial role in revealing human attention and intention,\nshedding light on the cognitive processes behind human actions. The integration\nof gaze guidance with the dynamics of hand-object interactions boosts the\naccuracy of human motion prediction. However, the lack of datasets that capture\nthe intricate relationship and consistency among gaze, hand, and object\nmovements remains a substantial hurdle. In this paper, we introduce the first\nGaze-guided Hand-Object Interaction dataset, GazeHOI, and present a novel task\nfor synthesizing gaze-guided hand-object interactions. Our dataset, GazeHOI,\nfeatures simultaneous 3D modeling of gaze, hand, and object interactions,\ncomprising 479 sequences with an average duration of 19.1 seconds, 812\nsub-sequences, and 33 objects of various sizes. We propose a hierarchical\nframework centered on a gaze-guided hand-object interaction diffusion model,\nnamed GHO-Diffusion. In the pre-diffusion phase, we separate gaze conditions\ninto spatial-temporal features and goal pose conditions at different levels of\ninformation granularity. During the diffusion phase, two gaze-conditioned\ndiffusion models are stacked to simplify the complex synthesis of hand-object\nmotions. Here, the object motion diffusion model generates sequences of object\nmotions based on gaze conditions, while the hand motion diffusion model\nproduces hand motions based on the generated object motion. To improve\nfine-grained goal pose alignment, we introduce a Spherical Gaussian constraint\nto guide the denoising step. In the subsequent post-diffusion phase, we\noptimize the generated hand motions using contact consistency. Our extensive\nexperiments highlight the uniqueness of our dataset and the effectiveness of\nour approach.\n","authors":["Jie Tian","Lingxiao Yang","Ran Ji","Yuexin Ma","Lan Xu","Jingyi Yu","Ye Shi","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08924v2","updated":"2024-03-24T14:23:59Z","published":"2023-12-14T13:31:01Z","title":"Training-free Zero-shot Composed Image Retrieval with Local Concept\n Reranking","summary":" Composed image retrieval attempts to retrieve an image of interest from\ngallery images through a composed query of a reference image and its\ncorresponding modified text. It has recently attracted attention due to the\ncollaboration of information-rich images and concise language to precisely\nexpress the requirements of target images. Most current composed image\nretrieval methods follow a supervised learning approach to training on a costly\ntriplet dataset composed of a reference image, modified text, and a\ncorresponding target image. 
To avoid difficult-to-obtain labeled triplet\ntraining data, zero-shot composed image retrieval (ZS-CIR) has been introduced,\nwhich aims to retrieve the target image by learning from image-text pairs\n(self-supervised triplets), without the need for human-labeled triplets.\nHowever, this self-supervised triplet learning approach is computationally less\neffective and less understandable as it assumes the interaction between image\nand text is conducted with implicit query embedding without explicit semantical\ninterpretation. In this work, we present a new training-free zero-shot composed\nimage retrieval method which translates the query into explicit\nhuman-understandable text. This helps improve model learning efficiency to\nenhance the generalization capacity of foundation models. Further, we introduce\na Local Concept Re-ranking (LCR) mechanism to focus on discriminative local\ninformation extracted from the modified instructions. Extensive experiments on\nfour ZS-CIR benchmarks show that our method achieves comparable performance to\nthat of the state-of-the-art triplet-training-based methods, but significantly\noutperforms other training-free methods on the open domain datasets (CIRR,\nCIRCO and COCO), as well as the fashion domain dataset (FashionIQ).\n","authors":["Shitong Sun","Fanghua Ye","Shaogang Gong"],"pdf_url":"https://arxiv.org/pdf/2312.08924v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2403.16167v1","updated":"2024-03-24T14:21:06Z","published":"2024-03-24T14:21:06Z","title":"Exploiting Semantic Reconstruction to Mitigate Hallucinations in\n Vision-Language Models","summary":" Hallucinations in vision-language models pose a significant challenge to\ntheir reliability, particularly in the generation of long captions. Current\nmethods fall short of accurately identifying and mitigating these\nhallucinations. To address this issue, we introduce ESREAL, a novel\nunsupervised learning framework designed to suppress the generation of\nhallucinations through accurate localization and penalization of hallucinated\ntokens. Initially, ESREAL creates a reconstructed image based on the generated\ncaption and aligns its corresponding regions with those of the original image.\nThis semantic reconstruction aids in identifying both the presence and type of\ntoken-level hallucinations within the generated caption. Subsequently, ESREAL\ncomputes token-level hallucination scores by assessing the semantic similarity\nof aligned regions based on the type of hallucination. Finally, ESREAL employs\na proximal policy optimization algorithm, where it selectively penalizes\nhallucinated tokens according to their token-level hallucination scores. Our\nframework notably reduces hallucinations in LLaVA, InstructBLIP, and mPLUG-Owl2\nby 32.81%, 27.08%, and 7.46% on the CHAIR metric. 
This improvement is achieved\nsolely through signals derived from the image itself, without the need for any\nimage-text pairs.\n","authors":["Minchan Kim","Minyeong Kim","Junik Bae","Suhwan Choi","Sungkyung Kim","Buru Chang"],"pdf_url":"https://arxiv.org/pdf/2403.16167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16161v1","updated":"2024-03-24T14:02:25Z","published":"2024-03-24T14:02:25Z","title":"Towards Online Real-Time Memory-based Video Inpainting Transformers","summary":" Video inpainting tasks have seen significant improvements in recent years\nwith the rise of deep neural networks and, in particular, vision transformers.\nAlthough these models show promising reconstruction quality and temporal\nconsistency, they are still unsuitable for live videos, one of the last steps\nto make them completely convincing and usable. The main limitations are that\nthese state-of-the-art models inpaint using the whole video (offline\nprocessing) and show an insufficient frame rate. In our approach, we propose a\nframework to adapt existing inpainting transformers to these constraints by\nmemorizing and refining redundant computations while maintaining a decent\ninpainting quality. Using this framework with some of the most recent\ninpainting models, we show great online results with a consistent throughput\nabove 20 frames per second. The code and pretrained models will be made\navailable upon acceptance.\n","authors":["Guillaume Thiry","Hao Tang","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2403.16161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04784v2","updated":"2024-03-24T13:56:31Z","published":"2023-12-08T01:53:06Z","title":"Reality's Canvas, Language's Brush: Crafting 3D Avatars from Monocular\n Video","summary":" Recent advancements in 3D avatar generation excel with multi-view supervision\nfor photorealistic models. However, monocular counterparts lag in quality\ndespite broader applicability. We propose ReCaLaB to close this gap. ReCaLaB is\na fully-differentiable pipeline that learns high-fidelity 3D human avatars from\njust a single RGB video. A pose-conditioned deformable NeRF is optimized to\nvolumetrically represent a human subject in canonical T-pose. The canonical\nrepresentation is then leveraged to efficiently associate neural textures using\n2D-3D correspondences. This enables the separation of diffused color generation\nand lighting correction branches that jointly compose an RGB prediction. The\ndesign allows to control intermediate results for human pose, body shape,\ntexture, and lighting with text prompts. An image-conditioned diffusion model\nthereby helps to animate appearance and pose of the 3D avatar to create video\nsequences with previously unseen human motion. Extensive experiments show that\nReCaLaB outperforms previous monocular approaches in terms of image quality for\nimage synthesis tasks. Moreover, natural language offers an intuitive user\ninterface for creative manipulation of 3D human avatars.\n","authors":["Yuchen Rao","Eduardo Perez Pellitero","Benjamin Busam","Yiren Zhou","Jifei Song"],"pdf_url":"https://arxiv.org/pdf/2312.04784v2.pdf","comment":"Video link: https://youtu.be/Oz83z1es2J4"},{"id":"http://arxiv.org/abs/2403.13352v2","updated":"2024-03-24T13:45:42Z","published":"2024-03-20T07:31:07Z","title":"AGFSync: Leveraging AI-Generated Feedback for Preference Optimization in\n Text-to-Image Generation","summary":" Text-to-Image (T2I) diffusion models have achieved remarkable success in\nimage generation. 
Despite their progress, challenges remain in both\nprompt-following ability, image quality and lack of high-quality datasets,\nwhich are essential for refining these models. As acquiring labeled data is\ncostly, we introduce AGFSync, a framework that enhances T2I diffusion models\nthrough Direct Preference Optimization (DPO) in a fully AI-driven approach.\nAGFSync utilizes Vision-Language Models (VLM) to assess image quality across\nstyle, coherence, and aesthetics, generating feedback data within an AI-driven\nloop. By applying AGFSync to leading T2I models such as SD v1.4, v1.5, and\nSDXL, our extensive experiments on the TIFA dataset demonstrate notable\nimprovements in VQA scores, aesthetic evaluations, and performance on the HPSv2\nbenchmark, consistently outperforming the base models. AGFSync's method of\nrefining T2I diffusion models paves the way for scalable alignment techniques.\n","authors":["Jingkun An","Yinghao Zhu","Zongjian Li","Haoran Feng","Bohua Chen","Yemin Shi","Chengwei Pan"],"pdf_url":"https://arxiv.org/pdf/2403.13352v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16146v1","updated":"2024-03-24T13:36:23Z","published":"2024-03-24T13:36:23Z","title":"Realtime Robust Shape Estimation of Deformable Linear Object","summary":" Realtime shape estimation of continuum objects and manipulators is essential\nfor developing accurate planning and control paradigms. The existing methods\nthat create dense point clouds from camera images, and/or use distinguishable\nmarkers on a deformable body have limitations in realtime tracking of large\ncontinuum objects/manipulators. The physical occlusion of markers can often\ncompromise accurate shape estimation. We propose a robust method to estimate\nthe shape of linear deformable objects in realtime using scattered and\nunordered key points. By utilizing a robust probability-based labeling\nalgorithm, our approach identifies the true order of the detected key points\nand then reconstructs the shape using piecewise spline interpolation. The\napproach only relies on knowing the number of the key points and the interval\nbetween two neighboring points. We demonstrate the robustness of the method\nwhen key points are partially occluded. The proposed method is also integrated\ninto a simulation in Unity for tracking the shape of a cable with a length of\n1m and a radius of 5mm. The simulation results show that our proposed approach\nachieves an average length error of 1.07% over the continuum's centerline and\nan average cross-section error of 2.11mm. The real-world experiments of\ntracking and estimating a heavy-load cable prove that the proposed approach is\nrobust under occlusion and complex entanglement scenarios.\n","authors":["Jiaming Zhang","Zhaomeng Zhang","Yihao Liu","Yaqian Chen","Amir Kheradmand","Mehran Armand"],"pdf_url":"https://arxiv.org/pdf/2403.16146v1.pdf","comment":"This paper has been accepted to IEEE ICRA 2024 as a contributed paper"},{"id":"http://arxiv.org/abs/2403.16143v1","updated":"2024-03-24T13:31:31Z","published":"2024-03-24T13:31:31Z","title":"CFAT: Unleashing TriangularWindows for Image Super-resolution","summary":" Transformer-based models have revolutionized the field of image\nsuper-resolution (SR) by harnessing their inherent ability to capture complex\ncontextual features. The overlapping rectangular shifted window technique used\nin transformer architecture nowadays is a common practice in super-resolution\nmodels to improve the quality and robustness of image upscaling. 
However, it\nsuffers from distortion at the boundaries and has limited unique shifting\nmodes. To overcome these weaknesses, we propose a non-overlapping triangular\nwindow technique that synchronously works with the rectangular one to mitigate\nboundary-level distortion and allows the model to access more unique shifting\nmodes. In this paper, we propose a Composite Fusion Attention Transformer\n(CFAT) that incorporates triangular-rectangular window-based local attention\nwith a channel-based global attention technique in image super-resolution. As a\nresult, CFAT enables attention mechanisms to be activated on more image pixels\nand captures long-range, multi-scale features to improve SR performance. The\nextensive experimental results and ablation study demonstrate the effectiveness\nof CFAT in the SR domain. Our proposed model shows a significant 0.7 dB\nperformance improvement over other state-of-the-art SR architectures.\n","authors":["Abhisek Ray","Gaurav Kumar","Maheshkumar H. Kolekar"],"pdf_url":"https://arxiv.org/pdf/2403.16143v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16141v1","updated":"2024-03-24T13:27:49Z","published":"2024-03-24T13:27:49Z","title":"Entity-NeRF: Detecting and Removing Moving Entities in Urban Scenes","summary":" Recent advancements in the study of Neural Radiance Fields (NeRF) for dynamic\nscenes often involve explicit modeling of scene dynamics. However, this\napproach faces challenges in modeling scene dynamics in urban environments,\nwhere moving objects of various categories and scales are present. In such\nsettings, it becomes crucial to effectively eliminate moving objects to\naccurately reconstruct static backgrounds. Our research introduces an\ninnovative method, termed here as Entity-NeRF, which combines the strengths of\nknowledge-based and statistical strategies. This approach utilizes entity-wise\nstatistics, leveraging entity segmentation and stationary entity classification\nthrough thing/stuff segmentation. To assess our methodology, we created an\nurban scene dataset masked with moving objects. Our comprehensive experiments\ndemonstrate that Entity-NeRF notably outperforms existing techniques in\nremoving moving objects and reconstructing static urban backgrounds, both\nquantitatively and qualitatively.\n","authors":["Takashi Otonari","Satoshi Ikehata","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2403.16141v1.pdf","comment":"Accepted by IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR 2024), Project website:\n https://otonari726.github.io/entitynerf/"},{"id":"http://arxiv.org/abs/2403.16131v1","updated":"2024-03-24T13:01:57Z","published":"2024-03-24T13:01:57Z","title":"Salience DETR: Enhancing Detection Transformer with Hierarchical\n Salience Filtering Refinement","summary":" DETR-like methods have significantly increased detection performance in an\nend-to-end manner. The mainstream two-stage frameworks of them perform dense\nself-attention and select a fraction of queries for sparse cross-attention,\nwhich is proven effective for improving performance but also introduces a heavy\ncomputational burden and high dependence on stable query selection. This paper\ndemonstrates that suboptimal two-stage selection strategies result in scale\nbias and redundancy due to the mismatch between selected queries and objects in\ntwo-stage initialization. 
To address these issues, we propose hierarchical\nsalience filtering refinement, which performs transformer encoding only on\nfiltered discriminative queries, for a better trade-off between computational\nefficiency and precision. The filtering process overcomes scale bias through a\nnovel scale-independent salience supervision. To compensate for the semantic\nmisalignment among queries, we introduce elaborate query refinement modules for\nstable two-stage initialization. Based on above improvements, the proposed\nSalience DETR achieves significant improvements of +4.0% AP, +0.2% AP, +4.4% AP\non three challenging task-specific detection datasets, as well as 49.2% AP on\nCOCO 2017 with less FLOPs. The code is available at\nhttps://github.com/xiuqhou/Salience-DETR.\n","authors":["Xiuquan Hou","Meiqin Liu","Senlin Zhang","Ping Wei","Badong Chen"],"pdf_url":"https://arxiv.org/pdf/2403.16131v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.09334v2","updated":"2024-03-24T13:00:54Z","published":"2024-03-14T12:22:54Z","title":"Video Editing via Factorized Diffusion Distillation","summary":" We introduce Emu Video Edit (EVE), a model that establishes a new\nstate-of-the art in video editing without relying on any supervised video\nediting data. To develop EVE we separately train an image editing adapter and a\nvideo generation adapter, and attach both to the same text-to-image model.\nThen, to align the adapters towards video editing we introduce a new\nunsupervised distillation procedure, Factorized Diffusion Distillation. This\nprocedure distills knowledge from one or more teachers simultaneously, without\nany supervised data. We utilize this procedure to teach EVE to edit videos by\njointly distilling knowledge to (i) precisely edit each individual frame from\nthe image editing adapter, and (ii) ensure temporal consistency among the\nedited frames using the video generation adapter. Finally, to demonstrate the\npotential of our approach in unlocking other capabilities, we align additional\ncombinations of adapters\n","authors":["Uriel Singer","Amit Zohar","Yuval Kirstain","Shelly Sheynin","Adam Polyak","Devi Parikh","Yaniv Taigman"],"pdf_url":"https://arxiv.org/pdf/2403.09334v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16128v1","updated":"2024-03-24T12:55:50Z","published":"2024-03-24T12:55:50Z","title":"Enhancing Video Transformers for Action Understanding with VLM-aided\n Training","summary":" Owing to their ability to extract relevant spatio-temporal video embeddings,\nVision Transformers (ViTs) are currently the best performing models in video\naction understanding. However, their generalization over domains or datasets is\nsomewhat limited. In contrast, Visual Language Models (VLMs) have demonstrated\nexceptional generalization performance, but are currently unable to process\nvideos. Consequently, they cannot extract spatio-temporal patterns that are\ncrucial for action understanding. In this paper, we propose the Four-tiered\nPrompts (FTP) framework that takes advantage of the complementary strengths of\nViTs and VLMs. We retain ViTs' strong spatio-temporal representation ability\nbut improve the visual encodings to be more comprehensive and general by\naligning them with VLM outputs. The FTP framework adds four feature processors\nthat focus on specific aspects of human action in videos: action category,\naction components, action description, and context information. 
The VLMs are\nonly employed during training, and inference incurs a minimal computation cost.\nOur approach consistently yields state-of-the-art performance. For instance, we\nachieve remarkable top-1 accuracy of 93.8% on Kinetics-400 and 83.4% on\nSomething-Something V2, surpassing VideoMAEv2 by 2.8% and 2.6%, respectively.\n","authors":["Hui Lu","Hu Jian","Ronald Poppe","Albert Ali Salah"],"pdf_url":"https://arxiv.org/pdf/2403.16128v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01099v2","updated":"2024-03-24T12:55:31Z","published":"2023-10-02T11:17:19Z","title":"HyMNet: a Multimodal Deep Learning System for Hypertension\n Classification using Fundus Photographs and Cardiometabolic Risk Factors","summary":" In recent years, deep learning has shown promise in predicting hypertension\n(HTN) from fundus images. However, most prior research has primarily focused on\nanalyzing a single type of data, which may not capture the full complexity of\nHTN risk. To address this limitation, this study introduces a multimodal deep\nlearning (MMDL) system, dubbed HyMNet, which combines fundus images and\ncardiometabolic risk factors, specifically age and gender, to improve\nhypertension detection capabilities. Our MMDL system uses RETFound, a\nfoundation model pre-trained on 1.6 million retinal images, for the fundus path\nand a fully connected neural network for the age and gender path. The two paths\nare jointly trained by concatenating the feature vectors from each path that\nare then fed into a fusion network. The system was trained on 5,016 retinal\nimages from 1,243 individuals collected from the Saudi Ministry of National\nGuard Health Affairs. The results show that the multimodal model that\nintegrates fundus images along with age and gender outperforms the unimodal\nsystem trained solely on fundus photographs, with an F1 score of 0.771 [0.747,\n0.796], and 0.745 [0.719, 0.772] for hypertension detection, respectively.\nAdditionally, we studied the effect underlying diabetes mellitus has on the\nmodel's predictive ability, concluding that diabetes is used as a confounding\nvariable for distinguishing hypertensive cases. Our code and model weights are\npublicly available at https://github.com/MohammedSB/HyMNet.\n","authors":["Mohammed Baharoon","Hessa Almatar","Reema Alduhayan","Tariq Aldebasi","Badr Alahmadi","Yahya Bokhari","Mohammed Alawad","Ahmed Almazroa","Abdulrhman Aljouie"],"pdf_url":"https://arxiv.org/pdf/2310.01099v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12466v2","updated":"2024-03-24T12:42:25Z","published":"2024-03-19T05:50:48Z","title":"Few-shot Object Localization","summary":" Existing object localization methods are tailored to locate a specific class\nof objects, relying on abundant labeled data for model optimization. However,\nin numerous real-world scenarios, acquiring large labeled data can be arduous,\nsignificantly constraining the broader application of localization models. To\nbridge this research gap, this paper proposes the novel task of Few-Shot Object\nLocalization (FSOL), which seeks to achieve precise localization with limited\nsamples available. This task achieves generalized object localization by\nleveraging a small number of labeled support samples to query the positional\ninformation of objects within corresponding images. To advance this research\nfield, we propose an innovative high-performance baseline model. 
Our model\nintegrates a dual-path feature augmentation module to enhance shape association\nand gradient differences between supports and query images, alongside a self\nquery module designed to explore the association between feature maps and query\nimages. Experimental results demonstrate a significant performance improvement\nof our approach in the FSOL task, establishing an efficient benchmark for\nfurther research. All codes and data are available at\nhttps://github.com/Ryh1218/FSOL.\n","authors":["Yunhan Ren","Bo Li","Chengyang Zhang","Yong Zhang","Baocai Yin"],"pdf_url":"https://arxiv.org/pdf/2403.12466v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16124v1","updated":"2024-03-24T12:41:58Z","published":"2024-03-24T12:41:58Z","title":"Enhancing Visual Continual Learning with Language-Guided Supervision","summary":" Continual learning (CL) aims to empower models to learn new tasks without\nforgetting previously acquired knowledge. Most prior works concentrate on the\ntechniques of architectures, replay data, regularization, \\etc. However, the\ncategory name of each class is largely neglected. Existing methods commonly\nutilize the one-hot labels and randomly initialize the classifier head. We\nargue that the scarce semantic information conveyed by the one-hot labels\nhampers the effective knowledge transfer across tasks. In this paper, we\nrevisit the role of the classifier head within the CL paradigm and replace the\nclassifier with semantic knowledge from pretrained language models (PLMs).\nSpecifically, we use PLMs to generate semantic targets for each class, which\nare frozen and serve as supervision signals during training. Such targets fully\nconsider the semantic correlation between all classes across tasks. Empirical\nstudies show that our approach mitigates forgetting by alleviating\nrepresentation drifting and facilitating knowledge transfer across tasks. The\nproposed method is simple to implement and can seamlessly be plugged into\nexisting methods with negligible adjustments. Extensive experiments based on\neleven mainstream baselines demonstrate the effectiveness and generalizability\nof our approach to various protocols. For example, under the class-incremental\nlearning setting on ImageNet-100, our method significantly improves the Top-1\naccuracy by 3.2\\% to 6.1\\% while reducing the forgetting rate by 2.6\\% to\n13.1\\%.\n","authors":["Bolin Ni","Hongbo Zhao","Chenghao Zhang","Ke Hu","Gaofeng Meng","Zhaoxiang Zhang","Shiming Xiang"],"pdf_url":"https://arxiv.org/pdf/2403.16124v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2401.05010v2","updated":"2024-03-24T12:32:06Z","published":"2024-01-10T08:56:02Z","title":"Less is More: A Closer Look at Semantic-based Few-Shot Learning","summary":" Few-shot Learning aims to learn and distinguish new categories with a very\nlimited number of available images, presenting a significant challenge in the\nrealm of deep learning. Recent researchers have sought to leverage the\nadditional textual or linguistic information of these rare categories with a\npre-trained language model to facilitate learning, thus partially alleviating\nthe problem of insufficient supervision signals. However, the full potential of\nthe textual information and pre-trained language model have been underestimated\nin the few-shot learning till now, resulting in limited performance\nenhancements. 
To address this, we propose a simple but effective framework for\nfew-shot learning tasks, specifically designed to exploit the textual\ninformation and language model. In more detail, we explicitly exploit the\nzero-shot capability of the pre-trained language model with the learnable\nprompt. And we just add the visual feature with the textual feature for\ninference directly without the intricate designed fusion modules in previous\nworks. Additionally, we apply the self-ensemble and distillation to further\nenhance these components. Our extensive experiments conducted across four\nwidely used few-shot datasets demonstrate that our simple framework achieves\nimpressive results. Particularly noteworthy is its outstanding performance in\nthe 1-shot learning task, surpassing state-of-the-art methods by an average of\n3.0\\% in classification accuracy. \\footnote{We will make the source codes of\nthe proposed framework publicly available upon acceptance. }.\n","authors":["Chunpeng Zhou","Haishuai Wang","Xilu Yuan","Zhi Yu","Jiajun Bu"],"pdf_url":"https://arxiv.org/pdf/2401.05010v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16116v1","updated":"2024-03-24T12:15:28Z","published":"2024-03-24T12:15:28Z","title":"Self-Supervised Multi-Frame Neural Scene Flow","summary":" Neural Scene Flow Prior (NSFP) and Fast Neural Scene Flow (FNSF) have shown\nremarkable adaptability in the context of large out-of-distribution autonomous\ndriving. Despite their success, the underlying reasons for their astonishing\ngeneralization capabilities remain unclear. Our research addresses this gap by\nexamining the generalization capabilities of NSFP through the lens of uniform\nstability, revealing that its performance is inversely proportional to the\nnumber of input point clouds. This finding sheds light on NSFP's effectiveness\nin handling large-scale point cloud scene flow estimation tasks. Motivated by\nsuch theoretical insights, we further explore the improvement of scene flow\nestimation by leveraging historical point clouds across multiple frames, which\ninherently increases the number of point clouds. Consequently, we propose a\nsimple and effective method for multi-frame point cloud scene flow estimation,\nalong with a theoretical evaluation of its generalization abilities. Our\nanalysis confirms that the proposed method maintains a limited generalization\nerror, suggesting that adding multiple frames to the scene flow optimization\nprocess does not detract from its generalizability. Extensive experimental\nresults on large-scale autonomous driving Waymo Open and Argoverse lidar\ndatasets demonstrate that the proposed method achieves state-of-the-art\nperformance.\n","authors":["Dongrui Liu","Daqi Liu","Xueqian Li","Sihao Lin","Hongwei xie","Bing Wang","Xiaojun Chang","Lei Chu"],"pdf_url":"https://arxiv.org/pdf/2403.16116v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16112v1","updated":"2024-03-24T12:05:23Z","published":"2024-03-24T12:05:23Z","title":"Opportunities and challenges in the application of large artificial\n intelligence models in radiology","summary":" Influenced by ChatGPT, artificial intelligence (AI) large models have\nwitnessed a global upsurge in large model research and development. As people\nenjoy the convenience by this AI large model, more and more large models in\nsubdivided fields are gradually being proposed, especially large models in\nradiology imaging field. 
This article first introduces the development history\nof large models, technical details, workflow, working principles of multimodal\nlarge models and working principles of video generation large models. Secondly,\nwe summarize the latest research progress of AI large models in radiology\neducation, radiology report generation, applications of unimodal and multimodal\nradiology. Finally, this paper also summarizes some of the challenges of large\nAI models in radiology, with the aim of better promoting the rapid revolution\nin the field of radiography.\n","authors":["Liangrui Pan","Zhenyu Zhao","Ying Lu","Kewei Tang","Liyong Fu","Qingchun Liang","Shaoliang Peng"],"pdf_url":"https://arxiv.org/pdf/2403.16112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18411v2","updated":"2024-03-24T12:04:11Z","published":"2024-02-28T15:31:45Z","title":"Unsupervised Cross-Domain Image Retrieval via Prototypical Optimal\n Transport","summary":" Unsupervised cross-domain image retrieval (UCIR) aims to retrieve images\nsharing the same category across diverse domains without relying on labeled\ndata. Prior approaches have typically decomposed the UCIR problem into two\ndistinct tasks: intra-domain representation learning and cross-domain feature\nalignment. However, these segregated strategies overlook the potential\nsynergies between these tasks. This paper introduces ProtoOT, a novel Optimal\nTransport formulation explicitly tailored for UCIR, which integrates\nintra-domain feature representation learning and cross-domain alignment into a\nunified framework. ProtoOT leverages the strengths of the K-means clustering\nmethod to effectively manage distribution imbalances inherent in UCIR. By\nutilizing K-means for generating initial prototypes and approximating class\nmarginal distributions, we modify the constraints in Optimal Transport\naccordingly, significantly enhancing its performance in UCIR scenarios.\nFurthermore, we incorporate contrastive learning into the ProtoOT framework to\nfurther improve representation learning. This encourages local semantic\nconsistency among features with similar semantics, while also explicitly\nenforcing separation between features and unmatched prototypes, thereby\nenhancing global discriminativeness. ProtoOT surpasses existing\nstate-of-the-art methods by a notable margin across benchmark datasets.\nNotably, on DomainNet, ProtoOT achieves an average P@200 enhancement of 24.44%,\nand on Office-Home, it demonstrates a P@15 improvement of 12.12%. Code is\navailable at https://github.com/HCVLAB/ProtoOT.\n","authors":["Bin Li","Ye Shi","Qian Yu","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2402.18411v2.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2403.16111v1","updated":"2024-03-24T12:04:06Z","published":"2024-03-24T12:04:06Z","title":"EVA: Zero-shot Accurate Attributes and Multi-Object Video Editing","summary":" Current diffusion-based video editing primarily focuses on local editing\n(\\textit{e.g.,} object/background editing) or global style editing by utilizing\nvarious dense correspondences. However, these methods often fail to accurately\nedit the foreground and background simultaneously while preserving the original\nlayout. We find that the crux of the issue stems from the imprecise\ndistribution of attention weights across designated regions, including\ninaccurate text-to-attribute control and attention leakage. 
To tackle this\nissue, we introduce EVA, a \\textbf{zero-shot} and \\textbf{multi-attribute}\nvideo editing framework tailored for human-centric videos with complex motions.\nWe incorporate a Spatial-Temporal Layout-Guided Attention mechanism that\nleverages the intrinsic positive and negative correspondences of cross-frame\ndiffusion features. To avoid attention leakage, we utilize these\ncorrespondences to boost the attention scores of tokens within the same\nattribute across all video frames while limiting interactions between tokens of\ndifferent attributes in the self-attention layer. For precise text-to-attribute\nmanipulation, we use discrete text embeddings focused on specific layout areas\nwithin the cross-attention layer. Benefiting from the precise attention weight\ndistribution, EVA can be easily generalized to multi-object editing scenarios\nand achieves accurate identity mapping. Extensive experiments demonstrate EVA\nachieves state-of-the-art results in real-world scenarios. Full results are\nprovided at https://knightyxp.github.io/EVA/\n","authors":["Xiangpeng Yang","Linchao Zhu","Hehe Fan","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2403.16111v1.pdf","comment":"Project page: https://knightyxp.github.io/EVA"},{"id":"http://arxiv.org/abs/2403.16095v1","updated":"2024-03-24T11:19:59Z","published":"2024-03-24T11:19:59Z","title":"CG-SLAM: Efficient Dense RGB-D SLAM in a Consistent Uncertainty-aware 3D\n Gaussian Field","summary":" Recently neural radiance fields (NeRF) have been widely exploited as 3D\nrepresentations for dense simultaneous localization and mapping (SLAM). Despite\ntheir notable successes in surface modeling and novel view synthesis, existing\nNeRF-based methods are hindered by their computationally intensive and\ntime-consuming volume rendering pipeline. This paper presents an efficient\ndense RGB-D SLAM system, i.e., CG-SLAM, based on a novel uncertainty-aware 3D\nGaussian field with high consistency and geometric stability. Through an\nin-depth analysis of Gaussian Splatting, we propose several techniques to\nconstruct a consistent and stable 3D Gaussian field suitable for tracking and\nmapping. Additionally, a novel depth uncertainty model is proposed to ensure\nthe selection of valuable Gaussian primitives during optimization, thereby\nimproving tracking efficiency and accuracy. Experiments on various datasets\ndemonstrate that CG-SLAM achieves superior tracking and mapping performance\nwith a notable tracking speed of up to 15 Hz. We will make our source code\npublicly available. Project page: https://zju3dv.github.io/cg-slam.\n","authors":["Jiarui Hu","Xianhao Chen","Boyin Feng","Guanglin Li","Liangjing Yang","Hujun Bao","Guofeng Zhang","Zhaopeng Cui"],"pdf_url":"https://arxiv.org/pdf/2403.16095v1.pdf","comment":"Project Page: https://zju3dv.github.io/cg-slam"},{"id":"http://arxiv.org/abs/2403.16092v1","updated":"2024-03-24T11:09:41Z","published":"2024-03-24T11:09:41Z","title":"Are NeRFs ready for autonomous driving? Towards closing the\n real-to-simulation gap","summary":" Neural Radiance Fields (NeRFs) have emerged as promising tools for advancing\nautonomous driving (AD) research, offering scalable closed-loop simulation and\ndata augmentation capabilities. However, to trust the results achieved in\nsimulation, one needs to ensure that AD systems perceive real and rendered data\nin the same way. Although the performance of rendering methods is increasing,\nmany scenarios will remain inherently challenging to reconstruct faithfully. 
To\nthis end, we propose a novel perspective for addressing the real-to-simulated\ndata gap. Rather than solely focusing on improving rendering fidelity, we\nexplore simple yet effective methods to enhance perception model robustness to\nNeRF artifacts without compromising performance on real data. Moreover, we\nconduct the first large-scale investigation into the real-to-simulated data gap\nin an AD setting using a state-of-the-art neural rendering technique.\nSpecifically, we evaluate object detectors and an online mapping model on real\nand simulated data, and study the effects of different pre-training strategies.\nOur results show notable improvements in model robustness to simulated data,\neven improving real-world performance in some cases. Last, we delve into the\ncorrelation between the real-to-simulated gap and image reconstruction metrics,\nidentifying FID and LPIPS as strong indicators.\n","authors":["Carl Lindström","Georg Hess","Adam Lilja","Maryam Fatemi","Lars Hammarstrand","Christoffer Petersson","Lennart Svensson"],"pdf_url":"https://arxiv.org/pdf/2403.16092v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03059v6","updated":"2024-03-24T10:29:46Z","published":"2023-10-04T16:49:36Z","title":"Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models","summary":" The popularity of pre-trained large models has revolutionized downstream\ntasks across diverse fields, such as language, vision, and multi-modality. To\nminimize the adaption cost for downstream tasks, many Parameter-Efficient\nFine-Tuning (PEFT) techniques are proposed for language and 2D image\npre-trained models. However, the specialized PEFT method for 3D pre-trained\nmodels is still under-explored. To this end, we introduce Point-PEFT, a novel\nframework for adapting point cloud pre-trained models with minimal learnable\nparameters. Specifically, for a pre-trained 3D model, we freeze most of its\nparameters, and only tune the newly added PEFT modules on downstream tasks,\nwhich consist of a Point-prior Prompt and a Geometry-aware Adapter. The\nPoint-prior Prompt adopts a set of learnable prompt tokens, for which we\npropose to construct a memory bank with domain-specific knowledge, and utilize\na parameter-free attention to enhance the prompt tokens. The Geometry-aware\nAdapter aims to aggregate point cloud features within spatial neighborhoods to\ncapture fine-grained geometric information through local interactions.\nExtensive experiments indicate that our Point-PEFT can achieve better\nperformance than the full fine-tuning on various downstream tasks, while using\nonly 5% of the trainable parameters, demonstrating the efficiency and\neffectiveness of our approach. Code is released at\nhttps://github.com/Ivan-Tang-3D/Point-PEFT.\n","authors":["Yiwen Tang","Ray Zhang","Zoey Guo","Dong Wang","Zhigang Wang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2310.03059v6.pdf","comment":"The specialized PEFT framework for 3D pre-trained models, which\n achieves competitive performance to full fine-tuning, and significantly\n reduces the computational resources. Project page:\n https://github.com/Ivan-Tang-3D/Point-PEFT"},{"id":"http://arxiv.org/abs/2402.12928v4","updated":"2024-03-24T10:06:59Z","published":"2024-02-20T11:28:50Z","title":"A Literature Review of Literature Reviews in Pattern Analysis and\n Machine Intelligence","summary":" By consolidating scattered knowledge, the literature review provides a\ncomprehensive understanding of the investigated topic. 
However, reading,\nconducting, or peer-reviewing review papers generally demands a significant\ninvestment of time and effort from researchers. To improve efficiency, this\npaper aims to provide a thorough review of reviews in the PAMI field from\ndiverse perspectives. First, this paper proposes several article-level,\nfield-normalized, and large language model-empowered bibliometric indicators to\nevaluate reviews. To facilitate this, a meta-data database dubbed RiPAMI, and a\ntopic dataset are constructed. Second, based on these indicators, the study\npresents comparative analyses of representative reviews, unveiling the\ncharacteristics of publications across various fields, periods, and journals.\nThe newly emerging AI-generated literature reviews are also appraised, and the\nobserved differences suggest that most AI-generated reviews still lag behind\nhuman-authored reviews in multiple aspects. Third, we briefly provide a\nsubjective evaluation of representative PAMI reviews and introduce a paper\nstructure-based typology of literature reviews. This typology may improve the\nclarity and effectiveness for scholars in reading and writing reviews, while\nalso serving as a guide for AI systems in generating well-organized reviews.\nFinally, this work offers insights into the current challenges of literature\nreviews and envisions future directions for their development.\n","authors":["Penghai Zhao","Xin Zhang","Ming-Ming Cheng","Jian Yang","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2402.12928v4.pdf","comment":"IEEE version v1. [February 19, 2024] IEEE version v2 with typos\n fixed. [February 23, 2024] IEEE version v3 with errors fixed. [February 29,\n 2024] IEEE version v4 with improved quaility. [February 29, 2024]"},{"id":"http://arxiv.org/abs/2403.16080v1","updated":"2024-03-24T10:06:40Z","published":"2024-03-24T10:06:40Z","title":"PKU-DyMVHumans: A Multi-View Video Benchmark for High-Fidelity Dynamic\n Human Modeling","summary":" High-quality human reconstruction and photo-realistic rendering of a dynamic\nscene is a long-standing problem in computer vision and graphics. Despite\nconsiderable efforts invested in developing various capture systems and\nreconstruction algorithms, recent advancements still struggle with loose or\noversized clothing and overly complex poses. In part, this is due to the\nchallenges of acquiring high-quality human datasets. To facilitate the\ndevelopment of these fields, in this paper, we present PKU-DyMVHumans, a\nversatile human-centric dataset for high-fidelity reconstruction and rendering\nof dynamic human scenarios from dense multi-view videos. It comprises 8.2\nmillion frames captured by more than 56 synchronized cameras across diverse\nscenarios. These sequences comprise 32 human subjects across 45 different\nscenarios, each with a high-detailed appearance and realistic human motion.\nInspired by recent advancements in neural radiance field (NeRF)-based scene\nrepresentations, we carefully set up an off-the-shelf framework that is easy to\nprovide those state-of-the-art NeRF-based implementations and benchmark on\nPKU-DyMVHumans dataset. It is paving the way for various applications like\nfine-grained foreground/background decomposition, high-quality human\nreconstruction and photo-realistic novel view synthesis of a dynamic scene.\nExtensive studies are performed on the benchmark, demonstrating new\nobservations and challenges that emerge from using such high-fidelity dynamic\ndata. 
The dataset is available at: https://pku-dymvhumans.github.io.\n","authors":["Xiaoyun Zheng","Liwei Liao","Xufeng Li","Jianbo Jiao","Rongjie Wang","Feng Gao","Shiqi Wang","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13438v3","updated":"2024-03-24T10:06:25Z","published":"2024-03-18T17:38:29Z","title":"See, Imagine, Plan: Discovering and Hallucinating Tasks from a Single\n Image","summary":" Humans can not only recognize and understand the world in its current state\nbut also envision future scenarios that extend beyond immediate perception. To\nresemble this profound human capacity, we introduce zero-shot task\nhallucination -- given a single RGB image of any scene comprising unknown\nenvironments and objects, our model can identify potential tasks and imagine\ntheir execution in a vivid narrative, realized as a video. We develop a modular\npipeline that progressively enhances scene decomposition, comprehension, and\nreconstruction, incorporating VLM for dynamic interaction and 3D motion\nplanning for object trajectories. Our model can discover diverse tasks, with\nthe generated task videos demonstrating realistic and compelling visual\noutcomes that are understandable by both machines and humans. Project Page:\nhttps://dannymcy.github.io/zeroshot_task_hallucination/\n","authors":["Chenyang Ma","Kai Lu","Ta-Ying Cheng","Niki Trigoni","Andrew Markham"],"pdf_url":"https://arxiv.org/pdf/2403.13438v3.pdf","comment":"Project Page: https://dannymcy.github.io/zeroshot_task_hallucination/"},{"id":"http://arxiv.org/abs/2310.19258v2","updated":"2024-03-24T09:32:51Z","published":"2023-10-30T04:04:02Z","title":"Improving Online Source-free Domain Adaptation for Object Detection by\n Unsupervised Data Acquisition","summary":" Effective object detection in mobile robots is challenged by deployment in\ndiverse and unfamiliar environments. Online Source-Free Domain Adaptation\n(O-SFDA) offers model adaptation using a stream of unlabeled data from a target\ndomain in online manner. However, not all captured frames contain information\nthat is beneficial for adaptation, particularly when there is a strong class\nimbalance. This paper introduces a novel approach to enhance O-SFDA for\nadaptive object detection in mobile robots via unsupervised data acquisition.\nOur methodology prioritizes the most informative unlabeled frames for inclusion\nin the online training process. Empirical evaluation on a real-world dataset\nreveals that our method outperforms existing state-of-the-art O-SFDA\ntechniques, demonstrating the viability of unsupervised data acquisition for\nimproving adaptive object detection in mobile robots.\n","authors":["Xiangyu Shi","Yanyuan Qiao","Qi Wu","Lingqiao Liu","Feras Dayoub"],"pdf_url":"https://arxiv.org/pdf/2310.19258v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07347v2","updated":"2024-03-24T09:22:13Z","published":"2024-03-12T06:07:29Z","title":"Frequency Decoupling for Motion Magnification via Multi-Level Isomorphic\n Architecture","summary":" Video Motion Magnification (VMM) aims to reveal subtle and imperceptible\nmotion information of objects in the macroscopic world. Prior methods directly\nmodel the motion field from the Eulerian perspective by Representation Learning\nthat separates shape and texture or Multi-domain Learning from phase\nfluctuations. 
Inspired by the frequency spectrum, we observe that the\nlow-frequency components with stable energy always possess spatial structure\nand less noise, making them suitable for modeling the subtle motion field. To\nthis end, we present FD4MM, a new paradigm of Frequency Decoupling for Motion\nMagnification with a Multi-level Isomorphic Architecture to capture multi-level\nhigh-frequency details and a stable low-frequency structure (motion field) in\nvideo space. Since high-frequency details and subtle motions are susceptible to\ninformation degradation due to their inherent subtlety and unavoidable external\ninterference from noise, we carefully design Sparse High/Low-pass Filters to\nenhance the integrity of details and motion structures, and a Sparse Frequency\nMixer to promote seamless recoupling. Besides, we innovatively design a\ncontrastive regularization for this task to strengthen the model's ability to\ndiscriminate irrelevant features, reducing undesired motion magnification.\nExtensive experiments on both Real-world and Synthetic Datasets show that our\nFD4MM outperforms SOTA methods. Meanwhile, FD4MM reduces FLOPs by 1.63$\\times$\nand boosts inference speed by 1.68$\\times$ than the latest method. Our code is\navailable at https://github.com/Jiafei127/FD4MM.\n","authors":["Fei Wang","Dan Guo","Kun Li","Zhun Zhong","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.07347v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.16071v1","updated":"2024-03-24T09:18:21Z","published":"2024-03-24T09:18:21Z","title":"Landmark-Guided Cross-Speaker Lip Reading with Mutual Information\n Regularization","summary":" Lip reading, the process of interpreting silent speech from visual lip\nmovements, has gained rising attention for its wide range of realistic\napplications. Deep learning approaches greatly improve current lip reading\nsystems. However, lip reading in cross-speaker scenarios where the speaker\nidentity changes, poses a challenging problem due to inter-speaker variability.\nA well-trained lip reading system may perform poorly when handling a brand new\nspeaker. To learn a speaker-robust lip reading model, a key insight is to\nreduce visual variations across speakers, avoiding the model overfitting to\nspecific speakers. In this work, in view of both input visual clues and latent\nrepresentations based on a hybrid CTC/attention architecture, we propose to\nexploit the lip landmark-guided fine-grained visual clues instead of\nfrequently-used mouth-cropped images as input features, diminishing\nspeaker-specific appearance characteristics. Furthermore, a max-min mutual\ninformation regularization approach is proposed to capture speaker-insensitive\nlatent representations. Experimental evaluations on public lip reading datasets\ndemonstrate the effectiveness of the proposed approach under the intra-speaker\nand inter-speaker conditions.\n","authors":["Linzhi Wu","Xingyu Zhang","Yakun Zhang","Changyan Zheng","Tiejun Liu","Liang Xie","Ye Yan","Erwei Yin"],"pdf_url":"https://arxiv.org/pdf/2403.16071v1.pdf","comment":"To appear in LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2403.16067v1","updated":"2024-03-24T08:34:08Z","published":"2024-03-24T08:34:08Z","title":"Robust Diffusion Models for Adversarial Purification","summary":" Diffusion models (DMs) based adversarial purification (AP) has shown to be\nthe most powerful alternative to adversarial training (AT). 
However, these\nmethods neglect the fact that pre-trained diffusion models themselves are not\nrobust to adversarial attacks as well. Additionally, the diffusion process can\neasily destroy semantic information and generate a high quality image but\ntotally different from the original input image after the reverse process,\nleading to degraded standard accuracy. To overcome these issues, a natural idea\nis to harness adversarial training strategy to retrain or fine-tune the\npre-trained diffusion model, which is computationally prohibitive. We propose a\nnovel robust reverse process with adversarial guidance, which is independent of\ngiven pre-trained DMs and avoids retraining or fine-tuning the DMs. This robust\nguidance can not only ensure to generate purified examples retaining more\nsemantic content but also mitigate the accuracy-robustness trade-off of DMs for\nthe first time, which also provides DM-based AP an efficient adaptive ability\nto new attacks. Extensive experiments are conducted to demonstrate that our\nmethod achieves the state-of-the-art results and exhibits generalization\nagainst different attacks.\n","authors":["Guang Lin","Zerui Tao","Jianhai Zhang","Toshihisa Tanaka","Qibin Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.16067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14137v2","updated":"2024-03-24T07:58:01Z","published":"2024-03-21T05:13:12Z","title":"SynerMix: Synergistic Mixup Solution for Enhanced Intra-Class Cohesion\n and Inter-Class Separability in Image Classification","summary":" To address the issues of MixUp and its variants (e.g., Manifold MixUp) in\nimage classification tasks-namely, their neglect of mixing within the same\nclass (intra-class mixup) and their inadequacy in enhancing intra-class\ncohesion through their mixing operations-we propose a novel mixup method named\nSynerMix-Intra and, building upon this, introduce a synergistic mixup solution\nnamed SynerMix. SynerMix-Intra specifically targets intra-class mixup to\nbolster intra-class cohesion, a feature not addressed by current mixup methods.\nFor each mini-batch, it leverages feature representations of unaugmented\noriginal images from each class to generate a synthesized feature\nrepresentation through random linear interpolation. All synthesized\nrepresentations are then fed into the classification and loss layers to\ncalculate an average classification loss that significantly enhances\nintra-class cohesion. Furthermore, SynerMix combines SynerMix-Intra with an\nexisting mixup approach (e.g., MixUp, Manifold MixUp), which primarily focuses\non inter-class mixup and has the benefit of enhancing inter-class separability.\nIn doing so, it integrates both inter- and intra-class mixup in a balanced way\nwhile concurrently improving intra-class cohesion and inter-class separability.\nExperimental results on six datasets show that SynerMix achieves a 0.1% to\n3.43% higher accuracy than the best of either MixUp or SynerMix-Intra alone,\naveraging a 1.16% gain. It also surpasses the top-performer of either Manifold\nMixUp or SynerMix-Intra by 0.12% to 5.16%, with an average gain of 1.11%. Given\nthat SynerMix is model-agnostic, it holds significant potential for application\nin other domains where mixup methods have shown promise, such as speech and\ntext classification. 
Our code is publicly available at:\nhttps://github.com/wxitxy/synermix.git.\n","authors":["Ye Xu","Ya Gao","Xiaorong Qiu","Yang Chen","Ying Ji"],"pdf_url":"https://arxiv.org/pdf/2403.14137v2.pdf","comment":"25 pages,12 figures"},{"id":"http://arxiv.org/abs/2403.16051v1","updated":"2024-03-24T07:36:38Z","published":"2024-03-24T07:36:38Z","title":"Segment Anything Model for Road Network Graph Extraction","summary":" We propose SAM-Road, an adaptation of the Segment Anything Model (SAM) for\nextracting large-scale, vectorized road network graphs from satellite imagery.\nTo predict graph geometry, we formulate it as a dense semantic segmentation\ntask, leveraging the inherent strengths of SAM. The image encoder of SAM is\nfine-tuned to produce probability masks for roads and intersections, from which\nthe graph vertices are extracted via simple non-maximum suppression. To predict\ngraph topology, we designed a lightweight transformer-based graph neural\nnetwork, which leverages the SAM image embeddings to estimate the edge\nexistence probabilities between vertices. Our approach directly predicts the\ngraph vertices and edges for large regions without expensive and complex\npost-processing heuristics, and is capable of building complete road network\ngraphs spanning multiple square kilometers in a matter of seconds. With its\nsimple, straightforward, and minimalist design, SAM-Road achieves comparable\naccuracy with the state-of-the-art method RNGDet++, while being 40 times faster\non the City-scale dataset. We thus demonstrate the power of a foundational\nvision model when applied to a graph learning task. The code is available at\nhttps://github.com/htcr/sam_road.\n","authors":["Congrui Hetang","Haoru Xue","Cindy Le","Tianwei Yue","Wenping Wang","Yihui He"],"pdf_url":"https://arxiv.org/pdf/2403.16051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16050v1","updated":"2024-03-24T07:33:08Z","published":"2024-03-24T07:33:08Z","title":"A General and Efficient Federated Split Learning with Pre-trained Image\n Transformers for Heterogeneous Data","summary":" Federated Split Learning (FSL) is a promising distributed learning paradigm\nin practice, which gathers the strengths of both Federated Learning (FL) and\nSplit Learning (SL) paradigms, to ensure model privacy while diminishing the\nresource overhead of each client, especially on large transformer models in a\nresource-constrained environment, e.g., Internet of Things (IoT). However,\nalmost all works merely investigate the performance with simple neural network\nmodels in FSL. Despite the minor efforts focusing on incorporating Vision\nTransformers (ViT) as model architectures, they train ViT from scratch, thereby\nleading to enormous training overhead in each device with limited resources.\nTherefore, in this paper, we harness Pre-trained Image Transformers (PITs) as\nthe initial model, coined FES-PIT, to accelerate the training process and\nimprove model robustness. Furthermore, we propose FES-PTZO to hinder the\ngradient inversion attack, especially having the capability compatible with\nblack-box scenarios, where the gradient information is unavailable. Concretely,\nFES-PTZO approximates the server gradient by utilizing a zeroth-order (ZO)\noptimization, which replaces the backward propagation with just one forward\nprocess. Empirically, we are the first to provide a systematic evaluation of\nFSL methods with PITs in real-world datasets, different partial device\nparticipations, and heterogeneous data splits. 
Our experiments verify the\neffectiveness of our algorithms.\n","authors":["Yifan Shi","Yuhui Zhang","Ziyue Huang","Xiaofeng Yang","Li Shen","Wei Chen","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16048v1","updated":"2024-03-24T07:29:04Z","published":"2024-03-24T07:29:04Z","title":"Edit3K: Universal Representation Learning for Video Editing Components","summary":" This paper focuses on understanding the predominant video creation pipeline,\ni.e., compositional video editing with six main types of editing components,\nincluding video effects, animation, transition, filter, sticker, and text. In\ncontrast to existing visual representation learning of visual materials (i.e.,\nimages/videos), we aim to learn visual representations of editing\nactions/components that are generally applied on raw materials. We start by\nproposing the first large-scale dataset for editing components of video\ncreation, which covers about $3,094$ editing components with $618,800$ videos.\nEach video in our dataset is rendered by various image/video materials with a\nsingle editing component, which supports atomic visual understanding of\ndifferent editing components. It can also benefit several downstream tasks,\ne.g., editing component recommendation, editing component\nrecognition/retrieval, etc. Existing visual representation methods perform\npoorly because it is difficult to disentangle the visual appearance of editing\ncomponents from raw materials. To that end, we benchmark popular alternative\nsolutions and propose a novel method that learns to attend to the appearance of\nediting components regardless of raw materials. Our method achieves favorable\nresults on editing component retrieval/recognition compared to the alternative\nsolutions. A user study is also conducted to show that our representations\ncluster visually similar editing components better than other alternatives.\nFurthermore, our learned representations used to transition recommendation\ntasks achieve state-of-the-art results on the AutoTransition dataset. The code\nand dataset will be released for academic use.\n","authors":["Xin Gu","Libo Zhang","Fan Chen","Longyin Wen","Yufei Wang","Tiejian Luo","Sijie Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.16048v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01024v2","updated":"2024-03-24T07:10:27Z","published":"2023-07-03T13:55:44Z","title":"SAM-DA: UAV Tracks Anything at Night with SAM-Powered Domain Adaptation","summary":" Domain adaptation (DA) has demonstrated significant promise for real-time\nnighttime unmanned aerial vehicle (UAV) tracking. However, the state-of-the-art\n(SOTA) DA still lacks the potential object with accurate pixel-level location\nand boundary to generate the high-quality target domain training sample. This\nkey issue constrains the transfer learning of the real-time daytime SOTA\ntrackers for challenging nighttime UAV tracking. Recently, the notable Segment\nAnything Model (SAM) has achieved a remarkable zero-shot generalization ability\nto discover abundant potential objects due to its huge data-driven training\napproach. To solve the aforementioned issue, this work proposes a novel\nSAM-powered DA framework for real-time nighttime UAV tracking, i.e., SAM-DA.\nSpecifically, an innovative SAM-powered target domain training sample swelling\nis designed to determine enormous high-quality target domain training samples\nfrom every single raw nighttime image. 
This novel one-to-many generation\nsignificantly expands the high-quality target domain training sample for DA.\nComprehensive experiments on extensive nighttime UAV videos prove the\nrobustness and domain adaptability of SAM-DA for nighttime UAV tracking.\nEspecially, compared to the SOTA DA, SAM-DA can achieve better performance with\nfewer raw nighttime images, i.e., the fewer-better training. This economized\ntraining approach facilitates the quick validation and deployment of algorithms\nfor UAVs. The code is available at https://github.com/vision4robotics/SAM-DA.\n","authors":["Changhong Fu","Liangliang Yao","Haobo Zuo","Guangze Zheng","Jia Pan"],"pdf_url":"https://arxiv.org/pdf/2307.01024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16043v1","updated":"2024-03-24T07:04:08Z","published":"2024-03-24T07:04:08Z","title":"Semantic Is Enough: Only Semantic Information For NeRF Reconstruction","summary":" Recent research that combines implicit 3D representation with semantic\ninformation, like Semantic-NeRF, has proven that NeRF model could perform\nexcellently in rendering 3D structures with semantic labels. This research aims\nto extend the Semantic Neural Radiance Fields (Semantic-NeRF) model by focusing\nsolely on semantic output and removing the RGB output component. We reformulate\nthe model and its training procedure to leverage only the cross-entropy loss\nbetween the model semantic output and the ground truth semantic images,\nremoving the colour data traditionally used in the original Semantic-NeRF\napproach. We then conduct a series of identical experiments using the original\nand the modified Semantic-NeRF model. Our primary objective is to obverse the\nimpact of this modification on the model performance by Semantic-NeRF, focusing\non tasks such as scene understanding, object detection, and segmentation. The\nresults offer valuable insights into the new way of rendering the scenes and\nprovide an avenue for further research and development in semantic-focused 3D\nscene understanding.\n","authors":["Ruibo Wang","Song Zhang","Ping Huang","Donghai Zhang","Wei Yan"],"pdf_url":"https://arxiv.org/pdf/2403.16043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11557v2","updated":"2024-03-24T07:01:37Z","published":"2023-12-17T09:05:47Z","title":"SAI3D: Segment Any Instance in 3D Scenes","summary":" Advancements in 3D instance segmentation have traditionally been tethered to\nthe availability of annotated datasets, limiting their application to a narrow\nspectrum of object categories. Recent efforts have sought to harness\nvision-language models like CLIP for open-set semantic reasoning, yet these\nmethods struggle to distinguish between objects of the same categories and rely\non specific prompts that are not universally applicable. In this paper, we\nintroduce SAI3D, a novel zero-shot 3D instance segmentation approach that\nsynergistically leverages geometric priors and semantic cues derived from\nSegment Anything Model (SAM). Our method partitions a 3D scene into geometric\nprimitives, which are then progressively merged into 3D instance segmentations\nthat are consistent with the multi-view SAM masks. Moreover, we design a\nhierarchical region-growing algorithm with a dynamic thresholding mechanism,\nwhich largely improves the robustness of finegrained 3D scene parsing.Empirical\nevaluations on ScanNet, Matterport3D and the more challenging ScanNet++\ndatasets demonstrate the superiority of our approach. 
Notably, SAI3D\noutperforms existing open-vocabulary baselines and even surpasses\nfully-supervised methods in class-agnostic segmentation on ScanNet++. Our\nproject page is at https://yd-yin.github.io/SAI3D.\n","authors":["Yingda Yin","Yuzheng Liu","Yang Xiao","Daniel Cohen-Or","Jingwei Huang","Baoquan Chen"],"pdf_url":"https://arxiv.org/pdf/2312.11557v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16034v1","updated":"2024-03-24T06:30:02Z","published":"2024-03-24T06:30:02Z","title":"V2X-Real: a Largs-Scale Dataset for Vehicle-to-Everything Cooperative\n Perception","summary":" Recent advancements in Vehicle-to-Everything (V2X) technologies have enabled\nautonomous vehicles to share sensing information to see through occlusions,\ngreatly boosting the perception capability. However, there are no real-world\ndatasets to facilitate the real V2X cooperative perception research -- existing\ndatasets either only support Vehicle-to-Infrastructure cooperation or\nVehicle-to-Vehicle cooperation. In this paper, we propose a dataset that has a\nmixture of multiple vehicles and smart infrastructure simultaneously to\nfacilitate the V2X cooperative perception development with multi-modality\nsensing data. Our V2X-Real is collected using two connected automated vehicles\nand two smart infrastructures, which are all equipped with multi-modal sensors\nincluding LiDAR sensors and multi-view cameras. The whole dataset contains 33K\nLiDAR frames and 171K camera data with over 1.2M annotated bounding boxes of 10\ncategories in very challenging urban scenarios. According to the collaboration\nmode and ego perspective, we derive four types of datasets for Vehicle-Centric,\nInfrastructure-Centric, Vehicle-to-Vehicle, and\nInfrastructure-to-Infrastructure cooperative perception. Comprehensive\nmulti-class multi-agent benchmarks of SOTA cooperative perception methods are\nprovided. The V2X-Real dataset and benchmark codes will be released.\n","authors":["Hao Xiang","Zhaoliang Zheng","Xin Xia","Runsheng Xu","Letian Gao","Zewei Zhou","Xu Han","Xinkai Ji","Mingxi Li","Zonglin Meng","Li Jin","Mingyue Lei","Zhaoyang Ma","Zihang He","Haoxuan Ma","Yunshuang Yuan","Yingqian Zhao","Jiaqi Ma"],"pdf_url":"https://arxiv.org/pdf/2403.16034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16028v1","updated":"2024-03-24T06:10:22Z","published":"2024-03-24T06:10:22Z","title":"Exploring the Impact of Dataset Bias on Dataset Distillation","summary":" Dataset Distillation (DD) is a promising technique to synthesize a smaller\ndataset that preserves essential information from the original dataset. This\nsynthetic dataset can serve as a substitute for the original large-scale one,\nand help alleviate the training workload. However, current DD methods typically\noperate under the assumption that the dataset is unbiased, overlooking\npotential bias issues within the dataset itself. To fill in this blank, we\nsystematically investigate the influence of dataset bias on DD. To the best of\nour knowledge, this is the first exploration in the DD domain. 
Given that there\nare no suitable biased datasets for DD, we first construct two biased datasets,\nCMNIST-DD and CCIFAR10-DD, to establish a foundation for subsequent analysis.\nThen we utilize existing DD methods to generate synthetic datasets on CMNIST-DD\nand CCIFAR10-DD, and evaluate their performance following the standard process.\nExperiments demonstrate that biases present in the original dataset\nsignificantly impact the performance of the synthetic dataset in most cases,\nwhich highlights the necessity of identifying and mitigating biases in the\noriginal datasets during DD. Finally, we reformulate DD within the context of a\nbiased dataset. Our code along with biased datasets are available at\nhttps://github.com/yaolu-zjut/Biased-DD.\n","authors":["Yao Lu","Jianyang Gu","Xuguang Chen","Saeed Vahidian","Qi Xuan"],"pdf_url":"https://arxiv.org/pdf/2403.16028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16024v1","updated":"2024-03-24T05:57:00Z","published":"2024-03-24T05:57:00Z","title":"A Unified Module for Accelerating STABLE-DIFFUSION: LCM-LORA","summary":" This paper presents a comprehensive study on the unified module for\naccelerating stable-diffusion processes, specifically focusing on the lcm-lora\nmodule. Stable-diffusion processes play a crucial role in various scientific\nand engineering domains, and their acceleration is of paramount importance for\nefficient computational performance. The standard iterative procedures for\nsolving fixed-source discrete ordinates problems often exhibit slow\nconvergence, particularly in optically thick scenarios. To address this\nchallenge, unconditionally stable diffusion-acceleration methods have been\ndeveloped, aiming to enhance the computational efficiency of transport\nequations and discrete ordinates problems. This study delves into the\ntheoretical foundations and numerical results of unconditionally stable\ndiffusion synthetic acceleration methods, providing insights into their\nstability and performance for model discrete ordinates problems. Furthermore,\nthe paper explores recent advancements in diffusion model acceleration,\nincluding on device acceleration of large diffusion models via gpu aware\noptimizations, highlighting the potential for significantly improved inference\nlatency. The results and analyses in this study provide important insights into\nstable diffusion processes and have important ramifications for the creation\nand application of acceleration methods specifically, the lcm-lora module in a\nvariety of computing environments.\n","authors":["Ayush Thakur","Rashmi Vashisth"],"pdf_url":"https://arxiv.org/pdf/2403.16024v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16023v1","updated":"2024-03-24T05:55:39Z","published":"2024-03-24T05:55:39Z","title":"RPMArt: Towards Robust Perception and Manipulation for Articulated\n Objects","summary":" Articulated objects are commonly found in daily life. It is essential that\nrobots can exhibit robust perception and manipulation skills for articulated\nobjects in real-world robotic applications. However, existing methods for\narticulated objects insufficiently address noise in point clouds and struggle\nto bridge the gap between simulation and reality, thus limiting the practical\ndeployment in real-world scenarios. 
To tackle these challenges, we propose a\nframework towards Robust Perception and Manipulation for Articulated Objects\n(RPMArt), which learns to estimate the articulation parameters and manipulate\nthe articulation part from the noisy point cloud. Our primary contribution is a\nRobust Articulation Network (RoArtNet) that is able to predict both joint\nparameters and affordable points robustly by local feature learning and point\ntuple voting. Moreover, we introduce an articulation-aware classification\nscheme to enhance its ability for sim-to-real transfer. Finally, with the\nestimated affordable point and articulation joint constraint, the robot can\ngenerate robust actions to manipulate articulated objects. After learning only\nfrom synthetic data, RPMArt is able to transfer zero-shot to real-world\narticulated objects. Experimental results confirm our approach's effectiveness,\nwith our framework achieving state-of-the-art performance in both noise-added\nsimulation and real-world environments. The code and data will be open-sourced\nfor reproduction. More results are published on the project website at\nhttps://r-pmart.github.io .\n","authors":["Junbo Wang","Wenhai Liu","Qiaojun Yu","Yang You","Liu Liu","Weiming Wang","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2403.16023v1.pdf","comment":"8 pages, 7 figures, submitted to 2024 IEEE/RSJ International\n Conference on Intelligent Robots and Systems (IROS 2024), project website at\n https://r-pmart.github.io"},{"id":"http://arxiv.org/abs/2403.16020v1","updated":"2024-03-24T05:50:00Z","published":"2024-03-24T05:50:00Z","title":"PaPr: Training-Free One-Step Patch Pruning with Lightweight ConvNets for\n Faster Inference","summary":" As deep neural networks evolve from convolutional neural networks (ConvNets)\nto advanced vision transformers (ViTs), there is an increased need to eliminate\nredundant data for faster processing without compromising accuracy. Previous\nmethods are often architecture-specific or necessitate re-training, restricting\ntheir applicability with frequent model updates. To solve this, we first\nintroduce a novel property of lightweight ConvNets: their ability to identify\nkey discriminative patch regions in images, irrespective of model's final\naccuracy or size. We demonstrate that fully-connected layers are the primary\nbottleneck for ConvNets performance, and their suppression with simple weight\nrecalibration markedly enhances discriminative patch localization performance.\nUsing this insight, we introduce PaPr, a method for substantially pruning\nredundant patches with minimal accuracy loss using lightweight ConvNets across\na variety of deep learning architectures, including ViTs, ConvNets, and hybrid\ntransformers, without any re-training. Moreover, the simple early-stage\none-step patch pruning with PaPr enhances existing patch reduction methods.\nThrough extensive testing on diverse architectures, PaPr achieves significantly\nhigher accuracy over state-of-the-art patch reduction methods with similar FLOP\ncount reduction. 
More specifically, PaPr reduces about 70% of redundant patches\nin videos with less than 0.8% drop in accuracy, and up to 3.7x FLOPs reduction,\nwhich is a 15% more reduction with 2.5% higher accuracy.\n","authors":["Tanvir Mahmud","Burhaneddin Yaman","Chun-Hao Liu","Diana Marculescu"],"pdf_url":"https://arxiv.org/pdf/2403.16020v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.05997v3","updated":"2024-03-24T05:46:10Z","published":"2023-01-15T02:04:02Z","title":"Exploiting Auxiliary Caption for Video Grounding","summary":" Video grounding aims to locate a moment of interest matching the given query\nsentence from an untrimmed video. Previous works ignore the {sparsity dilemma}\nin video annotations, which fails to provide the context information between\npotential events and query sentences in the dataset. In this paper, we contend\nthat exploiting easily available captions which describe general actions, i.e.,\nauxiliary captions defined in our paper, will significantly boost the\nperformance. To this end, we propose an Auxiliary Caption Network (ACNet) for\nvideo grounding. Specifically, we first introduce dense video captioning to\ngenerate dense captions and then obtain auxiliary captions by Non-Auxiliary\nCaption Suppression (NACS). To capture the potential information in auxiliary\ncaptions, we propose Caption Guided Attention (CGA) project the semantic\nrelations between auxiliary captions and query sentences into temporal space\nand fuse them into visual representations. Considering the gap between\nauxiliary captions and ground truth, we propose Asymmetric Cross-modal\nContrastive Learning (ACCL) for constructing more negative pairs to maximize\ncross-modal mutual information. Extensive experiments on three public datasets\n(i.e., ActivityNet Captions, TACoS and ActivityNet-CG) demonstrate that our\nmethod significantly outperforms state-of-the-art methods.\n","authors":["Hongxiang Li","Meng Cao","Xuxin Cheng","Zhihong Zhu","Yaowei Li","Yuexian Zou"],"pdf_url":"https://arxiv.org/pdf/2301.05997v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16016v1","updated":"2024-03-24T05:26:55Z","published":"2024-03-24T05:26:55Z","title":"Fill in the ____ (a Diffusion-based Image Inpainting Pipeline)","summary":" Image inpainting is the process of taking an image and generating lost or\nintentionally occluded portions. Inpainting has countless applications\nincluding restoring previously damaged pictures, restoring the quality of\nimages that have been degraded due to compression, and removing unwanted\nobjects/text. Modern inpainting techniques have shown remarkable ability in\ngenerating sensible completions for images with mask occlusions. In our paper,\nan overview of the progress of inpainting techniques will be provided, along\nwith identifying current leading approaches, focusing on their strengths and\nweaknesses. A critical gap in these existing models will be addressed, focusing\non the ability to prompt and control what exactly is generated. We will\nadditionally justify why we think this is the natural next progressive step\nthat inpainting models must take, and provide multiple approaches to\nimplementing this functionality. 
Finally, we will evaluate the results of our\napproaches by qualitatively checking whether they generate high-quality images\nthat correctly inpaint regions with the objects that they are instructed to\nproduce.\n","authors":["Eyoel Gebre","Krishna Saxena","Timothy Tran"],"pdf_url":"https://arxiv.org/pdf/2403.16016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13897v2","updated":"2024-03-24T05:20:15Z","published":"2023-08-26T14:50:24Z","title":"InsertNeRF: Instilling Generalizability into NeRF with HyperNet Modules","summary":" Generalizing Neural Radiance Fields (NeRF) to new scenes is a significant\nchallenge that existing approaches struggle to address without extensive\nmodifications to vanilla NeRF framework. We introduce InsertNeRF, a method for\nINStilling gEneRalizabiliTy into NeRF. By utilizing multiple plug-and-play\nHyperNet modules, InsertNeRF dynamically tailors NeRF's weights to specific\nreference scenes, transforming multi-scale sampling-aware features into\nscene-specific representations. This novel design allows for more accurate and\nefficient representations of complex appearances and geometries. Experiments\nshow that this method not only achieves superior generalization performance but\nalso provides a flexible pathway for integration with other NeRF-like systems,\neven in sparse input settings. Code will be available\nhttps://github.com/bbbbby-99/InsertNeRF.\n","authors":["Yanqi Bao","Tianyu Ding","Jing Huo","Wenbin Li","Yuxin Li","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2308.13897v2.pdf","comment":"This work was accepted at ICLR 2024"},{"id":"http://arxiv.org/abs/2403.16009v1","updated":"2024-03-24T04:39:40Z","published":"2024-03-24T04:39:40Z","title":"SM2C: Boost the Semi-supervised Segmentation for Medical Image by using\n Meta Pseudo Labels and Mixed Images","summary":" Recently, machine learning-based semantic segmentation algorithms have\ndemonstrated their potential to accurately segment regions and contours in\nmedical images, allowing the precise location of anatomical structures and\nabnormalities. Although medical images are difficult to acquire and annotate,\nsemi-supervised learning methods are efficient in dealing with the scarcity of\nlabeled data. However, overfitting is almost inevitable due to the limited\nimages for training. Furthermore, the intricate shapes of organs and lesions in\nmedical images introduce additional complexity in different cases, preventing\nnetworks from acquiring a strong ability to generalize. To this end, we\nintroduce a novel method called Scaling-up Mix with Multi-Class (SM2C). This\nmethod uses three strategies - scaling-up image size, multi-class mixing, and\nobject shape jittering - to improve the ability to learn semantic features\nwithin medical images. By diversifying the shape of the segmentation objects\nand enriching the semantic information within each sample, the SM2C\ndemonstrates its potential, especially in the training of unlabelled data.\nExtensive experiments demonstrate the effectiveness of the SM2C on three\nbenchmark medical image segmentation datasets. 
The proposed framework shows\nsignificant improvements over state-of-the-art counterparts.\n","authors":["Yifei Wang","Chuhong Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.16009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16005v1","updated":"2024-03-24T04:23:56Z","published":"2024-03-24T04:23:56Z","title":"Knowledge-Enhanced Dual-stream Zero-shot Composed Image Retrieval","summary":" We study the zero-shot Composed Image Retrieval (ZS-CIR) task, which is to\nretrieve the target image given a reference image and a description without\ntraining on the triplet datasets. Previous works generate pseudo-word tokens by\nprojecting the reference image features to the text embedding space. However,\nthey focus on the global visual representation, ignoring the representation of\ndetailed attributes, e.g., color, object number and layout. To address this\nchallenge, we propose a Knowledge-Enhanced Dual-stream zero-shot composed image\nretrieval framework (KEDs). KEDs implicitly models the attributes of the\nreference images by incorporating a database. The database enriches the\npseudo-word tokens by providing relevant images and captions, emphasizing\nshared attribute information in various aspects. In this way, KEDs recognizes\nthe reference image from diverse perspectives. Moreover, KEDs adopts an extra\nstream that aligns pseudo-word tokens with textual concepts, leveraging\npseudo-triplets mined from image-text pairs. The pseudo-word tokens generated\nin this stream are explicitly aligned with fine-grained semantics in the text\nembedding space. Extensive experiments on widely used benchmarks, i.e.\nImageNet-R, COCO object, Fashion-IQ and CIRR, show that KEDs outperforms\nprevious zero-shot composed image retrieval methods.\n","authors":["Yucheng Suo","Fan Ma","Linchao Zhu","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2403.16005v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16003v1","updated":"2024-03-24T04:22:37Z","published":"2024-03-24T04:22:37Z","title":"Diverse Representation Embedding for Lifelong Person Re-Identification","summary":" Lifelong Person Re-Identification (LReID) aims to continuously learn from\nsuccessive data streams, matching individuals across multiple cameras. The key\nchallenge for LReID is how to effectively preserve old knowledge while learning\nnew information incrementally. Task-level domain gaps and limited old task\ndatasets are key factors leading to catastrophic forgetting in ReLD, which are\noverlooked in existing methods. To alleviate this problem, we propose a novel\nDiverse Representation Embedding (DRE) framework for LReID. The proposed DRE\npreserves old knowledge while adapting to new information based on\ninstance-level and task-level layout. Concretely, an Adaptive Constraint Module\n(ACM) is proposed to implement integration and push away operations between\nmultiple representations, obtaining dense embedding subspace for each instance\nto improve matching ability on limited old task datasets. Based on the\nprocessed diverse representation, we interact knowledge between the adjustment\nmodel and the learner model through Knowledge Update (KU) and Knowledge\nPreservation (KP) strategies at the task-level layout, which reduce the\ntask-wise domain gap on both old and new tasks, and exploit diverse\nrepresentation of each instance in limited datasets from old tasks, improving\nmodel performance for extended periods. 
Extensive experiments were conducted on\neleven Re-ID datasets, including five seen datasets for training in order-1 and\norder-2 orders and six unseen datasets for inference. Compared to\nstate-of-the-art methods, our method achieves significantly improved\nperformance in holistic, large-scale, and occluded datasets.\n","authors":["Shiben Liu","Huijie Fan","Qiang Wang","Xiai Chen","Zhi Han","Yandong Tang"],"pdf_url":"https://arxiv.org/pdf/2403.16003v1.pdf","comment":"11 pages,7 Tables,3 Figures"},{"id":"http://arxiv.org/abs/2403.16002v1","updated":"2024-03-24T04:15:50Z","published":"2024-03-24T04:15:50Z","title":"SDSTrack: Self-Distillation Symmetric Adapter Learning for Multi-Modal\n Visual Object Tracking","summary":" Multimodal Visual Object Tracking (VOT) has recently gained significant\nattention due to its robustness. Early research focused on fully fine-tuning\nRGB-based trackers, which was inefficient and lacked generalized representation\ndue to the scarcity of multimodal data. Therefore, recent studies have utilized\nprompt tuning to transfer pre-trained RGB-based trackers to multimodal data.\nHowever, the modality gap limits pre-trained knowledge recall, and the\ndominance of the RGB modality persists, preventing the full utilization of\ninformation from other modalities. To address these issues, we propose a novel\nsymmetric multimodal tracking framework called SDSTrack. We introduce\nlightweight adaptation for efficient fine-tuning, which directly transfers the\nfeature extraction ability from RGB to other domains with a small number of\ntrainable parameters and integrates multimodal features in a balanced,\nsymmetric manner. Furthermore, we design a complementary masked patch\ndistillation strategy to enhance the robustness of trackers in complex\nenvironments, such as extreme weather, poor imaging, and sensor failure.\nExtensive experiments demonstrate that SDSTrack outperforms state-of-the-art\nmethods in various multimodal tracking scenarios, including RGB+Depth,\nRGB+Thermal, and RGB+Event tracking, and exhibits impressive results in extreme\nconditions. Our source code is available at https://github.com/hoqolo/SDSTrack.\n","authors":["Xiaojun Hou","Jiazheng Xing","Yijie Qian","Yaowei Guo","Shuo Xin","Junhao Chen","Kai Tang","Mengmeng Wang","Zhengkai Jiang","Liang Liu","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2403.16002v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.12494v2","updated":"2024-03-24T03:17:24Z","published":"2024-03-19T07:02:08Z","title":"Task-Customized Mixture of Adapters for General Image Fusion","summary":" General image fusion aims at integrating important information from\nmulti-source images. However, due to the significant cross-task gap, the\nrespective fusion mechanism varies considerably in practice, resulting in\nlimited performance across subtasks. To handle this problem, we propose a novel\ntask-customized mixture of adapters (TC-MoA) for general image fusion,\nadaptively prompting various fusion tasks in a unified model. We borrow the\ninsight from the mixture of experts (MoE), taking the experts as efficient\ntuning adapters to prompt a pre-trained foundation model. These adapters are\nshared across different tasks and constrained by mutual information\nregularization, ensuring compatibility with different tasks while\ncomplementarity for multi-source images. 
The task-specific routing networks\ncustomize these adapters to extract task-specific information from different\nsources with dynamic dominant intensity, performing adaptive visual feature\nprompt fusion. Notably, our TC-MoA controls the dominant intensity bias for\ndifferent fusion tasks, successfully unifying multiple fusion tasks in a single\nmodel. Extensive experiments show that TC-MoA outperforms the competing\napproaches in learning commonalities while retaining compatibility for general\nimage fusion (multi-modal, multi-exposure, and multi-focus), and also\ndemonstrating striking controllability on more generalization experiments. The\ncode is available at https://github.com/YangSun22/TC-MoA .\n","authors":["Pengfei Zhu","Yang Sun","Bing Cao","Qinghua Hu"],"pdf_url":"https://arxiv.org/pdf/2403.12494v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.15994v1","updated":"2024-03-24T03:10:39Z","published":"2024-03-24T03:10:39Z","title":"Multi-Scale Spatio-Temporal Graph Convolutional Network for Facial\n Expression Spotting","summary":" Facial expression spotting is a significant but challenging task in facial\nexpression analysis. The accuracy of expression spotting is affected not only\nby irrelevant facial movements but also by the difficulty of perceiving subtle\nmotions in micro-expressions. In this paper, we propose a Multi-Scale\nSpatio-Temporal Graph Convolutional Network (SpoT-GCN) for facial expression\nspotting. To extract more robust motion features, we track both short- and\nlong-term motion of facial muscles in compact sliding windows whose window\nlength adapts to the temporal receptive field of the network. This strategy,\ntermed the receptive field adaptive sliding window strategy, effectively\nmagnifies the motion features while alleviating the problem of severe head\nmovement. The subtle motion features are then converted to a facial graph\nrepresentation, whose spatio-temporal graph patterns are learned by a graph\nconvolutional network. This network learns both local and global features from\nmultiple scales of facial graph structures using our proposed facial local\ngraph pooling (FLGP). Furthermore, we introduce supervised contrastive learning\nto enhance the discriminative capability of our model for difficult-to-classify\nframes. The experimental results on the SAMM-LV and CAS(ME)^2 datasets\ndemonstrate that our method achieves state-of-the-art performance, particularly\nin micro-expression spotting. Ablation studies further verify the effectiveness\nof our proposed modules.\n","authors":["Yicheng Deng","Hideaki Hayashi","Hajime Nagahara"],"pdf_url":"https://arxiv.org/pdf/2403.15994v1.pdf","comment":"Accepted by FG2024"},{"id":"http://arxiv.org/abs/2403.15992v1","updated":"2024-03-24T03:10:07Z","published":"2024-03-24T03:10:07Z","title":"BIMCV-R: A Landmark Dataset for 3D CT Text-Image Retrieval","summary":" The burgeoning integration of 3D medical imaging into healthcare has led to a\nsubstantial increase in the workload of medical professionals. To assist\nclinicians in their diagnostic processes and alleviate their workload, the\ndevelopment of a robust system for retrieving similar case studies presents a\nviable solution. While the concept holds great promise, the field of 3D medical\ntext-image retrieval is currently limited by the absence of robust evaluation\nbenchmarks and curated datasets. 
To remedy this, our study presents a\ngroundbreaking dataset, BIMCV-R (This dataset will be released upon\nacceptance.), which includes an extensive collection of 8,069 3D CT volumes,\nencompassing over 2 million slices, paired with their respective radiological\nreports. Expanding upon the foundational work of our dataset, we craft a\nretrieval strategy, MedFinder. This approach employs a dual-stream network\narchitecture, harnessing the potential of large language models to advance the\nfield of medical image retrieval beyond existing text-image retrieval\nsolutions. It marks our preliminary step towards developing a system capable of\nfacilitating text-to-image, image-to-text, and keyword-based retrieval tasks.\n","authors":["Yinda Chen","Che Liu","Xiaoyu Liu","Rossella Arcucci","Zhiwei Xiong"],"pdf_url":"https://arxiv.org/pdf/2403.15992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02155v2","updated":"2024-03-24T02:57:28Z","published":"2023-12-04T18:59:55Z","title":"GPS-Gaussian: Generalizable Pixel-wise 3D Gaussian Splatting for\n Real-time Human Novel View Synthesis","summary":" We present a new approach, termed GPS-Gaussian, for synthesizing novel views\nof a character in a real-time manner. The proposed method enables 2K-resolution\nrendering under a sparse-view camera setting. Unlike the original Gaussian\nSplatting or neural implicit rendering methods that necessitate per-subject\noptimizations, we introduce Gaussian parameter maps defined on the source views\nand regress directly Gaussian Splatting properties for instant novel view\nsynthesis without any fine-tuning or optimization. To this end, we train our\nGaussian parameter regression module on a large amount of human scan data,\njointly with a depth estimation module to lift 2D parameter maps to 3D space.\nThe proposed framework is fully differentiable and experiments on several\ndatasets demonstrate that our method outperforms state-of-the-art methods while\nachieving an exceeding rendering speed.\n","authors":["Shunyuan Zheng","Boyao Zhou","Ruizhi Shao","Boning Liu","Shengping Zhang","Liqiang Nie","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2312.02155v2.pdf","comment":"Accepted by CVPR 2024. Project page:\n https://shunyuanzheng.github.io/GPS-Gaussian"},{"id":"http://arxiv.org/abs/2403.15990v1","updated":"2024-03-24T02:55:45Z","published":"2024-03-24T02:55:45Z","title":"Mars Spectrometry 2: Gas Chromatography -- Second place solution","summary":" The Mars Spectrometry 2: Gas Chromatography challenge was sponsored by NASA\nand run on the DrivenData competition platform in 2022. This report describes\nthe solution which achieved the second-best score on the competition's test\ndataset. The solution utilized two-dimensional, image-like representations of\nthe competition's chromatography data samples. A number of different\nConvolutional Neural Network models were trained and ensembled for the final\nsubmission.\n","authors":["Dmitry A. Konovalov"],"pdf_url":"https://arxiv.org/pdf/2403.15990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05656v3","updated":"2024-03-24T02:43:55Z","published":"2023-03-10T02:15:58Z","title":"EHRDiff: Exploring Realistic EHR Synthesis with Diffusion Models","summary":" Electronic health records (EHR) contain a wealth of biomedical information,\nserving as valuable resources for the development of precision medicine\nsystems. 
However, privacy concerns have resulted in limited access to\nhigh-quality and large-scale EHR data for researchers, impeding progress in\nmethodological development. Recent research has delved into synthesizing\nrealistic EHR data through generative modeling techniques, where a majority of\nproposed methods relied on generative adversarial networks (GAN) and their\nvariants for EHR synthesis. Despite GAN-based methods attaining\nstate-of-the-art performance in generating EHR data, these approaches are\ndifficult to train and prone to mode collapse. Recently introduced in\ngenerative modeling, diffusion models have established cutting-edge performance\nin image generation, but their efficacy in EHR data synthesis remains largely\nunexplored. In this study, we investigate the potential of diffusion models for\nEHR data synthesis and introduce a novel method, EHRDiff. Through extensive\nexperiments, EHRDiff establishes new state-of-the-art quality for synthetic EHR\ndata, protecting private information in the meanwhile.\n","authors":["Hongyi Yuan","Songchi Zhou","Sheng Yu"],"pdf_url":"https://arxiv.org/pdf/2303.05656v3.pdf","comment":"Accepted by TMLR, preprint of camera-ready version"},{"id":"http://arxiv.org/abs/2403.15981v1","updated":"2024-03-24T02:15:14Z","published":"2024-03-24T02:15:14Z","title":"Exploring Accurate 3D Phenotyping in Greenhouse through Neural Radiance\n Fields","summary":" Accurate collection of plant phenotyping is critical to optimising\nsustainable farming practices in precision agriculture. Traditional phenotyping\nin controlled laboratory environments, while valuable, falls short in\nunderstanding plant growth under real-world conditions. Emerging sensor and\ndigital technologies offer a promising approach for direct phenotyping of\nplants in farm environments. This study investigates a learning-based\nphenotyping method using the Neural Radiance Field to achieve accurate in-situ\nphenotyping of pepper plants in greenhouse environments. To quantitatively\nevaluate the performance of this method, traditional point cloud registration\non 3D scanning data is implemented for comparison. Experimental result shows\nthat NeRF(Neural Radiance Fields) achieves competitive accuracy compared to the\n3D scanning methods. The mean distance error between the scanner-based method\nand the NeRF-based method is 0.865mm. This study shows that the learning-based\nNeRF method achieves similar accuracy to 3D scanning-based methods but with\nimproved scalability and robustness.\n","authors":["unhong Zhao","Wei Ying","Yaoqiang Pan","Zhenfeng Yi","Chao Chen","Kewei Hu","Hanwen Kang"],"pdf_url":"https://arxiv.org/pdf/2403.15981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03695v2","updated":"2024-03-24T02:03:55Z","published":"2024-01-08T06:53:33Z","title":"A Large-Scale Empirical Study on Improving the Fairness of Image\n Classification Models","summary":" Fairness has been a critical issue that affects the adoption of deep learning\nmodels in real practice. To improve model fairness, many existing methods have\nbeen proposed and evaluated to be effective in their own contexts. However,\nthere is still no systematic evaluation among them for a comprehensive\ncomparison under the same context, which makes it hard to understand the\nperformance distinction among them, hindering the research progress and\npractical adoption of them. 
To fill this gap, this paper endeavours to conduct\nthe first large-scale empirical study to comprehensively compare the\nperformance of existing state-of-the-art fairness improving techniques.\nSpecifically, we target the widely-used application scenario of image\nclassification, and utilized three different datasets and five commonly-used\nperformance metrics to assess in total 13 methods from diverse categories. Our\nfindings reveal substantial variations in the performance of each method across\ndifferent datasets and sensitive attributes, indicating over-fitting on\nspecific datasets by many existing methods. Furthermore, different fairness\nevaluation metrics, due to their distinct focuses, yield significantly\ndifferent assessment results. Overall, we observe that pre-processing methods\nand in-processing methods outperform post-processing methods, with\npre-processing methods exhibiting the best performance. Our empirical study\noffers comprehensive recommendations for enhancing fairness in deep learning\nmodels. We approach the problem from multiple dimensions, aiming to provide a\nuniform evaluation platform and inspire researchers to explore more effective\nfairness solutions via a set of implications.\n","authors":["Junjie Yang","Jiajun Jiang","Zeyu Sun","Junjie Chen"],"pdf_url":"https://arxiv.org/pdf/2401.03695v2.pdf","comment":"Accepted by the 33rd ACM SIGSOFT International Symposium on Software\n Testing and Analysis (ISSTA 2024). Please include ISSTA in any citations"},{"id":"http://arxiv.org/abs/2012.04132v4","updated":"2024-03-24T01:23:11Z","published":"2020-12-08T00:37:35Z","title":"A Number Sense as an Emergent Property of the Manipulating Brain","summary":" The ability to understand and manipulate numbers and quantities emerges\nduring childhood, but the mechanism through which humans acquire and develop\nthis ability is still poorly understood. We explore this question through a\nmodel, assuming that the learner is able to pick up and place small objects\nfrom, and to, locations of its choosing, and will spontaneously engage in such\nundirected manipulation. We further assume that the learner's visual system\nwill monitor the changing arrangements of objects in the scene and will learn\nto predict the effects of each action by comparing perception with a\nsupervisory signal from the motor system. We model perception using standard\ndeep networks for feature extraction and classification, and gradient descent\nlearning. Our main finding is that, from learning the task of action\nprediction, an unexpected image representation emerges exhibiting regularities\nthat foreshadow the perception and representation of numbers and quantity.\nThese include distinct categories for zero and the first few natural numbers, a\nstrict ordering of the numbers, and a one-dimensional signal that correlates\nwith numerical quantity. As a result, our model acquires the ability to\nestimate numerosity, i.e. the number of objects in the scene, as well as\nsubitization, i.e. the ability to recognize at a glance the exact number of\nobjects in small scenes. Remarkably, subitization and numerosity estimation\nextrapolate to scenes containing many objects, far beyond the three objects\nused during training. We conclude that important aspects of a facility with\nnumbers and quantities may be learned with supervision from a simple\npre-training task. 
Our observations suggest that cross-modal learning is a\npowerful learning mechanism that may be harnessed in artificial intelligence.\n","authors":["Neehar Kondapaneni","Pietro Perona"],"pdf_url":"https://arxiv.org/pdf/2012.04132v4.pdf","comment":"16 pages, 5 figures, 15 supplemental figures"},{"id":"http://arxiv.org/abs/2403.15977v1","updated":"2024-03-24T01:20:08Z","published":"2024-03-24T01:20:08Z","title":"Towards Two-Stream Foveation-based Active Vision Learning","summary":" Deep neural network (DNN) based machine perception frameworks process the\nentire input in a one-shot manner to provide answers to both \"what object is\nbeing observed\" and \"where it is located\". In contrast, the \"two-stream\nhypothesis\" from neuroscience explains the neural processing in the human\nvisual cortex as an active vision system that utilizes two separate regions of\nthe brain to answer the what and the where questions. In this work, we propose\na machine learning framework inspired by the \"two-stream hypothesis\" and\nexplore the potential benefits that it offers. Specifically, the proposed\nframework models the following mechanisms: 1) ventral (what) stream focusing on\nthe input regions perceived by the fovea part of an eye (foveation), 2) dorsal\n(where) stream providing visual guidance, and 3) iterative processing of the\ntwo streams to calibrate visual focus and process the sequence of focused image\npatches. The training of the proposed framework is accomplished by label-based\nDNN training for the ventral stream model and reinforcement learning for the\ndorsal stream model. We show that the two-stream foveation-based learning is\napplicable to the challenging task of weakly-supervised object localization\n(WSOL), where the training data is limited to the object class or its\nattributes. The framework is capable of both predicting the properties of an\nobject and successfully localizing it by predicting its bounding box. We also\nshow that, due to the independent nature of the two streams, the dorsal model\ncan be applied on its own to unseen images to localize objects from different\ndatasets.\n","authors":["Timur Ibrayev","Amitangshu Mukherjee","Sai Aparna Aketi","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2403.15977v1.pdf","comment":"18 pages, 14 figures, Under consideration at IEEE Transactions on\n Cognitive and Developmental Systems"},{"id":"http://arxiv.org/abs/2403.15974v1","updated":"2024-03-24T00:46:40Z","published":"2024-03-24T00:46:40Z","title":"CBGT-Net: A Neuromimetic Architecture for Robust Classification of\n Streaming Data","summary":" This paper describes CBGT-Net, a neural network model inspired by the\ncortico-basal ganglia-thalamic (CBGT) circuits found in mammalian brains.\nUnlike traditional neural network models, which either generate an output for\neach provided input, or an output after a fixed sequence of inputs, the\nCBGT-Net learns to produce an output after a sufficient criteria for evidence\nis achieved from a stream of observed data. For each observation, the CBGT-Net\ngenerates a vector that explicitly represents the amount of evidence the\nobservation provides for each potential decision, accumulates the evidence over\ntime, and generates a decision when the accumulated evidence exceeds a\npre-defined threshold. We evaluate the proposed model on two image\nclassification tasks, where models need to predict image categories based on a\nstream of small patches extracted from the image. 
We show that the CBGT-Net\nprovides improved accuracy and robustness compared to models trained to\nclassify from a single patch, and models leveraging an LSTM layer to classify\nfrom a fixed sequence length of patches.\n","authors":["Shreya Sharma","Dana Hughes","Katia Sycara"],"pdf_url":"https://arxiv.org/pdf/2403.15974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11868v2","updated":"2024-03-24T00:11:08Z","published":"2023-10-18T10:36:34Z","title":"To Generate or Not? Safety-Driven Unlearned Diffusion Models Are Still\n Easy To Generate Unsafe Images ... For Now","summary":" The recent advances in diffusion models (DMs) have revolutionized the\ngeneration of realistic and complex images. However, these models also\nintroduce potential safety hazards, such as producing harmful content and\ninfringing data copyrights. Despite the development of safety-driven unlearning\ntechniques to counteract these challenges, doubts about their efficacy persist.\nTo tackle this issue, we introduce an evaluation framework that leverages\nadversarial prompts to discern the trustworthiness of these safety-driven DMs\nafter they have undergone the process of unlearning harmful concepts.\nSpecifically, we investigated the adversarial robustness of DMs, assessed by\nadversarial prompts, when eliminating unwanted concepts, styles, and objects.\nWe develop an effective and efficient adversarial prompt generation approach\nfor DMs, termed UnlearnDiffAtk. This method capitalizes on the intrinsic\nclassification abilities of DMs to simplify the creation of adversarial\nprompts, thereby eliminating the need for auxiliary classification or diffusion\nmodels.Through extensive benchmarking, we evaluate the robustness of five\nwidely-used safety-driven unlearned DMs (i.e., DMs after unlearning undesirable\nconcepts, styles, or objects) across a variety of tasks. Our results\ndemonstrate the effectiveness and efficiency merits of UnlearnDiffAtk over the\nstate-of-the-art adversarial prompt generation method and reveal the lack of\nrobustness of current safety-driven unlearning techniques when applied to DMs.\nCodes are available at https://github.com/OPTML-Group/Diffusion-MU-Attack.\nWARNING: This paper contains model outputs that may be offensive in nature.\n","authors":["Yimeng Zhang","Jinghan Jia","Xin Chen","Aochuan Chen","Yihua Zhang","Jiancheng Liu","Ke Ding","Sijia Liu"],"pdf_url":"https://arxiv.org/pdf/2310.11868v2.pdf","comment":"Codes are available at\n https://github.com/OPTML-Group/Diffusion-MU-Attack"},{"id":"http://arxiv.org/abs/1908.01978v2","updated":"2024-03-24T17:47:06Z","published":"2019-08-06T06:44:43Z","title":"Multi-view Deep Subspace Clustering Networks","summary":" Multi-view subspace clustering aims to discover the inherent structure of\ndata by fusing multiple views of complementary information. Most existing\nmethods first extract multiple types of handcrafted features and then learn a\njoint affinity matrix for clustering. The disadvantage of this approach lies in\ntwo aspects: 1) multi-view relations are not embedded into feature learning,\nand 2) the end-to-end learning manner of deep learning is not suitable for\nmulti-view clustering. Even when deep features have been extracted, it is a\nnontrivial problem to choose a proper backbone for clustering on different\ndatasets. To address these issues, we propose the Multi-view Deep Subspace\nClustering Networks (MvDSCN), which learns a multi-view self-representation\nmatrix in an end-to-end manner. 
The MvDSCN consists of two sub-networks, \\ie, a\ndiversity network (Dnet) and a universality network (Unet). A latent space is\nbuilt using deep convolutional autoencoders, and a self-representation matrix\nis learned in the latent space using a fully connected layer. Dnet learns\nview-specific self-representation matrices, whereas Unet learns a common\nself-representation matrix for all views. To exploit the complementarity of\nmulti-view representations, the Hilbert--Schmidt independence criterion (HSIC)\nis introduced as a diversity regularizer that captures the nonlinear,\nhigh-order inter-view relations. Because different views share the same label\nspace, the self-representation matrices of each view are aligned to the common\none by universality regularization. The MvDSCN also unifies multiple backbones\nto boost clustering performance and avoid the need for model selection.\nExperiments demonstrate the superiority of the MvDSCN.\n","authors":["Pengfei Zhu","Xinjie Yao","Yu Wang","Binyuan Hui","Dawei Du","Qinghua Hu"],"pdf_url":"https://arxiv.org/pdf/1908.01978v2.pdf","comment":"Accepted by T-CYB"}]},"2024-03-23T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.15955v1","updated":"2024-03-23T23:22:54Z","published":"2024-03-23T23:22:54Z","title":"Finding needles in a haystack: A Black-Box Approach to Invisible\n Watermark Detection","summary":" In this paper, we propose WaterMark Detection (WMD), the first invisible\nwatermark detection method under a black-box and annotation-free setting. WMD\nis capable of detecting arbitrary watermarks within a given reference dataset\nusing a clean non-watermarked dataset as a reference, without relying on\nspecific decoding methods or prior knowledge of the watermarking techniques. We\ndevelop WMD using foundations of offset learning, where a clean non-watermarked\ndataset enables us to isolate the influence of only watermarked samples in the\nreference dataset. Our comprehensive evaluations demonstrate the effectiveness\nof WMD, significantly outperforming naive detection methods, which only yield\nAUC scores around 0.5. In contrast, WMD consistently achieves impressive\ndetection AUC scores, surpassing 0.9 in most single-watermark datasets and\nexceeding 0.7 in more challenging multi-watermark scenarios across diverse\ndatasets and watermarking methods. As invisible watermarks become increasingly\nprevalent, while specific decoding techniques remain undisclosed, our approach\nprovides a versatile solution and establishes a path toward increasing\naccountability, transparency, and trust in our digital visual content.\n","authors":["Minzhou Pan","Zhengting Wang","Xin Dong","Vikash Sehwag","Lingjuan Lyu","Xue Lin"],"pdf_url":"https://arxiv.org/pdf/2403.15955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15952v1","updated":"2024-03-23T23:06:32Z","published":"2024-03-23T23:06:32Z","title":"IllusionVQA: A Challenging Optical Illusion Dataset for Vision Language\n Models","summary":" The advent of Vision Language Models (VLM) has allowed researchers to\ninvestigate the visual understanding of a neural network using natural\nlanguage. Beyond object classification and detection, VLMs are capable of\nvisual comprehension and common-sense reasoning. 
This naturally led to the\nquestion: How do VLMs respond when the image itself is inherently unreasonable?\nTo this end, we present IllusionVQA: a diverse dataset of challenging optical\nillusions and hard-to-interpret scenes to test the capability of VLMs in two\ndistinct multiple-choice VQA tasks - comprehension and soft localization.\nGPT4V, the best-performing VLM, achieves 62.99% accuracy (4-shot) on the\ncomprehension task and 49.7% on the localization task (4-shot and\nChain-of-Thought). Human evaluation reveals that humans achieve 91.03% and 100%\naccuracy in comprehension and localization. We discover that In-Context\nLearning (ICL) and Chain-of-Thought reasoning substantially degrade the\nperformance of GeminiPro on the localization task. Tangentially, we discover a\npotential weakness in the ICL capabilities of VLMs: they fail to locate optical\nillusions even when the correct answer is in the context window as a few-shot\nexample.\n","authors":["Haz Sameen Shahgir","Khondker Salman Sayeed","Abhik Bhattacharjee","Wasi Uddin Ahmad","Yue Dong","Rifat Shahriyar"],"pdf_url":"https://arxiv.org/pdf/2403.15952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15951v1","updated":"2024-03-23T23:05:25Z","published":"2024-03-23T23:05:25Z","title":"MapTracker: Tracking with Strided Memory Fusion for Consistent Vector HD\n Mapping","summary":" This paper presents a vector HD-mapping algorithm that formulates the mapping\nas a tracking task and uses a history of memory latents to ensure consistent\nreconstructions over time. Our method, MapTracker, accumulates a sensor stream\ninto memory buffers of two latent representations: 1) Raster latents in the\nbird's-eye-view (BEV) space and 2) Vector latents over the road elements (i.e.,\npedestrian-crossings, lane-dividers, and road-boundaries). The approach borrows\nthe query propagation paradigm from the tracking literature that explicitly\nassociates tracked road elements from the previous frame to the current, while\nfusing a subset of memory latents selected with distance strides to further\nenhance temporal consistency. A vector latent is decoded to reconstruct the\ngeometry of a road element. The paper further makes benchmark contributions by\n1) Improving processing code for existing datasets to produce consistent ground\ntruth with temporal alignments and 2) Augmenting existing mAP metrics with\nconsistency checks. MapTracker significantly outperforms existing methods on\nboth nuScenes and Agroverse2 datasets by over 8% and 19% on the conventional\nand the new consistency-aware metrics, respectively. The code will be available\non our project page: https://map-tracker.github.io.\n","authors":["Jiacheng Chen","Yuefan Wu","Jiaqi Tan","Hang Ma","Yasutaka Furukawa"],"pdf_url":"https://arxiv.org/pdf/2403.15951v1.pdf","comment":"Project page: https://map-tracker.github.io"},{"id":"http://arxiv.org/abs/2310.09484v2","updated":"2024-03-23T23:04:00Z","published":"2023-10-14T04:11:01Z","title":"Fast-DiM: Towards Fast Diffusion Morphs","summary":" Diffusion Morphs (DiM) are a recent state-of-the-art method for creating high\nquality face morphs; however, they require a high number of network function\nevaluations (NFE) to create the morphs.We propose a new DiM pipeline, Fast-DiM,\nwhich can create morphs of a similar quality but with lower NFE. We investigate\nthe ODE solvers used to solve the Probability Flow ODE and the impact they have\non the the creation of face morphs. 
Additionally, we employ an alternative\nmethod for encoding images into the latent space of the Diffusion model by\nsolving the Probability Flow ODE as time runs forwards. Our experiments show\nthat we can reduce the NFE by upwards of 85% in the encoding process while\nexperiencing only 1.6% reduction in Mated Morph Presentation Match Rate\n(MMPMR). Likewise, we showed we could cut NFE, in the sampling process, in half\nwith only a maximal reduction of 0.23% in MMPMR.\n","authors":["Zander W. Blasingame","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2310.09484v2.pdf","comment":"Revised manuscript. Under review for publication"},{"id":"http://arxiv.org/abs/2403.15947v1","updated":"2024-03-23T22:32:06Z","published":"2024-03-23T22:32:06Z","title":"Deep Domain Adaptation: A Sim2Real Neural Approach for Improving\n Eye-Tracking Systems","summary":" Eye image segmentation is a critical step in eye tracking that has great\ninfluence over the final gaze estimate. Segmentation models trained using\nsupervised machine learning can excel at this task, their effectiveness is\ndetermined by the degree of overlap between the narrow distributions of image\nproperties defined by the target dataset and highly specific training datasets,\nof which there are few. Attempts to broaden the distribution of existing eye\nimage datasets through the inclusion of synthetic eye images have found that a\nmodel trained on synthetic images will often fail to generalize back to\nreal-world eye images. In remedy, we use dimensionality-reduction techniques to\nmeasure the overlap between the target eye images and synthetic training data,\nand to prune the training dataset in a manner that maximizes distribution\noverlap. We demonstrate that our methods result in robust, improved performance\nwhen tackling the discrepancy between simulation and real-world data samples.\n","authors":["Viet Dung Nguyen","Reynold Bailey","Gabriel J. Diaz","Chengyi Ma","Alexander Fix","Alexander Ororbia"],"pdf_url":"https://arxiv.org/pdf/2403.15947v1.pdf","comment":"14 pages, 8 figures, accepted to ETRA 2024"},{"id":"http://arxiv.org/abs/2403.15944v1","updated":"2024-03-23T22:14:38Z","published":"2024-03-23T22:14:38Z","title":"Adaptive Super Resolution For One-Shot Talking-Head Generation","summary":" The one-shot talking-head generation learns to synthesize a talking-head\nvideo with one source portrait image under the driving of same or different\nidentity video. Usually these methods require plane-based pixel transformations\nvia Jacobin matrices or facial image warps for novel poses generation. The\nconstraints of using a single image source and pixel displacements often\ncompromise the clarity of the synthesized images. Some methods try to improve\nthe quality of synthesized videos by introducing additional super-resolution\nmodules, but this will undoubtedly increase computational consumption and\ndestroy the original data distribution. In this work, we propose an adaptive\nhigh-quality talking-head video generation method, which synthesizes\nhigh-resolution video without additional pre-trained modules. Specifically,\ninspired by existing super-resolution methods, we down-sample the one-shot\nsource image, and then adaptively reconstruct high-frequency details via an\nencoder-decoder module, resulting in enhanced video clarity. Our method\nconsistently improves the quality of generated videos through a straightforward\nyet effective strategy, substantiated by quantitative and qualitative\nevaluations. 
The code and demo video are available on:\n\\url{https://github.com/Songluchuan/AdaSR-TalkingHead/}.\n","authors":["Luchuan Song","Pinxin Liu","Guojun Yin","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.15944v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.15943v1","updated":"2024-03-23T22:07:32Z","published":"2024-03-23T22:07:32Z","title":"Feature Manipulation for DDPM based Change Detection","summary":" Change Detection is a classic task of computer vision that receives a\nbi-temporal image pair as input and separates the semantically changed and\nunchanged regions of it. The diffusion model is used in image synthesis and as\na feature extractor and has been applied to various downstream tasks. Using\nthis, a feature map is extracted from the pre-trained diffusion model from the\nlarge-scale data set, and changes are detected through the additional network.\nOn the one hand, the current diffusion-based change detection approach focuses\nonly on extracting a good feature map using the diffusion model. It obtains and\nuses differences without further adjustment to the created feature map. Our\nmethod focuses on manipulating the feature map extracted from the Diffusion\nModel to be more semantically useful, and for this, we propose two methods:\nFeature Attention and FDAF. Our model with Feature Attention achieved a\nstate-of-the-art F1 score (90.18) and IoU (83.86) on the LEVIR-CD dataset.\n","authors":["Zhenglin Li","Yangchen Huang","Mengran Zhu","Jingyu Zhang","JingHao Chang","Houze Liu"],"pdf_url":"https://arxiv.org/pdf/2403.15943v1.pdf","comment":"This paper has been accepted by the 2024 5th International Conference\n on Computer Vision, Image and Deep Learning"},{"id":"http://arxiv.org/abs/2403.15941v1","updated":"2024-03-23T22:04:03Z","published":"2024-03-23T22:04:03Z","title":"Explore until Confident: Efficient Exploration for Embodied Question\n Answering","summary":" We consider the problem of Embodied Question Answering (EQA), which refers to\nsettings where an embodied agent such as a robot needs to actively explore an\nenvironment to gather information until it is confident about the answer to a\nquestion. In this work, we leverage the strong semantic reasoning capabilities\nof large vision-language models (VLMs) to efficiently explore and answer such\nquestions. However, there are two main challenges when using VLMs in EQA: they\ndo not have an internal memory for mapping the scene to be able to plan how to\nexplore over time, and their confidence can be miscalibrated and can cause the\nrobot to prematurely stop exploration or over-explore. We propose a method that\nfirst builds a semantic map of the scene based on depth information and via\nvisual prompting of a VLM - leveraging its vast knowledge of relevant regions\nof the scene for exploration. Next, we use conformal prediction to calibrate\nthe VLM's question answering confidence, allowing the robot to know when to\nstop exploration - leading to a more calibrated and efficient exploration\nstrategy. To test our framework in simulation, we also contribute a new EQA\ndataset with diverse, realistic human-robot scenarios and scenes built upon the\nHabitat-Matterport 3D Research Dataset (HM3D). Both simulated and real robot\nexperiments show our proposed approach improves the performance and efficiency\nover baselines that do no leverage VLM for exploration or do not calibrate its\nconfidence. Webpage with experiment videos and code:\nhttps://explore-eqa.github.io/\n","authors":["Allen Z. 
Ren","Jaden Clark","Anushri Dixit","Masha Itkina","Anirudha Majumdar","Dorsa Sadigh"],"pdf_url":"https://arxiv.org/pdf/2403.15941v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2403.15931v1","updated":"2024-03-23T20:30:28Z","published":"2024-03-23T20:30:28Z","title":"X-Portrait: Expressive Portrait Animation with Hierarchical Motion\n Attention","summary":" We propose X-Portrait, an innovative conditional diffusion model tailored for\ngenerating expressive and temporally coherent portrait animation. Specifically,\ngiven a single portrait as appearance reference, we aim to animate it with\nmotion derived from a driving video, capturing both highly dynamic and subtle\nfacial expressions along with wide-range head movements. As its core, we\nleverage the generative prior of a pre-trained diffusion model as the rendering\nbackbone, while achieve fine-grained head pose and expression control with\nnovel controlling signals within the framework of ControlNet. In contrast to\nconventional coarse explicit controls such as facial landmarks, our motion\ncontrol module is learned to interpret the dynamics directly from the original\ndriving RGB inputs. The motion accuracy is further enhanced with a patch-based\nlocal control module that effectively enhance the motion attention to\nsmall-scale nuances like eyeball positions. Notably, to mitigate the identity\nleakage from the driving signals, we train our motion control modules with\nscaling-augmented cross-identity images, ensuring maximized disentanglement\nfrom the appearance reference modules. Experimental results demonstrate the\nuniversal effectiveness of X-Portrait across a diverse range of facial\nportraits and expressive driving sequences, and showcase its proficiency in\ngenerating captivating portrait animations with consistently maintained\nidentity characteristics.\n","authors":["You Xie","Hongyi Xu","Guoxian Song","Chao Wang","Yichun Shi","Linjie Luo"],"pdf_url":"https://arxiv.org/pdf/2403.15931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15918v1","updated":"2024-03-23T19:21:31Z","published":"2024-03-23T19:21:31Z","title":"An Embarrassingly Simple Defense Against Backdoor Attacks On SSL","summary":" Self Supervised Learning (SSL) has emerged as a powerful paradigm to tackle\ndata landscapes with absence of human supervision. The ability to learn\nmeaningful tasks without the use of labeled data makes SSL a popular method to\nmanage large chunks of data in the absence of labels. However, recent work\nindicates SSL to be vulnerable to backdoor attacks, wherein models can be\ncontrolled, possibly maliciously, to suit an adversary's motives. Li et.al\n(2022) introduce a novel frequency-based backdoor attack: CTRL. They show that\nCTRL can be used to efficiently and stealthily gain control over a victim's\nmodel trained using SSL. In this work, we devise two defense strategies against\nfrequency-based attacks in SSL: One applicable before model training and the\nsecond to be applied during model inference. Our first contribution utilizes\nthe invariance property of the downstream task to defend against backdoor\nattacks in a generalizable fashion. We observe the ASR (Attack Success Rate) to\nreduce by over 60% across experiments. Our Inference-time defense relies on\nevasiveness of the attack and uses the luminance channel to defend against\nattacks. Using object classification as the downstream task for SSL, we\ndemonstrate successful defense strategies that do not require re-training of\nthe model. 
Code is available at https://github.com/Aryan-Satpathy/Backdoor.\n","authors":["Aryan Satpathy"," Nilaksh","Dhruva Rajwade"],"pdf_url":"https://arxiv.org/pdf/2403.15918v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.15905v1","updated":"2024-03-23T18:19:02Z","published":"2024-03-23T18:19:02Z","title":"Towards Low-Energy Adaptive Personalization for Resource-Constrained\n Devices","summary":" The personalization of machine learning (ML) models to address data drift is\na significant challenge in the context of Internet of Things (IoT)\napplications. Presently, most approaches focus on fine-tuning either the full\nbase model or its last few layers to adapt to new data, while often neglecting\nenergy costs. However, various types of data drift exist, and fine-tuning the\nfull base model or the last few layers may not result in optimal performance in\ncertain scenarios. We propose Target Block Fine-Tuning (TBFT), a low-energy\nadaptive personalization framework designed for resource-constrained devices.\nWe categorize data drift and personalization into three types: input-level,\nfeature-level, and output-level. For each type, we fine-tune different blocks\nof the model to achieve optimal performance with reduced energy costs.\nSpecifically, input-, feature-, and output-level correspond to fine-tuning the\nfront, middle, and rear blocks of the model. We evaluate TBFT on a ResNet\nmodel, three datasets, three different training sizes, and a Raspberry Pi.\nCompared with the $Block Avg$, where each block is fine-tuned individually and\ntheir performance improvements are averaged, TBFT exhibits an improvement in\nmodel accuracy by an average of 15.30% whilst saving 41.57% energy consumption\non average compared with full fine-tuning.\n","authors":["Yushan Huang","Josh Millar","Yuxuan Long","Yuchen Zhao","Hamed Hadaddi"],"pdf_url":"https://arxiv.org/pdf/2403.15905v1.pdf","comment":"Accepetd to The 4th Workshop on Machine Learning and Systems\n (EuroMLSys '24)"},{"id":"http://arxiv.org/abs/2403.15901v1","updated":"2024-03-23T18:04:58Z","published":"2024-03-23T18:04:58Z","title":"MatchSeg: Towards Better Segmentation via Reference Image Matching","summary":" Recently, automated medical image segmentation methods based on deep learning\nhave achieved great success. However, they heavily rely on large annotated\ndatasets, which are costly and time-consuming to acquire. Few-shot learning\naims to overcome the need for annotated data by using a small labeled dataset,\nknown as a support set, to guide predicting labels for new, unlabeled images,\nknown as the query set. Inspired by this paradigm, we introduce MatchSeg, a\nnovel framework that enhances medical image segmentation through strategic\nreference image matching. We leverage contrastive language-image pre-training\n(CLIP) to select highly relevant samples when defining the support set.\nAdditionally, we design a joint attention module to strengthen the interaction\nbetween support and query features, facilitating a more effective knowledge\ntransfer between support and query sets. We validated our method across four\npublic datasets. Experimental results demonstrate superior segmentation\nperformance and powerful domain generalization ability of MatchSeg against\nexisting methods for domain-specific and cross-domain segmentation tasks. 
Our\ncode is made available at https://github.com/keeplearning-again/MatchSeg\n","authors":["Ruiqiang Xiao","Jiayu Huo","Haotian Zheng","Yang Liu","Sebastien Ourselin","Rachel Sparks"],"pdf_url":"https://arxiv.org/pdf/2403.15901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10959v3","updated":"2024-03-23T17:36:19Z","published":"2023-11-18T03:39:02Z","title":"Structure-Aware Sparse-View X-ray 3D Reconstruction","summary":" X-ray, known for its ability to reveal internal structures of objects, is\nexpected to provide richer information for 3D reconstruction than visible\nlight. Yet, existing neural radiance fields (NeRF) algorithms overlook this\nimportant nature of X-ray, leading to their limitations in capturing structural\ncontents of imaged objects. In this paper, we propose a framework,\nStructure-Aware X-ray Neural Radiodensity Fields (SAX-NeRF), for sparse-view\nX-ray 3D reconstruction. Firstly, we design a Line Segment-based Transformer\n(Lineformer) as the backbone of SAX-NeRF. Linefomer captures internal\nstructures of objects in 3D space by modeling the dependencies within each line\nsegment of an X-ray. Secondly, we present a Masked Local-Global (MLG) ray\nsampling strategy to extract contextual and geometric information in 2D\nprojection. Plus, we collect a larger-scale dataset X3D covering wider X-ray\napplications. Experiments on X3D show that SAX-NeRF surpasses previous\nNeRF-based methods by 12.56 and 2.49 dB on novel view synthesis and CT\nreconstruction. Code, models, and data are released at\nhttps://github.com/caiyuanhao1998/SAX-NeRF\n","authors":["Yuanhao Cai","Jiahao Wang","Alan Yuille","Zongwei Zhou","Angtian Wang"],"pdf_url":"https://arxiv.org/pdf/2311.10959v3.pdf","comment":"CVPR 2024; The first Transformer-based method for X-ray and CT 3D\n reconstruction"},{"id":"http://arxiv.org/abs/2403.15891v1","updated":"2024-03-23T17:17:08Z","published":"2024-03-23T17:17:08Z","title":"Human Motion Prediction under Unexpected Perturbation","summary":" We investigate a new task in human motion prediction, which is predicting\nmotions under unexpected physical perturbation potentially involving multiple\npeople. Compared with existing research, this task involves predicting less\ncontrolled, unpremeditated and pure reactive motions in response to external\nimpact and how such motions can propagate through people. It brings new\nchallenges such as data scarcity and predicting complex interactions. To this\nend, we propose a new method capitalizing differential physics and deep neural\nnetworks, leading to an explicit Latent Differential Physics (LDP) model.\nThrough experiments, we demonstrate that LDP has high data efficiency,\noutstanding prediction accuracy, strong generalizability and good\nexplainability. 
Since there is no similar research, a comprehensive comparison\nwith 11 adapted baselines from several relevant domains is conducted, showing\nLDP outperforming existing research both quantitatively and qualitatively,\nimproving prediction accuracy by as much as 70%, and demonstrating\nsignificantly stronger generalization.\n","authors":["Jiangbei Yue","Baiyi Li","Julien Pettré","Armin Seyfried","He Wang"],"pdf_url":"https://arxiv.org/pdf/2403.15891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15852v4","updated":"2024-03-23T16:54:01Z","published":"2024-02-24T16:39:16Z","title":"NaVid: Video-based VLM Plans the Next Step for Vision-and-Language\n Navigation","summary":" Vision-and-Language Navigation (VLN) stands as a key research problem of\nEmbodied AI, aiming at enabling agents to navigate in unseen environments\nfollowing linguistic instructions. In this field, generalization is a\nlong-standing challenge, either to out-of-distribution scenes or from Sim to\nReal. In this paper, we propose NaVid, a video-based large vision language\nmodel (VLM), to mitigate such a generalization gap. NaVid makes the first\nendeavour to showcase the capability of VLMs to achieve state-of-the-art level\nnavigation performance without any maps, odometer and depth inputs. Following\nhuman instruction, NaVid only requires an on-the-fly video stream from a\nmonocular RGB camera equipped on the robot to output the next-step action. Our\nformulation mimics how humans navigate and naturally gets rid of the problems\nintroduced by odometer noises, and the Sim2Real gaps from map or depth inputs.\nMoreover, our video-based approach can effectively encode the historical\nobservations of robots as spatio-temporal contexts for decision-making and\ninstruction following. We train NaVid with 550k navigation samples collected\nfrom VLN-CE trajectories, including action-planning and instruction-reasoning\nsamples, along with 665k large-scale web data. Extensive experiments show that\nNaVid achieves SOTA performance in simulation environments and the real world,\ndemonstrating superior cross-dataset and Sim2Real transfer. We thus believe our\nproposed VLM approach plans the next step for not only the navigation agents\nbut also this research field.\n","authors":["Jiazhao Zhang","Kunyu Wang","Rongtao Xu","Gengze Zhou","Yicong Hong","Xiaomeng Fang","Qi Wu","Zhizheng Zhang","He Wang"],"pdf_url":"https://arxiv.org/pdf/2402.15852v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06623v3","updated":"2024-03-23T16:27:17Z","published":"2023-11-11T17:52:06Z","title":"VT-Former: An Exploratory Study on Vehicle Trajectory Prediction for\n Highway Surveillance through Graph Isomorphism and Transformer","summary":" Enhancing roadway safety has become an essential computer vision focus area\nfor Intelligent Transportation Systems (ITS). As a part of ITS, Vehicle\nTrajectory Prediction (VTP) aims to forecast a vehicle's future positions based\non its past and current movements. VTP is a pivotal element for road safety,\naiding in applications such as traffic management, accident prevention,\nwork-zone safety, and energy optimization. While most works in this field focus\non autonomous driving, with the growing number of surveillance cameras, another\nsub-field emerges for surveillance VTP with its own set of challenges. In this\npaper, we introduce VT-Former, a novel transformer-based VTP approach for\nhighway safety and surveillance. 
In addition to utilizing transformers to\ncapture long-range temporal patterns, a new Graph Attentive Tokenization (GAT)\nmodule has been proposed to capture intricate social interactions among\nvehicles. This study seeks to explore both the advantages and the limitations\ninherent in combining transformer architecture with graphs for VTP. Our\ninvestigation, conducted across three benchmark datasets from diverse\nsurveillance viewpoints, showcases the State-of-the-Art (SotA) or comparable\nperformance of VT-Former in predicting vehicle trajectories. This study\nunderscores the potentials of VT-Former and its architecture, opening new\navenues for future research and exploration.\n","authors":["Armin Danesh Pazho","Ghazal Alinezhad Noghre","Vinit Katariya","Hamed Tabkhi"],"pdf_url":"https://arxiv.org/pdf/2311.06623v3.pdf","comment":"Completely updated based on the reviews received for the paper"},{"id":"http://arxiv.org/abs/2403.15878v1","updated":"2024-03-23T16:08:48Z","published":"2024-03-23T16:08:48Z","title":"Diffusion-based Aesthetic QR Code Generation via Scanning-Robust\n Perceptual Guidance","summary":" QR codes, prevalent in daily applications, lack visual appeal due to their\nconventional black-and-white design. Integrating aesthetics while maintaining\nscannability poses a challenge. In this paper, we introduce a novel\ndiffusion-model-based aesthetic QR code generation pipeline, utilizing\npre-trained ControlNet and guided iterative refinement via a novel classifier\nguidance (SRG) based on the proposed Scanning-Robust Loss (SRL) tailored with\nQR code mechanisms, which ensures both aesthetics and scannability. To further\nimprove the scannability while preserving aesthetics, we propose a two-stage\npipeline with Scanning-Robust Perceptual Guidance (SRPG). Moreover, we can\nfurther enhance the scannability of the generated QR code by post-processing it\nthrough the proposed Scanning-Robust Projected Gradient Descent (SRPGD)\npost-processing technique based on SRL with proven convergence. With extensive\nquantitative, qualitative, and subjective experiments, the results demonstrate\nthat the proposed approach can generate diverse aesthetic QR codes with\nflexibility in detail. In addition, our pipelines outperforming existing models\nin terms of Scanning Success Rate (SSR) 86.67% (+40%) with comparable aesthetic\nscores. The pipeline combined with SRPGD further achieves 96.67% (+50%). Our\ncode will be available https://github.com/jwliao1209/DiffQRCode.\n","authors":["Jia-Wei Liao","Winston Wang","Tzu-Sian Wang","Li-Xuan Peng","Cheng-Fu Chou","Jun-Cheng Chen"],"pdf_url":"https://arxiv.org/pdf/2403.15878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15876v1","updated":"2024-03-23T15:53:00Z","published":"2024-03-23T15:53:00Z","title":"Cognitive resilience: Unraveling the proficiency of image-captioning\n models to interpret masked visual content","summary":" This study explores the ability of Image Captioning (IC) models to decode\nmasked visual content sourced from diverse datasets. Our findings reveal the IC\nmodel's capability to generate captions from masked images, closely resembling\nthe original content. Notably, even in the presence of masks, the model adeptly\ncrafts descriptive textual information that goes beyond what is observable in\nthe original image-generated captions. 
While the decoding performance of the IC\nmodel experiences a decline with an increase in the masked region's area, the\nmodel still performs well when important regions of the image are not masked at\nhigh coverage.\n","authors":["Zhicheng Du","Zhaotian Xie","Huazhang Ying","Likun Zhang","Peiwu Qin"],"pdf_url":"https://arxiv.org/pdf/2403.15876v1.pdf","comment":"Accepted as tiny paper in ICLR 2024"},{"id":"http://arxiv.org/abs/2110.10494v2","updated":"2024-03-23T14:52:51Z","published":"2021-10-20T11:16:00Z","title":"Deep Point Cloud Normal Estimation via Triplet Learning","summary":" Normal estimation on 3D point clouds is a fundamental problem in 3D vision\nand graphics. Current methods often show limited accuracy in predicting normals\nat sharp features (e.g., edges and corners) and less robustness to noise. In\nthis paper, we propose a novel normal estimation method for point clouds. It\nconsists of two phases: (a) feature encoding which learns representations of\nlocal patches, and (b) normal estimation that takes the learned representation\nas input and regresses the normal vector. We are motivated that local patches\non isotropic and anisotropic surfaces have similar or distinct normals, and\nthat separable features or representations can be learned to facilitate normal\nestimation. To realise this, we first construct triplets of local patches on 3D\npoint cloud data, and design a triplet network with a triplet loss for feature\nencoding. We then design a simple network with several MLPs and a loss function\nto regress the normal vector. Despite having a smaller network size compared to\nmost other methods, experimental results show that our method preserves sharp\nfeatures and achieves better normal estimation results on CAD-like shapes.\n","authors":["Weijia Wang","Xuequan Lu","Dasith de Silva Edirimuni","Xiao Liu","Antonio Robles-Kelly"],"pdf_url":"https://arxiv.org/pdf/2110.10494v2.pdf","comment":"Accepted by ICME 2022. Supplementary material available at\n https://ieeexplore.ieee.org/document/9859844/media#media"},{"id":"http://arxiv.org/abs/2309.06380v2","updated":"2024-03-23T14:22:39Z","published":"2023-09-12T16:42:09Z","title":"InstaFlow: One Step is Enough for High-Quality Diffusion-Based\n Text-to-Image Generation","summary":" Diffusion models have revolutionized text-to-image generation with its\nexceptional quality and creativity. However, its multi-step sampling process is\nknown to be slow, often requiring tens of inference steps to obtain\nsatisfactory results. Previous attempts to improve its sampling speed and\nreduce computational costs through distillation have been unsuccessful in\nachieving a functional one-step model. In this paper, we explore a recent\nmethod called Rectified Flow, which, thus far, has only been applied to small\ndatasets. The core of Rectified Flow lies in its \\emph{reflow} procedure, which\nstraightens the trajectories of probability flows, refines the coupling between\nnoises and images, and facilitates the distillation process with student\nmodels. We propose a novel text-conditioned pipeline to turn Stable Diffusion\n(SD) into an ultra-fast one-step model, in which we find reflow plays a\ncritical role in improving the assignment between noise and images. 
Leveraging\nour new pipeline, we create, to the best of our knowledge, the first one-step\ndiffusion-based text-to-image generator with SD-level image quality, achieving\nan FID (Frechet Inception Distance) of $23.3$ on MS COCO 2017-5k, surpassing\nthe previous state-of-the-art technique, progressive distillation, by a\nsignificant margin ($37.2$ $\\rightarrow$ $23.3$ in FID). By utilizing an\nexpanded network with 1.7B parameters, we further improve the FID to $22.4$. We\ncall our one-step models \\emph{InstaFlow}. On MS COCO 2014-30k, InstaFlow\nyields an FID of $13.1$ in just $0.09$ second, the best in $\\leq 0.1$ second\nregime, outperforming the recent StyleGAN-T ($13.9$ in $0.1$ second). Notably,\nthe training of InstaFlow only costs 199 A100 GPU days. Codes and pre-trained\nmodels are available at \\url{github.com/gnobitab/InstaFlow}.\n","authors":["Xingchao Liu","Xiwen Zhang","Jianzhu Ma","Jian Peng","Qiang Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06380v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.15853v1","updated":"2024-03-23T14:16:26Z","published":"2024-03-23T14:16:26Z","title":"An edge detection-based deep learning approach for tear meniscus height\n measurement","summary":" Automatic measurements of tear meniscus height (TMH) have been achieved by\nusing deep learning techniques; however, annotation is significantly influenced\nby subjective factors and is both time-consuming and labor-intensive. In this\npaper, we introduce an automatic TMH measurement technique based on edge\ndetection-assisted annotation within a deep learning framework. This method\ngenerates mask labels less affected by subjective factors with enhanced\nefficiency compared to previous annotation approaches. For improved\nsegmentation of the pupil and tear meniscus areas, the convolutional neural\nnetwork Inceptionv3 was first implemented as an image quality assessment model,\neffectively identifying higher-quality images with an accuracy of 98.224%.\nSubsequently, by using the generated labels, various algorithms, including\nUnet, ResUnet, Deeplabv3+FcnResnet101, Deeplabv3+FcnResnet50, FcnResnet50, and\nFcnResnet101 were trained, with Unet demonstrating the best performance.\nFinally, Unet was used for automatic pupil and tear meniscus segmentation to\nlocate the center of the pupil and calculate TMH,respectively. An evaluation of\nthe mask quality predicted by Unet indicated a Mean Intersection over Union of\n0.9362, a recall of 0.9261, a precision of 0.9423, and an F1-Score of 0.9326.\nAdditionally, the TMH predicted by the model was assessed, with the fitting\ncurve represented as y= 0.982x-0.862, an overall correlation coefficient of\nr^2=0.961 , and an accuracy of 94.80% (237/250). In summary, the algorithm can\nautomatically screen images based on their quality,segment the pupil and tear\nmeniscus areas, and automatically measure TMH. 
Measurement results using the AI\nalgorithm demonstrate a high level of consistency with manual measurements,\noffering significant support to clinical doctors in diagnosing dry eye disease.\n","authors":["Kesheng Wang","Kunhui Xu","Xiaoyu Chen","Chunlei He","Jianfeng Zhang","Dexing Kong","Qi Dai","Shoujun Huang"],"pdf_url":"https://arxiv.org/pdf/2403.15853v1.pdf","comment":"22 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.15849v1","updated":"2024-03-23T13:52:16Z","published":"2024-03-23T13:52:16Z","title":"Inpainting-Driven Mask Optimization for Object Removal","summary":" This paper proposes a mask optimization method for improving the quality of\nobject removal using image inpainting. While many inpainting methods are\ntrained with a set of random masks, a target for inpainting may be an object,\nsuch as a person, in many realistic scenarios. This domain gap between masks in\ntraining and inference images increases the difficulty of the inpainting task.\nIn our method, this domain gap is resolved by training the inpainting network\nwith object masks extracted by segmentation, and such object masks are also\nused in the inference step. Furthermore, to optimize the object masks for\ninpainting, the segmentation network is connected to the inpainting network and\nend-to-end trained to improve the inpainting performance. The effect of this\nend-to-end training is further enhanced by our mask expansion loss for\nachieving the trade-off between large and small masks. Experimental results\ndemonstrate the effectiveness of our method for better object removal using\nimage inpainting.\n","authors":["Kodai Shimosato","Norimichi Ukita"],"pdf_url":"https://arxiv.org/pdf/2403.15849v1.pdf","comment":"Accepted to IJCNN 2024 (International Joint Conference on Neural\n Networks)"},{"id":"http://arxiv.org/abs/2312.02134v2","updated":"2024-03-23T13:24:39Z","published":"2023-12-04T18:55:45Z","title":"GaussianAvatar: Towards Realistic Human Avatar Modeling from a Single\n Video via Animatable 3D Gaussians","summary":" We present GaussianAvatar, an efficient approach to creating realistic human\navatars with dynamic 3D appearances from a single video. We start by\nintroducing animatable 3D Gaussians to explicitly represent humans in various\nposes and clothing styles. Such an explicit and animatable representation can\nfuse 3D appearances more efficiently and consistently from 2D observations. Our\nrepresentation is further augmented with dynamic properties to support\npose-dependent appearance modeling, where a dynamic appearance network along\nwith an optimizable feature tensor is designed to learn the\nmotion-to-appearance mapping. Moreover, by leveraging the differentiable motion\ncondition, our method enables a joint optimization of motions and appearances\nduring avatar modeling, which helps to tackle the long-standing issue of\ninaccurate motion estimation in monocular settings. 
The efficacy of\nGaussianAvatar is validated on both the public dataset and our collected\ndataset, demonstrating its superior performances in terms of appearance quality\nand rendering efficiency.\n","authors":["Liangxiao Hu","Hongwen Zhang","Yuxiang Zhang","Boyao Zhou","Boning Liu","Shengping Zhang","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2312.02134v2.pdf","comment":"Project Page: https://huliangxiao.github.io/GaussianAvatar"},{"id":"http://arxiv.org/abs/2403.15837v1","updated":"2024-03-23T13:24:31Z","published":"2024-03-23T13:24:31Z","title":"Centered Masking for Language-Image Pre-Training","summary":" We introduce Gaussian masking for Language-Image Pre-Training (GLIP) a novel,\nstraightforward, and effective technique for masking image patches during\npre-training of a vision-language model. GLIP builds on Fast Language-Image\nPre-Training (FLIP), which randomly masks image patches while training a CLIP\nmodel. GLIP replaces random masking with centered masking, that uses a Gaussian\ndistribution and is inspired by the importance of image patches at the center\nof the image. GLIP retains the same computational savings as FLIP, while\nimproving performance across a range of downstream datasets and tasks, as\ndemonstrated by our experimental results. We show the benefits of GLIP to be\neasy to obtain, requiring no delicate tuning of the Gaussian, and also\napplicable to data sets containing images without an obvious center focus.\n","authors":["Mingliang Liang","Martha Larson"],"pdf_url":"https://arxiv.org/pdf/2403.15837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15836v1","updated":"2024-03-23T13:24:30Z","published":"2024-03-23T13:24:30Z","title":"VLM-CPL: Consensus Pseudo Labels from Vision-Language Models for Human\n Annotation-Free Pathological Image Classification","summary":" Despite that deep learning methods have achieved remarkable performance in\npathology image classification, they heavily rely on labeled data, demanding\nextensive human annotation efforts. In this study, we present a novel human\nannotation-free method for pathology image classification by leveraging\npre-trained Vision-Language Models (VLMs). Without human annotation, pseudo\nlabels of the training set are obtained by utilizing the zero-shot inference\ncapabilities of VLM, which may contain a lot of noise due to the domain shift\nbetween the pre-training data and the target dataset. To address this issue, we\nintroduce VLM-CPL, a novel approach based on consensus pseudo labels that\nintegrates two noisy label filtering techniques with a semi-supervised learning\nstrategy. Specifically, we first obtain prompt-based pseudo labels with\nuncertainty estimation by zero-shot inference with the VLM using multiple\naugmented views of an input. Then, by leveraging the feature representation\nability of VLM, we obtain feature-based pseudo labels via sample clustering in\nthe feature space. Prompt-feature consensus is introduced to select reliable\nsamples based on the consensus between the two types of pseudo labels. By\nrejecting low-quality pseudo labels, we further propose High-confidence Cross\nSupervision (HCS) to learn from samples with reliable pseudo labels and the\nremaining unlabeled samples. Experimental results showed that our method\nobtained an accuracy of 87.1% and 95.1% on the HPH and LC25K datasets,\nrespectively, and it largely outperformed existing zero-shot classification and\nnoisy label learning methods. 
The code is available at\nhttps://github.com/lanfz2000/VLM-CPL.\n","authors":["Lanfeng Zhong","Xin Liao","Shaoting Zhang","Xiaofan Zhang","Guotai Wang"],"pdf_url":"https://arxiv.org/pdf/2403.15836v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2403.15835v1","updated":"2024-03-23T13:22:36Z","published":"2024-03-23T13:22:36Z","title":"Once for Both: Single Stage of Importance and Sparsity Search for Vision\n Transformer Compression","summary":" Recent Vision Transformer Compression (VTC) works mainly follow a two-stage\nscheme, where the importance score of each model unit is first evaluated or\npreset in each submodule, followed by the sparsity score evaluation according\nto the target sparsity constraint. Such a separate evaluation process induces\nthe gap between importance and sparsity score distributions, thus causing high\nsearch costs for VTC. In this work, for the first time, we investigate how to\nintegrate the evaluations of importance and sparsity scores into a single\nstage, searching the optimal subnets in an efficient manner. Specifically, we\npresent OFB, a cost-efficient approach that simultaneously evaluates both\nimportance and sparsity scores, termed Once for Both (OFB), for VTC. First, a\nbi-mask scheme is developed by entangling the importance score and the\ndifferentiable sparsity score to jointly determine the pruning potential\n(prunability) of each unit. Such a bi-mask search strategy is further used\ntogether with a proposed adaptive one-hot loss to realize the\nprogressive-and-efficient search for the most important subnet. Finally,\nProgressive Masked Image Modeling (PMIM) is proposed to regularize the feature\nspace to be more representative during the search process, which may be\ndegraded by the dimension reduction. Extensive experiments demonstrate that OFB\ncan achieve superior compression performance over state-of-the-art\nsearching-based and pruning-based methods under various Vision Transformer\narchitectures, meanwhile promoting search efficiency significantly, e.g.,\ncosting one GPU search day for the compression of DeiT-S on ImageNet-1K.\n","authors":["Hancheng Ye","Chong Yu","Peng Ye","Renqiu Xia","Yansong Tang","Jiwen Lu","Tao Chen","Bo Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.15835v1.pdf","comment":"Accepted by CVPR 2024. Our code will be available at\n www.github.com/HankYe/Once-for-Both"},{"id":"http://arxiv.org/abs/2403.15832v1","updated":"2024-03-23T13:16:07Z","published":"2024-03-23T13:16:07Z","title":"Time-series Initialization and Conditioning for Video-agnostic\n Stabilization of Video Super-Resolution using Recurrent Networks","summary":" A Recurrent Neural Network (RNN) for Video Super Resolution (VSR) is\ngenerally trained with randomly clipped and cropped short videos extracted from\noriginal training videos due to various challenges in learning RNNs. However,\nsince this RNN is optimized to super-resolve short videos, VSR of long videos\nis degraded due to the domain gap. Our preliminary experiments reveal that such\ndegradation changes depending on the video properties, such as the video length\nand dynamics. To avoid this degradation, this paper proposes the training\nstrategy of RNN for VSR that can work efficiently and stably independently of\nthe video length and dynamics. The proposed training strategy stabilizes VSR by\ntraining a VSR network with various RNN hidden states changed depending on the\nvideo properties. 
Since computing such a variety of hidden states is\ntime-consuming, this computational cost is reduced by reusing the hidden states\nfor efficient training. In addition, training stability is further improved\nwith frame-number conditioning. Our experimental results demonstrate that the\nproposed method performed better than base methods in videos with various\nlengths and dynamics.\n","authors":["Hiroshi Mori","Norimichi Ukita"],"pdf_url":"https://arxiv.org/pdf/2403.15832v1.pdf","comment":"Accepted to IJCNN 2024 (International Joint Conference on Neural\n Networks)"},{"id":"http://arxiv.org/abs/2403.15831v1","updated":"2024-03-23T13:15:44Z","published":"2024-03-23T13:15:44Z","title":"Spatio-Temporal Bi-directional Cross-frame Memory for Distractor\n Filtering Point Cloud Single Object Tracking","summary":" 3D single object tracking within LIDAR point clouds is a pivotal task in\ncomputer vision, with profound implications for autonomous driving and\nrobotics. However, existing methods, which depend solely on appearance matching\nvia Siamese networks or utilize motion information from successive frames,\nencounter significant challenges. Issues such as similar objects nearby or\nocclusions can result in tracker drift. To mitigate these challenges, we design\nan innovative spatio-temporal bi-directional cross-frame distractor filtering\ntracker, named STMD-Tracker. Our first step involves the creation of a 4D\nmulti-frame spatio-temporal graph convolution backbone. This design separates\nKNN graph spatial embedding and incorporates 1D temporal convolution,\neffectively capturing temporal fluctuations and spatio-temporal information.\nSubsequently, we devise a novel bi-directional cross-frame memory procedure.\nThis integrates future and synthetic past frame memory to enhance the current\nmemory, thereby improving the accuracy of iteration-based tracking. This\niterative memory update mechanism allows our tracker to dynamically compensate\nfor information in the current frame, effectively reducing tracker drift.\nLastly, we construct spatially reliable Gaussian masks on the fused features to\neliminate distractor points. This is further supplemented by an object-aware\nsampling strategy, which bolsters the efficiency and precision of object\nlocalization, thereby reducing tracking errors caused by distractors. Our\nextensive experiments on KITTI, NuScenes and Waymo datasets demonstrate that\nour approach significantly surpasses the current state-of-the-art methods.\n","authors":["Shaoyu Sun","Chunyang Wang","Xuelian Liu","Chunhao Shi","Yueyang Ding","Guan Xi"],"pdf_url":"https://arxiv.org/pdf/2403.15831v1.pdf","comment":"18 pages,6 figures"},{"id":"http://arxiv.org/abs/2403.15803v1","updated":"2024-03-23T11:27:23Z","published":"2024-03-23T11:27:23Z","title":"Innovative Quantitative Analysis for Disease Progression Assessment in\n Familial Cerebral Cavernous Malformations","summary":" Familial cerebral cavernous malformation (FCCM) is a hereditary disorder\ncharacterized by abnormal vascular structures within the central nervous\nsystem. The FCCM lesions are often numerous and intricate, making quantitative\nanalysis of the lesions a labor-intensive task. Consequently, clinicians face\nchallenges in quantitatively assessing the severity of lesions and determining\nwhether lesions have progressed. 
To alleviate this problem, we propose a\nquantitative statistical framework for FCCM, comprising an efficient annotation\nmodule, an FCCM lesion segmentation module, and an FCCM lesion quantitative\nstatistics module. Our framework demonstrates precise segmentation of the FCCM\nlesion based on efficient data annotation, achieving a Dice coefficient of\n93.22\\%. More importantly, we focus on quantitative statistics of lesions,\nwhich is combined with image registration to realize the quantitative\ncomparison of lesions between different examinations of patients, and a\nvisualization framework has been established for doctors to comprehensively\ncompare and analyze lesions. The experimental results have demonstrated that\nour proposed framework not only obtains objective, accurate, and comprehensive\nquantitative statistical information, which provides a quantitative assessment\nmethod for disease progression and drug efficacy study, but also considerably\nreduces the manual measurement and statistical workload of lesions, assisting\nclinical decision-making for FCCM and accelerating progress in FCCM clinical\nresearch. This highlights the potential of practical application of the\nframework in FCCM clinical research and clinical decision-making. The codes are\navailable at https://github.com/6zrg/Quantitative-Statistics-of-FCCM.\n","authors":["Ruige Zong","Tao Wang","Chunwang Li","Xinlin Zhang","Yuanbin Chen","Longxuan Zhao","Qixuan Li","Qinquan Gao","Dezhi Kang","Fuxin Lin","Tong Tong"],"pdf_url":"https://arxiv.org/pdf/2403.15803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14447v2","updated":"2024-03-23T11:26:38Z","published":"2024-03-21T14:53:50Z","title":"Exploring 3D Human Pose Estimation and Forecasting from the Robot's\n Perspective: The HARPER Dataset","summary":" We introduce HARPER, a novel dataset for 3D body pose estimation and forecast\nin dyadic interactions between users and Spot, the quadruped robot manufactured\nby Boston Dynamics. The key-novelty is the focus on the robot's perspective,\ni.e., on the data captured by the robot's sensors. These make 3D body pose\nanalysis challenging because being close to the ground captures humans only\npartially. The scenario underlying HARPER includes 15 actions, of which 10\ninvolve physical contact between the robot and users. The Corpus contains not\nonly the recordings of the built-in stereo cameras of Spot, but also those of a\n6-camera OptiTrack system (all recordings are synchronized). This leads to\nground-truth skeletal representations with a precision lower than a millimeter.\nIn addition, the Corpus includes reproducible benchmarks on 3D Human Pose\nEstimation, Human Pose Forecasting, and Collision Prediction, all based on\npublicly available baseline approaches. This enables future HARPER users to\nrigorously compare their results with those we provide in this work.\n","authors":["Andrea Avogaro","Andrea Toaiari","Federico Cunico","Xiangmin Xu","Haralambos Dafas","Alessandro Vinciarelli","Emma Li","Marco Cristani"],"pdf_url":"https://arxiv.org/pdf/2403.14447v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15789v1","updated":"2024-03-23T10:32:29Z","published":"2024-03-23T10:32:29Z","title":"In-Context Matting","summary":" We introduce in-context matting, a novel task setting of image matting. 
Given\na reference image of a certain foreground and guided priors such as points,\nscribbles, and masks, in-context matting enables automatic alpha estimation on\na batch of target images of the same foreground category, without additional\nauxiliary input. This setting marries good performance in auxiliary input-based\nmatting and ease of use in automatic matting, which finds a good trade-off\nbetween customization and automation. To overcome the key challenge of accurate\nforeground matching, we introduce IconMatting, an in-context matting model\nbuilt upon a pre-trained text-to-image diffusion model. Conditioned on inter-\nand intra-similarity matching, IconMatting can make full use of reference\ncontext to generate accurate target alpha mattes. To benchmark the task, we\nalso introduce a novel testing dataset ICM-$57$, covering 57 groups of\nreal-world images. Quantitative and qualitative results on the ICM-57 testing\nset show that IconMatting rivals the accuracy of trimap-based matting while\nretaining the automation level akin to automatic matting. Code is available at\nhttps://github.com/tiny-smart/in-context-matting\n","authors":["He Guo","Zixuan Ye","Zhiguo Cao","Hao Lu"],"pdf_url":"https://arxiv.org/pdf/2403.15789v1.pdf","comment":"Accepted to CVPR 2024. Code is available at\n https://github.com/tiny-smart/in-context-matting"},{"id":"http://arxiv.org/abs/2207.04913v2","updated":"2024-03-23T10:32:22Z","published":"2022-07-11T14:46:50Z","title":"Generalizing to Unseen Domains with Wasserstein Distributional\n Robustness under Limited Source Knowledge","summary":" Domain generalization aims at learning a universal model that performs well\non unseen target domains, incorporating knowledge from multiple source domains.\nIn this research, we consider the scenario where different domain shifts occur\namong conditional distributions of different classes across domains. When\nlabeled samples in the source domains are limited, existing approaches are not\nsufficiently robust. To address this problem, we propose a novel domain\ngeneralization framework called {Wasserstein Distributionally Robust Domain\nGeneralization} (WDRDG), inspired by the concept of distributionally robust\noptimization. We encourage robustness over conditional distributions within\nclass-specific Wasserstein uncertainty sets and optimize the worst-case\nperformance of a classifier over these uncertainty sets. We further develop a\ntest-time adaptation module leveraging optimal transport to quantify the\nrelationship between the unseen target domain and source domains to make\nadaptive inference for target data. Experiments on the Rotated MNIST, PACS and\nthe VLCS datasets demonstrate that our method could effectively balance the\nrobustness and discriminability in challenging generalization scenarios.\n","authors":["Jingge Wang","Liyan Xie","Yao Xie","Shao-Lun Huang","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2207.04913v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15787v1","updated":"2024-03-23T10:16:36Z","published":"2024-03-23T10:16:36Z","title":"Depth Estimation fusing Image and Radar Measurements with Uncertain\n Directions","summary":" This paper proposes a depth estimation method using radar-image fusion by\naddressing the uncertain vertical directions of sparse radar measurements. In\nprior radar-image fusion work, image features are merged with the uncertain\nsparse depths measured by radar through convolutional layers. 
This approach is\ndisturbed by the features computed with the uncertain radar depths.\nFurthermore, since the features are computed with a fully convolutional\nnetwork, the uncertainty of each depth corresponding to a pixel is spread out\nover its surrounding pixels. Our method avoids this problem by computing\nfeatures only with an image and conditioning the features pixelwise with the\nradar depth. Furthermore, the set of possibly correct radar directions is\nidentified with reliable LiDAR measurements, which are available only in the\ntraining stage. Our method improves training data by learning only these\npossibly correct radar directions, while the previous method trains raw radar\nmeasurements, including erroneous measurements. Experimental results\ndemonstrate that our method can improve the quantitative and qualitative\nresults compared with its base method using radar-image fusion.\n","authors":["Masaya Kotani","Takeru Oba","Norimichi Ukita"],"pdf_url":"https://arxiv.org/pdf/2403.15787v1.pdf","comment":"Accepted to IJCNN 2024 (International Joint Conference on Neural\n Networks)"},{"id":"http://arxiv.org/abs/2403.15786v1","updated":"2024-03-23T10:16:05Z","published":"2024-03-23T10:16:05Z","title":"Adversarial Defense Teacher for Cross-Domain Object Detection under Poor\n Visibility Conditions","summary":" Existing object detectors encounter challenges in handling domain shifts\nbetween training and real-world data, particularly under poor visibility\nconditions like fog and night. Cutting-edge cross-domain object detection\nmethods use teacher-student frameworks and compel teacher and student models to\nproduce consistent predictions under weak and strong augmentations,\nrespectively. In this paper, we reveal that manually crafted augmentations are\ninsufficient for optimal teaching and present a simple yet effective framework\nnamed Adversarial Defense Teacher (ADT), leveraging adversarial defense to\nenhance teaching quality. Specifically, we employ adversarial attacks,\nencouraging the model to generalize on subtly perturbed inputs that effectively\ndeceive the model. To address small objects under poor visibility conditions,\nwe propose a Zoom-in Zoom-out strategy, which zooms-in images for better\npseudo-labels and zooms-out images and pseudo-labels to learn refined features.\nOur results demonstrate that ADT achieves superior performance, reaching 54.5%\nmAP on Foggy Cityscapes, surpassing the previous state-of-the-art by 2.6% mAP.\n","authors":["Kaiwen Wang","Yinzhe Shen","Martin Lauer"],"pdf_url":"https://arxiv.org/pdf/2403.15786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02746v3","updated":"2024-03-23T09:51:35Z","published":"2024-03-05T08:02:00Z","title":"Learning without Exact Guidance: Updating Large-scale High-resolution\n Land Cover Maps from Low-resolution Historical Labels","summary":" Large-scale high-resolution (HR) land-cover mapping is a vital task to survey\nthe Earth's surface and resolve many challenges facing humanity. However, it is\nstill a non-trivial task hindered by complex ground details, various landforms,\nand the scarcity of accurate training labels over a wide-span geographic area.\nIn this paper, we propose an efficient, weakly supervised framework\n(Paraformer) to guide large-scale HR land-cover mapping with easy-access\nhistorical land-cover data of low resolution (LR). 
Specifically, existing\nland-cover mapping approaches reveal the dominance of CNNs in preserving local\nground details but still suffer from insufficient global modeling in various\nlandforms. Therefore, we design a parallel CNN-Transformer feature extractor in\nParaformer, consisting of a downsampling-free CNN branch and a Transformer\nbranch, to jointly capture local and global contextual information. Besides,\nfacing the spatial mismatch of training data, a pseudo-label-assisted training\n(PLAT) module is adopted to reasonably refine LR labels for weakly supervised\nsemantic segmentation of HR images. Experiments on two large-scale datasets\ndemonstrate the superiority of Paraformer over other state-of-the-art methods\nfor automatically updating HR land-cover maps from LR historical labels.\n","authors":["Zhuohong Li","Wei He","Jiepan Li","Fangxiao Lu","Hongyan Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.02746v3.pdf","comment":"11 pages, 9 figures, accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.15770v1","updated":"2024-03-23T08:57:46Z","published":"2024-03-23T08:57:46Z","title":"Graph Image Prior for Unsupervised Dynamic MRI Reconstruction","summary":" The inductive bias of the convolutional neural network (CNN) can act as a\nstrong prior for image restoration, which is known as the Deep Image Prior\n(DIP). In recent years, DIP has been utilized in unsupervised dynamic MRI\nreconstruction, which adopts a generative model from the latent space to the\nimage space. However, existing methods usually utilize a single pyramid-shaped\nCNN architecture to parameterize the generator, which cannot effectively\nexploit the spatio-temporal correlations within the dynamic data. In this work,\nwe propose a novel scheme to exploit the DIP prior for dynamic MRI\nreconstruction, named ``Graph Image Prior'' (GIP). The generative model is\ndecomposed into two stages: image recovery and manifold discovery, which is\nbridged by a graph convolutional network to exploit the spatio-temporal\ncorrelations. In addition, we devise an ADMM algorithm to alternately optimize\nthe images and the network parameters to further improve the reconstruction\nperformance. Experimental results demonstrate that GIP outperforms compressed\nsensing methods and unsupervised methods over different sampling trajectories,\nand significantly reduces the performance gap with the state-of-art supervised\ndeep-learning methods. Moreover, GIP displays superior generalization ability\nwhen transferred to a different reconstruction setting, without the need for\nany additional data.\n","authors":["Zhongsen Li","Wenxuan Chen","Shuai Wang","Chuyu Liu","Rui Li"],"pdf_url":"https://arxiv.org/pdf/2403.15770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15769v1","updated":"2024-03-23T08:54:03Z","published":"2024-03-23T08:54:03Z","title":"FusionINN: Invertible Image Fusion for Brain Tumor Monitoring","summary":" Image fusion typically employs non-invertible neural networks to merge\nmultiple source images into a single fused image. However, for clinical\nexperts, solely relying on fused images may be insufficient for making\ndiagnostic decisions, as the fusion mechanism blends features from source\nimages, thereby making it difficult to interpret the underlying tumor\npathology. We introduce FusionINN, a novel invertible image fusion framework,\ncapable of efficiently generating fused images and also decomposing them back\nto the source images by solving the inverse of the fusion process. 
FusionINN\nguarantees lossless one-to-one pixel mapping by integrating a normally\ndistributed latent image alongside the fused image to facilitate the generative\nmodeling of the decomposition process. To the best of our knowledge, we are the\nfirst to investigate the decomposability of fused images, which is particularly\ncrucial for life-sensitive applications such as medical image fusion compared\nto other tasks like multi-focus or multi-exposure image fusion. Our extensive\nexperimentation validates FusionINN over existing discriminative and generative\nfusion methods, both subjectively and objectively. Moreover, compared to a\nrecent denoising diffusion-based fusion model, our approach offers faster and\nqualitatively better fusion results. We also exhibit the clinical utility of\nour results in aiding disease prognosis.\n","authors":["Nishant Kumar","Ziyan Tao","Jaikirat Singh","Yang Li","Peiwen Sun","Binghui Zhao","Stefan Gumhold"],"pdf_url":"https://arxiv.org/pdf/2403.15769v1.pdf","comment":"Source code coming soon"},{"id":"http://arxiv.org/abs/2401.08154v3","updated":"2024-03-23T08:44:14Z","published":"2024-01-16T06:53:03Z","title":"TLIC: Learned Image Compression with ROI-Weighted Distortion and Bit\n Allocation","summary":" This short paper describes our method for the track of image compression. To\nachieve better perceptual quality, we use the adversarial loss to generate\nrealistic textures, use region of interest (ROI) mask to guide the bit\nallocation for different regions. Our Team name is TLIC.\n","authors":["Wei Jiang","Yongqi Zhai","Hangyu Li","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2401.08154v3.pdf","comment":"2nd Place in the Image Compression Track, CLIC 2024, DCC 2024"},{"id":"http://arxiv.org/abs/2403.15765v1","updated":"2024-03-23T08:40:35Z","published":"2024-03-23T08:40:35Z","title":"Towards Human-Like Machine Comprehension: Few-Shot Relational Learning\n in Visually-Rich Documents","summary":" Key-value relations are prevalent in Visually-Rich Documents (VRDs), often\ndepicted in distinct spatial regions accompanied by specific color and font\nstyles. These non-textual cues serve as important indicators that greatly\nenhance human comprehension and acquisition of such relation triplets. However,\ncurrent document AI approaches often fail to consider this valuable prior\ninformation related to visual and spatial features, resulting in suboptimal\nperformance, particularly when dealing with limited examples. To address this\nlimitation, our research focuses on few-shot relational learning, specifically\ntargeting the extraction of key-value relation triplets in VRDs. Given the\nabsence of a suitable dataset for this task, we introduce two new few-shot\nbenchmarks built upon existing supervised benchmark datasets. Furthermore, we\npropose a variational approach that incorporates relational 2D-spatial priors\nand prototypical rectification techniques. This approach aims to generate\nrelation representations that are more aware of the spatial context and unseen\nrelation in a manner similar to human perception. Experimental results\ndemonstrate the effectiveness of our proposed method by showcasing its ability\nto outperform existing methods. 
This study also opens up new possibilities for\npractical applications.\n","authors":["Hao Wang","Tang Li","Chenhui Chu","Nengjun Zhu","Rui Wang","Pinpin Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.15765v1.pdf","comment":"13 pages, 7 figures, accepted by LERC-COLING2024"},{"id":"http://arxiv.org/abs/2303.01656v2","updated":"2024-03-23T07:45:14Z","published":"2023-03-03T01:12:57Z","title":"Feature Completion Transformer for Occluded Person Re-identification","summary":" Occluded person re-identification (Re-ID) is a challenging problem due to the\ndestruction of occluders. Most existing methods focus on visible human body\nparts through some prior information. However, when complementary occlusions\noccur, features in occluded regions can interfere with matching, which affects\nperformance severely. In this paper, different from most previous works that\ndiscard the occluded region, we propose a Feature Completion Transformer\n(FCFormer) to implicitly complement the semantic information of occluded parts\nin the feature space. Specifically, Occlusion Instance Augmentation (OIA) is\nproposed to simulates real and diverse occlusion situations on the holistic\nimage. These augmented images not only enrich the amount of occlusion samples\nin the training set, but also form pairs with the holistic images.\nSubsequently, a dual-stream architecture with a shared encoder is proposed to\nlearn paired discriminative features from pairs of inputs. Without additional\nsemantic information, an occluded-holistic feature sample-label pair can be\nautomatically created. Then, Feature Completion Decoder (FCD) is designed to\ncomplement the features of occluded regions by using learnable tokens to\naggregate possible information from self-generated occluded features. Finally,\nwe propose the Cross Hard Triplet (CHT) loss to further bridge the gap between\ncomplementing features and extracting features under the same ID. In addition,\nFeature Completion Consistency (FC$^2$) loss is introduced to help the\ngenerated completion feature distribution to be closer to the real holistic\nfeature distribution. Extensive experiments over five challenging datasets\ndemonstrate that the proposed FCFormer achieves superior performance and\noutperforms the state-of-the-art methods by significant margins on occluded\ndatasets.\n","authors":["Tao Wang","Mengyuan Liu","Hong Liu","Wenhao Li","Miaoju Ban","Tuanyu Guo","Yidi Li"],"pdf_url":"https://arxiv.org/pdf/2303.01656v2.pdf","comment":"Published on IEEE Transactions on Multimedia (TMM)"},{"id":"http://arxiv.org/abs/2403.15751v1","updated":"2024-03-23T07:39:13Z","published":"2024-03-23T07:39:13Z","title":"AOCIL: Exemplar-free Analytic Online Class Incremental Learning with Low\n Time and Resource Consumption","summary":" Online Class Incremental Learning (OCIL) aims to train the model in a\ntask-by-task manner, where data arrive in mini-batches at a time while previous\ndata are not accessible. A significant challenge is known as Catastrophic\nForgetting, i.e., loss of the previous knowledge on old data. To address this,\nreplay-based methods show competitive results but invade data privacy, while\nexemplar-free methods protect data privacy but struggle for accuracy. In this\npaper, we proposed an exemplar-free approach -- Analytic Online Class\nIncremental Learning (AOCIL). Instead of back-propagation, we design the\nAnalytic Classifier (AC) updated by recursive least square, cooperating with a\nfrozen backbone. 
AOCIL simultaneously achieves high accuracy, low resource\nconsumption and data privacy protection. We conduct massive experiments on four\nexisting benchmark datasets, and the results demonstrate the strong capability\nof handling OCIL scenarios. Codes will be ready.\n","authors":["Huiping Zhuang","Yuchen Liu","Run He","Kai Tong","Ziqian Zeng","Cen Chen","Yi Wang","Lap-Pui Chau"],"pdf_url":"https://arxiv.org/pdf/2403.15751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15750v1","updated":"2024-03-23T07:36:58Z","published":"2024-03-23T07:36:58Z","title":"iDAT: inverse Distillation Adapter-Tuning","summary":" Adapter-Tuning (AT) method involves freezing a pre-trained model and\nintroducing trainable adapter modules to acquire downstream knowledge, thereby\ncalibrating the model for better adaptation to downstream tasks. This paper\nproposes a distillation framework for the AT method instead of crafting a\ncarefully designed adapter module, which aims to improve fine-tuning\nperformance. For the first time, we explore the possibility of combining the AT\nmethod with knowledge distillation. Via statistical analysis, we observe\nsignificant differences in the knowledge acquisition between adapter modules of\ndifferent models. Leveraging these differences, we propose a simple yet\neffective framework called inverse Distillation Adapter-Tuning (iDAT).\nSpecifically, we designate the smaller model as the teacher and the larger\nmodel as the student. The two are jointly trained, and online knowledge\ndistillation is applied to inject knowledge of different perspective to student\nmodel, and significantly enhance the fine-tuning performance on downstream\ntasks. Extensive experiments on the VTAB-1K benchmark with 19 image\nclassification tasks demonstrate the effectiveness of iDAT. The results show\nthat using existing AT method within our iDAT framework can further yield a\n2.66% performance gain, with only an additional 0.07M trainable parameters. Our\napproach compares favorably with state-of-the-arts without bells and whistles.\nOur code is available at https://github.com/JCruan519/iDAT.\n","authors":["Jiacheng Ruan","Jingsheng Gao","Mingye Xie","Daize Dong","Suncheng Xiang","Ting Liu","Yuzhuo Fu"],"pdf_url":"https://arxiv.org/pdf/2403.15750v1.pdf","comment":"10 pages, 9 figures, 13 tables. This paper has been accepted by ICME\n 2024"},{"id":"http://arxiv.org/abs/2305.15253v2","updated":"2024-03-23T07:14:23Z","published":"2023-05-24T15:36:46Z","title":"Rethinking the Evaluation Protocol of Domain Generalization","summary":" Domain generalization aims to solve the challenge of Out-of-Distribution\n(OOD) generalization by leveraging common knowledge learned from multiple\ntraining domains to generalize to unseen test domains. To accurately evaluate\nthe OOD generalization ability, it is required that test data information is\nunavailable. However, the current domain generalization protocol may still have\npotential test data information leakage. This paper examines the risks of test\ndata information leakage from two aspects of the current evaluation protocol:\nsupervised pretraining on ImageNet and oracle model selection. We propose\nmodifications to the current protocol that we should employ self-supervised\npretraining or train from scratch instead of employing the current supervised\npretraining, and we should use multiple test domains. These would result in a\nmore precise evaluation of OOD generalization ability. 
We also rerun the\nalgorithms with the modified protocol and introduce new leaderboards to\nencourage future research in domain generalization with a fairer comparison.\n","authors":["Han Yu","Xingxuan Zhang","Renzhe Xu","Jiashuo Liu","Yue He","Peng Cui"],"pdf_url":"https://arxiv.org/pdf/2305.15253v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02923v2","updated":"2024-03-23T07:00:15Z","published":"2023-12-05T17:50:55Z","title":"MoSA: Mixture of Sparse Adapters for Visual Efficient Tuning","summary":" With the rapid growth in the scale of pre-trained foundation models,\nparameter-efficient fine-tuning techniques have gained significant attention,\namong which Adapter Tuning is the most widely used. Despite achieving\nefficiency, it still underperforms full fine-tuning, and the performance\nimproves at the cost of an increase in parameters. Recent efforts have either\nfocused on training multiple adapter experts to increase model capacity or on\npruning adapters to achieve parameter efficiency. However, both approaches\nintroduce more parameters compared to the original adapter, hence are not\ncomputationally efficient. Motivated by this, we propose Mixture of Sparse\nAdapters, or MoSA, as a novel Adapter Tuning method to fully unleash the\npotential of each parameter in the adapter. We first split the standard adapter\ninto multiple non-overlapping modules, then stochastically activate them for\nsparse training, and finally merge them to form a complete adapter after\ntuning. In this way, MoSA can achieve significantly better performance than\nstandard adapters without any additional computational or storage overhead.\nFurthermore, we propose a hierarchical sparse strategy to better leverage\nlimited training data. Extensive experiments on a series of 27 visual tasks\ndemonstrate that MoSA consistently outperforms other Adapter Tuning methods as\nwell as other baselines by a large margin. Furthermore, MoSA brings consistent\nimprovements across various model scales, architectures, and different PEFT\nmethods. Code will be released.\n","authors":["Qizhe Zhang","Bocheng Zou","Ruichuan An","Jiaming Liu","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.02923v2.pdf","comment":"16 pages, 7 figures. Official code:\n https://github.com/Theia-4869/MoSA"},{"id":"http://arxiv.org/abs/2206.08898v2","updated":"2024-03-23T06:43:47Z","published":"2022-06-17T17:15:01Z","title":"SimA: Simple Softmax-free Attention for Vision Transformers","summary":" Recently, vision transformers have become very popular. However, deploying\nthem in many applications is computationally expensive partly due to the\nSoftmax layer in the attention block. We introduce a simple but effective,\nSoftmax-free attention block, SimA, which normalizes query and key matrices\nwith simple $\\ell_1$-norm instead of using Softmax layer. Then, the attention\nblock in SimA is a simple multiplication of three matrices, so SimA can\ndynamically change the ordering of the computation at the test time to achieve\nlinear computation on the number of tokens or the number of channels. We\nempirically show that SimA applied to three SOTA variations of transformers,\nDeiT, XCiT, and CvT, results in on-par accuracy compared to the SOTA models,\nwithout any need for Softmax layer. Interestingly, changing SimA from\nmulti-head to single-head has only a small effect on the accuracy, which\nsimplifies the attention block further. 
The code is available here:\nhttps://github.com/UCDvision/sima\n","authors":["Soroush Abbasi Koohpayegani","Hamed Pirsiavash"],"pdf_url":"https://arxiv.org/pdf/2206.08898v2.pdf","comment":"Code is available here: https://github.com/UCDvision/sima"},{"id":"http://arxiv.org/abs/2401.08503v3","updated":"2024-03-23T06:40:22Z","published":"2024-01-16T17:04:30Z","title":"Real3D-Portrait: One-shot Realistic 3D Talking Portrait Synthesis","summary":" One-shot 3D talking portrait generation aims to reconstruct a 3D avatar from\nan unseen image, and then animate it with a reference video or audio to\ngenerate a talking portrait video. The existing methods fail to simultaneously\nachieve the goals of accurate 3D avatar reconstruction and stable talking face\nanimation. Besides, while the existing works mainly focus on synthesizing the\nhead part, it is also vital to generate natural torso and background segments\nto obtain a realistic talking portrait video. To address these limitations, we\npresent Real3D-Potrait, a framework that (1) improves the one-shot 3D\nreconstruction power with a large image-to-plane model that distills 3D prior\nknowledge from a 3D face generative model; (2) facilitates accurate\nmotion-conditioned animation with an efficient motion adapter; (3) synthesizes\nrealistic video with natural torso movement and switchable background using a\nhead-torso-background super-resolution model; and (4) supports one-shot\naudio-driven talking face generation with a generalizable audio-to-motion\nmodel. Extensive experiments show that Real3D-Portrait generalizes well to\nunseen identities and generates more realistic talking portrait videos compared\nto previous methods. Video samples and source code are available at\nhttps://real3dportrait.github.io .\n","authors":["Zhenhui Ye","Tianyun Zhong","Yi Ren","Jiaqi Yang","Weichuang Li","Jiawei Huang","Ziyue Jiang","Jinzheng He","Rongjie Huang","Jinglin Liu","Chen Zhang","Xiang Yin","Zejun Ma","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.08503v3.pdf","comment":"ICLR 2024 (Spotlight). Project page: https://real3dportrait.github.io"},{"id":"http://arxiv.org/abs/2312.02548v2","updated":"2024-03-23T06:38:41Z","published":"2023-12-05T07:34:30Z","title":"GeNIe: Generative Hard Negative Images Through Diffusion","summary":" Data augmentation is crucial in training deep models, preventing them from\noverfitting to limited data. Recent advances in generative AI, e.g., diffusion\nmodels, have enabled more sophisticated augmentation techniques that produce\ndata resembling natural images. We introduce GeNIe a novel augmentation method\nwhich leverages a latent diffusion model conditioned on a text prompt to merge\ncontrasting data points (an image from the source category and a text prompt\nfrom the target category) to generate challenging samples. To achieve this,\ninspired by recent diffusion based image editing techniques, we limit the\nnumber of diffusion iterations to ensure the generated image retains low-level\nand background features from the source image while representing the target\ncategory, resulting in a hard negative sample for the source category. We\nfurther enhance the proposed approach by finding the appropriate noise level\nadaptively for each image (coined as GeNIe-Ada) leading to further performance\nimprovement. Our extensive experiments, in both few-shot and long-tail\ndistribution settings, demonstrate the effectiveness of our novel augmentation\nmethod and its superior performance over the prior art. 
Our code is available\nhere: https://github.com/UCDvision/GeNIe\n","authors":["Soroush Abbasi Koohpayegani","Anuj Singh","K L Navaneet","Hadi Jamali-Rad","Hamed Pirsiavash"],"pdf_url":"https://arxiv.org/pdf/2312.02548v2.pdf","comment":"Our code is available https://github.com/UCDvision/GeNIe"},{"id":"http://arxiv.org/abs/2403.13731v2","updated":"2024-03-23T06:31:11Z","published":"2024-03-19T12:26:53Z","title":"Emotion Recognition Using Transformers with Masked Learning","summary":" In recent years, deep learning has achieved innovative advancements in\nvarious fields, including the analysis of human emotions and behaviors.\nInitiatives such as the Affective Behavior Analysis in-the-wild (ABAW)\ncompetition have been particularly instrumental in driving research in this\narea by providing diverse and challenging datasets that enable precise\nevaluation of complex emotional states. This study leverages the Vision\nTransformer (ViT) and Transformer models to focus on the estimation of\nValence-Arousal (VA), which signifies the positivity and intensity of emotions,\nrecognition of various facial expressions, and detection of Action Units (AU)\nrepresenting fundamental muscle movements. This approach transcends traditional\nConvolutional Neural Networks (CNNs) and Long Short-Term Memory (LSTM) based\nmethods, proposing a new Transformer-based framework that maximizes the\nunderstanding of temporal and spatial features. The core contributions of this\nresearch include the introduction of a learning technique through random frame\nmasking and the application of Focal loss adapted for imbalanced data,\nenhancing the accuracy and applicability of emotion and behavior analysis in\nreal-world settings. This approach is expected to contribute to the advancement\nof emotional computing and deep learning methodologies.\n","authors":["Seongjae Min","Junseok Yang","Sangjun Lim","Junyong Lee","Sangwon Lee","Sejoon Lim"],"pdf_url":"https://arxiv.org/pdf/2403.13731v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15735v1","updated":"2024-03-23T06:03:12Z","published":"2024-03-23T06:03:12Z","title":"3D-TransUNet for Brain Metastases Segmentation in the BraTS2023\n Challenge","summary":" Segmenting brain tumors is complex due to their diverse appearances and\nscales. Brain metastases, the most common type of brain tumor, are a frequent\ncomplication of cancer. Therefore, an effective segmentation model for brain\nmetastases must adeptly capture local intricacies to delineate small tumor\nregions while also integrating global context to understand broader scan\nfeatures. The TransUNet model, which combines Transformer self-attention with\nU-Net's localized information, emerges as a promising solution for this task.\nIn this report, we address brain metastases segmentation by training the\n3D-TransUNet model on the Brain Tumor Segmentation (BraTS-METS) 2023 challenge\ndataset. Specifically, we explored two architectural configurations: the\nEncoder-only 3D-TransUNet, employing Transformers solely in the encoder, and\nthe Decoder-only 3D-TransUNet, utilizing Transformers exclusively in the\ndecoder. For Encoder-only 3D-TransUNet, we note that Masked-Autoencoder\npre-training is required for a better initialization of the Transformer Encoder\nand thus accelerates the training process. 
We identify that the Decoder-only\n3D-TransUNet model should offer enhanced efficacy in the segmentation of brain\nmetastases, as indicated by our 5-fold cross-validation on the training set.\nHowever, our use of the Encoder-only 3D-TransUNet model already yield notable\nresults, with an average lesion-wise Dice score of 59.8\\% on the test set,\nsecuring second place in the BraTS-METS 2023 challenge.\n","authors":["Siwei Yang","Xianhang Li","Jieru Mei","Jieneng Chen","Cihang Xie","Yuyin Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.15735v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13231v3","updated":"2024-03-23T05:23:00Z","published":"2023-11-22T08:42:46Z","title":"Using Human Feedback to Fine-tune Diffusion Models without Any Reward\n Model","summary":" Using reinforcement learning with human feedback (RLHF) has shown significant\npromise in fine-tuning diffusion models. Previous methods start by training a\nreward model that aligns with human preferences, then leverage RL techniques to\nfine-tune the underlying models. However, crafting an efficient reward model\ndemands extensive datasets, optimal architecture, and manual hyperparameter\ntuning, making the process both time and cost-intensive. The direct preference\noptimization (DPO) method, effective in fine-tuning large language models,\neliminates the necessity for a reward model. However, the extensive GPU memory\nrequirement of the diffusion model's denoising process hinders the direct\napplication of the DPO method. To address this issue, we introduce the Direct\nPreference for Denoising Diffusion Policy Optimization (D3PO) method to\ndirectly fine-tune diffusion models. The theoretical analysis demonstrates that\nalthough D3PO omits training a reward model, it effectively functions as the\noptimal reward model trained using human feedback data to guide the learning\nprocess. This approach requires no training of a reward model, proving to be\nmore direct, cost-effective, and minimizing computational overhead. In\nexperiments, our method uses the relative scale of objectives as a proxy for\nhuman preference, delivering comparable results to methods using ground-truth\nrewards. Moreover, D3PO demonstrates the ability to reduce image distortion\nrates and generate safer images, overcoming challenges lacking robust reward\nmodels. Our code is publicly available at https://github.com/yk7333/D3PO.\n","authors":["Kai Yang","Jian Tao","Jiafei Lyu","Chunjiang Ge","Jiaxin Chen","Qimai Li","Weihan Shen","Xiaolong Zhu","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2311.13231v3.pdf","comment":"CVPR 2024 accepted; huggingface daily paper"},{"id":"http://arxiv.org/abs/2311.15383v2","updated":"2024-03-23T05:21:14Z","published":"2023-11-26T19:01:14Z","title":"Visual Programming for Zero-shot Open-Vocabulary 3D Visual Grounding","summary":" 3D Visual Grounding (3DVG) aims at localizing 3D object based on textual\ndescriptions. Conventional supervised methods for 3DVG often necessitate\nextensive annotations and a predefined vocabulary, which can be restrictive. To\naddress this issue, we propose a novel visual programming approach for\nzero-shot open-vocabulary 3DVG, leveraging the capabilities of large language\nmodels (LLMs). Our approach begins with a unique dialog-based method, engaging\nwith LLMs to establish a foundational understanding of zero-shot 3DVG. Building\non this, we design a visual program that consists of three types of modules,\ni.e., view-independent, view-dependent, and functional modules. 
These modules,\nspecifically tailored for 3D scenarios, work collaboratively to perform complex\nreasoning and inference. Furthermore, we develop an innovative language-object\ncorrelation module to extend the scope of existing 3D object detectors into\nopen-vocabulary scenarios. Extensive experiments demonstrate that our zero-shot\napproach can outperform some supervised baselines, marking a significant stride\ntowards effective 3DVG.\n","authors":["Zhihao Yuan","Jinke Ren","Chun-Mei Feng","Hengshuang Zhao","Shuguang Cui","Zhen Li"],"pdf_url":"https://arxiv.org/pdf/2311.15383v2.pdf","comment":"Accepted by CVPR 2024, project website:\n https://curryyuan.github.io/ZSVG3D/"},{"id":"http://arxiv.org/abs/2310.11440v3","updated":"2024-03-23T04:58:50Z","published":"2023-10-17T17:50:46Z","title":"EvalCrafter: Benchmarking and Evaluating Large Video Generation Models","summary":" The vision and language generative models have been overgrown in recent\nyears. For video generation, various open-sourced models and public-available\nservices have been developed to generate high-quality videos. However, these\nmethods often use a few metrics, e.g., FVD or IS, to evaluate the performance.\nWe argue that it is hard to judge the large conditional generative models from\nthe simple metrics since these models are often trained on very large datasets\nwith multi-aspect abilities. Thus, we propose a novel framework and pipeline\nfor exhaustively evaluating the performance of the generated videos. Our\napproach involves generating a diverse and comprehensive list of 700 prompts\nfor text-to-video generation, which is based on an analysis of real-world user\ndata and generated with the assistance of a large language model. Then, we\nevaluate the state-of-the-art video generative models on our carefully designed\nbenchmark, in terms of visual qualities, content qualities, motion qualities,\nand text-video alignment with 17 well-selected objective metrics. To obtain the\nfinal leaderboard of the models, we further fit a series of coefficients to\nalign the objective metrics to the users' opinions. Based on the proposed human\nalignment method, our final score shows a higher correlation than simply\naveraging the metrics, showing the effectiveness of the proposed evaluation\nmethod.\n","authors":["Yaofang Liu","Xiaodong Cun","Xuebo Liu","Xintao Wang","Yong Zhang","Haoxin Chen","Yang Liu","Tieyong Zeng","Raymond Chan","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2310.11440v3.pdf","comment":"Technical Report, Project page: https://evalcrafter.github.io/"},{"id":"http://arxiv.org/abs/2312.05541v2","updated":"2024-03-23T04:54:21Z","published":"2023-12-09T11:18:45Z","title":"DPoser: Diffusion Model as Robust 3D Human Pose Prior","summary":" This work targets to construct a robust human pose prior. However, it remains\na persistent challenge due to biomechanical constraints and diverse human\nmovements. Traditional priors like VAEs and NDFs often exhibit shortcomings in\nrealism and generalization, notably with unseen noisy poses. To address these\nissues, we introduce DPoser, a robust and versatile human pose prior built upon\ndiffusion models. DPoser regards various pose-centric tasks as inverse problems\nand employs variational diffusion sampling for efficient solving. 
Accordingly,\ndesigned with optimization frameworks, DPoser seamlessly benefits human mesh\nrecovery, pose generation, pose completion, and motion denoising tasks.\nFurthermore, due to the disparity between the articulated poses and structured\nimages, we propose truncated timestep scheduling to enhance the effectiveness\nof DPoser. Our approach demonstrates considerable enhancements over common\nuniform scheduling used in image domains, boasting improvements of 5.4%, 17.2%,\nand 3.8% across human mesh recovery, pose completion, and motion denoising,\nrespectively. Comprehensive experiments demonstrate the superiority of DPoser\nover existing state-of-the-art pose priors across multiple tasks.\n","authors":["Junzhe Lu","Jing Lin","Hongkun Dou","Ailing Zeng","Yue Deng","Yulun Zhang","Haoqian Wang"],"pdf_url":"https://arxiv.org/pdf/2312.05541v2.pdf","comment":"Project Page: https://dposer.github.io; Code Released:\n https://github.com/moonbow721/DPoser"},{"id":"http://arxiv.org/abs/2403.15717v1","updated":"2024-03-23T04:44:55Z","published":"2024-03-23T04:44:55Z","title":"Ev-Edge: Efficient Execution of Event-based Vision Algorithms on\n Commodity Edge Platforms","summary":" Event cameras have emerged as a promising sensing modality for autonomous\nnavigation systems, owing to their high temporal resolution, high dynamic range\nand negligible motion blur. To process the asynchronous temporal event streams\nfrom such sensors, recent research has shown that a mix of Artificial Neural\nNetworks (ANNs), Spiking Neural Networks (SNNs) as well as hybrid SNN-ANN\nalgorithms are necessary to achieve high accuracies across a range of\nperception tasks. However, we observe that executing such workloads on\ncommodity edge platforms which feature heterogeneous processing elements such\nas CPUs, GPUs and neural accelerators results in inferior performance. This is\ndue to the mismatch between the irregular nature of event streams and diverse\ncharacteristics of algorithms on the one hand and the underlying hardware\nplatform on the other. We propose Ev-Edge, a framework that contains three key\noptimizations to boost the performance of event-based vision systems on edge\nplatforms: (1) An Event2Sparse Frame converter directly transforms raw event\nstreams into sparse frames, enabling the use of sparse libraries with minimal\nencoding overheads (2) A Dynamic Sparse Frame Aggregator merges sparse frames\nat runtime by trading off the temporal granularity of events and computational\ndemand thereby improving hardware utilization (3) A Network Mapper maps\nconcurrently executing tasks to different processing elements while also\nselecting layer precision by considering both compute and communication\noverheads. On several state-of-art networks for a range of autonomous\nnavigation tasks, Ev-Edge achieves 1.28x-2.05x improvements in latency and\n1.23x-2.15x in energy over an all-GPU implementation on the NVIDIA Jetson\nXavier AGX platform for single-task execution scenarios. 
Ev-Edge also achieves\n1.43x-1.81x latency improvements over round-robin scheduling methods in\nmulti-task execution scenarios.\n","authors":["Shrihari Sridharan","Surya Selvam","Kaushik Roy","Anand Raghunathan"],"pdf_url":"https://arxiv.org/pdf/2403.15717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15712v1","updated":"2024-03-23T04:18:49Z","published":"2024-03-23T04:18:49Z","title":"PNAS-MOT: Multi-Modal Object Tracking with Pareto Neural Architecture\n Search","summary":" Multiple object tracking is a critical task in autonomous driving. Existing\nworks primarily focus on the heuristic design of neural networks to obtain high\naccuracy. As tracking accuracy improves, however, neural networks become\nincreasingly complex, posing challenges for their practical application in real\ndriving scenarios due to the high level of latency. In this paper, we explore\nthe use of the neural architecture search (NAS) methods to search for efficient\narchitectures for tracking, aiming for low real-time latency while maintaining\nrelatively high accuracy. Another challenge for object tracking is the\nunreliability of a single sensor, therefore, we propose a multi-modal framework\nto improve the robustness. Experiments demonstrate that our algorithm can run\non edge devices within lower latency constraints, thus greatly reducing the\ncomputational requirements for multi-modal object tracking while keeping lower\nlatency.\n","authors":["Chensheng Peng","Zhaoyu Zeng","Jinling Gao","Jundong Zhou","Masayoshi Tomizuka","Xinbing Wang","Chenghu Zhou","Nanyang Ye"],"pdf_url":"https://arxiv.org/pdf/2403.15712v1.pdf","comment":"IEEE Robotics and Automation Letters 2024. Code is available at\n https://github.com/PholyPeng/PNAS-MOT"},{"id":"http://arxiv.org/abs/2403.15709v1","updated":"2024-03-23T04:08:39Z","published":"2024-03-23T04:08:39Z","title":"Contact-aware Human Motion Generation from Textual Descriptions","summary":" This paper addresses the problem of generating 3D interactive human motion\nfrom text. Given a textual description depicting the actions of different body\nparts in contact with objects, we synthesize sequences of 3D body poses that\nare visually natural and physically plausible. Yet, this task poses a\nsignificant challenge due to the inadequate consideration of interactions by\nphysical contacts in both motion and textual descriptions, leading to unnatural\nand implausible sequences. To tackle this challenge, we create a novel dataset\nnamed RICH-CAT, representing ``Contact-Aware Texts'' constructed from the RICH\ndataset. RICH-CAT comprises high-quality motion, accurate human-object contact\nlabels, and detailed textual descriptions, encompassing over 8,500 motion-text\npairs across 26 indoor/outdoor actions. Leveraging RICH-CAT, we propose a novel\napproach named CATMO for text-driven interactive human motion synthesis that\nexplicitly integrates human body contacts as evidence. We employ two VQ-VAE\nmodels to encode motion and body contact sequences into distinct yet\ncomplementary latent spaces and an intertwined GPT for generating human motions\nand contacts in a mutually conditioned manner. Additionally, we introduce a\npre-trained text encoder to learn textual embeddings that better discriminate\namong various contact types, allowing for more precise control over synthesized\nmotions and contacts. Our experiments demonstrate the superior performance of\nour approach compared to existing text-to-motion methods, producing stable,\ncontact-aware motion sequences. 
Code and data will be available for research\npurposes.\n","authors":["Sihan Ma","Qiong Cao","Jing Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.15709v1.pdf","comment":"Project page: https://xymsh.github.io/RICH-CAT/"},{"id":"http://arxiv.org/abs/2403.15706v1","updated":"2024-03-23T03:56:31Z","published":"2024-03-23T03:56:31Z","title":"G-ACIL: Analytic Learning for Exemplar-Free Generalized Class\n Incremental Learning","summary":" Class incremental learning (CIL) trains a network on sequential tasks with\nseparated categories but suffers from catastrophic forgetting, where models\nquickly lose previously learned knowledge when acquiring new tasks. The\ngeneralized CIL (GCIL) aims to address the CIL problem in a more real-world\nscenario, where incoming data have mixed data categories and unknown sample\nsize distribution, leading to intensified forgetting. Existing attempts for the\nGCIL either have poor performance, or invade data privacy by saving historical\nexemplars. To address this, in this paper, we propose an exemplar-free\ngeneralized analytic class incremental learning (G-ACIL). The G-ACIL adopts\nanalytic learning (a gradient-free training technique), and delivers an\nanalytical solution (i.e., closed-form) to the GCIL scenario. This solution is\nderived via decomposing the incoming data into exposed and unexposed classes,\nallowing an equivalence between the incremental learning and its joint\ntraining, i.e., the weight-invariant property. Such an equivalence is\ntheoretically validated through matrix analysis tools, and hence contributes\ninterpretability in GCIL. It is also empirically evidenced by experiments on\nvarious datasets and settings of GCIL. The results show that the G-ACIL\nexhibits leading performance with high robustness compared with existing\ncompetitive GCIL methods. Codes will be ready at\nhttps://github.com/ZHUANGHP/Analytic-continual-learning.\n","authors":["Huiping Zhuang","Yizhu Chen","Di Fang","Run He","Kai Tong","Hongxin Wei","Ziqian Zeng","Cen Chen"],"pdf_url":"https://arxiv.org/pdf/2403.15706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15705v1","updated":"2024-03-23T03:56:25Z","published":"2024-03-23T03:56:25Z","title":"UPNeRF: A Unified Framework for Monocular 3D Object Reconstruction and\n Pose Estimation","summary":" Monocular 3D reconstruction for categorical objects heavily relies on\naccurately perceiving each object's pose. While gradient-based optimization\nwithin a NeRF framework updates initially given poses, this paper highlights\nthat such a scheme fails when the initial pose even moderately deviates from\nthe true pose. Consequently, existing methods often depend on a third-party 3D\nobject to provide an initial object pose, leading to increased complexity and\ngeneralization issues. To address these challenges, we present UPNeRF, a\nUnified framework integrating Pose estimation and NeRF-based reconstruction,\nbringing us closer to real-time monocular 3D object reconstruction. UPNeRF\ndecouples the object's dimension estimation and pose refinement to resolve the\nscale-depth ambiguity, and introduces an effective projected-box representation\nthat generalizes well cross different domains. While using a dedicated pose\nestimator that smoothly integrates into an object-centric NeRF, UPNeRF is free\nfrom external 3D detectors. UPNeRF achieves state-of-the-art results in both\nreconstruction and pose estimation tasks on the nuScenes dataset. 
Furthermore,\nUPNeRF exhibits exceptional Cross-dataset generalization on the KITTI and Waymo\ndatasets, surpassing prior methods with up to 50% reduction in rotation and\ntranslation error.\n","authors":["Yuliang Guo","Abhinav Kumar","Cheng Zhao","Ruoyu Wang","Xinyu Huang","Liu Ren"],"pdf_url":"https://arxiv.org/pdf/2403.15705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15704v1","updated":"2024-03-23T03:55:41Z","published":"2024-03-23T03:55:41Z","title":"Gaussian in the Wild: 3D Gaussian Splatting for Unconstrained Image\n Collections","summary":" Novel view synthesis from unconstrained in-the-wild images remains a\nmeaningful but challenging task. The photometric variation and transient\noccluders in those unconstrained images make it difficult to reconstruct the\noriginal scene accurately. Previous approaches tackle the problem by\nintroducing a global appearance feature in Neural Radiance Fields (NeRF).\nHowever, in the real world, the unique appearance of each tiny point in a scene\nis determined by its independent intrinsic material attributes and the varying\nenvironmental impacts it receives. Inspired by this fact, we propose Gaussian\nin the wild (GS-W), a method that uses 3D Gaussian points to reconstruct the\nscene and introduces separated intrinsic and dynamic appearance feature for\neach point, capturing the unchanged scene appearance along with dynamic\nvariation like illumination and weather. Additionally, an adaptive sampling\nstrategy is presented to allow each Gaussian point to focus on the local and\ndetailed information more effectively. We also reduce the impact of transient\noccluders using a 2D visibility map. More experiments have demonstrated better\nreconstruction quality and details of GS-W compared to previous methods, with a\n$1000\\times$ increase in rendering speed.\n","authors":["Dongbin Zhang","Chuming Wang","Weitao Wang","Peihao Li","Minghan Qin","Haoqian Wang"],"pdf_url":"https://arxiv.org/pdf/2403.15704v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2208.08270v4","updated":"2024-03-23T03:33:32Z","published":"2022-08-17T13:02:17Z","title":"On the Privacy Effect of Data Enhancement via the Lens of Memorization","summary":" Machine learning poses severe privacy concerns as it has been shown that the\nlearned models can reveal sensitive information about their training data. Many\nworks have investigated the effect of widely adopted data augmentation and\nadversarial training techniques, termed data enhancement in the paper, on the\nprivacy leakage of machine learning models. Such privacy effects are often\nmeasured by membership inference attacks (MIAs), which aim to identify whether\na particular example belongs to the training set or not. We propose to\ninvestigate privacy from a new perspective called memorization. Through the\nlens of memorization, we find that previously deployed MIAs produce misleading\nresults as they are less likely to identify samples with higher privacy risks\nas members compared to samples with low privacy risks. To solve this problem,\nwe deploy a recent attack that can capture individual samples' memorization\ndegrees for evaluation. Through extensive experiments, we unveil several\nfindings about the connections between three essential properties of machine\nlearning models, including privacy, generalization gap, and adversarial\nrobustness. We demonstrate that the generalization gap and privacy leakage are\nless correlated than those of the previous results. 
Moreover, there is not\nnecessarily a trade-off between adversarial robustness and privacy as stronger\nadversarial robustness does not make the model more susceptible to privacy\nattacks.\n","authors":["Xiao Li","Qiongxiu Li","Zhanhao Hu","Xiaolin Hu"],"pdf_url":"https://arxiv.org/pdf/2208.08270v4.pdf","comment":"Accepted by IEEE TIFS, 17 pages"},{"id":"http://arxiv.org/abs/2403.15698v1","updated":"2024-03-23T03:23:29Z","published":"2024-03-23T03:23:29Z","title":"SceneX:Procedural Controllable Large-scale Scene Generation via\n Large-language Models","summary":" Due to its great application potential, large-scale scene generation has\ndrawn extensive attention in academia and industry. Recent research employs\npowerful generative models to create desired scenes and achieves promising\nresults. However, most of these methods represent the scene using 3D primitives\n(e.g. point cloud or radiance field) incompatible with the industrial pipeline,\nwhich leads to a substantial gap between academic research and industrial\ndeployment. Procedural Controllable Generation (PCG) is an efficient technique\nfor creating scalable and high-quality assets, but it is unfriendly for\nordinary users as it demands profound domain expertise. To address these\nissues, we resort to using the large language model (LLM) to drive the\nprocedural modeling. In this paper, we introduce a large-scale scene generation\nframework, SceneX, which can automatically produce high-quality procedural\nmodels according to designers' textual descriptions. Specifically, the proposed\nmethod comprises two components, PCGBench and PCGPlanner. The former\nencompasses an extensive collection of accessible procedural assets and\nthousands of hand-crafted API documents. The latter aims to generate executable\nactions for Blender to produce controllable and precise 3D assets guided by the\nuser's instructions. Our SceneX can generate a city spanning 2.5 km times 2.5\nkm with delicate layout and geometric structures, drastically reducing the time\ncost from several weeks for professional PCG engineers to just a few hours for\nan ordinary user. Extensive experiments demonstrated the capability of our\nmethod in controllable large-scale scene generation and editing, including\nasset placement and season translation.\n","authors":["Mengqi Zhou","Jun Hou","Chuanchen Luo","Yuxi Wang","Zhaoxiang Zhang","Junran Peng"],"pdf_url":"https://arxiv.org/pdf/2403.15698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14481v2","updated":"2024-03-23T03:13:35Z","published":"2023-12-22T07:17:51Z","title":"SurgicalPart-SAM: Part-to-Whole Collaborative Prompting for Surgical\n Instrument Segmentation","summary":" The Segment Anything Model (SAM) exhibits promise in generic object\nsegmentation and offers potential for various applications. Existing methods\nhave applied SAM to surgical instrument segmentation (SIS) by tuning SAM-based\nframeworks with surgical data. However, they fall short in two crucial aspects:\n(1) Straightforward model tuning with instrument masks treats each instrument\nas a single entity, neglecting their complex structures and fine-grained\ndetails; and (2) Instrument category-based prompts are not flexible and\ninformative enough to describe instrument structures. 
To address these\nproblems, in this paper, we investigate text promptable SIS and propose\nSurgicalPart-SAM (SP-SAM), a novel SAM efficient-tuning approach that\nexplicitly integrates instrument structure knowledge with SAM's generic\nknowledge, guided by expert knowledge on instrument part compositions.\nSpecifically, we achieve this by proposing (1) Collaborative Prompts that\ndescribe instrument structures via collaborating category-level and part-level\ntexts; (2) Cross-Modal Prompt Encoder that encodes text prompts jointly with\nvisual embeddings into discriminative part-level representations; and (3)\nPart-to-Whole Adaptive Fusion and Hierarchical Decoding that adaptively fuse\nthe part-level representations into a whole for accurate instrument\nsegmentation in surgical scenarios. Built upon them, SP-SAM acquires a better\ncapability to comprehend surgical instruments in terms of both overall\nstructure and part-level details. Extensive experiments on both the EndoVis2018\nand EndoVis2017 datasets demonstrate SP-SAM's state-of-the-art performance with\nminimal tunable parameters. The code will be available at\nhttps://github.com/wenxi-yue/SurgicalPart-SAM.\n","authors":["Wenxi Yue","Jing Zhang","Kun Hu","Qiuxia Wu","Zongyuan Ge","Yong Xia","Jiebo Luo","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.14481v2.pdf","comment":"Technical Report. The source code will be released at\n https://github.com/wenxi-yue/SurgicalPart-SAM"},{"id":"http://arxiv.org/abs/2403.15693v1","updated":"2024-03-23T02:58:10Z","published":"2024-03-23T02:58:10Z","title":"Technical Report: Masked Skeleton Sequence Modeling for Learning Larval\n Zebrafish Behavior Latent Embeddings","summary":" In this report, we introduce a novel self-supervised learning method for\nextracting latent embeddings from behaviors of larval zebrafish. Drawing\ninspiration from Masked Modeling techniques utilized in image processing with\nMasked Autoencoders (MAE) \\cite{he2022masked} and in natural language\nprocessing with Generative Pre-trained Transformer (GPT)\n\\cite{radford2018improving}, we treat behavior sequences as a blend of images\nand language. For the skeletal sequences of swimming zebrafish, we propose a\npioneering Transformer-CNN architecture, the Sequence Spatial-Temporal\nTransformer (SSTFormer), designed to capture the inter-frame correlation of\ndifferent joints. This correlation is particularly valuable, as it reflects the\ncoordinated movement of various parts of the fish body across adjacent frames.\nTo handle the high frame rate, we segment the skeleton sequence into distinct\ntime slices, analogous to \"words\" in a sentence, and employ self-attention\ntransformer layers to encode the consecutive frames within each slice,\ncapturing the spatial correlation among different joints. Furthermore, we\nincorporate a CNN-based attention module to enhance the representations\noutputted by the transformer layers. Lastly, we introduce a temporal feature\naggregation operation between time slices to improve the discrimination of\nsimilar behaviors.\n","authors":["Lanxin Xu","Shuo Wang"],"pdf_url":"https://arxiv.org/pdf/2403.15693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15842v2","updated":"2024-03-23T02:46:54Z","published":"2024-01-29T02:32:25Z","title":"LCV2: An Efficient Pretraining-Free Framework for Grounded Visual\n Question Answering","summary":" In this paper, the LCV2 modular method is proposed for the Grounded Visual\nQuestion Answering task in the vision-language multimodal domain. 
This approach\nrelies on a frozen large language model (LLM) as intermediate mediator between\nthe off-the-shelf VQA model and the off-the-shelf visual grounding (VG) model,\nwhere the LLM transforms and conveys textual information between the two\nmodules based on a designed prompt. LCV2 establish an integrated plug-and-play\nframework without the need for any pre-training process. This framework can be\ndeployed for VQA Grounding tasks under low computational resources. The\nmodularized model within the framework allows application with various\nstate-of-the-art pre-trained models, exhibiting significant potential to be\nadvance with the times. Experimental implementations were conducted under\nconstrained computational and memory resources, evaluating the proposed\nmethod's performance on benchmark datasets including GQA, CLEVR, and\nVizWiz-VQA-Grounding. Comparative analyses with baseline methods demonstrate\nthe robust competitiveness of LCV2.\n","authors":["Yuhan Chen","Lumei Su","Lihua Chen","Zhiwei Lin"],"pdf_url":"https://arxiv.org/pdf/2401.15842v2.pdf","comment":"21 pages,9 figures"},{"id":"http://arxiv.org/abs/2403.15691v1","updated":"2024-03-23T02:44:43Z","published":"2024-03-23T02:44:43Z","title":"Temporal-Spatial Object Relations Modeling for Vision-and-Language\n Navigation","summary":" Vision-and-Language Navigation (VLN) is a challenging task where an agent is\nrequired to navigate to a natural language described location via vision\nobservations. The navigation abilities of the agent can be enhanced by the\nrelations between objects, which are usually learned using internal objects or\nexternal datasets. The relationships between internal objects are modeled\nemploying graph convolutional network (GCN) in traditional studies. However,\nGCN tends to be shallow, limiting its modeling ability. To address this issue,\nwe utilize a cross attention mechanism to learn the connections between objects\nover a trajectory, which takes temporal continuity into account, termed as\nTemporal Object Relations (TOR). The external datasets have a gap with the\nnavigation environment, leading to inaccurate modeling of relations. To avoid\nthis problem, we construct object connections based on observations from all\nviewpoints in the navigational environment, which ensures complete spatial\ncoverage and eliminates the gap, called Spatial Object Relations (SOR).\nAdditionally, we observe that agents may repeatedly visit the same location\nduring navigation, significantly hindering their performance. For resolving\nthis matter, we introduce the Turning Back Penalty (TBP) loss function, which\npenalizes the agent's repetitive visiting behavior, substantially reducing the\nnavigational distance. Experimental results on the REVERIE, SOON, and R2R\ndatasets demonstrate the effectiveness of the proposed method.\n","authors":["Bowen Huang","Yanwei Zheng","Chuanlin Lan","Xinpeng Zhao","Dongxiao yu","Yifei Zou"],"pdf_url":"https://arxiv.org/pdf/2403.15691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17050v2","updated":"2024-03-23T02:22:04Z","published":"2023-11-28T18:56:01Z","title":"Surf-D: Generating High-Quality Surfaces of Arbitrary Topologies Using\n Diffusion Models","summary":" We present Surf-D, a novel method for generating high-quality 3D shapes as\nSurfaces with arbitrary topologies using Diffusion models. Previous methods\nexplored shape generation with different representations and they suffer from\nlimited topologies and poor geometry details. 
To generate high-quality surfaces\nof arbitrary topologies, we use the Unsigned Distance Field (UDF) as our\nsurface representation to accommodate arbitrary topologies. Furthermore, we\npropose a new pipeline that employs a point-based AutoEncoder to learn a\ncompact and continuous latent space for accurately encoding UDF and support\nhigh-resolution mesh extraction. We further show that our new pipeline\nsignificantly outperforms the prior approaches to learning the distance fields,\nsuch as the grid-based AutoEncoder, which is not scalable and incapable of\nlearning accurate UDF. In addition, we adopt a curriculum learning strategy to\nefficiently embed various surfaces. With the pretrained shape latent space, we\nemploy a latent diffusion model to acquire the distribution of various shapes.\nExtensive experiments are presented on using Surf-D for unconditional\ngeneration, category conditional generation, image conditional generation, and\ntext-to-shape tasks. The experiments demonstrate the superior performance of\nSurf-D in shape generation across multiple modalities as conditions. Visit our\nproject page at https://yzmblog.github.io/projects/SurfD/.\n","authors":["Zhengming Yu","Zhiyang Dou","Xiaoxiao Long","Cheng Lin","Zekun Li","Yuan Liu","Norman Müller","Taku Komura","Marc Habermann","Christian Theobalt","Xin Li","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17050v2.pdf","comment":"Project Page: https://yzmblog.github.io/projects/SurfD/"},{"id":"http://arxiv.org/abs/2403.15684v1","updated":"2024-03-23T02:15:23Z","published":"2024-03-23T02:15:23Z","title":"The Limits of Perception: Analyzing Inconsistencies in Saliency Maps in\n XAI","summary":" Explainable artificial intelligence (XAI) plays an indispensable role in\ndemystifying the decision-making processes of AI, especially within the\nhealthcare industry. Clinicians rely heavily on detailed reasoning when making\na diagnosis, often CT scans for specific features that distinguish between\nbenign and malignant lesions. A comprehensive diagnostic approach includes an\nevaluation of imaging results, patient observations, and clinical tests. The\nsurge in deploying deep learning models as support systems in medical\ndiagnostics has been significant, offering advances that traditional methods\ncould not. However, the complexity and opacity of these models present a\ndouble-edged sword. As they operate as \"black boxes,\" with their reasoning\nobscured and inaccessible, there's an increased risk of misdiagnosis, which can\nlead to patient harm. Hence, there is a pressing need to cultivate transparency\nwithin AI systems, ensuring that the rationale behind an AI's diagnostic\nrecommendations is clear and understandable to medical practitioners. This\nshift towards transparency is not just beneficial -- it's a critical step\ntowards responsible AI integration in healthcare, ensuring that AI aids rather\nthan hinders medical professionals in their crucial work.\n","authors":["Anna Stubbin","Thompson Chyrikov","Jim Zhao","Christina Chajo"],"pdf_url":"https://arxiv.org/pdf/2403.15684v1.pdf","comment":"7 pages, 1 figure, 2 tables"},{"id":"http://arxiv.org/abs/2403.15679v1","updated":"2024-03-23T02:09:23Z","published":"2024-03-23T02:09:23Z","title":"DS-NeRV: Implicit Neural Video Representation with Decomposed Static and\n Dynamic Codes","summary":" Implicit neural representations for video (NeRV) have recently become a novel\nway for high-quality video representation. 
However, existing works employ a\nsingle network to represent the entire video, which implicitly confuses static\nand dynamic information. This leads to an inability to effectively compress the\nredundant static information and a lack of explicit modeling of global\ntemporal-coherent dynamic details. To solve the above problems, we propose DS-NeRV,\nwhich decomposes videos into sparse learnable static codes and dynamic codes\nwithout the need for explicit optical flow or residual supervision. By setting\ndifferent sampling rates for two codes and applying weighted sum and\ninterpolation sampling methods, DS-NeRV efficiently utilizes redundant static\ninformation while maintaining high-frequency details. Additionally, we design a\ncross-channel attention-based (CCA) fusion module to efficiently fuse these two\ncodes for frame decoding. Our approach achieves a high-quality reconstruction\nof 31.2 PSNR with only 0.35M parameters thanks to separate static and dynamic\ncodes representation and outperforms existing NeRV methods in many downstream\ntasks. Our project website is at https://haoyan14.github.io/DS-NeRV.\n","authors":["Hao Yan","Zhihui Ke","Xiaobo Zhou","Tie Qiu","Xidong Shi","Dadong Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.15679v1.pdf","comment":"CVPR 2024. Project page at https://haoyan14.github.io/DS-NeRV"},{"id":"http://arxiv.org/abs/2403.15675v1","updated":"2024-03-23T01:42:19Z","published":"2024-03-23T01:42:19Z","title":"An active learning model to classify animal species in Hong Kong","summary":" Camera traps are used by ecologists globally as an efficient and non-invasive\nmethod to monitor animals. While it is time-consuming to manually label the\ncollected images, recent advances in deep learning and computer vision have made\nit possible to automate this process [1]. A major obstacle to this is the\ngeneralisability of these models when applying these images to independently\ncollected data from other parts of the world [2]. Here, we use a deep active\nlearning workflow [3], and train a model that is applicable to camera trap\nimages collected in Hong Kong.\n","authors":["Gareth Lamb","Ching Hei Lo","Jin Wu","Calvin K. F. Lee"],"pdf_url":"https://arxiv.org/pdf/2403.15675v1.pdf","comment":"6 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2403.15664v1","updated":"2024-03-23T01:22:15Z","published":"2024-03-23T01:22:15Z","title":"What Do You See in Vehicle? Comprehensive Vision Solution for In-Vehicle\n Gaze Estimation","summary":" Driver's eye gaze holds a wealth of cognitive and intentional cues crucial\nfor intelligent vehicles. Despite its significance, research on in-vehicle gaze\nestimation remains limited due to the scarcity of comprehensive and\nwell-annotated datasets in real driving scenarios. In this paper, we present\nthree novel elements to advance in-vehicle gaze research. Firstly, we introduce\nIVGaze, a pioneering dataset capturing in-vehicle gaze, collected from 125\nsubjects and covering a large range of gaze and head poses within vehicles.\nConventional gaze collection systems are inadequate for in-vehicle use. In this\ndataset, we propose a new vision-based solution for in-vehicle gaze collection,\nintroducing a refined gaze target calibration method to tackle annotation\nchallenges. Second, our research focuses on in-vehicle gaze estimation\nleveraging the IVGaze. In-vehicle face images often suffer from low resolution,\nprompting our introduction of a gaze pyramid transformer that leverages\ntransformer-based multilevel features integration. 
Expanding upon this, we\nintroduce the dual-stream gaze pyramid transformer (GazeDPTR). Employing\nperspective transformation, we rotate virtual cameras to normalize images,\nutilizing camera pose to merge normalized and original images for accurate gaze\nestimation. GazeDPTR shows state-of-the-art performance on the IVGaze dataset.\nThirdly, we explore a novel strategy for gaze zone classification by extending\nthe GazeDPTR. A foundational tri-plane and project gaze onto these planes are\nnewly defined. Leveraging both positional features from the projection points\nand visual attributes from images, we achieve superior performance compared to\nrelying solely on visual features, substantiating the advantage of gaze\nestimation. Our project is available at https://yihua.zone/work/ivgaze.\n","authors":["Yihua Cheng","Yaning Zhu","Zongji Wang","Hongquan Hao","Yongwei Liu","Shiqing Cheng","Xi Wang","Hyung Jin Chang"],"pdf_url":"https://arxiv.org/pdf/2403.15664v1.pdf","comment":"CVPR24"},{"id":"http://arxiv.org/abs/2403.17025v1","updated":"2024-03-23T14:36:48Z","published":"2024-03-23T14:36:48Z","title":"Boosting Few-Shot Learning via Attentive Feature Regularization","summary":" Few-shot learning (FSL) based on manifold regularization aims to improve the\nrecognition capacity of novel objects with limited training samples by mixing\ntwo samples from different categories with a blending factor. However, this\nmixing operation weakens the feature representation due to the linear\ninterpolation and the overlooking of the importance of specific channels. To\nsolve these issues, this paper proposes attentive feature regularization (AFR)\nwhich aims to improve the feature representativeness and discriminability. In\nour approach, we first calculate the relations between different categories of\nsemantic labels to pick out the related features used for regularization. Then,\nwe design two attention-based calculations at both the instance and channel\nlevels. These calculations enable the regularization procedure to focus on two\ncrucial aspects: the feature complementarity through adaptive interpolation in\nrelated categories and the emphasis on specific feature channels. Finally, we\ncombine these regularization strategies to significantly improve the classifier\nperformance. Empirical studies on several popular FSL benchmarks demonstrate\nthe effectiveness of AFR, which improves the recognition accuracy of novel\ncategories without the need to retrain any feature extractor, especially in the\n1-shot setting. Furthermore, the proposed AFR can seamlessly integrate into\nother FSL methods to improve classification performance.\n","authors":["Xingyu Zhu","Shuo Wang","Jinda Lu","Yanbin Hao","Haifeng Liu","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2403.17025v1.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2205.11100v2","updated":"2024-03-23T09:14:19Z","published":"2022-05-23T07:51:15Z","title":"Supporting Vision-Language Model Inference with Confounder-pruning\n Knowledge Prompt","summary":" Vision-language models are pre-trained by aligning image-text pairs in a\ncommon space to deal with open-set visual concepts. To boost the\ntransferability of the pre-trained models, recent works adopt fixed or\nlearnable prompts, i.e., classification weights are synthesized from natural\nlanguage describing task-relevant categories, to reduce the gap between tasks\nin the training and test phases. However, how and what prompts can improve\ninference performance remains unclear. 
In this paper, we explicitly clarify the\nimportance of including semantic information in prompts, while existing\nprompting methods generate prompts without exploring the semantic information\nof textual labels. Manually constructing prompts with rich semantics requires\ndomain expertise and is extremely time-consuming. To cope with this issue, we\npropose a semantic-aware prompt learning method, namely CPKP, which retrieves\nan ontological knowledge graph by treating the textual label as a query to\nextract task-relevant semantic information. CPKP further introduces a\ndouble-tier confounder-pruning procedure to refine the derived semantic\ninformation. The graph-tier confounders are gradually identified and phased\nout, inspired by the principle of Granger causality. The feature-tier\nconfounders are demolished by following the maximum entropy principle in\ninformation theory. Empirically, the evaluations demonstrate the effectiveness\nof CPKP, e.g., with two shots, CPKP outperforms the manual-prompt method by\n4.64% and the learnable-prompt method by 1.09% on average, and the superiority\nof CPKP in domain generalization compared to benchmark approaches. Our\nimplementation is available at https://github.com/Mowenyii/CPKP.\n","authors":["Jiangmeng Li","Wenyi Mo","Wenwen Qiang","Bing Su","Changwen Zheng","Hui Xiong","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2205.11100v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15707v1","updated":"2024-03-23T03:57:28Z","published":"2024-03-23T03:57:28Z","title":"Role of Locality and Weight Sharing in Image-Based Tasks: A Sample\n Complexity Separation between CNNs, LCNs, and FCNs","summary":" Vision tasks are characterized by the properties of locality and translation\ninvariance. The superior performance of convolutional neural networks (CNNs) on\nthese tasks is widely attributed to the inductive bias of locality and weight\nsharing baked into their architecture. Existing attempts to quantify the\nstatistical benefits of these biases in CNNs over locally connected\nconvolutional neural networks (LCNs) and fully connected neural networks (FCNs)\nfall into one of the following categories: either they disregard the optimizer\nand only provide uniform convergence upper bounds with no separating lower\nbounds, or they consider simplistic tasks that do not truly mirror the locality\nand translation invariance as found in real-world vision tasks. To address\nthese deficiencies, we introduce the Dynamic Signal Distribution (DSD)\nclassification task that models an image as consisting of $k$ patches, each of\ndimension $d$, and the label is determined by a $d$-sparse signal vector that\ncan freely appear in any one of the $k$ patches. On this task, for any\northogonally equivariant algorithm like gradient descent, we prove that CNNs\nrequire $\\tilde{O}(k+d)$ samples, whereas LCNs require $\\Omega(kd)$ samples,\nestablishing the statistical advantages of weight sharing in translation\ninvariant tasks. Furthermore, LCNs need $\\tilde{O}(k(k+d))$ samples, compared\nto $\\Omega(k^2d)$ samples for FCNs, showcasing the benefits of locality in\nlocal tasks. 
Additionally, we develop information theoretic tools for analyzing\nrandomized algorithms, which may be of interest for statistical research.\n","authors":["Aakash Lahoti","Stefani Karp","Ezra Winston","Aarti Singh","Yuanzhi Li"],"pdf_url":"https://arxiv.org/pdf/2403.15707v1.pdf","comment":"40 pages, 4 figures, Accepted to ICLR 2024, Spotlight"}]},"2024-03-26T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.17937v1","updated":"2024-03-26T17:59:58Z","published":"2024-03-26T17:59:58Z","title":"Efficient Video Object Segmentation via Modulated Cross-Attention Memory","summary":" Recently, transformer-based approaches have shown promising results for\nsemi-supervised video object segmentation. However, these approaches typically\nstruggle on long videos due to increased GPU memory demands, as they frequently\nexpand the memory bank every few frames. We propose a transformer-based\napproach, named MAVOS, that introduces an optimized and dynamic long-term\nmodulated cross-attention (MCA) memory to model temporal smoothness without\nrequiring frequent memory expansion. The proposed MCA effectively encodes both\nlocal and global features at various levels of granularity while efficiently\nmaintaining consistent speed regardless of the video length. Extensive\nexperiments on multiple benchmarks, LVOS, Long-Time Video, and DAVIS 2017,\ndemonstrate the effectiveness of our proposed contributions leading to\nreal-time inference and markedly reduced memory demands without any degradation\nin segmentation accuracy on long videos. Compared to the best existing\ntransformer-based approach, our MAVOS increases the speed by 7.6x, while\nsignificantly reducing the GPU memory by 87% with comparable segmentation\nperformance on short and long video datasets. Notably on the LVOS dataset, our\nMAVOS achieves a J&F score of 63.3% while operating at 37 frames per second\n(FPS) on a single V100 GPU. Our code and models will be publicly available at:\nhttps://github.com/Amshaker/MAVOS.\n","authors":["Abdelrahman Shaker","Syed Talal Wasim","Martin Danelljan","Salman Khan","Ming-Hsuan Yang","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.17937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17936v1","updated":"2024-03-26T17:59:52Z","published":"2024-03-26T17:59:52Z","title":"ConvoFusion: Multi-Modal Conversational Diffusion for Co-Speech Gesture\n Synthesis","summary":" Gestures play a key role in human communication. Recent methods for co-speech\ngesture generation, while managing to generate beat-aligned motions, struggle\ngenerating gestures that are semantically aligned with the utterance. Compared\nto beat gestures that align naturally to the audio signal, semantically\ncoherent gestures require modeling the complex interactions between the\nlanguage and human motion, and can be controlled by focusing on certain words.\nTherefore, we present ConvoFusion, a diffusion-based approach for multi-modal\ngesture synthesis, which can not only generate gestures based on multi-modal\nspeech inputs, but can also facilitate controllability in gesture synthesis.\nOur method proposes two guidance objectives that allow the users to modulate\nthe impact of different conditioning modalities (e.g. audio vs text) as well as\nto choose certain words to be emphasized during gesturing. Our method is\nversatile in that it can be trained either for generating monologue gestures or\neven the conversational gestures. 
To further advance the research on\nmulti-party interactive gestures, the DnD Group Gesture dataset is released,\nwhich contains 6 hours of gesture data showing 5 people interacting with one\nanother. We compare our method with several recent works and demonstrate\neffectiveness of our method on a variety of tasks. We urge the reader to watch\nour supplementary video at our website.\n","authors":["Muhammad Hamza Mughal","Rishabh Dabral","Ikhsanul Habibie","Lucia Donatelli","Marc Habermann","Christian Theobalt"],"pdf_url":"https://arxiv.org/pdf/2403.17936v1.pdf","comment":"CVPR 2024. Project Page:\n https://vcai.mpi-inf.mpg.de/projects/ConvoFusion/"},{"id":"http://arxiv.org/abs/2403.17935v1","updated":"2024-03-26T17:59:24Z","published":"2024-03-26T17:59:24Z","title":"OmniVid: A Generative Framework for Universal Video Understanding","summary":" The core of video understanding tasks, such as recognition, captioning, and\ntracking, is to automatically detect objects or actions in a video and analyze\ntheir temporal evolution. Despite sharing a common goal, different tasks often\nrely on distinct model architectures and annotation formats. In contrast,\nnatural language processing benefits from a unified output space, i.e., text\nsequences, which simplifies the training of powerful foundational language\nmodels, such as GPT-3, with extensive training corpora. Inspired by this, we\nseek to unify the output space of video understanding tasks by using languages\nas labels and additionally introducing time and box tokens. In this way, a\nvariety of video tasks could be formulated as video-grounded token generation.\nThis enables us to address various types of video tasks, including\nclassification (such as action recognition), captioning (covering clip\ncaptioning, video question answering, and dense video captioning), and\nlocalization tasks (such as visual object tracking) within a fully shared\nencoder-decoder architecture, following a generative framework. Through\ncomprehensive experiments, we demonstrate such a simple and straightforward\nidea is quite effective and can achieve state-of-the-art or competitive results\non seven video benchmarks, providing a novel perspective for more universal\nvideo understanding. Code is available at https://github.com/wangjk666/OmniVid.\n","authors":["Junke Wang","Dongdong Chen","Chong Luo","Bo He","Lu Yuan","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.17935v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17934v1","updated":"2024-03-26T17:59:23Z","published":"2024-03-26T17:59:23Z","title":"AiOS: All-in-One-Stage Expressive Human Pose and Shape Estimation","summary":" Expressive human pose and shape estimation (a.k.a. 3D whole-body mesh\nrecovery) involves the human body, hand, and expression estimation. Most\nexisting methods have tackled this task in a two-stage manner, first detecting\nthe human body part with an off-the-shelf detection model and inferring the\ndifferent human body parts individually. Despite the impressive results\nachieved, these methods suffer from 1) loss of valuable contextual information\nvia cropping, 2) introducing distractions, and 3) lacking inter-association\namong different persons and body parts, inevitably causing performance\ndegradation, especially for crowded scenes. 
To address these issues, we\nintroduce a novel all-in-one-stage framework, AiOS, for multiple expressive\nhuman pose and shape recovery without an additional human detection step.\nSpecifically, our method is built upon DETR, which treats multi-person\nwhole-body mesh recovery task as a progressive set prediction problem with\nvarious sequential detection. We devise the decoder tokens and extend them to\nour task. Specifically, we first employ a human token to probe a human location\nin the image and encode global features for each instance, which provides a\ncoarse location for the later transformer block. Then, we introduce a\njoint-related token to probe the human joint in the image and encoder a\nfine-grained local feature, which collaborates with the global feature to\nregress the whole-body mesh. This straightforward but effective model\noutperforms previous state-of-the-art methods by a 9% reduction in NMVE on\nAGORA, a 30% reduction in PVE on EHF, a 10% reduction in PVE on ARCTIC, and a\n3% reduction in PVE on EgoBody.\n","authors":["Qingping Sun","Yanjun Wang","Ailing Zeng","Wanqi Yin","Chen Wei","Wenjia Wang","Haiyi Mei","Chi Sing Leung","Ziwei Liu","Lei Yang","Zhongang Cai"],"pdf_url":"https://arxiv.org/pdf/2403.17934v1.pdf","comment":"Homepage: https://ttxskk.github.io/AiOS/"},{"id":"http://arxiv.org/abs/2403.17933v1","updated":"2024-03-26T17:58:29Z","published":"2024-03-26T17:58:29Z","title":"SLEDGE: Synthesizing Simulation Environments for Driving Agents with\n Generative Models","summary":" SLEDGE is the first generative simulator for vehicle motion planning trained\non real-world driving logs. Its core component is a learned model that is able\nto generate agent bounding boxes and lane graphs. The model's outputs serve as\nan initial state for traffic simulation. The unique properties of the entities\nto be generated for SLEDGE, such as their connectivity and variable count per\nscene, render the naive application of most modern generative models to this\ntask non-trivial. Therefore, together with a systematic study of existing lane\ngraph representations, we introduce a novel raster-to-vector autoencoder\n(RVAE). It encodes agents and the lane graph into distinct channels in a\nrasterized latent map. This facilitates both lane-conditioned agent generation\nand combined generation of lanes and agents with a Diffusion Transformer. Using\ngenerated entities in SLEDGE enables greater control over the simulation, e.g.\nupsampling turns or increasing traffic density. Further, SLEDGE can support\n500m long routes, a capability not found in existing data-driven simulators\nlike nuPlan. It presents new challenges for planning algorithms, evidenced by\nfailure rates of over 40% for PDM, the winner of the 2023 nuPlan challenge,\nwhen tested on hard routes and dense traffic generated by our model. Compared\nto nuPlan, SLEDGE requires 500$\\times$ less storage to set up (<4GB), making it\na more accessible option and helping with democratizing future research in this\nfield.\n","authors":["Kashyap Chitta","Daniel Dauner","Andreas Geiger"],"pdf_url":"https://arxiv.org/pdf/2403.17933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17931v1","updated":"2024-03-26T17:58:22Z","published":"2024-03-26T17:58:22Z","title":"Track Everything Everywhere Fast and Robustly","summary":" We propose a novel test-time optimization approach for efficiently and\nrobustly tracking any pixel at any time in a video. 
The latest state-of-the-art\noptimization-based tracking technique, OmniMotion, requires a prohibitively\nlong optimization time, rendering it impractical for downstream applications.\nOmniMotion is sensitive to the choice of random seeds, leading to unstable\nconvergence. To improve efficiency and robustness, we introduce a novel\ninvertible deformation network, CaDeX++, which factorizes the function\nrepresentation into a local spatial-temporal feature grid and enhances the\nexpressivity of the coupling blocks with non-linear functions. While CaDeX++\nincorporates a stronger geometric bias within its architectural design, it also\ntakes advantage of the inductive bias provided by the vision foundation models.\nOur system utilizes monocular depth estimation to represent scene geometry and\nenhances the objective by incorporating DINOv2 long-term semantics to regulate\nthe optimization process. Our experiments demonstrate a substantial improvement\nin training speed (more than \\textbf{10 times} faster), robustness, and\naccuracy in tracking over the SoTA optimization-based method OmniMotion.\n","authors":["Yunzhou Song","Jiahui Lei","Ziyun Wang","Lingjie Liu","Kostas Daniilidis"],"pdf_url":"https://arxiv.org/pdf/2403.17931v1.pdf","comment":"project page: https://timsong412.github.io/FastOmniTrack/"},{"id":"http://arxiv.org/abs/2403.17929v1","updated":"2024-03-26T17:58:07Z","published":"2024-03-26T17:58:07Z","title":"Towards Explaining Hypercomplex Neural Networks","summary":" Hypercomplex neural networks are gaining increasing interest in the deep\nlearning community. The attention directed towards hypercomplex models\noriginates from several aspects, spanning from purely theoretical and\nmathematical characteristics to the practical advantage of lightweight models\nover conventional networks, and their unique properties to capture both global\nand local relations. In particular, a branch of these architectures,\nparameterized hypercomplex neural networks (PHNNs), has also gained popularity\ndue to their versatility across a multitude of application domains.\nNonetheless, only few attempts have been made to explain or interpret their\nintricacies. In this paper, we propose inherently interpretable PHNNs and\nquaternion-like networks, thus without the need for any post-hoc method. To\nachieve this, we define a type of cosine-similarity transform within the\nparameterized hypercomplex domain. This PHB-cos transform induces weight\nalignment with relevant input features and allows to reduce the model into a\nsingle linear transform, rendering it directly interpretable. In this work, we\nstart to draw insights into how this unique branch of neural models operates.\nWe observe that hypercomplex networks exhibit a tendency to concentrate on the\nshape around the main object of interest, in addition to the shape of the\nobject itself. We provide a thorough analysis, studying single neurons of\ndifferent layers and comparing them against how real-valued networks learn. 
The\ncode of the paper is available at https://github.com/ispamm/HxAI.\n","authors":["Eleonora Lopez","Eleonora Grassucci","Debora Capriotti","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2403.17929v1.pdf","comment":"The paper has been accepted at IEEE WCCI 2024"},{"id":"http://arxiv.org/abs/2403.17926v1","updated":"2024-03-26T17:57:20Z","published":"2024-03-26T17:57:20Z","title":"FastCAR: Fast Classification And Regression Multi-Task Learning via Task\n Consolidation for Modelling a Continuous Property Variable of Object Classes","summary":" FastCAR is a novel task consolidation approach in Multi-Task Learning (MTL)\nfor a classification and a regression task, despite task heterogeneity with\nonly subtle correlation. It addresses object classification and continuous\nproperty variable regression, a crucial use case in science and engineering.\nFastCAR involves a labeling transformation approach that can be used with a\nsingle-task regression network architecture. FastCAR outperforms traditional\nMTL model families, parametrized in the landscape of architecture and loss\nweighting schemes, when learning of both tasks are collectively considered\n(classification accuracy of 99.54%, regression mean absolute percentage error\nof 2.3%). The experiments performed used an Advanced Steel Property dataset\ncontributed by us. The dataset comprises 4536 images of 224x224 pixels,\nannotated with object classes and hardness properties that take continuous\nvalues. With the labeling transformation and single-task regression network\narchitecture, FastCAR achieves reduced latency and time efficiency.\n","authors":["Anoop Kini","Andreas Jansche","Timo Bernthaler","Gerhard Schneider"],"pdf_url":"https://arxiv.org/pdf/2403.17926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17924v1","updated":"2024-03-26T17:57:05Z","published":"2024-03-26T17:57:05Z","title":"AID: Attention Interpolation of Text-to-Image Diffusion","summary":" Conditional diffusion models can create unseen images in various settings,\naiding image interpolation. Interpolation in latent spaces is well-studied, but\ninterpolation with specific conditions like text or poses is less understood.\nSimple approaches, such as linear interpolation in the space of conditions,\noften result in images that lack consistency, smoothness, and fidelity. To that\nend, we introduce a novel training-free technique named Attention Interpolation\nvia Diffusion (AID). Our key contributions include 1) proposing an inner/outer\ninterpolated attention layer; 2) fusing the interpolated attention with\nself-attention to boost fidelity; and 3) applying beta distribution to\nselection to increase smoothness. We also present a variant, Prompt-guided\nAttention Interpolation via Diffusion (PAID), that considers interpolation as a\ncondition-dependent generative process. This method enables the creation of new\nimages with greater consistency, smoothness, and efficiency, and offers control\nover the exact path of interpolation. Our approach demonstrates effectiveness\nfor conceptual and spatial interpolation. 
Code and demo are available at\nhttps://github.com/QY-H00/attention-interpolation-diffusion.\n","authors":["Qiyuan He","Jinghao Wang","Ziwei Liu","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2403.17924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17920v1","updated":"2024-03-26T17:55:11Z","published":"2024-03-26T17:55:11Z","title":"TC4D: Trajectory-Conditioned Text-to-4D Generation","summary":" Recent techniques for text-to-4D generation synthesize dynamic 3D scenes\nusing supervision from pre-trained text-to-video models. However, existing\nrepresentations for motion, such as deformation models or time-dependent neural\nrepresentations, are limited in the amount of motion they can generate-they\ncannot synthesize motion extending far beyond the bounding box used for volume\nrendering. The lack of a more flexible motion model contributes to the gap in\nrealism between 4D generation methods and recent, near-photorealistic video\ngeneration models. Here, we propose TC4D: trajectory-conditioned text-to-4D\ngeneration, which factors motion into global and local components. We represent\nthe global motion of a scene's bounding box using rigid transformation along a\ntrajectory parameterized by a spline. We learn local deformations that conform\nto the global trajectory using supervision from a text-to-video model. Our\napproach enables the synthesis of scenes animated along arbitrary trajectories,\ncompositional scene generation, and significant improvements to the realism and\namount of generated motion, which we evaluate qualitatively and through a user\nstudy. Video results can be viewed on our website:\nhttps://sherwinbahmani.github.io/tc4d.\n","authors":["Sherwin Bahmani","Xian Liu","Yifan Wang","Ivan Skorokhodov","Victor Rong","Ziwei Liu","Xihui Liu","Jeong Joon Park","Sergey Tulyakov","Gordon Wetzstein","Andrea Tagliasacchi","David B. Lindell"],"pdf_url":"https://arxiv.org/pdf/2403.17920v1.pdf","comment":"Project Page: https://sherwinbahmani.github.io/tc4d"},{"id":"http://arxiv.org/abs/2403.17916v1","updated":"2024-03-26T17:53:27Z","published":"2024-03-26T17:53:27Z","title":"CMP: Cooperative Motion Prediction with Multi-Agent Communication","summary":" The confluence of the advancement of Autonomous Vehicles (AVs) and the\nmaturity of Vehicle-to-Everything (V2X) communication has enabled the\ncapability of cooperative connected and automated vehicles (CAVs). Building on\ntop of cooperative perception, this paper explores the feasibility and\neffectiveness of cooperative motion prediction. Our method, CMP, takes LiDAR\nsignals as input to enhance tracking and prediction capabilities. Unlike\nprevious work that focuses separately on either cooperative perception or\nmotion prediction, our framework, to the best of our knowledge, is the first to\naddress the unified problem where CAVs share information in both perception and\nprediction modules. Incorporated into our design is the unique capability to\ntolerate realistic V2X bandwidth limitations and transmission delays, while\ndealing with bulky perception representations. We also propose a prediction\naggregation module, which unifies the predictions obtained by different CAVs\nand generates the final prediction. Through extensive experiments and ablation\nstudies, we demonstrate the effectiveness of our method in cooperative\nperception, tracking, and motion prediction tasks. In particular, CMP reduces\nthe average prediction error by 17.2\\% with fewer missing detections compared\nwith the no cooperation setting. 
Our work marks a significant step forward in\nthe cooperative capabilities of CAVs, showcasing enhanced performance in\ncomplex scenarios.\n","authors":["Zhuoyuan Wu","Yuping Wang","Hengbo Ma","Zhaowei Li","Hang Qiu","Jiachen Li"],"pdf_url":"https://arxiv.org/pdf/2403.17916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17915v1","updated":"2024-03-26T17:52:23Z","published":"2024-03-26T17:52:23Z","title":"Leveraging Near-Field Lighting for Monocular Depth Estimation from\n Endoscopy Videos","summary":" Monocular depth estimation in endoscopy videos can enable assistive and\nrobotic surgery to obtain better coverage of the organ and detection of various\nhealth issues. Despite promising progress on mainstream, natural image depth\nestimation, techniques perform poorly on endoscopy images due to a lack of\nstrong geometric features and challenging illumination effects. In this paper,\nwe utilize the photometric cues, i.e., the light emitted from an endoscope and\nreflected by the surface, to improve monocular depth estimation. We first\ncreate two novel loss functions with supervised and self-supervised variants\nthat utilize a per-pixel shading representation. We then propose a novel depth\nrefinement network (PPSNet) that leverages the same per-pixel shading\nrepresentation. Finally, we introduce teacher-student transfer learning to\nproduce better depth maps from both synthetic data with supervision and\nclinical data with self-supervision. We achieve state-of-the-art results on the\nC3VD dataset while estimating high-quality depth maps from clinical data. Our\ncode, pre-trained models, and supplementary materials can be found on our\nproject page: https://ppsnet.github.io/\n","authors":["Akshay Paruchuri","Samuel Ehrenstein","Shuxian Wang","Inbar Fried","Stephen M. Pizer","Marc Niethammer","Roni Sengupta"],"pdf_url":"https://arxiv.org/pdf/2403.17915v1.pdf","comment":"26 pages, 7 tables, 7 figures"},{"id":"http://arxiv.org/abs/2403.17909v1","updated":"2024-03-26T17:46:25Z","published":"2024-03-26T17:46:25Z","title":"ELGC-Net: Efficient Local-Global Context Aggregation for Remote Sensing\n Change Detection","summary":" Deep learning has shown remarkable success in remote sensing change detection\n(CD), aiming to identify semantic change regions between co-registered\nsatellite image pairs acquired at distinct time stamps. However, existing\nconvolutional neural network and transformer-based frameworks often struggle to\naccurately segment semantic change regions. Moreover, transformers-based\nmethods with standard self-attention suffer from quadratic computational\ncomplexity with respect to the image resolution, making them less practical for\nCD tasks with limited training data. To address these issues, we propose an\nefficient change detection framework, ELGC-Net, which leverages rich contextual\ninformation to precisely estimate change regions while reducing the model size.\nOur ELGC-Net comprises a Siamese encoder, fusion modules, and a decoder. The\nfocus of our design is the introduction of an Efficient Local-Global Context\nAggregator module within the encoder, capturing enhanced global context and\nlocal spatial information through a novel pooled-transpose (PT) attention and\ndepthwise convolution, respectively. The PT attention employs pooling\noperations for robust feature extraction and minimizes computational cost with\ntransposed attention. Extensive experiments on three challenging CD datasets\ndemonstrate that ELGC-Net outperforms existing methods. 
Compared to the recent\ntransformer-based CD approach (ChangeFormer), ELGC-Net achieves a 1.4% gain in\nintersection over union metric on the LEVIR-CD dataset, while significantly\nreducing trainable parameters. Our proposed ELGC-Net sets a new\nstate-of-the-art performance in remote sensing change detection benchmarks.\nFinally, we also introduce ELGC-Net-LW, a lighter variant with significantly\nreduced computational complexity, suitable for resource-constrained settings,\nwhile achieving comparable performance. Project url\nhttps://github.com/techmn/elgcnet.\n","authors":["Mubashir Noman","Mustansar Fiaz","Hisham Cholakkal","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.17909v1.pdf","comment":"accepted at IEEE TGRS"},{"id":"http://arxiv.org/abs/2403.17905v1","updated":"2024-03-26T17:45:06Z","published":"2024-03-26T17:45:06Z","title":"Scalable Non-Cartesian Magnetic Resonance Imaging with R2D2","summary":" We propose a new approach for non-Cartesian magnetic resonance image\nreconstruction. While unrolled architectures provide robustness via\ndata-consistency layers, embedding measurement operators in Deep Neural Network\n(DNN) can become impractical at large scale. Alternative Plug-and-Play (PnP)\napproaches, where the denoising DNNs are blind to the measurement setting, are\nnot affected by this limitation and have also proven effective, but their\nhighly iterative nature also affects scalability. To address this scalability\nchallenge, we leverage the \"Residual-to-Residual DNN series for high-Dynamic\nrange imaging (R2D2)\" approach recently introduced in astronomical imaging.\nR2D2's reconstruction is formed as a series of residual images, iteratively\nestimated as outputs of DNNs taking the previous iteration's image estimate and\nassociated data residual as inputs. The method can be interpreted as a learned\nversion of the Matching Pursuit algorithm. We demonstrate R2D2 in simulation,\nconsidering radial k-space sampling acquisition sequences. Our preliminary\nresults suggest that R2D2 achieves: (i) suboptimal performance compared to its\nunrolled incarnation R2D2-Net, which is however non-scalable due to the\nnecessary embedding of NUFFT-based data-consistency layers; (ii) superior\nreconstruction quality to a scalable version of R2D2-Net embedding an FFT-based\napproximation for data consistency; (iii) superior reconstruction quality to\nPnP, while only requiring few iterations.\n","authors":["Chen Yiwei","Tang Chao","Aghabiglou Amir","Chu Chung San","Wiaux Yves"],"pdf_url":"https://arxiv.org/pdf/2403.17905v1.pdf","comment":"submitted to IEEE EUSIPCO 2024"},{"id":"http://arxiv.org/abs/2403.17902v1","updated":"2024-03-26T17:43:15Z","published":"2024-03-26T17:43:15Z","title":"Serpent: Scalable and Efficient Image Restoration via Multi-scale\n Structured State Space Models","summary":" The landscape of computational building blocks of efficient image restoration\narchitectures is dominated by a combination of convolutional processing and\nvarious attention mechanisms. However, convolutional filters are inherently\nlocal and therefore struggle at modeling long-range dependencies in images. On\nthe other hand, attention excels at capturing global interactions between\narbitrary image regions, however at a quadratic cost in image dimension. In\nthis work, we propose Serpent, an architecture that leverages recent advances\nin state space models (SSMs) in its core computational block. 
SSMs, originally\nintroduced for sequence modeling, can maintain a global receptive field with a\nfavorable linear scaling in input size. Our preliminary results demonstrate\nthat Serpent can achieve reconstruction quality on par with state-of-the-art\ntechniques, while requiring orders of magnitude less compute (up to $150$ fold\nreduction in FLOPS) and a factor of up to $5\\times$ less GPU memory while\nmaintaining a compact model size.\n","authors":["Mohammad Shahab Sepehri","Zalan Fabian","Mahdi Soltanolkotabi"],"pdf_url":"https://arxiv.org/pdf/2403.17902v1.pdf","comment":"7 pages, 5 figures, preliminary workshop submission of a\n comprehensive work to be released soon"},{"id":"http://arxiv.org/abs/2307.16897v2","updated":"2024-03-26T17:40:47Z","published":"2023-07-31T17:59:48Z","title":"DiVa-360: The Dynamic Visual Dataset for Immersive Neural Fields","summary":" Advances in neural fields are enabling high-fidelity capture of the shape and\nappearance of dynamic 3D scenes. However, their capabilities lag behind those\noffered by conventional representations such as 2D videos because of\nalgorithmic challenges and the lack of large-scale multi-view real-world\ndatasets. We address the dataset limitation with DiVa-360, a real-world 360\ndynamic visual dataset that contains synchronized high-resolution and\nlong-duration multi-view video sequences of table-scale scenes captured using a\ncustomized low-cost system with 53 cameras. It contains 21 object-centric\nsequences categorized by different motion types, 25 intricate hand-object\ninteraction sequences, and 8 long-duration sequences for a total of 17.4 M\nimage frames. In addition, we provide foreground-background segmentation masks,\nsynchronized audio, and text descriptions. We benchmark the state-of-the-art\ndynamic neural field methods on DiVa-360 and provide insights about existing\nmethods and future challenges on long-duration neural field capture.\n","authors":["Cheng-You Lu","Peisen Zhou","Angela Xing","Chandradeep Pokhariya","Arnab Dey","Ishaan Shah","Rugved Mavidipalli","Dylan Hu","Andrew Comport","Kefan Chen","Srinath Sridhar"],"pdf_url":"https://arxiv.org/pdf/2307.16897v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17898v1","updated":"2024-03-26T17:39:36Z","published":"2024-03-26T17:39:36Z","title":"Octree-GS: Towards Consistent Real-time Rendering with LOD-Structured 3D\n Gaussians","summary":" The recent 3D Gaussian splatting (3D-GS) has shown remarkable rendering\nfidelity and efficiency compared to NeRF-based neural scene representations.\nWhile demonstrating the potential for real-time rendering, 3D-GS encounters\nrendering bottlenecks in large scenes with complex details due to an excessive\nnumber of Gaussian primitives located within the viewing frustum. This\nlimitation is particularly noticeable in zoom-out views and can lead to\ninconsistent rendering speeds in scenes with varying details. Moreover, it\noften struggles to capture the corresponding level of details at different\nscales with its heuristic density control operation. Inspired by the\nLevel-of-Detail (LOD) techniques, we introduce Octree-GS, featuring an\nLOD-structured 3D Gaussian approach supporting level-of-detail decomposition\nfor scene representation that contributes to the final rendering results. 
Our\nmodel dynamically selects the appropriate level from the set of\nmulti-resolution anchor points, ensuring consistent rendering performance with\nadaptive LOD adjustments while maintaining high-fidelity rendering results.\n","authors":["Kerui Ren","Lihan Jiang","Tao Lu","Mulin Yu","Linning Xu","Zhangkai Ni","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2403.17898v1.pdf","comment":"Project page: https://city-super.github.io/octree-gs/"},{"id":"http://arxiv.org/abs/2403.17893v1","updated":"2024-03-26T17:29:26Z","published":"2024-03-26T17:29:26Z","title":"A Survey on 3D Egocentric Human Pose Estimation","summary":" Egocentric human pose estimation aims to estimate human body poses and\ndevelop body representations from a first-person camera perspective. It has\ngained vast popularity in recent years because of its wide range of\napplications in sectors like XR-technologies, human-computer interaction, and\nfitness tracking. However, to the best of our knowledge, there is no systematic\nliterature review based on the proposed solutions regarding egocentric 3D human\npose estimation. To that end, the aim of this survey paper is to provide an\nextensive overview of the current state of egocentric pose estimation research.\nIn this paper, we categorize and discuss the popular datasets and the different\npose estimation models, highlighting the strengths and weaknesses of different\nmethods by comparative analysis. This survey can be a valuable resource for\nboth researchers and practitioners in the field, offering insights into key\nconcepts and cutting-edge solutions in egocentric pose estimation, its\nwide-ranging applications, as well as the open problems with future scope.\n","authors":["Md Mushfiqur Azam","Kevin Desai"],"pdf_url":"https://arxiv.org/pdf/2403.17893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17888v1","updated":"2024-03-26T17:21:24Z","published":"2024-03-26T17:21:24Z","title":"2D Gaussian Splatting for Geometrically Accurate Radiance Fields","summary":" 3D Gaussian Splatting (3DGS) has recently revolutionized radiance field\nreconstruction, achieving high quality novel view synthesis and fast rendering\nspeed without baking. However, 3DGS fails to accurately represent surfaces due\nto the multi-view inconsistent nature of 3D Gaussians. We present 2D Gaussian\nSplatting (2DGS), a novel approach to model and reconstruct geometrically\naccurate radiance fields from multi-view images. Our key idea is to collapse\nthe 3D volume into a set of 2D oriented planar Gaussian disks. Unlike 3D\nGaussians, 2D Gaussians provide view-consistent geometry while modeling\nsurfaces intrinsically. To accurately recover thin surfaces and achieve stable\noptimization, we introduce a perspective-accurate 2D splatting process\nutilizing ray-splat intersection and rasterization. Additionally, we\nincorporate depth distortion and normal consistency terms to further enhance\nthe quality of the reconstructions. We demonstrate that our differentiable\nrenderer allows for noise-free and detailed geometry reconstruction while\nmaintaining competitive appearance quality, fast training speed, and real-time\nrendering. 
Our code will be made publicly available.\n","authors":["Binbin Huang","Zehao Yu","Anpei Chen","Andreas Geiger","Shenghua Gao"],"pdf_url":"https://arxiv.org/pdf/2403.17888v1.pdf","comment":"12 pages, 12 figures"},{"id":"http://arxiv.org/abs/2403.17884v1","updated":"2024-03-26T17:16:04Z","published":"2024-03-26T17:16:04Z","title":"Sen2Fire: A Challenging Benchmark Dataset for Wildfire Detection using\n Sentinel Data","summary":" Utilizing satellite imagery for wildfire detection presents substantial\npotential for practical applications. To advance the development of machine\nlearning algorithms in this domain, our study introduces the \\textit{Sen2Fire}\ndataset--a challenging satellite remote sensing dataset tailored for wildfire\ndetection. This dataset is curated from Sentinel-2 multi-spectral data and\nSentinel-5P aerosol product, comprising a total of 2466 image patches. Each\npatch has a size of 512$\\times$512 pixels with 13 bands. Given the distinctive\nsensitivities of various wavebands to wildfire responses, our research focuses\non optimizing wildfire detection by evaluating different wavebands and\nemploying a combination of spectral indices, such as normalized burn ratio\n(NBR) and normalized difference vegetation index (NDVI). The results suggest\nthat, in contrast to using all bands for wildfire detection, selecting specific\nband combinations yields superior performance. Additionally, our study\nunderscores the positive impact of integrating Sentinel-5 aerosol data for\nwildfire detection. The code and dataset are available online\n(https://zenodo.org/records/10881058).\n","authors":["Yonghao Xu","Amanda Berg","Leif Haglund"],"pdf_url":"https://arxiv.org/pdf/2403.17884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02640v3","updated":"2024-03-26T17:14:14Z","published":"2024-03-05T04:08:19Z","title":"HoloVIC: Large-scale Dataset and Benchmark for Multi-Sensor Holographic\n Intersection and Vehicle-Infrastructure Cooperative","summary":" Vehicle-to-everything (V2X) is a popular topic in the field of Autonomous\nDriving in recent years. Vehicle-infrastructure cooperation (VIC) becomes one\nof the important research area. Due to the complexity of traffic conditions\nsuch as blind spots and occlusion, it greatly limits the perception\ncapabilities of single-view roadside sensing systems. To further enhance the\naccuracy of roadside perception and provide better information to the vehicle\nside, in this paper, we constructed holographic intersections with various\nlayouts to build a large-scale multi-sensor holographic vehicle-infrastructure\ncooperation dataset, called HoloVIC. Our dataset includes 3 different types of\nsensors (Camera, Lidar, Fisheye) and employs 4 sensor-layouts based on the\ndifferent intersections. Each intersection is equipped with 6-18 sensors to\ncapture synchronous data. While autonomous vehicles pass through these\nintersections for collecting VIC data. HoloVIC contains in total on 100k+\nsynchronous frames from different sensors. Additionally, we annotated 3D\nbounding boxes based on Camera, Fisheye, and Lidar. We also associate the IDs\nof the same objects across different devices and consecutive frames in\nsequence. Based on HoloVIC, we formulated four tasks to facilitate the\ndevelopment of related research. 
We also provide benchmarks for these tasks.\n","authors":["Cong Ma","Lei Qiao","Chengkai Zhu","Kai Liu","Zelong Kong","Qing Li","Xueqi Zhou","Yuheng Kan","Wei Wu"],"pdf_url":"https://arxiv.org/pdf/2403.02640v3.pdf","comment":"Accept to CVPR 2024, Benchmark Website: https://holovic.net"},{"id":"http://arxiv.org/abs/2403.17883v1","updated":"2024-03-26T17:13:17Z","published":"2024-03-26T17:13:17Z","title":"Superior and Pragmatic Talking Face Generation with Teacher-Student\n Framework","summary":" Talking face generation technology creates talking videos from arbitrary\nappearance and motion signal, with the \"arbitrary\" offering ease of use but\nalso introducing challenges in practical applications. Existing methods work\nwell with standard inputs but suffer serious performance degradation with\nintricate real-world ones. Moreover, efficiency is also an important concern in\ndeployment. To comprehensively address these issues, we introduce SuperFace, a\nteacher-student framework that balances quality, robustness, cost and\neditability. We first propose a simple but effective teacher model capable of\nhandling inputs of varying qualities to generate high-quality results. Building\non this, we devise an efficient distillation strategy to acquire an\nidentity-specific student model that maintains quality with significantly\nreduced computational load. Our experiments validate that SuperFace offers a\nmore comprehensive solution than existing methods for the four mentioned\nobjectives, especially in reducing FLOPs by 99\\% with the student model.\nSuperFace can be driven by both video and audio and allows for localized facial\nattributes editing.\n","authors":["Chao Liang","Jianwen Jiang","Tianyun Zhong","Gaojie Lin","Zhengkun Rong","Jiaqi Yang","Yongming Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.17883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17881v1","updated":"2024-03-26T17:12:34Z","published":"2024-03-26T17:12:34Z","title":"Deepfake Generation and Detection: A Benchmark and Survey","summary":" In addition to the advancements in deepfake generation, corresponding\ndetection technologies need to continuously evolve to regulate the potential\nmisuse of deepfakes, such as for privacy invasion and phishing attacks. This\nsurvey comprehensively reviews the latest developments in deepfake generation\nand detection, summarizing and analyzing the current state of the art in this\nrapidly evolving field. We first unify task definitions, comprehensively\nintroduce datasets and metrics, and discuss the development of generation and\ndetection technology frameworks. Then, we discuss the development of several\nrelated sub-fields and focus on researching four mainstream deepfake fields:\npopular face swap, face reenactment, talking face generation, and facial\nattribute editing, as well as foreign detection. Subsequently, we\ncomprehensively benchmark representative methods on popular datasets for each\nfield, fully evaluating the latest and influential works published in top\nconferences/journals. Finally, we analyze the challenges and future research\ndirections of the discussed fields. 
We closely follow the latest developments\nin https://github.com/flyingby/Awesome-Deepfake-Generation-and-Detection.\n","authors":["Gan Pei","Jiangning Zhang","Menghan Hu","Guangtao Zhai","Chengjie Wang","Zhenyu Zhang","Jian Yang","Chunhua Shen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.17881v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17879v1","updated":"2024-03-26T17:11:51Z","published":"2024-03-26T17:11:51Z","title":"Low-Latency Neural Stereo Streaming","summary":" The rise of new video modalities like virtual reality or autonomous driving\nhas increased the demand for efficient multi-view video compression methods,\nboth in terms of rate-distortion (R-D) performance and in terms of delay and\nruntime. While most recent stereo video compression approaches have shown\npromising performance, they compress left and right views sequentially, leading\nto poor parallelization and runtime performance. This work presents Low-Latency\nneural codec for Stereo video Streaming (LLSS), a novel parallel stereo video\ncoding method designed for fast and efficient low-latency stereo video\nstreaming. Instead of using a sequential cross-view motion compensation like\nexisting methods, LLSS introduces a bidirectional feature shifting module to\ndirectly exploit mutual information among views and encode them effectively\nwith a joint cross-view prior model for entropy coding. Thanks to this design,\nLLSS processes left and right views in parallel, minimizing latency; all while\nsubstantially improving R-D performance compared to both existing neural and\nconventional codecs.\n","authors":["Qiqi Hou","Farzad Farhadzadeh","Amir Said","Guillaume Sautiere","Hoang Le"],"pdf_url":"https://arxiv.org/pdf/2403.17879v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.17870v1","updated":"2024-03-26T16:57:55Z","published":"2024-03-26T16:57:55Z","title":"Boosting Diffusion Models with Moving Average Sampling in Frequency\n Domain","summary":" Diffusion models have recently brought a powerful revolution in image\ngeneration. Despite showing impressive generative capabilities, most of these\nmodels rely on the current sample to denoise the next one, possibly resulting\nin denoising instability. In this paper, we reinterpret the iterative denoising\nprocess as model optimization and leverage a moving average mechanism to\nensemble all the prior samples. Instead of simply applying moving average to\nthe denoised samples at different timesteps, we first map the denoised samples\nto data space and then perform moving average to avoid distribution shift\nacross timesteps. In view that diffusion models evolve the recovery from\nlow-frequency components to high-frequency details, we further decompose the\nsamples into different frequency components and execute moving average\nseparately on each component. We name the complete approach \"Moving Average\nSampling in Frequency domain (MASF)\". MASF could be seamlessly integrated into\nmainstream pre-trained diffusion models and sampling schedules. 
Extensive\nexperiments on both unconditional and conditional diffusion models demonstrate\nthat our MASF leads to superior performances compared to the baselines, with\nalmost negligible additional complexity cost.\n","authors":["Yurui Qian","Qi Cai","Yingwei Pan","Yehao Li","Ting Yao","Qibin Sun","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2403.17870v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17869v1","updated":"2024-03-26T16:57:33Z","published":"2024-03-26T16:57:33Z","title":"To Supervise or Not to Supervise: Understanding and Addressing the Key\n Challenges of 3D Transfer Learning","summary":" Transfer learning has long been a key factor in the advancement of many\nfields including 2D image analysis. Unfortunately, its applicability in 3D data\nprocessing has been relatively limited. While several approaches for 3D\ntransfer learning have been proposed in recent literature, with contrastive\nlearning gaining particular prominence, most existing methods in this domain\nhave only been studied and evaluated in limited scenarios. Most importantly,\nthere is currently a lack of principled understanding of both when and why 3D\ntransfer learning methods are applicable. Remarkably, even the applicability of\nstandard supervised pre-training is poorly understood. In this work, we conduct\nthe first in-depth quantitative and qualitative investigation of supervised and\ncontrastive pre-training strategies and their utility in downstream 3D tasks.\nWe demonstrate that layer-wise analysis of learned features provides\nsignificant insight into the downstream utility of trained networks. Informed\nby this analysis, we propose a simple geometric regularization strategy, which\nimproves the transferability of supervised pre-training. Our work thus sheds\nlight onto both the specific challenges of 3D transfer learning, as well as\nstrategies to overcome them.\n","authors":["Souhail Hadgi","Lei Li","Maks Ovsjanikov"],"pdf_url":"https://arxiv.org/pdf/2403.17869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17846v1","updated":"2024-03-26T16:36:43Z","published":"2024-03-26T16:36:43Z","title":"Hierarchical Open-Vocabulary 3D Scene Graphs for Language-Grounded Robot\n Navigation","summary":" Recent open-vocabulary robot mapping methods enrich dense geometric maps with\npre-trained visual-language features. While these maps allow for the prediction\nof point-wise saliency maps when queried for a certain language concept,\nlarge-scale environments and abstract queries beyond the object level still\npose a considerable hurdle, ultimately limiting language-grounded robotic\nnavigation. In this work, we present HOV-SG, a hierarchical open-vocabulary 3D\nscene graph mapping approach for language-grounded robot navigation. Leveraging\nopen-vocabulary vision foundation models, we first obtain state-of-the-art\nopen-vocabulary segment-level maps in 3D and subsequently construct a 3D scene\ngraph hierarchy consisting of floor, room, and object concepts, each enriched\nwith open-vocabulary features. Our approach is able to represent multi-story\nbuildings and allows robotic traversal of those using a cross-floor Voronoi\ngraph. HOV-SG is evaluated on three distinct datasets and surpasses previous\nbaselines in open-vocabulary semantic accuracy on the object, room, and floor\nlevel while producing a 75% reduction in representation size compared to dense\nopen-vocabulary maps. 
In order to prove the efficacy and generalization\ncapabilities of HOV-SG, we showcase successful long-horizon\nlanguage-conditioned robot navigation within real-world multi-storage\nenvironments. We provide code and trial video data at http://hovsg.github.io/.\n","authors":["Abdelrhman Werby","Chenguang Huang","Martin Büchner","Abhinav Valada","Wolfram Burgard"],"pdf_url":"https://arxiv.org/pdf/2403.17846v1.pdf","comment":"Code and video are available at http://hovsg.github.io/"},{"id":"http://arxiv.org/abs/2401.06003v2","updated":"2024-03-26T16:30:20Z","published":"2024-01-11T16:06:36Z","title":"TRIPS: Trilinear Point Splatting for Real-Time Radiance Field Rendering","summary":" Point-based radiance field rendering has demonstrated impressive results for\nnovel view synthesis, offering a compelling blend of rendering quality and\ncomputational efficiency. However, also latest approaches in this domain are\nnot without their shortcomings. 3D Gaussian Splatting [Kerbl and Kopanas et al.\n2023] struggles when tasked with rendering highly detailed scenes, due to\nblurring and cloudy artifacts. On the other hand, ADOP [R\\\"uckert et al. 2022]\ncan accommodate crisper images, but the neural reconstruction network decreases\nperformance, it grapples with temporal instability and it is unable to\neffectively address large gaps in the point cloud.\n In this paper, we present TRIPS (Trilinear Point Splatting), an approach that\ncombines ideas from both Gaussian Splatting and ADOP. The fundamental concept\nbehind our novel technique involves rasterizing points into a screen-space\nimage pyramid, with the selection of the pyramid layer determined by the\nprojected point size. This approach allows rendering arbitrarily large points\nusing a single trilinear write. A lightweight neural network is then used to\nreconstruct a hole-free image including detail beyond splat resolution.\nImportantly, our render pipeline is entirely differentiable, allowing for\nautomatic optimization of both point sizes and positions.\n Our evaluation demonstrate that TRIPS surpasses existing state-of-the-art\nmethods in terms of rendering quality while maintaining a real-time frame rate\nof 60 frames per second on readily available hardware. This performance extends\nto challenging scenarios, such as scenes featuring intricate geometry,\nexpansive landscapes, and auto-exposed footage.\n The project page is located at: https://lfranke.github.io/trips/\n","authors":["Linus Franke","Darius Rückert","Laura Fink","Marc Stamminger"],"pdf_url":"https://arxiv.org/pdf/2401.06003v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17839v1","updated":"2024-03-26T16:27:37Z","published":"2024-03-26T16:27:37Z","title":"ReMamber: Referring Image Segmentation with Mamba Twister","summary":" Referring Image Segmentation (RIS) leveraging transformers has achieved great\nsuccess on the interpretation of complex visual-language tasks. However, the\nquadratic computation cost makes it resource-consuming in capturing long-range\nvisual-language dependencies. Fortunately, Mamba addresses this with efficient\nlinear complexity in processing. However, directly applying Mamba to\nmulti-modal interactions presents challenges, primarily due to inadequate\nchannel interactions for the effective fusion of multi-modal data. In this\npaper, we propose ReMamber, a novel RIS architecture that integrates the power\nof Mamba with a multi-modal Mamba Twister block. 
The Mamba Twister explicitly\nmodels image-text interaction, and fuses textual and visual features through\nits unique channel and spatial twisting mechanism. We achieve the\nstate-of-the-art on three challenging benchmarks. Moreover, we conduct thorough\nanalyses of ReMamber and discuss other fusion designs using Mamba. These\nprovide valuable perspectives for future research.\n","authors":["Yuhuan Yang","Chaofan Ma","Jiangchao Yao","Zhun Zhong","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.17839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17837v1","updated":"2024-03-26T16:24:42Z","published":"2024-03-26T16:24:42Z","title":"GTA-HDR: A Large-Scale Synthetic Dataset for HDR Image Reconstruction","summary":" High Dynamic Range (HDR) content (i.e., images and videos) has a broad range\nof applications. However, capturing HDR content from real-world scenes is\nexpensive and time- consuming. Therefore, the challenging task of\nreconstructing visually accurate HDR images from their Low Dynamic Range (LDR)\ncounterparts is gaining attention in the vision research community. A major\nchallenge in this research problem is the lack of datasets, which capture\ndiverse scene conditions (e.g., lighting, shadows, weather, locations,\nlandscapes, objects, humans, buildings) and various image features (e.g.,\ncolor, contrast, saturation, hue, luminance, brightness, radiance). To address\nthis gap, in this paper, we introduce GTA-HDR, a large-scale synthetic dataset\nof photo-realistic HDR images sampled from the GTA-V video game. We perform\nthorough evaluation of the proposed dataset, which demonstrates significant\nqualitative and quantitative improvements of the state-of-the-art HDR image\nreconstruction methods. Furthermore, we demonstrate the effectiveness of the\nproposed dataset and its impact on additional computer vision tasks including\n3D human pose estimation, human body part segmentation, and holistic scene\nsegmentation. The dataset, data collection pipeline, and evaluation code are\navailable at: https://github.com/HrishavBakulBarua/GTA-HDR.\n","authors":["Hrishav Bakul Barua","Kalin Stefanov","KokSheik Wong","Abhinav Dhall","Ganesh Krishnasamy"],"pdf_url":"https://arxiv.org/pdf/2403.17837v1.pdf","comment":"Submitted to IEEE"},{"id":"http://arxiv.org/abs/2403.17834v1","updated":"2024-03-26T16:19:56Z","published":"2024-03-26T16:19:56Z","title":"A foundation model utilizing chest CT volumes and radiology reports for\n supervised-level zero-shot detection of abnormalities","summary":" A major challenge in computational research in 3D medical imaging is the lack\nof comprehensive datasets. Addressing this issue, our study introduces CT-RATE,\nthe first 3D medical imaging dataset that pairs images with textual reports.\nCT-RATE consists of 25,692 non-contrast chest CT volumes, expanded to 50,188\nthrough various reconstructions, from 21,304 unique patients, along with\ncorresponding radiology text reports. Leveraging CT-RATE, we developed CT-CLIP,\na CT-focused contrastive language-image pre-training framework. As a versatile,\nself-supervised model, CT-CLIP is designed for broad application and does not\nrequire task-specific training. Remarkably, CT-CLIP outperforms\nstate-of-the-art, fully supervised methods in multi-abnormality detection\nacross all key metrics, thus eliminating the need for manual annotation. We\nalso demonstrate its utility in case retrieval, whether using imagery or\ntextual queries, thereby advancing knowledge dissemination. 
The open-source\nrelease of CT-RATE and CT-CLIP marks a significant advancement in medical AI,\nenhancing 3D imaging analysis and fostering innovation in healthcare.\n","authors":["Ibrahim Ethem Hamamci","Sezgin Er","Furkan Almas","Ayse Gulnihan Simsek","Sevval Nil Esirgun","Irem Dogan","Muhammed Furkan Dasdelen","Bastian Wittmann","Enis Simsar","Mehmet Simsar","Emine Bensu Erdemir","Abdullah Alanbay","Anjany Sekuboyina","Berkan Lafci","Mehmet K. Ozdemir","Bjoern Menze"],"pdf_url":"https://arxiv.org/pdf/2403.17834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.13969v3","updated":"2024-03-26T16:13:26Z","published":"2021-08-31T16:51:00Z","title":"Semi-Supervised Crowd Counting from Unlabeled Data","summary":" Automatic Crowd behavior analysis can be applied to effectively help the\ndaily transportation statistics and planning, which helps the smart city\nconstruction. As one of the most important keys, crowd counting has drawn\nincreasing attention. Recent works achieved promising performance but relied on\nthe supervised paradigm with expensive crowd annotations. To alleviate the\nannotation cost in real-world transportation scenarios, in this work we\nproposed a semi-supervised learning framework $S^{4}\\textit{Crowd}$, which can\nleverage both unlabeled/labeled data for robust crowd counting. In the\nunsupervised pathway, two \\textit{self-supervised losses} were proposed to\nsimulate the crowd variations such as scale, illumination, based on which\nsupervised information pseudo labels were generated and gradually refined. We\nalso proposed a crowd-driven recurrent unit \\textit{Gated-Crowd-Recurrent-Unit\n(GCRU)}, which can preserve discriminant crowd information by extracting\nsecond-order statistics, yielding pseudo labels with improved quality. A joint\nloss including both unsupervised/supervised information was proposed, and a\ndynamic weighting strategy was employed to balance the importance of the\nunsupervised loss and supervised loss at different training stages. We\nconducted extensive experiments on four popular crowd counting datasets in\nsemi-supervised settings. Experimental results supported the effectiveness of\neach proposed component in our $S^{4}$Crowd framework. Our method achieved\ncompetitive performance in semi-supervised learning approaches on these crowd\ncounting datasets.\n","authors":["Haoran Duan","Fan Wan","Rui Sun","Zeyu Wang","Varun Ojha","Yu Guan","Hubert P. H. Shum","Bingzhang Hu","Yang Long"],"pdf_url":"https://arxiv.org/pdf/2108.13969v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17830v1","updated":"2024-03-26T16:10:21Z","published":"2024-03-26T16:10:21Z","title":"Assessment of Multimodal Large Language Models in Alignment with Human\n Values","summary":" Large Language Models (LLMs) aim to serve as versatile assistants aligned\nwith human values, as defined by the principles of being helpful, honest, and\nharmless (hhh). However, in terms of Multimodal Large Language Models (MLLMs),\ndespite their commendable performance in perception and reasoning tasks, their\nalignment with human values remains largely unexplored, given the complexity of\ndefining hhh dimensions in the visual world and the difficulty in collecting\nrelevant data that accurately mirrors real-world situations. To address this\ngap, we introduce Ch3Ef, a Compreh3ensive Evaluation dataset and strategy for\nassessing alignment with human expectations. 
Ch3Ef dataset contains 1002\nhuman-annotated data samples, covering 12 domains and 46 tasks based on the hhh\nprinciple. We also present a unified evaluation strategy supporting assessment\nacross various scenarios and different perspectives. Based on the evaluation\nresults, we summarize over 10 key findings that deepen the understanding of\nMLLM capabilities, limitations, and the dynamic relationships between\nevaluation levels, guiding future advancements in the field.\n","authors":["Zhelun Shi","Zhipin Wang","Hongxing Fan","Zaibin Zhang","Lijun Li","Yongting Zhang","Zhenfei Yin","Lu Sheng","Yu Qiao","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2403.17830v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2311.02692"},{"id":"http://arxiv.org/abs/2403.17827v1","updated":"2024-03-26T16:06:42Z","published":"2024-03-26T16:06:42Z","title":"DiffH2O: Diffusion-Based Synthesis of Hand-Object Interactions from\n Textual Descriptions","summary":" Generating natural hand-object interactions in 3D is challenging as the\nresulting hand and object motions are expected to be physically plausible and\nsemantically meaningful. Furthermore, generalization to unseen objects is\nhindered by the limited scale of available hand-object interaction datasets. We\npropose DiffH2O, a novel method to synthesize realistic, one or two-handed\nobject interactions from provided text prompts and geometry of the object. The\nmethod introduces three techniques that enable effective learning from limited\ndata. First, we decompose the task into a grasping stage and a text-based\ninteraction stage and use separate diffusion models for each. In the grasping\nstage, the model only generates hand motions, whereas in the interaction phase\nboth hand and object poses are synthesized. Second, we propose a compact\nrepresentation that tightly couples hand and object poses. Third, we propose\ntwo different guidance schemes to allow more control of the generated motions:\ngrasp guidance and detailed textual guidance. Grasp guidance takes a single\ntarget grasping pose and guides the diffusion model to reach this grasp at the\nend of the grasping stage, which provides control over the grasping pose. Given\na grasping motion from this stage, multiple different actions can be prompted\nin the interaction phase. For textual guidance, we contribute comprehensive\ntext descriptions to the GRAB dataset and show that they enable our method to\nhave more fine-grained control over hand-object interactions. Our quantitative\nand qualitative evaluation demonstrates that the proposed method outperforms\nbaseline methods and leads to natural hand-object motions. Moreover, we\ndemonstrate the practicality of our framework by utilizing a hand pose estimate\nfrom an off-the-shelf pose estimator for guidance, and then sampling multiple\ndifferent actions in the interaction stage.\n","authors":["Sammy Christen","Shreyas Hampali","Fadime Sener","Edoardo Remelli","Tomas Hodan","Eric Sauser","Shugao Ma","Bugra Tekin"],"pdf_url":"https://arxiv.org/pdf/2403.17827v1.pdf","comment":"Project Page: https://diffh2o.github.io/"},{"id":"http://arxiv.org/abs/2403.17823v1","updated":"2024-03-26T16:04:19Z","published":"2024-03-26T16:04:19Z","title":"Efficient Image Pre-Training with Siamese Cropped Masked Autoencoders","summary":" Self-supervised pre-training of image encoders is omnipresent in the\nliterature, particularly following the introduction of Masked autoencoders\n(MAE). 
Current efforts attempt to learn object-centric representations from\nmotion in videos. In particular, SiamMAE recently introduced a Siamese network,\ntraining a shared-weight encoder from two frames of a video with a high\nasymmetric masking ratio (95%). In this work, we propose CropMAE, an\nalternative approach to the Siamese pre-training introduced by SiamMAE. Our\nmethod specifically differs by exclusively considering pairs of cropped images\nsourced from the same image but cropped differently, deviating from the\nconventional pairs of frames extracted from a video. CropMAE therefore\nalleviates the need for video datasets, while maintaining competitive\nperformances and drastically reducing pre-training time. Furthermore, we\ndemonstrate that CropMAE learns similar object-centric representations without\nexplicit motion, showing that current self-supervised learning methods do not\nlearn objects from motion, but rather thanks to the Siamese architecture.\nFinally, CropMAE achieves the highest masking ratio to date (98.5%), enabling\nthe reconstruction of images using only two visible patches. Our code is\navailable at https://github.com/alexandre-eymael/CropMAE.\n","authors":["Alexandre Eymaël","Renaud Vandeghen","Anthony Cioppa","Silvio Giancola","Bernard Ghanem","Marc Van Droogenbroeck"],"pdf_url":"https://arxiv.org/pdf/2403.17823v1.pdf","comment":"19 pages, 6 figures, 3 tables, 1 page of supplementary material"},{"id":"http://arxiv.org/abs/2403.17822v1","updated":"2024-03-26T16:00:31Z","published":"2024-03-26T16:00:31Z","title":"DN-Splatter: Depth and Normal Priors for Gaussian Splatting and Meshing","summary":" 3D Gaussian splatting, a novel differentiable rendering technique, has\nachieved state-of-the-art novel view synthesis results with high rendering\nspeeds and relatively low training times. However, its performance on scenes\ncommonly seen in indoor datasets is poor due to the lack of geometric\nconstraints during optimization. We extend 3D Gaussian splatting with depth and\nnormal cues to tackle challenging indoor datasets and showcase techniques for\nefficient mesh extraction, an important downstream application. Specifically,\nwe regularize the optimization procedure with depth information, enforce local\nsmoothness of nearby Gaussians, and use the geometry of the 3D Gaussians\nsupervised by normal cues to achieve better alignment with the true scene\ngeometry. We improve depth estimation and novel view synthesis results over\nbaselines and show how this simple yet effective regularization technique can\nbe used to directly extract meshes from the Gaussian representation yielding\nmore physically accurate reconstructions on indoor scenes. Our code will be\nreleased in https://github.com/maturk/dn-splatter.\n","authors":["Matias Turkulainen","Xuqian Ren","Iaroslav Melekhov","Otto Seiskari","Esa Rahtu","Juho Kannala"],"pdf_url":"https://arxiv.org/pdf/2403.17822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15964v2","updated":"2024-03-26T15:58:26Z","published":"2023-11-27T16:07:37Z","title":"Efficient Pre-training for Localized Instruction Generation of Videos","summary":" Procedural videos show step-by-step demonstrations of tasks like recipe\npreparation. Understanding such videos is challenging, involving the precise\nlocalization of steps and the generation of textual instructions. Manually\nannotating steps and writing instructions is costly, which limits the size of\ncurrent datasets and hinders effective learning. 
Leveraging large but noisy\nvideo-transcript datasets for pre-training can boost performance, but demands\nsignificant computational resources. Furthermore, transcripts contain\nirrelevant content and exhibit style variation compared to instructions written\nby human annotators. To mitigate both issues, we propose a technique,\nSieve-&-Swap, to automatically curate a smaller dataset: (i) Sieve filters\nirrelevant transcripts and (ii) Swap enhances the quality of the text\ninstruction by automatically replacing the transcripts with human-written\ninstructions from a text-only recipe dataset. The curated dataset, three orders\nof magnitude smaller than current web-scale datasets, enables efficient\ntraining of large-scale models with competitive performance. We complement our\nSieve-\\&-Swap approach with a Procedure Transformer (ProcX) for end-to-end step\nlocalization and instruction generation for procedural videos. When this model\nis pre-trained on our curated dataset, it achieves state-of-the-art performance\nin zero-shot and finetuning settings on YouCook2 and Tasty, while using a\nfraction of the computational resources.\n","authors":["Anil Batra","Davide Moltisanti","Laura Sevilla-Lara","Marcus Rohrbach","Frank Keller"],"pdf_url":"https://arxiv.org/pdf/2311.15964v2.pdf","comment":"This version has some missing experiments and elaborative technical\n details"},{"id":"http://arxiv.org/abs/2403.17808v1","updated":"2024-03-26T15:45:29Z","published":"2024-03-26T15:45:29Z","title":"Annotated Biomedical Video Generation using Denoising Diffusion\n Probabilistic Models and Flow Fields","summary":" The segmentation and tracking of living cells play a vital role within the\nbiomedical domain, particularly in cancer research, drug development, and\ndevelopmental biology. These are usually tedious and time-consuming tasks that\nare traditionally done by biomedical experts. Recently, to automatize these\nprocesses, deep learning based segmentation and tracking methods have been\nproposed. These methods require large-scale datasets and their full potential\nis constrained by the scarcity of annotated data in the biomedical imaging\ndomain. To address this limitation, we propose Biomedical Video Diffusion Model\n(BVDM), capable of generating realistic-looking synthetic microscopy videos.\nTrained only on a single real video, BVDM can generate videos of arbitrary\nlength with pixel-level annotations that can be used for training data-hungry\nmodels. It is composed of a denoising diffusion probabilistic model (DDPM)\ngenerating high-fidelity synthetic cell microscopy images and a flow prediction\nmodel (FPM) predicting the non-rigid transformation between consecutive video\nframes. During inference, initially, the DDPM imposes realistic cell textures\non synthetic cell masks which are generated based on real data statistics. The\nflow prediction model predicts the flow field between consecutive masks and\napplies that to the DDPM output from the previous time frame to create the next\none while keeping temporal consistency. BVDM outperforms state-of-the-art\nsynthetic live cell microscopy video generation models. 
Furthermore, we\ndemonstrate that a sufficiently large synthetic dataset enhances the\nperformance of cell segmentation and tracking models compared to using a\nlimited amount of available real data.\n","authors":["Rüveyda Yilmaz","Dennis Eschweiler","Johannes Stegmaier"],"pdf_url":"https://arxiv.org/pdf/2403.17808v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17804v1","updated":"2024-03-26T15:42:01Z","published":"2024-03-26T15:42:01Z","title":"Improving Text-to-Image Consistency via Automatic Prompt Optimization","summary":" Impressive advances in text-to-image (T2I) generative models have yielded a\nplethora of high performing models which are able to generate aesthetically\nappealing, photorealistic images. Despite the progress, these models still\nstruggle to produce images that are consistent with the input prompt,\noftentimes failing to capture object quantities, relations and attributes\nproperly. Existing solutions to improve prompt-image consistency suffer from\nthe following challenges: (1) they oftentimes require model fine-tuning, (2)\nthey only focus on nearby prompt samples, and (3) they are affected by\nunfavorable trade-offs among image quality, representation diversity, and\nprompt-image consistency. In this paper, we address these challenges and\nintroduce a T2I optimization-by-prompting framework, OPT2I, which leverages a\nlarge language model (LLM) to improve prompt-image consistency in T2I models.\nOur framework starts from a user prompt and iteratively generates revised\nprompts with the goal of maximizing a consistency score. Our extensive\nvalidation on two datasets, MSCOCO and PartiPrompts, shows that OPT2I can boost\nthe initial consistency score by up to 24.9% in terms of DSG score while\npreserving the FID and increasing the recall between generated and real data.\nOur work paves the way toward building more reliable and robust T2I systems by\nharnessing the power of LLMs.\n","authors":["Oscar Mañas","Pietro Astolfi","Melissa Hall","Candace Ross","Jack Urbanek","Adina Williams","Aishwarya Agrawal","Adriana Romero-Soriano","Michal Drozdzal"],"pdf_url":"https://arxiv.org/pdf/2403.17804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00454v3","updated":"2024-03-26T15:41:17Z","published":"2023-09-30T18:13:41Z","title":"SimLVSeg: Simplifying Left Ventricular Segmentation in 2D+Time\n Echocardiograms with Self- and Weakly-Supervised Learning","summary":" Echocardiography has become an indispensable clinical imaging modality for\ngeneral heart health assessment. From calculating biomarkers such as ejection\nfraction to the probability of a patient's heart failure, accurate segmentation\nof the heart structures allows doctors to assess the heart's condition and\ndevise treatments with greater precision and accuracy. However, achieving\naccurate and reliable left ventricle segmentation is time-consuming and\nchallenging due to different reasons. Hence, clinicians often rely on\nsegmenting the left ventricular (LV) in two specific echocardiogram frames to\nmake a diagnosis. This limited coverage in manual LV segmentation poses a\nchallenge for developing automatic LV segmentation with high temporal\nconsistency, as the resulting dataset is typically annotated sparsely. In\nresponse to this challenge, this work introduces SimLVSeg, a novel paradigm\nthat enables video-based networks for consistent LV segmentation from sparsely\nannotated echocardiogram videos. 
SimLVSeg consists of self-supervised\npre-training with temporal masking, followed by weakly supervised learning\ntailored for LV segmentation from sparse annotations. We demonstrate how\nSimLVSeg outperforms the state-of-the-art solutions by achieving a 93.32%\n(95%CI 93.21-93.43%) dice score on the largest 2D+time echocardiography dataset\n(EchoNet-Dynamic) while being more efficient. SimLVSeg is compatible with two\ntypes of video segmentation networks: 2D super image and 3D segmentation. To\nshow the effectiveness of our approach, we provide extensive ablation studies,\nincluding pre-training settings and various deep learning backbones. We further\nconduct an out-of-distribution test to showcase SimLVSeg's generalizability on\nunseen distribution (CAMUS dataset). The code is publicly available at\nhttps://github.com/fadamsyah/SimLVSeg.\n","authors":["Fadillah Maani","Asim Ukaye","Nada Saadi","Numan Saeed","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2310.00454v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08639v2","updated":"2024-03-26T15:40:20Z","published":"2024-03-13T15:51:23Z","title":"HIMap: HybrId Representation Learning for End-to-end Vectorized HD Map\n Construction","summary":" Vectorized High-Definition (HD) map construction requires predictions of the\ncategory and point coordinates of map elements (e.g. road boundary, lane\ndivider, pedestrian crossing, etc.). State-of-the-art methods are mainly based\non point-level representation learning for regressing accurate point\ncoordinates. However, this pipeline has limitations in obtaining element-level\ninformation and handling element-level failures, e.g. erroneous element shape\nor entanglement between elements. To tackle the above issues, we propose a\nsimple yet effective HybrId framework named HIMap to sufficiently learn and\ninteract both point-level and element-level information. Concretely, we\nintroduce a hybrid representation called HIQuery to represent all map elements,\nand propose a point-element interactor to interactively extract and encode the\nhybrid information of elements, e.g. point position and element shape, into the\nHIQuery. Additionally, we present a point-element consistency constraint to\nenhance the consistency between the point-level and element-level information.\nFinally, the output point-element integrated HIQuery can be directly converted\ninto map elements' class, point coordinates, and mask. We conduct extensive\nexperiments and consistently outperform previous methods on both nuScenes and\nArgoverse2 datasets. Notably, our method achieves $77.8$ mAP on the nuScenes\ndataset, remarkably superior to previous SOTAs by $8.3$ mAP at least.\n","authors":["Yi Zhou","Hui Zhang","Jiaqian Yu","Yifan Yang","Sangil Jung","Seung-In Park","ByungIn Yoo"],"pdf_url":"https://arxiv.org/pdf/2403.08639v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17801v1","updated":"2024-03-26T15:40:05Z","published":"2024-03-26T15:40:05Z","title":"Towards 3D Vision with Low-Cost Single-Photon Cameras","summary":" We present a method for reconstructing 3D shape of arbitrary Lambertian\nobjects based on measurements by miniature, energy-efficient, low-cost\nsingle-photon cameras. These cameras, operating as time resolved image sensors,\nilluminate the scene with a very fast pulse of diffuse light and record the\nshape of that pulse as it returns back from the scene at a high temporal\nresolution. 
We propose to model this image formation process, account for its\nnon-idealities, and adapt neural rendering to reconstruct 3D geometry from a\nset of spatially distributed sensors with known poses. We show that our\napproach can successfully recover complex 3D shapes from simulated data. We\nfurther demonstrate 3D object reconstruction from real-world captures,\nutilizing measurements from a commodity proximity sensor. Our work draws a\nconnection between image-based modeling and active range scanning and is a step\ntowards 3D vision with single-photon cameras.\n","authors":["Fangzhou Mu","Carter Sifferman","Sacha Jungerman","Yiquan Li","Mark Han","Michael Gleicher","Mohit Gupta","Yin Li"],"pdf_url":"https://arxiv.org/pdf/2403.17801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17787v1","updated":"2024-03-26T15:20:49Z","published":"2024-03-26T15:20:49Z","title":"Evaluating the Efficacy of Prompt-Engineered Large Multimodal Models\n Versus Fine-Tuned Vision Transformers in Image-Based Security Applications","summary":" The success of Large Language Models (LLMs) has led to a parallel rise in the\ndevelopment of Large Multimodal Models (LMMs), such as Gemini-pro, which have\nbegun to transform a variety of applications. These sophisticated multimodal\nmodels are designed to interpret and analyze complex data, integrating both\ntextual and visual information on a scale previously unattainable, opening new\navenues for a range of applications. This paper investigates the applicability\nand effectiveness of prompt-engineered Gemini-pro LMMs versus fine-tuned Vision\nTransformer (ViT) models in addressing critical security challenges. We focus\non two distinct tasks: a visually evident task of detecting simple triggers,\nsuch as small squares in images, indicative of potential backdoors, and a\nnon-visually evident task of malware classification through visual\nrepresentations. Our results highlight a significant divergence in performance,\nwith Gemini-pro falling short in accuracy and reliability when compared to\nfine-tuned ViT models. The ViT models, on the other hand, demonstrate\nexceptional accuracy, achieving near-perfect performance on both tasks. This\nstudy not only showcases the strengths and limitations of prompt-engineered\nLMMs in cybersecurity applications but also emphasizes the unmatched efficacy\nof fine-tuned ViT models for precise and dependable tasks.\n","authors":["Fouad Trad","Ali Chehab"],"pdf_url":"https://arxiv.org/pdf/2403.17787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17782v1","updated":"2024-03-26T15:15:15Z","published":"2024-03-26T15:15:15Z","title":"GenesisTex: Adapting Image Denoising Diffusion to Texture Space","summary":" We present GenesisTex, a novel method for synthesizing textures for 3D\ngeometries from text descriptions. GenesisTex adapts the pretrained image\ndiffusion model to texture space by texture space sampling. Specifically, we\nmaintain a latent texture map for each viewpoint, which is updated with\npredicted noise on the rendering of the corresponding viewpoint. The sampled\nlatent texture maps are then decoded into a final texture map. During the\nsampling process, we focus on both global and local consistency across multiple\nviewpoints: global consistency is achieved through the integration of style\nconsistency mechanisms within the noise prediction network, and low-level\nconsistency is achieved by dynamically aligning latent textures. 
Finally, we\napply reference-based inpainting and img2img on denser views for texture\nrefinement. Our approach overcomes the limitations of slow optimization in\ndistillation-based methods and instability in inpainting-based methods.\nExperiments on meshes from various sources demonstrate that our method\nsurpasses the baseline methods quantitatively and qualitatively.\n","authors":["Chenjian Gao","Boyan Jiang","Xinghui Li","Yingpeng Zhang","Qian Yu"],"pdf_url":"https://arxiv.org/pdf/2403.17782v1.pdf","comment":"12 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.16167v2","updated":"2024-03-26T15:14:25Z","published":"2024-03-24T14:21:06Z","title":"Exploiting Semantic Reconstruction to Mitigate Hallucinations in\n Vision-Language Models","summary":" Hallucinations in vision-language models pose a significant challenge to\ntheir reliability, particularly in the generation of long captions. Current\nmethods fall short of accurately identifying and mitigating these\nhallucinations. To address this issue, we introduce ESREAL, a novel\nunsupervised learning framework designed to suppress the generation of\nhallucinations through accurate localization and penalization of hallucinated\ntokens. Initially, ESREAL creates a reconstructed image based on the generated\ncaption and aligns its corresponding regions with those of the original image.\nThis semantic reconstruction aids in identifying both the presence and type of\ntoken-level hallucinations within the generated caption. Subsequently, ESREAL\ncomputes token-level hallucination scores by assessing the semantic similarity\nof aligned regions based on the type of hallucination. Finally, ESREAL employs\na proximal policy optimization algorithm, where it selectively penalizes\nhallucinated tokens according to their token-level hallucination scores. Our\nframework notably reduces hallucinations in LLaVA, InstructBLIP, and mPLUG-Owl2\nby 32.81%, 27.08%, and 7.46% on the CHAIR metric. This improvement is achieved\nsolely through signals derived from the image itself, without the need for any\nimage-text pairs.\n","authors":["Minchan Kim","Minyeong Kim","Junik Bae","Suhwan Choi","Sungkyung Kim","Buru Chang"],"pdf_url":"https://arxiv.org/pdf/2403.16167v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12225v2","updated":"2024-03-26T15:06:00Z","published":"2024-02-19T15:33:09Z","title":"Pushing Auto-regressive Models for 3D Shape Generation at Capacity and\n Scalability","summary":" Auto-regressive models have achieved impressive results in 2D image\ngeneration by modeling joint distributions in grid space. In this paper, we\nextend auto-regressive models to 3D domains, and seek a stronger ability of 3D\nshape generation by improving auto-regressive models at capacity and\nscalability simultaneously. Firstly, we leverage an ensemble of publicly\navailable 3D datasets to facilitate the training of large-scale models. It\nconsists of a comprehensive collection of approximately 900,000 objects, with\nmultiple properties of meshes, points, voxels, rendered images, and text\ncaptions. This diverse labeled dataset, termed Objaverse-Mix, empowers our\nmodel to learn from a wide range of object variations. However, directly\napplying 3D auto-regression encounters critical challenges of high\ncomputational demands on volumetric grids and ambiguous auto-regressive order\nalong grid dimensions, resulting in inferior quality of 3D shapes. To this end,\nwe then present a novel framework Argus3D in terms of capacity. 
Concretely, our\napproach introduces discrete representation learning based on a latent vector\ninstead of volumetric grids, which not only reduces computational costs but\nalso preserves essential geometric details by learning the joint distributions\nin a more tractable order. The capacity of conditional generation can thus be\nrealized by simply concatenating various conditioning inputs to the latent\nvector, such as point clouds, categories, images, and texts. In addition,\nthanks to the simplicity of our model architecture, we naturally scale up our\napproach to a larger model with an impressive 3.6 billion parameters, further\nenhancing the quality of versatile 3D generation. Extensive experiments on four\ngeneration tasks demonstrate that Argus3D can synthesize diverse and faithful\nshapes across multiple categories, achieving remarkable performance.\n","authors":["Xuelin Qian","Yu Wang","Simian Luo","Yinda Zhang","Ying Tai","Zhenyu Zhang","Chengjie Wang","Xiangyang Xue","Bo Zhao","Tiejun Huang","Yunsheng Wu","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2402.12225v2.pdf","comment":"Project page: https://argus-3d.github.io/ . Datasets:\n https://huggingface.co/datasets/BAAI/Objaverse-MIX. arXiv admin note:\n substantial text overlap with arXiv:2303.14700"},{"id":"http://arxiv.org/abs/2403.17770v1","updated":"2024-03-26T14:59:11Z","published":"2024-03-26T14:59:11Z","title":"CT Synthesis with Conditional Diffusion Models for Abdominal Lymph Node\n Segmentation","summary":" Despite the significant success achieved by deep learning methods in medical\nimage segmentation, researchers still struggle in the computer-aided diagnosis\nof abdominal lymph nodes due to the complex abdominal environment, small and\nindistinguishable lesions, and limited annotated data. To address these\nproblems, we present a pipeline that integrates the conditional diffusion model\nfor lymph node generation and the nnU-Net model for lymph node segmentation to\nimprove the segmentation performance of abdominal lymph nodes through\nsynthesizing a diversity of realistic abdominal lymph node data. We propose\nLN-DDPM, a conditional denoising diffusion probabilistic model (DDPM) for lymph\nnode (LN) generation. LN-DDPM utilizes lymph node masks and anatomical\nstructure masks as model conditions. These conditions work in two conditioning\nmechanisms: global structure conditioning and local detail conditioning, to\ndistinguish between lymph nodes and their surroundings and better capture lymph\nnode characteristics. The obtained paired abdominal lymph node images and masks\nare used for the downstream segmentation task. 
Experimental results on the\nabdominal lymph node datasets demonstrate that LN-DDPM outperforms other\ngenerative methods in the abdominal lymph node image synthesis and better\nassists the downstream abdominal lymph node segmentation task.\n","authors":["Yongrui Yu","Hanyu Chen","Zitian Zhang","Qiong Xiao","Wenhui Lei","Linrui Dai","Yu Fu","Hui Tan","Guan Wang","Peng Gao","Xiaofan Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.17770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17057v2","updated":"2024-03-26T14:54:04Z","published":"2023-11-28T18:59:52Z","title":"ReMoS: 3D Motion-Conditioned Reaction Synthesis for Two-Person\n Interactions","summary":" Current approaches for 3D human motion synthesis generate high-quality\nanimations of digital humans performing a wide variety of actions and gestures.\nHowever, a notable technological gap exists in addressing the complex dynamics\nof multi-human interactions within this paradigm. In this work, we present\nReMoS, a denoising diffusion-based model that synthesizes full-body reactive\nmotion of a person in a two-person interaction scenario. Assuming the motion of\none person is given, we employ a combined spatio-temporal cross-attention\nmechanism to synthesize the reactive body and hand motion of the second person,\nthereby completing the interactions between the two. We demonstrate ReMoS\nacross challenging two-person scenarios such as pair-dancing, Ninjutsu,\nkickboxing, and acrobatics, where one person's movements have complex and\ndiverse influences on the other. We also contribute the ReMoCap dataset for\ntwo-person interactions containing full-body and finger motions. We evaluate\nReMoS through multiple quantitative metrics, qualitative visualizations, and a\nuser study, and also indicate usability in interactive motion editing\napplications.\n","authors":["Anindita Ghosh","Rishabh Dabral","Vladislav Golyanik","Christian Theobalt","Philipp Slusallek"],"pdf_url":"https://arxiv.org/pdf/2311.17057v2.pdf","comment":"17 pages, 7 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.17765v1","updated":"2024-03-26T14:53:24Z","published":"2024-03-26T14:53:24Z","title":"MUTE-SLAM: Real-Time Neural SLAM with Multiple Tri-Plane Hash\n Representations","summary":" We introduce MUTE-SLAM, a real-time neural RGB-D SLAM system employing\nmultiple tri-plane hash-encodings for efficient scene representation. MUTE-SLAM\neffectively tracks camera positions and incrementally builds a scalable\nmulti-map representation for both small and large indoor environments. It\ndynamically allocates sub-maps for newly observed local regions, enabling\nconstraint-free mapping without prior scene information. Unlike traditional\ngrid-based methods, we use three orthogonal axis-aligned planes for\nhash-encoding scene properties, significantly reducing hash collisions and the\nnumber of trainable parameters. This hybrid approach not only speeds up\nconvergence but also enhances the fidelity of surface reconstruction.\nFurthermore, our optimization strategy concurrently optimizes all sub-maps\nintersecting with the current camera frustum, ensuring global consistency.\nExtensive testing on both real-world and synthetic datasets has shown that\nMUTE-SLAM delivers state-of-the-art surface reconstruction quality and\ncompetitive tracking performance across diverse indoor settings. 
The code will\nbe made public upon acceptance of the paper.\n","authors":["Yifan Yan","Ruomin He","Zhenghua Liu"],"pdf_url":"https://arxiv.org/pdf/2403.17765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15585v2","updated":"2024-03-26T14:51:57Z","published":"2024-03-22T19:19:51Z","title":"MedPromptX: Grounded Multimodal Prompting for Chest X-ray Diagnosis","summary":" Chest X-ray images are commonly used for predicting acute and chronic\ncardiopulmonary conditions, but efforts to integrate them with structured\nclinical data face challenges due to incomplete electronic health records\n(EHR). This paper introduces \\textbf{MedPromptX}, the first model to integrate\nmultimodal large language models (MLLMs), few-shot prompting (FP) and visual\ngrounding (VG) to combine imagery with EHR data for chest X-ray diagnosis. A\npre-trained MLLM is utilized to complement the missing EHR information,\nproviding a comprehensive understanding of patients' medical history.\nAdditionally, FP reduces the necessity for extensive training of MLLMs while\neffectively tackling the issue of hallucination. Nevertheless, the process of\ndetermining the optimal number of few-shot examples and selecting high-quality\ncandidates can be burdensome, yet it profoundly influences model performance.\nHence, we propose a new technique that dynamically refines few-shot data for\nreal-time adjustment to new patient scenarios. Moreover, VG aids in focusing\nthe model's attention on relevant regions of interest in X-ray images,\nenhancing the identification of abnormalities. We release MedPromptX-VQA, a new\nin-context visual question answering dataset encompassing interleaved image and\nEHR data derived from MIMIC-IV and MIMIC-CXR databases. Results demonstrate the\nSOTA performance of MedPromptX, achieving an 11% improvement in F1-score\ncompared to the baselines. Code and data are available at\nhttps://github.com/BioMedIA-MBZUAI/MedPromptX\n","authors":["Mai A. Shaaban","Adnan Khan","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.15585v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17761v1","updated":"2024-03-26T14:51:53Z","published":"2024-03-26T14:51:53Z","title":"Makeup Prior Models for 3D Facial Makeup Estimation and Applications","summary":" In this work, we introduce two types of makeup prior models to extend\nexisting 3D face prior models: PCA-based and StyleGAN2-based priors. The\nPCA-based prior model is a linear model that is easy to construct and is\ncomputationally efficient. However, it retains only low-frequency information.\nConversely, the StyleGAN2-based model can represent high-frequency information\nwith relatively higher computational cost than the PCA-based model. Although\nthere is a trade-off between the two models, both are applicable to 3D facial\nmakeup estimation and related applications. By leveraging makeup prior models\nand designing a makeup consistency module, we effectively address the\nchallenges that previous methods faced in robustly estimating makeup,\nparticularly in the context of handling self-occluded faces. In experiments, we\ndemonstrate that our approach reduces computational costs by several orders of\nmagnitude, achieving speeds up to 180 times faster. 
In addition, by improving\nthe accuracy of the estimated makeup, we confirm that our methods are highly\nadvantageous for various 3D facial makeup applications such as 3D makeup face\nreconstruction, user-friendly makeup editing, makeup transfer, and\ninterpolation.\n","authors":["Xingchao Yang","Takafumi Taketomi","Yuki Endo","Yoshihiro Kanamori"],"pdf_url":"https://arxiv.org/pdf/2403.17761v1.pdf","comment":"CVPR2024. Project: https://yangxingchao.github.io/makeup-priors-page"},{"id":"http://arxiv.org/abs/2403.17757v1","updated":"2024-03-26T14:49:22Z","published":"2024-03-26T14:49:22Z","title":"Noise2Noise Denoising of CRISM Hyperspectral Data","summary":" Hyperspectral data acquired by the Compact Reconnaissance Imaging\nSpectrometer for Mars (CRISM) have allowed for unparalleled mapping of the\nsurface mineralogy of Mars. Due to sensor degradation over time, a significant\nportion of the recently acquired data is considered unusable. Here a new\ndata-driven model architecture, Noise2Noise4Mars (N2N4M), is introduced to\nremove noise from CRISM images. Our model is self-supervised and does not\nrequire zero-noise target data, making it well suited for use in Planetary\nScience applications where high quality labelled data is scarce. We demonstrate\nits strong performance on synthetic-noise data and CRISM images, and its impact\non downstream classification performance, outperforming benchmark methods on\nmost metrics. This allows for detailed analysis for critical sites of interest\non the Martian surface, including proposed lander sites.\n","authors":["Robert Platt","Rossella Arcucci","Cédric John"],"pdf_url":"https://arxiv.org/pdf/2403.17757v1.pdf","comment":"5 pages, 3 figures. Accepted as a conference paper at the ICLR 2024\n ML4RS Workshop"},{"id":"http://arxiv.org/abs/2403.17755v1","updated":"2024-03-26T14:44:51Z","published":"2024-03-26T14:44:51Z","title":"DataCook: Crafting Anti-Adversarial Examples for Healthcare Data\n Copyright Protection","summary":" In the realm of healthcare, the challenges of copyright protection and\nunauthorized third-party misuse are increasingly significant. Traditional\nmethods for data copyright protection are applied prior to data distribution,\nimplying that models trained on these data become uncontrollable. This paper\nintroduces a novel approach, named DataCook, designed to safeguard the\ncopyright of healthcare data during the deployment phase. DataCook operates by\n\"cooking\" the raw data before distribution, enabling the development of models\nthat perform normally on this processed data. However, during the deployment\nphase, the original test data must be also \"cooked\" through DataCook to ensure\nnormal model performance. This process grants copyright holders control over\nauthorization during the deployment phase. The mechanism behind DataCook is by\ncrafting anti-adversarial examples (AntiAdv), which are designed to enhance\nmodel confidence, as opposed to standard adversarial examples (Adv) that aim to\nconfuse models. Similar to Adv, AntiAdv introduces imperceptible perturbations,\nensuring that the data processed by DataCook remains easily understandable. We\nconducted extensive experiments on MedMNIST datasets, encompassing both 2D/3D\ndata and the high-resolution variants. The outcomes indicate that DataCook\neffectively meets its objectives, preventing models trained on AntiAdv from\nanalyzing unauthorized data effectively, without compromising the validity and\naccuracy of the data in legitimate scenarios. 
Code and data are available at\nhttps://github.com/MedMNIST/DataCook.\n","authors":["Sihan Shang","Jiancheng Yang","Zhenglong Sun","Pascal Fua"],"pdf_url":"https://arxiv.org/pdf/2403.17755v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06247v2","updated":"2024-03-26T14:42:21Z","published":"2024-03-10T16:11:17Z","title":"Text-Guided Variational Image Generation for Industrial Anomaly\n Detection and Segmentation","summary":" We propose a text-guided variational image generation method to address the\nchallenge of getting clean data for anomaly detection in industrial\nmanufacturing. Our method utilizes text information about the target object,\nlearned from extensive text library documents, to generate non-defective data\nimages resembling the input image. The proposed framework ensures that the\ngenerated non-defective images align with anticipated distributions derived\nfrom textual and image-based knowledge, ensuring stability and generality.\nExperimental results demonstrate the effectiveness of our approach, surpassing\nprevious methods even with limited non-defective data. Our approach is\nvalidated through generalization tests across four baseline models and three\ndistinct datasets. We present an additional analysis to enhance the\neffectiveness of anomaly detection models by utilizing the generated images.\n","authors":["Mingyu Lee","Jongwon Choi"],"pdf_url":"https://arxiv.org/pdf/2403.06247v2.pdf","comment":"18 pages, Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17749v1","updated":"2024-03-26T14:40:17Z","published":"2024-03-26T14:40:17Z","title":"Multi-Task Dense Prediction via Mixture of Low-Rank Experts","summary":" Previous multi-task dense prediction methods based on the Mixture of Experts\n(MoE) have received great performance but they neglect the importance of\nexplicitly modeling the global relations among all tasks. In this paper, we\npresent a novel decoder-focused method for multi-task dense prediction, called\nMixture-of-Low-Rank-Experts (MLoRE). To model the global task relationships,\nMLoRE adds a generic convolution path to the original MoE structure, where each\ntask feature can go through this path for explicit parameter sharing.\nFurthermore, to control the parameters and computational cost brought by the\nincrease in the number of experts, we take inspiration from LoRA and propose to\nleverage the low-rank format of a vanilla convolution in the expert network.\nSince the low-rank experts have fewer parameters and can be dynamically\nparameterized into the generic convolution, the parameters and computational\ncost do not change much with the increase of experts. Benefiting from this\ndesign, we increase the number of experts and its reception field to enlarge\nthe representation capacity, facilitating multiple dense tasks learning in a\nunified network. Extensive experiments on the PASCAL-Context and NYUD-v2\nbenchmarks show that our MLoRE achieves superior performance compared to\nprevious state-of-the-art methods on all metrics. 
Our code is available at\nhttps://github.com/YuqiYang213/MLoRE.\n","authors":["Yuqi Yang","Peng-Tao Jiang","Qibin Hou","Hao Zhang","Jinwei Chen","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2403.17749v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.08270v2","updated":"2024-03-26T14:39:43Z","published":"2024-03-13T05:46:36Z","title":"Identity-aware Dual-constraint Network for Cloth-Changing Person\n Re-identification","summary":" Cloth-Changing Person Re-Identification (CC-ReID) aims to accurately identify\nthe target person in more realistic surveillance scenarios, where pedestrians\nusually change their clothing. Despite great progress, limited cloth-changing\ntraining samples in existing CC-ReID datasets still prevent the model from\nadequately learning cloth-irrelevant features. In addition, due to the absence\nof explicit supervision to keep the model constantly focused on\ncloth-irrelevant areas, existing methods are still hampered by the disruption\nof clothing variations. To solve the above issues, we propose an Identity-aware\nDual-constraint Network (IDNet) for the CC-ReID task. Specifically, to help the\nmodel extract cloth-irrelevant clues, we propose a Clothes Diversity\nAugmentation (CDA), which generates more realistic cloth-changing samples by\nenriching the clothing color while preserving the texture. In addition, a\nMulti-scale Constraint Block (MCB) is designed, which extracts fine-grained\nidentity-related features and effectively transfers cloth-irrelevant knowledge.\nMoreover, a Counterfactual-guided Attention Module (CAM) is presented, which\nlearns cloth-irrelevant features from channel and space dimensions and utilizes\nthe counterfactual intervention for supervising the attention map to highlight\nidentity-related regions. Finally, a Semantic Alignment Constraint (SAC) is\ndesigned to facilitate high-level semantic feature interaction. Comprehensive\nexperiments on four CC-ReID datasets indicate that our method outperforms prior\nstate-of-the-art approaches.\n","authors":["Peini Guo","Mengyuan Liu","Hong Liu","Ruijia Fan","Guoquan Wang","Bin He"],"pdf_url":"https://arxiv.org/pdf/2403.08270v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02129v4","updated":"2024-03-26T14:38:23Z","published":"2023-10-03T15:10:46Z","title":"Unveiling the Pitfalls of Knowledge Editing for Large Language Models","summary":" As the cost associated with fine-tuning Large Language Models (LLMs)\ncontinues to rise, recent research efforts have pivoted towards developing\nmethodologies to edit implicit knowledge embedded within LLMs. Yet, there's\nstill a dark cloud lingering overhead -- will knowledge editing trigger\nbutterfly effect? since it is still unclear whether knowledge editing might\nintroduce side effects that pose potential risks or not. This paper pioneers\nthe investigation into the potential pitfalls associated with knowledge editing\nfor LLMs. To achieve this, we introduce new benchmark datasets and propose\ninnovative evaluation metrics. Our results underline two pivotal concerns: (1)\nKnowledge Conflict: Editing groups of facts that logically clash can magnify\nthe inherent inconsistencies in LLMs-a facet neglected by previous methods. 
(2)\nKnowledge Distortion: Altering parameters with the aim of editing factual\nknowledge can irrevocably warp the innate knowledge structure of LLMs.\nExperimental results vividly demonstrate that knowledge editing might\ninadvertently cast a shadow of unintended consequences on LLMs, which warrant\nattention and efforts for future works. Code and data are available at\nhttps://github.com/zjunlp/PitfallsKnowledgeEditing.\n","authors":["Zhoubo Li","Ningyu Zhang","Yunzhi Yao","Mengru Wang","Xi Chen","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02129v4.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.17734v1","updated":"2024-03-26T14:21:49Z","published":"2024-03-26T14:21:49Z","title":"Paired Diffusion: Generation of related, synthetic PET-CT-Segmentation\n scans using Linked Denoising Diffusion Probabilistic Models","summary":" The rapid advancement of Artificial Intelligence (AI) in biomedical imaging\nand radiotherapy is hindered by the limited availability of large imaging data\nrepositories. With recent research and improvements in denoising diffusion\nprobabilistic models (DDPM), high quality synthetic medical scans are now\npossible. Despite this, there is currently no way of generating multiple\nrelated images, such as a corresponding ground truth which can be used to train\nmodels, so synthetic scans are often manually annotated before use. This\nresearch introduces a novel architecture that is able to generate multiple,\nrelated PET-CT-tumour mask pairs using paired networks and conditional\nencoders. Our approach includes innovative, time step-controlled mechanisms and\na `noise-seeding' strategy to improve DDPM sampling consistency. While our\nmodel requires a modified perceptual loss function to ensure accurate feature\nalignment we show generation of clearly aligned synthetic images and\nimprovement in segmentation accuracy with generated images.\n","authors":["Rowan Bradbury","Katherine A. Vallis","Bartlomiej W. Papiez"],"pdf_url":"https://arxiv.org/pdf/2403.17734v1.pdf","comment":"to be published in IEEE International Symposium on Biomedical Imaging\n 2024"},{"id":"http://arxiv.org/abs/2403.17727v1","updated":"2024-03-26T14:16:56Z","published":"2024-03-26T14:16:56Z","title":"FastPerson: Enhancing Video Learning through Effective Video\n Summarization that Preserves Linguistic and Visual Contexts","summary":" Quickly understanding lengthy lecture videos is essential for learners with\nlimited time and interest in various topics to improve their learning\nefficiency. To this end, video summarization has been actively researched to\nenable users to view only important scenes from a video. However, these studies\nfocus on either the visual or audio information of a video and extract\nimportant segments in the video. Therefore, there is a risk of missing\nimportant information when both the teacher's speech and visual information on\nthe blackboard or slides are important, such as in a lecture video. To tackle\nthis issue, we propose FastPerson, a video summarization approach that\nconsiders both the visual and auditory information in lecture videos.\nFastPerson creates summary videos by utilizing audio transcriptions along with\non-screen images and text, minimizing the risk of overlooking crucial\ninformation for learners. Further, it provides a feature that allows learners\nto switch between the summary and original videos for each chapter of the\nvideo, enabling them to adjust the pace of learning based on their interests\nand level of understanding. 
We conducted an evaluation with 40 participants to\nassess the effectiveness of our method and confirmed that it reduced viewing\ntime by 53\\% at the same level of comprehension as that when using traditional\nvideo playback methods.\n","authors":["Kazuki Kawamura","Jun Rekimoto"],"pdf_url":"https://arxiv.org/pdf/2403.17727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17464v2","updated":"2024-03-26T14:15:25Z","published":"2024-02-27T12:42:06Z","title":"Generative 3D Part Assembly via Part-Whole-Hierarchy Message Passing","summary":" Generative 3D part assembly involves understanding part relationships and\npredicting their 6-DoF poses for assembling a realistic 3D shape. Prior work\noften focus on the geometry of individual parts, neglecting part-whole\nhierarchies of objects. Leveraging two key observations: 1) super-part poses\nprovide strong hints about part poses, and 2) predicting super-part poses is\neasier due to fewer superparts, we propose a part-whole-hierarchy message\npassing network for efficient 3D part assembly. We first introduce super-parts\nby grouping geometrically similar parts without any semantic labels. Then we\nemploy a part-whole hierarchical encoder, wherein a super-part encoder predicts\nlatent super-part poses based on input parts. Subsequently, we transform the\npoint cloud using the latent poses, feeding it to the part encoder for\naggregating super-part information and reasoning about part relationships to\npredict all part poses. In training, only ground-truth part poses are required.\nDuring inference, the predicted latent poses of super-parts enhance\ninterpretability. Experimental results on the PartNet dataset show that our\nmethod achieves state-of-the-art performance in part and connectivity accuracy\nand enables an interpretable hierarchical part assembly.\n","authors":["Bi'an Du","Xiang Gao","Wei Hu","Renjie Liao"],"pdf_url":"https://arxiv.org/pdf/2402.17464v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17725v1","updated":"2024-03-26T14:13:44Z","published":"2024-03-26T14:13:44Z","title":"Deep Learning for Segmentation of Cracks in High-Resolution Images of\n Steel Bridges","summary":" Automating the current bridge visual inspection practices using drones and\nimage processing techniques is a prominent way to make these inspections more\neffective, robust, and less expensive. In this paper, we investigate the\ndevelopment of a novel deep-learning method for the detection of fatigue cracks\nin high-resolution images of steel bridges. First, we present a novel and\nchallenging dataset comprising of images of cracks in steel bridges. Secondly,\nwe integrate the ConvNext neural network with a previous state- of-the-art\nencoder-decoder network for crack segmentation. We study and report, the\neffects of the use of background patches on the network performance when\napplied to high-resolution images of cracks in steel bridges. 
Finally, we\nintroduce a loss function that allows the use of more background patches for\nthe training process, which yields a significant reduction in false positive\nrates.\n","authors":["Andrii Kompanets","Gautam Pai","Remco Duits","Davide Leonetti","Bert Snijder"],"pdf_url":"https://arxiv.org/pdf/2403.17725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17712v1","updated":"2024-03-26T13:58:47Z","published":"2024-03-26T13:58:47Z","title":"Invisible Gas Detection: An RGB-Thermal Cross Attention Network and A\n New Benchmark","summary":" The widespread use of various chemical gases in industrial processes\nnecessitates effective measures to prevent their leakage during transportation\nand storage, given their high toxicity. Thermal infrared-based computer vision\ndetection techniques provide a straightforward approach to identify gas leakage\nareas. However, the development of high-quality algorithms has been challenging\ndue to the low texture in thermal images and the lack of open-source datasets.\nIn this paper, we present the RGB-Thermal Cross Attention Network (RT-CAN),\nwhich employs an RGB-assisted two-stream network architecture to integrate\ntexture information from RGB images and gas area information from thermal\nimages. Additionally, to facilitate the research of invisible gas detection, we\nintroduce Gas-DB, an extensive open-source gas detection database including\nabout 1.3K well-annotated RGB-thermal images with eight variant collection\nscenes. Experimental results demonstrate that our method successfully leverages\nthe advantages of both modalities, achieving state-of-the-art (SOTA)\nperformance among RGB-thermal methods, surpassing single-stream SOTA models in\nterms of accuracy, Intersection of Union (IoU), and F2 metrics by 4.86%, 5.65%,\nand 4.88%, respectively. The code and data will be made available soon.\n","authors":["Jue Wang","Yuxiang Lin","Qi Zhao","Dong Luo","Shuaibao Chen","Wei Chen","Xiaojiang Peng"],"pdf_url":"https://arxiv.org/pdf/2403.17712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15094v2","updated":"2024-03-26T13:57:26Z","published":"2023-05-24T12:22:23Z","title":"InNeRF360: Text-Guided 3D-Consistent Object Inpainting on 360-degree\n Neural Radiance Fields","summary":" We propose InNeRF360, an automatic system that accurately removes\ntext-specified objects from 360-degree Neural Radiance Fields (NeRF). The\nchallenge is to effectively remove objects while inpainting perceptually\nconsistent content for the missing regions, which is particularly demanding for\nexisting NeRF models due to their implicit volumetric representation. Moreover,\nunbounded scenes are more prone to floater artifacts in the inpainted region\nthan frontal-facing scenes, as the change of object appearance and background\nacross views is more sensitive to inaccurate segmentations and inconsistent\ninpainting. With a trained NeRF and a text description, our method efficiently\nremoves specified objects and inpaints visually consistent content without\nartifacts. We apply depth-space warping to enforce consistency across multiview\ntext-encoded segmentations, and then refine the inpainted NeRF model using\nperceptual priors and 3D diffusion-based geometric priors to ensure visual\nplausibility. Through extensive experiments in segmentation and inpainting on\n360-degree and frontal-facing NeRFs, we show that our approach is effective and\nenhances NeRF's editability. 
Project page: https://ivrl.github.io/InNeRF360.\n","authors":["Dongqing Wang","Tong Zhang","Alaa Abboud","Sabine Süsstrunk"],"pdf_url":"https://arxiv.org/pdf/2305.15094v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17709v1","updated":"2024-03-26T13:56:34Z","published":"2024-03-26T13:56:34Z","title":"Groupwise Query Specialization and Quality-Aware Multi-Assignment for\n Transformer-based Visual Relationship Detection","summary":" Visual Relationship Detection (VRD) has seen significant advancements with\nTransformer-based architectures recently. However, we identify two key\nlimitations in a conventional label assignment for training Transformer-based\nVRD models, which is a process of mapping a ground-truth (GT) to a prediction.\nUnder the conventional assignment, an unspecialized query is trained since a\nquery is expected to detect every relation, which makes it difficult for a\nquery to specialize in specific relations. Furthermore, a query is also\ninsufficiently trained since a GT is assigned only to a single prediction,\ntherefore near-correct or even correct predictions are suppressed by being\nassigned no relation as a GT. To address these issues, we propose Groupwise\nQuery Specialization and Quality-Aware Multi-Assignment (SpeaQ). Groupwise\nQuery Specialization trains a specialized query by dividing queries and\nrelations into disjoint groups and directing a query in a specific query group\nsolely toward relations in the corresponding relation group. Quality-Aware\nMulti-Assignment further facilitates the training by assigning a GT to multiple\npredictions that are significantly close to a GT in terms of a subject, an\nobject, and the relation in between. Experimental results and analyses show\nthat SpeaQ effectively trains specialized queries, which better utilize the\ncapacity of a model, resulting in consistent performance gains with zero\nadditional inference cost across multiple VRD models and benchmarks. Code is\navailable at https://github.com/mlvlab/SpeaQ.\n","authors":["Jongha Kim","Jihwan Park","Jinyoung Park","Jinyoung Kim","Sehyung Kim","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2403.17709v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.16014v2","updated":"2024-03-26T13:55:40Z","published":"2023-12-26T11:49:23Z","title":"Passive Non-Line-of-Sight Imaging with Light Transport Modulation","summary":" Passive non-line-of-sight (NLOS) imaging has witnessed rapid development in\nrecent years, due to its ability to image objects that are out of sight. The\nlight transport condition plays an important role in this task since changing\nthe conditions will lead to different imaging models. Existing learning-based\nNLOS methods usually train independent models for different light transport\nconditions, which is computationally inefficient and impairs the practicality\nof the models. In this work, we propose NLOS-LTM, a novel passive NLOS imaging\nmethod that effectively handles multiple light transport conditions with a\nsingle network. We achieve this by inferring a latent light transport\nrepresentation from the projection image and using this representation to\nmodulate the network that reconstructs the hidden image from the projection\nimage. We train a light transport encoder together with a vector quantizer to\nobtain the light transport representation. To further regulate this\nrepresentation, we jointly learn both the reconstruction network and the\nreprojection network during training. 
A set of light transport modulation\nblocks is used to modulate the two jointly trained networks in a multi-scale\nway. Extensive experiments on a large-scale passive NLOS dataset demonstrate\nthe superiority of the proposed method. The code is available at\nhttps://github.com/JerryOctopus/NLOS-LTM.\n","authors":["Jiarui Zhang","Ruixu Geng","Xiaolong Du","Yan Chen","Houqiang Li","Yang Hu"],"pdf_url":"https://arxiv.org/pdf/2312.16014v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17708v1","updated":"2024-03-26T13:54:52Z","published":"2024-03-26T13:54:52Z","title":"Panonut360: A Head and Eye Tracking Dataset for Panoramic Video","summary":" With the rapid development and widespread application of VR/AR technology,\nmaximizing the quality of immersive panoramic video services that match users'\npersonal preferences and habits has become a long-standing challenge.\nUnderstanding the saliency region where users focus, based on data collected\nwith HMDs, can promote multimedia encoding, transmission, and quality\nassessment. At the same time, large-scale datasets are essential for\nresearchers and developers to explore short/long-term user behavior patterns\nand train AI models related to panoramic videos. However, existing panoramic\nvideo datasets often include low-frequency user head or eye movement data\nthrough short-term videos only, lacking sufficient data for analyzing users'\nField of View (FoV) and generating video saliency regions.\n Driven by these practical factors, in this paper, we present a head and eye\ntracking dataset involving 50 users (25 males and 25 females) watching 15\npanoramic videos. The dataset provides details on the viewport and gaze\nattention locations of users. Besides, we present some statistics samples\nextracted from the dataset. For example, the deviation between head and eye\nmovements challenges the widely held assumption that gaze attention decreases\nfrom the center of the FoV following a Gaussian distribution. Our analysis\nreveals a consistent downward offset in gaze fixations relative to the FoV in\nexperimental settings involving multiple users and videos. That's why we name\nthe dataset Panonut, a saliency weighting shaped like a donut. Finally, we also\nprovide a script that generates saliency distributions based on given head or\neye coordinates and pre-generated saliency distribution map sets of each video\nfrom the collected eye tracking data.\n The dataset is available on website: https://dianvrlab.github.io/Panonut360/.\n","authors":["Yutong Xu","Junhao Du","Jiahe Wang","Yuwei Ning","Sihan Zhou Yang Cao"],"pdf_url":"https://arxiv.org/pdf/2403.17708v1.pdf","comment":"7 pages,ACM MMSys'24 accepted"},{"id":"http://arxiv.org/abs/2403.17702v1","updated":"2024-03-26T13:40:52Z","published":"2024-03-26T13:40:52Z","title":"The Solution for the CVPR 2023 1st foundation model challenge-Track2","summary":" In this paper, we propose a solution for cross-modal transportation\nretrieval. Due to the cross-domain problem of traffic images, we divide the\nproblem into two sub-tasks of pedestrian retrieval and vehicle retrieval\nthrough a simple strategy. In pedestrian retrieval tasks, we use IRRA as the\nbase model and specifically design an Attribute Classification to mine the\nknowledge implied by attribute labels. More importantly, We use the strategy of\nInclusion Relation Matching to make the image-text pairs with inclusion\nrelation have similar representation in the feature space. For the vehicle\nretrieval task, we use BLIP as the base model. 
Since aligning the color\nattributes of vehicles is challenging, we introduce attribute-based object\ndetection techniques to add color patch blocks to vehicle images for color data\naugmentation. This serves as strong prior information, helping the model\nperform the image-text alignment. At the same time, we incorporate labeled\nattributes into the image-text alignment loss to learn fine-grained alignment\nand prevent similar images and texts from being incorrectly separated. Our\napproach ranked first in the final B-board test with a score of 70.9.\n","authors":["Haonan Xu","Yurui Huang","Sishun Pan","Zhihao Guan","Yi Xu","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2403.17702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17701v1","updated":"2024-03-26T13:40:18Z","published":"2024-03-26T13:40:18Z","title":"Rotate to Scan: UNet-like Mamba with Triplet SSM Module for Medical\n Image Segmentation","summary":" Image segmentation holds a vital position in the realms of diagnosis and\ntreatment within the medical domain. Traditional convolutional neural networks\n(CNNs) and Transformer models have made significant advancements in this realm,\nbut they still encounter challenges because of limited receptive field or high\ncomputing complexity. Recently, State Space Models (SSMs), particularly Mamba\nand its variants, have demonstrated notable performance in the field of vision.\nHowever, their feature extraction methods may not be sufficiently effective and\nretain some redundant structures, leaving room for parameter reduction.\nMotivated by previous spatial and channel attention methods, we propose Triplet\nMamba-UNet. The method leverages residual VSS Blocks to extract intensive\ncontextual features, while Triplet SSM is employed to fuse features across\nspatial and channel dimensions. We conducted experiments on ISIC17, ISIC18,\nCVC-300, CVC-ClinicDB, Kvasir-SEG, CVC-ColonDB, and Kvasir-Instrument datasets,\ndemonstrating the superior segmentation performance of our proposed TM-UNet.\nAdditionally, compared to the previous VM-UNet, our model achieves a one-third\nreduction in parameters.\n","authors":["Hao Tang","Lianglun Cheng","Guoheng Huang","Zhengguang Tan","Junhao Lu","Kaihong Wu"],"pdf_url":"https://arxiv.org/pdf/2403.17701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17695v1","updated":"2024-03-26T13:35:10Z","published":"2024-03-26T13:35:10Z","title":"PlainMamba: Improving Non-Hierarchical Mamba in Visual Recognition","summary":" We present PlainMamba: a simple non-hierarchical state space model (SSM)\ndesigned for general visual recognition. The recent Mamba model has shown how\nSSMs can be highly competitive with other architectures on sequential data and\ninitial attempts have been made to apply it to images. In this paper, we\nfurther adapt the selective scanning process of Mamba to the visual domain,\nenhancing its ability to learn features from two-dimensional images by (i) a\ncontinuous 2D scanning process that improves spatial continuity by ensuring\nadjacency of tokens in the scanning sequence, and (ii) direction-aware updating\nwhich enables the model to discern the spatial relations of tokens by encoding\ndirectional information. Our architecture is designed to be easy to use and\neasy to scale, formed by stacking identical PlainMamba blocks, resulting in a\nmodel with constant width throughout all layers. The architecture is further\nsimplified by removing the need for special tokens. 
We evaluate PlainMamba on a\nvariety of visual recognition tasks including image classification, semantic\nsegmentation, object detection, and instance segmentation. Our method achieves\nperformance gains over previous non-hierarchical models and is competitive with\nhierarchical alternatives. For tasks requiring high-resolution inputs, in\nparticular, PlainMamba requires much less computing while maintaining high\nperformance. Code and models are available at\nhttps://github.com/ChenhongyiYang/PlainMamba\n","authors":["Chenhongyi Yang","Zehui Chen","Miguel Espinosa","Linus Ericsson","Zhenyu Wang","Jiaming Liu","Elliot J. Crowley"],"pdf_url":"https://arxiv.org/pdf/2403.17695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17694v1","updated":"2024-03-26T13:35:02Z","published":"2024-03-26T13:35:02Z","title":"AniPortrait: Audio-Driven Synthesis of Photorealistic Portrait Animation","summary":" In this study, we propose AniPortrait, a novel framework for generating\nhigh-quality animation driven by audio and a reference portrait image. Our\nmethodology is divided into two stages. Initially, we extract 3D intermediate\nrepresentations from audio and project them into a sequence of 2D facial\nlandmarks. Subsequently, we employ a robust diffusion model, coupled with a\nmotion module, to convert the landmark sequence into photorealistic and\ntemporally consistent portrait animation. Experimental results demonstrate the\nsuperiority of AniPortrait in terms of facial naturalness, pose diversity, and\nvisual quality, thereby offering an enhanced perceptual experience. Moreover,\nour methodology exhibits considerable potential in terms of flexibility and\ncontrollability, which can be effectively applied in areas such as facial\nmotion editing or face reenactment. We release code and model weights at\nhttps://github.com/scutzzj/AniPortrait\n","authors":["Huawei Wei","Zejun Yang","Zhisheng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.17694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17692v1","updated":"2024-03-26T13:33:16Z","published":"2024-03-26T13:33:16Z","title":"Manifold-Guided Lyapunov Control with Diffusion Models","summary":" This paper presents a novel approach to generating stabilizing controllers\nfor a large class of dynamical systems using diffusion models. The core\nobjective is to develop stabilizing control functions by identifying the\nclosest asymptotically stable vector field relative to a predetermined manifold\nand adjusting the control function based on this finding. To achieve this, we\nemploy a diffusion model trained on pairs consisting of asymptotically stable\nvector fields and their corresponding Lyapunov functions. 
Our numerical results\ndemonstrate that this pre-trained model can achieve stabilization over\npreviously unseen systems efficiently and rapidly, showcasing the potential of\nour approach in fast zero-shot control and generalizability.\n","authors":["Amartya Mukherjee","Thanin Quartz","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2403.17692v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2403.17691v1","updated":"2024-03-26T13:32:32Z","published":"2024-03-26T13:32:32Z","title":"Not All Similarities Are Created Equal: Leveraging Data-Driven Biases to\n Inform GenAI Copyright Disputes","summary":" The advent of Generative Artificial Intelligence (GenAI) models, including\nGitHub Copilot, OpenAI GPT, and Stable Diffusion, has revolutionized content\ncreation, enabling non-professionals to produce high-quality content across\nvarious domains. This transformative technology has led to a surge of synthetic\ncontent and sparked legal disputes over copyright infringement. To address\nthese challenges, this paper introduces a novel approach that leverages the\nlearning capacity of GenAI models for copyright legal analysis, demonstrated\nwith GPT2 and Stable Diffusion models. Copyright law distinguishes between\noriginal expressions and generic ones (Sc\\`enes \\`a faire), protecting the\nformer and permitting reproduction of the latter. However, this distinction has\nhistorically been challenging to make consistently, leading to over-protection\nof copyrighted works. GenAI offers an unprecedented opportunity to enhance this\nlegal analysis by revealing shared patterns in preexisting works. We propose a\ndata-driven approach to identify the genericity of works created by GenAI,\nemploying \"data-driven bias\" to assess the genericity of expressive\ncompositions. This approach aids in copyright scope determination by utilizing\nthe capabilities of GenAI to identify and prioritize expressive elements and\nrank them according to their frequency in the model's dataset. The potential\nimplications of measuring expressive genericity for copyright law are profound.\nSuch scoring could assist courts in determining copyright scope during\nlitigation, inform the registration practices of Copyright Offices, allowing\nregistration of only highly original synthetic works, and help copyright owners\nsignal the value of their works and facilitate fairer licensing deals. More\ngenerally, this approach offers valuable insights to policymakers grappling\nwith adapting copyright law to the challenges posed by the era of GenAI.\n","authors":["Uri Hacohen","Adi Haviv","Shahar Sarfaty","Bruria Friedman","Niva Elkin-Koren","Roi Livni","Amit H Bermano"],"pdf_url":"https://arxiv.org/pdf/2403.17691v1.pdf","comment":"Presented at ACM CSLAW 2024"},{"id":"http://arxiv.org/abs/2311.16081v2","updated":"2024-03-26T13:32:06Z","published":"2023-11-27T18:52:09Z","title":"ViT-Lens: Towards Omni-modal Representations","summary":" Aiming to advance AI agents, large foundation models significantly improve\nreasoning and instruction execution, yet the current focus on vision and\nlanguage neglects the potential of perceiving diverse modalities in open-world\nenvironments. However, the success of data-driven vision and language models is\ncostly or even infeasible to be reproduced for rare modalities. In this paper,\nwe present ViT-Lens-2 that facilitates efficient omni-modal representation\nlearning by perceiving novel modalities with a pretrained ViT and aligning them\nto a pre-defined space. 
Specifically, the modality-specific lens is tuned to\nproject any-modal signals to an intermediate embedding space, which are then\nprocessed by a strong ViT with pre-trained visual knowledge. The encoded\nrepresentations are optimized toward aligning with the modal-independent space,\npre-defined by off-the-shelf foundation models. ViT-Lens-2 provides a unified\nsolution for representation learning of increasing modalities with two\nappealing advantages: (i) Unlocking the great potential of pretrained ViTs to\nnovel modalities effectively with efficient data regime; (ii) Enabling emergent\ndownstream capabilities through modality alignment and shared ViT parameters.\nWe tailor ViT-Lens-2 to learn representations for 3D point cloud, depth, audio,\ntactile and EEG, and set new state-of-the-art results across various\nunderstanding tasks, such as zero-shot classification. By seamlessly\nintegrating ViT-Lens-2 into Multimodal Foundation Models, we enable\nAny-modality to Text and Image Generation in a zero-shot manner. Code and\nmodels are available at https://github.com/TencentARC/ViT-Lens.\n","authors":["Weixian Lei","Yixiao Ge","Kun Yi","Jianfeng Zhang","Difei Gao","Dylan Sun","Yuying Ge","Ying Shan","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2311.16081v2.pdf","comment":"This work is a follow-up of arXiv:2308.10185. Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2403.11708v3","updated":"2024-03-26T13:21:52Z","published":"2024-03-18T12:12:45Z","title":"Implicit Discriminative Knowledge Learning for Visible-Infrared Person\n Re-Identification","summary":" Visible-Infrared Person Re-identification (VI-ReID) is a challenging\ncross-modal pedestrian retrieval task, due to significant intra-class\nvariations and cross-modal discrepancies among different cameras. Existing\nworks mainly focus on embedding images of different modalities into a unified\nspace to mine modality-shared features. They only seek distinctive information\nwithin these shared features, while ignoring the identity-aware useful\ninformation that is implicit in the modality-specific features. To address this\nissue, we propose a novel Implicit Discriminative Knowledge Learning (IDKL)\nnetwork to uncover and leverage the implicit discriminative information\ncontained within the modality-specific. First, we extract modality-specific and\nmodality-shared features using a novel dual-stream network. Then, the\nmodality-specific features undergo purification to reduce their modality style\ndiscrepancies while preserving identity-aware discriminative knowledge.\nSubsequently, this kind of implicit knowledge is distilled into the\nmodality-shared feature to enhance its distinctiveness. Finally, an alignment\nloss is proposed to minimize modality discrepancy on enhanced modality-shared\nfeatures. Extensive experiments on multiple public datasets demonstrate the\nsuperiority of IDKL network over the state-of-the-art methods. Code is\navailable at https://github.com/1KK077/IDKL.\n","authors":["Kaijie Ren","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11708v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2311.17094v2","updated":"2024-03-26T13:21:43Z","published":"2023-11-28T06:17:49Z","title":"In Search of a Data Transformation That Accelerates Neural Field\n Training","summary":" Neural field is an emerging paradigm in data representation that trains a\nneural network to approximate the given signal. 
A key obstacle that prevents\nits widespread adoption is the encoding speed-generating neural fields requires\nan overfitting of a neural network, which can take a significant number of SGD\nsteps to reach the desired fidelity level. In this paper, we delve into the\nimpacts of data transformations on the speed of neural field training,\nspecifically focusing on how permuting pixel locations affect the convergence\nspeed of SGD. Counterintuitively, we find that randomly permuting the pixel\nlocations can considerably accelerate the training. To explain this phenomenon,\nwe examine the neural field training through the lens of PSNR curves, loss\nlandscapes, and error patterns. Our analyses suggest that the random pixel\npermutations remove the easy-to-fit patterns, which facilitate easy\noptimization in the early stage but hinder capturing fine details of the\nsignal.\n","authors":["Junwon Seo","Sangyoon Lee","Kwang In Kim","Jaeho Lee"],"pdf_url":"https://arxiv.org/pdf/2311.17094v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.02512v2","updated":"2024-03-26T13:21:28Z","published":"2023-12-05T05:36:44Z","title":"AV2AV: Direct Audio-Visual Speech to Audio-Visual Speech Translation\n with Unified Audio-Visual Speech Representation","summary":" This paper proposes a novel direct Audio-Visual Speech to Audio-Visual Speech\nTranslation (AV2AV) framework, where the input and output of the system are\nmultimodal (i.e., audio and visual speech). With the proposed AV2AV, two key\nadvantages can be brought: 1) We can perform real-like conversations with\nindividuals worldwide in a virtual meeting by utilizing our own primary\nlanguages. In contrast to Speech-to-Speech Translation (A2A), which solely\ntranslates between audio modalities, the proposed AV2AV directly translates\nbetween audio-visual speech. This capability enhances the dialogue experience\nby presenting synchronized lip movements along with the translated speech. 2)\nWe can improve the robustness of the spoken language translation system. By\nemploying the complementary information of audio-visual speech, the system can\neffectively translate spoken language even in the presence of acoustic noise,\nshowcasing robust performance. To mitigate the problem of the absence of a\nparallel AV2AV translation dataset, we propose to train our spoken language\ntranslation system with the audio-only dataset of A2A. This is done by learning\nunified audio-visual speech representations through self-supervised learning in\nadvance to train the translation system. Moreover, we propose an AV-Renderer\nthat can generate raw audio and video in parallel. It is designed with\nzero-shot speaker modeling, thus the speaker in source audio-visual speech can\nbe maintained at the target translated audio-visual speech. The effectiveness\nof AV2AV is evaluated with extensive experiments in a many-to-many language\ntranslation setting. Demo page is available on\nhttps://choijeongsoo.github.io/av2av.\n","authors":["Jeongsoo Choi","Se Jin Park","Minsu Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2312.02512v2.pdf","comment":"CVPR 2024. 
Code & Demo: https://choijeongsoo.github.io/av2av"},{"id":"http://arxiv.org/abs/2304.10417v3","updated":"2024-03-26T13:16:02Z","published":"2023-04-20T16:01:55Z","title":"SINC: Spatial Composition of 3D Human Motions for Simultaneous Action\n Generation","summary":" Our goal is to synthesize 3D human motions given textual inputs describing\nsimultaneous actions, for example 'waving hand' while 'walking' at the same\ntime. We refer to generating such simultaneous movements as performing 'spatial\ncompositions'. In contrast to temporal compositions that seek to transition\nfrom one action to another, spatial compositing requires understanding which\nbody parts are involved in which action, to be able to move them\nsimultaneously. Motivated by the observation that the correspondence between\nactions and body parts is encoded in powerful language models, we extract this\nknowledge by prompting GPT-3 with text such as \"what are the body parts\ninvolved in the action ?\", while also providing the parts list and\nfew-shot examples. Given this action-part mapping, we combine body parts from\ntwo motions together and establish the first automated method to spatially\ncompose two actions. However, training data with compositional actions is\nalways limited by the combinatorics. Hence, we further create synthetic data\nwith this approach, and use it to train a new state-of-the-art text-to-motion\ngeneration model, called SINC (\"SImultaneous actioN Compositions for 3D human\nmotions\"). In our experiments, that training with such GPT-guided synthetic\ndata improves spatial composition generation over baselines. Our code is\npublicly available at https://sinc.is.tue.mpg.de/.\n","authors":["Nikos Athanasiou","Mathis Petrovich","Michael J. Black","Gül Varol"],"pdf_url":"https://arxiv.org/pdf/2304.10417v3.pdf","comment":"Teaser Fixed"},{"id":"http://arxiv.org/abs/2403.14135v2","updated":"2024-03-26T13:15:12Z","published":"2024-03-21T05:10:26Z","title":"Powerful Lossy Compression for Noisy Images","summary":" Image compression and denoising represent fundamental challenges in image\nprocessing with many real-world applications. To address practical demands,\ncurrent solutions can be categorized into two main strategies: 1) sequential\nmethod; and 2) joint method. However, sequential methods have the disadvantage\nof error accumulation as there is information loss between multiple individual\nmodels. Recently, the academic community began to make some attempts to tackle\nthis problem through end-to-end joint methods. Most of them ignore that\ndifferent regions of noisy images have different characteristics. To solve\nthese problems, in this paper, our proposed signal-to-noise ratio~(SNR) aware\njoint solution exploits local and non-local features for image compression and\ndenoising simultaneously. We design an end-to-end trainable network, which\nincludes the main encoder branch, the guidance branch, and the signal-to-noise\nratio~(SNR) aware branch. 
We conducted extensive experiments on both synthetic\nand real-world datasets, demonstrating that our joint solution outperforms\nexisting state-of-the-art methods.\n","authors":["Shilv Cai","Xiaoguo Liang","Shuning Cao","Luxin Yan","Sheng Zhong","Liqun Chen","Xu Zou"],"pdf_url":"https://arxiv.org/pdf/2403.14135v2.pdf","comment":"Accepted by ICME 2024"},{"id":"http://arxiv.org/abs/2308.10185v2","updated":"2024-03-26T13:11:07Z","published":"2023-08-20T07:26:51Z","title":"ViT-Lens: Initiating Omni-Modal Exploration through 3D Insights","summary":" Though the success of CLIP-based training recipes in vision-language models,\ntheir scalability to more modalities (e.g., 3D, audio, etc.) is limited to\nlarge-scale data, which is expensive or even inapplicable for rare modalities.\nIn this paper, we present ViT-Lens that facilitates efficient omni-modal\nrepresentation learning by perceiving novel modalities with a pretrained ViT\nand aligning to a pre-defined space. Specifically, the modality-specific lens\nis tuned to project multimodal signals to the shared embedding space, which are\nthen processed by a strong ViT that carries pre-trained image knowledge. The\nencoded multimodal representations are optimized toward aligning with the\nmodal-independent space, pre-defined by off-the-shelf foundation models. A\nwell-trained lens with a ViT backbone has the potential to serve as one of\nthese foundation models, supervising the learning of subsequent modalities.\nViT-Lens provides a unified solution for representation learning of increasing\nmodalities with two appealing benefits: (i) Exploiting the pretrained ViT\nacross tasks and domains effectively with efficient data regime; (ii) Emergent\ndownstream capabilities of novel modalities are demonstrated due to the\nmodality alignment space. We evaluate ViT-Lens in the context of 3D as an\ninitial verification. In zero-shot 3D classification, ViT-Lens achieves\nsubstantial improvements over previous state-of-the-art, showing 52.0% accuracy\non Objaverse-LVIS, 87.4% on ModelNet40, and 60.6% on ScanObjectNN. Furthermore,\nwe enable zero-shot 3D question-answering by simply integrating the trained 3D\nlens into the InstructBLIP model without any adaptation. We will release the\nresults of ViT-Lens on more modalities in the near future.\n","authors":["Weixian Lei","Yixiao Ge","Jianfeng Zhang","Dylan Sun","Kun Yi","Ying Shan","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2308.10185v2.pdf","comment":"19 pages, 4 figures and 9 tables"},{"id":"http://arxiv.org/abs/2403.17678v1","updated":"2024-03-26T13:05:49Z","published":"2024-03-26T13:05:49Z","title":"Hierarchical Light Transformer Ensembles for Multimodal Trajectory\n Forecasting","summary":" Accurate trajectory forecasting is crucial for the performance of various\nsystems, such as advanced driver-assistance systems and self-driving vehicles.\nThese forecasts allow to anticipate events leading to collisions and,\ntherefore, to mitigate them. Deep Neural Networks have excelled in motion\nforecasting, but issues like overconfidence and uncertainty quantification\npersist. Deep Ensembles address these concerns, yet applying them to multimodal\ndistributions remains challenging. In this paper, we propose a novel approach\nnamed Hierarchical Light Transformer Ensembles (HLT-Ens), aimed at efficiently\ntraining an ensemble of Transformer architectures using a novel hierarchical\nloss function. 
HLT-Ens leverages grouped fully connected layers, inspired by\ngrouped convolution techniques, to capture multimodal distributions,\neffectively. Through extensive experimentation, we demonstrate that HLT-Ens\nachieves state-of-the-art performance levels, offering a promising avenue for\nimproving trajectory forecasting techniques.\n","authors":["Adrien Lafage","Mathieu Barbier","Gianni Franchi","David Filliat"],"pdf_url":"https://arxiv.org/pdf/2403.17678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17672v1","updated":"2024-03-26T13:02:38Z","published":"2024-03-26T13:02:38Z","title":"Predicting Perceived Gloss: Do Weak Labels Suffice?","summary":" Estimating perceptual attributes of materials directly from images is a\nchallenging task due to their complex, not fully-understood interactions with\nexternal factors, such as geometry and lighting. Supervised deep learning\nmodels have recently been shown to outperform traditional approaches, but rely\non large datasets of human-annotated images for accurate perception\npredictions. Obtaining reliable annotations is a costly endeavor, aggravated by\nthe limited ability of these models to generalise to different aspects of\nappearance. In this work, we show how a much smaller set of human annotations\n(\"strong labels\") can be effectively augmented with automatically derived \"weak\nlabels\" in the context of learning a low-dimensional image-computable gloss\nmetric. We evaluate three alternative weak labels for predicting human gloss\nperception from limited annotated data. Incorporating weak labels enhances our\ngloss prediction beyond the current state of the art. Moreover, it enables a\nsubstantial reduction in human annotation costs without sacrificing accuracy,\nwhether working with rendered images or real photographs.\n","authors":["Julia Guerrero-Viu","J. Daniel Subias","Ana Serrano","Katherine R. Storrs","Roland W. Fleming","Belen Masia","Diego Gutierrez"],"pdf_url":"https://arxiv.org/pdf/2403.17672v1.pdf","comment":"Computer Graphics Forum (Eurographics 2024)"},{"id":"http://arxiv.org/abs/2310.01819v3","updated":"2024-03-26T12:59:39Z","published":"2023-10-03T06:16:38Z","title":"TP2O: Creative Text Pair-to-Object Generation using Balance\n Swap-Sampling","summary":" Generating creative combinatorial objects from two seemingly unrelated object\ntexts is a challenging task in text-to-image synthesis, often hindered by a\nfocus on emulating existing data distributions. In this paper, we develop a\nstraightforward yet highly effective method, called \\textbf{balance\nswap-sampling}. First, we propose a swapping mechanism that generates a novel\ncombinatorial object image set by randomly exchanging intrinsic elements of two\ntext embeddings through a cutting-edge diffusion model. Second, we introduce a\nbalance swapping region to efficiently sample a small subset from the newly\ngenerated image set by balancing CLIP distances between the new images and\ntheir original generations, increasing the likelihood of accepting the\nhigh-quality combinations. Last, we employ a segmentation method to compare\nCLIP distances among the segmented components, ultimately selecting the most\npromising object from the sampled subset. Extensive experiments demonstrate\nthat our approach outperforms recent SOTA T2I methods. 
Surprisingly, our\nresults even rival those of human artists, such as frog-broccoli.\n","authors":["Jun Li","Zedong Zhang","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2310.01819v3.pdf","comment":"Project page: https://tp2o.github.io/anon/"},{"id":"http://arxiv.org/abs/2312.00869v2","updated":"2024-03-26T12:56:55Z","published":"2023-12-01T19:00:17Z","title":"Segment and Caption Anything","summary":" We propose a method to efficiently equip the Segment Anything Model (SAM)\nwith the ability to generate regional captions. SAM presents strong\ngeneralizability to segment anything while is short for semantic understanding.\nBy introducing a lightweight query-based feature mixer, we align the\nregion-specific features with the embedding space of language models for later\ncaption generation. As the number of trainable parameters is small (typically\nin the order of tens of millions), it costs less computation, less memory\nusage, and less communication bandwidth, resulting in both fast and scalable\ntraining. To address the scarcity problem of regional caption data, we propose\nto first pre-train our model on objection detection and segmentation tasks. We\ncall this step weak supervision pretraining since the pre-training data only\ncontains category names instead of full-sentence descriptions. The weak\nsupervision pretraining allows us to leverage many publicly available object\ndetection and segmentation datasets. We conduct extensive experiments to\ndemonstrate the superiority of our method and validate each design choice. This\nwork serves as a stepping stone towards scaling up regional captioning data and\nsheds light on exploring efficient ways to augment SAM with regional semantics.\nThe project page, along with the associated code, can be accessed via\nhttps://xk-huang.github.io/segment-caption-anything/.\n","authors":["Xiaoke Huang","Jianfeng Wang","Yansong Tang","Zheng Zhang","Han Hu","Jiwen Lu","Lijuan Wang","Zicheng Liu"],"pdf_url":"https://arxiv.org/pdf/2312.00869v2.pdf","comment":"The project page, along with the associated code, can be accessed via\n https://xk-huang.github.io/segment-caption-anything/; Update author\n information; Accepted by CVPR 24"},{"id":"http://arxiv.org/abs/2403.17664v1","updated":"2024-03-26T12:53:10Z","published":"2024-03-26T12:53:10Z","title":"DiffFAE: Advancing High-fidelity One-shot Facial Appearance Editing with\n Space-sensitive Customization and Semantic Preservation","summary":" Facial Appearance Editing (FAE) aims to modify physical attributes, such as\npose, expression and lighting, of human facial images while preserving\nattributes like identity and background, showing great importance in\nphotograph. In spite of the great progress in this area, current researches\ngenerally meet three challenges: low generation fidelity, poor attribute\npreservation, and inefficient inference. To overcome above challenges, this\npaper presents DiffFAE, a one-stage and highly-efficient diffusion-based\nframework tailored for high-fidelity FAE. For high-fidelity query attributes\ntransfer, we adopt Space-sensitive Physical Customization (SPC), which ensures\nthe fidelity and generalization ability by utilizing rendering texture derived\nfrom 3D Morphable Model (3DMM). In order to preserve source attributes, we\nintroduce the Region-responsive Semantic Composition (RSC). 
This module is\nguided to learn decoupled source-regarding features, thereby better preserving\nthe identity and alleviating artifacts from non-facial attributes such as hair,\nclothes, and background. We further introduce a consistency regularization for\nour pipeline to enhance editing controllability by leveraging prior knowledge\nin the attention matrices of diffusion model. Extensive experiments demonstrate\nthe superiority of DiffFAE over existing methods, achieving state-of-the-art\nperformance in facial appearance editing.\n","authors":["Qilin Wang","Jiangning Zhang","Chengming Xu","Weijian Cao","Ying Tai","Yue Han","Yanhao Ge","Hong Gu","Chengjie Wang","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2403.17664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14149v4","updated":"2024-03-26T12:47:12Z","published":"2023-12-21T18:59:06Z","title":"TagAlign: Improving Vision-Language Alignment with Multi-Tag\n Classification","summary":" The crux of learning vision-language models is to extract semantically\naligned information from visual and linguistic data. Existing attempts usually\nface the problem of coarse alignment, e.g., the vision encoder struggles in\nlocalizing an attribute-specified object. In this work, we propose an\nembarrassingly simple approach to better align image and text features with no\nneed of additional data formats other than image-text pairs. Concretely, given\nan image and its paired text, we manage to parse objects (e.g., cat) and\nattributes (e.g., black) from the description, which are highly likely to exist\nin the image. It is noteworthy that the parsing pipeline is fully automatic and\nthus enjoys good scalability. With these parsed semantics as supervision\nsignals, we can complement the commonly used image-text contrastive loss with\nthe multi-tag classification loss. Extensive experimental results on a broad\nsuite of semantic segmentation datasets substantiate the average 5.2\\%\nimprovement of our framework over existing alternatives. Furthermore, the\nvisualization results indicate that attribute supervision makes vision-language\nmodels accurately localize attribute-specified objects. Project page can be\nfound at https://qinying-liu.github.io/Tag-Align.\n","authors":["Qinying Liu","Wei Wu","Kecheng Zheng","Zhan Tong","Jiawei Liu","Yu Liu","Wei Chen","Zilei Wang","Yujun Shen"],"pdf_url":"https://arxiv.org/pdf/2312.14149v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03246v5","updated":"2024-03-26T12:35:03Z","published":"2024-02-05T18:03:53Z","title":"SGS-SLAM: Semantic Gaussian Splatting For Neural Dense SLAM","summary":" We present SGS-SLAM, the first semantic visual SLAM system based on Gaussian\nSplatting. It incorporates appearance, geometry, and semantic features through\nmulti-channel optimization, addressing the oversmoothing limitations of neural\nimplicit SLAM systems in high-quality rendering, scene understanding, and\nobject-level geometry. We introduce a unique semantic feature loss that\neffectively compensates for the shortcomings of traditional depth and color\nlosses in object optimization. 
Through a semantic-guided keyframe selection\nstrategy, we prevent erroneous reconstructions caused by cumulative errors.\nExtensive experiments demonstrate that SGS-SLAM delivers state-of-the-art\nperformance in camera pose estimation, map reconstruction, precise semantic\nsegmentation, and object-level geometric accuracy, while ensuring real-time\nrendering capabilities.\n","authors":["Mingrui Li","Shuhong Liu","Heng Zhou","Guohao Zhu","Na Cheng","Tianchen Deng","Hongyu Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03246v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17651v1","updated":"2024-03-26T12:31:58Z","published":"2024-03-26T12:31:58Z","title":"Exploring Dynamic Transformer for Efficient Object Tracking","summary":" The speed-precision trade-off is a critical problem for visual object\ntracking which usually requires low latency and deployment on constrained\nresources. Existing solutions for efficient tracking mainly focus on adopting\nlight-weight backbones or modules, which nevertheless come at the cost of a\nsacrifice in precision. In this paper, inspired by dynamic network routing, we\npropose DyTrack, a dynamic transformer framework for efficient tracking.\nReal-world tracking scenarios exhibit diverse levels of complexity. We argue\nthat a simple network is sufficient for easy frames in video sequences, while\nmore computation could be assigned to difficult ones. DyTrack automatically\nlearns to configure proper reasoning routes for various inputs, gaining better\nutilization of the available computational budget. Thus, it can achieve higher\nperformance with the same running speed. We formulate instance-specific\ntracking as a sequential decision problem and attach terminating branches to\nintermediate layers of the entire model. Especially, to fully utilize the\ncomputations, we introduce the feature recycling mechanism to reuse the outputs\nof predecessors. Furthermore, a target-aware self-distillation strategy is\ndesigned to enhance the discriminating capabilities of early predictions by\neffectively mimicking the representation pattern of the deep model. Extensive\nexperiments on multiple benchmarks demonstrate that DyTrack achieves promising\nspeed-precision trade-offs with only a single model. For instance, DyTrack\nobtains 64.9% AUC on LaSOT with a speed of 256 fps.\n","authors":["Jiawen Zhu","Xin Chen","Haiwen Diao","Shuai Li","Jun-Yan He","Chenyang Li","Bin Luo","Dong Wang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2403.17651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02109v2","updated":"2024-03-26T12:28:02Z","published":"2023-12-04T18:39:00Z","title":"ArtAdapter: Text-to-Image Style Transfer using Multi-Level Style Encoder\n and Explicit Adaptation","summary":" This work introduces ArtAdapter, a transformative text-to-image (T2I) style\ntransfer framework that transcends traditional limitations of color,\nbrushstrokes, and object shape, capturing high-level style elements such as\ncomposition and distinctive artistic expression. The integration of a\nmulti-level style encoder with our proposed explicit adaptation mechanism\nenables ArtAdapter to achieve unprecedented fidelity in style transfer,\nensuring close alignment with textual descriptions. 
Additionally, the\nincorporation of an Auxiliary Content Adapter (ACA) effectively separates\ncontent from style, alleviating the borrowing of content from style references.\nMoreover, our novel fast finetuning approach could further enhance zero-shot\nstyle representation while mitigating the risk of overfitting. Comprehensive\nevaluations confirm that ArtAdapter surpasses current state-of-the-art methods.\n","authors":["Dar-Yen Chen","Hamish Tennent","Ching-Wen Hsu"],"pdf_url":"https://arxiv.org/pdf/2312.02109v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17639v1","updated":"2024-03-26T12:21:47Z","published":"2024-03-26T12:21:47Z","title":"High-Resolution Image Translation Model Based on Grayscale Redefinition","summary":" Image-to-image translation is a technique that focuses on transferring images\nfrom one domain to another while maintaining the essential content\nrepresentations. In recent years, image-to-image translation has gained\nsignificant attention and achieved remarkable advancements due to its diverse\napplications in computer vision and image processing tasks. In this work, we\npropose an innovative method for image translation between different domains.\nFor high-resolution image translation tasks, we use a grayscale adjustment\nmethod to achieve pixel-level translation. For other tasks, we utilize the\nPix2PixHD model with a coarse-to-fine generator, multi-scale discriminator, and\nimproved loss to enhance the image translation performance. On the other hand,\nto tackle the issue of sparse training data, we adopt model weight\ninitialization from other task to optimize the performance of the current task.\n","authors":["Xixian Wu","Dian Chao","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2403.17639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17638v1","updated":"2024-03-26T12:17:46Z","published":"2024-03-26T12:17:46Z","title":"Learning with Unreliability: Fast Few-shot Voxel Radiance Fields with\n Relative Geometric Consistency","summary":" We propose a voxel-based optimization framework, ReVoRF, for few-shot\nradiance fields that strategically address the unreliability in pseudo novel\nview synthesis. Our method pivots on the insight that relative depth\nrelationships within neighboring regions are more reliable than the absolute\ncolor values in disoccluded areas. Consequently, we devise a bilateral\ngeometric consistency loss that carefully navigates the trade-off between color\nfidelity and geometric accuracy in the context of depth consistency for\nuncertain regions. Moreover, we present a reliability-guided learning strategy\nto discern and utilize the variable quality across synthesized views,\ncomplemented by a reliability-aware voxel smoothing algorithm that smoothens\nthe transition between reliable and unreliable data patches. Our approach\nallows for a more nuanced use of all available data, promoting enhanced\nlearning from regions previously considered unsuitable for high-quality\nreconstruction. Extensive experiments across diverse datasets reveal that our\napproach attains significant gains in efficiency and accuracy, delivering\nrendering speeds of 3 FPS, 7 mins to train a $360^\\circ$ scene, and a 5\\%\nimprovement in PSNR over existing few-shot methods. 
Code is available at\nhttps://github.com/HKCLynn/ReVoRF.\n","authors":["Yingjie Xu","Bangzhen Liu","Hao Tang","Bailin Deng","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2403.17638v1.pdf","comment":"CVPR 2024 final version"},{"id":"http://arxiv.org/abs/2403.15010v2","updated":"2024-03-26T12:16:14Z","published":"2024-03-22T07:47:13Z","title":"Clean-image Backdoor Attacks","summary":" To gather a significant quantity of annotated training data for\nhigh-performance image classification models, numerous companies opt to enlist\nthird-party providers to label their unlabeled data. This practice is widely\nregarded as secure, even in cases where some annotated errors occur, as the\nimpact of these minor inaccuracies on the final performance of the models is\nnegligible and existing backdoor attacks require attacker's ability to poison\nthe training images. Nevertheless, in this paper, we propose clean-image\nbackdoor attacks which uncover that backdoors can still be injected via a\nfraction of incorrect labels without modifying the training images.\nSpecifically, in our attacks, the attacker first seeks a trigger feature to\ndivide the training images into two parts: those with the feature and those\nwithout it. Subsequently, the attacker falsifies the labels of the former part\nto a backdoor class. The backdoor will be finally implanted into the target\nmodel after it is trained on the poisoned data. During the inference phase, the\nattacker can activate the backdoor in two ways: slightly modifying the input\nimage to obtain the trigger feature, or taking an image that naturally has the\ntrigger feature as input. We conduct extensive experiments to demonstrate the\neffectiveness and practicality of our attacks. According to the experimental\nresults, we conclude that our attacks seriously jeopardize the fairness and\nrobustness of image classification models, and it is necessary to be vigilant\nabout the incorrect labels in outsourced labeling.\n","authors":["Dazhong Rong","Guoyao Yu","Shuheng Shen","Xinyi Fu","Peng Qian","Jianhai Chen","Qinming He","Xing Fu","Weiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.15010v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06683v2","updated":"2024-03-26T12:10:13Z","published":"2024-03-11T12:57:51Z","title":"Transferring Relative Monocular Depth to Surgical Vision with Temporal\n Consistency","summary":" Relative monocular depth, inferring depth up to shift and scale from a single\nimage, is an active research topic. Recent deep learning models, trained on\nlarge and varied meta-datasets, now provide excellent performance in the domain\nof natural images. However, few datasets exist which provide ground truth depth\nfor endoscopic images, making training such models from scratch unfeasible.\nThis work investigates the transfer of these models into the surgical domain,\nand presents an effective and simple way to improve on standard supervision\nthrough the use of temporal consistency self-supervision. We show temporal\nconsistency significantly improves supervised training alone when transferring\nto the low-data regime of endoscopy, and outperforms the prevalent\nself-supervision technique for this task. In addition we show our method\ndrastically outperforms the state-of-the-art method from within the domain of\nendoscopy. 
We also release our code, model and ensembled meta-dataset,\nMeta-MED, establishing a strong benchmark for future work.\n","authors":["Charlie Budd","Tom Vercauteren"],"pdf_url":"https://arxiv.org/pdf/2403.06683v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17633v1","updated":"2024-03-26T12:08:14Z","published":"2024-03-26T12:08:14Z","title":"UADA3D: Unsupervised Adversarial Domain Adaptation for 3D Object\n Detection with Sparse LiDAR and Large Domain Gaps","summary":" In this study, we address a gap in existing unsupervised domain adaptation\napproaches on LiDAR-based 3D object detection, which have predominantly\nconcentrated on adapting between established, high-density autonomous driving\ndatasets. We focus on sparser point clouds, capturing scenarios from different\nperspectives: not just from vehicles on the road but also from mobile robots on\nsidewalks, which encounter significantly different environmental conditions and\nsensor configurations. We introduce Unsupervised Adversarial Domain Adaptation\nfor 3D Object Detection (UADA3D). UADA3D does not depend on pre-trained source\nmodels or teacher-student architectures. Instead, it uses an adversarial\napproach to directly learn domain-invariant features. We demonstrate its\nefficacy in various adaptation scenarios, showing significant improvements in\nboth self-driving car and mobile robot domains. Our code is open-source and\nwill be available soon.\n","authors":["Maciej K Wozniak","Mattias Hansson","Marko Thiel","Patric Jensfelt"],"pdf_url":"https://arxiv.org/pdf/2403.17633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17631v1","updated":"2024-03-26T12:08:04Z","published":"2024-03-26T12:08:04Z","title":"AniArtAvatar: Animatable 3D Art Avatar from a Single Image","summary":" We present a novel approach for generating animatable 3D-aware art avatars\nfrom a single image, with controllable facial expressions, head poses, and\nshoulder movements. Unlike previous reenactment methods, our approach utilizes\na view-conditioned 2D diffusion model to synthesize multi-view images from a\nsingle art portrait with a neutral expression. With the generated colors and\nnormals, we synthesize a static avatar using an SDF-based neural surface. For\navatar animation, we extract control points, transfer the motion with these\npoints, and deform the implicit canonical space. Firstly, we render the front\nimage of the avatar, extract the 2D landmarks, and project them to the 3D space\nusing a trained SDF network. We extract 3D driving landmarks using 3DMM and\ntransfer the motion to the avatar landmarks. To animate the avatar pose, we\nmanually set the body height and bound the head and torso of an avatar with two\ncages. The head and torso can be animated by transforming the two cages. Our\napproach is a one-shot pipeline that can be applied to various styles.\nExperiments demonstrate that our method can generate high-quality 3D art\navatars with desired control over different motions.\n","authors":["Shaoxu Li"],"pdf_url":"https://arxiv.org/pdf/2403.17631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01598v3","updated":"2024-03-26T11:54:40Z","published":"2023-06-02T15:09:19Z","title":"Towards Source-free Domain Adaptive Semantic Segmentation via\n Importance-aware and Prototype-contrast Learning","summary":" Domain adaptive semantic segmentation enables robust pixel-wise understanding\nin real-world driving scenes. 
Source-free domain adaptation, as a more\npractical technique, addresses the concerns of data privacy and storage\nlimitations in typical unsupervised domain adaptation methods, making it\nespecially relevant in the context of intelligent vehicles. It utilizes a\nwell-trained source model and unlabeled target data to achieve adaptation in\nthe target domain. However, in the absence of source data and target labels,\ncurrent solutions cannot sufficiently reduce the impact of domain shift and\nfully leverage the information from the target data. In this paper, we propose\nan end-to-end source-free domain adaptation semantic segmentation method via\nImportance-Aware and Prototype-Contrast (IAPC) learning. The proposed IAPC\nframework effectively extracts domain-invariant knowledge from the well-trained\nsource model and learns domain-specific knowledge from the unlabeled target\ndomain. Specifically, considering the problem of domain shift in the prediction\nof the target domain by the source model, we put forward an importance-aware\nmechanism for the biased target prediction probability distribution to extract\ndomain-invariant knowledge from the source model. We further introduce a\nprototype-contrast strategy, which includes a prototype-symmetric cross-entropy\nloss and a prototype-enhanced cross-entropy loss, to learn target intra-domain\nknowledge without relying on labels. A comprehensive variety of experiments on\ntwo domain adaptive semantic segmentation benchmarks demonstrates that the\nproposed end-to-end IAPC solution outperforms existing state-of-the-art\nmethods. The source code is publicly available at\nhttps://github.com/yihong-97/Source-free-IAPC.\n","authors":["Yihong Cao","Hui Zhang","Xiao Lu","Zheng Xiao","Kailun Yang","Yaonan Wang"],"pdf_url":"https://arxiv.org/pdf/2306.01598v3.pdf","comment":"Accepted to IEEE Transactions on Intelligent Vehicles (T-IV). The\n source code is publicly available at\n https://github.com/yihong-97/Source-free-IAPC"},{"id":"http://arxiv.org/abs/2310.17569v2","updated":"2024-03-26T11:52:23Z","published":"2023-10-26T16:58:01Z","title":"SD4Match: Learning to Prompt Stable Diffusion Model for Semantic\n Matching","summary":" In this paper, we address the challenge of matching semantically similar\nkeypoints across image pairs. Existing research indicates that the intermediate\noutput of the UNet within the Stable Diffusion (SD) can serve as robust image\nfeature maps for such a matching task. We demonstrate that by employing a basic\nprompt tuning technique, the inherent potential of Stable Diffusion can be\nharnessed, resulting in a significant enhancement in accuracy over previous\napproaches. We further introduce a novel conditional prompting module that\nconditions the prompt on the local details of the input image pairs, leading to\na further improvement in performance. We designate our approach as SD4Match,\nshort for Stable Diffusion for Semantic Matching. Comprehensive evaluations of\nSD4Match on the PF-Pascal, PF-Willow, and SPair-71k datasets show that it sets\nnew benchmarks in accuracy across all these datasets. Particularly, SD4Match\noutperforms the previous state-of-the-art by a margin of 12 percentage points\non the challenging SPair-71k dataset.\n","authors":["Xinghui Li","Jingyi Lu","Kai Han","Victor Prisacariu"],"pdf_url":"https://arxiv.org/pdf/2310.17569v2.pdf","comment":"Accepted to CVPR 2024. 
Project website:\n https://sd4match.active.vision/"},{"id":"http://arxiv.org/abs/2403.17615v1","updated":"2024-03-26T11:48:37Z","published":"2024-03-26T11:48:37Z","title":"Grad-CAMO: Learning Interpretable Single-Cell Morphological Profiles\n from 3D Cell Painting Images","summary":" Despite their black-box nature, deep learning models are extensively used in\nimage-based drug discovery to extract feature vectors from single cells in\nmicroscopy images. To better understand how these networks perform\nrepresentation learning, we employ visual explainability techniques (e.g.,\nGrad-CAM). Our analyses reveal several mechanisms by which supervised models\ncheat, exploiting biologically irrelevant pixels when extracting morphological\nfeatures from images, such as noise in the background. This raises doubts\nregarding the fidelity of learned single-cell representations and their\nrelevance when investigating downstream biological questions. To address this\nmisalignment between researcher expectations and machine behavior, we introduce\nGrad-CAMO, a novel single-cell interpretability score for supervised feature\nextractors. Grad-CAMO measures the proportion of a model's attention that is\nconcentrated on the cell of interest versus the background. This metric can be\nassessed per-cell or averaged across a validation set, offering a tool to audit\nindividual features vectors or guide the improved design of deep learning\narchitectures. Importantly, Grad-CAMO seamlessly integrates into existing\nworkflows, requiring no dataset or model modifications, and is compatible with\nboth 2D and 3D Cell Painting data. Additional results are available at\nhttps://github.com/eigenvivek/Grad-CAMO.\n","authors":["Vivek Gopalakrishnan","Jingzhe Ma","Zhiyong Xie"],"pdf_url":"https://arxiv.org/pdf/2403.17615v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17610v1","updated":"2024-03-26T11:43:05Z","published":"2024-03-26T11:43:05Z","title":"MMVP: A Multimodal MoCap Dataset with Vision and Pressure Sensors","summary":" Foot contact is an important cue not only for human motion capture but also\nfor motion understanding and physically plausible motion generation. However,\nmost of the foot-contact annotations in existing datasets are estimated by\npurely visual matching and distance thresholding, which results in low accuracy\nand coarse granularity. Even though existing multimodal datasets\nsynergistically capture plantar pressure (foot contact) and visual signals,\nthey are specifically designed for small-range and slow motion such as Taiji\nQuan and Yoga. Therefore, there is still a lack of a vision-pressure multimodal\ndataset with large-range and fast human motion, as well as accurate and dense\nfoot-contact annotation. To fill this gap, we propose a Multimodal MoCap\nDataset with Vision and Pressure sensors, named MMVP. MMVP provides accurate\nand dense plantar pressure signals synchronized with RGBD observations, which\nis especially useful for both plausible shape estimation, robust pose fitting\nwithout foot drifting, and accurate global translation tracking. To validate\nthe dataset, we propose an RGBD-P SMPL fitting method and also a\nmonocular-video-based baseline framework, VP-MoCap, for human motion capture.\nExperiments demonstrate that our RGBD-P SMPL Fitting results significantly\noutperform pure visual motion capture. Moreover, VP-MoCap outperforms SOTA\nmethods in foot-contact and global translation estimation accuracy. 
We believe\nthe configuration of the dataset and the baseline frameworks will stimulate the\nresearch in this direction and also provide a good reference for MoCap\napplications in various domains. Project page:\nhttps://haolyuan.github.io/MMVP-Dataset/.\n","authors":["He Zhang","Shenghao Ren","Haolei Yuan","Jianhui Zhao","Fan Li","Shuangpeng Sun","Zhenghao Liang","Tao Yu","Qiu Shen","Xun Cao"],"pdf_url":"https://arxiv.org/pdf/2403.17610v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2403.17608v1","updated":"2024-03-26T11:39:00Z","published":"2024-03-26T11:39:00Z","title":"Fake or JPEG? Revealing Common Biases in Generated Image Detection\n Datasets","summary":" The widespread adoption of generative image models has highlighted the urgent\nneed to detect artificial content, which is a crucial step in combating\nwidespread manipulation and misinformation. Consequently, numerous detectors\nand associated datasets have emerged. However, many of these datasets\ninadvertently introduce undesirable biases, thereby impacting the effectiveness\nand evaluation of detectors. In this paper, we emphasize that many datasets for\nAI-generated image detection contain biases related to JPEG compression and\nimage size. Using the GenImage dataset, we demonstrate that detectors indeed\nlearn from these undesired factors. Furthermore, we show that removing the\nnamed biases substantially increases robustness to JPEG compression and\nsignificantly alters the cross-generator performance of evaluated detectors.\nSpecifically, it leads to more than 11 percentage points increase in\ncross-generator performance for ResNet50 and Swin-T detectors on the GenImage\ndataset, achieving state-of-the-art results.\n We provide the dataset and source codes of this paper on the anonymous\nwebsite: https://www.unbiased-genimage.org\n","authors":["Patrick Grommelt","Louis Weiss","Franz-Josef Pfreundt","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2403.17608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04701v3","updated":"2024-03-26T11:26:17Z","published":"2024-03-07T17:48:48Z","title":"ObjectCompose: Evaluating Resilience of Vision-Based Models on\n Object-to-Background Compositional Changes","summary":" Given the large-scale multi-modal training of recent vision-based models and\ntheir generalization capabilities, understanding the extent of their robustness\nis critical for their real-world deployment. In this work, we evaluate the\nresilience of current vision-based models against diverse object-to-background\ncontext variations. The majority of robustness evaluation methods have\nintroduced synthetic datasets to induce changes to object characteristics\n(viewpoints, scale, color) or utilized image transformation techniques\n(adversarial changes, common corruptions) on real images to simulate shifts in\ndistributions. Recent works have explored leveraging large language models and\ndiffusion models to generate changes in the background. However, these methods\neither lack in offering control over the changes to be made or distort the\nobject semantics, making them unsuitable for the task. Our method, on the other\nhand, can induce diverse object-to-background changes while preserving the\noriginal semantics and appearance of the object. To achieve this goal, we\nharness the generative capabilities of text-to-image, image-to-text, and\nimage-to-segment models to automatically generate a broad spectrum of\nobject-to-background changes. 
We induce both natural and adversarial background\nchanges by either modifying the textual prompts or optimizing the latents and\ntextual embedding of text-to-image models. We produce various versions of\nstandard vision datasets (ImageNet, COCO), incorporating either diverse and\nrealistic backgrounds into the images or introducing color, texture, and\nadversarial changes in the background. We conduct extensive experiment to\nanalyze the robustness of vision-based models against object-to-background\ncontext variations across diverse tasks. Code\nhttps://github.com/Muhammad-Huzaifaa/ObjectCompose.git\n","authors":["Hashmat Shadab Malik","Muhammad Huzaifa","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.04701v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13518v2","updated":"2024-03-26T11:16:47Z","published":"2024-03-20T11:38:30Z","title":"Motion Generation from Fine-grained Textual Descriptions","summary":" The task of text2motion is to generate human motion sequences from given\ntextual descriptions, where the model explores diverse mappings from natural\nlanguage instructions to human body movements. While most existing works are\nconfined to coarse-grained motion descriptions, e.g., \"A man squats.\",\nfine-grained descriptions specifying movements of relevant body parts are\nbarely explored. Models trained with coarse-grained texts may not be able to\nlearn mappings from fine-grained motion-related words to motion primitives,\nresulting in the failure to generate motions from unseen descriptions. In this\npaper, we build a large-scale language-motion dataset specializing in\nfine-grained textual descriptions, FineHumanML3D, by feeding GPT-3.5-turbo with\nstep-by-step instructions with pseudo-code compulsory checks. Accordingly, we\ndesign a new text2motion model, FineMotionDiffuse, making full use of\nfine-grained textual information. Our quantitative evaluation shows that\nFineMotionDiffuse trained on FineHumanML3D improves FID by a large margin of\n0.38, compared with competitive baselines. According to the qualitative\nevaluation and case study, our model outperforms MotionDiffuse in generating\nspatially or chronologically composite motions, by learning the implicit\nmappings from fine-grained descriptions to the corresponding basic motions. We\nrelease our data at https://github.com/KunhangL/finemotiondiffuse.\n","authors":["Kunhang Li","Yansong Feng"],"pdf_url":"https://arxiv.org/pdf/2403.13518v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15905v2","updated":"2024-03-26T11:11:49Z","published":"2024-03-23T18:19:02Z","title":"Towards Low-Energy Adaptive Personalization for Resource-Constrained\n Devices","summary":" The personalization of machine learning (ML) models to address data drift is\na significant challenge in the context of Internet of Things (IoT)\napplications. Presently, most approaches focus on fine-tuning either the full\nbase model or its last few layers to adapt to new data, while often neglecting\nenergy costs. However, various types of data drift exist, and fine-tuning the\nfull base model or the last few layers may not result in optimal performance in\ncertain scenarios. We propose Target Block Fine-Tuning (TBFT), a low-energy\nadaptive personalization framework designed for resource-constrained devices.\nWe categorize data drift and personalization into three types: input-level,\nfeature-level, and output-level. 
For each type, we fine-tune different blocks\nof the model to achieve optimal performance with reduced energy costs.\nSpecifically, input-, feature-, and output-level correspond to fine-tuning the\nfront, middle, and rear blocks of the model. We evaluate TBFT on a ResNet\nmodel, three datasets, three different training sizes, and a Raspberry Pi.\nCompared with the $Block Avg$, where each block is fine-tuned individually and\ntheir performance improvements are averaged, TBFT exhibits an improvement in\nmodel accuracy by an average of 15.30% whilst saving 41.57% energy consumption\non average compared with full fine-tuning.\n","authors":["Yushan Huang","Josh Millar","Yuxuan Long","Yuchen Zhao","Hamed Hadaddi"],"pdf_url":"https://arxiv.org/pdf/2403.15905v2.pdf","comment":"Accepetd to The 4th Workshop on Machine Learning and Systems\n (EuroMLSys '24)"},{"id":"http://arxiv.org/abs/2403.07576v2","updated":"2024-03-26T10:55:51Z","published":"2024-03-12T12:05:43Z","title":"FPT: Fine-grained Prompt Tuning for Parameter and Memory Efficient Fine\n Tuning in High-resolution Medical Image Classification","summary":" Parameter-efficient fine-tuning (PEFT) is proposed as a cost-effective way to\ntransfer pre-trained models to downstream tasks, avoiding the high cost of\nupdating entire large-scale pre-trained models (LPMs). In this work, we present\nFine-grained Prompt Tuning (FPT), a novel PEFT method for medical image\nclassification. FPT significantly reduces memory consumption compared to other\nPEFT methods, especially in high-resolution contexts. To achieve this, we first\nfreeze the weights of the LPM and construct a learnable lightweight side\nnetwork. The frozen LPM takes high-resolution images as input to extract\nfine-grained features, while the side network is fed low-resolution images to\nreduce memory usage. To allow the side network to access pre-trained knowledge,\nwe introduce fine-grained prompts that summarize information from the LPM\nthrough a fusion module. Important tokens selection and preloading techniques\nare employed to further reduce training cost and memory requirements. We\nevaluate FPT on four medical datasets with varying sizes, modalities, and\ncomplexities. Experimental results demonstrate that FPT achieves comparable\nperformance to fine-tuning the entire LPM while using only 1.8% of the\nlearnable parameters and 13% of the memory costs of an encoder ViT-B model with\na 512 x 512 input resolution.\n","authors":["Yijin Huang","Pujin Cheng","Roger Tam","Xiaoying Tang"],"pdf_url":"https://arxiv.org/pdf/2403.07576v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17589v1","updated":"2024-03-26T10:54:07Z","published":"2024-03-26T10:54:07Z","title":"Dual Memory Networks: A Versatile Adaptation Approach for\n Vision-Language Models","summary":" With the emergence of pre-trained vision-language models like CLIP, how to\nadapt them to various downstream classification tasks has garnered significant\nattention in recent research. The adaptation strategies can be typically\ncategorized into three paradigms: zero-shot adaptation, few-shot adaptation,\nand the recently-proposed training-free few-shot adaptation. Most existing\napproaches are tailored for a specific setting and can only cater to one or two\nof these paradigms. In this paper, we introduce a versatile adaptation approach\nthat can effectively work under all three settings. 
Specifically, we propose\nthe dual memory networks that comprise dynamic and static memory components.\nThe static memory caches training data knowledge, enabling training-free\nfew-shot adaptation, while the dynamic memory preserves historical test\nfeatures online during the testing process, allowing for the exploration of\nadditional data insights beyond the training set. This novel capability\nenhances model performance in the few-shot setting and enables model usability\nin the absence of training data. The two memory networks employ the same\nflexible memory interactive strategy, which can operate in a training-free mode\nand can be further enhanced by incorporating learnable projection layers. Our\napproach is tested across 11 datasets under the three task settings.\nRemarkably, in the zero-shot scenario, it outperforms existing methods by over\n3\\% and even shows superior results against methods utilizing external training\ndata. Additionally, our method exhibits robust performance against natural\ndistribution shifts. Codes are available at \\url{https://github.com/YBZh/DMN}.\n","authors":["Yabin Zhang","Wenjie Zhu","Hui Tang","Zhiyuan Ma","Kaiyang Zhou","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.17589v1.pdf","comment":"CVPR2024; Codes are available at \\url{https://github.com/YBZh/DMN}"},{"id":"http://arxiv.org/abs/2311.13385v3","updated":"2024-03-26T10:21:46Z","published":"2023-11-22T13:27:36Z","title":"SegVol: Universal and Interactive Volumetric Medical Image Segmentation","summary":" Precise image segmentation provides clinical study with instructive\ninformation. Despite the remarkable progress achieved in medical image\nsegmentation, there is still an absence of 3D foundation segmentation model\nthat can segment a wide range of anatomical categories with easy user\ninteraction. In this paper, we propose a 3D foundation segmentation model,\nnamed SegVol, supporting universal and interactive volumetric medical image\nsegmentation. By scaling up training data to 90K unlabeled Computed Tomography\n(CT) volumes and 6K labeled CT volumes, this foundation model supports the\nsegmentation of over 200 anatomical categories using semantic and spatial\nprompts. Extensive experiments on 10 internal validation tasks and 18 external\nvalidation tasks verify that SegVol outperforms the state of the art by a large\nmargin. Through its capacity to provide precise volumetric segmentation across\nvarious anatomical categories, SegVol has the potential to accelerate\nadvancements in medical imaging diagnosis and facilitate treatment\noptimization. The model and code are publicly available at:\nhttps://github.com/BAAI-DCAI/SegVol.\n","authors":["Yuxin Du","Fan Bai","Tiejun Huang","Bo Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.13385v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03611v2","updated":"2024-03-26T10:13:11Z","published":"2023-12-06T16:55:53Z","title":"DreamComposer: Controllable 3D Object Generation via Multi-View\n Conditions","summary":" Utilizing pre-trained 2D large-scale generative models, recent works are\ncapable of generating high-quality novel views from a single in-the-wild image.\nHowever, due to the lack of information from multiple views, these works\nencounter difficulties in generating controllable novel views. 
In this paper,\nwe present DreamComposer, a flexible and scalable framework that can enhance\nexisting view-aware diffusion models by injecting multi-view conditions.\nSpecifically, DreamComposer first uses a view-aware 3D lifting module to obtain\n3D representations of an object from multiple views. Then, it renders the\nlatent features of the target view from 3D representations with the multi-view\nfeature fusion module. Finally the target view features extracted from\nmulti-view inputs are injected into a pre-trained diffusion model. Experiments\nshow that DreamComposer is compatible with state-of-the-art diffusion models\nfor zero-shot novel view synthesis, further enhancing them to generate\nhigh-fidelity novel view images with multi-view conditions, ready for\ncontrollable 3D object reconstruction and various other applications.\n","authors":["Yunhan Yang","Yukun Huang","Xiaoyang Wu","Yuan-Chen Guo","Song-Hai Zhang","Hengshuang Zhao","Tong He","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2312.03611v2.pdf","comment":"Project Page: https://yhyang-myron.github.io/DreamComposer/"},{"id":"http://arxiv.org/abs/2312.08879v2","updated":"2024-03-26T10:04:11Z","published":"2023-12-12T11:00:39Z","title":"Regularizing Self-supervised 3D Scene Flows with Surface Awareness and\n Cyclic Consistency","summary":" Learning without supervision how to predict 3D scene flows from point clouds\nis essential to many perception systems. We propose a novel learning framework\nfor this task which improves the necessary regularization. Relying on the\nassumption that scene elements are mostly rigid, current smoothness losses are\nbuilt on the definition of ``rigid clusters\" in the input point clouds. The\ndefinition of these clusters is challenging and has a significant impact on the\nquality of predicted flows. We introduce two new consistency losses that\nenlarge clusters while preventing them from spreading over distinct objects. In\nparticular, we enforce \\emph{temporal} consistency with a forward-backward\ncyclic loss and \\emph{spatial} consistency by considering surface orientation\nsimilarity in addition to spatial proximity. The proposed losses are\nmodel-independent and can thus be used in a plug-and-play fashion to\nsignificantly improve the performance of existing models, as demonstrated on\ntwo most widely used architectures. We also showcase the effectiveness and\ngeneralization capability of our framework on four standard sensor-unique\ndriving datasets, achieving state-of-the-art performance in 3D scene flow\nestimation. Our codes are available on https://github.com/ctu-vras/sac-flow.\n","authors":["Patrik Vacek","David Hurych","Karel Zimmermann","Patrick Perez","Tomas Svoboda"],"pdf_url":"https://arxiv.org/pdf/2312.08879v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17550v1","updated":"2024-03-26T09:58:06Z","published":"2024-03-26T09:58:06Z","title":"DeepMIF: Deep Monotonic Implicit Fields for Large-Scale LiDAR 3D Mapping","summary":" Recently, significant progress has been achieved in sensing real large-scale\noutdoor 3D environments, particularly by using modern acquisition equipment\nsuch as LiDAR sensors. Unfortunately, they are fundamentally limited in their\nability to produce dense, complete 3D scenes. To address this issue, recent\nlearning-based methods integrate neural implicit representations and\noptimizable feature grids to approximate surfaces of 3D scenes. 
However,\nnaively fitting samples along raw LiDAR rays leads to noisy 3D mapping results\ndue to the nature of sparse, conflicting LiDAR measurements. Instead, in this\nwork we depart from fitting LiDAR data exactly, instead letting the network\noptimize a non-metric monotonic implicit field defined in 3D space. To fit our\nfield, we design a learning system integrating a monotonicity loss that enables\noptimizing neural monotonic fields and leverages recent progress in large-scale\n3D mapping. Our algorithm achieves high-quality dense 3D mapping performance as\ncaptured by multiple quantitative and perceptual measures and visual results\nobtained for Mai City, Newer College, and KITTI benchmarks. The code of our\napproach will be made publicly available.\n","authors":["Kutay Yılmaz","Matthias Nießner","Anastasiia Kornilova","Alexey Artemov"],"pdf_url":"https://arxiv.org/pdf/2403.17550v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.17549v1","updated":"2024-03-26T09:55:49Z","published":"2024-03-26T09:55:49Z","title":"Practical Applications of Advanced Cloud Services and Generative AI\n Systems in Medical Image Analysis","summary":" The medical field is one of the important fields in the application of\nartificial intelligence technology. With the explosive growth and\ndiversification of medical data, as well as the continuous improvement of\nmedical needs and challenges, artificial intelligence technology is playing an\nincreasingly important role in the medical field. Artificial intelligence\ntechnologies represented by computer vision, natural language processing, and\nmachine learning have been widely penetrated into diverse scenarios such as\nmedical imaging, health management, medical information, and drug research and\ndevelopment, and have become an important driving force for improving the level\nand quality of medical services.The article explores the transformative\npotential of generative AI in medical imaging, emphasizing its ability to\ngenerate syntheticACM-2 data, enhance images, aid in anomaly detection, and\nfacilitate image-to-image translation. Despite challenges like model\ncomplexity, the applications of generative models in healthcare, including\nMed-PaLM 2 technology, show promising results. By addressing limitations in\ndataset size and diversity, these models contribute to more accurate diagnoses\nand improved patient outcomes. However, ethical considerations and\ncollaboration among stakeholders are essential for responsible implementation.\nThrough experiments leveraging GANs to augment brain tumor MRI datasets, the\nstudy demonstrates how generative AI can enhance image quality and diversity,\nultimately advancing medical diagnostics and patient care.\n","authors":["Jingyu Xu","Binbin Wu","Jiaxin Huang","Yulu Gong","Yifan Zhang","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2403.17549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17545v1","updated":"2024-03-26T09:49:35Z","published":"2024-03-26T09:49:35Z","title":"A Gaze-grounded Visual Question Answering Dataset for Clarifying\n Ambiguous Japanese Questions","summary":" Situated conversations, which refer to visual information as visual question\nanswering (VQA), often contain ambiguities caused by reliance on directive\ninformation. This problem is exacerbated because some languages, such as\nJapanese, often omit subjective or objective terms. 
Such ambiguities in\nquestions are often clarified by the contexts in conversational situations,\nsuch as joint attention with a user or user gaze information. In this study, we\npropose the Gaze-grounded VQA dataset (GazeVQA) that clarifies ambiguous\nquestions using gaze information by focusing on a clarification process\ncomplemented by gaze information. We also propose a method that utilizes gaze\ntarget estimation results to improve the accuracy of GazeVQA tasks. Our\nexperimental results showed that the proposed method improved the performance\nin some cases of a VQA system on GazeVQA and identified some typical problems\nof GazeVQA tasks that need to be improved.\n","authors":["Shun Inadumi","Seiya Kawano","Akishige Yuguchi","Yasutomo Kawanishi","Koichiro Yoshino"],"pdf_url":"https://arxiv.org/pdf/2403.17545v1.pdf","comment":"LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2403.17541v1","updated":"2024-03-26T09:44:34Z","published":"2024-03-26T09:44:34Z","title":"WordRobe: Text-Guided Generation of Textured 3D Garments","summary":" In this paper, we tackle a new and challenging problem of text-driven\ngeneration of 3D garments with high-quality textures. We propose \"WordRobe\", a\nnovel framework for the generation of unposed & textured 3D garment meshes from\nuser-friendly text prompts. We achieve this by first learning a latent\nrepresentation of 3D garments using a novel coarse-to-fine training strategy\nand a loss for latent disentanglement, promoting better latent interpolation.\nSubsequently, we align the garment latent space to the CLIP embedding space in\na weakly supervised manner, enabling text-driven 3D garment generation and\nediting. For appearance modeling, we leverage the zero-shot generation\ncapability of ControlNet to synthesize view-consistent texture maps in a single\nfeed-forward inference step, thereby drastically decreasing the generation time\nas compared to existing methods. We demonstrate superior performance over\ncurrent SOTAs for learning 3D garment latent space, garment interpolation, and\ntext-driven texture synthesis, supported by quantitative evaluation and\nqualitative user study. The unposed 3D garment meshes generated using WordRobe\ncan be directly fed to standard cloth simulation & animation pipelines without\nany post-processing.\n","authors":["Astitva Srivastava","Pranav Manu","Amit Raj","Varun Jampani","Avinash Sharma"],"pdf_url":"https://arxiv.org/pdf/2403.17541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17537v1","updated":"2024-03-26T09:42:28Z","published":"2024-03-26T09:42:28Z","title":"NeRF-HuGS: Improved Neural Radiance Fields in Non-static Scenes Using\n Heuristics-Guided Segmentation","summary":" Neural Radiance Field (NeRF) has been widely recognized for its excellence in\nnovel view synthesis and 3D scene reconstruction. However, their effectiveness\nis inherently tied to the assumption of static scenes, rendering them\nsusceptible to undesirable artifacts when confronted with transient distractors\nsuch as moving objects or shadows. In this work, we propose a novel paradigm,\nnamely \"Heuristics-Guided Segmentation\" (HuGS), which significantly enhances\nthe separation of static scenes from transient distractors by harmoniously\ncombining the strengths of hand-crafted heuristics and state-of-the-art\nsegmentation models, thus significantly transcending the limitations of\nprevious solutions. 
Furthermore, we delve into the meticulous design of\nheuristics, introducing a seamless fusion of Structure-from-Motion (SfM)-based\nheuristics and color residual heuristics, catering to a diverse range of\ntexture profiles. Extensive experiments demonstrate the superiority and\nrobustness of our method in mitigating transient distractors for NeRFs trained\nin non-static scenes. Project page: https://cnhaox.github.io/NeRF-HuGS/.\n","authors":["Jiahao Chen","Yipeng Qin","Lingjie Liu","Jiangbo Lu","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2403.17537v1.pdf","comment":"To appear in CVPR2024"},{"id":"http://arxiv.org/abs/2403.17530v1","updated":"2024-03-26T09:36:20Z","published":"2024-03-26T09:36:20Z","title":"Boosting Few-Shot Learning with Disentangled Self-Supervised Learning\n and Meta-Learning for Medical Image Classification","summary":" Background and objective: Employing deep learning models in critical domains\nsuch as medical imaging poses challenges associated with the limited\navailability of training data. We present a strategy for improving the\nperformance and generalization capabilities of models trained in low-data\nregimes. Methods: The proposed method starts with a pre-training phase, where\nfeatures learned in a self-supervised learning setting are disentangled to\nimprove the robustness of the representations for downstream tasks. We then\nintroduce a meta-fine-tuning step, leveraging related classes between\nmeta-training and meta-testing phases but varying the granularity level. This\napproach aims to enhance the model's generalization capabilities by exposing it\nto more challenging classification tasks during meta-training and evaluating it\non easier tasks but holding greater clinical relevance during meta-testing. We\ndemonstrate the effectiveness of the proposed approach through a series of\nexperiments exploring several backbones, as well as diverse pre-training and\nfine-tuning schemes, on two distinct medical tasks, i.e., classification of\nprostate cancer aggressiveness from MRI data and classification of breast\ncancer malignity from microscopic images. Results: Our results indicate that\nthe proposed approach consistently yields superior performance w.r.t. ablation\nexperiments, maintaining competitiveness even when a distribution shift between\ntraining and evaluation data occurs. Conclusion: Extensive experiments\ndemonstrate the effectiveness and wide applicability of the proposed approach.\nWe hope that this work will add another solution to the arsenal of addressing\nlearning issues in data-scarce imaging domains.\n","authors":["Eva Pachetti","Sotirios A. Tsaftaris","Sara Colantonio"],"pdf_url":"https://arxiv.org/pdf/2403.17530v1.pdf","comment":"20 pages, 4 figures, 4 tables. Submitted to Elsevier on 25 March 2024"},{"id":"http://arxiv.org/abs/2207.12730v2","updated":"2024-03-26T09:35:03Z","published":"2022-07-26T08:34:17Z","title":"P2ANet: A Dataset and Benchmark for Dense Action Detection from Table\n Tennis Match Broadcasting Videos","summary":" While deep learning has been widely used for video analytics, such as video\nclassification and action detection, dense action detection with fast-moving\nsubjects from sports videos is still challenging. 
In this work, we release yet\nanother sports video benchmark \\TheName{} for \\emph{\\underline{P}}ing\n\\emph{\\underline{P}}ong-\\emph{\\underline{A}}ction detection, which consists of\n2,721 video clips collected from the broadcasting videos of professional table\ntennis matches in World Table Tennis Championships and Olympiads. We work with\na crew of table tennis professionals and referees on a specially designed\nannotation toolbox to obtain fine-grained action labels (in 14 classes) for\nevery ping-pong action that appeared in the dataset, and formulate two sets of\naction detection problems -- \\emph{action localization} and \\emph{action\nrecognition}. We evaluate a number of commonly-seen action recognition (e.g.,\nTSM, TSN, Video SwinTransformer, and Slowfast) and action localization models\n(e.g., BSN, BSN++, BMN, TCANet), using \\TheName{} for both problems, under\nvarious settings. These models can only achieve 48\\% area under the AR-AN curve\nfor localization and 82\\% top-one accuracy for recognition since the ping-pong\nactions are dense with fast-moving subjects but broadcasting videos are with\nonly 25 FPS. The results confirm that \\TheName{} is still a challenging task\nand can be used as a special benchmark for dense action detection from videos.\n","authors":["Jiang Bian","Xuhong Li","Tao Wang","Qingzhong Wang","Jun Huang","Chen Liu","Jun Zhao","Feixiang Lu","Dejing Dou","Haoyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2207.12730v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12378v2","updated":"2024-03-26T09:31:28Z","published":"2023-09-21T11:47:01Z","title":"Unsupervised Semantic Segmentation Through Depth-Guided Feature\n Correlation and Sampling","summary":" Traditionally, training neural networks to perform semantic segmentation\nrequired expensive human-made annotations. But more recently, advances in the\nfield of unsupervised learning have made significant progress on this issue and\ntowards closing the gap to supervised algorithms. To achieve this, semantic\nknowledge is distilled by learning to correlate randomly sampled features from\nimages across an entire dataset. In this work, we build upon these advances by\nincorporating information about the structure of the scene into the training\nprocess through the use of depth information. We achieve this by (1) learning\ndepth-feature correlation by spatially correlate the feature maps with the\ndepth maps to induce knowledge about the structure of the scene and (2)\nimplementing farthest-point sampling to more effectively select relevant\nfeatures by utilizing 3D sampling techniques on depth information of the scene.\nFinally, we demonstrate the effectiveness of our technical contributions\nthrough extensive experimentation and present significant improvements in\nperformance across multiple benchmark datasets.\n","authors":["Leon Sick","Dominik Engel","Pedro Hermosilla","Timo Ropinski"],"pdf_url":"https://arxiv.org/pdf/2309.12378v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17525v1","updated":"2024-03-26T09:26:12Z","published":"2024-03-26T09:26:12Z","title":"Equipping Sketch Patches with Context-Aware Positional Encoding for\n Graphic Sketch Representation","summary":" The drawing order of a sketch records how it is created stroke-by-stroke by a\nhuman being. 
For graphic sketch representation learning, recent studies have\ninjected sketch drawing orders into graph edge construction by linking each\npatch to another in accordance to a temporal-based nearest neighboring\nstrategy. However, such constructed graph edges may be unreliable, since a\nsketch could have variants of drawings. In this paper, we propose a\nvariant-drawing-protected method by equipping sketch patches with context-aware\npositional encoding (PE) to make better use of drawing orders for learning\ngraphic sketch representation. Instead of injecting sketch drawings into graph\nedges, we embed these sequential information into graph nodes only. More\nspecifically, each patch embedding is equipped with a sinusoidal absolute PE to\nhighlight the sequential position in the drawing order. And its neighboring\npatches, ranked by the values of self-attention scores between patch\nembeddings, are equipped with learnable relative PEs to restore the contextual\npositions within a neighborhood. During message aggregation via graph\nconvolutional networks, a node receives both semantic contents from patch\nembeddings and contextual patterns from PEs by its neighbors, arriving at\ndrawing-order-enhanced sketch representations. Experimental results indicate\nthat our method significantly improves sketch healing and controllable sketch\nsynthesis.\n","authors":["Sicong Zang","Zhijun Fang"],"pdf_url":"https://arxiv.org/pdf/2403.17525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17520v1","updated":"2024-03-26T09:22:37Z","published":"2024-03-26T09:22:37Z","title":"Boosting Adversarial Training via Fisher-Rao Norm-based Regularization","summary":" Adversarial training is extensively utilized to improve the adversarial\nrobustness of deep neural networks. Yet, mitigating the degradation of standard\ngeneralization performance in adversarial-trained models remains an open\nproblem. This paper attempts to resolve this issue through the lens of model\ncomplexity. First, We leverage the Fisher-Rao norm, a geometrically invariant\nmetric for model complexity, to establish the non-trivial bounds of the\nCross-Entropy Loss-based Rademacher complexity for a ReLU-activated Multi-Layer\nPerceptron. Then we generalize a complexity-related variable, which is\nsensitive to the changes in model width and the trade-off factors in\nadversarial training. Moreover, intensive empirical evidence validates that\nthis variable highly correlates with the generalization gap of Cross-Entropy\nloss between adversarial-trained and standard-trained models, especially during\nthe initial and final phases of the training process. Building upon this\nobservation, we propose a novel regularization framework, called Logit-Oriented\nAdversarial Training (LOAT), which can mitigate the trade-off between\nrobustness and accuracy while imposing only a negligible increase in\ncomputational overhead. Our extensive experiments demonstrate that the proposed\nregularization strategy can boost the performance of the prevalent adversarial\ntraining algorithms, including PGD-AT, TRADES, TRADES (LSE), MART, and DM-AT,\nacross various network architectures. 
Our code will be available at\nhttps://github.com/TrustAI/LOAT.\n","authors":["Xiangyu Yin","Wenjie Ruan"],"pdf_url":"https://arxiv.org/pdf/2403.17520v1.pdf","comment":"This paper has been accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2312.04529v2","updated":"2024-03-26T09:21:29Z","published":"2023-12-07T18:50:00Z","title":"Diffusion Reflectance Map: Single-Image Stochastic Inverse Rendering of\n Illumination and Reflectance","summary":" Reflectance bounds the frequency spectrum of illumination in the object\nappearance. In this paper, we introduce the first stochastic inverse rendering\nmethod, which recovers the attenuated frequency spectrum of an illumination\njointly with the reflectance of an object of known geometry from a single\nimage. Our key idea is to solve this blind inverse problem in the reflectance\nmap, an appearance representation invariant to the underlying geometry, by\nlearning to reverse the image formation with a novel diffusion model which we\nrefer to as the Diffusion Reflectance Map Network (DRMNet). Given an observed\nreflectance map converted and completed from the single input image, DRMNet\ngenerates a reflectance map corresponding to a perfect mirror sphere while\njointly estimating the reflectance. The forward process can be understood as\ngradually filtering a natural illumination with lower and lower frequency\nreflectance and additive Gaussian noise. DRMNet learns to invert this process\nwith two subnetworks, IllNet and RefNet, which work in concert towards this\njoint estimation. The network is trained on an extensive synthetic dataset and\nis demonstrated to generalize to real images, showing state-of-the-art accuracy\non established datasets.\n","authors":["Yuto Enyo","Ko Nishino"],"pdf_url":"https://arxiv.org/pdf/2312.04529v2.pdf","comment":"to be published in CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17512v1","updated":"2024-03-26T09:13:06Z","published":"2024-03-26T09:13:06Z","title":"Random-coupled Neural Network","summary":" Improving the efficiency of current neural networks and modeling them in\nbiological neural systems have become popular research directions in recent\nyears. Pulse-coupled neural network (PCNN) is a well applicated model for\nimitating the computation characteristics of the human brain in computer vision\nand neural network fields. However, differences between the PCNN and biological\nneural systems remain: limited neural connection, high computational cost, and\nlack of stochastic property. In this study, random-coupled neural network\n(RCNN) is proposed. It overcomes these difficulties in PCNN's neuromorphic\ncomputing via a random inactivation process. This process randomly closes some\nneural connections in the RCNN model, realized by the random inactivation\nweight matrix of link input. This releases the computational burden of PCNN,\nmaking it affordable to achieve vast neural connections. Furthermore, the image\nand video processing mechanisms of RCNN are researched. It encodes constant\nstimuli as periodic spike trains and periodic stimuli as chaotic spike trains,\nthe same as biological neural information encoding characteristics. Finally,\nthe RCNN is applicated to image segmentation, fusion, and pulse shape\ndiscrimination subtasks. 
It is demonstrated to be robust, efficient, and highly\nanti-noised, with outstanding performance in all applications mentioned above.\n","authors":["Haoran Liu","Mingzhe Liu","Peng Li","Jiahui Wu","Xin Jiang","Zhuo Zuo","Bingqi Liu"],"pdf_url":"https://arxiv.org/pdf/2403.17512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13660v2","updated":"2024-03-26T09:09:15Z","published":"2024-03-20T15:08:57Z","title":"ProMamba: Prompt-Mamba for polyp segmentation","summary":" Detecting polyps through colonoscopy is an important task in medical image\nsegmentation, which provides significant assistance and reference value for\nclinical surgery. However, accurate segmentation of polyps is a challenging\ntask due to two main reasons. Firstly, polyps exhibit various shapes and\ncolors. Secondly, the boundaries between polyps and their normal surroundings\nare often unclear. Additionally, significant differences between different\ndatasets lead to limited generalization capabilities of existing methods. To\naddress these issues, we propose a segmentation model based on Prompt-Mamba,\nwhich incorporates the latest Vision-Mamba and prompt technologies. Compared to\nprevious models trained on the same dataset, our model not only maintains high\nsegmentation accuracy on the validation part of the same dataset but also\ndemonstrates superior accuracy on unseen datasets, exhibiting excellent\ngeneralization capabilities. Notably, we are the first to apply the\nVision-Mamba architecture to polyp segmentation and the first to utilize prompt\ntechnology in a polyp segmentation model. Our model efficiently accomplishes\nsegmentation tasks, surpassing previous state-of-the-art methods by an average\nof 5% across six datasets. Furthermore, we have developed multiple versions of\nour model with scaled parameter counts, achieving better performance than\nprevious models even with fewer parameters. Our code and trained weights will\nbe released soon.\n","authors":["Jianhao Xie","Ruofan Liao","Ziang Zhang","Sida Yi","Yuesheng Zhu","Guibo Luo"],"pdf_url":"https://arxiv.org/pdf/2403.13660v2.pdf","comment":"10 pages, 2 figures,3 tabels"},{"id":"http://arxiv.org/abs/2403.17503v1","updated":"2024-03-26T09:04:18Z","published":"2024-03-26T09:04:18Z","title":"DS-AL: A Dual-Stream Analytic Learning for Exemplar-Free\n Class-Incremental Learning","summary":" Class-incremental learning (CIL) under an exemplar-free constraint has\npresented a significant challenge. Existing methods adhering to this constraint\nare prone to catastrophic forgetting, far more so than replay-based techniques\nthat retain access to past samples. In this paper, to solve the exemplar-free\nCIL problem, we propose a Dual-Stream Analytic Learning (DS-AL) approach. The\nDS-AL contains a main stream offering an analytical (i.e., closed-form) linear\nsolution, and a compensation stream improving the inherent under-fitting\nlimitation due to adopting linear mapping. The main stream redefines the CIL\nproblem into a Concatenated Recursive Least Squares (C-RLS) task, allowing an\nequivalence between the CIL and its joint-learning counterpart. The\ncompensation stream is governed by a Dual-Activation Compensation (DAC) module.\nThis module re-activates the embedding with a different activation function\nfrom the main stream one, and seeks fitting compensation by projecting the\nembedding to the null space of the main stream's linear mapping. 
Empirical\nresults demonstrate that the DS-AL, despite being an exemplar-free technique,\ndelivers performance comparable with or better than that of replay-based\nmethods across various datasets, including CIFAR-100, ImageNet-100 and\nImageNet-Full. Additionally, the C-RLS' equivalent property allows the DS-AL to\nexecute CIL in a phase-invariant manner. This is evidenced by a\nnever-before-seen 500-phase CIL ImageNet task, which performs on a level\nidentical to a 5-phase one. Our codes are available at\nhttps://github.com/ZHUANGHP/Analytic-continual-learning.\n","authors":["Huiping Zhuang","Run He","Kai Tong","Ziqian Zeng","Cen Chen","Zhiping Lin"],"pdf_url":"https://arxiv.org/pdf/2403.17503v1.pdf","comment":"Accepted in AAAI 2024"},{"id":"http://arxiv.org/abs/2403.17502v1","updated":"2024-03-26T09:03:40Z","published":"2024-03-26T09:03:40Z","title":"SeNM-VAE: Semi-Supervised Noise Modeling with Hierarchical Variational\n Autoencoder","summary":" The data bottleneck has emerged as a fundamental challenge in learning based\nimage restoration methods. Researchers have attempted to generate synthesized\ntraining data using paired or unpaired samples to address this challenge. This\nstudy proposes SeNM-VAE, a semi-supervised noise modeling method that leverages\nboth paired and unpaired datasets to generate realistic degraded data. Our\napproach is based on modeling the conditional distribution of degraded and\nclean images with a specially designed graphical model. Under the variational\ninference framework, we develop an objective function for handling both paired\nand unpaired data. We employ our method to generate paired training samples for\nreal-world image denoising and super-resolution tasks. Our approach excels in\nthe quality of synthetic degraded images compared to other unpaired and paired\nnoise modeling methods. Furthermore, our approach demonstrates remarkable\nperformance in downstream image restoration tasks, even with limited paired\ndata. With more paired data, our method achieves the best performance on the\nSIDD dataset.\n","authors":["Dihan Zheng","Yihang Zou","Xiaowen Zhang","Chenglong Bao"],"pdf_url":"https://arxiv.org/pdf/2403.17502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17497v1","updated":"2024-03-26T08:58:28Z","published":"2024-03-26T08:58:28Z","title":"Sharing the Cost of Success: A Game for Evaluating and Learning\n Collaborative Multi-Agent Instruction Giving and Following Policies","summary":" In collaborative goal-oriented settings, the participants are not only\ninterested in achieving a successful outcome, but do also implicitly negotiate\nthe effort they put into the interaction (by adapting to each other). In this\nwork, we propose a challenging interactive reference game that requires two\nplayers to coordinate on vision and language observations. The learning signal\nin this game is a score (given after playing) that takes into account the\nachieved goal and the players' assumed efforts during the interaction. We show\nthat a standard Proximal Policy Optimization (PPO) setup achieves a high\nsuccess rate when bootstrapped with heuristic partner behaviors that implement\ninsights from the analysis of human-human interactions. And we find that a\npairing of neural partners indeed reduces the measured joint effort when\nplaying together repeatedly. 
However, we observe that in comparison to a\nreasonable heuristic pairing there is still room for improvement -- which\ninvites further research in the direction of cost-sharing in collaborative\ninteractions.\n","authors":["Philipp Sadler","Sherzod Hakimov","David Schlangen"],"pdf_url":"https://arxiv.org/pdf/2403.17497v1.pdf","comment":"9 pages, Accepted at LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2310.05370v2","updated":"2024-03-26T08:54:49Z","published":"2023-10-09T02:59:21Z","title":"SocialCircle: Learning the Angle-based Social Interaction Representation\n for Pedestrian Trajectory Prediction","summary":" Analyzing and forecasting trajectories of agents like pedestrians and cars in\ncomplex scenes has become more and more significant in many intelligent systems\nand applications. The diversity and uncertainty in socially interactive\nbehaviors among a rich variety of agents make this task more challenging than\nother deterministic computer vision tasks. Researchers have made a lot of\nefforts to quantify the effects of these interactions on future trajectories\nthrough different mathematical models and network structures, but this problem\nhas not been well solved. Inspired by marine animals that localize the\npositions of their companions underwater through echoes, we build a new\nanglebased trainable social interaction representation, named SocialCircle, for\ncontinuously reflecting the context of social interactions at different angular\norientations relative to the target agent. We validate the effect of the\nproposed SocialCircle by training it along with several newly released\ntrajectory prediction models, and experiments show that the SocialCircle not\nonly quantitatively improves the prediction performance, but also qualitatively\nhelps better simulate social interactions when forecasting pedestrian\ntrajectories in a way that is consistent with human intuitions.\n","authors":["Conghao Wong","Beihao Xia","Ziqian Zou","Yulong Wang","Xinge You"],"pdf_url":"https://arxiv.org/pdf/2310.05370v2.pdf","comment":"CVPR 2024 accepted"},{"id":"http://arxiv.org/abs/2403.17496v1","updated":"2024-03-26T08:53:25Z","published":"2024-03-26T08:53:25Z","title":"Dr.Hair: Reconstructing Scalp-Connected Hair Strands without\n Pre-training via Differentiable Rendering of Line Segments","summary":" In the film and gaming industries, achieving a realistic hair appearance\ntypically involves the use of strands originating from the scalp. However,\nreconstructing these strands from observed surface images of hair presents\nsignificant challenges. The difficulty in acquiring Ground Truth (GT) data has\nled state-of-the-art learning-based methods to rely on pre-training with\nmanually prepared synthetic CG data. This process is not only labor-intensive\nand costly but also introduces complications due to the domain gap when\ncompared to real-world data. In this study, we propose an optimization-based\napproach that eliminates the need for pre-training. Our method represents hair\nstrands as line segments growing from the scalp and optimizes them using a\nnovel differentiable rendering algorithm. To robustly optimize a substantial\nnumber of slender explicit geometries, we introduce 3D orientation estimation\nutilizing global optimization, strand initialization based on Laplace's\nequation, and reparameterization that leverages geometric connectivity and\nspatial proximity. 
Unlike existing optimization-based methods, our method is\ncapable of reconstructing internal hair flow in an absolute direction. Our\nmethod exhibits robust and accurate inverse rendering, surpassing the quality\nof existing methods and significantly improving processing speed.\n","authors":["Yusuke Takimoto","Hikari Takehara","Hiroyuki Sato","Zihao Zhu","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.17496v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.13039v2","updated":"2024-03-26T08:52:05Z","published":"2024-03-19T16:21:47Z","title":"Emotic Masked Autoencoder with Attention Fusion for Facial Expression\n Recognition","summary":" Facial Expression Recognition (FER) is a critical task within computer vision\nwith diverse applications across various domains. Addressing the challenge of\nlimited FER datasets, which hampers the generalization capability of expression\nrecognition models, is imperative for enhancing performance. Our paper presents\nan innovative approach integrating the MAE-Face self-supervised learning (SSL)\nmethod and Fusion Attention mechanism for expression classification,\nparticularly showcased in the 6th Affective Behavior\nAnalysis in-the-wild (ABAW) competition. Additionally,\nwe propose preprocessing techniques to emphasize essential facial features,\nthereby enhancing model performance on both training and validation sets,\nnotably demonstrated on the Aff-wild2 dataset.\n","authors":["Bach Nguyen-Xuan","Thien Nguyen-Hoang","Nhu Tai-Do"],"pdf_url":"https://arxiv.org/pdf/2403.13039v2.pdf","comment":"6 pages; added references for section 1; corrected typo for email\n author"},{"id":"http://arxiv.org/abs/2403.13653v2","updated":"2024-03-26T08:45:09Z","published":"2024-03-20T14:58:40Z","title":"Learning User Embeddings from Human Gaze for Personalised Saliency\n Prediction","summary":" Reusable embeddings of user behaviour have shown significant performance\nimprovements for the personalised saliency prediction task. However, prior\nworks require explicit user characteristics and preferences as input, which are\noften difficult to obtain. We present a novel method to extract user embeddings\nfrom pairs of natural images and corresponding saliency maps generated from a\nsmall amount of user-specific eye tracking data. At the core of our method is a\nSiamese convolutional neural encoder that learns the user embeddings by\ncontrasting the image and personal saliency map pairs of different users.\nEvaluations on two public saliency datasets show that the generated embeddings\nhave high discriminative power, are effective at refining universal saliency\nmaps to the individual users, and generalise well across users and images.\nFinally, based on our model's ability to encode individual user\ncharacteristics, our work points towards other applications that can benefit\nfrom reusable embeddings of gaze behaviour.\n","authors":["Florian Strohm","Mihai Bâce","Andreas Bulling"],"pdf_url":"https://arxiv.org/pdf/2403.13653v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17726v2","updated":"2024-03-26T08:38:52Z","published":"2024-02-27T17:58:09Z","title":"VRP-SAM: SAM with Visual Reference Prompt","summary":" In this paper, we propose a novel Visual Reference Prompt (VRP) encoder that\nempowers the Segment Anything Model (SAM) to utilize annotated reference images\nas prompts for segmentation, creating the VRP-SAM model. 
In essence, VRP-SAM\ncan utilize annotated reference images to comprehend specific objects and\nperform segmentation of specific objects in target image. It is note that the\nVRP encoder can support a variety of annotation formats for reference images,\nincluding \\textbf{point}, \\textbf{box}, \\textbf{scribble}, and \\textbf{mask}.\nVRP-SAM achieves a breakthrough within the SAM framework by extending its\nversatility and applicability while preserving SAM's inherent strengths, thus\nenhancing user-friendliness. To enhance the generalization ability of VRP-SAM,\nthe VRP encoder adopts a meta-learning strategy. To validate the effectiveness\nof VRP-SAM, we conducted extensive empirical studies on the Pascal and COCO\ndatasets. Remarkably, VRP-SAM achieved state-of-the-art performance in visual\nreference segmentation with minimal learnable parameters. Furthermore, VRP-SAM\ndemonstrates strong generalization capabilities, allowing it to perform\nsegmentation of unseen objects and enabling cross-domain segmentation. The\nsource code and models will be available at\n\\url{https://github.com/syp2ysy/VRP-SAM}\n","authors":["Yanpeng Sun","Jiahui Chen","Shan Zhang","Xinyu Zhang","Qiang Chen","Gang Zhang","Errui Ding","Jingdong Wang","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2402.17726v2.pdf","comment":"Accepted by CVPR 2024; The camera-ready version"},{"id":"http://arxiv.org/abs/2403.13972v2","updated":"2024-03-26T08:34:16Z","published":"2024-03-20T20:47:53Z","title":"SeFFeC: Semantic Facial Feature Control for Fine-grained Face Editing","summary":" We propose Semantic Facial Feature Control (SeFFeC) - a novel method for\nfine-grained face shape editing. Our method enables the manipulation of\nhuman-understandable, semantic face features, such as nose length or mouth\nwidth, which are defined by different groups of facial landmarks. In contrast\nto existing methods, the use of facial landmarks enables precise measurement of\nthe facial features, which then enables training SeFFeC without any manually\nannotated labels. SeFFeC consists of a transformer-based encoder network that\ntakes a latent vector of a pre-trained generative model and a facial feature\nembedding as input, and learns to modify the latent vector to perform the\ndesired face edit operation. To ensure that the desired feature measurement is\nchanged towards the target value without altering uncorrelated features, we\nintroduced a novel semantic face feature loss. Qualitative and quantitative\nresults show that SeFFeC enables precise and fine-grained control of 23 facial\nfeatures, some of which could not previously be controlled by other methods,\nwithout requiring manual annotations. Unlike existing methods, SeFFeC also\nprovides deterministic control over the exact values of the facial features and\nmore localised and disentangled face edits.\n","authors":["Florian Strohm","Mihai Bâce","Markus Kaltenecker","Andreas Bulling"],"pdf_url":"https://arxiv.org/pdf/2403.13972v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17477v1","updated":"2024-03-26T08:13:02Z","published":"2024-03-26T08:13:02Z","title":"DiffGaze: A Diffusion Model for Continuous Gaze Sequence Generation on\n 360° Images","summary":" We present DiffGaze, a novel method for generating realistic and diverse\ncontinuous human gaze sequences on 360{\\deg} images based on a conditional\nscore-based denoising diffusion model. 
Generating human gaze on 360{\\deg}\nimages is important for various human-computer interaction and computer\ngraphics applications, e.g. for creating large-scale eye tracking datasets or\nfor realistic animation of virtual humans. However, existing methods are\nlimited to predicting discrete fixation sequences or aggregated saliency maps,\nthereby neglecting crucial parts of natural gaze behaviour. Our method uses\nfeatures extracted from 360{\\deg} images as condition and uses two transformers\nto model the temporal and spatial dependencies of continuous human gaze. We\nevaluate DiffGaze on two 360{\\deg} image benchmarks for gaze sequence\ngeneration as well as scanpath prediction and saliency prediction. Our\nevaluations show that DiffGaze outperforms state-of-the-art methods on all\ntasks on both benchmarks. We also report a 21-participant user study showing\nthat our method generates gaze sequences that are indistinguishable from real\nhuman sequences.\n","authors":["Chuhan Jiao","Yao Wang","Guanhua Zhang","Mihai Bâce","Zhiming Hu","Andreas Bulling"],"pdf_url":"https://arxiv.org/pdf/2403.17477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.12036v3","updated":"2024-03-26T08:09:43Z","published":"2022-11-22T06:19:17Z","title":"Dual Prototype Attention for Unsupervised Video Object Segmentation","summary":" Unsupervised video object segmentation (VOS) aims to detect and segment the\nmost salient object in videos. The primary techniques used in unsupervised VOS\nare 1) the collaboration of appearance and motion information; and 2) temporal\nfusion between different frames. This paper proposes two novel prototype-based\nattention mechanisms, inter-modality attention (IMA) and inter-frame attention\n(IFA), to incorporate these techniques via dense propagation across different\nmodalities and frames. IMA densely integrates context information from\ndifferent modalities based on a mutual refinement. IFA injects global context\nof a video to the query frame, enabling a full utilization of useful properties\nfrom multiple frames. Experimental results on public benchmark datasets\ndemonstrate that our proposed approach outperforms all existing methods by a\nsubstantial margin. The proposed two components are also thoroughly validated\nvia ablative study.\n","authors":["Suhwan Cho","Minhyeok Lee","Seunghoon Lee","Dogyoon Lee","Heeseung Choi","Ig-Jae Kim","Sangyoun Lee"],"pdf_url":"https://arxiv.org/pdf/2211.12036v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2311.09974v2","updated":"2024-03-26T08:04:00Z","published":"2023-11-16T15:47:49Z","title":"From Pretext to Purpose: Batch-Adaptive Self-Supervised Learning","summary":" In recent years, self-supervised contrastive learning has emerged as a\ndistinguished paradigm in the artificial intelligence landscape. It facilitates\nunsupervised feature learning through contrastive delineations at the instance\nlevel. However, crafting an effective self-supervised paradigm remains a\npivotal challenge within this field. This paper delves into two crucial factors\nimpacting self-supervised contrastive learning: batch size and pretext tasks, and\nfrom a data processing standpoint, proposes an adaptive technique of batch\nfusion. The proposed method, via dimensionality reduction and reconstruction of\nbatch data, enables formerly isolated individual data to partake in intra-batch\ncommunication through the Embedding Layer. Moreover, it adaptively amplifies\nthe self-supervised feature encoding capability as the training progresses. 
We\nconducted a linear classification test of this method based on the classic\ncontrastive learning framework on ImageNet-1k. The empirical findings\nillustrate that our approach achieves state-of-the-art performance under\nequitable comparisons. Benefiting from its \"plug-and-play\" characteristics, we\nfurther explored other contrastive learning methods. On the ImageNet-100,\ncompared to the original performance, the top1 has seen a maximum increase of\n1.25%. We suggest that the proposed method may contribute to the advancement of\ndata-driven self-supervised learning research, bringing a fresh perspective to\nthis community.\n","authors":["Jiansong Zhang","Linlin Shen","Peizhong Liu"],"pdf_url":"https://arxiv.org/pdf/2311.09974v2.pdf","comment":"14 pages, 2 figures, the code of this paper will be released soon"},{"id":"http://arxiv.org/abs/2311.16926v4","updated":"2024-03-26T07:55:24Z","published":"2023-11-28T16:31:27Z","title":"LLaFS: When Large Language Models Meet Few-Shot Segmentation","summary":" This paper proposes LLaFS, the first attempt to leverage large language\nmodels (LLMs) in few-shot segmentation. In contrast to the conventional\nfew-shot segmentation methods that only rely on the limited and biased\ninformation from the annotated support images, LLaFS leverages the vast prior\nknowledge gained by LLM as an effective supplement and directly uses the LLM to\nsegment images in a few-shot manner. To enable the text-based LLM to handle\nimage-related tasks, we carefully design an input instruction that allows the\nLLM to produce segmentation results represented as polygons, and propose a\nregion-attribute table to simulate the human visual mechanism and provide\nmulti-modal guidance. We also synthesize pseudo samples and use curriculum\nlearning for pretraining to augment data and achieve better optimization. LLaFS\nachieves state-of-the-art results on multiple datasets, showing the potential\nof using LLMs for few-shot computer vision tasks.\n","authors":["Lanyun Zhu","Tianrun Chen","Deyi Ji","Jieping Ye","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16926v4.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2403.17465v1","updated":"2024-03-26T07:55:16Z","published":"2024-03-26T07:55:16Z","title":"LaRE^2: Latent Reconstruction Error Based Method for Diffusion-Generated\n Image Detection","summary":" The evolution of Diffusion Models has dramatically improved image generation\nquality, making it increasingly difficult to differentiate between real and\ngenerated images. This development, while impressive, also raises significant\nprivacy and security concerns. In response to this, we propose a novel Latent\nREconstruction error guided feature REfinement method (LaRE^2) for detecting\nthe diffusion-generated images. We come up with the Latent Reconstruction Error\n(LaRE), the first reconstruction-error based feature in the latent space for\ngenerated image detection. LaRE surpasses existing methods in terms of feature\nextraction efficiency while preserving crucial cues required to differentiate\nbetween the real and the fake. To exploit LaRE, we propose an Error-Guided\nfeature REfinement module (EGRE), which can refine the image feature guided by\nLaRE to enhance the discriminativeness of the feature. Our EGRE utilizes an\nalign-then-refine mechanism, which effectively refines the image feature for\ngenerated-image detection from both spatial and channel perspectives. 
Extensive\nexperiments on the large-scale GenImage benchmark demonstrate the superiority\nof our LaRE^2, which surpasses the best SoTA method by up to 11.9%/12.1%\naverage ACC/AP across 8 different image generators. LaRE also surpasses\nexisting methods in terms of feature extraction cost, delivering an impressive\nspeed enhancement of 8 times.\n","authors":["Yunpeng Luo","Junlong Du","Ke Yan","Shouhong Ding"],"pdf_url":"https://arxiv.org/pdf/2403.17465v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17460v1","updated":"2024-03-26T07:48:49Z","published":"2024-03-26T07:48:49Z","title":"Building Bridges across Spatial and Temporal Resolutions:\n Reference-Based Super-Resolution via Change Priors and Conditional Diffusion\n Model","summary":" Reference-based super-resolution (RefSR) has the potential to build bridges\nacross spatial and temporal resolutions of remote sensing images. However,\nexisting RefSR methods are limited by the faithfulness of content\nreconstruction and the effectiveness of texture transfer in large scaling\nfactors. Conditional diffusion models have opened up new opportunities for\ngenerating realistic high-resolution images, but effectively utilizing\nreference images within these models remains an area for further exploration.\nFurthermore, content fidelity is difficult to guarantee in areas without\nrelevant reference information. To solve these issues, we propose a\nchange-aware diffusion model named Ref-Diff for RefSR, using the land cover\nchange priors to guide the denoising process explicitly. Specifically, we\ninject the priors into the denoising model to improve the utilization of\nreference information in unchanged areas and regulate the reconstruction of\nsemantically relevant content in changed areas. With this powerful guidance, we\ndecouple the semantics-guided denoising and reference texture-guided denoising\nprocesses to improve the model performance. Extensive experiments demonstrate\nthe superior effectiveness and robustness of the proposed method compared with\nstate-of-the-art RefSR methods in both quantitative and qualitative\nevaluations. The code and data are available at\nhttps://github.com/dongrunmin/RefDiff.\n","authors":["Runmin Dong","Shuai Yuan","Bin Luo","Mengxuan Chen","Jinxiao Zhang","Lixian Zhang","Weijia Li","Juepeng Zheng","Haohuan Fu"],"pdf_url":"https://arxiv.org/pdf/2403.17460v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.14027v2","updated":"2024-03-26T07:47:20Z","published":"2024-03-20T22:52:34Z","title":"EcoSense: Energy-Efficient Intelligent Sensing for In-Shore Ship\n Detection through Edge-Cloud Collaboration","summary":" Detecting marine objects inshore presents challenges owing to algorithmic\nintricacies and complexities in system deployment. We propose a\ndifficulty-aware edge-cloud collaborative sensing system that splits the task\ninto object localization and fine-grained classification. Objects are\nclassified either at the edge or within the cloud, based on their estimated\ndifficulty. The framework comprises a low-power device-tailored front-end model\nfor object localization, classification, and difficulty estimation, along with\na transformer-graph convolutional network-based back-end model for fine-grained\nclassification. Our system demonstrates superior performance (mAP@0.5 +4.3%)\non widely used marine object detection datasets, significantly reducing both\ndata transmission volume (by 95.43%) and energy consumption (by 72.7%) at the\nsystem level. 
We validate the proposed system across various embedded system\nplatforms and in real-world scenarios involving drone deployment.\n","authors":["Wenjun Huang","Hanning Chen","Yang Ni","Arghavan Rezvani","Sanggeon Yun","Sungheon Jeon","Eric Pedley","Mohsen Imani"],"pdf_url":"https://arxiv.org/pdf/2403.14027v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.03180v5","updated":"2024-03-26T07:44:45Z","published":"2021-06-06T17:01:13Z","title":"Vision Transformers with Hierarchical Attention","summary":" This paper tackles the high computational/space complexity associated with\nMulti-Head Self-Attention (MHSA) in vanilla vision transformers. To this end,\nwe propose Hierarchical MHSA (H-MHSA), a novel approach that computes\nself-attention in a hierarchical fashion. Specifically, we first divide the\ninput image into patches as commonly done, and each patch is viewed as a token.\nThen, the proposed H-MHSA learns token relationships within local patches,\nserving as local relationship modeling. Then, the small patches are merged into\nlarger ones, and H-MHSA models the global dependencies for the small number of\nthe merged tokens. At last, the local and global attentive features are\naggregated to obtain features with powerful representation capacity. Since we\nonly calculate attention for a limited number of tokens at each step, the\ncomputational load is reduced dramatically. Hence, H-MHSA can efficiently model\nglobal relationships among tokens without sacrificing fine-grained information.\nWith the H-MHSA module incorporated, we build a family of\nHierarchical-Attention-based Transformer Networks, namely HAT-Net. To\ndemonstrate the superiority of HAT-Net in scene understanding, we conduct\nextensive experiments on fundamental vision tasks, including image\nclassification, semantic segmentation, object detection, and instance\nsegmentation. Therefore, HAT-Net provides a new perspective for vision\ntransformers. Code and pretrained models are available at\nhttps://github.com/yun-liu/HAT-Net.\n","authors":["Yun Liu","Yu-Huan Wu","Guolei Sun","Le Zhang","Ajad Chhatkuli","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2106.03180v5.pdf","comment":"Machine Intelligence Research (MIR), DOI: 10.1007/s11633-024-1393-8"},{"id":"http://arxiv.org/abs/2308.07728v5","updated":"2024-03-26T07:43:08Z","published":"2023-08-15T12:08:43Z","title":"Domain-Aware Fine-Tuning: Enhancing Neural Network Adaptability","summary":" Fine-tuning pre-trained neural network models has become a widely adopted\napproach across various domains. However, it can lead to the distortion of\npre-trained feature extractors that already possess strong generalization\ncapabilities. Mitigating feature distortion during adaptation to new target\ndomains is crucial. Recent studies have shown promising results in handling\nfeature distortion by aligning the head layer on in-distribution datasets\nbefore performing fine-tuning. Nonetheless, a significant limitation arises\nfrom the treatment of batch normalization layers during fine-tuning, leading to\nsuboptimal performance. In this paper, we propose Domain-Aware Fine-Tuning\n(DAFT), a novel approach that incorporates batch normalization conversion and\nthe integration of linear probing and fine-tuning. Our batch normalization\nconversion method effectively mitigates feature distortion by reducing\nmodifications to the neural network during fine-tuning. 
Additionally, we\nintroduce the integration of linear probing and fine-tuning to optimize the\nhead layer with gradual adaptation of the feature extractor. By leveraging\nbatch normalization layers and integrating linear probing and fine-tuning, our\nDAFT significantly mitigates feature distortion and achieves improved model\nperformance on both in-distribution and out-of-distribution datasets. Extensive\nexperiments demonstrate that our method outperforms other baseline methods,\ndemonstrating its effectiveness in not only improving performance but also\nmitigating feature distortion.\n","authors":["Seokhyeon Ha","Sunbeom Jung","Jungwoo Lee"],"pdf_url":"https://arxiv.org/pdf/2308.07728v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17447v1","updated":"2024-03-26T07:26:00Z","published":"2024-03-26T07:26:00Z","title":"Chain of Compression: A Systematic Approach to Combinationally Compress\n Convolutional Neural Networks","summary":" Convolutional neural networks (CNNs) have achieved significant popularity,\nbut their computational and memory intensity poses challenges for\nresource-constrained computing systems, particularly with the prerequisite of\nreal-time performance. To release this burden, model compression has become an\nimportant research focus. Many approaches like quantization, pruning, early\nexit, and knowledge distillation have demonstrated the effect of reducing\nredundancy in neural networks. Upon closer examination, it becomes apparent\nthat each approach capitalizes on its unique features to compress the neural\nnetwork, and they can also exhibit complementary behavior when combined. To\nexplore the interactions and reap the benefits from the complementary features,\nwe propose the Chain of Compression, which works on the combinational sequence\nto apply these common techniques to compress the neural network. Validated on\nthe image-based regression and classification networks across different data\nsets, our proposed Chain of Compression can significantly compress the\ncomputation cost by 100-1000 times with ignorable accuracy loss compared with\nthe baseline model.\n","authors":["Yingtao Shen","Minqing Sun","Jie Zhao","An Zou"],"pdf_url":"https://arxiv.org/pdf/2403.17447v1.pdf","comment":"10 pages, 15 figures"},{"id":"http://arxiv.org/abs/2306.07632v3","updated":"2024-03-26T07:00:27Z","published":"2023-06-13T09:02:57Z","title":"NeuS-PIR: Learning Relightable Neural Surface using Pre-Integrated\n Rendering","summary":" This paper presents a method, namely NeuS-PIR, for recovering relightable\nneural surfaces using pre-integrated rendering from multi-view images or video.\nUnlike methods based on NeRF and discrete meshes, our method utilizes implicit\nneural surface representation to reconstruct high-quality geometry, which\nfacilitates the factorization of the radiance field into two components: a\nspatially varying material field and an all-frequency lighting representation.\nThis factorization, jointly optimized using an adapted differentiable\npre-integrated rendering framework with material encoding regularization, in\nturn addresses the ambiguity of geometry reconstruction and leads to better\ndisentanglement and refinement of each scene property. Additionally, we\nintroduced a method to distil indirect illumination fields from the learned\nrepresentations, further recovering the complex illumination effect like\ninter-reflection. 
Consequently, our method enables advanced applications such\nas relighting, which can be seamlessly integrated with modern graphics engines.\nQualitative and quantitative experiments have shown that NeuS-PIR outperforms\nexisting methods across various tasks on both synthetic and real datasets.\nSource code is available at https://github.com/Sheldonmao/NeuSPIR\n","authors":["Shi Mao","Chenming Wu","Zhelun Shen","Yifan Wang","Dayan Wu","Liangjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.07632v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17432v1","updated":"2024-03-26T06:57:50Z","published":"2024-03-26T06:57:50Z","title":"Integrating Mamba Sequence Model and Hierarchical Upsampling Network for\n Accurate Semantic Segmentation of Multiple Sclerosis Legion","summary":" Integrating components from convolutional neural networks and state space\nmodels in medical image segmentation presents a compelling approach to enhance\naccuracy and efficiency. We introduce Mamba HUNet, a novel architecture\ntailored for robust and efficient segmentation tasks. Leveraging strengths from\nMamba UNet and the lighter version of Hierarchical Upsampling Network (HUNet),\nMamba HUNet combines convolutional neural networks local feature extraction\npower with state space models long range dependency modeling capabilities. We\nfirst converted HUNet into a lighter version, maintaining performance parity\nand then integrated this lighter HUNet into Mamba HUNet, further enhancing its\nefficiency. The architecture partitions input grayscale images into patches,\ntransforming them into 1D sequences for processing efficiency akin to Vision\nTransformers and Mamba models. Through Visual State Space blocks and patch\nmerging layers, hierarchical features are extracted while preserving spatial\ninformation. Experimental results on publicly available Magnetic Resonance\nImaging scans, notably in Multiple Sclerosis lesion segmentation, demonstrate\nMamba HUNet's effectiveness across diverse segmentation tasks. The model's\nrobustness and flexibility underscore its potential in handling complex\nanatomical structures. These findings establish Mamba HUNet as a promising\nsolution in advancing medical image segmentation, with implications for\nimproving clinical decision making processes.\n","authors":["Kazi Shahriar Sanjid","Md. Tanzim Hossain","Md. Shakib Shahariar Junayed","Dr. Mohammad Monir Uddin"],"pdf_url":"https://arxiv.org/pdf/2403.17432v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2403.17423v1","updated":"2024-03-26T06:40:03Z","published":"2024-03-26T06:40:03Z","title":"Test-time Adaptation Meets Image Enhancement: Improving Accuracy via\n Uncertainty-aware Logit Switching","summary":" Deep neural networks have achieved remarkable success in a variety of\ncomputer vision applications. However, there is a problem of degrading accuracy\nwhen the data distribution shifts between training and testing. As a solution\nof this problem, Test-time Adaptation~(TTA) has been well studied because of\nits practicality. Although TTA methods increase accuracy under distribution\nshift by updating the model at test time, using high-uncertainty predictions is\nknown to degrade accuracy. Since the input image is the root of the\ndistribution shift, we incorporate a new perspective on enhancing the input\nimage into TTA methods to reduce the prediction's uncertainty. We hypothesize\nthat enhancing the input image reduces prediction's uncertainty and increase\nthe accuracy of TTA methods. 
On the basis of our hypothesis, we propose a novel\nmethod: Test-time Enhancer and Classifier Adaptation~(TECA). In TECA, the\nclassification model is combined with the image enhancement model that\ntransforms input images into recognition-friendly ones, and these models are\nupdated by existing TTA methods. Furthermore, we found that the prediction from\nthe enhanced image does not always have lower uncertainty than the prediction\nfrom the original image. Thus, we propose logit switching, which compares the\nuncertainty measure of these predictions and outputs the lower one. In our\nexperiments, we evaluate TECA with various TTA methods and show that TECA\nreduces prediction's uncertainty and increases accuracy of TTA methods despite\nhaving no hyperparameters and little parameter overhead.\n","authors":["Shohei Enomoto","Naoya Hasegawa","Kazuki Adachi","Taku Sasaki","Shin'ya Yamaguchi","Satoshi Suzuki","Takeharu Eda"],"pdf_url":"https://arxiv.org/pdf/2403.17423v1.pdf","comment":"Accepted to IJCNN2024"},{"id":"http://arxiv.org/abs/2403.16169v2","updated":"2024-03-26T06:39:30Z","published":"2024-03-24T14:24:13Z","title":"Gaze-guided Hand-Object Interaction Synthesis: Benchmark and Method","summary":" Gaze plays a crucial role in revealing human attention and intention,\nshedding light on the cognitive processes behind human actions. The integration\nof gaze guidance with the dynamics of hand-object interactions boosts the\naccuracy of human motion prediction. However, the lack of datasets that capture\nthe intricate relationship and consistency among gaze, hand, and object\nmovements remains a substantial hurdle. In this paper, we introduce the first\nGaze-guided Hand-Object Interaction dataset, GazeHOI, and present a novel task\nfor synthesizing gaze-guided hand-object interactions. Our dataset, GazeHOI,\nfeatures simultaneous 3D modeling of gaze, hand, and object interactions,\ncomprising 479 sequences with an average duration of 19.1 seconds, 812\nsub-sequences, and 33 objects of various sizes. We propose a hierarchical\nframework centered on a gaze-guided hand-object interaction diffusion model,\nnamed GHO-Diffusion. In the pre-diffusion phase, we separate gaze conditions\ninto spatial-temporal features and goal pose conditions at different levels of\ninformation granularity. During the diffusion phase, two gaze-conditioned\ndiffusion models are stacked to simplify the complex synthesis of hand-object\nmotions. Here, the object motion diffusion model generates sequences of object\nmotions based on gaze conditions, while the hand motion diffusion model\nproduces hand motions based on the generated object motion. To improve\nfine-grained goal pose alignment, we introduce a Spherical Gaussian constraint\nto guide the denoising step. In the subsequent post-diffusion phase, we\noptimize the generated hand motions using contact consistency. Our extensive\nexperiments highlight the uniqueness of our dataset and the effectiveness of\nour approach.\n","authors":["Jie Tian","Lingxiao Yang","Ran Ji","Yuexin Ma","Lan Xu","Jingyi Yu","Ye Shi","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16169v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17422v1","updated":"2024-03-26T06:35:55Z","published":"2024-03-26T06:35:55Z","title":"InterHandGen: Two-Hand Interaction Generation via Cascaded Reverse\n Diffusion","summary":" We present InterHandGen, a novel framework that learns the generative prior\nof two-hand interaction. 
Sampling from our model yields plausible and diverse\ntwo-hand shapes in close interaction with or without an object. Our prior can\nbe incorporated into any optimization or learning methods to reduce ambiguity\nin an ill-posed setup. Our key observation is that directly modeling the joint\ndistribution of multiple instances imposes high learning complexity due to its\ncombinatorial nature. Thus, we propose to decompose the modeling of joint\ndistribution into the modeling of factored unconditional and conditional single\ninstance distribution. In particular, we introduce a diffusion model that\nlearns the single-hand distribution unconditional and conditional to another\nhand via conditioning dropout. For sampling, we combine anti-penetration and\nclassifier-free guidance to enable plausible generation. Furthermore, we\nestablish the rigorous evaluation protocol of two-hand synthesis, where our\nmethod significantly outperforms baseline generative models in terms of\nplausibility and diversity. We also demonstrate that our diffusion prior can\nboost the performance of two-hand reconstruction from monocular in-the-wild\nimages, achieving new state-of-the-art accuracy.\n","authors":["Jihyun Lee","Shunsuke Saito","Giljoo Nam","Minhyuk Sung","Tae-Kyun Kim"],"pdf_url":"https://arxiv.org/pdf/2403.17422v1.pdf","comment":"Accepted to CVPR 2024, project page:\n https://jyunlee.github.io/projects/interhandgen/"},{"id":"http://arxiv.org/abs/2403.17420v1","updated":"2024-03-26T06:27:50Z","published":"2024-03-26T06:27:50Z","title":"Learning to Visually Localize Sound Sources from Mixtures without Prior\n Source Knowledge","summary":" The goal of the multi-sound source localization task is to localize sound\nsources from the mixture individually. While recent multi-sound source\nlocalization methods have shown improved performance, they face challenges due\nto their reliance on prior information about the number of objects to be\nseparated. In this paper, to overcome this limitation, we present a novel\nmulti-sound source localization method that can perform localization without\nprior knowledge of the number of sound sources. To achieve this goal, we\npropose an iterative object identification (IOI) module, which can recognize\nsound-making objects in an iterative manner. After finding the regions of\nsound-making objects, we devise object similarity-aware clustering (OSC) loss\nto guide the IOI module to effectively combine regions of the same object but\nalso distinguish between different objects and backgrounds. It enables our\nmethod to perform accurate localization of sound-making objects without any\nprior knowledge. Extensive experimental results on the MUSIC and VGGSound\nbenchmarks show the significant performance improvements of the proposed method\nover the existing methods for both single and multi-source. Our code is\navailable at: https://github.com/VisualAIKHU/NoPrior_MultiSSL\n","authors":["Dongjin Kim","Sung Jin Um","Sangmin Lee","Jung Uk Kim"],"pdf_url":"https://arxiv.org/pdf/2403.17420v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2312.09551v2","updated":"2024-03-26T06:05:17Z","published":"2023-12-15T06:04:42Z","title":"Learning-based Axial Video Motion Magnification","summary":" Video motion magnification amplifies invisible small motions to be\nperceptible, which provides humans with a spatially dense and holistic\nunderstanding of small motions in the scene of interest. 
This is based on the\npremise that magnifying small motions enhances the legibility of motions. In\nthe real world, however, vibrating objects often possess convoluted systems\nthat have complex natural frequencies, modes, and directions. Existing motion\nmagnification often fails to improve legibility since the intricate motions\nstill retain complex characteristics even after being magnified, which may\ndistract us from analyzing them. In this work, we focus on improving legibility\nby proposing a new concept, axial motion magnification, which magnifies\ndecomposed motions along the user-specified direction. Axial motion\nmagnification can be applied to various applications where motions of specific\naxes are critical, by providing simplified and easily readable motion\ninformation. To achieve this, we propose a novel Motion Separation Module that\nenables to disentangle and magnify the motion representation along axes of\ninterest. Furthermore, we build a new synthetic training dataset for the axial\nmotion magnification task. Our proposed method improves the legibility of\nresulting motions along certain axes by adding a new feature: user\ncontrollability. Axial motion magnification is a more generalized concept;\nthus, our method can be directly adapted to the generic motion magnification\nand achieves favorable performance against competing methods.\n","authors":["Kwon Byung-Ki","Oh Hyun-Bin","Kim Jun-Seong","Hyunwoo Ha","Tae-Hyun Oh"],"pdf_url":"https://arxiv.org/pdf/2312.09551v2.pdf","comment":"main paper: 12 pages, supplementary: 10 pages, 20 figures, 1 table"},{"id":"http://arxiv.org/abs/2403.17409v1","updated":"2024-03-26T06:04:50Z","published":"2024-03-26T06:04:50Z","title":"Neural Clustering based Visual Representation Learning","summary":" We investigate a fundamental aspect of machine vision: the measurement of\nfeatures, by revisiting clustering, one of the most classic approaches in\nmachine learning and data analysis. Existing visual feature extractors,\nincluding ConvNets, ViTs, and MLPs, represent an image as rectangular regions.\nThough prevalent, such a grid-style paradigm is built upon engineering practice\nand lacks explicit modeling of data distribution. In this work, we propose\nfeature extraction with clustering (FEC), a conceptually elegant yet\nsurprisingly ad-hoc interpretable neural clustering framework, which views\nfeature extraction as a process of selecting representatives from data and thus\nautomatically captures the underlying data distribution. Given an image, FEC\nalternates between grouping pixels into individual clusters to abstract\nrepresentatives and updating the deep features of pixels with current\nrepresentatives. Such an iterative working mechanism is implemented in the form\nof several neural layers and the final representatives can be used for\ndownstream tasks. The cluster assignments across layers, which can be viewed\nand inspected by humans, make the forward process of FEC fully transparent and\nempower it with promising ad-hoc interpretability. Extensive experiments on\nvarious visual recognition models and tasks verify the effectiveness,\ngenerality, and interpretability of FEC. We expect this work will provoke a\nrethink of the current de facto grid-style paradigm.\n","authors":["Guikun Chen","Xia Li","Yi Yang","Wenguan Wang"],"pdf_url":"https://arxiv.org/pdf/2403.17409v1.pdf","comment":"CVPR 2024. 
Code: https://github.com/guikunchen/FEC/"},{"id":"http://arxiv.org/abs/2403.17390v1","updated":"2024-03-26T05:19:15Z","published":"2024-03-26T05:19:15Z","title":"SSF3D: Strict Semi-Supervised 3D Object Detection with Switching Filter","summary":" SSF3D modified the semi-supervised 3D object detection (SS3DOD) framework,\nwhich designed specifically for point cloud data. Leveraging the\ncharacteristics of non-coincidence and weak correlation of target objects in\npoint cloud, we adopt a strategy of retaining only the truth-determining pseudo\nlabels and trimming the other fuzzy labels with points, instead of pursuing a\nbalance between the quantity and quality of pseudo labels. Besides, we notice\nthat changing the filter will make the model meet different distributed\ntargets, which is beneficial to break the training bottleneck. Two mechanism\nare introduced to achieve above ideas: strict threshold and filter switching.\nThe experiments are conducted to analyze the effectiveness of above approaches\nand their impact on the overall performance of the system. Evaluating on the\nKITTI dataset, SSF3D exhibits superior performance compared to the current\nstate-of-the-art methods. The code will be released here.\n","authors":["Songbur Wong"],"pdf_url":"https://arxiv.org/pdf/2403.17390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17387v1","updated":"2024-03-26T05:12:18Z","published":"2024-03-26T05:12:18Z","title":"Decoupled Pseudo-labeling for Semi-Supervised Monocular 3D Object\n Detection","summary":" We delve into pseudo-labeling for semi-supervised monocular 3D object\ndetection (SSM3OD) and discover two primary issues: a misalignment between the\nprediction quality of 3D and 2D attributes and the tendency of depth\nsupervision derived from pseudo-labels to be noisy, leading to significant\noptimization conflicts with other reliable forms of supervision. We introduce a\nnovel decoupled pseudo-labeling (DPL) approach for SSM3OD. Our approach\nfeatures a Decoupled Pseudo-label Generation (DPG) module, designed to\nefficiently generate pseudo-labels by separately processing 2D and 3D\nattributes. This module incorporates a unique homography-based method for\nidentifying dependable pseudo-labels in BEV space, specifically for 3D\nattributes. Additionally, we present a DepthGradient Projection (DGP) module to\nmitigate optimization conflicts caused by noisy depth supervision of\npseudo-labels, effectively decoupling the depth gradient and removing\nconflicting gradients. This dual decoupling strategy-at both the pseudo-label\ngeneration and gradient levels-significantly improves the utilization of\npseudo-labels in SSM3OD. Our comprehensive experiments on the KITTI benchmark\ndemonstrate the superiority of our method over existing approaches.\n","authors":["Jiacheng Zhang","Jiaming Li","Xiangru Lin","Wei Zhang","Xiao Tan","Junyu Han","Errui Ding","Jingdong Wang","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2403.17387v1.pdf","comment":"To appear in CVPR2024"},{"id":"http://arxiv.org/abs/2403.17377v1","updated":"2024-03-26T04:49:11Z","published":"2024-03-26T04:49:11Z","title":"Self-Rectifying Diffusion Sampling with Perturbed-Attention Guidance","summary":" Recent studies have demonstrated that diffusion models are capable of\ngenerating high-quality samples, but their quality heavily depends on sampling\nguidance techniques, such as classifier guidance (CG) and classifier-free\nguidance (CFG). 
These techniques are often not applicable in unconditional\ngeneration or in various downstream tasks such as image restoration. In this\npaper, we propose a novel sampling guidance, called Perturbed-Attention\nGuidance (PAG), which improves diffusion sample quality across both\nunconditional and conditional settings, achieving this without requiring\nadditional training or the integration of external modules. PAG is designed to\nprogressively enhance the structure of samples throughout the denoising\nprocess. It involves generating intermediate samples with degraded structure by\nsubstituting selected self-attention maps in diffusion U-Net with an identity\nmatrix, by considering the self-attention mechanisms' ability to capture\nstructural information, and guiding the denoising process away from these\ndegraded samples. In both ADM and Stable Diffusion, PAG surprisingly improves\nsample quality in conditional and even unconditional scenarios. Moreover, PAG\nsignificantly improves the baseline performance in various downstream tasks\nwhere existing guidances such as CG or CFG cannot be fully utilized, including\nControlNet with empty prompts and image restoration such as inpainting and\ndeblurring.\n","authors":["Donghoon Ahn","Hyoungwon Cho","Jaewon Min","Wooseok Jang","Jungwoo Kim","SeonHwa Kim","Hyun Hee Park","Kyong Hwan Jin","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2403.17377v1.pdf","comment":"Project page is available at\n https://ku-cvlab.github.io/Perturbed-Attention-Guidance"},{"id":"http://arxiv.org/abs/2403.17373v1","updated":"2024-03-26T04:27:56Z","published":"2024-03-26T04:27:56Z","title":"AIDE: An Automatic Data Engine for Object Detection in Autonomous\n Driving","summary":" Autonomous vehicle (AV) systems rely on robust perception models as a\ncornerstone of safety assurance. However, objects encountered on the road\nexhibit a long-tailed distribution, with rare or unseen categories posing\nchallenges to a deployed perception model. This necessitates an expensive\nprocess of continuously curating and annotating data with significant human\neffort. We propose to leverage recent advances in vision-language and large\nlanguage models to design an Automatic Data Engine (AIDE) that automatically\nidentifies issues, efficiently curates data, improves the model through\nauto-labeling, and verifies the model through generation of diverse scenarios.\nThis process operates iteratively, allowing for continuous self-improvement of\nthe model. We further establish a benchmark for open-world detection on AV\ndatasets to comprehensively evaluate various learning paradigms, demonstrating\nour method's superior performance at a reduced cost.\n","authors":["Mingfu Liang","Jong-Chyi Su","Samuel Schulter","Sparsh Garg","Shiyu Zhao","Ying Wu","Manmohan Chandraker"],"pdf_url":"https://arxiv.org/pdf/2403.17373v1.pdf","comment":"Accepted by CVPR-2024"},{"id":"http://arxiv.org/abs/2403.07636v2","updated":"2024-03-26T04:26:21Z","published":"2024-03-12T13:18:22Z","title":"Decomposing Disease Descriptions for Enhanced Pathology Detection: A\n Multi-Aspect Vision-Language Pre-training Framework","summary":" Medical vision language pre-training (VLP) has emerged as a frontier of\nresearch, enabling zero-shot pathological recognition by comparing the query\nimage with the textual descriptions for each disease. Due to the complex\nsemantics of biomedical texts, current methods struggle to align medical images\nwith key pathological findings in unstructured reports. 
This leads to the\nmisalignment with the target disease's textual representation. In this paper,\nwe introduce a novel VLP framework designed to dissect disease descriptions\ninto their fundamental aspects, leveraging prior knowledge about the visual\nmanifestations of pathologies. This is achieved by consulting a large language\nmodel and medical experts. Integrating a Transformer module, our approach\naligns an input image with the diverse elements of a disease, generating\naspect-centric image representations. By consolidating the matches from each\naspect, we improve the compatibility between an image and its associated\ndisease. Additionally, capitalizing on the aspect-oriented representations, we\npresent a dual-head Transformer tailored to process known and unknown diseases,\noptimizing the comprehensive detection efficacy. Conducting experiments on\nseven downstream datasets, ours improves the accuracy of recent methods by up\nto 8.56% and 17.0% for seen and unseen categories, respectively. Our code is\nreleased at https://github.com/HieuPhan33/MAVL.\n","authors":["Vu Minh Hieu Phan","Yutong Xie","Yuankai Qi","Lingqiao Liu","Liyang Liu","Bowen Zhang","Zhibin Liao","Qi Wu","Minh-Son To","Johan W. Verjans"],"pdf_url":"https://arxiv.org/pdf/2403.07636v2.pdf","comment":"Accepted at CVPR2024. Pre-print before final camera-ready version"},{"id":"http://arxiv.org/abs/2403.10518v2","updated":"2024-03-26T04:24:13Z","published":"2024-03-15T17:59:33Z","title":"Lodge: A Coarse to Fine Diffusion Network for Long Dance Generation\n Guided by the Characteristic Dance Primitives","summary":" We propose Lodge, a network capable of generating extremely long dance\nsequences conditioned on given music. We design Lodge as a two-stage coarse to\nfine diffusion architecture, and propose the characteristic dance primitives\nthat possess significant expressiveness as intermediate representations between\ntwo diffusion models. The first stage is global diffusion, which focuses on\ncomprehending the coarse-level music-dance correlation and production\ncharacteristic dance primitives. In contrast, the second-stage is the local\ndiffusion, which parallelly generates detailed motion sequences under the\nguidance of the dance primitives and choreographic rules. In addition, we\npropose a Foot Refine Block to optimize the contact between the feet and the\nground, enhancing the physical realism of the motion. Our approach can\nparallelly generate dance sequences of extremely long length, striking a\nbalance between global choreographic patterns and local motion quality and\nexpressiveness. Extensive experiments validate the efficacy of our method.\n","authors":["Ronghui Li","YuXiang Zhang","Yachao Zhang","Hongwen Zhang","Jie Guo","Yan Zhang","Yebin Liu","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2403.10518v2.pdf","comment":"Accepted by CVPR2024, Project page:\n https://li-ronghui.github.io/lodge"},{"id":"http://arxiv.org/abs/2403.16209v2","updated":"2024-03-26T04:22:02Z","published":"2024-03-24T16:08:10Z","title":"Image Captioning in news report scenario","summary":" Image captioning strives to generate pertinent captions for specified images,\nsituating itself at the crossroads of Computer Vision (CV) and Natural Language\nProcessing (NLP). 
This endeavor is of paramount importance with far-reaching\napplications in recommendation systems, news outlets, social media, and beyond.\nParticularly within the realm of news reporting, captions are expected to\nencompass detailed information, such as the identities of celebrities captured\nin the images. However, much of the existing body of work primarily centers\naround understanding scenes and actions. In this paper, we explore the realm of\nimage captioning specifically tailored for celebrity photographs, illustrating\nits broad potential for enhancing news industry practices. This exploration\naims to augment automated news content generation, thereby facilitating a more\nnuanced dissemination of information. Our endeavor shows a broader horizon,\nenriching the narrative in news reporting through a more intuitive image\ncaptioning framework.\n","authors":["Tianrui Liu","Qi Cai","Changxin Xu","Bo Hong","Jize Xiong","Yuxin Qiao","Tsungwei Yang"],"pdf_url":"https://arxiv.org/pdf/2403.16209v2.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.12342v3","updated":"2024-03-26T04:17:42Z","published":"2023-11-21T04:28:12Z","title":"LoCo: Locally Constrained Training-Free Layout-to-Image Synthesis","summary":" Recent text-to-image diffusion models have reached an unprecedented level in\ngenerating high-quality images. However, their exclusive reliance on textual\nprompts often falls short in precise control of image compositions. In this\npaper, we propose LoCo, a training-free approach for layout-to-image Synthesis\nthat excels in producing high-quality images aligned with both textual prompts\nand layout instructions. Specifically, we introduce a Localized Attention\nConstraint (LAC), leveraging semantic affinity between pixels in self-attention\nmaps to create precise representations of desired objects and effectively\nensure the accurate placement of objects in designated regions. We further\npropose a Padding Token Constraint (PTC) to leverage the semantic information\nembedded in previously neglected padding tokens, improving the consistency\nbetween object appearance and layout instructions. LoCo seamlessly integrates\ninto existing text-to-image and layout-to-image models, enhancing their\nperformance in spatial control and addressing semantic failures observed in\nprior methods. Extensive experiments showcase the superiority of our approach,\nsurpassing existing state-of-the-art training-free layout-to-image methods both\nqualitatively and quantitatively across multiple benchmarks.\n","authors":["Peiang Zhao","Han Li","Ruiyang Jin","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.12342v3.pdf","comment":"Demo: https://huggingface.co/spaces/Pusheen/LoCo; Project page:\n https://momopusheen.github.io/LoCo/"},{"id":"http://arxiv.org/abs/2309.07322v2","updated":"2024-03-26T04:17:40Z","published":"2023-09-13T21:21:50Z","title":"$\\texttt{NePhi}$: Neural Deformation Fields for Approximately\n Diffeomorphic Medical Image Registration","summary":" This work proposes NePhi, a generalizable neural deformation model which\nresults in approximately diffeomorphic transformations. In contrast to the\npredominant voxel-based transformation fields used in learning-based\nregistration approaches, NePhi represents deformations functionally, leading to\ngreat flexibility within the design space of memory consumption during training\nand inference, inference time, registration accuracy, as well as transformation\nregularity. 
Specifically, NePhi 1) requires less memory compared to voxel-based\nlearning approaches, 2) improves inference speed by predicting latent codes,\ncompared to current existing neural deformation based registration approaches\nthat \\emph{only} rely on optimization, 3) improves accuracy via instance\noptimization, and 4) shows excellent deformation regularity which is highly\ndesirable for medical image registration. We demonstrate the performance of\nNePhi on a 2D synthetic dataset as well as for real 3D lung registration. Our\nresults show that NePhi can match the accuracy of voxel-based representations\nin a single-resolution registration setting. For multi-resolution registration,\nour method matches the accuracy of current SOTA learning-based registration\napproaches with instance optimization while reducing memory requirements by a\nfactor of five.\n","authors":["Lin Tian","Hastings Greer","Raúl San José Estépar","Soumyadip Sengupta","Marc Niethammer"],"pdf_url":"https://arxiv.org/pdf/2309.07322v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19330v2","updated":"2024-03-26T04:15:53Z","published":"2024-02-29T16:33:12Z","title":"A Novel Approach to Industrial Defect Generation through Blended Latent\n Diffusion Model with Online Adaptation","summary":" Effectively addressing the challenge of industrial Anomaly Detection (AD)\nnecessitates an ample supply of defective samples, a constraint often hindered\nby their scarcity in industrial contexts. This paper introduces a novel\nalgorithm designed to augment defective samples, thereby enhancing AD\nperformance. The proposed method tailors the blended latent diffusion model for\ndefect sample generation, employing a diffusion model to generate defective\nsamples in the latent space. A feature editing process, controlled by a\n``trimap\" mask and text prompts, refines the generated samples. The image\ngeneration inference process is structured into three stages: a free diffusion\nstage, an editing diffusion stage, and an online decoder adaptation stage. This\nsophisticated inference strategy yields high-quality synthetic defective\nsamples with diverse pattern variations, leading to significantly improved AD\naccuracies based on the augmented training set. Specifically, on the widely\nrecognized MVTec AD dataset, the proposed method elevates the state-of-the-art\n(SOTA) performance of AD with augmented data by 1.5%, 1.9%, and 3.1% for AD\nmetrics AP, IAP, and IAP90, respectively. The implementation code of this work\ncan be found at the GitHub repository\nhttps://github.com/GrandpaXun242/AdaBLDM.git\n","authors":["Hanxi Li","Zhengxun Zhang","Hao Chen","Lin Wu","Bo Li","Deyin Liu","Mingwen Wang"],"pdf_url":"https://arxiv.org/pdf/2402.19330v2.pdf","comment":"13 pages,7 figures"},{"id":"http://arxiv.org/abs/2403.15931v2","updated":"2024-03-26T04:15:02Z","published":"2024-03-23T20:30:28Z","title":"X-Portrait: Expressive Portrait Animation with Hierarchical Motion\n Attention","summary":" We propose X-Portrait, an innovative conditional diffusion model tailored for\ngenerating expressive and temporally coherent portrait animation. Specifically,\ngiven a single portrait as appearance reference, we aim to animate it with\nmotion derived from a driving video, capturing both highly dynamic and subtle\nfacial expressions along with wide-range head movements. 
As its core, we\nleverage the generative prior of a pre-trained diffusion model as the rendering\nbackbone, while achieve fine-grained head pose and expression control with\nnovel controlling signals within the framework of ControlNet. In contrast to\nconventional coarse explicit controls such as facial landmarks, our motion\ncontrol module is learned to interpret the dynamics directly from the original\ndriving RGB inputs. The motion accuracy is further enhanced with a patch-based\nlocal control module that effectively enhance the motion attention to\nsmall-scale nuances like eyeball positions. Notably, to mitigate the identity\nleakage from the driving signals, we train our motion control modules with\nscaling-augmented cross-identity images, ensuring maximized disentanglement\nfrom the appearance reference modules. Experimental results demonstrate the\nuniversal effectiveness of X-Portrait across a diverse range of facial\nportraits and expressive driving sequences, and showcase its proficiency in\ngenerating captivating portrait animations with consistently maintained\nidentity characteristics.\n","authors":["You Xie","Hongyi Xu","Guoxian Song","Chao Wang","Yichun Shi","Linjie Luo"],"pdf_url":"https://arxiv.org/pdf/2403.15931v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17369v1","updated":"2024-03-26T04:09:08Z","published":"2024-03-26T04:09:08Z","title":"CoDA: Instructive Chain-of-Domain Adaptation with Severity-Aware Visual\n Prompt Tuning","summary":" Unsupervised Domain Adaptation (UDA) aims to adapt models from labeled source\ndomains to unlabeled target domains. When adapting to adverse scenes, existing\nUDA methods fail to perform well due to the lack of instructions, leading their\nmodels to overlook discrepancies within all adverse scenes. To tackle this, we\npropose CoDA which instructs models to distinguish, focus, and learn from these\ndiscrepancies at scene and image levels. Specifically, CoDA consists of a\nChain-of-Domain (CoD) strategy and a Severity-Aware Visual Prompt Tuning\n(SAVPT) mechanism. CoD focuses on scene-level instructions to divide all\nadverse scenes into easy and hard scenes, guiding models to adapt from source\nto easy domains with easy scene images, and then to hard domains with hard\nscene images, thereby laying a solid foundation for whole adaptations. Building\nupon this foundation, we employ SAVPT to dive into more detailed image-level\ninstructions to boost performance. SAVPT features a novel metric Severity that\ndivides all adverse scene images into low-severity and high-severity images.\nThen Severity directs visual prompts and adapters, instructing models to\nconcentrate on unified severity features instead of scene-specific features,\nwithout adding complexity to the model architecture. CoDA achieves SOTA\nperformances on widely-used benchmarks under all adverse scenes. Notably, CoDA\noutperforms the existing ones by 4.6%, and 10.3% mIoU on the Foggy Driving, and\nFoggy Zurich benchmarks, respectively. 
Our code is available at\nhttps://github.com/Cuzyoung/CoDA\n","authors":["Ziyang Gong","Fuhao Li","Yupeng Deng","Deblina Bhattacharjee","Xiangwei Zhu","Zhenming Ji"],"pdf_url":"https://arxiv.org/pdf/2403.17369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16536v2","updated":"2024-03-26T03:56:34Z","published":"2024-03-25T08:26:42Z","title":"VMRNN: Integrating Vision Mamba and LSTM for Efficient and Accurate\n Spatiotemporal Forecasting","summary":" Combining CNNs or ViTs, with RNNs for spatiotemporal forecasting, has yielded\nunparalleled results in predicting temporal and spatial dynamics. However,\nmodeling extensive global information remains a formidable challenge; CNNs are\nlimited by their narrow receptive fields, and ViTs struggle with the intensive\ncomputational demands of their attention mechanisms. The emergence of recent\nMamba-based architectures has been met with enthusiasm for their exceptional\nlong-sequence modeling capabilities, surpassing established vision models in\nefficiency and accuracy, which motivates us to develop an innovative\narchitecture tailored for spatiotemporal forecasting. In this paper, we propose\nthe VMRNN cell, a new recurrent unit that integrates the strengths of Vision\nMamba blocks with LSTM. We construct a network centered on VMRNN cells to\ntackle spatiotemporal prediction tasks effectively. Our extensive evaluations\nshow that our proposed approach secures competitive results on a variety of\ntasks while maintaining a smaller model size. Our code is available at\nhttps://github.com/yyyujintang/VMRNN-PyTorch.\n","authors":["Yujin Tang","Peijie Dong","Zhenheng Tang","Xiaowen Chu","Junwei Liang"],"pdf_url":"https://arxiv.org/pdf/2403.16536v2.pdf","comment":"11 pages, 7 figures, 6 tables"},{"id":"http://arxiv.org/abs/2403.17360v1","updated":"2024-03-26T03:53:00Z","published":"2024-03-26T03:53:00Z","title":"Activity-Biometrics: Person Identification from Daily Activities","summary":" In this work, we study a novel problem which focuses on person identification\nwhile performing daily activities. Learning biometric features from RGB videos\nis challenging due to spatio-temporal complexity and presence of appearance\nbiases such as clothing color and background. We propose ABNet, a novel\nframework which leverages disentanglement of biometric and non-biometric\nfeatures to perform effective person identification from daily activities.\nABNet relies on a bias-less teacher to learn biometric features from RGB videos\nand explicitly disentangle non-biometric features with the help of biometric\ndistortion. In addition, ABNet also exploits activity prior for biometrics\nwhich is enabled by joint biometric and activity learning. We perform\ncomprehensive evaluation of the proposed approach across five different\ndatasets which are derived from existing activity recognition benchmarks.\nFurthermore, we extensively compare ABNet with existing works in person\nidentification and demonstrate its effectiveness for activity-based biometrics\nacross all five datasets. 
The code and dataset can be accessed at:\n\\url{https://github.com/sacrcv/Activity-Biometrics/}\n","authors":["Shehreen Azad","Yogesh Singh Rawat"],"pdf_url":"https://arxiv.org/pdf/2403.17360v1.pdf","comment":"CVPR 2024 Main conference"},{"id":"http://arxiv.org/abs/2312.06734v2","updated":"2024-03-26T03:52:48Z","published":"2023-12-11T11:26:32Z","title":"DiffCast: A Unified Framework via Residual Diffusion for Precipitation\n Nowcasting","summary":" Precipitation nowcasting is an important spatio-temporal prediction task to\npredict the radar echoes sequences based on current observations, which can\nserve both meteorological science and smart city applications. Due to the\nchaotic evolution nature of the precipitation systems, it is a very challenging\nproblem. Previous studies address the problem either from the perspectives of\ndeterministic modeling or probabilistic modeling. However, their predictions\nsuffer from the blurry, high-value echoes fading away and position inaccurate\nissues. The root reason of these issues is that the chaotic evolutionary\nprecipitation systems are not appropriately modeled. Inspired by the nature of\nthe systems, we propose to decompose and model them from the perspective of\nglobal deterministic motion and local stochastic variations with residual\nmechanism. A unified and flexible framework that can equip any type of\nspatio-temporal models is proposed based on residual diffusion, which\neffectively tackles the shortcomings of previous methods. Extensive\nexperimental results on four publicly available radar datasets demonstrate the\neffectiveness and superiority of the proposed framework, compared to\nstate-of-the-art techniques. Our code is publicly available at\nhttps://github.com/DeminYu98/DiffCast.\n","authors":["Demin Yu","Xutao Li","Yunming Ye","Baoquan Zhang","Chuyao Luo","Kuai Dai","Rui Wang","Xunlai Chen"],"pdf_url":"https://arxiv.org/pdf/2312.06734v2.pdf","comment":"CVPR 2024; https://github.com/DeminYu98/DiffCast"},{"id":"http://arxiv.org/abs/2303.02490v2","updated":"2024-03-26T03:41:26Z","published":"2023-03-04T20:08:57Z","title":"Diffusion Models Generate Images Like Painters: an Analytical Theory of\n Outline First, Details Later","summary":" How do diffusion generative models convert pure noise into meaningful images?\nIn a variety of pretrained diffusion models (including conditional latent space\nmodels like Stable Diffusion), we observe that the reverse diffusion process\nthat underlies image generation has the following properties: (i) individual\ntrajectories tend to be low-dimensional and resemble 2D `rotations'; (ii)\nhigh-variance scene features like layout tend to emerge earlier, while\nlow-variance details tend to emerge later; and (iii) early perturbations tend\nto have a greater impact on image content than later perturbations. To\nunderstand these phenomena, we derive and study a closed-form solution to the\nprobability flow ODE for a Gaussian distribution, which shows that the reverse\ndiffusion state rotates towards a gradually-specified target on the image\nmanifold. It also shows that generation involves first committing to an\noutline, and then to finer and finer details. We find that this solution\naccurately describes the initial phase of image generation for pretrained\nmodels, and can in principle be used to make image generation more efficient by\nskipping reverse diffusion steps. Finally, we use our solution to characterize\nthe image manifold in Stable Diffusion. 
Our viewpoint reveals an unexpected\nsimilarity between generation by GANs and diffusion and provides a conceptual\nlink between diffusion and image retrieval.\n","authors":["Binxu Wang","John J. Vastola"],"pdf_url":"https://arxiv.org/pdf/2303.02490v2.pdf","comment":"44 pages, 28 figures. A briefer version was presented at NeurIPS23\n Workshop on Diffusion Models [arXiv:2311.10892]"},{"id":"http://arxiv.org/abs/1902.00615v3","updated":"2024-03-26T03:40:54Z","published":"2019-02-02T01:52:53Z","title":"Confidence-Triggered Detection: Accelerating Real-time\n Tracking-by-detection Systems","summary":" Real-time object tracking necessitates a delicate balance between speed and\naccuracy, a challenge exacerbated by the computational demands of deep learning\nmethods. In this paper, we propose Confidence-Triggered Detection (CTD), an\ninnovative approach that strategically bypasses object detection for frames\nclosely resembling intermediate states, leveraging tracker confidence scores.\nCTD not only enhances tracking speed but also preserves accuracy, surpassing\nexisting tracking algorithms. Through extensive evaluation across various\ntracker confidence thresholds, we identify an optimal trade-off between\ntracking speed and accuracy, providing crucial insights for parameter\nfine-tuning and enhancing CTD's practicality in real-world scenarios. Our\nexperiments across diverse detection models underscore the robustness and\nversatility of the CTD framework, demonstrating its potential to enable\nreal-time tracking in resource-constrained environments.\n","authors":["Zhicheng Ding","Zhixin Lai","Siyang Li","Edward Wong"],"pdf_url":"https://arxiv.org/pdf/1902.00615v3.pdf","comment":"9 pages, 5 figures, 1 table"},{"id":"http://arxiv.org/abs/2403.17346v1","updated":"2024-03-26T03:10:45Z","published":"2024-03-26T03:10:45Z","title":"TRAM: Global Trajectory and Motion of 3D Humans from in-the-wild Videos","summary":" We propose TRAM, a two-stage method to reconstruct a human's global\ntrajectory and motion from in-the-wild videos. TRAM robustifies SLAM to recover\nthe camera motion in the presence of dynamic humans and uses the scene\nbackground to derive the motion scale. Using the recovered camera as a\nmetric-scale reference frame, we introduce a video transformer model (VIMO) to\nregress the kinematic body motion of a human. By composing the two motions, we\nachieve accurate recovery of 3D humans in the world space, reducing global\nmotion errors by 60% from prior work. https://yufu-wang.github.io/tram4d/\n","authors":["Yufu Wang","Ziyun Wang","Lingjie Liu","Kostas Daniilidis"],"pdf_url":"https://arxiv.org/pdf/2403.17346v1.pdf","comment":"The project website: https://yufu-wang.github.io/tram4d/"},{"id":"http://arxiv.org/abs/2303.15230v2","updated":"2024-03-26T03:07:56Z","published":"2023-03-27T14:10:26Z","title":"Troika: Multi-Path Cross-Modal Traction for Compositional Zero-Shot\n Learning","summary":" Recent compositional zero-shot learning (CZSL) methods adapt pre-trained\nvision-language models (VLMs) by constructing trainable prompts only for\ncomposed state-object pairs. Relying on learning the joint representation of\nseen compositions, these methods ignore the explicit modeling of the state and\nobject, thus limiting the exploitation of pre-trained knowledge and\ngeneralization to unseen compositions. 
With a particular focus on the\nuniversality of the solution, in this work, we propose a novel paradigm for\nCZSL models that establishes three identification branches (i.e., Multi-Path)\nto jointly model the state, object, and composition. The presented Troika is\nour implementation that aligns the branch-specific prompt representations with\ndecomposed visual features. To calibrate the bias between semantically similar\nmulti-modal representations, we further devise a Cross-Modal Traction module\ninto Troika that shifts the prompt representation towards the current visual\ncontent. We conduct extensive experiments on three popular benchmarks, where\nour method significantly outperforms existing methods in both closed-world and\nopen-world settings. The code will be available at\nhttps://github.com/bighuang624/Troika.\n","authors":["Siteng Huang","Biao Gong","Yutong Feng","Min Zhang","Yiliang Lv","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2303.15230v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17343v1","updated":"2024-03-26T03:05:20Z","published":"2024-03-26T03:05:20Z","title":"Language Models are Free Boosters for Biomedical Imaging Tasks","summary":" In this study, we uncover the unexpected efficacy of residual-based large\nlanguage models (LLMs) as part of encoders for biomedical imaging tasks, a\ndomain traditionally devoid of language or textual data. The approach diverges\nfrom established methodologies by utilizing a frozen transformer block,\nextracted from pre-trained LLMs, as an innovative encoder layer for the direct\nprocessing of visual tokens. This strategy represents a significant departure\nfrom the standard multi-modal vision-language frameworks, which typically hinge\non language-driven prompts and inputs. We found that these LLMs could boost\nperformance across a spectrum of biomedical imaging applications, including\nboth 2D and 3D visual classification tasks, serving as plug-and-play boosters.\nMore interestingly, as a byproduct, we found that the proposed framework\nachieved superior performance, setting new state-of-the-art results on\nextensive, standardized datasets in MedMNIST-2D and 3D. Through this work, we\naim to open new avenues for employing LLMs in biomedical imaging and enriching\nthe understanding of their potential in this specialized domain.\n","authors":["Zhixin Lai","Jing Wu","Suiyao Chen","Yucheng Zhou","Anna Hovakimyan","Naira Hovakimyan"],"pdf_url":"https://arxiv.org/pdf/2403.17343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17342v1","updated":"2024-03-26T03:03:50Z","published":"2024-03-26T03:03:50Z","title":"The Solution for the ICCV 2023 1st Scientific Figure Captioning\n Challenge","summary":" In this paper, we propose a solution for improving the quality of captions\ngenerated for figures in papers. We adopt the approach of summarizing the\ntextual content in the paper to generate image captions. Throughout our study,\nwe encounter discrepancies in the OCR information provided in the official\ndataset. To rectify this, we employ the PaddleOCR toolkit to extract OCR\ninformation from all images. Moreover, we observe that certain textual content\nin the official paper pertains to images that are not relevant for captioning,\nthereby introducing noise during caption generation. To mitigate this issue, we\nleverage LLaMA to extract image-specific information by querying the textual\ncontent based on image mentions, effectively filtering out extraneous\ninformation. 
Additionally, we recognize a discrepancy between the primary use\nof maximum likelihood estimation during text generation and the evaluation\nmetrics such as ROUGE employed to assess the quality of generated captions. To\nbridge this gap, we integrate the BRIO model framework, enabling a more\ncoherent alignment between the generation and evaluation processes. Our\napproach ranked first in the final test with a score of 4.49.\n","authors":["Dian Chao","Xin Song","Shupeng Zhong","Boyuan Wang","Xiangyu Wu","Chen Zhu","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2403.17342v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14518v2","updated":"2024-03-26T02:45:29Z","published":"2023-12-22T08:31:11Z","title":"Joint Learning Neuronal Skeleton and Brain Circuit Topology with\n Permutation Invariant Encoders for Neuron Classification","summary":" Determining the types of neurons within a nervous system plays a significant\nrole in the analysis of brain connectomics and the investigation of\nneurological diseases. However, the efficiency of utilizing anatomical,\nphysiological, or molecular characteristics of neurons is relatively low and\ncostly. With the advancements in electron microscopy imaging and analysis\ntechniques for brain tissue, we are able to obtain whole-brain connectome\nconsisting neuronal high-resolution morphology and connectivity information.\nHowever, few models are built based on such data for automated neuron\nclassification. In this paper, we propose NeuNet, a framework that combines\nmorphological information of neurons obtained from skeleton and topological\ninformation between neurons obtained from neural circuit. Specifically, NeuNet\nconsists of three components, namely Skeleton Encoder, Connectome Encoder, and\nReadout Layer. Skeleton Encoder integrates the local information of neurons in\na bottom-up manner, with a one-dimensional convolution in neural skeleton's\npoint data; Connectome Encoder uses a graph neural network to capture the\ntopological information of neural circuit; finally, Readout Layer fuses the\nabove two information and outputs classification results. We reprocess and\nrelease two new datasets for neuron classification task from volume electron\nmicroscopy(VEM) images of human brain cortex and Drosophila brain. Experiments\non these two datasets demonstrated the effectiveness of our model with accuracy\nof 0.9169 and 0.9363, respectively. Code and data are available at:\nhttps://github.com/WHUminghui/NeuNet.\n","authors":["Minghui Liao","Guojia Wan","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2312.14518v2.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2403.02981v2","updated":"2024-03-26T02:39:15Z","published":"2024-03-05T13:59:21Z","title":"Doubly Abductive Counterfactual Inference for Text-based Image Editing","summary":" We study text-based image editing (TBIE) of a single image by counterfactual\ninference because it is an elegant formulation to precisely address the\nrequirement: the edited image should retain the fidelity of the original one.\nThrough the lens of the formulation, we find that the crux of TBIE is that\nexisting techniques hardly achieve a good trade-off between editability and\nfidelity, mainly due to the overfitting of the single-image fine-tuning. To\nthis end, we propose a Doubly Abductive Counterfactual inference framework\n(DAC). We first parameterize an exogenous variable as a UNet LoRA, whose\nabduction can encode all the image details. 
Second, we abduct another exogenous\nvariable parameterized by a text encoder LoRA, which recovers the lost\neditability caused by the overfitted first abduction. Thanks to the second\nabduction, which exclusively encodes the visual transition from post-edit to\npre-edit, its inversion -- subtracting the LoRA -- effectively reverts pre-edit\nback to post-edit, thereby accomplishing the edit. Through extensive\nexperiments, our DAC achieves a good trade-off between editability and\nfidelity. Thus, we can support a wide spectrum of user editing intents,\nincluding addition, removal, manipulation, replacement, style transfer, and\nfacial change, which are extensively validated in both qualitative and\nquantitative evaluations. Codes are in https://github.com/xuesong39/DAC.\n","authors":["Xue Song","Jiequan Cui","Hanwang Zhang","Jingjing Chen","Richang Hong","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.02981v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17334v1","updated":"2024-03-26T02:34:48Z","published":"2024-03-26T02:34:48Z","title":"OVER-NAV: Elevating Iterative Vision-and-Language Navigation with\n Open-Vocabulary Detection and StructurEd Representation","summary":" Recent advances in Iterative Vision-and-Language Navigation (IVLN) introduce\na more meaningful and practical paradigm of VLN by maintaining the agent's\nmemory across tours of scenes. Although the long-term memory aligns better with\nthe persistent nature of the VLN task, it poses more challenges on how to\nutilize the highly unstructured navigation memory with extremely sparse\nsupervision. Towards this end, we propose OVER-NAV, which aims to go over and\nbeyond the current arts of IVLN techniques. In particular, we propose to\nincorporate LLMs and open-vocabulary detectors to distill key information and\nestablish correspondence between multi-modal signals. Such a mechanism\nintroduces reliable cross-modal supervision and enables on-the-fly\ngeneralization to unseen scenes without the need of extra annotation and\nre-training. To fully exploit the interpreted navigation data, we further\nintroduce a structured representation, coded Omnigraph, to effectively\nintegrate multi-modal information along the tour. Accompanied with a novel\nomnigraph fusion mechanism, OVER-NAV is able to extract the most relevant\nknowledge from omnigraph for a more accurate navigating action. In addition,\nOVER-NAV seamlessly supports both discrete and continuous environments under a\nunified framework. We demonstrate the superiority of OVER-NAV in extensive\nexperiments.\n","authors":["Ganlong Zhao","Guanbin Li","Weikai Chen","Yizhou Yu"],"pdf_url":"https://arxiv.org/pdf/2403.17334v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17332v1","updated":"2024-03-26T02:32:52Z","published":"2024-03-26T02:32:52Z","title":"Labeling subtypes in a Parkinson's Cohort using Multifeatures in MRI -\n Integrating Grey and White Matter Information","summary":" Thresholding of networks has long posed a challenge in brain connectivity\nanalysis. Weighted networks are typically binarized using threshold measures to\nfacilitate network analysis. Previous studies on MRI-based brain networks have\npredominantly utilized density or sparsity-based thresholding techniques,\noptimized within specific ranges derived from network metrics such as path\nlength, clustering coefficient, and small-world index. 
Thus, determination of a\nsingle threshold value for facilitating comparative analysis of networks\nremains elusive. To address this, our study introduces Mutual K-Nearest\nNeighbor (MKNN)-based thresholding for brain network analysis. Here, nearest\nneighbor selection is based on the highest correlation between features of\nbrain regions. Construction of brain networks was accomplished by computing\nPearson correlations between grey matter volume and white matter volume for\neach pair of brain regions. Structural MRI data from 180 Parkinsons patients\nand 70 controls from the NIMHANS, India were analyzed. Subtypes within\nParkinsons disease were identified based on grey and white matter volume\natrophy using source-based morphometric decomposition. The loading coefficients\nwere correlated with clinical features to discern clinical relationship with\nthe deciphered subtypes. Our data-mining approach revealed: Subtype A (N = 51,\nintermediate type), Subtype B (N = 57, mild-severe type with mild motor\nsymptoms), and Subtype AB (N = 36, most-severe type with predominance in motor\nimpairment). Subtype-specific weighted matrices were binarized using MKNN-based\nthresholding for brain network analysis. Permutation tests on network metrics\nof resulting bipartite graphs demonstrated significant group differences in\nbetweenness centrality and participation coefficient. The identified hubs were\nspecific to each subtype, with some hubs conserved across different subtypes.\n","authors":["Tanmayee Samantaray","Jitender Saini","Pramod Kumar Pal","Bithiah Grace Jaganathan","Vijaya V Saradhi","Gupta CN"],"pdf_url":"https://arxiv.org/pdf/2403.17332v1.pdf","comment":"31 pages, 10 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.17330v1","updated":"2024-03-26T02:28:49Z","published":"2024-03-26T02:28:49Z","title":"Staircase Localization for Autonomous Exploration in Urban Environments","summary":" A staircase localization method is proposed for robots to explore urban\nenvironments autonomously. The proposed method employs a modular design in the\nform of a cascade pipeline consisting of three modules of stair detection, line\nsegment detection, and stair localization modules. The stair detection module\nutilizes an object detection algorithm based on deep learning to generate a\nregion of interest (ROI). From the ROI, line segment features are extracted\nusing a deep line segment detection algorithm. The extracted line segments are\nused to localize a staircase in terms of position, orientation, and stair\ndirection. The stair detection and localization are performed only with a\nsingle RGB-D camera. Each component of the proposed pipeline does not need to\nbe designed particularly for staircases, which makes it easy to maintain the\nwhole pipeline and replace each component with state-of-the-art deep learning\ndetection techniques. 
The results of real-world experiments show that the\nproposed method can perform accurate stair detection and localization during\nautonomous exploration for various structured and unstructured upstairs and\ndownstairs with shadows, dirt, and occlusions by artificial and natural\nobjects.\n","authors":["Jinrae Kim","Sunggoo Jung","Sung-Kyun Kim","Youdan Kim","Ali-akbar Agha-mohammadi"],"pdf_url":"https://arxiv.org/pdf/2403.17330v1.pdf","comment":"9 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.16080v2","updated":"2024-03-26T02:25:58Z","published":"2024-03-24T10:06:40Z","title":"PKU-DyMVHumans: A Multi-View Video Benchmark for High-Fidelity Dynamic\n Human Modeling","summary":" High-quality human reconstruction and photo-realistic rendering of a dynamic\nscene is a long-standing problem in computer vision and graphics. Despite\nconsiderable efforts invested in developing various capture systems and\nreconstruction algorithms, recent advancements still struggle with loose or\noversized clothing and overly complex poses. In part, this is due to the\nchallenges of acquiring high-quality human datasets. To facilitate the\ndevelopment of these fields, in this paper, we present PKU-DyMVHumans, a\nversatile human-centric dataset for high-fidelity reconstruction and rendering\nof dynamic human scenarios from dense multi-view videos. It comprises 8.2\nmillion frames captured by more than 56 synchronized cameras across diverse\nscenarios. These sequences comprise 32 human subjects across 45 different\nscenarios, each with a high-detailed appearance and realistic human motion.\nInspired by recent advancements in neural radiance field (NeRF)-based scene\nrepresentations, we carefully set up an off-the-shelf framework that is easy to\nprovide those state-of-the-art NeRF-based implementations and benchmark on\nPKU-DyMVHumans dataset. It is paving the way for various applications like\nfine-grained foreground/background decomposition, high-quality human\nreconstruction and photo-realistic novel view synthesis of a dynamic scene.\nExtensive studies are performed on the benchmark, demonstrating new\nobservations and challenges that emerge from using such high-fidelity dynamic\ndata. The dataset is available at: https://pku-dymvhumans.github.io.\n","authors":["Xiaoyun Zheng","Liwei Liao","Xufeng Li","Jianbo Jiao","Rongjie Wang","Feng Gao","Shiqi Wang","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16080v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17327v1","updated":"2024-03-26T02:21:36Z","published":"2024-03-26T02:21:36Z","title":"Accuracy enhancement method for speech emotion recognition from\n spectrogram using temporal frequency correlation and positional information\n learning through knowledge transfer","summary":" In this paper, we propose a method to improve the accuracy of speech emotion\nrecognition (SER) by using vision transformer (ViT) to attend to the\ncorrelation of frequency (y-axis) with time (x-axis) in spectrogram and\ntransferring positional information between ViT through knowledge transfer. The\nproposed method has the following originality i) We use vertically segmented\npatches of log-Mel spectrogram to analyze the correlation of frequencies over\ntime. This type of patch allows us to correlate the most relevant frequencies\nfor a particular emotion with the time they were uttered. ii) We propose the\nuse of image coordinate encoding, an absolute positional encoding suitable for\nViT. 
By normalizing the x, y coordinates of the image to -1 to 1 and\nconcatenating them to the image, we can effectively provide valid absolute\npositional information for ViT. iii) Through feature map matching, the locality\nand location information of the teacher network is effectively transmitted to\nthe student network. Teacher network is a ViT that contains locality of\nconvolutional stem and absolute position information through image coordinate\nencoding, and student network is a structure that lacks positional encoding in\nthe basic ViT structure. In feature map matching stage, we train through the\nmean absolute error (L1 loss) to minimize the difference between the feature\nmaps of the two networks. To validate the proposed method, three emotion\ndatasets (SAVEE, EmoDB, and CREMA-D) consisting of speech were converted into\nlog-Mel spectrograms for comparison experiments. The experimental results show\nthat the proposed method significantly outperforms the state-of-the-art methods\nin terms of weighted accuracy while requiring significantly fewer floating\npoint operations (FLOPs). Overall, the proposed method offers an promising\nsolution for SER by providing improved efficiency and performance.\n","authors":["Jeong-Yoon Kim","Seung-Ho Lee"],"pdf_url":"https://arxiv.org/pdf/2403.17327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10474v3","updated":"2024-03-26T01:11:52Z","published":"2023-05-17T17:59:16Z","title":"Preserve Your Own Correlation: A Noise Prior for Video Diffusion Models","summary":" Despite tremendous progress in generating high-quality images using diffusion\nmodels, synthesizing a sequence of animated frames that are both photorealistic\nand temporally coherent is still in its infancy. While off-the-shelf\nbillion-scale datasets for image generation are available, collecting similar\nvideo data of the same scale is still challenging. Also, training a video\ndiffusion model is computationally much more expensive than its image\ncounterpart. In this work, we explore finetuning a pretrained image diffusion\nmodel with video data as a practical solution for the video synthesis task. We\nfind that naively extending the image noise prior to video noise prior in video\ndiffusion leads to sub-optimal performance. Our carefully designed video noise\nprior leads to substantially better performance. Extensive experimental\nvalidation shows that our model, Preserve Your Own Correlation (PYoCo), attains\nSOTA zero-shot text-to-video results on the UCF-101 and MSR-VTT benchmarks. It\nalso achieves SOTA video generation quality on the small-scale UCF-101\nbenchmark with a $10\\times$ smaller model using significantly less computation\nthan the prior art.\n","authors":["Songwei Ge","Seungjun Nah","Guilin Liu","Tyler Poon","Andrew Tao","Bryan Catanzaro","David Jacobs","Jia-Bin Huang","Ming-Yu Liu","Yogesh Balaji"],"pdf_url":"https://arxiv.org/pdf/2305.10474v3.pdf","comment":"ICCV 2023. Project webpage:\n https://research.nvidia.com/labs/dir/pyoco"},{"id":"http://arxiv.org/abs/2403.17301v1","updated":"2024-03-26T01:06:47Z","published":"2024-03-26T01:06:47Z","title":"Physical 3D Adversarial Attacks against Monocular Depth Estimation in\n Autonomous Driving","summary":" Deep learning-based monocular depth estimation (MDE), extensively applied in\nautonomous driving, is known to be vulnerable to adversarial attacks. 
Previous\nphysical attacks against MDE models rely on 2D adversarial patches, so they\nonly affect a small, localized region in the MDE map but fail under various\nviewpoints. To address these limitations, we propose 3D Depth Fool\n(3D$^2$Fool), the first 3D texture-based adversarial attack against MDE models.\n3D$^2$Fool is specifically optimized to generate 3D adversarial textures\nagnostic to model types of vehicles and to have improved robustness in bad\nweather conditions, such as rain and fog. Experimental results validate the\nsuperior performance of our 3D$^2$Fool across various scenarios, including\nvehicles, MDE models, weather conditions, and viewpoints. Real-world\nexperiments with printed 3D textures on physical vehicle models further\ndemonstrate that our 3D$^2$Fool can cause an MDE error of over 10 meters.\n","authors":["Junhao Zheng","Chenhao Lin","Jiahao Sun","Zhengyu Zhao","Qian Li","Chao Shen"],"pdf_url":"https://arxiv.org/pdf/2403.17301v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17293v1","updated":"2024-03-26T00:41:54Z","published":"2024-03-26T00:41:54Z","title":"Tracing and segmentation of molecular patterns in 3-dimensional\n cryo-et/em density maps through algorithmic image processing and deep\n learning-based techniques","summary":" Understanding the structures of biological macromolecules is highly important\nas they are closely associated with cellular functionalities. Comprehending the\nprecise organization actin filaments is crucial because they form the dynamic\ncytoskeleton, which offers structural support to cells and connects the cell's\ninterior with its surroundings. However, determining the precise organization\nof actin filaments is challenging due to the poor quality of cryo-electron\ntomography (cryo-ET) images, which suffer from low signal-to-noise (SNR) ratios\nand the presence of missing wedge, as well as diverse shape characteristics of\nactin filaments. To address these formidable challenges, the primary component\nof this dissertation focuses on developing sophisticated computational\ntechniques for tracing actin filaments. In particular, three novel\nmethodologies have been developed: i) BundleTrac, for tracing bundle-like actin\nfilaments found in Stereocilium, ii) Spaghetti Tracer, for tracing filaments\nthat move individually with loosely cohesive movements, and iii) Struwwel\nTracer, for tracing randomly orientated actin filaments in the actin network.\nThe second component of the dissertation introduces a convolutional neural\nnetwork (CNN) based segmentation model to determine the location of protein\nsecondary structures, such as helices and beta-sheets, in medium-resolution\n(5-10 Angstrom) 3-dimensional cryo-electron microscopy (cryo-EM) images. This\nmethodology later evolved into a tool named DeepSSETracer. The final component\nof the dissertation presents a novel algorithm, cylindrical fit measure, to\nestimate image structure match at helix regions in medium-resolution cryo-EM\nimages. 
Overall, my dissertation has made significant contributions to\naddressing critical research challenges in structural biology by introducing\nvarious computational methods and tools.\n","authors":["Salim Sazzed"],"pdf_url":"https://arxiv.org/pdf/2403.17293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18158v1","updated":"2024-03-26T23:47:17Z","published":"2024-03-26T23:47:17Z","title":"The Effects of Short Video-Sharing Services on Video Copy Detection","summary":" The short video-sharing services that allow users to post 10-30 second videos\n(e.g., YouTube Shorts and TikTok) have attracted a lot of attention in recent\nyears. However, conventional video copy detection (VCD) methods mainly focus on\ngeneral video-sharing services (e.g., YouTube and Bilibili), and the effects of\nshort video-sharing services on video copy detection are still unclear.\nConsidering that illegally copied videos in short video-sharing services have\nservice-distinctive characteristics, especially in those time lengths, the pros\nand cons of VCD in those services are required to be analyzed. In this paper,\nwe examine the effects of short video-sharing services on VCD by constructing a\ndataset that has short video-sharing service characteristics. Our novel dataset\nis automatically constructed from the publicly available dataset to have\nreference videos and fixed short-time-length query videos, and such automation\nprocedures assure the reproducibility and data privacy preservation of this\npaper. From the experimental results focusing on segment-level and video-level\nsituations, we can see that three effects: \"Segment-level VCD in short\nvideo-sharing services is more difficult than those in general video-sharing\nservices\", \"Video-level VCD in short video-sharing services is easier than\nthose in general video-sharing services\", \"The video alignment component mainly\nsuppress the detection performance in short video-sharing services\".\n","authors":["Rintaro Yanagi","Yamato Okamoto","Shuhei Yokoo","Shin'ichi Satoh"],"pdf_url":"https://arxiv.org/pdf/2403.18158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18151v1","updated":"2024-03-26T23:32:29Z","published":"2024-03-26T23:32:29Z","title":"Automated Report Generation for Lung Cytological Images Using a CNN\n Vision Classifier and Multiple-Transformer Text Decoders: Preliminary Study","summary":" Cytology plays a crucial role in lung cancer diagnosis. Pulmonary cytology\ninvolves cell morphological characterization in the specimen and reporting the\ncorresponding findings, which are extremely burdensome tasks. In this study, we\npropose a report-generation technique for lung cytology images. In total, 71\nbenign and 135 malignant pulmonary cytology specimens were collected. Patch\nimages were extracted from the captured specimen images, and the findings were\nassigned to each image as a dataset for report generation. The proposed method\nconsists of a vision model and a text decoder. In the former, a convolutional\nneural network (CNN) is used to classify a given image as benign or malignant,\nand the features related to the image are extracted from the intermediate\nlayer. Independent text decoders for benign and malignant cells are prepared\nfor text generation, and the text decoder switches according to the CNN\nclassification results. The text decoder is configured using a Transformer that\nuses the features obtained from the CNN for report generation. 
Based on the\nevaluation results, the sensitivity and specificity were 100% and 96.4%,\nrespectively, for automated benign and malignant case classification, and the\nsaliency map indicated characteristic benign and malignant areas. The grammar\nand style of the generated texts were confirmed as correct and in better\nagreement with gold standard compared to existing LLM-based image-captioning\nmethods and single-text-decoder ablation model. These results indicate that the\nproposed method is useful for pulmonary cytology classification and reporting.\n","authors":["Atsushi Teramoto","Ayano Michiba","Yuka Kiriyama","Tetsuya Tsukamoto","Kazuyoshi Imaizumi","Hiroshi Fujita"],"pdf_url":"https://arxiv.org/pdf/2403.18151v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2403.16335v2","updated":"2024-03-26T23:29:49Z","published":"2024-03-25T00:17:43Z","title":"MEDDAP: Medical Dataset Enhancement via Diversified Augmentation\n Pipeline","summary":" The effectiveness of Deep Neural Networks (DNNs) heavily relies on the\nabundance and accuracy of available training data. However, collecting and\nannotating data on a large scale is often both costly and time-intensive,\nparticularly in medical cases where practitioners are already occupied with\ntheir duties. Moreover, ensuring that the model remains robust across various\nscenarios of image capture is crucial in medical domains, especially when\ndealing with ultrasound images that vary based on the settings of different\ndevices and the manual operation of the transducer. To address this challenge,\nwe introduce a novel pipeline called MEDDAP, which leverages Stable Diffusion\n(SD) models to augment existing small datasets by automatically generating new\ninformative labeled samples. Pretrained checkpoints for SD are typically based\non natural images, and training them for medical images requires significant\nGPU resources due to their heavy parameters. To overcome this challenge, we\nintroduce USLoRA (Ultrasound Low-Rank Adaptation), a novel fine-tuning method\ntailored specifically for ultrasound applications. USLoRA allows for selective\nfine-tuning of weights within SD, requiring fewer than 0.1\\% of parameters\ncompared to fully fine-tuning only the UNet portion of SD. To enhance dataset\ndiversity, we incorporate different adjectives into the generation process\nprompts, thereby desensitizing the classifiers to intensity changes across\ndifferent images. This approach is inspired by clinicians' decision-making\nprocesses regarding breast tumors, where tumor shape often plays a more crucial\nrole than intensity. In conclusion, our pipeline not only outperforms\nclassifiers trained on the original dataset but also demonstrates superior\nperformance when encountering unseen datasets. 
The source code is available at\nhttps://github.com/yasamin-med/MEDDAP.\n","authors":["Yasamin Medghalchi","Niloufar Zakariaei","Arman Rahmim","Ilker Hacihaliloglu"],"pdf_url":"https://arxiv.org/pdf/2403.16335v2.pdf","comment":"submitted to miccai 2024 submitted to miccai 2024 Submitted to\n MICCAI-2024"},{"id":"http://arxiv.org/abs/2308.02396v2","updated":"2024-03-26T23:17:24Z","published":"2023-07-24T17:09:40Z","title":"HOOD: Real-Time Human Presence and Out-of-Distribution Detection Using\n FMCW Radar","summary":" Detecting human presence indoors with millimeter-wave frequency-modulated\ncontinuous-wave (FMCW) radar faces challenges from both moving and stationary\nclutter. This work proposes a robust and real-time capable human presence and\nout-of-distribution (OOD) detection method using 60 GHz short-range FMCW radar.\nHOOD solves the human presence and OOD detection problems simultaneously in a\nsingle pipeline. Our solution relies on a reconstruction-based architecture and\nworks with radar macro and micro range-Doppler images (RDIs). HOOD aims to\naccurately detect the presence of humans in the presence or absence of moving\nand stationary disturbers. Since HOOD is also an OOD detector, it aims to\ndetect moving or stationary clutters as OOD in humans' absence and predicts the\ncurrent scene's output as \"no presence.\" HOOD performs well in diverse\nscenarios, demonstrating its effectiveness across different human activities\nand situations. On our dataset collected with a 60 GHz short-range FMCW radar,\nwe achieve an average AUROC of 94.36%. Additionally, our extensive evaluations\nand experiments demonstrate that HOOD outperforms state-of-the-art (SOTA) OOD\ndetection methods in terms of common OOD detection metrics. Importantly, HOOD\nalso perfectly fits on Raspberry Pi 3B+ with an ARM Cortex-A53 CPU, which\nshowcases its versatility across different hardware environments. Videos of our\nhuman presence detection experiments are available at:\nhttps://muskahya.github.io/HOOD\n","authors":["Sabri Mustafa Kahya","Muhammet Sami Yavuz","Eckehard Steinbach"],"pdf_url":"https://arxiv.org/pdf/2308.02396v2.pdf","comment":"10 pages, 2 figures, project page: https://muskahya.github.io/HOOD"},{"id":"http://arxiv.org/abs/2403.18144v1","updated":"2024-03-26T23:05:24Z","published":"2024-03-26T23:05:24Z","title":"Leak and Learn: An Attacker's Cookbook to Train Using Leaked Data from\n Federated Learning","summary":" Federated learning is a decentralized learning paradigm introduced to\npreserve privacy of client data. Despite this, prior work has shown that an\nattacker at the server can still reconstruct the private training data using\nonly the client updates. These attacks are known as data reconstruction attacks\nand fall into two major categories: gradient inversion (GI) and linear layer\nleakage attacks (LLL). However, despite demonstrating the effectiveness of\nthese attacks in breaching privacy, prior work has not investigated the\nusefulness of the reconstructed data for downstream tasks. In this work, we\nexplore data reconstruction attacks through the lens of training and improving\nmodels with leaked data. We demonstrate the effectiveness of both GI and LLL\nattacks in maliciously training models using the leaked data more accurately\nthan a benign federated learning strategy. Counter-intuitively, this bump in\ntraining quality can occur despite limited reconstruction quality or a small\ntotal number of leaked images. 
Finally, we show the limitations of these\nattacks for downstream training, individually for GI attacks and for LLL\nattacks.\n","authors":["Joshua C. Zhao","Ahaan Dabholkar","Atul Sharma","Saurabh Bagchi"],"pdf_url":"https://arxiv.org/pdf/2403.18144v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2303.09618v2","updated":"2024-03-26T22:59:52Z","published":"2023-03-16T19:47:41Z","title":"HIVE: Harnessing Human Feedback for Instructional Visual Editing","summary":" Incorporating human feedback has been shown to be crucial to align text\ngenerated by large language models to human preferences. We hypothesize that\nstate-of-the-art instructional image editing models, where outputs are\ngenerated based on an input image and an editing instruction, could similarly\nbenefit from human feedback, as their outputs may not adhere to the correct\ninstructions and preferences of users. In this paper, we present a novel\nframework to harness human feedback for instructional visual editing (HIVE).\nSpecifically, we collect human feedback on the edited images and learn a reward\nfunction to capture the underlying user preferences. We then introduce scalable\ndiffusion model fine-tuning methods that can incorporate human preferences\nbased on the estimated reward. Besides, to mitigate the bias brought by the\nlimitation of data, we contribute a new 1M training dataset, a 3.6K reward\ndataset for reward learning, and a 1K evaluation dataset to boost the\nperformance of instructional image editing. We conduct extensive empirical\nexperiments quantitatively and qualitatively, showing that HIVE is favored over\nprevious state-of-the-art instructional image editing approaches by a large\nmargin.\n","authors":["Shu Zhang","Xinyi Yang","Yihao Feng","Can Qin","Chia-Chih Chen","Ning Yu","Zeyuan Chen","Huan Wang","Silvio Savarese","Stefano Ermon","Caiming Xiong","Ran Xu"],"pdf_url":"https://arxiv.org/pdf/2303.09618v2.pdf","comment":"In CVPR, 2024"},{"id":"http://arxiv.org/abs/2403.18139v1","updated":"2024-03-26T22:50:36Z","published":"2024-03-26T22:50:36Z","title":"Pseudo-MRI-Guided PET Image Reconstruction Method Based on a Diffusion\n Probabilistic Model","summary":" Anatomically guided PET reconstruction using MRI information has been shown\nto have the potential to improve PET image quality. However, these improvements\nare limited to PET scans with paired MRI information. In this work we employed\na diffusion probabilistic model (DPM) to infer T1-weighted-MRI (deep-MRI)\nimages from FDG-PET brain images. We then use the DPM-generated T1w-MRI to\nguide the PET reconstruction. The model was trained with brain FDG scans, and\ntested on datasets containing multiple levels of counts. Deep-MRI images\nappeared somewhat more degraded than the acquired MRI images. Regarding PET image\nquality, volume of interest analysis in different brain regions showed that\nboth PET reconstructed images using the acquired and the deep-MRI images\nimproved image quality compared to OSEM. The same conclusions were found when\nanalysing the decimated datasets. 
A subjective evaluation performed by two physicians\nconfirmed that OSEM scored consistently worse than the MRI-guided PET images\nand no significant differences were observed between the MRI-guided PET images.\nThis proof of concept shows that it is possible to infer DPM-based MRI imagery\nto guide the PET reconstruction, enabling the possibility of changing\nreconstruction parameters such as the strength of the prior on anatomically\nguided PET reconstruction in the absence of MRI.\n","authors":["Weijie Gan","Huidong Xie","Carl von Gall","Günther Platsch","Michael T. Jurkiewicz","Andrea Andrade","Udunna C. Anazodo","Ulugbek S. Kamilov","Hongyu An","Jorge Cabello"],"pdf_url":"https://arxiv.org/pdf/2403.18139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01973v3","updated":"2024-03-26T22:46:10Z","published":"2023-04-04T17:31:15Z","title":"ERM++: An Improved Baseline for Domain Generalization","summary":" Domain Generalization (DG) measures a classifier's ability to generalize to\nnew distributions of data it was not trained on. Recent work has shown that a\nhyperparameter-tuned Empirical Risk Minimization (ERM) training procedure, that\nis simply minimizing the empirical risk on the source domains, can outperform\nmost existing DG methods. ERM has achieved such strong results while only\ntuning hyper-parameters such as learning rate, weight decay, batch size, and\ndropout. However there are additional hyperparameters which further limit\noverfitting and catastrophic forgetting. We therefore focus on tuning\npreviously untuned hyper-parameters, including training amount, initialization,\nand additional regularizers. We call the resulting stronger baseline ERM++.\nERM++ improves the performance of DG by over 5% compared to prior ERM baselines\non a standard benchmark of 5 datasets with a ResNet-50 and over 15% with a\nViT-B/16, and outperforms all SOTA methods on DomainBed with both\narchitectures. We also explore the relationship between DG performance and\nsimilarity to pre-training data, and find that similarity to pre-training data\ndistributions is an important driver of performance, but that ERM++ with\nstronger initializations can deliver strong performance even on dissimilar\ndatasets.Code is released at https://github.com/piotr-teterwak/erm_plusplus.\n","authors":["Piotr Teterwak","Kuniaki Saito","Theodoros Tsiligkaridis","Kate Saenko","Bryan A. Plummer"],"pdf_url":"https://arxiv.org/pdf/2304.01973v3.pdf","comment":"An improved baseline for Domain Generalization"},{"id":"http://arxiv.org/abs/2403.13680v2","updated":"2024-03-26T22:45:20Z","published":"2024-03-20T15:38:53Z","title":"Step-Calibrated Diffusion for Biomedical Optical Image Restoration","summary":" High-quality, high-resolution medical imaging is essential for clinical care.\nRaman-based biomedical optical imaging uses non-ionizing infrared radiation to\nevaluate human tissues in real time and is used for early cancer detection,\nbrain tumor diagnosis, and intraoperative tissue analysis. Unfortunately,\noptical imaging is vulnerable to image degradation due to laser scattering and\nabsorption, which can result in diagnostic errors and misguided treatment.\nRestoration of optical images is a challenging computer vision task because the\nsources of image degradation are multi-factorial, stochastic, and\ntissue-dependent, preventing a straightforward method to obtain paired\nlow-quality/high-quality data. 
Here, we present Restorative Step-Calibrated\nDiffusion (RSCD), an unpaired image restoration method that views the image\nrestoration problem as completing the finishing steps of a diffusion-based\nimage generation task. RSCD uses a step calibrator model to dynamically\ndetermine the severity of image degradation and the number of steps required to\ncomplete the reverse diffusion process for image restoration. RSCD outperforms\nother widely used unpaired image restoration methods on both image quality and\nperceptual evaluation metrics for restoring optical images. Medical imaging\nexperts consistently prefer images restored using RSCD in blinded comparison\nexperiments and report minimal to no hallucinations. Finally, we show that RSCD\nimproves performance on downstream clinical imaging tasks, including automated\nbrain tumor diagnosis and deep tissue imaging. Our code is available at\nhttps://github.com/MLNeurosurg/restorative_step-calibrated_diffusion.\n","authors":["Yiwei Lyu","Sung Jik Cha","Cheng Jiang","Asadur Chowdury","Xinhai Hou","Edward Harake","Akhil Kondepudi","Christian Freudiger","Honglak Lee","Todd C. Hollon"],"pdf_url":"https://arxiv.org/pdf/2403.13680v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18134v1","updated":"2024-03-26T22:31:05Z","published":"2024-03-26T22:31:05Z","title":"Integrative Graph-Transformer Framework for Histopathology Whole Slide\n Image Representation and Classification","summary":" In digital pathology, the multiple instance learning (MIL) strategy is widely\nused in the weakly supervised histopathology whole slide image (WSI)\nclassification task where giga-pixel WSIs are only labeled at the slide level.\nHowever, existing attention-based MIL approaches often overlook contextual\ninformation and intrinsic spatial relationships between neighboring tissue\ntiles, while graph-based MIL frameworks have limited power to recognize the\nlong-range dependencies. In this paper, we introduce the integrative\ngraph-transformer framework that simultaneously captures the context-aware\nrelational features and global WSI representations through a novel Graph\nTransformer Integration (GTI) block. Specifically, each GTI block consists of a\nGraph Convolutional Network (GCN) layer modeling neighboring relations at the\nlocal instance level and an efficient global attention model capturing\ncomprehensive global information from extensive feature embeddings. Extensive\nexperiments on three publicly available WSI datasets: TCGA-NSCLC, TCGA-RCC and\nBRIGHT, demonstrate the superiority of our approach over current\nstate-of-the-art MIL methods, achieving an improvement of 1.0% to 2.6% in\naccuracy and 0.7%-1.6% in AUROC.\n","authors":["Zhan Shi","Jingwei Zhang","Jun Kong","Fusheng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18134v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18132v1","updated":"2024-03-26T22:26:39Z","published":"2024-03-26T22:26:39Z","title":"Recommendation of data-free class-incremental learning algorithms by\n simulating future data","summary":" Class-incremental learning deals with sequential data streams composed of\nbatches of classes. Various algorithms have been proposed to address the\nchallenging case where samples from past classes cannot be stored. However,\nselecting an appropriate algorithm for a user-defined setting is an open\nproblem, as the relative performance of these algorithms depends on the\nincremental settings. 
To solve this problem, we introduce an algorithm\nrecommendation method that simulates the future data stream. Given an initial\nset of classes, it leverages generative models to simulate future classes from\nthe same visual domain. We evaluate recent algorithms on the simulated stream\nand recommend the one which performs best in the user-defined incremental\nsetting. We illustrate the effectiveness of our method on three large datasets\nusing six algorithms and six incremental settings. Our method outperforms\ncompetitive baselines, and performance is close to that of an oracle choosing\nthe best algorithm in each setting. This work contributes to facilitate the\npractical deployment of incremental learning.\n","authors":["Eva Feillet","Adrian Popescu","Céline Hudelot"],"pdf_url":"https://arxiv.org/pdf/2403.18132v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16967v2","updated":"2024-03-26T22:00:27Z","published":"2024-03-25T17:26:08Z","title":"Visual Whole-Body Control for Legged Loco-Manipulation","summary":" We study the problem of mobile manipulation using legged robots equipped with\nan arm, namely legged loco-manipulation. The robot legs, while usually utilized\nfor mobility, offer an opportunity to amplify the manipulation capabilities by\nconducting whole-body control. That is, the robot can control the legs and the\narm at the same time to extend its workspace. We propose a framework that can\nconduct the whole-body control autonomously with visual observations. Our\napproach, namely Visual Whole-Body Control(VBC), is composed of a low-level\npolicy using all degrees of freedom to track the end-effector manipulator\nposition and a high-level policy proposing the end-effector position based on\nvisual inputs. We train both levels of policies in simulation and perform\nSim2Real transfer for real robot deployment. We perform extensive experiments\nand show significant improvements over baselines in picking up diverse objects\nin different configurations (heights, locations, orientations) and\nenvironments. Project page: https://wholebody-b1.github.io\n","authors":["Minghuan Liu","Zixuan Chen","Xuxin Cheng","Yandong Ji","Ruihan Yang","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16967v2.pdf","comment":"The first two authors contribute equally. Project page:\n https://wholebody-b1.github.io"},{"id":"http://arxiv.org/abs/2312.01629v2","updated":"2024-03-26T21:58:28Z","published":"2023-12-04T05:13:59Z","title":"CLAMP: Contrastive LAnguage Model Prompt-tuning","summary":" Large language models (LLMs) have emerged as powerful general-purpose\ninterfaces for many machine learning problems. Recent work has adapted LLMs to\ngenerative visual tasks like image captioning, visual question answering, and\nvisual chat, using a relatively small amount of instruction-tuning data. In\nthis paper, we explore whether modern LLMs can also be adapted to classifying\nan image into a set of categories. First, we evaluate multimodal LLMs that are\ntuned for generative tasks on zero-shot image classification and find that\ntheir performance is far below that of specialized models like CLIP. We then\npropose an approach for light fine-tuning of LLMs using the same contrastive\nimage-caption matching objective as CLIP. 
Our results show that LLMs can,\nindeed, achieve good image classification performance when adapted this way.\nOur approach beats state-of-the-art mLLMs by 13% and slightly outperforms\ncontrastive learning with a custom text model, while also retaining the LLM's\ngenerative abilities. LLM initialization appears to particularly help\nclassification in domains under-represented in the visual pre-training data.\n","authors":["Piotr Teterwak","Ximeng Sun","Bryan A. Plummer","Kate Saenko","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2312.01629v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18118v1","updated":"2024-03-26T21:48:27Z","published":"2024-03-26T21:48:27Z","title":"EgoLifter: Open-world 3D Segmentation for Egocentric Perception","summary":" In this paper we present EgoLifter, a novel system that can automatically\nsegment scenes captured from egocentric sensors into a complete decomposition\nof individual 3D objects. The system is specifically designed for egocentric\ndata where scenes contain hundreds of objects captured from natural\n(non-scanning) motion. EgoLifter adopts 3D Gaussians as the underlying\nrepresentation of 3D scenes and objects and uses segmentation masks from the\nSegment Anything Model (SAM) as weak supervision to learn flexible and\npromptable definitions of object instances free of any specific object\ntaxonomy. To handle the challenge of dynamic objects in ego-centric videos, we\ndesign a transient prediction module that learns to filter out dynamic objects\nin the 3D reconstruction. The result is a fully automatic pipeline that is able\nto reconstruct 3D object instances as collections of 3D Gaussians that\ncollectively compose the entire scene. We created a new benchmark on the Aria\nDigital Twin dataset that quantitatively demonstrates its state-of-the-art\nperformance in open-world 3D segmentation from natural egocentric input. We run\nEgoLifter on various egocentric activity datasets which shows the promise of\nthe method for 3D egocentric perception at scale.\n","authors":["Qiao Gu","Zhaoyang Lv","Duncan Frost","Simon Green","Julian Straub","Chris Sweeney"],"pdf_url":"https://arxiv.org/pdf/2403.18118v1.pdf","comment":"Preprint. Project page: https://egolifter.github.io/"},{"id":"http://arxiv.org/abs/2403.18117v1","updated":"2024-03-26T21:47:24Z","published":"2024-03-26T21:47:24Z","title":"TDIP: Tunable Deep Image Processing, a Real Time Melt Pool Monitoring\n Solution","summary":" In the era of Industry 4.0, Additive Manufacturing (AM), particularly metal\nAM, has emerged as a significant contributor due to its innovative and\ncost-effective approach to fabricate highly intricate geometries. Despite its\npotential, this industry still lacks real-time capable process monitoring\nalgorithms. Recent advancements in this field suggest that Melt Pool (MP)\nsignatures during the fabrication process contain crucial information about\nprocess dynamics and quality. To obtain this information, various sensory\napproaches, such as high-speed cameras-based vision modules are employed for\nonline fabrication monitoring. However, many conventional in-depth analyses\nstill cannot process all the recorded data simultaneously. Although\nconventional Image Processing (ImP) solutions provide a targeted tunable\napproach, they pose a trade-off between convergence certainty and convergence\nspeed. As a result, conventional methods are not suitable for a dynamically\nchanging application like MP monitoring. 
Therefore, this article proposes the\nimplementation of a Tunable Deep Image Processing (TDIP) method to address the\ndata-rich monitoring needs in real-time. The proposed model is first trained to\nreplicate an ImP algorithm with tunable features and methodology. The TDIP\nmodel is then further improved to account for MP geometries and fabrication\nquality based on the vision input and process parameters. The TDIP model\nachieved over 94% estimation accuracy with more than 96% R2 score for quality,\ngeometry, and MP signature estimation and isolation. The TDIP model can process\n500 images per second, while conventional methods take a few minutes per\nimage. This significant processing time reduction enables the integration of\nvision-based monitoring in real-time for processes and quality estimation.\n","authors":["Javid Akhavan","Youmna Mahmoud","Ke Xu","Jiaqi Lyu","Souran Manoochehri"],"pdf_url":"https://arxiv.org/pdf/2403.18117v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18116v1","updated":"2024-03-26T21:45:29Z","published":"2024-03-26T21:45:29Z","title":"QuakeSet: A Dataset and Low-Resource Models to Monitor Earthquakes\n through Sentinel-1","summary":" Earthquake monitoring is necessary to promptly identify the affected areas,\nthe severity of the events, and, finally, to estimate damages and plan the\nactions needed for the restoration process. The use of seismic stations to\nmonitor the strength and origin of earthquakes is limited when dealing with\nremote areas (we cannot have global capillary coverage). Identification and\nanalysis of all affected areas are mandatory to support areas not monitored by\ntraditional stations. Using social media images in crisis management has proven\neffective in various situations. However, they are still limited by the\npossibility of using communication infrastructures in case of an earthquake and\nby the presence of people in the area. Moreover, social media images and\nmessages cannot be used to estimate the actual severity of earthquakes and\ntheir characteristics effectively. The employment of satellites to monitor\nchanges around the globe grants the possibility of exploiting instrumentation\nthat is not limited by the visible spectrum, the presence of land\ninfrastructures, and people in the affected areas. In this work, we propose a\nnew dataset composed of images taken from Sentinel-1 and a new series of tasks\nto help monitor earthquakes from a new detailed view. Coupled with the data, we\nprovide a series of traditional machine learning and deep learning models as\nbaselines to assess the effectiveness of ML-based models in earthquake\nanalysis.\n","authors":["Daniele Rege Cambrin","Paolo Garza"],"pdf_url":"https://arxiv.org/pdf/2403.18116v1.pdf","comment":"Accepted at ISCRAM 2024"},{"id":"http://arxiv.org/abs/2311.02749v3","updated":"2024-03-26T21:42:34Z","published":"2023-11-05T19:59:36Z","title":"Fast Point Cloud to Mesh Reconstruction for Deformable Object Tracking","summary":" The world around us is full of soft objects we perceive and deform with\ndexterous hand movements. For a robotic hand to control soft objects, it has to\nacquire online state feedback of the deforming object. While RGB-D cameras can\ncollect occluded point clouds at a rate of 30Hz, this does not represent a\ncontinuously trackable object surface. 
Hence, in this work, we developed a\nmethod that takes as input a template mesh which is the mesh of an object in\nits non-deformed state and a deformed point cloud of the same object, and then\nshapes the template mesh such that it matches the deformed point cloud. The\nreconstruction of meshes from point clouds has long been studied in the field\nof Computer graphics under 3D reconstruction and 4D reconstruction, however,\nboth lack the speed and generalizability needed for robotics applications. Our\nmodel is designed using a point cloud auto-encoder and a Real-NVP architecture.\nOur trained model can perform mesh reconstruction and tracking at a rate of\n58Hz on a template mesh of 3000 vertices and a deformed point cloud of 5000\npoints and is generalizable to the deformations of six different object\ncategories which are assumed to be made of soft material in our experiments\n(scissors, hammer, foam brick, cleanser bottle, orange, and dice). The object\nmeshes are taken from the YCB benchmark dataset. An instance of a downstream\napplication can be the control algorithm for a robotic hand that requires\nonline feedback from the state of the manipulated object which would allow\nonline grasp adaptation in a closed-loop manner. Furthermore, the tracking\ncapacity of our method can help in the system identification of deforming\nobjects in a marker-free approach. In future work, we will extend our trained\nmodel to generalize beyond six object categories and additionally to real-world\ndeforming point clouds.\n","authors":["Elham Amin Mansour","Hehui Zheng","Robert K. Katzschmann"],"pdf_url":"https://arxiv.org/pdf/2311.02749v3.pdf","comment":"8 pages with appendix,16 figures"},{"id":"http://arxiv.org/abs/2403.18114v1","updated":"2024-03-26T21:37:25Z","published":"2024-03-26T21:37:25Z","title":"Segment Any Medical Model Extended","summary":" The Segment Anything Model (SAM) has drawn significant attention from\nresearchers who work on medical image segmentation because of its\ngeneralizability. However, researchers have found that SAM may have limited\nperformance on medical images compared to state-of-the-art non-foundation\nmodels. Regardless, the community sees potential in extending, fine-tuning,\nmodifying, and evaluating SAM for analysis of medical imaging. An increasing\nnumber of works have been published focusing on the mentioned four directions,\nwhere variants of SAM are proposed. To this end, a unified platform helps push\nthe boundary of the foundation model for medical images, facilitating the use,\nmodification, and validation of SAM and its variants in medical image\nsegmentation. In this work, we introduce SAMM Extended (SAMME), a platform that\nintegrates new SAM variant models, adopts faster communication protocols,\naccommodates new interactive modes, and allows for fine-tuning of subcomponents\nof the models. 
These features can expand the potential of foundation models\nlike SAM, and the results can be translated to applications such as\nimage-guided therapy, mixed reality interaction, robotic navigation, and data\naugmentation.\n","authors":["Yihao Liu","Jiaming Zhang","Andres Diaz-Pinto","Haowei Li","Alejandro Martin-Gomez","Amir Kheradmand","Mehran Armand"],"pdf_url":"https://arxiv.org/pdf/2403.18114v1.pdf","comment":"The content of the manuscript has been presented in SPIE Medical\n Imaging 2024, and had been accepted to appear in the proceedings of the\n conference"},{"id":"http://arxiv.org/abs/2312.02126v2","updated":"2024-03-26T21:20:57Z","published":"2023-12-04T18:53:24Z","title":"SplaTAM: Splat, Track & Map 3D Gaussians for Dense RGB-D SLAM","summary":" Dense simultaneous localization and mapping (SLAM) is crucial for robotics\nand augmented reality applications. However, current methods are often hampered\nby the non-volumetric or implicit way they represent a scene. This work\nintroduces SplaTAM, an approach that, for the first time, leverages explicit\nvolumetric representations, i.e., 3D Gaussians, to enable high-fidelity\nreconstruction from a single unposed RGB-D camera, surpassing the capabilities\nof existing methods. SplaTAM employs a simple online tracking and mapping\nsystem tailored to the underlying Gaussian representation. It utilizes a\nsilhouette mask to elegantly capture the presence of scene density. This\ncombination enables several benefits over prior representations, including fast\nrendering and dense optimization, quickly determining if areas have been\npreviously mapped, and structured map expansion by adding more Gaussians.\nExtensive experiments show that SplaTAM achieves up to 2x superior performance\nin camera pose estimation, map construction, and novel-view synthesis over\nexisting methods, paving the way for more immersive high-fidelity SLAM\napplications.\n","authors":["Nikhil Keetha","Jay Karhade","Krishna Murthy Jatavallabhula","Gengshan Yang","Sebastian Scherer","Deva Ramanan","Jonathon Luiten"],"pdf_url":"https://arxiv.org/pdf/2312.02126v2.pdf","comment":"CVPR 2024. Website: https://spla-tam.github.io/"},{"id":"http://arxiv.org/abs/2403.18104v1","updated":"2024-03-26T21:04:18Z","published":"2024-03-26T21:04:18Z","title":"Mathematical Foundation and Corrections for Full Range Head Pose\n Estimation","summary":" Numerous works concerning head pose estimation (HPE) offer algorithms or\nproposed neural network-based approaches for extracting Euler angles from\neither facial key points or directly from images of the head region. However,\nmany works failed to provide clear definitions of the coordinate systems and\nEuler or Tait-Bryan angles orders in use. It is a well-known fact that rotation\nmatrices depend on coordinate systems, and yaw, roll, and pitch angles are\nsensitive to their application order. Without precise definitions, it becomes\nchallenging to validate the correctness of the output head pose and drawing\nroutines employed in prior works. In this paper, we thoroughly examined the\nEuler angles defined in the 300W-LP dataset, head pose estimation such as\n3DDFA-v2, 6D-RepNet, WHENet, etc, and the validity of their drawing routines of\nthe Euler angles. When necessary, we infer their coordinate system and sequence\nof yaw, roll, pitch from provided code. 
This paper presents (1) code and\nalgorithms for inferring coordinate system from provided source code, code for\nEuler angle application order and extracting precise rotation matrices and the\nEuler angles, (2) code and algorithms for converting poses from one rotation\nsystem to another, (3) novel formulae for 2D augmentations of the rotation\nmatrices, and (4) derivations and code for the correct drawing routines for\nrotation matrices and poses. This paper also addresses the feasibility of\ndefining rotations with right-handed coordinate system in Wikipedia and SciPy,\nwhich makes the Euler angle extraction much easier for full-range head pose\nresearch.\n","authors":["Huei-Chung Hu","Xuyang Wu","Yuan Wang","Yi Fang","Hsin-Tai Wu"],"pdf_url":"https://arxiv.org/pdf/2403.18104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18103v1","updated":"2024-03-26T21:01:41Z","published":"2024-03-26T21:01:41Z","title":"Tutorial on Diffusion Models for Imaging and Vision","summary":" The astonishing growth of generative tools in recent years has empowered many\nexciting applications in text-to-image generation and text-to-video generation.\nThe underlying principle behind these generative tools is the concept of\ndiffusion, a particular sampling mechanism that has overcome some shortcomings\nthat were deemed difficult in the previous approaches. The goal of this\ntutorial is to discuss the essential ideas underlying the diffusion models. The\ntarget audience of this tutorial includes undergraduate and graduate students\nwho are interested in doing research on diffusion models or applying these\nmodels to solve other problems.\n","authors":["Stanley H. Chan"],"pdf_url":"https://arxiv.org/pdf/2403.18103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18096v1","updated":"2024-03-26T20:41:35Z","published":"2024-03-26T20:41:35Z","title":"Efficient Multi-Band Temporal Video Filter for Reducing Human-Robot\n Interaction","summary":" Although mobile robots have on-board sensors to perform navigation, their\nefficiency in completing paths can be enhanced by planning to avoid human\ninteraction. Infrastructure cameras can capture human activity continuously for\nthe purpose of compiling activity analytics to choose efficient times and\nroutes. We describe a cascade temporal filtering method to efficiently extract\nshort- and long-term activity in two time dimensions, isochronal and\nchronological, for use in global path planning and local navigation\nrespectively. The temporal filter has application either independently, or, if\nobject recognition is also required, it can be used as a pre-filter to perform\nactivity-gating of the more computationally expensive neural network\nprocessing. For a testbed 32-camera network, we show how this hybrid approach\ncan achieve over 8 times improvement in frames per second throughput and 6.5\ntimes reduction of system power use. We also show how the cost map of static\nobjects in the ROS robot software development framework is augmented with\ndynamic regions determined from the temporal filter.\n","authors":["Lawrence O'Gorman"],"pdf_url":"https://arxiv.org/pdf/2403.18096v1.pdf","comment":"15 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2403.18094v1","updated":"2024-03-26T20:30:55Z","published":"2024-03-26T20:30:55Z","title":"A Personalized Video-Based Hand Taxonomy: Application for Individuals\n with Spinal Cord Injury","summary":" Hand function is critical for our interactions and quality of life. 
Spinal\ncord injuries (SCI) can impair hand function, reducing independence. A\ncomprehensive evaluation of function in home and community settings requires a\nhand grasp taxonomy for individuals with impaired hand function. Developing\nsuch a taxonomy is challenging due to unrepresented grasp types in standard\ntaxonomies, uneven data distribution across injury levels, and limited data.\nThis study aims to automatically identify the dominant distinct hand grasps in\negocentric video using semantic clustering. Egocentric video recordings\ncollected in the homes of 19 individuals with cervical SCI were used to cluster\ngrasping actions with semantic significance. A deep learning model integrating\nposture and appearance data was employed to create a personalized hand\ntaxonomy. Quantitative analysis reveals a cluster purity of 67.6% +- 24.2% with\n18.0% +- 21.8% redundancy. Qualitative assessment revealed meaningful\nclusters in video content. This methodology provides a flexible and effective\nstrategy to analyze hand function in the wild. It offers researchers and\nclinicians an efficient tool for evaluating hand function, aiding sensitive\nassessments and tailored intervention plans.\n","authors":["Mehdy Dousty","David J. Fleet","José Zariffa"],"pdf_url":"https://arxiv.org/pdf/2403.18094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18092v1","updated":"2024-03-26T20:23:48Z","published":"2024-03-26T20:23:48Z","title":"OCAI: Improving Optical Flow Estimation by Occlusion and Consistency\n Aware Interpolation","summary":" The scarcity of ground-truth labels poses one major challenge in developing\noptical flow estimation models that are both generalizable and robust. While\ncurrent methods rely on data augmentation, they have yet to fully exploit the\nrich information available in labeled video sequences. We propose OCAI, a\nmethod that supports robust frame interpolation by generating intermediate\nvideo frames alongside optical flows in between. Utilizing a forward warping\napproach, OCAI employs occlusion awareness to resolve ambiguities in pixel\nvalues and fills in missing values by leveraging the forward-backward\nconsistency of optical flows. Additionally, we introduce a teacher-student\nstyle semi-supervised learning method on top of the interpolated frames. Using\na pair of unlabeled frames and the teacher model's predicted optical flow, we\ngenerate interpolated frames and flows to train a student model. The teacher's\nweights are maintained using Exponential Moving Averaging of the student. Our\nevaluations demonstrate perceptually superior interpolation quality and\nenhanced optical flow accuracy on established benchmarks such as Sintel and\nKITTI.\n","authors":["Jisoo Jeong","Hong Cai","Risheek Garrepalli","Jamie Menjay Lin","Munawar Hayat","Fatih Porikli"],"pdf_url":"https://arxiv.org/pdf/2403.18092v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18080v1","updated":"2024-03-26T20:02:48Z","published":"2024-03-26T20:02:48Z","title":"EgoPoseFormer: A Simple Baseline for Egocentric 3D Human Pose Estimation","summary":" We present EgoPoseFormer, a simple yet effective transformer-based model for\nstereo egocentric human pose estimation. The main challenge in egocentric pose\nestimation is overcoming joint invisibility, which is caused by self-occlusion\nor a limited field of view (FOV) of head-mounted cameras. 
Our approach\novercomes this challenge by incorporating a two-stage pose estimation paradigm:\nin the first stage, our model leverages the global information to estimate each\njoint's coarse location, then in the second stage, it employs a DETR style\ntransformer to refine the coarse locations by exploiting fine-grained stereo\nvisual features. In addition, we present a deformable stereo operation to\nenable our transformer to effectively process multi-view features, which\nenables it to accurately localize each joint in the 3D world. We evaluate our\nmethod on the stereo UnrealEgo dataset and show it significantly outperforms\nprevious approaches while being computationally efficient: it improves MPJPE by\n27.4mm (45% improvement) with only 7.9% model parameters and 13.1% FLOPs\ncompared to the state-of-the-art. Surprisingly, with proper training\ntechniques, we find that even our first-stage pose proposal network can achieve\nsuperior performance compared to previous arts. We also show that our method\ncan be seamlessly extended to monocular settings, which achieves\nstate-of-the-art performance on the SceneEgo dataset, improving MPJPE by 25.5mm\n(21% improvement) compared to the best existing method with only 60.7% model\nparameters and 36.4% FLOPs.\n","authors":["Chenhongyi Yang","Anastasia Tkach","Shreyas Hampali","Linguang Zhang","Elliot J. Crowley","Cem Keskin"],"pdf_url":"https://arxiv.org/pdf/2403.18080v1.pdf","comment":"Tech Report"},{"id":"http://arxiv.org/abs/2403.18074v1","updated":"2024-03-26T19:54:21Z","published":"2024-03-26T19:54:21Z","title":"Every Shot Counts: Using Exemplars for Repetition Counting in Videos","summary":" Video repetition counting infers the number of repetitions of recurring\nactions or motion within a video. We propose an exemplar-based approach that\ndiscovers visual correspondence of video exemplars across repetitions within\ntarget videos. Our proposed Every Shot Counts (ESCounts) model is an\nattention-based encoder-decoder that encodes videos of varying lengths\nalongside exemplars from the same and different videos. In training, ESCounts\nregresses locations of high correspondence to the exemplars within the video.\nIn tandem, our method learns a latent that encodes representations of general\nrepetitive motions, which we use for exemplar-free, zero-shot inference.\nExtensive experiments over commonly used datasets (RepCount, Countix, and\nUCFRep) showcase ESCounts obtaining state-of-the-art performance across all\nthree datasets. On RepCount, ESCounts increases the off-by-one from 0.39 to\n0.56 and decreases the mean absolute error from 0.38 to 0.21. Detailed\nablations further demonstrate the effectiveness of our method.\n","authors":["Saptarshi Sinha","Alexandros Stergiou","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2403.18074v1.pdf","comment":"Project website: https://sinhasaptarshi.github.io/escounts"},{"id":"http://arxiv.org/abs/2403.18067v1","updated":"2024-03-26T19:36:50Z","published":"2024-03-26T19:36:50Z","title":"State of the art applications of deep learning within tracking and\n detecting marine debris: A survey","summary":" Deep learning techniques have been explored within the marine litter problem\nfor approximately 20 years but the majority of the research has developed\nrapidly in the last five years. We provide an in-depth, up to date, summary and\nanalysis of 28 of the most recent and significant contributions of deep\nlearning in marine debris. 
From cross referencing the research paper results,\nthe YOLO family significantly outperforms all other methods of object detection\nbut there are many respected contributions to this field that have\ncategorically agreed that a comprehensive database of underwater debris is not\ncurrently available for machine learning. Using a small dataset curated and\nlabelled by us, we tested YOLOv5 on a binary classification task and found the\naccuracy was low and the rate of false positives was high; highlighting the\nimportance of a comprehensive database. We conclude this survey with over 40\nfuture research recommendations and open challenges.\n","authors":["Zoe Moorton","Dr. Zeyneb Kurt","Dr. Wai Lok Woo"],"pdf_url":"https://arxiv.org/pdf/2403.18067v1.pdf","comment":"Review paper, 60 pages including references, 1 figure, 3 tables, 1\n supplementary data"},{"id":"http://arxiv.org/abs/2403.18063v1","updated":"2024-03-26T19:29:21Z","published":"2024-03-26T19:29:21Z","title":"Spectral Convolutional Transformer: Harmonizing Real vs. Complex\n Multi-View Spectral Operators for Vision Transformer","summary":" Transformers used in vision have been investigated through diverse\narchitectures - ViT, PVT, and Swin. These have worked to improve the attention\nmechanism and make it more efficient. Differently, the need for including local\ninformation was felt, leading to incorporating convolutions in transformers\nsuch as CPVT and CvT. Global information is captured using a complex Fourier\nbasis to achieve global token mixing through various methods, such as AFNO,\nGFNet, and Spectformer. We advocate combining three diverse views of data -\nlocal, global, and long-range dependence. We also investigate the simplest\nglobal representation using only the real domain spectral representation -\nobtained through the Hartley transform. We use a convolutional operator in the\ninitial layers to capture local information. Through these two contributions,\nwe are able to optimize and obtain a spectral convolution transformer (SCT)\nthat provides improved performance over the state-of-the-art methods while\nreducing the number of parameters. Through extensive experiments, we show that\nSCT-C-small gives state-of-the-art performance on the ImageNet dataset and\nreaches 84.5\\% top-1 accuracy, while SCT-C-Large reaches 85.9\\% and SCT-C-Huge\nreaches 86.4\\%. We evaluate SCT on transfer learning on datasets such as\nCIFAR-10, CIFAR-100, Oxford Flower, and Stanford Car. We also evaluate SCT on\ndownstream tasks i.e. instance segmentation on the MSCOCO dataset. The project\npage is available on this webpage.\\url{https://github.com/badripatro/sct}\n","authors":["Badri N. Patro","Vinay P. Namboodiri","Vijay S. Agneeswaran"],"pdf_url":"https://arxiv.org/pdf/2403.18063v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08344v2","updated":"2024-03-26T19:25:53Z","published":"2023-12-13T18:28:09Z","title":"FoundationPose: Unified 6D Pose Estimation and Tracking of Novel Objects","summary":" We present FoundationPose, a unified foundation model for 6D object pose\nestimation and tracking, supporting both model-based and model-free setups. Our\napproach can be instantly applied at test-time to a novel object without\nfine-tuning, as long as its CAD model is given, or a small number of reference\nimages are captured. 
We bridge the gap between these two setups with a neural\nimplicit representation that allows for effective novel view synthesis, keeping\nthe downstream pose estimation modules invariant under the same unified\nframework. Strong generalizability is achieved via large-scale synthetic\ntraining, aided by a large language model (LLM), a novel transformer-based\narchitecture, and contrastive learning formulation. Extensive evaluation on\nmultiple public datasets involving challenging scenarios and objects indicate\nour unified approach outperforms existing methods specialized for each task by\na large margin. In addition, it even achieves comparable results to\ninstance-level methods despite the reduced assumptions. Project page:\nhttps://nvlabs.github.io/FoundationPose/\n","authors":["Bowen Wen","Wei Yang","Jan Kautz","Stan Birchfield"],"pdf_url":"https://arxiv.org/pdf/2312.08344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18040v1","updated":"2024-03-26T18:52:48Z","published":"2024-03-26T18:52:48Z","title":"Global Point Cloud Registration Network for Large Transformations","summary":" Three-dimensional data registration is an established yet challenging problem\nthat is key in many different applications, such as mapping the environment for\nautonomous vehicles, and modeling objects and people for avatar creation, among\nmany others. Registration refers to the process of mapping multiple data into\nthe same coordinate system by means of matching correspondences and\ntransformation estimation. Novel proposals exploit the benefits of deep\nlearning architectures for this purpose, as they learn the best features for\nthe data, providing better matches and hence results. However, the state of the\nart is usually focused on cases of relatively small transformations, although\nin certain applications and in a real and practical environment, large\ntransformations are very common. In this paper, we present ReLaTo (Registration\nfor Large Transformations), an architecture that faces the cases where large\ntransformations happen while maintaining good performance for local\ntransformations. This proposal uses a novel Softmax pooling layer to find\ncorrespondences in a bilateral consensus manner between two point sets,\nsampling the most confident matches. These matches are used to estimate a\ncoarse and global registration using weighted Singular Value Decomposition\n(SVD). A target-guided denoising step is then applied to both the obtained\nmatches and latent features, estimating the final fine registration considering\nthe local geometry. All these steps are carried out following an end-to-end\napproach, which has been shown to improve 10 state-of-the-art registration\nmethods in two datasets commonly used for this task (ModelNet40 and KITTI),\nespecially in the case of large transformations.\n","authors":["Hanz Cuevas-Velasquez","Alejandro Galán-Cuenca","Antonio Javier Gallego","Marcelo Saval-Calvo","Robert B. Fisher"],"pdf_url":"https://arxiv.org/pdf/2403.18040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18038v1","updated":"2024-03-26T18:49:56Z","published":"2024-03-26T18:49:56Z","title":"TGGLinesPlus: A robust topological graph-guided computer vision\n algorithm for line detection from images","summary":" Line detection is a classic and essential problem in image processing,\ncomputer vision and machine intelligence. 
Line detection has many important\napplications, including image vectorization (e.g., document recognition and art\ndesign), indoor mapping, and important societal challenges (e.g., sea ice\nfracture line extraction from satellite imagery). Many line detection\nalgorithms and methods have been developed, but robust and intuitive methods\nare still lacking. In this paper, we proposed and implemented a topological\ngraph-guided algorithm, named TGGLinesPlus, for line detection. Our experiments\non images from a wide range of domains have demonstrated the flexibility of our\nTGGLinesPlus algorithm. We also benchmarked our algorithm with five classic and\nstate-of-the-art line detection methods and the results demonstrate the\nrobustness of TGGLinesPlus. We hope our open-source implementation of\nTGGLinesPlus will inspire and pave the way for many applications where spatial\nscience matters.\n","authors":["Liping Yang","Joshua Driscol","Ming Gong","Shujie Wang","Catherine G. Potts"],"pdf_url":"https://arxiv.org/pdf/2403.18038v1.pdf","comment":"Our TGGLinesPlus Python implementation is open source. 27 pages, 8\n figures and 4 tables"},{"id":"http://arxiv.org/abs/2403.18036v1","updated":"2024-03-26T18:41:07Z","published":"2024-03-26T18:41:07Z","title":"Move as You Say, Interact as You Can: Language-guided Human Motion\n Generation with Scene Affordance","summary":" Despite significant advancements in text-to-motion synthesis, generating\nlanguage-guided human motion within 3D environments poses substantial\nchallenges. These challenges stem primarily from (i) the absence of powerful\ngenerative models capable of jointly modeling natural language, 3D scenes, and\nhuman motion, and (ii) the generative models' intensive data requirements\ncontrasted with the scarcity of comprehensive, high-quality,\nlanguage-scene-motion datasets. To tackle these issues, we introduce a novel\ntwo-stage framework that employs scene affordance as an intermediate\nrepresentation, effectively linking 3D scene grounding and conditional motion\ngeneration. Our framework comprises an Affordance Diffusion Model (ADM) for\npredicting explicit affordance map and an Affordance-to-Motion Diffusion Model\n(AMDM) for generating plausible human motions. By leveraging scene affordance\nmaps, our method overcomes the difficulty in generating human motion under\nmultimodal condition signals, especially when training with limited data\nlacking extensive language-scene-motion pairs. Our extensive experiments\ndemonstrate that our approach consistently outperforms all baselines on\nestablished benchmarks, including HumanML3D and HUMANISE. 
Additionally, we\nvalidate our model's exceptional generalization capabilities on a specially\ncurated evaluation set featuring previously unseen descriptions and scenes.\n","authors":["Zan Wang","Yixin Chen","Baoxiong Jia","Puhao Li","Jinlu Zhang","Jingze Zhang","Tengyu Liu","Yixin Zhu","Wei Liang","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2403.18036v1.pdf","comment":"CVPR 2024; 16 pages"},{"id":"http://arxiv.org/abs/2403.18035v1","updated":"2024-03-26T18:40:36Z","published":"2024-03-26T18:40:36Z","title":"Bidirectional Consistency Models","summary":" Diffusion models (DMs) are capable of generating remarkably high-quality\nsamples by iteratively denoising a random vector, a process that corresponds to\nmoving along the probability flow ordinary differential equation (PF ODE).\nInterestingly, DMs can also invert an input image to noise by moving backward\nalong the PF ODE, a key operation for downstream tasks such as interpolation\nand image editing. However, the iterative nature of this process restricts its\nspeed, hindering its broader application. Recently, Consistency Models (CMs)\nhave emerged to address this challenge by approximating the integral of the PF\nODE, thereby bypassing the need to iterate. Yet, the absence of an explicit ODE\nsolver complicates the inversion process. To resolve this, we introduce the\nBidirectional Consistency Model (BCM), which learns a single neural network\nthat enables both forward and backward traversal along the PF ODE, efficiently\nunifying generation and inversion tasks within one framework. Notably, our\nproposed method enables one-step generation and inversion while also allowing\nthe use of additional steps to enhance generation quality or reduce\nreconstruction error. Furthermore, by leveraging our model's bidirectional\nconsistency, we introduce a sampling strategy that can enhance FID while\npreserving the generated image content. We further showcase our model's\ncapabilities in several downstream tasks, such as interpolation and inpainting,\nand present demonstrations of potential applications, including blind\nrestoration of compressed images and defending black-box adversarial attacks.\n","authors":["Liangchen Li","Jiajun He"],"pdf_url":"https://arxiv.org/pdf/2403.18035v1.pdf","comment":"40 pages, 25 figures"},{"id":"http://arxiv.org/abs/2403.18033v1","updated":"2024-03-26T18:39:38Z","published":"2024-03-26T18:39:38Z","title":"SpectralWaste Dataset: Multimodal Data for Waste Sorting Automation","summary":" The increase in non-biodegradable waste is a worldwide concern. Recycling\nfacilities play a crucial role, but their automation is hindered by the complex\ncharacteristics of waste recycling lines like clutter or object deformation. In\naddition, the lack of publicly available labeled data for these environments\nmakes developing robust perception systems challenging. Our work explores the\nbenefits of multimodal perception for object segmentation in real waste\nmanagement scenarios. First, we present SpectralWaste, the first dataset\ncollected from an operational plastic waste sorting facility that provides\nsynchronized hyperspectral and conventional RGB images. This dataset contains\nlabels for several categories of objects that commonly appear in sorting plants\nand need to be detected and separated from the main trash flow for several\nreasons, such as security in the management line or reuse. 
Additionally, we\npropose a pipeline employing different object segmentation architectures and\nevaluate the alternatives on our dataset, conducting an extensive analysis for\nboth multimodal and unimodal alternatives. Our evaluation pays special\nattention to efficiency and suitability for real-time processing and\ndemonstrates how HSI can bring a boost to RGB-only perception in these\nrealistic industrial settings without much computational overhead.\n","authors":["Sara Casao","Fernando Peña","Alberto Sabater","Rosa Castillón","Darío Suárez","Eduardo Montijano","Ana C. Murillo"],"pdf_url":"https://arxiv.org/pdf/2403.18033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18028v1","updated":"2024-03-26T18:29:39Z","published":"2024-03-26T18:29:39Z","title":"Predicting species occurrence patterns from partial observations","summary":" To address the interlinked biodiversity and climate crises, we need an\nunderstanding of where species occur and how these patterns are changing.\nHowever, observational data on most species remains very limited, and the\namount of data available varies greatly between taxonomic groups. We introduce\nthe problem of predicting species occurrence patterns given (a) satellite\nimagery, and (b) known information on the occurrence of other species. To\nevaluate algorithms on this task, we introduce SatButterfly, a dataset of\nsatellite images, environmental data and observational data for butterflies,\nwhich is designed to pair with the existing SatBird dataset of bird\nobservational data. To address this task, we propose a general model, R-Tran,\nfor predicting species occurrence patterns that enables the use of partial\nobservational data wherever found. We find that R-Tran outperforms other\nmethods in predicting species encounter rates with partial information both\nwithin a taxon (birds) and across taxa (birds and butterflies). Our approach\nopens new perspectives to leveraging insights from species with abundant data\nto other species with scarce data, by modelling the ecosystems in which they\nco-occur.\n","authors":["Hager Radi Abdelwahed","Mélisande Teng","David Rolnick"],"pdf_url":"https://arxiv.org/pdf/2403.18028v1.pdf","comment":"Tackling Climate Change with Machine Learning workshop at ICLR 2024"},{"id":"http://arxiv.org/abs/2312.09138v2","updated":"2024-03-26T18:16:26Z","published":"2023-12-14T17:09:57Z","title":"Living Scenes: Multi-object Relocalization and Reconstruction in\n Changing 3D Environments","summary":" Research into dynamic 3D scene understanding has primarily focused on\nshort-term change tracking from dense observations, while little attention has\nbeen paid to long-term changes with sparse observations. We address this gap\nwith MoRE, a novel approach for multi-object relocalization and reconstruction\nin evolving environments. We view these environments as \"living scenes\" and\nconsider the problem of transforming scans taken at different points in time\ninto a 3D reconstruction of the object instances, whose accuracy and\ncompleteness increase over time. At the core of our method lies an\nSE(3)-equivariant representation in a single encoder-decoder network, trained\non synthetic data. This representation enables us to seamlessly tackle instance\nmatching, registration, and reconstruction. We also introduce a joint\noptimization algorithm that facilitates the accumulation of point clouds\noriginating from the same instance across multiple scans taken at different\npoints in time. 
We validate our method on synthetic and real-world data and\ndemonstrate state-of-the-art performance in both end-to-end performance and\nindividual subtasks.\n","authors":["Liyuan Zhu","Shengyu Huang","Konrad Schindler","Iro Armeni"],"pdf_url":"https://arxiv.org/pdf/2312.09138v2.pdf","comment":"CVPR 2024 camera-ready"},{"id":"http://arxiv.org/abs/2403.16271v2","updated":"2024-03-26T18:11:28Z","published":"2024-03-24T19:32:39Z","title":"Object Detectors in the Open Environment: Challenges, Solutions, and\n Outlook","summary":" With the emergence of foundation models, deep learning-based object detectors\nhave shown practical usability in closed set scenarios. However, for real-world\ntasks, object detectors often operate in open environments, where crucial\nfactors (e.g., data distribution, objective) that influence model learning are\noften changing. The dynamic and intricate nature of the open environment poses\nnovel and formidable challenges to object detectors. Unfortunately, current\nresearch on object detectors in open environments lacks a comprehensive\nanalysis of their distinctive characteristics, challenges, and corresponding\nsolutions, which hinders their secure deployment in critical real-world\nscenarios. This paper aims to bridge this gap by conducting a comprehensive\nreview and analysis of object detectors in open environments. We initially\nidentified limitations of key structural components within the existing\ndetection pipeline and propose the open environment object detector challenge\nframework that includes four quadrants (i.e., out-of-domain, out-of-category,\nrobust learning, and incremental learning) based on the dimensions of the data\n/ target changes. For each quadrant of challenges in the proposed framework, we\npresent a detailed description and systematic analysis of the overarching goals\nand core difficulties, systematically review the corresponding solutions, and\nbenchmark their performance over multiple widely adopted datasets. In addition,\nwe engage in a discussion of open problems and potential avenues for future\nresearch. This paper aims to provide a fresh, comprehensive, and systematic\nunderstanding of the challenges and solutions associated with open-environment\nobject detectors, thus catalyzing the development of more solid applications in\nreal-world scenarios. A project related to this survey can be found at\nhttps://github.com/LiangSiyuan21/OEOD_Survey.\n","authors":["Siyuan Liang","Wei Wang","Ruoyu Chen","Aishan Liu","Boxi Wu","Ee-Chien Chang","Xiaochun Cao","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.16271v2.pdf","comment":"32 pages, 17 figures"},{"id":"http://arxiv.org/abs/2312.07472v4","updated":"2024-03-26T18:08:05Z","published":"2023-12-12T17:55:45Z","title":"MP5: A Multi-modal Open-ended Embodied System in Minecraft via Active\n Perception","summary":" It is a long-lasting goal to design an embodied system that can solve\nlong-horizon open-world tasks in human-like ways. However, existing approaches\nusually struggle with compound difficulties caused by the logic-aware\ndecomposition and context-aware execution of these tasks. To this end, we\nintroduce MP5, an open-ended multimodal embodied system built upon the\nchallenging Minecraft simulator, which can decompose feasible sub-objectives,\ndesign sophisticated situation-aware plans, and perform embodied action\ncontrol, with frequent communication with a goal-conditioned active perception\nscheme. 
Specifically, MP5 is developed on top of recent advances in Multimodal\nLarge Language Models (MLLMs), and the system is modulated into functional\nmodules that can be scheduled and collaborated to ultimately solve pre-defined\ncontext- and process-dependent tasks. Extensive experiments prove that MP5 can\nachieve a 22% success rate on difficult process-dependent tasks and a 91%\nsuccess rate on tasks that heavily depend on the context. Moreover, MP5\nexhibits a remarkable ability to address many open-ended tasks that are\nentirely novel.\n","authors":["Yiran Qin","Enshen Zhou","Qichang Liu","Zhenfei Yin","Lu Sheng","Ruimao Zhang","Yu Qiao","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2312.07472v4.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2306.16772v5","updated":"2024-03-26T18:04:33Z","published":"2023-06-29T08:13:57Z","title":"Learning from Synthetic Human Group Activities","summary":" The study of complex human interactions and group activities has become a\nfocal point in human-centric computer vision. However, progress in related\ntasks is often hindered by the challenges of obtaining large-scale labeled\ndatasets from real-world scenarios. To address the limitation, we introduce\nM3Act, a synthetic data generator for multi-view multi-group multi-person human\natomic actions and group activities. Powered by Unity Engine, M3Act features\nmultiple semantic groups, highly diverse and photorealistic images, and a\ncomprehensive set of annotations, which facilitates the learning of\nhuman-centered tasks across single-person, multi-person, and multi-group\nconditions. We demonstrate the advantages of M3Act across three core\nexperiments. The results suggest our synthetic dataset can significantly\nimprove the performance of several downstream methods and replace real-world\ndatasets to reduce cost. Notably, M3Act improves the state-of-the-art MOTRv2 on\nDanceTrack dataset, leading to a hop on the leaderboard from 10th to 2nd place.\nMoreover, M3Act opens new research for controllable 3D group activity\ngeneration. We define multiple metrics and propose a competitive baseline for\nthe novel task. Our code and data are available at our project page:\nhttp://cjerry1243.github.io/M3Act.\n","authors":["Che-Jui Chang","Danrui Li","Deep Patel","Parth Goel","Honglu Zhou","Seonghyeon Moon","Samuel S. Sohn","Sejong Yoon","Vladimir Pavlovic","Mubbasir Kapadia"],"pdf_url":"https://arxiv.org/pdf/2306.16772v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17998v1","updated":"2024-03-26T17:59:52Z","published":"2024-03-26T17:59:52Z","title":"Text Is MASS: Modeling as Stochastic Embedding for Text-Video Retrieval","summary":" The increasing prevalence of video clips has sparked growing interest in\ntext-video retrieval. Recent advances focus on establishing a joint embedding\nspace for text and video, relying on consistent embedding representations to\ncompute similarity. However, the text content in existing datasets is generally\nshort and concise, making it hard to fully describe the redundant semantics of\na video. Correspondingly, a single text embedding may be less expressive to\ncapture the video embedding and empower the retrieval. In this study, we\npropose a new stochastic text modeling method T-MASS, i.e., text is modeled as\na stochastic embedding, to enrich text embedding with a flexible and resilient\nsemantic range, yielding a text mass. 
To be specific, we introduce a\nsimilarity-aware radius module to adapt the scale of the text mass upon the\ngiven text-video pairs. Plus, we design and develop a support text\nregularization to further control the text mass during the training. The\ninference pipeline is also tailored to fully exploit the text mass for accurate\nretrieval. Empirical evidence suggests that T-MASS not only effectively\nattracts relevant text-video pairs while distancing irrelevant ones, but also\nenables the determination of precise text embeddings for relevant pairs. Our\nexperimental results show a substantial improvement of T-MASS over baseline (3%\nto 6.3% by R@1). Also, T-MASS achieves state-of-the-art performance on five\nbenchmark datasets, including MSRVTT, LSMDC, DiDeMo, VATEX, and Charades.\n","authors":["Jiamian Wang","Guohao Sun","Pichao Wang","Dongfang Liu","Sohail Dianat","Majid Rabbani","Raghuveer Rao","Zhiqiang Tao"],"pdf_url":"https://arxiv.org/pdf/2403.17998v1.pdf","comment":"Accepted by CVPR 2024, code and model are available at\n https://github.com/Jiamian-Wang/T-MASS-text-video-retrieval"},{"id":"http://arxiv.org/abs/2403.17837v1","updated":"2024-03-26T16:24:42Z","published":"2024-03-26T16:24:42Z","title":"GTA-HDR: A Large-Scale Synthetic Dataset for HDR Image Reconstruction","summary":" High Dynamic Range (HDR) content (i.e., images and videos) has a broad range\nof applications. However, capturing HDR content from real-world scenes is\nexpensive and time-consuming. Therefore, the challenging task of reconstructing\nvisually accurate HDR images from their Low Dynamic Range (LDR) counterparts is\ngaining attention in the vision research community. A major challenge in this\nresearch problem is the lack of datasets, which capture diverse scene\nconditions (e.g., lighting, shadows, weather, locations, landscapes, objects,\nhumans, buildings) and various image features (e.g., color, contrast,\nsaturation, hue, luminance, brightness, radiance). To address this gap, in this\npaper, we introduce GTA-HDR, a large-scale synthetic dataset of photo-realistic\nHDR images sampled from the GTA-V video game. We perform thorough evaluation of\nthe proposed dataset, which demonstrates significant qualitative and\nquantitative improvements of the state-of-the-art HDR image reconstruction\nmethods. Furthermore, we demonstrate the effectiveness of the proposed dataset\nand its impact on additional computer vision tasks including 3D human pose\nestimation, human body part segmentation, and holistic scene segmentation. The\ndataset, data collection pipeline, and evaluation code are available at:\nhttps://github.com/HrishavBakulBarua/GTA-HDR.\n","authors":["Hrishav Bakul Barua","Kalin Stefanov","KokSheik Wong","Abhinav Dhall","Ganesh Krishnasamy"],"pdf_url":"https://arxiv.org/pdf/2403.17837v1.pdf","comment":"Submitted to IEEE"},{"id":"http://arxiv.org/abs/2403.17757v1","updated":"2024-03-26T14:49:22Z","published":"2024-03-26T14:49:22Z","title":"Noise2Noise Denoising of CRISM Hyperspectral Data","summary":" Hyperspectral data acquired by the Compact Reconnaissance Imaging\nSpectrometer for Mars (CRISM) have allowed for unparalleled mapping of the\nsurface mineralogy of Mars. Due to sensor degradation over time, a significant\nportion of the recently acquired data is considered unusable. Here a new\ndata-driven model architecture, Noise2Noise4Mars (N2N4M), is introduced to\nremove noise from CRISM images. 
Our model is self-supervised and does not\nrequire zero-noise target data, making it well suited for use in Planetary\nScience applications where high quality labelled data is scarce. We demonstrate\nits strong performance on synthetic-noise data and CRISM images, and its impact\non downstream classification performance, outperforming benchmark methods on\nmost metrics. This allows for detailed analysis for critical sites of interest\non the Martian surface, including proposed lander sites.\n","authors":["Robert Platt","Rossella Arcucci","Cédric M. John"],"pdf_url":"https://arxiv.org/pdf/2403.17757v1.pdf","comment":"5 pages, 3 figures. Accepted as a conference paper at the ICLR 2024\n ML4RS Workshop"},{"id":"http://arxiv.org/abs/2403.17995v1","updated":"2024-03-26T14:47:05Z","published":"2024-03-26T14:47:05Z","title":"Semi-Supervised Image Captioning Considering Wasserstein Graph Matching","summary":" Image captioning can automatically generate captions for the given images,\nand the key challenge is to learn a mapping function from visual features to\nnatural language features. Existing approaches are mostly supervised ones,\ni.e., each image has a corresponding sentence in the training set. However,\nconsidering that describing images always requires a huge amount of manpower, we\nusually have a limited amount of described images (i.e., image-text pairs) and a\nlarge number of undescribed images in real-world applications. Thereby, a\ndilemma is the \"Semi-Supervised Image Captioning\". To solve this problem, we\npropose a novel Semi-Supervised Image Captioning method considering Wasserstein\nGraph Matching (SSIC-WGM), which turns to adopt the raw image inputs to\nsupervise the generated sentences. Different from traditional single modal\nsemi-supervised methods, the difficulty of semi-supervised cross-modal learning\nlies in constructing intermediately comparable information among heterogeneous\nmodalities. In this paper, SSIC-WGM adopts the successful scene graphs as\nintermediate information, and constrains the generated sentences from two\naspects: 1) inter-modal consistency. SSIC-WGM constructs the scene graphs of\nthe raw image and generated sentence respectively, then employs the Wasserstein\ndistance to better measure the similarity between region embeddings of\ndifferent graphs. 2) intra-modal consistency. SSIC-WGM takes the data\naugmentation techniques for the raw images, then constrains the consistency\namong augmented images and generated sentences. Consequently, SSIC-WGM combines\nthe cross-modal pseudo supervision and structure invariant measure for\nefficiently using the undescribed images, and learns a more reasonable mapping\nfunction.\n","authors":["Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2403.17995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17725v1","updated":"2024-03-26T14:13:44Z","published":"2024-03-26T14:13:44Z","title":"Deep Learning for Segmentation of Cracks in High-Resolution Images of\n Steel Bridges","summary":" Automating the current bridge visual inspection practices using drones and\nimage processing techniques is a prominent way to make these inspections more\neffective, robust, and less expensive. In this paper, we investigate the\ndevelopment of a novel deep-learning method for the detection of fatigue cracks\nin high-resolution images of steel bridges. First, we present a novel and\nchallenging dataset comprising images of cracks in steel bridges. 
Secondly,\nwe integrate the ConvNext neural network with a previous state-of-the-art\nencoder-decoder network for crack segmentation. We study and report the\neffects of the use of background patches on the network performance when\napplied to high-resolution images of cracks in steel bridges. Finally, we\nintroduce a loss function that allows the use of more background patches for\nthe training process, which yields a significant reduction in false positive\nrates.\n","authors":["Andrii Kompanets","Gautam Pai","Remco Duits","Davide Leonetti","Bert Snijder"],"pdf_url":"https://arxiv.org/pdf/2403.17725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17994v1","updated":"2024-03-26T13:50:39Z","published":"2024-03-26T13:50:39Z","title":"Solution for Point Tracking Task of ICCV 1st Perception Test Challenge\n 2023","summary":" This report proposes an improved method for the Tracking Any Point (TAP)\ntask, which tracks any physical surface through a video. Several existing\napproaches have explored the TAP by considering the temporal relationships to\nobtain smooth point motion trajectories, however, they still suffer from the\ncumulative error caused by temporal prediction. To address this issue, we\npropose a simple yet effective approach called TAP with confident static points\n(TAPIR+), which focuses on rectifying the tracking of the static point in the\nvideos shot by a static camera. To clarify, our approach contains two key\ncomponents: (1) Multi-granularity Camera Motion Detection, which could identify\nthe video sequence by the static camera shot. (2) CMR-based point trajectory\nprediction with one moving object segmentation approach to isolate the static\npoint from the moving object. Our approach ranked first in the final test with\na score of 0.46.\n","authors":["Hongpeng Pan","Yang Yang","Zhongtian Fu","Yuxuan Zhang","Shian Du","Yi Xu","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2403.17994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17332v1","updated":"2024-03-26T02:32:52Z","published":"2024-03-26T02:32:52Z","title":"Labeling subtypes in a Parkinson's Cohort using Multifeatures in MRI --\n Integrating Grey and White Matter Information","summary":" Thresholding of networks has long posed a challenge in brain connectivity\nanalysis. Weighted networks are typically binarized using threshold measures to\nfacilitate network analysis. Previous studies on MRI-based brain networks have\npredominantly utilized density or sparsity-based thresholding techniques,\noptimized within specific ranges derived from network metrics such as path\nlength, clustering coefficient, and small-world index. Thus, determination of a\nsingle threshold value for facilitating comparative analysis of networks\nremains elusive. To address this, our study introduces Mutual K-Nearest\nNeighbor (MKNN)-based thresholding for brain network analysis. Here, nearest\nneighbor selection is based on the highest correlation between features of\nbrain regions. Construction of brain networks was accomplished by computing\nPearson correlations between grey matter volume and white matter volume for\neach pair of brain regions. Structural MRI data from 180 Parkinson's patients\nand 70 controls from the NIMHANS, India were analyzed. Subtypes within\nParkinson's disease were identified based on grey and white matter volume\natrophy using source-based morphometric decomposition. The loading coefficients\nwere correlated with clinical features to discern the clinical relationship with\nthe deciphered subtypes. 
Our data-mining approach revealed: Subtype A (N = 51,\nintermediate type), Subtype B (N = 57, mild-severe type with mild motor\nsymptoms), and Subtype AB (N = 36, most-severe type with predominance in motor\nimpairment). Subtype-specific weighted matrices were binarized using MKNN-based\nthresholding for brain network analysis. Permutation tests on network metrics\nof resulting bipartite graphs demonstrated significant group differences in\nbetweenness centrality and participation coefficient. The identified hubs were\nspecific to each subtype, with some hubs conserved across different subtypes.\n","authors":["Tanmayee Samantaray","Jitender Saini","Pramod Kumar Pal","Bithiah Grace Jaganathan","Vijaya V Saradhi","Gupta CN"],"pdf_url":"https://arxiv.org/pdf/2403.17332v1.pdf","comment":"31 pages, 10 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.18873v1","updated":"2024-03-26T14:42:46Z","published":"2024-03-26T14:42:46Z","title":"Predicting risk of cardiovascular disease using retinal OCT imaging","summary":" We investigated the potential of optical coherence tomography (OCT) as an\nadditional imaging technique to predict future cardiovascular disease (CVD). We\nutilised a self-supervised deep learning approach based on Variational\nAutoencoders (VAE) to learn low-dimensional representations of high-dimensional\n3D OCT images and to capture distinct characteristics of different retinal\nlayers within the OCT image. A Random Forest (RF) classifier was subsequently\ntrained using the learned latent features and participant demographic and\nclinical data, to differentiate between patients at risk of CVD events (MI or\nstroke) and non-CVD cases. Our predictive model, trained on multimodal data,\nwas assessed based on its ability to correctly identify individuals likely to\nsuffer from a CVD event (MI or stroke) within a 5-year interval after image\nacquisition. Our self-supervised VAE feature selection and multimodal Random\nForest classifier differentiate between patients at risk of future CVD events\nand the control group with an AUC of 0.75, outperforming the clinically\nestablished QRISK3 score (AUC = 0.597). The choroidal layer visible in OCT\nimages was identified as an important predictor of future CVD events using a\nnovel approach to model explainability. Retinal OCT imaging provides a\ncost-effective and non-invasive alternative to predict the risk of\ncardiovascular disease and is readily accessible in optometry practices and\nhospitals.\n","authors":["Cynthia Maldonado-Garcia","Rodrigo Bonazzola","Enzo Ferrante","Thomas H Julian","Panagiotis I Sergouniotis","Nishant Ravikumara","Alejandro F Frangi"],"pdf_url":"https://arxiv.org/pdf/2403.18873v1.pdf","comment":"18 pages for main manuscript, 7 figures, 2 pages for appendix and\n preprint for a journal"},{"id":"http://arxiv.org/abs/2403.18871v1","updated":"2024-03-26T11:40:06Z","published":"2024-03-26T11:40:06Z","title":"Clinical Domain Knowledge-Derived Template Improves Post Hoc AI\n Explanations in Pneumothorax Classification","summary":" Background: Pneumothorax is an acute thoracic disease caused by abnormal air\ncollection between the lungs and chest wall. To address the opaqueness often\nassociated with deep learning (DL) models, explainable artificial intelligence\n(XAI) methods have been introduced to outline regions related to pneumothorax\ndiagnoses made by DL models. However, these explanations sometimes diverge from\nactual lesion areas, highlighting the need for further improvement. 
Method: We\npropose a template-guided approach to incorporate the clinical knowledge of\npneumothorax into model explanations generated by XAI methods, thereby\nenhancing the quality of these explanations. Utilizing one lesion delineation\ncreated by radiologists, our approach first generates a template that\nrepresents potential areas of pneumothorax occurrence. This template is then\nsuperimposed on model explanations to filter out extraneous explanations that\nfall outside the template's boundaries. To validate its efficacy, we carried\nout a comparative analysis of three XAI methods with and without our template\nguidance when explaining two DL models in two real-world datasets. Results: The\nproposed approach consistently improved baseline XAI methods across twelve\nbenchmark scenarios built on three XAI methods, two DL models, and two\ndatasets. The average incremental percentages, calculated by the performance\nimprovements over the baseline performance, were 97.8% in Intersection over\nUnion (IoU) and 94.1% in Dice Similarity Coefficient (DSC) when comparing model\nexplanations and ground-truth lesion areas. Conclusions: In the context of\npneumothorax diagnoses, we proposed a template-guided approach for improving AI\nexplanations. We anticipate that our template guidance will forge a fresh\napproach to elucidating AI models by integrating clinical domain expertise.\n","authors":["Han Yuan","Chuan Hong","Pengtao Jiang","Gangming Zhao","Nguyen Tuan Anh Tran","Xinxing Xu","Yet Yen Yan","Nan Liu"],"pdf_url":"https://arxiv.org/pdf/2403.18871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18870v1","updated":"2024-03-26T11:23:08Z","published":"2024-03-26T11:23:08Z","title":"SugarcaneNet2024: An Optimized Weighted Average Ensemble Approach of\n LASSO Regularized Pre-trained Models for Sugarcane Disease Classification","summary":" Sugarcane, a key crop for the world's sugar industry, is prone to several\ndiseases that have a substantial negative influence on both its yield and\nquality. To effectively manage and implement preventative initiatives, diseases\nmust be detected promptly and accurately. In this study, we present a unique\nmodel called sugarcaneNet2024 that outperforms previous methods for\nautomatically and quickly detecting sugarcane disease through leaf image\nprocessing. Our proposed model consolidates an optimized weighted average\nensemble of seven customized and LASSO-regularized pre-trained models,\nparticularly InceptionV3, InceptionResNetV2, DenseNet201, DenseNet169,\nXception, and ResNet152V2. Initially, we added three more dense layers with\n0.0001 LASSO regularization, three 30% dropout layers, and three batch\nnormalizations with renorm enabled at the bottom of these pre-trained models to\nimprove the performance. The accuracy of sugarcane leaf disease classification\nwas greatly increased by this addition. Following this, several comparative\nstudies between the average ensemble and individual models were carried out,\nindicating that the ensemble technique performed better. The average ensemble\nof all modified pre-trained models produced outstanding outcomes: 100%, 99%,\n99%, and 99.45% for f1 score, precision, recall, and accuracy, respectively.\nPerformance was further enhanced by the implementation of an optimized weighted\naverage ensemble technique incorporated with grid search. 
This optimized\nsugarcaneNet2024 model performed the best for detecting sugarcane diseases,\nhaving achieved accuracy, precision, recall, and F1 score of 99.67%, 100%,\n100%, and 100% , respectively.\n","authors":["Md. Simul Hasan Talukder","Sharmin Akter","Abdullah Hafez Nur"],"pdf_url":"https://arxiv.org/pdf/2403.18870v1.pdf","comment":"32 pages, 11 Figures, 13 Tables"}]},"2024-03-27T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.18821v1","updated":"2024-03-27T17:59:56Z","published":"2024-03-27T17:59:56Z","title":"Real Acoustic Fields: An Audio-Visual Room Acoustics Dataset and\n Benchmark","summary":" We present a new dataset called Real Acoustic Fields (RAF) that captures real\nacoustic room data from multiple modalities. The dataset includes high-quality\nand densely captured room impulse response data paired with multi-view images,\nand precise 6DoF pose tracking data for sound emitters and listeners in the\nrooms. We used this dataset to evaluate existing methods for novel-view\nacoustic synthesis and impulse response generation which previously relied on\nsynthetic data. In our evaluation, we thoroughly assessed existing audio and\naudio-visual models against multiple criteria and proposed settings to enhance\ntheir performance on real-world data. We also conducted experiments to\ninvestigate the impact of incorporating visual data (i.e., images and depth)\ninto neural acoustic field models. Additionally, we demonstrated the\neffectiveness of a simple sim2real approach, where a model is pre-trained with\nsimulated data and fine-tuned with sparse real-world data, resulting in\nsignificant improvements in the few-shot learning approach. RAF is the first\ndataset to provide densely captured room acoustic data, making it an ideal\nresource for researchers working on audio and audio-visual neural acoustic\nfield modeling techniques. Demos and datasets are available on our project\npage: https://facebookresearch.github.io/real-acoustic-fields/\n","authors":["Ziyang Chen","Israel D. Gebru","Christian Richardt","Anurag Kumar","William Laney","Andrew Owens","Alexander Richard"],"pdf_url":"https://arxiv.org/pdf/2403.18821v1.pdf","comment":"Accepted to CVPR 2024. Project site:\n https://facebookresearch.github.io/real-acoustic-fields/"},{"id":"http://arxiv.org/abs/2403.18820v1","updated":"2024-03-27T17:59:54Z","published":"2024-03-27T17:59:54Z","title":"MetaCap: Meta-learning Priors from Multi-View Imagery for Sparse-view\n Human Performance Capture and Rendering","summary":" Faithful human performance capture and free-view rendering from sparse RGB\nobservations is a long-standing problem in Vision and Graphics. The main\nchallenges are the lack of observations and the inherent ambiguities of the\nsetting, e.g. occlusions and depth ambiguity. As a result, radiance fields,\nwhich have shown great promise in capturing high-frequency appearance and\ngeometry details in dense setups, perform poorly when na\\\"ively supervising\nthem on sparse camera views, as the field simply overfits to the sparse-view\ninputs. To address this, we propose MetaCap, a method for efficient and\nhigh-quality geometry recovery and novel view synthesis given very sparse or\neven a single view of the human. Our key idea is to meta-learn the radiance\nfield weights solely from potentially sparse multi-view videos, which can serve\nas a prior when fine-tuning them on sparse imagery depicting the human. 
This\nprior provides a good network weight initialization, thereby effectively\naddressing ambiguities in sparse-view capture. Due to the articulated structure\nof the human body and motion-induced surface deformations, learning such a\nprior is non-trivial. Therefore, we propose to meta-learn the field weights in\na pose-canonicalized space, which reduces the spatial feature range and makes\nfeature learning more effective. Consequently, one can fine-tune our field\nparameters to quickly generalize to unseen poses, novel illumination conditions\nas well as novel and sparse (even monocular) camera views. For evaluating our\nmethod under different scenarios, we collect a new dataset, WildDynaCap, which\ncontains subjects captured in, both, a dense camera dome and in-the-wild sparse\ncamera rigs, and demonstrate superior results compared to recent\nstate-of-the-art methods on both public and WildDynaCap dataset.\n","authors":["Guoxing Sun","Rishabh Dabral","Pascal Fua","Christian Theobalt","Marc Habermann"],"pdf_url":"https://arxiv.org/pdf/2403.18820v1.pdf","comment":"Project page: https://vcai.mpi-inf.mpg.de/projects/MetaCap/"},{"id":"http://arxiv.org/abs/2403.18819v1","updated":"2024-03-27T17:59:53Z","published":"2024-03-27T17:59:53Z","title":"Benchmarking Object Detectors with COCO: A New Path Forward","summary":" The Common Objects in Context (COCO) dataset has been instrumental in\nbenchmarking object detectors over the past decade. Like every dataset, COCO\ncontains subtle errors and imperfections stemming from its annotation\nprocedure. With the advent of high-performing models, we ask whether these\nerrors of COCO are hindering its utility in reliably benchmarking further\nprogress. In search for an answer, we inspect thousands of masks from COCO\n(2017 version) and uncover different types of errors such as imprecise mask\nboundaries, non-exhaustively annotated instances, and mislabeled masks. Due to\nthe prevalence of COCO, we choose to correct these errors to maintain\ncontinuity with prior research. We develop COCO-ReM (Refined Masks), a cleaner\nset of annotations with visibly better mask quality than COCO-2017. We evaluate\nfifty object detectors and find that models that predict visually sharper masks\nscore higher on COCO-ReM, affirming that they were being incorrectly penalized\ndue to errors in COCO-2017. Moreover, our models trained using COCO-ReM\nconverge faster and score higher than their larger variants trained using\nCOCO-2017, highlighting the importance of data quality in improving object\ndetectors. With these findings, we advocate using COCO-ReM for future object\ndetection research. Our dataset is available at https://cocorem.xyz\n","authors":["Shweta Singh","Aayan Yadav","Jitesh Jain","Humphrey Shi","Justin Johnson","Karan Desai"],"pdf_url":"https://arxiv.org/pdf/2403.18819v1.pdf","comment":"Technical report. Dataset website: https://cocorem.xyz and code:\n https://github.com/kdexd/coco-rem"},{"id":"http://arxiv.org/abs/2403.18818v1","updated":"2024-03-27T17:59:52Z","published":"2024-03-27T17:59:52Z","title":"ObjectDrop: Bootstrapping Counterfactuals for Photorealistic Object\n Removal and Insertion","summary":" Diffusion models have revolutionized image editing but often generate images\nthat violate physical laws, particularly the effects of objects on the scene,\ne.g., occlusions, shadows, and reflections. By analyzing the limitations of\nself-supervised approaches, we propose a practical solution centered on a\n\\q{counterfactual} dataset. 
Our method involves capturing a scene before and\nafter removing a single object, while minimizing other changes. By fine-tuning\na diffusion model on this dataset, we are able to not only remove objects but\nalso their effects on the scene. However, we find that applying this approach\nfor photorealistic object insertion requires an impractically large dataset. To\ntackle this challenge, we propose bootstrap supervision; leveraging our object\nremoval model trained on a small counterfactual dataset, we synthetically\nexpand this dataset considerably. Our approach significantly outperforms prior\nmethods in photorealistic object removal and insertion, particularly at\nmodeling the effects of objects on the scene.\n","authors":["Daniel Winter","Matan Cohen","Shlomi Fruchter","Yael Pritch","Alex Rav-Acha","Yedid Hoshen"],"pdf_url":"https://arxiv.org/pdf/2403.18818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18816v1","updated":"2024-03-27T17:59:33Z","published":"2024-03-27T17:59:33Z","title":"Garment3DGen: 3D Garment Stylization and Texture Generation","summary":" We introduce Garment3DGen a new method to synthesize 3D garment assets from a\nbase mesh given a single input image as guidance. Our proposed approach allows\nusers to generate 3D textured clothes based on both real and synthetic images,\nsuch as those generated by text prompts. The generated assets can be directly\ndraped and simulated on human bodies. First, we leverage the recent progress of\nimage to 3D diffusion methods to generate 3D garment geometries. However, since\nthese geometries cannot be utilized directly for downstream tasks, we propose\nto use them as pseudo ground-truth and set up a mesh deformation optimization\nprocedure that deforms a base template mesh to match the generated 3D target.\nSecond, we introduce carefully designed losses that allow the input base mesh\nto freely deform towards the desired target, yet preserve mesh quality and\ntopology such that they can be simulated. Finally, a texture estimation module\ngenerates high-fidelity texture maps that are globally and locally consistent\nand faithfully capture the input guidance, allowing us to render the generated\n3D assets. With Garment3DGen users can generate the textured 3D garment of\ntheir choice without the need of artist intervention. One can provide a textual\nprompt describing the garment they desire to generate a simulation-ready 3D\nasset. We present a plethora of quantitative and qualitative comparisons on\nvarious assets both real and generated and provide use-cases of how one can\ngenerate simulation-ready 3D garments.\n","authors":["Nikolaos Sarafianos","Tuur Stuyck","Xiaoyu Xiang","Yilei Li","Jovan Popovic","Rakesh Ranjan"],"pdf_url":"https://arxiv.org/pdf/2403.18816v1.pdf","comment":"Project Page: https://nsarafianos.github.io/garment3dgen"},{"id":"http://arxiv.org/abs/2403.18814v1","updated":"2024-03-27T17:59:04Z","published":"2024-03-27T17:59:04Z","title":"Mini-Gemini: Mining the Potential of Multi-modality Vision Language\n Models","summary":" In this work, we introduce Mini-Gemini, a simple and effective framework\nenhancing multi-modality Vision Language Models (VLMs). Despite the\nadvancements in VLMs facilitating basic visual dialog and reasoning, a\nperformance gap persists compared to advanced models like GPT-4 and Gemini. 
We\ntry to narrow the gap by mining the potential of VLMs for better performance\nand any-to-any workflow from three aspects, i.e., high-resolution visual\ntokens, high-quality data, and VLM-guided generation. To enhance visual tokens,\nwe propose to utilize an additional visual encoder for high-resolution\nrefinement without increasing the visual token count. We further construct a\nhigh-quality dataset that promotes precise image comprehension and\nreasoning-based generation, expanding the operational scope of current VLMs. In\ngeneral, Mini-Gemini further mines the potential of VLMs and empowers current\nframeworks with image understanding, reasoning, and generation simultaneously.\nMini-Gemini supports a series of dense and MoE Large Language Models (LLMs)\nfrom 2B to 34B. It is demonstrated to achieve leading performance in several\nzero-shot benchmarks and even surpasses the developed private models. Code and\nmodels are available at https://github.com/dvlab-research/MiniGemini.\n","authors":["Yanwei Li","Yuechen Zhang","Chengyao Wang","Zhisheng Zhong","Yixin Chen","Ruihang Chu","Shaoteng Liu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2403.18814v1.pdf","comment":"Code and models are available at\n https://github.com/dvlab-research/MiniGemini"},{"id":"http://arxiv.org/abs/2403.18811v1","updated":"2024-03-27T17:57:02Z","published":"2024-03-27T17:57:02Z","title":"Duolando: Follower GPT with Off-Policy Reinforcement Learning for Dance\n Accompaniment","summary":" We introduce a novel task within the field of 3D dance generation, termed\ndance accompaniment, which necessitates the generation of responsive movements\nfrom a dance partner, the \"follower\", synchronized with the lead dancer's\nmovements and the underlying musical rhythm. Unlike existing solo or group\ndance generation tasks, a duet dance scenario entails a heightened degree of\ninteraction between the two participants, requiring delicate coordination in\nboth pose and position. To support this task, we first build a large-scale and\ndiverse duet interactive dance dataset, DD100, by recording about 117 minutes\nof professional dancers' performances. To address the challenges inherent in\nthis task, we propose a GPT-based model, Duolando, which autoregressively\npredicts the subsequent tokenized motion conditioned on the coordinated\ninformation of the music, the leader's and the follower's movements. To further\nenhance the GPT's capabilities of generating stable results on unseen\nconditions (music and leader motions), we devise an off-policy reinforcement\nlearning strategy that allows the model to explore viable trajectories from\nout-of-distribution samplings, guided by human-defined rewards. Based on the\ncollected dataset and proposed method, we establish a benchmark with several\ncarefully designed metrics.\n","authors":["Li Siyao","Tianpei Gu","Zhitao Yang","Zhengyu Lin","Ziwei Liu","Henghui Ding","Lei Yang","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2403.18811v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.18807v1","updated":"2024-03-27T17:53:30Z","published":"2024-03-27T17:53:30Z","title":"ECoDepth: Effective Conditioning of Diffusion Models for Monocular Depth\n Estimation","summary":" In the absence of parallax cues, a learning-based single image depth\nestimation (SIDE) model relies heavily on shading and contextual cues in the\nimage. While this simplicity is attractive, it is necessary to train such\nmodels on large and varied datasets, which are difficult to capture. 
It has\nbeen shown that using embeddings from pre-trained foundational models, such as\nCLIP, improves zero shot transfer in several applications. Taking inspiration\nfrom this, in our paper we explore the use of global image priors generated\nfrom a pre-trained ViT model to provide more detailed contextual information.\nWe argue that the embedding vector from a ViT model, pre-trained on a large\ndataset, captures greater relevant information for SIDE than the usual route of\ngenerating pseudo image captions, followed by CLIP based text embeddings. Based\non this idea, we propose a new SIDE model using a diffusion backbone which is\nconditioned on ViT embeddings. Our proposed design establishes a new\nstate-of-the-art (SOTA) for SIDE on NYUv2 dataset, achieving Abs Rel error of\n0.059(14% improvement) compared to 0.069 by the current SOTA (VPD). And on\nKITTI dataset, achieving Sq Rel error of 0.139 (2% improvement) compared to\n0.142 by the current SOTA (GEDepth). For zero-shot transfer with a model\ntrained on NYUv2, we report mean relative improvement of (20%, 23%, 81%, 25%)\nover NeWCRFs on (Sun-RGBD, iBims1, DIODE, HyperSim) datasets, compared to (16%,\n18%, 45%, 9%) by ZoeDepth. The code is available at\nhttps://github.com/Aradhye2002/EcoDepth.\n","authors":["Suraj Patni","Aradhye Agarwal","Chetan Arora"],"pdf_url":"https://arxiv.org/pdf/2403.18807v1.pdf","comment":"Accepted at IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR) 2024"},{"id":"http://arxiv.org/abs/2311.10319v4","updated":"2024-03-27T17:41:50Z","published":"2023-11-17T04:04:29Z","title":"Shifting to Machine Supervision: Annotation-Efficient Semi and\n Self-Supervised Learning for Automatic Medical Image Segmentation and\n Classification","summary":" Advancements in clinical treatment are increasingly constrained by the\nlimitations of supervised learning techniques, which depend heavily on large\nvolumes of annotated data. The annotation process is not only costly but also\ndemands substantial time from clinical specialists. Addressing this issue, we\nintroduce the S4MI (Self-Supervision and Semi-Supervision for Medical Imaging)\npipeline, a novel approach that leverages advancements in self-supervised and\nsemi-supervised learning. These techniques engage in auxiliary tasks that do\nnot require labeling, thus simplifying the scaling of machine supervision\ncompared to fully-supervised methods. Our study benchmarks these techniques on\nthree distinct medical imaging datasets to evaluate their effectiveness in\nclassification and segmentation tasks. Notably, we observed that self\nsupervised learning significantly surpassed the performance of supervised\nmethods in the classification of all evaluated datasets. Remarkably, the\nsemi-supervised approach demonstrated superior outcomes in segmentation,\noutperforming fully-supervised methods while using 50% fewer labels across all\ndatasets. In line with our commitment to contributing to the scientific\ncommunity, we have made the S4MI code openly accessible, allowing for broader\napplication and further development of these methods.\n","authors":["Pranav Singh","Raviteja Chukkapalli","Shravan Chaudhari","Luoyao Chen","Mei Chen","Jinqian Pan","Craig Smuda","Jacopo Cirrone"],"pdf_url":"https://arxiv.org/pdf/2311.10319v4.pdf","comment":"Seventeen pages (incl. 
references), five figures, and one table.\n (Under Review)"},{"id":"http://arxiv.org/abs/2403.18795v1","updated":"2024-03-27T17:40:14Z","published":"2024-03-27T17:40:14Z","title":"Gamba: Marry Gaussian Splatting with Mamba for single view 3D\n reconstruction","summary":" We tackle the challenge of efficiently reconstructing a 3D asset from a\nsingle image with growing demands for automated 3D content creation pipelines.\nPrevious methods primarily rely on Score Distillation Sampling (SDS) and Neural\nRadiance Fields (NeRF). Despite their significant success, these approaches\nencounter practical limitations due to lengthy optimization and considerable\nmemory usage. In this report, we introduce Gamba, an end-to-end amortized 3D\nreconstruction model from single-view images, emphasizing two main insights:\n(1) 3D representation: leveraging a large number of 3D Gaussians for an\nefficient 3D Gaussian splatting process; (2) Backbone design: introducing a\nMamba-based sequential network that facilitates context-dependent reasoning and\nlinear scalability with the sequence (token) length, accommodating a\nsubstantial number of Gaussians. Gamba incorporates significant advancements in\ndata preprocessing, regularization design, and training methodologies. We\nassessed Gamba against existing optimization-based and feed-forward 3D\ngeneration approaches using the real-world scanned OmniObject3D dataset. Here,\nGamba demonstrates competitive generation capabilities, both qualitatively and\nquantitatively, while achieving remarkable speed, approximately 0.6 second on a\nsingle NVIDIA A100 GPU.\n","authors":["Qiuhong Shen","Xuanyu Yi","Zike Wu","Pan Zhou","Hanwang Zhang","Shuicheng Yan","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18791v1","updated":"2024-03-27T17:35:24Z","published":"2024-03-27T17:35:24Z","title":"Object Pose Estimation via the Aggregation of Diffusion Features","summary":" Estimating the pose of objects from images is a crucial task of 3D scene\nunderstanding, and recent approaches have shown promising results on very large\nbenchmarks. However, these methods experience a significant performance drop\nwhen dealing with unseen objects. We believe that it results from the limited\ngeneralizability of image features. To address this problem, we have an\nin-depth analysis on the features of diffusion models, e.g. Stable Diffusion,\nwhich hold substantial potential for modeling unseen objects. Based on this\nanalysis, we then innovatively introduce these diffusion features for object\npose estimation. To achieve this, we propose three distinct architectures that\ncan effectively capture and aggregate diffusion features of different\ngranularity, greatly improving the generalizability of object pose estimation.\nOur approach outperforms the state-of-the-art methods by a considerable margin\non three popular benchmark datasets, LM, O-LM, and T-LESS. In particular, our\nmethod achieves higher accuracy than the previous best arts on unseen objects:\n98.2% vs. 93.5% on Unseen LM, 85.9% vs. 76.3% on Unseen O-LM, showing the\nstrong generalizability of our method. 
Our code is released at\nhttps://github.com/Tianfu18/diff-feats-pose.\n","authors":["Tianfu Wang","Guosheng Hu","Hongguang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18791v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2403.18784v1","updated":"2024-03-27T17:32:04Z","published":"2024-03-27T17:32:04Z","title":"SplatFace: Gaussian Splat Face Reconstruction Leveraging an Optimizable\n Surface","summary":" We present SplatFace, a novel Gaussian splatting framework designed for 3D\nhuman face reconstruction without reliance on accurate pre-determined geometry.\nOur method is designed to simultaneously deliver both high-quality novel view\nrendering and accurate 3D mesh reconstructions. We incorporate a generic 3D\nMorphable Model (3DMM) to provide a surface geometric structure, making it\npossible to reconstruct faces with a limited set of input images. We introduce\na joint optimization strategy that refines both the Gaussians and the morphable\nsurface through a synergistic non-rigid alignment process. A novel distance\nmetric, splat-to-surface, is proposed to improve alignment by considering both\nthe Gaussian position and covariance. The surface information is also utilized\nto incorporate a world-space densification process, resulting in superior\nreconstruction quality. Our experimental analysis demonstrates that the\nproposed method is competitive with both other Gaussian splatting techniques in\nnovel view synthesis and other 3D reconstruction methods in producing 3D face\nmeshes with high geometric precision.\n","authors":["Jiahao Luo","Jing Liu","James Davis"],"pdf_url":"https://arxiv.org/pdf/2403.18784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18775v1","updated":"2024-03-27T17:23:39Z","published":"2024-03-27T17:23:39Z","title":"ImageNet-D: Benchmarking Neural Network Robustness on Diffusion\n Synthetic Object","summary":" We establish rigorous benchmarks for visual perception robustness. Synthetic\nimages such as ImageNet-C, ImageNet-9, and Stylized ImageNet provide specific\ntype of evaluation over synthetic corruptions, backgrounds, and textures, yet\nthose robustness benchmarks are restricted in specified variations and have low\nsynthetic quality. In this work, we introduce generative model as a data source\nfor synthesizing hard images that benchmark deep models' robustness. Leveraging\ndiffusion models, we are able to generate images with more diversified\nbackgrounds, textures, and materials than any prior work, where we term this\nbenchmark as ImageNet-D. Experimental results show that ImageNet-D results in a\nsignificant accuracy drop to a range of vision models, from the standard ResNet\nvisual classifier to the latest foundation models like CLIP and MiniGPT-4,\nsignificantly reducing their accuracy by up to 60\\%. Our work suggests that\ndiffusion models can be an effective source to test vision models. 
The code and\ndataset are available at https://github.com/chenshuang-zhang/imagenet_d.\n","authors":["Chenshuang Zhang","Fei Pan","Junmo Kim","In So Kweon","Chengzhi Mao"],"pdf_url":"https://arxiv.org/pdf/2403.18775v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2312.01220v2","updated":"2024-03-27T17:23:16Z","published":"2023-12-02T20:11:48Z","title":"Boosting Object Detection with Zero-Shot Day-Night Domain Adaptation","summary":" Detecting objects in low-light scenarios presents a persistent challenge, as\ndetectors trained on well-lit data exhibit significant performance degradation\non low-light data due to low visibility. Previous methods mitigate this issue\nby exploring image enhancement or object detection techniques with real\nlow-light image datasets. However, the progress is impeded by the inherent\ndifficulties about collecting and annotating low-light images. To address this\nchallenge, we propose to boost low-light object detection with zero-shot\nday-night domain adaptation, which aims to generalize a detector from well-lit\nscenarios to low-light ones without requiring real low-light data. Revisiting\nRetinex theory in the low-level vision, we first design a reflectance\nrepresentation learning module to learn Retinex-based illumination invariance\nin images with a carefully designed illumination invariance reinforcement\nstrategy. Next, an interchange-redecomposition-coherence procedure is\nintroduced to improve over the vanilla Retinex image decomposition process by\nperforming two sequential image decompositions and introducing a\nredecomposition cohering loss. Extensive experiments on ExDark, DARK FACE, and\nCODaN datasets show strong low-light generalizability of our method. Our code\nis available at https://github.com/ZPDu/DAI-Net.\n","authors":["Zhipeng Du","Miaojing Shi","Jiankang Deng"],"pdf_url":"https://arxiv.org/pdf/2312.01220v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.06054v4","updated":"2024-03-27T17:06:10Z","published":"2024-03-10T00:47:05Z","title":"Decoupled Data Consistency with Diffusion Purification for Image\n Restoration","summary":" Diffusion models have recently gained traction as a powerful class of deep\ngenerative priors, excelling in a wide range of image restoration tasks due to\ntheir exceptional ability to model data distributions. To solve image\nrestoration problems, many existing techniques achieve data consistency by\nincorporating additional likelihood gradient steps into the reverse sampling\nprocess of diffusion models. However, the additional gradient steps pose a\nchallenge for real-world practical applications as they incur a large\ncomputational overhead, thereby increasing inference time. They also present\nadditional difficulties when using accelerated diffusion model samplers, as the\nnumber of data consistency steps is limited by the number of reverse sampling\nsteps. In this work, we propose a novel diffusion-based image restoration\nsolver that addresses these issues by decoupling the reverse process from the\ndata consistency steps. Our method involves alternating between a\nreconstruction phase to maintain data consistency and a refinement phase that\nenforces the prior via diffusion purification. Our approach demonstrates\nversatility, making it highly adaptable for efficient problem-solving in latent\nspace. Additionally, it reduces the necessity for numerous sampling steps\nthrough the integration of consistency models. 
The efficacy of our approach is\nvalidated through comprehensive experiments across various image restoration\ntasks, including image denoising, deblurring, inpainting, and super-resolution.\n","authors":["Xiang Li","Soo Min Kwon","Ismail R. Alkhouri","Saiprasad Ravishankar","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2403.06054v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18762v1","updated":"2024-03-27T17:01:10Z","published":"2024-03-27T17:01:10Z","title":"ModaLink: Unifying Modalities for Efficient Image-to-PointCloud Place\n Recognition","summary":" Place recognition is an important task for robots and autonomous cars to\nlocalize themselves and close loops in pre-built maps. While single-modal\nsensor-based methods have shown satisfactory performance, cross-modal place\nrecognition that retrieving images from a point-cloud database remains a\nchallenging problem. Current cross-modal methods transform images into 3D\npoints using depth estimation for modality conversion, which are usually\ncomputationally intensive and need expensive labeled data for depth\nsupervision. In this work, we introduce a fast and lightweight framework to\nencode images and point clouds into place-distinctive descriptors. We propose\nan effective Field of View (FoV) transformation module to convert point clouds\ninto an analogous modality as images. This module eliminates the necessity for\ndepth estimation and helps subsequent modules achieve real-time performance. We\nfurther design a non-negative factorization-based encoder to extract mutually\nconsistent semantic features between point clouds and images. This encoder\nyields more distinctive global descriptors for retrieval. Experimental results\non the KITTI dataset show that our proposed methods achieve state-of-the-art\nperformance while running in real time. Additional evaluation on the HAOMO\ndataset covering a 17 km trajectory further shows the practical generalization\ncapabilities. We have released the implementation of our methods as open source\nat: https://github.com/haomo-ai/ModaLink.git.\n","authors":["Weidong Xie","Lun Luo","Nanfei Ye","Yi Ren","Shaoyi Du","Minhang Wang","Jintao Xu","Rui Ai","Weihao Gu","Xieyuanli Chen"],"pdf_url":"https://arxiv.org/pdf/2403.18762v1.pdf","comment":"8 pages, 11 figures, conference"},{"id":"http://arxiv.org/abs/2403.18756v1","updated":"2024-03-27T16:56:14Z","published":"2024-03-27T16:56:14Z","title":"Detection of subclinical atherosclerosis by image-based deep learning on\n chest x-ray","summary":" Aims. To develop a deep-learning based system for recognition of subclinical\natherosclerosis on a plain frontal chest x-ray. Methods and Results. A\ndeep-learning algorithm to predict coronary artery calcium (CAC) score (the\nAI-CAC model) was developed on 460 chest x-ray (80% training cohort, 20%\ninternal validation cohort) of primary prevention patients (58.4% male, median\nage 63 [51-74] years) with available paired chest x-ray and chest computed\ntomography (CT) indicated for any clinical reason and performed within 3\nmonths. The CAC score calculated on chest CT was used as ground truth. The\nmodel was validated on an temporally-independent cohort of 90 patients from the\nsame institution (external validation). 
The diagnostic accuracy of the AI-CAC\nmodel assessed by the area under the curve (AUC) was the primary outcome.\nOverall, median AI-CAC score was 35 (0-388) and 28.9% patients had no AI-CAC.\nAUC of the AI-CAC model to identify a CAC>0 was 0.90 in the internal validation\ncohort and 0.77 in the external validation cohort. Sensitivity was consistently\nabove 92% in both cohorts. In the overall cohort (n=540), among patients with\nAI-CAC=0, a single ASCVD event occurred, after 4.3 years. Patients with\nAI-CAC>0 had significantly higher Kaplan Meier estimates for ASCVD events\n(13.5% vs. 3.4%, log-rank=0.013). Conclusion. The AI-CAC model seems to\naccurately detect subclinical atherosclerosis on chest x-ray with elevated\nsensitivity, and to predict ASCVD events with elevated negative predictive\nvalue. Adoption of the AI-CAC model to refine CV risk stratification or as an\nopportunistic screening tool requires prospective evaluation.\n","authors":["Guglielmo Gallone","Francesco Iodice","Alberto Presta","Davide Tore","Ovidio de Filippo","Michele Visciano","Carlo Alberto Barbano","Alessandro Serafini","Paola Gorrini","Alessandro Bruno","Walter Grosso Marra","James Hughes","Mario Iannaccone","Paolo Fonio","Attilio Fiandrotti","Alessandro Depaoli","Marco Grangetto","Gaetano Maria de Ferrari","Fabrizio D'Ascenzo"],"pdf_url":"https://arxiv.org/pdf/2403.18756v1.pdf","comment":"Submitted to European Heart Journal - Cardiovascular Imaging Added\n also the additional material 44 pages (30 main paper, 14 additional\n material), 14 figures (5 main manuscript, 9 additional material)"},{"id":"http://arxiv.org/abs/2303.09817v2","updated":"2024-03-27T16:52:59Z","published":"2023-03-17T07:53:18Z","title":"Interpretable machine learning for time-to-event prediction in medicine\n and healthcare","summary":" Time-to-event prediction, e.g. cancer survival analysis or hospital length of\nstay, is a highly prominent machine learning task in medical and healthcare\napplications. However, only a few interpretable machine learning methods comply\nwith its challenges. To facilitate a comprehensive explanatory analysis of\nsurvival models, we formally introduce time-dependent feature effects and\nglobal feature importance explanations. We show how post-hoc interpretation\nmethods allow for finding biases in AI systems predicting length of stay using\na novel multi-modal dataset created from 1235 X-ray images with textual\nradiology reports annotated by human experts. Moreover, we evaluate cancer\nsurvival models beyond predictive performance to include the importance of\nmulti-omics feature groups based on a large-scale benchmark comprising 11\ndatasets from The Cancer Genome Atlas (TCGA). Model developers can use the\nproposed methods to debug and improve machine learning algorithms, while\nphysicians can discover disease biomarkers and assess their significance. 
We\nhope the contributed open data and code resources facilitate future work in the\nemerging research direction of explainable survival analysis.\n","authors":["Hubert Baniecki","Bartlomiej Sobieski","Patryk Szatkowski","Przemyslaw Bombinski","Przemyslaw Biecek"],"pdf_url":"https://arxiv.org/pdf/2303.09817v2.pdf","comment":"An extended version of an AIME 2023 paper submitted to Artificial\n Intelligence in Medicine"},{"id":"http://arxiv.org/abs/2403.14623v2","updated":"2024-03-27T16:49:35Z","published":"2024-03-21T17:59:41Z","title":"Simplified Diffusion Schrödinger Bridge","summary":" This paper introduces a novel theoretical simplification of the Diffusion\nSchr\\\"odinger Bridge (DSB) that facilitates its unification with Score-based\nGenerative Models (SGMs), addressing the limitations of DSB in complex data\ngeneration and enabling faster convergence and enhanced performance. By\nemploying SGMs as an initial solution for DSB, our approach capitalizes on the\nstrengths of both frameworks, ensuring a more efficient training process and\nimproving the performance of SGM. We also propose a reparameterization\ntechnique that, despite theoretical approximations, practically improves the\nnetwork's fitting capabilities. Our extensive experimental evaluations confirm\nthe effectiveness of the simplified DSB, demonstrating its significant\nimprovements. We believe the contributions of this work pave the way for\nadvanced generative modeling. The code is available at\nhttps://github.com/checkcrab/SDSB.\n","authors":["Zhicong Tang","Tiankai Hang","Shuyang Gu","Dong Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2403.14623v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11107v2","updated":"2024-03-27T16:48:34Z","published":"2024-03-17T06:21:21Z","title":"Self-supervised co-salient object detection via feature correspondence\n at multiple scales","summary":" Our paper introduces a novel two-stage self-supervised approach for detecting\nco-occurring salient objects (CoSOD) in image groups without requiring\nsegmentation annotations. Unlike existing unsupervised methods that rely solely\non patch-level information (e.g. clustering patch descriptors) or on\ncomputation heavy off-the-shelf components for CoSOD, our lightweight model\nleverages feature correspondences at both patch and region levels,\nsignificantly improving prediction performance. In the first stage, we train a\nself-supervised network that detects co-salient regions by computing local\npatch-level feature correspondences across images. We obtain the segmentation\npredictions using confidence-based adaptive thresholding. In the next stage, we\nrefine these intermediate segmentations by eliminating the detected regions\n(within each image) whose averaged feature representations are dissimilar to\nthe foreground feature representation averaged across all the cross-attention\nmaps (from the previous stage). Extensive experiments on three CoSOD benchmark\ndatasets show that our self-supervised model outperforms the corresponding\nstate-of-the-art models by a huge margin (e.g. on the CoCA dataset, our model\nhas a 13.7% F-measure gain over the SOTA unsupervised CoSOD model). 
Notably,\nour self-supervised model also outperforms several recent fully supervised\nCoSOD models on the three test datasets (e.g., on the CoCA dataset, our model\nhas a 4.6% F-measure gain over a recent supervised CoSOD model).\n","authors":["Souradeep Chakraborty","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2403.11107v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18734v1","updated":"2024-03-27T16:22:45Z","published":"2024-03-27T16:22:45Z","title":"A vascular synthetic model for improved aneurysm segmentation and\n detection via Deep Neural Networks","summary":" We hereby present a full synthetic model, able to mimic the various\nconstituents of the cerebral vascular tree: the cerebral arteries, the\nbifurcations and the intracranial aneurysms. By building this model, our goal\nwas to provide a substantial dataset of brain arteries which could be used by a\n3D Convolutional Neural Network (CNN) to either segment or detect/recognize\nvarious vascular diseases (such as artery dissection/thrombosis) or even some\nportions of the cerebral vasculature, such as the bifurcations or aneurysms. In\nthis study, we will particularly focus on Intra-Cranial Aneurysm (ICA)\ndetection and segmentation. The cerebral aneurysms most often occur on a\nparticular structure of the vascular tree named the Circle of Willis. Various\nstudies have been conducted to detect and monitor the ICAs and those based on\nDeep Learning (DL) achieve the best performances. Specifically, in this work,\nwe propose a full synthetic 3D model able to mimic the brain vasculature as\nacquired by Magnetic Resonance Angiography (MRA), and more particularly the\nTime Of Flight (TOF) principle. Among the various MRI modalities, the MRA-TOF\nallows to have a relatively good rendering of the blood vessels and is\nnon-invasive (no contrast liquid injection). Our model has been designed to\nsimultaneously mimic the arteries geometry, the ICA shape and the background\nnoise. The geometry of the vascular tree is modeled thanks to an interpolation\nwith 3D Spline functions, and the statistical properties of the background MRI\nnoise is collected from MRA acquisitions and reproduced within the model. In\nthis work, we thoroughly describe the synthetic vasculature model, we build up\na neural network designed for ICA segmentation and detection, and finally, we\ncarry out an in-depth evaluation of the performance gap gained thanks to the\nsynthetic model data augmentation.\n","authors":["Rafic Nader","Florent Autrusseau","Vincent L'Allinec","Romain Bourcier"],"pdf_url":"https://arxiv.org/pdf/2403.18734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18731v1","updated":"2024-03-27T16:21:24Z","published":"2024-03-27T16:21:24Z","title":"Enhancing Manufacturing Quality Prediction Models through the\n Integration of Explainability Methods","summary":" This research presents a method that utilizes explainability techniques to\namplify the performance of machine learning (ML) models in forecasting the\nquality of milling processes, as demonstrated in this paper through a\nmanufacturing use case. The methodology entails the initial training of ML\nmodels, followed by a fine-tuning phase where irrelevant features identified\nthrough explainability methods are eliminated. This procedural refinement\nresults in performance enhancements, paving the way for potential reductions in\nmanufacturing costs and a better understanding of the trained ML models. 
This\nstudy highlights the usefulness of explainability techniques in both explaining\nand optimizing predictive models in the manufacturing realm.\n","authors":["Dennis Gross","Helge Spieker","Arnaud Gotlieb","Ricardo Knoblauch"],"pdf_url":"https://arxiv.org/pdf/2403.18731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18730v1","updated":"2024-03-27T16:20:55Z","published":"2024-03-27T16:20:55Z","title":"Towards Image Ambient Lighting Normalization","summary":" Lighting normalization is a crucial but underexplored restoration task with\nbroad applications. However, existing works often simplify this task within the\ncontext of shadow removal, limiting the light sources to one and\noversimplifying the scene, thus excluding complex self-shadows and restricting\nsurface classes to smooth ones. Although promising, such simplifications hinder\ngeneralizability to more realistic settings encountered in daily use. In this\npaper, we propose a new challenging task termed Ambient Lighting Normalization\n(ALN), which enables the study of interactions between shadows, unifying image\nrestoration and shadow removal in a broader context. To address the lack of\nappropriate datasets for ALN, we introduce the large-scale high-resolution\ndataset Ambient6K, comprising samples obtained from multiple light sources and\nincluding self-shadows resulting from complex geometries, which is the first of\nits kind. For benchmarking, we select various mainstream methods and rigorously\nevaluate them on Ambient6K. Additionally, we propose IFBlend, a novel strong\nbaseline that maximizes Image-Frequency joint entropy to selectively restore\nlocal areas under different lighting conditions, without relying on shadow\nlocalization priors. Experiments show that IFBlend achieves SOTA scores on\nAmbient6K and exhibits competitive performance on conventional shadow removal\nbenchmarks compared to shadow-specific models with mask priors. The dataset,\nbenchmark, and code are available at https://github.com/fvasluianu97/IFBlend.\n","authors":["Florin-Alexandru Vasluianu","Tim Seizinger","Zongwei Wu","Rakesh Ranjan","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2403.18730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09992v3","updated":"2024-03-27T16:20:52Z","published":"2023-03-17T14:07:55Z","title":"LION: Implicit Vision Prompt Tuning","summary":" Despite recent competitive performance across a range of vision tasks, vision\nTransformers still have an issue of heavy computational costs. Recently, vision\nprompt learning has provided an economic solution to this problem without\nfine-tuning the whole large-scale models. However, the efficiency of existing\nmodels are still far from satisfactory due to insertion of extensive prompts\nblocks and trick prompt designs. In this paper, we propose an efficient vision\nmodel named impLicit vIsion prOmpt tuNing (LION), which is motivated by deep\nimplicit models with stable memory costs for various complex tasks. In\nparticular, we merely insect two equilibrium implicit layers in two ends of the\npre-trained main backbone with parameters in the backbone frozen. Moreover, we\nprune the parameters in these two layers according to lottery hypothesis. The\nperformance obtained by our LION are promising on a wide range of datasets. In\nparticular, our LION reduces up to 11.5% of training parameter numbers while\nobtaining higher performance compared with the state-of-the-art baseline VPT,\nespecially under challenging scenes. 
Furthermore, we find that our proposed\nLION had a good generalization performance, making it an easy way to boost\ntransfer learning in the future.\n","authors":["Haixin Wang","Jianlong Chang","Xiao Luo","Jinan Sun","Zhouchen Lin","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2303.09992v3.pdf","comment":"Accepted by AAAI2024; 9 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2403.18717v1","updated":"2024-03-27T16:06:37Z","published":"2024-03-27T16:06:37Z","title":"Semi-Supervised Learning for Deep Causal Generative Models","summary":" Developing models that can answer questions of the form \"How would $x$ change\nif $y$ had been $z$?\" is fundamental for advancing medical image analysis.\nTraining causal generative models that address such counterfactual questions,\nthough, currently requires that all relevant variables have been observed and\nthat corresponding labels are available in training data. However, clinical\ndata may not have complete records for all patients and state of the art causal\ngenerative models are unable to take full advantage of this. We thus develop,\nfor the first time, a semi-supervised deep causal generative model that\nexploits the causal relationships between variables to maximise the use of all\navailable data. We explore this in the setting where each sample is either\nfully labelled or fully unlabelled, as well as the more clinically realistic\ncase of having different labels missing for each sample. We leverage techniques\nfrom causal inference to infer missing values and subsequently generate\nrealistic counterfactuals, even for samples with incomplete labels.\n","authors":["Yasin Ibrahim","Hermione Warr","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2403.18717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18715v1","updated":"2024-03-27T16:04:47Z","published":"2024-03-27T16:04:47Z","title":"Mitigating Hallucinations in Large Vision-Language Models with\n Instruction Contrastive Decoding","summary":" Large Vision-Language Models (LVLMs) are increasingly adept at generating\ncontextually detailed and coherent responses from visual inputs. However, their\napplication in multimodal decision-making and open-ended generation is hindered\nby a notable rate of hallucinations, where generated text inaccurately\nrepresents the visual contents. To address this issue, this paper introduces\nthe Instruction Contrastive Decoding (ICD) method, a novel approach designed to\nreduce hallucinations during LVLM inference. Our method is inspired by our\nobservation that what we call disturbance instructions significantly exacerbate\nhallucinations in multimodal fusion modules. ICD contrasts distributions from\nstandard and instruction disturbance, thereby increasing alignment uncertainty\nand effectively subtracting hallucinated concepts from the original\ndistribution. Through comprehensive experiments on discriminative benchmarks\n(POPE and MME) and a generative benchmark (LLaVa-Bench), we demonstrate that\nICD significantly mitigates both object-level and attribute-level\nhallucinations. 
Moreover, our method not only addresses hallucinations but also\nsignificantly enhances the general perception and recognition capabilities of\nLVLMs.\n","authors":["Xintong Wang","Jingheng Pan","Liang Ding","Chris Biemann"],"pdf_url":"https://arxiv.org/pdf/2403.18715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18714v1","updated":"2024-03-27T16:02:00Z","published":"2024-03-27T16:02:00Z","title":"Bringing Textual Prompt to AI-Generated Image Quality Assessment","summary":" AI-Generated Images (AGIs) have inherent multimodal nature. Unlike\ntraditional image quality assessment (IQA) on natural scenarios, AGIs quality\nassessment (AGIQA) takes the correspondence of image and its textual prompt\ninto consideration. This is coupled in the ground truth score, which confuses\nthe unimodal IQA methods. To solve this problem, we introduce IP-IQA (AGIs\nQuality Assessment via Image and Prompt), a multimodal framework for AGIQA via\ncorresponding image and prompt incorporation. Specifically, we propose a novel\nincremental pretraining task named Image2Prompt for better understanding of\nAGIs and their corresponding textual prompts. An effective and efficient\nimage-prompt fusion module, along with a novel special [QA] token, are also\napplied. Both are plug-and-play and beneficial for the cooperation of image and\nits corresponding prompt. Experiments demonstrate that our IP-IQA achieves the\nstate-of-the-art on AGIQA-1k and AGIQA-3k datasets. Code will be available.\n","authors":["Bowen Qu","Haohui Li","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2403.18714v1.pdf","comment":"6 pages, 3 figures, accepted by ICME2024"},{"id":"http://arxiv.org/abs/2403.18711v1","updated":"2024-03-27T15:58:25Z","published":"2024-03-27T15:58:25Z","title":"SAT-NGP : Unleashing Neural Graphics Primitives for Fast Relightable\n Transient-Free 3D reconstruction from Satellite Imagery","summary":" Current stereo-vision pipelines produce high accuracy 3D reconstruction when\nusing multiple pairs or triplets of satellite images. However, these pipelines\nare sensitive to the changes between images that can occur as a result of\nmulti-date acquisitions. Such variations are mainly due to variable shadows,\nreflexions and transient objects (cars, vegetation). To take such changes into\naccount, Neural Radiance Fields (NeRF) have recently been applied to multi-date\nsatellite imagery. However, Neural methods are very compute-intensive, taking\ndozens of hours to learn, compared with minutes for standard stereo-vision\npipelines. Following the ideas of Instant Neural Graphics Primitives we propose\nto use an efficient sampling strategy and multi-resolution hash encoding to\naccelerate the learning. Our model, Satellite Neural Graphics Primitives\n(SAT-NGP) decreases the learning time to 15 minutes while maintaining the\nquality of the 3D reconstruction.\n","authors":["Camille Billouard","Dawa Derksen","Emmanuelle Sarrazin","Bruno Vallet"],"pdf_url":"https://arxiv.org/pdf/2403.18711v1.pdf","comment":"5 pages, 3 figures, 1 table; Accepted to International Geoscience and\n Remote Sensing Symposium (IGARSS) 2024; Code available at\n https://github.com/Ellimac0/SAT-NGP"},{"id":"http://arxiv.org/abs/2403.18708v1","updated":"2024-03-27T15:56:42Z","published":"2024-03-27T15:56:42Z","title":"Dense Vision Transformer Compression with Few Samples","summary":" Few-shot model compression aims to compress a large model into a more compact\none with only a tiny training set (even without labels). 
Block-level pruning\nhas recently emerged as a leading technique in achieving high accuracy and low\nlatency in few-shot CNN compression. But, few-shot compression for Vision\nTransformers (ViT) remains largely unexplored, which presents a new challenge.\nIn particular, the issue of sparse compression exists in traditional CNN\nfew-shot methods, which can only produce very few compressed models of\ndifferent model sizes. This paper proposes a novel framework for few-shot ViT\ncompression named DC-ViT. Instead of dropping the entire block, DC-ViT\nselectively eliminates the attention module while retaining and reusing\nportions of the MLP module. DC-ViT enables dense compression, which outputs\nnumerous compressed models that densely populate the range of model complexity.\nDC-ViT outperforms state-of-the-art few-shot compression methods by a\nsignificant margin of 10 percentage points, along with lower latency in the\ncompression of ViT and its variants.\n","authors":["Hanxiao Zhang","Yifan Zhou","Guo-Hua Wang","Jianxin Wu"],"pdf_url":"https://arxiv.org/pdf/2403.18708v1.pdf","comment":"Accepted to CVPR 2024. Note: Jianxin Wu is a contributing author for\n the arXiv version of this paper but is not listed as an author in the CVPR\n version due to his role as Program Chair"},{"id":"http://arxiv.org/abs/2401.15120v2","updated":"2024-03-27T15:49:52Z","published":"2024-01-26T03:44:58Z","title":"Incorporating simulated spatial context information improves the\n effectiveness of contrastive learning models","summary":" Visual learning often occurs in a specific context, where an agent acquires\nskills through exploration and tracking of its location in a consistent\nenvironment. The historical spatial context of the agent provides a similarity\nsignal for self-supervised contrastive learning. We present a unique approach,\ntermed Environmental Spatial Similarity (ESS), that complements existing\ncontrastive learning methods. Using images from simulated, photorealistic\nenvironments as an experimental setting, we demonstrate that ESS outperforms\ntraditional instance discrimination approaches. Moreover, sampling additional\ndata from the same environment substantially improves accuracy and provides new\naugmentations. ESS allows remarkable proficiency in room classification and\nspatial prediction tasks, especially in unfamiliar environments. This learning\nparadigm has the potential to enable rapid visual learning in agents operating\nin new environments with unique visual characteristics. Potentially\ntransformative applications span from robotics to space exploration. Our proof\nof concept demonstrates improved efficiency over methods that rely on\nextensive, disconnected datasets.\n","authors":["Lizhen Zhu","James Z. Wang","Wonseuk Lee","Brad Wyble"],"pdf_url":"https://arxiv.org/pdf/2401.15120v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12091v3","updated":"2024-03-27T15:44:25Z","published":"2023-03-21T09:07:15Z","title":"Adaptive Negative Evidential Deep Learning for Open-set Semi-supervised\n Learning","summary":" Semi-supervised learning (SSL) methods assume that labeled data, unlabeled\ndata and test data are from the same distribution. Open-set semi-supervised\nlearning (Open-set SSL) considers a more practical scenario, where unlabeled\ndata and test data contain new categories (outliers) not observed in labeled\ndata (inliers). 
Most previous works focused on outlier detection via binary\nclassifiers, which suffer from insufficient scalability and inability to\ndistinguish different types of uncertainty. In this paper, we propose a novel\nframework, Adaptive Negative Evidential Deep Learning (ANEDL) to tackle these\nlimitations. Concretely, we first introduce evidential deep learning (EDL) as\nan outlier detector to quantify different types of uncertainty, and design\ndifferent uncertainty metrics for self-training and inference. Furthermore, we\npropose a novel adaptive negative optimization strategy, making EDL more\ntailored to the unlabeled dataset containing both inliers and outliers. As\ndemonstrated empirically, our proposed method outperforms existing\nstate-of-the-art methods across four datasets.\n","authors":["Yang Yu","Danruo Deng","Furui Liu","Yueming Jin","Qi Dou","Guangyong Chen","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2303.12091v3.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2403.18690v1","updated":"2024-03-27T15:41:23Z","published":"2024-03-27T15:41:23Z","title":"Annolid: Annotate, Segment, and Track Anything You Need","summary":" Annolid is a deep learning-based software package designed for the\nsegmentation, labeling, and tracking of research targets within video files,\nfocusing primarily on animal behavior analysis. Based on state-of-the-art\ninstance segmentation methods, Annolid now harnesses the Cutie video object\nsegmentation model to achieve resilient, markerless tracking of multiple\nanimals from single annotated frames, even in environments in which they may be\npartially or entirely concealed by environmental features or by one another.\nOur integration of Segment Anything and Grounding-DINO strategies additionally\nenables the automatic masking and segmentation of recognizable animals and\nobjects by text command, removing the need for manual annotation. Annolid's\ncomprehensive approach to object segmentation flexibly accommodates a broad\nspectrum of behavior analysis applications, enabling the classification of\ndiverse behavioral states such as freezing, digging, pup huddling, and social\ninteractions in addition to the tracking of animals and their body parts.\n","authors":["Chen Yang","Thomas A. Cleland"],"pdf_url":"https://arxiv.org/pdf/2403.18690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08479v2","updated":"2024-03-27T15:38:27Z","published":"2023-12-13T19:38:50Z","title":"Vision Transformer-Based Deep Learning for Histologic Classification of\n Endometrial Cancer","summary":" Endometrial cancer, the fourth most common cancer in females in the United\nStates, with the lifetime risk for developing this disease is approximately\n2.8% in women. Precise histologic evaluation and molecular classification of\nendometrial cancer is important for effective patient management and\ndetermining the best treatment modalities. This study introduces EndoNet, which\nuses convolutional neural networks for extracting histologic features and a\nvision transformer for aggregating these features and classifying slides based\non their visual characteristics into high- and low- grade. The model was\ntrained on 929 digitized hematoxylin and eosin-stained whole-slide images of\nendometrial cancer from hysterectomy cases at Dartmouth-Health. It classifies\nthese slides into low-grade (Endometroid Grades 1 and 2) and high-grade\n(endometroid carcinoma FIGO grade 3, uterine serous carcinoma, carcinosarcoma)\ncategories. 
EndoNet was evaluated on an internal test set of 110 patients and\nan external test set of 100 patients from the public TCGA database. The model\nachieved a weighted average F1-score of 0.91 (95% CI: 0.86-0.95) and an AUC of\n0.95 (95% CI: 0.89-0.99) on the internal test, and 0.86 (95% CI: 0.80-0.94) for\nF1-score and 0.86 (95% CI: 0.75-0.93) for AUC on the external test. Pending\nfurther validation, EndoNet has the potential to support pathologists without\nthe need of manual annotations in classifying the grades of gynecologic\npathology tumors.\n","authors":["Manu Goyal","Laura J. Tafe","James X. Feng","Kristen E. Muller","Liesbeth Hondelink","Jessica L. Bentz","Saeed Hassanpour"],"pdf_url":"https://arxiv.org/pdf/2312.08479v2.pdf","comment":"4 Tables and 3 Figures"},{"id":"http://arxiv.org/abs/2308.06098v2","updated":"2024-03-27T15:26:44Z","published":"2023-08-11T12:18:53Z","title":"Automated Construction of Time-Space Diagrams for Traffic Analysis Using\n Street-View Video Sequence","summary":" Time-space diagrams are essential tools for analyzing traffic patterns and\noptimizing transportation infrastructure and traffic management strategies.\nTraditional data collection methods for these diagrams have limitations in\nterms of temporal and spatial coverage. Recent advancements in camera\ntechnology have overcome these limitations and provided extensive urban data.\nIn this study, we propose an innovative approach to constructing time-space\ndiagrams by utilizing street-view video sequences captured by cameras mounted\non moving vehicles. Using the state-of-the-art YOLOv5, StrongSORT, and\nphotogrammetry techniques for distance calculation, we can infer vehicle\ntrajectories from the video data and generate time-space diagrams. To evaluate\nthe effectiveness of our proposed method, we utilized datasets from the KITTI\ncomputer vision benchmark suite. The evaluation results demonstrate that our\napproach can generate trajectories from video data, although there are some\nerrors that can be mitigated by improving the performance of the detector,\ntracker, and distance calculation components. In conclusion, the utilization of\nstreet-view video sequences captured by cameras mounted on moving vehicles,\ncombined with state-of-the-art computer vision techniques, has immense\npotential for constructing comprehensive time-space diagrams. These diagrams\noffer valuable insights into traffic patterns and contribute to the design of\ntransportation infrastructure and traffic management strategies.\n","authors":["Tanay Rastogi","Mårten Björkman"],"pdf_url":"https://arxiv.org/pdf/2308.06098v2.pdf","comment":"The paper is published in 2023 IEEE 26th International Conference on\n Intelligent Transportation Systems (ITSC)"},{"id":"http://arxiv.org/abs/2403.18674v1","updated":"2024-03-27T15:17:10Z","published":"2024-03-27T15:17:10Z","title":"Deep Learning for Robust and Explainable Models in Computer Vision","summary":" Recent breakthroughs in machine and deep learning (ML and DL) research have\nprovided excellent tools for leveraging enormous amounts of data and optimizing\nhuge models with millions of parameters to obtain accurate networks for image\nprocessing. These developments open up tremendous opportunities for using\nartificial intelligence (AI) in the automation and human assisted AI industry.\nHowever, as more and more models are deployed and used in practice, many\nchallenges have emerged. 
This thesis presents various approaches that address\nrobustness and explainability challenges for using ML and DL in practice.\n Robustness and reliability are the critical components of any model before\ncertification and deployment in practice. Deep convolutional neural networks\n(CNNs) exhibit vulnerability to transformations of their inputs, such as\nrotation and scaling, or intentional manipulations as described in the\nadversarial attack literature. In addition, building trust in AI-based models\nrequires a better understanding of current models and developing methods that\nare more explainable and interpretable a priori.\n This thesis presents developments in computer vision models' robustness and\nexplainability. Furthermore, this thesis offers an example of using vision\nmodels' feature response visualization (models' interpretations) to improve\nrobustness despite interpretability and robustness being seemingly unrelated in\nthe related research. Besides methodological developments for robust and\nexplainable vision models, a key message of this thesis is introducing model\ninterpretation techniques as a tool for understanding vision models and\nimproving their design and robustness. In addition to the theoretical\ndevelopments, this thesis demonstrates several applications of ML and DL in\ndifferent contexts, such as medical imaging and affective computing.\n","authors":["Mohammadreza Amirian"],"pdf_url":"https://arxiv.org/pdf/2403.18674v1.pdf","comment":"150 pages, 37 figures, 12 tables"},{"id":"http://arxiv.org/abs/2311.15803v3","updated":"2024-03-27T15:05:19Z","published":"2023-11-27T13:25:47Z","title":"SOAC: Spatio-Temporal Overlap-Aware Multi-Sensor Calibration using\n Neural Radiance Fields","summary":" In rapidly-evolving domains such as autonomous driving, the use of multiple\nsensors with different modalities is crucial to ensure high operational\nprecision and stability. To correctly exploit the provided information by each\nsensor in a single common frame, it is essential for these sensors to be\naccurately calibrated. In this paper, we leverage the ability of Neural\nRadiance Fields (NeRF) to represent different sensors modalities in a common\nvolumetric representation to achieve robust and accurate spatio-temporal sensor\ncalibration. By designing a partitioning approach based on the visible part of\nthe scene for each sensor, we formulate the calibration problem using only the\noverlapping areas. This strategy results in a more robust and accurate\ncalibration that is less prone to failure. We demonstrate that our approach\nworks on outdoor urban scenes by validating it on multiple established driving\ndatasets. Results show that our method is able to get better accuracy and\nrobustness compared to existing methods.\n","authors":["Quentin Herau","Nathan Piasco","Moussab Bennehar","Luis Roldão","Dzmitry Tsishkou","Cyrille Migniot","Pascal Vasseur","Cédric Demonceaux"],"pdf_url":"https://arxiv.org/pdf/2311.15803v3.pdf","comment":"Accepted at CVPR 2024. Project page: https://qherau.github.io/SOAC/"},{"id":"http://arxiv.org/abs/2403.18660v1","updated":"2024-03-27T15:03:38Z","published":"2024-03-27T15:03:38Z","title":"InstructBrush: Learning Attention-based Instruction Optimization for\n Image Editing","summary":" In recent years, instruction-based image editing methods have garnered\nsignificant attention in image editing. 
However, despite encompassing a wide\nrange of editing priors, these methods are helpless when handling editing tasks\nthat are challenging to accurately describe through language. We propose\nInstructBrush, an inversion method for instruction-based image editing methods\nto bridge this gap. It extracts editing effects from exemplar image pairs as\nediting instructions, which are further applied for image editing. Two key\ntechniques are introduced into InstructBrush, Attention-based Instruction\nOptimization and Transformation-oriented Instruction Initialization, to address\nthe limitations of the previous method in terms of inversion effects and\ninstruction generalization. To explore the ability of instruction inversion\nmethods to guide image editing in open scenarios, we establish a\nTransformationOriented Paired Benchmark (TOP-Bench), which contains a rich set\nof scenes and editing types. The creation of this benchmark paves the way for\nfurther exploration of instruction inversion. Quantitatively and qualitatively,\nour approach achieves superior performance in editing and is more semantically\nconsistent with the target editing effects.\n","authors":["Ruoyu Zhao","Qingnan Fan","Fei Kou","Shuai Qin","Hong Gu","Wei Wu","Pengcheng Xu","Mingrui Zhu","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2403.18660v1.pdf","comment":"Project Page: https://royzhao926.github.io/InstructBrush/"},{"id":"http://arxiv.org/abs/2311.12386v3","updated":"2024-03-27T15:01:44Z","published":"2023-11-21T06:55:21Z","title":"Point, Segment and Count: A Generalized Framework for Object Counting","summary":" Class-agnostic object counting aims to count all objects in an image with\nrespect to example boxes or class names, \\emph{a.k.a} few-shot and zero-shot\ncounting. In this paper, we propose a generalized framework for both few-shot\nand zero-shot object counting based on detection. Our framework combines the\nsuperior advantages of two foundation models without compromising their\nzero-shot capability: (\\textbf{i}) SAM to segment all possible objects as mask\nproposals, and (\\textbf{ii}) CLIP to classify proposals to obtain accurate\nobject counts. However, this strategy meets the obstacles of efficiency\noverhead and the small crowded objects that cannot be localized and\ndistinguished. To address these issues, our framework, termed PseCo, follows\nthree steps: point, segment, and count. Specifically, we first propose a\nclass-agnostic object localization to provide accurate but least point prompts\nfor SAM, which consequently not only reduces computation costs but also avoids\nmissing small objects. Furthermore, we propose a generalized object\nclassification that leverages CLIP image/text embeddings as the classifier,\nfollowing a hierarchical knowledge distillation to obtain discriminative\nclassifications among hierarchical mask proposals. Extensive experimental\nresults on FSC-147, COCO, and LVIS demonstrate that PseCo achieves\nstate-of-the-art performance in both few-shot/zero-shot object\ncounting/detection. Code: https://github.com/Hzzone/PseCo\n","authors":["Zhizhong Huang","Mingliang Dai","Yi Zhang","Junping Zhang","Hongming Shan"],"pdf_url":"https://arxiv.org/pdf/2311.12386v3.pdf","comment":"Accepted by CVPR 2024. 
Camera ready"},{"id":"http://arxiv.org/abs/2311.17532v3","updated":"2024-03-27T15:01:22Z","published":"2023-11-29T11:10:40Z","title":"Weakly-Supervised Emotion Transition Learning for Diverse 3D Co-speech\n Gesture Generation","summary":" Generating vivid and emotional 3D co-speech gestures is crucial for virtual\navatar animation in human-machine interaction applications. While the existing\nmethods enable generating the gestures to follow a single emotion label, they\noverlook that long gesture sequence modeling with emotion transition is more\npractical in real scenes. In addition, the lack of large-scale available\ndatasets with emotional transition speech and corresponding 3D human gestures\nalso limits the addressing of this task. To fulfill this goal, we first\nincorporate the ChatGPT-4 and an audio inpainting approach to construct the\nhigh-fidelity emotion transition human speeches. Considering obtaining the\nrealistic 3D pose annotations corresponding to the dynamically inpainted\nemotion transition audio is extremely difficult, we propose a novel weakly\nsupervised training strategy to encourage authority gesture transitions.\nSpecifically, to enhance the coordination of transition gestures w.r.t\ndifferent emotional ones, we model the temporal association representation\nbetween two different emotional gesture sequences as style guidance and infuse\nit into the transition generation. We further devise an emotion mixture\nmechanism that provides weak supervision based on a learnable mixed emotion\nlabel for transition gestures. Last, we present a keyframe sampler to supply\neffective initial posture cues in long sequences, enabling us to generate\ndiverse gestures. Extensive experiments demonstrate that our method outperforms\nthe state-of-the-art models constructed by adapting single emotion-conditioned\ncounterparts on our newly defined emotion transition task and datasets. Our\ncode and dataset will be released on the project page:\nhttps://xingqunqi-lab.github.io/Emo-Transition-Gesture/.\n","authors":["Xingqun Qi","Jiahao Pan","Peng Li","Ruibin Yuan","Xiaowei Chi","Mengfei Li","Wenhan Luo","Wei Xue","Shanghang Zhang","Qifeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2311.17532v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18649v1","updated":"2024-03-27T14:56:44Z","published":"2024-03-27T14:56:44Z","title":"Addressing Data Annotation Challenges in Multiple Sensors: A Solution\n for Scania Collected Datasets","summary":" Data annotation in autonomous vehicles is a critical step in the development\nof Deep Neural Network (DNN) based models or the performance evaluation of the\nperception system. This often takes the form of adding 3D bounding boxes on\ntime-sequential and registered series of point-sets captured from active\nsensors like Light Detection and Ranging (LiDAR) and Radio Detection and\nRanging (RADAR). When annotating multiple active sensors, there is a need to\nmotion compensate and translate the points to a consistent coordinate frame and\ntimestamp respectively. However, highly dynamic objects pose a unique\nchallenge, as they can appear at different timestamps in each sensor's data.\nWithout knowing the speed of the objects, their position appears to be\ndifferent in different sensor outputs. Thus, even after motion compensation,\nhighly dynamic objects are not matched from multiple sensors in the same frame,\nand human annotators struggle to add unique bounding boxes that capture all\nobjects. 
This article focuses on addressing this challenge, primarily within\nthe context of Scania collected datasets. The proposed solution takes a track\nof an annotated object as input and uses the Moving Horizon Estimation (MHE) to\nrobustly estimate its speed. The estimated speed profile is utilized to correct\nthe position of the annotated box and add boxes to object clusters missed by\nthe original annotation.\n","authors":["Ajinkya Khoche","Aron Asefaw","Alejandro Gonzalez","Bogdan Timus","Sina Sharif Mansouri","Patric Jensfelt"],"pdf_url":"https://arxiv.org/pdf/2403.18649v1.pdf","comment":"Accepted to European Control Conference 2024"},{"id":"http://arxiv.org/abs/2403.18637v1","updated":"2024-03-27T14:42:08Z","published":"2024-03-27T14:42:08Z","title":"Transformers-based architectures for stroke segmentation: A review","summary":" Stroke remains a significant global health concern, necessitating precise and\nefficient diagnostic tools for timely intervention and improved patient\noutcomes. The emergence of deep learning methodologies has transformed the\nlandscape of medical image analysis. Recently, Transformers, initially designed\nfor natural language processing, have exhibited remarkable capabilities in\nvarious computer vision applications, including medical image analysis. This\ncomprehensive review aims to provide an in-depth exploration of the\ncutting-edge Transformer-based architectures applied in the context of stroke\nsegmentation. It commences with an exploration of stroke pathology, imaging\nmodalities, and the challenges associated with accurate diagnosis and\nsegmentation. Subsequently, the review delves into the fundamental ideas of\nTransformers, offering detailed insights into their architectural intricacies\nand the underlying mechanisms that empower them to effectively capture complex\nspatial information within medical images. The existing literature is\nsystematically categorized and analyzed, discussing various approaches that\nleverage Transformers for stroke segmentation. A critical assessment is\nprovided, highlighting the strengths and limitations of these methods,\nincluding considerations of performance and computational efficiency.\nAdditionally, this review explores potential avenues for future research and\ndevelopment\n","authors":["Yalda Zafari-Ghadim","Essam A. Rashed","Mohamed Mabrok"],"pdf_url":"https://arxiv.org/pdf/2403.18637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.11041v3","updated":"2024-03-27T14:29:27Z","published":"2022-04-23T10:19:58Z","title":"Learning by Erasing: Conditional Entropy based Transferable\n Out-Of-Distribution Detection","summary":" Out-of-distribution (OOD) detection is essential to handle the distribution\nshifts between training and test scenarios. For a new in-distribution (ID)\ndataset, existing methods require retraining to capture the dataset-specific\nfeature representation or data distribution. In this paper, we propose a deep\ngenerative models (DGM) based transferable OOD detection method, which is\nunnecessary to retrain on a new ID dataset. We design an image erasing strategy\nto equip exclusive conditional entropy distribution for each ID dataset, which\ndetermines the discrepancy of DGM's posteriori ucertainty distribution on\ndifferent ID datasets. Owing to the powerful representation capacity of\nconvolutional neural networks, the proposed model trained on complex dataset\ncan capture the above discrepancy between ID datasets without retraining and\nthus achieve transferable OOD detection. 
We validate the proposed method on\nfive datasets and verity that ours achieves comparable performance to the\nstate-of-the-art group based OOD detection methods that need to be retrained to\ndeploy on new ID datasets. Our code is available at\nhttps://github.com/oOHCIOo/CETOOD.\n","authors":["Meng Xing","Zhiyong Feng","Yong Su","Changjae Oh"],"pdf_url":"https://arxiv.org/pdf/2204.11041v3.pdf","comment":"update new experimental results"},{"id":"http://arxiv.org/abs/2403.18605v1","updated":"2024-03-27T14:24:30Z","published":"2024-03-27T14:24:30Z","title":"FlexEdit: Flexible and Controllable Diffusion-based Object-centric Image\n Editing","summary":" Our work addresses limitations seen in previous approaches for object-centric\nediting problems, such as unrealistic results due to shape discrepancies and\nlimited control in object replacement or insertion. To this end, we introduce\nFlexEdit, a flexible and controllable editing framework for objects where we\niteratively adjust latents at each denoising step using our FlexEdit block.\nInitially, we optimize latents at test time to align with specified object\nconstraints. Then, our framework employs an adaptive mask, automatically\nextracted during denoising, to protect the background while seamlessly blending\nnew content into the target image. We demonstrate the versatility of FlexEdit\nin various object editing tasks and curate an evaluation test suite with\nsamples from both real and synthetic images, along with novel evaluation\nmetrics designed for object-centric editing. We conduct extensive experiments\non different editing scenarios, demonstrating the superiority of our editing\nframework over recent advanced text-guided image editing methods. Our project\npage is published at https://flex-edit.github.io/.\n","authors":["Trong-Tung Nguyen","Duc-Anh Nguyen","Anh Tran","Cuong Pham"],"pdf_url":"https://arxiv.org/pdf/2403.18605v1.pdf","comment":"Our project page: https://flex-edit.github.io/"},{"id":"http://arxiv.org/abs/2403.18600v1","updated":"2024-03-27T14:22:40Z","published":"2024-03-27T14:22:40Z","title":"RAP: Retrieval-Augmented Planner for Adaptive Procedure Planning in\n Instructional Videos","summary":" Procedure Planning in instructional videos entails generating a sequence of\naction steps based on visual observations of the initial and target states.\nDespite the rapid progress in this task, there remain several critical\nchallenges to be solved: (1) Adaptive procedures: Prior works hold an\nunrealistic assumption that the number of action steps is known and fixed,\nleading to non-generalizable models in real-world scenarios where the sequence\nlength varies. (2) Temporal relation: Understanding the step temporal relation\nknowledge is essential in producing reasonable and executable plans. (3)\nAnnotation cost: Annotating instructional videos with step-level labels (i.e.,\ntimestamp) or sequence-level labels (i.e., action category) is demanding and\nlabor-intensive, limiting its generalizability to large-scale datasets.In this\nwork, we propose a new and practical setting, called adaptive procedure\nplanning in instructional videos, where the procedure length is not fixed or\npre-determined. To address these challenges we introduce Retrieval-Augmented\nPlanner (RAP) model. Specifically, for adaptive procedures, RAP adaptively\ndetermines the conclusion of actions using an auto-regressive model\narchitecture. 
For temporal relation, RAP establishes an external memory module\nto explicitly retrieve the most relevant state-action pairs from the training\nvideos and revises the generated procedures. To tackle high annotation cost,\nRAP utilizes a weakly-supervised learning manner to expand the training dataset\nto other task-relevant, unannotated videos by generating pseudo labels for\naction steps. Experiments on CrossTask and COIN benchmarks show the superiority\nof RAP over traditional fixed-length models, establishing it as a strong\nbaseline solution for adaptive procedure planning.\n","authors":["Ali Zare","Yulei Niu","Hammad Ayyubi","Shih-fu Chang"],"pdf_url":"https://arxiv.org/pdf/2403.18600v1.pdf","comment":"23 pages, 6 figures, 12 tables"},{"id":"http://arxiv.org/abs/2403.18593v1","updated":"2024-03-27T14:18:09Z","published":"2024-03-27T14:18:09Z","title":"Homogeneous Tokenizer Matters: Homogeneous Visual Tokenizer for Remote\n Sensing Image Understanding","summary":" The tokenizer, as one of the fundamental components of large models, has long\nbeen overlooked or even misunderstood in visual tasks. One key factor of the\ngreat comprehension power of the large language model is that natural language\ntokenizers utilize meaningful words or subwords as the basic elements of\nlanguage. In contrast, mainstream visual tokenizers, represented by patch-based\nmethods such as Patch Embed, rely on meaningless rectangular patches as basic\nelements of vision, which cannot serve as effectively as words or subwords in\nlanguage. Starting from the essence of the tokenizer, we defined semantically\nindependent regions (SIRs) for vision. We designed a simple HOmogeneous visual\ntOKenizer: HOOK. HOOK mainly consists of two modules: the Object Perception\nModule (OPM) and the Object Vectorization Module (OVM). To achieve homogeneity,\nthe OPM splits the image into 4*4 pixel seeds and then utilizes the attention\nmechanism to perceive SIRs. The OVM employs cross-attention to merge seeds\nwithin the same SIR. To achieve adaptability, the OVM defines a variable number\nof learnable vectors as cross-attention queries, allowing for the adjustment of\ntoken quantity. We conducted experiments on the NWPU-RESISC45, WHU-RS19\nclassification dataset, and GID5 segmentation dataset for sparse and dense\ntasks. The results demonstrate that the visual tokens obtained by HOOK\ncorrespond to individual objects, which demonstrates homogeneity. HOOK\noutperformed Patch Embed by 6\\% and 10\\% in the two tasks and achieved\nstate-of-the-art performance compared to the baselines used for comparison.\nCompared to Patch Embed, which requires more than one hundred tokens for one\nimage, HOOK requires only 6 and 8 tokens for sparse and dense tasks,\nrespectively, resulting in efficiency improvements of 1.5 to 2.8 times. The\ncode is available at https://github.com/GeoX-Lab/Hook.\n","authors":["Run Shao","Zhaoyang Zhang","Chao Tao","Yunsheng Zhang","Chengli Peng","Haifeng Li"],"pdf_url":"https://arxiv.org/pdf/2403.18593v1.pdf","comment":"20 pages, 8 figures, 6 tables"},{"id":"http://arxiv.org/abs/2403.18589v1","updated":"2024-03-27T14:12:56Z","published":"2024-03-27T14:12:56Z","title":"Users prefer Jpegli over same-sized libjpeg-turbo or MozJPEG","summary":" We performed pairwise comparisons by human raters of JPEG images from\nMozJPEG, libjpeg-turbo and our new Jpegli encoder. 
When compressing images at a\nquality similar to libjpeg-turbo quality 95, the Jpegli images were 54% likely\nto be preferred over both libjpeg-turbo and MozJPEG images, but used only 2.8\nbits per pixel compared to libjpeg-turbo and MozJPEG that used 3.8 and 3.5 bits\nper pixel respectively. The raw ratings and source images are publicly\navailable for further analysis and study.\n","authors":["Martin Bruse","Luca Versari","Zoltan Szabadka","Jyrki Alakuijala"],"pdf_url":"https://arxiv.org/pdf/2403.18589v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18587v1","updated":"2024-03-27T14:11:23Z","published":"2024-03-27T14:11:23Z","title":"The Impact of Uniform Inputs on Activation Sparsity and Energy-Latency\n Attacks in Computer Vision","summary":" Resource efficiency plays an important role for machine learning nowadays.\nThe energy and decision latency are two critical aspects to ensure a\nsustainable and practical application. Unfortunately, the energy consumption\nand decision latency are not robust against adversaries. Researchers have\nrecently demonstrated that attackers can compute and submit so-called sponge\nexamples at inference time to increase the energy consumption and decision\nlatency of neural networks. In computer vision, the proposed strategy crafts\ninputs with less activation sparsity which could otherwise be used to\naccelerate the computation. In this paper, we analyze the mechanism how these\nenergy-latency attacks reduce activation sparsity. In particular, we find that\ninput uniformity is a key enabler. A uniform image, that is, an image with\nmostly flat, uniformly colored surfaces, triggers more activations due to a\nspecific interplay of convolution, batch normalization, and ReLU activation.\nBased on these insights, we propose two new simple, yet effective strategies\nfor crafting sponge examples: sampling images from a probability distribution\nand identifying dense, yet inconspicuous inputs in natural datasets. We\nempirically examine our findings in a comprehensive evaluation with multiple\nimage classification models and show that our attack achieves the same sparsity\neffect as prior sponge-example methods, but at a fraction of computation\neffort. We also show that our sponge examples transfer between different neural\nnetworks. Finally, we discuss applications of our findings for the good by\nimproving efficiency by increasing sparsity.\n","authors":["Andreas Müller","Erwin Quiring"],"pdf_url":"https://arxiv.org/pdf/2403.18587v1.pdf","comment":"Accepted at the DLSP 2024"},{"id":"http://arxiv.org/abs/2312.07264v2","updated":"2024-03-27T14:09:10Z","published":"2023-12-12T13:44:53Z","title":"Dual Structure-Aware Image Filterings for Semi-supervised Medical Image\n Segmentation","summary":" Semi-supervised image segmentation has attracted great attention recently.\nThe key is how to leverage unlabeled images in the training process. Most\nmethods maintain consistent predictions of the unlabeled images under\nvariations (e.g., adding noise/perturbations, or creating alternative versions)\nin the image and/or model level. In most image-level variation, medical images\noften have prior structure information, which has not been well explored. 
In\nthis paper, we propose novel dual structure-aware image filterings (DSAIF) as\nthe image-level variations for semi-supervised medical image segmentation.\nMotivated by connected filtering that simplifies image via filtering in\nstructure-aware tree-based image representation, we resort to the dual contrast\ninvariant Max-tree and Min-tree representation. Specifically, we propose a\nnovel connected filtering that removes topologically equivalent nodes (i.e.\nconnected components) having no siblings in the Max/Min-tree. This results in\ntwo filtered images preserving topologically critical structure. Applying the\nproposed DSAIF to mutually supervised networks decreases the consensus of their\nerroneous predictions on unlabeled images. This helps to alleviate the\nconfirmation bias issue of overfitting to noisy pseudo labels of unlabeled\nimages, and thus effectively improves the segmentation performance. Extensive\nexperimental results on three benchmark datasets demonstrate that the proposed\nmethod significantly/consistently outperforms some state-of-the-art methods.\nThe source codes will be publicly available.\n","authors":["Yuliang Gu","Zhichao Sun","Tian Chen","Xin Xiao","Yepeng Liu","Yongchao Xu","Laurent Najman"],"pdf_url":"https://arxiv.org/pdf/2312.07264v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18575v1","updated":"2024-03-27T13:56:08Z","published":"2024-03-27T13:56:08Z","title":"HandBooster: Boosting 3D Hand-Mesh Reconstruction by Conditional\n Synthesis and Sampling of Hand-Object Interactions","summary":" Reconstructing 3D hand mesh robustly from a single image is very challenging,\ndue to the lack of diversity in existing real-world datasets. While data\nsynthesis helps relieve the issue, the syn-to-real gap still hinders its usage.\nIn this work, we present HandBooster, a new approach to uplift the data\ndiversity and boost the 3D hand-mesh reconstruction performance by training a\nconditional generative space on hand-object interactions and purposely sampling\nthe space to synthesize effective data samples. First, we construct versatile\ncontent-aware conditions to guide a diffusion model to produce realistic images\nwith diverse hand appearances, poses, views, and backgrounds; favorably,\naccurate 3D annotations are obtained for free. Then, we design a novel\ncondition creator based on our similarity-aware distribution sampling\nstrategies to deliberately find novel and realistic interaction poses that are\ndistinctive from the training set. Equipped with our method, several baselines\ncan be significantly improved beyond the SOTA on the HO3D and DexYCB\nbenchmarks. Our code will be released on\nhttps://github.com/hxwork/HandBooster_Pytorch.\n","authors":["Hao Xu","Haipeng Li","Yinqiao Wang","Shuaicheng Liu","Chi-Wing Fu"],"pdf_url":"https://arxiv.org/pdf/2403.18575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07636v3","updated":"2024-03-27T13:51:59Z","published":"2024-03-12T13:18:22Z","title":"Decomposing Disease Descriptions for Enhanced Pathology Detection: A\n Multi-Aspect Vision-Language Pre-training Framework","summary":" Medical vision language pre-training (VLP) has emerged as a frontier of\nresearch, enabling zero-shot pathological recognition by comparing the query\nimage with the textual descriptions for each disease. Due to the complex\nsemantics of biomedical texts, current methods struggle to align medical images\nwith key pathological findings in unstructured reports. 
This leads to the\nmisalignment with the target disease's textual representation. In this paper,\nwe introduce a novel VLP framework designed to dissect disease descriptions\ninto their fundamental aspects, leveraging prior knowledge about the visual\nmanifestations of pathologies. This is achieved by consulting a large language\nmodel and medical experts. Integrating a Transformer module, our approach\naligns an input image with the diverse elements of a disease, generating\naspect-centric image representations. By consolidating the matches from each\naspect, we improve the compatibility between an image and its associated\ndisease. Additionally, capitalizing on the aspect-oriented representations, we\npresent a dual-head Transformer tailored to process known and unknown diseases,\noptimizing the comprehensive detection efficacy. Conducting experiments on\nseven downstream datasets, ours improves the accuracy of recent methods by up\nto 8.56% and 17.0% for seen and unseen categories, respectively. Our code is\nreleased at https://github.com/HieuPhan33/MAVL.\n","authors":["Vu Minh Hieu Phan","Yutong Xie","Yuankai Qi","Lingqiao Liu","Liyang Liu","Bowen Zhang","Zhibin Liao","Qi Wu","Minh-Son To","Johan W. Verjans"],"pdf_url":"https://arxiv.org/pdf/2403.07636v3.pdf","comment":"Accepted at CVPR2024. Pre-print before final camera-ready version"},{"id":"http://arxiv.org/abs/2403.18565v1","updated":"2024-03-27T13:46:01Z","published":"2024-03-27T13:46:01Z","title":"Artifact Reduction in 3D and 4D Cone-beam Computed Tomography Images\n with Deep Learning -- A Review","summary":" Deep learning based approaches have been used to improve image quality in\ncone-beam computed tomography (CBCT), a medical imaging technique often used in\napplications such as image-guided radiation therapy, implant dentistry or\northopaedics. In particular, while deep learning methods have been applied to\nreduce various types of CBCT image artifacts arising from motion, metal\nobjects, or low-dose acquisition, a comprehensive review summarizing the\nsuccesses and shortcomings of these approaches, with a primary focus on the\ntype of artifacts rather than the architecture of neural networks, is lacking\nin the literature. In this review, the data generation and simulation\npipelines, and artifact reduction techniques are specifically investigated for\neach type of artifact. We provide an overview of deep learning techniques that\nhave successfully been shown to reduce artifacts in 3D, as well as in\ntime-resolved (4D) CBCT through the use of projection- and/or volume-domain\noptimizations, or by introducing neural networks directly within the CBCT\nreconstruction algorithms. Research gaps are identified to suggest avenues for\nfuture exploration. One of the key findings of this work is an observed trend\ntowards the use of generative models including GANs and score-based or\ndiffusion models, accompanied with the need for more diverse and open training\ndatasets and simulations.\n","authors":["Mohammadreza Amirian","Daniel Barco","Ivo Herzig","Frank-Peter Schilling"],"pdf_url":"https://arxiv.org/pdf/2403.18565v1.pdf","comment":"16 pages, 4 figures, 1 Table, published in IEEE Access Journal"},{"id":"http://arxiv.org/abs/2403.09700v2","updated":"2024-03-27T13:42:25Z","published":"2024-03-05T22:19:21Z","title":"Shapley Values-Powered Framework for Fair Reward Split in Content\n Produced by GenAI","summary":" It is evident that, currently, generative models are surpassed in quality by\nhuman professionals. 
However, with the advancements in Artificial Intelligence,\nthis gap will narrow, leading to scenarios where individuals who have dedicated\nyears of their lives to mastering a skill become obsolete due to their high\ncosts, which are inherently linked to the time they require to complete a task\n-- a task that AI could accomplish in minutes or seconds. To avoid future\nsocial upheavals, we must, even now, contemplate how to fairly assess the\ncontributions of such individuals in training generative models and how to\ncompensate them for the reduction or complete loss of their incomes. In this\nwork, we propose a method to structure collaboration between model developers\nand data providers. To achieve this, we employ Shapley Values to quantify the\ncontribution of artist(s) in an image generated by the Stable Diffusion-v1.5\nmodel and to equitably allocate the reward among them.\n","authors":["Alex Glinsky","Alexey Sokolsky"],"pdf_url":"https://arxiv.org/pdf/2403.09700v2.pdf","comment":"36 pages, 32 figures"},{"id":"http://arxiv.org/abs/2403.18554v1","updated":"2024-03-27T13:33:14Z","published":"2024-03-27T13:33:14Z","title":"CosalPure: Learning Concept from Group Images for Robust Co-Saliency\n Detection","summary":" Co-salient object detection (CoSOD) aims to identify the common and salient\n(usually in the foreground) regions across a given group of images. Although\nachieving significant progress, state-of-the-art CoSODs could be easily\naffected by some adversarial perturbations, leading to substantial accuracy\nreduction. The adversarial perturbations can mislead CoSODs but do not change\nthe high-level semantic information (e.g., concept) of the co-salient objects.\nIn this paper, we propose a novel robustness enhancement framework by first\nlearning the concept of the co-salient objects based on the input group images\nand then leveraging this concept to purify adversarial perturbations, which are\nsubsequently fed to CoSODs for robustness enhancement. Specifically, we propose\nCosalPure containing two modules, i.e., group-image concept learning and\nconcept-guided diffusion purification. For the first module, we adopt a\npre-trained text-to-image diffusion model to learn the concept of co-salient\nobjects within group images where the learned concept is robust to adversarial\nexamples. For the second module, we map the adversarial image to the latent\nspace and then perform diffusion generation by embedding the learned concept\ninto the noise prediction function as an extra condition. Our method can\neffectively alleviate the influence of the SOTA adversarial attack containing\ndifferent adversarial patterns, including exposure and noise. The extensive\nresults demonstrate that our method could enhance the robustness of CoSODs\nsignificantly.\n","authors":["Jiayi Zhu","Qing Guo","Felix Juefei-Xu","Yihao Huang","Yang Liu","Geguang Pu"],"pdf_url":"https://arxiv.org/pdf/2403.18554v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2403.18551v1","updated":"2024-03-27T13:31:39Z","published":"2024-03-27T13:31:39Z","title":"Attention Calibration for Disentangled Text-to-Image Personalization","summary":" Recent thrilling progress in large-scale text-to-image (T2I) models has\nunlocked unprecedented synthesis quality of AI-generated content (AIGC)\nincluding image generation, 3D and video composition. Further, personalized\ntechniques enable appealing customized production of a novel concept given only\nseveral images as reference. 
However, an intriguing problem persists: Is it\npossible to capture multiple, novel concepts from one single reference image?\nIn this paper, we identify that existing approaches fail to preserve visual\nconsistency with the reference image and eliminate cross-influence from\nconcepts. To alleviate this, we propose an attention calibration mechanism to\nimprove the concept-level understanding of the T2I model. Specifically, we\nfirst introduce new learnable modifiers bound with classes to capture\nattributes of multiple concepts. Then, the classes are separated and\nstrengthened following the activation of the cross-attention operation,\nensuring comprehensive and self-contained concepts. Additionally, we suppress\nthe attention activation of different classes to mitigate mutual influence\namong concepts. Together, our proposed method, dubbed DisenDiff, can learn\ndisentangled multiple concepts from one single image and produce novel\ncustomized images with learned concepts. We demonstrate that our method\noutperforms the current state of the art in both qualitative and quantitative\nevaluations. More importantly, our proposed techniques are compatible with LoRA\nand inpainting pipelines, enabling more interactive experiences.\n","authors":["Yanbing Zhang","Mengping Yang","Qin Zhou","Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18551v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18550v1","updated":"2024-03-27T13:30:48Z","published":"2024-03-27T13:30:48Z","title":"OrCo: Towards Better Generalization via Orthogonality and Contrast for\n Few-Shot Class-Incremental Learning","summary":" Few-Shot Class-Incremental Learning (FSCIL) introduces a paradigm in which\nthe problem space expands with limited data. FSCIL methods inherently face the\nchallenge of catastrophic forgetting as data arrives incrementally, making\nmodels susceptible to overwriting previously acquired knowledge. Moreover,\ngiven the scarcity of labeled samples available at any given time, models may\nbe prone to overfitting and find it challenging to strike a balance between\nextensive pretraining and the limited incremental data. To address these\nchallenges, we propose the OrCo framework built on two core principles:\nfeatures' orthogonality in the representation space, and contrastive learning.\nIn particular, we improve the generalization of the embedding space by\nemploying a combination of supervised and self-supervised contrastive losses\nduring the pretraining phase. Additionally, we introduce OrCo loss to address\nchallenges arising from data limitations during incremental sessions. Through\nfeature space perturbations and orthogonality between classes, the OrCo loss\nmaximizes margins and reserves space for the following incremental data. This,\nin turn, ensures the accommodation of incoming classes in the feature space\nwithout compromising previously acquired knowledge. Our experimental results\nshowcase state-of-the-art performance across three benchmark datasets,\nincluding mini-ImageNet, CIFAR100, and CUB datasets. 
Code is available at\nhttps://github.com/noorahmedds/OrCo\n","authors":["Noor Ahmed","Anna Kukleva","Bernt Schiele"],"pdf_url":"https://arxiv.org/pdf/2403.18550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18548v1","updated":"2024-03-27T13:27:02Z","published":"2024-03-27T13:27:02Z","title":"A Semi-supervised Nighttime Dehazing Baseline with Spatial-Frequency\n Aware and Realistic Brightness Constraint","summary":" Existing research based on deep learning has extensively explored the problem\nof daytime image dehazing. However, few studies have considered the\ncharacteristics of nighttime hazy scenes. There are two distinctions between\nnighttime and daytime haze. First, there may be multiple active colored light\nsources with lower illumination intensity in nighttime scenes, which may cause\nhaze, glow and noise with localized, coupled and frequency inconsistent\ncharacteristics. Second, due to the domain discrepancy between simulated and\nreal-world data, unrealistic brightness may occur when applying a dehazing\nmodel trained on simulated data to real-world data. To address the above two\nissues, we propose a semi-supervised model for real-world nighttime dehazing.\nFirst, the spatial attention and frequency spectrum filtering are implemented\nas a spatial-frequency domain information interaction module to handle the\nfirst issue. Second, a pseudo-label-based retraining strategy and a local\nwindow-based brightness loss for semi-supervised training process is designed\nto suppress haze and glow while achieving realistic brightness. Experiments on\npublic benchmarks validate the effectiveness of the proposed method and its\nsuperiority over state-of-the-art methods. The source code and Supplementary\nMaterials are placed in the https://github.com/Xiaofeng-life/SFSNiD.\n","authors":["Xiaofeng Cong","Jie Gui","Jing Zhang","Junming Hou","Hao Shen"],"pdf_url":"https://arxiv.org/pdf/2403.18548v1.pdf","comment":"This paper is accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.18546v1","updated":"2024-03-27T13:24:58Z","published":"2024-03-27T13:24:58Z","title":"Efficient Heatmap-Guided 6-Dof Grasp Detection in Cluttered Scenes","summary":" Fast and robust object grasping in clutter is a crucial component of\nrobotics. Most current works resort to the whole observed point cloud for 6-Dof\ngrasp generation, ignoring the guidance information excavated from global\nsemantics, thus limiting high-quality grasp generation and real-time\nperformance. In this work, we show that the widely used heatmaps are\nunderestimated in the efficiency of 6-Dof grasp generation. Therefore, we\npropose an effective local grasp generator combined with grasp heatmaps as\nguidance, which infers in a global-to-local semantic-to-point way.\nSpecifically, Gaussian encoding and the grid-based strategy are applied to\npredict grasp heatmaps as guidance to aggregate local points into graspable\nregions and provide global semantic information. Further, a novel non-uniform\nanchor sampling mechanism is designed to improve grasp accuracy and diversity.\nBenefiting from the high-efficiency encoding in the image space and focusing on\npoints in local graspable regions, our framework can perform high-quality grasp\ndetection in real-time and achieve state-of-the-art results. In addition, real\nrobot experiments demonstrate the effectiveness of our method with a success\nrate of 94% and a clutter completion rate of 100%. 
Our code is available at\nhttps://github.com/THU-VCLab/HGGD.\n","authors":["Siang Chen","Wei Tang","Pengwei Xie","Wenming Yang","Guijin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18546v1.pdf","comment":"Extensive results on GraspNet-1B dataset"},{"id":"http://arxiv.org/abs/2310.15081v3","updated":"2024-03-27T13:23:28Z","published":"2023-10-23T16:41:13Z","title":"E4S: Fine-grained Face Swapping via Editing With Regional GAN Inversion","summary":" This paper proposes a novel approach to face swapping from the perspective of\nfine-grained facial editing, dubbed \"editing for swapping\" (E4S). The\ntraditional face swapping methods rely on global feature extraction and fail to\npreserve the detailed source identity. In contrast, we propose a Regional GAN\nInversion (RGI) method, which allows the explicit disentanglement of shape and\ntexture. Specifically, our E4S performs face swapping in the latent space of a\npretrained StyleGAN, where a multi-scale mask-guided encoder is applied to\nproject the texture of each facial component into regional style codes and a\nmask-guided injection module manipulating feature maps with the style codes.\nBased on this disentanglement, face swapping can be simplified as style and\nmask swapping. Besides, due to the large lighting condition gap, transferring\nthe source skin into the target image may lead to disharmony lighting. We\npropose a re-coloring network to make the swapped face maintain the target\nlighting condition while preserving the source skin. Further, to deal with the\npotential mismatch areas during mask exchange, we design a face inpainting\nmodule to refine the face shape. The extensive comparisons with\nstate-of-the-art methods demonstrate that our E4S outperforms existing methods\nin preserving texture, shape, and lighting. Our implementation is available at\nhttps://github.com/e4s2024/E4S2024.\n","authors":["Maomao Li","Ge Yuan","Cairong Wang","Zhian Liu","Yong Zhang","Yongwei Nie","Jue Wang","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2310.15081v3.pdf","comment":"Project Page: https://e4s2024.github.io/ ;. arXiv admin note: text\n overlap with arXiv:2211.14068"},{"id":"http://arxiv.org/abs/2403.18525v1","updated":"2024-03-27T12:59:44Z","published":"2024-03-27T12:59:44Z","title":"Language Plays a Pivotal Role in the Object-Attribute Compositional\n Generalization of CLIP","summary":" Vision-language models, such as CLIP, have shown promising\nOut-of-Distribution (OoD) generalization under various types of distribution\nshifts. Recent studies attempted to investigate the leading cause of this\ncapability. In this work, we follow the same path, but focus on a specific type\nof OoD data - images with novel compositions of attribute-object pairs - and\nstudy whether such models can successfully classify those images into\ncomposition classes. We carefully designed an authentic image test dataset\ncalled ImageNet-AO, consisting of attributes for objects that are unlikely\nencountered in the CLIP training sets. We found that CLIPs trained with large\ndatasets such as OpenAI CLIP, LAION-400M, and LAION-2B show orders-of-magnitude\nimprovement in effective compositional OoD generalization compared to both\nsupervised models and CLIPs trained with smaller datasets, such as CC-12M and\nYFCC-15M. 
Our results provide evidence that the scale and diversity of training\ndata and language supervision play a key role in unlocking the compositional\ngeneralization abilities of vision-language models.\n","authors":["Reza Abbasi","Mohammad Samiei","Mohammad Hossein Rohban","Mahdieh Soleymani Baghshah"],"pdf_url":"https://arxiv.org/pdf/2403.18525v1.pdf","comment":"Oral accepted at OODCV 2023(http://www.ood-cv.org)"},{"id":"http://arxiv.org/abs/2403.18514v1","updated":"2024-03-27T12:44:57Z","published":"2024-03-27T12:44:57Z","title":"CT-3DFlow : Leveraging 3D Normalizing Flows for Unsupervised Detection\n of Pathological Pulmonary CT scans","summary":" Unsupervised pathology detection can be implemented by training a model on\nhealthy data only and measuring the deviation from the training set upon\ninference, for example with CNN-based feature extraction and one-class\nclassifiers, or reconstruction-score-based methods such as AEs, GANs and\nDiffusion models. Normalizing Flows (NF) have the ability to directly learn the\nprobability distribution of training examples through an invertible\narchitecture. We leverage this property in a novel 3D NF-based model named\nCT-3DFlow, specifically tailored for patient-level pulmonary pathology\ndetection in chest CT data. Our model is trained unsupervised on healthy 3D\npulmonary CT patches, and detects deviations from its log-likelihood\ndistribution as anomalies. We aggregate patches-level likelihood values from a\npatient's CT scan to provide a patient-level 'normal'/'abnormal' prediction.\nOut-of-distribution detection performance is evaluated using expert annotations\non a separate chest CT test dataset, outperforming other state-of-the-art\nmethods.\n","authors":["Aissam Djahnine","Alexandre Popoff","Emilien Jupin-Delevaux","Vincent Cottin","Olivier Nempont","Loic Boussel"],"pdf_url":"https://arxiv.org/pdf/2403.18514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04344v3","updated":"2024-03-27T12:44:55Z","published":"2023-06-07T11:18:53Z","title":"ViDA: Homeostatic Visual Domain Adapter for Continual Test Time\n Adaptation","summary":" Since real-world machine systems are running in non-stationary environments,\nContinual Test-Time Adaptation (CTTA) task is proposed to adapt the pre-trained\nmodel to continually changing target domains. Recently, existing methods mainly\nfocus on model-based adaptation, which aims to leverage a self-training manner\nto extract the target domain knowledge. However, pseudo labels can be noisy and\nthe updated model parameters are unreliable under dynamic data distributions,\nleading to error accumulation and catastrophic forgetting in the continual\nadaptation process. To tackle these challenges and maintain the model\nplasticity, we design a Visual Domain Adapter (ViDA) for CTTA, explicitly\nhandling both domain-specific and domain-shared knowledge. Specifically, we\nfirst comprehensively explore the different domain representations of the\nadapters with trainable high-rank or low-rank embedding spaces. Then we inject\nViDAs into the pre-trained model, which leverages high-rank and low-rank\nfeatures to adapt the current domain distribution and maintain the continual\ndomain-shared knowledge, respectively. 
To exploit the low-rank and high-rank\nViDAs more effectively, we further propose a Homeostatic Knowledge Allotment\n(HKA) strategy, which adaptively combines different knowledge from each ViDA.\nExtensive experiments conducted on four widely used benchmarks demonstrate that\nour proposed method achieves state-of-the-art performance in both\nclassification and segmentation CTTA tasks. Note that, our method can be\nregarded as a novel transfer paradigm for large-scale models, delivering\npromising results in adaptation to continually changing distributions. Project\npage: https://sites.google.com/view/iclr2024-vida/home.\n","authors":["Jiaming Liu","Senqiao Yang","Peidong Jia","Renrui Zhang","Ming Lu","Yandong Guo","Wei Xue","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.04344v3.pdf","comment":"Accepted by ICLR2024"},{"id":"http://arxiv.org/abs/2403.18512v1","updated":"2024-03-27T12:41:30Z","published":"2024-03-27T12:41:30Z","title":"ParCo: Part-Coordinating Text-to-Motion Synthesis","summary":" We study a challenging task: text-to-motion synthesis, aiming to generate\nmotions that align with textual descriptions and exhibit coordinated movements.\nCurrently, the part-based methods introduce part partition into the motion\nsynthesis process to achieve finer-grained generation. However, these methods\nencounter challenges such as the lack of coordination between different part\nmotions and difficulties for networks to understand part concepts. Moreover,\nintroducing finer-grained part concepts poses computational complexity\nchallenges. In this paper, we propose Part-Coordinating Text-to-Motion\nSynthesis (ParCo), endowed with enhanced capabilities for understanding part\nmotions and communication among different part motion generators, ensuring a\ncoordinated and fined-grained motion synthesis. Specifically, we discretize\nwhole-body motion into multiple part motions to establish the prior concept of\ndifferent parts. Afterward, we employ multiple lightweight generators designed\nto synthesize different part motions and coordinate them through our part\ncoordination module. Our approach demonstrates superior performance on common\nbenchmarks with economic computations, including HumanML3D and KIT-ML,\nproviding substantial evidence of its effectiveness. Code is available at\nhttps://github.com/qrzou/ParCo .\n","authors":["Qiran Zou","Shangyuan Yuan","Shian Du","Yu Wang","Chang Liu","Yi Xu","Jie Chen","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2403.18512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16516v2","updated":"2024-03-27T12:32:31Z","published":"2024-03-25T08:00:43Z","title":"Visually Guided Generative Text-Layout Pre-training for Document\n Intelligence","summary":" Prior study shows that pre-training techniques can boost the performance of\nvisual document understanding (VDU), which typically requires models to gain\nabilities to perceive and reason both document texts and layouts (e.g.,\nlocations of texts and table-cells). To this end, we propose visually guided\ngenerative text-layout pre-training, named ViTLP. Given a document image, the\nmodel optimizes hierarchical language and layout modeling objectives to\ngenerate the interleaved text and layout sequence. In addition, to address the\nlimitation of processing long documents by Transformers, we introduce a\nstraightforward yet effective multi-segment generative pre-training scheme,\nfacilitating ViTLP to process word-intensive documents of any length. 
ViTLP can\nfunction as a native OCR model to localize and recognize texts of document\nimages. Besides, ViTLP can be effectively applied to various downstream VDU\ntasks. Extensive experiments show that ViTLP achieves competitive performance\nover existing baselines on benchmark VDU tasks, including information\nextraction, document classification, and document question answering.\n","authors":["Zhiming Mao","Haoli Bai","Lu Hou","Jiansheng Wei","Xin Jiang","Qun Liu","Kam-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2403.16516v2.pdf","comment":"Accepted to NAACL 2024 main conference. The first version of this\n paper was submitted to OpenReview\n (https://openreview.net/forum?id=ARtBIBAmNR) in June 2023"},{"id":"http://arxiv.org/abs/2312.06358v2","updated":"2024-03-27T12:24:29Z","published":"2023-12-11T13:05:54Z","title":"Intraoperative 2D/3D Image Registration via Differentiable X-ray\n Rendering","summary":" Surgical decisions are informed by aligning rapid portable 2D intraoperative\nimages (e.g., X-rays) to a high-fidelity 3D preoperative reference scan (e.g.,\nCT). 2D/3D image registration often fails in practice: conventional\noptimization methods are prohibitively slow and susceptible to local minima,\nwhile neural networks trained on small datasets fail on new patients or require\nimpractical landmark supervision. We present DiffPose, a self-supervised\napproach that leverages patient-specific simulation and differentiable\nphysics-based rendering to achieve accurate 2D/3D registration without relying\non manually labeled data. Preoperatively, a CNN is trained to regress the pose\nof a randomly oriented synthetic X-ray rendered from the preoperative CT. The\nCNN then initializes rapid intraoperative test-time optimization that uses the\ndifferentiable X-ray renderer to refine the solution. Our work further proposes\nseveral geometrically principled methods for sampling camera poses from\n$\\mathbf{SE}(3)$, for sparse differentiable rendering, and for driving\nregistration in the tangent space $\\mathfrak{se}(3)$ with geodesic and\nmultiscale locality-sensitive losses. DiffPose achieves sub-millimeter accuracy\nacross surgical datasets at intraoperative speeds, improving upon existing\nunsupervised methods by an order of magnitude and even outperforming supervised\nbaselines. Our code is available at https://github.com/eigenvivek/DiffPose.\n","authors":["Vivek Gopalakrishnan","Neel Dey","Polina Golland"],"pdf_url":"https://arxiv.org/pdf/2312.06358v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18501v1","updated":"2024-03-27T12:24:20Z","published":"2024-03-27T12:24:20Z","title":"HEMIT: H&E to Multiplex-immunohistochemistry Image Translation with\n Dual-Branch Pix2pix Generator","summary":" Computational analysis of multiplexed immunofluorescence histology data is\nemerging as an important method for understanding the tumour micro-environment\nin cancer. This work presents HEMIT, a dataset designed for translating\nHematoxylin and Eosin (H&E) sections to multiplex-immunohistochemistry (mIHC)\nimages, featuring DAPI, CD3, and panCK markers. Distinctively, HEMIT's mIHC\nimages are multi-component and cellular-level aligned with H&E, enriching\nsupervised stain translation tasks. To our knowledge, HEMIT is the first\npublicly available cellular-level aligned dataset that enables H&E to\nmulti-target mIHC image translation. 
This dataset provides the computer vision\ncommunity with a valuable resource to develop novel computational methods which\nhave the potential to gain new insights from H&E slide archives.\n We also propose a new dual-branch generator architecture, using residual\nConvolutional Neural Networks (CNNs) and Swin Transformers which achieves\nbetter translation outcomes than other popular algorithms. When evaluated on\nHEMIT, it outperforms pix2pixHD, pix2pix, U-Net, and ResNet, achieving the\nhighest overall score on key metrics including the Structural Similarity Index\nMeasure (SSIM), Pearson correlation score (R), and Peak signal-to-noise Ratio\n(PSNR). Additionally, downstream analysis has been used to further validate the\nquality of the generated mIHC images. These results set a new benchmark in the\nfield of stain translation tasks.\n","authors":["Chang Bian","Beth Philips","Tim Cootes","Martin Fergie"],"pdf_url":"https://arxiv.org/pdf/2403.18501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04698v3","updated":"2024-03-27T12:24:17Z","published":"2023-11-08T14:10:19Z","title":"Challenging Common Paradigms in Multi-Task Learning","summary":" While multi-task learning (MTL) has gained significant attention in recent\nyears, its underlying mechanisms remain poorly understood. Recent methods did\nnot yield consistent performance improvements over single task learning (STL)\nbaselines, underscoring the importance of gaining more profound insights about\nchallenges specific to MTL. In our study, we challenge paradigms in MTL in the\ncontext of STL: First, the impact of the choice of optimizer has only been\nmildly investigated in MTL. We show the pivotal role of common STL tools such\nas the Adam optimizer in MTL empirically in various experiments. To further\ninvestigate Adam's effectiveness, we theoretical derive a partial loss-scale\ninvariance under mild assumptions. Second, the notion of gradient conflicts has\noften been phrased as a specific problem in MTL. We delve into the role of\ngradient conflicts in MTL and compare it to STL. For angular gradient alignment\nwe find no evidence that this is a unique problem in MTL. We emphasize\ndifferences in gradient magnitude as the main distinguishing factor. Lastly, we\ncompare the transferability of features learned through MTL and STL on common\nimage corruptions, and find light evidence that MTL can lead to superior\ntransferability. Overall, we find surprising similarities between STL and MTL\nsuggesting to consider methods from both fields in a broader context.\n","authors":["Cathrin Elich","Lukas Kirchdorfer","Jan M. Köhler","Lukas Schott"],"pdf_url":"https://arxiv.org/pdf/2311.04698v3.pdf","comment":"-"},{"id":"http://arxiv.org/abs/2403.18495v1","updated":"2024-03-27T12:15:22Z","published":"2024-03-27T12:15:22Z","title":"Direct mineral content prediction from drill core images via transfer\n learning","summary":" Deep subsurface exploration is important for mining, oil and gas industries,\nas well as in the assessment of geological units for the disposal of chemical\nor nuclear waste, or the viability of geothermal energy systems. Typically,\ndetailed examinations of subsurface formations or units are performed on\ncuttings or core materials extracted during drilling campaigns, as well as on\ngeophysical borehole data, which provide detailed information about the\npetrophysical properties of the rocks. 
Depending on the volume of rock samples\nand the analytical program, the laboratory analysis and diagnostics can be very\ntime-consuming. This study investigates the potential of utilizing machine\nlearning, specifically convolutional neural networks (CNN), to assess the\nlithology and mineral content solely from analysis of drill core images, aiming\nto support and expedite the subsurface geological exploration. The paper\noutlines a comprehensive methodology, encompassing data preprocessing, machine\nlearning methods, and transfer learning techniques. The outcome reveals a\nremarkable 96.7% accuracy in the classification of drill core segments into\ndistinct formation classes. Furthermore, a CNN model was trained for the\nevaluation of mineral content using a learning data set from multidimensional\nlog analysis data (silicate, total clay, carbonate). When benchmarked against\nlaboratory XRD measurements on samples from the cores, both the advanced\nmultidimensional log analysis model and the neural network approach developed\nhere provide equally good performance. This work demonstrates that deep\nlearning and particularly transfer learning can support extracting\npetrophysical properties, including mineral content and formation\nclassification, from drill core images, thus offering a road map for enhancing\nmodel performance and data set quality in image-based analysis of drill cores.\n","authors":["Romana Boiger","Sergey V. Churakov","Ignacio Ballester Llagaria","Georg Kosakowski","Raphael Wüst","Nikolaos I. Prasianakis"],"pdf_url":"https://arxiv.org/pdf/2403.18495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02203v5","updated":"2024-03-27T12:12:45Z","published":"2023-07-05T10:54:50Z","title":"Neural Fields for Interactive Visualization of Statistical Dependencies\n in 3D Simulation Ensembles","summary":" We present the first neural network that has learned to compactly represent\nand can efficiently reconstruct the statistical dependencies between the values\nof physical variables at different spatial locations in large 3D simulation\nensembles. Going beyond linear dependencies, we consider mutual information as\na measure of non-linear dependence. We demonstrate learning and reconstruction\nwith a large weather forecast ensemble comprising 1000 members, each storing\nmultiple physical variables at a 250 x 352 x 20 simulation grid. By\ncircumventing compute-intensive statistical estimators at runtime, we\ndemonstrate significantly reduced memory and computation requirements for\nreconstructing the major dependence structures. This enables embedding the\nestimator into a GPU-accelerated direct volume renderer and interactively\nvisualizing all mutual dependencies for a selected domain point.\n","authors":["Fatemeh Farokhmanesh","Kevin Höhlein","Christoph Neuhauser","Tobias Necker","Martin Weissmann","Takemasa Miyoshi","Rüdiger Westermann"],"pdf_url":"https://arxiv.org/pdf/2307.02203v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18493v1","updated":"2024-03-27T12:08:41Z","published":"2024-03-27T12:08:41Z","title":"VersaT2I: Improving Text-to-Image Models with Versatile Reward","summary":" Recent text-to-image (T2I) models have benefited from large-scale and\nhigh-quality data, demonstrating impressive performance. However, these T2I\nmodels still struggle to produce images that are aesthetically pleasing,\ngeometrically accurate, faithful to text, and of good low-level quality. 
We\npresent VersaT2I, a versatile training framework that can boost the performance\nwith multiple rewards of any T2I model. We decompose the quality of the image\ninto several aspects such as aesthetics, text-image alignment, geometry,\nlow-level quality, etc. Then, for every quality aspect, we select high-quality\nimages in this aspect generated by the model as the training set to finetune\nthe T2I model using the Low-Rank Adaptation (LoRA). Furthermore, we introduce a\ngating function to combine multiple quality aspects, which can avoid conflicts\nbetween different quality aspects. Our method is easy to extend and does not\nrequire any manual annotation, reinforcement learning, or model architecture\nchanges. Extensive experiments demonstrate that VersaT2I outperforms the\nbaseline methods across various quality criteria.\n","authors":["Jianshu Guo","Wenhao Chai","Jie Deng","Hsiang-Wei Huang","Tian Ye","Yichen Xu","Jiawei Zhang","Jenq-Neng Hwang","Gaoang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18493v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18490v1","updated":"2024-03-27T12:05:22Z","published":"2024-03-27T12:05:22Z","title":"I2CKD : Intra- and Inter-Class Knowledge Distillation for Semantic\n Segmentation","summary":" This paper proposes a new knowledge distillation method tailored for image\nsemantic segmentation, termed Intra- and Inter-Class Knowledge Distillation\n(I2CKD). The focus of this method is on capturing and transferring knowledge\nbetween the intermediate layers of teacher (cumbersome model) and student\n(compact model). For knowledge extraction, we exploit class prototypes derived\nfrom feature maps. To facilitate knowledge transfer, we employ a triplet loss\nin order to minimize intra-class variances and maximize inter-class variances\nbetween teacher and student prototypes. Consequently, I2CKD enables the student\nto better mimic the feature representation of the teacher for each class,\nthereby enhancing the segmentation performance of the compact network.\nExtensive experiments on three segmentation datasets, i.e., Cityscapes, Pascal\nVOC and CamVid, using various teacher-student network pairs demonstrate the\neffectiveness of the proposed method.\n","authors":["Ayoub Karine","Thibault Napoléon","Maher Jridi"],"pdf_url":"https://arxiv.org/pdf/2403.18490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16943v2","updated":"2024-03-27T11:46:36Z","published":"2023-12-28T10:40:11Z","title":"SAR-Net: Multi-scale Direction-aware SAR Network via Global Information\n Fusion","summary":" Deep learning has driven significant progress in object detection using\nSynthetic Aperture Radar (SAR) imagery. Existing methods, while achieving\npromising results, often struggle to effectively integrate local and global\ninformation, particularly direction-aware features. This paper proposes\nSAR-Net, a novel framework specifically designed for global fusion of\ndirection-aware information in SAR object detection. SAR-Net leverages two key\ninnovations: the Unity Compensation Mechanism (UCM) and the Direction-aware\nAttention Module (DAM). UCM facilitates the establishment of complementary\nrelationships among features across different scales, enabling efficient global\ninformation fusion. Among them, Multi-scale Alignment Module (MAM) and distinct\nMulti-level Fusion Module (MFM) enhance feature integration by capturing both\ntexture detail and semantic information. 
Then, Multi-feature Embedding Module\n(MEM) feeds back global features into the primary branches, further improving\ninformation transmission. Additionally, DAM, through bidirectional attention\npolymerization, captures direction-aware information, effectively eliminating\nbackground interference. Extensive experiments demonstrate the effectiveness of\nSAR-Net, achieving state-of-the-art results on aircraft (SAR-AIRcraft-1.0) and\nship datasets (SSDD, HRSID), confirming its generalization capability and\nrobustness.\n","authors":["Mingxiang Cao","Jie Lei","Weiying Xie","Jiaqing Zhang","Daixun Li","Yunsong Li"],"pdf_url":"https://arxiv.org/pdf/2312.16943v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18476v1","updated":"2024-03-27T11:45:08Z","published":"2024-03-27T11:45:08Z","title":"Modeling uncertainty for Gaussian Splatting","summary":" We present Stochastic Gaussian Splatting (SGS): the first framework for\nuncertainty estimation using Gaussian Splatting (GS). GS recently advanced the\nnovel-view synthesis field by achieving impressive reconstruction quality at a\nfraction of the computational cost of Neural Radiance Fields (NeRF). However,\ncontrary to the latter, it still lacks the ability to provide information about\nthe confidence associated with their outputs. To address this limitation, in\nthis paper, we introduce a Variational Inference-based approach that seamlessly\nintegrates uncertainty prediction into the common rendering pipeline of GS.\nAdditionally, we introduce the Area Under Sparsification Error (AUSE) as a new\nterm in the loss function, enabling optimization of uncertainty estimation\nalongside image reconstruction. Experimental results on the LLFF dataset\ndemonstrate that our method outperforms existing approaches in terms of both\nimage rendering quality and uncertainty estimation accuracy. Overall, our\nframework equips practitioners with valuable insights into the reliability of\nsynthesized views, facilitating safer decision-making in real-world\napplications.\n","authors":["Luca Savant","Diego Valsesia","Enrico Magli"],"pdf_url":"https://arxiv.org/pdf/2403.18476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12028v2","updated":"2024-03-27T11:43:28Z","published":"2023-11-20T18:59:51Z","title":"Hourglass Tokenizer for Efficient Transformer-Based 3D Human Pose\n Estimation","summary":" Transformers have been successfully applied in the field of video-based 3D\nhuman pose estimation. However, the high computational costs of these video\npose transformers (VPTs) make them impractical on resource-constrained devices.\nIn this paper, we present a plug-and-play pruning-and-recovering framework,\ncalled Hourglass Tokenizer (HoT), for efficient transformer-based 3D human pose\nestimation from videos. Our HoT begins with pruning pose tokens of redundant\nframes and ends with recovering full-length tokens, resulting in a few pose\ntokens in the intermediate transformer blocks and thus improving the model\nefficiency. To effectively achieve this, we propose a token pruning cluster\n(TPC) that dynamically selects a few representative tokens with high semantic\ndiversity while eliminating the redundancy of video frames. In addition, we\ndevelop a token recovering attention (TRA) to restore the detailed\nspatio-temporal information based on the selected tokens, thereby expanding the\nnetwork output to the original full-length temporal resolution for fast\ninference. 
Extensive experiments on two benchmark datasets (i.e., Human3.6M and\nMPI-INF-3DHP) demonstrate that our method can achieve both high efficiency and\nestimation accuracy compared to the original VPT models. For instance, applying\nto MotionBERT and MixSTE on Human3.6M, our HoT can save nearly 50% FLOPs\nwithout sacrificing accuracy and nearly 40% FLOPs with only 0.2% accuracy drop,\nrespectively. Code and models are available at\nhttps://github.com/NationalGAILab/HoT.\n","authors":["Wenhao Li","Mengyuan Liu","Hong Liu","Pichao Wang","Jialun Cai","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2311.12028v2.pdf","comment":"Accepted by CVPR 2024, Open Sourced"},{"id":"http://arxiv.org/abs/2403.18471v1","updated":"2024-03-27T11:32:44Z","published":"2024-03-27T11:32:44Z","title":"DiffusionFace: Towards a Comprehensive Dataset for Diffusion-Based Face\n Forgery Analysis","summary":" The rapid progress in deep learning has given rise to hyper-realistic facial\nforgery methods, leading to concerns related to misinformation and security\nrisks. Existing face forgery datasets have limitations in generating\nhigh-quality facial images and addressing the challenges posed by evolving\ngenerative techniques. To combat this, we present DiffusionFace, the first\ndiffusion-based face forgery dataset, covering various forgery categories,\nincluding unconditional and Text Guide facial image generation, Img2Img,\nInpaint, and Diffusion-based facial exchange algorithms. Our DiffusionFace\ndataset stands out with its extensive collection of 11 diffusion models and the\nhigh-quality of the generated images, providing essential metadata and a\nreal-world internet-sourced forgery facial image dataset for evaluation.\nAdditionally, we provide an in-depth analysis of the data and introduce\npractical evaluation protocols to rigorously assess discriminative models'\neffectiveness in detecting counterfeit facial images, aiming to enhance\nsecurity in facial image authentication processes. The dataset is available for\ndownload at \\url{https://github.com/Rapisurazurite/DiffFace}.\n","authors":["Zhongxi Chen","Ke Sun","Ziyin Zhou","Xianming Lin","Xiaoshuai Sun","Liujuan Cao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2403.18471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18469v1","updated":"2024-03-27T11:28:57Z","published":"2024-03-27T11:28:57Z","title":"Density-guided Translator Boosts Synthetic-to-Real Unsupervised Domain\n Adaptive Segmentation of 3D Point Clouds","summary":" 3D synthetic-to-real unsupervised domain adaptive segmentation is crucial to\nannotating new domains. Self-training is a competitive approach for this task,\nbut its performance is limited by different sensor sampling patterns (i.e.,\nvariations in point density) and incomplete training strategies. In this work,\nwe propose a density-guided translator (DGT), which translates point density\nbetween domains, and integrates it into a two-stage self-training pipeline\nnamed DGT-ST. First, in contrast to existing works that simultaneously conduct\ndata generation and feature/output alignment within unstable adversarial\ntraining, we employ the non-learnable DGT to bridge the domain gap at the input\nlevel. Second, to provide a well-initialized model for self-training, we\npropose a category-level adversarial network in stage one that utilizes the\nprototype to prevent negative transfer. 
Finally, by leveraging the designs\nabove, a domain-mixed self-training method with source-aware consistency loss\nis proposed in stage two to narrow the domain gap further. Experiments on two\nsynthetic-to-real segmentation tasks (SynLiDAR $\\rightarrow$ semanticKITTI and\nSynLiDAR $\\rightarrow$ semanticPOSS) demonstrate that DGT-ST outperforms\nstate-of-the-art methods, achieving 9.4$\\%$ and 4.3$\\%$ mIoU improvements,\nrespectively. Code is available at \\url{https://github.com/yuan-zm/DGT-ST}.\n","authors":["Zhimin Yuan","Wankang Zeng","Yanfei Su","Weiquan Liu","Ming Cheng","Yulan Guo","Cheng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18469v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2403.18468v1","updated":"2024-03-27T11:28:32Z","published":"2024-03-27T11:28:32Z","title":"Deep Learning Segmentation and Classification of Red Blood Cells Using a\n Large Multi-Scanner Dataset","summary":" Digital pathology has recently been revolutionized by advancements in\nartificial intelligence, deep learning, and high-performance computing. With\nits advanced tools, digital pathology can help improve and speed up the\ndiagnostic process, reduce human errors, and streamline the reporting step. In\nthis paper, we report a new large red blood cell (RBC) image dataset and\npropose a two-stage deep learning framework for RBC image segmentation and\nclassification. The dataset is a highly diverse dataset of more than 100K RBCs\ncontaining eight different classes. The dataset, which is considerably larger\nthan any publicly available hematopathology dataset, was labeled independently\nby two hematopathologists who also manually created masks for RBC cell\nsegmentation. Subsequently, in the proposed framework, first, a U-Net model was\ntrained to achieve automatic RBC image segmentation. Second, an EfficientNetB0\nmodel was trained to classify RBC images into one of the eight classes using a\ntransfer learning approach with a 5X2 cross-validation scheme. An IoU of 98.03%\nand an average classification accuracy of 96.5% were attained on the test set.\nMoreover, we have performed experimental comparisons against several prominent\nCNN models. These comparisons show the superiority of the proposed model with a\ngood balance between performance and computational cost.\n","authors":["Mohamed Elmanna","Ahmed Elsafty","Yomna Ahmed","Muhammad Rushdi","Ahmed Morsy"],"pdf_url":"https://arxiv.org/pdf/2403.18468v1.pdf","comment":"15 pages, 12 figures, 8 tables"},{"id":"http://arxiv.org/abs/2403.18461v1","updated":"2024-03-27T11:19:34Z","published":"2024-03-27T11:19:34Z","title":"DiffStyler: Diffusion-based Localized Image Style Transfer","summary":" Image style transfer aims to imbue digital imagery with the distinctive\nattributes of style targets, such as colors, brushstrokes, shapes, whilst\nconcurrently preserving the semantic integrity of the content. Despite the\nadvancements in arbitrary style transfer methods, a prevalent challenge remains\nthe delicate equilibrium between content semantics and style attributes. Recent\ndevelopments in large-scale text-to-image diffusion models have heralded\nunprecedented synthesis capabilities, albeit at the expense of relying on\nextensive and often imprecise textual descriptions to delineate artistic\nstyles. 
Addressing these limitations, this paper introduces DiffStyler, a novel\napproach that facilitates efficient and precise arbitrary image style transfer.\nDiffStyler lies the utilization of a text-to-image Stable Diffusion model-based\nLoRA to encapsulate the essence of style targets. This approach, coupled with\nstrategic cross-LoRA feature and attention injection, guides the style transfer\nprocess. The foundation of our methodology is rooted in the observation that\nLoRA maintains the spatial feature consistency of UNet, a discovery that\nfurther inspired the development of a mask-wise style transfer technique. This\ntechnique employs masks extracted through a pre-trained FastSAM model,\nutilizing mask prompts to facilitate feature fusion during the denoising\nprocess, thereby enabling localized style transfer that preserves the original\nimage's unaffected regions. Moreover, our approach accommodates multiple style\ntargets through the use of corresponding masks. Through extensive\nexperimentation, we demonstrate that DiffStyler surpasses previous methods in\nachieving a more harmonious balance between content preservation and style\nintegration.\n","authors":["Shaoxu Li"],"pdf_url":"https://arxiv.org/pdf/2403.18461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10522v4","updated":"2024-03-27T11:18:51Z","published":"2023-11-17T13:43:43Z","title":"Enhancing Object Coherence in Layout-to-Image Synthesis","summary":" Layout-to-image synthesis is an emerging technique in conditional image\ngeneration. It aims to generate complex scenes, where users require fine\ncontrol over the layout of the objects in a scene. However, it remains\nchallenging to control the object coherence, including semantic coherence\n(e.g., the cat looks at the flowers or not) and physical coherence (e.g., the\nhand and the racket should not be misaligned). In this paper, we propose a\nnovel diffusion model with effective global semantic fusion (GSF) and\nself-similarity feature enhancement modules to guide the object coherence for\nthis task. For semantic coherence, we argue that the image caption contains\nrich information for defining the semantic relationship within the objects in\nthe images. Instead of simply employing cross-attention between captions and\ngenerated images, which addresses the highly relevant layout restriction and\nsemantic coherence separately and thus leads to unsatisfying results shown in\nour experiments, we develop GSF to fuse the supervision from the layout\nrestriction and semantic coherence requirement and exploit it to guide the\nimage synthesis process. Moreover, to improve the physical coherence, we\ndevelop a Self-similarity Coherence Attention (SCA) module to explicitly\nintegrate local contextual physical coherence into each pixel's generation\nprocess. 
Specifically, we adopt a self-similarity map to encode the coherence\nrestrictions and employ it to extract coherent features from text embedding.\nThrough visualization of our self-similarity map, we explore the essence of\nSCA, revealing that its effectiveness is not only in capturing reliable\nphysical coherence patterns but also in enhancing complex texture generation.\nExtensive experiments demonstrate the superiority of our proposed method in\nboth image generation quality and controllability.\n","authors":["Yibin Wang","Weizhong Zhang","Jianwei Zheng","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2311.10522v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18454v1","updated":"2024-03-27T11:13:20Z","published":"2024-03-27T11:13:20Z","title":"Scaling Vision-and-Language Navigation With Offline RL","summary":" The study of vision-and-language navigation (VLN) has typically relied on\nexpert trajectories, which may not always be available in real-world situations\ndue to the significant effort required to collect them. On the other hand,\nexisting approaches to training VLN agents that go beyond available expert data\ninvolve data augmentations or online exploration which can be tedious and\nrisky. In contrast, it is easy to access large repositories of suboptimal\noffline trajectories. Inspired by research in offline reinforcement learning\n(ORL), we introduce a new problem setup of VLN-ORL which studies VLN using\nsuboptimal demonstration data. We introduce a simple and effective\nreward-conditioned approach that can account for dataset suboptimality for\ntraining VLN agents, as well as benchmarks to evaluate progress and promote\nresearch in this area. We empirically study various noise models for\ncharacterizing dataset suboptimality among other unique challenges in VLN-ORL\nand instantiate it for the VLN$\\circlearrowright$BERT and MTVM architectures in\nthe R2R and RxR environments. Our experiments demonstrate that the proposed\nreward-conditioned approach leads to significant performance improvements, even\nin complex and intricate environments.\n","authors":["Valay Bundele","Mahesh Bhupati","Biplab Banerjee","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2403.18454v1.pdf","comment":"Published in Transactions on Machine Learning Research (04/2024)"},{"id":"http://arxiv.org/abs/2403.18452v1","updated":"2024-03-27T11:11:08Z","published":"2024-03-27T11:11:08Z","title":"SingularTrajectory: Universal Trajectory Predictor Using Diffusion Model","summary":" There are five types of trajectory prediction tasks: deterministic,\nstochastic, domain adaptation, momentary observation, and few-shot. These\nassociated tasks are defined by various factors, such as the length of input\npaths, data split and pre-processing methods. Interestingly, even though they\ncommonly take sequential coordinates of observations as input and infer future\npaths in the same coordinates as output, designing specialized architectures\nfor each task is still necessary. For the other task, generality issues can\nlead to sub-optimal performances. In this paper, we propose SingularTrajectory,\na diffusion-based universal trajectory prediction framework to reduce the\nperformance gap across the five tasks. The core of SingularTrajectory is to\nunify a variety of human dynamics representations on the associated tasks. To\ndo this, we first build a Singular space to project all types of motion\npatterns from each task into one embedding space. We next propose an adaptive\nanchor working in the Singular space. 
Unlike traditional fixed anchor methods\nthat sometimes yield unacceptable paths, our adaptive anchor enables correct\nanchors, which are put into a wrong location, based on a traversability map.\nFinally, we adopt a diffusion-based predictor to further enhance the prototype\npaths using a cascaded denoising process. Our unified framework ensures the\ngenerality across various benchmark settings such as input modality, and\ntrajectory lengths. Extensive experiments on five public benchmarks demonstrate\nthat SingularTrajectory substantially outperforms existing models, highlighting\nits effectiveness in estimating general dynamics of human movements. Code is\npublicly available at https://github.com/inhwanbae/SingularTrajectory .\n","authors":["Inhwan Bae","Young-Jae Park","Hae-Gon Jeon"],"pdf_url":"https://arxiv.org/pdf/2403.18452v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18447v1","updated":"2024-03-27T11:06:44Z","published":"2024-03-27T11:06:44Z","title":"Can Language Beat Numerical Regression? Language-Based Multimodal\n Trajectory Prediction","summary":" Language models have demonstrated impressive ability in context understanding\nand generative performance. Inspired by the recent success of language\nfoundation models, in this paper, we propose LMTraj (Language-based Multimodal\nTrajectory predictor), which recasts the trajectory prediction task into a sort\nof question-answering problem. Departing from traditional numerical regression\nmodels, which treat the trajectory coordinate sequence as continuous signals,\nwe consider them as discrete signals like text prompts. Specially, we first\ntransform an input space for the trajectory coordinate into the natural\nlanguage space. Here, the entire time-series trajectories of pedestrians are\nconverted into a text prompt, and scene images are described as text\ninformation through image captioning. The transformed numerical and image data\nare then wrapped into the question-answering template for use in a language\nmodel. Next, to guide the language model in understanding and reasoning\nhigh-level knowledge, such as scene context and social relationships between\npedestrians, we introduce an auxiliary multi-task question and answering. We\nthen train a numerical tokenizer with the prompt data. We encourage the\ntokenizer to separate the integer and decimal parts well, and leverage it to\ncapture correlations between the consecutive numbers in the language model.\nLastly, we train the language model using the numerical tokenizer and all of\nthe question-answer prompts. Here, we propose a beam-search-based most-likely\nprediction and a temperature-based multimodal prediction to implement both\ndeterministic and stochastic inferences. Applying our LMTraj, we show that the\nlanguage-based model can be a powerful pedestrian trajectory predictor, and\noutperforms existing numerical-based predictor methods. 
Code is publicly\navailable at https://github.com/inhwanbae/LMTrajectory .\n","authors":["Inhwan Bae","Junoh Lee","Hae-Gon Jeon"],"pdf_url":"https://arxiv.org/pdf/2403.18447v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18443v1","updated":"2024-03-27T11:00:33Z","published":"2024-03-27T11:00:33Z","title":"$\\mathrm{F^2Depth}$: Self-supervised Indoor Monocular Depth Estimation\n via Optical Flow Consistency and Feature Map Synthesis","summary":" Self-supervised monocular depth estimation methods have been increasingly\ngiven much attention due to the benefit of not requiring large, labelled\ndatasets. Such self-supervised methods require high-quality salient features\nand consequently suffer from severe performance drop for indoor scenes, where\nlow-textured regions dominant in the scenes are almost indiscriminative. To\naddress the issue, we propose a self-supervised indoor monocular depth\nestimation framework called $\\mathrm{F^2Depth}$. A self-supervised optical flow\nestimation network is introduced to supervise depth learning. To improve\noptical flow estimation performance in low-textured areas, only some patches of\npoints with more discriminative features are adopted for finetuning based on\nour well-designed patch-based photometric loss. The finetuned optical flow\nestimation network generates high-accuracy optical flow as a supervisory signal\nfor depth estimation. Correspondingly, an optical flow consistency loss is\ndesigned. Multi-scale feature maps produced by finetuned optical flow\nestimation network perform warping to compute feature map synthesis loss as\nanother supervisory signal for depth learning. Experimental results on the NYU\nDepth V2 dataset demonstrate the effectiveness of the framework and our\nproposed losses. To evaluate the generalization ability of our\n$\\mathrm{F^2Depth}$, we collect a Campus Indoor depth dataset composed of\napproximately 1500 points selected from 99 images in 18 scenes. Zero-shot\ngeneralization experiments on 7-Scenes dataset and Campus Indoor achieve\n$\\delta_1$ accuracy of 75.8% and 76.0% respectively. The accuracy results show\nthat our model can generalize well to monocular images captured in unknown\nindoor scenes.\n","authors":["Xiaotong Guo","Huijie Zhao","Shuwei Shao","Xudong Li","Baochang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.18443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.17126v2","updated":"2024-03-27T10:50:54Z","published":"2022-11-30T16:03:24Z","title":"BEVUDA: Multi-geometric Space Alignments for Domain Adaptive BEV 3D\n Object Detection","summary":" Vision-centric bird-eye-view (BEV) perception has shown promising potential\nin autonomous driving. Recent works mainly focus on improving efficiency or\naccuracy but neglect the challenges when facing environment changing, resulting\nin severe degradation of transfer performance. For BEV perception, we figure\nout the significant domain gaps existing in typical real-world cross-domain\nscenarios and comprehensively solve the Domain Adaption (DA) problem for\nmulti-view 3D object detection. Since BEV perception approaches are complicated\nand contain several components, the domain shift accumulation on multiple\ngeometric spaces (i.e., 2D, 3D Voxel, BEV) makes BEV DA even challenging. In\nthis paper, we propose a Multi-space Alignment Teacher-Student (MATS) framework\nto ease the domain shift accumulation, which consists of a Depth-Aware Teacher\n(DAT) and a Geometric-space Aligned Student (GAS) model. 
DAT tactfully combines\ntarget lidar and reliable depth prediction to construct depth-aware\ninformation, extracting target domain-specific knowledge in Voxel and BEV\nfeature spaces. It then transfers the sufficient domain knowledge of multiple\nspaces to the student model. In order to jointly alleviate the domain shift,\nGAS projects multi-geometric space features to a shared geometric embedding\nspace and decreases data distribution distance between two domains. To verify\nthe effectiveness of our method, we conduct BEV 3D object detection experiments\non three cross-domain scenarios and achieve state-of-the-art performance.\n","authors":["Jiaming Liu","Rongyu Zhang","Xiaoqi Li","Xiaowei Chi","Zehui Chen","Ming Lu","Yandong Guo","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2211.17126v2.pdf","comment":"Accepted by ICRA2024"},{"id":"http://arxiv.org/abs/2403.18442v1","updated":"2024-03-27T10:50:24Z","published":"2024-03-27T10:50:24Z","title":"Backpropagation-free Network for 3D Test-time Adaptation","summary":" Real-world systems often encounter new data over time, which leads to\nexperiencing target domain shifts. Existing Test-Time Adaptation (TTA) methods\ntend to apply computationally heavy and memory-intensive backpropagation-based\napproaches to handle this. Here, we propose a novel method that uses a\nbackpropagation-free approach for TTA for the specific case of 3D data. Our\nmodel uses a two-stream architecture to maintain knowledge about the source\ndomain as well as complementary target-domain-specific information. The\nbackpropagation-free property of our model helps address the well-known\nforgetting problem and mitigates the error accumulation issue. The proposed\nmethod also eliminates the need for the usually noisy process of\npseudo-labeling and reliance on costly self-supervised training. Moreover, our\nmethod leverages subspace learning, effectively reducing the distribution\nvariance between the two domains. Furthermore, the source-domain-specific and\nthe target-domain-specific streams are aligned using a novel entropy-based\nadaptive fusion strategy. Extensive experiments on popular benchmarks\ndemonstrate the effectiveness of our method. The code will be available at\nhttps://github.com/abie-e/BFTT3D.\n","authors":["Yanshuo Wang","Ali Cheraghian","Zeeshan Hayder","Jie Hong","Sameera Ramasinghe","Shafin Rahman","David Ahmedt-Aristizabal","Xuesong Li","Lars Petersson","Mehrtash Harandi"],"pdf_url":"https://arxiv.org/pdf/2403.18442v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2311.18113v2","updated":"2024-03-27T10:46:59Z","published":"2023-11-29T21:58:41Z","title":"Back to 3D: Few-Shot 3D Keypoint Detection with Back-Projected 2D\n Features","summary":" With the immense growth of dataset sizes and computing resources in recent\nyears, so-called foundation models have become popular in NLP and vision tasks.\nIn this work, we propose to explore foundation models for the task of keypoint\ndetection on 3D shapes. A unique characteristic of keypoint detection is that\nit requires semantic and geometric awareness while demanding high localization\naccuracy. To address this problem, we propose, first, to back-project features\nfrom large pre-trained 2D vision models onto 3D shapes and employ them for this\ntask. We show that we obtain robust 3D features that contain rich semantic\ninformation and analyze multiple candidate features stemming from different 2D\nfoundation models. 
Second, we employ a keypoint candidate optimization module\nwhich aims to match the average observed distribution of keypoints on the shape\nand is guided by the back-projected features. The resulting approach achieves a\nnew state of the art for few-shot keypoint detection on the KeyPointNet\ndataset, almost doubling the performance of the previous best methods.\n","authors":["Thomas Wimmer","Peter Wonka","Maks Ovsjanikov"],"pdf_url":"https://arxiv.org/pdf/2311.18113v2.pdf","comment":"Accepted to CVPR 2024, Project page:\n https://wimmerth.github.io/back-to-3d.html"},{"id":"http://arxiv.org/abs/2401.08742v2","updated":"2024-03-27T10:33:02Z","published":"2024-01-16T18:58:36Z","title":"Fast Dynamic 3D Object Generation from a Single-view Video","summary":" Generating dynamic 3D object from a single-view video is challenging due to\nthe lack of 4D labeled data. Extending image-to-3D pipelines by transferring\noff-the-shelf image generation models such as score distillation sampling,\nexisting methods tend to be slow and expensive to scale due to the need for\nback-propagating the information-limited supervision signals through a large\npretrained model. To address this, we propose an efficient video-to-4D object\ngeneration framework called Efficient4D. It generates high-quality\nspacetime-consistent images under different camera views, and then uses them as\nlabeled data to directly train a novel 4D Gaussian splatting model with\nexplicit point cloud geometry, enabling real-time rendering under continuous\ncamera trajectories. Extensive experiments on synthetic and real videos show\nthat Efficient4D offers a remarkable 20-fold increase in speed when compared to\nprior art alternatives while preserving the quality of novel view synthesis.\nFor example, Efficient4D takes only 6 mins to model a dynamic object, vs 120\nmins by Consistent4D.\n","authors":["Zijie Pan","Zeyu Yang","Xiatian Zhu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.08742v2.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2403.18425v1","updated":"2024-03-27T10:26:42Z","published":"2024-03-27T10:26:42Z","title":"U-Sketch: An Efficient Approach for Sketch to Image Diffusion Models","summary":" Diffusion models have demonstrated remarkable performance in text-to-image\nsynthesis, producing realistic and high resolution images that faithfully\nadhere to the corresponding text-prompts. Despite their great success, they\nstill fall behind in sketch-to-image synthesis tasks, where in addition to\ntext-prompts, the spatial layout of the generated images has to closely follow\nthe outlines of certain reference sketches. Employing an MLP latent edge\npredictor to guide the spatial layout of the synthesized image by predicting\nedge maps at each denoising step has been recently proposed. Despite yielding\npromising results, the pixel-wise operation of the MLP does not take into\naccount the spatial layout as a whole, and demands numerous denoising\niterations to produce satisfactory images, leading to time inefficiency. To\nthis end, we introduce U-Sketch, a framework featuring a U-Net type latent edge\npredictor, which is capable of efficiently capturing both local and global\nfeatures, as well as spatial correlations between pixels. Moreover, we propose\nthe addition of a sketch simplification network that offers the user the choice\nof preprocessing and simplifying input sketches for enhanced outputs. 
The\nexperimental results, corroborated by user feedback, demonstrate that our\nproposed U-Net latent edge predictor leads to more realistic results, that are\nbetter aligned with the spatial outlines of the reference sketches, while\ndrastically reducing the number of required denoising steps and, consequently,\nthe overall execution time.\n","authors":["Ilias Mitsouras","Eleftherios Tsonis","Paraskevi Tzouveli","Athanasios Voulodimos"],"pdf_url":"https://arxiv.org/pdf/2403.18425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15098v2","updated":"2024-03-27T10:26:23Z","published":"2024-03-22T10:36:50Z","title":"UniTraj: A Unified Framework for Scalable Vehicle Trajectory Prediction","summary":" Vehicle trajectory prediction has increasingly relied on data-driven\nsolutions, but their ability to scale to different data domains and the impact\nof larger dataset sizes on their generalization remain under-explored. While\nthese questions can be studied by employing multiple datasets, it is\nchallenging due to several discrepancies, e.g., in data formats, map\nresolution, and semantic annotation types. To address these challenges, we\nintroduce UniTraj, a comprehensive framework that unifies various datasets,\nmodels, and evaluation criteria, presenting new opportunities for the vehicle\ntrajectory prediction field. In particular, using UniTraj, we conduct extensive\nexperiments and find that model performance significantly drops when\ntransferred to other datasets. However, enlarging data size and diversity can\nsubstantially improve performance, leading to a new state-of-the-art result for\nthe nuScenes dataset. We provide insights into dataset characteristics to\nexplain these findings. The code can be found here:\nhttps://github.com/vita-epfl/UniTraj\n","authors":["Lan Feng","Mohammadhossein Bahari","Kaouther Messaoud Ben Amor","Éloi Zablocki","Matthieu Cord","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2403.15098v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12359v2","updated":"2024-03-27T10:18:04Z","published":"2023-12-19T17:40:27Z","title":"CLIP-DINOiser: Teaching CLIP a few DINO tricks for open-vocabulary\n semantic segmentation","summary":" The popular CLIP model displays impressive zero-shot capabilities thanks to\nits seamless interaction with arbitrary text prompts. However, its lack of\nspatial awareness makes it unsuitable for dense computer vision tasks, e.g.,\nsemantic segmentation, without an additional fine-tuning step that often uses\nannotations and can potentially suppress its original open-vocabulary\nproperties. Meanwhile, self-supervised representation methods have demonstrated\ngood localization properties without human-made annotations nor explicit\nsupervision. In this work, we take the best of both worlds and propose an\nopen-vocabulary semantic segmentation method, which does not require any\nannotations. We propose to locally improve dense MaskCLIP features, which are\ncomputed with a simple modification of CLIP's last pooling layer, by\nintegrating localization priors extracted from self-supervised features. By\ndoing so, we greatly improve the performance of MaskCLIP and produce smooth\noutputs. Moreover, we show that the used self-supervised feature properties can\ndirectly be learnt from CLIP features. 
Our method CLIP-DINOiser needs only a\nsingle forward pass of CLIP and two light convolutional layers at inference, no\nextra supervision nor extra memory and reaches state-of-the-art results on\nchallenging and fine-grained benchmarks such as COCO, Pascal Context,\nCityscapes and ADE20k. The code to reproduce our results is available at\nhttps://github.com/wysoczanska/clip_dinoiser.\n","authors":["Monika Wysoczańska","Oriane Siméoni","Michaël Ramamonjisoa","Andrei Bursuc","Tomasz Trzciński","Patrick Pérez"],"pdf_url":"https://arxiv.org/pdf/2312.12359v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12480v2","updated":"2024-03-27T10:12:32Z","published":"2023-12-19T15:34:52Z","title":"Continual-MAE: Adaptive Distribution Masked Autoencoders for Continual\n Test-Time Adaptation","summary":" Continual Test-Time Adaptation (CTTA) is proposed to migrate a source\npre-trained model to continually changing target distributions, addressing\nreal-world dynamism. Existing CTTA methods mainly rely on entropy minimization\nor teacher-student pseudo-labeling schemes for knowledge extraction in\nunlabeled target domains. However, dynamic data distributions cause\nmiscalibrated predictions and noisy pseudo-labels in existing self-supervised\nlearning methods, hindering the effective mitigation of error accumulation and\ncatastrophic forgetting problems during the continual adaptation process. To\ntackle these issues, we propose a continual self-supervised method, Adaptive\nDistribution Masked Autoencoders (ADMA), which enhances the extraction of\ntarget domain knowledge while mitigating the accumulation of distribution\nshifts. Specifically, we propose a Distribution-aware Masking (DaM) mechanism\nto adaptively sample masked positions, followed by establishing consistency\nconstraints between the masked target samples and the original target samples.\nAdditionally, for masked tokens, we utilize an efficient decoder to reconstruct\na hand-crafted feature descriptor (e.g., Histograms of Oriented Gradients),\nleveraging its invariant properties to boost task-relevant representations.\nThrough conducting extensive experiments on four widely recognized benchmarks,\nour proposed method attains state-of-the-art performance in both classification\nand segmentation CTTA tasks. Our project page:\nhttps://sites.google.com/view/continual-mae/home.\n","authors":["Jiaming Liu","Ran Xu","Senqiao Yang","Renrui Zhang","Qizhe Zhang","Zehui Chen","Yandong Guo","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.12480v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.18417v1","updated":"2024-03-27T10:09:38Z","published":"2024-03-27T10:09:38Z","title":"ECNet: Effective Controllable Text-to-Image Diffusion Models","summary":" The conditional text-to-image diffusion models have garnered significant\nattention in recent years. However, the precision of these models is often\ncompromised mainly for two reasons, ambiguous condition input and inadequate\ncondition guidance over single denoising loss. To address the challenges, we\nintroduce two innovative solutions. Firstly, we propose a Spatial Guidance\nInjector (SGI) which enhances conditional detail by encoding text inputs with\nprecise annotation information. 
This method directly tackles the issue of\nambiguous control inputs by providing clear, annotated guidance to the model.\nSecondly, to overcome the issue of limited conditional supervision, we\nintroduce Diffusion Consistency Loss (DCL), which applies supervision on the\ndenoised latent code at any given time step. This encourages consistency\nbetween the latent code at each time step and the input signal, thereby\nenhancing the robustness and accuracy of the output. The combination of SGI and\nDCL results in our Effective Controllable Network (ECNet), which offers a more\naccurate controllable end-to-end text-to-image generation framework with a more\nprecise conditioning input and stronger controllable supervision. We validate\nour approach through extensive experiments on generation under various\nconditions, such as human body skeletons, facial landmarks, and sketches of\ngeneral objects. The results consistently demonstrate that our method\nsignificantly enhances the controllability and robustness of the generated\nimages, outperforming existing state-of-the-art controllable text-to-image\nmodels.\n","authors":["Sicheng Li","Keqiang Sun","Zhixin Lai","Xiaoshi Wu","Feng Qiu","Haoran Xie","Kazunori Miyata","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2403.18417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06075v2","updated":"2024-03-27T09:51:15Z","published":"2023-09-12T09:12:37Z","title":"A2V: A Semi-Supervised Domain Adaptation Framework for Brain Vessel\n Segmentation via Two-Phase Training Angiography-to-Venography Translation","summary":" We present a semi-supervised domain adaptation framework for brain vessel\nsegmentation from different image modalities. Existing state-of-the-art methods\nfocus on a single modality, despite the wide range of available cerebrovascular\nimaging techniques. This can lead to significant distribution shifts that\nnegatively impact the generalization across modalities. By relying on annotated\nangiographies and a limited number of annotated venographies, our framework\naccomplishes image-to-image translation and semantic segmentation, leveraging a\ndisentangled and semantically rich latent space to represent heterogeneous data\nand perform image-level adaptation from source to target domains. Moreover, we\nreduce the typical complexity of cycle-based architectures and minimize the use\nof adversarial training, which allows us to build an efficient and intuitive\nmodel with stable training. We evaluate our method on magnetic resonance\nangiographies and venographies. While achieving state-of-the-art performance in\nthe source domain, our method attains a Dice score coefficient in the target\ndomain that is only 8.9% lower, highlighting its promising potential for robust\ncerebrovascular image segmentation across different modalities.\n","authors":["Francesco Galati","Daniele Falcetta","Rosa Cortese","Barbara Casolla","Ferran Prados","Ninon Burgos","Maria A. Zuluaga"],"pdf_url":"https://arxiv.org/pdf/2309.06075v2.pdf","comment":"Accepted at the 34th British Machine Vision Conference (BMVC)"},{"id":"http://arxiv.org/abs/2403.18407v1","updated":"2024-03-27T09:49:37Z","published":"2024-03-27T09:49:37Z","title":"A Channel-ensemble Approach: Unbiased and Low-variance Pseudo-labels is\n Critical for Semi-supervised Classification","summary":" Semi-supervised learning (SSL) is a practical challenge in computer vision.\nPseudo-label (PL) methods, e.g., FixMatch and FreeMatch, obtain the State Of\nThe Art (SOTA) performances in SSL. 
These approaches employ a\nthreshold-to-pseudo-label (T2L) process to generate PLs by truncating the\nconfidence scores of unlabeled data predicted by the self-training method.\nHowever, self-trained models typically yield biased and high-variance\npredictions, especially in the scenarios when a little labeled data are\nsupplied. To address this issue, we propose a lightweight channel-based\nensemble method to effectively consolidate multiple inferior PLs into the\ntheoretically guaranteed unbiased and low-variance one. Importantly, our\napproach can be readily extended to any SSL framework, such as FixMatch or\nFreeMatch. Experimental results demonstrate that our method significantly\noutperforms state-of-the-art techniques on CIFAR10/100 in terms of\neffectiveness and efficiency.\n","authors":["Jiaqi Wu","Junbiao Pang","Baochang Zhang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2403.18407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18406v1","updated":"2024-03-27T09:48:23Z","published":"2024-03-27T09:48:23Z","title":"An Image Grid Can Be Worth a Video: Zero-shot Video Question Answering\n Using a VLM","summary":" Stimulated by the sophisticated reasoning capabilities of recent Large\nLanguage Models (LLMs), a variety of strategies for bridging video modality\nhave been devised. A prominent strategy involves Video Language Models\n(VideoLMs), which train a learnable interface with video data to connect\nadvanced vision encoders with LLMs. Recently, an alternative strategy has\nsurfaced, employing readily available foundation models, such as VideoLMs and\nLLMs, across multiple stages for modality bridging. In this study, we introduce\na simple yet novel strategy where only a single Vision Language Model (VLM) is\nutilized. Our starting point is the plain insight that a video comprises a\nseries of images, or frames, interwoven with temporal information. The essence\nof video comprehension lies in adeptly managing the temporal aspects along with\nthe spatial details of each frame. Initially, we transform a video into a\nsingle composite image by arranging multiple frames in a grid layout. The\nresulting single image is termed as an image grid. This format, while\nmaintaining the appearance of a solitary image, effectively retains temporal\ninformation within the grid structure. Therefore, the image grid approach\nenables direct application of a single high-performance VLM without\nnecessitating any video-data training. Our extensive experimental analysis\nacross ten zero-shot video question answering benchmarks, including five\nopen-ended and five multiple-choice benchmarks, reveals that the proposed Image\nGrid Vision Language Model (IG-VLM) surpasses the existing methods in nine out\nof ten benchmarks.\n","authors":["Wonkyun Kim","Changin Choi","Wonseok Lee","Wonjong Rhee"],"pdf_url":"https://arxiv.org/pdf/2403.18406v1.pdf","comment":"Our code is available at https://github.com/imagegridworth/IG-VLM"},{"id":"http://arxiv.org/abs/2403.05262v2","updated":"2024-03-27T09:43:41Z","published":"2024-03-08T12:35:07Z","title":"Debiasing Multimodal Large Language Models","summary":" In the realms of computer vision and natural language processing, Large\nVision-Language Models (LVLMs) have become indispensable tools, proficient in\ngenerating textual descriptions based on visual inputs. 
Despite their\nadvancements, our investigation reveals a noteworthy bias in the generated\ncontent, where the output is primarily influenced by the underlying Large\nLanguage Models (LLMs) prior rather than the input image. Our empirical\nexperiments underscore the persistence of this bias, as LVLMs often provide\nconfident answers even in the absence of relevant images or given incongruent\nvisual input. To rectify these biases and redirect the model's focus toward\nvision information, we introduce two simple, training-free strategies. Firstly,\nfor tasks such as classification or multi-choice question-answering (QA), we\npropose a ``calibration'' step through affine transformation to adjust the\noutput distribution. This ``Post-Hoc debias'' approach ensures uniform scores\nfor each answer when the image is absent, serving as an effective\nregularization technique to alleviate the influence of LLM priors. For more\nintricate open-ended generation tasks, we extend this method to ``Debias\nsampling'', drawing inspirations from contrastive decoding methods.\nFurthermore, our investigation sheds light on the instability of LVLMs across\nvarious decoding configurations. Through systematic exploration of different\nsettings, we significantly enhance performance, surpassing reported results and\nraising concerns about the fairness of existing evaluations. Comprehensive\nexperiments substantiate the effectiveness of our proposed strategies in\nmitigating biases. These strategies not only prove beneficial in minimizing\nhallucinations but also contribute to the generation of more helpful and\nprecise illustrations.\n","authors":["Yi-Fan Zhang","Weichen Yu","Qingsong Wen","Xue Wang","Zhang Zhang","Liang Wang","Rong Jin","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2403.05262v2.pdf","comment":"38 pages, 17 figures"},{"id":"http://arxiv.org/abs/2401.01647v2","updated":"2024-03-27T09:39:41Z","published":"2024-01-03T09:46:43Z","title":"SIGNeRF: Scene Integrated Generation for Neural Radiance Fields","summary":" Advances in image diffusion models have recently led to notable improvements\nin the generation of high-quality images. In combination with Neural Radiance\nFields (NeRFs), they enabled new opportunities in 3D generation. However, most\ngenerative 3D approaches are object-centric and applying them to editing\nexisting photorealistic scenes is not trivial. We propose SIGNeRF, a novel\napproach for fast and controllable NeRF scene editing and scene-integrated\nobject generation. A new generative update strategy ensures 3D consistency\nacross the edited images, without requiring iterative optimization. We find\nthat depth-conditioned diffusion models inherently possess the capability to\ngenerate 3D consistent views by requesting a grid of images instead of single\nviews. Based on these insights, we introduce a multi-view reference sheet of\nmodified images. Our method updates an image collection consistently based on\nthe reference sheet and refines the original NeRF with the newly generated\nimage set in one go. 
By exploiting the depth conditioning mechanism of the\nimage diffusion model, we gain fine control over the spatial location of the\nedit and enforce shape guidance by a selected region or an external mesh.\n","authors":["Jan-Niklas Dihlmann","Andreas Engelhardt","Hendrik Lensch"],"pdf_url":"https://arxiv.org/pdf/2401.01647v2.pdf","comment":"Project Page: https://signerf.jdihlmann.com"},{"id":"http://arxiv.org/abs/2403.18397v1","updated":"2024-03-27T09:35:56Z","published":"2024-03-27T09:35:56Z","title":"Colour and Brush Stroke Pattern Recognition in Abstract Art using\n Modified Deep Convolutional Generative Adversarial Networks","summary":" Abstract Art is an immensely popular, discussed form of art that often has\nthe ability to depict the emotions of an artist. Many researchers have made\nattempts to study abstract art in the form of edge detection, brush stroke and\nemotion recognition algorithms using machine and deep learning. This paper\ndescribes the study of a wide distribution of abstract paintings using\nGenerative Adversarial Neural Networks (GAN). GANs have the ability to learn and\nreproduce a distribution, enabling researchers and scientists to effectively\nexplore and study the generated image space. However, the challenge lies in\ndeveloping an efficient GAN architecture that overcomes common training\npitfalls. This paper addresses this challenge by introducing a modified-DCGAN\n(mDCGAN) specifically designed for high-quality artwork generation. The\napproach involves a thorough exploration of the modifications made, delving\ninto the intricate workings of DCGANs, optimisation techniques, and\nregularisation methods aimed at improving stability and realism in art\ngeneration, enabling effective study of generated patterns. The proposed mDCGAN\nincorporates meticulous adjustments in layer configurations and architectural\nchoices, offering tailored solutions to the unique demands of art generation\nwhile effectively combating issues like mode collapse and gradient vanishing.\nFurther, this paper explores the generated latent space by performing random\nwalks to understand vector relationships between brush strokes and colours in\nthe abstract art space, and presents a statistical analysis of unstable outputs after a\ncertain period of GAN training, comparing their significant differences. These\nfindings validate the effectiveness of the proposed approach, emphasising its\npotential to revolutionise the field of digital art generation and the digital art\necosystem.\n","authors":["Srinitish Srinivasan","Varenya Pathak"],"pdf_url":"https://arxiv.org/pdf/2403.18397v1.pdf","comment":"28 pages, 5 tables, 7 figures"},{"id":"http://arxiv.org/abs/2403.11656v2","updated":"2024-03-27T09:34:44Z","published":"2024-03-18T10:53:00Z","title":"LocalStyleFool: Regional Video Style Transfer Attack Using Segment\n Anything Model","summary":" Previous work has shown that well-crafted adversarial perturbations can\nthreaten the security of video recognition systems. Attackers can invade such\nmodels with a low query budget when the perturbations are semantic-invariant,\nsuch as StyleFool. Despite the query efficiency, the naturalness of the minutia\nareas still requires amelioration, since StyleFool leverages style transfer to\nall pixels in each frame. To close the gap, we propose LocalStyleFool, an\nimproved black-box video adversarial attack that superimposes regional\nstyle-transfer-based perturbations on videos. 
Benefiting from the popularity\nand scalable usability of the Segment Anything Model (SAM), we first extract\ndifferent regions according to semantic information and then track them through\nthe video stream to maintain temporal consistency. Then, we add\nstyle-transfer-based perturbations to several regions selected based on the\nassociative criterion of transfer-based gradient information and regional area.\nPerturbation fine adjustment then follows to make the stylized videos adversarial.\nWe demonstrate that LocalStyleFool can improve both intra-frame and inter-frame\nnaturalness through a human-assessed survey, while maintaining competitive\nfooling rate and query efficiency. Successful experiments on the\nhigh-resolution dataset also showcase that scrupulous segmentation of SAM helps\nto improve the scalability of adversarial attacks under high-resolution data.\n","authors":["Yuxin Cao","Jinghao Li","Xi Xiao","Derui Wang","Minhui Xue","Hao Ge","Wei Liu","Guangwu Hu"],"pdf_url":"https://arxiv.org/pdf/2403.11656v2.pdf","comment":"Accepted to 2024 IEEE Security and Privacy Workshops (SPW)"},{"id":"http://arxiv.org/abs/2403.18388v1","updated":"2024-03-27T09:25:20Z","published":"2024-03-27T09:25:20Z","title":"FTBC: Forward Temporal Bias Correction for Optimizing ANN-SNN Conversion","summary":" Spiking Neural Networks (SNNs) offer a promising avenue for energy-efficient\ncomputing compared with Artificial Neural Networks (ANNs), closely mirroring\nbiological neural processes. However, this potential comes with inherent\nchallenges in directly training SNNs through spatio-temporal backpropagation --\nstemming from the temporal dynamics of spiking neurons and their discrete\nsignal processing -- which necessitates alternative ways of training, most\nnotably through ANN-SNN conversion. In this work, we introduce a lightweight\nForward Temporal Bias Correction (FTBC) technique, aimed at enhancing\nconversion accuracy without the computational overhead. We ground our method in\ntheoretical findings that, through proper temporal bias calibration, the\nexpected error of ANN-SNN conversion can be reduced to zero after each time\nstep. We further propose a heuristic algorithm for finding the temporal bias\nonly in the forward pass, thus eliminating the computational burden of\nbackpropagation, and we evaluate our method on CIFAR-10/100 and ImageNet\ndatasets, achieving a notable increase in accuracy on all datasets. Codes are\nreleased at a GitHub repository.\n","authors":["Xiaofeng Wu","Velibor Bojkovic","Bin Gu","Kun Suo","Kai Zou"],"pdf_url":"https://arxiv.org/pdf/2403.18388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06733v3","updated":"2024-03-27T09:24:56Z","published":"2023-12-11T10:43:28Z","title":"TULIP: Transformer for Upsampling of LiDAR Point Cloud","summary":" LiDAR Upsampling is a challenging task for the perception systems of robots\nand autonomous vehicles, due to the sparse and irregular structure of\nlarge-scale scene contexts. Recent works propose to solve this problem by\nconverting LiDAR data from 3D Euclidean space into an image super-resolution\nproblem in 2D image space. Although their methods can generate high-resolution\nrange images with fine-grained details, the resulting 3D point clouds often\nblur out details and predict invalid points. In this paper, we propose TULIP, a\nnew method to reconstruct high-resolution LiDAR point clouds from\nlow-resolution LiDAR input. 
We also follow a range image-based approach but\nspecifically modify the patch and window geometries of a Swin-Transformer-based\nnetwork to better fit the characteristics of range images. We conducted several\nexperiments on three public real-world and simulated datasets. TULIP\noutperforms state-of-the-art methods in all relevant metrics and generates\nrobust and more realistic point clouds than prior works.\n","authors":["Bin Yang","Patrick Pfreundschuh","Roland Siegwart","Marco Hutter","Peyman Moghadam","Vaishakh Patil"],"pdf_url":"https://arxiv.org/pdf/2312.06733v3.pdf","comment":"The paper was accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.05218v2","updated":"2024-03-27T09:21:42Z","published":"2024-03-08T11:09:46Z","title":"3D Face Reconstruction Using A Spectral-Based Graph Convolution Encoder","summary":" Monocular 3D face reconstruction plays a crucial role in avatar generation,\nwith significant demand in web-related applications such as generating virtual\nfinancial advisors in FinTech. Current reconstruction methods predominantly\nrely on deep learning techniques and employ 2D self-supervision as a means to\nguide model learning. However, these methods encounter challenges in capturing\nthe comprehensive 3D structural information of the face due to the utilization\nof 2D images for model training purposes. To overcome this limitation and\nenhance the reconstruction of 3D structural features, we propose an innovative\napproach that integrates existing 2D features with 3D features to guide the\nmodel learning process. Specifically, we introduce the 3D-ID Loss, which\nleverages the high-dimensional structure features extracted from a\nSpectral-Based Graph Convolution Encoder applied to the facial mesh. This\napproach surpasses the sole reliance on the 3D information provided by the\nfacial mesh vertex coordinates. Our model is trained using 2D-3D data pairs\nfrom a combination of datasets and achieves state-of-the-art performance on the\nNoW benchmark.\n","authors":["Haoxin Xu","Zezheng Zhao","Yuxin Cao","Chunyu Chen","Hao Ge","Ziyao Liu"],"pdf_url":"https://arxiv.org/pdf/2403.05218v2.pdf","comment":"4 pages, 3 figures. Accepted to WWW 2024"},{"id":"http://arxiv.org/abs/2403.18383v1","updated":"2024-03-27T09:21:07Z","published":"2024-03-27T09:21:07Z","title":"Generative Multi-modal Models are Good Class-Incremental Learners","summary":" In class-incremental learning (CIL) scenarios, the phenomenon of catastrophic\nforgetting caused by the classifier's bias towards the current task has long\nposed a significant challenge. It is mainly caused by the characteristics of\ndiscriminative models. With the growing popularity of generative\nmulti-modal models, we explore replacing discriminative models with\ngenerative ones for CIL. However, transitioning from discriminative to\ngenerative models requires addressing two key challenges. The primary challenge\nlies in transferring the generated textual information into the classification\nof distinct categories. Additionally, it requires formulating the task of CIL\nwithin a generative framework. To this end, we propose a novel generative\nmulti-modal model (GMM) framework for class-incremental learning. Our approach\ndirectly generates labels for images using an adapted generative model. After\nobtaining the detailed text, we use a text encoder to extract text features and\nemploy feature matching to determine the most similar label as the\nclassification prediction. 
In the conventional CIL settings, we achieve\nsignificantly better results in long-sequence task scenarios. Under the\nFew-shot CIL setting, we have improved by at least 14\\% accuracy over all the\ncurrent state-of-the-art methods with significantly less forgetting. Our code\nis available at \\url{https://github.com/DoubleClass/GMM}.\n","authors":["Xusheng Cao","Haori Lu","Linlan Huang","Xialei Liu","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2403.18383v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2401.17879v2","updated":"2024-03-27T09:17:14Z","published":"2024-01-31T14:36:49Z","title":"AEROBLADE: Training-Free Detection of Latent Diffusion Images Using\n Autoencoder Reconstruction Error","summary":" With recent text-to-image models, anyone can generate deceptively realistic\nimages with arbitrary contents, fueling the growing threat of visual\ndisinformation. A key enabler for generating high-resolution images with low\ncomputational cost has been the development of latent diffusion models (LDMs).\nIn contrast to conventional diffusion models, LDMs perform the denoising\nprocess in the low-dimensional latent space of a pre-trained autoencoder (AE)\ninstead of the high-dimensional image space. Despite their relevance, the\nforensic analysis of LDMs is still in its infancy. In this work we propose\nAEROBLADE, a novel detection method which exploits an inherent component of\nLDMs: the AE used to transform images between image and latent space. We find\nthat generated images can be more accurately reconstructed by the AE than real\nimages, allowing for a simple detection approach based on the reconstruction\nerror. Most importantly, our method is easy to implement and does not require\nany training, yet nearly matches the performance of detectors that rely on\nextensive training. We empirically demonstrate that AEROBLADE is effective\nagainst state-of-the-art LDMs, including Stable Diffusion and Midjourney.\nBeyond detection, our approach allows for the qualitative analysis of images,\nwhich can be leveraged for identifying inpainted regions. We release our code\nand data at https://github.com/jonasricker/aeroblade .\n","authors":["Jonas Ricker","Denis Lukovnikov","Asja Fischer"],"pdf_url":"https://arxiv.org/pdf/2401.17879v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.00174v2","updated":"2024-03-27T09:13:19Z","published":"2024-02-29T22:58:13Z","title":"A citizen science toolkit to collect human perceptions of urban\n environments using open street view images","summary":" Street View-level Imagery (SVI) is a valuable data source for studies (e.g.,\nenvironmental assessments, green space identification or land cover\nclassification). While commercial SVI is available, such providers commonly\nrestrict copying or reuse in ways necessary for research. Open SVI datasets are\nreadily available from less restrictive sources, such as Mapillary, but due to\nthe heterogeneity of the images, these require substantial preprocessing,\nfiltering, and careful quality checks. We present an efficient method for\nautomated downloading, processing, cropping, and filtering open SVI, to be used\nin a survey of human perceptions of the streets portrayed in these images. We\ndemonstrate our open-source reusable SVI preparation and smartphone-friendly\nperception-survey software with Amsterdam (Netherlands) as the case study.\nUsing a citizen science approach, we collected from 331 people 22,637 ratings\nabout their perceptions for various criteria. 
We have published our software in\na public repository for future re-use and reproducibility.\n","authors":["Matthew Danish","SM Labib","Britta Ricker","Marco Helbich"],"pdf_url":"https://arxiv.org/pdf/2403.00174v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18373v1","updated":"2024-03-27T09:10:01Z","published":"2024-03-27T09:10:01Z","title":"BAM: Box Abstraction Monitors for Real-time OoD Detection in Object\n Detection","summary":" Out-of-distribution (OoD) detection techniques for deep neural networks\n(DNNs) become crucial thanks to their filtering of abnormal inputs, especially\nwhen DNNs are used in safety-critical applications and interact with an open\nand dynamic environment. Nevertheless, integrating OoD detection into\nstate-of-the-art (SOTA) object detection DNNs poses significant challenges,\npartly due to the complexity introduced by the SOTA OoD construction methods,\nwhich require the modification of DNN architecture and the introduction of\ncomplex loss functions. This paper proposes a simple, yet surprisingly\neffective, method that requires neither retraining nor architectural change in\nobject detection DNN, called Box Abstraction-based Monitors (BAM). The novelty\nof BAM stems from using a finite union of convex box abstractions to capture\nthe learned features of objects for in-distribution (ID) data, and an important\nobservation that features from OoD data are more likely to fall outside of\nthese boxes. The union of convex regions within the feature space allows the\nformation of non-convex and interpretable decision boundaries, overcoming the\nlimitations of VOS-like detectors without sacrificing real-time performance.\nExperiments integrating BAM into Faster R-CNN-based object detection DNNs\ndemonstrate a considerably improved performance against SOTA OoD detection\ntechniques.\n","authors":["Changshun Wu","Weicheng He","Chih-Hong Cheng","Xiaowei Huang","Saddek Bensalem"],"pdf_url":"https://arxiv.org/pdf/2403.18373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17905v2","updated":"2024-03-27T09:07:02Z","published":"2024-03-26T17:45:06Z","title":"Scalable Non-Cartesian Magnetic Resonance Imaging with R2D2","summary":" We propose a new approach for non-Cartesian magnetic resonance image\nreconstruction. While unrolled architectures provide robustness via\ndata-consistency layers, embedding measurement operators in Deep Neural Network\n(DNN) can become impractical at large scale. Alternative Plug-and-Play (PnP)\napproaches, where the denoising DNNs are blind to the measurement setting, are\nnot affected by this limitation and have also proven effective, but their\nhighly iterative nature also affects scalability. To address this scalability\nchallenge, we leverage the \"Residual-to-Residual DNN series for high-Dynamic\nrange imaging (R2D2)\" approach recently introduced in astronomical imaging.\nR2D2's reconstruction is formed as a series of residual images, iteratively\nestimated as outputs of DNNs taking the previous iteration's image estimate and\nassociated data residual as inputs. The method can be interpreted as a learned\nversion of the Matching Pursuit algorithm. We demonstrate R2D2 in simulation,\nconsidering radial k-space sampling acquisition sequences. 
Our preliminary\nresults suggest that R2D2 achieves: (i) suboptimal performance compared to its\nunrolled incarnation R2D2-Net, which is however non-scalable due to the\nnecessary embedding of NUFFT-based data-consistency layers; (ii) superior\nreconstruction quality to a scalable version of R2D2-Net embedding an FFT-based\napproximation for data consistency; (iii) superior reconstruction quality to\nPnP, while only requiring few iterations.\n","authors":["Yiwei Chen","Chao Tang","Amir Aghabiglou","Chung San Chu","Yves Wiaux"],"pdf_url":"https://arxiv.org/pdf/2403.17905v2.pdf","comment":"submitted to IEEE EUSIPCO 2024"},{"id":"http://arxiv.org/abs/2403.18370v1","updated":"2024-03-27T09:06:36Z","published":"2024-03-27T09:06:36Z","title":"Ship in Sight: Diffusion Models for Ship-Image Super Resolution","summary":" In recent years, remarkable advancements have been achieved in the field of\nimage generation, primarily driven by the escalating demand for high-quality\noutcomes across various image generation subtasks, such as inpainting,\ndenoising, and super resolution. A major effort is devoted to exploring the\napplication of super-resolution techniques to enhance the quality of\nlow-resolution images. In this context, our method explores in depth the\nproblem of ship image super resolution, which is crucial for coastal and port\nsurveillance. We investigate the opportunity given by the growing interest in\ntext-to-image diffusion models, taking advantage of the prior knowledge that\nsuch foundation models have already learned. In particular, we present a\ndiffusion-model-based architecture that leverages text conditioning during\ntraining while being class-aware, to best preserve the crucial details of the\nships during the generation of the super-resoluted image. Since the specificity\nof this task and the scarcity availability of off-the-shelf data, we also\nintroduce a large labeled ship dataset scraped from online ship images, mostly\nfrom ShipSpotting\\footnote{\\url{www.shipspotting.com}} website. Our method\nachieves more robust results than other deep learning models previously\nemployed for super resolution, as proven by the multiple experiments performed.\nMoreover, we investigate how this model can benefit downstream tasks, such as\nclassification and object detection, thus emphasizing practical implementation\nin a real-world scenario. Experimental results show flexibility, reliability,\nand impressive performance of the proposed framework over state-of-the-art\nmethods for different tasks. The code is available at:\nhttps://github.com/LuigiSigillo/ShipinSight .\n","authors":["Luigi Sigillo","Riccardo Fosco Gramaccioni","Alessandro Nicolosi","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2403.18370v1.pdf","comment":"Accepted at 2024 International Joint Conference on Neural Networks\n (IJCNN)"},{"id":"http://arxiv.org/abs/2312.10114v2","updated":"2024-03-27T09:00:54Z","published":"2023-12-15T09:49:21Z","title":"FoMo-Bench: a multi-modal, multi-scale and multi-task Forest Monitoring\n Benchmark for remote sensing foundation models","summary":" Forests are an essential part of Earth's ecosystems and natural systems, as\nwell as providing services on which humanity depends, yet they are rapidly\nchanging as a result of land use decisions and climate change. 
Understanding\nand mitigating negative effects requires parsing data on forests at global\nscale from a broad array of sensory modalities, and recently many such problems\nhave been approached using machine learning algorithms for remote sensing. To\ndate, forest-monitoring problems have largely been addressed in isolation.\nInspired by the rise of foundation models for computer vision and remote\nsensing, we here present the first unified Forest Monitoring Benchmark\n(FoMo-Bench). FoMo-Bench consists of 15 diverse datasets encompassing\nsatellite, aerial, and inventory data, covering a variety of geographical\nregions, and including multispectral, red-green-blue, synthetic aperture radar\n(SAR) and LiDAR data with various temporal, spatial and spectral resolutions.\nFoMo-Bench includes multiple types of forest-monitoring tasks, spanning\nclassification, segmentation, and object detection. To further enhance the\ndiversity of tasks and geographies represented in FoMo-Bench, we introduce a\nnovel global dataset, TalloS, combining satellite imagery with ground-based\nannotations for tree species classification, encompassing 1,000+ categories\nacross multiple hierarchical taxonomic levels (species, genus, family).\nFinally, we propose FoMo-Net, a baseline foundation model with the capacity to\nprocess any combination of commonly used spectral bands in remote sensing,\nacross diverse ground sampling distances and geographical locations worldwide.\nThis work aims to inspire research collaborations between machine learning and\nforest biology researchers in exploring scalable multi-modal and multi-task\nmodels for forest monitoring. All code and data will be made publicly\navailable.\n","authors":["Nikolaos Ioannis Bountos","Arthur Ouaknine","David Rolnick"],"pdf_url":"https://arxiv.org/pdf/2312.10114v2.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2402.19473v2","updated":"2024-03-27T09:00:25Z","published":"2024-02-29T18:59:01Z","title":"Retrieval-Augmented Generation for AI-Generated Content: A Survey","summary":" The development of Artificial Intelligence Generated Content (AIGC) has been\nfacilitated by advancements in model algorithms, the increasing scale of\nfoundation models, and the availability of ample high-quality datasets. While\nAIGC has achieved remarkable performance, it still faces several challenges,\nsuch as the difficulty of maintaining up-to-date and long-tail knowledge, the\nrisk of data leakage, and the high costs associated with training and\ninference. Retrieval-Augmented Generation(RAG) has recently emerged as a\nparadigm to address such challenges. In particular, RAG introduces the\ninformation retrieval process, which enhances the generation process by\nretrieving relevant objects from available data stores, leading to higher\naccuracy and better robustness. In this paper, we comprehensively review\nexisting efforts that integrate RAG technique into AIGC scenarios. We first\nclassify RAG foundations according to how the retriever augments the generator,\ndistilling the fundamental abstractions of the augmentation methodologies for\nvarious retrievers and generators. This unified perspective encompasses all RAG\nscenarios, illuminating advancements and pivotal technologies that help with\npotential future progress. 
We also summarize additional enhancement methods\nfor RAG, facilitating effective engineering and implementation of RAG systems.\nThen, from another view, we survey practical applications of RAG across\ndifferent modalities and tasks, offering valuable references for researchers\nand practitioners. Furthermore, we introduce the benchmarks for RAG, discuss\nthe limitations of current RAG systems, and suggest potential directions for\nfuture research. Project Repo: https://github.com/hymie122/RAG-Survey.\n","authors":["Penghao Zhao","Hailin Zhang","Qinhan Yu","Zhengren Wang","Yunteng Geng","Fangcheng Fu","Ling Yang","Wentao Zhang","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2402.19473v2.pdf","comment":"Citing 380 papers, 36 pages, 16 figures. Project:\n https://github.com/hymie122/RAG-Survey"},{"id":"http://arxiv.org/abs/2310.03325v2","updated":"2024-03-27T08:54:35Z","published":"2023-10-05T05:41:21Z","title":"Learning Concept-Based Causal Transition and Symbolic Reasoning for\n Visual Planning","summary":" Visual planning simulates how humans make decisions to achieve desired goals\nin the form of searching for visual causal transitions between an initial\nvisual state and a final visual goal state. It has become increasingly\nimportant in egocentric vision with its advantages in guiding agents to perform\ndaily tasks in complex environments. In this paper, we propose an interpretable\nand generalizable visual planning framework consisting of i) a novel\nSubstitution-based Concept Learner (SCL) that abstracts visual inputs into\ndisentangled concept representations, ii) symbol abstraction and reasoning that\nperforms task planning via the self-learned symbols, and iii) a Visual Causal\nTransition model (ViCT) that grounds visual causal transitions to semantically\nsimilar real-world actions. Given an initial state, we perform goal-conditioned\nvisual planning with a symbolic reasoning method fueled by the learned\nrepresentations and causal transitions to reach the goal state. To verify the\neffectiveness of the proposed model, we collect a large-scale visual planning\ndataset based on AI2-THOR, dubbed CCTP. Extensive experiments on this\nchallenging dataset demonstrate the superior performance of our method in\nvisual task planning. Empirically, we show that our framework can generalize to\nunseen task trajectories, unseen object categories, and real-world data.\nFurther details of this work are provided at\nhttps://fqyqc.github.io/ConTranPlan/.\n","authors":["Yilue Qian","Peiyu Yu","Ying Nian Wu","Yao Su","Wei Wang","Lifeng Fan"],"pdf_url":"https://arxiv.org/pdf/2310.03325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15837v2","updated":"2024-03-27T08:54:06Z","published":"2024-03-23T13:24:31Z","title":"Centered Masking for Language-Image Pre-Training","summary":" We introduce Gaussian masking for Language-Image Pre-Training (GLIP), a novel,\nstraightforward, and effective technique for masking image patches during\npre-training of a vision-language model. GLIP builds on Fast Language-Image\nPre-Training (FLIP), which randomly masks image patches while training a CLIP\nmodel. GLIP replaces random masking with centered masking, which uses a Gaussian\ndistribution and is inspired by the importance of image patches at the center\nof the image. GLIP retains the same computational savings as FLIP, while\nimproving performance across a range of downstream datasets and tasks, as\ndemonstrated by our experimental results. 
We show the benefits of GLIP to be\neasy to obtain, requiring no delicate tuning of the Gaussian, and also\napplicable to data sets containing images without an obvious center focus.\n","authors":["Mingliang Liang","Martha Larson"],"pdf_url":"https://arxiv.org/pdf/2403.15837v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18361v1","updated":"2024-03-27T08:53:13Z","published":"2024-03-27T08:53:13Z","title":"ViTAR: Vision Transformer with Any Resolution","summary":" This paper tackles a significant challenge faced by Vision Transformers\n(ViTs): their constrained scalability across different image resolutions.\nTypically, ViTs experience a performance decline when processing resolutions\ndifferent from those seen during training. Our work introduces two key\ninnovations to address this issue. Firstly, we propose a novel module for\ndynamic resolution adjustment, designed with a single Transformer block,\nspecifically to achieve highly efficient incremental token integration.\nSecondly, we introduce fuzzy positional encoding in the Vision Transformer to\nprovide consistent positional awareness across multiple resolutions, thereby\npreventing overfitting to any single training resolution. Our resulting model,\nViTAR (Vision Transformer with Any Resolution), demonstrates impressive\nadaptability, achieving 83.3\\% top-1 accuracy at a 1120x1120 resolution and\n80.4\\% accuracy at a 4032x4032 resolution, all while reducing computational\ncosts. ViTAR also shows strong performance in downstream tasks such as instance\nand semantic segmentation and can easily be combined with self-supervised learning\ntechniques like Masked AutoEncoder. Our work provides a cost-effective solution\nfor enhancing the resolution scalability of ViTs, paving the way for more\nversatile and efficient high-resolution image processing.\n","authors":["Qihang Fan","Quanzeng You","Xiaotian Han","Yongfei Liu","Yunzhe Tao","Huaibo Huang","Ran He","Hongxia Yang"],"pdf_url":"https://arxiv.org/pdf/2403.18361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18360v1","updated":"2024-03-27T08:52:44Z","published":"2024-03-27T08:52:44Z","title":"Learning CNN on ViT: A Hybrid Model to Explicitly Class-specific\n Boundaries for Domain Adaptation","summary":" Most domain adaptation (DA) methods are based on either convolutional\nneural networks (CNNs) or vision transformers (ViTs). They align the\ndistribution differences between domains as encoders without considering their\nunique characteristics. For instance, ViT excels in accuracy due to its\nsuperior ability to capture global representations, while CNN has an advantage\nin capturing local representations. This fact has led us to design a hybrid\nmethod to fully take advantage of both ViT and CNN, called Explicitly\nClass-specific Boundaries (ECB). ECB learns CNN on ViT to combine their\ndistinct strengths. In particular, we leverage ViT's properties to explicitly\nfind class-specific decision boundaries by maximizing the discrepancy between\nthe outputs of the two classifiers to detect target samples far from the source\nsupport. In contrast, the CNN encoder clusters target features based on the\npreviously defined class-specific boundaries by minimizing the discrepancy\nbetween the probabilities of the two classifiers. Finally, ViT and CNN mutually\nexchange knowledge to improve the quality of pseudo labels and reduce the\nknowledge discrepancies of these models. 
Compared to conventional DA methods,\nour ECB achieves superior performance, which verifies its effectiveness in this\nhybrid model. The project website can be found\nhttps://dotrannhattuong.github.io/ECB/website/.\n","authors":["Ba Hung Ngo","Nhat-Tuong Do-Tran","Tuan-Ngoc Nguyen","Hae-Gon Jeon","Tae Jong Choi"],"pdf_url":"https://arxiv.org/pdf/2403.18360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18356v1","updated":"2024-03-27T08:48:47Z","published":"2024-03-27T08:48:47Z","title":"MonoHair: High-Fidelity Hair Modeling from a Monocular Video","summary":" Undoubtedly, high-fidelity 3D hair is crucial for achieving realism, artistic\nexpression, and immersion in computer graphics. While existing 3D hair modeling\nmethods have achieved impressive performance, the challenge of achieving\nhigh-quality hair reconstruction persists: they either require strict capture\nconditions, making practical applications difficult, or heavily rely on learned\nprior data, obscuring fine-grained details in images. To address these\nchallenges, we propose MonoHair,a generic framework to achieve high-fidelity\nhair reconstruction from a monocular video, without specific requirements for\nenvironments. Our approach bifurcates the hair modeling process into two main\nstages: precise exterior reconstruction and interior structure inference. The\nexterior is meticulously crafted using our Patch-based Multi-View Optimization\n(PMVO). This method strategically collects and integrates hair information from\nmultiple views, independent of prior data, to produce a high-fidelity exterior\n3D line map. This map not only captures intricate details but also facilitates\nthe inference of the hair's inner structure. For the interior, we employ a\ndata-driven, multi-view 3D hair reconstruction method. This method utilizes 2D\nstructural renderings derived from the reconstructed exterior, mirroring the\nsynthetic 2D inputs used during training. This alignment effectively bridges\nthe domain gap between our training data and real-world data, thereby enhancing\nthe accuracy and reliability of our interior structure inference. Lastly, we\ngenerate a strand model and resolve the directional ambiguity by our hair\ngrowth algorithm. Our experiments demonstrate that our method exhibits\nrobustness across diverse hairstyles and achieves state-of-the-art performance.\nFor more results, please refer to our project page\nhttps://keyuwu-cs.github.io/MonoHair/.\n","authors":["Keyu Wu","Lingchen Yang","Zhiyi Kuang","Yao Feng","Xutao Han","Yuefan Shen","Hongbo Fu","Kun Zhou","Youyi Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.18356v1.pdf","comment":"Accepted by IEEE CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18351v1","updated":"2024-03-27T08:42:47Z","published":"2024-03-27T08:42:47Z","title":"Generating Diverse Agricultural Data for Vision-Based Farming\n Applications","summary":" We present a specialized procedural model for generating synthetic\nagricultural scenes, focusing on soybean crops, along with various weeds. This\nmodel is capable of simulating distinct growth stages of these plants, diverse\nsoil conditions, and randomized field arrangements under varying lighting\nconditions. The integration of real-world textures and environmental factors\ninto the procedural generation process enhances the photorealism and\napplicability of the synthetic data. 
Our dataset includes 12,000 images with\nsemantic labels, offering a comprehensive resource for computer vision tasks in\nprecision agriculture, such as semantic segmentation for autonomous weed\ncontrol. We validate our model's effectiveness by comparing the synthetic data\nagainst real agricultural images, demonstrating its potential to significantly\naugment training data for machine learning models in agriculture. This approach\nnot only provides a cost-effective solution for generating high-quality,\ndiverse data but also addresses specific needs in agricultural vision tasks\nthat are not fully covered by general-purpose models.\n","authors":["Mikolaj Cieslak","Umabharathi Govindarajan","Alejandro Garcia","Anuradha Chandrashekar","Torsten Hädrich","Aleksander Mendoza-Drosik","Dominik L. Michels","Sören Pirk","Chia-Chun Fu","Wojciech Pałubicki"],"pdf_url":"https://arxiv.org/pdf/2403.18351v1.pdf","comment":"10 pages, 8 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.18347v1","updated":"2024-03-27T08:38:56Z","published":"2024-03-27T08:38:56Z","title":"A Quantum Fuzzy-based Approach for Real-Time Detection of Solar Coronal\n Holes","summary":" The detection and analysis of solar coronal holes (CHs) is an important\nfield of study in the domain of solar physics. Mainly, it is required for the\nproper prediction of geomagnetic storms, which directly or indirectly affect\nvarious space and ground-based systems. To date, solar scientists have depended\non manual hand-drawn approaches for the detection of CHs. However, with the\nadvancement of image processing technologies, some automated image segmentation\nmethods have been used for the detection of CHs. In spite of this, fast and\naccurate detection of CHs is still a major issue. In this work, a novel\nquantum computing-based fast fuzzy c-mean technique has been developed for fast\ndetection of the CH regions. The task has been carried out in two stages: in the\nfirst stage, the solar image is segmented using a quantum computing-based\nfast fuzzy c-mean (QCFFCM) method, and in the later stage the CHs are extracted\nfrom the segmented image based on image morphological operations. In this\nwork, quantum computing is used to optimize the cost function of the fast\nfuzzy c-mean (FFCM) algorithm, where the quantum approximate optimization algorithm\n(QAOA) is used to optimize the quadratic part of the cost function. The\nproposed method has been tested on 193 \\AA{} SDO/AIA full-disk solar image\ndatasets and compared with existing techniques. The outcome shows\nthat the proposed method performs comparably to the existing ones within\na much shorter time.\n","authors":["Sanmoy Bandyopadhyay","Suman Kundu"],"pdf_url":"https://arxiv.org/pdf/2403.18347v1.pdf","comment":"14 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.18346v1","updated":"2024-03-27T08:38:49Z","published":"2024-03-27T08:38:49Z","title":"Quantifying and Mitigating Unimodal Biases in Multimodal Large Language\n Models: A Causal Perspective","summary":" Recent advancements in Large Language Models (LLMs) have facilitated the\ndevelopment of Multimodal LLMs (MLLMs). Despite their impressive capabilities,\nMLLMs often suffer from an over-reliance on unimodal biases (e.g., language\nbias and vision bias), leading to incorrect answers in complex multimodal\ntasks. To investigate this issue, we propose a causal framework to interpret\nthe biases in Visual Question Answering (VQA) problems. 
Within our framework,\nwe devise a causal graph to elucidate the predictions of MLLMs on VQA problems,\nand assess the causal effect of biases through an in-depth causal analysis.\nMotivated by the causal graph, we introduce a novel MORE dataset, consisting of\n12,000 VQA instances. This dataset is designed to challenge MLLMs' abilities,\nnecessitating multi-hop reasoning and the surmounting of unimodal biases.\nFurthermore, we propose two strategies to mitigate unimodal biases and enhance\nMLLMs' reasoning capabilities, including a Decompose-Verify-Answer (DeVA)\nframework for limited-access MLLMs and the refinement of open-source MLLMs\nthrough fine-tuning. Extensive quantitative and qualitative experiments offer\nvaluable insights for future research.\n","authors":["Meiqi Chen","Yixin Cao","Yan Zhang","Chaochao Lu"],"pdf_url":"https://arxiv.org/pdf/2403.18346v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18342v1","updated":"2024-03-27T08:32:48Z","published":"2024-03-27T08:32:48Z","title":"Learning Inclusion Matching for Animation Paint Bucket Colorization","summary":" Colorizing line art is a pivotal task in the production of hand-drawn cel\nanimation. This typically involves digital painters using a paint bucket tool\nto manually color each segment enclosed by lines, based on RGB values\npredetermined by a color designer. This frame-by-frame process is both arduous\nand time-intensive. Current automated methods mainly focus on segment matching.\nThis technique migrates colors from a reference to the target frame by aligning\nfeatures within line-enclosed segments across frames. However, issues like\nocclusion and wrinkles in animations often disrupt these direct\ncorrespondences, leading to mismatches. In this work, we introduce a new\nlearning-based inclusion matching pipeline, which directs the network to\ncomprehend the inclusion relationships between segments rather than relying\nsolely on direct visual correspondences. Our method features a two-stage\npipeline that integrates a coarse color warping module with an inclusion\nmatching module, enabling more nuanced and accurate colorization. To facilitate\nthe training of our network, we also develop a unique dataset, referred to as\nPaintBucket-Character. This dataset includes rendered line arts alongside their\ncolorized counterparts, featuring various 3D characters. Extensive experiments\ndemonstrate the effectiveness and superiority of our method over existing\ntechniques.\n","authors":["Yuekun Dai","Shangchen Zhou","Qinyue Li","Chongyi Li","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2403.18342v1.pdf","comment":"accepted to CVPR 2024. Project Page:\n https://ykdai.github.io/projects/InclusionMatching"},{"id":"http://arxiv.org/abs/2403.18339v1","updated":"2024-03-27T08:28:14Z","published":"2024-03-27T08:28:14Z","title":"H2ASeg: Hierarchical Adaptive Interaction and Weighting Network for\n Tumor Segmentation in PET/CT Images","summary":" Positron emission tomography (PET) combined with computed tomography (CT)\nimaging is routinely used in cancer diagnosis and prognosis by providing\ncomplementary information. Automatically segmenting tumors in PET/CT images can\nsignificantly improve examination efficiency. Traditional multi-modal\nsegmentation solutions mainly rely on concatenation operations for modality\nfusion, which fail to effectively model the non-linear dependencies between PET\nand CT modalities. 
Recent studies have investigated various approaches to\noptimize the fusion of modality-specific features for enhancing joint\nrepresentations. However, modality-specific encoders used in these methods\noperate independently, inadequately leveraging the synergistic relationships\ninherent in PET and CT modalities, for example, the complementarity between\nsemantics and structure. To address these issues, we propose a Hierarchical\nAdaptive Interaction and Weighting Network termed H2ASeg to explore the\nintrinsic cross-modal correlations and transfer potential complementary\ninformation. Specifically, we design a Modality-Cooperative Spatial Attention\n(MCSA) module that performs intra- and inter-modal interactions globally and\nlocally. Additionally, a Target-Aware Modality Weighting (TAMW) module is\ndeveloped to highlight tumor-related features within multi-modal features,\nthereby refining tumor segmentation. By embedding these modules across\ndifferent layers, H2ASeg can hierarchically model cross-modal correlations,\nenabling a nuanced understanding of both semantic and structural tumor\nfeatures. Extensive experiments demonstrate the superiority of H2ASeg,\noutperforming state-of-the-art methods on AutoPet-II and Hecktor2022\nbenchmarks. The code is released at https://github.com/G14nTDo4/H2ASeg.\n","authors":["Jinpeng Lu","Jingyun Chen","Linghan Cai","Songhan Jiang","Yongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.18339v1.pdf","comment":"10 pages,4 figures"},{"id":"http://arxiv.org/abs/2403.17301v2","updated":"2024-03-27T08:23:09Z","published":"2024-03-26T01:06:47Z","title":"Physical 3D Adversarial Attacks against Monocular Depth Estimation in\n Autonomous Driving","summary":" Deep learning-based monocular depth estimation (MDE), extensively applied in\nautonomous driving, is known to be vulnerable to adversarial attacks. Previous\nphysical attacks against MDE models rely on 2D adversarial patches, so they\nonly affect a small, localized region in the MDE map but fail under various\nviewpoints. To address these limitations, we propose 3D Depth Fool\n(3D$^2$Fool), the first 3D texture-based adversarial attack against MDE models.\n3D$^2$Fool is specifically optimized to generate 3D adversarial textures\nagnostic to model types of vehicles and to have improved robustness in bad\nweather conditions, such as rain and fog. Experimental results validate the\nsuperior performance of our 3D$^2$Fool across various scenarios, including\nvehicles, MDE models, weather conditions, and viewpoints. Real-world\nexperiments with printed 3D textures on physical vehicle models further\ndemonstrate that our 3D$^2$Fool can cause an MDE error of over 10 meters.\n","authors":["Junhao Zheng","Chenhao Lin","Jiahao Sun","Zhengyu Zhao","Qian Li","Chao Shen"],"pdf_url":"https://arxiv.org/pdf/2403.17301v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2306.02928v2","updated":"2024-03-27T08:21:17Z","published":"2023-06-05T14:45:38Z","title":"Weakly-Supervised Conditional Embedding for Referred Visual Search","summary":" This paper introduces a new challenge for image similarity search in the\ncontext of fashion, addressing the inherent ambiguity in this domain stemming\nfrom complex images. We present Referred Visual Search (RVS), a task allowing\nusers to define more precisely the desired similarity, following recent\ninterest in the industry. 
We release a new large public dataset,\nLAION-RVS-Fashion, consisting of 272k fashion products with 842k images\nextracted from LAION, designed explicitly for this task. However, unlike\ntraditional visual search methods in the industry, we demonstrate that superior\nperformance can be achieved by bypassing explicit object detection and adopting\nweakly-supervised conditional contrastive learning on image tuples. Our method\nis lightweight and demonstrates robustness, reaching Recall at one superior to\nstrong detection-based baselines against 2M distractors. Code, data and models\nare available at https://www.github.com/Simon-Lepage/CondViT-LRVSF .\n","authors":["Simon Lepage","Jérémie Mary","David Picard"],"pdf_url":"https://arxiv.org/pdf/2306.02928v2.pdf","comment":"28 pages, 13 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.18334v1","updated":"2024-03-27T08:16:33Z","published":"2024-03-27T08:16:33Z","title":"DODA: Diffusion for Object-detection Domain Adaptation in Agriculture","summary":" The diverse and high-quality content generated by recent generative models\ndemonstrates the great potential of using synthetic data to train downstream\nmodels. However, in vision, especially in object detection, related areas\nremain underexplored: synthetic images are merely used to balance the long\ntails of existing datasets, the accuracy of the generated labels is low, and\nthe full potential of generative models has not been exploited. In this\npaper, we propose DODA, a data synthesizer that can generate high-quality\nobject detection data for new domains in agriculture. Specifically, we improve\nthe controllability of layout-to-image generation by encoding the layout as an\nimage, thereby improving the quality of labels, and use a visual encoder to\nprovide visual clues for the diffusion model, decoupling visual features from\nthe diffusion model and empowering the model to generate data in new domains.\nOn the Global Wheat Head Detection (GWHD) Dataset, which is the\nlargest dataset in agriculture and contains diverse domains, using the data\nsynthesized by DODA improves the performance of the object detector by\n12.74-17.76 AP$_{50}$ in the domain that was significantly shifted from the\ntraining data.\n","authors":["Shuai Xiang","Pieter M. Blok","James Burridge","Haozhou Wang","Wei Guo"],"pdf_url":"https://arxiv.org/pdf/2403.18334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18330v1","updated":"2024-03-27T08:11:25Z","published":"2024-03-27T08:11:25Z","title":"Tracking-Assisted Object Detection with Event Cameras","summary":" Event-based object detection has recently garnered attention in the computer\nvision community due to the exceptional properties of event cameras, such as\nhigh dynamic range and no motion blur. However, feature asynchronism and\nsparsity cause objects to become invisible when they have no relative motion to\nthe camera, posing a significant challenge in the task. Prior works have studied\nvarious memory mechanisms to preserve as many features as possible at the\ncurrent time, guided by temporal clues. While these implicit-learned memories\nretain some short-term information, they still struggle to preserve long-term\nfeatures effectively. In this paper, we consider those invisible objects as\npseudo-occluded objects and aim to reveal their features. 
Firstly, we introduce\nvisibility attribute of objects and contribute an auto-labeling algorithm to\nappend additional visibility labels on an existing event camera dataset.\nSecondly, we exploit tracking strategies for pseudo-occluded objects to\nmaintain their permanence and retain their bounding boxes, even when features\nhave not been available for a very long time. These strategies can be treated\nas an explicit-learned memory guided by the tracking objective to record the\ndisplacements of objects across frames. Lastly, we propose a spatio-temporal\nfeature aggregation module to enrich the latent features and a consistency loss\nto increase the robustness of the overall pipeline. We conduct comprehensive\nexperiments to verify our method's effectiveness where still objects are\nretained but real occluded objects are discarded. The results demonstrate that\n(1) the additional visibility labels can assist in supervised training, and (2)\nour method outperforms state-of-the-art approaches with a significant\nimprovement of 7.9% absolute mAP.\n","authors":["Ting-Kang Yen","Igor Morawski","Shusil Dangi","Kai He","Chung-Yi Lin","Jia-Fong Yeh","Hung-Ting Su","Winston Hsu"],"pdf_url":"https://arxiv.org/pdf/2403.18330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18328v1","updated":"2024-03-27T08:09:04Z","published":"2024-03-27T08:09:04Z","title":"PIPNet3D: Interpretable Detection of Alzheimer in MRI Scans","summary":" Information from neuroimaging examinations (CT, MRI) is increasingly used to\nsupport diagnoses of dementia, e.g., Alzheimer's disease. While current\nclinical practice is mainly based on visual inspection and feature engineering,\nDeep Learning approaches can be used to automate the analysis and to discover\nnew image-biomarkers. Part-prototype neural networks (PP-NN) are an alternative\nto standard blackbox models, and have shown promising results in general\ncomputer vision. PP-NN's base their reasoning on prototypical image regions\nthat are learned fully unsupervised, and combined with a simple-to-understand\ndecision layer. We present PIPNet3D, a PP-NN for volumetric images. We apply\nPIPNet3D to the clinical case study of Alzheimer's Disease diagnosis from\nstructural Magnetic Resonance Imaging (sMRI). We assess the quality of\nprototypes under a systematic evaluation framework, propose new metrics to\nevaluate brain prototypes and perform an evaluation with domain experts. Our\nresults show that PIPNet3D is an interpretable, compact model for Alzheimer's\ndiagnosis with its reasoning well aligned to medical domain knowledge. Notably,\nPIPNet3D achieves the same accuracy as its blackbox counterpart; and removing\nthe remaining clinically irrelevant prototypes from its decision process does\nnot decrease predictive performance.\n","authors":["Lisa Anita De Santi","Jörg Schlötterer","Michael Scheschenja","Joel Wessendorf","Meike Nauta","Vincenzo Positano","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2403.18328v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10030v2","updated":"2024-03-27T07:52:10Z","published":"2024-03-15T05:30:29Z","title":"Multi-criteria Token Fusion with One-step-ahead Attention for Efficient\n Vision Transformers","summary":" Vision Transformer (ViT) has emerged as a prominent backbone for computer\nvision. For more efficient ViTs, recent works lessen the quadratic cost of the\nself-attention layer by pruning or fusing the redundant tokens. 
However, these\nworks faced the speed-accuracy trade-off caused by the loss of information.\nHere, we argue that token fusion needs to consider diverse relations between\ntokens to minimize information loss. In this paper, we propose a Multi-criteria\nToken Fusion (MCTF), that gradually fuses the tokens based on multi-criteria\n(e.g., similarity, informativeness, and size of fused tokens). Further, we\nutilize the one-step-ahead attention, which is the improved approach to capture\nthe informativeness of the tokens. By training the model equipped with MCTF\nusing a token reduction consistency, we achieve the best speed-accuracy\ntrade-off in the image classification (ImageNet1K). Experimental results prove\nthat MCTF consistently surpasses the previous reduction methods with and\nwithout training. Specifically, DeiT-T and DeiT-S with MCTF reduce FLOPs by\nabout 44% while improving the performance (+0.5%, and +0.3%) over the base\nmodel, respectively. We also demonstrate the applicability of MCTF in various\nVision Transformers (e.g., T2T-ViT, LV-ViT), achieving at least 31% speedup\nwithout performance degradation. Code is available at\nhttps://github.com/mlvlab/MCTF.\n","authors":["Sanghyeok Lee","Joonmyung Choi","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2403.10030v2.pdf","comment":"Conference on Computer Vision and Pattern Recognition (CVPR), 2024"},{"id":"http://arxiv.org/abs/2403.18321v1","updated":"2024-03-27T07:50:45Z","published":"2024-03-27T07:50:45Z","title":"Implementation of the Principal Component Analysis onto High-Performance\n Computer Facilities for Hyperspectral Dimensionality Reduction: Results and\n Comparisons","summary":" Dimensionality reduction represents a critical preprocessing step in order to\nincrease the efficiency and the performance of many hyperspectral imaging\nalgorithms. However, dimensionality reduction algorithms, such as the Principal\nComponent Analysis (PCA), suffer from their computationally demanding nature,\nbecoming advisable for their implementation onto high-performance computer\narchitectures for applications under strict latency constraints. This work\npresents the implementation of the PCA algorithm onto two different\nhigh-performance devices, namely, an NVIDIA Graphics Processing Unit (GPU) and\na Kalray manycore, uncovering a highly valuable set of tips and tricks in order\nto take full advantage of the inherent parallelism of these high-performance\ncomputing platforms, and hence, reducing the time that is required to process a\ngiven hyperspectral image. Moreover, the achieved results obtained with\ndifferent hyperspectral images have been compared with the ones that were\nobtained with a field programmable gate array (FPGA)-based implementation of\nthe PCA algorithm that has been recently published, providing, for the first\ntime in the literature, a comprehensive analysis in order to highlight the pros\nand cons of each option.\n","authors":["E. Martel","R. Lazcano","J. Lopez","D. Madroñal","R. Salvador","S. Lopez","E. Juarez","R. Guerra","C. Sanz","R. 
Sarmiento"],"pdf_url":"https://arxiv.org/pdf/2403.18321v1.pdf","comment":"30 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.18318v1","updated":"2024-03-27T07:40:51Z","published":"2024-03-27T07:40:51Z","title":"Uncertainty-Aware SAR ATR: Defending Against Adversarial Attacks via\n Bayesian Neural Networks","summary":" Adversarial attacks have demonstrated the vulnerability of Machine Learning\n(ML) image classifiers in Synthetic Aperture Radar (SAR) Automatic Target\nRecognition (ATR) systems. An adversarial attack can deceive the classifier\ninto making incorrect predictions by perturbing the input SAR images, for\nexample, with a few scatterers attached to the on-ground objects. Therefore, it\nis critical to develop robust SAR ATR systems that can detect potential\nadversarial attacks by leveraging the inherent uncertainty in ML classifiers,\nthereby effectively alerting human decision-makers. In this paper, we propose a\nnovel uncertainty-aware SAR ATR for detecting adversarial attacks.\nSpecifically, we leverage the capability of Bayesian Neural Networks (BNNs) in\nperforming image classification with quantified epistemic uncertainty to\nmeasure the confidence for each input SAR image. By evaluating the uncertainty,\nour method alerts when the input SAR image is likely to be adversarially\ngenerated. Simultaneously, we also generate visual explanations that reveal the\nspecific regions in the SAR image where the adversarial scatterers are likely\nto to be present, thus aiding human decision-making with hints of evidence of\nadversarial attacks. Experiments on the MSTAR dataset demonstrate that our\napproach can identify over 80% adversarial SAR images with fewer than 20% false\nalarms, and our visual explanations can identify up to over 90% of scatterers\nin an adversarial SAR image.\n","authors":["Tian Ye","Rajgopal Kannan","Viktor Prasanna","Carl Busart"],"pdf_url":"https://arxiv.org/pdf/2403.18318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08251v2","updated":"2024-03-27T07:33:42Z","published":"2022-12-16T02:43:52Z","title":"Task-Adaptive Saliency Guidance for Exemplar-free Class Incremental\n Learning","summary":" Exemplar-free Class Incremental Learning (EFCIL) aims to sequentially learn\ntasks with access only to data from the current one. EFCIL is of interest\nbecause it mitigates concerns about privacy and long-term storage of data,\nwhile at the same time alleviating the problem of catastrophic forgetting in\nincremental learning. In this work, we introduce task-adaptive saliency for\nEFCIL and propose a new framework, which we call Task-Adaptive Saliency\nSupervision (TASS), for mitigating the negative effects of saliency drift\nbetween different tasks. We first apply boundary-guided saliency to maintain\ntask adaptivity and \\textit{plasticity} on model attention. Besides, we\nintroduce task-agnostic low-level signals as auxiliary supervision to increase\nthe \\textit{stability} of model attention. Finally, we introduce a module for\ninjecting and recovering saliency noise to increase the robustness of saliency\npreservation. Our experiments demonstrate that our method can better preserve\nsaliency maps across tasks and achieve state-of-the-art results on the\nCIFAR-100, Tiny-ImageNet, and ImageNet-Subset EFCIL benchmarks. Code is\navailable at \\url{https://github.com/scok30/tass}.\n","authors":["Xialei Liu","Jiang-Tian Zhai","Andrew D. 
Bagdanov","Ke Li","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2212.08251v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2307.09136v2","updated":"2024-03-27T07:16:28Z","published":"2023-07-18T10:34:21Z","title":"The Effects of Mixed Sample Data Augmentation are Class Dependent","summary":" Mixed Sample Data Augmentation (MSDA) techniques, such as Mixup, CutMix, and\nPuzzleMix, have been widely acknowledged for enhancing performance in a variety\nof tasks. A previous study reported the class dependency of traditional data\naugmentation (DA), where certain classes benefit disproportionately compared to\nothers. This paper reveals a class dependent effect of MSDA, where some classes\nexperience improved performance while others experience degraded performance.\nThis research addresses the issue of class dependency in MSDA and proposes an\nalgorithm to mitigate it. The approach involves training on a mixture of MSDA\nand non-MSDA data, which not only mitigates the negative impact on the affected\nclasses, but also improves overall accuracy. Furthermore, we provide in-depth\nanalysis and discussion of why MSDA introduced class dependencies and which\nclasses are most likely to have them.\n","authors":["Haeil Lee","Hansang Lee","Junmo Kim"],"pdf_url":"https://arxiv.org/pdf/2307.09136v2.pdf","comment":"21 pages, 18 figures, Overall Revision"},{"id":"http://arxiv.org/abs/2402.18920v5","updated":"2024-03-27T07:16:21Z","published":"2024-02-29T07:26:23Z","title":"Spectral Meets Spatial: Harmonising 3D Shape Matching and Interpolation","summary":" Although 3D shape matching and interpolation are highly interrelated, they\nare often studied separately and applied sequentially to relate different 3D\nshapes, thus resulting in sub-optimal performance. In this work we present a\nunified framework to predict both point-wise correspondences and shape\ninterpolation between 3D shapes. To this end, we combine the deep functional\nmap framework with classical surface deformation models to map shapes in both\nspectral and spatial domains. On the one hand, by incorporating spatial maps,\nour method obtains more accurate and smooth point-wise correspondences compared\nto previous functional map methods for shape matching. On the other hand, by\nintroducing spectral maps, our method gets rid of commonly used but\ncomputationally expensive geodesic distance constraints that are only valid for\nnear-isometric shape deformations. Furthermore, we propose a novel test-time\nadaptation scheme to capture both pose-dominant and shape-dominant\ndeformations. Using different challenging datasets, we demonstrate that our\nmethod outperforms previous state-of-the-art methods for both shape matching\nand interpolation, even compared to supervised approaches.\n","authors":["Dongliang Cao","Marvin Eisenberger","Nafie El Amrani","Daniel Cremers","Florian Bernard"],"pdf_url":"https://arxiv.org/pdf/2402.18920v5.pdf","comment":"accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2308.13356v3","updated":"2024-03-27T07:12:09Z","published":"2023-08-25T13:05:06Z","title":"CEIMVEN: An Approach of Cutting Edge Implementation of Modified Versions\n of EfficientNet (V1-V2) Architecture for Breast Cancer Detection and\n Classification from Ultrasound Images","summary":" Undoubtedly breast cancer identifies itself as one of the most widespread and\nterrifying cancers across the globe. Millions of women are getting affected\neach year from it. 
Breast cancer remains the cancer responsible for the\nlargest number of deaths among women. In recent research, Medical\nImage Computing and Processing has been playing a significant role in\ndetecting and classifying breast cancers from ultrasound images and mammograms,\nalong with deep neural networks. In this research, we\nfocused mostly on our rigorous implementations and iterative result analysis of\ndifferent cutting-edge modified versions of EfficientNet architectures, namely\nEfficientNet-V1 (b0-b7) and EfficientNet-V2 (b0-b3), on ultrasound images,\nnamed CEIMVEN. We utilized a transfer learning approach to use the\npre-trained models of the EfficientNet versions. We applied hyper-parameter\ntuning procedures, added fully connected layers, discarded outliers and\nrecorded the accuracy results from our custom modified\nEfficientNet architectures. Our deep learning model training approach covered\nboth identifying the cancer-affected areas with region of interest\n(ROI) techniques and multi-class classification (benign, malignant and normal).\nThe approximate testing accuracies we obtained from the modified versions of\nEfficientNet-V1 (b0- 99.15%, b1- 98.58%, b2- 98.43%, b3- 98.01%, b4- 98.86%,\nb5- 97.72%, b6- 97.72%, b7- 98.72%) and EfficientNet-V2 (b0- 99.29%, b1-\n99.01%, b2- 98.72%, b3- 99.43%) show the strong potential of the deep\nlearning approach for the successful detection and\nclassification of breast cancers from ultrasound images at a very early\nstage. The code for this research is available here:\nhttps://github.com/ac005sheekar/CEIMVEN-Breast.\n","authors":["Sheekar Banerjee","Md. Kamrul Hasan Monir"],"pdf_url":"https://arxiv.org/pdf/2308.13356v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18301v1","updated":"2024-03-27T06:55:23Z","published":"2024-03-27T06:55:23Z","title":"Selective Mixup Fine-Tuning for Optimizing Non-Decomposable Objectives","summary":" The rise in internet usage has led to the generation of massive amounts of\ndata, resulting in the adoption of various supervised and semi-supervised\nmachine learning algorithms, which can effectively utilize the colossal amount\nof data to train models. However, before deploying these models in the real\nworld, they must be strictly evaluated on performance measures like worst-case\nrecall and satisfy constraints such as fairness. We find that current\nstate-of-the-art empirical techniques offer sub-optimal performance on these\npractical, non-decomposable performance objectives. On the other hand, the\ntheoretical techniques necessitate training a new model from scratch for each\nperformance objective. To bridge the gap, we propose SelMix, a selective\nmixup-based inexpensive fine-tuning technique for pre-trained models, to\noptimize for the desired objective. The core idea of our framework is to\ndetermine a sampling distribution to perform a mixup of features between\nsamples from particular classes such that it optimizes the given objective. We\ncomprehensively evaluate our technique against the existing empirical and\ntheoretically principled methods on standard benchmark datasets for imbalanced\nclassification. 
We find that proposed SelMix fine-tuning significantly improves\nthe performance for various practical non-decomposable objectives across\nbenchmarks.\n","authors":["Shrinivas Ramasubramanian","Harsh Rangwani","Sho Takemori","Kunal Samanta","Yuhei Umeda","Venkatesh Babu Radhakrishnan"],"pdf_url":"https://arxiv.org/pdf/2403.18301v1.pdf","comment":"ICLR 2024 SpotLight"},{"id":"http://arxiv.org/abs/2403.07392v3","updated":"2024-03-27T06:44:13Z","published":"2024-03-12T07:59:41Z","title":"ViT-CoMer: Vision Transformer with Convolutional Multi-scale Feature\n Interaction for Dense Predictions","summary":" Although Vision Transformer (ViT) has achieved significant success in\ncomputer vision, it does not perform well in dense prediction tasks due to the\nlack of inner-patch information interaction and the limited diversity of\nfeature scale. Most existing studies are devoted to designing vision-specific\ntransformers to solve the above problems, which introduce additional\npre-training costs. Therefore, we present a plain, pre-training-free, and\nfeature-enhanced ViT backbone with Convolutional Multi-scale feature\ninteraction, named ViT-CoMer, which facilitates bidirectional interaction\nbetween CNN and transformer. Compared to the state-of-the-art, ViT-CoMer has\nthe following advantages: (1) We inject spatial pyramid multi-receptive field\nconvolutional features into the ViT architecture, which effectively alleviates\nthe problems of limited local information interaction and single-feature\nrepresentation in ViT. (2) We propose a simple and efficient CNN-Transformer\nbidirectional fusion interaction module that performs multi-scale fusion across\nhierarchical features, which is beneficial for handling dense prediction tasks.\n(3) We evaluate the performance of ViT-CoMer across various dense prediction\ntasks, different frameworks, and multiple advanced pre-training. Notably, our\nViT-CoMer-L achieves 64.3% AP on COCO val2017 without extra training data, and\n62.1% mIoU on ADE20K val, both of which are comparable to state-of-the-art\nmethods. We hope ViT-CoMer can serve as a new backbone for dense prediction\ntasks to facilitate future research. The code will be released at\nhttps://github.com/Traffic-X/ViT-CoMer.\n","authors":["Chunlong Xia","Xinliang Wang","Feng Lv","Xin Hao","Yifeng Shi"],"pdf_url":"https://arxiv.org/pdf/2403.07392v3.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2403.18294v1","updated":"2024-03-27T06:40:26Z","published":"2024-03-27T06:40:26Z","title":"Multi-scale Unified Network for Image Classification","summary":" Convolutional Neural Networks (CNNs) have advanced significantly in visual\nrepresentation learning and recognition. However, they face notable challenges\nin performance and computational efficiency when dealing with real-world,\nmulti-scale image inputs. Conventional methods rescale all input images into a\nfixed size, wherein a larger fixed size favors performance but rescaling small\nsize images to a larger size incurs digitization noise and increased\ncomputation cost. In this work, we carry out a comprehensive, layer-wise\ninvestigation of CNN models in response to scale variation, based on Centered\nKernel Alignment (CKA) analysis. The observations reveal lower layers are more\nsensitive to input image scale variations than high-level layers. Inspired by\nthis insight, we propose Multi-scale Unified Network (MUSN) consisting of\nmulti-scale subnets, a unified network, and scale-invariant constraint. 
Our\nmethod divides the shallow layers into multi-scale subnets to enable feature\nextraction from multi-scale inputs, and the low-level features are unified in\ndeep layers for extracting high-level semantic features. A scale-invariant\nconstraint is posed to maintain feature consistency across different scales.\nExtensive experiments on ImageNet and other scale-diverse datasets, demonstrate\nthat MSUN achieves significant improvements in both model performance and\ncomputational efficiency. Particularly, MSUN yields an accuracy increase up to\n44.53% and diminishes FLOPs by 7.01-16.13% in multi-scale scenarios.\n","authors":["Wenzhuo Liu","Fei Zhu","Cheng-Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2403.18294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18293v1","updated":"2024-03-27T06:37:51Z","published":"2024-03-27T06:37:51Z","title":"Efficient Test-Time Adaptation of Vision-Language Models","summary":" Test-time adaptation with pre-trained vision-language models has attracted\nincreasing attention for tackling distribution shifts during the test time.\nThough prior studies have achieved very promising performance, they involve\nintensive computation which is severely unaligned with test-time adaptation. We\ndesign TDA, a training-free dynamic adapter that enables effective and\nefficient test-time adaptation with vision-language models. TDA works with a\nlightweight key-value cache that maintains a dynamic queue with few-shot pseudo\nlabels as values and the corresponding test-sample features as keys. Leveraging\nthe key-value cache, TDA allows adapting to test data gradually via progressive\npseudo label refinement which is super-efficient without incurring any\nbackpropagation. In addition, we introduce negative pseudo labeling that\nalleviates the adverse impact of pseudo label noises by assigning pseudo labels\nto certain negative classes when the model is uncertain about its pseudo label\npredictions. Extensive experiments over two benchmarks demonstrate TDA's\nsuperior effectiveness and efficiency as compared with the state-of-the-art.\nThe code has been released in \\url{https://kdiaaa.github.io/tda/}.\n","authors":["Adilbek Karmanov","Dayan Guan","Shijian Lu","Abdulmotaleb El Saddik","Eric Xing"],"pdf_url":"https://arxiv.org/pdf/2403.18293v1.pdf","comment":"Accepted to CVPR 2024. The code has been released in\n \\url{https://kdiaaa.github.io/tda/}"},{"id":"http://arxiv.org/abs/2403.18291v1","updated":"2024-03-27T06:28:19Z","published":"2024-03-27T06:28:19Z","title":"Towards Non-Exemplar Semi-Supervised Class-Incremental Learning","summary":" Deep neural networks perform remarkably well in close-world scenarios.\nHowever, novel classes emerged continually in real applications, making it\nnecessary to learn incrementally. Class-incremental learning (CIL) aims to\ngradually recognize new classes while maintaining the discriminability of old\nones. Existing CIL methods have two limitations: a heavy reliance on preserving\nold data for forgetting mitigation and the need for vast labeled data for\nknowledge adaptation. To overcome these issues, we propose a non-exemplar\nsemi-supervised CIL framework with contrastive learning and semi-supervised\nincremental prototype classifier (Semi-IPC). On the one hand, contrastive\nlearning helps the model learn rich representations, easing the trade-off\nbetween learning representations of new classes and forgetting that of old\nclasses. 
On the other hand, Semi-IPC learns a prototype for each class with\nunsupervised regularization, enabling the model to incrementally learn from\npartially labeled new data while maintaining the knowledge of old classes.\nExperiments on benchmark datasets demonstrate the strong performance of our\nmethod: without storing any old samples and only using less than 1% of labels,\nSemi-IPC outperforms advanced exemplar-based methods. We hope our work offers\nnew insights for future CIL research. The code will be made publicly available.\n","authors":["Wenzhuo Liu","Fei Zhu","Cheng-Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2403.18291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15864v2","updated":"2024-03-27T06:26:09Z","published":"2023-11-27T14:32:33Z","title":"InterControl: Generate Human Motion Interactions by Controlling Every\n Joint","summary":" Text-conditioned human motion synthesis has made remarkable progress with the\nemergence of diffusion models in recent research. However, the majority of\nthese motion diffusion models are primarily designed for a single character and\noverlook multi-human interactions. In our approach, we strive to explore this\nproblem by synthesizing human motion with interactions for a group of\ncharacters of any size. The key aspect of our approach is the adaptation of\nhuman-wise interactions as pairs of human joints that can be either in contact\nor separated by a desired distance. In contrast to existing methods that\nnecessitate training motion generation models on multi-human motion datasets\nwith a fixed number of characters, our approach inherently possesses the\nflexibility to model human interactions involving an arbitrary number of\nindividuals, thereby transcending the limitations imposed by the training data.\nWe introduce a novel controllable motion generation method, InterControl, to\nencourage the synthesized motions to maintain the desired distance between\njoint pairs. It consists of a motion controller and an inverse kinematics\nguidance module that realistically and accurately aligns the joints of\nsynthesized characters to the desired location. Furthermore, we demonstrate\nthat the distance between joint pairs for human-wise interactions can be\ngenerated using an off-the-shelf Large Language Model (LLM). Experimental\nresults highlight the capability of our framework to generate interactions with\nmultiple human characters and its potential to work with off-the-shelf\nphysics-based character simulators.\n","authors":["Zhenzhi Wang","Jingbo Wang","Yixuan Li","Dahua Lin","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2311.15864v2.pdf","comment":"Generate human interactions with only single-person data via joint\n contact pairs, code https://github.com/zhenzhiwang/intercontrol"},{"id":"http://arxiv.org/abs/2403.18282v1","updated":"2024-03-27T06:18:40Z","published":"2024-03-27T06:18:40Z","title":"SGDM: Static-Guided Dynamic Module Make Stronger Visual Models","summary":" The spatial attention mechanism has been widely used to improve object\ndetection performance. However, its operation is currently limited to static\nconvolutions lacking content-adaptive features. This paper innovatively\napproaches the problem from the perspective of dynamic convolution. We propose\nRazor Dynamic Convolution (RDConv) to address the two flaws in dynamic weight\nconvolution that make it hard to implement in a spatial attention mechanism:\n1) it is computation-heavy; 2) when generating weights, spatial information is\ndisregarded. 
Firstly, by using Razor Operation to generate certain features, we\nvastly reduce the parameters of the entire dynamic convolution operation.\nSecondly, we added a spatial branch inside RDConv to generate convolutional\nkernel parameters with richer spatial information. Embedding dynamic\nconvolution will also bring the problem of sensitivity to high-frequency noise.\nWe propose the Static-Guided Dynamic Module (SGDM) to address this limitation.\nBy using SGDM, we utilize a set of asymmetric static convolution kernel\nparameters to guide the construction of dynamic convolution. We introduce the\nmechanism of shared weights in static convolution to solve the problem of\ndynamic convolution being sensitive to high-frequency noise. Extensive\nexperiments illustrate that multiple different object detection backbones\nequipped with SGDM achieve a highly competitive boost in performance(e.g., +4%\nmAP with YOLOv5n on VOC and +1.7% mAP with YOLOv8n on COCO) with negligible\nparameter increase(i.e., +0.33M on YOLOv5n and +0.19M on YOLOv8n).\n","authors":["Wenjie Xing","Zhenchao Cui","Jing Qi"],"pdf_url":"https://arxiv.org/pdf/2403.18282v1.pdf","comment":"16 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.18281v1","updated":"2024-03-27T06:17:21Z","published":"2024-03-27T06:17:21Z","title":"AIR-HLoc: Adaptive Image Retrieval for Efficient Visual Localisation","summary":" State-of-the-art (SOTA) hierarchical localisation pipelines (HLoc) rely on\nimage retrieval (IR) techniques to establish 2D-3D correspondences by selecting\nthe $k$ most similar images from a reference image database for a given query\nimage. Although higher values of $k$ enhance localisation robustness, the\ncomputational cost for feature matching increases linearly with $k$. In this\npaper, we observe that queries that are the most similar to images in the\ndatabase result in a higher proportion of feature matches and, thus, more\naccurate positioning. Thus, a small number of images is sufficient for queries\nvery similar to images in the reference database. We then propose a novel\napproach, AIR-HLoc, which divides query images into different localisation\ndifficulty levels based on their similarity to the reference image database. We\nconsider an image with high similarity to the reference image as an easy query\nand an image with low similarity as a hard query. Easy queries show a limited\nimprovement in accuracy when increasing $k$. Conversely, higher values of $k$\nsignificantly improve accuracy for hard queries. Given the limited improvement\nin accuracy when increasing $k$ for easy queries and the significant\nimprovement for hard queries, we adapt the value of $k$ to the query's\ndifficulty level. Therefore, AIR-HLoc optimizes processing time by adaptively\nassigning different values of $k$ based on the similarity between the query and\nreference images without losing accuracy. 
Our extensive experiments on the\nCambridge Landmarks, 7Scenes, and Aachen Day-Night-v1.1 datasets demonstrate\nour algorithm's efficacy, reducing 30\\%, 26\\%, and 11\\% in computational\noverhead while maintaining SOTA accuracy compared to HLoc with fixed image\nretrieval.\n","authors":["Changkun Liu","Huajian Huang","Zhengyang Ma","Tristan Braud"],"pdf_url":"https://arxiv.org/pdf/2403.18281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07711v2","updated":"2024-03-27T06:02:38Z","published":"2024-03-12T14:53:56Z","title":"SSM Meets Video Diffusion Models: Efficient Video Generation with\n Structured State Spaces","summary":" Given the remarkable achievements in image generation through diffusion\nmodels, the research community has shown increasing interest in extending these\nmodels to video generation. Recent diffusion models for video generation have\npredominantly utilized attention layers to extract temporal features. However,\nattention layers are limited by their memory consumption, which increases\nquadratically with the length of the sequence. This limitation presents\nsignificant challenges when attempting to generate longer video sequences using\ndiffusion models. To overcome this challenge, we propose leveraging state-space\nmodels (SSMs). SSMs have recently gained attention as viable alternatives due\nto their linear memory consumption relative to sequence length. In the\nexperiments, we first evaluate our SSM-based model with UCF101, a standard\nbenchmark of video generation. In addition, to investigate the potential of\nSSMs for longer video generation, we perform an experiment using the MineRL\nNavigate dataset, varying the number of frames to 64, 200, and 400. In these\nsettings, our SSM-based model can considerably save memory consumption for\nlonger sequences, while maintaining competitive FVD scores to the\nattention-based models. Our codes are available at\nhttps://github.com/shim0114/SSM-Meets-Video-Diffusion-Models.\n","authors":["Yuta Oshima","Shohei Taniguchi","Masahiro Suzuki","Yutaka Matsuo"],"pdf_url":"https://arxiv.org/pdf/2403.07711v2.pdf","comment":"Accepted as workshop paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2303.08231v3","updated":"2024-03-27T06:00:18Z","published":"2023-03-14T20:55:27Z","title":"Rotation-Invariant Transformer for Point Cloud Matching","summary":" The intrinsic rotation invariance lies at the core of matching point clouds\nwith handcrafted descriptors. However, it is widely despised by recent deep\nmatchers that obtain the rotation invariance extrinsically via data\naugmentation. As the finite number of augmented rotations can never span the\ncontinuous SO(3) space, these methods usually show instability when facing\nrotations that are rarely seen. To this end, we introduce RoITr, a\nRotation-Invariant Transformer to cope with the pose variations in the point\ncloud matching task. We contribute both on the local and global levels.\nStarting from the local level, we introduce an attention mechanism embedded\nwith Point Pair Feature (PPF)-based coordinates to describe the pose-invariant\ngeometry, upon which a novel attention-based encoder-decoder architecture is\nconstructed. We further propose a global transformer with rotation-invariant\ncross-frame spatial awareness learned by the self-attention mechanism, which\nsignificantly improves the feature distinctiveness and makes the model robust\nwith respect to the low overlap. 
Experiments are conducted on both the rigid\nand non-rigid public benchmarks, where RoITr outperforms all the\nstate-of-the-art models by a considerable margin in the low-overlapping\nscenarios. Especially when the rotations are enlarged on the challenging\n3DLoMatch benchmark, RoITr surpasses the existing methods by at least 13 and 5\npercentage points in terms of Inlier Ratio and Registration Recall,\nrespectively.\n","authors":["Hao Yu","Zheng Qin","Ji Hou","Mahdi Saleh","Dongsheng Li","Benjamin Busam","Slobodan Ilic"],"pdf_url":"https://arxiv.org/pdf/2303.08231v3.pdf","comment":"Accepted to CVPR 2023"},{"id":"http://arxiv.org/abs/2403.18274v1","updated":"2024-03-27T05:57:45Z","published":"2024-03-27T05:57:45Z","title":"DVLO: Deep Visual-LiDAR Odometry with Local-to-Global Feature Fusion and\n Bi-Directional Structure Alignment","summary":" Information in visual and LiDAR data is highly complementary, deriving from\nthe fine-grained texture of images and the rich geometric information in point\nclouds. However, it remains challenging to explore effective visual-LiDAR\nfusion, mainly due to the intrinsic data structure inconsistency between the two\nmodalities: images are regular and dense, but LiDAR points are unordered and\nsparse. To address the problem, we propose a local-to-global fusion network\nwith bi-directional structure alignment. To obtain locally fused features, we\nproject points onto the image plane as cluster centers and cluster image pixels\naround each center. Image pixels are pre-organized as pseudo points for\nimage-to-point structure alignment. Then, we convert points to pseudo images by\ncylindrical projection (point-to-image structure alignment) and perform\nadaptive global feature fusion between point features and locally fused\nfeatures. Our method achieves state-of-the-art performance on the KITTI odometry\nand FlyingThings3D scene flow datasets compared to both single-modal and\nmulti-modal methods. Codes will be released later.\n","authors":["Jiuming Liu","Dong Zhuo","Zhiheng Feng","Siting Zhu","Chensheng Peng","Zhe Liu","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18271v1","updated":"2024-03-27T05:55:16Z","published":"2024-03-27T05:55:16Z","title":"Unleashing the Potential of SAM for Medical Adaptation via Hierarchical\n Decoding","summary":" The Segment Anything Model (SAM) has garnered significant attention for its\nversatile segmentation abilities and intuitive prompt-based interface. However,\nits application in medical imaging presents challenges, requiring either\nsubstantial training costs and extensive medical datasets for full model\nfine-tuning or high-quality prompts for optimal performance. This paper\nintroduces H-SAM: a prompt-free adaptation of SAM tailored for efficient\nfine-tuning of medical images via a two-stage hierarchical decoding procedure.\nIn the initial stage, H-SAM employs SAM's original decoder to generate a prior\nprobabilistic mask, guiding a more intricate decoding process in the second\nstage. Specifically, we propose two key designs: 1) A class-balanced,\nmask-guided self-attention mechanism addressing the unbalanced label\ndistribution, enhancing image embedding; 2) A learnable mask cross-attention\nmechanism spatially modulating the interplay among different image regions\nbased on the prior mask. Moreover, the inclusion of a hierarchical pixel\ndecoder in H-SAM enhances its proficiency in capturing fine-grained and\nlocalized details. 
This approach enables SAM to effectively integrate learned\nmedical priors, facilitating enhanced adaptation for medical image segmentation\nwith limited samples. Our H-SAM demonstrates a 4.78% improvement in average\nDice compared to existing prompt-free SAM variants for multi-organ segmentation\nusing only 10% of 2D slices. Notably, without using any unlabeled data, H-SAM\neven outperforms state-of-the-art semi-supervised models relying on extensive\nunlabeled training data across various medical datasets. Our code is available\nat https://github.com/Cccccczh404/H-SAM.\n","authors":["Zhiheng Cheng","Qingyue Wei","Hongru Zhu","Yan Wang","Liangqiong Qu","Wei Shao","Yuyin Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.18271v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18270v1","updated":"2024-03-27T05:52:39Z","published":"2024-03-27T05:52:39Z","title":"Image Deraining via Self-supervised Reinforcement Learning","summary":" The quality of images captured outdoors is often affected by the weather. One\nfactor that interferes with sight is rain, which can obstruct the view of\nobservers and computer vision applications that rely on those images. The work\naims to recover rain images by removing rain streaks via Self-supervised\nReinforcement Learning (RL) for image deraining (SRL-Derain). We locate rain\nstreak pixels from the input rain image via dictionary learning and use\npixel-wise RL agents to take multiple inpainting actions to remove rain\nprogressively. To our knowledge, this work is the first attempt where\nself-supervised RL is applied to image deraining. Experimental results on\nseveral benchmark image-deraining datasets show that the proposed SRL-Derain\nperforms favorably against state-of-the-art few-shot and self-supervised\nderaining and denoising methods.\n","authors":["He-Hao Liao","Yan-Tsung Peng","Wen-Tao Chu","Ping-Chun Hsieh","Chung-Chi Tsai"],"pdf_url":"https://arxiv.org/pdf/2403.18270v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18266v1","updated":"2024-03-27T05:38:48Z","published":"2024-03-27T05:38:48Z","title":"Branch-Tuning: Balancing Stability and Plasticity for Continual\n Self-Supervised Learning","summary":" Self-supervised learning (SSL) has emerged as an effective paradigm for\nderiving general representations from vast amounts of unlabeled data. However,\nas real-world applications continually integrate new content, the high\ncomputational and resource demands of SSL necessitate continual learning rather\nthan complete retraining. This poses a challenge in striking a balance between\nstability and plasticity when adapting to new information. In this paper, we\nemploy Centered Kernel Alignment for quantitatively analyzing model stability\nand plasticity, revealing the critical roles of batch normalization layers for\nstability and convolutional layers for plasticity. Motivated by this, we\npropose Branch-tuning, an efficient and straightforward method that achieves a\nbalance between stability and plasticity in continual SSL. Branch-tuning\nconsists of branch expansion and compression, and can be easily applied to\nvarious SSL methods without the need of modifying the original methods,\nretaining old data or models. We validate our method through incremental\nexperiments on various benchmark datasets, demonstrating its effectiveness and\npractical value in real-world scenarios. We hope our work offers new insights\nfor future continual self-supervised learning research. 
The code will be made\npublicly available.\n","authors":["Wenzhuo Liu","Fei Zhu","Cheng-Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2403.18266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03532v2","updated":"2024-03-27T05:28:55Z","published":"2024-03-06T08:18:02Z","title":"Extend Your Own Correspondences: Unsupervised Distant Point Cloud\n Registration by Progressive Distance Extension","summary":" Registration of point clouds collected from a pair of distant vehicles\nprovides a comprehensive and accurate 3D view of the driving scenario, which is\nvital for driving safety related applications, yet existing literature suffers\nfrom the expensive pose label acquisition and the deficiency to generalize to\nnew data distributions. In this paper, we propose EYOC, an unsupervised distant\npoint cloud registration method that adapts to new point cloud distributions on\nthe fly, requiring no global pose labels. The core idea of EYOC is to train a\nfeature extractor in a progressive fashion, where in each round, the feature\nextractor, trained with near point cloud pairs, can label slightly farther\npoint cloud pairs, enabling self-supervision on such far point cloud pairs.\nThis process continues until the derived extractor can be used to register\ndistant point clouds. Particularly, to enable high-fidelity correspondence\nlabel generation, we devise an effective spatial filtering scheme to select the\nmost representative correspondences to register a point cloud pair, and then\nutilize the aligned point clouds to discover more correct correspondences.\nExperiments show that EYOC can achieve comparable performance with\nstate-of-the-art supervised methods at a lower training cost. Moreover, it\noutwits supervised methods regarding generalization performance on new data\ndistributions.\n","authors":["Quan Liu","Hongzi Zhu","Zhenxi Wang","Yunsong Zhou","Shan Chang","Minyi Guo"],"pdf_url":"https://arxiv.org/pdf/2403.03532v2.pdf","comment":"In Proceedings of the IEEE/CVF Conference on Computer Vision and\n Pattern Recognition (CVPR), 2024"},{"id":"http://arxiv.org/abs/2402.02561v2","updated":"2024-03-27T05:23:40Z","published":"2024-02-04T16:27:37Z","title":"Foundation Model Makes Clustering A Better Initialization For Cold-Start\n Active Learning","summary":" Active learning selects the most informative samples from the unlabelled\ndataset to annotate in the context of a limited annotation budget. While\nnumerous methods have been proposed for subsequent sample selection based on an\ninitialized model, scant attention has been paid to the indispensable phase of\nactive learning: selecting samples for model cold-start initialization. Most of\nthe previous studies resort to random sampling or naive clustering. However,\nrandom sampling is prone to fluctuation, and naive clustering suffers from\nconvergence speed, particularly when dealing with high-dimensional data such as\nimaging data. In this work, we propose to integrate foundation models with\nclustering methods to select samples for cold-start active learning\ninitialization. Foundation models refer to those trained on massive datasets by\nthe self-supervised paradigm and capable of generating informative and\ncompacted embeddings for various downstream tasks. Leveraging these embeddings\nto replace raw features such as pixel values, clustering quickly converges and\nidentifies better initial samples. For a comprehensive comparison, we included\na classic ImageNet-supervised model to acquire embeddings. 
Experiments on two\nclinical tasks of image classification and segmentation demonstrated that\nfoundation model-based clustering efficiently pinpointed informative initial\nsamples, leading to models showcasing enhanced performance than the baseline\nmethods. We envisage that this study provides an effective paradigm for future\ncold-start active learning.\n","authors":["Han Yuan","Chuan Hong"],"pdf_url":"https://arxiv.org/pdf/2402.02561v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17456v3","updated":"2024-03-27T05:22:18Z","published":"2023-11-29T08:56:24Z","title":"DifFlow3D: Toward Robust Uncertainty-Aware Scene Flow Estimation with\n Iterative Diffusion-Based Refinement","summary":" Scene flow estimation, which aims to predict per-point 3D displacements of\ndynamic scenes, is a fundamental task in the computer vision field. However,\nprevious works commonly suffer from unreliable correlation caused by locally\nconstrained searching ranges, and struggle with accumulated inaccuracy arising\nfrom the coarse-to-fine structure. To alleviate these problems, we propose a\nnovel uncertainty-aware scene flow estimation network (DifFlow3D) with the\ndiffusion probabilistic model. Iterative diffusion-based refinement is designed\nto enhance the correlation robustness and resilience to challenging cases, e.g.\ndynamics, noisy inputs, repetitive patterns, etc. To restrain the generation\ndiversity, three key flow-related features are leveraged as conditions in our\ndiffusion model. Furthermore, we also develop an uncertainty estimation module\nwithin diffusion to evaluate the reliability of estimated scene flow. Our\nDifFlow3D achieves state-of-the-art performance, with 24.0% and 29.1% EPE3D\nreduction respectively on FlyingThings3D and KITTI 2015 datasets. Notably, our\nmethod achieves an unprecedented millimeter-level accuracy (0.0078m in EPE3D)\non the KITTI dataset. Additionally, our diffusion-based refinement paradigm can\nbe readily integrated as a plug-and-play module into existing scene flow\nnetworks, significantly increasing their estimation accuracy. Codes are\nreleased at https://github.com/IRMVLab/DifFlow3D.\n","authors":["Jiuming Liu","Guangming Wang","Weicai Ye","Chaokang Jiang","Jinru Han","Zhe Liu","Guofeng Zhang","Dalong Du","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17456v3.pdf","comment":"Camera-ready version of CVPR 2024. Codes are released at\n https://github.com/IRMVLab/DifFlow3D"},{"id":"http://arxiv.org/abs/2403.18260v1","updated":"2024-03-27T05:22:06Z","published":"2024-03-27T05:22:06Z","title":"Toward Interactive Regional Understanding in Vision-Large Language\n Models","summary":" Recent Vision-Language Pre-training (VLP) models have demonstrated\nsignificant advancements. Nevertheless, these models heavily rely on image-text\npairs that capture only coarse and global information of an image, leading to a\nlimitation in their regional understanding ability. In this work, we introduce\n\\textbf{RegionVLM}, equipped with explicit regional modeling capabilities,\nallowing them to understand user-indicated image regions. To achieve this, we\ndesign a simple yet innovative architecture, requiring no modifications to the\nmodel architecture or objective function. Additionally, we leverage a dataset\nthat contains a novel source of information, namely Localized Narratives, which\nhas been overlooked in previous VLP research. 
Our experiments demonstrate that\nour single generalist model not only achieves an interactive dialogue system\nbut also exhibits superior performance on various zero-shot region\nunderstanding tasks, without compromising its ability for global image\nunderstanding.\n","authors":["Jungbeom Lee","Sanghyuk Chun","Sangdoo Yun"],"pdf_url":"https://arxiv.org/pdf/2403.18260v1.pdf","comment":"NAACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2209.02200v3","updated":"2024-03-27T05:16:02Z","published":"2022-09-06T03:42:18Z","title":"Task-wise Sampling Convolutions for Arbitrary-Oriented Object Detection\n in Aerial Images","summary":" Arbitrary-oriented object detection (AOOD) has been widely applied to locate\nand classify objects with diverse orientations in remote sensing images.\nHowever, the inconsistent features for the localization and classification\ntasks in AOOD models may lead to ambiguity and low-quality object predictions,\nwhich constrains the detection performance. In this article, an AOOD method\ncalled task-wise sampling convolutions (TS-Conv) is proposed. TS-Conv\nadaptively samples task-wise features from respective sensitive regions and\nmaps these features together in alignment to guide a dynamic label assignment\nfor better predictions. Specifically, sampling positions of the localization\nconvolution in TS-Conv are supervised by the oriented bounding box (OBB)\nprediction associated with spatial coordinates, while sampling positions and\nconvolutional kernel of the classification convolution are designed to be\nadaptively adjusted according to different orientations for improving the\norientation robustness of features. Furthermore, a dynamic\ntask-consistent-aware label assignment (DTLA) strategy is developed to select\noptimal candidate positions and assign labels dynamically according to ranked\ntask-aware scores obtained from TS-Conv. Extensive experiments on several\npublic datasets covering multiple scenes, multimodal images, and multiple\ncategories of objects demonstrate the effectiveness, scalability, and superior\nperformance of the proposed TS-Conv.\n","authors":["Zhanchao Huang","Wei Li","Xiang-Gen Xia","Hao Wang","Ran Tao"],"pdf_url":"https://arxiv.org/pdf/2209.02200v3.pdf","comment":"15 pages, 13 figures, 11 tables"},{"id":"http://arxiv.org/abs/2403.07359v4","updated":"2024-03-27T05:14:09Z","published":"2024-03-12T06:45:34Z","title":"FSC: Few-point Shape Completion","summary":" While previous studies have demonstrated successful 3D object shape\ncompletion with a sufficient number of points, they often fail in scenarios\nwhen a few points, e.g. tens of points, are observed. Surprisingly, via entropy\nanalysis, we find that even a few points, e.g. 64 points, could retain\nsubstantial information to help recover the 3D shape of the object. To address\nthe challenge of shape completion with very sparse point clouds, we then\npropose Few-point Shape Completion (FSC) model, which contains a novel\ndual-branch feature extractor for handling extremely sparse inputs, coupled\nwith an extensive branch for maximal point utilization with a saliency branch\nfor dynamic importance assignment. This model is further bolstered by a\ntwo-stage revision network that refines both the extracted features and the\ndecoder output, enhancing the detail and authenticity of the completed point\ncloud. Our experiments demonstrate the feasibility of recovering 3D shapes from\na few points. 
The proposed Few-point Shape Completion (FSC) model outperforms\nprevious methods on both few-point inputs and many-point inputs, and shows good\ngeneralizability to different object categories.\n","authors":["Xianzu Wu","Xianfeng Wu","Tianyu Luan","Yajing Bai","Zhongyuan Lai","Junsong Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.07359v4.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18258v1","updated":"2024-03-27T05:10:38Z","published":"2024-03-27T05:10:38Z","title":"Enhancing Generative Class Incremental Learning Performance with Model\n Forgetting Approach","summary":" This study presents a novel approach to Generative Class Incremental Learning\n(GCIL) by introducing the forgetting mechanism, aimed at dynamically managing\nclass information for better adaptation to streaming data. GCIL is one of the\nhot topics in the field of computer vision, and this is considered one of the\ncrucial tasks in society, specifically the continual learning of generative\nmodels. The ability to forget is a crucial brain function that facilitates\ncontinual learning by selectively discarding less relevant information for\nhumans. However, in the field of machine learning models, the concept of\nintentionally forgetting has not been extensively investigated. In this study\nwe aim to bridge this gap by incorporating the forgetting mechanisms into GCIL,\nthereby examining their impact on the models' ability to learn in continual\nlearning. Through our experiments, we have found that integrating the\nforgetting mechanisms significantly enhances the models' performance in\nacquiring new knowledge, underscoring the positive role that strategic\nforgetting plays in the process of continual learning.\n","authors":["Taro Togo","Ren Togo","Keisuke Maeda","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2403.18258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18252v1","updated":"2024-03-27T04:49:23Z","published":"2024-03-27T04:49:23Z","title":"Beyond Embeddings: The Promise of Visual Table in Multi-Modal Models","summary":" Visual representation learning has been a cornerstone in computer vision,\nevolving from supervised learning with human-annotated labels to aligning\nimage-text pairs from the Internet. Despite recent advancements in multi-modal\nlarge language models (MLLMs), the visual representations they rely on, such as\nCLIP embeddings, often lack access to external world knowledge critical for\nreal-world visual reasoning. In this work, we propose Visual Table, a novel\nvisual representation tailored for MLLMs. It provides hierarchical text\ndescriptions of holistic visual scenes, consisting of a scene description and\nmultiple object-centric descriptions that encompass categories, attributes, and\nknowledge at instance level. We further develop a scalable generator for visual\ntable generation and train it on small-scale annotations from GPT4V. Extensive\nevaluations demonstrate that, with generated visual tables as additional visual\nrepresentations, our model can consistently outperform the state-of-the-art\n(SOTA) MLLMs across diverse benchmarks. When visual tables serve as standalone\nvisual representations, our model can closely match or even beat the SOTA MLLMs\nthat are built on CLIP visual embeddings. Our code is available at\nhttps://github.com/LaVi-Lab/Visual-Table.\n","authors":["Yiwu Zhong","Zi-Yuan Hu","Michael R. 
Lyu","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18252v1.pdf","comment":"Project page: https://github.com/LaVi-Lab/Visual-Table"},{"id":"http://arxiv.org/abs/2403.18241v1","updated":"2024-03-27T04:09:34Z","published":"2024-03-27T04:09:34Z","title":"NeuSDFusion: A Spatial-Aware Generative Model for 3D Shape Completion,\n Reconstruction, and Generation","summary":" 3D shape generation aims to produce innovative 3D content adhering to\nspecific conditions and constraints. Existing methods often decompose 3D shapes\ninto a sequence of localized components, treating each element in isolation\nwithout considering spatial consistency. As a result, these approaches exhibit\nlimited versatility in 3D data representation and shape generation, hindering\ntheir ability to generate highly diverse 3D shapes that comply with the\nspecified constraints. In this paper, we introduce a novel spatial-aware 3D\nshape generation framework that leverages 2D plane representations for enhanced\n3D shape modeling. To ensure spatial coherence and reduce memory usage, we\nincorporate a hybrid shape representation technique that directly learns a\ncontinuous signed distance field representation of the 3D shape using\northogonal 2D planes. Additionally, we meticulously enforce spatial\ncorrespondences across distinct planes using a transformer-based autoencoder\nstructure, promoting the preservation of spatial relationships in the generated\n3D shapes. This yields an algorithm that consistently outperforms\nstate-of-the-art 3D shape generation methods on various tasks, including\nunconditional shape generation, multi-modal shape completion, single-view\nreconstruction, and text-to-shape synthesis.\n","authors":["Ruikai Cui","Weizhe Liu","Weixuan Sun","Senbo Wang","Taizhang Shang","Yang Li","Xibin Song","Han Yan","Zhennan Wu","Shenzhou Chen","Hongdong Li","Pan Ji"],"pdf_url":"https://arxiv.org/pdf/2403.18241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00374v4","updated":"2024-03-27T04:06:36Z","published":"2023-12-31T02:25:41Z","title":"EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via\n Expressive Masked Audio Gesture Modeling","summary":" We propose EMAGE, a framework to generate full-body human gestures from audio\nand masked gestures, encompassing facial, local body, hands, and global\nmovements. To achieve this, we first introduce BEAT2 (BEAT-SMPLX-FLAME), a new\nmesh-level holistic co-speech dataset. BEAT2 combines MoShed SMPLX body with\nFLAME head parameters and further refines the modeling of head, neck, and\nfinger movements, offering a community-standardized, high-quality 3D motion\ncaptured dataset. EMAGE leverages masked body gesture priors during training to\nboost inference performance. It involves a Masked Audio Gesture Transformer,\nfacilitating joint training on audio-to-gesture generation and masked gesture\nreconstruction to effectively encode audio and body gesture hints. Encoded body\nhints from masked gestures are then separately employed to generate facial and\nbody movements. Moreover, EMAGE adaptively merges speech features from the\naudio's rhythm and content and utilizes four compositional VQ-VAEs to enhance\nthe results' fidelity and diversity. Experiments demonstrate that EMAGE\ngenerates holistic gestures with state-of-the-art performance and is flexible\nin accepting predefined spatial-temporal gesture inputs, generating complete,\naudio-synchronized results. 
Our code and dataset are available at\nhttps://pantomatrix.github.io/EMAGE/\n","authors":["Haiyang Liu","Zihao Zhu","Giorgio Becherini","Yichen Peng","Mingyang Su","You Zhou","Xuefei Zhe","Naoya Iwamoto","Bo Zheng","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2401.00374v4.pdf","comment":"Conflict of Interest Disclosure; CVPR Camera Ready; Project Page:\n https://pantomatrix.github.io/EMAGE/"},{"id":"http://arxiv.org/abs/2403.18238v1","updated":"2024-03-27T04:03:55Z","published":"2024-03-27T04:03:55Z","title":"TAFormer: A Unified Target-Aware Transformer for Video and Motion Joint\n Prediction in Aerial Scenes","summary":" As drone technology advances, using unmanned aerial vehicles for aerial\nsurveys has become the dominant trend in modern low-altitude remote sensing.\nThe surge in aerial video data necessitates accurate prediction for future\nscenarios and motion states of the interested target, particularly in\napplications like traffic management and disaster response. Existing video\nprediction methods focus solely on predicting future scenes (video frames),\nsuffering from the neglect of explicitly modeling target's motion states, which\nis crucial for aerial video interpretation. To address this issue, we introduce\na novel task called Target-Aware Aerial Video Prediction, aiming to\nsimultaneously predict future scenes and motion states of the target. Further,\nwe design a model specifically for this task, named TAFormer, which provides a\nunified modeling approach for both video and target motion states.\nSpecifically, we introduce Spatiotemporal Attention (STA), which decouples the\nlearning of video dynamics into spatial static attention and temporal dynamic\nattention, effectively modeling the scene appearance and motion. Additionally,\nwe design an Information Sharing Mechanism (ISM), which elegantly unifies the\nmodeling of video and target motion by facilitating information interaction\nthrough two sets of messenger tokens. Moreover, to alleviate the difficulty of\ndistinguishing targets in blurry predictions, we introduce Target-Sensitive\nGaussian Loss (TSGL), enhancing the model's sensitivity to both target's\nposition and content. Extensive experiments on UAV123VP and VisDroneVP (derived\nfrom single-object tracking datasets) demonstrate the exceptional performance\nof TAFormer in target-aware video prediction, showcasing its adaptability to\nthe additional requirements of aerial video interpretation for target\nawareness.\n","authors":["Liangyu Xu","Wanxuan Lu","Hongfeng Yu","Yongqiang Mao","Hanbo Bi","Chenglong Liu","Xian Sun","Kun Fu"],"pdf_url":"https://arxiv.org/pdf/2403.18238v1.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2311.08100v3","updated":"2024-03-27T04:00:07Z","published":"2023-11-14T11:53:24Z","title":"PPAD: Iterative Interactions of Prediction and Planning for End-to-end\n Autonomous Driving","summary":" We present a new interaction mechanism of prediction and planning for\nend-to-end autonomous driving, called PPAD (Iterative Interaction of Prediction\nand Planning Autonomous Driving), which considers the timestep-wise interaction\nto better integrate prediction and planning. An ego vehicle performs motion\nplanning at each timestep based on the trajectory prediction of surrounding\nagents (e.g., vehicles and pedestrians) and its local road conditions. 
Unlike\nexisting end-to-end autonomous driving frameworks, PPAD models the interactions\namong ego, agents, and the dynamic environment in an autoregressive manner by\ninterleaving the Prediction and Planning processes at every timestep, instead\nof a single sequential process of prediction followed by planning.\nSpecifically, we design ego-to-agent, ego-to-map, and ego-to-BEV interaction\nmechanisms with hierarchical dynamic key objects attention to better model the\ninteractions. The experiments on the nuScenes benchmark show that our approach\noutperforms state-of-the-art methods.\n","authors":["Zhili Chen","Maosheng Ye","Shuangjie Xu","Tongyi Cao","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2311.08100v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01616v2","updated":"2024-03-27T03:56:35Z","published":"2023-12-04T04:14:09Z","title":"SchurVINS: Schur Complement-Based Lightweight Visual Inertial Navigation\n System","summary":" Accuracy and computational efficiency are the most important metrics to\nVisual Inertial Navigation System (VINS). The existing VINS algorithms with\neither high accuracy or low computational complexity, are difficult to provide\nthe high precision localization in resource-constrained devices. To this end,\nwe propose a novel filter-based VINS framework named SchurVINS, which could\nguarantee both high accuracy by building a complete residual model and low\ncomputational complexity with Schur complement. Technically, we first formulate\nthe full residual model where Gradient, Hessian and observation covariance are\nexplicitly modeled. Then Schur complement is employed to decompose the full\nmodel into ego-motion residual model and landmark residual model. Finally,\nExtended Kalman Filter (EKF) update is implemented in these two models with\nhigh efficiency. Experiments on EuRoC and TUM-VI datasets show that our method\nnotably outperforms state-of-the-art (SOTA) methods in both accuracy and\ncomputational complexity. The experimental code of SchurVINS is available at\nhttps://github.com/bytedance/SchurVINS.\n","authors":["Yunfei Fan","Tianyu Zhao","Guidong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01616v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08106v3","updated":"2024-03-27T03:55:39Z","published":"2023-10-12T08:01:11Z","title":"Generalized Logit Adjustment: Calibrating Fine-tuned Models by Removing\n Label Bias in Foundation Models","summary":" Foundation models like CLIP allow zero-shot transfer on various tasks without\nadditional training data. Yet, the zero-shot performance is less competitive\nthan a fully supervised one. Thus, to enhance the performance, fine-tuning and\nensembling are also commonly adopted to better fit the downstream tasks.\nHowever, we argue that such prior work has overlooked the inherent biases in\nfoundation models. Due to the highly imbalanced Web-scale training set, these\nfoundation models are inevitably skewed toward frequent semantics, and thus the\nsubsequent fine-tuning or ensembling is still biased. In this study, we\nsystematically examine the biases in foundation models and demonstrate the\nefficacy of our proposed Generalized Logit Adjustment (GLA) method. Note that\nbias estimation in foundation models is challenging, as most pre-train data\ncannot be explicitly accessed like in traditional long-tailed classification\ntasks. To this end, GLA has an optimization-based bias estimation approach for\ndebiasing foundation models. 
As our work resolves a fundamental flaw in the\npre-training, the proposed GLA demonstrates significant improvements across a\ndiverse range of tasks: it achieves 1.5 pp accuracy gains on ImageNet, an large\naverage improvement (1.4-4.6 pp) on 11 few-shot datasets, 2.4 pp gains on\nlong-tailed classification. Codes are in \\url{https://github.com/BeierZhu/GLA}.\n","authors":["Beier Zhu","Kaihua Tang","Qianru Sun","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08106v3.pdf","comment":"V2 proposed a more effective method for label distribution\n estimation. V1 fixed a typo in abstract; Accepted by NeurIPS2023"},{"id":"http://arxiv.org/abs/2206.08657v6","updated":"2024-03-27T03:53:23Z","published":"2022-06-17T09:42:35Z","title":"BridgeTower: Building Bridges Between Encoders in Vision-Language\n Representation Learning","summary":" Vision-Language (VL) models with the Two-Tower architecture have dominated\nvisual-language representation learning in recent years. Current VL models\neither use lightweight uni-modal encoders and learn to extract, align and fuse\nboth modalities simultaneously in a deep cross-modal encoder, or feed the\nlast-layer uni-modal representations from the deep pre-trained uni-modal\nencoders into the top cross-modal encoder. Both approaches potentially restrict\nvision-language representation learning and limit model performance. In this\npaper, we propose BridgeTower, which introduces multiple bridge layers that\nbuild a connection between the top layers of uni-modal encoders and each layer\nof the cross-modal encoder. This enables effective bottom-up cross-modal\nalignment and fusion between visual and textual representations of different\nsemantic levels of pre-trained uni-modal encoders in the cross-modal encoder.\nPre-trained with only 4M images, BridgeTower achieves state-of-the-art\nperformance on various downstream vision-language tasks. In particular, on the\nVQAv2 test-std set, BridgeTower achieves an accuracy of 78.73%, outperforming\nthe previous state-of-the-art model METER by 1.09% with the same pre-training\ndata and almost negligible additional parameters and computational costs.\nNotably, when further scaling the model, BridgeTower achieves an accuracy of\n81.15%, surpassing models that are pre-trained on orders-of-magnitude larger\ndatasets. Code and checkpoints are available at\nhttps://github.com/microsoft/BridgeTower.\n","authors":["Xiao Xu","Chenfei Wu","Shachar Rosenman","Vasudev Lal","Wanxiang Che","Nan Duan"],"pdf_url":"https://arxiv.org/pdf/2206.08657v6.pdf","comment":"Accepted by AAAI 2023, Oral"},{"id":"http://arxiv.org/abs/2403.04125v2","updated":"2024-03-27T03:53:14Z","published":"2024-03-07T00:44:21Z","title":"Scalable and Robust Transformer Decoders for Interpretable Image\n Classification with Foundation Models","summary":" Interpretable computer vision models can produce transparent predictions,\nwhere the features of an image are compared with prototypes from a training\ndataset and the similarity between them forms a basis for classification.\nNevertheless these methods are computationally expensive to train, introduce\nadditional complexity and may require domain knowledge to adapt\nhyper-parameters to a new dataset. Inspired by developments in object\ndetection, segmentation and large-scale self-supervised foundation vision\nmodels, we introduce Component Features (ComFe), a novel explainable-by-design\nimage classification approach using a transformer-decoder head and hierarchical\nmixture-modelling. 
With only global image labels and no segmentation or part\nannotations, ComFe can identify consistent image components, such as the head,\nbody, wings and tail of a bird, and the image background, and determine which\nof these features are informative in making a prediction. We demonstrate that\nComFe obtains higher accuracy compared to previous interpretable models across\na range of fine-grained vision benchmarks, without the need to individually\ntune hyper-parameters for each dataset. We also show that ComFe outperforms a\nnon-interpretable linear head across a range of datasets, including ImageNet,\nand improves performance on generalisation and robustness benchmarks.\n","authors":["Evelyn Mannix","Howard Bondell"],"pdf_url":"https://arxiv.org/pdf/2403.04125v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11104v4","updated":"2024-03-27T03:47:20Z","published":"2023-01-26T13:58:46Z","title":"Discovering and Mitigating Visual Biases through Keyword Explanation","summary":" Addressing biases in computer vision models is crucial for real-world AI\ndeployments. However, mitigating visual biases is challenging due to their\nunexplainable nature, often identified indirectly through visualization or\nsample statistics, which necessitates additional human supervision for\ninterpretation. To tackle this issue, we propose the Bias-to-Text (B2T)\nframework, which interprets visual biases as keywords. Specifically, we extract\ncommon keywords from the captions of mispredicted images to identify potential\nbiases in the model. We then validate these keywords by measuring their\nsimilarity to the mispredicted images using a vision-language scoring model.\nThe keyword explanation form of visual bias offers several advantages, such as\na clear group naming for bias discovery and a natural extension for debiasing\nusing these group names. Our experiments demonstrate that B2T can identify\nknown biases, such as gender bias in CelebA, background bias in Waterbirds, and\ndistribution shifts in ImageNet-R/C. Additionally, B2T uncovers novel biases in\nlarger datasets, such as Dollar Street and ImageNet. For example, we discovered\na contextual bias between \"bee\" and \"flower\" in ImageNet. We also highlight\nvarious applications of B2T keywords, including debiased training, CLIP\nprompting, and model comparison.\n","authors":["Younghyun Kim","Sangwoo Mo","Minkyu Kim","Kyungmin Lee","Jaeho Lee","Jinwoo Shin"],"pdf_url":"https://arxiv.org/pdf/2301.11104v4.pdf","comment":"CVPR 2024. First two authors contributed equally"},{"id":"http://arxiv.org/abs/2403.18233v1","updated":"2024-03-27T03:39:57Z","published":"2024-03-27T03:39:57Z","title":"Benchmarking Image Transformers for Prostate Cancer Detection from\n Ultrasound Data","summary":" PURPOSE: Deep learning methods for classifying prostate cancer (PCa) in\nultrasound images typically employ convolutional networks (CNNs) to detect\ncancer in small regions of interest (ROI) along a needle trace region. However,\nthis approach suffers from weak labelling, since the ground-truth\nhistopathology labels do not describe the properties of individual ROIs.\nRecently, multi-scale approaches have sought to mitigate this issue by\ncombining the context awareness of transformers with a CNN feature extractor to\ndetect cancer from multiple ROIs using multiple-instance learning (MIL). 
In\nthis work, we present a detailed study of several image transformer\narchitectures for both ROI-scale and multi-scale classification, and a\ncomparison of the performance of CNNs and transformers for ultrasound-based\nprostate cancer classification. We also design a novel multi-objective learning\nstrategy that combines both ROI and core predictions to further mitigate label\nnoise. METHODS: We evaluate 3 image transformers on ROI-scale cancer\nclassification, then use the strongest model to tune a multi-scale classifier\nwith MIL. We train our MIL models using our novel multi-objective learning\nstrategy and compare our results to existing baselines. RESULTS: We find that\nfor both ROI-scale and multi-scale PCa detection, image transformer backbones\nlag behind their CNN counterparts. This deficit in performance is even more\nnoticeable for larger models. When using multi-objective learning, we can\nimprove performance of MIL, with a 77.9% AUROC, a sensitivity of 75.9%, and a\nspecificity of 66.3%. CONCLUSION: Convolutional networks are better suited for\nmodelling sparse datasets of prostate ultrasounds, producing more robust\nfeatures than transformers in PCa detection. Multi-scale methods remain the\nbest architecture for this task, with multi-objective learning presenting an\neffective way to improve performance.\n","authors":["Mohamed Harmanani","Paul F. R. Wilson","Fahimeh Fooladgar","Amoon Jamzad","Mahdi Gilany","Minh Nguyen Nhat To","Brian Wodlinger","Purang Abolmaesumi","Parvin Mousavi"],"pdf_url":"https://arxiv.org/pdf/2403.18233v1.pdf","comment":"early draft, 7 pages; Accepted to SPIE Medical Imaging 2024"},{"id":"http://arxiv.org/abs/2403.02649v2","updated":"2024-03-27T03:34:00Z","published":"2024-03-05T04:38:13Z","title":"Few-shot Learner Parameterization by Diffusion Time-steps","summary":" Even when using large multi-modal foundation models, few-shot learning is\nstill challenging -- if there is no proper inductive bias, it is nearly\nimpossible to keep the nuanced class attributes while removing the visually\nprominent attributes that spuriously correlate with class labels. To this end,\nwe find an inductive bias that the time-steps of a Diffusion Model (DM) can\nisolate the nuanced class attributes, i.e., as the forward diffusion adds noise\nto an image at each time-step, nuanced attributes are usually lost at an\nearlier time-step than the spurious attributes that are visually prominent.\nBuilding on this, we propose Time-step Few-shot (TiF) learner. We train\nclass-specific low-rank adapters for a text-conditioned DM to make up for the\nlost attributes, such that images can be accurately reconstructed from their\nnoisy ones given a prompt. Hence, at a small time-step, the adapter and prompt\nare essentially a parameterization of only the nuanced class attributes. For a\ntest image, we can use the parameterization to only extract the nuanced class\nattributes for classification. TiF learner significantly outperforms OpenCLIP\nand its adapters on a variety of fine-grained and customized few-shot learning\ntasks. 
Codes are in https://github.com/yue-zhongqi/tif.\n","authors":["Zhongqi Yue","Pan Zhou","Richang Hong","Hanwang Zhang","Qianru Sun"],"pdf_url":"https://arxiv.org/pdf/2403.02649v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18228v1","updated":"2024-03-27T03:31:16Z","published":"2024-03-27T03:31:16Z","title":"Fourier or Wavelet bases as counterpart self-attention in spikformer for\n efficient visual classification","summary":" Energy-efficient spikformer has been proposed by integrating the biologically\nplausible spiking neural network (SNN) and artificial Transformer, whereby the\nSpiking Self-Attention (SSA) is used to achieve both higher accuracy and lower\ncomputational cost. However, it seems that self-attention is not always\nnecessary, especially in sparse spike-form calculation manners. In this paper,\nwe innovatively replace vanilla SSA (using dynamic bases calculating from Query\nand Key) with spike-form Fourier Transform, Wavelet Transform, and their\ncombinations (using fixed triangular or wavelets bases), based on a key\nhypothesis that both of them use a set of basis functions for information\ntransformation. Hence, the Fourier-or-Wavelet-based spikformer (FWformer) is\nproposed and verified in visual classification tasks, including both static\nimage and event-based video datasets. The FWformer can achieve comparable or\neven higher accuracies ($0.4\\%$-$1.5\\%$), higher running speed ($9\\%$-$51\\%$\nfor training and $19\\%$-$70\\%$ for inference), reduced theoretical energy\nconsumption ($20\\%$-$25\\%$), and reduced GPU memory usage ($4\\%$-$26\\%$),\ncompared to the standard spikformer. Our result indicates the continuous\nrefinement of new Transformers, that are inspired either by biological\ndiscovery (spike-form), or information theory (Fourier or Wavelet Transform),\nis promising.\n","authors":["Qingyu Wang","Duzhen Zhang","Tilelin Zhang","Bo Xu"],"pdf_url":"https://arxiv.org/pdf/2403.18228v1.pdf","comment":"18 pages, 2 figures. arXiv admin note: substantial text overlap with\n arXiv:2308.02557"},{"id":"http://arxiv.org/abs/2304.14394v3","updated":"2024-03-27T03:23:12Z","published":"2023-04-27T17:56:29Z","title":"Unified Sequence-to-Sequence Learning for Single- and Multi-Modal Visual\n Object Tracking","summary":" In this paper, we introduce a new sequence-to-sequence learning framework for\nRGB-based and multi-modal object tracking. First, we present SeqTrack for\nRGB-based tracking. It casts visual tracking as a sequence generation task,\nforecasting object bounding boxes in an autoregressive manner. This differs\nfrom previous trackers, which depend on the design of intricate head networks,\nsuch as classification and regression heads. SeqTrack employs a basic\nencoder-decoder transformer architecture. The encoder utilizes a bidirectional\ntransformer for feature extraction, while the decoder generates bounding box\nsequences autoregressively using a causal transformer. The loss function is a\nplain cross-entropy. Second, we introduce SeqTrackv2, a unified\nsequence-to-sequence framework for multi-modal tracking tasks. Expanding upon\nSeqTrack, SeqTrackv2 integrates a unified interface for auxiliary modalities\nand a set of task-prompt tokens to specify the task. This enables it to manage\nmulti-modal tracking tasks using a unified model and parameter set. 
This\nsequence learning paradigm not only simplifies the tracking framework, but also\nshowcases superior performance across 14 challenging benchmarks spanning five\nsingle- and multi-modal tracking tasks. The code and models are available at\nhttps://github.com/chenxin-dlut/SeqTrackv2.\n","authors":["Xin Chen","Ben Kang","Jiawen Zhu","Dong Wang","Houwen Peng","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2304.14394v3.pdf","comment":"This is a new expanded version of our previous CVPR2023 paper\n \"SeqTrack: Sequence to Sequence Learning for Visual Object Tracking.\"\n SeqTrackv2 extends SeqTrack to four multi-modal tracking tasks with a unified\n model and parameter set"},{"id":"http://arxiv.org/abs/2402.17464v3","updated":"2024-03-27T03:13:52Z","published":"2024-02-27T12:42:06Z","title":"Generative 3D Part Assembly via Part-Whole-Hierarchy Message Passing","summary":" Generative 3D part assembly involves understanding part relationships and\npredicting their 6-DoF poses for assembling a realistic 3D shape. Prior work\noften focus on the geometry of individual parts, neglecting part-whole\nhierarchies of objects. Leveraging two key observations: 1) super-part poses\nprovide strong hints about part poses, and 2) predicting super-part poses is\neasier due to fewer superparts, we propose a part-whole-hierarchy message\npassing network for efficient 3D part assembly. We first introduce super-parts\nby grouping geometrically similar parts without any semantic labels. Then we\nemploy a part-whole hierarchical encoder, wherein a super-part encoder predicts\nlatent super-part poses based on input parts. Subsequently, we transform the\npoint cloud using the latent poses, feeding it to the part encoder for\naggregating super-part information and reasoning about part relationships to\npredict all part poses. In training, only ground-truth part poses are required.\nDuring inference, the predicted latent poses of super-parts enhance\ninterpretability. Experimental results on the PartNet dataset show that our\nmethod achieves state-of-the-art performance in part and connectivity accuracy\nand enables an interpretable hierarchical part assembly. Code is available at\nhttps://github.com/pkudba/3DHPA.\n","authors":["Bi'an Du","Xiang Gao","Wei Hu","Renjie Liao"],"pdf_url":"https://arxiv.org/pdf/2402.17464v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16421v2","updated":"2024-03-27T03:07:20Z","published":"2023-09-28T13:12:18Z","title":"Distilling ODE Solvers of Diffusion Models into Smaller Steps","summary":" Abstract Diffusion models have recently gained prominence as a novel category\nof generative models. Despite their success, these models face a notable\ndrawback in terms of slow sampling speeds, requiring a high number of function\nevaluations (NFE) in the order of hundreds or thousands. In response, both\nlearning-free and learning-based sampling strategies have been explored to\nexpedite the sampling process. Learning-free sampling employs various ordinary\ndifferential equation (ODE) solvers based on the formulation of diffusion ODEs.\nHowever, it encounters challenges in faithfully tracking the true sampling\ntrajectory, particularly for small NFE. Conversely, learning-based sampling\nmethods, such as knowledge distillation, demand extensive additional training,\nlimiting their practical applicability. To overcome these limitations, we\nintroduce Distilled-ODE solvers (D-ODE solvers), a straightforward distillation\napproach grounded in ODE solver formulations. 
Our method seamlessly integrates\nthe strengths of both learning-free and learning-based sampling. D-ODE solvers\nare constructed by introducing a single parameter adjustment to existing ODE\nsolvers. Furthermore, we optimize D-ODE solvers with smaller steps using\nknowledge distillation from ODE solvers with larger steps across a batch of\nsamples. Comprehensive experiments demonstrate the superior performance of\nD-ODE solvers compared to existing ODE solvers, including DDIM, PNDM,\nDPM-Solver, DEIS, and EDM, particularly in scenarios with fewer NFE. Notably,\nour method incurs negligible computational overhead compared to previous\ndistillation techniques, facilitating straightforward and rapid integration\nwith existing samplers. Qualitative analysis reveals that D-ODE solvers not\nonly enhance image quality but also faithfully follow the target ODE\ntrajectory.\n","authors":["Sanghwan Kim","Hao Tang","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2309.16421v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04181v2","updated":"2024-03-27T02:51:24Z","published":"2023-10-06T11:53:04Z","title":"DiffPrompter: Differentiable Implicit Visual Prompts for\n Semantic-Segmentation in Adverse Conditions","summary":" Semantic segmentation in adverse weather scenarios is a critical task for\nautonomous driving systems. While foundation models have shown promise, the\nneed for specialized adaptors becomes evident for handling more challenging\nscenarios. We introduce DiffPrompter, a novel differentiable visual and latent\nprompting mechanism aimed at expanding the learning capabilities of existing\nadaptors in foundation models. Our proposed $\\nabla$HFC image processing block\nexcels particularly in adverse weather conditions, where conventional methods\noften fall short. Furthermore, we investigate the advantages of jointly\ntraining visual and latent prompts, demonstrating that this combined approach\nsignificantly enhances performance in out-of-distribution scenarios. Our\ndifferentiable visual prompts leverage parallel and series architectures to\ngenerate prompts, effectively improving object segmentation tasks in adverse\nconditions. Through a comprehensive series of experiments and evaluations, we\nprovide empirical evidence to support the efficacy of our approach. Project\npage at https://diffprompter.github.io.\n","authors":["Sanket Kalwar","Mihir Ungarala","Shruti Jain","Aaron Monis","Krishna Reddy Konda","Sourav Garg","K Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2310.04181v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17343v2","updated":"2024-03-27T02:49:16Z","published":"2024-03-26T03:05:20Z","title":"Language Models are Free Boosters for Biomedical Imaging Tasks","summary":" In this study, we uncover the unexpected efficacy of residual-based large\nlanguage models (LLMs) as part of encoders for biomedical imaging tasks, a\ndomain traditionally devoid of language or textual data. The approach diverges\nfrom established methodologies by utilizing a frozen transformer block,\nextracted from pre-trained LLMs, as an innovative encoder layer for the direct\nprocessing of visual tokens. This strategy represents a significant departure\nfrom the standard multi-modal vision-language frameworks, which typically hinge\non language-driven prompts and inputs. 
We found that these LLMs could boost\nperformance across a spectrum of biomedical imaging applications, including\nboth 2D and 3D visual classification tasks, serving as plug-and-play boosters.\nMore interestingly, as a byproduct, we found that the proposed framework\nachieved superior performance, setting new state-of-the-art results on\nextensive, standardized datasets in MedMNIST-2D and 3D. Through this work, we\naim to open new avenues for employing LLMs in biomedical imaging and enriching\nthe understanding of their potential in this specialized domain.\n","authors":["Zhixin Lai","Jing Wu","Suiyao Chen","Yucheng Zhou","Naira Hovakimyan"],"pdf_url":"https://arxiv.org/pdf/2403.17343v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18211v1","updated":"2024-03-27T02:42:52Z","published":"2024-03-27T02:42:52Z","title":"NeuroPictor: Refining fMRI-to-Image Reconstruction via Multi-individual\n Pretraining and Multi-level Modulation","summary":" Recent fMRI-to-image approaches mainly focused on associating fMRI signals\nwith specific conditions of pre-trained diffusion models. These approaches,\nwhile producing high-quality images, capture only a limited aspect of the\ncomplex information in fMRI signals and offer little detailed control over\nimage creation. In contrast, this paper proposes to directly modulate the\ngeneration process of diffusion models using fMRI signals. Our approach,\nNeuroPictor, divides the fMRI-to-image process into three steps: i) fMRI\ncalibrated-encoding, to tackle multi-individual pre-training for a shared\nlatent space to minimize individual difference and enable the subsequent\ncross-subject training; ii) fMRI-to-image cross-subject pre-training,\nperceptually learning to guide diffusion model with high- and low-level\nconditions across different individuals; iii) fMRI-to-image single-subject\nrefining, similar with step ii but focus on adapting to particular individual.\nNeuroPictor extracts high-level semantic features from fMRI signals that\ncharacterizing the visual stimulus and incrementally fine-tunes the diffusion\nmodel with a low-level manipulation network to provide precise structural\ninstructions. By training with over 60,000 fMRI-image pairs from various\nindividuals, our model enjoys superior fMRI-to-image decoding capacity,\nparticularly in the within-subject setting, as evidenced in benchmark datasets.\nProject page: https://jingyanghuo.github.io/neuropictor/.\n","authors":["Jingyang Huo","Yikai Wang","Xuelin Qian","Yun Wang","Chong Li","Jianfeng Feng","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2403.18211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18208v1","updated":"2024-03-27T02:39:23Z","published":"2024-03-27T02:39:23Z","title":"An Evolutionary Network Architecture Search Framework with Adaptive\n Multimodal Fusion for Hand Gesture Recognition","summary":" Hand gesture recognition (HGR) based on multimodal data has attracted\nconsiderable attention owing to its great potential in applications. Various\nmanually designed multimodal deep networks have performed well in multimodal\nHGR (MHGR), but most of existing algorithms require a lot of expert experience\nand time-consuming manual trials. To address these issues, we propose an\nevolutionary network architecture search framework with the adaptive multimodel\nfusion (AMF-ENAS). 
Specifically, we design an encoding space that\nsimultaneously considers fusion positions and ratios of the multimodal data,\nallowing for the automatic construction of multimodal networks with different\narchitectures through decoding. Additionally, we consider three input streams\ncorresponding to intra-modal surface electromyography (sEMG), intra-modal\naccelerometer (ACC), and inter-modal sEMG-ACC. To automatically adapt to\nvarious datasets, the ENAS framework is designed to automatically search a MHGR\nnetwork with appropriate fusion positions and ratios. To the best of our\nknowledge, this is the first time that ENAS has been utilized in MHGR to tackle\nissues related to the fusion position and ratio of multimodal data.\nExperimental results demonstrate that AMF-ENAS achieves state-of-the-art\nperformance on the Ninapro DB2, DB3, and DB7 datasets.\n","authors":["Yizhang Xia","Shihao Song","Zhanglu Hou","Junwen Xu","Juan Zou","Yuan Liu","Shengxiang Yang"],"pdf_url":"https://arxiv.org/pdf/2403.18208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18207v1","updated":"2024-03-27T02:35:36Z","published":"2024-03-27T02:35:36Z","title":"Road Obstacle Detection based on Unknown Objectness Scores","summary":" The detection of unknown traffic obstacles is vital to ensure safe autonomous\ndriving. The standard object-detection methods cannot identify unknown objects\nthat are not included under predefined categories. This is because\nobject-detection methods are trained to assign a background label to pixels\ncorresponding to the presence of unknown objects. To address this problem, the\npixel-wise anomaly-detection approach has attracted increased research\nattention. Anomaly-detection techniques, such as uncertainty estimation and\nperceptual difference from reconstructed images, make it possible to identify\npixels of unknown objects as out-of-distribution (OoD) samples. However, when\napplied to images with many unknowns and complex components, such as driving\nscenes, these methods often exhibit unstable performance. The purpose of this\nstudy is to achieve stable performance for detecting unknown objects by\nincorporating the object-detection fashions into the pixel-wise anomaly\ndetection methods. To achieve this goal, we adopt a semantic-segmentation\nnetwork with a sigmoid head that simultaneously provides pixel-wise anomaly\nscores and objectness scores. Our experimental results show that the objectness\nscores play an important role in improving the detection performance. Based on\nthese results, we propose a novel anomaly score by integrating these two\nscores, which we term as unknown objectness score. Quantitative evaluations\nshow that the proposed method outperforms state-of-the-art methods when applied\nto the publicly available datasets.\n","authors":["Chihiro Noguchi","Toshiaki Ohgushi","Masao Yamanaka"],"pdf_url":"https://arxiv.org/pdf/2403.18207v1.pdf","comment":"ICRA 2024"},{"id":"http://arxiv.org/abs/2403.10066v3","updated":"2024-03-27T02:25:51Z","published":"2024-03-15T07:16:07Z","title":"Contrastive Pre-Training with Multi-View Fusion for No-Reference Point\n Cloud Quality Assessment","summary":" No-reference point cloud quality assessment (NR-PCQA) aims to automatically\nevaluate the perceptual quality of distorted point clouds without available\nreference, which have achieved tremendous improvements due to the utilization\nof deep neural networks. 
However, learning-based NR-PCQA methods suffer from\nthe scarcity of labeled data and usually perform suboptimally in terms of\ngeneralization. To solve the problem, we propose a novel contrastive\npre-training framework tailored for PCQA (CoPA), which enables the pre-trained\nmodel to learn quality-aware representations from unlabeled data. To obtain\nanchors in the representation space, we project point clouds with different\ndistortions into images and randomly mix their local patches to form mixed\nimages with multiple distortions. Utilizing the generated anchors, we constrain\nthe pre-training process via a quality-aware contrastive loss following the\nphilosophy that perceptual quality is closely related to both content and\ndistortion. Furthermore, in the model fine-tuning stage, we propose a\nsemantic-guided multi-view fusion module to effectively integrate the features\nof projected images from multiple perspectives. Extensive experiments show that\nour method outperforms the state-of-the-art PCQA methods on popular benchmarks.\nFurther investigations demonstrate that CoPA can also benefit existing\nlearning-based PCQA models.\n","authors":["Ziyu Shan","Yujie Zhang","Qi Yang","Haichen Yang","Yiling Xu","Jenq-Neng Hwang","Xiaozhong Xu","Shan Liu"],"pdf_url":"https://arxiv.org/pdf/2403.10066v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18201v1","updated":"2024-03-27T02:24:00Z","published":"2024-03-27T02:24:00Z","title":"Few-shot Online Anomaly Detection and Segmentation","summary":" Detecting anomaly patterns from images is a crucial artificial intelligence\ntechnique in industrial applications. Recent research in this domain has\nemphasized the necessity of a large volume of training data, overlooking the\npractical scenario where, post-deployment of the model, unlabeled data\ncontaining both normal and abnormal samples can be utilized to enhance the\nmodel's performance. Consequently, this paper focuses on addressing the\nchallenging yet practical few-shot online anomaly detection and segmentation\n(FOADS) task. Under the FOADS framework, models are trained on a few-shot\nnormal dataset, followed by inspection and improvement of their capabilities by\nleveraging unlabeled streaming data containing both normal and abnormal samples\nsimultaneously.\n To tackle this issue, we propose modeling the feature distribution of normal\nimages using a Neural Gas network, which offers the flexibility to adapt the\ntopology structure to identify outliers in the data flow. In order to achieve\nimproved performance with limited training samples, we employ multi-scale\nfeature embedding extracted from a CNN pre-trained on ImageNet to obtain a\nrobust representation. Furthermore, we introduce an algorithm that can\nincrementally update parameters without the need to store previous samples.\nComprehensive experimental results demonstrate that our method can achieve\nsubstantial performance under the FOADS setting, while ensuring that the time\ncomplexity remains within an acceptable range on MVTec AD and BTAD datasets.\n","authors":["Shenxing Wei","Xing Wei","Zhiheng Ma","Songlin Dong","Shaochen Zhang","Yihong Gong"],"pdf_url":"https://arxiv.org/pdf/2403.18201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00154v2","updated":"2024-03-27T02:21:03Z","published":"2024-02-29T22:11:20Z","title":"LLMs in Political Science: Heralding a New Era of Visual Analysis","summary":" Interest is increasing among political scientists in leveraging the extensive\ninformation available in images. 
However, the challenge of interpreting these\nimages lies in the need for specialized knowledge in computer vision and access\nto specialized hardware. As a result, image analysis has been limited to a\nrelatively small group within the political science community. This landscape\ncould potentially change thanks to the rise of large language models (LLMs).\nThis paper aims to raise awareness of the feasibility of using Gemini for image\ncontent analysis. A retrospective analysis was conducted on a corpus of 688\nimages. Content reports were elicited from Gemini for each image and then\nmanually evaluated by the authors. We find that Gemini is highly accurate in\nperforming object detection, which is arguably the most common and fundamental\ntask in image analysis for political scientists. Equally important, we show\nthat it is easy to implement as the entire command consists of a single prompt\nin natural language; it is fast to run and should meet the time budget of most\nresearchers; and it is free to use and does not require any specialized\nhardware. In addition, we illustrate how political scientists can leverage\nGemini for other image understanding tasks, including face identification,\nsentiment analysis, and caption generation. Our findings suggest that Gemini\nand other similar LLMs have the potential to drastically stimulate and\naccelerate image research in political science and social sciences more\nbroadly.\n","authors":["Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2403.00154v2.pdf","comment":"7 pages, 3 tables"},{"id":"http://arxiv.org/abs/2403.18198v1","updated":"2024-03-27T02:16:04Z","published":"2024-03-27T02:16:04Z","title":"Generative Medical Segmentation","summary":" Rapid advancements in medical image segmentation performance have been\nsignificantly driven by the development of Convolutional Neural Networks (CNNs)\nand Vision Transformers (ViTs). However, these models introduce high\ncomputational demands and often have limited ability to generalize across\ndiverse medical imaging datasets. In this manuscript, we introduce Generative\nMedical Segmentation (GMS), a novel approach leveraging a generative model for\nimage segmentation. Concretely, GMS employs a robust pre-trained Variational\nAutoencoder (VAE) to derive latent representations of both images and masks,\nfollowed by a mapping model that learns the transition from image to mask in\nthe latent space. This process culminates in generating a precise segmentation\nmask within the image space using the pre-trained VAE decoder. The design of\nGMS leads to fewer learnable parameters in the model, resulting in a reduced\ncomputational burden and enhanced generalization capability. Our extensive\nexperimental analysis across five public datasets in different medical imaging\ndomains demonstrates GMS outperforms existing discriminative segmentation\nmodels and has remarkable domain generalization. Our experiments suggest GMS\ncould set a new benchmark for medical image segmentation, offering a scalable\nand effective solution. 
GMS implementation and model weights are available at\nhttps://github.com/King-HAW/GMS.\n","authors":["Jiayu Huo","Xi Ouyang","Sébastien Ourselin","Rachel Sparks"],"pdf_url":"https://arxiv.org/pdf/2403.18198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18196v1","updated":"2024-03-27T02:13:20Z","published":"2024-03-27T02:13:20Z","title":"Looking Beyond What You See: An Empirical Analysis on Subgroup\n Intersectional Fairness for Multi-label Chest X-ray Classification Using\n Social Determinants of Racial Health Inequities","summary":" There has been significant progress in implementing deep learning models in\ndisease diagnosis using chest X- rays. Despite these advancements, inherent\nbiases in these models can lead to disparities in prediction accuracy across\nprotected groups. In this study, we propose a framework to achieve accurate\ndiagnostic outcomes and ensure fairness across intersectional groups in\nhigh-dimensional chest X- ray multi-label classification. Transcending\ntraditional protected attributes, we consider complex interactions within\nsocial determinants, enabling a more granular benchmark and evaluation of\nfairness. We present a simple and robust method that involves retraining the\nlast classification layer of pre-trained models using a balanced dataset across\ngroups. Additionally, we account for fairness constraints and integrate\nclass-balanced fine-tuning for multi-label settings. The evaluation of our\nmethod on the MIMIC-CXR dataset demonstrates that our framework achieves an\noptimal tradeoff between accuracy and fairness compared to baseline methods.\n","authors":["Dana Moukheiber","Saurabh Mahindre","Lama Moukheiber","Mira Moukheiber","Mingchen Gao"],"pdf_url":"https://arxiv.org/pdf/2403.18196v1.pdf","comment":"ICCV CVAMD 2023"},{"id":"http://arxiv.org/abs/2403.18193v1","updated":"2024-03-27T02:06:25Z","published":"2024-03-27T02:06:25Z","title":"Middle Fusion and Multi-Stage, Multi-Form Prompts for Robust RGB-T\n Tracking","summary":" RGB-T tracking, a vital downstream task of object tracking, has made\nremarkable progress in recent years. Yet, it remains hindered by two major\nchallenges: 1) the trade-off between performance and efficiency; 2) the\nscarcity of training data. To address the latter challenge, some recent methods\nemploy prompts to fine-tune pre-trained RGB tracking models and leverage\nupstream knowledge in a parameter-efficient manner. However, these methods\ninadequately explore modality-independent patterns and disregard the dynamic\nreliability of different modalities in open scenarios. We propose M3PT, a novel\nRGB-T prompt tracking method that leverages middle fusion and multi-modal and\nmulti-stage visual prompts to overcome these challenges. We pioneer the use of\nthe middle fusion framework for RGB-T tracking, which achieves a balance\nbetween performance and efficiency. Furthermore, we incorporate the pre-trained\nRGB tracking model into the framework and utilize multiple flexible prompt\nstrategies to adapt the pre-trained model to the comprehensive exploration of\nuni-modal patterns and the improved modeling of fusion-modal features,\nharnessing the potential of prompt learning in RGB-T tracking. 
Our method\noutperforms the state-of-the-art methods on four challenging benchmarks, while\nattaining 46.1 fps inference speed.\n","authors":["Qiming Wang","Yongqiang Bai","Hongxing Song"],"pdf_url":"https://arxiv.org/pdf/2403.18193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00211v2","updated":"2024-03-27T01:50:06Z","published":"2024-03-01T01:07:40Z","title":"Trustworthy Self-Attention: Enabling the Network to Focus Only on the\n Most Relevant References","summary":" The prediction of optical flow for occluded points is still a difficult\nproblem that has not yet been solved. Recent methods use self-attention to find\nrelevant non-occluded points as references for estimating the optical flow of\noccluded points based on the assumption of self-similarity. However, they rely\non visual features of a single image and weak constraints, which are not\nsufficient to constrain the trained network to focus on erroneous and weakly\nrelevant reference points. We make full use of online occlusion recognition\ninformation to construct occlusion extended visual features and two strong\nconstraints, allowing the network to learn to focus only on the most relevant\nreferences without requiring occlusion ground truth to participate in the\ntraining of the network. Our method adds very few network parameters to the\noriginal framework, making it very lightweight. Extensive experiments show that\nour model has the greatest cross-dataset generalization. Our method achieves\nmuch greater error reduction, 18.6%, 16.2%, and 20.1% for all points,\nnon-occluded points, and occluded points respectively from the state-of-the-art\nGMA-base method, MATCHFlow(GMA), on Sintel Albedo pass. Furthermore, our model\nachieves state-of-the-art performance on the Sintel bench-marks, ranking \\#1\namong all published methods on Sintel clean pass. The code will be open-source.\n","authors":["Yu Jing","Tan Yujuan","Ren Ao","Liu Duo"],"pdf_url":"https://arxiv.org/pdf/2403.00211v2.pdf","comment":"Correct Figure 1"},{"id":"http://arxiv.org/abs/2403.18187v1","updated":"2024-03-27T01:40:21Z","published":"2024-03-27T01:40:21Z","title":"LayoutFlow: Flow Matching for Layout Generation","summary":" Finding a suitable layout represents a crucial task for diverse applications\nin graphic design. Motivated by simpler and smoother sampling trajectories, we\nexplore the use of Flow Matching as an alternative to current diffusion-based\nlayout generation models. Specifically, we propose LayoutFlow, an efficient\nflow-based model capable of generating high-quality layouts. Instead of\nprogressively denoising the elements of a noisy layout, our method learns to\ngradually move, or flow, the elements of an initial sample until it reaches its\nfinal prediction. In addition, we employ a conditioning scheme that allows us\nto handle various generation tasks with varying degrees of conditioning with a\nsingle model. Empirically, LayoutFlow performs on par with state-of-the-art\nmodels while being significantly faster.\n","authors":["Julian Jorge Andrade Guerreiro","Naoto Inoue","Kento Masui","Mayu Otani","Hideki Nakayama"],"pdf_url":"https://arxiv.org/pdf/2403.18187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09069v2","updated":"2024-03-27T01:32:10Z","published":"2024-03-14T03:21:33Z","title":"Dyadic Interaction Modeling for Social Behavior Generation","summary":" Human-human communication is like a delicate dance where listeners and\nspeakers concurrently interact to maintain conversational dynamics. 
Hence, an\neffective model for generating listener nonverbal behaviors requires\nunderstanding the dyadic context and interaction. In this paper, we present an\neffective framework for creating 3D facial motions in dyadic interactions.\nExisting work consider a listener as a reactive agent with reflexive behaviors\nto the speaker's voice and facial motions. The heart of our framework is Dyadic\nInteraction Modeling (DIM), a pre-training approach that jointly models\nspeakers' and listeners' motions through masking and contrastive learning to\nlearn representations that capture the dyadic context. To enable the generation\nof non-deterministic behaviors, we encode both listener and speaker motions\ninto discrete latent representations, through VQ-VAE. The pre-trained model is\nfurther fine-tuned for motion generation. Extensive experiments demonstrate the\nsuperiority of our framework in generating listener motions, establishing a new\nstate-of-the-art according to the quantitative measures capturing the diversity\nand realism of generated motions. Qualitative results demonstrate the superior\ncapabilities of the proposed approach in generating diverse and realistic\nexpressions, eye blinks and head gestures.\n","authors":["Minh Tran","Di Chang","Maksim Siniukov","Mohammad Soleymani"],"pdf_url":"https://arxiv.org/pdf/2403.09069v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18186v1","updated":"2024-03-27T01:28:36Z","published":"2024-03-27T01:28:36Z","title":"Don't Look into the Dark: Latent Codes for Pluralistic Image Inpainting","summary":" We present a method for large-mask pluralistic image inpainting based on the\ngenerative framework of discrete latent codes. Our method learns latent priors,\ndiscretized as tokens, by only performing computations at the visible locations\nof the image. This is realized by a restrictive partial encoder that predicts\nthe token label for each visible block, a bidirectional transformer that infers\nthe missing labels by only looking at these tokens, and a dedicated synthesis\nnetwork that couples the tokens with the partial image priors to generate\ncoherent and pluralistic complete image even under extreme mask settings.\nExperiments on public benchmarks validate our design choices as the proposed\nmethod outperforms strong baselines in both visual quality and diversity\nmetrics.\n","authors":["Haiwei Chen","Yajie Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.18186v1.pdf","comment":"cvpr 2024"},{"id":"http://arxiv.org/abs/2403.18180v1","updated":"2024-03-27T01:15:05Z","published":"2024-03-27T01:15:05Z","title":"Multi-Layer Dense Attention Decoder for Polyp Segmentation","summary":" Detecting and segmenting polyps is crucial for expediting the diagnosis of\ncolon cancer. This is a challenging task due to the large variations of polyps\nin color, texture, and lighting conditions, along with subtle differences\nbetween the polyp and its surrounding area. Recently, vision Transformers have\nshown robust abilities in modeling global context for polyp segmentation.\nHowever, they face two major limitations: the inability to learn local\nrelations among multi-level layers and inadequate feature aggregation in the\ndecoder. To address these issues, we propose a novel decoder architecture aimed\nat hierarchically aggregating locally enhanced multi-level dense features.\nSpecifically, we introduce a novel module named Dense Attention Gate (DAG),\nwhich adaptively fuses all previous layers' features to establish local feature\nrelations among all layers. 
Furthermore, we propose a novel nested decoder\narchitecture that hierarchically aggregates decoder features, thereby enhancing\nsemantic features. We incorporate our novel dense decoder with the PVT backbone\nnetwork and conduct evaluations on five polyp segmentation datasets: Kvasir,\nCVC-300, CVC-ColonDB, CVC-ClinicDB, and ETIS. Our experiments and comparisons\nwith nine competing segmentation models demonstrate that the proposed\narchitecture achieves state-of-the-art performance and outperforms the previous\nmodels on four datasets. The source code is available at:\nhttps://github.com/krushi1992/Dense-Decoder.\n","authors":["Krushi Patel","Fengjun Li","Guanghui Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18178v1","updated":"2024-03-27T01:12:31Z","published":"2024-03-27T01:12:31Z","title":"Online Embedding Multi-Scale CLIP Features into 3D Maps","summary":" This study introduces a novel approach to online embedding of multi-scale\nCLIP (Contrastive Language-Image Pre-Training) features into 3D maps. By\nharnessing CLIP, this methodology surpasses the constraints of conventional\nvocabulary-limited methods and enables the incorporation of semantic\ninformation into the resultant maps. While recent approaches have explored the\nembedding of multi-modal features in maps, they often impose significant\ncomputational costs, lacking practicality for exploring unfamiliar environments\nin real time. Our approach tackles these challenges by efficiently computing\nand embedding multi-scale CLIP features, thereby facilitating the exploration\nof unfamiliar environments through real-time map generation. Moreover, the\nembedding CLIP features into the resultant maps makes offline retrieval via\nlinguistic queries feasible. In essence, our approach simultaneously achieves\nreal-time object search and mapping of unfamiliar environments. Additionally,\nwe propose a zero-shot object-goal navigation system based on our mapping\napproach, and we validate its efficacy through object-goal navigation, offline\nobject retrieval, and multi-object-goal navigation in both simulated\nenvironments and real robot experiments. The findings demonstrate that our\nmethod not only exhibits swifter performance than state-of-the-art mapping\nmethods but also surpasses them in terms of the success rate of object-goal\nnavigation tasks.\n","authors":["Shun Taguchi","Hideki Deguchi"],"pdf_url":"https://arxiv.org/pdf/2403.18178v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.13729v2","updated":"2024-03-27T00:51:01Z","published":"2024-02-21T11:46:16Z","title":"Hybrid Video Diffusion Models with 2D Triplane and 3D Wavelet\n Representation","summary":" Generating high-quality videos that synthesize desired realistic content is a\nchallenging task due to their intricate high-dimensionality and complexity of\nvideos. Several recent diffusion-based methods have shown comparable\nperformance by compressing videos to a lower-dimensional latent space, using\ntraditional video autoencoder architecture. However, such method that employ\nstandard frame-wise 2D and 3D convolution fail to fully exploit the\nspatio-temporal nature of videos. To address this issue, we propose a novel\nhybrid video diffusion model, called HVDM, which can capture spatio-temporal\ndependencies more effectively. 
The HVDM is trained by a hybrid video\nautoencoder which extracts a disentangled representation of the video\nincluding: (i) a global context information captured by a 2D projected latent\n(ii) a local volume information captured by 3D convolutions with wavelet\ndecomposition (iii) a frequency information for improving the video\nreconstruction. Based on this disentangled representation, our hybrid\nautoencoder provide a more comprehensive video latent enriching the generated\nvideos with fine structures and details. Experiments on video generation\nbenchamarks (UCF101, SkyTimelapse, and TaiChi) demonstrate that the proposed\napproach achieves state-of-the-art video generation quality, showing a wide\nrange of video applications (e.g., long video generation, image-to-video, and\nvideo dynamics control).\n","authors":["Kihong Kim","Haneol Lee","Jihye Park","Seyeon Kim","Kwanghee Lee","Seungryong Kim","Jaejun Yoo"],"pdf_url":"https://arxiv.org/pdf/2402.13729v2.pdf","comment":"17 pages, 13 figures"},{"id":"http://arxiv.org/abs/2401.17098v2","updated":"2024-03-27T00:46:26Z","published":"2024-01-30T15:29:32Z","title":"Deep Learning-Driven Approach for Handwritten Chinese Character\n Classification","summary":" Handwritten character recognition (HCR) is a challenging problem for machine\nlearning researchers. Unlike printed text data, handwritten character datasets\nhave more variation due to human-introduced bias. With numerous unique\ncharacter classes present, some data, such as Logographic Scripts or\nSino-Korean character sequences, bring new complications to the HCR problem.\nThe classification task on such datasets requires the model to learn\nhigh-complexity details of the images that share similar features. With recent\nadvances in computational resource availability and further computer vision\ntheory development, some research teams have effectively addressed the arising\nchallenges. Although known for achieving high accuracy while keeping the number\nof parameters small, many common approaches are still not generalizable and use\ndataset-specific solutions to achieve better results. Due to complex structure,\nexisting methods frequently prevent the solutions from gaining popularity. This\npaper proposes a highly scalable approach for detailed character image\nclassification by introducing the model architecture, data preprocessing steps,\nand testing design instructions. We also perform experiments to compare the\nperformance of our method with that of existing ones to show the improvements\nachieved.\n","authors":["Boris Kriuk","Fedor Kriuk"],"pdf_url":"https://arxiv.org/pdf/2401.17098v2.pdf","comment":"30 pages, 9 figures, 2 tables, preprint v2"},{"id":"http://arxiv.org/abs/2403.15931v3","updated":"2024-03-27T23:57:47Z","published":"2024-03-23T20:30:28Z","title":"X-Portrait: Expressive Portrait Animation with Hierarchical Motion\n Attention","summary":" We propose X-Portrait, an innovative conditional diffusion model tailored for\ngenerating expressive and temporally coherent portrait animation. Specifically,\ngiven a single portrait as appearance reference, we aim to animate it with\nmotion derived from a driving video, capturing both highly dynamic and subtle\nfacial expressions along with wide-range head movements. As its core, we\nleverage the generative prior of a pre-trained diffusion model as the rendering\nbackbone, while achieve fine-grained head pose and expression control with\nnovel controlling signals within the framework of ControlNet. 
In contrast to\nconventional coarse explicit controls such as facial landmarks, our motion\ncontrol module is learned to interpret the dynamics directly from the original\ndriving RGB inputs. The motion accuracy is further enhanced with a patch-based\nlocal control module that effectively enhance the motion attention to\nsmall-scale nuances like eyeball positions. Notably, to mitigate the identity\nleakage from the driving signals, we train our motion control modules with\nscaling-augmented cross-identity images, ensuring maximized disentanglement\nfrom the appearance reference modules. Experimental results demonstrate the\nuniversal effectiveness of X-Portrait across a diverse range of facial\nportraits and expressive driving sequences, and showcase its proficiency in\ngenerating captivating portrait animations with consistently maintained\nidentity characteristics.\n","authors":["You Xie","Hongyi Xu","Guoxian Song","Chao Wang","Yichun Shi","Linjie Luo"],"pdf_url":"https://arxiv.org/pdf/2403.15931v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12627v2","updated":"2024-03-27T23:54:26Z","published":"2023-06-22T01:33:47Z","title":"Targeted collapse regularized autoencoder for anomaly detection: black\n hole at the center","summary":" Autoencoders have been extensively used in the development of recent anomaly\ndetection techniques. The premise of their application is based on the notion\nthat after training the autoencoder on normal training data, anomalous inputs\nwill exhibit a significant reconstruction error. Consequently, this enables a\nclear differentiation between normal and anomalous samples. In practice,\nhowever, it is observed that autoencoders can generalize beyond the normal\nclass and achieve a small reconstruction error on some of the anomalous\nsamples. To improve the performance, various techniques propose additional\ncomponents and more sophisticated training procedures. In this work, we propose\na remarkably straightforward alternative: instead of adding neural network\ncomponents, involved computations, and cumbersome training, we complement the\nreconstruction loss with a computationally light term that regulates the norm\nof representations in the latent space. The simplicity of our approach\nminimizes the requirement for hyperparameter tuning and customization for new\napplications which, paired with its permissive data modality constraint,\nenhances the potential for successful adoption across a broad range of\napplications. We test the method on various visual and tabular benchmarks and\ndemonstrate that the technique matches and frequently outperforms more complex\nalternatives. We further demonstrate that implementing this idea in the context\nof state-of-the-art methods can further improve their performance. We also\nprovide a theoretical analysis and numerical simulations that help demonstrate\nthe underlying process that unfolds during training and how it helps with\nanomaly detection. 
This mitigates the black-box nature of autoencoder-based\nanomaly detection algorithms and offers an avenue for further investigation of\nadvantages, fail cases, and potential new directions.\n","authors":["Amin Ghafourian","Huanyi Shui","Devesh Upadhyay","Rajesh Gupta","Dimitar Filev","Iman Soltani Bozchalooi"],"pdf_url":"https://arxiv.org/pdf/2306.12627v2.pdf","comment":"18 pages, 4 figures, 8 tables"},{"id":"http://arxiv.org/abs/2311.13099v2","updated":"2024-03-27T23:49:07Z","published":"2023-11-22T01:58:26Z","title":"PIE-NeRF: Physics-based Interactive Elastodynamics with NeRF","summary":" We show that physics-based simulations can be seamlessly integrated with NeRF\nto generate high-quality elastodynamics of real-world objects. Unlike existing\nmethods, we discretize nonlinear hyperelasticity in a meshless way, obviating\nthe necessity for intermediate auxiliary shape proxies like a tetrahedral mesh\nor voxel grid. A quadratic generalized moving least square (Q-GMLS) is employed\nto capture nonlinear dynamics and large deformation on the implicit model. Such\nmeshless integration enables versatile simulations of complex and codimensional\nshapes. We adaptively place the least-square kernels according to the NeRF\ndensity field to significantly reduce the complexity of the nonlinear\nsimulation. As a result, physically realistic animations can be conveniently\nsynthesized using our method for a wide range of hyperelastic materials at an\ninteractive rate. For more information, please visit our project page at\nhttps://fytalon.github.io/pienerf/.\n","authors":["Yutao Feng","Yintong Shang","Xuan Li","Tianjia Shao","Chenfanfu Jiang","Yin Yang"],"pdf_url":"https://arxiv.org/pdf/2311.13099v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17729v2","updated":"2024-03-27T23:33:15Z","published":"2024-02-27T18:01:59Z","title":"Towards Fairness-Aware Adversarial Learning","summary":" Although adversarial training (AT) has proven effective in enhancing the\nmodel's robustness, the recently revealed issue of fairness in robustness has\nnot been well addressed, i.e. the robust accuracy varies significantly among\ndifferent categories. In this paper, instead of uniformly evaluating the\nmodel's average class performance, we delve into the issue of robust fairness,\nby considering the worst-case distribution across various classes. We propose a\nnovel learning paradigm, named Fairness-Aware Adversarial Learning (FAAL). As a\ngeneralization of conventional AT, we re-define the problem of adversarial\ntraining as a min-max-max framework, to ensure both robustness and fairness of\nthe trained model. Specifically, by taking advantage of distributional robust\noptimization, our method aims to find the worst distribution among different\ncategories, and the solution is guaranteed to obtain the upper bound\nperformance with high probability. In particular, FAAL can fine-tune an unfair\nrobust model to be fair within only two epochs, without compromising the\noverall clean and robust accuracies. 
Extensive experiments on various image\ndatasets validate the superior performance and efficiency of the proposed FAAL\ncompared to other state-of-the-art methods.\n","authors":["Yanghao Zhang","Tianle Zhang","Ronghui Mu","Xiaowei Huang","Wenjie Ruan"],"pdf_url":"https://arxiv.org/pdf/2402.17729v2.pdf","comment":"This work will appear in the CVPR 2024 conference proceedings"},{"id":"http://arxiv.org/abs/2203.13883v6","updated":"2024-03-27T23:27:58Z","published":"2022-03-25T19:45:33Z","title":"Multi-modal Misinformation Detection: Approaches, Challenges and\n Opportunities","summary":" As social media platforms are evolving from text-based forums into\nmulti-modal environments, the nature of misinformation in social media is also\ntransforming accordingly. Taking advantage of the fact that visual modalities\nsuch as images and videos are more favorable and attractive to the users and\ntextual contents are sometimes skimmed carelessly, misinformation spreaders\nhave recently targeted contextual connections between the modalities e.g., text\nand image. Hence many researchers have developed automatic techniques for\ndetecting possible cross-modal discordance in web-based content. We analyze,\ncategorize and identify existing approaches in addition to challenges and\nshortcomings they face in order to unearth new research opportunities in the\nfield of multi-modal misinformation detection.\n","authors":["Sara Abdali","Sina shaham","Bhaskar Krishnamachari"],"pdf_url":"https://arxiv.org/pdf/2203.13883v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03160v2","updated":"2024-03-27T22:58:34Z","published":"2023-12-05T22:04:49Z","title":"HybridNeRF: Efficient Neural Rendering via Adaptive Volumetric Surfaces","summary":" Neural radiance fields provide state-of-the-art view synthesis quality but\ntend to be slow to render. One reason is that they make use of volume\nrendering, thus requiring many samples (and model queries) per ray at render\ntime. Although this representation is flexible and easy to optimize, most\nreal-world objects can be modeled more efficiently with surfaces instead of\nvolumes, requiring far fewer samples per ray. This observation has spurred\nconsiderable progress in surface representations such as signed distance\nfunctions, but these may struggle to model semi-opaque and thin structures. We\npropose a method, HybridNeRF, that leverages the strengths of both\nrepresentations by rendering most objects as surfaces while modeling the\n(typically) small fraction of challenging regions volumetrically. We evaluate\nHybridNeRF against the challenging Eyeful Tower dataset along with other\ncommonly used view synthesis datasets. When comparing to state-of-the-art\nbaselines, including recent rasterization-based approaches, we improve error\nrates by 15-30% while achieving real-time framerates (at least 36 FPS) for\nvirtual-reality resolutions (2Kx2K).\n","authors":["Haithem Turki","Vasu Agrawal","Samuel Rota Bulò","Lorenzo Porzi","Peter Kontschieder","Deva Ramanan","Michael Zollhöfer","Christian Richardt"],"pdf_url":"https://arxiv.org/pdf/2312.03160v2.pdf","comment":"CVPR 2024 Project page: https://haithemturki.com/hybrid-nerf/"},{"id":"http://arxiv.org/abs/2403.19046v1","updated":"2024-03-27T22:50:48Z","published":"2024-03-27T22:50:48Z","title":"LITA: Language Instructed Temporal-Localization Assistant","summary":" There has been tremendous progress in multimodal Large Language Models\n(LLMs). 
Recent works have extended these models to video input with promising\ninstruction following capabilities. However, an important missing piece is\ntemporal localization. These models cannot accurately answer the \"When?\"\nquestions. We identify three key aspects that limit their temporal localization\ncapabilities: (i) time representation, (ii) architecture, and (iii) data. We\naddress these shortcomings by proposing Language Instructed\nTemporal-Localization Assistant (LITA) with the following features: (1) We\nintroduce time tokens that encode timestamps relative to the video length to\nbetter represent time in videos. (2) We introduce SlowFast tokens in the\narchitecture to capture temporal information at fine temporal resolution. (3)\nWe emphasize temporal localization data for LITA. In addition to leveraging\nexisting video datasets with timestamps, we propose a new task, Reasoning\nTemporal Localization (RTL), along with the dataset, ActivityNet-RTL, for\nlearning and evaluating this task. Reasoning temporal localization requires\nboth the reasoning and temporal localization of Video LLMs. LITA demonstrates\nstrong performance on this challenging task, nearly doubling the temporal mean\nintersection-over-union (mIoU) of baselines. In addition, we show that our\nemphasis on temporal localization also substantially improves video-based text\ngeneration compared to existing Video LLMs, including a 36% relative\nimprovement of Temporal Understanding. Code is available at:\nhttps://github.com/NVlabs/LITA\n","authors":["De-An Huang","Shijia Liao","Subhashree Radhakrishnan","Hongxu Yin","Pavlo Molchanov","Zhiding Yu","Jan Kautz"],"pdf_url":"https://arxiv.org/pdf/2403.19046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19043v1","updated":"2024-03-27T22:36:02Z","published":"2024-03-27T22:36:02Z","title":"Illicit object detection in X-ray images using Vision Transformers","summary":" Illicit object detection is a critical task performed at various\nhigh-security locations, including airports, train stations, subways, and\nports. The continuous and tedious work of examining thousands of X-ray images\nper hour can be mentally taxing. Thus, Deep Neural Networks (DNNs) can be used\nto automate the X-ray image analysis process, improve efficiency and alleviate\nthe security officers' inspection burden. The neural architectures typically\nutilized in relevant literature are Convolutional Neural Networks (CNNs), with\nVision Transformers (ViTs) rarely employed. In order to address this gap, this\npaper conducts a comprehensive evaluation of relevant ViT architectures on\nillicit item detection in X-ray images. This study utilizes both Transformer\nand hybrid backbones, such as SWIN and NextViT, and detectors, such as DINO and\nRT-DETR. The results demonstrate the remarkable accuracy of the DINO\nTransformer detector in the low-data regime, the impressive real-time\nperformance of YOLOv8, and the effectiveness of the hybrid NextViT backbone.\n","authors":["Jorgen Cani","Ioannis Mademlis","Adamantia Anna Rebolledo Chrysochoou","Georgios Th. 
Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2403.19043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.04291v2","updated":"2024-03-27T22:33:13Z","published":"2022-02-09T05:57:08Z","title":"L2B: Learning to Bootstrap Robust Models for Combating Label Noise","summary":" Deep neural networks have shown great success in representation learning.\nHowever, when learning with noisy labels (LNL), they can easily overfit and\nfail to generalize to new data. This paper introduces a simple and effective\nmethod, named Learning to Bootstrap (L2B), which enables models to bootstrap\nthemselves using their own predictions without being adversely affected by\nerroneous pseudo-labels. It achieves this by dynamically adjusting the\nimportance weight between real observed and generated labels, as well as\nbetween different samples through meta-learning. Unlike existing instance\nreweighting methods, the key to our method lies in a new, versatile objective\nthat enables implicit relabeling concurrently, leading to significant\nimprovements without incurring additional costs.\n L2B offers several benefits over the baseline methods. It yields more robust\nmodels that are less susceptible to the impact of noisy labels by guiding the\nbootstrapping procedure more effectively. It better exploits the valuable\ninformation contained in corrupted instances by adapting the weights of both\ninstances and labels. Furthermore, L2B is compatible with existing LNL methods\nand delivers competitive results spanning natural and medical imaging tasks\nincluding classification and segmentation under both synthetic and real-world\nnoise. Extensive experiments demonstrate that our method effectively mitigates\nthe challenges of noisy labels, often necessitating few to no validation\nsamples, and is well generalized to other tasks such as image segmentation.\nThis not only positions it as a robust complement to existing LNL techniques\nbut also underscores its practical applicability. The code and models are\navailable at https://github.com/yuyinzhou/l2b.\n","authors":["Yuyin Zhou","Xianhang Li","Fengze Liu","Qingyue Wei","Xuxi Chen","Lequan Yu","Cihang Xie","Matthew P. Lungren","Lei Xing"],"pdf_url":"https://arxiv.org/pdf/2202.04291v2.pdf","comment":"CVPR 2024; code is available at https://github.com/yuyinzhou/l2b"},{"id":"http://arxiv.org/abs/2204.11970v3","updated":"2024-03-27T22:02:30Z","published":"2022-04-25T21:20:27Z","title":"Visual Acuity Prediction on Real-Life Patient Data Using a Machine\n Learning Based Multistage System","summary":" In ophthalmology, intravitreal operative medication therapy (IVOM) is a\nwidespread treatment for diseases related to the age-related macular\ndegeneration (AMD), the diabetic macular edema (DME), as well as the retinal\nvein occlusion (RVO). However, in real-world settings, patients often suffer\nfrom loss of vision on time scales of years despite therapy, whereas the\nprediction of the visual acuity (VA) and the earliest possible detection of\ndeterioration under real-life conditions is challenging due to heterogeneous\nand incomplete data. In this contribution, we present a workflow for the\ndevelopment of a research-compatible data corpus fusing different IT systems of\nthe department of ophthalmology of a German maximum care hospital. The\nextensive data corpus allows predictive statements of the expected progression\nof a patient and his or her VA in each of the three diseases. 
For the disease\nAMD, we found out a significant deterioration of the visual acuity over time.\nWithin our proposed multistage system, we subsequently classify the VA\nprogression into the three groups of therapy \"winners\", \"stabilizers\", and\n\"losers\" (WSL classification scheme). Our OCT biomarker classification using an\nensemble of deep neural networks results in a classification accuracy\n(F1-score) of over 98 %, enabling us to complete incomplete OCT documentations\nwhile allowing us to exploit them for a more precise VA modelling process. Our\nVA prediction requires at least four VA examinations and optionally OCT\nbiomarkers from the same time period to predict the VA progression within a\nforecasted time frame, whereas our prediction is currently restricted to IVOM /\nno therapy. We achieve a final prediction accuracy of 69 % in macro average\nF1-score, while being in the same range as the ophthalmologists with 57.8 and\n50 +- 10.7 % F1-score.\n","authors":["Tobias Schlosser","Frederik Beuth","Trixy Meyer","Arunodhayan Sampath Kumar","Gabriel Stolze","Olga Furashova","Katrin Engelmann","Danny Kowerko"],"pdf_url":"https://arxiv.org/pdf/2204.11970v3.pdf","comment":"Preprint for journal Scientific Reports (Springer)"},{"id":"http://arxiv.org/abs/2403.13171v2","updated":"2024-03-27T21:43:37Z","published":"2024-03-19T21:52:19Z","title":"LUWA Dataset: Learning Lithic Use-Wear Analysis on Microscopic Images","summary":" Lithic Use-Wear Analysis (LUWA) using microscopic images is an underexplored\nvision-for-science research area. It seeks to distinguish the worked material,\nwhich is critical for understanding archaeological artifacts, material\ninteractions, tool functionalities, and dental records. However, this\nchallenging task goes beyond the well-studied image classification problem for\ncommon objects. It is affected by many confounders owing to the complex wear\nmechanism and microscopic imaging, which makes it difficult even for human\nexperts to identify the worked material successfully. In this paper, we\ninvestigate the following three questions on this unique vision task for the\nfirst time:(i) How well can state-of-the-art pre-trained models (like DINOv2)\ngeneralize to the rarely seen domain? (ii) How can few-shot learning be\nexploited for scarce microscopic images? (iii) How do the ambiguous\nmagnification and sensing modality influence the classification accuracy? To\nstudy these, we collaborated with archaeologists and built the first\nopen-source and the largest LUWA dataset containing 23,130 microscopic images\nwith different magnifications and sensing modalities. Extensive experiments\nshow that existing pre-trained models notably outperform human experts but\nstill leave a large gap for improvements. Most importantly, the LUWA dataset\nprovides an underexplored opportunity for vision and learning communities and\ncomplements existing image classification problems on common objects.\n","authors":["Jing Zhang","Irving Fang","Juexiao Zhang","Hao Wu","Akshat Kaushik","Alice Rodriguez","Hanwen Zhao","Zhuo Zheng","Radu Iovita","Chen Feng"],"pdf_url":"https://arxiv.org/pdf/2403.13171v2.pdf","comment":"CVPR"},{"id":"http://arxiv.org/abs/2403.19026v1","updated":"2024-03-27T21:43:12Z","published":"2024-03-27T21:43:12Z","title":"Egocentric Scene-aware Human Trajectory Prediction","summary":" Wearable collaborative robots stand to assist human wearers who need fall\nprevention assistance or wear exoskeletons. 
Such a robot needs to be able to\npredict the ego motion of the wearer based on egocentric vision and the\nsurrounding scene. In this work, we leveraged body-mounted cameras and sensors\nto anticipate the trajectory of human wearers through complex surroundings. To\nfacilitate research in ego-motion prediction, we have collected a comprehensive\nwalking scene navigation dataset centered on the user's perspective. We present\na method to predict human motion conditioning on the surrounding static scene.\nOur method leverages a diffusion model to produce a distribution of potential\nfuture trajectories, taking into account the user's observation of the\nenvironment. We introduce a compact representation to encode the user's visual\nmemory of the surroundings, as well as an efficient sample-generating technique\nto speed up real-time inference of a diffusion model. We ablate our model and\ncompare it to baselines, and results show that our model outperforms existing\nmethods on key metrics of collision avoidance and trajectory mode coverage.\n","authors":["Weizhuo Wang","C. Karen Liu","Monroe Kennedy III"],"pdf_url":"https://arxiv.org/pdf/2403.19026v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2403.19022v1","updated":"2024-03-27T21:24:20Z","published":"2024-03-27T21:24:20Z","title":"WALT3D: Generating Realistic Training Data from Time-Lapse Imagery for\n Reconstructing Dynamic Objects under Occlusion","summary":" Current methods for 2D and 3D object understanding struggle with severe\nocclusions in busy urban environments, partly due to the lack of large-scale\nlabeled ground-truth annotations for learning occlusion. In this work, we\nintroduce a novel framework for automatically generating a large, realistic\ndataset of dynamic objects under occlusions using freely available time-lapse\nimagery. By leveraging off-the-shelf 2D (bounding box, segmentation, keypoint)\nand 3D (pose, shape) predictions as pseudo-groundtruth, unoccluded 3D objects\nare identified automatically and composited into the background in a clip-art\nstyle, ensuring realistic appearances and physically accurate occlusion\nconfigurations. The resulting clip-art image with pseudo-groundtruth enables\nefficient training of object reconstruction methods that are robust to\nocclusions. Our method demonstrates significant improvements in both 2D and 3D\nreconstruction, particularly in scenarios with heavily occluded objects like\nvehicles and people in urban scenes.\n","authors":["Khiem Vuong","N. Dinesh Reddy","Robert Tamburo","Srinivasa G. Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2403.19022v1.pdf","comment":"To appear in CVPR 2024"},{"id":"http://arxiv.org/abs/2309.13863v2","updated":"2024-03-27T21:15:27Z","published":"2023-09-25T04:27:06Z","title":"SuPerPM: A Large Deformation-Robust Surgical Perception Framework Based\n on Deep Point Matching Learned from Physical Constrained Simulation Data","summary":" Manipulation of tissue with surgical tools often results in large\ndeformations that current methods in tracking and reconstructing algorithms\nhave not effectively addressed. A major source of tracking errors during large\ndeformations stems from wrong data association between observed sensor\nmeasurements with previously tracked scene. To mitigate this issue, we present\na surgical perception framework, SuPerPM, that leverages learning-based\nnon-rigid point cloud matching for data association, thus accommodating larger\ndeformations. 
The learning models typically require training data with ground\ntruth point cloud correspondences, which is challenging or even impractical to\ncollect in surgical environments. Thus, for tuning the learning model, we\ngather endoscopic data of soft tissue being manipulated by a surgical robot and\nthen establish correspondences between point clouds at different time points to\nserve as ground truth. This was achieved by employing a position-based dynamics\n(PBD) simulation to ensure that the correspondences adhered to physical\nconstraints. The proposed framework is demonstrated on several challenging\nsurgical datasets that are characterized by large deformations, achieving\nsuperior performance over state-of-the-art surgical scene tracking algorithms.\n","authors":["Shan Lin","Albert J. Miao","Ali Alabiad","Fei Liu","Kaiyuan Wang","Jingpei Lu","Florian Richter","Michael C. Yip"],"pdf_url":"https://arxiv.org/pdf/2309.13863v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19002v1","updated":"2024-03-27T20:52:30Z","published":"2024-03-27T20:52:30Z","title":"Robust Active Speaker Detection in Noisy Environments","summary":" This paper addresses the issue of active speaker detection (ASD) in noisy\nenvironments and formulates a robust active speaker detection (rASD) problem.\nExisting ASD approaches leverage both audio and visual modalities, but\nnon-speech sounds in the surrounding environment can negatively impact\nperformance. To overcome this, we propose a novel framework that utilizes\naudio-visual speech separation as guidance to learn noise-free audio features.\nThese features are then utilized in an ASD model, and both tasks are jointly\noptimized in an end-to-end framework. Our proposed framework mitigates residual\nnoise and audio quality reduction issues that can occur in a naive cascaded\ntwo-stage framework that directly uses separated speech for ASD, and enables\nthe two tasks to be optimized simultaneously. To further enhance the robustness\nof the audio features and handle inherent speech noises, we propose a dynamic\nweighted loss approach to train the speech separator. We also collected a\nreal-world noise audio dataset to facilitate investigations. Experiments\ndemonstrate that non-speech audio noises significantly impact ASD models, and\nour proposed approach improves ASD performance in noisy environments. The\nframework is general and can be applied to different ASD approaches to improve\ntheir robustness. Our code, models, and data will be released.\n","authors":["Siva Sai Nagender Vasireddy","Chenxu Zhang","Xiaohu Guo","Yapeng Tian"],"pdf_url":"https://arxiv.org/pdf/2403.19002v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.19001v1","updated":"2024-03-27T20:51:02Z","published":"2024-03-27T20:51:02Z","title":"Cross--domain Fiber Cluster Shape Analysis for Language Performance\n Cognitive Score Prediction","summary":" Shape plays an important role in computer graphics, offering informative\nfeatures to convey an object's morphology and functionality. Shape analysis in\nbrain imaging can help interpret structural and functionality correlations of\nthe human brain. In this work, we investigate the shape of the brain's 3D white\nmatter connections and its potential predictive relationship to human cognitive\nfunction. We reconstruct brain connections as sequences of 3D points using\ndiffusion magnetic resonance imaging (dMRI) tractography. 
To describe each\nconnection, we extract 12 shape descriptors in addition to traditional dMRI\nconnectivity and tissue microstructure features. We introduce a novel\nframework, Shape--fused Fiber Cluster Transformer (SFFormer), that leverages a\nmulti-head cross-attention feature fusion module to predict subject-specific\nlanguage performance based on dMRI tractography. We assess the performance of\nthe method on a large dataset including 1065 healthy young adults. The results\ndemonstrate that both the transformer-based SFFormer model and its inter/intra\nfeature fusion with shape, microstructure, and connectivity are informative,\nand together, they improve the prediction of subject-specific language\nperformance scores. Overall, our results indicate that the shape of the brain's\nconnections is predictive of human language function.\n","authors":["Yui Lo","Yuqian Chen","Dongnan Liu","Wan Liu","Leo Zekelman","Fan Zhang","Yogesh Rathi","Nikos Makris","Alexandra J. Golby","Weidong Cai","Lauren J. O'Donnell"],"pdf_url":"https://arxiv.org/pdf/2403.19001v1.pdf","comment":"2 figures, 11 pages"},{"id":"http://arxiv.org/abs/2310.14344v2","updated":"2024-03-27T20:48:37Z","published":"2023-10-22T16:31:01Z","title":"What's in a Prior? Learned Proximal Networks for Inverse Problems","summary":" Proximal operators are ubiquitous in inverse problems, commonly appearing as\npart of algorithmic strategies to regularize problems that are otherwise\nill-posed. Modern deep learning models have been brought to bear for these\ntasks too, as in the framework of plug-and-play or deep unrolling, where they\nloosely resemble proximal operators. Yet, something essential is lost in\nemploying these purely data-driven approaches: there is no guarantee that a\ngeneral deep network represents the proximal operator of any function, nor is\nthere any characterization of the function for which the network might provide\nsome approximate proximal. This not only makes guaranteeing convergence of\niterative schemes challenging but, more fundamentally, complicates the analysis\nof what has been learned by these networks about their training data. Herein we\nprovide a framework to develop learned proximal networks (LPN), prove that they\nprovide exact proximal operators for a data-driven nonconvex regularizer, and\nshow how a new training strategy, dubbed proximal matching, provably promotes\nthe recovery of the log-prior of the true data distribution. Such LPN provide\ngeneral, unsupervised, expressive proximal operators that can be used for\ngeneral inverse problems with convergence guarantees. We illustrate our results\nin a series of cases of increasing complexity, demonstrating that these models\nnot only result in state-of-the-art performance, but provide a window into the\nresulting priors learned from data.\n","authors":["Zhenghan Fang","Sam Buchanan","Jeremias Sulam"],"pdf_url":"https://arxiv.org/pdf/2310.14344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18996v1","updated":"2024-03-27T20:30:01Z","published":"2024-03-27T20:30:01Z","title":"Envisioning MedCLIP: A Deep Dive into Explainability for Medical\n Vision-Language Models","summary":" Explaining Deep Learning models is becoming increasingly important in the\nface of daily emerging multimodal models, particularly in safety-critical\ndomains like medical imaging. However, the lack of detailed investigations into\nthe performance of explainability methods on these models is widening the gap\nbetween their development and safe deployment. 
In this work, we analyze the\nperformance of various explainable AI methods on a vision-language model,\nMedCLIP, to demystify its inner workings. We also provide a simple methodology\nto overcome the shortcomings of these methods. Our work offers a different new\nperspective on the explainability of a recent well-known VLM in the medical\ndomain and our assessment method is generalizable to other current and possible\nfuture VLMs.\n","authors":["Anees Ur Rehman Hashmi","Dwarikanath Mahapatra","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.18996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11505v2","updated":"2024-03-27T20:10:05Z","published":"2024-03-18T06:20:49Z","title":"COVID-19 detection from pulmonary CT scans using a novel EfficientNet\n with attention mechanism","summary":" Manual analysis and diagnosis of COVID-19 through the examination of Computed\nTomography (CT) images of the lungs can be time-consuming and result in errors,\nespecially given high volume of patients and numerous images per patient. So,\nwe address the need for automation of this task by developing a new deep\nlearning model-based pipeline. Our motivation was sparked by the CVPR Workshop\non \"Domain Adaptation, Explainability and Fairness in AI for Medical Image\nAnalysis\", more specifically, the \"COVID-19 Diagnosis Competition (DEF-AI-MIA\nCOV19D)\" under the same Workshop. This challenge provides an opportunity to\nassess our proposed pipeline for COVID-19 detection from CT scan images. The\nsame pipeline incorporates the original EfficientNet, but with an added\nAttention Mechanism: EfficientNet-AM. Also, unlike the traditional/past\npipelines, which relied on a pre-processing step, our pipeline takes the raw\nselected input images without any such step, except for an image-selection step\nto simply reduce the number of CT images required for training and/or testing.\nMoreover, our pipeline is computationally efficient, as, for example, it does\nnot incorporate a decoder for segmenting the lungs. It also does not combine\ndifferent backbones nor combine RNN with a backbone, as other pipelines in the\npast did. Nevertheless, our pipeline still outperforms all approaches presented\nby other teams in last year's instance of the same challenge, at least based on\nthe validation subset of the competition dataset.\n","authors":["Ramy Farag","Parth Upadhyay","Yixiang Gao","Jacket Demby","Katherin Garces Montoya","Seyed Mohamad Ali Tousi","Gbenga Omotara","Guilherme DeSouza"],"pdf_url":"https://arxiv.org/pdf/2403.11505v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18985v1","updated":"2024-03-27T20:07:39Z","published":"2024-03-27T20:07:39Z","title":"Robustness and Visual Explanation for Black Box Image, Video, and ECG\n Signal Classification with Reinforcement Learning","summary":" We present a generic Reinforcement Learning (RL) framework optimized for\ncrafting adversarial attacks on different model types spanning from ECG signal\nanalysis (1D), image classification (2D), and video classification (3D). The\nframework focuses on identifying sensitive regions and inducing\nmisclassifications with minimal distortions and various distortion types. The\nnovel RL method outperforms state-of-the-art methods for all three\napplications, proving its efficiency. Our RL approach produces superior\nlocalization masks, enhancing interpretability for image classification and ECG\nanalysis models. 
For applications such as ECG analysis, our platform highlights\ncritical ECG segments for clinicians while ensuring resilience against\nprevalent distortions. This comprehensive tool aims to bolster both resilience\nwith adversarial training and transparency across varied applications and data\ntypes.\n","authors":["Soumyendu Sarkar","Ashwin Ramesh Babu","Sajad Mousavi","Vineet Gundecha","Avisek Naug","Sahand Ghorbanpour"],"pdf_url":"https://arxiv.org/pdf/2403.18985v1.pdf","comment":"AAAI Proceedings reference:\n https://ojs.aaai.org/index.php/AAAI/article/view/30579"},{"id":"http://arxiv.org/abs/1903.06811v3","updated":"2024-03-27T20:03:41Z","published":"2019-03-15T21:35:13Z","title":"Multi-camera calibration with pattern rigs, including for\n non-overlapping cameras: CALICO","summary":" This paper describes CALICO, a method for multi-camera calibration suitable\nfor challenging contexts: stationary and mobile multi-camera systems, cameras\nwithout overlapping fields of view, and non-synchronized cameras. Recent\napproaches are roughly divided into infrastructure- and pattern-based.\nInfrastructure-based approaches use the scene's features to calibrate, while\npattern-based approaches use calibration patterns. Infrastructure-based\napproaches are not suitable for stationary camera systems, and pattern-based\napproaches may constrain camera placement because shared fields of view or\nextremely large patterns are required.\n CALICO is a pattern-based approach, where the multi-calibration problem is\nformulated using rigidity constraints between patterns and cameras. We use a\n{\\it pattern rig}: several patterns rigidly attached to each other or some\nstructure. We express the calibration problem as that of algebraic and\nreprojection error minimization problems. Simulated and real experiments\ndemonstrate the method in a variety of settings. CALICO compared favorably to\nKalibr. Mean reconstruction accuracy error was $\\le 0.71$ mm for real camera\nrigs, and $\\le 1.11$ for simulated camera rigs. Code and data releases are\navailable at \\cite{tabb_amy_2019_3520866} and\n\\url{https://github.com/amy-tabb/calico}.\n","authors":["Amy Tabb","Henry Medeiros","Mitchell J. Feldmann","Thiago T. Santos"],"pdf_url":"https://arxiv.org/pdf/1903.06811v3.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2403.18978v1","updated":"2024-03-27T19:52:55Z","published":"2024-03-27T19:52:55Z","title":"TextCraftor: Your Text Encoder Can be Image Quality Controller","summary":" Diffusion-based text-to-image generative models, e.g., Stable Diffusion, have\nrevolutionized the field of content generation, enabling significant\nadvancements in areas like image editing and video synthesis. Despite their\nformidable capabilities, these models are not without their limitations. It is\nstill challenging to synthesize an image that aligns well with the input text,\nand multiple runs with carefully crafted prompts are required to achieve\nsatisfactory results. To mitigate these limitations, numerous studies have\nendeavored to fine-tune the pre-trained diffusion models, i.e., UNet, utilizing\nvarious technologies. Yet, amidst these efforts, a pivotal question of\ntext-to-image diffusion model training has remained largely unexplored: Is it\npossible and feasible to fine-tune the text encoder to improve the performance\nof text-to-image diffusion models? 
Our findings reveal that, instead of\nreplacing the CLIP text encoder used in Stable Diffusion with other large\nlanguage models, we can enhance it through our proposed fine-tuning approach,\nTextCraftor, leading to substantial improvements in quantitative benchmarks and\nhuman assessments. Interestingly, our technique also empowers controllable\nimage generation through the interpolation of different text encoders\nfine-tuned with various rewards. We also demonstrate that TextCraftor is\northogonal to UNet finetuning, and can be combined to further improve\ngenerative quality.\n","authors":["Yanyu Li","Xian Liu","Anil Kag","Ju Hu","Yerlan Idelbayev","Dhritiman Sagar","Yanzhi Wang","Sergey Tulyakov","Jian Ren"],"pdf_url":"https://arxiv.org/pdf/2403.18978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05995v2","updated":"2024-03-27T18:21:12Z","published":"2023-12-10T20:57:31Z","title":"From Correspondences to Pose: Non-minimal Certifiably Optimal Relative\n Pose without Disambiguation","summary":" Estimating the relative camera pose from $n \\geq 5$ correspondences between\ntwo calibrated views is a fundamental task in computer vision. This process\ntypically involves two stages: 1) estimating the essential matrix between the\nviews, and 2) disambiguating among the four candidate relative poses that\nsatisfy the epipolar geometry. In this paper, we demonstrate a novel approach\nthat, for the first time, bypasses the second stage. Specifically, we show that\nit is possible to directly estimate the correct relative camera pose from\ncorrespondences without needing a post-processing step to enforce the\ncheirality constraint on the correspondences. Building on recent advances in\ncertifiable non-minimal optimization, we frame the relative pose estimation as\na Quadratically Constrained Quadratic Program (QCQP). By applying the\nappropriate constraints, we ensure the estimation of a camera pose that\ncorresponds to a valid 3D geometry and that is globally optimal when certified.\nWe validate our method through exhaustive synthetic and real-world experiments,\nconfirming the efficacy, efficiency and accuracy of the proposed approach. Code\nis available at https://github.com/javrtg/C2P.\n","authors":["Javier Tirado-Garín","Javier Civera"],"pdf_url":"https://arxiv.org/pdf/2312.05995v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2307.04132v3","updated":"2024-03-27T18:17:46Z","published":"2023-07-09T09:04:26Z","title":"Reasoning over the Behaviour of Objects in Video-Clips for Adverb-Type\n Recognition","summary":" In this work, following the intuition that adverbs describing scene-sequences\nare best identified by reasoning over high-level concepts of object-behavior,\nwe propose the design of a new framework that reasons over object-behaviours\nextracted from raw-video-clips to recognize the clip's corresponding\nadverb-types. Importantly, while previous works for general scene\nadverb-recognition assume knowledge of the clips underlying action-types, our\nmethod is directly applicable in the more general problem setting where the\naction-type of a video-clip is unknown. Specifically, we propose a novel\npipeline that extracts human-interpretable object-behaviour-facts from raw\nvideo clips and propose novel symbolic and transformer based reasoning methods\nthat operate over these extracted facts to identify adverb-types. Experiment\nresults demonstrate that our proposed methods perform favourably against the\nprevious state-of-the-art. 
Additionally, to support efforts in symbolic\nvideo-processing, we release two new datasets of object-behaviour-facts\nextracted from raw video clips - the MSR-VTT-ASP and ActivityNet-ASP datasets.\n","authors":["Amrit Diggavi Seshadri","Alessandra Russo"],"pdf_url":"https://arxiv.org/pdf/2307.04132v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18922v1","updated":"2024-03-27T18:13:16Z","published":"2024-03-27T18:13:16Z","title":"Lift3D: Zero-Shot Lifting of Any 2D Vision Model to 3D","summary":" In recent years, there has been an explosion of 2D vision models for numerous\ntasks such as semantic segmentation, style transfer or scene editing, enabled\nby large-scale 2D image datasets. At the same time, there has been renewed\ninterest in 3D scene representations such as neural radiance fields from\nmulti-view images. However, the availability of 3D or multiview data is still\nsubstantially limited compared to 2D image datasets, making extending 2D vision\nmodels to 3D data highly desirable but also very challenging. Indeed, extending\na single 2D vision operator like scene editing to 3D typically requires a\nhighly creative method specialized to that task and often requires per-scene\noptimization. In this paper, we ask the question of whether any 2D vision model\ncan be lifted to make 3D consistent predictions. We answer this question in the\naffirmative; our new Lift3D method trains to predict unseen views on feature\nspaces generated by a few visual models (i.e. DINO and CLIP), but then\ngeneralizes to novel vision operators and tasks, such as style transfer,\nsuper-resolution, open vocabulary segmentation and image colorization; for some\nof these tasks, there is no comparable previous 3D method. In many cases, we\neven outperform state-of-the-art methods specialized for the task in question.\nMoreover, Lift3D is a zero-shot method, in the sense that it requires no\ntask-specific training, nor scene-specific optimization.\n","authors":["Mukund Varma T","Peihao Wang","Zhiwen Fan","Zhangyang Wang","Hao Su","Ravi Ramamoorthi"],"pdf_url":"https://arxiv.org/pdf/2403.18922v1.pdf","comment":"Computer Vision and Pattern Recognition Conference (CVPR), 2024"},{"id":"http://arxiv.org/abs/2403.18921v1","updated":"2024-03-27T18:12:24Z","published":"2024-03-27T18:12:24Z","title":"SMOF: Streaming Modern CNNs on FPGAs with Smart Off-Chip Eviction","summary":" Convolutional Neural Networks (CNNs) have demonstrated their effectiveness in\nnumerous vision tasks. However, their high processing requirements necessitate\nefficient hardware acceleration to meet the application's performance targets.\nIn the space of FPGAs, streaming-based dataflow architectures are often adopted\nby users, as significant performance gains can be achieved through layer-wise\npipelining and reduced off-chip memory access by retaining data on-chip.\nHowever, modern topologies, such as the UNet, YOLO, and X3D models, utilise\nlong skip connections, requiring significant on-chip storage and thus limiting\nthe performance achieved by such system architectures. The paper addresses the\nabove limitation by introducing weight and activation eviction mechanisms to\noff-chip memory along the computational pipeline, taking into account the\navailable compute and memory resources. The proposed mechanism is incorporated\ninto an existing toolflow, expanding the design space by utilising off-chip\nmemory as a buffer. 
This enables the mapping of such modern CNNs to devices\nwith limited on-chip memory, under the streaming architecture design approach.\nSMOF has demonstrated the capacity to deliver competitive and, in some cases,\nstate-of-the-art performance across a spectrum of computer vision tasks,\nachieving up to 10.65 X throughput improvement compared to previous works.\n","authors":["Petros Toupas","Zhewen Yu","Christos-Savvas Bouganis","Dimitrios Tzovaras"],"pdf_url":"https://arxiv.org/pdf/2403.18921v1.pdf","comment":"12 pages, 8 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.18920v1","updated":"2024-03-27T18:09:55Z","published":"2024-03-27T18:09:55Z","title":"CPR: Retrieval Augmented Generation for Copyright Protection","summary":" Retrieval Augmented Generation (RAG) is emerging as a flexible and robust\ntechnique to adapt models to private users data without training, to handle\ncredit attribution, and to allow efficient machine unlearning at scale.\nHowever, RAG techniques for image generation may lead to parts of the retrieved\nsamples being copied in the model's output. To reduce risks of leaking private\ninformation contained in the retrieved set, we introduce Copy-Protected\ngeneration with Retrieval (CPR), a new method for RAG with strong copyright\nprotection guarantees in a mixed-private setting for diffusion models.CPR\nallows to condition the output of diffusion models on a set of retrieved\nimages, while also guaranteeing that unique identifiable information about\nthose example is not exposed in the generated outputs. In particular, it does\nso by sampling from a mixture of public (safe) distribution and private (user)\ndistribution by merging their diffusion scores at inference. We prove that CPR\nsatisfies Near Access Freeness (NAF) which bounds the amount of information an\nattacker may be able to extract from the generated images. We provide two\nalgorithms for copyright protection, CPR-KL and CPR-Choose. Unlike previously\nproposed rejection-sampling-based NAF methods, our methods enable efficient\ncopyright-protected sampling with a single run of backward diffusion. We show\nthat our method can be applied to any pre-trained conditional diffusion model,\nsuch as Stable Diffusion or unCLIP. In particular, we empirically show that\napplying CPR on top of unCLIP improves quality and text-to-image alignment of\nthe generated results (81.4 to 83.17 on TIFA benchmark), while enabling credit\nattribution, copy-right protection, and deterministic, constant time,\nunlearning.\n","authors":["Aditya Golatkar","Alessandro Achille","Luca Zancato","Yu-Xiang Wang","Ashwin Swaminathan","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2403.18920v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18915v1","updated":"2024-03-27T18:08:14Z","published":"2024-03-27T18:08:14Z","title":"PLOT-TAL -- Prompt Learning with Optimal Transport for Few-Shot Temporal\n Action Localization","summary":" This paper introduces a novel approach to temporal action localization (TAL)\nin few-shot learning. 
Our work addresses the inherent limitations of\nconventional single-prompt learning methods that often lead to overfitting due\nto the inability to generalize across varying contexts in real-world videos.\nRecognizing the diversity of camera views, backgrounds, and objects in videos,\nwe propose a multi-prompt learning framework enhanced with optimal transport.\nThis design allows the model to learn a set of diverse prompts for each action,\ncapturing general characteristics more effectively and distributing the\nrepresentation to mitigate the risk of overfitting. Furthermore, by employing\noptimal transport theory, we efficiently align these prompts with action\nfeatures, optimizing for a comprehensive representation that adapts to the\nmultifaceted nature of video data. Our experiments demonstrate significant\nimprovements in action localization accuracy and robustness in few-shot\nsettings on the standard challenging datasets of THUMOS-14 and EpicKitchens100,\nhighlighting the efficacy of our multi-prompt optimal transport approach in\novercoming the challenges of conventional few-shot TAL methods.\n","authors":["Edward Fish","Jon Weinbren","Andrew Gilbert"],"pdf_url":"https://arxiv.org/pdf/2403.18915v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2403.18913v1","updated":"2024-03-27T18:06:31Z","published":"2024-03-27T18:06:31Z","title":"UniDepth: Universal Monocular Metric Depth Estimation","summary":" Accurate monocular metric depth estimation (MMDE) is crucial to solving\ndownstream tasks in 3D perception and modeling. However, the remarkable\naccuracy of recent MMDE methods is confined to their training domains. These\nmethods fail to generalize to unseen domains even in the presence of moderate\ndomain gaps, which hinders their practical applicability. We propose a new\nmodel, UniDepth, capable of reconstructing metric 3D scenes from solely single\nimages across domains. Departing from the existing MMDE methods, UniDepth\ndirectly predicts metric 3D points from the input image at inference time\nwithout any additional information, striving for a universal and flexible MMDE\nsolution. In particular, UniDepth implements a self-promptable camera module\npredicting dense camera representation to condition depth features. Our model\nexploits a pseudo-spherical output representation, which disentangles camera\nand depth representations. In addition, we propose a geometric invariance loss\nthat promotes the invariance of camera-prompted depth features. Thorough\nevaluations on ten datasets in a zero-shot regime consistently demonstrate the\nsuperior performance of UniDepth, even when compared with methods directly\ntrained on the testing domains. Code and models are available at:\nhttps://github.com/lpiccinelli-eth/unidepth\n","authors":["Luigi Piccinelli","Yung-Hsu Yang","Christos Sakaridis","Mattia Segu","Siyuan Li","Luc Van Gool","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2403.18913v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18910v1","updated":"2024-03-27T18:02:49Z","published":"2024-03-27T18:02:49Z","title":"A Geometric Explanation of the Likelihood OOD Detection Paradox","summary":" Likelihood-based deep generative models (DGMs) commonly exhibit a puzzling\nbehaviour: when trained on a relatively complex dataset, they assign higher\nlikelihood values to out-of-distribution (OOD) data from simpler sources.\nAdding to the mystery, OOD samples are never generated by these DGMs despite\nhaving higher likelihoods. 
This two-pronged paradox has yet to be conclusively\nexplained, making likelihood-based OOD detection unreliable. Our primary\nobservation is that high-likelihood regions will not be generated if they\ncontain minimal probability mass. We demonstrate how this seeming contradiction\nof large densities yet low probability mass can occur around data confined to\nlow-dimensional manifolds. We also show that this scenario can be identified\nthrough local intrinsic dimension (LID) estimation, and propose a method for\nOOD detection which pairs the likelihoods and LID estimates obtained from a\npre-trained DGM. Our method can be applied to normalizing flows and score-based\ndiffusion models, and obtains results which match or surpass state-of-the-art\nOOD detection benchmarks using the same DGM backbones. Our code is available at\nhttps://github.com/layer6ai-labs/dgm_ood_detection.\n","authors":["Hamidreza Kamkari","Brendan Leigh Ross","Jesse C. Cresswell","Anthony L. Caterini","Rahul G. Krishnan","Gabriel Loaiza-Ganem"],"pdf_url":"https://arxiv.org/pdf/2403.18910v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18908v1","updated":"2024-03-27T18:02:23Z","published":"2024-03-27T18:02:23Z","title":"Enhancing Multiple Object Tracking Accuracy via Quantum Annealing","summary":" Multiple object tracking (MOT), a key task in image recognition, presents a\npersistent challenge in balancing processing speed and tracking accuracy. This\nstudy introduces a novel approach that leverages quantum annealing (QA) to\nexpedite computation speed, while enhancing tracking accuracy through the\nensembling of object tracking processes. A method to improve the matching\nintegration process is also proposed. By utilizing the sequential nature of\nMOT, this study further augments the tracking method via reverse annealing\n(RA). Experimental validation confirms the maintenance of high accuracy with an\nannealing time of a mere 3 $\\mu$s per tracking process. The proposed method\nholds significant potential for real-time MOT applications, including traffic\nflow measurement for urban traffic light control, collision prediction for\nautonomous robots and vehicles, and management of products mass-produced in\nfactories.\n","authors":["Yasuyuki Ihara"],"pdf_url":"https://arxiv.org/pdf/2403.18908v1.pdf","comment":"19pages, 15 figures"},{"id":"http://arxiv.org/abs/2403.18886v1","updated":"2024-03-27T17:59:21Z","published":"2024-03-27T17:59:21Z","title":"Self-Expansion of Pre-trained Models with Mixture of Adapters for\n Continual Learning","summary":" Continual learning aims to learn from a stream of continuously arriving data\nwith minimum forgetting of previously learned knowledge. While previous works\nhave explored the effectiveness of leveraging the generalizable knowledge from\npre-trained models in continual learning, existing parameter-efficient\nfine-tuning approaches focus on the use of a predetermined or task-wise set of\nadapters or prompts. However, these approaches still suffer from forgetting due\nto task interference on jointly used parameters or restricted flexibility. The\nreliance on a static model architecture may lead to the allocation of excessive\nparameters that are not essential or, conversely, inadequate adaptation for\ndownstream tasks, given that the scale and distribution of incoming data are\nunpredictable in continual learning. 
We propose Self-Expansion of pre-trained\nmodels with Modularized Adaptation (SEMA), a novel fine-tuning approach which\nautomatically decides to reuse or add adapter modules on demand in continual\nlearning, depending on whether drastic distribution shift that could not be\nhandled by existing modules is detected at different representation levels. We\ndesign each adapter module to consist of an adapter and a representation\ndescriptor, specifically, implemented as an autoencoder. The representation\ndescriptor functions as a distributional shift indicator during training and\ntriggers adapter expansion. For better usage of the adapters, an expandable\nweighting router is learned jointly for mixture of adapter outputs. By\ncomparing with vision-transformer-based continual learning adaptation methods,\nwe demonstrate that the proposed framework outperforms the state-of-the-art\nwithout memory rehearsal.\n","authors":["Huiyi Wang","Haodong Lu","Lina Yao","Dong Gong"],"pdf_url":"https://arxiv.org/pdf/2403.18886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18878v1","updated":"2024-03-27T10:46:24Z","published":"2024-03-27T10:46:24Z","title":"AIC-UNet: Anatomy-informed Cascaded UNet for Robust Multi-Organ\n Segmentation","summary":" Imposing key anatomical features, such as the number of organs, their shapes,\nsizes, and relative positions, is crucial for building a robust multi-organ\nsegmentation model. Current attempts to incorporate anatomical features include\nbroadening effective receptive fields (ERF) size with resource- and\ndata-intensive modules such as self-attention or introducing organ-specific\ntopology regularizers, which may not scale to multi-organ segmentation problems\nwhere inter-organ relation also plays a huge role. We introduce a new approach\nto impose anatomical constraints on any existing encoder-decoder segmentation\nmodel by conditioning model prediction with learnable anatomy prior. More\nspecifically, given an abdominal scan, a part of the encoder spatially warps a\nlearnable prior to align with the given input scan using thin plate spline\n(TPS) grid interpolation. The warped prior is then integrated during the\ndecoding phase to guide the model for more anatomy-informed predictions. Code\nis available at\n\\hyperlink{https://anonymous.4open.science/r/AIC-UNet-7048}{https://anonymous.4open.science/r/AIC-UNet-7048}.\n","authors":["Young Seok Jeon","Hongfei Yang","Huazhu Fu","Mengling Feng"],"pdf_url":"https://arxiv.org/pdf/2403.18878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16456v2","updated":"2024-03-27T04:14:59Z","published":"2024-01-29T09:12:23Z","title":"SHViT: Single-Head Vision Transformer with Memory Efficient Macro Design","summary":" Recently, efficient Vision Transformers have shown great performance with low\nlatency on resource-constrained devices. Conventionally, they use 4x4 patch\nembeddings and a 4-stage structure at the macro level, while utilizing\nsophisticated attention with multi-head configuration at the micro level. This\npaper aims to address computational redundancy at all design levels in a\nmemory-efficient manner. We discover that using larger-stride patchify stem not\nonly reduces memory access costs but also achieves competitive performance by\nleveraging token representations with reduced spatial redundancy from the early\nstages. 
Furthermore, our preliminary analyses suggest that attention layers in\nthe early stages can be substituted with convolutions, and several attention\nheads in the latter stages are computationally redundant. To handle this, we\nintroduce a single-head attention module that inherently prevents head\nredundancy and simultaneously boosts accuracy by parallelly combining global\nand local information. Building upon our solutions, we introduce SHViT, a\nSingle-Head Vision Transformer that obtains the state-of-the-art speed-accuracy\ntradeoff. For example, on ImageNet-1k, our SHViT-S4 is 3.3x, 8.1x, and 2.4x\nfaster than MobileViTv2 x1.0 on GPU, CPU, and iPhone12 mobile device,\nrespectively, while being 1.3% more accurate. For object detection and instance\nsegmentation on MS COCO using Mask-RCNN head, our model achieves performance\ncomparable to FastViT-SA12 while exhibiting 3.8x and 2.0x lower backbone\nlatency on GPU and mobile device, respectively.\n","authors":["Seokju Yun","Youngmin Ro"],"pdf_url":"https://arxiv.org/pdf/2401.16456v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19716v1","updated":"2024-03-27T17:41:16Z","published":"2024-03-27T17:41:16Z","title":"Capability-aware Prompt Reformulation Learning for Text-to-Image\n Generation","summary":" Text-to-image generation systems have emerged as revolutionary tools in the\nrealm of artistic creation, offering unprecedented ease in transforming textual\nprompts into visual art. However, the efficacy of these systems is intricately\nlinked to the quality of user-provided prompts, which often poses a challenge\nto users unfamiliar with prompt crafting. This paper addresses this challenge\nby leveraging user reformulation data from interaction logs to develop an\nautomatic prompt reformulation model. Our in-depth analysis of these logs\nreveals that user prompt reformulation is heavily dependent on the individual\nuser's capability, resulting in significant variance in the quality of\nreformulation pairs. To effectively use this data for training, we introduce\nthe Capability-aware Prompt Reformulation (CAPR) framework. CAPR innovatively\nintegrates user capability into the reformulation process through two key\ncomponents: the Conditional Reformulation Model (CRM) and Configurable\nCapability Features (CCF). CRM reformulates prompts according to a specified\nuser capability, as represented by CCF. The CCF, in turn, offers the\nflexibility to tune and guide the CRM's behavior. This enables CAPR to\neffectively learn diverse reformulation strategies across various user\ncapacities and to simulate high-capability user reformulation during inference.\nExtensive experiments on standard text-to-image generation benchmarks showcase\nCAPR's superior performance over existing baselines and its remarkable\nrobustness on unseen systems. Furthermore, comprehensive analyses validate the\neffectiveness of different components. 
CAPR can facilitate user-friendly\ninteraction with text-to-image systems and make advanced artistic creation more\nachievable for a broader range of users.\n","authors":["Jingtao Zhan","Qingyao Ai","Yiqun Liu","Jia Chen","Shaoping Ma"],"pdf_url":"https://arxiv.org/pdf/2403.19716v1.pdf","comment":"Accepted at SIGIR 2024"}]},"2024-03-28T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.19655v1","updated":"2024-03-28T17:59:50Z","published":"2024-03-28T17:59:50Z","title":"GaussianCube: Structuring Gaussian Splatting using Optimal Transport for\n 3D Generative Modeling","summary":" 3D Gaussian Splatting (GS) have achieved considerable improvement over Neural\nRadiance Fields in terms of 3D fitting fidelity and rendering speed. However,\nthis unstructured representation with scattered Gaussians poses a significant\nchallenge for generative modeling. To address the problem, we introduce\nGaussianCube, a structured GS representation that is both powerful and\nefficient for generative modeling. We achieve this by first proposing a\nmodified densification-constrained GS fitting algorithm which can yield\nhigh-quality fitting results using a fixed number of free Gaussians, and then\nre-arranging the Gaussians into a predefined voxel grid via Optimal Transport.\nThe structured grid representation allows us to use standard 3D U-Net as our\nbackbone in diffusion generative modeling without elaborate designs. Extensive\nexperiments conducted on ShapeNet and OmniObject3D show that our model achieves\nstate-of-the-art generation results both qualitatively and quantitatively,\nunderscoring the potential of GaussianCube as a powerful and versatile 3D\nrepresentation.\n","authors":["Bowen Zhang","Yiji Cheng","Jiaolong Yang","Chunyu Wang","Feng Zhao","Yansong Tang","Dong Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2403.19655v1.pdf","comment":"Project Page: https://gaussiancube.github.io/"},{"id":"http://arxiv.org/abs/2403.19654v1","updated":"2024-03-28T17:59:49Z","published":"2024-03-28T17:59:49Z","title":"RSMamba: Remote Sensing Image Classification with State Space Model","summary":" Remote sensing image classification forms the foundation of various\nunderstanding tasks, serving a crucial function in remote sensing image\ninterpretation. The recent advancements of Convolutional Neural Networks (CNNs)\nand Transformers have markedly enhanced classification accuracy. Nonetheless,\nremote sensing scene classification remains a significant challenge, especially\ngiven the complexity and diversity of remote sensing scenarios and the\nvariability of spatiotemporal resolutions. The capacity for whole-image\nunderstanding can provide more precise semantic cues for scene discrimination.\nIn this paper, we introduce RSMamba, a novel architecture for remote sensing\nimage classification. RSMamba is based on the State Space Model (SSM) and\nincorporates an efficient, hardware-aware design known as the Mamba. It\nintegrates the advantages of both a global receptive field and linear modeling\ncomplexity. To overcome the limitation of the vanilla Mamba, which can only\nmodel causal sequences and is not adaptable to two-dimensional image data, we\npropose a dynamic multi-path activation mechanism to augment Mamba's capacity\nto model non-causal data. Notably, RSMamba maintains the inherent modeling\nmechanism of the vanilla Mamba, yet exhibits superior performance across\nmultiple remote sensing image classification datasets. 
This indicates that\nRSMamba holds significant potential to function as the backbone of future\nvisual foundation models. The code will be available at\n\url{https://github.com/KyanChen/RSMamba}.\n","authors":["Keyan Chen","Bowen Chen","Chenyang Liu","Wenyuan Li","Zhengxia Zou","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2403.19654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19653v1","updated":"2024-03-28T17:59:42Z","published":"2024-03-28T17:59:42Z","title":"Detecting Image Attribution for Text-to-Image Diffusion Models in RGB\n and Beyond","summary":" Modern text-to-image (T2I) diffusion models can generate images with\nremarkable realism and creativity. These advancements have sparked research in\nfake image detection and attribution, yet prior studies have not fully explored\nthe practical and scientific dimensions of this task. In addition to\nattributing images to 12 state-of-the-art T2I generators, we provide extensive\nanalyses on what inference stage hyperparameters and image modifications are\ndiscernible. Our experiments reveal that initialization seeds are highly\ndetectable, along with other subtle variations in the image generation process\nto some extent. We further investigate what visual traces are leveraged in\nimage attribution by perturbing high-frequency details and employing mid-level\nrepresentations of image style and structure. Notably, altering high-frequency\ninformation causes only slight reductions in accuracy, and training an\nattributor on style representations outperforms training on RGB images. Our\nanalyses underscore that fake images are detectable and attributable at more\nlevels of visual granularity than previously explored.\n","authors":["Katherine Xu","Lingzhi Zhang","Jianbo Shi"],"pdf_url":"https://arxiv.org/pdf/2403.19653v1.pdf","comment":"Code available at https://github.com/k8xu/ImageAttribution"},{"id":"http://arxiv.org/abs/2403.19652v1","updated":"2024-03-28T17:59:30Z","published":"2024-03-28T17:59:30Z","title":"InterDreamer: Zero-Shot Text to 3D Dynamic Human-Object Interaction","summary":" Text-conditioned human motion generation has experienced significant\nadvancements with diffusion models trained on extensive motion capture data and\ncorresponding textual annotations. However, extending such success to 3D\ndynamic human-object interaction (HOI) generation faces notable challenges,\nprimarily due to the lack of large-scale interaction data and comprehensive\ndescriptions that align with these interactions. This paper takes the\ninitiative and showcases the potential of generating human-object interactions\nwithout direct training on text-interaction pair data. Our key insight in\nachieving this is that interaction semantics and dynamics can be decoupled.\nBeing unable to learn interaction semantics through supervised training, we\ninstead leverage pre-trained large models, synergizing knowledge from a large\nlanguage model and a text-to-motion model. While such knowledge offers\nhigh-level control over interaction semantics, it cannot grasp the intricacies\nof low-level interaction dynamics. To overcome this issue, we further introduce\na world model designed to comprehend simple physics, modeling how human actions\ninfluence object motion. By integrating these components, our novel framework,\nInterDreamer, is able to generate text-aligned 3D HOI sequences in a zero-shot\nmanner. 
We apply InterDreamer to the BEHAVE and CHAIRS datasets, and our\ncomprehensive experimental analysis demonstrates its capability to generate\nrealistic and coherent interaction sequences that seamlessly align with the\ntext directives.\n","authors":["Sirui Xu","Ziyin Wang","Yu-Xiong Wang","Liang-Yan Gui"],"pdf_url":"https://arxiv.org/pdf/2403.19652v1.pdf","comment":"Project Page: https://sirui-xu.github.io/InterDreamer/"},{"id":"http://arxiv.org/abs/2403.19651v1","updated":"2024-03-28T17:59:20Z","published":"2024-03-28T17:59:20Z","title":"MagicLens: Self-Supervised Image Retrieval with Open-Ended Instructions","summary":" Image retrieval, i.e., finding desired images given a reference image,\ninherently encompasses rich, multi-faceted search intents that are difficult to\ncapture solely using image-based measures. Recent work leverages text\ninstructions to allow users to more freely express their search intents.\nHowever, existing work primarily focuses on image pairs that are visually\nsimilar and/or can be characterized by a small set of pre-defined relations.\nThe core thesis of this paper is that text instructions can enable retrieving\nimages with richer relations beyond visual similarity. To show this, we\nintroduce MagicLens, a series of self-supervised image retrieval models that\nsupport open-ended instructions. MagicLens is built on a key novel insight:\nimage pairs that naturally occur on the same web pages contain a wide range of\nimplicit relations (e.g., inside view of), and we can bring those implicit\nrelations explicit by synthesizing instructions via large multimodal models\n(LMMs) and large language models (LLMs). Trained on 36.7M (query image,\ninstruction, target image) triplets with rich semantic relations mined from the\nweb, MagicLens achieves comparable or better results on eight benchmarks of\nvarious image retrieval tasks than prior state-of-the-art (SOTA) methods.\nRemarkably, it outperforms previous SOTA but with a 50X smaller model size on\nmultiple benchmarks. Additional human analyses on a 1.4M-image unseen corpus\nfurther demonstrate the diversity of search intents supported by MagicLens.\n","authors":["Kai Zhang","Yi Luan","Hexiang Hu","Kenton Lee","Siyuan Qiao","Wenhu Chen","Yu Su","Ming-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2403.19651v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2311.14097v3","updated":"2024-03-28T17:59:06Z","published":"2023-11-23T16:49:06Z","title":"ACT-Diffusion: Efficient Adversarial Consistency Training for One-step\n Diffusion Models","summary":" Though diffusion models excel in image generation, their step-by-step\ndenoising leads to slow generation speeds. Consistency training addresses this\nissue with single-step sampling but often produces lower-quality generations\nand requires high training costs. In this paper, we show that optimizing\nconsistency training loss minimizes the Wasserstein distance between target and\ngenerated distributions. As timestep increases, the upper bound accumulates\nprevious consistency training losses. Therefore, larger batch sizes are needed\nto reduce both current and accumulated losses. We propose Adversarial\nConsistency Training (ACT), which directly minimizes the Jensen-Shannon (JS)\ndivergence between distributions at each timestep using a discriminator.\nTheoretically, ACT enhances generation quality, and convergence. 
By\nincorporating a discriminator into the consistency training framework, our\nmethod achieves improved FID scores on CIFAR10 and ImageNet 64$\times$64 and\nLSUN Cat 256$\times$256 datasets, retains zero-shot image inpainting\ncapabilities, and uses less than $1/6$ of the original batch size and fewer\nthan $1/2$ of the model parameters and training steps compared to the baseline\nmethod, leading to a substantial reduction in resource consumption. Our code\nis available at https://github.com/kong13661/ACT\n","authors":["Fei Kong","Jinhao Duan","Lichao Sun","Hao Cheng","Renjing Xu","Hengtao Shen","Xiaofeng Zhu","Xiaoshuang Shi","Kaidi Xu"],"pdf_url":"https://arxiv.org/pdf/2311.14097v3.pdf","comment":"To appear in CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19649v1","updated":"2024-03-28T17:57:27Z","published":"2024-03-28T17:57:27Z","title":"GraspXL: Generating Grasping Motions for Diverse Objects at Scale","summary":" Human hands possess the dexterity to interact with diverse objects such as\ngrasping specific parts of the objects and/or approaching them from desired\ndirections. More importantly, humans can grasp objects of any shape without\nobject-specific skills. Recent works synthesize grasping motions following\nsingle objectives such as a desired approach heading direction or a grasping\narea. Moreover, they usually rely on expensive 3D hand-object data during\ntraining and inference, which limits their capability to synthesize grasping\nmotions for unseen objects at scale. In this paper, we unify the generation of\nhand-object grasping motions across multiple motion objectives, diverse object\nshapes and dexterous hand morphologies in a policy learning framework, GraspXL.\nThe objectives are composed of the graspable area, heading direction during\napproach, wrist rotation, and hand position. Without requiring any 3D\nhand-object interaction data, our policy trained with 58 objects can robustly\nsynthesize diverse grasping motions for more than 500k unseen objects with a\nsuccess rate of 82.2%. At the same time, the policy adheres to objectives,\nwhich enables the generation of diverse grasps per object. Moreover, we show\nthat our framework can be deployed to different dexterous hands and work with\nreconstructed or generated objects. We quantitatively and qualitatively\nevaluate our method to show the efficacy of our approach. Our model and code\nwill be available.\n","authors":["Hui Zhang","Sammy Christen","Zicong Fan","Otmar Hilliges","Jie Song"],"pdf_url":"https://arxiv.org/pdf/2403.19649v1.pdf","comment":"Project Page: https://eth-ait.github.io/graspxl/"},{"id":"http://arxiv.org/abs/2403.19646v1","updated":"2024-03-28T17:55:42Z","published":"2024-03-28T17:55:42Z","title":"Change-Agent: Towards Interactive Comprehensive Change Interpretation\n and Analysis from Change Detection and Change Captioning","summary":" Monitoring changes in the Earth's surface is crucial for understanding\nnatural processes and human impacts, necessitating precise and comprehensive\ninterpretation methodologies. Remote sensing satellite imagery offers a unique\nperspective for monitoring these changes, leading to the emergence of remote\nsensing image change interpretation (RSICI) as a significant research focus.\nCurrent RSICI technology encompasses change detection and change captioning,\neach with its limitations in providing comprehensive interpretation. 
To address\nthis, we propose an interactive Change-Agent which integrates a multi-level\nchange interpretation (MCI) model as eyes and a large language model (LLM) as\nthe brain. Our Change-Agent can follow user instructions to achieve\ncomprehensive change interpretation and insightful analysis, such as change\ndetection and change captioning, change object counting, change cause\nanalysis, etc. Our proposed MCI model contains two\nbranches of pixel-level change detection and semantic-level change captioning,\nin which multiple BI-temporal Iterative Interaction (BI3) layers utilize Local\nPerception Enhancement (LPE) and the Global Difference Fusion Attention (GDFA)\nmodules to enhance the model's discriminative feature representation\ncapabilities. To train the MCI model, we build the LEVIR-MCI dataset with\nchange masks and captions of bi-temporal images. Extensive experiments\ndemonstrate the effectiveness of the proposed change interpretation model and\nhighlight the promising potential of our Change-Agent in facilitating\ncomprehensive and intelligent interpretation of surface changes. We will make\nour dataset and codebase of the change interpretation model and Change-Agent\npublicly available to facilitate future research at\nhttps://github.com/Chen-Yang-Liu/Change-Agent\n","authors":["Chenyang Liu","Keyan Chen","Haotian Zhang","Zipeng Qi","Zhengxia Zou","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2403.19646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01362v3","updated":"2024-03-28T17:55:39Z","published":"2023-07-03T21:33:40Z","title":"Direct Superpoints Matching for Robust Point Cloud Registration","summary":" Deep neural networks endow the downsampled superpoints with highly\ndiscriminative feature representations. Previous dominant point cloud\nregistration approaches match these feature representations as the first step,\ne.g., using the Sinkhorn algorithm. A RANSAC-like method is then usually\nadopted as a post-processing refinement to filter the outliers. Another dominant\nmethod is to directly predict the superpoint matchings using learned MLP\nlayers. Both of them have drawbacks: RANSAC-based methods are computationally\nintensive and prediction-based methods suffer from outputting non-existent\npoints in the point cloud. In this paper, we propose a straightforward and\neffective baseline to find correspondences of superpoints in a global matching\nmanner. We employ the normalized matching scores as weights for each\ncorrespondence, allowing us to reject the outliers and further weight the remaining\ninliers when fitting the transformation matrix without relying on the\ncumbersome RANSAC. Moreover, the entire model can be trained in an end-to-end\nfashion, leading to better accuracy. Our simple yet effective baseline shows\ncomparable or even better results than state-of-the-art methods on three\ndatasets including ModelNet, 3DMatch, and KITTI. We do not advocate our\napproach to be \emph{the} solution for point cloud registration but use the\nresults to emphasize the role of the matching strategy for point cloud\nregistration. 
The code and models are available at\nhttps://github.com/neu-vi/Superpoints_Registration.\n","authors":["Aniket Gupta","Yiming Xie","Hanumant Singh","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.01362v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19645v1","updated":"2024-03-28T17:55:16Z","published":"2024-03-28T17:55:16Z","title":"GANTASTIC: GAN-based Transfer of Interpretable Directions for\n Disentangled Image Editing in Text-to-Image Diffusion Models","summary":" The rapid advancement in image generation models has predominantly been\ndriven by diffusion models, which have demonstrated unparalleled success in\ngenerating high-fidelity, diverse images from textual prompts. Despite their\nsuccess, diffusion models encounter substantial challenges in the domain of\nimage editing, particularly in executing disentangled edits-changes that target\nspecific attributes of an image while leaving irrelevant parts untouched. In\ncontrast, Generative Adversarial Networks (GANs) have been recognized for their\nsuccess in disentangled edits through their interpretable latent spaces. We\nintroduce GANTASTIC, a novel framework that takes existing directions from\npre-trained GAN models-representative of specific, controllable attributes-and\ntransfers these directions into diffusion-based models. This novel approach not\nonly maintains the generative quality and diversity that diffusion models are\nknown for but also significantly enhances their capability to perform precise,\ntargeted image edits, thereby leveraging the best of both worlds.\n","authors":["Yusuf Dalva","Hidir Yesiltepe","Pinar Yanardag"],"pdf_url":"https://arxiv.org/pdf/2403.19645v1.pdf","comment":"Project page: https://gantastic.github.io"},{"id":"http://arxiv.org/abs/2304.09704v2","updated":"2024-03-28T17:53:08Z","published":"2023-04-19T14:49:31Z","title":"Learnable Earth Parser: Discovering 3D Prototypes in Aerial Scans","summary":" We propose an unsupervised method for parsing large 3D scans of real-world\nscenes with easily-interpretable shapes. This work aims to provide a practical\ntool for analyzing 3D scenes in the context of aerial surveying and mapping,\nwithout the need for user annotations. Our approach is based on a probabilistic\nreconstruction model that decomposes an input 3D point cloud into a small set\nof learned prototypical 3D shapes. The resulting reconstruction is visually\ninterpretable and can be used to perform unsupervised instance and low-shot\nsemantic segmentation of complex scenes. We demonstrate the usefulness of our\nmodel on a novel dataset of seven large aerial LiDAR scans from diverse\nreal-world scenarios. Our approach outperforms state-of-the-art unsupervised\nmethods in terms of decomposition accuracy while remaining visually\ninterpretable. Our code and dataset are available at\nhttps://romainloiseau.fr/learnable-earth-parser/\n","authors":["Romain Loiseau","Elliot Vincent","Mathieu Aubry","Loic Landrieu"],"pdf_url":"https://arxiv.org/pdf/2304.09704v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19638v1","updated":"2024-03-28T17:52:24Z","published":"2024-03-28T17:52:24Z","title":"Siamese Vision Transformers are Scalable Audio-visual Learners","summary":" Traditional audio-visual methods rely on independent audio and visual\nbackbones, which is costly and not scalable. In this work, we investigate using\nan audio-visual siamese network (AVSiam) for efficient and scalable\naudio-visual pretraining. 
Our framework uses a single shared vision transformer\nbackbone to process audio and visual inputs, improving its parameter\nefficiency, reducing the GPU memory footprint, and allowing us to scale our\nmethod to larger datasets and model sizes. We pretrain our model using a\ncontrastive audio-visual matching objective with a multi-ratio random masking\nscheme, which enables our model to process larger audio-visual instance\nbatches, helpful for contrastive learning. Unlike prior audio-visual methods,\nour method can robustly handle audio, visual, and audio-visual inputs with a\nsingle shared ViT backbone. Furthermore, despite using the shared backbone for\nboth modalities, AVSiam achieves competitive or even better results than prior\nmethods on AudioSet and VGGSound for audio-visual classification and retrieval.\nOur code is available at https://github.com/GenjiB/AVSiam\n","authors":["Yan-Bo Lin","Gedas Bertasius"],"pdf_url":"https://arxiv.org/pdf/2403.19638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19632v1","updated":"2024-03-28T17:47:31Z","published":"2024-03-28T17:47:31Z","title":"GauStudio: A Modular Framework for 3D Gaussian Splatting and Beyond","summary":" We present GauStudio, a novel modular framework for modeling 3D Gaussian\nSplatting (3DGS) to provide standardized, plug-and-play components for users to\neasily customize and implement a 3DGS pipeline. Supported by our framework, we\npropose a hybrid Gaussian representation with foreground and skyball background\nmodels. Experiments demonstrate this representation reduces artifacts in\nunbounded outdoor scenes and improves novel view synthesis. Finally, we propose\nGaussian Splatting Surface Reconstruction (GauS), a novel render-then-fuse\napproach for high-fidelity mesh reconstruction from 3DGS inputs without\nfine-tuning. Overall, our GauStudio framework, hybrid representation, and GauS\napproach enhance 3DGS modeling and rendering capabilities, enabling\nhigher-quality novel view synthesis and surface reconstruction.\n","authors":["Chongjie Ye","Yinyu Nie","Jiahao Chang","Yuantao Chen","Yihao Zhi","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2403.19632v1.pdf","comment":"Code: https://github.com/GAP-LAB-CUHK-SZ/gaustudio"},{"id":"http://arxiv.org/abs/2403.19622v1","updated":"2024-03-28T17:42:54Z","published":"2024-03-28T17:42:54Z","title":"RH20T-P: A Primitive-Level Robotic Dataset Towards Composable\n Generalization Agents","summary":" The ultimate goal of robotic learning is to acquire a comprehensive and\ngeneralizable robotic system capable of performing both seen skills within the\ntraining distribution and unseen skills in novel environments. Recent progress\nin utilizing language models as high-level planners has demonstrated that the\ncomplexity of tasks can be reduced through decomposing them into\nprimitive-level plans, making it possible to generalize on novel robotic tasks\nin a composable manner. Despite the promising future, the community is not yet\nadequately prepared for composable generalization agents, particularly due to\nthe lack of primitive-level real-world robotic datasets. In this paper, we\npropose a primitive-level robotic dataset, namely RH20T-P, which contains about\n33000 video clips covering 44 diverse and complicated robotic tasks. Each clip\nis manually annotated according to a set of meticulously designed primitive\nskills, facilitating the future development of composable generalization\nagents. 
To validate the effectiveness of RH20T-P, we also construct a potential\nand scalable agent based on RH20T-P, called RA-P. Equipped with two planners\nspecialized in task decomposition and motion planning, RA-P can adapt to novel\nphysical skills through composable generalization. Our website and videos can\nbe found at https://sites.google.com/view/rh20t-primitive/main. Dataset and\ncode will be made available soon.\n","authors":["Zeren Chen","Zhelun Shi","Xiaoya Lu","Lehan He","Sucheng Qian","Hao Shu Fang","Zhenfei Yin","Wanli Ouyang","Jing Shao","Yu Qiao","Cewu Lu","Lu Sheng"],"pdf_url":"https://arxiv.org/pdf/2403.19622v1.pdf","comment":"24 pages, 12 figures, 6 tables"},{"id":"http://arxiv.org/abs/2403.19620v1","updated":"2024-03-28T17:40:15Z","published":"2024-03-28T17:40:15Z","title":"Collaborative Interactive Evolution of Art in the Latent Space of Deep\n Generative Models","summary":" Generative Adversarial Networks (GANs) have shown great success in generating\nhigh quality images and are thus used as one of the main approaches to generate\nart images. However, usually the image generation process involves sampling\nfrom the latent space of the learned art representations, allowing little\ncontrol over the output. In this work, we first employ GANs that are trained to\nproduce creative images using an architecture known as Creative Adversarial\nNetworks (CANs), then, we employ an evolutionary approach to navigate within\nthe latent space of the models to discover images. We use automatic aesthetic\nand collaborative interactive human evaluation metrics to assess the generated\nimages. In the human interactive evaluation case, we propose a collaborative\nevaluation based on the assessments of several participants. Furthermore, we\nalso experiment with an intelligent mutation operator that aims to improve the\nquality of the images through local search based on an aesthetic measure. We\nevaluate the effectiveness of this approach by comparing the results produced\nby the automatic and collaborative interactive evolution. The results show that\nthe proposed approach can generate highly attractive art images when the\nevolution is guided by collaborative human feedback.\n","authors":["Ole Hall","Anil Yaman"],"pdf_url":"https://arxiv.org/pdf/2403.19620v1.pdf","comment":"Preprint. The Version of Record of this contribution is to be\n published in the proceedings of the 13th International Conference on\n Artificial Intelligence in Music, Sound, Art and Design (EvoMUSART) 2024"},{"id":"http://arxiv.org/abs/2304.09224v2","updated":"2024-03-28T17:36:50Z","published":"2023-04-18T18:23:20Z","title":"Quantum machine learning for image classification","summary":" Image classification, a pivotal task in multiple industries, faces\ncomputational challenges due to the burgeoning volume of visual data. This\nresearch addresses these challenges by introducing two quantum machine learning\nmodels that leverage the principles of quantum mechanics for effective\ncomputations. Our first model, a hybrid quantum neural network with parallel\nquantum circuits, enables the execution of computations even in the noisy\nintermediate-scale quantum era, where circuits with a large number of qubits\nare currently infeasible. This model demonstrated a record-breaking\nclassification accuracy of 99.21% on the full MNIST dataset, surpassing the\nperformance of known quantum-classical models, while having eight times fewer\nparameters than its classical counterpart. 
Also, the results of testing this\nhybrid model on a Medical MNIST (classification accuracy over 99%), and on\nCIFAR-10 (classification accuracy over 82%), can serve as evidence of the\ngeneralizability of the model and highlights the efficiency of quantum layers\nin distinguishing common features of input data. Our second model introduces a\nhybrid quantum neural network with a Quanvolutional layer, reducing image\nresolution via a convolution process. The model matches the performance of its\nclassical counterpart, having four times fewer trainable parameters, and\noutperforms a classical model with equal weight parameters. These models\nrepresent advancements in quantum machine learning research and illuminate the\npath towards more accurate image classification systems.\n","authors":["Arsenii Senokosov","Alexandr Sedykh","Asel Sagingalieva","Basil Kyriacou","Alexey Melnikov"],"pdf_url":"https://arxiv.org/pdf/2304.09224v2.pdf","comment":"13 pages, 10 figures, 1 table"},{"id":"http://arxiv.org/abs/2312.07360v2","updated":"2024-03-28T17:35:29Z","published":"2023-12-12T15:30:24Z","title":"Boosting Latent Diffusion with Flow Matching","summary":" Recently, there has been tremendous progress in visual synthesis and the\nunderlying generative models. Here, diffusion models (DMs) stand out\nparticularly, but lately, flow matching (FM) has also garnered considerable\ninterest. While DMs excel in providing diverse images, they suffer from long\ntraining and slow generation. With latent diffusion, these issues are only\npartially alleviated. Conversely, FM offers faster training and inference but\nexhibits less diversity in synthesis. We demonstrate that introducing FM\nbetween the Diffusion model and the convolutional decoder offers\nhigh-resolution image synthesis with reduced computational cost and model size.\nDiffusion can then efficiently provide the necessary generation diversity. FM\ncompensates for the lower resolution, mapping the small latent space to a\nhigh-dimensional one. Subsequently, the convolutional decoder of the LDM maps\nthese latents to high-resolution images. By combining the diversity of DMs, the\nefficiency of FMs, and the effectiveness of convolutional decoders, we achieve\nstate-of-the-art high-resolution image synthesis at $1024^2$ with minimal\ncomputational cost. Importantly, our approach is orthogonal to recent\napproximation and speed-up strategies for the underlying DMs, making it easily\nintegrable into various DM frameworks.\n","authors":["Johannes S. Fischer","Ming Gui","Pingchuan Ma","Nick Stracke","Stefan A. Baumann","Björn Ommer"],"pdf_url":"https://arxiv.org/pdf/2312.07360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19615v1","updated":"2024-03-28T17:32:58Z","published":"2024-03-28T17:32:58Z","title":"SA-GS: Scale-Adaptive Gaussian Splatting for Training-Free Anti-Aliasing","summary":" In this paper, we present a Scale-adaptive method for Anti-aliasing Gaussian\nSplatting (SA-GS). While the state-of-the-art method Mip-Splatting needs\nmodifying the training procedure of Gaussian splatting, our method functions at\ntest-time and is training-free. Specifically, SA-GS can be applied to any\npretrained Gaussian splatting field as a plugin to significantly improve the\nfield's anti-alising performance. The core technique is to apply 2D\nscale-adaptive filters to each Gaussian during test time. As pointed out by\nMip-Splatting, observing Gaussians at different frequencies leads to mismatches\nbetween the Gaussian scales during training and testing. 
Mip-Splatting resolves\nthis issue using 3D smoothing and 2D Mip filters, which are unfortunately not\naware of testing frequency. In this work, we show that a 2D scale-adaptive\nfilter that is informed of testing frequency can effectively match the Gaussian\nscale, thus making the Gaussian primitive distribution remain consistent across\ndifferent testing frequencies. When scale inconsistency is eliminated, sampling\nrates smaller than the scene frequency result in conventional jaggedness, and\nwe propose to integrate the projected 2D Gaussian within each pixel during\ntesting. This integration is actually a limiting case of super-sampling, which\nsignificantly improves anti-aliasing performance over vanilla Gaussian\nSplatting. Through extensive experiments using various settings and both\nbounded and unbounded scenes, we show SA-GS performs comparably with or better\nthan Mip-Splatting. Note that super-sampling and integration are only effective\nwhen our scale-adaptive filtering is activated. Our codes, data and models are\navailable at https://github.com/zsy1987/SA-GS.\n","authors":["Xiaowei Song","Jv Zheng","Shiran Yuan","Huan-ang Gao","Jingwei Zhao","Xiang He","Weihao Gu","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.19615v1.pdf","comment":"Project page: https://kevinsong729.github.io/project-pages/SA-GS/\n Code: https://github.com/zsy1987/SA-GS"},{"id":"http://arxiv.org/abs/2403.19612v1","updated":"2024-03-28T17:32:01Z","published":"2024-03-28T17:32:01Z","title":"ILPO-NET: Network for the invariant recognition of arbitrary volumetric\n patterns in 3D","summary":" Effective recognition of spatial patterns and learning their hierarchy is\ncrucial in modern spatial data analysis. Volumetric data applications seek\ntechniques ensuring invariance not only to shifts but also to pattern\nrotations. While traditional methods can readily achieve translational\ninvariance, rotational invariance possesses multiple challenges and remains an\nactive area of research. Here, we present ILPO-Net (Invariant to Local Patterns\nOrientation Network), a novel approach that handles arbitrarily shaped patterns\nwith the convolutional operation inherently invariant to local spatial pattern\norientations using the Wigner matrix expansions. Our architecture seamlessly\nintegrates the new convolution operator and, when benchmarked on diverse\nvolumetric datasets such as MedMNIST and CATH, demonstrates superior\nperformance over the baselines with significantly reduced parameter counts - up\nto 1000 times fewer in the case of MedMNIST. Beyond these demonstrations,\nILPO-Net's rotational invariance paves the way for other applications across\nmultiple disciplines. 
Our code is publicly available at\nhttps://gricad-gitlab.univ-grenoble-alpes.fr/GruLab/ILPONet.\n","authors":["Dmitrii Zhemchuzhnikov","Sergei Grudinin"],"pdf_url":"https://arxiv.org/pdf/2403.19612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19611v1","updated":"2024-03-28T17:31:23Z","published":"2024-03-28T17:31:23Z","title":"Nearest Neighbor Classification for Classical Image Upsampling","summary":" Given a set of ordered pixel data in the form of an image, our goal is to\nperform upsampling on the data such that the resulting resolution is improved\nby some factor, the final result passes the human test by adding new,\nbelievable, and realistic information and detail to the image, and the time\ncomplexity of upscaling remains relatively close to that of lossy upscaling\nimplementations.\n","authors":["Evan Matthews","Nicolas Prate"],"pdf_url":"https://arxiv.org/pdf/2403.19611v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2403.19607v1","updated":"2024-03-28T17:28:32Z","published":"2024-03-28T17:28:32Z","title":"SAID-NeRF: Segmentation-AIDed NeRF for Depth Completion of Transparent\n Objects","summary":" Acquiring accurate depth information of transparent objects using\noff-the-shelf RGB-D cameras is a well-known challenge in Computer Vision and\nRobotics. Depth estimation/completion methods are typically employed and\ntrained on datasets with quality depth labels acquired from either simulation,\nadditional sensors or specialized data collection setups and known 3D models.\nHowever, acquiring reliable depth information for datasets at scale is not\nstraightforward, limiting training scalability and generalization. Neural\nRadiance Fields (NeRFs) are learning-free approaches and have demonstrated wide\nsuccess in novel view synthesis and shape recovery. However, heuristics and\ncontrolled environments (lights, backgrounds, etc.) are often required to\naccurately capture specular surfaces. In this paper, we propose using Visual\nFoundation Models (VFMs) for segmentation in a zero-shot, label-free way to\nguide the NeRF reconstruction process for these objects via the simultaneous\nreconstruction of semantic fields and extensions to increase robustness. Our\nproposed method, Segmentation-AIDed NeRF (SAID-NeRF), shows strong\nperformance on depth completion datasets for transparent objects and robotic\ngrasping.\n","authors":["Avinash Ummadisingu","Jongkeum Choi","Koki Yamane","Shimpei Masuda","Naoki Fukaya","Kuniyuki Takahashi"],"pdf_url":"https://arxiv.org/pdf/2403.19607v1.pdf","comment":"8 pages. An accompanying video is available at\n https://www.youtube.com/watch?v=S4NCoUq4bmE"},{"id":"http://arxiv.org/abs/2403.19603v1","updated":"2024-03-28T17:27:44Z","published":"2024-03-28T17:27:44Z","title":"Semantic Map-based Generation of Navigation Instructions","summary":" We are interested in the generation of navigation instructions, either in\ntheir own right or as training material for robotic navigation tasks. In this\npaper, we propose a new approach to navigation instruction generation by\nframing the problem as an image captioning task using semantic maps as visual\ninput. Conventional approaches employ a sequence of panorama images to generate\nnavigation instructions. 
Semantic maps abstract away from visual details and\nfuse the information in multiple panorama images into a single top-down\nrepresentation, thereby reducing computational complexity to process the input.\nWe present a benchmark dataset for instruction generation using semantic maps,\npropose an initial model and ask human subjects to manually assess the quality\nof generated instructions. Our initial investigations show promise in using\nsemantic maps for instruction generation instead of a sequence of panorama\nimages, but there is vast scope for improvement. We release the code for data\npreparation and model training at https://github.com/chengzu-li/VLGen.\n","authors":["Chengzu Li","Chao Zhang","Simone Teufel","Rama Sanand Doddipatla","Svetlana Stoyanchev"],"pdf_url":"https://arxiv.org/pdf/2403.19603v1.pdf","comment":"5 pages, 2 figures, 3 tables (13 pages, 3 figures, 5 tables including\n references and appendices), accepted at LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2311.11278v2","updated":"2024-03-28T17:25:51Z","published":"2023-11-19T09:41:10Z","title":"Transcending Forgery Specificity with Latent Space Augmentation for\n Generalizable Deepfake Detection","summary":" Deepfake detection faces a critical generalization hurdle, with performance\ndeteriorating when there is a mismatch between the distributions of training\nand testing data. A broadly received explanation is the tendency of these\ndetectors to be overfitted to forgery-specific artifacts, rather than learning\nfeatures that are widely applicable across various forgeries. To address this\nissue, we propose a simple yet effective detector called LSDA\n(\\underline{L}atent \\underline{S}pace \\underline{D}ata\n\\underline{A}ugmentation), which is based on a heuristic idea: representations\nwith a wider variety of forgeries should be able to learn a more generalizable\ndecision boundary, thereby mitigating the overfitting of method-specific\nfeatures (see Fig.~\\ref{fig:toy}). Following this idea, we propose to enlarge\nthe forgery space by constructing and simulating variations within and across\nforgery features in the latent space. This approach encompasses the acquisition\nof enriched, domain-specific features and the facilitation of smoother\ntransitions between different forgery types, effectively bridging domain gaps.\nOur approach culminates in refining a binary classifier that leverages the\ndistilled knowledge from the enhanced features, striving for a generalizable\ndeepfake detector. Comprehensive experiments show that our proposed method is\nsurprisingly effective and transcends state-of-the-art detectors across several\nwidely used benchmarks.\n","authors":["Zhiyuan Yan","Yuhao Luo","Siwei Lyu","Qingshan Liu","Baoyuan Wu"],"pdf_url":"https://arxiv.org/pdf/2311.11278v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19600v1","updated":"2024-03-28T17:23:45Z","published":"2024-03-28T17:23:45Z","title":"Enhance Image Classification via Inter-Class Image Mixup with Diffusion\n Model","summary":" Text-to-image (T2I) generative models have recently emerged as a powerful\ntool, enabling the creation of photo-realistic images and giving rise to a\nmultitude of applications. However, the effective integration of T2I models\ninto fundamental image classification tasks remains an open question. A\nprevalent strategy to bolster image classification performance is through\naugmenting the training set with synthetic images generated by T2I models. 
In\nthis study, we scrutinize the shortcomings of both current generative and\nconventional data augmentation techniques. Our analysis reveals that these\nmethods struggle to produce images that are both faithful (in terms of\nforeground objects) and diverse (in terms of background contexts) for\ndomain-specific concepts. To tackle this challenge, we introduce an innovative\ninter-class data augmentation method known as Diff-Mix\n(https://github.com/Zhicaiwww/Diff-Mix), which enriches the dataset by\nperforming image translations between classes. Our empirical results\ndemonstrate that Diff-Mix achieves a better balance between faithfulness and\ndiversity, leading to a marked improvement in performance across diverse image\nclassification scenarios, including few-shot, conventional, and long-tail\nclassifications for domain-specific datasets.\n","authors":["Zhicai Wang","Longhui Wei","Tan Wang","Heyu Chen","Yanbin Hao","Xiang Wang","Xiangnan He","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2403.19600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17048v2","updated":"2024-03-28T17:23:15Z","published":"2023-11-28T18:55:37Z","title":"Zero-shot Referring Expression Comprehension via Structural Similarity\n Between Images and Captions","summary":" Zero-shot referring expression comprehension aims at localizing bounding\nboxes in an image corresponding to provided textual prompts, which requires:\n(i) a fine-grained disentanglement of complex visual scene and textual context,\nand (ii) a capacity to understand relationships among disentangled entities.\nUnfortunately, existing large vision-language alignment (VLA) models, e.g.,\nCLIP, struggle with both aspects and so cannot be directly used for this task. To\nmitigate this gap, we leverage large foundation models to disentangle both\nimages and texts into triplets in the format of (subject, predicate, object).\nAfter that, grounding is accomplished by calculating the structural similarity\nmatrix between visual and textual triplets with a VLA model, and subsequently\npropagating it to an instance-level similarity matrix. Furthermore, to equip VLA\nmodels with the ability of relationship understanding, we design a\ntriplet-matching objective to fine-tune the VLA models on a collection of\ncurated datasets containing abundant entity relationships. Experiments\ndemonstrate a visual grounding performance increase of up to 19.5% over\nthe SOTA zero-shot model on RefCOCO/+/g. On the more challenging Who's Waldo\ndataset, our zero-shot approach achieves comparable accuracy to the fully\nsupervised model. Code is available at\nhttps://github.com/Show-han/Zeroshot_REC.\n","authors":["Zeyu Han","Fangrui Zhu","Qianru Lao","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.17048v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19596v1","updated":"2024-03-28T17:20:39Z","published":"2024-03-28T17:20:39Z","title":"LocCa: Visual Pretraining with Location-aware Captioners","summary":" Image captioning has been shown to be an effective pretraining method similar to\ncontrastive pretraining. However, the incorporation of location-aware\ninformation into visual pretraining remains an area with limited research. In\nthis paper, we propose a simple visual pretraining method with location-aware\ncaptioners (LocCa). LocCa uses a simple image captioner task interface to\nteach a model to read out rich information, i.e., bounding box coordinates and\ncaptions, conditioned on the image pixel input. 
Thanks to the multitask\ncapabilities of an encoder-decoder architecture, we show that an image\ncaptioner can easily handle multiple tasks during pretraining. Our experiments\ndemonstrate that LocCa outperforms standard captioners significantly on\nlocalization downstream tasks while maintaining comparable performance on\nholistic tasks.\n","authors":["Bo Wan","Michael Tschannen","Yongqin Xian","Filip Pavetic","Ibrahim Alabdulmohsin","Xiao Wang","André Susano Pinto","Andreas Steiner","Lucas Beyer","Xiaohua Zhai"],"pdf_url":"https://arxiv.org/pdf/2403.19596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19595v1","updated":"2024-03-28T17:19:16Z","published":"2024-03-28T17:19:16Z","title":"Situation Awareness for Driver-Centric Driving Style Adaptation","summary":" There is evidence that the driving style of an autonomous vehicle is\nimportant to increase the acceptance and trust of the passengers. The driving\nsituation has been found to have a significant influence on human driving\nbehavior. However, current driving style models only partially incorporate\ndriving environment information, limiting the alignment between an agent and\nthe given situation. Therefore, we propose a situation-aware driving style\nmodel based on different visual feature encoders pretrained on fleet data, as\nwell as driving behavior predictors, which are adapted to the driving style of\na specific driver. Our experiments show that the proposed method outperforms\nstatic driving styles significantly and forms plausible situation clusters.\nFurthermore, we found that feature encoders pretrained on our dataset lead to\nmore precise driving behavior modeling. In contrast, feature encoders\npretrained supervised and unsupervised on different data sources lead to more\nspecific situation clusters, which can be utilized to constrain and control the\ndriving style adaptation for specific situations. Moreover, in a real-world\nsetting, where driving style adaptation is happening iteratively, we found that\nMLP-based behavior predictors achieve good performance initially but suffer\nfrom catastrophic forgetting. In contrast, behavior predictors based on\nsituation-dependent statistics can learn iteratively from continuous data\nstreams by design. Overall, our experiments show that important information for\ndriving behavior prediction is contained within the visual feature encoder. The\ndataset is publicly available at\nhuggingface.co/datasets/jHaselberger/SADC-Situation-Awareness-for-Driver-Centric-Driving-Style-Adaptation.\n","authors":["Johann Haselberger","Bonifaz Stuhr","Bernhard Schick","Steffen Müller"],"pdf_url":"https://arxiv.org/pdf/2403.19595v1.pdf","comment":"14 pages, 6 figures. This work has been submitted to the IEEE for\n possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2403.19593v1","updated":"2024-03-28T17:15:23Z","published":"2024-03-28T17:15:23Z","title":"Frame by Familiar Frame: Understanding Replication in Video Diffusion\n Models","summary":" Building on the momentum of image generation diffusion models, there is an\nincreasing interest in video-based diffusion models. However, video generation\nposes greater challenges due to its higher-dimensional nature, the scarcity of\ntraining data, and the complex spatiotemporal relationships involved. Image\ngeneration models, due to their extensive data requirements, have already\nstrained computational resources to their limits. 
There have been instances of\nthese models reproducing elements from the training samples, leading to\nconcerns and even legal disputes over sample replication. Video diffusion\nmodels, which operate with even more constrained datasets and are tasked with\ngenerating both spatial and temporal content, may be more prone to replicating\nsamples from their training sets. Compounding the issue, these models are often\nevaluated using metrics that inadvertently reward replication. In our paper, we\npresent a systematic investigation into the phenomenon of sample replication in\nvideo diffusion models. We scrutinize various recent diffusion models for video\nsynthesis, assessing their tendency to replicate spatial and temporal content\nin both unconditional and conditional generation scenarios. Our study\nidentifies strategies that are less likely to lead to replication. Furthermore,\nwe propose new evaluation strategies that take replication into account,\noffering a more accurate measure of a model's ability to generate the original\ncontent.\n","authors":["Aimon Rahman","Malsha V. Perera","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2403.19593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05950v2","updated":"2024-03-28T17:14:53Z","published":"2024-03-09T16:05:31Z","title":"Classifying Objects in 3D Point Clouds Using Recurrent Neural Network: A\n GRU LSTM Hybrid Approach","summary":" Accurate classification of objects in 3D point clouds is a significant\nproblem in several applications, such as autonomous navigation and\naugmented/virtual reality scenarios, which has become a research hot spot. In\nthis paper, we presented a deep learning strategy for 3D object classification\nin augmented reality. The proposed approach is a combination of the GRU and\nLSTM. LSTM networks learn longer dependencies well, but due to the number of\ngates, it takes longer to train; on the other hand, GRU networks have a weaker\nperformance than LSTM, but their training speed is much higher than GRU, which\nis The speed is due to its fewer gates. The proposed approach used the\ncombination of speed and accuracy of these two networks. The proposed approach\nachieved an accuracy of 0.99 in the 4,499,0641 points dataset, which includes\neight classes (unlabeled, man-made terrain, natural terrain, high vegetation,\nlow vegetation, buildings, hardscape, scanning artifacts, cars). Meanwhile, the\ntraditional machine learning approaches could achieve a maximum accuracy of\n0.9489 in the best case. Keywords: Point Cloud Classification, Virtual Reality,\nHybrid Model, GRULSTM, GRU, LSTM\n","authors":["Ramin Mousa","Mitra Khezli","Mohamadreza Azadi","Vahid Nikoofard","Saba Hesaraki"],"pdf_url":"https://arxiv.org/pdf/2403.05950v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19589v1","updated":"2024-03-28T17:12:55Z","published":"2024-03-28T17:12:55Z","title":"TOD3Cap: Towards 3D Dense Captioning in Outdoor Scenes","summary":" 3D dense captioning stands as a cornerstone in achieving a comprehensive\nunderstanding of 3D scenes through natural language. It has recently witnessed\nremarkable achievements, particularly in indoor settings. 
However, the\nexploration of 3D dense captioning in outdoor scenes is hindered by two major\nchallenges: 1) the \\textbf{domain gap} between indoor and outdoor scenes, such\nas dynamics and sparse visual inputs, makes it difficult to directly adapt\nexisting indoor methods; 2) the \\textbf{lack of data} with comprehensive\nbox-caption pair annotations specifically tailored for outdoor scenes. To this\nend, we introduce the new task of outdoor 3D dense captioning. As input, we\nassume a LiDAR point cloud and a set of RGB images captured by the panoramic\ncamera rig. The expected output is a set of object boxes with captions. To\ntackle this task, we propose the TOD3Cap network, which leverages the BEV\nrepresentation to generate object box proposals and integrates Relation\nQ-Former with LLaMA-Adapter to generate rich captions for these objects. We\nalso introduce the TOD3Cap dataset, the largest one to our knowledge for 3D\ndense captioning in outdoor scenes, which contains 2.3M descriptions of 64.3K\noutdoor objects from 850 scenes. Notably, our TOD3Cap network can effectively\nlocalize and caption 3D objects in outdoor scenes, which outperforms baseline\nmethods by a significant margin (+9.6 CiDEr@0.5IoU). Code, data, and models are\npublicly available at https://github.com/jxbbb/TOD3Cap.\n","authors":["Bu Jin","Yupeng Zheng","Pengfei Li","Weize Li","Yuhang Zheng","Sujie Hu","Xinyu Liu","Jinwei Zhu","Zhijie Yan","Haiyang Sun","Kun Zhan","Peng Jia","Xiaoxiao Long","Yilun Chen","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.19589v1.pdf","comment":"Code, data, and models are publicly available at\n https://github.com/jxbbb/TOD3Cap"},{"id":"http://arxiv.org/abs/2403.19588v1","updated":"2024-03-28T17:12:39Z","published":"2024-03-28T17:12:39Z","title":"DenseNets Reloaded: Paradigm Shift Beyond ResNets and ViTs","summary":" This paper revives Densely Connected Convolutional Networks (DenseNets) and\nreveals the underrated effectiveness over predominant ResNet-style\narchitectures. We believe DenseNets' potential was overlooked due to untouched\ntraining methods and traditional design elements not fully revealing their\ncapabilities. Our pilot study shows dense connections through concatenation are\nstrong, demonstrating that DenseNets can be revitalized to compete with modern\narchitectures. We methodically refine suboptimal components - architectural\nadjustments, block redesign, and improved training recipes towards widening\nDenseNets and boosting memory efficiency while keeping concatenation shortcuts.\nOur models, employing simple architectural elements, ultimately surpass Swin\nTransformer, ConvNeXt, and DeiT-III - key architectures in the residual\nlearning lineage. Furthermore, our models exhibit near state-of-the-art\nperformance on ImageNet-1K, competing with the very recent models and\ndownstream tasks, ADE20k semantic segmentation, and COCO object\ndetection/instance segmentation. Finally, we provide empirical analyses that\nuncover the merits of the concatenation over additive shortcuts, steering a\nrenewed preference towards DenseNet-style designs. 
Our code is available at\nhttps://github.com/naver-ai/rdnet.\n","authors":["Donghyun Kim","Byeongho Heo","Dongyoon Han"],"pdf_url":"https://arxiv.org/pdf/2403.19588v1.pdf","comment":"Code at https://github.com/naver-ai/rdnet"},{"id":"http://arxiv.org/abs/2403.18346v2","updated":"2024-03-28T17:09:36Z","published":"2024-03-27T08:38:49Z","title":"Quantifying and Mitigating Unimodal Biases in Multimodal Large Language\n Models: A Causal Perspective","summary":" Recent advancements in Large Language Models (LLMs) have facilitated the\ndevelopment of Multimodal LLMs (MLLMs). Despite their impressive capabilities,\nMLLMs often suffer from an over-reliance on unimodal biases (e.g., language\nbias and vision bias), leading to incorrect answers in complex multimodal\ntasks. To investigate this issue, we propose a causal framework to interpret\nthe biases in Visual Question Answering (VQA) problems. Within our framework,\nwe devise a causal graph to elucidate the predictions of MLLMs on VQA problems,\nand assess the causal effect of biases through an in-depth causal analysis.\nMotivated by the causal graph, we introduce a novel MORE dataset, consisting of\n12,000 VQA instances. This dataset is designed to challenge MLLMs' abilities,\nnecessitating multi-hop reasoning and the surmounting of unimodal biases.\nFurthermore, we propose two strategies to mitigate unimodal biases and enhance\nMLLMs' reasoning capabilities, including a Decompose-Verify-Answer (DeVA)\nframework for limited-access MLLMs and the refinement of open-source MLLMs\nthrough fine-tuning. Extensive quantitative and qualitative experiments offer\nvaluable insights for future research. Our project page is at\nhttps://opencausalab.github.io/MORE.\n","authors":["Meiqi Chen","Yixin Cao","Yan Zhang","Chaochao Lu"],"pdf_url":"https://arxiv.org/pdf/2403.18346v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19586v1","updated":"2024-03-28T17:08:58Z","published":"2024-03-28T17:08:58Z","title":"TOGS: Gaussian Splatting with Temporal Opacity Offset for Real-Time 4D\n DSA Rendering","summary":" Four-dimensional Digital Subtraction Angiography (4D DSA) is a medical\nimaging technique that provides a series of 2D images captured at different\nstages and angles during the process of contrast agent filling blood vessels.\nIt plays a significant role in the diagnosis of cerebrovascular diseases.\nImproving the rendering quality and speed under sparse sampling is important\nfor observing the status and location of lesions. The current methods exhibit\ninadequate rendering quality in sparse views and suffer from slow rendering\nspeed. To overcome these limitations, we propose TOGS, a Gaussian splatting\nmethod with opacity offset over time, which can effectively improve the\nrendering quality and speed of 4D DSA. We introduce an opacity offset table for\neach Gaussian to model the temporal variations in the radiance of the contrast\nagent. By interpolating the opacity offset table, the opacity variation of the\nGaussian at different time points can be determined. This enables us to render\nthe 2D DSA image at that specific moment. Additionally, we introduced a Smooth\nloss term in the loss function to mitigate overfitting issues that may arise in\nthe model when dealing with sparse view scenarios. During the training phase,\nwe randomly prune Gaussians, thereby reducing the storage overhead of the\nmodel. 
The experimental results demonstrate that compared to previous methods,\nthis model achieves state-of-the-art reconstruction quality under the same\nnumber of training views. Additionally, it enables real-time rendering while\nmaintaining low storage overhead. The code will be publicly available.\n","authors":["Shuai Zhang","Huangxuan Zhao","Zhenghong Zhou","Guanjun Wu","Chuansheng Zheng","Xinggang Wang","Wenyu Liu"],"pdf_url":"https://arxiv.org/pdf/2403.19586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07330v2","updated":"2024-03-28T17:07:38Z","published":"2023-12-12T14:45:45Z","title":"Learned representation-guided diffusion models for large-image\n generation","summary":" To synthesize high-fidelity samples, diffusion models typically require\nauxiliary data to guide the generation process. However, it is impractical to\nprocure the painstaking patch-level annotation effort required in specialized\ndomains like histopathology and satellite imagery; it is often performed by\ndomain experts and involves hundreds of millions of patches. Modern-day\nself-supervised learning (SSL) representations encode rich semantic and visual\ninformation. In this paper, we posit that such representations are expressive\nenough to act as proxies to fine-grained human labels. We introduce a novel\napproach that trains diffusion models conditioned on embeddings from SSL. Our\ndiffusion models successfully project these features back to high-quality\nhistopathology and remote sensing images. In addition, we construct larger\nimages by assembling spatially consistent patches inferred from SSL embeddings,\npreserving long-range dependencies. Augmenting real data by generating\nvariations of real images improves downstream classifier accuracy for\npatch-level and larger, image-scale classification tasks. Our models are\neffective even on datasets not encountered during training, demonstrating their\nrobustness and generalizability. Generating images from learned embeddings is\nagnostic to the source of the embeddings. The SSL embeddings used to generate a\nlarge image can either be extracted from a reference image, or sampled from an\nauxiliary model conditioned on any related modality (e.g. class labels, text,\ngenomic data). As proof of concept, we introduce the text-to-large image\nsynthesis paradigm where we successfully synthesize large pathology and\nsatellite images out of text descriptions.\n","authors":["Alexandros Graikos","Srikar Yellapragada","Minh-Quan Le","Saarthak Kapse","Prateek Prasanna","Joel Saltz","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2312.07330v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17113v2","updated":"2024-03-28T17:07:28Z","published":"2023-11-28T12:05:41Z","title":"Human Gaussian Splatting: Real-time Rendering of Animatable Avatars","summary":" This work addresses the problem of real-time rendering of photorealistic\nhuman body avatars learned from multi-view videos. While the classical\napproaches to model and render virtual humans generally use a textured mesh,\nrecent research has developed neural body representations that achieve\nimpressive visual quality. However, these models are difficult to render in\nreal-time and their quality degrades when the character is animated with body\nposes different than the training observations. We propose an animatable human\nmodel based on 3D Gaussian Splatting, that has recently emerged as a very\nefficient alternative to neural radiance fields. 
The body is represented by a\nset of gaussian primitives in a canonical space which is deformed with a coarse\nto fine approach that combines forward skinning and local non-rigid refinement.\nWe describe how to learn our Human Gaussian Splatting (HuGS) model in an\nend-to-end fashion from multi-view observations, and evaluate it against the\nstate-of-the-art approaches for novel pose synthesis of clothed body. Our\nmethod achieves 1.5 dB PSNR improvement over the state-of-the-art on THuman4\ndataset while being able to render in real-time (80 fps for 512x512\nresolution).\n","authors":["Arthur Moreau","Jifei Song","Helisa Dhamo","Richard Shaw","Yiren Zhou","Eduardo Pérez-Pellitero"],"pdf_url":"https://arxiv.org/pdf/2311.17113v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19584v1","updated":"2024-03-28T17:07:02Z","published":"2024-03-28T17:07:02Z","title":"Img2Loc: Revisiting Image Geolocalization using Multi-modality\n Foundation Models and Image-based Retrieval-Augmented Generation","summary":" Geolocating precise locations from images presents a challenging problem in\ncomputer vision and information retrieval.Traditional methods typically employ\neither classification, which dividing the Earth surface into grid cells and\nclassifying images accordingly, or retrieval, which identifying locations by\nmatching images with a database of image-location pairs. However,\nclassification-based approaches are limited by the cell size and cannot yield\nprecise predictions, while retrieval-based systems usually suffer from poor\nsearch quality and inadequate coverage of the global landscape at varied scale\nand aggregation levels. To overcome these drawbacks, we present Img2Loc, a\nnovel system that redefines image geolocalization as a text generation task.\nThis is achieved using cutting-edge large multi-modality models like GPT4V or\nLLaVA with retrieval augmented generation. Img2Loc first employs CLIP-based\nrepresentations to generate an image-based coordinate query database. It then\nuniquely combines query results with images itself, forming elaborate prompts\ncustomized for LMMs. When tested on benchmark datasets such as Im2GPS3k and\nYFCC4k, Img2Loc not only surpasses the performance of previous state-of-the-art\nmodels but does so without any model training.\n","authors":["Zhongliang Zhou","Jielu Zhang","Zihan Guan","Mengxuan Hu","Ni Lao","Lan Mu","Sheng Li","Gengchen Mai"],"pdf_url":"https://arxiv.org/pdf/2403.19584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18028v2","updated":"2024-03-28T17:06:15Z","published":"2024-03-26T18:29:39Z","title":"Predicting Species Occurrence Patterns from Partial Observations","summary":" To address the interlinked biodiversity and climate crises, we need an\nunderstanding of where species occur and how these patterns are changing.\nHowever, observational data on most species remains very limited, and the\namount of data available varies greatly between taxonomic groups. We introduce\nthe problem of predicting species occurrence patterns given (a) satellite\nimagery, and (b) known information on the occurrence of other species. To\nevaluate algorithms on this task, we introduce SatButterfly, a dataset of\nsatellite images, environmental data and observational data for butterflies,\nwhich is designed to pair with the existing SatBird dataset of bird\nobservational data. 
To address this task, we propose a general model, R-Tran,\nfor predicting species occurrence patterns that enables the use of partial\nobservational data wherever found. We find that R-Tran outperforms other\nmethods in predicting species encounter rates with partial information both\nwithin a taxon (birds) and across taxa (birds and butterflies). Our approach\nopens new perspectives to leveraging insights from species with abundant data\nto other species with scarce data, by modelling the ecosystems in which they\nco-occur.\n","authors":["Hager Radi Abdelwahed","Mélisande Teng","David Rolnick"],"pdf_url":"https://arxiv.org/pdf/2403.18028v2.pdf","comment":"Tackling Climate Change with Machine Learning workshop at ICLR 2024"},{"id":"http://arxiv.org/abs/2403.19580v1","updated":"2024-03-28T17:05:04Z","published":"2024-03-28T17:05:04Z","title":"OV-Uni3DETR: Towards Unified Open-Vocabulary 3D Object Detection via\n Cycle-Modality Propagation","summary":" In the current state of 3D object detection research, the severe scarcity of\nannotated 3D data, substantial disparities across different data modalities,\nand the absence of a unified architecture, have impeded the progress towards\nthe goal of universality. In this paper, we propose \\textbf{OV-Uni3DETR}, a\nunified open-vocabulary 3D detector via cycle-modality propagation. Compared\nwith existing 3D detectors, OV-Uni3DETR offers distinct advantages: 1)\nOpen-vocabulary 3D detection: During training, it leverages various accessible\ndata, especially extensive 2D detection images, to boost training diversity.\nDuring inference, it can detect both seen and unseen classes. 2) Modality\nunifying: It seamlessly accommodates input data from any given modality,\neffectively addressing scenarios involving disparate modalities or missing\nsensor information, thereby supporting test-time modality switching. 3) Scene\nunifying: It provides a unified multi-modal model architecture for diverse\nscenes collected by distinct sensors. Specifically, we propose the\ncycle-modality propagation, aimed at propagating knowledge bridging 2D and 3D\nmodalities, to support the aforementioned functionalities. 2D semantic\nknowledge from large-vocabulary learning guides novel class discovery in the 3D\ndomain, and 3D geometric knowledge provides localization supervision for 2D\ndetection images. OV-Uni3DETR achieves the state-of-the-art performance on\nvarious scenarios, surpassing existing methods by more than 6\\% on average. Its\nperformance using only RGB images is on par with or even surpasses that of\nprevious point cloud based methods. Code and pre-trained models will be\nreleased later.\n","authors":["Zhenyu Wang","Yali Li","Taichi Liu","Hengshuang Zhao","Shengjin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.19580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19579v1","updated":"2024-03-28T17:04:07Z","published":"2024-03-28T17:04:07Z","title":"The Bad Batches: Enhancing Self-Supervised Learning in Image\n Classification Through Representative Batch Curation","summary":" The pursuit of learning robust representations without human supervision is a\nlongstanding challenge. The recent advancements in self-supervised contrastive\nlearning approaches have demonstrated high performance across various\nrepresentation learning challenges. However, current methods depend on the\nrandom transformation of training examples, resulting in some cases of\nunrepresentative positive pairs that can have a large impact on learning. 
This\nlimitation not only impedes the convergence of the learning process but the\nrobustness of the learnt representation as well as requiring larger batch sizes\nto improve robustness to such bad batches. This paper attempts to alleviate the\ninfluence of false positive and false negative pairs by employing pairwise\nsimilarity calculations through the Fr\\'echet ResNet Distance (FRD), thereby\nobtaining robust representations from unlabelled data. The effectiveness of the\nproposed method is substantiated by empirical results, where a linear\nclassifier trained on self-supervised contrastive representations achieved an\nimpressive 87.74\\% top-1 accuracy on STL10 and 99.31\\% on the Flower102\ndataset. These results emphasize the potential of the proposed approach in\npushing the boundaries of the state-of-the-art in self-supervised contrastive\nlearning, particularly for image classification tasks.\n","authors":["Ozgu Goksu","Nicolas Pugeault"],"pdf_url":"https://arxiv.org/pdf/2403.19579v1.pdf","comment":"8 Pages, 4 figures, IEEE WCCI 2024 Conference"},{"id":"http://arxiv.org/abs/2402.19470v2","updated":"2024-03-28T16:52:45Z","published":"2024-02-29T18:57:39Z","title":"Towards Generalizable Tumor Synthesis","summary":" Tumor synthesis enables the creation of artificial tumors in medical images,\nfacilitating the training of AI models for tumor detection and segmentation.\nHowever, success in tumor synthesis hinges on creating visually realistic\ntumors that are generalizable across multiple organs and, furthermore, the\nresulting AI models being capable of detecting real tumors in images sourced\nfrom different domains (e.g., hospitals). This paper made a progressive stride\ntoward generalizable tumor synthesis by leveraging a critical observation:\nearly-stage tumors (< 2cm) tend to have similar imaging characteristics in\ncomputed tomography (CT), whether they originate in the liver, pancreas, or\nkidneys. We have ascertained that generative AI models, e.g., Diffusion Models,\ncan create realistic tumors generalized to a range of organs even when trained\non a limited number of tumor examples from only one organ. Moreover, we have\nshown that AI models trained on these synthetic tumors can be generalized to\ndetect and segment real tumors from CT volumes, encompassing a broad spectrum\nof patient demographics, imaging protocols, and healthcare facilities.\n","authors":["Qi Chen","Xiaoxi Chen","Haorui Song","Zhiwei Xiong","Alan Yuille","Chen Wei","Zongwei Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.19470v2.pdf","comment":"The IEEE / CVF Computer Vision and Pattern Recognition Conference\n (CVPR 2024)"},{"id":"http://arxiv.org/abs/2311.17112v2","updated":"2024-03-28T16:51:18Z","published":"2023-11-28T11:23:34Z","title":"Parameter Efficient Fine-tuning via Cross Block Orchestration for\n Segment Anything Model","summary":" Parameter-efficient fine-tuning (PEFT) is an effective methodology to unleash\nthe potential of large foundation models in novel scenarios with limited\ntraining data. In the computer vision community, PEFT has shown effectiveness\nin image classification, but little research has studied its ability for image\nsegmentation. Fine-tuning segmentation models usually require a heavier\nadjustment of parameters to align the proper projection directions in the\nparameter space for new scenarios. 
This raises a challenge to existing PEFT\nalgorithms, as they often inject a limited number of individual parameters into\neach block, which prevents substantial adjustment of the projection direction\nof the parameter space due to the limitation of Hidden Markov Chain along\nblocks. In this paper, we equip PEFT with a cross-block orchestration mechanism\nto enable the adaptation of the Segment Anything Model (SAM) to various\ndownstream scenarios. We introduce a novel inter-block communication module,\nwhich integrates a learnable relation matrix to facilitate communication among\ndifferent coefficient sets of each PEFT block's parameter space. Moreover, we\npropose an intra-block enhancement module, which introduces a linear projection\nhead whose weights are generated from a hyper-complex layer, further enhancing\nthe impact of the adjustment of projection directions on the entire parameter\nspace. Extensive experiments on diverse benchmarks demonstrate that our\nproposed approach consistently improves the segmentation performance\nsignificantly on novel scenarios with only around 1K additional parameters.\n","authors":["Zelin Peng","Zhengqin Xu","Zhilin Zeng","Lingxi Xie","Qi Tian","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2311.17112v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2312.02137v2","updated":"2024-03-28T16:50:37Z","published":"2023-12-04T18:56:22Z","title":"MANUS: Markerless Grasp Capture using Articulated 3D Gaussians","summary":" Understanding how we grasp objects with our hands has important applications\nin areas like robotics and mixed reality. However, this challenging problem\nrequires accurate modeling of the contact between hands and objects. To capture\ngrasps, existing methods use skeletons, meshes, or parametric models that does\nnot represent hand shape accurately resulting in inaccurate contacts. We\npresent MANUS, a method for Markerless Hand-Object Grasp Capture using\nArticulated 3D Gaussians. We build a novel articulated 3D Gaussians\nrepresentation that extends 3D Gaussian splatting for high-fidelity\nrepresentation of articulating hands. Since our representation uses Gaussian\nprimitives, it enables us to efficiently and accurately estimate contacts\nbetween the hand and the object. For the most accurate results, our method\nrequires tens of camera views that current datasets do not provide. We\ntherefore build MANUS-Grasps, a new dataset that contains hand-object grasps\nviewed from 50+ cameras across 30+ scenes, 3 subjects, and comprising over 7M\nframes. In addition to extensive qualitative results, we also show that our\nmethod outperforms others on a quantitative contact evaluation method that uses\npaint transfer from the object to the hand.\n","authors":["Chandradeep Pokhariya","Ishaan N Shah","Angela Xing","Zekun Li","Kefan Chen","Avinash Sharma","Srinath Sridhar"],"pdf_url":"https://arxiv.org/pdf/2312.02137v2.pdf","comment":"IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR)\n 2024"},{"id":"http://arxiv.org/abs/2312.11598v3","updated":"2024-03-28T16:49:40Z","published":"2023-12-18T18:16:52Z","title":"SkillDiffuser: Interpretable Hierarchical Planning via Skill\n Abstractions in Diffusion-Based Task Execution","summary":" Diffusion models have demonstrated strong potential for robotic trajectory\nplanning. However, generating coherent trajectories from high-level\ninstructions remains challenging, especially for long-range composition tasks\nrequiring multiple sequential skills. 
We propose SkillDiffuser, an end-to-end\nhierarchical planning framework integrating interpretable skill learning with\nconditional diffusion planning to address this problem. At the higher level,\nthe skill abstraction module learns discrete, human-understandable skill\nrepresentations from visual observations and language instructions. These\nlearned skill embeddings are then used to condition the diffusion model to\ngenerate customized latent trajectories aligned with the skills. This allows\ngenerating diverse state trajectories that adhere to the learnable skills. By\nintegrating skill learning with conditional trajectory generation,\nSkillDiffuser produces coherent behavior following abstract instructions across\ndiverse tasks. Experiments on multi-task robotic manipulation benchmarks like\nMeta-World and LOReL demonstrate state-of-the-art performance and\nhuman-interpretable skill representations from SkillDiffuser. More\nvisualization results and information could be found on our website.\n","authors":["Zhixuan Liang","Yao Mu","Hengbo Ma","Masayoshi Tomizuka","Mingyu Ding","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2312.11598v3.pdf","comment":"Accepted by CVPR 2024. Camera ready version. Project page:\n https://skilldiffuser.github.io/"},{"id":"http://arxiv.org/abs/2403.16385v2","updated":"2024-03-28T16:45:44Z","published":"2024-03-25T03:02:27Z","title":"Synthesize Step-by-Step: Tools, Templates and LLMs as Data Generators\n for Reasoning-Based Chart VQA","summary":" Understanding data visualizations like charts and plots requires reasoning\nabout both visual elements and numerics. Although strong in extractive\nquestions, current chart visual question answering (chart VQA) models suffer on\ncomplex reasoning questions. In this work, we address the lack of reasoning\nability by data augmentation. We leverage Large Language Models (LLMs), which\nhave shown to have strong reasoning ability, as an automatic data annotator\nthat generates question-answer annotations for chart images. The key innovation\nin our method lies in the Synthesize Step-by-Step strategy: our LLM-based data\ngenerator learns to decompose the complex question into step-by-step\nsub-questions (rationales), which are then used to derive the final answer\nusing external tools, i.e. Python. This step-wise generation procedure is\ntrained on synthetic data generated using a template-based QA generation\npipeline. Experimental results highlight the significance of the proposed\nstep-by-step generation. By training with the LLM-augmented data (LAMENDA), we\nsignificantly enhance the chart VQA models, achieving the state-of-the-art\naccuracy on the ChartQA and PlotQA datasets. In particular, our approach\nimproves the accuracy of the previous state-of-the-art approach from 38% to 54%\non the human-written questions in the ChartQA dataset, which needs strong\nreasoning. 
We hope our work underscores the potential of synthetic data and\nencourages further exploration of data augmentation using LLMs for\nreasoning-heavy tasks.\n","authors":["Zhuowan Li","Bhavan Jasani","Peng Tang","Shabnam Ghadar"],"pdf_url":"https://arxiv.org/pdf/2403.16385v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19554v1","updated":"2024-03-28T16:38:04Z","published":"2024-03-28T16:38:04Z","title":"Cross-Attention is Not Always Needed: Dynamic Cross-Attention for\n Audio-Visual Dimensional Emotion Recognition","summary":" In video-based emotion recognition, audio and visual modalities are often\nexpected to have a complementary relationship, which is widely explored using\ncross-attention. However, they may also exhibit weak complementary\nrelationships, resulting in poor representations of audio-visual features, thus\ndegrading the performance of the system. To address this issue, we propose\nDynamic Cross-Attention (DCA) that can dynamically select cross-attended or\nunattended features on the fly based on their strong or weak complementary\nrelationship with each other, respectively. Specifically, a simple yet\nefficient gating layer is designed to evaluate the contribution of the\ncross-attention mechanism and choose cross-attended features only when they\nexhibit a strong complementary relationship, otherwise unattended features. We\nevaluate the performance of the proposed approach on the challenging RECOLA and\nAff-Wild2 datasets. We also compare the proposed approach with other variants\nof cross-attention and show that the proposed model consistently improves the\nperformance on both datasets.\n","authors":["R. Gnana Praveen","Jahangir Alam"],"pdf_url":"https://arxiv.org/pdf/2403.19554v1.pdf","comment":"Accepted at IEEE ICME2024"},{"id":"http://arxiv.org/abs/2403.19549v1","updated":"2024-03-28T16:32:06Z","published":"2024-03-28T16:32:06Z","title":"GlORIE-SLAM: Globally Optimized RGB-only Implicit Encoding Point Cloud\n SLAM","summary":" Recent advancements in RGB-only dense Simultaneous Localization and Mapping\n(SLAM) have predominantly utilized grid-based neural implicit encodings and/or\nstruggle to efficiently realize global map and pose consistency. To this end,\nwe propose an efficient RGB-only dense SLAM system using a flexible neural\npoint cloud scene representation that adapts to keyframe poses and depth\nupdates, without needing costly backpropagation. Another critical challenge of\nRGB-only SLAM is the lack of geometric priors. To alleviate this issue, with\nthe aid of a monocular depth estimator, we introduce a novel DSPO layer for\nbundle adjustment which optimizes the pose and depth of keyframes along with\nthe scale of the monocular depth. Finally, our system benefits from loop\nclosure and online global bundle adjustment and performs either better or\ncompetitive to existing dense neural RGB SLAM methods in tracking, mapping and\nrendering accuracy on the Replica, TUM-RGBD and ScanNet datasets. The source\ncode will be made available.\n","authors":["Ganlin Zhang","Erik Sandström","Youmin Zhang","Manthan Patel","Luc Van Gool","Martin R. 
Oswald"],"pdf_url":"https://arxiv.org/pdf/2403.19549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15981v2","updated":"2024-03-28T16:21:30Z","published":"2024-03-24T02:15:14Z","title":"Exploring Accurate 3D Phenotyping in Greenhouse through Neural Radiance\n Fields","summary":" Accurate collection of plant phenotyping is critical to optimising\nsustainable farming practices in precision agriculture. Traditional phenotyping\nin controlled laboratory environments, while valuable, falls short in\nunderstanding plant growth under real-world conditions. Emerging sensor and\ndigital technologies offer a promising approach for direct phenotyping of\nplants in farm environments. This study investigates a learning-based\nphenotyping method using the Neural Radiance Field to achieve accurate in-situ\nphenotyping of pepper plants in greenhouse environments. To quantitatively\nevaluate the performance of this method, traditional point cloud registration\non 3D scanning data is implemented for comparison. Experimental result shows\nthat NeRF(Neural Radiance Fields) achieves competitive accuracy compared to the\n3D scanning methods. The mean distance error between the scanner-based method\nand the NeRF-based method is 0.865mm. This study shows that the learning-based\nNeRF method achieves similar accuracy to 3D scanning-based methods but with\nimproved scalability and robustness.\n","authors":["Junhong Zhao","Wei Ying","Yaoqiang Pan","Zhenfeng Yi","Chao Chen","Kewei Hu","Hanwen Kang"],"pdf_url":"https://arxiv.org/pdf/2403.15981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.08989v3","updated":"2024-03-28T16:17:43Z","published":"2022-04-13T21:20:42Z","title":"Efficient Deep Learning-based Estimation of the Vital Signs on\n Smartphones","summary":" With the increasing use of smartphones in our daily lives, these devices have\nbecome capable of performing many complex tasks. Concerning the need for\ncontinuous monitoring of vital signs, especially for the elderly or those with\ncertain types of diseases, the development of algorithms that can estimate\nvital signs using smartphones has attracted researchers worldwide. In\nparticular, researchers have been exploring ways to estimate vital signs, such\nas heart rate, oxygen saturation levels, and respiratory rate, using algorithms\nthat can be run on smartphones. However, many of these algorithms require\nmultiple pre-processing steps that might introduce some implementation\noverheads or require the design of a couple of hand-crafted stages to obtain an\noptimal result. To address this issue, this research proposes a novel\nend-to-end solution to mobile-based vital sign estimation using deep learning\nthat eliminates the need for pre-processing. By using a fully convolutional\narchitecture, the proposed model has much fewer parameters and less\ncomputational complexity compared to the architectures that use fully-connected\nlayers as the prediction heads. This also reduces the risk of overfitting.\nAdditionally, a public dataset for vital sign estimation, which includes 62\nvideos collected from 35 men and 27 women, is provided. 
Overall, the proposed\nend-to-end approach promises significantly improved efficiency and performance\nfor on-device health monitoring on readily available consumer electronics.\n","authors":["Taha Samavati","Mahdi Farvardin","Aboozar Ghaffari"],"pdf_url":"https://arxiv.org/pdf/2204.08989v3.pdf","comment":"10 pages, 8 figures, 11 tables"},{"id":"http://arxiv.org/abs/2403.19539v1","updated":"2024-03-28T16:13:22Z","published":"2024-03-28T16:13:22Z","title":"De-confounded Data-free Knowledge Distillation for Handling Distribution\n Shifts","summary":" Data-Free Knowledge Distillation (DFKD) is a promising task to train\nhigh-performance small models to enhance actual deployment without relying on\nthe original training data. Existing methods commonly avoid relying on private\ndata by utilizing synthetic or sampled data. However, a long-overlooked issue\nis that the severe distribution shifts between their substitution and original\ndata, which manifests as huge differences in the quality of images and class\nproportions. The harmful shifts are essentially the confounder that\nsignificantly causes performance bottlenecks. To tackle the issue, this paper\nproposes a novel perspective with causal inference to disentangle the student\nmodels from the impact of such shifts. By designing a customized causal graph,\nwe first reveal the causalities among the variables in the DFKD task.\nSubsequently, we propose a Knowledge Distillation Causal Intervention (KDCI)\nframework based on the backdoor adjustment to de-confound the confounder. KDCI\ncan be flexibly combined with most existing state-of-the-art baselines.\nExperiments in combination with six representative DFKD methods demonstrate the\neffectiveness of our KDCI, which can obviously help existing methods under\nalmost all settings, \\textit{e.g.}, improving the baseline by up to 15.54\\%\naccuracy on the CIFAR-100 dataset.\n","authors":["Yuzheng Wang","Dingkang Yang","Zhaoyu Chen","Yang Liu","Siao Liu","Wenqiang Zhang","Lihua Zhang","Lizhe Qi"],"pdf_url":"https://arxiv.org/pdf/2403.19539v1.pdf","comment":"Accepted by CVPR24"},{"id":"http://arxiv.org/abs/2403.19534v1","updated":"2024-03-28T16:07:55Z","published":"2024-03-28T16:07:55Z","title":"Locate, Assign, Refine: Taming Customized Image Inpainting with\n Text-Subject Guidance","summary":" Prior studies have made significant progress in image inpainting guided by\neither text or subject image. However, the research on editing with their\ncombined guidance is still in the early stages. To tackle this challenge, we\npresent LAR-Gen, a novel approach for image inpainting that enables seamless\ninpainting of masked scene images, incorporating both the textual prompts and\nspecified subjects. Our approach adopts a coarse-to-fine manner to ensure\nsubject identity preservation and local semantic coherence. The process\ninvolves (i) Locate: concatenating the noise with masked scene image to achieve\nprecise regional editing, (ii) Assign: employing decoupled cross-attention\nmechanism to accommodate multi-modal guidance, and (iii) Refine: using a novel\nRefineNet to supplement subject details. Additionally, to address the issue of\nscarce training data, we introduce a novel data construction pipeline. This\npipeline extracts substantial pairs of data consisting of local text prompts\nand corresponding visual instances from a vast image dataset, leveraging\npublicly available large models. 
Extensive experiments and varied application\nscenarios demonstrate the superiority of LAR-Gen in terms of both identity\npreservation and text semantic consistency. Project page can be found at\n\\url{https://ali-vilab.github.io/largen-page/}.\n","authors":["Yulin Pan","Chaojie Mao","Zeyinzi Jiang","Zhen Han","Jingfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.19534v1.pdf","comment":"22 pages, 14 figures"},{"id":"http://arxiv.org/abs/2403.19527v1","updated":"2024-03-28T16:02:03Z","published":"2024-03-28T16:02:03Z","title":"Instance-Adaptive and Geometric-Aware Keypoint Learning for\n Category-Level 6D Object Pose Estimation","summary":" Category-level 6D object pose estimation aims to estimate the rotation,\ntranslation and size of unseen instances within specific categories. In this\narea, dense correspondence-based methods have achieved leading performance.\nHowever, they do not explicitly consider the local and global geometric\ninformation of different instances, resulting in poor generalization ability to\nunseen instances with significant shape variations. To deal with this problem,\nwe propose a novel Instance-Adaptive and Geometric-Aware Keypoint Learning\nmethod for category-level 6D object pose estimation (AG-Pose), which includes\ntwo key designs: (1) The first design is an Instance-Adaptive Keypoint\nDetection module, which can adaptively detect a set of sparse keypoints for\nvarious instances to represent their geometric structures. (2) The second\ndesign is a Geometric-Aware Feature Aggregation module, which can efficiently\nintegrate the local and global geometric information into keypoint features.\nThese two modules can work together to establish robust keypoint-level\ncorrespondences for unseen instances, thus enhancing the generalization ability\nof the model.Experimental results on CAMERA25 and REAL275 datasets show that\nthe proposed AG-Pose outperforms state-of-the-art methods by a large margin\nwithout category-specific shape priors.\n","authors":["Xiao Lin","Wenfei Yang","Yuan Gao","Tianzhu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.19527v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2403.19522v1","updated":"2024-03-28T15:57:20Z","published":"2024-03-28T15:57:20Z","title":"Model Stock: All we need is just a few fine-tuned models","summary":" This paper introduces an efficient fine-tuning method for large pre-trained\nmodels, offering strong in-distribution (ID) and out-of-distribution (OOD)\nperformance. Breaking away from traditional practices that need a multitude of\nfine-tuned models for averaging, our approach employs significantly fewer\nmodels to achieve final weights yet yield superior accuracy. Drawing from key\ninsights in the weight space of fine-tuned weights, we uncover a strong link\nbetween the performance and proximity to the center of weight space. Based on\nthis, we introduce a method that approximates a center-close weight using only\ntwo fine-tuned models, applicable during or after training. Our innovative\nlayer-wise weight averaging technique surpasses state-of-the-art model methods\nsuch as Model Soup, utilizing only two fine-tuned models. This strategy can be\naptly coined Model Stock, highlighting its reliance on selecting a minimal\nnumber of models to draw a more optimized-averaged model. 
We demonstrate the\nefficacy of Model Stock with fine-tuned models based upon pre-trained CLIP\narchitectures, achieving remarkable performance on both ID and OOD tasks on the\nstandard benchmarks, all while barely bringing extra computational demands. Our\ncode and pre-trained models are available at\nhttps://github.com/naver-ai/model-stock.\n","authors":["Dong-Hwan Jang","Sangdoo Yun","Dongyoon Han"],"pdf_url":"https://arxiv.org/pdf/2403.19522v1.pdf","comment":"Code at https://github.com/naver-ai/model-stock"},{"id":"http://arxiv.org/abs/2401.01286v4","updated":"2024-03-28T15:56:55Z","published":"2024-01-02T16:54:58Z","title":"A Comprehensive Study of Knowledge Editing for Large Language Models","summary":" Large Language Models (LLMs) have shown extraordinary capabilities in\nunderstanding and generating text that closely mirrors human communication.\nHowever, a primary limitation lies in the significant computational demands\nduring training, arising from their extensive parameterization. This challenge\nis further intensified by the dynamic nature of the world, necessitating\nfrequent updates to LLMs to correct outdated information or integrate new\nknowledge, thereby ensuring their continued relevance. Note that many\napplications demand continual model adjustments post-training to address\ndeficiencies or undesirable behaviors. There is an increasing interest in\nefficient, lightweight methods for on-the-fly model modifications. To this end,\nrecent years have seen a burgeoning in the techniques of knowledge editing for\nLLMs, which aim to efficiently modify LLMs' behaviors within specific domains\nwhile preserving overall performance across various inputs. In this paper, we\nfirst define the knowledge editing problem and then provide a comprehensive\nreview of cutting-edge approaches. Drawing inspiration from educational and\ncognitive research theories, we propose a unified categorization criterion that\nclassifies knowledge editing methods into three groups: resorting to external\nknowledge, merging knowledge into the model, and editing intrinsic knowledge.\nFurthermore, we introduce a new benchmark, KnowEdit, for a comprehensive\nempirical evaluation of representative knowledge editing approaches.\nAdditionally, we provide an in-depth analysis of knowledge location, which can\ngive a deeper understanding of the knowledge structures inherent within LLMs.\nFinally, we discuss several potential applications of knowledge editing,\noutlining its broad and impactful implications.\n","authors":["Ningyu Zhang","Yunzhi Yao","Bozhong Tian","Peng Wang","Shumin Deng","Mengru Wang","Zekun Xi","Shengyu Mao","Jintian Zhang","Yuansheng Ni","Siyuan Cheng","Ziwen Xu","Xin Xu","Jia-Chen Gu","Yong Jiang","Pengjun Xie","Fei Huang","Lei Liang","Zhiqiang Zhang","Xiaowei Zhu","Jun Zhou","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2401.01286v4.pdf","comment":"Ongoing work; 52 pages, 282 citations; benchmark is available at\n https://huggingface.co/datasets/zjunlp/KnowEdit code is available at\n https://github.com/zjunlp/EasyEdit paper list is available at\n https://github.com/zjunlp/KnowledgeEditingPapers"},{"id":"http://arxiv.org/abs/2309.13610v2","updated":"2024-03-28T15:52:16Z","published":"2023-09-24T11:19:13Z","title":"VisionKG: Unleashing the Power of Visual Datasets via Knowledge Graph","summary":" The availability of vast amounts of visual data with heterogeneous features\nis a key factor for developing, testing, and benchmarking of new computer\nvision (CV) algorithms and architectures. 
Most visual datasets are created and\ncurated for specific tasks or with limited image data distribution for very\nspecific situations, and there is no unified approach to manage and access them\nacross diverse sources, tasks, and taxonomies. This not only creates\nunnecessary overheads when building robust visual recognition systems, but also\nintroduces biases into learning systems and limits the capabilities of\ndata-centric AI. To address these problems, we propose the Vision Knowledge\nGraph (VisionKG), a novel resource that interlinks, organizes and manages\nvisual datasets via knowledge graphs and Semantic Web technologies. It can\nserve as a unified framework facilitating simple access and querying of\nstate-of-the-art visual datasets, regardless of their heterogeneous formats and\ntaxonomies. One of the key differences between our approach and existing\nmethods is that ours is knowledge-based rather than metadatabased. It enhances\nthe enrichment of the semantics at both image and instance levels and offers\nvarious data retrieval and exploratory services via SPARQL. VisionKG currently\ncontains 519 million RDF triples that describe approximately 40 million\nentities, and are accessible at https://vision.semkg.org and through APIs. With\nthe integration of 30 datasets and four popular CV tasks, we demonstrate its\nusefulness across various scenarios when working with CV pipelines.\n","authors":["Jicheng Yuan","Anh Le-Tuan","Manh Nguyen-Duc","Trung-Kien Tran","Manfred Hauswirth","Danh Le-Phuoc"],"pdf_url":"https://arxiv.org/pdf/2309.13610v2.pdf","comment":"Accepted at ESWC 2024"},{"id":"http://arxiv.org/abs/2312.02069v2","updated":"2024-03-28T15:51:05Z","published":"2023-12-04T17:28:35Z","title":"GaussianAvatars: Photorealistic Head Avatars with Rigged 3D Gaussians","summary":" We introduce GaussianAvatars, a new method to create photorealistic head\navatars that are fully controllable in terms of expression, pose, and\nviewpoint. The core idea is a dynamic 3D representation based on 3D Gaussian\nsplats that are rigged to a parametric morphable face model. This combination\nfacilitates photorealistic rendering while allowing for precise animation\ncontrol via the underlying parametric model, e.g., through expression transfer\nfrom a driving sequence or by manually changing the morphable model parameters.\nWe parameterize each splat by a local coordinate frame of a triangle and\noptimize for explicit displacement offset to obtain a more accurate geometric\nrepresentation. During avatar reconstruction, we jointly optimize for the\nmorphable model parameters and Gaussian splat parameters in an end-to-end\nfashion. We demonstrate the animation capabilities of our photorealistic avatar\nin several challenging scenarios. 
For instance, we show reenactments from a\ndriving video, where our method outperforms existing works by a significant\nmargin.\n","authors":["Shenhan Qian","Tobias Kirschstein","Liam Schoneveld","Davide Davoli","Simon Giebenhain","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2312.02069v2.pdf","comment":"Project page: https://shenhanqian.github.io/gaussian-avatars"},{"id":"http://arxiv.org/abs/2308.16682v2","updated":"2024-03-28T15:49:42Z","published":"2023-08-31T12:36:50Z","title":"DiffusionPoser: Real-time Human Motion Reconstruction From Arbitrary\n Sparse Sensors Using Autoregressive Diffusion","summary":" Motion capture from a limited number of body-worn sensors, such as inertial\nmeasurement units (IMUs) and pressure insoles, has important applications in\nhealth, human performance, and entertainment. Recent work has focused on\naccurately reconstructing whole-body motion from a specific sensor\nconfiguration using six IMUs. While a common goal across applications is to use\nthe minimal number of sensors to achieve required accuracy, the optimal\narrangement of the sensors might differ from application to application. We\npropose a single diffusion model, DiffusionPoser, which reconstructs human\nmotion in real-time from an arbitrary combination of sensors, including IMUs\nplaced at specified locations, and, pressure insoles. Unlike existing methods,\nour model grants users the flexibility to determine the number and arrangement\nof sensors tailored to the specific activity of interest, without the need for\nretraining. A novel autoregressive inferencing scheme ensures real-time motion\nreconstruction that closely aligns with measured sensor signals. The generative\nnature of DiffusionPoser ensures realistic behavior, even for\ndegrees-of-freedom not directly measured. Qualitative results can be found on\nour website: https://diffusionposer.github.io/.\n","authors":["Tom Van Wouwe","Seunghwan Lee","Antoine Falisse","Scott Delp","C. Karen Liu"],"pdf_url":"https://arxiv.org/pdf/2308.16682v2.pdf","comment":"accepted at CVPR2024"},{"id":"http://arxiv.org/abs/2403.19517v1","updated":"2024-03-28T15:48:16Z","published":"2024-03-28T15:48:16Z","title":"XScale-NVS: Cross-Scale Novel View Synthesis with Hash Featurized\n Manifold","summary":" We propose XScale-NVS for high-fidelity cross-scale novel view synthesis of\nreal-world large-scale scenes. Existing representations based on explicit\nsurface suffer from discretization resolution or UV distortion, while implicit\nvolumetric representations lack scalability for large scenes due to the\ndispersed weight distribution and surface ambiguity. In light of the above\nchallenges, we introduce hash featurized manifold, a novel hash-based\nfeaturization coupled with a deferred neural rendering framework. This approach\nfully unlocks the expressivity of the representation by explicitly\nconcentrating the hash entries on the 2D manifold, thus effectively\nrepresenting highly detailed contents independent of the discretization\nresolution. We also introduce a novel dataset, namely GigaNVS, to benchmark\ncross-scale, high-resolution novel view synthesis of realworld large-scale\nscenes. Our method significantly outperforms competing baselines on various\nreal-world scenes, yielding an average LPIPS that is 40% lower than prior\nstate-of-the-art on the challenging GigaNVS benchmark. 
Please see our project\npage at: xscalenvs.github.io.\n","authors":["Guangyu Wang","Jinzhi Zhang","Fan Wang","Ruqi Huang","Lu Fang"],"pdf_url":"https://arxiv.org/pdf/2403.19517v1.pdf","comment":"Accepted to CVPR 2024. Project page: xscalenvs.github.io/"},{"id":"http://arxiv.org/abs/2403.19514v1","updated":"2024-03-28T15:45:03Z","published":"2024-03-28T15:45:03Z","title":"CDIMC-net: Cognitive Deep Incomplete Multi-view Clustering Network","summary":" In recent years, incomplete multi-view clustering, which studies the\nchallenging multi-view clustering problem on missing views, has received\ngrowing research interests. Although a series of methods have been proposed to\naddress this issue, the following problems still exist: 1) Almost all of the\nexisting methods are based on shallow models, which is difficult to obtain\ndiscriminative common representations. 2) These methods are generally sensitive\nto noise or outliers since the negative samples are treated equally as the\nimportant samples. In this paper, we propose a novel incomplete multi-view\nclustering network, called Cognitive Deep Incomplete Multi-view Clustering\nNetwork (CDIMC-net), to address these issues. Specifically, it captures the\nhigh-level features and local structure of each view by incorporating the\nview-specific deep encoders and graph embedding strategy into a framework.\nMoreover, based on the human cognition, i.e., learning from easy to hard, it\nintroduces a self-paced strategy to select the most confident samples for model\ntraining, which can reduce the negative influence of outliers. Experimental\nresults on several incomplete datasets show that CDIMC-net outperforms the\nstate-of-the-art incomplete multi-view clustering methods.\n","authors":["Jie Wen","Zheng Zhang","Yong Xu","Bob Zhang","Lunke Fei","Guo-Sen Xie"],"pdf_url":"https://arxiv.org/pdf/2403.19514v1.pdf","comment":"Accepted by IJCAI 2020"},{"id":"http://arxiv.org/abs/2403.19508v1","updated":"2024-03-28T15:41:43Z","published":"2024-03-28T15:41:43Z","title":"Debiasing Cardiac Imaging with Controlled Latent Diffusion Models","summary":" The progress in deep learning solutions for disease diagnosis and prognosis\nbased on cardiac magnetic resonance imaging is hindered by highly imbalanced\nand biased training data. To address this issue, we propose a method to\nalleviate imbalances inherent in datasets through the generation of synthetic\ndata based on sensitive attributes such as sex, age, body mass index, and\nhealth condition. We adopt ControlNet based on a denoising diffusion\nprobabilistic model to condition on text assembled from patient metadata and\ncardiac geometry derived from segmentation masks using a large-cohort study,\nspecifically, the UK Biobank. We assess our method by evaluating the realism of\nthe generated images using established quantitative metrics. Furthermore, we\nconduct a downstream classification task aimed at debiasing a classifier by\nrectifying imbalances within underrepresented groups through synthetically\ngenerated samples. Our experiments demonstrate the effectiveness of the\nproposed approach in mitigating dataset imbalances, such as the scarcity of\nyounger patients or individuals with normal BMI level suffering from heart\nfailure. This work represents a major step towards the adoption of synthetic\ndata for the development of fair and generalizable models for medical\nclassification tasks. 
Notably, we conduct all our experiments using a single,\nconsumer-level GPU to highlight the feasibility of our approach within\nresource-constrained environments. Our code is available at\nhttps://github.com/faildeny/debiasing-cardiac-mri.\n","authors":["Grzegorz Skorupko","Richard Osuala","Zuzanna Szafranowska","Kaisar Kushibar","Nay Aung","Steffen E Petersen","Karim Lekadir","Polyxeni Gkontra"],"pdf_url":"https://arxiv.org/pdf/2403.19508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02496v3","updated":"2024-03-28T15:33:42Z","published":"2023-07-04T08:00:31Z","title":"Learning to reconstruct the bubble distribution with conductivity maps\n using Invertible Neural Networks and Error Diffusion","summary":" Electrolysis is crucial for eco-friendly hydrogen production, but gas bubbles\ngenerated during the process hinder reactions, reduce cell efficiency, and\nincrease energy consumption. Additionally, these gas bubbles cause changes in\nthe conductivity inside the cell, resulting in corresponding variations in the\ninduced magnetic field around the cell. Therefore, measuring these gas\nbubble-induced magnetic field fluctuations using external magnetic sensors and\nsolving the inverse problem of Biot-Savart Law allows for estimating the\nconductivity in the cell and, thus, bubble size and location. However,\ndetermining high-resolution conductivity maps from only a few induced magnetic\nfield measurements is an ill-posed inverse problem. To overcome this, we\nexploit Invertible Neural Networks (INNs) to reconstruct the conductivity\nfield. Our qualitative results and quantitative evaluation using random error\ndiffusion show that INN achieves far superior performance compared to Tikhonov\nregularization.\n","authors":["Nishant Kumar","Lukas Krause","Thomas Wondrak","Sven Eckert","Kerstin Eckert","Stefan Gumhold"],"pdf_url":"https://arxiv.org/pdf/2307.02496v3.pdf","comment":"Accepted for Oral presentation at WCIPT11 (11th World Congress on\n Industrial Process Tomography)"},{"id":"http://arxiv.org/abs/2403.19501v1","updated":"2024-03-28T15:31:36Z","published":"2024-03-28T15:31:36Z","title":"RELI11D: A Comprehensive Multimodal Human Motion Dataset and Method","summary":" Comprehensive capturing of human motions requires both accurate captures of\ncomplex poses and precise localization of the human within scenes. Most of the\nHPE datasets and methods primarily rely on RGB, LiDAR, or IMU data. However,\nsolely using these modalities or a combination of them may not be adequate for\nHPE, particularly for complex and fast movements. For holistic human motion\nunderstanding, we present RELI11D, a high-quality multimodal human motion\ndataset involves LiDAR, IMU system, RGB camera, and Event camera. It records\nthe motions of 10 actors performing 5 sports in 7 scenes, including 3.32 hours\nof synchronized LiDAR point clouds, IMU measurement data, RGB videos and Event\nsteams. Through extensive experiments, we demonstrate that the RELI11D presents\nconsiderable challenges and opportunities as it contains many rapid and complex\nmotions that require precise location. To address the challenge of integrating\ndifferent modalities, we propose LEIR, a multimodal baseline that effectively\nutilizes LiDAR Point Cloud, Event stream, and RGB through our cross-attention\nfusion strategy. We show that LEIR exhibits promising results for rapid motions\nand daily motions and that utilizing the characteristics of multiple modalities\ncan indeed improve HPE performance. 
Both the dataset and source code will be\nreleased publicly to the research community, fostering collaboration and\nenabling further exploration in this field.\n","authors":["Ming Yan","Yan Zhang","Shuqiang Cai","Shuqi Fan","Xincheng Lin","Yudi Dai","Siqi Shen","Chenglu Wen","Lan Xu","Yuexin Ma","Cheng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.19501v1.pdf","comment":"CVPR2024, Project website: http://www.lidarhumanmotion.net/reli11d/"},{"id":"http://arxiv.org/abs/2403.19497v1","updated":"2024-03-28T15:27:34Z","published":"2024-03-28T15:27:34Z","title":"Surface-based parcellation and vertex-wise analysis of ultra\n high-resolution ex vivo 7 tesla MRI in neurodegenerative diseases","summary":" Magnetic resonance imaging (MRI) is the standard modality to understand human\nbrain structure and function in vivo (antemortem). Decades of research in human\nneuroimaging has led to the widespread development of methods and tools to\nprovide automated volume-based segmentations and surface-based parcellations\nwhich help localize brain functions to specialized anatomical regions. Recently\nex vivo (postmortem) imaging of the brain has opened-up avenues to study brain\nstructure at sub-millimeter ultra high-resolution revealing details not\npossible to observe with in vivo MRI. Unfortunately, there has been limited\nmethodological development in ex vivo MRI primarily due to lack of datasets and\nlimited centers with such imaging resources. Therefore, in this work, we\npresent one-of-its-kind dataset of 82 ex vivo T2w whole brain hemispheres MRI\nat 0.3 mm isotropic resolution spanning Alzheimer's disease and related\ndementias. We adapted and developed a fast and easy-to-use automated\nsurface-based pipeline to parcellate, for the first time, ultra high-resolution\nex vivo brain tissue at the native subject space resolution using the\nDesikan-Killiany-Tourville (DKT) brain atlas. This allows us to perform\nvertex-wise analysis in the template space and thereby link morphometry\nmeasures with pathology measurements derived from histology. We will\nopen-source our dataset docker container, Jupyter notebooks for ready-to-use\nout-of-the-box set of tools and command line options to advance ex vivo MRI\nclinical brain imaging research on the project webpage.\n","authors":["Pulkit Khandelwal","Michael Tran Duong","Constanza Fuentes","Amanda Denning","Winifred Trotman","Ranjit Ittyerah","Alejandra Bahena","Theresa Schuck","Marianna Gabrielyan","Karthik Prabhakaran","Daniel Ohm","Gabor Mizsei","John Robinson","Monica Munoz","John Detre","Edward Lee","David Irwin","Corey McMillan","M. Dylan Tisdall","Sandhitsu Das","David Wolk","Paul A. Yushkevich"],"pdf_url":"https://arxiv.org/pdf/2403.19497v1.pdf","comment":"Under review at MICCAI 2024"},{"id":"http://arxiv.org/abs/2403.19495v1","updated":"2024-03-28T15:27:13Z","published":"2024-03-28T15:27:13Z","title":"CoherentGS: Sparse Novel View Synthesis with Coherent 3D Gaussians","summary":" The field of 3D reconstruction from images has rapidly evolved in the past\nfew years, first with the introduction of Neural Radiance Field (NeRF) and more\nrecently with 3D Gaussian Splatting (3DGS). The latter provides a significant\nedge over NeRF in terms of the training and inference speed, as well as the\nreconstruction quality. 
Although 3DGS works well for dense input images, the\nunstructured point-cloud like representation quickly overfits to the more\nchallenging setup of extremely sparse input images (e.g., 3 images), creating a\nrepresentation that appears as a jumble of needles from novel views. To address\nthis issue, we propose regularized optimization and depth-based initialization.\nOur key idea is to introduce a structured Gaussian representation that can be\ncontrolled in 2D image space. We then constraint the Gaussians, in particular\ntheir position, and prevent them from moving independently during optimization.\nSpecifically, we introduce single and multiview constraints through an implicit\nconvolutional decoder and a total variation loss, respectively. With the\ncoherency introduced to the Gaussians, we further constrain the optimization\nthrough a flow-based loss function. To support our regularized optimization, we\npropose an approach to initialize the Gaussians using monocular depth estimates\nat each input view. We demonstrate significant improvements compared to the\nstate-of-the-art sparse-view NeRF-based approaches on a variety of scenes.\n","authors":["Avinash Paliwal","Wei Ye","Jinhui Xiong","Dmytro Kotovenko","Rakesh Ranjan","Vikas Chandra","Nima Khademi Kalantari"],"pdf_url":"https://arxiv.org/pdf/2403.19495v1.pdf","comment":"Project page: https://people.engr.tamu.edu/nimak/Papers/CoherentGS"},{"id":"http://arxiv.org/abs/2403.14472v2","updated":"2024-03-28T15:24:17Z","published":"2024-03-21T15:18:30Z","title":"Detoxifying Large Language Models via Knowledge Editing","summary":" This paper investigates using knowledge editing techniques to detoxify Large\nLanguage Models (LLMs). We construct a benchmark, SafeEdit, which covers nine\nunsafe categories with various powerful attack prompts and equips comprehensive\nmetrics for systematic evaluation. We conduct experiments with several\nknowledge editing approaches, indicating that knowledge editing has the\npotential to efficiently detoxify LLMs with limited impact on general\nperformance. Then, we propose a simple yet effective baseline, dubbed\nDetoxifying with Intraoperative Neural Monitoring (DINM), to diminish the\ntoxicity of LLMs within a few tuning steps via only one instance. We further\nprovide an in-depth analysis of the internal mechanism for various detoxify\napproaches, demonstrating that previous methods like SFT and DPO may merely\nsuppress the activations of toxic parameters, while DINM mitigates the toxicity\nof the toxic parameters to a certain extent, making permanent adjustments. We\nhope that these insights could shed light on future work of developing\ndetoxifying approaches and the underlying knowledge mechanisms of LLMs. Code\nand benchmark are available at https://github.com/zjunlp/EasyEdit.\n","authors":["Mengru Wang","Ningyu Zhang","Ziwen Xu","Zekun Xi","Shumin Deng","Yunzhi Yao","Qishen Zhang","Linyi Yang","Jindong Wang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2403.14472v2.pdf","comment":"Ongoing work. Project website:\n https://zjunlp.github.io/project/SafeEdit Due to the specificity of the\n knowledge editing setting, we revise Tables 1 and 3 to present a fair\n comparison of experimental results. More experimental results will be updated\n soon"},{"id":"http://arxiv.org/abs/2403.17608v2","updated":"2024-03-28T15:24:16Z","published":"2024-03-26T11:39:00Z","title":"Fake or JPEG? 
Revealing Common Biases in Generated Image Detection\n Datasets","summary":" The widespread adoption of generative image models has highlighted the urgent\nneed to detect artificial content, which is a crucial step in combating\nwidespread manipulation and misinformation. Consequently, numerous detectors\nand associated datasets have emerged. However, many of these datasets\ninadvertently introduce undesirable biases, thereby impacting the effectiveness\nand evaluation of detectors. In this paper, we emphasize that many datasets for\nAI-generated image detection contain biases related to JPEG compression and\nimage size. Using the GenImage dataset, we demonstrate that detectors indeed\nlearn from these undesired factors. Furthermore, we show that removing the\nnamed biases substantially increases robustness to JPEG compression and\nsignificantly alters the cross-generator performance of evaluated detectors.\nSpecifically, it leads to more than 11 percentage points increase in\ncross-generator performance for ResNet50 and Swin-T detectors on the GenImage\ndataset, achieving state-of-the-art results.\n We provide the dataset and source codes of this paper on the anonymous\nwebsite: https://www.unbiased-genimage.org\n","authors":["Patrick Grommelt","Louis Weiss","Franz-Josef Pfreundt","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2403.17608v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19492v1","updated":"2024-03-28T15:23:52Z","published":"2024-03-28T15:23:52Z","title":"Segmentation tool for images of cracks","summary":" Safety-critical infrastructures, such as bridges, are periodically inspected\nto check for existing damage, such as fatigue cracks and corrosion, and to\nguarantee the safe use of the infrastructure. Visual inspection is the most\nfrequent type of general inspection, despite the fact that its detection\ncapability is rather limited, especially for fatigue cracks. Machine learning\nalgorithms can be used for augmenting the capability of classical visual\ninspection of bridge structures, however, the implementation of such an\nalgorithm requires a massive annotated training dataset, which is\ntime-consuming to produce. This paper proposes a semi-automatic crack\nsegmentation tool that eases the manual segmentation of cracks on images needed\nto create a training dataset for a machine learning algorithm. Also, it can be\nused to measure the geometry of the crack. This tool makes use of an image\nprocessing algorithm, which was initially developed for the analysis of\nvascular systems on retinal images. The algorithm relies on a multi-orientation\nwavelet transform, which is applied to the image to construct the so-called\n\"orientation scores\", i.e. a modified version of the image. Afterwards, the\nfiltered orientation scores are used to formulate an optimal path problem that\nidentifies the crack. The globally optimal path between manually selected crack\nendpoints is computed, using a state-of-the-art geometric tracking method. The\npixel-wise segmentation is done afterwards using the obtained crack path. The\nproposed method outperforms fully automatic methods and shows potential to be\nan adequate alternative to the manual data annotation.\n","authors":["Andrii Kompanets","Remco Duits","Davide Leonetti","Nicky van den Berg","H. 
H."," Snijder"],"pdf_url":"https://arxiv.org/pdf/2403.19492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19490v1","updated":"2024-03-28T15:22:29Z","published":"2024-03-28T15:22:29Z","title":"Jointly Training and Pruning CNNs via Learnable Agent Guidance and\n Alignment","summary":" Structural model pruning is a prominent approach used for reducing the\ncomputational cost of Convolutional Neural Networks (CNNs) before their\ndeployment on resource-constrained devices. Yet, the majority of proposed ideas\nrequire a pretrained model before pruning, which is costly to secure. In this\npaper, we propose a novel structural pruning approach to jointly learn the\nweights and structurally prune architectures of CNN models. The core element of\nour method is a Reinforcement Learning (RL) agent whose actions determine the\npruning ratios of the CNN model's layers, and the resulting model's accuracy\nserves as its reward. We conduct the joint training and pruning by iteratively\ntraining the model's weights and the agent's policy, and we regularize the\nmodel's weights to align with the selected structure by the agent. The evolving\nmodel's weights result in a dynamic reward function for the agent, which\nprevents using prominent episodic RL methods with stationary environment\nassumption for our purpose. We address this challenge by designing a mechanism\nto model the complex changing dynamics of the reward function and provide a\nrepresentation of it to the RL agent. To do so, we take a learnable embedding\nfor each training epoch and employ a recurrent model to calculate a\nrepresentation of the changing environment. We train the recurrent model and\nembeddings using a decoder model to reconstruct observed rewards. Such a design\nempowers our agent to effectively leverage episodic observations along with the\nenvironment representations to learn a proper policy to determine performant\nsub-networks of the CNN model. Our extensive experiments on CIFAR-10 and\nImageNet using ResNets and MobileNets demonstrate the effectiveness of our\nmethod.\n","authors":["Alireza Ganjdanesh","Shangqian Gao","Heng Huang"],"pdf_url":"https://arxiv.org/pdf/2403.19490v1.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition, CVPR\n 2024"},{"id":"http://arxiv.org/abs/2311.16516v4","updated":"2024-03-28T15:15:04Z","published":"2023-11-27T18:20:03Z","title":"Segment Every Out-of-Distribution Object","summary":" Semantic segmentation models, while effective for in-distribution categories,\nface challenges in real-world deployment due to encountering\nout-of-distribution (OoD) objects. Detecting these OoD objects is crucial for\nsafety-critical applications. Existing methods rely on anomaly scores, but\nchoosing a suitable threshold for generating masks presents difficulties and\ncan lead to fragmentation and inaccuracy. This paper introduces a method to\nconvert anomaly \\textbf{S}core \\textbf{T}o segmentation \\textbf{M}ask, called\nS2M, a simple and effective framework for OoD detection in semantic\nsegmentation. Unlike assigning anomaly scores to pixels, S2M directly segments\nthe entire OoD object. 
By transforming anomaly scores into prompts for a\npromptable segmentation model, S2M eliminates the need for threshold selection.\nExtensive experiments demonstrate that S2M outperforms the state-of-the-art by\napproximately 20% in IoU and 40% in mean F1 score, on average, across various\nbenchmarks including Fishyscapes, Segment-Me-If-You-Can, and RoadAnomaly\ndatasets.\n","authors":["Wenjie Zhao","Jia Li","Xin Dong","Yu Xiang","Yunhui Guo"],"pdf_url":"https://arxiv.org/pdf/2311.16516v4.pdf","comment":"20 pages, 14 figures"},{"id":"http://arxiv.org/abs/2403.19474v1","updated":"2024-03-28T15:01:58Z","published":"2024-03-28T15:01:58Z","title":"SG-PGM: Partial Graph Matching Network with Semantic Geometric Fusion\n for 3D Scene Graph Alignment and Its Downstream Tasks","summary":" Scene graphs have been recently introduced into 3D spatial understanding as a\ncomprehensive representation of the scene. The alignment between 3D scene\ngraphs is the first step of many downstream tasks such as scene graph aided\npoint cloud registration, mosaicking, overlap checking, and robot navigation.\nIn this work, we treat 3D scene graph alignment as a partial graph-matching\nproblem and propose to solve it with a graph neural network. We reuse the\ngeometric features learned by a point cloud registration method and associate\nthe clustered point-level geometric features with the node-level semantic\nfeature via our designed feature fusion module. Partial matching is enabled by\nusing a learnable method to select the top-k similar node pairs. Subsequent\ndownstream tasks such as point cloud registration are achieved by running a\npre-trained registration network within the matched regions. We further propose\na point-matching rescoring method, that uses the node-wise alignment of the 3D\nscene graph to reweight the matching candidates from a pre-trained point cloud\nregistration method. It reduces the false point correspondences estimated\nespecially in low-overlapping cases. Experiments show that our method improves\nthe alignment accuracy by 10~20% in low-overlap and random transformation\nscenarios and outperforms the existing work in multiple downstream tasks.\n","authors":["Yaxu Xie","Alain Pagani","Didier Stricker"],"pdf_url":"https://arxiv.org/pdf/2403.19474v1.pdf","comment":"16 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.15905v3","updated":"2024-03-28T15:00:04Z","published":"2024-03-23T18:19:02Z","title":"Towards Low-Energy Adaptive Personalization for Resource-Constrained\n Devices","summary":" The personalization of machine learning (ML) models to address data drift is\na significant challenge in the context of Internet of Things (IoT)\napplications. Presently, most approaches focus on fine-tuning either the full\nbase model or its last few layers to adapt to new data, while often neglecting\nenergy costs. However, various types of data drift exist, and fine-tuning the\nfull base model or the last few layers may not result in optimal performance in\ncertain scenarios. We propose Target Block Fine-Tuning (TBFT), a low-energy\nadaptive personalization framework designed for resource-constrained devices.\nWe categorize data drift and personalization into three types: input-level,\nfeature-level, and output-level. For each type, we fine-tune different blocks\nof the model to achieve optimal performance with reduced energy costs.\nSpecifically, input-, feature-, and output-level correspond to fine-tuning the\nfront, middle, and rear blocks of the model. 
We evaluate TBFT on a ResNet\nmodel, three datasets, three different training sizes, and a Raspberry Pi.\nCompared with the $Block Avg$, where each block is fine-tuned individually and\ntheir performance improvements are averaged, TBFT exhibits an improvement in\nmodel accuracy by an average of 15.30% whilst saving 41.57% energy consumption\non average compared with full fine-tuning.\n","authors":["Yushan Huang","Josh Millar","Yuxuan Long","Yuchen Zhao","Hamed Hadaddi"],"pdf_url":"https://arxiv.org/pdf/2403.15905v3.pdf","comment":"Accepetd to The 4th Workshop on Machine Learning and Systems\n (EuroMLSys '24)"},{"id":"http://arxiv.org/abs/2403.19473v1","updated":"2024-03-28T14:59:56Z","published":"2024-03-28T14:59:56Z","title":"Benchmarking Implicit Neural Representation and Geometric Rendering in\n Real-Time RGB-D SLAM","summary":" Implicit neural representation (INR), in combination with geometric\nrendering, has recently been employed in real-time dense RGB-D SLAM. Despite\nactive research endeavors being made, there lacks a unified protocol for fair\nevaluation, impeding the evolution of this area. In this work, we establish, to\nour knowledge, the first open-source benchmark framework to evaluate the\nperformance of a wide spectrum of commonly used INRs and rendering functions\nfor mapping and localization. The goal of our benchmark is to 1) gain an\nintuition of how different INRs and rendering functions impact mapping and\nlocalization and 2) establish a unified evaluation protocol w.r.t. the design\nchoices that may impact the mapping and localization. With the framework, we\nconduct a large suite of experiments, offering various insights in choosing the\nINRs and geometric rendering functions: for example, the dense feature grid\noutperforms other INRs (e.g. tri-plane and hash grid), even when geometric and\ncolor features are jointly encoded for memory efficiency. To extend the\nfindings into the practical scenario, a hybrid encoding strategy is proposed to\nbring the best of the accuracy and completion from the grid-based and\ndecomposition-based INRs. We further propose explicit hybrid encoding for\nhigh-fidelity dense grid mapping to comply with the RGB-D SLAM system that puts\nthe premise on robustness and computation efficiency.\n","authors":["Tongyan Hua","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.19473v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18361v2","updated":"2024-03-28T14:59:44Z","published":"2024-03-27T08:53:13Z","title":"ViTAR: Vision Transformer with Any Resolution","summary":" This paper tackles a significant challenge faced by Vision Transformers\n(ViTs): their constrained scalability across different image resolutions.\nTypically, ViTs experience a performance decline when processing resolutions\ndifferent from those seen during training. Our work introduces two key\ninnovations to address this issue. Firstly, we propose a novel module for\ndynamic resolution adjustment, designed with a single Transformer block,\nspecifically to achieve highly efficient incremental token integration.\nSecondly, we introduce fuzzy positional encoding in the Vision Transformer to\nprovide consistent positional awareness across multiple resolutions, thereby\npreventing overfitting to any single training resolution. 
Our resulting model,\nViTAR (Vision Transformer with Any Resolution), demonstrates impressive\nadaptability, achieving 83.3\\% top-1 accuracy at a 1120x1120 resolution and\n80.4\\% accuracy at a 4032x4032 resolution, all while reducing computational\ncosts. ViTAR also shows strong performance in downstream tasks such as instance\nand semantic segmentation and can easily combined with self-supervised learning\ntechniques like Masked AutoEncoder. Our work provides a cost-effective solution\nfor enhancing the resolution scalability of ViTs, paving the way for more\nversatile and efficient high-resolution image processing.\n","authors":["Qihang Fan","Quanzeng You","Xiaotian Han","Yongfei Liu","Yunzhe Tao","Huaibo Huang","Ran He","Hongxia Yang"],"pdf_url":"https://arxiv.org/pdf/2403.18361v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17216v2","updated":"2024-03-28T14:58:59Z","published":"2023-11-28T20:40:45Z","title":"Self-Discovering Interpretable Diffusion Latent Directions for\n Responsible Text-to-Image Generation","summary":" Diffusion-based models have gained significant popularity for text-to-image\ngeneration due to their exceptional image-generation capabilities. A risk with\nthese models is the potential generation of inappropriate content, such as\nbiased or harmful images. However, the underlying reasons for generating such\nundesired content from the perspective of the diffusion model's internal\nrepresentation remain unclear. Previous work interprets vectors in an\ninterpretable latent space of diffusion models as semantic concepts. However,\nexisting approaches cannot discover directions for arbitrary concepts, such as\nthose related to inappropriate concepts. In this work, we propose a novel\nself-supervised approach to find interpretable latent directions for a given\nconcept. With the discovered vectors, we further propose a simple approach to\nmitigate inappropriate generation. Extensive experiments have been conducted to\nverify the effectiveness of our mitigation approach, namely, for fair\ngeneration, safe generation, and responsible text-enhancing generation. Project\npage: \\url{https://interpretdiffusion.github.io}.\n","authors":["Hang Li","Chengzhi Shen","Philip Torr","Volker Tresp","Jindong Gu"],"pdf_url":"https://arxiv.org/pdf/2311.17216v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19467v1","updated":"2024-03-28T14:47:32Z","published":"2024-03-28T14:47:32Z","title":"Beyond Talking -- Generating Holistic 3D Human Dyadic Motion for\n Communication","summary":" In this paper, we introduce an innovative task focused on human\ncommunication, aiming to generate 3D holistic human motions for both speakers\nand listeners. Central to our approach is the incorporation of factorization to\ndecouple audio features and the combination of textual semantic information,\nthereby facilitating the creation of more realistic and coordinated movements.\nWe separately train VQ-VAEs with respect to the holistic motions of both\nspeaker and listener. We consider the real-time mutual influence between the\nspeaker and the listener and propose a novel chain-like transformer-based\nauto-regressive model specifically designed to characterize real-world\ncommunication scenarios effectively which can generate the motions of both the\nspeaker and the listener simultaneously. These designs ensure that the results\nwe generate are both coordinated and diverse. Our approach demonstrates\nstate-of-the-art performance on two benchmark datasets. 
Furthermore, we\nintroduce the HoCo holistic communication dataset, which is a valuable resource\nfor future research. Our HoCo dataset and code will be released for research\npurposes upon acceptance.\n","authors":["Mingze Sun","Chao Xu","Xinyu Jiang","Yang Liu","Baigui Sun","Ruqi Huang"],"pdf_url":"https://arxiv.org/pdf/2403.19467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19456v1","updated":"2024-03-28T14:27:36Z","published":"2024-03-28T14:27:36Z","title":"Break-for-Make: Modular Low-Rank Adaptations for Composable\n Content-Style Customization","summary":" Personalized generation paradigms empower designers to customize visual\nintellectual properties with the help of textual descriptions by tuning or\nadapting pre-trained text-to-image models on a few images. Recent works explore\napproaches for concurrently customizing both content and detailed visual style\nappearance. However, these existing approaches often generate images where the\ncontent and style are entangled. In this study, we reconsider the customization\nof content and style concepts from the perspective of parameter space\nconstruction. Unlike existing methods that utilize a shared parameter space for\ncontent and style, we propose a learning framework that separates the parameter\nspace to facilitate individual learning of content and style, thereby enabling\ndisentangled content and style. To achieve this goal, we introduce \"partly\nlearnable projection\" (PLP) matrices to separate the original adapters into\ndivided sub-parameter spaces. We propose \"break-for-make\" customization\nlearning pipeline based on PLP, which is simple yet effective. We break the\noriginal adapters into \"up projection\" and \"down projection\", train content and\nstyle PLPs individually with the guidance of corresponding textual prompts in\nthe separate adapters, and maintain generalization by employing a\nmulti-correspondence projection learning strategy. Based on the adapters broken\napart for separate training content and style, we then make the entity\nparameter space by reconstructing the content and style PLPs matrices, followed\nby fine-tuning the combined adapter to generate the target object with the\ndesired appearance. Experiments on various styles, including textures,\nmaterials, and artistic style, show that our method outperforms\nstate-of-the-art single/multiple concept learning pipelines in terms of\ncontent-style-prompt alignment.\n","authors":["Yu Xu","Fan Tang","Juan Cao","Yuxin Zhang","Oliver Deussen","Weiming Dong","Jintao Li","Tong-Yee Lee"],"pdf_url":"https://arxiv.org/pdf/2403.19456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03008v2","updated":"2024-03-28T14:16:09Z","published":"2023-09-06T13:54:31Z","title":"Sparse 3D Reconstruction via Object-Centric Ray Sampling","summary":" We propose a novel method for 3D object reconstruction from a sparse set of\nviews captured from a 360-degree calibrated camera rig. We represent the object\nsurface through a hybrid model that uses both an MLP-based neural\nrepresentation and a triangle mesh. A key contribution in our work is a novel\nobject-centric sampling scheme of the neural representation, where rays are\nshared among all views. This efficiently concentrates and reduces the number of\nsamples used to update the neural model at each iteration. This sampling scheme\nrelies on the mesh representation to ensure also that samples are\nwell-distributed along its normals. The rendering is then performed efficiently\nby a differentiable renderer. 
We demonstrate that this sampling scheme results\nin a more effective training of the neural representation, does not require the\nadditional supervision of segmentation masks, yields state of the art 3D\nreconstructions, and works with sparse views on the Google's Scanned Objects,\nTank and Temples and MVMC Car datasets. Code available at:\nhttps://github.com/llukmancerkezi/ROSTER\n","authors":["Llukman Cerkezi","Paolo Favaro"],"pdf_url":"https://arxiv.org/pdf/2309.03008v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19444v1","updated":"2024-03-28T14:15:13Z","published":"2024-03-28T14:15:13Z","title":"Transparent and Clinically Interpretable AI for Lung Cancer Detection in\n Chest X-Rays","summary":" The rapidly advancing field of Explainable Artificial Intelligence (XAI) aims\nto tackle the issue of trust regarding the use of complex black-box deep\nlearning models in real-world applications. Existing post-hoc XAI techniques\nhave recently been shown to have poor performance on medical data, producing\nunreliable explanations which are infeasible for clinical use. To address this,\nwe propose an ante-hoc approach based on concept bottleneck models which\nintroduces for the first time clinical concepts into the classification\npipeline, allowing the user valuable insight into the decision-making process.\nOn a large public dataset of chest X-rays and associated medical reports, we\nfocus on the binary classification task of lung cancer detection. Our approach\nyields improved classification performance in lung cancer detection when\ncompared to baseline deep learning models (F1 > 0.9), while also generating\nclinically relevant and more reliable explanations than existing techniques. We\nevaluate our approach against post-hoc image XAI techniques LIME and SHAP, as\nwell as CXR-LLaVA, a recent textual XAI tool which operates in the context of\nquestion answering on chest X-rays.\n","authors":["Amy Rafferty","Rishi Ramaesh","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2403.19444v1.pdf","comment":"12 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.09412v2","updated":"2024-03-28T14:10:08Z","published":"2024-03-14T14:03:29Z","title":"OpenGraph: Open-Vocabulary Hierarchical 3D Graph Representation in\n Large-Scale Outdoor Environments","summary":" Environment representations endowed with sophisticated semantics are pivotal\nfor facilitating seamless interaction between robots and humans, enabling them\nto effectively carry out various tasks. Open-vocabulary maps, powered by\nVisual-Language models (VLMs), possess inherent advantages, including zero-shot\nlearning and support for open-set classes. However, existing open-vocabulary\nmaps are primarily designed for small-scale environments, such as desktops or\nrooms, and are typically geared towards limited-area tasks involving robotic\nindoor navigation or in-place manipulation. They face challenges in direct\ngeneralization to outdoor environments characterized by numerous objects and\ncomplex tasks, owing to limitations in both understanding level and map\nstructure. In this work, we propose OpenGraph, the first open-vocabulary\nhierarchical graph representation designed for large-scale outdoor\nenvironments. OpenGraph initially extracts instances and their captions from\nvisual images, enhancing textual reasoning by encoding them. Subsequently, it\nachieves 3D incremental object-centric mapping with feature embedding by\nprojecting images onto LiDAR point clouds. 
Finally, the environment is\nsegmented based on lane graph connectivity to construct a hierarchical graph.\nValidation results from public dataset SemanticKITTI demonstrate that OpenGraph\nachieves the highest segmentation and query accuracy. The source code of\nOpenGraph is publicly available at https://github.com/BIT-DYN/OpenGraph.\n","authors":["Yinan Deng","Jiahui Wang","Jingyu Zhao","Xinyu Tian","Guangyan Chen","Yi Yang","Yufeng Yue"],"pdf_url":"https://arxiv.org/pdf/2403.09412v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19438v1","updated":"2024-03-28T14:07:13Z","published":"2024-03-28T14:07:13Z","title":"SubjectDrive: Scaling Generative Data in Autonomous Driving via Subject\n Control","summary":" Autonomous driving progress relies on large-scale annotated datasets. In this\nwork, we explore the potential of generative models to produce vast quantities\nof freely-labeled data for autonomous driving applications and present\nSubjectDrive, the first model proven to scale generative data production in a\nway that could continuously improve autonomous driving applications. We\ninvestigate the impact of scaling up the quantity of generative data on the\nperformance of downstream perception models and find that enhancing data\ndiversity plays a crucial role in effectively scaling generative data\nproduction. Therefore, we have developed a novel model equipped with a subject\ncontrol mechanism, which allows the generative model to leverage diverse\nexternal data sources for producing varied and useful data. Extensive\nevaluations confirm SubjectDrive's efficacy in generating scalable autonomous\ndriving training data, marking a significant step toward revolutionizing data\nproduction methods in this field.\n","authors":["Binyuan Huang","Yuqing Wen","Yucheng Zhao","Yaosi Hu","Yingfei Liu","Fan Jia","Weixin Mao","Tiancai Wang","Chi Zhang","Chang Wen Chen","Zhenzhong Chen","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.19438v1.pdf","comment":"Project page: https://subjectdrive.github.io/"},{"id":"http://arxiv.org/abs/2403.19435v1","updated":"2024-03-28T14:04:17Z","published":"2024-03-28T14:04:17Z","title":"BAMM: Bidirectional Autoregressive Motion Model","summary":" Generating human motion from text has been dominated by denoising motion\nmodels either through diffusion or generative masking process. However, these\nmodels face great limitations in usability by requiring prior knowledge of the\nmotion length. Conversely, autoregressive motion models address this limitation\nby adaptively predicting motion endpoints, at the cost of degraded generation\nquality and editing capabilities. To address these challenges, we propose\nBidirectional Autoregressive Motion Model (BAMM), a novel text-to-motion\ngeneration framework. BAMM consists of two key components: (1) a motion\ntokenizer that transforms 3D human motion into discrete tokens in latent space,\nand (2) a masked self-attention transformer that autoregressively predicts\nrandomly masked tokens via a hybrid attention masking strategy. By unifying\ngenerative masked modeling and autoregressive modeling, BAMM captures rich and\nbidirectional dependencies among motion tokens, while learning the\nprobabilistic mapping from textual inputs to motion outputs with\ndynamically-adjusted motion sequence length. This feature enables BAMM to\nsimultaneously achieving high-quality motion generation with enhanced usability\nand built-in motion editability. 
Extensive experiments on HumanML3D and KIT-ML\ndatasets demonstrate that BAMM surpasses current state-of-the-art methods in\nboth qualitative and quantitative measures.\n","authors":["Ekkasit Pinyoanuntapong","Muhammad Usama Saleem","Pu Wang","Minwoo Lee","Srijan Das","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2403.19435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19428v1","updated":"2024-03-28T13:58:05Z","published":"2024-03-28T13:58:05Z","title":"Burst Super-Resolution with Diffusion Models for Improving Perceptual\n Quality","summary":" While burst LR images are useful for improving the SR image quality compared\nwith a single LR image, prior SR networks accepting the burst LR images are\ntrained in a deterministic manner, which is known to produce a blurry SR image.\nIn addition, it is difficult to perfectly align the burst LR images, making the\nSR image more blurry. Since such blurry images are perceptually degraded, we\naim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity\nimages can be reconstructed by diffusion models. However, prior SR methods\nusing the diffusion model are not properly optimized for the burst SR task.\nSpecifically, the reverse process starting from a random sample is not\noptimized for image enhancement and restoration methods, including burst SR. In\nour proposed method, on the other hand, burst LR features are used to\nreconstruct the initial burst SR image that is fed into an intermediate step in\nthe diffusion model. This reverse process from the intermediate step 1) skips\ndiffusion steps for reconstructing the global structure of the image and 2)\nfocuses on steps for refining detailed textures. Our experimental results\ndemonstrate that our method can improve the scores of the perceptual quality\nmetrics. Code: https://github.com/placerkyo/BSRD\n","authors":["Kyotaro Tokoro","Kazutoshi Akita","Norimichi Ukita"],"pdf_url":"https://arxiv.org/pdf/2403.19428v1.pdf","comment":"Accepted to IJCNN 2024 (International Joint Conference on Neural\n Networks)"},{"id":"http://arxiv.org/abs/2403.19425v1","updated":"2024-03-28T13:56:26Z","published":"2024-03-28T13:56:26Z","title":"A Robust Ensemble Algorithm for Ischemic Stroke Lesion Segmentation:\n Generalizability and Clinical Utility Beyond the ISLES Challenge","summary":" Diffusion-weighted MRI (DWI) is essential for stroke diagnosis, treatment\ndecisions, and prognosis. However, image and disease variability hinder the\ndevelopment of generalizable AI algorithms with clinical value. We address this\ngap by presenting a novel ensemble algorithm derived from the 2022 Ischemic\nStroke Lesion Segmentation (ISLES) challenge. ISLES'22 provided 400 patient\nscans with ischemic stroke from various medical centers, facilitating the\ndevelopment of a wide range of cutting-edge segmentation algorithms by the\nresearch community. Through collaboration with leading teams, we combined\ntop-performing algorithms into an ensemble model that overcomes the limitations\nof individual solutions. Our ensemble model achieved superior ischemic lesion\ndetection and segmentation accuracy on our internal test set compared to\nindividual algorithms. This accuracy generalized well across diverse image and\ndisease variables. Furthermore, the model excelled in extracting clinical\nbiomarkers. Notably, in a Turing-like test, neuroradiologists consistently\npreferred the algorithm's segmentations over manual expert efforts,\nhighlighting increased comprehensiveness and precision. 
Validation using a\nreal-world external dataset (N=1686) confirmed the model's generalizability.\nThe algorithm's outputs also demonstrated strong correlations with clinical\nscores (admission NIHSS and 90-day mRS) on par with or exceeding expert-derived\nresults, underlining its clinical relevance. This study offers two key\nfindings. First, we present an ensemble algorithm\n(https://github.com/Tabrisrei/ISLES22_Ensemble) that detects and segments\nischemic stroke lesions on DWI across diverse scenarios on par with expert\n(neuro)radiologists. Second, we show the potential for biomedical challenge\noutputs to extend beyond the challenge's initial objectives, demonstrating\ntheir real-world clinical applicability.\n","authors":["Ezequiel de la Rosa","Mauricio Reyes","Sook-Lei Liew","Alexandre Hutton","Roland Wiest","Johannes Kaesmacher","Uta Hanning","Arsany Hakim","Richard Zubal","Waldo Valenzuela","David Robben","Diana M. Sima","Vincenzo Anania","Arne Brys","James A. Meakin","Anne Mickan","Gabriel Broocks","Christian Heitkamp","Shengbo Gao","Kongming Liang","Ziji Zhang","Md Mahfuzur Rahman Siddiquee","Andriy Myronenko","Pooya Ashtari","Sabine Van Huffel","Hyun-su Jeong","Chi-ho Yoon","Chulhong Kim","Jiayu Huo","Sebastien Ourselin","Rachel Sparks","Albert Clèrigues","Arnau Oliver","Xavier Lladó","Liam Chalcroft","Ioannis Pappas","Jeroen Bertels","Ewout Heylen","Juliette Moreau","Nima Hatami","Carole Frindel","Abdul Qayyum","Moona Mazher","Domenec Puig","Shao-Chieh Lin","Chun-Jung Juan","Tianxi Hu","Lyndon Boone","Maged Goubran","Yi-Jui Liu","Susanne Wegener","Florian Kofler","Ivan Ezhov","Suprosanna Shit","Moritz R. Hernandez Petzsche","Bjoern Menze","Jan S. Kirschke","Benedikt Wiestler"],"pdf_url":"https://arxiv.org/pdf/2403.19425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19507v3","updated":"2024-03-28T13:51:37Z","published":"2023-05-31T02:35:41Z","title":"Manifold Constraint Regularization for Remote Sensing Image Generation","summary":" Generative Adversarial Networks (GANs) have shown notable accomplishments in\nremote sensing domain. However, this paper reveals that their performance on\nremote sensing images falls short when compared to their impressive results\nwith natural images. This study identifies a previously overlooked issue: GANs\nexhibit a heightened susceptibility to overfitting on remote sensing images.To\naddress this challenge, this paper analyzes the characteristics of remote\nsensing images and proposes manifold constraint regularization, a novel\napproach that tackles overfitting of GANs on remote sensing images for the\nfirst time. Our method includes a new measure for evaluating the structure of\nthe data manifold. Leveraging this measure, we propose the manifold constraint\nregularization term, which not only alleviates the overfitting problem, but\nalso promotes alignment between the generated and real data manifolds, leading\nto enhanced quality in the generated images. The effectiveness and versatility\nof this method have been corroborated through extensive validation on various\nremote sensing datasets and GAN models. 
The proposed method not only enhances\nthe quality of the generated images, reflected in a 3.13\\% improvement in\nFrechet Inception Distance (FID) score, but also boosts the performance of the\nGANs on downstream tasks, evidenced by a 3.76\\% increase in classification\naccuracy.\n","authors":["Xingzhe Su","Changwen Zheng","Wenwen Qiang","Fengge Wu","Junsuo Zhao","Fuchun Sun","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2305.19507v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08471v2","updated":"2024-03-28T13:47:42Z","published":"2023-10-09T20:18:10Z","title":"WinSyn: A High Resolution Testbed for Synthetic Data","summary":" We present WinSyn, a unique dataset and testbed for creating high-quality\nsynthetic data with procedural modeling techniques. The dataset contains\nhigh-resolution photographs of windows, selected from locations around the\nworld, with 89,318 individual window crops showcasing diverse geometric and\nmaterial characteristics. We evaluate a procedural model by training semantic\nsegmentation networks on both synthetic and real images and then comparing\ntheir performances on a shared test set of real images. Specifically, we\nmeasure the difference in mean Intersection over Union (mIoU) and determine the\neffective number of real images to match synthetic data's training performance.\nWe design a baseline procedural model as a benchmark and provide 21,290\nsynthetically generated images. By tuning the procedural model, key factors are\nidentified which significantly influence the model's fidelity in replicating\nreal-world scenarios. Importantly, we highlight the challenge of procedural\nmodeling using current techniques, especially in their ability to replicate the\nspatial semantics of real-world scenarios. This insight is critical because of\nthe potential of procedural models to bridge to hidden scene aspects such as\ndepth, reflectivity, material properties, and lighting conditions.\n","authors":["Tom Kelly","John Femiani","Peter Wonka"],"pdf_url":"https://arxiv.org/pdf/2310.08471v2.pdf","comment":"cvpr version"},{"id":"http://arxiv.org/abs/2403.19417v1","updated":"2024-03-28T13:47:19Z","published":"2024-03-28T13:47:19Z","title":"OAKINK2: A Dataset of Bimanual Hands-Object Manipulation in Complex Task\n Completion","summary":" We present OAKINK2, a dataset of bimanual object manipulation tasks for\ncomplex daily activities. In pursuit of constructing the complex tasks into a\nstructured representation, OAKINK2 introduces three level of abstraction to\norganize the manipulation tasks: Affordance, Primitive Task, and Complex Task.\nOAKINK2 features on an object-centric perspective for decoding the complex\ntasks, treating them as a sequence of object affordance fulfillment. The first\nlevel, Affordance, outlines the functionalities that objects in the scene can\nafford, the second level, Primitive Task, describes the minimal interaction\nunits that humans interact with the object to achieve its affordance, and the\nthird level, Complex Task, illustrates how Primitive Tasks are composed and\ninterdependent. OAKINK2 dataset provides multi-view image streams and precise\npose annotations for the human body, hands and various interacting objects.\nThis extensive collection supports applications such as interaction\nreconstruction and motion synthesis. Based on the 3-level abstraction of\nOAKINK2, we explore a task-oriented framework for Complex Task Completion\n(CTC). CTC aims to generate a sequence of bimanual manipulation to achieve task\nobjectives. 
Within the CTC framework, we employ Large Language Models (LLMs) to\ndecompose the complex task objectives into sequences of Primitive Tasks and\nhave developed a Motion Fulfillment Model that generates bimanual hand motion\nfor each Primitive Task. OAKINK2 datasets and models are available at\nhttps://oakink.net/v2.\n","authors":["Xinyu Zhan","Lixin Yang","Yifei Zhao","Kangrui Mao","Hanlin Xu","Zenan Lin","Kailin Li","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2403.19417v1.pdf","comment":"To be appeared in CVPR 2024. 26 pages"},{"id":"http://arxiv.org/abs/2403.05369v4","updated":"2024-03-28T13:41:05Z","published":"2024-03-08T15:00:44Z","title":"Frequency-Adaptive Dilated Convolution for Semantic Segmentation","summary":" Dilated convolution, which expands the receptive field by inserting gaps\nbetween its consecutive elements, is widely employed in computer vision. In\nthis study, we propose three strategies to improve individual phases of dilated\nconvolution from the view of spectrum analysis. Departing from the conventional\npractice of fixing a global dilation rate as a hyperparameter, we introduce\nFrequency-Adaptive Dilated Convolution (FADC), which dynamically adjusts\ndilation rates spatially based on local frequency components. Subsequently, we\ndesign two plug-in modules to directly enhance effective bandwidth and\nreceptive field size. The Adaptive Kernel (AdaKern) module decomposes\nconvolution weights into low-frequency and high-frequency components,\ndynamically adjusting the ratio between these components on a per-channel\nbasis. By increasing the high-frequency part of convolution weights, AdaKern\ncaptures more high-frequency components, thereby improving effective bandwidth.\nThe Frequency Selection (FreqSelect) module optimally balances high- and\nlow-frequency components in feature representations through spatially variant\nreweighting. It suppresses high frequencies in the background to encourage FADC\nto learn a larger dilation, thereby increasing the receptive field for an\nexpanded scope. Extensive experiments on segmentation and object detection\nconsistently validate the efficacy of our approach. The code is publicly\navailable at \\url{https://github.com/Linwei-Chen/FADC}.\n","authors":["Linwei Chen","Lin Gu","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2403.05369v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19415v1","updated":"2024-03-28T13:39:55Z","published":"2024-03-28T13:39:55Z","title":"Brain-Shift: Unsupervised Pseudo-Healthy Brain Synthesis for Novel\n Biomarker Extraction in Chronic Subdural Hematoma","summary":" Chronic subdural hematoma (cSDH) is a common neurological condition\ncharacterized by the accumulation of blood between the brain and the dura\nmater. This accumulation of blood can exert pressure on the brain, potentially\nleading to fatal outcomes. Treatment options for cSDH are limited to invasive\nsurgery or non-invasive management. Traditionally, the midline shift,\nhand-measured by experts from an ideal sagittal plane, and the hematoma volume\nhave been the primary metrics for quantifying and analyzing cSDH. However,\nthese approaches do not quantify the local 3D brain deformation caused by cSDH.\nWe propose a novel method using anatomy-aware unsupervised diffeomorphic\npseudo-healthy synthesis to generate brain deformation fields. The deformation\nfields derived from this process are utilized to extract biomarkers that\nquantify the shift in the brain due to cSDH. 
We use CT scans of 121 patients\nfor training and validation of our method and find that our metrics allow the\nidentification of patients who require surgery. Our results indicate that\nautomatically obtained brain deformation fields might contain prognostic value\nfor personalized cSDH treatment. Our implementation is available on:\ngithub.com/Barisimre/brain-morphing\n","authors":["Baris Imre","Elina Thibeau-Sutre","Jorieke Reimer","Kuan Kho","Jelmer M. Wolterink"],"pdf_url":"https://arxiv.org/pdf/2403.19415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19412v1","updated":"2024-03-28T13:36:00Z","published":"2024-03-28T13:36:00Z","title":"A Simple and Effective Point-based Network for Event Camera 6-DOFs Pose\n Relocalization","summary":" Event cameras exhibit remarkable attributes such as high dynamic range,\nasynchronicity, and low latency, making them highly suitable for vision tasks\nthat involve high-speed motion in challenging lighting conditions. These\ncameras implicitly capture movement and depth information in events, making\nthem appealing sensors for Camera Pose Relocalization (CPR) tasks.\nNevertheless, existing CPR networks based on events neglect the pivotal\nfine-grained temporal information in events, resulting in unsatisfactory\nperformance. Moreover, the energy-efficient features are further compromised by\nthe use of excessively complex models, hindering efficient deployment on edge\ndevices. In this paper, we introduce PEPNet, a simple and effective point-based\nnetwork designed to regress six degrees of freedom (6-DOFs) event camera poses.\nWe rethink the relationship between the event camera and CPR tasks, leveraging\nthe raw Point Cloud directly as network input to harness the high-temporal\nresolution and inherent sparsity of events. PEPNet is adept at abstracting the\nspatial and implicit temporal features through hierarchical structure and\nexplicit temporal features by Attentive Bi-directional Long Short-Term Memory\n(A-Bi-LSTM). By employing a carefully crafted lightweight design, PEPNet\ndelivers state-of-the-art (SOTA) performance on both indoor and outdoor\ndatasets with meager computational resources. Specifically, PEPNet attains a\nsignificant 38% and 33% performance improvement on the random split IJRR and\nM3ED datasets, respectively. Moreover, the lightweight design version\nPEPNet$_{tiny}$ accomplishes results comparable to the SOTA while employing a\nmere 0.5% of the parameters.\n","authors":["Hongwei Ren","Jiadong Zhu","Yue Zhou","Haotian FU","Yulong Huang","Bojun Cheng"],"pdf_url":"https://arxiv.org/pdf/2403.19412v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19407v1","updated":"2024-03-28T13:32:49Z","published":"2024-03-28T13:32:49Z","title":"Towards Temporally Consistent Referring Video Object Segmentation","summary":" Referring Video Object Segmentation (R-VOS) methods face challenges in\nmaintaining consistent object segmentation due to temporal context variability\nand the presence of other visually similar objects. We propose an end-to-end\nR-VOS paradigm that explicitly models temporal instance consistency alongside\nthe referring segmentation. Specifically, we introduce a novel hybrid memory\nthat facilitates inter-frame collaboration for robust spatio-temporal matching\nand propagation. 
Features of frames with automatically generated high-quality\nreference masks are propagated to segment the remaining frames based on\nmulti-granularity association to achieve temporally consistent R-VOS.\nFurthermore, we propose a new Mask Consistency Score (MCS) metric to evaluate\nthe temporal consistency of video segmentation. Extensive experiments\ndemonstrate that our approach enhances temporal consistency by a significant\nmargin, leading to top-ranked performance on popular R-VOS benchmarks, i.e.,\nRef-YouTube-VOS (67.1%) and Ref-DAVIS17 (65.6%).\n","authors":["Bo Miao","Mohammed Bennamoun","Yongsheng Gao","Mubarak Shah","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2403.19407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18331v2","updated":"2024-03-28T13:27:33Z","published":"2023-11-30T08:02:49Z","title":"MRFP: Learning Generalizable Semantic Segmentation from Sim-2-Real with\n Multi-Resolution Feature Perturbation","summary":" Deep neural networks have shown exemplary performance on semantic scene\nunderstanding tasks on source domains, but due to the absence of style\ndiversity during training, enhancing performance on unseen target domains using\nonly single source domain data remains a challenging task. Generation of\nsimulated data is a feasible alternative to retrieving large style-diverse\nreal-world datasets as it is a cumbersome and budget-intensive process.\nHowever, the large domain-specfic inconsistencies between simulated and\nreal-world data pose a significant generalization challenge in semantic\nsegmentation. In this work, to alleviate this problem, we propose a novel\nMultiResolution Feature Perturbation (MRFP) technique to randomize\ndomain-specific fine-grained features and perturb style of coarse features. Our\nexperimental results on various urban-scene segmentation datasets clearly\nindicate that, along with the perturbation of style-information, perturbation\nof fine-feature components is paramount to learn domain invariant robust\nfeature maps for semantic segmentation models. MRFP is a simple and\ncomputationally efficient, transferable module with no additional learnable\nparameters or objective functions, that helps state-of-the-art deep neural\nnetworks to learn robust domain invariant features for simulation-to-real\nsemantic segmentation.\n","authors":["Sumanth Udupa","Prajwal Gurunath","Aniruddh Sikdar","Suresh Sundaram"],"pdf_url":"https://arxiv.org/pdf/2311.18331v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2311.11908v3","updated":"2024-03-28T13:16:50Z","published":"2023-11-20T16:40:29Z","title":"Continual Learning: Applications and the Road Forward","summary":" Continual learning is a subfield of machine learning, which aims to allow\nmachine learning models to continuously learn on new data, by accumulating\nknowledge without forgetting what was learned in the past. In this work, we\ntake a step back, and ask: \"Why should one care about continual learning in the\nfirst place?\". We set the stage by examining recent continual learning papers\npublished at four major machine learning conferences, and show that\nmemory-constrained settings dominate the field. Then, we discuss five open\nproblems in machine learning, and even though they might seem unrelated to\ncontinual learning at first sight, we show that continual learning will\ninevitably be part of their solution. These problems are model editing,\npersonalization and specialization, on-device learning, faster (re-)training\nand reinforcement learning. 
Finally, by comparing the desiderata from these\nunsolved problems and the current assumptions in continual learning, we\nhighlight and discuss four future directions for continual learning research.\nWe hope that this work offers an interesting perspective on the future of\ncontinual learning, while displaying its potential value and the paths we have\nto pursue in order to make it successful. This work is the result of the many\ndiscussions the authors had at the Dagstuhl seminar on Deep Continual Learning,\nin March 2023.\n","authors":["Eli Verwimp","Rahaf Aljundi","Shai Ben-David","Matthias Bethge","Andrea Cossu","Alexander Gepperth","Tyler L. Hayes","Eyke Hüllermeier","Christopher Kanan","Dhireesha Kudithipudi","Christoph H. Lampert","Martin Mundt","Razvan Pascanu","Adrian Popescu","Andreas S. Tolias","Joost van de Weijer","Bing Liu","Vincenzo Lomonaco","Tinne Tuytelaars","Gido M. van de Ven"],"pdf_url":"https://arxiv.org/pdf/2311.11908v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19386v1","updated":"2024-03-28T12:51:15Z","published":"2024-03-28T12:51:15Z","title":"PointCloud-Text Matching: Benchmark Datasets and a Baseline","summary":" In this paper, we present and study a new instance-level retrieval task:\nPointCloud-Text Matching~(PTM), which aims to find the exact cross-modal\ninstance that matches a given point-cloud query or text query. PTM could be\napplied to various scenarios, such as indoor/urban-canyon localization and\nscene retrieval. However, there exists no suitable and targeted dataset for PTM\nin practice. Therefore, we construct three new PTM benchmark datasets, namely\n3D2T-SR, 3D2T-NR, and 3D2T-QA. We observe that the data is challenging and with\nnoisy correspondence due to the sparsity, noise, or disorder of point clouds\nand the ambiguity, vagueness, or incompleteness of texts, which make existing\ncross-modal matching methods ineffective for PTM. To tackle these challenges,\nwe propose a PTM baseline, named Robust PointCloud-Text Matching method (RoMa).\nRoMa consists of two modules: a Dual Attention Perception module (DAP) and a\nRobust Negative Contrastive Learning module (RNCL). Specifically, DAP leverages\ntoken-level and feature-level attention to adaptively focus on useful local and\nglobal features, and aggregate them into common representations, thereby\nreducing the adverse impact of noise and ambiguity. To handle noisy\ncorrespondence, RNCL divides negative pairs, which are much less error-prone\nthan positive pairs, into clean and noisy subsets, and assigns them forward and\nreverse optimization directions respectively, thus enhancing robustness against\nnoisy correspondence. We conduct extensive experiments on our benchmarks and\ndemonstrate the superiority of our RoMa.\n","authors":["Yanglin Feng","Yang Qin","Dezhong Peng","Hongyuan Zhu","Xi Peng","Peng Hu"],"pdf_url":"https://arxiv.org/pdf/2403.19386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02051v2","updated":"2024-03-28T12:41:14Z","published":"2023-12-04T17:09:52Z","title":"TimeChat: A Time-sensitive Multimodal Large Language Model for Long\n Video Understanding","summary":" This work proposes TimeChat, a time-sensitive multimodal large language model\nspecifically designed for long video understanding. 
Our model incorporates two\nkey architectural contributions: (1) a timestamp-aware frame encoder that binds\nvisual content with the timestamp of each frame, and (2) a sliding video\nQ-Former that produces a video token sequence of varying lengths to accommodate\nvideos of various durations. Additionally, we construct an instruction-tuning\ndataset, encompassing 6 tasks and a total of 125K instances, to further enhance\nTimeChat's instruction-following performance. Experiment results across various\nvideo understanding tasks, such as dense captioning, temporal grounding, and\nhighlight detection, demonstrate TimeChat's strong zero-shot temporal\nlocalization and reasoning capabilities. For example, it achieves +9.2 F1 score\nand +2.8 CIDEr on YouCook2, +5.8 HIT@1 on QVHighlights, and +27.5 R@1 (IoU=0.5)\non Charades-STA, compared to state-of-the-art video large language models,\nholding the potential to serve as a versatile video assistant for long-form\nvideo comprehension tasks and satisfy realistic user requirements.\n","authors":["Shuhuai Ren","Linli Yao","Shicheng Li","Xu Sun","Lu Hou"],"pdf_url":"https://arxiv.org/pdf/2312.02051v2.pdf","comment":"CVPR 2024 camera-ready version, code is available at\n https://github.com/RenShuhuai-Andy/TimeChat"},{"id":"http://arxiv.org/abs/2403.19376v1","updated":"2024-03-28T12:38:21Z","published":"2024-03-28T12:38:21Z","title":"NIGHT -- Non-Line-of-Sight Imaging from Indirect Time of Flight Data","summary":" The acquisition of objects outside the Line-of-Sight of cameras is a very\nintriguing but also extremely challenging research topic. Recent works showed\nthe feasibility of this idea exploiting transient imaging data produced by\ncustom direct Time of Flight sensors. In this paper, for the first time, we\ntackle this problem using only data from an off-the-shelf indirect Time of\nFlight sensor without any further hardware requirement. We introduced a Deep\nLearning model able to re-frame the surfaces where light bounces happen as a\nvirtual mirror. This modeling makes the task easier to handle and also\nfacilitates the construction of annotated training data. From the obtained data\nit is possible to retrieve the depth information of the hidden scene. We also\nprovide a first-in-its-kind synthetic dataset for the task and demonstrate the\nfeasibility of the proposed idea over it.\n","authors":["Matteo Caligiuri","Adriano Simonetto","Gianluca Agresti","Pietro Zanuttigh"],"pdf_url":"https://arxiv.org/pdf/2403.19376v1.pdf","comment":"Submitted to ECCV 24, 17 pages, 6 figures, 2 tables"},{"id":"http://arxiv.org/abs/2403.19366v1","updated":"2024-03-28T12:28:58Z","published":"2024-03-28T12:28:58Z","title":"Infrared Small Target Detection with Scale and Location Sensitivity","summary":" Recently, infrared small target detection (IRSTD) has been dominated by\ndeep-learning-based methods. However, these methods mainly focus on the design\nof complex model structures to extract discriminative features, leaving the\nloss functions for IRSTD under-explored. For example, the widely used\nIntersection over Union (IoU) and Dice losses lack sensitivity to the scales\nand locations of targets, limiting the detection performance of detectors. In\nthis paper, we focus on boosting detection performance with a more effective\nloss but a simpler model structure. 
Specifically, we first propose a novel\nScale and Location Sensitive (SLS) loss to handle the limitations of existing\nlosses: 1) for scale sensitivity, we compute a weight for the IoU loss based on\ntarget scales to help the detector distinguish targets with different scales:\n2) for location sensitivity, we introduce a penalty term based on the center\npoints of targets to help the detector localize targets more precisely. Then,\nwe design a simple Multi-Scale Head to the plain U-Net (MSHNet). By applying\nSLS loss to each scale of the predictions, our MSHNet outperforms existing\nstate-of-the-art methods by a large margin. In addition, the detection\nperformance of existing detectors can be further improved when trained with our\nSLS loss, demonstrating the effectiveness and generalization of our SLS loss.\nThe code is available at https://github.com/ying-fu/MSHNet.\n","authors":["Qiankun Liu","Rui Liu","Bolun Zheng","Hongkui Wang","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2403.19366v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2304.03198v6","updated":"2024-03-28T12:07:44Z","published":"2023-04-06T16:21:56Z","title":"RFAConv: Innovating Spatial Attention and Standard Convolutional\n Operation","summary":" Spatial attention has been widely used to improve the performance of\nconvolutional neural networks. However, it has certain limitations. In this\npaper, we propose a new perspective on the effectiveness of spatial attention,\nwhich is that the spatial attention mechanism essentially solves the problem of\nconvolutional kernel parameter sharing. However, the information contained in\nthe attention map generated by spatial attention is not sufficient for\nlarge-size convolutional kernels. Therefore, we propose a novel attention\nmechanism called Receptive-Field Attention (RFA). Existing spatial attention,\nsuch as Convolutional Block Attention Module (CBAM) and Coordinated Attention\n(CA) focus only on spatial features, which does not fully address the problem\nof convolutional kernel parameter sharing. In contrast, RFA not only focuses on\nthe receptive-field spatial feature but also provides effective attention\nweights for large-size convolutional kernels. The Receptive-Field Attention\nconvolutional operation (RFAConv), developed by RFA, represents a new approach\nto replace the standard convolution operation. It offers nearly negligible\nincrement of computational cost and parameters, while significantly improving\nnetwork performance. We conducted a series of experiments on ImageNet-1k, COCO,\nand VOC datasets to demonstrate the superiority of our approach. Of particular\nimportance, we believe that it is time to shift focus from spatial features to\nreceptive-field spatial features for current spatial attention mechanisms. In\nthis way, we can further improve network performance and achieve even better\nresults. 
The code and pre-trained models for the relevant tasks can be found at\nhttps://github.com/Liuchen1997/RFAConv.\n","authors":["Xin Zhang","Chen Liu","Degang Yang","Tingting Song","Yichen Ye","Ke Li","Yingze Song"],"pdf_url":"https://arxiv.org/pdf/2304.03198v6.pdf","comment":"12 pages, 11figures"},{"id":"http://arxiv.org/abs/2306.16324v2","updated":"2024-03-28T12:05:23Z","published":"2023-06-28T15:58:53Z","title":"DoseDiff: Distance-aware Diffusion Model for Dose Prediction in\n Radiotherapy","summary":" Treatment planning, which is a critical component of the radiotherapy\nworkflow, is typically carried out by a medical physicist in a time-consuming\ntrial-and-error manner. Previous studies have proposed knowledge-based or\ndeep-learning-based methods for predicting dose distribution maps to assist\nmedical physicists in improving the efficiency of treatment planning. However,\nthese dose prediction methods usually fail to effectively utilize distance\ninformation between surrounding tissues and targets or organs-at-risk (OARs).\nMoreover, they are poor at maintaining the distribution characteristics of ray\npaths in the predicted dose distribution maps, resulting in a loss of valuable\ninformation. In this paper, we propose a distance-aware diffusion model\n(DoseDiff) for precise prediction of dose distribution. We define dose\nprediction as a sequence of denoising steps, wherein the predicted dose\ndistribution map is generated with the conditions of the computed tomography\n(CT) image and signed distance maps (SDMs). The SDMs are obtained by distance\ntransformation from the masks of targets or OARs, which provide the distance\nfrom each pixel in the image to the outline of the targets or OARs. We further\npropose a multi-encoder and multi-scale fusion network (MMFNet) that\nincorporates multi-scale and transformer-based fusion modules to enhance\ninformation fusion between the CT image and SDMs at the feature level. We\nevaluate our model on two in-house datasets and a public dataset, respectively.\nThe results demonstrate that our DoseDiff method outperforms state-of-the-art\ndose prediction methods in terms of both quantitative performance and visual\nquality.\n","authors":["Yiwen Zhang","Chuanpu Li","Liming Zhong","Zeli Chen","Wei Yang","Xuetao Wang"],"pdf_url":"https://arxiv.org/pdf/2306.16324v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19336v1","updated":"2024-03-28T11:52:42Z","published":"2024-03-28T11:52:42Z","title":"IVLMap: Instance-Aware Visual Language Grounding for Consumer Robot\n Navigation","summary":" Vision-and-Language Navigation (VLN) is a challenging task that requires a\nrobot to navigate in photo-realistic environments with human natural language\npromptings. Recent studies aim to handle this task by constructing the semantic\nspatial map representation of the environment, and then leveraging the strong\nability of reasoning in large language models for generalizing code for guiding\nthe robot navigation. However, these methods face limitations in instance-level\nand attribute-level navigation tasks as they cannot distinguish different\ninstances of the same object. To address this challenge, we propose a new\nmethod, namely, Instance-aware Visual Language Map (IVLMap), to empower the\nrobot with instance-level and attribute-level semantic mapping, where it is\nautonomously constructed by fusing the RGBD video data collected from the robot\nagent with special-designed natural language map indexing in the bird's-in-eye\nview. 
Such indexing is instance-level and attribute-level. In particular, when\nintegrated with a large language model, IVLMap demonstrates the capability to\ni) transform natural language into navigation targets with instance and\nattribute information, enabling precise localization, and ii) accomplish\nzero-shot end-to-end navigation tasks based on natural language commands.\nExtensive navigation experiments are conducted. Simulation results illustrate\nthat our method can achieve an average improvement of 14.4\\% in navigation\naccuracy. Code and demo are released at https://ivlmap.github.io/.\n","authors":["Jiacui Huang","Hongtao Zhang","Mingbo Zhao","Zhou Wu"],"pdf_url":"https://arxiv.org/pdf/2403.19336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19334v1","updated":"2024-03-28T11:50:23Z","published":"2024-03-28T11:50:23Z","title":"Test-Time Domain Generalization for Face Anti-Spoofing","summary":" Face Anti-Spoofing (FAS) is pivotal in safeguarding facial recognition\nsystems against presentation attacks. While domain generalization (DG) methods\nhave been developed to enhance FAS performance, they predominantly focus on\nlearning domain-invariant features during training, which may not guarantee\ngeneralizability to unseen data that differs largely from the source\ndistributions. Our insight is that testing data can serve as a valuable\nresource to enhance the generalizability beyond mere evaluation for DG FAS. In\nthis paper, we introduce a novel Test-Time Domain Generalization (TTDG)\nframework for FAS, which leverages the testing data to boost the model's\ngeneralizability. Our method, consisting of Test-Time Style Projection (TTSP)\nand Diverse Style Shifts Simulation (DSSS), effectively projects the unseen\ndata to the seen domain space. In particular, we first introduce the innovative\nTTSP to project the styles of the arbitrarily unseen samples of the testing\ndistribution to the known source space of the training distributions. We then\ndesign the efficient DSSS to synthesize diverse style shifts via learnable\nstyle bases with two specifically designed losses in a hyperspherical feature\nspace. Our method eliminates the need for model updates at the test time and\ncan be seamlessly integrated into not only the CNN but also ViT backbones.\nComprehensive experiments on widely used cross-domain FAS benchmarks\ndemonstrate our method's state-of-the-art performance and effectiveness.\n","authors":["Qianyu Zhou","Ke-Yue Zhang","Taiping Yao","Xuequan Lu","Shouhong Ding","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2403.19334v1.pdf","comment":"Accepted to IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR), 2024"},{"id":"http://arxiv.org/abs/2403.18339v2","updated":"2024-03-28T11:46:25Z","published":"2024-03-27T08:28:14Z","title":"H2ASeg: Hierarchical Adaptive Interaction and Weighting Network for\n Tumor Segmentation in PET/CT Images","summary":" Positron emission tomography (PET) combined with computed tomography (CT)\nimaging is routinely used in cancer diagnosis and prognosis by providing\ncomplementary information. Automatically segmenting tumors in PET/CT images can\nsignificantly improve examination efficiency. Traditional multi-modal\nsegmentation solutions mainly rely on concatenation operations for modality\nfusion, which fail to effectively model the non-linear dependencies between PET\nand CT modalities. 
Recent studies have investigated various approaches to\noptimize the fusion of modality-specific features for enhancing joint\nrepresentations. However, modality-specific encoders used in these methods\noperate independently, inadequately leveraging the synergistic relationships\ninherent in PET and CT modalities, for example, the complementarity between\nsemantics and structure. To address these issues, we propose a Hierarchical\nAdaptive Interaction and Weighting Network termed H2ASeg to explore the\nintrinsic cross-modal correlations and transfer potential complementary\ninformation. Specifically, we design a Modality-Cooperative Spatial Attention\n(MCSA) module that performs intra- and inter-modal interactions globally and\nlocally. Additionally, a Target-Aware Modality Weighting (TAMW) module is\ndeveloped to highlight tumor-related features within multi-modal features,\nthereby refining tumor segmentation. By embedding these modules across\ndifferent layers, H2ASeg can hierarchically model cross-modal correlations,\nenabling a nuanced understanding of both semantic and structural tumor\nfeatures. Extensive experiments demonstrate the superiority of H2ASeg,\noutperforming state-of-the-art methods on AutoPet-II and Hecktor2022\nbenchmarks. The code is released at https://github.com/JinPLu/H2ASeg.\n","authors":["Jinpeng Lu","Jingyun Chen","Linghan Cai","Songhan Jiang","Yongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.18339v2.pdf","comment":"10 pages,4 figures"},{"id":"http://arxiv.org/abs/2311.15596v2","updated":"2024-03-28T11:35:55Z","published":"2023-11-27T07:44:25Z","title":"EgoThink: Evaluating First-Person Perspective Thinking Capability of\n Vision-Language Models","summary":" Vision-language models (VLMs) have recently shown promising results in\ntraditional downstream tasks. Evaluation studies have emerged to assess their\nabilities, with the majority focusing on the third-person perspective, and only\na few addressing specific tasks from the first-person perspective. However, the\ncapability of VLMs to \"think\" from a first-person perspective, a crucial\nattribute for advancing autonomous agents and robotics, remains largely\nunexplored. To bridge this research gap, we introduce EgoThink, a novel visual\nquestion-answering benchmark that encompasses six core capabilities with twelve\ndetailed dimensions. The benchmark is constructed using selected clips from\negocentric videos, with manually annotated question-answer pairs containing\nfirst-person information. To comprehensively assess VLMs, we evaluate eighteen\npopular VLMs on EgoThink. Moreover, given the open-ended format of the answers,\nwe use GPT-4 as the automatic judge to compute single-answer grading.\nExperimental results indicate that although GPT-4V leads in numerous\ndimensions, all evaluated VLMs still possess considerable potential for\nimprovement in first-person perspective tasks. Meanwhile, enlarging the number\nof trainable parameters has the most significant impact on model performance on\nEgoThink. 
In conclusion, EgoThink serves as a valuable addition to existing\nevaluation benchmarks for VLMs, providing an indispensable resource for future\nresearch in the realm of embodied artificial intelligence and robotics.\n","authors":["Sijie Cheng","Zhicheng Guo","Jingwen Wu","Kechen Fang","Peng Li","Huaping Liu","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15596v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19326v1","updated":"2024-03-28T11:33:02Z","published":"2024-03-28T11:33:02Z","title":"MedBN: Robust Test-Time Adaptation against Malicious Test Samples","summary":" Test-time adaptation (TTA) has emerged as a promising solution to address\nperformance decay due to unforeseen distribution shifts between training and\ntest data. While recent TTA methods excel in adapting to test data variations,\nsuch adaptability exposes a model to vulnerability against malicious examples,\nan aspect that has received limited attention. Previous studies have uncovered\nsecurity vulnerabilities within TTA even when a small proportion of the test\nbatch is maliciously manipulated. In response to the emerging threat, we\npropose median batch normalization (MedBN), leveraging the robustness of the\nmedian for statistics estimation within the batch normalization layer during\ntest-time inference. Our method is algorithm-agnostic, thus allowing seamless\nintegration with existing TTA frameworks. Our experimental results on benchmark\ndatasets, including CIFAR10-C, CIFAR100-C and ImageNet-C, consistently\ndemonstrate that MedBN outperforms existing approaches in maintaining robust\nperformance across different attack scenarios, encompassing both instant and\ncumulative attacks. Through extensive experiments, we show that our approach\nsustains the performance even in the absence of attacks, achieving a practical\nbalance between robustness and performance.\n","authors":["Hyejin Park","Jeongyeon Hwang","Sunung Mun","Sangdon Park","Jungseul Ok"],"pdf_url":"https://arxiv.org/pdf/2403.19326v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19322v1","updated":"2024-03-28T11:26:30Z","published":"2024-03-28T11:26:30Z","title":"Plug-and-Play Grounding of Reasoning in Multimodal Large Language Models","summary":" The surge of Multimodal Large Language Models (MLLMs), given their prominent\nemergent capabilities in instruction following and reasoning, has greatly\nadvanced the field of visual reasoning. However, constrained by their\nnon-lossless image tokenization, most MLLMs fall short of comprehensively\ncapturing details of text and objects, especially in high-resolution images. To\naddress this, we propose P2G, a novel framework for plug-and-play grounding of\nreasoning in MLLMs. Specifically, P2G exploits the tool-usage potential of\nMLLMs to employ expert agents to achieve on-the-fly grounding to critical\nvisual and textual objects of image, thus achieving deliberate reasoning via\nmultimodal prompting. We further create P2GB, a benchmark aimed at assessing\nMLLMs' ability to understand inter-object relationships and text in challenging\nhigh-resolution images. Comprehensive experiments on visual reasoning tasks\ndemonstrate the superiority of P2G. Noteworthy, P2G achieved comparable\nperformance with GPT-4V on P2GB, with a 7B backbone. 
Our work highlights the\npotential of plug-and-play grounding of reasoning and opens up a promising\nalternative beyond model scaling.\n","authors":["Jiaxing Chen","Yuxuan Liu","Dehu Li","Xiang An","Ziyong Feng","Yongle Zhao","Yin Xie"],"pdf_url":"https://arxiv.org/pdf/2403.19322v1.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.19319v1","updated":"2024-03-28T11:22:53Z","published":"2024-03-28T11:22:53Z","title":"Mesh2NeRF: Direct Mesh Supervision for Neural Radiance Field\n Representation and Generation","summary":" We present Mesh2NeRF, an approach to derive ground-truth radiance fields from\ntextured meshes for 3D generation tasks. Many 3D generative approaches\nrepresent 3D scenes as radiance fields for training. Their ground-truth\nradiance fields are usually fitted from multi-view renderings from a\nlarge-scale synthetic 3D dataset, which often results in artifacts due to\nocclusions or under-fitting issues. In Mesh2NeRF, we propose an analytic\nsolution to directly obtain ground-truth radiance fields from 3D meshes,\ncharacterizing the density field with an occupancy function featuring a defined\nsurface thickness, and determining view-dependent color through a reflection\nfunction considering both the mesh and environment lighting. Mesh2NeRF extracts\naccurate radiance fields which provides direct supervision for training\ngenerative NeRFs and single scene representation. We validate the effectiveness\nof Mesh2NeRF across various tasks, achieving a noteworthy 3.12dB improvement in\nPSNR for view synthesis in single scene representation on the ABO dataset, a\n0.69 PSNR enhancement in the single-view conditional generation of ShapeNet\nCars, and notably improved mesh extraction from NeRF in the unconditional\ngeneration of Objaverse Mugs.\n","authors":["Yujin Chen","Yinyu Nie","Benjamin Ummenhofer","Reiner Birkl","Michael Paulitsch","Matthias Müller","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2403.19319v1.pdf","comment":"Project page: https://terencecyj.github.io/projects/Mesh2NeRF/ Video:\n https://youtu.be/oufv1N3f7iY"},{"id":"http://arxiv.org/abs/2403.19316v1","updated":"2024-03-28T11:17:00Z","published":"2024-03-28T11:17:00Z","title":"Hypergraph-based Multi-View Action Recognition using Event Cameras","summary":" Action recognition from video data forms a cornerstone with wide-ranging\napplications. Single-view action recognition faces limitations due to its\nreliance on a single viewpoint. In contrast, multi-view approaches capture\ncomplementary information from various viewpoints for improved accuracy.\nRecently, event cameras have emerged as innovative bio-inspired sensors,\nleading to advancements in event-based action recognition. However, existing\nworks predominantly focus on single-view scenarios, leaving a gap in multi-view\nevent data exploitation, particularly in challenges like information deficit\nand semantic misalignment. To bridge this gap, we introduce HyperMV, a\nmulti-view event-based action recognition framework. HyperMV converts discrete\nevent data into frame-like representations and extracts view-related features\nusing a shared convolutional network. By treating segments as vertices and\nconstructing hyperedges using rule-based and KNN-based strategies, a multi-view\nhypergraph neural network that captures relationships across viewpoint and\ntemporal features is established. The vertex attention hypergraph propagation\nis also introduced for enhanced feature fusion. 
To prompt research in this\narea, we present the largest multi-view event-based action dataset\n$\\text{THU}^{\\text{MV-EACT}}\\text{-50}$, comprising 50 actions from 6\nviewpoints, which surpasses existing datasets by over tenfold. Experimental\nresults show that HyperMV significantly outperforms baselines in both\ncross-subject and cross-view scenarios, and also exceeds the state-of-the-arts\nin frame-based multi-view action recognition.\n","authors":["Yue Gao","Jiaxuan Lu","Siqi Li","Yipeng Li","Shaoyi Du"],"pdf_url":"https://arxiv.org/pdf/2403.19316v1.pdf","comment":"Accepted by IEEE Transactions on Pattern Analysis and Machine\n Intelligence (TPAMI 2024)"},{"id":"http://arxiv.org/abs/2403.19314v1","updated":"2024-03-28T11:12:33Z","published":"2024-03-28T11:12:33Z","title":"Total-Decom: Decomposed 3D Scene Reconstruction with Minimal Interaction","summary":" Scene reconstruction from multi-view images is a fundamental problem in\ncomputer vision and graphics. Recent neural implicit surface reconstruction\nmethods have achieved high-quality results; however, editing and manipulating\nthe 3D geometry of reconstructed scenes remains challenging due to the absence\nof naturally decomposed object entities and complex object/background\ncompositions. In this paper, we present Total-Decom, a novel method for\ndecomposed 3D reconstruction with minimal human interaction. Our approach\nseamlessly integrates the Segment Anything Model (SAM) with hybrid\nimplicit-explicit neural surface representations and a mesh-based\nregion-growing technique for accurate 3D object decomposition. Total-Decom\nrequires minimal human annotations while providing users with real-time control\nover the granularity and quality of decomposition. We extensively evaluate our\nmethod on benchmark datasets and demonstrate its potential for downstream\napplications, such as animation and scene editing. The code is available at\n\\href{https://github.com/CVMI-Lab/Total-Decom.git}{https://github.com/CVMI-Lab/Total-Decom.git}.\n","authors":["Xiaoyang Lyu","Chirui Chang","Peng Dai","Yang-tian Sun","Xiaojuang Qi"],"pdf_url":"https://arxiv.org/pdf/2403.19314v1.pdf","comment":"8 pages, 7 figures, accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2211.01579v3","updated":"2024-03-28T10:53:54Z","published":"2022-11-03T04:19:27Z","title":"Data-free Defense of Black Box Models Against Adversarial Attacks","summary":" Several companies often safeguard their trained deep models (i.e., details of\narchitecture, learnt weights, training details etc.) from third-party users by\nexposing them only as black boxes through APIs. Moreover, they may not even\nprovide access to the training data due to proprietary reasons or sensitivity\nconcerns. In this work, we propose a novel defense mechanism for black box\nmodels against adversarial attacks in a data-free set up. We construct\nsynthetic data via generative model and train surrogate network using model\nstealing techniques. To minimize adversarial contamination on perturbed\nsamples, we propose 'wavelet noise remover' (WNR) that performs discrete\nwavelet decomposition on input images and carefully select only a few important\ncoefficients determined by our 'wavelet coefficient selection module' (WCSM).\nTo recover the high-frequency content of the image after noise removal via WNR,\nwe further train a 'regenerator' network with an objective to retrieve the\ncoefficients such that the reconstructed image yields similar to original\npredictions on the surrogate model. 
At test time, WNR combined with trained\nregenerator network is prepended to the black box network, resulting in a high\nboost in adversarial accuracy. Our method improves the adversarial accuracy on\nCIFAR-10 by 38.98% and 32.01% on state-of-the-art Auto Attack compared to\nbaseline, even when the attacker uses surrogate architecture (Alexnet-half and\nAlexnet) similar to the black box architecture (Alexnet) with same model\nstealing strategy as defender. The code is available at\nhttps://github.com/vcl-iisc/data-free-black-box-defense\n","authors":["Gaurav Kumar Nayak","Inder Khatri","Ruchit Rawal","Anirban Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2211.01579v3.pdf","comment":"CVPR Workshop (Under Review)"},{"id":"http://arxiv.org/abs/2403.19306v1","updated":"2024-03-28T10:42:49Z","published":"2024-03-28T10:42:49Z","title":"Sparse Generation: Making Pseudo Labels Sparse for weakly supervision\n with points","summary":" In recent years, research on point weakly supervised object detection (PWSOD)\nmethods in the field of computer vision has attracted people's attention.\nHowever, existing pseudo labels generation methods perform poorly in a small\namount of supervised annotation data and dense object detection tasks. We\nconsider the generation of weakly supervised pseudo labels as the result of\nmodel's sparse output, and propose a method called Sparse Generation to make\npseudo labels sparse. It constructs dense tensors through the relationship\nbetween data and detector model, optimizes three of its parameters, and obtains\na sparse tensor via coordinated calculation, thereby indirectly obtaining\nhigher quality pseudo labels, and solving the model's density problem in the\nsituation of only a small amount of supervised annotation data can be used. On\ntwo broadly used open-source datasets (RSOD, SIMD) and a self-built dataset\n(Bullet-Hole), the experimental results showed that the proposed method has a\nsignificant advantage in terms of overall performance metrics, comparing to\nthat state-of-the-art method.\n","authors":["Tian Ma","Chuyang Shang","Wanzhu Ren","Yuancheng Li","Jiiayi Yang","Jiali Qian"],"pdf_url":"https://arxiv.org/pdf/2403.19306v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19294v1","updated":"2024-03-28T10:31:23Z","published":"2024-03-28T10:31:23Z","title":"FlowDepth: Decoupling Optical Flow for Self-Supervised Monocular Depth\n Estimation","summary":" Self-supervised multi-frame methods have currently achieved promising results\nin depth estimation. However, these methods often suffer from mismatch problems\ndue to the moving objects, which break the static assumption. Additionally,\nunfairness can occur when calculating photometric errors in high-freq or\nlow-texture regions of the images. To address these issues, existing approaches\nuse additional semantic priori black-box networks to separate moving objects\nand improve the model only at the loss level. Therefore, we propose FlowDepth,\nwhere a Dynamic Motion Flow Module (DMFM) decouples the optical flow by a\nmechanism-based approach and warps the dynamic regions thus solving the\nmismatch problem. For the unfairness of photometric errors caused by high-freq\nand low-texture regions, we use Depth-Cue-Aware Blur (DCABlur) and Cost-Volume\nsparsity loss respectively at the input and the loss level to solve the\nproblem. 
Experimental results on the KITTI and Cityscapes datasets show that\nour method outperforms the state-of-the-art methods.\n","authors":["Yiyang Sun","Zhiyuan Xu","Xiaonian Wang","Jing Yao"],"pdf_url":"https://arxiv.org/pdf/2403.19294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19278v1","updated":"2024-03-28T10:02:08Z","published":"2024-03-28T10:02:08Z","title":"CAT: Exploiting Inter-Class Dynamics for Domain Adaptive Object\n Detection","summary":" Domain adaptive object detection aims to adapt detection models to domains\nwhere annotated data is unavailable. Existing methods have been proposed to\naddress the domain gap using the semi-supervised student-teacher framework.\nHowever, a fundamental issue arises from the class imbalance in the labelled\ntraining set, which can result in inaccurate pseudo-labels. The relationship\nbetween classes, especially where one class is a majority and the other\nminority, has a large impact on class bias. We propose Class-Aware Teacher\n(CAT) to address the class bias issue in the domain adaptation setting. In our\nwork, we approximate the class relationships with our Inter-Class Relation\nmodule (ICRm) and exploit it to reduce the bias within the model. In this way,\nwe are able to apply augmentations to highly related classes, both inter- and\nintra-domain, to boost the performance of minority classes while having minimal\nimpact on majority classes. We further reduce the bias by implementing a\nclass-relation weight to our classification loss. Experiments conducted on\nvarious datasets and ablation studies show that our method is able to address\nthe class bias in the domain adaptation setting. On the Cityscapes to Foggy\nCityscapes dataset, we attained a 52.5 mAP, a substantial improvement over the\n51.2 mAP achieved by the state-of-the-art method.\n","authors":["Mikhail Kennerley","Jian-Gang Wang","Bharadwaj Veeravalli","Robby T. Tan"],"pdf_url":"https://arxiv.org/pdf/2403.19278v1.pdf","comment":"Accepted into CVPR 2024"},{"id":"http://arxiv.org/abs/2307.10924v2","updated":"2024-03-28T09:54:38Z","published":"2023-07-20T14:51:28Z","title":"Intrinsic Image Decomposition Using Point Cloud Representation","summary":" The purpose of intrinsic decomposition is to separate an image into its\nalbedo (reflective properties) and shading components (illumination\nproperties). This is challenging because it's an ill-posed problem.\nConventional approaches primarily concentrate on 2D imagery and fail to fully\nexploit the capabilities of 3D data representation. 3D point clouds offer a\nmore comprehensive format for representing scenes, as they combine geometric\nand color information effectively. To this end, in this paper, we introduce\nPoint Intrinsic Net (PoInt-Net), which leverages 3D point cloud data to\nconcurrently estimate albedo and shading maps. The merits of PoInt-Net include\nthe following aspects. First, the model is efficient, achieving consistent\nperformance across point clouds of any size with training only required on\nsmall-scale point clouds. Second, it exhibits remarkable robustness; even when\ntrained exclusively on datasets comprising individual objects, PoInt-Net\ndemonstrates strong generalization to unseen objects and scenes. Third, it\ndelivers superior accuracy over conventional 2D approaches, demonstrating\nenhanced performance across various metrics on different datasets. 
(Code\nReleased)\n","authors":["Xiaoyan Xing","Konrad Groh","Sezer Karaoglu","Theo Gevers"],"pdf_url":"https://arxiv.org/pdf/2307.10924v2.pdf","comment":"Code: https://github.com/xyxingx/PoInt-Net"},{"id":"http://arxiv.org/abs/2403.17633v2","updated":"2024-03-28T09:47:45Z","published":"2024-03-26T12:08:14Z","title":"UADA3D: Unsupervised Adversarial Domain Adaptation for 3D Object\n Detection with Sparse LiDAR and Large Domain Gaps","summary":" In this study, we address a gap in existing unsupervised domain adaptation\napproaches on LiDAR-based 3D object detection, which have predominantly\nconcentrated on adapting between established, high-density autonomous driving\ndatasets. We focus on sparser point clouds, capturing scenarios from different\nperspectives: not just from vehicles on the road but also from mobile robots on\nsidewalks, which encounter significantly different environmental conditions and\nsensor configurations. We introduce Unsupervised Adversarial Domain Adaptation\nfor 3D Object Detection (UADA3D). UADA3D does not depend on pre-trained source\nmodels or teacher-student architectures. Instead, it uses an adversarial\napproach to directly learn domain-invariant features. We demonstrate its\nefficacy in various adaptation scenarios, showing significant improvements in\nboth self-driving car and mobile robot domains. Our code is open-source and\nwill be available soon.\n","authors":["Maciej K Wozniak","Mattias Hansson","Marko Thiel","Patric Jensfelt"],"pdf_url":"https://arxiv.org/pdf/2403.17633v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19265v1","updated":"2024-03-28T09:44:20Z","published":"2024-03-28T09:44:20Z","title":"Neural Fields for 3D Tracking of Anatomy and Surgical Instruments in\n Monocular Laparoscopic Video Clips","summary":" Laparoscopic video tracking primarily focuses on two target types: surgical\ninstruments and anatomy. The former could be used for skill assessment, while\nthe latter is necessary for the projection of virtual overlays. Where\ninstrument and anatomy tracking have often been considered two separate\nproblems, in this paper, we propose a method for joint tracking of all\nstructures simultaneously. Based on a single 2D monocular video clip, we train\na neural field to represent a continuous spatiotemporal scene, used to create\n3D tracks of all surfaces visible in at least one frame. Due to the small size\nof instruments, they generally cover a small part of the image only, resulting\nin decreased tracking accuracy. Therefore, we propose enhanced class weighting\nto improve the instrument tracks. We evaluate tracking on video clips from\nlaparoscopic cholecystectomies, where we find mean tracking accuracies of 92.4%\nfor anatomical structures and 87.4% for instruments. Additionally, we assess\nthe quality of depth maps obtained from the method's scene reconstructions. We\nshow that these pseudo-depths have comparable quality to a state-of-the-art\npre-trained depth estimator. On laparoscopic videos in the SCARED dataset, the\nmethod predicts depth with an MAE of 2.9 mm and a relative error of 9.2%. These\nresults show the feasibility of using neural fields for monocular 3D\nreconstruction of laparoscopic scenes.\n","authors":["Beerend G. A. Gerats","Jelmer M. Wolterink","Seb P. Mol","Ivo A. M. J. 
Broeders"],"pdf_url":"https://arxiv.org/pdf/2403.19265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03795v3","updated":"2024-03-28T09:40:08Z","published":"2023-12-06T14:13:54Z","title":"AnimatableDreamer: Text-Guided Non-rigid 3D Model Generation and\n Reconstruction with Canonical Score Distillation","summary":" Advances in 3D generation have facilitated sequential 3D model generation\n(a.k.a 4D generation), yet its application for animatable objects with large\nmotion remains scarce. Our work proposes AnimatableDreamer, a text-to-4D\ngeneration framework capable of generating diverse categories of non-rigid\nobjects on skeletons extracted from a monocular video. At its core,\nAnimatableDreamer is equipped with our novel optimization design dubbed\nCanonical Score Distillation (CSD), which lifts 2D diffusion for temporal\nconsistent 4D generation. CSD, designed from a score gradient perspective,\ngenerates a canonical model with warp-robustness across different\narticulations. Notably, it also enhances the authenticity of bones and skinning\nby integrating inductive priors from a diffusion model. Furthermore, with\nmulti-view distillation, CSD infers invisible regions, thereby improving the\nfidelity of monocular non-rigid reconstruction. Extensive experiments\ndemonstrate the capability of our method in generating high-flexibility\ntext-guided 3D models from the monocular video, while also showing improved\nreconstruction performance over existing non-rigid reconstruction methods.\n","authors":["Xinzhou Wang","Yikai Wang","Junliang Ye","Zhengyi Wang","Fuchun Sun","Pengkun Liu","Ling Wang","Kai Sun","Xintong Wang","Bin He"],"pdf_url":"https://arxiv.org/pdf/2312.03795v3.pdf","comment":"Project page: https://animatabledreamer.github.io/"},{"id":"http://arxiv.org/abs/2311.15977v2","updated":"2024-03-28T09:31:05Z","published":"2023-11-27T16:23:01Z","title":"Text2Loc: 3D Point Cloud Localization from Natural Language","summary":" We tackle the problem of 3D point cloud localization based on a few natural\nlinguistic descriptions and introduce a novel neural network, Text2Loc, that\nfully interprets the semantic relationship between points and text. Text2Loc\nfollows a coarse-to-fine localization pipeline: text-submap global place\nrecognition, followed by fine localization. In global place recognition,\nrelational dynamics among each textual hint are captured in a hierarchical\ntransformer with max-pooling (HTM), whereas a balance between positive and\nnegative pairs is maintained using text-submap contrastive learning. Moreover,\nwe propose a novel matching-free fine localization method to further refine the\nlocation predictions, which completely removes the need for complicated\ntext-instance matching and is lighter, faster, and more accurate than previous\nmethods. Extensive experiments show that Text2Loc improves the localization\naccuracy by up to $2\\times$ over the state-of-the-art on the KITTI360Pose\ndataset. Our project page is publicly available at\n\\url{https://yan-xia.github.io/projects/text2loc/}.\n","authors":["Yan Xia","Letian Shi","Zifeng Ding","João F. Henriques","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2311.15977v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19254v1","updated":"2024-03-28T09:21:00Z","published":"2024-03-28T09:21:00Z","title":"Imperceptible Protection against Style Imitation from Diffusion Models","summary":" Recent progress in diffusion models has profoundly enhanced the fidelity of\nimage generation. 
However, this has raised concerns about copyright\ninfringements. While prior methods have introduced adversarial perturbations to\nprevent style imitation, most are accompanied by the degradation of artworks'\nvisual quality. Recognizing the importance of maintaining this, we develop a\nvisually improved protection method that preserves its protection capability.\nTo this end, we create a perceptual map to identify areas most sensitive to\nhuman eyes. We then adjust the protection intensity guided by an instance-aware\nrefinement. We also integrate a perceptual constraints bank to further improve\nthe imperceptibility. Results show that our method substantially elevates the\nquality of the protected image without compromising on protection efficacy.\n","authors":["Namhyuk Ahn","Wonhyuk Ahn","KiYoon Yoo","Daesik Kim","Seung-Hun Nam"],"pdf_url":"https://arxiv.org/pdf/2403.19254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08080v2","updated":"2024-03-28T09:20:33Z","published":"2023-10-12T07:10:12Z","title":"RT-SRTS: Angle-Agnostic Real-Time Simultaneous 3D Reconstruction and\n Tumor Segmentation from Single X-Ray Projection","summary":" Radiotherapy is one of the primary treatment methods for tumors, but the\norgan movement caused by respiration limits its accuracy. Recently, 3D imaging\nfrom a single X-ray projection has received extensive attention as a promising\napproach to address this issue. However, current methods can only reconstruct\n3D images without directly locating the tumor and are only validated for\nfixed-angle imaging, which fails to fully meet the requirements of motion\ncontrol in radiotherapy. In this study, a novel imaging method RT-SRTS is\nproposed which integrates 3D imaging and tumor segmentation into one network\nbased on multi-task learning (MTL) and achieves real-time simultaneous 3D\nreconstruction and tumor segmentation from a single X-ray projection at any\nangle. Furthermore, the attention enhanced calibrator (AEC) and\nuncertain-region elaboration (URE) modules have been proposed to aid feature\nextraction and improve segmentation accuracy. The proposed method was evaluated\non fifteen patient cases and compared with three state-of-the-art methods. It\nnot only delivers superior 3D reconstruction but also demonstrates commendable\ntumor segmentation results. Simultaneous reconstruction and segmentation can be\ncompleted in approximately 70 ms, significantly faster than the required time\nthreshold for real-time tumor tracking. The efficacies of both AEC and URE have\nalso been validated in ablation studies. The code of work is available at\nhttps://github.com/ZywooSimple/RT-SRTS.\n","authors":["Miao Zhu","Qiming Fu","Bo Liu","Mengxi Zhang","Bojian Li","Xiaoyan Luo","Fugen Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.08080v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11342v2","updated":"2024-03-28T09:20:19Z","published":"2023-04-22T07:48:17Z","title":"NaviNeRF: NeRF-based 3D Representation Disentanglement by Latent\n Semantic Navigation","summary":" 3D representation disentanglement aims to identify, decompose, and manipulate\nthe underlying explanatory factors of 3D data, which helps AI fundamentally\nunderstand our 3D world. This task is currently under-explored and poses great\nchallenges: (i) the 3D representations are complex and in general contains much\nmore information than 2D image; (ii) many 3D representations are not well\nsuited for gradient-based optimization, let alone disentanglement. 
To address\nthese challenges, we use NeRF as a differentiable 3D representation, and\nintroduce a self-supervised Navigation to identify interpretable semantic\ndirections in the latent space. To our best knowledge, this novel method,\ndubbed NaviNeRF, is the first work to achieve fine-grained 3D disentanglement\nwithout any priors or supervisions. Specifically, NaviNeRF is built upon the\ngenerative NeRF pipeline, and equipped with an Outer Navigation Branch and an\nInner Refinement Branch. They are complementary -- the outer navigation is to\nidentify global-view semantic directions, and the inner refinement dedicates to\nfine-grained attributes. A synergistic loss is further devised to coordinate\ntwo branches. Extensive experiments demonstrate that NaviNeRF has a superior\nfine-grained 3D disentanglement ability than the previous 3D-aware models. Its\nperformance is also comparable to editing-oriented models relying on semantic\nor geometry priors.\n","authors":["Baao Xie","Bohan Li","Zequn Zhang","Junting Dong","Xin Jin","Jingyu Yang","Wenjun Zeng"],"pdf_url":"https://arxiv.org/pdf/2304.11342v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19243v1","updated":"2024-03-28T08:58:20Z","published":"2024-03-28T08:58:20Z","title":"Sine Activated Low-Rank Matrices for Parameter Efficient Learning","summary":" Low-rank decomposition has emerged as a vital tool for enhancing parameter\nefficiency in neural network architectures, gaining traction across diverse\napplications in machine learning. These techniques significantly lower the\nnumber of parameters, striking a balance between compactness and performance.\nHowever, a common challenge has been the compromise between parameter\nefficiency and the accuracy of the model, where reduced parameters often lead\nto diminished accuracy compared to their full-rank counterparts. In this work,\nwe propose a novel theoretical framework that integrates a sinusoidal function\nwithin the low-rank decomposition process. This approach not only preserves the\nbenefits of the parameter efficiency characteristic of low-rank methods but\nalso increases the decomposition's rank, thereby enhancing model accuracy. Our\nmethod proves to be an adaptable enhancement for existing low-rank models, as\nevidenced by its successful application in Vision Transformers (ViT), Large\nLanguage Models (LLMs), Neural Radiance Fields (NeRF), and 3D shape modeling.\nThis demonstrates the wide-ranging potential and efficiency of our proposed\ntechnique.\n","authors":["Yiping Ji","Hemanth Saratchandran","Cameron Gordon","Zeyu Zhang","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2403.19243v1.pdf","comment":"The first two authors contributed equally"},{"id":"http://arxiv.org/abs/2403.19242v1","updated":"2024-03-28T08:54:40Z","published":"2024-03-28T08:54:40Z","title":"RTracker: Recoverable Tracking via PN Tree Structured Memory","summary":" Existing tracking methods mainly focus on learning better target\nrepresentation or developing more robust prediction models to improve tracking\nperformance. While tracking performance has significantly improved, the target\nloss issue occurs frequently due to tracking failures, complete occlusion, or\nout-of-view situations. However, considerably less attention is paid to the\nself-recovery issue of tracking methods, which is crucial for practical\napplications. 
To this end, we propose a recoverable tracking framework,\nRTracker, that uses a tree-structured memory to dynamically associate a tracker\nand a detector to enable self-recovery ability. Specifically, we propose a\nPositive-Negative Tree-structured memory to chronologically store and maintain\npositive and negative target samples. Upon the PN tree memory, we develop\ncorresponding walking rules for determining the state of the target and define\na set of control flows to unite the tracker and the detector in different\ntracking scenarios. Our core idea is to use the support samples of positive and\nnegative target categories to establish a relative distance-based criterion for\na reliable assessment of target loss. The favorable performance in comparison\nagainst the state-of-the-art methods on numerous challenging benchmarks\ndemonstrates the effectiveness of the proposed algorithm.\n","authors":["Yuqing Huang","Xin Li","Zikun Zhou","Yaowei Wang","Zhenyu He","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19242v1.pdf","comment":"accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19238v1","updated":"2024-03-28T08:49:35Z","published":"2024-03-28T08:49:35Z","title":"Taming Lookup Tables for Efficient Image Retouching","summary":" The widespread use of high-definition screens in edge devices, such as\nend-user cameras, smartphones, and televisions, is spurring a significant\ndemand for image enhancement. Existing enhancement models often optimize for\nhigh performance while falling short of reducing hardware inference time and\npower consumption, especially on edge devices with constrained computing and\nstorage resources. To this end, we propose Image Color Enhancement Lookup Table\n(ICELUT) that adopts LUTs for extremely efficient edge inference, without any\nconvolutional neural network (CNN). During training, we leverage pointwise\n(1x1) convolution to extract color information, alongside a split fully\nconnected layer to incorporate global information. Both components are then\nseamlessly converted into LUTs for hardware-agnostic deployment. ICELUT\nachieves near-state-of-the-art performance and remarkably low power\nconsumption. We observe that the pointwise network structure exhibits robust\nscalability, upkeeping the performance even with a heavily downsampled 32x32\ninput image. These enable ICELUT, the first-ever purely LUT-based image\nenhancer, to reach an unprecedented speed of 0.4ms on GPU and 7ms on CPU, at\nleast one order faster than any CNN solution. Codes are available at\nhttps://github.com/Stephen0808/ICELUT.\n","authors":["Sidi Yang","Binxiao Huang","Mingdeng Cao","Yatai Ji","Hanzhong Guo","Ngai Wong","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19654v2","updated":"2024-03-28T08:47:14Z","published":"2023-10-30T15:38:43Z","title":"MCAD: Multi-teacher Cross-modal Alignment Distillation for efficient\n image-text retrieval","summary":" Due to the success of large-scale visual-language pretraining (VLP) models\nand the widespread use of image-text retrieval in industry areas, it is now\ncritically necessary to reduce the model size and streamline their\nmobile-device deployment. Single- and dual-stream model structures are commonly\nused in image-text retrieval with the goal of closing the semantic gap between\ntextual and visual modalities. 
While single-stream models use deep feature\nfusion to achieve more accurate cross-model alignment, dual-stream models are\nbetter at offline indexing and fast inference.We propose a Multi-teacher\nCross-modality Alignment Distillation (MCAD) technique to integrate the\nadvantages of single- and dual-stream models. By incorporating the fused\nsingle-stream features into the image and text features of the dual-stream\nmodel, we formulate new modified teacher similarity distributions and features.\nThen, we conduct both distribution and feature distillation to boost the\ncapability of the student dual-stream model, achieving high retrieval\nperformance without increasing inference complexity.Extensive experiments\ndemonstrate the remarkable performance and high efficiency of MCAD on\nimage-text retrieval tasks. Furthermore, we implement a lightweight CLIP model\non Snapdragon/Dimensity chips with only $\\sim$100M running memory and\n$\\sim$8.0ms search latency, achieving the mobile-device application of VLP\nmodels.\n","authors":["Youbo Lei","Feifei He","Chen Chen","Yingbin Mo","Si Jia Li","Defeng Xie","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2310.19654v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19235v1","updated":"2024-03-28T08:47:02Z","published":"2024-03-28T08:47:02Z","title":"DreamSalon: A Staged Diffusion Framework for Preserving Identity-Context\n in Editable Face Generation","summary":" While large-scale pre-trained text-to-image models can synthesize diverse and\nhigh-quality human-centered images, novel challenges arise with a nuanced task\nof \"identity fine editing\": precisely modifying specific features of a subject\nwhile maintaining its inherent identity and context. Existing personalization\nmethods either require time-consuming optimization or learning additional\nencoders, adept in \"identity re-contextualization\". However, they often\nstruggle with detailed and sensitive tasks like human face editing. To address\nthese challenges, we introduce DreamSalon, a noise-guided, staged-editing\nframework, uniquely focusing on detailed image manipulations and\nidentity-context preservation. By discerning editing and boosting stages via\nthe frequency and gradient of predicted noises, DreamSalon first performs\ndetailed manipulations on specific features in the editing stage, guided by\nhigh-frequency information, and then employs stochastic denoising in the\nboosting stage to improve image quality. For more precise editing, DreamSalon\nsemantically mixes source and target textual prompts, guided by differences in\ntheir embedding covariances, to direct the model's focus on specific\nmanipulation areas. Our experiments demonstrate DreamSalon's ability to\nefficiently and faithfully edit fine details on human faces, outperforming\nexisting methods both qualitatively and quantitatively.\n","authors":["Haonan Lin","Mengmeng Wang","Yan Chen","Wenbin An","Yuzhe Yao","Guang Dai","Qianying Wang","Yong Liu","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2403.19235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19232v1","updated":"2024-03-28T08:44:36Z","published":"2024-03-28T08:44:36Z","title":"AZ-NAS: Assembling Zero-Cost Proxies for Network Architecture Search","summary":" Training-free network architecture search (NAS) aims to discover\nhigh-performing networks with zero-cost proxies, capturing network\ncharacteristics related to the final performance. 
However, network rankings\nestimated by previous training-free NAS methods have shown weak correlations\nwith the performance. To address this issue, we propose AZ-NAS, a novel\napproach that leverages the ensemble of various zero-cost proxies to enhance\nthe correlation between a predicted ranking of networks and the ground truth\nsubstantially in terms of the performance. To achieve this, we introduce four\nnovel zero-cost proxies that are complementary to each other, analyzing\ndistinct traits of architectures in the views of expressivity, progressivity,\ntrainability, and complexity. The proxy scores can be obtained simultaneously\nwithin a single forward and backward pass, making an overall NAS process highly\nefficient. In order to integrate the rankings predicted by our proxies\neffectively, we introduce a non-linear ranking aggregation method that\nhighlights the networks highly-ranked consistently across all the proxies.\nExperimental results conclusively demonstrate the efficacy and efficiency of\nAZ-NAS, outperforming state-of-the-art methods on standard benchmarks, all\nwhile maintaining a reasonable runtime cost.\n","authors":["Junghyup Lee","Bumsub Ham"],"pdf_url":"https://arxiv.org/pdf/2403.19232v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2401.11874v2","updated":"2024-03-28T08:40:08Z","published":"2024-01-22T12:00:37Z","title":"Detect-Order-Construct: A Tree Construction based Approach for\n Hierarchical Document Structure Analysis","summary":" Document structure analysis (aka document layout analysis) is crucial for\nunderstanding the physical layout and logical structure of documents, with\napplications in information retrieval, document summarization, knowledge\nextraction, etc. In this paper, we concentrate on Hierarchical Document\nStructure Analysis (HDSA) to explore hierarchical relationships within\nstructured documents created using authoring software employing hierarchical\nschemas, such as LaTeX, Microsoft Word, and HTML. To comprehensively analyze\nhierarchical document structures, we propose a tree construction based approach\nthat addresses multiple subtasks concurrently, including page object detection\n(Detect), reading order prediction of identified objects (Order), and the\nconstruction of intended hierarchical structure (Construct). We present an\neffective end-to-end solution based on this framework to demonstrate its\nperformance. To assess our approach, we develop a comprehensive benchmark\ncalled Comp-HRDoc, which evaluates the above subtasks simultaneously. Our\nend-to-end system achieves state-of-the-art performance on two large-scale\ndocument layout analysis datasets (PubLayNet and DocLayNet), a high-quality\nhierarchical document structure reconstruction dataset (HRDoc), and our\nComp-HRDoc benchmark. The Comp-HRDoc benchmark will be released to facilitate\nfurther research in this field.\n","authors":["Jiawei Wang","Kai Hu","Zhuoyao Zhong","Lei Sun","Qiang Huo"],"pdf_url":"https://arxiv.org/pdf/2401.11874v2.pdf","comment":"Submitted to Pattern Recognition"},{"id":"http://arxiv.org/abs/2403.19225v1","updated":"2024-03-28T08:39:44Z","published":"2024-03-28T08:39:44Z","title":"Efficient and Effective Weakly-Supervised Action Segmentation via\n Action-Transition-Aware Boundary Alignment","summary":" Weakly-supervised action segmentation is a task of learning to partition a\nlong video into several action segments, where training videos are only\naccompanied by transcripts (ordered list of actions). 
Most of existing methods\nneed to infer pseudo segmentation for training by serial alignment between all\nframes and the transcript, which is time-consuming and hard to be parallelized\nwhile training. In this work, we aim to escape from this inefficient alignment\nwith massive but redundant frames, and instead to directly localize a few\naction transitions for pseudo segmentation generation, where a transition\nrefers to the change from an action segment to its next adjacent one in the\ntranscript. As the true transitions are submerged in noisy boundaries due to\nintra-segment visual variation, we propose a novel Action-Transition-Aware\nBoundary Alignment (ATBA) framework to efficiently and effectively filter out\nnoisy boundaries and detect transitions. In addition, to boost the semantic\nlearning in the case that noise is inevitably present in the pseudo\nsegmentation, we also introduce video-level losses to utilize the trusted\nvideo-level supervision. Extensive experiments show the effectiveness of our\napproach on both performance and training speed.\n","authors":["Angchi Xu","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.19225v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19221v1","updated":"2024-03-28T08:35:46Z","published":"2024-03-28T08:35:46Z","title":"Towards Multimodal Video Paragraph Captioning Models Robust to Missing\n Modality","summary":" Video paragraph captioning (VPC) involves generating detailed narratives for\nlong videos, utilizing supportive modalities such as speech and event\nboundaries. However, the existing models are constrained by the assumption of\nconstant availability of a single auxiliary modality, which is impractical\ngiven the diversity and unpredictable nature of real-world scenarios. To this\nend, we propose a Missing-Resistant framework MR-VPC that effectively harnesses\nall available auxiliary inputs and maintains resilience even in the absence of\ncertain modalities. Under this framework, we propose the Multimodal VPC (MVPC)\narchitecture integrating video, speech, and event boundary inputs in a unified\nmanner to process various auxiliary inputs. Moreover, to fortify the model\nagainst incomplete data, we introduce DropAM, a data augmentation strategy that\nrandomly omits auxiliary inputs, paired with DistillAM, a regularization target\nthat distills knowledge from teacher models trained on modality-complete data,\nenabling efficient learning in modality-deficient environments. Through\nexhaustive experimentation on YouCook2 and ActivityNet Captions, MR-VPC has\nproven to deliver superior performance on modality-complete and\nmodality-missing test data. This work highlights the significance of developing\nresilient VPC models and paves the way for more adaptive, robust multimodal\nvideo understanding.\n","authors":["Sishuo Chen","Lei Li","Shuhuai Ren","Rundong Gao","Yuanxin Liu","Xiaohan Bi","Xu Sun","Lu Hou"],"pdf_url":"https://arxiv.org/pdf/2403.19221v1.pdf","comment":"Code available at https://github.com/lancopku/MR-VPC"},{"id":"http://arxiv.org/abs/2403.19220v1","updated":"2024-03-28T08:34:04Z","published":"2024-03-28T08:34:04Z","title":"GeoAuxNet: Towards Universal 3D Representation Learning for Multi-sensor\n Point Clouds","summary":" Point clouds captured by different sensors such as RGB-D cameras and LiDAR\npossess non-negligible domain gaps. Most existing methods design different\nnetwork architectures and train separately on point clouds from various\nsensors. 
Typically, point-based methods achieve outstanding performances on\neven-distributed dense point clouds from RGB-D cameras, while voxel-based\nmethods are more efficient for large-range sparse LiDAR point clouds. In this\npaper, we propose geometry-to-voxel auxiliary learning to enable voxel\nrepresentations to access point-level geometric information, which supports\nbetter generalisation of the voxel-based backbone with additional\ninterpretations of multi-sensor point clouds. Specifically, we construct\nhierarchical geometry pools generated by a voxel-guided dynamic point network,\nwhich efficiently provide auxiliary fine-grained geometric information adapted\nto different stages of voxel features. We conduct experiments on joint\nmulti-sensor datasets to demonstrate the effectiveness of GeoAuxNet. Enjoying\nelaborate geometric information, our method outperforms other models\ncollectively trained on multi-sensor datasets, and achieve competitive results\nwith the-state-of-art experts on each single dataset.\n","authors":["Shengjun Zhang","Xin Fei","Yueqi Duan"],"pdf_url":"https://arxiv.org/pdf/2403.19220v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2311.13120v3","updated":"2024-03-28T08:30:56Z","published":"2023-11-22T02:46:57Z","title":"Multi-modal In-Context Learning Makes an Ego-evolving Scene Text\n Recognizer","summary":" Scene text recognition (STR) in the wild frequently encounters challenges\nwhen coping with domain variations, font diversity, shape deformations, etc. A\nstraightforward solution is performing model fine-tuning tailored to a specific\nscenario, but it is computationally intensive and requires multiple model\ncopies for various scenarios. Recent studies indicate that large language\nmodels (LLMs) can learn from a few demonstration examples in a training-free\nmanner, termed \"In-Context Learning\" (ICL). Nevertheless, applying LLMs as a\ntext recognizer is unacceptably resource-consuming. Moreover, our pilot\nexperiments on LLMs show that ICL fails in STR, mainly attributed to the\ninsufficient incorporation of contextual information from diverse samples in\nthe training stage. To this end, we introduce E$^2$STR, a STR model trained\nwith context-rich scene text sequences, where the sequences are generated via\nour proposed in-context training strategy. E$^2$STR demonstrates that a\nregular-sized model is sufficient to achieve effective ICL capabilities in STR.\nExtensive experiments show that E$^2$STR exhibits remarkable training-free\nadaptation in various scenarios and outperforms even the fine-tuned\nstate-of-the-art approaches on public benchmarks. The code is released at\nhttps://github.com/bytedance/E2STR .\n","authors":["Zhen Zhao","Jingqun Tang","Chunhui Lin","Binghong Wu","Can Huang","Hao Liu","Xin Tan","Zhizhong Zhang","Yuan Xie"],"pdf_url":"https://arxiv.org/pdf/2311.13120v3.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2402.05608v3","updated":"2024-03-28T08:28:44Z","published":"2024-02-08T12:08:42Z","title":"Scalable Diffusion Models with State Space Backbone","summary":" This paper presents a new exploration into a category of diffusion models\nbuilt upon state space architecture. We endeavor to train diffusion models for\nimage data, wherein the traditional U-Net backbone is supplanted by a state\nspace backbone, functioning on raw patches or latent space. 
Given its notable\nefficacy in accommodating long-range dependencies, Diffusion State Space Models\n(DiS) are distinguished by treating all inputs including time, condition, and\nnoisy image patches as tokens. Our assessment of DiS encompasses both\nunconditional and class-conditional image generation scenarios, revealing that\nDiS exhibits comparable, if not superior, performance to CNN-based or\nTransformer-based U-Net architectures of commensurate size. Furthermore, we\nanalyze the scalability of DiS, gauged by the forward pass complexity\nquantified in Gflops. DiS models with higher Gflops, achieved through\naugmentation of depth/width or augmentation of input tokens, consistently\ndemonstrate lower FID. In addition to demonstrating commendable scalability\ncharacteristics, DiS-H/2 models in latent space achieve performance levels akin\nto prior diffusion models on class-conditional ImageNet benchmarks at the\nresolution of 256$\\times$256 and 512$\\times$512, while significantly reducing\nthe computational burden. The code and models are available at:\nhttps://github.com/feizc/DiS.\n","authors":["Zhengcong Fei","Mingyuan Fan","Changqian Yu","Junshi Huang"],"pdf_url":"https://arxiv.org/pdf/2402.05608v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00096v2","updated":"2024-03-28T08:25:27Z","published":"2023-11-30T13:32:43Z","title":"OST: Refining Text Knowledge with Optimal Spatio-Temporal Descriptor for\n General Video Recognition","summary":" Due to the resource-intensive nature of training vision-language models on\nexpansive video data, a majority of studies have centered on adapting\npre-trained image-language models to the video domain. Dominant pipelines\npropose to tackle the visual discrepancies with additional temporal learners\nwhile overlooking the substantial discrepancy for web-scaled descriptive\nnarratives and concise action category names, leading to less distinct semantic\nspace and potential performance limitations. In this work, we prioritize the\nrefinement of text knowledge to facilitate generalizable video recognition. To\naddress the limitations of the less distinct semantic space of category names,\nwe prompt a large language model (LLM) to augment action class names into\nSpatio-Temporal Descriptors thus bridging the textual discrepancy and serving\nas a knowledge base for general recognition. Moreover, to assign the best\ndescriptors with different video instances, we propose Optimal Descriptor\nSolver, forming the video recognition problem as solving the optimal matching\nflow across frame-level representations and descriptors. Comprehensive\nevaluations in zero-shot, few-shot, and fully supervised video recognition\nhighlight the effectiveness of our approach. Our best model achieves a\nstate-of-the-art zero-shot accuracy of 75.1% on Kinetics-600.\n","authors":["Tongjia Chen","Hongshan Yu","Zhengeng Yang","Zechuan Li","Wei Sun","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2312.00096v2.pdf","comment":"Technical report. Project Page: https://tomchen-ctj.github.io/OST/"},{"id":"http://arxiv.org/abs/2308.12532v6","updated":"2024-03-28T08:23:02Z","published":"2023-08-24T03:43:02Z","title":"FedSOL: Stabilized Orthogonal Learning with Proximal Restrictions in\n Federated Learning","summary":" Federated Learning (FL) aggregates locally trained models from individual\nclients to construct a global model. 
While FL enables learning a model with\ndata privacy, it often suffers from significant performance degradation when\nclients have heterogeneous data distributions. This data heterogeneity causes\nthe model to forget the global knowledge acquired from previously sampled\nclients after being trained on local datasets. Although the introduction of\nproximal objectives in local updates helps to preserve global knowledge, it can\nalso hinder local learning by interfering with local objectives. To address\nthis problem, we propose a novel method, Federated Stabilized Orthogonal\nLearning (FedSOL), which adopts an orthogonal learning strategy to balance the\ntwo conflicting objectives. FedSOL is designed to identify gradients of local\nobjectives that are inherently orthogonal to directions affecting the proximal\nobjective. Specifically, FedSOL targets parameter regions where learning on the\nlocal objective is minimally influenced by proximal weight perturbations. Our\nexperiments demonstrate that FedSOL consistently achieves state-of-the-art\nperformance across various scenarios.\n","authors":["Gihun Lee","Minchan Jeong","Sangmook Kim","Jaehoon Oh","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2308.12532v6.pdf","comment":"The IEEE/CVF Conference on Computer Vision and Pattern Recognition\n 2024 (CVPR 2024)"},{"id":"http://arxiv.org/abs/2403.19213v1","updated":"2024-03-28T08:21:56Z","published":"2024-03-28T08:21:56Z","title":"Learning Multiple Representations with Inconsistency-Guided Detail\n Regularization for Mask-Guided Matting","summary":" Mask-guided matting networks have achieved significant improvements and have\nshown great potential in practical applications in recent years. However,\nsimply learning matting representation from synthetic and\nlack-of-real-world-diversity matting data, these approaches tend to overfit\nlow-level details in wrong regions, lack generalization to objects with complex\nstructures and real-world scenes such as shadows, as well as suffer from\ninterference of background lines or textures. To address these challenges, in\nthis paper, we propose a novel auxiliary learning framework for mask-guided\nmatting models, incorporating three auxiliary tasks: semantic segmentation,\nedge detection, and background line detection besides matting, to learn\ndifferent and effective representations from different types of data and\nannotations. Our framework and model introduce the following key aspects: (1)\nto learn real-world adaptive semantic representation for objects with diverse\nand complex structures under real-world scenes, we introduce extra semantic\nsegmentation and edge detection tasks on more diverse real-world data with\nsegmentation annotations; (2) to avoid overfitting on low-level details, we\npropose a module to utilize the inconsistency between learned segmentation and\nmatting representations to regularize detail refinement; (3) we propose a novel\nbackground line detection task into our auxiliary learning framework, to\nsuppress interference of background lines or textures. In addition, we propose\na high-quality matting benchmark, Plant-Mat, to evaluate matting methods on\ncomplex structures. 
Extensively quantitative and qualitative results show that\nour approach outperforms state-of-the-art mask-guided methods.\n","authors":["Weihao Jiang","Zhaozhi Xie","Yuxiang Lu","Longjie Qi","Jingyong Cai","Hiroyuki Uchiyama","Bin Chen","Yue Ding","Hongtao Lu"],"pdf_url":"https://arxiv.org/pdf/2403.19213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03441v4","updated":"2024-03-28T08:09:07Z","published":"2023-12-06T11:50:14Z","title":"UFineBench: Towards Text-based Person Retrieval with Ultra-fine\n Granularity","summary":" Existing text-based person retrieval datasets often have relatively\ncoarse-grained text annotations. This hinders the model to comprehend the\nfine-grained semantics of query texts in real scenarios. To address this\nproblem, we contribute a new benchmark named \\textbf{UFineBench} for text-based\nperson retrieval with ultra-fine granularity.\n Firstly, we construct a new \\textbf{dataset} named UFine6926. We collect a\nlarge number of person images and manually annotate each image with two\ndetailed textual descriptions, averaging 80.8 words each. The average word\ncount is three to four times that of the previous datasets. In addition of\nstandard in-domain evaluation, we also propose a special \\textbf{evaluation\nparadigm} more representative of real scenarios. It contains a new evaluation\nset with cross domains, cross textual granularity and cross textual styles,\nnamed UFine3C, and a new evaluation metric for accurately measuring retrieval\nability, named mean Similarity Distribution (mSD). Moreover, we propose CFAM, a\nmore efficient \\textbf{algorithm} especially designed for text-based person\nretrieval with ultra fine-grained texts. It achieves fine granularity mining by\nadopting a shared cross-modal granularity decoder and hard negative match\nmechanism.\n With standard in-domain evaluation, CFAM establishes competitive performance\nacross various datasets, especially on our ultra fine-grained UFine6926.\nFurthermore, by evaluating on UFine3C, we demonstrate that training on our\nUFine6926 significantly improves generalization to real scenarios compared with\nother coarse-grained datasets. The dataset and code will be made publicly\navailable at \\url{https://github.com/Zplusdragon/UFineBench}.\n","authors":["Jialong Zuo","Hanyu Zhou","Ying Nie","Feng Zhang","Tianyu Guo","Nong Sang","Yunhe Wang","Changxin Gao"],"pdf_url":"https://arxiv.org/pdf/2312.03441v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19205v1","updated":"2024-03-28T08:06:48Z","published":"2024-03-28T08:06:48Z","title":"From Activation to Initialization: Scaling Insights for Optimizing\n Neural Fields","summary":" In the realm of computer vision, Neural Fields have gained prominence as a\ncontemporary tool harnessing neural networks for signal representation. Despite\nthe remarkable progress in adapting these networks to solve a variety of\nproblems, the field still lacks a comprehensive theoretical framework. This\narticle aims to address this gap by delving into the intricate interplay\nbetween initialization and activation, providing a foundational basis for the\nrobust optimization of Neural Fields. 
Our theoretical insights reveal a\ndeep-seated connection among network initialization, architectural choices, and\nthe optimization process, emphasizing the need for a holistic approach when\ndesigning cutting-edge Neural Fields.\n","authors":["Hemanth Saratchandran","Sameera Ramasinghe","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2403.19205v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11956v3","updated":"2024-03-28T08:04:51Z","published":"2024-03-18T16:52:49Z","title":"Subjective-Aligned Dataset and Metric for Text-to-Video Quality\n Assessment","summary":" With the rapid development of generative models, Artificial\nIntelligence-Generated Contents (AIGC) have exponentially increased in daily\nlives. Among them, Text-to-Video (T2V) generation has received widespread\nattention. Though many T2V models have been released for generating high\nperceptual quality videos, there is still lack of a method to evaluate the\nquality of these videos quantitatively. To solve this issue, we establish the\nlargest-scale Text-to-Video Quality Assessment DataBase (T2VQA-DB) to date. The\ndataset is composed of 10,000 videos generated by 9 different T2V models. We\nalso conduct a subjective study to obtain each video's corresponding mean\nopinion score. Based on T2VQA-DB, we propose a novel transformer-based model\nfor subjective-aligned Text-to-Video Quality Assessment (T2VQA). The model\nextracts features from text-video alignment and video fidelity perspectives,\nthen it leverages the ability of a large language model to give the prediction\nscore. Experimental results show that T2VQA outperforms existing T2V metrics\nand SOTA video quality assessment models. Quantitative analysis indicates that\nT2VQA is capable of giving subjective-align predictions, validating its\neffectiveness. The dataset and code will be released at\nhttps://github.com/QMME/T2VQA.\n","authors":["Tengchuan Kou","Xiaohong Liu","Zicheng Zhang","Chunyi Li","Haoning Wu","Xiongkuo Min","Guangtao Zhai","Ning Liu"],"pdf_url":"https://arxiv.org/pdf/2403.11956v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18807v2","updated":"2024-03-28T08:01:34Z","published":"2024-03-27T17:53:30Z","title":"ECoDepth: Effective Conditioning of Diffusion Models for Monocular Depth\n Estimation","summary":" In the absence of parallax cues, a learning-based single image depth\nestimation (SIDE) model relies heavily on shading and contextual cues in the\nimage. While this simplicity is attractive, it is necessary to train such\nmodels on large and varied datasets, which are difficult to capture. It has\nbeen shown that using embeddings from pre-trained foundational models, such as\nCLIP, improves zero shot transfer in several applications. Taking inspiration\nfrom this, in our paper we explore the use of global image priors generated\nfrom a pre-trained ViT model to provide more detailed contextual information.\nWe argue that the embedding vector from a ViT model, pre-trained on a large\ndataset, captures greater relevant information for SIDE than the usual route of\ngenerating pseudo image captions, followed by CLIP based text embeddings. Based\non this idea, we propose a new SIDE model using a diffusion backbone which is\nconditioned on ViT embeddings. Our proposed design establishes a new\nstate-of-the-art (SOTA) for SIDE on NYUv2 dataset, achieving Abs Rel error of\n0.059(14% improvement) compared to 0.069 by the current SOTA (VPD). 
And on\nKITTI dataset, achieving Sq Rel error of 0.139 (2% improvement) compared to\n0.142 by the current SOTA (GEDepth). For zero-shot transfer with a model\ntrained on NYUv2, we report mean relative improvement of (20%, 23%, 81%, 25%)\nover NeWCRFs on (Sun-RGBD, iBims1, DIODE, HyperSim) datasets, compared to (16%,\n18%, 45%, 9%) by ZoeDepth. The code is available at\nhttps://ecodepth-iitd.github.io\n","authors":["Suraj Patni","Aradhye Agarwal","Chetan Arora"],"pdf_url":"https://arxiv.org/pdf/2403.18807v2.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)\n 2024"},{"id":"http://arxiv.org/abs/2403.19203v1","updated":"2024-03-28T08:00:14Z","published":"2024-03-28T08:00:14Z","title":"Single-Shared Network with Prior-Inspired Loss for Parameter-Efficient\n Multi-Modal Imaging Skin Lesion Classification","summary":" In this study, we introduce a multi-modal approach that efficiently\nintegrates multi-scale clinical and dermoscopy features within a single\nnetwork, thereby substantially reducing model parameters. The proposed method\nincludes three novel fusion schemes.\n Firstly, unlike current methods that usually employ two individual models for\nfor clinical and dermoscopy modalities, we verified that multimodal feature can\nbe learned by sharing the parameters of encoder while leaving the individual\nmodal-specific classifiers.\n Secondly, the shared cross-attention module can replace the individual one to\nefficiently interact between two modalities at multiple layers.\n Thirdly, different from current methods that equally optimize dermoscopy and\nclinical branches, inspired by prior knowledge that dermoscopy images play a\nmore significant role than clinical images, we propose a novel biased loss.\nThis loss guides the single-shared network to prioritize dermoscopy information\nover clinical information, implicitly learning a better joint feature\nrepresentation for the modal-specific task.\n Extensive experiments on a well-recognized Seven-Point Checklist (SPC)\ndataset and a collected dataset demonstrate the effectiveness of our method on\nboth CNN and Transformer structures. Furthermore, our method exhibits\nsuperiority in both accuracy and model parameters compared to currently\nadvanced methods.\n","authors":["Peng Tang","Tobias Lasser"],"pdf_url":"https://arxiv.org/pdf/2403.19203v1.pdf","comment":"This paper have submitted to Journal for review"},{"id":"http://arxiv.org/abs/2403.14760v2","updated":"2024-03-28T07:46:49Z","published":"2024-03-21T18:02:20Z","title":"Can 3D Vision-Language Models Truly Understand Natural Language?","summary":" Rapid advancements in 3D vision-language (3D-VL) tasks have opened up new\navenues for human interaction with embodied agents or robots using natural\nlanguage. Despite this progress, we find a notable limitation: existing 3D-VL\nmodels exhibit sensitivity to the styles of language input, struggling to\nunderstand sentences with the same semantic meaning but written in different\nvariants. This observation raises a critical question: Can 3D vision-language\nmodels truly understand natural language? 
To test the language\nunderstandability of 3D-VL models, we first propose a language robustness task\nfor systematically assessing 3D-VL models across various tasks, benchmarking\ntheir performance when presented with different language style variants.\nImportantly, these variants are commonly encountered in applications requiring\ndirect interaction with humans, such as embodied robotics, given the diversity\nand unpredictability of human language. We propose a 3D Language Robustness\nDataset, designed based on the characteristics of human language, to facilitate\nthe systematic study of robustness. Our comprehensive evaluation uncovers a\nsignificant drop in the performance of all existing models across various 3D-VL\ntasks. Even the state-of-the-art 3D-LLM fails to understand some variants of\nthe same sentences. Further in-depth analysis suggests that the existing models\nhave a fragile and biased fusion module, which stems from the low diversity of\nthe existing dataset. Finally, we propose a training-free module driven by LLM,\nwhich improves language robustness. Datasets and code will be available at\ngithub.\n","authors":["Weipeng Deng","Runyu Ding","Jihan Yang","Jiahui Liu","Yijiang Li","Xiaojuan Qi","Edith Ngai"],"pdf_url":"https://arxiv.org/pdf/2403.14760v2.pdf","comment":"https://github.com/VincentDENGP/3D-LR"},{"id":"http://arxiv.org/abs/2403.19193v1","updated":"2024-03-28T07:43:49Z","published":"2024-03-28T07:43:49Z","title":"Text Data-Centric Image Captioning with Interactive Prompts","summary":" Supervised image captioning approaches have made great progress, but it is\nchallenging to collect high-quality human-annotated image-text data. Recently,\nlarge-scale vision and language models (e.g., CLIP) and large-scale generative\nlanguage models (e.g., GPT-2) have shown strong performances in various tasks,\nwhich also provide some new solutions for image captioning with web paired\ndata, unpaired data or even text-only data. Among them, the mainstream solution\nis to project image embeddings into the text embedding space with the\nassistance of consistent representations between image-text pairs from the CLIP\nmodel. However, the current methods still face several challenges in adapting\nto the diversity of data configurations in a unified solution, accurately\nestimating image-text embedding bias, and correcting unsatisfactory prediction\nresults in the inference stage. This paper proposes a new Text data-centric\napproach with Interactive Prompts for image Captioning, named TIPCap. 1) We\nconsider four different settings which gradually reduce the dependence on\npaired data. 2) We construct a mapping module driven by multivariate Gaussian\ndistribution to mitigate the modality gap, which is applicable to the above\nfour different settings. 3) We propose a prompt interaction module that can\nincorporate optional prompt information before generating captions. 
Extensive\nexperiments show that our TIPCap outperforms other weakly or unsupervised image\ncaptioning methods and achieves a new state-of-the-art performance on two\nwidely used datasets, i.e., MS-COCO and Flickr30K.\n","authors":["Yiyu Wang","Hao Luo","Jungang Xu","Yingfei Sun","Fan Wang"],"pdf_url":"https://arxiv.org/pdf/2403.19193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15955v2","updated":"2024-03-28T07:30:25Z","published":"2024-03-23T23:22:54Z","title":"Finding needles in a haystack: A Black-Box Approach to Invisible\n Watermark Detection","summary":" In this paper, we propose WaterMark Detection (WMD), the first invisible\nwatermark detection method under a black-box and annotation-free setting. WMD\nis capable of detecting arbitrary watermarks within a given reference dataset\nusing a clean non-watermarked dataset as a reference, without relying on\nspecific decoding methods or prior knowledge of the watermarking techniques. We\ndevelop WMD using foundations of offset learning, where a clean non-watermarked\ndataset enables us to isolate the influence of only watermarked samples in the\nreference dataset. Our comprehensive evaluations demonstrate the effectiveness\nof WMD, significantly outperforming naive detection methods, which only yield\nAUC scores around 0.5. In contrast, WMD consistently achieves impressive\ndetection AUC scores, surpassing 0.9 in most single-watermark datasets and\nexceeding 0.7 in more challenging multi-watermark scenarios across diverse\ndatasets and watermarking methods. As invisible watermarks become increasingly\nprevalent, while specific decoding techniques remain undisclosed, our approach\nprovides a versatile solution and establishes a path toward increasing\naccountability, transparency, and trust in our digital visual content.\n","authors":["Minzhou Pan","Zhengting Wang","Xin Dong","Vikash Sehwag","Lingjuan Lyu","Xue Lin"],"pdf_url":"https://arxiv.org/pdf/2403.15955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09911v3","updated":"2024-03-28T07:16:11Z","published":"2023-08-19T05:34:13Z","title":"Noisy-Correspondence Learning for Text-to-Image Person Re-identification","summary":" Text-to-image person re-identification (TIReID) is a compelling topic in the\ncross-modal community, which aims to retrieve the target person based on a\ntextual query. Although numerous TIReID methods have been proposed and achieved\npromising performance, they implicitly assume the training image-text pairs are\ncorrectly aligned, which is not always the case in real-world scenarios. In\npractice, the image-text pairs inevitably exist under-correlated or even\nfalse-correlated, a.k.a noisy correspondence (NC), due to the low quality of\nthe images and annotation errors. To address this problem, we propose a novel\nRobust Dual Embedding method (RDE) that can learn robust visual-semantic\nassociations even with NC. Specifically, RDE consists of two main components:\n1) A Confident Consensus Division (CCD) module that leverages the dual-grained\ndecisions of dual embedding modules to obtain a consensus set of clean training\ndata, which enables the model to learn correct and reliable visual-semantic\nassociations. 2) A Triplet Alignment Loss (TAL) relaxes the conventional\nTriplet Ranking loss with the hardest negative samples to a log-exponential\nupper bound over all negative ones, thus preventing the model collapse under NC\nand can also focus on hard-negative samples for promising performance. 
We\nconduct extensive experiments on three public benchmarks, namely CUHK-PEDES,\nICFG-PEDES, and RSTPReID, to evaluate the performance and robustness of our\nRDE. Our method achieves state-of-the-art results both with and without\nsynthetic noisy correspondences on all three datasets. Code is available at\nhttps://github.com/QinYang79/RDE.\n","authors":["Yang Qin","Yingke Chen","Dezhong Peng","Xi Peng","Joey Tianyi Zhou","Peng Hu"],"pdf_url":"https://arxiv.org/pdf/2308.09911v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13102v2","updated":"2024-03-28T07:13:53Z","published":"2023-12-20T15:20:25Z","title":"SpecNeRF: Gaussian Directional Encoding for Specular Reflections","summary":" Neural radiance fields have achieved remarkable performance in modeling the\nappearance of 3D scenes. However, existing approaches still struggle with the\nview-dependent appearance of glossy surfaces, especially under complex lighting\nof indoor environments. Unlike existing methods, which typically assume distant\nlighting like an environment map, we propose a learnable Gaussian directional\nencoding to better model the view-dependent effects under near-field lighting\nconditions. Importantly, our new directional encoding captures the\nspatially-varying nature of near-field lighting and emulates the behavior of\nprefiltered environment maps. As a result, it enables the efficient evaluation\nof preconvolved specular color at any 3D location with varying roughness\ncoefficients. We further introduce a data-driven geometry prior that helps\nalleviate the shape radiance ambiguity in reflection modeling. We show that our\nGaussian directional encoding and geometry prior significantly improve the\nmodeling of challenging specular reflections in neural radiance fields, which\nhelps decompose appearance into more physically meaningful components.\n","authors":["Li Ma","Vasu Agrawal","Haithem Turki","Changil Kim","Chen Gao","Pedro Sander","Michael Zollhöfer","Christian Richardt"],"pdf_url":"https://arxiv.org/pdf/2312.13102v2.pdf","comment":"Accepted to CVPR2024, Project page:\n https://limacv.github.io/SpecNeRF_web/"},{"id":"http://arxiv.org/abs/2403.19177v1","updated":"2024-03-28T07:01:11Z","published":"2024-03-28T07:01:11Z","title":"Rethinking Information Loss in Medical Image Segmentation with\n Various-sized Targets","summary":" Medical image segmentation presents the challenge of segmenting various-size\ntargets, demanding the model to effectively capture both local and global\ninformation. Despite recent efforts using CNNs and ViTs to predict annotations\nof different scales, these approaches often struggle to effectively balance the\ndetection of targets across varying sizes. Simply utilizing local information\nfrom CNNs and global relationships from ViTs without considering potential\nsignificant divergence in latent feature distributions may result in\nsubstantial information loss. To address this issue, in this paper, we will\nintroduce a novel Stagger Network (SNet) and argues that a well-designed fusion\nstructure can mitigate the divergence in latent feature distributions between\nCNNs and ViTs, thereby reducing information loss. Specifically, to emphasize\nboth global dependencies and local focus, we design a Parallel Module to bridge\nthe semantic gap. Meanwhile, we propose the Stagger Module, trying to fuse the\nselected features that are more semantically similar. An Information Recovery\nModule is further adopted to recover complementary information back to the\nnetwork. 
As a key contribution, we theoretically analyze that the proposed\nparallel and stagger strategies would lead to less information loss, thus\ncertifying the SNet's rationale. Experimental results clearly proved that the\nproposed SNet excels comparisons with recent SOTAs in segmenting on the Synapse\ndataset where targets are in various sizes. Besides, it also demonstrates\nsuperiority on the ACDC and the MoNuSeg datasets where targets are with more\nconsistent dimensions.\n","authors":["Tianyi Liu","Zhaorui Tan","Kaizhu Huang","Haochuan Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.19177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16169v3","updated":"2024-03-28T06:56:45Z","published":"2024-03-24T14:24:13Z","title":"Gaze-guided Hand-Object Interaction Synthesis: Benchmark and Method","summary":" Gaze plays a crucial role in revealing human attention and intention,\nshedding light on the cognitive processes behind human actions. The integration\nof gaze guidance with the dynamics of hand-object interactions boosts the\naccuracy of human motion prediction. However, the lack of datasets that capture\nthe intricate relationship and consistency among gaze, hand, and object\nmovements remains a substantial hurdle. In this paper, we introduce the first\nGaze-guided Hand-Object Interaction dataset, GazeHOI, and present a novel task\nfor synthesizing gaze-guided hand-object interactions. Our dataset, GazeHOI,\nfeatures simultaneous 3D modeling of gaze, hand, and object interactions,\ncomprising 479 sequences with an average duration of 19.1 seconds, 812\nsub-sequences, and 33 objects of various sizes. We propose a hierarchical\nframework centered on a gaze-guided hand-object interaction diffusion model,\nnamed GHO-Diffusion. In the pre-diffusion phase, we separate gaze conditions\ninto spatial-temporal features and goal pose conditions at different levels of\ninformation granularity. During the diffusion phase, two gaze-conditioned\ndiffusion models are stacked to simplify the complex synthesis of hand-object\nmotions. Here, the object motion diffusion model generates sequences of object\nmotions based on gaze conditions, while the hand motion diffusion model\nproduces hand motions based on the generated object motion. To improve\nfine-grained goal pose alignment, we introduce a Spherical Gaussian constraint\nto guide the denoising step. In the subsequent post-diffusion phase, we\noptimize the generated hand motions using contact consistency. Our extensive\nexperiments highlight the uniqueness of our dataset and the effectiveness of\nour approach.\n","authors":["Jie Tian","Lingxiao Yang","Ran Ji","Yuexin Ma","Lan Xu","Jingyi Yu","Ye Shi","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16169v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19174v1","updated":"2024-03-28T06:46:45Z","published":"2024-03-28T06:46:45Z","title":"Algorithmic Ways of Seeing: Using Object Detection to Facilitate Art\n Exploration","summary":" This Research through Design paper explores how object detection may be\napplied to a large digital art museum collection to facilitate new ways of\nencountering and experiencing art. We present the design and evaluation of an\ninteractive application called SMKExplore, which allows users to explore a\nmuseum's digital collection of paintings by browsing through objects detected\nin the images, as a novel form of open-ended exploration. We provide three\ncontributions. 
First, we show how an object detection pipeline can be\nintegrated into a design process for visual exploration. Second, we present the\ndesign and development of an app that enables exploration of an art museum's\ncollection. Third, we offer reflections on future possibilities for museums and\nHCI researchers to incorporate object detection techniques into the\ndigitalization of museums.\n","authors":["Louie Søs Meyer","Johanne Engel Aaen","Anitamalina Regitse Tranberg","Peter Kun","Matthias Freiberger","Sebastian Risi","Anders Sundnes Løvlie"],"pdf_url":"https://arxiv.org/pdf/2403.19174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00365v2","updated":"2024-03-28T06:38:55Z","published":"2023-12-31T01:39:38Z","title":"HQ-VAE: Hierarchical Discrete Representation Learning with Variational\n Bayes","summary":" Vector quantization (VQ) is a technique to deterministically learn features\nwith discrete codebook representations. It is commonly performed with a\nvariational autoencoding model, VQ-VAE, which can be further extended to\nhierarchical structures for making high-fidelity reconstructions. However, such\nhierarchical extensions of VQ-VAE often suffer from the codebook/layer collapse\nissue, where the codebook is not efficiently used to express the data, and\nhence degrades reconstruction accuracy. To mitigate this problem, we propose a\nnovel unified framework to stochastically learn hierarchical discrete\nrepresentation on the basis of the variational Bayes framework, called\nhierarchically quantized variational autoencoder (HQ-VAE). HQ-VAE naturally\ngeneralizes the hierarchical variants of VQ-VAE, such as VQ-VAE-2 and\nresidual-quantized VAE (RQ-VAE), and provides them with a Bayesian training\nscheme. Our comprehensive experiments on image datasets show that HQ-VAE\nenhances codebook usage and improves reconstruction performance. We also\nvalidated HQ-VAE in terms of its applicability to a different modality with an\naudio dataset.\n","authors":["Yuhta Takida","Yukara Ikemiya","Takashi Shibuya","Kazuki Shimada","Woosung Choi","Chieh-Hsin Lai","Naoki Murata","Toshimitsu Uesaka","Kengo Uchida","Wei-Hsiang Liao","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2401.00365v2.pdf","comment":"34 pages with 17 figures, accepted for TMLR"},{"id":"http://arxiv.org/abs/2403.19164v1","updated":"2024-03-28T06:22:45Z","published":"2024-03-28T06:22:45Z","title":"RecDiffusion: Rectangling for Image Stitching with Diffusion Models","summary":" Image stitching from different captures often results in non-rectangular\nboundaries, which is often considered unappealing. To solve non-rectangular\nboundaries, current solutions involve cropping, which discards image content,\ninpainting, which can introduce unrelated content, or warping, which can\ndistort non-linear features and introduce artifacts. To overcome these issues,\nwe introduce a novel diffusion-based learning framework, \\textbf{RecDiffusion},\nfor image stitching rectangling. This framework combines Motion Diffusion\nModels (MDM) to generate motion fields, effectively transitioning from the\nstitched image's irregular borders to a geometrically corrected intermediary.\nFollowed by Content Diffusion Models (CDM) for image detail refinement.\nNotably, our sampling process utilizes a weighted map to identify regions\nneeding correction during each iteration of CDM. 
Our RecDiffusion ensures\ngeometric accuracy and overall visual appeal, surpassing all previous methods\nin both quantitative and qualitative measures when evaluated on public\nbenchmarks. Code is released at https://github.com/lhaippp/RecDiffusion.\n","authors":["Tianhao Zhou","Haipeng Li","Ziyi Wang","Ao Luo","Chen-Lin Zhang","Jiajun Li","Bing Zeng","Shuaicheng Liu"],"pdf_url":"https://arxiv.org/pdf/2403.19164v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10522v5","updated":"2024-03-28T06:20:10Z","published":"2023-11-17T13:43:43Z","title":"Enhancing Object Coherence in Layout-to-Image Synthesis","summary":" Layout-to-image synthesis is an emerging technique in conditional image\ngeneration. It aims to generate complex scenes, where users require fine\ncontrol over the layout of the objects in a scene. However, it remains\nchallenging to control the object coherence, including semantic coherence\n(e.g., the cat looks at the flowers or not) and physical coherence (e.g., the\nhand and the racket should not be misaligned). In this paper, we propose a\nnovel diffusion model with effective global semantic fusion (GSF) and\nself-similarity feature enhancement modules to guide the object coherence for\nthis task. For semantic coherence, we argue that the image caption contains\nrich information for defining the semantic relationship within the objects in\nthe images. Instead of simply employing cross-attention between captions and\ngenerated images, which addresses the highly relevant layout restriction and\nsemantic coherence separately and thus leads to unsatisfying results shown in\nour experiments, we develop GSF to fuse the supervision from the layout\nrestriction and semantic coherence requirement and exploit it to guide the\nimage synthesis process. Moreover, to improve the physical coherence, we\ndevelop a Self-similarity Coherence Attention (SCA) module to explicitly\nintegrate local contextual physical coherence into each pixel's generation\nprocess. Specifically, we adopt a self-similarity map to encode the coherence\nrestrictions and employ it to extract coherent features from text embedding.\nThrough visualization of our self-similarity map, we explore the essence of\nSCA, revealing that its effectiveness is not only in capturing reliable\nphysical coherence patterns but also in enhancing complex texture generation.\nExtensive experiments demonstrate the superiority of our proposed method in\nboth image generation quality and controllability.\n","authors":["Yibin Wang","Weizhong Zhang","Jianwei Zheng","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2311.10522v5.pdf","comment":"GitHub: https://github.com/CodeGoat24/EOCNet"},{"id":"http://arxiv.org/abs/2403.19163v1","updated":"2024-03-28T06:18:12Z","published":"2024-03-28T06:18:12Z","title":"D'OH: Decoder-Only random Hypernetworks for Implicit Neural\n Representations","summary":" Deep implicit functions have been found to be an effective tool for\nefficiently encoding all manner of natural signals. Their attractiveness stems\nfrom their ability to compactly represent signals with little to no off-line\ntraining data. Instead, they leverage the implicit bias of deep networks to\ndecouple hidden redundancies within the signal. In this paper, we explore the\nhypothesis that additional compression can be achieved by leveraging the\nredundancies that exist between layers. We propose to use a novel run-time\ndecoder-only hypernetwork - that uses no offline training data - to better\nmodel this cross-layer parameter redundancy. 
Previous applications of\nhyper-networks with deep implicit functions have applied feed-forward\nencoder/decoder frameworks that rely on large offline datasets that do not\ngeneralize beyond the signals they were trained on. We instead present a\nstrategy for the initialization of run-time deep implicit functions for\nsingle-instance signals through a Decoder-Only randomly projected Hypernetwork\n(D'OH). By directly changing the dimension of a latent code to approximate a\ntarget implicit neural architecture, we provide a natural way to vary the\nmemory footprint of neural representations without the costly need for neural\narchitecture search on a space of alternative low-rate structures.\n","authors":["Cameron Gordon","Lachlan Ewen MacDonald","Hemanth Saratchandran","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2403.19163v1.pdf","comment":"29 pages, 17 figures"},{"id":"http://arxiv.org/abs/2403.19160v1","updated":"2024-03-28T06:05:14Z","published":"2024-03-28T06:05:14Z","title":"Within the Dynamic Context: Inertia-aware 3D Human Modeling with Pose\n Sequence","summary":" Neural rendering techniques have significantly advanced 3D human body\nmodeling. However, previous approaches often overlook dynamics induced by\nfactors such as motion inertia, leading to challenges in scenarios like abrupt\nstops after rotation, where the pose remains static while the appearance\nchanges. This limitation arises from reliance on a single pose as conditional\ninput, resulting in ambiguity in mapping one pose to multiple appearances. In\nthis study, we elucidate that variations in human appearance depend not only on\nthe current frame's pose condition but also on past pose states. Therefore, we\nintroduce Dyco, a novel method utilizing the delta pose sequence representation\nfor non-rigid deformations and canonical space to effectively model temporal\nappearance variations. To prevent a decrease in the model's generalization\nability to novel poses, we further propose low-dimensional global context to\nreduce unnecessary inter-body part dependencies and a quantization operation to\nmitigate overfitting of the delta pose sequence by the model. To validate the\neffectiveness of our approach, we collected a novel dataset named I3D-Human,\nwith a focus on capturing temporal changes in clothing appearance under\napproximate poses. Through extensive experiments on both I3D-Human and existing\ndatasets, our approach demonstrates superior qualitative and quantitative\nperformance. In addition, our inertia-aware 3D human method can unprecedentedly\nsimulate appearance changes caused by inertia at different velocities.\n","authors":["Yutong Chen","Yifan Zhan","Zhihang Zhong","Wei Wang","Xiao Sun","Yu Qiao","Yinqiang Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.19160v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16473v3","updated":"2024-03-28T05:47:24Z","published":"2023-11-26T02:35:09Z","title":"GS-IR: 3D Gaussian Splatting for Inverse Rendering","summary":" We propose GS-IR, a novel inverse rendering approach based on 3D Gaussian\nSplatting (GS) that leverages forward mapping volume rendering to achieve\nphotorealistic novel view synthesis and relighting results. 
Unlike previous\nworks that use implicit neural representations and volume rendering (e.g.\nNeRF), which suffer from low expressive power and high computational\ncomplexity, we extend GS, a top-performance representation for novel view\nsynthesis, to estimate scene geometry, surface material, and environment\nillumination from multi-view images captured under unknown lighting conditions.\nThere are two main problems when introducing GS to inverse rendering: 1) GS\ndoes not support producing plausible normal natively; 2) forward mapping (e.g.\nrasterization and splatting) cannot trace the occlusion like backward mapping\n(e.g. ray tracing). To address these challenges, our GS-IR proposes an\nefficient optimization scheme that incorporates a depth-derivation-based\nregularization for normal estimation and a baking-based occlusion to model\nindirect lighting. The flexible and expressive GS representation allows us to\nachieve fast and compact geometry reconstruction, photorealistic novel view\nsynthesis, and effective physically-based rendering. We demonstrate the\nsuperiority of our method over baseline methods through qualitative and\nquantitative evaluations on various challenging scenes.\n","authors":["Zhihao Liang","Qi Zhang","Ying Feng","Ying Shan","Kui Jia"],"pdf_url":"https://arxiv.org/pdf/2311.16473v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19158v1","updated":"2024-03-28T05:44:48Z","published":"2024-03-28T05:44:48Z","title":"Uncertainty-Aware Deep Video Compression with Ensembles","summary":" Deep learning-based video compression is a challenging task, and many\nprevious state-of-the-art learning-based video codecs use optical flows to\nexploit the temporal correlation between successive frames and then compress\nthe residual error. Although these two-stage models are end-to-end optimized,\nthe epistemic uncertainty in the motion estimation and the aleatoric\nuncertainty from the quantization operation lead to errors in the intermediate\nrepresentations and introduce artifacts in the reconstructed frames. This\ninherent flaw limits the potential for higher bit rate savings. To address this\nissue, we propose an uncertainty-aware video compression model that can\neffectively capture the predictive uncertainty with deep ensembles.\nAdditionally, we introduce an ensemble-aware loss to encourage the diversity\namong ensemble members and investigate the benefits of incorporating\nadversarial training in the video compression task. Experimental results on\n1080p sequences show that our model can effectively save bits by more than 20%\ncompared to DVC Pro.\n","authors":["Wufei Ma","Jiahao Li","Bin Li","Yan Lu"],"pdf_url":"https://arxiv.org/pdf/2403.19158v1.pdf","comment":"Published on IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2306.02240v2","updated":"2024-03-28T05:35:46Z","published":"2023-06-04T02:55:25Z","title":"ProTeCt: Prompt Tuning for Taxonomic Open Set Classification","summary":" Visual-language foundation models, like CLIP, learn generalized\nrepresentations that enable zero-shot open-set classification. Few-shot\nadaptation methods, based on prompt tuning, have been shown to further improve\nperformance on downstream datasets. 
However, these methods do not fare well in\nthe taxonomic open set (TOS) setting, where the classifier is asked to make\npredictions from label sets across different levels of semantic granularity.\nFrequently, they infer incorrect labels at coarser taxonomic class levels, even\nwhen the inference at the leaf level (original class labels) is correct. To\naddress this problem, we propose a prompt tuning technique that calibrates the\nhierarchical consistency of model predictions. A set of metrics of hierarchical\nconsistency, the Hierarchical Consistent Accuracy (HCA) and the Mean Treecut\nAccuracy (MTA), are first proposed to evaluate TOS model performance. A new\nPrompt Tuning for Hierarchical Consistency (ProTeCt) technique is then proposed\nto calibrate classification across label set granularities. Results show that\nProTeCt can be combined with existing prompt tuning methods to significantly\nimprove TOS classification without degrading the leaf level classification\nperformance.\n","authors":["Tz-Ying Wu","Chih-Hui Ho","Nuno Vasconcelos"],"pdf_url":"https://arxiv.org/pdf/2306.02240v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.14302v2","updated":"2024-03-28T05:13:43Z","published":"2024-03-21T11:16:42Z","title":"SpikingResformer: Bridging ResNet and Vision Transformer in Spiking\n Neural Networks","summary":" The remarkable success of Vision Transformers in Artificial Neural Networks\n(ANNs) has led to a growing interest in incorporating the self-attention\nmechanism and transformer-based architecture into Spiking Neural Networks\n(SNNs). While existing methods propose spiking self-attention mechanisms that\nare compatible with SNNs, they lack reasonable scaling methods, and the overall\narchitectures proposed by these methods suffer from a bottleneck in effectively\nextracting local features. To address these challenges, we propose a novel\nspiking self-attention mechanism named Dual Spike Self-Attention (DSSA) with a\nreasonable scaling method. Based on DSSA, we propose a novel spiking Vision\nTransformer architecture called SpikingResformer, which combines the\nResNet-based multi-stage architecture with our proposed DSSA to improve both\nperformance and energy efficiency while reducing parameters. Experimental\nresults show that SpikingResformer achieves higher accuracy with fewer\nparameters and lower energy consumption than other spiking Vision Transformer\ncounterparts. Notably, our SpikingResformer-L achieves 79.40% top-1 accuracy on\nImageNet with 4 time-steps, which is the state-of-the-art result in the SNN\nfield.\n","authors":["Xinyu Shi","Zecheng Hao","Zhaofei Yu"],"pdf_url":"https://arxiv.org/pdf/2403.14302v2.pdf","comment":"To be published in the 2024 IEEE/CVF Conference on Computer Vision\n and Pattern Recognition (CVPR)"},{"id":"http://arxiv.org/abs/2403.19150v1","updated":"2024-03-28T05:08:25Z","published":"2024-03-28T05:08:25Z","title":"Towards Understanding Dual BN In Hybrid Adversarial Training","summary":" There is a growing concern about applying batch normalization (BN) in\nadversarial training (AT), especially when the model is trained on both\nadversarial samples and clean samples (termed Hybrid-AT). With the assumption\nthat adversarial and clean samples are from two different domains, a common\npractice in prior works is to adopt Dual BN, where BN and BN are used for\nadversarial and clean branches, respectively. 
A popular belief for motivating\nDual BN is that estimating normalization statistics of this mixture\ndistribution is challenging and thus disentangling it for normalization\nachieves stronger robustness. In contrast to this belief, we reveal that\ndisentangling statistics plays a less role than disentangling affine parameters\nin model training. This finding aligns with prior work (Rebuffi et al., 2023),\nand we build upon their research for further investigations. We demonstrate\nthat the domain gap between adversarial and clean samples is not very large,\nwhich is counter-intuitive considering the significant influence of adversarial\nperturbation on the model accuracy. We further propose a two-task hypothesis\nwhich serves as the empirical foundation and a unified framework for Hybrid-AT\nimprovement. We also investigate Dual BN in test-time and reveal that affine\nparameters characterize the robustness during inference. Overall, our work\nsheds new light on understanding the mechanism of Dual BN in Hybrid-AT and its\nunderlying justification.\n","authors":["Chenshuang Zhang","Chaoning Zhang","Kang Zhang","Axi Niu","Junmo Kim","In So Kweon"],"pdf_url":"https://arxiv.org/pdf/2403.19150v1.pdf","comment":"Accepted at TMLR"},{"id":"http://arxiv.org/abs/2403.19144v1","updated":"2024-03-28T04:35:42Z","published":"2024-03-28T04:35:42Z","title":"MoDiTalker: Motion-Disentangled Diffusion Model for High-Fidelity\n Talking Head Generation","summary":" Conventional GAN-based models for talking head generation often suffer from\nlimited quality and unstable training. Recent approaches based on diffusion\nmodels aimed to address these limitations and improve fidelity. However, they\nstill face challenges, including extensive sampling times and difficulties in\nmaintaining temporal consistency due to the high stochasticity of diffusion\nmodels. To overcome these challenges, we propose a novel motion-disentangled\ndiffusion model for high-quality talking head generation, dubbed MoDiTalker. We\nintroduce the two modules: audio-to-motion (AToM), designed to generate a\nsynchronized lip motion from audio, and motion-to-video (MToV), designed to\nproduce high-quality head video following the generated motion. AToM excels in\ncapturing subtle lip movements by leveraging an audio attention mechanism. In\naddition, MToV enhances temporal consistency by leveraging an efficient\ntri-plane representation. Our experiments conducted on standard benchmarks\ndemonstrate that our model achieves superior performance compared to existing\nmodels. We also provide comprehensive ablation studies and user study results.\n","authors":["Seyeon Kim","Siyoon Jin","Jihye Park","Kihong Kim","Jiyoung Kim","Jisu Nam","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2403.19144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19140v1","updated":"2024-03-28T04:24:56Z","published":"2024-03-28T04:24:56Z","title":"QNCD: Quantization Noise Correction for Diffusion Models","summary":" Diffusion models have revolutionized image synthesis, setting new benchmarks\nin quality and creativity. However, their widespread adoption is hindered by\nthe intensive computation required during the iterative denoising process.\nPost-training quantization (PTQ) presents a solution to accelerate sampling,\naibeit at the expense of sample quality, extremely in low-bit settings.\nAddressing this, our study introduces a unified Quantization Noise Correction\nScheme (QNCD), aimed at minishing quantization noise throughout the sampling\nprocess. 
We identify two primary quantization challenges: intra and inter\nquantization noise. Intra quantization noise, mainly exacerbated by embeddings\nin the resblock module, extends activation quantization ranges, increasing\ndisturbances in each single denoising step. Besides, inter quantization noise\nstems from cumulative quantization deviations across the entire denoising\nprocess, altering data distributions step-by-step. QNCD combats these through\nembedding-derived feature smoothing for eliminating intra quantization noise\nand an effective runtime noise estimation module for dynamically filtering\ninter quantization noise. Extensive experiments demonstrate that our method\noutperforms previous quantization methods for diffusion models, achieving\nlossless results in W4A8 and W8A8 quantization settings on ImageNet (LDM-4).\nCode is available at: https://github.com/huanpengchu/QNCD\n","authors":["Huanpeng Chu","Wei Wu","Chengjie Zang","Kun Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.19140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19137v1","updated":"2024-03-28T04:15:58Z","published":"2024-03-28T04:15:58Z","title":"CLAP4CLIP: Continual Learning with Probabilistic Finetuning for\n Vision-Language Models","summary":" Continual learning (CL) aims to help deep neural networks to learn new\nknowledge while retaining what has been learned. Recently, pre-trained\nvision-language models such as CLIP, with powerful generalization ability, have\nbeen gaining traction as practical CL candidates. However, the domain mismatch\nbetween the pre-training and the downstream CL tasks calls for finetuning of\nthe CLIP on the latter. The deterministic nature of the existing finetuning\nmethods makes them overlook the many possible interactions across the\nmodalities and deems them unsafe for high-risk CL tasks requiring reliable\nuncertainty estimation. To address these, our work proposes Continual LeArning\nwith Probabilistic finetuning (CLAP). CLAP develops probabilistic modeling over\ntask-specific modules with visual-guided text features, providing more reliable\nfine-tuning in CL. It further alleviates forgetting by exploiting the rich\npre-trained knowledge of CLIP for weight initialization and distribution\nregularization of task-specific modules. Cooperating with the diverse range of\nexisting prompting methods, CLAP can surpass the predominant deterministic\nfinetuning approaches for CL with CLIP. Lastly, we study the superior\nuncertainty estimation abilities of CLAP for novel data detection and exemplar\nselection within CL setups. Our code is available at\n\\url{https://github.com/srvCodes/clap4clip}.\n","authors":["Saurav Jha","Dong Gong","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2403.19137v1.pdf","comment":"Work under review"},{"id":"http://arxiv.org/abs/2402.19161v2","updated":"2024-03-28T04:07:57Z","published":"2024-02-29T13:45:13Z","title":"MemoNav: Working Memory Model for Visual Navigation","summary":" Image-goal navigation is a challenging task that requires an agent to\nnavigate to a goal indicated by an image in unfamiliar environments. Existing\nmethods utilizing diverse scene memories suffer from inefficient exploration\nsince they use all historical observations for decision-making without\nconsidering the goal-relevant fraction. 
To address this limitation, we present\nMemoNav, a novel memory model for image-goal navigation, which utilizes a\nworking memory-inspired pipeline to improve navigation performance.\nSpecifically, we employ three types of navigation memory. The node features on\na map are stored in the short-term memory (STM), as these features are\ndynamically updated. A forgetting module then retains the informative STM\nfraction to increase efficiency. We also introduce long-term memory (LTM) to\nlearn global scene representations by progressively aggregating STM features.\nSubsequently, a graph attention module encodes the retained STM and the LTM to\ngenerate working memory (WM) which contains the scene features essential for\nefficient navigation. The synergy among these three memory types boosts\nnavigation performance by enabling the agent to learn and leverage\ngoal-relevant scene features within a topological map. Our evaluation on\nmulti-goal tasks demonstrates that MemoNav significantly outperforms previous\nmethods across all difficulty levels in both Gibson and Matterport3D scenes.\nQualitative results further illustrate that MemoNav plans more efficient\nroutes.\n","authors":["Hongxin Li","Zeyu Wang","Xu Yang","Yuran Yang","Shuqi Mei","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.19161v2.pdf","comment":"Accepted to CVPR 2024. Code: https://github.com/ZJULiHongxin/MemoNav"},{"id":"http://arxiv.org/abs/2403.18605v2","updated":"2024-03-28T03:56:07Z","published":"2024-03-27T14:24:30Z","title":"FlexEdit: Flexible and Controllable Diffusion-based Object-centric Image\n Editing","summary":" Our work addresses limitations seen in previous approaches for object-centric\nediting problems, such as unrealistic results due to shape discrepancies and\nlimited control in object replacement or insertion. To this end, we introduce\nFlexEdit, a flexible and controllable editing framework for objects where we\niteratively adjust latents at each denoising step using our FlexEdit block.\nInitially, we optimize latents at test time to align with specified object\nconstraints. Then, our framework employs an adaptive mask, automatically\nextracted during denoising, to protect the background while seamlessly blending\nnew content into the target image. We demonstrate the versatility of FlexEdit\nin various object editing tasks and curate an evaluation test suite with\nsamples from both real and synthetic images, along with novel evaluation\nmetrics designed for object-centric editing. We conduct extensive experiments\non different editing scenarios, demonstrating the superiority of our editing\nframework over recent advanced text-guided image editing methods. Our project\npage is published at https://flex-edit.github.io/.\n","authors":["Trong-Tung Nguyen","Duc-Anh Nguyen","Anh Tran","Cuong Pham"],"pdf_url":"https://arxiv.org/pdf/2403.18605v2.pdf","comment":"Our project page: https://flex-edit.github.io/"},{"id":"http://arxiv.org/abs/2403.19128v1","updated":"2024-03-28T03:51:14Z","published":"2024-03-28T03:51:14Z","title":"OmniParser: A Unified Framework for Text Spotting, Key Information\n Extraction and Table Recognition","summary":" Recently, visually-situated text parsing (VsTP) has experienced notable\nadvancements, driven by the increasing demand for automated document\nunderstanding and the emergence of Generative Large Language Models (LLMs)\ncapable of processing document-based questions. Various methods have been\nproposed to address the challenging problem of VsTP. 
However, due to the\ndiversified targets and heterogeneous schemas, previous works usually design\ntask-specific architectures and objectives for individual tasks, which\ninadvertently leads to modal isolation and complex workflow. In this paper, we\npropose a unified paradigm for parsing visually-situated text across diverse\nscenarios. Specifically, we devise a universal model, called OmniParser, which\ncan simultaneously handle three typical visually-situated text parsing tasks:\ntext spotting, key information extraction, and table recognition. In\nOmniParser, all tasks share the unified encoder-decoder architecture, the\nunified objective: point-conditioned text generation, and the unified input &\noutput representation: prompt & structured sequences. Extensive experiments\ndemonstrate that the proposed OmniParser achieves state-of-the-art (SOTA) or\nhighly competitive performances on 7 datasets for the three visually-situated\ntext parsing tasks, despite its unified, concise design. The code is available\nat https://github.com/AlibabaResearch/AdvancedLiterateMachinery.\n","authors":["Jianqiang Wan","Sibo Song","Wenwen Yu","Yuliang Liu","Wenqing Cheng","Fei Huang","Xiang Bai","Cong Yao","Zhibo Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19128v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2303.05699v4","updated":"2024-03-28T03:48:40Z","published":"2023-03-10T04:49:01Z","title":"Feature Unlearning for Pre-trained GANs and VAEs","summary":" We tackle the problem of feature unlearning from a pre-trained image\ngenerative model: GANs and VAEs. Unlike a common unlearning task where an\nunlearning target is a subset of the training set, we aim to unlearn a specific\nfeature, such as hairstyle from facial images, from the pre-trained generative\nmodels. As the target feature is only presented in a local region of an image,\nunlearning the entire image from the pre-trained model may result in losing\nother details in the remaining region of the image. To specify which features\nto unlearn, we collect randomly generated images that contain the target\nfeatures. We then identify a latent representation corresponding to the target\nfeature and then use the representation to fine-tune the pre-trained model.\nThrough experiments on MNIST, CelebA, and FFHQ datasets, we show that target\nfeatures are successfully removed while keeping the fidelity of the original\nmodels. Further experiments with an adversarial attack show that the unlearned\nmodel is more robust under the presence of malicious parties.\n","authors":["Saemi Moon","Seunghyuk Cho","Dongwoo Kim"],"pdf_url":"https://arxiv.org/pdf/2303.05699v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19124v1","updated":"2024-03-28T03:35:00Z","published":"2024-03-28T03:35:00Z","title":"PoCo: A Self-Supervised Approach via Polar Transformation Based\n Progressive Contrastive Learning for Ophthalmic Disease Diagnosis","summary":" Automatic ophthalmic disease diagnosis on fundus images is important in\nclinical practice. However, due to complex fundus textures and limited\nannotated data, developing an effective automatic method for this problem is\nstill challenging. In this paper, we present a self-supervised method via polar\ntransformation based progressive contrastive learning, called PoCo, for\nophthalmic disease diagnosis. 
Specifically, we novelly inject the polar\ntransformation into contrastive learning to 1) promote contrastive learning\npre-training to be faster and more stable and 2) naturally capture task-free\nand rotation-related textures, which provides insights into disease recognition\non fundus images. Beneficially, simple normal translation-invariant convolution\non transformed images can equivalently replace the complex rotation-invariant\nand sector convolution on raw images. After that, we develop a progressive\ncontrastive learning method to efficiently utilize large unannotated images and\na novel progressive hard negative sampling scheme to gradually reduce the\nnegative sample number for efficient training and performance enhancement.\nExtensive experiments on three public ophthalmic disease datasets show that our\nPoCo achieves state-of-the-art performance with good generalization ability,\nvalidating that our method can reduce annotation efforts and provide reliable\ndiagnosis. Codes are available at \\url{https://github.com/wjh892521292/PoCo}.\n","authors":["Jinhong Wang","Tingting Chen","Jintai Chen","Yixuan Wu","Yuyang Xu","Danny Chen","Haochao Ying","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2403.19124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03596v2","updated":"2024-03-28T03:26:51Z","published":"2023-12-06T16:35:59Z","title":"MMM: Generative Masked Motion Model","summary":" Recent advances in text-to-motion generation using diffusion and\nautoregressive models have shown promising results. However, these models often\nsuffer from a trade-off between real-time performance, high fidelity, and\nmotion editability. To address this gap, we introduce MMM, a novel yet simple\nmotion generation paradigm based on Masked Motion Model. MMM consists of two\nkey components: (1) a motion tokenizer that transforms 3D human motion into a\nsequence of discrete tokens in latent space, and (2) a conditional masked\nmotion transformer that learns to predict randomly masked motion tokens,\nconditioned on the pre-computed text tokens. By attending to motion and text\ntokens in all directions, MMM explicitly captures inherent dependency among\nmotion tokens and semantic mapping between motion and text tokens. During\ninference, this allows parallel and iterative decoding of multiple motion\ntokens that are highly consistent with fine-grained text descriptions,\ntherefore simultaneously achieving high-fidelity and high-speed motion\ngeneration. In addition, MMM has innate motion editability. By simply placing\nmask tokens in the place that needs editing, MMM automatically fills the gaps\nwhile guaranteeing smooth transitions between editing and non-editing parts.\nExtensive experiments on the HumanML3D and KIT-ML datasets demonstrate that MMM\nsurpasses current leading methods in generating high-quality motion (evidenced\nby superior FID scores of 0.08 and 0.429), while offering advanced editing\nfeatures such as body-part modification, motion in-betweening, and the\nsynthesis of long motion sequences. In addition, MMM is two orders of magnitude\nfaster on a single mid-range GPU than editable motion diffusion models. 
Our\nproject page is available at \\url{https://exitudio.github.io/MMM-page}.\n","authors":["Ekkasit Pinyoanuntapong","Pu Wang","Minwoo Lee","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2312.03596v2.pdf","comment":"accepted to CVPR"},{"id":"http://arxiv.org/abs/2403.16002v2","updated":"2024-03-28T03:22:52Z","published":"2024-03-24T04:15:50Z","title":"SDSTrack: Self-Distillation Symmetric Adapter Learning for Multi-Modal\n Visual Object Tracking","summary":" Multimodal Visual Object Tracking (VOT) has recently gained significant\nattention due to its robustness. Early research focused on fully fine-tuning\nRGB-based trackers, which was inefficient and lacked generalized representation\ndue to the scarcity of multimodal data. Therefore, recent studies have utilized\nprompt tuning to transfer pre-trained RGB-based trackers to multimodal data.\nHowever, the modality gap limits pre-trained knowledge recall, and the\ndominance of the RGB modality persists, preventing the full utilization of\ninformation from other modalities. To address these issues, we propose a novel\nsymmetric multimodal tracking framework called SDSTrack. We introduce\nlightweight adaptation for efficient fine-tuning, which directly transfers the\nfeature extraction ability from RGB to other domains with a small number of\ntrainable parameters and integrates multimodal features in a balanced,\nsymmetric manner. Furthermore, we design a complementary masked patch\ndistillation strategy to enhance the robustness of trackers in complex\nenvironments, such as extreme weather, poor imaging, and sensor failure.\nExtensive experiments demonstrate that SDSTrack outperforms state-of-the-art\nmethods in various multimodal tracking scenarios, including RGB+Depth,\nRGB+Thermal, and RGB+Event tracking, and exhibits impressive results in extreme\nconditions. Our source code is available at https://github.com/hoqolo/SDSTrack.\n","authors":["Xiaojun Hou","Jiazheng Xing","Yijie Qian","Yaowei Guo","Shuo Xin","Junhao Chen","Kai Tang","Mengmeng Wang","Zhengkai Jiang","Liang Liu","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2403.16002v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2304.05684v3","updated":"2024-03-28T03:15:57Z","published":"2023-04-12T08:12:29Z","title":"InterGen: Diffusion-based Multi-human Motion Generation under Complex\n Interactions","summary":" We have recently seen tremendous progress in diffusion advances for\ngenerating realistic human motions. Yet, they largely disregard the multi-human\ninteractions. In this paper, we present InterGen, an effective diffusion-based\napproach that incorporates human-to-human interactions into the motion\ndiffusion process, which enables layman users to customize high-quality\ntwo-person interaction motions, with only text guidance. We first contribute a\nmultimodal dataset, named InterHuman. It consists of about 107M frames for\ndiverse two-person interactions, with accurate skeletal motions and 23,337\nnatural language descriptions. For the algorithm side, we carefully tailor the\nmotion diffusion model to our two-person interaction setting. To handle the\nsymmetry of human identities during interactions, we propose two cooperative\ntransformer-based denoisers that explicitly share weights, with a mutual\nattention mechanism to further connect the two denoising processes. Then, we\npropose a novel representation for motion input in our interaction diffusion\nmodel, which explicitly formulates the global relations between the two\nperformers in the world frame. 
We further introduce two novel regularization\nterms to encode spatial relations, equipped with a corresponding damping scheme\nduring the training of our interaction diffusion model. Extensive experiments\nvalidate the effectiveness and generalizability of InterGen. Notably, it can\ngenerate more diverse and compelling two-person motions than previous methods\nand enables various downstream applications for human interactions.\n","authors":["Han Liang","Wenqian Zhang","Wenxuan Li","Jingyi Yu","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2304.05684v3.pdf","comment":"accepted by IJCV 2024"},{"id":"http://arxiv.org/abs/2403.19111v1","updated":"2024-03-28T03:07:16Z","published":"2024-03-28T03:07:16Z","title":"Patch Spatio-Temporal Relation Prediction for Video Anomaly Detection","summary":" Video Anomaly Detection (VAD), aiming to identify abnormalities within a\nspecific context and timeframe, is crucial for intelligent Video Surveillance\nSystems. While recent deep learning-based VAD models have shown promising\nresults by generating high-resolution frames, they often lack competence in\npreserving detailed spatial and temporal coherence in video frames. To tackle\nthis issue, we propose a self-supervised learning approach for VAD through an\ninter-patch relationship prediction task. Specifically, we introduce a\ntwo-branch vision transformer network designed to capture deep visual features\nof video frames, addressing spatial and temporal dimensions responsible for\nmodeling appearance and motion patterns, respectively. The inter-patch\nrelationship in each dimension is decoupled into inter-patch similarity and the\norder information of each patch. To mitigate memory consumption, we convert the\norder information prediction task into a multi-label learning problem, and the\ninter-patch similarity prediction task into a distance matrix regression\nproblem. Comprehensive experiments demonstrate the effectiveness of our method,\nsurpassing pixel-generation-based methods by a significant margin across three\npublic benchmarks. Additionally, our approach outperforms other self-supervised\nlearning-based methods.\n","authors":["Hao Shen","Lu Shi","Wanru Xu","Yigang Cen","Linna Zhang","Gaoyun An"],"pdf_url":"https://arxiv.org/pdf/2403.19111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19107v1","updated":"2024-03-28T02:51:33Z","published":"2024-03-28T02:51:33Z","title":"Synthetic Medical Imaging Generation with Generative Adversarial\n Networks For Plain Radiographs","summary":" In medical imaging, access to data is commonly limited due to patient privacy\nrestrictions and the issue that it can be difficult to acquire enough data in\nthe case of rare diseases.[1] The purpose of this investigation was to develop\na reusable open-source synthetic image generation pipeline, the GAN Image\nSynthesis Tool (GIST), that is easy to use as well as easy to deploy. The\npipeline helps to improve and standardize AI algorithms in the digital health\nspace by generating high quality synthetic image data that is not linked to\nspecific patients. Its image generation capabilities include the ability to\ngenerate imaging of pathologies or injuries with low incidence rates. This\nimprovement of digital health AI algorithms could improve diagnostic accuracy,\naid in patient care, decrease medicolegal claims, and ultimately decrease the\noverall cost of healthcare. 
The pipeline builds on existing Generative\nAdversarial Networks (GANs) algorithms, and preprocessing and evaluation steps\nwere included for completeness. For this work, we focused on ensuring the\npipeline supports radiography, with a focus on synthetic knee and elbow x-ray\nimages. In designing the pipeline, we evaluated the performance of current GAN\narchitectures, studying the performance on available x-ray data. We show that\nthe pipeline is capable of generating high quality and clinically relevant\nimages based on a lay person's evaluation and the Fr\\'echet Inception Distance\n(FID) metric.\n","authors":["John R. McNulty","Lee Kho","Alexandria L. Case","Charlie Fornaca","Drew Johnston","David Slater","Joshua M. Abzug","Sybil A. Russell"],"pdf_url":"https://arxiv.org/pdf/2403.19107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19104v1","updated":"2024-03-28T02:39:45Z","published":"2024-03-28T02:39:45Z","title":"CRKD: Enhanced Camera-Radar Object Detection with Cross-modality\n Knowledge Distillation","summary":" In the field of 3D object detection for autonomous driving, LiDAR-Camera (LC)\nfusion is the top-performing sensor configuration. Still, LiDAR is relatively\nhigh cost, which hinders adoption of this technology for consumer automobiles.\nAlternatively, camera and radar are commonly deployed on vehicles already on\nthe road today, but performance of Camera-Radar (CR) fusion falls behind LC\nfusion. In this work, we propose Camera-Radar Knowledge Distillation (CRKD) to\nbridge the performance gap between LC and CR detectors with a novel\ncross-modality KD framework. We use the Bird's-Eye-View (BEV) representation as\nthe shared feature space to enable effective knowledge distillation. To\naccommodate the unique cross-modality KD path, we propose four distillation\nlosses to help the student learn crucial features from the teacher model. We\npresent extensive evaluations on the nuScenes dataset to demonstrate the\neffectiveness of the proposed CRKD framework. The project page for CRKD is\nhttps://song-jingyu.github.io/CRKD.\n","authors":["Lingjun Zhao","Jingyu Song","Katherine A. Skinner"],"pdf_url":"https://arxiv.org/pdf/2403.19104v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19103v1","updated":"2024-03-28T02:35:53Z","published":"2024-03-28T02:35:53Z","title":"Automated Black-box Prompt Engineering for Personalized Text-to-Image\n Generation","summary":" Prompt engineering is effective for controlling the output of text-to-image\n(T2I) generative models, but it is also laborious due to the need for manually\ncrafted prompts. This challenge has spurred the development of algorithms for\nautomated prompt generation. However, these methods often struggle with\ntransferability across T2I models, require white-box access to the underlying\nmodel, and produce non-intuitive prompts. In this work, we introduce PRISM, an\nalgorithm that automatically identifies human-interpretable and transferable\nprompts that can effectively generate desired concepts given only black-box\naccess to T2I models. Inspired by large language model (LLM) jailbreaking,\nPRISM leverages the in-context learning ability of LLMs to iteratively refine\nthe candidate prompts distribution for given reference images. 
Our experiments\ndemonstrate the versatility and effectiveness of PRISM in generating accurate\nprompts for objects, styles and images across multiple T2I models, including\nStable Diffusion, DALL-E, and Midjourney.\n","authors":["Yutong He","Alexander Robey","Naoki Murata","Yiding Jiang","Joshua Williams","George J. Pappas","Hamed Hassani","Yuki Mitsufuji","Ruslan Salakhutdinov","J. Zico Kolter"],"pdf_url":"https://arxiv.org/pdf/2403.19103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19101v1","updated":"2024-03-28T02:31:06Z","published":"2024-03-28T02:31:06Z","title":"AAPMT: AGI Assessment Through Prompt and Metric Transformer","summary":" The emergence of text-to-image models marks a significant milestone in the\nevolution of AI-generated images (AGIs), expanding their use in diverse domains\nlike design, entertainment, and more. Despite these breakthroughs, the quality\nof AGIs often remains suboptimal, highlighting the need for effective\nevaluation methods. These methods are crucial for assessing the quality of\nimages relative to their textual descriptions, and they must accurately mirror\nhuman perception. Substantial progress has been achieved in this domain, with\ninnovative techniques such as BLIP and DBCNN contributing significantly.\nHowever, recent studies, including AGIQA-3K, reveal a notable discrepancy\nbetween current methods and state-of-the-art (SOTA) standards. This gap\nemphasizes the necessity for a more sophisticated and precise evaluation\nmetric. In response, our objective is to develop a model that could give\nratings for metrics, which focuses on parameters like perceptual quality,\nauthenticity, and the correspondence between text and image, that more closely\naligns with human perception. In our paper, we introduce a range of effective\nmethods, including prompt designs and the Metric Transformer. The Metric\nTransformer is a novel structure inspired by the complex interrelationships\namong various AGI quality metrics. The code is available at\nhttps://github.com/huskydoge/CS3324-Digital-Image-Processing/tree/main/Assignment1\n","authors":["Benhao Huang"],"pdf_url":"https://arxiv.org/pdf/2403.19101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13302v3","updated":"2024-03-28T02:24:38Z","published":"2023-09-23T08:24:36Z","title":"Gaining the Sparse Rewards by Exploring Lottery Tickets in Spiking\n Neural Network","summary":" Deploying energy-efficient deep learning algorithms on computational-limited\ndevices, such as robots, is still a pressing issue for real-world applications.\nSpiking Neural Networks (SNNs), a novel brain-inspired algorithm, offer a\npromising solution due to their low-latency and low-energy properties over\ntraditional Artificial Neural Networks (ANNs). Despite their advantages, the\ndense structure of deep SNNs can still result in extra energy consumption. The\nLottery Ticket Hypothesis (LTH) posits that within dense neural networks, there\nexist winning Lottery Tickets (LTs), namely sub-networks, that can be obtained\nwithout compromising performance. Inspired by this, this paper delves into the\nspiking-based LTs (SLTs), examining their unique properties and potential for\nextreme efficiency. Then, two significant sparse \\textbf{\\textit{Rewards}} are\ngained through comprehensive explorations and meticulous experiments on SLTs\nacross various dense structures. 
Moreover, a sparse algorithm tailored for\nspiking transformer structure, which incorporates convolution operations into\nthe Patch Embedding Projection (ConvPEP) module, has been proposed to achieve\nMulti-level Sparsity (MultiSp). MultiSp refers to (1) Patch number sparsity;\n(2) ConvPEP weights sparsity and binarization; and (3) ConvPEP activation layer\nbinarization. Extensive experiments demonstrate that our method achieves\nextreme sparsity with only a slight performance decrease, paving the way for\ndeploying energy-efficient neural networks in robotics and beyond.\n","authors":["Hao Cheng","Jiahang Cao","Erjia Xiao","Mengshu Sun","Renjing Xu"],"pdf_url":"https://arxiv.org/pdf/2309.13302v3.pdf","comment":"This paper is under submission"},{"id":"http://arxiv.org/abs/2403.19098v1","updated":"2024-03-28T02:22:28Z","published":"2024-03-28T02:22:28Z","title":"GraphAD: Interaction Scene Graph for End-to-end Autonomous Driving","summary":" Modeling complicated interactions among the ego-vehicle, road agents, and map\nelements has been a crucial part for safety-critical autonomous driving.\nPrevious works on end-to-end autonomous driving rely on the attention mechanism\nfor handling heterogeneous interactions, which fails to capture the geometric\npriors and is also computationally intensive. In this paper, we propose the\nInteraction Scene Graph (ISG) as a unified method to model the interactions\namong the ego-vehicle, road agents, and map elements. With the representation\nof the ISG, the driving agents aggregate essential information from the most\ninfluential elements, including the road agents with potential collisions and\nthe map elements to follow. Since a mass of unnecessary interactions are\nomitted, the more efficient scene-graph-based framework is able to focus on\nindispensable connections and leads to better performance. We evaluate the\nproposed method for end-to-end autonomous driving on the nuScenes dataset.\nCompared with strong baselines, our method significantly outperforms in the\nfull-stack driving tasks, including perception, prediction, and planning. Code\nwill be released at https://github.com/zhangyp15/GraphAD.\n","authors":["Yunpeng Zhang","Deheng Qian","Ding Li","Yifeng Pan","Yong Chen","Zhenbao Liang","Zhiyao Zhang","Shurui Zhang","Hongxu Li","Maolei Fu","Yun Ye","Zhujin Liang","Yi Shan","Dalong Du"],"pdf_url":"https://arxiv.org/pdf/2403.19098v1.pdf","comment":"project page: https://github.com/zhangyp15/GraphAD"},{"id":"http://arxiv.org/abs/2403.19080v1","updated":"2024-03-28T01:05:06Z","published":"2024-03-28T01:05:06Z","title":"MMCert: Provable Defense against Adversarial Attacks to Multi-modal\n Models","summary":" Different from a unimodal model whose input is from a single modality, the\ninput (called multi-modal input) of a multi-modal model is from multiple\nmodalities such as image, 3D points, audio, text, etc. Similar to unimodal\nmodels, many existing studies show that a multi-modal model is also vulnerable\nto adversarial perturbation, where an attacker could add small perturbation to\nall modalities of a multi-modal input such that the multi-modal model makes\nincorrect predictions for it. Existing certified defenses are mostly designed\nfor unimodal models, which achieve sub-optimal certified robustness guarantees\nwhen extended to multi-modal models as shown in our experimental results. In\nour work, we propose MMCert, the first certified defense against adversarial\nattacks to a multi-modal model. 
We derive a lower bound on the performance of\nour MMCert under arbitrary adversarial attacks with bounded perturbations to\nboth modalities (e.g., in the context of auto-driving, we bound the number of\nchanged pixels in both RGB image and depth image). We evaluate our MMCert using\ntwo benchmark datasets: one for the multi-modal road segmentation task and the\nother for the multi-modal emotion recognition task. Moreover, we compare our\nMMCert with a state-of-the-art certified defense extended from unimodal models.\nOur experimental results show that our MMCert outperforms the baseline.\n","authors":["Yanting Wang","Hongye Fu","Wei Zou","Jinyuan Jia"],"pdf_url":"https://arxiv.org/pdf/2403.19080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19079v1","updated":"2024-03-28T01:00:08Z","published":"2024-03-28T01:00:08Z","title":"A Real-Time Framework for Domain-Adaptive Underwater Object Detection\n with Image Enhancement","summary":" In recent years, significant progress has been made in the field of\nunderwater image enhancement (UIE). However, its practical utility for\nhigh-level vision tasks, such as underwater object detection (UOD) in\nAutonomous Underwater Vehicles (AUVs), remains relatively unexplored. It may be\nattributed to several factors: (1) Existing methods typically employ UIE as a\npre-processing step, which inevitably introduces considerable computational\noverhead and latency. (2) The process of enhancing images prior to training\nobject detectors may not necessarily yield performance improvements. (3) The\ncomplex underwater environments can induce significant domain shifts across\ndifferent scenarios, seriously deteriorating the UOD performance. To address\nthese challenges, we introduce EnYOLO, an integrated real-time framework\ndesigned for simultaneous UIE and UOD with domain-adaptation capability.\nSpecifically, both the UIE and UOD task heads share the same network backbone\nand utilize a lightweight design. Furthermore, to ensure balanced training for\nboth tasks, we present a multi-stage training strategy aimed at consistently\nenhancing their performance. Additionally, we propose a novel domain-adaptation\nstrategy to align feature embeddings originating from diverse underwater\nenvironments. Comprehensive experiments demonstrate that our framework not only\nachieves state-of-the-art (SOTA) performance in both UIE and UOD tasks, but\nalso shows superior adaptability when applied to different underwater\nscenarios. Our efficiency analysis further highlights the substantial potential\nof our framework for onboard deployment.\n","authors":["Junjie Wen","Jinqiang Cui","Benyun Zhao","Bingxin Han","Xuchen Liu","Zhi Gao","Ben M. Chen"],"pdf_url":"https://arxiv.org/pdf/2403.19079v1.pdf","comment":"accepted by ICRA24"},{"id":"http://arxiv.org/abs/2403.08059v2","updated":"2024-03-28T00:59:37Z","published":"2024-03-12T20:11:38Z","title":"FluoroSAM: A Language-aligned Foundation Model for X-ray Image\n Segmentation","summary":" Automated X-ray image segmentation would accelerate research and development\nin diagnostic and interventional precision medicine. Prior efforts have\ncontributed task-specific models capable of solving specific image analysis\nproblems, but the utility of these models is restricted to their particular\ntask domain, and expanding to broader use requires additional data, labels, and\nretraining efforts. 
Recently, foundation models (FMs) -- machine learning\nmodels trained on large amounts of highly variable data thus enabling broad\napplicability -- have emerged as promising tools for automated image analysis.\nExisting FMs for medical image analysis focus on scenarios and modalities where\nobjects are clearly defined by visually apparent boundaries, such as surgical\ntool segmentation in endoscopy. X-ray imaging, by contrast, does not generally\noffer such clearly delineated boundaries or structure priors. During X-ray\nimage formation, complex 3D structures are projected in transmission onto the\nimaging plane, resulting in overlapping features of varying opacity and shape.\nTo pave the way toward an FM for comprehensive and automated analysis of\narbitrary medical X-ray images, we develop FluoroSAM, a language-aligned\nvariant of the Segment-Anything Model, trained from scratch on 1.6M synthetic\nX-ray images. FluoroSAM is trained on data including masks for 128 organ types\nand 464 non-anatomical objects, such as tools and implants. In real X-ray\nimages of cadaveric specimens, FluoroSAM is able to segment bony anatomical\nstructures based on text-only prompting with 0.51 and 0.79 DICE with\npoint-based refinement, outperforming competing SAM variants for all\nstructures. FluoroSAM is also capable of zero-shot generalization to segmenting\nclasses beyond the training set thanks to its language alignment, which we\ndemonstrate for full lung segmentation on real chest X-rays.\n","authors":["Benjamin D. Killeen","Liam J. Wang","Han Zhang","Mehran Armand","Russell H. Taylor","Dave Dreizin","Greg Osgood","Mathias Unberath"],"pdf_url":"https://arxiv.org/pdf/2403.08059v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19078v1","updated":"2024-03-28T00:50:02Z","published":"2024-03-28T00:50:02Z","title":"MVEB: Self-Supervised Learning with Multi-View Entropy Bottleneck","summary":" Self-supervised learning aims to learn representation that can be effectively\ngeneralized to downstream tasks. Many self-supervised approaches regard two\nviews of an image as both the input and the self-supervised signals, assuming\nthat either view contains the same task-relevant information and the shared\ninformation is (approximately) sufficient for predicting downstream tasks.\nRecent studies show that discarding superfluous information not shared between\nthe views can improve generalization. Hence, the ideal representation is\nsufficient for downstream tasks and contains minimal superfluous information,\ntermed minimal sufficient representation. One can learn this representation by\nmaximizing the mutual information between the representation and the supervised\nview while eliminating superfluous information. Nevertheless, the computation\nof mutual information is notoriously intractable. In this work, we propose an\nobjective termed multi-view entropy bottleneck (MVEB) to learn minimal\nsufficient representation effectively. MVEB simplifies the minimal sufficient\nlearning to maximizing both the agreement between the embeddings of two views\nand the differential entropy of the embedding distribution. Our experiments\nconfirm that MVEB significantly improves performance. For example, it achieves\ntop-1 accuracy of 76.9\\% on ImageNet with a vanilla ResNet-50 backbone on\nlinear evaluation. 
To the best of our knowledge, this is the new\nstate-of-the-art result with ResNet-50.\n","authors":["Liangjian Wen","Xiasi Wang","Jianzhuang Liu","Zenglin Xu"],"pdf_url":"https://arxiv.org/pdf/2403.19078v1.pdf","comment":"Accepted by TPAMI"},{"id":"http://arxiv.org/abs/2401.03707v2","updated":"2024-03-28T00:43:21Z","published":"2024-01-08T07:34:43Z","title":"FMA-Net: Flow-Guided Dynamic Filtering and Iterative Feature Refinement\n with Multi-Attention for Joint Video Super-Resolution and Deblurring","summary":" We present a joint learning scheme of video super-resolution and deblurring,\ncalled VSRDB, to restore clean high-resolution (HR) videos from blurry\nlow-resolution (LR) ones. This joint restoration problem has drawn much less\nattention compared to single restoration problems. In this paper, we propose a\nnovel flow-guided dynamic filtering (FGDF) and iterative feature refinement\nwith multi-attention (FRMA), which constitutes our VSRDB framework, denoted as\nFMA-Net. Specifically, our proposed FGDF enables precise estimation of both\nspatio-temporally-variant degradation and restoration kernels that are aware of\nmotion trajectories through sophisticated motion representation learning.\nCompared to conventional dynamic filtering, the FGDF enables the FMA-Net to\neffectively handle large motions into the VSRDB. Additionally, the stacked FRMA\nblocks trained with our novel temporal anchor (TA) loss, which temporally\nanchors and sharpens features, refine features in a coarse-to-fine manner\nthrough iterative updates. Extensive experiments demonstrate the superiority of\nthe proposed FMA-Net over state-of-the-art methods in terms of both\nquantitative and qualitative quality. Codes and pre-trained models are\navailable at: https://kaist-viclab.github.io/fmanet-site\n","authors":["Geunhyuk Youk","Jihyong Oh","Munchurl Kim"],"pdf_url":"https://arxiv.org/pdf/2401.03707v2.pdf","comment":"CVPR2024 (camera-ready version). The last two authors are\n co-corresponding authors. Please visit our project page at\n https://kaist-viclab.github.io/fmanet-site"},{"id":"http://arxiv.org/abs/2403.19076v1","updated":"2024-03-28T00:34:56Z","published":"2024-03-28T00:34:56Z","title":"Tiny Machine Learning: Progress and Futures","summary":" Tiny Machine Learning (TinyML) is a new frontier of machine learning. By\nsqueezing deep learning models into billions of IoT devices and\nmicrocontrollers (MCUs), we expand the scope of AI applications and enable\nubiquitous intelligence. However, TinyML is challenging due to hardware\nconstraints: the tiny memory resource makes it difficult to hold deep learning\nmodels designed for cloud and mobile platforms. There is also limited compiler\nand inference engine support for bare-metal devices. Therefore, we need to\nco-design the algorithm and system stack to enable TinyML. In this review, we\nwill first discuss the definition, challenges, and applications of TinyML. We\nthen survey the recent progress in TinyML and deep learning on MCUs. Next, we\nwill introduce MCUNet, showing how we can achieve ImageNet-scale AI\napplications on IoT devices with system-algorithm co-design. We will further\nextend the solution from inference to training and introduce tiny on-device\ntraining techniques. Finally, we present future directions in this area.\nToday's large model might be tomorrow's tiny model. 
The scope of TinyML should\nevolve and adapt over time.\n","authors":["Ji Lin","Ligeng Zhu","Wei-Ming Chen","Wei-Chen Wang","Song Han"],"pdf_url":"https://arxiv.org/pdf/2403.19076v1.pdf","comment":"IEEE Circuits and Systems Magazine (2023). arXiv admin note: text\n overlap with arXiv:2206.15472"},{"id":"http://arxiv.org/abs/2403.19067v1","updated":"2024-03-28T00:14:53Z","published":"2024-03-28T00:14:53Z","title":"Low-Rank Rescaled Vision Transformer Fine-Tuning: A Residual Design\n Approach","summary":" Parameter-efficient fine-tuning for pre-trained Vision Transformers aims to\nadeptly tailor a model to downstream tasks by learning a minimal set of new\nadaptation parameters while preserving the frozen majority of pre-trained\nparameters. Striking a balance between retaining the generalizable\nrepresentation capacity of the pre-trained model and acquiring task-specific\nfeatures poses a key challenge. Currently, there is a lack of focus on guiding\nthis delicate trade-off. In this study, we approach the problem from the\nperspective of Singular Value Decomposition (SVD) of pre-trained parameter\nmatrices, providing insights into the tuning dynamics of existing methods.\nBuilding upon this understanding, we propose a Residual-based Low-Rank\nRescaling (RLRR) fine-tuning strategy. This strategy not only enhances\nflexibility in parameter tuning but also ensures that new parameters do not\ndeviate excessively from the pre-trained model through a residual design.\nExtensive experiments demonstrate that our method achieves competitive\nperformance across various downstream image classification tasks, all while\nmaintaining comparable new parameters. We believe this work takes a step\nforward in offering a unified perspective for interpreting existing methods and\nserves as motivation for the development of new approaches that move closer to\neffectively considering the crucial trade-off mentioned above. Our code is\navailable at\n\\href{https://github.com/zstarN70/RLRR.git}{https://github.com/zstarN70/RLRR.git}.\n","authors":["Wei Dong","Xing Zhang","Bihui Chen","Dawei Yan","Zhijun Lin","Qingsen Yan","Peng Wang","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19066v1","updated":"2024-03-28T00:11:12Z","published":"2024-03-28T00:11:12Z","title":"Generative Quanta Color Imaging","summary":" The astonishing development of single-photon cameras has created an\nunprecedented opportunity for scientific and industrial imaging. However, the\nhigh data throughput generated by these 1-bit sensors creates a significant\nbottleneck for low-power applications. In this paper, we explore the\npossibility of generating a color image from a single binary frame of a\nsingle-photon camera. We evidently find this problem being particularly\ndifficult to standard colorization approaches due to the substantial degree of\nexposure variation. The core innovation of our paper is an exposure synthesis\nmodel framed under a neural ordinary differential equation (Neural ODE) that\nallows us to generate a continuum of exposures from a single observation. This\ninnovation ensures consistent exposure in binary images that colorizers take\non, resulting in notably enhanced colorization. We demonstrate applications of\nthe method in single-image and burst colorization and show superior generative\nperformance over baselines. 
Project website can be found at\nhttps://vishal-s-p.github.io/projects/2023/generative_quanta_color.html.\n","authors":["Vishal Purohit","Junjie Luo","Yiheng Chi","Qi Guo","Stanley H. Chan","Qiang Qiu"],"pdf_url":"https://arxiv.org/pdf/2403.19066v1.pdf","comment":"Accepted at IEEE Conference on Computer Vision and Pattern\n Recognition (CVPR), 2024"},{"id":"http://arxiv.org/abs/2403.19203v1","updated":"2024-03-28T08:00:14Z","published":"2024-03-28T08:00:14Z","title":"Single-Shared Network with Prior-Inspired Loss for Parameter-Efficient\n Multi-Modal Imaging Skin Lesion Classification","summary":" In this study, we introduce a multi-modal approach that efficiently\nintegrates multi-scale clinical and dermoscopy features within a single\nnetwork, thereby substantially reducing model parameters. The proposed method\nincludes three novel fusion schemes. Firstly, unlike current methods that\nusually employ two individual models for clinical and dermoscopy\nmodalities, we verified that multimodal features can be learned by sharing the\nparameters of the encoder while keeping individual modal-specific classifiers.\nSecondly, the shared cross-attention module can replace the individual one to\nefficiently interact between two modalities at multiple layers. Thirdly,\ndifferent from current methods that equally optimize dermoscopy and clinical\nbranches, inspired by prior knowledge that dermoscopy images play a more\nsignificant role than clinical images, we propose a novel biased loss. This\nloss guides the single-shared network to prioritize dermoscopy information over\nclinical information, implicitly learning a better joint feature representation\nfor the modal-specific task. Extensive experiments on a well-recognized\nSeven-Point Checklist (SPC) dataset and a collected dataset demonstrate the\neffectiveness of our method on both CNN and Transformer structures.\nFurthermore, our method exhibits superiority in both accuracy and model\nparameters compared to currently advanced methods.\n","authors":["Peng Tang","Tobias Lasser"],"pdf_url":"https://arxiv.org/pdf/2403.19203v1.pdf","comment":"This paper has been submitted to a journal for review"},{"id":"http://arxiv.org/abs/2403.19076v1","updated":"2024-03-28T00:34:56Z","published":"2024-03-28T00:34:56Z","title":"Tiny Machine Learning: Progress and Futures","summary":" Tiny Machine Learning (TinyML) is a new frontier of machine learning. By\nsqueezing deep learning models into billions of IoT devices and\nmicrocontrollers (MCUs), we expand the scope of AI applications and enable\nubiquitous intelligence. However, TinyML is challenging due to hardware\nconstraints: the tiny memory resource makes it difficult to hold deep learning\nmodels designed for cloud and mobile platforms. There is also limited compiler\nand inference engine support for bare-metal devices. Therefore, we need to\nco-design the algorithm and system stack to enable TinyML. In this review, we\nwill first discuss the definition, challenges, and applications of TinyML. We\nthen survey the recent progress in TinyML and deep learning on MCUs. Next, we\nwill introduce MCUNet, showing how we can achieve ImageNet-scale AI\napplications on IoT devices with system-algorithm co-design. We will further\nextend the solution from inference to training and introduce tiny on-device\ntraining techniques. Finally, we present future directions in this area.\nToday's large model might be tomorrow's tiny model. 
The scope of TinyML should\nevolve and adapt over time.\n","authors":["Ji Lin","Ligeng Zhu","Wei-Ming Chen","Wei-Chen Wang","Song Han"],"pdf_url":"https://arxiv.org/pdf/2403.19076v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2206.15472"},{"id":"http://arxiv.org/abs/2403.19885v1","updated":"2024-03-28T23:51:51Z","published":"2024-03-28T23:51:51Z","title":"Towards Long Term SLAM on Thermal Imagery","summary":" Visual SLAM with thermal imagery, and other low contrast visually degraded\nenvironments such as underwater, or in areas dominated by snow and ice, remain\na difficult problem for many state of the art (SOTA) algorithms. In addition to\nchallenging front-end data association, thermal imagery presents an additional\ndifficulty for long term relocalization and map reuse. The relative\ntemperatures of objects in thermal imagery change dramatically from day to\nnight. Feature descriptors typically used for relocalization in SLAM are unable\nto maintain consistency over these diurnal changes. We show that learned\nfeature descriptors can be used within existing Bag of Word based localization\nschemes to dramatically improve place recognition across large temporal gaps in\nthermal imagery. In order to demonstrate the effectiveness of our trained\nvocabulary, we have developed a baseline SLAM system, integrating learned\nfeatures and matching into a classical SLAM algorithm. Our system demonstrates\ngood local tracking on challenging thermal imagery, and relocalization that\novercomes dramatic day to night thermal appearance changes. Our code and\ndatasets are available here:\nhttps://github.com/neufieldrobotics/IRSLAM_Baseline\n","authors":["Colin Keil","Aniket Gupta","Pushyami Kaveti","Hanumant Singh"],"pdf_url":"https://arxiv.org/pdf/2403.19885v1.pdf","comment":"8 pages, 7 figures, Submitted to IROS 2024"},{"id":"http://arxiv.org/abs/2403.19882v1","updated":"2024-03-28T23:31:59Z","published":"2024-03-28T23:31:59Z","title":"Enhancing Efficiency in Vision Transformer Networks: Design Techniques\n and Insights","summary":" Intrigued by the inherent ability of the human visual system to identify\nsalient regions in complex scenes, attention mechanisms have been seamlessly\nintegrated into various Computer Vision (CV) tasks. Building upon this\nparadigm, Vision Transformer (ViT) networks exploit attention mechanisms for\nimproved efficiency. This review navigates the landscape of redesigned\nattention mechanisms within ViTs, aiming to enhance their performance. This\npaper provides a comprehensive exploration of techniques and insights for\ndesigning attention mechanisms, systematically reviewing recent literature in\nthe field of CV. This survey begins with an introduction to the theoretical\nfoundations and fundamental concepts underlying attention mechanisms. We then\npresent a systematic taxonomy of various attention mechanisms within ViTs,\nemploying redesigned approaches. A multi-perspective categorization is proposed\nbased on their application, objectives, and the type of attention applied. The\nanalysis includes an exploration of the novelty, strengths, weaknesses, and an\nin-depth evaluation of the different proposed strategies. 
This culminates in\nthe development of taxonomies that highlight key properties and contributions.\nFinally, we gather the reviewed studies along with their available open-source\nimplementations at our\n\\href{https://github.com/mindflow-institue/Awesome-Attention-Mechanism-in-Medical-Imaging}{GitHub}\\footnote{\\url{https://github.com/xmindflow/Awesome-Attention-Mechanism-in-Medical-Imaging}}.\nWe aim to regularly update it with the most recent relevant papers.\n","authors":["Moein Heidari","Reza Azad","Sina Ghorbani Kolahi","René Arimond","Leon Niggemeier","Alaa Sulaiman","Afshin Bozorgpour","Ehsan Khodapanah Aghdam","Amirhossein Kazerouni","Ilker Hacihaliloglu","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2403.19882v1.pdf","comment":"Submitted to Computational Visual Media Journal"},{"id":"http://arxiv.org/abs/2403.19880v1","updated":"2024-03-28T23:26:45Z","published":"2024-03-28T23:26:45Z","title":"Vision-Language Synthetic Data Enhances Echocardiography Downstream\n Tasks","summary":" High-quality, large-scale data is essential for robust deep learning models\nin medical applications, particularly ultrasound image analysis. Diffusion\nmodels facilitate high-fidelity medical image generation, reducing the costs\nassociated with acquiring and annotating new images. This paper utilizes recent\nvision-language models to produce diverse and realistic synthetic\nechocardiography image data, preserving key features of the original images\nguided by textual and semantic label maps. Specifically, we investigate three\npotential avenues: unconditional generation, generation guided by text, and a\nhybrid approach incorporating both textual and semantic supervision. We show\nthat the rich contextual information present in the synthesized data\npotentially enhances the accuracy and interpretability of downstream tasks,\nsuch as echocardiography segmentation and classification with improved metrics\nand faster convergence. Our implementation with checkpoints, prompts, and the\ncreated synthetic dataset will be publicly available at\n\\href{https://github.com/Pooria90/DiffEcho}{GitHub}.\n","authors":["Pooria Ashrafian","Milad Yazdani","Moein Heidari","Dena Shahriari","Ilker Hacihaliloglu"],"pdf_url":"https://arxiv.org/pdf/2403.19880v1.pdf","comment":"Submitted as a conference paper to MICCAI 2024"},{"id":"http://arxiv.org/abs/2310.01779v3","updated":"2024-03-28T22:27:12Z","published":"2023-10-03T04:01:27Z","title":"HallE-Control: Controlling Object Hallucination in Large Multimodal\n Models","summary":" Current Large Multimodal Models (LMMs) achieve remarkable progress, yet there\nremains significant uncertainty regarding their ability to accurately apprehend\nvisual details, that is, in performing detailed captioning. To address this, we\nintroduce $\\textit{CCEval}$, a GPT-4 assisted evaluation method for detailed\ncaptioning. Interestingly, while LMMs demonstrate minimal object existence\nhallucination in existing VQA benchmarks, our proposed evaluation reveals\ncontinued susceptibility to such hallucinations. In this paper, we make the\nfirst attempt to investigate such hallucination from different aspects,\nincluding image resolution, the language decoder size, and instruction data\namount, quality, granularity. 
Our findings underscore the unwarranted inference\nwhen the language description includes details at a finer object granularity\nthan what the vision module can ground or verify, thus inducing hallucination.\nTo control such hallucinations, we further attribute the reliability of\ncaptioning to contextual knowledge (involving only contextually grounded\nobjects) and parametric knowledge (containing objects inferred by the model).\nThus, we introduce $\\textit{HallE-Control}$, a controllable LMM in terms of\n$\\textbf{Hall}$ucination in object $\\textbf{E}$xistence. HallE-Control can\ncondition the captioning to shift between (i) exclusively depicting contextual\nknowledge for grounded objects and (ii) blending it with parametric knowledge\nto imagine inferred objects. Our method reduces hallucination by 44% compared\nto LLaVA$_{7B}$ and maintains the object coverage.\n","authors":["Bohan Zhai","Shijia Yang","Chenfeng Xu","Sheng Shen","Kurt Keutzer","Chunyuan Li","Manling Li"],"pdf_url":"https://arxiv.org/pdf/2310.01779v3.pdf","comment":"Our code is publicly available at\n https://github.com/bronyayang/HallE_Control"},{"id":"http://arxiv.org/abs/2403.19866v1","updated":"2024-03-28T22:25:05Z","published":"2024-03-28T22:25:05Z","title":"Is Synthetic Image Useful for Transfer Learning? An Investigation into\n Data Generation, Volume, and Utilization","summary":" Synthetic image data generation represents a promising avenue for training\ndeep learning models, particularly in the realm of transfer learning, where\nobtaining real images within a specific domain can be prohibitively expensive\ndue to privacy and intellectual property considerations. This work delves into\nthe generation and utilization of synthetic images derived from text-to-image\ngenerative models in facilitating transfer learning paradigms. Despite the high\nvisual fidelity of the generated images, we observe that their naive\nincorporation into existing real-image datasets does not consistently enhance\nmodel performance due to the inherent distribution gap between synthetic and\nreal images. To address this issue, we introduce a novel two-stage framework\ncalled bridged transfer, which initially employs synthetic images for\nfine-tuning a pre-trained model to improve its transferability and subsequently\nuses real data for rapid adaptation. Alongside, we propose a dataset style\ninversion strategy to improve the stylistic alignment between synthetic and\nreal images. Our proposed methods are evaluated across 10 different datasets\nand 5 distinct models, demonstrating consistent improvements, with up to 30%\naccuracy increase on classification tasks. Intriguingly, we note that the\nenhancements were not yet saturated, indicating that the benefits may further\nincrease with an expanded volume of synthetic data.\n","authors":["Yuhang Li","Xin Dong","Chen Chen","Jingtao Li","Yuxin Wen","Michael Spranger","Lingjuan Lyu"],"pdf_url":"https://arxiv.org/pdf/2403.19866v1.pdf","comment":"ICLR24 Score 6865\n https://openreview.net/forum?id=CjPt1AC6w0&referrer=%5Bthe%20profile%20of%20Chen%20Chen%5D(%2Fprofile%3Fid%3D~Chen_Chen20)"},{"id":"http://arxiv.org/abs/2403.19863v1","updated":"2024-03-28T22:17:19Z","published":"2024-03-28T22:17:19Z","title":"DeNetDM: Debiasing by Network Depth Modulation","summary":" When neural networks are trained on biased datasets, they tend to\ninadvertently learn spurious correlations, leading to challenges in achieving\nstrong generalization and robustness. 
Current approaches to address such biases\ntypically involve utilizing bias annotations, reweighting based on pseudo-bias\nlabels, or enhancing diversity within bias-conflicting data points through\naugmentation techniques. We introduce DeNetDM, a novel debiasing method based\non the observation that shallow neural networks prioritize learning core\nattributes, while deeper ones emphasize biases when tasked with acquiring\ndistinct information. Using a training paradigm derived from Product of\nExperts, we create both biased and debiased branches with deep and shallow\narchitectures and then distill knowledge to produce the target debiased model.\nExtensive experiments and analyses demonstrate that our approach outperforms\ncurrent debiasing techniques, achieving a notable improvement of around 5% in\nthree datasets, encompassing both synthetic and real-world data. Remarkably,\nDeNetDM accomplishes this without requiring annotations pertaining to bias\nlabels or bias types, while still delivering performance on par with supervised\ncounterparts. Furthermore, our approach effectively harnesses the diversity of\nbias-conflicting points within the data, surpassing previous methods and\nobviating the need for explicit augmentation-based methods to enhance the\ndiversity of such bias-conflicting points. The source code will be available\nupon acceptance.\n","authors":["Silpa Vadakkeeveetil Sreelatha","Adarsh Kappiyath","Anjan Dutta"],"pdf_url":"https://arxiv.org/pdf/2403.19863v1.pdf","comment":"23 pages including supplementary"},{"id":"http://arxiv.org/abs/2312.10144v3","updated":"2024-03-28T21:32:10Z","published":"2023-12-15T19:00:07Z","title":"Data-Efficient Multimodal Fusion on a Single GPU","summary":" The goal of multimodal alignment is to learn a single latent space that is\nshared between multimodal inputs. The most powerful models in this space have\nbeen trained using massive datasets of paired inputs and large-scale\ncomputational resources, making them prohibitively expensive to train in many\npractical scenarios. We surmise that existing unimodal encoders pre-trained on\nlarge amounts of unimodal data should provide an effective bootstrap to create\nmultimodal models from unimodal ones at much lower costs. We therefore propose\nFuseMix, a multimodal augmentation scheme that operates on the latent spaces of\narbitrary pre-trained unimodal encoders. Using FuseMix for multimodal\nalignment, we achieve competitive performance -- and in certain cases\noutperform state-of-the art methods -- in both image-text and audio-text\nretrieval, with orders of magnitude less compute and data: for example, we\noutperform CLIP on the Flickr30K text-to-image retrieval task with $\\sim \\!\n600\\times$ fewer GPU days and $\\sim \\! 80\\times$ fewer image-text pairs.\nAdditionally, we show how our method can be applied to convert pre-trained\ntext-to-image generative models into audio-to-image ones. Code is available at:\nhttps://github.com/layer6ai-labs/fusemix.\n","authors":["Noël Vouitsis","Zhaoyan Liu","Satya Krishna Gorti","Valentin Villecroze","Jesse C. Cresswell","Guangwei Yu","Gabriel Loaiza-Ganem","Maksims Volkovs"],"pdf_url":"https://arxiv.org/pdf/2312.10144v3.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2402.17951v3","updated":"2024-03-28T21:29:56Z","published":"2024-02-28T00:20:25Z","title":"QN-Mixer: A Quasi-Newton MLP-Mixer Model for Sparse-View CT\n Reconstruction","summary":" Inverse problems span across diverse fields. 
In medical contexts, computed\ntomography (CT) plays a crucial role in reconstructing a patient's internal\nstructure, presenting challenges due to artifacts caused by inherently\nill-posed inverse problems. Previous research advanced image quality via\npost-processing and deep unrolling algorithms but faces challenges, such as\nextended convergence times with ultra-sparse data. Despite enhancements,\nresulting images often show significant artifacts, limiting their effectiveness\nfor real-world diagnostic applications. We aim to explore deep second-order\nunrolling algorithms for solving imaging inverse problems, emphasizing their\nfaster convergence and lower time complexity compared to common first-order\nmethods like gradient descent. In this paper, we introduce QN-Mixer, an\nalgorithm based on the quasi-Newton approach. We use learned parameters through\nthe BFGS algorithm and introduce Incept-Mixer, an efficient neural architecture\nthat serves as a non-local regularization term, capturing long-range\ndependencies within images. To address the computational demands typically\nassociated with quasi-Newton algorithms that require full Hessian matrix\ncomputations, we present a memory-efficient alternative. Our approach\nintelligently downsamples gradient information, significantly reducing\ncomputational requirements while maintaining performance. The approach is\nvalidated through experiments on the sparse-view CT problem, involving various\ndatasets and scanning protocols, and is compared with post-processing and deep\nunrolling state-of-the-art approaches. Our method outperforms existing\napproaches and achieves state-of-the-art performance in terms of SSIM and PSNR,\nall while reducing the number of unrolling iterations required.\n","authors":["Ishak Ayad","Nicolas Larue","Maï K. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2402.17951v3.pdf","comment":"Accepted at CVPR 2024. Project page:\n https://towzeur.github.io/QN-Mixer/"},{"id":"http://arxiv.org/abs/2312.00598v2","updated":"2024-03-28T21:29:55Z","published":"2023-12-01T14:03:30Z","title":"Learning from One Continuous Video Stream","summary":" We introduce a framework for online learning from a single continuous video\nstream -- the way people and animals learn, without mini-batches, data\naugmentation or shuffling. This poses great challenges given the high\ncorrelation between consecutive video frames and there is very little prior\nwork on it. Our framework allows us to do a first deep dive into the topic and\nincludes a collection of streams and tasks composed from two existing video\ndatasets, plus methodology for performance evaluation that considers both\nadaptation and generalization. We employ pixel-to-pixel modelling as a\npractical and flexible way to switch between pre-training and single-stream\nevaluation as well as between arbitrary tasks, without ever requiring changes\nto models and always using the same pixel loss. Equipped with this framework we\nobtained large single-stream learning gains from pre-training with a novel\nfamily of future prediction tasks, found that momentum hurts, and that the pace\nof weight updates matters. 
The combination of these insights leads to matching\nthe performance of IID learning with batch size 1, when using the same\narchitecture and without costly replay buffers.\n","authors":["João Carreira","Michael King","Viorica Pătrăucean","Dilara Gokay","Cătălin Ionescu","Yi Yang","Daniel Zoran","Joseph Heyward","Carl Doersch","Yusuf Aytar","Dima Damen","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2312.00598v2.pdf","comment":"CVPR camera ready version"},{"id":"http://arxiv.org/abs/2403.17343v3","updated":"2024-03-28T21:28:00Z","published":"2024-03-26T03:05:20Z","title":"Residual-based Language Models are Free Boosters for Biomedical Imaging","summary":" In this study, we uncover the unexpected efficacy of residual-based large\nlanguage models (LLMs) as part of encoders for biomedical imaging tasks, a\ndomain traditionally devoid of language or textual data. The approach diverges\nfrom established methodologies by utilizing a frozen transformer block,\nextracted from pre-trained LLMs, as an innovative encoder layer for the direct\nprocessing of visual tokens. This strategy represents a significant departure\nfrom the standard multi-modal vision-language frameworks, which typically hinge\non language-driven prompts and inputs. We found that these LLMs could boost\nperformance across a spectrum of biomedical imaging applications, including\nboth 2D and 3D visual classification tasks, serving as plug-and-play boosters.\nMore interestingly, as a byproduct, we found that the proposed framework\nachieved superior performance, setting new state-of-the-art results on\nextensive, standardized datasets in MedMNIST-2D and 3D. Through this work, we\naim to open new avenues for employing LLMs in biomedical imaging and enriching\nthe understanding of their potential in this specialized domain.\n","authors":["Zhixin Lai","Jing Wu","Suiyao Chen","Yucheng Zhou","Naira Hovakimyan"],"pdf_url":"https://arxiv.org/pdf/2403.17343v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19838v1","updated":"2024-03-28T21:18:33Z","published":"2024-03-28T21:18:33Z","title":"Multi-Frame, Lightweight & Efficient Vision-Language Models for Question\n Answering in Autonomous Driving","summary":" Vision-Language Models (VLMs) and Multi-Modal Language models (MMLMs) have\nbecome prominent in autonomous driving research, as these models can provide\ninterpretable textual reasoning and responses for end-to-end autonomous driving\nsafety tasks using traffic scene images and other data modalities. However,\ncurrent approaches to these systems use expensive large language model (LLM)\nbackbones and image encoders, making such systems unsuitable for real-time\nautonomous driving systems where tight memory constraints exist and fast\ninference time is necessary. To address these previous issues, we develop\nEM-VLM4AD, an efficient, lightweight, multi-frame vision language model which\nperforms Visual Question Answering for autonomous driving. In comparison to\nprevious approaches, EM-VLM4AD requires at least 10 times less memory and\nfloating point operations, while also achieving higher BLEU-4, METEOR, CIDEr,\nand ROGUE scores than the existing baseline on the DriveLM dataset. EM-VLM4AD\nalso exhibits the ability to extract relevant information from traffic views\nrelated to prompts and can answer questions for various autonomous driving\nsubtasks. 
We release our code to train and evaluate our model at\nhttps://github.com/akshaygopalkr/EM-VLM4AD.\n","authors":["Akshay Gopalkrishnan","Ross Greer","Mohan Trivedi"],"pdf_url":"https://arxiv.org/pdf/2403.19838v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.19837v1","updated":"2024-03-28T21:15:38Z","published":"2024-03-28T21:15:38Z","title":"Concept-based Analysis of Neural Networks via Vision-Language Models","summary":" Formal analysis of vision-based deep neural networks (DNNs) is highly\ndesirable but it is very challenging due to the difficulty of expressing formal\nspecifications for vision tasks and the lack of efficient verification\nprocedures. In this paper, we propose to leverage emerging multimodal,\nvision-language, foundation models (VLMs) as a lens through which we can reason\nabout vision models. VLMs have been trained on a large body of images\naccompanied by their textual description, and are thus implicitly aware of\nhigh-level, human-understandable concepts describing the images. We describe a\nlogical specification language $\\texttt{Con}_{\\texttt{spec}}$ designed to\nfacilitate writing specifications in terms of these concepts. To define and\nformally check $\\texttt{Con}_{\\texttt{spec}}$ specifications, we leverage a\nVLM, which provides a means to encode and efficiently check natural-language\nproperties of vision models. We demonstrate our techniques on a ResNet-based\nclassifier trained on the RIVAL-10 dataset leveraging CLIP as the multimodal\nmodel.\n","authors":["Ravi Mangal","Nina Narodytska","Divya Gopinath","Boyue Caroline Hu","Anirban Roy","Susmit Jha","Corina Pasareanu"],"pdf_url":"https://arxiv.org/pdf/2403.19837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17166v2","updated":"2024-03-28T20:49:55Z","published":"2023-09-29T11:59:57Z","title":"Advances in Kidney Biopsy Lesion Assessment through Dense Instance\n Segmentation","summary":" Renal biopsies are the gold standard for diagnosis of kidney diseases. Lesion\nscores made by renal pathologists are semi-quantitative and exhibit high\ninter-observer variability. Automating lesion classification within segmented\nanatomical structures can provide decision support in quantification analysis\nand reduce the inter-observer variability. Nevertheless, classifying lesions in\nregions-of-interest (ROIs) is clinically challenging due to (a) a large amount\nof densely packed anatomical objects (up to 1000), (b) class imbalance across\ndifferent compartments (at least 3), (c) significant variation in object scales\n(i.e. sizes and shapes), and (d) the presence of multi-label lesions per\nanatomical structure. Existing models lack the capacity to address these\ncomplexities efficiently and generically. This paper presents \\textbf{a\ngeneralized technical solution} for large-scale, multi-source datasets with\ndiverse lesions. Our approach utilizes two sub-networks: dense instance\nsegmentation and lesion classification. We introduce \\textbf{DiffRegFormer}, an\nend-to-end dense instance segmentation model designed for multi-class,\nmulti-scale objects within ROIs. Combining diffusion models, transformers, and\nRCNNs, DiffRegFormer efficiently recognizes over 500 objects across three\nanatomical classes (glomeruli, tubuli, arteries) within ROIs on a single NVIDIA\nGeForce RTX 3090 GPU. On a dataset of 303 ROIs (from 148 Jones' silver-stained\nrenal WSIs), it outperforms state of art models, achieving AP of 52.1\\%\n(detection) and 46.8\\% (segmentation). 
Our lesion classification sub-network\nachieves 89.2\\% precision and 64.6\\% recall on 21889 object patches (from the\n303 ROIs). Importantly, the model demonstrates direct domain transfer to\nPAS-stained WSIs without fine-tuning.\n","authors":["Zhan Xiong","Junling He","Pieter Valkema","Tri Q. Nguyen","Maarten Naesens","Jesper Kers","Fons J. Verbeek"],"pdf_url":"https://arxiv.org/pdf/2309.17166v2.pdf","comment":"16 pages, 15 figures, 6 tables, Journal"},{"id":"http://arxiv.org/abs/2403.13199v2","updated":"2024-03-28T20:06:38Z","published":"2024-03-19T23:23:35Z","title":"DecentNeRFs: Decentralized Neural Radiance Fields from Crowdsourced\n Images","summary":" Neural radiance fields (NeRFs) show potential for transforming images\ncaptured worldwide into immersive 3D visual experiences. However, most of this\ncaptured visual data remains siloed in our camera rolls as these images contain\npersonal details. Even if made public, the problem of learning 3D\nrepresentations of billions of scenes captured daily in a centralized manner is\ncomputationally intractable. Our approach, DecentNeRF, is the first attempt at\ndecentralized, crowd-sourced NeRFs that require $\\sim 10^4\\times$ less server\ncomputing for a scene than a centralized approach. Instead of sending the raw\ndata, our approach requires users to send a 3D representation, distributing the\nhigh computation cost of training centralized NeRFs between the users. It\nlearns photorealistic scene representations by decomposing users' 3D views into\npersonal and global NeRFs and a novel optimally weighted aggregation of only\nthe latter. We validate the advantage of our approach to learn NeRFs with\nphotorealism and minimal server computation cost on structured synthetic and\nreal-world photo tourism datasets. We further analyze how secure aggregation of\nglobal NeRFs in DecentNeRF minimizes the undesired reconstruction of personal\ncontent by the server.\n","authors":["Zaid Tasneem","Akshat Dave","Abhishek Singh","Kushagra Tiwary","Praneeth Vepakomma","Ashok Veeraraghavan","Ramesh Raskar"],"pdf_url":"https://arxiv.org/pdf/2403.13199v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02402v2","updated":"2024-03-28T20:01:02Z","published":"2023-11-04T13:28:06Z","title":"Hybrid quantum image classification and federated learning for hepatic\n steatosis diagnosis","summary":" In the realm of liver transplantation, accurately determining hepatic\nsteatosis levels is crucial. Recognizing the essential need for improved\ndiagnostic precision, particularly for optimizing diagnosis time by swiftly\nhandling easy-to-solve cases and allowing the expert time to focus on more\ncomplex cases, this study aims to develop cutting-edge algorithms that enhance\nthe classification of liver biopsy images. Additionally, the challenge of\nmaintaining data privacy arises when creating automated algorithmic solutions,\nas sharing patient data between hospitals is restricted, further complicating\nthe development and validation process. This research tackles diagnostic\naccuracy by leveraging novel techniques from the rapidly evolving field of\nquantum machine learning, known for their superior generalization abilities.\nConcurrently, it addresses privacy concerns through the implementation of\nprivacy-conscious collaborative machine learning with federated learning. We\nintroduce a hybrid quantum neural network model that leverages real-world\nclinical data to assess non-alcoholic liver steatosis accurately. 
This model\nachieves an image classification accuracy of 97%, surpassing traditional\nmethods by 1.8%. Moreover, by employing a federated learning approach that\nallows data from different clients to be shared while ensuring privacy, we\nmaintain an accuracy rate exceeding 90%. This initiative marks a significant\nstep towards a scalable, collaborative, efficient, and dependable computational\nframework that aids clinical pathologists in their daily diagnostic tasks.\n","authors":["Luca Lusnig","Asel Sagingalieva","Mikhail Surmach","Tatjana Protasevich","Ovidiu Michiu","Joseph McLoughlin","Christopher Mansell","Graziano de' Petris","Deborah Bonazza","Fabrizio Zanconati","Alexey Melnikov","Fabio Cavalli"],"pdf_url":"https://arxiv.org/pdf/2311.02402v2.pdf","comment":"13 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2403.19811v1","updated":"2024-03-28T19:45:35Z","published":"2024-03-28T19:45:35Z","title":"X-MIC: Cross-Modal Instance Conditioning for Egocentric Action\n Generalization","summary":" Lately, there has been growing interest in adapting vision-language models\n(VLMs) to image and third-person video classification due to their success in\nzero-shot recognition. However, the adaptation of these models to egocentric\nvideos has been largely unexplored. To address this gap, we propose a simple\nyet effective cross-modal adaptation framework, which we call X-MIC. Using a\nvideo adapter, our pipeline learns to align frozen text embeddings to each\negocentric video directly in the shared embedding space. Our novel adapter\narchitecture retains and improves generalization of the pre-trained VLMs by\ndisentangling learnable temporal modeling and frozen visual encoder. This\nresults in an enhanced alignment of text embeddings to each egocentric video,\nleading to a significant improvement in cross-dataset generalization. We\nevaluate our approach on the Epic-Kitchens, Ego4D, and EGTEA datasets for\nfine-grained cross-dataset action generalization, demonstrating the\neffectiveness of our method. Code is available at\nhttps://github.com/annusha/xmic\n","authors":["Anna Kukleva","Fadime Sener","Edoardo Remelli","Bugra Tekin","Eric Sauser","Bernt Schiele","Shugao Ma"],"pdf_url":"https://arxiv.org/pdf/2403.19811v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19787v1","updated":"2024-03-28T19:11:26Z","published":"2024-03-28T19:11:26Z","title":"JIST: Joint Image and Sequence Training for Sequential Visual Place\n Recognition","summary":" Visual Place Recognition aims at recognizing previously visited places by\nrelying on visual clues, and it is used in robotics applications for SLAM and\nlocalization. Since typically a mobile robot has access to a continuous stream\nof frames, this task is naturally cast as a sequence-to-sequence localization\nproblem. Nevertheless, obtaining sequences of labelled data is much more\nexpensive than collecting isolated images, which can be done in an automated\nway with little supervision. As a mitigation to this problem, we propose a\nnovel Joint Image and Sequence Training protocol (JIST) that leverages large\nuncurated sets of images through a multi-task learning framework. With JIST we\nalso introduce SeqGeM, an aggregation layer that revisits the popular GeM\npooling to produce a single robust and compact embedding from a sequence of\nsingle-frame embeddings. 
We show that our model is able to outperform previous\nstate of the art while being faster, using 8 times smaller descriptors, having\na lighter architecture and allowing to process sequences of various lengths.\nCode is available at https://github.com/ga1i13o/JIST\n","authors":["Gabriele Berton","Gabriele Trivigno","Barbara Caputo","Carlo Masone"],"pdf_url":"https://arxiv.org/pdf/2403.19787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19786v1","updated":"2024-03-28T19:10:54Z","published":"2024-03-28T19:10:54Z","title":"Zero-shot Prompt-based Video Encoder for Surgical Gesture Recognition","summary":" Purpose: Surgical video is an important data stream for gesture recognition.\nThus, robust visual encoders for those data-streams is similarly important.\nMethods: Leveraging the Bridge-Prompt framework, we fine-tune a pre-trained\nvision-text model (CLIP) for gesture recognition in surgical videos. This can\nutilize extensive outside video data such as text, but also make use of label\nmeta-data and weakly supervised contrastive losses. Results: Our experiments\nshow that prompt-based video encoder outperforms standard encoders in surgical\ngesture recognition tasks. Notably, it displays strong performance in zero-shot\nscenarios, where gestures/tasks that were not provided during the encoder\ntraining phase are included in the prediction phase. Additionally, we measure\nthe benefit of inclusion text descriptions in the feature extractor training\nschema. Conclusion: Bridge-Prompt and similar pre-trained+fine-tuned video\nencoder models present significant visual representation for surgical robotics,\nespecially in gesture recognition tasks. Given the diverse range of surgical\ntasks (gestures), the ability of these models to zero-shot transfer without the\nneed for any task (gesture) specific retraining makes them invaluable.\n","authors":["Mingxing Rao","Yinhong Qin","Soheil Kolouri","Jie Ying Wu","Daniel Moyer"],"pdf_url":"https://arxiv.org/pdf/2403.19786v1.pdf","comment":"17 pages,4 figures, 7 tables, IPCAI 2024"},{"id":"http://arxiv.org/abs/2403.19782v1","updated":"2024-03-28T19:07:26Z","published":"2024-03-28T19:07:26Z","title":"ENet-21: An Optimized light CNN Structure for Lane Detection","summary":" Lane detection for autonomous vehicles is an important concept, yet it is a\nchallenging issue of driver assistance systems in modern vehicles. The\nemergence of deep learning leads to significant progress in self-driving cars.\nConventional deep learning-based methods handle lane detection problems as a\nbinary segmentation task and determine whether a pixel belongs to a line. These\nmethods rely on the assumption of a fixed number of lanes, which does not\nalways work. This study aims to develop an optimal structure for the lane\ndetection problem, offering a promising solution for driver assistance features\nin modern vehicles by utilizing a machine learning method consisting of binary\nsegmentation and Affinity Fields that can manage varying numbers of lanes and\nlane change scenarios. In this approach, the Convolutional Neural Network\n(CNN), is selected as a feature extractor, and the final output is obtained\nthrough clustering of the semantic segmentation and Affinity Field outputs. 
Our\nmethod uses less complex CNN architecture than exi\n","authors":["Seyed Rasoul Hosseini","Mohammad Teshnehlab"],"pdf_url":"https://arxiv.org/pdf/2403.19782v1.pdf","comment":"The paper is under review by Soft Computing journal"},{"id":"http://arxiv.org/abs/2403.19780v1","updated":"2024-03-28T19:06:37Z","published":"2024-03-28T19:06:37Z","title":"Mitigating Motion Blur in Neural Radiance Fields with Events and Frames","summary":" Neural Radiance Fields (NeRFs) have shown great potential in novel view\nsynthesis. However, they struggle to render sharp images when the data used for\ntraining is affected by motion blur. On the other hand, event cameras excel in\ndynamic scenes as they measure brightness changes with microsecond resolution\nand are thus only marginally affected by blur. Recent methods attempt to\nenhance NeRF reconstructions under camera motion by fusing frames and events.\nHowever, they face challenges in recovering accurate color content or constrain\nthe NeRF to a set of predefined camera poses, harming reconstruction quality in\nchallenging conditions. This paper proposes a novel formulation addressing\nthese issues by leveraging both model- and learning-based modules. We\nexplicitly model the blur formation process, exploiting the event double\nintegral as an additional model-based prior. Additionally, we model the\nevent-pixel response using an end-to-end learnable response function, allowing\nour method to adapt to non-idealities in the real event-camera sensor. We show,\non synthetic and real data, that the proposed approach outperforms existing\ndeblur NeRFs that use only frames as well as those that combine frames and\nevents by +6.13dB and +2.48dB, respectively.\n","authors":["Marco Cannici","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2403.19780v1.pdf","comment":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR),\n 2024"},{"id":"http://arxiv.org/abs/2403.19776v1","updated":"2024-03-28T18:58:43Z","published":"2024-03-28T18:58:43Z","title":"CLoRA: A Contrastive Approach to Compose Multiple LoRA Models","summary":" Low-Rank Adaptations (LoRAs) have emerged as a powerful and popular technique\nin the field of image generation, offering a highly effective way to adapt and\nrefine pre-trained deep learning models for specific tasks without the need for\ncomprehensive retraining. By employing pre-trained LoRA models, such as those\nrepresenting a specific cat and a particular dog, the objective is to generate\nan image that faithfully embodies both animals as defined by the LoRAs.\nHowever, the task of seamlessly blending multiple concept LoRAs to capture a\nvariety of concepts in one image proves to be a significant challenge. Common\napproaches often fall short, primarily because the attention mechanisms within\ndifferent LoRA models overlap, leading to scenarios where one concept may be\ncompletely ignored (e.g., omitting the dog) or where concepts are incorrectly\ncombined (e.g., producing an image of two cats instead of one cat and one dog).\nTo overcome these issues, CLoRA addresses them by updating the attention maps\nof multiple LoRA models and leveraging them to create semantic masks that\nfacilitate the fusion of latent representations. Our method enables the\ncreation of composite images that truly reflect the characteristics of each\nLoRA, successfully merging multiple concepts or styles. 
Our comprehensive\nevaluations, both qualitative and quantitative, demonstrate that our approach\noutperforms existing methodologies, marking a significant advancement in the\nfield of image generation with LoRAs. Furthermore, we share our source code,\nbenchmark dataset, and trained LoRA models to promote further research on this\ntopic.\n","authors":["Tuna Han Salih Meral","Enis Simsar","Federico Tombari","Pinar Yanardag"],"pdf_url":"https://arxiv.org/pdf/2403.19776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03629v2","updated":"2024-03-28T18:50:50Z","published":"2023-10-05T16:03:25Z","title":"Wasserstein Distortion: Unifying Fidelity and Realism","summary":" We introduce a distortion measure for images, Wasserstein distortion, that\nsimultaneously generalizes pixel-level fidelity on the one hand and realism or\nperceptual quality on the other. We show how Wasserstein distortion reduces to\na pure fidelity constraint or a pure realism constraint under different\nparameter choices and discuss its metric properties. Pairs of images that are\nclose under Wasserstein distortion illustrate its utility. In particular, we\ngenerate random textures that have high fidelity to a reference texture in one\nlocation of the image and smoothly transition to an independent realization of\nthe texture as one moves away from this point. Wasserstein distortion attempts\nto generalize and unify prior work on texture generation, image realism and\ndistortion, and models of the early human visual system, in the form of an\noptimizable metric in the mathematical sense.\n","authors":["Yang Qiu","Aaron B. Wagner","Johannes Ballé","Lucas Theis"],"pdf_url":"https://arxiv.org/pdf/2310.03629v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19773v1","updated":"2024-03-28T18:50:19Z","published":"2024-03-28T18:50:19Z","title":"ShapeFusion: A 3D diffusion model for localized shape editing","summary":" In the realm of 3D computer vision, parametric models have emerged as a\nground-breaking methodology for the creation of realistic and expressive 3D\navatars. Traditionally, they rely on Principal Component Analysis (PCA), given\nits ability to decompose data to an orthonormal space that maximally captures\nshape variations. However, due to the orthogonality constraints and the global\nnature of PCA's decomposition, these models struggle to perform localized and\ndisentangled editing of 3D shapes, which severely affects their use in\napplications requiring fine control such as face sculpting. In this paper, we\nleverage diffusion models to enable diverse and fully localized edits on 3D\nmeshes, while completely preserving the un-edited regions. We propose an\neffective diffusion masking training strategy that, by design, facilitates\nlocalized manipulation of any shape region, without being limited to predefined\nregions or to sparse sets of predefined control vertices. Following our\nframework, a user can explicitly set their manipulation region of choice and\ndefine an arbitrary set of vertices as handles to edit a 3D mesh. Compared to\nthe current state-of-the-art our method leads to more interpretable shape\nmanipulations than methods relying on latent code state, greater localization\nand generation diversity while offering faster inference than optimization\nbased approaches. 
Project page: https://rolpotamias.github.io/Shapefusion/\n","authors":["Rolandos Alexandros Potamias","Michail Tarasiou Stylianos Ploumpis","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2403.19773v1.pdf","comment":"Project Page: https://rolpotamias.github.io/Shapefusion/"},{"id":"http://arxiv.org/abs/2210.06186v3","updated":"2024-03-28T18:49:33Z","published":"2022-10-12T13:15:54Z","title":"GOTCHA: Real-Time Video Deepfake Detection via Challenge-Response","summary":" With the rise of AI-enabled Real-Time Deepfakes (RTDFs), the integrity of\nonline video interactions has become a growing concern. RTDFs have now made it\nfeasible to replace an imposter's face with their victim in live video\ninteractions. Such advancement in deepfakes also coaxes detection to rise to\nthe same standard. However, existing deepfake detection techniques are\nasynchronous and hence ill-suited for RTDFs. To bridge this gap, we propose a\nchallenge-response approach that establishes authenticity in live settings. We\nfocus on talking-head style video interaction and present a taxonomy of\nchallenges that specifically target inherent limitations of RTDF generation\npipelines. We evaluate representative examples from the taxonomy by collecting\na unique dataset comprising eight challenges, which consistently and visibly\ndegrades the quality of state-of-the-art deepfake generators. These results are\ncorroborated both by humans and a new automated scoring function, leading to\n88.6% and 80.1% AUC, respectively. The findings underscore the promising\npotential of challenge-response systems for explainable and scalable real-time\ndeepfake detection in practical scenarios. We provide access to data and code\nat https://github.com/mittalgovind/GOTCHA-Deepfakes\n","authors":["Govind Mittal","Chinmay Hegde","Nasir Memon"],"pdf_url":"https://arxiv.org/pdf/2210.06186v3.pdf","comment":"20 pages, 19 figures, Code and data released"},{"id":"http://arxiv.org/abs/2403.19768v1","updated":"2024-03-28T18:43:25Z","published":"2024-03-28T18:43:25Z","title":"Using Deep Learning to Increase Eye-Tracking Robustness, Accuracy, and\n Precision in Virtual Reality","summary":" Algorithms for the estimation of gaze direction from mobile and video-based\neye trackers typically involve tracking a feature of the eye that moves through\nthe eye camera image in a way that covaries with the shifting gaze direction,\nsuch as the center or boundaries of the pupil. Tracking these features using\ntraditional computer vision techniques can be difficult due to partial\nocclusion and environmental reflections. Although recent efforts to use machine\nlearning (ML) for pupil tracking have demonstrated superior results when\nevaluated using standard measures of segmentation performance, little is known\nof how these networks may affect the quality of the final gaze estimate. This\nwork provides an objective assessment of the impact of several contemporary\nML-based methods for eye feature tracking when the subsequent gaze estimate is\nproduced using either feature-based or model-based methods. Metrics include the\naccuracy and precision of the gaze estimate, as well as drop-out rate.\n","authors":["Kevin Barkevich","Reynold Bailey","Gabriel J. 
Diaz"],"pdf_url":"https://arxiv.org/pdf/2403.19768v1.pdf","comment":"16 pages, 10 figures, accepted to ETRA 2024 Full Papers"},{"id":"http://arxiv.org/abs/2302.06089v5","updated":"2024-03-28T18:31:28Z","published":"2023-02-13T04:17:47Z","title":"Federated attention consistent learning models for prostate cancer\n diagnosis and Gleason grading","summary":" Artificial intelligence (AI) holds significant promise in transforming\nmedical imaging, enhancing diagnostics, and refining treatment strategies.\nHowever, the reliance on extensive multicenter datasets for training AI models\nposes challenges due to privacy concerns. Federated learning provides a\nsolution by facilitating collaborative model training across multiple centers\nwithout sharing raw data. This study introduces a federated\nattention-consistent learning (FACL) framework to address challenges associated\nwith large-scale pathological images and data heterogeneity. FACL enhances\nmodel generalization by maximizing attention consistency between local clients\nand the server model. To ensure privacy and validate robustness, we\nincorporated differential privacy by introducing noise during parameter\ntransfer. We assessed the effectiveness of FACL in cancer diagnosis and Gleason\ngrading tasks using 19,461 whole-slide images of prostate cancer from multiple\ncenters. In the diagnosis task, FACL achieved an area under the curve (AUC) of\n0.9718, outperforming seven centers with an average AUC of 0.9499 when\ncategories are relatively balanced. For the Gleason grading task, FACL attained\na Kappa score of 0.8463, surpassing the average Kappa score of 0.7379 from six\ncenters. In conclusion, FACL offers a robust, accurate, and cost-effective AI\ntraining model for prostate cancer pathology while maintaining effective data\nsafeguards.\n","authors":["Fei Kong","Xiyue Wang","Jinxi Xiang","Sen Yang","Xinran Wang","Meng Yue","Jun Zhang","Junhan Zhao","Xiao Han","Yuhan Dong","Biyue Zhu","Fang Wang","Yueping Liu"],"pdf_url":"https://arxiv.org/pdf/2302.06089v5.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2311.17693v2","updated":"2024-03-28T18:24:46Z","published":"2023-11-29T15:00:06Z","title":"Toward a Surgeon-in-the-Loop Ophthalmic Robotic Apprentice using\n Reinforcement and Imitation Learning","summary":" Robotic-assisted surgical systems have demonstrated significant potential in\nenhancing surgical precision and minimizing human errors. However, existing\nsystems lack the ability to accommodate the unique preferences and requirements\nof individual surgeons. Additionally, they primarily focus on general surgeries\n(e.g., laparoscopy) and are not suitable for highly precise microsurgeries,\nsuch as ophthalmic procedures. Thus, we propose a simulation-based image-guided\napproach for surgeon-centered autonomous agents that can adapt to the\nindividual surgeon's skill level and preferred surgical techniques during\nophthalmic cataract surgery. Our approach utilizes a simulated environment to\ntrain reinforcement and imitation learning agents guided by image data to\nperform all tasks of the incision phase of cataract surgery. By integrating the\nsurgeon's actions and preferences into the training process with the\nsurgeon-in-the-loop, our approach enables the robot to implicitly learn and\nadapt to the individual surgeon's unique approach through demonstrations. This\nresults in a more intuitive and personalized surgical experience for the\nsurgeon. 
Simultaneously, it ensures consistent performance for the autonomous\nrobotic apprentice. We define and evaluate the effectiveness of our approach\nusing our proposed metrics; and highlight the trade-off between a generic agent\nand a surgeon-centered adapted agent. Moreover, our approach has the potential\nto extend to other ophthalmic surgical procedures, opening the door to a new\ngeneration of surgeon-in-the-loop autonomous surgical robots. We provide an\nopen-source simulation framework for future development and reproducibility.\n","authors":["Amr Gomaa","Bilal Mahdy","Niko Kleer","Antonio Krüger"],"pdf_url":"https://arxiv.org/pdf/2311.17693v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09997v3","updated":"2024-03-28T18:24:20Z","published":"2023-07-19T14:10:55Z","title":"TUNeS: A Temporal U-Net with Self-Attention for Video-based Surgical\n Phase Recognition","summary":" To enable context-aware computer assistance in the operating room of the\nfuture, cognitive systems need to understand automatically which surgical phase\nis being performed by the medical team. The primary source of information for\nsurgical phase recognition is typically video, which presents two challenges:\nextracting meaningful features from the video stream and effectively modeling\ntemporal information in the sequence of visual features. For temporal modeling,\nattention mechanisms have gained popularity due to their ability to capture\nlong-range dependencies. In this paper, we explore design choices for attention\nin existing temporal models for surgical phase recognition and propose a novel\napproach that uses attention more effectively and does not require hand-crafted\nconstraints: TUNeS, an efficient and simple temporal model that incorporates\nself-attention at the core of a convolutional U-Net structure. In addition, we\npropose to train the feature extractor, a standard CNN, together with an LSTM\non preferably long video segments, i.e., with long temporal context. In our\nexperiments, almost all temporal models performed better on top of feature\nextractors that were trained with longer temporal context. On these\ncontextualized features, TUNeS achieves state-of-the-art results on the\nCholec80 and AutoLaparo datasets.\n","authors":["Isabel Funke","Dominik Rivoir","Stefanie Krell","Stefanie Speidel"],"pdf_url":"https://arxiv.org/pdf/2307.09997v3.pdf","comment":"Major revision: comparison to Temporal U-Transformer"},{"id":"http://arxiv.org/abs/2403.19738v1","updated":"2024-03-28T17:54:38Z","published":"2024-03-28T17:54:38Z","title":"MIST: Mitigating Intersectional Bias with Disentangled Cross-Attention\n Editing in Text-to-Image Diffusion Models","summary":" Diffusion-based text-to-image models have rapidly gained popularity for their\nability to generate detailed and realistic images from textual descriptions.\nHowever, these models often reflect the biases present in their training data,\nespecially impacting marginalized groups. While prior efforts to debias\nlanguage models have focused on addressing specific biases, such as racial or\ngender biases, efforts to tackle intersectional bias have been limited.\nIntersectional bias refers to the unique form of bias experienced by\nindividuals at the intersection of multiple social identities. Addressing\nintersectional bias is crucial because it amplifies the negative effects of\ndiscrimination based on race, gender, and other identities. 
In this paper, we\nintroduce a method that addresses intersectional bias in diffusion-based\ntext-to-image models by modifying cross-attention maps in a disentangled\nmanner. Our approach utilizes a pre-trained Stable Diffusion model, eliminates\nthe need for an additional set of reference images, and preserves the original\nquality for unaltered concepts. Comprehensive experiments demonstrate that our\nmethod surpasses existing approaches in mitigating both single and\nintersectional biases across various attributes. We make our source code and\ndebiased models for various attributes available to encourage fairness in\ngenerative models and to support further research.\n","authors":["Hidir Yesiltepe","Kiymet Akdemir","Pinar Yanardag"],"pdf_url":"https://arxiv.org/pdf/2403.19738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09669v3","updated":"2024-03-28T04:45:23Z","published":"2024-01-30T08:18:20Z","title":"STREAM: Spatio-TempoRal Evaluation and Analysis Metric for Video\n Generative Models","summary":" Image generative models have made significant progress in generating\nrealistic and diverse images, supported by comprehensive guidance from various\nevaluation metrics. However, current video generative models struggle to\ngenerate even short video clips, with limited tools that provide insights for\nimprovements. Current video evaluation metrics are simple adaptations of image\nmetrics by switching the embeddings with video embedding networks, which may\nunderestimate the unique characteristics of video. Our analysis reveals that\nthe widely used Frechet Video Distance (FVD) has a stronger emphasis on the\nspatial aspect than the temporal naturalness of video and is inherently\nconstrained by the input size of the embedding networks used, limiting it to 16\nframes. Additionally, it demonstrates considerable instability and diverges\nfrom human evaluations. To address the limitations, we propose STREAM, a new\nvideo evaluation metric uniquely designed to independently evaluate spatial and\ntemporal aspects. This feature allows comprehensive analysis and evaluation of\nvideo generative models from various perspectives, unconstrained by video\nlength. We provide analytical and experimental evidence demonstrating that\nSTREAM provides an effective evaluation tool for both visual and temporal\nquality of videos, offering insights into area of improvement for video\ngenerative models. To the best of our knowledge, STREAM is the first evaluation\nmetric that can separately assess the temporal and spatial aspects of videos.\nOur code is available at https://github.com/pro2nit/STREAM.\n","authors":["Pum Jun Kim","Seojun Kim","Jaejun Yoo"],"pdf_url":"https://arxiv.org/pdf/2403.09669v3.pdf","comment":"Our work is accepted to ICLR 2024"}]},"2024-03-29T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.20331v1","updated":"2024-03-29T17:59:53Z","published":"2024-03-29T17:59:53Z","title":"Unsolvable Problem Detection: Evaluating Trustworthiness of Vision\n Language Models","summary":" This paper introduces a novel and significant challenge for Vision Language\nModels (VLMs), termed Unsolvable Problem Detection (UPD). UPD examines the\nVLM's ability to withhold answers when faced with unsolvable problems in the\ncontext of Visual Question Answering (VQA) tasks. UPD encompasses three\ndistinct settings: Absent Answer Detection (AAD), Incompatible Answer Set\nDetection (IASD), and Incompatible Visual Question Detection (IVQD). 
To deeply\ninvestigate the UPD problem, extensive experiments indicate that most VLMs,\nincluding GPT-4V and LLaVA-Next-34B, struggle with our benchmarks to varying\nextents, highlighting significant room for the improvements. To address UPD, we\nexplore both training-free and training-based solutions, offering new insights\ninto their effectiveness and limitations. We hope our insights, together with\nfuture efforts within the proposed UPD settings, will enhance the broader\nunderstanding and development of more practical and reliable VLMs.\n","authors":["Atsuyuki Miyai","Jingkang Yang","Jingyang Zhang","Yifei Ming","Qing Yu","Go Irie","Yixuan Li","Hai Li","Ziwei Liu","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2403.20331v1.pdf","comment":"Code: https://github.com/AtsuMiyai/UPD"},{"id":"http://arxiv.org/abs/2403.20330v1","updated":"2024-03-29T17:59:34Z","published":"2024-03-29T17:59:34Z","title":"Are We on the Right Way for Evaluating Large Vision-Language Models?","summary":" Large vision-language models (LVLMs) have recently achieved rapid progress,\nsparking numerous studies to evaluate their multi-modal capabilities. However,\nwe dig into current evaluation works and identify two primary issues: 1) Visual\ncontent is unnecessary for many samples. The answers can be directly inferred\nfrom the questions and options, or the world knowledge embedded in LLMs. This\nphenomenon is prevalent across current benchmarks. For instance, GeminiPro\nachieves 42.9% on the MMMU benchmark without any visual input, and outperforms\nthe random choice baseline across six benchmarks over 20% on average. 2)\nUnintentional data leakage exists in LLM and LVLM training. LLM and LVLM could\nstill answer some visual-necessary questions without visual content, indicating\nthe memorizing of these samples within large-scale training data. For example,\nSphinx-X-MoE gets 43.6% on MMMU without accessing images, surpassing its LLM\nbackbone with 17.9%. Both problems lead to misjudgments of actual multi-modal\ngains and potentially misguide the study of LVLM. To this end, we present\nMMStar, an elite vision-indispensable multi-modal benchmark comprising 1,500\nsamples meticulously selected by humans. MMStar benchmarks 6 core capabilities\nand 18 detailed axes, aiming to evaluate LVLMs' multi-modal capacities with\ncarefully balanced and purified samples. These samples are first roughly\nselected from current benchmarks with an automated pipeline, human review is\nthen involved to ensure each curated sample exhibits visual dependency, minimal\ndata leakage, and requires advanced multi-modal capabilities. Moreover, two\nmetrics are developed to measure data leakage and actual performance gain in\nmulti-modal training. 
We evaluate 16 leading LVLMs on MMStar to assess their\nmulti-modal capabilities, and on 7 benchmarks with the proposed metrics to\ninvestigate their data leakage and actual multi-modal gain.\n","authors":["Lin Chen","Jinsong Li","Xiaoyi Dong","Pan Zhang","Yuhang Zang","Zehui Chen","Haodong Duan","Jiaqi Wang","Yu Qiao","Dahua Lin","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.20330v1.pdf","comment":"Project page: https://mmstar-benchmark.github.io/"},{"id":"http://arxiv.org/abs/2311.17245v5","updated":"2024-03-29T17:58:34Z","published":"2023-11-28T21:39:20Z","title":"LightGaussian: Unbounded 3D Gaussian Compression with 15x Reduction and\n 200+ FPS","summary":" Recent advancements in real-time neural rendering using point-based\ntechniques have paved the way for the widespread adoption of 3D\nrepresentations. However, foundational approaches like 3D Gaussian Splatting\ncome with a substantial storage overhead caused by growing the SfM points to\nmillions, often demanding gigabyte-level disk space for a single unbounded\nscene, posing significant scalability challenges and hindering the splatting\nefficiency.\n To address this challenge, we introduce LightGaussian, a novel method\ndesigned to transform 3D Gaussians into a more efficient and compact format.\nDrawing inspiration from the concept of Network Pruning, LightGaussian\nidentifies Gaussians that are insignificant in contributing to the scene\nreconstruction and adopts a pruning and recovery process, effectively reducing\nredundancy in Gaussian counts while preserving visual effects. Additionally,\nLightGaussian employs distillation and pseudo-view augmentation to distill\nspherical harmonics to a lower degree, allowing knowledge transfer to more\ncompact representations while maintaining reflectance. Furthermore, we propose\na hybrid scheme, VecTree Quantization, to quantize all attributes, resulting in\nlower bitwidth representations with minimal accuracy losses.\n In summary, LightGaussian achieves an averaged compression rate over 15x\nwhile boosting the FPS from 139 to 215, enabling an efficient representation of\ncomplex scenes on Mip-NeRF 360, Tank and Temple datasets.\n Project website: https://lightgaussian.github.io/\n","authors":["Zhiwen Fan","Kevin Wang","Kairun Wen","Zehao Zhu","Dejia Xu","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17245v5.pdf","comment":"16pages, 8figures"},{"id":"http://arxiv.org/abs/2310.11256v2","updated":"2024-03-29T17:50:17Z","published":"2023-10-17T13:22:36Z","title":"Gromov-Wassertein-like Distances in the Gaussian Mixture Models Space","summary":" The Gromov-Wasserstein (GW) distance is frequently used in machine learning\nto compare distributions across distinct metric spaces. Despite its utility, it\nremains computationally intensive, especially for large-scale problems.\nRecently, a novel Wasserstein distance specifically tailored for Gaussian\nmixture models and known as MW (mixture Wasserstein) has been introduced by\nseveral authors. In scenarios where data exhibit clustering, this approach\nsimplifies to a small-scale discrete optimal transport problem, which\ncomplexity depends solely on the number of Gaussian components in the GMMs.\nThis paper aims to extend MW by introducing new Gromov-type distances. These\ndistances are designed to be isometry-invariant in Euclidean spaces and are\napplicable for comparing GMMs across different dimensional spaces. 
Our first\ncontribution is the Mixture Gromov Wasserstein distance (MGW), which can be\nviewed as a Gromovized version of MW. This new distance has a straightforward\ndiscrete formulation, making it highly efficient for estimating distances\nbetween GMMs in practical applications. To facilitate the derivation of a\ntransport plan between GMMs, we present a second distance, the Embedded\nWasserstein distance (EW). This distance turns out to be closely related to\nseveral recent alternatives to Gromov-Wasserstein. We show that EW can be\nadapted to derive a distance as well as optimal transportation plans between\nGMMs. We demonstrate the efficiency of these newly proposed distances on medium\nto large-scale problems, including shape matching and hyperspectral image color\ntransfer.\n","authors":["Antoine Salmona","Julie Delon","Agnès Desolneux"],"pdf_url":"https://arxiv.org/pdf/2310.11256v2.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2310.05737v3","updated":"2024-03-29T17:44:41Z","published":"2023-10-09T14:10:29Z","title":"Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation","summary":" While Large Language Models (LLMs) are the dominant models for generative\ntasks in language, they do not perform as well as diffusion models on image and\nvideo generation. To effectively use LLMs for visual generation, one crucial\ncomponent is the visual tokenizer that maps pixel-space inputs to discrete\ntokens appropriate for LLM learning. In this paper, we introduce MAGVIT-v2, a\nvideo tokenizer designed to generate concise and expressive tokens for both\nvideos and images using a common token vocabulary. Equipped with this new\ntokenizer, we show that LLMs outperform diffusion models on standard image and\nvideo generation benchmarks including ImageNet and Kinetics. In addition, we\ndemonstrate that our tokenizer surpasses the previously top-performing video\ntokenizer on two more tasks: (1) video compression comparable to the\nnext-generation video codec (VCC) according to human evaluations, and (2)\nlearning effective representations for action recognition tasks.\n","authors":["Lijun Yu","José Lezama","Nitesh B. Gundavarapu","Luca Versari","Kihyuk Sohn","David Minnen","Yong Cheng","Vighnesh Birodkar","Agrim Gupta","Xiuye Gu","Alexander G. Hauptmann","Boqing Gong","Ming-Hsuan Yang","Irfan Essa","David A. Ross","Lu Jiang"],"pdf_url":"https://arxiv.org/pdf/2310.05737v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.20320v1","updated":"2024-03-29T17:43:58Z","published":"2024-03-29T17:43:58Z","title":"MTLoRA: A Low-Rank Adaptation Approach for Efficient Multi-Task Learning","summary":" Adapting models pre-trained on large-scale datasets to a variety of\ndownstream tasks is a common strategy in deep learning. Consequently,\nparameter-efficient fine-tuning methods have emerged as a promising way to\nadapt pre-trained models to different tasks while training only a minimal\nnumber of parameters. While most of these methods are designed for single-task\nadaptation, parameter-efficient training in Multi-Task Learning (MTL)\narchitectures is still unexplored. In this paper, we introduce MTLoRA, a novel\nframework for parameter-efficient training of MTL models. 
MTLoRA employs\nTask-Agnostic and Task-Specific Low-Rank Adaptation modules, which effectively\ndisentangle the parameter space in MTL fine-tuning, thereby enabling the model\nto adeptly handle both task specialization and interaction within MTL contexts.\nWe applied MTLoRA to hierarchical-transformer-based MTL architectures, adapting\nthem to multiple downstream dense prediction tasks. Our extensive experiments\non the PASCAL dataset show that MTLoRA achieves higher accuracy on downstream\ntasks compared to fully fine-tuning the MTL model while reducing the number of\ntrainable parameters by 3.6x. Furthermore, MTLoRA establishes a Pareto-optimal\ntrade-off between the number of trainable parameters and the accuracy of the\ndownstream tasks, outperforming current state-of-the-art parameter-efficient\ntraining methods in both accuracy and efficiency. Our code is publicly\navailable.\n","authors":["Ahmed Agiza","Marina Neseem","Sherief Reda"],"pdf_url":"https://arxiv.org/pdf/2403.20320v1.pdf","comment":"Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR), 2024"},{"id":"http://arxiv.org/abs/2401.15741v4","updated":"2024-03-29T17:42:21Z","published":"2024-01-28T19:58:19Z","title":"SERNet-Former: Semantic Segmentation by Efficient Residual Network with\n Attention-Boosting Gates and Attention-Fusion Networks","summary":" Improving the efficiency of state-of-the-art methods in semantic segmentation\nrequires overcoming the increasing computational cost as well as issues such as\nfusing semantic information from global and local contexts. Based on the recent\nsuccess and problems that convolutional neural networks (CNNs) encounter in\nsemantic segmentation, this research proposes an encoder-decoder architecture\nwith a unique efficient residual network, Efficient-ResNet. Attention-boosting\ngates (AbGs) and attention-boosting modules (AbMs) are deployed by aiming to\nfuse the equivariant and feature-based semantic information with the equivalent\nsizes of the output of global context of the efficient residual network in the\nencoder. Respectively, the decoder network is developed with the additional\nattention-fusion networks (AfNs) inspired by AbM. AfNs are designed to improve\nthe efficiency in the one-to-one conversion of the semantic information by\ndeploying additional convolution layers in the decoder part. Our network is\ntested on the challenging CamVid and Cityscapes datasets, and the proposed\nmethods reveal significant improvements on the residual networks. To the best\nof our knowledge, the developed network, SERNet-Former, achieves\nstate-of-the-art results (84.62 % mean IoU) on CamVid dataset and challenging\nresults (87.35 % mean IoU) on Cityscapes validation dataset.\n","authors":["Serdar Erisen"],"pdf_url":"https://arxiv.org/pdf/2401.15741v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20318v1","updated":"2024-03-29T17:41:57Z","published":"2024-03-29T17:41:57Z","title":"SeaBird: Segmentation in Bird's View with Dice Loss Improves Monocular\n 3D Detection of Large Objects","summary":" Monocular 3D detectors achieve remarkable performance on cars and smaller\nobjects. However, their performance drops on larger objects, leading to fatal\naccidents. Some attribute the failures to training data scarcity or their\nreceptive field requirements of large objects. In this paper, we highlight this\nunderstudied problem of generalization to large objects. 
We find that modern\nfrontal detectors struggle to generalize to large objects even on nearly\nbalanced datasets. We argue that the cause of failure is the sensitivity of\ndepth regression losses to noise of larger objects. To bridge this gap, we\ncomprehensively investigate regression and dice losses, examining their\nrobustness under varying error levels and object sizes. We mathematically prove\nthat the dice loss leads to superior noise-robustness and model convergence for\nlarge objects compared to regression losses for a simplified case. Leveraging\nour theoretical insights, we propose SeaBird (Segmentation in Bird's View) as\nthe first step towards generalizing to large objects. SeaBird effectively\nintegrates BEV segmentation on foreground objects for 3D detection, with the\nsegmentation head trained with the dice loss. SeaBird achieves SoTA results on\nthe KITTI-360 leaderboard and improves existing detectors on the nuScenes\nleaderboard, particularly for large objects. Code and models at\nhttps://github.com/abhi1kumar/SeaBird\n","authors":["Abhinav Kumar","Yuliang Guo","Xinyu Huang","Liu Ren","Xiaoming Liu"],"pdf_url":"https://arxiv.org/pdf/2403.20318v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.20317v1","updated":"2024-03-29T17:40:37Z","published":"2024-03-29T17:40:37Z","title":"Convolutional Prompting meets Language Models for Continual Learning","summary":" Continual Learning (CL) enables machine learning models to learn from\ncontinuously shifting new training data in absence of data from old tasks.\nRecently, pretrained vision transformers combined with prompt tuning have shown\npromise for overcoming catastrophic forgetting in CL. These approaches rely on\na pool of learnable prompts which can be inefficient in sharing knowledge\nacross tasks leading to inferior performance. In addition, the lack of\nfine-grained layer specific prompts does not allow these to fully express the\nstrength of the prompts for CL. We address these limitations by proposing\nConvPrompt, a novel convolutional prompt creation mechanism that maintains\nlayer-wise shared embeddings, enabling both layer-specific learning and better\nconcept transfer across tasks. The intelligent use of convolution enables us to\nmaintain a low parameter overhead without compromising performance. We further\nleverage Large Language Models to generate fine-grained text descriptions of\neach category which are used to get task similarity and dynamically decide the\nnumber of prompts to be learned. Extensive experiments demonstrate the\nsuperiority of ConvPrompt and improves SOTA by ~3% with significantly less\nparameter overhead. We also perform strong ablation over various modules to\ndisentangle the importance of different components.\n","authors":["Anurag Roy","Riddhiman Moulick","Vinay K. Verma","Saptarshi Ghosh","Abir Das"],"pdf_url":"https://arxiv.org/pdf/2403.20317v1.pdf","comment":"CVPR 2024 Camera Ready"},{"id":"http://arxiv.org/abs/2307.08727v2","updated":"2024-03-29T17:38:00Z","published":"2023-07-17T17:48:06Z","title":"Learning to Count without Annotations","summary":" While recent supervised methods for reference-based object counting continue\nto improve the performance on benchmark datasets, they have to rely on small\ndatasets due to the cost associated with manually annotating dozens of objects\nin images. We propose UnCounTR, a model that can learn this task without\nrequiring any manual annotations. 
To this end, we construct \"Self-Collages\",\nimages with various pasted objects as training samples, that provide a rich\nlearning signal covering arbitrary object types and counts. Our method builds\non existing unsupervised representations and segmentation techniques to\nsuccessfully demonstrate for the first time the ability of reference-based\ncounting without manual supervision. Our experiments show that our method not\nonly outperforms simple baselines and generic models such as FasterRCNN and\nDETR, but also matches the performance of supervised counting models in some\ndomains.\n","authors":["Lukas Knobel","Tengda Han","Yuki M. Asano"],"pdf_url":"https://arxiv.org/pdf/2307.08727v2.pdf","comment":"Accepted at CVPR'24. Code available at\n https://github.com/lukasknobel/SelfCollages"},{"id":"http://arxiv.org/abs/2310.18274v2","updated":"2024-03-29T17:34:40Z","published":"2023-10-27T16:59:51Z","title":"LipSim: A Provably Robust Perceptual Similarity Metric","summary":" Recent years have seen growing interest in developing and applying perceptual\nsimilarity metrics. Research has shown the superiority of perceptual metrics\nover pixel-wise metrics in aligning with human perception and serving as a\nproxy for the human visual system. On the other hand, as perceptual metrics\nrely on neural networks, there is a growing concern regarding their resilience,\ngiven the established vulnerability of neural networks to adversarial attacks.\nIt is indeed logical to infer that perceptual metrics may inherit both the\nstrengths and shortcomings of neural networks. In this work, we demonstrate the\nvulnerability of state-of-the-art perceptual similarity metrics based on an\nensemble of ViT-based feature extractors to adversarial attacks. We then\npropose a framework to train a robust perceptual similarity metric called\nLipSim (Lipschitz Similarity Metric) with provable guarantees. By leveraging\n1-Lipschitz neural networks as the backbone, LipSim provides guarded areas\naround each data point and certificates for all perturbations within an\n$\\ell_2$ ball. Finally, a comprehensive set of experiments shows the\nperformance of LipSim in terms of natural and certified scores and on the image\nretrieval application. The code is available at\nhttps://github.com/SaraGhazanfari/LipSim.\n","authors":["Sara Ghazanfari","Alexandre Araujo","Prashanth Krishnamurthy","Farshad Khorrami","Siddharth Garg"],"pdf_url":"https://arxiv.org/pdf/2310.18274v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20312v1","updated":"2024-03-29T17:33:42Z","published":"2024-03-29T17:33:42Z","title":"Learn \"No\" to Say \"Yes\" Better: Improving Vision-Language Models via\n Negations","summary":" Existing vision-language models (VLMs) treat text descriptions as a unit,\nconfusing individual concepts in a prompt and impairing visual semantic\nmatching and reasoning. An important aspect of reasoning in logic and language\nis negations. This paper highlights the limitations of popular VLMs such as\nCLIP, at understanding the implications of negations, i.e., the effect of the\nword \"not\" in a given prompt. To enable evaluation of VLMs on fluent prompts\nwith negations, we present CC-Neg, a dataset containing 228,246 images, true\ncaptions and their corresponding negated captions. Using CC-Neg along with\nmodifications to the contrastive loss of CLIP, our proposed CoN-CLIP framework,\nhas an improved understanding of negations. 
This training paradigm improves\nCoN-CLIP's ability to encode semantics reliably, resulting in 3.85% average\ngain in top-1 accuracy for zero-shot image classification across 8 datasets.\nFurther, CoN-CLIP outperforms CLIP on challenging compositionality benchmarks\nsuch as SugarCREPE by 4.4%, showcasing emergent compositional understanding of\nobjects, relations, and attributes in text. Overall, our work addresses a\ncrucial limitation of VLMs by introducing a dataset and framework that\nstrengthens semantic associations between images and text, demonstrating\nimproved large-scale foundation models with significantly reduced computational\ncost, promoting efficiency and accessibility.\n","authors":["Jaisidh Singh","Ishaan Shrivastava","Mayank Vatsa","Richa Singh","Aparna Bharati"],"pdf_url":"https://arxiv.org/pdf/2403.20312v1.pdf","comment":"14 pages + 6 figures in main manuscript (excluding references)"},{"id":"http://arxiv.org/abs/2312.01215v2","updated":"2024-03-29T17:30:58Z","published":"2023-12-02T19:49:27Z","title":"RNb-NeuS: Reflectance and Normal-based Multi-View 3D Reconstruction","summary":" This paper introduces a versatile paradigm for integrating multi-view\nreflectance (optional) and normal maps acquired through photometric stereo. Our\napproach employs a pixel-wise joint re-parameterization of reflectance and\nnormal, considering them as a vector of radiances rendered under simulated,\nvarying illumination. This re-parameterization enables the seamless integration\nof reflectance and normal maps as input data in neural volume rendering-based\n3D reconstruction while preserving a single optimization objective. In\ncontrast, recent multi-view photometric stereo (MVPS) methods depend on\nmultiple, potentially conflicting objectives. Despite its apparent simplicity,\nour proposed approach outperforms state-of-the-art approaches in MVPS\nbenchmarks across F-score, Chamfer distance, and mean angular error metrics.\nNotably, it significantly improves the detailed 3D reconstruction of areas with\nhigh curvature or low visibility.\n","authors":["Baptiste Brument","Robin Bruneau","Yvain Quéau","Jean Mélou","François Bernard Lauze"," Jean-Denis","Jean-Denis Durou","Lilian Calvet"],"pdf_url":"https://arxiv.org/pdf/2312.01215v2.pdf","comment":"14 pages, 13 figures, 7 tables. Accepted to CVPR 2024. The project\n page can be accessed via\n https://robinbruneau.github.io/publications/rnb_neus.html. The source code is\n available at https://github.com/bbrument/RNb-NeuS"},{"id":"http://arxiv.org/abs/2403.20309v1","updated":"2024-03-29T17:29:58Z","published":"2024-03-29T17:29:58Z","title":"InstantSplat: Unbounded Sparse-view Pose-free Gaussian Splatting in 40\n Seconds","summary":" While novel view synthesis (NVS) has made substantial progress in 3D computer\nvision, it typically requires an initial estimation of camera intrinsics and\nextrinsics from dense viewpoints. This pre-processing is usually conducted via\na Structure-from-Motion (SfM) pipeline, a procedure that can be slow and\nunreliable, particularly in sparse-view scenarios with insufficient matched\nfeatures for accurate reconstruction. In this work, we integrate the strengths\nof point-based representations (e.g., 3D Gaussian Splatting, 3D-GS) with\nend-to-end dense stereo models (DUSt3R) to tackle the complex yet unresolved\nissues in NVS under unconstrained settings, which encompasses pose-free and\nsparse view challenges. 
Our framework, InstantSplat, unifies dense stereo\npriors with 3D-GS to build 3D Gaussians of large-scale scenes from sparseview &\npose-free images in less than 1 minute. Specifically, InstantSplat comprises a\nCoarse Geometric Initialization (CGI) module that swiftly establishes a\npreliminary scene structure and camera parameters across all training views,\nutilizing globally-aligned 3D point maps derived from a pre-trained dense\nstereo pipeline. This is followed by the Fast 3D-Gaussian Optimization (F-3DGO)\nmodule, which jointly optimizes the 3D Gaussian attributes and the initialized\nposes with pose regularization. Experiments conducted on the large-scale\noutdoor Tanks & Temples datasets demonstrate that InstantSplat significantly\nimproves SSIM (by 32%) while concurrently reducing Absolute Trajectory Error\n(ATE) by 80%. These establish InstantSplat as a viable solution for scenarios\ninvolving posefree and sparse-view conditions. Project page:\ninstantsplat.github.io.\n","authors":["Zhiwen Fan","Wenyan Cong","Kairun Wen","Kevin Wang","Jian Zhang","Xinghao Ding","Danfei Xu","Boris Ivanovic","Marco Pavone","Georgios Pavlakos","Zhangyang Wang","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2403.20309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01362v4","updated":"2024-03-29T17:11:38Z","published":"2023-07-03T21:33:40Z","title":"A Strong Baseline for Point Cloud Registration via Direct Superpoints\n Matching","summary":" Deep neural networks endow the downsampled superpoints with highly\ndiscriminative feature representations. Previous dominant point cloud\nregistration approaches match these feature representations as the first step,\ne.g., using the Sinkhorn algorithm. A RANSAC-like method is then usually\nadopted as a post-processing refinement to filter the outliers. Other dominant\nmethod is to directly predict the superpoint matchings using learned MLP\nlayers. Both of them have drawbacks: RANSAC-based methods are computationally\nintensive and prediction-based methods suffer from outputing non-existing\npoints in the point cloud. In this paper, we propose a straightforward and\neffective baseline to find correspondences of superpoints in a global matching\nmanner. We employ the normalized matching scores as weights for each\ncorrespondence, allowing us to reject the outliers and further weigh the rest\ninliers when fitting the transformation matrix without relying on the\ncumbersome RANSAC. Moreover, the entire model can be trained in an end-to-end\nfashion, leading to better accuracy. Our simple yet effective baseline shows\ncomparable or even better results than state-of-the-art methods on three\ndatasets including ModelNet, 3DMatch, and KITTI. We do not advocate our\napproach to be \\emph{the} solution for point cloud registration but use the\nresults to emphasize the role of matching strategy for point cloud\nregistration. The code and models are available at\nhttps://github.com/neu-vi/Superpoints_Registration.\n","authors":["Aniket Gupta","Yiming Xie","Hanumant Singh","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.01362v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20287v1","updated":"2024-03-29T16:58:13Z","published":"2024-03-29T16:58:13Z","title":"Benchmarking Counterfactual Image Generation","summary":" Counterfactual image generation is pivotal for understanding the causal\nrelations of variables, with applications in interpretability and generation of\nunbiased synthetic data. 
However, evaluating image generation is a\nlong-standing challenge in itself. The need to evaluate counterfactual\ngeneration compounds on this challenge, precisely because counterfactuals, by\ndefinition, are hypothetical scenarios without observable ground truths. In\nthis paper, we present a novel comprehensive framework aimed at benchmarking\ncounterfactual image generation methods. We incorporate metrics that focus on\nevaluating diverse aspects of counterfactuals, such as composition,\neffectiveness, minimality of interventions, and image realism. We assess the\nperformance of three distinct conditional image generation model types, based\non the Structural Causal Model paradigm. Our work is accompanied by a\nuser-friendly Python package which allows to further evaluate and benchmark\nexisting and future counterfactual image generation methods. Our framework is\nextendable to additional SCM and other causal methods, generative models, and\ndatasets.\n","authors":["Thomas Melistas","Nikos Spyrou","Nefeli Gkouti","Pedro Sanchez","Athanasios Vlontzos","Giorgos Papanastasiou","Sotirios A. Tsaftaris"],"pdf_url":"https://arxiv.org/pdf/2403.20287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02560v2","updated":"2024-03-29T16:56:33Z","published":"2023-04-05T16:30:36Z","title":"VicTR: Video-conditioned Text Representations for Activity Recognition","summary":" Vision-Language models (VLMs) have excelled in the image-domain -- especially\nin zero-shot settings -- thanks to the availability of vast pretraining data\n(i.e., paired image-text samples). However for videos, such paired data is not\nas abundant. Therefore, video-VLMs are usually designed by adapting pretrained\nimage-VLMs to the video-domain, instead of training from scratch. All such\nrecipes rely on augmenting visual embeddings with temporal information (i.e.,\nimage $\\rightarrow$ video), often keeping text embeddings unchanged or even\nbeing discarded. In this paper, we argue the contrary, that better video-VLMs\ncan be designed by focusing more on augmenting text, rather than visual\ninformation. More specifically, we introduce Video-conditioned Text\nRepresentations (VicTR): a form of text embeddings optimized w.r.t. visual\nembeddings, creating a more-flexible contrastive latent space. Our model can\nfurther make use of freely-available semantic information, in the form of\nvisually-grounded auxiliary text (e.g. object or scene information). We\nevaluate our model on few-shot, zero-shot (HMDB-51, UCF-101), short-form\n(Kinetics-400) and long-form (Charades) activity recognition benchmarks,\nshowing strong performance among video-VLMs.\n","authors":["Kumara Kahatapitiya","Anurag Arnab","Arsha Nagrani","Michael S. Ryoo"],"pdf_url":"https://arxiv.org/pdf/2304.02560v2.pdf","comment":"To appear at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.15905v4","updated":"2024-03-29T16:53:58Z","published":"2024-03-23T18:19:02Z","title":"Towards Low-Energy Adaptive Personalization for Resource-Constrained\n Devices","summary":" The personalization of machine learning (ML) models to address data drift is\na significant challenge in the context of Internet of Things (IoT)\napplications. Presently, most approaches focus on fine-tuning either the full\nbase model or its last few layers to adapt to new data, while often neglecting\nenergy costs. However, various types of data drift exist, and fine-tuning the\nfull base model or the last few layers may not result in optimal performance in\ncertain scenarios. 
We propose Target Block Fine-Tuning (TBFT), a low-energy\nadaptive personalization framework designed for resource-constrained devices.\nWe categorize data drift and personalization into three types: input-level,\nfeature-level, and output-level. For each type, we fine-tune different blocks\nof the model to achieve optimal performance with reduced energy costs.\nSpecifically, input-, feature-, and output-level correspond to fine-tuning the\nfront, middle, and rear blocks of the model. We evaluate TBFT on a ResNet\nmodel, three datasets, three different training sizes, and a Raspberry Pi.\nCompared with the $Block Avg$, where each block is fine-tuned individually and\ntheir performance improvements are averaged, TBFT exhibits an improvement in\nmodel accuracy by an average of 15.30% whilst saving 41.57% energy consumption\non average compared with full fine-tuning.\n","authors":["Yushan Huang","Josh Millar","Yuxuan Long","Yuchen Zhao","Hamed Haddadi"],"pdf_url":"https://arxiv.org/pdf/2403.15905v4.pdf","comment":"Accepetd to The 4th Workshop on Machine Learning and Systems\n (EuroMLSys '24)"},{"id":"http://arxiv.org/abs/2312.05291v2","updated":"2024-03-29T16:49:59Z","published":"2023-12-08T18:14:21Z","title":"GlitchBench: Can large multimodal models detect video game glitches?","summary":" Large multimodal models (LMMs) have evolved from large language models (LLMs)\nto integrate multiple input modalities, such as visual inputs. This integration\naugments the capacity of LLMs for tasks requiring visual comprehension and\nreasoning. However, the extent and limitations of their enhanced abilities are\nnot fully understood, especially when it comes to real-world tasks. To address\nthis gap, we introduce GlitchBench, a novel benchmark derived from video game\nquality assurance tasks, to test and evaluate the reasoning capabilities of\nLMMs. Our benchmark is curated from a variety of unusual and glitched scenarios\nfrom video games and aims to challenge both the visual and linguistic reasoning\npowers of LMMs in detecting and interpreting out-of-the-ordinary events. We\nevaluate multiple state-of-the-art LMMs, and we show that GlitchBench presents\na new challenge for these models. Code and data are available at:\nhttps://glitchbench.github.io/\n","authors":["Mohammad Reza Taesiri","Tianjun Feng","Anh Nguyen","Cor-Paul Bezemer"],"pdf_url":"https://arxiv.org/pdf/2312.05291v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.04670v2","updated":"2024-03-29T16:39:28Z","published":"2023-12-07T20:11:03Z","title":"Rapid Motor Adaptation for Robotic Manipulator Arms","summary":" Developing generalizable manipulation skills is a core challenge in embodied\nAI. This includes generalization across diverse task configurations,\nencompassing variations in object shape, density, friction coefficient, and\nexternal disturbances such as forces applied to the robot. Rapid Motor\nAdaptation (RMA) offers a promising solution to this challenge. It posits that\nessential hidden variables influencing an agent's task performance, such as\nobject mass and shape, can be effectively inferred from the agent's action and\nproprioceptive history. Drawing inspiration from RMA in locomotion and in-hand\nrotation, we use depth perception to develop agents tailored for rapid motor\nadaptation in a variety of manipulation tasks. 
We evaluated our agents on four\nchallenging tasks from the Maniskill2 benchmark, namely pick-and-place\noperations with hundreds of objects from the YCB and EGAD datasets, peg\ninsertion with precise position and orientation, and operating a variety of\nfaucets and handles, with customized environment variations. Empirical results\ndemonstrate that our agents surpass state-of-the-art methods like automatic\ndomain randomization and vision-based policies, obtaining better generalization\nperformance and sample efficiency.\n","authors":["Yichao Liang","Kevin Ellis","João Henriques"],"pdf_url":"https://arxiv.org/pdf/2312.04670v2.pdf","comment":"Accepted at CVPR 2024. 12 pages"},{"id":"http://arxiv.org/abs/2312.02214v2","updated":"2024-03-29T16:31:44Z","published":"2023-12-03T07:23:53Z","title":"FlashAvatar: High-fidelity Head Avatar with Efficient Gaussian Embedding","summary":" We propose FlashAvatar, a novel and lightweight 3D animatable avatar\nrepresentation that could reconstruct a digital avatar from a short monocular\nvideo sequence in minutes and render high-fidelity photo-realistic images at\n300FPS on a consumer-grade GPU. To achieve this, we maintain a uniform 3D\nGaussian field embedded in the surface of a parametric face model and learn\nextra spatial offset to model non-surface regions and subtle facial details.\nWhile full use of geometric priors can capture high-frequency facial details\nand preserve exaggerated expressions, proper initialization can help reduce the\nnumber of Gaussians, thus enabling super-fast rendering speed. Extensive\nexperimental results demonstrate that FlashAvatar outperforms existing works\nregarding visual quality and personalized details and is almost an order of\nmagnitude faster in rendering speed. Project page:\nhttps://ustc3dv.github.io/FlashAvatar/\n","authors":["Jun Xiang","Xuan Gao","Yudong Guo","Juyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.02214v2.pdf","comment":"Project page: https://ustc3dv.github.io/FlashAvatar/"},{"id":"http://arxiv.org/abs/2403.20275v1","updated":"2024-03-29T16:30:17Z","published":"2024-03-29T16:30:17Z","title":"Snap-it, Tap-it, Splat-it: Tactile-Informed 3D Gaussian Splatting for\n Reconstructing Challenging Surfaces","summary":" Touch and vision go hand in hand, mutually enhancing our ability to\nunderstand the world. From a research perspective, the problem of mixing touch\nand vision is underexplored and presents interesting challenges. To this end,\nwe propose Tactile-Informed 3DGS, a novel approach that incorporates touch data\n(local depth maps) with multi-view vision data to achieve surface\nreconstruction and novel view synthesis. Our method optimises 3D Gaussian\nprimitives to accurately model the object's geometry at points of contact. By\ncreating a framework that decreases the transmittance at touch locations, we\nachieve a refined surface reconstruction, ensuring a uniformly smooth depth\nmap. Touch is particularly useful when considering non-Lambertian objects (e.g.\nshiny or reflective surfaces) since contemporary methods tend to fail to\nreconstruct with fidelity specular highlights. By combining vision and tactile\nsensing, we achieve more accurate geometry reconstructions with fewer images\nthan prior methods. 
We conduct evaluation on objects with glossy and reflective\nsurfaces and demonstrate the effectiveness of our approach, offering\nsignificant improvements in reconstruction quality.\n","authors":["Mauro Comi","Alessio Tonioni","Max Yang","Jonathan Tremblay","Valts Blukis","Yijiong Lin","Nathan F. Lepora","Laurence Aitchison"],"pdf_url":"https://arxiv.org/pdf/2403.20275v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2403.20273v1","updated":"2024-03-29T16:27:40Z","published":"2024-03-29T16:27:40Z","title":"CATSNet: a context-aware network for Height Estimation in a Forested\n Area based on Pol-TomoSAR data","summary":" Tropical forests are a key component of the global carbon cycle. With plans\nfor upcoming space-borne missions like BIOMASS to monitor forestry, several\nairborne missions, including TropiSAR and AfriSAR campaigns, have been\nsuccessfully launched and experimented. Typical Synthetic Aperture Radar\nTomography (TomoSAR) methods involve complex models with low accuracy and high\ncomputation costs. In recent years, deep learning methods have also gained\nattention in the TomoSAR framework, showing interesting performance. Recently,\na solution based on a fully connected Tomographic Neural Network (TSNN) has\ndemonstrated its effectiveness in accurately estimating forest and ground\nheights by exploiting the pixel-wise elements of the covariance matrix derived\nfrom TomoSAR data. This work instead goes beyond the pixel-wise approach to\ndefine a context-aware deep learning-based solution named CATSNet. A\nconvolutional neural network is considered to leverage patch-based information\nand extract features from a neighborhood rather than focus on a single pixel.\nThe training is conducted by considering TomoSAR data as the input and Light\nDetection and Ranging (LiDAR) values as the ground truth. The experimental\nresults show striking advantages in both performance and generalization ability\nby leveraging context information within Multiple Baselines (MB) TomoSAR data\nacross different polarimetric modalities, surpassing existing techniques.\n","authors":["Wenyu Yang","Sergio Vitale","Hossein Aghababaei","Giampaolo Ferraioli","Vito Pascazio","Gilda Schirinzi"],"pdf_url":"https://arxiv.org/pdf/2403.20273v1.pdf","comment":"Submitted to IEEE TGRS, under review"},{"id":"http://arxiv.org/abs/2403.20271v1","updated":"2024-03-29T16:26:20Z","published":"2024-03-29T16:26:20Z","title":"Draw-and-Understand: Leveraging Visual Prompts to Enable MLLMs to\n Comprehend What You Want","summary":" The interaction between humans and artificial intelligence (AI) is a crucial\nfactor that reflects the effectiveness of multimodal large language models\n(MLLMs). However, current MLLMs primarily focus on image-level comprehension\nand limit interaction to textual instructions, thereby constraining their\nflexibility in usage and depth of response. In this paper, we introduce the\nDraw-and-Understand project: a new model, a multi-domain dataset, and a\nchallenging benchmark for visual prompting. Specifically, we propose SPHINX-V,\na new end-to-end trained Multimodal Large Language Model (MLLM) that connects a\nvision encoder, a visual prompt encoder and an LLM for various visual prompts\n(points, bounding boxes, and free-form shape) and language understanding. To\nadvance visual prompting research for MLLMs, we introduce MDVP-Data and\nMDVP-Bench. 
MDVP-Data features a multi-domain dataset containing 1.6M unique\nimage-visual prompt-text instruction-following samples, including natural\nimages, document images, OCR images, mobile screenshots, web screenshots, and\nmulti-panel images. Furthermore, we present MDVP-Bench, a comprehensive and\nchallenging benchmark to assess a model's capability in understanding visual\nprompting instructions. Our experiments demonstrate SPHINX-V's impressive\nmultimodal interaction capabilities through visual prompting, revealing\nsignificant improvements in detailed pixel-level description and\nquestion-answering abilities.\n","authors":["Weifeng Lin","Xinyu Wei","Ruichuan An","Peng Gao","Bocheng Zou","Yulin Luo","Siyuan Huang","Shanghang Zhang","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2403.20271v1.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.16970v2","updated":"2024-03-29T16:14:41Z","published":"2024-03-25T17:31:12Z","title":"Joint chest X-ray diagnosis and clinical visual attention prediction\n with multi-stage cooperative learning: enhancing interpretability","summary":" As deep learning has become the state-of-the-art for computer-assisted\ndiagnosis, interpretability of the automatic decisions is crucial for clinical\ndeployment. While various methods were proposed in this domain, visual\nattention maps of clinicians during radiological screening offer a unique asset\nto provide important insights and can potentially enhance the quality of\ncomputer-assisted diagnosis. With this paper, we introduce a novel\ndeep-learning framework for joint disease diagnosis and prediction of\ncorresponding visual saliency maps for chest X-ray scans. Specifically, we\ndesigned a novel dual-encoder multi-task UNet, which leverages both a\nDenseNet201 backbone and a Residual and Squeeze-and-Excitation block-based\nencoder to extract diverse features for saliency map prediction, and a\nmulti-scale feature-fusion classifier to perform disease classification. To\ntackle the issue of asynchronous training schedules of individual tasks in\nmulti-task learning, we proposed a multi-stage cooperative learning strategy,\nwith contrastive learning for feature encoder pretraining to boost performance.\nExperiments show that our proposed method outperformed existing techniques for\nchest X-ray diagnosis and the quality of visual saliency map prediction.\n","authors":["Zirui Qiu","Hassan Rivaz","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.16970v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01482v3","updated":"2024-03-29T16:13:13Z","published":"2024-03-03T11:24:16Z","title":"EAGLE: Eigen Aggregation Learning for Object-Centric Unsupervised\n Semantic Segmentation","summary":" Semantic segmentation has innately relied on extensive pixel-level annotated\ndata, leading to the emergence of unsupervised methodologies. Among them,\nleveraging self-supervised Vision Transformers for unsupervised semantic\nsegmentation (USS) has been making steady progress with expressive deep\nfeatures. Yet, for semantically segmenting images with complex objects, a\npredominant challenge remains: the lack of explicit object-level semantic\nencoding in patch-level features. This technical limitation often leads to\ninadequate segmentation of complex objects with diverse structures. To address\nthis gap, we present a novel approach, EAGLE, which emphasizes object-centric\nrepresentation learning for unsupervised semantic segmentation. 
Specifically,\nwe introduce EiCue, a spectral technique providing semantic and structural cues\nthrough an eigenbasis derived from the semantic similarity matrix of deep image\nfeatures and color affinity from an image. Further, by incorporating our\nobject-centric contrastive loss with EiCue, we guide our model to learn\nobject-level representations with intra- and inter-image object-feature\nconsistency, thereby enhancing semantic accuracy. Extensive experiments on\nCOCO-Stuff, Cityscapes, and Potsdam-3 datasets demonstrate the state-of-the-art\nUSS results of EAGLE with accurate and consistent semantic segmentation across\ncomplex scenes.\n","authors":["Chanyoung Kim","Woojung Han","Dayun Ju","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.01482v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20260v1","updated":"2024-03-29T16:08:59Z","published":"2024-03-29T16:08:59Z","title":"Prototype-based Interpretable Breast Cancer Prediction Models: Analysis\n and Challenges","summary":" Deep learning models have achieved high performance in medical applications,\nhowever, their adoption in clinical practice is hindered due to their black-box\nnature. Self-explainable models, like prototype-based models, can be especially\nbeneficial as they are interpretable by design. However, if the learnt\nprototypes are of low quality then the prototype-based models are as good as\nblack-box. Having high quality prototypes is a pre-requisite for a truly\ninterpretable model. In this work, we propose a prototype evaluation framework\nfor coherence (PEF-C) for quantitatively evaluating the quality of the\nprototypes based on domain knowledge. We show the use of PEF-C in the context\nof breast cancer prediction using mammography. Existing works on\nprototype-based models on breast cancer prediction using mammography have\nfocused on improving the classification performance of prototype-based models\ncompared to black-box models and have evaluated prototype quality through\nanecdotal evidence. We are the first to go beyond anecdotal evidence and\nevaluate the quality of the mammography prototypes systematically using our\nPEF-C. Specifically, we apply three state-of-the-art prototype-based models,\nProtoPNet, BRAIxProtoPNet++ and PIP-Net on mammography images for breast cancer\nprediction and evaluate these models w.r.t. i) classification performance, and\nii) quality of the prototypes, on three public datasets. Our results show that\nprototype-based models are competitive with black-box models in terms of\nclassification performance, and achieve a higher score in detecting ROIs.\nHowever, the quality of the prototypes are not yet sufficient and can be\nimproved in aspects of relevance, purity and learning a variety of prototypes.\nWe call the XAI community to systematically evaluate the quality of the\nprototypes to check their true usability in high stake decisions and improve\nsuch models further.\n","authors":["Shreyasi Pathak","Jörg Schlötterer","Jeroen Veltman","Jeroen Geerdink","Maurice van Keulen","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2403.20260v1.pdf","comment":"21 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.20254v1","updated":"2024-03-29T16:01:00Z","published":"2024-03-29T16:01:00Z","title":"Benchmarking the Robustness of Temporal Action Detection Models Against\n Temporal Corruptions","summary":" Temporal action detection (TAD) aims to locate action positions and recognize\naction categories in long-term untrimmed videos. 
Although many methods have\nachieved promising results, their robustness has not been thoroughly studied.\nIn practice, we observe that temporal information in videos can be occasionally\ncorrupted, such as missing or blurred frames. Interestingly, existing methods\noften incur a significant performance drop even if only one frame is affected.\nTo formally evaluate the robustness, we establish two temporal corruption\nrobustness benchmarks, namely THUMOS14-C and ActivityNet-v1.3-C. In this paper,\nwe extensively analyze the robustness of seven leading TAD methods and obtain\nsome interesting findings: 1) Existing methods are particularly vulnerable to\ntemporal corruptions, and end-to-end methods are often more susceptible than\nthose with a pre-trained feature extractor; 2) Vulnerability mainly comes from\nlocalization error rather than classification error; 3) When corruptions occur\nin the middle of an action instance, TAD models tend to yield the largest\nperformance drop. Besides building a benchmark, we further develop a simple but\neffective robust training method to defend against temporal corruptions,\nthrough the FrameDrop augmentation and Temporal-Robust Consistency loss.\nRemarkably, our approach not only improves robustness but also yields promising\nimprovements on clean data. We believe that this study will serve as a\nbenchmark for future research in robust video analysis. Source code and models\nare available at https://github.com/Alvin-Zeng/temporal-robustness-benchmark.\n","authors":["Runhao Zeng","Xiaoyong Chen","Jiaming Liang","Huisi Wu","Guangzhong Cao","Yong Guo"],"pdf_url":"https://arxiv.org/pdf/2403.20254v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.20253v1","updated":"2024-03-29T15:59:11Z","published":"2024-03-29T15:59:11Z","title":"MedCLIP-SAM: Bridging Text and Image Towards Universal Medical Image\n Segmentation","summary":" Medical image segmentation of anatomical structures and pathology is crucial\nin modern clinical diagnosis, disease study, and treatment planning. To date,\ngreat progress has been made in deep learning-based segmentation techniques,\nbut most methods still lack data efficiency, generalizability, and\ninteractability. Consequently, the development of new, precise segmentation\nmethods that demand fewer labeled datasets is of utmost importance in medical\nimage analysis. Recently, the emergence of foundation models, such as CLIP and\nSegment-Anything-Model (SAM), with comprehensive cross-domain representation\nopened the door for interactive and universal image segmentation. However,\nexploration of these models for data-efficient medical image segmentation is\nstill limited, but is highly necessary. In this paper, we propose a novel\nframework, called MedCLIP-SAM that combines CLIP and SAM models to generate\nsegmentation of clinical scans using text prompts in both zero-shot and weakly\nsupervised settings. To achieve this, we employed a new Decoupled Hard Negative\nNoise Contrastive Estimation (DHN-NCE) loss to fine-tune the BiomedCLIP model\nand the recent gScoreCAM to generate prompts to obtain segmentation masks from\nSAM in a zero-shot setting. Additionally, we explored the use of zero-shot\nsegmentation labels in a weakly supervised paradigm to improve the segmentation\nquality further. 
By extensively testing three diverse segmentation tasks and\nmedical image modalities (breast tumor ultrasound, brain tumor MRI, and lung\nX-ray), our proposed framework has demonstrated excellent accuracy.\n","authors":["Taha Koleilat","Hojat Asgariandehkordi","Hassan Rivaz","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.20253v1.pdf","comment":"10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2403.20251v1","updated":"2024-03-29T15:57:38Z","published":"2024-03-29T15:57:38Z","title":"Latent Embedding Clustering for Occlusion Robust Head Pose Estimation","summary":" Head pose estimation has become a crucial area of research in computer vision\ngiven its usefulness in a wide range of applications, including robotics,\nsurveillance, or driver attention monitoring. One of the most difficult\nchallenges in this field is managing head occlusions that frequently take place\nin real-world scenarios. In this paper, we propose a novel and efficient\nframework that is robust in real world head occlusion scenarios. In particular,\nwe propose an unsupervised latent embedding clustering with regression and\nclassification components for each pose angle. The model optimizes latent\nfeature representations for occluded and non-occluded images through a\nclustering term while improving fine-grained angle predictions. Experimental\nevaluation on in-the-wild head pose benchmark datasets reveal competitive\nperformance in comparison to state-of-the-art methodologies with the advantage\nof having a significant data reduction. We observe a substantial improvement in\noccluded head pose estimation. Also, an ablation study is conducted to\nascertain the impact of the clustering term within our proposed framework.\n","authors":["José Celestino","Manuel Marques","Jacinto C. Nascimento"],"pdf_url":"https://arxiv.org/pdf/2403.20251v1.pdf","comment":"Accepted at 18th IEEE International Conference on Automatic Face and\n Gesture Recognition (FG'24)"},{"id":"http://arxiv.org/abs/2311.13612v2","updated":"2024-03-29T15:55:48Z","published":"2023-11-21T23:30:01Z","title":"Descriptor and Word Soups: Overcoming the Parameter Efficiency Accuracy\n Tradeoff for Out-of-Distribution Few-shot Learning","summary":" Over the past year, a large body of multimodal research has emerged around\nzero-shot evaluation using GPT descriptors. These studies boost the zero-shot\naccuracy of pretrained VL models with an ensemble of label-specific text\ngenerated by GPT. A recent study, WaffleCLIP, demonstrated that similar\nzero-shot accuracy can be achieved with an ensemble of random descriptors.\nHowever, both zero-shot methods are un-trainable and consequently sub-optimal\nwhen some few-shot out-of-distribution (OOD) training data is available.\nInspired by these prior works, we present two more flexible methods called\ndescriptor and word soups, which do not require an LLM at test time and can\nleverage training data to increase OOD target accuracy. Descriptor soup\ngreedily selects a small set of textual descriptors using generic few-shot\ntraining data, then calculates robust class embeddings using the selected\ndescriptors. Word soup greedily assembles a chain of words in a similar manner.\nCompared to existing few-shot soft prompt tuning methods, word soup requires\nfewer parameters by construction and less GPU memory, since it does not require\nbackpropagation. Both soups outperform current published few-shot methods, even\nwhen combined with SoTA zero-shot methods, on cross-dataset and domain\ngeneralization benchmarks. 
Compared with SoTA prompt and descriptor ensembling\nmethods, such as ProDA and WaffleCLIP, word soup achieves higher OOD accuracy\nwith fewer ensemble members. Please checkout our code:\ngithub.com/Chris210634/word_soups\n","authors":["Christopher Liao","Theodoros Tsiligkaridis","Brian Kulis"],"pdf_url":"https://arxiv.org/pdf/2311.13612v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20249v1","updated":"2024-03-29T15:54:36Z","published":"2024-03-29T15:54:36Z","title":"Relation Rectification in Diffusion Model","summary":" Despite their exceptional generative abilities, large text-to-image diffusion\nmodels, much like skilled but careless artists, often struggle with accurately\ndepicting visual relationships between objects. This issue, as we uncover\nthrough careful analysis, arises from a misaligned text encoder that struggles\nto interpret specific relationships and differentiate the logical order of\nassociated objects. To resolve this, we introduce a novel task termed Relation\nRectification, aiming to refine the model to accurately represent a given\nrelationship it initially fails to generate. To address this, we propose an\ninnovative solution utilizing a Heterogeneous Graph Convolutional Network\n(HGCN). It models the directional relationships between relation terms and\ncorresponding objects within the input prompts. Specifically, we optimize the\nHGCN on a pair of prompts with identical relational words but reversed object\norders, supplemented by a few reference images. The lightweight HGCN adjusts\nthe text embeddings generated by the text encoder, ensuring the accurate\nreflection of the textual relation in the embedding space. Crucially, our\nmethod retains the parameters of the text encoder and diffusion model,\npreserving the model's robust performance on unrelated descriptions. We\nvalidated our approach on a newly curated dataset of diverse relational data,\ndemonstrating both quantitative and qualitative enhancements in generating\nimages with precise visual relations. Project page:\nhttps://wuyinwei-hah.github.io/rrnet.github.io/.\n","authors":["Yinwei Wu","Xingyi Yang","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2403.20249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05269v2","updated":"2024-03-29T15:44:05Z","published":"2023-12-07T19:19:25Z","title":"LifelongMemory: Leveraging LLMs for Answering Queries in Long-form\n Egocentric Videos","summary":" In this paper we introduce LifelongMemory, a new framework for accessing\nlong-form egocentric videographic memory through natural language question\nanswering and retrieval. LifelongMemory generates concise video activity\ndescriptions of the camera wearer and leverages the zero-shot capabilities of\npretrained large language models to perform reasoning over long-form video\ncontext. Furthermore, Lifelong Memory uses a confidence and explanation module\nto produce confident, high-quality, and interpretable answers. Our approach\nachieves state-of-the-art performance on the EgoSchema benchmark for question\nanswering and is highly competitive on the natural language query (NLQ)\nchallenge of Ego4D. 
Code is available at\nhttps://github.com/Agentic-Learning-AI-Lab/lifelong-memory.\n","authors":["Ying Wang","Yanlai Yang","Mengye Ren"],"pdf_url":"https://arxiv.org/pdf/2312.05269v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.04119v2","updated":"2024-03-29T15:27:47Z","published":"2022-12-08T07:29:07Z","title":"DialogCC: An Automated Pipeline for Creating High-Quality Multi-Modal\n Dialogue Dataset","summary":" As sharing images in an instant message is a crucial factor, there has been\nactive research on learning an image-text multi-modal dialogue models. However,\ntraining a well-generalized multi-modal dialogue model remains challenging due\nto the low quality and limited diversity of images per dialogue in existing\nmulti-modal dialogue datasets. In this paper, we propose an automated pipeline\nto construct a multi-modal dialogue dataset, ensuring both dialogue quality and\nimage diversity without requiring minimum human effort. In our pipeline, to\nguarantee the coherence between images and dialogue, we prompt GPT-4 to infer\npotential image-sharing moments - specifically, the utterance, speaker,\nrationale, and image description. Furthermore, we leverage CLIP similarity to\nmaintain consistency between aligned multiple images to the utterance. Through\nthis pipeline, we introduce DialogCC, a high-quality and diverse multi-modal\ndialogue dataset that surpasses existing datasets in terms of quality and\ndiversity in human evaluation. Our comprehensive experiments highlight that\nwhen multi-modal dialogue models are trained using our dataset, their\ngeneralization performance on unseen dialogue datasets is significantly\nenhanced. We make our source code and dataset publicly available.\n","authors":["Young-Jun Lee","Byungsoo Ko","Han-Gyu Kim","Jonghwan Hyeon","Ho-Jin Choi"],"pdf_url":"https://arxiv.org/pdf/2212.04119v2.pdf","comment":"NAACL 2024"},{"id":"http://arxiv.org/abs/2403.20236v1","updated":"2024-03-29T15:26:44Z","published":"2024-03-29T15:26:44Z","title":"Long-Tailed Anomaly Detection with Learnable Class Names","summary":" Anomaly detection (AD) aims to identify defective images and localize their\ndefects (if any). Ideally, AD models should be able to detect defects over many\nimage classes; without relying on hard-coded class names that can be\nuninformative or inconsistent across datasets; learn without anomaly\nsupervision; and be robust to the long-tailed distributions of real-world\napplications. To address these challenges, we formulate the problem of\nlong-tailed AD by introducing several datasets with different levels of class\nimbalance and metrics for performance evaluation. We then propose a novel\nmethod, LTAD, to detect defects from multiple and long-tailed classes, without\nrelying on dataset class names. LTAD combines AD by reconstruction and semantic\nAD modules. AD by reconstruction is implemented with a transformer-based\nreconstruction module. Semantic AD is implemented with a binary classifier,\nwhich relies on learned pseudo class names and a pretrained foundation model.\nThese modules are learned over two phases. Phase 1 learns the pseudo-class\nnames and a variational autoencoder (VAE) for feature synthesis that augments\nthe training data to combat long-tails. Phase 2 then learns the parameters of\nthe reconstruction and classification modules of LTAD. 
Extensive experiments\nusing the proposed long-tailed datasets show that LTAD substantially\noutperforms the state-of-the-art methods for most forms of dataset imbalance.\nThe long-tailed dataset split is available at\nhttps://zenodo.org/records/10854201 .\n","authors":["Chih-Hui Ho","Kuan-Chuan Peng","Nuno Vasconcelos"],"pdf_url":"https://arxiv.org/pdf/2403.20236v1.pdf","comment":"This paper is accepted to CVPR 2024. The supplementary material is\n included. The long-tailed dataset split is available at\n https://zenodo.org/records/10854201"},{"id":"http://arxiv.org/abs/2403.20231v1","updated":"2024-03-29T15:20:34Z","published":"2024-03-29T15:20:34Z","title":"U-VAP: User-specified Visual Appearance Personalization via Decoupled\n Self Augmentation","summary":" Concept personalization methods enable large text-to-image models to learn\nspecific subjects (e.g., objects/poses/3D models) and synthesize renditions in\nnew contexts. Given that the image references are highly biased towards visual\nattributes, state-of-the-art personalization models tend to overfit the whole\nsubject and cannot disentangle visual characteristics in pixel space. In this\nstudy, we proposed a more challenging setting, namely fine-grained visual\nappearance personalization. Different from existing methods, we allow users to\nprovide a sentence describing the desired attributes. A novel decoupled\nself-augmentation strategy is proposed to generate target-related and\nnon-target samples to learn user-specified visual attributes. These augmented\ndata allow for refining the model's understanding of the target attribute while\nmitigating the impact of unrelated attributes. At the inference stage,\nadjustments are conducted on semantic space through the learned target and\nnon-target embeddings to further enhance the disentanglement of target\nattributes. Extensive experiments on various kinds of visual attributes with\nSOTA personalization methods show the ability of the proposed method to mimic\ntarget visual appearance in novel contexts, thus improving the controllability\nand flexibility of personalization.\n","authors":["You Wu","Kean Liu","Xiaoyue Mi","Fan Tang","Juan Cao","Jintao Li"],"pdf_url":"https://arxiv.org/pdf/2403.20231v1.pdf","comment":"14 pages, 13 figures, 2 tables"},{"id":"http://arxiv.org/abs/2303.06346v2","updated":"2024-03-29T15:10:29Z","published":"2023-03-11T08:42:54Z","title":"3DInAction: Understanding Human Actions in 3D Point Clouds","summary":" We propose a novel method for 3D point cloud action recognition.\nUnderstanding human actions in RGB videos has been widely studied in recent\nyears, however, its 3D point cloud counterpart remains under-explored. This is\nmostly due to the inherent limitation of the point cloud data modality -- lack\nof structure, permutation invariance, and varying number of points -- which\nmakes it difficult to learn a spatio-temporal representation. To address this\nlimitation, we propose the 3DinAction pipeline that first estimates patches\nmoving in time (t-patches) as a key building block, alongside a hierarchical\narchitecture that learns an informative spatio-temporal representation. We show\nthat our method achieves improved performance on existing datasets, including\nDFAUST and IKEA ASM. 
Code is publicly available at\nhttps://github.com/sitzikbs/3dincaction.\n","authors":["Yizhak Ben-Shabat","Oren Shrout","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2303.06346v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20225v1","updated":"2024-03-29T15:08:37Z","published":"2024-03-29T15:08:37Z","title":"MTMMC: A Large-Scale Real-World Multi-Modal Camera Tracking Benchmark","summary":" Multi-target multi-camera tracking is a crucial task that involves\nidentifying and tracking individuals over time using video streams from\nmultiple cameras. This task has practical applications in various fields, such\nas visual surveillance, crowd behavior analysis, and anomaly detection.\nHowever, due to the difficulty and cost of collecting and labeling data,\nexisting datasets for this task are either synthetically generated or\nartificially constructed within a controlled camera network setting, which\nlimits their ability to model real-world dynamics and generalize to diverse\ncamera configurations. To address this issue, we present MTMMC, a real-world,\nlarge-scale dataset that includes long video sequences captured by 16\nmulti-modal cameras in two different environments - campus and factory - across\nvarious time, weather, and season conditions. This dataset provides a\nchallenging test-bed for studying multi-camera tracking under diverse\nreal-world complexities and includes an additional input modality of spatially\naligned and temporally synchronized RGB and thermal cameras, which enhances the\naccuracy of multi-camera tracking. MTMMC is a super-set of existing datasets,\nbenefiting independent fields such as person detection, re-identification, and\nmultiple object tracking. We provide baselines and new learning setups on this\ndataset and set the reference scores for future studies. The datasets, models,\nand test server will be made publicly available.\n","authors":["Sanghyun Woo","Kwanyong Park","Inkyu Shin","Myungchul Kim","In So Kweon"],"pdf_url":"https://arxiv.org/pdf/2403.20225v1.pdf","comment":"Accepted on CVPR 2024"},{"id":"http://arxiv.org/abs/2312.02216v2","updated":"2024-03-29T14:59:13Z","published":"2023-12-03T10:41:06Z","title":"DragVideo: Interactive Drag-style Video Editing","summary":" Video generation models have shown their superior ability to generate\nphoto-realistic video. However, how to accurately control (or edit) the video\nremains a formidable challenge. The main issues are: 1) how to perform direct\nand accurate user control in editing; 2) how to execute editings like changing\nshape, expression, and layout without unsightly distortion and artifacts to the\nedited content; and 3) how to maintain spatio-temporal consistency of video\nafter editing. To address the above issues, we propose DragVideo, a general\ndrag-style video editing framework. Inspired by DragGAN, DragVideo addresses\nissues 1) and 2) by proposing the drag-style video latent optimization method\nwhich gives desired control by updating noisy video latent according to drag\ninstructions through video-level drag objective function. We amend issue 3) by\nintegrating the video diffusion model with sample-specific LoRA and Mutual\nSelf-Attention in DragVideo to ensure the edited result is spatio-temporally\nconsistent. 
We also present a series of testing examples for drag-style video\nediting and conduct extensive experiments across a wide array of challenging\nediting tasks, such as motion, skeleton editing, etc, underscoring DragVideo\ncan edit video in an intuitive, faithful to the user's intention manner, with\nnearly unnoticeable distortion and artifacts, while maintaining spatio-temporal\nconsistency. While traditional prompt-based video editing fails to do the\nformer two and directly applying image drag editing fails in the last,\nDragVideo's versatility and generality are emphasized. Github link:\nhttps://github.com/RickySkywalker/DragVideo-Official.\n","authors":["Yufan Deng","Ruida Wang","Yuhao Zhang","Yu-Wing Tai","Chi-Keung Tang"],"pdf_url":"https://arxiv.org/pdf/2312.02216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12440v2","updated":"2024-03-29T14:55:50Z","published":"2024-03-19T04:54:59Z","title":"Self-learning Canonical Space for Multi-view 3D Human Pose Estimation","summary":" Multi-view 3D human pose estimation is naturally superior to single view one,\nbenefiting from more comprehensive information provided by images of multiple\nviews. The information includes camera poses, 2D/3D human poses, and 3D\ngeometry. However, the accurate annotation of these information is hard to\nobtain, making it challenging to predict accurate 3D human pose from multi-view\nimages. To deal with this issue, we propose a fully self-supervised framework,\nnamed cascaded multi-view aggregating network (CMANet), to construct a\ncanonical parameter space to holistically integrate and exploit multi-view\ninformation. In our framework, the multi-view information is grouped into two\ncategories: 1) intra-view information , 2) inter-view information. Accordingly,\nCMANet consists of two components: intra-view module (IRV) and inter-view\nmodule (IEV). IRV is used for extracting initial camera pose and 3D human pose\nof each view; IEV is to fuse complementary pose information and cross-view 3D\ngeometry for a final 3D human pose. To facilitate the aggregation of the intra-\nand inter-view, we define a canonical parameter space, depicted by per-view\ncamera pose and human pose and shape parameters ($\\theta$ and $\\beta$) of SMPL\nmodel, and propose a two-stage learning procedure. At first stage, IRV learns\nto estimate camera pose and view-dependent 3D human pose supervised by\nconfident output of an off-the-shelf 2D keypoint detector. At second stage, IRV\nis frozen and IEV further refines the camera pose and optimizes the 3D human\npose by implicitly encoding the cross-view complement and 3D geometry\nconstraint, achieved by jointly fitting predicted multi-view 2D keypoints. The\nproposed framework, modules, and learning strategy are demonstrated to be\neffective by comprehensive experiments and CMANet is superior to\nstate-of-the-art methods in extensive quantitative and qualitative analysis.\n","authors":["Xiaoben Li","Mancheng Meng","Ziyan Wu","Terrence Chen","Fan Yang","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2403.12440v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13612v2","updated":"2024-03-29T23:42:05Z","published":"2023-03-23T18:55:43Z","title":"NOPE: Novel Object Pose Estimation from a Single Image","summary":" The practicality of 3D object pose estimation remains limited for many\napplications due to the need for prior knowledge of a 3D model and a training\nperiod for new objects. 
To address this limitation, we propose an approach that\ntakes a single image of a new object as input and predicts the relative pose of\nthis object in new images without prior knowledge of the object's 3D model and\nwithout requiring training time for new objects and categories. We achieve this\nby training a model to directly predict discriminative embeddings for\nviewpoints surrounding the object. This prediction is done using a simple U-Net\narchitecture with attention and conditioned on the desired pose, which yields\nextremely fast inference. We compare our approach to state-of-the-art methods\nand show it outperforms them both in terms of accuracy and robustness. Our\nsource code is publicly available at https://github.com/nv-nguyen/nope\n","authors":["Van Nguyen Nguyen","Thibault Groueix","Yinlin Hu","Mathieu Salzmann","Vincent Lepetit"],"pdf_url":"https://arxiv.org/pdf/2303.13612v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2402.18528v2","updated":"2024-03-29T23:41:13Z","published":"2024-02-28T18:08:03Z","title":"Gradient Reweighting: Towards Imbalanced Class-Incremental Learning","summary":" Class-Incremental Learning (CIL) trains a model to continually recognize new\nclasses from non-stationary data while retaining learned knowledge. A major\nchallenge of CIL arises when applying to real-world data characterized by\nnon-uniform distribution, which introduces a dual imbalance problem involving\n(i) disparities between stored exemplars of old tasks and new class data\n(inter-phase imbalance), and (ii) severe class imbalances within each\nindividual task (intra-phase imbalance). We show that this dual imbalance issue\ncauses skewed gradient updates with biased weights in FC layers, thus inducing\nover/under-fitting and catastrophic forgetting in CIL. Our method addresses it\nby reweighting the gradients towards balanced optimization and unbiased\nclassifier learning. Additionally, we observe imbalanced forgetting where\nparadoxically the instance-rich classes suffer higher performance degradation\nduring CIL due to a larger amount of training data becoming unavailable in\nsubsequent learning phases. To tackle this, we further introduce a\ndistribution-aware knowledge distillation loss to mitigate forgetting by\naligning output logits proportionally with the distribution of lost training\ndata. We validate our method on CIFAR-100, ImageNetSubset, and Food101 across\nvarious evaluation protocols and demonstrate consistent improvements compared\nto existing works, showing great potential to apply CIL in real-world scenarios\nwith enhanced robustness and effectiveness.\n","authors":["Jiangpeng He","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2402.18528v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2401.10171v2","updated":"2024-03-29T22:47:59Z","published":"2024-01-18T18:01:19Z","title":"SHINOBI: Shape and Illumination using Neural Object Decomposition via\n BRDF Optimization In-the-wild","summary":" We present SHINOBI, an end-to-end framework for the reconstruction of shape,\nmaterial, and illumination from object images captured with varying lighting,\npose, and background. Inverse rendering of an object based on unconstrained\nimage collections is a long-standing challenge in computer vision and graphics\nand requires a joint optimization over shape, radiance, and pose. 
We show that\nan implicit shape representation based on a multi-resolution hash encoding\nenables faster and robust shape reconstruction with joint camera alignment\noptimization that outperforms prior work. Further, to enable the editing of\nillumination and object reflectance (i.e. material) we jointly optimize BRDF\nand illumination together with the object's shape. Our method is class-agnostic\nand works on in-the-wild image collections of objects to produce relightable 3D\nassets for several use cases such as AR/VR, movies, games, etc. Project page:\nhttps://shinobi.aengelhardt.com Video:\nhttps://www.youtube.com/watch?v=iFENQ6AcYd8&feature=youtu.be\n","authors":["Andreas Engelhardt","Amit Raj","Mark Boss","Yunzhi Zhang","Abhishek Kar","Yuanzhen Li","Deqing Sun","Ricardo Martin Brualla","Jonathan T. Barron","Hendrik P. A. Lensch","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2401.10171v2.pdf","comment":"Accepted by IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR 2024). Updated supplementary material and acknowledgements"},{"id":"http://arxiv.org/abs/2403.17173v2","updated":"2024-03-29T22:46:03Z","published":"2024-03-25T20:39:58Z","title":"Task2Box: Box Embeddings for Modeling Asymmetric Task Relationships","summary":" Modeling and visualizing relationships between tasks or datasets is an\nimportant step towards solving various meta-tasks such as dataset discovery,\nmulti-tasking, and transfer learning. However, many relationships, such as\ncontainment and transferability, are naturally asymmetric and current\napproaches for representation and visualization (e.g., t-SNE) do not readily\nsupport this. We propose Task2Box, an approach to represent tasks using box\nembeddings -- axis-aligned hyperrectangles in low dimensional spaces -- that\ncan capture asymmetric relationships between them through volumetric overlaps.\nWe show that Task2Box accurately predicts unseen hierarchical relationships\nbetween nodes in ImageNet and iNaturalist datasets, as well as transferability\nbetween tasks in the Taskonomy benchmark. We also show that box embeddings\nestimated from task representations (e.g., CLIP, Task2Vec, or attribute based)\ncan be used to predict relationships between unseen tasks more accurately than\nclassifiers trained on the same representations, as well as handcrafted\nasymmetric distances (e.g., KL divergence). This suggests that low-dimensional\nbox embeddings can effectively capture these task relationships and have the\nadded advantage of being interpretable. We use the approach to visualize\nrelationships among publicly available image classification datasets on popular\ndataset hosting platform called Hugging Face.\n","authors":["Rangel Daroya","Aaron Sun","Subhransu Maji"],"pdf_url":"https://arxiv.org/pdf/2403.17173v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.08237v2","updated":"2024-03-29T22:29:03Z","published":"2023-01-19T18:54:43Z","title":"LoCoNet: Long-Short Context Network for Active Speaker Detection","summary":" Active Speaker Detection (ASD) aims to identify who is speaking in each frame\nof a video. ASD reasons from audio and visual information from two contexts:\nlong-term intra-speaker context and short-term inter-speaker context. Long-term\nintra-speaker context models the temporal dependencies of the same speaker,\nwhile short-term inter-speaker context models the interactions of speakers in\nthe same scene. These two contexts are complementary to each other and can help\ninfer the active speaker. 
Motivated by these observations, we propose LoCoNet,\na simple yet effective Long-Short Context Network that models the long-term\nintra-speaker context and short-term inter-speaker context. We use\nself-attention to model long-term intra-speaker context due to its\neffectiveness in modeling long-range dependencies, and convolutional blocks\nthat capture local patterns to model short-term inter-speaker context.\nExtensive experiments show that LoCoNet achieves state-of-the-art performance\non multiple datasets, achieving an mAP of 95.2%(+1.1%) on AVA-ActiveSpeaker,\n68.1%(+22%) on Columbia dataset, 97.2%(+2.8%) on Talkies dataset and\n59.7%(+8.0%) on Ego4D dataset. Moreover, in challenging cases where multiple\nspeakers are present, or face of active speaker is much smaller than other\nfaces in the same scene, LoCoNet outperforms previous state-of-the-art methods\nby 3.4% on the AVA-ActiveSpeaker dataset. The code will be released at\nhttps://github.com/SJTUwxz/LoCoNet_ASD.\n","authors":["Xizi Wang","Feng Cheng","Gedas Bertasius","David Crandall"],"pdf_url":"https://arxiv.org/pdf/2301.08237v2.pdf","comment":"accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19076v2","updated":"2024-03-29T21:33:39Z","published":"2024-03-28T00:34:56Z","title":"Tiny Machine Learning: Progress and Futures","summary":" Tiny Machine Learning (TinyML) is a new frontier of machine learning. By\nsqueezing deep learning models into billions of IoT devices and\nmicrocontrollers (MCUs), we expand the scope of AI applications and enable\nubiquitous intelligence. However, TinyML is challenging due to hardware\nconstraints: the tiny memory resource makes it difficult to hold deep learning\nmodels designed for cloud and mobile platforms. There is also limited compiler\nand inference engine support for bare-metal devices. Therefore, we need to\nco-design the algorithm and system stack to enable TinyML. In this review, we\nwill first discuss the definition, challenges, and applications of TinyML. We\nthen survey the recent progress in TinyML and deep learning on MCUs. Next, we\nwill introduce MCUNet, showing how we can achieve ImageNet-scale AI\napplications on IoT devices with system-algorithm co-design. We will further\nextend the solution from inference to training and introduce tiny on-device\ntraining techniques. Finally, we present future directions in this area.\nToday's large model might be tomorrow's tiny model. The scope of TinyML should\nevolve and adapt over time.\n","authors":["Ji Lin","Ligeng Zhu","Wei-Ming Chen","Wei-Chen Wang","Song Han"],"pdf_url":"https://arxiv.org/pdf/2403.19076v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2206.15472"},{"id":"http://arxiv.org/abs/2403.08848v2","updated":"2024-03-29T21:22:06Z","published":"2024-03-13T16:57:04Z","title":"FocusMAE: Gallbladder Cancer Detection from Ultrasound Videos with\n Focused Masked Autoencoders","summary":" In recent years, automated Gallbladder Cancer (GBC) detection has gained the\nattention of researchers. Current state-of-the-art (SOTA) methodologies relying\non ultrasound sonography (US) images exhibit limited generalization,\nemphasizing the need for transformative approaches. We observe that individual\nUS frames may lack sufficient information to capture disease manifestation.\nThis study advocates for a paradigm shift towards video-based GBC detection,\nleveraging the inherent advantages of spatiotemporal representations. 
Employing\nthe Masked Autoencoder (MAE) for representation learning, we address\nshortcomings in conventional image-based methods. We propose a novel design\ncalled FocusMAE to systematically bias the selection of masking tokens from\nhigh-information regions, fostering a more refined representation of\nmalignancy. Additionally, we contribute the most extensive US video dataset for\nGBC detection. We also note that, this is the first study on US video-based GBC\ndetection. We validate the proposed methods on the curated dataset, and report\na new state-of-the-art (SOTA) accuracy of 96.4% for the GBC detection problem,\nagainst an accuracy of 84% by current Image-based SOTA - GBCNet, and RadFormer,\nand 94.7% by Video-based SOTA - AdaMAE. We further demonstrate the generality\nof the proposed FocusMAE on a public CT-based Covid detection dataset,\nreporting an improvement in accuracy by 3.3% over current baselines. The source\ncode and pretrained models are available at:\nhttps://gbc-iitd.github.io/focusmae\n","authors":["Soumen Basu","Mayuna Gupta","Chetan Madan","Pankaj Gupta","Chetan Arora"],"pdf_url":"https://arxiv.org/pdf/2403.08848v2.pdf","comment":"To Appear at CVPR 2024"},{"id":"http://arxiv.org/abs/2402.01858v2","updated":"2024-03-29T21:18:37Z","published":"2024-02-02T19:28:33Z","title":"Explaining latent representations of generative models with large\n multimodal models","summary":" Learning interpretable representations of data generative latent factors is\nan important topic for the development of artificial intelligence. With the\nrise of the large multimodal model, it can align images with text to generate\nanswers. In this work, we propose a framework to comprehensively explain each\nlatent variable in the generative models using a large multimodal model. We\nfurther measure the uncertainty of our generated explanations, quantitatively\nevaluate the performance of explanation generation among multiple large\nmultimodal models, and qualitatively visualize the variations of each latent\nvariable to learn the disentanglement effects of different generative models on\nexplanations. Finally, we discuss the explanatory capabilities and limitations\nof state-of-the-art large multimodal models.\n","authors":["Mengdan Zhu","Zhenke Liu","Bo Pan","Abhinav Angirekula","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.01858v2.pdf","comment":"ICLR 2024 Workshop Paper on Reliable and Responsible Foundation\n Models"},{"id":"http://arxiv.org/abs/2311.10696v2","updated":"2024-03-29T20:17:29Z","published":"2023-11-17T18:28:32Z","title":"Versatile Medical Image Segmentation Learned from Multi-Source Datasets\n via Model Self-Disambiguation","summary":" A versatile medical image segmentation model applicable to images acquired\nwith diverse equipment and protocols can facilitate model deployment and\nmaintenance. However, building such a model typically demands a large, diverse,\nand fully annotated dataset, which is challenging to obtain due to the\nlabor-intensive nature of data curation. To address this challenge, we propose\na cost-effective alternative that harnesses multi-source data with only partial\nor sparse segmentation labels for training, substantially reducing the cost of\ndeveloping a versatile model. 
We devise strategies for model\nself-disambiguation, prior knowledge incorporation, and imbalance mitigation to\ntackle challenges associated with inconsistently labeled multi-source data,\nincluding label ambiguity and modality, dataset, and class imbalances.\nExperimental results on a multi-modal dataset compiled from eight different\nsources for abdominal structure segmentation have demonstrated the\neffectiveness and superior performance of our method compared to\nstate-of-the-art alternative approaches. We anticipate that its cost-saving\nfeatures, which optimize the utilization of existing annotated data and reduce\nannotation efforts for new data, will have a significant impact in the field.\n","authors":["Xiaoyang Chen","Hao Zheng","Yuemeng Li","Yuncong Ma","Liang Ma","Hongming Li","Yong Fan"],"pdf_url":"https://arxiv.org/pdf/2311.10696v2.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2201.09929v3","updated":"2024-03-29T20:10:38Z","published":"2022-01-24T19:37:04Z","title":"Euclidean and Affine Curve Reconstruction","summary":" We consider practical aspects of reconstructing planar curves with prescribed\nEuclidean or affine curvatures. These curvatures are invariant under the\nspecial Euclidean group and the equi-affine groups, respectively, and play an\nimportant role in computer vision and shape analysis. We discuss and implement\nalgorithms for such reconstruction, and give estimates on how close\nreconstructed curves are relative to the closeness of their curvatures in\nappropriate metrics. Several illustrative examples are provided.\n","authors":["Jose Agudelo","Brooke Dippold","Ian Klein","Alex Kokot","Eric Geiger","Irina Kogan"],"pdf_url":"https://arxiv.org/pdf/2201.09929v3.pdf","comment":"This paper is a result of an REU project conducted at the North\n Carolina State University in the Summer and Fall 2020. This version has\n several minor corrections"},{"id":"http://arxiv.org/abs/2312.03816v3","updated":"2024-03-29T19:50:38Z","published":"2023-12-06T18:56:14Z","title":"AVID: Any-Length Video Inpainting with Diffusion Model","summary":" Recent advances in diffusion models have successfully enabled text-guided\nimage inpainting. While it seems straightforward to extend such editing\ncapability into the video domain, there have been fewer works regarding\ntext-guided video inpainting. Given a video, a masked region at its initial\nframe, and an editing prompt, it requires a model to do infilling at each frame\nfollowing the editing guidance while keeping the out-of-mask region intact.\nThere are three main challenges in text-guided video inpainting: ($i$) temporal\nconsistency of the edited video, ($ii$) supporting different inpainting types\nat different structural fidelity levels, and ($iii$) dealing with variable\nvideo length. To address these challenges, we introduce Any-Length Video\nInpainting with Diffusion Model, dubbed as AVID. At its core, our model is\nequipped with effective motion modules and adjustable structure guidance, for\nfixed-length video inpainting. Building on top of that, we propose a novel\nTemporal MultiDiffusion sampling pipeline with a middle-frame attention\nguidance mechanism, facilitating the generation of videos with any desired\nduration. 
Our comprehensive experiments show our model can robustly deal with\nvarious inpainting types at different video duration ranges, with high quality.\nMore visualization results are made publicly available at\nhttps://zhang-zx.github.io/AVID/ .\n","authors":["Zhixing Zhang","Bichen Wu","Xiaoyan Wang","Yaqiao Luo","Luxin Zhang","Yinan Zhao","Peter Vajda","Dimitris Metaxas","Licheng Yu"],"pdf_url":"https://arxiv.org/pdf/2312.03816v3.pdf","comment":"Project website: https://zhang-zx.github.io/AVID/"},{"id":"http://arxiv.org/abs/2402.07245v2","updated":"2024-03-29T19:47:50Z","published":"2024-02-11T17:09:21Z","title":"Semi-Mamba-UNet: Pixel-Level Contrastive and Pixel-Level\n Cross-Supervised Visual Mamba-based UNet for Semi-Supervised Medical Image\n Segmentation","summary":" Medical image segmentation is essential in diagnostics, treatment planning,\nand healthcare, with deep learning offering promising advancements. Notably,\nConvolutional Neural Network (CNN) excel in capturing local image features,\nwhereas Vision Transformer (ViT) adeptly model long-range dependencies through\nmulti-head self-attention mechanisms. Despite their strengths, both CNN and ViT\nface challenges in efficiently processing long-range dependencies within\nmedical images, often requiring substantial computational resources. This\nissue, combined with the high cost and limited availability of expert\nannotations, poses significant obstacles to achieving precise segmentation. To\naddress these challenges, this paper introduces the Semi-Mamba-UNet, which\nintegrates a visual mamba-based UNet architecture with a conventional UNet into\na semi-supervised learning (SSL) framework. This innovative SSL approach\nleverages dual networks to jointly generate pseudo labels and cross supervise\neach other, drawing inspiration from consistency regularization techniques.\nFurthermore, we introduce a self-supervised pixel-level contrastive learning\nstrategy, employing a projector pair to further enhance feature learning\ncapabilities. Our comprehensive evaluation on a publicly available MRI cardiac\nsegmentation dataset, comparing against various SSL frameworks with different\nUNet-based segmentation networks, highlights the superior performance of\nSemi-Mamba-UNet. The source code has been made publicly accessible.\n","authors":["Chao Ma","Ziyang Wang"],"pdf_url":"https://arxiv.org/pdf/2402.07245v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12194v2","updated":"2024-03-29T19:37:18Z","published":"2023-11-20T21:20:37Z","title":"DiffAvatar: Simulation-Ready Garment Optimization with Differentiable\n Simulation","summary":" The realism of digital avatars is crucial in enabling telepresence\napplications with self-expression and customization. While physical simulations\ncan produce realistic motions for clothed humans, they require high-quality\ngarment assets with associated physical parameters for cloth simulations.\nHowever, manually creating these assets and calibrating their parameters is\nlabor-intensive and requires specialized expertise. Current methods focus on\nreconstructing geometry, but don't generate complete assets for physics-based\napplications. To address this gap, we propose \\papername,~a novel approach that\nperforms body and garment co-optimization using differentiable simulation. 
By\nintegrating physical simulation into the optimization loop and accounting for\nthe complex nonlinear behavior of cloth and its intricate interaction with the\nbody, our framework recovers body and garment geometry and extracts important\nmaterial parameters in a physically plausible way. Our experiments demonstrate\nthat our approach generates realistic clothing and body shape suitable for\ndownstream applications. We provide additional insights and results on our\nwebpage: https://people.csail.mit.edu/liyifei/publication/diffavatar/\n","authors":["Yifei Li","Hsiao-yu Chen","Egor Larionov","Nikolaos Sarafianos","Wojciech Matusik","Tuur Stuyck"],"pdf_url":"https://arxiv.org/pdf/2311.12194v2.pdf","comment":"CVPR 2024; Project page:\n https://people.csail.mit.edu/liyifei/publication/diffavatar/"},{"id":"http://arxiv.org/abs/2403.11821v2","updated":"2024-03-29T19:27:23Z","published":"2024-03-18T14:24:20Z","title":"Evaluating Text-to-Image Synthesis: Survey and Taxonomy of Image Quality\n Metrics","summary":" Recent advances in text-to-image synthesis enabled through a combination of\nlanguage and vision foundation models have led to a proliferation of the tools\navailable and an increased attention to the field. When conducting\ntext-to-image synthesis, a central goal is to ensure that the content between\ntext and image is aligned. As such, there exist numerous evaluation metrics\nthat aim to mimic human judgement. However, it is often unclear which metric to\nuse for evaluating text-to-image synthesis systems as their evaluation is\nhighly nuanced. In this work, we provide a comprehensive overview of existing\ntext-to-image evaluation metrics. Based on our findings, we propose a new\ntaxonomy for categorizing these metrics. Our taxonomy is grounded in the\nassumption that there are two main quality criteria, namely compositionality\nand generality, which ideally map to human preferences. Ultimately, we derive\nguidelines for practitioners conducting text-to-image evaluation, discuss open\nchallenges of evaluation mechanisms, and surface limitations of current\nmetrics.\n","authors":["Sebastian Hartwig","Dominik Engel","Leon Sick","Hannah Kniesel","Tristan Payer","Poonam Poonam","Michael Glöckler","Alex Bäuerle","Timo Ropinski"],"pdf_url":"https://arxiv.org/pdf/2403.11821v2.pdf","comment":"preprint, 21 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2403.15571v2","updated":"2024-03-29T19:14:36Z","published":"2024-03-22T18:52:10Z","title":"Augmented Reality Warnings in Roadway Work Zones: Evaluating the Effect\n of Modality on Worker Reaction Times","summary":" Given the aging highway infrastructure requiring extensive rebuilding and\nenhancements, and the consequent rise in the number of work zones, there is an\nurgent need to develop advanced safety systems to protect workers. While\nAugmented Reality (AR) holds significant potential for delivering warnings to\nworkers, its integration into roadway work zones remains relatively unexplored.\nThe primary objective of this study is to improve safety measures within\nroadway work zones by conducting an extensive analysis of how different\ncombinations of multimodal AR warnings influence the reaction times of workers.\nThis paper addresses this gap through a series of experiments that aim to\nreplicate the distinctive conditions of roadway work zones, both in real-world\nand virtual reality environments. 
Our approach comprises three key components:\nan advanced AR system prototype, a VR simulation of AR functionality within the\nwork zone environment, and the Wizard of Oz technique to synchronize user\nexperiences across experiments. To assess reaction times, we leverage both the\nsimple reaction time (SRT) technique and an innovative vision-based metric that\nutilizes real-time pose estimation. By conducting five experiments in\ncontrolled outdoor work zones and indoor VR settings, our study provides\nvaluable information on how various multimodal AR warnings impact workers\nreaction times. Furthermore, our findings reveal the disparities in reaction\ntimes between VR simulations and real-world scenarios, thereby gauging VR's\ncapability to mirror the dynamics of roadway work zones. Furthermore, our\nresults substantiate the potential and reliability of vision-based reaction\ntime measurements. These insights resonate well with those derived using the\nSRT technique, underscoring the viability of this approach for tangible\nreal-world uses.\n","authors":["Sepehr Sabeti","Fatemeh Banani Ardecani","Omidreza Shoghli"],"pdf_url":"https://arxiv.org/pdf/2403.15571v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01482v2","updated":"2024-03-29T18:52:59Z","published":"2024-01-03T01:11:16Z","title":"Incorporating Geo-Diverse Knowledge into Prompting for Increased\n Geographical Robustness in Object Recognition","summary":" Existing object recognition models have been shown to lack robustness in\ndiverse geographical scenarios due to domain shifts in design and context.\nClass representations need to be adapted to more accurately reflect an object\nconcept under these shifts. In the absence of training data from target\ngeographies, we hypothesize that geographically diverse descriptive knowledge\nof categories can enhance robustness. For this purpose, we explore the\nfeasibility of probing a large language model for geography-based object\nknowledge, and we examine the effects of integrating knowledge into zero-shot\nand learnable soft prompting with CLIP. Within this exploration, we propose\ngeography knowledge regularization to ensure that soft prompts trained on a\nsource set of geographies generalize to an unseen target set. Accuracy gains\nover prompting baselines on DollarStreet while training only on Europe data are\nup to +2.8/1.2/1.6 on target data from Africa/Asia/Americas, and +4.6 overall\non the hardest classes. Competitive performance is shown vs. few-shot target\ntraining, and analysis is provided to direct future study of geographical\nrobustness.\n","authors":["Kyle Buettner","Sina Malakouti","Xiang Lorraine Li","Adriana Kovashka"],"pdf_url":"https://arxiv.org/pdf/2401.01482v2.pdf","comment":"To appear in IEEE/CVF Computer Vision and Pattern Recognition\n Conference (CVPR), 2024"},{"id":"http://arxiv.org/abs/2403.17801v2","updated":"2024-03-29T18:45:35Z","published":"2024-03-26T15:40:05Z","title":"Towards 3D Vision with Low-Cost Single-Photon Cameras","summary":" We present a method for reconstructing 3D shape of arbitrary Lambertian\nobjects based on measurements by miniature, energy-efficient, low-cost\nsingle-photon cameras. These cameras, operating as time resolved image sensors,\nilluminate the scene with a very fast pulse of diffuse light and record the\nshape of that pulse as it returns back from the scene at a high temporal\nresolution. 
We propose to model this image formation process, account for its\nnon-idealities, and adapt neural rendering to reconstruct 3D geometry from a\nset of spatially distributed sensors with known poses. We show that our\napproach can successfully recover complex 3D shapes from simulated data. We\nfurther demonstrate 3D object reconstruction from real-world captures,\nutilizing measurements from a commodity proximity sensor. Our work draws a\nconnection between image-based modeling and active range scanning and is a step\ntowards 3D vision with single-photon cameras.\n","authors":["Fangzhou Mu","Carter Sifferman","Sacha Jungerman","Yiquan Li","Mark Han","Michael Gleicher","Mohit Gupta","Yin Li"],"pdf_url":"https://arxiv.org/pdf/2403.17801v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00604v2","updated":"2024-03-29T18:33:30Z","published":"2023-12-31T23:04:25Z","title":"SteinDreamer: Variance Reduction for Text-to-3D Score Distillation via\n Stein Identity","summary":" Score distillation has emerged as one of the most prevalent approaches for\ntext-to-3D asset synthesis. Essentially, score distillation updates 3D\nparameters by lifting and back-propagating scores averaged over different\nviews. In this paper, we reveal that the gradient estimation in score\ndistillation is inherent to high variance. Through the lens of variance\nreduction, the effectiveness of SDS and VSD can be interpreted as applications\nof various control variates to the Monte Carlo estimator of the distilled\nscore. Motivated by this rethinking and based on Stein's identity, we propose a\nmore general solution to reduce variance for score distillation, termed Stein\nScore Distillation (SSD). SSD incorporates control variates constructed by\nStein identity, allowing for arbitrary baseline functions. This enables us to\ninclude flexible guidance priors and network architectures to explicitly\noptimize for variance reduction. In our experiments, the overall pipeline,\ndubbed SteinDreamer, is implemented by instantiating the control variate with a\nmonocular depth estimator. The results suggest that SSD can effectively reduce\nthe distillation variance and consistently improve visual quality for both\nobject- and scene-level generation. Moreover, we demonstrate that SteinDreamer\nachieves faster convergence than existing methods due to more stable gradient\nupdates.\n","authors":["Peihao Wang","Zhiwen Fan","Dejia Xu","Dilin Wang","Sreyas Mohan","Forrest Iandola","Rakesh Ranjan","Yilei Li","Qiang Liu","Zhangyang Wang","Vikas Chandra"],"pdf_url":"https://arxiv.org/pdf/2401.00604v2.pdf","comment":"Project page: https://vita-group.github.io/SteinDreamer/"},{"id":"http://arxiv.org/abs/2307.08919v3","updated":"2024-03-29T18:19:36Z","published":"2023-07-18T01:31:47Z","title":"Systematic comparison of semi-supervised and self-supervised learning\n for medical image classification","summary":" In typical medical image classification problems, labeled data is scarce\nwhile unlabeled data is more available. Semi-supervised learning and\nself-supervised learning are two different research directions that can improve\naccuracy by learning from extra unlabeled data. Recent methods from both\ndirections have reported significant gains on traditional benchmarks. Yet past\nbenchmarks do not focus on medical tasks and rarely compare self- and semi-\nmethods together on an equal footing. Furthermore, past benchmarks often handle\nhyperparameter tuning suboptimally. 
First, they may not tune hyperparameters at\nall, leading to underfitting. Second, when tuning does occur, it often\nunrealistically uses a labeled validation set that is much larger than the\ntraining set. Therefore currently published rankings might not always\ncorroborate with their practical utility. This study contributes a systematic\nevaluation of self- and semi- methods with a unified experimental protocol\nintended to guide a practitioner with scarce overall labeled data and a limited\ncompute budget. We answer two key questions: Can hyperparameter tuning be\neffective with realistic-sized validation sets? If so, when all methods are\ntuned well, which self- or semi-supervised methods achieve the best accuracy?\nOur study compares 13 representative semi- and self-supervised methods to\nstrong labeled-set-only baselines on 4 medical datasets. From 20000+ GPU hours\nof computation, we provide valuable best practices to resource-constrained\npractitioners: hyperparameter tuning is effective, and the semi-supervised\nmethod known as MixMatch delivers the most reliable gains across 4 datasets.\n","authors":["Zhe Huang","Ruijie Jiang","Shuchin Aeron","Michael C. Hughes"],"pdf_url":"https://arxiv.org/pdf/2307.08919v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.18784v2","updated":"2024-03-29T18:16:29Z","published":"2024-03-27T17:32:04Z","title":"SplatFace: Gaussian Splat Face Reconstruction Leveraging an Optimizable\n Surface","summary":" We present SplatFace, a novel Gaussian splatting framework designed for 3D\nhuman face reconstruction without reliance on accurate pre-determined geometry.\nOur method is designed to simultaneously deliver both high-quality novel view\nrendering and accurate 3D mesh reconstructions. We incorporate a generic 3D\nMorphable Model (3DMM) to provide a surface geometric structure, making it\npossible to reconstruct faces with a limited set of input images. We introduce\na joint optimization strategy that refines both the Gaussians and the morphable\nsurface through a synergistic non-rigid alignment process. A novel distance\nmetric, splat-to-surface, is proposed to improve alignment by considering both\nthe Gaussian position and covariance. The surface information is also utilized\nto incorporate a world-space densification process, resulting in superior\nreconstruction quality. Our experimental analysis demonstrates that the\nproposed method is competitive with both other Gaussian splatting techniques in\nnovel view synthesis and other 3D reconstruction methods in producing 3D face\nmeshes with high geometric precision.\n","authors":["Jiahao Luo","Jing Liu","James Davis"],"pdf_url":"https://arxiv.org/pdf/2403.18784v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00909v2","updated":"2024-03-29T18:04:37Z","published":"2023-12-31T22:47:06Z","title":"Taming Mode Collapse in Score Distillation for Text-to-3D Generation","summary":" Despite the remarkable performance of score distillation in text-to-3D\ngeneration, such techniques notoriously suffer from view inconsistency issues,\nalso known as \"Janus\" artifact, where the generated objects fake each view with\nmultiple front faces. Although empirically effective methods have approached\nthis problem via score debiasing or prompt engineering, a more rigorous\nperspective to explain and tackle this problem remains elusive. 
In this paper,\nwe reveal that the existing score distillation-based text-to-3D generation\nframeworks degenerate to maximal likelihood seeking on each view independently\nand thus suffer from the mode collapse problem, manifesting as the Janus\nartifact in practice. To tame mode collapse, we improve score distillation by\nre-establishing the entropy term in the corresponding variational objective,\nwhich is applied to the distribution of rendered images. Maximizing the entropy\nencourages diversity among different views in generated 3D assets, thereby\nmitigating the Janus problem. Based on this new objective, we derive a new\nupdate rule for 3D score distillation, dubbed Entropic Score Distillation\n(ESD). We theoretically reveal that ESD can be simplified and implemented by\njust adopting the classifier-free guidance trick upon variational score\ndistillation. Although embarrassingly straightforward, our extensive\nexperiments successfully demonstrate that ESD can be an effective treatment for\nJanus artifacts in score distillation.\n","authors":["Peihao Wang","Dejia Xu","Zhiwen Fan","Dilin Wang","Sreyas Mohan","Forrest Iandola","Rakesh Ranjan","Yilei Li","Qiang Liu","Zhangyang Wang","Vikas Chandra"],"pdf_url":"https://arxiv.org/pdf/2401.00909v2.pdf","comment":"Project page: https://vita-group.github.io/3D-Mode-Collapse/"},{"id":"http://arxiv.org/abs/2401.06407v2","updated":"2024-03-29T18:02:27Z","published":"2024-01-12T07:04:44Z","title":"UAV-Borne Mapping Algorithms for Low-Altitude and High-Speed Drone\n Applications","summary":" This article presents an analysis of current state-of-the-art sensors and how\nthese sensors work with several mapping algorithms for UAV (Unmanned Aerial\nVehicle) applications, focusing on low-altitude and high-speed scenarios. A new\nexperimental construct is created using highly realistic environments made\npossible by integrating the AirSim simulator with Google 3D maps models using\nthe Cesium Tiles plugin. Experiments are conducted in this high-realism\nsimulated environment to evaluate the performance of three distinct mapping\nalgorithms: (1) Direct Sparse Odometry (DSO), (2) Stereo DSO (SDSO), and (3)\nDSO Lite (DSOL). Experimental results evaluate algorithms based on their\nmeasured geometric accuracy and computational speed. The results provide\nvaluable insights into the strengths and limitations of each algorithm.\nFindings quantify compromises in UAV algorithm selection, allowing researchers\nto find the mapping solution best suited to their application, which often\nrequires a compromise between computational performance and the density and\naccuracy of geometric map estimates. Results indicate that for UAVs with\nrestrictive computing resources, DSOL is the best option. For systems with\npayload capacity and modest compute resources, SDSO is the best option. If only\none camera is available, DSO is the option to choose for applications that\nrequire dense mapping results.\n","authors":["Jincheng Zhang","Artur Wolek","Andrew R. Willis"],"pdf_url":"https://arxiv.org/pdf/2401.06407v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00762v2","updated":"2024-03-29T18:02:03Z","published":"2024-03-01T18:59:03Z","title":"Point Cloud Mamba: Point Cloud Learning via State Space Model","summary":" In this work, for the first time, we demonstrate that Mamba-based point cloud\nmethods can outperform point-based methods. 
Mamba exhibits strong global\nmodeling capabilities and linear computational complexity, making it highly\nattractive for point cloud analysis. To enable more effective processing of 3-D\npoint cloud data by Mamba, we propose a novel Consistent Traverse Serialization\nto convert point clouds into 1-D point sequences while ensuring that\nneighboring points in the sequence are also spatially adjacent. Consistent\nTraverse Serialization yields six variants by permuting the order of x, y, and\nz coordinates, and the synergistic use of these variants aids Mamba in\ncomprehensively observing point cloud data. Furthermore, to assist Mamba in\nhandling point sequences with different orders more effectively, we introduce\npoint prompts to inform Mamba of the sequence's arrangement rules. Finally, we\npropose positional encoding based on spatial coordinate mapping to inject\npositional information into point cloud sequences better. Based on these\nimprovements, we construct a point cloud network named Point Cloud Mamba, which\ncombines local and global modeling. Point Cloud Mamba surpasses the SOTA\npoint-based method PointNeXt and achieves new SOTA performance on the\nScanObjectNN, ModelNet40, and ShapeNetPart datasets.\n","authors":["Tao Zhang","Xiangtai Li","Haobo Yuan","Shunping Ji","Shuicheng Yan"],"pdf_url":"https://arxiv.org/pdf/2403.00762v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20213v1","updated":"2024-03-29T14:50:43Z","published":"2024-03-29T14:50:43Z","title":"H2RSVLM: Towards Helpful and Honest Remote Sensing Large Vision Language\n Model","summary":" The generic large Vision-Language Models (VLMs) are rapidly developing, but\nstill perform poorly in the Remote Sensing (RS) domain, which is due to the unique\nand specialized nature of RS imagery and the comparatively limited spatial\nperception of current VLMs. Existing Remote Sensing specific Vision Language\nModels (RSVLMs) still have considerable potential for improvement, primarily\nowing to the lack of large-scale, high-quality RS vision-language datasets. We\nconstructed HqDC-1.4M, the large scale High quality and Detailed Captions for\nRS images, containing 1.4 million image-caption pairs, which not only enhance\nthe RSVLM's understanding of RS images but also significantly improve the\nmodel's spatial perception abilities, such as localization and counting,\nthereby increasing the helpfulness of the RSVLM. Moreover, to address the\ninevitable \"hallucination\" problem in RSVLM, we developed RSSA, the first\ndataset aimed at enhancing the Self-Awareness capability of RSVLMs. By\nincorporating a variety of unanswerable questions into typical RS visual\nquestion-answering tasks, RSSA effectively improves the truthfulness and\nreduces the hallucinations of the model's outputs, thereby enhancing the\nhonesty of the RSVLM. Based on these datasets, we proposed the H2RSVLM, the\nHelpful and Honest Remote Sensing Vision Language Model. H2RSVLM has achieved\noutstanding performance on multiple RS public datasets and is capable of\nrecognizing and refusing to answer the unanswerable questions, effectively\nmitigating the incorrect generations. 
We will release the code, data and model\nweights at https://github.com/opendatalab/H2RSVLM .\n","authors":["Chao Pang","Jiang Wu","Jiayu Li","Yi Liu","Jiaxing Sun","Weijia Li","Xingxing Weng","Shuai Wang","Litong Feng","Gui-Song Xia","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2403.20213v1.pdf","comment":"Equal contribution: Chao Pang, Jiang Wu; Corresponding author:\n Gui-Song Xia, Conghui He"},{"id":"http://arxiv.org/abs/2403.10897v2","updated":"2024-03-29T14:49:11Z","published":"2024-03-16T11:21:24Z","title":"Rethinking Multi-view Representation Learning via Distilled\n Disentangling","summary":" Multi-view representation learning aims to derive robust representations that\nare both view-consistent and view-specific from diverse data sources. This\npaper presents an in-depth analysis of existing approaches in this domain,\nhighlighting a commonly overlooked aspect: the redundancy between\nview-consistent and view-specific representations. To this end, we propose an\ninnovative framework for multi-view representation learning, which incorporates\na technique we term 'distilled disentangling'. Our method introduces the\nconcept of masked cross-view prediction, enabling the extraction of compact,\nhigh-quality view-consistent representations from various sources without\nincurring extra computational overhead. Additionally, we develop a distilled\ndisentangling module that efficiently filters out consistency-related\ninformation from multi-view representations, resulting in purer view-specific\nrepresentations. This approach significantly reduces redundancy between\nview-consistent and view-specific representations, enhancing the overall\nefficiency of the learning process. Our empirical evaluations reveal that\nhigher mask ratios substantially improve the quality of view-consistent\nrepresentations. Moreover, we find that reducing the dimensionality of\nview-consistent representations relative to that of view-specific\nrepresentations further refines the quality of the combined representations.\nOur code is accessible at: https://github.com/Guanzhou-Ke/MRDD.\n","authors":["Guanzhou Ke","Bo Wang","Xiaoli Wang","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2403.10897v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12931v2","updated":"2024-03-29T14:48:42Z","published":"2024-03-19T17:34:27Z","title":"You Only Sample Once: Taming One-Step Text-To-Image Synthesis by\n Self-Cooperative Diffusion GANs","summary":" We introduce YOSO, a novel generative model designed for rapid, scalable, and\nhigh-fidelity one-step image synthesis. This is achieved by integrating the\ndiffusion process with GANs. Specifically, we smooth the distribution by the\ndenoising generator itself, performing self-cooperative learning. We show that\nour method can serve as a one-step generation model training from scratch with\ncompetitive performance. Moreover, we show that our method can be extended to\nfinetune pre-trained text-to-image diffusion for high-quality one-step\ntext-to-image synthesis even with LoRA fine-tuning. In particular, we provide\nthe first diffusion transformer that can generate images in one step trained on\n512 resolution, with the capability of adapting to 1024 resolution without\nexplicit training. 
Our code is provided at https://github.com/Luo-Yihong/YOSO.\n","authors":["Yihong Luo","Xiaolong Chen","Jing Tang"],"pdf_url":"https://arxiv.org/pdf/2403.12931v2.pdf","comment":"Early version"},{"id":"http://arxiv.org/abs/2403.11371v4","updated":"2024-03-29T14:19:56Z","published":"2024-03-17T23:29:41Z","title":"V2X-DGW: Domain Generalization for Multi-agent Perception under Adverse\n Weather Conditions","summary":" Current LiDAR-based Vehicle-to-Everything (V2X) multi-agent perception\nsystems have shown the significant success on 3D object detection. While these\nmodels perform well in the trained clean weather, they struggle in unseen\nadverse weather conditions with the real-world domain gap. In this paper, we\npropose a domain generalization approach, named V2X-DGW, for LiDAR-based 3D\nobject detection on multi-agent perception system under adverse weather\nconditions. Not only in the clean weather does our research aim to ensure\nfavorable multi-agent performance, but also in the unseen adverse weather\nconditions by learning only on the clean weather data. To advance research in\nthis area, we have simulated the impact of three prevalent adverse weather\nconditions on two widely-used multi-agent datasets, resulting in the creation\nof two novel benchmark datasets: OPV2V-w and V2XSet-w.\n To this end, we first introduce the Adaptive Weather Augmentation (AWA) to\nmimic the unseen adverse weather conditions, and then propose two alignments\nfor generalizable representation learning: Trust-region Weather-invariant\nAlignment (TWA) and Agent-aware Contrastive Alignment (ACA). Extensive\nexperimental results demonstrate that our V2X-DGW achieved improvements in the\nunseen adverse weather conditions.\n","authors":["Baolu Li","Jinlong Li","Xinyu Liu","Runsheng Xu","Zhengzhong Tu","Jiacheng Guo","Xiaopeng Li","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2403.11371v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20195v1","updated":"2024-03-29T14:17:30Z","published":"2024-03-29T14:17:30Z","title":"Enhancing Lithological Mapping with Spatially Constrained Bayesian\n Network (SCB-Net): An Approach for Field Data-Constrained Predictions with\n Uncertainty Evaluation","summary":" Geological maps are an extremely valuable source of information for the Earth\nsciences. They provide insights into mineral exploration, vulnerability to\nnatural hazards, and many other applications. These maps are created using\nnumerical or conceptual models that use geological observations to extrapolate\ndata. Geostatistical techniques have traditionally been used to generate\nreliable predictions that take into account the spatial patterns inherent in\nthe data. However, as the number of auxiliary variables increases, these\nmethods become more labor-intensive. Additionally, traditional machine learning\nmethods often struggle with spatially correlated data and extracting valuable\nnon-linear information from geoscientific datasets. To address these\nlimitations, a new architecture called the Spatially Constrained Bayesian\nNetwork (SCB-Net) has been developed. The SCB-Net aims to effectively exploit\nthe information from auxiliary variables while producing spatially constrained\npredictions. It is made up of two parts, the first part focuses on learning\nunderlying patterns in the auxiliary variables while the second part integrates\nground-truth data and the learned embeddings from the first part. 
Moreover, to\nassess model uncertainty, a technique called Monte Carlo dropout is used as a\nBayesian approximation. The SCB-Net has been applied to two selected areas in\nnorthern Quebec, Canada, and has demonstrated its potential in generating\nfield-data-constrained lithological maps while allowing assessment of\nprediction uncertainty for decision-making. This study highlights the promising\nadvancements of deep neural networks in geostatistics, particularly in handling\ncomplex spatial feature learning tasks, leading to improved spatial information\ntechniques.\n","authors":["Victor Silva dos Santos","Erwan Gloaguen","Shiva Tirdad"],"pdf_url":"https://arxiv.org/pdf/2403.20195v1.pdf","comment":"17 pages, 3559 words, 14 figures"},{"id":"http://arxiv.org/abs/2403.20193v1","updated":"2024-03-29T14:14:22Z","published":"2024-03-29T14:14:22Z","title":"Motion Inversion for Video Customization","summary":" In this research, we present a novel approach to motion customization in\nvideo generation, addressing the widespread gap in the thorough exploration of\nmotion representation within video generative models. Recognizing the unique\nchallenges posed by video's spatiotemporal nature, our method introduces Motion\nEmbeddings, a set of explicit, temporally coherent one-dimensional embeddings\nderived from a given video. These embeddings are designed to integrate\nseamlessly with the temporal transformer modules of video diffusion models,\nmodulating self-attention computations across frames without compromising\nspatial integrity. Our approach offers a compact and efficient solution to\nmotion representation and enables complex manipulations of motion\ncharacteristics through vector arithmetic in the embedding space. Furthermore,\nwe identify the Temporal Discrepancy in video generative models, which refers\nto variations in how different motion modules process temporal relationships\nbetween frames. We leverage this understanding to optimize the integration of\nour motion embeddings. Our contributions include the introduction of a tailored\nmotion embedding for customization tasks, insights into the temporal processing\ndifferences in video models, and a demonstration of the practical advantages\nand effectiveness of our method through extensive experiments.\n","authors":["Luozhou Wang","Guibao Shen","Yixun Liang","Xin Tao","Pengfei Wan","Di Zhang","Yijun Li","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2403.20193v1.pdf","comment":"Project Page:\n \\href{https://wileewang.github.io/MotionInversion/}{https://wileewang.github.io/MotionInversion/}"},{"id":"http://arxiv.org/abs/2403.20186v1","updated":"2024-03-29T14:04:45Z","published":"2024-03-29T14:04:45Z","title":"Sketch-to-Architecture: Generative AI-aided Architectural Design","summary":" Recently, the development of large-scale models has paved the way for various\ninterdisciplinary research, including architecture. By using generative AI, we\npresent a novel workflow that utilizes AI models to generate conceptual\nfloorplans and 3D models from simple sketches, enabling rapid ideation and\ncontrolled generation of architectural renderings based on textual\ndescriptions. Our work demonstrates the potential of generative AI in the\narchitectural design process, pointing towards a new direction of\ncomputer-aided architectural design. 
Our project website is available at:\nhttps://zrealli.github.io/sketch2arc\n","authors":["Pengzhi Li","Baijuan Li","Zhiheng Li"],"pdf_url":"https://arxiv.org/pdf/2403.20186v1.pdf","comment":"Pacific Graphics 2023, accepted as Poster"},{"id":"http://arxiv.org/abs/2403.20183v1","updated":"2024-03-29T13:57:46Z","published":"2024-03-29T13:57:46Z","title":"HARMamba: Efficient Wearable Sensor Human Activity Recognition Based on\n Bidirectional Selective SSM","summary":" Wearable sensor human activity recognition (HAR) is a crucial area of\nresearch in activity sensing. While transformer-based temporal deep learning\nmodels have been extensively studied and implemented, their large number of\nparameters present significant challenges in terms of system computing load and\nmemory usage, rendering them unsuitable for real-time mobile activity\nrecognition applications. Recently, an efficient hardware-aware state space\nmodel (SSM) called Mamba has emerged as a promising alternative. Mamba\ndemonstrates strong potential in long sequence modeling, boasts a simpler\nnetwork architecture, and offers an efficient hardware-aware design. Leveraging\nSSM for activity recognition represents an appealing avenue for exploration. In\nthis study, we introduce HARMamba, which employs a more lightweight selective\nSSM as the foundational model architecture for activity recognition. The goal\nis to address the computational resource constraints encountered in real-time\nactivity recognition scenarios. Our approach involves processing sensor data\nflow by independently learning each channel and segmenting the data into\n\"patches\". The marked sensor sequence's position embedding serves as the input\ntoken for the bidirectional state space model, ultimately leading to activity\ncategorization through the classification head. Compared to established\nactivity recognition frameworks like Transformer-based models, HARMamba\nachieves superior performance while also reducing computational and memory\noverhead. Furthermore, our proposed method has been extensively tested on four\npublic activity datasets: PAMAP2, WISDM, UNIMIB, and UCI, demonstrating\nimpressive performance in activity recognition tasks.\n","authors":["Shuangjian Li","Tao Zhu","Furong Duan","Liming Chen","Huansheng Ning","Yaping Wan"],"pdf_url":"https://arxiv.org/pdf/2403.20183v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01705v2","updated":"2024-03-29T13:53:33Z","published":"2023-04-04T11:01:46Z","title":"Cross-modal tumor segmentation using generative blending augmentation\n and self training","summary":" \\textit{Objectives}: Data scarcity and domain shifts lead to biased training\nsets that do not accurately represent deployment conditions. A related\npractical problem is cross-modal image segmentation, where the objective is to\nsegment unlabelled images using previously labelled datasets from other imaging\nmodalities. \\textit{Methods}: We propose a cross-modal segmentation method\nbased on conventional image synthesis boosted by a new data augmentation\ntechnique called Generative Blending Augmentation (GBA). GBA leverages a SinGAN\nmodel to learn representative generative features from a single training image\nto diversify realistically tumor appearances. This way, we compensate for image\nsynthesis errors, subsequently improving the generalization power of a\ndownstream segmentation model. 
The proposed augmentation is further combined to\nan iterative self-training procedure leveraging pseudo labels at each pass.\n\\textit{Results}: The proposed solution ranked first for vestibular schwannoma\n(VS) segmentation during the validation and test phases of the MICCAI CrossMoDA\n2022 challenge, with best mean Dice similarity and average symmetric surface\ndistance measures. \\textit{Conclusion and significance}: Local contrast\nalteration of tumor appearances and iterative self-training with pseudo labels\nare likely to lead to performance improvements in a variety of segmentation\ncontexts.\n","authors":["Guillaume Sallé","Pierre-Henri Conze","Julien Bert","Nicolas Boussion","Dimitris Visvikis","Vincent Jaouen"],"pdf_url":"https://arxiv.org/pdf/2304.01705v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20173v1","updated":"2024-03-29T13:40:44Z","published":"2024-03-29T13:40:44Z","title":"MCNet: A crowd denstity estimation network based on integrating\n multiscale attention module","summary":" Aiming at the metro video surveillance system has not been able to\neffectively solve the metro crowd density estimation problem, a Metro Crowd\ndensity estimation Network (called MCNet) is proposed to automatically classify\ncrowd density level of passengers. Firstly, an Integrating Multi-scale\nAttention (IMA) module is proposed to enhance the ability of the plain\nclassifiers to extract semantic crowd texture features to accommodate to the\ncharacteristics of the crowd texture feature. The innovation of the IMA module\nis to fuse the dilation convolution, multiscale feature extraction and\nattention mechanism to obtain multi-scale crowd feature activation from a\nlarger receptive field with lower computational cost, and to strengthen the\ncrowds activation state of convolutional features in top layers. Secondly, a\nnovel lightweight crowd texture feature extraction network is proposed, which\ncan directly process video frames and automatically extract texture features\nfor crowd density estimation, while its faster image processing speed and fewer\nnetwork parameters make it flexible to be deployed on embedded platforms with\nlimited hardware resources. Finally, this paper integrates IMA module and the\nlightweight crowd texture feature extraction network to construct the MCNet,\nand validate the feasibility of this network on image classification dataset:\nCifar10 and four crowd density datasets: PETS2009, Mall, QUT and SH_METRO to\nvalidate the MCNet whether can be a suitable solution for crowd density\nestimation in metro video surveillance where there are image processing\nchallenges such as high density, high occlusion, perspective distortion and\nlimited hardware resources.\n","authors":["Qiang Guo","Rubo Zhang","Di Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.20173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20168v1","updated":"2024-03-29T13:35:37Z","published":"2024-03-29T13:35:37Z","title":"Unsupervised Tumor-Aware Distillation for Multi-Modal Brain Image\n Translation","summary":" Multi-modal brain images from MRI scans are widely used in clinical diagnosis\nto provide complementary information from different modalities. However,\nobtaining fully paired multi-modal images in practice is challenging due to\nvarious factors, such as time, cost, and artifacts, resulting in\nmodality-missing brain images. To address this problem, unsupervised\nmulti-modal brain image translation has been extensively studied. 
Existing\nmethods suffer from the problem of brain tumor deformation during translation,\nas they fail to focus on the tumor areas when translating the whole images. In\nthis paper, we propose an unsupervised tumor-aware distillation teacher-student\nnetwork called UTAD-Net, which is capable of perceiving and translating tumor\nareas precisely. Specifically, our model consists of two parts: a teacher\nnetwork and a student network. The teacher network learns an end-to-end mapping\nfrom source to target modality using unpaired images and corresponding tumor\nmasks first. Then, the translation knowledge is distilled into the student\nnetwork, enabling it to generate more realistic tumor areas and whole images\nwithout masks. Experiments show that our model achieves competitive performance\non both quantitative and qualitative evaluations of image quality compared with\nstate-of-the-art methods. Furthermore, we demonstrate the effectiveness of the\ngenerated images on downstream segmentation tasks. Our code is available at\nhttps://github.com/scut-HC/UTAD-Net.\n","authors":["Chuan Huang","Jia Wei","Rui Li"],"pdf_url":"https://arxiv.org/pdf/2403.20168v1.pdf","comment":"8 pages, 5 figures. It has been provisionally accepted for IJCNN 2024"},{"id":"http://arxiv.org/abs/2304.00746v4","updated":"2024-03-29T13:32:53Z","published":"2023-04-03T06:40:52Z","title":"VGTS: Visually Guided Text Spotting for Novel Categories in Historical\n Manuscripts","summary":" In the field of historical manuscript research, scholars frequently encounter\nnovel symbols in ancient texts, investing considerable effort in their\nidentification and documentation. Although existing object detection methods\nachieve impressive performance on known categories, they struggle to recognize\nnovel symbols without retraining. To address this limitation, we propose a\nVisually Guided Text Spotting (VGTS) approach that accurately spots novel\ncharacters using just one annotated support sample. The core of VGTS is a\nspatial alignment module consisting of a Dual Spatial Attention (DSA) block and\na Geometric Matching (GM) block. The DSA block aims to identify, focus on, and\nlearn discriminative spatial regions in the support and query images, mimicking\nthe human visual spotting process. It first refines the support image by\nanalyzing inter-channel relationships to identify critical areas, and then\nrefines the query image by focusing on informative key points. The GM block, on\nthe other hand, establishes the spatial correspondence between the two images,\nenabling accurate localization of the target character in the query image. To\ntackle the example imbalance problem in low-resource spotting tasks, we develop\na novel torus loss function that enhances the discriminative power of the\nembedding space for distance metric learning. To further validate our approach,\nwe introduce a new dataset featuring ancient Dongba hieroglyphics (DBH)\nassociated with the Naxi minority of China. Extensive experiments on the DBH\ndataset and other public datasets, including EGY, VML-HD, TKH, and NC, show\nthat VGTS consistently surpasses state-of-the-art methods. 
The proposed\nframework exhibits great potential for application in historical manuscript\ntext spotting, enabling scholars to efficiently identify and document novel\nsymbols with minimal annotation effort.\n","authors":["Wenbo Hu","Hongjian Zhan","Xinchen Ma","Cong Liu","Bing Yin","Yue Lu"],"pdf_url":"https://arxiv.org/pdf/2304.00746v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20159v1","updated":"2024-03-29T13:16:05Z","published":"2024-03-29T13:16:05Z","title":"HGS-Mapping: Online Dense Mapping Using Hybrid Gaussian Representation\n in Urban Scenes","summary":" Online dense mapping of urban scenes forms a fundamental cornerstone for\nscene understanding and navigation of autonomous vehicles. Recent advancements\nin mapping methods are mainly based on NeRF, whose rendering speed is too slow\nto meet online requirements. 3D Gaussian Splatting (3DGS), with its rendering\nspeed hundreds of times faster than NeRF, holds greater potential in online\ndense mapping. However, integrating 3DGS into a street-view dense mapping\nframework still faces two challenges, including incomplete reconstruction due\nto the absence of geometric information beyond the LiDAR coverage area and\nextensive computation for reconstruction in large urban scenes. To this end, we\npropose HGS-Mapping, an online dense mapping framework for unbounded large-scale\nscenes. To attain complete reconstruction, our framework introduces Hybrid\nGaussian Representation, which models different parts of the entire scene using\nGaussians with distinct properties. Furthermore, we employ a hybrid Gaussian\ninitialization mechanism and an adaptive update method to achieve high-fidelity\nand rapid reconstruction. To the best of our knowledge, we are the first to\nintegrate Gaussian representation into online dense mapping of urban scenes.\nOur approach achieves SOTA reconstruction accuracy while employing only 66% of\nthe number of Gaussians, leading to a 20% faster reconstruction speed.\n","authors":["Ke Wu","Kaizhao Zhang","Zhiwei Zhang","Shanshuai Yuan","Muer Tie","Julong Wei","Zijun Xu","Jieru Zhao","Zhongxue Gan","Wenchao Ding"],"pdf_url":"https://arxiv.org/pdf/2403.20159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00274v2","updated":"2024-03-29T13:14:59Z","published":"2024-03-01T04:31:56Z","title":"CustomListener: Text-guided Responsive Interaction for User-friendly\n Listening Head Generation","summary":" Listening head generation aims to synthesize a non-verbal responsive listener\nhead by modeling the correlation between the speaker and the listener in\ndynamic conversation. The applications of listener agent generation in virtual\ninteraction have promoted many works achieving diverse and fine-grained\nmotion generation. However, they can only manipulate motions through simple\nemotional labels, but cannot freely control the listener's motions. Since\nlistener agents should have human-like attributes (e.g. identity, personality)\nthat can be freely customized by users, this limitation restricts their realism.\nIn this paper, we propose a user-friendly framework called CustomListener to\nrealize free-form text-prior-guided listener generation. To achieve\nspeaker-listener coordination, we design a Static to Dynamic Portrait module\n(SDP), which interacts with speaker information to transform static text into\na dynamic portrait token with completion rhythm and amplitude information. 
To\nachieve coherence between segments, we design a Past Guided Generation Module\n(PGG) to maintain the consistency of customized listener attributes through the\nmotion prior, and utilize a diffusion-based structure conditioned on the\nportrait token and the motion prior to realize the controllable generation. To\ntrain and evaluate our model, we have constructed two text-annotated listening\nhead datasets based on ViCo and RealTalk, which provide text-video paired\nlabels. Extensive experiments have verified the effectiveness of our model.\n","authors":["Xi Liu","Ying Guo","Cheng Zhen","Tong Li","Yingying Ao","Pengfei Yan"],"pdf_url":"https://arxiv.org/pdf/2403.00274v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2401.06312v4","updated":"2024-03-29T13:10:56Z","published":"2024-01-12T00:49:49Z","title":"Video Super-Resolution Transformer with Masked Inter&Intra-Frame\n Attention","summary":" Recently, Vision Transformer has achieved great success in recovering missing\ndetails in low-resolution sequences, i.e., the video super-resolution (VSR)\ntask. Despite its superiority in VSR accuracy, the heavy computational burden\nas well as the large memory footprint hinder the deployment of\nTransformer-based VSR models on constrained devices. In this paper, we address\nthe above issue by proposing a novel feature-level masked processing framework:\nVSR with Masked Intra and inter frame Attention (MIA-VSR). The core of MIA-VSR\nis leveraging feature-level temporal continuity between adjacent frames to\nreduce redundant computations and make more rational use of previously enhanced\nSR features. Concretely, we propose an intra-frame and inter-frame attention\nblock which takes the respective roles of past features and input features into\nconsideration and only exploits previously enhanced features to provide\nsupplementary information. In addition, an adaptive block-wise mask prediction\nmodule is developed to skip unimportant computations according to feature\nsimilarity between adjacent frames. We conduct detailed ablation studies to\nvalidate our contributions and compare the proposed method with recent\nstate-of-the-art VSR approaches. The experimental results demonstrate that\nMIA-VSR improves the memory and computation efficiency over state-of-the-art\nmethods, without trading off PSNR accuracy. The code is available at\nhttps://github.com/LabShuHangGU/MIA-VSR.\n","authors":["Xingyu Zhou","Leheng Zhang","Xiaorui Zhao","Keze Wang","Leida Li","Shuhang Gu"],"pdf_url":"https://arxiv.org/pdf/2401.06312v4.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2307.09591v2","updated":"2024-03-29T13:04:03Z","published":"2023-07-18T19:56:20Z","title":"Gradient strikes back: How filtering out high frequencies improves\n explanations","summary":" Attribution methods correspond to a class of explainability methods (XAI)\nthat aim to assess how individual inputs contribute to a model's\ndecision-making process. We have identified a significant limitation in one\ntype of attribution methods, known as \"white-box\" methods. Although highly\nefficient, these methods rely on a gradient signal that is often contaminated\nby high-frequency noise. To overcome this limitation, we introduce a new\napproach called \"FORGrad\". This simple method effectively filters out noise\nartifacts by using optimal cut-off frequencies tailored to the unique\ncharacteristics of each model architecture. 
Our findings show that FORGrad\nconsistently enhances the performance of already existing white-box methods,\nenabling them to compete effectively with more accurate yet computationally\ndemanding \"black-box\" methods. We anticipate that our research will foster\nbroader adoption of simpler and more efficient white-box methods for\nexplainability, offering a better balance between faithfulness and\ncomputational efficiency.\n","authors":["Sabine Muzellec","Thomas Fel","Victor Boutin","Léo andéol","Rufin VanRullen","Thomas Serre"],"pdf_url":"https://arxiv.org/pdf/2307.09591v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08396v5","updated":"2024-03-29T12:50:38Z","published":"2023-05-15T07:23:54Z","title":"MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation","summary":" Since their emergence, Convolutional Neural Networks (CNNs) have made\nsignificant strides in medical image analysis. However, the local nature of the\nconvolution operator may pose a limitation for capturing global and long-range\ninteractions in CNNs. Recently, Transformers have gained popularity in the\ncomputer vision community and also in medical image segmentation due to their\nability to process global features effectively. The scalability issues of the\nself-attention mechanism and lack of the CNN-like inductive bias may have\nlimited their adoption. Therefore, hybrid Vision transformers\n(CNN-Transformer), exploiting the advantages of both Convolution and\nSelf-attention Mechanisms, have gained importance. In this work, we present\nMaxViT-UNet, a new Encoder-Decoder based UNet type hybrid vision transformer\n(CNN-Transformer) for medical image segmentation. The proposed Hybrid Decoder\nis designed to harness the power of both the convolution and self-attention\nmechanisms at each decoding stage with a nominal memory and computational\nburden. The inclusion of multi-axis self-attention, within each decoder stage,\nsignificantly enhances the discriminating capacity between the object and\nbackground regions, thereby helping in improving the segmentation efficiency.\nIn the Hybrid Decoder, a new block is also proposed. The fusion process\ncommences by integrating the upsampled lower-level decoder features, obtained\nthrough transpose convolution, with the skip-connection features derived from\nthe hybrid encoder. Subsequently, the fused features undergo refinement through\nthe utilization of a multi-axis attention mechanism. The proposed decoder block\nis repeated multiple times to segment the nuclei regions progressively.\nExperimental results on MoNuSeg18 and MoNuSAC20 datasets demonstrate the\neffectiveness of the proposed technique.\n","authors":["Abdul Rehman Khan","Asifullah Khan"],"pdf_url":"https://arxiv.org/pdf/2305.08396v5.pdf","comment":"19 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.20153v1","updated":"2024-03-29T12:49:40Z","published":"2024-03-29T12:49:40Z","title":"Talk3D: High-Fidelity Talking Portrait Synthesis via Personalized 3D\n Generative Prior","summary":" Recent methods for audio-driven talking head synthesis often optimize neural\nradiance fields (NeRF) on a monocular talking portrait video, leveraging its\ncapability to render high-fidelity and 3D-consistent novel-view frames.\nHowever, they often struggle to reconstruct complete face geometry due to the\nabsence of comprehensive 3D information in the input monocular videos. 
In this\npaper, we introduce a novel audio-driven talking head synthesis framework,\ncalled Talk3D, that can faithfully reconstruct plausible facial geometries\nby effectively adopting the pre-trained 3D-aware generative prior. Given the\npersonalized 3D generative model, we present a novel audio-guided attention\nU-Net architecture that predicts the dynamic face variations in the NeRF space\ndriven by audio. Furthermore, our model is modulated by audio-unrelated\nconditioning tokens which effectively disentangle variations unrelated to audio\nfeatures. Compared to existing methods, our method excels in generating\nrealistic facial geometries even under extreme head poses. We also conduct\nextensive experiments showing our approach surpasses state-of-the-art\nbenchmarks in terms of both quantitative and qualitative evaluations.\n","authors":["Jaehoon Ko","Kyusun Cho","Joungbin Lee","Heeji Yoon","Sangmin Lee","Sangjun Ahn","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2403.20153v1.pdf","comment":"Project page: https://ku-cvlab.github.io/Talk3D/"},{"id":"http://arxiv.org/abs/2403.12687v2","updated":"2024-03-29T12:45:27Z","published":"2024-03-19T12:45:52Z","title":"Audio-Visual Compound Expression Recognition Method based on Late\n Modality Fusion and Rule-based Decision","summary":" This paper presents the results of the SUN team for the Compound Expressions\nRecognition Challenge of the 6th ABAW Competition. We propose a novel\naudio-visual method for compound expression recognition. Our method relies on\nemotion recognition models that fuse modalities at the emotion probability\nlevel, while decisions regarding the prediction of compound expressions are\nbased on predefined rules. Notably, our method does not use any training data\nspecific to the target task. Thus, the problem is a zero-shot classification\ntask. The method is evaluated in multi-corpus training and cross-corpus\nvalidation setups. Our proposed method achieves an F1-score of 22.01% on the\nC-EXPR-DB test subset. Our findings from the challenge demonstrate that the\nproposed method can potentially form a basis for developing intelligent tools\nfor annotating audio-visual data in the context of humans' basic and compound\nemotions.\n","authors":["Elena Ryumina","Maxim Markitantov","Dmitry Ryumin","Heysem Kaya","Alexey Karpov"],"pdf_url":"https://arxiv.org/pdf/2403.12687v2.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2402.11677v2","updated":"2024-03-29T12:34:34Z","published":"2024-02-18T18:56:13Z","title":"MultiCorrupt: A Multi-Modal Robustness Dataset and Benchmark of\n LiDAR-Camera Fusion for 3D Object Detection","summary":" Multi-modal 3D object detection models for automated driving have\ndemonstrated exceptional performance on computer vision benchmarks like\nnuScenes. However, their reliance on densely sampled LiDAR point clouds and\nmeticulously calibrated sensor arrays poses challenges for real-world\napplications. Issues such as sensor misalignment, miscalibration, and disparate\nsampling frequencies lead to spatial and temporal misalignment in data from\nLiDAR and cameras. Additionally, the integrity of LiDAR and camera data is\noften compromised by adverse environmental conditions such as inclement\nweather, leading to occlusions and noise interference. To address this\nchallenge, we introduce MultiCorrupt, a comprehensive benchmark designed to\nevaluate the robustness of multi-modal 3D object detectors against ten distinct\ntypes of corruptions. 
We evaluate five state-of-the-art multi-modal detectors\non MultiCorrupt and analyze their performance in terms of their resistance\nability. Our results show that existing methods exhibit varying degrees of\nrobustness depending on the type of corruption and their fusion strategy. We\nprovide insights into which multi-modal design choices make such models robust\nagainst certain perturbations. The dataset generation code and benchmark are\nopen-sourced at https://github.com/ika-rwth-aachen/MultiCorrupt.\n","authors":["Till Beemelmanns","Quan Zhang","Lutz Eckstein"],"pdf_url":"https://arxiv.org/pdf/2402.11677v2.pdf","comment":"Code: https://github.com/ika-rwth-aachen/MultiCorrupt"},{"id":"http://arxiv.org/abs/2311.15851v3","updated":"2024-03-29T12:25:45Z","published":"2023-11-27T14:17:41Z","title":"Single-Model and Any-Modality for Video Object Tracking","summary":" In the realm of video object tracking, auxiliary modalities such as depth,\nthermal, or event data have emerged as valuable assets to complement the RGB\ntrackers. In practice, most existing RGB trackers learn a single set of\nparameters to use them across datasets and applications. However, a similar\nsingle-model unification for multi-modality tracking presents several\nchallenges. These challenges stem from the inherent heterogeneity of inputs --\neach with modality-specific representations, the scarcity of multi-modal\ndatasets, and the absence of all the modalities at all times. In this work, we\nintroduce Un-Track, a Unified Tracker of a single set of parameters for any\nmodality. To handle any modality, our method learns their common latent space\nthrough low-rank factorization and reconstruction techniques. More importantly,\nwe use only the RGB-X pairs to learn the common latent space. This unique\nshared representation seamlessly binds all modalities together, enabling\neffective unification and accommodating any missing modality, all within a\nsingle transformer-based architecture. Our Un-Track achieves +8.1 absolute\nF-score gain, on the DepthTrack dataset, by introducing only +2.14 (over 21.50)\nGFLOPs with +6.6M (over 93M) parameters, through a simple yet efficient\nprompting strategy. Extensive comparisons on five benchmark datasets with\ndifferent modalities show that Un-Track surpasses both SOTA unified trackers\nand modality-specific counterparts, validating our effectiveness and\npracticality. The source code is publicly available at\nhttps://github.com/Zongwei97/UnTrack.\n","authors":["Zongwei Wu","Jilai Zheng","Xiangxuan Ren","Florin-Alexandru Vasluianu","Chao Ma","Danda Pani Paudel","Luc Van Gool","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2311.15851v3.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.20142v1","updated":"2024-03-29T12:23:58Z","published":"2024-03-29T12:23:58Z","title":"StegoGAN: Leveraging Steganography for Non-Bijective Image-to-Image\n Translation","summary":" Most image-to-image translation models postulate that a unique correspondence\nexists between the semantic classes of the source and target domains. However,\nthis assumption does not always hold in real-world scenarios due to divergent\ndistributions, different class sets, and asymmetrical information\nrepresentation. As conventional GANs attempt to generate images that match the\ndistribution of the target domain, they may hallucinate spurious instances of\nclasses absent from the source domain, thereby diminishing the usefulness and\nreliability of translated images. 
CycleGAN-based methods are also known to hide\nthe mismatched information in the generated images to bypass cycle consistency\nobjectives, a process known as steganography. In response to the challenge of\nnon-bijective image translation, we introduce StegoGAN, a novel model that\nleverages steganography to prevent spurious features in generated images. Our\napproach enhances the semantic consistency of the translated images without\nrequiring additional postprocessing or supervision. Our experimental\nevaluations demonstrate that StegoGAN outperforms existing GAN-based models\nacross various non-bijective image-to-image translation tasks, both\nqualitatively and quantitatively. Our code and pretrained models are accessible\nat https://github.com/sian-wusidi/StegoGAN.\n","authors":["Sidi Wu","Yizi Chen","Samuel Mermet","Lorenz Hurni","Konrad Schindler","Nicolas Gonthier","Loic Landrieu"],"pdf_url":"https://arxiv.org/pdf/2403.20142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00320v2","updated":"2024-03-29T12:04:52Z","published":"2023-12-30T20:52:20Z","title":"DXAI: Explaining Classification by Image Decomposition","summary":" We propose a new way to explain and to visualize neural network\nclassification through a decomposition-based explainable AI (DXAI). Instead of\nproviding an explanation heatmap, our method yields a decomposition of the\nimage into class-agnostic and class-distinct parts, with respect to the data\nand chosen classifier. Following a fundamental signal processing paradigm of\nanalysis and synthesis, the original image is the sum of the decomposed parts.\nWe thus obtain a radically different way of explaining classification. The\nclass-agnostic part is ideally composed of all image features that do not\npossess class information, whereas the class-distinct part is its complement.\nThis new visualization can be more helpful and informative in certain\nscenarios, especially when the attributes are dense, global and additive in\nnature, for instance, when colors or textures are essential for class\ndistinction. Code is available at https://github.com/dxai2024/dxai.\n","authors":["Elnatan Kadar","Guy Gilboa"],"pdf_url":"https://arxiv.org/pdf/2401.00320v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20126v1","updated":"2024-03-29T11:31:12Z","published":"2024-03-29T11:31:12Z","title":"ECLIPSE: Efficient Continual Learning in Panoptic Segmentation with\n Visual Prompt Tuning","summary":" Panoptic segmentation, combining semantic and instance segmentation, stands\nas a cutting-edge computer vision task. Despite recent progress with deep\nlearning models, the dynamic nature of real-world applications necessitates\ncontinual learning, where models adapt to new classes (plasticity) over time\nwithout forgetting old ones (catastrophic forgetting). Current continual\nsegmentation methods often rely on distillation strategies like knowledge\ndistillation and pseudo-labeling, which are effective but result in increased\ntraining complexity and computational overhead. In this paper, we introduce a\nnovel and efficient method for continual panoptic segmentation based on Visual\nPrompt Tuning, dubbed ECLIPSE. Our approach involves freezing the base model\nparameters and fine-tuning only a small set of prompt embeddings, addressing\nboth catastrophic forgetting and plasticity and significantly reducing the\ntrainable parameters. 
To mitigate inherent challenges such as error propagation\nand semantic drift in continual segmentation, we propose logit manipulation to\neffectively leverage common knowledge across the classes. Experiments on ADE20K\ncontinual panoptic segmentation benchmark demonstrate the superiority of\nECLIPSE, notably its robustness against catastrophic forgetting and its\nreasonable plasticity, achieving a new state-of-the-art. The code is available\nat https://github.com/clovaai/ECLIPSE.\n","authors":["Beomyoung Kim","Joonsang Yu","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.20126v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.00648v2","updated":"2024-03-29T11:30:20Z","published":"2023-12-01T15:20:58Z","title":"SPOT: Self-Training with Patch-Order Permutation for Object-Centric\n Learning with Autoregressive Transformers","summary":" Unsupervised object-centric learning aims to decompose scenes into\ninterpretable object entities, termed slots. Slot-based auto-encoders stand out\nas a prominent method for this task. Within them, crucial aspects include\nguiding the encoder to generate object-specific slots and ensuring the decoder\nutilizes them during reconstruction. This work introduces two novel techniques,\n(i) an attention-based self-training approach, which distills superior\nslot-based attention masks from the decoder to the encoder, enhancing object\nsegmentation, and (ii) an innovative patch-order permutation strategy for\nautoregressive transformers that strengthens the role of slot vectors in\nreconstruction. The effectiveness of these strategies is showcased\nexperimentally. The combined approach significantly surpasses prior slot-based\nautoencoder methods in unsupervised object segmentation, especially with\ncomplex real-world images. We provide the implementation code at\nhttps://github.com/gkakogeorgiou/spot .\n","authors":["Ioannis Kakogeorgiou","Spyros Gidaris","Konstantinos Karantzalos","Nikos Komodakis"],"pdf_url":"https://arxiv.org/pdf/2312.00648v2.pdf","comment":"CVPR 2024. Code: https://github.com/gkakogeorgiou/spot"},{"id":"http://arxiv.org/abs/2401.00616v3","updated":"2024-03-29T11:27:32Z","published":"2024-01-01T00:08:39Z","title":"GD^2-NeRF: Generative Detail Compensation via GAN and Diffusion for\n One-shot Generalizable Neural Radiance Fields","summary":" In this paper, we focus on the One-shot Novel View Synthesis (O-NVS) task\nwhich targets synthesizing photo-realistic novel views given only one reference\nimage per scene. Previous One-shot Generalizable Neural Radiance Fields\n(OG-NeRF) methods solve this task in an inference-time finetuning-free manner,\nyet suffer the blurry issue due to the encoder-only architecture that highly\nrelies on the limited reference image. On the other hand, recent\ndiffusion-based image-to-3d methods show vivid plausible results via distilling\npre-trained 2D diffusion models into a 3D representation, yet require tedious\nper-scene optimization. Targeting these issues, we propose the GD$^2$-NeRF, a\nGenerative Detail compensation framework via GAN and Diffusion that is both\ninference-time finetuning-free and with vivid plausible details. In detail,\nfollowing a coarse-to-fine strategy, GD$^2$-NeRF is mainly composed of a\nOne-stage Parallel Pipeline (OPP) and a 3D-consistent Detail Enhancer\n(Diff3DE). 
At the coarse stage, OPP first efficiently inserts the GAN model\ninto the existing OG-NeRF pipeline for primarily relieving the blurry issue\nwith in-distribution priors captured from the training dataset, achieving a\ngood balance between sharpness (LPIPS, FID) and fidelity (PSNR, SSIM). Then, at\nthe fine stage, Diff3DE further leverages the pre-trained image diffusion\nmodels to complement rich out-distribution details while maintaining decent 3D\nconsistency. Extensive experiments on both the synthetic and real-world\ndatasets show that GD$^2$-NeRF noticeably improves the details without\nper-scene finetuning.\n","authors":["Xiao Pan","Zongxin Yang","Shuai Bai","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2401.00616v3.pdf","comment":"Submitted to Journal"},{"id":"http://arxiv.org/abs/2403.20112v1","updated":"2024-03-29T10:49:02Z","published":"2024-03-29T10:49:02Z","title":"Segmentation, Classification and Interpretation of Breast Cancer Medical\n Images using Human-in-the-Loop Machine Learning","summary":" This paper explores the application of Human-in-the-Loop (HITL) strategies in\ntraining machine learning models in the medical domain. In this case, a\ndoctor-in-the-loop approach is proposed to leverage human expertise in dealing\nwith large and complex data. Specifically, the paper deals with the integration\nof genomic data and Whole Slide Imaging (WSI) analysis of breast cancer. Three\ndifferent tasks were developed: segmentation of histopathological images,\nclassification of these images regarding the genomic subtype of the cancer and,\nfinally, interpretation of the machine learning results. The involvement of a\npathologist helped us to develop a better segmentation model and to enhance the\nexplanatory capabilities of the models, but the classification results were\nsuboptimal, highlighting the limitations of this approach: despite involving\nhuman experts, complex domains can still pose challenges, and a HITL approach\nmay not always be effective.\n","authors":["David Vázquez-Lema","Eduardo Mosqueira-Rey","Elena Hernández-Pereira","Carlos Fernández-Lozano","Fernando Seara-Romera","Jorge Pombo-Otero"],"pdf_url":"https://arxiv.org/pdf/2403.20112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20106v1","updated":"2024-03-29T10:40:41Z","published":"2024-03-29T10:40:41Z","title":"Aggregating Local and Global Features via Selective State Spaces Model\n for Efficient Image Deblurring","summary":" Image deblurring is a process of restoring a high-quality image from the\ncorresponding blurred image. Significant progress in this field has been made\npossible by the emergence of various effective deep learning models, including\nCNNs and Transformers. However, these methods often face the dilemma between\neliminating long-range blur degradation perturbations and maintaining\ncomputational efficiency, which hinders their practical application. To address\nthis issue, we propose an efficient image deblurring network that leverages a\nselective structured state spaces model to aggregate enriched and accurate\nfeatures. Specifically, we design an aggregate local and global block\n(ALGBlock) to capture and fuse both local invariant properties and non-local\ninformation. The ALGBlock consists of two blocks: (1) The local block models\nlocal connectivity using simplified channel attention. (2) The global block\ncaptures long-range dependency features with linear complexity through\nselective structured state spaces. 
Nevertheless, noting that image details\nare local features of images, we accentuate the local part for restoration by\nrecalibrating the weight when aggregating the two branches for recovery.\nExperimental results demonstrate that the proposed method outperforms\nstate-of-the-art approaches on widely used benchmarks, highlighting its\nsuperior performance.\n","authors":["Hu Gao","Depeng Dang"],"pdf_url":"https://arxiv.org/pdf/2403.20106v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20105v1","updated":"2024-03-29T10:38:25Z","published":"2024-03-29T10:38:25Z","title":"FreeSeg-Diff: Training-Free Open-Vocabulary Segmentation with Diffusion\n Models","summary":" Foundation models have exhibited unprecedented capabilities in tackling many\ndomains and tasks. Models such as CLIP are currently widely used to bridge\ncross-modal representations, and text-to-image diffusion models are arguably\nthe leading models in terms of realistic image generation. Image generative\nmodels are trained on massive datasets that provide them with powerful internal\nspatial representations. In this work, we explore the potential benefits of\nsuch representations, beyond image generation, in particular, for dense visual\nprediction tasks. We focus on the task of image segmentation, which is\ntraditionally solved by training models on closed-vocabulary datasets, with\npixel-level annotations. To avoid the annotation cost or training large\ndiffusion models, we constrain our setup to be zero-shot and training-free. In\na nutshell, our pipeline leverages different and relatively small-sized,\nopen-source foundation models for zero-shot open-vocabulary segmentation. The\npipeline is as follows: the image is passed to both a captioner model (i.e.,\nBLIP) and a diffusion model (i.e., Stable Diffusion Model) to generate a text\ndescription and visual representation, respectively. The features are clustered\nand binarized to obtain class-agnostic masks for each object. These masks are\nthen mapped to a textual class, using the CLIP model to support an open\nvocabulary. Finally, we add a refinement step that allows us to obtain a more\nprecise segmentation mask. Our approach (dubbed FreeSeg-Diff), which does not\nrely on any training, outperforms many training-based approaches on both Pascal\nVOC and COCO datasets. In addition, we show very competitive results compared\nto recent weakly-supervised segmentation approaches. We provide\ncomprehensive experiments showing the superiority of diffusion model features\ncompared to other pretrained models. Project page:\nhttps://bcorrad.github.io/freesegdiff/\n","authors":["Barbara Toniella Corradini","Mustafa Shukor","Paul Couairon","Guillaume Couairon","Franco Scarselli","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2403.20105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20101v1","updated":"2024-03-29T10:31:32Z","published":"2024-03-29T10:31:32Z","title":"RealKIE: Five Novel Datasets for Enterprise Key Information Extraction","summary":" We introduce RealKIE, a benchmark of five challenging datasets aimed at\nadvancing key information extraction methods, with an emphasis on enterprise\napplications. The datasets cover a diverse range of documents, including SEC\nS1 Filings, US Non-disclosure Agreements, UK Charity Reports, FCC Invoices, and\nResource Contracts. Each presents unique challenges: poor text serialization,\nsparse annotations in long documents, and complex tabular layouts. 
These\ndatasets provide a realistic testing ground for key information extraction\ntasks like investment analysis and legal data processing.\n In addition to presenting these datasets, we offer an in-depth description of\nthe annotation process, document processing techniques, and baseline modeling\napproaches. This contribution facilitates the development of NLP models capable\nof handling practical challenges and supports further research into information\nextraction technologies applicable to industry-specific problems.\n The annotated data and OCR outputs are available to download at\nhttps://indicodatasolutions.github.io/RealKIE/ code to reproduce the baselines\nwill be available shortly.\n","authors":["Benjamin Townsend","Madison May","Christopher Wells"],"pdf_url":"https://arxiv.org/pdf/2403.20101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11600v2","updated":"2024-03-29T10:18:02Z","published":"2023-11-20T08:27:56Z","title":"Deep Equilibrium Diffusion Restoration with Parallel Sampling","summary":" Diffusion model-based image restoration (IR) aims to use diffusion models to\nrecover high-quality (HQ) images from degraded images, achieving promising\nperformance. Due to the inherent property of diffusion models, most existing\nmethods need long serial sampling chains to restore HQ images step-by-step,\nresulting in expensive sampling time and high computation costs. Moreover, such\nlong sampling chains hinder understanding the relationship between inputs and\nrestoration results since it is hard to compute the gradients in the whole\nchains. In this work, we aim to rethink the diffusion model-based IR models\nthrough a different perspective, i.e., a deep equilibrium (DEQ) fixed point\nsystem, called DeqIR. Specifically, we derive an analytical solution by\nmodeling the entire sampling chain in these IR models as a joint multivariate\nfixed point system. Based on the analytical solution, we can conduct parallel\nsampling and restore HQ images without training. Furthermore, we compute fast\ngradients via DEQ inversion and found that initialization optimization can\nboost image quality and control the generation direction. Extensive experiments\non benchmarks demonstrate the effectiveness of our method on typical IR tasks\nand real-world settings.\n","authors":["Jiezhang Cao","Yue Shi","Kai Zhang","Yulun Zhang","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2311.11600v2.pdf","comment":"CVPR'2024"},{"id":"http://arxiv.org/abs/2311.01025v2","updated":"2024-03-29T10:09:33Z","published":"2023-11-02T06:38:19Z","title":"Integrating Language-Derived Appearance Elements with Visual Cues in\n Pedestrian Detection","summary":" Large language models (LLMs) have shown their capabilities in understanding\ncontextual and semantic information regarding knowledge of instance\nappearances. In this paper, we introduce a novel approach to utilize the\nstrengths of LLMs in understanding contextual appearance variations and to\nleverage this knowledge into a vision model (here, pedestrian detection). While\npedestrian detection is considered one of the crucial tasks directly related to\nour safety (e.g., intelligent driving systems), it is challenging because of\nvarying appearances and poses in diverse scenes. Therefore, we propose to\nformulate language-derived appearance elements and incorporate them with visual\ncues in pedestrian detection. 
To this end, we establish a description corpus\nthat includes numerous narratives describing various appearances of pedestrians\nand other instances. By feeding them through an LLM, we extract appearance\nknowledge sets that contain the representations of appearance variations.\nSubsequently, we perform a task-prompting process to obtain appearance elements\nwhich are guided representative appearance knowledge relevant to a downstream\npedestrian detection task. The obtained knowledge elements are adaptable to\nvarious detection frameworks, so that we can provide plentiful appearance\ninformation by integrating the language-derived appearance elements with visual\ncues within a detector. Through comprehensive experiments with various\npedestrian detectors, we verify the adaptability and effectiveness of our\nmethod showing noticeable performance gains and achieving state-of-the-art\ndetection performance on two public pedestrian detection benchmarks (i.e.,\nCrowdHuman and WiderPedestrian).\n","authors":["Sungjune Park","Hyunjun Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2311.01025v2.pdf","comment":"11 pages, 5 figures, 5 tables"},{"id":"http://arxiv.org/abs/2312.03203v2","updated":"2024-03-29T10:09:30Z","published":"2023-12-06T00:46:30Z","title":"Feature 3DGS: Supercharging 3D Gaussian Splatting to Enable Distilled\n Feature Fields","summary":" 3D scene representations have gained immense popularity in recent years.\nMethods that use Neural Radiance fields are versatile for traditional tasks\nsuch as novel view synthesis. In recent times, some work has emerged that aims\nto extend the functionality of NeRF beyond view synthesis, for semantically\naware tasks such as editing and segmentation using 3D feature field\ndistillation from 2D foundation models. However, these methods have two major\nlimitations: (a) they are limited by the rendering speed of NeRF pipelines, and\n(b) implicitly represented feature fields suffer from continuity artifacts\nreducing feature quality. Recently, 3D Gaussian Splatting has shown\nstate-of-the-art performance on real-time radiance field rendering. In this\nwork, we go one step further: in addition to radiance field rendering, we\nenable 3D Gaussian splatting on arbitrary-dimension semantic features via 2D\nfoundation model distillation. This translation is not straightforward: naively\nincorporating feature fields in the 3DGS framework encounters significant\nchallenges, notably the disparities in spatial resolution and channel\nconsistency between RGB images and feature maps. We propose architectural and\ntraining changes to efficiently avert this problem. Our proposed method is\ngeneral, and our experiments showcase novel view semantic segmentation,\nlanguage-guided editing and segment anything through learning feature fields\nfrom state-of-the-art 2D foundation models such as SAM and CLIP-LSeg. Across\nexperiments, our distillation method is able to provide comparable or better\nresults, while being significantly faster to both train and render.\nAdditionally, to the best of our knowledge, we are the first method to enable\npoint and bounding-box prompting for radiance field manipulation, by leveraging\nthe SAM model. 
Project website at: https://feature-3dgs.github.io/\n","authors":["Shijie Zhou","Haoran Chang","Sicheng Jiang","Zhiwen Fan","Zehao Zhu","Dejia Xu","Pradyumna Chari","Suya You","Zhangyang Wang","Achuta Kadambi"],"pdf_url":"https://arxiv.org/pdf/2312.03203v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20092v1","updated":"2024-03-29T10:05:29Z","published":"2024-03-29T10:05:29Z","title":"Modeling Weather Uncertainty for Multi-weather Co-Presence Estimation","summary":" Images from outdoor scenes may be taken under various weather conditions. It\nis well studied that weather impacts the performance of computer vision\nalgorithms and needs to be handled properly. However, existing algorithms model\nweather condition as a discrete status and estimate it using multi-label\nclassification. The fact is that, physically, specifically in meteorology,\nweather are modeled as a continuous and transitional status. Instead of\ndirectly implementing hard classification as existing multi-weather\nclassification methods do, we consider the physical formulation of\nmulti-weather conditions and model the impact of physical-related parameter on\nlearning from the image appearance. In this paper, we start with solid revisit\nof the physics definition of weather and how it can be described as a\ncontinuous machine learning and computer vision task. Namely, we propose to\nmodel the weather uncertainty, where the level of probability and co-existence\nof multiple weather conditions are both considered. A Gaussian mixture model is\nused to encapsulate the weather uncertainty and a uncertainty-aware\nmulti-weather learning scheme is proposed based on prior-posterior learning. A\nnovel multi-weather co-presence estimation transformer (MeFormer) is proposed.\nIn addition, a new multi-weather co-presence estimation (MePe) dataset, along\nwith 14 fine-grained weather categories and 16,078 samples, is proposed to\nbenchmark both conventional multi-label weather classification task and\nmulti-weather co-presence estimation task. Large scale experiments show that\nthe proposed method achieves state-of-the-art performance and substantial\ngeneralization capabilities on both the conventional multi-label weather\nclassification task and the proposed multi-weather co-presence estimation task.\nBesides, modeling weather uncertainty also benefits adverse-weather semantic\nsegmentation.\n","authors":["Qi Bi","Shaodi You","Theo Gevers"],"pdf_url":"https://arxiv.org/pdf/2403.20092v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2309.13604v2","updated":"2024-03-29T09:59:34Z","published":"2023-09-24T10:48:20Z","title":"Distribution-Aware Continual Test-Time Adaptation for Semantic\n Segmentation","summary":" Since autonomous driving systems usually face dynamic and ever-changing\nenvironments, continual test-time adaptation (CTTA) has been proposed as a\nstrategy for transferring deployed models to continually changing target\ndomains. However, the pursuit of long-term adaptation often introduces\ncatastrophic forgetting and error accumulation problems, which impede the\npractical implementation of CTTA in the real world. Recently, existing CTTA\nmethods mainly focus on utilizing a majority of parameters to fit target domain\nknowledge through self-training. Unfortunately, these approaches often amplify\nthe challenge of error accumulation due to noisy pseudo-labels, and pose\npractical limitations stemming from the heavy computational costs associated\nwith entire model updates. 
In this paper, we propose a distribution-aware\ntuning (DAT) method to make the semantic segmentation CTTA efficient and\npractical in real-world applications. DAT adaptively selects and updates two\nsmall groups of trainable parameters based on data distribution during the\ncontinual adaptation process, including domain-specific parameters (DSP) and\ntask-relevant parameters (TRP). Specifically, DSP exhibits sensitivity to\noutputs with substantial distribution shifts, effectively mitigating the\nproblem of error accumulation. In contrast, TRP are allocated to positions that\nare responsive to outputs with minor distribution shifts, which are fine-tuned\nto avoid the catastrophic forgetting problem. In addition, since CTTA is a\ntemporal task, we introduce the Parameter Accumulation Update (PAU) strategy to\ncollect the updated DSP and TRP in target domain sequences. We conduct\nextensive experiments on two widely-used semantic segmentation CTTA benchmarks,\nachieving promising performance compared to previous state-of-the-art methods.\n","authors":["Jiayi Ni","Senqiao Yang","Ran Xu","Jiaming Liu","Xiaoqi Li","Wenyu Jiao","Zehui Chen","Yi Liu","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.13604v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20086v1","updated":"2024-03-29T09:46:14Z","published":"2024-03-29T09:46:14Z","title":"Selective Attention-based Modulation for Continual Learning","summary":" We present SAM, a biologically-plausible selective attention-driven\nmodulation approach to enhance classification models in a continual learning\nsetting. Inspired by neurophysiological evidence that the primary visual cortex\ndoes not contribute to object manifold untangling for categorization and that\nprimordial attention biases are still embedded in the modern brain, we propose\nto employ auxiliary saliency prediction features as a modulation signal to\ndrive and stabilize the learning of a sequence of non-i.i.d. classification\ntasks. Experimental results confirm that SAM effectively enhances the\nperformance (in some cases up to about twenty percent points) of\nstate-of-the-art continual learning methods, both in class-incremental and\ntask-incremental settings. Moreover, we show that attention-based modulation\nsuccessfully encourages the learning of features that are more robust to the\npresence of spurious features and to adversarial attacks than baseline methods.\nCode is available at: https://github.com/perceivelab/SAM.\n","authors":["Giovanni Bellitto","Federica Proietto Salanitri","Matteo Pennisi","Matteo Boschini","Angelo Porrello","Simone Calderara","Simone Palazzo","Concetto Spampinato"],"pdf_url":"https://arxiv.org/pdf/2403.20086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20080v1","updated":"2024-03-29T09:22:44Z","published":"2024-03-29T09:22:44Z","title":"Mixed-precision Supernet Training from Vision Foundation Models using\n Low Rank Adapter","summary":" Compression of large and performant vision foundation models (VFMs) into\narbitrary bit-wise operations (BitOPs) allows their deployment on various\nhardware. We propose to fine-tune a VFM to a mixed-precision quantized\nsupernet. The supernet-based neural architecture search (NAS) can be adopted\nfor this purpose, which trains a supernet, and then subnets within arbitrary\nhardware budgets can be extracted. However, existing methods face difficulties\nin optimizing the mixed-precision search space and incurring large memory costs\nduring training. 
To tackle these challenges, first, we study the effective\nsearch space design for fine-tuning a VFM by comparing different operators\n(such as resolution, feature size, width, depth, and bit-widths) in terms of\nperformance and BitOPs reduction. Second, we propose memory-efficient supernet\ntraining using a low-rank adapter (LoRA) and a progressive training strategy.\nThe proposed method is evaluated for the recently proposed VFM, Segment\nAnything Model, fine-tuned on segmentation tasks. The searched model yields\nabout a 95% reduction in BitOPs without incurring performance degradation.\n","authors":["Yuiko Sakuma","Masakazu Yoshimura","Junji Otsuka","Atsushi Irie","Takeshi Ohashi"],"pdf_url":"https://arxiv.org/pdf/2403.20080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20079v1","updated":"2024-03-29T09:20:29Z","published":"2024-03-29T09:20:29Z","title":"SGD: Street View Synthesis with Gaussian Splatting and Diffusion Prior","summary":" Novel View Synthesis (NVS) for street scenes play a critical role in the\nautonomous driving simulation. The current mainstream technique to achieve it\nis neural rendering, such as Neural Radiance Fields (NeRF) and 3D Gaussian\nSplatting (3DGS). Although thrilling progress has been made, when handling\nstreet scenes, current methods struggle to maintain rendering quality at the\nviewpoint that deviates significantly from the training viewpoints. This issue\nstems from the sparse training views captured by a fixed camera on a moving\nvehicle. To tackle this problem, we propose a novel approach that enhances the\ncapacity of 3DGS by leveraging prior from a Diffusion Model along with\ncomplementary multi-modal data. Specifically, we first fine-tune a Diffusion\nModel by adding images from adjacent frames as condition, meanwhile exploiting\ndepth data from LiDAR point clouds to supply additional spatial information.\nThen we apply the Diffusion Model to regularize the 3DGS at unseen views during\ntraining. Experimental results validate the effectiveness of our method\ncompared with current state-of-the-art models, and demonstrate its advance in\nrendering images from broader views.\n","authors":["Zhongrui Yu","Haoran Wang","Jinze Yang","Hanzhang Wang","Zeke Xie","Yunfeng Cai","Jiale Cao","Zhong Ji","Mingming Sun"],"pdf_url":"https://arxiv.org/pdf/2403.20079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20078v1","updated":"2024-03-29T09:19:52Z","published":"2024-03-29T09:19:52Z","title":"Negative Label Guided OOD Detection with Pretrained Vision-Language\n Models","summary":" Out-of-distribution (OOD) detection aims at identifying samples from unknown\nclasses, playing a crucial role in trustworthy models against errors on\nunexpected inputs. Extensive research has been dedicated to exploring OOD\ndetection in the vision modality. Vision-language models (VLMs) can leverage\nboth textual and visual information for various multi-modal applications,\nwhereas few OOD detection methods take into account information from the text\nmodality. In this paper, we propose a novel post hoc OOD detection method,\ncalled NegLabel, which takes a vast number of negative labels from extensive\ncorpus databases. We design a novel scheme for the OOD score collaborated with\nnegative labels. Theoretical analysis helps to understand the mechanism of\nnegative labels. Extensive experiments demonstrate that our method NegLabel\nachieves state-of-the-art performance on various OOD detection benchmarks and\ngeneralizes well on multiple VLM architectures. 
Furthermore, our method\nNegLabel exhibits remarkable robustness against diverse domain shifts. The\ncode is available at https://github.com/tmlr-group/NegLabel.\n","authors":["Xue Jiang","Feng Liu","Zhen Fang","Hong Chen","Tongliang Liu","Feng Zheng","Bo Han"],"pdf_url":"https://arxiv.org/pdf/2403.20078v1.pdf","comment":"ICLR 2024 Spotlight"},{"id":"http://arxiv.org/abs/2403.20058v1","updated":"2024-03-29T08:47:49Z","published":"2024-03-29T08:47:49Z","title":"Revolutionizing Disease Diagnosis with simultaneous functional PET/MR\n and Deeply Integrated Brain Metabolic, Hemodynamic, and Perfusion Networks","summary":" Simultaneous functional PET/MR (sf-PET/MR) is a cutting-edge multimodal\nneuroimaging technique. It provides an unprecedented opportunity for\nconcurrently monitoring and integrating multifaceted brain networks built by\nspatiotemporally covaried metabolic activity, neural activity, and cerebral\nblood flow (perfusion). Despite its high scientific and clinical value, the\nlimited hardware accessibility of PET/MR hinders its applications, let alone\nmodern AI-based PET/MR fusion models. Our objective is to develop a clinically\nfeasible AI-based disease diagnosis model trained on comprehensive sf-PET/MR\ndata that, during inference, allows single-modality input (e.g., PET only)\nwhile enforcing multimodal-level accuracy. To this end, we propose MX-ARM, a\nmultimodal MiXture-of-experts Alignment and Reconstruction Model. It is\nmodality detachable and exchangeable, allocating different multi-layer\nperceptrons dynamically (\"mixture of experts\") through learnable weights to\nlearn respective representations from different modalities. Such a design does\nnot sacrifice model performance in uni-modal situations. To fully exploit the\ninherent complex and nonlinear relation among modalities while producing\nfine-grained representations for uni-modal inference, we subsequently add a\nmodal alignment module to line up a dominant modality (e.g., PET) with\nrepresentations of auxiliary modalities (MR). We further adopt multimodal\nreconstruction to promote the quality of learned features. Experiments on\nprecious multimodal sf-PET/MR data for Mild Cognitive Impairment diagnosis\nshowcase the efficacy of our model toward clinically feasible precision\nmedicine.\n","authors":["Luoyu Wang","Yitian Tao","Qing Yang","Yan Liang","Siwei Liu","Hongcheng Shi","Dinggang Shen","Han Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.20058v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2209.11964v2","updated":"2024-03-29T08:46:46Z","published":"2022-09-24T08:57:10Z","title":"Strong Transferable Adversarial Attacks via Ensembled Asymptotically\n Normal Distribution Learning","summary":" Strong adversarial examples are crucial for evaluating and enhancing the\nrobustness of deep neural networks. However, the performance of popular attacks\nis usually sensitive, for instance, to minor image transformations, stemming\nfrom limited information -- typically only one input example, a handful of\nwhite-box source models, and undefined defense strategies. Hence, the crafted\nadversarial examples are prone to overfit the source model, which hampers their\ntransferability to unknown architectures. 
In this paper, we propose an approach\nnamed Multiple Asymptotically Normal Distribution Attacks (MultiANDA) which\nexplicitly characterize adversarial perturbations from a learned distribution.\nSpecifically, we approximate the posterior distribution over the perturbations\nby taking advantage of the asymptotic normality property of stochastic gradient\nascent (SGA), then employ the deep ensemble strategy as an effective proxy for\nBayesian marginalization in this process, aiming to estimate a mixture of\nGaussians that facilitates a more thorough exploration of the potential\noptimization space. The approximated posterior essentially describes the\nstationary distribution of SGA iterations, which captures the geometric\ninformation around the local optimum. Thus, MultiANDA allows drawing an\nunlimited number of adversarial perturbations for each input and reliably\nmaintains the transferability. Our proposed method outperforms ten\nstate-of-the-art black-box attacks on deep learning models with or without\ndefenses through extensive experiments on seven normally trained and seven\ndefense models.\n","authors":["Zhengwei Fang","Rui Wang","Tao Huang","Liping Jing"],"pdf_url":"https://arxiv.org/pdf/2209.11964v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16653v2","updated":"2024-03-29T08:39:23Z","published":"2023-09-28T17:55:05Z","title":"DreamGaussian: Generative Gaussian Splatting for Efficient 3D Content\n Creation","summary":" Recent advances in 3D content creation mostly leverage optimization-based 3D\ngeneration via score distillation sampling (SDS). Though promising results have\nbeen exhibited, these methods often suffer from slow per-sample optimization,\nlimiting their practical usage. In this paper, we propose DreamGaussian, a\nnovel 3D content generation framework that achieves both efficiency and quality\nsimultaneously. Our key insight is to design a generative 3D Gaussian Splatting\nmodel with companioned mesh extraction and texture refinement in UV space. In\ncontrast to the occupancy pruning used in Neural Radiance Fields, we\ndemonstrate that the progressive densification of 3D Gaussians converges\nsignificantly faster for 3D generative tasks. To further enhance the texture\nquality and facilitate downstream applications, we introduce an efficient\nalgorithm to convert 3D Gaussians into textured meshes and apply a fine-tuning\nstage to refine the details. Extensive experiments demonstrate the superior\nefficiency and competitive generation quality of our proposed approach.\nNotably, DreamGaussian produces high-quality textured meshes in just 2 minutes\nfrom a single-view image, achieving approximately 10 times acceleration\ncompared to existing methods.\n","authors":["Jiaxiang Tang","Jiawei Ren","Hang Zhou","Ziwei Liu","Gang Zeng"],"pdf_url":"https://arxiv.org/pdf/2309.16653v2.pdf","comment":"Camera-ready version. Project page: https://dreamgaussian.github.io/"},{"id":"http://arxiv.org/abs/2311.14671v2","updated":"2024-03-29T08:36:41Z","published":"2023-11-24T18:59:42Z","title":"SEGIC: Unleashing the Emergent Correspondence for In-Context\n Segmentation","summary":" In-context segmentation aims at segmenting novel images using a few labeled\nexample images, termed as \"in-context examples\", exploring content similarities\nbetween examples and the target. The resulting models can be generalized\nseamlessly to novel segmentation tasks, significantly reducing the labeling and\ntraining costs compared with conventional pipelines. 
However, in-context\nsegmentation is more challenging than classic ones requiring the model to learn\nsegmentation rules conditioned on a few samples. Unlike previous work with\nad-hoc or non-end-to-end designs, we propose SEGIC, an end-to-end\nsegment-in-context framework built upon a single vision foundation model (VFM).\nIn particular, SEGIC leverages the emergent correspondence within VFM to\ncapture dense relationships between target images and in-context samples. As\nsuch, information from in-context samples is then extracted into three types of\ninstructions, i.e. geometric, visual, and meta instructions, serving as\nexplicit conditions for the final mask prediction. SEGIC is a straightforward\nyet effective approach that yields state-of-the-art performance on one-shot\nsegmentation benchmarks. Notably, SEGIC can be easily generalized to diverse\ntasks, including video object segmentation and open-vocabulary segmentation.\nCode will be available at https://github.com/MengLcool/SEGIC.\n","authors":["Lingchen Meng","Shiyi Lan","Hengduo Li","Jose M. Alvarez","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.14671v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20047v1","updated":"2024-03-29T08:33:05Z","published":"2024-03-29T08:33:05Z","title":"Embracing Unknown Step by Step: Towards Reliable Sparse Training in Real\n World","summary":" Sparse training has emerged as a promising method for resource-efficient deep\nneural networks (DNNs) in real-world applications. However, the reliability of\nsparse models remains a crucial concern, particularly in detecting unknown\nout-of-distribution (OOD) data. This study addresses the knowledge gap by\ninvestigating the reliability of sparse training from an OOD perspective and\nreveals that sparse training exacerbates OOD unreliability. The lack of unknown\ninformation and the sparse constraints hinder the effective exploration of\nweight space and accurate differentiation between known and unknown knowledge.\nTo tackle these challenges, we propose a new unknown-aware sparse training\nmethod, which incorporates a loss modification, auto-tuning strategy, and a\nvoting scheme to guide weight space exploration and mitigate confusion between\nknown and unknown information without incurring significant additional costs or\nrequiring access to additional OOD data. Theoretical insights demonstrate how\nour method reduces model confidence when faced with OOD samples. Empirical\nexperiments across multiple datasets, model architectures, and sparsity levels\nvalidate the effectiveness of our method, with improvements of up to\n\\textbf{8.4\\%} in AUROC while maintaining comparable or higher accuracy and\ncalibration. This research enhances the understanding and readiness of sparse\nDNNs for deployment in resource-limited applications. Our code is available on:\n\\url{https://github.com/StevenBoys/MOON}.\n","authors":["Bowen Lei","Dongkuan Xu","Ruqi Zhang","Bani Mallick"],"pdf_url":"https://arxiv.org/pdf/2403.20047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09866v2","updated":"2024-03-29T08:25:36Z","published":"2023-12-15T15:09:30Z","title":"PLGSLAM: Progressive Neural Scene Represenation with Local to Global\n Bundle Adjustment","summary":" Neural implicit scene representations have recently shown encouraging results\nin dense visual SLAM. However, existing methods produce low-quality scene\nreconstruction and low-accuracy localization performance when scaling up to\nlarge indoor scenes and long sequences. 
These limitations are mainly due to\ntheir single, global radiance field with finite capacity, which does not adapt\nto large scenarios. Their end-to-end pose networks are also not robust enough\nwith the growth of cumulative errors in large scenes. To this end, we introduce\nPLGSLAM, a neural visual SLAM system capable of high-fidelity surface\nreconstruction and robust camera tracking in real-time. To handle large-scale\nindoor scenes, PLGSLAM proposes a progressive scene representation method which\ndynamically allocates new local scene representation trained with frames within\na local sliding window. This allows us to scale up to larger indoor scenes and\nimproves robustness (even under pose drifts). In local scene representation,\nPLGSLAM utilizes tri-planes for local high-frequency features with multi-layer\nperceptron (MLP) networks for the low-frequency feature, achieving smoothness\nand scene completion in unobserved areas. Moreover, we propose local-to-global\nbundle adjustment method with a global keyframe database to address the\nincreased pose drifts on long sequences. Experimental results demonstrate that\nPLGSLAM achieves state-of-the-art scene reconstruction results and tracking\nperformance across various datasets and scenarios (both in small and\nlarge-scale indoor environments).\n","authors":["Tianchen Deng","Guole Shen","Tong Qin","Jianyu Wang","Wentao Zhao","Jingchuan Wang","Danwei Wang","Weidong Chen"],"pdf_url":"https://arxiv.org/pdf/2312.09866v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2304.14178v3","updated":"2024-03-29T08:13:38Z","published":"2023-04-27T13:27:01Z","title":"mPLUG-Owl: Modularization Empowers Large Language Models with\n Multimodality","summary":" Large language models (LLMs) have demonstrated impressive zero-shot abilities\non a variety of open-ended tasks, while recent research has also explored the\nuse of LLMs for multi-modal generation. In this study, we introduce mPLUG-Owl,\na novel training paradigm that equips LLMs with multi-modal abilities through\nmodularized learning of foundation LLM, a visual knowledge module, and a visual\nabstractor module. This approach can support multiple modalities and facilitate\ndiverse unimodal and multimodal abilities through modality collaboration. The\ntraining paradigm of mPLUG-Owl involves a two-stage method for aligning image\nand text, which learns visual knowledge with the assistance of LLM while\nmaintaining and even improving the generation abilities of LLM. In the first\nstage, the visual knowledge module and abstractor module are trained with a\nfrozen LLM module to align the image and text. In the second stage,\nlanguage-only and multi-modal supervised datasets are used to jointly fine-tune\na low-rank adaption (LoRA) module on LLM and the abstractor module by freezing\nthe visual knowledge module. We carefully build a visually-related instruction\nevaluation set OwlEval. Experimental results show that our model outperforms\nexisting multi-modal models, demonstrating mPLUG-Owl's impressive instruction\nand visual understanding ability, multi-turn conversation ability, and\nknowledge reasoning ability. Besides, we observe some unexpected and exciting\nabilities such as multi-image correlation and scene text understanding, which\nmakes it possible to leverage it for harder real scenarios, such as vision-only\ndocument comprehension. Our code, pre-trained model, instruction-tuned models,\nand evaluation set are available at https://github.com/X-PLUG/mPLUG-Owl. 
The\nonline demo is available at https://www.modelscope.cn/studios/damo/mPLUG-Owl.\n","authors":["Qinghao Ye","Haiyang Xu","Guohai Xu","Jiabo Ye","Ming Yan","Yiyang Zhou","Junyang Wang","Anwen Hu","Pengcheng Shi","Yaya Shi","Chenliang Li","Yuanhong Xu","Hehong Chen","Junfeng Tian","Qi Qian","Ji Zhang","Fei Huang","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2304.14178v3.pdf","comment":"Working in Process"},{"id":"http://arxiv.org/abs/2402.18918v2","updated":"2024-03-29T08:06:38Z","published":"2024-02-29T07:20:02Z","title":"SNE-RoadSegV2: Advancing Heterogeneous Feature Fusion and Fallibility\n Awareness for Freespace Detection","summary":" Feature-fusion networks with duplex encoders have proven to be an effective\ntechnique to solve the freespace detection problem. However, despite the\ncompelling results achieved by previous research efforts, the exploration of\nadequate and discriminative heterogeneous feature fusion, as well as the\ndevelopment of fallibility-aware loss functions remains relatively scarce. This\npaper makes several significant contributions to address these limitations: (1)\nIt presents a novel heterogeneous feature fusion block, comprising a holistic\nattention module, a heterogeneous feature contrast descriptor, and an\naffinity-weighted feature recalibrator, enabling a more in-depth exploitation\nof the inherent characteristics of the extracted features, (2) it incorporates\nboth inter-scale and intra-scale skip connections into the decoder architecture\nwhile eliminating redundant ones, leading to both improved accuracy and\ncomputational efficiency, and (3) it introduces two fallibility-aware loss\nfunctions that separately focus on semantic-transition and depth-inconsistent\nregions, collectively contributing to greater supervision during model\ntraining. Our proposed heterogeneous feature fusion network (SNE-RoadSegV2),\nwhich incorporates all these innovative components, demonstrates superior\nperformance in comparison to all other freespace detection algorithms across\nmultiple public datasets. Notably, it ranks the 1st on the official KITTI Road\nbenchmark.\n","authors":["Yi Feng","Yu Ma","Qijun Chen","Ioannis Pitas","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2402.18918v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20035v1","updated":"2024-03-29T08:03:42Z","published":"2024-03-29T08:03:42Z","title":"UltraLight VM-UNet: Parallel Vision Mamba Significantly Reduces\n Parameters for Skin Lesion Segmentation","summary":" Traditionally for improving the segmentation performance of models, most\napproaches prefer to use adding more complex modules. And this is not suitable\nfor the medical field, especially for mobile medical devices, where\ncomputationally loaded models are not suitable for real clinical environments\ndue to computational resource constraints. Recently, state-space models (SSMs),\nrepresented by Mamba, have become a strong competitor to traditional CNNs and\nTransformers. In this paper, we deeply explore the key elements of parameter\ninfluence in Mamba and propose an UltraLight Vision Mamba UNet (UltraLight\nVM-UNet) based on this. Specifically, we propose a method for processing\nfeatures in parallel Vision Mamba, named PVM Layer, which achieves excellent\nperformance with the lowest computational load while keeping the overall number\nof processing channels constant. 
We conducted comparisons and ablation\nexperiments with several state-of-the-art lightweight models on three skin\nlesion public datasets and demonstrated that the UltraLight VM-UNet exhibits\nthe same strong performance competitiveness with parameters of only 0.049M and\nGFLOPs of 0.060. In addition, this study deeply explores the key elements of\nparameter influence in Mamba, which will lay a theoretical foundation for Mamba\nto possibly become a new mainstream module for lightweighting in the future.\nThe code is available from https://github.com/wurenkai/UltraLight-VM-UNet .\n","authors":["Renkai Wu","Yinghao Liu","Pengchen Liang","Qing Chang"],"pdf_url":"https://arxiv.org/pdf/2403.20035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18795v2","updated":"2024-03-29T08:02:14Z","published":"2024-03-27T17:40:14Z","title":"Gamba: Marry Gaussian Splatting with Mamba for single view 3D\n reconstruction","summary":" We tackle the challenge of efficiently reconstructing a 3D asset from a\nsingle image with growing demands for automated 3D content creation pipelines.\nPrevious methods primarily rely on Score Distillation Sampling (SDS) and Neural\nRadiance Fields (NeRF). Despite their significant success, these approaches\nencounter practical limitations due to lengthy optimization and considerable\nmemory usage. In this report, we introduce Gamba, an end-to-end amortized 3D\nreconstruction model from single-view images, emphasizing two main insights:\n(1) 3D representation: leveraging a large number of 3D Gaussians for an\nefficient 3D Gaussian splatting process; (2) Backbone design: introducing a\nMamba-based sequential network that facilitates context-dependent reasoning and\nlinear scalability with the sequence (token) length, accommodating a\nsubstantial number of Gaussians. Gamba incorporates significant advancements in\ndata preprocessing, regularization design, and training methodologies. We\nassessed Gamba against existing optimization-based and feed-forward 3D\ngeneration approaches using the real-world scanned OmniObject3D dataset. Here,\nGamba demonstrates competitive generation capabilities, both qualitatively and\nquantitatively, while achieving remarkable speed, approximately 0.6 second on a\nsingle NVIDIA A100 GPU.\n","authors":["Qiuhong Shen","Xuanyu Yi","Zike Wu","Pan Zhou","Hanwang Zhang","Shuicheng Yan","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18795v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20034v1","updated":"2024-03-29T07:59:37Z","published":"2024-03-29T07:59:37Z","title":"NeSLAM: Neural Implicit Mapping and Self-Supervised Feature Tracking\n With Depth Completion and Denoising","summary":" In recent years, there have been significant advancements in 3D\nreconstruction and dense RGB-D SLAM systems. One notable development is the\napplication of Neural Radiance Fields (NeRF) in these systems, which utilizes\nimplicit neural representation to encode 3D scenes. This extension of NeRF to\nSLAM has shown promising results. However, the depth images obtained from\nconsumer-grade RGB-D sensors are often sparse and noisy, which poses\nsignificant challenges for 3D reconstruction and affects the accuracy of the\nrepresentation of the scene geometry. Moreover, the original hierarchical\nfeature grid with occupancy value is inaccurate for scene geometry\nrepresentation. Furthermore, the existing methods select random pixels for\ncamera tracking, which leads to inaccurate localization and is not robust in\nreal-world indoor environments. 
To this end, we present NeSLAM, an advanced\nframework that achieves accurate and dense depth estimation, robust camera\ntracking, and realistic synthesis of novel views. First, a depth completion and\ndenoising network is designed to provide dense geometry prior and guide the\nneural implicit representation optimization. Second, the occupancy scene\nrepresentation is replaced with Signed Distance Field (SDF) hierarchical scene\nrepresentation for high-quality reconstruction and view synthesis. Furthermore,\nwe also propose a NeRF-based self-supervised feature tracking algorithm for\nrobust real-time tracking. Experiments on various indoor datasets demonstrate\nthe effectiveness and accuracy of the system in reconstruction, tracking\nquality, and novel view synthesis.\n","authors":["Tianchen Deng","Yanbo Wang","Hongle Xie","Hesheng Wang","Jingchuan Wang","Danwei Wang","Weidong Chen"],"pdf_url":"https://arxiv.org/pdf/2403.20034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20032v1","updated":"2024-03-29T07:58:21Z","published":"2024-03-29T07:58:21Z","title":"HO-Gaussian: Hybrid Optimization of 3D Gaussian Splatting for Urban\n Scenes","summary":" The rapid growth of 3D Gaussian Splatting (3DGS) has revolutionized neural\nrendering, enabling real-time production of high-quality renderings. However,\nthe previous 3DGS-based methods have limitations in urban scenes due to\nreliance on initial Structure-from-Motion(SfM) points and difficulties in\nrendering distant, sky and low-texture areas. To overcome these challenges, we\npropose a hybrid optimization method named HO-Gaussian, which combines a\ngrid-based volume with the 3DGS pipeline. HO-Gaussian eliminates the dependency\non SfM point initialization, allowing for rendering of urban scenes, and\nincorporates the Point Densitification to enhance rendering quality in\nproblematic regions during training. Furthermore, we introduce Gaussian\nDirection Encoding as an alternative for spherical harmonics in the rendering\npipeline, which enables view-dependent color representation. To account for\nmulti-camera systems, we introduce neural warping to enhance object consistency\nacross different cameras. Experimental results on widely used autonomous\ndriving datasets demonstrate that HO-Gaussian achieves photo-realistic\nrendering in real-time on multi-camera urban datasets.\n","authors":["Zhuopeng Li","Yilin Zhang","Chenming Wu","Jianke Zhu","Liangjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.20032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20031v1","updated":"2024-03-29T07:53:06Z","published":"2024-03-29T07:53:06Z","title":"A Unified Framework for Human-centric Point Cloud Video Understanding","summary":" Human-centric Point Cloud Video Understanding (PVU) is an emerging field\nfocused on extracting and interpreting human-related features from sequences of\nhuman point clouds, further advancing downstream human-centric tasks and\napplications. Previous works usually focus on tackling one specific task and\nrely on huge labeled data, which has poor generalization capability.\nConsidering that human has specific characteristics, including the structural\nsemantics of human body and the dynamics of human motions, we propose a unified\nframework to make full use of the prior knowledge and explore the inherent\nfeatures in the data itself for generalized human-centric point cloud video\nunderstanding. 
Extensive experiments demonstrate that our method achieves\nstate-of-the-art performance on various human-related tasks, including action\nrecognition and 3D pose estimation. All datasets and code will be released\nsoon.\n","authors":["Yiteng Xu","Kecheng Ye","Xiao Han","Yiming Ren","Xinge Zhu","Yuexin Ma"],"pdf_url":"https://arxiv.org/pdf/2403.20031v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17496v2","updated":"2024-03-29T07:38:21Z","published":"2024-03-26T08:53:25Z","title":"Dr.Hair: Reconstructing Scalp-Connected Hair Strands without\n Pre-training via Differentiable Rendering of Line Segments","summary":" In the film and gaming industries, achieving a realistic hair appearance\ntypically involves the use of strands originating from the scalp. However,\nreconstructing these strands from observed surface images of hair presents\nsignificant challenges. The difficulty in acquiring Ground Truth (GT) data has\nled state-of-the-art learning-based methods to rely on pre-training with\nmanually prepared synthetic CG data. This process is not only labor-intensive\nand costly but also introduces complications due to the domain gap when\ncompared to real-world data. In this study, we propose an optimization-based\napproach that eliminates the need for pre-training. Our method represents hair\nstrands as line segments growing from the scalp and optimizes them using a\nnovel differentiable rendering algorithm. To robustly optimize a substantial\nnumber of slender explicit geometries, we introduce 3D orientation estimation\nutilizing global optimization, strand initialization based on Laplace's\nequation, and reparameterization that leverages geometric connectivity and\nspatial proximity. Unlike existing optimization-based methods, our method is\ncapable of reconstructing internal hair flow in an absolute direction. Our\nmethod exhibits robust and accurate inverse rendering, surpassing the quality\nof existing methods and significantly improving processing speed.\n","authors":["Yusuke Takimoto","Hikari Takehara","Hiroyuki Sato","Zihao Zhu","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.17496v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.20026v1","updated":"2024-03-29T07:28:50Z","published":"2024-03-29T07:28:50Z","title":"FSMR: A Feature Swapping Multi-modal Reasoning Approach with Joint\n Textual and Visual Clues","summary":" Multi-modal reasoning plays a vital role in bridging the gap between textual\nand visual information, enabling a deeper understanding of the context. This\npaper presents the Feature Swapping Multi-modal Reasoning (FSMR) model,\ndesigned to enhance multi-modal reasoning through feature swapping. FSMR\nleverages a pre-trained visual-language model as an encoder, accommodating both\ntext and image inputs for effective feature representation from both\nmodalities. It introduces a unique feature swapping module, enabling the\nexchange of features between identified objects in images and corresponding\nvocabulary words in text, thereby enhancing the model's comprehension of the\ninterplay between images and text. To further bolster its multi-modal alignment\ncapabilities, FSMR incorporates a multi-modal cross-attention mechanism,\nfacilitating the joint modeling of textual and visual information. During\ntraining, we employ image-text matching and cross-entropy losses to ensure\nsemantic consistency between visual and language elements. 
Extensive\nexperiments on the PMR dataset demonstrate FSMR's superiority over\nstate-of-the-art baseline models across various performance metrics.\n","authors":["Shuang Li","Jiahua Wang","Lijie Wen"],"pdf_url":"https://arxiv.org/pdf/2403.20026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10908v2","updated":"2024-03-29T07:24:23Z","published":"2023-12-18T03:34:07Z","title":"CLOVA: A Closed-Loop Visual Assistant with Tool Usage and Update","summary":" Utilizing large language models (LLMs) to compose off-the-shelf visual tools\nrepresents a promising avenue of research for developing robust visual\nassistants capable of addressing diverse visual tasks. However, these methods\noften overlook the potential for continual learning, typically by freezing the\nutilized tools, thus limiting their adaptation to environments requiring new\nknowledge. To tackle this challenge, we propose CLOVA, a Closed-Loop Visual\nAssistant, which operates within a framework encompassing inference,\nreflection, and learning phases. During the inference phase, LLMs generate\nprograms and execute corresponding tools to complete assigned tasks. In the\nreflection phase, a multimodal global-local reflection scheme analyzes human\nfeedback to determine which tools require updating. Lastly, the learning phase\nemploys three flexible approaches to automatically gather training data and\nintroduces a novel prompt tuning scheme to update the tools, allowing CLOVA to\nefficiently acquire new knowledge. Experimental findings demonstrate that CLOVA\nsurpasses existing tool-usage methods by 5% in visual question answering and\nmultiple-image reasoning, by 10% in knowledge tagging, and by 20% in image\nediting. These results underscore the significance of the continual learning\ncapability in general visual assistants.\n","authors":["Zhi Gao","Yuntao Du","Xintong Zhang","Xiaojian Ma","Wenjuan Han","Song-Chun Zhu","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2312.10908v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.15082v3","updated":"2024-03-29T07:20:42Z","published":"2024-03-22T10:06:31Z","title":"Cell Variational Information Bottleneck Network","summary":" In this work, we propose Cell Variational Information Bottleneck Network\n(cellVIB), a convolutional neural network using information bottleneck\nmechanism, which can be combined with the latest feedforward network\narchitecture in an end-to-end training method. Our Cell Variational Information\nBottleneck Network is constructed by stacking VIB cells, which generate feature\nmaps with uncertainty. As layers going deeper, the regularization effect will\ngradually increase, instead of directly adding excessive regular constraints to\nthe output layer of the model as in Deep VIB. Under each VIB cell, the\nfeedforward process learns an independent mean term and an standard deviation\nterm, and predicts the Gaussian distribution based on them. The feedback\nprocess is based on reparameterization trick for effective training. This work\nperforms an extensive analysis on MNIST dataset to verify the effectiveness of\neach VIB cells, and provides an insightful analysis on how the VIB cells affect\nmutual information. Experiments conducted on CIFAR-10 also prove that our\ncellVIB is robust against noisy labels during training and against corrupted\nimages during testing. Then, we validate our method on PACS dataset, whose\nresults show that the VIB cells can significantly improve the generalization\nperformance of the basic model. 
Finally, in a more complex representation\nlearning task, face recognition, our network structure has also achieved very\ncompetitive results.\n","authors":["Zhonghua Zhai","Chen Ju","Jinsong Lan","Shuai Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.15082v3.pdf","comment":"Found errors in the article, therefore postponing publication for now"},{"id":"http://arxiv.org/abs/2403.20022v1","updated":"2024-03-29T07:16:34Z","published":"2024-03-29T07:16:34Z","title":"Psychometry: An Omnifit Model for Image Reconstruction from Human Brain\n Activity","summary":" Reconstructing the viewed images from human brain activity bridges human and\ncomputer vision through the Brain-Computer Interface. The inherent variability\nin brain function between individuals leads existing literature to focus on\nacquiring separate models for each individual using their respective brain\nsignal data, ignoring commonalities between these data. In this article, we\ndevise Psychometry, an omnifit model for reconstructing images from functional\nMagnetic Resonance Imaging (fMRI) obtained from different subjects. Psychometry\nincorporates an omni mixture-of-experts (Omni MoE) module where all the experts\nwork together to capture the inter-subject commonalities, while each expert\nassociated with subject-specific parameters copes with the individual\ndifferences. Moreover, Psychometry is equipped with a retrieval-enhanced\ninference strategy, termed Ecphory, which aims to enhance the learned fMRI\nrepresentation via retrieving from prestored subject-specific memories. These\ndesigns collectively render Psychometry omnifit and efficient, enabling it to\ncapture both inter-subject commonality and individual specificity across\nsubjects. As a result, the enhanced fMRI representations serve as conditional\nsignals to guide a generation model to reconstruct high-quality and realistic\nimages, establishing Psychometry as state-of-the-art in terms of both\nhigh-level and low-level metrics.\n","authors":["Ruijie Quan","Wenguan Wang","Zhibo Tian","Fan Ma","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2403.20022v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.20018v1","updated":"2024-03-29T07:14:14Z","published":"2024-03-29T07:14:14Z","title":"SCINeRF: Neural Radiance Fields from a Snapshot Compressive Image","summary":" In this paper, we explore the potential of Snapshot Compressive Imaging (SCI)\ntechnique for recovering the underlying 3D scene representation from a single\ntemporal compressed image. SCI is a cost-effective method that enables the\nrecording of high-dimensional data, such as hyperspectral or temporal\ninformation, into a single image using low-cost 2D imaging sensors. To achieve\nthis, a series of specially designed 2D masks are usually employed, which not\nonly reduces storage requirements but also offers potential privacy protection.\nInspired by this, to take one step further, our approach builds upon the\npowerful 3D scene representation capabilities of neural radiance fields (NeRF).\nSpecifically, we formulate the physical imaging process of SCI as part of the\ntraining of NeRF, allowing us to exploit its impressive performance in\ncapturing complex scene structures. To assess the effectiveness of our method,\nwe conduct extensive evaluations using both synthetic data and real data\ncaptured by our SCI system. Extensive experimental results demonstrate that our\nproposed approach surpasses the state-of-the-art methods in terms of image\nreconstruction and novel view image synthesis. 
Moreover, our method also\nexhibits the ability to restore high frame-rate multi-view consistent images by\nleveraging SCI and the rendering capabilities of NeRF. The code is available at\nhttps://github.com/WU-CVGL/SCINeRF.\n","authors":["Yunhao Li","Xiaodong Wang","Ping Wang","Xin Yuan","Peidong Liu"],"pdf_url":"https://arxiv.org/pdf/2403.20018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20013v1","updated":"2024-03-29T06:58:57Z","published":"2024-03-29T06:58:57Z","title":"DerainNeRF: 3D Scene Estimation with Adhesive Waterdrop Removal","summary":" When capturing images through the glass during rainy or snowy weather\nconditions, the resulting images often contain waterdrops adhered on the glass\nsurface, and these waterdrops significantly degrade the image quality and\nperformance of many computer vision algorithms. To tackle these limitations, we\npropose a method to reconstruct the clear 3D scene implicitly from multi-view\nimages degraded by waterdrops. Our method exploits an attention network to\npredict the location of waterdrops and then train a Neural Radiance Fields to\nrecover the 3D scene implicitly. By leveraging the strong scene representation\ncapabilities of NeRF, our method can render high-quality novel-view images with\nwaterdrops removed. Extensive experimental results on both synthetic and real\ndatasets show that our method is able to generate clear 3D scenes and\noutperforms existing state-of-the-art (SOTA) image adhesive waterdrop removal\nmethods.\n","authors":["Yunhao Li","Jing Wu","Lingzhe Zhao","Peidong Liu"],"pdf_url":"https://arxiv.org/pdf/2403.20013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20012v1","updated":"2024-03-29T06:53:52Z","published":"2024-03-29T06:53:52Z","title":"Colorful Cutout: Enhancing Image Data Augmentation with Curriculum\n Learning","summary":" Data augmentation is one of the regularization strategies for the training of\ndeep learning models, which enhances generalizability and prevents overfitting,\nleading to performance improvement. Although researchers have proposed various\ndata augmentation techniques, they often lack consideration for the difficulty\nof augmented data. Recently, another line of research suggests incorporating\nthe concept of curriculum learning with data augmentation in the field of\nnatural language processing. In this study, we adopt curriculum data\naugmentation for image data augmentation and propose colorful cutout, which\ngradually increases the noise and difficulty introduced in the augmented image.\nOur experimental results highlight the possibility of curriculum data\naugmentation for image data. We publicly released our source code to improve\nthe reproducibility of our study.\n","authors":["Juhwan Choi","YoungBin Kim"],"pdf_url":"https://arxiv.org/pdf/2403.20012v1.pdf","comment":"ICLR 2024 Tiny Papers"},{"id":"http://arxiv.org/abs/2403.12236v2","updated":"2024-03-29T06:41:07Z","published":"2024-03-18T20:33:44Z","title":"Improving Generalization via Meta-Learning on Hard Samples","summary":" Learned reweighting (LRW) approaches to supervised learning use an\noptimization criterion to assign weights for training instances, in order to\nmaximize performance on a representative validation dataset. We pose and\nformalize the problem of optimized selection of the validation set used in LRW\ntraining, to improve classifier generalization. 
In particular, we show that\nusing hard-to-classify instances in the validation set has both a theoretical\nconnection to, and strong empirical evidence of generalization. We provide an\nefficient algorithm for training this meta-optimized model, as well as a simple\ntrain-twice heuristic for careful comparative study. We demonstrate that LRW\nwith easy validation data performs consistently worse than LRW with hard\nvalidation data, establishing the validity of our meta-optimization problem.\nOur proposed algorithm outperforms a wide range of baselines on a range of\ndatasets and domain shift challenges (Imagenet-1K, CIFAR-100, Clothing-1M,\nCAMELYON, WILDS, etc.), with ~1% gains using VIT-B on Imagenet. We also show\nthat using naturally hard examples for validation (Imagenet-R / Imagenet-A) in\nLRW training for Imagenet improves performance on both clean and naturally hard\ntest instances by 1-2%. Secondary analyses show that using hard validation data\nin an LRW framework improves margins on test data, hinting at the mechanism\nunderlying our empirical gains. We believe this work opens up new research\ndirections for the meta-optimization of meta-learning in a supervised learning\ncontext.\n","authors":["Nishant Jain","Arun S. Suggala","Pradeep Shenoy"],"pdf_url":"https://arxiv.org/pdf/2403.12236v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.20002v1","updated":"2024-03-29T06:33:13Z","published":"2024-03-29T06:33:13Z","title":"Grounding and Enhancing Grid-based Models for Neural Fields","summary":" Many contemporary studies utilize grid-based models for neural field\nrepresentation, but a systematic analysis of grid-based models is still\nmissing, hindering the improvement of those models. Therefore, this paper\nintroduces a theoretical framework for grid-based models. This framework points\nout that these models' approximation and generalization behaviors are\ndetermined by grid tangent kernels (GTK), which are intrinsic properties of\ngrid-based models. The proposed framework facilitates a consistent and\nsystematic analysis of diverse grid-based models. Furthermore, the introduced\nframework motivates the development of a novel grid-based model named the\nMultiplicative Fourier Adaptive Grid (MulFAGrid). The numerical analysis\ndemonstrates that MulFAGrid exhibits a lower generalization bound than its\npredecessors, indicating its robust generalization performance. Empirical\nstudies reveal that MulFAGrid achieves state-of-the-art performance in various\ntasks, including 2D image fitting, 3D signed distance field (SDF)\nreconstruction, and novel view synthesis, demonstrating superior representation\nability. The project website is available at\nhttps://sites.google.com/view/cvpr24-2034-submission/home.\n","authors":["Zelin Zhao","Fenglei Fan","Wenlong Liao","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2403.20002v1.pdf","comment":"Accepted in CVPR24"},{"id":"http://arxiv.org/abs/2312.13528v2","updated":"2024-03-29T05:57:33Z","published":"2023-12-21T02:01:19Z","title":"DyBluRF: Dynamic Deblurring Neural Radiance Fields for Blurry Monocular\n Video","summary":" Neural Radiance Fields (NeRF), initially developed for static scenes, have\ninspired many video novel view synthesis techniques. However, the challenge for\nvideo view synthesis arises from motion blur, a consequence of object or camera\nmovement during exposure, which hinders the precise synthesis of sharp\nspatio-temporal views. 
In response, we propose a novel dynamic deblurring NeRF\nframework for blurry monocular video, called DyBluRF, consisting of a Base Ray\nInitialization (BRI) stage and a Motion Decomposition-based Deblurring (MDD)\nstage. Our DyBluRF is the first that handles the novel view synthesis for\nblurry monocular video with a novel two-stage framework. In the BRI stage, we\ncoarsely reconstruct dynamic 3D scenes and jointly initialize the base ray,\nwhich is further used to predict latent sharp rays, using the inaccurate camera\npose information from the given blurry frames. In the MDD stage, we introduce a\nnovel Incremental Latent Sharp-rays Prediction (ILSP) approach for the blurry\nmonocular video frames by decomposing the latent sharp rays into global camera\nmotion and local object motion components. We further propose two loss\nfunctions for effective geometry regularization and decomposition of static and\ndynamic scene components without any mask supervision. Experiments show that\nDyBluRF outperforms qualitatively and quantitatively the SOTA methods.\n","authors":["Minh-Quan Viet Bui","Jongmin Park","Jihyong Oh","Munchurl Kim"],"pdf_url":"https://arxiv.org/pdf/2312.13528v2.pdf","comment":"The first two authors contributed equally to this work (equal\n contribution). The last two authors advised equally to this work. Please\n visit our project page at https://kaist-viclab.github.io/dyblurf-site/"},{"id":"http://arxiv.org/abs/2403.19985v1","updated":"2024-03-29T05:39:47Z","published":"2024-03-29T05:39:47Z","title":"Stable Surface Regularization for Fast Few-Shot NeRF","summary":" This paper proposes an algorithm for synthesizing novel views under few-shot\nsetup. The main concept is to develop a stable surface regularization technique\ncalled Annealing Signed Distance Function (ASDF), which anneals the surface in\na coarse-to-fine manner to accelerate convergence speed. We observe that the\nEikonal loss - which is a widely known geometric regularization - requires\ndense training signal to shape different level-sets of SDF, leading to\nlow-fidelity results under few-shot training. In contrast, the proposed surface\nregularization successfully reconstructs scenes and produce high-fidelity\ngeometry with stable training. Our method is further accelerated by utilizing\ngrid representation and monocular geometric priors. Finally, the proposed\napproach is up to 45 times faster than existing few-shot novel view synthesis\nmethods, and it produces comparable results in the ScanNet dataset and\nNeRF-Real dataset.\n","authors":["Byeongin Joung","Byeong-Uk Lee","Jaesung Choe","Ukcheol Shin","Minjun Kang","Taeyeop Lee","In So Kweon","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2403.19985v1.pdf","comment":"3DV 2024"},{"id":"http://arxiv.org/abs/2403.19983v1","updated":"2024-03-29T05:35:04Z","published":"2024-03-29T05:35:04Z","title":"A multi-stage semi-supervised learning for ankle fracture classification\n on CT images","summary":" Because of the complicated mechanism of ankle injury, it is very difficult to\ndiagnose ankle fracture in clinic. In order to simplify the process of fracture\ndiagnosis, an automatic diagnosis model of ankle fracture was proposed.\nFirstly, a tibia-fibula segmentation network is proposed for the joint\ntibiofibular region of the ankle joint, and the corresponding segmentation\ndataset is established on the basis of fracture data. Secondly, the image\nregistration method is used to register the bone segmentation mask with the\nnormal bone mask. 
Finally, a semi-supervised classifier is constructed to make\nfull use of a large number of unlabeled data to classify ankle fractures.\nExperiments show that the proposed method can segment fractures with fracture\nlines accurately and has better performance than the general method. At the\nsame time, this method is superior to classification network in several\nindexes.\n","authors":["Hongzhi Liu","Guicheng Li","Jiacheng Nie","Hui Tang","Chunfeng Yang","Qianjin Feng","Hailin Xu","Yang Chen"],"pdf_url":"https://arxiv.org/pdf/2403.19983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19980v1","updated":"2024-03-29T05:23:34Z","published":"2024-03-29T05:23:34Z","title":"A Parallel Attention Network for Cattle Face Recognition","summary":" Cattle face recognition holds paramount significance in domains such as\nanimal husbandry and behavioral research. Despite significant progress in\nconfined environments, applying these accomplishments in wild settings remains\nchallenging. Thus, we create the first large-scale cattle face recognition\ndataset, ICRWE, for wild environments. It encompasses 483 cattle and 9,816\nhigh-resolution image samples. Each sample undergoes annotation for face\nfeatures, light conditions, and face orientation. Furthermore, we introduce a\nnovel parallel attention network, PANet. Comprising several cascaded\nTransformer modules, each module incorporates two parallel Position Attention\nModules (PAM) and Feature Mapping Modules (FMM). PAM focuses on local and\nglobal features at each image position through parallel channel attention, and\nFMM captures intricate feature patterns through non-linear mappings.\nExperimental results indicate that PANet achieves a recognition accuracy of\n88.03% on the ICRWE dataset, establishing itself as the current\nstate-of-the-art approach. The source code is available in the supplementary\nmaterials.\n","authors":["Jiayu Li","Xuechao Zou","Shiying Wang","Ben Chen","Junliang Xing","Pin Tao"],"pdf_url":"https://arxiv.org/pdf/2403.19980v1.pdf","comment":"Accepted by ICME 2024"},{"id":"http://arxiv.org/abs/2403.19979v1","updated":"2024-03-29T05:23:12Z","published":"2024-03-29T05:23:12Z","title":"Semantically-Shifted Incremental Adapter-Tuning is A Continual\n ViTransformer","summary":" Class-incremental learning (CIL) aims to enable models to continuously learn\nnew classes while overcoming catastrophic forgetting. The introduction of\npre-trained models has brought new tuning paradigms to CIL. In this paper, we\nrevisit different parameter-efficient tuning (PET) methods within the context\nof continual learning. We observe that adapter tuning demonstrates superiority\nover prompt-based methods, even without parameter expansion in each learning\nsession. Motivated by this, we propose incrementally tuning the shared adapter\nwithout imposing parameter update constraints, enhancing the learning capacity\nof the backbone. Additionally, we employ feature sampling from stored\nprototypes to retrain a unified classifier, further improving its performance.\nWe estimate the semantic shift of old prototypes without access to past samples\nand update stored prototypes session by session. Our proposed method eliminates\nmodel expansion and avoids retaining any image samples. It surpasses previous\npre-trained model-based CIL methods and demonstrates remarkable continual\nlearning capabilities. 
Experimental results on five CIL benchmarks validate the\neffectiveness of our approach, achieving state-of-the-art (SOTA) performance.\n","authors":["Yuwen Tan","Qinhao Zhou","Xiang Xiang","Ke Wang","Yuchuan Wu","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2403.19979v1.pdf","comment":"To appear at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.16558v2","updated":"2024-03-29T05:12:45Z","published":"2024-03-25T09:17:15Z","title":"Elysium: Exploring Object-level Perception in Videos via MLLM","summary":" Multi-modal Large Language Models (MLLMs) have demonstrated their ability to\nperceive objects in still images, but their application in video-related tasks,\nsuch as object tracking, remains understudied. This lack of exploration is\nprimarily due to two key challenges. Firstly, extensive pretraining on\nlarge-scale video datasets is required to equip MLLMs with the capability to\nperceive objects across multiple frames and understand inter-frame\nrelationships. Secondly, processing a large number of frames within the context\nwindow of Large Language Models (LLMs) can impose a significant computational\nburden. To address the first challenge, we introduce ElysiumTrack-1M, a\nlarge-scale video dataset supported for three tasks: Single Object Tracking\n(SOT), Referring Single Object Tracking (RSOT), and Video Referring Expression\nGeneration (Video-REG). ElysiumTrack-1M contains 1.27 million annotated video\nframes with corresponding object boxes and descriptions. Leveraging this\ndataset, we conduct training of MLLMs and propose a token-compression model\nT-Selector to tackle the second challenge. Our proposed approach, Elysium:\nExploring Object-level Perception in Videos via MLLM, is an end-to-end\ntrainable MLLM that attempts to conduct object-level tasks in videos without\nrequiring any additional plug-in or expert models. All codes and datasets are\navailable at https://github.com/Hon-Wong/Elysium.\n","authors":["Han Wang","Yanjie Wang","Yongjie Ye","Yuxiang Nie","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2403.16558v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19976v1","updated":"2024-03-29T04:58:56Z","published":"2024-03-29T04:58:56Z","title":"eTraM: Event-based Traffic Monitoring Dataset","summary":" Event cameras, with their high temporal and dynamic range and minimal memory\nusage, have found applications in various fields. However, their potential in\nstatic traffic monitoring remains largely unexplored. To facilitate this\nexploration, we present eTraM - a first-of-its-kind, fully event-based traffic\nmonitoring dataset. eTraM offers 10 hr of data from different traffic scenarios\nin various lighting and weather conditions, providing a comprehensive overview\nof real-world situations. Providing 2M bounding box annotations, it covers\neight distinct classes of traffic participants, ranging from vehicles to\npedestrians and micro-mobility. eTraM's utility has been assessed using\nstate-of-the-art methods for traffic participant detection, including RVT, RED,\nand YOLOv8. We quantitatively evaluate the ability of event-based models to\ngeneralize on nighttime and unseen scenes. Our findings substantiate the\ncompelling potential of leveraging event cameras for traffic monitoring,\nopening new avenues for research and application. 
eTraM is available at\nhttps://eventbasedvision.github.io/eTraM\n","authors":["Aayush Atul Verma","Bharatesh Chakravarthi","Arpitsinh Vaghela","Hua Wei","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19975v1","updated":"2024-03-29T04:58:33Z","published":"2024-03-29T04:58:33Z","title":"Context-Aware Integration of Language and Visual References for Natural\n Language Tracking","summary":" Tracking by natural language specification (TNL) aims to consistently\nlocalize a target in a video sequence given a linguistic description in the\ninitial frame. Existing methodologies perform language-based and template-based\nmatching for target reasoning separately and merge the matching results from\ntwo sources, which suffer from tracking drift when language and visual\ntemplates miss-align with the dynamic target state and ambiguity in the later\nmerging stage. To tackle the issues, we propose a joint multi-modal tracking\nframework with 1) a prompt modulation module to leverage the complementarity\nbetween temporal visual templates and language expressions, enabling precise\nand context-aware appearance and linguistic cues, and 2) a unified target\ndecoding module to integrate the multi-modal reference cues and executes the\nintegrated queries on the search image to predict the target location in an\nend-to-end manner directly. This design ensures spatio-temporal consistency by\nleveraging historical visual information and introduces an integrated solution,\ngenerating predictions in a single step. Extensive experiments conducted on\nTNL2K, OTB-Lang, LaSOT, and RefCOCOg validate the efficacy of our proposed\napproach. The results demonstrate competitive performance against\nstate-of-the-art methods for both tracking and grounding.\n","authors":["Yanyan Shao","Shuting He","Qi Ye","Yuchao Feng","Wenhan Luo","Jiming Chen"],"pdf_url":"https://arxiv.org/pdf/2403.19975v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2311.17132v2","updated":"2024-03-29T04:55:51Z","published":"2023-11-28T18:03:27Z","title":"TransNeXt: Robust Foveal Visual Perception for Vision Transformers","summary":" Due to the depth degradation effect in residual connections, many efficient\nVision Transformers models that rely on stacking layers for information\nexchange often fail to form sufficient information mixing, leading to unnatural\nvisual perception. To address this issue, in this paper, we propose Aggregated\nAttention, a biomimetic design-based token mixer that simulates biological\nfoveal vision and continuous eye movement while enabling each token on the\nfeature map to have a global perception. Furthermore, we incorporate learnable\ntokens that interact with conventional queries and keys, which further\ndiversifies the generation of affinity matrices beyond merely relying on the\nsimilarity between queries and keys. Our approach does not rely on stacking for\ninformation exchange, thus effectively avoiding depth degradation and achieving\nnatural visual perception. Additionally, we propose Convolutional GLU, a\nchannel mixer that bridges the gap between GLU and SE mechanism, which empowers\neach token to have channel attention based on its nearest neighbor image\nfeatures, enhancing local modeling capability and model robustness. We combine\naggregated attention and convolutional GLU to create a new visual backbone\ncalled TransNeXt. 
Extensive experiments demonstrate that our TransNeXt achieves\nstate-of-the-art performance across multiple model sizes. At a resolution of\n$224^2$, TransNeXt-Tiny attains an ImageNet accuracy of 84.0%, surpassing\nConvNeXt-B with 69% fewer parameters. Our TransNeXt-Base achieves an ImageNet\naccuracy of 86.2% and an ImageNet-A accuracy of 61.6% at a resolution of\n$384^2$, a COCO object detection mAP of 57.1, and an ADE20K semantic\nsegmentation mIoU of 54.7.\n","authors":["Dai Shi"],"pdf_url":"https://arxiv.org/pdf/2311.17132v2.pdf","comment":"CVPR 2024 Camera-ready Version. Project Page:\n https://github.com/DaiShiResearch/TransNeXt"},{"id":"http://arxiv.org/abs/2312.11461v2","updated":"2024-03-29T04:32:57Z","published":"2023-12-18T18:59:12Z","title":"GAvatar: Animatable 3D Gaussian Avatars with Implicit Mesh Learning","summary":" Gaussian splatting has emerged as a powerful 3D representation that harnesses\nthe advantages of both explicit (mesh) and implicit (NeRF) 3D representations.\nIn this paper, we seek to leverage Gaussian splatting to generate realistic\nanimatable avatars from textual descriptions, addressing the limitations (e.g.,\nflexibility and efficiency) imposed by mesh or NeRF-based representations.\nHowever, a naive application of Gaussian splatting cannot generate high-quality\nanimatable avatars and suffers from learning instability; it also cannot\ncapture fine avatar geometries and often leads to degenerate body parts. To\ntackle these problems, we first propose a primitive-based 3D Gaussian\nrepresentation where Gaussians are defined inside pose-driven primitives to\nfacilitate animation. Second, to stabilize and amortize the learning of\nmillions of Gaussians, we propose to use neural implicit fields to predict the\nGaussian attributes (e.g., colors). Finally, to capture fine avatar geometries\nand extract detailed meshes, we propose a novel SDF-based implicit mesh\nlearning approach for 3D Gaussians that regularizes the underlying geometries\nand extracts highly detailed textured meshes. Our proposed method, GAvatar,\nenables the large-scale generation of diverse animatable avatars using only\ntext prompts. GAvatar significantly surpasses existing methods in terms of both\nappearance and geometry quality, and achieves extremely fast rendering (100\nfps) at 1K resolution.\n","authors":["Ye Yuan","Xueting Li","Yangyi Huang","Shalini De Mello","Koki Nagano","Jan Kautz","Umar Iqbal"],"pdf_url":"https://arxiv.org/pdf/2312.11461v2.pdf","comment":"CVPR 2024. Project website: https://nvlabs.github.io/GAvatar"},{"id":"http://arxiv.org/abs/2403.19969v1","updated":"2024-03-29T04:28:06Z","published":"2024-03-29T04:28:06Z","title":"Separate, Dynamic and Differentiable (SMART) Pruner for Block/Output\n Channel Pruning on Computer Vision Tasks","summary":" Deep Neural Network (DNN) pruning has emerged as a key strategy to reduce\nmodel size, improve inference latency, and lower power consumption on DNN\naccelerators. Among various pruning techniques, block and output channel\npruning have shown significant potential in accelerating hardware performance.\nHowever, their accuracy often requires further improvement. 
In response to this\nchallenge, we introduce a separate, dynamic and differentiable (SMART) pruner.\nThis pruner stands out by utilizing a separate, learnable probability mask for\nweight importance ranking, employing a differentiable Top k operator to achieve\ntarget sparsity, and leveraging a dynamic temperature parameter trick to escape\nfrom non-sparse local minima. In our experiments, the SMART pruner consistently\ndemonstrated its superiority over existing pruning methods across a wide range\nof tasks and models on block and output channel pruning. Additionally, we\nextend our testing to Transformer-based models in N:M pruning scenarios, where\nSMART pruner also yields state-of-the-art results, demonstrating its\nadaptability and robustness across various neural network architectures, and\npruning types.\n","authors":["Guanhua Ding","Zexi Ye","Zhen Zhong","Gang Li","David Shao"],"pdf_url":"https://arxiv.org/pdf/2403.19969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10521v3","updated":"2024-03-29T04:21:27Z","published":"2024-03-15T17:59:53Z","title":"P-MapNet: Far-seeing Map Generator Enhanced by both SDMap and HDMap\n Priors","summary":" Autonomous vehicles are gradually entering city roads today, with the help of\nhigh-definition maps (HDMaps). However, the reliance on HDMaps prevents\nautonomous vehicles from stepping into regions without this expensive digital\ninfrastructure. This fact drives many researchers to study online HDMap\ngeneration algorithms, but the performance of these algorithms at far regions\nis still unsatisfying. We present P-MapNet, in which the letter P highlights\nthe fact that we focus on incorporating map priors to improve model\nperformance. Specifically, we exploit priors in both SDMap and HDMap. On one\nhand, we extract weakly aligned SDMap from OpenStreetMap, and encode it as an\nadditional conditioning branch. Despite the misalignment challenge, our\nattention-based architecture adaptively attends to relevant SDMap skeletons and\nsignificantly improves performance. On the other hand, we exploit a masked\nautoencoder to capture the prior distribution of HDMap, which can serve as a\nrefinement module to mitigate occlusions and artifacts. We benchmark on the\nnuScenes and Argoverse2 datasets. Through comprehensive experiments, we show\nthat: (1) our SDMap prior can improve online map generation performance, using\nboth rasterized (by up to $+18.73$ $\\rm mIoU$) and vectorized (by up to $+8.50$\n$\\rm mAP$) output representations. (2) our HDMap prior can improve map\nperceptual metrics by up to $6.34\\%$. (3) P-MapNet can be switched into\ndifferent inference modes that covers different regions of the\naccuracy-efficiency trade-off landscape. (4) P-MapNet is a far-seeing solution\nthat brings larger improvements on longer ranges. Codes and models are publicly\navailable at https://jike5.github.io/P-MapNet.\n","authors":["Zhou Jiang","Zhenxin Zhu","Pengfei Li","Huan-ang Gao","Tianyuan Yuan","Yongliang Shi","Hang Zhao","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.10521v3.pdf","comment":"Code: https://jike5.github.io/P-MapNet"},{"id":"http://arxiv.org/abs/2211.13398v3","updated":"2024-03-29T04:13:49Z","published":"2022-11-24T03:27:00Z","title":"CPPF++: Uncertainty-Aware Sim2Real Object Pose Estimation by Vote\n Aggregation","summary":" Object pose estimation constitutes a critical area within the domain of 3D\nvision. 
While contemporary state-of-the-art methods that leverage real-world\npose annotations have demonstrated commendable performance, the procurement of\nsuch real training data incurs substantial costs. This paper focuses on a\nspecific setting wherein only 3D CAD models are utilized as a priori knowledge,\ndevoid of any background or clutter information. We introduce a novel method,\nCPPF++, designed for sim-to-real pose estimation. This method builds upon the\nfoundational point-pair voting scheme of CPPF, reformulating it through a\nprobabilistic view. To address the challenge posed by vote collision, we\npropose a novel approach that involves modeling the voting uncertainty by\nestimating the probabilistic distribution of each point pair within the\ncanonical space. Furthermore, we augment the contextual information provided by\neach voting unit through the introduction of N-point tuples. To enhance the\nrobustness and accuracy of the model, we incorporate several innovative\nmodules, including noisy pair filtering, online alignment optimization, and a\ntuple feature ensemble. Alongside these methodological advancements, we\nintroduce a new category-level pose estimation dataset, named DiversePose 300.\nEmpirical evidence demonstrates that our method significantly surpasses\nprevious sim-to-real approaches and achieves comparable or superior performance\non novel datasets. Our code is available on https://github.com/qq456cvb/CPPF2.\n","authors":["Yang You","Wenhao He","Jin Liu","Hongkai Xiong","Weiming Wang","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2211.13398v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19967v1","updated":"2024-03-29T04:10:07Z","published":"2024-03-29T04:10:07Z","title":"Rewrite the Stars","summary":" Recent studies have drawn attention to the untapped potential of the \"star\noperation\" (element-wise multiplication) in network design. While intuitive\nexplanations abound, the foundational rationale behind its application remains\nlargely unexplored. Our study attempts to reveal the star operation's ability\nto map inputs into high-dimensional, non-linear feature spaces -- akin to\nkernel tricks -- without widening the network. We further introduce StarNet, a\nsimple yet powerful prototype, demonstrating impressive performance and low\nlatency under compact network structure and efficient budget. Like stars in the\nsky, the star operation appears unremarkable but holds a vast universe of\npotential. Our work encourages further exploration across tasks, with codes\navailable at https://github.com/ma-xu/Rewrite-the-Stars.\n","authors":["Xu Ma","Xiyang Dai","Yue Bai","Yizhou Wang","Yun Fu"],"pdf_url":"https://arxiv.org/pdf/2403.19967v1.pdf","comment":"Accepted by CVPR 2024. Codes are made publically available at\n https://github.com/ma-xu/Rewrite-the-Stars"},{"id":"http://arxiv.org/abs/2311.16714v2","updated":"2024-03-29T04:07:25Z","published":"2023-11-28T11:53:56Z","title":"Embodied Multi-Modal Agent trained by an LLM from a Parallel TextWorld","summary":" While large language models (LLMs) excel in a simulated world of texts, they\nstruggle to interact with the more realistic world without perceptions of other\nmodalities such as visual or audio signals. Although vision-language models\n(VLMs) integrate LLM modules (1) aligned with static image features, and (2)\nmay possess prior knowledge of world dynamics (as demonstrated in the text\nworld), they have not been trained in an embodied visual world and thus cannot\nalign with its dynamics. 
On the other hand, training an embodied agent in a\nnoisy visual world without expert guidance is often challenging and\ninefficient. In this paper, we train a VLM agent living in a visual world using\nan LLM agent excelling in a parallel text world. Specifically, we distill LLM's\nreflection outcomes (improved actions by analyzing mistakes) in a text world's\ntasks to finetune the VLM on the same tasks of the visual world, resulting in\nan Embodied Multi-Modal Agent (EMMA) quickly adapting to the visual world\ndynamics. Such cross-modality imitation learning between the two parallel\nworlds is achieved by a novel DAgger-DPO algorithm, enabling EMMA to generalize\nto a broad scope of new tasks without any further guidance from the LLM expert.\nExtensive evaluations on the ALFWorld benchmark's diverse tasks highlight\nEMMA's superior performance to SOTA VLM-based agents, e.g., 20%-70% improvement\nin the success rate.\n","authors":["Yijun Yang","Tianyi Zhou","Kanxue Li","Dapeng Tao","Lusong Li","Li Shen","Xiaodong He","Jing Jiang","Yuhui Shi"],"pdf_url":"https://arxiv.org/pdf/2311.16714v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19966v1","updated":"2024-03-29T04:02:51Z","published":"2024-03-29T04:02:51Z","title":"Multi-task Magnetic Resonance Imaging Reconstruction using Meta-learning","summary":" Using single-task deep learning methods to reconstruct Magnetic Resonance\nImaging (MRI) data acquired with different imaging sequences is inherently\nchallenging. The trained deep learning model typically lacks generalizability,\nand the dissimilarity among image datasets with different types of contrast\nleads to suboptimal learning performance. This paper proposes a meta-learning\napproach to efficiently learn image features from multiple MR image datasets.\nOur algorithm can perform multi-task learning to simultaneously reconstruct MR\nimages acquired using different imaging sequences with different image\ncontrasts. The experiment results demonstrate the ability of our new\nmeta-learning reconstruction method to successfully reconstruct\nhighly-undersampled k-space data from multiple MRI datasets simultaneously,\noutperforming other compelling reconstruction methods previously developed for\nsingle-task learning.\n","authors":["Wanyu Bian","Albert Jang","Fang Liu"],"pdf_url":"https://arxiv.org/pdf/2403.19966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19964v1","updated":"2024-03-29T03:56:19Z","published":"2024-03-29T03:56:19Z","title":"FairRAG: Fair Human Generation via Fair Retrieval Augmentation","summary":" Existing text-to-image generative models reflect or even amplify societal\nbiases ingrained in their training data. This is especially concerning for\nhuman image generation where models are biased against certain demographic\ngroups. Existing attempts to rectify this issue are hindered by the inherent\nlimitations of the pre-trained models and fail to substantially improve\ndemographic diversity. In this work, we introduce Fair Retrieval Augmented\nGeneration (FairRAG), a novel framework that conditions pre-trained generative\nmodels on reference images retrieved from an external image database to improve\nfairness in human generation. FairRAG enables conditioning through a\nlightweight linear module that projects reference images into the textual\nspace. To enhance fairness, FairRAG applies simple-yet-effective debiasing\nstrategies, providing images from diverse demographic groups during the\ngenerative process. 
Extensive experiments demonstrate that FairRAG outperforms\nexisting methods in terms of demographic diversity, image-text alignment, and\nimage fidelity while incurring minimal computational overhead during inference.\n","authors":["Robik Shrestha","Yang Zou","Qiuyu Chen","Zhiheng Li","Yusheng Xie","Siqi Deng"],"pdf_url":"https://arxiv.org/pdf/2403.19964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19963v1","updated":"2024-03-29T03:48:35Z","published":"2024-03-29T03:48:35Z","title":"Efficient Modulation for Vision Networks","summary":" In this work, we present efficient modulation, a novel design for efficient\nvision networks. We revisit the modulation mechanism, which operates input\nthrough convolutional context modeling and feature projection layers, and fuses\nfeatures via element-wise multiplication and an MLP block. We demonstrate that\nthe modulation mechanism is particularly well suited for efficient networks and\nfurther tailor the modulation design by proposing the efficient modulation\n(EfficientMod) block, which is considered the essential building block for our\nnetworks. Benefiting from the prominent representational ability of modulation\nmechanism and the proposed efficient design, our network can accomplish better\ntrade-offs between accuracy and efficiency and set new state-of-the-art\nperformance in the zoo of efficient networks. When integrating EfficientMod\nwith the vanilla self-attention block, we obtain the hybrid architecture which\nfurther improves the performance without loss of efficiency. We carry out\ncomprehensive experiments to verify EfficientMod's performance. With fewer\nparameters, our EfficientMod-s performs 0.6 top-1 accuracy better than\nEfficientFormerV2-s2 and is 25% faster on GPU, and 2.9 better than\nMobileViTv2-1.0 at the same GPU latency. Additionally, our method presents a\nnotable improvement in downstream tasks, outperforming EfficientFormerV2-s by\n3.6 mIoU on the ADE20K benchmark. Code and checkpoints are available at\nhttps://github.com/ma-xu/EfficientMod.\n","authors":["Xu Ma","Xiyang Dai","Jianwei Yang","Bin Xiao","Yinpeng Chen","Yun Fu","Lu Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.19963v1.pdf","comment":"Accepted by ICLR 2024. Codes are made publically available at\n https://github.com/ma-xu/EfficientMod"},{"id":"http://arxiv.org/abs/2310.05916v4","updated":"2024-03-29T03:40:47Z","published":"2023-10-09T17:59:04Z","title":"Interpreting CLIP's Image Representation via Text-Based Decomposition","summary":" We investigate the CLIP image encoder by analyzing how individual model\ncomponents affect the final representation. We decompose the image\nrepresentation as a sum across individual image patches, model layers, and\nattention heads, and use CLIP's text representation to interpret the summands.\nInterpreting the attention heads, we characterize each head's role by\nautomatically finding text representations that span its output space, which\nreveals property-specific roles for many heads (e.g. location or shape). Next,\ninterpreting the image patches, we uncover an emergent spatial localization\nwithin CLIP. Finally, we use this understanding to remove spurious features\nfrom CLIP and to create a strong zero-shot image segmenter. Our results\nindicate that a scalable understanding of transformer models is attainable and\ncan be used to repair and improve models.\n","authors":["Yossi Gandelsman","Alexei A. 
Efros","Jacob Steinhardt"],"pdf_url":"https://arxiv.org/pdf/2310.05916v4.pdf","comment":"Project page and code:\n https://yossigandelsman.github.io/clip_decomposition/"},{"id":"http://arxiv.org/abs/2403.19949v1","updated":"2024-03-29T03:15:31Z","published":"2024-03-29T03:15:31Z","title":"FairCLIP: Harnessing Fairness in Vision-Language Learning","summary":" Fairness is a critical concern in deep learning, especially in healthcare,\nwhere these models influence diagnoses and treatment decisions. Although\nfairness has been investigated in the vision-only domain, the fairness of\nmedical vision-language (VL) models remains unexplored due to the scarcity of\nmedical VL datasets for studying fairness. To bridge this research gap, we\nintroduce the first fair vision-language medical dataset FairVLMed that\nprovides detailed demographic attributes, ground-truth labels, and clinical\nnotes to facilitate an in-depth examination of fairness within VL foundation\nmodels. Using FairVLMed, we conduct a comprehensive fairness analysis of two\nwidely-used VL models (CLIP and BLIP2), pre-trained on both natural and medical\ndomains, across four different protected attributes. Our results highlight\nsignificant biases in all VL models, with Asian, Male, Non-Hispanic, and\nSpanish being the preferred subgroups across the protected attributes of race,\ngender, ethnicity, and language, respectively. In order to alleviate these\nbiases, we propose FairCLIP, an optimal-transport-based approach that achieves\na favorable trade-off between performance and fairness by reducing the Sinkhorn\ndistance between the overall sample distribution and the distributions\ncorresponding to each demographic group. As the first VL dataset of its kind,\nFairVLMed holds the potential to catalyze advancements in the development of\nmachine learning models that are both ethically aware and clinically effective.\nOur dataset and code are available at\nhttps://ophai.hms.harvard.edu/datasets/fairvlmed10k.\n","authors":["Yan Luo","Min Shi","Muhammad Osama Khan","Muhammad Muneeb Afzal","Hao Huang","Shuaihang Yuan","Yu Tian","Luo Song","Ava Kouhana","Tobias Elze","Yi Fang","Mengyu Wang"],"pdf_url":"https://arxiv.org/pdf/2403.19949v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19944v1","updated":"2024-03-29T02:55:07Z","published":"2024-03-29T02:55:07Z","title":"Binarized Low-light Raw Video Enhancement","summary":" Recently, deep neural networks have achieved excellent performance on\nlow-light raw video enhancement. However, they often come with high\ncomputational complexity and large memory costs, which hinder their\napplications on resource-limited devices. In this paper, we explore the\nfeasibility of applying the extremely compact binary neural network (BNN) to\nlow-light raw video enhancement. Nevertheless, there are two main issues with\nbinarizing video enhancement models. One is how to fuse the temporal\ninformation to improve low-light denoising without complex modules. The other\nis how to narrow the performance gap between binary convolutions with the full\nprecision ones. To address the first issue, we introduce a spatial-temporal\nshift operation, which is easy-to-binarize and effective. The temporal shift\nefficiently aggregates the features of neighbor frames and the spatial shift\nhandles the misalignment caused by the large motion in videos. 
For the second\nissue, we present a distribution-aware binary convolution, which captures the\ndistribution characteristics of real-valued input and incorporates them into\nplain binary convolutions to alleviate the degradation in performance.\nExtensive quantitative and qualitative experiments have shown our\nhigh-efficiency binarized low-light raw video enhancement method can attain a\npromising performance.\n","authors":["Gengchen Zhang","Yulun Zhang","Xin Yuan","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2403.19944v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19935v1","updated":"2024-03-29T02:42:22Z","published":"2024-03-29T02:42:22Z","title":"CP HDR: A feature point detection and description library for LDR and\n HDR images","summary":" In computer vision, characteristics refer to image regions with unique\nproperties, such as corners, edges, textures, or areas with high contrast.\nThese regions can be represented through feature points (FPs). FP detection and\ndescription are fundamental steps to many computer vision tasks. Most FP\ndetection and description methods use low dynamic range (LDR) images,\nsufficient for most applications involving digital images. However, LDR images\nmay have saturated pixels in scenes with extreme light conditions, which\ndegrade FP detection. On the other hand, high dynamic range (HDR) images\nusually present a greater dynamic range but FP detection algorithms do not take\nadvantage of all the information in such images. In this study, we present a\nsystematic review of image detection and description algorithms that use HDR\nimages as input. We developed a library called CP_HDR that implements the\nHarris corner detector, SIFT detector and descriptor, and two modifications of\nthose algorithms specialized in HDR images, called SIFT for HDR (SfHDR) and\nHarris for HDR (HfHDR). Previous studies investigated the use of HDR images in\nFP detection, but we did not find studies investigating the use of HDR images\nin FP description. Using uniformity, repeatability rate, mean average\nprecision, and matching rate metrics, we compared the performance of the CP_HDR\nalgorithms using LDR and HDR images. We observed an increase in the uniformity\nof the distribution of FPs among the high-light, mid-light, and low-light areas\nof the images. The results show that using HDR images as input to detection\nalgorithms improves performance and that SfHDR and HfHDR enhance FP\ndescription.\n","authors":["Artur Santos Nascimento","Valter Guilherme Silva de Souza","Daniel Oliveira Dantas","Beatriz Trinchão Andrade"],"pdf_url":"https://arxiv.org/pdf/2403.19935v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19080v2","updated":"2024-03-29T02:31:10Z","published":"2024-03-28T01:05:06Z","title":"MMCert: Provable Defense against Adversarial Attacks to Multi-modal\n Models","summary":" Different from a unimodal model whose input is from a single modality, the\ninput (called multi-modal input) of a multi-modal model is from multiple\nmodalities such as image, 3D points, audio, text, etc. Similar to unimodal\nmodels, many existing studies show that a multi-modal model is also vulnerable\nto adversarial perturbation, where an attacker could add small perturbation to\nall modalities of a multi-modal input such that the multi-modal model makes\nincorrect predictions for it. 
Existing certified defenses are mostly designed\nfor unimodal models, which achieve sub-optimal certified robustness guarantees\nwhen extended to multi-modal models as shown in our experimental results. In\nour work, we propose MMCert, the first certified defense against adversarial\nattacks to a multi-modal model. We derive a lower bound on the performance of\nour MMCert under arbitrary adversarial attacks with bounded perturbations to\nboth modalities (e.g., in the context of auto-driving, we bound the number of\nchanged pixels in both RGB image and depth image). We evaluate our MMCert using\ntwo benchmark datasets: one for the multi-modal road segmentation task and the\nother for the multi-modal emotion recognition task. Moreover, we compare our\nMMCert with a state-of-the-art certified defense extended from unimodal models.\nOur experimental results show that our MMCert outperforms the baseline.\n","authors":["Yanting Wang","Hongye Fu","Wei Zou","Jinyuan Jia"],"pdf_url":"https://arxiv.org/pdf/2403.19080v2.pdf","comment":"To appear in CVPR'24"},{"id":"http://arxiv.org/abs/2403.19924v1","updated":"2024-03-29T02:22:54Z","published":"2024-03-29T02:22:54Z","title":"SceneTracker: Long-term Scene Flow Estimation Network","summary":" Considering the complementarity of scene flow estimation in the spatial\ndomain's focusing capability and 3D object tracking in the temporal domain's\ncoherence, this study aims to address a comprehensive new task that can\nsimultaneously capture fine-grained and long-term 3D motion in an online\nmanner: long-term scene flow estimation (LSFE). We introduce SceneTracker, a\nnovel learning-based LSFE network that adopts an iterative approach to\napproximate the optimal trajectory. Besides, it dynamically indexes and\nconstructs appearance and depth correlation features simultaneously and employs\nthe Transformer to explore and utilize long-range connections within and\nbetween trajectories. With detailed experiments, SceneTracker shows superior\ncapabilities in handling 3D spatial occlusion and depth noise interference,\nhighly tailored to the LSFE task's needs. The code for SceneTracker is\navailable at https://github.com/wwsource/SceneTracker.\n","authors":["Bo Wang","Jian Li","Yang Yu","Li Liu","Zhenping Sun","Dewen Hu"],"pdf_url":"https://arxiv.org/pdf/2403.19924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17095v2","updated":"2024-03-29T02:18:40Z","published":"2023-11-28T06:42:58Z","title":"Emergent Open-Vocabulary Semantic Segmentation from Off-the-shelf\n Vision-Language Models","summary":" From image-text pairs, large-scale vision-language models (VLMs) learn to\nimplicitly associate image regions with words, which prove effective for tasks\nlike visual question answering. However, leveraging the learned association for\nopen-vocabulary semantic segmentation remains a challenge. In this paper, we\npropose a simple, yet extremely effective, training-free technique,\nPlug-and-Play Open-Vocabulary Semantic Segmentation (PnP-OVSS) for this task.\nPnP-OVSS leverages a VLM with direct text-to-image cross-attention and an\nimage-text matching loss. To balance between over-segmentation and\nunder-segmentation, we introduce Salience Dropout; by iteratively dropping\npatches that the model is most attentive to, we are able to better resolve the\nentire extent of the segmentation mask. PnP-OVSS does not require any\nneural network training and performs hyperparameter tuning without the need for\nany segmentation annotations, even for a validation set. 
PnP-OVSS demonstrates\nsubstantial improvements over comparable baselines (+29.4% mIoU on Pascal VOC,\n+13.2% mIoU on Pascal Context, +14.0% mIoU on MS COCO, and +11.4% mIoU on\nADE-20K.) and even outperforms most baselines that conduct additional network\ntraining on top of pretrained VLMs. Our codebase is at\nhttps://github.com/letitiabanana/PnP-OVSS.\n","authors":["Jiayun Luo","Siddhesh Khandelwal","Leonid Sigal","Boyang Li"],"pdf_url":"https://arxiv.org/pdf/2311.17095v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19920v1","updated":"2024-03-29T02:17:09Z","published":"2024-03-29T02:17:09Z","title":"MI-NeRF: Learning a Single Face NeRF from Multiple Identities","summary":" In this work, we introduce a method that learns a single dynamic neural\nradiance field (NeRF) from monocular talking face videos of multiple\nidentities. NeRFs have shown remarkable results in modeling the 4D dynamics and\nappearance of human faces. However, they require per-identity optimization.\nAlthough recent approaches have proposed techniques to reduce the training and\nrendering time, increasing the number of identities can be expensive. We\nintroduce MI-NeRF (multi-identity NeRF), a single unified network that models\ncomplex non-rigid facial motion for multiple identities, using only monocular\nvideos of arbitrary length. The core premise in our method is to learn the\nnon-linear interactions between identity and non-identity specific information\nwith a multiplicative module. By training on multiple videos simultaneously,\nMI-NeRF not only reduces the total training time compared to standard\nsingle-identity NeRFs, but also demonstrates robustness in synthesizing novel\nexpressions for any input identity. We present results for both facial\nexpression transfer and talking face video synthesis. Our method can be further\npersonalized for a target identity given only a short video.\n","authors":["Aggelina Chatziagapi","Grigorios G. Chrysos","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2403.19920v1.pdf","comment":"Project page: https://aggelinacha.github.io/MI-NeRF/"},{"id":"http://arxiv.org/abs/2403.19919v1","updated":"2024-03-29T02:10:38Z","published":"2024-03-29T02:10:38Z","title":"Diff-Reg v1: Diffusion Matching Model for Registration Problem","summary":" Establishing reliable correspondences is essential for registration tasks\nsuch as 3D and 2D3D registration. Existing methods commonly leverage geometric\nor semantic point features to generate potential correspondences. However,\nthese features may face challenges such as large deformation, scale\ninconsistency, and ambiguous matching problems (e.g., symmetry). Additionally,\nmany previous methods, which rely on single-pass prediction, may struggle with\nlocal minima in complex scenarios. To mitigate these challenges, we introduce a\ndiffusion matching model for robust correspondence construction. Our approach\ntreats correspondence estimation as a denoising diffusion process within the\ndoubly stochastic matrix space, which gradually denoises (refines) a doubly\nstochastic matching matrix to the ground-truth one for high-quality\ncorrespondence estimation. It involves a forward diffusion process that\ngradually introduces Gaussian noise into the ground truth matching matrix and a\nreverse denoising process that iteratively refines the noisy matching matrix.\nIn particular, the feature extraction from the backbone occurs only once during\nthe inference phase. 
Our lightweight denoising module utilizes the same feature\nat each reverse sampling step. Evaluation of our method on both 3D and 2D3D\nregistration tasks confirms its effectiveness.\n","authors":["Qianliang Wu","Haobo Jiang","Lei Luo","Jun Li","Yaqing Ding","Jin Xie","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19919v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2401.00436"},{"id":"http://arxiv.org/abs/2403.19915v1","updated":"2024-03-29T02:03:00Z","published":"2024-03-29T02:03:00Z","title":"Using Images as Covariates: Measuring Curb Appeal with Deep Learning","summary":" This paper details an innovative methodology to integrate image data into\ntraditional econometric models. Motivated by forecasting sales prices for\nresidential real estate, we harness the power of deep learning to add\n\"information\" contained in images as covariates. Specifically, images of homes\nwere categorized and encoded using an ensemble of image classifiers (ResNet-50,\nVGG16, MobileNet, and Inception V3). Unique features presented within each\nimage were further encoded through panoptic segmentation. Forecasts from a\nneural network trained on the encoded data results in improved out-of-sample\npredictive power. We also combine these image-based forecasts with standard\nhedonic real estate property and location characteristics, resulting in a\nunified dataset. We show that image-based forecasts increase the accuracy of\nhedonic forecasts when encoded features are regarded as additional covariates.\nWe also attempt to \"explain\" which covariates the image-based forecasts are\nmost highly correlated with. The study exemplifies the benefits of\ninterdisciplinary methodologies, merging machine learning and econometrics to\nharness untapped data sources for more accurate forecasting.\n","authors":["Ardyn Nordstrom","Morgan Nordstrom","Matthew D. Webb"],"pdf_url":"https://arxiv.org/pdf/2403.19915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19912v1","updated":"2024-03-29T01:46:11Z","published":"2024-03-29T01:46:11Z","title":"Automated Identification and Segmentation of Hi Sources in CRAFTS Using\n Deep Learning Method","summary":" We introduce a machine learning-based method for extracting HI sources from\n3D spectral data, and construct a dedicated dataset of HI sources from CRAFTS.\nOur custom dataset provides comprehensive resources for HI source detection.\nUtilizing the 3D-Unet segmentation architecture, our method reliably identifies\nand segments HI sources, achieving notable performance metrics with recall\nrates reaching 91.6% and accuracy levels at 95.7%. These outcomes substantiate\nthe value of our custom dataset and the efficacy of our proposed network in\nidentifying HI source. Our code is publicly available at\nhttps://github.com/fishszh/HISF.\n","authors":["Zihao Song","Huaxi Chen","Donghui Quan","Di Li","Yinghui Zheng","Shulei Ni","Yunchuan Chen","Yun Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.19912v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2402.13729v3","updated":"2024-03-29T01:42:02Z","published":"2024-02-21T11:46:16Z","title":"Hybrid Video Diffusion Models with 2D Triplane and 3D Wavelet\n Representation","summary":" Generating high-quality videos that synthesize desired realistic content is a\nchallenging task due to their intricate high-dimensionality and complexity of\nvideos. 
Several recent diffusion-based methods have shown comparable\nperformance by compressing videos to a lower-dimensional latent space, using a\ntraditional video autoencoder architecture. However, such methods that employ\nstandard frame-wise 2D and 3D convolutions fail to fully exploit the\nspatio-temporal nature of videos. To address this issue, we propose a novel\nhybrid video diffusion model, called HVDM, which can capture spatio-temporal\ndependencies more effectively. The HVDM is trained by a hybrid video\nautoencoder which extracts a disentangled representation of the video\nincluding: (i) global context information captured by a 2D projected latent,\n(ii) local volume information captured by 3D convolutions with wavelet\ndecomposition, and (iii) frequency information for improving the video\nreconstruction. Based on this disentangled representation, our hybrid\nautoencoder provides a more comprehensive video latent, enriching the generated\nvideos with fine structures and details. Experiments on video generation\nbenchmarks (UCF101, SkyTimelapse, and TaiChi) demonstrate that the proposed\napproach achieves state-of-the-art video generation quality, showing a wide\nrange of video applications (e.g., long video generation, image-to-video, and\nvideo dynamics control).\n","authors":["Kihong Kim","Haneol Lee","Jihye Park","Seyeon Kim","Kwanghee Lee","Seungryong Kim","Jaejun Yoo"],"pdf_url":"https://arxiv.org/pdf/2402.13729v3.pdf","comment":"17 pages, 13 figures"},{"id":"http://arxiv.org/abs/2311.15153v4","updated":"2024-03-29T01:18:37Z","published":"2023-11-26T01:05:55Z","title":"Predicting Gradient is Better: Exploring Self-Supervised Learning for\n SAR ATR with a Joint-Embedding Predictive Architecture","summary":" The growing Synthetic Aperture Radar (SAR) data has the potential to build a\nfoundation model through Self-Supervised Learning (SSL) methods, which can\nachieve various SAR Automatic Target Recognition (ATR) tasks with pre-training\non large-scale unlabeled data and fine-tuning on small labeled samples. SSL\naims to construct supervision signals directly from the data, which minimizes\nthe need for expensive expert annotation and maximizes the use of the expanding\ndata pool for a foundational model. This study investigates an effective SSL\nmethod for SAR ATR, which can pave the way for a foundation model in SAR ATR.\nThe primary obstacles faced in SSL for SAR ATR are the small targets in remote\nsensing and speckle noise in SAR images, corresponding to the SSL approach and\nsignals. To overcome these challenges, we present a novel Joint-Embedding\nPredictive Architecture for SAR ATR (SAR-JEPA), which leverages local masked\npatches to predict the multi-scale SAR gradient representations of unseen\ncontext. The key aspect of SAR-JEPA is integrating SAR domain features to\nensure high-quality self-supervised signals as target features. Besides, we\nemploy local masks and multi-scale features to accommodate the various small\ntargets in remote sensing. By fine-tuning and evaluating our framework on three\ntarget recognition datasets (vehicle, ship, and aircraft) with four other\ndatasets as pre-training, we demonstrate its superiority over other SSL\nmethods and its effectiveness with increasing SAR data. 
This study showcases\nthe potential of SSL for SAR target recognition across diverse targets, scenes,\nand sensors.\n","authors":["Weijie Li","Yang Wei","Tianpeng Liu","Yuenan Hou","Yuxuan Li","Zhen Liu","Yongxiang Liu","Li Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15153v4.pdf","comment":"Our codes at https://github.com/waterdisappear/SAR-JEPA"},{"id":"http://arxiv.org/abs/2403.19905v1","updated":"2024-03-29T01:11:56Z","published":"2024-03-29T01:11:56Z","title":"Classification of Diabetic Retinopathy using Pre-Trained Deep Learning\n Models","summary":" Diabetic Retinopathy (DR) stands as the leading cause of blindness globally,\nparticularly affecting individuals between the ages of 20 and 70. This paper\npresents a Computer-Aided Diagnosis (CAD) system designed for the automatic\nclassification of retinal images into five distinct classes: Normal, Mild,\nModerate, Severe, and Proliferative Diabetic Retinopathy (PDR). The proposed\nsystem leverages Convolutional Neural Networks (CNNs) employing pre-trained\ndeep learning models. Through the application of fine-tuning techniques, our\nmodel is trained on fundus images of diabetic retinopathy with resolutions of\n350x350x3 and 224x224x3. Experimental results obtained on the Kaggle platform,\nutilizing resources comprising 4 CPUs, 17 GB RAM, and 1 GB Disk, demonstrate\nthe efficacy of our approach. The achieved Area Under the Curve (AUC) values\nfor CNN, MobileNet, VGG-16, InceptionV3, and InceptionResNetV2 models are 0.50,\n0.70, 0.53, 0.63, and 0.69, respectively.\n","authors":["Inas Al-Kamachy","Prof. Dr. Reza Hassanpour","Prof. Roya Choupani"],"pdf_url":"https://arxiv.org/pdf/2403.19905v1.pdf","comment":"3 pages, 1 figure, 1 table"},{"id":"http://arxiv.org/abs/2310.10375v2","updated":"2024-03-29T01:08:12Z","published":"2023-10-16T13:16:09Z","title":"GTA: A Geometry-Aware Attention Mechanism for Multi-View Transformers","summary":" As transformers are equivariant to the permutation of input tokens, encoding\nthe positional information of tokens is necessary for many tasks. However,\nsince existing positional encoding schemes have been initially designed for NLP\ntasks, their suitability for vision tasks, which typically exhibit different\nstructural properties in their data, is questionable. We argue that existing\npositional encoding schemes are suboptimal for 3D vision tasks, as they do not\nrespect their underlying 3D geometric structure. Based on this hypothesis, we\npropose a geometry-aware attention mechanism that encodes the geometric\nstructure of tokens as relative transformation determined by the geometric\nrelationship between queries and key-value pairs. By evaluating on multiple\nnovel view synthesis (NVS) datasets in the sparse wide-baseline multi-view\nsetting, we show that our attention, called Geometric Transform Attention\n(GTA), improves learning efficiency and performance of state-of-the-art\ntransformer-based NVS models without any additional learned parameters and only\nminor computational overhead.\n","authors":["Takeru Miyato","Bernhard Jaeger","Max Welling","Andreas Geiger"],"pdf_url":"https://arxiv.org/pdf/2310.10375v2.pdf","comment":"Published as a conference paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2403.19904v1","updated":"2024-03-29T01:07:20Z","published":"2024-03-29T01:07:20Z","title":"Fully Geometric Panoramic Localization","summary":" We introduce a lightweight and accurate localization method that only\nutilizes the geometry of 2D-3D lines. 
Given a pre-captured 3D map, our approach\nlocalizes a panorama image, taking advantage of the holistic 360 view. The\nsystem mitigates potential privacy breaches or domain discrepancies by avoiding\ntrained or hand-crafted visual descriptors. However, as lines alone can be\nambiguous, we express distinctive yet compact spatial contexts from\nrelationships between lines, namely the dominant directions of parallel lines\nand the intersection between non-parallel lines. The resulting representations\nare efficient in processing time and memory compared to conventional visual\ndescriptor-based methods. Given the groups of dominant line directions and\ntheir intersections, we accelerate the search process to test thousands of pose\ncandidates in less than a millisecond without sacrificing accuracy. We\nempirically show that the proposed 2D-3D matching can localize panoramas for\nchallenging scenes with similar structures, dramatic domain shifts or\nillumination changes. Our fully geometric approach does not involve extensive\nparameter tuning or neural network training, making it a practical algorithm\nthat can be readily deployed in the real world. Project page including the code\nis available through this link: https://82magnolia.github.io/fgpl/.\n","authors":["Junho Kim","Jiwon Jeong","Young Min Kim"],"pdf_url":"https://arxiv.org/pdf/2403.19904v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19902v1","updated":"2024-03-29T01:05:23Z","published":"2024-03-29T01:05:23Z","title":"Heterogeneous Network Based Contrastive Learning Method for PolSAR Land\n Cover Classification","summary":" Polarimetric synthetic aperture radar (PolSAR) image interpretation is widely\nused in various fields. Recently, deep learning has made significant progress\nin PolSAR image classification. Supervised learning (SL) requires a large\namount of labeled PolSAR data with high quality to achieve better performance;\nhowever, manually labeled data is insufficient. This causes SL to fall into\noverfitting and degrades its generalization performance. Furthermore, the\nscattering confusion problem is also a significant challenge that attracts more\nattention. To solve these problems, this article proposes a Heterogeneous\nNetwork based Contrastive Learning method (HCLNet). It aims to learn high-level\nrepresentation from unlabeled PolSAR data for few-shot classification according\nto multi-features and superpixels. Beyond the conventional CL, HCLNet\nintroduces the heterogeneous architecture for the first time to utilize\nheterogeneous PolSAR features better. It also develops two easy-to-use plugins\nto narrow the domain gap between optics and PolSAR, including a feature filter\nand superpixel-based instance discrimination, where the former is used to\nenhance the complementarity of multi-features, and the latter is used to\nincrease the diversity of negative samples. Experiments demonstrate the\nsuperiority of HCLNet on three widely used PolSAR benchmark datasets compared\nwith state-of-the-art methods. Ablation studies also verify the importance of\neach component. 
Besides, this work has implications for how to efficiently\nutilize the multi-features of PolSAR data to learn better high-level\nrepresentation in CL and how to construct networks suitable for PolSAR data\nbetter.\n","authors":["Jianfeng Cai","Yue Ma","Zhixi Feng","Shuyuan Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15585v3","updated":"2024-03-29T00:44:18Z","published":"2024-03-22T19:19:51Z","title":"MedPromptX: Grounded Multimodal Prompting for Chest X-ray Diagnosis","summary":" Chest X-ray images are commonly used for predicting acute and chronic\ncardiopulmonary conditions, but efforts to integrate them with structured\nclinical data face challenges due to incomplete electronic health records\n(EHR). This paper introduces MedPromptX, the first model to integrate\nmultimodal large language models (MLLMs), few-shot prompting (FP) and visual\ngrounding (VG) to combine imagery with EHR data for chest X-ray diagnosis. A\npre-trained MLLM is utilized to complement the missing EHR information,\nproviding a comprehensive understanding of patients' medical history.\nAdditionally, FP reduces the necessity for extensive training of MLLMs while\neffectively tackling the issue of hallucination. Nevertheless, the process of\ndetermining the optimal number of few-shot examples and selecting high-quality\ncandidates can be burdensome, yet it profoundly influences model performance.\nHence, we propose a new technique that dynamically refines few-shot data for\nreal-time adjustment to new patient scenarios. Moreover, VG aids in focusing\nthe model's attention on relevant regions of interest in X-ray images,\nenhancing the identification of abnormalities. We release MedPromptX-VQA, a new\nin-context visual question answering dataset encompassing interleaved image and\nEHR data derived from MIMIC-IV and MIMIC-CXR databases. Results demonstrate the\nSOTA performance of MedPromptX, achieving an 11% improvement in F1-score\ncompared to the baselines. Code and data are available at\nhttps://github.com/BioMedIA-MBZUAI/MedPromptX\n","authors":["Mai A. Shaaban","Adnan Khan","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.15585v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19897v1","updated":"2024-03-29T00:36:38Z","published":"2024-03-29T00:36:38Z","title":"Disentangling Racial Phenotypes: Fine-Grained Control of Race-related\n Facial Phenotype Characteristics","summary":" Achieving an effective fine-grained appearance variation over 2D facial\nimages, whilst preserving facial identity, is a challenging task due to the\nhigh complexity and entanglement of common 2D facial feature encoding spaces.\nDespite these challenges, such fine-grained control, by way of disentanglement\nis a crucial enabler for data-driven racial bias mitigation strategies across\nmultiple automated facial analysis tasks, as it allows to analyse, characterise\nand synthesise human facial diversity. In this paper, we propose a novel GAN\nframework to enable fine-grained control over individual race-related phenotype\nattributes of the facial images. Our framework factors the latent (feature)\nspace into elements that correspond to race-related facial phenotype\nrepresentations, thereby separating phenotype aspects (e.g. skin, hair colour,\nnose, eye, mouth shapes), which are notoriously difficult to annotate robustly\nin real-world facial data. 
Concurrently, we also introduce a high quality\naugmented, diverse 2D face image dataset drawn from CelebA-HQ for GAN training.\nUnlike prior work, our framework only relies upon 2D imagery and related\nparameters to achieve state-of-the-art individual control over race-related\nphenotype attributes with improved photo-realistic output.\n","authors":["Seyma Yucer","Amir Atapour Abarghouei","Noura Al Moubayed","Toby P. Breckon"],"pdf_url":"https://arxiv.org/pdf/2403.19897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19896v1","updated":"2024-03-29T00:33:37Z","published":"2024-03-29T00:33:37Z","title":"Nonlinearity Enhanced Adaptive Activation Function","summary":" A simply implemented activation function with even cubic nonlinearity is\nintroduced that increases the accuracy of neural networks without substantial\nadditional computational resources. This is partially enabled through an\napparent tradeoff between convergence and accuracy. The activation function\ngeneralizes the standard RELU function by introducing additional degrees of\nfreedom through optimizable parameters that enable the degree of nonlinearity\nto be adjusted. The associated accuracy enhancement is quantified in the\ncontext of the MNIST digit data set through a comparison with standard\ntechniques.\n","authors":["David Yevick"],"pdf_url":"https://arxiv.org/pdf/2403.19896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19893v1","updated":"2024-03-29T00:28:26Z","published":"2024-03-29T00:28:26Z","title":"PLoc: A New Evaluation Criterion Based on Physical Location for\n Autonomous Driving Datasets","summary":" Autonomous driving has garnered significant attention as a key research area\nwithin artificial intelligence. In the context of autonomous driving scenarios,\nthe varying physical locations of objects correspond to different levels of\ndanger. However, conventional evaluation criteria for automatic driving object\ndetection often overlook the crucial aspect of an object's physical location,\nleading to evaluation results that may not accurately reflect the genuine\nthreat posed by the object to the autonomous driving vehicle. To enhance the\nsafety of autonomous driving, this paper introduces a novel evaluation\ncriterion based on physical location information, termed PLoc. This criterion\ntranscends the limitations of traditional criteria by acknowledging that the\nphysical location of pedestrians in autonomous driving scenarios can provide\nvaluable safety-related information. Furthermore, this paper presents a newly\nre-annotated dataset (ApolloScape-R) derived from ApolloScape. ApolloScape-R\ninvolves the relabeling of pedestrians based on the significance of their\nphysical location. The dataset is utilized to assess the performance of various\nobject detection models under the proposed PLoc criterion. Experimental results\ndemonstrate that the average accuracy of all object detection models in\nidentifying a person situated in the travel lane of an autonomous vehicle is\nlower than that for a person on a sidewalk. 
The dataset is publicly available\nat https://github.com/lnyrlyed/ApolloScape-R.git\n","authors":["Ruining Yang","Yuqi Peng"],"pdf_url":"https://arxiv.org/pdf/2403.19893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19888v1","updated":"2024-03-29T00:05:13Z","published":"2024-03-29T00:05:13Z","title":"MambaMixer: Efficient Selective State Space Models with Dual Token and\n Channel Selection","summary":" Recent advances in deep learning have mainly relied on Transformers due to\ntheir data dependency and ability to learn at scale. The attention module in\nthese architectures, however, exhibits quadratic time and space in input size,\nlimiting their scalability for long-sequence modeling. Despite recent attempts\nto design efficient and effective architecture backbone for multi-dimensional\ndata, such as images and multivariate time series, existing models are either\ndata independent, or fail to allow inter- and intra-dimension communication.\nRecently, State Space Models (SSMs), and more specifically Selective State\nSpace Models, with efficient hardware-aware implementation, have shown\npromising potential for long sequence modeling. Motivated by the success of\nSSMs, we present MambaMixer, a new architecture with data-dependent weights\nthat uses a dual selection mechanism across tokens and channels, called\nSelective Token and Channel Mixer. MambaMixer connects selective mixers using a\nweighted averaging mechanism, allowing layers to have direct access to early\nfeatures. As a proof of concept, we design Vision MambaMixer (ViM2) and Time\nSeries MambaMixer (TSM2) architectures based on the MambaMixer block and\nexplore their performance in various vision and time series forecasting tasks.\nOur results underline the importance of selective mixing across both tokens and\nchannels. In ImageNet classification, object detection, and semantic\nsegmentation tasks, ViM2 achieves competitive performance with well-established\nvision models and outperforms SSM-based vision models. In time series\nforecasting, TSM2 achieves outstanding performance compared to state-of-the-art\nmethods while demonstrating significantly improved computational cost. These\nresults show that while Transformers, cross-channel attention, and MLPs are\nsufficient for good performance in time series forecasting, neither is\nnecessary.\n","authors":["Ali Behrouz","Michele Santacatterina","Ramin Zabih"],"pdf_url":"https://arxiv.org/pdf/2403.19888v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2404.00191v1","updated":"2024-03-29T23:17:49Z","published":"2024-03-29T23:17:49Z","title":"Optimal Blackjack Strategy Recommender: A Comprehensive Study on\n Computer Vision Integration for Enhanced Gameplay","summary":" This research project investigates the application of several computer vision\ntechniques for playing card detection and recognition in the context of the\npopular casino game, blackjack. The primary objective is to develop a robust\nsystem that is capable of detecting and accurately classifying playing cards in\nreal-time, and displaying the optimal move recommendation based on the given\nimage of the current game. The proposed methodology involves using K-Means for\nimage segmentation, card reprojection and feature extraction, training of the\nKNN classifier using a labeled dataset, and integration of the detection system\ninto a Blackjack Basic Strategy recommendation algorithm. 
Further, the study\naims to observe the effectiveness of this approach in detecting various card\ndesigns under different lighting conditions and occlusions. Overall, the\nproject examines the potential benefits of incorporating computer vision\ntechniques, with a specific focus on card detection, into commonly played games\naiming to enhance player decision-making and optimize strategic outcomes. The\nresults obtained from our experimental evaluations with models developed under\nconsiderable time constraints, highlight the potential for practical\nimplementation in real-world casino environments and across other similarly\nstructured games.\n","authors":["Krishnanshu Gupta","Devon Bolt","Ben Hinchliff"],"pdf_url":"https://arxiv.org/pdf/2404.00191v1.pdf","comment":"24 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.00185v1","updated":"2024-03-29T22:51:45Z","published":"2024-03-29T22:51:45Z","title":"On Inherent Adversarial Robustness of Active Vision Systems","summary":" Current Deep Neural Networks are vulnerable to adversarial examples, which\nalter their predictions by adding carefully crafted noise. Since human eyes are\nrobust to such inputs, it is possible that the vulnerability stems from the\nstandard way of processing inputs in one shot by processing every pixel with\nthe same importance. In contrast, neuroscience suggests that the human vision\nsystem can differentiate salient features by (1) switching between multiple\nfixation points (saccades) and (2) processing the surrounding with a\nnon-uniform external resolution (foveation). In this work, we advocate that the\nintegration of such active vision mechanisms into current deep learning systems\ncan offer robustness benefits. Specifically, we empirically demonstrate the\ninherent robustness of two active vision methods - GFNet and FALcon - under a\nblack box threat model. By learning and inferencing based on downsampled\nglimpses obtained from multiple distinct fixation points within an input, we\nshow that these active methods achieve (2-3) times greater robustness compared\nto a standard passive convolutional network under state-of-the-art adversarial\nattacks. More importantly, we provide illustrative and interpretable\nvisualization analysis that demonstrates how performing inference from distinct\nfixation points makes active vision methods less vulnerable to malicious\ninputs.\n","authors":["Amitangshu Mukherjee","Timur Ibrayev","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2404.00185v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00179v1","updated":"2024-03-29T22:24:12Z","published":"2024-03-29T22:24:12Z","title":"Multi-Region Transfer Learning for Segmentation of Crop Field Boundaries\n in Satellite Images with Limited Labels","summary":" The goal of field boundary delineation is to predict the polygonal boundaries\nand interiors of individual crop fields in overhead remotely sensed images\n(e.g., from satellites or drones). Automatic delineation of field boundaries is\na necessary task for many real-world use cases in agriculture, such as\nestimating cultivated area in a region or predicting end-of-season yield in a\nfield. Field boundary delineation can be framed as an instance segmentation\nproblem, but presents unique research challenges compared to traditional\ncomputer vision datasets used for instance segmentation. 
The practical\napplicability of previous work is also limited by the assumption that a\nsufficiently-large labeled dataset is available where field boundary\ndelineation models will be applied, which is not the reality for most regions\n(especially under-resourced regions such as Sub-Saharan Africa). We present an\napproach for segmentation of crop field boundaries in satellite images in\nregions lacking labeled data that uses multi-region transfer learning to adapt\nmodel weights for the target region. We show that our approach outperforms\nexisting methods and that multi-region transfer learning substantially boosts\nperformance for multiple model architectures. Our implementation and datasets\nare publicly available to enable use of the approach by end-users and serve as\na benchmark for future work.\n","authors":["Hannah Kerner","Saketh Sundar","Mathan Satish"],"pdf_url":"https://arxiv.org/pdf/2404.00179v1.pdf","comment":"Accepted for 2023 AAAI Workshop on AI to Accelerate Science and\n Engineering"},{"id":"http://arxiv.org/abs/2404.00172v1","updated":"2024-03-29T22:03:53Z","published":"2024-03-29T22:03:53Z","title":"Universal Bovine Identification via Depth Data and Deep Metric Learning","summary":" This paper proposes and evaluates, for the first time, a top-down (dorsal\nview), depth-only deep learning system for accurately identifying individual\ncattle and provides associated code, datasets, and training weights for\nimmediate reproducibility. An increase in herd size skews the cow-to-human\nratio at the farm and makes the manual monitoring of individuals more\nchallenging. Therefore, real-time cattle identification is essential for the\nfarms and a crucial step towards precision livestock farming. Underpinned by\nour previous work, this paper introduces a deep-metric learning method for\ncattle identification using depth data from an off-the-shelf 3D camera. The\nmethod relies on CNN and MLP backbones that learn well-generalised embedding\nspaces from the body shape to differentiate individuals -- requiring neither\nspecies-specific coat patterns nor close-up muzzle prints for operation. The\nnetwork embeddings are clustered using a simple algorithm such as $k$-NN for\nhighly accurate identification, thus eliminating the need to retrain the\nnetwork for enrolling new individuals. We evaluate two backbone architectures,\nResNet, as previously used to identify Holstein Friesians using RGB images, and\nPointNet, which is specialised to operate on 3D point clouds. We also present\nCowDepth2023, a new dataset containing 21,490 synchronised colour-depth image\npairs of 99 cows, to evaluate the backbones. Both ResNet and PointNet\narchitectures, which consume depth maps and point clouds, respectively, led to\nhigh accuracy that is on par with the coat pattern-based backbone.\n","authors":["Asheesh Sharma","Lucy Randewich","William Andrew","Sion Hannuna","Neill Campbell","Siobhan Mullan","Andrew W. Dowsey","Melvyn Smith","Mark Hansen","Tilo Burghardt"],"pdf_url":"https://arxiv.org/pdf/2404.00172v1.pdf","comment":"LaTeX, 38 pages, 14 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.00168v1","updated":"2024-03-29T21:52:01Z","published":"2024-03-29T21:52:01Z","title":"Multi-Level Neural Scene Graphs for Dynamic Urban Environments","summary":" We estimate the radiance field of large-scale dynamic areas from multiple\nvehicle captures under varying environmental conditions. 
Previous works in this\ndomain are either restricted to static environments, do not scale to more than\na single short video, or struggle to separately represent dynamic object\ninstances. To this end, we present a novel, decomposable radiance field\napproach for dynamic urban environments. We propose a multi-level neural scene\ngraph representation that scales to thousands of images from dozens of\nsequences with hundreds of fast-moving objects. To enable efficient training\nand rendering of our representation, we develop a fast composite ray sampling\nand rendering scheme. To test our approach in urban driving scenarios, we\nintroduce a new, novel view synthesis benchmark. We show that our approach\noutperforms prior art by a significant margin on both established and our\nproposed benchmark while being faster in training and rendering.\n","authors":["Tobias Fischer","Lorenzo Porzi","Samuel Rota Bulò","Marc Pollefeys","Peter Kontschieder"],"pdf_url":"https://arxiv.org/pdf/2404.00168v1.pdf","comment":"CVPR 2024. Project page is available at\n https://tobiasfshr.github.io/pub/ml-nsg/"},{"id":"http://arxiv.org/abs/2404.00166v1","updated":"2024-03-29T21:45:53Z","published":"2024-03-29T21:45:53Z","title":"Uncovering Bias in Large Vision-Language Models with Counterfactuals","summary":" With the advent of Large Language Models (LLMs) possessing increasingly\nimpressive capabilities, a number of Large Vision-Language Models (LVLMs) have\nbeen proposed to augment LLMs with visual inputs. Such models condition\ngenerated text on both an input image and a text prompt, enabling a variety of\nuse cases such as visual question answering and multimodal chat. While prior\nstudies have examined the social biases contained in text generated by LLMs,\nthis topic has been relatively unexplored in LVLMs. Examining social biases in\nLVLMs is particularly challenging due to the confounding contributions of bias\ninduced by information contained across the text and visual modalities. To\naddress this challenging problem, we conduct a large-scale study of text\ngenerated by different LVLMs under counterfactual changes to input images.\nSpecifically, we present LVLMs with identical open-ended text prompts while\nconditioning on images from different counterfactual sets, where each set\ncontains images which are largely identical in their depiction of a common\nsubject (e.g., a doctor), but vary only in terms of intersectional social\nattributes (e.g., race and gender). We comprehensively evaluate the text\nproduced by different LVLMs under this counterfactual generation setting and\nfind that social attributes such as race, gender, and physical characteristics\ndepicted in input images can significantly influence toxicity and the\ngeneration of competency-associated words.\n","authors":["Phillip Howard","Anahita Bhiwandiwalla","Kathleen C. Fraser","Svetlana Kiritchenko"],"pdf_url":"https://arxiv.org/pdf/2404.00166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00163v1","updated":"2024-03-29T21:40:12Z","published":"2024-03-29T21:40:12Z","title":"CT respiratory motion synthesis using joint supervised and adversarial\n learning","summary":" Objective: Four-dimensional computed tomography (4DCT) imaging consists in\nreconstructing a CT acquisition into multiple phases to track internal organ\nand tumor motion. It is commonly used in radiotherapy treatment planning to\nestablish planning target volumes. 
However, 4DCT increases protocol complexity,\nmay not align with patient breathing during treatment, and lead to higher\nradiation delivery. Approach: In this study, we propose a deep synthesis method\nto generate pseudo respiratory CT phases from static images for motion-aware\ntreatment planning. The model produces patient-specific deformation vector\nfields (DVFs) by conditioning synthesis on external patient surface-based\nestimation, mimicking respiratory monitoring devices. A key methodological\ncontribution is to encourage DVF realism through supervised DVF training while\nusing an adversarial term jointly not only on the warped image but also on the\nmagnitude of the DVF itself. This way, we avoid excessive smoothness typically\nobtained through deep unsupervised learning, and encourage correlations with\nthe respiratory amplitude. Main results: Performance is evaluated using real\n4DCT acquisitions with smaller tumor volumes than previously reported. Results\ndemonstrate for the first time that the generated pseudo-respiratory CT phases\ncan capture organ and tumor motion with similar accuracy to repeated 4DCT scans\nof the same patient. Mean inter-scans tumor center-of-mass distances and Dice\nsimilarity coefficients were $1.97$mm and $0.63$, respectively, for real 4DCT\nphases and $2.35$mm and $0.71$ for synthetic phases, and compares favorably to\na state-of-the-art technique (RMSim).\n","authors":["Yi-Heng Cao","Vincent Bourbonne","François Lucia","Ulrike Schick","Julien Bert","Vincent Jaouen","Dimitris Visvikis"],"pdf_url":"https://arxiv.org/pdf/2404.00163v1.pdf","comment":"to appear in Phys. Med. Biol"},{"id":"http://arxiv.org/abs/2404.00149v1","updated":"2024-03-29T20:43:55Z","published":"2024-03-29T20:43:55Z","title":"VSRD: Instance-Aware Volumetric Silhouette Rendering for Weakly\n Supervised 3D Object Detection","summary":" Monocular 3D object detection poses a significant challenge in 3D scene\nunderstanding due to its inherently ill-posed nature in monocular depth\nestimation. Existing methods heavily rely on supervised learning using abundant\n3D labels, typically obtained through expensive and labor-intensive annotation\non LiDAR point clouds. To tackle this problem, we propose a novel weakly\nsupervised 3D object detection framework named VSRD (Volumetric Silhouette\nRendering for Detection) to train 3D object detectors without any 3D\nsupervision but only weak 2D supervision. VSRD consists of multi-view 3D\nauto-labeling and subsequent training of monocular 3D object detectors using\nthe pseudo labels generated in the auto-labeling stage. In the auto-labeling\nstage, we represent the surface of each instance as a signed distance field\n(SDF) and render its silhouette as an instance mask through our proposed\ninstance-aware volumetric silhouette rendering. To directly optimize the 3D\nbounding boxes through rendering, we decompose the SDF of each instance into\nthe SDF of a cuboid and the residual distance field (RDF) that represents the\nresidual from the cuboid. This mechanism enables us to optimize the 3D bounding\nboxes in an end-to-end manner by comparing the rendered instance masks with the\nground truth instance masks. The optimized 3D bounding boxes serve as effective\ntraining data for 3D object detection. We conduct extensive experiments on the\nKITTI-360 dataset, demonstrating that our method outperforms the existing\nweakly supervised 3D object detection methods. 
The code is available at\nhttps://github.com/skmhrk1209/VSRD.\n","authors":["Zihua Liu","Hiroki Sakuma","Masatoshi Okutomi"],"pdf_url":"https://arxiv.org/pdf/2404.00149v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00146v1","updated":"2024-03-29T20:39:37Z","published":"2024-03-29T20:39:37Z","title":"Fast OMP for Exact Recovery and Sparse Approximation","summary":" Orthogonal Matching Pursuit (OMP) has been a powerful method in sparse signal\nrecovery and approximation. However OMP suffers computational issue when the\nsignal has large number of non-zeros. This paper advances OMP in two fronts: it\noffers a fast algorithm for the orthogonal projection of the input signal at\neach iteration, and a new selection criterion for making the greedy choice,\nwhich reduces the number of iterations it takes to recover the signal. The\nproposed modifications to OMP directly reduce the computational complexity.\nExperiment results show significant improvement over the classical OMP in\ncomputation time. The paper also provided a sufficient condition for exact\nrecovery under the new greedy choice criterion. For general signals that may\nnot have sparse representations, the paper provides a bound for the\napproximation error. The approximation error is at the same order as OMP but is\nobtained within fewer iterations and less time.\n","authors":["Huiyuan Yu","Jia He","Maggie Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.00146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00144v1","updated":"2024-03-29T20:32:30Z","published":"2024-03-29T20:32:30Z","title":"An Interpretable Cross-Attentive Multi-modal MRI Fusion Framework for\n Schizophrenia Diagnosis","summary":" Both functional and structural magnetic resonance imaging (fMRI and sMRI) are\nwidely used for the diagnosis of mental disorder. However, combining\ncomplementary information from these two modalities is challenging due to their\nheterogeneity. Many existing methods fall short of capturing the interaction\nbetween these modalities, frequently defaulting to a simple combination of\nlatent features. In this paper, we propose a novel Cross-Attentive Multi-modal\nFusion framework (CAMF), which aims to capture both intra-modal and inter-modal\nrelationships between fMRI and sMRI, enhancing multi-modal data representation.\nSpecifically, our CAMF framework employs self-attention modules to identify\ninteractions within each modality while cross-attention modules identify\ninteractions between modalities. Subsequently, our approach optimizes the\nintegration of latent features from both modalities. This approach\nsignificantly improves classification accuracy, as demonstrated by our\nevaluations on two extensive multi-modal brain imaging datasets, where CAMF\nconsistently outperforms existing methods. Furthermore, the gradient-guided\nScore-CAM is applied to interpret critical functional networks and brain\nregions involved in schizophrenia. 
The bio-markers identified by CAMF align\nwith established research, potentially offering new insights into the diagnosis\nand pathological endophenotypes of schizophrenia.\n","authors":["Ziyu Zhou","Anton Orlichenko","Gang Qu","Zening Fu","Vince D Calhoun","Zhengming Ding","Yu-Ping Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00132v1","updated":"2024-03-29T19:58:13Z","published":"2024-03-29T19:58:13Z","title":"FetalDiffusion: Pose-Controllable 3D Fetal MRI Synthesis with\n Conditional Diffusion Model","summary":" The quality of fetal MRI is significantly affected by unpredictable and\nsubstantial fetal motion, leading to the introduction of artifacts even when\nfast acquisition sequences are employed. The development of 3D real-time fetal\npose estimation approaches on volumetric EPI fetal MRI opens up a promising\navenue for fetal motion monitoring and prediction. Challenges arise in fetal\npose estimation due to limited number of real scanned fetal MR training images,\nhindering model generalization when the acquired fetal MRI lacks adequate pose.\n In this study, we introduce FetalDiffusion, a novel approach utilizing a\nconditional diffusion model to generate 3D synthetic fetal MRI with\ncontrollable pose. Additionally, an auxiliary pose-level loss is adopted to\nenhance model performance. Our work demonstrates the success of this proposed\nmodel by producing high-quality synthetic fetal MRI images with accurate and\nrecognizable fetal poses, comparing favorably with in-vivo real fetal MRI.\nFurthermore, we show that the integration of synthetic fetal MR images enhances\nthe fetal pose estimation model's performance, particularly when the number of\navailable real scanned data is limited resulting in 15.4% increase in PCK and\n50.2% reduced in mean error. All experiments are done on a single 32GB V100\nGPU. Our method holds promise for improving real-time tracking models, thereby\naddressing fetal motion issues more effectively.\n","authors":["Molin Zhang","Polina Golland","Patricia Ellen Grant","Elfar Adalsteinsson"],"pdf_url":"https://arxiv.org/pdf/2404.00132v1.pdf","comment":"8 pages, 3 figures, 2 tables, submitted to MICCAI 2024, code\n available if accepted"},{"id":"http://arxiv.org/abs/2404.00130v1","updated":"2024-03-29T19:51:34Z","published":"2024-03-29T19:51:34Z","title":"FISBe: A real-world benchmark dataset for instance segmentation of\n long-range thin filamentous structures","summary":" Instance segmentation of neurons in volumetric light microscopy images of\nnervous systems enables groundbreaking research in neuroscience by facilitating\njoint functional and morphological analyses of neural circuits at cellular\nresolution. Yet said multi-neuron light microscopy data exhibits extremely\nchallenging properties for the task of instance segmentation: Individual\nneurons have long-ranging, thin filamentous and widely branching morphologies,\nmultiple neurons are tightly inter-weaved, and partial volume effects, uneven\nillumination and noise inherent to light microscopy severely impede local\ndisentangling as well as long-range tracing of individual neurons. These\nproperties reflect a current key challenge in machine learning research, namely\nto effectively capture long-range dependencies in the data. While respective\nmethodological research is buzzing, to date methods are typically benchmarked\non synthetic datasets. 
To address this gap, we release the FlyLight Instance\nSegmentation Benchmark (FISBe) dataset, the first publicly available\nmulti-neuron light microscopy dataset with pixel-wise annotations. In addition,\nwe define a set of instance segmentation metrics for benchmarking that we\ndesigned to be meaningful with regard to downstream analyses. Lastly, we\nprovide three baselines to kick off a competition that we envision to both\nadvance the field of machine learning regarding methodology for capturing\nlong-range data dependencies, and facilitate scientific discovery in basic\nneuroscience.\n","authors":["Lisa Mais","Peter Hirsch","Claire Managan","Ramya Kandarpa","Josef Lorenz Rumberger","Annika Reinke","Lena Maier-Hein","Gudrun Ihrke","Dagmar Kainmueller"],"pdf_url":"https://arxiv.org/pdf/2404.00130v1.pdf","comment":"CVPR2024, Project page: https://kainmueller-lab.github.io/fisbe"},{"id":"http://arxiv.org/abs/2404.00122v1","updated":"2024-03-29T19:25:09Z","published":"2024-03-29T19:25:09Z","title":"AgileFormer: Spatially Agile Transformer UNet for Medical Image\n Segmentation","summary":" In the past decades, deep neural networks, particularly convolutional neural\nnetworks, have achieved state-of-the-art performance in a variety of medical\nimage segmentation tasks. Recently, the introduction of the vision transformer\n(ViT) has significantly altered the landscape of deep segmentation models.\nThere has been a growing focus on ViTs, driven by their excellent performance\nand scalability. However, we argue that the current design of the vision\ntransformer-based UNet (ViT-UNet) segmentation models may not effectively\nhandle the heterogeneous appearance (e.g., varying shapes and sizes) of objects\nof interest in medical image segmentation tasks. To tackle this challenge, we\npresent a structured approach to introduce spatially dynamic components to the\nViT-UNet. This adaptation enables the model to effectively capture features of\ntarget objects with diverse appearances. This is achieved by three main\ncomponents: \\textbf{(i)} deformable patch embedding; \\textbf{(ii)} spatially\ndynamic multi-head attention; \\textbf{(iii)} deformable positional encoding.\nThese components were integrated into a novel architecture, termed AgileFormer.\nAgileFormer is a spatially agile ViT-UNet designed for medical image\nsegmentation. Experiments in three segmentation tasks using publicly available\ndatasets demonstrated the effectiveness of the proposed method. The code is\navailable at\n\\href{https://github.com/sotiraslab/AgileFormer}{https://github.com/sotiraslab/AgileFormer}.\n","authors":["Peijie Qiu","Jin Yang","Sayantan Kumar","Soumyendu Sekhar Ghosh","Aristeidis Sotiras"],"pdf_url":"https://arxiv.org/pdf/2404.00122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00114v1","updated":"2024-03-29T19:09:08Z","published":"2024-03-29T19:09:08Z","title":"Deepfake Sentry: Harnessing Ensemble Intelligence for Resilient\n Detection and Generalisation","summary":" Recent advancements in Generative Adversarial Networks (GANs) have enabled\nphotorealistic image generation with high quality. However, the malicious use\nof such generated media has raised concerns regarding visual misinformation.\nAlthough deepfake detection research has demonstrated high accuracy, it is\nvulnerable to advances in generation techniques and adversarial iterations on\ndetection countermeasures. 
To address this, we propose a proactive and\nsustainable deepfake training augmentation solution that introduces artificial\nfingerprints into models. We achieve this by employing an ensemble learning\napproach that incorporates a pool of autoencoders that mimic the effect of the\nartefacts introduced by the deepfake generator models. Experiments on three\ndatasets reveal that our proposed ensemble autoencoder-based data augmentation\nlearning approach offers improvements in terms of generalisation, resistance\nagainst basic data perturbations such as noise, blurring, sharpness\nenhancement, and affine transforms, resilience to commonly used lossy\ncompression algorithms such as JPEG, and enhanced resistance against\nadversarial attacks.\n","authors":["Liviu-Daniel Ştefan","Dan-Cristian Stanciu","Mihai Dogariu","Mihai Gabriel Constantin","Andrei Cosmin Jitaru","Bogdan Ionescu"],"pdf_url":"https://arxiv.org/pdf/2404.00114v1.pdf","comment":"16 pages, 1 figure, U.P.B. Sci. Bull., Series C, Vol. 85, Iss. 4,\n 2023"},{"id":"http://arxiv.org/abs/2202.03583v4","updated":"2024-03-29T18:57:25Z","published":"2022-02-08T00:43:57Z","title":"Multi-Label Classification of Thoracic Diseases using Dense\n Convolutional Network on Chest Radiographs","summary":" Traditional methods of identifying pathologies in X-ray images rely heavily\non skilled human interpretation and are often time-consuming. The advent of\ndeep learning techniques has enabled the development of automated disease\ndiagnosis systems. Still, the performance of such systems is opaque to\nend-users and limited to detecting a single pathology. In this paper, we\npropose a multi-label disease prediction model that allows the detection of\nmore than one pathology at a given test time. We use a dense convolutional\nneural network (DenseNet) for disease diagnosis. Our proposed model achieved\nthe highest AUC score of 0.896 for the condition Cardiomegaly with an accuracy\nof 0.826, while the lowest AUC score was obtained for Nodule, at 0.655 with an\naccuracy of 0.66. To build trust in decision-making, we generated heatmaps on\nX-rays to visualize the regions where the model paid attention to make certain\npredictions. Our proposed automated disease prediction model obtained highly\nconfident high-performance metrics in multi-label disease prediction tasks.\n","authors":["Dipkamal Bhusal","Sanjeeb Prasad Panday"],"pdf_url":"https://arxiv.org/pdf/2202.03583v4.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2404.00107v1","updated":"2024-03-29T18:38:59Z","published":"2024-03-29T18:38:59Z","title":"Robust Ensemble Person Re-Identification via Orthogonal Fusion with\n Occlusion Handling","summary":" Occlusion remains one of the major challenges in person reidentification\n(ReID) as a result of the diversity of poses and the variation of appearances.\nDeveloping novel architectures to improve the robustness of occlusion-aware\nperson Re-ID requires new insights, especially on low-resolution edge cameras.\nWe propose a deep ensemble model that harnesses both CNN and Transformer\narchitectures to generate robust feature representations. To achieve robust\nRe-ID without the need to manually label occluded regions, we propose to take\nan ensemble learning-based approach derived from the analogy between\narbitrarily shaped occluded regions and robust feature representation. Using\nthe orthogonality principle, our developed deep CNN model makes use of masked\nautoencoder (MAE) and global-local feature fusion for robust person\nidentification. 
Furthermore, we present a part occlusion-aware transformer\ncapable of learning feature space that is robust to occluded regions.\nExperimental results are reported on several Re-ID datasets to show the\neffectiveness of our developed ensemble model named orthogonal fusion with\nocclusion handling (OFOH). Compared to competing methods, the proposed OFOH\napproach has achieved competent rank-1 and mAP performance.\n","authors":["Syeda Nyma Ferdous","Xin Li"],"pdf_url":"https://arxiv.org/pdf/2404.00107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00103v1","updated":"2024-03-29T18:23:34Z","published":"2024-03-29T18:23:34Z","title":"PikeLPN: Mitigating Overlooked Inefficiencies of Low-Precision Neural\n Networks","summary":" Low-precision quantization is recognized for its efficacy in neural network\noptimization. Our analysis reveals that non-quantized elementwise operations\nwhich are prevalent in layers such as parameterized activation functions, batch\nnormalization, and quantization scaling dominate the inference cost of\nlow-precision models. These non-quantized elementwise operations are commonly\noverlooked in SOTA efficiency metrics such as Arithmetic Computation Effort\n(ACE). In this paper, we propose ACEv2 - an extended version of ACE which\noffers a better alignment with the inference cost of quantized models and their\nenergy consumption on ML hardware. Moreover, we introduce PikeLPN, a model that\naddresses these efficiency issues by applying quantization to both elementwise\noperations and multiply-accumulate operations. In particular, we present a\nnovel quantization technique for batch normalization layers named QuantNorm\nwhich allows for quantizing the batch normalization parameters without\ncompromising the model performance. Additionally, we propose applying Double\nQuantization where the quantization scaling parameters are quantized.\nFurthermore, we recognize and resolve the issue of distribution mismatch in\nSeparable Convolution layers by introducing Distribution-Heterogeneous\nQuantization which enables quantizing them to low-precision. PikeLPN achieves\nPareto-optimality in efficiency-accuracy trade-off with up to 3X efficiency\nimprovement compared to SOTA low-precision models.\n","authors":["Marina Neseem","Conor McCullough","Randy Hsin","Chas Leichner","Shan Li","In Suk Chong","Andrew G. Howard","Lukasz Lew","Sherief Reda","Ville-Mikko Rautio","Daniele Moro"],"pdf_url":"https://arxiv.org/pdf/2404.00103v1.pdf","comment":"Accepted in CVPR 2024. 10 Figures, 9 Tables"},{"id":"http://arxiv.org/abs/2404.00098v1","updated":"2024-03-29T18:09:11Z","published":"2024-03-29T18:09:11Z","title":"Sparse Views, Near Light: A Practical Paradigm for Uncalibrated\n Point-light Photometric Stereo","summary":" Neural approaches have shown a significant progress on camera-based\nreconstruction. But they require either a fairly dense sampling of the viewing\nsphere, or pre-training on an existing dataset, thereby limiting their\ngeneralizability. In contrast, photometric stereo (PS) approaches have shown\ngreat potential for achieving high-quality reconstruction under sparse\nviewpoints. Yet, they are impractical because they typically require tedious\nlaboratory conditions, are restricted to dark rooms, and often multi-staged,\nmaking them subject to accumulated errors. To address these shortcomings, we\npropose an end-to-end uncalibrated multi-view PS framework for reconstructing\nhigh-resolution shapes acquired from sparse viewpoints in a real-world\nenvironment. 
We relax the dark room assumption, and allow a combination of\nstatic ambient lighting and dynamic near LED lighting, thereby enabling easy\ndata capture outside the lab. Experimental validation confirms that it\noutperforms existing baseline approaches in the regime of sparse viewpoints by\na large margin. This allows to bring high-accuracy 3D reconstruction from the\ndark room to the real world, while maintaining a reasonable data capture\ncomplexity.\n","authors":["Mohammed Brahimi","Bjoern Haefner","Zhenzhang Ye","Bastian Goldluecke","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2404.00098v1.pdf","comment":"Accepted in CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00086v1","updated":"2024-03-29T17:58:50Z","published":"2024-03-29T17:58:50Z","title":"DVIS-DAQ: Improving Video Segmentation via Dynamic Anchor Queries","summary":" Modern video segmentation methods adopt object queries to perform inter-frame\nassociation and demonstrate satisfactory performance in tracking continuously\nappearing objects despite large-scale motion and transient occlusion.\n However, they all underperform on newly emerging and disappearing objects\nthat are common in the real world because they attempt to model object\nemergence and disappearance through feature transitions between background and\nforeground queries that have significant feature gaps. We introduce Dynamic\nAnchor Queries (DAQ) to shorten the transition gap between the anchor and\ntarget queries by dynamically generating anchor queries based on the features\nof potential candidates.\n Furthermore, we introduce a query-level object Emergence and Disappearance\nSimulation (EDS) strategy, which unleashes DAQ's potential without any\nadditional cost.\n Finally, we combine our proposed DAQ and EDS with DVIS~\\cite{zhang2023dvis}\nto obtain DVIS-DAQ.\n Extensive experiments demonstrate that DVIS-DAQ achieves a new\nstate-of-the-art (SOTA) performance on five mainstream video segmentation\nbenchmarks. Code and models are available at\n\\url{https://github.com/SkyworkAI/DAQ-VS}.\n","authors":["Yikang Zhou","Tao Zhang","Shunping JI","Shuicheng Yan","Xiangtai Li"],"pdf_url":"https://arxiv.org/pdf/2404.00086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01330v1","updated":"2024-03-29T15:27:28Z","published":"2024-03-29T15:27:28Z","title":"Holo-VQVAE: VQ-VAE for phase-only holograms","summary":" Holography stands at the forefront of visual technology innovation, offering\nimmersive, three-dimensional visualizations through the manipulation of light\nwave amplitude and phase. Contemporary research in hologram generation has\npredominantly focused on image-to-hologram conversion, producing holograms from\nexisting images. These approaches, while effective, inherently limit the scope\nof innovation and creativity in hologram generation. In response to this\nlimitation, we present Holo-VQVAE, a novel generative framework tailored for\nphase-only holograms (POHs). Holo-VQVAE leverages the architecture of Vector\nQuantized Variational AutoEncoders, enabling it to learn the complex\ndistributions of POHs. Furthermore, it integrates the Angular Spectrum Method\ninto the training process, facilitating learning in the image domain. This\nframework allows for the generation of unseen, diverse holographic content\ndirectly from its intricately learned latent space without requiring\npre-existing images. 
This pioneering work paves the way for groundbreaking\napplications and methodologies in holographic content creation, opening a new\nera in the exploration of holographic content.\n","authors":["Joohyun Park","Hyeongyeop Kang"],"pdf_url":"https://arxiv.org/pdf/2404.01330v1.pdf","comment":null}]},"2024-04-01T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2403.17701v2","updated":"2024-04-01T02:31:10Z","published":"2024-03-26T13:40:18Z","title":"Rotate to Scan: UNet-like Mamba with Triplet SSM Module for Medical\n Image Segmentation","summary":" Image segmentation holds a vital position in the realms of diagnosis and\ntreatment within the medical domain. Traditional convolutional neural networks\n(CNNs) and Transformer models have made significant advancements in this realm,\nbut they still encounter challenges because of limited receptive field or high\ncomputing complexity. Recently, State Space Models (SSMs), particularly Mamba\nand its variants, have demonstrated notable performance in the field of vision.\nHowever, their feature extraction methods may not be sufficiently effective and\nretain some redundant structures, leaving room for parameter reduction.\nMotivated by previous spatial and channel attention methods, we propose Triplet\nMamba-UNet. The method leverages residual VSS Blocks to extract intensive\ncontextual features, while Triplet SSM is employed to fuse features across\nspatial and channel dimensions. We conducted experiments on ISIC17, ISIC18,\nCVC-300, CVC-ClinicDB, Kvasir-SEG, CVC-ColonDB, and Kvasir-Instrument datasets,\ndemonstrating the superior segmentation performance of our proposed TM-UNet.\nAdditionally, compared to the previous VM-UNet, our model achieves a one-third\nreduction in parameters.\n","authors":["Hao Tang","Lianglun Cheng","Guoheng Huang","Zhengguang Tan","Junhao Lu","Kaihong Wu"],"pdf_url":"https://arxiv.org/pdf/2403.17701v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18933v2","updated":"2024-04-01T02:06:07Z","published":"2024-02-29T08:01:31Z","title":"Modality-Agnostic Structural Image Representation Learning for\n Deformable Multi-Modality Medical Image Registration","summary":" Establishing dense anatomical correspondence across distinct imaging\nmodalities is a foundational yet challenging procedure for numerous medical\nimage analysis studies and image-guided radiotherapy. Existing multi-modality\nimage registration algorithms rely on statistical-based similarity measures or\nlocal structural image representations. However, the former is sensitive to\nlocally varying noise, while the latter is not discriminative enough to cope\nwith complex anatomical structures in multimodal scans, causing ambiguity in\ndetermining the anatomical correspondence across scans with different\nmodalities. In this paper, we propose a modality-agnostic structural\nrepresentation learning method, which leverages Deep Neighbourhood\nSelf-similarity (DNS) and anatomy-aware contrastive learning to learn\ndiscriminative and contrast-invariance deep structural image representations\n(DSIR) without the need for anatomical delineations or pre-aligned training\nimages. We evaluate our method on multiphase CT, abdomen MR-CT, and brain MR\nT1w-T2w registration. Comprehensive results demonstrate that our method is\nsuperior to the conventional local structural representation and\nstatistical-based similarity measures in terms of discriminability and\naccuracy.\n","authors":["Tony C. W. 
Mok","Zi Li","Yunhao Bai","Jianpeng Zhang","Wei Liu","Yan-Jie Zhou","Ke Yan","Dakai Jin","Yu Shi","Xiaoli Yin","Le Lu","Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.18933v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.19898v2","updated":"2024-04-01T01:27:14Z","published":"2024-03-29T00:40:12Z","title":"Structure Matters: Tackling the Semantic Discrepancy in Diffusion Models\n for Image Inpainting","summary":" Denoising diffusion probabilistic models for image inpainting aim to add the\nnoise to the texture of image during the forward process and recover masked\nregions with unmasked ones of the texture via the reverse denoising\nprocess.Despite the meaningful semantics generation,the existing arts suffer\nfrom the semantic discrepancy between masked and unmasked regions, since the\nsemantically dense unmasked texture fails to be completely degraded while the\nmasked regions turn to the pure noise in diffusion process,leading to the large\ndiscrepancy between them. In this paper,we aim to answer how unmasked semantics\nguide texture denoising process;together with how to tackle the semantic\ndiscrepancy,to facilitate the consistent and meaningful semantics generation.\nTo this end,we propose a novel structure-guided diffusion model named\nStrDiffusion,to reformulate the conventional texture denoising process under\nstructure guidance to derive a simplified denoising objective for image\ninpainting,while revealing:1)the semantically sparse structure is beneficial to\ntackle semantic discrepancy in early stage, while dense texture generates\nreasonable semantics in late stage;2)the semantics from unmasked regions\nessentially offer the time-dependent structure guidance for the texture\ndenoising process,benefiting from the time-dependent sparsity of the structure\nsemantics.For the denoising process,a structure-guided neural network is\ntrained to estimate the simplified denoising objective by exploiting the\nconsistency of the denoised structure between masked and unmasked\nregions.Besides,we devise an adaptive resampling strategy as a formal criterion\nas whether structure is competent to guide the texture denoising process,while\nregulate their semantic correlations.Extensive experiments validate the merits\nof StrDiffusion over the state-of-the-arts.Our code is available at\nhttps://github.com/htyjers/StrDiffusion.\n","authors":["Haipeng Liu","Yang Wang","Biao Qian","Meng Wang","Yong Rui"],"pdf_url":"https://arxiv.org/pdf/2403.19898v2.pdf","comment":"15 pages, 10 figures, to appear CVPR 2024"},{"id":"http://arxiv.org/abs/2401.13964v3","updated":"2024-04-01T01:26:12Z","published":"2024-01-25T05:55:03Z","title":"An Extensible Framework for Open Heterogeneous Collaborative Perception","summary":" Collaborative perception aims to mitigate the limitations of single-agent\nperception, such as occlusions, by facilitating data exchange among multiple\nagents. However, most current works consider a homogeneous scenario where all\nagents use identity sensors and perception models. In reality, heterogeneous\nagent types may continually emerge and inevitably face a domain gap when\ncollaborating with existing agents. In this paper, we introduce a new open\nheterogeneous problem: how to accommodate continually emerging new\nheterogeneous agent types into collaborative perception, while ensuring high\nperception performance and low integration cost? 
To address this problem, we\npropose HEterogeneous ALliance (HEAL), a novel extensible collaborative\nperception framework. HEAL first establishes a unified feature space with\ninitial agents via a novel multi-scale foreground-aware Pyramid Fusion network.\nWhen heterogeneous new agents emerge with previously unseen modalities or\nmodels, we align them to the established unified space with an innovative\nbackward alignment. This step only involves individual training on the new\nagent type, thus presenting extremely low training costs and high\nextensibility. To enrich agents' data heterogeneity, we bring OPV2V-H, a new\nlarge-scale dataset with more diverse sensor types. Extensive experiments on\nOPV2V-H and DAIR-V2X datasets show that HEAL surpasses SOTA methods in\nperformance while reducing the training parameters by 91.5% when integrating 3\nnew agent types. We further implement a comprehensive codebase at:\nhttps://github.com/yifanlu0227/HEAL\n","authors":["Yifan Lu","Yue Hu","Yiqi Zhong","Dequan Wang","Yanfeng Wang","Siheng Chen"],"pdf_url":"https://arxiv.org/pdf/2401.13964v3.pdf","comment":"Accepted by ICLR 2024. The code and data are open-sourced at\n https://github.com/yifanlu0227/HEAL"},{"id":"http://arxiv.org/abs/2403.12686v2","updated":"2024-04-01T01:23:16Z","published":"2024-03-19T12:45:18Z","title":"WaterVG: Waterway Visual Grounding based on Text-Guided Vision and\n mmWave Radar","summary":" The perception of waterways based on human intent is significant for\nautonomous navigation and operations of Unmanned Surface Vehicles (USVs) in\nwater environments. Inspired by visual grounding, we introduce WaterVG, the\nfirst visual grounding dataset designed for USV-based waterway perception based\non human prompts. WaterVG encompasses prompts describing multiple targets, with\nannotations at the instance level including bounding boxes and masks. Notably,\nWaterVG includes 11,568 samples with 34,987 referred targets, whose prompts\nintegrates both visual and radar characteristics. The pattern of text-guided\ntwo sensors equips a finer granularity of text prompts with visual and radar\nfeatures of referred targets. Moreover, we propose a low-power visual grounding\nmodel, Potamoi, which is a multi-task model with a well-designed Phased\nHeterogeneous Modality Fusion (PHMF) mode, including Adaptive Radar Weighting\n(ARW) and Multi-Head Slim Cross Attention (MHSCA). Exactly, ARW extracts\nrequired radar features to fuse with vision for prompt alignment. MHSCA is an\nefficient fusion module with a remarkably small parameter count and FLOPs,\nelegantly fusing scenario context captured by two sensors with linguistic\nfeatures, which performs expressively on visual grounding tasks. Comprehensive\nexperiments and evaluations have been conducted on WaterVG, where our Potamoi\narchives state-of-the-art performances compared with counterparts.\n","authors":["Runwei Guan","Liye Jia","Fengyufan Yang","Shanliang Yao","Erick Purwanto","Xiaohui Zhu","Eng Gee Lim","Jeremy Smith","Ka Lok Man","Xuming Hu","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2403.12686v2.pdf","comment":"10 pages, 10 figures"},{"id":"http://arxiv.org/abs/2312.15130v2","updated":"2024-04-01T00:22:18Z","published":"2023-12-23T01:38:41Z","title":"PACE: A Large-Scale Dataset with Pose Annotations in Cluttered\n Environments","summary":" Pose estimation is a crucial task in computer vision and robotics, enabling\nthe tracking and manipulation of objects in images or videos. 
While several\ndatasets exist for pose estimation, there is a lack of large-scale datasets\nspecifically focusing on cluttered scenes with occlusions. We introduce PACE\n(Pose Annotations in Cluttered Environments), a large-scale benchmark designed\nto advance the development and evaluation of pose estimation methods in\ncluttered scenarios. PACE consists of 54,945 frames with 257,673 annotations\nacross 300 videos, covering 576 objects from 44 categories and featuring a mix\nof rigid and articulated items in cluttered scenes. To annotate the real-world\ndata efficiently, we developed an innovative annotation system utilizing a\ncalibrated 3-camera setup. We test state-of-the-art algorithms in PACE along\ntwo tracks: pose estimation, and object pose tracking, revealing the\nbenchmark's challenges and research opportunities. Our code and data is\navailable on https://github.com/qq456cvb/PACE.\n","authors":["Yang You","Kai Xiong","Zhening Yang","Zhengxiang Huang","Junwei Zhou","Ruoxi Shi","Zhou Fang","Adam W. Harley","Leonidas Guibas","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2312.15130v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20271v2","updated":"2024-04-01T03:25:30Z","published":"2024-03-29T16:26:20Z","title":"Draw-and-Understand: Leveraging Visual Prompts to Enable MLLMs to\n Comprehend What You Want","summary":" The interaction between humans and artificial intelligence (AI) is a crucial\nfactor that reflects the effectiveness of multimodal large language models\n(MLLMs). However, current MLLMs primarily focus on image-level comprehension\nand limit interaction to textual instructions, thereby constraining their\nflexibility in usage and depth of response. In this paper, we introduce the\nDraw-and-Understand project: a new model, a multi-domain dataset, and a\nchallenging benchmark for visual prompting. Specifically, we propose SPHINX-V,\na new end-to-end trained Multimodal Large Language Model (MLLM) that connects a\nvision encoder, a visual prompt encoder and an LLM for various visual prompts\n(points, bounding boxes, and free-form shape) and language understanding. To\nadvance visual prompting research for MLLMs, we introduce MDVP-Data and\nMDVP-Bench. MDVP-Data features a multi-domain dataset containing 1.6M unique\nimage-visual prompt-text instruction-following samples, including natural\nimages, document images, OCR images, mobile screenshots, web screenshots, and\nmulti-panel images. Furthermore, we present MDVP-Bench, a comprehensive and\nchallenging benchmark to assess a model's capability in understanding visual\nprompting instructions. Our experiments demonstrate SPHINX-V's impressive\nmultimodal interaction capabilities through visual prompting, revealing\nsignificant improvements in detailed pixel-level description and\nquestion-answering abilities.\n","authors":["Weifeng Lin","Xinyu Wei","Ruichuan An","Peng Gao","Bocheng Zou","Yulin Luo","Siyuan Huang","Shanghang Zhang","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2403.20271v2.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.19435v3","updated":"2024-04-01T13:02:20Z","published":"2024-03-28T14:04:17Z","title":"BAMM: Bidirectional Autoregressive Motion Model","summary":" Generating human motion from text has been dominated by denoising motion\nmodels either through diffusion or generative masking process. However, these\nmodels face great limitations in usability by requiring prior knowledge of the\nmotion length. 
Conversely, autoregressive motion models address this limitation\nby adaptively predicting motion endpoints, at the cost of degraded generation\nquality and editing capabilities. To address these challenges, we propose\nBidirectional Autoregressive Motion Model (BAMM), a novel text-to-motion\ngeneration framework. BAMM consists of two key components: (1) a motion\ntokenizer that transforms 3D human motion into discrete tokens in latent space,\nand (2) a masked self-attention transformer that autoregressively predicts\nrandomly masked tokens via a hybrid attention masking strategy. By unifying\ngenerative masked modeling and autoregressive modeling, BAMM captures rich and\nbidirectional dependencies among motion tokens, while learning the\nprobabilistic mapping from textual inputs to motion outputs with\ndynamically-adjusted motion sequence length. This feature enables BAMM to\nsimultaneously achieving high-quality motion generation with enhanced usability\nand built-in motion editability. Extensive experiments on HumanML3D and KIT-ML\ndatasets demonstrate that BAMM surpasses current state-of-the-art methods in\nboth qualitative and quantitative measures. Our project page is available at\nhttps://exitudio.github.io/BAMM-page\n","authors":["Ekkasit Pinyoanuntapong","Muhammad Usama Saleem","Pu Wang","Minwoo Lee","Srijan Das","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2403.19435v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19926v2","updated":"2024-04-01T08:52:20Z","published":"2024-03-29T02:26:22Z","title":"Video-Based Human Pose Regression via Decoupled Space-Time Aggregation","summary":" By leveraging temporal dependency in video sequences, multi-frame human pose\nestimation algorithms have demonstrated remarkable results in complicated\nsituations, such as occlusion, motion blur, and video defocus. These algorithms\nare predominantly based on heatmaps, resulting in high computation and storage\nrequirements per frame, which limits their flexibility and real-time\napplication in video scenarios, particularly on edge devices. In this paper, we\ndevelop an efficient and effective video-based human pose regression method,\nwhich bypasses intermediate representations such as heatmaps and instead\ndirectly maps the input to the output joint coordinates. Despite the inherent\nspatial correlation among adjacent joints of the human pose, the temporal\ntrajectory of each individual joint exhibits relative independence. In light of\nthis, we propose a novel Decoupled Space-Time Aggregation network (DSTA) to\nseparately capture the spatial contexts between adjacent joints and the\ntemporal cues of each individual joint, thereby avoiding the conflation of\nspatiotemporal dimensions. Concretely, DSTA learns a dedicated feature token\nfor each joint to facilitate the modeling of their spatiotemporal dependencies.\nWith the proposed joint-wise local-awareness attention mechanism, our method is\ncapable of efficiently and flexibly utilizing the spatial dependency of\nadjacent joints and the temporal dependency of each joint itself. Extensive\nexperiments demonstrate the superiority of our method. 
Compared to previous\nregression-based single-frame human pose estimation methods, DSTA significantly\nenhances performance, achieving an 8.9 mAP improvement on PoseTrack2017.\nFurthermore, our approach either surpasses or is on par with the\nstate-of-the-art heatmap-based multi-frame human pose estimation methods.\nProject page: https://github.com/zgspose/DSTA.\n","authors":["Jijie He","Wenwu Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19926v2.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.17076v3","updated":"2024-04-01T03:17:09Z","published":"2023-11-27T22:23:27Z","title":"Compositional Chain-of-Thought Prompting for Large Multimodal Models","summary":" The combination of strong visual backbones and Large Language Model (LLM)\nreasoning has led to Large Multimodal Models (LMMs) becoming the current\nstandard for a wide range of vision and language (VL) tasks. However, recent\nresearch has shown that even the most advanced LMMs still struggle to capture\naspects of compositional visual reasoning, such as attributes and relationships\nbetween objects. One solution is to utilize scene graphs (SGs)--a formalization\nof objects and their relations and attributes that has been extensively used as\na bridge between the visual and textual domains. Yet, scene graph data requires\nscene graph annotations, which are expensive to collect and thus not easily\nscalable. Moreover, finetuning an LMM based on SG data can lead to catastrophic\nforgetting of the pretraining objective. To overcome this, inspired by\nchain-of-thought methods, we propose Compositional Chain-of-Thought (CCoT), a\nnovel zero-shot Chain-of-Thought prompting method that utilizes SG\nrepresentations in order to extract compositional knowledge from an LMM.\nSpecifically, we first generate an SG using the LMM, and then use that SG in\nthe prompt to produce a response. Through extensive experiments, we find that\nthe proposed CCoT approach not only improves LMM performance on several vision\nand language VL compositional benchmarks but also improves the performance of\nseveral popular LMMs on general multimodal benchmarks, without the need for\nfine-tuning or annotated ground-truth SGs. Code:\nhttps://github.com/chancharikmitra/CCoT\n","authors":["Chancharik Mitra","Brandon Huang","Trevor Darrell","Roei Herzig"],"pdf_url":"https://arxiv.org/pdf/2311.17076v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19797v2","updated":"2024-04-01T02:57:07Z","published":"2024-03-28T19:25:25Z","title":"Efficient 3D Instance Mapping and Localization with Neural Fields","summary":" We tackle the problem of learning an implicit scene representation for 3D\ninstance segmentation from a sequence of posed RGB images. Towards this, we\nintroduce 3DIML, a novel framework that efficiently learns a label field that\nmay be rendered from novel viewpoints to produce view-consistent instance\nsegmentation masks. 3DIML significantly improves upon training and inference\nruntimes of existing implicit scene representation based methods. Opposed to\nprior art that optimizes a neural field in a self-supervised manner, requiring\ncomplicated training procedures and loss function design, 3DIML leverages a\ntwo-phase process. The first phase, InstanceMap, takes as input 2D segmentation\nmasks of the image sequence generated by a frontend instance segmentation\nmodel, and associates corresponding masks across images to 3D labels. 
These\nalmost view-consistent pseudolabel masks are then used in the second phase,\nInstanceLift, to supervise the training of a neural label field, which\ninterpolates regions missed by InstanceMap and resolves ambiguities.\nAdditionally, we introduce InstanceLoc, which enables near realtime\nlocalization of instance masks given a trained label field and an off-the-shelf\nimage segmentation model by fusing outputs from both. We evaluate 3DIML on\nsequences from the Replica and ScanNet datasets and demonstrate 3DIML's\neffectiveness under mild assumptions for the image sequences. We achieve a\nlarge practical speedup over existing implicit scene representation methods\nwith comparable quality, showcasing its potential to facilitate faster and more\neffective 3D scene understanding.\n","authors":["George Tang","Krishna Murthy Jatavallabhula","Antonio Torralba"],"pdf_url":"https://arxiv.org/pdf/2403.19797v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19646v2","updated":"2024-04-01T08:00:56Z","published":"2024-03-28T17:55:42Z","title":"Change-Agent: Towards Interactive Comprehensive Remote Sensing Change\n Interpretation and Analysis","summary":" Monitoring changes in the Earth's surface is crucial for understanding\nnatural processes and human impacts, necessitating precise and comprehensive\ninterpretation methodologies. Remote sensing satellite imagery offers a unique\nperspective for monitoring these changes, leading to the emergence of remote\nsensing image change interpretation (RSICI) as a significant research focus.\nCurrent RSICI technology encompasses change detection and change captioning,\neach with its limitations in providing comprehensive interpretation. To address\nthis, we propose an interactive Change-Agent, which can follow user\ninstructions to achieve comprehensive change interpretation and insightful\nanalysis according to user instructions, such as change detection and change\ncaptioning, change object counting, change cause analysis, etc. The\nChange-Agent integrates a multi-level change interpretation (MCI) model as the\neyes and a large language model (LLM) as the brain. The MCI model contains two\nbranches of pixel-level change detection and semantic-level change captioning,\nin which multiple BI-temporal Iterative Interaction (BI3) layers utilize Local\nPerception Enhancement (LPE) and the Global Difference Fusion Attention (GDFA)\nmodules to enhance the model's discriminative feature representation\ncapabilities. To support the training of the MCI model, we build the LEVIR-MCI\ndataset with a large number of change masks and captions of changes. Extensive\nexperiments demonstrate the effectiveness of the proposed MCI model and\nhighlight the promising potential of our Change-Agent in facilitating\ncomprehensive and intelligent interpretation of surface changes. To facilitate\nfuture research, we will make our dataset and codebase of the MCI model and\nChange-Agent publicly available at\nhttps://github.com/Chen-Yang-Liu/Change-Agent\n","authors":["Chenyang Liu","Keyan Chen","Haotian Zhang","Zipeng Qi","Zhengxia Zou","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2403.19646v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05369v5","updated":"2024-04-01T07:26:06Z","published":"2024-03-08T15:00:44Z","title":"Frequency-Adaptive Dilated Convolution for Semantic Segmentation","summary":" Dilated convolution, which expands the receptive field by inserting gaps\nbetween its consecutive elements, is widely employed in computer vision. 
In\nthis study, we propose three strategies to improve individual phases of dilated\nconvolution from the view of spectrum analysis. Departing from the conventional\npractice of fixing a global dilation rate as a hyperparameter, we introduce\nFrequency-Adaptive Dilated Convolution (FADC), which dynamically adjusts\ndilation rates spatially based on local frequency components. Subsequently, we\ndesign two plug-in modules to directly enhance effective bandwidth and\nreceptive field size. The Adaptive Kernel (AdaKern) module decomposes\nconvolution weights into low-frequency and high-frequency components,\ndynamically adjusting the ratio between these components on a per-channel\nbasis. By increasing the high-frequency part of convolution weights, AdaKern\ncaptures more high-frequency components, thereby improving effective bandwidth.\nThe Frequency Selection (FreqSelect) module optimally balances high- and\nlow-frequency components in feature representations through spatially variant\nreweighting. It suppresses high frequencies in the background to encourage FADC\nto learn a larger dilation, thereby increasing the receptive field for an\nexpanded scope. Extensive experiments on segmentation and object detection\nconsistently validate the efficacy of our approach. The code is publicly\navailable at https://github.com/Linwei-Chen/FADC.\n","authors":["Linwei Chen","Lin Gu","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2403.05369v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10030v3","updated":"2024-04-01T05:22:52Z","published":"2024-03-15T05:30:29Z","title":"Multi-criteria Token Fusion with One-step-ahead Attention for Efficient\n Vision Transformers","summary":" Vision Transformer (ViT) has emerged as a prominent backbone for computer\nvision. For more efficient ViTs, recent works lessen the quadratic cost of the\nself-attention layer by pruning or fusing the redundant tokens. However, these\nworks faced the speed-accuracy trade-off caused by the loss of information.\nHere, we argue that token fusion needs to consider diverse relations between\ntokens to minimize information loss. In this paper, we propose a Multi-criteria\nToken Fusion (MCTF), that gradually fuses the tokens based on multi-criteria\n(e.g., similarity, informativeness, and size of fused tokens). Further, we\nutilize the one-step-ahead attention, which is the improved approach to capture\nthe informativeness of the tokens. By training the model equipped with MCTF\nusing a token reduction consistency, we achieve the best speed-accuracy\ntrade-off in the image classification (ImageNet1K). Experimental results prove\nthat MCTF consistently surpasses the previous reduction methods with and\nwithout training. Specifically, DeiT-T and DeiT-S with MCTF reduce FLOPs by\nabout 44% while improving the performance (+0.5%, and +0.3%) over the base\nmodel, respectively. We also demonstrate the applicability of MCTF in various\nVision Transformers (e.g., T2T-ViT, LV-ViT), achieving at least 31% speedup\nwithout performance degradation. Code is available at\nhttps://github.com/mlvlab/MCTF.\n","authors":["Sanghyeok Lee","Joonmyung Choi","Hyunwoo J. 
Kim"],"pdf_url":"https://arxiv.org/pdf/2403.10030v3.pdf","comment":"Conference on Computer Vision and Pattern Recognition (CVPR), 2024"},{"id":"http://arxiv.org/abs/2403.17639v2","updated":"2024-04-01T14:15:51Z","published":"2024-03-26T12:21:47Z","title":"High-Resolution Image Translation Model Based on Grayscale Redefinition","summary":" Image-to-image translation is a technique that focuses on transferring images\nfrom one domain to another while maintaining the essential content\nrepresentations. In recent years, image-to-image translation has gained\nsignificant attention and achieved remarkable advancements due to its diverse\napplications in computer vision and image processing tasks. In this work, we\npropose an innovative method for image translation between different domains.\nFor high-resolution image translation tasks, we use a grayscale adjustment\nmethod to achieve pixel-level translation. For other tasks, we utilize the\nPix2PixHD model with a coarse-to-fine generator, multi-scale discriminator, and\nimproved loss to enhance the image translation performance. On the other hand,\nto tackle the issue of sparse training data, we adopt model weight\ninitialization from other task to optimize the performance of the current task.\n","authors":["Xixian Wu","Dian Chao","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2403.17639v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15388v3","updated":"2024-04-01T14:08:06Z","published":"2024-03-22T17:59:52Z","title":"LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal\n Models","summary":" Large Multimodal Models (LMMs) have shown significant reasoning capabilities\nby connecting a visual encoder and a large language model. LMMs typically use a\nfixed amount of visual tokens, such as the penultimate layer features in the\nCLIP visual encoder, as the prefix content. Recent LMMs incorporate more\ncomplex visual inputs, such as high-resolution images and videos, which\nincrease the number of visual tokens significantly. However, due to the design\nof the Transformer architecture, computational costs associated with these\nmodels tend to increase quadratically with the number of input tokens. To\ntackle this problem, we explore a token reduction mechanism and find, similar\nto prior work, that many visual tokens are spatially redundant. Based on this,\nwe propose PruMerge, a novel adaptive visual token reduction approach, which\nlargely reduces the number of visual tokens while maintaining comparable model\nperformance. We first select the unpruned visual tokens based on their\nsimilarity to class tokens and spatial tokens. We then cluster the pruned\ntokens based on key similarity and merge the clustered tokens with the unpruned\ntokens to supplement their information. Empirically, when applied to LLaVA-1.5,\nour approach can compress the visual tokens by 18 times on average, and achieve\ncomparable performance across diverse visual question-answering and reasoning\ntasks. Code and checkpoints are at https://llava-prumerge.github.io/.\n","authors":["Yuzhang Shang","Mu Cai","Bingxin Xu","Yong Jae Lee","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2403.15388v3.pdf","comment":"Project page: https://llava-prumerge.github.io/"},{"id":"http://arxiv.org/abs/2404.01524v1","updated":"2024-04-01T23:11:15Z","published":"2024-04-01T23:11:15Z","title":"On Train-Test Class Overlap and Detection for Image Retrieval","summary":" How important is it for training and evaluation sets to not have class\noverlap in image retrieval? 
We revisit Google Landmarks v2 clean, the most\npopular training set, by identifying and removing class overlap with Revisited\nOxford and Paris [34], the most popular evaluation set. By comparing the\noriginal and the new RGLDv2-clean on a benchmark of reproduced state-of-the-art\nmethods, our findings are striking. Not only is there a dramatic drop in\nperformance, but it is inconsistent across methods, changing the ranking.What\ndoes it take to focus on objects or interest and ignore background clutter when\nindexing? Do we need to train an object detector and the representation\nseparately? Do we need location supervision? We introduce Single-stage\nDetect-to-Retrieve (CiDeR), an end-to-end, single-stage pipeline to detect\nobjects of interest and extract a global image representation. We outperform\nprevious state-of-the-art on both existing training sets and the new\nRGLDv2-clean. Our dataset is available at\nhttps://github.com/dealicious-inc/RGLDv2-clean.\n","authors":["Chull Hwan Song","Jooyoung Yoon","Taebaek Hwang","Shunghyun Choi","Yeong Hyeon Gu","Yannis Avrithis"],"pdf_url":"https://arxiv.org/pdf/2404.01524v1.pdf","comment":"CVPR2024 Accepted"},{"id":"http://arxiv.org/abs/2404.01518v1","updated":"2024-04-01T22:53:47Z","published":"2024-04-01T22:53:47Z","title":"Temporally Consistent Unbalanced Optimal Transport for Unsupervised\n Action Segmentation","summary":" We propose a novel approach to the action segmentation task for long,\nuntrimmed videos, based on solving an optimal transport problem. By encoding a\ntemporal consistency prior into a Gromov-Wasserstein problem, we are able to\ndecode a temporally consistent segmentation from a noisy affinity/matching cost\nmatrix between video frames and action classes. Unlike previous approaches, our\nmethod does not require knowing the action order for a video to attain temporal\nconsistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can\nbe efficiently solved on GPUs using a few iterations of projected mirror\ndescent. We demonstrate the effectiveness of our method in an unsupervised\nlearning setting, where our method is used to generate pseudo-labels for\nself-training. We evaluate our segmentation approach and unsupervised learning\npipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly\ndatasets, yielding state-of-the-art results for the unsupervised video action\nsegmentation task.\n","authors":["Ming Xu","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2404.01518v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19837v2","updated":"2024-04-01T22:34:37Z","published":"2024-03-28T21:15:38Z","title":"Concept-based Analysis of Neural Networks via Vision-Language Models","summary":" The analysis of vision-based deep neural networks (DNNs) is highly desirable\nbut it is very challenging due to the difficulty of expressing formal\nspecifications for vision tasks and the lack of efficient verification\nprocedures. In this paper, we propose to leverage emerging multimodal,\nvision-language, foundation models (VLMs) as a lens through which we can reason\nabout vision models. VLMs have been trained on a large body of images\naccompanied by their textual description, and are thus implicitly aware of\nhigh-level, human-understandable concepts describing the images. We describe a\nlogical specification language $\\texttt{Con}_{\\texttt{spec}}$ designed to\nfacilitate writing specifications in terms of these concepts. 
To define and\nformally check $\\texttt{Con}_{\\texttt{spec}}$ specifications, we build a map\nbetween the internal representations of a given vision model and a VLM, leading\nto an efficient verification procedure of natural-language properties for\nvision models. We demonstrate our techniques on a ResNet-based classifier\ntrained on the RIVAL-10 dataset using CLIP as the multimodal model.\n","authors":["Ravi Mangal","Nina Narodytska","Divya Gopinath","Boyue Caroline Hu","Anirban Roy","Susmit Jha","Corina Pasareanu"],"pdf_url":"https://arxiv.org/pdf/2403.19837v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01509v1","updated":"2024-04-01T22:25:48Z","published":"2024-04-01T22:25:48Z","title":"Can Biases in ImageNet Models Explain Generalization?","summary":" The robust generalization of models to rare, in-distribution (ID) samples\ndrawn from the long tail of the training distribution and to\nout-of-training-distribution (OOD) samples is one of the major challenges of\ncurrent deep learning methods. For image classification, this manifests in the\nexistence of adversarial attacks, the performance drops on distorted images,\nand a lack of generalization to concepts such as sketches. The current\nunderstanding of generalization in neural networks is very limited, but some\nbiases that differentiate models from human vision have been identified and\nmight be causing these limitations. Consequently, several attempts with varying\nsuccess have been made to reduce these biases during training to improve\ngeneralization. We take a step back and sanity-check these attempts. Fixing the\narchitecture to the well-established ResNet-50, we perform a large-scale study\non 48 ImageNet models obtained via different training methods to understand how\nand if these biases - including shape bias, spectral biases, and critical bands\n- interact with generalization. Our extensive study results reveal that\ncontrary to previous findings, these biases are insufficient to accurately\npredict the generalization of a model holistically. We provide access to all\ncheckpoints and evaluation code at\nhttps://github.com/paulgavrikov/biases_vs_generalization\n","authors":["Paul Gavrikov","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2404.01509v1.pdf","comment":"Accepted at CVPR2024"},{"id":"http://arxiv.org/abs/2401.02402v2","updated":"2024-04-01T22:21:00Z","published":"2024-01-04T18:39:32Z","title":"3D Open-Vocabulary Panoptic Segmentation with 2D-3D Vision-Language\n Distillation","summary":" 3D panoptic segmentation is a challenging perception task, especially in\nautonomous driving. It aims to predict both semantic and instance annotations\nfor 3D points in a scene. Although prior 3D panoptic segmentation approaches\nhave achieved great performance on closed-set benchmarks, generalizing these\napproaches to unseen things and unseen stuff categories remains an open\nproblem. For unseen object categories, 2D open-vocabulary segmentation has\nachieved promising results that solely rely on frozen CLIP backbones and\nensembling multiple classification outputs. However, we find that simply\nextending these 2D models to 3D does not guarantee good performance due to poor\nper-mask classification quality, especially for novel stuff categories. In this\npaper, we propose the first method to tackle 3D open-vocabulary panoptic\nsegmentation. 
Our model takes advantage of the fusion between learnable LiDAR\nfeatures and dense frozen vision CLIP features, using a single classification\nhead to make predictions for both base and novel classes. To further improve\nthe classification performance on novel classes and leverage the CLIP model, we\npropose two novel loss functions: object-level distillation loss and\nvoxel-level distillation loss. Our experiments on the nuScenes and\nSemanticKITTI datasets show that our method outperforms the strong baseline by\na large margin.\n","authors":["Zihao Xiao","Longlong Jing","Shangxuan Wu","Alex Zihao Zhu","Jingwei Ji","Chiyu Max Jiang","Wei-Chih Hung","Thomas Funkhouser","Weicheng Kuo","Anelia Angelova","Yin Zhou","Shiwei Sheng"],"pdf_url":"https://arxiv.org/pdf/2401.02402v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01501v1","updated":"2024-04-01T21:49:05Z","published":"2024-04-01T21:49:05Z","title":"MosquitoFusion: A Multiclass Dataset for Real-Time Detection of\n Mosquitoes, Swarms, and Breeding Sites Using Deep Learning","summary":" In this paper, we present an integrated approach to real-time mosquito\ndetection using our multiclass dataset (MosquitoFusion) containing 1204 diverse\nimages and leverage cutting-edge technologies, specifically computer vision, to\nautomate the identification of Mosquitoes, Swarms, and Breeding Sites. The\npre-trained YOLOv8 model, trained on this dataset, achieved a mean Average\nPrecision (mAP@50) of 57.1%, with precision at 73.4% and recall at 50.5%. The\nintegration of Geographic Information Systems (GIS) further enriches the depth\nof our analysis, providing valuable insights into spatial patterns. The dataset\nand code are available at https://github.com/faiyazabdullah/MosquitoFusion.\n","authors":["Md. Faiyaz Abdullah Sayeedi","Fahim Hafiz","Md Ashiqur Rahman"],"pdf_url":"https://arxiv.org/pdf/2404.01501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01492v1","updated":"2024-04-01T21:28:50Z","published":"2024-04-01T21:28:50Z","title":"Modality Translation for Object Detection Adaptation Without Forgetting\n Prior Knowledge","summary":" A common practice in deep learning consists of training large neural networks\non massive datasets to perform accurately for different domains and tasks.\nWhile this methodology may work well in numerous application areas, it only\napplies across modalities due to a larger distribution shift in data captured\nusing different sensors. This paper focuses on the problem of adapting a large\nobject detection model to one or multiple modalities while being efficient. To\ndo so, we propose ModTr as an alternative to the common approach of fine-tuning\nlarge models. ModTr consists of adapting the input with a small transformation\nnetwork trained to minimize the detection loss directly. The original model can\ntherefore work on the translated inputs without any further change or\nfine-tuning to its parameters. Experimental results on translating from IR to\nRGB images on two well-known datasets show that this simple ModTr approach\nprovides detectors that can perform comparably or better than the standard\nfine-tuning without forgetting the original knowledge. This opens the doors to\na more flexible and efficient service-based detection pipeline in which,\ninstead of using a different detector for each modality, a unique and unaltered\nserver is constantly running, where multiple modalities with the corresponding\ntranslations can query it. 
Code: https://github.com/heitorrapela/ModTr.\n","authors":["Heitor Rapela Medeiros","Masih Aminbeidokhti","Fidel Guerrero Pena","David Latortue","Eric Granger","Marco Pedersoli"],"pdf_url":"https://arxiv.org/pdf/2404.01492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01491v1","updated":"2024-04-01T21:23:03Z","published":"2024-04-01T21:23:03Z","title":"SUGAR: Pre-training 3D Visual Representations for Robotics","summary":" Learning generalizable visual representations from Internet data has yielded\npromising results for robotics. Yet, prevailing approaches focus on\npre-training 2D representations, being sub-optimal to deal with occlusions and\naccurately localize objects in complex 3D scenes. Meanwhile, 3D representation\nlearning has been limited to single-object understanding. To address these\nlimitations, we introduce a novel 3D pre-training framework for robotics named\nSUGAR that captures semantic, geometric and affordance properties of objects\nthrough 3D point clouds. We underscore the importance of cluttered scenes in 3D\nrepresentation learning, and automatically construct a multi-object dataset\nbenefiting from cost-free supervision in simulation. SUGAR employs a versatile\ntransformer-based model to jointly address five pre-training tasks, namely\ncross-modal knowledge distillation for semantic learning, masked point modeling\nto understand geometry structures, grasping pose synthesis for object\naffordance, 3D instance segmentation and referring expression grounding to\nanalyze cluttered scenes. We evaluate our learned representation on three\nrobotic-related tasks, namely, zero-shot 3D object recognition, referring\nexpression grounding, and language-driven robotic manipulation. Experimental\nresults show that SUGAR's 3D representation outperforms state-of-the-art 2D and\n3D representations.\n","authors":["Shizhe Chen","Ricardo Garcia","Ivan Laptev","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2404.01491v1.pdf","comment":"Accepted to CVPR 2024. Project webpage:\n https://cshizhe.github.io/projects/robot_sugar.html"},{"id":"http://arxiv.org/abs/2404.01486v1","updated":"2024-04-01T21:11:43Z","published":"2024-04-01T21:11:43Z","title":"QuAD: Query-based Interpretable Neural Motion Planning for Autonomous\n Driving","summary":" A self-driving vehicle must understand its environment to determine the\nappropriate action. Traditional autonomy systems rely on object detection to\nfind the agents in the scene. However, object detection assumes a discrete set\nof objects and loses information about uncertainty, so any errors compound when\npredicting the future behavior of those agents. Alternatively, dense occupancy\ngrid maps have been utilized to understand free-space. However, predicting a\ngrid for the entire scene is wasteful since only certain spatio-temporal\nregions are reachable and relevant to the self-driving vehicle. We present a\nunified, interpretable, and efficient autonomy framework that moves away from\ncascading modules that first perceive, then predict, and finally plan. Instead,\nwe shift the paradigm to have the planner query occupancy at relevant\nspatio-temporal points, restricting the computation to those regions of\ninterest. Exploiting this representation, we evaluate candidate trajectories\naround key factors such as collision avoidance, comfort, and progress for\nsafety and interpretability. 
Our approach achieves better highway driving\nquality than the state-of-the-art in high-fidelity closed-loop simulations.\n","authors":["Sourav Biswas","Sergio Casas","Quinlan Sykora","Ben Agro","Abbas Sadat","Raquel Urtasun"],"pdf_url":"https://arxiv.org/pdf/2404.01486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01476v1","updated":"2024-04-01T20:58:24Z","published":"2024-04-01T20:58:24Z","title":"TraveLER: A Multi-LMM Agent Framework for Video Question-Answering","summary":" Recently, Large Multimodal Models (LMMs) have made significant progress in\nvideo question-answering using a frame-wise approach by leveraging large-scale,\nimage-based pretraining in a zero-shot manner. While image-based methods for\nvideos have shown impressive performance, a current limitation is that they\noften overlook how key timestamps are selected and cannot adjust when incorrect\ntimestamps are identified. Moreover, they are unable to extract details\nrelevant to the question, instead providing general descriptions of the frame.\nTo overcome this, we design a multi-LMM agent framework that travels along the\nvideo, iteratively collecting relevant information from keyframes through\ninteractive question-asking until there is sufficient information to answer the\nquestion. Specifically, we propose TraveLER, a model that can create a plan to\n\"Traverse\" through the video, ask questions about individual frames to \"Locate\"\nand store key information, and then \"Evaluate\" if there is enough information\nto answer the question. Finally, if there is not enough information, our method\nis able to \"Replan\" based on its collected knowledge. Through extensive\nexperiments, we find that the proposed TraveLER approach improves performance\non several video question-answering benchmarks, such as NExT-QA, STAR, and\nPerception Test, without the need to fine-tune on specific datasets.\n","authors":["Chuyi Shang","Amos You","Sanjay Subramanian","Trevor Darrell","Roei Herzig"],"pdf_url":"https://arxiv.org/pdf/2404.01476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10516v2","updated":"2024-04-01T20:57:45Z","published":"2024-03-15T17:57:06Z","title":"FeatUp: A Model-Agnostic Framework for Features at Any Resolution","summary":" Deep features are a cornerstone of computer vision research, capturing image\nsemantics and enabling the community to solve downstream tasks even in the\nzero- or few-shot regime. However, these features often lack the spatial\nresolution to directly perform dense prediction tasks like segmentation and\ndepth prediction because models aggressively pool information over large areas.\nIn this work, we introduce FeatUp, a task- and model-agnostic framework to\nrestore lost spatial information in deep features. We introduce two variants of\nFeatUp: one that guides features with high-resolution signal in a single\nforward pass, and one that fits an implicit model to a single image to\nreconstruct features at any resolution. Both approaches use a multi-view\nconsistency loss with deep analogies to NeRFs. Our features retain their\noriginal semantics and can be swapped into existing applications to yield\nresolution and performance gains even without re-training. 
We show that FeatUp\nsignificantly outperforms other feature upsampling and image super-resolution\napproaches in class activation map generation, transfer learning for\nsegmentation and depth prediction, and end-to-end training for semantic\nsegmentation.\n","authors":["Stephanie Fu","Mark Hamilton","Laura Brandt","Axel Feldman","Zhoutong Zhang","William T. Freeman"],"pdf_url":"https://arxiv.org/pdf/2403.10516v2.pdf","comment":"Accepted to the International Conference on Learning Representations\n (ICLR) 2024"},{"id":"http://arxiv.org/abs/2404.01464v1","updated":"2024-04-01T20:25:04Z","published":"2024-04-01T20:25:04Z","title":"Data-Efficient Unsupervised Interpolation Without Any Intermediate Frame\n for 4D Medical Images","summary":" 4D medical images, which represent 3D images with temporal information, are\ncrucial in clinical practice for capturing dynamic changes and monitoring\nlong-term disease progression. However, acquiring 4D medical images poses\nchallenges due to factors such as radiation exposure and imaging duration,\nnecessitating a balance between achieving high temporal resolution and\nminimizing adverse effects. Given these circumstances, not only is data\nacquisition challenging, but increasing the frame rate for each dataset also\nproves difficult. To address this challenge, this paper proposes a simple yet\neffective Unsupervised Volumetric Interpolation framework, UVI-Net. This\nframework facilitates temporal interpolation without the need for any\nintermediate frames, distinguishing it from the majority of other existing\nunsupervised methods. Experiments on benchmark datasets demonstrate significant\nimprovements across diverse evaluation metrics compared to unsupervised and\nsupervised baselines. Remarkably, our approach achieves this superior\nperformance even when trained with a dataset as small as one, highlighting its\nexceptional robustness and efficiency in scenarios with sparse supervision.\nThis positions UVI-Net as a compelling alternative for 4D medical imaging,\nparticularly in settings where data availability is limited. The source code is\navailable at https://github.com/jungeun122333/UVI-Net.\n","authors":["JungEun Kim","Hangyul Yoon","Geondo Park","Kyungsu Kim","Eunho Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01464v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2310.16112v2","updated":"2024-04-01T20:18:02Z","published":"2023-10-24T18:26:22Z","title":"Towards long-tailed, multi-label disease classification from chest\n X-ray: Overview of the CXR-LT challenge","summary":" Many real-world image recognition problems, such as diagnostic medical\nimaging exams, are \"long-tailed\" $\\unicode{x2013}$ there are a few common\nfindings followed by many more relatively rare conditions. In chest\nradiography, diagnosis is both a long-tailed and multi-label problem, as\npatients often present with multiple findings simultaneously. While researchers\nhave begun to study the problem of long-tailed learning in medical image\nrecognition, few have studied the interaction of label imbalance and label\nco-occurrence posed by long-tailed, multi-label disease classification. To\nengage with the research community on this emerging topic, we conducted an open\nchallenge, CXR-LT, on long-tailed, multi-label thorax disease classification\nfrom chest X-rays (CXRs). We publicly release a large-scale benchmark dataset\nof over 350,000 CXRs, each labeled with at least one of 26 clinical findings\nfollowing a long-tailed distribution. 
We synthesize common themes of\ntop-performing solutions, providing practical recommendations for long-tailed,\nmulti-label medical image classification. Finally, we use these insights to\npropose a path forward involving vision-language foundation models for few- and\nzero-shot disease classification.\n","authors":["Gregory Holste","Yiliang Zhou","Song Wang","Ajay Jaiswal","Mingquan Lin","Sherry Zhuge","Yuzhe Yang","Dongkyun Kim","Trong-Hieu Nguyen-Mau","Minh-Triet Tran","Jaehyup Jeong","Wongi Park","Jongbin Ryu","Feng Hong","Arsh Verma","Yosuke Yamagishi","Changhyun Kim","Hyeryeong Seo","Myungjoo Kang","Leo Anthony Celi","Zhiyong Lu","Ronald M. Summers","George Shih","Zhangyang Wang","Yifan Peng"],"pdf_url":"https://arxiv.org/pdf/2310.16112v2.pdf","comment":"Update after major revision"},{"id":"http://arxiv.org/abs/2403.02090v2","updated":"2024-04-01T20:03:38Z","published":"2024-03-04T14:46:58Z","title":"Modeling Multimodal Social Interactions: New Challenges and Baselines\n with Densely Aligned Representations","summary":" Understanding social interactions involving both verbal and non-verbal cues\nis essential for effectively interpreting social situations. However, most\nprior works on multimodal social cues focus predominantly on single-person\nbehaviors or rely on holistic visual representations that are not aligned to\nutterances in multi-party environments. Consequently, they are limited in\nmodeling the intricate dynamics of multi-party interactions. In this paper, we\nintroduce three new challenging tasks to model the fine-grained dynamics\nbetween multiple people: speaking target identification, pronoun coreference\nresolution, and mentioned player prediction. We contribute extensive data\nannotations to curate these new challenges in social deduction game settings.\nFurthermore, we propose a novel multimodal baseline that leverages densely\naligned language-visual representations by synchronizing visual features with\ntheir corresponding utterances. This facilitates concurrently capturing verbal\nand non-verbal cues pertinent to social reasoning. Experiments demonstrate the\neffectiveness of the proposed approach with densely aligned multimodal\nrepresentations in modeling fine-grained social interactions. Project website:\nhttps://sangmin-git.github.io/projects/MMSI.\n","authors":["Sangmin Lee","Bolin Lai","Fiona Ryan","Bikram Boote","James M. Rehg"],"pdf_url":"https://arxiv.org/pdf/2403.02090v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01446v1","updated":"2024-04-01T19:33:41Z","published":"2024-04-01T19:33:41Z","title":"Finding Regions of Interest in Whole Slide Images Using Multiple\n Instance Learning","summary":" Whole Slide Images (WSI), obtained by high-resolution digital scanning of\nmicroscope slides at multiple scales, are the cornerstone of modern Digital\nPathology. However, they represent a particular challenge to\nAI-based/AI-mediated analysis because pathology labeling is typically done at\nslide-level, instead of tile-level. It is not just that medical diagnostics is\nrecorded at the specimen level, the detection of oncogene mutation is also\nexperimentally obtained, and recorded by initiatives like The Cancer Genome\nAtlas (TCGA), at the slide level. This configures a dual challenge: a)\naccurately predicting the overall cancer phenotype and b) finding out what\ncellular morphologies are associated with it at the tile level. 
To address\nthese challenges, a weakly supervised Multiple Instance Learning (MIL) approach\nwas explored for two prevalent cancer types, Invasive Breast Carcinoma\n(TCGA-BRCA) and Lung Squamous Cell Carcinoma (TCGA-LUSC). This approach was\nexplored for tumor detection at low magnification levels and TP53 mutations at\nvarious levels. Our results show that a novel additive implementation of MIL\nmatched the performance of the reference implementation (AUC 0.96), and was only\nslightly outperformed by Attention MIL (AUC 0.97). More interestingly from the\nperspective of the molecular pathologist, these different AI architectures\nidentify distinct sensitivities to morphological features (through the\ndetection of Regions of Interest, RoI) at different amplification levels.\nTellingly, TP53 mutation was most sensitive to features at the higher\namplifications where cellular morphology is resolved.\n","authors":["Martim Afonso","Praphulla M. S. Bhawsar","Monjoy Saha","Jonas S. Almeida","Arlindo L. Oliveira"],"pdf_url":"https://arxiv.org/pdf/2404.01446v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01440v1","updated":"2024-04-01T19:23:00Z","published":"2024-04-01T19:23:00Z","title":"Neural Implicit Representation for Building Digital Twins of Unknown\n Articulated Objects","summary":" We address the problem of building digital twins of unknown articulated\nobjects from two RGBD scans of the object at different articulation states. We\ndecompose the problem into two stages, each addressing distinct aspects. Our\nmethod first reconstructs object-level shape at each state, then recovers the\nunderlying articulation model including part segmentation and joint\narticulations that associate the two states. By explicitly modeling point-level\ncorrespondences and exploiting cues from images, 3D reconstructions, and\nkinematics, our method yields more accurate and stable results compared to\nprior work. It also handles more than one movable part and does not rely on any\nobject shape or structure priors. Project page:\nhttps://github.com/NVlabs/DigitalTwinArt\n","authors":["Yijia Weng","Bowen Wen","Jonathan Tremblay","Valts Blukis","Dieter Fox","Leonidas Guibas","Stan Birchfield"],"pdf_url":"https://arxiv.org/pdf/2404.01440v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01438v1","updated":"2024-04-01T19:22:43Z","published":"2024-04-01T19:22:43Z","title":"Generation and Detection of Sign Language Deepfakes - A Linguistic and\n Visual Analysis","summary":" A question in the realm of deepfakes is slowly emerging pertaining to whether\nwe can go beyond facial deepfakes and whether it would be beneficial to\nsociety. Therefore, this research presents a positive application of deepfake\ntechnology in upper body generation, while performing sign-language for the\nDeaf and Hard of Hearing (DHoH) community. The resulting videos are later\nvetted with a sign language expert. This is particularly helpful, given the\nintricate nature of sign language, a scarcity of sign language experts, and\npotential benefits for health and education. The objectives of this work\nencompass constructing a reliable deepfake dataset, evaluating its technical\nand visual credibility through computer vision and natural language processing\nmodels, and assessing the plausibility of the generated content. 
With over 1200\nvideos, featuring both previously seen and unseen individuals for the\ngeneration model, using the help of a sign language expert, we establish a\ndeepfake dataset in sign language that can further be utilized to detect fake\nvideos that may target certain people of determination.\n","authors":["Shahzeb Naeem","Muhammad Riyyan Khan","Usman Tariq","Abhinav Dhall","Carlos Ivan Colon","Hasan Al-Nashash"],"pdf_url":"https://arxiv.org/pdf/2404.01438v1.pdf","comment":"13 pages, 13 figures, Computer Vision and Image Understanding Journal"},{"id":"http://arxiv.org/abs/2404.01437v1","updated":"2024-04-01T19:20:32Z","published":"2024-04-01T19:20:32Z","title":"The Radar Ghost Dataset -- An Evaluation of Ghost Objects in Automotive\n Radar Data","summary":" Radar sensors have a long tradition in advanced driver assistance systems\n(ADAS) and also play a major role in current concepts for autonomous vehicles.\nTheir importance is reasoned by their high robustness against meteorological\neffects, such as rain, snow, or fog, and the radar's ability to measure\nrelative radial velocity differences via the Doppler effect. The cause for\nthese advantages, namely the large wavelength, is also one of the drawbacks of\nradar sensors. Compared to camera or lidar sensor, a lot more surfaces in a\ntypical traffic scenario appear flat relative to the radar's emitted signal.\nThis results in multi-path reflections or so called ghost detections in the\nradar signal. Ghost objects pose a major source for potential false positive\ndetections in a vehicle's perception pipeline. Therefore, it is important to be\nable to segregate multi-path reflections from direct ones. In this article, we\npresent a dataset with detailed manual annotations for different kinds of ghost\ndetections. Moreover, two different approaches for identifying these kinds of\nobjects are evaluated. We hope that our dataset encourages more researchers to\nengage in the fields of multi-path object suppression or exploitation.\n","authors":["Florian Kraus","Nicolas Scheiner","Werner Ritter","Klaus Dietmayer"],"pdf_url":"https://arxiv.org/pdf/2404.01437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14334v2","updated":"2024-04-01T19:16:24Z","published":"2023-05-23T17:58:05Z","title":"Diffusion Hyperfeatures: Searching Through Time and Space for Semantic\n Correspondence","summary":" Diffusion models have been shown to be capable of generating high-quality\nimages, suggesting that they could contain meaningful internal representations.\nUnfortunately, the feature maps that encode a diffusion model's internal\ninformation are spread not only over layers of the network, but also over\ndiffusion timesteps, making it challenging to extract useful descriptors. We\npropose Diffusion Hyperfeatures, a framework for consolidating multi-scale and\nmulti-timestep feature maps into per-pixel feature descriptors that can be used\nfor downstream tasks. These descriptors can be extracted for both synthetic and\nreal images using the generation and inversion processes. We evaluate the\nutility of our Diffusion Hyperfeatures on the task of semantic keypoint\ncorrespondence: our method achieves superior performance on the SPair-71k real\nimage benchmark. We also demonstrate that our method is flexible and\ntransferable: our feature aggregation network trained on the inversion features\nof real image pairs can be used on the generation features of synthetic image\npairs with unseen objects and compositions. 
Our code is available at\nhttps://diffusion-hyperfeatures.github.io.\n","authors":["Grace Luo","Lisa Dunlap","Dong Huk Park","Aleksander Holynski","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2305.14334v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2306.06189v2","updated":"2024-04-01T19:14:25Z","published":"2023-06-09T18:41:37Z","title":"FasterViT: Fast Vision Transformers with Hierarchical Attention","summary":" We design a new family of hybrid CNN-ViT neural networks, named FasterViT,\nwith a focus on high image throughput for computer vision (CV) applications.\nFasterViT combines the benefits of fast local representation learning in CNNs\nand global modeling properties in ViT. Our newly introduced Hierarchical\nAttention (HAT) approach decomposes global self-attention with quadratic\ncomplexity into a multi-level attention with reduced computational costs. We\nbenefit from efficient window-based self-attention. Each window has access to\ndedicated carrier tokens that participate in local and global representation\nlearning. At a high level, global self-attentions enable the efficient\ncross-window communication at lower costs. FasterViT achieves a SOTA\nPareto-front in terms of accuracy and image throughput. We have extensively\nvalidated its effectiveness on various CV tasks including classification,\nobject detection and segmentation. We also show that HAT can be used as a\nplug-and-play module for existing networks and enhance them. We further\ndemonstrate significantly faster and more accurate performance than competitive\ncounterparts for images with high resolution. Code is available at\nhttps://github.com/NVlabs/FasterViT.\n","authors":["Ali Hatamizadeh","Greg Heinrich","Hongxu Yin","Andrew Tao","Jose M. Alvarez","Jan Kautz","Pavlo Molchanov"],"pdf_url":"https://arxiv.org/pdf/2306.06189v2.pdf","comment":"ICLR'24 Accepted Paper"},{"id":"http://arxiv.org/abs/2404.01424v1","updated":"2024-04-01T18:59:13Z","published":"2024-04-01T18:59:13Z","title":"DPMesh: Exploiting Diffusion Prior for Occluded Human Mesh Recovery","summary":" The recovery of occluded human meshes presents challenges for current methods\ndue to the difficulty in extracting effective image features under severe\nocclusion. In this paper, we introduce DPMesh, an innovative framework for\noccluded human mesh recovery that capitalizes on the profound diffusion prior\nabout object structure and spatial relationships embedded in a pre-trained\ntext-to-image diffusion model. Unlike previous methods reliant on conventional\nbackbones for vanilla feature extraction, DPMesh seamlessly integrates the\npre-trained denoising U-Net with potent knowledge as its image backbone and\nperforms a single-step inference to provide occlusion-aware information. To\nenhance the perception capability for occluded poses, DPMesh incorporates\nwell-designed guidance via condition injection, which produces effective\ncontrols from 2D observations for the denoising U-Net. Furthermore, we explore\na dedicated noisy key-point reasoning approach to mitigate disturbances arising\nfrom occlusion and crowded scenarios. This strategy fully unleashes the\nperceptual capability of the diffusion prior, thereby enhancing accuracy.\nExtensive experiments affirm the efficacy of our framework, as we outperform\nstate-of-the-art methods on both occlusion-specific and standard datasets. 
The\npersuasive results underscore its ability to achieve precise and robust 3D\nhuman mesh recovery, particularly in challenging scenarios involving occlusion\nand crowded scenes.\n","authors":["Yixuan Zhu","Ao Li","Yansong Tang","Wenliang Zhao","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2404.01424v1.pdf","comment":"Accepted by IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR) 2024"},{"id":"http://arxiv.org/abs/2312.02139v2","updated":"2024-04-01T18:55:16Z","published":"2023-12-04T18:57:01Z","title":"DiffiT: Diffusion Vision Transformers for Image Generation","summary":" Diffusion models with their powerful expressivity and high sample quality\nhave achieved State-Of-The-Art (SOTA) performance in the generative domain. The\npioneering Vision Transformer (ViT) has also demonstrated strong modeling\ncapabilities and scalability, especially for recognition tasks. In this paper,\nwe study the effectiveness of ViTs in diffusion-based generative learning and\npropose a new model denoted as Diffusion Vision Transformers (DiffiT).\nSpecifically, we propose a methodology for finegrained control of the denoising\nprocess and introduce the Time-dependant Multihead Self Attention (TMSA)\nmechanism. DiffiT is surprisingly effective in generating high-fidelity images\nwith significantly better parameter efficiency. We also propose latent and\nimage space DiffiT models and show SOTA performance on a variety of\nclass-conditional and unconditional synthesis tasks at different resolutions.\nThe Latent DiffiT model achieves a new SOTA FID score of 1.73 on ImageNet-256\ndataset while having 19.85%, 16.88% less parameters than other\nTransformer-based diffusion models such as MDT and DiT, respectively. Code:\nhttps://github.com/NVlabs/DiffiT\n","authors":["Ali Hatamizadeh","Jiaming Song","Guilin Liu","Jan Kautz","Arash Vahdat"],"pdf_url":"https://arxiv.org/pdf/2312.02139v2.pdf","comment":"Revised Tech report"},{"id":"http://arxiv.org/abs/2404.01415v1","updated":"2024-04-01T18:41:30Z","published":"2024-04-01T18:41:30Z","title":"On the Faithfulness of Vision Transformer Explanations","summary":" To interpret Vision Transformers, post-hoc explanations assign salience\nscores to input pixels, providing human-understandable heatmaps. However,\nwhether these interpretations reflect true rationales behind the model's output\nis still underexplored. To address this gap, we study the faithfulness\ncriterion of explanations: the assigned salience scores should represent the\ninfluence of the corresponding input pixels on the model's predictions. To\nevaluate faithfulness, we introduce Salience-guided Faithfulness Coefficient\n(SaCo), a novel evaluation metric leveraging essential information of salience\ndistribution. Specifically, we conduct pair-wise comparisons among distinct\npixel groups and then aggregate the differences in their salience scores,\nresulting in a coefficient that indicates the explanation's degree of\nfaithfulness. Our explorations reveal that current metrics struggle to\ndifferentiate between advanced explanation methods and Random Attribution,\nthereby failing to capture the faithfulness property. In contrast, our proposed\nSaCo offers a reliable faithfulness measurement, establishing a robust metric\nfor interpretations. 
Furthermore, our SaCo demonstrates that the use of\ngradient and multi-layer aggregation can markedly enhance the faithfulness of\nattention-based explanation, shedding light on potential paths for advancing\nVision Transformer explainability.\n","authors":["Junyi Wu","Weitai Kang","Hao Tang","Yuan Hong","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2404.01415v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01409v1","updated":"2024-04-01T18:26:29Z","published":"2024-04-01T18:26:29Z","title":"OVFoodSeg: Elevating Open-Vocabulary Food Image Segmentation via\n Image-Informed Textual Representation","summary":" In the realm of food computing, segmenting ingredients from images poses\nsubstantial challenges due to the large intra-class variance among the same\ningredients, the emergence of new ingredients, and the high annotation costs\nassociated with large food segmentation datasets. Existing approaches primarily\nutilize a closed-vocabulary and static text embeddings setting. These methods\noften fall short in effectively handling the ingredients, particularly new and\ndiverse ones. In response to these limitations, we introduce OVFoodSeg, a\nframework that adopts an open-vocabulary setting and enhances text embeddings\nwith visual context. By integrating vision-language models (VLMs), our approach\nenriches text embedding with image-specific information through two innovative\nmodules, e.g., an image-to-text learner FoodLearner and an Image-Informed Text\nEncoder. The training process of OVFoodSeg is divided into two stages: the\npre-training of FoodLearner and the subsequent learning phase for segmentation.\nThe pre-training phase equips FoodLearner with the capability to align visual\ninformation with corresponding textual representations that are specifically\nrelated to food, while the second phase adapts both the FoodLearner and the\nImage-Informed Text Encoder for the segmentation task. By addressing the\ndeficiencies of previous models, OVFoodSeg demonstrates a significant\nimprovement, achieving a 4.9\\% increase in mean Intersection over Union (mIoU)\non the FoodSeg103 dataset, setting a new milestone for food image segmentation.\n","authors":["Xiongwei Wu","Sicheng Yu","Ee-Peng Lim","Chong-Wah Ngo"],"pdf_url":"https://arxiv.org/pdf/2404.01409v1.pdf","comment":"CVPR 2024; 12 pages"},{"id":"http://arxiv.org/abs/2403.18807v3","updated":"2024-04-01T18:26:22Z","published":"2024-03-27T17:53:30Z","title":"ECoDepth: Effective Conditioning of Diffusion Models for Monocular Depth\n Estimation","summary":" In the absence of parallax cues, a learning-based single image depth\nestimation (SIDE) model relies heavily on shading and contextual cues in the\nimage. While this simplicity is attractive, it is necessary to train such\nmodels on large and varied datasets, which are difficult to capture. It has\nbeen shown that using embeddings from pre-trained foundational models, such as\nCLIP, improves zero shot transfer in several applications. Taking inspiration\nfrom this, in our paper we explore the use of global image priors generated\nfrom a pre-trained ViT model to provide more detailed contextual information.\nWe argue that the embedding vector from a ViT model, pre-trained on a large\ndataset, captures greater relevant information for SIDE than the usual route of\ngenerating pseudo image captions, followed by CLIP based text embeddings. Based\non this idea, we propose a new SIDE model using a diffusion backbone which is\nconditioned on ViT embeddings. 
Our proposed design establishes a new\nstate-of-the-art (SOTA) for SIDE on NYUv2 dataset, achieving Abs Rel error of\n0.059(14% improvement) compared to 0.069 by the current SOTA (VPD). And on\nKITTI dataset, achieving Sq Rel error of 0.139 (2% improvement) compared to\n0.142 by the current SOTA (GEDepth). For zero-shot transfer with a model\ntrained on NYUv2, we report mean relative improvement of (20%, 23%, 81%, 25%)\nover NeWCRFs on (Sun-RGBD, iBims1, DIODE, HyperSim) datasets, compared to (16%,\n18%, 45%, 9%) by ZoeDepth. The project page is available at\nhttps://ecodepth-iitd.github.io\n","authors":["Suraj Patni","Aradhye Agarwal","Chetan Arora"],"pdf_url":"https://arxiv.org/pdf/2403.18807v3.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)\n 2024"},{"id":"http://arxiv.org/abs/2404.01402v1","updated":"2024-04-01T18:12:09Z","published":"2024-04-01T18:12:09Z","title":"ContactHandover: Contact-Guided Robot-to-Human Object Handover","summary":" Robot-to-human object handover is an important step in many human robot\ncollaboration tasks. A successful handover requires the robot to maintain a\nstable grasp on the object while making sure the human receives the object in a\nnatural and easy-to-use manner. We propose ContactHandover, a robot to human\nhandover system that consists of two phases: a contact-guided grasping phase\nand an object delivery phase. During the grasping phase, ContactHandover\npredicts both 6-DoF robot grasp poses and a 3D affordance map of human contact\npoints on the object. The robot grasp poses are reranked by penalizing those\nthat block human contact points, and the robot executes the highest ranking\ngrasp. During the delivery phase, the robot end effector pose is computed by\nmaximizing human contact points close to the human while minimizing the human\narm joint torques and displacements. We evaluate our system on 27 diverse\nhousehold objects and show that our system achieves better visibility and\nreachability of human contacts to the receiver compared to several baselines.\nMore results can be found on\nhttps://clairezixiwang.github.io/ContactHandover.github.io\n","authors":["Zixi Wang","Zeyi Liu","Nicolas Ouporov","Shuran Song"],"pdf_url":"https://arxiv.org/pdf/2404.01402v1.pdf","comment":"Project website:\n https://clairezixiwang.github.io/ContactHandover.github.io/"},{"id":"http://arxiv.org/abs/2308.00622v2","updated":"2024-04-01T18:10:28Z","published":"2023-08-01T15:49:40Z","title":"NeRT: Implicit Neural Representations for General Unsupervised\n Turbulence Mitigation","summary":" The atmospheric and water turbulence mitigation problems have emerged as\nchallenging inverse problems in computer vision and optics communities over the\nyears. However, current methods either rely heavily on the quality of the\ntraining dataset or fail to generalize over various scenarios, such as static\nscenes, dynamic scenes, and text reconstructions. We propose a general implicit\nneural representation for unsupervised atmospheric and water turbulence\nmitigation (NeRT). NeRT leverages the implicit neural representations and the\nphysically correct tilt-then-blur turbulence model to reconstruct the clean,\nundistorted image, given only dozens of distorted input images. 
Moreover, we\nshow that NeRT outperforms the state-of-the-art through various qualitative and\nquantitative evaluations of atmospheric and water turbulence datasets.\nFurthermore, we demonstrate the ability of NeRT to eliminate uncontrolled\nturbulence from real-world environments. Lastly, we incorporate NeRT into\ncontinuously captured video sequences and demonstrate $48 \\times$ speedup.\n","authors":["Weiyun Jiang","Yuhao Liu","Vivek Boominathan","Ashok Veeraraghavan"],"pdf_url":"https://arxiv.org/pdf/2308.00622v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19022v2","updated":"2024-04-01T18:09:49Z","published":"2024-03-27T21:24:20Z","title":"WALT3D: Generating Realistic Training Data from Time-Lapse Imagery for\n Reconstructing Dynamic Objects under Occlusion","summary":" Current methods for 2D and 3D object understanding struggle with severe\nocclusions in busy urban environments, partly due to the lack of large-scale\nlabeled ground-truth annotations for learning occlusion. In this work, we\nintroduce a novel framework for automatically generating a large, realistic\ndataset of dynamic objects under occlusions using freely available time-lapse\nimagery. By leveraging off-the-shelf 2D (bounding box, segmentation, keypoint)\nand 3D (pose, shape) predictions as pseudo-groundtruth, unoccluded 3D objects\nare identified automatically and composited into the background in a clip-art\nstyle, ensuring realistic appearances and physically accurate occlusion\nconfigurations. The resulting clip-art image with pseudo-groundtruth enables\nefficient training of object reconstruction methods that are robust to\nocclusions. Our method demonstrates significant improvements in both 2D and 3D\nreconstruction, particularly in scenarios with heavily occluded objects like\nvehicles and people in urban scenes.\n","authors":["Khiem Vuong","N. Dinesh Reddy","Robert Tamburo","Srinivasa G. Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2403.19022v2.pdf","comment":"To appear in CVPR 2024. Homepage: https://www.cs.cmu.edu/~walt3d"},{"id":"http://arxiv.org/abs/2404.01397v1","updated":"2024-04-01T18:08:58Z","published":"2024-04-01T18:08:58Z","title":"Object-conditioned Bag of Instances for Few-Shot Personalized Instance\n Recognition","summary":" Nowadays, users demand for increased personalization of vision systems to\nlocalize and identify personal instances of objects (e.g., my dog rather than\ndog) from a few-shot dataset only. Despite outstanding results of deep networks\non classical label-abundant benchmarks (e.g., those of the latest YOLOv8 model\nfor standard object detection), they struggle to maintain within-class\nvariability to represent different instances rather than object categories\nonly. We construct an Object-conditioned Bag of Instances (OBoI) based on\nmulti-order statistics of extracted features, where generic object detection\nmodels are extended to search and identify personal instances from the OBoI's\nmetric space, without need for backpropagation. By relying on multi-order\nstatistics, OBoI achieves consistent superior accuracy in distinguishing\ndifferent instances. In the results, we achieve 77.1% personal object\nrecognition accuracy in case of 18 personal instances, showing about 12%\nrelative gain over the state of the art.\n","authors":["Umberto Michieli","Jijoong Moon","Daehyun Kim","Mete Ozay"],"pdf_url":"https://arxiv.org/pdf/2404.01397v1.pdf","comment":"ICASSP 2024. Copyright 2024 IEEE. Personal use of this material is\n permitted. 
Permission from IEEE must be obtained for all other uses, in any\n current or future media, including reprinting/republishing this material for\n advertising or promotional purposes, creating new collective works, for\n resale or redistribution to servers or lists, or reuse of any copyrighted\n component of this work in other"},{"id":"http://arxiv.org/abs/2404.01300v1","updated":"2024-04-01T17:59:55Z","published":"2024-04-01T17:59:55Z","title":"NeRF-MAE : Masked AutoEncoders for Self Supervised 3D representation\n Learning for Neural Radiance Fields","summary":" Neural fields excel in computer vision and robotics due to their ability to\nunderstand the 3D visual world such as inferring semantics, geometry, and\ndynamics. Given the capabilities of neural fields in densely representing a 3D\nscene from 2D images, we ask the question: Can we scale their self-supervised\npretraining, specifically using masked autoencoders, to generate effective 3D\nrepresentations from posed RGB images. Owing to the astounding success of\nextending transformers to novel data modalities, we employ standard 3D Vision\nTransformers to suit the unique formulation of NeRFs. We leverage NeRF's\nvolumetric grid as a dense input to the transformer, contrasting it with other\n3D representations such as pointclouds where the information density can be\nuneven, and the representation is irregular. Due to the difficulty of applying\nmasked autoencoders to an implicit representation, such as NeRF, we opt for\nextracting an explicit representation that canonicalizes scenes across domains\nby employing the camera trajectory for sampling. Our goal is made possible by\nmasking random patches from NeRF's radiance and density grid and employing a\nstandard 3D Swin Transformer to reconstruct the masked patches. In doing so,\nthe model can learn the semantic and spatial structure of complete scenes. We\npretrain this representation at scale on our proposed curated posed-RGB data,\ntotaling over 1.6 million images. Once pretrained, the encoder is used for\neffective 3D transfer learning. Our novel self-supervised pretraining for\nNeRFs, NeRF-MAE, scales remarkably well and improves performance on various\nchallenging 3D tasks. Utilizing unlabeled posed 2D data for pretraining,\nNeRF-MAE significantly outperforms self-supervised 3D pretraining and NeRF\nscene understanding baselines on Front3D and ScanNet datasets with an absolute\nperformance improvement of over 20% AP50 and 8% AP25 for 3D object detection.\n","authors":["Muhammad Zubair Irshad","Sergey Zakahrov","Vitor Guizilini","Adrien Gaidon","Zsolt Kira","Rares Ambrus"],"pdf_url":"https://arxiv.org/pdf/2404.01300v1.pdf","comment":"29 pages, 13 figures. Project Page: https://nerf-mae.github.io/"},{"id":"http://arxiv.org/abs/2404.01298v1","updated":"2024-04-01T17:59:53Z","published":"2024-04-01T17:59:53Z","title":"Noise2Image: Noise-Enabled Static Scene Recovery for Event Cameras","summary":" Event cameras capture changes of intensity over time as a stream of 'events'\nand generally cannot measure intensity itself; hence, they are only used for\nimaging dynamic scenes. However, fluctuations due to random photon arrival\ninevitably trigger noise events, even for static scenes. While previous efforts\nhave been focused on filtering out these undesirable noise events to improve\nsignal quality, we find that, in the photon-noise regime, these noise events\nare correlated with the static scene intensity. 
We analyze the noise event\ngeneration and model its relationship to illuminance. Based on this\nunderstanding, we propose a method, called Noise2Image, to leverage the\nilluminance-dependent noise characteristics to recover the static parts of a\nscene, which are otherwise invisible to event cameras. We experimentally\ncollect a dataset of noise events on static scenes to train and validate\nNoise2Image. Our results show that Noise2Image can robustly recover intensity\nimages solely from noise events, providing a novel approach for capturing\nstatic scenes in event cameras, without additional hardware.\n","authors":["Ruiming Cao","Dekel Galor","Amit Kohli","Jacob L Yates","Laura Waller"],"pdf_url":"https://arxiv.org/pdf/2404.01298v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01299v1","updated":"2024-04-01T17:59:53Z","published":"2024-04-01T17:59:53Z","title":"CausalChaos! Dataset for Comprehensive Causal Action Question Answering\n Over Longer Causal Chains Grounded in Dynamic Visual Scenes","summary":" Causal video question answering (QA) has garnered increasing interest, yet\nexisting datasets often lack depth in causal reasoning analysis. To address\nthis gap, we capitalize on the unique properties of cartoons and construct\nCausalChaos!, a novel, challenging causal Why-QA dataset built upon the iconic\n\"Tom and Jerry\" cartoon series. With thoughtful questions and multi-level\nanswers, our dataset contains much longer causal chains embedded in dynamic\ninteractions and visuals, while at the same time principles of animation allow\nanimators to create well-defined, unambiguous causal relationships. These\nfactors allow models to solve more challenging, yet well-defined causal\nrelationships. We also introduce hard negative mining, including a\nCausalConfusion version. While models perform well, there is much room for\nimprovement, especially on open-ended answers. We identify more\nadvanced/explicit causal relationship modeling and joint modeling of vision and\nlanguage as the immediate areas for future efforts to focus upon. Along with\nthe other complementary datasets, our new challenging dataset will pave the way\nfor these developments in the field. We will release our dataset, codes, and\nmodels to help future efforts in this domain.\n","authors":["Ting En Lam","Yuhan Chen","Elston Tan","Eric Peh","Ruirui Chen","Paritosh Parmar","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2404.01299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00610v2","updated":"2024-04-01T17:59:52Z","published":"2023-09-01T17:57:02Z","title":"CityDreamer: Compositional Generative Model of Unbounded 3D Cities","summary":" 3D city generation is a desirable yet challenging task, since humans are more\nsensitive to structural distortions in urban environments. Additionally,\ngenerating 3D cities is more complex than 3D natural scenes since buildings, as\nobjects of the same class, exhibit a wider range of appearances compared to the\nrelatively consistent appearance of objects like trees in natural scenes. To\naddress these challenges, we propose \\textbf{CityDreamer}, a compositional\ngenerative model designed specifically for unbounded 3D cities. Our key insight\nis that 3D city generation should be a composition of different types of neural\nfields: 1) various building instances, and 2) background stuff, such as roads\nand green lands. 
Specifically, we adopt the bird's eye view scene\nrepresentation and employ a volumetric renderer for both instance-oriented and\nstuff-oriented neural fields. The generative hash grid and periodic positional\nembedding are tailored as scene parameterization to suit the distinct\ncharacteristics of building instances and background stuff. Furthermore, we\ncontribute a suite of CityGen Datasets, including OSM and GoogleEarth, which\ncomprises a vast amount of real-world city imagery to enhance the realism of\nthe generated 3D cities both in their layouts and appearances. CityDreamer\nachieves state-of-the-art performance not only in generating realistic 3D\ncities but also in localized editing within the generated cities.\n","authors":["Haozhe Xie","Zhaoxi Chen","Fangzhou Hong","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2309.00610v2.pdf","comment":"CVPR 2024. Project page: https://haozhexie.com/project/city-dreamer"},{"id":"http://arxiv.org/abs/2404.01367v1","updated":"2024-04-01T17:59:48Z","published":"2024-04-01T17:59:48Z","title":"Bigger is not Always Better: Scaling Properties of Latent Diffusion\n Models","summary":" We study the scaling properties of latent diffusion models (LDMs) with an\nemphasis on their sampling efficiency. While improved network architecture and\ninference algorithms have been shown to effectively boost sampling efficiency of\ndiffusion models, the role of model size -- a critical determinant of sampling\nefficiency -- has not been thoroughly examined. Through empirical analysis of\nestablished text-to-image diffusion models, we conduct an in-depth\ninvestigation into how model size influences sampling efficiency across varying\nsampling steps. Our findings unveil a surprising trend: when operating under a\ngiven inference budget, smaller models frequently outperform their larger\nequivalents in generating high-quality results. Moreover, we extend our study\nto demonstrate the generalizability of these findings by applying various\ndiffusion samplers, exploring diverse downstream tasks, evaluating\npost-distilled models, as well as comparing performance relative to training\ncompute. These findings open up new pathways for the development of LDM scaling\nstrategies which can be employed to enhance generative capabilities within\nlimited inference budgets.\n","authors":["Kangfu Mei","Zhengzhong Tu","Mauricio Delbracio","Hossein Talebi","Vishal M. Patel","Peyman Milanfar"],"pdf_url":"https://arxiv.org/pdf/2404.01367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01297v1","updated":"2024-04-01T17:59:15Z","published":"2024-04-01T17:59:15Z","title":"Streaming Dense Video Captioning","summary":" An ideal model for dense video captioning -- predicting captions localized\ntemporally in a video -- should be able to handle long input videos, predict\nrich, detailed textual descriptions, and be able to produce outputs before\nprocessing the entire video. Current state-of-the-art models, however, process\na fixed number of downsampled frames, and make a single full prediction after\nseeing the whole video. We propose a streaming dense video captioning model\nthat consists of two novel components: First, we propose a new memory module,\nbased on clustering incoming tokens, which can handle arbitrarily long videos\nas the memory is of a fixed size. Second, we develop a streaming decoding\nalgorithm that enables our model to make predictions before the entire video\nhas been processed. 
Our model achieves this streaming ability, and\nsignificantly improves the state-of-the-art on three dense video captioning\nbenchmarks: ActivityNet, YouCook2 and ViTT. Our code is released at\nhttps://github.com/google-research/scenic.\n","authors":["Xingyi Zhou","Anurag Arnab","Shyamal Buch","Shen Yan","Austin Myers","Xuehan Xiong","Arsha Nagrani","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2404.01297v1.pdf","comment":"CVPR 2024. Code is available at\n https://github.com/google-research/scenic/tree/main/scenic/projects/streaming_dvc"},{"id":"http://arxiv.org/abs/2404.01296v1","updated":"2024-04-01T17:59:11Z","published":"2024-04-01T17:59:11Z","title":"MagicMirror: Fast and High-Quality Avatar Generation with a Constrained\n Search Space","summary":" We introduce a novel framework for 3D human avatar generation and\npersonalization, leveraging text prompts to enhance user engagement and\ncustomization. Central to our approach are key innovations aimed at overcoming\nthe challenges in photo-realistic avatar synthesis. Firstly, we utilize a\nconditional Neural Radiance Fields (NeRF) model, trained on a large-scale\nunannotated multi-view dataset, to create a versatile initial solution space\nthat accelerates and diversifies avatar generation. Secondly, we develop a\ngeometric prior, leveraging the capabilities of Text-to-Image Diffusion Models,\nto ensure superior view invariance and enable direct optimization of avatar\ngeometry. These foundational ideas are complemented by our optimization\npipeline built on Variational Score Distillation (VSD), which mitigates texture\nloss and over-saturation issues. As supported by our extensive experiments,\nthese strategies collectively enable the creation of custom avatars with\nunparalleled visual quality and better adherence to input text prompts. You can\nfind more results and videos in our website:\nhttps://syntec-research.github.io/MagicMirror\n","authors":["Armand Comas-Massagué","Di Qiu","Menglei Chai","Marcel Bühler","Amit Raj","Ruiqi Gao","Qiangeng Xu","Mark Matthews","Paulo Gotardo","Octavia Camps","Sergio Orts-Escolano","Thabo Beeler"],"pdf_url":"https://arxiv.org/pdf/2404.01296v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01294v1","updated":"2024-04-01T17:59:05Z","published":"2024-04-01T17:59:05Z","title":"CosmicMan: A Text-to-Image Foundation Model for Humans","summary":" We present CosmicMan, a text-to-image foundation model specialized for\ngenerating high-fidelity human images. Unlike current general-purpose\nfoundation models that are stuck in the dilemma of inferior quality and\ntext-image misalignment for humans, CosmicMan enables generating\nphoto-realistic human images with meticulous appearance, reasonable structure,\nand precise text-image alignment with detailed dense descriptions. At the heart\nof CosmicMan's success are the new reflections and perspectives on data and\nmodels: (1) We found that data quality and a scalable data production flow are\nessential for the final results from trained models. Hence, we propose a new\ndata production paradigm, Annotate Anyone, which serves as a perpetual data\nflywheel to produce high-quality data with accurate yet cost-effective\nannotations over time. Based on this, we constructed a large-scale dataset,\nCosmicMan-HQ 1.0, with 6 Million high-quality real-world human images in a mean\nresolution of 1488x1255, and attached with precise text annotations deriving\nfrom 115 Million attributes in diverse granularities. 
(2) We argue that a\ntext-to-image foundation model specialized for humans must be pragmatic -- easy\nto integrate into down-streaming tasks while effective in producing\nhigh-quality human images. Hence, we propose to model the relationship between\ndense text descriptions and image pixels in a decomposed manner, and present\nDecomposed-Attention-Refocusing (Daring) training framework. It seamlessly\ndecomposes the cross-attention features in existing text-to-image diffusion\nmodel, and enforces attention refocusing without adding extra modules. Through\nDaring, we show that explicitly discretizing continuous text space into several\nbasic groups that align with human body structure is the key to tackling the\nmisalignment problem in a breeze.\n","authors":["Shikai Li","Jianglin Fu","Kaiyuan Liu","Wentao Wang","Kwan-Yee Lin","Wayne Wu"],"pdf_url":"https://arxiv.org/pdf/2404.01294v1.pdf","comment":"Accepted by CVPR 2024. The supplementary material is included.\n Project Page: https://cosmicman-cvpr2024.github.io"},{"id":"http://arxiv.org/abs/2404.01292v1","updated":"2024-04-01T17:58:30Z","published":"2024-04-01T17:58:30Z","title":"Measuring Style Similarity in Diffusion Models","summary":" Generative models are now widely used by graphic designers and artists. Prior\nworks have shown that these models remember and often replicate content from\ntheir training data during generation. Hence as their proliferation increases,\nit has become important to perform a database search to determine whether the\nproperties of the image are attributable to specific training data, every time\nbefore a generated image is used for professional purposes. Existing tools for\nthis purpose focus on retrieving images of similar semantic content. Meanwhile,\nmany artists are concerned with style replication in text-to-image models. We\npresent a framework for understanding and extracting style descriptors from\nimages. Our framework comprises a new dataset curated using the insight that\nstyle is a subjective property of an image that captures complex yet meaningful\ninteractions of factors including but not limited to colors, textures, shapes,\netc. We also propose a method to extract style descriptors that can be used to\nattribute style of a generated image to the images used in the training dataset\nof a text-to-image model. We showcase promising results in various style\nretrieval tasks. We also quantitatively and qualitatively analyze style\nattribution and matching in the Stable Diffusion model. Code and artifacts are\navailable at https://github.com/learn2phoenix/CSD.\n","authors":["Gowthami Somepalli","Anubhav Gupta","Kamal Gupta","Shramay Palta","Micah Goldblum","Jonas Geiping","Abhinav Shrivastava","Tom Goldstein"],"pdf_url":"https://arxiv.org/pdf/2404.01292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01291v1","updated":"2024-04-01T17:58:06Z","published":"2024-04-01T17:58:06Z","title":"Evaluating Text-to-Visual Generation with Image-to-Text Generation","summary":" Despite significant progress in generative AI, comprehensive evaluation\nremains challenging because of the lack of effective metrics and standardized\nbenchmarks. For instance, the widely-used CLIPScore measures the alignment\nbetween a (generated) image and text prompt, but it fails to produce reliable\nscores for complex prompts involving compositions of objects, attributes, and\nrelations. 
One reason is that text encoders of CLIP can notoriously act as a\n\"bag of words\", conflating prompts such as \"the horse is eating the grass\" with\n\"the grass is eating the horse\". To address this, we introduce the VQAScore,\nwhich uses a visual-question-answering (VQA) model to produce an alignment\nscore by computing the probability of a \"Yes\" answer to a simple \"Does this\nfigure show '{text}'?\" question. Though simpler than prior art, VQAScore\ncomputed with off-the-shelf models produces state-of-the-art results across\nmany (8) image-text alignment benchmarks. We also compute VQAScore with an\nin-house model that follows best practices in the literature. For example, we\nuse a bidirectional image-question encoder that allows image embeddings to\ndepend on the question being asked (and vice versa). Our in-house model,\nCLIP-FlanT5, outperforms even the strongest baselines that make use of the\nproprietary GPT-4V. Interestingly, although we train with only images, VQAScore\ncan also align text with video and 3D models. VQAScore allows researchers to\nbenchmark text-to-visual generation using complex texts that capture the\ncompositional structure of real-world prompts. We introduce GenAI-Bench, a more\nchallenging benchmark with 1,600 compositional text prompts that require\nparsing scenes, objects, attributes, relationships, and high-order reasoning\nlike comparison and logic. GenAI-Bench also offers over 15,000 human ratings\nfor leading image and video generation models such as Stable Diffusion, DALL-E\n3, and Gen2.\n","authors":["Zhiqiu Lin","Deepak Pathak","Baiqi Li","Jiayao Li","Xide Xia","Graham Neubig","Pengchuan Zhang","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2404.01291v1.pdf","comment":"We open-source our data, model, and code at:\n https://github.com/linzhiqiu/t2v_metrics ; Project page:\n https://linzhiqiu.github.io/papers/vqascore"},{"id":"http://arxiv.org/abs/2403.13802v2","updated":"2024-04-01T17:58:02Z","published":"2024-03-20T17:59:14Z","title":"ZigMa: A DiT-style Zigzag Mamba Diffusion Model","summary":" The diffusion model has long been plagued by scalability and quadratic\ncomplexity issues, especially within transformer-based structures. In this\nstudy, we aim to leverage the long sequence modeling capability of a\nState-Space Model called Mamba to extend its applicability to visual data\ngeneration. Firstly, we identify a critical oversight in most current\nMamba-based vision methods, namely the lack of consideration for spatial\ncontinuity in the scan scheme of Mamba. Secondly, building upon this insight,\nwe introduce a simple, plug-and-play, zero-parameter method named Zigzag Mamba,\nwhich outperforms Mamba-based baselines and demonstrates improved speed and\nmemory utilization compared to transformer-based baselines. Lastly, we\nintegrate Zigzag Mamba with the Stochastic Interpolant framework to investigate\nthe scalability of the model on large-resolution visual datasets, such as\nFacesHQ $1024\\times 1024$ and UCF101, MultiModal-CelebA-HQ, and MS COCO\n$256\\times 256$ . 
Code will be released at https://taohu.me/zigma/\n","authors":["Vincent Tao Hu","Stefan Andreas Baumann","Ming Gui","Olga Grebenkova","Pingchuan Ma","Johannes Fischer","Björn Ommer"],"pdf_url":"https://arxiv.org/pdf/2403.13802v2.pdf","comment":"Project Page: https://taohu.me/zigma/"},{"id":"http://arxiv.org/abs/2404.01284v1","updated":"2024-04-01T17:55:11Z","published":"2024-04-01T17:55:11Z","title":"Large Motion Model for Unified Multi-Modal Motion Generation","summary":" Human motion generation, a cornerstone technique in animation and video\nproduction, has widespread applications in various tasks like text-to-motion\nand music-to-dance. Previous works focus on developing specialist models\ntailored for each task without scalability. In this work, we present Large\nMotion Model (LMM), a motion-centric, multi-modal framework that unifies\nmainstream motion generation tasks into a generalist model. A unified motion\nmodel is appealing since it can leverage a wide range of motion data to achieve\nbroad generalization beyond a single task. However, it is also challenging due\nto the heterogeneous nature of substantially different motion data and tasks.\nLMM tackles these challenges from three principled aspects: 1) Data: We\nconsolidate datasets with different modalities, formats and tasks into a\ncomprehensive yet unified motion generation dataset, MotionVerse, comprising 10\ntasks, 16 datasets, a total of 320k sequences, and 100 million frames. 2)\nArchitecture: We design an articulated attention mechanism ArtAttention that\nincorporates body part-aware modeling into Diffusion Transformer backbone. 3)\nPre-Training: We propose a novel pre-training strategy for LMM, which employs\nvariable frame rates and masking forms, to better exploit knowledge from\ndiverse training data. Extensive experiments demonstrate that our generalist\nLMM achieves competitive performance across various standard motion generation\ntasks over state-of-the-art specialist models. Notably, LMM exhibits strong\ngeneralization capabilities and emerging properties across many unseen tasks.\nAdditionally, our ablation studies reveal valuable insights about training and\nscaling up large motion models for future research.\n","authors":["Mingyuan Zhang","Daisheng Jin","Chenyang Gu","Fangzhou Hong","Zhongang Cai","Jingfang Huang","Chongzhi Zhang","Xinying Guo","Lei Yang","Ying He","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2404.01284v1.pdf","comment":"Homepage: https://mingyuan-zhang.github.io/projects/LMM.html"},{"id":"http://arxiv.org/abs/2404.01282v1","updated":"2024-04-01T17:54:34Z","published":"2024-04-01T17:54:34Z","title":"LoSA: Long-Short-range Adapter for Scaling End-to-End Temporal Action\n Localization","summary":" Temporal Action Localization (TAL) involves localizing and classifying action\nsnippets in an untrimmed video. The emergence of large video foundation models\nhas led RGB-only video backbones to outperform previous methods needing both\nRGB and optical flow modalities. Leveraging these large models is often limited\nto training only the TAL head due to the prohibitively large GPU memory\nrequired to adapt the video backbone for TAL. To overcome this limitation, we\nintroduce LoSA, the first memory-and-parameter-efficient backbone adapter\ndesigned specifically for TAL to handle untrimmed videos. LoSA specializes for\nTAL by introducing Long-Short-range Adapters that adapt the intermediate layers\nof the video backbone over different temporal ranges. 
These adapters run\nparallel to the video backbone to significantly reduce memory footprint. LoSA\nalso includes Long-Short-range Fusion that strategically combines the output of\nthese adapters from the video backbone layers to enhance the video features\nprovided to the TAL head. Experiments show that LoSA significantly outperforms\nall existing methods on standard TAL benchmarks, THUMOS-14 and\nActivityNet-v1.3, by scaling end-to-end backbone adaptation to\nbillion-parameter-plus models like VideoMAEv2~(ViT-g) and leveraging them\nbeyond head-only transfer learning.\n","authors":["Akshita Gupta","Gaurav Mittal","Ahmed Magooda","Ye Yu","Graham W. Taylor","Mei Chen"],"pdf_url":"https://arxiv.org/pdf/2404.01282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01278v1","updated":"2024-04-01T17:52:17Z","published":"2024-04-01T17:52:17Z","title":"BiPer: Binary Neural Networks using a Periodic Function","summary":" Quantized neural networks employ reduced precision representations for both\nweights and activations. This quantization process significantly reduces the\nmemory requirements and computational complexity of the network. Binary Neural\nNetworks (BNNs) are the extreme quantization case, representing values with\njust one bit. Since the sign function is typically used to map real values to\nbinary values, smooth approximations are introduced to mimic the gradients\nduring error backpropagation. Thus, the mismatch between the forward and\nbackward models corrupts the direction of the gradient, causing training\ninconsistency problems and performance degradation. In contrast to current BNN\napproaches, we propose to employ a binary periodic (BiPer) function during\nbinarization. Specifically, we use a square wave for the forward pass to obtain\nthe binary values and employ the trigonometric sine function with the same\nperiod of the square wave as a differentiable surrogate during the backward\npass. We demonstrate that this approach can control the quantization error by\nusing the frequency of the periodic function and improves network performance.\nExtensive experiments validate the effectiveness of BiPer in benchmark datasets\nand network architectures, with improvements of up to 1% and 0.69% with respect\nto state-of-the-art methods in the classification task over CIFAR-10 and\nImageNet, respectively. Our code is publicly available at\nhttps://github.com/edmav4/BiPer.\n","authors":["Edwin Vargas","Claudia Correa","Carlos Hinojosa","Henry Arguello"],"pdf_url":"https://arxiv.org/pdf/2404.01278v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13549v2","updated":"2024-04-01T17:51:54Z","published":"2023-06-23T15:21:52Z","title":"A Survey on Multimodal Large Language Models","summary":" Recently, Multimodal Large Language Model (MLLM) represented by GPT-4V has\nbeen a new rising research hotspot, which uses powerful Large Language Models\n(LLMs) as a brain to perform multimodal tasks. The surprising emergent\ncapabilities of MLLM, such as writing stories based on images and OCR-free math\nreasoning, are rare in traditional multimodal methods, suggesting a potential\npath to artificial general intelligence. To this end, both academia and\nindustry have endeavored to develop MLLMs that can compete with or even better\nthan GPT-4V, pushing the limit of research at a surprising speed. In this\npaper, we aim to trace and summarize the recent progress of MLLMs. 
First of\nall, we present the basic formulation of MLLM and delineate its related\nconcepts, including architecture, training strategy and data, as well as\nevaluation. Then, we introduce research topics about how MLLMs can be extended\nto support more granularity, modalities, languages, and scenarios. We continue\nwith multimodal hallucination and extended techniques, including Multimodal ICL\n(M-ICL), Multimodal CoT (M-CoT), and LLM-Aided Visual Reasoning (LAVR). To\nconclude the paper, we discuss existing challenges and point out promising\nresearch directions. In light of the fact that the era of MLLM has only just\nbegun, we will keep updating this survey and hope it can inspire more research.\nAn associated GitHub link collecting the latest papers is available at\nhttps://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models.\n","authors":["Shukang Yin","Chaoyou Fu","Sirui Zhao","Ke Li","Xing Sun","Tong Xu","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2306.13549v2.pdf","comment":"Project\n page:https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models"},{"id":"http://arxiv.org/abs/2404.01272v1","updated":"2024-04-01T17:48:15Z","published":"2024-04-01T17:48:15Z","title":"Language Guided Domain Generalized Medical Image Segmentation","summary":" Single source domain generalization (SDG) holds promise for more reliable and\nconsistent image segmentation across real-world clinical settings particularly\nin the medical domain, where data privacy and acquisition cost constraints\noften limit the availability of diverse datasets. Depending solely on visual\nfeatures hampers the model's capacity to adapt effectively to various domains,\nprimarily because of the presence of spurious correlations and domain-specific\ncharacteristics embedded within the image features. Incorporating text features\nalongside visual features is a potential solution to enhance the model's\nunderstanding of the data, as it goes beyond pixel-level information to provide\nvaluable context. Textual cues describing the anatomical structures, their\nappearances, and variations across various imaging modalities can guide the\nmodel in domain adaptation, ultimately contributing to more robust and\nconsistent segmentation. In this paper, we propose an approach that explicitly\nleverages textual information by incorporating a contrastive learning mechanism\nguided by the text encoder features to learn a more robust feature\nrepresentation. We assess the effectiveness of our text-guided contrastive\nfeature alignment technique in various scenarios, including cross-modality,\ncross-sequence, and cross-site settings for different segmentation tasks. Our\napproach achieves favorable performance against existing methods in literature.\nOur code and model weights are available at\nhttps://github.com/ShahinaKK/LG_SDG.git.\n","authors":["Shahina Kunhimon","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2404.01272v1.pdf","comment":"Accepted at ISBI2024"},{"id":"http://arxiv.org/abs/2310.01393v3","updated":"2024-04-01T17:40:59Z","published":"2023-10-02T17:52:24Z","title":"DST-Det: Simple Dynamic Self-Training for Open-Vocabulary Object\n Detection","summary":" Open-vocabulary object detection (OVOD) aims to detect the objects beyond the\nset of classes observed during training. This work introduces a straightforward\nand efficient strategy that utilizes pre-trained vision-language models (VLM),\nlike CLIP, to identify potential novel classes through zero-shot\nclassification. 
Previous methods use a class-agnostic region proposal network\nto detect object proposals and consider the proposals that do not match the\nground truth as background. Unlike these methods, our method will select a\nsubset of proposals that will be considered as background during the training.\nThen, we treat them as novel classes during training. We refer to this approach\nas the self-training strategy, which enhances recall and accuracy for novel\nclasses without requiring extra annotations, datasets, and re-training.\nCompared to previous pseudo methods, our approach does not require re-training\nand offline labeling processing, which is more efficient and effective in\none-shot training. Empirical evaluations on three datasets, including LVIS,\nV3Det, and COCO, demonstrate significant improvements over the baseline\nperformance without incurring additional parameters or computational costs\nduring inference. In addition, we also apply our method to various baselines.\nIn particular, compared with the previous method, F-VLM, our method achieves a\n1.7% improvement on the LVIS dataset. Combined with the recent method CLIPSelf,\nour method also achieves 46.7 novel class AP on COCO without introducing extra\ndata for pertaining. We also achieve over 6.5% improvement over the F-VLM\nbaseline in the recent challenging V3Det dataset. We release our code and\nmodels at https://github.com/xushilin1/dst-det.\n","authors":["Shilin Xu","Xiangtai Li","Size Wu","Wenwei Zhang","Yunhai Tong","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2310.01393v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01990v1","updated":"2024-04-01T17:38:25Z","published":"2024-04-01T17:38:25Z","title":"What is Point Supervision Worth in Video Instance Segmentation?","summary":" Video instance segmentation (VIS) is a challenging vision task that aims to\ndetect, segment, and track objects in videos. Conventional VIS methods rely on\ndensely-annotated object masks which are expensive. We reduce the human\nannotations to only one point for each object in a video frame during training,\nand obtain high-quality mask predictions close to fully supervised models. Our\nproposed training method consists of a class-agnostic proposal generation\nmodule to provide rich negative samples and a spatio-temporal point-based\nmatcher to match the object queries with the provided point annotations.\nComprehensive experiments on three VIS benchmarks demonstrate competitive\nperformance of the proposed framework, nearly matching fully supervised\nmethods.\n","authors":["Shuaiyi Huang","De-An Huang","Zhiding Yu","Shiyi Lan","Subhashree Radhakrishnan","Jose M. Alvarez","Abhinav Shrivastava","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2404.01990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11696v5","updated":"2024-04-01T17:34:34Z","published":"2023-08-22T17:59:30Z","title":"Efficient Benchmarking of Language Models","summary":" The increasing versatility of language models (LMs) has given rise to a new\nclass of benchmarks that comprehensively assess a broad range of capabilities.\nSuch benchmarks are associated with massive computational costs, extending to\nthousands of GPU hours per model. However, the efficiency aspect of these\nevaluation efforts had raised little discussion in the literature. In this\nwork, we present the problem of Efficient Benchmarking, namely, intelligently\nreducing the computation costs of LM evaluation without compromising\nreliability. 
Using the HELM benchmark as a test case, we investigate how\ndifferent benchmark design choices affect the computation-reliability\ntrade-off. We propose to evaluate the reliability of such decisions, by using a\nnew measure -- Decision Impact on Reliability, DIoR for short. We find, for\nexample, that a benchmark leader may change by merely removing a low-ranked\nmodel from the benchmark, and observe that a correct benchmark ranking can be\nobtained by considering only a fraction of the evaluation examples. Based on\nour findings, we outline a set of concrete recommendations for efficient\nbenchmark design and utilization practices. To take a step further, we use our\nfindings to propose an evaluation algorithm, that, when applied to the HELM\nbenchmark, leads to dramatic cost savings with minimal loss of benchmark\nreliability, often reducing computation by x100 or more.\n","authors":["Yotam Perlitz","Elron Bandel","Ariel Gera","Ofir Arviv","Liat Ein-Dor","Eyal Shnarch","Noam Slonim","Michal Shmueli-Scheuer","Leshem Choshen"],"pdf_url":"https://arxiv.org/pdf/2308.11696v5.pdf","comment":"Accepted to NAACL main track"},{"id":"http://arxiv.org/abs/2404.01260v1","updated":"2024-04-01T17:30:56Z","published":"2024-04-01T17:30:56Z","title":"Bridging Remote Sensors with Multisensor Geospatial Foundation Models","summary":" In the realm of geospatial analysis, the diversity of remote sensors,\nencompassing both optical and microwave technologies, offers a wealth of\ndistinct observational capabilities. Recognizing this, we present msGFM, a\nmultisensor geospatial foundation model that effectively unifies data from four\nkey sensor modalities. This integration spans an expansive dataset of two\nmillion multisensor images. msGFM is uniquely adept at handling both paired and\nunpaired sensor data. For data originating from identical geolocations, our\nmodel employs an innovative cross-sensor pretraining approach in masked image\nmodeling, enabling the synthesis of joint representations from diverse sensors.\nmsGFM, incorporating four remote sensors, upholds strong performance, forming a\ncomprehensive model adaptable to various sensor types. msGFM has demonstrated\nenhanced proficiency in a range of both single-sensor and multisensor\ndownstream tasks. These include scene classification, segmentation, cloud\nremoval, and pan-sharpening. A key discovery of our research is that\nrepresentations derived from natural images are not always compatible with the\ndistinct characteristics of geospatial remote sensors, underscoring the\nlimitations of existing representations in this field. Our work can serve as a\nguide for developing multisensor geospatial pretraining models, paving the way\nfor more advanced geospatial capabilities.\n","authors":["Boran Han","Shuai Zhang","Xingjian Shi","Markus Reichstein"],"pdf_url":"https://arxiv.org/pdf/2404.01260v1.pdf","comment":"Accepted to CVPR"},{"id":"http://arxiv.org/abs/2310.00031v3","updated":"2024-04-01T17:27:12Z","published":"2023-09-29T05:16:41Z","title":"Text-image Alignment for Diffusion-based Perception","summary":" Diffusion models are generative models with impressive text-to-image\nsynthesis capabilities and have spurred a new wave of creative methods for\nclassical machine learning tasks. However, the best way to harness the\nperceptual knowledge of these generative models for visual tasks is still an\nopen question. Specifically, it is unclear how to use the prompting interface\nwhen applying diffusion backbones to vision tasks. 
We find that automatically\ngenerated captions can improve text-image alignment and significantly enhance a\nmodel's cross-attention maps, leading to better perceptual performance. Our\napproach improves upon the current state-of-the-art (SOTA) in diffusion-based\nsemantic segmentation on ADE20K and the current overall SOTA for depth\nestimation on NYUv2. Furthermore, our method generalizes to the cross-domain\nsetting. We use model personalization and caption modifications to align our\nmodel to the target domain and find improvements over unaligned baselines. Our\ncross-domain object detection model, trained on Pascal VOC, achieves SOTA\nresults on Watercolor2K. Our cross-domain segmentation method, trained on\nCityscapes, achieves SOTA results on Dark Zurich-val and Nighttime Driving.\nProject page: https://www.vision.caltech.edu/tadp/. Code:\nhttps://github.com/damaggu/TADP.\n","authors":["Neehar Kondapaneni","Markus Marks","Manuel Knott","Rogerio Guimaraes","Pietro Perona"],"pdf_url":"https://arxiv.org/pdf/2310.00031v3.pdf","comment":"Project page: https://www.vision.caltech.edu/tadp/, Code page:\n github.com/damaggu/TADP"},{"id":"http://arxiv.org/abs/2212.00210v3","updated":"2024-04-01T17:19:02Z","published":"2022-12-01T01:39:28Z","title":"Shape-Guided Diffusion with Inside-Outside Attention","summary":" We introduce precise object silhouette as a new form of user control in\ntext-to-image diffusion models, which we dub Shape-Guided Diffusion. Our\ntraining-free method uses an Inside-Outside Attention mechanism during the\ninversion and generation process to apply a shape constraint to the cross- and\nself-attention maps. Our mechanism designates which spatial region is the\nobject (inside) vs. background (outside) then associates edits to the correct\nregion. We demonstrate the efficacy of our method on the shape-guided editing\ntask, where the model must replace an object according to a text prompt and\nobject mask. We curate a new ShapePrompts benchmark derived from MS-COCO and\nachieve SOTA results in shape faithfulness without a degradation in text\nalignment or image realism according to both automatic metrics and annotator\nratings. Our data and code will be made available at\nhttps://shape-guided-diffusion.github.io.\n","authors":["Dong Huk Park","Grace Luo","Clayton Toste","Samaneh Azadi","Xihui Liu","Maka Karalashvili","Anna Rohrbach","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2212.00210v3.pdf","comment":"WACV 2024"},{"id":"http://arxiv.org/abs/2404.01249v1","updated":"2024-04-01T17:12:47Z","published":"2024-04-01T17:12:47Z","title":"FireANTs: Adaptive Riemannian Optimization for Multi-Scale Diffeomorphic\n Registration","summary":" Diffeomorphic Image Registration is a critical part of the analysis in\nvarious imaging modalities and downstream tasks like image translation,\nsegmentation, and atlas building. Registration algorithms based on optimization\nhave stood the test of time in terms of accuracy, reliability, and robustness\nacross a wide spectrum of modalities and acquisition settings. However, these\nalgorithms converge slowly, are prohibitively expensive to run, and their usage\nrequires a steep learning curve, limiting their scalability to larger clinical\nand scientific studies. In this paper, we develop multi-scale Adaptive\nRiemannian Optimization algorithms for diffeomorphic image registration. 
We\ndemonstrate compelling improvements on image registration across a spectrum of\nmodalities and anatomies by measuring structural and landmark overlap of the\nregistered image volumes. Our proposed framework leads to a consistent\nimprovement in performance, and from 300x up to 2000x speedup over existing\nalgorithms. Our modular library design makes it easy to use and allows\ncustomization via user-defined cost functions.\n","authors":["Rohit Jena","Pratik Chaudhari","James C. Gee"],"pdf_url":"https://arxiv.org/pdf/2404.01249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01248v1","updated":"2024-04-01T17:09:40Z","published":"2024-04-01T17:09:40Z","title":"Scalable Scene Modeling from Perspective Imaging: Physics-based\n Appearance and Geometry Inference","summary":" 3D scene modeling techniques serve as the bedrocks in the geospatial\nengineering and computer science, which drives many applications ranging from\nautomated driving, terrain mapping, navigation, virtual, augmented, mixed, and\nextended reality (for gaming and movie industry etc.). This dissertation\npresents a fraction of contributions that advances 3D scene modeling to its\nstate of the art, in the aspects of both appearance and geometry modeling. In\ncontrast to the prevailing deep learning methods, as a core contribution, this\nthesis aims to develop algorithms that follow first principles, where\nsophisticated physic-based models are introduced alongside with simpler\nlearning and inference tasks. The outcomes of these algorithms yield processes\nthat can consume much larger volume of data for highly accurate reconstructing\n3D scenes at a scale without losing methodological generality, which are not\npossible by contemporary complex-model based deep learning methods.\nSpecifically, the dissertation introduces three novel methodologies that\naddress the challenges of inferring appearance and geometry through\nphysics-based modeling.\n Overall, the research encapsulated in this dissertation marks a series of\nmethodological triumphs in the processing of complex datasets. By navigating\nthe confluence of deep learning, computational geometry, and photogrammetry,\nthis work lays down a robust framework for future exploration and practical\napplication in the rapidly evolving field of 3D scene reconstruction. The\noutcomes of these studies are evidenced through rigorous experiments and\ncomparisons with existing state-of-the-art methods, demonstrating the efficacy\nand scalability of the proposed approaches.\n","authors":["Shuang Song"],"pdf_url":"https://arxiv.org/pdf/2404.01248v1.pdf","comment":"Ph.D. Dissertation, Geospatial Data Analytics Lab, The Ohio State\n University, 2024. arXiv admin note: text overlap with arXiv:2108.08378"},{"id":"http://arxiv.org/abs/2404.01247v1","updated":"2024-04-01T17:08:50Z","published":"2024-04-01T17:08:50Z","title":"An image speaks a thousand words, but can everyone listen? On\n translating images for cultural relevance","summary":" Given the rise of multimedia content, human translators increasingly focus on\nculturally adapting not only words but also other modalities such as images to\nconvey the same meaning. While several applications stand to benefit from this,\nmachine translation systems remain confined to dealing with language in speech\nand text. In this work, we take a first step towards translating images to make\nthem culturally relevant. First, we build three pipelines comprising\nstate-of-the-art generative models to do the task. 
Next, we build a two-part\nevaluation dataset: i) concept: comprising 600 images that are cross-culturally\ncoherent, focusing on a single concept per image, and ii) application:\ncomprising 100 images curated from real-world applications. We conduct a\nmulti-faceted human evaluation of translated images to assess for cultural\nrelevance and meaning preservation. We find that as of today, image-editing\nmodels fail at this task, but can be improved by leveraging LLMs and retrievers\nin the loop. Best pipelines can only translate 5% of images for some countries\nin the easier concept dataset and no translation is successful for some\ncountries in the application dataset, highlighting the challenging nature of\nthe task. Our code and data is released here:\nhttps://github.com/simran-khanuja/image-transcreation.\n","authors":["Simran Khanuja","Sathyanarayanan Ramamoorthy","Yueqi Song","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2404.01247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.10460v2","updated":"2024-04-01T17:04:01Z","published":"2023-01-25T08:40:34Z","title":"HAL3D: Hierarchical Active Learning for Fine-Grained 3D Part Labeling","summary":" We present the first active learning tool for fine-grained 3D part labeling,\na problem which challenges even the most advanced deep learning (DL) methods\ndue to the significant structural variations among the small and intricate\nparts. For the same reason, the necessary data annotation effort is tremendous,\nmotivating approaches to minimize human involvement. Our labeling tool\niteratively verifies or modifies part labels predicted by a deep neural\nnetwork, with human feedback continually improving the network prediction. To\neffectively reduce human efforts, we develop two novel features in our tool,\nhierarchical and symmetry-aware active labeling. Our human-in-the-loop\napproach, coined HAL3D, achieves 100% accuracy (barring human errors) on any\ntest set with pre-defined hierarchical part labels, with 80% time-saving over\nmanual effort.\n","authors":["Fenggen Yu","Yiming Qian","Francisca Gil-Ureta","Brian Jackson","Eric Bennett","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2301.10460v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2404.01243v1","updated":"2024-04-01T17:03:29Z","published":"2024-04-01T17:03:29Z","title":"A Unified and Interpretable Emotion Representation and Expression\n Generation","summary":" Canonical emotions, such as happy, sad, and fearful, are easy to understand\nand annotate. However, emotions are often compound, e.g. happily surprised, and\ncan be mapped to the action units (AUs) used for expressing emotions, and\ntrivially to the canonical ones. Intuitively, emotions are continuous as\nrepresented by the arousal-valence (AV) model. An interpretable unification of\nthese four modalities - namely, Canonical, Compound, AUs, and AV - is highly\ndesirable, for a better representation and understanding of emotions. However,\nsuch unification remains to be unknown in the current literature. In this work,\nwe propose an interpretable and unified emotion model, referred as C2A2. We\nalso develop a method that leverages labels of the non-unified models to\nannotate the novel unified one. Finally, we modify the text-conditional\ndiffusion models to understand continuous numbers, which are then used to\ngenerate continuous expressions using our unified emotion model. 
Through\nquantitative and qualitative experiments, we show that our generated images are\nrich and capture subtle expressions. Our work allows a fine-grained generation\nof expressions in conjunction with other textual inputs and offers a new label\nspace for emotions at the same time.\n","authors":["Reni Paskaleva","Mykyta Holubakha","Andela Ilic","Saman Motamed","Luc Van Gool","Danda Paudel"],"pdf_url":"https://arxiv.org/pdf/2404.01243v1.pdf","comment":"10 pages, 9 figures, 3 tables Accepted at CVPR 2024. Project page:\n https://emotion-diffusion.github.io"},{"id":"http://arxiv.org/abs/2401.09627v4","updated":"2024-04-01T17:03:08Z","published":"2024-01-17T22:34:20Z","title":"SymTC: A Symbiotic Transformer-CNN Net for Instance Segmentation of\n Lumbar Spine MRI","summary":" Intervertebral disc disease, a prevalent ailment, frequently leads to\nintermittent or persistent low back pain, and diagnosing and assessing of this\ndisease rely on accurate measurement of vertebral bone and intervertebral disc\ngeometries from lumbar MR images. Deep neural network (DNN) models may assist\nclinicians with more efficient image segmentation of individual instances\n(disks and vertebrae) of the lumbar spine in an automated way, which is termed\nas instance image segmentation. In this work, we proposed SymTC, an innovative\nlumbar spine MR image segmentation model that combines the strengths of\nTransformer and Convolutional Neural Network (CNN). Specifically, we designed a\nparallel dual-path architecture to merge CNN layers and Transformer layers, and\nwe integrated a novel position embedding into the self-attention module of\nTransformer, enhancing the utilization of positional information for more\naccurate segmentation. To further improves model performance, we introduced a\nnew data augmentation technique to create synthetic yet realistic MR image\ndataset, named SSMSpine, which is made publicly available. We evaluated our\nSymTC and the other 15 existing image segmentation models on our private\nin-house dataset and the public SSMSpine dataset, using two metrics, Dice\nSimilarity Coefficient and 95% Hausdorff Distance. The results show that our\nSymTC has the best performance for segmenting vertebral bones and\nintervertebral discs in lumbar spine MR images. The SymTC code and SSMSpine\ndataset are available at https://github.com/jiasongchen/SymTC.\n","authors":["Jiasong Chen","Linchen Qian","Linhai Ma","Timur Urakov","Weiyong Gu","Liang Liang"],"pdf_url":"https://arxiv.org/pdf/2401.09627v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01240v1","updated":"2024-04-01T16:58:32Z","published":"2024-04-01T16:58:32Z","title":"AURORA: Navigating UI Tarpits via Automated Neural Screen Understanding","summary":" Nearly a decade of research in software engineering has focused on automating\nmobile app testing to help engineers in overcoming the unique challenges\nassociated with the software platform. Much of this work has come in the form\nof Automated Input Generation tools (AIG tools) that dynamically explore app\nscreens. However, such tools have repeatedly been demonstrated to achieve\nlower-than-expected code coverage - particularly on sophisticated proprietary\napps. 
Prior work has illustrated that a primary cause of these coverage\ndeficiencies is related to so-called tarpits, or complex screens that are\ndifficult to navigate.\n In this paper, we take a critical step toward enabling AIG tools to\neffectively navigate tarpits during app exploration through a new form of\nautomated semantic screen understanding. We introduce AURORA, a technique that\nlearns from the visual and textual patterns that exist in mobile app UIs to\nautomatically detect common screen designs and navigate them accordingly. The\nkey idea of AURORA is that there are a finite number of mobile app screen\ndesigns, albeit with subtle variations, such that the general patterns of\ndifferent categories of UI designs can be learned. As such, AURORA employs a\nmulti-modal, neural screen classifier that is able to recognize the most common\ntypes of UI screen designs. After recognizing a given screen, it then applies a\nset of flexible and generalizable heuristics to properly navigate the screen.\nWe evaluated AURORA both on a set of 12 apps with known tarpits from prior\nwork, and on a new set of five of the most popular apps from the Google Play\nstore. Our results indicate that AURORA is able to effectively navigate tarpit\nscreens, outperforming prior approaches that avoid tarpits by 19.6% in terms of\nmethod coverage. The improvements can be attributed to AURORA's UI design\nclassification and heuristic navigation techniques.\n","authors":["Safwat Ali Khan","Wenyu Wang","Yiran Ren","Bin Zhu","Jiangfan Shi","Alyssa McGowan","Wing Lam","Kevin Moran"],"pdf_url":"https://arxiv.org/pdf/2404.01240v1.pdf","comment":"Published at 17th IEEE International Conference on Software Testing,\n Verification and Validation (ICST) 2024, 12 pages"},{"id":"http://arxiv.org/abs/2311.10707v2","updated":"2024-04-01T16:56:13Z","published":"2023-11-17T18:57:40Z","title":"Multimodal Representation Learning by Alternating Unimodal Adaptation","summary":" Multimodal learning, which integrates data from diverse sensory modes, plays\na pivotal role in artificial intelligence. However, existing multimodal\nlearning methods often struggle with challenges where some modalities appear\nmore dominant than others during multimodal learning, resulting in suboptimal\nperformance. To address this challenge, we propose MLA (Multimodal Learning\nwith Alternating Unimodal Adaptation). MLA reframes the conventional joint\nmultimodal learning process by transforming it into an alternating unimodal\nlearning process, thereby minimizing interference between modalities.\nSimultaneously, it captures cross-modal interactions through a shared head,\nwhich undergoes continuous optimization across different modalities. This\noptimization process is controlled by a gradient modification mechanism to\nprevent the shared head from losing previously acquired information. During the\ninference phase, MLA utilizes a test-time uncertainty-based model fusion\nmechanism to integrate multimodal information. Extensive experiments are\nconducted on five diverse datasets, encompassing scenarios with complete\nmodalities and scenarios with missing modalities. These experiments demonstrate\nthe superiority of MLA over competing prior approaches. 
Our code is available\nat\nhttps://github.com/Cecile-hi/Multimodal-Learning-with-Alternating-Unimodal-Adaptation.\n","authors":["Xiaohui Zhang","Jaehong Yoon","Mohit Bansal","Huaxiu Yao"],"pdf_url":"https://arxiv.org/pdf/2311.10707v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2402.12259v2","updated":"2024-04-01T16:55:10Z","published":"2024-02-19T16:15:03Z","title":"Open3DSG: Open-Vocabulary 3D Scene Graphs from Point Clouds with\n Queryable Objects and Open-Set Relationships","summary":" Current approaches for 3D scene graph prediction rely on labeled datasets to\ntrain models for a fixed set of known object classes and relationship\ncategories. We present Open3DSG, an alternative approach to learn 3D scene\ngraph prediction in an open world without requiring labeled scene graph data.\nWe co-embed the features from a 3D scene graph prediction backbone with the\nfeature space of powerful open world 2D vision language foundation models. This\nenables us to predict 3D scene graphs from 3D point clouds in a zero-shot\nmanner by querying object classes from an open vocabulary and predicting the\ninter-object relationships from a grounded LLM with scene graph features and\nqueried object classes as context. Open3DSG is the first 3D point cloud method\nto predict not only explicit open-vocabulary object classes, but also open-set\nrelationships that are not limited to a predefined label set, making it\npossible to express rare as well as specific objects and relationships in the\npredicted 3D scene graph. Our experiments show that Open3DSG is effective at\npredicting arbitrary object classes as well as their complex inter-object\nrelationships describing spatial, supportive, semantic and comparative\nrelationships.\n","authors":["Sebastian Koch","Narunas Vaskevicius","Mirco Colosi","Pedro Hermosilla","Timo Ropinski"],"pdf_url":"https://arxiv.org/pdf/2402.12259v2.pdf","comment":"CVPR 2024. Project page: https://kochsebastian.com/open3dsg"},{"id":"http://arxiv.org/abs/2403.15443v2","updated":"2024-04-01T16:37:08Z","published":"2024-03-17T16:12:50Z","title":"Introducing an ensemble method for the early detection of Alzheimer's\n disease through the analysis of PET scan images","summary":" Alzheimer's disease is a progressive neurodegenerative disorder that\nprimarily affects cognitive functions such as memory, thinking, and behavior.\nIn this disease, there is a critical phase, mild cognitive impairment, that is\nreally important to be diagnosed early since some patients with progressive MCI\nwill develop the disease. This study delves into the challenging task of\nclassifying Alzheimer's disease into four distinct groups: control normal (CN),\nprogressive mild cognitive impairment (pMCI), stable mild cognitive impairment\n(sMCI), and Alzheimer's disease (AD). This classification is based on a\nthorough examination of PET scan images obtained from the ADNI dataset, which\nprovides a thorough understanding of the disease's progression. Several\ndeep-learning and traditional machine-learning models have been used to detect\nAlzheimer's disease. In this paper, three deep-learning models, namely VGG16\nand AlexNet, and a custom Convolutional neural network (CNN) with 8-fold\ncross-validation have been used for classification. Finally, an ensemble\ntechnique is used to improve the overall result of these models. 
The results\nshow that using deep-learning models to tell the difference between MCI\npatients gives an overall average accuracy of 93.13% and an AUC of 94.4%.\n","authors":["Arezoo Borji","Taha-Hossein Hejazi","Abbas Seifi"],"pdf_url":"https://arxiv.org/pdf/2403.15443v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.11143v2","updated":"2024-04-01T16:37:00Z","published":"2022-04-23T21:46:17Z","title":"Supplementing Missing Visions via Dialog for Scene Graph Generations","summary":" Most current AI systems rely on the premise that the input visual data are\nsufficient to achieve competitive performance in various computer vision tasks.\nHowever, the classic task setup rarely considers the challenging, yet common\npractical situations where the complete visual data may be inaccessible due to\nvarious reasons (e.g., restricted view range and occlusions). To this end, we\ninvestigate a computer vision task setting with incomplete visual input data.\nSpecifically, we exploit the Scene Graph Generation (SGG) task with various\nlevels of visual data missingness as input. While insufficient visual input\nintuitively leads to performance drop, we propose to supplement the missing\nvisions via the natural language dialog interactions to better accomplish the\ntask objective. We design a model-agnostic Supplementary Interactive Dialog\n(SI-Dial) framework that can be jointly learned with most existing models,\nendowing the current AI systems with the ability of question-answer\ninteractions in natural language. We demonstrate the feasibility of such a task\nsetting with missing visual input and the effectiveness of our proposed dialog\nmodule as the supplementary information source through extensive experiments\nand analysis, by achieving promising performance improvement over multiple\nbaselines.\n","authors":["Zhenghao Zhao","Ye Zhu","Xiaoguang Zhu","Yuzhang Shang","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2204.11143v2.pdf","comment":"ICASSP 2024"},{"id":"http://arxiv.org/abs/2404.01223v1","updated":"2024-04-01T16:31:04Z","published":"2024-04-01T16:31:04Z","title":"Feature Splatting: Language-Driven Physics-Based Scene Synthesis and\n Editing","summary":" Scene representations using 3D Gaussian primitives have produced excellent\nresults in modeling the appearance of static and dynamic 3D scenes. Many\ngraphics applications, however, demand the ability to manipulate both the\nappearance and the physical properties of objects. We introduce Feature\nSplatting, an approach that unifies physics-based dynamic scene synthesis with\nrich semantics from vision language foundation models that are grounded by\nnatural language. Our first contribution is a way to distill high-quality,\nobject-centric vision-language features into 3D Gaussians, that enables\nsemi-automatic scene decomposition using text queries. Our second contribution\nis a way to synthesize physics-based dynamics from an otherwise static scene\nusing a particle-based simulator, in which material properties are assigned\nautomatically via text queries. We ablate key techniques used in this pipeline,\nto illustrate the challenge and opportunities in using feature-carrying 3D\nGaussians as a unified format for appearance, geometry, material properties and\nsemantics grounded on natural language. 
Project website:\nhttps://feature-splatting.github.io/\n","authors":["Ri-Zhao Qiu","Ge Yang","Weijia Zeng","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.01223v1.pdf","comment":"Project website: https://feature-splatting.github.io/"},{"id":"http://arxiv.org/abs/2404.01220v1","updated":"2024-04-01T16:25:08Z","published":"2024-04-01T16:25:08Z","title":"Entity-Centric Reinforcement Learning for Object Manipulation from\n Pixels","summary":" Manipulating objects is a hallmark of human intelligence, and an important\ntask in domains such as robotics. In principle, Reinforcement Learning (RL)\noffers a general approach to learn object manipulation. In practice, however,\ndomains with more than a few objects are difficult for RL agents due to the\ncurse of dimensionality, especially when learning from raw image observations.\nIn this work we propose a structured approach for visual RL that is suitable\nfor representing multiple objects and their interaction, and use it to learn\ngoal-conditioned manipulation of several objects. Key to our method is the\nability to handle goals with dependencies between the objects (e.g., moving\nobjects in a certain order). We further relate our architecture to the\ngeneralization capability of the trained agent, based on a theoretical result\nfor compositional generalization, and demonstrate agents that learn with 3\nobjects but generalize to similar tasks with over 10 objects. Videos and code\nare available on the project website:\nhttps://sites.google.com/view/entity-centric-rl\n","authors":["Dan Haramati","Tal Daniel","Aviv Tamar"],"pdf_url":"https://arxiv.org/pdf/2404.01220v1.pdf","comment":"ICLR 2024 Spotlight. Videos and code are available on the project\n website: https://sites.google.com/view/entity-centric-rl"},{"id":"http://arxiv.org/abs/2304.12306v3","updated":"2024-04-01T16:18:16Z","published":"2023-04-24T17:56:12Z","title":"Segment Anything in Medical Images","summary":" Medical image segmentation is a critical component in clinical practice,\nfacilitating accurate diagnosis, treatment planning, and disease monitoring.\nHowever, existing methods, often tailored to specific modalities or disease\ntypes, lack generalizability across the diverse spectrum of medical image\nsegmentation tasks. Here we present MedSAM, a foundation model designed for\nbridging this gap by enabling universal medical image segmentation. The model\nis developed on a large-scale medical image dataset with 1,570,263 image-mask\npairs, covering 10 imaging modalities and over 30 cancer types. We conduct a\ncomprehensive evaluation on 86 internal validation tasks and 60 external\nvalidation tasks, demonstrating better accuracy and robustness than\nmodality-wise specialist models. By delivering accurate and efficient\nsegmentation across a wide spectrum of tasks, MedSAM holds significant\npotential to expedite the evolution of diagnostic tools and the personalization\nof treatment plans.\n","authors":["Jun Ma","Yuting He","Feifei Li","Lin Han","Chenyu You","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2304.12306v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05864v2","updated":"2024-04-01T16:11:58Z","published":"2023-08-10T21:59:23Z","title":"The Multi-modality Cell Segmentation Challenge: Towards Universal\n Solutions","summary":" Cell segmentation is a critical step for quantitative single-cell analysis in\nmicroscopy images. 
Existing cell segmentation methods are often tailored to\nspecific modalities or require manual interventions to specify hyper-parameters\nin different experimental settings. Here, we present a multi-modality cell\nsegmentation benchmark, comprising over 1500 labeled images derived from more\nthan 50 diverse biological experiments. The top participants developed a\nTransformer-based deep-learning algorithm that not only exceeds existing\nmethods but can also be applied to diverse microscopy images across imaging\nplatforms and tissue types without manual parameter adjustments. This benchmark\nand the improved algorithm offer promising avenues for more accurate and\nversatile cell analysis in microscopy imaging.\n","authors":["Jun Ma","Ronald Xie","Shamini Ayyadhury","Cheng Ge","Anubha Gupta","Ritu Gupta","Song Gu","Yao Zhang","Gihun Lee","Joonkee Kim","Wei Lou","Haofeng Li","Eric Upschulte","Timo Dickscheid","José Guilherme de Almeida","Yixin Wang","Lin Han","Xin Yang","Marco Labagnara","Vojislav Gligorovski","Maxime Scheder","Sahand Jamal Rahi","Carly Kempster","Alice Pollitt","Leon Espinosa","Tâm Mignot","Jan Moritz Middeke","Jan-Niklas Eckardt","Wangkai Li","Zhaoyang Li","Xiaochen Cai","Bizhe Bai","Noah F. Greenwald","David Van Valen","Erin Weisbart","Beth A. Cimini","Trevor Cheung","Oscar Brück","Gary D. Bader","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05864v2.pdf","comment":"NeurIPS22 Cell Segmentation Challenge:\n https://neurips22-cellseg.grand-challenge.org/ . Nature Methods (2024)"},{"id":"http://arxiv.org/abs/2404.01207v1","updated":"2024-04-01T16:09:12Z","published":"2024-04-01T16:09:12Z","title":"Vision-language models for decoding provider attention during neonatal\n resuscitation","summary":" Neonatal resuscitations demand an exceptional level of attentiveness from\nproviders, who must process multiple streams of information simultaneously.\nGaze strongly influences decision making; thus, understanding where a provider\nis looking during neonatal resuscitations could inform provider training,\nenhance real-time decision support, and improve the design of delivery rooms\nand neonatal intensive care units (NICUs). Current approaches to quantifying\nneonatal providers' gaze rely on manual coding or simulations, which limit\nscalability and utility. Here, we introduce an automated, real-time, deep\nlearning approach capable of decoding provider gaze into semantic classes\ndirectly from first-person point-of-view videos recorded during live\nresuscitations. Combining state-of-the-art, real-time segmentation with\nvision-language models (CLIP), our low-shot pipeline attains 91\\%\nclassification accuracy in identifying gaze targets without training. Upon\nfine-tuning, the performance of our gaze-guided vision transformer exceeds 98\\%\naccuracy in gaze classification, approaching human-level precision. This\nsystem, capable of real-time inference, enables objective quantification of\nprovider attention dynamics during live neonatal resuscitation. 
Our approach\noffers a scalable solution that seamlessly integrates with existing\ninfrastructure for data-scarce gaze analysis, thereby offering new\nopportunities for understanding and refining clinical decision making.\n","authors":["Felipe Parodi","Jordan Matelsky","Alejandra Regla-Vargas","Elizabeth Foglia","Charis Lim","Danielle Weinberg","Konrad Kording","Heidi Herrick","Michael Platt"],"pdf_url":"https://arxiv.org/pdf/2404.01207v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.01203v1","updated":"2024-04-01T15:59:32Z","published":"2024-04-01T15:59:32Z","title":"Video Interpolation with Diffusion Models","summary":" We present VIDIM, a generative model for video interpolation, which creates\nshort videos given a start and end frame. In order to achieve high fidelity and\ngenerate motions unseen in the input data, VIDIM uses cascaded diffusion models\nto first generate the target video at low resolution, and then generate the\nhigh-resolution video conditioned on the low-resolution generated video. We\ncompare VIDIM to previous state-of-the-art methods on video interpolation, and\ndemonstrate how such works fail in most settings where the underlying motion is\ncomplex, nonlinear, or ambiguous while VIDIM can easily handle such cases. We\nadditionally demonstrate how classifier-free guidance on the start and end\nframe and conditioning the super-resolution model on the original\nhigh-resolution frames without additional parameters unlocks high-fidelity\nresults. VIDIM is fast to sample from as it jointly denoises all the frames to\nbe generated, requires less than a billion parameters per diffusion model to\nproduce compelling results, and still enjoys scalability and improved quality\nat larger parameter counts.\n","authors":["Siddhant Jain","Daniel Watson","Eric Tabellion","Aleksander Hołyński","Ben Poole","Janne Kontkanen"],"pdf_url":"https://arxiv.org/pdf/2404.01203v1.pdf","comment":"CVPR 2024, Project page at https://vidim-interpolation.github.io/"},{"id":"http://arxiv.org/abs/2404.01197v1","updated":"2024-04-01T15:55:25Z","published":"2024-04-01T15:55:25Z","title":"Getting it Right: Improving Spatial Consistency in Text-to-Image Models","summary":" One of the key shortcomings in current text-to-image (T2I) models is their\ninability to consistently generate images which faithfully follow the spatial\nrelationships specified in the text prompt. In this paper, we offer a\ncomprehensive investigation of this limitation, while also developing datasets\nand methods that achieve state-of-the-art performance. First, we find that\ncurrent vision-language datasets do not represent spatial relationships well\nenough; to alleviate this bottleneck, we create SPRIGHT, the first\nspatially-focused, large scale dataset, by re-captioning 6 million images from\n4 widely used vision datasets. Through a 3-fold evaluation and analysis\npipeline, we find that SPRIGHT largely improves upon existing datasets in\ncapturing spatial relationships. To demonstrate its efficacy, we leverage only\n~0.25% of SPRIGHT and achieve a 22% improvement in generating spatially\naccurate images while also improving the FID and CMMD scores. Secondly, we find\nthat training on images containing a large number of objects results in\nsubstantial improvements in spatial consistency. Notably, we attain\nstate-of-the-art on T2I-CompBench with a spatial score of 0.2133, by\nfine-tuning on <500 images. 
Finally, through a set of controlled experiments\nand ablations, we document multiple findings that we believe will enhance the\nunderstanding of factors that affect spatial consistency in text-to-image\nmodels. We publicly release our dataset and model to foster further research in\nthis area.\n","authors":["Agneet Chatterjee","Gabriela Ben Melech Stan","Estelle Aflalo","Sayak Paul","Dhruba Ghosh","Tejas Gokhale","Ludwig Schmidt","Hannaneh Hajishirzi","Vasudev Lal","Chitta Baral","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01197v1.pdf","comment":"project webpage : https://spright-t2i.github.io/"},{"id":"http://arxiv.org/abs/2404.01194v1","updated":"2024-04-01T15:52:14Z","published":"2024-04-01T15:52:14Z","title":"Adaptive Query Prompting for Multi-Domain Landmark Detection","summary":" Medical landmark detection is crucial in various medical imaging modalities\nand procedures. Although deep learning-based methods have achieve promising\nperformance, they are mostly designed for specific anatomical regions or tasks.\nIn this work, we propose a universal model for multi-domain landmark detection\nby leveraging transformer architecture and developing a prompting component,\nnamed as Adaptive Query Prompting (AQP). Instead of embedding additional\nmodules in the backbone network, we design a separate module to generate\nprompts that can be effectively extended to any other transformer network. In\nour proposed AQP, prompts are learnable parameters maintained in a memory space\ncalled prompt pool. The central idea is to keep the backbone frozen and then\noptimize prompts to instruct the model inference process. Furthermore, we\nemploy a lightweight decoder to decode landmarks from the extracted features,\nnamely Light-MLD. Thanks to the lightweight nature of the decoder and AQP, we\ncan handle multiple datasets by sharing the backbone encoder and then only\nperform partial parameter tuning without incurring much additional cost. It has\nthe potential to be extended to more landmark detection tasks. We conduct\nexperiments on three widely used X-ray datasets for different medical landmark\ndetection tasks. Our proposed Light-MLD coupled with AQP achieves SOTA\nperformance on many metrics even without the use of elaborate structural\ndesigns or complex frameworks.\n","authors":["Qiusen Wei","Guoheng Huang","Xiaochen Yuan","Xuhang Chen","Guo Zhong","Jianwen Huang","Jiajie Huang"],"pdf_url":"https://arxiv.org/pdf/2404.01194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01192v1","updated":"2024-04-01T15:49:50Z","published":"2024-04-01T15:49:50Z","title":"iMD4GC: Incomplete Multimodal Data Integration to Advance Precise\n Treatment Response Prediction and Survival Analysis for Gastric Cancer","summary":" Gastric cancer (GC) is a prevalent malignancy worldwide, ranking as the fifth\nmost common cancer with over 1 million new cases and 700 thousand deaths in\n2020. Locally advanced gastric cancer (LAGC) accounts for approximately\ntwo-thirds of GC diagnoses, and neoadjuvant chemotherapy (NACT) has emerged as\nthe standard treatment for LAGC. However, the effectiveness of NACT varies\nsignificantly among patients, with a considerable subset displaying treatment\nresistance. Ineffective NACT not only leads to adverse effects but also misses\nthe optimal therapeutic window, resulting in lower survival rate. 
However,\nexisting multimodal learning methods assume the availability of all modalities\nfor each patient, which does not align with the reality of clinical practice.\nThe limited availability of modalities for each patient would cause information\nloss, adversely affecting predictive accuracy. In this study, we propose an\nincomplete multimodal data integration framework for GC (iMD4GC) to address the\nchallenges posed by incomplete multimodal data, enabling precise response\nprediction and survival analysis. Specifically, iMD4GC incorporates unimodal\nattention layers for each modality to capture intra-modal information.\nSubsequently, the cross-modal interaction layers explore potential inter-modal\ninteractions and capture complementary information across modalities, thereby\nenabling information compensation for missing modalities. To evaluate iMD4GC,\nwe collected three multimodal datasets for GC study: GastricRes (698 cases) for\nresponse prediction, GastricSur (801 cases) for survival analysis, and\nTCGA-STAD (400 cases) for survival analysis. The scale of our datasets is\nsignificantly larger than previous studies. The iMD4GC achieved impressive\nperformance with an 80.2% AUC on GastricRes, 71.4% C-index on GastricSur, and\n66.1% C-index on TCGA-STAD, significantly surpassing other compared methods.\n","authors":["Fengtao Zhou","Yingxue Xu","Yanfen Cui","Shenyan Zhang","Yun Zhu","Weiyang He","Jiguang Wang","Xin Wang","Ronald Chan","Louis Ho Shing Lau","Chu Han","Dafu Zhang","Zhenhui Li","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.01192v1.pdf","comment":"27 pages, 9 figures, 3 tables (under review)"},{"id":"http://arxiv.org/abs/2311.01017v4","updated":"2024-04-01T15:41:50Z","published":"2023-11-02T06:21:56Z","title":"Copilot4D: Learning Unsupervised World Models for Autonomous Driving via\n Discrete Diffusion","summary":" Learning world models can teach an agent how the world works in an\nunsupervised manner. Even though it can be viewed as a special case of sequence\nmodeling, progress for scaling world models on robotic applications such as\nautonomous driving has been somewhat less rapid than scaling language models\nwith Generative Pre-trained Transformers (GPT). We identify two reasons as\nmajor bottlenecks: dealing with complex and unstructured observation space, and\nhaving a scalable generative model. Consequently, we propose Copilot4D, a novel\nworld modeling approach that first tokenizes sensor observations with VQVAE,\nthen predicts the future via discrete diffusion. To efficiently decode and\ndenoise tokens in parallel, we recast Masked Generative Image Transformer as\ndiscrete diffusion and enhance it with a few simple changes, resulting in\nnotable improvement. When applied to learning world models on point cloud\nobservations, Copilot4D reduces prior SOTA Chamfer distance by more than 65%\nfor 1s prediction, and more than 50% for 3s prediction, across NuScenes, KITTI\nOdometry, and Argoverse2 datasets. 
Our results demonstrate that discrete\ndiffusion on tokenized agent experience can unlock the power of GPT-like\nunsupervised learning for robotics.\n","authors":["Lunjun Zhang","Yuwen Xiong","Ze Yang","Sergio Casas","Rui Hu","Raquel Urtasun"],"pdf_url":"https://arxiv.org/pdf/2311.01017v4.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.01179v1","updated":"2024-04-01T15:31:04Z","published":"2024-04-01T15:31:04Z","title":"BEM: Balanced and Entropy-based Mix for Long-Tailed Semi-Supervised\n Learning","summary":" Data mixing methods play a crucial role in semi-supervised learning (SSL),\nbut their application is unexplored in long-tailed semi-supervised learning\n(LTSSL). The primary reason is that the in-batch mixing manner fails to address\nclass imbalance. Furthermore, existing LTSSL methods mainly focus on\nre-balancing data quantity but ignore class-wise uncertainty, which is also\nvital for class balance. For instance, some classes with sufficient samples\nmight still exhibit high uncertainty due to indistinguishable features. To this\nend, this paper introduces the Balanced and Entropy-based Mix (BEM), a\npioneering mixing approach to re-balance the class distribution of both data\nquantity and uncertainty. Specifically, we first propose a class balanced mix\nbank to store data of each class for mixing. This bank samples data based on\nthe estimated quantity distribution, thus re-balancing data quantity. Then, we\npresent an entropy-based learning approach to re-balance class-wise\nuncertainty, including entropy-based sampling strategy, entropy-based selection\nmodule, and entropy-based class balanced loss. Our BEM first leverages data\nmixing for improving LTSSL, and it can also serve as a complement to the\nexisting re-balancing methods. Experimental results show that BEM significantly\nenhances various LTSSL frameworks and achieves state-of-the-art performances\nacross multiple benchmarks.\n","authors":["Hongwei Zheng","Linyuan Zhou","Han Li","Jinming Su","Xiaoming Wei","Xiaoming Xu"],"pdf_url":"https://arxiv.org/pdf/2404.01179v1.pdf","comment":"This paper is accepted to CVPR 2024. The supplementary material is\n included"},{"id":"http://arxiv.org/abs/2404.01174v1","updated":"2024-04-01T15:26:44Z","published":"2024-04-01T15:26:44Z","title":"SpikeMba: Multi-Modal Spiking Saliency Mamba for Temporal Video\n Grounding","summary":" Temporal video grounding (TVG) is a critical task in video content\nunderstanding. Despite significant advancements, existing methods often limit\nin capturing the fine-grained relationships between multimodal inputs and the\nhigh computational costs with processing long video sequences. To address these\nlimitations, we introduce a novel SpikeMba: multi-modal spiking saliency mamba\nfor temporal video grounding. In our work, we integrate the Spiking Neural\nNetworks (SNNs) and state space models (SSMs) to capture the fine-grained\nrelationships of multimodal features effectively. Specifically, we introduce\nthe relevant slots to enhance the model's memory capabilities, enabling a\ndeeper contextual understanding of video sequences. The contextual moment\nreasoner leverages these slots to maintain a balance between contextual\ninformation preservation and semantic relevance exploration. Simultaneously,\nthe spiking saliency detector capitalizes on the unique properties of SNNs to\naccurately locate salient proposals. 
Our experiments demonstrate the\neffectiveness of SpikeMba, which consistently outperforms state-of-the-art\nmethods across mainstream benchmarks.\n","authors":["Wenrui Li","Xiaopeng Hong","Xiaopeng Fan"],"pdf_url":"https://arxiv.org/pdf/2404.01174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01168v1","updated":"2024-04-01T15:16:33Z","published":"2024-04-01T15:16:33Z","title":"Mirror-3DGS: Incorporating Mirror Reflections into 3D Gaussian Splatting","summary":" 3D Gaussian Splatting (3DGS) has marked a significant breakthrough in the\nrealm of 3D scene reconstruction and novel view synthesis. However, 3DGS, much\nlike its predecessor Neural Radiance Fields (NeRF), struggles to accurately\nmodel physical reflections, particularly in mirrors that are ubiquitous in\nreal-world scenes. This oversight mistakenly perceives reflections as separate\nentities that physically exist, resulting in inaccurate reconstructions and\ninconsistent reflective properties across varied viewpoints. To address this\npivotal challenge, we introduce Mirror-3DGS, an innovative rendering framework\ndevised to master the intricacies of mirror geometries and reflections, paving\nthe way for the generation of realistically depicted mirror reflections. By\ningeniously incorporating mirror attributes into the 3DGS and leveraging the\nprinciple of plane mirror imaging, Mirror-3DGS crafts a mirrored viewpoint to\nobserve from behind the mirror, enriching the realism of scene renderings.\nExtensive assessments, spanning both synthetic and real-world scenes, showcase\nour method's ability to render novel views with enhanced fidelity in real-time,\nsurpassing the state-of-the-art Mirror-NeRF specifically within the challenging\nmirror regions. Our code will be made publicly available for reproducible\nresearch.\n","authors":["Jiarui Meng","Haijie Li","Yanmin Wu","Qiankun Gao","Shuzhou Yang","Jian Zhang","Siwei Ma"],"pdf_url":"https://arxiv.org/pdf/2404.01168v1.pdf","comment":"22 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.01160v1","updated":"2024-04-01T15:06:20Z","published":"2024-04-01T15:06:20Z","title":"Diagnosis of Skin Cancer Using VGG16 and VGG19 Based Transfer Learning\n Models","summary":" Today, skin cancer is considered as one of the most dangerous and common\ncancers in the world which demands special attention. Skin cancer may be\ndeveloped in different types; including melanoma, actinic keratosis, basal cell\ncarcinoma, squamous cell carcinoma, and Merkel cell carcinoma. Among them,\nmelanoma is more unpredictable. Melanoma cancer can be diagnosed at early\nstages increasing the possibility of disease treatment. Automatic\nclassification of skin lesions is a challenging task due to diverse forms and\ngrades of the disease, demanding the requirement of novel methods\nimplementation. Deep convolution neural networks (CNN) have shown an excellent\npotential for data and image classification. In this article, we inspect skin\nlesion classification problem using CNN techniques. Remarkably, we present that\nprominent classification accuracy of lesion detection can be obtained by proper\ndesigning and applying of transfer learning framework on pre-trained neural\nnetworks, without any requirement for data enlargement procedures i.e. merging\nVGG16 and VGG19 architectures pre-trained by a generic dataset with modified\nAlexNet network, and then, fine-tuned by a subject-specific dataset containing\ndermatology images. 
The convolution neural network was trained using 2541\nimages and, in particular, dropout was used to prevent the network from\noverfitting. Finally, the validity of the model was checked by applying the\nK-fold cross validation method. The proposed model increased classification\naccuracy by 3% (from 94.2% to 98.18%) in comparison with other methods.\n","authors":["Amir Faghihi","Mohammadreza Fathollahi","Roozbeh Rajabi"],"pdf_url":"https://arxiv.org/pdf/2404.01160v1.pdf","comment":"15 pages, journal"},{"id":"http://arxiv.org/abs/2404.01156v1","updated":"2024-04-01T15:01:38Z","published":"2024-04-01T15:01:38Z","title":"SyncMask: Synchronized Attentional Masking for Fashion-centric\n Vision-Language Pretraining","summary":" Vision-language models (VLMs) have made significant strides in cross-modal\nunderstanding through large-scale paired datasets. However, in fashion domain,\ndatasets often exhibit a disparity between the information conveyed in image\nand text. This issue stems from datasets containing multiple images of a single\nfashion item all paired with one text, leading to cases where some textual\ndetails are not visible in individual images. This mismatch, particularly when\nnon-co-occurring elements are masked, undermines the training of conventional\nVLM objectives like Masked Language Modeling and Masked Image Modeling, thereby\nhindering the model's ability to accurately align fine-grained visual and\ntextual features. Addressing this problem, we propose Synchronized attentional\nMasking (SyncMask), which generate masks that pinpoint the image patches and\nword tokens where the information co-occur in both image and text. This\nsynchronization is accomplished by harnessing cross-attentional features\nobtained from a momentum model, ensuring a precise alignment between the two\nmodalities. Additionally, we enhance grouped batch sampling with semi-hard\nnegatives, effectively mitigating false negative issues in Image-Text Matching\nand Image-Text Contrastive learning objectives within fashion datasets. Our\nexperiments demonstrate the effectiveness of the proposed approach,\noutperforming existing methods in three downstream tasks.\n","authors":["Chull Hwan Song","Taebaek Hwang","Jooyoung Yoon","Shunghyun Choi","Yeong Hyeon Gu"],"pdf_url":"https://arxiv.org/pdf/2404.01156v1.pdf","comment":"CVPR2024 Accepted"},{"id":"http://arxiv.org/abs/2404.01154v1","updated":"2024-04-01T14:59:13Z","published":"2024-04-01T14:59:13Z","title":"Uncovering the Text Embedding in Text-to-Image Diffusion Models","summary":" The correspondence between input text and the generated image exhibits\nopacity, wherein minor textual modifications can induce substantial deviations\nin the generated image. While, text embedding, as the pivotal intermediary\nbetween text and images, remains relatively underexplored. In this paper, we\naddress this research gap by delving into the text embedding space, unleashing\nits capacity for controllable image editing and explicable semantic direction\nattributes within a learning-free framework. Specifically, we identify two\ncritical insights regarding the importance of per-word embedding and their\ncontextual correlations within text embedding, providing instructive principles\nfor learning-free image editing. Additionally, we find that text embedding\ninherently possesses diverse semantic potentials, and further reveal this\nproperty through the lens of singular value decomposition (SVD). 
These\nuncovered properties offer practical utility for image editing and semantic\ndiscovery. More importantly, we expect the in-depth analyses and findings of\nthe text embedding can enhance the understanding of text-to-image diffusion\nmodels.\n","authors":["Hu Yu","Hao Luo","Fan Wang","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.01154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01151v1","updated":"2024-04-01T14:53:36Z","published":"2024-04-01T14:53:36Z","title":"Detect2Interact: Localizing Object Key Field in Visual Question\n Answering (VQA) with LLMs","summary":" Localization plays a crucial role in enhancing the practicality and precision\nof VQA systems. By enabling fine-grained identification and interaction with\nspecific parts of an object, it significantly improves the system's ability to\nprovide contextually relevant and spatially accurate responses, crucial for\napplications in dynamic environments like robotics and augmented reality.\nHowever, traditional systems face challenges in accurately mapping objects\nwithin images to generate nuanced and spatially aware responses. In this work,\nwe introduce \"Detect2Interact\", which addresses these challenges by introducing\nan advanced approach for fine-grained object visual key field detection. First,\nwe use the segment anything model (SAM) to generate detailed spatial maps of\nobjects in images. Next, we use Vision Studio to extract semantic object\ndescriptions. Third, we employ GPT-4's common sense knowledge, bridging the gap\nbetween an object's semantics and its spatial map. As a result, Detect2Interact\nachieves consistent qualitative results on object key field detection across\nextensive test cases and outperforms the existing VQA system with object\ndetection by providing a more reasonable and finer visual representation.\n","authors":["Jialou Wang","Manli Zhu","Yulei Li","Honglei Li","Longzhi Yang","Wai Lok Woo"],"pdf_url":"https://arxiv.org/pdf/2404.01151v1.pdf","comment":"Accepted to IEEE Intelligent Systems"},{"id":"http://arxiv.org/abs/2401.10786v2","updated":"2024-04-01T14:53:00Z","published":"2024-01-19T16:15:37Z","title":"Sat2Scene: 3D Urban Scene Generation from Satellite Images with\n Diffusion","summary":" Directly generating scenes from satellite imagery offers exciting\npossibilities for integration into applications like games and map services.\nHowever, challenges arise from significant view changes and scene scale.\nPrevious efforts mainly focused on image or video generation, lacking\nexploration into the adaptability of scene generation for arbitrary views.\nExisting 3D generation works either operate at the object level or are\ndifficult to utilize the geometry obtained from satellite imagery. To overcome\nthese limitations, we propose a novel architecture for direct 3D scene\ngeneration by introducing diffusion models into 3D sparse representations and\ncombining them with neural rendering techniques. Specifically, our approach\ngenerates texture colors at the point level for a given geometry using a 3D\ndiffusion model first, which is then transformed into a scene representation in\na feed-forward manner. The representation can be utilized to render arbitrary\nviews which would excel in both single-frame quality and inter-frame\nconsistency. 
Experiments in two city-scale datasets show that our model\ndemonstrates proficiency in generating photo-realistic street-view image\nsequences and cross-view urban scenes from satellite imagery.\n","authors":["Zuoyue Li","Zhenqiang Li","Zhaopeng Cui","Marc Pollefeys","Martin R. Oswald"],"pdf_url":"https://arxiv.org/pdf/2401.10786v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01143v1","updated":"2024-04-01T14:42:57Z","published":"2024-04-01T14:42:57Z","title":"Condition-Aware Neural Network for Controlled Image Generation","summary":" We present Condition-Aware Neural Network (CAN), a new method for adding\ncontrol to image generative models. In parallel to prior conditional control\nmethods, CAN controls the image generation process by dynamically manipulating\nthe weight of the neural network. This is achieved by introducing a\ncondition-aware weight generation module that generates conditional weight for\nconvolution/linear layers based on the input condition. We test CAN on\nclass-conditional image generation on ImageNet and text-to-image generation on\nCOCO. CAN consistently delivers significant improvements for diffusion\ntransformer models, including DiT and UViT. In particular, CAN combined with\nEfficientViT (CaT) achieves 2.78 FID on ImageNet 512x512, surpassing DiT-XL/2\nwhile requiring 52x fewer MACs per sampling step.\n","authors":["Han Cai","Muyang Li","Zhuoyang Zhang","Qinsheng Zhang","Ming-Yu Liu","Song Han"],"pdf_url":"https://arxiv.org/pdf/2404.01143v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2211.14049v3","updated":"2024-04-01T14:38:13Z","published":"2022-11-25T12:09:12Z","title":"Task-Oriented Communication for Edge Video Analytics","summary":" With the development of artificial intelligence (AI) techniques and the\nincreasing popularity of camera-equipped devices, many edge video analytics\napplications are emerging, calling for the deployment of computation-intensive\nAI models at the network edge. Edge inference is a promising solution to move\nthe computation-intensive workloads from low-end devices to a powerful edge\nserver for video analytics, but the device-server communications will remain a\nbottleneck due to the limited bandwidth. This paper proposes a task-oriented\ncommunication framework for edge video analytics, where multiple devices\ncollect the visual sensory data and transmit the informative features to an\nedge server for processing. To enable low-latency inference, this framework\nremoves video redundancy in spatial and temporal domains and transmits minimal\ninformation that is essential for the downstream task, rather than\nreconstructing the videos at the edge server. Specifically, it extracts compact\ntask-relevant features based on the deterministic information bottleneck (IB)\nprinciple, which characterizes a tradeoff between the informativeness of the\nfeatures and the communication cost. As the features of consecutive frames are\ntemporally correlated, we propose a temporal entropy model (TEM) to reduce the\nbitrate by taking the previous features as side information in feature\nencoding. To further improve the inference performance, we build a\nspatial-temporal fusion module at the server to integrate features of the\ncurrent and previous frames for joint inference. 
Extensive experiments on video\nanalytics tasks evidence that the proposed framework effectively encodes\ntask-relevant information of video data and achieves a better rate-performance\ntradeoff than existing methods.\n","authors":["Jiawei Shao","Xinjie Zhang","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2211.14049v3.pdf","comment":"This paper was accepted to IEEE Transactions on Wireless\n Communications (TWC)"},{"id":"http://arxiv.org/abs/2404.01139v1","updated":"2024-04-01T14:34:47Z","published":"2024-04-01T14:34:47Z","title":"Structured Initialization for Attention in Vision Transformers","summary":" The training of vision transformer (ViT) networks on small-scale datasets\nposes a significant challenge. By contrast, convolutional neural networks\n(CNNs) have an architectural inductive bias enabling them to perform well on\nsuch problems. In this paper, we argue that the architectural bias inherent to\nCNNs can be reinterpreted as an initialization bias within ViT. This insight is\nsignificant as it empowers ViTs to perform equally well on small-scale problems\nwhile maintaining their flexibility for large-scale applications. Our\ninspiration for this ``structured'' initialization stems from our empirical\nobservation that random impulse filters can achieve comparable performance to\nlearned filters within CNNs. Our approach achieves state-of-the-art performance\nfor data-efficient ViT learning across numerous benchmarks including CIFAR-10,\nCIFAR-100, and SVHN.\n","authors":["Jianqiao Zheng","Xueqian Li","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2404.01139v1.pdf","comment":"20 pages, 5 figures, 8 tables"},{"id":"http://arxiv.org/abs/2404.01133v1","updated":"2024-04-01T14:24:40Z","published":"2024-04-01T14:24:40Z","title":"CityGaussian: Real-time High-quality Large-Scale Scene Rendering with\n Gaussians","summary":" The advancement of real-time 3D scene reconstruction and novel view synthesis\nhas been significantly propelled by 3D Gaussian Splatting (3DGS). However,\neffectively training large-scale 3DGS and rendering it in real-time across\nvarious scales remains challenging. This paper introduces CityGaussian\n(CityGS), which employs a novel divide-and-conquer training approach and\nLevel-of-Detail (LoD) strategy for efficient large-scale 3DGS training and\nrendering. Specifically, the global scene prior and adaptive training data\nselection enables efficient training and seamless fusion. Based on fused\nGaussian primitives, we generate different detail levels through compression,\nand realize fast rendering across various scales through the proposed\nblock-wise detail levels selection and aggregation strategy. Extensive\nexperimental results on large-scale scenes demonstrate that our approach\nattains state-of-theart rendering quality, enabling consistent real-time\nrendering of largescale scenes across vastly different scales. Our project page\nis available at https://dekuliutesla.github.io/citygs/.\n","authors":["Yang Liu","He Guan","Chuanchen Luo","Lue Fan","Junran Peng","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.01133v1.pdf","comment":"Project Page: https://dekuliutesla.github.io/citygs/"},{"id":"http://arxiv.org/abs/2404.01127v1","updated":"2024-04-01T14:06:48Z","published":"2024-04-01T14:06:48Z","title":"Medical Visual Prompting (MVP): A Unified Framework for Versatile and\n High-Quality Medical Image Segmentation","summary":" Accurate segmentation of lesion regions is crucial for clinical diagnosis and\ntreatment across various diseases. 
While deep convolutional networks have\nachieved satisfactory results in medical image segmentation, they face\nchallenges such as loss of lesion shape information due to continuous\nconvolution and downsampling, as well as the high cost of manually labeling\nlesions with varying shapes and sizes. To address these issues, we propose a\nnovel medical visual prompting (MVP) framework that leverages pre-training and\nprompting concepts from natural language processing (NLP). The framework\nutilizes three key components: Super-Pixel Guided Prompting (SPGP) for\nsuperpixelating the input image, Image Embedding Guided Prompting (IEGP) for\nfreezing patch embedding and merging with superpixels to provide visual\nprompts, and Adaptive Attention Mechanism Guided Prompting (AAGP) for\npinpointing prompt content and efficiently adapting all layers. By integrating\nSPGP, IEGP, and AAGP, the MVP enables the segmentation network to better learn\nshape prompting information and facilitates mutual learning across different\ntasks. Extensive experiments conducted on five datasets demonstrate superior\nperformance of this method in various challenging medical image tasks, while\nsimplifying single-task medical segmentation models. This novel framework\noffers improved performance with fewer parameters and holds significant\npotential for accurate segmentation of lesion regions in various medical tasks,\nmaking it clinically valuable.\n","authors":["Yulin Chen","Guoheng Huang","Kai Huang","Zijin Lin","Guo Zhong","Shenghong Luo","Jie Deng","Jian Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.01127v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01123v1","updated":"2024-04-01T13:57:46Z","published":"2024-04-01T13:57:46Z","title":"CLIPtone: Unsupervised Learning for Text-based Image Tone Adjustment","summary":" Recent image tone adjustment (or enhancement) approaches have predominantly\nadopted supervised learning for learning human-centric perceptual assessment.\nHowever, these approaches are constrained by intrinsic challenges of supervised\nlearning. Primarily, the requirement for expertly-curated or retouched images\nescalates the data acquisition expenses. Moreover, their coverage of target\nstyle is confined to stylistic variants inferred from the training data. To\nsurmount the above challenges, we propose an unsupervised learning-based\napproach for text-based image tone adjustment method, CLIPtone, that extends an\nexisting image enhancement method to accommodate natural language descriptions.\nSpecifically, we design a hyper-network to adaptively modulate the pretrained\nparameters of the backbone model based on text description. To assess whether\nthe adjusted image aligns with the text description without ground truth image,\nwe utilize CLIP, which is trained on a vast set of language-image pairs and\nthus encompasses knowledge of human perception. The major advantages of our\napproach are three fold: (i) minimal data collection expenses, (ii) support for\na range of adjustments, and (iii) the ability to handle novel text descriptions\nunseen in training. 
Our approach's efficacy is demonstrated through\ncomprehensive experiments, including a user study.\n","authors":["Hyeongmin Lee","Kyoungkook Kang","Jungseul Ok","Sunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2404.01123v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01121v1","updated":"2024-04-01T13:55:44Z","published":"2024-04-01T13:55:44Z","title":"CMT: Cross Modulation Transformer with Hybrid Loss for Pansharpening","summary":" Pansharpening aims to enhance remote sensing image (RSI) quality by merging\nhigh-resolution panchromatic (PAN) with multispectral (MS) images. However,\nprior techniques struggled to optimally fuse PAN and MS images for enhanced\nspatial and spectral information, due to a lack of a systematic framework\ncapable of effectively coordinating their individual strengths. In response, we\npresent the Cross Modulation Transformer (CMT), a pioneering method that\nmodifies the attention mechanism. This approach utilizes a robust modulation\ntechnique from signal processing, integrating it into the attention mechanism's\ncalculations. It dynamically tunes the weights of the carrier's value (V)\nmatrix according to the modulator's features, thus resolving historical\nchallenges and achieving a seamless integration of spatial and spectral\nattributes. Furthermore, considering that RSI exhibits large-scale features and\nedge details along with local textures, we crafted a hybrid loss function that\ncombines Fourier and wavelet transforms to effectively capture these\ncharacteristics, thereby enhancing both spatial and spectral accuracy in\npansharpening. Extensive experiments demonstrate our framework's superior\nperformance over existing state-of-the-art methods. The code will be publicly\navailable to encourage further research.\n","authors":["Wen-Jie Shu","Hong-Xia Dou","Rui Wen","Xiao Wu","Liang-Jian Deng"],"pdf_url":"https://arxiv.org/pdf/2404.01121v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01120v1","updated":"2024-04-01T13:55:40Z","published":"2024-04-01T13:55:40Z","title":"Motion Blur Decomposition with Cross-shutter Guidance","summary":" Motion blur is a frequently observed image artifact, especially under\ninsufficient illumination where exposure time has to be prolonged so as to\ncollect more photons for a bright enough image. Rather than simply removing\nsuch blurring effects, recent researches have aimed at decomposing a blurry\nimage into multiple sharp images with spatial and temporal coherence. Since\nmotion blur decomposition itself is highly ambiguous, priors from neighbouring\nframes or human annotation are usually needed for motion disambiguation. In\nthis paper, inspired by the complementary exposure characteristics of a global\nshutter (GS) camera and a rolling shutter (RS) camera, we propose to utilize\nthe ordered scanline-wise delay in a rolling shutter image to robustify motion\ndecomposition of a single blurry image. To evaluate this novel dual imaging\nsetting, we construct a triaxial system to collect realistic data, as well as a\ndeep network architecture that explicitly addresses temporal and contextual\ninformation through reciprocal branches for cross-shutter motion blur\ndecomposition. 
Experiment results have verified the effectiveness of our\nproposed algorithm, as well as the validity of our dual imaging setting.\n","authors":["Xiang Ji","Haiyang Jiang","Yinqiang Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.01120v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2312.06505v4","updated":"2024-04-01T13:52:30Z","published":"2023-12-11T16:31:55Z","title":"Grounded Question-Answering in Long Egocentric Videos","summary":" Existing approaches to video understanding, mainly designed for short videos\nfrom a third-person perspective, are limited in their applicability in certain\nfields, such as robotics. In this paper, we delve into open-ended\nquestion-answering (QA) in long, egocentric videos, which allows individuals or\nrobots to inquire about their own past visual experiences. This task presents\nunique challenges, including the complexity of temporally grounding queries\nwithin extensive video content, the high resource demands for precise data\nannotation, and the inherent difficulty of evaluating open-ended answers due to\ntheir ambiguous nature. Our proposed approach tackles these challenges by (i)\nintegrating query grounding and answering within a unified model to reduce\nerror propagation; (ii) employing large language models for efficient and\nscalable data synthesis; and (iii) introducing a close-ended QA task for\nevaluation, to manage answer ambiguity. Extensive experiments demonstrate the\neffectiveness of our method, which also achieves state-of-the-art performance\non the QaEgo4D and Ego4D-NLQ benchmarks. Code, data, and models are available\nat https://github.com/Becomebright/GroundVQA.\n","authors":["Shangzhe Di","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2312.06505v4.pdf","comment":"Accepted to CVPR 2024. Project website at https://dszdsz.cn/GroundVQA"},{"id":"http://arxiv.org/abs/2404.01102v1","updated":"2024-04-01T13:23:04Z","published":"2024-04-01T13:23:04Z","title":"Diffusion based Zero-shot Medical Image-to-Image Translation for Cross\n Modality Segmentation","summary":" Cross-modality image segmentation aims to segment the target modalities using\na method designed in the source modality. Deep generative models can translate\nthe target modality images into the source modality, thus enabling\ncross-modality segmentation. However, a vast body of existing cross-modality\nimage translation methods relies on supervised learning. In this work, we aim\nto address the challenge of zero-shot learning-based image translation tasks\n(extreme scenarios in the target modality is unseen in the training phase). To\nleverage generative learning for zero-shot cross-modality image segmentation,\nwe propose a novel unsupervised image translation method. The framework learns\nto translate the unseen source image to the target modality for image\nsegmentation by leveraging the inherent statistical consistency between\ndifferent modalities for diffusion guidance. Our framework captures identical\ncross-modality features in the statistical domain, offering diffusion guidance\nwithout relying on direct mappings between the source and target domains. This\nadvantage allows our method to adapt to changing source domains without the\nneed for retraining, making it highly practical when sufficient labeled source\ndomain data is not available. 
The proposed framework is validated in zero-shot\ncross-modality image segmentation tasks through empirical comparisons with\ninfluential generative models, including adversarial-based and diffusion-based\nmodels.\n","authors":["Zihao Wang","Yingyu Yang","Yuzhou Chen","Tingting Yuan","Maxime Sermesant","Herve Delingette"],"pdf_url":"https://arxiv.org/pdf/2404.01102v1.pdf","comment":"Neurips 2023 Diffusion Workshop"},{"id":"http://arxiv.org/abs/2404.01101v1","updated":"2024-04-01T13:21:05Z","published":"2024-04-01T13:21:05Z","title":"UFID: A Unified Framework for Input-level Backdoor Detection on\n Diffusion Models","summary":" Diffusion Models are vulnerable to backdoor attacks, where malicious\nattackers inject backdoors by poisoning some parts of the training samples\nduring the training stage. This poses a serious threat to the downstream users,\nwho query the diffusion models through the API or directly download them from\nthe internet. To mitigate the threat of backdoor attacks, there have been a\nplethora of investigations on backdoor detections. However, none of them\ndesigned a specialized backdoor detection method for diffusion models,\nrendering the area much under-explored. Moreover, these prior methods mainly\nfocus on the traditional neural networks in the classification task, which\ncannot be adapted to the backdoor detections on the generative task easily.\nAdditionally, most of the prior methods require white-box access to model\nweights and architectures, or the probability logits as additional information,\nwhich are not always practical. In this paper, we propose a Unified Framework\nfor Input-level backdoor Detection (UFID) on the diffusion models, which is\nmotivated by observations in the diffusion models and further validated with a\ntheoretical causality analysis. Extensive experiments across different datasets\non both conditional and unconditional diffusion models show that our method\nachieves a superb performance on detection effectiveness and run-time\nefficiency. The code is available at\nhttps://github.com/GuanZihan/official_UFID.\n","authors":["Zihan Guan","Mengxuan Hu","Sheng Li","Anil Vullikanti"],"pdf_url":"https://arxiv.org/pdf/2404.01101v1.pdf","comment":"20 pages,18 figures"},{"id":"http://arxiv.org/abs/2402.19231v2","updated":"2024-04-01T13:16:01Z","published":"2024-02-29T15:05:11Z","title":"CricaVPR: Cross-image Correlation-aware Representation Learning for\n Visual Place Recognition","summary":" Over the past decade, most methods in visual place recognition (VPR) have\nused neural networks to produce feature representations. These networks\ntypically produce a global representation of a place image using only this\nimage itself and neglect the cross-image variations (e.g. viewpoint and\nillumination), which limits their robustness in challenging scenes. In this\npaper, we propose a robust global representation method with cross-image\ncorrelation awareness for VPR, named CricaVPR. Our method uses the attention\nmechanism to correlate multiple images within a batch. These images can be\ntaken in the same place with different conditions or viewpoints, or even\ncaptured from different places. Therefore, our method can utilize the\ncross-image variations as a cue to guide the representation learning, which\nensures more robust features are produced. 
To further facilitate the\nrobustness, we propose a multi-scale convolution-enhanced adaptation method to\nadapt pre-trained visual foundation models to the VPR task, which introduces\nthe multi-scale local information to further enhance the cross-image\ncorrelation-aware representation. Experimental results show that our method\noutperforms state-of-the-art methods by a large margin with significantly less\ntraining time. The code is released at https://github.com/Lu-Feng/CricaVPR.\n","authors":["Feng Lu","Xiangyuan Lan","Lijun Zhang","Dongmei Jiang","Yaowei Wang","Chun Yuan"],"pdf_url":"https://arxiv.org/pdf/2402.19231v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2311.17049v2","updated":"2024-04-01T13:06:06Z","published":"2023-11-28T18:55:42Z","title":"MobileCLIP: Fast Image-Text Models through Multi-Modal Reinforced\n Training","summary":" Contrastive pretraining of image-text foundation models, such as CLIP,\ndemonstrated excellent zero-shot performance and improved robustness on a wide\nrange of downstream tasks. However, these models utilize large\ntransformer-based encoders with significant memory and latency overhead which\npose challenges for deployment on mobile devices. In this work, we introduce\nMobileCLIP -- a new family of efficient image-text models optimized for runtime\nperformance along with a novel and efficient training approach, namely\nmulti-modal reinforced training. The proposed training approach leverages\nknowledge transfer from an image captioning model and an ensemble of strong\nCLIP encoders to improve the accuracy of efficient models. Our approach avoids\ntrain-time compute overhead by storing the additional knowledge in a reinforced\ndataset. MobileCLIP sets a new state-of-the-art latency-accuracy tradeoff for\nzero-shot classification and retrieval tasks on several datasets. Our\nMobileCLIP-S2 variant is 2.3$\\times$ faster while more accurate compared to\nprevious best CLIP model based on ViT-B/16. We further demonstrate the\neffectiveness of our multi-modal reinforced training by training a CLIP model\nbased on ViT-B/16 image backbone and achieving +2.9% average performance\nimprovement on 38 evaluation benchmarks compared to the previous best.\nMoreover, we show that the proposed approach achieves 10$\\times$-1000$\\times$\nimproved learning efficiency when compared with non-reinforced CLIP training.\nCode and models are available at https://github.com/apple/ml-mobileclip .\n","authors":["Pavan Kumar Anasosalu Vasu","Hadi Pouransari","Fartash Faghri","Raviteja Vemulapalli","Oncel Tuzel"],"pdf_url":"https://arxiv.org/pdf/2311.17049v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01094v1","updated":"2024-04-01T12:59:49Z","published":"2024-04-01T12:59:49Z","title":"HairFastGAN: Realistic and Robust Hair Transfer with a Fast\n Encoder-Based Approach","summary":" Our paper addresses the complex task of transferring a hairstyle from a\nreference image to an input photo for virtual hair try-on. This task is\nchallenging due to the need to adapt to various photo poses, the sensitivity of\nhairstyles, and the lack of objective metrics. The current state of the art\nhairstyle transfer methods use an optimization process for different parts of\nthe approach, making them inexcusably slow. 
At the same time, faster\nencoder-based models are of very low quality because they either operate in\nStyleGAN's W+ space or use other low-dimensional image generators.\nAdditionally, both approaches have a problem with hairstyle transfer when the\nsource pose is very different from the target pose, because they either don't\nconsider the pose at all or deal with it inefficiently. In our paper, we\npresent the HairFast model, which uniquely solves these problems and achieves\nhigh resolution, near real-time performance, and superior reconstruction\ncompared to optimization problem-based methods. Our solution includes a new\narchitecture operating in the FS latent space of StyleGAN, an enhanced\ninpainting approach, and improved encoders for better alignment, color\ntransfer, and a new encoder for post-processing. The effectiveness of our\napproach is demonstrated on realism metrics after random hairstyle transfer and\nreconstruction when the original hairstyle is transferred. In the most\ndifficult scenario of transferring both shape and color of a hairstyle from\ndifferent images, our method performs in less than a second on the Nvidia V100.\nOur code is available at https://github.com/AIRI-Institute/HairFastGAN.\n","authors":["Maxim Nikolaev","Mikhail Kuznetsov","Dmitry Vetrov","Aibek Alanov"],"pdf_url":"https://arxiv.org/pdf/2404.01094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01089v1","updated":"2024-04-01T12:43:22Z","published":"2024-04-01T12:43:22Z","title":"Texture-Preserving Diffusion Models for High-Fidelity Virtual Try-On","summary":" Image-based virtual try-on is an increasingly important task for online\nshopping. It aims to synthesize images of a specific person wearing a specified\ngarment. Diffusion model-based approaches have recently become popular, as they\nare excellent at image synthesis tasks. However, these approaches usually\nemploy additional image encoders and rely on the cross-attention mechanism for\ntexture transfer from the garment to the person image, which affects the\ntry-on's efficiency and fidelity. To address these issues, we propose an\nTexture-Preserving Diffusion (TPD) model for virtual try-on, which enhances the\nfidelity of the results and introduces no additional image encoders.\nAccordingly, we make contributions from two aspects. First, we propose to\nconcatenate the masked person and reference garment images along the spatial\ndimension and utilize the resulting image as the input for the diffusion\nmodel's denoising UNet. This enables the original self-attention layers\ncontained in the diffusion model to achieve efficient and accurate texture\ntransfer. Second, we propose a novel diffusion-based method that predicts a\nprecise inpainting mask based on the person and reference garment images,\nfurther enhancing the reliability of the try-on results. In addition, we\nintegrate mask prediction and image synthesis into a single compact model. 
The\nexperimental results show that our approach can be applied to various try-on\ntasks, e.g., garment-to-person and person-to-person try-ons, and significantly\noutperforms state-of-the-art methods on popular VITON, VITON-HD databases.\n","authors":["Xu Yang","Changxing Ding","Zhibin Hong","Junhao Huang","Jin Tao","Xiangmin Xu"],"pdf_url":"https://arxiv.org/pdf/2404.01089v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01081v1","updated":"2024-04-01T12:21:56Z","published":"2024-04-01T12:21:56Z","title":"PhysReaction: Physically Plausible Real-Time Humanoid Reaction Synthesis\n via Forward Dynamics Guided 4D Imitation","summary":" Humanoid Reaction Synthesis is pivotal for creating highly interactive and\nempathetic robots that can seamlessly integrate into human environments,\nenhancing the way we live, work, and communicate. However, it is difficult to\nlearn the diverse interaction patterns of multiple humans and generate\nphysically plausible reactions. The kinematics-based approaches face\nchallenges, including issues like floating feet, sliding, penetration, and\nother problems that defy physical plausibility. The existing physics-based\nmethod often relies on kinematics-based methods to generate reference states,\nwhich struggle with the challenges posed by kinematic noise during action\nexecution. Constrained by their reliance on diffusion models, these methods are\nunable to achieve real-time inference. In this work, we propose a Forward\nDynamics Guided 4D Imitation method to generate physically plausible human-like\nreactions. The learned policy is capable of generating physically plausible and\nhuman-like reactions in real-time, significantly improving the speed(x33) and\nquality of reactions compared with the existing method. Our experiments on the\nInterHuman and Chi3D datasets, along with ablation studies, demonstrate the\neffectiveness of our approach.\n","authors":["Yunze Liu","Changxi Chen","Chenjing Ding","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2404.01081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01079v1","updated":"2024-04-01T12:19:54Z","published":"2024-04-01T12:19:54Z","title":"Stale Diffusion: Hyper-realistic 5D Movie Generation Using Old-school\n Methods","summary":" Two years ago, Stable Diffusion achieved super-human performance at\ngenerating images with super-human numbers of fingers. Following the steady\ndecline of its technical novelty, we propose Stale Diffusion, a method that\nsolidifies and ossifies Stable Diffusion in a maximum-entropy state. Stable\nDiffusion works analogously to a barn (the Stable) from which an infinite set\nof horses have escaped (the Diffusion). As the horses have long left the barn,\nour proposal may be seen as antiquated and irrelevant. Nevertheless, we\nvigorously defend our claim of novelty by identifying as early adopters of the\nSlow Science Movement, which will produce extremely important pearls of wisdom\nin the future. Our speed of contributions can also be seen as a quasi-static\nimplementation of the recent call to pause AI experiments, which we\nwholeheartedly support. As a result of a careful archaeological expedition to\n18-months-old Git commit histories, we found that naturally-accumulating errors\nhave produced a novel entropy-maximising Stale Diffusion method, that can\nproduce sleep-inducing hyper-realistic 5D video that is as good as one's\nimagination.\n","authors":["Joao F. 
Henriques","Dylan Campbell","Tengda Han"],"pdf_url":"https://arxiv.org/pdf/2404.01079v1.pdf","comment":"SIGBOVIK 2024"},{"id":"http://arxiv.org/abs/2404.01074v1","updated":"2024-04-01T12:16:00Z","published":"2024-04-01T12:16:00Z","title":"Prompt Learning for Oriented Power Transmission Tower Detection in\n High-Resolution SAR Images","summary":" Detecting transmission towers from synthetic aperture radar (SAR) images\nremains a challenging task due to the comparatively small size and side-looking\ngeometry, with background clutter interference frequently hindering tower\nidentification. A large number of interfering signals superimposes the return\nsignal from the tower. We found that localizing or prompting positions of power\ntransmission towers is beneficial to address this obstacle. Based on this\nrevelation, this paper introduces prompt learning into the oriented object\ndetector (P2Det) for multimodal information learning. P2Det contains the sparse\nprompt coding and cross-attention between the multimodal data. Specifically,\nthe sparse prompt encoder (SPE) is proposed to represent point locations,\nconverting prompts into sparse embeddings. The image embeddings are generated\nthrough the Transformer layers. Then a two-way fusion module (TWFM) is proposed\nto calculate the cross-attention of the two different embeddings. The\ninteraction of image-level and prompt-level features is utilized to address the\nclutter interference. A shape-adaptive refinement module (SARM) is proposed to\nreduce the effect of aspect ratio. Extensive experiments demonstrated the\neffectiveness of the proposed model on high-resolution SAR images. P2Det\nprovides a novel insight for multimodal object detection due to its competitive\nperformance.\n","authors":["Tianyang Li","Chao Wang","Hong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.01074v1.pdf","comment":"22 pages, 12figures"},{"id":"http://arxiv.org/abs/2404.01065v1","updated":"2024-04-01T11:57:40Z","published":"2024-04-01T11:57:40Z","title":"T-Mamba: Frequency-Enhanced Gated Long-Range Dependency for Tooth 3D\n CBCT Segmentation","summary":" Efficient tooth segmentation in three-dimensional (3D) imaging, critical for\northodontic diagnosis, remains challenging due to noise, low contrast, and\nartifacts in CBCT images. Both convolutional Neural Networks (CNNs) and\ntransformers have emerged as popular architectures for image segmentation.\nHowever, their efficacy in handling long-range dependencies is limited due to\ninherent locality or computational complexity. To address this issue, we\npropose T-Mamba, integrating shared positional encoding and frequency-based\nfeatures into vision mamba, to address limitations in spatial position\npreservation and feature enhancement in frequency domain. Besides, we also\ndesign a gate selection unit to integrate two features in spatial domain and\none feature in frequency domain adaptively. T-Mamba is the first work to\nintroduce frequency-based features into vision mamba. Extensive experiments\ndemonstrate that T-Mamba achieves new SOTA results on the public Tooth CBCT\ndataset and outperforms previous SOTA methods by a large margin, i.e., IoU +\n3.63%, SO + 2.43%, DSC +2.30%, HD -4.39mm, and ASSD -0.37mm. 
The code and\nmodels are publicly available at https://github.com/isbrycee/T-Mamba.\n","authors":["Jing Hao","Lei He","Kuo Feng Hung"],"pdf_url":"https://arxiv.org/pdf/2404.01065v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01064v1","updated":"2024-04-01T11:57:34Z","published":"2024-04-01T11:57:34Z","title":"Roadside Monocular 3D Detection via 2D Detection Prompting","summary":" The problem of roadside monocular 3D detection requires detecting objects of\ninterested classes in a 2D RGB frame and predicting their 3D information such\nas locations in bird's-eye-view (BEV). It has broad applications in traffic\ncontrol, vehicle-vehicle communication, and vehicle-infrastructure cooperative\nperception. To approach this problem, we present a novel and simple method by\nprompting the 3D detector using 2D detections. Our method builds on a key\ninsight that, compared with 3D detectors, a 2D detector is much easier to train\nand performs significantly better w.r.t detections on the 2D image plane. That\nsaid, one can exploit 2D detections of a well-trained 2D detector as prompts to\na 3D detector, being trained in a way of inflating such 2D detections to 3D\ntowards 3D detection. To construct better prompts using the 2D detector, we\nexplore three techniques: (a) concatenating both 2D and 3D detectors' features,\n(b) attentively fusing 2D and 3D detectors' features, and (c) encoding\npredicted 2D boxes x, y, width, height, label and attentively fusing such with\nthe 3D detector's features. Surprisingly, the third performs the best.\nMoreover, we present a yaw tuning tactic and a class-grouping strategy that\nmerges classes based on their functionality; these techniques improve 3D\ndetection performance further. Comprehensive ablation studies and extensive\nexperiments demonstrate that our method resoundingly outperforms prior works,\nachieving the state-of-the-art on two large-scale roadside 3D detection\nbenchmarks.\n","authors":["Yechi Ma","Shuoquan Wei","Churun Zhang","Wei Hua","Yanan Li","Shu Kong"],"pdf_url":"https://arxiv.org/pdf/2404.01064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03377v2","updated":"2024-04-01T11:55:46Z","published":"2023-06-06T03:37:41Z","title":"TextFormer: A Query-based End-to-End Text Spotter with Mixed Supervision","summary":" End-to-end text spotting is a vital computer vision task that aims to\nintegrate scene text detection and recognition into a unified framework.\nTypical methods heavily rely on Region-of-Interest (RoI) operations to extract\nlocal features and complex post-processing steps to produce final predictions.\nTo address these limitations, we propose TextFormer, a query-based end-to-end\ntext spotter with Transformer architecture. Specifically, using query embedding\nper text instance, TextFormer builds upon an image encoder and a text decoder\nto learn a joint semantic understanding for multi-task modeling. It allows for\nmutual training and optimization of classification, segmentation, and\nrecognition branches, resulting in deeper feature sharing without sacrificing\nflexibility or simplicity. Additionally, we design an Adaptive Global\naGgregation (AGG) module to transfer global features into sequential features\nfor reading arbitrarily-shaped texts, which overcomes the sub-optimization\nproblem of RoI operations. 
Furthermore, potential corpus information is\nutilized from weak annotations to full labels through mixed supervision,\nfurther improving text detection and end-to-end text spotting results.\nExtensive experiments on various bilingual (i.e., English and Chinese)\nbenchmarks demonstrate the superiority of our method. Especially on TDA-ReCTS\ndataset, TextFormer surpasses the state-of-the-art method in terms of 1-NED by\n13.2%.\n","authors":["Yukun Zhai","Xiaoqiang Zhang","Xiameng Qin","Sanyuan Zhao","Xingping Dong","Jianbing Shen"],"pdf_url":"https://arxiv.org/pdf/2306.03377v2.pdf","comment":"Machine Intelligence Research, MIR 2024"},{"id":"http://arxiv.org/abs/2312.08568v2","updated":"2024-04-01T11:49:22Z","published":"2023-12-13T23:41:17Z","title":"NViST: In the Wild New View Synthesis from a Single Image with\n Transformers","summary":" We propose NViST, a transformer-based model for efficient and generalizable\nnovel-view synthesis from a single image for real-world scenes. In contrast to\nmany methods that are trained on synthetic data, object-centred scenarios, or\nin a category-specific manner, NViST is trained on MVImgNet, a large-scale\ndataset of casually-captured real-world videos of hundreds of object categories\nwith diverse backgrounds. NViST transforms image inputs directly into a\nradiance field, conditioned on camera parameters via adaptive layer\nnormalisation. In practice, NViST exploits fine-tuned masked autoencoder (MAE)\nfeatures and translates them to 3D output tokens via cross-attention, while\naddressing occlusions with self-attention. To move away from object-centred\ndatasets and enable full scene synthesis, NViST adopts a 6-DOF camera pose\nmodel and only requires relative pose, dropping the need for canonicalization\nof the training data, which removes a substantial barrier to it being used on\ncasually captured datasets. We show results on unseen objects and categories\nfrom MVImgNet and even generalization to casual phone captures. We conduct\nqualitative and quantitative evaluations on MVImgNet and ShapeNet to show that\nour model represents a step forward towards enabling true in-the-wild\ngeneralizable novel-view synthesis from a single image. Project webpage:\nhttps://wbjang.github.io/nvist_webpage.\n","authors":["Wonbong Jang","Lourdes Agapito"],"pdf_url":"https://arxiv.org/pdf/2312.08568v2.pdf","comment":"CVPR 2024, Project page: https://wbjang.github.io/nvist_webpage"},{"id":"http://arxiv.org/abs/2311.18608v2","updated":"2024-04-01T11:44:25Z","published":"2023-11-30T15:06:10Z","title":"Contrastive Denoising Score for Text-guided Latent Diffusion Image\n Editing","summary":" With the remarkable advent of text-to-image diffusion models, image editing\nmethods have become more diverse and continue to evolve. A promising recent\napproach in this realm is Delta Denoising Score (DDS) - an image editing\ntechnique based on Score Distillation Sampling (SDS) framework that leverages\nthe rich generative prior of text-to-image diffusion models. However, relying\nsolely on the difference between scoring functions is insufficient for\npreserving specific structural elements from the original image, a crucial\naspect of image editing. To address this, here we present an embarrassingly\nsimple yet very powerful modification of DDS, called Contrastive Denoising\nScore (CDS), for latent diffusion models (LDM). 
Inspired by the similarities\nand differences between DDS and the contrastive learning for unpaired\nimage-to-image translation(CUT), we introduce a straightforward approach using\nCUT loss within the DDS framework. Rather than employing auxiliary networks as\nin the original CUT approach, we leverage the intermediate features of LDM,\nspecifically those from the self-attention layers, which possesses rich spatial\ninformation. Our approach enables zero-shot image-to-image translation and\nneural radiance field (NeRF) editing, achieving structural correspondence\nbetween the input and output while maintaining content controllability.\nQualitative results and comparisons demonstrates the effectiveness of our\nproposed method. Project page: https://hyelinnam.github.io/CDS/\n","authors":["Hyelin Nam","Gihyun Kwon","Geon Yeong Park","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.18608v2.pdf","comment":"CVPR 2024 (poster); Project page: https://hyelinnam.github.io/CDS/"},{"id":"http://arxiv.org/abs/2312.14457v2","updated":"2024-04-01T11:42:43Z","published":"2023-12-22T06:15:03Z","title":"QUAR-VLA: Vision-Language-Action Model for Quadruped Robots","summary":" The important manifestation of robot intelligence is the ability to naturally\ninteract and autonomously make decisions. Traditional approaches to robot\ncontrol often compartmentalize perception, planning, and decision-making,\nsimplifying system design but limiting the synergy between different\ninformation streams. This compartmentalization poses challenges in achieving\nseamless autonomous reasoning, decision-making, and action execution. To\naddress these limitations, a novel paradigm, named Vision-Language-Action tasks\nfor QUAdruped Robots (QUAR-VLA), has been introduced in this paper. This\napproach tightly integrates visual information and instructions to generate\nexecutable actions, effectively merging perception, planning, and\ndecision-making. The central idea is to elevate the overall intelligence of the\nrobot. Within this framework, a notable challenge lies in aligning fine-grained\ninstructions with visual perception information. This emphasizes the complexity\ninvolved in ensuring that the robot accurately interprets and acts upon\ndetailed instructions in harmony with its visual observations. Consequently, we\npropose QUAdruped Robotic Transformer (QUART), a family of VLA models to\nintegrate visual information and instructions from diverse modalities as input\nand generates executable actions for real-world robots and present QUAdruped\nRobot Dataset (QUARD), a large-scale multi-task dataset including navigation,\ncomplex terrain locomotion, and whole-body manipulation tasks for training\nQUART models. Our extensive evaluation (4000 evaluation trials) shows that our\napproach leads to performant robotic policies and enables QUART to obtain a\nrange of emergent capabilities.\n","authors":["Pengxiang Ding","Han Zhao","Zhitao Wang","Zhenyu Wei","Shangke Lyu","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.14457v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01053v1","updated":"2024-04-01T11:23:38Z","published":"2024-04-01T11:23:38Z","title":"HAHA: Highly Articulated Gaussian Human Avatars with Textured Mesh Prior","summary":" We present HAHA - a novel approach for animatable human avatar generation\nfrom monocular input videos. The proposed method relies on learning the\ntrade-off between the use of Gaussian splatting and a textured mesh for\nefficient and high fidelity rendering. 
We demonstrate its efficiency to animate\nand render full-body human avatars controlled via the SMPL-X parametric model.\nOur model learns to apply Gaussian splatting only in areas of the SMPL-X mesh\nwhere it is necessary, like hair and out-of-mesh clothing. This results in a\nminimal number of Gaussians being used to represent the full avatar, and\nreduced rendering artifacts. This allows us to handle the animation of small\nbody parts such as fingers that are traditionally disregarded. We demonstrate\nthe effectiveness of our approach on two open datasets: SnapshotPeople and\nX-Humans. Our method demonstrates on par reconstruction quality to the\nstate-of-the-art on SnapshotPeople, while using less than a third of Gaussians.\nHAHA outperforms previous state-of-the-art on novel poses from X-Humans both\nquantitatively and qualitatively.\n","authors":["David Svitov","Pietro Morerio","Lourdes Agapito","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2404.01053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01051v1","updated":"2024-04-01T11:12:06Z","published":"2024-04-01T11:12:06Z","title":"Action Detection via an Image Diffusion Process","summary":" Action detection aims to localize the starting and ending points of action\ninstances in untrimmed videos, and predict the classes of those instances. In\nthis paper, we make the observation that the outputs of the action detection\ntask can be formulated as images. Thus, from a novel perspective, we tackle\naction detection via a three-image generation process to generate starting\npoint, ending point and action-class predictions as images via our proposed\nAction Detection Image Diffusion (ADI-Diff) framework. Furthermore, since our\nimages differ from natural images and exhibit special properties, we further\nexplore a Discrete Action-Detection Diffusion Process and a Row-Column\nTransformer design to better handle their processing. Our ADI-Diff framework\nachieves state-of-the-art results on two widely-used datasets.\n","authors":["Lin Geng Foo","Tianjiao Li","Hossein Rahmani","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2404.01051v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01050v1","updated":"2024-04-01T11:09:40Z","published":"2024-04-01T11:09:40Z","title":"Drag Your Noise: Interactive Point-based Editing via Diffusion Semantic\n Propagation","summary":" Point-based interactive editing serves as an essential tool to complement the\ncontrollability of existing generative models. A concurrent work,\nDragDiffusion, updates the diffusion latent map in response to user inputs,\ncausing global latent map alterations. This results in imprecise preservation\nof the original content and unsuccessful editing due to gradient vanishing. In\ncontrast, we present DragNoise, offering robust and accelerated editing without\nretracing the latent map. The core rationale of DragNoise lies in utilizing the\npredicted noise output of each U-Net as a semantic editor. This approach is\ngrounded in two critical observations: firstly, the bottleneck features of\nU-Net inherently possess semantically rich features ideal for interactive\nediting; secondly, high-level semantics, established early in the denoising\nprocess, show minimal variation in subsequent stages. Leveraging these\ninsights, DragNoise edits diffusion semantics in a single denoising step and\nefficiently propagates these changes, ensuring stability and efficiency in\ndiffusion editing. 
Comparative experiments reveal that DragNoise achieves\nsuperior control and semantic retention, reducing the optimization time by over\n50% compared to DragDiffusion. Our codes are available at\nhttps://github.com/haofengl/DragNoise.\n","authors":["Haofeng Liu","Chenshu Xu","Yifei Yang","Lihua Zeng","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2404.01050v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01036v1","updated":"2024-04-01T10:43:50Z","published":"2024-04-01T10:43:50Z","title":"Higher education assessment practice in the era of generative AI tools","summary":" The higher education (HE) sector benefits every nation's economy and society\nat large. However, their contributions are challenged by advanced technologies\nlike generative artificial intelligence (GenAI) tools. In this paper, we\nprovide a comprehensive assessment of GenAI tools towards assessment and\npedagogic practice and, subsequently, discuss the potential impacts. This study\nexperimented using three assessment instruments from data science, data\nanalytics, and construction management disciplines. Our findings are two-fold:\nfirst, the findings revealed that GenAI tools exhibit subject knowledge,\nproblem-solving, analytical, critical thinking, and presentation skills and\nthus can limit learning when used unethically. Secondly, the design of the\nassessment of certain disciplines revealed the limitations of the GenAI tools.\nBased on our findings, we made recommendations on how AI tools can be utilised\nfor teaching and learning in HE.\n","authors":["Bayode Ogunleye","Kudirat Ibilola Zakariyyah","Oluwaseun Ajao","Olakunle Olayinka","Hemlata Sharma"],"pdf_url":"https://arxiv.org/pdf/2404.01036v1.pdf","comment":"11 pages, 7 tables published in the Journal of Applied Learning &\n Teaching"},{"id":"http://arxiv.org/abs/2404.01024v1","updated":"2024-04-01T10:08:23Z","published":"2024-04-01T10:08:23Z","title":"AIGCOIQA2024: Perceptual Quality Assessment of AI Generated\n Omnidirectional Images","summary":" In recent years, the rapid advancement of Artificial Intelligence Generated\nContent (AIGC) has attracted widespread attention. Among the AIGC, AI generated\nomnidirectional images hold significant potential for Virtual Reality (VR) and\nAugmented Reality (AR) applications, hence omnidirectional AIGC techniques have\nalso been widely studied. AI-generated omnidirectional images exhibit unique\ndistortions compared to natural omnidirectional images, however, there is no\ndedicated Image Quality Assessment (IQA) criteria for assessing them. This\nstudy addresses this gap by establishing a large-scale AI generated\nomnidirectional image IQA database named AIGCOIQA2024 and constructing a\ncomprehensive benchmark. We first generate 300 omnidirectional images based on\n5 AIGC models utilizing 25 text prompts. A subjective IQA experiment is\nconducted subsequently to assess human visual preferences from three\nperspectives including quality, comfortability, and correspondence. Finally, we\nconduct a benchmark experiment to evaluate the performance of state-of-the-art\nIQA models on our database. 
The database will be released to facilitate future\nresearch.\n","authors":["Liu Yang","Huiyu Duan","Long Teng","Yucheng Zhu","Xiaohong Liu","Menghan Hu","Xiongkuo Min","Guangtao Zhai","Patrick Le Callet"],"pdf_url":"https://arxiv.org/pdf/2404.01024v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00343v5","updated":"2024-04-01T09:49:27Z","published":"2023-12-01T04:35:47Z","title":"OpenStereo: A Comprehensive Benchmark for Stereo Matching and Strong\n Baseline","summary":" Stereo matching aims to estimate the disparity between matching pixels in a\nstereo image pair, which is of great importance to robotics, autonomous\ndriving, and other computer vision tasks. Despite the development of numerous\nimpressive methods in recent years, replicating their results and determining\nthe most suitable architecture for practical application remains challenging.\nAddressing this gap, our paper introduces a comprehensive benchmark focusing on\npractical applicability rather than solely on performance enhancement.\nSpecifically, we develop a flexible and efficient stereo matching codebase,\ncalled OpenStereo. OpenStereo includes training and inference codes of more\nthan 10 network models, making it, to our knowledge, the most complete stereo\nmatching toolbox available. Based on OpenStereo, we conducted experiments and\nhave achieved or surpassed the performance metrics reported in the original\npaper. Additionally, we carry out an exhaustive analysis and deconstruction of\nrecent developments in stereo matching through comprehensive ablative\nexperiments. These investigations inspired the creation of StereoBase, a strong\nbaseline model. Our StereoBase ranks 1st on SceneFlow, KITTI 2015, 2012\n(Reflective) among published methods and achieves the best performance across\nall metrics. In addition, StereoBase has strong cross-dataset\ngeneralization.Code is available at\n\\url{https://github.com/XiandaGuo/OpenStereo}.\n","authors":["Xianda Guo","Juntao Lu","Chenming Zhang","Yiqi Wang","Yiqun Duan","Tian Yang","Zheng Zhu","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2312.00343v5.pdf","comment":"Code is available at: https://github.com/XiandaGuo/OpenStereo"},{"id":"http://arxiv.org/abs/2308.14418v4","updated":"2024-04-01T09:35:30Z","published":"2023-08-28T08:54:27Z","title":"Multiscale and Multilayer Contrastive Learning for Domain Generalization","summary":" During the past decade, deep neural networks have led to fast-paced progress\nand significant achievements in computer vision problems, for both academia and\nindustry. Yet despite their success, state-of-the-art image classification\napproaches fail to generalize well in previously unseen visual contexts, as\nrequired by many real-world applications. In this paper, we focus on this\ndomain generalization (DG) problem and argue that the generalization ability of\ndeep convolutional neural networks can be improved by taking advantage of\nmulti-layer and multi-scaled representations of the network. We introduce a\nframework that aims at improving domain generalization of image classifiers by\ncombining both low-level and high-level features at multiple scales, enabling\nthe network to implicitly disentangle representations in its latent space and\nlearn domain-invariant attributes of the depicted objects. 
Additionally, to\nfurther facilitate robust representation learning, we propose a novel objective\nfunction, inspired by contrastive learning, which aims at constraining the\nextracted representations to remain invariant under distribution shifts. We\ndemonstrate the effectiveness of our method by evaluating on the domain\ngeneralization datasets of PACS, VLCS, Office-Home and NICO. Through extensive\nexperimentation, we show that our model is able to surpass the performance of\nprevious DG methods and consistently produce competitive and state-of-the-art\nresults in all datasets\n","authors":["Aristotelis Ballas","Christos Diou"],"pdf_url":"https://arxiv.org/pdf/2308.14418v4.pdf","comment":"Manuscript accepted in: IEEE Transactions on Artificial Intelligence\n (March 2024)"},{"id":"http://arxiv.org/abs/2404.01014v1","updated":"2024-04-01T09:34:55Z","published":"2024-04-01T09:34:55Z","title":"Harnessing Large Language Models for Training-free Video Anomaly\n Detection","summary":" Video anomaly detection (VAD) aims to temporally locate abnormal events in a\nvideo. Existing works mostly rely on training deep models to learn the\ndistribution of normality with either video-level supervision, one-class\nsupervision, or in an unsupervised setting. Training-based methods are prone to\nbe domain-specific, thus being costly for practical deployment as any domain\nchange will involve data collection and model training. In this paper, we\nradically depart from previous efforts and propose LAnguage-based VAD (LAVAD),\na method tackling VAD in a novel, training-free paradigm, exploiting the\ncapabilities of pre-trained large language models (LLMs) and existing\nvision-language models (VLMs). We leverage VLM-based captioning models to\ngenerate textual descriptions for each frame of any test video. With the\ntextual scene description, we then devise a prompting mechanism to unlock the\ncapability of LLMs in terms of temporal aggregation and anomaly score\nestimation, turning LLMs into an effective video anomaly detector. We further\nleverage modality-aligned VLMs and propose effective techniques based on\ncross-modal similarity for cleaning noisy captions and refining the LLM-based\nanomaly scores. We evaluate LAVAD on two large datasets featuring real-world\nsurveillance scenarios (UCF-Crime and XD-Violence), showing that it outperforms\nboth unsupervised and one-class methods without requiring any training or data\ncollection.\n","authors":["Luca Zanella","Willi Menapace","Massimiliano Mancini","Yiming Wang","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2404.01014v1.pdf","comment":"CVPR 2024. Project website at https://lucazanella.github.io/lavad/"},{"id":"http://arxiv.org/abs/2404.01013v1","updated":"2024-04-01T09:34:51Z","published":"2024-04-01T09:34:51Z","title":"Teeth-SEG: An Efficient Instance Segmentation Framework for Orthodontic\n Treatment based on Anthropic Prior Knowledge","summary":" Teeth localization, segmentation, and labeling in 2D images have great\npotential in modern dentistry to enhance dental diagnostics, treatment\nplanning, and population-based studies on oral health. However, general\ninstance segmentation frameworks are incompetent due to 1) the subtle\ndifferences between some teeth' shapes (e.g., maxillary first premolar and\nsecond premolar), 2) the teeth's position and shape variation across subjects,\nand 3) the presence of abnormalities in the dentition (e.g., caries and\nedentulism). 
To address these problems, we propose a ViT-based framework named\nTeethSEG, which consists of stacked Multi-Scale Aggregation (MSA) blocks and an\nAnthropic Prior Knowledge (APK) layer. Specifically, to compose the two\nmodules, we design 1) a unique permutation-based upscaler to ensure high\nefficiency while establishing clear segmentation boundaries with 2) multi-head\nself/cross-gating layers to emphasize particular semantics meanwhile\nmaintaining the divergence between token embeddings. Besides, we collect 3) the\nfirst open-sourced intraoral image dataset IO150K, which comprises over 150k\nintraoral photos, and all photos are annotated by orthodontists using a\nhuman-machine hybrid algorithm. Experiments on IO150K demonstrate that our\nTeethSEG outperforms the state-of-the-art segmentation models on dental image\nsegmentation.\n","authors":["Bo Zou","Shaofeng Wang","Hao Liu","Gaoyue Sun","Yajie Wang","FeiFei Zuo","Chengbin Quan","Youjian Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.01013v1.pdf","comment":"This paper has been accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2305.15357v5","updated":"2024-04-01T09:29:49Z","published":"2023-05-24T17:09:54Z","title":"Solving Diffusion ODEs with Optimal Boundary Conditions for Better Image\n Super-Resolution","summary":" Diffusion models, as a kind of powerful generative model, have given\nimpressive results on image super-resolution (SR) tasks. However, due to the\nrandomness introduced in the reverse process of diffusion models, the\nperformances of diffusion-based SR models are fluctuating at every time of\nsampling, especially for samplers with few resampled steps. This inherent\nrandomness of diffusion models results in ineffectiveness and instability,\nmaking it challenging for users to guarantee the quality of SR results.\nHowever, our work takes this randomness as an opportunity: fully analyzing and\nleveraging it leads to the construction of an effective plug-and-play sampling\nmethod that owns the potential to benefit a series of diffusion-based SR\nmethods. More in detail, we propose to steadily sample high-quality SR images\nfrom pre-trained diffusion-based SR models by solving diffusion ordinary\ndifferential equations (diffusion ODEs) with optimal boundary conditions (BCs)\nand analyze the characteristics between the choices of BCs and their\ncorresponding SR results. Our analysis shows the route to obtain an\napproximately optimal BC via an efficient exploration in the whole space. The\nquality of SR results sampled by the proposed method with fewer steps\noutperforms the quality of results sampled by current methods with randomness\nfrom the same pre-trained diffusion-based SR model, which means that our\nsampling method \"boosts\" current diffusion-based SR models without any\nadditional training.\n","authors":["Yiyang Ma","Huan Yang","Wenhan Yang","Jianlong Fu","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2305.15357v5.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2403.11270v2","updated":"2024-04-01T09:11:13Z","published":"2024-03-17T16:48:46Z","title":"Bilateral Propagation Network for Depth Completion","summary":" Depth completion aims to derive a dense depth map from sparse depth\nmeasurements with a synchronized color image. Current state-of-the-art (SOTA)\nmethods are predominantly propagation-based, which work as an iterative\nrefinement on the initial estimated dense depth. 
However, the initial depth\nestimations mostly result from direct applications of convolutional layers on\nthe sparse depth map. In this paper, we present a Bilateral Propagation Network\n(BP-Net), that propagates depth at the earliest stage to avoid directly\nconvolving on sparse data. Specifically, our approach propagates the target\ndepth from nearby depth measurements via a non-linear model, whose coefficients\nare generated through a multi-layer perceptron conditioned on both\n\\emph{radiometric difference} and \\emph{spatial distance}. By integrating\nbilateral propagation with multi-modal fusion and depth refinement in a\nmulti-scale framework, our BP-Net demonstrates outstanding performance on both\nindoor and outdoor scenes. It achieves SOTA on the NYUv2 dataset and ranks 1st\non the KITTI depth completion benchmark at the time of submission. Experimental\nresults not only show the effectiveness of bilateral propagation but also\nemphasize the significance of early-stage propagation in contrast to the\nrefinement stage. Our code and trained models will be available on the project\npage.\n","authors":["Jie Tang","Fei-Peng Tian","Boshi An","Jian Li","Ping Tan"],"pdf_url":"https://arxiv.org/pdf/2403.11270v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2312.04466v2","updated":"2024-04-01T08:56:59Z","published":"2023-12-07T17:39:25Z","title":"Emotional Speech-driven 3D Body Animation via Disentangled Latent\n Diffusion","summary":" Existing methods for synthesizing 3D human gestures from speech have shown\npromising results, but they do not explicitly model the impact of emotions on\nthe generated gestures. Instead, these methods directly output animations from\nspeech without control over the expressed emotion. To address this limitation,\nwe present AMUSE, an emotional speech-driven body animation model based on\nlatent diffusion. Our observation is that content (i.e., gestures related to\nspeech rhythm and word utterances), emotion, and personal style are separable.\nTo account for this, AMUSE maps the driving audio to three disentangled latent\nvectors: one for content, one for emotion, and one for personal style. A latent\ndiffusion model, trained to generate gesture motion sequences, is then\nconditioned on these latent vectors. Once trained, AMUSE synthesizes 3D human\ngestures directly from speech with control over the expressed emotions and\nstyle by combining the content from the driving speech with the emotion and\nstyle of another speech sequence. Randomly sampling the noise of the diffusion\nmodel further generates variations of the gesture with the same emotional\nexpressivity. Qualitative, quantitative, and perceptual evaluations demonstrate\nthat AMUSE outputs realistic gesture sequences. Compared to the state of the\nart, the generated gestures are better synchronized with the speech content,\nand better represent the emotion expressed by the input speech. Our code is\navailable at amuse.is.tue.mpg.de.\n","authors":["Kiran Chhatre","Radek Daněček","Nikos Athanasiou","Giorgio Becherini","Christopher Peters","Michael J. 
Black","Timo Bolkart"],"pdf_url":"https://arxiv.org/pdf/2312.04466v2.pdf","comment":"Conference on Computer Vision and Pattern Recognition (CVPR) 2024.\n Webpage: https://amuse.is.tue.mpg.de/"},{"id":"http://arxiv.org/abs/2404.00994v1","updated":"2024-04-01T08:44:11Z","published":"2024-04-01T08:44:11Z","title":"AMOR: Ambiguous Authorship Order","summary":" As we all know, writing scientific papers together with our beloved\ncolleagues is a truly remarkable experience (partially): endless discussions\nabout the same useless paragraph over and over again, followed by long days and\nlong nights -- both at the same time. What a wonderful ride it is! What a\nbeautiful life we have. But wait, there's one tiny little problem that utterly\nshatters the peace, turning even renowned scientists into bloodthirsty\nmonsters: author order. The reason is that, contrary to widespread opinion,\nit's not the font size that matters, but the way things are ordered. Of course,\nthis is a fairly well-known fact among scientists all across the planet (and\nbeyond) and explains clearly why we regularly have to read about yet another\nescalated paper submission in local police reports.\n In this paper, we take an important step backwards to tackle this issue by\nsolving the so-called author ordering problem (AOP) once and for all.\nSpecifically, we propose AMOR, a system that replaces silly constructs like\nco-first or co-middle authorship with a simple yet easy probabilistic approach\nbased on random shuffling of the author list at viewing time. In addition to\nAOP, we also solve the ambiguous author ordering citation problem} (AAOCP) on\nthe fly. Stop author violence, be human.\n","authors":["Maximilian Weiherer","Andreea Dogaru","Shreya Kapoor","Hannah Schieber","Bernhard Egger"],"pdf_url":"https://arxiv.org/pdf/2404.00994v1.pdf","comment":"SIGBOVIK '24 submission"},{"id":"http://arxiv.org/abs/2404.00992v1","updated":"2024-04-01T08:37:57Z","published":"2024-04-01T08:37:57Z","title":"SGCNeRF: Few-Shot Neural Rendering via Sparse Geometric Consistency\n Guidance","summary":" Neural Radiance Field (NeRF) technology has made significant strides in\ncreating novel viewpoints. However, its effectiveness is hampered when working\nwith sparsely available views, often leading to performance dips due to\noverfitting. FreeNeRF attempts to overcome this limitation by integrating\nimplicit geometry regularization, which incrementally improves both geometry\nand textures. Nonetheless, an initial low positional encoding bandwidth results\nin the exclusion of high-frequency elements. The quest for a holistic approach\nthat simultaneously addresses overfitting and the preservation of\nhigh-frequency details remains ongoing. This study introduces a novel feature\nmatching based sparse geometry regularization module. This module excels in\npinpointing high-frequency keypoints, thereby safeguarding the integrity of\nfine details. Through progressive refinement of geometry and textures across\nNeRF iterations, we unveil an effective few-shot neural rendering architecture,\ndesignated as SGCNeRF, for enhanced novel view synthesis. 
Our experiments\ndemonstrate that SGCNeRF not only achieves superior geometry-consistent\noutcomes but also surpasses FreeNeRF, with improvements of 0.7 dB and 0.6 dB in\nPSNR on the LLFF and DTU datasets, respectively.\n","authors":["Yuru Xiao","Xianming Liu","Deming Zhai","Kui Jiang","Junjun Jiang","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2404.00992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00989v1","updated":"2024-04-01T08:34:42Z","published":"2024-04-01T08:34:42Z","title":"360+x: A Panoptic Multi-modal Scene Understanding Dataset","summary":" Human perception of the world is shaped by a multitude of viewpoints and\nmodalities. While many existing datasets focus on scene understanding from a\ncertain perspective (e.g. egocentric or third-person views), our dataset offers\na panoptic perspective (i.e. multiple viewpoints with multiple data\nmodalities). Specifically, we encapsulate third-person panoramic and front\nviews, as well as egocentric monocular/binocular views with rich modalities\nincluding video, multi-channel audio, directional binaural delay, location data\nand textual scene descriptions within each scene captured, presenting\ncomprehensive observation of the world. Figure 1 offers a glimpse of all 28\nscene categories of our 360+x dataset. To the best of our knowledge, this is\nthe first database that covers multiple viewpoints with multiple data\nmodalities to mimic how daily information is accessed in the real world.\nThrough our benchmark analysis, we presented 5 different scene understanding\ntasks on the proposed 360+x dataset to evaluate the impact and benefit of each\ndata modality and perspective in panoptic scene understanding. We hope this\nunique dataset could broaden the scope of comprehensive scene understanding and\nencourage the community to approach these problems from more diverse\nperspectives.\n","authors":["Hao Chen","Yuqi Hou","Chenyuan Qu","Irene Testini","Xiaohan Hong","Jianbo Jiao"],"pdf_url":"https://arxiv.org/pdf/2404.00989v1.pdf","comment":"To access the public dataset, please visit\n https://x360dataset.github.io"},{"id":"http://arxiv.org/abs/2107.11267v3","updated":"2024-04-01T08:28:33Z","published":"2021-07-23T14:34:57Z","title":"Dense Supervision Propagation for Weakly Supervised Semantic\n Segmentation on 3D Point Clouds","summary":" Semantic segmentation on 3D point clouds is an important task for 3D scene\nunderstanding. While dense labeling on 3D data is expensive and time-consuming,\nonly a few works address weakly supervised semantic point cloud segmentation\nmethods to relieve the labeling cost by learning from simpler and cheaper\nlabels. Meanwhile, there are still huge performance gaps between existing\nweakly supervised methods and state-of-the-art fully supervised methods. In\nthis paper, we train a semantic point cloud segmentation network with only a\nsmall portion of points being labeled. We argue that we can better utilize the\nlimited supervision information as we densely propagate the supervision signal\nfrom the labeled points to other points within and across the input samples.\nSpecifically, we propose a cross-sample feature reallocating module to transfer\nsimilar features and therefore re-route the gradients across two samples with\ncommon classes and an intra-sample feature redistribution module to propagate\nsupervision signals on unlabeled points across and within point cloud samples.\nWe conduct extensive experiments on public datasets S3DIS and ScanNet. 
Our\nweakly supervised method with only 10% and 1% of labels can produce compatible\nresults with the fully supervised counterpart.\n","authors":["Jiacheng Wei","Guosheng Lin","Kim-Hui Yap","Fayao Liu","Tzu-Yi Hung"],"pdf_url":"https://arxiv.org/pdf/2107.11267v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00987v1","updated":"2024-04-01T08:20:18Z","published":"2024-04-01T08:20:18Z","title":"FlexiDreamer: Single Image-to-3D Generation with FlexiCubes","summary":" 3D content generation from text prompts or single images has made remarkable\nprogress in quality and speed recently. One of its dominant paradigms involves\ngenerating consistent multi-view images followed by a sparse-view\nreconstruction. However, due to the challenge of directly deforming the mesh\nrepresentation to approach the target topology, most methodologies learn an\nimplicit representation (such as NeRF) during the sparse-view reconstruction\nand acquire the target mesh by a post-processing extraction. Although the\nimplicit representation can effectively model rich 3D information, its training\ntypically entails a long convergence time. In addition, the post-extraction\noperation from the implicit field also leads to undesirable visual artifacts.\nIn this paper, we propose FlexiDreamer, a novel single image-to-3d generation\nframework that reconstructs the target mesh in an end-to-end manner. By\nleveraging a flexible gradient-based extraction known as FlexiCubes, our method\ncircumvents the defects brought by the post-processing and facilitates a direct\nacquisition of the target mesh. Furthermore, we incorporate a multi-resolution\nhash grid encoding scheme that progressively activates the encoding levels into\nthe implicit field in FlexiCubes to help capture geometric details for per-step\noptimization. Notably, FlexiDreamer recovers a dense 3D structure from a\nsingle-view image in approximately 1 minute on a single NVIDIA A100 GPU,\noutperforming previous methodologies by a large margin.\n","authors":["Ruowen Zhao","Zhengyi Wang","Yikai Wang","Zihan Zhou","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.00987v1.pdf","comment":"project page:https://flexidreamer.github.io"},{"id":"http://arxiv.org/abs/2404.00986v1","updated":"2024-04-01T08:18:38Z","published":"2024-04-01T08:18:38Z","title":"Make Continual Learning Stronger via C-Flat","summary":" Model generalization ability upon incrementally acquiring dynamically\nupdating knowledge from sequentially arriving tasks is crucial to tackle the\nsensitivity-stability dilemma in Continual Learning (CL). Weight loss landscape\nsharpness minimization seeking for flat minima lying in neighborhoods with\nuniform low loss or smooth gradient is proven to be a strong training regime\nimproving model generalization compared with loss minimization based optimizer\nlike SGD. Yet only a few works have discussed this training regime for CL,\nproving that dedicated designed zeroth-order sharpness optimizer can improve CL\nperformance. In this work, we propose a Continual Flatness (C-Flat) method\nfeaturing a flatter loss landscape tailored for CL. C-Flat could be easily\ncalled with only one line of code and is plug-and-play to any CL methods. A\ngeneral framework of C-Flat applied to all CL categories and a thorough\ncomparison with loss minima optimizer and flat minima based CL approaches is\npresented in this paper, showing that our method can boost CL performance in\nalmost all cases. 
Code will be publicly available upon publication.\n","authors":["Ang Bian","Wei Li","Hangjie Yuan","Chengrong Yu","Zixiang Zhao","Mang Wang","Aojun Lu","Tao Feng"],"pdf_url":"https://arxiv.org/pdf/2404.00986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00980v1","updated":"2024-04-01T07:52:05Z","published":"2024-04-01T07:52:05Z","title":"CAMO: Correlation-Aware Mask Optimization with Modulated Reinforcement\n Learning","summary":" Optical proximity correction (OPC) is a vital step to ensure printability in\nmodern VLSI manufacturing. Various OPC approaches based on machine learning\nhave been proposed to pursue performance and efficiency, which are typically\ndata-driven and hardly involve any particular considerations of the OPC\nproblem, leading to potential performance or efficiency bottlenecks. In this\npaper, we propose CAMO, a reinforcement learning-based OPC system that\nspecifically integrates important principles of the OPC problem. CAMO\nexplicitly involves the spatial correlation among the movements of neighboring\nsegments and an OPC-inspired modulation for movement action selection.\nExperiments are conducted on both via layer patterns and metal layer patterns.\nThe results demonstrate that CAMO outperforms state-of-the-art OPC engines from\nboth academia and industry.\n","authors":["Xiaoxiao Liang","Haoyu Yang","Kang Liu","Bei Yu","Yuzhe Ma"],"pdf_url":"https://arxiv.org/pdf/2404.00980v1.pdf","comment":"Accepted by DAC 2024"},{"id":"http://arxiv.org/abs/2404.00979v1","updated":"2024-04-01T07:50:10Z","published":"2024-04-01T07:50:10Z","title":"PDF: A Probability-Driven Framework for Open World 3D Point Cloud\n Semantic Segmentation","summary":" Existing point cloud semantic segmentation networks cannot identify unknown\nclasses and update their knowledge, due to a closed-set and static perspective\nof the real world, which would induce the intelligent agent to make bad\ndecisions. To address this problem, we propose a Probability-Driven Framework\n(PDF) for open world semantic segmentation that includes (i) a lightweight\nU-decoder branch to identify unknown classes by estimating the uncertainties,\n(ii) a flexible pseudo-labeling scheme to supply geometry features along with\nprobability distribution features of unknown classes by generating pseudo\nlabels, and (iii) an incremental knowledge distillation strategy to incorporate\nnovel classes into the existing knowledge base gradually. Our framework enables\nthe model to behave like human beings, which could recognize unknown objects\nand incrementally learn them with the corresponding knowledge. Experimental\nresults on the S3DIS and ScanNetv2 datasets demonstrate that the proposed PDF\noutperforms other methods by a large margin in both important tasks of open\nworld semantic segmentation.\n","authors":["Jinfeng Xu","Siyuan Yang","Xianzhi Li","Yuan Tang","Yixue Hao","Long Hu","Min Chen"],"pdf_url":"https://arxiv.org/pdf/2404.00979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00974v1","updated":"2024-04-01T07:45:42Z","published":"2024-04-01T07:45:42Z","title":"Improving Visual Recognition with Hyperbolical Visual Hierarchy Mapping","summary":" Visual scenes are naturally organized in a hierarchy, where a coarse semantic\nis recursively comprised of several fine details. Exploring such a visual\nhierarchy is crucial to recognize the complex relations of visual elements,\nleading to a comprehensive scene understanding. 
In this paper, we propose a\nVisual Hierarchy Mapper (Hi-Mapper), a novel approach for enhancing the\nstructured understanding of the pre-trained Deep Neural Networks (DNNs).\nHi-Mapper investigates the hierarchical organization of the visual scene by 1)\npre-defining a hierarchy tree through the encapsulation of probability\ndensities; and 2) learning the hierarchical relations in hyperbolic space with\na novel hierarchical contrastive loss. The pre-defined hierarchy tree\nrecursively interacts with the visual features of the pre-trained DNNs through\nhierarchy decomposition and encoding procedures, thereby effectively\nidentifying the visual hierarchy and enhancing the recognition of an entire\nscene. Extensive experiments demonstrate that Hi-Mapper significantly enhances\nthe representation capability of DNNs, leading to an improved performance on\nvarious tasks, including image classification and dense prediction tasks.\n","authors":["Hyeongjun Kwon","Jinhyun Jang","Jin Kim","Kwonyoung Kim","Kwanghoon Sohn"],"pdf_url":"https://arxiv.org/pdf/2404.00974v1.pdf","comment":"This paper is accepted to CVPR 2024. The supplementary material is\n included. The code is available at\n \\url{https://github.com/kwonjunn01/Hi-Mapper}"},{"id":"http://arxiv.org/abs/2404.00973v1","updated":"2024-04-01T07:44:24Z","published":"2024-04-01T07:44:24Z","title":"VideoDistill: Language-aware Vision Distillation for Video Question\n Answering","summary":" Significant advancements in video question answering (VideoQA) have been made\nthanks to thriving large image-language pretraining frameworks. Although these\nimage-language models can efficiently represent both video and language\nbranches, they typically employ a goal-free vision perception process and do\nnot interact vision with language well during the answer generation, thus\nomitting crucial visual cues. In this paper, we are inspired by the human\nrecognition and learning pattern and propose VideoDistill, a framework with\nlanguage-aware (i.e., goal-driven) behavior in both vision perception and\nanswer generation process. VideoDistill generates answers only from\nquestion-related visual embeddings and follows a thinking-observing-answering\napproach that closely resembles human behavior, distinguishing it from previous\nresearch. Specifically, we develop a language-aware gating mechanism to replace\nthe standard cross-attention, avoiding language's direct fusion into visual\nrepresentations. We incorporate this mechanism into two key components of the\nentire framework. The first component is a differentiable sparse sampling\nmodule, which selects frames containing the necessary dynamics and semantics\nrelevant to the questions. The second component is a vision refinement module\nthat merges existing spatial-temporal attention layers to ensure the extraction\nof multi-grained visual semantics associated with the questions. We conduct\nexperimental evaluations on various challenging video question-answering\nbenchmarks, and VideoDistill achieves state-of-the-art performance in both\ngeneral and long-form VideoQA datasets. 
In Addition, we verify that\nVideoDistill can effectively alleviate the utilization of language shortcut\nsolutions in the EgoTaskQA dataset.\n","authors":["Bo Zou","Chao Yang","Yu Qiao","Chengbin Quan","Youjian Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.00973v1.pdf","comment":"This paper is accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2312.02010v3","updated":"2024-04-01T07:21:52Z","published":"2023-12-04T16:32:51Z","title":"Towards Learning a Generalist Model for Embodied Navigation","summary":" Building a generalist agent that can interact with the world is the\nintriguing target of AI systems, thus spurring the research for embodied\nnavigation, where an agent is required to navigate according to instructions or\nrespond to queries. Despite the major progress attained, previous works\nprimarily focus on task-specific agents and lack generalizability to unseen\nscenarios. Recently, LLMs have presented remarkable capabilities across various\nfields, and provided a promising opportunity for embodied navigation. Drawing\non this, we propose the first generalist model for embodied navigation,\nNaviLLM. It adapts LLMs to embodied navigation by introducing schema-based\ninstruction. The schema-based instruction flexibly casts various tasks into\ngeneration problems, thereby unifying a wide range of tasks. This approach\nallows us to integrate diverse data sources from various datasets into the\ntraining, equipping NaviLLM with a wide range of capabilities required by\nembodied navigation. We conduct extensive experiments to evaluate the\nperformance and generalizability of our model. The experimental results\ndemonstrate that our unified model achieves state-of-the-art performance on\nCVDN, SOON, and ScanQA. Specifically, it surpasses the previous\nstats-of-the-art method by a significant margin of 29% in goal progress on\nCVDN. Moreover, our model also demonstrates strong generalizability and\npresents impressive results on unseen tasks, e.g., embodied question answering\nand 3D captioning.\n","authors":["Duo Zheng","Shijia Huang","Lin Zhao","Yiwu Zhong","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02010v3.pdf","comment":"Accepted by CVPR 2024 (14 pages, 3 figures)"},{"id":"http://arxiv.org/abs/2401.05698v2","updated":"2024-04-01T07:19:40Z","published":"2024-01-11T07:00:07Z","title":"HiCMAE: Hierarchical Contrastive Masked Autoencoder for Self-Supervised\n Audio-Visual Emotion Recognition","summary":" Audio-Visual Emotion Recognition (AVER) has garnered increasing attention in\nrecent years for its critical role in creating emotion-ware intelligent\nmachines. Previous efforts in this area are dominated by the supervised\nlearning paradigm. Despite significant progress, supervised learning is meeting\nits bottleneck due to the longstanding data scarcity issue in AVER. Motivated\nby recent advances in self-supervised learning, we propose Hierarchical\nContrastive Masked Autoencoder (HiCMAE), a novel self-supervised framework that\nleverages large-scale self-supervised pre-training on vast unlabeled\naudio-visual data to promote the advancement of AVER. Following prior arts in\nself-supervised audio-visual representation learning, HiCMAE adopts two primary\nforms of self-supervision for pre-training, namely masked data modeling and\ncontrastive learning. 
Unlike them which focus exclusively on top-layer\nrepresentations while neglecting explicit guidance of intermediate layers,\nHiCMAE develops a three-pronged strategy to foster hierarchical audio-visual\nfeature learning and improve the overall quality of learned representations. To\nverify the effectiveness of HiCMAE, we conduct extensive experiments on 9\ndatasets covering both categorical and dimensional AVER tasks. Experimental\nresults show that our method significantly outperforms state-of-the-art\nsupervised and self-supervised audio-visual methods, which indicates that\nHiCMAE is a powerful audio-visual emotion representation learner. Codes and\nmodels will be publicly available at https://github.com/sunlicai/HiCMAE.\n","authors":["Licai Sun","Zheng Lian","Bin Liu","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2401.05698v2.pdf","comment":"Accepted by Information Fusion. The code is available at\n https://github.com/sunlicai/HiCMAE"},{"id":"http://arxiv.org/abs/2404.00964v1","updated":"2024-04-01T07:17:02Z","published":"2024-04-01T07:17:02Z","title":"S2RC-GCN: A Spatial-Spectral Reliable Contrastive Graph Convolutional\n Network for Complex Land Cover Classification Using Hyperspectral Images","summary":" Spatial correlations between different ground objects are an important\nfeature of mining land cover research. Graph Convolutional Networks (GCNs) can\neffectively capture such spatial feature representations and have demonstrated\npromising results in performing hyperspectral imagery (HSI) classification\ntasks of complex land. However, the existing GCN-based HSI classification\nmethods are prone to interference from redundant information when extracting\ncomplex features. To classify complex scenes more effectively, this study\nproposes a novel spatial-spectral reliable contrastive graph convolutional\nclassification framework named S2RC-GCN. Specifically, we fused the spectral\nand spatial features extracted by the 1D- and 2D-encoder, and the 2D-encoder\nincludes an attention model to automatically extract important information. We\nthen leveraged the fused high-level features to construct graphs and fed the\nresulting graphs into the GCNs to determine more effective graph\nrepresentations. Furthermore, a novel reliable contrastive graph convolution\nwas proposed for reliable contrastive learning to learn and fuse robust\nfeatures. Finally, to test the performance of the model on complex object\nclassification, we used imagery taken by Gaofen-5 in the Jiang Xia area to\nconstruct complex land cover datasets. The test results show that compared with\nother models, our model achieved the best results and effectively improved the\nclassification performance of complex remote sensing imagery.\n","authors":["Renxiang Guan","Zihao Li","Chujia Song","Guo Yu","Xianju Li","Ruyi Feng"],"pdf_url":"https://arxiv.org/pdf/2404.00964v1.pdf","comment":"Accepted to IJCNN 2024 (International Joint Conference on Neural\n Networks)"},{"id":"http://arxiv.org/abs/2404.00959v1","updated":"2024-04-01T06:59:56Z","published":"2024-04-01T06:59:56Z","title":"Equivariant Local Reference Frames for Unsupervised Non-rigid Point\n Cloud Shape Correspondence","summary":" Unsupervised non-rigid point cloud shape correspondence underpins a multitude\nof 3D vision tasks, yet itself is non-trivial given the exponential complexity\nstemming from inter-point degree-of-freedom, i.e., pose transformations. 
Based\non the assumption of local rigidity, one solution for reducing complexity is to\ndecompose the overall shape into independent local regions using Local\nReference Frames (LRFs) that are invariant to SE(3) transformations. However,\nthe focus solely on local structure neglects global geometric contexts,\nresulting in less distinctive LRFs that lack crucial semantic information\nnecessary for effective matching. Furthermore, such complexity introduces\nout-of-distribution geometric contexts during inference, thus complicating\ngeneralization. To this end, we introduce 1) EquiShape, a novel structure\ntailored to learn pair-wise LRFs with global structural cues for both spatial\nand semantic consistency, and 2) LRF-Refine, an optimization strategy generally\napplicable to LRF-based methods, aimed at addressing the generalization\nchallenges. Specifically, for EquiShape, we employ cross-talk within separate\nequivariant graph neural networks (Cross-GVP) to build long-range dependencies\nto compensate for the lack of semantic information in local structure modeling,\ndeducing pair-wise independent SE(3)-equivariant LRF vectors for each point.\nFor LRF-Refine, the optimization adjusts LRFs within specific contexts and\nknowledge, enhancing the geometric and semantic generalizability of point\nfeatures. Our overall framework surpasses the state-of-the-art methods by a\nlarge margin on three benchmarks. Code and models will be publicly available.\n","authors":["Ling Wang","Runfa Chen","Yikai Wang","Fuchun Sun","Xinzhou Wang","Sun Kai","Guangyuan Fu","Jianwei Zhang","Wenbing Huang"],"pdf_url":"https://arxiv.org/pdf/2404.00959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06226v2","updated":"2024-04-01T06:57:31Z","published":"2023-12-11T09:14:42Z","title":"Invariant Representation via Decoupling Style and Spurious Features from\n Images","summary":" This paper considers the out-of-distribution (OOD) generalization problem\nunder the setting that both style distribution shift and spurious features\nexist and domain labels are missing. This setting frequently arises in\nreal-world applications and is underlooked because previous approaches mainly\nhandle either of these two factors. The critical challenge is decoupling style\nand spurious features in the absence of domain labels. To address this\nchallenge, we first propose a structural causal model (SCM) for the image\ngeneration process, which captures both style distribution shift and spurious\nfeatures. The proposed SCM enables us to design a new framework called IRSS,\nwhich can gradually separate style distribution and spurious features from\nimages by introducing adversarial neural networks and multi-environment\noptimization, thus achieving OOD generalization. Moreover, it does not require\nadditional supervision (e.g., domain labels) other than the images and their\ncorresponding labels. 
Experiments on benchmark datasets demonstrate that IRSS\noutperforms traditional OOD methods and solves the problem of Invariant risk\nminimization (IRM) degradation, enabling the extraction of invariant features\nunder distribution shift.\n","authors":["Ruimeng Li","Yuanhao Pu","Zhaoyi Li","Hong Xie","Defu Lian"],"pdf_url":"https://arxiv.org/pdf/2312.06226v2.pdf","comment":"10 pages, 12 figures"},{"id":"http://arxiv.org/abs/2403.07494v2","updated":"2024-04-01T06:52:46Z","published":"2024-03-12T10:33:26Z","title":"SemGauss-SLAM: Dense Semantic Gaussian Splatting SLAM","summary":" We propose SemGauss-SLAM, the first semantic SLAM system utilizing 3D\nGaussian representation, that enables accurate 3D semantic mapping, robust\ncamera tracking, and high-quality rendering in real-time. In this system, we\nincorporate semantic feature embedding into 3D Gaussian representation, which\neffectively encodes semantic information within the spatial layout of the\nenvironment for precise semantic scene representation. Furthermore, we propose\nfeature-level loss for updating 3D Gaussian representation, enabling\nhigher-level guidance for 3D Gaussian optimization. In addition, to reduce\ncumulative drift and improve reconstruction accuracy, we introduce\nsemantic-informed bundle adjustment leveraging semantic associations for joint\noptimization of 3D Gaussian representation and camera poses, leading to more\nrobust tracking and consistent mapping. Our SemGauss-SLAM method demonstrates\nsuperior performance over existing dense semantic SLAM methods in terms of\nmapping and tracking accuracy on Replica and ScanNet datasets, while also\nshowing excellent capabilities in novel-view semantic synthesis and 3D semantic\nmapping.\n","authors":["Siting Zhu","Renjie Qin","Guangming Wang","Jiuming Liu","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.07494v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.13061v2","updated":"2024-04-01T06:42:17Z","published":"2023-04-25T18:00:08Z","title":"iMixer: hierarchical Hopfield network implies an invertible, implicit\n and iterative MLP-Mixer","summary":" In the last few years, the success of Transformers in computer vision has\nstimulated the discovery of many alternative models that compete with\nTransformers, such as the MLP-Mixer. Despite their weak inductive bias, these\nmodels have achieved performance comparable to well-studied convolutional\nneural networks. Recent studies on modern Hopfield networks suggest the\ncorrespondence between certain energy-based associative memory models and\nTransformers or MLP-Mixer, and shed some light on the theoretical background of\nthe Transformer-type architectures design. In this paper, we generalize the\ncorrespondence to the recently introduced hierarchical Hopfield network, and\nfind iMixer, a novel generalization of MLP-Mixer model. Unlike ordinary\nfeedforward neural networks, iMixer involves MLP layers that propagate forward\nfrom the output side to the input side. We characterize the module as an\nexample of invertible, implicit, and iterative mixing module. We evaluate the\nmodel performance with various datasets on image classification tasks, and find\nthat iMixer, despite its unique architecture, exhibits stable learning\ncapabilities and achieves performance comparable to or better than the baseline\nvanilla MLP-Mixer. 
The results imply that the correspondence between the\nHopfield networks and the Mixer models serves as a principle for understanding\na broader class of Transformer-like architecture designs.\n","authors":["Toshihiro Ota","Masato Taki"],"pdf_url":"https://arxiv.org/pdf/2304.13061v2.pdf","comment":"19 pages. v2: minor improvements"},{"id":"http://arxiv.org/abs/2403.09055v2","updated":"2024-04-01T06:26:23Z","published":"2024-03-14T02:51:01Z","title":"StreamMultiDiffusion: Real-Time Interactive Generation with Region-Based\n Semantic Control","summary":" The enormous success of diffusion models in text-to-image synthesis has made\nthem promising candidates for the next generation of end-user applications for\nimage generation and editing. Previous works have focused on improving the\nusability of diffusion models by reducing the inference time or increasing user\ninteractivity by allowing new, fine-grained controls such as region-based text\nprompts. However, we empirically find that integrating both branches of works\nis nontrivial, limiting the potential of diffusion models. To solve this\nincompatibility, we present StreamMultiDiffusion, the first real-time\nregion-based text-to-image generation framework. By stabilizing fast inference\ntechniques and restructuring the model into a newly proposed multi-prompt\nstream batch architecture, we achieve $\\times 10$ faster panorama generation\nthan existing solutions, and the generation speed of 1.57 FPS in region-based\ntext-to-image synthesis on a single RTX 2080 Ti GPU. Our solution opens up a\nnew paradigm for interactive image generation named semantic palette, where\nhigh-quality images are generated in real-time from given multiple hand-drawn\nregions, encoding prescribed semantic meanings (e.g., eagle, girl). Our code\nand demo application are available at\nhttps://github.com/ironjr/StreamMultiDiffusion.\n","authors":["Jaerin Lee","Daniel Sungho Jung","Kanggeon Lee","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2403.09055v2.pdf","comment":"29 pages, 16 figures. v2: typos corrected, references added. Project\n page: https://jaerinlee.com/research/StreamMultiDiffusion"},{"id":"http://arxiv.org/abs/2404.00949v1","updated":"2024-04-01T06:22:28Z","published":"2024-04-01T06:22:28Z","title":"Harnessing The Power of Attention For Patch-Based Biomedical Image\n Classification","summary":" Biomedical image analysis can be facilitated by an innovative architecture\nrooted in self-attention mechanisms. The traditional convolutional neural\nnetwork (CNN), characterized by fixed-sized windows, needs help capturing\nintricate spatial and temporal relations at the pixel level. The immutability\nof CNN filter weights post-training further restricts input fluctuations.\nRecognizing these limitations, we propose a new paradigm of attention-based\nmodels instead of convolutions. As an alternative to traditional CNNs, these\nmodels demonstrate robust modelling capabilities and the ability to grasp\ncomprehensive long-range contextual information efficiently. Providing a\nsolution to critical challenges faced by attention-based vision models such as\ninductive bias, weight sharing, receptive field limitations, and data handling\nin high resolution, our work combines non-overlapping (vanilla patching) with\nnovel overlapped Shifted Patching Techniques (S.P.T.s) to induce local context\nthat enhances model generalization. Moreover, we examine the novel Lancoz5\ninterpolation technique, which adapts variable image sizes to higher\nresolutions. 
Experimental evidence validates our model's generalization\neffectiveness, comparing favourably with existing approaches. Attention-based\nmethods are particularly effective with ample data, especially when advanced\ndata augmentation methodologies are integrated to strengthen their robustness.\n","authors":["Gousia Habib","Shaima Qureshi","Malik ishfaq"],"pdf_url":"https://arxiv.org/pdf/2404.00949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00946v1","updated":"2024-04-01T06:10:11Z","published":"2024-04-01T06:10:11Z","title":"Exploring the Efficacy of Group-Normalization in Deep Learning Models\n for Alzheimer's Disease Classification","summary":" Batch Normalization is an important approach to advancing deep learning since\nit allows multiple networks to train simultaneously. A problem arises when\nnormalizing along the batch dimension because B.N.'s error increases\nsignificantly as batch size shrinks because batch statistics estimates are\ninaccurate. As a result, computer vision tasks like detection, segmentation,\nand video, which require tiny batches based on memory consumption, aren't\nsuitable for using Batch Normalization for larger model training and feature\ntransfer. Here, we explore Group Normalization as an easy alternative to using\nBatch Normalization A Group Normalization is a channel normalization method in\nwhich each group is divided into different channels, and the corresponding mean\nand variance are calculated for each group. Group Normalization computations\nare accurate across a wide range of batch sizes and are independent of batch\nsize. When trained using a large ImageNet database on ResNet-50, GN achieves a\nvery low error rate of 10.6% compared to Batch Normalization. when a smaller\nbatch size of only 2 is used. For usual batch sizes, the performance of G.N. is\ncomparable to that of Batch Normalization, but at the same time, it outperforms\nother normalization techniques. Implementing Group Normalization as a direct\nalternative to B.N to combat the serious challenges faced by the Batch\nNormalization in deep learning models with comparable or improved\nclassification accuracy. Additionally, Group Normalization can be naturally\ntransferred from the pre-training to the fine-tuning phase. .\n","authors":["Gousia Habib","Ishfaq Ahmed Malik","Jameel Ahmad","Imtiaz Ahmed","Shaima Qureshi"],"pdf_url":"https://arxiv.org/pdf/2404.00946v1.pdf","comment":"19 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.09935v2","updated":"2024-04-01T05:57:55Z","published":"2023-12-15T16:44:38Z","title":"LogoStyleFool: Vitiating Video Recognition Systems via Logo Style\n Transfer","summary":" Video recognition systems are vulnerable to adversarial examples. Recent\nstudies show that style transfer-based and patch-based unrestricted\nperturbations can effectively improve attack efficiency. These attacks,\nhowever, face two main challenges: 1) Adding large stylized perturbations to\nall pixels reduces the naturalness of the video and such perturbations can be\neasily detected. 2) Patch-based video attacks are not extensible to targeted\nattacks due to the limited search space of reinforcement learning that has been\nwidely used in video attacks recently. In this paper, we focus on the video\nblack-box setting and propose a novel attack framework named LogoStyleFool by\nadding a stylized logo to the clean video. We separate the attack into three\nstages: style reference selection, reinforcement-learning-based logo style\ntransfer, and perturbation optimization. 
We solve the first challenge by\nscaling down the perturbation range to a regional logo, while the second\nchallenge is addressed by complementing an optimization stage after\nreinforcement learning. Experimental results substantiate the overall\nsuperiority of LogoStyleFool over three state-of-the-art patch-based attacks in\nterms of attack performance and semantic preservation. Meanwhile, LogoStyleFool\nstill maintains its performance against two existing patch-based defense\nmethods. We believe that our research is beneficial in increasing the attention\nof the security community to such subregional style transfer attacks.\n","authors":["Yuxin Cao","Ziyu Zhao","Xi Xiao","Derui Wang","Minhui Xue","Jin Lu"],"pdf_url":"https://arxiv.org/pdf/2312.09935v2.pdf","comment":"14 pages, 3 figures. Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2203.16000v4","updated":"2024-04-01T05:51:31Z","published":"2022-03-30T02:18:16Z","title":"StyleFool: Fooling Video Classification Systems via Style Transfer","summary":" Video classification systems are vulnerable to adversarial attacks, which can\ncreate severe security problems in video verification. Current black-box\nattacks need a large number of queries to succeed, resulting in high\ncomputational overhead in the process of attack. On the other hand, attacks\nwith restricted perturbations are ineffective against defenses such as\ndenoising or adversarial training. In this paper, we focus on unrestricted\nperturbations and propose StyleFool, a black-box video adversarial attack via\nstyle transfer to fool the video classification system. StyleFool first\nutilizes color theme proximity to select the best style image, which helps\navoid unnatural details in the stylized videos. Meanwhile, the target class\nconfidence is additionally considered in targeted attacks to influence the\noutput distribution of the classifier by moving the stylized video closer to or\neven across the decision boundary. A gradient-free method is then employed to\nfurther optimize the adversarial perturbations. We carry out extensive\nexperiments to evaluate StyleFool on two standard datasets, UCF-101 and\nHMDB-51. The experimental results demonstrate that StyleFool outperforms the\nstate-of-the-art adversarial attacks in terms of both the number of queries and\nthe robustness against existing defenses. Moreover, 50% of the stylized videos\nin untargeted attacks do not need any query since they can already fool the\nvideo classification model. Furthermore, we evaluate the indistinguishability\nthrough a user study to show that the adversarial samples of StyleFool look\nimperceptible to human eyes, despite unrestricted perturbations.\n","authors":["Yuxin Cao","Xi Xiao","Ruoxi Sun","Derui Wang","Minhui Xue","Sheng Wen"],"pdf_url":"https://arxiv.org/pdf/2203.16000v4.pdf","comment":"18 pages, 9 figures. Accepted to S&P 2023"},{"id":"http://arxiv.org/abs/2404.00938v1","updated":"2024-04-01T05:50:56Z","published":"2024-04-01T05:50:56Z","title":"How Can Large Language Models Enable Better Socially Assistive\n Human-Robot Interaction: A Brief Survey","summary":" Socially assistive robots (SARs) have shown great success in providing\npersonalized cognitive-affective support for user populations with special\nneeds such as older adults, children with autism spectrum disorder (ASD), and\nindividuals with mental health challenges. 
The large body of work on SAR\ndemonstrates its potential to provide at-home support that complements\nclinic-based interventions delivered by mental health professionals, making\nthese interventions more effective and accessible. However, there are still\nseveral major technical challenges that hinder SAR-mediated interactions and\ninterventions from reaching human-level social intelligence and efficacy. With\nthe recent advances in large language models (LLMs), there is an increased\npotential for novel applications within the field of SAR that can significantly\nexpand the current capabilities of SARs. However, incorporating LLMs introduces\nnew risks and ethical concerns that have not yet been encountered, and must be\ncarefully be addressed to safely deploy these more advanced systems. In this\nwork, we aim to conduct a brief survey on the use of LLMs in SAR technologies,\nand discuss the potentials and risks of applying LLMs to the following three\nmajor technical challenges of SAR: 1) natural language dialog; 2) multimodal\nunderstanding; 3) LLMs as robot policies.\n","authors":["Zhonghao Shi","Ellen Landrum","Amy O' Connell","Mina Kian","Leticia Pinto-Alva","Kaleen Shrestha","Xiaoyuan Zhu","Maja J Matarić"],"pdf_url":"https://arxiv.org/pdf/2404.00938v1.pdf","comment":"2 pages, to be submitted to 2024 AAAI Spring Symposium"},{"id":"http://arxiv.org/abs/2403.01414v2","updated":"2024-04-01T05:44:41Z","published":"2024-03-03T06:58:35Z","title":"Unsigned Orthogonal Distance Fields: An Accurate Neural Implicit\n Representation for Diverse 3D Shapes","summary":" Neural implicit representation of geometric shapes has witnessed considerable\nadvancements in recent years. However, common distance field based implicit\nrepresentations, specifically signed distance field (SDF) for watertight shapes\nor unsigned distance field (UDF) for arbitrary shapes, routinely suffer from\ndegradation of reconstruction accuracy when converting to explicit surface\npoints and meshes. In this paper, we introduce a novel neural implicit\nrepresentation based on unsigned orthogonal distance fields (UODFs). In UODFs,\nthe minimal unsigned distance from any spatial point to the shape surface is\ndefined solely in one orthogonal direction, contrasting with the\nmulti-directional determination made by SDF and UDF. Consequently, every point\nin the 3D UODFs can directly access its closest surface points along three\northogonal directions. This distinctive feature leverages the accurate\nreconstruction of surface points without interpolation errors. We verify the\neffectiveness of UODFs through a range of reconstruction examples, extending\nfrom simple watertight or non-watertight shapes to complex shapes that include\nhollows, internal or assembling structures.\n","authors":["Yujie Lu","Long Wan","Nayu Ding","Yulong Wang","Shuhan Shen","Shen Cai","Lin Gao"],"pdf_url":"https://arxiv.org/pdf/2403.01414v2.pdf","comment":"accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2311.03149v2","updated":"2024-04-01T05:37:19Z","published":"2023-11-06T14:44:34Z","title":"Asymmetric Masked Distillation for Pre-Training Small Foundation Models","summary":" Self-supervised foundation models have shown great potential in computer\nvision thanks to the pre-training paradigm of masked autoencoding. Scale is a\nprimary factor influencing the performance of these foundation models. However,\nthese large foundation models often result in high computational cost. 
This\npaper focuses on pre-training relatively small vision transformer models that\ncould be efficiently adapted to downstream tasks. Specifically, taking\ninspiration from knowledge distillation in model compression, we propose a new\nasymmetric masked distillation (AMD) framework for pre-training relatively\nsmall models with autoencoding. The core of AMD is to devise an asymmetric\nmasking strategy, where the teacher model is enabled to see more context\ninformation with a lower masking ratio, while the student model is still\nequipped with a high masking ratio. We design customized multi-layer feature\nalignment between the teacher encoder and student encoder to regularize the\npre-training of student MAE. To demonstrate the effectiveness and versatility\nof AMD, we apply it to both ImageMAE and VideoMAE for pre-training relatively\nsmall ViT models. AMD achieved 84.6% classification accuracy on IN1K using the\nViT-B model. And AMD achieves 73.3% classification accuracy using the ViT-B\nmodel on the Something-in-Something V2 dataset, a 3.7% improvement over the\noriginal ViT-B model from VideoMAE. We also transfer AMD pre-trained models to\ndownstream tasks and obtain consistent performance improvement over the\noriginal masked autoencoding. The code and models are available at\nhttps://github.com/MCG-NJU/AMD.\n","authors":["Zhiyu Zhao","Bingkun Huang","Sen Xing","Gangshan Wu","Yu Qiao","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.03149v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00931v1","updated":"2024-04-01T05:19:50Z","published":"2024-04-01T05:19:50Z","title":"GOV-NeSF: Generalizable Open-Vocabulary Neural Semantic Fields","summary":" Recent advancements in vision-language foundation models have significantly\nenhanced open-vocabulary 3D scene understanding. However, the generalizability\nof existing methods is constrained due to their framework designs and their\nreliance on 3D data. We address this limitation by introducing Generalizable\nOpen-Vocabulary Neural Semantic Fields (GOV-NeSF), a novel approach offering a\ngeneralizable implicit representation of 3D scenes with open-vocabulary\nsemantics. We aggregate the geometry-aware features using a cost volume, and\npropose a Multi-view Joint Fusion module to aggregate multi-view features\nthrough a cross-view attention mechanism, which effectively predicts\nview-specific blending weights for both colors and open-vocabulary features.\nRemarkably, our GOV-NeSF exhibits state-of-the-art performance in both 2D and\n3D open-vocabulary semantic segmentation, eliminating the need for ground truth\nsemantic labels or depth priors, and effectively generalize across scenes and\ndatasets without fine-tuning.\n","authors":["Yunsong Wang","Hanlin Chen","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2404.00931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01352v1","updated":"2024-04-01T05:12:55Z","published":"2024-04-01T05:12:55Z","title":"VortexViz: Finding Vortex Boundaries by Learning from Particle\n Trajectories","summary":" Vortices are studied in various scientific disciplines, offering insights\ninto fluid flow behavior. Visualizing the boundary of vortices is crucial for\nunderstanding flow phenomena and detecting flow irregularities. This paper\naddresses the challenge of accurately extracting vortex boundaries using deep\nlearning techniques. 
While existing methods primarily train on velocity\ncomponents, we propose a novel approach incorporating particle trajectories\n(streamlines or pathlines) into the learning process. By leveraging the\nregional/local characteristics of the flow field captured by streamlines or\npathlines, our methodology aims to enhance the accuracy of vortex boundary\nextraction.\n","authors":["Akila de Silva","Nicholas Tee","Omkar Ghanekar","Fahim Hasan Khan","Gregory Dusek","James Davis","Alex Pang"],"pdf_url":"https://arxiv.org/pdf/2404.01352v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.00928v1","updated":"2024-04-01T05:12:30Z","published":"2024-04-01T05:12:30Z","title":"Instance-Aware Group Quantization for Vision Transformers","summary":" Post-training quantization (PTQ) is an efficient model compression technique\nthat quantizes a pretrained full-precision model using only a small calibration\nset of unlabeled samples without retraining. PTQ methods for convolutional\nneural networks (CNNs) provide quantization results comparable to\nfull-precision counterparts. Directly applying them to vision transformers\n(ViTs), however, incurs severe performance degradation, mainly due to the\ndifferences in architectures between CNNs and ViTs. In particular, the\ndistribution of activations for each channel vary drastically according to\ninput instances, making PTQ methods for CNNs inappropriate for ViTs. To address\nthis, we introduce instance-aware group quantization for ViTs (IGQ-ViT). To\nthis end, we propose to split the channels of activation maps into multiple\ngroups dynamically for each input instance, such that activations within each\ngroup share similar statistical properties. We also extend our scheme to\nquantize softmax attentions across tokens. In addition, the number of groups\nfor each layer is adjusted to minimize the discrepancies between predictions\nfrom quantized and full-precision models, under a bit-operation (BOP)\nconstraint. We show extensive experimental results on image classification,\nobject detection, and instance segmentation, with various transformer\narchitectures, demonstrating the effectiveness of our approach.\n","authors":["Jaehyeon Moon","Dohyung Kim","Junyong Cheon","Bumsub Ham"],"pdf_url":"https://arxiv.org/pdf/2404.00928v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00925v1","updated":"2024-04-01T05:07:13Z","published":"2024-04-01T05:07:13Z","title":"LLMs are Good Sign Language Translators","summary":" Sign Language Translation (SLT) is a challenging task that aims to translate\nsign videos into spoken language. Inspired by the strong translation\ncapabilities of large language models (LLMs) that are trained on extensive\nmultilingual text corpora, we aim to harness off-the-shelf LLMs to handle SLT.\nIn this paper, we regularize the sign videos to embody linguistic\ncharacteristics of spoken language, and propose a novel SignLLM framework to\ntransform sign videos into a language-like representation for improved\nreadability by off-the-shelf LLMs. SignLLM comprises two key modules: (1) The\nVector-Quantized Visual Sign module converts sign videos into a sequence of\ndiscrete character-level sign tokens, and (2) the Codebook Reconstruction and\nAlignment module converts these character-level tokens into word-level sign\nrepresentations using an optimal transport formulation. A sign-text alignment\nloss further bridges the gap between sign and text tokens, enhancing semantic\ncompatibility. 
We achieve state-of-the-art gloss-free results on two\nwidely-used SLT benchmarks.\n","authors":["Jia Gong","Lin Geng Foo","Yixuan He","Hossein Rahmani","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2404.00925v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00924v1","updated":"2024-04-01T05:01:52Z","published":"2024-04-01T05:01:52Z","title":"BadPart: Unified Black-box Adversarial Patch Attacks against Pixel-wise\n Regression Tasks","summary":" Pixel-wise regression tasks (e.g., monocular depth estimation (MDE) and\noptical flow estimation (OFE)) have been widely involved in our daily life in\napplications like autonomous driving, augmented reality and video composition.\nAlthough certain applications are security-critical or bear societal\nsignificance, the adversarial robustness of such models are not sufficiently\nstudied, especially in the black-box scenario. In this work, we introduce the\nfirst unified black-box adversarial patch attack framework against pixel-wise\nregression tasks, aiming to identify the vulnerabilities of these models under\nquery-based black-box attacks. We propose a novel square-based adversarial\npatch optimization framework and employ probabilistic square sampling and\nscore-based gradient estimation techniques to generate the patch effectively\nand efficiently, overcoming the scalability problem of previous black-box patch\nattacks. Our attack prototype, named BadPart, is evaluated on both MDE and OFE\ntasks, utilizing a total of 7 models. BadPart surpasses 3 baseline methods in\nterms of both attack performance and efficiency. We also apply BadPart on the\nGoogle online service for portrait depth estimation, causing 43.5% relative\ndistance error with 50K queries. State-of-the-art (SOTA) countermeasures cannot\ndefend our attack effectively.\n","authors":["Zhiyuan Cheng","Zhaoyi Liu","Tengda Guo","Shiwei Feng","Dongfang Liu","Mingjie Tang","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.00924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00923v1","updated":"2024-04-01T04:57:41Z","published":"2024-04-01T04:57:41Z","title":"MM3DGS SLAM: Multi-modal 3D Gaussian Splatting for SLAM Using Vision,\n Depth, and Inertial Measurements","summary":" Simultaneous localization and mapping is essential for position tracking and\nscene understanding. 3D Gaussian-based map representations enable\nphotorealistic reconstruction and real-time rendering of scenes using multiple\nposed cameras. We show for the first time that using 3D Gaussians for map\nrepresentation with unposed camera images and inertial measurements can enable\naccurate SLAM. Our method, MM3DGS, addresses the limitations of prior neural\nradiance field-based representations by enabling faster rendering, scale\nawareness, and improved trajectory tracking. Our framework enables\nkeyframe-based mapping and tracking utilizing loss functions that incorporate\nrelative pose transformations from pre-integrated inertial measurements, depth\nestimates, and measures of photometric rendering quality. We also release a\nmulti-modal dataset, UT-MM, collected from a mobile robot equipped with a\ncamera and an inertial measurement unit. Experimental evaluation on several\nscenes from the dataset shows that MM3DGS achieves 3x improvement in tracking\nand 5% improvement in photometric rendering quality compared to the current\n3DGS SLAM state-of-the-art, while allowing real-time rendering of a\nhigh-resolution dense 3D map. 
Project Webpage:\nhttps://vita-group.github.io/MM3DGS-SLAM\n","authors":["Lisong C. Sun","Neel P. Bhatt","Jonathan C. Liu","Zhiwen Fan","Zhangyang Wang","Todd E. Humphreys","Ufuk Topcu"],"pdf_url":"https://arxiv.org/pdf/2404.00923v1.pdf","comment":"Project Webpage: https://vita-group.github.io/MM3DGS-SLAM"},{"id":"http://arxiv.org/abs/2401.02400v2","updated":"2024-04-01T04:56:37Z","published":"2024-01-04T18:32:48Z","title":"Learning the 3D Fauna of the Web","summary":" Learning 3D models of all animals on the Earth requires massively scaling up\nexisting solutions. With this ultimate goal in mind, we develop 3D-Fauna, an\napproach that learns a pan-category deformable 3D animal model for more than\n100 animal species jointly. One crucial bottleneck of modeling animals is the\nlimited availability of training data, which we overcome by simply learning\nfrom 2D Internet images. We show that prior category-specific attempts fail to\ngeneralize to rare species with limited training images. We address this\nchallenge by introducing the Semantic Bank of Skinned Models (SBSM), which\nautomatically discovers a small set of base animal shapes by combining\ngeometric inductive priors with semantic knowledge implicitly captured by an\noff-the-shelf self-supervised feature extractor. To train such a model, we also\ncontribute a new large-scale dataset of diverse animal species. At inference\ntime, given a single image of any quadruped animal, our model reconstructs an\narticulated 3D mesh in a feed-forward fashion within seconds.\n","authors":["Zizhang Li","Dor Litvak","Ruining Li","Yunzhi Zhang","Tomas Jakab","Christian Rupprecht","Shangzhe Wu","Andrea Vedaldi","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2401.02400v2.pdf","comment":"The first two authors contributed equally to this work. The last\n three authors contributed equally. Project page:\n https://kyleleey.github.io/3DFauna/"},{"id":"http://arxiv.org/abs/2404.00922v1","updated":"2024-04-01T04:55:02Z","published":"2024-04-01T04:55:02Z","title":"Towards Memorization-Free Diffusion Models","summary":" Pretrained diffusion models and their outputs are widely accessible due to\ntheir exceptional capacity for synthesizing high-quality images and their\nopen-source nature. The users, however, may face litigation risks owing to the\nmodels' tendency to memorize and regurgitate training data during inference. To\naddress this, we introduce Anti-Memorization Guidance (AMG), a novel framework\nemploying three targeted guidance strategies for the main causes of\nmemorization: image and caption duplication, and highly specific user prompts.\nConsequently, AMG ensures memorization-free outputs while maintaining high\nimage quality and text alignment, leveraging the synergy of its guidance\nmethods, each indispensable in its own right. AMG also features an innovative\nautomatic detection system for potential memorization during each step of\ninference process, allows selective application of guidance strategies,\nminimally interfering with the original sampling process to preserve output\nutility. We applied AMG to pretrained Denoising Diffusion Probabilistic Models\n(DDPM) and Stable Diffusion across various generation tasks. 
The results\ndemonstrate that AMG is the first approach to successfully eradicates all\ninstances of memorization with no or marginal impacts on image quality and\ntext-alignment, as evidenced by FID and CLIP scores.\n","authors":["Chen Chen","Daochang Liu","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2404.00922v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.00921v1","updated":"2024-04-01T04:53:06Z","published":"2024-04-01T04:53:06Z","title":"Towards Label-Efficient Human Matting: A Simple Baseline for Weakly\n Semi-Supervised Trimap-Free Human Matting","summary":" This paper presents a new practical training method for human matting, which\ndemands delicate pixel-level human region identification and significantly\nlaborious annotations. To reduce the annotation cost, most existing matting\napproaches often rely on image synthesis to augment the dataset. However, the\nunnaturalness of synthesized training images brings in a new domain\ngeneralization challenge for natural images. To address this challenge, we\nintroduce a new learning paradigm, weakly semi-supervised human matting\n(WSSHM), which leverages a small amount of expensive matte labels and a large\namount of budget-friendly segmentation labels, to save the annotation cost and\nresolve the domain generalization problem. To achieve the goal of WSSHM, we\npropose a simple and effective training method, named Matte Label Blending\n(MLB), that selectively guides only the beneficial knowledge of the\nsegmentation and matte data to the matting model. Extensive experiments with\nour detailed analysis demonstrate our method can substantially improve the\nrobustness of the matting model using a few matte data and numerous\nsegmentation data. Our training method is also easily applicable to real-time\nmodels, achieving competitive accuracy with breakneck inference speed (328 FPS\non NVIDIA V100 GPU). The implementation code is available at\n\\url{https://github.com/clovaai/WSSHM}.\n","authors":["Beomyoung Kim","Myeong Yeon Yi","Joonsang Yu","Young Joon Yoo","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2404.00921v1.pdf","comment":"Preprint, 15 pages, 13 figures"},{"id":"http://arxiv.org/abs/2305.19556v3","updated":"2024-04-01T04:45:30Z","published":"2023-05-31T04:50:32Z","title":"Exploring Phonetic Context-Aware Lip-Sync For Talking Face Generation","summary":" Talking face generation is the challenging task of synthesizing a natural and\nrealistic face that requires accurate synchronization with a given audio. Due\nto co-articulation, where an isolated phone is influenced by the preceding or\nfollowing phones, the articulation of a phone varies upon the phonetic context.\nTherefore, modeling lip motion with the phonetic context can generate more\nspatio-temporally aligned lip movement. In this respect, we investigate the\nphonetic context in generating lip motion for talking face generation. We\npropose Context-Aware Lip-Sync framework (CALS), which explicitly leverages\nphonetic context to generate lip movement of the target face. CALS is comprised\nof an Audio-to-Lip module and a Lip-to-Face module. The former is pretrained\nbased on masked learning to map each phone to a contextualized lip motion unit.\nThe contextualized lip motion unit then guides the latter in synthesizing a\ntarget identity with context-aware lip motion. From extensive experiments, we\nverify that simply exploiting the phonetic context in the proposed CALS\nframework effectively enhances spatio-temporal alignment. 
We also demonstrate\nthe extent to which the phonetic context assists in lip synchronization and\nfind the effective window size for lip generation to be approximately 1.2\nseconds.\n","authors":["Se Jin Park","Minsu Kim","Jeongsoo Choi","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2305.19556v3.pdf","comment":"Accepted at ICASSP 2024"},{"id":"http://arxiv.org/abs/2404.00916v1","updated":"2024-04-01T04:43:45Z","published":"2024-04-01T04:43:45Z","title":"Gyro-based Neural Single Image Deblurring","summary":" In this paper, we present GyroDeblurNet, a novel single image deblurring\nmethod that utilizes a gyro sensor to effectively resolve the ill-posedness of\nimage deblurring. The gyro sensor provides valuable information about camera\nmotion during exposure time that can significantly improve deblurring quality.\nHowever, effectively exploiting real-world gyro data is challenging due to\nsignificant errors from various sources including sensor noise, the disparity\nbetween the positions of a camera module and a gyro sensor, the absence of\ntranslational motion information, and moving objects whose motions cannot be\ncaptured by a gyro sensor. To handle gyro error, GyroDeblurNet is equipped with\ntwo novel neural network blocks: a gyro refinement block and a gyro deblurring\nblock. The gyro refinement block refines the error-ridden gyro data using the\nblur information from the input image. On the other hand, the gyro deblurring\nblock removes blur from the input image using the refined gyro data and further\ncompensates for gyro error by leveraging the blur information from the input\nimage. For training a neural network with erroneous gyro data, we propose a\ntraining strategy based on the curriculum learning. We also introduce a novel\ngyro data embedding scheme to represent real-world intricate camera shakes.\nFinally, we present a synthetic dataset and a real dataset for the training and\nevaluation of gyro-based single image deblurring. Our experiments demonstrate\nthat our approach achieves state-of-the-art deblurring quality by effectively\nutilizing erroneous gyro data.\n","authors":["Heemin Yang","Jaesung Rim","Seung-Hwan Baek","Sunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2404.00916v1.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.00915v1","updated":"2024-04-01T04:43:39Z","published":"2024-04-01T04:43:39Z","title":"Scalable 3D Registration via Truncated Entry-wise Absolute Residuals","summary":" Given an input set of $3$D point pairs, the goal of outlier-robust $3$D\nregistration is to compute some rotation and translation that align as many\npoint pairs as possible. This is an important problem in computer vision, for\nwhich many highly accurate approaches have been recently proposed. Despite\ntheir impressive performance, these approaches lack scalability, often\noverflowing the $16$GB of memory of a standard laptop to handle roughly\n$30,000$ point pairs. In this paper, we propose a $3$D registration approach\nthat can process more than ten million ($10^7$) point pairs with over $99\\%$\nrandom outliers. Moreover, our method is efficient, entails low memory costs,\nand maintains high accuracy at the same time. We call our method TEAR, as it\ninvolves minimizing an outlier-robust loss that computes Truncated Entry-wise\nAbsolute Residuals. 
To minimize this loss, we decompose the original\n$6$-dimensional problem into two subproblems of dimensions $3$ and $2$,\nrespectively, solved in succession to global optimality via a customized\nbranch-and-bound method. While branch-and-bound is often slow and unscalable,\nthis does not apply to TEAR as we propose novel bounding functions that are\ntight and computationally efficient. Experiments on various datasets are\nconducted to validate the scalability and efficiency of our method.\n","authors":["Tianyu Huang","Liangzu Peng","René Vidal","Yun-Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2404.00915v1.pdf","comment":"24 pages, 12 figures. Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00913v1","updated":"2024-04-01T04:39:21Z","published":"2024-04-01T04:39:21Z","title":"LLaMA-Excitor: General Instruction Tuning via Indirect Feature\n Interaction","summary":" Existing methods to fine-tune LLMs, like Adapter, Prefix-tuning, and LoRA,\nwhich introduce extra modules or additional input sequences to inject new\nskills or knowledge, may compromise the innate abilities of LLMs. In this\npaper, we propose LLaMA-Excitor, a lightweight method that stimulates the LLMs'\npotential to better follow instructions by gradually paying more attention to\nworthwhile information. Specifically, the LLaMA-Excitor does not directly\nchange the intermediate hidden state during the self-attention calculation of\nthe transformer structure. We designed the Excitor block as a bypass module for\nthe similarity score computation in LLMs' self-attention to reconstruct keys\nand change the importance of values by learnable prompts. LLaMA-Excitor ensures\na self-adaptive allocation of additional attention to input instructions, thus\neffectively preserving LLMs' pre-trained knowledge when fine-tuning LLMs on\nlow-quality instruction-following datasets. Furthermore, we unify the modeling\nof multi-modal tuning and language-only tuning, extending LLaMA-Excitor to a\npowerful visual instruction follower without the need for complex multi-modal\nalignment. Our proposed approach is evaluated in language-only and multi-modal\ntuning experimental scenarios. Notably, LLaMA-Excitor is the only method that\nmaintains basic capabilities while achieving a significant improvement (+6%) on\nthe MMLU benchmark. In the visual instruction tuning, we achieve a new\nstate-of-the-art image captioning performance of 157.5 CIDEr on MSCOCO, and a\ncomparable performance (88.39%) on ScienceQA to cutting-edge models with more\nparameters and extensive vision-language pertaining.\n","authors":["Bo Zou","Chao Yang","Yu Qiao","Chengbin Quan","Youjian Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.00913v1.pdf","comment":"This paper is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00909v1","updated":"2024-04-01T04:28:01Z","published":"2024-04-01T04:28:01Z","title":"Learning by Correction: Efficient Tuning Task for Zero-Shot Generative\n Vision-Language Reasoning","summary":" Generative vision-language models (VLMs) have shown impressive performance in\nzero-shot vision-language tasks like image captioning and visual question\nanswering. However, improving their zero-shot reasoning typically requires\nsecond-stage instruction tuning, which relies heavily on human-labeled or large\nlanguage model-generated annotation, incurring high labeling costs. 
To tackle\nthis challenge, we introduce Image-Conditioned Caption Correction (ICCC), a\nnovel pre-training task designed to enhance VLMs' zero-shot performance without\nthe need for labeled task-aware data. The ICCC task compels VLMs to rectify\nmismatches between visual and language concepts, thereby enhancing instruction\nfollowing and text generation conditioned on visual inputs. Leveraging language\nstructure and a lightweight dependency parser, we construct data samples of\nICCC task from image-text datasets with low labeling and computation costs.\nExperimental results on BLIP-2 and InstructBLIP demonstrate significant\nimprovements in zero-shot image-text generation-based VL tasks through ICCC\ninstruction tuning.\n","authors":["Rongjie Li","Yu Wu","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2404.00909v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.01351v1","updated":"2024-04-01T04:21:49Z","published":"2024-04-01T04:21:49Z","title":"AETTA: Label-Free Accuracy Estimation for Test-Time Adaptation","summary":" Test-time adaptation (TTA) has emerged as a viable solution to adapt\npre-trained models to domain shifts using unlabeled test data. However, TTA\nfaces challenges of adaptation failures due to its reliance on blind adaptation\nto unknown test samples in dynamic scenarios. Traditional methods for\nout-of-distribution performance estimation are limited by unrealistic\nassumptions in the TTA context, such as requiring labeled data or re-training\nmodels. To address this issue, we propose AETTA, a label-free accuracy\nestimation algorithm for TTA. We propose the prediction disagreement as the\naccuracy estimate, calculated by comparing the target model prediction with\ndropout inferences. We then improve the prediction disagreement to extend the\napplicability of AETTA under adaptation failures. Our extensive evaluation with\nfour baselines and six TTA methods demonstrates that AETTA shows an average of\n19.8%p more accurate estimation compared with the baselines. We further\ndemonstrate the effectiveness of accuracy estimation with a model recovery case\nstudy, showcasing the practicality of our model recovery based on accuracy\nestimation. The source code is available at https://github.com/taeckyung/AETTA.\n","authors":["Taeckyung Lee","Sorn Chottananurak","Taesik Gong","Sung-Ju Lee"],"pdf_url":"https://arxiv.org/pdf/2404.01351v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00906v1","updated":"2024-04-01T04:21:01Z","published":"2024-04-01T04:21:01Z","title":"From Pixels to Graphs: Open-Vocabulary Scene Graph Generation with\n Vision-Language Models","summary":" Scene graph generation (SGG) aims to parse a visual scene into an\nintermediate graph representation for downstream reasoning tasks. Despite\nrecent advancements, existing methods struggle to generate scene graphs with\nnovel visual relation concepts. To address this challenge, we introduce a new\nopen-vocabulary SGG framework based on sequence generation. Our framework\nleverages vision-language pre-trained models (VLM) by incorporating an\nimage-to-graph generation paradigm. Specifically, we generate scene graph\nsequences via image-to-text generation with VLM and then construct scene graphs\nfrom these sequences. By doing so, we harness the strong capabilities of VLM\nfor open-vocabulary SGG and seamlessly integrate explicit relational modeling\nfor enhancing the VL tasks. 
Experimental results demonstrate that our design\nnot only achieves superior performance with an open vocabulary but also\nenhances downstream vision-language task performance through explicit relation\nmodeling knowledge.\n","authors":["Rongjie Li","Songyang Zhang","Dahua Lin","Kai Chen","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2404.00906v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.15918v2","updated":"2024-04-01T04:15:46Z","published":"2024-03-23T19:21:31Z","title":"An Embarrassingly Simple Defense Against Backdoor Attacks On SSL","summary":" Self Supervised Learning (SSL) has emerged as a powerful paradigm to tackle\ndata landscapes with absence of human supervision. The ability to learn\nmeaningful tasks without the use of labeled data makes SSL a popular method to\nmanage large chunks of data in the absence of labels. However, recent work\nindicates SSL to be vulnerable to backdoor attacks, wherein models can be\ncontrolled, possibly maliciously, to suit an adversary's motives. Li et. al\n(2022) introduce a novel frequency-based backdoor attack: CTRL. They show that\nCTRL can be used to efficiently and stealthily gain control over a victim's\nmodel trained using SSL. In this work, we devise two defense strategies against\nfrequency-based attacks in SSL: One applicable before model training and the\nsecond to be applied during model inference. Our first contribution utilizes\nthe invariance property of the downstream task to defend against backdoor\nattacks in a generalizable fashion. We observe the ASR (Attack Success Rate) to\nreduce by over 60% across experiments. Our Inference-time defense relies on\nevasiveness of the attack and uses the luminance channel to defend against\nattacks. Using object classification as the downstream task for SSL, we\ndemonstrate successful defense strategies that do not require re-training of\nthe model. Code is available at https://github.com/Aryan-Satpathy/Backdoor.\n","authors":["Aryan Satpathy","Nilaksh Nilaksh","Dhruva Rajwade"],"pdf_url":"https://arxiv.org/pdf/2403.15918v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2302.10174v2","updated":"2024-04-01T04:00:31Z","published":"2023-02-20T18:59:04Z","title":"Towards Universal Fake Image Detectors that Generalize Across Generative\n Models","summary":" With generative models proliferating at a rapid rate, there is a growing need\nfor general purpose fake image detectors. In this work, we first show that the\nexisting paradigm, which consists of training a deep network for real-vs-fake\nclassification, fails to detect fake images from newer breeds of generative\nmodels when trained to detect GAN fake images. Upon analysis, we find that the\nresulting classifier is asymmetrically tuned to detect patterns that make an\nimage fake. The real class becomes a sink class holding anything that is not\nfake, including generated images from models not accessible during training.\nBuilding upon this discovery, we propose to perform real-vs-fake classification\nwithout learning; i.e., using a feature space not explicitly trained to\ndistinguish real from fake images. We use nearest neighbor and linear probing\nas instantiations of this idea. 
When given access to the feature space of a\nlarge pretrained vision-language model, the very simple baseline of nearest\nneighbor classification has surprisingly good generalization ability in\ndetecting fake images from a wide variety of generative models; e.g., it\nimproves upon the SoTA by +15.07 mAP and +25.90% acc when tested on unseen\ndiffusion and autoregressive models.\n","authors":["Utkarsh Ojha","Yuheng Li","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2302.10174v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00901v1","updated":"2024-04-01T03:58:51Z","published":"2024-04-01T03:58:51Z","title":"Slightly Shift New Classes to Remember Old Classes for Video\n Class-Incremental Learning","summary":" Recent video class-incremental learning usually excessively pursues the\naccuracy of the newly seen classes and relies on memory sets to mitigate\ncatastrophic forgetting of the old classes. However, limited storage only\nallows storing a few representative videos. So we propose SNRO, which slightly\nshifts the features of new classes to remember old classes. Specifically, SNRO\ncontains Examples Sparse(ES) and Early Break(EB). ES decimates at a lower\nsample rate to build memory sets and uses interpolation to align those sparse\nframes in the future. By this, SNRO stores more examples under the same memory\nconsumption and forces the model to focus on low-semantic features which are\nharder to be forgotten. EB terminates the training at a small epoch, preventing\nthe model from overstretching into the high-semantic space of the current task.\nExperiments on UCF101, HMDB51, and UESTC-MMEA-CL datasets show that SNRO\nperforms better than other approaches while consuming the same memory\nconsumption.\n","authors":["Jian Jiao","Yu Dai","Hefei Mei","Heqian Qiu","Chuanyang Gong","Shiyuan Tang","Xinpeng Hao","Hongliang Li"],"pdf_url":"https://arxiv.org/pdf/2404.00901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18363v3","updated":"2024-04-01T03:41:27Z","published":"2023-11-30T09:03:47Z","title":"Each Test Image Deserves A Specific Prompt: Continual Test-Time\n Adaptation for 2D Medical Image Segmentation","summary":" Distribution shift widely exists in medical images acquired from different\nmedical centres and poses a significant obstacle to deploying the pre-trained\nsemantic segmentation model in real-world applications. Test-time adaptation\nhas proven its effectiveness in tackling the cross-domain distribution shift\nduring inference. However, most existing methods achieve adaptation by updating\nthe pre-trained models, rendering them susceptible to error accumulation and\ncatastrophic forgetting when encountering a series of distribution shifts\n(i.e., under the continual test-time adaptation setup). To overcome these\nchallenges caused by updating the models, in this paper, we freeze the\npre-trained model and propose the Visual Prompt-based Test-Time Adaptation\n(VPTTA) method to train a specific prompt for each test image to align the\nstatistics in the batch normalization layers. Specifically, we present the\nlow-frequency prompt, which is lightweight with only a few parameters and can\nbe effectively trained in a single iteration. To enhance prompt initialization,\nwe equip VPTTA with a memory bank to benefit the current prompt from previous\nones. Additionally, we design a warm-up mechanism, which mixes source and\ntarget statistics to construct warm-up statistics, thereby facilitating the\ntraining process. 
Extensive experiments demonstrate the superiority of our\nVPTTA over other state-of-the-art methods on two medical image segmentation\nbenchmark tasks. The code and weights of pre-trained source models are\navailable at https://github.com/Chen-Ziyang/VPTTA.\n","authors":["Ziyang Chen","Yiwen Ye","Mengkang Lu","Yongsheng Pan","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2311.18363v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15011v2","updated":"2024-04-01T03:37:24Z","published":"2023-11-25T12:34:02Z","title":"VSCode: General Visual Salient and Camouflaged Object Detection with 2D\n Prompt Learning","summary":" Salient object detection (SOD) and camouflaged object detection (COD) are\nrelated yet distinct binary mapping tasks. These tasks involve multiple\nmodalities, sharing commonalities and unique cues. Existing research often\nemploys intricate task-specific specialist models, potentially leading to\nredundancy and suboptimal results. We introduce VSCode, a generalist model with\nnovel 2D prompt learning, to jointly address four SOD tasks and three COD\ntasks. We utilize VST as the foundation model and introduce 2D prompts within\nthe encoder-decoder architecture to learn domain and task-specific knowledge on\ntwo separate dimensions. A prompt discrimination loss helps disentangle\npeculiarities to benefit model optimization. VSCode outperforms\nstate-of-the-art methods across six tasks on 26 datasets and exhibits zero-shot\ngeneralization to unseen tasks by combining 2D prompts, such as RGB-D COD.\nSource code has been available at https://github.com/Sssssuperior/VSCode.\n","authors":["Ziyang Luo","Nian Liu","Wangbo Zhao","Xuguang Yang","Dingwen Zhang","Deng-Ping Fan","Fahad Khan","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2311.15011v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03477v3","updated":"2024-04-01T03:35:25Z","published":"2024-03-06T05:33:50Z","title":"Continual Segmentation with Disentangled Objectness Learning and Class\n Recognition","summary":" Most continual segmentation methods tackle the problem as a per-pixel\nclassification task. However, such a paradigm is very challenging, and we find\nquery-based segmenters with built-in objectness have inherent advantages\ncompared with per-pixel ones, as objectness has strong transfer ability and\nforgetting resistance. Based on these findings, we propose CoMasTRe by\ndisentangling continual segmentation into two stages: forgetting-resistant\ncontinual objectness learning and well-researched continual classification.\nCoMasTRe uses a two-stage segmenter learning class-agnostic mask proposals at\nthe first stage and leaving recognition to the second stage. During continual\nlearning, a simple but effective distillation is adopted to strengthen\nobjectness. To further mitigate the forgetting of old classes, we design a\nmulti-label class distillation strategy suited for segmentation. We assess the\neffectiveness of CoMasTRe on PASCAL VOC and ADE20K. 
Extensive experiments show\nthat our method outperforms per-pixel and query-based methods on both datasets.\nCode will be available at https://github.com/jordangong/CoMasTRe.\n","authors":["Yizheng Gong","Siyue Yu","Xiaoyang Wang","Jimin Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.03477v3.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00891v1","updated":"2024-04-01T03:35:09Z","published":"2024-04-01T03:35:09Z","title":"Marrying NeRF with Feature Matching for One-step Pose Estimation","summary":" Given the image collection of an object, we aim at building a real-time\nimage-based pose estimation method, which requires neither its CAD model nor\nhours of object-specific training. Recent NeRF-based methods provide a\npromising solution by directly optimizing the pose from pixel loss between\nrendered and target images. However, during inference, they require long\nconverging time, and suffer from local minima, making them impractical for\nreal-time robot applications. We aim at solving this problem by marrying image\nmatching with NeRF. With 2D matches and depth rendered by NeRF, we directly\nsolve the pose in one step by building 2D-3D correspondences between target and\ninitial view, thus allowing for real-time prediction. Moreover, to improve the\naccuracy of 2D-3D correspondences, we propose a 3D consistent point mining\nstrategy, which effectively discards unfaithful points reconstruted by NeRF.\nMoreover, current NeRF-based methods naively optimizing pixel loss fail at\noccluded images. Thus, we further propose a 2D matches based sampling strategy\nto preclude the occluded area. Experimental results on representative datasets\nprove that our method outperforms state-of-the-art methods, and improves\ninference efficiency by 90x, achieving real-time prediction at 6 FPS.\n","authors":["Ronghan Chen","Yang Cong","Yu Ren"],"pdf_url":"https://arxiv.org/pdf/2404.00891v1.pdf","comment":"ICRA, 2024. Video https://www.youtube.com/watch?v=70fgUobOFWo"},{"id":"http://arxiv.org/abs/2404.00879v1","updated":"2024-04-01T03:18:12Z","published":"2024-04-01T03:18:12Z","title":"Model-Agnostic Human Preference Inversion in Diffusion Models","summary":" Efficient text-to-image generation remains a challenging task due to the high\ncomputational costs associated with the multi-step sampling in diffusion\nmodels. Although distillation of pre-trained diffusion models has been\nsuccessful in reducing sampling steps, low-step image generation often falls\nshort in terms of quality. In this study, we propose a novel sampling design to\nachieve high-quality one-step image generation aligning with human preferences,\nparticularly focusing on exploring the impact of the prior noise distribution.\nOur approach, Prompt Adaptive Human Preference Inversion (PAHI), optimizes the\nnoise distributions for each prompt based on human preferences without the need\nfor fine-tuning diffusion models. Our experiments showcase that the tailored\nnoise distributions significantly improve image quality with only a marginal\nincrease in computational cost. 
Our findings underscore the importance of noise\noptimization and pave the way for efficient and high-quality text-to-image\nsynthesis.\n","authors":["Jeeyung Kim","Ze Wang","Qiang Qiu"],"pdf_url":"https://arxiv.org/pdf/2404.00879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00878v1","updated":"2024-04-01T03:15:41Z","published":"2024-04-01T03:15:41Z","title":"TryOn-Adapter: Efficient Fine-Grained Clothing Identity Adaptation for\n High-Fidelity Virtual Try-On","summary":" Virtual try-on focuses on adjusting the given clothes to fit a specific\nperson seamlessly while avoiding any distortion of the patterns and textures of\nthe garment. However, the clothing identity uncontrollability and training\ninefficiency of existing diffusion-based methods, which struggle to maintain\nthe identity even with full parameter training, are significant limitations\nthat hinder the widespread applications. In this work, we propose an effective\nand efficient framework, termed TryOn-Adapter. Specifically, we first decouple\nclothing identity into fine-grained factors: style for color and category\ninformation, texture for high-frequency details, and structure for smooth\nspatial adaptive transformation. Our approach utilizes a pre-trained\nexemplar-based diffusion model as the fundamental network, whose parameters are\nfrozen except for the attention layers. We then customize three lightweight\nmodules (Style Preserving, Texture Highlighting, and Structure Adapting)\nincorporated with fine-tuning techniques to enable precise and efficient\nidentity control. Meanwhile, we introduce the training-free T-RePaint strategy\nto further enhance clothing identity preservation while maintaining the\nrealistic try-on effect during the inference. Our experiments demonstrate that\nour approach achieves state-of-the-art performance on two widely-used\nbenchmarks. Additionally, compared with recent full-tuning diffusion-based\nmethods, we only use about half of their tunable parameters during training.\nThe code will be made publicly available at\nhttps://github.com/jiazheng-xing/TryOn-Adapter.\n","authors":["Jiazheng Xing","Chao Xu","Yijie Qian","Yang Liu","Guang Dai","Baigui Sun","Yong Liu","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00876v1","updated":"2024-04-01T03:13:32Z","published":"2024-04-01T03:13:32Z","title":"MGMap: Mask-Guided Learning for Online Vectorized HD Map Construction","summary":" Currently, high-definition (HD) map construction leans towards a lightweight\nonline generation tendency, which aims to preserve timely and reliable road\nscene information. However, map elements contain strong shape priors. Subtle\nand sparse annotations make current detection-based frameworks ambiguous in\nlocating relevant feature scopes and cause the loss of detailed structures in\nprediction. To alleviate these problems, we propose MGMap, a mask-guided\napproach that effectively highlights the informative regions and achieves\nprecise map element localization by introducing the learned masks.\nSpecifically, MGMap employs learned masks based on the enhanced multi-scale BEV\nfeatures from two perspectives. At the instance level, we propose the\nMask-activated instance (MAI) decoder, which incorporates global instance and\nstructural information into instance queries by the activation of instance\nmasks. 
At the point level, a novel position-guided mask patch refinement\n(PG-MPR) module is designed to refine point locations from a finer-grained\nperspective, enabling the extraction of point-specific patch information.\nCompared to the baselines, our proposed MGMap achieves a notable improvement of\naround 10 mAP for different input modalities. Extensive experiments also\ndemonstrate that our approach showcases strong robustness and generalization\ncapabilities. Our code can be found at https://github.com/xiaolul2/MGMap.\n","authors":["Xiaolu Liu","Song Wang","Wentong Li","Ruizi Yang","Junbo Chen","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.00876v1.pdf","comment":"18 pages, 11 figures, accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2310.16279v2","updated":"2024-04-01T03:13:21Z","published":"2023-10-25T01:24:12Z","title":"TransPose: 6D Object Pose Estimation with Geometry-Aware Transformer","summary":" Estimating the 6D object pose is an essential task in many applications. Due\nto the lack of depth information, existing RGB-based methods are sensitive to\nocclusion and illumination changes. How to extract and utilize the geometry\nfeatures in depth information is crucial to achieve accurate predictions. To\nthis end, we propose TransPose, a novel 6D pose framework that exploits\nTransformer Encoder with geometry-aware module to develop better learning of\npoint cloud feature representations. Specifically, we first uniformly sample\npoint cloud and extract local geometry features with the designed local feature\nextractor base on graph convolution network. To improve robustness to\nocclusion, we adopt Transformer to perform the exchange of global information,\nmaking each local feature contains global information. Finally, we introduce\ngeometry-aware module in Transformer Encoder, which to form an effective\nconstrain for point cloud feature learning and makes the global information\nexchange more tightly coupled with point cloud tasks. Extensive experiments\nindicate the effectiveness of TransPose, our pose estimation pipeline achieves\ncompetitive results on three benchmark datasets.\n","authors":["Xiao Lin","Deming Wang","Guangliang Zhou","Chengju Liu","Qijun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.16279v2.pdf","comment":"accept by NEUROCOMPUTING"},{"id":"http://arxiv.org/abs/2312.16457v2","updated":"2024-04-01T03:10:53Z","published":"2023-12-27T08:00:47Z","title":"City-on-Web: Real-time Neural Rendering of Large-scale Scenes on the Web","summary":" Existing neural radiance field-based methods can achieve real-time rendering\nof small scenes on the web platform. However, extending these methods to\nlarge-scale scenes still poses significant challenges due to limited resources\nin computation, memory, and bandwidth. In this paper, we propose City-on-Web,\nthe first method for real-time rendering of large-scale scenes on the web. We\npropose a block-based volume rendering method to guarantee 3D consistency and\ncorrect occlusion between blocks, and introduce a Level-of-Detail strategy\ncombined with dynamic loading/unloading of resources to significantly reduce\nmemory demands. 
Our system achieves real-time rendering of large-scale scenes\nat approximately 32FPS with RTX 3060 GPU on the web and maintains rendering\nquality comparable to the current state-of-the-art novel view synthesis\nmethods.\n","authors":["Kaiwen Song","Xiaoyi Zeng","Chenqu Ren","Juyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.16457v2.pdf","comment":"Project page: https://ustc3dv.github.io/City-on-Web/"},{"id":"http://arxiv.org/abs/2404.00874v1","updated":"2024-04-01T03:06:23Z","published":"2024-04-01T03:06:23Z","title":"DiSR-NeRF: Diffusion-Guided View-Consistent Super-Resolution NeRF","summary":" We present DiSR-NeRF, a diffusion-guided framework for view-consistent\nsuper-resolution (SR) NeRF. Unlike prior works, we circumvent the requirement\nfor high-resolution (HR) reference images by leveraging existing powerful 2D\nsuper-resolution models. Nonetheless, independent SR 2D images are often\ninconsistent across different views. We thus propose Iterative 3D\nSynchronization (I3DS) to mitigate the inconsistency problem via the inherent\nmulti-view consistency property of NeRF. Specifically, our I3DS alternates\nbetween upscaling low-resolution (LR) rendered images with diffusion models,\nand updating the underlying 3D representation with standard NeRF training. We\nfurther introduce Renoised Score Distillation (RSD), a novel score-distillation\nobjective for 2D image resolution. Our RSD combines features from ancestral\nsampling and Score Distillation Sampling (SDS) to generate sharp images that\nare also LR-consistent. Qualitative and quantitative results on both synthetic\nand real-world datasets demonstrate that our DiSR-NeRF can achieve better\nresults on NeRF super-resolution compared with existing works. Code and video\nresults available at the project website.\n","authors":["Jie Long Lee","Chen Li","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2404.00874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01901v2","updated":"2024-04-01T03:00:21Z","published":"2024-03-04T09:59:48Z","title":"FaceChain-ImagineID: Freely Crafting High-Fidelity Diverse Talking Faces\n from Disentangled Audio","summary":" In this paper, we abstract the process of people hearing speech, extracting\nmeaningful cues, and creating various dynamically audio-consistent talking\nfaces, termed Listening and Imagining, into the task of high-fidelity diverse\ntalking faces generation from a single audio. Specifically, it involves two\ncritical challenges: one is to effectively decouple identity, content, and\nemotion from entangled audio, and the other is to maintain intra-video\ndiversity and inter-video consistency. To tackle the issues, we first dig out\nthe intricate relationships among facial factors and simplify the decoupling\nprocess, tailoring a Progressive Audio Disentanglement for accurate facial\ngeometry and semantics learning, where each stage incorporates a customized\ntraining module responsible for a specific factor. Secondly, to achieve\nvisually diverse and audio-synchronized animation solely from input audio\nwithin a single model, we introduce the Controllable Coherent Frame generation,\nwhich involves the flexible integration of three trainable adapters with frozen\nLatent Diffusion Models (LDMs) to focus on maintaining facial geometry and\nsemantics, as well as texture and temporal coherence between frames. In this\nway, we inherit high-quality diverse generation from LDMs while significantly\nimproving their controllability at a low training cost. 
Extensive experiments\ndemonstrate the flexibility and effectiveness of our method in handling this\nparadigm. The codes will be released at\nhttps://github.com/modelscope/facechain.\n","authors":["Chao Xu","Yang Liu","Jiazheng Xing","Weida Wang","Mingze Sun","Jun Dan","Tianxin Huang","Siyuan Li","Zhi-Qi Cheng","Ying Tai","Baigui Sun"],"pdf_url":"https://arxiv.org/pdf/2403.01901v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06742v2","updated":"2024-04-01T03:00:06Z","published":"2023-12-11T18:59:06Z","title":"Honeybee: Locality-enhanced Projector for Multimodal LLM","summary":" In Multimodal Large Language Models (MLLMs), a visual projector plays a\ncrucial role in bridging pre-trained vision encoders with LLMs, enabling\nprofound visual understanding while harnessing the LLMs' robust capabilities.\nDespite the importance of the visual projector, it has been relatively less\nexplored. In this study, we first identify two essential projector properties:\n(i) flexibility in managing the number of visual tokens, crucial for MLLMs'\noverall efficiency, and (ii) preservation of local context from visual\nfeatures, vital for spatial understanding. Based on these findings, we propose\na novel projector design that is both flexible and locality-enhanced,\neffectively satisfying the two desirable properties. Additionally, we present\ncomprehensive strategies to effectively utilize multiple and multifaceted\ninstruction datasets. Through extensive experiments, we examine the impact of\nindividual design choices. Finally, our proposed MLLM, Honeybee, remarkably\noutperforms previous state-of-the-art methods across various benchmarks,\nincluding MME, MMBench, SEED-Bench, and LLaVA-Bench, achieving significantly\nhigher efficiency. Code and models are available at\nhttps://github.com/kakaobrain/honeybee.\n","authors":["Junbum Cha","Wooyoung Kang","Jonghwan Mun","Byungseok Roh"],"pdf_url":"https://arxiv.org/pdf/2312.06742v2.pdf","comment":"CVPR 2024 camera-ready"},{"id":"http://arxiv.org/abs/2309.16211v2","updated":"2024-04-01T02:49:49Z","published":"2023-09-28T07:37:18Z","title":"VDC: Versatile Data Cleanser based on Visual-Linguistic Inconsistency by\n Multimodal Large Language Models","summary":" The role of data in building AI systems has recently been emphasized by the\nemerging concept of data-centric AI. Unfortunately, in the real-world, datasets\nmay contain dirty samples, such as poisoned samples from backdoor attack, noisy\nlabels in crowdsourcing, and even hybrids of them. The presence of such dirty\nsamples makes the DNNs vunerable and unreliable.Hence, it is critical to detect\ndirty samples to improve the quality and realiability of dataset. Existing\ndetectors only focus on detecting poisoned samples or noisy labels, that are\noften prone to weak generalization when dealing with dirty samples from other\ndomains.In this paper, we find a commonality of various dirty samples is\nvisual-linguistic inconsistency between images and associated labels. 
To\ncapture the semantic inconsistency between modalities, we propose versatile\ndata cleanser (VDC) leveraging the surpassing capabilities of multimodal large\nlanguage models (MLLM) in cross-modal alignment and reasoning.It consists of\nthree consecutive modules: the visual question generation module to generate\ninsightful questions about the image; the visual question answering module to\nacquire the semantics of the visual content by answering the questions with\nMLLM; followed by the visual answer evaluation module to evaluate the\ninconsistency.Extensive experiments demonstrate its superior performance and\ngeneralization to various categories and types of dirty samples. The code is\navailable at \\url{https://github.com/zihao-ai/vdc}.\n","authors":["Zihao Zhu","Mingda Zhang","Shaokui Wei","Bingzhe Wu","Baoyuan Wu"],"pdf_url":"https://arxiv.org/pdf/2309.16211v2.pdf","comment":"Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2312.14985v2","updated":"2024-04-01T02:29:20Z","published":"2023-12-22T05:00:30Z","title":"UniHuman: A Unified Model for Editing Human Images in the Wild","summary":" Human image editing includes tasks like changing a person's pose, their\nclothing, or editing the image according to a text prompt. However, prior work\noften tackles these tasks separately, overlooking the benefit of mutual\nreinforcement from learning them jointly. In this paper, we propose UniHuman, a\nunified model that addresses multiple facets of human image editing in\nreal-world settings. To enhance the model's generation quality and\ngeneralization capacity, we leverage guidance from human visual encoders and\nintroduce a lightweight pose-warping module that can exploit different pose\nrepresentations, accommodating unseen textures and patterns. Furthermore, to\nbridge the disparity between existing human editing benchmarks with real-world\ndata, we curated 400K high-quality human image-text pairs for training and\ncollected 2K human images for out-of-domain testing, both encompassing diverse\nclothing styles, backgrounds, and age groups. Experiments on both in-domain and\nout-of-domain test sets demonstrate that UniHuman outperforms task-specific\nmodels by a significant margin. In user studies, UniHuman is preferred by the\nusers in an average of 77% of cases. Our project is available at\nhttps://github.com/NannanLi999/UniHuman.\n","authors":["Nannan Li","Qing Liu","Krishna Kumar Singh","Yilin Wang","Jianming Zhang","Bryan A. Plummer","Zhe Lin"],"pdf_url":"https://arxiv.org/pdf/2312.14985v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00860v1","updated":"2024-04-01T02:01:33Z","published":"2024-04-01T02:01:33Z","title":"Lipsum-FT: Robust Fine-Tuning of Zero-Shot Models Using Random Text\n Guidance","summary":" Large-scale contrastive vision-language pre-trained models provide the\nzero-shot model achieving competitive performance across a range of image\nclassification tasks without requiring training on downstream data. Recent\nworks have confirmed that while additional fine-tuning of the zero-shot model\non the reference data results in enhanced downstream performance, it\ncompromises the model's robustness against distribution shifts. Our\ninvestigation begins by examining the conditions required to achieve the goals\nof robust fine-tuning, employing descriptions based on feature distortion\ntheory and joint energy-based models. 
Subsequently, we propose a novel robust\nfine-tuning algorithm, Lipsum-FT, that effectively utilizes the language\nmodeling aspect of the vision-language pre-trained models. Extensive\nexperiments conducted on distribution shift scenarios in DomainNet and ImageNet\nconfirm the superiority of our proposed Lipsum-FT approach over existing robust\nfine-tuning methods.\n","authors":["Giung Nam","Byeongho Heo","Juho Lee"],"pdf_url":"https://arxiv.org/pdf/2404.00860v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.00857v1","updated":"2024-04-01T01:56:27Z","published":"2024-04-01T01:56:27Z","title":"Meta Episodic learning with Dynamic Task Sampling for CLIP-based Point\n Cloud Classification","summary":" Point cloud classification refers to the process of assigning semantic labels\nor categories to individual points within a point cloud data structure. Recent\nworks have explored the extension of pre-trained CLIP to 3D recognition. In\nthis direction, CLIP-based point cloud models like PointCLIP, CLIP2Point have\nbecome state-of-the-art methods in the few-shot setup. Although these methods\nshow promising performance for some classes like airplanes, desks, guitars,\netc, the performance for some classes like the cup, flower pot, sink,\nnightstand, etc is still far from satisfactory. This is due to the fact that\nthe adapter of CLIP-based models is trained using randomly sampled N-way K-shot\ndata in the standard supervised learning setup. In this paper, we propose a\nnovel meta-episodic learning framework for CLIP-based point cloud\nclassification, addressing the challenges of limited training examples and\nsampling unknown classes. Additionally, we introduce dynamic task sampling\nwithin the episode based on performance memory. This sampling strategy\neffectively addresses the challenge of sampling unknown classes, ensuring that\nthe model learns from a diverse range of classes and promotes the exploration\nof underrepresented categories. By dynamically updating the performance memory,\nwe adaptively prioritize the sampling of classes based on their performance,\nenhancing the model's ability to handle challenging and real-world scenarios.\nExperiments show an average performance gain of 3-6\\% on ModelNet40 and\nScanobjectNN datasets in a few-shot setup.\n","authors":["Shuvozit Ghose","Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00855v1","updated":"2024-04-01T01:49:08Z","published":"2024-04-01T01:49:08Z","title":"TSOM: Small Object Motion Detection Neural Network Inspired by Avian\n Visual Circuit","summary":" Detecting small moving objects in complex backgrounds from an overhead\nperspective is a highly challenging task for machine vision systems. As an\ninspiration from nature, the avian visual system is capable of processing\nmotion information in various complex aerial scenes, and its Retina-OT-Rt\nvisual circuit is highly sensitive to capturing the motion information of small\nobjects from high altitudes. However, more needs to be done on small object\nmotion detection algorithms based on the avian visual system. In this paper, we\nconducted mathematical modeling based on extensive studies of the biological\nmechanisms of the Retina-OT-Rt visual circuit. Based on this, we proposed a\nnovel tectum small object motion detection neural network (TSOM). The neural\nnetwork includes the retina, SGC dendritic, SGC Soma, and Rt layers, each layer\ncorresponding to neurons in the visual pathway. 
The Retina layer is responsible\nfor accurately projecting input content, the SGC dendritic layer perceives and\nencodes spatial-temporal information, the SGC Soma layer computes complex\nmotion information and extracts small objects, and the Rt layer integrates and\ndecodes motion information from multiple directions to determine the position\nof small objects. Extensive experiments on pigeon neurophysiological\nexperiments and image sequence data showed that the TSOM is biologically\ninterpretable and effective in extracting reliable small object motion features\nfrom complex high-altitude backgrounds.\n","authors":["Pignge Hu","Xiaoteng Zhang","Mengmeng Li","Yingjie Zhu","Li Shi"],"pdf_url":"https://arxiv.org/pdf/2404.00855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00852v1","updated":"2024-04-01T01:45:30Z","published":"2024-04-01T01:45:30Z","title":"Ensemble Learning for Vietnamese Scene Text Spotting in Urban\n Environments","summary":" This paper presents a simple yet efficient ensemble learning framework for\nVietnamese scene text spotting. Leveraging the power of ensemble learning,\nwhich combines multiple models to yield more accurate predictions, our approach\naims to significantly enhance the performance of scene text spotting in\nchallenging urban settings. Through experimental evaluations on the VinText\ndataset, our proposed method achieves a significant improvement in accuracy\ncompared to existing methods with an impressive accuracy of 5%. These results\nunequivocally demonstrate the efficacy of ensemble learning in the context of\nVietnamese scene text spotting in urban environments, highlighting its\npotential for real world applications, such as text detection and recognition\nin urban signage, advertisements, and various text-rich urban scenes.\n","authors":["Hieu Nguyen","Cong-Hoang Ta","Phuong-Thuy Le-Nguyen","Minh-Triet Tran","Trung-Nghia Le"],"pdf_url":"https://arxiv.org/pdf/2404.00852v1.pdf","comment":"RIVF 2023"},{"id":"http://arxiv.org/abs/2404.00851v1","updated":"2024-04-01T01:42:23Z","published":"2024-04-01T01:42:23Z","title":"Prompt Learning via Meta-Regularization","summary":" Pre-trained vision-language models have shown impressive success on various\ncomputer vision tasks with their zero-shot generalizability. Recently, prompt\nlearning approaches have been explored to efficiently and effectively adapt the\nvision-language models to a variety of downstream tasks. However, most existing\nprompt learning methods suffer from task overfitting since the general\nknowledge of the pre-trained vision language models is forgotten while the\nprompts are finetuned on a small data set from a specific target task. To\naddress this issue, we propose a Prompt Meta-Regularization (ProMetaR) to\nimprove the generalizability of prompt learning for vision-language models.\nSpecifically, ProMetaR meta-learns both the regularizer and the soft prompts to\nharness the task-specific knowledge from the downstream tasks and task-agnostic\ngeneral knowledge from the vision-language models. Further, ProMetaR augments\nthe task to generate multiple virtual tasks to alleviate the meta-overfitting.\nIn addition, we provide the analysis to comprehend how ProMetaR improves the\ngeneralizability of prompt tuning in the perspective of the gradient alignment.\nOur extensive experiments demonstrate that our ProMetaR improves the\ngeneralizability of conventional prompt learning methods under\nbase-to-base/base-to-new and domain generalization settings. 
The code of\nProMetaR is available at https://github.com/mlvlab/ProMetaR.\n","authors":["Jinyoung Park","Juyeon Ko","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2404.00851v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2307.16368v3","updated":"2024-04-01T01:33:53Z","published":"2023-07-31T02:14:19Z","title":"AntGPT: Can Large Language Models Help Long-term Action Anticipation\n from Videos?","summary":" Can we better anticipate an actor's future actions (e.g. mix eggs) by knowing\nwhat commonly happens after his/her current action (e.g. crack eggs)? What if\nwe also know the longer-term goal of the actor (e.g. making egg fried rice)?\nThe long-term action anticipation (LTA) task aims to predict an actor's future\nbehavior from video observations in the form of verb and noun sequences, and it\nis crucial for human-machine interaction. We propose to formulate the LTA task\nfrom two perspectives: a bottom-up approach that predicts the next actions\nautoregressively by modeling temporal dynamics; and a top-down approach that\ninfers the goal of the actor and plans the needed procedure to accomplish the\ngoal. We hypothesize that large language models (LLMs), which have been\npretrained on procedure text data (e.g. recipes, how-tos), have the potential\nto help LTA from both perspectives. It can help provide the prior knowledge on\nthe possible next actions, and infer the goal given the observed part of a\nprocedure, respectively. To leverage the LLMs, we propose a two-stage\nframework, AntGPT. It first recognizes the actions already performed in the\nobserved videos and then asks an LLM to predict the future actions via\nconditioned generation, or to infer the goal and plan the whole procedure by\nchain-of-thought prompting. Empirical results on the Ego4D LTA v1 and v2\nbenchmarks, EPIC-Kitchens-55, as well as EGTEA GAZE+ demonstrate the\neffectiveness of our proposed approach. AntGPT achieves state-of-the-art\nperformance on all above benchmarks, and can successfully infer the goal and\nthus perform goal-conditioned \"counterfactual\" prediction via qualitative\nanalysis. Code and model will be released at\nhttps://brown-palm.github.io/AntGPT\n","authors":["Qi Zhao","Shijie Wang","Ce Zhang","Changcheng Fu","Minh Quan Do","Nakul Agarwal","Kwonjoon Lee","Chen Sun"],"pdf_url":"https://arxiv.org/pdf/2307.16368v3.pdf","comment":"ICLR 2024 Camera Ready"},{"id":"http://arxiv.org/abs/2404.00849v1","updated":"2024-04-01T01:32:11Z","published":"2024-04-01T01:32:11Z","title":"Generating Content for HDR Deghosting from Frequency View","summary":" Recovering ghost-free High Dynamic Range (HDR) images from multiple Low\nDynamic Range (LDR) images becomes challenging when the LDR images exhibit\nsaturation and significant motion. Recent Diffusion Models (DMs) have been\nintroduced in HDR imaging field, demonstrating promising performance,\nparticularly in achieving visually perceptible results compared to previous\nDNN-based methods. However, DMs require extensive iterations with large models\nto estimate entire images, resulting in inefficiency that hinders their\npractical application. To address this challenge, we propose the Low-Frequency\naware Diffusion (LF-Diff) model for ghost-free HDR imaging. The key idea of\nLF-Diff is implementing the DMs in a highly compacted latent space and\nintegrating it into a regression-based model to enhance the details of\nreconstructed images. 
Specifically, as low-frequency information is closely\nrelated to human visual perception we propose to utilize DMs to create compact\nlow-frequency priors for the reconstruction process. In addition, to take full\nadvantage of the above low-frequency priors, the Dynamic HDR Reconstruction\nNetwork (DHRNet) is carried out in a regression-based manner to obtain final\nHDR images. Extensive experiments conducted on synthetic and real-world\nbenchmark datasets demonstrate that our LF-Diff performs favorably against\nseveral state-of-the-art methods and is 10$\\times$ faster than previous\nDM-based methods.\n","authors":["Tao Hu","Qingsen Yan","Yuankai Qi","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.00849v1.pdf","comment":"This paper is accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.19898v2","updated":"2024-04-01T01:27:14Z","published":"2024-03-29T00:40:12Z","title":"Structure Matters: Tackling the Semantic Discrepancy in Diffusion Models\n for Image Inpainting","summary":" Denoising diffusion probabilistic models for image inpainting aim to add the\nnoise to the texture of image during the forward process and recover masked\nregions with unmasked ones of the texture via the reverse denoising process.\nDespite the meaningful semantics generation, the existing arts suffer from the\nsemantic discrepancy between masked and unmasked regions, since the\nsemantically dense unmasked texture fails to be completely degraded while the\nmasked regions turn to the pure noise in diffusion process, leading to the\nlarge discrepancy between them. In this paper, we aim to answer how unmasked\nsemantics guide texture denoising process;together with how to tackle the\nsemantic discrepancy, to facilitate the consistent and meaningful semantics\ngeneration. To this end, we propose a novel structure-guided diffusion model\nnamed StrDiffusion, to reformulate the conventional texture denoising process\nunder structure guidance to derive a simplified denoising objective for image\ninpainting, while revealing: 1) the semantically sparse structure is beneficial\nto tackle semantic discrepancy in early stage, while dense texture generates\nreasonable semantics in late stage; 2) the semantics from unmasked regions\nessentially offer the time-dependent structure guidance for the texture\ndenoising process, benefiting from the time-dependent sparsity of the structure\nsemantics. For the denoising process, a structure-guided neural network is\ntrained to estimate the simplified denoising objective by exploiting the\nconsistency of the denoised structure between masked and unmasked regions.\nBesides, we devise an adaptive resampling strategy as a formal criterion as\nwhether structure is competent to guide the texture denoising process, while\nregulate their semantic correlations. Extensive experiments validate the merits\nof StrDiffusion over the state-of-the-arts. 
Our code is available at\nhttps://github.com/htyjers/StrDiffusion.\n","authors":["Haipeng Liu","Yang Wang","Biao Qian","Meng Wang","Yong Rui"],"pdf_url":"https://arxiv.org/pdf/2403.19898v2.pdf","comment":"15 pages, 10 figures, to appear CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00847v1","updated":"2024-04-01T01:25:06Z","published":"2024-04-01T01:25:06Z","title":"Collaborative Learning of Anomalies with Privacy (CLAP) for Unsupervised\n Video Anomaly Detection: A New Baseline","summary":" Unsupervised (US) video anomaly detection (VAD) in surveillance applications\nis gaining more popularity recently due to its practical real-world\napplications. As surveillance videos are privacy sensitive and the availability\nof large-scale video data may enable better US-VAD systems, collaborative\nlearning can be highly rewarding in this setting. However, due to the extremely\nchallenging nature of the US-VAD task, where learning is carried out without\nany annotations, privacy-preserving collaborative learning of US-VAD systems\nhas not been studied yet. In this paper, we propose a new baseline for anomaly\ndetection capable of localizing anomalous events in complex surveillance videos\nin a fully unsupervised fashion without any labels on a privacy-preserving\nparticipant-based distributed training configuration. Additionally, we propose\nthree new evaluation protocols to benchmark anomaly detection approaches on\nvarious scenarios of collaborations and data availability. Based on these\nprotocols, we modify existing VAD datasets to extensively evaluate our approach\nas well as existing US SOTA methods on two large-scale datasets including\nUCF-Crime and XD-Violence. All proposed evaluation protocols, dataset splits,\nand codes are available here: https://github.com/AnasEmad11/CLAP\n","authors":["Anas Al-lahham","Muhammad Zaigham Zaheer","Nurbek Tastan","Karthik Nandakumar"],"pdf_url":"https://arxiv.org/pdf/2404.00847v1.pdf","comment":"Accepted in IEEE/CVF Computer Vision and Pattern Recognition\n Conference (CVPR), 2024"},{"id":"http://arxiv.org/abs/2404.00846v1","updated":"2024-04-01T01:23:58Z","published":"2024-04-01T01:23:58Z","title":"Transfer Learning with Point Transformers","summary":" Point Transformers are near state-of-the-art models for classification,\nsegmentation, and detection tasks on Point Cloud data. They utilize a self\nattention based mechanism to model large range spatial dependencies between\nmultiple point sets. In this project we explore two things: classification\nperformance of these attention based networks on ModelNet10 dataset and then,\nwe use the trained model to classify 3D MNIST dataset after finetuning. We also\ntrain the model from scratch on 3D MNIST dataset to compare the performance of\nfinetuned and from-scratch model on the MNIST dataset. 
We observe that since\nthe two datasets have a large difference in the degree of the distributions,\ntransfer learned models do not outperform the from-scratch models in this case.\nAlthough we do expect transfer learned models to converge faster since they\nalready know the lower level edges, corners, etc features from the ModelNet10\ndataset.\n","authors":["Kartik Gupta","Rahul Vippala","Sahima Srivastava"],"pdf_url":"https://arxiv.org/pdf/2404.00846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13831v3","updated":"2024-04-01T01:18:26Z","published":"2023-11-23T07:25:31Z","title":"Posterior Distillation Sampling","summary":" We introduce Posterior Distillation Sampling (PDS), a novel optimization\nmethod for parametric image editing based on diffusion models. Existing\noptimization-based methods, which leverage the powerful 2D prior of diffusion\nmodels to handle various parametric images, have mainly focused on generation.\nUnlike generation, editing requires a balance between conforming to the target\nattribute and preserving the identity of the source content. Recent 2D image\nediting methods have achieved this balance by leveraging the stochastic latent\nencoded in the generative process of diffusion models. To extend the editing\ncapabilities of diffusion models shown in pixel space to parameter space, we\nreformulate the 2D image editing method into an optimization form named PDS.\nPDS matches the stochastic latents of the source and the target, enabling the\nsampling of targets in diverse parameter spaces that align with a desired\nattribute while maintaining the source's identity. We demonstrate that this\noptimization resembles running a generative process with the target attribute,\nbut aligning this process with the trajectory of the source's generative\nprocess. Extensive editing results in Neural Radiance Fields and Scalable\nVector Graphics representations demonstrate that PDS is capable of sampling\ntargets to fulfill the aforementioned balance across various parameter spaces.\n","authors":["Juil Koo","Chanho Park","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2311.13831v3.pdf","comment":"Project page: https://posterior-distillation-sampling.github.io/"},{"id":"http://arxiv.org/abs/2404.00842v1","updated":"2024-04-01T00:47:02Z","published":"2024-04-01T00:47:02Z","title":"An N-Point Linear Solver for Line and Motion Estimation with Event\n Cameras","summary":" Event cameras respond primarily to edges--formed by strong gradients--and are\nthus particularly well-suited for line-based motion estimation. Recent work has\nshown that events generated by a single line each satisfy a polynomial\nconstraint which describes a manifold in the space-time volume. Multiple such\nconstraints can be solved simultaneously to recover the partial linear velocity\nand line parameters. In this work, we show that, with a suitable line\nparametrization, this system of constraints is actually linear in the unknowns,\nwhich allows us to design a novel linear solver. Unlike existing solvers, our\nlinear solver (i) is fast and numerically stable since it does not rely on\nexpensive root finding, (ii) can solve both minimal and overdetermined systems\nwith more than 5 events, and (iii) admits the characterization of all\ndegenerate cases and multiple solutions. The found line parameters are\nsingularity-free and have a fixed scale, which eliminates the need for\nauxiliary constraints typically encountered in previous work. 
To recover the\nfull linear camera velocity, we fuse observations from multiple lines with a\nnovel velocity averaging scheme that relies on a geometrically-motivated\nresidual, and thus solves the problem more efficiently than previous schemes\nwhich minimize an algebraic residual. Extensive experiments in synthetic and\nreal-world settings demonstrate that our method surpasses the previous work in\nnumerical stability, and operates over 600 times faster.\n","authors":["Ling Gao","Daniel Gehrig","Hang Su","Davide Scaramuzza","Laurent Kneip"],"pdf_url":"https://arxiv.org/pdf/2404.00842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00838v1","updated":"2024-04-01T00:31:11Z","published":"2024-04-01T00:31:11Z","title":"3MOS: Multi-sources, Multi-resolutions, and Multi-scenes dataset for\n Optical-SAR image matching","summary":" Optical-SAR image matching is a fundamental task for image fusion and visual\nnavigation. However, all large-scale open SAR datasets for method development\nare collected from a single platform, resulting in limited satellite types and\nspatial resolutions. Since images captured by different sensors vary\nsignificantly in both geometric and radiometric appearance, existing methods\nmay fail to match corresponding regions containing the same content. Besides,\nmost of the existing datasets have not been categorized based on the\ncharacteristics of different scenes. To encourage the design of more general\nmulti-modal image matching methods, we introduce a large-scale\nMulti-sources, Multi-resolutions, and Multi-scenes dataset for Optical-SAR image\nmatching (3MOS). It consists of 155K optical-SAR image pairs, including SAR data\nfrom six commercial satellites, with resolutions ranging from 1.25m to 12.5m.\nThe data has been classified into eight scenes including urban, rural, plains,\nhills, mountains, water, desert, and frozen earth. Extensive experiments show\nthat none of the state-of-the-art methods achieves consistently superior performance\nacross different sources, resolutions, and scenes. In addition, the distribution\nof data has a substantial impact on the matching capability of deep learning\nmodels, which poses the domain adaptation challenge in optical-SAR image\nmatching. Our data and code will be available at: https://github.com/3M-OS/3MOS.\n","authors":["Yibin Ye","Xichao Teng","Shuo Chen","Yijie Bian","Tao Tan","Zhang Li"],"pdf_url":"https://arxiv.org/pdf/2404.00838v1.pdf","comment":"20pages 17 figures"},{"id":"http://arxiv.org/abs/2404.00837v1","updated":"2024-04-01T00:23:22Z","published":"2024-04-01T00:23:22Z","title":"Automated HER2 Scoring in Breast Cancer Images Using Deep Learning and\n Pyramid Sampling","summary":" Human epidermal growth factor receptor 2 (HER2) is a critical protein in\ncancer cell growth that signifies the aggressiveness of breast cancer (BC) and\nhelps predict its prognosis. Accurate assessment of immunohistochemically (IHC)\nstained tissue slides for HER2 expression levels is essential for both\ntreatment guidance and understanding of cancer mechanisms. Nevertheless, the\ntraditional workflow of manual examination by board-certified pathologists\nencounters challenges, including inter- and intra-observer inconsistency and\nextended turnaround times. Here, we introduce a deep learning-based approach\nutilizing pyramid sampling for the automated classification of HER2 status in\nIHC-stained BC tissue images. 
Our approach analyzes morphological features at\nvarious spatial scales, efficiently managing the computational load and\nfacilitating a detailed examination of cellular and larger-scale tissue-level\ndetails. This method addresses the tissue heterogeneity of HER2 expression by\nproviding a comprehensive view, leading to a blind testing classification\naccuracy of 84.70%, on a dataset of 523 core images from tissue microarrays.\nOur automated system, proving reliable as an adjunct pathology tool, has the\npotential to enhance diagnostic precision and evaluation speed, and might\nsignificantly impact cancer treatment planning.\n","authors":["Sahan Yoruc Selcuk","Xilin Yang","Bijie Bai","Yijie Zhang","Yuzhu Li","Musa Aydin","Aras Firat Unal","Aditya Gomatam","Zhen Guo","Darrow Morgan Angus","Goren Kolodney","Karine Atlan","Tal Keidar Haran","Nir Pillar","Aydogan Ozcan"],"pdf_url":"https://arxiv.org/pdf/2404.00837v1.pdf","comment":"21 Pages, 7 Figures"},{"id":"http://arxiv.org/abs/2404.00834v1","updated":"2024-04-01T00:18:17Z","published":"2024-04-01T00:18:17Z","title":"Towards Robust Event-guided Low-Light Image Enhancement: A Large-Scale\n Real-World Event-Image Dataset and Novel Approach","summary":" Event camera has recently received much attention for low-light image\nenhancement (LIE) thanks to their distinct advantages, such as high dynamic\nrange. However, current research is prohibitively restricted by the lack of\nlarge-scale, real-world, and spatial-temporally aligned event-image datasets.\nTo this end, we propose a real-world (indoor and outdoor) dataset comprising\nover 30K pairs of images and events under both low and normal illumination\nconditions. To achieve this, we utilize a robotic arm that traces a consistent\nnon-linear trajectory to curate the dataset with spatial alignment precision\nunder 0.03mm. We then introduce a matching alignment strategy, rendering 90% of\nour dataset with errors less than 0.01s. Based on the dataset, we propose a\nnovel event-guided LIE approach, called EvLight, towards robust performance in\nreal-world low-light scenes. Specifically, we first design the multi-scale\nholistic fusion branch to extract holistic structural and textural information\nfrom both events and images. To ensure robustness against variations in the\nregional illumination and noise, we then introduce a Signal-to-Noise-Ratio\n(SNR)-guided regional feature selection to selectively fuse features of images\nfrom regions with high SNR and enhance those with low SNR by extracting\nregional structure information from events. Extensive experiments on our\ndataset and the synthetic SDSD dataset demonstrate our EvLight significantly\nsurpasses the frame-based methods. Code and datasets are available at\nhttps://vlislab22.github.io/eg-lowlight/.\n","authors":["Guoqiang Liang","Kanghao Chen","Hangyu Li","Yunfan Lu","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00834v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01438v1","updated":"2024-04-01T19:22:43Z","published":"2024-04-01T19:22:43Z","title":"Generation and Detection of Sign Language Deepfakes -- A Linguistic and\n Visual Analysis","summary":" A question in the realm of deepfakes is slowly emerging pertaining to whether\nwe can go beyond facial deepfakes and whether it would be beneficial to\nsociety. Therefore, this research presents a positive application of deepfake\ntechnology in upper body generation, while performing sign-language for the\nDeaf and Hard of Hearing (DHoH) community. 
The resulting videos are later\nvetted with a sign language expert. This is particularly helpful, given the\nintricate nature of sign language, a scarcity of sign language experts, and\npotential benefits for health and education. The objectives of this work\nencompass constructing a reliable deepfake dataset, evaluating its technical\nand visual credibility through computer vision and natural language processing\nmodels, and assessing the plausibility of the generated content. With over 1200\nvideos, featuring both previously seen and unseen individuals for the\ngeneration model, using the help of a sign language expert, we establish a\ndeepfake dataset in sign language that can further be utilized to detect fake\nvideos that may target certain people of determination.\n","authors":["Shahzeb Naeem","Muhammad Riyyan Khan","Usman Tariq","Abhinav Dhall","Carlos Ivan Colon","Hasan Al-Nashash"],"pdf_url":"https://arxiv.org/pdf/2404.01438v1.pdf","comment":"13 pages, 13 figures, Computer Vision and Image Understanding Journal"},{"id":"http://arxiv.org/abs/2404.03687v1","updated":"2024-04-01T20:44:28Z","published":"2024-04-01T20:44:28Z","title":"DRIVE: Dual Gradient-Based Rapid Iterative Pruning","summary":" Modern deep neural networks (DNNs) consist of millions of parameters,\nnecessitating high-performance computing during training and inference. Pruning\nis one solution that significantly reduces the space and time complexities of\nDNNs. Traditional pruning methods that are applied post-training focus on\nstreamlining inference, but there are recent efforts to leverage sparsity early\non by pruning before training. Pruning methods, such as iterative\nmagnitude-based pruning (IMP) achieve up to a 90% parameter reduction while\nretaining accuracy comparable to the original model. However, this leads to\nimpractical runtime as it relies on multiple train-prune-reset cycles to\nidentify and eliminate redundant parameters. In contrast, training agnostic\nearly pruning methods, such as SNIP and SynFlow offer fast pruning but fall\nshort of the accuracy achieved by IMP at high sparsities. To bridge this gap,\nwe present Dual Gradient-Based Rapid Iterative Pruning (DRIVE), which leverages\ndense training for initial epochs to counteract the randomness inherent at the\ninitialization. Subsequently, it employs a unique dual gradient-based metric\nfor parameter ranking. It has been experimentally demonstrated for VGG and\nResNet architectures on CIFAR-10/100 and Tiny ImageNet, and ResNet on ImageNet\nthat DRIVE consistently has superior performance over other training-agnostic\nearly pruning methods in accuracy. Notably, DRIVE is 43$\\times$ to 869$\\times$\nfaster than IMP for pruning.\n","authors":["Dhananjay Saikumar","Blesson Varghese"],"pdf_url":"https://arxiv.org/pdf/2404.03687v1.pdf","comment":null}]},"2024-03-31T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2312.01998v2","updated":"2024-03-31T22:58:09Z","published":"2023-12-04T16:22:06Z","title":"Language-only Efficient Training of Zero-shot Composed Image Retrieval","summary":" Composed image retrieval (CIR) task takes a composed query of image and text,\naiming to search relative images for both conditions. Conventional CIR\napproaches need a training dataset composed of triplets of query image, query\ntext, and target image, which is very expensive to collect. Several recent\nworks have worked on the zero-shot (ZS) CIR paradigm to tackle the issue\nwithout using pre-collected triplets. 
However, the existing ZS-CIR methods show\nlimited backbone scalability and generalizability due to the lack of diversity\nof the input texts during training. We propose a novel CIR framework, only\nusing language for its training. Our LinCIR (Language-only training for CIR)\ncan be trained only with text datasets by a novel self-supervision named\nself-masking projection (SMP). We project the text latent embedding to the\ntoken embedding space and construct a new text by replacing the keyword tokens\nof the original text. Then, we let the new and original texts have the same\nlatent embedding vector. With this simple strategy, LinCIR is surprisingly\nefficient and highly effective; LinCIR with CLIP ViT-G backbone is trained in\n48 minutes and shows the best ZS-CIR performances on four different CIR\nbenchmarks, CIRCO, GeneCIS, FashionIQ, and CIRR, even outperforming supervised\nmethod on FashionIQ. Code is available at https://github.com/navervision/lincir\n","authors":["Geonmo Gu","Sanghyuk Chun","Wonjae Kim","Yoohoon Kang","Sangdoo Yun"],"pdf_url":"https://arxiv.org/pdf/2312.01998v2.pdf","comment":"CVPR 2024 camera-ready; First two authors contributed equally; 17\n pages, 3.1MB"},{"id":"http://arxiv.org/abs/2311.13958v2","updated":"2024-03-31T22:39:12Z","published":"2023-11-23T12:16:33Z","title":"Handling The Non-Smooth Challenge in Tensor SVD: A Multi-Objective\n Tensor Recovery Framework","summary":" Recently, numerous tensor singular value decomposition (t-SVD)-based tensor\nrecovery methods have shown promise in processing visual data, such as color\nimages and videos. However, these methods often suffer from severe performance\ndegradation when confronted with tensor data exhibiting non-smooth changes. It\nhas been commonly observed in real-world scenarios but ignored by the\ntraditional t-SVD-based methods. In this work, we introduce a novel tensor\nrecovery model with a learnable tensor nuclear norm to address such a\nchallenge. We develop a new optimization algorithm named the Alternating\nProximal Multiplier Method (APMM) to iteratively solve the proposed tensor\ncompletion model. Theoretical analysis demonstrates the convergence of the\nproposed APMM to the Karush-Kuhn-Tucker (KKT) point of the optimization\nproblem. In addition, we propose a multi-objective tensor recovery framework\nbased on APMM to efficiently explore the correlations of tensor data across its\nvarious dimensions, providing a new perspective on extending the t-SVD-based\nmethod to higher-order tensor cases. Numerical experiments demonstrated the\neffectiveness of the proposed method in tensor completion.\n","authors":["Jingjing Zheng","Wanglong Lu","Wenzhe Wang","Yankai Cao","Xiaoqin Zhang","Xianta Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.13958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18822v2","updated":"2024-03-31T21:11:59Z","published":"2023-11-30T18:58:17Z","title":"ElasticDiffusion: Training-free Arbitrary Size Image Generation through\n Global-Local Content Separation","summary":" Diffusion models have revolutionized image generation in recent years, yet\nthey are still limited to a few sizes and aspect ratios. We propose\nElasticDiffusion, a novel training-free decoding method that enables pretrained\ntext-to-image diffusion models to generate images with various sizes.\nElasticDiffusion attempts to decouple the generation trajectory of a pretrained\nmodel into local and global signals. 
The local signal controls low-level pixel\ninformation and can be estimated on local patches, while the global signal is\nused to maintain overall structural consistency and is estimated with a\nreference image. We test our method on CelebA-HQ (faces) and LAION-COCO\n(objects/indoor/outdoor scenes). Our experiments and qualitative results show\nsuperior image coherence quality across aspect ratios compared to\nMultiDiffusion and the standard decoding strategy of Stable Diffusion. Project\npage: https://elasticdiffusion.github.io/\n","authors":["Moayed Haji-Ali","Guha Balakrishnan","Vicente Ordonez"],"pdf_url":"https://arxiv.org/pdf/2311.18822v2.pdf","comment":"Accepted at CVPR 2024. Project Page:\n https://elasticdiffusion.github.io/"},{"id":"http://arxiv.org/abs/2309.15204v2","updated":"2024-03-31T20:59:03Z","published":"2023-09-26T19:05:18Z","title":"CLRmatchNet: Enhancing Curved Lane Detection with Deep Matching Process","summary":" Lane detection plays a crucial role in autonomous driving by providing vital\ndata to ensure safe navigation. Modern algorithms rely on anchor-based\ndetectors, which are then followed by a label-assignment process to categorize\ntraining detections as positive or negative instances based on learned\ngeometric attributes. Accurate label assignment has great impact on the model\nperformance, that is usually relying on a pre-defined classical cost function\nevaluating GT-prediction alignment. However, classical label assignment methods\nface limitations due to their reliance on predefined cost functions derived\nfrom low-dimensional models, potentially impacting their optimality. Our\nresearch introduces MatchNet, a deep learning submodule-based approach aimed at\nimproving the label assignment process. Integrated into a state-of-the-art lane\ndetection network such as the Cross Layer Refinement Network for Lane Detection\n(CLRNet), MatchNet replaces the conventional label assignment process with a\nsubmodule network. The integrated model, CLRmatchNet, surpasses CLRNet, showing\nsubstantial improvements in scenarios involving curved lanes, with remarkable\nimprovement across all backbones of +2.8% for ResNet34, +2.3% for ResNet101,\nand +2.96% for DLA34. In addition, it maintains or even improves comparable\nresults in other sections. Our method boosts the confidence level in lane\ndetection, allowing an increase in the confidence threshold. Our code is\navailable at: https://github.com/sapirkontente/CLRmatchNet.git\n","authors":["Sapir Kontente","Roy Orfaig","Ben-Zion Bobrovsky"],"pdf_url":"https://arxiv.org/pdf/2309.15204v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13716v2","updated":"2024-03-31T19:23:55Z","published":"2023-11-22T22:20:10Z","title":"DiverseNet: Decision Diversified Semi-supervised Semantic Segmentation\n Networks for Remote Sensing Imagery","summary":" Semi-supervised learning aims to help reduce the cost of the manual labelling\nprocess by leveraging valuable features extracted from a substantial pool of\nunlabeled data alongside a limited set of labelled data during the training\nphase. Since pixel-level manual labelling in large-scale remote sensing imagery\nis expensive, semi-supervised learning becomes an appropriate solution to this.\nHowever, most of the existing consistency learning frameworks based on network\nperturbation are very bulky. There is still a lack of lightweight and efficient\nperturbation methods to promote the diversity of features and the precision of\npseudo labels during training. 
In order to fill this gap, we propose DiverseNet\nwhich explores multi-head and multi-model semi-supervised learning algorithms\nby simultaneously enhancing precision and diversity during training. The two\nproposed methods in the DiverseNet family, namely DiverseHead and DiverseModel,\nboth achieve the better semantic segmentation performance in four widely\nutilised remote sensing imagery data sets compared to state-of-the-art\nsemi-supervised learning methods. Meanwhile, the proposed DiverseHead\narchitecture is simple and relatively lightweight in terms of parameter space\ncompared to the state-of-the-art methods whilst reaching high-performance\nresults for all the tested data sets.\n","authors":["Wanli Ma","Oktay Karakus","Paul L. Rosin"],"pdf_url":"https://arxiv.org/pdf/2311.13716v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08409v2","updated":"2024-03-31T19:01:07Z","published":"2024-01-16T14:49:26Z","title":"Faster ISNet for Background Bias Mitigation on Deep Neural Networks","summary":" Bias or spurious correlations in image backgrounds can impact neural\nnetworks, causing shortcut learning (Clever Hans Effect) and hampering\ngeneralization to real-world data. ISNet, a recently introduced architecture,\nproposed the optimization of Layer-Wise Relevance Propagation (LRP, an\nexplanation technique) heatmaps, to mitigate the influence of backgrounds on\ndeep classifiers. However, ISNet's training time scales linearly with the\nnumber of classes in an application. Here, we propose reformulated\narchitectures whose training time becomes independent from this number.\nAdditionally, we introduce a concise and model-agnostic LRP implementation. We\nchallenge the proposed architectures using synthetic background bias, and\nCOVID-19 detection in chest X-rays, an application that commonly presents\nbackground bias. The networks hindered background attention and shortcut\nlearning, surpassing multiple state-of-the-art models on out-of-distribution\ntest datasets. Representing a potentially massive training speed improvement\nover ISNet, the proposed architectures introduce LRP optimization into a gamut\nof applications that the original model cannot feasibly handle.\n","authors":["Pedro R. A. S. Bassi","Sergio Decherchi","Andrea Cavalli"],"pdf_url":"https://arxiv.org/pdf/2401.08409v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10671v2","updated":"2024-03-31T18:37:10Z","published":"2023-12-17T10:07:03Z","title":"Open3DIS: Open-Vocabulary 3D Instance Segmentation with 2D Mask Guidance","summary":" We introduce Open3DIS, a novel solution designed to tackle the problem of\nOpen-Vocabulary Instance Segmentation within 3D scenes. Objects within 3D\nenvironments exhibit diverse shapes, scales, and colors, making precise\ninstance-level identification a challenging task. Recent advancements in\nOpen-Vocabulary scene understanding have made significant strides in this area\nby employing class-agnostic 3D instance proposal networks for object\nlocalization and learning queryable features for each 3D mask. While these\nmethods produce high-quality instance proposals, they struggle with identifying\nsmall-scale and geometrically ambiguous objects. The key idea of our method is\na new module that aggregates 2D instance masks across frames and maps them to\ngeometrically coherent point cloud regions as high-quality object proposals\naddressing the above limitations. These are then combined with 3D\nclass-agnostic instance proposals to include a wide range of objects in the\nreal world. 
To validate our approach, we conducted experiments on three\nprominent datasets, including ScanNet200, S3DIS, and Replica, demonstrating\nsignificant performance gains in segmenting objects with diverse categories\nover the state-of-the-art approaches.\n","authors":["Phuc D. A. Nguyen","Tuan Duc Ngo","Chuang Gan","Evangelos Kalogerakis","Anh Tran","Cuong Pham","Khoi Nguyen"],"pdf_url":"https://arxiv.org/pdf/2312.10671v2.pdf","comment":"CVPR 2024. Project page: https://open3dis.github.io/"},{"id":"http://arxiv.org/abs/2312.02142v4","updated":"2024-03-31T18:11:18Z","published":"2023-12-04T18:58:40Z","title":"Object Recognition as Next Token Prediction","summary":" We present an approach to pose object recognition as next token prediction.\nThe idea is to apply a language decoder that auto-regressively predicts the\ntext tokens from image embeddings to form labels. To ground this prediction\nprocess in auto-regression, we customize a non-causal attention mask for the\ndecoder, incorporating two key features: modeling tokens from different labels\nto be independent, and treating image tokens as a prefix. This masking\nmechanism inspires an efficient method - one-shot sampling - to simultaneously\nsample tokens of multiple labels in parallel and rank generated labels by their\nprobabilities during inference. To further enhance the efficiency, we propose a\nsimple strategy to construct a compact decoder by simply discarding the\nintermediate blocks of a pretrained language model. This approach yields a\ndecoder that matches the full model's performance while being notably more\nefficient. The code is available at https://github.com/kaiyuyue/nxtp\n","authors":["Kaiyu Yue","Bor-Chun Chen","Jonas Geiping","Hengduo Li","Tom Goldstein","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2312.02142v4.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2402.12289v3","updated":"2024-03-31T17:08:00Z","published":"2024-02-19T17:04:04Z","title":"DriveVLM: The Convergence of Autonomous Driving and Large\n Vision-Language Models","summary":" A primary hurdle of autonomous driving in urban environments is understanding\ncomplex and long-tail scenarios, such as challenging road conditions and\ndelicate human behaviors. We introduce DriveVLM, an autonomous driving system\nleveraging Vision-Language Models (VLMs) for enhanced scene understanding and\nplanning capabilities. DriveVLM integrates a unique combination of\nchain-of-thought (CoT) modules for scene description, scene analysis, and\nhierarchical planning. Furthermore, recognizing the limitations of VLMs in\nspatial reasoning and heavy computational requirements, we propose\nDriveVLM-Dual, a hybrid system that synergizes the strengths of DriveVLM with\nthe traditional autonomous driving pipeline. DriveVLM-Dual achieves robust\nspatial understanding and real-time inference speed. 
Extensive experiments on\nboth the nuScenes dataset and our SUP-AD dataset demonstrate the effectiveness\nof DriveVLM and the enhanced performance of DriveVLM-Dual, surpassing existing\nmethods in complex and unpredictable driving conditions.\n","authors":["Xiaoyu Tian","Junru Gu","Bailin Li","Yicheng Liu","Chenxu Hu","Yang Wang","Kun Zhan","Peng Jia","Xianpeng Lang","Hang Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.12289v3.pdf","comment":"Project Page: https://tsinghua-mars-lab.github.io/DriveVLM/"},{"id":"http://arxiv.org/abs/2311.14218v2","updated":"2024-03-31T17:05:15Z","published":"2023-11-23T22:27:31Z","title":"A New Benchmark and Model for Challenging Image Manipulation Detection","summary":" The ability to detect manipulation in multimedia data is vital in digital\nforensics. Existing Image Manipulation Detection (IMD) methods are mainly based\non detecting anomalous features arisen from image editing or double compression\nartifacts. All existing IMD techniques encounter challenges when it comes to\ndetecting small tampered regions from a large image. Moreover,\ncompression-based IMD approaches face difficulties in cases of double\ncompression of identical quality factors. To investigate the State-of-The-Art\n(SoTA) IMD methods in those challenging conditions, we introduce a new\nChallenging Image Manipulation Detection (CIMD) benchmark dataset, which\nconsists of two subsets, for evaluating editing-based and compression-based IMD\nmethods, respectively. The dataset images were manually taken and tampered with\nhigh-quality annotations. In addition, we propose a new two-branch network\nmodel based on HRNet that can better detect both the image-editing and\ncompression artifacts in those challenging conditions. Extensive experiments on\nthe CIMD benchmark show that our model significantly outperforms SoTA IMD\nmethods on CIMD.\n","authors":["Zhenfei Zhang","Mingyang Li","Ming-Ching Chang"],"pdf_url":"https://arxiv.org/pdf/2311.14218v2.pdf","comment":"9 pages, 6 figures, 3 tabels. AAAI-24"},{"id":"http://arxiv.org/abs/2309.11281v3","updated":"2024-03-31T16:59:45Z","published":"2023-09-20T13:05:42Z","title":"Language-driven Object Fusion into Neural Radiance Fields with\n Pose-Conditioned Dataset Updates","summary":" Neural radiance field is an emerging rendering method that generates\nhigh-quality multi-view consistent images from a neural scene representation\nand volume rendering. Although neural radiance field-based techniques are\nrobust for scene reconstruction, their ability to add or remove objects remains\nlimited. This paper proposes a new language-driven approach for object\nmanipulation with neural radiance fields through dataset updates. Specifically,\nto insert a new foreground object represented by a set of multi-view images\ninto a background radiance field, we use a text-to-image diffusion model to\nlearn and generate combined images that fuse the object of interest into the\ngiven background across views. These combined images are then used for refining\nthe background radiance field so that we can render view-consistent images\ncontaining both the object and the background. To ensure view consistency, we\npropose a dataset updates strategy that prioritizes radiance field training\nwith camera views close to the already-trained views prior to propagating the\ntraining to remaining views. We show that under the same dataset updates\nstrategy, we can easily adapt our method for object insertion using data from\ntext-to-3D models as well as object removal. 
Experimental results show that our\nmethod generates photorealistic images of the edited scenes, and outperforms\nstate-of-the-art methods in 3D reconstruction and neural radiance field\nblending.\n","authors":["Ka Chun Shum","Jaeyeon Kim","Binh-Son Hua","Duc Thanh Nguyen","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2309.11281v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2402.19276v4","updated":"2024-03-31T15:19:30Z","published":"2024-02-29T15:44:00Z","title":"Modular Blind Video Quality Assessment","summary":" Blind video quality assessment (BVQA) plays a pivotal role in evaluating and\nimproving the viewing experience of end-users across a wide range of\nvideo-based platforms and services. Contemporary deep learning-based models\nprimarily analyze video content in its aggressively subsampled format, while\nbeing blind to the impact of the actual spatial resolution and frame rate on\nvideo quality. In this paper, we propose a modular BVQA model and a method of\ntraining it to improve its modularity. Our model comprises a base quality\npredictor, a spatial rectifier, and a temporal rectifier, responding to the\nvisual content and distortion, spatial resolution, and frame rate changes on\nvideo quality, respectively. During training, spatial and temporal rectifiers\nare dropped out with some probabilities to render the base quality predictor a\nstandalone BVQA model, which should work better with the rectifiers. Extensive\nexperiments on both professionally-generated content and user-generated content\nvideo databases show that our quality model achieves superior or comparable\nperformance to current methods. Additionally, the modularity of our model\noffers an opportunity to analyze existing video quality databases in terms of\ntheir spatial and temporal complexity.\n","authors":["Wen Wen","Mu Li","Yabin Zhang","Yiting Liao","Junlin Li","Li Zhang","Kede Ma"],"pdf_url":"https://arxiv.org/pdf/2402.19276v4.pdf","comment":"Accepted by CVPR 2024; Camera-ready version"},{"id":"http://arxiv.org/abs/2303.06797v2","updated":"2024-03-31T14:35:18Z","published":"2023-03-13T01:07:32Z","title":"Multi-Channel Orthogonal Transform-Based Perceptron Layers for Efficient\n ResNets","summary":" In this paper, we propose a set of transform-based neural network layers as\nan alternative to the $3\\times3$ Conv2D layers in Convolutional Neural Networks\n(CNNs). The proposed layers can be implemented based on orthogonal transforms\nsuch as the Discrete Cosine Transform (DCT), Hadamard transform (HT), and\nbiorthogonal Block Wavelet Transform (BWT). Furthermore, by taking advantage of\nthe convolution theorems, convolutional filtering operations are performed in\nthe transform domain using element-wise multiplications. Trainable\nsoft-thresholding layers, that remove noise in the transform domain, bring\nnonlinearity to the transform domain layers. Compared to the Conv2D layer,\nwhich is spatial-agnostic and channel-specific, the proposed layers are\nlocation-specific and channel-specific. 
Moreover, these proposed layers reduce\nthe number of parameters and multiplications significantly while improving the\naccuracy results of regular ResNets on the ImageNet-1K classification task.\nFurthermore, they can be inserted with a batch normalization layer before the\nglobal average pooling layer in the conventional ResNets as an additional layer\nto improve classification accuracy.\n","authors":["Hongyi Pan","Emadeldeen Hamdan","Xin Zhu","Salih Atici","Ahmet Enis Cetin"],"pdf_url":"https://arxiv.org/pdf/2303.06797v2.pdf","comment":"This work is accepted to IEEE Transactions on Neural Networks and\n Learning Systems. The initial title is \"Orthogonal Transform Domain\n Approaches for the Convolutional Layer\". We changed it to \"Multi-Channel\n Orthogonal Transform-Based Perceptron Layers for Efficient ResNets\" based on\n reviewer's comment. arXiv admin note: text overlap with arXiv:2211.08577"},{"id":"http://arxiv.org/abs/2403.14119v3","updated":"2024-03-31T13:36:54Z","published":"2024-03-21T04:08:29Z","title":"C-TPT: Calibrated Test-Time Prompt Tuning for Vision-Language Models via\n Text Feature Dispersion","summary":" In deep learning, test-time adaptation has gained attention as a method for\nmodel fine-tuning without the need for labeled data. A prime exemplification is\nthe recently proposed test-time prompt tuning for large-scale vision-language\nmodels such as CLIP. Unfortunately, these prompts have been mainly developed to\nimprove accuracy, overlooking the importance of calibration, which is a crucial\naspect for quantifying prediction uncertainty. However, traditional calibration\nmethods rely on substantial amounts of labeled data, making them impractical\nfor test-time scenarios. To this end, this paper explores calibration during\ntest-time prompt tuning by leveraging the inherent properties of CLIP. Through\na series of observations, we find that the prompt choice significantly affects\nthe calibration in CLIP, where the prompts leading to higher text feature\ndispersion result in better-calibrated predictions. Introducing the Average\nText Feature Dispersion (ATFD), we establish its relationship with calibration\nerror and present a novel method, Calibrated Test-time Prompt Tuning (C-TPT),\nfor optimizing prompts during test-time with enhanced calibration. Through\nextensive experiments on different CLIP architectures and datasets, we show\nthat C-TPT can effectively improve the calibration of test-time prompt tuning\nwithout needing labeled data. The code is publicly accessible at\nhttps://github.com/hee-suk-yoon/C-TPT.\n","authors":["Hee Suk Yoon","Eunseop Yoon","Joshua Tian Jin Tee","Mark Hasegawa-Johnson","Yingzhen Li","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2403.14119v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2403.19456v2","updated":"2024-03-31T13:26:11Z","published":"2024-03-28T14:27:36Z","title":"Break-for-Make: Modular Low-Rank Adaptations for Composable\n Content-Style Customization","summary":" Personalized generation paradigms empower designers to customize visual\nintellectual properties with the help of textual descriptions by tuning or\nadapting pre-trained text-to-image models on a few images. Recent works explore\napproaches for concurrently customizing both content and detailed visual style\nappearance. However, these existing approaches often generate images where the\ncontent and style are entangled. 
In this study, we reconsider the customization\nof content and style concepts from the perspective of parameter space\nconstruction. Unlike existing methods that utilize a shared parameter space for\ncontent and style, we propose a learning framework that separates the parameter\nspace to facilitate individual learning of content and style, thereby enabling\ndisentangled content and style. To achieve this goal, we introduce \"partly\nlearnable projection\" (PLP) matrices to separate the original adapters into\ndivided sub-parameter spaces. We propose \"break-for-make\" customization\nlearning pipeline based on PLP, which is simple yet effective. We break the\noriginal adapters into \"up projection\" and \"down projection\", train content and\nstyle PLPs individually with the guidance of corresponding textual prompts in\nthe separate adapters, and maintain generalization by employing a\nmulti-correspondence projection learning strategy. Based on the adapters broken\napart for separate training content and style, we then make the entity\nparameter space by reconstructing the content and style PLPs matrices, followed\nby fine-tuning the combined adapter to generate the target object with the\ndesired appearance. Experiments on various styles, including textures,\nmaterials, and artistic style, show that our method outperforms\nstate-of-the-art single/multiple concept learning pipelines in terms of\ncontent-style-prompt alignment.\n","authors":["Yu Xu","Fan Tang","Juan Cao","Yuxin Zhang","Oliver Deussen","Weiming Dong","Jintao Li","Tong-Yee Lee"],"pdf_url":"https://arxiv.org/pdf/2403.19456v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18254v2","updated":"2024-03-31T13:13:37Z","published":"2023-11-30T05:05:38Z","title":"Sketch Input Method Editor: A Comprehensive Dataset and Methodology for\n Systematic Input Recognition","summary":" With the recent surge in the use of touchscreen devices, free-hand sketching\nhas emerged as a promising modality for human-computer interaction. While\nprevious research has focused on tasks such as recognition, retrieval, and\ngeneration of familiar everyday objects, this study aims to create a Sketch\nInput Method Editor (SketchIME) specifically designed for a professional C4I\nsystem. Within this system, sketches are utilized as low-fidelity prototypes\nfor recommending standardized symbols in the creation of comprehensive\nsituation maps. This paper also presents a systematic dataset comprising 374\nspecialized sketch types, and proposes a simultaneous recognition and\nsegmentation architecture with multilevel supervision between recognition and\nsegmentation to improve performance and enhance interpretability. By\nincorporating few-shot domain adaptation and class-incremental learning, the\nnetwork's ability to adapt to new users and extend to new task-specific classes\nis significantly enhanced. Results from experiments conducted on both the\nproposed dataset and the SPG dataset illustrate the superior performance of the\nproposed architecture. 
Our dataset and code are publicly available at\nhttps://github.com/GuangmingZhu/SketchIME.\n","authors":["Guangming Zhu","Siyuan Wang","Qing Cheng","Kelong Wu","Hao Li","Liang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.18254v2.pdf","comment":"The paper has been accepted by ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.08443v2","updated":"2024-03-31T12:39:48Z","published":"2023-08-16T15:51:05Z","title":"High-Fidelity Lake Extraction via Two-Stage Prompt Enhancement:\n Establishing a Novel Baseline and Benchmark","summary":" Lake extraction from remote sensing imagery is a complex challenge due to the\nvaried lake shapes and data noise. Current methods rely on multispectral image\ndatasets, making it challenging to learn lake features accurately from pixel\narrangements. This, in turn, affects model learning and the creation of\naccurate segmentation masks. This paper introduces a prompt-based dataset\nconstruction approach that provides approximate lake locations using point,\nbox, and mask prompts. We also propose a two-stage prompt enhancement\nframework, LEPrompter, with prompt-based and prompt-free stages during\ntraining. The prompt-based stage employs a prompt encoder to extract prior\ninformation, integrating prompt tokens and image embedding through self- and\ncross-attention in the prompt decoder. Prompts are deactivated to ensure\nindependence during inference, enabling automated lake extraction without\nintroducing additional parameters and GFlops. Extensive experiments showcase\nperformance improvements of our proposed approach compared to the previous\nstate-of-the-art method. The source code is available at\nhttps://github.com/BastianChen/LEPrompter.\n","authors":["Ben Chen","Xuechao Zou","Kai Li","Yu Zhang","Junliang Xing","Pin Tao"],"pdf_url":"https://arxiv.org/pdf/2308.08443v2.pdf","comment":"Accepted by ICME 2024"},{"id":"http://arxiv.org/abs/2308.01813v2","updated":"2024-03-31T12:27:16Z","published":"2023-08-03T15:21:08Z","title":"Deep Neural Networks Fused with Textures for Image Classification","summary":" Fine-grained image classification (FGIC) is a challenging task in computer\nvision for due to small visual differences among inter-subcategories, but,\nlarge intra-class variations. Deep learning methods have achieved remarkable\nsuccess in solving FGIC. In this paper, we propose a fusion approach to address\nFGIC by combining global texture with local patch-based information. The first\npipeline extracts deep features from various fixed-size non-overlapping patches\nand encodes features by sequential modelling using the long short-term memory\n(LSTM). Another path computes image-level textures at multiple scales using the\nlocal binary patterns (LBP). The advantages of both streams are integrated to\nrepresent an efficient feature vector for image classification. The method is\ntested on eight datasets representing the human faces, skin lesions, food\ndishes, marine lives, etc. using four standard backbone CNNs. 
Our method has\nattained better classification accuracy than existing methods by notable\nmargins.\n","authors":["Asish Bera","Debotosh Bhattacharjee","Mita Nasipuri"],"pdf_url":"https://arxiv.org/pdf/2308.01813v2.pdf","comment":"14 pages, 6 figures, 4 tables, conference"},{"id":"http://arxiv.org/abs/2402.12677v2","updated":"2024-03-31T12:18:51Z","published":"2024-02-20T02:54:03Z","title":"Object-level Geometric Structure Preserving for Natural Image Stitching","summary":" The topic of stitching images with globally natural structures holds\nparamount significance. Current methodologies exhibit the ability to preserve\nlocal geometric structures, yet fall short in maintaining relationships between\nthese geometric structures. In this paper, we endeavor to safeguard the\noverall, OBJect-level structures within images based on Global Similarity\nPrior, while concurrently mitigating distortion and ghosting artifacts with\nOBJ-GSP. Our approach leverages the Segment Anything Model to extract geometric\nstructures with semantic information, enhancing the algorithm's ability to\npreserve objects in a manner that aligns more intuitively with human\nperception. We seek to identify spatial constraints that govern the\nrelationships between various geometric boundaries. Recognizing that multiple\ngeometric boundaries collectively define complete objects, we employ triangular\nmeshes to safeguard not only individual geometric structures but also the\noverall shapes of objects within the images. Empirical evaluations across\nmultiple image stitching datasets demonstrate that our method establishes a new\nstate-of-the-art benchmark in image stitching. Our implementation and dataset\nare publicly available at https://github.com/RussRobin/OBJ-GSP .\n","authors":["Wenxiao Cai","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2402.12677v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04192v4","updated":"2024-03-31T12:10:24Z","published":"2023-07-09T14:54:30Z","title":"Self-Adaptive Sampling for Efficient Video Question-Answering on\n Image--Text Models","summary":" Video question-answering is a fundamental task in the field of video\nunderstanding. Although current vision--language models (VLMs) equipped with\nVideo Transformers have enabled temporal modeling and yielded superior results,\nthey come at the cost of huge computational power and are thus too expensive to\ndeploy in real-time application scenarios. An economical workaround only\nsamples a small portion of frames to represent the main content of that video\nand tunes an image--text model on these sampled frames. Recent video\nunderstanding models usually randomly sample a set of frames or clips,\nregardless of internal correlations between their visual contents or their\nrelevance to the problem. We argue that such kinds of aimless sampling may omit\nthe key frames from which the correct answer can be deduced, and the situation\ngets worse when the sampling sparsity increases, which always happens as the\nvideo lengths increase. To mitigate this issue, we propose two frame sampling\nstrategies, namely the most domain frames (MDF) and most implied frames (MIF),\nto maximally preserve those frames that are most likely vital to the given\nquestions. MDF passively minimizes the risk of key frame omission in a\nbootstrap manner, while MIF actively searches key frames customized for each\nvideo--question pair with the assistance of auxiliary models. 
The experimental\nresults on three public datasets from three advanced VLMs (CLIP, GIT and\nAll-in-one) demonstrate that our proposed strategies can boost the performance\nfor image-text pretrained models. The source codes pertaining to the method\nproposed in this paper are publicly available at\nhttps://github.com/declare-lab/sas-vqa.\n","authors":["Wei Han","Hui Chen","Min-Yen Kan","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2307.04192v4.pdf","comment":"13 pages, 7 figures, accepted to Findings of NAACL 2024"},{"id":"http://arxiv.org/abs/2311.12588v2","updated":"2024-03-31T12:06:55Z","published":"2023-11-21T13:21:22Z","title":"HiPose: Hierarchical Binary Surface Encoding and Correspondence Pruning\n for RGB-D 6DoF Object Pose Estimation","summary":" In this work, we present a novel dense-correspondence method for 6DoF object\npose estimation from a single RGB-D image. While many existing data-driven\nmethods achieve impressive performance, they tend to be time-consuming due to\ntheir reliance on rendering-based refinement approaches. To circumvent this\nlimitation, we present HiPose, which establishes 3D-3D correspondences in a\ncoarse-to-fine manner with a hierarchical binary surface encoding. Unlike\nprevious dense-correspondence methods, we estimate the correspondence surface\nby employing point-to-surface matching and iteratively constricting the surface\nuntil it becomes a correspondence point while gradually removing outliers.\nExtensive experiments on public benchmarks LM-O, YCB-V, and T-Less demonstrate\nthat our method surpasses all refinement-free methods and is even on par with\nexpensive refinement-based approaches. Crucially, our approach is\ncomputationally efficient and enables real-time critical applications with high\naccuracy requirements.\n","authors":["Yongliang Lin","Yongzhi Su","Praveen Nathan","Sandeep Inuganti","Yan Di","Martin Sundermeyer","Fabian Manhardt","Didier Stricke","Jason Rambach","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12588v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2302.02314v4","updated":"2024-03-31T11:58:28Z","published":"2023-02-05T06:27:45Z","title":"CECT: Controllable Ensemble CNN and Transformer for COVID-19 Image\n Classification","summary":" The COVID-19 pandemic has resulted in hundreds of million cases and numerous\ndeaths worldwide. Here, we develop a novel classification network CECT by\ncontrollable ensemble convolutional neural network and transformer to provide a\ntimely and accurate COVID-19 diagnosis. The CECT is composed of a parallel\nconvolutional encoder block, an aggregate transposed-convolutional decoder\nblock, and a windowed attention classification block. Each block captures\nfeatures at different scales from 28 $\\times$ 28 to 224 $\\times$ 224 from the\ninput, composing enriched and comprehensive information. Different from\nexisting methods, our CECT can capture features at both multi-local and global\nscales without any sophisticated module design. Moreover, the contribution of\nlocal features at different scales can be controlled with the proposed ensemble\ncoefficients. We evaluate CECT on two public COVID-19 datasets and it reaches\nthe highest accuracy of 98.1% in the intra-dataset evaluation, outperforming\nexisting state-of-the-art methods. Moreover, the developed CECT achieves an\naccuracy of 90.9% on the unseen dataset in the inter-dataset evaluation,\nshowing extraordinary generalization ability. 
With remarkable feature capture\nability and generalization ability, we believe CECT can be extended to other\nmedical scenarios as a powerful diagnosis tool. Code is available at\nhttps://github.com/NUS-Tim/CECT.\n","authors":["Zhaoshan Liu","Lei Shen"],"pdf_url":"https://arxiv.org/pdf/2302.02314v4.pdf","comment":"Computers in Biology and Medicine Accepted"},{"id":"http://arxiv.org/abs/2303.11797v2","updated":"2024-03-31T11:53:55Z","published":"2023-03-21T12:28:21Z","title":"CAT-Seg: Cost Aggregation for Open-Vocabulary Semantic Segmentation","summary":" Open-vocabulary semantic segmentation presents the challenge of labeling each\npixel within an image based on a wide range of text descriptions. In this work,\nwe introduce a novel cost-based approach to adapt vision-language foundation\nmodels, notably CLIP, for the intricate task of semantic segmentation. Through\naggregating the cosine similarity score, i.e., the cost volume between image\nand text embeddings, our method potently adapts CLIP for segmenting seen and\nunseen classes by fine-tuning its encoders, addressing the challenges faced by\nexisting methods in handling unseen classes. Building upon this, we explore\nmethods to effectively aggregate the cost volume considering its multi-modal\nnature of being established between image and text embeddings. Furthermore, we\nexamine various methods for efficiently fine-tuning CLIP.\n","authors":["Seokju Cho","Heeseong Shin","Sunghwan Hong","Anurag Arnab","Paul Hongsuck Seo","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2303.11797v2.pdf","comment":"Accepted to CVPR 2024. Project page:\n https://ku-cvlab.github.io/CAT-Seg/"},{"id":"http://arxiv.org/abs/2310.14159v3","updated":"2024-03-31T10:51:06Z","published":"2023-10-22T03:01:38Z","title":"Can Language Models Laugh at YouTube Short-form Videos?","summary":" As short-form funny videos on social networks are gaining popularity, it\nbecomes demanding for AI models to understand them for better communication\nwith humans. Unfortunately, previous video humor datasets target specific\ndomains, such as speeches or sitcoms, and mostly focus on verbal cues. We\ncurate a user-generated dataset of 10K multimodal funny videos from YouTube,\ncalled ExFunTube. Using a video filtering pipeline with GPT-3.5, we verify both\nverbal and visual elements contributing to humor. After filtering, we annotate\neach video with timestamps and text explanations for funny moments. Our\nExFunTube is unique over existing datasets in that our videos cover a wide\nrange of domains with various types of humor that necessitate a multimodal\nunderstanding of the content. Also, we develop a zero-shot video-to-text\nprompting to maximize video humor understanding of large language models\n(LLMs). With three different evaluation methods using automatic scores,\nrationale quality experiments, and human evaluations, we show that our\nprompting significantly improves LLMs' ability for humor explanation.\n","authors":["Dayoon Ko","Sangho Lee","Gunhee Kim"],"pdf_url":"https://arxiv.org/pdf/2310.14159v3.pdf","comment":"EMNLP 2023; references added"},{"id":"http://arxiv.org/abs/2312.04553v2","updated":"2024-03-31T10:27:03Z","published":"2023-12-07T18:59:21Z","title":"SPIDeRS: Structured Polarization for Invisible Depth and Reflectance\n Sensing","summary":" Can we capture shape and reflectance in stealth? Such capability would be\nvaluable for many application domains in vision, xR, robotics, and HCI. 
We\nintroduce structured polarization for invisible depth and reflectance sensing\n(SPIDeRS), the first depth and reflectance sensing method using patterns of\npolarized light. The key idea is to modulate the angle of linear polarization\n(AoLP) of projected light at each pixel. The use of polarization makes it\ninvisible and lets us recover not only depth but also directly surface normals\nand even reflectance. We implement SPIDeRS with a liquid crystal spatial light\nmodulator (SLM) and a polarimetric camera. We derive a novel method for\nrobustly extracting the projected structured polarization pattern from the\npolarimetric object appearance. We evaluate the effectiveness of SPIDeRS by\napplying it to a number of real-world objects. The results show that our method\nsuccessfully reconstructs object shapes of various materials and is robust to\ndiffuse reflection and ambient light. We also demonstrate relighting using\nrecovered surface normals and reflectance. We believe SPIDeRS opens a new\navenue of polarization use in visual sensing.\n","authors":["Tomoki Ichikawa","Shohei Nobuhara","Ko Nishino"],"pdf_url":"https://arxiv.org/pdf/2312.04553v2.pdf","comment":"to be published in CVPR 2024"},{"id":"http://arxiv.org/abs/2312.01196v2","updated":"2024-03-31T10:20:37Z","published":"2023-12-02T18:06:24Z","title":"Neural Parametric Gaussians for Monocular Non-Rigid Object\n Reconstruction","summary":" Reconstructing dynamic objects from monocular videos is a severely\nunderconstrained and challenging problem, and recent work has approached it in\nvarious directions. However, owing to the ill-posed nature of this problem,\nthere has been no solution that can provide consistent, high-quality novel\nviews from camera positions that are significantly different from the training\nviews. In this work, we introduce Neural Parametric Gaussians (NPGs) to take on\nthis challenge by imposing a two-stage approach: first, we fit a low-rank\nneural deformation model, which then is used as regularization for non-rigid\nreconstruction in the second stage. The first stage learns the object's\ndeformations such that it preserves consistency in novel views. The second\nstage obtains high reconstruction quality by optimizing 3D Gaussians that are\ndriven by the coarse model. To this end, we introduce a local 3D Gaussian\nrepresentation, where temporally shared Gaussians are anchored in and deformed\nby local oriented volumes. The resulting combined model can be rendered as\nradiance fields, resulting in high-quality photo-realistic reconstructions of\nthe non-rigidly deforming objects. We demonstrate that NPGs achieve superior\nresults compared to previous works, especially in challenging scenarios with\nfew multi-view cues.\n","authors":["Devikalyan Das","Christopher Wewer","Raza Yunus","Eddy Ilg","Jan Eric Lenssen"],"pdf_url":"https://arxiv.org/pdf/2312.01196v2.pdf","comment":"Accepted at CVPR 2024 | Project Website:\n https://geometric-rl.mpi-inf.mpg.de/npg"},{"id":"http://arxiv.org/abs/2302.10306v2","updated":"2024-03-31T10:01:44Z","published":"2023-01-25T11:00:32Z","title":"Deep Convolutional Framelet Denoising for Panoramic by Mixed Wavelet\n Integration","summary":" Enhancing quality and removing noise during preprocessing is one of the most\ncritical steps in image processing. X-ray images are created by photons\ncolliding with atoms and the variation in scattered noise absorption. 
This\nnoise leads to a deterioration in the graph's medical quality and, at times,\nresults in repetition, thereby increasing the patient's effective dose. One of\nthe most critical challenges in this area has consistently been lowering the\nimage noise. Techniques like BM3d, low-pass filters, and Autoencoder have taken\nthis step. Owing to their structural design and high rate of repetition, neural\nnetworks employing diverse architectures have, over the past decade, achieved\nnoise reduction with satisfactory outcomes, surpassing the traditional BM3D and\nlow-pass filters. The combination of the Hankel matrix with neural networks\nrepresents one of these configurations. The Hankel matrix aims to identify a\nlocal circle by separating individual values into local and non-local\ncomponents, utilizing a non-local matrix. A non-local matrix can be created\nusing the wave or DCT. This paper suggests integrating the waveform with the\nDaubechies (D4) wavelet due to its higher energy concentration and employs the\nu-Net neural network architecture, which incorporates the waveform exclusively\nat each stage. The outcomes were evaluated using the PSNR and SSIM criteria,\nand the outcomes were verified by using various waves. The effectiveness of a\none-wave network has increased from 0.5% to 1.2%, according to studies done on\nother datasets.\n","authors":["Masoud Shahraki Mohammadi","Seyed Javad Seyed Mahdavi Chabok"],"pdf_url":"https://arxiv.org/pdf/2302.10306v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08255v3","updated":"2024-03-31T09:33:50Z","published":"2023-12-13T16:18:40Z","title":"OCTDL: Optical Coherence Tomography Dataset for Image-Based Deep\n Learning Methods","summary":" Optical coherence tomography (OCT) is a non-invasive imaging technique with\nextensive clinical applications in ophthalmology. OCT enables the visualization\nof the retinal layers, playing a vital role in the early detection and\nmonitoring of retinal diseases. OCT uses the principle of light wave\ninterference to create detailed images of the retinal microstructures, making\nit a valuable tool for diagnosing ocular conditions. This work presents an\nopen-access OCT dataset (OCTDL) comprising over 2000 OCT images labeled\naccording to disease group and retinal pathology. The dataset consists of OCT\nrecords of patients with Age-related Macular Degeneration (AMD), Diabetic\nMacular Edema (DME), Epiretinal Membrane (ERM), Retinal Artery Occlusion (RAO),\nRetinal Vein Occlusion (RVO), and Vitreomacular Interface Disease (VID). The\nimages were acquired with an Optovue Avanti RTVue XR using raster scanning\nprotocols with dynamic scan length and image resolution. Each retinal b-scan\nwas acquired by centering on the fovea and interpreted and cataloged by an\nexperienced retinal specialist. 
In this work, we applied Deep Learning\nclassification techniques to this new open-access dataset.\n","authors":["Mikhail Kulyabin","Aleksei Zhdanov","Anastasia Nikiforova","Andrey Stepichev","Anna Kuznetsova","Mikhail Ronkin","Vasilii Borisov","Alexander Bogachev","Sergey Korotkich","Paul A Constable","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2312.08255v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05869v2","updated":"2024-03-31T09:31:56Z","published":"2024-02-08T17:57:59Z","title":"Adaptive Surface Normal Constraint for Geometric Estimation from\n Monocular Images","summary":" We introduce a novel approach to learn geometries such as depth and surface\nnormal from images while incorporating geometric context. The difficulty of\nreliably capturing geometric context in existing methods impedes their ability\nto accurately enforce the consistency between the different geometric\nproperties, thereby leading to a bottleneck of geometric estimation quality. We\ntherefore propose the Adaptive Surface Normal (ASN) constraint, a simple yet\nefficient method. Our approach extracts geometric context that encodes the\ngeometric variations present in the input image and correlates depth estimation\nwith geometric constraints. By dynamically determining reliable local geometry\nfrom randomly sampled candidates, we establish a surface normal constraint,\nwhere the validity of these candidates is evaluated using the geometric\ncontext. Furthermore, our normal estimation leverages the geometric context to\nprioritize regions that exhibit significant geometric variations, which makes\nthe predicted normals accurately capture intricate and detailed geometric\ninformation. Through the integration of geometric context, our method unifies\ndepth and surface normal estimations within a cohesive framework, which enables\nthe generation of high-quality 3D geometry from images. We validate the\nsuperiority of our approach over state-of-the-art methods through extensive\nevaluations and comparisons on diverse indoor and outdoor datasets, showcasing\nits efficiency and robustness.\n","authors":["Xiaoxiao Long","Yuhang Zheng","Yupeng Zheng","Beiwen Tian","Cheng Lin","Lingjie Liu","Hao Zhao","Guyue Zhou","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2402.05869v2.pdf","comment":"Accepted by TPAMI. arXiv admin note: substantial text overlap with\n arXiv:2103.15483"},{"id":"http://arxiv.org/abs/2311.15672v2","updated":"2024-03-31T09:10:24Z","published":"2023-11-27T10:01:31Z","title":"HAVE-FUN: Human Avatar Reconstruction from Few-Shot Unconstrained Images","summary":" As for human avatar reconstruction, contemporary techniques commonly\nnecessitate the acquisition of costly data and struggle to achieve satisfactory\nresults from a small number of casual images. In this paper, we investigate\nthis task from a few-shot unconstrained photo album. The reconstruction of\nhuman avatars from such data sources is challenging because of limited data\namount and dynamic articulated poses. For handling dynamic data, we integrate a\nskinning mechanism with deep marching tetrahedra (DMTet) to form a drivable\ntetrahedral representation, which drives arbitrary mesh topologies generated by\nthe DMTet for the adaptation of unconstrained images. To effectively mine\ninstructive information from few-shot data, we devise a two-phase optimization\nmethod with few-shot reference and few-shot guidance. 
The former focuses on\naligning avatar identity with reference images, while the latter aims to\ngenerate plausible appearances for unseen regions. Overall, our framework,\ncalled HaveFun, can undertake avatar reconstruction, rendering, and animation.\nExtensive experiments on our developed benchmarks demonstrate that HaveFun\nexhibits substantially superior performance in reconstructing the human body\nand hand. Project website: https://seanchenxy.github.io/HaveFunWeb/.\n","authors":["Xihe Yang","Xingyu Chen","Daiheng Gao","Shaohui Wang","Xiaoguang Han","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15672v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15070v2","updated":"2024-03-31T08:36:57Z","published":"2023-08-29T07:11:52Z","title":"DiffBIR: Towards Blind Image Restoration with Generative Diffusion Prior","summary":" We present DiffBIR, a general restoration pipeline that could handle\ndifferent blind image restoration tasks in a unified framework. DiffBIR\ndecouples blind image restoration problem into two stages: 1) degradation\nremoval: removing image-independent content; 2) information regeneration:\ngenerating the lost image content. Each stage is developed independently but\nthey work seamlessly in a cascaded manner. In the first stage, we use\nrestoration modules to remove degradations and obtain high-fidelity restored\nresults. For the second stage, we propose IRControlNet that leverages the\ngenerative ability of latent diffusion models to generate realistic details.\nSpecifically, IRControlNet is trained based on specially produced condition\nimages without distracting noisy content for stable generation performance.\nMoreover, we design a region-adaptive restoration guidance that can modify the\ndenoising process during inference without model re-training, allowing users to\nbalance realness and fidelity through a tunable guidance scale. Extensive\nexperiments have demonstrated DiffBIR's superiority over state-of-the-art\napproaches for blind image super-resolution, blind face restoration and blind\nimage denoising tasks on both synthetic and real-world datasets. The code is\navailable at https://github.com/XPixelGroup/DiffBIR.\n","authors":["Xinqi Lin","Jingwen He","Ziyan Chen","Zhaoyang Lyu","Bo Dai","Fanghua Yu","Wanli Ouyang","Yu Qiao","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2308.15070v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.13474v2","updated":"2024-03-31T08:12:21Z","published":"2022-08-29T10:19:10Z","title":"Prompt Tuning with Soft Context Sharing for Vision-Language Models","summary":" Vision-language models have recently shown great potential on many tasks in\ncomputer vision. Meanwhile, prior work demonstrates prompt tuning designed for\nvision-language models could acquire superior performance on few-shot image\nrecognition compared to linear probe, a strong baseline. In practice, many\nfew-shot tasks are inherently correlated, particularly within specialized\ndomains. However, such information is overlooked previously. Inspired by the\nfact that modeling task relationship by multi-task learning can usually boost\nperformance, we propose a novel method SoftCPT (Soft Context Sharing for Prompt\nTuning) to tune pre-trained vision-language models on multiple target few-shot\ntasks jointly. Specifically, we design a task-shared meta network to generate\nprompt context for each task using task name together with a learnable task\ncontext as input. 
The parameters of this meta network as well as the task\ncontext are tuned on the joint training set of all tasks. As such, the prompt\ncontext of all tasks will be shared in a soft manner. Extensive experiments\nacross four multi-task few-shot datasets covering 44 tasks and 1593 categories\ndemonstrate that SoftCPT significantly outperforms single-task prompt tuning\nmethods, highlighting the effectiveness of multi-task learning for\nvision-language prompt tuning. Code is available at\nhttps://github.com/kding1225/softcpt.\n","authors":["Kun Ding","Ying Wang","Pengzhang Liu","Qiang Yu","Haojian Zhang","Shiming Xiang","Chunhong Pan"],"pdf_url":"https://arxiv.org/pdf/2208.13474v2.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2311.16096v3","updated":"2024-03-31T08:06:12Z","published":"2023-11-27T18:59:04Z","title":"Animatable Gaussians: Learning Pose-dependent Gaussian Maps for\n High-fidelity Human Avatar Modeling","summary":" Modeling animatable human avatars from RGB videos is a long-standing and\nchallenging problem. Recent works usually adopt MLP-based neural radiance\nfields (NeRF) to represent 3D humans, but it remains difficult for pure MLPs to\nregress pose-dependent garment details. To this end, we introduce Animatable\nGaussians, a new avatar representation that leverages powerful 2D CNNs and 3D\nGaussian splatting to create high-fidelity avatars. To associate 3D Gaussians\nwith the animatable avatar, we learn a parametric template from the input\nvideos, and then parameterize the template on two front \\& back canonical\nGaussian maps where each pixel represents a 3D Gaussian. The learned template\nis adaptive to the wearing garments for modeling looser clothes like dresses.\nSuch template-guided 2D parameterization enables us to employ a powerful\nStyleGAN-based CNN to learn the pose-dependent Gaussian maps for modeling\ndetailed dynamic appearances. Furthermore, we introduce a pose projection\nstrategy for better generalization given novel poses. Overall, our method can\ncreate lifelike avatars with dynamic, realistic and generalized appearances.\nExperiments show that our method outperforms other state-of-the-art approaches.\nCode: https://github.com/lizhe00/AnimatableGaussians\n","authors":["Zhe Li","Zerong Zheng","Lizhen Wang","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16096v3.pdf","comment":"Accepted by CVPR 2024, Projectpage:\n https://animatable-gaussians.github.io/, Code:\n https://github.com/lizhe00/AnimatableGaussians"},{"id":"http://arxiv.org/abs/2210.09846v3","updated":"2024-03-31T07:50:22Z","published":"2022-10-15T11:00:54Z","title":"G-PECNet: Towards a Generalizable Pedestrian Trajectory Prediction\n System","summary":" Navigating dynamic physical environments without obstructing or damaging\nhuman assets is of quintessential importance for social robots. In this work,\nwe solve autonomous drone navigation's sub-problem of predicting out-of-domain\nhuman and agent trajectories using a deep generative model. Our method:\nGeneral-PECNet or G-PECNet observes an improvement of 9.5\\% on the Final\nDisplacement Error (FDE) on 2020's benchmark: PECNet through a combination of\narchitectural improvements inspired by periodic activation functions and\nsynthetic trajectory (data) augmentations using Hidden Markov Models (HMMs) and\nReinforcement Learning (RL). Additionally, we propose a simple\ngeometry-inspired metric for trajectory non-linearity and outlier detection,\nhelpful for the task. 
Code available at\nhttps://github.com/Aryan-Garg/PECNet-Pedestrian-Trajectory-Prediction.git\n","authors":["Aryan Garg","Renu M. Rameshan"],"pdf_url":"https://arxiv.org/pdf/2210.09846v3.pdf","comment":"Notable ICLR Tiny Paper 2024"},{"id":"http://arxiv.org/abs/2402.09989v3","updated":"2024-03-31T07:47:59Z","published":"2024-02-15T14:54:33Z","title":"LLMs as Bridges: Reformulating Grounded Multimodal Named Entity\n Recognition","summary":" Grounded Multimodal Named Entity Recognition (GMNER) is a nascent multimodal\ntask that aims to identify named entities, entity types and their corresponding\nvisual regions. GMNER task exhibits two challenging properties: 1) The weak\ncorrelation between image-text pairs in social media results in a significant\nportion of named entities being ungroundable. 2) There exists a distinction\nbetween coarse-grained referring expressions commonly used in similar tasks\n(e.g., phrase localization, referring expression comprehension) and\nfine-grained named entities. In this paper, we propose RiVEG, a unified\nframework that reformulates GMNER into a joint MNER-VE-VG task by leveraging\nlarge language models (LLMs) as a connecting bridge. This reformulation brings\ntwo benefits: 1) It maintains the optimal MNER performance and eliminates the\nneed for employing object detection methods to pre-extract regional features,\nthereby naturally addressing two major limitations of existing GMNER methods.\n2) The introduction of entity expansion expression and Visual Entailment (VE)\nModule unifies Visual Grounding (VG) and Entity Grounding (EG). It enables\nRiVEG to effortlessly inherit the Visual Entailment and Visual Grounding\ncapabilities of any current or prospective multimodal pretraining models.\nExtensive experiments demonstrate that RiVEG outperforms state-of-the-art\nmethods on the existing GMNER dataset and achieves absolute leads of 10.65%,\n6.21%, and 8.83% in all three subtasks.\n","authors":["Jinyuan Li","Han Li","Di Sun","Jiahao Wang","Wenkun Zhang","Zan Wang","Gang Pan"],"pdf_url":"https://arxiv.org/pdf/2402.09989v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07636v4","updated":"2024-03-31T07:42:17Z","published":"2024-03-12T13:18:22Z","title":"Decomposing Disease Descriptions for Enhanced Pathology Detection: A\n Multi-Aspect Vision-Language Pre-training Framework","summary":" Medical vision language pre-training (VLP) has emerged as a frontier of\nresearch, enabling zero-shot pathological recognition by comparing the query\nimage with the textual descriptions for each disease. Due to the complex\nsemantics of biomedical texts, current methods struggle to align medical images\nwith key pathological findings in unstructured reports. This leads to the\nmisalignment with the target disease's textual representation. In this paper,\nwe introduce a novel VLP framework designed to dissect disease descriptions\ninto their fundamental aspects, leveraging prior knowledge about the visual\nmanifestations of pathologies. This is achieved by consulting a large language\nmodel and medical experts. Integrating a Transformer module, our approach\naligns an input image with the diverse elements of a disease, generating\naspect-centric image representations. By consolidating the matches from each\naspect, we improve the compatibility between an image and its associated\ndisease. 
Additionally, capitalizing on the aspect-oriented representations, we\npresent a dual-head Transformer tailored to process known and unknown diseases,\noptimizing the comprehensive detection efficacy. Conducting experiments on\nseven downstream datasets, ours improves the accuracy of recent methods by up\nto 8.56% and 17.26% for seen and unseen categories, respectively. Our code is\nreleased at https://github.com/HieuPhan33/MAVL.\n","authors":["Vu Minh Hieu Phan","Yutong Xie","Yuankai Qi","Lingqiao Liu","Liyang Liu","Bowen Zhang","Zhibin Liao","Qi Wu","Minh-Son To","Johan W. Verjans"],"pdf_url":"https://arxiv.org/pdf/2403.07636v4.pdf","comment":"Accepted at CVPR2024. Pre-print before final camera-ready version"},{"id":"http://arxiv.org/abs/2312.01616v3","updated":"2024-03-31T05:57:57Z","published":"2023-12-04T04:14:09Z","title":"SchurVINS: Schur Complement-Based Lightweight Visual Inertial Navigation\n System","summary":" Accuracy and computational efficiency are the most important metrics for a\nVisual Inertial Navigation System (VINS). Existing VINS algorithms offer either\nhigh accuracy or low computational complexity, but struggle to provide\nhigh-precision localization on resource-constrained devices. To this end,\nwe propose a novel filter-based VINS framework named SchurVINS, which\nguarantees both high accuracy, by building a complete residual model, and low\ncomputational complexity, via the Schur complement. Technically, we first formulate\nthe full residual model where Gradient, Hessian and observation covariance are\nexplicitly modeled. Then the Schur complement is employed to decompose the full\nmodel into an ego-motion residual model and a landmark residual model. Finally,\nthe Extended Kalman Filter (EKF) update is implemented in these two models with\nhigh efficiency. Experiments on EuRoC and TUM-VI datasets show that our method\nnotably outperforms state-of-the-art (SOTA) methods in both accuracy and\ncomputational complexity. The experimental code of SchurVINS is available at\nhttps://github.com/bytedance/SchurVINS.\n","authors":["Yunfei Fan","Tianyu Zhao","Guidong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01616v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16051v2","updated":"2024-03-31T05:51:58Z","published":"2024-03-24T07:36:38Z","title":"Segment Anything Model for Road Network Graph Extraction","summary":" We propose SAM-Road, an adaptation of the Segment Anything Model (SAM) for\nextracting large-scale, vectorized road network graphs from satellite imagery.\nTo predict graph geometry, we formulate it as a dense semantic segmentation\ntask, leveraging the inherent strengths of SAM. The image encoder of SAM is\nfine-tuned to produce probability masks for roads and intersections, from which\nthe graph vertices are extracted via simple non-maximum suppression. To predict\ngraph topology, we designed a lightweight transformer-based graph neural\nnetwork, which leverages the SAM image embeddings to estimate the edge\nexistence probabilities between vertices. Our approach directly predicts the\ngraph vertices and edges for large regions without expensive and complex\npost-processing heuristics, and is capable of building complete road network\ngraphs spanning multiple square kilometers in a matter of seconds. With its\nsimple, straightforward, and minimalist design, SAM-Road achieves comparable\naccuracy to the state-of-the-art method RNGDet++, while being 40 times faster\non the City-scale dataset. 
We thus demonstrate the power of a foundational\nvision model when applied to a graph learning task. The code is available at\nhttps://github.com/htcr/sam_road.\n","authors":["Congrui Hetang","Haoru Xue","Cindy Le","Tianwei Yue","Wenping Wang","Yihui He"],"pdf_url":"https://arxiv.org/pdf/2403.16051v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10145v2","updated":"2024-03-31T05:24:35Z","published":"2024-03-15T09:44:02Z","title":"RCooper: A Real-world Large-scale Dataset for Roadside Cooperative\n Perception","summary":" The value of roadside perception, which could extend the boundaries of\nautonomous driving and traffic management, has gradually become more prominent\nand acknowledged in recent years. However, existing roadside perception\napproaches only focus on the single-infrastructure sensor system, which cannot\nrealize a comprehensive understanding of a traffic area because of the limited\nsensing range and blind spots. Orienting high-quality roadside perception, we\nneed Roadside Cooperative Perception (RCooper) to achieve practical\narea-coverage roadside perception for restricted traffic areas. Rcooper has its\nown domain-specific challenges, but further exploration is hindered due to the\nlack of datasets. We hence release the first real-world, large-scale RCooper\ndataset to bloom the research on practical roadside cooperative perception,\nincluding detection and tracking. The manually annotated dataset comprises 50k\nimages and 30k point clouds, including two representative traffic scenes (i.e.,\nintersection and corridor). The constructed benchmarks prove the effectiveness\nof roadside cooperation perception and demonstrate the direction of further\nresearch. Codes and dataset can be accessed at:\nhttps://github.com/AIR-THU/DAIR-RCooper.\n","authors":["Ruiyang Hao","Siqi Fan","Yingru Dai","Zhenlin Zhang","Chenxi Li","Yuntian Wang","Haibao Yu","Wenxian Yang","Jirui Yuan","Zaiqing Nie"],"pdf_url":"https://arxiv.org/pdf/2403.10145v2.pdf","comment":"Accepted by CVPR2024. 10 pages with 6 figures"},{"id":"http://arxiv.org/abs/2401.06415v2","updated":"2024-03-31T05:22:00Z","published":"2024-01-12T07:23:02Z","title":"3D Reconstruction of Interacting Multi-Person in Clothing from a Single\n Image","summary":" This paper introduces a novel pipeline to reconstruct the geometry of\ninteracting multi-person in clothing on a globally coherent scene space from a\nsingle image. The main challenge arises from the occlusion: a part of a human\nbody is not visible from a single view due to the occlusion by others or the\nself, which introduces missing geometry and physical implausibility (e.g.,\npenetration). We overcome this challenge by utilizing two human priors for\ncomplete 3D geometry and surface contacts. For the geometry prior, an encoder\nlearns to regress the image of a person with missing body parts to the latent\nvectors; a decoder decodes these vectors to produce 3D features of the\nassociated geometry; and an implicit network combines these features with a\nsurface normal map to reconstruct a complete and detailed 3D humans. For the\ncontact prior, we develop an image-space contact detector that outputs a\nprobability distribution of surface contacts between people in 3D. We use these\npriors to globally refine the body poses, enabling the penetration-free and\naccurate reconstruction of interacting multi-person in clothing on the scene\nspace. 
The results demonstrate that our method is complete, globally coherent,\nand physically plausible compared to existing methods.\n","authors":["Junuk Cha","Hansol Lee","Jaewon Kim","Nhat Nguyen Bao Truong","Jae Shin Yoon","Seungryul Baek"],"pdf_url":"https://arxiv.org/pdf/2401.06415v2.pdf","comment":"Accepted to WACV 2024"},{"id":"http://arxiv.org/abs/2312.16084v2","updated":"2024-03-31T04:45:58Z","published":"2023-12-26T15:14:37Z","title":"LangSplat: 3D Language Gaussian Splatting","summary":" Humans live in a 3D world and commonly use natural language to interact with\na 3D scene. Modeling a 3D language field to support open-ended language queries\nin 3D has gained increasing attention recently. This paper introduces\nLangSplat, which constructs a 3D language field that enables precise and\nefficient open-vocabulary querying within 3D spaces. Unlike existing methods\nthat ground CLIP language embeddings in a NeRF model, LangSplat advances the\nfield by utilizing a collection of 3D Gaussians, each encoding language\nfeatures distilled from CLIP, to represent the language field. By employing a\ntile-based splatting technique for rendering language features, we circumvent\nthe costly rendering process inherent in NeRF. Instead of directly learning\nCLIP embeddings, LangSplat first trains a scene-wise language autoencoder and\nthen learns language features on the scene-specific latent space, thereby\nalleviating substantial memory demands imposed by explicit modeling. Existing\nmethods struggle with imprecise and vague 3D language fields, which fail to\ndiscern clear boundaries between objects. We delve into this issue and propose\nto learn hierarchical semantics using SAM, thereby eliminating the need for\nextensively querying the language field across various scales and the\nregularization of DINO features. Extensive experimental results show that\nLangSplat significantly outperforms the previous state-of-the-art method LERF\nby a large margin. Notably, LangSplat is extremely efficient, achieving a 199\n$\\times$ speedup compared to LERF at the resolution of 1440 $\\times$ 1080. We\nstrongly recommend readers to check out our video results at\nhttps://langsplat.github.io/\n","authors":["Minghan Qin","Wanhua Li","Jiawei Zhou","Haoqian Wang","Hanspeter Pfister"],"pdf_url":"https://arxiv.org/pdf/2312.16084v2.pdf","comment":"CVPR 2024. Project Page: https://langsplat.github.io"},{"id":"http://arxiv.org/abs/2303.08314v3","updated":"2024-03-31T04:11:30Z","published":"2023-03-15T02:08:20Z","title":"Guided Slot Attention for Unsupervised Video Object Segmentation","summary":" Unsupervised video object segmentation aims to segment the most prominent\nobject in a video sequence. However, the existence of complex backgrounds and\nmultiple foreground objects make this task challenging. To address this issue,\nwe propose a guided slot attention network to reinforce spatial structural\ninformation and obtain better foreground--background separation. The foreground\nand background slots, which are initialized with query guidance, are\niteratively refined based on interactions with template information.\nFurthermore, to improve slot--template interaction and effectively fuse global\nand local features in the target and reference frames, K-nearest neighbors\nfiltering and a feature aggregation transformer are introduced. 
The proposed\nmodel achieves state-of-the-art performance on two popular datasets.\nAdditionally, we demonstrate the robustness of the proposed model in\nchallenging scenes through various comparative experiments.\n","authors":["Minhyeok Lee","Suhwan Cho","Dogyoon Lee","Chaewon Park","Jungho Lee","Sangyoun Lee"],"pdf_url":"https://arxiv.org/pdf/2303.08314v3.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2312.08963v2","updated":"2024-03-31T02:18:23Z","published":"2023-12-14T14:10:57Z","title":"LEMON: Learning 3D Human-Object Interaction Relation from 2D Images","summary":" Learning 3D human-object interaction relation is pivotal to embodied AI and\ninteraction modeling. Most existing methods approach the goal by learning to\npredict isolated interaction elements, e.g., human contact, object affordance,\nand human-object spatial relation, primarily from the perspective of either the\nhuman or the object. This underexploits certain correlations between the\ninteraction counterparts (human and object) and struggles to address the\nuncertainty in interactions. Actually, objects' functionalities potentially\naffect humans' interaction intentions, which reveals what the interaction is.\nMeanwhile, the interacting humans and objects exhibit matching geometric\nstructures, which presents how to interact. In light of this, we propose\nharnessing these inherent correlations between interaction counterparts to\nmitigate the uncertainty and jointly anticipate the above interaction elements\nin 3D space. To achieve this, we present LEMON (LEarning 3D huMan-Object\niNteraction relation), a unified model that mines interaction intentions of the\ncounterparts and employs curvatures to guide the extraction of geometric\ncorrelations, combining them to anticipate the interaction elements. Besides,\nthe 3D Interaction Relation dataset (3DIR) is collected to serve as the test\nbed for training and evaluation. Extensive experiments demonstrate the\nsuperiority of LEMON over methods estimating each element in isolation.\n","authors":["Yuhang Yang","Wei Zhai","Hongchen Luo","Yang Cao","Zheng-Jun Zha"],"pdf_url":"https://arxiv.org/pdf/2312.08963v2.pdf","comment":"accept by CVPR2024"},{"id":"http://arxiv.org/abs/2305.18171v4","updated":"2024-03-31T22:58:38Z","published":"2023-05-29T16:02:09Z","title":"Improved Probabilistic Image-Text Representations","summary":" The Image-Text Matching (ITM) task, a fundamental vision-language (VL) task,\nsuffers from the inherent ambiguity arising from multiplicity and imperfect\nannotations. Deterministic functions are not sufficiently powerful to capture\nambiguity, prompting the exploration of probabilistic embeddings to tackle the\nchallenge. However, the existing probabilistic ITM approach encounters two key\nshortcomings: the burden of heavy computations due to the Monte Carlo\napproximation, and the loss saturation issue in the face of abundant false\nnegatives. To overcome the issues, this paper presents an improved\nProbabilistic Cross-Modal Embeddings (named PCME++) by introducing a new\nprobabilistic distance with a closed-form solution. In addition, two\noptimization techniques are proposed to enhance PCME++ further: first, the\nincorporation of pseudo-positives to prevent the negative effect under massive\nfalse negatives; second, mixed sample data augmentation for probabilistic\nmatching. 
Experimental results on MS-COCO Caption and two extended benchmarks,\nCxC and ECCV Caption, demonstrate the effectiveness of PCME++ compared to\nstate-of-the-art ITM methods. The robustness of PCME++ is also evaluated under\nnoisy image-text correspondences. In addition, the potential applicability of\nPCME++ in automatic prompt-filtering for zero-shot classification is shown. The\ncode is available at https://github.com/naver-ai/pcmepp\n","authors":["Sanghyuk Chun"],"pdf_url":"https://arxiv.org/pdf/2305.18171v4.pdf","comment":"ICLR 2024 camera-ready; Code: https://github.com/naver-ai/pcmepp.\n Project page: https://naver-ai.github.io/pcmepp/. 30 pages, 2.2 MB"},{"id":"http://arxiv.org/abs/2404.00815v1","updated":"2024-03-31T22:18:56Z","published":"2024-03-31T22:18:56Z","title":"Towards Realistic Scene Generation with LiDAR Diffusion Models","summary":" Diffusion models (DMs) excel in photo-realistic image synthesis, but their\nadaptation to LiDAR scene generation poses a substantial hurdle. This is\nprimarily because DMs operating in the point space struggle to preserve the\ncurve-like patterns and 3D geometry of LiDAR scenes, which consumes much of\ntheir representation power. In this paper, we propose LiDAR Diffusion Models\n(LiDMs) to generate LiDAR-realistic scenes from a latent space tailored to\ncapture the realism of LiDAR scenes by incorporating geometric priors into the\nlearning pipeline. Our method targets three major desiderata: pattern realism,\ngeometry realism, and object realism. Specifically, we introduce curve-wise\ncompression to simulate real-world LiDAR patterns, point-wise coordinate\nsupervision to learn scene geometry, and patch-wise encoding for a full 3D\nobject context. With these three core designs, our method achieves competitive\nperformance on unconditional LiDAR generation in 64-beam scenario and state of\nthe art on conditional LiDAR generation, while maintaining high efficiency\ncompared to point-based DMs (up to 107$\\times$ faster). Furthermore, by\ncompressing LiDAR scenes into a latent space, we enable the controllability of\nDMs with various conditions such as semantic maps, camera views, and text\nprompts. Our code and pretrained weights are available at\nhttps://github.com/hancyran/LiDAR-Diffusion.\n","authors":["Haoxi Ran","Vitor Guizilini","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00815v1.pdf","comment":"CVPR 2024. Code available at\n https://github.com/hancyran/LiDAR-Diffusion"},{"id":"http://arxiv.org/abs/2404.00807v1","updated":"2024-03-31T21:43:08Z","published":"2024-03-31T21:43:08Z","title":"GAMA-IR: Global Additive Multidimensional Averaging for Fast Image\n Restoration","summary":" Deep learning-based methods have shown remarkable success for various image\nrestoration tasks such as denoising and deblurring. The current\nstate-of-the-art networks are relatively deep and utilize (variants of) self\nattention mechanisms. Those networks are significantly slower than shallow\nconvolutional networks, which however perform worse. In this paper, we\nintroduce an image restoration network that is both fast and yields excellent\nimage quality. The network is designed to minimize the latency and memory\nconsumption when executed on a standard GPU, while maintaining state-of-the-art\nperformance. The network is a simple shallow network with an efficient block\nthat implements global additive multidimensional averaging operations. 
This\nblock can capture global information and enable a large receptive field even\nwhen used in shallow networks with minimal computational overhead. Through\nextensive experiments and evaluations on diverse tasks, we demonstrate that our\nnetwork achieves comparable or even superior results to existing\nstate-of-the-art image restoration networks with less latency. For instance, we\nexceed the state-of-the-art result on real-world SIDD denoising by 0.11dB,\nwhile being 2 to 10 times faster.\n","authors":["Youssef Mansour","Reinhard Heckel"],"pdf_url":"https://arxiv.org/pdf/2404.00807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00801v1","updated":"2024-03-31T21:17:48Z","published":"2024-03-31T21:17:48Z","title":"$R^2$-Tuning: Efficient Image-to-Video Transfer Learning for Video\n Temporal Grounding","summary":" Video temporal grounding (VTG) is a fine-grained video understanding problem\nthat aims to ground relevant clips in untrimmed videos given natural language\nqueries. Most existing VTG models are built upon frame-wise final-layer CLIP\nfeatures, aided by additional temporal backbones (e.g., SlowFast) with\nsophisticated temporal reasoning mechanisms. In this work, we claim that CLIP\nitself already shows great potential for fine-grained spatial-temporal\nmodeling, as each layer offers distinct yet useful information under different\ngranularity levels. Motivated by this, we propose Reversed Recurrent Tuning\n($R^2$-Tuning), a parameter- and memory-efficient transfer learning framework\nfor video temporal grounding. Our method learns a lightweight $R^2$ Block\ncontaining only 1.5% of the total parameters to perform progressive\nspatial-temporal modeling. Starting from the last layer of CLIP, $R^2$ Block\nrecurrently aggregates spatial features from earlier layers, then refines\ntemporal correlation conditioning on the given query, resulting in a\ncoarse-to-fine scheme. $R^2$-Tuning achieves state-of-the-art performance\nacross three VTG tasks (i.e., moment retrieval, highlight detection, and video\nsummarization) on six public benchmarks (i.e., QVHighlights, Charades-STA,\nEgo4D-NLQ, TACoS, YouTube Highlights, and TVSum) even without the additional\nbackbone, demonstrating the significance and effectiveness of the proposed\nscheme. Our code is available at https://github.com/yeliudev/R2-Tuning.\n","authors":["Ye Liu","Jixuan He","Wanhua Li","Junsik Kim","Donglai Wei","Hanspeter Pfister","Chang Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.00801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00785v1","updated":"2024-03-31T20:08:23Z","published":"2024-03-31T20:08:23Z","title":"Disentangling Hippocampal Shape Variations: A Study of Neurological\n Disorders Using Graph Variational Autoencoder with Contrastive Learning","summary":" This paper presents a comprehensive study focused on disentangling\nhippocampal shape variations from diffusion tensor imaging (DTI) datasets\nwithin the context of neurological disorders. Leveraging a Graph Variational\nAutoencoder (VAE) enhanced with Supervised Contrastive Learning, our approach\naims to improve interpretability by disentangling two distinct latent variables\ncorresponding to age and the presence of diseases. In our ablation study, we\ninvestigate a range of VAE architectures and contrastive loss functions,\nshowcasing the enhanced disentanglement capabilities of our approach. This\nevaluation uses synthetic 3D torus mesh data and real 3D hippocampal mesh\ndatasets derived from the DTI hippocampal dataset. 
Our supervised\ndisentanglement model outperforms several state-of-the-art (SOTA) methods like\nattribute and guided VAEs in terms of disentanglement scores. Our model\ndistinguishes between age groups and disease status in patients with Multiple\nSclerosis (MS) using the hippocampus data. Our Graph VAE with Supervised\nContrastive Learning shows the volume changes of the hippocampus of MS\npopulations at different ages, and the result is consistent with the current\nneuroimaging literature. This research provides valuable insights into the\nrelationship between neurological disorder and hippocampal shape changes in\ndifferent age groups of MS populations using a Graph VAE with Supervised\nContrastive loss.\n","authors":["Jakaria Rabbi","Johannes Kiechle","Christian Beaulieu","Nilanjan Ray","Dana Cobzas"],"pdf_url":"https://arxiv.org/pdf/2404.00785v1.pdf","comment":"Length: 23 pages and submitted to the journal: MELBA (Machine\n Learning for Biomedical Imaging)"},{"id":"http://arxiv.org/abs/2404.00777v1","updated":"2024-03-31T19:28:04Z","published":"2024-03-31T19:28:04Z","title":"Privacy-preserving Optics for Enhancing Protection in Face\n De-identification","summary":" The modern surge in camera usage alongside widespread computer vision\ntechnology applications poses significant privacy and security concerns.\nCurrent artificial intelligence (AI) technologies aid in recognizing relevant\nevents and assisting in daily tasks in homes, offices, hospitals, etc. The need\nto access or process personal information for these purposes raises privacy\nconcerns. While software-level solutions like face de-identification provide a\ngood privacy/utility trade-off, they present vulnerabilities to sniffing\nattacks. In this paper, we propose a hardware-level face de-identification\nmethod to solve this vulnerability. Specifically, our approach first learns an\noptical encoder along with a regression model to obtain a face heatmap while\nhiding the face identity from the source image. We also propose an\nanonymization framework that generates a new face using the privacy-preserving\nimage, face heatmap, and a reference face image from a public dataset as input.\nWe validate our approach with extensive simulations and hardware experiments.\n","authors":["Jhon Lopez","Carlos Hinojosa","Henry Arguello","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2404.00777v1.pdf","comment":"Accepted to CVPR 2024. Project Website and Code coming soon"},{"id":"http://arxiv.org/abs/2404.00767v1","updated":"2024-03-31T18:45:13Z","published":"2024-03-31T18:45:13Z","title":"Intensity-based 3D motion correction for cardiac MR images","summary":" Cardiac magnetic resonance (CMR) image acquisition requires subjects to hold\ntheir breath while 2D cine images are acquired. This process assumes that the\nheart remains in the same position across all slices. However, differences in\nbreathhold positions or patient motion introduce 3D slice misalignments. In\nthis work, we propose an algorithm that simultaneously aligns all SA and LA\nslices by maximizing the pair-wise intensity agreement between their\nintersections. Unlike previous works, our approach is formulated as a\nsubject-specific optimization problem and requires no prior knowledge of the\nunderlying anatomy. 
We quantitatively demonstrate that the proposed method is\nrobust against a large range of rotations and translations by synthetically\nmisaligning 10 motion-free datasets and aligning them back using the proposed\nmethod.\n","authors":["Nil Stolt-Ansó","Vasiliki Sideri-Lampretsa","Maik Dannecker","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2404.00767v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00742v1","updated":"2024-03-31T17:18:57Z","published":"2024-03-31T17:18:57Z","title":"Adapting to Length Shift: FlexiLength Network for Trajectory Prediction","summary":" Trajectory prediction plays an important role in various applications,\nincluding autonomous driving, robotics, and scene understanding. Existing\napproaches mainly focus on developing compact neural networks to increase\nprediction precision on public datasets, typically employing a standardized\ninput duration. However, a notable issue arises when these models are evaluated\nwith varying observation lengths, leading to a significant performance drop, a\nphenomenon we term the Observation Length Shift. To address this issue, we\nintroduce a general and effective framework, the FlexiLength Network (FLN), to\nenhance the robustness of existing trajectory prediction techniques against\nvarying observation periods. Specifically, FLN integrates trajectory data with\ndiverse observation lengths, incorporates FlexiLength Calibration (FLC) to\nacquire temporal invariant representations, and employs FlexiLength Adaptation\n(FLA) to further refine these representations for more accurate future\ntrajectory predictions. Comprehensive experiments on multiple datasets, ie,\nETH/UCY, nuScenes, and Argoverse 1, demonstrate the effectiveness and\nflexibility of our proposed FLN framework.\n","authors":["Yi Xu","Yun Fu"],"pdf_url":"https://arxiv.org/pdf/2404.00742v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00741v1","updated":"2024-03-31T17:02:24Z","published":"2024-03-31T17:02:24Z","title":"Rethinking Interactive Image Segmentation with Low Latency, High\n Quality, and Diverse Prompts","summary":" The goal of interactive image segmentation is to delineate specific regions\nwithin an image via visual or language prompts. Low-latency and high-quality\ninteractive segmentation with diverse prompts remain challenging for existing\nspecialist and generalist models. Specialist models, with their limited prompts\nand task-specific designs, experience high latency because the image must be\nrecomputed every time the prompt is updated, due to the joint encoding of image\nand visual prompts. Generalist models, exemplified by the Segment Anything\nModel (SAM), have recently excelled in prompt diversity and efficiency, lifting\nimage segmentation to the foundation model era. However, for high-quality\nsegmentations, SAM still lags behind state-of-the-art specialist models despite\nSAM being trained with x100 more segmentation masks. In this work, we delve\ndeep into the architectural differences between the two types of models. We\nobserve that dense representation and fusion of visual prompts are the key\ndesign choices contributing to the high segmentation quality of specialist\nmodels. In light of this, we reintroduce this dense design into the generalist\nmodels, to facilitate the development of generalist models with high\nsegmentation quality. To densely represent diverse visual prompts, we propose\nto use a dense map to capture five types: clicks, boxes, polygons, scribbles,\nand masks. 
Thus, we propose SegNext, a next-generation interactive segmentation\napproach offering low latency, high quality, and diverse prompt support. Our\nmethod outperforms current state-of-the-art methods on HQSeg-44K and DAVIS,\nboth quantitatively and qualitatively.\n","authors":["Qin Liu","Jaemin Cho","Mohit Bansal","Marc Niethammer"],"pdf_url":"https://arxiv.org/pdf/2404.00741v1.pdf","comment":"CVPR 2024 https://github.com/uncbiag/SegNext"},{"id":"http://arxiv.org/abs/2404.00726v1","updated":"2024-03-31T15:56:41Z","published":"2024-03-31T15:56:41Z","title":"MugenNet: A Novel Combined Convolution Neural Network and Transformer\n Network with its Application for Colonic Polyp Image Segmentation","summary":" Biomedical image segmentation is a very important part of disease diagnosis.\nThe term \"colonic polyps\" refers to polypoid lesions that occur on the surface\nof the colonic mucosa within the intestinal lumen. In clinical practice, early\ndetection of polyps is conducted through colonoscopy examinations and\nbiomedical image processing. Therefore, accurate polyp image segmentation\nis of great significance in colonoscopy examinations. Convolutional Neural\nNetwork (CNN) is a common automatic segmentation method, but its main\ndisadvantage is the long training time. Transformer utilizes a self-attention\nmechanism, which essentially assigns different importance weights to each piece\nof information, thus achieving high computational efficiency during\nsegmentation. However, a potential drawback is the risk of information loss. In\nthe study reported in this paper, based on the well-known hybridization\nprinciple, we proposed a method to combine CNN and Transformer to retain the\nstrengths of both, and we applied this method to build a system called MugenNet\nfor colonic polyp image segmentation. We conducted a comprehensive experiment\nto compare MugenNet with other CNN models on five publicly available datasets.\nThe ablation experiment on MugenNet was conducted as well. The experimental\nresults show that MugenNet achieves significantly higher processing speed and\naccuracy compared with CNN alone. The generalized implication of our work is\na method to optimally combine two complementary methods of machine learning.\n","authors":["Chen Peng","Zhiqin Qian","Kunyu Wang","Qi Luo","Zhuming Bi","Wenjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.00726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00724v1","updated":"2024-03-31T15:50:52Z","published":"2024-03-31T15:50:52Z","title":"Absolute-Unified Multi-Class Anomaly Detection via Class-Agnostic\n Distribution Alignment","summary":" Conventional unsupervised anomaly detection (UAD) methods build separate\nmodels for each object category. Recent studies have proposed to train a\nunified model for multiple classes, namely model-unified UAD. However, such\nmethods still implement the unified model separately on each class during\ninference with respective anomaly decision thresholds, which hinders their\napplication when the image categories are entirely unavailable. In this work,\nwe present a simple yet powerful method to address multi-class anomaly\ndetection without any class information, namely \\textit{absolute-unified} UAD.\nWe target the crux of prior works in this challenging setting: different\nobjects have mismatched anomaly score distributions. 
We propose Class-Agnostic\nDistribution Alignment (CADA) to align the mismatched score distribution of\neach implicit class without knowing class information, which enables unified\nanomaly detection for all classes and samples. The essence of CADA is to\npredict each class's score distribution of normal samples given any image,\nnormal or anomalous, of this class. As a general component, CADA can activate\nthe potential of nearly all UAD methods under absolute-unified setting. Our\napproach is extensively evaluated under the proposed setting on two popular UAD\nbenchmark datasets, MVTec AD and VisA, where we exceed previous\nstate-of-the-art by a large margin.\n","authors":["Jia Guo","Shuai Lu","Weihang Zhang","Huiqi Li"],"pdf_url":"https://arxiv.org/pdf/2404.00724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00717v1","updated":"2024-03-31T15:22:11Z","published":"2024-03-31T15:22:11Z","title":"End-to-End Autonomous Driving through V2X Cooperation","summary":" Cooperatively utilizing both ego-vehicle and infrastructure sensor data via\nV2X communication has emerged as a promising approach for advanced autonomous\ndriving. However, current research mainly focuses on improving individual\nmodules, rather than taking end-to-end learning to optimize final planning\nperformance, resulting in underutilized data potential. In this paper, we\nintroduce UniV2X, a pioneering cooperative autonomous driving framework that\nseamlessly integrates all key driving modules across diverse views into a\nunified network. We propose a sparse-dense hybrid data transmission and fusion\nmechanism for effective vehicle-infrastructure cooperation, offering three\nadvantages: 1) Effective for simultaneously enhancing agent perception, online\nmapping, and occupancy prediction, ultimately improving planning performance.\n2) Transmission-friendly for practical and limited communication conditions. 3)\nReliable data fusion with interpretability of this hybrid data. We implement\nUniV2X, as well as reproducing several benchmark methods, on the challenging\nDAIR-V2X, the real-world cooperative driving dataset. Experimental results\ndemonstrate the effectiveness of UniV2X in significantly enhancing planning\nperformance, as well as all intermediate output performance. Code is at\nhttps://github.com/AIR-THU/UniV2X.\n","authors":["Haibao Yu","Wenxian Yang","Jiaru Zhong","Zhenwei Yang","Siqi Fan","Ping Luo","Zaiqing Nie"],"pdf_url":"https://arxiv.org/pdf/2404.00717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00714v1","updated":"2024-03-31T15:18:38Z","published":"2024-03-31T15:18:38Z","title":"Neural Radiance Field-based Visual Rendering: A Comprehensive Review","summary":" In recent years, Neural Radiance Fields (NeRF) has made remarkable progress\nin the field of computer vision and graphics, providing strong technical\nsupport for solving key tasks including 3D scene understanding, new perspective\nsynthesis, human body reconstruction, robotics, and so on, the attention of\nacademics to this research result is growing. As a revolutionary neural\nimplicit field representation, NeRF has caused a continuous research boom in\nthe academic community. Therefore, the purpose of this review is to provide an\nin-depth analysis of the research literature on NeRF within the past two years,\nto provide a comprehensive academic perspective for budding researchers. 
In\nthis paper, the core architecture of NeRF is first elaborated in detail,\nfollowed by a discussion of various improvement strategies for NeRF, and case\nstudies of NeRF in diverse application scenarios, demonstrating its practical\nutility in different domains. In terms of datasets and evaluation metrics, this\npaper details the key resources needed for NeRF model training. Finally, this\npaper provides a prospective discussion on the future development trends and\npotential challenges of NeRF, aiming to provide research inspiration for\nresearchers in the field and to promote the further development of related\ntechnologies.\n","authors":["Mingyuan Yao","Yukang Huo","Yang Ran","Qingbin Tian","Ruifeng Wang","Haihua Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00714v1.pdf","comment":"35 pages, 22 figures, 14 tables, 18 formulas"},{"id":"http://arxiv.org/abs/2404.00710v1","updated":"2024-03-31T15:03:31Z","published":"2024-03-31T15:03:31Z","title":"Unknown Prompt, the only Lacuna: Unveiling CLIP's Potential for Open\n Domain Generalization","summary":" We delve into Open Domain Generalization (ODG), marked by domain and category\nshifts between training's labeled source and testing's unlabeled target\ndomains. Existing solutions to ODG face limitations due to constrained\ngeneralizations of traditional CNN backbones and errors in detecting target\nopen samples in the absence of prior knowledge. Addressing these pitfalls, we\nintroduce ODG-CLIP, harnessing the semantic prowess of the vision-language\nmodel, CLIP. Our framework brings forth three primary innovations: Firstly,\ndistinct from prevailing paradigms, we conceptualize ODG as a multi-class\nclassification challenge encompassing both known and novel categories. Central\nto our approach is modeling a unique prompt tailored for detecting unknown\nclass samples, and to train this, we employ a readily accessible stable\ndiffusion model, elegantly generating proxy images for the open class.\nSecondly, aiming for domain-tailored classification (prompt) weights while\nensuring a balance of precision and simplicity, we devise a novel visual\nstyle-centric prompt learning mechanism. Finally, we infuse images with\nclass-discriminative knowledge derived from the prompt space to augment the\nfidelity of CLIP's visual embeddings. We introduce a novel objective to\nsafeguard the continuity of this infused semantic intel across domains,\nespecially for the shared classes. Through rigorous testing on diverse\ndatasets, covering closed and open-set DG contexts, ODG-CLIP demonstrates clear\nsupremacy, consistently outpacing peers with performance boosts between 8% and 16%.\nCode will be available at https://github.com/mainaksingha01/ODG-CLIP.\n","authors":["Mainak Singha","Ankit Jha","Shirsha Bose","Ashwin Nair","Moloud Abdar","Biplab Banerjee"],"pdf_url":"https://arxiv.org/pdf/2404.00710v1.pdf","comment":"Accepted in CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00701v1","updated":"2024-03-31T14:37:25Z","published":"2024-03-31T14:37:25Z","title":"Training-Free Semantic Segmentation via LLM-Supervision","summary":" Recent advancements in open vocabulary models, like CLIP, have notably\nadvanced zero-shot classification and segmentation by utilizing natural\nlanguage for class-specific embeddings. However, most research has focused on\nimproving model accuracy through prompt engineering, prompt learning, or\nfine-tuning with limited labeled data, thereby overlooking the importance of\nrefining the class descriptors. 
This paper introduces a new approach to\ntext-supervised semantic segmentation using supervision by a large language\nmodel (LLM) that does not require extra training. Our method starts from an\nLLM, like GPT-3, to generate a detailed set of subclasses for more accurate\nclass representation. We then employ an advanced text-supervised semantic\nsegmentation model to apply the generated subclasses as target labels,\nresulting in diverse segmentation results tailored to each subclass's unique\ncharacteristics. Additionally, we propose an assembly that merges the\nsegmentation maps from the various subclass descriptors to ensure a more\ncomprehensive representation of the different aspects in the test images.\nThrough comprehensive experiments on three standard benchmarks, our method\noutperforms traditional text-supervised semantic segmentation methods by a\nmarked margin.\n","authors":["Wenfang Sun","Yingjun Du","Gaowen Liu","Ramana Kompella","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2404.00701v1.pdf","comment":"22 pages,10 figures, conference"},{"id":"http://arxiv.org/abs/2404.00694v1","updated":"2024-03-31T14:04:57Z","published":"2024-03-31T14:04:57Z","title":"DMSSN: Distilled Mixed Spectral-Spatial Network for Hyperspectral\n Salient Object Detection","summary":" Hyperspectral salient object detection (HSOD) has exhibited remarkable\npromise across various applications, particularly in intricate scenarios where\nconventional RGB-based approaches fall short. Despite the considerable progress\nin HSOD method advancements, two critical challenges require immediate\nattention. Firstly, existing hyperspectral data dimension reduction techniques\nincur a loss of spectral information, which adversely affects detection\naccuracy. Secondly, previous methods insufficiently harness the inherent\ndistinctive attributes of hyperspectral images (HSIs) during the feature\nextraction process. To address these challenges, we propose a novel approach\ntermed the Distilled Mixed Spectral-Spatial Network (DMSSN), comprising a\nDistilled Spectral Encoding process and a Mixed Spectral-Spatial Transformer\n(MSST) feature extraction network. The encoding process utilizes knowledge\ndistillation to construct a lightweight autoencoder for dimension reduction,\nstriking a balance between robust encoding capabilities and low computational\ncosts. The MSST extracts spectral-spatial features through multiple attention\nhead groups, collaboratively enhancing its resistance to intricate scenarios.\nMoreover, we have created a large-scale HSOD dataset, HSOD-BIT, to tackle the\nissue of data scarcity in this field and meet the fundamental data requirements\nof deep network training. Extensive experiments demonstrate that our proposed\nDMSSN achieves state-of-the-art performance on multiple datasets. We will soon\nmake the code and dataset publicly available on\nhttps://github.com/anonymous0519/HSOD-BIT.\n","authors":["Haolin Qin","Tingfa Xu","Peifu Liu","Jingxuan Xu","Jianan Li"],"pdf_url":"https://arxiv.org/pdf/2404.00694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00680v1","updated":"2024-03-31T13:12:41Z","published":"2024-03-31T13:12:41Z","title":"Learning to Rank Patches for Unbiased Image Redundancy Reduction","summary":" Images suffer from heavy spatial redundancy because pixels in neighboring\nregions are spatially correlated. Existing approaches strive to overcome this\nlimitation by reducing less meaningful image regions. However, current leading\nmethods rely on supervisory signals. 
They may compel models to preserve content\nthat aligns with labeled categories and discard content belonging to unlabeled\ncategories. This categorical inductive bias makes these methods less effective\nin real-world scenarios. To address this issue, we propose a self-supervised\nframework for image redundancy reduction called Learning to Rank Patches\n(LTRP). We observe that image reconstruction of masked image modeling models is\nsensitive to the removal of visible patches when the masking ratio is high\n(e.g., 90\\%). Building upon it, we implement LTRP via two steps: inferring the\nsemantic density score of each patch by quantifying variation between\nreconstructions with and without this patch, and learning to rank the patches\nwith the pseudo score. The entire process is self-supervised, thus getting out\nof the dilemma of categorical inductive bias. We design extensive experiments\non different datasets and tasks. The results demonstrate that LTRP outperforms\nboth supervised and other self-supervised methods due to the fair assessment of\nimage content.\n","authors":["Yang Luo","Zhineng Chen","Peng Zhou","Zuxuan Wu","Xieping Gao","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.00680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00679v1","updated":"2024-03-31T13:09:06Z","published":"2024-03-31T13:09:06Z","title":"Weak-to-Strong 3D Object Detection with X-Ray Distillation","summary":" This paper addresses the critical challenges of sparsity and occlusion in\nLiDAR-based 3D object detection. Current methods often rely on supplementary\nmodules or specific architectural designs, potentially limiting their\napplicability to new and evolving architectures. To our knowledge, we are the\nfirst to propose a versatile technique that seamlessly integrates into any\nexisting framework for 3D Object Detection, marking the first instance of\nWeak-to-Strong generalization in 3D computer vision. We introduce a novel\nframework, X-Ray Distillation with Object-Complete Frames, suitable for both\nsupervised and semi-supervised settings, that leverages the temporal aspect of\npoint cloud sequences. This method extracts crucial information from both\nprevious and subsequent LiDAR frames, creating Object-Complete frames that\nrepresent objects from multiple viewpoints, thus addressing occlusion and\nsparsity. Given the limitation of not being able to generate Object-Complete\nframes during online inference, we utilize Knowledge Distillation within a\nTeacher-Student framework. This technique encourages the strong Student model\nto emulate the behavior of the weaker Teacher, which processes simple and\ninformative Object-Complete frames, effectively offering a comprehensive view\nof objects as if seen through X-ray vision. Our proposed methods surpass\nstate-of-the-art in semi-supervised learning by 1-1.5 mAP and enhance the\nperformance of five established supervised models by 1-2 mAP on standard\nautonomous driving datasets, even with default hyperparameters. 
Code for\nObject-Complete frames is available here:\nhttps://github.com/sakharok13/X-Ray-Teacher-Patching-Tools.\n","authors":["Alexander Gambashidze","Aleksandr Dadukin","Maksim Golyadkin","Maria Razzhivina","Ilya Makarov"],"pdf_url":"https://arxiv.org/pdf/2404.00679v1.pdf","comment":"Computer Vision and Pattern Recognition 2024"},{"id":"http://arxiv.org/abs/2404.00678v1","updated":"2024-03-31T13:07:00Z","published":"2024-03-31T13:07:00Z","title":"OmniSDF: Scene Reconstruction using Omnidirectional Signed Distance\n Functions and Adaptive Binoctrees","summary":" We present a method to reconstruct indoor and outdoor static scene geometry\nand appearance from an omnidirectional video moving in a small circular sweep.\nThis setting is challenging because of the small baseline and large depth\nranges, making it difficult to find ray crossings. To better constrain the\noptimization, we estimate geometry as a signed distance field within a\nspherical binoctree data structure and use a complementary efficient tree\ntraversal strategy based on a breadth-first search for sampling. Unlike regular\ngrids or trees, the shape of this structure well-matches the camera setting,\ncreating a better memory-quality trade-off. From an initial depth estimate, the\nbinoctree is adaptively subdivided throughout the optimization; previous\nmethods use a fixed depth that leaves the scene undersampled. In comparison\nwith three neural optimization methods and two non-neural methods, ours shows\ndecreased geometry error on average, especially in a detailed scene, while\nsignificantly reducing the required number of voxels to represent such details.\n","authors":["Hakyeong Kim","Andreas Meuleman","Hyeonjoong Jang","James Tompkin","Min H. Kim"],"pdf_url":"https://arxiv.org/pdf/2404.00678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00676v1","updated":"2024-03-31T12:55:05Z","published":"2024-03-31T12:55:05Z","title":"OmniLocalRF: Omnidirectional Local Radiance Fields from Dynamic Videos","summary":" Omnidirectional cameras are extensively used in various applications to\nprovide a wide field of vision. However, they face a challenge in synthesizing\nnovel views due to the inevitable presence of dynamic objects, including the\nphotographer, in their wide field of view. In this paper, we introduce a new\napproach called Omnidirectional Local Radiance Fields (OmniLocalRF) that can\nrender static-only scene views, removing and inpainting dynamic objects\nsimultaneously. Our approach combines the principles of local radiance fields\nwith the bidirectional optimization of omnidirectional rays. Our input is an\nomnidirectional video, and we evaluate the mutual observations of the entire\nangle between the previous and current frames. To reduce ghosting artifacts of\ndynamic objects and inpaint occlusions, we devise a multi-resolution motion\nmask prediction module. Unlike existing methods that primarily separate dynamic\ncomponents through the temporal domain, our method uses multi-resolution neural\nfeature planes for precise segmentation, which is more suitable for long\n360-degree videos. Our experiments validate that OmniLocalRF outperforms\nexisting methods in both qualitative and quantitative metrics, especially in\nscenarios with complex real-world scenes. In particular, our approach\neliminates the need for manual interaction, such as drawing motion masks by\nhand and additional pose estimation, making it a highly effective and efficient\nsolution.\n","authors":["Dongyoung Choi","Hyeonjoong Jang","Min H. 
Kim"],"pdf_url":"https://arxiv.org/pdf/2404.00676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00674v1","updated":"2024-03-31T12:45:23Z","published":"2024-03-31T12:45:23Z","title":"Knowledge NeRF: Few-shot Novel View Synthesis for Dynamic Articulated\n Objects","summary":" We present Knowledge NeRF to synthesize novel views for dynamic\nscenes. Reconstructing dynamic 3D scenes from few sparse views and rendering\nthem from arbitrary perspectives is a challenging problem with applications in\nvarious domains. Previous dynamic NeRF methods learn the deformation of\narticulated objects from monocular videos. However, the quality of their\nreconstructed scenes is limited. To clearly reconstruct dynamic scenes, we\npropose a new framework by considering two frames at a time. We pretrain a NeRF\nmodel for an articulated object. When an articulated object moves, Knowledge NeRF\nlearns to generate novel views at the new state by incorporating past knowledge\nin the pretrained NeRF model with minimal observations in the present state. We\npropose a projection module to adapt NeRF for dynamic scenes, learning the\ncorrespondence between the pretrained knowledge base and current states.\nExperimental results demonstrate the effectiveness of our method in\nreconstructing dynamic 3D scenes with 5 input images in one state. Knowledge\nNeRF is a new pipeline and a promising solution for novel view synthesis for\ndynamic articulated objects. The data and implementation are publicly available\nat https://github.com/RussRobin/Knowledge_NeRF.\n","authors":["Wenxiao Cai","Xinyue Lei","Xinyu He","Junming Leo Chen","Yangang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00672v1","updated":"2024-03-31T12:44:24Z","published":"2024-03-31T12:44:24Z","title":"A General and Efficient Training for Transformer via Token Expansion","summary":" The remarkable performance of Vision Transformers (ViTs) typically requires\nan extremely large training cost. Existing methods have attempted to accelerate\nthe training of ViTs, yet they typically disregard method universality, with accuracy\ndropping. Meanwhile, they break the training consistency of the original\ntransformers, including the consistency of hyper-parameters, architecture, and\nstrategy, which prevents them from being widely applied to different\nTransformer networks. In this paper, we propose a novel token growth scheme\nToken Expansion (termed ToE) to achieve consistent training acceleration for\nViTs. We introduce an \"initialization-expansion-merging\" pipeline to maintain\nthe integrity of the intermediate feature distribution of original\ntransformers, preventing the loss of crucial learnable information in the\ntraining process. ToE can not only be seamlessly integrated into the training\nand fine-tuning process of transformers (e.g., DeiT and LV-ViT), but is also\neffective for efficient training frameworks (e.g., EfficientTrain), without\ntwisting the original training hyper-parameters or architecture, or introducing\nadditional training strategies. Extensive experiments demonstrate that ToE\nachieves about 1.3x faster training of ViTs in a lossless manner, or\neven with performance gains over the full-token training baselines. Code is\navailable at https://github.com/Osilly/TokenExpansion .\n","authors":["Wenxuan Huang","Yunhang Shen","Jiao Xie","Baochang Zhang","Gaoqi He","Ke Li","Xing Sun","Shaohui Lin"],"pdf_url":"https://arxiv.org/pdf/2404.00672v1.pdf","comment":"Accepted to CVPR 2024. 
Code is available at\n https://github.com/Osilly/TokenExpansion"},{"id":"http://arxiv.org/abs/2404.00670v1","updated":"2024-03-31T12:35:23Z","published":"2024-03-31T12:35:23Z","title":"Statistical Analysis by Semiparametric Additive Regression and LSTM-FCN\n Based Hierarchical Classification for Computer Vision Quantification of\n Parkinsonian Bradykinesia","summary":" Bradykinesia, characterized by involuntary slowing or decrement of movement,\nis a fundamental symptom of Parkinson's Disease (PD) and is vital for its\nclinical diagnosis. Despite various methodologies explored to quantify\nbradykinesia, computer vision-based approaches have shown promising results.\nHowever, these methods often fall short in adequately addressing key\nbradykinesia characteristics in repetitive limb movements: \"occasional arrest\"\nand \"decrement in amplitude.\"\n This research advances vision-based quantification of bradykinesia by\nintroducing nuanced numerical analysis to capture decrement in amplitudes and\nemploying a simple deep learning technique, LSTM-FCN, for precise\nclassification of occasional arrests. Our approach structures the\nclassification process hierarchically, tailoring it to the unique dynamics of\nbradykinesia in PD.\n Statistical analysis of the extracted features, including those representing\narrest and fatigue, has demonstrated their statistical significance in most\ncases. This finding underscores the importance of considering \"occasional\narrest\" and \"decrement in amplitude\" in bradykinesia quantification of limb\nmovement. Our enhanced diagnostic tool has been rigorously tested on an\nextensive dataset comprising 1396 motion videos from 310 PD patients, achieving\nan accuracy of 80.3%. The results confirm the robustness and reliability of our\nmethod.\n","authors":["Youngseo Cho","In Hee Kwak","Dohyeon Kim","Jinhee Na","Hanjoo Sung","Jeongjae Lee","Young Eun Kim","Hyeo-il Ma"],"pdf_url":"https://arxiv.org/pdf/2404.00670v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00667v1","updated":"2024-03-31T12:22:23Z","published":"2024-03-31T12:22:23Z","title":"Weakly-Supervised Cross-Domain Segmentation of Electron Microscopy with\n Sparse Point Annotation","summary":" Accurate segmentation of organelle instances from electron microscopy (EM)\nimages plays an essential role in many neuroscience researches. However,\npractical scenarios usually suffer from high annotation costs, label scarcity,\nand large domain diversity. While unsupervised domain adaptation (UDA) that\nassumes no annotation effort on the target data is promising to alleviate these\nchallenges, its performance on complicated segmentation tasks is still far from\npractical usage. To address these issues, we investigate a highly\nannotation-efficient weak supervision, which assumes only sparse center-points\non a small subset of object instances in the target training images. To achieve\naccurate segmentation with partial point annotations, we introduce instance\ncounting and center detection as auxiliary tasks and design a multitask\nlearning framework to leverage correlations among the counting, detection, and\nsegmentation, which are all tasks with partial or no supervision. Building upon\nthe different domain-invariances of the three tasks, we enforce counting\nestimation with a novel soft consistency loss as a global prior for center\ndetection, which further guides the per-pixel segmentation. 
To further\ncompensate for annotation sparsity, we develop a cross-position cut-and-paste\nfor label augmentation and an entropy-based pseudo-label selection. The\nexperimental results highlight that, by simply using extremely weak annotation,\ne.g., 15\\% sparse points, for model training, the proposed model is capable of\nsignificantly outperforming UDA methods and produces comparable performance as\nthe supervised counterpart. The high robustness of our model shown in the\nvalidations and the low requirement of expert knowledge for sparse point\nannotation further improve the potential application value of our model.\n","authors":["Dafei Qiu","Shan Xiong","Jiajin Yi","Jialin Peng"],"pdf_url":"https://arxiv.org/pdf/2404.00667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00661v1","updated":"2024-03-31T12:07:04Z","published":"2024-03-31T12:07:04Z","title":"DeeDSR: Towards Real-World Image Super-Resolution via Degradation-Aware\n Stable Diffusion","summary":" Diffusion models, known for their powerful generative capabilities, play a\ncrucial role in addressing real-world super-resolution challenges. However,\nthese models often focus on improving local textures while neglecting the\nimpacts of global degradation, which can significantly reduce semantic fidelity\nand lead to inaccurate reconstructions and suboptimal super-resolution\nperformance. To address this issue, we introduce a novel two-stage,\ndegradation-aware framework that enhances the diffusion model's ability to\nrecognize content and degradation in low-resolution images. In the first stage,\nwe employ unsupervised contrastive learning to obtain representations of image\ndegradations. In the second stage, we integrate a degradation-aware module into\na simplified ControlNet, enabling flexible adaptation to various degradations\nbased on the learned representations. Furthermore, we decompose the\ndegradation-aware features into global semantics and local details branches,\nwhich are then injected into the diffusion denoising module to modulate the\ntarget generation. Our method effectively recovers semantically precise and\nphotorealistic details, particularly under significant degradation conditions,\ndemonstrating state-of-the-art performance across various benchmarks. Codes\nwill be released at https://github.com/bichunyang419/DeeDSR.\n","authors":["Chunyang Bi","Xin Luo","Sheng Shen","Mengxi Zhang","Huanjing Yue","Jingyu Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00653v1","updated":"2024-03-31T11:43:39Z","published":"2024-03-31T11:43:39Z","title":"Dual DETRs for Multi-Label Temporal Action Detection","summary":" Temporal Action Detection (TAD) aims to identify the action boundaries and\nthe corresponding category within untrimmed videos. Inspired by the success of\nDETR in object detection, several methods have adapted the query-based\nframework to the TAD task. However, these approaches primarily followed DETR to\npredict actions at the instance level (i.e., identify each action by its center\npoint), leading to sub-optimal boundary localization. To address this issue, we\npropose a new Dual-level query-based TAD framework, namely DualDETR, to detect\nactions from both instance-level and boundary-level. Decoding at different\nlevels requires semantics of different granularity, therefore we introduce a\ntwo-branch decoding structure. 
This structure builds distinctive decoding\nprocesses for different levels, facilitating explicit capture of temporal cues\nand semantics at each level. On top of the two-branch design, we present a\njoint query initialization strategy to align queries from both levels.\nSpecifically, we leverage encoder proposals to match queries from each level in\na one-to-one manner. Then, the matched queries are initialized using position\nand content prior from the matched action proposal. The aligned dual-level\nqueries can refine the matched proposal with complementary cues during\nsubsequent decoding. We evaluate DualDETR on three challenging multi-label TAD\nbenchmarks. The experimental results demonstrate the superior performance of\nDualDETR to the existing state-of-the-art methods, achieving a substantial\nimprovement under det-mAP and delivering impressive results under seg-mAP.\n","authors":["Yuhan Zhu","Guozhen Zhang","Jing Tan","Gangshan Wu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00653v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00650v1","updated":"2024-03-31T11:37:43Z","published":"2024-03-31T11:37:43Z","title":"Deep Instruction Tuning for Segment Anything Model","summary":" Segment Anything Model (SAM) exhibits powerful yet versatile capabilities on\n(un) conditional image segmentation tasks recently. Although SAM can support\nvarious segmentation prompts, we note that, compared to point- and box-guided\nsegmentation, it performs much worse on text-instructed tasks. We argue that\ndeep text instruction tuning is key to mitigate such shortcoming caused by the\nshallow fusion scheme in its default light-weight mask decoder. In this paper,\ntwo \\emph{deep instruction tuning} (DIT) methods are proposed, one is\nend-to-end and the other is layer-wise. With these tuning methods, we can\nregard the image encoder of SAM as a stand-alone vision-language learner in\ncontrast to building another deep fusion branch. Extensive experiments on three\nhighly competitive benchmark datasets of referring image segmentation show that\na simple end-to-end DIT improves SAM by a large margin, with layer-wise DIT\nfurther boosts the performance to state-of-the-art. Our code is anonymously\nreleased at: https://github.com/wysnzzzz/DIT.\n","authors":["Xiaorui Huang","Gen Luo","Chaoyang Zhu","Bo Tong","Yiyi Zhou","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2404.00650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00648v1","updated":"2024-03-31T11:33:39Z","published":"2024-03-31T11:33:39Z","title":"SpiralMLP: A Lightweight Vision MLP Architecture","summary":" We present SpiralMLP, a novel architecture that introduces a Spiral FC layer\nas a replacement for the conventional Token Mixing approach. Differing from\nseveral existing MLP-based models that primarily emphasize axes, our Spiral FC\nlayer is designed as a deformable convolution layer with spiral-like offsets.\nWe further adapt Spiral FC into two variants: Self-Spiral FC and Cross-Spiral\nFC, which enable both local and global feature integration seamlessly,\neliminating the need for additional processing steps. To thoroughly investigate\nthe effectiveness of the spiral-like offsets and validate our design, we\nconduct ablation studies and explore optimal configurations. In empirical\ntests, SpiralMLP reaches state-of-the-art performance, similar to Transformers,\nCNNs, and other MLPs, benchmarking on ImageNet-1k, COCO and ADE20K. 
SpiralMLP\nstill maintains linear computational complexity O(HW) and is compatible with\nvarying input image resolutions. Our study reveals that targeting the full\nreceptive field is not essential for achieving high performance; instead,\nadopting a refined approach offers better results.\n","authors":["Haojie Mu","Burhan Ul Tayyab","Nicholas Chua"],"pdf_url":"https://arxiv.org/pdf/2404.00648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00645v1","updated":"2024-03-31T11:09:19Z","published":"2024-03-31T11:09:19Z","title":"Attire-Based Anomaly Detection in Restricted Areas Using YOLOv8 for\n Enhanced CCTV Security","summary":" This research introduces an innovative security enhancement approach,\nemploying advanced image analysis and soft computing. The focus is on an\nintelligent surveillance system that detects unauthorized individuals in\nrestricted areas by analyzing attire. Traditional security measures face\nchallenges in monitoring unauthorized access. Leveraging YOLOv8, an advanced\nobject detection algorithm, our system identifies authorized personnel based on\ntheir attire in CCTV footage. The methodology involves training the YOLOv8\nmodel on a comprehensive dataset of uniform patterns, ensuring precise\nrecognition in specific regions. Soft computing techniques enhance adaptability\nto dynamic environments and varying lighting conditions. This research\ncontributes to image analysis and soft computing, providing a sophisticated\nsecurity solution. Emphasizing uniform-based anomaly detection, it establishes\na foundation for robust security systems in restricted areas. The outcomes\nhighlight the potential of YOLOv8-based surveillance in ensuring safety in\nsensitive locations.\n","authors":["Abdul Aziz A. B","Aindri Bajpai"],"pdf_url":"https://arxiv.org/pdf/2404.00645v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.00633v1","updated":"2024-03-31T10:01:20Z","published":"2024-03-31T10:01:20Z","title":"IPT-V2: Efficient Image Processing Transformer using Hierarchical\n Attentions","summary":" Recent advances have demonstrated the powerful capability of transformer\narchitecture in image restoration. However, our analysis indicates that\nexisting transformer-based methods cannot establish both exact global and local\ndependencies simultaneously, which are critical for restoring the details and\nmissing content of degraded images. To this end, we present an efficient image\nprocessing transformer architecture with hierarchical attentions, called IPT-V2,\nadopting a focal context self-attention (FCSA) and a global grid self-attention\n(GGSA) to obtain adequate token interactions in local and global receptive\nfields. Specifically, FCSA applies the shifted window mechanism to the\nchannel self-attention, helping capture the local context and mutual interaction\nacross channels. GGSA constructs long-range dependencies in the\ncross-window grid, aggregating global information in the spatial dimension.\nMoreover, we introduce a structural re-parameterization technique into the feed-forward\nnetwork to further improve the model capability. Extensive experiments\ndemonstrate that our proposed IPT-V2 achieves state-of-the-art results on\nvarious image processing tasks, covering denoising, deblurring, and deraining, and\nobtains a much better trade-off between performance and computational complexity than\nprevious methods. 
Besides, we extend our method to image generation as latent\ndiffusion backbone, and significantly outperforms DiTs.\n","authors":["Zhijun Tu","Kunpeng Du","Hanting Chen","Hailing Wang","Wei Li","Jie Hu","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00626v1","updated":"2024-03-31T09:32:31Z","published":"2024-03-31T09:32:31Z","title":"Domain Generalizable Person Search Using Unreal Dataset","summary":" Collecting and labeling real datasets to train the person search networks not\nonly requires a lot of time and effort, but also accompanies privacy issues.\nThe weakly-supervised and unsupervised domain adaptation methods have been\nproposed to alleviate the labeling burden for target datasets, however, their\ngeneralization capability is limited. We introduce a novel person search method\nbased on the domain generalization framework, that uses an automatically\nlabeled unreal dataset only for training but is applicable to arbitrary unseen\nreal datasets. To alleviate the domain gaps when transferring the knowledge\nfrom the unreal source dataset to the real target datasets, we estimate the\nfidelity of person instances which is then used to train the end-to-end network\nadaptively. Moreover, we devise a domain-invariant feature learning scheme to\nencourage the network to suppress the domain-related features. Experimental\nresults demonstrate that the proposed method provides the competitive\nperformance to existing person search methods even though it is applicable to\narbitrary unseen datasets without any prior knowledge and re-training burdens.\n","authors":["Minyoung Oh","Duhyun Kim","Jae-Young Sim"],"pdf_url":"https://arxiv.org/pdf/2404.00626v1.pdf","comment":"AAAI2024 accepted"},{"id":"http://arxiv.org/abs/2404.00618v1","updated":"2024-03-31T09:10:32Z","published":"2024-03-31T09:10:32Z","title":"A Multi-Branched Radial Basis Network Approach to Predicting Complex\n Chaotic Behaviours","summary":" In this study, we propose a multi branched network approach to predict the\ndynamics of a physics attractor characterized by intricate and chaotic\nbehavior. We introduce a unique neural network architecture comprised of Radial\nBasis Function (RBF) layers combined with an attention mechanism designed to\neffectively capture nonlinear inter-dependencies inherent in the attractor's\ntemporal evolution. Our results demonstrate successful prediction of the\nattractor's trajectory across 100 predictions made using a real-world dataset\nof 36,700 time-series observations encompassing approximately 28 minutes of\nactivity. To further illustrate the performance of our proposed technique, we\nprovide comprehensive visualizations depicting the attractor's original and\npredicted behaviors alongside quantitative measures comparing observed versus\nestimated outcomes. 
Overall, this work showcases the potential of advanced\nmachine learning algorithms in elucidating hidden structures in complex\nphysical systems while offering practical applications in various domains\nrequiring accurate short-term forecasting capabilities.\n","authors":["Aarush Sinha"],"pdf_url":"https://arxiv.org/pdf/2404.00618v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.00611v1","updated":"2024-03-31T09:01:17Z","published":"2024-03-31T09:01:17Z","title":"Object-level Copy-Move Forgery Image Detection based on Inconsistency\n Mining","summary":" In copy-move tampering operations, perpetrators often employ techniques, such\nas blurring, to conceal tampering traces, posing significant challenges to the\ndetection of object-level targets with intact structures. Focus on these\nchallenges, this paper proposes an Object-level Copy-Move Forgery Image\nDetection based on Inconsistency Mining (IMNet). To obtain complete\nobject-level targets, we customize prototypes for both the source and tampered\nregions and dynamically update them. Additionally, we extract inconsistent\nregions between coarse similar regions obtained through self-correlation\ncalculations and regions composed of prototypes. The detected inconsistent\nregions are used as supplements to coarse similar regions to refine pixel-level\ndetection. We operate experiments on three public datasets which validate the\neffectiveness and the robustness of the proposed IMNet.\n","authors":["Jingyu Wang","Niantai Jing","Ziyao Liu","Jie Nie","Yuxin Qi","Chi-Hung Chi","Kwok-Yan Lam"],"pdf_url":"https://arxiv.org/pdf/2404.00611v1.pdf","comment":"4 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.00603v1","updated":"2024-03-31T08:28:42Z","published":"2024-03-31T08:28:42Z","title":"Weak Distribution Detectors Lead to Stronger Generalizability of\n Vision-Language Prompt Tuning","summary":" We propose a generalized method for boosting the generalization ability of\npre-trained vision-language models (VLMs) while fine-tuning on downstream\nfew-shot tasks. The idea is realized by exploiting out-of-distribution (OOD)\ndetection to predict whether a sample belongs to a base distribution or a novel\ndistribution and then using the score generated by a dedicated competition\nbased scoring function to fuse the zero-shot and few-shot classifier. The fused\nclassifier is dynamic, which will bias towards the zero-shot classifier if a\nsample is more likely from the distribution pre-trained on, leading to improved\nbase-to-novel generalization ability. Our method is performed only in test\nstage, which is applicable to boost existing methods without time-consuming\nre-training. Extensive experiments show that even weak distribution detectors\ncan still improve VLMs' generalization ability. Specifically, with the help of\nOOD detectors, the harmonic mean of CoOp and ProGrad increase by 2.6 and 1.5\npercentage points over 11 recognition datasets in the base-to-novel setting.\n","authors":["Kun Ding","Haojian Zhang","Qiang Yu","Ying Wang","Shiming Xiang","Chunhong Pan"],"pdf_url":"https://arxiv.org/pdf/2404.00603v1.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2404.00597v1","updated":"2024-03-31T08:08:05Z","published":"2024-03-31T08:08:05Z","title":"Parameter and Data-Efficient Spectral StyleDCGAN","summary":" We present a simple, highly parameter, and data-efficient adversarial network\nfor unconditional face generation. 
Our method: Spectral Style-DCGAN or SSD\nutilizes only 6.574 million parameters and 4739 dog faces from the Animal Faces\nHQ (AFHQ) dataset as training samples while preserving fidelity at low\nresolutions up to 64x64. Code available at\nhttps://github.com/Aryan-Garg/StyleDCGAN.\n","authors":["Aryan Garg"],"pdf_url":"https://arxiv.org/pdf/2404.00597v1.pdf","comment":"Notable ICLR Tiny Paper 2024"},{"id":"http://arxiv.org/abs/2404.00593v1","updated":"2024-03-31T07:56:07Z","published":"2024-03-31T07:56:07Z","title":"LAESI: Leaf Area Estimation with Synthetic Imagery","summary":" We introduce LAESI, a Synthetic Leaf Dataset of 100,000 synthetic leaf images\non millimeter paper, each with semantic masks and surface area labels. This\ndataset provides a resource for leaf morphology analysis primarily aimed at\nbeech and oak leaves. We evaluate the applicability of the dataset by training\nmachine learning models for leaf surface area prediction and semantic\nsegmentation, using real images for validation. Our validation shows that these\nmodels can be trained to predict leaf surface area with a relative error not\ngreater than an average human annotator. LAESI also provides an efficient\nframework based on 3D procedural models and generative AI for the large-scale,\ncontrollable generation of data with potential further applications in\nagriculture and biology. We evaluate the inclusion of generative AI in our\nprocedural data generation pipeline and show how data filtering based on\nannotation consistency results in datasets which allow training the highest\nperforming vision models.\n","authors":["Jacek Kałużny","Yannik Schreckenberg","Karol Cyganik","Peter Annighöfer","Sören Pirk","Dominik L. Michels","Mikolaj Cieslak","Farhah Assaad-Gerbert","Bedrich Benes","Wojciech Pałubicki"],"pdf_url":"https://arxiv.org/pdf/2404.00593v1.pdf","comment":"10 pages, 12 figures, 1 table"},{"id":"http://arxiv.org/abs/2404.00588v1","updated":"2024-03-31T07:30:41Z","published":"2024-03-31T07:30:41Z","title":"Memory-based Cross-modal Semantic Alignment Network for Radiology Report\n Generation","summary":" Generating radiology reports automatically reduces the workload of\nradiologists and helps the diagnoses of specific diseases. Many existing\nmethods take this task as modality transfer process. However, since the key\ninformation related to disease accounts for a small proportion in both image\nand report, it is hard for the model to learn the latent relation between the\nradiology image and its report, thus failing to generate fluent and accurate\nradiology reports. To tackle this problem, we propose a memory-based\ncross-modal semantic alignment model (MCSAM) following an encoder-decoder\nparadigm. MCSAM includes a well initialized long-term clinical memory bank to\nlearn disease-related representations as well as prior knowledge for different\nmodalities to retrieve and use the retrieved memory to perform feature\nconsolidation. To ensure the semantic consistency of the retrieved cross modal\nprior knowledge, a cross-modal semantic alignment module (SAM) is proposed. SAM\nis also able to generate semantic visual feature embeddings which can be added\nto the decoder and benefits report generation. More importantly, to memorize\nthe state and additional information while generating reports with the decoder,\nwe use learnable memory tokens which can be seen as prompts. 
Extensive\nexperiments demonstrate the promising performance of our proposed method which\ngenerates state-of-the-art performance on the MIMIC-CXR dataset.\n","authors":["Yitian Tao","Liyan Ma","Jing Yu","Han Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.00588v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.00578v1","updated":"2024-03-31T06:55:12Z","published":"2024-03-31T06:55:12Z","title":"M3D: Advancing 3D Medical Image Analysis with Multi-Modal Large Language\n Models","summary":" Medical image analysis is essential to clinical diagnosis and treatment,\nwhich is increasingly supported by multi-modal large language models (MLLMs).\nHowever, previous research has primarily focused on 2D medical images, leaving\n3D images under-explored, despite their richer spatial information. This paper\naims to advance 3D medical image analysis with MLLMs. To this end, we present a\nlarge-scale 3D multi-modal medical dataset, M3D-Data, comprising 120K\nimage-text pairs and 662K instruction-response pairs specifically tailored for\nvarious 3D medical tasks, such as image-text retrieval, report generation,\nvisual question answering, positioning, and segmentation. Additionally, we\npropose M3D-LaMed, a versatile multi-modal large language model for 3D medical\nimage analysis. Furthermore, we introduce a new 3D multi-modal medical\nbenchmark, M3D-Bench, which facilitates automatic evaluation across eight\ntasks. Through comprehensive evaluation, our method proves to be a robust model\nfor 3D medical image analysis, outperforming existing solutions. All code,\ndata, and models are publicly available at: https://github.com/BAAI-DCAI/M3D.\n","authors":["Fan Bai","Yuxin Du","Tiejun Huang","Max Q. -H. Meng","Bo Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.00578v1.pdf","comment":"MLLM, 3D medical image analysis"},{"id":"http://arxiv.org/abs/2404.00576v1","updated":"2024-03-31T06:38:08Z","published":"2024-03-31T06:38:08Z","title":"Automated Bi-Fold Weighted Ensemble Algorithms and its Application to\n Brain Tumor Detection and Classification","summary":" The uncontrolled and unstructured growth of brain cells is known as brain\ntumor, which has one of the highest mortality rates among diseases from all\ntypes of cancers. Due to limited diagnostic and treatment capabilities, they\npose significant challenges, especially in third-world countries. Early\ndiagnosis plays a vital role in effectively managing brain tumors and reducing\nmortality rates. However, the availability of diagnostic methods is hindered by\nvarious limitations, including high costs and lengthy result acquisition times,\nimpeding early detection of the disease. In this study, we present two\ncutting-edge bi-fold weighted voting ensemble models that aim to boost the\neffectiveness of weighted ensemble methods. These two proposed methods combine\nthe classification outcomes from multiple classifiers and determine the optimal\nresult by selecting the one with the highest probability in the first approach,\nand the highest weighted prediction in the second technique. These approaches\nsignificantly improve the overall performance of weighted ensemble techniques.\nIn the first proposed method, we improve the soft voting technique (SVT) by\nintroducing a novel unsupervised weight calculating schema (UWCS) to enhance\nits weight assigning capability, known as the extended soft voting technique\n(ESVT). Secondly, we propose a novel weighted method (NWM) by using the\nproposed UWCS. 
Both of our approaches incorporate three distinct models: a\ncustom-built CNN, VGG-16, and InceptionResNetV2 which has been trained on\npublicly available datasets. The effectiveness of our proposed systems is\nevaluated through blind testing, where exceptional results are achieved. We\nthen establish a comparative analysis of the performance of our proposed\nmethods with that of SVT to show their superiority and effectiveness.\n","authors":["PoTsang B. Huang","Muhammad Rizwan","Mehboob Ali"],"pdf_url":"https://arxiv.org/pdf/2404.00576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00563v1","updated":"2024-03-31T05:07:06Z","published":"2024-03-31T05:07:06Z","title":"Exploiting Inter-sample and Inter-feature Relations in Dataset\n Distillation","summary":" Dataset distillation has emerged as a promising approach in deep learning,\nenabling efficient training with small synthetic datasets derived from larger\nreal ones. Particularly, distribution matching-based distillation methods\nattract attention thanks to its effectiveness and low computational cost.\nHowever, these methods face two primary limitations: the dispersed feature\ndistribution within the same class in synthetic datasets, reducing class\ndiscrimination, and an exclusive focus on mean feature consistency, lacking\nprecision and comprehensiveness. To address these challenges, we introduce two\nnovel constraints: a class centralization constraint and a covariance matching\nconstraint. The class centralization constraint aims to enhance class\ndiscrimination by more closely clustering samples within classes. The\ncovariance matching constraint seeks to achieve more accurate feature\ndistribution matching between real and synthetic datasets through local feature\ncovariance matrices, particularly beneficial when sample sizes are much smaller\nthan the number of features. Experiments demonstrate notable improvements with\nthese constraints, yielding performance boosts of up to 6.6% on CIFAR10, 2.9%\non SVHN, 2.5% on CIFAR100, and 2.5% on TinyImageNet, compared to the\nstate-of-the-art relevant methods. In addition, our method maintains robust\nperformance in cross-architecture settings, with a maximum performance drop of\n1.7% on four architectures. Code is available at\nhttps://github.com/VincenDen/IID.\n","authors":["Wenxiao Deng","Wenbin Li","Tianyu Ding","Lei Wang","Hongguang Zhang","Kuihua Huang","Jing Huo","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2404.00563v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00558v1","updated":"2024-03-31T04:39:40Z","published":"2024-03-31T04:39:40Z","title":"GAN with Skip Patch Discriminator for Biological Electron Microscopy\n Image Generation","summary":" Generating realistic electron microscopy (EM) images has been a challenging\nproblem due to their complex global and local structures. Isola et al. proposed\npix2pix, a conditional Generative Adversarial Network (GAN), for the general\npurpose of image-to-image translation; which fails to generate realistic EM\nimages. 
We propose a new architecture for the discriminator in the GAN\nproviding access to multiple patch sizes using skip patches and generating\nrealistic EM images.\n","authors":["Nishith Ranjon Roy","Nailah Rawnaq","Tulin Kaman"],"pdf_url":"https://arxiv.org/pdf/2404.00558v1.pdf","comment":"4 pages, International Conference on Computational and Mathematical\n Biomedical Engineering"},{"id":"http://arxiv.org/abs/2404.00552v1","updated":"2024-03-31T03:53:45Z","published":"2024-03-31T03:53:45Z","title":"Comparison of Methods in Human Skin Decomposition","summary":" Decomposition of skin pigment plays an important role in medical fields.\nHuman skin can be decomposed into two primitive components, hemoglobin and\nmelanin. It is our goal to apply these results for diagnosis of skin cancer. In\nthis paper, various methods for skin pigment decomposition are reviewed\ncomparatively and the performance of each method is evaluated both\ntheoretically and experimentally. In addition, isometric feature mapping\n(Isomap) is introduced in order to improve the dimensionality reduction\nperformance in context of skin decomposition.\n","authors":["Hao Gong","Michel Desvignes"],"pdf_url":"https://arxiv.org/pdf/2404.00552v1.pdf","comment":"4 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.00549v1","updated":"2024-03-31T03:35:43Z","published":"2024-03-31T03:35:43Z","title":"Pneumonia App: a mobile application for efficient pediatric pneumonia\n diagnosis using explainable convolutional neural networks (CNN)","summary":" Mycoplasma pneumoniae pneumonia (MPP) poses significant diagnostic challenges\nin pediatric healthcare, especially in regions like China where it's prevalent.\nWe introduce PneumoniaAPP, a mobile application leveraging deep learning\ntechniques for rapid MPP detection. Our approach capitalizes on convolutional\nneural networks (CNNs) trained on a comprehensive dataset comprising 3345 chest\nX-ray (CXR) images, which includes 833 CXR images revealing MPP and\nadditionally augmented with samples from a public dataset. The CNN model\nachieved an accuracy of 88.20% and an AUROC of 0.9218 across all classes, with\na specific accuracy of 97.64% for the mycoplasma class, as demonstrated on the\ntesting dataset. Furthermore, we integrated explainability techniques into\nPneumoniaAPP to aid respiratory physicians in lung opacity localization. Our\ncontribution extends beyond existing research by targeting pediatric MPP,\nemphasizing the age group of 0-12 years, and prioritizing deployment on mobile\ndevices. This work signifies a significant advancement in pediatric pneumonia\ndiagnosis, offering a reliable and accessible tool to alleviate diagnostic\nburdens in healthcare settings.\n","authors":["Jiaming Deng","Zhenglin Chen","Minjiang Chen","Lulu Xu","Jiaqi Yang","Zhendong Luo","Peiwu Qin"],"pdf_url":"https://arxiv.org/pdf/2404.00549v1.pdf","comment":"27 Pages,7 figures"},{"id":"http://arxiv.org/abs/2404.00548v1","updated":"2024-03-31T03:30:37Z","published":"2024-03-31T03:30:37Z","title":"Denoising Distillation Makes Event-Frame Transformers as Accurate Gaze\n Trackers","summary":" This paper tackles the problem of passive gaze estimation using both event\nand frame data. Considering inherently different physiological structures, it's\nintractable to accurately estimate purely based on a given state. Thus, we\nreformulate the gaze estimation as the quantification of state transitions from\nthe current state to several prior registered anchor states. 
Technically, we\npropose a two-stage learning-based gaze estimation framework to divide the\nwhole gaze estimation process into a coarse-to-fine process of anchor state\nselection and final gaze location. Moreover, to improve generalization ability,\nwe align a group of local experts with a student network, where a novel\ndenoising distillation algorithm is introduced to utilize denoising diffusion\ntechnique to iteratively remove inherent noise of event data. Extensive\nexperiments demonstrate the effectiveness of the proposed method, which greatly\nsurpasses state-of-the-art methods by a large extent of 15$\\%$. The code will\nbe publicly available at\nhttps://github.com/jdjdli/Denoise_distill_EF_gazetracker.\n","authors":["Jiading Li","Zhiyu Zhu","Jinhui Hou","Junhui Hou","Jinjian Wu"],"pdf_url":"https://arxiv.org/pdf/2404.00548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00546v1","updated":"2024-03-31T03:24:48Z","published":"2024-03-31T03:24:48Z","title":"On the Estimation of Image-matching Uncertainty in Visual Place\n Recognition","summary":" In Visual Place Recognition (VPR) the pose of a query image is estimated by\ncomparing the image to a map of reference images with known reference poses. As\nis typical for image retrieval problems, a feature extractor maps the query and\nreference images to a feature space, where a nearest neighbor search is then\nperformed. However, till recently little attention has been given to\nquantifying the confidence that a retrieved reference image is a correct match.\nHighly certain but incorrect retrieval can lead to catastrophic failure of\nVPR-based localization pipelines. This work compares for the first time the\nmain approaches for estimating the image-matching uncertainty, including the\ntraditional retrieval-based uncertainty estimation, more recent data-driven\naleatoric uncertainty estimation, and the compute-intensive geometric\nverification. We further formulate a simple baseline method, ``SUE'', which\nunlike the other methods considers the freely-available poses of the reference\nimages in the map. Our experiments reveal that a simple L2-distance between the\nquery and reference descriptors is already a better estimate of image-matching\nuncertainty than current data-driven approaches. SUE outperforms the other\nefficient uncertainty estimation methods, and its uncertainty estimates\ncomplement the computationally expensive geometric verification approach.\nFuture works for uncertainty estimation in VPR should consider the baselines\ndiscussed in this work.\n","authors":["Mubariz Zaffar","Liangliang Nan","Julian F. P. Kooij"],"pdf_url":"https://arxiv.org/pdf/2404.00546v1.pdf","comment":"To appear in the proceedings of the IEEE/CVF Conference on Computer\n Vision and Pattern Recognition (CVPR) 2024"},{"id":"http://arxiv.org/abs/2404.00544v1","updated":"2024-03-31T03:16:08Z","published":"2024-03-31T03:16:08Z","title":"Deep Extrinsic Manifold Representation for Vision Tasks","summary":" Non-Euclidean data is frequently encountered across different fields, yet\nthere is limited literature that addresses the fundamental challenge of\ntraining neural networks with manifold representations as outputs. We introduce\nthe trick named Deep Extrinsic Manifold Representation (DEMR) for visual tasks\nin this context. DEMR incorporates extrinsic manifold embedding into deep\nneural networks, which helps generate manifold representations. The DEMR\napproach does not directly optimize the complex geodesic loss. 
Instead, it\nfocuses on optimizing the computation graph within the embedded Euclidean\nspace, allowing for adaptability to various architectural requirements. We\nprovide empirical evidence supporting the proposed concept on two types of\nmanifolds, $SE(3)$ and its associated quotient manifolds. This evidence offers\ntheoretical assurances regarding feasibility, asymptotic properties, and\ngeneralization capability. The experimental results show that DEMR effectively\nadapts to point cloud alignment, producing outputs in $ SE(3) $, as well as in\nillumination subspace learning with outputs on the Grassmann manifold.\n","authors":["Tongtong Zhang","Xian Wei","Yuanxiang Li"],"pdf_url":"https://arxiv.org/pdf/2404.00544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00540v1","updated":"2024-03-31T03:02:35Z","published":"2024-03-31T03:02:35Z","title":"Embodied Active Defense: Leveraging Recurrent Feedback to Counter\n Adversarial Patches","summary":" The vulnerability of deep neural networks to adversarial patches has\nmotivated numerous defense strategies for boosting model robustness. However,\nthe prevailing defenses depend on single observation or pre-established\nadversary information to counter adversarial patches, often failing to be\nconfronted with unseen or adaptive adversarial attacks and easily exhibiting\nunsatisfying performance in dynamic 3D environments. Inspired by active human\nperception and recurrent feedback mechanisms, we develop Embodied Active\nDefense (EAD), a proactive defensive strategy that actively contextualizes\nenvironmental information to address misaligned adversarial patches in 3D\nreal-world settings. To achieve this, EAD develops two central recurrent\nsub-modules, i.e., a perception module and a policy module, to implement two\ncritical functions of active vision. These models recurrently process a series\nof beliefs and observations, facilitating progressive refinement of their\ncomprehension of the target object and enabling the development of strategic\nactions to counter adversarial patches in 3D environments. To optimize learning\nefficiency, we incorporate a differentiable approximation of environmental\ndynamics and deploy patches that are agnostic to the adversary strategies.\nExtensive experiments demonstrate that EAD substantially enhances robustness\nagainst a variety of patches within just a few steps through its action policy\nin safety-critical tasks (e.g., face recognition and object detection), without\ncompromising standard accuracy. Furthermore, due to the attack-agnostic\ncharacteristic, EAD facilitates excellent generalization to unseen attacks,\ndiminishing the averaged attack success rate by 95 percent across a range of\nunseen adversarial attacks.\n","authors":["Lingxuan Wu","Xiao Yang","Yinpeng Dong","Liuwei Xie","Hang Su","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.00540v1.pdf","comment":"27pages"},{"id":"http://arxiv.org/abs/1910.07655v4","updated":"2024-03-31T02:57:09Z","published":"2019-10-16T06:35:50Z","title":"Deep Semantic Segmentation of Natural and Medical Images: A Review","summary":" The semantic image segmentation task consists of classifying each pixel of an\nimage into an instance, where each instance corresponds to a class. This task\nis a part of the concept of scene understanding or better explaining the global\ncontext of an image. In the medical image analysis domain, image segmentation\ncan be used for image-guided interventions, radiotherapy, or improved\nradiological diagnostics. 
In this review, we categorize the leading deep\nlearning-based medical and non-medical image segmentation solutions into six\nmain groups of deep architectural, data synthesis-based, loss function-based,\nsequenced models, weakly supervised, and multi-task methods and provide a\ncomprehensive review of the contributions in each of these groups. Further, for\neach group, we analyze each variant of these groups and discuss the limitations\nof the current approaches and present potential future research directions for\nsemantic image segmentation.\n","authors":["Saeid Asgari Taghanaki","Kumar Abhishek","Joseph Paul Cohen","Julien Cohen-Adad","Ghassan Hamarneh"],"pdf_url":"https://arxiv.org/pdf/1910.07655v4.pdf","comment":"45 pages, 16 figures. Accepted for publication in Springer Artificial\n Intelligence Review"},{"id":"http://arxiv.org/abs/2404.00532v1","updated":"2024-03-31T02:16:16Z","published":"2024-03-31T02:16:16Z","title":"LLMs are Good Action Recognizers","summary":" Skeleton-based action recognition has attracted lots of research attention.\nRecently, to build an accurate skeleton-based action recognizer, a variety of\nworks have been proposed. Among them, some works use large model architectures\nas backbones of their recognizers to boost the skeleton data representation\ncapability, while some other works pre-train their recognizers on external data\nto enrich the knowledge. In this work, we observe that large language models\nwhich have been extensively used in various natural language processing tasks\ngenerally hold both large model architectures and rich implicit knowledge.\nMotivated by this, we propose a novel LLM-AR framework, in which we investigate\ntreating the Large Language Model as an Action Recognizer. In our framework, we\npropose a linguistic projection process to project each input action signal\n(i.e., each skeleton sequence) into its ``sentence format'' (i.e., an ``action\nsentence''). Moreover, we also incorporate our framework with several designs\nto further facilitate this linguistic projection process. Extensive experiments\ndemonstrate the efficacy of our proposed framework.\n","authors":["Haoxuan Qu","Yujun Cai","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2404.00532v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00524v1","updated":"2024-03-31T01:58:04Z","published":"2024-03-31T01:58:04Z","title":"TexVocab: Texture Vocabulary-conditioned Human Avatars","summary":" To adequately utilize the available image evidence in multi-view video-based\navatar modeling, we propose TexVocab, a novel avatar representation that\nconstructs a texture vocabulary and associates body poses with texture maps for\nanimation. Given multi-view RGB videos, our method initially back-projects all\nthe available images in the training videos to the posed SMPL surface,\nproducing texture maps in the SMPL UV domain. Then we construct pairs of human\nposes and texture maps to establish a texture vocabulary for encoding dynamic\nhuman appearances under various poses. Unlike the commonly used joint-wise\nmanner, we further design a body-part-wise encoding strategy to learn the\nstructural effects of the kinematic chain. Given a driving pose, we query the\npose feature hierarchically by decomposing the pose vector into several body\nparts and interpolating the texture features for synthesizing fine-grained\nhuman dynamics. 
Overall, our method is able to create animatable human avatars\nwith detailed and dynamic appearances from RGB videos, and the experiments show\nthat our method outperforms state-of-the-art approaches. The project page can\nbe found at https://texvocab.github.io/.\n","authors":["Yuxiao Liu","Zhe Li","Yebin Liu","Haoqian Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00524v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00513v1","updated":"2024-03-31T01:20:16Z","published":"2024-03-31T01:20:16Z","title":"Transformer based Pluralistic Image Completion with Reduced Information\n Loss","summary":" Transformer based methods have achieved great success in image inpainting\nrecently. However, we find that these solutions regard each pixel as a token,\nthus suffering from an information loss issue from two aspects: 1) They\ndownsample the input image into much lower resolutions for efficiency\nconsideration. 2) They quantize $256^3$ RGB values to a small number (such as\n512) of quantized color values. The indices of quantized pixels are used as\ntokens for the inputs and prediction targets of the transformer. To mitigate\nthese issues, we propose a new transformer based framework called \"PUT\".\nSpecifically, to avoid input downsampling while maintaining computation\nefficiency, we design a patch-based auto-encoder P-VQVAE. The encoder converts\nthe masked image into non-overlapped patch tokens and the decoder recovers the\nmasked regions from the inpainted tokens while keeping the unmasked regions\nunchanged. To eliminate the information loss caused by input quantization, an\nUn-quantized Transformer is applied. It directly takes features from the\nP-VQVAE encoder as input without any quantization and only regards the\nquantized tokens as prediction targets. Furthermore, to make the inpainting\nprocess more controllable, we introduce semantic and structural conditions as\nextra guidance. Extensive experiments show that our method greatly outperforms\nexisting transformer based methods on image fidelity and achieves much higher\ndiversity and better fidelity than state-of-the-art pluralistic inpainting\nmethods on complex large-scale datasets (e.g., ImageNet). Codes are available\nat https://github.com/liuqk3/PUT.\n","authors":["Qiankun Liu","Yuqi Jiang","Zhentao Tan","Dongdong Chen","Ying Fu","Qi Chu","Gang Hua","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2404.00513v1.pdf","comment":"Accepted by TPAMI (2024)"},{"id":"http://arxiv.org/abs/2403.17719v2","updated":"2024-03-31T01:14:36Z","published":"2024-03-25T05:21:26Z","title":"Resolution Limit of Single-Photon LiDAR","summary":" Single-photon Light Detection and Ranging (LiDAR) systems are often equipped\nwith an array of detectors for improved spatial resolution and sensing speed.\nHowever, given a fixed amount of flux produced by the laser transmitter across\nthe scene, the per-pixel Signal-to-Noise Ratio (SNR) will decrease when more\npixels are packed in a unit space. This presents a fundamental trade-off\nbetween the spatial resolution of the sensor array and the SNR received at each\npixel. Theoretical characterization of this fundamental limit is explored. By\nderiving the photon arrival statistics and introducing a series of new\napproximation techniques, the Mean Squared Error (MSE) of the\nmaximum-likelihood estimator of the time delay is derived. The theoretical\npredictions align well with simulations and real data.\n","authors":["Stanley H. Chan","Hashan K. 
Weerasooriya","Weijian Zhang","Pamela Abshire","Istvan Gyongy","Robert K. Henderson"],"pdf_url":"https://arxiv.org/pdf/2403.17719v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00510v1","updated":"2024-03-31T01:05:28Z","published":"2024-03-31T01:05:28Z","title":"Denoising Low-dose Images Using Deep Learning of Time Series Images","summary":" Digital image devices have been widely applied in many fields, including\nscientific imaging, recognition of individuals, and remote sensing. As the\napplication of these imaging technologies to autonomous driving and\nmeasurement, image noise generated when observation cannot be performed with a\nsufficient dose has become a major problem. Machine learning denoise technology\nis expected to be the solver of this problem, but there are the following\nproblems. Here we report, artifacts generated by machine learning denoise in\nultra-low dose observation using an in-situ observation video of an electron\nmicroscope as an example. And as a method to solve this problem, we propose a\nmethod to decompose a time series image into a 2D image of the spatial axis and\ntime to perform machine learning denoise. Our method opens new avenues accurate\nand stable reconstruction of continuous high-resolution images from low-dose\nimaging in science, industry, and life.\n","authors":["Yang Shao","Toshie Yaguchi","Toshiaki Tanigaki"],"pdf_url":"https://arxiv.org/pdf/2404.00510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00509v1","updated":"2024-03-31T00:59:10Z","published":"2024-03-31T00:59:10Z","title":"DailyMAE: Towards Pretraining Masked Autoencoders in One Day","summary":" Recently, masked image modeling (MIM), an important self-supervised learning\n(SSL) method, has drawn attention for its effectiveness in learning data\nrepresentation from unlabeled data. Numerous studies underscore the advantages\nof MIM, highlighting how models pretrained on extensive datasets can enhance\nthe performance of downstream tasks. However, the high computational demands of\npretraining pose significant challenges, particularly within academic\nenvironments, thereby impeding the SSL research progress. In this study, we\npropose efficient training recipes for MIM based SSL that focuses on mitigating\ndata loading bottlenecks and employing progressive training techniques and\nother tricks to closely maintain pretraining performance. Our library enables\nthe training of a MAE-Base/16 model on the ImageNet 1K dataset for 800 epochs\nwithin just 18 hours, using a single machine equipped with 8 A100 GPUs. By\nachieving speed gains of up to 5.8 times, this work not only demonstrates the\nfeasibility of conducting high-efficiency SSL training but also paves the way\nfor broader accessibility and promotes advancement in SSL research particularly\nfor prototyping and initial testing of SSL ideas. The code is available in\nhttps://github.com/erow/FastSSL.\n","authors":["Jiantao Wu","Shentong Mo","Sara Atito","Zhenhua Feng","Josef Kittler","Muhammad Awais"],"pdf_url":"https://arxiv.org/pdf/2404.00509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00504v1","updated":"2024-03-31T00:20:53Z","published":"2024-03-31T00:20:53Z","title":"NYC-Indoor-VPR: A Long-Term Indoor Visual Place Recognition Dataset with\n Semi-Automatic Annotation","summary":" Visual Place Recognition (VPR) in indoor environments is beneficial to humans\nand robots for better localization and navigation. 
It is challenging due to\nappearance changes at various frequencies, and difficulties of obtaining ground\ntruth metric trajectories for training and evaluation. This paper introduces\nthe NYC-Indoor-VPR dataset, a unique and rich collection of over 36,000 images\ncompiled from 13 distinct crowded scenes in New York City taken under varying\nlighting conditions with appearance changes. Each scene has multiple revisits\nacross a year. To establish the ground truth for VPR, we propose a\nsemiautomatic annotation approach that computes the positional information of\neach image. Our method specifically takes pairs of videos as input and yields\nmatched pairs of images along with their estimated relative locations. The\naccuracy of this matching is refined by human annotators, who utilize our\nannotation software to correlate the selected keyframes. Finally, we present a\nbenchmark evaluation of several state-of-the-art VPR algorithms using our\nannotated dataset, revealing its challenge and thus value for VPR research.\n","authors":["Diwei Sheng","Anbang Yang","John-Ross Rizzo","Chen Feng"],"pdf_url":"https://arxiv.org/pdf/2404.00504v1.pdf","comment":"7 pages, 7 figures, published in 2024 IEEE International Conference\n on Robotics and Automation (ICRA 2024)"}]},"2024-03-30T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2402.03769v2","updated":"2024-03-30T23:04:10Z","published":"2024-02-06T07:22:50Z","title":"AttackNet: Enhancing Biometric Security via Tailored Convolutional\n Neural Network Architectures for Liveness Detection","summary":" Biometric security is the cornerstone of modern identity verification and\nauthentication systems, where the integrity and reliability of biometric\nsamples is of paramount importance. This paper introduces AttackNet, a bespoke\nConvolutional Neural Network architecture, meticulously designed to combat\nspoofing threats in biometric systems. Rooted in deep learning methodologies,\nthis model offers a layered defense mechanism, seamlessly transitioning from\nlow-level feature extraction to high-level pattern discernment. Three\ndistinctive architectural phases form the crux of the model, each underpinned\nby judiciously chosen activation functions, normalization techniques, and\ndropout layers to ensure robustness and resilience against adversarial attacks.\nBenchmarking our model across diverse datasets affirms its prowess, showcasing\nsuperior performance metrics in comparison to contemporary models. Furthermore,\na detailed comparative analysis accentuates the model's efficacy, drawing\nparallels with prevailing state-of-the-art methodologies. Through iterative\nrefinement and an informed architectural strategy, AttackNet underscores the\npotential of deep learning in safeguarding the future of biometric security.\n","authors":["Oleksandr Kuznetsov","Dmytro Zakharov","Emanuele Frontoni","Andrea Maranesi"],"pdf_url":"https://arxiv.org/pdf/2402.03769v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09373v3","updated":"2024-03-30T22:10:36Z","published":"2023-03-16T15:01:50Z","title":"MAPSeg: Unified Unsupervised Domain Adaptation for Heterogeneous Medical\n Image Segmentation Based on 3D Masked Autoencoding and Pseudo-Labeling","summary":" Robust segmentation is critical for deriving quantitative measures from\nlarge-scale, multi-center, and longitudinal medical scans. Manually annotating\nmedical scans, however, is expensive and labor-intensive and may not always be\navailable in every domain. 
Unsupervised domain adaptation (UDA) is a\nwell-studied technique that alleviates this label-scarcity problem by\nleveraging available labels from another domain. In this study, we introduce\nMasked Autoencoding and Pseudo-Labeling Segmentation (MAPSeg), a\n$\\textbf{unified}$ UDA framework with great versatility and superior\nperformance for heterogeneous and volumetric medical image segmentation. To the\nbest of our knowledge, this is the first study that systematically reviews and\ndevelops a framework to tackle four different domain shifts in medical image\nsegmentation. More importantly, MAPSeg is the first framework that can be\napplied to $\\textbf{centralized}$, $\\textbf{federated}$, and\n$\\textbf{test-time}$ UDA while maintaining comparable performance. We compare\nMAPSeg with previous state-of-the-art methods on a private infant brain MRI\ndataset and a public cardiac CT-MRI dataset, and MAPSeg outperforms others by a\nlarge margin (10.5 Dice improvement on the private MRI dataset and 5.7 on the\npublic CT-MRI dataset). MAPSeg poses great practical value and can be applied\nto real-world problems. GitHub: https://github.com/XuzheZ/MAPSeg/.\n","authors":["Xuzhe Zhang","Yuhao Wu","Elsa Angelini","Ang Li","Jia Guo","Jerod M. Rasmussen","Thomas G. O'Connor","Pathik D. Wadhwa","Andrea Parolin Jackowski","Hai Li","Jonathan Posner","Andrew F. Laine","Yun Wang"],"pdf_url":"https://arxiv.org/pdf/2303.09373v3.pdf","comment":"CVPR 2024 camera-ready (8 pages, 3 figures) with the supplemental\n materials (5 pages, 4 figures). Xuzhe Zhang and Yuhao Wu are co-first\n authors. Andrew F. Laine and Yun Wang are co-senior supervising authors"},{"id":"http://arxiv.org/abs/2310.07889v2","updated":"2024-03-30T22:00:22Z","published":"2023-10-11T20:52:30Z","title":"LangNav: Language as a Perceptual Representation for Navigation","summary":" We explore the use of language as a perceptual representation for\nvision-and-language navigation (VLN), with a focus on low-data settings. Our\napproach uses off-the-shelf vision systems for image captioning and object\ndetection to convert an agent's egocentric panoramic view at each time step\ninto natural language descriptions. We then finetune a pretrained language\nmodel to select an action, based on the current view and the trajectory\nhistory, that would best fulfill the navigation instructions. In contrast to\nthe standard setup which adapts a pretrained language model to work directly\nwith continuous visual features from pretrained vision models, our approach\ninstead uses (discrete) language as the perceptual representation. We explore\nseveral use cases of our language-based navigation (LangNav) approach on the\nR2R VLN benchmark: generating synthetic trajectories from a prompted language\nmodel (GPT-4) with which to finetune a smaller language model; domain transfer\nwhere we transfer a policy learned on one simulated environment (ALFRED) to\nanother (more realistic) environment (R2R); and combining both vision- and\nlanguage-based representations for VLN. 
Our approach is found to improve upon\nbaselines that rely on visual features in settings where only a few expert\ntrajectories (10-100) are available, demonstrating the potential of language as\na perceptual representation for navigation.\n","authors":["Bowen Pan","Rameswar Panda","SouYoung Jin","Rogerio Feris","Aude Oliva","Phillip Isola","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2310.07889v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17791v2","updated":"2024-03-30T20:51:33Z","published":"2023-11-29T16:35:24Z","title":"U-Net v2: Rethinking the Skip Connections of U-Net for Medical Image\n Segmentation","summary":" In this paper, we introduce U-Net v2, a new robust and efficient U-Net\nvariant for medical image segmentation. It aims to augment the infusion of\nsemantic information into low-level features while simultaneously refining\nhigh-level features with finer details. For an input image, we begin by\nextracting multi-level features with a deep neural network encoder. Next, we\nenhance the feature map of each level by infusing semantic information from\nhigher-level features and integrating finer details from lower-level features\nthrough Hadamard product. Our novel skip connections empower features of all\nthe levels with enriched semantic characteristics and intricate details. The\nimproved features are subsequently transmitted to the decoder for further\nprocessing and segmentation. Our method can be seamlessly integrated into any\nEncoder-Decoder network. We evaluate our method on several public medical image\nsegmentation datasets for skin lesion segmentation and polyp segmentation, and\nthe experimental results demonstrate the segmentation accuracy of our new\nmethod over state-of-the-art methods, while preserving memory and computational\nefficiency. Code is available at: https://github.com/yaoppeng/U-Net_v2\n","authors":["Yaopeng Peng","Milan Sonka","Danny Z. Chen"],"pdf_url":"https://arxiv.org/pdf/2311.17791v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13659v2","updated":"2024-03-30T20:14:03Z","published":"2024-03-20T15:08:43Z","title":"Recursive Joint Cross-Modal Attention for Multimodal Fusion in\n Dimensional Emotion Recognition","summary":" Though multimodal emotion recognition has achieved significant progress over\nrecent years, the potential of rich synergic relationships across the\nmodalities is not fully exploited. In this paper, we introduce Recursive Joint\nCross-Modal Attention (RJCMA) to effectively capture both intra-and inter-modal\nrelationships across audio, visual and text modalities for dimensional emotion\nrecognition. In particular, we compute the attention weights based on\ncross-correlation between the joint audio-visual-text feature representations\nand the feature representations of individual modalities to simultaneously\ncapture intra- and inter-modal relationships across the modalities. The\nattended features of the individual modalities are again fed as input to the\nfusion model in a recursive mechanism to obtain more refined feature\nrepresentations. We have also explored Temporal Convolutional Networks (TCNs)\nto improve the temporal modeling of the feature representations of individual\nmodalities. Extensive experiments are conducted to evaluate the performance of\nthe proposed fusion model on the challenging Affwild2 dataset. 
By effectively\ncapturing the synergic intra- and inter-modal relationships across audio,\nvisual and text modalities, the proposed fusion model achieves a Concordance\nCorrelation Coefficient (CCC) of 0.585 (0.542) and 0.659 (0.619) for valence\nand arousal respectively on the validation set (test set). This shows a\nsignificant improvement over the baseline of 0.24 (0.211) and 0.20 (0.191) for\nvalence and arousal respectively on the validation set (test set) of the\nvalence-arousal challenge of 6th Affective Behavior Analysis in-the-Wild (ABAW)\ncompetition.\n","authors":["R. Gnana Praveen","Jahangir Alam"],"pdf_url":"https://arxiv.org/pdf/2403.13659v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09383v3","updated":"2024-03-30T18:22:34Z","published":"2023-03-16T15:13:09Z","title":"Unifying Top-down and Bottom-up Scanpath Prediction Using Transformers","summary":" Most models of visual attention aim at predicting either top-down or\nbottom-up control, as studied using different visual search and free-viewing\ntasks. In this paper we propose the Human Attention Transformer (HAT), a single\nmodel that predicts both forms of attention control. HAT uses a novel\ntransformer-based architecture and a simplified foveated retina that\ncollectively create a spatio-temporal awareness akin to the dynamic visual\nworking memory of humans. HAT not only establishes a new state-of-the-art in\npredicting the scanpath of fixations made during target-present and\ntarget-absent visual search and ``taskless'' free viewing, but also makes human\ngaze behavior interpretable. Unlike previous methods that rely on a coarse grid\nof fixation cells and experience information loss due to fixation\ndiscretization, HAT features a sequential dense prediction architecture and\noutputs a dense heatmap for each fixation, thus avoiding discretizing\nfixations. HAT sets a new standard in computational attention, which emphasizes\neffectiveness, generality, and interpretability. HAT's demonstrated scope and\napplicability will likely inspire the development of new attention models that\ncan better predict human behavior in various attention-demanding scenarios.\nCode is available at https://github.com/cvlab-stonybrook/HAT.\n","authors":["Zhibo Yang","Sounak Mondal","Seoyoung Ahn","Ruoyu Xue","Gregory Zelinsky","Minh Hoai","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2303.09383v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2402.05079v2","updated":"2024-03-30T17:51:35Z","published":"2024-02-07T18:33:04Z","title":"Mamba-UNet: UNet-Like Pure Visual Mamba for Medical Image Segmentation","summary":" In recent advancements in medical image analysis, Convolutional Neural\nNetworks (CNN) and Vision Transformers (ViT) have set significant benchmarks.\nWhile the former excels in capturing local features through its convolution\noperations, the latter achieves remarkable global context understanding by\nleveraging self-attention mechanisms. However, both architectures exhibit\nlimitations in efficiently modeling long-range dependencies within medical\nimages, which is a critical aspect for precise segmentation. Inspired by the\nMamba architecture, known for its proficiency in handling long sequences and\nglobal contextual information with enhanced computational efficiency as a State\nSpace Model (SSM), we propose Mamba-UNet, a novel architecture that synergizes\nthe U-Net in medical image segmentation with Mamba's capability. 
Mamba-UNet\nadopts a pure Visual Mamba (VMamba)-based encoder-decoder structure, infused\nwith skip connections to preserve spatial information across different scales\nof the network. This design facilitates a comprehensive feature learning\nprocess, capturing intricate details and broader semantic contexts within\nmedical images. We introduce a novel integration mechanism within the VMamba\nblocks to ensure seamless connectivity and information flow between the encoder\nand decoder paths, enhancing the segmentation performance. We conducted\nexperiments on publicly available ACDC MRI Cardiac segmentation dataset, and\nSynapse CT Abdomen segmentation dataset. The results show that Mamba-UNet\noutperforms several types of UNet in medical image segmentation under the same\nhyper-parameter setting. The source code and baseline implementations are\navailable.\n","authors":["Ziyang Wang","Jian-Qing Zheng","Yichi Zhang","Ge Cui","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2402.05079v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18216v2","updated":"2024-03-30T17:00:18Z","published":"2023-05-29T17:00:40Z","title":"Towards minimizing efforts for Morphing Attacks -- Deep embeddings for\n morphing pair selection and improved Morphing Attack Detection","summary":" Face Morphing Attacks pose a threat to the security of identity documents,\nespecially with respect to a subsequent access control process, because it\nenables both individuals involved to exploit the same document. In this study,\nface embeddings serve two purposes: pre-selecting images for large-scale\nMorphing Attack generation and detecting potential Morphing Attacks. We build\nupon previous embedding studies in both use cases using the MagFace model. For\nthe first objective, we employ an pre-selection algorithm that pairs\nindividuals based on face embedding similarity. We quantify the attack\npotential of differently morphed face images to compare the usability of\npre-selection in automatically generating numerous successful Morphing Attacks.\nRegarding the second objective, we compare embeddings from two state-of-the-art\nface recognition systems in terms of their ability to detect Morphing Attacks.\nOur findings demonstrate that ArcFace and MagFace provide valuable face\nembeddings for image pre-selection. Both open-source and COTS face recognition\nsystems are susceptible to generated attacks, particularly when pre-selection\nis based on embeddings rather than random pairing which was only constrained by\nsoft biometrics. More accurate face recognition systems exhibit greater\nvulnerability to attacks, with COTS systems being the most susceptible.\nAdditionally, MagFace embeddings serve as a robust alternative for detecting\nmorphed face images compared to the previously used ArcFace embeddings. The\nresults endorse the advantages of face embeddings in more effective image\npre-selection for face morphing and accurate detection of morphed face images.\nThis is supported by extensive analysis of various designed attacks. 
The\nMagFace model proves to be a powerful alternative to the commonly used ArcFace\nmodel for both objectives, pre-selection and attack detection.\n","authors":["Roman Kessler","Kiran Raja","Juan Tapia","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2305.18216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19314v2","updated":"2024-03-30T16:36:17Z","published":"2024-03-28T11:12:33Z","title":"Total-Decom: Decomposed 3D Scene Reconstruction with Minimal Interaction","summary":" Scene reconstruction from multi-view images is a fundamental problem in\ncomputer vision and graphics. Recent neural implicit surface reconstruction\nmethods have achieved high-quality results; however, editing and manipulating\nthe 3D geometry of reconstructed scenes remains challenging due to the absence\nof naturally decomposed object entities and complex object/background\ncompositions. In this paper, we present Total-Decom, a novel method for\ndecomposed 3D reconstruction with minimal human interaction. Our approach\nseamlessly integrates the Segment Anything Model (SAM) with hybrid\nimplicit-explicit neural surface representations and a mesh-based\nregion-growing technique for accurate 3D object decomposition. Total-Decom\nrequires minimal human annotations while providing users with real-time control\nover the granularity and quality of decomposition. We extensively evaluate our\nmethod on benchmark datasets and demonstrate its potential for downstream\napplications, such as animation and scene editing. The code is available at\nhttps://github.com/CVMI-Lab/Total-Decom.git.\n","authors":["Xiaoyang Lyu","Chirui Chang","Peng Dai","Yang-Tian Sun","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2403.19314v2.pdf","comment":"8 pages, 7 figures, accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2402.10038v2","updated":"2024-03-30T16:10:47Z","published":"2024-02-15T16:00:58Z","title":"RS-DPO: A Hybrid Rejection Sampling and Direct Preference Optimization\n Method for Alignment of Large Language Models","summary":" Reinforcement learning from human feedback (RLHF) has been extensively\nemployed to align large language models with user intent. However, proximal\npolicy optimization (PPO) based RLHF is occasionally unstable requiring\nsignificant hyperparameter finetuning, and computationally expensive to\nmaximize the estimated reward during alignment. Recently, direct preference\noptimization (DPO) is proposed to address those challenges. However, DPO relies\non contrastive responses generated from human annotator and alternative LLM,\ninstead of the policy model, limiting the effectiveness of the RLHF. In this\npaper, we addresses both challenges by systematically combining rejection\nsampling (RS) and DPO. Our proposed method, RS-DPO, initiates with the\ndevelopment of a supervised fine-tuned policy model (SFT). A varied set of k\nresponses per prompt are sampled directly from the SFT model. RS-DPO identifies\npairs of contrastive samples based on their reward distribution. Finally, we\napply DPO with the contrastive samples to align the model to human preference.\nOur experiments indicate that our proposed method effectively fine-tunes LLMs\nwith limited resource environments, leading to improved alignment with user\nintent. 
Furthermore, it outperforms existing methods, including RS, PPO, and\nDPO.\n","authors":["Saeed Khaki","JinJin Li","Lan Ma","Liu Yang","Prathap Ramachandra"],"pdf_url":"https://arxiv.org/pdf/2402.10038v2.pdf","comment":"16 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.10509v2","updated":"2024-03-30T16:09:14Z","published":"2023-08-21T06:50:29Z","title":"An Examination of the Compositionality of Large Generative\n Vision-Language Models","summary":" With the success of Large Language Models (LLMs), many Generative\nVision-Language Models (GVLMs) have been constructed via multimodal instruction\ntuning. However, the performance of GVLMs in multimodal compositional reasoning\nremains under-explored. In this paper, we examine both the evaluation metrics\n(VisualGPTScore, etc.) and current benchmarks for evaluating the\ncompositionality of GVLMs. We identify the syntactical bias in current\nbenchmarks, which is exploited by the linguistic capability of GVLMs. The bias\nrenders VisualGPTScore an insufficient metric for assessing GVLMs. To combat\nthis, we first introduce a SyntaxBias Score, leveraging LLMs to quantify such\nbias for mitigation. A challenging new task is subsequently added to evaluate\nthe robustness of GVLMs against inherent inclination toward syntactical\ncorrectness. Using the bias-mitigated datasets and the new task, we propose a\nnovel benchmark, namely SyntActically DE-biased benchmark (SADE). Our study\nprovides an unbiased benchmark for the compositionality of GVLMs, facilitating\nfuture research in this direction (Code and dataset are available at\nhttps://github.com/TeleeMa/SADE).\n","authors":["Teli Ma","Rong Li","Junwei Liang"],"pdf_url":"https://arxiv.org/pdf/2308.10509v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05471v2","updated":"2024-03-30T16:06:36Z","published":"2023-07-11T17:56:22Z","title":"Scale Alone Does not Improve Mechanistic Interpretability in Vision\n Models","summary":" In light of the recent widespread adoption of AI systems, understanding the\ninternal information processing of neural networks has become increasingly\ncritical. Most recently, machine vision has seen remarkable progress by scaling\nneural networks to unprecedented levels in dataset and model size. We here ask\nwhether this extraordinary increase in scale also positively impacts the field\nof mechanistic interpretability. In other words, has our understanding of the\ninner workings of scaled neural networks improved as well? We use a\npsychophysical paradigm to quantify one form of mechanistic interpretability\nfor a diverse suite of nine models and find no scaling effect for\ninterpretability - neither for model nor dataset size. Specifically, none of\nthe investigated state-of-the-art models are easier to interpret than the\nGoogLeNet model from almost a decade ago. Latest-generation vision models\nappear even less interpretable than older architectures, hinting at a\nregression rather than improvement, with modern models sacrificing\ninterpretability for accuracy. These results highlight the need for models\nexplicitly designed to be mechanistically interpretable and the need for more\nhelpful interpretability methods to increase our understanding of networks at\nan atomic level. 
We release a dataset containing more than 130'000 human\nresponses from our psychophysical evaluation of 767 units across nine models.\nThis dataset facilitates research on automated instead of human-based\ninterpretability evaluations, which can ultimately be leveraged to directly\noptimize the mechanistic interpretability of models.\n","authors":["Roland S. Zimmermann","Thomas Klein","Wieland Brendel"],"pdf_url":"https://arxiv.org/pdf/2307.05471v2.pdf","comment":"Spotlight at NeurIPS 2023. The first two authors contributed equally.\n Code available at https://brendel-group.github.io/imi/"},{"id":"http://arxiv.org/abs/2403.13589v2","updated":"2024-03-30T15:53:31Z","published":"2024-03-20T13:37:29Z","title":"ReGround: Improving Textual and Spatial Grounding at No Cost","summary":" When an image generation process is guided by both a text prompt and spatial\ncues, such as a set of bounding boxes, do these elements work in harmony, or\ndoes one dominate the other? Our analysis of a pretrained image diffusion model\nthat integrates gated self-attention into the U-Net reveals that spatial\ngrounding often outweighs textual grounding due to the sequential flow from\ngated self-attention to cross-attention. We demonstrate that such bias can be\nsignificantly mitigated without sacrificing accuracy in either grounding by\nsimply rewiring the network architecture, changing from sequential to parallel\nfor gated self-attention and cross-attention. This surprisingly simple yet\neffective solution does not require any fine-tuning of the network but\nsignificantly reduces the trade-off between the two groundings. Our experiments\ndemonstrate significant improvements from the original GLIGEN to the rewired\nversion in the trade-off between textual grounding and spatial grounding.\n","authors":["Yuseung Lee","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2403.13589v2.pdf","comment":"Project page: https://re-ground.github.io/"},{"id":"http://arxiv.org/abs/2401.17759v3","updated":"2024-03-30T15:46:35Z","published":"2024-01-31T11:36:12Z","title":"Rapid post-disaster infrastructure damage characterisation enabled by\n remote sensing and deep learning technologies -- a tiered approach","summary":" Critical infrastructure are systematically targeted during wars and extensive\nnatural disasters because critical infrastructure is vital for enabling\nconnectivity and transportation of people and goods, and hence, underpins\nnational and international economic growth. Mass destruction of transport\nassets, in conjunction with minimal or no accessibility in the wake of natural\nand anthropogenic disasters, prevents us from delivering rapid recovery and\nadaptation. A solution to this challenge is to use technology that enables\nstand-off observations. Nevertheless, no methods exist for the integrated\ncharacterisation of damage at multiple scales, i.e. regional, asset, and\nstructural scales, while there is no systematic correlation between\ninfrastructure damage assessments across these scales. We propose a methodology\nbased on an integrated multi-scale tiered approach to fill this capability gap.\nIn doing so, we demonstrate how damage characterisation can be enabled by\nfit-for-purpose digital technologies. Next, the methodology is applied and\nvalidated to a case study in Ukraine that includes 17 bridges all damages by\nhuman targeted interventions. 
From macro to micro, we deploy technology to\nintegrate assessments at scale, using from Sentinel-1 SAR images, crowdsourced\ninformation, and high-resolution images to deep learning to characterise\ninfrastructure damage. For the first time, the interferometric coherence\ndifference and semantic segmentation of images were deployed to improve the\nreliability of damage characterisations at different scales, i.e. regional,\ninfrastructure asset and component, with the aim of enhancing the damage\ncharacterisation accuracy. This integrated approach accelerates\ndecision-making, and therefore, facilitates more efficient restoration and\nadaptation efforts, ultimately fostering resilience into our infrastructure.\n","authors":["Nadiia Kopiika","Andreas Karavias","Pavlos Krassakis","Zehao Ye","Jelena Ninic","Nataliya Shakhovska","Nikolaos Koukouzas","Sotirios Argyroudis","Stergios-Aristoteles Mitoulis"],"pdf_url":"https://arxiv.org/pdf/2401.17759v3.pdf","comment":"Main text (33 pages,15 figures); Supplementary materials (19 pages)"},{"id":"http://arxiv.org/abs/2312.09238v2","updated":"2024-03-30T15:35:16Z","published":"2023-12-14T18:58:12Z","title":"Auto MC-Reward: Automated Dense Reward Design with Large Language Models\n for Minecraft","summary":" Many reinforcement learning environments (e.g., Minecraft) provide only\nsparse rewards that indicate task completion or failure with binary values. The\nchallenge in exploration efficiency in such environments makes it difficult for\nreinforcement-learning-based agents to learn complex tasks. To address this,\nthis paper introduces an advanced learning system, named Auto MC-Reward, that\nleverages Large Language Models (LLMs) to automatically design dense reward\nfunctions, thereby enhancing the learning efficiency. Auto MC-Reward consists\nof three important components: Reward Designer, Reward Critic, and Trajectory\nAnalyzer. Given the environment information and task descriptions, the Reward\nDesigner first design the reward function by coding an executable Python\nfunction with predefined observation inputs. Then, our Reward Critic will be\nresponsible for verifying the code, checking whether the code is\nself-consistent and free of syntax and semantic errors. Further, the Trajectory\nAnalyzer summarizes possible failure causes and provides refinement suggestions\naccording to collected trajectories. In the next round, Reward Designer will\nfurther refine and iterate the dense reward function based on feedback.\nExperiments demonstrate a significant improvement in the success rate and\nlearning efficiency of our agents in complex tasks in Minecraft, such as\nobtaining diamond with the efficient ability to avoid lava, and efficiently\nexplore trees and animals that are sparse in the plains biome.\n","authors":["Hao Li","Xue Yang","Zhaokai Wang","Xizhou Zhu","Jie Zhou","Yu Qiao","Xiaogang Wang","Hongsheng Li","Lewei Lu","Jifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2312.09238v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2402.16774v2","updated":"2024-03-30T14:38:44Z","published":"2024-02-26T17:45:00Z","title":"Video-Based Autism Detection with Deep Learning","summary":" Individuals with Autism Spectrum Disorder (ASD) often experience challenges\nin health, communication, and sensory processing; therefore, early diagnosis is\nnecessary for proper treatment and care. In this work, we consider the problem\nof detecting or classifying ASD children to aid medical professionals in early\ndiagnosis. 
We develop a deep learning model that analyzes video clips of\nchildren reacting to sensory stimuli, with the intent of capturing key\ndifferences in reactions and behavior between ASD and non-ASD participants.\nUnlike many recent studies in ASD classification with MRI data, which require\nexpensive specialized equipment, our method utilizes a powerful but relatively\naffordable GPU, a standard computer setup, and a video camera for inference.\nResults show that our model effectively generalizes and understands key\ndifferences in the distinct movements of the children. It is noteworthy that\nour model exhibits successful classification performance despite the limited\namount of data for a deep learning problem and limited temporal information\navailable for learning, even with the motion artifacts.\n","authors":["M. Serna-Aguilera","X. B. Nguyen","A. Singh","L. Rockers","S. Park","L. Neely","H. Seo","K. Luu"],"pdf_url":"https://arxiv.org/pdf/2402.16774v2.pdf","comment":"Poster Abstract. Accepted into 2024 IEEE Green Technologies\n Conference"},{"id":"http://arxiv.org/abs/2311.15855v2","updated":"2024-03-30T14:21:40Z","published":"2023-11-27T14:22:07Z","title":"SiTH: Single-view Textured Human Reconstruction with Image-Conditioned\n Diffusion","summary":" A long-standing goal of 3D human reconstruction is to create lifelike and\nfully detailed 3D humans from single-view images. The main challenge lies in\ninferring unknown body shapes, appearances, and clothing details in areas not\nvisible in the images. To address this, we propose SiTH, a novel pipeline that\nuniquely integrates an image-conditioned diffusion model into a 3D mesh\nreconstruction workflow. At the core of our method lies the decomposition of\nthe challenging single-view reconstruction problem into generative\nhallucination and reconstruction subproblems. For the former, we employ a\npowerful generative diffusion model to hallucinate unseen back-view appearance\nbased on the input images. For the latter, we leverage skinned body meshes as\nguidance to recover full-body texture meshes from the input and back-view\nimages. SiTH requires as few as 500 3D human scans for training while\nmaintaining its generality and robustness to diverse images. Extensive\nevaluations on two 3D human benchmarks, including our newly created one,\nhighlighted our method's superior accuracy and perceptual quality in 3D\ntextured human reconstruction. Our code and evaluation benchmark are available\nat https://ait.ethz.ch/sith\n","authors":["Hsuan-I Ho","Jie Song","Otmar Hilliges"],"pdf_url":"https://arxiv.org/pdf/2311.15855v2.pdf","comment":"23 pages, 23 figures, CVPR 2024"},{"id":"http://arxiv.org/abs/2312.03029v2","updated":"2024-03-30T14:19:10Z","published":"2023-12-05T11:01:44Z","title":"Gaussian Head Avatar: Ultra High-fidelity Head Avatar via Dynamic\n Gaussians","summary":" Creating high-fidelity 3D head avatars has always been a research hotspot,\nbut there remains a great challenge under lightweight sparse view setups. In\nthis paper, we propose Gaussian Head Avatar represented by controllable 3D\nGaussians for high-fidelity head avatar modeling. We optimize the neutral 3D\nGaussians and a fully learned MLP-based deformation field to capture complex\nexpressions. The two parts benefit each other, thereby our method can model\nfine-grained dynamic details while ensuring expression accuracy. 
Furthermore,\nwe devise a well-designed geometry-guided initialization strategy based on\nimplicit SDF and Deep Marching Tetrahedra for the stability and convergence of\nthe training procedure. Experiments show our approach outperforms other\nstate-of-the-art sparse-view methods, achieving ultra high-fidelity rendering\nquality at 2K resolution even under exaggerated expressions.\n","authors":["Yuelang Xu","Benwang Chen","Zhe Li","Hongwen Zhang","Lizhen Wang","Zerong Zheng","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2312.03029v2.pdf","comment":"Projectpage: https://yuelangx.github.io/gaussianheadavatar, Code:\n https://github.com/YuelangX/Gaussian-Head-Avatar"},{"id":"http://arxiv.org/abs/2312.09313v3","updated":"2024-03-30T14:01:27Z","published":"2023-12-14T19:38:06Z","title":"LatentEditor: Text Driven Local Editing of 3D Scenes","summary":" While neural fields have made significant strides in view synthesis and scene\nreconstruction, editing them poses a formidable challenge due to their implicit\nencoding of geometry and texture information from multi-view inputs. In this\npaper, we introduce \\textsc{LatentEditor}, an innovative framework designed to\nempower users with the ability to perform precise and locally controlled\nediting of neural fields using text prompts. Leveraging denoising diffusion\nmodels, we successfully embed real-world scenes into the latent space,\nresulting in a faster and more adaptable NeRF backbone for editing compared to\ntraditional methods. To enhance editing precision, we introduce a delta score\nto calculate the 2D mask in the latent space that serves as a guide for local\nmodifications while preserving irrelevant regions. Our novel pixel-level\nscoring approach harnesses the power of InstructPix2Pix (IP2P) to discern the\ndisparity between IP2P conditional and unconditional noise predictions in the\nlatent space. The edited latents conditioned on the 2D masks are then\niteratively updated in the training set to achieve 3D local editing. Our\napproach achieves faster editing speeds and superior output quality compared to\nexisting 3D editing models, bridging the gap between textual instructions and\nhigh-quality 3D scene editing in latent space. We show the superiority of our\napproach on four benchmark 3D datasets, LLFF, IN2N, NeRFStudio and NeRF-Art.\n","authors":["Umar Khalid","Hasan Iqbal","Nazmul Karim","Jing Hua","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2312.09313v3.pdf","comment":"Project Page: https://latenteditor.github.io/"},{"id":"http://arxiv.org/abs/2403.19002v2","updated":"2024-03-30T14:00:27Z","published":"2024-03-27T20:52:30Z","title":"Robust Active Speaker Detection in Noisy Environments","summary":" This paper addresses the issue of active speaker detection (ASD) in noisy\nenvironments and formulates a robust active speaker detection (rASD) problem.\nExisting ASD approaches leverage both audio and visual modalities, but\nnon-speech sounds in the surrounding environment can negatively impact\nperformance. To overcome this, we propose a novel framework that utilizes\naudio-visual speech separation as guidance to learn noise-free audio features.\nThese features are then utilized in an ASD model, and both tasks are jointly\noptimized in an end-to-end framework. Our proposed framework mitigates residual\nnoise and audio quality reduction issues that can occur in a naive cascaded\ntwo-stage framework that directly uses separated speech for ASD, and enables\nthe two tasks to be optimized simultaneously. 
To further enhance the robustness\nof the audio features and handle inherent speech noises, we propose a dynamic\nweighted loss approach to train the speech separator. We also collected a\nreal-world noise audio dataset to facilitate investigations. Experiments\ndemonstrate that non-speech audio noises significantly impact ASD models, and\nour proposed approach improves ASD performance in noisy environments. The\nframework is general and can be applied to different ASD approaches to improve\ntheir robustness. Our code, models, and data will be released.\n","authors":["Siva Sai Nagender Vasireddy","Chenxu Zhang","Xiaohu Guo","Yapeng Tian"],"pdf_url":"https://arxiv.org/pdf/2403.19002v2.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2205.14375v5","updated":"2024-03-30T13:49:58Z","published":"2022-05-28T09:08:50Z","title":"WaveMix: A Resource-efficient Neural Network for Image Analysis","summary":" We propose a novel neural architecture for computer vision -- WaveMix -- that\nis resource-efficient and yet generalizable and scalable. While using fewer\ntrainable parameters, GPU RAM, and computations, WaveMix networks achieve\ncomparable or better accuracy than the state-of-the-art convolutional neural\nnetworks, vision transformers, and token mixers for several tasks. This\nefficiency can translate to savings in time, cost, and energy. To achieve these\ngains we used multi-level two-dimensional discrete wavelet transform (2D-DWT)\nin WaveMix blocks, which has the following advantages: (1) It reorganizes\nspatial information based on three strong image priors -- scale-invariance,\nshift-invariance, and sparseness of edges -- (2) in a lossless manner without\nadding parameters, (3) while also reducing the spatial sizes of feature maps,\nwhich reduces the memory and time required for forward and backward passes, and\n(4) expanding the receptive field faster than convolutions do. The whole\narchitecture is a stack of self-similar and resolution-preserving WaveMix\nblocks, which allows architectural flexibility for various tasks and levels of\nresource availability. WaveMix establishes new benchmarks for segmentation on\nCityscapes; and for classification on Galaxy 10 DECals, Places-365, five EMNIST\ndatasets, and iNAT-mini and performs competitively on other benchmarks. Our\ncode and trained models are publicly available.\n","authors":["Pranav Jeevan","Kavitha Viswanathan","Anandu A S","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2205.14375v5.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.18035v2","updated":"2024-03-30T13:28:54Z","published":"2024-03-26T18:40:36Z","title":"Bidirectional Consistency Models","summary":" Diffusion models (DMs) are capable of generating remarkably high-quality\nsamples by iteratively denoising a random vector, a process that corresponds to\nmoving along the probability flow ordinary differential equation (PF ODE).\nInterestingly, DMs can also invert an input image to noise by moving backward\nalong the PF ODE, a key operation for downstream tasks such as interpolation\nand image editing. However, the iterative nature of this process restricts its\nspeed, hindering its broader application. Recently, Consistency Models (CMs)\nhave emerged to address this challenge by approximating the integral of the PF\nODE, largely reducing the number of iterations. Yet, the absence of an explicit\nODE solver complicates the inversion process. 
To resolve this, we introduce the\nBidirectional Consistency Model (BCM), which learns a single neural network\nthat enables both forward and backward traversal along the PF ODE, efficiently\nunifying generation and inversion tasks within one framework. Notably, our\nproposed method enables one-step generation and inversion while also allowing\nthe use of additional steps to enhance generation quality or reduce\nreconstruction error. Furthermore, by leveraging our model's bidirectional\nconsistency, we introduce a sampling strategy that can enhance FID while\npreserving the generated image content. We further showcase our model's\ncapabilities in several downstream tasks, such as interpolation and inpainting,\nand present demonstrations of potential applications, including blind\nrestoration of compressed images and defending black-box adversarial attacks.\n","authors":["Liangchen Li","Jiajun He"],"pdf_url":"https://arxiv.org/pdf/2403.18035v2.pdf","comment":"40 pages, 25 figures"},{"id":"http://arxiv.org/abs/2403.15952v2","updated":"2024-03-30T13:21:42Z","published":"2024-03-23T23:06:32Z","title":"IllusionVQA: A Challenging Optical Illusion Dataset for Vision Language\n Models","summary":" The advent of Vision Language Models (VLM) has allowed researchers to\ninvestigate the visual understanding of a neural network using natural\nlanguage. Beyond object classification and detection, VLMs are capable of\nvisual comprehension and common-sense reasoning. This naturally led to the\nquestion: How do VLMs respond when the image itself is inherently unreasonable?\nTo this end, we present IllusionVQA: a diverse dataset of challenging optical\nillusions and hard-to-interpret scenes to test the capability of VLMs in two\ndistinct multiple-choice VQA tasks - comprehension and soft localization.\nGPT4V, the best-performing VLM, achieves 62.99% accuracy (4-shot) on the\ncomprehension task and 49.7% on the localization task (4-shot and\nChain-of-Thought). Human evaluation reveals that humans achieve 91.03% and 100%\naccuracy in comprehension and localization. We discover that In-Context\nLearning (ICL) and Chain-of-Thought reasoning substantially degrade the\nperformance of GeminiPro on the localization task. Tangentially, we discover a\npotential weakness in the ICL capabilities of VLMs: they fail to locate optical\nillusions even when the correct answer is in the context window as a few-shot\nexample.\n","authors":["Haz Sameen Shahgir","Khondker Salman Sayeed","Abhik Bhattacharjee","Wasi Uddin Ahmad","Yue Dong","Rifat Shahriyar"],"pdf_url":"https://arxiv.org/pdf/2403.15952v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00081v2","updated":"2024-03-30T12:45:08Z","published":"2023-11-30T03:20:37Z","title":"Synthesize, Diagnose, and Optimize: Towards Fine-Grained Vision-Language\n Understanding","summary":" Vision language models (VLM) have demonstrated remarkable performance across\nvarious downstream tasks. However, understanding fine-grained visual-linguistic\nconcepts, such as attributes and inter-object relationships, remains a\nsignificant challenge. While several benchmarks aim to evaluate VLMs in finer\ngranularity, their primary focus remains on the linguistic aspect, neglecting\nthe visual dimension. Here, we highlight the importance of evaluating VLMs from\nboth a textual and visual perspective. We introduce a progressive pipeline to\nsynthesize images that vary in a specific attribute while ensuring consistency\nin all other aspects. 
Utilizing this data engine, we carefully design a\nbenchmark, SPEC, to diagnose the comprehension of object size, position,\nexistence, and count. Subsequently, we conduct a thorough evaluation of four\nleading VLMs on SPEC. Surprisingly, their performance is close to random guess,\nrevealing significant limitations. With this in mind, we propose a simple yet\neffective approach to optimize VLMs in fine-grained understanding, achieving\nsignificant improvements on SPEC without compromising the zero-shot\nperformance. Results on two additional fine-grained benchmarks also show\nconsistent improvements, further validating the transferability of our\napproach. Code and data are available at https://github.com/wjpoom/SPEC.\n","authors":["Wujian Peng","Sicheng Xie","Zuyao You","Shiyi Lan","Zuxuan Wu"],"pdf_url":"https://arxiv.org/pdf/2312.00081v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2312.14024v2","updated":"2024-03-30T12:39:06Z","published":"2023-12-21T16:54:09Z","title":"NICP: Neural ICP for 3D Human Registration at Scale","summary":" Aligning a template to 3D human point clouds is a long-standing problem\ncrucial for tasks like animation, reconstruction, and enabling supervised\nlearning pipelines. Recent data-driven methods leverage predicted surface\ncorrespondences; however, they are not robust to varied poses, identities, or\nnoise. In contrast, industrial solutions often rely on expensive manual\nannotations or multi-view capturing systems. Recently, neural fields have shown\npromising results. Still, their purely data-driven and extrinsic nature does\nnot incorporate any guidance toward the target surface, often resulting in a\ntrivial misalignment of the template registration. Currently, no method can be\nconsidered the standard for 3D Human registration, limiting the scalability of\ndownstream applications. In this work, we propose NSR, a pipeline that, for the\nfirst time, generalizes and scales across thousands of shapes and more than ten\ndifferent data sources. Our essential contribution is NICP, an ICP-style\nself-supervised task tailored to neural fields. NICP takes a few seconds, is\nself-supervised, and works out of the box on pre-trained neural fields. We\ncombine it with a localized Neural Field trained on a large MoCap dataset. NSR\nachieves the state of the art over public benchmarks, and the release of its\ncode and checkpoints will provide the community with a powerful tool useful for\nmany downstream tasks like dataset alignments, cleaning, or asset animation.\n","authors":["Riccardo Marin","Enric Corona","Gerard Pons-Moll"],"pdf_url":"https://arxiv.org/pdf/2312.14024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16739v2","updated":"2024-03-30T12:08:08Z","published":"2023-11-28T12:35:13Z","title":"As-Plausible-As-Possible: Plausibility-Aware Mesh Deformation Using 2D\n Diffusion Priors","summary":" We present As-Plausible-as-Possible (APAP) mesh deformation technique that\nleverages 2D diffusion priors to preserve the plausibility of a mesh under\nuser-controlled deformation. Our framework uses per-face Jacobians to represent\nmesh deformations, where mesh vertex coordinates are computed via a\ndifferentiable Poisson Solve. The deformed mesh is rendered, and the resulting\n2D image is used in the Score Distillation Sampling (SDS) process, which\nenables extracting meaningful plausibility priors from a pretrained 2D\ndiffusion model. 
To better preserve the identity of the edited mesh, we\nfine-tune our 2D diffusion model with LoRA. Gradients extracted by SDS and a\nuser-prescribed handle displacement are then backpropagated to the per-face\nJacobians, and we use iterative gradient descent to compute the final\ndeformation that balances between the user edit and the output plausibility. We\nevaluate our method with 2D and 3D meshes and demonstrate qualitative and\nquantitative improvements when using plausibility priors over\ngeometry-preservation or distortion-minimization priors used by previous\ntechniques. Our project page is at: https://as-plausible-aspossible.github.io/\n","authors":["Seungwoo Yoo","Kunho Kim","Vladimir G. Kim","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2311.16739v2.pdf","comment":"Project page: https://as-plausible-as-possible.github.io/"},{"id":"http://arxiv.org/abs/2401.04071v2","updated":"2024-03-30T12:05:52Z","published":"2024-01-08T18:18:02Z","title":"Fun with Flags: Robust Principal Directions via Flag Manifolds","summary":" Principal component analysis (PCA), along with its extensions to manifolds\nand outlier contaminated data, have been indispensable in computer vision and\nmachine learning. In this work, we present a unifying formalism for PCA and its\nvariants, and introduce a framework based on the flags of linear subspaces, ie\na hierarchy of nested linear subspaces of increasing dimension, which not only\nallows for a common implementation but also yields novel variants, not explored\npreviously. We begin by generalizing traditional PCA methods that either\nmaximize variance or minimize reconstruction error. We expand these\ninterpretations to develop a wide array of new dimensionality reduction\nalgorithms by accounting for outliers and the data manifold. To devise a common\ncomputational approach, we recast robust and dual forms of PCA as optimization\nproblems on flag manifolds. We then integrate tangent space approximations of\nprincipal geodesic analysis (tangent-PCA) into this flag-based framework,\ncreating novel robust and dual geodesic PCA variations. The remarkable\nflexibility offered by the 'flagification' introduced here enables even more\nalgorithmic variants identified by specific flag types. Last but not least, we\npropose an effective convergent solver for these flag-formulations employing\nthe Stiefel manifold. Our empirical results on both real-world and synthetic\nscenarios, demonstrate the superiority of our novel algorithms, especially in\nterms of robustness to outliers on manifolds.\n","authors":["Nathan Mankovich","Gustau Camps-Valls","Tolga Birdal"],"pdf_url":"https://arxiv.org/pdf/2401.04071v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12463v2","updated":"2024-03-30T11:35:52Z","published":"2023-12-18T19:02:07Z","title":"Open Vocabulary Semantic Scene Sketch Understanding","summary":" We study the underexplored but fundamental vision problem of machine\nunderstanding of abstract freehand scene sketches. We introduce a sketch\nencoder that results in semantically-aware feature space, which we evaluate by\ntesting its performance on a semantic sketch segmentation task. To train our\nmodel we rely only on the availability of bitmap sketches with their brief\ncaptions and do not require any pixel-level annotations. To obtain\ngeneralization to a large set of sketches and categories, we build on a vision\ntransformer encoder pretrained with the CLIP model. 
We freeze the text encoder\nand perform visual-prompt tuning of the visual encoder branch while introducing\na set of critical modifications. Firstly, we augment the classical key-query\n(k-q) self-attention blocks with value-value (v-v) self-attention blocks.\nCentral to our model is a two-level hierarchical network design that enables\nefficient semantic disentanglement: The first level ensures holistic scene\nsketch encoding, and the second level focuses on individual categories. We,\nthen, in the second level of the hierarchy, introduce a cross-attention between\ntextual and visual branches. Our method outperforms zero-shot CLIP pixel\naccuracy of segmentation results by 37 points, reaching an accuracy of $85.5\\%$\non the FS-COCO sketch dataset. Finally, we conduct a user study that allows us\nto identify further improvements needed over our method to reconcile machine\nand human understanding of scene sketches.\n","authors":["Ahmed Bourouis","Judith Ellen Fan","Yulia Gryaditskaya"],"pdf_url":"https://arxiv.org/pdf/2312.12463v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08835v3","updated":"2024-03-30T11:01:17Z","published":"2023-11-15T10:22:35Z","title":"Correlation-guided Query-Dependency Calibration in Video Representation\n Learning for Temporal Grounding","summary":" Video Temporal Grounding is to identify specific moments or highlights from a\nvideo corresponding to textual descriptions. Typical approaches in temporal\ngrounding treat all video clips equally during the encoding process regardless\nof their semantic relevance with the text query. Therefore, we propose\nCorrelation-Guided DEtection TRansformer(CG-DETR), exploring to provide clues\nfor query-associated video clips within the cross-modal attention. First, we\ndesign an adaptive cross-attention with dummy tokens. Dummy tokens conditioned\nby text query take portions of the attention weights, preventing irrelevant\nvideo clips from being represented by the text query. Yet, not all words\nequally inherit the text query's correlation to video clips. Thus, we further\nguide the cross-attention map by inferring the fine-grained correlation between\nvideo clips and words. We enable this by learning a joint embedding space for\nhigh-level concepts, i.e., moment and sentence level, and inferring the\nclip-word correlation. Lastly, we exploit the moment-specific characteristics\nand combine them with the context of each video to form a moment-adaptive\nsaliency detector. By exploiting the degrees of text engagement in each video\nclip, it precisely measures the highlightness of each clip. CG-DETR achieves\nstate-of-the-art results on various benchmarks for temporal grounding.\n","authors":["WonJun Moon","Sangeek Hyun","SuBeen Lee","Jae-Pil Heo"],"pdf_url":"https://arxiv.org/pdf/2311.08835v3.pdf","comment":"34 pages, 16 figures, 13 tables, Code is available at\n https://github.com/wjun0830/CGDETR"},{"id":"http://arxiv.org/abs/2312.02244v2","updated":"2024-03-30T10:49:41Z","published":"2023-12-04T12:30:07Z","title":"Geometrically-driven Aggregation for Zero-shot 3D Point Cloud\n Understanding","summary":" Zero-shot 3D point cloud understanding can be achieved via 2D Vision-Language\nModels (VLMs). Existing strategies directly map Vision-Language Models from 2D\npixels of rendered or captured views to 3D points, overlooking the inherent and\nexpressible point cloud geometric structure. 
Geometrically similar or close\nregions can be exploited for bolstering point cloud understanding as they are\nlikely to share semantic information. To this end, we introduce the first\ntraining-free aggregation technique that leverages the point cloud's 3D\ngeometric structure to improve the quality of the transferred Vision-Language\nModels. Our approach operates iteratively, performing local-to-global\naggregation based on geometric and semantic point-level reasoning. We benchmark\nour approach on three downstream tasks, including classification, part\nsegmentation, and semantic segmentation, with a variety of datasets\nrepresenting both synthetic/real-world, and indoor/outdoor scenarios. Our\napproach achieves new state-of-the-art results in all benchmarks. We will\nrelease the source code publicly.\n","authors":["Guofeng Mei","Luigi Riz","Yiming Wang","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2312.02244v2.pdf","comment":"Zero-shot, point cloud, 2D Vision-Language Models, geometric\n structure, training-free"},{"id":"http://arxiv.org/abs/2312.01307v2","updated":"2024-03-30T10:46:34Z","published":"2023-12-03T07:22:42Z","title":"SAGE: Bridging Semantic and Actionable Parts for GEneralizable\n Manipulation of Articulated Objects","summary":" To interact with daily-life articulated objects of diverse structures and\nfunctionalities, understanding the object parts plays a central role in both\nuser instruction comprehension and task execution. However, the possible\ndiscordance between the semantic meaning and physics functionalities of the\nparts poses a challenge for designing a general system. To address this\nproblem, we propose SAGE, a novel framework that bridges semantic and\nactionable parts of articulated objects to achieve generalizable manipulation\nunder natural language instructions. More concretely, given an articulated\nobject, we first observe all the semantic parts on it, conditioned on which an\ninstruction interpreter proposes possible action programs that concretize the\nnatural language instruction. Then, a part-grounding module maps the semantic\nparts into so-called Generalizable Actionable Parts (GAParts), which inherently\ncarry information about part motion. End-effector trajectories are predicted on\nthe GAParts, which, together with the action program, form an executable\npolicy. Additionally, an interactive feedback module is incorporated to respond\nto failures, which closes the loop and increases the robustness of the overall\nframework. Key to the success of our framework is the joint proposal and\nknowledge fusion between a large vision-language model (VLM) and a small\ndomain-specific model for both context comprehension and part perception, with\nthe former providing general intuitions and the latter serving as expert facts.\nBoth simulation and real-robot experiments show our effectiveness in handling a\nlarge variety of articulated objects with diverse language-instructed goals.\n","authors":["Haoran Geng","Songlin Wei","Congyue Deng","Bokui Shen","He Wang","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2312.01307v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13347v2","updated":"2024-03-30T09:45:38Z","published":"2024-03-20T07:15:22Z","title":"vid-TLDR: Training Free Token merging for Light-weight Video Transformer","summary":" Video Transformers have become the prevalent solution for various video\ndownstream tasks with superior expressive power and flexibility. 
However, these\nvideo transformers suffer from heavy computational costs induced by the massive\nnumber of tokens across the entire video frames, which has been the major\nbarrier to training the model. Further, the patches irrelevant to the main\ncontents, e.g., backgrounds, degrade the generalization performance of models.\nTo tackle these issues, we propose training free token merging for lightweight\nvideo Transformer (vid-TLDR) that aims to enhance the efficiency of video\nTransformers by merging the background tokens without additional training. For\nvid-TLDR, we introduce a novel approach to capture the salient regions in\nvideos only with the attention map. Further, we introduce the saliency-aware\ntoken merging strategy by dropping the background tokens and sharpening the\nobject scores. Our experiments show that vid-TLDR significantly mitigates the\ncomputational complexity of video Transformers while achieving competitive\nperformance compared to the base model without vid-TLDR. Code is available at\nhttps://github.com/mlvlab/vid-TLDR.\n","authors":["Joonmyung Choi","Sanghyeok Lee","Jaewon Chu","Minhyuk Choi","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2403.13347v2.pdf","comment":"Conference on Computer Vision and Pattern Recognition (CVPR), 2024"},{"id":"http://arxiv.org/abs/2402.17726v3","updated":"2024-03-30T09:35:47Z","published":"2024-02-27T17:58:09Z","title":"VRP-SAM: SAM with Visual Reference Prompt","summary":" In this paper, we propose a novel Visual Reference Prompt (VRP) encoder that\nempowers the Segment Anything Model (SAM) to utilize annotated reference images\nas prompts for segmentation, creating the VRP-SAM model. In essence, VRP-SAM\ncan utilize annotated reference images to comprehend specific objects and\nperform segmentation of specific objects in target image. It is note that the\nVRP encoder can support a variety of annotation formats for reference images,\nincluding \\textbf{point}, \\textbf{box}, \\textbf{scribble}, and \\textbf{mask}.\nVRP-SAM achieves a breakthrough within the SAM framework by extending its\nversatility and applicability while preserving SAM's inherent strengths, thus\nenhancing user-friendliness. To enhance the generalization ability of VRP-SAM,\nthe VRP encoder adopts a meta-learning strategy. To validate the effectiveness\nof VRP-SAM, we conducted extensive empirical studies on the Pascal and COCO\ndatasets. Remarkably, VRP-SAM achieved state-of-the-art performance in visual\nreference segmentation with minimal learnable parameters. Furthermore, VRP-SAM\ndemonstrates strong generalization capabilities, allowing it to perform\nsegmentation of unseen objects and enabling cross-domain segmentation. The\nsource code and models will be available at\n\\url{https://github.com/syp2ysy/VRP-SAM}\n","authors":["Yanpeng Sun","Jiahui Chen","Shan Zhang","Xinyu Zhang","Qiang Chen","Gang Zhang","Errui Ding","Jingdong Wang","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2402.17726v3.pdf","comment":"Accepted by CVPR 2024; The camera-ready version"},{"id":"http://arxiv.org/abs/2011.14598v4","updated":"2024-03-30T09:24:13Z","published":"2020-11-30T07:44:52Z","title":"Video Self-Stitching Graph Network for Temporal Action Localization","summary":" Temporal action localization (TAL) in videos is a challenging task,\nespecially due to the large variation in action temporal scales. Short actions\nusually occupy a major proportion in the datasets, but tend to have the lowest\nperformance. 
In this paper, we confront the challenge of short actions and\npropose a multi-level cross-scale solution dubbed as video self-stitching graph\nnetwork (VSGN). We have two key components in VSGN: video self-stitching (VSS)\nand cross-scale graph pyramid network (xGPN). In VSS, we focus on a short\nperiod of a video and magnify it along the temporal dimension to obtain a\nlarger scale. We stitch the original clip and its magnified counterpart in one\ninput sequence to take advantage of the complementary properties of both\nscales. The xGPN component further exploits the cross-scale correlations by a\npyramid of cross-scale graph networks, each containing a hybrid module to\naggregate features from across scales as well as within the same scale. Our\nVSGN not only enhances the feature representations, but also generates more\npositive anchors for short actions and more short training samples. Experiments\ndemonstrate that VSGN obviously improves the localization performance of short\nactions as well as achieving the state-of-the-art overall performance on\nTHUMOS-14 and ActivityNet-v1.3.\n","authors":["Chen Zhao","Ali Thabet","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2011.14598v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07710v2","updated":"2024-03-30T09:20:36Z","published":"2024-02-12T15:23:19Z","title":"Optimizing Sparse Convolution on GPUs with CUDA for 3D Point Cloud\n Processing in Embedded Systems","summary":" In recent years, there has been a significant increase in the utilization of\ndeep learning methods, particularly convolutional neural networks (CNNs), which\nhave emerged as the dominant approach in various domains that involve\nstructured grid data, such as picture analysis and processing. Nevertheless,\nthe exponential growth in the utilization of LiDAR and 3D sensors across many\ndomains has resulted in an increased need for the analysis of 3D point clouds.\nThe utilization of 3D point clouds is crucial in various applications,\nincluding object recognition and segmentation, as they offer a spatial\ndepiction of things within a three-dimensional environment. In contrast to\nphotos, point clouds exhibit sparsity and lack a regular grid, hence posing\ndistinct processing and computational issues.\n","authors":["Chester Luo","Kevin Lai"],"pdf_url":"https://arxiv.org/pdf/2402.07710v2.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2205.10490v2","updated":"2024-03-30T08:52:40Z","published":"2022-05-21T02:38:16Z","title":"Aligning Logits Generatively for Principled Black-Box Knowledge\n Distillation","summary":" Black-Box Knowledge Distillation (B2KD) is a formulated problem for\ncloud-to-edge model compression with invisible data and models hosted on the\nserver. B2KD faces challenges such as limited Internet exchange and edge-cloud\ndisparity of data distributions. In this paper, we formalize a two-step\nworkflow consisting of deprivatization and distillation, and theoretically\nprovide a new optimization direction from logits to cell boundary different\nfrom direct logits alignment. With its guidance, we propose a new method\nMapping-Emulation KD (MEKD) that distills a black-box cumbersome model into a\nlightweight one. Our method does not differentiate between treating soft or\nhard responses, and consists of: 1) deprivatization: emulating the inverse\nmapping of the teacher function with a generator, and 2) distillation: aligning\nlow-dimensional logits of the teacher and student models by reducing the\ndistance of high-dimensional image points. 
For different teacher-student pairs,\nour method yields inspiring distillation performance on various benchmarks, and\noutperforms the previous state-of-the-art approaches.\n","authors":["Jing Ma","Xiang Xiang","Ke Wang","Yuchuan Wu","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2205.10490v2.pdf","comment":"To appear at CVPR 2024; significantly rewritten with extra\n experiments since the preliminary report"},{"id":"http://arxiv.org/abs/2311.17516v4","updated":"2024-03-30T08:35:17Z","published":"2023-11-29T10:39:53Z","title":"MMA-Diffusion: MultiModal Attack on Diffusion Models","summary":" In recent years, Text-to-Image (T2I) models have seen remarkable\nadvancements, gaining widespread adoption. However, this progress has\ninadvertently opened avenues for potential misuse, particularly in generating\ninappropriate or Not-Safe-For-Work (NSFW) content. Our work introduces\nMMA-Diffusion, a framework that presents a significant and realistic threat to\nthe security of T2I models by effectively circumventing current defensive\nmeasures in both open-source models and commercial online services. Unlike\nprevious approaches, MMA-Diffusion leverages both textual and visual modalities\nto bypass safeguards like prompt filters and post-hoc safety checkers, thus\nexposing and highlighting the vulnerabilities in existing defense mechanisms.\n","authors":["Yijun Yang","Ruiyuan Gao","Xiaosen Wang","Tsung-Yi Ho","Nan Xu","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2311.17516v4.pdf","comment":"CVPR 2024. Our codes and benchmarks are available at\n https://github.com/cure-lab/MMA-Diffusion"},{"id":"http://arxiv.org/abs/2401.04105v2","updated":"2024-03-30T08:06:01Z","published":"2024-01-08T18:59:31Z","title":"Dr$^2$Net: Dynamic Reversible Dual-Residual Networks for\n Memory-Efficient Finetuning","summary":" Large pretrained models are increasingly crucial in modern computer vision\ntasks. These models are typically used in downstream tasks by end-to-end\nfinetuning, which is highly memory-intensive for tasks with high-resolution\ndata, e.g., video understanding, small object detection, and point cloud\nanalysis. In this paper, we propose Dynamic Reversible Dual-Residual Networks,\nor Dr$^2$Net, a novel family of network architectures that acts as a surrogate\nnetwork to finetune a pretrained model with substantially reduced memory\nconsumption. Dr$^2$Net contains two types of residual connections, one\nmaintaining the residual structure in the pretrained models, and the other\nmaking the network reversible. Due to its reversibility, intermediate\nactivations, which can be reconstructed from output, are cleared from memory\nduring training. We use two coefficients on either type of residual connections\nrespectively, and introduce a dynamic training strategy that seamlessly\ntransitions the pretrained model to a reversible network with much higher\nnumerical precision. 
We evaluate Dr$^2$Net on various pretrained models and\nvarious tasks, and show that it can reach comparable performance to\nconventional finetuning but with significantly less memory usage.\n","authors":["Chen Zhao","Shuming Liu","Karttikeya Mangalam","Guocheng Qian","Fatimah Zohra","Abdulmohsen Alghannam","Jitendra Malik","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2401.04105v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08869v2","updated":"2024-03-30T07:23:20Z","published":"2023-12-10T08:25:41Z","title":"I'M HOI: Inertia-aware Monocular Capture of 3D Human-Object Interactions","summary":" We are living in a world surrounded by diverse and \"smart\" devices with rich\nmodalities of sensing ability. Conveniently capturing the interactions between\nus humans and these objects remains far-reaching. In this paper, we present\nI'm-HOI, a monocular scheme to faithfully capture the 3D motions of both the\nhuman and object in a novel setting: using a minimal amount of RGB camera and\nobject-mounted Inertial Measurement Unit (IMU). It combines general motion\ninference and category-aware refinement. For the former, we introduce a\nholistic human-object tracking method to fuse the IMU signals and the RGB\nstream and progressively recover the human motions and subsequently the\ncompanion object motions. For the latter, we tailor a category-aware motion\ndiffusion model, which is conditioned on both the raw IMU observations and the\nresults from the previous stage under over-parameterization representation. It\nsignificantly refines the initial results and generates vivid body, hand, and\nobject motions. Moreover, we contribute a large dataset with ground truth human\nand object motions, dense RGB inputs, and rich object-mounted IMU measurements.\nExtensive experiments demonstrate the effectiveness of I'm-HOI under a hybrid\ncapture setting. Our dataset and code will be released to the community.\n","authors":["Chengfeng Zhao","Juze Zhang","Jiashen Du","Ziwei Shan","Junye Wang","Jingyi Yu","Jingya Wang","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.08869v2.pdf","comment":"Accepted to CVPR 2024. Project page:\n https://afterjourney00.github.io/IM-HOI.github.io/"},{"id":"http://arxiv.org/abs/2312.05716v2","updated":"2024-03-30T07:04:15Z","published":"2023-12-10T00:51:05Z","title":"Initialization Matters for Adversarial Transfer Learning","summary":" With the prevalence of the Pretraining-Finetuning paradigm in transfer\nlearning, the robustness of downstream tasks has become a critical concern. In\nthis work, we delve into adversarial robustness in transfer learning and reveal\nthe critical role of initialization, including both the pretrained model and\nthe linear head. First, we discover the necessity of an adversarially robust\npretrained model. Specifically, we reveal that with a standard pretrained\nmodel, Parameter-Efficient Finetuning (PEFT) methods either fail to be\nadversarially robust or continue to exhibit significantly degraded adversarial\nrobustness on downstream tasks, even with adversarial training during\nfinetuning. Leveraging a robust pretrained model, surprisingly, we observe that\na simple linear probing can outperform full finetuning and other PEFT methods\nwith random initialization on certain datasets. We further identify that linear\nprobing excels in preserving robustness from the robust pretraining. 
Based on\nthis, we propose Robust Linear Initialization (RoLI) for adversarial\nfinetuning, which initializes the linear head with the weights obtained by\nadversarial linear probing to maximally inherit the robustness from\npretraining. Across five different image classification datasets, we\ndemonstrate the effectiveness of RoLI and achieve new state-of-the-art results.\nOur code is available at \\url{https://github.com/DongXzz/RoLI}.\n","authors":["Andong Hua","Jindong Gu","Zhiyu Xue","Nicholas Carlini","Eric Wong","Yao Qin"],"pdf_url":"https://arxiv.org/pdf/2312.05716v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2307.12872v2","updated":"2024-03-30T06:59:35Z","published":"2023-07-24T15:10:22Z","title":"Latent Code Augmentation Based on Stable Diffusion for Data-free\n Substitute Attacks","summary":" Since the training data of the target model is not available in the black-box\nsubstitute attack, most recent schemes utilize GANs to generate data for\ntraining the substitute model. However, these GANs-based schemes suffer from\nlow training efficiency as the generator needs to be retrained for each target\nmodel during the substitute training process, as well as low generation\nquality. To overcome these limitations, we consider utilizing the diffusion\nmodel to generate data, and propose a novel data-free substitute attack scheme\nbased on the Stable Diffusion (SD) to improve the efficiency and accuracy of\nsubstitute training. Despite the data generated by the SD exhibiting high\nquality, it presents a different distribution of domains and a large variation\nof positive and negative samples for the target model. For this problem, we\npropose Latent Code Augmentation (LCA) to facilitate SD in generating data that\naligns with the data distribution of the target model. Specifically, we augment\nthe latent codes of the inferred member data with LCA and use them as guidance\nfor SD. With the guidance of LCA, the data generated by the SD not only meets\nthe discriminative criteria of the target model but also exhibits high\ndiversity. By utilizing this data, it is possible to train the substitute model\nthat closely resembles the target model more efficiently. Extensive experiments\ndemonstrate that our LCA achieves higher attack success rates and requires\nfewer query budgets compared to GANs-based schemes for different target models.\nOur codes are available at \\url{https://github.com/LzhMeng/LCA}.\n","authors":["Mingwen Shao","Lingzhuang Meng","Yuanjian Qiao","Lixu Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2307.12872v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2309.01327v2","updated":"2024-03-30T06:50:28Z","published":"2023-09-04T03:06:04Z","title":"Can I Trust Your Answer? Visually Grounded Video Question Answering","summary":" We study visually grounded VideoQA in response to the emerging trends of\nutilizing pretraining techniques for video-language understanding.\nSpecifically, by forcing vision-language models (VLMs) to answer questions and\nsimultaneously provide visual evidence, we seek to ascertain the extent to\nwhich the predictions of such techniques are genuinely anchored in relevant\nvideo content, versus spurious correlations from language or irrelevant visual\ncontext. 
Towards this, we construct NExT-GQA -- an extension of NExT-QA with\n10.5$K$ temporal grounding (or location) labels tied to the original QA pairs.\nWith NExT-GQA, we scrutinize a series of state-of-the-art VLMs. Through\npost-hoc attention analysis, we find that these models are extremely weak in\nsubstantiating the answers despite their strong QA performance. This exposes\nthe limitation of current VLMs in making reliable predictions. As a remedy, we\nfurther explore and propose a grounded-QA method via Gaussian mask optimization\nand cross-modal learning. Experiments with different backbones demonstrate that\nthis grounding mechanism improves both grounding and QA. With these efforts, we\naim to push towards trustworthy VLMs in VQA systems. Our dataset and code are\navailable at https://github.com/doc-doc/NExT-GQA.\n","authors":["Junbin Xiao","Angela Yao","Yicong Li","Tat Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2309.01327v2.pdf","comment":"Accepted to CVPR'24. (Compared with preprint version, we mainly\n improve the presentation, discuss more related works, and extend experiments\n in Appendix.)"},{"id":"http://arxiv.org/abs/2312.02224v2","updated":"2024-03-30T06:50:25Z","published":"2023-12-03T22:05:05Z","title":"Tracing Hyperparameter Dependencies for Model Parsing via Learnable\n Graph Pooling Network","summary":" Model Parsing defines the research task of predicting hyperparameters of the\ngenerative model (GM), given a generated image as input. Since a diverse set of\nhyperparameters is jointly employed by the generative model, and dependencies\noften exist among them, it is crucial to learn these hyperparameter\ndependencies for the improved model parsing performance. To explore such\nimportant dependencies, we propose a novel model parsing method called\nLearnable Graph Pooling Network (LGPN). Specifically, we transform model\nparsing into a graph node classification task, using graph nodes and edges to\nrepresent hyperparameters and their dependencies, respectively. Furthermore,\nLGPN incorporates a learnable pooling-unpooling mechanism tailored to model\nparsing, which adaptively learns hyperparameter dependencies of GMs used to\ngenerate the input image. We also extend our proposed method to CNN-generated\nimage detection and coordinate attacks detection. Empirically, we achieve\nstate-of-the-art results in model parsing and its extended applications,\nshowing the effectiveness of our method. Our source code are available.\n","authors":["Xiao Guo","Vishal Asnani","Sijia Liu","Xiaoming Liu"],"pdf_url":"https://arxiv.org/pdf/2312.02224v2.pdf","comment":"24 pages, 15 figures, 17 tables"},{"id":"http://arxiv.org/abs/2403.15955v3","updated":"2024-03-30T06:42:02Z","published":"2024-03-23T23:22:54Z","title":"Finding needles in a haystack: A Black-Box Approach to Invisible\n Watermark Detection","summary":" In this paper, we propose WaterMark Detection (WMD), the first invisible\nwatermark detection method under a black-box and annotation-free setting. WMD\nis capable of detecting arbitrary watermarks within a given reference dataset\nusing a clean non-watermarked dataset as a reference, without relying on\nspecific decoding methods or prior knowledge of the watermarking techniques. We\ndevelop WMD using foundations of offset learning, where a clean non-watermarked\ndataset enables us to isolate the influence of only watermarked samples in the\nreference dataset. 
Our comprehensive evaluations demonstrate the effectiveness\nof WMD, significantly outperforming naive detection methods, which only yield\nAUC scores around 0.5. In contrast, WMD consistently achieves impressive\ndetection AUC scores, surpassing 0.9 in most single-watermark datasets and\nexceeding 0.7 in more challenging multi-watermark scenarios across diverse\ndatasets and watermarking methods. As invisible watermarks become increasingly\nprevalent, while specific decoding techniques remain undisclosed, our approach\nprovides a versatile solution and establishes a path toward increasing\naccountability, transparency, and trust in our digital visual content.\n","authors":["Minzhou Pan","Zhenting Wang","Xin Dong","Vikash Sehwag","Lingjuan Lyu","Xue Lin"],"pdf_url":"https://arxiv.org/pdf/2403.15955v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12790v2","updated":"2024-03-30T06:36:06Z","published":"2023-09-22T11:02:57Z","title":"NTO3D: Neural Target Object 3D Reconstruction with Segment Anything","summary":" Neural 3D reconstruction from multi-view images has recently attracted\nincreasing attention from the community. Existing methods normally learn a\nneural field for the whole scene, while it is still under-explored how to\nreconstruct a target object indicated by users. Considering the Segment\nAnything Model (SAM) has shown effectiveness in segmenting any 2D images, in\nthis paper, we propose NTO3D, a novel high-quality Neural Target Object 3D\n(NTO3D) reconstruction method, which leverages the benefits of both neural\nfield and SAM. We first propose a novel strategy to lift the multi-view 2D\nsegmentation masks of SAM into a unified 3D occupancy field. The 3D occupancy\nfield is then projected into 2D space and generates the new prompts for SAM.\nThis process is iterative until convergence to separate the target object from\nthe scene. After this, we then lift the 2D features of the SAM encoder into a\n3D feature field in order to improve the reconstruction quality of the target\nobject. NTO3D lifts the 2D masks and features of SAM into the 3D neural field\nfor high-quality neural target object 3D reconstruction. We conduct detailed\nexperiments on several benchmark datasets to demonstrate the advantages of our\nmethod. The code will be available at: https://github.com/ucwxb/NTO3D.\n","authors":["Xiaobao Wei","Renrui Zhang","Jiarui Wu","Jiaming Liu","Ming Lu","Yandong Guo","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.12790v2.pdf","comment":"accepted by CVPR24"},{"id":"http://arxiv.org/abs/2310.02279v3","updated":"2024-03-30T06:29:48Z","published":"2023-10-01T05:07:17Z","title":"Consistency Trajectory Models: Learning Probability Flow ODE Trajectory\n of Diffusion","summary":" Consistency Models (CM) (Song et al., 2023) accelerate score-based diffusion\nmodel sampling at the cost of sample quality but lack a natural way to\ntrade-off quality for speed. To address this limitation, we propose Consistency\nTrajectory Model (CTM), a generalization encompassing CM and score-based models\nas special cases. CTM trains a single neural network that can -- in a single\nforward pass -- output scores (i.e., gradients of log-density) and enables\nunrestricted traversal between any initial and final time along the Probability\nFlow Ordinary Differential Equation (ODE) in a diffusion process. 
CTM enables\nthe efficient combination of adversarial training and denoising score matching\nloss to enhance performance and achieves new state-of-the-art FIDs for\nsingle-step diffusion model sampling on CIFAR-10 (FID 1.73) and ImageNet at\n64x64 resolution (FID 1.92). CTM also enables a new family of sampling schemes,\nboth deterministic and stochastic, involving long jumps along the ODE solution\ntrajectories. It consistently improves sample quality as computational budgets\nincrease, avoiding the degradation seen in CM. Furthermore, unlike CM, CTM's\naccess to the score function can streamline the adoption of established\ncontrollable/conditional generation methods from the diffusion community. This\naccess also enables the computation of likelihood. The code is available at\nhttps://github.com/sony/ctm.\n","authors":["Dongjun Kim","Chieh-Hsin Lai","Wei-Hsiang Liao","Naoki Murata","Yuhta Takida","Toshimitsu Uesaka","Yutong He","Yuki Mitsufuji","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2310.02279v3.pdf","comment":"International Conference on Learning Representations"},{"id":"http://arxiv.org/abs/2403.19026v2","updated":"2024-03-30T06:15:36Z","published":"2024-03-27T21:43:12Z","title":"Egocentric Scene-aware Human Trajectory Prediction","summary":" Wearable collaborative robots stand to assist human wearers who need fall\nprevention assistance or wear exoskeletons. Such a robot needs to be able to\npredict the ego motion of the wearer based on egocentric vision and the\nsurrounding scene. In this work, we leveraged body-mounted cameras and sensors\nto anticipate the trajectory of human wearers through complex surroundings. To\nfacilitate research in ego-motion prediction, we have collected a comprehensive\nwalking scene navigation dataset centered on the user's perspective. We present\na method to predict human motion conditioning on the surrounding static scene.\nOur method leverages a diffusion model to produce a distribution of potential\nfuture trajectories, taking into account the user's observation of the\nenvironment. We introduce a compact representation to encode the user's visual\nmemory of the surroundings, as well as an efficient sample-generating technique\nto speed up real-time inference of a diffusion model. We ablate our model and\ncompare it to baselines, and results show that our model outperforms existing\nmethods on key metrics of collision avoidance and trajectory mode coverage.\n","authors":["Weizhuo Wang","C. Karen Liu","Monroe Kennedy III"],"pdf_url":"https://arxiv.org/pdf/2403.19026v2.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2402.18102v2","updated":"2024-03-30T06:06:38Z","published":"2024-02-28T06:45:47Z","title":"Passive Snapshot Coded Aperture Dual-Pixel RGB-D Imaging","summary":" Passive, compact, single-shot 3D sensing is useful in many application areas\nsuch as microscopy, medical imaging, surgical navigation, and autonomous\ndriving where form factor, time, and power constraints can exist. Obtaining\nRGB-D scene information over a short imaging distance, in an ultra-compact form\nfactor, and in a passive, snapshot manner is challenging. Dual-pixel (DP)\nsensors are a potential solution to achieve the same. DP sensors collect light\nrays from two different halves of the lens in two interleaved pixel arrays,\nthus capturing two slightly different views of the scene, like a stereo camera\nsystem. 
However, imaging with a DP sensor implies that the defocus blur size is\ndirectly proportional to the disparity seen between the views. This creates a\ntrade-off between disparity estimation vs. deblurring accuracy. To improve this\ntrade-off effect, we propose CADS (Coded Aperture Dual-Pixel Sensing), in which\nwe use a coded aperture in the imaging lens along with a DP sensor. In our\napproach, we jointly learn an optimal coded pattern and the reconstruction\nalgorithm in an end-to-end optimization setting. Our resulting CADS imaging\nsystem demonstrates improvement of >1.5dB PSNR in all-in-focus (AIF) estimates\nand 5-6% in depth estimation quality over naive DP sensing for a wide range of\naperture settings. Furthermore, we build the proposed CADS prototypes for DSLR\nphotography settings and in an endoscope and a dermoscope form factor. Our\nnovel coded dual-pixel sensing approach demonstrates accurate RGB-D\nreconstruction results in simulations and real-world experiments in a passive,\nsnapshot, and compact manner.\n","authors":["Bhargav Ghanekar","Salman Siddique Khan","Pranav Sharma","Shreyas Singh","Vivek Boominathan","Kaushik Mitra","Ashok Veeraraghavan"],"pdf_url":"https://arxiv.org/pdf/2402.18102v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08653v2","updated":"2024-03-30T06:05:40Z","published":"2023-12-14T04:47:20Z","title":"SKDF: A Simple Knowledge Distillation Framework for Distilling\n Open-Vocabulary Knowledge to Open-world Object Detector","summary":" In this paper, we attempt to specialize the VLM model for OWOD tasks by\ndistilling its open-world knowledge into a language-agnostic detector.\nSurprisingly, we observe that the combination of a simple \\textbf{knowledge\ndistillation} approach and the automatic pseudo-labeling mechanism in OWOD can\nachieve better performance for unknown object detection, even with a small\namount of data. Unfortunately, knowledge distillation for unknown objects\nseverely affects the learning of detectors with conventional structures for\nknown objects, leading to catastrophic forgetting. To alleviate these problems,\nwe propose the \\textbf{down-weight loss function} for knowledge distillation\nfrom vision-language to single vision modality. Meanwhile, we propose the\n\\textbf{cascade decouple decoding structure} that decouples the learning of\nlocalization and recognition to reduce the impact of category interactions of\nknown and unknown objects on the localization learning process. Ablation\nexperiments demonstrate that both of them are effective in mitigating the\nimpact of open-world knowledge distillation on the learning of known objects.\nAdditionally, to alleviate the current lack of comprehensive benchmarks for\nevaluating the ability of the open-world detector to detect unknown objects in\nthe open world, we propose two benchmarks, which we name\n\"\\textbf{StandardSet}$\\heartsuit$\" and \"\\textbf{IntensiveSet}$\\spadesuit$\"\nrespectively, based on the complexity of their testing scenarios. Comprehensive\nexperiments performed on OWOD, MS-COCO, and our proposed benchmarks demonstrate\nthe effectiveness of our methods. 
The code and proposed dataset are available\nat \\url{https://github.com/xiaomabufei/SKDF}.\n","authors":["Shuailei Ma","Yuefeng Wang","Ying Wei","Jiaqi Fan","Enming Zhang","Xinyu Sun","Peihao Chen"],"pdf_url":"https://arxiv.org/pdf/2312.08653v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.11623"},{"id":"http://arxiv.org/abs/2312.04551v2","updated":"2024-03-30T06:02:45Z","published":"2023-12-07T18:59:18Z","title":"Free3D: Consistent Novel View Synthesis without 3D Representation","summary":" We introduce Free3D, a simple accurate method for monocular open-set novel\nview synthesis (NVS). Similar to Zero-1-to-3, we start from a pre-trained 2D\nimage generator for generalization, and fine-tune it for NVS. Compared to other\nworks that took a similar approach, we obtain significant improvements without\nresorting to an explicit 3D representation, which is slow and memory-consuming,\nand without training an additional network for 3D reconstruction. Our key\ncontribution is to improve the way the target camera pose is encoded in the\nnetwork, which we do by introducing a new ray conditioning normalization (RCN)\nlayer. The latter injects pose information in the underlying 2D image generator\nby telling each pixel its viewing direction. We further improve multi-view\nconsistency by using light-weight multi-view attention layers and by sharing\ngeneration noise between the different views. We train Free3D on the Objaverse\ndataset and demonstrate excellent generalization to new categories in new\ndatasets, including OmniObject3D and GSO. The project page is available at\nhttps://chuanxiaz.com/free3d/.\n","authors":["Chuanxia Zheng","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2312.04551v2.pdf","comment":"webpage: https://chuanxiaz.com/free3d/, code:\n https://github.com/lyndonzheng/Free3D"},{"id":"http://arxiv.org/abs/2402.04476v2","updated":"2024-03-30T05:18:05Z","published":"2024-02-06T23:52:10Z","title":"Dual-View Visual Contextualization for Web Navigation","summary":" Automatic web navigation aims to build a web agent that can follow language\ninstructions to execute complex and diverse tasks on real-world websites.\nExisting work primarily takes HTML documents as input, which define the\ncontents and action spaces (i.e., actionable elements and operations) of\nwebpages. Nevertheless, HTML documents may not provide a clear task-related\ncontext for each element, making it hard to select the right (sequence of)\nactions. In this paper, we propose to contextualize HTML elements through their\n\"dual views\" in webpage screenshots: each HTML element has its corresponding\nbounding box and visual content in the screenshot. We build upon the insight --\nweb developers tend to arrange task-related elements nearby on webpages to\nenhance user experiences -- and propose to contextualize each element with its\nneighbor elements, using both textual and visual features. The resulting\nrepresentations of HTML elements are more informative for the agent to take\naction. We validate our method on the recently released Mind2Web dataset, which\nfeatures diverse navigation domains and tasks on real-world websites. 
Our\nmethod consistently outperforms the baseline in all the scenarios, including\ncross-task, cross-website, and cross-domain ones.\n","authors":["Jihyung Kil","Chan Hee Song","Boyuan Zheng","Xiang Deng","Yu Su","Wei-Lun Chao"],"pdf_url":"https://arxiv.org/pdf/2402.04476v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2309.04372v2","updated":"2024-03-30T05:05:52Z","published":"2023-09-08T15:06:05Z","title":"MoEController: Instruction-based Arbitrary Image Manipulation with\n Mixture-of-Expert Controllers","summary":" Diffusion-model-based text-guided image generation has recently made\nastounding progress, producing fascinating results in open-domain image\nmanipulation tasks. Few models, however, currently have complete zero-shot\ncapabilities for both global and local image editing due to the complexity and\ndiversity of image manipulation tasks. In this work, we propose a method with a\nmixture-of-expert (MOE) controllers to align the text-guided capacity of\ndiffusion models with different kinds of human instructions, enabling our model\nto handle various open-domain image manipulation tasks with natural language\ninstructions. First, we use large language models (ChatGPT) and conditional\nimage synthesis models (ControlNet) to generate a large number of global image\ntransfer dataset in addition to the instruction-based local image editing\ndataset. Then, using an MOE technique and task-specific adaptation training on\na large-scale dataset, our conditional diffusion model can edit images globally\nand locally. Extensive experiments demonstrate that our approach performs\nsurprisingly well on various image manipulation tasks when dealing with\nopen-domain images and arbitrary human instructions. Please refer to our\nproject page: [https://oppo-mente-lab.github.io/moe_controller/]\n","authors":["Sijia Li","Chen Chen","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2309.04372v2.pdf","comment":"6 pages,6 figures"},{"id":"http://arxiv.org/abs/2403.10988v2","updated":"2024-03-30T04:56:05Z","published":"2024-03-16T18:04:12Z","title":"Boosting Flow-based Generative Super-Resolution Models via Learned Prior","summary":" Flow-based super-resolution (SR) models have demonstrated astonishing\ncapabilities in generating high-quality images. However, these methods\nencounter several challenges during image generation, such as grid artifacts,\nexploding inverses, and suboptimal results due to a fixed sampling temperature.\nTo overcome these issues, this work introduces a conditional learned prior to\nthe inference phase of a flow-based SR model. This prior is a latent code\npredicted by our proposed latent module conditioned on the low-resolution\nimage, which is then transformed by the flow model into an SR image. Our\nframework is designed to seamlessly integrate with any contemporary flow-based\nSR model without modifying its architecture or pre-trained weights. We evaluate\nthe effectiveness of our proposed framework through extensive experiments and\nablation analyses. The proposed framework successfully addresses all the\ninherent issues in flow-based SR models and enhances their performance in\nvarious SR scenarios. 
Our code is available at:\nhttps://github.com/liyuantsao/FlowSR-LP\n","authors":["Li-Yuan Tsao","Yi-Chen Lo","Chia-Che Chang","Hao-Wei Chen","Roy Tseng","Chien Feng","Chun-Yi Lee"],"pdf_url":"https://arxiv.org/pdf/2403.10988v2.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2312.02134v3","updated":"2024-03-30T04:22:34Z","published":"2023-12-04T18:55:45Z","title":"GaussianAvatar: Towards Realistic Human Avatar Modeling from a Single\n Video via Animatable 3D Gaussians","summary":" We present GaussianAvatar, an efficient approach to creating realistic human\navatars with dynamic 3D appearances from a single video. We start by\nintroducing animatable 3D Gaussians to explicitly represent humans in various\nposes and clothing styles. Such an explicit and animatable representation can\nfuse 3D appearances more efficiently and consistently from 2D observations. Our\nrepresentation is further augmented with dynamic properties to support\npose-dependent appearance modeling, where a dynamic appearance network along\nwith an optimizable feature tensor is designed to learn the\nmotion-to-appearance mapping. Moreover, by leveraging the differentiable motion\ncondition, our method enables a joint optimization of motions and appearances\nduring avatar modeling, which helps to tackle the long-standing issue of\ninaccurate motion estimation in monocular settings. The efficacy of\nGaussianAvatar is validated on both the public dataset and our collected\ndataset, demonstrating its superior performances in terms of appearance quality\nand rendering efficiency.\n","authors":["Liangxiao Hu","Hongwen Zhang","Yuxiang Zhang","Boyao Zhou","Boning Liu","Shengping Zhang","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2312.02134v3.pdf","comment":"Project Page: https://huliangxiao.github.io/GaussianAvatar"},{"id":"http://arxiv.org/abs/2401.00374v5","updated":"2024-03-30T04:15:34Z","published":"2023-12-31T02:25:41Z","title":"EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via\n Expressive Masked Audio Gesture Modeling","summary":" We propose EMAGE, a framework to generate full-body human gestures from audio\nand masked gestures, encompassing facial, local body, hands, and global\nmovements. To achieve this, we first introduce BEAT2 (BEAT-SMPLX-FLAME), a new\nmesh-level holistic co-speech dataset. BEAT2 combines a MoShed SMPL-X body with\nFLAME head parameters and further refines the modeling of head, neck, and\nfinger movements, offering a community-standardized, high-quality 3D motion\ncaptured dataset. EMAGE leverages masked body gesture priors during training to\nboost inference performance. It involves a Masked Audio Gesture Transformer,\nfacilitating joint training on audio-to-gesture generation and masked gesture\nreconstruction to effectively encode audio and body gesture hints. Encoded body\nhints from masked gestures are then separately employed to generate facial and\nbody movements. Moreover, EMAGE adaptively merges speech features from the\naudio's rhythm and content and utilizes four compositional VQ-VAEs to enhance\nthe results' fidelity and diversity. Experiments demonstrate that EMAGE\ngenerates holistic gestures with state-of-the-art performance and is flexible\nin accepting predefined spatial-temporal gesture inputs, generating complete,\naudio-synchronized results. 
Our code and dataset are available\nhttps://pantomatrix.github.io/EMAGE/\n","authors":["Haiyang Liu","Zihao Zhu","Giorgio Becherini","Yichen Peng","Mingyang Su","You Zhou","Xuefei Zhe","Naoya Iwamoto","Bo Zheng","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2401.00374v5.pdf","comment":"Fix typos; Conflict of Interest Disclosure; CVPR Camera Ready;\n Project Page: https://pantomatrix.github.io/EMAGE/"},{"id":"http://arxiv.org/abs/2302.05043v2","updated":"2024-03-30T03:50:28Z","published":"2023-02-10T04:12:11Z","title":"A Review of Predictive and Contrastive Self-supervised Learning for\n Medical Images","summary":" Over the last decade, supervised deep learning on manually annotated big data\nhas been progressing significantly on computer vision tasks. But the\napplication of deep learning in medical image analysis was limited by the\nscarcity of high-quality annotated medical imaging data. An emerging solution\nis self-supervised learning (SSL), among which contrastive SSL is the most\nsuccessful approach to rivalling or outperforming supervised learning. This\nreview investigates several state-of-the-art contrastive SSL algorithms\noriginally on natural images as well as their adaptations for medical images,\nand concludes by discussing recent advances, current limitations, and future\ndirections in applying contrastive SSL in the medical domain.\n","authors":["Wei-Chien Wang","Euijoon Ahn","Dagan Feng","Jinman Kim"],"pdf_url":"https://arxiv.org/pdf/2302.05043v2.pdf","comment":"Article links:\n https://link.springer.com/article/10.1007/s11633-022-1406-4"},{"id":"http://arxiv.org/abs/2403.17610v2","updated":"2024-03-30T03:46:10Z","published":"2024-03-26T11:43:05Z","title":"MMVP: A Multimodal MoCap Dataset with Vision and Pressure Sensors","summary":" Foot contact is an important cue for human motion capture, understanding, and\ngeneration. Existing datasets tend to annotate dense foot contact using visual\nmatching with thresholding or incorporating pressure signals. However, these\napproaches either suffer from low accuracy or are only designed for small-range\nand slow motion. There is still a lack of a vision-pressure multimodal dataset\nwith large-range and fast human motion, as well as accurate and dense\nfoot-contact annotation. To fill this gap, we propose a Multimodal MoCap\nDataset with Vision and Pressure sensors, named MMVP. MMVP provides accurate\nand dense plantar pressure signals synchronized with RGBD observations, which\nis especially useful for both plausible shape estimation, robust pose fitting\nwithout foot drifting, and accurate global translation tracking. To validate\nthe dataset, we propose an RGBD-P SMPL fitting method and also a\nmonocular-video-based baseline framework, VP-MoCap, for human motion capture.\nExperiments demonstrate that our RGBD-P SMPL Fitting results significantly\noutperform pure visual motion capture. Moreover, VP-MoCap outperforms SOTA\nmethods in foot-contact and global translation estimation accuracy. We believe\nthe configuration of the dataset and the baseline frameworks will stimulate the\nresearch in this direction and also provide a good reference for MoCap\napplications in various domains. 
Project page:\nhttps://metaverse-ai-lab-thu.github.io/MMVP-Dataset/.\n","authors":["He Zhang","Shenghao Ren","Haolei Yuan","Jianhui Zhao","Fan Li","Shuangpeng Sun","Zhenghao Liang","Tao Yu","Qiu Shen","Xun Cao"],"pdf_url":"https://arxiv.org/pdf/2403.17610v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2312.01280v2","updated":"2024-03-30T03:30:18Z","published":"2023-12-03T04:36:04Z","title":"Brain Decodes Deep Nets","summary":" We developed a tool for visualizing and analyzing large pre-trained vision\nmodels by mapping them onto the brain, thus exposing their hidden inside. Our\ninnovation arises from a surprising usage of brain encoding: predicting brain\nfMRI measurements in response to images. We report two findings. First,\nexplicit mapping between the brain and deep-network features across dimensions\nof space, layers, scales, and channels is crucial. This mapping method,\nFactorTopy, is plug-and-play for any deep-network; with it, one can paint a\npicture of the network onto the brain (literally!). Second, our visualization\nshows how different training methods matter: they lead to remarkable\ndifferences in hierarchical organization and scaling behavior, growing with\nmore data or network capacity. It also provides insight into fine-tuning: how\npre-trained models change when adapting to small datasets. We found brain-like\nhierarchically organized network suffer less from catastrophic forgetting after\nfine-tuned.\n","authors":["Huzheng Yang","James Gee","Jianbo Shi"],"pdf_url":"https://arxiv.org/pdf/2312.01280v2.pdf","comment":"Website: see https://huzeyann.github.io/brain-decodes-deep-nets .\n Code: see https://github.com/huzeyann/BrainDecodesDeepNets"},{"id":"http://arxiv.org/abs/2312.09243v2","updated":"2024-03-30T03:08:43Z","published":"2023-12-14T18:58:52Z","title":"OccNeRF: Advancing 3D Occupancy Prediction in LiDAR-Free Environments","summary":" As a fundamental task of vision-based perception, 3D occupancy prediction\nreconstructs 3D structures of surrounding environments. It provides detailed\ninformation for autonomous driving planning and navigation. However, most\nexisting methods heavily rely on the LiDAR point clouds to generate occupancy\nground truth, which is not available in the vision-based system. In this paper,\nwe propose an OccNeRF method for training occupancy networks without 3D\nsupervision. Different from previous works which consider a bounded scene, we\nparameterize the reconstructed occupancy fields and reorganize the sampling\nstrategy to align with the cameras' infinite perceptive range. The neural\nrendering is adopted to convert occupancy fields to multi-camera depth maps,\nsupervised by multi-frame photometric consistency. 
Moreover, for semantic\noccupancy prediction, we design several strategies to polish the prompts and\nfilter the outputs of a pretrained open-vocabulary 2D segmentation model.\nExtensive experiments for both self-supervised depth estimation and 3D\noccupancy prediction tasks on nuScenes and SemanticKITTI datasets demonstrate\nthe effectiveness of our method.\n","authors":["Chubin Zhang","Juncheng Yan","Yi Wei","Jiaxin Li","Li Liu","Yansong Tang","Yueqi Duan","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2312.09243v2.pdf","comment":"Code: https://github.com/LinShan-Bin/OccNeRF"},{"id":"http://arxiv.org/abs/2403.19001v2","updated":"2024-03-30T02:42:08Z","published":"2024-03-27T20:51:02Z","title":"Cross-domain Fiber Cluster Shape Analysis for Language Performance\n Cognitive Score Prediction","summary":" Shape plays an important role in computer graphics, offering informative\nfeatures to convey an object's morphology and functionality. Shape analysis in\nbrain imaging can help interpret structural and functionality correlations of\nthe human brain. In this work, we investigate the shape of the brain's 3D white\nmatter connections and its potential predictive relationship to human cognitive\nfunction. We reconstruct brain connections as sequences of 3D points using\ndiffusion magnetic resonance imaging (dMRI) tractography. To describe each\nconnection, we extract 12 shape descriptors in addition to traditional dMRI\nconnectivity and tissue microstructure features. We introduce a novel\nframework, Shape--fused Fiber Cluster Transformer (SFFormer), that leverages a\nmulti-head cross-attention feature fusion module to predict subject-specific\nlanguage performance based on dMRI tractography. We assess the performance of\nthe method on a large dataset including 1065 healthy young adults. The results\ndemonstrate that both the transformer-based SFFormer model and its inter/intra\nfeature fusion with shape, microstructure, and connectivity are informative,\nand together, they improve the prediction of subject-specific language\nperformance scores. Overall, our results indicate that the shape of the brain's\nconnections is predictive of human language function.\n","authors":["Yui Lo","Yuqian Chen","Dongnan Liu","Wan Liu","Leo Zekelman","Fan Zhang","Yogesh Rathi","Nikos Makris","Alexandra J. Golby","Weidong Cai","Lauren J. O'Donnell"],"pdf_url":"https://arxiv.org/pdf/2403.19001v2.pdf","comment":"2 figures, 11 pages"},{"id":"http://arxiv.org/abs/2401.00901v2","updated":"2024-03-30T02:30:14Z","published":"2023-12-31T13:53:37Z","title":"Video-GroundingDINO: Towards Open-Vocabulary Spatio-Temporal Video\n Grounding","summary":" Video grounding aims to localize a spatio-temporal section in a video\ncorresponding to an input text query. This paper addresses a critical\nlimitation in current video grounding methodologies by introducing an\nOpen-Vocabulary Spatio-Temporal Video Grounding task. Unlike prevalent\nclosed-set approaches that struggle with open-vocabulary scenarios due to\nlimited training data and predefined vocabularies, our model leverages\npre-trained representations from foundational spatial grounding models. This\nempowers it to effectively bridge the semantic gap between natural language and\ndiverse visual content, achieving strong performance in closed-set and\nopen-vocabulary settings. 
Our contributions include a novel spatio-temporal\nvideo grounding model, surpassing state-of-the-art results in closed-set\nevaluations on multiple datasets and demonstrating superior performance in\nopen-vocabulary scenarios. Notably, the proposed model outperforms\nstate-of-the-art methods in closed-set settings on VidSTG (Declarative and\nInterrogative) and HC-STVG (V1 and V2) datasets. Furthermore, in\nopen-vocabulary evaluations on HC-STVG V1 and YouCook-Interactions, our model\nsurpasses the recent best-performing models by $4.88$ m_vIoU and $1.83\\%$\naccuracy, demonstrating its efficacy in handling diverse linguistic and visual\nconcepts for improved video understanding. Our codes will be publicly released.\n","authors":["Syed Talal Wasim","Muzammal Naseer","Salman Khan","Ming-Hsuan Yang","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2401.00901v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09788v2","updated":"2024-03-30T01:21:42Z","published":"2023-12-15T13:43:24Z","title":"Collaborating Foundation Models for Domain Generalized Semantic\n Segmentation","summary":" Domain Generalized Semantic Segmentation (DGSS) deals with training a model\non a labeled source domain with the aim of generalizing to unseen domains\nduring inference. Existing DGSS methods typically effectuate robust features by\nmeans of Domain Randomization (DR). Such an approach is often limited as it can\nonly account for style diversification and not content. In this work, we take\nan orthogonal approach to DGSS and propose to use an assembly of CoLlaborative\nFOUndation models for Domain Generalized Semantic Segmentation (CLOUDS). In\ndetail, CLOUDS is a framework that integrates FMs of various kinds: (i) CLIP\nbackbone for its robust feature representation, (ii) generative models to\ndiversify the content, thereby covering various modes of the possible target\ndistribution, and (iii) Segment Anything Model (SAM) for iteratively refining\nthe predictions of the segmentation model. Extensive experiments show that our\nCLOUDS excels in adapting from synthetic to real DGSS benchmarks and under\nvarying weather conditions, notably outperforming prior methods by 5.6% and\n6.7% on averaged miou, respectively. The code is available at :\nhttps://github.com/yasserben/CLOUDS\n","authors":["Yasser Benigmim","Subhankar Roy","Slim Essid","Vicky Kalogeiton","Stéphane Lathuilière"],"pdf_url":"https://arxiv.org/pdf/2312.09788v2.pdf","comment":"https://github.com/yasserben/CLOUDS ; Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2303.02835v2","updated":"2024-03-30T01:21:22Z","published":"2023-03-06T02:05:14Z","title":"Traffic Scene Parsing through the TSP6K Dataset","summary":" Traffic scene perception in computer vision is a critically important task to\nachieve intelligent cities. To date, most existing datasets focus on autonomous\ndriving scenes. We observe that the models trained on those driving datasets\noften yield unsatisfactory results on traffic monitoring scenes. However,\nlittle effort has been put into improving the traffic monitoring scene\nunderstanding, mainly due to the lack of specific datasets. To fill this gap,\nwe introduce a specialized traffic monitoring dataset, termed TSP6K, containing\nimages from the traffic monitoring scenario, with high-quality pixel-level and\ninstance-level annotations. The TSP6K dataset captures more crowded traffic\nscenes with several times more traffic participants than the existing driving\nscenes. 
We perform a detailed analysis of the dataset and comprehensively\nevaluate previous popular scene parsing methods, instance segmentation methods\nand unsupervised domain adaption methods. Furthermore, considering the vast\ndifference in instance sizes, we propose a detail refining decoder for scene\nparsing, which recovers the details of different semantic regions in traffic\nscenes owing to the proposed TSP6K dataset. Experiments show its effectiveness\nin parsing the traffic monitoring scenes. Code and dataset are available at\nhttps://github.com/PengtaoJiang/TSP6K.\n","authors":["Peng-Tao Jiang","Yuqi Yang","Yang Cao","Qibin Hou","Ming-Ming Cheng","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2303.02835v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2311.12981v2","updated":"2024-03-30T01:18:21Z","published":"2023-11-21T20:33:17Z","title":"SD-NAE: Generating Natural Adversarial Examples with Stable Diffusion","summary":" Natural Adversarial Examples (NAEs), images arising naturally from the\nenvironment and capable of deceiving classifiers, are instrumental in robustly\nevaluating and identifying vulnerabilities in trained models. In this work,\nunlike prior works that passively collect NAEs from real images, we propose to\nactively synthesize NAEs using the state-of-the-art Stable Diffusion.\nSpecifically, our method formulates a controlled optimization process, where we\nperturb the token embedding that corresponds to a specified class to generate\nNAEs. This generation process is guided by the gradient of loss from the target\nclassifier, ensuring that the created image closely mimics the ground-truth\nclass yet fools the classifier. Named SD-NAE (Stable Diffusion for Natural\nAdversarial Examples), our innovative method is effective in producing valid\nand useful NAEs, which is demonstrated through a meticulously designed\nexperiment. Code is available at https://github.com/linyueqian/SD-NAE.\n","authors":["Yueqian Lin","Jingyang Zhang","Yiran Chen","Hai Li"],"pdf_url":"https://arxiv.org/pdf/2311.12981v2.pdf","comment":"Accepted by ICLR 2024 TinyPapers"},{"id":"http://arxiv.org/abs/2403.19549v2","updated":"2024-03-30T00:24:44Z","published":"2024-03-28T16:32:06Z","title":"GlORIE-SLAM: Globally Optimized RGB-only Implicit Encoding Point Cloud\n SLAM","summary":" Recent advancements in RGB-only dense Simultaneous Localization and Mapping\n(SLAM) have predominantly utilized grid-based neural implicit encodings and/or\nstruggle to efficiently realize global map and pose consistency. To this end,\nwe propose an efficient RGB-only dense SLAM system using a flexible neural\npoint cloud scene representation that adapts to keyframe poses and depth\nupdates, without needing costly backpropagation. Another critical challenge of\nRGB-only SLAM is the lack of geometric priors. To alleviate this issue, with\nthe aid of a monocular depth estimator, we introduce a novel DSPO layer for\nbundle adjustment which optimizes the pose and depth of keyframes along with\nthe scale of the monocular depth. Finally, our system benefits from loop\nclosure and online global bundle adjustment and performs either better or\ncompetitive to existing dense neural RGB SLAM methods in tracking, mapping and\nrendering accuracy on the Replica, TUM-RGBD and ScanNet datasets. The source\ncode will be made available.\n","authors":["Ganlin Zhang","Erik Sandström","Youmin Zhang","Manthan Patel","Luc Van Gool","Martin R. 
Oswald"],"pdf_url":"https://arxiv.org/pdf/2403.19549v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00498v1","updated":"2024-03-30T23:42:23Z","published":"2024-03-30T23:42:23Z","title":"94% on CIFAR-10 in 3.29 Seconds on a Single GPU","summary":" CIFAR-10 is among the most widely used datasets in machine learning,\nfacilitating thousands of research projects per year. To accelerate research\nand reduce the cost of experiments, we introduce training methods for CIFAR-10\nwhich reach 94% accuracy in 3.29 seconds, 95% in 10.4 seconds, and 96% in 46.3\nseconds, when run on a single NVIDIA A100 GPU. As one factor contributing to\nthese training speeds, we propose a derandomized variant of horizontal flipping\naugmentation, which we show improves over the standard method in every case\nwhere flipping is beneficial over no flipping at all. Our code is released at\nhttps://github.com/KellerJordan/cifar10-airbench.\n","authors":["Keller Jordan"],"pdf_url":"https://arxiv.org/pdf/2404.00498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00491v1","updated":"2024-03-30T23:19:40Z","published":"2024-03-30T23:19:40Z","title":"Denoising Monte Carlo Renders With Diffusion Models","summary":" Physically-based renderings contain Monte-Carlo noise, with variance that\nincreases as the number of rays per pixel decreases. This noise, while\nzero-mean for good modern renderers, can have heavy tails (most notably, for\nscenes containing specular or refractive objects). Learned methods for\nrestoring low fidelity renders are highly developed, because suppressing render\nnoise means one can save compute and use fast renders with few rays per pixel.\nWe demonstrate that a diffusion model can denoise low fidelity renders\nsuccessfully. Furthermore, our method can be conditioned on a variety of\nnatural render information, and this conditioning helps performance.\nQuantitative experiments show that our method is competitive with SOTA across a\nrange of sampling rates, but current metrics slightly favor competitor methods.\nQualitative examination of the reconstructions suggests that the metrics\nthemselves may not be reliable. The image prior applied by a diffusion method\nstrongly favors reconstructions that are \"like\" real images -- so have straight\nshadow boundaries, curved specularities, no \"fireflies\" and the like -- and\nmetrics do not account for this. We show numerous examples where methods\npreferred by current metrics produce qualitatively weaker reconstructions than\nours.\n","authors":["Vaibhav Vavilala","Rahul Vasanth","David Forsyth"],"pdf_url":"https://arxiv.org/pdf/2404.00491v1.pdf","comment":"14 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.00485v1","updated":"2024-03-30T22:28:29Z","published":"2024-03-30T22:28:29Z","title":"DiffHuman: Probabilistic Photorealistic 3D Reconstruction of Humans","summary":" We present DiffHuman, a probabilistic method for photorealistic 3D human\nreconstruction from a single RGB image. Despite the ill-posed nature of this\nproblem, most methods are deterministic and output a single solution, often\nresulting in a lack of geometric detail and blurriness in unseen or uncertain\nregions. In contrast, DiffHuman predicts a probability distribution over 3D\nreconstructions conditioned on an input 2D image, which allows us to sample\nmultiple detailed 3D avatars that are consistent with the image. DiffHuman is\nimplemented as a conditional diffusion model that denoises pixel-aligned 2D\nobservations of an underlying 3D shape representation. 
During inference, we may\nsample 3D avatars by iteratively denoising 2D renders of the predicted 3D\nrepresentation. Furthermore, we introduce a generator neural network that\napproximates rendering with considerably reduced runtime (55x speed up),\nresulting in a novel dual-branch diffusion framework. Our experiments show that\nDiffHuman can produce diverse and detailed reconstructions for the parts of the\nperson that are unseen or uncertain in the input image, while remaining\ncompetitive with the state-of-the-art when reconstructing visible surfaces.\n","authors":["Akash Sengupta","Thiemo Alldieck","Nikos Kolotouros","Enric Corona","Andrei Zanfir","Cristian Sminchisescu"],"pdf_url":"https://arxiv.org/pdf/2404.00485v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00471v1","updated":"2024-03-30T20:34:49Z","published":"2024-03-30T20:34:49Z","title":"Score-Based Diffusion Models for Photoacoustic Tomography Image\n Reconstruction","summary":" Photoacoustic tomography (PAT) is a rapidly-evolving medical imaging modality\nthat combines optical absorption contrast with ultrasound imaging depth. One\nchallenge in PAT is image reconstruction with inadequate acoustic signals due\nto limited sensor coverage or due to the density of the transducer array. Such\ncases call for solving an ill-posed inverse reconstruction problem. In this\nwork, we use score-based diffusion models to solve the inverse problem of\nreconstructing an image from limited PAT measurements. The proposed approach\nallows us to incorporate an expressive prior learned by a diffusion model on\nsimulated vessel structures while still being robust to varying transducer\nsparsity conditions.\n","authors":["Sreemanti Dey","Snigdha Saha","Berthy T. Feng","Manxiu Cui","Laure Delisle","Oscar Leong","Lihong V. Wang","Katherine L. Bouman"],"pdf_url":"https://arxiv.org/pdf/2404.00471v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2404.00469v1","updated":"2024-03-30T20:25:16Z","published":"2024-03-30T20:25:16Z","title":"SceneGraphLoc: Cross-Modal Coarse Visual Localization on 3D Scene Graphs","summary":" We introduce a novel problem, i.e., the localization of an input image within\na multi-modal reference map represented by a database of 3D scene graphs. These\ngraphs comprise multiple modalities, including object-level point clouds,\nimages, attributes, and relationships between objects, offering a lightweight\nand efficient alternative to conventional methods that rely on extensive image\ndatabases. Given the available modalities, the proposed method SceneGraphLoc\nlearns a fixed-sized embedding for each node (i.e., representing an object\ninstance) in the scene graph, enabling effective matching with the objects\nvisible in the input query image. This strategy significantly outperforms other\ncross-modal methods, even without incorporating images into the map embeddings.\nWhen images are leveraged, SceneGraphLoc achieves performance close to that of\nstate-of-the-art techniques depending on large image databases, while requiring\nthree orders-of-magnitude less storage and operating orders-of-magnitude\nfaster. 
The code will be made public.\n","authors":["Yang Miao","Francis Engelmann","Olga Vysotska","Federico Tombari","Marc Pollefeys","Dániel Béla Baráth"],"pdf_url":"https://arxiv.org/pdf/2404.00469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00429v1","updated":"2024-03-30T17:29:13Z","published":"2024-03-30T17:29:13Z","title":"Multiway Point Cloud Mosaicking with Diffusion and Global Optimization","summary":" We introduce a novel framework for multiway point cloud mosaicking (named\nWednesday), designed to co-align sets of partially overlapping point clouds --\ntypically obtained from 3D scanners or moving RGB-D cameras -- into a unified\ncoordinate system. At the core of our approach is ODIN, a learned pairwise\nregistration algorithm that iteratively identifies overlaps and refines\nattention scores, employing a diffusion-based process for denoising pairwise\ncorrelation matrices to enhance matching accuracy. Further steps include\nconstructing a pose graph from all point clouds, performing rotation averaging,\na novel robust algorithm for re-estimating translations optimally in terms of\nconsensus maximization and translation optimization. Finally, the point cloud\nrotations and positions are optimized jointly by a diffusion-based approach.\nTested on four diverse, large-scale datasets, our method achieves\nstate-of-the-art pairwise and multiway registration results by a large margin\non all benchmarks. Our code and models are available at\nhttps://github.com/jinsz/Multiway-Point-Cloud-Mosaicking-with-Diffusion-and-Global-Optimization.\n","authors":["Shengze Jin","Iro Armeni","Marc Pollefeys","Daniel Barath"],"pdf_url":"https://arxiv.org/pdf/2404.00429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00427v1","updated":"2024-03-30T17:21:07Z","published":"2024-03-30T17:21:07Z","title":"Extracting Manifold Information from Point Clouds","summary":" A kernel based method is proposed for the construction of signature\n(defining) functions of subsets of $\\mathbb{R}^d$. The subsets can range from\nfull dimensional manifolds (open subsets) to point clouds (a finite number of\npoints) and include bounded smooth manifolds of any codimension. The\ninterpolation and analysis of point clouds are the main application. Two\nextreme cases in terms of regularity are considered, where the data set is\ninterpolated by an analytic surface, at the one extreme, and by a H\\\"older\ncontinuous surface, at the other. The signature function can be computed as a\nlinear combination of translated kernels, the coefficients of which are the\nsolution of a finite dimensional linear problem. Once it is obtained, it can be\nused to estimate the dimension as well as the normal and the curvatures of the\ninterpolated surface. The method is global and does not require explicit\nknowledge of local neighborhoods or any other structure present in the data\nset. It admits a variational formulation with a natural ``regularized''\ncounterpart, that proves to be useful in dealing with data sets corrupted by\nnumerical error or noise. 
The underlying analytical structure of the approach\nis presented in general before it is applied to the case of point clouds.\n","authors":["Patrick Guidotti"],"pdf_url":"https://arxiv.org/pdf/2404.00427v1.pdf","comment":"27 pages, 16 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.00419v1","updated":"2024-03-30T16:54:45Z","published":"2024-03-30T16:54:45Z","title":"Do Vision-Language Models Understand Compound Nouns?","summary":" Open-vocabulary vision-language models (VLMs) like CLIP, trained using\ncontrastive loss, have emerged as a promising new paradigm for text-to-image\nretrieval. However, do VLMs understand compound nouns (CNs) (e.g., lab coat) as\nwell as they understand nouns (e.g., lab)? We curate Compun, a novel benchmark\nwith 400 unique and commonly used CNs, to evaluate the effectiveness of VLMs in\ninterpreting CNs. The Compun benchmark challenges a VLM for text-to-image\nretrieval where, given a text prompt with a CN, the task is to select the\ncorrect image that shows the CN among a pair of distractor images that show the\nconstituent nouns that make up the CN. Next, we perform an in-depth analysis to\nhighlight CLIPs' limited understanding of certain types of CNs. Finally, we\npresent an alternative framework that moves beyond hand-written templates for\ntext prompts widely used by CLIP-like models. We employ a Large Language Model\nto generate multiple diverse captions that include the CN as an object in the\nscene described by the caption. Our proposed method improves CN understanding\nof CLIP by 8.25% on Compun. Code and benchmark are available at:\nhttps://github.com/sonalkum/Compun\n","authors":["Sonal Kumar","Sreyan Ghosh","S Sakshi","Utkarsh Tyagi","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2404.00419v1.pdf","comment":"Accepted to NAACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2404.00418v1","updated":"2024-03-30T16:54:35Z","published":"2024-03-30T16:54:35Z","title":"Continual Learning for Autonomous Robots: A Prototype-based Approach","summary":" Humans and animals learn throughout their lives from limited amounts of\nsensed data, both with and without supervision. Autonomous, intelligent robots\nof the future are often expected to do the same. The existing continual\nlearning (CL) methods are usually not directly applicable to robotic settings:\nthey typically require buffering and a balanced replay of training data. A\nfew-shot online continual learning (FS-OCL) setting has been proposed to\naddress more realistic scenarios where robots must learn from a non-repeated\nsparse data stream. To enable truly autonomous life-long learning, an\nadditional challenge of detecting novelties and learning new items without\nsupervision needs to be addressed. We address this challenge with our new\nprototype-based approach called Continually Learning Prototypes (CLP). In\naddition to being capable of FS-OCL learning, CLP also detects novel objects\nand learns them without supervision. To mitigate forgetting, CLP utilizes a\nnovel metaplasticity mechanism that adapts the learning rate individually per\nprototype. CLP is rehearsal-free, hence does not require a memory buffer, and\nis compatible with neuromorphic hardware, characterized by ultra-low power\nconsumption, real-time processing abilities, and on-chip learning. Indeed, we\nhave open-sourced a simple version of CLP in the neuromorphic software\nframework Lava, targetting Intel's neuromorphic chip Loihi 2. We evaluate CLP\non a robotic vision dataset, OpenLORIS. 
In a low-instance FS-OCL scenario, CLP\nshows state-of-the-art results. In the open world, CLP detects novelties with\nsuperior precision and recall and learns features of the detected novel classes\nwithout supervision, achieving a strong baseline of 99% base class and 65%/76%\n(5-shot/10-shot) novel class accuracy.\n","authors":["Elvin Hajizada","Balachandran Swaminathan","Yulia Sandamirskaya"],"pdf_url":"https://arxiv.org/pdf/2404.00418v1.pdf","comment":"Submitted to IEEE/RSJ International Conference on Intelligent Robots\n and Systems (IROS)"},{"id":"http://arxiv.org/abs/2404.00417v1","updated":"2024-03-30T16:53:10Z","published":"2024-03-30T16:53:10Z","title":"Orchestrate Latent Expertise: Advancing Online Continual Learning with\n Multi-Level Supervision and Reverse Self-Distillation","summary":" To accommodate real-world dynamics, artificial intelligence systems need to\ncope with sequentially arriving content in an online manner. Beyond regular\nContinual Learning (CL) attempting to address catastrophic forgetting with\noffline training of each task, Online Continual Learning (OCL) is a more\nchallenging yet realistic setting that performs CL in a one-pass data stream.\nCurrent OCL methods primarily rely on memory replay of old training samples.\nHowever, a notable gap from CL to OCL stems from the additional\noverfitting-underfitting dilemma associated with the use of rehearsal buffers:\nthe inadequate learning of new training samples (underfitting) and the repeated\nlearning of a few old training samples (overfitting). To this end, we introduce\na novel approach, Multi-level Online Sequential Experts (MOSE), which\ncultivates the model as stacked sub-experts, integrating multi-level\nsupervision and reverse self-distillation. Supervision signals across multiple\nstages facilitate appropriate convergence of the new task while gathering\nvarious strengths from experts by knowledge distillation mitigates the\nperformance decline of old tasks. MOSE demonstrates remarkable efficacy in\nlearning new samples and preserving past knowledge through multi-level experts,\nthereby significantly advancing OCL performance over state-of-the-art baselines\n(e.g., up to 7.3% on Split CIFAR-100 and 6.1% on Split Tiny-ImageNet).\n","authors":["HongWei Yan","Liyuan Wang","Kaisheng Ma","Yi Zhong"],"pdf_url":"https://arxiv.org/pdf/2404.00417v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00412v1","updated":"2024-03-30T16:43:40Z","published":"2024-03-30T16:43:40Z","title":"SVGCraft: Beyond Single Object Text-to-SVG Synthesis with Comprehensive\n Canvas Layout","summary":" Generating VectorArt from text prompts is a challenging vision task,\nrequiring diverse yet realistic depictions of the seen as well as unseen\nentities. However, existing research has been mostly limited to the generation\nof single objects, rather than comprehensive scenes comprising multiple\nelements. In response, this work introduces SVGCraft, a novel end-to-end\nframework for the creation of vector graphics depicting entire scenes from\ntextual descriptions. Utilizing a pre-trained LLM for layout generation from\ntext prompts, this framework introduces a technique for producing masked\nlatents in specified bounding boxes for accurate object placement. It\nintroduces a fusion mechanism for integrating attention maps and employs a\ndiffusion U-Net for coherent composition, speeding up the drawing process. 
The\nresulting SVG is optimized using a pre-trained encoder and LPIPS loss with\nopacity modulation to maximize similarity. Additionally, this work explores the\npotential of primitive shapes in facilitating canvas completion in constrained\nenvironments. Through both qualitative and quantitative assessments, SVGCraft\nis demonstrated to surpass prior works in abstraction, recognizability, and\ndetail, as evidenced by its performance metrics (CLIP-T: 0.4563, Cosine\nSimilarity: 0.6342, Confusion: 0.66, Aesthetic: 6.7832). The code will be\navailable at https://github.com/ayanban011/SVGCraft.\n","authors":["Ayan Banerjee","Nityanand Mathur","Josep Lladós","Umapada Pal","Anjan Dutta"],"pdf_url":"https://arxiv.org/pdf/2404.00412v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00409v1","updated":"2024-03-30T16:35:38Z","published":"2024-03-30T16:35:38Z","title":"3DGSR: Implicit Surface Reconstruction with 3D Gaussian Splatting","summary":" In this paper, we present an implicit surface reconstruction method with 3D\nGaussian Splatting (3DGS), namely 3DGSR, that allows for accurate 3D\nreconstruction with intricate details while inheriting the high efficiency and\nrendering quality of 3DGS. The key insight is incorporating an implicit signed\ndistance field (SDF) within 3D Gaussians to enable them to be aligned and\njointly optimized. First, we introduce a differentiable SDF-to-opacity\ntransformation function that converts SDF values into corresponding Gaussians'\nopacities. This function connects the SDF and 3D Gaussians, allowing for\nunified optimization and enforcing surface constraints on the 3D Gaussians.\nDuring learning, optimizing the 3D Gaussians provides supervisory signals for\nSDF learning, enabling the reconstruction of intricate details. However, this\nonly provides sparse supervisory signals to the SDF at locations occupied by\nGaussians, which is insufficient for learning a continuous SDF. Then, to\naddress this limitation, we incorporate volumetric rendering and align the\nrendered geometric attributes (depth, normal) with those derived from 3D\nGaussians. This consistency regularization introduces supervisory signals to\nlocations not covered by discrete 3D Gaussians, effectively eliminating\nredundant surfaces outside the Gaussian sampling range. Our extensive\nexperimental results demonstrate that our 3DGSR method enables high-quality 3D\nsurface reconstruction while preserving the efficiency and rendering quality of\n3DGS. Besides, our method competes favorably with leading surface\nreconstruction techniques while offering a more efficient learning process and\nmuch better rendering qualities. The code will be available at\nhttps://github.com/CVMI-Lab/3DGSR.\n","authors":["Xiaoyang Lyu","Yang-Tian Sun","Yi-Hua Huang","Xiuzhe Wu","Ziyi Yang","Yilun Chen","Jiangmiao Pang","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2404.00409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00385v1","updated":"2024-03-30T14:58:40Z","published":"2024-03-30T14:58:40Z","title":"Constrained Layout Generation with Factor Graphs","summary":" This paper addresses the challenge of object-centric layout generation under\nspatial constraints, seen in multiple domains including floorplan design\nprocess. The design process typically involves specifying a set of spatial\nconstraints that include object attributes like size and inter-object relations\nsuch as relative positioning. 
Existing works, which typically represent objects\nas single nodes, lack the granularity to accurately model complex interactions\nbetween objects. For instance, often only certain parts of an object, like a\nroom's right wall, interact with adjacent objects. To address this gap, we\nintroduce a factor graph based approach with four latent variable nodes for\neach room, and a factor node for each constraint. The factor nodes represent\ndependencies among the variables to which they are connected, effectively\ncapturing constraints that are potentially of a higher order. We then develop\nmessage-passing on the bipartite graph, forming a factor graph neural network\nthat is trained to produce a floorplan that aligns with the desired\nrequirements. Our approach is simple and generates layouts faithful to the user\nrequirements, demonstrated by a large improvement in IOU scores over existing\nmethods. Additionally, our approach, being inferential and accurate, is\nwell-suited to the practical human-in-the-loop design process where\nspecifications evolve iteratively, offering a practical and powerful tool for\nAI-guided design.\n","authors":["Mohammed Haroon Dupty","Yanfei Dong","Sicong Leng","Guoji Fu","Yong Liang Goh","Wei Lu","Wee Sun Lee"],"pdf_url":"https://arxiv.org/pdf/2404.00385v1.pdf","comment":"To be published at IEEE/CVF CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00384v1","updated":"2024-03-30T14:51:07Z","published":"2024-03-30T14:51:07Z","title":"TTD: Text-Tag Self-Distillation Enhancing Image-Text Alignment in CLIP\n to Alleviate Single Tag Bias","summary":" We identify a critical bias in contemporary CLIP-based models, which we\ndenote as \\textit{single tag bias}. This bias manifests as a disproportionate\nfocus on a singular tag (word) while neglecting other pertinent tags, stemming\nfrom CLIP's text embeddings that prioritize one specific tag in image-text\nrelationships. When deconstructing text into individual tags, only one tag\ntends to have high relevancy with CLIP's image embedding, leading to an\nimbalanced tag relevancy. This results in an uneven alignment among multiple\ntags present in the text. To tackle this challenge, we introduce a novel\ntwo-step fine-tuning approach. First, our method leverages the similarity\nbetween tags and their nearest pixels for scoring, enabling the extraction of\nimage-relevant tags from the text. Second, we present a self-distillation\nstrategy aimed at aligning the combined masks from extracted tags with the\ntext-derived mask. This approach mitigates the single tag bias, thereby\nsignificantly improving the alignment of CLIP's model without necessitating\nadditional data or supervision. Our technique demonstrates model-agnostic\nimprovements in multi-tag classification and segmentation tasks, surpassing\ncompeting methods that rely on external resources. Code is available at\nhttps://github.com/shjo-april/TTD.\n","authors":["Sanghyun Jo","Soohyun Ryu","Sungyub Kim","Eunho Yang","Kyungsu Kim"],"pdf_url":"https://arxiv.org/pdf/2404.00384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00380v1","updated":"2024-03-30T14:35:31Z","published":"2024-03-30T14:35:31Z","title":"DHR: Dual Features-Driven Hierarchical Rebalancing in Inter- and\n Intra-Class Regions for Weakly-Supervised Semantic Segmentation","summary":" Weakly-supervised semantic segmentation (WSS) ensures high-quality\nsegmentation with limited data and excels when employed as input seed masks for\nlarge-scale vision models such as Segment Anything. 
However, WSS faces\nchallenges related to minor classes since those are overlooked in images with\nadjacent multiple classes, a limitation originating from the overfitting of\ntraditional expansion methods like Random Walk. We first address this by\nemploying unsupervised and weakly-supervised feature maps instead of\nconventional methodologies, allowing for hierarchical mask enhancement. This\nmethod distinctly categorizes higher-level classes and subsequently separates\ntheir associated lower-level classes, ensuring all classes are correctly\nrestored in the mask without losing minor ones. Our approach, validated through\nextensive experimentation, significantly improves WSS across five benchmarks\n(VOC: 79.8\\%, COCO: 53.9\\%, Context: 49.0\\%, ADE: 32.9\\%, Stuff: 37.4\\%),\nreducing the gap with fully supervised methods by over 84\\% on the VOC\nvalidation set. Code is available at https://github.com/shjo-april/DHR.\n","authors":["Sanghyun Jo","Fei Pan","In-Jae Yu","Kyungsu Kim"],"pdf_url":"https://arxiv.org/pdf/2404.00380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00373v1","updated":"2024-03-30T13:58:19Z","published":"2024-03-30T13:58:19Z","title":"The Devil is in the Edges: Monocular Depth Estimation with Edge-aware\n Consistency Fusion","summary":" This paper presents a novel monocular depth estimation method, named ECFNet,\nfor estimating high-quality monocular depth with clear edges and valid overall\nstructure from a single RGB image. We make a thorough inquiry about the key\nfactor that affects the edge depth estimation of the MDE networks, and come to\na ratiocination that the edge information itself plays a critical role in\npredicting depth details. Driven by this analysis, we propose to explicitly\nemploy the image edges as input for ECFNet and fuse the initial depths from\ndifferent sources to produce the final depth. Specifically, ECFNet first uses a\nhybrid edge detection strategy to get the edge map and edge-highlighted image\nfrom the input image, and then leverages a pre-trained MDE network to infer the\ninitial depths of the aforementioned three images. After that, ECFNet utilizes\na layered fusion module (LFM) to fuse the initial depth, which will be further\nupdated by a depth consistency module (DCM) to form the final estimation.\nExtensive experimental results on public datasets and ablation studies indicate\nthat our method achieves state-of-the-art performance. Project page:\nhttps://zrealli.github.io/edgedepth.\n","authors":["Pengzhi Li","Yikang Ding","Haohan Wang","Chengshuai Tang","Zhiheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.00373v1.pdf","comment":"17 pages, 19 figures"},{"id":"http://arxiv.org/abs/2404.00368v1","updated":"2024-03-30T13:41:57Z","published":"2024-03-30T13:41:57Z","title":"Towards Variable and Coordinated Holistic Co-Speech Motion Generation","summary":" This paper addresses the problem of generating lifelike holistic co-speech\nmotions for 3D avatars, focusing on two key aspects: variability and\ncoordination. Variability allows the avatar to exhibit a wide range of motions\neven with similar speech content, while coordination ensures a harmonious\nalignment among facial expressions, hand gestures, and body poses. We aim to\nachieve both with ProbTalk, a unified probabilistic framework designed to\njointly model facial, hand, and body movements in speech. ProbTalk builds on\nthe variational autoencoder (VAE) architecture and incorporates three core\ndesigns. 
First, we introduce product quantization (PQ) to the VAE, which\nenriches the representation of complex holistic motion. Second, we devise a\nnovel non-autoregressive model that embeds 2D positional encoding into the\nproduct-quantized representation, thereby preserving essential structure\ninformation of the PQ codes. Last, we employ a secondary stage to refine the\npreliminary prediction, further sharpening the high-frequency details. Coupling\nthese three designs enables ProbTalk to generate natural and diverse holistic\nco-speech motions, outperforming several state-of-the-art methods in\nqualitative and quantitative evaluations, particularly in terms of realism. Our\ncode and model will be released for research purposes at\nhttps://feifeifeiliu.github.io/probtalk/.\n","authors":["Yifei Liu","Qiong Cao","Yandong Wen","Huaiguang Jiang","Changxing Ding"],"pdf_url":"https://arxiv.org/pdf/2404.00368v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00366v1","updated":"2024-03-30T13:38:07Z","published":"2024-03-30T13:38:07Z","title":"Efficient Multi-branch Segmentation Network for Situation Awareness in\n Autonomous Navigation","summary":" Real-time and high-precision situational awareness technology is critical for\nautonomous navigation of unmanned surface vehicles (USVs). In particular,\nrobust and fast obstacle semantic segmentation methods are essential. However,\ndistinguishing between the sea and the sky is challenging due to the\ndifferences between port and maritime environments. In this study, we built a\ndataset that captured perspectives from USVs and unmanned aerial vehicles in a\nmaritime port environment and analysed the data features. Statistical analysis\nrevealed a high correlation between the distribution of the sea and sky and row\npositional information. Based on this finding, a three-branch semantic\nsegmentation network with a row position encoding module (RPEM) was proposed to\nimprove the prediction accuracy between the sea and the sky. The proposed RPEM\nhighlights the effect of row coordinates on feature extraction. Compared to the\nbaseline, the three-branch network with RPEM significantly improved the ability\nto distinguish between the sea and the sky without significantly reducing the\ncomputational speed.\n","authors":["Guan-Cheng Zhou","Chen Chengb","Yan-zhou Chena"],"pdf_url":"https://arxiv.org/pdf/2404.00366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00362v1","updated":"2024-03-30T13:28:53Z","published":"2024-03-30T13:28:53Z","title":"STBA: Towards Evaluating the Robustness of DNNs for Query-Limited\n Black-box Scenario","summary":" Many attack techniques have been proposed to explore the vulnerability of\nDNNs and further help to improve their robustness. Despite the significant\nprogress made recently, existing black-box attack methods still suffer from\nunsatisfactory performance due to the vast number of queries needed to optimize\ndesired perturbations. Besides, the other critical challenge is that\nadversarial examples built in a noise-adding manner are abnormal and struggle\nto successfully attack robust models, whose robustness is enhanced by\nadversarial training against small perturbations. There is no doubt that these\ntwo issues mentioned above will significantly increase the risk of exposure and\nresult in a failure to dig deeply into the vulnerability of DNNs. Hence, it is\nnecessary to evaluate DNNs' fragility sufficiently under query-limited settings\nin a non-additional way. 
In this paper, we propose the Spatial Transform\nBlack-box Attack (STBA), a novel framework to craft formidable adversarial\nexamples in the query-limited scenario. Specifically, STBA introduces a flow\nfield to the high-frequency part of clean images to generate adversarial\nexamples and adopts the following two processes to enhance their naturalness\nand significantly improve the query efficiency: a) we apply an estimated flow\nfield to the high-frequency part of clean images to generate adversarial\nexamples instead of introducing external noise to the benign image, and b) we\nleverage an efficient gradient estimation method based on a batch of samples to\noptimize such an ideal flow field under query-limited settings. Compared to\nexisting score-based black-box baselines, extensive experiments indicated that\nSTBA could effectively improve the imperceptibility of the adversarial examples\nand remarkably boost the attack success rate under query-limited settings.\n","authors":["Renyang Liu","Kwok-Yan Lam","Wei Zhou","Sixing Wu","Jun Zhao","Dongting Hu","Mingming Gong"],"pdf_url":"https://arxiv.org/pdf/2404.00362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00360v1","updated":"2024-03-30T13:24:58Z","published":"2024-03-30T13:24:58Z","title":"Reusable Architecture Growth for Continual Stereo Matching","summary":" The remarkable performance of recent stereo depth estimation models benefits\nfrom the successful use of convolutional neural networks to regress dense\ndisparity. Akin to most tasks, this needs gathering training data that covers a\nnumber of heterogeneous scenes at deployment time. However, training samples\nare typically acquired continuously in practical applications, making the\ncapability to learn new scenes continually even more crucial. For this purpose,\nwe propose to perform continual stereo matching where a model is tasked to 1)\ncontinually learn new scenes, 2) overcome forgetting previously learned scenes,\nand 3) continuously predict disparities at inference. We achieve this goal by\nintroducing a Reusable Architecture Growth (RAG) framework. RAG leverages\ntask-specific neural unit search and architecture growth to learn new scenes\ncontinually in both supervised and self-supervised manners. It can maintain\nhigh reusability during growth by reusing previous units while obtaining good\nperformance. Additionally, we present a Scene Router module to adaptively\nselect the scene-specific architecture path at inference. Comprehensive\nexperiments on numerous datasets show that our framework performs impressively\nin various weather, road, and city circumstances and surpasses the\nstate-of-the-art methods in more challenging cross-dataset settings. 
Further\nexperiments also demonstrate the adaptability of our method to unseen scenes,\nwhich can facilitate end-to-end stereo architecture learning and practical\ndeployment.\n","authors":["Chenghao Zhang","Gaofeng Meng","Bin Fan","Kun Tian","Zhaoxiang Zhang","Shiming Xiang","Chunhong Pan"],"pdf_url":"https://arxiv.org/pdf/2404.00360v1.pdf","comment":"Extended version of CVPR 2022 paper \"Continual Stereo Matching of\n Continuous Driving Scenes with Growing Architecture\" - Accepted to TPAMI in\n 2024"},{"id":"http://arxiv.org/abs/2404.00358v1","updated":"2024-03-30T13:20:04Z","published":"2024-03-30T13:20:04Z","title":"Spread Your Wings: A Radial Strip Transformer for Image Deblurring","summary":" Exploring motion information is important for the motion deblurring task.\nRecent the window-based transformer approaches have achieved decent performance\nin image deblurring. Note that the motion causing blurry results is usually\ncomposed of translation and rotation movements and the window-shift operation\nin the Cartesian coordinate system by the window-based transformer approaches\nonly directly explores translation motion in orthogonal directions. Thus, these\nmethods have the limitation of modeling the rotation part. To alleviate this\nproblem, we introduce the polar coordinate-based transformer, which has the\nangles and distance to explore rotation motion and translation information\ntogether. In this paper, we propose a Radial Strip Transformer (RST), which is\na transformer-based architecture that restores the blur images in a polar\ncoordinate system instead of a Cartesian one. RST contains a dynamic radial\nembedding module (DRE) to extract the shallow feature by a radial deformable\nconvolution. We design a polar mask layer to generate the offsets for the\ndeformable convolution, which can reshape the convolution kernel along the\nradius to better capture the rotation motion information. Furthermore, we\nproposed a radial strip attention solver (RSAS) as deep feature extraction,\nwhere the relationship of windows is organized by azimuth and radius. This\nattention module contains radial strip windows to reweight image features in\nthe polar coordinate, which preserves more useful information in rotation and\ntranslation motion together for better recovering the sharp images.\nExperimental results on six synthesis and real-world datasets prove that our\nmethod performs favorably against other SOTA methods for the image deblurring\ntask.\n","authors":["Duosheng Chen","Shihao Zhou","Jinshan Pan","Jinglei Shi","Lishen Qu","Jufeng Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00351v1","updated":"2024-03-30T13:04:46Z","published":"2024-03-30T13:04:46Z","title":"Rethinking Attention-Based Multiple Instance Learning for Whole-Slide\n Pathological Image Classification: An Instance Attribute Viewpoint","summary":" Multiple instance learning (MIL) is a robust paradigm for whole-slide\npathological image (WSI) analysis, processing gigapixel-resolution images with\nslide-level labels. As pioneering efforts, attention-based MIL (ABMIL) and its\nvariants are increasingly becoming popular due to the characteristics of\nsimultaneously handling clinical diagnosis and tumor localization. However, the\nattention mechanism exhibits limitations in discriminating between instances,\nwhich often misclassifies tissues and potentially impairs MIL performance. 
This\npaper proposes an Attribute-Driven MIL (AttriMIL) framework to address these\nissues. Concretely, we dissect the calculation process of ABMIL and present an\nattribute scoring mechanism that measures the contribution of each instance to\nbag prediction effectively, quantifying instance attributes. Based on attribute\nquantification, we develop a spatial attribute constraint and an attribute\nranking constraint to model instance correlations within and across slides,\nrespectively. These constraints encourage the network to capture the spatial\ncorrelation and semantic similarity of instances, improving the ability of\nAttriMIL to distinguish tissue types and identify challenging instances.\nAdditionally, AttriMIL employs a histopathology adaptive backbone that\nmaximizes the pre-trained model's feature extraction capability for collecting\npathological features. Extensive experiments on three public benchmarks\ndemonstrate that our AttriMIL outperforms existing state-of-the-art frameworks\nacross multiple evaluation metrics. The implementation code is available at\nhttps://github.com/MedCAI/AttriMIL.\n","authors":["Linghan Cai","Shenjin Huang","Ye Zhang","Jinpeng Lu","Yongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.00351v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.00349v1","updated":"2024-03-30T12:55:19Z","published":"2024-03-30T12:55:19Z","title":"SGDFormer: One-stage Transformer-based Architecture for Cross-Spectral\n Stereo Image Guided Denoising","summary":" Cross-spectral image guided denoising has shown its great potential in\nrecovering clean images with rich details, such as using the near-infrared\nimage to guide the denoising process of the visible one. To obtain such image\npairs, a feasible and economical way is to employ a stereo system, which is\nwidely used on mobile devices. Current works attempt to generate an aligned\nguidance image to handle the disparity between two images. However, due to\nocclusion, spectral differences and noise degradation, the aligned guidance\nimage generally exists ghosting and artifacts, leading to an unsatisfactory\ndenoised result. To address this issue, we propose a one-stage\ntransformer-based architecture, named SGDFormer, for cross-spectral Stereo\nimage Guided Denoising. The architecture integrates the correspondence modeling\nand feature fusion of stereo images into a unified network. Our transformer\nblock contains a noise-robust cross-attention (NRCA) module and a spatially\nvariant feature fusion (SVFF) module. The NRCA module captures the long-range\ncorrespondence of two images in a coarse-to-fine manner to alleviate the\ninterference of noise. 
The SVFF module further enhances salient structures and\nsuppresses harmful artifacts through dynamically selecting useful information.\nThanks to the above design, our SGDFormer can restore artifact-free images with\nfine structures, and achieves state-of-the-art performance on various datasets.\nAdditionally, our SGDFormer can be extended to handle other unaligned\ncross-model guided restoration tasks such as guided depth super-resolution.\n","authors":["Runmin Zhang","Zhu Yu","Zehua Sheng","Jiacheng Ying","Si-Yuan Cao","Shu-Jie Chen","Bailin Yang","Junwei Li","Hui-Liang Shen"],"pdf_url":"https://arxiv.org/pdf/2404.00349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00345v1","updated":"2024-03-30T12:50:25Z","published":"2024-03-30T12:50:25Z","title":"MaGRITTe: Manipulative and Generative 3D Realization from Image, Topview\n and Text","summary":" The generation of 3D scenes from user-specified conditions offers a promising\navenue for alleviating the production burden in 3D applications. Previous\nstudies required significant effort to realize the desired scene, owing to\nlimited control conditions. We propose a method for controlling and generating\n3D scenes under multimodal conditions using partial images, layout information\nrepresented in the top view, and text prompts. Combining these conditions to\ngenerate a 3D scene involves the following significant difficulties: (1) the\ncreation of large datasets, (2) reflection on the interaction of multimodal\nconditions, and (3) domain dependence of the layout conditions. We decompose\nthe process of 3D scene generation into 2D image generation from the given\nconditions and 3D scene generation from 2D images. 2D image generation is\nachieved by fine-tuning a pretrained text-to-image model with a small\nartificial dataset of partial images and layouts, and 3D scene generation is\nachieved by layout-conditioned depth estimation and neural radiance fields\n(NeRF), thereby avoiding the creation of large datasets. The use of a common\nrepresentation of spatial information using 360-degree images allows for the\nconsideration of multimodal condition interactions and reduces the domain\ndependence of the layout control. The experimental results qualitatively and\nquantitatively demonstrated that the proposed method can generate 3D scenes in\ndiverse domains, from indoor to outdoor, according to multimodal conditions.\n","authors":["Takayuki Hara","Tatsuya Harada"],"pdf_url":"https://arxiv.org/pdf/2404.00345v1.pdf","comment":"Project Page: https://hara012.github.io/MaGRITTe-project"},{"id":"http://arxiv.org/abs/2404.00335v1","updated":"2024-03-30T12:10:34Z","published":"2024-03-30T12:10:34Z","title":"Learing Trimaps via Clicks for Image Matting","summary":" Despite significant advancements in image matting, existing models heavily\ndepend on manually-drawn trimaps for accurate results in natural image\nscenarios. However, the process of obtaining trimaps is time-consuming, lacking\nuser-friendliness and device compatibility. This reliance greatly limits the\npractical application of all trimap-based matting methods. To address this\nissue, we introduce Click2Trimap, an interactive model capable of predicting\nhigh-quality trimaps and alpha mattes with minimal user click inputs. Through\nanalyzing real users' behavioral logic and characteristics of trimaps, we\nsuccessfully propose a powerful iterative three-class training strategy and a\ndedicated simulation function, making Click2Trimap exhibit versatility across\nvarious scenarios. 
Quantitative and qualitative assessments on synthetic and\nreal-world matting datasets demonstrate Click2Trimap's superior performance\ncompared to all existing trimap-free matting methods. Notably, in the user\nstudy, Click2Trimap achieves high-quality trimap and matting predictions in\njust an average of 5 seconds per image, demonstrating its substantial practical\nvalue in real-world applications.\n","authors":["Chenyi Zhang","Yihan Hu","Henghui Ding","Humphrey Shi","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2404.00335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00330v1","updated":"2024-03-30T12:01:04Z","published":"2024-03-30T12:01:04Z","title":"Memory-Scalable and Simplified Functional Map Learning","summary":" Deep functional maps have emerged in recent years as a prominent\nlearning-based framework for non-rigid shape matching problems. While early\nmethods in this domain only focused on learning in the functional domain, the\nlatest techniques have demonstrated that promoting consistency between\nfunctional and pointwise maps leads to significant improvements in accuracy.\nUnfortunately, existing approaches rely heavily on the computation of large\ndense matrices arising from soft pointwise maps, which compromises their\nefficiency and scalability. To address this limitation, we introduce a novel\nmemory-scalable and efficient functional map learning pipeline. By leveraging\nthe specific structure of functional maps, we offer the possibility to achieve\nidentical results without ever storing the pointwise map in memory.\nFurthermore, based on the same approach, we present a differentiable map\nrefinement layer adapted from an existing axiomatic refinement algorithm.\nUnlike many functional map learning methods, which use this algorithm as a\npost-processing step, ours can be easily used at train time, enabling us to\nenforce consistency between the refined and initial versions of the map. Our\nresulting approach is simpler, more efficient, and more numerically stable,\nas it avoids differentiation through a linear system, while achieving close to\nstate-of-the-art results in challenging scenarios.\n","authors":["Robin Magnet","Maks Ovsjanikov"],"pdf_url":"https://arxiv.org/pdf/2404.00330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00327v1","updated":"2024-03-30T11:41:19Z","published":"2024-03-30T11:41:19Z","title":"YNetr: Dual-Encoder architecture on Plain Scan Liver Tumors (PSLT)","summary":" Background: Liver tumors are abnormal growths in the liver that can be either\nbenign or malignant, with liver cancer being a significant health concern\nworldwide. However, there is no dataset for plain scan segmentation of liver\ntumors, nor any related algorithms. To fill this gap, we propose Plain Scan\nLiver Tumors (PSLT) and YNetr. Methods: A collection of 40 liver tumor plain\nscan segmentation datasets was assembled and annotated. Concurrently, we\nutilized the Dice coefficient as the metric for assessing the segmentation outcomes\nproduced by YNetr, which has the advantage of capturing different frequency\ninformation. Results: The YNetr model achieved a Dice coefficient of 62.63% on\nthe PSLT dataset, surpassing the other publicly available model by an accuracy\nmargin of 1.22%. Comparative evaluations were conducted against a range of\nmodels including UNet 3+, XNet, UNetr, Swin UNetr, Trans-BTS, COTr, nnUNetv2\n(2D), nnUNetv2 (3D fullres), MedNext (2D) and MedNext (3D fullres). 
Conclusions:\nWe not only proposed a dataset named PSLT (Plain Scan Liver Tumors), but also\nexplored a structure called YNetr that utilizes the wavelet transform to extract\ndifferent frequency information and achieves state-of-the-art results on PSLT in our experiments.\n","authors":["Wen Sheng","Zhong Zheng","Jiajun Liu","Han Lu","Hanyuan Zhang","Zhengyong Jiang","Zhihong Zhang","Daoping Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.00327v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2404.00323v1","updated":"2024-03-30T11:28:05Z","published":"2024-03-30T11:28:05Z","title":"CLIP-driven Outliers Synthesis for few-shot OOD detection","summary":" Few-shot OOD detection focuses on recognizing out-of-distribution (OOD)\nimages that belong to classes unseen during training, with the use of only a\nsmall number of labeled in-distribution (ID) images. Up to now, a mainstream\nstrategy is based on large-scale vision-language models, such as CLIP. However,\nthese methods overlook a crucial issue: the lack of reliable OOD supervision\ninformation, which can lead to biased boundaries between in-distribution (ID)\nand OOD. To tackle this problem, we propose CLIP-driven Outliers\nSynthesis~(CLIP-OS). Firstly, CLIP-OS enhances patch-level features' perception\nby a newly proposed patch uniform convolution, and adaptively obtains the\nproportion of ID-relevant information by employing CLIP-surgery-discrepancy,\nthus achieving separation between ID-relevant and ID-irrelevant information. Next, CLIP-OS\nsynthesizes reliable OOD data by mixing up ID-relevant features from different\nclasses to provide OOD supervision information. Afterward, CLIP-OS leverages\nsynthetic OOD samples by unknown-aware prompt learning to enhance the\nseparability of ID and OOD. Extensive experiments across multiple benchmarks\ndemonstrate that CLIP-OS achieves superior few-shot OOD detection capability.\n","authors":["Hao Sun","Rundong He","Zhongyi Han","Zhicong Lin","Yongshun Gong","Yilong Yin"],"pdf_url":"https://arxiv.org/pdf/2404.00323v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.00322v1","updated":"2024-03-30T11:21:11Z","published":"2024-03-30T11:21:11Z","title":"Instrument-tissue Interaction Detection Framework for Surgical Video\n Understanding","summary":" The instrument-tissue interaction detection task, which helps understand surgical\nactivities, is vital for constructing computer-assisted surgery systems but\nposes many challenges. Firstly, most models represent instrument-tissue\ninteraction in a coarse-grained way which only focuses on classification and\nlacks the ability to automatically detect instruments and tissues. Secondly,\nexisting works do not fully consider the intra- and inter-frame relations\nof instruments and tissues. In this paper, we propose to represent\ninstrument-tissue interaction as a quintuple and present an\nInstrument-Tissue Interaction Detection Network (ITIDNet) to detect the\nquintuple for surgical video understanding. Specifically, we propose a Snippet\nConsecutive Feature (SCF) Layer to enhance features by modeling relationships\nof proposals in the current frame using global context information in the video\nsnippet. We also propose a Spatial Corresponding Attention (SCA) Layer to\nincorporate features of proposals between adjacent frames through spatial\nencoding. 
To reason relationships between instruments and tissues, a Temporal\nGraph (TG) Layer is proposed with intra-frame connections to exploit\nrelationships between instruments and tissues in the same frame and inter-frame\nconnections to model the temporal information for the same instance. For\nevaluation, we build a cataract surgery video (PhacoQ) dataset and a\ncholecystectomy surgery video (CholecQ) dataset. Experimental results\ndemonstrate the promising performance of our model, which outperforms other\nstate-of-the-art models on both datasets.\n","authors":["Wenjun Lin","Yan Hu","Huazhu Fu","Mingming Yang","Chin-Boon Chng","Ryo Kawasaki","Cheekong Chui","Jiang Liu"],"pdf_url":"https://arxiv.org/pdf/2404.00322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00318v1","updated":"2024-03-30T10:54:59Z","published":"2024-03-30T10:54:59Z","title":"Exploring Unseen Environments with Robots using Large Language and\n Vision Models through a Procedurally Generated 3D Scene Representation","summary":" Recent advancements in Generative Artificial Intelligence, particularly in\nthe realm of Large Language Models (LLMs) and Large Vision Language Models\n(LVLMs), have enabled the prospect of leveraging cognitive planners within\nrobotic systems. This work focuses on solving the object goal navigation\nproblem by mimicking human cognition to attend, perceive and store task\nspecific information and generate plans with the same. We introduce a\ncomprehensive framework capable of exploring an unfamiliar environment in\nsearch of an object by leveraging the capabilities of Large Language\nModels(LLMs) and Large Vision Language Models (LVLMs) in understanding the\nunderlying semantics of our world. A challenging task in using LLMs to generate\nhigh level sub-goals is to efficiently represent the environment around the\nrobot. We propose to use a 3D scene modular representation, with semantically\nrich descriptions of the object, to provide the LLM with task relevant\ninformation. But providing the LLM with a mass of contextual information (rich\n3D scene semantic representation), can lead to redundant and inefficient plans.\nWe propose to use an LLM based pruner that leverages the capabilities of\nin-context learning to prune out irrelevant goal specific information.\n","authors":["Arjun P S","Andrew Melnik","Gora Chand Nandi"],"pdf_url":"https://arxiv.org/pdf/2404.00318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00313v1","updated":"2024-03-30T10:37:56Z","published":"2024-03-30T10:37:56Z","title":"Harmonizing Light and Darkness: A Symphony of Prior-guided Data\n Synthesis and Adaptive Focus for Nighttime Flare Removal","summary":" Intense light sources often produce flares in captured images at night, which\ndeteriorates the visual quality and negatively affects downstream applications.\nIn order to train an effective flare removal network, a reliable dataset is\nessential. The mainstream flare removal datasets are semi-synthetic to reduce\nhuman labour, but these datasets do not cover typical scenarios involving\nmultiple scattering flares. To tackle this issue, we synthesize a prior-guided\ndataset named Flare7K*, which contains multi-flare images where the brightness\nof flares adheres to the laws of illumination. Besides, flares tend to occupy\nlocalized regions of the image but existing networks perform flare removal on\nthe entire image and sometimes modify clean areas incorrectly. 
Therefore, we\npropose a plug-and-play Adaptive Focus Module (AFM) that can adaptively mask\nthe clean background areas and assist models in focusing on the regions\nseverely affected by flares. Extensive experiments demonstrate that our data\nsynthesis method can better simulate real-world scenes and several models\nequipped with AFM achieve state-of-the-art performance on the real-world test\ndataset.\n","authors":["Lishen Qu","Shihao Zhou","Jinshan Pan","Jinglei Shi","Duosheng Chen","Jufeng Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00312v1","updated":"2024-03-30T10:25:28Z","published":"2024-03-30T10:25:28Z","title":"Bayesian Exploration of Pre-trained Models for Low-shot Image\n Classification","summary":" Low-shot image classification is a fundamental task in computer vision, and\nthe emergence of large-scale vision-language models such as CLIP has greatly\nadvanced the forefront of research in this field. However, most existing\nCLIP-based methods lack the flexibility to effectively incorporate other\npre-trained models that encompass knowledge distinct from CLIP. To bridge the\ngap, this work proposes a simple and effective probabilistic model ensemble\nframework based on Gaussian processes, which have previously demonstrated\nremarkable efficacy in processing small data. We achieve the integration of\nprior knowledge by specifying the mean function with CLIP and the kernel\nfunction with an ensemble of deep kernels built upon various pre-trained\nmodels. By regressing the classification label directly, our framework enables\nanalytical inference, straightforward uncertainty quantification, and\nprincipled hyper-parameter tuning. Through extensive experiments on standard\nbenchmarks, we demonstrate that our method consistently outperforms competitive\nensemble baselines regarding predictive performance. Additionally, we assess\nthe robustness of our method and the quality of the yielded uncertainty\nestimates on out-of-distribution datasets. We also illustrate that our method,\ndespite relying on label regression, still enjoys superior model calibration\ncompared to most deterministic baselines.\n","authors":["Yibo Miao","Yu Lei","Feng Zhou","Zhijie Deng"],"pdf_url":"https://arxiv.org/pdf/2404.00312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00308v1","updated":"2024-03-30T10:11:26Z","published":"2024-03-30T10:11:26Z","title":"ST-LLM: Large Language Models Are Effective Temporal Learners","summary":" Large Language Models (LLMs) have showcased impressive capabilities in text\ncomprehension and generation, prompting research efforts towards video LLMs to\nfacilitate human-AI interaction at the video level. However, how to effectively\nencode and understand videos in video-based dialogue systems remains to be\nsolved. In this paper, we investigate a straightforward yet unexplored\nquestion: Can we feed all spatial-temporal tokens into the LLM, thus delegating\nthe task of video sequence modeling to the LLMs? Surprisingly, this simple\napproach yields significant improvements in video understanding. Based upon\nthis, we propose ST-LLM, an effective video-LLM baseline with Spatial-Temporal\nsequence modeling inside LLM. Furthermore, to address the overhead and\nstability issues introduced by uncompressed video tokens within LLMs, we\ndevelop a dynamic masking strategy with tailor-made training objectives. 
For\nparticularly long videos, we have also designed a global-local input module to\nbalance efficiency and effectiveness. Consequently, we harness LLM for\nproficient spatial-temporal modeling, while upholding efficiency and stability.\nExtensive experimental results attest to the effectiveness of our method.\nThrough a more concise model and training pipeline, ST-LLM establishes a new\nstate-of-the-art result on VideoChatGPT-Bench and MVBench. Codes have been\navailable at https://github.com/TencentARC/ST-LLM.\n","authors":["Ruyang Liu","Chen Li","Haoran Tang","Yixiao Ge","Ying Shan","Ge Li"],"pdf_url":"https://arxiv.org/pdf/2404.00308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00301v1","updated":"2024-03-30T09:43:40Z","published":"2024-03-30T09:43:40Z","title":"Monocular Identity-Conditioned Facial Reflectance Reconstruction","summary":" Recent 3D face reconstruction methods have made remarkable advancements, yet\nthere remain huge challenges in monocular high-quality facial reflectance\nreconstruction. Existing methods rely on a large amount of light-stage captured\ndata to learn facial reflectance models. However, the lack of subject diversity\nposes challenges in achieving good generalization and widespread applicability.\nIn this paper, we learn the reflectance prior in image space rather than UV\nspace and present a framework named ID2Reflectance. Our framework can directly\nestimate the reflectance maps of a single image while using limited reflectance\ndata for training. Our key insight is that reflectance data shares facial\nstructures with RGB faces, which enables obtaining expressive facial prior from\ninexpensive RGB data thus reducing the dependency on reflectance data. We first\nlearn a high-quality prior for facial reflectance. Specifically, we pretrain\nmulti-domain facial feature codebooks and design a codebook fusion method to\nalign the reflectance and RGB domains. Then, we propose an identity-conditioned\nswapping module that injects facial identity from the target image into the\npre-trained autoencoder to modify the identity of the source reflectance image.\nFinally, we stitch multi-view swapped reflectance images to obtain renderable\nassets. Extensive experiments demonstrate that our method exhibits excellent\ngeneralization capability and achieves state-of-the-art facial reflectance\nreconstruction results for in-the-wild faces. Our project page is\nhttps://xingyuren.github.io/id2reflectance/.\n","authors":["Xingyu Ren","Jiankang Deng","Yuhao Cheng","Jia Guo","Chao Ma","Yichao Yan","Wenhan Zhu","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00301v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00292v1","updated":"2024-03-30T08:51:23Z","published":"2024-03-30T08:51:23Z","title":"LAKE-RED: Camouflaged Images Generation by Latent Background Knowledge\n Retrieval-Augmented Diffusion","summary":" Camouflaged vision perception is an important vision task with numerous\npractical applications. Due to the expensive collection and labeling costs,\nthis community struggles with a major bottleneck that the species category of\nits datasets is limited to a small number of object species. However, the\nexisting camouflaged generation methods require specifying the background\nmanually, thus failing to extend the camouflaged sample diversity in a low-cost\nmanner. In this paper, we propose a Latent Background Knowledge\nRetrieval-Augmented Diffusion (LAKE-RED) for camouflaged image generation. 
To\nour knowledge, our contributions mainly include: (1) For the first time, we\npropose a camouflaged generation paradigm that does not need to receive any\nbackground inputs. (2) Our LAKE-RED is the first knowledge retrieval-augmented\nmethod with interpretability for camouflaged generation, in which we propose the\nidea of explicitly separating knowledge retrieval and reasoning enhancement\nto alleviate the task-specific challenges. Moreover, our method is\nnot restricted to specific foreground targets or backgrounds, offering the\npotential to extend camouflaged vision perception to more diverse domains.\n(3) Experimental results demonstrate that our method outperforms the existing\napproaches, generating more realistic camouflage images.\n","authors":["Pancheng Zhao","Peng Xu","Pengda Qin","Deng-Ping Fan","Zhicheng Zhang","Guoli Jia","Bowen Zhou","Jufeng Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00292v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00288v1","updated":"2024-03-30T08:42:34Z","published":"2024-03-30T08:42:34Z","title":"Seeing the Unseen: A Frequency Prompt Guided Transformer for Image\n Restoration","summary":" Exploring useful features from images as prompts to guide deep image\nrestoration models is an effective way to solve image restoration. In contrast\nto mining spatial relations within images as prompts, which neglects the\ncharacteristics of different frequencies and leaves subtle or undetectable\nartifacts in the restored image, we develop a Frequency\nPrompting image restoration method, dubbed FPro, which can effectively provide\nprompt components from a frequency perspective to guide the restoration model\nin addressing these differences. Specifically, we first decompose input features into\nseparate frequency parts via dynamically learned filters, where we introduce a\ngating mechanism for suppressing the less informative elements within the\nkernels. To propagate useful frequency information as prompts, we then propose a\ndual prompt block, consisting of a low-frequency prompt modulator (LPM) and a\nhigh-frequency prompt modulator (HPM), to handle signals from different bands\nrespectively. Each modulator contains a generation process to incorporate\nprompting components into the extracted frequency maps, and a modulation part\nthat modifies the prompt feature with the guidance of the decoder features.\nExperimental results on commonly used benchmarks have demonstrated the\nfavorable performance of our pipeline against SOTA methods on 5 image\nrestoration tasks, including deraining, deraindrop, demoir\\'eing, deblurring,\nand dehazing. The source code and pre-trained models will be available at\nhttps://github.com/joshyZhou/FPro.\n","authors":["Shihao Zhou","Jinshan Pan","Jinglei Shi","Duosheng Chen","Lishen Qu","Jufeng Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00288v1.pdf","comment":"18 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.00285v1","updated":"2024-03-30T08:37:19Z","published":"2024-03-30T08:37:19Z","title":"Long-Tailed Recognition on Binary Networks by Calibrating A Pre-trained\n Model","summary":" Deploying deep models in real-world scenarios entails a number of challenges,\nincluding computational efficiency and real-world (e.g., long-tailed) data\ndistributions. We address the combined challenge of learning long-tailed\ndistributions using highly resource-efficient binary neural networks as\nbackbones. 
Specifically, we propose a calibrate-and-distill framework that uses\noff-the-shelf pretrained full-precision models trained on balanced datasets to\nuse as teachers for distillation when learning binary networks on long-tailed\ndatasets. To better generalize to various datasets, we further propose a novel\nadversarial balancing among the terms in the objective function and an\nefficient multiresolution learning scheme. We conducted the largest empirical\nstudy in the literature using 15 datasets, including newly derived long-tailed\ndatasets from existing balanced datasets, and show that our proposed method\noutperforms prior art by large margins (>14.33% on average).\n","authors":["Jihun Kim","Dahyun Kim","Hyungrok Jung","Taeil Oh","Jonghyun Choi"],"pdf_url":"https://arxiv.org/pdf/2404.00285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00279v1","updated":"2024-03-30T08:05:00Z","published":"2024-03-30T08:05:00Z","title":"Look-Around Before You Leap: High-Frequency Injected Transformer for\n Image Restoration","summary":" Transformer-based approaches have achieved superior performance in image\nrestoration, since they can model long-term dependencies well. However, the\nlimitation in capturing local information restricts their capacity to remove\ndegradations. While existing approaches attempt to mitigate this issue by\nincorporating convolutional operations, the core component in Transformer,\ni.e., self-attention, which serves as a low-pass filter, could unintentionally\ndilute or even eliminate the acquired local patterns. In this paper, we propose\nHIT, a simple yet effective High-frequency Injected Transformer for image\nrestoration. Specifically, we design a window-wise injection module (WIM),\nwhich incorporates abundant high-frequency details into the feature map, to\nprovide reliable references for restoring high-quality images. We also develop\na bidirectional interaction module (BIM) to aggregate features at different\nscales using a mutually reinforced paradigm, resulting in spatially and\ncontextually improved representations. In addition, we introduce a spatial\nenhancement unit (SEU) to preserve essential spatial relationships that may be\nlost due to the computations carried out across channel dimensions in the BIM.\nExtensive experiments on 9 tasks (real noise, real rain streak, raindrop,\nmotion blur, moir\\'e, shadow, snow, haze, and low-light condition) demonstrate\nthat HIT with linear computational complexity performs favorably against the\nstate-of-the-art methods. The source code and pre-trained models will be\navailable at https://github.com/joshyZhou/HIT.\n","authors":["Shihao Zhou","Duosheng Chen","Jinshan Pan","Jufeng Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00279v1.pdf","comment":"19 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.00272v1","updated":"2024-03-30T07:27:36Z","published":"2024-03-30T07:27:36Z","title":"HSIMamba: Hyperpsectral Imaging Efficient Feature Learning with\n Bidirectional State Space for Classification","summary":" Classifying hyperspectral images is a difficult task in remote sensing, due\nto their complex high-dimensional data. To address this challenge, we propose\nHSIMamba, a novel framework that uses bidirectional reversed convolutional\nneural network pathways to extract spectral features more efficiently.\nAdditionally, it incorporates a specialized block for spatial analysis. 
Our\napproach combines the operational efficiency of CNNs with the dynamic feature\nextraction capability of attention mechanisms found in Transformers, while\navoiding the associated high computational demands. HSIMamba is designed to\nprocess data bidirectionally, significantly enhancing the extraction of\nspectral features and integrating them with spatial information for\ncomprehensive analysis. This approach improves classification accuracy beyond\ncurrent benchmarks and addresses computational inefficiencies encountered with\nadvanced models like Transformers. HSIMamba was tested on three widely\nrecognized datasets (Houston 2013, Indian Pines, and Pavia University) and\ndemonstrated exceptional performance, surpassing existing state-of-the-art\nmodels in HSI classification. This method highlights the methodological\ninnovation of HSIMamba and its practical implications, which are particularly\nvaluable in contexts where computational resources are limited. HSIMamba\nredefines the standards of efficiency and accuracy in HSI classification,\nthereby enhancing the capabilities of remote sensing applications.\nHyperspectral imaging has become a crucial tool for environmental surveillance,\nagriculture, and other critical areas that require detailed analysis of the\nEarth's surface. Please see our code in HSIMamba for more details.\n","authors":["Judy X Yang","Jun Zhou","Jing Wang","Hui Tian","Alan Wee Chung Liew"],"pdf_url":"https://arxiv.org/pdf/2404.00272v1.pdf","comment":"11 pages, 2 figures, 8 tables"},{"id":"http://arxiv.org/abs/2404.00269v1","updated":"2024-03-30T07:17:37Z","published":"2024-03-30T07:17:37Z","title":"IPoD: Implicit Field Learning with Point Diffusion for Generalizable 3D\n Object Reconstruction from Single RGB-D Images","summary":" Generalizable 3D object reconstruction from single-view RGB-D images remains\na challenging task, particularly with real-world data. Current state-of-the-art\nmethods develop Transformer-based implicit field learning, necessitating an\nintensive learning paradigm that requires dense query-supervision uniformly\nsampled throughout the entire space. We propose a novel approach, IPoD, which\nharmonizes implicit field learning with point diffusion. This approach treats\nthe query points for implicit field learning as a noisy point cloud for\niterative denoising, allowing for their dynamic adaptation to the target object\nshape. Such adaptive query points harness diffusion learning's capability for\ncoarse shape recovery and also enhance the implicit representation's ability\nto delineate finer details. Furthermore, an additional self-conditioning mechanism\nis designed to use implicit predictions as the guidance of diffusion learning,\nleading to a cooperative system. Experiments conducted on the CO3D-v2 dataset\naffirm the superiority of IPoD, achieving a 7.8% improvement in F-score and 28.6%\nin Chamfer distance over existing methods. The generalizability of IPoD is also\ndemonstrated on the MVImgNet dataset. 
Our project page is at\nhttps://yushuang-wu.github.io/IPoD.\n","authors":["Yushuang Wu","Luyue Shi","Junhao Cai","Weihao Yuan","Lingteng Qiu","Zilong Dong","Liefeng Bo","Shuguang Cui","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2404.00269v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00262v1","updated":"2024-03-30T06:29:59Z","published":"2024-03-30T06:29:59Z","title":"Image-to-Image Matching via Foundation Models: A New Perspective for\n Open-Vocabulary Semantic Segmentation","summary":" Open-vocabulary semantic segmentation (OVS) aims to segment images of\narbitrary categories specified by class labels or captions. However, most\nprevious best-performing methods, whether pixel grouping methods or region\nrecognition methods, suffer from false matches between image features and\ncategory labels. We attribute this to the natural gap between the textual\nfeatures and visual features. In this work, we rethink how to mitigate false\nmatches from the perspective of image-to-image matching and propose a novel\nrelation-aware intra-modal matching (RIM) framework for OVS based on visual\nfoundation models. RIM achieves robust region classification by firstly\nconstructing diverse image-modal reference features and then matching them with\nregion features based on relation-aware ranking distribution. The proposed RIM\nenjoys several merits. First, the intra-modal reference features are better\naligned, circumventing potential ambiguities that may arise in cross-modal\nmatching. Second, the ranking-based matching process harnesses the structure\ninformation implicit in the inter-class relationships, making it more robust\nthan comparing individually. Extensive experiments on three benchmarks\ndemonstrate that RIM outperforms previous state-of-the-art methods by large\nmargins, obtaining a lead of more than 10% in mIoU on PASCAL VOC benchmark.\n","authors":["Yuan Wang","Rui Sun","Naisong Luo","Yuwen Pan","Tianzhu Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.00262v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2404.00260v1","updated":"2024-03-30T06:18:50Z","published":"2024-03-30T06:18:50Z","title":"Exploiting Self-Supervised Constraints in Image Super-Resolution","summary":" Recent advances in self-supervised learning, predominantly studied in\nhigh-level visual tasks, have been explored in low-level image processing. This\npaper introduces a novel self-supervised constraint for single image\nsuper-resolution, termed SSC-SR. SSC-SR uniquely addresses the divergence in\nimage complexity by employing a dual asymmetric paradigm and a target model\nupdated via exponential moving average to enhance stability. The proposed\nSSC-SR framework works as a plug-and-play paradigm and can be easily applied to\nexisting SR models. Empirical evaluations reveal that our SSC-SR framework\ndelivers substantial enhancements on a variety of benchmark datasets, achieving\nan average increase of 0.1 dB over EDSR and 0.06 dB over SwinIR. In addition,\nextensive ablation studies corroborate the effectiveness of each constituent in\nour SSC-SR framework. 
Codes are available at https://github.com/Aitical/SSCSR.\n","authors":["Gang Wu","Junjun Jiang","Kui Jiang","Xianming Liu"],"pdf_url":"https://arxiv.org/pdf/2404.00260v1.pdf","comment":"ICME 2024"},{"id":"http://arxiv.org/abs/2404.00257v1","updated":"2024-03-30T06:17:39Z","published":"2024-03-30T06:17:39Z","title":"YOLOOC: YOLO-based Open-Class Incremental Object Detection with Novel\n Class Discovery","summary":" Because of its use in practice, open-world object detection (OWOD) has gotten\na lot of attention recently. The challenge is how can a model detect novel\nclasses and then incrementally learn them without forgetting previously known\nclasses. Previous approaches hinge on strongly-supervised or weakly-supervised\nnovel-class data for novel-class detection, which may not apply to real\napplications. We construct a new benchmark that novel classes are only\nencountered at the inference stage. And we propose a new OWOD detector YOLOOC,\nbased on the YOLO architecture yet for the Open-Class setup. We introduce label\nsmoothing to prevent the detector from over-confidently mapping novel classes\nto known classes and to discover novel classes. Extensive experiments conducted\non our more realistic setup demonstrate the effectiveness of our method for\ndiscovering novel classes in our new benchmark.\n","authors":["Qian Wan","Xiang Xiang","Qinhao Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.00257v1.pdf","comment":"Initially submitted to ACCV 2022"},{"id":"http://arxiv.org/abs/2404.00252v1","updated":"2024-03-30T05:42:17Z","published":"2024-03-30T05:42:17Z","title":"Learned Scanpaths Aid Blind Panoramic Video Quality Assessment","summary":" Panoramic videos have the advantage of providing an immersive and interactive\nviewing experience. Nevertheless, their spherical nature gives rise to various\nand uncertain user viewing behaviors, which poses significant challenges for\npanoramic video quality assessment (PVQA). In this work, we propose an\nend-to-end optimized, blind PVQA method with explicit modeling of user viewing\npatterns through visual scanpaths. Our method consists of two modules: a\nscanpath generator and a quality assessor. The scanpath generator is initially\ntrained to predict future scanpaths by minimizing their expected code length\nand then jointly optimized with the quality assessor for quality prediction.\nOur blind PVQA method enables direct quality assessment of panoramic images by\ntreating them as videos composed of identical frames. Experiments on three\npublic panoramic image and video quality datasets, encompassing both synthetic\nand authentic distortions, validate the superiority of our blind PVQA model\nover existing methods.\n","authors":["Kanglong Fan","Wen Wen","Mu Li","Yifan Peng","Kede Ma"],"pdf_url":"https://arxiv.org/pdf/2404.00252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00234v1","updated":"2024-03-30T03:50:43Z","published":"2024-03-30T03:50:43Z","title":"Grid Diffusion Models for Text-to-Video Generation","summary":" Recent advances in the diffusion models have significantly improved\ntext-to-image generation. However, generating videos from text is a more\nchallenging task than generating images from text, due to the much larger\ndataset and higher computational cost required. Most existing video generation\nmethods use either a 3D U-Net architecture that considers the temporal\ndimension or autoregressive generation. 
These methods require large datasets\nand are limited in terms of computational costs compared to text-to-image\ngeneration. To tackle these challenges, we propose a simple but effective novel\ngrid diffusion for text-to-video generation without temporal dimension in\narchitecture and a large text-video paired dataset. We can generate a\nhigh-quality video using a fixed amount of GPU memory regardless of the number\nof frames by representing the video as a grid image. Additionally, since our\nmethod reduces the dimensions of the video to the dimensions of the image,\nvarious image-based methods can be applied to videos, such as text-guided video\nmanipulation from image manipulation. Our proposed method outperforms the\nexisting methods in both quantitative and qualitative evaluations,\ndemonstrating the suitability of our model for real-world video generation.\n","authors":["Taegyeong Lee","Soyeong Kwon","Taehwan Kim"],"pdf_url":"https://arxiv.org/pdf/2404.00234v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00231v1","updated":"2024-03-30T03:23:52Z","published":"2024-03-30T03:23:52Z","title":"Attention-based Shape-Deformation Networks for Artifact-Free Geometry\n Reconstruction of Lumbar Spine from MR Images","summary":" Lumbar disc degeneration, a progressive structural wear and tear of lumbar\nintervertebral disc, is regarded as an essential role on low back pain, a\nsignificant global health concern. Automated lumbar spine geometry\nreconstruction from MR images will enable fast measurement of medical\nparameters to evaluate the lumbar status, in order to determine a suitable\ntreatment. Existing image segmentation-based techniques often generate\nerroneous segments or unstructured point clouds, unsuitable for medical\nparameter measurement. In this work, we present TransDeformer: a novel\nattention-based deep learning approach that reconstructs the contours of the\nlumbar spine with high spatial accuracy and mesh correspondence across\npatients, and we also present a variant of TransDeformer for error estimation.\nSpecially, we devise new attention modules with a new attention formula, which\nintegrates image features and tokenized contour features to predict the\ndisplacements of the points on a shape template without the need for image\nsegmentation. The deformed template reveals the lumbar spine geometry in the\ninput image. We develop a multi-stage training strategy to enhance model\nrobustness with respect to template initialization. Experiment results show\nthat our TransDeformer generates artifact-free geometry outputs, and its\nvariant predicts the error of a reconstructed geometry. Our code is available\nat https://github.com/linchenq/TransDeformer-Mesh.\n","authors":["Linchen Qian","Jiasong Chen","Linhai Ma","Timur Urakov","Weiyong Gu","Liang Liang"],"pdf_url":"https://arxiv.org/pdf/2404.00231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00230v1","updated":"2024-03-30T03:19:50Z","published":"2024-03-30T03:19:50Z","title":"Latent Watermark: Inject and Detect Watermarks in Latent Diffusion Space","summary":" Watermarking is a tool for actively identifying and attributing the images\ngenerated by latent diffusion models. Existing methods face the dilemma of\nwatermark robustness and image quality. The reason for this dilemma is that\nwatermark detection is performed in pixel space, implying an intrinsic link\nbetween image quality and watermark robustness. 
In this paper, we highlight\nthat an effective solution to the problem is to both inject and detect\nwatermarks in latent space, and propose Latent Watermark (LW) with a\nprogressive training strategy. Experiments show that compared to the recently\nproposed methods such as StegaStamp, StableSignature, RoSteALS and TreeRing, LW\nnot only surpasses them in terms of robustness but also offers superior image\nquality. When we inject 64-bit messages, LW can achieve an identification\nperformance close to 100% and an attribution performance above 97% under 9\nsingle-attack scenarios and one all-attack scenario. Our code will be available\non GitHub.\n","authors":["Zheling Meng","Bo Peng","Jing Dong"],"pdf_url":"https://arxiv.org/pdf/2404.00230v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00226v1","updated":"2024-03-30T02:56:54Z","published":"2024-03-30T02:56:54Z","title":"Design as Desired: Utilizing Visual Question Answering for Multimodal\n Pre-training","summary":" Multimodal pre-training demonstrates its potential in the medical domain,\nwhich learns medical visual representations from paired medical reports.\nHowever, many pre-training tasks require extra annotations from clinicians, and\nmost of them fail to explicitly guide the model to learn the desired features\nof different pathologies. To the best of our knowledge, we are the first to\nutilize Visual Question Answering (VQA) for multimodal pre-training to guide\nthe framework focusing on targeted pathological features. In this work, we\nleverage descriptions in medical reports to design multi-granular\nquestion-answer pairs associated with different diseases, which assist the\nframework in pre-training without requiring extra annotations from experts. We\nalso propose a novel pre-training framework with a quasi-textual feature\ntransformer, a module designed to transform visual features into a\nquasi-textual space closer to the textual domain via a contrastive learning\nstrategy. This narrows the vision-language gap and facilitates modality\nalignment. Our framework is applied to four downstream tasks: report\ngeneration, classification, segmentation, and detection across five datasets.\nExtensive experiments demonstrate the superiority of our framework compared to\nother state-of-the-art methods. Our code will be released upon acceptance.\n","authors":["Tongkun Su","Jun Li","Xi Zhang","Haibo Jin","Hao Chen","Qiong Wang","Faqin Lv","Baoliang Zhao","Yin Hu"],"pdf_url":"https://arxiv.org/pdf/2404.00226v1.pdf","comment":null}]},"2024-04-02T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.02157v1","updated":"2024-04-02T17:59:10Z","published":"2024-04-02T17:59:10Z","title":"Segment Any 3D Object with Language","summary":" In this paper, we investigate Open-Vocabulary 3D Instance Segmentation\n(OV-3DIS) with free-form language instructions. Earlier works that rely on only\nannotated base categories for training suffer from limited generalization to\nunseen novel categories. Recent works mitigate poor generalizability to novel\ncategories by generating class-agnostic masks or projecting generalized masks\nfrom 2D to 3D, but disregard semantic or geometry information, leading to\nsub-optimal performance. Instead, generating generalizable but semantic-related\nmasks directly from 3D point clouds would result in superior outcomes. 
In this\npaper, we introduce Segment any 3D Object with LanguagE (SOLE), which is a\nsemantic and geometric-aware visual-language learning framework with strong\ngeneralizability by generating semantic-related masks directly from 3D point\nclouds. Specifically, we propose a multimodal fusion network to incorporate\nmultimodal semantics in both backbone and decoder. In addition, to align the 3D\nsegmentation model with various language instructions and enhance the mask\nquality, we introduce three types of multimodal associations as supervision.\nOur SOLE outperforms previous methods by a large margin on ScanNetv2,\nScanNet200, and Replica benchmarks, and the results are even close to the\nfully-supervised counterpart despite the absence of class annotations in the\ntraining. Furthermore, extensive qualitative results demonstrate the\nversatility of our SOLE to language instructions.\n","authors":["Seungjun Lee","Yuyang Zhao","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2404.02157v1.pdf","comment":"Project Page: https://cvrp-sole.github.io"},{"id":"http://arxiv.org/abs/2404.02155v1","updated":"2024-04-02T17:58:57Z","published":"2024-04-02T17:58:57Z","title":"Alpha Invariance: On Inverse Scaling Between Distance and Volume Density\n in Neural Radiance Fields","summary":" Scale-ambiguity in 3D scene dimensions leads to magnitude-ambiguity of\nvolumetric densities in neural radiance fields, i.e., the densities double when\nscene size is halved, and vice versa. We call this property alpha invariance.\nFor NeRFs to better maintain alpha invariance, we recommend 1) parameterizing\nboth distance and volume densities in log space, and 2) a\ndiscretization-agnostic initialization strategy to guarantee high ray\ntransmittance. We revisit a few popular radiance field models and find that\nthese systems use various heuristics to deal with issues arising from scene\nscaling. We test their behaviors and show our recipe to be more robust.\n","authors":["Joshua Ahn","Haochen Wang","Raymond A. Yeh","Greg Shakhnarovich"],"pdf_url":"https://arxiv.org/pdf/2404.02155v1.pdf","comment":"CVPR 2024. project page https://pals.ttic.edu/p/alpha-invariance"},{"id":"http://arxiv.org/abs/2404.02154v1","updated":"2024-04-02T17:58:49Z","published":"2024-04-02T17:58:49Z","title":"Dynamic Pre-training: Towards Efficient and Scalable All-in-One Image\n Restoration","summary":" All-in-one image restoration tackles different types of degradations with a\nunified model instead of having task-specific, non-generic models for each\ndegradation. The requirement to tackle multiple degradations using the same\nmodel can lead to high-complexity designs with fixed configuration that lack\nthe adaptability to more efficient alternatives. We propose DyNet, a dynamic\nfamily of networks designed in an encoder-decoder style for all-in-one image\nrestoration tasks. Our DyNet can seamlessly switch between its bulkier and\nlightweight variants, thereby offering flexibility for efficient model\ndeployment with a single round of training. This seamless switching is enabled\nby our weights-sharing mechanism, forming the core of our architecture and\nfacilitating the reuse of initialized module weights. Further, to establish\nrobust weights initialization, we introduce a dynamic pre-training strategy\nthat trains variants of the proposed DyNet concurrently, thereby achieving a\n50% reduction in GPU hours. 
To tackle the unavailability of large-scale dataset\nrequired in pre-training, we curate a high-quality, high-resolution image\ndataset named Million-IRD having 2M image samples. We validate our DyNet for\nimage denoising, deraining, and dehazing in all-in-one setting, achieving\nstate-of-the-art results with 31.34% reduction in GFlops and a 56.75% reduction\nin parameters compared to baseline models. The source codes and trained models\nare available at https://github.com/akshaydudhane16/DyNet.\n","authors":["Akshay Dudhane","Omkar Thawakar","Syed Waqas Zamir","Salman Khan","Fahad Shahbaz Khan","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.02154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02152v1","updated":"2024-04-02T17:58:35Z","published":"2024-04-02T17:58:35Z","title":"GeneAvatar: Generic Expression-Aware Volumetric Head Avatar Editing from\n a Single Image","summary":" Recently, we have witnessed the explosive growth of various volumetric\nrepresentations in modeling animatable head avatars. However, due to the\ndiversity of frameworks, there is no practical method to support high-level\napplications like 3D head avatar editing across different representations. In\nthis paper, we propose a generic avatar editing approach that can be\nuniversally applied to various 3DMM driving volumetric head avatars. To achieve\nthis goal, we design a novel expression-aware modification generative model,\nwhich enables lift 2D editing from a single image to a consistent 3D\nmodification field. To ensure the effectiveness of the generative modification\nprocess, we develop several techniques, including an expression-dependent\nmodification distillation scheme to draw knowledge from the large-scale head\navatar model and 2D facial texture editing tools, implicit latent space\nguidance to enhance model convergence, and a segmentation-based loss reweight\nstrategy for fine-grained texture inversion. Extensive experiments demonstrate\nthat our method delivers high-quality and consistent results across multiple\nexpression and viewpoints. Project page: https://zju3dv.github.io/geneavatar/\n","authors":["Chong Bao","Yinda Zhang","Yuan Li","Xiyu Zhang","Bangbang Yang","Hujun Bao","Marc Pollefeys","Guofeng Zhang","Zhaopeng Cui"],"pdf_url":"https://arxiv.org/pdf/2404.02152v1.pdf","comment":"Accepted to CVPR 2024. Project page:\n https://zju3dv.github.io/geneavatar/"},{"id":"http://arxiv.org/abs/2404.02148v1","updated":"2024-04-02T17:58:03Z","published":"2024-04-02T17:58:03Z","title":"Diffusion$^2$: Dynamic 3D Content Generation via Score Composition of\n Orthogonal Diffusion Models","summary":" Recent advancements in 3D generation are predominantly propelled by\nimprovements in 3D-aware image diffusion models which are pretrained on\nInternet-scale image data and fine-tuned on massive 3D data, offering the\ncapability of producing highly consistent multi-view images. However, due to\nthe scarcity of synchronized multi-view video data, it is impractical to adapt\nthis paradigm to 4D generation directly. Despite that, the available video and\n3D data are adequate for training video and multi-view diffusion models that\ncan provide satisfactory dynamic and geometric priors respectively. 
In this\npaper, we present Diffusion$^2$, a novel framework for dynamic 3D content\ncreation that leverages the knowledge about geometric consistency and temporal\nsmoothness from these models to directly sample dense multi-view and\nmulti-frame images which can be employed to optimize a continuous 4D\nrepresentation. Specifically, we design a simple yet effective denoising\nstrategy via score composition of video and multi-view diffusion models based\non the probability structure of the images to be generated. Owing to the high\nparallelism of the image generation and the efficiency of the modern 4D\nreconstruction pipeline, our framework can generate 4D content within a few\nminutes. Furthermore, our method circumvents the reliance on 4D data, thereby\nhaving the potential to benefit from the scalability of the foundation video\nand multi-view diffusion models. Extensive experiments demonstrate the efficacy\nof our proposed framework and its capability to flexibly adapt to various types\nof prompts.\n","authors":["Zeyu Yang","Zijie Pan","Chun Gu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.02148v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2404.02145v1","updated":"2024-04-02T17:57:31Z","published":"2024-04-02T17:57:31Z","title":"Iterated Learning Improves Compositionality in Large Vision-Language\n Models","summary":" A fundamental characteristic common to both human vision and natural language\nis their compositional nature. Yet, despite the performance gains contributed\nby large vision and language pretraining, recent investigations find that\nmost, if not all, of our state-of-the-art vision-language models struggle at\ncompositionality. They are unable to distinguish between images of \"a girl in\nwhite facing a man in black\" and \"a girl in black facing a man in white\".\nMoreover, prior work suggests that compositionality doesn't arise with scale:\nlarger model sizes or training data don't help. This paper develops a new\niterated training algorithm that incentivizes compositionality. We draw on\ndecades of cognitive science research that identifies cultural transmission, the\nneed to teach a new generation, as a necessary inductive prior that incentivizes\nhumans to develop compositional languages. Specifically, we reframe\nvision-language contrastive learning as the Lewis Signaling Game between a\nvision agent and a language agent, and operationalize cultural transmission by\niteratively resetting the weights of one of the agents during training. After every\niteration, this training paradigm induces representations that become \"easier\nto learn\", a property of compositional languages: e.g., our model trained on\nCC3M and CC12M improves standard CLIP by 4.7% and 4.0%, respectively, on the\nSugarCrepe benchmark.\n","authors":["Chenhao Zheng","Jieyu Zhang","Aniruddha Kembhavi","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.02145v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02135v1","updated":"2024-04-02T17:48:46Z","published":"2024-04-02T17:48:46Z","title":"ResNet with Integrated Convolutional Block Attention Module for Ship\n Classification Using Transfer Learning on Optical Satellite Imagery","summary":" This study proposes a novel transfer learning framework for effective ship\nclassification using high-resolution optical remote sensing satellite imagery.\nThe framework is based on the deep convolutional neural network model ResNet50\nand incorporates the Convolutional Block Attention Module (CBAM) to enhance\nperformance. 
CBAM enables the model to attend to salient features in the\nimages, allowing it to better discriminate subtle differences between\nships and backgrounds. Furthermore, this study adopts a transfer learning\napproach tailored for accurately classifying diverse types of ships by\nfine-tuning a pre-trained model for the specific task. Experimental results\ndemonstrate the efficacy of the proposed framework in ship classification using\noptical remote sensing imagery, achieving a high classification accuracy of 94%\nacross 5 classes, outperforming existing methods. This research holds potential\napplications in maritime surveillance and management, illegal fishing\ndetection, and maritime traffic monitoring.\n","authors":["Ryan Donghan Kwon","Gangjoo Robin Nam","Jisoo Tak","Yeom Hyeok","Junseob Shin","Hyerin Cha","Kim Soo Bin"],"pdf_url":"https://arxiv.org/pdf/2404.02135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02132v1","updated":"2024-04-02T17:40:29Z","published":"2024-04-02T17:40:29Z","title":"ViTamin: Designing Scalable Vision Models in the Vision-Language Era","summary":" Recent breakthroughs in vision-language models (VLMs) start a new page in the\nvision community. The VLMs provide stronger and more generalizable feature\nembeddings compared to those from ImageNet-pretrained models, thanks to\ntraining on large-scale Internet image-text pairs. However, despite the\nimpressive achievements of VLMs, vanilla Vision Transformers (ViTs) remain\nthe default choice for the image encoder. Although the pure transformer proves its\neffectiveness in text encoding, it remains questionable whether it is\nalso the case for image encoding, especially considering that various types of\nnetworks are proposed on the ImageNet benchmark, which, unfortunately, are\nrarely studied in VLMs. Due to the small data/model scale, the original conclusions\nof model design on ImageNet can be limited and biased. In this paper, we aim at\nbuilding an evaluation protocol of vision models in the vision-language era\nunder the contrastive language-image pretraining (CLIP) framework. We provide a\ncomprehensive way to benchmark different vision models, covering their\nzero-shot performance and scalability in both model and training data sizes. To\nthis end, we introduce ViTamin, a new vision model tailored for VLMs.\nViTamin-L significantly outperforms ViT-L by 2.0% in ImageNet zero-shot accuracy,\nwhen using the same publicly available DataComp-1B dataset and the same\nOpenCLIP training scheme. ViTamin-L presents promising results on 60 diverse\nbenchmarks, including classification, retrieval, open-vocabulary detection and\nsegmentation, and large multi-modal models. When further scaling up the model\nsize, our ViTamin-XL with only 436M parameters attains 82.9% ImageNet zero-shot\naccuracy, surpassing the 82.0% achieved by EVA-E, which has ten times more parameters\n(4.4B).\n","authors":["Jienneg Chen","Qihang Yu","Xiaohui Shen","Alan Yuille","Liang-Chieh Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02132v1.pdf","comment":"CVPR 2024; https://github.com/Beckschen/ViTamin"},{"id":"http://arxiv.org/abs/2308.12469v3","updated":"2024-04-02T17:40:03Z","published":"2023-08-23T23:44:44Z","title":"Diffuse, Attend, and Segment: Unsupervised Zero-Shot Segmentation using\n Stable Diffusion","summary":" Producing quality segmentation masks for images is a fundamental problem in\ncomputer vision. 
Recent research has explored large-scale supervised training\nto enable zero-shot segmentation on virtually any image style and unsupervised\ntraining to enable segmentation without dense annotations. However,\nconstructing a model capable of segmenting anything in a zero-shot manner\nwithout any annotations is still challenging. In this paper, we propose to\nutilize the self-attention layers in stable diffusion models to achieve this\ngoal because the pre-trained stable diffusion model has learned inherent\nconcepts of objects within its attention layers. Specifically, we introduce a\nsimple yet effective iterative merging process based on measuring KL divergence\namong attention maps to merge them into valid segmentation masks. The proposed\nmethod does not require any training or language dependency to extract quality\nsegmentation for any images. On COCO-Stuff-27, our method surpasses the prior\nunsupervised zero-shot SOTA method by an absolute 26% in pixel accuracy and 17%\nin mean IoU. The project page is at\n\\url{https://sites.google.com/view/diffseg/home}.\n","authors":["Junjiao Tian","Lavisha Aggarwal","Andrea Colaco","Zsolt Kira","Mar Gonzalez-Franco"],"pdf_url":"https://arxiv.org/pdf/2308.12469v3.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2310.05861v2","updated":"2024-04-02T17:37:42Z","published":"2023-10-09T16:57:57Z","title":"Rephrase, Augment, Reason: Visual Grounding of Questions for\n Vision-Language Models","summary":" An increasing number of vision-language tasks can be handled with little to\nno training, i.e., in a zero and few-shot manner, by marrying large language\nmodels (LLMs) to vision encoders, resulting in large vision-language models\n(LVLMs). While this has huge upsides, such as not requiring training data or\ncustom architectures, how an input is presented to an LVLM can have a major\nimpact on zero-shot model performance. In particular, inputs phrased in an\nunderspecified way can result in incorrect answers due to factors like missing\nvisual information, complex implicit reasoning, or linguistic ambiguity.\nTherefore, adding visually-grounded information to the input as a preemptive\nclarification should improve model performance by reducing underspecification,\ne.g., by localizing objects and disambiguating references. Similarly, in the\nVQA setting, changing the way questions are framed can make them easier for\nmodels to answer. To this end, we present Rephrase, Augment and Reason\n(RepARe), a gradient-free framework that extracts salient details about the\nimage using the underlying LVLM as a captioner and reasoner, in order to\npropose modifications to the original question. We then use the LVLM's\nconfidence over a generated answer as an unsupervised scoring function to\nselect the rephrased question most likely to improve zero-shot performance.\nFocusing on three visual question answering tasks, we show that RepARe can\nresult in a 3.85% (absolute) increase in zero-shot accuracy on VQAv2, 6.41%,\nand 7.94% points increase on A-OKVQA, and VizWiz respectively. Additionally, we\nfind that using gold answers for oracle question candidate selection achieves a\nsubstantial gain in VQA accuracy by up to 14.41%. 
Through extensive analysis,\nwe demonstrate that outputs from RepARe increase syntactic complexity, and\neffectively utilize vision-language interaction and the frozen LLM.\n","authors":["Archiki Prasad","Elias Stengel-Eskin","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2310.05861v2.pdf","comment":"ICLR 2024 camera-ready (23 pages), Code:\n https://github.com/archiki/RepARe"},{"id":"http://arxiv.org/abs/2404.02125v1","updated":"2024-04-02T17:32:12Z","published":"2024-04-02T17:32:12Z","title":"3D Congealing: 3D-Aware Image Alignment in the Wild","summary":" We propose 3D Congealing, a novel problem of 3D-aware alignment for 2D images\ncapturing semantically similar objects. Given a collection of unlabeled\nInternet images, our goal is to associate the shared semantic parts from the\ninputs and aggregate the knowledge from 2D images to a shared 3D canonical\nspace. We introduce a general framework that tackles the task without assuming\nshape templates, poses, or any camera parameters. At its core is a canonical 3D\nrepresentation that encapsulates geometric and semantic information. The\nframework optimizes for the canonical representation together with the pose for\neach input image, and a per-image coordinate map that warps 2D pixel\ncoordinates to the 3D canonical frame to account for the shape matching. The\noptimization procedure fuses prior knowledge from a pre-trained image\ngenerative model and semantic information from input images. The former\nprovides strong knowledge guidance for this under-constraint task, while the\nlatter provides the necessary information to mitigate the training data bias\nfrom the pre-trained model. Our framework can be used for various tasks such as\ncorrespondence matching, pose estimation, and image editing, achieving strong\nresults on real-world image datasets under challenging illumination conditions\nand on in-the-wild online image collections.\n","authors":["Yunzhi Zhang","Zizhang Li","Amit Raj","Andreas Engelhardt","Yuanzhen Li","Tingbo Hou","Jiajun Wu","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2404.02125v1.pdf","comment":"Project page:\n https://ai.stanford.edu/~yzzhang/projects/3d-congealing/"},{"id":"http://arxiv.org/abs/2404.02117v1","updated":"2024-04-02T17:23:22Z","published":"2024-04-02T17:23:22Z","title":"Pre-trained Vision and Language Transformers Are Few-Shot Incremental\n Learners","summary":" Few-Shot Class Incremental Learning (FSCIL) is a task that requires a model\nto learn new classes incrementally without forgetting when only a few samples\nfor each class are given. FSCIL encounters two significant challenges:\ncatastrophic forgetting and overfitting, and these challenges have driven prior\nstudies to primarily rely on shallow models, such as ResNet-18. Even though\ntheir limited capacity can mitigate both forgetting and overfitting issues, it\nleads to inadequate knowledge transfer during few-shot incremental sessions. In\nthis paper, we argue that large models such as vision and language transformers\npre-trained on large datasets can be excellent few-shot incremental learners.\nTo this end, we propose a novel FSCIL framework called PriViLege, Pre-trained\nVision and Language transformers with prompting functions and knowledge\ndistillation. Our framework effectively addresses the challenges of\ncatastrophic forgetting and overfitting in large models through new pre-trained\nknowledge tuning (PKT) and two losses: entropy-based divergence loss and\nsemantic knowledge distillation loss. 
Experimental results show that the\nproposed PriViLege significantly outperforms the existing state-of-the-art\nmethods by a large margin, e.g., +9.38% in CUB200, +20.58% in CIFAR-100, and\n+13.36% in miniImageNet. Our implementation code is available at\nhttps://github.com/KHU-AGI/PriViLege.\n","authors":["Keon-Hee Park","Kyungwoo Song","Gyeong-Moon Park"],"pdf_url":"https://arxiv.org/pdf/2404.02117v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2312.12337v3","updated":"2024-04-02T17:23:16Z","published":"2023-12-19T17:03:50Z","title":"pixelSplat: 3D Gaussian Splats from Image Pairs for Scalable\n Generalizable 3D Reconstruction","summary":" We introduce pixelSplat, a feed-forward model that learns to reconstruct 3D\nradiance fields parameterized by 3D Gaussian primitives from pairs of images.\nOur model features real-time and memory-efficient rendering for scalable\ntraining as well as fast 3D reconstruction at inference time. To overcome local\nminima inherent to sparse and locally supported representations, we predict a\ndense probability distribution over 3D and sample Gaussian means from that\nprobability distribution. We make this sampling operation differentiable via a\nreparameterization trick, allowing us to back-propagate gradients through the\nGaussian splatting representation. We benchmark our method on wide-baseline\nnovel view synthesis on the real-world RealEstate10k and ACID datasets, where\nwe outperform state-of-the-art light field transformers and accelerate\nrendering by 2.5 orders of magnitude while reconstructing an interpretable and\neditable 3D radiance field.\n","authors":["David Charatan","Sizhe Li","Andrea Tagliasacchi","Vincent Sitzmann"],"pdf_url":"https://arxiv.org/pdf/2312.12337v3.pdf","comment":"Project page: https://dcharatan.github.io/pixelsplat"},{"id":"http://arxiv.org/abs/2404.02112v1","updated":"2024-04-02T17:13:04Z","published":"2024-04-02T17:13:04Z","title":"ImageNot: A contrast with ImageNet preserves model rankings","summary":" We introduce ImageNot, a dataset designed to match the scale of ImageNet\nwhile differing drastically in other aspects. We show that key model\narchitectures developed for ImageNet over the years rank identically when\ntrained and evaluated on ImageNot to how they rank on ImageNet. This is true\nwhen training models from scratch or fine-tuning them. Moreover, the relative\nimprovements of each model over earlier models strongly correlate in both\ndatasets. We further give evidence that ImageNot has a similar utility as\nImageNet for transfer learning purposes. Our work demonstrates a surprising\ndegree of external validity in the relative performance of image classification\nmodels. This stands in contrast with absolute accuracy numbers that typically\ndrop sharply even under small changes to a dataset.\n","authors":["Olawale Salaudeen","Moritz Hardt"],"pdf_url":"https://arxiv.org/pdf/2404.02112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03849v3","updated":"2024-04-02T17:11:45Z","published":"2024-03-06T16:49:33Z","title":"MedMamba: Vision Mamba for Medical Image Classification","summary":" Medical image classification is a fundamental and crucial task in the\nfield of computer vision. In recent years, CNN-based and Transformer-based models\nhave been widely used to classify various medical images. 
Unfortunately, the\nlimitation of CNNs in long-range modeling capabilities prevents them from\neffectively extracting features in medical images, while Transformers are\nhampered by their quadratic computational complexity. Recent research has shown\nthat the state space model (SSM) represented by Mamba can efficiently model\nlong-range interactions while maintaining linear computational complexity.\nInspired by this, we propose Vision Mamba for medical image classification\n(MedMamba). More specifically, we introduce a novel Conv-SSM module. Conv-SSM\ncombines the local feature extraction ability of convolutional layers with the\nability of SSM to capture long-range dependency, thereby modeling medical\nimages with different modalities. To demonstrate the potential of MedMamba, we\nconducted extensive experiments using 14 publicly available medical datasets\nwith different imaging techniques and two private datasets built by ourselves.\nExtensive experimental results demonstrate that the proposed MedMamba performs\nwell in detecting lesions in various medical images. To the best of our\nknowledge, this is the first Vision Mamba tailored for medical image\nclassification. The purpose of this work is to establish a new baseline for\nmedical image classification tasks and provide valuable insights for the future\ndevelopment of more efficient and effective SSM-based artificial intelligence\nalgorithms and application systems in the medical field. Source code is\navailable at https://github.com/YubiaoYue/MedMamba.\n","authors":["Yubiao Yue","Zhenzhang Li"],"pdf_url":"https://arxiv.org/pdf/2403.03849v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00095v2","updated":"2024-04-02T17:08:35Z","published":"2024-03-29T18:05:26Z","title":"GDA: Generalized Diffusion for Robust Test-time Adaptation","summary":" Machine learning models struggle with generalization when encountering\nout-of-distribution (OOD) samples with unexpected distribution shifts. For\nvision tasks, recent studies have shown that test-time adaptation employing\ndiffusion models can achieve state-of-the-art accuracy improvements on OOD\nsamples by generating new samples that align with the model's domain without\nthe need to modify the model's weights. Unfortunately, those studies have\nprimarily focused on pixel-level corruptions, thereby lacking the\ngeneralization to adapt to a broader range of OOD types. We introduce\nGeneralized Diffusion Adaptation (GDA), a novel diffusion-based test-time\nadaptation method robust against diverse OOD types. Specifically, GDA\niteratively guides the diffusion by applying a marginal entropy loss derived\nfrom the model, in conjunction with style and content preservation losses\nduring the reverse sampling process. In other words, GDA considers the model's\noutput behavior with the semantic information of the samples as a whole, which\ncan reduce ambiguity in downstream tasks during the generation process.\nEvaluation across various popular model architectures and OOD benchmarks shows\nthat GDA consistently outperforms prior work on diffusion-driven adaptation.\nNotably, it achieves the highest classification accuracy improvements, ranging\nfrom 4.4\% to 5.02\% on ImageNet-C and 2.5\% to 7.4\% on Rendition, Sketch, and\nStylized benchmarks. This performance highlights GDA's generalization to a\nbroader range of OOD benchmarks.\n","authors":["Yun-Yun Tsai","Fu-Chen Chen","Albert Y. C. 
Chen","Junfeng Yang","Che-Chun Su","Min Sun","Cheng-Hao Kuo"],"pdf_url":"https://arxiv.org/pdf/2404.00095v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02106v1","updated":"2024-04-02T17:04:45Z","published":"2024-04-02T17:04:45Z","title":"Neural Ordinary Differential Equation based Sequential Image\n Registration for Dynamic Characterization","summary":" Deformable image registration (DIR) is crucial in medical image analysis,\nenabling the exploration of biological dynamics such as organ motions and\nlongitudinal changes in imaging. Leveraging Neural Ordinary Differential\nEquations (ODE) for registration, this extension work discusses how this\nframework can aid in the characterization of sequential biological processes.\nUtilizing the Neural ODE's ability to model state derivatives with neural\nnetworks, our Neural Ordinary Differential Equation Optimization-based (NODEO)\nframework considers voxels as particles within a dynamic system, defining\ndeformation fields through the integration of neural differential equations.\nThis method learns dynamics directly from data, bypassing the need for physical\npriors, making it exceptionally suitable for medical scenarios where such\npriors are unavailable or inapplicable. Consequently, the framework can discern\nunderlying dynamics and use sequence data to regularize the transformation\ntrajectory. We evaluated our framework on two clinical datasets: one for\ncardiac motion tracking and another for longitudinal brain MRI analysis.\nDemonstrating its efficacy in both 2D and 3D imaging scenarios, our framework\noffers flexibility and model agnosticism, capable of managing image sequences\nand facilitating label propagation throughout these sequences. This study\nprovides a comprehensive understanding of how the Neural ODE-based framework\nuniquely benefits the image registration challenge.\n","authors":["Yifan Wu","Mengjin Dong","Rohit Jena","Chen Qin","James C. Gee"],"pdf_url":"https://arxiv.org/pdf/2404.02106v1.pdf","comment":"Journal extension of NODEO: A Neural Ordinary Differential Equation\n Based Optimization Framework for Deformable Image Registration, CVPR 2022"},{"id":"http://arxiv.org/abs/2403.18360v2","updated":"2024-04-02T17:02:32Z","published":"2024-03-27T08:52:44Z","title":"Learning CNN on ViT: A Hybrid Model to Explicitly Class-specific\n Boundaries for Domain Adaptation","summary":" Most domain adaptation (DA) methods are based on either convolutional\nneural networks (CNNs) or vision transformers (ViTs). They use these networks as encoders to align the\ndistribution differences between domains, without considering their\nunique characteristics. For instance, ViT excels in accuracy due to its\nsuperior ability to capture global representations, while CNN has an advantage\nin capturing local representations. This fact has led us to design a hybrid\nmethod to fully take advantage of both ViT and CNN, called Explicitly\nClass-specific Boundaries (ECB). ECB learns CNN on ViT to combine their\ndistinct strengths. In particular, we leverage ViT's properties to explicitly\nfind class-specific decision boundaries by maximizing the discrepancy between\nthe outputs of the two classifiers to detect target samples far from the source\nsupport. In contrast, the CNN encoder clusters target features based on the\npreviously defined class-specific boundaries by minimizing the discrepancy\nbetween the probabilities of the two classifiers. 
Finally, ViT and CNN mutually\nexchange knowledge to improve the quality of pseudo labels and reduce the\nknowledge discrepancies of these models. Compared to conventional DA methods,\nour ECB achieves superior performance, which verifies its effectiveness in this\nhybrid model. The project website can be found at\nhttps://dotrannhattuong.github.io/ECB/website/.\n","authors":["Ba Hung Ngo","Nhat-Tuong Do-Tran","Tuan-Ngoc Nguyen","Hae-Gon Jeon","Tae Jong Choi"],"pdf_url":"https://arxiv.org/pdf/2403.18360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02101v1","updated":"2024-04-02T16:52:41Z","published":"2024-04-02T16:52:41Z","title":"CameraCtrl: Enabling Camera Control for Text-to-Video Generation","summary":" Controllability plays a crucial role in video generation since it allows\nusers to create desired content. However, existing models have largely overlooked\nthe precise control of camera pose that serves as a cinematic language to\nexpress deeper narrative nuances. To alleviate this issue, we introduce\nCameraCtrl, enabling accurate camera pose control for text-to-video (T2V)\nmodels. After precisely parameterizing the camera trajectory, a plug-and-play\ncamera module is then trained on a T2V model, leaving others untouched.\nAdditionally, a comprehensive study on the effect of various datasets is also\nconducted, suggesting that videos with diverse camera distribution and similar\nappearances indeed enhance controllability and generalization. Experimental\nresults demonstrate the effectiveness of CameraCtrl in achieving precise and\ndomain-adaptive camera control, marking a step forward in the pursuit of\ndynamic and customized video storytelling from textual and camera pose inputs.\nOur project website is at: https://hehao13.github.io/projects-CameraCtrl/.\n","authors":["Hao He","Yinghao Xu","Yuwei Guo","Gordon Wetzstein","Bo Dai","Hongsheng Li","Ceyuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.02101v1.pdf","comment":"Project page: https://hehao13.github.io/projects-CameraCtrl/ Code:\n https://github.com/hehao13/CameraCtrl"},{"id":"http://arxiv.org/abs/2404.02098v1","updated":"2024-04-02T16:48:20Z","published":"2024-04-02T16:48:20Z","title":"BRAVEn: Improving Self-Supervised Pre-training for Visual and Auditory\n Speech Recognition","summary":" Self-supervision has recently shown great promise for learning visual and\nauditory speech representations from unlabelled data. In this work, we propose\nBRAVEn, an extension to the recent RAVEn method, which learns speech\nrepresentations entirely from raw audio-visual data. Our modifications to RAVEn\nenable BRAVEn to achieve state-of-the-art results among self-supervised methods\nin various settings. Moreover, we observe favourable scaling behaviour by\nincreasing the amount of unlabelled data well beyond other self-supervised\nworks. In particular, we achieve 20.0% / 1.7% word error rate for VSR / ASR on\nthe LRS3 test set, with only 30 hours of labelled data and no external ASR\nmodels. Our results suggest that readily available unlabelled audio-visual data\ncan largely replace costly transcribed data.\n","authors":["Alexandros Haliassos","Andreas Zinonos","Rodrigo Mira","Stavros Petridis","Maja Pantic"],"pdf_url":"https://arxiv.org/pdf/2404.02098v1.pdf","comment":"ICASSP 2024. 
Code: https://github.com/ahaliassos/raven"},{"id":"http://arxiv.org/abs/2404.00511v2","updated":"2024-04-02T16:46:24Z","published":"2024-03-31T01:16:02Z","title":"MIPS at SemEval-2024 Task 3: Multimodal Emotion-Cause Pair Extraction in\n Conversations with Multimodal Language Models","summary":" This paper presents our winning submission to Subtask 2 of SemEval 2024 Task\n3 on multimodal emotion cause analysis in conversations. We propose a novel\nMultimodal Emotion Recognition and Multimodal Emotion Cause Extraction\n(MER-MCE) framework that integrates text, audio, and visual modalities using\nspecialized emotion encoders. Our approach sets itself apart from\ntop-performing teams by leveraging modality-specific features for enhanced\nemotion understanding and causality inference. Experimental evaluation\ndemonstrates the advantages of our multimodal approach, with our submission\nachieving a competitive weighted F1 score of 0.3435, ranking third with a\nmargin of only 0.0339 behind the 1st team and 0.0025 behind the 2nd team.\nProject: https://github.com/MIPS-COLT/MER-MCE.git\n","authors":["Zebang Cheng","Fuqiang Niu","Yuxiang Lin","Zhi-Qi Cheng","Bowen Zhang","Xiaojiang Peng"],"pdf_url":"https://arxiv.org/pdf/2404.00511v2.pdf","comment":"Ranked 3rd in SemEval '24 Task 3 with F1 of 0.3435, close to 1st &\n 2nd by 0.0339 & 0.0025"},{"id":"http://arxiv.org/abs/2401.08629v2","updated":"2024-04-02T16:35:46Z","published":"2023-12-08T12:10:03Z","title":"Immature Green Apple Detection and Sizing in Commercial Orchards using\n YOLOv8 and Shape Fitting Techniques","summary":" Detecting and estimating the size of apples during the early stages of growth is\ncrucial for predicting yield, pest management, and making informed decisions\nrelated to crop-load management, harvest and post-harvest logistics, and\nmarketing. Traditional fruit size measurement methods are laborious and\ntime-consuming. This study employs the state-of-the-art YOLOv8 object detection\nand instance segmentation algorithm in conjunction with geometric shape fitting\ntechniques on 3D point cloud data to accurately determine the size of immature\ngreen apples (or fruitlets) in a commercial orchard environment. The methodology\nutilized two RGB-D sensors: Intel RealSense D435i and Microsoft Azure Kinect\nDK. Notably, the YOLOv8 instance segmentation models exhibited proficiency in\nimmature green apple detection, with the YOLOv8m-seg model achieving the\nhighest AP@0.5 and AP@0.75 scores of 0.94 and 0.91, respectively. Using the\nellipsoid fitting technique on images from the Azure Kinect, we achieved an\nRMSE of 2.35 mm, MAE of 1.66 mm, MAPE of 6.15 mm, and an R-squared value of 0.9\nin estimating the size of apple fruitlets. Challenges such as partial occlusion\ncaused some error in accurately delineating and sizing green apples using the\nYOLOv8-based segmentation technique, particularly in fruit clusters. In a\ncomparison with 102 outdoor samples, the size estimation technique performed\nbetter on the images acquired with Microsoft Azure Kinect than on those acquired with\nIntel Realsense D435i. This superiority is evident from the metrics: the RMSE\nvalues (2.35 mm for Azure Kinect vs. 9.65 mm for Realsense D435i), MAE values\n(1.66 mm for Azure Kinect vs. 7.8 mm for Realsense D435i), and the R-squared\nvalues (0.9 for Azure Kinect vs. 
0.77 for Realsense D435i).\n","authors":["Ranjan Sapkota","Dawood Ahmed","Martin Churuvija","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2401.08629v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02084v1","updated":"2024-04-02T16:30:12Z","published":"2024-04-02T16:30:12Z","title":"Adaptive Feature Fusion Neural Network for Glaucoma Segmentation on\n Unseen Fundus Images","summary":" Fundus image segmentation on unseen domains is challenging, especially for\nthe over-parameterized deep models trained on the small medical datasets. To\naddress this challenge, we propose a method named Adaptive Feature-fusion\nNeural Network (AFNN) for glaucoma segmentation on unseen domains, which mainly\nconsists of three modules: domain adaptor, feature-fusion network, and\nself-supervised multi-task learning. Specifically, the domain adaptor helps the\npretrained-model fast adapt from other image domains to the medical fundus\nimage domain. Feature-fusion network and self-supervised multi-task learning\nfor the encoder and decoder are introduced to improve the domain generalization\nability. In addition, we also design the weighted-dice-loss to improve model\nperformance on complex optic-cup segmentation tasks. Our proposed method\nachieves a competitive performance over existing fundus segmentation methods on\nfour public glaucoma datasets.\n","authors":["Jiyuan Zhong","Hu Ke","Ming Yan"],"pdf_url":"https://arxiv.org/pdf/2404.02084v1.pdf","comment":"17 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.02082v1","updated":"2024-04-02T16:28:41Z","published":"2024-04-02T16:28:41Z","title":"WcDT: World-centric Diffusion Transformer for Traffic Scene Generation","summary":" In this paper, we introduce a novel approach for autonomous driving\ntrajectory generation by harnessing the complementary strengths of diffusion\nprobabilistic models (a.k.a., diffusion models) and transformers. Our proposed\nframework, termed the \"World-Centric Diffusion Transformer\" (WcDT), optimizes\nthe entire trajectory generation process, from feature extraction to model\ninference. To enhance the scene diversity and stochasticity, the historical\ntrajectory data is first preprocessed and encoded into latent space using\nDenoising Diffusion Probabilistic Models (DDPM) enhanced with Diffusion with\nTransformer (DiT) blocks. Then, the latent features, historical trajectories,\nHD map features, and historical traffic signal information are fused with\nvarious transformer-based encoders. The encoded traffic scenes are then decoded\nby a trajectory decoder to generate multimodal future trajectories.\nComprehensive experimental results show that the proposed approach exhibits\nsuperior performance in generating both realistic and diverse trajectories,\nshowing its potential for integration into automatic driving simulation\nsystems.\n","authors":["Chen Yang","Aaron Xuxiang Tian","Dong Chen","Tianyu Shi","Arsalan Heydarian"],"pdf_url":"https://arxiv.org/pdf/2404.02082v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.02072v1","updated":"2024-04-02T16:20:02Z","published":"2024-04-02T16:20:02Z","title":"EGTR: Extracting Graph from Transformer for Scene Graph Generation","summary":" Scene Graph Generation (SGG) is a challenging task of detecting objects and\npredicting relationships between objects. 
After DETR was developed, one-stage\nSGG models based on a one-stage object detector have been actively studied.\nHowever, complex modeling is used to predict the relationship between objects,\nand the inherent relationship between object queries learned in the multi-head\nself-attention of the object detector has been neglected. We propose a\nlightweight one-stage SGG model that extracts the relation graph from the\nvarious relationships learned in the multi-head self-attention layers of the\nDETR decoder. By fully utilizing the self-attention by-products, the relation\ngraph can be extracted effectively with a shallow relation extraction head.\nConsidering the dependency of the relation extraction task on the object\ndetection task, we propose a novel relation smoothing technique that adjusts\nthe relation label adaptively according to the quality of the detected objects.\nBy the relation smoothing, the model is trained according to the continuous\ncurriculum that focuses on object detection task at the beginning of training\nand performs multi-task learning as the object detection performance gradually\nimproves. Furthermore, we propose a connectivity prediction task that predicts\nwhether a relation exists between object pairs as an auxiliary task of the\nrelation extraction. We demonstrate the effectiveness and efficiency of our\nmethod for the Visual Genome and Open Image V6 datasets. Our code is publicly\navailable at https://github.com/naver-ai/egtr .\n","authors":["Jinbae Im","JeongYeon Nam","Nokyung Park","Hyungmin Lee","Seunghyun Park"],"pdf_url":"https://arxiv.org/pdf/2404.02072v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2306.06077v3","updated":"2024-04-02T16:19:22Z","published":"2023-06-05T17:22:54Z","title":"Semantically-Prompted Language Models Improve Visual Descriptions","summary":" Language-vision models like CLIP have made significant strides in vision\ntasks, such as zero-shot image classification (ZSIC). However, generating\nspecific and expressive visual descriptions remains challenging; descriptions\nproduced by current methods are often ambiguous and lacking in granularity. To\ntackle these issues, we propose V-GLOSS: Visual Glosses, a novel method built\nupon two key ideas. The first is Semantic Prompting, which conditions a\nlanguage model on structured semantic knowledge. The second is a new\ncontrastive algorithm that elicits fine-grained distinctions between similar\nconcepts. With both ideas, we demonstrate that V-GLOSS improves visual\ndescriptions and achieves strong results in the zero-shot setting on general\nand fine-grained image-classification datasets, including ImageNet, STL-10,\nFGVC Aircraft, and Flowers 102. Moreover, these descriptive capabilities\ncontribute to enhancing image-generation performance. Finally, we introduce a\nquality-tested silver dataset with descriptions generated with V-GLOSS for all\nImageNet classes.\n","authors":["Michael Ogezi","Bradley Hauer","Grzegorz Kondrak"],"pdf_url":"https://arxiv.org/pdf/2306.06077v3.pdf","comment":"To appear at NAACL 2024"},{"id":"http://arxiv.org/abs/2404.02067v1","updated":"2024-04-02T16:07:50Z","published":"2024-04-02T16:07:50Z","title":"Red-Teaming Segment Anything Model","summary":" Foundation models have emerged as pivotal tools, tackling many complex tasks\nthrough pre-training on vast datasets and subsequent fine-tuning for specific\napplications. The Segment Anything Model is one of the first and most\nwell-known foundation models for computer vision segmentation tasks. 
This work\npresents a multi-faceted red-teaming analysis that tests the Segment Anything\nModel against challenging tasks: (1) We analyze the impact of style transfer on\nsegmentation masks, demonstrating that applying adverse weather conditions and\nraindrops to dashboard images of city roads significantly distorts generated\nmasks. (2) We focus on assessing whether the model can be used for attacks on\nprivacy, such as recognizing celebrities' faces, and show that the model\npossesses some undesired knowledge in this task. (3) Finally, we check how\nrobust the model is to adversarial attacks on segmentation masks under text\nprompts. We not only show the effectiveness of popular white-box attacks and\nresistance to black-box attacks but also introduce a novel approach - Focused\nIterative Gradient Attack (FIGA) that combines white-box approaches to\nconstruct an efficient attack resulting in a smaller number of modified pixels.\nAll of our testing methods and analyses indicate a need for enhanced safety\nmeasures in foundation models for image segmentation.\n","authors":["Krzysztof Jankowski","Bartlomiej Sobieski","Mateusz Kwiatkowski","Jakub Szulc","Michal Janik","Hubert Baniecki","Przemyslaw Biecek"],"pdf_url":"https://arxiv.org/pdf/2404.02067v1.pdf","comment":"CVPR 2024 - The 4th Workshop of Adversarial Machine Learning on\n Computer Vision: Robustness of Foundation Models"},{"id":"http://arxiv.org/abs/2404.02065v1","updated":"2024-04-02T16:06:20Z","published":"2024-04-02T16:06:20Z","title":"Multi-Level Label Correction by Distilling Proximate Patterns for\n Semi-supervised Semantic Segmentation","summary":" Semi-supervised semantic segmentation relieves the reliance on large-scale\nlabeled data by leveraging unlabeled data. Recent semi-supervised semantic\nsegmentation approaches mainly resort to pseudo-labeling methods to exploit\nunlabeled data. However, unreliable pseudo-labeling can undermine the\nsemi-supervision processes. In this paper, we propose an algorithm called\nMulti-Level Label Correction (MLLC), which aims to use graph neural networks to\ncapture structural relationships in Semantic-Level Graphs (SLGs) and\nClass-Level Graphs (CLGs) to rectify erroneous pseudo-labels. Specifically,\nSLGs represent semantic affinities between pairs of pixel features, and CLGs\ndescribe classification consistencies between pairs of pixel labels. With the\nsupport of proximate pattern information from graphs, MLLC can rectify\nincorrectly predicted pseudo-labels and can facilitate discriminative feature\nrepresentations. We design an end-to-end network to train and perform this\neffective label corrections mechanism. Experiments demonstrate that MLLC can\nsignificantly improve supervised baselines and outperforms state-of-the-art\napproaches in different scenarios on Cityscapes and PASCAL VOC 2012 datasets.\nSpecifically, MLLC improves the supervised baseline by at least 5% and 2% with\nDeepLabV2 and DeepLabV3+ respectively under different partition protocols.\n","authors":["Hui Xiao","Yuting Hong","Li Dong","Diqun Yan","Jiayan Zhuang","Junjie Xiong","Dongtai Liang","Chengbin Peng"],"pdf_url":"https://arxiv.org/pdf/2404.02065v1.pdf","comment":"12 pages, 8 figures. 
IEEE Transactions on Multimedia, 2024"},{"id":"http://arxiv.org/abs/2404.02059v1","updated":"2024-04-02T15:58:36Z","published":"2024-04-02T15:58:36Z","title":"IISAN: Efficiently Adapting Multimodal Representation for Sequential\n Recommendation with Decoupled PEFT","summary":" Multimodal foundation models are transformative in sequential recommender\nsystems, leveraging powerful representation learning capabilities. While\nParameter-efficient Fine-tuning (PEFT) is commonly used to adapt foundation\nmodels for recommendation tasks, most research prioritizes parameter\nefficiency, often overlooking critical factors like GPU memory efficiency and\ntraining speed. Addressing this gap, our paper introduces IISAN (Intra- and\nInter-modal Side Adapted Network for Multimodal Representation), a simple\nplug-and-play architecture using a Decoupled PEFT structure and exploiting both\nintra- and inter-modal adaptation.\n IISAN matches the performance of full fine-tuning (FFT) and state-of-the-art\nPEFT. More importantly, it significantly reduces GPU memory usage - from 47GB\nto just 3GB for multimodal sequential recommendation tasks. Additionally, it\naccelerates training time per epoch from 443s to 22s compared to FFT. This is\nalso a notable improvement over the Adapter and LoRA, which require 37-39 GB\nGPU memory and 350-380 seconds per epoch for training.\n Furthermore, we propose a new composite efficiency metric, TPME\n(Training-time, Parameter, and GPU Memory Efficiency) to alleviate the\nprevalent misconception that \"parameter efficiency represents overall\nefficiency\". TPME provides more comprehensive insights into practical\nefficiency comparisons between different methods. Besides, we give an\naccessible efficiency analysis of all PEFT and FFT approaches, which\ndemonstrate the superiority of IISAN. We release our codes and other materials\nat https://github.com/jjGenAILab/IISAN.\n","authors":["Junchen Fu","Xuri Ge","Xin Xin","Alexandros Karatzoglou","Ioannis Arapakis","Jie Wang","Joemon M Jose"],"pdf_url":"https://arxiv.org/pdf/2404.02059v1.pdf","comment":"Accepted by SIGIR2024"},{"id":"http://arxiv.org/abs/2307.09020v3","updated":"2024-04-02T15:46:19Z","published":"2023-07-18T07:20:31Z","title":"FISTNet: FusIon of STyle-path generative Networks for Facial Style\n Transfer","summary":" With the surge in emerging technologies such as Metaverse, spatial computing,\nand generative AI, the application of facial style transfer has gained a lot of\ninterest from researchers as well as startups enthusiasts alike. StyleGAN\nmethods have paved the way for transfer-learning strategies that could reduce\nthe dependency on the huge volume of data that is available for the training\nprocess. However, StyleGAN methods have the tendency of overfitting that\nresults in the introduction of artifacts in the facial images. Studies, such as\nDualStyleGAN, proposed the use of multipath networks but they require the\nnetworks to be trained for a specific style rather than generating a fusion of\nfacial styles at once. In this paper, we propose a FusIon of STyles (FIST)\nnetwork for facial images that leverages pre-trained multipath style transfer\nnetworks to eliminate the problem associated with lack of huge data volume in\nthe training phase along with the fusion of multiple styles at the output. We\nleverage pre-trained styleGAN networks with an external style pass that use\nresidual modulation block instead of a transform coding block. 
The method also\npreserves facial structure, identity, and details via the gated mapping unit\nintroduced in this study. The aforementioned components enable us to train the\nnetwork with very limited amount of data while generating high-quality stylized\nimages. Our training process adapts curriculum learning strategy to perform\nefficient, flexible style and model fusion in the generative space. We perform\nextensive experiments to show the superiority of FISTNet in comparison to\nexisting state-of-the-art methods.\n","authors":["Sunder Ali Khowaja","Lewis Nkenyereye","Ghulam Mujtaba","Ik Hyun Lee","Giancarlo Fortino","Kapal Dev"],"pdf_url":"https://arxiv.org/pdf/2307.09020v3.pdf","comment":"21 pages, 6 figures, 2 tables"},{"id":"http://arxiv.org/abs/2404.01188v2","updated":"2024-04-02T15:45:38Z","published":"2024-04-01T15:45:58Z","title":"MonoBox: Tightness-free Box-supervised Polyp Segmentation using\n Monotonicity Constraint","summary":" We propose MonoBox, an innovative box-supervised segmentation method\nconstrained by monotonicity to liberate its training from the user-unfriendly\nbox-tightness assumption. In contrast to conventional box-supervised\nsegmentation, where the box edges must precisely touch the target boundaries,\nMonoBox leverages imprecisely-annotated boxes to achieve robust pixel-wise\nsegmentation. The 'linchpin' is that, within the noisy zones around box edges,\nMonoBox discards the traditional misguiding multiple-instance learning loss,\nand instead optimizes a carefully-designed objective, termed monotonicity\nconstraint. Along directions transitioning from the foreground to background,\nthis new constraint steers responses to adhere to a trend of monotonically\ndecreasing values. Consequently, the originally unreliable learning within the\nnoisy zones is transformed into a correct and effective monotonicity\noptimization. Moreover, an adaptive label correction is introduced, enabling\nMonoBox to enhance the tightness of box annotations using predicted masks from\nthe previous epoch and dynamically shrink the noisy zones as training\nprogresses. We verify MonoBox in the box-supervised segmentation task of\npolyps, where satisfying box-tightness is challenging due to the vague\nboundaries between the polyp and normal tissues. Experiments on both public\nsynthetic and in-house real noisy datasets demonstrate that MonoBox exceeds\nother anti-noise state-of-the-arts by improving Dice by at least 5.5% and 3.3%,\nrespectively. Codes are at https://github.com/Huster-Hq/MonoBox.\n","authors":["Qiang Hu","Zhenyu Yi","Ying Zhou","Ting Li","Fan Huang","Mei Liu","Qiang Li","Zhiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.01188v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02046v1","updated":"2024-04-02T15:38:18Z","published":"2024-04-02T15:38:18Z","title":"Causality-based Transfer of Driving Scenarios to Unseen Intersections","summary":" Scenario-based testing of automated driving functions has become a promising\nmethod to reduce time and cost compared to real-world testing. In\nscenario-based testing automated functions are evaluated in a set of\npre-defined scenarios. These scenarios provide information about vehicle\nbehaviors, environmental conditions, or road characteristics using parameters.\nTo create realistic scenarios, parameters and parameter dependencies have to be\nfitted utilizing real-world data. However, due to the large variety of\nintersections and movement constellations found in reality, data may not be\navailable for certain scenarios. 
This paper proposes a methodology to\nsystematically analyze relations between parameters of scenarios. Bayesian\nnetworks are utilized to analyze causal dependencies in order to decrease the\namount of required data and to transfer causal patterns creating unseen\nscenarios. Thereby, infrastructural influences on movement patterns are\ninvestigated to generate realistic scenarios on unobserved intersections. For\nevaluation, scenarios and underlying parameters are extracted from the inD\ndataset. Movement patterns are estimated, transferred and checked against\nrecorded data from those initially unseen intersections.\n","authors":["Christoph Glasmacher","Michael Schuldes","Sleiman El Masri","Lutz Eckstein"],"pdf_url":"https://arxiv.org/pdf/2404.02046v1.pdf","comment":"6 pages, 8 figures, 1 table, Accepted to be published as part of the\n 35th IEEE Intelligent Vehicles Symposium, June 2 - 5, 2024, Korea"},{"id":"http://arxiv.org/abs/2404.02041v1","updated":"2024-04-02T15:34:52Z","published":"2024-04-02T15:34:52Z","title":"SelfPose3d: Self-Supervised Multi-Person Multi-View 3d Pose Estimation","summary":" We present a new self-supervised approach, SelfPose3d, for estimating 3d\nposes of multiple persons from multiple camera views. Unlike current\nstate-of-the-art fully-supervised methods, our approach does not require any 2d\nor 3d ground-truth poses and uses only the multi-view input images from a\ncalibrated camera setup and 2d pseudo poses generated from an off-the-shelf 2d\nhuman pose estimator. We propose two self-supervised learning objectives:\nself-supervised person localization in 3d space and self-supervised 3d pose\nestimation. We achieve self-supervised 3d person localization by training the\nmodel on synthetically generated 3d points, serving as 3d person root\npositions, and on the projected root-heatmaps in all the views. We then model\nthe 3d poses of all the localized persons with a bottleneck representation, map\nthem onto all views obtaining 2d joints, and render them using 2d Gaussian\nheatmaps in an end-to-end differentiable manner. Afterwards, we use the\ncorresponding 2d joints and heatmaps from the pseudo 2d poses for learning. To\nalleviate the intrinsic inaccuracy of the pseudo labels, we propose an adaptive\nsupervision attention mechanism to guide the self-supervision. Our experiments\nand analysis on three public benchmark datasets, including Panoptic, Shelf, and\nCampus, show the effectiveness of our approach, which is comparable to\nfully-supervised methods. Code is available at\n\\url{https://github.com/CAMMA-public/SelfPose3D}\n","authors":["Vinkle Srivastav","Keqi Chen","Nicolas Padoy"],"pdf_url":"https://arxiv.org/pdf/2404.02041v1.pdf","comment":"Accepted for CVPR 2024"},{"id":"http://arxiv.org/abs/2403.10488v2","updated":"2024-04-02T15:34:04Z","published":"2024-03-15T17:23:38Z","title":"Joint Multimodal Transformer for Emotion Recognition in the Wild","summary":" Systems for multimodal emotion recognition (MMER) can typically outperform\nunimodal systems by leveraging the inter- and intra-modal relationships\nbetween, e.g., visual, textual, physiological, and auditory modalities. In this\npaper, an MMER method is proposed that relies on a joint multimodal transformer\nfor fusion with key-based cross-attention. This framework aims to exploit the\ndiverse and complementary nature of different modalities to improve predictive\naccuracy. Separate backbones capture intra-modal spatiotemporal dependencies\nwithin each modality over video sequences. 
Subsequently, a joint multimodal\ntransformer fusion architecture integrates the individual modality embeddings,\nallowing the model to capture inter-modal and intra-modal relationships\neffectively. Extensive experiments on two challenging expression recognition\ntasks: (1) dimensional emotion recognition on the Affwild2 dataset (with face\nand voice), and (2) pain estimation on the Biovid dataset (with face and\nbiosensors), indicate that the proposed method can work effectively with\ndifferent modalities. Empirical results show that MMER systems with our\nproposed fusion method allow us to outperform relevant baseline and\nstate-of-the-art methods.\n","authors":["Paul Waligora","Haseeb Aslam","Osama Zeeshan","Soufiane Belharbi","Alessandro Lameiras Koerich","Marco Pedersoli","Simon Bacon","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2403.10488v2.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.10172v2","updated":"2024-04-02T15:24:24Z","published":"2023-09-18T21:41:04Z","title":"Enhancing wind field resolution in complex terrain through a\n knowledge-driven machine learning approach","summary":" Atmospheric flows are governed by a broad variety of spatio-temporal scales,\nthus making real-time numerical modeling of such turbulent flows in complex\nterrain at high resolution computationally intractable. In this study, we\ndemonstrate a neural network approach motivated by Enhanced Super-Resolution\nGenerative Adversarial Networks to upscale low-resolution wind fields to\ngenerate high-resolution wind fields in an actual wind farm in Bessaker,\nNorway. The neural network-based model is shown to successfully reconstruct\nfully resolved 3D velocity fields from a coarser scale while respecting the\nlocal terrain and that it easily outperforms trilinear interpolation. We also\ndemonstrate that by using appropriate cost function based on domain knowledge,\nwe can alleviate the use of adversarial training.\n","authors":["Jacob Wulff Wold","Florian Stadtmann","Adil Rasheed","Mandar Tabib","Omer San","Jan-Tore Horn"],"pdf_url":"https://arxiv.org/pdf/2309.10172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17922v2","updated":"2024-04-02T15:20:58Z","published":"2023-11-29T18:59:59Z","title":"A Simple Recipe for Language-guided Domain Generalized Segmentation","summary":" Generalization to new domains not seen during training is one of the\nlong-standing challenges in deploying neural networks in real-world\napplications. Existing generalization techniques either necessitate external\nimages for augmentation, and/or aim at learning invariant representations by\nimposing various alignment constraints. Large-scale pretraining has recently\nshown promising generalization capabilities, along with the potential of\nbinding different modalities. For instance, the advent of vision-language\nmodels like CLIP has opened the doorway for vision models to exploit the\ntextual modality. In this paper, we introduce a simple framework for\ngeneralizing semantic segmentation networks by employing language as the source\nof randomization. Our recipe comprises three key ingredients: (i) the\npreservation of the intrinsic CLIP robustness through minimal fine-tuning, (ii)\nlanguage-driven local style augmentation, and (iii) randomization by locally\nmixing the source and augmented styles during training. Extensive experiments\nreport state-of-the-art results on various generalization benchmarks. 
Code is\naccessible at https://github.com/astra-vision/FAMix .\n","authors":["Mohammad Fahes","Tuan-Hung Vu","Andrei Bursuc","Patrick Pérez","Raoul de Charette"],"pdf_url":"https://arxiv.org/pdf/2311.17922v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01232v2","updated":"2024-04-02T15:03:33Z","published":"2024-04-01T16:51:13Z","title":"Open-Vocabulary Federated Learning with Multimodal Prototyping","summary":" Existing federated learning (FL) studies usually assume the training label\nspace and test label space are identical. However, in real-world applications,\nthis assumption is too ideal to be true. A new user could come up with queries\nthat involve data from unseen classes, and such open-vocabulary queries would\ndirectly defect such FL systems. Therefore, in this work, we explicitly focus\non the under-explored open-vocabulary challenge in FL. That is, for a new user,\nthe global server shall understand her/his query that involves arbitrary\nunknown classes. To address this problem, we leverage the pre-trained\nvision-language models (VLMs). In particular, we present a novel adaptation\nframework tailored for VLMs in the context of FL, named as Federated Multimodal\nPrototyping (Fed-MP). Fed-MP adaptively aggregates the local model weights\nbased on light-weight client residuals, and makes predictions based on a novel\nmultimodal prototyping mechanism. Fed-MP exploits the knowledge learned from\nthe seen classes, and robustifies the adapted VLM to unseen categories. Our\nempirical evaluation on various datasets validates the effectiveness of Fed-MP.\n","authors":["Huimin Zeng","Zhenrui Yue","Dong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.01232v2.pdf","comment":"Accepted at NAACL 2024"},{"id":"http://arxiv.org/abs/2403.07888v2","updated":"2024-04-02T14:47:23Z","published":"2024-02-02T18:54:48Z","title":"Cross-modality debiasing: using language to mitigate sub-population\n shifts in imaging","summary":" Sub-population shift is a specific type of domain shift that highlights\nchanges in data distribution within specific sub-groups or populations between\ntraining and testing. Sub-population shift accounts for a significant source of\nalgorithmic bias and calls for distributional robustness. Recent studies found\ninherent distributional robustness in multi-modality foundation models, such as\nthe vision-language model CLIP, yet this robustness is vulnerable through\nparameter fine-tuning. In this paper, we propose leveraging the connection of\nrobustness among different modalities and reshaping the distributional\nrobustness of one modality with another. Specifically, in the context of the\ndistributional robustness of CLIP, we propose to leverage natural language\ninputs to debias the image feature representations, to improve worst-case\nperformance on sub-populations. 
Our extensive empirical studies show that image\nrepresentations debiased by natural language can achieve significant\nperformance improvement and reduction of performance instability under\nsub-population shifts.\n","authors":["Yijiang Pang","Bao Hoang","Jiayu Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.07888v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01998v1","updated":"2024-04-02T14:41:42Z","published":"2024-04-02T14:41:42Z","title":"Specularity Factorization for Low-Light Enhancement","summary":" We present a new additive image factorization technique that treats images to\nbe composed of multiple latent specular components which can be simply\nestimated recursively by modulating the sparsity during decomposition. Our\nmodel-driven {\\em RSFNet} estimates these factors by unrolling the optimization\ninto network layers requiring only a few scalars to be learned. The resultant\nfactors are interpretable by design and can be fused for different image\nenhancement tasks via a network or combined directly by the user in a\ncontrollable fashion. Based on RSFNet, we detail a zero-reference Low Light\nEnhancement (LLE) application trained without paired or unpaired supervision.\nOur system improves the state-of-the-art performance on standard benchmarks and\nachieves better generalization on multiple other datasets. We also integrate\nour factors with other task specific fusion networks for applications like\nderaining, deblurring and dehazing with negligible overhead thereby\nhighlighting the multi-domain and multi-task generalizability of our proposed\nRSFNet. The code and data is released for reproducibility on the project\nhomepage.\n","authors":["Saurabh Saini","P J Narayanan"],"pdf_url":"https://arxiv.org/pdf/2404.01998v1.pdf","comment":"CVPR 2024, Pages: 8(main)+4(references)+17(supp) = 29"},{"id":"http://arxiv.org/abs/2404.01995v1","updated":"2024-04-02T14:40:11Z","published":"2024-04-02T14:40:11Z","title":"A discussion about violin reduction: geometric analysis of contour lines\n and channel of minima","summary":" Some early violins have been reduced during their history to fit imposed\nmorphological standards, while more recent ones have been built directly to\nthese standards. We can observe differences between reduced and unreduced\ninstruments, particularly in their contour lines and channel of minima. In a\nrecent preliminary work, we computed and highlighted those two features for two\ninstruments using triangular 3D meshes acquired by photogrammetry, whose\nfidelity has been assessed and validated with sub-millimetre accuracy. We\npropose here an extension to a corpus of 38 violins, violas and cellos, and\nintroduce improved procedures, leading to a stronger discussion of the\ngeometric analysis. We first recall the material we are working with. We then\ndiscuss how to derive the best reference plane for the violin alignment, which\nis crucial for the computation of contour lines and channel of minima. 
Finally,\nwe show how to compute efficiently both characteristics and we illustrate our\nresults with a few examples.\n","authors":["Philémon Beghin","Anne-Emmanuelle Ceulemans","François Glineur"],"pdf_url":"https://arxiv.org/pdf/2404.01995v1.pdf","comment":"Paper accepted (before reviewing) for the Florence Heri-Tech 2024\n Conference"},{"id":"http://arxiv.org/abs/2404.01994v1","updated":"2024-04-02T14:40:04Z","published":"2024-04-02T14:40:04Z","title":"DELAN: Dual-Level Alignment for Vision-and-Language Navigation by\n Cross-Modal Contrastive Learning","summary":" Vision-and-Language navigation (VLN) requires an agent to navigate in unseen\nenvironment by following natural language instruction. For task completion, the\nagent needs to align and integrate various navigation modalities, including\ninstruction, observation and navigation history. Existing works primarily\nconcentrate on cross-modal attention at the fusion stage to achieve this\nobjective. Nevertheless, modality features generated by disparate uni-encoders\nreside in their own spaces, leading to a decline in the quality of cross-modal\nfusion and decision. To address this problem, we propose a Dual-levEL AligNment\n(DELAN) framework by cross-modal contrastive learning. This framework is\ndesigned to align various navigation-related modalities before fusion, thereby\nenhancing cross-modal interaction and action decision-making. Specifically, we\ndivide the pre-fusion alignment into dual levels: instruction-history level and\nlandmark-observation level according to their semantic correlations. We also\nreconstruct a dual-level instruction for adaptation to the dual-level\nalignment. As the training signals for pre-fusion alignment are extremely\nlimited, self-supervised contrastive learning strategies are employed to\nenforce the matching between different modalities. Our approach seamlessly\nintegrates with the majority of existing models, resulting in improved\nnavigation performance on various VLN benchmarks, including R2R, R4R, RxR and\nCVDN.\n","authors":["Mengfei Du","Binhao Wu","Jiwen Zhang","Zhihao Fan","Zejun Li","Ruipu Luo","Xuanjing Huang","Zhongyu Wei"],"pdf_url":"https://arxiv.org/pdf/2404.01994v1.pdf","comment":"Accepted by LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2312.00057v2","updated":"2024-04-02T14:28:26Z","published":"2023-11-29T12:10:00Z","title":"VA3: Virtually Assured Amplification Attack on Probabilistic Copyright\n Protection for Text-to-Image Generative Models","summary":" The booming use of text-to-image generative models has raised concerns about\ntheir high risk of producing copyright-infringing content. While probabilistic\ncopyright protection methods provide a probabilistic guarantee against such\ninfringement, in this paper, we introduce Virtually Assured Amplification\nAttack (VA3), a novel online attack framework that exposes the vulnerabilities\nof these protection mechanisms. The proposed framework significantly amplifies\nthe probability of generating infringing content on the sustained interactions\nwith generative models and a non-trivial lower-bound on the success probability\nof each engagement. Our theoretical and experimental results demonstrate the\neffectiveness of our approach under various scenarios. These findings highlight\nthe potential risk of implementing probabilistic copyright protection in\npractical applications of text-to-image generative models. 
Code is available at\nhttps://github.com/South7X/VA3.\n","authors":["Xiang Li","Qianli Shen","Kenji Kawaguchi"],"pdf_url":"https://arxiv.org/pdf/2312.00057v2.pdf","comment":"18 pages, 9 figures. Accept to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01988v1","updated":"2024-04-02T14:26:18Z","published":"2024-04-02T14:26:18Z","title":"Cooperative Students: Navigating Unsupervised Domain Adaptation in\n Nighttime Object Detection","summary":" Unsupervised Domain Adaptation (UDA) has shown significant advancements in\nobject detection under well-lit conditions; however, its performance degrades\nnotably in low-visibility scenarios, especially at night, posing challenges not\nonly for its adaptability in low signal-to-noise ratio (SNR) conditions but\nalso for the reliability and efficiency of automated vehicles. To address this\nproblem, we propose a \\textbf{Co}operative \\textbf{S}tudents (\\textbf{CoS})\nframework that innovatively employs global-local transformations (GLT) and a\nproxy-based target consistency (PTC) mechanism to capture the spatial\nconsistency in day- and night-time scenarios effectively, and thus bridge the\nsignificant domain shift across contexts. Building upon this, we further devise\nan adaptive IoU-informed thresholding (AIT) module to gradually avoid\noverlooking potential true positives and enrich the latent information in the\ntarget domain. Comprehensive experiments show that CoS essentially enhanced UDA\nperformance in low-visibility conditions and surpasses current state-of-the-art\ntechniques, achieving an increase in mAP of 3.0\\%, 1.9\\%, and 2.5\\% on BDD100K,\nSHIFT, and ACDC datasets, respectively. Code is available at\nhttps://github.com/jichengyuan/Cooperitive_Students.\n","authors":["Jicheng Yuan","Anh Le-Tuan","Manfred Hauswirth","Danh Le-Phuoc"],"pdf_url":"https://arxiv.org/pdf/2404.01988v1.pdf","comment":"Code is available at\n https://github.com/jichengyuan/Cooperitive_Students"},{"id":"http://arxiv.org/abs/2404.01984v1","updated":"2024-04-02T14:22:04Z","published":"2024-04-02T14:22:04Z","title":"Fashion Style Editing with Generative Human Prior","summary":" Image editing has been a long-standing challenge in the research community\nwith its far-reaching impact on numerous applications. Recently, text-driven\nmethods started to deliver promising results in domains like human faces, but\ntheir applications to more complex domains have been relatively limited. In\nthis work, we explore the task of fashion style editing, where we aim to\nmanipulate the fashion style of human imagery using text descriptions.\nSpecifically, we leverage a generative human prior and achieve fashion style\nediting by navigating its learned latent space. We first verify that the\nexisting text-driven editing methods fall short for our problem due to their\noverly simplified guidance signal, and propose two directions to reinforce the\nguidance: textual augmentation and visual referencing. 
Combined with our\nempirical findings on the latent space structure, our Fashion Style Editing\nframework (FaSE) successfully projects abstract fashion concepts onto human\nimages and introduces exciting new applications to the field.\n","authors":["Chaerin Kong","Seungyong Lee","Soohyeok Im","Wonsuk Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01984v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2404.01976v1","updated":"2024-04-02T14:16:59Z","published":"2024-04-02T14:16:59Z","title":"Joint-Task Regularization for Partially Labeled Multi-Task Learning","summary":" Multi-task learning has become increasingly popular in the machine learning\nfield, but its practicality is hindered by the need for large, labeled\ndatasets. Most multi-task learning methods depend on fully labeled datasets\nwherein each input example is accompanied by ground-truth labels for all target\ntasks. Unfortunately, curating such datasets can be prohibitively expensive and\nimpractical, especially for dense prediction tasks which require per-pixel\nlabels for each image. With this in mind, we propose Joint-Task Regularization\n(JTR), an intuitive technique which leverages cross-task relations to\nsimultaneously regularize all tasks in a single joint-task latent space to\nimprove learning when data is not fully labeled for all tasks. JTR stands out\nfrom existing approaches in that it regularizes all tasks jointly rather than\nseparately in pairs -- therefore, it achieves linear complexity relative to the\nnumber of tasks while previous methods scale quadratically. To demonstrate the\nvalidity of our approach, we extensively benchmark our method across a wide\nvariety of partially labeled scenarios based on NYU-v2, Cityscapes, and\nTaskonomy.\n","authors":["Kento Nishi","Junsik Kim","Wanhua Li","Hanspeter Pfister"],"pdf_url":"https://arxiv.org/pdf/2404.01976v1.pdf","comment":"Accepted paper to CVPR 2024 (main conference)"},{"id":"http://arxiv.org/abs/2308.13150v8","updated":"2024-04-02T14:14:26Z","published":"2023-08-25T03:08:41Z","title":"Dual-Activated Lightweight Attention ResNet50 for Automatic\n Histopathology Breast Cancer Image Classification","summary":" Automatic breast cancer classification in histopathology images is crucial\nfor precise diagnosis and treatment planning. Recently, classification\napproaches based on the ResNet architecture have gained popularity for\nsignificantly improving accuracy by using skip connections to mitigate\nvanishing gradient problems, thereby integrating low-level and high-level\nfeature information. Nevertheless, the conventional ResNet architecture faces\nchallenges such as data imbalance and limited interpretability, necessitating\ncross-domain knowledge and collaboration among medical experts. This study\neffectively addresses these challenges by introducing a novel method for breast\ncancer classification, the Dual-Activated Lightweight Attention ResNet50\n(DALAResNet50) model. It integrates a pre-trained ResNet50 model with a\nlightweight attention mechanism, embedding an attention module in the fourth\nlayer of ResNet50 and incorporating two fully connected layers with LeakyReLU\nand ReLU activation functions to enhance feature learning capabilities. The\nDALAResNet50 method was tested on breast cancer histopathology images from the\nBreakHis Database across magnification factors of 40X, 100X, 200X, and 400X,\nachieving accuracies of 98.5%, 98.7%, 97.9%, and 94.3%, respectively. 
It was\nalso compared with established deep learning models such as SEResNet50,\nDenseNet121, VGG16, VGG16Inception, ViT, Swin-Transformer, Dinov2_Vitb14, and\nResNet50. The reported results of DALAResNet50 have been shown to outperform\nthe compared approaches regarding accuracy, F1 score, IBA, and GMean,\ndemonstrating significant robustness and broad applicability when dealing with\ndifferent magnifications and imbalanced breast cancer datasets\n","authors":["Suxing Liu"],"pdf_url":"https://arxiv.org/pdf/2308.13150v8.pdf","comment":"13 pages, 7 figures,7 tables"},{"id":"http://arxiv.org/abs/2310.13076v2","updated":"2024-04-02T14:14:16Z","published":"2023-10-19T18:14:33Z","title":"PatchCURE: Improving Certifiable Robustness, Model Utility, and\n Computation Efficiency of Adversarial Patch Defenses","summary":" State-of-the-art defenses against adversarial patch attacks can now achieve\nstrong certifiable robustness with a marginal drop in model utility. However,\nthis impressive performance typically comes at the cost of 10-100x more\ninference-time computation compared to undefended models -- the research\ncommunity has witnessed an intense three-way trade-off between certifiable\nrobustness, model utility, and computation efficiency. In this paper, we\npropose a defense framework named PatchCURE to approach this trade-off problem.\nPatchCURE provides sufficient \"knobs\" for tuning defense performance and allows\nus to build a family of defenses: the most robust PatchCURE instance can match\nthe performance of any existing state-of-the-art defense (without efficiency\nconsiderations); the most efficient PatchCURE instance has similar inference\nefficiency as undefended models. Notably, PatchCURE achieves state-of-the-art\nrobustness and utility performance across all different efficiency levels,\ne.g., 16-23% absolute clean accuracy and certified robust accuracy advantages\nover prior defenses when requiring computation efficiency to be close to\nundefended models. The family of PatchCURE defenses enables us to flexibly\nchoose appropriate defenses to satisfy given computation and/or utility\nconstraints in practice.\n","authors":["Chong Xiang","Tong Wu","Sihui Dai","Jonathan Petit","Suman Jana","Prateek Mittal"],"pdf_url":"https://arxiv.org/pdf/2310.13076v2.pdf","comment":"USENIX Security 2024. (extended) technical report"},{"id":"http://arxiv.org/abs/2404.01964v1","updated":"2024-04-02T13:57:30Z","published":"2024-04-02T13:57:30Z","title":"CAM-Based Methods Can See through Walls","summary":" CAM-based methods are widely-used post-hoc interpretability method that\nproduce a saliency map to explain the decision of an image classification\nmodel. The saliency map highlights the important areas of the image relevant to\nthe prediction. In this paper, we show that most of these methods can\nincorrectly attribute an important score to parts of the image that the model\ncannot see. We show that this phenomenon occurs both theoretically and\nexperimentally. On the theory side, we analyze the behavior of GradCAM on a\nsimple masked CNN model at initialization. Experimentally, we train a VGG-like\nmodel constrained to not use the lower part of the image and nevertheless\nobserve positive scores in the unseen part of the image. This behavior is\nevaluated quantitatively on two new datasets. 
We believe that this is\nproblematic, potentially leading to mis-interpretation of the model's behavior.\n","authors":["Magamed Taimeskhanov","Ronan Sicre","Damien Garreau"],"pdf_url":"https://arxiv.org/pdf/2404.01964v1.pdf","comment":"25 pages, 9 figures"},{"id":"http://arxiv.org/abs/2307.10974v2","updated":"2024-04-02T13:57:22Z","published":"2023-07-20T16:00:19Z","title":"Deep Multi-Threshold Spiking-UNet for Image Processing","summary":" U-Net, known for its simple yet efficient architecture, is widely utilized\nfor image processing tasks and is particularly suitable for deployment on\nneuromorphic chips. This paper introduces the novel concept of Spiking-UNet for\nimage processing, which combines the power of Spiking Neural Networks (SNNs)\nwith the U-Net architecture. To achieve an efficient Spiking-UNet, we face two\nprimary challenges: ensuring high-fidelity information propagation through the\nnetwork via spikes and formulating an effective training strategy. To address\nthe issue of information loss, we introduce multi-threshold spiking neurons,\nwhich improve the efficiency of information transmission within the\nSpiking-UNet. For the training strategy, we adopt a conversion and fine-tuning\npipeline that leverage pre-trained U-Net models. During the conversion process,\nsignificant variability in data distribution across different parts is observed\nwhen utilizing skip connections. Therefore, we propose a connection-wise\nnormalization method to prevent inaccurate firing rates. Furthermore, we adopt\na flow-based training method to fine-tune the converted models, reducing time\nsteps while preserving performance. Experimental results show that, on image\nsegmentation and denoising, our Spiking-UNet achieves comparable performance to\nits non-spiking counterpart, surpassing existing SNN methods. Compared with the\nconverted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference\ntime by approximately 90\\%. This research broadens the application scope of\nSNNs in image processing and is expected to inspire further exploration in the\nfield of neuromorphic engineering. The code for our Spiking-UNet implementation\nis available at https://github.com/SNNresearch/Spiking-UNet.\n","authors":["Hebei Li","Yueyi Zhang","Zhiwei Xiong","Zheng-jun Zha","Xiaoyan Sun"],"pdf_url":"https://arxiv.org/pdf/2307.10974v2.pdf","comment":"Accepted in NeuroComputing"},{"id":"http://arxiv.org/abs/2404.01959v1","updated":"2024-04-02T13:54:22Z","published":"2024-04-02T13:54:22Z","title":"Bi-LORA: A Vision-Language Approach for Synthetic Image Detection","summary":" Advancements in deep image synthesis techniques, such as generative\nadversarial networks (GANs) and diffusion models (DMs), have ushered in an era\nof generating highly realistic images. While this technological progress has\ncaptured significant interest, it has also raised concerns about the potential\ndifficulty in distinguishing real images from their synthetic counterparts.\nThis paper takes inspiration from the potent convergence capabilities between\nvision and language, coupled with the zero-shot nature of vision-language\nmodels (VLMs). 
We introduce an innovative method called Bi-LORA that leverages\nVLMs, combined with low-rank adaptation (LORA) tuning techniques, to enhance\nthe precision of synthetic image detection for unseen model-generated images.\nThe pivotal conceptual shift in our methodology revolves around reframing\nbinary classification as an image captioning task, leveraging the distinctive\ncapabilities of cutting-edge VLM, notably bootstrapping language image\npre-training (BLIP2). Rigorous and comprehensive experiments are conducted to\nvalidate the effectiveness of our proposed approach, particularly in detecting\nunseen diffusion-generated images from unknown diffusion-based generative\nmodels during training, showcasing robustness to noise, and demonstrating\ngeneralization capabilities to GANs. The obtained results showcase an\nimpressive average accuracy of 93.41% in synthetic image detection on unseen\ngeneration models. The code and models associated with this research can be\npublicly accessed at https://github.com/Mamadou-Keita/VLM-DETECT.\n","authors":["Mamadou Keita","Wassim Hamidouche","Hessen Bougueffa Eutamene","Abdenour Hadid","Abdelmalik Taleb-Ahmed"],"pdf_url":"https://arxiv.org/pdf/2404.01959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01952v1","updated":"2024-04-02T13:47:15Z","published":"2024-04-02T13:47:15Z","title":"Automatic Wood Pith Detector: Local Orientation Estimation and Robust\n Accumulation","summary":" A fully automated technique for wood pith detection (APD), relying on the\nconcentric shape of the structure of wood ring slices, is introduced. The\nmethod estimates the ring's local orientations using the 2D structure tensor\nand finds the pith position, optimizing a cost function designed for this\nproblem. We also present a variant (APD-PCL), using the parallel coordinates\nspace, that enhances the method's effectiveness when there are no clear tree\nring patterns. Furthermore, refining previous work by Kurdthongmee, a YoloV8\nnet is trained for pith detection, producing a deep learning-based approach to\nthe same problem (APD-DL). All methods were tested on seven datasets, including\nimages captured under diverse conditions (controlled laboratory settings,\nsawmill, and forest) and featuring various tree species (Pinus taeda, Douglas\nfir, Abies alba, and Gleditsia triacanthos). All proposed approaches outperform\nexisting state-of-the-art methods and can be used in CPU-based real-time\napplications. Additionally, we provide a novel dataset comprising images of\ngymnosperm and angiosperm species. Dataset and source code are available at\nhttp://github.com/hmarichal93/apd.\n","authors":["Henry Marichal","Diego Passarella","Gregory Randall"],"pdf_url":"https://arxiv.org/pdf/2404.01952v1.pdf","comment":"18 pages, presented to ICPR 2024 conference"},{"id":"http://arxiv.org/abs/2404.01948v1","updated":"2024-04-02T13:43:08Z","published":"2024-04-02T13:43:08Z","title":"Quantifying Noise of Dynamic Vision Sensor","summary":" Dynamic visual sensors (DVS) are characterized by a large amount of\nbackground activity (BA) noise, which it is mixed with the original (cleaned)\nsensor signal. The dynamic nature of the signal and the absence in practical\napplication of the ground truth, it clearly makes difficult to distinguish\nbetween noise and the cleaned sensor signals using standard image processing\ntechniques. In this letter, a new technique is presented to characterise BA\nnoise derived from the Detrended Fluctuation Analysis (DFA). 
The proposed\ntechnique can be used to address an existing DVS issues, which is how to\nquantitatively characterised noise and signal without ground truth, and how to\nderive an optimal denoising filter parameters. The solution of the latter\nproblem is demonstrated for the popular real moving-car dataset.\n","authors":["Evgeny V. Votyakov","Alessandro Artusi"],"pdf_url":"https://arxiv.org/pdf/2404.01948v1.pdf","comment":"5 pages, 4 figures, submitted to the IEEE Signal Processing Letters"},{"id":"http://arxiv.org/abs/2404.01946v1","updated":"2024-04-02T13:42:29Z","published":"2024-04-02T13:42:29Z","title":"Synthetic Data for Robust Stroke Segmentation","summary":" Deep learning-based semantic segmentation in neuroimaging currently requires\nhigh-resolution scans and extensive annotated datasets, posing significant\nbarriers to clinical applicability. We present a novel synthetic framework for\nthe task of lesion segmentation, extending the capabilities of the established\nSynthSeg approach to accommodate large heterogeneous pathologies with\nlesion-specific augmentation strategies. Our method trains deep learning\nmodels, demonstrated here with the UNet architecture, using label maps derived\nfrom healthy and stroke datasets, facilitating the segmentation of both healthy\ntissue and pathological lesions without sequence-specific training data.\nEvaluated against in-domain and out-of-domain (OOD) datasets, our framework\ndemonstrates robust performance, rivaling current methods within the training\ndomain and significantly outperforming them on OOD data. This contribution\nholds promise for advancing medical imaging analysis in clinical settings,\nespecially for stroke pathology, by enabling reliable segmentation across\nvaried imaging sequences with reduced dependency on large annotated corpora.\nCode and weights available at https://github.com/liamchalcroft/SynthStroke.\n","authors":["Liam Chalcroft","Ioannis Pappas","Cathy J. Price","John Ashburner"],"pdf_url":"https://arxiv.org/pdf/2404.01946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01945v1","updated":"2024-04-02T13:41:22Z","published":"2024-04-02T13:41:22Z","title":"Event-assisted Low-Light Video Object Segmentation","summary":" In the realm of video object segmentation (VOS), the challenge of operating\nunder low-light conditions persists, resulting in notably degraded image\nquality and compromised accuracy when comparing query and memory frames for\nsimilarity computation. Event cameras, characterized by their high dynamic\nrange and ability to capture motion information of objects, offer promise in\nenhancing object visibility and aiding VOS methods under such low-light\nconditions. This paper introduces a pioneering framework tailored for low-light\nVOS, leveraging event camera data to elevate segmentation accuracy. Our\napproach hinges on two pivotal components: the Adaptive Cross-Modal Fusion\n(ACMF) module, aimed at extracting pertinent features while fusing image and\nevent modalities to mitigate noise interference, and the Event-Guided Memory\nMatching (EGMM) module, designed to rectify the issue of inaccurate matching\nprevalent in low-light settings. Additionally, we present the creation of a\nsynthetic LLE-DAVIS dataset and the curation of a real-world LLE-VOS dataset,\nencompassing frames and events. 
Experimental evaluations corroborate the\nefficacy of our method across both datasets, affirming its effectiveness in\nlow-light scenarios.\n","authors":["Hebei Li","Jin Wang","Jiahui Yuan","Yue Li","Wenming Weng","Yansong Peng","Yueyi Zhang","Zhiwei Xiong","Xiaoyan Sun"],"pdf_url":"https://arxiv.org/pdf/2404.01945v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01943v1","updated":"2024-04-02T13:36:03Z","published":"2024-04-02T13:36:03Z","title":"Lookahead Exploration with Neural Radiance Representation for Continuous\n Vision-Language Navigation","summary":" Vision-and-language navigation (VLN) enables the agent to navigate to a\nremote location following the natural language instruction in 3D environments.\nAt each navigation step, the agent selects from possible candidate locations\nand then makes the move. For better navigation planning, the lookahead\nexploration strategy aims to effectively evaluate the agent's next action by\naccurately anticipating the future environment of candidate locations. To this\nend, some existing works predict RGB images for future environments, while this\nstrategy suffers from image distortion and high computational cost. To address\nthese issues, we propose the pre-trained hierarchical neural radiance\nrepresentation model (HNR) to produce multi-level semantic features for future\nenvironments, which are more robust and efficient than pixel-wise RGB\nreconstruction. Furthermore, with the predicted future environmental\nrepresentations, our lookahead VLN model is able to construct the navigable\nfuture path tree and select the optimal path via efficient parallel evaluation.\nExtensive experiments on the VLN-CE datasets confirm the effectiveness of our\nmethod.\n","authors":["Zihan Wang","Xiangyang Li","Jiahao Yang","Yeqi Liu","Junjie Hu","Ming Jiang","Shuqiang Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.01943v1.pdf","comment":"Accepted by CVPR 2024. The code is available at\n https://github.com/MrZihan/HNR-VLN"},{"id":"http://arxiv.org/abs/2404.01941v1","updated":"2024-04-02T13:33:31Z","published":"2024-04-02T13:33:31Z","title":"LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging","summary":" Human pose and shape (HPS) estimation with lensless imaging is not only\nbeneficial to privacy protection but also can be used in covert surveillance\nscenarios due to the small size and simple structure of this device. However,\nthis task presents significant challenges due to the inherent ambiguity of the\ncaptured measurements and lacks effective methods for directly estimating human\npose and shape from lensless data. In this paper, we propose the first\nend-to-end framework to recover 3D human poses and shapes from lensless\nmeasurements to our knowledge. We specifically design a multi-scale lensless\nfeature decoder to decode the lensless measurements through the optically\nencoded mask for efficient feature extraction. We also propose a double-head\nauxiliary supervision mechanism to improve the estimation accuracy of human\nlimb ends. 
Besides, we establish a lensless imaging system and verify the\neffectiveness of our method on various datasets acquired by our lensless\nimaging system.\n","authors":["Haoyang Ge","Qiao Feng","Hailong Jia","Xiongzheng Li","Xiangjun Yin","You Zhou","Jingyu Yang","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2404.01941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16003v2","updated":"2024-04-02T13:31:41Z","published":"2024-03-24T04:22:37Z","title":"Diverse Representation Embedding for Lifelong Person Re-Identification","summary":" Lifelong Person Re-Identification (LReID) aims to continuously learn from\nsuccessive data streams, matching individuals across multiple cameras. The key\nchallenge for LReID is how to effectively preserve old knowledge while\nincrementally learning new information, which is caused by task-level domain\ngaps and limited old task datasets. Existing methods based on CNN backbone are\ninsufficient to explore the representation of each instance from different\nperspectives, limiting model performance on limited old task datasets and new\ntask datasets. Unlike these methods, we propose a Diverse Representations\nEmbedding (DRE) framework that first explores a pure transformer for LReID. The\nproposed DRE preserves old knowledge while adapting to new information based on\ninstance-level and task-level layout. Concretely, an Adaptive Constraint Module\n(ACM) is proposed to implement integration and push away operations between\nmultiple overlapping representations generated by transformer-based backbone,\nobtaining rich and discriminative representations for each instance to improve\nadaptive ability of LReID. Based on the processed diverse representations, we\npropose Knowledge Update (KU) and Knowledge Preservation (KP) strategies at the\ntask-level layout by introducing the adjustment model and the learner model. KU\nstrategy enhances the adaptive learning ability of learner models for new\ninformation under the adjustment model prior, and KP strategy preserves old\nknowledge operated by representation-level alignment and logit-level\nsupervision in limited old task datasets while guaranteeing the adaptive\nlearning information capacity of the LReID model. Compared to state-of-the-art\nmethods, our method achieves significantly improved performance in holistic,\nlarge-scale, and occluded datasets.\n","authors":["Shiben Liu","Huijie Fan","Qiang Wang","Xiai Chen","Zhi Han","Yandong Tang"],"pdf_url":"https://arxiv.org/pdf/2403.16003v2.pdf","comment":"11 pages,7 Tables,3 Figures"},{"id":"http://arxiv.org/abs/2404.01933v1","updated":"2024-04-02T13:27:28Z","published":"2024-04-02T13:27:28Z","title":"PREGO: online mistake detection in PRocedural EGOcentric videos","summary":" Promptly identifying procedural errors from egocentric videos in an online\nsetting is highly challenging and valuable for detecting mistakes as soon as\nthey happen. This capability has a wide range of applications across various\nfields, such as manufacturing and healthcare. The nature of procedural mistakes\nis open-set since novel types of failures might occur, which calls for\none-class classifiers trained on correctly executed procedures. However, no\ntechnique can currently detect open-set procedural mistakes online. We propose\nPREGO, the first online one-class classification model for mistake detection in\nPRocedural EGOcentric videos. PREGO is based on an online action recognition\ncomponent to model the current action, and a symbolic reasoning module to\npredict the next actions. 
Mistake detection is performed by comparing the\nrecognized current action with the expected future one. We evaluate PREGO on\ntwo procedural egocentric video datasets, Assembly101 and Epic-tent, which we\nadapt for online benchmarking of procedural mistake detection to establish\nsuitable benchmarks, thus defining the Assembly101-O and Epic-tent-O datasets,\nrespectively.\n","authors":["Alessandro Flaborea","Guido Maria D'Amely di Melendugno","Leonardo Plini","Luca Scofano","Edoardo De Matteis","Antonino Furnari","Giovanni Maria Farinella","Fabio Galasso"],"pdf_url":"https://arxiv.org/pdf/2404.01933v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2312.16476v5","updated":"2024-04-02T13:25:04Z","published":"2023-12-27T08:50:01Z","title":"SVGDreamer: Text Guided SVG Generation with Diffusion Model","summary":" Recently, text-guided scalable vector graphics (SVGs) synthesis has shown\npromise in domains such as iconography and sketch. However, existing\ntext-to-SVG generation methods lack editability and struggle with visual\nquality and result diversity. To address these limitations, we propose a novel\ntext-guided vector graphics synthesis method called SVGDreamer. SVGDreamer\nincorporates a semantic-driven image vectorization (SIVE) process that enables\nthe decomposition of synthesis into foreground objects and background, thereby\nenhancing editability. Specifically, the SIVE process introduces\nattention-based primitive control and an attention-mask loss function for\neffective control and manipulation of individual elements. Additionally, we\npropose a Vectorized Particle-based Score Distillation (VPSD) approach to\naddress issues of shape over-smoothing, color over-saturation, limited\ndiversity, and slow convergence of the existing text-to-SVG generation methods\nby modeling SVGs as distributions of control points and colors. Furthermore,\nVPSD leverages a reward model to re-weight vector particles, which improves\naesthetic appeal and accelerates convergence. Extensive experiments are\nconducted to validate the effectiveness of SVGDreamer, demonstrating its\nsuperiority over baseline methods in terms of editability, visual quality, and\ndiversity. Project page:\n\\href{https://ximinng.github.io/SVGDreamer-project/}{https://ximinng.github.io/SVGDreamer-project/}\n","authors":["Ximing Xing","Haitao Zhou","Chuang Wang","Jing Zhang","Dong Xu","Qian Yu"],"pdf_url":"https://arxiv.org/pdf/2312.16476v5.pdf","comment":"Accepted by CVPR 2024. project link:\n https://ximinng.github.io/SVGDreamer-project/"},{"id":"http://arxiv.org/abs/2204.14030v5","updated":"2024-04-02T13:24:37Z","published":"2022-04-29T11:55:35Z","title":"Neural Implicit Representations for Physical Parameter Inference from a\n Single Video","summary":" Neural networks have recently been used to analyze diverse physical systems\nand to identify the underlying dynamics. While existing methods achieve\nimpressive results, they are limited by their strong demand for training data\nand their weak generalization abilities to out-of-distribution data. To\novercome these limitations, in this work we propose to combine neural implicit\nrepresentations for appearance modeling with neural ordinary differential\nequations (ODEs) for modelling physical phenomena to obtain a dynamic scene\nrepresentation that can be identified directly from visual observations. 
Our\nproposed model combines several unique advantages: (i) Contrary to existing\napproaches that require large training datasets, we are able to identify\nphysical parameters from only a single video. (ii) The use of neural implicit\nrepresentations enables the processing of high-resolution videos and the\nsynthesis of photo-realistic images. (iii) The embedded neural ODE has a known\nparametric form that allows for the identification of interpretable physical\nparameters, and (iv) long-term prediction in state space. (v) Furthermore, the\nphoto-realistic rendering of novel scenes with modified physical parameters\nbecomes possible.\n","authors":["Florian Hofherr","Lukas Koestler","Florian Bernard","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2204.14030v5.pdf","comment":"Published in IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2023"},{"id":"http://arxiv.org/abs/2404.01929v1","updated":"2024-04-02T13:23:21Z","published":"2024-04-02T13:23:21Z","title":"Towards Enhanced Analysis of Lung Cancer Lesions in EBUS-TBNA -- A\n Semi-Supervised Video Object Detection Method","summary":" This study aims to establish a computer-aided diagnostic system for lung\nlesions using bronchoscope endobronchial ultrasound (EBUS) to assist physicians\nin identifying lesion areas. During EBUS-transbronchial needle aspiration\n(EBUS-TBNA) procedures, physicians rely on grayscale ultrasound images to\ndetermine the location of lesions. However, these images often contain\nsignificant noise and can be influenced by surrounding tissues or blood\nvessels, making interpretation challenging. Previous research has lacked the\napplication of object detection models to EBUS-TBNA, and there has been no\nwell-defined solution for annotating the EBUS-TBNA dataset. In related studies\non ultrasound images, although models have been successful in capturing target\nregions for their respective tasks, their training and predictions have been\nbased on two-dimensional images, limiting their ability to leverage temporal\nfeatures for improved predictions. This study introduces a three-dimensional\nimage-based object detection model. It utilizes an attention mechanism to\ncapture temporal correlations and we will implements a filtering mechanism to\nselect relevant information from previous frames. Subsequently, a\nteacher-student model training approach is employed to optimize the model\nfurther, leveraging unlabeled data. To mitigate the impact of poor-quality\npseudo-labels on the student model, we will add a special Gaussian Mixture\nModel (GMM) to ensure the quality of pseudo-labels.\n","authors":["Jyun-An Lin","Yun-Chien Cheng","Ching-Kai Lin"],"pdf_url":"https://arxiv.org/pdf/2404.01929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01925v1","updated":"2024-04-02T13:19:45Z","published":"2024-04-02T13:19:45Z","title":"Improving Bird's Eye View Semantic Segmentation by Task Decomposition","summary":" Semantic segmentation in bird's eye view (BEV) plays a crucial role in\nautonomous driving. Previous methods usually follow an end-to-end pipeline,\ndirectly predicting the BEV segmentation map from monocular RGB inputs.\nHowever, the challenge arises when the RGB inputs and BEV targets from distinct\nperspectives, making the direct point-to-point predicting hard to optimize. In\nthis paper, we decompose the original BEV segmentation task into two stages,\nnamely BEV map reconstruction and RGB-BEV feature alignment. 
In the first\nstage, we train a BEV autoencoder to reconstruct the BEV segmentation maps\ngiven corrupted noisy latent representation, which urges the decoder to learn\nfundamental knowledge of typical BEV patterns. The second stage involves\nmapping RGB input images into the BEV latent space of the first stage, directly\noptimizing the correlations between the two views at the feature level. Our\napproach simplifies the complexity of combining perception and generation into\ndistinct steps, equipping the model to handle intricate and challenging scenes\neffectively. Besides, we propose to transform the BEV segmentation map from the\nCartesian to the polar coordinate system to establish the column-wise\ncorrespondence between RGB images and BEV maps. Moreover, our method requires\nneither multi-scale features nor camera intrinsic parameters for depth\nestimation and saves computational overhead. Extensive experiments on nuScenes\nand Argoverse show the effectiveness and efficiency of our method. Code is\navailable at https://github.com/happytianhao/TaDe.\n","authors":["Tianhao Zhao","Yongcan Chen","Yu Wu","Tianyang Liu","Bo Du","Peilun Xiao","Shi Qiu","Hongda Yang","Guozhen Li","Yi Yang","Yutian Lin"],"pdf_url":"https://arxiv.org/pdf/2404.01925v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01924v1","updated":"2024-04-02T13:19:06Z","published":"2024-04-02T13:19:06Z","title":"Toward Efficient Visual Gyroscopes: Spherical Moments, Harmonics\n Filtering, and Masking Techniques for Spherical Camera Applications","summary":" Unlike a traditional gyroscope, a visual gyroscope estimates camera rotation\nthrough images. The integration of omnidirectional cameras, offering a larger\nfield of view compared to traditional RGB cameras, has proven to yield more\naccurate and robust results. However, challenges arise in situations that lack\nfeatures, have substantial noise causing significant errors, and where certain\nfeatures in the images lack sufficient strength, leading to less precise\nprediction results.\n Here, we address these challenges by introducing a novel visual gyroscope,\nwhich combines an analytical method with a neural network approach to provide a\nmore efficient and accurate rotation estimation from spherical images. The\npresented method relies on three key contributions: an adapted analytical\napproach to compute the spherical moments coefficients, introduction of masks\nfor better global feature representation, and the use of a multilayer\nperceptron to adaptively choose the best combination of masks and filters.\nExperimental results demonstrate superior performance of the proposed approach\nin terms of accuracy. The paper emphasizes the advantages of integrating\nmachine learning to optimize analytical solutions, discusses limitations, and\nsuggests directions for future research.\n","authors":["Yao Du","Carlos M. Mateo","Mirjana Maras","Tsun-Hsuan Wang","Marc Blanchon","Alexander Amini","Daniela Rus","Omar Tahri"],"pdf_url":"https://arxiv.org/pdf/2404.01924v1.pdf","comment":"Submitted to 2024 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2403.15769v2","updated":"2024-04-02T13:16:46Z","published":"2024-03-23T08:54:03Z","title":"FusionINN: Invertible Image Fusion for Brain Tumor Monitoring","summary":" Image fusion typically employs non-invertible neural networks to merge\nmultiple source images into a single fused image. 
However, for clinical\nexperts, solely relying on fused images may be insufficient for making\ndiagnostic decisions, as the fusion mechanism blends features from source\nimages, thereby making it difficult to interpret the underlying tumor\npathology. We introduce FusionINN, a novel invertible image fusion framework,\ncapable of efficiently generating fused images and also decomposing them back\nto the source images by solving the inverse of the fusion process. FusionINN\nguarantees lossless one-to-one pixel mapping by integrating a normally\ndistributed latent image alongside the fused image to facilitate the generative\nmodeling of the decomposition process. To the best of our knowledge, we are the\nfirst to investigate the decomposability of fused images, which is particularly\ncrucial for life-sensitive applications such as medical image fusion compared\nto other tasks like multi-focus or multi-exposure image fusion. Our extensive\nexperimentation validates FusionINN over existing discriminative and generative\nfusion methods, both subjectively and objectively. Moreover, compared to a\nrecent denoising diffusion-based fusion model, our approach offers faster and\nqualitatively better fusion results. We also exhibit the clinical utility of\nour results in aiding disease prognosis.\n","authors":["Nishant Kumar","Ziyan Tao","Jaikirat Singh","Yang Li","Peiwen Sun","Binghui Zhao","Stefan Gumhold"],"pdf_url":"https://arxiv.org/pdf/2403.15769v2.pdf","comment":"Source code available at https://github.com/nish03/FusionINN"},{"id":"http://arxiv.org/abs/2404.00722v2","updated":"2024-04-02T13:15:36Z","published":"2024-03-31T15:34:45Z","title":"DRCT: Saving Image Super-resolution away from Information Bottleneck","summary":" In recent years, Vision Transformer-based applications to low-level vision\ntasks have achieved widespread success. Unlike CNN-based models, Transformers\nare more adept at capturing long-range dependencies, enabling the\nreconstruction of images utilizing information from non-local areas. In the\ndomain of super-resolution, Swin-transformer-based approaches have become\nmainstream due to their capacity to capture global spatial information and\ntheir shifting-window attention mechanism that facilitates the interchange of\ninformation between different windows. Many researchers have enhanced image\nquality and network efficiency by expanding the receptive field or designing\ncomplex networks, yielding commendable results. However, we observed that\nspatial information tends to diminish during the forward propagation process\ndue to increased depth, leading to a loss of spatial information and,\nconsequently, limiting the model's potential. To address this, we propose the\nDense-residual-connected Transformer (DRCT), aimed at mitigating the loss of\nspatial information through dense-residual connections between layers, thereby\nunleashing the model's potential and enhancing performance. 
Experiment results\nindicate that our approach is not only straightforward but also achieves\nremarkable efficiency, surpassing state-of-the-art methods and performing\ncommendably at NTIRE2024.\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Yi-Shiuan Chou"],"pdf_url":"https://arxiv.org/pdf/2404.00722v2.pdf","comment":"Submitted to NTIRE 2024"},{"id":"http://arxiv.org/abs/2404.01911v1","updated":"2024-04-02T12:57:22Z","published":"2024-04-02T12:57:22Z","title":"VLRM: Vision-Language Models act as Reward Models for Image Captioning","summary":" In this work, we present an unsupervised method for enhancing an image\ncaptioning model (in our case, BLIP2) using reinforcement learning and\nvision-language models like CLIP and BLIP2-ITM as reward models. The RL-tuned\nmodel is able to generate longer and more comprehensive descriptions. Our model\nreaches impressive 0.90 R@1 CLIP Recall score on MS-COCO Carpathy Test Split.\n Weights are available at\nhttps://huggingface.co/sashakunitsyn/vlrm-blip2-opt-2.7b.\n","authors":["Maksim Dzabraev","Alexander Kunitsyn","Andrei Ivaniuta"],"pdf_url":"https://arxiv.org/pdf/2404.01911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01258v2","updated":"2024-04-02T12:47:49Z","published":"2024-04-01T17:28:16Z","title":"Direct Preference Optimization of Video Large Multimodal Models from\n Language Model Reward","summary":" Preference modeling techniques, such as direct preference optimization (DPO),\nhas shown effective in enhancing the generalization abilities of large language\nmodel (LLM). However, in tasks involving video instruction-following, providing\ninformative feedback, especially for detecting hallucinations in generated\nresponses, remains a significant challenge. Previous studies have explored\nusing large large multimodal models (LMMs) as reward models to guide preference\nmodeling, but their ability to accurately assess the factuality of generated\nresponses compared to corresponding videos has not been conclusively\nestablished. This paper introduces a novel framework that utilizes detailed\nvideo captions as a proxy of video content, enabling language models to\nincorporate this information as supporting evidence for scoring video Question\nAnswering (QA) predictions. Our approach demonstrates robust alignment with\nOpenAI GPT-4V model's reward mechanism, which directly takes video frames as\ninput. Furthermore, we show that applying this tailored reward through DPO\nsignificantly improves the performance of video LMMs on video QA tasks.\n","authors":["Ruohong Zhang","Liangke Gui","Zhiqing Sun","Yihao Feng","Keyang Xu","Yuanhan Zhang","Di Fu","Chunyuan Li","Alexander Hauptmann","Yonatan Bisk","Yiming Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01258v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00129v3","updated":"2024-04-02T12:42:42Z","published":"2024-01-31T19:14:12Z","title":"CMRNext: Camera to LiDAR Matching in the Wild for Localization and\n Extrinsic Calibration","summary":" LiDARs are widely used for mapping and localization in dynamic environments.\nHowever, their high cost limits their widespread adoption. On the other hand,\nmonocular localization in LiDAR maps using inexpensive cameras is a\ncost-effective alternative for large-scale deployment. Nevertheless, most\nexisting approaches struggle to generalize to new sensor setups and\nenvironments, requiring retraining or fine-tuning. 
In this paper, we present\nCMRNext, a novel approach for camera-LIDAR matching that is independent of\nsensor-specific parameters, generalizable, and can be used in the wild for\nmonocular localization in LiDAR maps and camera-LiDAR extrinsic calibration.\nCMRNext exploits recent advances in deep neural networks for matching\ncross-modal data and standard geometric techniques for robust pose estimation.\nWe reformulate the point-pixel matching problem as an optical flow estimation\nproblem and solve the Perspective-n-Point problem based on the resulting\ncorrespondences to find the relative pose between the camera and the LiDAR\npoint cloud. We extensively evaluate CMRNext on six different robotic\nplatforms, including three publicly available datasets and three in-house\nrobots. Our experimental evaluations demonstrate that CMRNext outperforms\nexisting approaches on both tasks and effectively generalizes to previously\nunseen environments and sensor setups in a zero-shot manner. We make the code\nand pre-trained models publicly available at http://cmrnext.cs.uni-freiburg.de .\n","authors":["Daniele Cattaneo","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2402.00129v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00299v2","updated":"2024-04-02T12:34:09Z","published":"2024-03-30T09:24:25Z","title":"HOI-M3:Capture Multiple Humans and Objects Interaction within Contextual\n Environment","summary":" Humans naturally interact with both others and the surrounding multiple\nobjects, engaging in various social activities. However, recent advances in\nmodeling human-object interactions mostly focus on perceiving isolated\nindividuals and objects, due to fundamental data scarcity. In this paper, we\nintroduce HOI-M3, a novel large-scale dataset for modeling the interactions of\nMultiple huMans and Multiple objects. Notably, it provides accurate 3D tracking\nfor both humans and objects from dense RGB and object-mounted IMU inputs,\ncovering 199 sequences and 181M frames of diverse humans and objects under rich\nactivities. With the unique HOI-M3 dataset, we introduce two novel data-driven\ntasks with companion strong baselines: monocular capture and unstructured\ngeneration of multiple human-object interactions. Extensive experiments\ndemonstrate that our dataset is challenging and worthy of further research\nabout multiple human-object interactions and behavior analysis. Our HOI-M3\ndataset, corresponding codes, and pre-trained models will be disseminated to\nthe community for future research.\n","authors":["Juze Zhang","Jingyan Zhang","Zining Song","Zhanhe Shi","Chengfeng Zhao","Ye Shi","Jingyi Yu","Lan Xu","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00299v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01892v1","updated":"2024-04-02T12:29:31Z","published":"2024-04-02T12:29:31Z","title":"Minimize Quantization Output Error with Bias Compensation","summary":" Quantization is a promising method that reduces memory usage and\ncomputational intensity of Deep Neural Networks (DNNs), but it often leads to\nsignificant output error that hinder model deployment. In this paper, we\npropose Bias Compensation (BC) to minimize the output error, thus realizing\nultra-low-precision quantization without model fine-tuning. Instead of\noptimizing the non-convex quantization process as in most previous methods, the\nproposed BC bypasses the step to directly minimize the quantizing output error\nby identifying a bias vector for compensation. 
We have established that the\nminimization of output error through BC is a convex problem and provides an\nefficient strategy to procure optimal solutions associated with minimal output\nerror,without the need for training or fine-tuning. We conduct extensive\nexperiments on Vision Transformer models and Large Language Models, and the\nresults show that our method notably reduces quantization output error, thereby\npermitting ultra-low-precision post-training quantization and enhancing the\ntask performance of models. Especially, BC improves the accuracy of ViT-B with\n4-bit PTQ4ViT by 36.89% on the ImageNet-1k task, and decreases the perplexity\nof OPT-350M with 3-bit GPTQ by 5.97 on WikiText2.The code is in\nhttps://github.com/GongCheng1919/bias-compensation.\n","authors":["Cheng Gong","Haoshuai Zheng","Mengting Hu","Zheng Lin","Deng-Ping Fan","Yuzhi Zhang","Tao Li"],"pdf_url":"https://arxiv.org/pdf/2404.01892v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.01891v1","updated":"2024-04-02T12:29:04Z","published":"2024-04-02T12:29:04Z","title":"ASTRA: An Action Spotting TRAnsformer for Soccer Videos","summary":" In this paper, we introduce ASTRA, a Transformer-based model designed for the\ntask of Action Spotting in soccer matches. ASTRA addresses several challenges\ninherent in the task and dataset, including the requirement for precise action\nlocalization, the presence of a long-tail data distribution, non-visibility in\ncertain actions, and inherent label noise. To do so, ASTRA incorporates (a) a\nTransformer encoder-decoder architecture to achieve the desired output temporal\nresolution and to produce precise predictions, (b) a balanced mixup strategy to\nhandle the long-tail distribution of the data, (c) an uncertainty-aware\ndisplacement head to capture the label variability, and (d) input audio signal\nto enhance detection of non-visible actions. Results demonstrate the\neffectiveness of ASTRA, achieving a tight Average-mAP of 66.82 on the test set.\nMoreover, in the SoccerNet 2023 Action Spotting challenge, we secure the 3rd\nposition with an Average-mAP of 70.21 on the challenge set.\n","authors":["Artur Xarles","Sergio Escalera","Thomas B. Moeslund","Albert Clapés"],"pdf_url":"https://arxiv.org/pdf/2404.01891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01889v1","updated":"2024-04-02T12:28:40Z","published":"2024-04-02T12:28:40Z","title":"RAVE: Residual Vector Embedding for CLIP-Guided Backlit Image\n Enhancement","summary":" In this paper we propose a novel modification of Contrastive Language-Image\nPre-Training (CLIP) guidance for the task of unsupervised backlit image\nenhancement. Our work builds on the state-of-the-art CLIP-LIT approach, which\nlearns a prompt pair by constraining the text-image similarity between a prompt\n(negative/positive sample) and a corresponding image (backlit image/well-lit\nimage) in the CLIP embedding space. Learned prompts then guide an image\nenhancement network. Based on the CLIP-LIT framework, we propose two novel\nmethods for CLIP guidance. First, we show that instead of tuning prompts in the\nspace of text embeddings, it is possible to directly tune their embeddings in\nthe latent space without any loss in quality. This accelerates training and\npotentially enables the use of additional encoders that do not have a text\nencoder. Second, we propose a novel approach that does not require any prompt\ntuning. 
Instead, based on CLIP embeddings of backlit and well-lit images from\ntraining data, we compute the residual vector in the embedding space as a\nsimple difference between the mean embeddings of the well-lit and backlit\nimages. This vector then guides the enhancement network during training,\npushing a backlit image towards the space of well-lit images. This approach\nfurther dramatically reduces training time, stabilizes training and produces\nhigh quality enhanced images without artifacts, both in supervised and\nunsupervised training regimes. Additionally, we show that residual vectors can\nbe interpreted, revealing biases in training data, and thereby enabling\npotential bias correction.\n","authors":["Tatiana Gaintseva","Marting Benning","Gregory Slabaugh"],"pdf_url":"https://arxiv.org/pdf/2404.01889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01887v1","updated":"2024-04-02T12:26:17Z","published":"2024-04-02T12:26:17Z","title":"3D Scene Generation from Scene Graphs and Self-Attention","summary":" Synthesizing realistic and diverse indoor 3D scene layouts in a controllable\nfashion opens up applications in simulated navigation and virtual reality. As\nconcise and robust representations of a scene, scene graphs have proven to be\nwell-suited as the semantic control on the generated layout. We present a\nvariant of the conditional variational autoencoder (cVAE) model to synthesize\n3D scenes from scene graphs and floor plans. We exploit the properties of\nself-attention layers to capture high-level relationships between objects in a\nscene, and use these as the building blocks of our model. Our model, leverages\ngraph transformers to estimate the size, dimension and orientation of the\nobjects in a room while satisfying relationships in the given scene graph. Our\nexperiments shows self-attention layers leads to sparser (HOW MUCH) and more\ndiverse scenes (HOW MUCH)\\. Included in this work, we publish the first\nlarge-scale dataset for conditioned scene generation from scene graphs,\ncontaining over XXX rooms (of floor plans and scene graphs).\n","authors":["Pietro Bonazzi","Mengqi Wang","Diego Martin Arroyo","Fabian Manhardt","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2404.01887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01112v2","updated":"2024-04-02T12:21:38Z","published":"2024-04-01T13:38:16Z","title":"Few-shot point cloud reconstruction and denoising via learned Guassian\n splats renderings and fine-tuned diffusion features","summary":" Existing deep learning methods for the reconstruction and denoising of point\nclouds rely on small datasets of 3D shapes. We circumvent the problem by\nleveraging deep learning methods trained on billions of images. We propose a\nmethod to reconstruct point clouds from few images and to denoise point clouds\nfrom their rendering by exploiting prior knowledge distilled from image-based\ndeep learning models. To improve reconstruction in constraint settings, we\nregularize the training of a differentiable renderer with hybrid surface and\nappearance by introducing semantic consistency supervision. In addition, we\npropose a pipeline to finetune Stable Diffusion to denoise renderings of noisy\npoint clouds and we demonstrate how these learned filters can be used to remove\npoint cloud noise coming without 3D supervision. 
We compare our method with DSS\nand PointRadiance and achieved higher quality 3D reconstruction on the\nSketchfab Testset and SCUT Dataset.\n","authors":["Pietro Bonazzi"],"pdf_url":"https://arxiv.org/pdf/2404.01112v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01882v1","updated":"2024-04-02T12:15:25Z","published":"2024-04-02T12:15:25Z","title":"Scene Adaptive Sparse Transformer for Event-based Object Detection","summary":" While recent Transformer-based approaches have shown impressive performances\non event-based object detection tasks, their high computational costs still\ndiminish the low power consumption advantage of event cameras. Image-based\nworks attempt to reduce these costs by introducing sparse Transformers.\nHowever, they display inadequate sparsity and adaptability when applied to\nevent-based object detection, since these approaches cannot balance the fine\ngranularity of token-level sparsification and the efficiency of window-based\nTransformers, leading to reduced performance and efficiency. Furthermore, they\nlack scene-specific sparsity optimization, resulting in information loss and a\nlower recall rate. To overcome these limitations, we propose the Scene Adaptive\nSparse Transformer (SAST). SAST enables window-token co-sparsification,\nsignificantly enhancing fault tolerance and reducing computational overhead.\nLeveraging the innovative scoring and selection modules, along with the Masked\nSparse Window Self-Attention, SAST showcases remarkable scene-aware\nadaptability: It focuses only on important objects and dynamically optimizes\nsparsity level according to scene complexity, maintaining a remarkable balance\nbetween performance and computational cost. The evaluation results show that\nSAST outperforms all other dense and sparse networks in both performance and\nefficiency on two large-scale event-based object detection datasets (1Mpx and\nGen1). Code: https://github.com/Peterande/SAST\n","authors":["Yansong Peng","Hebei Li","Yueyi Zhang","Xiaoyan Sun","Feng Wu"],"pdf_url":"https://arxiv.org/pdf/2404.01882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01878v1","updated":"2024-04-02T12:08:26Z","published":"2024-04-02T12:08:26Z","title":"Real, fake and synthetic faces - does the coin have three sides?","summary":" With the ever-growing power of generative artificial intelligence, deepfake\nand artificially generated (synthetic) media have continued to spread online,\nwhich creates various ethical and moral concerns regarding their usage. To\ntackle this, we thus present a novel exploration of the trends and patterns\nobserved in real, deepfake and synthetic facial images. The proposed analysis\nis done in two parts: firstly, we incorporate eight deep learning models and\nanalyze their performances in distinguishing between the three classes of\nimages. Next, we look to further delve into the similarities and differences\nbetween these three sets of images by investigating their image properties both\nin the context of the entire image as well as in the context of specific\nregions within the image. ANOVA test was also performed and provided further\nclarity amongst the patterns associated between the images of the three\nclasses. From our findings, we observe that the investigated deeplearning\nmodels found it easier to detect synthetic facial images, with the ViT Patch-16\nmodel performing best on this task with a class-averaged sensitivity,\nspecificity, precision, and accuracy of 97.37%, 98.69%, 97.48%, and 98.25%,\nrespectively. 
This observation was supported by further analysis of various\nimage properties. We saw noticeable differences across the three category of\nimages. This analysis can help us build better algorithms for facial image\ngeneration, and also shows that synthetic, deepfake and real face images are\nindeed three different classes.\n","authors":["Shahzeb Naeem","Ramzi Al-Sharawi","Muhammad Riyyan Khan","Usman Tariq","Abhinav Dhall","Hasan Al-Nashash"],"pdf_url":"https://arxiv.org/pdf/2404.01878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13659v3","updated":"2024-04-02T11:42:50Z","published":"2024-03-20T15:08:43Z","title":"Recursive Joint Cross-Modal Attention for Multimodal Fusion in\n Dimensional Emotion Recognition","summary":" Though multimodal emotion recognition has achieved significant progress over\nrecent years, the potential of rich synergic relationships across the\nmodalities is not fully exploited. In this paper, we introduce Recursive Joint\nCross-Modal Attention (RJCMA) to effectively capture both intra-and inter-modal\nrelationships across audio, visual and text modalities for dimensional emotion\nrecognition. In particular, we compute the attention weights based on\ncross-correlation between the joint audio-visual-text feature representations\nand the feature representations of individual modalities to simultaneously\ncapture intra- and inter-modal relationships across the modalities. The\nattended features of the individual modalities are again fed as input to the\nfusion model in a recursive mechanism to obtain more refined feature\nrepresentations. We have also explored Temporal Convolutional Networks (TCNs)\nto improve the temporal modeling of the feature representations of individual\nmodalities. Extensive experiments are conducted to evaluate the performance of\nthe proposed fusion model on the challenging Affwild2 dataset. By effectively\ncapturing the synergic intra- and inter-modal relationships across audio,\nvisual and text modalities, the proposed fusion model achieves a Concordance\nCorrelation Coefficient (CCC) of 0.585 (0.542) and 0.659 (0.619) for valence\nand arousal respectively on the validation set (test set). This shows a\nsignificant improvement over the baseline of 0.24 (0.211) and 0.20 (0.191) for\nvalence and arousal respectively on the validation set (test set) of the\nvalence-arousal challenge of 6th Affective Behavior Analysis in-the-Wild (ABAW)\ncompetition.\n","authors":["R. Gnana Praveen","Jahangir Alam"],"pdf_url":"https://arxiv.org/pdf/2403.13659v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01862v1","updated":"2024-04-02T11:40:34Z","published":"2024-04-02T11:40:34Z","title":"Co-Speech Gesture Video Generation via Motion-Decoupled Diffusion Model","summary":" Co-speech gestures, if presented in the lively form of videos, can achieve\nsuperior visual effects in human-machine interaction. While previous works\nmostly generate structural human skeletons, resulting in the omission of\nappearance information, we focus on the direct generation of audio-driven\nco-speech gesture videos in this work. There are two main challenges: 1) A\nsuitable motion feature is needed to describe complex human movements with\ncrucial appearance information. 2) Gestures and speech exhibit inherent\ndependencies and should be temporally aligned even of arbitrary length. To\nsolve these problems, we present a novel motion-decoupled framework to generate\nco-speech gesture videos. 
Specifically, we first introduce a well-designed\nnonlinear TPS transformation to obtain latent motion features preserving\nessential appearance information. Then a transformer-based diffusion model is\nproposed to learn the temporal correlation between gestures and speech, and\nperforms generation in the latent motion space, followed by an optimal motion\nselection module to produce long-term coherent and consistent gesture videos.\nFor better visual perception, we further design a refinement network focusing\non missing details of certain areas. Extensive experimental results show that\nour proposed framework significantly outperforms existing approaches in both\nmotion and video-related evaluations. Our code, demos, and more resources are\navailable at https://github.com/thuhcsi/S2G-MDDiffusion.\n","authors":["Xu He","Qiaochu Huang","Zhensong Zhang","Zhiwei Lin","Zhiyong Wu","Sicheng Yang","Minglei Li","Zhiyi Chen","Songcen Xu","Xiaofei Wu"],"pdf_url":"https://arxiv.org/pdf/2404.01862v1.pdf","comment":"22 pages, 8 figures, CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00636v2","updated":"2024-04-02T11:31:50Z","published":"2024-03-31T10:13:55Z","title":"Learning to Generate Conditional Tri-plane for 3D-aware Expression\n Controllable Portrait Animation","summary":" In this paper, we present Export3D, a one-shot 3D-aware portrait animation\nmethod that is able to control the facial expression and camera view of a given\nportrait image. To achieve this, we introduce a tri-plane generator that\ndirectly generates a tri-plane of 3D prior by transferring the expression\nparameter of 3DMM into the source image. The tri-plane is then decoded into the\nimage of different view through a differentiable volume rendering. Existing\nportrait animation methods heavily rely on image warping to transfer the\nexpression in the motion space, challenging on disentanglement of appearance\nand expression. In contrast, we propose a contrastive pre-training framework\nfor appearance-free expression parameter, eliminating undesirable appearance\nswap when transferring a cross-identity expression. Extensive experiments show\nthat our pre-training framework can learn the appearance-free expression\nrepresentation hidden in 3DMM, and our model can generate 3D-aware expression\ncontrollable portrait image without appearance swap in the cross-identity\nmanner.\n","authors":["Taekyung Ki","Dongchan Min","Gyeongsu Chae"],"pdf_url":"https://arxiv.org/pdf/2404.00636v2.pdf","comment":"Project page: https://export3d.github.io"},{"id":"http://arxiv.org/abs/2404.01853v1","updated":"2024-04-02T11:30:22Z","published":"2024-04-02T11:30:22Z","title":"Pairwise Similarity Distribution Clustering for Noisy Label Learning","summary":" Noisy label learning aims to train deep neural networks using a large amount\nof samples with noisy labels, whose main challenge comes from how to deal with\nthe inaccurate supervision caused by wrong labels. Existing works either take\nthe label correction or sample selection paradigm to involve more samples with\naccurate labels into the training process. In this paper, we propose a simple\nyet effective sample selection algorithm, termed as Pairwise Similarity\nDistribution Clustering~(PSDC), to divide the training samples into one clean\nset and another noisy set, which can power any of the off-the-shelf\nsemi-supervised learning regimes to further train networks for different\ndownstream tasks. 
Specifically, we take the pairwise similarity between sample\npairs to represent the sample structure, and the Gaussian Mixture Model~(GMM)\nto model the similarity distribution between sample pairs belonging to the same\nnoisy cluster, therefore each sample can be confidently divided into the clean\nset or noisy set. Even under severe label noise rate, the resulting data\npartition mechanism has been proved to be more robust in judging the label\nconfidence in both theory and practice. Experimental results on various\nbenchmark datasets, such as CIFAR-10, CIFAR-100 and Clothing1M, demonstrate\nsignificant improvements over state-of-the-art methods.\n","authors":["Sihan Bai"],"pdf_url":"https://arxiv.org/pdf/2404.01853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18403v2","updated":"2024-04-02T11:17:49Z","published":"2023-11-30T09:55:46Z","title":"Corrupting Convolution-based Unlearnable Datasets with Pixel-based Image\n Transformations","summary":" Unlearnable datasets lead to a drastic drop in the generalization performance\nof models trained on them by introducing elaborate and imperceptible\nperturbations into clean training sets. Many existing defenses, e.g., JPEG\ncompression and adversarial training, effectively counter UDs based on\nnorm-constrained additive noise. However, a fire-new type of convolution-based\nUDs have been proposed and render existing defenses all ineffective, presenting\na greater challenge to defenders. To address this, we express the\nconvolution-based unlearnable sample as the result of multiplying a matrix by a\nclean sample in a simplified scenario, and formalize the intra-class matrix\ninconsistency as $\\Theta_{imi}$, inter-class matrix consistency as\n$\\Theta_{imc}$ to investigate the working mechanism of the convolution-based\nUDs. We conjecture that increasing both of these metrics will mitigate the\nunlearnability effect. Through validation experiments that commendably support\nour hypothesis, we further design a random matrix to boost both $\\Theta_{imi}$\nand $\\Theta_{imc}$, achieving a notable degree of defense effect. Hence, by\nbuilding upon and extending these facts, we first propose a brand-new image\nCOrruption that employs randomly multiplicative transformation via\nINterpolation operation to successfully defend against convolution-based UDs.\nOur approach leverages global pixel random interpolations, effectively\nsuppressing the impact of multiplicative noise in convolution-based UDs.\nAdditionally, we have also designed two new forms of convolution-based UDs, and\nfind that our defense is the most effective against them.\n","authors":["Xianlong Wang","Shengshan Hu","Minghui Li","Zhifei Yu","Ziqi Zhou","Leo Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.18403v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08707v4","updated":"2024-04-02T11:08:12Z","published":"2023-06-14T19:15:49Z","title":"VidEdit: Zero-Shot and Spatially Aware Text-Driven Video Editing","summary":" Recently, diffusion-based generative models have achieved remarkable success\nfor image generation and edition. However, existing diffusion-based video\nediting approaches lack the ability to offer precise control over generated\ncontent that maintains temporal consistency in long-term videos. On the other\nhand, atlas-based methods provide strong temporal consistency but are costly to\nedit a video and lack spatial control. 
In this work, we introduce VidEdit, a\nnovel method for zero-shot text-based video editing that guarantees robust\ntemporal and spatial consistency. In particular, we combine an atlas-based\nvideo representation with a pre-trained text-to-image diffusion model to\nprovide a training-free and efficient video editing method, which by design\nfulfills temporal smoothness. To grant precise user control over generated\ncontent, we utilize conditional information extracted from off-the-shelf\npanoptic segmenters and edge detectors which guides the diffusion sampling\nprocess. This method ensures a fine spatial control on targeted regions while\nstrictly preserving the structure of the original video. Our quantitative and\nqualitative experiments show that VidEdit outperforms state-of-the-art methods\non DAVIS dataset, regarding semantic faithfulness, image preservation, and\ntemporal consistency metrics. With this framework, processing a single video\nonly takes approximately one minute, and it can generate multiple compatible\nedits based on a unique text prompt. Project web-page at\nhttps://videdit.github.io\n","authors":["Paul Couairon","Clément Rambour","Jean-Emmanuel Haugeard","Nicolas Thome"],"pdf_url":"https://arxiv.org/pdf/2306.08707v4.pdf","comment":"TMLR 2024. Project web-page at https://videdit.github.io"},{"id":"http://arxiv.org/abs/2404.01843v1","updated":"2024-04-02T11:03:24Z","published":"2024-04-02T11:03:24Z","title":"Sketch3D: Style-Consistent Guidance for Sketch-to-3D Generation","summary":" Recently, image-to-3D approaches have achieved significant results with a\nnatural image as input. However, it is not always possible to access these\nenriched color input samples in practical applications, where only sketches are\navailable. Existing sketch-to-3D researches suffer from limitations in broad\napplications due to the challenges of lacking color information and multi-view\ncontent. To overcome them, this paper proposes a novel generation paradigm\nSketch3D to generate realistic 3D assets with shape aligned with the input\nsketch and color matching the textual description. Concretely, Sketch3D first\ninstantiates the given sketch in the reference image through the\nshape-preserving generation process. Second, the reference image is leveraged\nto deduce a coarse 3D Gaussian prior, and multi-view style-consistent guidance\nimages are generated based on the renderings of the 3D Gaussians. Finally,\nthree strategies are designed to optimize 3D Gaussians, i.e., structural\noptimization via a distribution transfer mechanism, color optimization with a\nstraightforward MSE loss and sketch similarity optimization with a CLIP-based\ngeometric similarity loss. Extensive visual comparisons and quantitative\nanalysis illustrate the advantage of our Sketch3D in generating realistic 3D\nassets while preserving consistency with the input.\n","authors":["Wangguandong Zheng","Haifeng Xia","Rui Chen","Ming Shao","Siyu Xia","Zhengming Ding"],"pdf_url":"https://arxiv.org/pdf/2404.01843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01842v1","updated":"2024-04-02T11:03:13Z","published":"2024-04-02T11:03:13Z","title":"Semi-Supervised Domain Adaptation for Wildfire Detection","summary":" Recently, both the frequency and intensity of wildfires have increased\nworldwide, primarily due to climate change. 
In this paper, we propose a novel\nprotocol for wildfire detection, leveraging semi-supervised Domain Adaptation\nfor object detection, accompanied by a corresponding dataset designed for use\nby both academics and industries. Our dataset encompasses 30 times more diverse\nlabeled scenes for the current largest benchmark wildfire dataset, HPWREN, and\nintroduces a new labeling policy for wildfire detection. Inspired by CoordConv,\nwe propose a robust baseline, Location-Aware Object Detection for\nSemi-Supervised Domain Adaptation (LADA), utilizing a teacher-student based\nframework capable of extracting translational variance features characteristic\nof wildfires. With only using 1% target domain labeled data, our framework\nsignificantly outperforms our source-only baseline by a notable margin of 3.8%\nin mean Average Precision on the HPWREN wildfire dataset. Our dataset is\navailable at https://github.com/BloomBerry/LADA.\n","authors":["JooYoung Jang","Youngseo Cha","Jisu Kim","SooHyung Lee","Geonu Lee","Minkook Cho","Young Hwang","Nojun Kwak"],"pdf_url":"https://arxiv.org/pdf/2404.01842v1.pdf","comment":"16 pages, 5 figures, 22 tables"},{"id":"http://arxiv.org/abs/2301.13418v4","updated":"2024-04-02T11:03:02Z","published":"2023-01-31T05:14:49Z","title":"BRAIxDet: Learning to Detect Malignant Breast Lesion with Incomplete\n Annotations","summary":" Methods to detect malignant lesions from screening mammograms are usually\ntrained with fully annotated datasets, where images are labelled with the\nlocalisation and classification of cancerous lesions. However, real-world\nscreening mammogram datasets commonly have a subset that is fully annotated and\nanother subset that is weakly annotated with just the global classification\n(i.e., without lesion localisation). Given the large size of such datasets,\nresearchers usually face a dilemma with the weakly annotated subset: to not use\nit or to fully annotate it. The first option will reduce detection accuracy\nbecause it does not use the whole dataset, and the second option is too\nexpensive given that the annotation needs to be done by expert radiologists. In\nthis paper, we propose a middle-ground solution for the dilemma, which is to\nformulate the training as a weakly- and semi-supervised learning problem that\nwe refer to as malignant breast lesion detection with incomplete annotations.\nTo address this problem, our new method comprises two stages, namely: 1)\npre-training a multi-view mammogram classifier with weak supervision from the\nwhole dataset, and 2) extending the trained classifier to become a multi-view\ndetector that is trained with semi-supervised student-teacher learning, where\nthe training set contains fully and weakly-annotated mammograms. We provide\nextensive detection results on two real-world screening mammogram datasets\ncontaining incomplete annotations, and show that our proposed approach achieves\nstate-of-the-art results in the detection of malignant breast lesions with\nincomplete annotations.\n","authors":["Yuanhong Chen","Yuyuan Liu","Chong Wang","Michael Elliott","Chun Fung Kwok","Carlos Pena-Solorzano","Yu Tian","Fengbei Liu","Helen Frazer","Davis J. 
McCarthy","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2301.13418v4.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2404.00675v2","updated":"2024-04-02T10:59:05Z","published":"2024-03-31T12:48:07Z","title":"LLM meets Vision-Language Models for Zero-Shot One-Class Classification","summary":" We consider the problem of zero-shot one-class visual classification. In this\nsetting, only the label of the target class is available, and the goal is to\ndiscriminate between positive and negative query samples without requiring any\nvalidation example from the target task. We propose a two-step solution that\nfirst queries large language models for visually confusing objects and then\nrelies on vision-language pre-trained models (e.g., CLIP) to perform\nclassification. By adapting large-scale vision benchmarks, we demonstrate the\nability of the proposed method to outperform adapted off-the-shelf alternatives\nin this setting. Namely, we propose a realistic benchmark where negative query\nsamples are drawn from the same original dataset as positive ones, including a\ngranularity-controlled version of iNaturalist, where negative samples are at a\nfixed distance in the taxonomy tree from the positive ones. Our work shows that\nit is possible to discriminate between a single category and other semantically\nrelated ones using only its label\n","authors":["Yassir Bendou","Giulia Lioi","Bastien Pasdeloup","Lukas Mauch","Ghouthi Boukli Hacene","Fabien Cardinaux","Vincent Gripon"],"pdf_url":"https://arxiv.org/pdf/2404.00675v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13720v8","updated":"2024-04-02T10:57:06Z","published":"2023-06-23T18:08:00Z","title":"Decoupled Diffusion Models: Simultaneous Image to Zero and Zero to Noise","summary":" We propose decoupled diffusion models (DDMs) for high-quality (un)conditioned\nimage generation in less than 10 function evaluations. In a nutshell, DDMs\ndecouple the forward image-to-noise mapping into \\textit{image-to-zero} mapping\nand \\textit{zero-to-noise} mapping. Under this framework, we mathematically\nderive 1) the training objectives and 2) for the reverse time the sampling\nformula based on an analytic transition probability which models image to zero\ntransition. The former enables DDMs to learn noise and image components\nsimultaneously which simplifies learning. Importantly, because of the latter's\nanalyticity in the \\textit{zero-to-image} sampling function, DDMs can avoid the\nordinary differential equation-based accelerators and instead naturally perform\nsampling with an arbitrary step size. Under the few function evaluation setups,\nDDMs experimentally yield very competitive performance compared with the state\nof the art in 1) unconditioned image generation, \\textit{e.g.}, CIFAR-10 and\nCelebA-HQ-256 and 2) image-conditioned downstream tasks such as\nsuper-resolution, saliency detection, edge detection, and image inpainting.\n","authors":["Yuhang Huang","Zheng Qin","Xinwang Liu","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2306.13720v8.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07322v2","updated":"2024-04-02T10:35:32Z","published":"2023-12-12T14:37:36Z","title":"GenHowTo: Learning to Generate Actions and State Transformations from\n Instructional Videos","summary":" We address the task of generating temporally consistent and physically\nplausible images of actions and object state transformations. 
Given an input\nimage and a text prompt describing the targeted transformation, our generated\nimages preserve the environment and transform objects in the initial image. Our\ncontributions are threefold. First, we leverage a large body of instructional\nvideos and automatically mine a dataset of triplets of consecutive frames\ncorresponding to initial object states, actions, and resulting object\ntransformations. Second, equipped with this data, we develop and train a\nconditioned diffusion model dubbed GenHowTo. Third, we evaluate GenHowTo on a\nvariety of objects and actions and show superior performance compared to\nexisting methods. In particular, we introduce a quantitative evaluation where\nGenHowTo achieves 88% and 74% on seen and unseen interaction categories,\nrespectively, outperforming prior work by a large margin.\n","authors":["Tomáš Souček","Dima Damen","Michael Wray","Ivan Laptev","Josef Sivic"],"pdf_url":"https://arxiv.org/pdf/2312.07322v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2402.10636v2","updated":"2024-04-02T10:27:45Z","published":"2024-02-16T12:35:35Z","title":"PEGASUS: Personalized Generative 3D Avatars with Composable Attributes","summary":" We present PEGASUS, a method for constructing a personalized generative 3D\nface avatar from monocular video sources. Our generative 3D avatar enables\ndisentangled controls to selectively alter the facial attributes (e.g., hair or\nnose) while preserving the identity. Our approach consists of two stages:\nsynthetic database generation and constructing a personalized generative\navatar. We generate a synthetic video collection of the target identity with\nvarying facial attributes, where the videos are synthesized by borrowing the\nattributes from monocular videos of diverse identities. Then, we build a\nperson-specific generative 3D avatar that can modify its attributes\ncontinuously while preserving its identity. Through extensive experiments, we\ndemonstrate that our method of generating a synthetic database and creating a\n3D generative avatar is the most effective in preserving identity while\nachieving high realism. Subsequently, we introduce a zero-shot approach to\nachieve the same goal of generative modeling more efficiently by leveraging a\npreviously constructed personalized generative model.\n","authors":["Hyunsoo Cha","Byungjun Kim","Hanbyul Joo"],"pdf_url":"https://arxiv.org/pdf/2402.10636v2.pdf","comment":"Accepted at CVPR 2024, Project Page:\n https://snuvclab.github.io/pegasus/"},{"id":"http://arxiv.org/abs/2403.16368v2","updated":"2024-04-02T10:25:07Z","published":"2024-03-25T02:17:20Z","title":"Distilling Semantic Priors from SAM to Efficient Image Restoration\n Models","summary":" In image restoration (IR), leveraging semantic priors from segmentation\nmodels has been a common approach to improve performance. The recent segment\nanything model (SAM) has emerged as a powerful tool for extracting advanced\nsemantic priors to enhance IR tasks. However, the computational cost of SAM is\nprohibitive for IR, compared to existing smaller IR models. The incorporation\nof SAM for extracting semantic priors considerably hampers the model inference\nefficiency. To address this issue, we propose a general framework to distill\nSAM's semantic knowledge to boost exiting IR models without interfering with\ntheir inference process. Specifically, our proposed framework consists of the\nsemantic priors fusion (SPF) scheme and the semantic priors distillation (SPD)\nscheme. 
SPF fuses two kinds of information between the restored image predicted\nby the original IR model and the semantic mask predicted by SAM for the refined\nrestored image. SPD leverages a self-distillation manner to distill the fused\nsemantic priors to boost the performance of original IR models. Additionally,\nwe design a semantic-guided relation (SGR) module for SPD, which ensures\nsemantic feature representation space consistency to fully distill the priors.\nWe demonstrate the effectiveness of our framework across multiple IR models and\ntasks, including deraining, deblurring, and denoising.\n","authors":["Quan Zhang","Xiaoyu Liu","Wei Li","Hanting Chen","Junchao Liu","Jie Hu","Zhiwei Xiong","Chun Yuan","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16368v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01819v1","updated":"2024-04-02T10:22:23Z","published":"2024-04-02T10:22:23Z","title":"Sparse Semi-DETR: Sparse Learnable Queries for Semi-Supervised Object\n Detection","summary":" In this paper, we address the limitations of the DETR-based semi-supervised\nobject detection (SSOD) framework, particularly focusing on the challenges\nposed by the quality of object queries. In DETR-based SSOD, the one-to-one\nassignment strategy provides inaccurate pseudo-labels, while the one-to-many\nassignments strategy leads to overlapping predictions. These issues compromise\ntraining efficiency and degrade model performance, especially in detecting\nsmall or occluded objects. We introduce Sparse Semi-DETR, a novel\ntransformer-based, end-to-end semi-supervised object detection solution to\novercome these challenges. Sparse Semi-DETR incorporates a Query Refinement\nModule to enhance the quality of object queries, significantly improving\ndetection capabilities for small and partially obscured objects. Additionally,\nwe integrate a Reliable Pseudo-Label Filtering Module that selectively filters\nhigh-quality pseudo-labels, thereby enhancing detection accuracy and\nconsistency. On the MS-COCO and Pascal VOC object detection benchmarks, Sparse\nSemi-DETR achieves a significant improvement over current state-of-the-art\nmethods that highlight Sparse Semi-DETR's effectiveness in semi-supervised\nobject detection, particularly in challenging scenarios involving small or\npartially obscured objects.\n","authors":["Tahira Shehzadi","Khurram Azeem Hashmi","Didier Stricker","Muhammad Zeshan Afzal"],"pdf_url":"https://arxiv.org/pdf/2404.01819v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.00918v2","updated":"2024-04-02T10:20:28Z","published":"2024-04-01T04:49:47Z","title":"Rethinking Saliency-Guided Weakly-Supervised Semantic Segmentation","summary":" This paper presents a fresh perspective on the role of saliency maps in\nweakly-supervised semantic segmentation (WSSS) and offers new insights and\nresearch directions based on our empirical findings. We conduct comprehensive\nexperiments and observe that the quality of the saliency map is a critical\nfactor in saliency-guided WSSS approaches. Nonetheless, we find that the\nsaliency maps used in previous works are often arbitrarily chosen, despite\ntheir significant impact on WSSS. Additionally, we observe that the choice of\nthe threshold, which has received less attention before, is non-trivial in\nWSSS. To facilitate more meaningful and rigorous research for saliency-guided\nWSSS, we introduce \\texttt{WSSS-BED}, a standardized framework for conducting\nresearch under unified conditions. 
\\texttt{WSSS-BED} provides various saliency\nmaps and activation maps for seven WSSS methods, as well as saliency maps from\nunsupervised salient object detection models.\n","authors":["Beomyoung Kim","Donghyun Kim","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2404.00918v2.pdf","comment":"Preprint, 17 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.01816v1","updated":"2024-04-02T10:19:17Z","published":"2024-04-02T10:19:17Z","title":"Rethinking Annotator Simulation: Realistic Evaluation of Whole-Body PET\n Lesion Interactive Segmentation Methods","summary":" Interactive segmentation plays a crucial role in accelerating the annotation,\nparticularly in domains requiring specialized expertise such as nuclear\nmedicine. For example, annotating lesions in whole-body Positron Emission\nTomography (PET) images can require over an hour per volume. While previous\nworks evaluate interactive segmentation models through either real user studies\nor simulated annotators, both approaches present challenges. Real user studies\nare expensive and often limited in scale, while simulated annotators, also\nknown as robot users, tend to overestimate model performance due to their\nidealized nature. To address these limitations, we introduce four evaluation\nmetrics that quantify the user shift between real and simulated annotators. In\nan initial user study involving four annotators, we assess existing robot users\nusing our proposed metrics and find that robot users significantly deviate in\nperformance and annotation behavior compared to real annotators. Based on these\nfindings, we propose a more realistic robot user that reduces the user shift by\nincorporating human factors such as click variation and inter-annotator\ndisagreement. We validate our robot user in a second user study, involving four\nother annotators, and show it consistently reduces the simulated-to-real user\nshift compared to traditional robot users. By employing our robot user, we can\nconduct more large-scale and cost-efficient evaluations of interactive\nsegmentation models, while preserving the fidelity of real user studies. Our\nimplementation is based on MONAI Label and will be made publicly available.\n","authors":["Zdravko Marinov","Moon Kim","Jens Kleesiek","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2404.01816v1.pdf","comment":"10 pages, 5 figures, 1 table"},{"id":"http://arxiv.org/abs/2403.16080v3","updated":"2024-04-02T10:16:05Z","published":"2024-03-24T10:06:40Z","title":"PKU-DyMVHumans: A Multi-View Video Benchmark for High-Fidelity Dynamic\n Human Modeling","summary":" High-quality human reconstruction and photo-realistic rendering of a dynamic\nscene is a long-standing problem in computer vision and graphics. Despite\nconsiderable efforts invested in developing various capture systems and\nreconstruction algorithms, recent advancements still struggle with loose or\noversized clothing and overly complex poses. In part, this is due to the\nchallenges of acquiring high-quality human datasets. To facilitate the\ndevelopment of these fields, in this paper, we present PKU-DyMVHumans, a\nversatile human-centric dataset for high-fidelity reconstruction and rendering\nof dynamic human scenarios from dense multi-view videos. It comprises 8.2\nmillion frames captured by more than 56 synchronized cameras across diverse\nscenarios. 
These sequences comprise 32 human subjects across 45 different\nscenarios, each with a high-detailed appearance and realistic human motion.\nInspired by recent advancements in neural radiance field (NeRF)-based scene\nrepresentations, we carefully set up an off-the-shelf framework that is easy to\nprovide those state-of-the-art NeRF-based implementations and benchmark on\nPKU-DyMVHumans dataset. It is paving the way for various applications like\nfine-grained foreground/background decomposition, high-quality human\nreconstruction and photo-realistic novel view synthesis of a dynamic scene.\nExtensive studies are performed on the benchmark, demonstrating new\nobservations and challenges that emerge from using such high-fidelity dynamic\ndata.\n","authors":["Xiaoyun Zheng","Liwei Liao","Xufeng Li","Jianbo Jiao","Rongjie Wang","Feng Gao","Shiqi Wang","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16080v3.pdf","comment":"CVPR2024(accepted). Project page: https://pku-dymvhumans.github.io"},{"id":"http://arxiv.org/abs/2404.01810v1","updated":"2024-04-02T10:13:18Z","published":"2024-04-02T10:13:18Z","title":"Surface Reconstruction from Gaussian Splatting via Novel Stereo Views","summary":" The Gaussian splatting for radiance field rendering method has recently\nemerged as an efficient approach for accurate scene representation. It\noptimizes the location, size, color, and shape of a cloud of 3D Gaussian\nelements to visually match, after projection, or splatting, a set of given\nimages taken from various viewing directions. And yet, despite the proximity of\nGaussian elements to the shape boundaries, direct surface reconstruction of\nobjects in the scene is a challenge.\n We propose a novel approach for surface reconstruction from Gaussian\nsplatting models. Rather than relying on the Gaussian elements' locations as a\nprior for surface reconstruction, we leverage the superior novel-view synthesis\ncapabilities of 3DGS. To that end, we use the Gaussian splatting model to\nrender pairs of stereo-calibrated novel views from which we extract depth\nprofiles using a stereo matching method. We then combine the extracted RGB-D\nimages into a geometrically consistent surface. The resulting reconstruction is\nmore accurate and shows finer details when compared to other methods for\nsurface reconstruction from Gaussian splatting models, while requiring\nsignificantly less compute time compared to other surface reconstruction\nmethods.\n We performed extensive testing of the proposed method on in-the-wild scenes,\ntaken by a smartphone, showcasing its superior reconstruction abilities.\nAdditionally, we tested the proposed method on the Tanks and Temples benchmark,\nand it has surpassed the current leading method for surface reconstruction from\nGaussian splatting models. Project page: https://gs2mesh.github.io/.\n","authors":["Yaniv Wolf","Amit Bracha","Ron Kimmel"],"pdf_url":"https://arxiv.org/pdf/2404.01810v1.pdf","comment":"Project Page: https://gs2mesh.github.io/"},{"id":"http://arxiv.org/abs/2402.14000v2","updated":"2024-04-02T10:06:33Z","published":"2024-02-21T18:36:26Z","title":"Real-time 3D-aware Portrait Editing from a Single Image","summary":" This work presents 3DPE, a practical method that can efficiently edit a face\nimage following given prompts, like reference images or text descriptions, in a\n3D-aware manner. 
To this end, a lightweight module is distilled from a 3D\nportrait generator and a text-to-image model, which provide prior knowledge of\nface geometry and superior editing capability, respectively. Such a design\nbrings two compelling advantages over existing approaches. First, our system\nachieves real-time editing with a feedforward network (i.e., ~0.04s per image),\nover 100x faster than the second competitor. Second, thanks to the powerful\npriors, our module could focus on the learning of editing-related variations,\nsuch that it manages to handle various types of editing simultaneously in the\ntraining phase and further supports fast adaptation to user-specified\ncustomized types of editing during inference (e.g., with ~5min fine-tuning per\nstyle). The code, the model, and the interface will be made publicly available\nto facilitate future research.\n","authors":["Qingyan Bai","Zifan Shi","Yinghao Xu","Hao Ouyang","Qiuyu Wang","Ceyuan Yang","Xuan Wang","Gordon Wetzstein","Yujun Shen","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2402.14000v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01801v1","updated":"2024-04-02T10:03:23Z","published":"2024-04-02T10:03:23Z","title":"EventSleep: Sleep Activity Recognition with Event Cameras","summary":" Event cameras are a promising technology for activity recognition in dark\nenvironments due to their unique properties. However, real event camera\ndatasets under low-lighting conditions are still scarce, which also limits the\nnumber of approaches to solve these kind of problems, hindering the potential\nof this technology in many applications. We present EventSleep, a new dataset\nand methodology to address this gap and study the suitability of event cameras\nfor a very relevant medical application: sleep monitoring for sleep disorders\nanalysis. The dataset contains synchronized event and infrared recordings\nemulating common movements that happen during the sleep, resulting in a new\nchallenging and unique dataset for activity recognition in dark environments.\nOur novel pipeline is able to achieve high accuracy under these challenging\nconditions and incorporates a Bayesian approach (Laplace ensembles) to increase\nthe robustness in the predictions, which is fundamental for medical\napplications. Our work is the first application of Bayesian neural networks for\nevent cameras, the first use of Laplace ensembles in a realistic problem, and\nalso demonstrates for the first time the potential of event cameras in a new\napplication domain: to enhance current sleep evaluation procedures. Our\nactivity recognition results highlight the potential of event cameras under\ndark conditions, and its capacity and robustness for sleep activity\nrecognition, and open problems as the adaptation of event data pre-processing\ntechniques to dark environments.\n","authors":["Carlos Plou","Nerea Gallego","Alberto Sabater","Eduardo Montijano","Pablo Urcola","Luis Montesano","Ruben Martinez-Cantin","Ana C. Murillo"],"pdf_url":"https://arxiv.org/pdf/2404.01801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13756v2","updated":"2024-04-02T10:01:44Z","published":"2024-02-21T12:34:31Z","title":"High-throughput Visual Nano-drone to Nano-drone Relative Localization\n using Onboard Fully Convolutional Networks","summary":" Relative drone-to-drone localization is a fundamental building block for any\nswarm operations. 
We address this task in the context of miniaturized\nnano-drones, i.e., 10cm in diameter, which show an ever-growing interest due to\nnovel use cases enabled by their reduced form factor. The price for their\nversatility comes with limited onboard resources, i.e., sensors, processing\nunits, and memory, which limits the complexity of the onboard algorithms. A\ntraditional solution to overcome these limitations is represented by\nlightweight deep learning models directly deployed aboard nano-drones. This\nwork tackles the challenging relative pose estimation between nano-drones using\nonly a gray-scale low-resolution camera and an ultra-low-power System-on-Chip\n(SoC) hosted onboard. We present a vertically integrated system based on a\nnovel vision-based fully convolutional neural network (FCNN), which runs at\n39Hz within 101mW onboard a Crazyflie nano-drone extended with the GWT GAP8\nSoC. We compare our FCNN against three State-of-the-Art (SoA) systems.\nConsidering the best-performing SoA approach, our model results in an R-squared\nimprovement from 32 to 47% on the horizontal image coordinate and from 18 to\n55% on the vertical image coordinate, on a real-world dataset of 30k images.\nFinally, our in-field tests show a reduction of the average tracking error of\n37% compared to a previous SoA work and an endurance performance up to the\nentire battery lifetime of 4 minutes.\n","authors":["Luca Crupi","Alessandro Giusti","Daniele Palossi"],"pdf_url":"https://arxiv.org/pdf/2402.13756v2.pdf","comment":"ICRA 2024, IEEE Conference"},{"id":"http://arxiv.org/abs/2403.16578v2","updated":"2024-04-02T09:55:02Z","published":"2024-03-25T09:43:56Z","title":"SegICL: A Universal In-context Learning Framework for Enhanced\n Segmentation in Medical Imaging","summary":" Medical image segmentation models adapting to new tasks in a training-free\nmanner through in-context learning is an exciting advancement. Universal\nsegmentation models aim to generalize across the diverse modality of medical\nimages, yet their effectiveness often diminishes when applied to\nout-of-distribution (OOD) data modalities and tasks, requiring intricate\nfine-tuning of model for optimal performance. For addressing this challenge, we\nintroduce SegICL, a novel approach leveraging In-Context Learning (ICL) for\nimage segmentation. Unlike existing methods, SegICL has the capability to\nemploy text-guided segmentation and conduct in-context learning with a small\nset of image-mask pairs, eliminating the need for training the model from\nscratch or fine-tuning for OOD tasks (including OOD modality and dataset).\nExtensive experimental validation of SegICL demonstrates a positive correlation\nbetween the number of prompt samples and segmentation performance on OOD\nmodalities and tasks. This indicates that SegICL effectively address new\nsegmentation tasks based on contextual information. Additionally, SegICL also\nexhibits comparable segmentation performance to mainstream models on OOD and\nin-distribution tasks. Our code will be released soon.\n","authors":["Lingdong Shen","Fangxin Shang","Yehui Yang","Xiaoshuang Huang","Shiming Xiang"],"pdf_url":"https://arxiv.org/pdf/2403.16578v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17702v2","updated":"2024-04-02T09:54:39Z","published":"2024-03-26T13:40:52Z","title":"The Solution for the CVPR 2023 1st foundation model challenge-Track2","summary":" In this paper, we propose a solution for cross-modal transportation\nretrieval. 
Due to the cross-domain problem of traffic images, we divide the\nproblem into two sub-tasks of pedestrian retrieval and vehicle retrieval\nthrough a simple strategy. In pedestrian retrieval tasks, we use IRRA as the\nbase model and specifically design an Attribute Classification to mine the\nknowledge implied by attribute labels. More importantly, We use the strategy of\nInclusion Relation Matching to make the image-text pairs with inclusion\nrelation have similar representation in the feature space. For the vehicle\nretrieval task, we use BLIP as the base model. Since aligning the color\nattributes of vehicles is challenging, we introduce attribute-based object\ndetection techniques to add color patch blocks to vehicle images for color data\naugmentation. This serves as strong prior information, helping the model\nperform the image-text alignment. At the same time, we incorporate labeled\nattributes into the image-text alignment loss to learn fine-grained alignment\nand prevent similar images and texts from being incorrectly separated. Our\napproach ranked first in the final B-board test with a score of 70.9.\n","authors":["Haonan Xu","Yurui Huang","Sishun Pan","Zhihao Guan","Yi Xu","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2403.17702v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01790v1","updated":"2024-04-02T09:53:20Z","published":"2024-04-02T09:53:20Z","title":"Super-Resolution Analysis for Landfill Waste Classification","summary":" Illegal landfills are a critical issue due to their environmental, economic,\nand public health impacts. This study leverages aerial imagery for\nenvironmental crime monitoring. While advances in artificial intelligence and\ncomputer vision hold promise, the challenge lies in training models with\nhigh-resolution literature datasets and adapting them to open-access\nlow-resolution images. Considering the substantial quality differences and\nlimited annotation, this research explores the adaptability of models across\nthese domains. Motivated by the necessity for a comprehensive evaluation of\nwaste detection algorithms, it advocates cross-domain classification and\nsuper-resolution enhancement to analyze the impact of different image\nresolutions on waste classification as an evaluation to combat the\nproliferation of illegal landfills. We observed performance improvements by\nenhancing image quality but noted an influence on model sensitivity,\nnecessitating careful threshold fine-tuning.\n","authors":["Matias Molina","Rita P. Ribeiro","Bruno Veloso","João Gama"],"pdf_url":"https://arxiv.org/pdf/2404.01790v1.pdf","comment":"This article has been accepted by the Symposium on Intelligent Data\n Analysis (IDA 2024)"},{"id":"http://arxiv.org/abs/2305.05726v2","updated":"2024-04-02T09:52:41Z","published":"2023-05-09T19:17:07Z","title":"Vision-Language Models in Remote Sensing: Current Progress and Future\n Trends","summary":" The remarkable achievements of ChatGPT and GPT-4 have sparked a wave of\ninterest and research in the field of large language models for Artificial\nGeneral Intelligence (AGI). These models provide intelligent solutions close to\nhuman thinking, enabling us to use general artificial intelligence to solve\nproblems in various applications. 
However, in remote sensing (RS), the\nscientific literature on the implementation of AGI remains relatively scant.\nExisting AI-related research in remote sensing primarily focuses on visual\nunderstanding tasks while neglecting the semantic understanding of the objects\nand their relationships. This is where vision-language models excel, as they\nenable reasoning about images and their associated textual descriptions,\nallowing for a deeper understanding of the underlying semantics.\nVision-language models can go beyond visual recognition of RS images, model\nsemantic relationships, and generate natural language descriptions of the\nimage. This makes them better suited for tasks requiring visual and textual\nunderstanding, such as image captioning, and visual question answering. This\npaper provides a comprehensive review of the research on vision-language models\nin remote sensing, summarizing the latest progress, highlighting challenges,\nand identifying potential research opportunities.\n","authors":["Xiang Li","Congcong Wen","Yuan Hu","Zhenghang Yuan","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2305.05726v2.pdf","comment":"Accepted by IEEE Geoscience and Remote Sensing Magazine"},{"id":"http://arxiv.org/abs/2404.01780v1","updated":"2024-04-02T09:44:30Z","published":"2024-04-02T09:44:30Z","title":"CSST Strong Lensing Preparation: a Framework for Detecting Strong Lenses\n in the Multi-color Imaging Survey by the China Survey Space Telescope (CSST)","summary":" Strong gravitational lensing is a powerful tool for investigating dark matter\nand dark energy properties. With the advent of large-scale sky surveys, we can\ndiscover strong lensing systems on an unprecedented scale, which requires\nefficient tools to extract them from billions of astronomical objects. The\nexisting mainstream lens-finding tools are based on machine learning algorithms\nand applied to cut-out-centered galaxies. However, according to the design and\nsurvey strategy of optical surveys by CSST, preparing cutouts with multiple\nbands requires considerable efforts. To overcome these challenges, we have\ndeveloped a framework based on a hierarchical visual Transformer with a sliding\nwindow technique to search for strong lensing systems within entire images.\nMoreover, given that multi-color images of strong lensing systems can provide\ninsights into their physical characteristics, our framework is specifically\ncrafted to identify strong lensing systems in images with any number of\nchannels. As evaluated using CSST mock data based on an Semi-Analytic Model\nnamed CosmoDC2, our framework achieves precision and recall rates of 0.98 and\n0.90, respectively. To evaluate the effectiveness of our method in real\nobservations, we have applied it to a subset of images from the DESI Legacy\nImaging Surveys and media images from Euclid Early Release Observations. 61 new\nstrong lensing system candidates are discovered by our method. However, we also\nidentified false positives arising primarily from the simplified galaxy\nmorphology assumptions within the simulation. 
This underscores the practical\nlimitations of our approach while simultaneously highlighting potential avenues\nfor future improvements.\n","authors":["Xu Li","Ruiqi Sun","Jiameng Lv","Peng Jia","Nan Li","Chengliang Wei","Zou Hu","Xinzhong Er","Yun Chen","Zhang Ban","Yuedong Fang","Qi Guo","Dezi Liu","Guoliang Li","Lin Lin","Ming Li","Ran Li","Xiaobo Li","Yu Luo","Xianmin Meng","Jundan Nie","Zhaoxiang Qi","Yisheng Qiu","Li Shao","Hao Tian","Lei Wang","Wei Wang","Jingtian Xian","Youhua Xu","Tianmeng Zhang","Xin Zhang","Zhimin Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.01780v1.pdf","comment":"The paper is accepted by the AJ. The complete code could be\n downloaded with DOI of: 10.12149/101393. Comments are welcome"},{"id":"http://arxiv.org/abs/2312.02152v2","updated":"2024-04-02T09:40:33Z","published":"2023-12-04T18:59:44Z","title":"Steerers: A framework for rotation equivariant keypoint descriptors","summary":" Image keypoint descriptions that are discriminative and matchable over large\nchanges in viewpoint are vital for 3D reconstruction. However, descriptions\noutput by learned descriptors are typically not robust to camera rotation.\nWhile they can be made more robust by, e.g., data augmentation, this degrades\nperformance on upright images. Another approach is test-time augmentation,\nwhich incurs a significant increase in runtime. Instead, we learn a linear\ntransform in description space that encodes rotations of the input image. We\ncall this linear transform a steerer since it allows us to transform the\ndescriptions as if the image was rotated. From representation theory, we know\nall possible steerers for the rotation group. Steerers can be optimized (A)\ngiven a fixed descriptor, (B) jointly with a descriptor or (C) we can optimize\na descriptor given a fixed steerer. We perform experiments in these three\nsettings and obtain state-of-the-art results on the rotation invariant image\nmatching benchmarks AIMS and Roto-360. We publish code and model weights at\nhttps://github.com/georg-bn/rotation-steerers.\n","authors":["Georg Bökman","Johan Edstedt","Michael Felsberg","Fredrik Kahl"],"pdf_url":"https://arxiv.org/pdf/2312.02152v2.pdf","comment":"CVPR 2024 Camera ready"},{"id":"http://arxiv.org/abs/2404.01775v1","updated":"2024-04-02T09:40:22Z","published":"2024-04-02T09:40:22Z","title":"A noisy elephant in the room: Is your out-of-distribution detector\n robust to label noise?","summary":" The ability to detect unfamiliar or unexpected images is essential for safe\ndeployment of computer vision systems. In the context of classification, the\ntask of detecting images outside of a model's training domain is known as\nout-of-distribution (OOD) detection. While there has been a growing research\ninterest in developing post-hoc OOD detection methods, there has been\ncomparably little discussion around how these methods perform when the\nunderlying classifier is not trained on a clean, carefully curated dataset. In\nthis work, we take a closer look at 20 state-of-the-art OOD detection methods\nin the (more realistic) scenario where the labels used to train the underlying\nclassifier are unreliable (e.g. crowd-sourced or web-scraped labels). Extensive\nexperiments across different datasets, noise types & levels, architectures and\ncheckpointing strategies provide insights into the effect of class label noise\non OOD detection, and show that poor separation between incorrectly classified\nID samples vs. OOD samples is an overlooked yet important limitation of\nexisting methods. 
Code: https://github.com/glhr/ood-labelnoise\n","authors":["Galadrielle Humblot-Renaux","Sergio Escalera","Thomas B. Moeslund"],"pdf_url":"https://arxiv.org/pdf/2404.01775v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2402.17971v2","updated":"2024-04-02T09:32:51Z","published":"2024-02-28T01:32:59Z","title":"All in an Aggregated Image for In-Image Learning","summary":" This paper introduces a new in-context learning (ICL) mechanism called\nIn-Image Learning (I$^2$L) that combines demonstration examples, visual cues,\nand chain-of-thought reasoning into an aggregated image to enhance the\ncapabilities of Large Multimodal Models (e.g., GPT-4V) in multimodal reasoning\ntasks. Unlike previous approaches that rely on converting images to text or\nincorporating visual input into language models, I$^2$L consolidates all\ninformation into an aggregated image and leverages image processing,\nunderstanding, and reasoning abilities. This has several advantages: it reduces\ninaccurate textual descriptions of complex images, provides flexibility in\npositioning demonstration examples, and avoids multiple input images and\nlengthy prompts. We also introduce I$^2$L-Hybrid, a method that combines the\nstrengths of I$^2$L with other ICL methods. Specifically, it uses an automatic\nstrategy to select the most suitable method (I$^2$L or another certain ICL\nmethod) for a specific task instance. We conduct extensive experiments to\nassess the effectiveness of I$^2$L and I$^2$L-Hybrid on MathVista, which covers\na variety of complex multimodal reasoning tasks. Additionally, we investigate\nthe influence of image resolution, the number of demonstration examples in a\nsingle image, and the positions of these demonstrations in the aggregated image\non the effectiveness of I$^2$L. Our code is publicly available at\nhttps://github.com/AGI-Edgerunners/IIL.\n","authors":["Lei Wang","Wanyu Xu","Zhiqiang Hu","Yihuai Lan","Shan Dong","Hao Wang","Roy Ka-Wei Lee","Ee-Peng Lim"],"pdf_url":"https://arxiv.org/pdf/2402.17971v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2404.01765v1","updated":"2024-04-02T09:31:06Z","published":"2024-04-02T09:31:06Z","title":"Guidelines for Cerebrovascular Segmentation: Managing Imperfect\n Annotations in the context of Semi-Supervised Learning","summary":" Segmentation in medical imaging is an essential and often preliminary task in\nthe image processing chain, driving numerous efforts towards the design of\nrobust segmentation algorithms. Supervised learning methods achieve excellent\nperformances when fed with a sufficient amount of labeled data. However, such\nlabels are typically highly time-consuming, error-prone and expensive to\nproduce. Alternatively, semi-supervised learning approaches leverage both\nlabeled and unlabeled data, and are very useful when only a small fraction of\nthe dataset is labeled. They are particularly useful for cerebrovascular\nsegmentation, given that labeling a single volume requires several hours for an\nexpert. In addition to the challenge posed by insufficient annotations, there\nare concerns regarding annotation consistency. The task of annotating the\ncerebrovascular tree is inherently ambiguous. Due to the discrete nature of\nimages, the borders and extremities of vessels are often unclear. Consequently,\nannotations heavily rely on the expert subjectivity and on the underlying\nclinical objective. 
These discrepancies significantly increase the complexity\nof the segmentation task for the model and consequently impair the results.\nConsequently, it becomes imperative to provide clinicians with precise\nguidelines to improve the annotation process and construct more uniform\ndatasets. In this article, we investigate the data dependency of deep learning\nmethods within the context of imperfect data and semi-supervised learning, for\ncerebrovascular segmentation. Specifically, this study compares various\nstate-of-the-art semi-supervised methods based on unsupervised regularization\nand evaluates their performance in diverse quantity and quality data scenarios.\nBased on these experiments, we provide guidelines for the annotation and\ntraining of cerebrovascular segmentation models.\n","authors":["Pierre Rougé","Pierre-Henri Conze","Nicolas Passat","Odyssée Merveille"],"pdf_url":"https://arxiv.org/pdf/2404.01765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10208v2","updated":"2024-04-02T09:20:50Z","published":"2024-01-18T18:50:16Z","title":"MM-Interleaved: Interleaved Image-Text Generative Modeling via\n Multi-modal Feature Synchronizer","summary":" Developing generative models for interleaved image-text data has both\nresearch and practical value. It requires models to understand the interleaved\nsequences and subsequently generate images and text. However, existing attempts\nare limited by the issue that the fixed number of visual tokens cannot\nefficiently capture image details, which is particularly problematic in the\nmulti-image scenarios. To address this, this paper presents MM-Interleaved, an\nend-to-end generative model for interleaved image-text data. It introduces a\nmulti-scale and multi-image feature synchronizer module, allowing direct access\nto fine-grained image features in the previous context during the generation\nprocess. MM-Interleaved is end-to-end pre-trained on both paired and\ninterleaved image-text corpora. It is further enhanced through a supervised\nfine-tuning phase, wherein the model improves its ability to follow complex\nmulti-modal instructions. Experiments demonstrate the versatility of\nMM-Interleaved in recognizing visual details following multi-modal instructions\nand generating consistent images following both textual and visual conditions.\nCode and models are available at\n\\url{https://github.com/OpenGVLab/MM-Interleaved}.\n","authors":["Changyao Tian","Xizhou Zhu","Yuwen Xiong","Weiyun Wang","Zhe Chen","Wenhai Wang","Yuntao Chen","Lewei Lu","Tong Lu","Jie Zhou","Hongsheng Li","Yu Qiao","Jifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2401.10208v2.pdf","comment":"20 pages, 9 figures, 17 tables"},{"id":"http://arxiv.org/abs/2404.01758v1","updated":"2024-04-02T09:18:52Z","published":"2024-04-02T09:18:52Z","title":"GEARS: Local Geometry-aware Hand-object Interaction Synthesis","summary":" Generating realistic hand motion sequences in interaction with objects has\ngained increasing attention with the growing interest in digital humans. Prior\nwork has illustrated the effectiveness of employing occupancy-based or\ndistance-based virtual sensors to extract hand-object interaction features.\nNonetheless, these methods show limited generalizability across object\ncategories, shapes and sizes. We hypothesize that this is due to two reasons:\n1) the limited expressiveness of employed virtual sensors, and 2) scarcity of\navailable training data. 
To tackle this challenge, we introduce a novel\njoint-centered sensor designed to reason about local object geometry near\npotential interaction regions. The sensor queries for object surface points in\nthe neighbourhood of each hand joint. As an important step towards mitigating\nthe learning complexity, we transform the points from global frame to hand\ntemplate frame and use a shared module to process sensor features of each\nindividual joint. This is followed by a spatio-temporal transformer network\naimed at capturing correlation among the joints in different dimensions.\nMoreover, we devise simple heuristic rules to augment the limited training\nsequences with vast static hand grasping samples. This leads to a broader\nspectrum of grasping types observed during training, in turn enhancing our\nmodel's generalization capability. We evaluate on two public datasets, GRAB and\nInterCap, where our method shows superiority over baselines both quantitatively\nand perceptually.\n","authors":["Keyang Zhou","Bharat Lal Bhatnagar","Jan Eric Lenssen","Gerard Pons-moll"],"pdf_url":"https://arxiv.org/pdf/2404.01758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06725v3","updated":"2024-04-02T09:18:36Z","published":"2023-12-11T05:20:52Z","title":"EpiDiff: Enhancing Multi-View Synthesis via Localized\n Epipolar-Constrained Diffusion","summary":" Generating multiview images from a single view facilitates the rapid\ngeneration of a 3D mesh conditioned on a single image. Recent methods that\nintroduce 3D global representation into diffusion models have shown the\npotential to generate consistent multiviews, but they have reduced generation\nspeed and face challenges in maintaining generalizability and quality. To\naddress this issue, we propose EpiDiff, a localized interactive multiview\ndiffusion model. At the core of the proposed approach is to insert a\nlightweight epipolar attention block into the frozen diffusion model,\nleveraging epipolar constraints to enable cross-view interaction among feature\nmaps of neighboring views. The newly initialized 3D modeling module preserves\nthe original feature distribution of the diffusion model, exhibiting\ncompatibility with a variety of base diffusion models. Experiments show that\nEpiDiff generates 16 multiview images in just 12 seconds, and it surpasses\nprevious methods in quality evaluation metrics, including PSNR, SSIM and LPIPS.\nAdditionally, EpiDiff can generate a more diverse distribution of views,\nimproving the reconstruction quality from generated multiviews. Please see our\nproject page at https://huanngzh.github.io/EpiDiff/.\n","authors":["Zehuan Huang","Hao Wen","Junting Dong","Yaohui Wang","Yangguang Li","Xinyuan Chen","Yan-Pei Cao","Ding Liang","Yu Qiao","Bo Dai","Lu Sheng"],"pdf_url":"https://arxiv.org/pdf/2312.06725v3.pdf","comment":"Project page: https://huanngzh.github.io/EpiDiff/"},{"id":"http://arxiv.org/abs/2306.05423v2","updated":"2024-04-02T09:12:29Z","published":"2023-06-08T17:59:32Z","title":"ADDP: Learning General Representations for Image Recognition and\n Generation with Alternating Denoising Diffusion Process","summary":" Image recognition and generation have long been developed independently of\neach other. With the recent trend towards general-purpose representation\nlearning, the development of general representations for both recognition and\ngeneration tasks is also promoted. However, preliminary attempts mainly focus\non generation performance, but are still inferior on recognition tasks. 
These\nmethods are modeled in the vector-quantized (VQ) space, whereas leading\nrecognition methods use pixels as inputs. Our key insights are twofold: (1)\npixels as inputs are crucial for recognition tasks; (2) VQ tokens as\nreconstruction targets are beneficial for generation tasks. These observations\nmotivate us to propose an Alternating Denoising Diffusion Process (ADDP) that\nintegrates these two spaces within a single representation learning framework.\nIn each denoising step, our method first decodes pixels from previous VQ\ntokens, then generates new VQ tokens from the decoded pixels. The diffusion\nprocess gradually masks out a portion of VQ tokens to construct the training\nsamples. The learned representations can be used to generate diverse\nhigh-fidelity images and also demonstrate excellent transfer performance on\nrecognition tasks. Extensive experiments show that our method achieves\ncompetitive performance on unconditional generation, ImageNet classification,\nCOCO detection, and ADE20k segmentation. Importantly, our method represents the\nfirst successful development of general representations applicable to both\ngeneration and dense recognition tasks. Code is released at\n\\url{https://github.com/ChangyaoTian/ADDP}.\n","authors":["Changyao Tian","Chenxin Tao","Jifeng Dai","Hao Li","Ziheng Li","Lewei Lu","Xiaogang Wang","Hongsheng Li","Gao Huang","Xizhou Zhu"],"pdf_url":"https://arxiv.org/pdf/2306.05423v2.pdf","comment":"Accepted by ICLR2024"},{"id":"http://arxiv.org/abs/2404.01751v1","updated":"2024-04-02T09:07:05Z","published":"2024-04-02T09:07:05Z","title":"T-VSL: Text-Guided Visual Sound Source Localization in Mixtures","summary":" Visual sound source localization poses a significant challenge in identifying\nthe semantic region of each sounding source within a video. Existing\nself-supervised and weakly supervised source localization methods struggle to\naccurately distinguish the semantic regions of each sounding object,\nparticularly in multi-source mixtures. These methods often rely on audio-visual\ncorrespondence as guidance, which can lead to substantial performance drops in\ncomplex multi-source localization scenarios. The lack of access to individual\nsource sounds in multi-source mixtures during training exacerbates the\ndifficulty of learning effective audio-visual correspondence for localization.\nTo address this limitation, in this paper, we propose incorporating the text\nmodality as an intermediate feature guide using tri-modal joint embedding\nmodels (e.g., AudioCLIP) to disentangle the semantic audio-visual source\ncorrespondence in multi-source mixtures. Our framework, dubbed T-VSL, begins by\npredicting the class of sounding entities in mixtures. Subsequently, the\ntextual representation of each sounding source is employed as guidance to\ndisentangle fine-grained audio-visual source correspondence from multi-source\nmixtures, leveraging the tri-modal AudioCLIP embedding. This approach enables\nour framework to handle a flexible number of sources and exhibits promising\nzero-shot transferability to unseen classes during test time. Extensive\nexperiments conducted on the MUSIC, VGGSound, and VGGSound-Instruments datasets\ndemonstrate significant performance improvements over state-of-the-art methods.\n","authors":["Tanvir Mahmud","Yapeng Tian","Diana Marculescu"],"pdf_url":"https://arxiv.org/pdf/2404.01751v1.pdf","comment":"Tech report. 
Accepted in CVPR-2024"},{"id":"http://arxiv.org/abs/2404.01750v1","updated":"2024-04-02T09:05:47Z","published":"2024-04-02T09:05:47Z","title":"Exploring Latent Pathways: Enhancing the Interpretability of Autonomous\n Driving with a Variational Autoencoder","summary":" Autonomous driving presents a complex challenge, which is usually addressed\nwith artificial intelligence models that are end-to-end or modular in nature.\nWithin the landscape of modular approaches, a bio-inspired neural circuit\npolicy model has emerged as an innovative control module, offering a compact\nand inherently interpretable system to infer a steering wheel command from\nabstract visual features. Here, we take a leap forward by integrating a\nvariational autoencoder with the neural circuit policy controller, forming a\nsolution that directly generates steering commands from input camera images. By\nsubstituting the traditional convolutional neural network approach to feature\nextraction with a variational autoencoder, we enhance the system's\ninterpretability, enabling a more transparent and understandable\ndecision-making process.\n In addition to the architectural shift toward a variational autoencoder, this\nstudy introduces the automatic latent perturbation tool, a novel contribution\ndesigned to probe and elucidate the latent features within the variational\nautoencoder. The automatic latent perturbation tool automates the\ninterpretability process, offering granular insights into how specific latent\nvariables influence the overall model's behavior. Through a series of numerical\nexperiments, we demonstrate the interpretative power of the variational\nautoencoder-neural circuit policy model and the utility of the automatic latent\nperturbation tool in making the inner workings of autonomous driving systems\nmore transparent.\n","authors":["Anass Bairouk","Mirjana Maras","Simon Herlin","Alexander Amini","Marc Blanchon","Ramin Hasani","Patrick Chareyre","Daniela Rus"],"pdf_url":"https://arxiv.org/pdf/2404.01750v1.pdf","comment":"Submitted to 2024 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2404.01748v1","updated":"2024-04-02T09:04:56Z","published":"2024-04-02T09:04:56Z","title":"Global Mapping of Exposure and Physical Vulnerability Dynamics in Least\n Developed Countries using Remote Sensing and Machine Learning","summary":" As the world marked the midterm of the Sendai Framework for Disaster Risk\nReduction 2015-2030, many countries are still struggling to monitor their\nclimate and disaster risk because of the expensive large-scale survey of the\ndistribution of exposure and physical vulnerability and, hence, are not on\ntrack in reducing risks amidst the intensifying effects of climate change. We\npresent an ongoing effort in mapping this vital information using machine\nlearning and time-series remote sensing from publicly available Sentinel-1 SAR\nGRD and Sentinel-2 Harmonized MSI. We introduce the development of\n\"OpenSendaiBench\" consisting of 47 countries wherein most are least developed\n(LDCs), trained ResNet-50 deep learning models, and demonstrated the region of\nDhaka, Bangladesh by mapping the distribution of its informal constructions. 
As\na pioneering effort in auditing global disaster risk over time, this paper aims\nto advance the area of large-scale risk quantification in informing our\ncollective long-term efforts in reducing climate and disaster risk.\n","authors":["Joshua Dimasaka","Christian Geiß","Emily So"],"pdf_url":"https://arxiv.org/pdf/2404.01748v1.pdf","comment":"This is the camera-ready paper for the accepted poster at the 2nd\n Machine Learning for Remote Sensing Workshop, 12th International Conference\n on Learning Representations (ICLR) in Vienna, Austria, on the 11th of May\n 2024. Access the poster here: https://zenodo.org/doi/10.5281/zenodo.10903886\n Watch the video version of our poster here: https://youtu.be/N6ithJeCF4M"},{"id":"http://arxiv.org/abs/2402.11791v4","updated":"2024-04-02T09:02:04Z","published":"2024-02-19T02:41:37Z","title":"SDGE: Stereo Guided Depth Estimation for 360$^\\circ$ Camera Sets","summary":" Depth estimation is a critical technology in autonomous driving, and\nmulti-camera systems are often used to achieve a 360$^\\circ$ perception. These\n360$^\\circ$ camera sets often have limited or low-quality overlap regions,\nmaking multi-view stereo methods infeasible for the entire image.\nAlternatively, monocular methods may not produce consistent cross-view\npredictions. To address these issues, we propose the Stereo Guided Depth\nEstimation (SGDE) method, which enhances depth estimation of the full image by\nexplicitly utilizing multi-view stereo results on the overlap. We suggest\nbuilding virtual pinhole cameras to resolve the distortion problem of fisheye\ncameras and unify the processing for the two types of 360$^\\circ$ cameras. For\nhandling the varying noise on camera poses caused by unstable movement, the\napproach employs a self-calibration method to obtain highly accurate relative\nposes of the adjacent cameras with minor overlap. These enable the use of\nrobust stereo methods to obtain high-quality depth prior in the overlap region.\nThis prior serves not only as an additional input but also as pseudo-labels\nthat enhance the accuracy of depth estimation methods and improve cross-view\nprediction consistency. The effectiveness of SGDE is evaluated on one fisheye\ncamera dataset, Synthetic Urban, and two pinhole camera datasets, DDAD and\nnuScenes. Our experiments demonstrate that SGDE is effective for both\nsupervised and self-supervised depth estimation, and highlight the potential of\nour method for advancing downstream autonomous driving technologies, such as 3D\nobject detection and occupancy prediction.\n","authors":["Jialei Xu","Wei Yin","Dong Gong","Junjun Jiang","Xianming Liu"],"pdf_url":"https://arxiv.org/pdf/2402.11791v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01745v1","updated":"2024-04-02T09:01:58Z","published":"2024-04-02T09:01:58Z","title":"Unleash the Potential of CLIP for Video Highlight Detection","summary":" Multimodal and large language models (LLMs) have revolutionized the\nutilization of open-world knowledge, unlocking novel potentials across various\ntasks and applications. Among these domains, the video domain has notably\nbenefited from their capabilities. In this paper, we present Highlight-CLIP\n(HL-CLIP), a method designed to excel in the video highlight detection task by\nleveraging the pre-trained knowledge embedded in multimodal models. 
By simply\nfine-tuning the multimodal encoder in combination with our innovative saliency\npooling technique, we have achieved the state-of-the-art performance in the\nhighlight detection task, the QVHighlight Benchmark, to the best of our\nknowledge.\n","authors":["Donghoon Han","Seunghyeon Seo","Eunhwan Park","Seong-Uk Nam","Nojun Kwak"],"pdf_url":"https://arxiv.org/pdf/2404.01745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01743v1","updated":"2024-04-02T09:01:21Z","published":"2024-04-02T09:01:21Z","title":"Atom-Level Optical Chemical Structure Recognition with Limited\n Supervision","summary":" Identifying the chemical structure from a graphical representation, or image,\nof a molecule is a challenging pattern recognition task that would greatly\nbenefit drug development. Yet, existing methods for chemical structure\nrecognition do not typically generalize well, and show diminished effectiveness\nwhen confronted with domains where data is sparse, or costly to generate, such\nas hand-drawn molecule images. To address this limitation, we propose a new\nchemical structure recognition tool that delivers state-of-the-art performance\nand can adapt to new domains with a limited number of data samples and\nsupervision. Unlike previous approaches, our method provides atom-level\nlocalization, and can therefore segment the image into the different atoms and\nbonds. Our model is the first model to perform OCSR with atom-level entity\ndetection with only SMILES supervision. Through rigorous and extensive\nbenchmarking, we demonstrate the preeminence of our chemical structure\nrecognition approach in terms of data efficiency, accuracy, and atom-level\nentity prediction.\n","authors":["Martijn Oldenhof","Edward De Brouwer","Adam Arany","Yves Moreau"],"pdf_url":"https://arxiv.org/pdf/2404.01743v1.pdf","comment":"Accepted in IEEE/CVF Conference on Computer Vision and Pattern\n Recognition 2024"},{"id":"http://arxiv.org/abs/2311.02072v2","updated":"2024-04-02T09:00:38Z","published":"2023-11-03T17:54:59Z","title":"HIPTrack: Visual Tracking with Historical Prompts","summary":" Trackers that follow Siamese paradigm utilize similarity matching between\ntemplate and search region features for tracking. Many methods have been\nexplored to enhance tracking performance by incorporating tracking history to\nbetter handle scenarios involving target appearance variations such as\ndeformation and occlusion. However, the utilization of historical information\nin existing methods is insufficient and incomprehensive, which typically\nrequires repetitive training and introduces a large amount of computation. In\nthis paper, we show that by providing a tracker that follows Siamese paradigm\nwith precise and updated historical information, a significant performance\nimprovement can be achieved with completely unchanged parameters. Based on\nthis, we propose a historical prompt network that uses refined historical\nforeground masks and historical visual features of the target to provide\ncomprehensive and precise prompts for the tracker. We build a novel tracker\ncalled HIPTrack based on the historical prompt network, which achieves\nconsiderable performance improvements without the need to retrain the entire\nmodel. We conduct experiments on seven datasets and experimental results\ndemonstrate that our method surpasses the current state-of-the-art trackers on\nLaSOT, LaSOText, GOT-10k and NfS. 
Furthermore, the historical prompt network\ncan seamlessly integrate as a plug-and-play module into existing trackers,\nproviding performance enhancements. The source code is available at\nhttps://github.com/WenRuiCai/HIPTrack.\n","authors":["Wenrui Cai","Qingjie Liu","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2311.02072v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2403.03954v2","updated":"2024-04-02T08:59:57Z","published":"2024-03-06T18:58:49Z","title":"3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple\n 3D Representations","summary":" Imitation learning provides an efficient way to teach robots dexterous\nskills; however, learning complex skills robustly and generalizablely usually\nconsumes large amounts of human demonstrations. To tackle this challenging\nproblem, we present 3D Diffusion Policy (DP3), a novel visual imitation\nlearning approach that incorporates the power of 3D visual representations into\ndiffusion policies, a class of conditional action generative models. The core\ndesign of DP3 is the utilization of a compact 3D visual representation,\nextracted from sparse point clouds with an efficient point encoder. In our\nexperiments involving 72 simulation tasks, DP3 successfully handles most tasks\nwith just 10 demonstrations and surpasses baselines with a 24.2% relative\nimprovement. In 4 real robot tasks, DP3 demonstrates precise control with a\nhigh success rate of 85%, given only 40 demonstrations of each task, and shows\nexcellent generalization abilities in diverse aspects, including space,\nviewpoint, appearance, and instance. Interestingly, in real robot experiments,\nDP3 rarely violates safety requirements, in contrast to baseline methods which\nfrequently do, necessitating human intervention. Our extensive evaluation\nhighlights the critical importance of 3D representations in real-world robot\nlearning. Videos, code, and data are available on\nhttps://3d-diffusion-policy.github.io .\n","authors":["Yanjie Ze","Gu Zhang","Kangning Zhang","Chenyuan Hu","Muhan Wang","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2403.03954v2.pdf","comment":"Videos, code, and data: https://3d-diffusion-policy.github.io"},{"id":"http://arxiv.org/abs/2307.09591v3","updated":"2024-04-02T08:55:51Z","published":"2023-07-18T19:56:20Z","title":"Saliency strikes back: How filtering out high frequencies improves\n white-box explanations","summary":" Attribution methods correspond to a class of explainability methods (XAI)\nthat aim to assess how individual inputs contribute to a model's\ndecision-making process. We have identified a significant limitation in one\ntype of attribution methods, known as \"white-box\" methods. Although highly\nefficient, these methods rely on a gradient signal that is often contaminated\nby high-frequency noise. To overcome this limitation, we introduce a new\napproach called \"FORGrad\". This simple method effectively filters out noise\nartifacts by using optimal cut-off frequencies tailored to the unique\ncharacteristics of each model architecture. Our findings show that FORGrad\nconsistently enhances the performance of already existing white-box methods,\nenabling them to compete effectively with more accurate yet computationally\ndemanding \"black-box\" methods. 
We anticipate that our research will foster\nbroader adoption of simpler and more efficient white-box methods for\nexplainability, offering a better balance between faithfulness and\ncomputational efficiency.\n","authors":["Sabine Muzellec","Thomas Fel","Victor Boutin","Léo andéol","Rufin VanRullen","Thomas Serre"],"pdf_url":"https://arxiv.org/pdf/2307.09591v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03517v2","updated":"2024-04-02T08:40:54Z","published":"2023-12-06T14:24:26Z","title":"FRDiff : Feature Reuse for Universal Training-free Acceleration of\n Diffusion Models","summary":" The substantial computational costs of diffusion models, especially due to\nthe repeated denoising steps necessary for high-quality image generation,\npresent a major obstacle to their widespread adoption. While several studies\nhave attempted to address this issue by reducing the number of score function\nevaluations (NFE) using advanced ODE solvers without fine-tuning, the decreased\nnumber of denoising iterations misses the opportunity to update fine details,\nresulting in noticeable quality degradation. In our work, we introduce an\nadvanced acceleration technique that leverages the temporal redundancy inherent\nin diffusion models. Reusing feature maps with high temporal similarity opens\nup a new opportunity to save computation resources without compromising output\nquality. To realize the practical benefits of this intuition, we conduct an\nextensive analysis and propose a novel method, FRDiff. FRDiff is designed to\nharness the advantages of both reduced NFE and feature reuse, achieving a\nPareto frontier that balances fidelity and latency trade-offs in various\ngenerative tasks.\n","authors":["Junhyuk So","Jungwon Lee","Eunhyeok Park"],"pdf_url":"https://arxiv.org/pdf/2312.03517v2.pdf","comment":"Work in progress. Project page :\n https://jungwon-lee.github.io/Project_FRDiff/"},{"id":"http://arxiv.org/abs/2404.01727v1","updated":"2024-04-02T08:33:21Z","published":"2024-04-02T08:33:21Z","title":"Generalizing 6-DoF Grasp Detection via Domain Prior Knowledge","summary":" We focus on the generalization ability of the 6-DoF grasp detection method in\nthis paper. While learning-based grasp detection methods can predict grasp\nposes for unseen objects using the grasp distribution learned from the training\nset, they often exhibit a significant performance drop when encountering\nobjects with diverse shapes and structures. To enhance the grasp detection\nmethods' generalization ability, we incorporate domain prior knowledge of\nrobotic grasping, enabling better adaptation to objects with significant shape\nand structure differences. More specifically, we employ the physical constraint\nregularization during the training phase to guide the model towards predicting\ngrasps that comply with the physical rule on grasping. 
For the unstable grasp\nposes predicted on novel objects, we design a contact-score joint optimization\nusing the projection contact map to refine these poses in cluttered scenarios.\nExtensive experiments conducted on the GraspNet-1billion benchmark demonstrate\na substantial performance gain on the novel object set and the real-world\ngrasping experiments also demonstrate the effectiveness of our generalizing\n6-DoF grasp detection method.\n","authors":["Haoxiang Ma","Modi Shi","Boyang Gao","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2404.01727v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2401.04728v2","updated":"2024-04-02T08:29:09Z","published":"2024-01-09T18:59:04Z","title":"Morphable Diffusion: 3D-Consistent Diffusion for Single-image Avatar\n Creation","summary":" Recent advances in generative diffusion models have enabled the previously\nunfeasible capability of generating 3D assets from a single input image or a\ntext prompt. In this work, we aim to enhance the quality and functionality of\nthese models for the task of creating controllable, photorealistic human\navatars. We achieve this by integrating a 3D morphable model into the\nstate-of-the-art multi-view-consistent diffusion approach. We demonstrate that\naccurate conditioning of a generative pipeline on the articulated 3D model\nenhances the baseline model performance on the task of novel view synthesis\nfrom a single image. More importantly, this integration facilitates a seamless\nand accurate incorporation of facial expression and body pose control into the\ngeneration process. To the best of our knowledge, our proposed framework is the\nfirst diffusion model to enable the creation of fully 3D-consistent,\nanimatable, and photorealistic human avatars from a single image of an unseen\nsubject; extensive quantitative and qualitative evaluations demonstrate the\nadvantages of our approach over existing state-of-the-art avatar creation\nmodels on both novel view and novel expression synthesis tasks. The code for\nour project is publicly available.\n","authors":["Xiyi Chen","Marko Mihajlovic","Shaofei Wang","Sergey Prokudin","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2401.04728v2.pdf","comment":"[CVPR 2024] Project page:\n https://xiyichen.github.io/morphablediffusion/"},{"id":"http://arxiv.org/abs/2402.10739v3","updated":"2024-04-02T08:26:43Z","published":"2024-02-16T14:56:13Z","title":"PointMamba: A Simple State Space Model for Point Cloud Analysis","summary":" Transformers have become one of the foundational architectures in point cloud\nanalysis tasks due to their excellent global modeling ability. However, the\nattention mechanism has quadratic complexity and is difficult to extend to long\nsequence modeling due to limited computational resources and so on. Recently,\nstate space models (SSM), a new family of deep sequence models, have presented\ngreat potential for sequence modeling in NLP tasks. In this paper, taking\ninspiration from the success of SSM in NLP, we propose PointMamba, a framework\nwith global modeling and linear complexity. Specifically, by taking embedded\npoint patches as input, we proposed a reordering strategy to enhance SSM's\nglobal modeling ability by providing a more logical geometric scanning order.\nThe reordered point tokens are then sent to a series of Mamba blocks to\ncausally capture the point cloud structure. 
Experimental results show our\nproposed PointMamba outperforms the transformer-based counterparts on different\npoint cloud analysis datasets, while significantly saving about 44.3%\nparameters and 25% FLOPs, demonstrating the potential option for constructing\nfoundational 3D vision models. We hope our PointMamba can provide a new\nperspective for point cloud analysis. The code is available at\nhttps://github.com/LMD0311/PointMamba.\n","authors":["Dingkang Liang","Xin Zhou","Xinyu Wang","Xingkui Zhu","Wei Xu","Zhikang Zou","Xiaoqing Ye","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2402.10739v3.pdf","comment":"Work in progress. The code is available at\n https://github.com/LMD0311/PointMamba"},{"id":"http://arxiv.org/abs/2404.01725v1","updated":"2024-04-02T08:21:16Z","published":"2024-04-02T08:21:16Z","title":"Disentangled Pre-training for Human-Object Interaction Detection","summary":" Detecting human-object interaction (HOI) has long been limited by the amount\nof supervised data available. Recent approaches address this issue by\npre-training according to pseudo-labels, which align object regions with HOI\ntriplets parsed from image captions. However, pseudo-labeling is tricky and\nnoisy, making HOI pre-training a complex process. Therefore, we propose an\nefficient disentangled pre-training method for HOI detection (DP-HOI) to\naddress this problem. First, DP-HOI utilizes object detection and action\nrecognition datasets to pre-train the detection and interaction decoder layers,\nrespectively. Then, we arrange these decoder layers so that the pre-training\narchitecture is consistent with the downstream HOI detection task. This\nfacilitates efficient knowledge transfer. Specifically, the detection decoder\nidentifies reliable human instances in each action recognition dataset image,\ngenerates one corresponding query, and feeds it into the interaction decoder\nfor verb classification. Next, we combine the human instance verb predictions\nin the same image and impose image-level supervision. The DP-HOI structure can\nbe easily adapted to the HOI detection task, enabling effective model parameter\ninitialization. Therefore, it significantly enhances the performance of\nexisting HOI detection models on a broad range of rare categories. The code and\npre-trained weight are available at https://github.com/xingaoli/DP-HOI.\n","authors":["Zhuolong Li","Xingao Li","Changxing Ding","Xiangmin Xu"],"pdf_url":"https://arxiv.org/pdf/2404.01725v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.01723v1","updated":"2024-04-02T08:17:39Z","published":"2024-04-02T08:17:39Z","title":"Contextual Embedding Learning to Enhance 2D Networks for Volumetric\n Image Segmentation","summary":" The segmentation of organs in volumetric medical images plays an important\nrole in computer-aided diagnosis and treatment/surgery planning. Conventional\n2D convolutional neural networks (CNNs) can hardly exploit the spatial\ncorrelation of volumetric data. Current 3D CNNs have the advantage to extract\nmore powerful volumetric representations but they usually suffer from occupying\nexcessive memory and computation nevertheless. In this study we aim to enhance\nthe 2D networks with contextual information for better volumetric image\nsegmentation. Accordingly, we propose a contextual embedding learning approach\nto facilitate 2D CNNs capturing spatial information properly. Our approach\nleverages the learned embedding and the slice-wisely neighboring matching as a\nsoft cue to guide the network. 
In such a way, the contextual information can be\ntransferred slice-by-slice thus boosting the volumetric representation of the\nnetwork. Experiments on challenging prostate MRI dataset (PROMISE12) and\nabdominal CT dataset (CHAOS) show that our contextual embedding learning can\neffectively leverage the inter-slice context and improve segmentation\nperformance. The proposed approach is a plug-and-play, and memory-efficient\nsolution to enhance the 2D networks for volumetric segmentation. The code will\nbe publicly available.\n","authors":["Zhuoyuan Wang","Dong Sun","Xiangyun Zeng","Ruodai Wu","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.01723v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.12113v4","updated":"2024-04-02T08:14:57Z","published":"2023-08-23T13:06:59Z","title":"Advancements in Point Cloud Data Augmentation for Deep Learning: A\n Survey","summary":" Deep learning (DL) has become one of the mainstream and effective methods for\npoint cloud analysis tasks such as detection, segmentation and classification.\nTo reduce overfitting during training DL models and improve model performance\nespecially when the amount and/or diversity of training data are limited,\naugmentation is often crucial. Although various point cloud data augmentation\nmethods have been widely used in different point cloud processing tasks, there\nare currently no published systematic surveys or reviews of these methods.\nTherefore, this article surveys these methods, categorizing them into a\ntaxonomy framework that comprises basic and specialized point cloud data\naugmentation methods. Through a comprehensive evaluation of these augmentation\nmethods, this article identifies their potentials and limitations, serving as a\nuseful reference for choosing appropriate augmentation methods. In addition,\npotential directions for future research are recommended. This survey\ncontributes to providing a holistic overview of the current state of point\ncloud data augmentation, promoting its wider application and development.\n","authors":["Qinfeng Zhu","Lei Fan","Ningxin Weng"],"pdf_url":"https://arxiv.org/pdf/2308.12113v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01717v1","updated":"2024-04-02T08:07:38Z","published":"2024-04-02T08:07:38Z","title":"AddSR: Accelerating Diffusion-based Blind Super-Resolution with\n Adversarial Diffusion Distillation","summary":" Blind super-resolution methods based on stable diffusion showcase formidable\ngenerative capabilities in reconstructing clear high-resolution images with\nintricate details from low-resolution inputs. However, their practical\napplicability is often hampered by poor efficiency, stemming from the\nrequirement of thousands or hundreds of sampling steps. Inspired by the\nefficient text-to-image approach adversarial diffusion distillation (ADD), we\ndesign AddSR to address this issue by incorporating the ideas of both\ndistillation and ControlNet. Specifically, we first propose a prediction-based\nself-refinement strategy to provide high-frequency information in the student\nmodel output with marginal additional time cost. Furthermore, we refine the\ntraining process by employing HR images, rather than LR images, to regulate the\nteacher model, providing a more robust constraint for distillation. Second, we\nintroduce a timestep-adapting loss to address the perception-distortion\nimbalance problem introduced by ADD. 
Extensive experiments demonstrate our\nAddSR generates better restoration results, while achieving faster speed than\nprevious SD-based state-of-the-art models (e.g., 7x faster than SeeSR).\n","authors":["Rui Xie","Ying Tai","Kai Zhang","Zhenyu Zhang","Jun Zhou","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01714v1","updated":"2024-04-02T07:57:17Z","published":"2024-04-02T07:57:17Z","title":"Conjugate-Gradient-like Based Adaptive Moment Estimation Optimization\n Algorithm for Deep Learning","summary":" Training deep neural networks is a challenging task. In order to speed up\ntraining and enhance the performance of deep neural networks, we rectify the\nvanilla conjugate gradient as conjugate-gradient-like and incorporate it into\nthe generic Adam, and thus propose a new optimization algorithm named\nCG-like-Adam for deep learning. Specifically, both the first-order and the\nsecond-order moment estimation of generic Adam are replaced by the\nconjugate-gradient-like. Convergence analysis handles the cases where the\nexponential moving average coefficient of the first-order moment estimation is\nconstant and the first-order moment estimation is unbiased. Numerical\nexperiments show the superiority of the proposed algorithm based on the\nCIFAR10/100 dataset.\n","authors":["Jiawu Tian","Liwei Xu","Xiaowei Zhang","Yongqi Li"],"pdf_url":"https://arxiv.org/pdf/2404.01714v1.pdf","comment":"32 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.01709v1","updated":"2024-04-02T07:49:08Z","published":"2024-04-02T07:49:08Z","title":"Upsample Guidance: Scale Up Diffusion Models without Training","summary":" Diffusion models have demonstrated superior performance across various\ngenerative tasks including images, videos, and audio. However, they encounter\ndifficulties in directly generating high-resolution samples. Previously\nproposed solutions to this issue involve modifying the architecture, further\ntraining, or partitioning the sampling process into multiple stages. These\nmethods have the limitation of not being able to directly utilize pre-trained\nmodels as-is, requiring additional work. In this paper, we introduce upsample\nguidance, a technique that adapts pretrained diffusion model (e.g., $512^2$) to\ngenerate higher-resolution images (e.g., $1536^2$) by adding only a single term\nin the sampling process. Remarkably, this technique does not necessitate any\nadditional training or relying on external models. We demonstrate that upsample\nguidance can be applied to various models, such as pixel-space, latent space,\nand video diffusion models. We also observed that the proper selection of\nguidance scale can improve image quality, fidelity, and prompt alignment.\n","authors":["Juno Hwang","Yong-Hyun Park","Junghyo Jo"],"pdf_url":"https://arxiv.org/pdf/2404.01709v1.pdf","comment":"15 pages, 15 Figures"},{"id":"http://arxiv.org/abs/2404.01705v1","updated":"2024-04-02T07:38:16Z","published":"2024-04-02T07:38:16Z","title":"Samba: Semantic Segmentation of Remotely Sensed Images with State Space\n Model","summary":" High-resolution remotely sensed images poses a challenge for commonly used\nsemantic segmentation methods such as Convolutional Neural Network (CNN) and\nVision Transformer (ViT). CNN-based methods struggle with handling such\nhigh-resolution images due to their limited receptive field, while ViT faces\nchallenges to handle long sequences. 
Inspired by Mamba, which adopts a State\nSpace Model (SSM) to efficiently capture global semantic information, we\npropose a semantic segmentation framework for high-resolution remotely sensed\nimages, named Samba. Samba utilizes an encoder-decoder architecture, with Samba\nblocks serving as the encoder for efficient multi-level semantic information\nextraction, and UperNet functioning as the decoder. We evaluate Samba on the\nLoveDA dataset, comparing its performance against top-performing CNN and ViT\nmethods. The results reveal that Samba achieved unparalleled performance on\nLoveDA. This represents that the proposed Samba is an effective application of\nthe SSM in semantic segmentation of remotely sensed images, setting a new\nbenchmark in performance for Mamba-based techniques in this specific\napplication. The source code and baseline implementations are available at\nhttps://github.com/zhuqinfeng1999/Samba.\n","authors":["Qinfeng Zhu","Yuanzhi Cai","Yuan Fang","Yihan Yang","Cheng Chen","Lei Fan","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.01705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01703v1","updated":"2024-04-02T07:16:56Z","published":"2024-04-02T07:16:56Z","title":"Boosting Visual Recognition for Autonomous Driving in Real-world\n Degradations with Deep Channel Prior","summary":" The environmental perception of autonomous vehicles in normal conditions have\nachieved considerable success in the past decade. However, various unfavourable\nconditions such as fog, low-light, and motion blur will degrade image quality\nand pose tremendous threats to the safety of autonomous driving. That is, when\napplied to degraded images, state-of-the-art visual models often suffer\nperformance decline due to the feature content loss and artifact interference\ncaused by statistical and structural properties disruption of captured images.\nTo address this problem, this work proposes a novel Deep Channel Prior (DCP)\nfor degraded visual recognition. Specifically, we observe that, in the deep\nrepresentation space of pre-trained models, the channel correlations of\ndegraded features with the same degradation type have uniform distribution even\nif they have different content and semantics, which can facilitate the mapping\nrelationship learning between degraded and clear representations in\nhigh-sparsity feature space. Based on this, a novel plug-and-play Unsupervised\nFeature Enhancement Module (UFEM) is proposed to achieve unsupervised feature\ncorrection, where the multi-adversarial mechanism is introduced in the first\nstage of UFEM to achieve the latent content restoration and artifact removal in\nhigh-sparsity feature space. Then, the generated features are transferred to\nthe second stage for global correlation modulation under the guidance of DCP to\nobtain high-quality and recognition-friendly features. Evaluations of three\ntasks and eight benchmark datasets demonstrate that our proposed method can\ncomprehensively improve the performance of pre-trained models in real\ndegradation conditions. 
The source code is available at\nhttps://github.com/liyuhang166/Deep_Channel_Prior\n","authors":["Zhanwen Liu","Yuhang Li","Yang Wang","Bolin Gao","Yisheng An","Xiangmo Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.01703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00521v2","updated":"2024-04-02T07:15:34Z","published":"2024-03-31T01:41:36Z","title":"CHAIN: Enhancing Generalization in Data-Efficient GANs via lipsCHitz\n continuity constrAIned Normalization","summary":" Generative Adversarial Networks (GANs) significantly advanced image\ngeneration but their performance heavily depends on abundant training data. In\nscenarios with limited data, GANs often struggle with discriminator overfitting\nand unstable training. Batch Normalization (BN), despite being known for\nenhancing generalization and training stability, has rarely been used in the\ndiscriminator of Data-Efficient GANs. Our work addresses this gap by\nidentifying a critical flaw in BN: the tendency for gradient explosion during\nthe centering and scaling steps. To tackle this issue, we present CHAIN\n(lipsCHitz continuity constrAIned Normalization), which replaces the\nconventional centering step with zero-mean regularization and integrates a\nLipschitz continuity constraint in the scaling step. CHAIN further enhances GAN\ntraining by adaptively interpolating the normalized and unnormalized features,\neffectively avoiding discriminator overfitting. Our theoretical analyses firmly\nestablishes CHAIN's effectiveness in reducing gradients in latent features and\nweights, improving stability and generalization in GAN training. Empirical\nevidence supports our theory. CHAIN achieves state-of-the-art results in\ndata-limited scenarios on CIFAR-10/100, ImageNet, five low-shot and seven\nhigh-resolution few-shot image datasets.\n","authors":["Yao Ni","Piotr Koniusz"],"pdf_url":"https://arxiv.org/pdf/2404.00521v2.pdf","comment":"Accepted by CVPR2024, 26 pages full version"},{"id":"http://arxiv.org/abs/2404.01700v1","updated":"2024-04-02T07:09:29Z","published":"2024-04-02T07:09:29Z","title":"MotionChain: Conversational Motion Controllers via Multimodal Prompts","summary":" Recent advancements in language models have demonstrated their adeptness in\nconducting multi-turn dialogues and retaining conversational context. However,\nthis proficiency remains largely unexplored in other multimodal generative\nmodels, particularly in human motion models. By integrating multi-turn\nconversations in controlling continuous virtual human movements, generative\nhuman motion models can achieve an intuitive and step-by-step process of human\ntask execution for humanoid robotics, game agents, or other embodied systems.\nIn this work, we present MotionChain, a conversational human motion controller\nto generate continuous and long-term human motion through multimodal prompts.\nSpecifically, MotionChain consists of multi-modal tokenizers that transform\nvarious data types such as text, image, and motion, into discrete tokens,\ncoupled with a Vision-Motion-aware Language model. By leveraging large-scale\nlanguage, vision-language, and vision-motion data to assist motion-related\ngeneration tasks, MotionChain thus comprehends each instruction in multi-turn\nconversation and generates human motions followed by these prompts. 
Extensive\nexperiments validate the efficacy of MotionChain, demonstrating\nstate-of-the-art performance in conversational motion generation, as well as\nmore intuitive manners of controlling and interacting with virtual humans.\n","authors":["Biao Jiang","Xin Chen","Chi Zhang","Fukun Yin","Zhuoyuan Li","Gang YU","Jiayuan Fan"],"pdf_url":"https://arxiv.org/pdf/2404.01700v1.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.01699v1","updated":"2024-04-02T07:08:15Z","published":"2024-04-02T07:08:15Z","title":"Task Integration Distillation for Object Detectors","summary":" Knowledge distillation is a widely adopted technique for model lightening.\nHowever, the performance of most knowledge distillation methods in the domain\nof object detection is not satisfactory. Typically, knowledge distillation\napproaches consider only the classification task among the two sub-tasks of an\nobject detector, largely overlooking the regression task. This oversight leads\nto a partial understanding of the object detector's comprehensive task,\nresulting in skewed estimations and potentially adverse effects. Therefore, we\npropose a knowledge distillation method that addresses both the classification\nand regression tasks, incorporating a task significance strategy. By evaluating\nthe importance of features based on the output of the detector's two sub-tasks,\nour approach ensures a balanced consideration of both classification and\nregression tasks in object detection. Drawing inspiration from real-world\nteaching processes and the definition of learning condition, we introduce a\nmethod that focuses on both key and weak areas. By assessing the value of\nfeatures for knowledge distillation based on their importance differences, we\naccurately capture the current model's learning situation. This method\neffectively prevents the issue of biased predictions about the model's learning\nreality caused by an incomplete utilization of the detector's outputs.\n","authors":["Hai Su","ZhenWen Jian","Songsen Yu"],"pdf_url":"https://arxiv.org/pdf/2404.01699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14828v2","updated":"2024-04-02T06:55:59Z","published":"2024-01-26T12:57:05Z","title":"TIP-Editor: An Accurate 3D Editor Following Both Text-Prompts And\n Image-Prompts","summary":" Text-driven 3D scene editing has gained significant attention owing to its\nconvenience and user-friendliness. However, existing methods still lack\naccurate control of the specified appearance and location of the editing result\ndue to the inherent limitations of the text description. To this end, we\npropose a 3D scene editing framework, TIPEditor, that accepts both text and\nimage prompts and a 3D bounding box to specify the editing region. With the\nimage prompt, users can conveniently specify the detailed appearance/style of\nthe target content in complement to the text description, enabling accurate\ncontrol of the appearance. Specifically, TIP-Editor employs a stepwise 2D\npersonalization strategy to better learn the representation of the existing\nscene and the reference image, in which a localization loss is proposed to\nencourage correct object placement as specified by the bounding box.\nAdditionally, TIPEditor utilizes explicit and flexible 3D Gaussian splatting as\nthe 3D representation to facilitate local editing while keeping the background\nunchanged. 
Extensive experiments have demonstrated that TIP-Editor conducts\naccurate editing following the text and image prompts in the specified bounding\nbox region, consistently outperforming the baselines in editing quality, and\nthe alignment to the prompts, qualitatively and quantitatively.\n","authors":["Jingyu Zhuang","Di Kang","Yan-Pei Cao","Guanbin Li","Liang Lin","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2401.14828v2.pdf","comment":"Accpeted by Siggraph 2024 & ACM Transactions on Graphics"},{"id":"http://arxiv.org/abs/2404.01692v1","updated":"2024-04-02T06:52:31Z","published":"2024-04-02T06:52:31Z","title":"Beyond Image Super-Resolution for Image Recognition with Task-Driven\n Perceptual Loss","summary":" In real-world scenarios, image recognition tasks, such as semantic\nsegmentation and object detection, often pose greater challenges due to the\nlack of information available within low-resolution (LR) content. Image\nsuper-resolution (SR) is one of the promising solutions for addressing the\nchallenges. However, due to the ill-posed property of SR, it is challenging for\ntypical SR methods to restore task-relevant high-frequency contents, which may\ndilute the advantage of utilizing the SR method. Therefore, in this paper, we\npropose Super-Resolution for Image Recognition (SR4IR) that effectively guides\nthe generation of SR images beneficial to achieving satisfactory image\nrecognition performance when processing LR images. The critical component of\nour SR4IR is the task-driven perceptual (TDP) loss that enables the SR network\nto acquire task-specific knowledge from a network tailored for a specific task.\nMoreover, we propose a cross-quality patch mix and an alternate training\nframework that significantly enhances the efficacy of the TDP loss by\naddressing potential problems when employing the TDP loss. Through extensive\nexperiments, we demonstrate that our SR4IR achieves outstanding task\nperformance by generating SR images useful for a specific image recognition\ntask, including semantic segmentation, object detection, and image\nclassification. The implementation code is available at\nhttps://github.com/JaehaKim97/SR4IR.\n","authors":["Jaeha Kim","Junghun Oh","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2404.01692v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01690v1","updated":"2024-04-02T06:49:38Z","published":"2024-04-02T06:49:38Z","title":"RefQSR: Reference-based Quantization for Image Super-Resolution Networks","summary":" Single image super-resolution (SISR) aims to reconstruct a high-resolution\nimage from its low-resolution observation. Recent deep learning-based SISR\nmodels show high performance at the expense of increased computational costs,\nlimiting their use in resource-constrained environments. As a promising\nsolution for computationally efficient network design, network quantization has\nbeen extensively studied. However, existing quantization methods developed for\nSISR have yet to effectively exploit image self-similarity, which is a new\ndirection for exploration in this study. We introduce a novel method called\nreference-based quantization for image super-resolution (RefQSR) that applies\nhigh-bit quantization to several representative patches and uses them as\nreferences for low-bit quantization of the rest of the patches in an image. To\nthis end, we design dedicated patch clustering and reference-based quantization\nmodules and integrate them into existing SISR network quantization methods. 
The\nexperimental results demonstrate the effectiveness of RefQSR on various SISR\nnetworks and quantization methods.\n","authors":["Hongjae Lee","Jun-Sang Yoo","Seung-Won Jung"],"pdf_url":"https://arxiv.org/pdf/2404.01690v1.pdf","comment":"Accepted by IEEE Transactions on Image Processing (TIP)"},{"id":"http://arxiv.org/abs/2404.01686v1","updated":"2024-04-02T06:43:22Z","published":"2024-04-02T06:43:22Z","title":"JRDB-PanoTrack: An Open-world Panoptic Segmentation and Tracking Robotic\n Dataset in Crowded Human Environments","summary":" Autonomous robot systems have attracted increasing research attention in\nrecent years, where environment understanding is a crucial step for robot\nnavigation, human-robot interaction, and decision. Real-world robot systems\nusually collect visual data from multiple sensors and are required to recognize\nnumerous objects and their movements in complex human-crowded settings.\nTraditional benchmarks, with their reliance on single sensors and limited\nobject classes and scenarios, fail to provide the comprehensive environmental\nunderstanding robots need for accurate navigation, interaction, and\ndecision-making. As an extension of JRDB dataset, we unveil JRDB-PanoTrack, a\nnovel open-world panoptic segmentation and tracking benchmark, towards more\ncomprehensive environmental perception. JRDB-PanoTrack includes (1) various\ndata involving indoor and outdoor crowded scenes, as well as comprehensive 2D\nand 3D synchronized data modalities; (2) high-quality 2D spatial panoptic\nsegmentation and temporal tracking annotations, with additional 3D label\nprojections for further spatial understanding; (3) diverse object classes for\nclosed- and open-world recognition benchmarks, with OSPA-based metrics for\nevaluation. Extensive evaluation of leading methods shows significant\nchallenges posed by our dataset.\n","authors":["Duy-Tho Le","Chenhui Gou","Stavya Datta","Hengcan Shi","Ian Reid","Jianfei Cai","Hamid Rezatofighi"],"pdf_url":"https://arxiv.org/pdf/2404.01686v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.07711v3","updated":"2024-04-02T06:38:18Z","published":"2024-03-12T14:53:56Z","title":"SSM Meets Video Diffusion Models: Efficient Video Generation with\n Structured State Spaces","summary":" Given the remarkable achievements in image generation through diffusion\nmodels, the research community has shown increasing interest in extending these\nmodels to video generation. Recent diffusion models for video generation have\npredominantly utilized attention layers to extract temporal features. However,\nattention layers are limited by their memory consumption, which increases\nquadratically with the length of the sequence. This limitation presents\nsignificant challenges when attempting to generate longer video sequences using\ndiffusion models. To overcome this challenge, we propose leveraging state-space\nmodels (SSMs). SSMs have recently gained attention as viable alternatives due\nto their linear memory consumption relative to sequence length. In the\nexperiments, we first evaluate our SSM-based model with UCF101, a standard\nbenchmark of video generation. In addition, to investigate the potential of\nSSMs for longer video generation, we perform an experiment using the MineRL\nNavigate dataset, varying the number of frames to 64, 200, and 400. In these\nsettings, our SSM-based model can considerably save memory consumption for\nlonger sequences, while maintaining competitive FVD scores to the\nattention-based models. 
Our codes are available at\nhttps://github.com/shim0114/SSM-Meets-Video-Diffusion-Models.\n","authors":["Yuta Oshima","Shohei Taniguchi","Masahiro Suzuki","Yutaka Matsuo"],"pdf_url":"https://arxiv.org/pdf/2403.07711v3.pdf","comment":"Accepted as a workshop paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2404.01674v1","updated":"2024-04-02T06:25:16Z","published":"2024-04-02T06:25:16Z","title":"PRISM-TopoMap: Online Topological Mapping with Place Recognition and\n Scan Matching","summary":" Mapping is one of the crucial tasks enabling autonomous navigation of a\nmobile robot. Conventional mapping methods output dense geometric map\nrepresentation, e.g. an occupancy grid, which is not trivial to keep consistent\nfor the prolonged runs covering large environments. Meanwhile, capturing the\ntopological structure of the workspace enables fast path planning, is less\nprone to odometry error accumulation and does not consume much memory.\nFollowing this idea, this paper introduces PRISM-TopoMap -- a topological\nmapping method that maintains a graph of locally aligned locations not relying\non global metric coordinates. The proposed method involves learnable multimodal\nplace recognition paired with the scan matching pipeline for localization and\nloop closure in the graph of locations. The latter is updated online and the\nrobot is localized in a proper node at each time step. We conduct a broad\nexperimental evaluation of the suggested approach in a range of photo-realistic\nenvironments and on a real robot (wheeled differential driven Husky robot), and\ncompare it to state of the art. The results of the empirical evaluation confirm\nthat PRISM-Topomap consistently outperforms competitors across several measures\nof mapping and navigation efficiency and performs well on a real robot. The\ncode of PRISM-Topomap is open-sourced and available at\nhttps://github.com/kirillMouraviev/prism-topomap.\n","authors":["Kirill Muravyev","Alexander Melekhin","Dmitriy Yudin","Konstantin Yakovlev"],"pdf_url":"https://arxiv.org/pdf/2404.01674v1.pdf","comment":"This is a pre-print of the paper submitted to an IROS 2024 conference"},{"id":"http://arxiv.org/abs/2404.01673v1","updated":"2024-04-02T06:24:21Z","published":"2024-04-02T06:24:21Z","title":"A Universal Knowledge Embedded Contrastive Learning Framework for\n Hyperspectral Image Classification","summary":" Hyperspectral image (HSI) classification techniques have been intensively\nstudied and a variety of models have been developed. However, these HSI\nclassification models are confined to pocket models and unrealistic ways of\ndatasets partitioning. The former limits the generalization performance of the\nmodel and the latter is partitioned leads to inflated model evaluation metrics,\nwhich results in plummeting model performance in the real world. Therefore, we\npropose a universal knowledge embedded contrastive learning framework (KnowCL)\nfor supervised, unsupervised, and semisupervised HSI classification, which\nlargely closes the gap of HSI classification models between pocket models and\nstandard vision backbones. We present a new HSI processing pipeline in\nconjunction with a range of data transformation and augmentation techniques\nthat provide diverse data representations and realistic data partitioning. The\nproposed framework based on this pipeline is compatible with all kinds of\nbackbones and can fully exploit labeled and unlabeled samples with expected\ntraining time. 
Furthermore, we design a new loss function, which can adaptively\nfuse the supervised loss and unsupervised loss, enhancing the learning\nperformance. This proposed new classification paradigm shows great potentials\nin exploring for HSI classification technology. The code can be accessed at\nhttps://github.com/quanweiliu/KnowCL.\n","authors":["Quanwei Liu","Yanni Dong","Tao Huang","Lefei Zhang","Bo Do"],"pdf_url":"https://arxiv.org/pdf/2404.01673v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16416v4","updated":"2024-04-02T06:17:34Z","published":"2024-01-29T18:55:29Z","title":"Endo-4DGS: Endoscopic Monocular Scene Reconstruction with 4D Gaussian\n Splatting","summary":" In the realm of robot-assisted minimally invasive surgery, dynamic scene\nreconstruction can significantly enhance downstream tasks and improve surgical\noutcomes. Neural Radiance Fields (NeRF)-based methods have recently risen to\nprominence for their exceptional ability to reconstruct scenes but are hampered\nby slow inference speed, prolonged training, and inconsistent depth estimation.\nSome previous work utilizes ground truth depth for optimization but is hard to\nacquire in the surgical domain. To overcome these obstacles, we present\nEndo-4DGS, a real-time endoscopic dynamic reconstruction approach that utilizes\n3D Gaussian Splatting (GS) for 3D representation. Specifically, we propose\nlightweight MLPs to capture temporal dynamics with Gaussian deformation fields.\nTo obtain a satisfactory Gaussian Initialization, we exploit a powerful depth\nestimation foundation model, Depth-Anything, to generate pseudo-depth maps as a\ngeometry prior. We additionally propose confidence-guided learning to tackle\nthe ill-pose problems in monocular depth estimation and enhance the\ndepth-guided reconstruction with surface normal constraints and depth\nregularization. Our approach has been validated on two surgical datasets, where\nit can effectively render in real-time, compute efficiently, and reconstruct\nwith remarkable accuracy.\n","authors":["Yiming Huang","Beilei Cui","Long Bai","Ziqi Guo","Mengya Xu","Mobarakol Islam","Hongliang Ren"],"pdf_url":"https://arxiv.org/pdf/2401.16416v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00658v2","updated":"2024-04-02T06:15:15Z","published":"2024-03-31T12:04:27Z","title":"KTPFormer: Kinematics and Trajectory Prior Knowledge-Enhanced\n Transformer for 3D Human Pose Estimation","summary":" This paper presents a novel Kinematics and Trajectory Prior\nKnowledge-Enhanced Transformer (KTPFormer), which overcomes the weakness in\nexisting transformer-based methods for 3D human pose estimation that the\nderivation of Q, K, V vectors in their self-attention mechanisms are all based\non simple linear mapping. We propose two prior attention modules, namely\nKinematics Prior Attention (KPA) and Trajectory Prior Attention (TPA) to take\nadvantage of the known anatomical structure of the human body and motion\ntrajectory information, to facilitate effective learning of global dependencies\nand features in the multi-head self-attention. KPA models kinematic\nrelationships in the human body by constructing a topology of kinematics, while\nTPA builds a trajectory topology to learn the information of joint motion\ntrajectory across frames. Yielding Q, K, V vectors with prior knowledge, the\ntwo modules enable KTPFormer to model both spatial and temporal correlations\nsimultaneously. 
Extensive experiments on three benchmarks (Human3.6M,\nMPI-INF-3DHP and HumanEva) show that KTPFormer achieves superior performance in\ncomparison to state-of-the-art methods. More importantly, our KPA and TPA\nmodules have lightweight plug-and-play designs and can be integrated into\nvarious transformer-based networks (i.e., diffusion-based) to improve the\nperformance with only a very small increase in the computational overhead. The\ncode is available at: https://github.com/JihuaPeng/KTPFormer.\n","authors":["Jihua Peng","Yanghong Zhou","P. Y. Mok"],"pdf_url":"https://arxiv.org/pdf/2404.00658v2.pdf","comment":"Accepted by CVPR 2024,GitHub\n code:https://github.com/JihuaPeng/KTPFormer"},{"id":"http://arxiv.org/abs/2403.19976v2","updated":"2024-04-02T06:03:32Z","published":"2024-03-29T04:58:56Z","title":"eTraM: Event-based Traffic Monitoring Dataset","summary":" Event cameras, with their high temporal and dynamic range and minimal memory\nusage, have found applications in various fields. However, their potential in\nstatic traffic monitoring remains largely unexplored. To facilitate this\nexploration, we present eTraM - a first-of-its-kind, fully event-based traffic\nmonitoring dataset. eTraM offers 10 hr of data from different traffic scenarios\nin various lighting and weather conditions, providing a comprehensive overview\nof real-world situations. Providing 2M bounding box annotations, it covers\neight distinct classes of traffic participants, ranging from vehicles to\npedestrians and micro-mobility. eTraM's utility has been assessed using\nstate-of-the-art methods for traffic participant detection, including RVT, RED,\nand YOLOv8. We quantitatively evaluate the ability of event-based models to\ngeneralize on nighttime and unseen scenes. Our findings substantiate the\ncompelling potential of leveraging event cameras for traffic monitoring,\nopening new avenues for research and application. eTraM is available at\nhttps://eventbasedvision.github.io/eTraM\n","authors":["Aayush Atul Verma","Bharatesh Chakravarthi","Arpitsinh Vaghela","Hua Wei","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19976v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12468v2","updated":"2024-04-02T06:00:11Z","published":"2023-12-19T07:05:39Z","title":"MaskINT: Video Editing via Interpolative Non-autoregressive Masked\n Transformers","summary":" Recent advances in generative AI have significantly enhanced image and video\nediting, particularly in the context of text prompt control. State-of-the-art\napproaches predominantly rely on diffusion models to accomplish these tasks.\nHowever, the computational demands of diffusion-based methods are substantial,\noften necessitating large-scale paired datasets for training, and therefore\nchallenging the deployment in real applications. To address these issues, this\npaper breaks down the text-based video editing task into two stages. First, we\nleverage an pre-trained text-to-image diffusion model to simultaneously edit\nfew keyframes in an zero-shot way. Second, we introduce an efficient model\ncalled MaskINT, which is built on non-autoregressive masked generative\ntransformers and specializes in frame interpolation between the edited\nkeyframes, using the structural guidance from intermediate frames. 
Experimental\nresults suggest that our MaskINT achieves comparable performance with\ndiffusion-based methodologies, while significantly improve the inference time.\nThis research offers a practical solution for text-based video editing and\nshowcases the potential of non-autoregressive masked generative transformers in\nthis domain.\n","authors":["Haoyu Ma","Shahin Mahdizadehaghdam","Bichen Wu","Zhipeng Fan","Yuchao Gu","Wenliang Zhao","Lior Shapira","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2312.12468v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01657v1","updated":"2024-04-02T05:59:43Z","published":"2024-04-02T05:59:43Z","title":"Release of Pre-Trained Models for the Japanese Language","summary":" AI democratization aims to create a world in which the average person can\nutilize AI techniques. To achieve this goal, numerous research institutes have\nattempted to make their results accessible to the public. In particular, large\npre-trained models trained on large-scale data have shown unprecedented\npotential, and their release has had a significant impact. However, most of the\nreleased models specialize in the English language, and thus, AI\ndemocratization in non-English-speaking communities is lagging significantly.\nTo reduce this gap in AI access, we released Generative Pre-trained Transformer\n(GPT), Contrastive Language and Image Pre-training (CLIP), Stable Diffusion,\nand Hidden-unit Bidirectional Encoder Representations from Transformers\n(HuBERT) pre-trained in Japanese. By providing these models, users can freely\ninterface with AI that aligns with Japanese cultural values and ensures the\nidentity of Japanese culture, thus enhancing the democratization of AI.\nAdditionally, experiments showed that pre-trained models specialized for\nJapanese can efficiently achieve high performance in Japanese tasks.\n","authors":["Kei Sawada","Tianyu Zhao","Makoto Shing","Kentaro Mitsui","Akio Kaga","Yukiya Hono","Toshiaki Wakatsuki","Koh Mitsuda"],"pdf_url":"https://arxiv.org/pdf/2404.01657v1.pdf","comment":"9 pages, 1 figure, 5 tables, accepted for LREC-COLING 2024. Models\n are publicly available at https://huggingface.co/rinna"},{"id":"http://arxiv.org/abs/2404.01656v1","updated":"2024-04-02T05:57:35Z","published":"2024-04-02T05:57:35Z","title":"Supporting Mitosis Detection AI Training with Inter-Observer Eye-Gaze\n Consistencies","summary":" The expansion of artificial intelligence (AI) in pathology tasks has\nintensified the demand for doctors' annotations in AI development. However,\ncollecting high-quality annotations from doctors is costly and time-consuming,\ncreating a bottleneck in AI progress. This study investigates eye-tracking as a\ncost-effective technology to collect doctors' behavioral data for AI training\nwith a focus on the pathology task of mitosis detection. One major challenge in\nusing eye-gaze data is the low signal-to-noise ratio, which hinders the\nextraction of meaningful information. We tackled this by levering the\nproperties of inter-observer eye-gaze consistencies and creating eye-gaze\nlabels from consistent eye-fixations shared by a group of observers. Our study\ninvolved 14 non-medical participants, from whom we collected eye-gaze data and\ngenerated eye-gaze labels based on varying group sizes. We assessed the\nefficacy of such eye-gaze labels by training Convolutional Neural Networks\n(CNNs) and comparing their performance to those trained with ground truth\nannotations and a heuristic-based baseline. 
Results indicated that CNNs trained\nwith our eye-gaze labels closely followed the performance of ground-truth-based\nCNNs, and significantly outperformed the baseline. Although primarily focused\non mitosis, we envision that insights from this study can be generalized to\nother medical imaging tasks.\n","authors":["Hongyan Gu","Zihan Yan","Ayesha Alvi","Brandon Day","Chunxu Yang","Zida Wu","Shino Magaki","Mohammad Haeri","Xiang 'Anthony' Chen"],"pdf_url":"https://arxiv.org/pdf/2404.01656v1.pdf","comment":"Accepted by IEEE International Conference on Healthcare Informatics\n 2024"},{"id":"http://arxiv.org/abs/2404.01655v1","updated":"2024-04-02T05:56:17Z","published":"2024-04-02T05:56:17Z","title":"FashionEngine: Interactive Generation and Editing of 3D Clothed Humans","summary":" We present FashionEngine, an interactive 3D human generation and editing\nsystem that allows us to design 3D digital humans in a way that aligns with how\nhumans interact with the world, such as natural languages, visual perceptions,\nand hand-drawing. FashionEngine automates the 3D human production with three\nkey components: 1) A pre-trained 3D human diffusion model that learns to model\n3D humans in a semantic UV latent space from 2D image training data, which\nprovides strong priors for diverse generation and editing tasks. 2)\nMultimodality-UV Space encoding the texture appearance, shape topology, and\ntextual semantics of human clothing in a canonical UV-aligned space, which\nfaithfully aligns the user multimodal inputs with the implicit UV latent space\nfor controllable 3D human editing. The multimodality-UV space is shared across\ndifferent user inputs, such as texts, images, and sketches, which enables\nvarious joint multimodal editing tasks. 3) Multimodality-UV Aligned Sampler\nlearns to sample high-quality and diverse 3D humans from the diffusion prior\nfor multimodal user inputs. Extensive experiments validate FashionEngine's\nstate-of-the-art performance for conditional generation/editing tasks. In\naddition, we present an interactive user interface for our FashionEngine that\nenables both conditional and unconditional generation tasks, and editing tasks\nincluding pose/view/shape control, text-, image-, and sketch-driven 3D human\nediting and 3D virtual try-on, in a unified framework. Our project page is at:\nhttps://taohuumd.github.io/projects/FashionEngine.\n","authors":["Tao Hu","Fangzhou Hong","Zhaoxi Chen","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2404.01655v1.pdf","comment":"Project Page: https://taohuumd.github.io/projects/FashionEngine"},{"id":"http://arxiv.org/abs/2404.01654v1","updated":"2024-04-02T05:53:34Z","published":"2024-04-02T05:53:34Z","title":"AI WALKUP: A Computer-Vision Approach to Quantifying MDS-UPDRS in\n Parkinson's Disease","summary":" Parkinson's Disease (PD) is the second most common neurodegenerative\ndisorder. The existing assessment method for PD is usually the Movement\nDisorder Society - Unified Parkinson's Disease Rating Scale (MDS-UPDRS) to\nassess the severity of various types of motor symptoms and disease progression.\nHowever, manual assessment suffers from high subjectivity, lack of consistency,\nand high cost and low efficiency of manual communication. We want to use a\ncomputer vision based solution to capture human pose images based on a camera,\nreconstruct and perform motion analysis using algorithms, and extract the\nfeatures of the amount of motion through feature engineering. 
The proposed\napproach can be deployed on different smartphones, and the video recording and\nartificial intelligence analysis can be done quickly and easily through our\nAPP.\n","authors":["Xiang Xiang","Zihan Zhang","Jing Ma","Yao Deng"],"pdf_url":"https://arxiv.org/pdf/2404.01654v1.pdf","comment":"Technical report for AI WALKUP, an APP winning 3rd Prize of 2022 HUST\n GS AI Innovation and Design Competition"},{"id":"http://arxiv.org/abs/2310.16781v3","updated":"2024-04-02T05:50:21Z","published":"2023-10-25T17:15:55Z","title":"Kiki or Bouba? Sound Symbolism in Vision-and-Language Models","summary":" Although the mapping between sound and meaning in human language is assumed\nto be largely arbitrary, research in cognitive science has shown that there are\nnon-trivial correlations between particular sounds and meanings across\nlanguages and demographic groups, a phenomenon known as sound symbolism. Among\nthe many dimensions of meaning, sound symbolism is particularly salient and\nwell-demonstrated with regards to cross-modal associations between language and\nthe visual domain. In this work, we address the question of whether sound\nsymbolism is reflected in vision-and-language models such as CLIP and Stable\nDiffusion. Using zero-shot knowledge probing to investigate the inherent\nknowledge of these models, we find strong evidence that they do show this\npattern, paralleling the well-known kiki-bouba effect in psycholinguistics. Our\nwork provides a novel method for demonstrating sound symbolism and\nunderstanding its nature using computational tools. Our code will be made\npublicly available.\n","authors":["Morris Alper","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2310.16781v3.pdf","comment":"Accepted to NeurIPS 2023 (spotlight). Project webpage:\n https://kiki-bouba.github.io/"},{"id":"http://arxiv.org/abs/2312.12470v3","updated":"2024-04-02T05:37:25Z","published":"2023-12-19T08:14:14Z","title":"Rotated Multi-Scale Interaction Network for Referring Remote Sensing\n Image Segmentation","summary":" Referring Remote Sensing Image Segmentation (RRSIS) is a new challenge that\ncombines computer vision and natural language processing, delineating specific\nregions in aerial images as described by textual queries. Traditional Referring\nImage Segmentation (RIS) approaches have been impeded by the complex spatial\nscales and orientations found in aerial imagery, leading to suboptimal\nsegmentation results. To address these challenges, we introduce the Rotated\nMulti-Scale Interaction Network (RMSIN), an innovative approach designed for\nthe unique demands of RRSIS. RMSIN incorporates an Intra-scale Interaction\nModule (IIM) to effectively address the fine-grained detail required at\nmultiple scales and a Cross-scale Interaction Module (CIM) for integrating\nthese details coherently across the network. Furthermore, RMSIN employs an\nAdaptive Rotated Convolution (ARC) to account for the diverse orientations of\nobjects, a novel contribution that significantly enhances segmentation\naccuracy. To assess the efficacy of RMSIN, we have curated an expansive dataset\ncomprising 17,402 image-caption-mask triplets, which is unparalleled in terms\nof scale and variety. 
This dataset not only presents the model with a wide\nrange of spatial and rotational scenarios but also establishes a stringent\nbenchmark for the RRSIS task, ensuring a rigorous evaluation of performance.\nOur experimental evaluations demonstrate the exceptional performance of RMSIN,\nsurpassing existing state-of-the-art models by a significant margin. All\ndatasets and code are made available at https://github.com/Lsan2401/RMSIN.\n","authors":["Sihan Liu","Yiwei Ma","Xiaoqing Zhang","Haowei Wang","Jiayi Ji","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2312.12470v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01647v1","updated":"2024-04-02T05:32:39Z","published":"2024-04-02T05:32:39Z","title":"EDTalk: Efficient Disentanglement for Emotional Talking Head Synthesis","summary":" Achieving disentangled control over multiple facial motions and accommodating\ndiverse input modalities greatly enhances the application and entertainment of\nthe talking head generation. This necessitates a deep exploration of the\ndecoupling space for facial features, ensuring that they a) operate\nindependently without mutual interference and b) can be preserved to share with\ndifferent modal input, both aspects often neglected in existing methods. To\naddress this gap, this paper proposes a novel Efficient Disentanglement\nframework for Talking head generation (EDTalk). Our framework enables\nindividual manipulation of mouth shape, head pose, and emotional expression,\nconditioned on video or audio inputs. Specifically, we employ three lightweight\nmodules to decompose the facial dynamics into three distinct latent spaces\nrepresenting mouth, pose, and expression, respectively. Each space is\ncharacterized by a set of learnable bases whose linear combinations define\nspecific motions. To ensure independence and accelerate training, we enforce\northogonality among bases and devise an efficient training strategy to allocate\nmotion responsibilities to each space without relying on external knowledge.\nThe learned bases are then stored in corresponding banks, enabling shared\nvisual priors with audio input. Furthermore, considering the properties of each\nspace, we propose an Audio-to-Motion module for audio-driven talking head\nsynthesis. Experiments are conducted to demonstrate the effectiveness of\nEDTalk. We recommend watching the project website:\nhttps://tanshuai0219.github.io/EDTalk/\n","authors":["Shuai Tan","Bin Ji","Mengxiao Bi","Ye Pan"],"pdf_url":"https://arxiv.org/pdf/2404.01647v1.pdf","comment":"22 pages, 15 figures"},{"id":"http://arxiv.org/abs/2404.01645v1","updated":"2024-04-02T05:30:39Z","published":"2024-04-02T05:30:39Z","title":"ContrastCAD: Contrastive Learning-based Representation Learning for\n Computer-Aided Design Models","summary":" The success of Transformer-based models has encouraged many researchers to\nlearn CAD models using sequence-based approaches. However, learning CAD models\nis still a challenge, because they can be represented as complex shapes with\nlong construction sequences. Furthermore, the same CAD model can be expressed\nusing different CAD construction sequences. We propose a novel contrastive\nlearning-based approach, named ContrastCAD, that effectively captures semantic\ninformation within the construction sequences of the CAD model. ContrastCAD\ngenerates augmented views using dropout techniques without altering the shape\nof the CAD model. 
We also propose a new CAD data augmentation method, called a\nRandom Replace and Extrude (RRE) method, to enhance the learning performance of\nthe model when training on an imbalanced CAD dataset. Experimental\nresults show that the proposed RRE augmentation method significantly enhances\nthe learning performance of Transformer-based autoencoders, even for complex\nCAD models having very long construction sequences. The proposed ContrastCAD\nmodel is shown to be robust to permutation changes of construction sequences\nand performs better representation learning by generating representation spaces\nwhere similar CAD models are more closely clustered. Our codes are available at\nhttps://github.com/cm8908/ContrastCAD.\n","authors":["Minseop Jung","Minseong Kim","Jibum Kim"],"pdf_url":"https://arxiv.org/pdf/2404.01645v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01390v2","updated":"2024-04-02T05:20:01Z","published":"2023-09-04T06:41:29Z","title":"Bridging the Projection Gap: Overcoming Projection Bias Through\n Parameterized Distance Learning","summary":" Generalized zero-shot learning (GZSL) aims to recognize samples from both\nseen and unseen classes using only seen class samples for training. However,\nGZSL methods are prone to bias towards seen classes during inference due to the\nprojection function being learned from seen classes. Most methods focus on\nlearning an accurate projection, but bias in the projection is inevitable. We\naddress this projection bias by proposing to learn a parameterized Mahalanobis\ndistance metric for robust inference. Our key insight is that the distance\ncomputation during inference is critical, even with a biased projection. We\nmake two main contributions: (1) We extend the VAEGAN (Variational Autoencoder\n& Generative Adversarial Networks) architecture with two branches to\nseparately output the projection of samples from seen and unseen classes,\nenabling more robust distance learning. (2) We introduce a novel loss function\nto optimize the Mahalanobis distance representation and reduce projection bias.\nExtensive experiments on four datasets show that our approach outperforms\nstate-of-the-art GZSL techniques with improvements of up to 3.5% on the\nharmonic mean metric.\n","authors":["Chong Zhang","Mingyu Jin","Qinkai Yu","Haochen Xue","Shreyank N Gowda","Xiaobo Jin"],"pdf_url":"https://arxiv.org/pdf/2309.01390v2.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.01643v1","updated":"2024-04-02T05:19:27Z","published":"2024-04-02T05:19:27Z","title":"A Closer Look at Spatial-Slice Features Learning for COVID-19 Detection","summary":" Conventional Computed Tomography (CT) imaging recognition faces two\nsignificant challenges: (1) There is often considerable variability in the\nresolution and size of each CT scan, necessitating strict requirements for the\ninput size and adaptability of models. (2) A CT scan contains a large number of\nout-of-distribution (OOD) slices. The crucial features may only be present in\nspecific spatial regions and slices of the entire CT scan. How can we\neffectively figure out where these are located? To deal with this, we introduce\nan enhanced Spatial-Slice Feature Learning (SSFL++) framework specifically\ndesigned for CT scans. It aims to filter out OOD data within the whole CT scan,\nenabling us to select the crucial spatial slices for analysis and reducing the\noverall redundancy by 70%. 
Meanwhile, we propose a Kernel-Density-based slice Sampling\n(KDS) method to improve stability during the training and inference stages,\nthereby speeding up convergence and boosting performance. As a\nresult, the experiments demonstrate the promising performance of our approach\nusing a simple EfficientNet-2D (E2D) model, even with only 1% of the training\ndata. The efficacy of our approach has been validated on the COVID-19-CT-DB\ndatasets provided by the DEF-AI-MIA workshop, in conjunction with CVPR 2024.\nOur source code will be made available.\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Yang Fan Chiang","Yi-Shiuan Chou","Chih-Yu Jiang","Shen-Chieh Tai","Chi-Han Tsai"],"pdf_url":"https://arxiv.org/pdf/2404.01643v1.pdf","comment":"Submitted to DEF-AI-MIA workshop. arXiv admin note: text overlap with\n arXiv:2403.11230"},{"id":"http://arxiv.org/abs/2404.00995v2","updated":"2024-04-02T05:16:55Z","published":"2024-04-01T08:46:35Z","title":"PosterLlama: Bridging Design Ability of Language Model to Contents-Aware\n Layout Generation","summary":" Visual layout plays a critical role in graphic design fields such as\nadvertising, posters, and web UI design. The recent trend towards content-aware\nlayout generation through generative models has shown promise, yet it often\noverlooks the semantic intricacies of layout design by treating it as a simple\nnumerical optimization. To bridge this gap, we introduce PosterLlama, a network\ndesigned for generating visually and textually coherent layouts by reformatting\nlayout elements into HTML code and leveraging the rich design knowledge\nembedded within language models. Furthermore, we enhance the robustness of our\nmodel with a unique depth-based poster augmentation strategy. This ensures that\nour generated layouts remain not only semantically rich but also visually\nappealing, even with limited data. Our extensive evaluations across several benchmarks\ndemonstrate that PosterLlama outperforms existing methods in producing\nauthentic and content-aware layouts. It supports an unparalleled range of\nconditions, including unconditional layout generation,\nelement-conditional layout generation, and layout completion, among others, serving\nas a highly versatile user manipulation tool.\n","authors":["Jaejung Seol","Seojun Kim","Jaejun Yoo"],"pdf_url":"https://arxiv.org/pdf/2404.00995v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02957v3","updated":"2024-04-02T05:12:10Z","published":"2023-12-05T18:41:03Z","title":"Classification for everyone: Building geography agnostic models for\n fairer recognition","summary":" In this paper, we analyze different methods to mitigate inherent geographical\nbiases present in state-of-the-art image classification models. We first\nquantitatively present this bias in two datasets - the Dollar Street Dataset\nand ImageNet - using images with location information. We then present different\nmethods which can be employed to reduce this bias. 
Finally, we analyze the\neffectiveness of the different techniques on making these models more robust to\ngeographical locations of the images.\n","authors":["Akshat Jindal","Shreya Singh","Soham Gadgil"],"pdf_url":"https://arxiv.org/pdf/2312.02957v3.pdf","comment":"typos corrected, references added"},{"id":"http://arxiv.org/abs/2309.16585v4","updated":"2024-04-02T05:10:02Z","published":"2023-09-28T16:44:31Z","title":"Text-to-3D using Gaussian Splatting","summary":" Automatic text-to-3D generation that combines Score Distillation Sampling\n(SDS) with the optimization of volume rendering has achieved remarkable\nprogress in synthesizing realistic 3D objects. Yet most existing text-to-3D\nmethods by SDS and volume rendering suffer from inaccurate geometry, e.g., the\nJanus issue, since it is hard to explicitly integrate 3D priors into implicit\n3D representations. Besides, it is usually time-consuming for them to generate\nelaborate 3D models with rich colors. In response, this paper proposes GSGEN, a\nnovel method that adopts Gaussian Splatting, a recent state-of-the-art\nrepresentation, to text-to-3D generation. GSGEN aims at generating high-quality\n3D objects and addressing existing shortcomings by exploiting the explicit\nnature of Gaussian Splatting that enables the incorporation of 3D prior.\nSpecifically, our method adopts a progressive optimization strategy, which\nincludes a geometry optimization stage and an appearance refinement stage. In\ngeometry optimization, a coarse representation is established under 3D point\ncloud diffusion prior along with the ordinary 2D SDS optimization, ensuring a\nsensible and 3D-consistent rough shape. Subsequently, the obtained Gaussians\nundergo an iterative appearance refinement to enrich texture details. In this\nstage, we increase the number of Gaussians by compactness-based densification\nto enhance continuity and improve fidelity. With these designs, our approach\ncan generate 3D assets with delicate details and accurate geometry. Extensive\nevaluations demonstrate the effectiveness of our method, especially for\ncapturing high-frequency components. Our code is available at\nhttps://github.com/gsgen3d/gsgen\n","authors":["Zilong Chen","Feng Wang","Yikai Wang","Huaping Liu"],"pdf_url":"https://arxiv.org/pdf/2309.16585v4.pdf","comment":"To appear at CVPR 2024. Project page: https://gsgen3d.github.io.\n Code: https://github.com/gsgen3d/gsgen"},{"id":"http://arxiv.org/abs/2404.00875v2","updated":"2024-04-02T05:09:25Z","published":"2024-04-01T03:10:36Z","title":"DPA-Net: Structured 3D Abstraction from Sparse Views via Differentiable\n Primitive Assembly","summary":" We present a differentiable rendering framework to learn structured 3D\nabstractions in the form of primitive assemblies from sparse RGB images\ncapturing a 3D object. By leveraging differentiable volume rendering, our\nmethod does not require 3D supervision. Architecturally, our network follows\nthe general pipeline of an image-conditioned neural radiance field (NeRF)\nexemplified by pixelNeRF for color prediction. As our core contribution, we\nintroduce differential primitive assembly (DPA) into NeRF to output a 3D\noccupancy field in place of density prediction, where the predicted occupancies\nserve as opacity values for volume rendering. 
Our network, coined DPA-Net,\nproduces a union of convexes, each as an intersection of convex quadric\nprimitives, to approximate the target 3D object, subject to an abstraction loss\nand a masking loss, both defined in the image space upon volume rendering. With\ntest-time adaptation and additional sampling and loss designs aimed at\nimproving the accuracy and compactness of the obtained assemblies, our method\ndemonstrates superior performance over state-of-the-art alternatives for 3D\nprimitive abstraction from sparse views.\n","authors":["Fenggen Yu","Yiming Qian","Xu Zhang","Francisca Gil-Ureta","Brian Jackson","Eric Bennett","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.00875v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2404.01225v2","updated":"2024-04-02T04:59:48Z","published":"2024-04-01T16:34:27Z","title":"SurMo: Surface-based 4D Motion Modeling for Dynamic Human Rendering","summary":" Dynamic human rendering from video sequences has achieved remarkable progress\nby formulating the rendering as a mapping from static poses to human images.\nHowever, existing methods focus on the human appearance reconstruction of every\nsingle frame while the temporal motion relations are not fully explored. In\nthis paper, we propose a new 4D motion modeling paradigm, SurMo, that jointly\nmodels the temporal dynamics and human appearances in a unified framework with\nthree key designs: 1) Surface-based motion encoding that models 4D human\nmotions with an efficient compact surface-based triplane. It encodes both\nspatial and temporal motion relations on the dense surface manifold of a\nstatistical body template, which inherits body topology priors for\ngeneralizable novel view synthesis with sparse training observations. 2)\nPhysical motion decoding that is designed to encourage physical motion learning\nby decoding the motion triplane features at timestep t to predict both spatial\nderivatives and temporal derivatives at the next timestep t+1 in the training\nstage. 3) 4D appearance decoding that renders the motion triplanes into images\nby an efficient volumetric surface-conditioned renderer that focuses on the\nrendering of body surfaces with motion learning conditioning. Extensive\nexperiments validate the state-of-the-art performance of our new paradigm and\nillustrate the expressiveness of surface-based motion triplanes for rendering\nhigh-fidelity view-consistent humans with fast motions and even\nmotion-dependent shadows. Our project page is at:\nhttps://taohuumd.github.io/projects/SurMo/\n","authors":["Tao Hu","Fangzhou Hong","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2404.01225v2.pdf","comment":"Accepted to CVPR 2024. Project Page:\n https://taohuumd.github.io/projects/SurMo/"},{"id":"http://arxiv.org/abs/2404.01241v2","updated":"2024-04-02T04:56:45Z","published":"2024-04-01T17:00:18Z","title":"StructLDM: Structured Latent Diffusion for 3D Human Generation","summary":" Recent 3D human generative models have achieved remarkable progress by\nlearning 3D-aware GANs from 2D images. However, existing 3D human generative\nmethods model humans in a compact 1D latent space, ignoring the articulated\nstructure and semantics of human body topology. In this paper, we explore more\nexpressive and higher-dimensional latent space for 3D human modeling and\npropose StructLDM, a diffusion-based unconditional 3D human generative model,\nwhich is learned from 2D images. 
StructLDM solves the challenges imposed due to\nthe high-dimensional growth of latent space with three key designs: 1) A\nsemantic structured latent space defined on the dense surface manifold of a\nstatistical human body template. 2) A structured 3D-aware auto-decoder that\nfactorizes the global latent space into several semantic body parts\nparameterized by a set of conditional structured local NeRFs anchored to the\nbody template, which embeds the properties learned from the 2D training data\nand can be decoded to render view-consistent humans under different poses and\nclothing styles. 3) A structured latent diffusion model for generative human\nappearance sampling. Extensive experiments validate StructLDM's\nstate-of-the-art generation performance and illustrate the expressiveness of\nthe structured latent space over the well-adopted 1D latent space. Notably,\nStructLDM enables different levels of controllable 3D human generation and\nediting, including pose/view/shape control, and high-level tasks including\ncompositional generations, part-aware clothing editing, 3D virtual try-on, etc.\nOur project page is at: https://taohuumd.github.io/projects/StructLDM/.\n","authors":["Tao Hu","Fangzhou Hong","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2404.01241v2.pdf","comment":"Project page: https://taohuumd.github.io/projects/StructLDM/"},{"id":"http://arxiv.org/abs/2404.00936v2","updated":"2024-04-02T04:55:27Z","published":"2024-04-01T05:46:15Z","title":"A Comprehensive Review of Knowledge Distillation in Computer Vision","summary":" Deep learning techniques have been demonstrated to surpass preceding\ncutting-edge machine learning techniques in recent years, with computer vision\nbeing one of the most prominent examples. However, deep learning models suffer\nfrom significant drawbacks when deployed in resource-constrained environments\ndue to their large model size and high complexity. Knowledge Distillation is\none of the prominent solutions to overcome this challenge. This review paper\nexamines the current state of research on knowledge distillation, a technique\nfor compressing complex models into smaller and simpler ones. The paper\nprovides an overview of the major principles and techniques associated with\nknowledge distillation and reviews the applications of knowledge distillation\nin the domain of computer vision. The review focuses on the benefits of\nknowledge distillation, as well as the problems that must be overcome to\nimprove its effectiveness.\n","authors":["Sheikh Musa Kaleem","Tufail Rouf","Gousia Habib","Tausifa jan Saleem","Brejesh Lall"],"pdf_url":"https://arxiv.org/pdf/2404.00936v2.pdf","comment":"37 pages ,10 figures"},{"id":"http://arxiv.org/abs/2404.01636v1","updated":"2024-04-02T04:53:39Z","published":"2024-04-02T04:53:39Z","title":"Learning to Control Camera Exposure via Reinforcement Learning","summary":" Adjusting camera exposure in arbitrary lighting conditions is the first step\nto ensure the functionality of computer vision applications. Poorly adjusted\ncamera exposure often leads to critical failure and performance degradation.\nTraditional camera exposure control methods require multiple convergence steps\nand time-consuming processes, making them unsuitable for dynamic lighting\nconditions. In this paper, we propose a new camera exposure control framework\nthat rapidly controls camera exposure while performing real-time processing by\nexploiting deep reinforcement learning. 
The proposed framework consists of four\ncontributions: 1) a simplified training ground to simulate real-world's diverse\nand dynamic lighting changes, 2) flickering and image attribute-aware reward\ndesign, along with lightweight state design for real-time processing, 3) a\nstatic-to-dynamic lighting curriculum to gradually improve the agent's\nexposure-adjusting capability, and 4) domain randomization techniques to\nalleviate the limitation of the training ground and achieve seamless\ngeneralization in the wild.As a result, our proposed method rapidly reaches a\ndesired exposure level within five steps with real-time processing (1 ms).\nAlso, the acquired images are well-exposed and show superiority in various\ncomputer vision tasks, such as feature extraction and object detection.\n","authors":["Kyunghyun Lee","Ukcheol Shin","Byeong-Uk Lee"],"pdf_url":"https://arxiv.org/pdf/2404.01636v1.pdf","comment":"Accepted at CVPR 2024, *First two authors contributed equally to this\n work. Project page link: https://sites.google.com/view/drl-ae"},{"id":"http://arxiv.org/abs/2404.01628v1","updated":"2024-04-02T04:29:01Z","published":"2024-04-02T04:29:01Z","title":"Learning Equi-angular Representations for Online Continual Learning","summary":" Online continual learning suffers from an underfitted solution due to\ninsufficient training for prompt model update (e.g., single-epoch training). To\naddress the challenge, we propose an efficient online continual learning method\nusing the neural collapse phenomenon. In particular, we induce neural collapse\nto form a simplex equiangular tight frame (ETF) structure in the representation\nspace so that the continuously learned model with a single epoch can better fit\nto the streamed data by proposing preparatory data training and residual\ncorrection in the representation space. With an extensive set of empirical\nvalidations using CIFAR-10/100, TinyImageNet, ImageNet-200, and ImageNet-1K, we\nshow that our proposed method outperforms state-of-the-art methods by a\nnoticeable margin in various online continual learning scenarios such as\ndisjoint and Gaussian scheduled continuous (i.e., boundary-free) data setups.\n","authors":["Minhyuk Seo","Hyunseo Koh","Wonje Jeung","Minjae Lee","San Kim","Hankook Lee","Sungjun Cho","Sungik Choi","Hyunwoo Kim","Jonghyun Choi"],"pdf_url":"https://arxiv.org/pdf/2404.01628v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2311.07362v4","updated":"2024-04-02T04:12:43Z","published":"2023-11-13T14:26:24Z","title":"Volcano: Mitigating Multimodal Hallucination through Self-Feedback\n Guided Revision","summary":" Large multimodal models suffer from multimodal hallucination, where they\nprovide incorrect responses misaligned with the given visual information.\nRecent works have conjectured that one of the reasons behind multimodal\nhallucination is due to the vision encoder failing to ground on the image\nproperly. To mitigate this issue, we propose a novel approach that leverages\nself-feedback as visual cues. Building on this approach, we introduce Volcano,\na multimodal self-feedback guided revision model. Volcano generates natural\nlanguage feedback to its initial response based on the provided visual\ninformation and utilizes this feedback to self-revise its initial response.\nVolcano effectively reduces multimodal hallucination and achieves\nstate-of-the-art on MMHal-Bench, POPE, and GAVIE. 
It also improves on general\nmultimodal abilities and outperforms previous models on MM-Vet and MMBench.\nThrough qualitative analysis, we show that Volcano's feedback is more properly\ngrounded in the image than the initial response. This indicates that Volcano\ncan provide itself with richer visual information through feedback generation,\nenabling it to self-correct hallucinations. We publicly release our model, data,\nand code at https://github.com/kaistAI/Volcano.\n","authors":["Seongyun Lee","Sue Hyun Park","Yongrae Jo","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2311.07362v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06415v3","updated":"2024-04-02T04:00:30Z","published":"2024-01-12T07:23:02Z","title":"3D Reconstruction of Interacting Multi-Person in Clothing from a Single\n Image","summary":" This paper introduces a novel pipeline to reconstruct the geometry of\ninteracting multi-person in clothing on a globally coherent scene space from a\nsingle image. The main challenge arises from the occlusion: a part of a human\nbody is not visible from a single view due to the occlusion by others or the\nself, which introduces missing geometry and physical implausibility (e.g.,\npenetration). We overcome this challenge by utilizing two human priors for\ncomplete 3D geometry and surface contacts. For the geometry prior, an encoder\nlearns to regress the image of a person with missing body parts to the latent\nvectors; a decoder decodes these vectors to produce 3D features of the\nassociated geometry; and an implicit network combines these features with a\nsurface normal map to reconstruct complete and detailed 3D humans. For the\ncontact prior, we develop an image-space contact detector that outputs a\nprobability distribution of surface contacts between people in 3D. We use these\npriors to globally refine the body poses, enabling the penetration-free and\naccurate reconstruction of interacting multi-person in clothing on the scene\nspace. The results demonstrate that our method is complete, globally coherent,\nand physically plausible compared to existing methods.\n","authors":["Junuk Cha","Hansol Lee","Jaewon Kim","Nhat Nguyen Bao Truong","Jae Shin Yoon","Seungryul Baek"],"pdf_url":"https://arxiv.org/pdf/2401.06415v3.pdf","comment":"Accepted to WACV 2024"},{"id":"http://arxiv.org/abs/2402.18490v2","updated":"2024-04-02T03:50:34Z","published":"2024-02-28T17:18:38Z","title":"TAMM: TriAdapter Multi-Modal Learning for 3D Shape Understanding","summary":" The limited scale of current 3D shape datasets hinders the advancements in 3D\nshape understanding, and motivates multi-modal learning approaches which\ntransfer learned knowledge from data-abundant 2D image and language modalities\nto 3D shapes. However, even though the image and language representations have\nbeen aligned by cross-modal models like CLIP, we find that the image modality\nfails to contribute as much as the language in existing multi-modal 3D\nrepresentation learning methods. This is attributed to the domain shift in the\n2D images and the distinct focus of each modality. To more effectively leverage\nboth modalities in the pre-training, we introduce TriAdapter Multi-Modal\nLearning (TAMM) -- a novel two-stage learning approach based on three\nsynergistic adapters. First, our CLIP Image Adapter mitigates the domain gap\nbetween 3D-rendered images and natural images, by adapting the visual\nrepresentations of CLIP for synthetic image-text pairs. 
Subsequently, our Dual\nAdapters decouple the 3D shape representation space into two complementary\nsub-spaces: one focusing on visual attributes and the other for semantic\nunderstanding, which ensure a more comprehensive and effective multi-modal\npre-training. Extensive experiments demonstrate that TAMM consistently enhances\n3D representations for a wide range of 3D encoder architectures, pre-training\ndatasets, and downstream tasks. Notably, we boost the zero-shot classification\naccuracy on Objaverse-LVIS from 46.8\\% to 50.7\\%, and improve the 5-way 10-shot\nlinear probing classification accuracy on ModelNet40 from 96.1\\% to 99.0\\%.\nProject page: https://alanzhangcs.github.io/tamm-page.\n","authors":["Zhihao Zhang","Shengcao Cao","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2402.18490v2.pdf","comment":"This paper is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01030v2","updated":"2024-04-02T03:36:28Z","published":"2024-04-01T10:19:05Z","title":"Survey of Bias In Text-to-Image Generation: Definition, Evaluation, and\n Mitigation","summary":" The recent advancement of large and powerful models with Text-to-Image (T2I)\ngeneration abilities -- such as OpenAI's DALLE-3 and Google's Gemini -- enables\nusers to generate high-quality images from textual prompts. However, it has\nbecome increasingly evident that even simple prompts could cause T2I models to\nexhibit conspicuous social bias in generated images. Such bias might lead to\nboth allocational and representational harms in society, further marginalizing\nminority groups. Noting this problem, a large body of recent works has been\ndedicated to investigating different dimensions of bias in T2I systems.\nHowever, an extensive review of these studies is lacking, hindering a\nsystematic understanding of current progress and research gaps. We present the\nfirst extensive survey on bias in T2I generative models. In this survey, we\nreview prior studies on dimensions of bias: Gender, Skintone, and Geo-Culture.\nSpecifically, we discuss how these works define, evaluate, and mitigate\ndifferent aspects of bias. We found that: (1) while gender and skintone biases\nare widely studied, geo-cultural bias remains under-explored; (2) most works on\ngender and skintone bias investigated occupational association, while other\naspects are less frequently studied; (3) almost all gender bias works overlook\nnon-binary identities in their studies; (4) evaluation datasets and metrics are\nscattered, with no unified framework for measuring biases; and (5) current\nmitigation methods fail to resolve biases comprehensively. Based on current\nlimitations, we point out future research directions that contribute to\nhuman-centric definitions, evaluations, and mitigation of biases. 
We hope to\nhighlight the importance of studying biases in T2I systems, as well as\nencourage future efforts to holistically understand and tackle biases, building\nfair and trustworthy T2I technologies for everyone.\n","authors":["Yixin Wan","Arjun Subramonian","Anaelia Ovalle","Zongyu Lin","Ashima Suvarna","Christina Chance","Hritik Bansal","Rebecca Pattichis","Kai-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2404.01030v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01614v1","updated":"2024-04-02T03:36:07Z","published":"2024-04-02T03:36:07Z","title":"LR-FPN: Enhancing Remote Sensing Object Detection with Location Refined\n Feature Pyramid Network","summary":" Remote sensing target detection aims to identify and locate critical targets\nwithin remote sensing images, finding extensive applications in agriculture and\nurban planning. Feature pyramid networks (FPNs) are commonly used to extract\nmulti-scale features. However, existing FPNs often overlook extracting\nlow-level positional information and fine-grained context interaction. To\naddress this, we propose a novel location refined feature pyramid network\n(LR-FPN) to enhance the extraction of shallow positional information and\nfacilitate fine-grained context interaction. The LR-FPN consists of two primary\nmodules: the shallow position information extraction module (SPIEM) and the\ncontextual interaction module (CIM). Specifically, SPIEM first maximizes the\nretention of solid location information of the target by simultaneously\nextracting positional and saliency information from the low-level feature map.\nSubsequently, CIM injects this robust location information into different\nlayers of the original FPN through spatial and channel interaction, explicitly\nenhancing the object area. Moreover, in spatial interaction, we introduce a\nsimple local and non-local interaction strategy to learn and retain the\nsaliency information of the object. Lastly, the LR-FPN can be readily\nintegrated into common object detection frameworks to improve performance\nsignificantly. Extensive experiments on two large-scale remote sensing datasets\n(i.e., DOTAV1.0 and HRSC2016) demonstrate that the proposed LR-FPN is superior\nto state-of-the-art object detection approaches. Our code and models will be\npublicly available.\n","authors":["Hanqian Li","Ruinan Zhang","Ye Pan","Junchi Ren","Fei Shen"],"pdf_url":"https://arxiv.org/pdf/2404.01614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01612v1","updated":"2024-04-02T03:29:23Z","published":"2024-04-02T03:29:23Z","title":"Spin-UP: Spin Light for Natural Light Uncalibrated Photometric Stereo","summary":" Natural Light Uncalibrated Photometric Stereo (NaUPS) relieves the strict\nenvironment and light assumptions in classical Uncalibrated Photometric Stereo\n(UPS) methods. However, due to the intrinsic ill-posedness and high-dimensional\nambiguities, addressing NaUPS is still an open question. Existing works impose\nstrong assumptions on the environment lights and objects' material, restricting\nthe effectiveness in more general scenarios. Alternatively, some methods\nleverage supervised learning with intricate models while lacking\ninterpretability, resulting in a biased estimation. In this work, we proposed\nSpin Light Uncalibrated Photometric Stereo (Spin-UP), an unsupervised method to\ntackle NaUPS in various environment lights and objects. 
The proposed method\nuses a novel setup that captures the object's images on a rotatable platform,\nwhich mitigates NaUPS's ill-posedness by reducing unknowns and provides\nreliable priors to alleviate NaUPS's ambiguities. Leveraging neural inverse\nrendering and the proposed training strategies, Spin-UP recovers surface\nnormals, environment light, and isotropic reflectance under complex natural\nlight with low computational cost. Experiments have shown that Spin-UP\noutperforms other supervised / unsupervised NaUPS methods and achieves\nstate-of-the-art performance on synthetic and real-world datasets. Codes and\ndata are available at https://github.com/LMozart/CVPR2024-SpinUP.\n","authors":["Zongrui Li","Zhan Lu","Haojie Yan","Boxin Shi","Gang Pan","Qian Zheng","Xudong Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.01612v1.pdf","comment":"Paper accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2402.17275v2","updated":"2024-04-02T03:18:07Z","published":"2024-02-27T07:42:55Z","title":"One-Shot Structure-Aware Stylized Image Synthesis","summary":" While GAN-based models have been successful in image stylization tasks, they\noften struggle with structure preservation while stylizing a wide range of\ninput images. Recently, diffusion models have been adopted for image\nstylization but still lack the capability to maintain the original quality of\ninput images. Building on this, we propose OSASIS: a novel one-shot stylization\nmethod that is robust in structure preservation. We show that OSASIS is able to\neffectively disentangle the semantics from the structure of an image, allowing\nit to control the level of content and style implemented to a given input. We\napply OSASIS to various experimental settings, including stylization with\nout-of-domain reference images and stylization with text-driven manipulation.\nResults show that OSASIS outperforms other stylization methods, especially for\ninput images that were rarely encountered during training, providing a\npromising solution to stylization via diffusion models.\n","authors":["Hansam Cho","Jonghyun Lee","Seunggyu Chang","Yonghyun Jeong"],"pdf_url":"https://arxiv.org/pdf/2402.17275v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2306.08736v3","updated":"2024-04-02T03:10:53Z","published":"2023-06-14T20:40:28Z","title":"LoSh: Long-Short Text Joint Prediction Network for Referring Video\n Object Segmentation","summary":" Referring video object segmentation (RVOS) aims to segment the target\ninstance referred by a given text expression in a video clip. The text\nexpression normally contains sophisticated description of the instance's\nappearance, action, and relation with others. It is therefore rather difficult\nfor a RVOS model to capture all these attributes correspondingly in the video;\nin fact, the model often favours more on the action- and relation-related\nvisual attributes of the instance. This can end up with partial or even\nincorrect mask prediction of the target instance. We tackle this problem by\ntaking a subject-centric short text expression from the original long text\nexpression. The short one retains only the appearance-related information of\nthe target instance so that we can use it to focus the model's attention on the\ninstance's appearance. We let the model make joint predictions using both long\nand short text expressions; and insert a long-short cross-attention module to\ninteract the joint features and a long-short predictions intersection loss to\nregulate the joint predictions. 
Besides the improvement on the linguistic part,\nwe also introduce a forward-backward visual consistency loss, which utilizes\noptical flows to warp visual features between the annotated frames and their\ntemporal neighbors for consistency. We build our method on top of two\nstate-of-the-art pipelines. Extensive experiments on A2D-Sentences, Refer-YouTube-VOS,\nJHMDB-Sentences and Refer-DAVIS17 show impressive improvements of our\nmethod. Code is available at https://github.com/LinfengYuan1997/Losh.\n","authors":["Linfeng Yuan","Miaojing Shi","Zijie Yue","Qijun Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08736v3.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2401.04332v2","updated":"2024-04-02T02:56:28Z","published":"2024-01-09T03:05:53Z","title":"Flexible filtrations for multiparameter persistent homology detect\n digital images","summary":" Two important problems in the field of Topological Data Analysis are defining\npractical multifiltrations on objects and showing the ability of TDA to detect\ngeometry. Motivated by these problems, we construct three multifiltrations, named\nmulti-GENEO, multi-DGENEO and mix-GENEO, and prove the stability of both the\ninterleaving distance and the multiparameter persistence landscape of multi-GENEO\nwith respect to the pseudometric of the subspace of bounded functions. We also\ngive upper-bound estimates for multi-DGENEO and mix-GENEO. Finally, we\nprovide experimental results on the MNIST dataset to demonstrate that our\nbifiltrations have the ability to detect geometric and topological differences\nof digital images.\n","authors":["Jiaxing He","Bingzhe Hou","Tieru Wu","Yue Xin"],"pdf_url":"https://arxiv.org/pdf/2401.04332v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11679v2","updated":"2024-04-02T02:55:28Z","published":"2024-03-18T11:31:03Z","title":"NEDS-SLAM: A Novel Neural Explicit Dense Semantic SLAM Framework using\n 3D Gaussian Splatting","summary":" We propose NEDS-SLAM, an explicit dense semantic SLAM system based on a 3D\nGaussian representation that enables robust 3D semantic mapping, accurate\ncamera tracking, and high-quality rendering in real-time. In the system, we\npropose a Spatially Consistent Feature Fusion model to reduce the effect of\nerroneous estimates from the pre-trained segmentation head on semantic\nreconstruction, achieving robust 3D semantic Gaussian mapping. Additionally, we\nemploy a lightweight encoder-decoder to compress the high-dimensional semantic\nfeatures into a compact 3D Gaussian representation, mitigating the burden of\nexcessive memory consumption. Furthermore, we leverage the advantage of 3D\nGaussian splatting, which enables efficient and differentiable novel view\nrendering, and propose a Virtual Camera View Pruning method to eliminate\noutlier GS points, thereby effectively enhancing the quality of scene\nrepresentations. 
Our NEDS-SLAM method demonstrates competitive performance over\nexisting dense semantic SLAM methods in terms of mapping and tracking accuracy\non Replica and ScanNet datasets, while also showing excellent capabilities in\n3D dense semantic mapping.\n","authors":["Yiming Ji","Yang Liu","Guanghu Xie","Boyu Ma","Zongwu Xie"],"pdf_url":"https://arxiv.org/pdf/2403.11679v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01604v1","updated":"2024-04-02T02:52:05Z","published":"2024-04-02T02:52:05Z","title":"WaveDH: Wavelet Sub-bands Guided ConvNet for Efficient Image Dehazing","summary":" The surge in interest regarding image dehazing has led to notable\nadvancements in deep learning-based single image dehazing approaches,\nexhibiting impressive performance in recent studies. Despite these strides,\nmany existing methods fall short in meeting the efficiency demands of practical\napplications. In this paper, we introduce WaveDH, a novel and compact ConvNet\ndesigned to address this efficiency gap in image dehazing. Our WaveDH leverages\nwavelet sub-bands for guided up-and-downsampling and frequency-aware feature\nrefinement. The key idea lies in utilizing wavelet decomposition to extract\nlow-and-high frequency components from feature levels, allowing for faster\nprocessing while upholding high-quality reconstruction. The downsampling block\nemploys a novel squeeze-and-attention scheme to optimize the feature\ndownsampling process in a structurally compact manner through wavelet domain\nlearning, preserving discriminative features while discarding noise components.\nIn our upsampling block, we introduce a dual-upsample and fusion mechanism to\nenhance high-frequency component awareness, aiding in the reconstruction of\nhigh-frequency details. Departing from conventional dehazing methods that treat\nlow-and-high frequency components equally, our feature refinement block\nstrategically processes features with a frequency-aware approach. By employing\na coarse-to-fine methodology, it not only refines the details at frequency\nlevels but also significantly optimizes computational costs. The refinement is\nperformed in a maximum 8x downsampled feature space, striking a favorable\nefficiency-vs-accuracy trade-off. Extensive experiments demonstrate that our\nmethod, WaveDH, outperforms many state-of-the-art methods on several image\ndehazing benchmarks with significantly reduced computational costs. Our code is\navailable at https://github.com/AwesomeHwang/WaveDH.\n","authors":["Seongmin Hwang","Daeyoung Han","Cheolkon Jung","Moongu Jeon"],"pdf_url":"https://arxiv.org/pdf/2404.01604v1.pdf","comment":"Submitted to TMM"},{"id":"http://arxiv.org/abs/2403.19964v2","updated":"2024-04-02T02:34:22Z","published":"2024-03-29T03:56:19Z","title":"FairRAG: Fair Human Generation via Fair Retrieval Augmentation","summary":" Existing text-to-image generative models reflect or even amplify societal\nbiases ingrained in their training data. This is especially concerning for\nhuman image generation where models are biased against certain demographic\ngroups. Existing attempts to rectify this issue are hindered by the inherent\nlimitations of the pre-trained models and fail to substantially improve\ndemographic diversity. In this work, we introduce Fair Retrieval Augmented\nGeneration (FairRAG), a novel framework that conditions pre-trained generative\nmodels on reference images retrieved from an external image database to improve\nfairness in human generation. 
FairRAG enables conditioning through a\nlightweight linear module that projects reference images into the textual\nspace. To enhance fairness, FairRAG applies simple-yet-effective debiasing\nstrategies, providing images from diverse demographic groups during the\ngenerative process. Extensive experiments demonstrate that FairRAG outperforms\nexisting methods in terms of demographic diversity, image-text alignment, and\nimage fidelity while incurring minimal computational overhead during inference.\n","authors":["Robik Shrestha","Yang Zou","Qiuyu Chen","Zhiheng Li","Yusheng Xie","Siqi Deng"],"pdf_url":"https://arxiv.org/pdf/2403.19964v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01591v1","updated":"2024-04-02T02:31:13Z","published":"2024-04-02T02:31:13Z","title":"Language Model Guided Interpretable Video Action Reasoning","summary":" While neural networks have excelled in video action recognition tasks, their\nblack-box nature often obscures the understanding of their decision-making\nprocesses. Recent approaches used inherently interpretable models to analyze\nvideo actions in a manner akin to human reasoning. These models, however,\nusually fall short in performance compared to their black-box counterparts. In\nthis work, we present a new framework named Language-guided Interpretable\nAction Recognition framework (LaIAR). LaIAR leverages knowledge from language\nmodels to enhance both the recognition capabilities and the interpretability of\nvideo models. In essence, we redefine the problem of understanding video model\ndecisions as a task of aligning video and language models. Using the logical\nreasoning captured by the language model, we steer the training of the video\nmodel. This integrated approach not only improves the video model's\nadaptability to different domains but also boosts its overall performance.\nExtensive experiments on two complex video action datasets, Charades & CAD-120,\nvalidates the improved performance and interpretability of our LaIAR framework.\nThe code of LaIAR is available at https://github.com/NingWang2049/LaIAR.\n","authors":["Ning Wang","Guangming Zhu","HS Li","Liang Zhang","Syed Afaq Ali Shah","Mohammed Bennamoun"],"pdf_url":"https://arxiv.org/pdf/2404.01591v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01587v1","updated":"2024-04-02T02:29:41Z","published":"2024-04-02T02:29:41Z","title":"TSCM: A Teacher-Student Model for Vision Place Recognition Using\n Cross-Metric Knowledge Distillation","summary":" Visual place recognition (VPR) plays a pivotal role in autonomous exploration\nand navigation of mobile robots within complex outdoor environments. While\ncost-effective and easily deployed, camera sensors are sensitive to lighting\nand weather changes, and even slight image alterations can greatly affect VPR\nefficiency and precision. Existing methods overcome this by exploiting powerful\nyet large networks, leading to significant consumption of computational\nresources. In this paper, we propose a high-performance teacher and lightweight\nstudent distillation framework called TSCM. It exploits our devised\ncross-metric knowledge distillation to narrow the performance gap between the\nteacher and student models, maintaining superior performance while enabling\nminimal computational load during deployment. 
We conduct comprehensive\nevaluations on large-scale datasets, namely Pittsburgh30k and Pittsburgh250k.\nExperimental results demonstrate the superiority of our method over baseline\nmodels in terms of recognition accuracy and model parameter efficiency.\nMoreover, our ablation studies show that the proposed knowledge distillation\ntechnique surpasses other counterparts. The code of our method has been\nreleased at https://github.com/nubot-nudt/TSCM.\n","authors":["Yehui Shen","Mingmin Liu","Huimin Lu","Xieyuanli Chen"],"pdf_url":"https://arxiv.org/pdf/2404.01587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01580v1","updated":"2024-04-02T02:20:47Z","published":"2024-04-02T02:20:47Z","title":"Learning Temporal Cues by Predicting Objects Move for Multi-camera 3D\n Object Detection","summary":" In autonomous driving and robotics, there is a growing interest in utilizing\nshort-term historical data to enhance multi-camera 3D object detection,\nleveraging the continuous and correlated nature of input video streams. Recent\nwork has focused on spatially aligning BEV-based features over timesteps.\nHowever, this is often limited as its gain does not scale well with long-term\npast observations. To address this, we advocate for supervising a model to\npredict objects' poses given past observations, thus explicitly guiding to\nlearn objects' temporal cues. To this end, we propose a model called DAP\n(Detection After Prediction), consisting of a two-branch network: (i) a branch\nresponsible for forecasting the current objects' poses given past observations\nand (ii) another branch that detects objects based on the current and past\nobservations. The features predicting the current objects from branch (i) is\nfused into branch (ii) to transfer predictive knowledge. We conduct extensive\nexperiments with the large-scale nuScenes datasets, and we observe that\nutilizing such predictive information significantly improves the overall\ndetection performance. Our model can be used plug-and-play, showing consistent\nperformance gain.\n","authors":["Seokha Moon","Hongbeen Park","Jungphil Kwon","Jaekoo Lee","Jinkyu Kim"],"pdf_url":"https://arxiv.org/pdf/2404.01580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01579v1","updated":"2024-04-02T02:17:50Z","published":"2024-04-02T02:17:50Z","title":"Diffusion Deepfake","summary":" Recent progress in generative AI, primarily through diffusion models,\npresents significant challenges for real-world deepfake detection. The\nincreased realism in image details, diverse content, and widespread\naccessibility to the general public complicates the identification of these\nsophisticated deepfakes. Acknowledging the urgency to address the vulnerability\nof current deepfake detectors to this evolving threat, our paper introduces two\nextensive deepfake datasets generated by state-of-the-art diffusion models as\nother datasets are less diverse and low in quality. Our extensive experiments\nalso showed that our dataset is more challenging compared to the other face\ndeepfake datasets. Our strategic dataset creation not only challenge the\ndeepfake detectors but also sets a new benchmark for more evaluation. Our\ncomprehensive evaluation reveals the struggle of existing detection methods,\noften optimized for specific image domains and manipulations, to effectively\nadapt to the intricate nature of diffusion deepfakes, limiting their practical\nutility. 
To address this critical issue, we investigate the impact of enhancing\ntraining data diversity on representative detection methods. This involves\nexpanding the diversity of both manipulation techniques and image domains. Our\nfindings underscore that increasing training data diversity results in improved\ngeneralizability. Moreover, we propose a novel momentum difficulty boosting\nstrategy to tackle the additional challenge posed by training data\nheterogeneity. This strategy dynamically assigns appropriate sample weights\nbased on learning difficulty, enhancing the model's adaptability to both easy\nand challenging samples. Extensive experiments on both existing and newly\nproposed benchmarks demonstrate that our model optimization approach surpasses\nprior alternatives significantly.\n","authors":["Chaitali Bhattacharyya","Hanxiao Wang","Feng Zhang","Sungho Kim","Xiatian Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.01579v1.pdf","comment":"28 pages including Supplementary material"},{"id":"http://arxiv.org/abs/2404.01576v1","updated":"2024-04-02T02:12:00Z","published":"2024-04-02T02:12:00Z","title":"Leveraging Digital Perceptual Technologies for Remote Perception and\n Analysis of Human Biomechanical Processes: A Contactless Approach for\n Workload and Joint Force Assessment","summary":" This study presents an innovative computer vision framework designed to\nanalyze human movements in industrial settings, aiming to enhance biomechanical\nanalysis by integrating seamlessly with existing software. Through a\ncombination of advanced imaging and modeling techniques, the framework allows\nfor comprehensive scrutiny of human motion, providing valuable insights into\nkinematic patterns and kinetic data. Utilizing Convolutional Neural Networks\n(CNNs), Direct Linear Transform (DLT), and Long Short-Term Memory (LSTM)\nnetworks, the methodology accurately detects key body points, reconstructs 3D\nlandmarks, and generates detailed 3D body meshes. Extensive evaluations across\nvarious movements validate the framework's effectiveness, demonstrating\ncomparable results to traditional marker-based models with minor differences in\njoint angle estimations and precise estimations of weight and height.\nStatistical analyses consistently support the framework's reliability, with\njoint angle estimations showing less than a 5-degree difference for hip\nflexion, elbow flexion, and knee angle methods. Additionally, weight estimation\nexhibits an average error of less than 6 % for weight and less than 2 % for\nheight when compared to ground-truth values from 10 subjects. The integration\nof the Biomech-57 landmark skeleton template further enhances the robustness\nand reinforces the framework's credibility. 
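A rough sketch of difficulty-aware sample weighting via a momentum-smoothed per-sample loss, loosely in the spirit of the momentum difficulty boosting described in the Diffusion Deepfake abstract above; the class name, momentum value, and clamping range are assumptions rather than the paper's formulation.

import torch

class MomentumDifficulty:
    def __init__(self, num_samples, momentum=0.9):
        self.momentum = momentum
        self.scores = torch.zeros(num_samples)  # smoothed loss per training sample

    def update(self, indices, per_sample_loss):
        new = per_sample_loss.detach().cpu()
        self.scores[indices] = (
            self.momentum * self.scores[indices] + (1 - self.momentum) * new
        )

    def weights(self, indices):
        # Harder samples (higher smoothed loss) receive proportionally larger weights.
        s = self.scores[indices]
        return (s / (s.mean() + 1e-8)).clamp(0.1, 10.0)

In use, the per-sample losses of a batch would be multiplied by weights(indices) before reduction, so difficult samples contribute more to the gradient.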
This framework shows significant\npromise for meticulous biomechanical analysis in industrial contexts,\neliminating the need for cumbersome markers and extending its utility to\ndiverse research domains, including the study of specific exoskeleton devices'\nimpact on facilitating the prompt return of injured workers to their tasks.\n","authors":["Jesudara Omidokun","Darlington Egeonu","Bochen Jia","Liang Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2010.08755v3","updated":"2024-04-02T02:09:15Z","published":"2020-10-17T09:54:51Z","title":"Variational Dynamic for Self-Supervised Exploration in Deep\n Reinforcement Learning","summary":" Efficient exploration remains a challenging problem in reinforcement\nlearning, especially for tasks where extrinsic rewards from environments are\nsparse or even totally disregarded. Significant advances based on intrinsic\nmotivation show promising results in simple environments but often get stuck in\nenvironments with multimodal and stochastic dynamics. In this work, we propose\na variational dynamic model based on the conditional variational inference to\nmodel the multimodality and stochasticity. We consider the environmental\nstate-action transition as a conditional generative process by generating the\nnext-state prediction under the condition of the current state, action, and\nlatent variable, which provides a better understanding of the dynamics and\nleads a better performance in exploration. We derive an upper bound of the\nnegative log-likelihood of the environmental transition and use such an upper\nbound as the intrinsic reward for exploration, which allows the agent to learn\nskills by self-supervised exploration without observing extrinsic rewards. We\nevaluate the proposed method on several image-based simulation tasks and a real\nrobotic manipulating task. Our method outperforms several state-of-the-art\nenvironment model-based exploration approaches.\n","authors":["Chenjia Bai","Peng Liu","Kaiyu Liu","Lingxiao Wang","Yingnan Zhao","Lei Han"],"pdf_url":"https://arxiv.org/pdf/2010.08755v3.pdf","comment":"IEEE Transactions on Neural Networks and Learning Systems (TNNLS)\n 2021"},{"id":"http://arxiv.org/abs/2404.00562v2","updated":"2024-04-02T02:08:55Z","published":"2024-03-31T04:56:30Z","title":"Text2HOI: Text-guided 3D Motion Generation for Hand-Object Interaction","summary":" This paper introduces the first text-guided work for generating the sequence\nof hand-object interaction in 3D. The main challenge arises from the lack of\nlabeled data where existing ground-truth datasets are nowhere near\ngeneralizable in interaction type and object category, which inhibits the\nmodeling of diverse 3D hand-object interaction with the correct physical\nimplication (e.g., contacts and semantics) from text prompts. To address this\nchallenge, we propose to decompose the interaction generation task into two\nsubtasks: hand-object contact generation; and hand-object motion generation.\nFor contact generation, a VAE-based network takes as input a text and an object\nmesh, and generates the probability of contacts between the surfaces of hands\nand the object during the interaction. The network learns a variety of local\ngeometry structure of diverse objects that is independent of the objects'\ncategory, and thus, it is applicable to general objects. 
For motion generation,\na Transformer-based diffusion model utilizes this 3D contact map as a strong\nprior for generating physically plausible hand-object motion as a function of\ntext prompts by learning from the augmented labeled dataset; where we annotate\ntext labels from many existing 3D hand and object motion data. Finally, we\nfurther introduce a hand refiner module that minimizes the distance between the\nobject surface and hand joints to improve the temporal stability of the\nobject-hand contacts and to suppress the penetration artifacts. In the\nexperiments, we demonstrate that our method can generate more realistic and\ndiverse interactions compared to other baseline methods. We also show that our\nmethod is applicable to unseen objects. We will release our model and newly\nlabeled data as a strong foundation for future research. Codes and data are\navailable in: https://github.com/JunukCha/Text2HOI.\n","authors":["Junuk Cha","Jihyeon Kim","Jae Shin Yoon","Seungryul Baek"],"pdf_url":"https://arxiv.org/pdf/2404.00562v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01571v1","updated":"2024-04-02T02:07:00Z","published":"2024-04-02T02:07:00Z","title":"Leveraging YOLO-World and GPT-4V LMMs for Zero-Shot Person Detection and\n Action Recognition in Drone Imagery","summary":" In this article, we explore the potential of zero-shot Large Multimodal\nModels (LMMs) in the domain of drone perception. We focus on person detection\nand action recognition tasks and evaluate two prominent LMMs, namely YOLO-World\nand GPT-4V(ision) using a publicly available dataset captured from aerial\nviews. Traditional deep learning approaches rely heavily on large and\nhigh-quality training datasets. However, in certain robotic settings, acquiring\nsuch datasets can be resource-intensive or impractical within a reasonable\ntimeframe. The flexibility of prompt-based Large Multimodal Models (LMMs) and\ntheir exceptional generalization capabilities have the potential to\nrevolutionize robotics applications in these scenarios. Our findings suggest\nthat YOLO-World demonstrates good detection performance. GPT-4V struggles with\naccurately classifying action classes but delivers promising results in\nfiltering out unwanted region proposals and in providing a general description\nof the scenery. This research represents an initial step in leveraging LMMs for\ndrone perception and establishes a foundation for future investigations in this\narea.\n","authors":["Christian Limberg","Artur Gonçalves","Bastien Rigault","Helmut Prendinger"],"pdf_url":"https://arxiv.org/pdf/2404.01571v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2403.19080v3","updated":"2024-04-02T02:05:46Z","published":"2024-03-28T01:05:06Z","title":"MMCert: Provable Defense against Adversarial Attacks to Multi-modal\n Models","summary":" Different from a unimodal model whose input is from a single modality, the\ninput (called multi-modal input) of a multi-modal model is from multiple\nmodalities such as image, 3D points, audio, text, etc. Similar to unimodal\nmodels, many existing studies show that a multi-modal model is also vulnerable\nto adversarial perturbation, where an attacker could add small perturbation to\nall modalities of a multi-modal input such that the multi-modal model makes\nincorrect predictions for it. 
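Relating to the hand refiner described in the Text2HOI abstract above, the following is a toy sketch of the stated objective only, pulling predicted hand joints toward the nearest object surface points by gradient descent; the function signature, step count, and regularizer are illustrative assumptions, and the paper's module likely differs.

import torch

def refine_hand_joints(hand_joints, object_points, steps=50, lr=1e-2):
    """hand_joints: (J, 3) predicted joints; object_points: (N, 3) surface samples."""
    hand_joints = hand_joints.detach()
    offset = torch.zeros_like(hand_joints, requires_grad=True)
    opt = torch.optim.Adam([offset], lr=lr)
    for _ in range(steps):
        joints = hand_joints + offset
        # Distance from each joint to its closest object surface point.
        d = torch.cdist(joints, object_points).min(dim=1).values
        loss = d.mean() + 1e-2 * offset.pow(2).sum()  # stay close to the prediction
        opt.zero_grad()
        loss.backward()
        opt.step()
    return (hand_joints + offset).detach()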
Existing certified defenses are mostly designed\nfor unimodal models, which achieve sub-optimal certified robustness guarantees\nwhen extended to multi-modal models as shown in our experimental results. In\nour work, we propose MMCert, the first certified defense against adversarial\nattacks to a multi-modal model. We derive a lower bound on the performance of\nour MMCert under arbitrary adversarial attacks with bounded perturbations to\nboth modalities (e.g., in the context of auto-driving, we bound the number of\nchanged pixels in both RGB image and depth image). We evaluate our MMCert using\ntwo benchmark datasets: one for the multi-modal road segmentation task and the\nother for the multi-modal emotion recognition task. Moreover, we compare our\nMMCert with a state-of-the-art certified defense extended from unimodal models.\nOur experimental results show that our MMCert outperforms the baseline.\n","authors":["Yanting Wang","Hongye Fu","Wei Zou","Jinyuan Jia"],"pdf_url":"https://arxiv.org/pdf/2403.19080v3.pdf","comment":"To appear in CVPR'24"},{"id":"http://arxiv.org/abs/2404.01568v1","updated":"2024-04-02T02:01:21Z","published":"2024-04-02T02:01:21Z","title":"A Linear Time and Space Local Point Cloud Geometry Encoder via\n Vectorized Kernel Mixture (VecKM)","summary":" We propose VecKM, a novel local point cloud geometry encoder that is\ndescriptive, efficient and robust to noise. VecKM leverages a unique approach\nby vectorizing a kernel mixture to represent the local point clouds. Such\nrepresentation is descriptive and robust to noise, which is supported by two\ntheorems that confirm its ability to reconstruct and preserve the similarity of\nthe local shape. Moreover, VecKM is the first successful attempt to reduce the\ncomputation and memory costs from $O(n^2+nKd)$ to $O(nd)$ by sacrificing a\nmarginal constant factor, where $n$ is the size of the point cloud and $K$ is\nneighborhood size. The efficiency is primarily due to VecKM's unique\nfactorizable property that eliminates the need of explicitly grouping points\ninto neighborhoods. In the normal estimation task, VecKM demonstrates not only\n100x faster inference speed but also strongest descriptiveness and robustness\ncompared with existing popular encoders. In classification and segmentation\ntasks, integrating VecKM as a preprocessing module achieves consistently better\nperformance than the PointNet, PointNet++, and point transformer baselines, and\nruns consistently faster by up to 10x.\n","authors":["Dehao Yuan","Cornelia Fermüller","Tahseen Rabbani","Furong Huang","Yiannis Aloimonos"],"pdf_url":"https://arxiv.org/pdf/2404.01568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01563v1","updated":"2024-04-02T01:57:08Z","published":"2024-04-02T01:57:08Z","title":"Two-Phase Multi-Dose-Level PET Image Reconstruction with Dose Level\n Awareness","summary":" To obtain high-quality positron emission tomography (PET) while minimizing\nradiation exposure, a range of methods have been designed to reconstruct\nstandard-dose PET (SPET) from corresponding low-dose PET (LPET) images.\nHowever, most current methods merely learn the mapping between\nsingle-dose-level LPET and SPET images, but omit the dose disparity of LPET\nimages in clinical scenarios. In this paper, to reconstruct high-quality SPET\nimages from multi-dose-level LPET images, we design a novel two-phase\nmulti-dose-level PET reconstruction algorithm with dose level awareness,\ncontaining a pre-training phase and a SPET prediction phase. 
Specifically, the\npre-training phase is devised to explore both fine-grained discriminative\nfeatures and effective semantic representation. The SPET prediction phase\nadopts a coarse prediction network utilizing pre-learned dose level prior to\ngenerate preliminary result, and a refinement network to precisely preserve the\ndetails. Experiments on MICCAI 2022 Ultra-low Dose PET Imaging Challenge\nDataset have demonstrated the superiority of our method.\n","authors":["Yuchen Fei","Yanmei Luo","Yan Wang","Jiaqi Cui","Yuanyuan Xu","Jiliu Zhou","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2404.01563v1.pdf","comment":"Accepted by ISBI2024"},{"id":"http://arxiv.org/abs/2403.16209v3","updated":"2024-04-02T01:57:00Z","published":"2024-03-24T16:08:10Z","title":"Image Captioning in news report scenario","summary":" Image captioning strives to generate pertinent captions for specified images,\nsituating itself at the crossroads of Computer Vision (CV) and Natural Language\nProcessing (NLP). This endeavor is of paramount importance with far-reaching\napplications in recommendation systems, news outlets, social media, and beyond.\nParticularly within the realm of news reporting, captions are expected to\nencompass detailed information, such as the identities of celebrities captured\nin the images. However, much of the existing body of work primarily centers\naround understanding scenes and actions. In this paper, we explore the realm of\nimage captioning specifically tailored for celebrity photographs, illustrating\nits broad potential for enhancing news industry practices. This exploration\naims to augment automated news content generation, thereby facilitating a more\nnuanced dissemination of information. Our endeavor shows a broader horizon,\nenriching the narrative in news reporting through a more intuitive image\ncaptioning framework.\n","authors":["Tianrui Liu","Qi Cai","Changxin Xu","Bo Hong","Jize Xiong","Yuxin Qiao","Tsungwei Yang"],"pdf_url":"https://arxiv.org/pdf/2403.16209v3.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.01548v1","updated":"2024-04-02T01:28:44Z","published":"2024-04-02T01:28:44Z","title":"mChartQA: A universal benchmark for multimodal Chart Question Answer\n based on Vision-Language Alignment and Reasoning","summary":" In the fields of computer vision and natural language processing, multimodal\nchart question-answering, especially involving color, structure, and textless\ncharts, poses significant challenges. Traditional methods, which typically\ninvolve either direct multimodal processing or a table-to-text conversion\nfollowed by language model analysis, have limitations in effectively handling\nthese complex scenarios. This paper introduces a novel multimodal chart\nquestion-answering model, specifically designed to address these intricate\ntasks. Our model integrates visual and linguistic processing, overcoming the\nconstraints of existing methods. We adopt a dual-phase training approach: the\ninitial phase focuses on aligning image and text representations, while the\nsubsequent phase concentrates on optimizing the model's interpretative and\nanalytical abilities in chart-related queries. 
This approach has demonstrated\nsuperior performance on multiple public datasets, particularly in handling\ncolor, structure, and textless chart questions, indicating its effectiveness in\ncomplex multimodal tasks.\n","authors":["Jingxuan Wei","Nan Xu","Guiyong Chang","Yin Luo","BiHui Yu","Ruifeng Guo"],"pdf_url":"https://arxiv.org/pdf/2404.01548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01547v1","updated":"2024-04-02T01:18:16Z","published":"2024-04-02T01:18:16Z","title":"Bidirectional Multi-Scale Implicit Neural Representations for Image\n Deraining","summary":" How to effectively explore multi-scale representations of rain streaks is\nimportant for image deraining. In contrast to existing Transformer-based\nmethods that depend mostly on single-scale rain appearance, we develop an\nend-to-end multi-scale Transformer that leverages the potentially useful\nfeatures in various scales to facilitate high-quality image reconstruction. To\nbetter explore the common degradation representations from spatially-varying\nrain streaks, we incorporate intra-scale implicit neural representations based\non pixel coordinates with the degraded inputs in a closed-loop design, enabling\nthe learned features to facilitate rain removal and improve the robustness of\nthe model in complex scenarios. To ensure richer collaborative representation\nfrom different scales, we embed a simple yet effective inter-scale\nbidirectional feedback operation into our multi-scale Transformer by performing\ncoarse-to-fine and fine-to-coarse information communication. Extensive\nexperiments demonstrate that our approach, named as NeRD-Rain, performs\nfavorably against the state-of-the-art ones on both synthetic and real-world\nbenchmark datasets. The source code and trained models are available at\nhttps://github.com/cschenxiang/NeRD-Rain.\n","authors":["Xiang Chen","Jinshan Pan","Jiangxin Dong"],"pdf_url":"https://arxiv.org/pdf/2404.01547v1.pdf","comment":"Project website: https://github.com/cschenxiang/NeRD-Rain"},{"id":"http://arxiv.org/abs/2404.00228v2","updated":"2024-04-02T01:16:20Z","published":"2024-03-30T03:16:37Z","title":"InfLoRA: Interference-Free Low-Rank Adaptation for Continual Learning","summary":" Continual learning requires the model to learn multiple tasks sequentially.\nIn continual learning, the model should possess the ability to maintain its\nperformance on old tasks (stability) and the ability to adapt to new tasks\ncontinuously (plasticity). Recently, parameter-efficient fine-tuning (PEFT),\nwhich involves freezing a pre-trained model and injecting a small number of\nlearnable parameters to adapt to downstream tasks, has gained increasing\npopularity in continual learning. Although existing continual learning methods\nbased on PEFT have demonstrated superior performance compared to those not\nbased on PEFT, most of them do not consider how to eliminate the interference\nof the new task on the old tasks, which inhibits the model from making a good\ntrade-off between stability and plasticity. In this work, we propose a new PEFT\nmethod, called interference-free low-rank adaptation (InfLoRA), for continual\nlearning. InfLoRA injects a small number of parameters to reparameterize the\npre-trained weights and shows that fine-tuning these injected parameters is\nequivalent to fine-tuning the pre-trained weights within a subspace.\nFurthermore, InfLoRA designs this subspace to eliminate the interference of the\nnew task on the old tasks, making a good trade-off between stability and\nplasticity. 
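For context on the low-rank reparameterization that InfLoRA builds on, here is a minimal sketch of a plain LoRA-style linear adapter (frozen weight plus a trainable low-rank update B A); InfLoRA's interference-free subspace construction is not reproduced, and the class and parameter names are assumptions.

import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank: int = 8, alpha: float = 16.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False  # pre-trained weights stay frozen
        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))  # zero init: no change at start
        self.scale = alpha / rank

    def forward(self, x):
        # Equivalent to (W + scale * B @ A) x with only A and B trainable.
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)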
Experimental results show that InfLoRA outperforms existing\nstate-of-the-art continual learning methods on multiple datasets.\n","authors":["Yan-Shuo Liang","Wu-Jun Li"],"pdf_url":"https://arxiv.org/pdf/2404.00228v2.pdf","comment":"Accepted by the 2024 IEEE/CVF Conference on Computer Vision and\n Pattern Recognition (CVPR 2024)"},{"id":"http://arxiv.org/abs/2404.01543v1","updated":"2024-04-02T00:55:50Z","published":"2024-04-02T00:55:50Z","title":"Efficient 3D Implicit Head Avatar with Mesh-anchored Hash Table\n Blendshapes","summary":" 3D head avatars built with neural implicit volumetric representations have\nachieved unprecedented levels of photorealism. However, the computational cost\nof these methods remains a significant barrier to their widespread adoption,\nparticularly in real-time applications such as virtual reality and\nteleconferencing. While attempts have been made to develop fast neural\nrendering approaches for static scenes, these methods cannot be simply employed\nto support realistic facial expressions, such as in the case of a dynamic\nfacial performance. To address these challenges, we propose a novel fast 3D\nneural implicit head avatar model that achieves real-time rendering while\nmaintaining fine-grained controllability and high rendering quality. Our key\nidea lies in the introduction of local hash table blendshapes, which are\nlearned and attached to the vertices of an underlying face parametric model.\nThese per-vertex hash-tables are linearly merged with weights predicted via a\nCNN, resulting in expression dependent embeddings. Our novel representation\nenables efficient density and color predictions using a lightweight MLP, which\nis further accelerated by a hierarchical nearest neighbor search method.\nExtensive experiments show that our approach runs in real-time while achieving\ncomparable rendering quality to state-of-the-arts and decent results on\nchallenging expressions.\n","authors":["Ziqian Bai","Feitong Tan","Sean Fanello","Rohit Pandey","Mingsong Dou","Shichen Liu","Ping Tan","Yinda Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.01543v1.pdf","comment":"In CVPR2024. Project page:\n https://augmentedperception.github.io/monoavatar-plus"},{"id":"http://arxiv.org/abs/2211.09321v2","updated":"2024-04-02T00:33:42Z","published":"2022-11-17T03:43:04Z","title":"Interpretable Dimensionality Reduction by Feature Preserving Manifold\n Approximation and Projection","summary":" Nonlinear dimensionality reduction lacks interpretability due to the absence\nof source features in low-dimensional embedding space. We propose an\ninterpretable method featMAP to preserve source features by tangent space\nembedding. The core of our proposal is to utilize local singular value\ndecomposition (SVD) to approximate the tangent space which is embedded to\nlow-dimensional space by maintaining the alignment. Based on the embedding\ntangent space, featMAP enables the interpretability by locally demonstrating\nthe source features and feature importance. Furthermore, featMAP embeds the\ndata points by anisotropic projection to preserve the local similarity and\noriginal density. We apply featMAP to interpreting digit classification, object\ndetection and MNIST adversarial examples. FeatMAP uses source features to\nexplicitly distinguish the digits and objects and to explain the\nmisclassification of adversarial examples. 
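A small sketch of the local-SVD step that the featMAP abstract describes, approximating the tangent space at a point from its k nearest neighbours; the helper name and defaults are illustrative assumptions.

import numpy as np

def local_tangent_basis(X, i, k=10, d=2):
    """X: (n, D) data matrix; returns a (d, D) orthonormal tangent basis at point i."""
    dists = np.linalg.norm(X - X[i], axis=1)
    nbrs = X[np.argsort(dists)[1 : k + 1]]        # k nearest neighbours, excluding the point itself
    centered = nbrs - nbrs.mean(axis=0)
    _, _, Vt = np.linalg.svd(centered, full_matrices=False)
    return Vt[:d]                                  # leading right singular vectors span the tangent plane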
We also compare featMAP with other\nstate-of-the-art methods on local and global metrics.\n","authors":["Yang Yang","Hongjian Sun","Jialei Gong","Di Yu"],"pdf_url":"https://arxiv.org/pdf/2211.09321v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19654v3","updated":"2024-04-02T00:12:21Z","published":"2023-10-30T15:38:43Z","title":"MCAD: Multi-teacher Cross-modal Alignment Distillation for efficient\n image-text retrieval","summary":" Due to the success of large-scale visual-language pretraining (VLP) models\nand the widespread use of image-text retrieval in industry areas, it is now\ncritically necessary to reduce the model size and streamline their\nmobile-device deployment. Single- and dual-stream model structures are commonly\nused in image-text retrieval with the goal of closing the semantic gap between\ntextual and visual modalities. While single-stream models use deep feature\nfusion to achieve more accurate cross-model alignment, dual-stream models are\nbetter at offline indexing and fast inference.We propose a Multi-teacher\nCross-modality Alignment Distillation (MCAD) technique to integrate the\nadvantages of single- and dual-stream models. By incorporating the fused\nsingle-stream features into the image and text features of the dual-stream\nmodel, we formulate new modified teacher similarity distributions and features.\nThen, we conduct both distribution and feature distillation to boost the\ncapability of the student dual-stream model, achieving high retrieval\nperformance without increasing inference complexity.Extensive experiments\ndemonstrate the remarkable performance and high efficiency of MCAD on\nimage-text retrieval tasks. Furthermore, we implement a lightweight CLIP model\non Snapdragon/Dimensity chips with only $\\sim$100M running memory and\n$\\sim$8.0ms search latency, achieving the mobile-device application of VLP\nmodels.\n","authors":["Youbo Lei","Feifei He","Chen Chen","Yingbin Mo","Si Jia Li","Defeng Xie","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2310.19654v3.pdf","comment":"Accepted by NAACL 2024 Findings"},{"id":"http://arxiv.org/abs/2311.09612v2","updated":"2024-04-02T00:11:50Z","published":"2023-11-16T06:50:26Z","title":"Efficient End-to-End Visual Document Understanding with Rationale\n Distillation","summary":" Understanding visually situated language requires interpreting complex\nlayouts of textual and visual elements. Pre-processing tools, such as optical\ncharacter recognition (OCR), can map document image inputs to textual tokens,\nthen large language models (LLMs) can reason over text. However, such methods\nhave high computational and engineering complexity. Can small pretrained\nimage-to-text models accurately understand visual documents through similar\nrecognition and reasoning steps instead? We propose Rationale Distillation\n(RD), which incorporates the outputs of OCR tools, LLMs, and larger multimodal\nmodels as intermediate \"rationales\", and trains a small student model to\npredict both rationales and answers. 
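A hedged sketch of one ingredient suggested by the MCAD abstract above, distilling a teacher's image-text similarity distribution into a dual-stream student; the multi-teacher weighting and feature-distillation terms are omitted, and all names are assumptions.

import torch
import torch.nn.functional as F

def similarity_distill_loss(img_s, txt_s, img_t, txt_t, tau=0.05):
    """Each input: (B, D) L2-normalized embeddings from the student (s) / teacher (t)."""
    sim_s = img_s @ txt_s.T / tau          # student image-to-text similarities
    sim_t = img_t @ txt_t.T / tau          # teacher similarities act as soft targets
    return F.kl_div(
        F.log_softmax(sim_s, dim=-1),
        F.softmax(sim_t.detach(), dim=-1),
        reduction="batchmean",
    )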
On three visual document understanding\nbenchmarks representing infographics, scanned documents, and figures, our\nPix2Struct (282M parameters) student model finetuned with RD outperforms the\nbase model by 4-5% absolute accuracy with only 1% higher computational cost.\n","authors":["Wang Zhu","Alekh Agarwal","Mandar Joshi","Robin Jia","Jesse Thomason","Kristina Toutanova"],"pdf_url":"https://arxiv.org/pdf/2311.09612v2.pdf","comment":"Accepted by NAACL 2024"},{"id":"http://arxiv.org/abs/2402.17128v4","updated":"2024-04-02T23:14:42Z","published":"2024-02-27T01:48:19Z","title":"OSCaR: Object State Captioning and State Change Representation","summary":" The capability of intelligent models to extrapolate and comprehend changes in\nobject states is a crucial yet demanding aspect of AI research, particularly\nthrough the lens of human interaction in real-world settings. This task\ninvolves describing complex visual environments, identifying active objects,\nand interpreting their changes as conveyed through language. Traditional\nmethods, which isolate object captioning and state change detection, offer a\nlimited view of dynamic environments. Moreover, relying on a small set of\nsymbolic words to represent changes has restricted the expressiveness of the\nlanguage. To address these challenges, in this paper, we introduce the Object\nState Captioning and State Change Representation (OSCaR) dataset and benchmark.\nOSCaR consists of 14,084 annotated video segments with nearly 1,000 unique\nobjects from various egocentric video collections. It sets a new testbed for\nevaluating multimodal large language models (MLLMs). Our experiments\ndemonstrate that while MLLMs show some skill, they lack a full understanding of\nobject state changes. The benchmark includes a fine-tuned model that, despite\ninitial capabilities, requires significant improvements in accuracy and\ngeneralization ability for effective understanding of these changes. Our code\nand dataset are available at https://github.com/nguyennm1024/OSCaR.\n","authors":["Nguyen Nguyen","Jing Bi","Ali Vosoughi","Yapeng Tian","Pooyan Fazli","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2402.17128v4.pdf","comment":"NAACL 2024"},{"id":"http://arxiv.org/abs/2312.10105v3","updated":"2024-04-02T23:10:13Z","published":"2023-12-15T04:11:34Z","title":"SeiT++: Masked Token Modeling Improves Storage-efficient Training","summary":" Recent advancements in Deep Neural Network (DNN) models have significantly\nimproved performance across computer vision tasks. However, achieving highly\ngeneralizable and high-performing vision models requires expansive datasets,\nresulting in significant storage requirements. This storage challenge is a\ncritical bottleneck for scaling up models. A recent breakthrough by SeiT\nproposed the use of Vector-Quantized (VQ) feature vectors (i.e., tokens) as\nnetwork inputs for vision classification. This approach achieved 90% of the\nperformance of a model trained on full-pixel images with only 1% of the\nstorage. While SeiT needs labeled data, its potential in scenarios beyond fully\nsupervised learning remains largely untapped. In this paper, we extend SeiT by\nintegrating Masked Token Modeling (MTM) for self-supervised pre-training.\nRecognizing that self-supervised approaches often demand more data due to the\nlack of labels, we introduce TokenAdapt and ColorAdapt. These methods\nfacilitate comprehensive token-friendly data augmentation, effectively\naddressing the increased data requirements of self-supervised learning. 
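A minimal sketch of masked token modeling over discrete VQ token indices, the pre-training objective named in the SeiT++ abstract above; the masking ratio, mask-token handling, and model interface are assumptions, and the TokenAdapt/ColorAdapt augmentations are not shown.

import torch
import torch.nn.functional as F

def mtm_loss(model, tokens, mask_id, mask_ratio=0.4):
    """tokens: (B, L) integer VQ indices; model maps (B, L) -> (B, L, vocab) logits."""
    mask = torch.rand_like(tokens, dtype=torch.float) < mask_ratio
    inp = tokens.masked_fill(mask, mask_id)        # replace masked positions with a mask token
    logits = model(inp)
    # Predict the original token ids only at the masked positions.
    return F.cross_entropy(logits[mask], tokens[mask])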
We\nevaluate our approach across various scenarios, including storage-efficient\nImageNet-1k classification, fine-grained classification, ADE-20k semantic\nsegmentation, and robustness benchmarks. Experimental results demonstrate\nconsistent performance improvement in diverse experiments, validating the\neffectiveness of our method. Code is available at\nhttps://github.com/naver-ai/tokenadapt.\n","authors":["Minhyun Lee","Song Park","Byeongho Heo","Dongyoon Han","Hyunjung Shim"],"pdf_url":"https://arxiv.org/pdf/2312.10105v3.pdf","comment":"First two authors contributed equally"},{"id":"http://arxiv.org/abs/2404.02353v1","updated":"2024-04-02T22:54:24Z","published":"2024-04-02T22:54:24Z","title":"Semantic Augmentation in Images using Language","summary":" Deep Learning models are incredibly data-hungry and require very large\nlabeled datasets for supervised learning. As a consequence, these models often\nsuffer from overfitting, limiting their ability to generalize to real-world\nexamples. Recent advancements in diffusion models have enabled the generation\nof photorealistic images based on textual inputs. Leveraging the substantial\ndatasets used to train these diffusion models, we propose a technique to\nutilize generated images to augment existing datasets. This paper explores\nvarious strategies for effective data augmentation to improve the out-of-domain\ngeneralization capabilities of deep learning models.\n","authors":["Sahiti Yerramilli","Jayant Sravan Tamarapalli","Tanmay Girish Kulkarni","Jonathan Francis","Eric Nyberg"],"pdf_url":"https://arxiv.org/pdf/2404.02353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02348v1","updated":"2024-04-02T22:49:25Z","published":"2024-04-02T22:49:25Z","title":"COVID-19 Detection Based on Blood Test Parameters using Various\n Artificial Intelligence Methods","summary":" In 2019, the world faced a new challenge: COVID-19, a disease caused by the\nnovel coronavirus, SARS-CoV-2. The virus rapidly spread across the globe,\nleading to a high rate of mortality, which prompted health organizations to\ntake measures to control its transmission. Early disease detection is crucial\nin the treatment process, and computer-based automatic detection systems have\nbeen developed to aid in this effort. These systems often rely on artificial\nintelligence (AI) approaches such as machine learning, neural networks, fuzzy\nsystems, and deep learning to classify diseases. This study aimed to\ndifferentiate COVID-19 patients from others using self-categorizing classifiers\nand employing various AI methods. This study used two datasets: the blood test\nsamples and radiography images. The best results for the blood test samples\nobtained from San Raphael Hospital, which include two classes of individuals,\nthose with COVID-19 and those with non-COVID diseases, were achieved through\nthe use of the Ensemble method (a combination of a neural network and two\nmachine learning methods). The results showed that this approach for COVID-19\ndiagnosis is cost-effective and provides results in a shorter amount of time\nthan other methods. The proposed model achieved an accuracy of 94.09% on the\ndataset used. Secondly, the radiographic images were divided into four classes:\nnormal, viral pneumonia, ground glass opacity, and COVID-19 infection. These\nwere used for segmentation and classification. The lung lobes were extracted\nfrom the images and then categorized into specific classes. We achieved an\naccuracy of 91.1% on the image dataset. 
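An illustrative sketch of a soft-voting ensemble of one neural network and two classical classifiers, echoing the ensemble described in the COVID-19 blood-test abstract above; the specific estimators and hyperparameters are assumptions, not those used in the study.

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

ensemble = VotingClassifier(
    estimators=[
        ("mlp", make_pipeline(StandardScaler(), MLPClassifier(max_iter=1000))),
        ("svm", make_pipeline(StandardScaler(), SVC(probability=True))),
        ("rf", RandomForestClassifier(n_estimators=200)),
    ],
    voting="soft",  # average predicted probabilities across the three models
)
# ensemble.fit(X_train, y_train); ensemble.score(X_test, y_test)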
Generally, this study highlights the\npotential of AI in detecting and managing COVID-19 and underscores the\nimportance of continued research and development in this field.\n","authors":["Kavian Khanjani","Seyed Rasoul Hosseini","Shahrzad Shashaani","Mohammad Teshnehlab"],"pdf_url":"https://arxiv.org/pdf/2404.02348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19866v2","updated":"2024-04-02T22:41:53Z","published":"2024-03-28T22:25:05Z","title":"Is Synthetic Image Useful for Transfer Learning? An Investigation into\n Data Generation, Volume, and Utilization","summary":" Synthetic image data generation represents a promising avenue for training\ndeep learning models, particularly in the realm of transfer learning, where\nobtaining real images within a specific domain can be prohibitively expensive\ndue to privacy and intellectual property considerations. This work delves into\nthe generation and utilization of synthetic images derived from text-to-image\ngenerative models in facilitating transfer learning paradigms. Despite the high\nvisual fidelity of the generated images, we observe that their naive\nincorporation into existing real-image datasets does not consistently enhance\nmodel performance due to the inherent distribution gap between synthetic and\nreal images. To address this issue, we introduce a novel two-stage framework\ncalled bridged transfer, which initially employs synthetic images for\nfine-tuning a pre-trained model to improve its transferability and subsequently\nuses real data for rapid adaptation. Alongside, We propose dataset style\ninversion strategy to improve the stylistic alignment between synthetic and\nreal images. Our proposed methods are evaluated across 10 different datasets\nand 5 distinct models, demonstrating consistent improvements, with up to 30%\naccuracy increase on classification tasks. Intriguingly, we note that the\nenhancements were not yet saturated, indicating that the benefits may further\nincrease with an expanded volume of synthetic data.\n","authors":["Yuhang Li","Xin Dong","Chen Chen","Jingtao Li","Yuxin Wen","Michael Spranger","Lingjuan Lyu"],"pdf_url":"https://arxiv.org/pdf/2403.19866v2.pdf","comment":"ICLR24 Score 6865 https://openreview.net/forum?id=CjPt1AC6w0"},{"id":"http://arxiv.org/abs/2404.02345v1","updated":"2024-04-02T22:39:35Z","published":"2024-04-02T22:39:35Z","title":"GaitSTR: Gait Recognition with Sequential Two-stream Refinement","summary":" Gait recognition aims to identify a person based on their walking sequences,\nserving as a useful biometric modality as it can be observed from long\ndistances without requiring cooperation from the subject. In representing a\nperson's walking sequence, silhouettes and skeletons are the two primary\nmodalities used. Silhouette sequences lack detailed part information when\noverlapping occurs between different body segments and are affected by carried\nobjects and clothing. Skeletons, comprising joints and bones connecting the\njoints, provide more accurate part information for different segments; however,\nthey are sensitive to occlusions and low-quality images, causing\ninconsistencies in frame-wise results within a sequence. In this paper, we\nexplore the use of a two-stream representation of skeletons for gait\nrecognition, alongside silhouettes. 
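A schematic sketch of a two-stage schedule in the spirit of the bridged transfer framework described above, fine-tuning first on synthetic and then on real images; the loaders, epoch counts, and learning rates are placeholders.

import torch

def finetune(model, loader, epochs, lr):
    opt = torch.optim.AdamW(model.parameters(), lr=lr)
    loss_fn = torch.nn.CrossEntropyLoss()
    for _ in range(epochs):
        for x, y in loader:
            opt.zero_grad()
            loss_fn(model(x), y).backward()
            opt.step()
    return model

# Stage 1: synthetic images bridge the domain gap; Stage 2: real data adapts quickly.
# model = finetune(model, synthetic_loader, epochs=5, lr=1e-4)
# model = finetune(model, real_loader, epochs=2, lr=5e-5)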
By fusing the combined data of silhouettes\nand skeletons, we refine the two-stream skeletons, joints, and bones through\nself-correction in graph convolution, along with cross-modal correction with\ntemporal consistency from silhouettes. We demonstrate that with refined\nskeletons, the performance of the gait recognition model can achieve further\nimprovement on public gait recognition datasets compared with state-of-the-art\nmethods without extra annotations.\n","authors":["Wanrong Zheng","Haidong Zhu","Zhaoheng Zheng","Ram Nevatia"],"pdf_url":"https://arxiv.org/pdf/2404.02345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02344v1","updated":"2024-04-02T22:37:34Z","published":"2024-04-02T22:37:34Z","title":"Effective Malware Detection for Embedded Computing Systems with Limited\n Exposure","summary":" One of the pivotal security threats for the embedded computing systems is\nmalicious software a.k.a malware. With efficiency and efficacy, Machine\nLearning (ML) has been widely adopted for malware detection in recent times.\nDespite being efficient, the existing techniques require a tremendous number of\nbenign and malware samples for training and modeling an efficient malware\ndetector. Furthermore, such constraints limit the detection of emerging malware\nsamples due to the lack of sufficient malware samples required for efficient\ntraining. To address such concerns, we introduce a code-aware data generation\ntechnique that generates multiple mutated samples of the limitedly seen malware\nby the devices. Loss minimization ensures that the generated samples closely\nmimic the limitedly seen malware and mitigate the impractical samples. Such\ndeveloped malware is further incorporated into the training set to formulate\nthe model that can efficiently detect the emerging malware despite having\nlimited exposure. The experimental results demonstrates that the proposed\ntechnique achieves an accuracy of 90% in detecting limitedly seen malware,\nwhich is approximately 3x more than the accuracy attained by state-of-the-art\ntechniques.\n","authors":["Sreenitha Kasarapu","Sanket Shukla","Rakibul Hassan","Avesta Sasan","Houman Homayoun","Sai Manoj Pudukotai Dinakarrao"],"pdf_url":"https://arxiv.org/pdf/2404.02344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11805v2","updated":"2024-04-02T22:35:21Z","published":"2023-12-19T02:39:27Z","title":"Gemini: A Family of Highly Capable Multimodal Models","summary":" This report introduces a new family of multimodal models, Gemini, that\nexhibit remarkable capabilities across image, audio, video, and text\nunderstanding. The Gemini family consists of Ultra, Pro, and Nano sizes,\nsuitable for applications ranging from complex reasoning tasks to on-device\nmemory-constrained use-cases. Evaluation on a broad range of benchmarks shows\nthat our most-capable Gemini Ultra model advances the state of the art in 30 of\n32 of these benchmarks - notably being the first model to achieve human-expert\nperformance on the well-studied exam benchmark MMLU, and improving the state of\nthe art in every one of the 20 multimodal benchmarks we examined. We believe\nthat the new capabilities of the Gemini family in cross-modal reasoning and\nlanguage understanding will enable a wide variety of use cases. 
We discuss our\napproach toward post-training and deploying Gemini models responsibly to users\nthrough services including Gemini, Gemini Advanced, Google AI Studio, and Cloud\nVertex AI.\n","authors":[" Gemini Team","Rohan Anil","Sebastian Borgeaud","Jean-Baptiste Alayrac","Jiahui Yu","Radu Soricut","Johan Schalkwyk","Andrew M. Dai","Anja Hauth","Katie Millican","David Silver","Melvin Johnson","Ioannis Antonoglou","Julian Schrittwieser","Amelia Glaese","Jilin Chen","Emily Pitler","Timothy Lillicrap","Angeliki Lazaridou","Orhan Firat","James Molloy","Michael Isard","Paul R. Barham","Tom Hennigan","Benjamin Lee","Fabio Viola","Malcolm Reynolds","Yuanzhong Xu","Ryan Doherty","Eli Collins","Clemens Meyer","Eliza Rutherford","Erica Moreira","Kareem Ayoub","Megha Goel","Jack Krawczyk","Cosmo Du","Ed Chi","Heng-Tze Cheng","Eric Ni","Purvi Shah","Patrick Kane","Betty Chan","Manaal Faruqui","Aliaksei Severyn","Hanzhao Lin","YaGuang Li","Yong Cheng","Abe Ittycheriah","Mahdis Mahdieh","Mia Chen","Pei Sun","Dustin Tran","Sumit Bagri","Balaji Lakshminarayanan","Jeremiah Liu","Andras Orban","Fabian Güra","Hao Zhou","Xinying Song","Aurelien Boffy","Harish Ganapathy","Steven Zheng","HyunJeong Choe","Ágoston Weisz","Tao Zhu","Yifeng Lu","Siddharth Gopal","Jarrod Kahn","Maciej Kula","Jeff Pitman","Rushin Shah","Emanuel Taropa","Majd Al Merey","Martin Baeuml","Zhifeng Chen","Laurent El Shafey","Yujing Zhang","Olcan Sercinoglu","George Tucker","Enrique Piqueras","Maxim Krikun","Iain Barr","Nikolay Savinov","Ivo Danihelka","Becca Roelofs","Anaïs White","Anders Andreassen","Tamara von Glehn","Lakshman Yagati","Mehran Kazemi","Lucas Gonzalez","Misha Khalman","Jakub Sygnowski","Alexandre Frechette","Charlotte Smith","Laura Culp","Lev Proleev","Yi Luan","Xi Chen","James Lottes","Nathan Schucher","Federico Lebron","Alban Rrustemi","Natalie Clay","Phil Crone","Tomas Kocisky","Jeffrey Zhao","Bartek Perz","Dian Yu","Heidi Howard","Adam Bloniarz","Jack W. Rae","Han Lu","Laurent Sifre","Marcello Maggioni","Fred Alcober","Dan Garrette","Megan Barnes","Shantanu Thakoor","Jacob Austin","Gabriel Barth-Maron","William Wong","Rishabh Joshi","Rahma Chaabouni","Deeni Fatiha","Arun Ahuja","Gaurav Singh Tomar","Evan Senter","Martin Chadwick","Ilya Kornakov","Nithya Attaluri","Iñaki Iturrate","Ruibo Liu","Yunxuan Li","Sarah Cogan","Jeremy Chen","Chao Jia","Chenjie Gu","Qiao Zhang","Jordan Grimstad","Ale Jakse Hartman","Xavier Garcia","Thanumalayan Sankaranarayana Pillai","Jacob Devlin","Michael Laskin","Diego de Las Casas","Dasha Valter","Connie Tao","Lorenzo Blanco","Adrià Puigdomènech Badia","David Reitter","Mianna Chen","Jenny Brennan","Clara Rivera","Sergey Brin","Shariq Iqbal","Gabriela Surita","Jane Labanowski","Abhi Rao","Stephanie Winkler","Emilio Parisotto","Yiming Gu","Kate Olszewska","Ravi Addanki","Antoine Miech","Annie Louis","Denis Teplyashin","Geoff Brown","Elliot Catt","Jan Balaguer","Jackie Xiang","Pidong Wang","Zoe Ashwood","Anton Briukhov","Albert Webson","Sanjay Ganapathy","Smit Sanghavi","Ajay Kannan","Ming-Wei Chang","Axel Stjerngren","Josip Djolonga","Yuting Sun","Ankur Bapna","Matthew Aitchison","Pedram Pejman","Henryk Michalewski","Tianhe Yu","Cindy Wang","Juliette Love","Junwhan Ahn","Dawn Bloxwich","Kehang Han","Peter Humphreys","Thibault Sellam","James Bradbury","Varun Godbole","Sina Samangooei","Bogdan Damoc","Alex Kaskasoli","Sébastien M. R. 
Arnold","Vijay Vasudevan","Shubham Agrawal","Jason Riesa","Dmitry Lepikhin","Richard Tanburn","Srivatsan Srinivasan","Hyeontaek Lim","Sarah Hodkinson","Pranav Shyam","Johan Ferret","Steven Hand","Ankush Garg","Tom Le Paine","Jian Li","Yujia Li","Minh Giang","Alexander Neitz","Zaheer Abbas","Sarah York","Machel Reid","Elizabeth Cole","Aakanksha Chowdhery","Dipanjan Das","Dominika Rogozińska","Vitaliy Nikolaev","Pablo Sprechmann","Zachary Nado","Lukas Zilka","Flavien Prost","Luheng He","Marianne Monteiro","Gaurav Mishra","Chris Welty","Josh Newlan","Dawei Jia","Miltiadis Allamanis","Clara Huiyi Hu","Raoul de Liedekerke","Justin Gilmer","Carl Saroufim","Shruti Rijhwani","Shaobo Hou","Disha Shrivastava","Anirudh Baddepudi","Alex Goldin","Adnan Ozturel","Albin Cassirer","Yunhan Xu","Daniel Sohn","Devendra Sachan","Reinald Kim Amplayo","Craig Swanson","Dessie Petrova","Shashi Narayan","Arthur Guez","Siddhartha Brahma","Jessica Landon","Miteyan Patel","Ruizhe Zhao","Kevin Villela","Luyu Wang","Wenhao Jia","Matthew Rahtz","Mai Giménez","Legg Yeung","James Keeling","Petko Georgiev","Diana Mincu","Boxi Wu","Salem Haykal","Rachel Saputro","Kiran Vodrahalli","James Qin","Zeynep Cankara","Abhanshu Sharma","Nick Fernando","Will Hawkins","Behnam Neyshabur","Solomon Kim","Adrian Hutter","Priyanka Agrawal","Alex Castro-Ros","George van den Driessche","Tao Wang","Fan Yang","Shuo-yiin Chang","Paul Komarek","Ross McIlroy","Mario Lučić","Guodong Zhang","Wael Farhan","Michael Sharman","Paul Natsev","Paul Michel","Yamini Bansal","Siyuan Qiao","Kris Cao","Siamak Shakeri","Christina Butterfield","Justin Chung","Paul Kishan Rubenstein","Shivani Agrawal","Arthur Mensch","Kedar Soparkar","Karel Lenc","Timothy Chung","Aedan Pope","Loren Maggiore","Jackie Kay","Priya Jhakra","Shibo Wang","Joshua Maynez","Mary Phuong","Taylor Tobin","Andrea Tacchetti","Maja Trebacz","Kevin Robinson","Yash Katariya","Sebastian Riedel","Paige Bailey","Kefan Xiao","Nimesh Ghelani","Lora Aroyo","Ambrose Slone","Neil Houlsby","Xuehan Xiong","Zhen Yang","Elena Gribovskaya","Jonas Adler","Mateo Wirth","Lisa Lee","Music Li","Thais Kagohara","Jay Pavagadhi","Sophie Bridgers","Anna Bortsova","Sanjay Ghemawat","Zafarali Ahmed","Tianqi Liu","Richard Powell","Vijay Bolina","Mariko Iinuma","Polina Zablotskaia","James Besley","Da-Woon Chung","Timothy Dozat","Ramona Comanescu","Xiance Si","Jeremy Greer","Guolong Su","Martin Polacek","Raphaël Lopez Kaufman","Simon Tokumine","Hexiang Hu","Elena Buchatskaya","Yingjie Miao","Mohamed Elhawaty","Aditya Siddhant","Nenad Tomasev","Jinwei Xing","Christina Greer","Helen Miller","Shereen Ashraf","Aurko Roy","Zizhao Zhang","Ada Ma","Angelos Filos","Milos Besta","Rory Blevins","Ted Klimenko","Chih-Kuan Yeh","Soravit Changpinyo","Jiaqi Mu","Oscar Chang","Mantas Pajarskas","Carrie Muir","Vered Cohen","Charline Le Lan","Krishna Haridasan","Amit Marathe","Steven Hansen","Sholto Douglas","Rajkumar Samuel","Mingqiu Wang","Sophia Austin","Chang Lan","Jiepu Jiang","Justin Chiu","Jaime Alonso Lorenzo","Lars Lowe Sjösund","Sébastien Cevey","Zach Gleicher","Thi Avrahami","Anudhyan Boral","Hansa Srinivasan","Vittorio Selo","Rhys May","Konstantinos Aisopos","Léonard Hussenot","Livio Baldini Soares","Kate Baumli","Michael B. 
Chang","Adrià Recasens","Ben Caine","Alexander Pritzel","Filip Pavetic","Fabio Pardo","Anita Gergely","Justin Frye","Vinay Ramasesh","Dan Horgan","Kartikeya Badola","Nora Kassner","Subhrajit Roy","Ethan Dyer","Víctor Campos Campos","Alex Tomala","Yunhao Tang","Dalia El Badawy","Elspeth White","Basil Mustafa","Oran Lang","Abhishek Jindal","Sharad Vikram","Zhitao Gong","Sergi Caelles","Ross Hemsley","Gregory Thornton","Fangxiaoyu Feng","Wojciech Stokowiec","Ce Zheng","Phoebe Thacker","Çağlar Ünlü","Zhishuai Zhang","Mohammad Saleh","James Svensson","Max Bileschi","Piyush Patil","Ankesh Anand","Roman Ring","Katerina Tsihlas","Arpi Vezer","Marco Selvi","Toby Shevlane","Mikel Rodriguez","Tom Kwiatkowski","Samira Daruki","Keran Rong","Allan Dafoe","Nicholas FitzGerald","Keren Gu-Lemberg","Mina Khan","Lisa Anne Hendricks","Marie Pellat","Vladimir Feinberg","James Cobon-Kerr","Tara Sainath","Maribeth Rauh","Sayed Hadi Hashemi","Richard Ives","Yana Hasson","Eric Noland","Yuan Cao","Nathan Byrd","Le Hou","Qingze Wang","Thibault Sottiaux","Michela Paganini","Jean-Baptiste Lespiau","Alexandre Moufarek","Samer Hassan","Kaushik Shivakumar","Joost van Amersfoort","Amol Mandhane","Pratik Joshi","Anirudh Goyal","Matthew Tung","Andrew Brock","Hannah Sheahan","Vedant Misra","Cheng Li","Nemanja Rakićević","Mostafa Dehghani","Fangyu Liu","Sid Mittal","Junhyuk Oh","Seb Noury","Eren Sezener","Fantine Huot","Matthew Lamm","Nicola De Cao","Charlie Chen","Sidharth Mudgal","Romina Stella","Kevin Brooks","Gautam Vasudevan","Chenxi Liu","Mainak Chain","Nivedita Melinkeri","Aaron Cohen","Venus Wang","Kristie Seymore","Sergey Zubkov","Rahul Goel","Summer Yue","Sai Krishnakumaran","Brian Albert","Nate Hurley","Motoki Sano","Anhad Mohananey","Jonah Joughin","Egor Filonov","Tomasz Kępa","Yomna Eldawy","Jiawern Lim","Rahul Rishi","Shirin Badiezadegan","Taylor Bos","Jerry Chang","Sanil Jain","Sri Gayatri Sundara Padmanabhan","Subha Puttagunta","Kalpesh Krishna","Leslie Baker","Norbert Kalb","Vamsi Bedapudi","Adam Kurzrok","Shuntong Lei","Anthony Yu","Oren Litvin","Xiang Zhou","Zhichun Wu","Sam Sobell","Andrea Siciliano","Alan Papir","Robby Neale","Jonas Bragagnolo","Tej Toor","Tina Chen","Valentin Anklin","Feiran Wang","Richie Feng","Milad Gholami","Kevin Ling","Lijuan Liu","Jules Walter","Hamid Moghaddam","Arun Kishore","Jakub Adamek","Tyler Mercado","Jonathan Mallinson","Siddhinita Wandekar","Stephen Cagle","Eran Ofek","Guillermo Garrido","Clemens Lombriser","Maksim Mukha","Botu Sun","Hafeezul Rahman Mohammad","Josip Matak","Yadi Qian","Vikas Peswani","Pawel Janus","Quan Yuan","Leif Schelin","Oana David","Ankur Garg","Yifan He","Oleksii Duzhyi","Anton Älgmyr","Timothée Lottaz","Qi Li","Vikas Yadav","Luyao Xu","Alex Chinien","Rakesh Shivanna","Aleksandr Chuklin","Josie Li","Carrie Spadine","Travis Wolfe","Kareem Mohamed","Subhabrata Das","Zihang Dai","Kyle He","Daniel von Dincklage","Shyam Upadhyay","Akanksha Maurya","Luyan Chi","Sebastian Krause","Khalid Salama","Pam G Rabinovitch","Pavan Kumar Reddy M","Aarush Selvan","Mikhail Dektiarev","Golnaz Ghiasi","Erdem Guven","Himanshu Gupta","Boyi Liu","Deepak Sharma","Idan Heimlich Shtacher","Shachi Paul","Oscar Akerlund","François-Xavier Aubet","Terry Huang","Chen Zhu","Eric Zhu","Elico Teixeira","Matthew Fritze","Francesco Bertolini","Liana-Eleonora Marinescu","Martin Bölle","Dominik Paulus","Khyatti Gupta","Tejasi Latkar","Max Chang","Jason Sanders","Roopa Wilson","Xuewei Wu","Yi-Xuan Tan","Lam Nguyen Thiet","Tulsee Doshi","Sid Lall","Swaroop Mishra","Wanming Chen","Thang 
Luong","Seth Benjamin","Jasmine Lee","Ewa Andrejczuk","Dominik Rabiej","Vipul Ranjan","Krzysztof Styrc","Pengcheng Yin","Jon Simon","Malcolm Rose Harriott","Mudit Bansal","Alexei Robsky","Geoff Bacon","David Greene","Daniil Mirylenka","Chen Zhou","Obaid Sarvana","Abhimanyu Goyal","Samuel Andermatt","Patrick Siegler","Ben Horn","Assaf Israel","Francesco Pongetti","Chih-Wei \"Louis\" Chen","Marco Selvatici","Pedro Silva","Kathie Wang","Jackson Tolins","Kelvin Guu","Roey Yogev","Xiaochen Cai","Alessandro Agostini","Maulik Shah","Hung Nguyen","Noah Ó Donnaile","Sébastien Pereira","Linda Friso","Adam Stambler","Adam Kurzrok","Chenkai Kuang","Yan Romanikhin","Mark Geller","ZJ Yan","Kane Jang","Cheng-Chun Lee","Wojciech Fica","Eric Malmi","Qijun Tan","Dan Banica","Daniel Balle","Ryan Pham","Yanping Huang","Diana Avram","Hongzhi Shi","Jasjot Singh","Chris Hidey","Niharika Ahuja","Pranab Saxena","Dan Dooley","Srividya Pranavi Potharaju","Eileen O'Neill","Anand Gokulchandran","Ryan Foley","Kai Zhao","Mike Dusenberry","Yuan Liu","Pulkit Mehta","Ragha Kotikalapudi","Chalence Safranek-Shrader","Andrew Goodman","Joshua Kessinger","Eran Globen","Prateek Kolhar","Chris Gorgolewski","Ali Ibrahim","Yang Song","Ali Eichenbaum","Thomas Brovelli","Sahitya Potluri","Preethi Lahoti","Cip Baetu","Ali Ghorbani","Charles Chen","Andy Crawford","Shalini Pal","Mukund Sridhar","Petru Gurita","Asier Mujika","Igor Petrovski","Pierre-Louis Cedoz","Chenmei Li","Shiyuan Chen","Niccolò Dal Santo","Siddharth Goyal","Jitesh Punjabi","Karthik Kappaganthu","Chester Kwak","Pallavi LV","Sarmishta Velury","Himadri Choudhury","Jamie Hall","Premal Shah","Ricardo Figueira","Matt Thomas","Minjie Lu","Ting Zhou","Chintu Kumar","Thomas Jurdi","Sharat Chikkerur","Yenai Ma","Adams Yu","Soo Kwak","Victor Ähdel","Sujeevan Rajayogam","Travis Choma","Fei Liu","Aditya Barua","Colin Ji","Ji Ho Park","Vincent Hellendoorn","Alex Bailey","Taylan Bilal","Huanjie Zhou","Mehrdad Khatir","Charles Sutton","Wojciech Rzadkowski","Fiona Macintosh","Konstantin Shagin","Paul Medina","Chen Liang","Jinjing Zhou","Pararth Shah","Yingying Bi","Attila Dankovics","Shipra Banga","Sabine Lehmann","Marissa Bredesen","Zifan Lin","John Eric Hoffmann","Jonathan Lai","Raynald Chung","Kai Yang","Nihal Balani","Arthur Bražinskas","Andrei Sozanschi","Matthew Hayes","Héctor Fernández Alcalde","Peter Makarov","Will Chen","Antonio Stella","Liselotte Snijders","Michael Mandl","Ante Kärrman","Paweł Nowak","Xinyi Wu","Alex Dyck","Krishnan Vaidyanathan","Raghavender R","Jessica Mallet","Mitch Rudominer","Eric Johnston","Sushil Mittal","Akhil Udathu","Janara Christensen","Vishal Verma","Zach Irving","Andreas Santucci","Gamaleldin Elsayed","Elnaz Davoodi","Marin Georgiev","Ian Tenney","Nan Hua","Geoffrey Cideron","Edouard Leurent","Mahmoud Alnahlawi","Ionut Georgescu","Nan Wei","Ivy Zheng","Dylan Scandinaro","Heinrich Jiang","Jasper Snoek","Mukund Sundararajan","Xuezhi Wang","Zack Ontiveros","Itay Karo","Jeremy Cole","Vinu Rajashekhar","Lara Tumeh","Eyal Ben-David","Rishub Jain","Jonathan Uesato","Romina Datta","Oskar Bunyan","Shimu Wu","John Zhang","Piotr Stanczyk","Ye Zhang","David Steiner","Subhajit Naskar","Michael Azzam","Matthew Johnson","Adam Paszke","Chung-Cheng Chiu","Jaume Sanchez Elias","Afroz Mohiuddin","Faizan Muhammad","Jin Miao","Andrew Lee","Nino Vieillard","Jane Park","Jiageng Zhang","Jeff Stanway","Drew Garmon","Abhijit Karmarkar","Zhe Dong","Jong Lee","Aviral Kumar","Luowei Zhou","Jonathan Evens","William Isaac","Geoffrey Irving","Edward Loper","Michael 
Fink","Isha Arkatkar","Nanxin Chen","Izhak Shafran","Ivan Petrychenko","Zhe Chen","Johnson Jia","Anselm Levskaya","Zhenkai Zhu","Peter Grabowski","Yu Mao","Alberto Magni","Kaisheng Yao","Javier Snaider","Norman Casagrande","Evan Palmer","Paul Suganthan","Alfonso Castaño","Irene Giannoumis","Wooyeol Kim","Mikołaj Rybiński","Ashwin Sreevatsa","Jennifer Prendki","David Soergel","Adrian Goedeckemeyer","Willi Gierke","Mohsen Jafari","Meenu Gaba","Jeremy Wiesner","Diana Gage Wright","Yawen Wei","Harsha Vashisht","Yana Kulizhskaya","Jay Hoover","Maigo Le","Lu Li","Chimezie Iwuanyanwu","Lu Liu","Kevin Ramirez","Andrey Khorlin","Albert Cui","Tian LIN","Marcus Wu","Ricardo Aguilar","Keith Pallo","Abhishek Chakladar","Ginger Perng","Elena Allica Abellan","Mingyang Zhang","Ishita Dasgupta","Nate Kushman","Ivo Penchev","Alena Repina","Xihui Wu","Tom van der Weide","Priya Ponnapalli","Caroline Kaplan","Jiri Simsa","Shuangfeng Li","Olivier Dousse","Fan Yang","Jeff Piper","Nathan Ie","Rama Pasumarthi","Nathan Lintz","Anitha Vijayakumar","Daniel Andor","Pedro Valenzuela","Minnie Lui","Cosmin Paduraru","Daiyi Peng","Katherine Lee","Shuyuan Zhang","Somer Greene","Duc Dung Nguyen","Paula Kurylowicz","Cassidy Hardin","Lucas Dixon","Lili Janzer","Kiam Choo","Ziqiang Feng","Biao Zhang","Achintya Singhal","Dayou Du","Dan McKinnon","Natasha Antropova","Tolga Bolukbasi","Orgad Keller","David Reid","Daniel Finchelstein","Maria Abi Raad","Remi Crocker","Peter Hawkins","Robert Dadashi","Colin Gaffney","Ken Franko","Anna Bulanova","Rémi Leblond","Shirley Chung","Harry Askham","Luis C. Cobo","Kelvin Xu","Felix Fischer","Jun Xu","Christina Sorokin","Chris Alberti","Chu-Cheng Lin","Colin Evans","Alek Dimitriev","Hannah Forbes","Dylan Banarse","Zora Tung","Mark Omernick","Colton Bishop","Rachel Sterneck","Rohan Jain","Jiawei Xia","Ehsan Amid","Francesco Piccinno","Xingyu Wang","Praseem Banzal","Daniel J. 
Mankowitz","Alex Polozov","Victoria Krakovna","Sasha Brown","MohammadHossein Bateni","Dennis Duan","Vlad Firoiu","Meghana Thotakuri","Tom Natan","Matthieu Geist","Ser tan Girgin","Hui Li","Jiayu Ye","Ofir Roval","Reiko Tojo","Michael Kwong","James Lee-Thorp","Christopher Yew","Danila Sinopalnikov","Sabela Ramos","John Mellor","Abhishek Sharma","Kathy Wu","David Miller","Nicolas Sonnerat","Denis Vnukov","Rory Greig","Jennifer Beattie","Emily Caveness","Libin Bai","Julian Eisenschlos","Alex Korchemniy","Tomy Tsai","Mimi Jasarevic","Weize Kong","Phuong Dao","Zeyu Zheng","Frederick Liu","Fan Yang","Rui Zhu","Tian Huey Teh","Jason Sanmiya","Evgeny Gladchenko","Nejc Trdin","Daniel Toyama","Evan Rosen","Sasan Tavakkol","Linting Xue","Chen Elkind","Oliver Woodman","John Carpenter","George Papamakarios","Rupert Kemp","Sushant Kafle","Tanya Grunina","Rishika Sinha","Alice Talbert","Diane Wu","Denese Owusu-Afriyie","Cosmo Du","Chloe Thornton","Jordi Pont-Tuset","Pradyumna Narayana","Jing Li","Saaber Fatehi","John Wieting","Omar Ajmeri","Benigno Uria","Yeongil Ko","Laura Knight","Amélie Héliou","Ning Niu","Shane Gu","Chenxi Pang","Yeqing Li","Nir Levine","Ariel Stolovich","Rebeca Santamaria-Fernandez","Sonam Goenka","Wenny Yustalim","Robin Strudel","Ali Elqursh","Charlie Deck","Hyo Lee","Zonglin Li","Kyle Levin","Raphael Hoffmann","Dan Holtmann-Rice","Olivier Bachem","Sho Arora","Christy Koh","Soheil Hassas Yeganeh","Siim Põder","Mukarram Tariq","Yanhua Sun","Lucian Ionita","Mojtaba Seyedhosseini","Pouya Tafti","Zhiyu Liu","Anmol Gulati","Jasmine Liu","Xinyu Ye","Bart Chrzaszcz","Lily Wang","Nikhil Sethi","Tianrun Li","Ben Brown","Shreya Singh","Wei Fan","Aaron Parisi","Joe Stanton","Vinod Koverkathu","Christopher A. Choquette-Choo","Yunjie Li","TJ Lu","Abe Ittycheriah","Prakash Shroff","Mani Varadarajan","Sanaz Bahargam","Rob Willoughby","David Gaddy","Guillaume Desjardins","Marco Cornero","Brona Robenek","Bhavishya Mittal","Ben Albrecht","Ashish Shenoy","Fedor Moiseev","Henrik Jacobsson","Alireza Ghaffarkhah","Morgane Rivière","Alanna Walton","Clément Crepy","Alicia Parrish","Zongwei Zhou","Clement Farabet","Carey Radebaugh","Praveen Srinivasan","Claudia van der Salm","Andreas Fidjeland","Salvatore Scellato","Eri Latorre-Chimoto","Hanna Klimczak-Plucińska","David Bridson","Dario de Cesare","Tom Hudson","Piermaria Mendolicchio","Lexi Walker","Alex Morris","Matthew Mauger","Alexey Guseynov","Alison Reid","Seth Odoom","Lucia Loher","Victor Cotruta","Madhavi Yenugula","Dominik Grewe","Anastasia Petrushkina","Tom Duerig","Antonio Sanchez","Steve Yadlowsky","Amy Shen","Amir Globerson","Lynette Webb","Sahil Dua","Dong Li","Surya Bhupatiraju","Dan Hurt","Haroon Qureshi","Ananth Agarwal","Tomer Shani","Matan Eyal","Anuj Khare","Shreyas Rammohan Belle","Lei Wang","Chetan Tekur","Mihir Sanjay Kale","Jinliang Wei","Ruoxin Sang","Brennan Saeta","Tyler Liechty","Yi Sun","Yao Zhao","Stephan Lee","Pandu Nayak","Doug Fritz","Manish Reddy Vuyyuru","John Aslanides","Nidhi Vyas","Martin Wicke","Xiao Ma","Evgenii Eltyshev","Nina Martin","Hardie Cate","James Manyika","Keyvan Amiri","Yelin Kim","Xi Xiong","Kai Kang","Florian Luisier","Nilesh Tripuraneni","David Madras","Mandy Guo","Austin Waters","Oliver Wang","Joshua Ainslie","Jason Baldridge","Han Zhang","Garima Pruthi","Jakob Bauer","Feng Yang","Riham Mansour","Jason Gelman","Yang Xu","George Polovets","Ji Liu","Honglong Cai","Warren Chen","XiangHai Sheng","Emily Xue","Sherjil Ozair","Christof Angermueller","Xiaowei Li","Anoop Sinha","Weiren Wang","Julia 
Wiesinger","Emmanouil Koukoumidis","Yuan Tian","Anand Iyer","Madhu Gurumurthy","Mark Goldenson","Parashar Shah","MK Blake","Hongkun Yu","Anthony Urbanowicz","Jennimaria Palomaki","Chrisantha Fernando","Ken Durden","Harsh Mehta","Nikola Momchev","Elahe Rahimtoroghi","Maria Georgaki","Amit Raul","Sebastian Ruder","Morgan Redshaw","Jinhyuk Lee","Denny Zhou","Komal Jalan","Dinghua Li","Blake Hechtman","Parker Schuh","Milad Nasr","Kieran Milan","Vladimir Mikulik","Juliana Franco","Tim Green","Nam Nguyen","Joe Kelley","Aroma Mahendru","Andrea Hu","Joshua Howland","Ben Vargas","Jeffrey Hui","Kshitij Bansal","Vikram Rao","Rakesh Ghiya","Emma Wang","Ke Ye","Jean Michel Sarr","Melanie Moranski Preston","Madeleine Elish","Steve Li","Aakash Kaku","Jigar Gupta","Ice Pasupat","Da-Cheng Juan","Milan Someswar","Tejvi M.","Xinyun Chen","Aida Amini","Alex Fabrikant","Eric Chu","Xuanyi Dong","Amruta Muthal","Senaka Buthpitiya","Sarthak Jauhari","Nan Hua","Urvashi Khandelwal","Ayal Hitron","Jie Ren","Larissa Rinaldi","Shahar Drath","Avigail Dabush","Nan-Jiang Jiang","Harshal Godhia","Uli Sachs","Anthony Chen","Yicheng Fan","Hagai Taitelbaum","Hila Noga","Zhuyun Dai","James Wang","Chen Liang","Jenny Hamer","Chun-Sung Ferng","Chenel Elkind","Aviel Atias","Paulina Lee","Vít Listík","Mathias Carlen","Jan van de Kerkhof","Marcin Pikus","Krunoslav Zaher","Paul Müller","Sasha Zykova","Richard Stefanec","Vitaly Gatsko","Christoph Hirnschall","Ashwin Sethi","Xingyu Federico Xu","Chetan Ahuja","Beth Tsai","Anca Stefanoiu","Bo Feng","Keshav Dhandhania","Manish Katyal","Akshay Gupta","Atharva Parulekar","Divya Pitta","Jing Zhao","Vivaan Bhatia","Yashodha Bhavnani","Omar Alhadlaq","Xiaolin Li","Peter Danenberg","Dennis Tu","Alex Pine","Vera Filippova","Abhipso Ghosh","Ben Limonchik","Bhargava Urala","Chaitanya Krishna Lanka","Derik Clive","Yi Sun","Edward Li","Hao Wu","Kevin Hongtongsak","Ianna Li","Kalind Thakkar","Kuanysh Omarov","Kushal Majmundar","Michael Alverson","Michael Kucharski","Mohak Patel","Mudit Jain","Maksim Zabelin","Paolo Pelagatti","Rohan Kohli","Saurabh Kumar","Joseph Kim","Swetha Sankar","Vineet Shah","Lakshmi Ramachandruni","Xiangkai Zeng","Ben Bariach","Laura Weidinger","Amar Subramanya","Sissie Hsiao","Demis Hassabis","Koray Kavukcuoglu","Adam Sadovsky","Quoc Le","Trevor Strohman","Yonghui Wu","Slav Petrov","Jeffrey Dean","Oriol Vinyals"],"pdf_url":"https://arxiv.org/pdf/2312.11805v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04076v2","updated":"2024-04-02T21:47:35Z","published":"2023-12-07T06:43:34Z","title":"Large Language Models are Good Prompt Learners for Low-Shot Image\n Classification","summary":" Low-shot image classification, where training images are limited or\ninaccessible, has benefited from recent progress on pre-trained vision-language\n(VL) models with strong generalizability, e.g. CLIP. Prompt learning methods\nbuilt with VL models generate text features from the class names that only have\nconfined class-specific information. Large Language Models (LLMs), with their\nvast encyclopedic knowledge, emerge as the complement. Thus, in this paper, we\ndiscuss the integration of LLMs to enhance pre-trained VL models, specifically\non low-shot classification. However, the domain gap between language and vision\nblocks the direct application of LLMs. Thus, we propose LLaMP, Large Language\nModels as Prompt learners, that produces adaptive prompts for the CLIP text\nencoder, establishing it as the connecting bridge. 
Experiments show that,\ncompared with other state-of-the-art prompt learning methods, LLaMP yields\nbetter performance on both zero-shot generalization and few-shot image\nclassification, over a spectrum of 11 datasets. Code will be made available at:\nhttps://github.com/zhaohengz/LLaMP.\n","authors":["Zhaoheng Zheng","Jingmin Wei","Xuefeng Hu","Haidong Zhu","Ram Nevatia"],"pdf_url":"https://arxiv.org/pdf/2312.04076v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2210.04936v3","updated":"2024-04-02T21:45:06Z","published":"2022-10-10T18:09:35Z","title":"EarthNets: Empowering AI in Earth Observation","summary":" Earth observation (EO), aiming at monitoring the state of planet Earth using\nremote sensing data, is critical for improving our daily lives and living\nenvironment. With a growing number of satellites in orbit, an increasing number\nof datasets with diverse sensors and research domains are being published to\nfacilitate the research of the remote sensing community. This paper presents a\ncomprehensive review of more than 500 publicly published datasets, including\nresearch domains like agriculture, land use and land cover, disaster\nmonitoring, scene understanding, vision-language models, foundation models,\nclimate change, and weather forecasting. We systematically analyze these EO\ndatasets from four aspects: volume, resolution distributions, research domains,\nand the correlation between datasets. Based on the dataset attributes, we\npropose to measure, rank, and select datasets to build a new benchmark for\nmodel evaluation. Furthermore, a new platform for EO, termed EarthNets, is\nreleased to achieve a fair and consistent evaluation of deep learning methods\non remote sensing data. EarthNets supports standard dataset libraries and\ncutting-edge deep learning models to bridge the gap between the remote sensing\nand machine learning communities. Based on this platform, extensive\ndeep-learning methods are evaluated on the new benchmark. The insightful\nresults are beneficial to future research. The platform and dataset collections\nare publicly available at https://earthnets.github.io.\n","authors":["Zhitong Xiong","Fahong Zhang","Yi Wang","Yilei Shi","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2210.04936v3.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2311.17919v2","updated":"2024-04-02T21:34:29Z","published":"2023-11-29T18:59:59Z","title":"Visual Anagrams: Generating Multi-View Optical Illusions with Diffusion\n Models","summary":" We address the problem of synthesizing multi-view optical illusions: images\nthat change appearance upon a transformation, such as a flip or rotation. We\npropose a simple, zero-shot method for obtaining these illusions from\noff-the-shelf text-to-image diffusion models. During the reverse diffusion\nprocess, we estimate the noise from different views of a noisy image, and then\ncombine these noise estimates together and denoise the image. A theoretical\nanalysis suggests that this method works precisely for views that can be\nwritten as orthogonal transformations, of which permutations are a subset. This\nleads to the idea of a visual anagram--an image that changes appearance under\nsome rearrangement of pixels. This includes rotations and flips, but also more\nexotic pixel permutations such as a jigsaw rearrangement. Our approach also\nnaturally extends to illusions with more than two views. We provide both\nqualitative and quantitative results demonstrating the effectiveness and\nflexibility of our method. 
Please see our project webpage for additional\nvisualizations and results: https://dangeng.github.io/visual_anagrams/\n","authors":["Daniel Geng","Inbum Park","Andrew Owens"],"pdf_url":"https://arxiv.org/pdf/2311.17919v2.pdf","comment":"CVPR 2024 camera ready"},{"id":"http://arxiv.org/abs/2310.20436v2","updated":"2024-04-02T21:01:56Z","published":"2023-10-31T13:15:49Z","title":"SignAvatars: A Large-scale 3D Sign Language Holistic Motion Dataset and\n Benchmark","summary":" We present SignAvatars, the first large-scale, multi-prompt 3D sign language\n(SL) motion dataset designed to bridge the communication gap for Deaf and\nhard-of-hearing individuals. While there has been an exponentially growing\nnumber of research regarding digital communication, the majority of existing\ncommunication technologies primarily cater to spoken or written languages,\ninstead of SL, the essential communication method for Deaf and hard-of-hearing\ncommunities. Existing SL datasets, dictionaries, and sign language production\n(SLP) methods are typically limited to 2D as annotating 3D models and avatars\nfor SL is usually an entirely manual and labor-intensive process conducted by\nSL experts, often resulting in unnatural avatars. In response to these\nchallenges, we compile and curate the SignAvatars dataset, which comprises\n70,000 videos from 153 signers, totaling 8.34 million frames, covering both\nisolated signs and continuous, co-articulated signs, with multiple prompts\nincluding HamNoSys, spoken language, and words. To yield 3D holistic\nannotations, including meshes and biomechanically-valid poses of body, hands,\nand face, as well as 2D and 3D keypoints, we introduce an automated annotation\npipeline operating on our large corpus of SL videos. SignAvatars facilitates\nvarious tasks such as 3D sign language recognition (SLR) and the novel 3D SL\nproduction (SLP) from diverse inputs like text scripts, individual words, and\nHamNoSys notation. Hence, to evaluate the potential of SignAvatars, we further\npropose a unified benchmark of 3D SL holistic motion production. We believe\nthat this work is a significant step forward towards bringing the digital world\nto the Deaf and hard-of-hearing communities as well as people interacting with\nthem.\n","authors":["Zhengdi Yu","Shaoli Huang","Yongkang Cheng","Tolga Birdal"],"pdf_url":"https://arxiv.org/pdf/2310.20436v2.pdf","comment":"14 pages; Project page available at https://signavatars.github.io/"},{"id":"http://arxiv.org/abs/2402.15276v3","updated":"2024-04-02T20:54:46Z","published":"2024-02-23T11:47:16Z","title":"CFIR: Fast and Effective Long-Text To Image Retrieval for Large Corpora","summary":" Text-to-image retrieval aims to find the relevant images based on a text\nquery, which is important in various use-cases, such as digital libraries,\ne-commerce, and multimedia databases. Although Multimodal Large Language Models\n(MLLMs) demonstrate state-of-the-art performance, they exhibit limitations in\nhandling large-scale, diverse, and ambiguous real-world needs of retrieval, due\nto the computation cost and the injective embeddings they produce. This paper\npresents a two-stage Coarse-to-Fine Index-shared Retrieval (CFIR) framework,\ndesigned for fast and effective large-scale long-text to image retrieval. The\nfirst stage, Entity-based Ranking (ER), adapts to long-text query ambiguity by\nemploying a multiple-queries-to-multiple-targets paradigm, facilitating\ncandidate filtering for the next stage. 
The second stage, Summary-based\nRe-ranking (SR), refines these rankings using summarized queries. We also\npropose a specialized Decoupling-BEiT-3 encoder, optimized for handling\nambiguous user needs and both stages, which also enhances computational\nefficiency through vector-based similarity inference. Evaluation on the AToMiC\ndataset reveals that CFIR surpasses existing MLLMs by up to 11.06% in\nRecall@1000, while reducing training and retrieval times by 68.75% and 99.79%,\nrespectively. We will release our code to facilitate future research at\nhttps://github.com/longkukuhi/CFIR.\n","authors":["Zijun Long","Xuri Ge","Richard Mccreadie","Joemon Jose"],"pdf_url":"https://arxiv.org/pdf/2402.15276v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13127v3","updated":"2024-04-02T20:42:51Z","published":"2023-11-22T03:31:31Z","title":"MetaCloak: Preventing Unauthorized Subject-driven Text-to-image\n Diffusion-based Synthesis via Meta-learning","summary":" Text-to-image diffusion models allow seamless generation of personalized\nimages from scant reference photos. Yet, these tools, in the wrong hands, can\nfabricate misleading or harmful content, endangering individuals. To address\nthis problem, existing poisoning-based approaches perturb user images in an\nimperceptible way to render them \"unlearnable\" from malicious uses. We identify\ntwo limitations of these defending approaches: i) sub-optimal due to the\nhand-crafted heuristics for solving the intractable bilevel optimization and\nii) lack of robustness against simple data transformations like Gaussian\nfiltering. To solve these challenges, we propose MetaCloak, which solves the\nbi-level poisoning problem with a meta-learning framework with an additional\ntransformation sampling process to craft transferable and robust perturbation.\nSpecifically, we employ a pool of surrogate diffusion models to craft\ntransferable and model-agnostic perturbation. Furthermore, by incorporating an\nadditional transformation process, we design a simple denoising-error\nmaximization loss that is sufficient for causing transformation-robust semantic\ndistortion and degradation in a personalized generation. Extensive experiments\non the VGGFace2 and CelebA-HQ datasets show that MetaCloak outperforms existing\napproaches. Notably, MetaCloak can successfully fool online training services\nlike Replicate, in a black-box manner, demonstrating the effectiveness of\nMetaCloak in real-world scenarios. Our code is available at\nhttps://github.com/liuyixin-louis/MetaCloak.\n","authors":["Yixin Liu","Chenrui Fan","Yutong Dai","Xun Chen","Pan Zhou","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2311.13127v3.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02287v1","updated":"2024-04-02T20:29:59Z","published":"2024-04-02T20:29:59Z","title":"One Noise to Rule Them All: Multi-View Adversarial Attacks with\n Universal Perturbation","summary":" This paper presents a novel universal perturbation method for generating\nrobust multi-view adversarial examples in 3D object recognition. Unlike\nconventional attacks limited to single views, our approach operates on multiple\n2D images, offering a practical and scalable solution for enhancing model\nscalability and robustness. 
This generalizable method bridges the gap between\n2D perturbations and 3D-like attack capabilities, making it suitable for\nreal-world applications.\n Existing adversarial attacks may become ineffective when images undergo\ntransformations like changes in lighting, camera position, or natural\ndeformations. We address this challenge by crafting a single universal noise\nperturbation applicable to various object views. Experiments on diverse\nrendered 3D objects demonstrate the effectiveness of our approach. The\nuniversal perturbation successfully identified a single adversarial noise for\neach given set of 3D object renders from multiple poses and viewpoints.\nCompared to single-view attacks, our universal attacks lower classification\nconfidence across multiple viewing angles, especially at low noise levels. A\nsample implementation is made available at\nhttps://github.com/memoatwit/UniversalPerturbation.\n","authors":["Mehmet Ergezer","Phat Duong","Christian Green","Tommy Nguyen","Abdurrahman Zeybey"],"pdf_url":"https://arxiv.org/pdf/2404.02287v1.pdf","comment":"6 pages, 4 figures, presented at ICAIA, Springer to publish under\n Algorithms for Intelligent Systems"},{"id":"http://arxiv.org/abs/2404.02285v1","updated":"2024-04-02T20:23:10Z","published":"2024-04-02T20:23:10Z","title":"LP++: A Surprisingly Strong Linear Probe for Few-Shot CLIP","summary":" In a recent, strongly emergent literature on few-shot CLIP adaptation, Linear\nProbe (LP) has been often reported as a weak baseline. This has motivated\nintensive research building convoluted prompt learning or feature adaptation\nstrategies. In this work, we propose and examine from convex-optimization\nperspectives a generalization of the standard LP baseline, in which the linear\nclassifier weights are learnable functions of the text embedding, with\nclass-wise multipliers blending image and text knowledge. As our objective\nfunction depends on two types of variables, i.e., the class visual prototypes\nand the learnable blending parameters, we propose a computationally efficient\nblock coordinate Majorize-Minimize (MM) descent algorithm. In our full-batch MM\noptimizer, which we coin LP++, step sizes are implicit, unlike standard\ngradient descent practices where learning rates are intensively searched over\nvalidation sets. By examining the mathematical properties of our loss (e.g.,\nLipschitz gradient continuity), we build majorizing functions yielding\ndata-driven learning rates and derive approximations of the loss's minima,\nwhich provide data-informed initialization of the variables. Our image-language\nobjective function, along with these non-trivial optimization insights and\ningredients, yields, surprisingly, highly competitive few-shot CLIP\nperformances. Furthermore, LP++ operates in black-box, relaxes intensive\nvalidation searches for the optimization hyper-parameters, and runs\norders-of-magnitudes faster than state-of-the-art few-shot CLIP adaptation\nmethods. 
Our code is available at:\n\\url{https://github.com/FereshteShakeri/FewShot-CLIP-Strong-Baseline.git}.\n","authors":["Yunshi Huang","Fereshteh Shakeri","Jose Dolz","Malik Boudiaf","Houda Bahig","Ismail Ben Ayed"],"pdf_url":"https://arxiv.org/pdf/2404.02285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02282v1","updated":"2024-04-02T20:15:43Z","published":"2024-04-02T20:15:43Z","title":"Smooth Deep Saliency","summary":" In this work, we investigate methods to reduce the noise in deep saliency\nmaps coming from convolutional downsampling, with the purpose of explaining how\na deep learning model detects tumors in scanned histological tissue samples.\nThose methods make the investigated models more interpretable for\ngradient-based saliency maps, computed in hidden layers. We test our approach\non different models trained for image classification on ImageNet1K, and models\ntrained for tumor detection on Camelyon16 and in-house real-world digital\npathology scans of stained tissue samples. Our results show that the\ncheckerboard noise in the gradient gets reduced, resulting in smoother and\ntherefore easier to interpret saliency maps.\n","authors":["Rudolf Herdt","Maximilian Schmidt","Daniel Otero Baguer","Peter Maaß"],"pdf_url":"https://arxiv.org/pdf/2404.02282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11311v2","updated":"2024-04-02T20:13:03Z","published":"2024-01-20T19:50:51Z","title":"A Novel Benchmark for Few-Shot Semantic Segmentation in the Era of\n Foundation Models","summary":" In recent years, the rapid evolution of computer vision has seen the\nemergence of various foundation models, each tailored to specific data types\nand tasks. In this study, we explore the adaptation of these models for\nfew-shot semantic segmentation. Specifically, we conduct a comprehensive\ncomparative analysis of four prominent foundation models: DINO V2, Segment\nAnything, CLIP, Masked AutoEncoders, and of a straightforward ResNet50\npre-trained on the COCO dataset. We also include 5 adaptation methods, ranging\nfrom linear probing to fine tuning. Our findings show that DINO V2 outperforms\nother models by a large margin, across various datasets and adaptation methods.\nOn the other hand, adaptation methods provide little discrepancy in the\nobtained results, suggesting that a simple linear probing can compete with\nadvanced, more computationally intensive, alternatives\n","authors":["Reda Bensaid","Vincent Gripon","François Leduc-Primeau","Lukas Mauch","Ghouthi Boukli Hacene","Fabien Cardinaux"],"pdf_url":"https://arxiv.org/pdf/2401.11311v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02150v2","updated":"2024-04-02T20:12:20Z","published":"2023-12-04T18:59:32Z","title":"Readout Guidance: Learning Control from Diffusion Features","summary":" We present Readout Guidance, a method for controlling text-to-image diffusion\nmodels with learned signals. Readout Guidance uses readout heads, lightweight\nnetworks trained to extract signals from the features of a pre-trained, frozen\ndiffusion model at every timestep. These readouts can encode single-image\nproperties, such as pose, depth, and edges; or higher-order properties that\nrelate multiple images, such as correspondence and appearance similarity.\nFurthermore, by comparing the readout estimates to a user-defined target, and\nback-propagating the gradient through the readout head, these estimates can be\nused to guide the sampling process. 
Compared to prior methods for conditional\ngeneration, Readout Guidance requires significantly fewer added parameters and\ntraining samples, and offers a convenient and simple recipe for reproducing\ndifferent forms of conditional control under a single framework, with a single\narchitecture and sampling procedure. We showcase these benefits in the\napplications of drag-based manipulation, identity-consistent generation, and\nspatially aligned control. Project page: https://readout-guidance.github.io.\n","authors":["Grace Luo","Trevor Darrell","Oliver Wang","Dan B Goldman","Aleksander Holynski"],"pdf_url":"https://arxiv.org/pdf/2312.02150v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.00968v2","updated":"2024-04-02T19:57:32Z","published":"2023-12-01T23:04:27Z","title":"Omni-SMoLA: Boosting Generalist Multimodal Models with Soft Mixture of\n Low-rank Experts","summary":" Large multi-modal models (LMMs) exhibit remarkable performance across\nnumerous tasks. However, generalist LMMs often suffer from performance\ndegradation when tuned over a large collection of tasks. Recent research\nsuggests that Mixture of Experts (MoE) architectures are useful for instruction\ntuning, but for LMMs of parameter size around O(50-100B), the prohibitive cost\nof replicating and storing the expert models severely limits the number of\nexperts we can use. We propose Omni-SMoLA, an architecture that uses the Soft\nMoE approach to (softly) mix many multimodal low rank experts, and avoids\nintroducing a significant number of new parameters compared to conventional MoE\nmodels. The core intuition here is that the large model provides a foundational\nbackbone, while different lightweight experts residually learn specialized\nknowledge, either per-modality or multimodally. Extensive experiments\ndemonstrate that the SMoLA approach helps improve the generalist performance\nacross a broad range of generative vision-and-language tasks, achieving new\nSoTA generalist performance that often matches or outperforms single\nspecialized LMM baselines, as well as new SoTA specialist performance.\n","authors":["Jialin Wu","Xia Hu","Yaqing Wang","Bo Pang","Radu Soricut"],"pdf_url":"https://arxiv.org/pdf/2312.00968v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02263v1","updated":"2024-04-02T19:37:58Z","published":"2024-04-02T19:37:58Z","title":"OFMPNet: Deep End-to-End Model for Occupancy and Flow Prediction in\n Urban Environment","summary":" The task of motion prediction is pivotal for autonomous driving systems,\nproviding crucial data to choose a vehicle behavior strategy within its\nsurroundings. Existing motion prediction techniques primarily focus on\npredicting the future trajectory of each agent in the scene individually,\nutilizing its past trajectory data. In this paper, we introduce an end-to-end\nneural network methodology designed to predict the future behaviors of all\ndynamic objects in the environment. This approach leverages the occupancy map\nand the scene's motion flow. We are investigatin various alternatives for\nconstructing a deep encoder-decoder model called OFMPNet. This model uses a\nsequence of bird's-eye-view road images, occupancy grid, and prior motion flow\nas input data. The encoder of the model can incorporate transformer,\nattention-based, or convolutional units. The decoder considers the use of both\nconvolutional modules and recurrent blocks. 
Additionally, we propose a novel\ntime-weighted motion flow loss, whose application has shown a substantial\ndecrease in end-point error. Our approach has achieved state-of-the-art results\non the Waymo Occupancy and Flow Prediction benchmark, with a Soft IoU of 52.1%\nand an AUC of 76.75% on Flow-Grounded Occupancy.\n","authors":["Youshaa Murhij","Dmitry Yudin"],"pdf_url":"https://arxiv.org/pdf/2404.02263v1.pdf","comment":"Accepted in Neurocomputing journal - 2024"},{"id":"http://arxiv.org/abs/2404.02257v1","updated":"2024-04-02T19:25:04Z","published":"2024-04-02T19:25:04Z","title":"SnAG: Scalable and Accurate Video Grounding","summary":" Temporal grounding of text descriptions in videos is a central problem in\nvision-language learning and video understanding. Existing methods often\nprioritize accuracy over scalability -- they have been optimized for grounding\nonly a few text queries within short videos, and fail to scale up to long\nvideos with hundreds of queries. In this paper, we study the effect of\ncross-modal fusion on the scalability of video grounding models. Our analysis\nestablishes late fusion as a more cost-effective fusion scheme for long-form\nvideos with many text queries. Moreover, it leads us to a novel, video-centric\nsampling scheme for efficient training. Based on these findings, we present\nSnAG, a simple baseline for scalable and accurate video grounding. Without\nbells and whistles, SnAG is 43% more accurate and 1.5x faster than CONE, a\nstate of the art for long-form video grounding on the challenging MAD dataset,\nwhile achieving highly competitive results on short videos.\n","authors":["Fangzhou Mu","Sicheng Mo","Yin Li"],"pdf_url":"https://arxiv.org/pdf/2404.02257v1.pdf","comment":"Accepted to CVPR 2024. Code available at\n https://github.com/fmu2/snag_release"},{"id":"http://arxiv.org/abs/2309.07096v4","updated":"2024-04-02T19:12:46Z","published":"2023-08-23T12:37:13Z","title":"Computational limits to the legibility of the imaged human brain","summary":" Our knowledge of the organisation of the human brain at the population-level\nis yet to translate into power to predict functional differences at the\nindividual-level, limiting clinical applications, and casting doubt on the\ngeneralisability of inferred mechanisms. It remains unknown whether the\ndifficulty arises from the absence of individuating biological patterns within\nthe brain, or from limited power to access them with the models and compute at\nour disposal. Here we comprehensively investigate the resolvability of such\npatterns with data and compute at unprecedented scale. Across 23 810 unique\nparticipants from UK Biobank, we systematically evaluate the predictability of\n25 individual biological characteristics, from all available combinations of\nstructural and functional neuroimaging data. Over 4526 GPU hours of\ncomputation, we train, optimize, and evaluate out-of-sample 700 individual\npredictive models, including fully-connected feed-forward neural networks of\ndemographic, psychological, serological, chronic disease, and functional\nconnectivity characteristics, and both uni- and multi-modal 3D convolutional\nneural network models of macro- and micro-structural brain imaging. 
We find a\nmarked discrepancy between the high predictability of sex (balanced accuracy\n99.7%), age (mean absolute error 2.048 years, R2 0.859), and weight (mean\nabsolute error 2.609Kg, R2 0.625), for which we set new state-of-the-art\nperformance, and the surprisingly low predictability of other characteristics.\nNeither structural nor functional imaging predicted psychology better than the\ncoincidence of chronic disease (p<0.05). Serology predicted chronic disease\n(p<0.05) and was best predicted by it (p<0.001), followed by structural\nneuroimaging (p<0.05). Our findings suggest either more informative imaging or\nmore powerful models are needed to decipher individual level characteristics\nfrom the human brain.\n","authors":["James K Ruffle","Robert J Gray","Samia Mohinta","Guilherme Pombo","Chaitanya Kaul","Harpreet Hyare","Geraint Rees","Parashkev Nachev"],"pdf_url":"https://arxiv.org/pdf/2309.07096v4.pdf","comment":"38 pages, 6 figures, 1 table, 2 supplementary figures, 1\n supplementary table"},{"id":"http://arxiv.org/abs/2311.17024v2","updated":"2024-04-02T19:11:35Z","published":"2023-11-28T18:27:15Z","title":"Diffusion 3D Features (Diff3F): Decorating Untextured Shapes with\n Distilled Semantic Features","summary":" We present Diff3F as a simple, robust, and class-agnostic feature descriptor\nthat can be computed for untextured input shapes (meshes or point clouds). Our\nmethod distills diffusion features from image foundational models onto input\nshapes. Specifically, we use the input shapes to produce depth and normal maps\nas guidance for conditional image synthesis. In the process, we produce\n(diffusion) features in 2D that we subsequently lift and aggregate on the\noriginal surface. Our key observation is that even if the conditional image\ngenerations obtained from multi-view rendering of the input shapes are\ninconsistent, the associated image features are robust and, hence, can be\ndirectly aggregated across views. This produces semantic features on the input\nshapes, without requiring additional data or training. We perform extensive\nexperiments on multiple benchmarks (SHREC'19, SHREC'20, FAUST, and TOSCA) and\ndemonstrate that our features, being semantic instead of geometric, produce\nreliable correspondence across both isometric and non-isometrically related\nshape families. Code is available via the project page at\nhttps://diff3f.github.io/\n","authors":["Niladri Shekhar Dutt","Sanjeev Muralikrishnan","Niloy J. Mitra"],"pdf_url":"https://arxiv.org/pdf/2311.17024v2.pdf","comment":"Accepted at CVPR'24"},{"id":"http://arxiv.org/abs/2404.02242v1","updated":"2024-04-02T19:03:39Z","published":"2024-04-02T19:03:39Z","title":"Towards Robust 3D Pose Transfer with Adversarial Learning","summary":" 3D pose transfer that aims to transfer the desired pose to a target mesh is\none of the most challenging 3D generation tasks. Previous attempts rely on\nwell-defined parametric human models or skeletal joints as driving pose\nsources. However, to obtain those clean pose sources, cumbersome but necessary\npre-processing pipelines are inevitable, hindering implementations of the\nreal-time applications. This work is driven by the intuition that the\nrobustness of the model can be enhanced by introducing adversarial samples into\nthe training, leading to a more invulnerable model to the noisy inputs, which\neven can be further extended to directly handling the real-world data like raw\npoint clouds/scans without intermediate processing. 
Furthermore, we propose a\nnovel 3D pose Masked Autoencoder (3D-PoseMAE), a customized MAE that\neffectively learns 3D extrinsic presentations (i.e., pose). 3D-PoseMAE\nfacilitates learning from the aspect of extrinsic attributes by simultaneously\ngenerating adversarial samples that perturb the model and learning the\narbitrary raw noisy poses via a multi-scale masking strategy. Both qualitative\nand quantitative studies show that the transferred meshes given by our network\nresult in much better quality. Besides, we demonstrate the strong\ngeneralizability of our method on various poses, different domains, and even\nraw scans. Experimental results also show meaningful insights that the\nintermediate adversarial samples generated in the training can successfully\nattack the existing pose transfer models.\n","authors":["Haoyu Chen","Hao Tang","Ehsan Adeli","Guoying Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.02242v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02241v1","updated":"2024-04-02T18:59:39Z","published":"2024-04-02T18:59:39Z","title":"Linear Combination of Saved Checkpoints Makes Consistency and Diffusion\n Models Better","summary":" Diffusion Models (DM) and Consistency Models (CM) are two types of popular\ngenerative models with good generation quality on various tasks. When training\nDM and CM, intermediate weight checkpoints are not fully utilized and only the\nlast converged checkpoint is used. In this work, we find that high-quality\nmodel weights often lie in a basin which cannot be reached by SGD but can be\nobtained by proper checkpoint averaging. Based on these observations, we\npropose LCSC, a simple but effective and efficient method to enhance the\nperformance of DM and CM, by combining checkpoints along the training\ntrajectory with coefficients deduced from evolutionary search. We demonstrate\nthe value of LCSC through two use cases: $\\textbf{(a) Reducing training cost.}$\nWith LCSC, we only need to train DM/CM with fewer number of iterations and/or\nlower batch sizes to obtain comparable sample quality with the fully trained\nmodel. For example, LCSC achieves considerable training speedups for CM\n(23$\\times$ on CIFAR-10 and 15$\\times$ on ImageNet-64). $\\textbf{(b) Enhancing\npre-trained models.}$ Assuming full training is already done, LCSC can further\nimprove the generation quality or speed of the final converged models. For\nexample, LCSC achieves better performance using 1 number of function evaluation\n(NFE) than the base model with 2 NFE on consistency distillation, and decreases\nthe NFE of DM from 15 to 9 while maintaining the generation quality on\nCIFAR-10. Our code is available at\nhttps://github.com/imagination-research/LCSC.\n","authors":["Enshu Liu","Junyi Zhu","Zinan Lin","Xuefei Ning","Matthew B. Blaschko","Sergey Yekhanin","Shengen Yan","Guohao Dai","Huazhong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14516v2","updated":"2024-04-02T18:57:12Z","published":"2023-09-25T20:22:47Z","title":"UniBEV: Multi-modal 3D Object Detection with Uniform BEV Encoders for\n Robustness against Missing Sensor Modalities","summary":" Multi-sensor object detection is an active research topic in automated\ndriving, but the robustness of such detection models against missing sensor\ninput (modality missing), e.g., due to a sudden sensor failure, is a critical\nproblem which remains under-studied. 
In this work, we propose UniBEV, an\nend-to-end multi-modal 3D object detection framework designed for robustness\nagainst missing modalities: UniBEV can operate on LiDAR plus camera input, but\nalso on LiDAR-only or camera-only input without retraining. To facilitate its\ndetector head to handle different input combinations, UniBEV aims to create\nwell-aligned Bird's Eye View (BEV) feature maps from each available modality.\nUnlike prior BEV-based multi-modal detection methods, all sensor modalities\nfollow a uniform approach to resample features from the native sensor\ncoordinate systems to the BEV features. We furthermore investigate the\nrobustness of various fusion strategies w.r.t. missing modalities: the commonly\nused feature concatenation, but also channel-wise averaging, and a\ngeneralization to weighted averaging termed Channel Normalized Weights. To\nvalidate its effectiveness, we compare UniBEV to state-of-the-art BEVFusion and\nMetaBEV on nuScenes over all sensor input combinations. In this setting, UniBEV\nachieves $52.5 \\%$ mAP on average over all input combinations, significantly\nimproving over the baselines ($43.5 \\%$ mAP on average for BEVFusion, $48.7 \\%$\nmAP on average for MetaBEV). An ablation study shows the robustness benefits of\nfusing by weighted averaging over regular concatenation, and of sharing queries\nbetween the BEV encoders of each modality. Our code will be released upon paper\nacceptance.\n","authors":["Shiming Wang","Holger Caesar","Liangliang Nan","Julian F. P. Kooij"],"pdf_url":"https://arxiv.org/pdf/2309.14516v2.pdf","comment":"Accepted by IEEE Intelligent Vehicles Symposium (IV 2024)"},{"id":"http://arxiv.org/abs/2401.10831v2","updated":"2024-04-02T18:54:50Z","published":"2024-01-19T17:27:21Z","title":"Understanding Video Transformers via Universal Concept Discovery","summary":" This paper studies the problem of concept-based interpretability of\ntransformer representations for videos. Concretely, we seek to explain the\ndecision-making process of video transformers based on high-level,\nspatiotemporal concepts that are automatically discovered. Prior research on\nconcept-based interpretability has concentrated solely on image-level tasks.\nComparatively, video models deal with the added temporal dimension, increasing\ncomplexity and posing challenges in identifying dynamic concepts over time. In\nthis work, we systematically address these challenges by introducing the first\nVideo Transformer Concept Discovery (VTCD) algorithm. To this end, we propose\nan efficient approach for unsupervised identification of units of video\ntransformer representations - concepts, and ranking their importance to the\noutput of a model. The resulting concepts are highly interpretable, revealing\nspatio-temporal reasoning mechanisms and object-centric representations in\nunstructured video models. Performing this analysis jointly over a diverse set\nof supervised and self-supervised representations, we discover that some of\nthese mechanism are universal in video transformers. Finally, we show that VTCD\ncan be used for fine-grained action recognition and video object segmentation.\n","authors":["Matthew Kowal","Achal Dave","Rares Ambrus","Adrien Gaidon","Konstantinos G. 
Derpanis","Pavel Tokmakov"],"pdf_url":"https://arxiv.org/pdf/2401.10831v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02233v1","updated":"2024-04-02T18:40:55Z","published":"2024-04-02T18:40:55Z","title":"Visual Concept Connectome (VCC): Open World Concept Discovery and their\n Interlayer Connections in Deep Models","summary":" Understanding what deep network models capture in their learned\nrepresentations is a fundamental challenge in computer vision. We present a new\nmethodology to understanding such vision models, the Visual Concept Connectome\n(VCC), which discovers human interpretable concepts and their interlayer\nconnections in a fully unsupervised manner. Our approach simultaneously reveals\nfine-grained concepts at a layer, connection weightings across all layers and\nis amendable to global analysis of network structure (e.g., branching pattern\nof hierarchical concept assemblies). Previous work yielded ways to extract\ninterpretable concepts from single layers and examine their impact on\nclassification, but did not afford multilayer concept analysis across an entire\nnetwork architecture. Quantitative and qualitative empirical results show the\neffectiveness of VCCs in the domain of image classification. Also, we leverage\nVCCs for the application of failure mode debugging to reveal where mistakes\narise in deep networks.\n","authors":["Matthew Kowal","Richard P. Wildes","Konstantinos G. Derpanis"],"pdf_url":"https://arxiv.org/pdf/2404.02233v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02227v1","updated":"2024-04-02T18:30:29Z","published":"2024-04-02T18:30:29Z","title":"OOSTraj: Out-of-Sight Trajectory Prediction With Vision-Positioning\n Denoising","summary":" Trajectory prediction is fundamental in computer vision and autonomous\ndriving, particularly for understanding pedestrian behavior and enabling\nproactive decision-making. Existing approaches in this field often assume\nprecise and complete observational data, neglecting the challenges associated\nwith out-of-view objects and the noise inherent in sensor data due to limited\ncamera range, physical obstructions, and the absence of ground truth for\ndenoised sensor data. Such oversights are critical safety concerns, as they can\nresult in missing essential, non-visible objects. To bridge this gap, we\npresent a novel method for out-of-sight trajectory prediction that leverages a\nvision-positioning technique. Our approach denoises noisy sensor observations\nin an unsupervised manner and precisely maps sensor-based trajectories of\nout-of-sight objects into visual trajectories. This method has demonstrated\nstate-of-the-art performance in out-of-sight noisy sensor trajectory denoising\nand prediction on the Vi-Fi and JRDB datasets. By enhancing trajectory\nprediction accuracy and addressing the challenges of out-of-sight objects, our\nwork significantly contributes to improving the safety and reliability of\nautonomous driving in complex environments. Our work represents the first\ninitiative towards Out-Of-Sight Trajectory prediction (OOSTraj), setting a new\nbenchmark for future research. 
The code is available at\n\\url{https://github.com/Hai-chao-Zhang/OOSTraj}.\n","authors":["Haichao Zhang","Yi Xu","Hongsheng Lu","Takayuki Shimizu","Yun Fu"],"pdf_url":"https://arxiv.org/pdf/2404.02227v1.pdf","comment":"In Proceedings of IEEE/CVF Conference on Computer Vision and Pattern\n Recognition 2024 (CVPR)"},{"id":"http://arxiv.org/abs/2404.02225v1","updated":"2024-04-02T18:27:03Z","published":"2024-04-02T18:27:03Z","title":"CHOSEN: Contrastive Hypothesis Selection for Multi-View Depth Refinement","summary":" We propose CHOSEN, a simple yet flexible, robust and effective multi-view\ndepth refinement framework. It can be employed in any existing multi-view\nstereo pipeline, with straightforward generalization capability for different\nmulti-view capture systems such as camera relative positioning and lenses.\nGiven an initial depth estimation, CHOSEN iteratively re-samples and selects\nthe best hypotheses, and automatically adapts to different metric or intrinsic\nscales determined by the capture system. The key to our approach is the\napplication of contrastive learning in an appropriate solution space and a\ncarefully designed hypothesis feature, based on which positive and negative\nhypotheses can be effectively distinguished. Integrated in a simple baseline\nmulti-view stereo pipeline, CHOSEN delivers impressive quality in terms of\ndepth and normal accuracy compared to many current deep learning based\nmulti-view stereo pipelines.\n","authors":["Di Qiu","Yinda Zhang","Thabo Beeler","Vladimir Tankovich","Christian Häne","Sean Fanello","Christoph Rhemann","Sergio Orts Escolano"],"pdf_url":"https://arxiv.org/pdf/2404.02225v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03835v2","updated":"2024-04-02T18:26:12Z","published":"2024-01-08T11:46:45Z","title":"Limitations of Data-Driven Spectral Reconstruction -- Optics-Aware\n Analysis and Mitigation","summary":" Hyperspectral imaging empowers machine vision systems with the distinct\ncapability of identifying materials through recording their spectral\nsignatures. Recent efforts in data-driven spectral reconstruction aim at\nextracting spectral information from RGB images captured by cost-effective RGB\ncameras, instead of dedicated hardware.\n In this paper we systematically analyze the performance of such methods,\nevaluating both the practical limitations with respect to current datasets and\noverfitting, as well as fundamental limitations with respect to the nature of\nthe information encoded in the RGB images, and the dependency of this\ninformation on the optical system of the camera.\n We find that, the current models are not robust under slight variations,\ne.g., in noise level or compression of the RGB file. Without modeling\nunderrepresented spectral content, existing datasets and the models trained on\nthem are limited in their ability to cope with challenging metameric colors. 
To\nmitigate this issue, we propose to exploit the combination of metameric data\naugmentation and optical lens aberrations to improve the encoding of the\nmetameric information into the RGB image, which paves the road towards higher\nperforming spectral imaging and reconstruction approaches.\n","authors":["Qiang Fu","Matheus Souza","Eunsue Choi","Suhyun Shin","Seung-Hwan Baek","Wolfgang Heidrich"],"pdf_url":"https://arxiv.org/pdf/2401.03835v2.pdf","comment":"12 pages, 7 figures, 8 tables"},{"id":"http://arxiv.org/abs/2312.03587v2","updated":"2024-04-02T18:14:35Z","published":"2023-12-06T16:24:47Z","title":"Language-Informed Visual Concept Learning","summary":" Our understanding of the visual world is centered around various concept\naxes, characterizing different aspects of visual entities. While different\nconcept axes can be easily specified by language, e.g. color, the exact visual\nnuances along each axis often exceed the limitations of linguistic\narticulations, e.g. a particular style of painting. In this work, our goal is\nto learn a language-informed visual concept representation, by simply\ndistilling large pre-trained vision-language models. Specifically, we train a\nset of concept encoders to encode the information pertinent to a set of\nlanguage-informed concept axes, with an objective of reproducing the input\nimage through a pre-trained Text-to-Image (T2I) model. To encourage better\ndisentanglement of different concept encoders, we anchor the concept embeddings\nto a set of text embeddings obtained from a pre-trained Visual Question\nAnswering (VQA) model. At inference time, the model extracts concept embeddings\nalong various axes from new test images, which can be remixed to generate\nimages with novel compositions of visual concepts. With a lightweight test-time\nfinetuning procedure, it can also generalize to novel concepts unseen at\ntraining.\n","authors":["Sharon Lee","Yunzhi Zhang","Shangzhe Wu","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2312.03587v2.pdf","comment":"ICLR 2024. The first two authors contributed equally and are\n alphabetically ordered. Project page:\n https://ai.stanford.edu/~yzzhang/projects/concept-axes/"},{"id":"http://arxiv.org/abs/2312.12433v3","updated":"2024-04-02T18:09:22Z","published":"2023-12-19T18:58:40Z","title":"TAO-Amodal: A Benchmark for Tracking Any Object Amodally","summary":" Amodal perception, the ability to comprehend complete object structures from\npartial visibility, is a fundamental skill, even for infants. Its significance\nextends to applications like autonomous driving, where a clear understanding of\nheavily occluded objects is essential. However, modern detection and tracking\nalgorithms often overlook this critical capability, perhaps due to the\nprevalence of \\textit{modal} annotations in most benchmarks. To address the\nscarcity of amodal benchmarks, we introduce TAO-Amodal, featuring 833 diverse\ncategories in thousands of video sequences. Our dataset includes\n\\textit{amodal} and modal bounding boxes for visible and partially or fully\noccluded objects, including those that are partially out of the camera frame.\nWe investigate the current lay of the land in both amodal tracking and\ndetection by benchmarking state-of-the-art modal trackers and amodal\nsegmentation methods. We find that existing methods, even when adapted for\namodal tracking, struggle to detect and track objects under heavy occlusion. 
To\nmitigate this, we explore simple finetuning schemes that can increase the\namodal tracking and detection metrics of occluded objects by 2.1\\% and 3.3\\%.\n","authors":["Cheng-Yen Hsieh","Kaihua Chen","Achal Dave","Tarasha Khurana","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2312.12433v3.pdf","comment":"Project Page: https://tao-amodal.github.io"},{"id":"http://arxiv.org/abs/2311.18832v2","updated":"2024-04-02T17:59:33Z","published":"2023-11-30T18:59:44Z","title":"Exploiting Diffusion Prior for Generalizable Dense Prediction","summary":" Contents generated by recent advanced Text-to-Image (T2I) diffusion models\nare sometimes too imaginative for existing off-the-shelf dense predictors to\nestimate due to the immitigable domain gap. We introduce DMP, a pipeline\nutilizing pre-trained T2I models as a prior for dense prediction tasks. To\naddress the misalignment between deterministic prediction tasks and stochastic\nT2I models, we reformulate the diffusion process through a sequence of\ninterpolations, establishing a deterministic mapping between input RGB images\nand output prediction distributions. To preserve generalizability, we use\nlow-rank adaptation to fine-tune pre-trained models. Extensive experiments\nacross five tasks, including 3D property estimation, semantic segmentation, and\nintrinsic image decomposition, showcase the efficacy of the proposed method.\nDespite limited-domain training data, the approach yields faithful estimations\nfor arbitrary images, surpassing existing state-of-the-art algorithms.\n","authors":["Hsin-Ying Lee","Hung-Yu Tseng","Hsin-Ying Lee","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2311.18832v2.pdf","comment":"To appear in CVPR 2024. Project page: https://shinying.github.io/dmp"},{"id":"http://arxiv.org/abs/2404.02189v1","updated":"2024-04-02T16:48:34Z","published":"2024-04-02T16:48:34Z","title":"Insights from the Use of Previously Unseen Neural Architecture Search\n Datasets","summary":" The boundless possibility of neural networks which can be used to solve a\nproblem -- each with different performance -- leads to a situation where a Deep\nLearning expert is required to identify the best neural network. This goes\nagainst the hope of removing the need for experts. Neural Architecture Search\n(NAS) offers a solution to this by automatically identifying the best\narchitecture. However, to date, NAS work has focused on a small set of datasets\nwhich we argue are not representative of real-world problems. We introduce\neight new datasets created for a series of NAS Challenges: AddNIST, Language,\nMultNIST, CIFARTile, Gutenberg, Isabella, GeoClassing, and Chesseract. These\ndatasets and challenges are developed to direct attention to issues in NAS\ndevelopment and to encourage authors to consider how their models will perform\non datasets unknown to them at development time. We present experimentation\nusing standard Deep Learning methods as well as the best results from challenge\nparticipants.\n","authors":["Rob Geada","David Towers","Matthew Forshaw","Amir Atapour-Abarghouei","A. Stephen McGough"],"pdf_url":"https://arxiv.org/pdf/2404.02189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02185v1","updated":"2024-04-02T15:49:00Z","published":"2024-04-02T15:49:00Z","title":"NeRFCodec: Neural Feature Compression Meets Neural Radiance Fields for\n Memory-Efficient Scene Representation","summary":" The emergence of Neural Radiance Fields (NeRF) has greatly impacted 3D scene\nmodeling and novel-view synthesis. 
As a kind of visual media for 3D scene\nrepresentation, compression with high rate-distortion performance is an eternal\ntarget. Motivated by advances in neural compression and neural field\nrepresentation, we propose NeRFCodec, an end-to-end NeRF compression framework\nthat integrates non-linear transform, quantization, and entropy coding for\nmemory-efficient scene representation. Since training a non-linear transform\ndirectly on a large scale of NeRF feature planes is impractical, we discover\nthat pre-trained neural 2D image codec can be utilized for compressing the\nfeatures when adding content-specific parameters. Specifically, we reuse neural\n2D image codec but modify its encoder and decoder heads, while keeping the\nother parts of the pre-trained decoder frozen. This allows us to train the full\npipeline via supervision of rendering loss and entropy loss, yielding the\nrate-distortion balance by updating the content-specific parameters. At test\ntime, the bitstreams containing latent code, feature decoder head, and other\nside information are transmitted for communication. Experimental results\ndemonstrate our method outperforms existing NeRF compression methods, enabling\nhigh-quality novel view synthesis with a memory budget of 0.5 MB.\n","authors":["Sicheng Li","Hao Li","Yiyi Liao","Lu Yu"],"pdf_url":"https://arxiv.org/pdf/2404.02185v1.pdf","comment":"Accepted at CVPR2024. The source code will be released"},{"id":"http://arxiv.org/abs/2307.11957v5","updated":"2024-04-02T12:25:10Z","published":"2023-07-22T01:56:58Z","title":"High-performance real-world optical computing trained by in situ\n model-free optimization","summary":" Optical computing systems provide high-speed and low-energy data processing\nbut face deficiencies in computationally demanding training and\nsimulation-to-reality gaps. We propose a gradient-based model-free optimization\n(G-MFO) method based on a Monte Carlo gradient estimation algorithm for\ncomputationally efficient in situ training of optical computing systems. This\napproach treats an optical computing system as a black box and back-propagates\nthe loss directly to the optical computing weights' probability distributions,\ncircumventing the need for a computationally heavy and biased system\nsimulation. Our experiments on diffractive optical computing systems show that\nG-MFO outperforms hybrid training on the MNIST and FMNIST datasets.\nFurthermore, we demonstrate image-free and high-speed classification of cells\nfrom their marker-free phase maps. Our method's model-free and high-performance\nnature, combined with its low demand for computational resources, paves the way\nfor accelerating the transition of optical computing from laboratory\ndemonstrations to practical, real-world applications.\n","authors":["Guangyuan Zhao","Xin Shu","Renjie Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.11957v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01878v1","updated":"2024-04-02T12:08:26Z","published":"2024-04-02T12:08:26Z","title":"Real, fake and synthetic faces -- does the coin have three sides?","summary":" With the ever-growing power of generative artificial intelligence, deepfake\nand artificially generated (synthetic) media have continued to spread online,\nwhich creates various ethical and moral concerns regarding their usage. To\ntackle this, we thus present a novel exploration of the trends and patterns\nobserved in real, deepfake and synthetic facial images. 
The proposed analysis\nis done in two parts: firstly, we incorporate eight deep learning models and\nanalyze their performances in distinguishing between the three classes of\nimages. Next, we look to further delve into the similarities and differences\nbetween these three sets of images by investigating their image properties both\nin the context of the entire image as well as in the context of specific\nregions within the image. ANOVA test was also performed and provided further\nclarity amongst the patterns associated between the images of the three\nclasses. From our findings, we observe that the investigated deeplearning\nmodels found it easier to detect synthetic facial images, with the ViT Patch-16\nmodel performing best on this task with a class-averaged sensitivity,\nspecificity, precision, and accuracy of 97.37%, 98.69%, 97.48%, and 98.25%,\nrespectively. This observation was supported by further analysis of various\nimage properties. We saw noticeable differences across the three category of\nimages. This analysis can help us build better algorithms for facial image\ngeneration, and also shows that synthetic, deepfake and real face images are\nindeed three different classes.\n","authors":["Shahzeb Naeem","Ramzi Al-Sharawi","Muhammad Riyyan Khan","Usman Tariq","Abhinav Dhall","Hasan Al-Nashash"],"pdf_url":"https://arxiv.org/pdf/2404.01878v1.pdf","comment":null}]},"2024-04-03T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.02905v1","updated":"2024-04-03T17:59:53Z","published":"2024-04-03T17:59:53Z","title":"Visual Autoregressive Modeling: Scalable Image Generation via Next-Scale\n Prediction","summary":" We present Visual AutoRegressive modeling (VAR), a new generation paradigm\nthat redefines the autoregressive learning on images as coarse-to-fine\n\"next-scale prediction\" or \"next-resolution prediction\", diverging from the\nstandard raster-scan \"next-token prediction\". This simple, intuitive\nmethodology allows autoregressive (AR) transformers to learn visual\ndistributions fast and generalize well: VAR, for the first time, makes AR\nmodels surpass diffusion transformers in image generation. On ImageNet 256x256\nbenchmark, VAR significantly improve AR baseline by improving Frechet inception\ndistance (FID) from 18.65 to 1.80, inception score (IS) from 80.4 to 356.4,\nwith around 20x faster inference speed. It is also empirically verified that\nVAR outperforms the Diffusion Transformer (DiT) in multiple dimensions\nincluding image quality, inference speed, data efficiency, and scalability.\nScaling up VAR models exhibits clear power-law scaling laws similar to those\nobserved in LLMs, with linear correlation coefficients near -0.998 as solid\nevidence. VAR further showcases zero-shot generalization ability in downstream\ntasks including image in-painting, out-painting, and editing. These results\nsuggest VAR has initially emulated the two important properties of LLMs:\nScaling Laws and zero-shot task generalization. 
We have released all models and\ncodes to promote the exploration of AR/VAR models for visual generation and\nunified learning.\n","authors":["Keyu Tian","Yi Jiang","Zehuan Yuan","Bingyue Peng","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02904v1","updated":"2024-04-03T17:59:36Z","published":"2024-04-03T17:59:36Z","title":"ALOHa: A New Measure for Hallucination in Captioning Models","summary":" Despite recent advances in multimodal pre-training for visual description,\nstate-of-the-art models still produce captions containing errors, such as\nhallucinating objects not present in a scene. The existing prominent metric for\nobject hallucination, CHAIR, is limited to a fixed set of MS COCO objects and\nsynonyms. In this work, we propose a modernized open-vocabulary metric, ALOHa,\nwhich leverages large language models (LLMs) to measure object hallucinations.\nSpecifically, we use an LLM to extract groundable objects from a candidate\ncaption, measure their semantic similarity to reference objects from captions\nand object detections, and use Hungarian matching to produce a final\nhallucination score. We show that ALOHa correctly identifies 13.6% more\nhallucinated objects than CHAIR on HAT, a new gold-standard subset of MS COCO\nCaptions annotated for hallucinations, and 30.8% more on nocaps, where objects\nextend beyond MS COCO categories. Our code is available at\nhttps://davidmchan.github.io/aloha/.\n","authors":["Suzanne Petryk","David M. Chan","Anish Kachinthaya","Haodi Zou","John Canny","Joseph E. Gonzalez","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2404.02904v1.pdf","comment":"To appear at NAACL 2024"},{"id":"http://arxiv.org/abs/2404.02903v1","updated":"2024-04-03T17:59:28Z","published":"2024-04-03T17:59:28Z","title":"LidarDM: Generative LiDAR Simulation in a Generated World","summary":" We present LidarDM, a novel LiDAR generative model capable of producing\nrealistic, layout-aware, physically plausible, and temporally coherent LiDAR\nvideos. LidarDM stands out with two unprecedented capabilities in LiDAR\ngenerative modeling: (i) LiDAR generation guided by driving scenarios, offering\nsignificant potential for autonomous driving simulations, and (ii) 4D LiDAR\npoint cloud generation, enabling the creation of realistic and temporally\ncoherent sequences. At the heart of our model is a novel integrated 4D world\ngeneration framework. Specifically, we employ latent diffusion models to\ngenerate the 3D scene, combine it with dynamic actors to form the underlying 4D\nworld, and subsequently produce realistic sensory observations within this\nvirtual environment. Our experiments indicate that our approach outperforms\ncompeting algorithms in realism, temporal coherency, and layout consistency. We\nadditionally show that LidarDM can be used as a generative world model\nsimulator for training and testing perception models.\n","authors":["Vlas Zyrianov","Henry Che","Zhijian Liu","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02900v1","updated":"2024-04-03T17:58:21Z","published":"2024-04-03T17:58:21Z","title":"DeiT-LT Distillation Strikes Back for Vision Transformer Training on\n Long-Tailed Datasets","summary":" Vision Transformer (ViT) has emerged as a prominent architecture for various\ncomputer vision tasks. In ViT, we divide the input image into patch tokens and\nprocess them through a stack of self attention blocks. 
However, unlike\nConvolutional Neural Networks (CNN), ViTs simple architecture has no\ninformative inductive bias (e.g., locality,etc. ). Due to this, ViT requires a\nlarge amount of data for pre-training. Various data efficient approaches (DeiT)\nhave been proposed to train ViT on balanced datasets effectively. However,\nlimited literature discusses the use of ViT for datasets with long-tailed\nimbalances. In this work, we introduce DeiT-LT to tackle the problem of\ntraining ViTs from scratch on long-tailed datasets. In DeiT-LT, we introduce an\nefficient and effective way of distillation from CNN via distillation DIST\ntoken by using out-of-distribution images and re-weighting the distillation\nloss to enhance focus on tail classes. This leads to the learning of local\nCNN-like features in early ViT blocks, improving generalization for tail\nclasses. Further, to mitigate overfitting, we propose distilling from a flat\nCNN teacher, which leads to learning low-rank generalizable features for DIST\ntokens across all ViT blocks. With the proposed DeiT-LT scheme, the\ndistillation DIST token becomes an expert on the tail classes, and the\nclassifier CLS token becomes an expert on the head classes. The experts help to\neffectively learn features corresponding to both the majority and minority\nclasses using a distinct set of tokens within the same ViT architecture. We\nshow the effectiveness of DeiT-LT for training ViT from scratch on datasets\nranging from small-scale CIFAR-10 LT to large-scale iNaturalist-2018.\n","authors":["Harsh Rangwani","Pradipto Mondal","Mayank Mishra","Ashish Ramayee Asokan","R. Venkatesh Babu"],"pdf_url":"https://arxiv.org/pdf/2404.02900v1.pdf","comment":"CVPR 2024. Project Page: https://rangwani-harsh.github.io/DeiT-LT"},{"id":"http://arxiv.org/abs/2312.00947v2","updated":"2024-04-03T17:58:13Z","published":"2023-12-01T22:00:14Z","title":"FreeZe: Training-free zero-shot 6D pose estimation with geometric and\n vision foundation models","summary":" Estimating the 6D pose of objects unseen during training is highly desirable\nyet challenging. Zero-shot object 6D pose estimation methods address this\nchallenge by leveraging additional task-specific supervision provided by\nlarge-scale, photo-realistic synthetic datasets. However, their performance\nheavily depends on the quality and diversity of rendered data and they require\nextensive training. In this work, we show how to tackle the same task but\nwithout training on specific data. We propose FreeZe, a novel solution that\nharnesses the capabilities of pre-trained geometric and vision foundation\nmodels. FreeZe leverages 3D geometric descriptors learned from unrelated 3D\npoint clouds and 2D visual features learned from web-scale 2D images to\ngenerate discriminative 3D point-level descriptors. We then estimate the 6D\npose of unseen objects by 3D registration based on RANSAC. We also introduce a\nnovel algorithm to solve ambiguous cases due to geometrically symmetric objects\nthat is based on visual features. We comprehensively evaluate FreeZe across the\nseven core datasets of the BOP Benchmark, which include over a hundred 3D\nobjects and 20,000 images captured in various scenarios. FreeZe consistently\noutperforms all state-of-the-art approaches, including competitors extensively\ntrained on synthetic 6D pose estimation data. 
Code will be publicly available\nat https://andreacaraffa.github.io/freeze.\n","authors":["Andrea Caraffa","Davide Boscaini","Amir Hamza","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2312.00947v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02899v1","updated":"2024-04-03T17:57:15Z","published":"2024-04-03T17:57:15Z","title":"MatAtlas: Text-driven Consistent Geometry Texturing and Material\n Assignment","summary":" We present MatAtlas, a method for consistent text-guided 3D model texturing.\nFollowing recent progress we leverage a large scale text-to-image generation\nmodel (e.g., Stable Diffusion) as a prior to texture a 3D model. We carefully\ndesign an RGB texturing pipeline that leverages a grid pattern diffusion,\ndriven by depth and edges. By proposing a multi-step texture refinement\nprocess, we significantly improve the quality and 3D consistency of the\ntexturing output. To further address the problem of baked-in lighting, we move\nbeyond RGB colors and pursue assigning parametric materials to the assets.\nGiven the high-quality initial RGB texture, we propose a novel material\nretrieval method capitalized on Large Language Models (LLM), enabling\neditabiliy and relightability. We evaluate our method on a wide variety of\ngeometries and show that our method significantly outperform prior arts. We\nalso analyze the role of each component through a detailed ablation study.\n","authors":["Duygu Ceylan","Valentin Deschaintre","Thibault Groueix","Rosalie Martin","Chun-Hao Huang","Romain Rouffet","Vladimir Kim","Gaëtan Lassagne"],"pdf_url":"https://arxiv.org/pdf/2404.02899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02897v1","updated":"2024-04-03T17:54:37Z","published":"2024-04-03T17:54:37Z","title":"Deep Image Composition Meets Image Forgery","summary":" Image forgery is a topic that has been studied for many years. Before the\nbreakthrough of deep learning, forged images were detected using handcrafted\nfeatures that did not require training. These traditional methods failed to\nperform satisfactorily even on datasets much worse in quality than real-life\nimage manipulations. Advances in deep learning have impacted image forgery\ndetection as much as they have impacted other areas of computer vision and have\nimproved the state of the art. Deep learning models require large amounts of\nlabeled data for training. In the case of image forgery, labeled data at the\npixel level is a very important factor for the models to learn. None of the\nexisting datasets have sufficient size, realism and pixel-level labeling at the\nsame time. This is due to the high cost of producing and labeling quality\nimages. It can take hours for an image editing expert to manipulate just one\nimage. To bridge this gap, we automate data generation using image composition\ntechniques that are very related to image forgery. Unlike other automated data\ngeneration frameworks, we use state of the art image composition deep learning\nmodels to generate spliced images close to the quality of real-life\nmanipulations. Finally, we test the generated dataset on the SOTA image\nmanipulation detection model and show that its prediction performance is lower\ncompared to existing datasets, i.e. we produce realistic images that are more\ndifficult to detect. 
Dataset will be available at\nhttps://github.com/99eren99/DIS25k .\n","authors":["Eren Tahir","Mert Bal"],"pdf_url":"https://arxiv.org/pdf/2404.02897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02889v1","updated":"2024-04-03T17:44:02Z","published":"2024-04-03T17:44:02Z","title":"Steganographic Passport: An Owner and User Verifiable Credential for\n Deep Model IP Protection Without Retraining","summary":" Ensuring the legal usage of deep models is crucial to promoting trustable,\naccountable, and responsible artificial intelligence innovation. Current\npassport-based methods that obfuscate model functionality for license-to-use\nand ownership verifications suffer from capacity and quality constraints, as\nthey require retraining the owner model for new users. They are also vulnerable\nto advanced Expanded Residual Block ambiguity attacks. We propose\nSteganographic Passport, which uses an invertible steganographic network to\ndecouple license-to-use from ownership verification by hiding the user's\nidentity images into the owner-side passport and recovering them from their\nrespective user-side passports. An irreversible and collision-resistant hash\nfunction is used to avoid exposing the owner-side passport from the derived\nuser-side passports and increase the uniqueness of the model signature. To\nsafeguard both the passport and model's weights against advanced ambiguity\nattacks, an activation-level obfuscation is proposed for the verification\nbranch of the owner's model. By jointly training the verification and\ndeployment branches, their weights become tightly coupled. The proposed method\nsupports agile licensing of deep models by providing a strong ownership proof\nand license accountability without requiring a separate model retraining for\nthe admission of every new user. Experiment results show that our\nSteganographic Passport outperforms other passport-based deep model protection\nmethods in robustness against various known attacks.\n","authors":["Qi Cui","Ruohan Meng","Chaohui Xu","Chip-Hong Chang"],"pdf_url":"https://arxiv.org/pdf/2404.02889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14740v2","updated":"2024-04-03T17:42:44Z","published":"2023-08-28T17:41:14Z","title":"Total Selfie: Generating Full-Body Selfies","summary":" We present a method to generate full-body selfies from photographs originally\ntaken at arms length. Because self-captured photos are typically taken close\nup, they have limited field of view and exaggerated perspective that distorts\nfacial shapes. We instead seek to generate the photo some one else would take\nof you from a few feet away. Our approach takes as input four selfies of your\nface and body, a background image, and generates a full-body selfie in a\ndesired target pose. We introduce a novel diffusion-based approach to combine\nall of this information into high-quality, well-composed photos of you with the\ndesired pose and background.\n","authors":["Bowei Chen","Brian Curless","Ira Kemelmacher-Shlizerman","Steven M. 
Seitz"],"pdf_url":"https://arxiv.org/pdf/2308.14740v2.pdf","comment":"Project page:\n https://homes.cs.washington.edu/~boweiche/project_page/totalselfie/"},{"id":"http://arxiv.org/abs/2403.00939v3","updated":"2024-04-03T17:42:11Z","published":"2024-03-01T19:36:11Z","title":"G3DR: Generative 3D Reconstruction in ImageNet","summary":" We introduce a novel 3D generative method, Generative 3D Reconstruction\n(G3DR) in ImageNet, capable of generating diverse and high-quality 3D objects\nfrom single images, addressing the limitations of existing methods. At the\nheart of our framework is a novel depth regularization technique that enables\nthe generation of scenes with high-geometric fidelity. G3DR also leverages a\npretrained language-vision model, such as CLIP, to enable reconstruction in\nnovel views and improve the visual realism of generations. Additionally, G3DR\ndesigns a simple but effective sampling procedure to further improve the\nquality of generations. G3DR offers diverse and efficient 3D asset generation\nbased on class or text conditioning. Despite its simplicity, G3DR is able to\nbeat state-of-theart methods, improving over them by up to 22% in perceptual\nmetrics and 90% in geometry scores, while needing only half of the training\ntime. Code is available at https://github.com/preddy5/G3DR\n","authors":["Pradyumna Reddy","Ismail Elezi","Jiankang Deng"],"pdf_url":"https://arxiv.org/pdf/2403.00939v3.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02885v1","updated":"2024-04-03T17:38:15Z","published":"2024-04-03T17:38:15Z","title":"PoCo: Point Context Cluster for RGBD Indoor Place Recognition","summary":" We present a novel end-to-end algorithm (PoCo) for the indoor RGB-D place\nrecognition task, aimed at identifying the most likely match for a given query\nframe within a reference database. The task presents inherent challenges\nattributed to the constrained field of view and limited range of perception\nsensors. We propose a new network architecture, which generalizes the recent\nContext of Clusters (CoCs) to extract global descriptors directly from the\nnoisy point clouds through end-to-end learning. Moreover, we develop the\narchitecture by integrating both color and geometric modalities into the point\nfeatures to enhance the global descriptor representation. We conducted\nevaluations on public datasets ScanNet-PR and ARKit with 807 and 5047\nscenarios, respectively. PoCo achieves SOTA performance: on ScanNet-PR, we\nachieve R@1 of 64.63%, a 5.7% improvement from the best-published result CGis\n(61.12%); on Arkit, we achieve R@1 of 45.12%, a 13.3% improvement from the\nbest-published result CGis (39.82%). In addition, PoCo shows higher efficiency\nthan CGis in inference time (1.75X-faster), and we demonstrate the\neffectiveness of PoCo in recognizing places within a real-world laboratory\nenvironment.\n","authors":["Jing Liang","Zhuo Deng","Zheming Zhou","Omid Ghasemalizadeh","Dinesh Manocha","Min Sun","Cheng-Hao Kuo","Arnie Sen"],"pdf_url":"https://arxiv.org/pdf/2404.02885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02883v1","updated":"2024-04-03T17:34:28Z","published":"2024-04-03T17:34:28Z","title":"On the Scalability of Diffusion-based Text-to-Image Generation","summary":" Scaling up model and data size has been quite successful for the evolution of\nLLMs. However, the scaling law for the diffusion based text-to-image (T2I)\nmodels is not fully explored. 
It is also unclear how to efficiently scale the\nmodel for better performance at reduced cost. The different training settings\nand expensive training cost make a fair model comparison extremely difficult.\nIn this work, we empirically study the scaling properties of diffusion based\nT2I models by performing extensive and rigours ablations on scaling both\ndenoising backbones and training set, including training scaled UNet and\nTransformer variants ranging from 0.4B to 4B parameters on datasets upto 600M\nimages. For model scaling, we find the location and amount of cross attention\ndistinguishes the performance of existing UNet designs. And increasing the\ntransformer blocks is more parameter-efficient for improving text-image\nalignment than increasing channel numbers. We then identify an efficient UNet\nvariant, which is 45% smaller and 28% faster than SDXL's UNet. On the data\nscaling side, we show the quality and diversity of the training set matters\nmore than simply dataset size. Increasing caption density and diversity\nimproves text-image alignment performance and the learning efficiency. Finally,\nwe provide scaling functions to predict the text-image alignment performance as\nfunctions of the scale of model size, compute and dataset size.\n","authors":["Hao Li","Yang Zou","Ying Wang","Orchid Majumder","Yusheng Xie","R. Manmatha","Ashwin Swaminathan","Zhuowen Tu","Stefano Ermon","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2404.02883v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.02877v1","updated":"2024-04-03T17:24:27Z","published":"2024-04-03T17:24:27Z","title":"FlightScope: A Deep Comprehensive Assessment of Aircraft Detection\n Algorithms in Satellite Imagery","summary":" Object detection in remotely sensed satellite pictures is fundamental in many\nfields such as biophysical, and environmental monitoring. While deep learning\nalgorithms are constantly evolving, they have been mostly implemented and\ntested on popular ground-based taken photos. This paper critically evaluates\nand compares a suite of advanced object detection algorithms customized for the\ntask of identifying aircraft within satellite imagery. Using the large\nHRPlanesV2 dataset, together with a rigorous validation with the GDIT dataset,\nthis research encompasses an array of methodologies including YOLO versions 5\nand 8, Faster RCNN, CenterNet, RetinaNet, RTMDet, and DETR, all trained from\nscratch. This exhaustive training and validation study reveal YOLOv5 as the\npreeminent model for the specific case of identifying airplanes from remote\nsensing data, showcasing high precision and adaptability across diverse imaging\nconditions. This research highlight the nuanced performance landscapes of these\nalgorithms, with YOLOv5 emerging as a robust solution for aerial object\ndetection, underlining its importance through superior mean average precision,\nRecall, and Intersection over Union scores. The findings described here\nunderscore the fundamental role of algorithm selection aligned with the\nspecific demands of satellite imagery analysis and extend a comprehensive\nframework to evaluate model efficacy. 
The benchmark toolkit and codes,\navailable via https://github.com/toelt-llc/FlightScope_Bench, aims to further\nexploration and innovation in the realm of remote sensing object detection,\npaving the way for improved analytical methodologies in satellite imagery\napplications.\n","authors":["Safouane El Ghazouali","Arnaud Gucciardi","Nicola Venturi","Michael Rueegsegger","Umberto Michelucci"],"pdf_url":"https://arxiv.org/pdf/2404.02877v1.pdf","comment":"15 figures, 4 tables, comprehensive survey, comparative study"},{"id":"http://arxiv.org/abs/2403.18346v3","updated":"2024-04-03T17:18:51Z","published":"2024-03-27T08:38:49Z","title":"Quantifying and Mitigating Unimodal Biases in Multimodal Large Language\n Models: A Causal Perspective","summary":" Recent advancements in Large Language Models (LLMs) have facilitated the\ndevelopment of Multimodal LLMs (MLLMs). Despite their impressive capabilities,\nMLLMs often suffer from an over-reliance on unimodal biases (e.g., language\nbias and vision bias), leading to incorrect answers in complex multimodal\ntasks. To investigate this issue, we propose a causal framework to interpret\nthe biases in Visual Question Answering (VQA) problems. Within our framework,\nwe devise a causal graph to elucidate the predictions of MLLMs on VQA problems,\nand assess the causal effect of biases through an in-depth causal analysis.\nMotivated by the causal graph, we introduce a novel MORE dataset, consisting of\n12,000 VQA instances. This dataset is designed to challenge MLLMs' abilities,\nnecessitating multi-hop reasoning and the surmounting of unimodal biases.\nFurthermore, we propose two strategies to mitigate unimodal biases and enhance\nMLLMs' reasoning capabilities, including a Decompose-Verify-Answer (DeVA)\nframework for limited-access MLLMs and the refinement of open-source MLLMs\nthrough fine-tuning. Extensive quantitative and qualitative experiments offer\nvaluable insights for future research. Our project page is at\nhttps://opencausalab.github.io/MORE.\n","authors":["Meiqi Chen","Yixin Cao","Yan Zhang","Chaochao Lu"],"pdf_url":"https://arxiv.org/pdf/2403.18346v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11782v2","updated":"2024-04-03T16:57:35Z","published":"2023-12-19T01:33:46Z","title":"Learning Object State Changes in Videos: An Open-World Perspective","summary":" Object State Changes (OSCs) are pivotal for video understanding. While humans\ncan effortlessly generalize OSC understanding from familiar to unknown objects,\ncurrent approaches are confined to a closed vocabulary. Addressing this gap, we\nintroduce a novel open-world formulation for the video OSC problem. The goal is\nto temporally localize the three stages of an OSC -- the object's initial\nstate, its transitioning state, and its end state -- whether or not the object\nhas been observed during training. Towards this end, we develop VidOSC, a\nholistic learning approach that: (1) leverages text and vision-language models\nfor supervisory signals to obviate manually labeling OSC training data, and (2)\nabstracts fine-grained shared state representations from objects to enhance\ngeneralization. Furthermore, we present HowToChange, the first open-world\nbenchmark for video OSC localization, which offers an order of magnitude\nincrease in the label space and annotation volume compared to the best existing\nbenchmark. 
Experimental results demonstrate the efficacy of our approach, in\nboth traditional closed-world and open-world scenarios.\n","authors":["Zihui Xue","Kumar Ashutosh","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2312.11782v2.pdf","comment":"Accepted by CVPR 2024, Project website:\n https://vision.cs.utexas.edu/projects/VidOSC/"},{"id":"http://arxiv.org/abs/2404.01717v2","updated":"2024-04-03T16:46:27Z","published":"2024-04-02T08:07:38Z","title":"AddSR: Accelerating Diffusion-based Blind Super-Resolution with\n Adversarial Diffusion Distillation","summary":" Blind super-resolution methods based on stable diffusion showcase formidable\ngenerative capabilities in reconstructing clear high-resolution images with\nintricate details from low-resolution inputs. However, their practical\napplicability is often hampered by poor efficiency, stemming from the\nrequirement of thousands or hundreds of sampling steps. Inspired by the\nefficient text-to-image approach adversarial diffusion distillation (ADD), we\ndesign AddSR to address this issue by incorporating the ideas of both\ndistillation and ControlNet. Specifically, we first propose a prediction-based\nself-refinement strategy to provide high-frequency information in the student\nmodel output with marginal additional time cost. Furthermore, we refine the\ntraining process by employing HR images, rather than LR images, to regulate the\nteacher model, providing a more robust constraint for distillation. Second, we\nintroduce a timestep-adapting loss to address the perception-distortion\nimbalance problem introduced by ADD. Extensive experiments demonstrate our\nAddSR generates better restoration results, while achieving faster speed than\nprevious SD-based state-of-the-art models (e.g., 7x faster than SeeSR).\n","authors":["Rui Xie","Ying Tai","Kai Zhang","Zhenyu Zhang","Jun Zhou","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01717v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02845v1","updated":"2024-04-03T16:23:37Z","published":"2024-04-03T16:23:37Z","title":"Cross-Modal Conditioned Reconstruction for Language-guided Medical Image\n Segmentation","summary":" Recent developments underscore the potential of textual information in\nenhancing learning models for a deeper understanding of medical visual\nsemantics. However, language-guided medical image segmentation still faces a\nchallenging issue. Previous works employ implicit and ambiguous architectures\nto embed textual information. This leads to segmentation results that are\ninconsistent with the semantics represented by the language, sometimes even\ndiverging significantly. To this end, we propose a novel cross-modal\nconditioned Reconstruction for Language-guided Medical Image Segmentation\n(RecLMIS) to explicitly capture cross-modal interactions, which assumes that\nwell-aligned medical visual features and medical notes can effectively\nreconstruct each other. We introduce conditioned interaction to adaptively\npredict patches and words of interest. Subsequently, they are utilized as\nconditioning factors for mutual reconstruction to align with regions described\nin the medical notes. Extensive experiments demonstrate the superiority of our\nRecLMIS, surpassing LViT by 3.74% mIoU on the publicly available MosMedData+\ndataset and achieving an average increase of 1.89% mIoU for cross-domain tests\non our QATA-CoV19 dataset. Simultaneously, we achieve a relative reduction of\n20.2% in parameter count and a 55.5% decrease in computational load. 
The code\nwill be available at https://github.com/ShashankHuang/RecLMIS.\n","authors":["Xiaoshuang Huang","Hongxiang Li","Meng Cao","Long Chen","Chenyu You","Dong An"],"pdf_url":"https://arxiv.org/pdf/2404.02845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02830v1","updated":"2024-04-03T16:04:59Z","published":"2024-04-03T16:04:59Z","title":"Enhancing Interpretability of Vertebrae Fracture Grading using\n Human-interpretable Prototypes","summary":" Vertebral fracture grading classifies the severity of vertebral fractures,\nwhich is a challenging task in medical imaging and has recently attracted Deep\nLearning (DL) models. Only a few works attempted to make such models\nhuman-interpretable despite the need for transparency and trustworthiness in\ncritical use cases like DL-assisted medical diagnosis. Moreover, such models\neither rely on post-hoc methods or additional annotations. In this work, we\npropose a novel interpretable-by-design method, ProtoVerse, to find relevant\nsub-parts of vertebral fractures (prototypes) that reliably explain the model's\ndecision in a human-understandable way. Specifically, we introduce a novel\ndiversity-promoting loss to mitigate prototype repetitions in small datasets\nwith intricate semantics. We have experimented with the VerSe'19 dataset and\noutperformed the existing prototype-based method. Further, our model provides\nsuperior interpretability against the post-hoc method. Importantly, expert\nradiologists validated the visual interpretability of our results, showing\nclinical applicability.\n","authors":["Poulami Sinhamahapatra","Suprosanna Shit","Anjany Sekuboyina","Malek Husseini","David Schinz","Nicolas Lenhart","Joern Menze","Jan Kirschke","Karsten Roscher","Stephan Guennemann"],"pdf_url":"https://arxiv.org/pdf/2404.02830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10835v3","updated":"2024-04-03T16:00:18Z","published":"2023-12-17T22:40:38Z","title":"Your Student is Better Than Expected: Adaptive Teacher-Student\n Collaboration for Text-Conditional Diffusion Models","summary":" Knowledge distillation methods have recently shown to be a promising\ndirection to speedup the synthesis of large-scale diffusion models by requiring\nonly a few inference steps. While several powerful distillation methods were\nrecently proposed, the overall quality of student samples is typically lower\ncompared to the teacher ones, which hinders their practical usage. In this\nwork, we investigate the relative quality of samples produced by the teacher\ntext-to-image diffusion model and its distilled student version. As our main\nempirical finding, we discover that a noticeable portion of student samples\nexhibit superior fidelity compared to the teacher ones, despite the\n\"approximate\" nature of the student. Based on this finding, we propose an\nadaptive collaboration between student and teacher diffusion models for\neffective text-to-image synthesis. Specifically, the distilled model produces\nthe initial sample, and then an oracle decides whether it needs further\nimprovements with a slow teacher model. Extensive experiments demonstrate that\nthe designed pipeline surpasses state-of-the-art text-to-image alternatives for\nvarious inference budgets in terms of human preference. 
Furthermore, the\nproposed approach can be naturally used in popular applications such as\ntext-guided image editing and controllable generation.\n","authors":["Nikita Starodubcev","Artem Fedorov","Artem Babenko","Dmitry Baranchuk"],"pdf_url":"https://arxiv.org/pdf/2312.10835v3.pdf","comment":"CVPR2024 camera ready"},{"id":"http://arxiv.org/abs/2312.10389v2","updated":"2024-04-03T15:54:15Z","published":"2023-12-16T09:04:44Z","title":"ElasticLaneNet: An Efficient Geometry-Flexible Approach for Lane\n Detection","summary":" The task of lane detection involves identifying the boundaries of driving\nareas in real-time. Recognizing lanes with variable and complex geometric\nstructures remains a challenge. In this paper, we explore a novel and flexible\nway of implicit lanes representation named \\textit{Elastic Lane map (ELM)}, and\nintroduce an efficient physics-informed end-to-end lane detection framework,\nnamely, ElasticLaneNet (Elastic interaction energy-informed Lane detection\nNetwork). The approach considers predicted lanes as moving zero-contours on the\nflexibly shaped \\textit{ELM} that are attracted to the ground truth guided by\nan elastic interaction energy-loss function (EIE loss). Our framework well\nintegrates the global information and low-level features. The method performs\nwell in complex lane scenarios, including those with large curvature, weak\ngeometry features at intersections, complicated cross lanes, Y-shapes lanes,\ndense lanes, etc. We apply our approach on three datasets: SDLane, CULane, and\nTuSimple. The results demonstrate exceptional performance of our method, with\nthe state-of-the-art results on the structurally diverse SDLane, achieving\nF1-score of 89.51, Recall rate of 87.50, and Precision of 91.61 with fast\ninference speed.\n","authors":["Yaxin Feng","Yuan Lan","Luchan Zhang","Yang Xiang"],"pdf_url":"https://arxiv.org/pdf/2312.10389v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05247v2","updated":"2024-04-03T15:40:00Z","published":"2023-12-08T18:55:24Z","title":"Dynamic LiDAR Re-simulation using Compositional Neural Fields","summary":" We introduce DyNFL, a novel neural field-based approach for high-fidelity\nre-simulation of LiDAR scans in dynamic driving scenes. DyNFL processes LiDAR\nmeasurements from dynamic environments, accompanied by bounding boxes of moving\nobjects, to construct an editable neural field. This field, comprising\nseparately reconstructed static background and dynamic objects, allows users to\nmodify viewpoints, adjust object positions, and seamlessly add or remove\nobjects in the re-simulated scene. A key innovation of our method is the neural\nfield composition technique, which effectively integrates reconstructed neural\nassets from various scenes through a ray drop test, accounting for occlusions\nand transparent surfaces. 
Our evaluation with both synthetic and real-world\nenvironments demonstrates that DyNFL substantially improves dynamic scene LiDAR\nsimulation, offering a combination of physical fidelity and flexible editing\ncapabilities.\n","authors":["Hanfeng Wu","Xingxing Zuo","Stefan Leutenegger","Or Litany","Konrad Schindler","Shengyu Huang"],"pdf_url":"https://arxiv.org/pdf/2312.05247v2.pdf","comment":"Project page: https://shengyuh.github.io/dynfl"},{"id":"http://arxiv.org/abs/2312.15702v2","updated":"2024-04-03T15:38:12Z","published":"2023-12-25T11:54:07Z","title":"Three Heads Are Better Than One: Complementary Experts for Long-Tailed\n Semi-supervised Learning","summary":" We address the challenging problem of Long-Tailed Semi-Supervised Learning\n(LTSSL) where labeled data exhibit imbalanced class distribution and unlabeled\ndata follow an unknown distribution. Unlike in balanced SSL, the generated\npseudo-labels are skewed towards head classes, intensifying the training bias.\nSuch a phenomenon is even amplified as more unlabeled data will be mislabeled\nas head classes when the class distribution of labeled and unlabeled datasets\nare mismatched. To solve this problem, we propose a novel method named\nComPlementary Experts (CPE). Specifically, we train multiple experts to model\nvarious class distributions, each of them yielding high-quality pseudo-labels\nwithin one form of class distribution. Besides, we introduce Classwise Batch\nNormalization for CPE to avoid performance degradation caused by feature\ndistribution mismatch between head and non-head classes. CPE achieves\nstate-of-the-art performances on CIFAR-10-LT, CIFAR-100-LT, and STL-10-LT\ndataset benchmarks. For instance, on CIFAR-10-LT, CPE improves test accuracy by\nover 2.22% compared to baselines. Code is available at\nhttps://github.com/machengcheng2016/CPE-LTSSL.\n","authors":["Chengcheng Ma","Ismail Elezi","Jiankang Deng","Weiming Dong","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2312.15702v2.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2404.02813v1","updated":"2024-04-03T15:37:02Z","published":"2024-04-03T15:37:02Z","title":"GPU-Accelerated RSF Level Set Evolution for Large-Scale Microvascular\n Segmentation","summary":" Microvascular networks are challenging to model because these structures are\ncurrently near the diffraction limit for most advanced three-dimensional\nimaging modalities, including confocal and light sheet microscopy. This makes\nsemantic segmentation difficult, because individual components of these\nnetworks fluctuate within the confines of individual pixels. Level set methods\nare ideally suited to solve this problem by providing surface and topological\nconstraints on the resulting model, however these active contour techniques are\nextremely time intensive and impractical for terabyte-scale images. We propose\na reformulation and implementation of the region-scalable fitting (RSF) level\nset model that makes it amenable to three-dimensional evaluation using both\nsingle-instruction multiple data (SIMD) and single-program multiple-data (SPMD)\nparallel processing. 
This enables evaluation of the level set equation on\nindependent regions of the data set using graphics processing units (GPUs),\nmaking large-scale segmentation of high-resolution networks practical and\ninexpensive.\n We tested this 3D parallel RSF approach on multiple data sets acquired using\nstate-of-the-art imaging techniques to acquire microvascular data, including\nmicro-CT, light sheet fluorescence microscopy (LSFM) and milling microscopy. To\nassess the performance and accuracy of the RSF model, we conducted a\nMonte-Carlo-based validation technique to compare results to other segmentation\nmethods. We also provide a rigorous profiling to show the gains in processing\nspeed leveraging parallel hardware. This study showcases the practical\napplication of the RSF model, emphasizing its utility in the challenging domain\nof segmenting large-scale high-topology network structures with a particular\nfocus on building microvascular models.\n","authors":["Meher Niger","Helya Goharbavang","Taeyong Ahn","Emily K. Alley","Joshua D. Wythe","Guoning Chen","David Mayerich"],"pdf_url":"https://arxiv.org/pdf/2404.02813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01449v2","updated":"2024-04-03T15:32:17Z","published":"2023-10-02T01:30:42Z","title":"Elastic Interaction Energy-Informed Real-Time Traffic Scene Perception","summary":" Urban segmentation and lane detection are two important tasks for traffic\nscene perception. Accuracy and fast inference speed of visual perception are\ncrucial for autonomous driving safety. Fine and complex geometric objects are\nthe most challenging but important recognition targets in traffic scene, such\nas pedestrians, traffic signs and lanes. In this paper, a simple and efficient\ntopology-aware energy loss function-based network training strategy named\nEIEGSeg is proposed. EIEGSeg is designed for multi-class segmentation on\nreal-time traffic scene perception. To be specific, the convolutional neural\nnetwork (CNN) extracts image features and produces multiple outputs, and the\nelastic interaction energy loss function (EIEL) drives the predictions moving\ntoward the ground truth until they are completely overlapped. Our strategy\nperforms well especially on fine-scale structure, \\textit{i.e.} small or\nirregularly shaped objects can be identified more accurately, and discontinuity\nissues on slender objects can be improved. We quantitatively and qualitatively\nanalyze our method on three traffic datasets, including urban scene\nsegmentation data Cityscapes and lane detection data TuSimple and CULane. Our\nresults demonstrate that EIEGSeg consistently improves the performance,\nespecially on real-time, lightweight networks that are better suited for\nautonomous driving.\n","authors":["Yaxin Feng","Yuan Lan","Luchan Zhang","Guoqing Liu","Yang Xiang"],"pdf_url":"https://arxiv.org/pdf/2310.01449v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08730v2","updated":"2024-04-03T15:22:23Z","published":"2024-03-13T17:29:45Z","title":"Strengthening Multimodal Large Language Model with Bootstrapped\n Preference Optimization","summary":" Multimodal Large Language Models (MLLMs) excel in generating responses based\non visual inputs. However, they often suffer from a bias towards generating\nresponses similar to their pretraining corpus, overshadowing the importance of\nvisual information. We treat this bias as a \"preference\" for pretraining\nstatistics, which hinders the model's grounding in visual input. 
To mitigate\nthis issue, we propose Bootstrapped Preference Optimization (BPO), which\nconducts preference learning with datasets containing negative responses\nbootstrapped from the model itself. Specifically, we propose the following two\nstrategies: 1) using distorted image inputs to the MLLM for eliciting responses\nthat contain signified pretraining bias; 2) leveraging text-based LLM to\nexplicitly inject erroneous but common elements into the original response.\nThose undesirable responses are paired with original annotated responses from\nthe datasets to construct the preference dataset, which is subsequently\nutilized to perform preference learning. Our approach effectively suppresses\npretrained LLM bias, enabling enhanced grounding in visual inputs. Extensive\nexperimentation demonstrates significant performance improvements across\nmultiple benchmarks, advancing the state-of-the-art in multimodal\nconversational systems.\n","authors":["Renjie Pi","Tianyang Han","Wei Xiong","Jipeng Zhang","Runtao Liu","Rui Pan","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.08730v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2012.02689v2","updated":"2024-04-03T15:18:00Z","published":"2020-12-04T15:58:34Z","title":"Isometric Multi-Shape Matching","summary":" Finding correspondences between shapes is a fundamental problem in computer\nvision and graphics, which is relevant for many applications, including 3D\nreconstruction, object tracking, and style transfer. The vast majority of\ncorrespondence methods aim to find a solution between pairs of shapes, even if\nmultiple instances of the same class are available. While isometries are often\nstudied in shape correspondence problems, they have not been considered\nexplicitly in the multi-matching setting. This paper closes this gap by\nproposing a novel optimisation formulation for isometric multi-shape matching.\nWe present a suitable optimisation algorithm for solving our formulation and\nprovide a convergence and complexity analysis. Our algorithm obtains\nmulti-matchings that are by construction provably cycle-consistent. We\ndemonstrate the superior performance of our method on various datasets and set\nthe new state-of-the-art in isometric multi-shape matching.\n","authors":["Maolin Gao","Zorah Lähner","Johan Thunberg","Daniel Cremers","Florian Bernard"],"pdf_url":"https://arxiv.org/pdf/2012.02689v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07169v3","updated":"2024-04-03T15:11:33Z","published":"2023-12-12T11:13:17Z","title":"Semi-supervised Active Learning for Video Action Detection","summary":" In this work, we focus on label efficient learning for video action\ndetection. We develop a novel semi-supervised active learning approach which\nutilizes both labeled as well as unlabeled data along with informative sample\nselection for action detection. Video action detection requires spatio-temporal\nlocalization along with classification, which poses several challenges for both\nactive learning informative sample selection as well as semi-supervised\nlearning pseudo label generation. First, we propose NoiseAug, a simple\naugmentation strategy which effectively selects informative samples for video\naction detection. Next, we propose fft-attention, a novel technique based on\nhigh-pass filtering which enables effective utilization of pseudo label for SSL\nin video action detection by emphasizing on relevant activity region within a\nvideo. 
We evaluate the proposed approach on three different benchmark datasets,\nUCF-101-24, JHMDB-21, and Youtube-VOS. First, we demonstrate its effectiveness\non video action detection where the proposed approach outperforms prior works\nin semi-supervised and weakly-supervised learning along with several baseline\napproaches in both UCF101-24 and JHMDB-21. Next, we also show its effectiveness\non Youtube-VOS for video object segmentation demonstrating its generalization\ncapability for other dense prediction tasks in videos. The code and models is\npublicly available at:\n\\url{https://github.com/AKASH2907/semi-sup-active-learning}.\n","authors":["Ayush Singh","Aayush J Rana","Akash Kumar","Shruti Vyas","Yogesh Singh Rawat"],"pdf_url":"https://arxiv.org/pdf/2312.07169v3.pdf","comment":"AAAI Conference on Artificial Intelligence, Main Technical Track\n (AAAI), 2024, Code: https://github.com/AKASH2907/semi-sup-active-learning"},{"id":"http://arxiv.org/abs/2311.16432v2","updated":"2024-04-03T15:05:28Z","published":"2023-11-28T02:27:31Z","title":"Text-Driven Image Editing via Learnable Regions","summary":" Language has emerged as a natural interface for image editing. In this paper,\nwe introduce a method for region-based image editing driven by textual prompts,\nwithout the need for user-provided masks or sketches. Specifically, our\napproach leverages an existing pre-trained text-to-image model and introduces a\nbounding box generator to identify the editing regions that are aligned with\nthe textual prompts. We show that this simple approach enables flexible editing\nthat is compatible with current image generation models, and is able to handle\ncomplex prompts featuring multiple objects, complex sentences, or lengthy\nparagraphs. We conduct an extensive user study to compare our method against\nstate-of-the-art methods. The experiments demonstrate the competitive\nperformance of our method in manipulating images with high fidelity and realism\nthat correspond to the provided language descriptions. Our project webpage can\nbe found at: https://yuanze-lin.me/LearnableRegions_page.\n","authors":["Yuanze Lin","Yi-Wen Chen","Yi-Hsuan Tsai","Lu Jiang","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2311.16432v2.pdf","comment":"Accepted to CVPR 2024 Project webpage:\n https://yuanze-lin.me/LearnableRegions_page"},{"id":"http://arxiv.org/abs/2308.08393v2","updated":"2024-04-03T15:04:03Z","published":"2023-08-16T14:25:30Z","title":"SIGMA: Scale-Invariant Global Sparse Shape Matching","summary":" We propose a novel mixed-integer programming (MIP) formulation for generating\nprecise sparse correspondences for highly non-rigid shapes. To this end, we\nintroduce a projected Laplace-Beltrami operator (PLBO) which combines intrinsic\nand extrinsic geometric information to measure the deformation quality induced\nby predicted correspondences. We integrate the PLBO, together with an\norientation-aware regulariser, into a novel MIP formulation that can be solved\nto global optimality for many practical problems. In contrast to previous\nmethods, our approach is provably invariant to rigid transformations and global\nscaling, initialisation-free, has optimality guarantees, and scales to high\nresolution meshes with (empirically observed) linear time. 
We show\nstate-of-the-art results for sparse non-rigid matching on several challenging\n3D datasets, including data with inconsistent meshing, as well as applications\nin mesh-to-point-cloud matching.\n","authors":["Maolin Gao","Paul Roetzer","Marvin Eisenberger","Zorah Lähner","Michael Moeller","Daniel Cremers","Florian Bernard"],"pdf_url":"https://arxiv.org/pdf/2308.08393v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2210.01708v3","updated":"2024-04-03T15:01:26Z","published":"2022-10-04T16:08:54Z","title":"Conquering the Communication Constraints to Enable Large Pre-Trained\n Models in Federated Learning","summary":" Federated learning (FL) has emerged as a promising paradigm for enabling the\ncollaborative training of models without centralized access to the raw data on\nlocal devices. In the typical FL paradigm (e.g., FedAvg), model weights are\nsent to and from the server each round to participating clients. Recently, the\nuse of small pre-trained models has been shown effective in federated learning\noptimization and improving convergence. However, recent state-of-the-art\npre-trained models are getting more capable but also have more parameters. In\nconventional FL, sharing the enormous model weights can quickly put a massive\ncommunication burden on the system, especially if more capable models are\nemployed. Can we find a solution to enable those strong and readily-available\npre-trained models in FL to achieve excellent performance while simultaneously\nreducing the communication burden? To this end, we investigate the use of\nparameter-efficient fine-tuning in federated learning and thus introduce a new\nframework: FedPEFT. Specifically, we systemically evaluate the performance of\nFedPEFT across a variety of client stability, data distribution, and\ndifferential privacy settings. By only locally tuning and globally sharing a\nsmall portion of the model weights, significant reductions in the total\ncommunication overhead can be achieved while maintaining competitive or even\nbetter performance in a wide range of federated learning scenarios, providing\ninsight into a new paradigm for practical and effective federated systems.\n","authors":["Guangyu Sun","Umar Khalid","Matias Mendieta","Taojiannan Yang","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2210.01708v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14505v3","updated":"2024-04-03T14:59:08Z","published":"2024-02-22T12:55:01Z","title":"Towards Seamless Adaptation of Pre-trained Models for Visual Place\n Recognition","summary":" Recent studies show that vision models pre-trained in generic visual learning\ntasks with large-scale data can provide useful feature representations for a\nwide range of visual perception problems. However, few attempts have been made\nto exploit pre-trained foundation models in visual place recognition (VPR). Due\nto the inherent difference in training objectives and data between the tasks of\nmodel pre-training and VPR, how to bridge the gap and fully unleash the\ncapability of pre-trained models for VPR is still a key issue to address. To\nthis end, we propose a novel method to realize seamless adaptation of\npre-trained models for VPR. Specifically, to obtain both global and local\nfeatures that focus on salient landmarks for discriminating places, we design a\nhybrid adaptation method to achieve both global and local adaptation\nefficiently, in which only lightweight adapters are tuned without adjusting the\npre-trained model. 
Besides, to guide effective adaptation, we propose a mutual\nnearest neighbor local feature loss, which ensures proper dense local features\nare produced for local matching and avoids time-consuming spatial verification\nin re-ranking. Experimental results show that our method outperforms the\nstate-of-the-art methods with less training data and training time, and uses\nabout only 3% retrieval runtime of the two-stage VPR methods with RANSAC-based\nspatial verification. It ranks 1st on the MSLS challenge leaderboard (at the\ntime of submission). The code is released at\nhttps://github.com/Lu-Feng/SelaVPR.\n","authors":["Feng Lu","Lijun Zhang","Xiangyuan Lan","Shuting Dong","Yaowei Wang","Chun Yuan"],"pdf_url":"https://arxiv.org/pdf/2402.14505v3.pdf","comment":"ICLR2024"},{"id":"http://arxiv.org/abs/2404.02790v1","updated":"2024-04-03T14:58:00Z","published":"2024-04-03T14:58:00Z","title":"MULAN: A Multi Layer Annotated Dataset for Controllable Text-to-Image\n Generation","summary":" Text-to-image generation has achieved astonishing results, yet precise\nspatial controllability and prompt fidelity remain highly challenging. This\nlimitation is typically addressed through cumbersome prompt engineering, scene\nlayout conditioning, or image editing techniques which often require hand drawn\nmasks. Nonetheless, pre-existing works struggle to take advantage of the\nnatural instance-level compositionality of scenes due to the typically flat\nnature of rasterized RGB output images. Towards adressing this challenge, we\nintroduce MuLAn: a novel dataset comprising over 44K MUlti-Layer ANnotations of\nRGB images as multilayer, instance-wise RGBA decompositions, and over 100K\ninstance images. To build MuLAn, we developed a training free pipeline which\ndecomposes a monocular RGB image into a stack of RGBA layers comprising of\nbackground and isolated instances. We achieve this through the use of\npretrained general-purpose models, and by developing three modules: image\ndecomposition for instance discovery and extraction, instance completion to\nreconstruct occluded areas, and image re-assembly. We use our pipeline to\ncreate MuLAn-COCO and MuLAn-LAION datasets, which contain a variety of image\ndecompositions in terms of style, composition and complexity. With MuLAn, we\nprovide the first photorealistic resource providing instance decomposition and\nocclusion information for high quality images, opening up new avenues for\ntext-to-image generative AI research. With this, we aim to encourage the\ndevelopment of novel generation and editing technology, in particular\nlayer-wise solutions. MuLAn data resources are available at\nhttps://MuLAn-dataset.github.io/.\n","authors":["Petru-Daniel Tudosiu","Yongxin Yang","Shifeng Zhang","Fei Chen","Steven McDonagh","Gerasimos Lampouras","Ignacio Iacobacci","Sarah Parisot"],"pdf_url":"https://arxiv.org/pdf/2404.02790v1.pdf","comment":"CVPR 2024 - Project page: https://MuLAn-dataset.github.io/"},{"id":"http://arxiv.org/abs/2404.02788v1","updated":"2024-04-03T14:56:06Z","published":"2024-04-03T14:56:06Z","title":"GenN2N: Generative NeRF2NeRF Translation","summary":" We present GenN2N, a unified NeRF-to-NeRF translation framework for various\nNeRF translation tasks such as text-driven NeRF editing, colorization,\nsuper-resolution, inpainting, etc. 
Unlike previous methods designed for\nindividual translation tasks with task-specific schemes, GenN2N achieves all\nthese NeRF editing tasks by employing a plug-and-play image-to-image translator\nto perform editing in the 2D domain and lifting 2D edits into the 3D NeRF\nspace. Since the 3D consistency of 2D edits may not be assured, we propose to\nmodel the distribution of the underlying 3D edits through a generative model\nthat can cover all possible edited NeRFs. To model the distribution of 3D\nedited NeRFs from 2D edited images, we carefully design a VAE-GAN that encodes\nimages while decoding NeRFs. The latent space is trained to align with a\nGaussian distribution and the NeRFs are supervised through an adversarial loss\non its renderings. To ensure the latent code does not depend on 2D viewpoints\nbut truly reflects the 3D edits, we also regularize the latent code through a\ncontrastive learning scheme. Extensive experiments on various editing tasks\nshow GenN2N, as a universal framework, performs as well or better than\ntask-specific specialists while possessing flexible generative power. More\nresults on our project page: https://xiangyueliu.github.io/GenN2N/\n","authors":["Xiangyue Liu","Han Xue","Kunming Luo","Ping Tan","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2404.02788v1.pdf","comment":"Accepted to CVPR 2024. Project page:\n https://xiangyueliu.github.io/GenN2N/"},{"id":"http://arxiv.org/abs/2404.02785v1","updated":"2024-04-03T14:55:17Z","published":"2024-04-03T14:55:17Z","title":"Domain Generalization through Meta-Learning: A Survey","summary":" Deep neural networks (DNNs) have revolutionized artificial intelligence but\noften lack performance when faced with out-of-distribution (OOD) data, a common\nscenario due to the inevitable domain shifts in real-world applications. This\nlimitation stems from the common assumption that training and testing data\nshare the same distribution-an assumption frequently violated in practice.\nDespite their effectiveness with large amounts of data and computational power,\nDNNs struggle with distributional shifts and limited labeled data, leading to\noverfitting and poor generalization across various tasks and domains.\nMeta-learning presents a promising approach by employing algorithms that\nacquire transferable knowledge across various tasks for fast adaptation,\neliminating the need to learn each task from scratch. This survey paper delves\ninto the realm of meta-learning with a focus on its contribution to domain\ngeneralization. We first clarify the concept of meta-learning for domain\ngeneralization and introduce a novel taxonomy based on the feature extraction\nstrategy and the classifier learning methodology, offering a granular view of\nmethodologies. Through an exhaustive review of existing methods and underlying\ntheories, we map out the fundamentals of the field. 
Our survey provides\npractical insights and an informed discussion on promising research directions,\npaving the way for future innovation in meta-learning for domain\ngeneralization.\n","authors":["Arsham Gholamzadeh Khoee","Yinan Yu","Robert Feldt"],"pdf_url":"https://arxiv.org/pdf/2404.02785v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10206v2","updated":"2024-04-03T14:45:52Z","published":"2023-07-14T07:25:47Z","title":"NEAT: Distilling 3D Wireframes from Neural Attraction Fields","summary":" This paper studies the problem of structured 3D reconstruction using\nwireframes that consist of line segments and junctions, focusing on the\ncomputation of structured boundary geometries of scenes. Instead of leveraging\nmatching-based solutions from 2D wireframes (or line segments) for 3D wireframe\nreconstruction as done in prior arts, we present NEAT, a rendering-distilling\nformulation using neural fields to represent 3D line segments with 2D\nobservations, and bipartite matching for perceiving and distilling of a sparse\nset of 3D global junctions. The proposed {NEAT} enjoys the joint optimization\nof the neural fields and the global junctions from scratch, using\nview-dependent 2D observations without precomputed cross-view feature matching.\nComprehensive experiments on the DTU and BlendedMVS datasets demonstrate our\nNEAT's superiority over state-of-the-art alternatives for 3D wireframe\nreconstruction. Moreover, the distilled 3D global junctions by NEAT, are a\nbetter initialization than SfM points, for the recently-emerged 3D Gaussian\nSplatting for high-fidelity novel view synthesis using about 20 times fewer\ninitial 3D points. Project page: \\url{https://xuenan.net/neat}.\n","authors":["Nan Xue","Bin Tan","Yuxi Xiao","Liang Dong","Gui-Song Xia","Tianfu Wu","Yujun Shen"],"pdf_url":"https://arxiv.org/pdf/2307.10206v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.12865v3","updated":"2024-04-03T14:39:32Z","published":"2023-12-20T09:27:41Z","title":"RadEdit: stress-testing biomedical vision models via diffusion image\n editing","summary":" Biomedical imaging datasets are often small and biased, meaning that\nreal-world performance of predictive models can be substantially lower than\nexpected from internal testing. This work proposes using generative image\nediting to simulate dataset shifts and diagnose failure modes of biomedical\nvision models; this can be used in advance of deployment to assess readiness,\npotentially reducing cost and patient harm. Existing editing methods can\nproduce undesirable changes, with spurious correlations learned due to the\nco-occurrence of disease and treatment interventions, limiting practical\napplicability. To address this, we train a text-to-image diffusion model on\nmultiple chest X-ray datasets and introduce a new editing method RadEdit that\nuses multiple masks, if present, to constrain changes and ensure consistency in\nthe edited images. We consider three types of dataset shifts: acquisition\nshift, manifestation shift, and population shift, and demonstrate that our\napproach can diagnose failures and quantify model robustness without additional\ndata collection, complementing more qualitative tools for explainable AI.\n","authors":["Fernando Pérez-García","Sam Bond-Taylor","Pedro P. Sanchez","Boris van Breugel","Daniel C. Castro","Harshita Sharma","Valentina Salvatelli","Maria T. A. Wetscherek","Hannah Richardson","Matthew P. 
Lungren","Aditya Nori","Javier Alvarez-Valle","Ozan Oktay","Maximilian Ilse"],"pdf_url":"https://arxiv.org/pdf/2312.12865v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.06757v3","updated":"2024-04-03T14:31:43Z","published":"2022-11-12T22:29:42Z","title":"DriftRec: Adapting diffusion models to blind JPEG restoration","summary":" In this work, we utilize the high-fidelity generation abilities of diffusion\nmodels to solve blind JPEG restoration at high compression levels. We propose\nan elegant modification of the forward stochastic differential equation of\ndiffusion models to adapt them to this restoration task and name our method\nDriftRec. Comparing DriftRec against an $L_2$ regression baseline with the same\nnetwork architecture and state-of-the-art techniques for JPEG restoration, we\nshow that our approach can escape the tendency of other methods to generate\nblurry images, and recovers the distribution of clean images significantly more\nfaithfully. For this, only a dataset of clean/corrupted image pairs and no\nknowledge about the corruption operation is required, enabling wider\napplicability to other restoration tasks. In contrast to other conditional and\nunconditional diffusion models, we utilize the idea that the distributions of\nclean and corrupted images are much closer to each other than each is to the\nusual Gaussian prior of the reverse process in diffusion models. Our approach\ntherefore requires only low levels of added noise and needs comparatively few\nsampling steps even without further optimizations. We show that DriftRec\nnaturally generalizes to realistic and difficult scenarios such as unaligned\ndouble JPEG compression and blind restoration of JPEGs found online, without\nhaving encountered such examples during training.\n","authors":["Simon Welker","Henry N. Chapman","Timo Gerkmann"],"pdf_url":"https://arxiv.org/pdf/2211.06757v3.pdf","comment":"(C) 2024 IEEE. Personal use of this material is permitted. Permission\n from IEEE must be obtained for all other uses, in any current or future\n media, including reprinting/republishing this material for advertising or\n promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2312.02145v2","updated":"2024-04-03T14:14:18Z","published":"2023-12-04T18:59:13Z","title":"Repurposing Diffusion-Based Image Generators for Monocular Depth\n Estimation","summary":" Monocular depth estimation is a fundamental computer vision task. Recovering\n3D depth from a single image is geometrically ill-posed and requires scene\nunderstanding, so it is not surprising that the rise of deep learning has led\nto a breakthrough. The impressive progress of monocular depth estimators has\nmirrored the growth in model capacity, from relatively modest CNNs to large\nTransformer architectures. Still, monocular depth estimators tend to struggle\nwhen presented with images with unfamiliar content and layout, since their\nknowledge of the visual world is restricted by the data seen during training,\nand challenged by zero-shot generalization to new domains. This motivates us to\nexplore whether the extensive priors captured in recent generative diffusion\nmodels can enable better, more generalizable depth estimation. We introduce\nMarigold, a method for affine-invariant monocular depth estimation that is\nderived from Stable Diffusion and retains its rich prior knowledge. 
The\nestimator can be fine-tuned in a couple of days on a single GPU using only\nsynthetic training data. It delivers state-of-the-art performance across a wide\nrange of datasets, including over 20% performance gains in specific cases.\nProject page: https://marigoldmonodepth.github.io.\n","authors":["Bingxin Ke","Anton Obukhov","Shengyu Huang","Nando Metzger","Rodrigo Caye Daudt","Konrad Schindler"],"pdf_url":"https://arxiv.org/pdf/2312.02145v2.pdf","comment":"CVPR 2024 camera ready"},{"id":"http://arxiv.org/abs/2306.09320v4","updated":"2024-04-03T14:09:58Z","published":"2023-06-15T17:55:05Z","title":"Learnable Weight Initialization for Volumetric Medical Image\n Segmentation","summary":" Hybrid volumetric medical image segmentation models, combining the advantages\nof local convolution and global attention, have recently received considerable\nattention. While mainly focusing on architectural modifications, most existing\nhybrid approaches still use conventional data-independent weight initialization\nschemes which restrict their performance due to ignoring the inherent\nvolumetric nature of the medical data. To address this issue, we propose a\nlearnable weight initialization approach that utilizes the available medical\ntraining data to effectively learn the contextual and structural cues via the\nproposed self-supervised objectives. Our approach is easy to integrate into any\nhybrid model and requires no external training data. Experiments on multi-organ\nand lung cancer segmentation tasks demonstrate the effectiveness of our\napproach, leading to state-of-the-art segmentation performance. Our proposed\ndata-dependent initialization approach performs favorably as compared to the\nSwin-UNETR model pretrained using large-scale datasets on multi-organ\nsegmentation task. Our source code and models are available at:\nhttps://github.com/ShahinaKK/LWI-VMS.\n","authors":["Shahina Kunhimon","Abdelrahman Shaker","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2306.09320v4.pdf","comment":"Accepted at Elsevier AI in Medicine Journal"},{"id":"http://arxiv.org/abs/2404.02759v1","updated":"2024-04-03T14:05:39Z","published":"2024-04-03T14:05:39Z","title":"Unsupervised Occupancy Learning from Sparse Point Cloud","summary":" Implicit Neural Representations have gained prominence as a powerful\nframework for capturing complex data modalities, encompassing a wide range from\n3D shapes to images and audio. Within the realm of 3D shape representation,\nNeural Signed Distance Functions (SDF) have demonstrated remarkable potential\nin faithfully encoding intricate shape geometry. However, learning SDFs from 3D\npoint clouds in the absence of ground truth supervision remains a very\nchallenging task. In this paper, we propose a method to infer occupancy fields\ninstead of SDFs as they are easier to learn from sparse inputs. We leverage a\nmargin-based uncertainty measure to differentially sample from the decision\nboundary of the occupancy function and supervise the sampled boundary points\nusing the input point cloud. 
We further stabilize the optimization process at\nthe early stages of the training by biasing the occupancy function towards\nminimal entropy fields while maximizing its entropy at the input point cloud.\nThrough extensive experiments and evaluations, we illustrate the efficacy of\nour proposed method, highlighting its capacity to improve implicit shape\ninference with respect to baselines and the state-of-the-art using synthetic\nand real data.\n","authors":["Amine Ouasfi","Adnane Boukhayma"],"pdf_url":"https://arxiv.org/pdf/2404.02759v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02755v1","updated":"2024-04-03T13:57:08Z","published":"2024-04-03T13:57:08Z","title":"DIBS: Enhancing Dense Video Captioning with Unlabeled Videos via Pseudo\n Boundary Enrichment and Online Refinement","summary":" We present Dive Into the BoundarieS (DIBS), a novel pretraining framework for\ndense video captioning (DVC), that elaborates on improving the quality of the\ngenerated event captions and their associated pseudo event boundaries from\nunlabeled videos. By leveraging the capabilities of diverse large language\nmodels (LLMs), we generate rich DVC-oriented caption candidates and optimize\nthe corresponding pseudo boundaries under several meticulously designed\nobjectives, considering diversity, event-centricity, temporal ordering, and\ncoherence. Moreover, we further introduce a novel online boundary refinement\nstrategy that iteratively improves the quality of pseudo boundaries during\ntraining. Comprehensive experiments have been conducted to examine the\neffectiveness of the proposed technique components. By leveraging a substantial\namount of unlabeled video data, such as HowTo100M, we achieve a remarkable\nadvancement on standard DVC datasets like YouCook2 and ActivityNet. We\noutperform the previous state-of-the-art Vid2Seq across a majority of metrics,\nachieving this with just 0.4% of the unlabeled video data used for pre-training\nby Vid2Seq.\n","authors":["Hao Wu","Huabin Liu","Yu Qiao","Xiao Sun"],"pdf_url":"https://arxiv.org/pdf/2404.02755v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02747v1","updated":"2024-04-03T13:44:41Z","published":"2024-04-03T13:44:41Z","title":"Cross-Attention Makes Inference Cumbersome in Text-to-Image Diffusion\n Models","summary":" This study explores the role of cross-attention during inference in\ntext-conditional diffusion models. We find that cross-attention outputs\nconverge to a fixed point after few inference steps. Accordingly, the time\npoint of convergence naturally divides the entire inference process into two\nstages: an initial semantics-planning stage, during which, the model relies on\ncross-attention to plan text-oriented visual semantics, and a subsequent\nfidelity-improving stage, during which the model tries to generate images from\npreviously planned semantics. Surprisingly, ignoring text conditions in the\nfidelity-improving stage not only reduces computation complexity, but also\nmaintains model performance. This yields a simple and training-free method\ncalled TGATE for efficient generation, which caches the cross-attention output\nonce it converges and keeps it fixed during the remaining inference steps. Our\nempirical study on the MS-COCO validation set confirms its effectiveness. 
The\nsource code of TGATE is available at https://github.com/HaozheLiu-ST/T-GATE.\n","authors":["Wentian Zhang","Haozhe Liu","Jinheng Xie","Francesco Faccio","Mike Zheng Shou","Jürgen Schmidhuber"],"pdf_url":"https://arxiv.org/pdf/2404.02747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15528v2","updated":"2024-04-03T13:40:14Z","published":"2024-03-22T17:27:18Z","title":"Evaluating GPT-4 with Vision on Detection of Radiological Findings on\n Chest Radiographs","summary":" The study examines the application of GPT-4V, a multi-modal large language\nmodel equipped with visual recognition, in detecting radiological findings from\na set of 100 chest radiographs and suggests that GPT-4V is currently not ready\nfor real-world diagnostic usage in interpreting chest radiographs.\n","authors":["Yiliang Zhou","Hanley Ong","Patrick Kennedy","Carol Wu","Jacob Kazam","Keith Hentel","Adam Flanders","George Shih","Yifan Peng"],"pdf_url":"https://arxiv.org/pdf/2403.15528v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02742v1","updated":"2024-04-03T13:39:29Z","published":"2024-04-03T13:39:29Z","title":"LiDAR4D: Dynamic Neural Fields for Novel Space-time View LiDAR Synthesis","summary":" Although neural radiance fields (NeRFs) have achieved triumphs in image novel\nview synthesis (NVS), LiDAR NVS remains largely unexplored. Previous LiDAR NVS\nmethods employ a simple shift from image NVS methods while ignoring the dynamic\nnature and the large-scale reconstruction problem of LiDAR point clouds. In\nlight of this, we propose LiDAR4D, a differentiable LiDAR-only framework for\nnovel space-time LiDAR view synthesis. In consideration of the sparsity and\nlarge-scale characteristics, we design a 4D hybrid representation combined with\nmulti-planar and grid features to achieve effective reconstruction in a\ncoarse-to-fine manner. Furthermore, we introduce geometric constraints derived\nfrom point clouds to improve temporal consistency. For the realistic synthesis\nof LiDAR point clouds, we incorporate the global optimization of ray-drop\nprobability to preserve cross-region patterns. Extensive experiments on\nKITTI-360 and NuScenes datasets demonstrate the superiority of our method in\naccomplishing geometry-aware and time-consistent dynamic reconstruction. Codes\nare available at https://github.com/ispc-lab/LiDAR4D.\n","authors":["Zehan Zheng","Fan Lu","Weiyi Xue","Guang Chen","Changjun Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.02742v1.pdf","comment":"Accepted by CVPR 2024. Project Page:\n https://dyfcalid.github.io/LiDAR4D"},{"id":"http://arxiv.org/abs/2404.02135v2","updated":"2024-04-03T13:36:38Z","published":"2024-04-02T17:48:46Z","title":"ResNet with Integrated Convolutional Block Attention Module for Ship\n Classification Using Transfer Learning on Optical Satellite Imagery","summary":" This study proposes a novel transfer learning framework for effective ship\nclassification using high-resolution optical remote sensing satellite imagery.\nThe framework is based on the deep convolutional neural network model ResNet50\nand incorporates the Convolutional Block Attention Module (CBAM) to enhance\nperformance. CBAM enables the model to attend to salient features in the\nimages, allowing it to better discriminate between subtle differences between\nships and backgrounds. Furthermore, this study adopts a transfer learning\napproach tailored for accurately classifying diverse types of ships by\nfine-tuning a pre-trained model for the specific task. 
Experimental results\ndemonstrate the efficacy of the proposed framework in ship classification using\noptical remote sensing imagery, achieving a high classification accuracy of 94%\nacross 5 classes, outperforming existing methods. This research holds potential\napplications in maritime surveillance and management, illegal fishing\ndetection, and maritime traffic monitoring.\n","authors":["Ryan Donghan Kwon","Gangjoo Robin Nam","Jisoo Tak","Yeom Hyeok","Junseob Shin","Hyerin Cha","Kim Soo Bin"],"pdf_url":"https://arxiv.org/pdf/2404.02135v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02738v1","updated":"2024-04-03T13:35:51Z","published":"2024-04-03T13:35:51Z","title":"Adaptive Affinity-Based Generalization For MRI Imaging Segmentation\n Across Resource-Limited Settings","summary":" The joint utilization of diverse data sources for medical imaging\nsegmentation has emerged as a crucial area of research, aiming to address\nchallenges such as data heterogeneity, domain shift, and data quality\ndiscrepancies. Integrating information from multiple data domains has shown\npromise in improving model generalizability and adaptability. However, this\napproach often demands substantial computational resources, hindering its\npracticality. In response, knowledge distillation (KD) has garnered attention\nas a solution. KD involves training light-weight models to emulate the behavior\nof more resource-intensive models, thereby mitigating the computational burden\nwhile maintaining performance. This paper addresses the pressing need to\ndevelop a lightweight and generalizable model for medical imaging segmentation\nthat can effectively handle data integration challenges. Our proposed approach\nintroduces a novel relation-based knowledge framework by seamlessly combining\nadaptive affinity-based and kernel-based distillation through a gram matrix\nthat can capture the style representation across features. This methodology\nempowers the student model to accurately replicate the feature representations\nof the teacher model, facilitating robust performance even in the face of\ndomain shift and data heterogeneity. To validate our innovative approach, we\nconducted experiments on publicly available multi-source prostate MRI data. The\nresults demonstrate a significant enhancement in segmentation performance using\nlightweight networks. Notably, our method achieves this improvement while\nreducing both inference time and storage usage, rendering it a practical and\nefficient solution for real-time medical imaging segmentation.\n","authors":["Eddardaa B. Loussaief","Mohammed Ayad","Domenc Puig","Hatem A. Rashwan"],"pdf_url":"https://arxiv.org/pdf/2404.02738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02733v1","updated":"2024-04-03T13:34:09Z","published":"2024-04-03T13:34:09Z","title":"InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image\n Generation","summary":" Tuning-free diffusion-based models have demonstrated significant potential in\nthe realm of image personalization and customization. However, despite this\nnotable progress, current models continue to grapple with several complex\nchallenges in producing style-consistent image generation. Firstly, the concept\nof style is inherently underdetermined, encompassing a multitude of elements\nsuch as color, material, atmosphere, design, and structure, among others.\nSecondly, inversion-based methods are prone to style degradation, often\nresulting in the loss of fine-grained details. 
Lastly, adapter-based approaches\nfrequently require meticulous weight tuning for each reference image to achieve\na balance between style intensity and text controllability. In this paper, we\ncommence by examining several compelling yet frequently overlooked\nobservations. We then proceed to introduce InstantStyle, a framework designed\nto address these issues through the implementation of two key strategies: 1) A\nstraightforward mechanism that decouples style and content from reference\nimages within the feature space, predicated on the assumption that features\nwithin the same space can be either added to or subtracted from one another. 2)\nThe injection of reference image features exclusively into style-specific\nblocks, thereby preventing style leaks and eschewing the need for cumbersome\nweight tuning, which often characterizes more parameter-heavy designs.Our work\ndemonstrates superior visual stylization outcomes, striking an optimal balance\nbetween the intensity of style and the controllability of textual elements. Our\ncodes will be available at https://github.com/InstantStyle/InstantStyle.\n","authors":["Haofan Wang","Qixun Wang","Xu Bai","Zekui Qin","Anthony Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02733v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2404.02731v1","updated":"2024-04-03T13:30:56Z","published":"2024-04-03T13:30:56Z","title":"Event Camera Demosaicing via Swin Transformer and Pixel-focus Loss","summary":" Recent research has highlighted improvements in high-quality imaging guided\nby event cameras, with most of these efforts concentrating on the RGB domain.\nHowever, these advancements frequently neglect the unique challenges introduced\nby the inherent flaws in the sensor design of event cameras in the RAW domain.\nSpecifically, this sensor design results in the partial loss of pixel values,\nposing new challenges for RAW domain processes like demosaicing. The challenge\nintensifies as most research in the RAW domain is based on the premise that\neach pixel contains a value, making the straightforward adaptation of these\nmethods to event camera demosaicing problematic. To end this, we present a\nSwin-Transformer-based backbone and a pixel-focus loss function for demosaicing\nwith missing pixel values in RAW domain processing. Our core motivation is to\nrefine a general and widely applicable foundational model from the RGB domain\nfor RAW domain processing, thereby broadening the model's applicability within\nthe entire imaging process. Our method harnesses multi-scale processing and\nspace-to-depth techniques to ensure efficiency and reduce computing complexity.\nWe also proposed the Pixel-focus Loss function for network fine-tuning to\nimprove network convergence based on our discovery of a long-tailed\ndistribution in training loss. Our method has undergone validation on the MIPI\nDemosaic Challenge dataset, with subsequent analytical experimentation\nconfirming its efficacy. 
All code and trained models are released here:\nhttps://github.com/yunfanLu/ev-demosaic\n","authors":["Yunfan Lu","Yijie Xu","Wenzong Ma","Weiyu Guo","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2404.02731v1.pdf","comment":"Accepted for the CVPR 2024 Workshop on Mobile Intelligent Photography\n & Imaging"},{"id":"http://arxiv.org/abs/2404.02726v1","updated":"2024-04-03T13:27:54Z","published":"2024-04-03T13:27:54Z","title":"Harnessing the Power of Large Vision Language Models for Synthetic Image\n Detection","summary":" In recent years, the emergence of models capable of generating images from\ntext has attracted considerable interest, offering the possibility of creating\nrealistic images from text descriptions. Yet these advances have also raised\nconcerns about the potential misuse of these images, including the creation of\nmisleading content such as fake news and propaganda. This study investigates\nthe effectiveness of using advanced vision-language models (VLMs) for synthetic\nimage identification. Specifically, the focus is on tuning state-of-the-art\nimage captioning models for synthetic image detection. By harnessing the robust\nunderstanding capabilities of large VLMs, the aim is to distinguish authentic\nimages from synthetic images produced by diffusion-based models. This study\ncontributes to the advancement of synthetic image detection by exploiting the\ncapabilities of visual language models such as BLIP-2 and ViTGPT2. By tailoring\nimage captioning models, we address the challenges associated with the\npotential misuse of synthetic images in real-world applications. Results\ndescribed in this paper highlight the promising role of VLMs in the field of\nsynthetic image detection, outperforming conventional image-based detection\ntechniques. Code and models can be found at\nhttps://github.com/Mamadou-Keita/VLM-DETECT.\n","authors":["Mamadou Keita","Wassim Hamidouche","Hassen Bougueffa","Abdenour Hadid","Abdelmalik Taleb-Ahmed"],"pdf_url":"https://arxiv.org/pdf/2404.02726v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2404.01959"},{"id":"http://arxiv.org/abs/2312.09056v2","updated":"2024-04-03T13:09:27Z","published":"2023-12-14T15:53:07Z","title":"ReCoRe: Regularized Contrastive Representation Learning of World Model","summary":" While recent model-free Reinforcement Learning (RL) methods have demonstrated\nhuman-level effectiveness in gaming environments, their success in everyday\ntasks like visual navigation has been limited, particularly under significant\nappearance variations. This limitation arises from (i) poor sample efficiency\nand (ii) over-fitting to training scenarios. To address these challenges, we\npresent a world model that learns invariant features using (i) contrastive\nunsupervised learning and (ii) an intervention-invariant regularizer. Learning\nan explicit representation of the world dynamics i.e. a world model, improves\nsample efficiency while contrastive learning implicitly enforces learning of\ninvariant features, which improves generalization. However, the na\\\"ive\nintegration of contrastive loss to world models is not good enough, as\nworld-model-based RL methods independently optimize representation learning and\nagent policy. To overcome this issue, we propose an intervention-invariant\nregularizer in the form of an auxiliary task such as depth prediction, image\ndenoising, image segmentation, etc., that explicitly enforces invariance to\nstyle interventions. 
Our method outperforms current state-of-the-art\nmodel-based and model-free RL methods and significantly improves on\nout-of-distribution point navigation tasks evaluated on the iGibson benchmark.\nWith only visual observations, we further demonstrate that our approach\noutperforms recent language-guided foundation models for point navigation,\nwhich is essential for deployment on robots with limited computation\ncapabilities. Finally, we demonstrate that our proposed model excels at the\nsim-to-real transfer of its perception module on the Gibson benchmark.\n","authors":["Rudra P. K. Poudel","Harit Pandya","Stephan Liwicki","Roberto Cipolla"],"pdf_url":"https://arxiv.org/pdf/2312.09056v2.pdf","comment":"Accepted at CVPR 2024. arXiv admin note: text overlap with\n arXiv:2209.14932"},{"id":"http://arxiv.org/abs/2403.13352v3","updated":"2024-04-03T13:08:55Z","published":"2024-03-20T07:31:07Z","title":"AGFSync: Leveraging AI-Generated Feedback for Preference Optimization in\n Text-to-Image Generation","summary":" Text-to-Image (T2I) diffusion models have achieved remarkable success in\nimage generation. Despite their progress, challenges remain in both\nprompt-following ability, image quality and lack of high-quality datasets,\nwhich are essential for refining these models. As acquiring labeled data is\ncostly, we introduce AGFSync, a framework that enhances T2I diffusion models\nthrough Direct Preference Optimization (DPO) in a fully AI-driven approach.\nAGFSync utilizes Vision-Language Models (VLM) to assess image quality across\nstyle, coherence, and aesthetics, generating feedback data within an AI-driven\nloop. By applying AGFSync to leading T2I models such as SD v1.4, v1.5, and\nSDXL, our extensive experiments on the TIFA dataset demonstrate notable\nimprovements in VQA scores, aesthetic evaluations, and performance on the HPSv2\nbenchmark, consistently outperforming the base models. AGFSync's method of\nrefining T2I diffusion models paves the way for scalable alignment techniques.\n","authors":["Jingkun An","Yinghao Zhu","Zongjian Li","Haoran Feng","Bohua Chen","Yemin Shi","Chengwei Pan"],"pdf_url":"https://arxiv.org/pdf/2403.13352v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05073v4","updated":"2024-04-03T13:07:55Z","published":"2023-09-10T16:42:11Z","title":"FreeMan: Towards Benchmarking 3D Human Pose Estimation under Real-World\n Conditions","summary":" Estimating the 3D structure of the human body from natural scenes is a\nfundamental aspect of visual perception. 3D human pose estimation is a vital\nstep in advancing fields like AIGC and human-robot interaction, serving as a\ncrucial technique for understanding and interacting with human actions in\nreal-world settings. However, the current datasets, often collected under\nsingle laboratory conditions using complex motion capture equipment and\nunvarying backgrounds, are insufficient. The absence of datasets on variable\nconditions is stalling the progress of this crucial task. To facilitate the\ndevelopment of 3D pose estimation, we present FreeMan, the first large-scale,\nmulti-view dataset collected under the real-world conditions. FreeMan was\ncaptured by synchronizing 8 smartphones across diverse scenarios. It comprises\n11M frames from 8000 sequences, viewed from different perspectives. These\nsequences cover 40 subjects across 10 different scenarios, each with varying\nlighting conditions. 
We have also established an semi-automated pipeline\ncontaining error detection to reduce the workload of manual check and ensure\nprecise annotation. We provide comprehensive evaluation baselines for a range\nof tasks, underlining the significant challenges posed by FreeMan. Further\nevaluations of standard indoor/outdoor human sensing datasets reveal that\nFreeMan offers robust representation transferability in real and complex\nscenes. Code and data are available at https://wangjiongw.github.io/freeman.\n","authors":["Jiong Wang","Fengyu Yang","Wenbo Gou","Bingliang Li","Danqi Yan","Ailing Zeng","Yijun Gao","Junle Wang","Yanqing Jing","Ruimao Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.05073v4.pdf","comment":"CVPR2024 camera ready version. 19 pages, 16 figures. Project page:\n https://wangjiongw.github.io/freeman/ ; API:\n https://github.com/wangjiongw/FreeMan_API"},{"id":"http://arxiv.org/abs/2404.02697v1","updated":"2024-04-03T12:54:16Z","published":"2024-04-03T12:54:16Z","title":"Model-agnostic Origin Attribution of Generated Images with Few-shot\n Examples","summary":" Recent progress in visual generative models enables the generation of\nhigh-quality images. To prevent the misuse of generated images, it is important\nto identify the origin model that generates them. In this work, we study the\norigin attribution of generated images in a practical setting where only a few\nimages generated by a source model are available and the source model cannot be\naccessed. The goal is to check if a given image is generated by the source\nmodel. We first formulate this problem as a few-shot one-class classification\ntask. To solve the task, we propose OCC-CLIP, a CLIP-based framework for\nfew-shot one-class classification, enabling the identification of an image's\nsource model, even among multiple candidates. Extensive experiments\ncorresponding to various generative models verify the effectiveness of our\nOCC-CLIP framework. Furthermore, an experiment based on the recently released\nDALL-E 3 API verifies the real-world applicability of our solution.\n","authors":["Fengyuan Liu","Haochen Luo","Yiming Li","Philip Torr","Jindong Gu"],"pdf_url":"https://arxiv.org/pdf/2404.02697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12685v2","updated":"2024-04-03T12:47:15Z","published":"2023-09-22T07:51:17Z","title":"eWand: A calibration framework for wide baseline frame-based and\n event-based camera systems","summary":" Accurate calibration is crucial for using multiple cameras to triangulate the\nposition of objects precisely. However, it is also a time-consuming process\nthat needs to be repeated for every displacement of the cameras. The standard\napproach is to use a printed pattern with known geometry to estimate the\nintrinsic and extrinsic parameters of the cameras. The same idea can be applied\nto event-based cameras, though it requires extra work. By using frame\nreconstruction from events, a printed pattern can be detected. A blinking\npattern can also be displayed on a screen. Then, the pattern can be directly\ndetected from the events. Such calibration methods can provide accurate\nintrinsic calibration for both frame- and event-based cameras. However, using\n2D patterns has several limitations for multi-camera extrinsic calibration,\nwith cameras possessing highly different points of view and a wide baseline.\nThe 2D pattern can only be detected from one direction and needs to be of\nsignificant size to compensate for its distance to the camera. 
This makes the\nextrinsic calibration time-consuming and cumbersome. To overcome these\nlimitations, we propose eWand, a new method that uses blinking LEDs inside\nopaque spheres instead of a printed or displayed pattern. Our method provides a\nfaster, easier-to-use extrinsic calibration approach that maintains high\naccuracy for both event- and frame-based cameras.\n","authors":["Thomas Gossard","Andreas Ziegler","Levin Kolmar","Jonas Tebbe","Andreas Zell"],"pdf_url":"https://arxiv.org/pdf/2309.12685v2.pdf","comment":"Accepted for 2024 IEEE International Conference on Robotics and\n Automation (ICRA 2024). Project web page:\n https://cogsys-tuebingen.github.io/ewand/"},{"id":"http://arxiv.org/abs/2401.05827v2","updated":"2024-04-03T12:42:32Z","published":"2024-01-11T10:52:17Z","title":"Hallucination Benchmark in Medical Visual Question Answering","summary":" The recent success of large language and vision models (LLVMs) on vision\nquestion answering (VQA), particularly their applications in medicine\n(Med-VQA), has shown a great potential of realizing effective visual assistants\nfor healthcare. However, these models are not extensively tested on the\nhallucination phenomenon in clinical settings. Here, we created a hallucination\nbenchmark of medical images paired with question-answer sets and conducted a\ncomprehensive evaluation of the state-of-the-art models. The study provides an\nin-depth analysis of current models' limitations and reveals the effectiveness\nof various prompting strategies.\n","authors":["Jinge Wu","Yunsoo Kim","Honghan Wu"],"pdf_url":"https://arxiv.org/pdf/2401.05827v2.pdf","comment":"Accepted to ICLR 2024 Tiny Papers(Notable)"},{"id":"http://arxiv.org/abs/2404.02686v1","updated":"2024-04-03T12:32:13Z","published":"2024-04-03T12:32:13Z","title":"Design2Cloth: 3D Cloth Generation from 2D Masks","summary":" In recent years, there has been a significant shift in the field of digital\navatar research, towards modeling, animating and reconstructing clothed human\nrepresentations, as a key step towards creating realistic avatars. However,\ncurrent 3D cloth generation methods are garment specific or trained completely\non synthetic data, hence lacking fine details and realism. In this work, we\nmake a step towards automatic realistic garment design and propose\nDesign2Cloth, a high fidelity 3D generative model trained on a real world\ndataset from more than 2000 subject scans. To provide vital contribution to the\nfashion industry, we developed a user-friendly adversarial model capable of\ngenerating diverse and detailed clothes simply by drawing a 2D cloth mask.\nUnder a series of both qualitative and quantitative experiments, we showcase\nthat Design2Cloth outperforms current state-of-the-art cloth generative models\nby a large margin. In addition to the generative properties of our network, we\nshowcase that the proposed method can be used to achieve high quality\nreconstructions from single in-the-wild images and 3D scans. 
Dataset, code and\npre-trained model will become publicly available.\n","authors":["Jiali Zheng","Rolandos Alexandros Potamias","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2404.02686v1.pdf","comment":"Accepted to CVPR 2024, Project page:\n https://jiali-zheng.github.io/Design2Cloth/"},{"id":"http://arxiv.org/abs/2404.02678v1","updated":"2024-04-03T12:21:41Z","published":"2024-04-03T12:21:41Z","title":"Independently Keypoint Learning for Small Object Semantic Correspondence","summary":" Semantic correspondence remains a challenging task for establishing\ncorrespondences between a pair of images with the same category or similar\nscenes due to the large intra-class appearance. In this paper, we introduce a\nnovel problem called 'Small Object Semantic Correspondence (SOSC).' This\nproblem is challenging due to the close proximity of keypoints associated with\nsmall objects, which results in the fusion of these respective features. It is\ndifficult to identify the corresponding key points of the fused features, and\nit is also difficult to be recognized. To address this challenge, we propose\nthe Keypoint Bounding box-centered Cropping (KBC) method, which aims to\nincrease the spatial separation between keypoints of small objects, thereby\nfacilitating independent learning of these keypoints. The KBC method is\nseamlessly integrated into our proposed inference pipeline and can be easily\nincorporated into other methodologies, resulting in significant performance\nenhancements. Additionally, we introduce a novel framework, named KBCNet, which\nserves as our baseline model. KBCNet comprises a Cross-Scale Feature Alignment\n(CSFA) module and an efficient 4D convolutional decoder. The CSFA module is\ndesigned to align multi-scale features, enriching keypoint representations by\nintegrating fine-grained features and deep semantic features. Meanwhile, the 4D\nconvolutional decoder, based on efficient 4D convolution, ensures efficiency\nand rapid convergence. To empirically validate the effectiveness of our\nproposed methodology, extensive experiments are conducted on three widely used\nbenchmarks: PF-PASCAL, PF-WILLOW, and SPair-71k. Our KBC method demonstrates a\nsubstantial performance improvement of 7.5\\% on the SPair-71K dataset,\nproviding compelling evidence of its efficacy.\n","authors":["Hailong Jin","Huiying Li"],"pdf_url":"https://arxiv.org/pdf/2404.02678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16926v5","updated":"2024-04-03T12:08:08Z","published":"2023-11-28T16:31:27Z","title":"LLaFS: When Large Language Models Meet Few-Shot Segmentation","summary":" This paper proposes LLaFS, the first attempt to leverage large language\nmodels (LLMs) in few-shot segmentation. In contrast to the conventional\nfew-shot segmentation methods that only rely on the limited and biased\ninformation from the annotated support images, LLaFS leverages the vast prior\nknowledge gained by LLM as an effective supplement and directly uses the LLM to\nsegment images in a few-shot manner. To enable the text-based LLM to handle\nimage-related tasks, we carefully design an input instruction that allows the\nLLM to produce segmentation results represented as polygons, and propose a\nregion-attribute table to simulate the human visual mechanism and provide\nmulti-modal guidance. We also synthesize pseudo samples and use curriculum\nlearning for pretraining to augment data and achieve better optimization. 
LLaFS\nachieves state-of-the-art results on multiple datasets, showing the potential\nof using LLMs for few-shot computer vision tasks.\n","authors":["Lanyun Zhu","Tianrun Chen","Deyi Ji","Jieping Ye","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16926v5.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2404.02668v1","updated":"2024-04-03T12:06:01Z","published":"2024-04-03T12:06:01Z","title":"RS-Mamba for Large Remote Sensing Image Dense Prediction","summary":" The spatial resolution of remote sensing images is becoming increasingly\nhigher, posing challenges in handling large very-high-resolution (VHR) remote\nsensing images for dense prediction tasks. Models based on convolutional neural\nnetworks are limited in their ability to model global features of remote\nsensing images due to local convolution operations. Transformer based models,\ndespite their global modeling capabilities, face computational challenges with\nlarge VHR images due to their quadratic complexity. The common practice of\ncropping large images into smaller patches leads to a significant loss of\ncontextual information. To address these issues, we propose the Remote Sensing\nMamba (RSM) for dense prediction tasks in VHR remote sensing. RSM is designed\nto model global features of remote sensing images with linear complexity,\nenabling it to process large VHR images effectively. It employs an\nomnidirectional selective scan module to globally model the images in multiple\ndirections, capturing large spatial features from various directions.\nExperiments on semantic segmentation and change detection tasks across various\nobjects demonstrate the effectiveness of RSM. With simple model architecture\nand training approach, RSM achieves state-of-the-art performance on the dense\nprediction tasks of VHR remote sensing. The code for this work will be\navailable at https://github.com/walking-shadow/Official_Remote_Sensing_Mamba.\n","authors":["Sijie Zhao","Hao Chen","Xueliang Zhang","Pengfeng Xiao","Lei Bai","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2404.02668v1.pdf","comment":"13 pages,6 figures"},{"id":"http://arxiv.org/abs/2404.02659v1","updated":"2024-04-03T11:47:20Z","published":"2024-04-03T11:47:20Z","title":"A Satellite Band Selection Framework for Amazon Forest Deforestation\n Detection Task","summary":" The conservation of tropical forests is a topic of significant social and\necological relevance due to their crucial role in the global ecosystem.\nUnfortunately, deforestation and degradation impact millions of hectares\nannually, necessitating government or private initiatives for effective forest\nmonitoring. This study introduces a novel framework that employs the Univariate\nMarginal Distribution Algorithm (UMDA) to select spectral bands from Landsat-8\nsatellite, optimizing the representation of deforested areas. This selection\nguides a semantic segmentation architecture, DeepLabv3+, enhancing its\nperformance. Experimental results revealed several band compositions that\nachieved superior balanced accuracy compared to commonly adopted combinations\nfor deforestation detection, utilizing segment classification via a Support\nVector Machine (SVM). Moreover, the optimal band compositions identified by the\nUMDA-based approach improved the performance of the DeepLabv3+ architecture,\nsurpassing state-of-the-art approaches compared in this study. The observation\nthat a few selected bands outperform the total contradicts the data-driven\nparadigm prevalent in the deep learning field. 
Therefore, this suggests an\nexception to the conventional wisdom that 'more is always better'.\n","authors":["Eduardo Neto","Fabio A. Faria","Amanda A. S. de Oliveira","Álvaro L. Fazenda"],"pdf_url":"https://arxiv.org/pdf/2404.02659v1.pdf","comment":"9 pages, 4 figures, paper accepted for presentation at GECCO 2024"},{"id":"http://arxiv.org/abs/2304.08069v3","updated":"2024-04-03T11:46:48Z","published":"2023-04-17T08:30:02Z","title":"DETRs Beat YOLOs on Real-time Object Detection","summary":" The YOLO series has become the most popular framework for real-time object\ndetection due to its reasonable trade-off between speed and accuracy. However,\nwe observe that the speed and accuracy of YOLOs are negatively affected by the\nNMS. Recently, end-to-end Transformer-based detectors (DETRs) have provided an\nalternative to eliminating NMS. Nevertheless, the high computational cost\nlimits their practicality and hinders them from fully exploiting the advantage\nof excluding NMS. In this paper, we propose the Real-Time DEtection TRansformer\n(RT-DETR), the first real-time end-to-end object detector to our best knowledge\nthat addresses the above dilemma. We build RT-DETR in two steps, drawing on the\nadvanced DETR: first we focus on maintaining accuracy while improving speed,\nfollowed by maintaining speed while improving accuracy. Specifically, we design\nan efficient hybrid encoder to expeditiously process multi-scale features by\ndecoupling intra-scale interaction and cross-scale fusion to improve speed.\nThen, we propose the uncertainty-minimal query selection to provide\nhigh-quality initial queries to the decoder, thereby improving accuracy. In\naddition, RT-DETR supports flexible speed tuning by adjusting the number of\ndecoder layers to adapt to various scenarios without retraining. Our\nRT-DETR-R50 / R101 achieves 53.1% / 54.3% AP on COCO and 108 / 74 FPS on T4\nGPU, outperforming previously advanced YOLOs in both speed and accuracy. We\nalso develop scaled RT-DETRs that outperform the lighter YOLO detectors (S and\nM models). Furthermore, RT-DETR-R50 outperforms DINO-R50 by 2.2% AP in accuracy\nand about 21 times in FPS. After pre-training with Objects365, RT-DETR-R50 /\nR101 achieves 55.3% / 56.2% AP. The project page:\nhttps://zhao-yian.github.io/RTDETR.\n","authors":["Yian Zhao","Wenyu Lv","Shangliang Xu","Jinman Wei","Guanzhong Wang","Qingqing Dang","Yi Liu","Jie Chen"],"pdf_url":"https://arxiv.org/pdf/2304.08069v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02656v1","updated":"2024-04-03T11:37:03Z","published":"2024-04-03T11:37:03Z","title":"Non-negative Subspace Feature Representation for Few-shot Learning in\n Medical Imaging","summary":" Unlike typical visual scene recognition domains, in which massive datasets\nare accessible to deep neural networks, medical image interpretations are often\nobstructed by the paucity of data. In this paper, we investigate the\neffectiveness of data-based few-shot learning in medical imaging by exploring\ndifferent data attribute representations in a low-dimensional space. We\nintroduce different types of non-negative matrix factorization (NMF) in\nfew-shot learning, addressing the data scarcity issue in medical image\nclassification. 
Extensive empirical studies are conducted in terms of\nvalidating the effectiveness of NMF, especially its supervised variants (e.g.,\ndiscriminative NMF, and supervised and constrained NMF with sparseness), and\nthe comparison with principal component analysis (PCA), i.e., the collaborative\nrepresentation-based dimensionality reduction technique derived from\neigenvectors. With 14 different datasets covering 11 distinct illness\ncategories, thorough experimental results and comparison with related\ntechniques demonstrate that NMF is a competitive alternative to PCA for\nfew-shot learning in medical imaging, and the supervised NMF algorithms are\nmore discriminative in the subspace with greater effectiveness. Furthermore, we\nshow that the part-based representation of NMF, especially its supervised\nvariants, is dramatically impactful in detecting lesion areas in medical\nimaging with limited samples.\n","authors":["Keqiang Fan","Xiaohao Cai","Mahesan Niranjan"],"pdf_url":"https://arxiv.org/pdf/2404.02656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11389v2","updated":"2024-04-03T11:33:14Z","published":"2023-03-20T18:49:39Z","title":"Creating Ensembles of Classifiers through UMDA for Aerial Scene\n Classification","summary":" Aerial scene classification, which aims to semantically label remote sensing\nimages in a set of predefined classes (e.g., agricultural, beach, and harbor),\nis a very challenging task in remote sensing due to high intra-class\nvariability and the different scales and orientations of the objects present in\nthe dataset images. In remote sensing area, the use of CNN architectures as an\nalternative solution is also a reality for scene classification tasks.\nGenerally, these CNNs are used to perform the traditional image classification\ntask. However, another less used way to classify remote sensing image might be\nthe one that uses deep metric learning (DML) approaches. In this sense, this\nwork proposes to employ six DML approaches for aerial scene classification\ntasks, analysing their behave with four different pre-trained CNNs as well as\ncombining them through the use of evolutionary computation algorithm (UMDA). In\nperformed experiments, it is possible to observe than DML approaches can\nachieve the best classification results when compared to traditional\npre-trained CNNs for three well-known remote sensing aerial scene datasets. In\naddition, the UMDA algorithm proved to be a promising strategy to combine DML\napproaches when there is diversity among them, managing to improve at least\n5.6% of accuracy in the classification results using almost 50\\% of the\navailable classifiers for the construction of the final ensemble of\nclassifiers.\n","authors":["Fabio A. Faria","Luiz H. Buris","Luis A. M. Pereira","Fábio A. M. Cappabianco"],"pdf_url":"https://arxiv.org/pdf/2303.11389v2.pdf","comment":"9 pages, 4 figures, accepted for presentation at the GECCO2024"},{"id":"http://arxiv.org/abs/2401.15204v4","updated":"2024-04-03T11:12:16Z","published":"2024-01-26T21:02:44Z","title":"LYT-Net: Lightweight YUV Transformer-based Network for Low-Light Image\n Enhancement","summary":" In recent years, deep learning-based solutions have proven successful in the\ndomains of image enhancement. 
This paper introduces LYT-Net, or Lightweight YUV\nTransformer-based Network, as a novel approach for low-light image enhancement.\nThe proposed architecture, distinct from conventional Retinex-based models,\nleverages the YUV color space's natural separation of luminance (Y) and\nchrominance (U and V) to simplify the intricate task of disentangling light and\ncolor information in images. By utilizing the strengths of transformers, known\nfor their capability to capture long-range dependencies, LYT-Net ensures a\ncomprehensive contextual understanding of the image while maintaining reduced\nmodel complexity. By employing a novel hybrid loss function, our proposed\nmethod achieves state-of-the-art results on low-light image enhancement\ndatasets, all while being considerably more compact than its counterparts. The\nsource code and pre-trained models are available at\nhttps://github.com/albrateanu/LYT-Net\n","authors":["A. Brateanu","R. Balmez","A. Avram","C. Orhei"],"pdf_url":"https://arxiv.org/pdf/2401.15204v4.pdf","comment":"10 pages, 6 figures, submitted to ICIP"},{"id":"http://arxiv.org/abs/2212.05315v3","updated":"2024-04-03T11:03:52Z","published":"2022-12-10T14:49:24Z","title":"Mind The Edge: Refining Depth Edges in Sparsely-Supervised Monocular\n Depth Estimation","summary":" Monocular Depth Estimation (MDE) is a fundamental problem in computer vision\nwith numerous applications. Recently, LIDAR-supervised methods have achieved\nremarkable per-pixel depth accuracy in outdoor scenes. However, significant\nerrors are typically found in the proximity of depth discontinuities, i.e.,\ndepth edges, which often hinder the performance of depth-dependent applications\nthat are sensitive to such inaccuracies, e.g., novel view synthesis and\naugmented reality. Since direct supervision for the location of depth edges is\ntypically unavailable in sparse LIDAR-based scenes, encouraging the MDE model\nto produce correct depth edges is not straightforward. To the best of our\nknowledge this paper is the first attempt to address the depth edges issue for\nLIDAR-supervised scenes. In this work we propose to learn to detect the\nlocation of depth edges from densely-supervised synthetic data, and use it to\ngenerate supervision for the depth edges in the MDE training. To quantitatively\nevaluate our approach, and due to the lack of depth edges GT in LIDAR-based\nscenes, we manually annotated subsets of the KITTI and the DDAD datasets with\ndepth edges ground truth. We demonstrate significant gains in the accuracy of\nthe depth edges with comparable per-pixel depth accuracy on several challenging\ndatasets. Code and datasets are available at\n\\url{https://github.com/liortalker/MindTheEdge}.\n","authors":["Lior Talker","Aviad Cohen","Erez Yosef","Alexandra Dana","Michael Dinerstein"],"pdf_url":"https://arxiv.org/pdf/2212.05315v3.pdf","comment":"Appears in CVPR24'"},{"id":"http://arxiv.org/abs/2404.02638v1","updated":"2024-04-03T10:57:47Z","published":"2024-04-03T10:57:47Z","title":"SG-BEV: Satellite-Guided BEV Fusion for Cross-View Semantic Segmentation","summary":" This paper aims at achieving fine-grained building attribute segmentation in\na cross-view scenario, i.e., using satellite and street-view image pairs. The\nmain challenge lies in overcoming the significant perspective differences\nbetween street views and satellite views. In this work, we introduce SG-BEV, a\nnovel approach for satellite-guided BEV fusion for cross-view semantic\nsegmentation. 
To overcome the limitations of existing cross-view projection\nmethods in capturing the complete building facade features, we innovatively\nincorporate Bird's Eye View (BEV) method to establish a spatially explicit\nmapping of street-view features. Moreover, we fully leverage the advantages of\nmultiple perspectives by introducing a novel satellite-guided reprojection\nmodule, optimizing the uneven feature distribution issues associated with\ntraditional BEV methods. Our method demonstrates significant improvements on\nfour cross-view datasets collected from multiple cities, including New York,\nSan Francisco, and Boston. On average across these datasets, our method\nachieves an increase in mIOU by 10.13% and 5.21% compared with the\nstate-of-the-art satellite-based and cross-view methods. The code and datasets\nof this work will be released at https://github.com/yejy53/SG-BEV.\n","authors":["Junyan Ye","Qiyan Luo","Jinhua Yu","Huaping Zhong","Zhimeng Zheng","Conghui He","Weijia Li"],"pdf_url":"https://arxiv.org/pdf/2404.02638v1.pdf","comment":"accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2311.10224v2","updated":"2024-04-03T10:57:10Z","published":"2023-11-16T22:31:05Z","title":"CV-Attention UNet: Attention-based UNet for 3D Cerebrovascular\n Segmentation of Enhanced TOF-MRA Images","summary":" Due to the lack of automated methods, to diagnose cerebrovascular disease,\ntime-of-flight magnetic resonance angiography (TOF-MRA) is assessed visually,\nmaking it time-consuming. The commonly used encoder-decoder architectures for\ncerebrovascular segmentation utilize redundant features, eventually leading to\nthe extraction of low-level features multiple times. Additionally,\nconvolutional neural networks (CNNs) suffer from performance degradation when\nthe batch size is small, and deeper networks experience the vanishing gradient\nproblem. Methods: In this paper, we attempt to solve these limitations and\npropose the 3D cerebrovascular attention UNet method, named CV-AttentionUNet,\nfor precise extraction of brain vessel images. We proposed a sequence of\npreprocessing techniques followed by deeply supervised UNet to improve the\naccuracy of segmentation of the brain vessels leading to a stroke. To combine\nthe low and high semantics, we applied the attention mechanism. This mechanism\nfocuses on relevant associations and neglects irrelevant anatomical\ninformation. Furthermore, the inclusion of deep supervision incorporates\ndifferent levels of features that prove to be beneficial for network\nconvergence. Results: We demonstrate the efficiency of the proposed method by\ncross-validating with an unlabeled dataset, which was further labeled by us. We\nbelieve that the novelty of this algorithm lies in its ability to perform well\non both labeled and unlabeled data with image processing-based enhancement. The\nresults indicate that our method performed better than the existing\nstate-of-the-art methods on the TubeTK dataset. 
Conclusion: The proposed method\nwill help in accurate segmentation of cerebrovascular structure leading to\nstroke\n","authors":["Syed Farhan Abbas","Nguyen Thanh Duc","Yoonguu Song","Kyungwon Kim","Ekta Srivastava","Boreom Lee"],"pdf_url":"https://arxiv.org/pdf/2311.10224v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02634v1","updated":"2024-04-03T10:44:06Z","published":"2024-04-03T10:44:06Z","title":"3DStyleGLIP: Part-Tailored Text-Guided 3D Neural Stylization","summary":" 3D stylization, which entails the application of specific styles to\nthree-dimensional objects, holds significant commercial potential as it enables\nthe creation of diverse 3D objects with distinct moods and styles, tailored to\nspecific demands of different scenes. With recent advancements in text-driven\nmethods and artificial intelligence, the stylization process is increasingly\nintuitive and automated, thereby diminishing the reliance on manual labor and\nexpertise. However, existing methods have predominantly focused on holistic\nstylization, thereby leaving the application of styles to individual components\nof a 3D object unexplored. In response, we introduce 3DStyleGLIP, a novel\nframework specifically designed for text-driven, part-tailored 3D stylization.\nGiven a 3D mesh and a text prompt, 3DStyleGLIP leverages the vision-language\nembedding space of the Grounded Language-Image Pre-training (GLIP) model to\nlocalize the individual parts of the 3D mesh and modify their colors and local\ngeometries to align them with the desired styles specified in the text prompt.\n3DStyleGLIP is effectively trained for 3D stylization tasks through a\npart-level style loss working in GLIP's embedding space, supplemented by two\ncomplementary learning techniques. Extensive experimental validation confirms\nthat our method achieves significant part-wise stylization capabilities,\ndemonstrating promising potential in advancing the field of 3D stylization.\n","authors":["SeungJeh Chung","JooHyun Park","Hyewon Kan","HyeongYeop Kang"],"pdf_url":"https://arxiv.org/pdf/2404.02634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.00553v4","updated":"2024-04-03T10:36:31Z","published":"2023-04-02T15:04:43Z","title":"From Isolated Islands to Pangea: Unifying Semantic Space for Human\n Action Understanding","summary":" Action understanding has attracted long-term attention. It can be formed as\nthe mapping from the physical space to the semantic space. Typically,\nresearchers built datasets according to idiosyncratic choices to define classes\nand push the envelope of benchmarks respectively. Datasets are incompatible\nwith each other like \"Isolated Islands\" due to semantic gaps and various class\ngranularities, e.g., do housework in dataset A and wash plate in dataset B. We\nargue that we need a more principled semantic space to concentrate the\ncommunity efforts and use all datasets together to pursue generalizable action\nlearning. To this end, we design a structured action semantic space given verb\ntaxonomy hierarchy and covering massive actions. By aligning the classes of\nprevious datasets to our semantic space, we gather (image/video/skeleton/MoCap)\ndatasets into a unified database in a unified label system, i.e., bridging\n\"isolated islands\" into a \"Pangea\". Accordingly, we propose a novel model\nmapping from the physical space to semantic space to fully use Pangea. In\nextensive experiments, our new system shows significant superiority, especially\nin transfer learning. 
Our code and data will be made public at\nhttps://mvig-rhos.com/pangea.\n","authors":["Yong-Lu Li","Xiaoqian Wu","Xinpeng Liu","Zehao Wang","Yiming Dou","Yikun Ji","Junyi Zhang","Yixing Li","Jingru Tan","Xudong Lu","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2304.00553v4.pdf","comment":"CVPR 2024, Project Webpage: https://mvig-rhos.com/pangea"},{"id":"http://arxiv.org/abs/2404.02624v1","updated":"2024-04-03T10:25:45Z","published":"2024-04-03T10:25:45Z","title":"Multi-Scale Spatial-Temporal Self-Attention Graph Convolutional Networks\n for Skeleton-based Action Recognition","summary":" Skeleton-based gesture recognition methods have achieved high success using\nGraph Convolutional Network (GCN). In addition, context-dependent adaptive\ntopology as a neighborhood vertex information and attention mechanism leverages\na model to better represent actions. In this paper, we propose self-attention\nGCN hybrid model, Multi-Scale Spatial-Temporal self-attention (MSST)-GCN to\neffectively improve modeling ability to achieve state-of-the-art results on\nseveral datasets. We utilize spatial self-attention module with adaptive\ntopology to understand intra-frame interactions within a frame among different\nbody parts, and temporal self-attention module to examine correlations between\nframes of a node. These two are followed by multi-scale convolution network\nwith dilations, which not only captures the long-range temporal dependencies of\njoints but also the long-range spatial dependencies (i.e., long-distance\ndependencies) of node temporal behaviors. They are combined into high-level\nspatial-temporal representations and output the predicted action with the\nsoftmax classifier.\n","authors":["Ikuo Nakamura"],"pdf_url":"https://arxiv.org/pdf/2404.02624v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2306.05401v3","updated":"2024-04-03T10:16:22Z","published":"2023-06-08T17:52:34Z","title":"RDumb: A simple approach that questions our progress in continual\n test-time adaptation","summary":" Test-Time Adaptation (TTA) allows to update pre-trained models to changing\ndata distributions at deployment time. While early work tested these algorithms\nfor individual fixed distribution shifts, recent work proposed and applied\nmethods for continual adaptation over long timescales. To examine the reported\nprogress in the field, we propose the Continually Changing Corruptions (CCC)\nbenchmark to measure asymptotic performance of TTA techniques. We find that\neventually all but one state-of-the-art methods collapse and perform worse than\na non-adapting model, including models specifically proposed to be robust to\nperformance collapse. In addition, we introduce a simple baseline, \"RDumb\",\nthat periodically resets the model to its pretrained state. RDumb performs\nbetter or on par with the previously proposed state-of-the-art in all\nconsidered benchmarks. 
Our results show that previous TTA approaches are\nneither effective at regularizing adaptation to avoid collapse nor able to\noutperform a simplistic resetting strategy.\n","authors":["Ori Press","Steffen Schneider","Matthias Kümmerer","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2306.05401v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02618v1","updated":"2024-04-03T10:11:22Z","published":"2024-04-03T10:11:22Z","title":"Diffexplainer: Towards Cross-modal Global Explanations with Diffusion\n Models","summary":" We present DiffExplainer, a novel framework that, leveraging language-vision\nmodels, enables multimodal global explainability. DiffExplainer employs\ndiffusion models conditioned on optimized text prompts, synthesizing images\nthat maximize class outputs and hidden features of a classifier, thus providing\na visual tool for explaining decisions. Moreover, the analysis of generated\nvisual descriptions allows for automatic identification of biases and spurious\nfeatures, as opposed to traditional methods that often rely on manual\nintervention. The cross-modal transferability of language-vision models also\nenables the possibility to describe decisions in a more human-interpretable\nway, i.e., through text. We conduct comprehensive experiments, which include an\nextensive user study, demonstrating the effectiveness of DiffExplainer on 1)\nthe generation of high-quality images explaining model decisions, surpassing\nexisting activation maximization methods, and 2) the automated identification\nof biases and spurious features.\n","authors":["Matteo Pennisi","Giovanni Bellitto","Simone Palazzo","Mubarak Shah","Concetto Spampinato"],"pdf_url":"https://arxiv.org/pdf/2404.02618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02617v1","updated":"2024-04-03T10:08:55Z","published":"2024-04-03T10:08:55Z","title":"Neural Radiance Fields with Torch Units","summary":" Neural Radiance Fields (NeRF) give rise to learning-based 3D reconstruction\nmethods widely used in industrial applications. Although prevalent methods\nachieve considerable improvements in small-scale scenes, accomplishing\nreconstruction in complex and large-scale scenes is still challenging. First,\nthe background in complex scenes shows a large variance among different views.\nSecond, the current inference pattern, $i.e.$, a pixel only relies on an\nindividual camera ray, fails to capture contextual information. To solve these\nproblems, we propose to enlarge the ray perception field and build up the\nsample points interactions. In this paper, we design a novel inference pattern\nthat encourages a single camera ray possessing more contextual information, and\nmodels the relationship among sample points on each camera ray. To hold\ncontextual information,a camera ray in our proposed method can render a patch\nof pixels simultaneously. Moreover, we replace the MLP in neural radiance field\nmodels with distance-aware convolutions to enhance the feature propagation\namong sample points from the same camera ray. To summarize, as a torchlight, a\nray in our proposed method achieves rendering a patch of image. Thus, we call\nthe proposed method, Torch-NeRF. 
Extensive experiments on KITTI-360 and LLFF\nshow that the Torch-NeRF exhibits excellent performance.\n","authors":["Bingnan Ni","Huanyu Wang","Dongfeng Bai","Minghe Weng","Dexin Qi","Weichao Qiu","Bingbing Liu"],"pdf_url":"https://arxiv.org/pdf/2404.02617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02614v1","updated":"2024-04-03T10:01:23Z","published":"2024-04-03T10:01:23Z","title":"Vestibular schwannoma growth_prediction from longitudinal MRI by time\n conditioned neural fields","summary":" Vestibular schwannomas (VS) are benign tumors that are generally managed by\nactive surveillance with MRI examination. To further assist clinical\ndecision-making and avoid overtreatment, an accurate prediction of tumor growth\nbased on longitudinal imaging is highly desirable. In this paper, we introduce\nDeepGrowth, a deep learning method that incorporates neural fields and\nrecurrent neural networks for prospective tumor growth prediction. In the\nproposed method, each tumor is represented as a signed distance function (SDF)\nconditioned on a low-dimensional latent code. Unlike previous studies that\nperform tumor shape prediction directly in the image space, we predict the\nlatent codes instead and then reconstruct future shapes from it. To deal with\nirregular time intervals, we introduce a time-conditioned recurrent module\nbased on a ConvLSTM and a novel temporal encoding strategy, which enables the\nproposed model to output varying tumor shapes over time. The experiments on an\nin-house longitudinal VS dataset showed that the proposed model significantly\nimproved the performance ($\\ge 1.6\\%$ Dice score and $\\ge0.20$ mm 95\\%\nHausdorff distance), in particular for top 20\\% tumors that grow or shrink the\nmost ($\\ge 4.6\\%$ Dice score and $\\ge 0.73$ mm 95\\% Hausdorff distance). Our\ncode is available at ~\\burl{https://github.com/cyjdswx/DeepGrowth}\n","authors":["Yunjie Chen","Jelmer M. Wolterink","Olaf M. Neve","Stephan R. Romeijn","Berit M. Verbist","Erik F. Hensen","Qian Tao","Marius Staring"],"pdf_url":"https://arxiv.org/pdf/2404.02614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07739v2","updated":"2024-04-03T09:50:54Z","published":"2024-02-12T15:57:31Z","title":"Task-conditioned adaptation of visual features in multi-task policy\n learning","summary":" Successfully addressing a wide variety of tasks is a core ability of\nautonomous agents, requiring flexibly adapting the underlying decision-making\nstrategies and, as we argue in this work, also adapting the perception modules.\nAn analogical argument would be the human visual system, which uses top-down\nsignals to focus attention determined by the current task. Similarly, we adapt\npre-trained large vision models conditioned on specific downstream tasks in the\ncontext of multi-task policy learning. We introduce task-conditioned adapters\nthat do not require finetuning any pre-trained weights, combined with a single\npolicy trained with behavior cloning and capable of addressing multiple tasks.\nWe condition the visual adapters on task embeddings, which can be selected at\ninference if the task is known, or alternatively inferred from a set of example\ndemonstrations. To this end, we propose a new optimization-based estimator. We\nevaluate the method on a wide variety of tasks from the CortexBench benchmark\nand show that, compared to existing work, it can be addressed with a single\npolicy. 
In particular, we demonstrate that adapting visual features is a key\ndesign choice and that the method generalizes to unseen tasks given a few\ndemonstrations.\n","authors":["Pierre Marza","Laetitia Matignon","Olivier Simonin","Christian Wolf"],"pdf_url":"https://arxiv.org/pdf/2402.07739v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00035v2","updated":"2024-04-03T09:28:43Z","published":"2024-01-08T12:19:46Z","title":"Robustness Assessment of a Runway Object Classifier for Safe Aircraft\n Taxiing","summary":" As deep neural networks (DNNs) are becoming the prominent solution for many\ncomputational problems, the aviation industry seeks to explore their potential\nin alleviating pilot workload and in improving operational safety. However, the\nuse of DNNs in this type of safety-critical applications requires a thorough\ncertification process. This need can be addressed through formal verification,\nwhich provides rigorous assurances -- e.g.,~by proving the absence of certain\nmispredictions. In this case-study paper, we demonstrate this process using an\nimage-classifier DNN currently under development at Airbus and intended for use\nduring the aircraft taxiing phase. We use formal methods to assess this DNN's\nrobustness to three common image perturbation types: noise, brightness and\ncontrast, and some of their combinations. This process entails multiple\ninvocations of the underlying verifier, which might be computationally\nexpensive; and we therefore propose a method that leverages the monotonicity of\nthese robustness properties, as well as the results of past verification\nqueries, in order to reduce the overall number of verification queries required\nby nearly 60%. Our results provide an indication of the level of robustness\nachieved by the DNN classifier under study, and indicate that it is\nconsiderably more vulnerable to noise than to brightness or contrast\nperturbations.\n","authors":["Yizhak Elboher","Raya Elsaleh","Omri Isac","Mélanie Ducoffe","Audrey Galametz","Guillaume Povéda","Ryma Boumazouza","Noémie Cohen","Guy Katz"],"pdf_url":"https://arxiv.org/pdf/2402.00035v2.pdf","comment":"This is a preprint version of the paper in the proceedings of 43rd\n Digital Avionics Systems Conference (DASC)"},{"id":"http://arxiv.org/abs/2403.05839v2","updated":"2024-04-03T09:25:34Z","published":"2024-03-09T08:49:50Z","title":"Long-term Frame-Event Visual Tracking: Benchmark Dataset and Baseline","summary":" Current event-/frame-event based trackers undergo evaluation on short-term\ntracking datasets, however, the tracking of real-world scenarios involves\nlong-term tracking, and the performance of existing tracking algorithms in\nthese scenarios remains unclear. In this paper, we first propose a new\nlong-term and large-scale frame-event single object tracking dataset, termed\nFELT. It contains 742 videos and 1,594,474 RGB frames and event stream pairs\nand has become the largest frame-event tracking dataset to date. We re-train\nand evaluate 15 baseline trackers on our dataset for future works to compare.\nMore importantly, we find that the RGB frames and event streams are naturally\nincomplete due to the influence of challenging factors and spatially sparse\nevent flow. 
In response to this, we propose a novel associative memory\nTransformer network as a unified backbone by introducing modern Hopfield layers\ninto multi-head self-attention blocks to fuse both RGB and event data.\nExtensive experiments on RGB-Event (FELT), RGB-Thermal (RGBT234, LasHeR), and\nRGB-Depth (DepthTrack) datasets fully validated the effectiveness of our model.\nThe dataset and source code can be found at\n\\url{https://github.com/Event-AHU/FELT_SOT_Benchmark}.\n","authors":["Xiao Wang","Ju Huang","Shiao Wang","Chuanming Tang","Bo Jiang","Yonghong Tian","Jin Tang","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2403.05839v2.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2401.04647v2","updated":"2024-04-03T09:25:08Z","published":"2024-01-09T16:16:16Z","title":"Advancing Ante-Hoc Explainable Models through Generative Adversarial\n Networks","summary":" This paper presents a novel concept learning framework for enhancing model\ninterpretability and performance in visual classification tasks. Our approach\nappends an unsupervised explanation generator to the primary classifier network\nand makes use of adversarial training. During training, the explanation module\nis optimized to extract visual concepts from the classifier's latent\nrepresentations, while the GAN-based module aims to discriminate images\ngenerated from concepts, from true images. This joint training scheme enables\nthe model to implicitly align its internally learned concepts with\nhuman-interpretable visual properties. Comprehensive experiments demonstrate\nthe robustness of our approach, while producing coherent concept activations.\nWe analyse the learned concepts, showing their semantic concordance with object\nparts and visual attributes. We also study how perturbations in the adversarial\ntraining protocol impact both classification and concept acquisition. In\nsummary, this work presents a significant step towards building inherently\ninterpretable deep vision models with task-aligned concept representations - a\nkey enabler for developing trustworthy AI for real-world perception tasks.\n","authors":["Tanmay Garg","Deepika Vemuri","Vineeth N Balasubramanian"],"pdf_url":"https://arxiv.org/pdf/2401.04647v2.pdf","comment":"Paper accepted in Human-Centric Representation Learning workshop at\n AAAI 2024 (https://hcrl-workshop.github.io/2024/). Paper accepted and\n presented at Deployable AI Workshop at AAAI-2024\n (https://sites.google.com/view/dai-2024/home)"},{"id":"http://arxiv.org/abs/2404.01889v2","updated":"2024-04-03T09:18:09Z","published":"2024-04-02T12:28:40Z","title":"RAVE: Residual Vector Embedding for CLIP-Guided Backlit Image\n Enhancement","summary":" In this paper we propose a novel modification of Contrastive Language-Image\nPre-Training (CLIP) guidance for the task of unsupervised backlit image\nenhancement. Our work builds on the state-of-the-art CLIP-LIT approach, which\nlearns a prompt pair by constraining the text-image similarity between a prompt\n(negative/positive sample) and a corresponding image (backlit image/well-lit\nimage) in the CLIP embedding space. Learned prompts then guide an image\nenhancement network. Based on the CLIP-LIT framework, we propose two novel\nmethods for CLIP guidance. First, we show that instead of tuning prompts in the\nspace of text embeddings, it is possible to directly tune their embeddings in\nthe latent space without any loss in quality. This accelerates training and\npotentially enables the use of additional encoders that do not have a text\nencoder. 
Second, we propose a novel approach that does not require any prompt\ntuning. Instead, based on CLIP embeddings of backlit and well-lit images from\ntraining data, we compute the residual vector in the embedding space as a\nsimple difference between the mean embeddings of the well-lit and backlit\nimages. This vector then guides the enhancement network during training,\npushing a backlit image towards the space of well-lit images. This approach\nfurther dramatically reduces training time, stabilizes training and produces\nhigh quality enhanced images without artifacts, both in supervised and\nunsupervised training regimes. Additionally, we show that residual vectors can\nbe interpreted, revealing biases in training data, and thereby enabling\npotential bias correction.\n","authors":["Tatiana Gaintseva","Martin Benning","Gregory Slabaugh"],"pdf_url":"https://arxiv.org/pdf/2404.01889v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02585v1","updated":"2024-04-03T09:09:42Z","published":"2024-04-03T09:09:42Z","title":"Unsegment Anything by Simulating Deformation","summary":" Foundation segmentation models, while powerful, pose a significant risk: they\nenable users to effortlessly extract any objects from any digital content with\na single click, potentially leading to copyright infringement or malicious\nmisuse. To mitigate this risk, we introduce a new task \"Anything Unsegmentable\"\nto grant any image \"the right to be unsegmented\". The ambitious pursuit of the\ntask is to achieve highly transferable adversarial attacks against all\nprompt-based segmentation models, regardless of model parameterizations and\nprompts. We highlight the non-transferable and heterogeneous nature of\nprompt-specific adversarial noises. Our approach focuses on disrupting image\nencoder features to achieve prompt-agnostic attacks. Intriguingly, targeted\nfeature attacks exhibit better transferability compared to untargeted ones,\nsuggesting the optimal update direction aligns with the image manifold. Based\non the observations, we design a novel attack named Unsegment Anything by\nSimulating Deformation (UAD). Our attack optimizes a differentiable deformation\nfunction to create a target deformed image, which alters structural information\nwhile preserving achievable feature distance by adversarial example. Extensive\nexperiments verify the effectiveness of our approach, compromising a variety of\npromptable segmentation models with different architectures and prompt\ninterfaces. We release the code at\nhttps://github.com/jiahaolu97/anything-unsegmentable.\n","authors":["Jiahao Lu","Xingyi Yang","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02585v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2306.13674v3","updated":"2024-04-03T08:56:01Z","published":"2023-06-19T09:47:33Z","title":"MeciFace: Mechanomyography and Inertial Fusion-based Glasses for Edge\n Real-Time Recognition of Facial and Eating Activities","summary":" The increasing prevalence of stress-related eating behaviors and their impact\non overall health highlights the importance of effective and ubiquitous\nmonitoring systems. In this paper, we present MeciFace, an innovative wearable\ntechnology designed to monitor facial expressions and eating activities in\nreal-time on-the-edge (RTE). MeciFace aims to provide a low-power,\nprivacy-conscious, and highly accurate tool for promoting healthy eating\nbehaviors and stress management. 
We employ lightweight convolutional neural\nnetworks as backbone models for facial expression and eating monitoring\nscenarios. The MeciFace system ensures efficient data processing with a tiny\nmemory footprint, ranging from 11KB to 19 KB. During RTE evaluation, the system\nachieves an F1-score of < 86% for facial expression recognition and 94% for\neating/drinking monitoring, for the RTE of unseen users (user-independent\ncase).\n","authors":["Hymalai Bello","Sungho Suh","Bo Zhou","Paul Lukowicz"],"pdf_url":"https://arxiv.org/pdf/2306.13674v3.pdf","comment":"Submitted to IEEE Transactions on Consumer Electronics"},{"id":"http://arxiv.org/abs/2404.02580v1","updated":"2024-04-03T08:55:44Z","published":"2024-04-03T08:55:44Z","title":"Active learning for efficient annotation in precision agriculture: a\n use-case on crop-weed semantic segmentation","summary":" Optimizing deep learning models requires large amounts of annotated images, a\nprocess that is both time-intensive and costly. Especially for semantic\nsegmentation models in which every pixel must be annotated. A potential\nstrategy to mitigate annotation effort is active learning. Active learning\nfacilitates the identification and selection of the most informative images\nfrom a large unlabelled pool. The underlying premise is that these selected\nimages can improve the model's performance faster than random selection to\nreduce annotation effort. While active learning has demonstrated promising\nresults on benchmark datasets like Cityscapes, its performance in the\nagricultural domain remains largely unexplored. This study addresses this\nresearch gap by conducting a comparative study of three active learning-based\nacquisition functions: Bayesian Active Learning by Disagreement (BALD),\nstochastic-based BALD (PowerBALD), and Random. The acquisition functions were\ntested on two agricultural datasets: Sugarbeet and Corn-Weed, both containing\nthree semantic classes: background, crop and weed. Our results indicated that\nactive learning, especially PowerBALD, yields a higher performance than Random\nsampling on both datasets. But due to the relatively large standard deviations,\nthe differences observed were minimal; this was partly caused by high image\nredundancy and imbalanced classes. Specifically, more than 89\\% of the pixels\nbelonged to the background class on both datasets. The absence of significant\nresults on both datasets indicates that further research is required for\napplying active learning on agricultural datasets, especially if they contain a\nhigh-class imbalance and redundant images. Recommendations and insights are\nprovided in this paper to potentially resolve such issues.\n","authors":["Bart M. van Marrewijk","Charbel Dandjinou","Dan Jeric Arcega Rustia","Nicolas Franco Gonzalez","Boubacar Diallo","Jérôme Dias","Paul Melki","Pieter M. Blok"],"pdf_url":"https://arxiv.org/pdf/2404.02580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02573v1","updated":"2024-04-03T08:47:40Z","published":"2024-04-03T08:47:40Z","title":"Knowledge Distillation with Multi-granularity Mixture of Priors for\n Image Super-Resolution","summary":" Knowledge distillation (KD) is a promising yet challenging model compression\ntechnique that transfers rich learning representations from a well-performing\nbut cumbersome teacher model to a compact student model. Previous methods for\nimage super-resolution (SR) mostly compare the feature maps directly or after\nstandardizing the dimensions with basic algebraic operations (e.g. 
average,\ndot-product). However, the intrinsic semantic differences among feature maps\nare overlooked, which are caused by the disparate expressive capacity between\nthe networks. This work presents MiPKD, a multi-granularity mixture of prior KD\nframework, to facilitate efficient SR model through the feature mixture in a\nunified latent space and stochastic network block mixture. Extensive\nexperiments demonstrate the effectiveness of the proposed MiPKD method.\n","authors":["Simiao Li","Yun Zhang","Wei Li","Hanting Chen","Wenjia Wang","Bingyi Jing","Shaohui Lin","Jie Hu"],"pdf_url":"https://arxiv.org/pdf/2404.02573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02562v1","updated":"2024-04-03T08:33:08Z","published":"2024-04-03T08:33:08Z","title":"Representation Alignment Contrastive Regularization for Multi-Object\n Tracking","summary":" Achieving high-performance in multi-object tracking algorithms heavily relies\non modeling spatio-temporal relationships during the data association stage.\nMainstream approaches encompass rule-based and deep learning-based methods for\nspatio-temporal relationship modeling. While the former relies on physical\nmotion laws, offering wider applicability but yielding suboptimal results for\ncomplex object movements, the latter, though achieving high-performance, lacks\ninterpretability and involves complex module designs. This work aims to\nsimplify deep learning-based spatio-temporal relationship models and introduce\ninterpretability into features for data association. Specifically, a\nlightweight single-layer transformer encoder is utilized to model\nspatio-temporal relationships. To make features more interpretative, two\ncontrastive regularization losses based on representation alignment are\nproposed, derived from spatio-temporal consistency rules. By applying weighted\nsummation to affinity matrices, the aligned features can seamlessly integrate\ninto the data association stage of the original tracking workflow. Experimental\nresults showcase that our model enhances the majority of existing tracking\nnetworks' performance without excessive complexity, with minimal increase in\ntraining overhead and nearly negligible computational and storage costs.\n","authors":["Shujie Chen","Zhonglin Liu","Jianfeng Dong","Di Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.02562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02558v1","updated":"2024-04-03T08:27:24Z","published":"2024-04-03T08:27:24Z","title":"Regional biases in image geolocation estimation: a case study with the\n SenseCity Africa dataset","summary":" Advances in Artificial Intelligence are challenged by the biases rooted in\nthe datasets used to train the models. In image geolocation estimation, models\nare mostly trained using data from specific geographic regions, notably the\nWestern world, and as a result, they may struggle to comprehend the\ncomplexities of underrepresented regions. To assess this issue, we apply a\nstate-of-the-art image geolocation estimation model (ISNs) to a crowd-sourced\ndataset of geolocated images from the African continent (SCA100), and then\nexplore the regional and socioeconomic biases underlying the model's\npredictions. Our findings show that the ISNs model tends to over-predict image\nlocations in high-income countries of the Western world, which is consistent\nwith the geographic distribution of its training data, i.e., the IM2GPS3k\ndataset. 
Accordingly, when compared to the IM2GPS3k benchmark, the accuracy of\nthe ISNs model notably decreases at all scales. Additionally, we cluster images\nof the SCA100 dataset based on how accurately they are predicted by the ISNs\nmodel and show the model's difficulties in correctly predicting the locations\nof images in low income regions, especially in Sub-Saharan Africa. Therefore,\nour results suggest that using IM2GPS3k as a training set and benchmark for\nimage geolocation estimation and other computer vision models overlooks its\npotential application in the African context.\n","authors":["Ximena Salgado Uribe","Martí Bosch","Jérôme Chenal"],"pdf_url":"https://arxiv.org/pdf/2404.02558v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.01272v2","updated":"2024-04-03T08:27:01Z","published":"2024-04-01T17:48:15Z","title":"Language Guided Domain Generalized Medical Image Segmentation","summary":" Single source domain generalization (SDG) holds promise for more reliable and\nconsistent image segmentation across real-world clinical settings particularly\nin the medical domain, where data privacy and acquisition cost constraints\noften limit the availability of diverse datasets. Depending solely on visual\nfeatures hampers the model's capacity to adapt effectively to various domains,\nprimarily because of the presence of spurious correlations and domain-specific\ncharacteristics embedded within the image features. Incorporating text features\nalongside visual features is a potential solution to enhance the model's\nunderstanding of the data, as it goes beyond pixel-level information to provide\nvaluable context. Textual cues describing the anatomical structures, their\nappearances, and variations across various imaging modalities can guide the\nmodel in domain adaptation, ultimately contributing to more robust and\nconsistent segmentation. In this paper, we propose an approach that explicitly\nleverages textual information by incorporating a contrastive learning mechanism\nguided by the text encoder features to learn a more robust feature\nrepresentation. We assess the effectiveness of our text-guided contrastive\nfeature alignment technique in various scenarios, including cross-modality,\ncross-sequence, and cross-site settings for different segmentation tasks. Our\napproach achieves favorable performance against existing methods in literature.\nOur code and model weights are available at\nhttps://github.com/ShahinaKK/LG_SDG.git.\n","authors":["Shahina Kunhimon","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2404.01272v2.pdf","comment":"Accepted at ISBI2024"},{"id":"http://arxiv.org/abs/2401.13627v2","updated":"2024-04-03T08:12:08Z","published":"2024-01-24T17:58:07Z","title":"Scaling Up to Excellence: Practicing Model Scaling for Photo-Realistic\n Image Restoration In the Wild","summary":" We introduce SUPIR (Scaling-UP Image Restoration), a groundbreaking image\nrestoration method that harnesses generative prior and the power of model\nscaling up. Leveraging multi-modal techniques and advanced generative prior,\nSUPIR marks a significant advance in intelligent and realistic image\nrestoration. As a pivotal catalyst within SUPIR, model scaling dramatically\nenhances its capabilities and demonstrates new potential for image restoration.\nWe collect a dataset comprising 20 million high-resolution, high-quality images\nfor model training, each enriched with descriptive text annotations. 
SUPIR\nprovides the capability to restore images guided by textual prompts, broadening\nits application scope and potential. Moreover, we introduce negative-quality\nprompts to further improve perceptual quality. We also develop a\nrestoration-guided sampling method to suppress the fidelity issue encountered\nin generative-based restoration. Experiments demonstrate SUPIR's exceptional\nrestoration effects and its novel capacity to manipulate restoration through\ntextual prompts.\n","authors":["Fanghua Yu","Jinjin Gu","Zheyuan Li","Jinfan Hu","Xiangtao Kong","Xintao Wang","Jingwen He","Yu Qiao","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2401.13627v2.pdf","comment":"This paper has been accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2307.13981v2","updated":"2024-04-03T08:04:55Z","published":"2023-07-26T06:38:33Z","title":"Analysis of Video Quality Datasets via Design of Minimalistic Video\n Quality Models","summary":" Blind video quality assessment (BVQA) plays an indispensable role in\nmonitoring and improving the end-users' viewing experience in various\nreal-world video-enabled media applications. As an experimental field, the\nimprovements of BVQA models have been measured primarily on a few human-rated\nVQA datasets. Thus, it is crucial to gain a better understanding of existing\nVQA datasets in order to properly evaluate the current progress in BVQA.\nTowards this goal, we conduct a first-of-its-kind computational analysis of VQA\ndatasets via designing minimalistic BVQA models. By minimalistic, we restrict\nour family of BVQA models to build only upon basic blocks: a video preprocessor\n(for aggressive spatiotemporal downsampling), a spatial quality analyzer, an\noptional temporal quality analyzer, and a quality regressor, all with the\nsimplest possible instantiations. By comparing the quality prediction\nperformance of different model variants on eight VQA datasets with realistic\ndistortions, we find that nearly all datasets suffer from the easy dataset\nproblem of varying severity, some of which even admit blind image quality\nassessment (BIQA) solutions. We additionally justify our claims by contrasting\nour model generalizability on these VQA datasets, and by ablating a dizzying\nset of BVQA design choices related to the basic building blocks. Our results\ncast doubt on the current progress in BVQA, and meanwhile shed light on good\npractices of constructing next-generation VQA datasets and models.\n","authors":["Wei Sun","Wen Wen","Xiongkuo Min","Long Lan","Guangtao Zhai","Kede Ma"],"pdf_url":"https://arxiv.org/pdf/2307.13981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02544v1","updated":"2024-04-03T08:01:00Z","published":"2024-04-03T08:01:00Z","title":"Semi-Supervised Unconstrained Head Pose Estimation in the Wild","summary":" Existing head pose estimation datasets are either composed of numerous\nsamples by non-realistic synthesis or lab collection, or limited images by\nlabor-intensive annotating. This makes deep supervised learning based solutions\ncompromised due to the reliance on generous labeled data. To alleviate it, we\npropose the first semi-supervised unconstrained head pose estimation (SemiUHPE)\nmethod, which can leverage a large amount of unlabeled wild head images.\nSpecifically, we follow the recent semi-supervised rotation regression, and\nfocus on the diverse and complex head pose domain. 
Firstly, we claim that the\naspect-ratio invariant cropping of heads is superior to the previous\nlandmark-based affine alignment, which does not fit unlabeled natural heads or\npractical applications where landmarks are often unavailable. Then, instead of\nusing an empirically fixed threshold to filter out pseudo labels, we propose\nthe dynamic entropy-based filtering by updating thresholds for adaptively\nremoving unlabeled outliers. Moreover, we revisit the design of weak-strong\naugmentations, and further exploit its superiority by devising two novel\nhead-oriented strong augmentations named pose-irrelevant cut-occlusion and\npose-altering rotation consistency. Extensive experiments show that SemiUHPE\ncan surpass SOTAs with remarkable improvements on public benchmarks under both\nfront-range and full-range. Our code is released in\n\\url{https://github.com/hnuzhy/SemiUHPE}.\n","authors":["Huayi Zhou","Fei Jiang","Hongtao Lu"],"pdf_url":"https://arxiv.org/pdf/2404.02544v1.pdf","comment":"14 pages. Semi-Supervised Unconstrained Head Pose Estimation"},{"id":"http://arxiv.org/abs/2403.19425v2","updated":"2024-04-03T07:37:32Z","published":"2024-03-28T13:56:26Z","title":"A Robust Ensemble Algorithm for Ischemic Stroke Lesion Segmentation:\n Generalizability and Clinical Utility Beyond the ISLES Challenge","summary":" Diffusion-weighted MRI (DWI) is essential for stroke diagnosis, treatment\ndecisions, and prognosis. However, image and disease variability hinder the\ndevelopment of generalizable AI algorithms with clinical value. We address this\ngap by presenting a novel ensemble algorithm derived from the 2022 Ischemic\nStroke Lesion Segmentation (ISLES) challenge. ISLES'22 provided 400 patient\nscans with ischemic stroke from various medical centers, facilitating the\ndevelopment of a wide range of cutting-edge segmentation algorithms by the\nresearch community. Through collaboration with leading teams, we combined\ntop-performing algorithms into an ensemble model that overcomes the limitations\nof individual solutions. Our ensemble model achieved superior ischemic lesion\ndetection and segmentation accuracy on our internal test set compared to\nindividual algorithms. This accuracy generalized well across diverse image and\ndisease variables. Furthermore, the model excelled in extracting clinical\nbiomarkers. Notably, in a Turing-like test, neuroradiologists consistently\npreferred the algorithm's segmentations over manual expert efforts,\nhighlighting increased comprehensiveness and precision. Validation using a\nreal-world external dataset (N=1686) confirmed the model's generalizability.\nThe algorithm's outputs also demonstrated strong correlations with clinical\nscores (admission NIHSS and 90-day mRS) on par with or exceeding expert-derived\nresults, underlining its clinical relevance. This study offers two key\nfindings. First, we present an ensemble algorithm\n(https://github.com/Tabrisrei/ISLES22_Ensemble) that detects and segments\nischemic stroke lesions on DWI across diverse scenarios on par with expert\n(neuro)radiologists. Second, we show the potential for biomedical challenge\noutputs to extend beyond the challenge's initial objectives, demonstrating\ntheir real-world clinical applicability.\n","authors":["Ezequiel de la Rosa","Mauricio Reyes","Sook-Lei Liew","Alexandre Hutton","Roland Wiest","Johannes Kaesmacher","Uta Hanning","Arsany Hakim","Richard Zubal","Waldo Valenzuela","David Robben","Diana M. Sima","Vincenzo Anania","Arne Brys","James A. 
Meakin","Anne Mickan","Gabriel Broocks","Christian Heitkamp","Shengbo Gao","Kongming Liang","Ziji Zhang","Md Mahfuzur Rahman Siddiquee","Andriy Myronenko","Pooya Ashtari","Sabine Van Huffel","Hyun-su Jeong","Chi-ho Yoon","Chulhong Kim","Jiayu Huo","Sebastien Ourselin","Rachel Sparks","Albert Clèrigues","Arnau Oliver","Xavier Lladó","Liam Chalcroft","Ioannis Pappas","Jeroen Bertels","Ewout Heylen","Juliette Moreau","Nima Hatami","Carole Frindel","Abdul Qayyum","Moona Mazher","Domenec Puig","Shao-Chieh Lin","Chun-Jung Juan","Tianxi Hu","Lyndon Boone","Maged Goubran","Yi-Jui Liu","Susanne Wegener","Florian Kofler","Ivan Ezhov","Suprosanna Shit","Moritz R. Hernandez Petzsche","Bjoern Menze","Jan S. Kirschke","Benedikt Wiestler"],"pdf_url":"https://arxiv.org/pdf/2403.19425v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02530v1","updated":"2024-04-03T07:33:30Z","published":"2024-04-03T07:33:30Z","title":"Severity Controlled Text-to-Image Generative Model Bias Manipulation","summary":" Text-to-image (T2I) generative models are gaining wide popularity, especially\nin public domains. However, their intrinsic bias and potential malicious\nmanipulations remain under-explored. Charting the susceptibility of T2I models\nto such manipulation, we first expose the new possibility of a dynamic and\ncomputationally efficient exploitation of model bias by targeting the embedded\nlanguage models. By leveraging mathematical foundations of vector algebra, our\ntechnique enables a scalable and convenient control over the severity of output\nmanipulation through model bias. As a by-product, this control also allows a\nform of precise prompt engineering to generate images which are generally\nimplausible with regular text prompts. We also demonstrate a constructive\napplication of our manipulation for balancing the frequency of generated\nclasses - as in model debiasing. Our technique does not require training and is\nalso framed as a backdoor attack with severity control using semantically-null\ntext triggers in the prompts. With extensive analysis, we present interesting\nqualitative and quantitative results to expose potential manipulation\npossibilities for T2I models.\n Key-words: Text-to-Image Models, Generative Models, Backdoor Attacks, Prompt\nEngineering, Bias\n","authors":["Jordan Vice","Naveed Akhtar","Richard Hartley","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2404.02530v1.pdf","comment":"This research was supported by National Intelligence and Security\n Discovery Research Grants (project# NS220100007), funded by the Department of\n Defence Australia"},{"id":"http://arxiv.org/abs/2404.02527v1","updated":"2024-04-03T07:30:09Z","published":"2024-04-03T07:30:09Z","title":"Weakly-Supervised 3D Scene Graph Generation via Visual-Linguistic\n Assisted Pseudo-labeling","summary":" Learning to build 3D scene graphs is essential for real-world perception in a\nstructured and rich fashion. However, previous 3D scene graph generation\nmethods utilize a fully supervised learning manner and require a large amount\nof entity-level annotation data of objects and relations, which is extremely\nresource-consuming and tedious to obtain. To tackle this problem, we propose\n3D-VLAP, a weakly-supervised 3D scene graph generation method via\nVisual-Linguistic Assisted Pseudo-labeling. 
Specifically, our 3D-VLAP exploits\nthe superior ability of current large-scale visual-linguistic models to align\nthe semantics between texts and 2D images, as well as the naturally existing\ncorrespondences between 2D images and 3D point clouds, and thus implicitly\nconstructs correspondences between texts and 3D point clouds. First, we\nestablish the positional correspondence from 3D point clouds to 2D images via\ncamera intrinsic and extrinsic parameters, thereby achieving alignment of 3D\npoint clouds and 2D images. Subsequently, a large-scale cross-modal\nvisual-linguistic model is employed to indirectly align 3D instances with the\ntextual category labels of objects by matching 2D images with object category\nlabels. The pseudo labels for objects and relations are then produced for\n3D-VLAP model training by calculating the similarity between visual embeddings\nand textual category embeddings of objects and relations encoded by the\nvisual-linguistic model, respectively. Ultimately, we design an edge\nself-attention based graph neural network to generate scene graphs of 3D point\ncloud scenes. Extensive experiments demonstrate that our 3D-VLAP achieves\ncomparable results with current advanced fully supervised methods, meanwhile\nsignificantly alleviating the pressure of data annotation.\n","authors":["Xu Wang","Yifan Li","Qiudan Zhang","Wenhui Wu","Mark Junjie Li","Jianmin Jinag"],"pdf_url":"https://arxiv.org/pdf/2404.02527v1.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.02523v1","updated":"2024-04-03T07:23:03Z","published":"2024-04-03T07:23:03Z","title":"Text-driven Affordance Learning from Egocentric Vision","summary":" Visual affordance learning is a key component for robots to understand how to\ninteract with objects. Conventional approaches in this field rely on\npre-defined objects and actions, falling short of capturing diverse\ninteractions in realworld scenarios. The key idea of our approach is employing\ntextual instruction, targeting various affordances for a wide range of objects.\nThis approach covers both hand-object and tool-object interactions. We\nintroduce text-driven affordance learning, aiming to learn contact points and\nmanipulation trajectories from an egocentric view following textual\ninstruction. In our task, contact points are represented as heatmaps, and the\nmanipulation trajectory as sequences of coordinates that incorporate both\nlinear and rotational movements for various manipulations. However, when we\ngather data for this task, manual annotations of these diverse interactions are\ncostly. To this end, we propose a pseudo dataset creation pipeline and build a\nlarge pseudo-training dataset: TextAFF80K, consisting of over 80K instances of\nthe contact points, trajectories, images, and text tuples. 
We extend existing\nreferring expression comprehension models for our task, and experimental\nresults show that our approach robustly handles multiple affordances, serving\nas a new standard for affordance learning in real-world scenarios.\n","authors":["Tomoya Yoshida","Shuhei Kurita","Taichi Nishimura","Shinsuke Mori"],"pdf_url":"https://arxiv.org/pdf/2404.02523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00611v2","updated":"2024-04-03T07:18:11Z","published":"2024-03-31T09:01:17Z","title":"Object-level Copy-Move Forgery Image Detection based on Inconsistency\n Mining","summary":" In copy-move tampering operations, perpetrators often employ techniques, such\nas blurring, to conceal tampering traces, posing significant challenges to the\ndetection of object-level targets with intact structures. Focus on these\nchallenges, this paper proposes an Object-level Copy-Move Forgery Image\nDetection based on Inconsistency Mining (IMNet). To obtain complete\nobject-level targets, we customize prototypes for both the source and tampered\nregions and dynamically update them. Additionally, we extract inconsistent\nregions between coarse similar regions obtained through self-correlation\ncalculations and regions composed of prototypes. The detected inconsistent\nregions are used as supplements to coarse similar regions to refine pixel-level\ndetection. We operate experiments on three public datasets which validate the\neffectiveness and the robustness of the proposed IMNet.\n","authors":["Jingyu Wang","Niantai Jing","Ziyao Liu","Jie Nie","Yuxin Qi","Chi-Hung Chi","Kwok-Yan Lam"],"pdf_url":"https://arxiv.org/pdf/2404.00611v2.pdf","comment":"4 pages, 2 figures, Accepted to WWW 2024"},{"id":"http://arxiv.org/abs/2404.00228v3","updated":"2024-04-03T07:15:05Z","published":"2024-03-30T03:16:37Z","title":"InfLoRA: Interference-Free Low-Rank Adaptation for Continual Learning","summary":" Continual learning requires the model to learn multiple tasks sequentially.\nIn continual learning, the model should possess the ability to maintain its\nperformance on old tasks (stability) and the ability to adapt to new tasks\ncontinuously (plasticity). Recently, parameter-efficient fine-tuning (PEFT),\nwhich involves freezing a pre-trained model and injecting a small number of\nlearnable parameters to adapt to downstream tasks, has gained increasing\npopularity in continual learning. Although existing continual learning methods\nbased on PEFT have demonstrated superior performance compared to those not\nbased on PEFT, most of them do not consider how to eliminate the interference\nof the new task on the old tasks, which inhibits the model from making a good\ntrade-off between stability and plasticity. In this work, we propose a new PEFT\nmethod, called interference-free low-rank adaptation (InfLoRA), for continual\nlearning. InfLoRA injects a small number of parameters to reparameterize the\npre-trained weights and shows that fine-tuning these injected parameters is\nequivalent to fine-tuning the pre-trained weights within a subspace.\nFurthermore, InfLoRA designs this subspace to eliminate the interference of the\nnew task on the old tasks, making a good trade-off between stability and\nplasticity. 
Experimental results show that InfLoRA outperforms existing\nstate-of-the-art continual learning methods on multiple datasets.\n","authors":["Yan-Shuo Liang","Wu-Jun Li"],"pdf_url":"https://arxiv.org/pdf/2404.00228v3.pdf","comment":"Accepted by the 2024 IEEE/CVF Conference on Computer Vision and\n Pattern Recognition (CVPR 2024)"},{"id":"http://arxiv.org/abs/2404.02518v1","updated":"2024-04-03T07:11:19Z","published":"2024-04-03T07:11:19Z","title":"CPAISD: Core-penumbra acute ischemic stroke dataset","summary":" We introduce the CPAISD: Core-Penumbra Acute Ischemic Stroke Dataset, aimed\nat enhancing the early detection and segmentation of ischemic stroke using\nNon-Contrast Computed Tomography (NCCT) scans. Addressing the challenges in\ndiagnosing acute ischemic stroke during its early stages due to often\nnon-revealing native CT findings, the dataset provides a collection of\nsegmented NCCT images. These include annotations of ischemic core and penumbra\nregions, critical for developing machine learning models for rapid stroke\nidentification and assessment. By offering a carefully collected and annotated\ndataset, we aim to facilitate the development of advanced diagnostic tools,\ncontributing to improved patient care and outcomes in stroke management. Our\ndataset's uniqueness lies in its focus on the acute phase of ischemic stroke,\nwith non-informative native CT scans, and includes a baseline model to\ndemonstrate the dataset's application, encouraging further research and\ninnovation in the field of medical imaging and stroke diagnosis.\n","authors":["D. Umerenkov","S. Kudin","M. Peksheva","D. Pavlov"],"pdf_url":"https://arxiv.org/pdf/2404.02518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03166v3","updated":"2024-04-03T07:10:22Z","published":"2024-02-05T16:35:29Z","title":"RRWNet: Recursive Refinement Network for Effective Retinal Artery/Vein\n Segmentation and Classification","summary":" The caliber and configuration of retinal blood vessels serve as important\nbiomarkers for various diseases and medical conditions. A thorough analysis of\nthe retinal vasculature requires the segmentation of the blood vessels and\ntheir classification into arteries and veins, typically performed on color\nfundus images obtained by retinography. However, manually performing these\ntasks is labor-intensive and prone to human error. While several automated\nmethods have been proposed to address this task, the current state of art faces\nchallenges due to manifest classification errors affecting the topological\nconsistency of segmentation maps. In this work, we introduce RRWNet, a novel\nend-to-end deep learning framework that addresses this limitation. The\nframework consists of a fully convolutional neural network that recursively\nrefines semantic segmentation maps, correcting manifest classification errors\nand thus improving topological consistency. In particular, RRWNet is composed\nof two specialized subnetworks: a Base subnetwork that generates base\nsegmentation maps from the input images, and a Recursive Refinement subnetwork\nthat iteratively and recursively improves these maps. Evaluation on three\ndifferent public datasets demonstrates the state-of-the-art performance of the\nproposed method, yielding more topologically consistent segmentation maps with\nfewer manifest classification errors than existing approaches. 
In addition, the\nRecursive Refinement module within RRWNet proves effective in post-processing\nsegmentation maps from other methods, further demonstrating its potential. The\nmodel code, weights, and predictions will be publicly available at\nhttps://github.com/j-morano/rrwnet.\n","authors":["José Morano","Guilherme Aresta","Hrvoje Bogunović"],"pdf_url":"https://arxiv.org/pdf/2402.03166v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02517v1","updated":"2024-04-03T07:10:18Z","published":"2024-04-03T07:10:18Z","title":"HENet: Hybrid Encoding for End-to-end Multi-task 3D Perception from\n Multi-view Cameras","summary":" Three-dimensional perception from multi-view cameras is a crucial component\nin autonomous driving systems, which involves multiple tasks like 3D object\ndetection and bird's-eye-view (BEV) semantic segmentation. To improve\nperception precision, large image encoders, high-resolution images, and\nlong-term temporal inputs have been adopted in recent 3D perception models,\nbringing remarkable performance gains. However, these techniques are often\nincompatible in training and inference scenarios due to computational resource\nconstraints. Besides, modern autonomous driving systems prefer to adopt an\nend-to-end framework for multi-task 3D perception, which can simplify the\noverall system architecture and reduce the implementation complexity. However,\nconflict between tasks often arises when optimizing multiple tasks jointly\nwithin an end-to-end 3D perception model. To alleviate these issues, we present\nan end-to-end framework named HENet for multi-task 3D perception in this paper.\nSpecifically, we propose a hybrid image encoding network, using a large image\nencoder for short-term frames and a small image encoder for long-term temporal\nframes. Then, we introduce a temporal feature integration module based on the\nattention mechanism to fuse the features of different frames extracted by the\ntwo aforementioned hybrid image encoders. Finally, according to the\ncharacteristics of each perception task, we utilize BEV features of different\ngrid sizes, independent BEV encoders, and task decoders for different tasks.\nExperimental results show that HENet achieves state-of-the-art end-to-end\nmulti-task 3D perception results on the nuScenes benchmark, including 3D object\ndetection and BEV semantic segmentation. The source code and models will be\nreleased at https://github.com/VDIGPKU/HENet.\n","authors":["Zhongyu Xia","ZhiWei Lin","Xinhao Wang","Yongtao Wang","Yun Xing","Shengxiang Qi","Nan Dong","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.02517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02514v1","updated":"2024-04-03T07:07:02Z","published":"2024-04-03T07:07:02Z","title":"Freditor: High-Fidelity and Transferable NeRF Editing by Frequency\n Decomposition","summary":" This paper enables high-fidelity, transferable NeRF editing by frequency\ndecomposition. Recent NeRF editing pipelines lift 2D stylization results to 3D\nscenes while suffering from blurry results, and fail to capture detailed\nstructures caused by the inconsistency between 2D editings. 
Our critical\ninsight is that low-frequency components of images are more\nmultiview-consistent after editing compared with their high-frequency parts.\nMoreover, the appearance style is mainly exhibited on the low-frequency\ncomponents, and the content details especially reside in high-frequency parts.\nThis motivates us to perform editing on low-frequency components, which results\nin high-fidelity edited scenes. In addition, the editing is performed in the\nlow-frequency feature space, enabling stable intensity control and novel scene\ntransfer. Comprehensive experiments conducted on photorealistic datasets\ndemonstrate the superior performance of high-fidelity and transferable NeRF\nediting. The project page is at \\url{https://aigc3d.github.io/freditor}.\n","authors":["Yisheng He","Weihao Yuan","Siyu Zhu","Zilong Dong","Liefeng Bo","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2404.02514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02508v1","updated":"2024-04-03T06:53:27Z","published":"2024-04-03T06:53:27Z","title":"VIAssist: Adapting Multi-modal Large Language Models for Users with\n Visual Impairments","summary":" Individuals with visual impairments, encompassing both partial and total\ndifficulties in visual perception, are referred to as visually impaired (VI)\npeople. An estimated 2.2 billion individuals worldwide are affected by visual\nimpairments. Recent advancements in multi-modal large language models (MLLMs)\nhave showcased their extraordinary capabilities across various domains. It is\ndesirable to help VI individuals with MLLMs' great capabilities of visual\nunderstanding and reasoning. However, it is challenging for VI people to use\nMLLMs due to the difficulties in capturing the desirable images to fulfill\ntheir daily requests. For example, the target object is not fully or partially\nplaced in the image. This paper explores how to leverage MLLMs for VI\nindividuals to provide visual-question answers. VIAssist can identify undesired\nimages and provide detailed actions. Finally, VIAssist can provide reliable\nanswers to users' queries based on the images. Our results show that VIAssist\nprovides +0.21 and +0.31 higher BERTScore and ROUGE scores than the baseline,\nrespectively.\n","authors":["Bufang Yang","Lixing He","Kaiwei Liu","Zhenyu Yan"],"pdf_url":"https://arxiv.org/pdf/2404.02508v1.pdf","comment":"Accepted to IEEE International Workshop on Foundation Models for\n Cyber-Physical Systems & Internet of Things (FMSys 2024)"},{"id":"http://arxiv.org/abs/2303.04989v2","updated":"2024-04-03T06:51:21Z","published":"2023-03-09T02:20:56Z","title":"ARS-DETR: Aspect Ratio Sensitive Oriented Object Detection with\n Transformer","summary":" Existing oriented object detection methods commonly use metric AP$_{50}$ to\nmeasure the performance of the model. We argue that AP$_{50}$ is inherently\nunsuitable for oriented object detection due to its large tolerance in angle\ndeviation. Therefore, we advocate using high-precision metric, e.g. AP$_{75}$,\nto measure the performance of models. In this paper, we propose an Aspect Ratio\nSensitive Oriented Object Detector with Transformer, termed ARS-DETR, which\nexhibits a competitive performance in high-precision oriented object detection.\nSpecifically, a new angle classification method, calling Aspect Ratio aware\nCircle Smooth Label (AR-CSL), is proposed to smooth the angle label in a more\nreasonable way and discard the hyperparameter that introduced by previous work\n(e.g. CSL). 
Then, a rotated deformable attention module is designed to rotate\nthe sampling points with the corresponding angles and eliminate the\nmisalignment between region features and sampling points. Moreover, a dynamic\nweight coefficient according to the aspect ratio is adopted to calculate the\nangle loss. Comprehensive experiments on several challenging datasets show that\nour method achieves competitive performance on the high-precision oriented\nobject detection task.\n","authors":["Ying Zeng","Xue Yang","Qingyun Li","Yushi Chen","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2303.04989v2.pdf","comment":"10 pages, 8 figures, 8 tables, the source code is available at\n https://github.com/httle/ARS-DETR"},{"id":"http://arxiv.org/abs/2403.04492v3","updated":"2024-04-03T06:48:15Z","published":"2024-03-07T13:49:29Z","title":"Discriminative Sample-Guided and Parameter-Efficient Feature Space\n Adaptation for Cross-Domain Few-Shot Learning","summary":" In this paper, we look at cross-domain few-shot classification which presents\nthe challenging task of learning new classes in previously unseen domains with\nfew labelled examples. Existing methods, though somewhat effective, encounter\nseveral limitations, which we alleviate through two significant improvements.\nFirst, we introduce a lightweight parameter-efficient adaptation strategy to\naddress overfitting associated with fine-tuning a large number of parameters on\nsmall datasets. This strategy employs a linear transformation of pre-trained\nfeatures, significantly reducing the trainable parameter count. Second, we\nreplace the traditional nearest centroid classifier with a discriminative\nsample-aware loss function, enhancing the model's sensitivity to the inter- and\nintra-class variances within the training set for improved clustering in\nfeature space. Empirical evaluations on the Meta-Dataset benchmark showcase\nthat our approach not only improves accuracy up to 7.7\\% and 5.3\\% on\npreviously seen and unseen datasets, respectively, but also achieves the above\nperformance while being at least $\\sim3\\times$ more parameter-efficient than\nexisting methods, establishing a new state-of-the-art in cross-domain few-shot\nlearning. Our code is available at https://github.com/rashindrie/DIPA.\n","authors":["Rashindrie Perera","Saman Halgamuge"],"pdf_url":"https://arxiv.org/pdf/2403.04492v3.pdf","comment":"Code is available at this link: https://github.com/rashindrie/DIPA"},{"id":"http://arxiv.org/abs/2404.01700v2","updated":"2024-04-03T06:40:46Z","published":"2024-04-02T07:09:29Z","title":"MotionChain: Conversational Motion Controllers via Multimodal Prompts","summary":" Recent advancements in language models have demonstrated their adeptness in\nconducting multi-turn dialogues and retaining conversational context. However,\nthis proficiency remains largely unexplored in other multimodal generative\nmodels, particularly in human motion models. 
By integrating multi-turn\nconversations in controlling continuous virtual human movements, generative\nhuman motion models can achieve an intuitive and step-by-step process of human\ntask execution for humanoid robotics, game agents, or other embodied systems.\nIn this work, we present MotionChain, a conversational human motion controller\nto generate continuous and long-term human motion through multimodal prompts.\nSpecifically, MotionChain consists of multi-modal tokenizers that transform\nvarious data types such as text, image, and motion, into discrete tokens,\ncoupled with a Vision-Motion-aware Language model. By leveraging large-scale\nlanguage, vision-language, and vision-motion data to assist motion-related\ngeneration tasks, MotionChain thus comprehends each instruction in multi-turn\nconversation and generates human motions followed by these prompts. Extensive\nexperiments validate the efficacy of MotionChain, demonstrating\nstate-of-the-art performance in conversational motion generation, as well as\nmore intuitive manners of controlling and interacting with virtual humans.\n","authors":["Biao Jiang","Xin Chen","Chi Zhang","Fukun Yin","Zhuoyuan Li","Gang YU","Jiayuan Fan"],"pdf_url":"https://arxiv.org/pdf/2404.01700v2.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.12870v2","updated":"2024-04-03T06:11:17Z","published":"2023-12-20T09:34:22Z","title":"The Audio-Visual Conversational Graph: From an Egocentric-Exocentric\n Perspective","summary":" In recent years, the thriving development of research related to egocentric\nvideos has provided a unique perspective for the study of conversational\ninteractions, where both visual and audio signals play a crucial role. While\nmost prior work focus on learning about behaviors that directly involve the\ncamera wearer, we introduce the Ego-Exocentric Conversational Graph Prediction\nproblem, marking the first attempt to infer exocentric conversational\ninteractions from egocentric videos. We propose a unified multi-modal framework\n-- Audio-Visual Conversational Attention (AV-CONV), for the joint prediction of\nconversation behaviors -- speaking and listening -- for both the camera wearer\nas well as all other social partners present in the egocentric video.\nSpecifically, we adopt the self-attention mechanism to model the\nrepresentations across-time, across-subjects, and across-modalities. To\nvalidate our method, we conduct experiments on a challenging egocentric video\ndataset that includes multi-speaker and multi-conversation scenarios. Our\nresults demonstrate the superior performance of our method compared to a series\nof baselines. We also present detailed ablation studies to assess the\ncontribution of each component in our model. Check our project page at\nhttps://vjwq.github.io/AV-CONV/.\n","authors":["Wenqi Jia","Miao Liu","Hao Jiang","Ishwarya Ananthabhotla","James M. Rehg","Vamsi Krishna Ithapu","Ruohan Gao"],"pdf_url":"https://arxiv.org/pdf/2312.12870v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01941v2","updated":"2024-04-03T05:43:15Z","published":"2024-04-02T13:33:31Z","title":"LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging","summary":" Human pose and shape (HPS) estimation with lensless imaging is not only\nbeneficial to privacy protection but also can be used in covert surveillance\nscenarios due to the small size and simple structure of this device. 
However,\nthis task presents significant challenges due to the inherent ambiguity of the\ncaptured measurements and lacks effective methods for directly estimating human\npose and shape from lensless data. In this paper, we propose the first\nend-to-end framework to recover 3D human poses and shapes from lensless\nmeasurements to our knowledge. We specifically design a multi-scale lensless\nfeature decoder to decode the lensless measurements through the optically\nencoded mask for efficient feature extraction. We also propose a double-head\nauxiliary supervision mechanism to improve the estimation accuracy of human\nlimb ends. Besides, we establish a lensless imaging system and verify the\neffectiveness of our method on various datasets acquired by our lensless\nimaging system.\n","authors":["Haoyang Ge","Qiao Feng","Hailong Jia","Xiongzheng Li","Xiangjun Yin","You Zhou","Jingyu Yang","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2404.01941v2.pdf","comment":"Accepted to CVPR 2024. More results available at\n https://cic.tju.edu.cn/faculty/likun/projects/LPSNet"},{"id":"http://arxiv.org/abs/2403.02561v2","updated":"2024-04-03T05:43:10Z","published":"2024-03-05T00:34:05Z","title":"Semantic Human Mesh Reconstruction with Textures","summary":" The field of 3D detailed human mesh reconstruction has made significant\nprogress in recent years. However, current methods still face challenges when\nused in industrial applications due to unstable results, low-quality meshes,\nand a lack of UV unwrapping and skinning weights. In this paper, we present\nSHERT, a novel pipeline that can reconstruct semantic human meshes with\ntextures and high-precision details. SHERT applies semantic- and normal-based\nsampling between the detailed surface (e.g. mesh and SDF) and the corresponding\nSMPL-X model to obtain a partially sampled semantic mesh and then generates the\ncomplete semantic mesh by our specifically designed self-supervised completion\nand refinement networks. Using the complete semantic mesh as a basis, we employ\na texture diffusion model to create human textures that are driven by both\nimages and texts. Our reconstructed meshes have stable UV unwrapping,\nhigh-quality triangle meshes, and consistent semantic information. The given\nSMPL-X model provides semantic information and shape priors, allowing SHERT to\nperform well even with incorrect and incomplete inputs. The semantic\ninformation also makes it easy to substitute and animate different body parts\nsuch as the face, body, and hands. Quantitative and qualitative experiments\ndemonstrate that SHERT is capable of producing high-fidelity and robust\nsemantic meshes that outperform state-of-the-art methods.\n","authors":["Xiaoyu Zhan","Jianxin Yang","Yuanqi Li","Jie Guo","Yanwen Guo","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2403.02561v2.pdf","comment":"Accepted to CVPR 2024. 
Project page:\n https://zhanxy.xyz/projects/shert/"},{"id":"http://arxiv.org/abs/2404.02462v1","updated":"2024-04-03T05:04:55Z","published":"2024-04-03T05:04:55Z","title":"A Unified Membership Inference Method for Visual Self-supervised Encoder\n via Part-aware Capability","summary":" Self-supervised learning shows promise in harnessing extensive unlabeled\ndata, but it also confronts significant privacy concerns, especially in vision.\nIn this paper, we aim to perform membership inference on visual self-supervised\nmodels in a more realistic setting: self-supervised training method and details\nare unknown for an adversary when attacking as he usually faces a black-box\nsystem in practice. In this setting, considering that self-supervised model\ncould be trained by completely different self-supervised paradigms, e.g.,\nmasked image modeling and contrastive learning, with complex training details,\nwe propose a unified membership inference method called PartCrop. It is\nmotivated by the shared part-aware capability among models and stronger part\nresponse on the training data. Specifically, PartCrop crops parts of objects in\nan image to query responses with the image in representation space. We conduct\nextensive attacks on self-supervised models with different training protocols\nand structures using three widely used image datasets. The results verify the\neffectiveness and generalization of PartCrop. Moreover, to defend against\nPartCrop, we evaluate two common approaches, i.e., early stop and differential\nprivacy, and propose a tailored method called shrinking crop scale range. The\ndefense experiments indicate that all of them are effective. Our code is\navailable at https://github.com/JiePKU/PartCrop\n","authors":["Jie Zhu","Jirong Zha","Ding Li","Leye Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02462v1.pdf","comment":"Membership Inference, Self-supervised learning"},{"id":"http://arxiv.org/abs/2404.02460v1","updated":"2024-04-03T05:02:46Z","published":"2024-04-03T05:02:46Z","title":"TSNet:A Two-stage Network for Image Dehazing with Multi-scale Fusion and\n Adaptive Learning","summary":" Image dehazing has been a popular topic of research for a long time. Previous\ndeep learning-based image dehazing methods have failed to achieve satisfactory\ndehazing effects on both synthetic datasets and real-world datasets, exhibiting\npoor generalization. Moreover, single-stage networks often result in many\nregions with artifacts and color distortion in output images. To address these\nissues, this paper proposes a two-stage image dehazing network called TSNet,\nmainly consisting of the multi-scale fusion module (MSFM) and the adaptive\nlearning module (ALM). Specifically, MSFM and ALM enhance the generalization of\nTSNet. The MSFM can obtain large receptive fields at multiple scales and\nintegrate features at different frequencies to reduce the differences between\ninputs and learning objectives. The ALM can actively learn of regions of\ninterest in images and restore texture details more effectively. Additionally,\nTSNet is designed as a two-stage network, where the first-stage network\nperforms image dehazing, and the second-stage network is employed to improve\nissues such as artifacts and color distortion present in the results of the\nfirst-stage network. 
We also change the learning objective from ground truth\nimages to opposite fog maps, which improves the learning efficiency of TSNet.\nExtensive experiments demonstrate that TSNet exhibits superior dehazing\nperformance on both synthetic and real-world datasets compared to previous\nstate-of-the-art methods.\n","authors":["Xiaolin Gong","Zehan Zheng","Heyuan Du"],"pdf_url":"https://arxiv.org/pdf/2404.02460v1.pdf","comment":"12 pages, 10 figures, 7 tables"},{"id":"http://arxiv.org/abs/2404.02457v1","updated":"2024-04-03T04:59:28Z","published":"2024-04-03T04:59:28Z","title":"RS3Mamba: Visual State Space Model for Remote Sensing Images Semantic\n Segmentation","summary":" Semantic segmentation of remote sensing images is a fundamental task in\ngeoscience research. However, there are some significant shortcomings for the\nwidely used convolutional neural networks (CNNs) and Transformers. The former\nis limited by its insufficient long-range modeling capabilities, while the\nlatter is hampered by its computational complexity. Recently, a novel visual\nstate space (VSS) model represented by Mamba has emerged, capable of modeling\nlong-range relationships with linear computability. In this work, we propose a\nnovel dual-branch network named remote sensing images semantic segmentation\nMamba (RS3Mamba) to incorporate this innovative technology into remote sensing\ntasks. Specifically, RS3Mamba utilizes VSS blocks to construct an auxiliary\nbranch, providing additional global information to convolution-based main\nbranch. Moreover, considering the distinct characteristics of the two branches,\nwe introduce a collaborative completion module (CCM) to enhance and fuse\nfeatures from the dual-encoder. Experimental results on two widely used\ndatasets, ISPRS Vaihingen and LoveDA Urban, demonstrate the effectiveness and\npotential of the proposed RS3Mamba. To the best of our knowledge, this is the\nfirst vision Mamba specifically designed for remote sensing images semantic\nsegmentation. The source code will be made available at\nhttps://github.com/sstary/SSRS.\n","authors":["Xianping Ma","Xiaokang Zhang","Man-On Pun"],"pdf_url":"https://arxiv.org/pdf/2404.02457v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.02447v1","updated":"2024-04-03T04:26:50Z","published":"2024-04-03T04:26:50Z","title":"A Novel Approach to Breast Cancer Histopathological Image Classification\n Using Cross-Colour Space Feature Fusion and Quantum-Classical Stack Ensemble\n Method","summary":" Breast cancer classification stands as a pivotal pillar in ensuring timely\ndiagnosis and effective treatment. This study with histopathological images\nunderscores the profound significance of harnessing the synergistic\ncapabilities of colour space ensembling and quantum-classical stacking to\nelevate the precision of breast cancer classification. By delving into the\ndistinct colour spaces of RGB, HSV and CIE L*u*v, the authors initiated a\ncomprehensive investigation guided by advanced methodologies. Employing the\nDenseNet121 architecture for feature extraction the authors have capitalized on\nthe robustness of Random Forest, SVM, QSVC, and VQC classifiers. 
This research\nencompasses a unique feature fusion technique within the colour space ensemble.\nThis approach not only deepens our comprehension of breast cancer\nclassification but also marks a milestone in personalized medical assessment.\nThe amalgamation of quantum and classical classifiers through stacking emerges\nas a potent catalyst, effectively mitigating the inherent constraints of\nindividual classifiers, paving a robust path towards more dependable and\nrefined breast cancer identification. Through rigorous experimentation and\nmeticulous analysis, fusion of colour spaces like RGB with HSV and RGB with CIE\nL*u*v, presents an classification accuracy, nearing the value of unity. This\nunderscores the transformative potential of our approach, where the fusion of\ndiverse colour spaces and the synergy of quantum and classical realms converge\nto establish a new horizon in medical diagnostics. Thus the implications of\nthis research extend across medical disciplines, offering promising avenues for\nadvancing diagnostic accuracy and treatment efficacy.\n","authors":["Sambit Mallick","Snigdha Paul","Anindya Sen"],"pdf_url":"https://arxiv.org/pdf/2404.02447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11994v2","updated":"2024-04-03T04:07:50Z","published":"2023-12-19T09:37:25Z","title":"Optimizing Diffusion Noise Can Serve As Universal Motion Priors","summary":" We propose Diffusion Noise Optimization (DNO), a new method that effectively\nleverages existing motion diffusion models as motion priors for a wide range of\nmotion-related tasks. Instead of training a task-specific diffusion model for\neach new task, DNO operates by optimizing the diffusion latent noise of an\nexisting pre-trained text-to-motion model. Given the corresponding latent noise\nof a human motion, it propagates the gradient from the target criteria defined\non the motion space through the whole denoising process to update the diffusion\nlatent noise. As a result, DNO supports any use cases where criteria can be\ndefined as a function of motion. In particular, we show that, for motion\nediting and control, DNO outperforms existing methods in both achieving the\nobjective and preserving the motion content. DNO accommodates a diverse range\nof editing modes, including changing trajectory, pose, joint locations, or\navoiding newly added obstacles. In addition, DNO is effective in motion\ndenoising and completion, producing smooth and realistic motion from noisy and\npartial inputs. DNO achieves these results at inference time without the need\nfor model retraining, offering great versatility for any defined reward or loss\nfunction on the motion representation.\n","authors":["Korrawe Karunratanakul","Konpat Preechakul","Emre Aksan","Thabo Beeler","Supasorn Suwajanakorn","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2312.11994v2.pdf","comment":"CVPR 2024. Project page: https://korrawe.github.io/dno-project/"},{"id":"http://arxiv.org/abs/2403.11056v2","updated":"2024-04-03T04:00:53Z","published":"2024-03-17T02:06:03Z","title":"Analytic-Splatting: Anti-Aliased 3D Gaussian Splatting via Analytic\n Integration","summary":" The 3D Gaussian Splatting (3DGS) gained its popularity recently by combining\nthe advantages of both primitive-based and volumetric 3D representations,\nresulting in improved quality and efficiency for 3D scene rendering. However,\n3DGS is not alias-free, and its rendering at varying resolutions could produce\nsevere blurring or jaggies. 
This is because 3DGS treats each pixel as an\nisolated, single point rather than as an area, causing insensitivity to changes\nin the footprints of pixels. Consequently, this discrete sampling scheme\ninevitably results in aliasing, owing to the restricted sampling bandwidth. In\nthis paper, we derive an analytical solution to address this issue. More\nspecifically, we use a conditioned logistic function as the analytic\napproximation of the cumulative distribution function (CDF) in a\none-dimensional Gaussian signal and calculate the Gaussian integral by\nsubtracting the CDFs. We then introduce this approximation in the\ntwo-dimensional pixel shading, and present Analytic-Splatting, which\nanalytically approximates the Gaussian integral within the 2D-pixel window area\nto better capture the intensity response of each pixel. Moreover, we use the\napproximated response of the pixel window integral area to participate in the\ntransmittance calculation of volume rendering, making Analytic-Splatting\nsensitive to the changes in pixel footprint at different resolutions.\nExperiments on various datasets validate that our approach has better\nanti-aliasing capability that gives more details and better fidelity.\n","authors":["Zhihao Liang","Qi Zhang","Wenbo Hu","Ying Feng","Lei Zhu","Kui Jia"],"pdf_url":"https://arxiv.org/pdf/2403.11056v2.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2401.13201v2","updated":"2024-04-03T03:52:44Z","published":"2024-01-24T03:07:26Z","title":"MLLMReID: Multimodal Large Language Model-based Person Re-identification","summary":" Multimodal large language models (MLLM) have achieved satisfactory results in\nmany tasks. However, their performance in the task of person re-identification\n(ReID) has not been explored to date. This paper will investigate how to adapt\nthem for the task of ReID. An intuitive idea is to fine-tune MLLM with ReID\nimage-text datasets, and then use their visual encoder as a backbone for ReID.\nHowever, there still exist two apparent issues: (1) Designing instructions for\nReID, MLLMs may overfit specific instructions, and designing a variety of\ninstructions will lead to higher costs. (2) Latent image feature vectors from\nLLMs are not involved in loss computation. Instructional learning, aligning\nimage-text features, results in indirect optimization and a learning objective\nthat inadequately utilizes features, limiting effectiveness in person feature\nlearning. To address these problems, this paper proposes MLLMReID: Multimodal\nLarge Language Model-based ReID. Firstly, we proposed Common Instruction, a\nsimple approach that leverages the essence ability of LLMs to continue writing,\navoiding complex and diverse instruction design. Secondly, we proposed\nDirectReID, which effectively employs the latent image feature vectors of\nimages outputted by LLMs in ReID tasks. The experimental results demonstrate\nthe superiority of our method. We will open-source the code on GitHub.\n","authors":["Shan Yang","Yongfei Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.13201v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02402v3","updated":"2024-04-03T03:45:38Z","published":"2024-01-04T18:39:32Z","title":"3D Open-Vocabulary Panoptic Segmentation with 2D-3D Vision-Language\n Distillation","summary":" 3D panoptic segmentation is a challenging perception task, especially in\nautonomous driving. It aims to predict both semantic and instance annotations\nfor 3D points in a scene. 
Although prior 3D panoptic segmentation approaches\nhave achieved great performance on closed-set benchmarks, generalizing these\napproaches to unseen things and unseen stuff categories remains an open\nproblem. For unseen object categories, 2D open-vocabulary segmentation has\nachieved promising results that solely rely on frozen CLIP backbones and\nensembling multiple classification outputs. However, we find that simply\nextending these 2D models to 3D does not guarantee good performance due to poor\nper-mask classification quality, especially for novel stuff categories. In this\npaper, we propose the first method to tackle 3D open-vocabulary panoptic\nsegmentation. Our model takes advantage of the fusion between learnable LiDAR\nfeatures and dense frozen vision CLIP features, using a single classification\nhead to make predictions for both base and novel classes. To further improve\nthe classification performance on novel classes and leverage the CLIP model, we\npropose two novel loss functions: object-level distillation loss and\nvoxel-level distillation loss. Our experiments on the nuScenes and\nSemanticKITTI datasets show that our method outperforms the strong baseline by\na large margin.\n","authors":["Zihao Xiao","Longlong Jing","Shangxuan Wu","Alex Zihao Zhu","Jingwei Ji","Chiyu Max Jiang","Wei-Chih Hung","Thomas Funkhouser","Weicheng Kuo","Anelia Angelova","Yin Zhou","Shiwei Sheng"],"pdf_url":"https://arxiv.org/pdf/2401.02402v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02424v1","updated":"2024-04-03T03:27:01Z","published":"2024-04-03T03:27:01Z","title":"RESSA: Repair Sparse Vision-Language Models via Sparse Cross-Modality\n Adaptation","summary":" Vision-Language Models (VLMs), integrating diverse information from multiple\nmodalities, have shown remarkable success across various tasks. However,\ndeploying VLMs, comprising large-scale vision and language models poses\nchallenges in resource-constrained scenarios. While pruning followed by\nfinetuning offers a potential solution to maintain performance with smaller\nmodel sizes, its application to VLMs remains relatively unexplored, presenting\ntwo main questions: how to distribute sparsity across different\nmodality-specific models, and how to repair the performance of pruned sparse\nVLMs. To answer the first question, we conducted preliminary studies on VLM\npruning and found that pruning vision models and language models with the same\nsparsity ratios contribute to nearly optimal performance. For the second\nquestion, unlike finetuning unimodal sparse models, sparse VLMs involve\ncross-modality interactions, requiring specialized techniques for post-pruning\nperformance repair. Moreover, while parameter-efficient LoRA finetuning has\nbeen proposed to repair the performance of sparse models, a significant\nchallenge of weights merging arises due to the incompatibility of dense LoRA\nmodules with sparse models that destroy the sparsity of pruned models. To\ntackle these challenges, we propose to Repair Sparse Vision-Language Models via\nSparse Cross-modality Adaptation (RESSA). RESSA utilizes cross-modality\nfinetuning to enhance task-specific performance and facilitate knowledge\ndistillation from original dense models. Additionally, we introduce SparseLoRA,\nwhich applies sparsity directly to LoRA weights, enabling seamless integration\nwith sparse models. 
Our experimental results validate the effectiveness of\nRESSA, showcasing significant enhancements, such as an 11.3\\% improvement under\n2:4 sparsity and a remarkable 47.6\\% enhancement under unstructured 70\\%\nsparsity.\n","authors":["Shwai He","Tianlong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.15472v4","updated":"2024-04-03T03:15:55Z","published":"2022-06-30T17:59:08Z","title":"On-Device Training Under 256KB Memory","summary":" On-device training enables the model to adapt to new data collected from the\nsensors by fine-tuning a pre-trained model. Users can benefit from customized\nAI models without having to transfer the data to the cloud, protecting the\nprivacy. However, the training memory consumption is prohibitive for IoT\ndevices that have tiny memory resources. We propose an algorithm-system\nco-design framework to make on-device training possible with only 256KB of\nmemory. On-device training faces two unique challenges: (1) the quantized\ngraphs of neural networks are hard to optimize due to low bit-precision and the\nlack of normalization; (2) the limited hardware resource does not allow full\nback-propagation. To cope with the optimization difficulty, we propose\nQuantization-Aware Scaling to calibrate the gradient scales and stabilize 8-bit\nquantized training. To reduce the memory footprint, we propose Sparse Update to\nskip the gradient computation of less important layers and sub-tensors. The\nalgorithm innovation is implemented by a lightweight training system, Tiny\nTraining Engine, which prunes the backward computation graph to support sparse\nupdates and offload the runtime auto-differentiation to compile time. Our\nframework is the first solution to enable tiny on-device training of\nconvolutional neural networks under 256KB SRAM and 1MB Flash without auxiliary\nmemory, using less than 1/1000 of the memory of PyTorch and TensorFlow while\nmatching the accuracy on tinyML application VWW. Our study enables IoT devices\nnot only to perform inference but also to continuously adapt to new data for\non-device lifelong learning. A video demo can be found here:\nhttps://youtu.be/0pUFZYdoMY8.\n","authors":["Ji Lin","Ligeng Zhu","Wei-Ming Chen","Wei-Chen Wang","Chuang Gan","Song Han"],"pdf_url":"https://arxiv.org/pdf/2206.15472v4.pdf","comment":"NeurIPS 2022"},{"id":"http://arxiv.org/abs/2403.19428v2","updated":"2024-04-03T02:59:24Z","published":"2024-03-28T13:58:05Z","title":"Burst Super-Resolution with Diffusion Models for Improving Perceptual\n Quality","summary":" While burst LR images are useful for improving the SR image quality compared\nwith a single LR image, prior SR networks accepting the burst LR images are\ntrained in a deterministic manner, which is known to produce a blurry SR image.\nIn addition, it is difficult to perfectly align the burst LR images, making the\nSR image more blurry. Since such blurry images are perceptually degraded, we\naim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity\nimages can be reconstructed by diffusion models. However, prior SR methods\nusing the diffusion model are not properly optimized for the burst SR task.\nSpecifically, the reverse process starting from a random sample is not\noptimized for image enhancement and restoration methods, including burst SR. In\nour proposed method, on the other hand, burst LR features are used to\nreconstruct the initial burst SR image that is fed into an intermediate step in\nthe diffusion model. 
This reverse process from the intermediate step 1) skips\ndiffusion steps for reconstructing the global structure of the image and 2)\nfocuses on steps for refining detailed textures. Our experimental results\ndemonstrate that our method can improve the scores of the perceptual quality\nmetrics. Code: https://github.com/placerkyo/BSRD\n","authors":["Kyotaro Tokoro","Kazutoshi Akita","Norimichi Ukita"],"pdf_url":"https://arxiv.org/pdf/2403.19428v2.pdf","comment":"Accepted to IJCNN 2024 (International Joint Conference on Neural\n Networks)"},{"id":"http://arxiv.org/abs/2403.19920v2","updated":"2024-04-03T02:48:47Z","published":"2024-03-29T02:17:09Z","title":"MI-NeRF: Learning a Single Face NeRF from Multiple Identities","summary":" In this work, we introduce a method that learns a single dynamic neural\nradiance field (NeRF) from monocular talking face videos of multiple\nidentities. NeRFs have shown remarkable results in modeling the 4D dynamics and\nappearance of human faces. However, they require per-identity optimization.\nAlthough recent approaches have proposed techniques to reduce the training and\nrendering time, increasing the number of identities can be expensive. We\nintroduce MI-NeRF (multi-identity NeRF), a single unified network that models\ncomplex non-rigid facial motion for multiple identities, using only monocular\nvideos of arbitrary length. The core premise in our method is to learn the\nnon-linear interactions between identity and non-identity specific information\nwith a multiplicative module. By training on multiple videos simultaneously,\nMI-NeRF not only reduces the total training time compared to standard\nsingle-identity NeRFs, but also demonstrates robustness in synthesizing novel\nexpressions for any input identity. We present results for both facial\nexpression transfer and talking face video synthesis. Our method can be further\npersonalized for a target identity given only a short video.\n","authors":["Aggelina Chatziagapi","Grigorios G. Chrysos","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2403.19920v2.pdf","comment":"Project page: https://aggelinacha.github.io/MI-NeRF/"},{"id":"http://arxiv.org/abs/2403.14530v2","updated":"2024-04-03T02:46:54Z","published":"2024-03-21T16:28:58Z","title":"HAC: Hash-grid Assisted Context for 3D Gaussian Splatting Compression","summary":" 3D Gaussian Splatting (3DGS) has emerged as a promising framework for novel\nview synthesis, boasting rapid rendering speed with high fidelity. However, the\nsubstantial Gaussians and their associated attributes necessitate effective\ncompression techniques. Nevertheless, the sparse and unorganized nature of the\npoint cloud of Gaussians (or anchors in our paper) presents challenges for\ncompression. To address this, we make use of the relations between the\nunorganized anchors and the structured hash grid, leveraging their mutual\ninformation for context modeling, and propose a Hash-grid Assisted Context\n(HAC) framework for highly compact 3DGS representation. Our approach introduces\na binary hash grid to establish continuous spatial consistencies, allowing us\nto unveil the inherent spatial relations of anchors through a carefully\ndesigned context model. To facilitate entropy coding, we utilize Gaussian\ndistributions to accurately estimate the probability of each quantized\nattribute, where an adaptive quantization module is proposed to enable\nhigh-precision quantization of these attributes for improved fidelity\nrestoration. 
Additionally, we incorporate an adaptive masking strategy to\neliminate invalid Gaussians and anchors. Importantly, our work is the pioneer\nto explore context-based compression for 3DGS representation, resulting in a\nremarkable size reduction of over $75\\times$ compared to vanilla 3DGS, while\nsimultaneously improving fidelity, and achieving over $11\\times$ size reduction\nover SOTA 3DGS compression approach Scaffold-GS. Our code is available here:\nhttps://github.com/YihangChen-ee/HAC\n","authors":["Yihang Chen","Qianyi Wu","Jianfei Cai","Mehrtash Harandi","Weiyao Lin"],"pdf_url":"https://arxiv.org/pdf/2403.14530v2.pdf","comment":"Project Page: https://yihangchen-ee.github.io/project_hac/ Code:\n https://github.com/YihangChen-ee/HAC"},{"id":"http://arxiv.org/abs/2404.02415v1","updated":"2024-04-03T02:40:35Z","published":"2024-04-03T02:40:35Z","title":"What Are We Measuring When We Evaluate Large Vision-Language Models? An\n Analysis of Latent Factors and Biases","summary":" Vision-language (VL) models, pretrained on colossal image-text datasets, have\nattained broad VL competence that is difficult to evaluate. A common belief is\nthat a small number of VL skills underlie the variety of VL tests. In this\npaper, we perform a large-scale transfer learning experiment aimed at\ndiscovering latent VL skills from data. We reveal interesting characteristics\nthat have important implications for test suite design. First, generation tasks\nsuffer from a length bias, suggesting benchmarks should balance tasks with\nvarying output lengths. Second, we demonstrate that factor analysis\nsuccessfully identifies reasonable yet surprising VL skill factors, suggesting\nbenchmarks could leverage similar analyses for task selection. Finally, we\npresent a new dataset, OLIVE (https://github.com/jq-zh/olive-dataset), which\nsimulates user instructions in the wild and presents challenges dissimilar to\nall datasets we tested. Our findings contribute to the design of balanced and\nbroad-coverage vision-language evaluation methods.\n","authors":["Anthony Meng Huat Tiong","Junqi Zhao","Boyang Li","Junnan Li","Steven C. H. Hoi","Caiming Xiong"],"pdf_url":"https://arxiv.org/pdf/2404.02415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.07935v2","updated":"2024-04-03T02:27:59Z","published":"2022-04-17T05:42:41Z","title":"Causal Intervention for Subject-Deconfounded Facial Action Unit\n Recognition","summary":" Subject-invariant facial action unit (AU) recognition remains challenging for\nthe reason that the data distribution varies among subjects. In this paper, we\npropose a causal inference framework for subject-invariant facial action unit\nrecognition. To illustrate the causal effect existing in AU recognition task,\nwe formulate the causalities among facial images, subjects, latent AU semantic\nrelations, and estimated AU occurrence probabilities via a structural causal\nmodel. By constructing such a causal diagram, we clarify the causal effect\namong variables and propose a plug-in causal intervention module, CIS, to\ndeconfound the confounder \\emph{Subject} in the causal diagram. 
Extensive\nexperiments conducted on two commonly used AU benchmark datasets, BP4D and\nDISFA, show the effectiveness of our CIS, and the model with CIS inserted,\nCISNet, has achieved state-of-the-art performance.\n","authors":["Yingjie Chen","Diqi Chen","Tao Wang","Yizhou Wang","Yun Liang"],"pdf_url":"https://arxiv.org/pdf/2204.07935v2.pdf","comment":"Accepted by AAAI2022"},{"id":"http://arxiv.org/abs/2404.02410v1","updated":"2024-04-03T02:26:15Z","published":"2024-04-03T02:26:15Z","title":"TCLC-GS: Tightly Coupled LiDAR-Camera Gaussian Splatting for Surrounding\n Autonomous Driving Scenes","summary":" Most 3D Gaussian Splatting (3D-GS) based methods for urban scenes initialize\n3D Gaussians directly with 3D LiDAR points, which not only underutilizes LiDAR\ndata capabilities but also overlooks the potential advantages of fusing LiDAR\nwith camera data. In this paper, we design a novel tightly coupled LiDAR-Camera\nGaussian Splatting (TCLC-GS) to fully leverage the combined strengths of both\nLiDAR and camera sensors, enabling rapid, high-quality 3D reconstruction and\nnovel view RGB/depth synthesis. TCLC-GS designs a hybrid explicit (colorized 3D\nmesh) and implicit (hierarchical octree feature) 3D representation derived from\nLiDAR-camera data, to enrich the properties of 3D Gaussians for splatting. 3D\nGaussian's properties are not only initialized in alignment with the 3D mesh\nwhich provides more completed 3D shape and color information, but are also\nendowed with broader contextual information through retrieved octree implicit\nfeatures. During the Gaussian Splatting optimization process, the 3D mesh\noffers dense depth information as supervision, which enhances the training\nprocess by learning of a robust geometry. Comprehensive evaluations conducted\non the Waymo Open Dataset and nuScenes Dataset validate our method's\nstate-of-the-art (SOTA) performance. Utilizing a single NVIDIA RTX 3090 Ti, our\nmethod demonstrates fast training and achieves real-time RGB and depth\nrendering at 90 FPS in resolution of 1920x1280 (Waymo), and 120 FPS in\nresolution of 1600x900 (nuScenes) in urban scenarios.\n","authors":["Cheng Zhao","Su Sun","Ruoyu Wang","Yuliang Guo","Jun-Jun Wan","Zhou Huang","Xinyu Huang","Yingjie Victor Chen","Liu Ren"],"pdf_url":"https://arxiv.org/pdf/2404.02410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02405v1","updated":"2024-04-03T02:16:30Z","published":"2024-04-03T02:16:30Z","title":"TE-TAD: Towards Full End-to-End Temporal Action Detection via\n Time-Aligned Coordinate Expression","summary":" In this paper, we investigate that the normalized coordinate expression is a\nkey factor as reliance on hand-crafted components in query-based detectors for\ntemporal action detection (TAD). Despite significant advancements towards an\nend-to-end framework in object detection, query-based detectors have been\nlimited in achieving full end-to-end modeling in TAD. To address this issue, we\npropose \\modelname{}, a full end-to-end temporal action detection transformer\nthat integrates time-aligned coordinate expression. We reformulate coordinate\nexpression utilizing actual timeline values, ensuring length-invariant\nrepresentations from the extremely diverse video duration environment.\nFurthermore, our proposed adaptive query selection dynamically adjusts the\nnumber of queries based on video length, providing a suitable solution for\nvarying video durations compared to a fixed query set. 
Our approach not only\nsimplifies the TAD process by eliminating the need for hand-crafted components\nbut also significantly improves the performance of query-based detectors. Our\nTE-TAD outperforms the previous query-based detectors and achieves competitive\nperformance compared to state-of-the-art methods on popular benchmark datasets.\nCode is available at: https://github.com/Dotori-HJ/TE-TAD\n","authors":["Ho-Joong Kim","Jung-Ho Hong","Heejon Kong","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2404.02405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02396v1","updated":"2024-04-03T01:55:15Z","published":"2024-04-03T01:55:15Z","title":"Enhancing Diffusion-based Point Cloud Generation with Smoothness\n Constraint","summary":" Diffusion models have been popular for point cloud generation tasks. Existing\nworks utilize the forward diffusion process to convert the original point\ndistribution into a noise distribution and then learn the reverse diffusion\nprocess to recover the point distribution from the noise distribution. However,\nthe reverse diffusion process can produce samples with non-smooth points on the\nsurface because of the ignorance of the point cloud geometric properties. We\npropose alleviating the problem by incorporating the local smoothness\nconstraint into the diffusion framework for point cloud generation. Experiments\ndemonstrate the proposed model can generate realistic shapes and smoother point\nclouds, outperforming multiple state-of-the-art methods.\n","authors":["Yukun Li","Liping Liu"],"pdf_url":"https://arxiv.org/pdf/2404.02396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02394v1","updated":"2024-04-03T01:36:27Z","published":"2024-04-03T01:36:27Z","title":"Cohort-Individual Cooperative Learning for Multimodal Cancer Survival\n Analysis","summary":" Recently, we have witnessed impressive achievements in cancer survival\nanalysis by integrating multimodal data, e.g., pathology images and genomic\nprofiles. However, the heterogeneity and high dimensionality of these\nmodalities pose significant challenges for extracting discriminative\nrepresentations while maintaining good generalization. In this paper, we\npropose a Cohort-individual Cooperative Learning (CCL) framework to advance\ncancer survival analysis by collaborating knowledge decomposition and cohort\nguidance. Specifically, first, we propose a Multimodal Knowledge Decomposition\n(MKD) module to explicitly decompose multimodal knowledge into four distinct\ncomponents: redundancy, synergy and uniqueness of the two modalities. Such a\ncomprehensive decomposition can enlighten the models to perceive easily\noverlooked yet important information, facilitating an effective multimodal\nfusion. Second, we propose a Cohort Guidance Modeling (CGM) to mitigate the\nrisk of overfitting task-irrelevant information. It can promote a more\ncomprehensive and robust understanding of the underlying multimodal data, while\navoiding the pitfalls of overfitting and enhancing the generalization ability\nof the model. By cooperating the knowledge decomposition and cohort guidance\nmethods, we develop a robust multimodal survival analysis model with enhanced\ndiscrimination and generalization abilities. 
Extensive experimental results on\nfive cancer datasets demonstrate the effectiveness of our model in integrating\nmultimodal data for survival analysis.\n","authors":["Huajun Zhou","Fengtao Zhou","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02394v1.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.02391v1","updated":"2024-04-03T01:29:30Z","published":"2024-04-03T01:29:30Z","title":"APC2Mesh: Bridging the gap from occluded building façades to full 3D\n models","summary":" The benefits of having digital twins of urban buildings are numerous.\nHowever, a major difficulty encountered in their creation from airborne LiDAR\npoint clouds is the effective means of accurately reconstructing significant\nocclusions amidst point density variations and noise. To bridge the\nnoise/sparsity/occlusion gap and generate high fidelity 3D building models, we\npropose APC2Mesh which integrates point completion into a 3D reconstruction\npipeline, enabling the learning of dense geometrically accurate representation\nof buildings. Specifically, we leveraged complete points generated from\noccluded ones as input to a linearized skip attention-based deformation network\nfor 3D mesh reconstruction. In our experiments, conducted on 3 different\nscenes, we demonstrate that: (1) APC2Mesh delivers comparatively superior\nresults, indicating its efficacy in handling the challenges of occluded\nairborne building points of diverse styles and complexities. (2) The\ncombination of point completion with typical deep learning-based 3D point cloud\nreconstruction methods offers a direct and effective solution for\nreconstructing significantly occluded airborne building points. As such, this\nneural integration holds promise for advancing the creation of digital twins\nfor urban buildings with greater accuracy and fidelity.\n","authors":["Perpetual Hope Akwensi","Akshay Bharadwaj","Ruisheng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02391v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2404.02388v1","updated":"2024-04-03T01:13:05Z","published":"2024-04-03T01:13:05Z","title":"CAPE: CAM as a Probabilistic Ensemble for Enhanced DNN Interpretation","summary":" Deep Neural Networks (DNNs) are widely used for visual classification tasks,\nbut their complex computation process and black-box nature hinder decision\ntransparency and interpretability. Class activation maps (CAMs) and recent\nvariants provide ways to visually explain the DNN decision-making process by\ndisplaying 'attention' heatmaps of the DNNs. Nevertheless, the CAM explanation\nonly offers relative attention information, that is, on an attention heatmap,\nwe can interpret which image region is more or less important than the others.\nHowever, these regions cannot be meaningfully compared across classes, and the\ncontribution of each region to the model's class prediction is not revealed. To\naddress these challenges that ultimately lead to better DNN Interpretation, in\nthis paper, we propose CAPE, a novel reformulation of CAM that provides a\nunified and probabilistically meaningful assessment of the contributions of\nimage regions. We quantitatively and qualitatively compare CAPE with\nstate-of-the-art CAM methods on CUB and ImageNet benchmark datasets to\ndemonstrate enhanced interpretability. We also test on a cytology imaging\ndataset depicting a challenging Chronic Myelomonocytic Leukemia (CMML)\ndiagnosis problem. 
Code is available at: https://github.com/AIML-MED/CAPE.\n","authors":["Townim Faisal Chowdhury","Kewen Liao","Vu Minh Hieu Phan","Minh-Son To","Yutong Xie","Kevin Hung","David Ross","Anton van den Hengel","Johan W. Verjans","Zhibin Liao"],"pdf_url":"https://arxiv.org/pdf/2404.02388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02370v1","updated":"2024-04-03T00:09:05Z","published":"2024-04-03T00:09:05Z","title":"Enhancing Human-Computer Interaction in Chest X-ray Analysis using\n Vision and Language Model with Eye Gaze Patterns","summary":" Recent advancements in Computer Assisted Diagnosis have shown promising\nperformance in medical imaging tasks, particularly in chest X-ray analysis.\nHowever, the interaction between these models and radiologists has been\nprimarily limited to input images. This work proposes a novel approach to\nenhance human-computer interaction in chest X-ray analysis using\nVision-Language Models (VLMs) enhanced with radiologists' attention by\nincorporating eye gaze data alongside textual prompts. Our approach leverages\nheatmaps generated from eye gaze data, overlaying them onto medical images to\nhighlight areas of intense radiologist's focus during chest X-ray evaluation.\nWe evaluate this methodology in tasks such as visual question answering, chest\nX-ray report automation, error detection, and differential diagnosis. Our\nresults demonstrate the inclusion of eye gaze information significantly\nenhances the accuracy of chest X-ray analysis. Also, the impact of eye gaze on\nfine-tuning was confirmed as it outperformed other medical VLMs in all tasks\nexcept visual question answering. This work marks the potential of leveraging\nboth the VLM's capabilities and the radiologist's domain knowledge to improve\nthe capabilities of AI models in medical imaging, paving a novel way for\nComputer Assisted Diagnosis with a human-centred AI.\n","authors":["Yunsoo Kim","Jinge Wu","Yusuf Abdulle","Yue Gao","Honghan Wu"],"pdf_url":"https://arxiv.org/pdf/2404.02370v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.03121v1","updated":"2024-04-03T23:59:59Z","published":"2024-04-03T23:59:59Z","title":"Utilizing Computer Vision for Continuous Monitoring of Vaccine Side\n Effects in Experimental Mice","summary":" The demand for improved efficiency and accuracy in vaccine safety assessments\nis increasing. Here, we explore the application of computer vision technologies\nto automate the monitoring of experimental mice for potential side effects\nafter vaccine administration. Traditional observation methods are\nlabor-intensive and lack the capability for continuous monitoring. By deploying\na computer vision system, our research aims to improve the efficiency and\naccuracy of vaccine safety assessments. The methodology involves training\nmachine learning models on annotated video data of mice behaviors pre- and\npost-vaccination. Preliminary results indicate that computer vision effectively\nidentify subtle changes, signaling possible side effects. 
Therefore, our\napproach has the potential to significantly enhance the monitoring process in\nvaccine trials in animals, providing a practical solution to the limitations of\nhuman observation.\n","authors":["Chuang Li","Shuai Shao","Willian Mikason","Rubing Lin","Yantong Liu"],"pdf_url":"https://arxiv.org/pdf/2404.03121v1.pdf","comment":"1 figure"},{"id":"http://arxiv.org/abs/2404.03118v1","updated":"2024-04-03T23:57:34Z","published":"2024-04-03T23:57:34Z","title":"LVLM-Intrepret: An Interpretability Tool for Large Vision-Language\n Models","summary":" In the rapidly evolving landscape of artificial intelligence, multi-modal\nlarge language models are emerging as a significant area of interest. These\nmodels, which combine various forms of data input, are becoming increasingly\npopular. However, understanding their internal mechanisms remains a complex\ntask. Numerous advancements have been made in the field of explainability tools\nand mechanisms, yet there is still much to explore. In this work, we present a\nnovel interactive application aimed towards understanding the internal\nmechanisms of large vision-language models. Our interface is designed to\nenhance the interpretability of the image patches, which are instrumental in\ngenerating an answer, and assess the efficacy of the language model in\ngrounding its output in the image. With our application, a user can\nsystematically investigate the model and uncover system limitations, paving the\nway for enhancements in system capabilities. Finally, we present a case study\nof how our application can aid in understanding failure mechanisms in a popular\nlarge multi-modal model: LLaVA.\n","authors":["Gabriela Ben Melech Stan","Raanan Yehezkel Rohekar","Yaniv Gurwicz","Matthew Lyle Olson","Anahita Bhiwandiwalla","Estelle Aflalo","Chenfei Wu","Nan Duan","Shao-Yen Tseng","Vasudev Lal"],"pdf_url":"https://arxiv.org/pdf/2404.03118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03110v1","updated":"2024-04-03T23:24:25Z","published":"2024-04-03T23:24:25Z","title":"Ego-Motion Aware Target Prediction Module for Robust Multi-Object\n Tracking","summary":" Multi-object tracking (MOT) is a prominent task in computer vision with\napplication in autonomous driving, responsible for the simultaneous tracking of\nmultiple object trajectories. Detection-based multi-object tracking (DBT)\nalgorithms detect objects using an independent object detector and predict the\nimminent location of each target. Conventional prediction methods in DBT\nutilize Kalman Filter(KF) to extrapolate the target location in the upcoming\nframes by supposing a constant velocity motion model. These methods are\nespecially hindered in autonomous driving applications due to dramatic camera\nmotion or unavailable detections. Such limitations lead to tracking failures\nmanifested by numerous identity switches and disrupted trajectories. In this\npaper, we introduce a novel KF-based prediction module called the Ego-motion\nAware Target Prediction (EMAP) module by focusing on the integration of camera\nmotion and depth information with object motion models. Our proposed method\ndecouples the impact of camera rotational and translational velocity from the\nobject trajectories by reformulating the Kalman Filter. This reformulation\nenables us to reject the disturbances caused by camera motion and maximizes the\nreliability of the object motion model. We integrate our module with four\nstate-of-the-art base MOT algorithms, namely OC-SORT, Deep OC-SORT, ByteTrack,\nand BoT-SORT. 
In particular, our evaluation on the KITTI MOT dataset\ndemonstrates that EMAP remarkably drops the number of identity switches (IDSW)\nof OC-SORT and Deep OC-SORT by 73% and 21%, respectively. At the same time, it\nelevates other performance metrics such as HOTA by more than 5%. Our source\ncode is available at https://github.com/noyzzz/EMAP.\n","authors":["Navid Mahdian","Mohammad Jani","Amir M. Soufi Enayati","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2404.03110v1.pdf","comment":"7 pages, 4 figures, submitted to IROS2024"},{"id":"http://arxiv.org/abs/2404.03109v1","updated":"2024-04-03T23:20:40Z","published":"2024-04-03T23:20:40Z","title":"Many-to-many Image Generation with Auto-regressive Diffusion Models","summary":" Recent advancements in image generation have made significant progress, yet\nexisting models present limitations in perceiving and generating an arbitrary\nnumber of interrelated images within a broad context. This limitation becomes\nincreasingly critical as the demand for multi-image scenarios, such as\nmulti-view images and visual narratives, grows with the expansion of multimedia\nplatforms. This paper introduces a domain-general framework for many-to-many\nimage generation, capable of producing interrelated image series from a given\nset of images, offering a scalable solution that obviates the need for\ntask-specific solutions across different multi-image scenarios. To facilitate\nthis, we present MIS, a novel large-scale multi-image dataset, containing 12M\nsynthetic multi-image samples, each with 25 interconnected images. Utilizing\nStable Diffusion with varied latent noises, our method produces a set of\ninterconnected images from a single caption. Leveraging MIS, we learn M2M, an\nautoregressive model for many-to-many generation, where each image is modeled\nwithin a diffusion framework. Throughout training on the synthetic MIS, the\nmodel excels in capturing style and content from preceding images - synthetic\nor real - and generates novel images following the captured patterns.\nFurthermore, through task-specific fine-tuning, our model demonstrates its\nadaptability to various multi-image generation tasks, including Novel View\nSynthesis and Visual Procedure Generation.\n","authors":["Ying Shen","Yizhe Zhang","Shuangfei Zhai","Lifu Huang","Joshua M. Susskind","Jiatao Gu"],"pdf_url":"https://arxiv.org/pdf/2404.03109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03097v1","updated":"2024-04-03T22:38:54Z","published":"2024-04-03T22:38:54Z","title":"SalFoM: Dynamic Saliency Prediction with Video Foundation Models","summary":" Recent advancements in video saliency prediction (VSP) have shown promising\nperformance compared to the human visual system, whose emulation is the primary\ngoal of VSP. However, current state-of-the-art models employ spatio-temporal\ntransformers trained on limited amounts of data, hindering generalizability\nadaptation to downstream tasks. The benefits of vision foundation models\npresent a potential solution to improve the VSP process. However, adapting\nimage foundation models to the video domain presents significant challenges in\nmodeling scene dynamics and capturing temporal information. To address these\nchallenges, and as the first initiative to design a VSP model based on video\nfoundation models, we introduce SalFoM, a novel encoder-decoder video\ntransformer architecture. 
Our model employs UnMasked Teacher (UMT) as feature\nextractor and presents a heterogeneous decoder which features a locality-aware\nspatio-temporal transformer and integrates local and global spatio-temporal\ninformation from various perspectives to produce the final saliency map. Our\nqualitative and quantitative experiments on the challenging VSP benchmark\ndatasets of DHF1K, Hollywood-2 and UCF-Sports demonstrate the superiority of\nour proposed model in comparison with the state-of-the-art methods.\n","authors":["Morteza Moradi","Mohammad Moradi","Francesco Rundo","Concetto Spampinato","Ali Borji","Simone Palazzo"],"pdf_url":"https://arxiv.org/pdf/2404.03097v1.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.02460v2","updated":"2024-04-03T22:23:25Z","published":"2024-01-04T08:39:13Z","title":"Improved Zero-Shot Classification by Adapting VLMs with Text\n Descriptions","summary":" The zero-shot performance of existing vision-language models (VLMs) such as\nCLIP is limited by the availability of large-scale, aligned image and text\ndatasets in specific domains. In this work, we leverage two complementary\nsources of information -- descriptions of categories generated by large\nlanguage models (LLMs) and abundant, fine-grained image classification datasets\n-- to improve the zero-shot classification performance of VLMs across\nfine-grained domains. On the technical side, we develop methods to train VLMs\nwith this \"bag-level\" image-text supervision. We find that simply using these\nattributes at test-time does not improve performance, but our training\nstrategy, for example, on the iNaturalist dataset, leads to an average\nimprovement of 4-5% in zero-shot classification accuracy for novel categories\nof birds and flowers. Similar improvements are observed in domains where a\nsubset of the categories was used to fine-tune the model. By prompting LLMs in\nvarious ways, we generate descriptions that capture visual appearance, habitat,\nand geographic regions and pair them with existing attributes such as the\ntaxonomic structure of the categories. We systematically evaluate their ability\nto improve zero-shot categorization in natural domains. Our findings suggest\nthat geographic priors can be just as effective and are complementary to visual\nappearance. Our method also outperforms prior work on prompt-based tuning of\nVLMs. We release the benchmark, consisting of 14 datasets at\nhttps://github.com/cvl-umass/AdaptCLIPZS , which will contribute to future\nresearch in zero-shot recognition.\n","authors":["Oindrila Saha","Grant Van Horn","Subhransu Maji"],"pdf_url":"https://arxiv.org/pdf/2401.02460v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01988v2","updated":"2024-04-03T21:47:52Z","published":"2024-04-02T14:26:18Z","title":"Cooperative Students: Navigating Unsupervised Domain Adaptation in\n Nighttime Object Detection","summary":" Unsupervised Domain Adaptation (UDA) has shown significant advancements in\nobject detection under well-lit conditions; however, its performance degrades\nnotably in low-visibility scenarios, especially at night, posing challenges not\nonly for its adaptability in low signal-to-noise ratio (SNR) conditions but\nalso for the reliability and efficiency of automated vehicles. 
To address this\nproblem, we propose a \\textbf{Co}operative \\textbf{S}tudents (\\textbf{CoS})\nframework that innovatively employs global-local transformations (GLT) and a\nproxy-based target consistency (PTC) mechanism to capture the spatial\nconsistency in day- and night-time scenarios effectively, and thus bridge the\nsignificant domain shift across contexts. Building upon this, we further devise\nan adaptive IoU-informed thresholding (AIT) module to gradually avoid\noverlooking potential true positives and enrich the latent information in the\ntarget domain. Comprehensive experiments show that CoS essentially enhanced UDA\nperformance in low-visibility conditions and surpasses current state-of-the-art\ntechniques, achieving an increase in mAP of 3.0\\%, 1.9\\%, and 2.5\\% on BDD100K,\nSHIFT, and ACDC datasets, respectively. Code is available at\nhttps://github.com/jichengyuan/Cooperitive_Students.\n","authors":["Jicheng Yuan","Anh Le-Tuan","Manfred Hauswirth","Danh Le-Phuoc"],"pdf_url":"https://arxiv.org/pdf/2404.01988v2.pdf","comment":"Code is available at\n https://github.com/jichengyuan/Cooperitive_Students"},{"id":"http://arxiv.org/abs/2404.03070v1","updated":"2024-04-03T21:18:27Z","published":"2024-04-03T21:18:27Z","title":"Behind the Veil: Enhanced Indoor 3D Scene Reconstruction with Occluded\n Surfaces Completion","summary":" In this paper, we present a novel indoor 3D reconstruction method with\noccluded surface completion, given a sequence of depth readings. Prior\nstate-of-the-art (SOTA) methods only focus on the reconstruction of the visible\nareas in a scene, neglecting the invisible areas due to the occlusions, e.g.,\nthe contact surface between furniture, occluded wall and floor. Our method\ntackles the task of completing the occluded scene surfaces, resulting in a\ncomplete 3D scene mesh. The core idea of our method is learning 3D geometry\nprior from various complete scenes to infer the occluded geometry of an unseen\nscene from solely depth measurements. We design a coarse-fine hierarchical\noctree representation coupled with a dual-decoder architecture, i.e.,\nGeo-decoder and 3D Inpainter, which jointly reconstructs the complete 3D scene\ngeometry. The Geo-decoder with detailed representation at fine levels is\noptimized online for each scene to reconstruct visible surfaces. The 3D\nInpainter with abstract representation at coarse levels is trained offline\nusing various scenes to complete occluded surfaces. As a result, while the\nGeo-decoder is specialized for an individual scene, the 3D Inpainter can be\ngenerally applied across different scenes. We evaluate the proposed method on\nthe 3D Completed Room Scene (3D-CRS) and iTHOR datasets, significantly\noutperforming the SOTA methods by a gain of 16.8% and 24.2% in terms of the\ncompleteness of 3D reconstruction. 3D-CRS dataset including a complete 3D mesh\nof each scene is provided at project webpage.\n","authors":["Su Sun","Cheng Zhao","Yuliang Guo","Ruoyu Wang","Xinyu Huang","Yingjie Victor Chen","Liu Ren"],"pdf_url":"https://arxiv.org/pdf/2404.03070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03067v1","updated":"2024-04-03T21:16:19Z","published":"2024-04-03T21:16:19Z","title":"Self-supervised 6-DoF Robot Grasping by Demonstration via Augmented\n Reality Teleoperation System","summary":" Most existing 6-DoF robot grasping solutions depend on strong supervision on\ngrasp pose to ensure satisfactory performance, which could be laborious and\nimpractical when the robot works in some restricted area. 
To this end, we\npropose a self-supervised 6-DoF grasp pose detection framework via an Augmented\nReality (AR) teleoperation system that can efficiently learn human\ndemonstrations and provide 6-DoF grasp poses without grasp pose annotations.\nSpecifically, the system collects the human demonstration from the AR\nenvironment and contrastively learns the grasping strategy from the\ndemonstration. For the real-world experiment, the proposed system leads to\nsatisfactory grasping abilities and learning to grasp unknown objects within\nthree demonstrations.\n","authors":["Xiwen Dengxiong","Xueting Wang","Shi Bai","Yunbo Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.03067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03043v1","updated":"2024-04-03T20:05:00Z","published":"2024-04-03T20:05:00Z","title":"Linear Anchored Gaussian Mixture Model for Location and Width\n Computation of Objects in Thick Line Shape","summary":" An accurate detection of the centerlines of linear objects is a challenging\ntopic in many sensitive real-world applications such X-ray imaging, remote\nsensing and lane marking detection in road traffic. Model-based approaches\nusing Hough and Radon transforms are often used but, are not recommended for\nthick line detection, whereas approaches based on image derivatives need\nfurther step-by-step processing, making their efficiency dependent on each step\noutcomes. In this paper, we aim to detect linear structures found in images by\nconsidering the 3D representation of the image gray levels as a finite mixture\nmodel of statistical distribution. The latter, which we named linear anchored\nGaussian distribution could be parametrized by a scale value {\\sigma}\ndescribing the linear structure thickness and a line equation, parametrized, in\nturn, by a radius \\r{ho} and an orientation angle {\\theta}, describing the\nlinear structure centerline location. Expectation-Maximization (EM) algorithm\nis used for the mixture model parameter estimation, where a new paradigm, using\nthe background subtraction for the likelihood function computation, is\nproposed. For the EM algorithm, two {\\theta} parameter initialization schemes\nare used: the first one is based on a random choice of the first component of\n{\\theta} vector, whereas the second is based on the image Hessian with a\nsimultaneous computation of the mixture model components number. Experiments on\nreal world images and synthetic images corrupted by blur and additive noise\nshow the good performance of the proposed methods, where the algorithm using\nbackground subtraction and Hessian-based {\\theta} initialization provides an\noutstanding accuracy of the linear structure detection despite irregular image\nbackground and presence of blur and noise.\n","authors":["Nafaa Nacereddine","Djemel Ziou","Aicha Baya Goumeidane"],"pdf_url":"https://arxiv.org/pdf/2404.03043v1.pdf","comment":"13 pages, 13 figures"},{"id":"http://arxiv.org/abs/2311.05698v3","updated":"2024-04-03T20:04:49Z","published":"2023-11-09T19:15:12Z","title":"Mirasol3B: A Multimodal Autoregressive model for time-aligned and\n contextual modalities","summary":" One of the main challenges of multimodal learning is the need to combine\nheterogeneous modalities (e.g., video, audio, text). For example, video and\naudio are obtained at much higher rates than text and are roughly aligned in\ntime. They are often not synchronized with text, which comes as a global\ncontext, e.g., a title, or a description. 
Furthermore, video and audio inputs\nare of much larger volumes, and grow as the video length increases, which\nnaturally requires more compute dedicated to these modalities and makes\nmodeling of long-range dependencies harder.\n We here decouple the multimodal modeling, dividing it into separate, focused\nautoregressive models, processing the inputs according to the characteristics\nof the modalities. We propose a multimodal model, called Mirasol3B, consisting\nof an autoregressive component for the time-synchronized modalities (audio and\nvideo), and an autoregressive component for the context modalities which are\nnot necessarily aligned in time but are still sequential. To address the\nlong-sequences of the video-audio inputs, we propose to further partition the\nvideo and audio sequences in consecutive snippets and autoregressively process\ntheir representations. To that end, we propose a Combiner mechanism, which\nmodels the audio-video information jointly within a timeframe. The Combiner\nlearns to extract audio and video features from raw spatio-temporal signals,\nand then learns to fuse these features producing compact but expressive\nrepresentations per snippet.\n Our approach achieves the state-of-the-art on well established multimodal\nbenchmarks, outperforming much larger models. It effectively addresses the high\ncomputational demand of media inputs by both learning compact representations,\ncontrolling the sequence length of the audio-video feature representations, and\nmodeling their dependencies in time.\n","authors":["AJ Piergiovanni","Isaac Noble","Dahun Kim","Michael S. Ryoo","Victor Gomes","Anelia Angelova"],"pdf_url":"https://arxiv.org/pdf/2311.05698v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03042v1","updated":"2024-04-03T20:04:44Z","published":"2024-04-03T20:04:44Z","title":"AWOL: Analysis WithOut synthesis using Language","summary":" Many classical parametric 3D shape models exist, but creating novel shapes\nwith such models requires expert knowledge of their parameters. For example,\nimagine creating a specific type of tree using procedural graphics or a new\nkind of animal from a statistical shape model. Our key idea is to leverage\nlanguage to control such existing models to produce novel shapes. This involves\nlearning a mapping between the latent space of a vision-language model and the\nparameter space of the 3D model, which we do using a small set of shape and\ntext pairs. Our hypothesis is that mapping from language to parameters allows\nus to generate parameters for objects that were never seen during training. If\nthe mapping between language and parameters is sufficiently smooth, then\ninterpolation or generalization in language should translate appropriately into\nnovel 3D shapes. We test our approach with two very different types of\nparametric shape models (quadrupeds and arboreal trees). We use a learned\nstatistical shape model of quadrupeds and show that we can use text to generate\nnew animals not present during training. In particular, we demonstrate\nstate-of-the-art shape estimation of 3D dogs. This work also constitutes the\nfirst language-driven method for generating 3D trees. Finally, embedding images\nin the CLIP latent space enables us to generate animals and trees directly from\nimages.\n","authors":["Silvia Zuffi","Michael J. 
Black"],"pdf_url":"https://arxiv.org/pdf/2404.03042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02132v2","updated":"2024-04-03T19:45:02Z","published":"2024-04-02T17:40:29Z","title":"ViTamin: Designing Scalable Vision Models in the Vision-Language Era","summary":" Recent breakthroughs in vision-language models (VLMs) start a new page in the\nvision community. The VLMs provide stronger and more generalizable feature\nembeddings compared to those from ImageNet-pretrained models, thanks to the\ntraining on the large-scale Internet image-text pairs. However, despite the\namazing achievement from the VLMs, vanilla Vision Transformers (ViTs) remain\nthe default choice for the image encoder. Although pure transformer proves its\neffectiveness in the text encoding area, it remains questionable whether it is\nalso the case for image encoding, especially considering that various types of\nnetworks are proposed on the ImageNet benchmark, which, unfortunately, are\nrarely studied in VLMs. Due to small data/model scale, the original conclusions\nof model design on ImageNet can be limited and biased. In this paper, we aim at\nbuilding an evaluation protocol of vision models in the vision-language era\nunder the contrastive language-image pretraining (CLIP) framework. We provide a\ncomprehensive way to benchmark different vision models, covering their\nzero-shot performance and scalability in both model and training data sizes. To\nthis end, we introduce ViTamin, a new vision models tailored for VLMs.\nViTamin-L significantly outperforms ViT-L by 2.0% ImageNet zero-shot accuracy,\nwhen using the same publicly available DataComp-1B dataset and the same\nOpenCLIP training scheme. ViTamin-L presents promising results on 60 diverse\nbenchmarks, including classification, retrieval, open-vocabulary detection and\nsegmentation, and large multi-modal models. When further scaling up the model\nsize, our ViTamin-XL with only 436M parameters attains 82.9% ImageNet zero-shot\naccuracy, surpassing 82.0% achieved by EVA-E that has ten times more parameters\n(4.4B).\n","authors":["Jieneng Chen","Qihang Yu","Xiaohui Shen","Alan Yuille","Liang-Chieh Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02132v2.pdf","comment":"CVPR 2024; https://github.com/Beckschen/ViTamin"},{"id":"http://arxiv.org/abs/2404.03022v1","updated":"2024-04-03T19:17:43Z","published":"2024-04-03T19:17:43Z","title":"BCAmirs at SemEval-2024 Task 4: Beyond Words: A Multimodal and\n Multilingual Exploration of Persuasion in Memes","summary":" Memes, combining text and images, frequently use metaphors to convey\npersuasive messages, shaping public opinion. Motivated by this, our team\nengaged in SemEval-2024 Task 4, a hierarchical multi-label classification task\ndesigned to identify rhetorical and psychological persuasion techniques\nembedded within memes. To tackle this problem, we introduced a caption\ngeneration step to assess the modality gap and the impact of additional\nsemantic information from images, which improved our result. Our best model\nutilizes GPT-4 generated captions alongside meme text to fine-tune RoBERTa as\nthe text encoder and CLIP as the image encoder. It outperforms the baseline by\na large margin in all 12 subtasks. In particular, it ranked in top-3 across all\nlanguages in Subtask 2a, and top-4 in Subtask 2b, demonstrating quantitatively\nstrong performance. The improvement achieved by the introduced intermediate\nstep is likely attributable to the metaphorical essence of images that\nchallenges visual encoders. 
This highlights the potential for improving\nabstract visual semantics encoding.\n","authors":["Amirhossein Abaskohi","Amirhossein Dabiriaghdam","Lele Wang","Giuseppe Carenini"],"pdf_url":"https://arxiv.org/pdf/2404.03022v1.pdf","comment":"11 pages, 5 tables, 2 figures, Proceedings of the 18th International\n Workshop on Semantic Evaluation (SemEval-2024) @ NAACL 2024"},{"id":"http://arxiv.org/abs/2306.08103v4","updated":"2024-04-03T19:16:02Z","published":"2023-06-13T19:48:56Z","title":"Generating Images with 3D Annotations Using Diffusion Models","summary":" Diffusion models have emerged as a powerful generative method, capable of\nproducing stunning photo-realistic images from natural language descriptions.\nHowever, these models lack explicit control over the 3D structure in the\ngenerated images. Consequently, this hinders our ability to obtain detailed 3D\nannotations for the generated images or to craft instances with specific poses\nand distances. In this paper, we propose 3D Diffusion Style Transfer (3D-DST),\nwhich incorporates 3D geometry control into diffusion models. Our method\nexploits ControlNet, which extends diffusion models by using visual prompts in\naddition to text prompts. We generate images of the 3D objects taken from 3D\nshape repositories (e.g., ShapeNet and Objaverse), render them from a variety\nof poses and viewing directions, compute the edge maps of the rendered images,\nand use these edge maps as visual prompts to generate realistic images. With\nexplicit 3D geometry control, we can easily change the 3D structures of the\nobjects in the generated images and obtain ground-truth 3D annotations\nautomatically. This allows us to improve a wide range of vision tasks, e.g.,\nclassification and 3D pose estimation, in both in-distribution (ID) and\nout-of-distribution (OOD) settings. We demonstrate the effectiveness of our\nmethod through extensive experiments on ImageNet-100/200, ImageNet-R,\nPASCAL3D+, ObjectNet3D, and OOD-CV. The results show that our method\nsignificantly outperforms existing methods, e.g., 3.8 percentage points on\nImageNet-100 using DeiT-B.\n","authors":["Wufei Ma","Qihao Liu","Jiahao Wang","Angtian Wang","Xiaoding Yuan","Yi Zhang","Zihao Xiao","Guofeng Zhang","Beijia Lu","Ruxiao Duan","Yongrui Qi","Adam Kortylewski","Yaoyao Liu","Alan Yuille"],"pdf_url":"https://arxiv.org/pdf/2306.08103v4.pdf","comment":"ICLR 2024 Spotlight. Code: https://ccvl.jhu.edu/3D-DST/"},{"id":"http://arxiv.org/abs/2404.03015v1","updated":"2024-04-03T18:54:27Z","published":"2024-04-03T18:54:27Z","title":"DPFT: Dual Perspective Fusion Transformer for Camera-Radar-based Object\n Detection","summary":" The perception of autonomous vehicles has to be efficient, robust, and\ncost-effective. However, cameras are not robust against severe weather\nconditions, lidar sensors are expensive, and the performance of radar-based\nperception is still inferior to the others. Camera-radar fusion methods have\nbeen proposed to address this issue, but these are constrained by the typical\nsparsity of radar point clouds and often designed for radars without elevation\ninformation. 
We propose a novel camera-radar fusion approach called Dual\nPerspective Fusion Transformer (DPFT), designed to overcome these limitations.\nOur method leverages lower-level radar data (the radar cube) instead of the\nprocessed point clouds to preserve as much information as possible and employs\nprojections in both the camera and ground planes to effectively use radars with\nelevation information and simplify the fusion with camera data. As a result,\nDPFT has demonstrated state-of-the-art performance on the K-Radar dataset while\nshowing remarkable robustness against adverse weather conditions and\nmaintaining a low inference time. The code is made available as open-source\nsoftware under https://github.com/TUMFTM/DPFT.\n","authors":["Felix Fent","Andras Palffy","Holger Caesar"],"pdf_url":"https://arxiv.org/pdf/2404.03015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03010v1","updated":"2024-04-03T18:42:19Z","published":"2024-04-03T18:42:19Z","title":"Skeleton Recall Loss for Connectivity Conserving and Resource Efficient\n Segmentation of Thin Tubular Structures","summary":" Accurately segmenting thin tubular structures, such as vessels, nerves, roads\nor concrete cracks, is a crucial task in computer vision. Standard deep\nlearning-based segmentation loss functions, such as Dice or Cross-Entropy,\nfocus on volumetric overlap, often at the expense of preserving structural\nconnectivity or topology. This can lead to segmentation errors that adversely\naffect downstream tasks, including flow calculation, navigation, and structural\ninspection. Although current topology-focused losses mark an improvement, they\nintroduce significant computational and memory overheads. This is particularly\nrelevant for 3D data, rendering these losses infeasible for larger volumes as\nwell as increasingly important multi-class segmentation problems. To mitigate\nthis, we propose a novel Skeleton Recall Loss, which effectively addresses\nthese challenges by circumventing intensive GPU-based calculations with\ninexpensive CPU operations. It demonstrates overall superior performance to\ncurrent state-of-the-art approaches on five public datasets for\ntopology-preserving segmentation, while substantially reducing computational\noverheads by more than 90%. In doing so, we introduce the first multi-class\ncapable loss function for thin structure segmentation, excelling in both\nefficiency and efficacy for topology-preservation.\n","authors":["Yannick Kirchhoff","Maximilian R. Rokuss","Saikat Roy","Balint Kovacs","Constantin Ulrich","Tassilo Wald","Maximilian Zenk","Philipp Vollmuth","Jens Kleesiek","Fabian Isensee","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2404.03010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02999v1","updated":"2024-04-03T18:40:48Z","published":"2024-04-03T18:40:48Z","title":"MeshBrush: Painting the Anatomical Mesh with Neural Stylization for\n Endoscopy","summary":" Style transfer is a promising approach to close the sim-to-real gap in\nmedical endoscopy. Rendering realistic endoscopic videos by traversing\npre-operative scans (such as MRI or CT) can generate realistic simulations as\nwell as ground truth camera poses and depth maps. Although image-to-image (I2I)\ntranslation models such as CycleGAN perform well, they are unsuitable for\nvideo-to-video synthesis due to the lack of temporal consistency, resulting in\nartifacts between frames. 
We propose MeshBrush, a neural mesh stylization\nmethod to synthesize temporally consistent videos with differentiable\nrendering. MeshBrush uses the underlying geometry of patient imaging data while\nleveraging existing I2I methods. With learned per-vertex textures, the stylized\nmesh guarantees consistency while producing high-fidelity outputs. We\ndemonstrate that mesh stylization is a promising approach for creating\nrealistic simulations for downstream tasks such as training and preoperative\nplanning. Although our method is tested and designed for ureteroscopy, its\ncomponents are transferable to general endoscopic and laparoscopic procedures.\n","authors":["John J. Han","Ayberk Acar","Nicholas Kavoussi","Jie Ying Wu"],"pdf_url":"https://arxiv.org/pdf/2404.02999v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.02990v1","updated":"2024-04-03T18:20:41Z","published":"2024-04-03T18:20:41Z","title":"ASAP: Interpretable Analysis and Summarization of AI-generated Image\n Patterns at Scale","summary":" Generative image models have emerged as a promising technology to produce\nrealistic images. Despite potential benefits, concerns grow about its misuse,\nparticularly in generating deceptive images that could raise significant\nethical, legal, and societal issues. Consequently, there is growing demand to\nempower users to effectively discern and comprehend patterns of AI-generated\nimages. To this end, we developed ASAP, an interactive visualization system\nthat automatically extracts distinct patterns of AI-generated images and allows\nusers to interactively explore them via various views. To uncover fake\npatterns, ASAP introduces a novel image encoder, adapted from CLIP, which\ntransforms images into compact \"distilled\" representations, enriched with\ninformation for differentiating authentic and fake images. These\nrepresentations generate gradients that propagate back to the attention maps of\nCLIP's transformer block. This process quantifies the relative importance of\neach pixel to image authenticity or fakeness, exposing key deceptive patterns.\nASAP enables the at scale interactive analysis of these patterns through\nmultiple, coordinated visualizations. This includes a representation overview\nwith innovative cell glyphs to aid in the exploration and qualitative\nevaluation of fake patterns across a vast array of images, as well as a pattern\nview that displays authenticity-indicating patterns in images and quantifies\ntheir impact. ASAP supports the analysis of cutting-edge generative models with\nthe latest architectures, including GAN-based models like proGAN and diffusion\nmodels like the latent diffusion model. We demonstrate ASAP's usefulness\nthrough two usage scenarios using multiple fake image detection benchmark\ndatasets, revealing its ability to identify and understand hidden patterns in\nAI-generated images, especially in detecting fake human faces produced by\ndiffusion-based techniques.\n","authors":["Jinbin Huang","Chen Chen","Aditi Mishra","Bum Chul Kwon","Zhicheng Liu","Chris Bryan"],"pdf_url":"https://arxiv.org/pdf/2404.02990v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.01734v2","updated":"2024-04-03T18:11:54Z","published":"2023-12-04T08:55:46Z","title":"Effective Adapter for Face Recognition in the Wild","summary":" In this paper, we tackle the challenge of face recognition in the wild, where\nimages often suffer from low quality and real-world distortions. 
Traditional\nheuristic approaches-either training models directly on these degraded images\nor their enhanced counterparts using face restoration techniques-have proven\nineffective, primarily due to the degradation of facial features and the\ndiscrepancy in image domains. To overcome these issues, we propose an effective\nadapter for augmenting existing face recognition models trained on high-quality\nfacial datasets. The key of our adapter is to process both the unrefined and\nenhanced images using two similar structures, one fixed and the other\ntrainable. Such design can confer two benefits. First, the dual-input system\nminimizes the domain gap while providing varied perspectives for the face\nrecognition model, where the enhanced image can be regarded as a complex\nnon-linear transformation of the original one by the restoration model. Second,\nboth two similar structures can be initialized by the pre-trained models\nwithout dropping the past knowledge. The extensive experiments in zero-shot\nsettings show the effectiveness of our method by surpassing baselines of about\n3%, 4%, and 7% in three datasets. Our code will be publicly available.\n","authors":["Yunhao Liu","Yu-Ju Tsai","Kelvin C. K. Chan","Xiangtai Li","Lu Qi","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2312.01734v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02973v1","updated":"2024-04-03T18:00:36Z","published":"2024-04-03T18:00:36Z","title":"Scaling Laws for Galaxy Images","summary":" We present the first systematic investigation of supervised scaling laws\noutside of an ImageNet-like context - on images of galaxies. We use 840k galaxy\nimages and over 100M annotations by Galaxy Zoo volunteers, comparable in scale\nto Imagenet-1K. We find that adding annotated galaxy images provides a power\nlaw improvement in performance across all architectures and all tasks, while\nadding trainable parameters is effective only for some (typically more\nsubjectively challenging) tasks. We then compare the downstream performance of\nfinetuned models pretrained on either ImageNet-12k alone vs. additionally\npretrained on our galaxy images. We achieve an average relative error rate\nreduction of 31% across 5 downstream tasks of scientific interest. Our\nfinetuned models are more label-efficient and, unlike their\nImageNet-12k-pretrained equivalents, often achieve linear transfer performance\nequal to that of end-to-end finetuning. We find relatively modest additional\ndownstream benefits from scaling model size, implying that scaling alone is not\nsufficient to address our domain gap, and suggest that practitioners with\nqualitatively different images might benefit more from in-domain adaption\nfollowed by targeted downstream labelling.\n","authors":["Mike Walmsley","Micah Bowles","Anna M. M. Scaife","Jason Shingirai Makechemu","Alexander J. Gordon","Annette M. N. Ferguson","Robert G. Mann","James Pearson","Jürgen J. Popp","Jo Bovy","Josh Speagle","Hugh Dickinson","Lucy Fortson","Tobias Géron","Sandor Kruk","Chris J. Lintott","Kameswara Mantha","Devina Mohan","David O'Ryan","Inigo V. Slijepevic"],"pdf_url":"https://arxiv.org/pdf/2404.02973v1.pdf","comment":"10+6 pages, 12 figures. Appendix C2 based on arxiv:2206.11927. 
Code,\n demos, documentation at https://github.com/mwalmsley/zoobot"},{"id":"http://arxiv.org/abs/2402.13729v4","updated":"2024-04-03T11:03:35Z","published":"2024-02-21T11:46:16Z","title":"Hybrid Video Diffusion Models with 2D Triplane and 3D Wavelet\n Representation","summary":" Generating high-quality videos that synthesize desired realistic content is a\nchallenging task due to their intricate high-dimensionality and complexity of\nvideos. Several recent diffusion-based methods have shown comparable\nperformance by compressing videos to a lower-dimensional latent space, using\ntraditional video autoencoder architecture. However, such method that employ\nstandard frame-wise 2D and 3D convolution fail to fully exploit the\nspatio-temporal nature of videos. To address this issue, we propose a novel\nhybrid video diffusion model, called HVDM, which can capture spatio-temporal\ndependencies more effectively. The HVDM is trained by a hybrid video\nautoencoder which extracts a disentangled representation of the video\nincluding: (i) a global context information captured by a 2D projected latent\n(ii) a local volume information captured by 3D convolutions with wavelet\ndecomposition (iii) a frequency information for improving the video\nreconstruction. Based on this disentangled representation, our hybrid\nautoencoder provide a more comprehensive video latent enriching the generated\nvideos with fine structures and details. Experiments on video generation\nbenchamarks (UCF101, SkyTimelapse, and TaiChi) demonstrate that the proposed\napproach achieves state-of-the-art video generation quality, showing a wide\nrange of video applications (e.g., long video generation, image-to-video, and\nvideo dynamics control).\n","authors":["Kihong Kim","Haneol Lee","Jihye Park","Seyeon Kim","Kwanghee Lee","Seungryong Kim","Jaejun Yoo"],"pdf_url":"https://arxiv.org/pdf/2402.13729v4.pdf","comment":"Project page is available at https://hxngiee.github.io/HVDM/"},{"id":"http://arxiv.org/abs/2110.15352v2","updated":"2024-04-03T03:12:53Z","published":"2021-10-28T17:58:45Z","title":"MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning","summary":" Tiny deep learning on microcontroller units (MCUs) is challenging due to the\nlimited memory size. We find that the memory bottleneck is due to the\nimbalanced memory distribution in convolutional neural network (CNN) designs:\nthe first several blocks have an order of magnitude larger memory usage than\nthe rest of the network. To alleviate this issue, we propose a generic\npatch-by-patch inference scheduling, which operates only on a small spatial\nregion of the feature map and significantly cuts down the peak memory. However,\nnaive implementation brings overlapping patches and computation overhead. We\nfurther propose network redistribution to shift the receptive field and FLOPs\nto the later stage and reduce the computation overhead. Manually redistributing\nthe receptive field is difficult. We automate the process with neural\narchitecture search to jointly optimize the neural architecture and inference\nscheduling, leading to MCUNetV2. Patch-based inference effectively reduces the\npeak memory usage of existing networks by 4-8x. Co-designed with neural\nnetworks, MCUNetV2 sets a record ImageNet accuracy on MCU (71.8%), and achieves\n>90% accuracy on the visual wake words dataset under only 32kB SRAM. MCUNetV2\nalso unblocks object detection on tiny devices, achieving 16.9% higher mAP on\nPascal VOC compared to the state-of-the-art result. 
Our study largely addressed\nthe memory bottleneck in tinyML and paved the way for various vision\napplications beyond image classification.\n","authors":["Ji Lin","Wei-Ming Chen","Han Cai","Chuang Gan","Song Han"],"pdf_url":"https://arxiv.org/pdf/2110.15352v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04283v1","updated":"2024-04-03T17:48:31Z","published":"2024-04-03T17:48:31Z","title":"Translation-based Video-to-Video Synthesis","summary":" Translation-based Video Synthesis (TVS) has emerged as a vital research area\nin computer vision, aiming to facilitate the transformation of videos between\ndistinct domains while preserving both temporal continuity and underlying\ncontent features. This technique has found wide-ranging applications,\nencompassing video super-resolution, colorization, segmentation, and more, by\nextending the capabilities of traditional image-to-image translation to the\ntemporal domain. One of the principal challenges faced in TVS is the inherent\nrisk of introducing flickering artifacts and inconsistencies between frames\nduring the synthesis process. This is particularly challenging due to the\nnecessity of ensuring smooth and coherent transitions between video frames.\nEfforts to tackle this challenge have induced the creation of diverse\nstrategies and algorithms aimed at mitigating these unwanted consequences. This\ncomprehensive review extensively examines the latest progress in the realm of\nTVS. It thoroughly investigates emerging methodologies, shedding light on the\nfundamental concepts and mechanisms utilized for proficient video synthesis.\nThis survey also illuminates their inherent strengths, limitations, appropriate\napplications, and potential avenues for future development.\n","authors":["Pratim Saha","Chengcui Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04283v1.pdf","comment":"25 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.05558v1","updated":"2024-04-03T03:28:04Z","published":"2024-04-03T03:28:04Z","title":"JDEC: JPEG Decoding via Enhanced Continuous Cosine Coefficients","summary":" We propose a practical approach to JPEG image decoding, utilizing a local\nimplicit neural representation with continuous cosine formulation. The JPEG\nalgorithm significantly quantizes discrete cosine transform (DCT) spectra to\nachieve a high compression rate, inevitably resulting in quality degradation\nwhile encoding an image. We have designed a continuous cosine spectrum\nestimator to address the quality degradation issue that restores the distorted\nspectrum. By leveraging local DCT formulations, our network has the privilege\nto exploit dequantization and upsampling simultaneously. Our proposed model\nenables decoding compressed images directly across different quality factors\nusing a single pre-trained model without relying on a conventional JPEG\ndecoder. As a result, our proposed network achieves state-of-the-art\nperformance in flexible color image JPEG artifact removal tasks. 
Our source\ncode is available at https://github.com/WooKyoungHan/JDEC.\n","authors":["Woo Kyoung Han","Sunghoon Im","Jaedeok Kim","Kyong Hwan Jin"],"pdf_url":"https://arxiv.org/pdf/2404.05558v1.pdf","comment":null}]},"2024-04-04T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.03658v1","updated":"2024-04-04T17:59:59Z","published":"2024-04-04T17:59:59Z","title":"Know Your Neighbors: Improving Single-View Reconstruction via Spatial\n Vision-Language Reasoning","summary":" Recovering the 3D scene geometry from a single view is a fundamental yet\nill-posed problem in computer vision. While classical depth estimation methods\ninfer only a 2.5D scene representation limited to the image plane, recent\napproaches based on radiance fields reconstruct a full 3D representation.\nHowever, these methods still struggle with occluded regions since inferring\ngeometry without visual observation requires (i) semantic knowledge of the\nsurroundings, and (ii) reasoning about spatial context. We propose KYN, a novel\nmethod for single-view scene reconstruction that reasons about semantic and\nspatial context to predict each point's density. We introduce a vision-language\nmodulation module to enrich point features with fine-grained semantic\ninformation. We aggregate point representations across the scene through a\nlanguage-guided spatial attention mechanism to yield per-point density\npredictions aware of the 3D semantic context. We show that KYN improves 3D\nshape recovery compared to predicting density for each 3D point in isolation.\nWe achieve state-of-the-art results in scene and object reconstruction on\nKITTI-360, and show improved zero-shot generalization compared to prior work.\nProject page: https://ruili3.github.io/kyn.\n","authors":["Rui Li","Tobias Fischer","Mattia Segu","Marc Pollefeys","Luc Van Gool","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2404.03658v1.pdf","comment":"CVPR 2024. Project page: https://ruili3.github.io/kyn"},{"id":"http://arxiv.org/abs/2404.03657v1","updated":"2024-04-04T17:59:58Z","published":"2024-04-04T17:59:58Z","title":"OW-VISCap: Open-World Video Instance Segmentation and Captioning","summary":" Open-world video instance segmentation is an important video understanding\ntask. Yet most methods either operate in a closed-world setting, require an\nadditional user-input, or use classic region-based proposals to identify never\nbefore seen objects. Further, these methods only assign a one-word label to\ndetected objects, and don't generate rich object-centric descriptions. They\nalso often suffer from highly overlapping predictions. To address these issues,\nwe propose Open-World Video Instance Segmentation and Captioning (OW-VISCap),\nan approach to jointly segment, track, and caption previously seen or unseen\nobjects in a video. For this, we introduce open-world object queries to\ndiscover never before seen objects without additional user-input. We generate\nrich and descriptive object-centric captions for each detected object via a\nmasked attention augmented LLM input. We introduce an inter-query contrastive\nloss to ensure that the object queries differ from one another. Our generalized\napproach matches or surpasses state-of-the-art on three tasks: open-world video\ninstance segmentation on the BURST dataset, dense video object captioning on\nthe VidSTG dataset, and closed-world video instance segmentation on the OVIS\ndataset.\n","authors":["Anwesa Choudhuri","Girish Chowdhary","Alexander G. 
Schwing"],"pdf_url":"https://arxiv.org/pdf/2404.03657v1.pdf","comment":"Project page: https://anwesachoudhuri.github.io/OpenWorldVISCap/"},{"id":"http://arxiv.org/abs/2404.03656v1","updated":"2024-04-04T17:59:57Z","published":"2024-04-04T17:59:57Z","title":"MVD-Fusion: Single-view 3D via Depth-consistent Multi-view Generation","summary":" We present MVD-Fusion: a method for single-view 3D inference via generative\nmodeling of multi-view-consistent RGB-D images. While recent methods pursuing\n3D inference advocate learning novel-view generative models, these generations\nare not 3D-consistent and require a distillation process to generate a 3D\noutput. We instead cast the task of 3D inference as directly generating\nmutually-consistent multiple views and build on the insight that additionally\ninferring depth can provide a mechanism for enforcing this consistency.\nSpecifically, we train a denoising diffusion model to generate multi-view RGB-D\nimages given a single RGB input image and leverage the (intermediate noisy)\ndepth estimates to obtain reprojection-based conditioning to maintain\nmulti-view consistency. We train our model using large-scale synthetic dataset\nObajverse as well as the real-world CO3D dataset comprising of generic camera\nviewpoints. We demonstrate that our approach can yield more accurate synthesis\ncompared to recent state-of-the-art, including distillation-based 3D inference\nand prior multi-view generation methods. We also evaluate the geometry induced\nby our multi-view depth prediction and find that it yields a more accurate\nrepresentation than other direct 3D inference approaches.\n","authors":["Hanzhe Hu","Zhizhuo Zhou","Varun Jampani","Shubham Tulsiani"],"pdf_url":"https://arxiv.org/pdf/2404.03656v1.pdf","comment":"Project page: https://mvd-fusion.github.io/"},{"id":"http://arxiv.org/abs/2404.03654v1","updated":"2024-04-04T17:59:50Z","published":"2024-04-04T17:59:50Z","title":"RaFE: Generative Radiance Fields Restoration","summary":" NeRF (Neural Radiance Fields) has demonstrated tremendous potential in novel\nview synthesis and 3D reconstruction, but its performance is sensitive to input\nimage quality, which struggles to achieve high-fidelity rendering when provided\nwith low-quality sparse input viewpoints. Previous methods for NeRF restoration\nare tailored for specific degradation type, ignoring the generality of\nrestoration. To overcome this limitation, we propose a generic radiance fields\nrestoration pipeline, named RaFE, which applies to various types of\ndegradations, such as low resolution, blurriness, noise, compression artifacts,\nor their combinations. Our approach leverages the success of off-the-shelf 2D\nrestoration methods to recover the multi-view images individually. Instead of\nreconstructing a blurred NeRF by averaging inconsistencies, we introduce a\nnovel approach using Generative Adversarial Networks (GANs) for NeRF generation\nto better accommodate the geometric and appearance inconsistencies present in\nthe multi-view images. Specifically, we adopt a two-level tri-plane\narchitecture, where the coarse level remains fixed to represent the low-quality\nNeRF, and a fine-level residual tri-plane to be added to the coarse level is\nmodeled as a distribution with GAN to capture potential variations in\nrestoration. 
We validate RaFE on both synthetic and real cases for various\nrestoration tasks, demonstrating superior performance in both quantitative and\nqualitative evaluations, surpassing other 3D restoration methods specific to\nsingle task. Please see our project website\nhttps://zkaiwu.github.io/RaFE-Project/.\n","authors":["Zhongkai Wu","Ziyu Wan","Jing Zhang","Jing Liao","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2404.03654v1.pdf","comment":"Project Page: https://zkaiwu.github.io/RaFE-Project/"},{"id":"http://arxiv.org/abs/2404.03653v1","updated":"2024-04-04T17:59:46Z","published":"2024-04-04T17:59:46Z","title":"CoMat: Aligning Text-to-Image Diffusion Model with Image-to-Text Concept\n Matching","summary":" Diffusion models have demonstrated great success in the field of\ntext-to-image generation. However, alleviating the misalignment between the\ntext prompts and images is still challenging. The root reason behind the\nmisalignment has not been extensively investigated. We observe that the\nmisalignment is caused by inadequate token attention activation. We further\nattribute this phenomenon to the diffusion model's insufficient condition\nutilization, which is caused by its training paradigm. To address the issue, we\npropose CoMat, an end-to-end diffusion model fine-tuning strategy with an\nimage-to-text concept matching mechanism. We leverage an image captioning model\nto measure image-to-text alignment and guide the diffusion model to revisit\nignored tokens. A novel attribute concentration module is also proposed to\naddress the attribute binding problem. Without any image or human preference\ndata, we use only 20K text prompts to fine-tune SDXL to obtain CoMat-SDXL.\nExtensive experiments show that CoMat-SDXL significantly outperforms the\nbaseline model SDXL in two text-to-image alignment benchmarks and achieves\nstart-of-the-art performance.\n","authors":["Dongzhi Jiang","Guanglu Song","Xiaoshi Wu","Renrui Zhang","Dazhong Shen","Zhuofan Zong","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.03653v1.pdf","comment":"Project Page: https://caraj7.github.io/comat"},{"id":"http://arxiv.org/abs/2404.03652v1","updated":"2024-04-04T17:59:40Z","published":"2024-04-04T17:59:40Z","title":"The More You See in 2D, the More You Perceive in 3D","summary":" Humans can infer 3D structure from 2D images of an object based on past\nexperience and improve their 3D understanding as they see more images. Inspired\nby this behavior, we introduce SAP3D, a system for 3D reconstruction and novel\nview synthesis from an arbitrary number of unposed images. Given a few unposed\nimages of an object, we adapt a pre-trained view-conditioned diffusion model\ntogether with the camera poses of the images via test-time fine-tuning. The\nadapted diffusion model and the obtained camera poses are then utilized as\ninstance-specific priors for 3D reconstruction and novel view synthesis. We\nshow that as the number of input images increases, the performance of our\napproach improves, bridging the gap between optimization-based prior-less 3D\nreconstruction methods and single-image-to-3D diffusion-based methods. 
We\ndemonstrate our system on real images as well as standard synthetic benchmarks.\nOur ablation studies confirm that this adaption behavior is key for more\naccurate 3D understanding.\n","authors":["Xinyang Han","Zelin Gao","Angjoo Kanazawa","Shubham Goel","Yossi Gandelsman"],"pdf_url":"https://arxiv.org/pdf/2404.03652v1.pdf","comment":"Project page: https://sap3d.github.io/"},{"id":"http://arxiv.org/abs/2404.03650v1","updated":"2024-04-04T17:59:08Z","published":"2024-04-04T17:59:08Z","title":"OpenNeRF: Open Set 3D Neural Scene Segmentation with Pixel-Wise Features\n and Rendered Novel Views","summary":" Large visual-language models (VLMs), like CLIP, enable open-set image\nsegmentation to segment arbitrary concepts from an image in a zero-shot manner.\nThis goes beyond the traditional closed-set assumption, i.e., where models can\nonly segment classes from a pre-defined training set. More recently, first\nworks on open-set segmentation in 3D scenes have appeared in the literature.\nThese methods are heavily influenced by closed-set 3D convolutional approaches\nthat process point clouds or polygon meshes. However, these 3D scene\nrepresentations do not align well with the image-based nature of the\nvisual-language models. Indeed, point cloud and 3D meshes typically have a\nlower resolution than images and the reconstructed 3D scene geometry might not\nproject well to the underlying 2D image sequences used to compute pixel-aligned\nCLIP features. To address these challenges, we propose OpenNeRF which naturally\noperates on posed images and directly encodes the VLM features within the NeRF.\nThis is similar in spirit to LERF, however our work shows that using pixel-wise\nVLM features (instead of global CLIP features) results in an overall less\ncomplex architecture without the need for additional DINO regularization. Our\nOpenNeRF further leverages NeRF's ability to render novel views and extract\nopen-set VLM features from areas that are not well observed in the initial\nposed images. For 3D point cloud segmentation on the Replica dataset, OpenNeRF\noutperforms recent open-vocabulary methods such as LERF and OpenScene by at\nleast +4.9 mIoU.\n","authors":["Francis Engelmann","Fabian Manhardt","Michael Niemeyer","Keisuke Tateno","Marc Pollefeys","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2404.03650v1.pdf","comment":"ICLR 2024, Project page: https://opennerf.github.io"},{"id":"http://arxiv.org/abs/2404.03645v1","updated":"2024-04-04T17:58:21Z","published":"2024-04-04T17:58:21Z","title":"Decoupling Static and Hierarchical Motion Perception for Referring Video\n Segmentation","summary":" Referring video segmentation relies on natural language expressions to\nidentify and segment objects, often emphasizing motion clues. Previous works\ntreat a sentence as a whole and directly perform identification at the\nvideo-level, mixing up static image-level cues with temporal motion cues.\nHowever, image-level features cannot well comprehend motion cues in sentences,\nand static cues are not crucial for temporal perception. In fact, static cues\ncan sometimes interfere with temporal perception by overshadowing motion cues.\nIn this work, we propose to decouple video-level referring expression\nunderstanding into static and motion perception, with a specific emphasis on\nenhancing temporal comprehension. Firstly, we introduce an\nexpression-decoupling module to make static cues and motion cues perform their\ndistinct role, alleviating the issue of sentence embeddings overlooking motion\ncues. 
Secondly, we propose a hierarchical motion perception module to capture\ntemporal information effectively across varying timescales. Furthermore, we\nemploy contrastive learning to distinguish the motions of visually similar\nobjects. These contributions yield state-of-the-art performance across five\ndatasets, including a remarkable $\\textbf{9.2%}$ $\\mathcal{J\\&F}$ improvement\non the challenging $\\textbf{MeViS}$ dataset. Code is available at\nhttps://github.com/heshuting555/DsHmp.\n","authors":["Shuting He","Henghui Ding"],"pdf_url":"https://arxiv.org/pdf/2404.03645v1.pdf","comment":"CVPR 2024, code: https://github.com/heshuting555/DsHmp"},{"id":"http://arxiv.org/abs/2404.03642v1","updated":"2024-04-04T17:57:25Z","published":"2024-04-04T17:57:25Z","title":"DiffBody: Human Body Restoration by Imagining with Generative Diffusion\n Prior","summary":" Human body restoration plays a vital role in various applications related to\nthe human body. Despite recent advances in general image restoration using\ngenerative models, their performance in human body restoration remains\nmediocre, often resulting in foreground and background blending, over-smoothing\nsurface textures, missing accessories, and distorted limbs. Addressing these\nchallenges, we propose a novel approach by constructing a human body-aware\ndiffusion model that leverages domain-specific knowledge to enhance\nperformance. Specifically, we employ a pretrained body attention module to\nguide the diffusion model's focus on the foreground, addressing issues caused\nby blending between the subject and background. We also demonstrate the value\nof revisiting the language modality of the diffusion model in restoration tasks\nby seamlessly incorporating text prompt to improve the quality of surface\ntexture and additional clothing and accessories details. Additionally, we\nintroduce a diffusion sampler tailored for fine-grained human body parts,\nutilizing local semantic information to rectify limb distortions. Lastly, we\ncollect a comprehensive dataset for benchmarking and advancing the field of\nhuman body restoration. Extensive experimental validation showcases the\nsuperiority of our approach, both quantitatively and qualitatively, over\nexisting methods.\n","authors":["Yiming Zhang","Zhe Wang","Xinjie Li","Yunchen Yuan","Chengsong Zhang","Xiao Sun","Zhihang Zhong","Jian Wang"],"pdf_url":"https://arxiv.org/pdf/2404.03642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12790v3","updated":"2024-04-04T17:55:04Z","published":"2023-03-22T17:58:01Z","title":"$CrowdDiff$: Multi-hypothesis Crowd Density Estimation using Diffusion\n Models","summary":" Crowd counting is a fundamental problem in crowd analysis which is typically\naccomplished by estimating a crowd density map and summing over the density\nvalues. However, this approach suffers from background noise accumulation and\nloss of density due to the use of broad Gaussian kernels to create the ground\ntruth density maps. This issue can be overcome by narrowing the Gaussian\nkernel. However, existing approaches perform poorly when trained with ground\ntruth density maps with broad kernels. To deal with this limitation, we propose\nusing conditional diffusion models to predict density maps, as diffusion models\nshow high fidelity to training data during generation. With that, we present\n$CrowdDiff$ that generates the crowd density map as a reverse diffusion\nprocess. 
Furthermore, as the intermediate time steps of the diffusion process\nare noisy, we incorporate a regression branch for direct crowd estimation only\nduring training to improve the feature learning. In addition, owing to the\nstochastic nature of the diffusion model, we introduce producing multiple\ndensity maps to improve the counting performance contrary to the existing crowd\ncounting pipelines. We conduct extensive experiments on publicly available\ndatasets to validate the effectiveness of our method. $CrowdDiff$ outperforms\nexisting state-of-the-art crowd counting methods on several public crowd\nanalysis benchmarks with significant improvements.\n","authors":["Yasiru Ranasinghe","Nithin Gopalakrishnan Nair","Wele Gedara Chaminda Bandara","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2303.12790v3.pdf","comment":"Accepted at CVPR'24. The project is available at\n https://dylran.github.io/crowddiff.github.io"},{"id":"http://arxiv.org/abs/2404.03635v1","updated":"2024-04-04T17:54:33Z","published":"2024-04-04T17:54:33Z","title":"WorDepth: Variational Language Prior for Monocular Depth Estimation","summary":" Three-dimensional (3D) reconstruction from a single image is an ill-posed\nproblem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text\ndescription(s) is similarly ill-posed, i.e. spatial arrangements of objects\ndescribed. We investigate the question of whether two inherently ambiguous\nmodalities can be used in conjunction to produce metric-scaled reconstructions.\nTo test this, we focus on monocular depth estimation, the problem of predicting\na dense depth map from a single image, but with an additional text caption\ndescribing the scene. To this end, we begin by encoding the text caption as a\nmean and standard deviation; using a variational framework, we learn the\ndistribution of the plausible metric reconstructions of 3D scenes corresponding\nto the text captions as a prior. To \"select\" a specific reconstruction or depth\nmap, we encode the given image through a conditional sampler that samples from\nthe latent space of the variational text encoder, which is then decoded to the\noutput depth map. Our approach is trained alternatingly between the text and\nimage branches: in one optimization step, we predict the mean and standard\ndeviation from the text description and sample from a standard Gaussian, and in\nthe other, we sample using a (image) conditional sampler. Once trained, we\ndirectly predict depth from the encoded text using the conditional sampler. We\ndemonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where\nwe show that language can consistently improve performance in both.\n","authors":["Ziyao Zeng","Daniel Wang","Fengyu Yang","Hyoungseob Park","Yangchao Wu","Stefano Soatto","Byung-Woo Hong","Dong Lao","Alex Wong"],"pdf_url":"https://arxiv.org/pdf/2404.03635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03634v1","updated":"2024-04-04T17:54:12Z","published":"2024-04-04T17:54:12Z","title":"PreAfford: Universal Affordance-Based Pre-Grasping for Diverse Objects\n and Environments","summary":" Robotic manipulation of ungraspable objects with two-finger grippers presents\nsignificant challenges due to the paucity of graspable features, while\ntraditional pre-grasping techniques, which rely on repositioning objects and\nleveraging external aids like table edges, lack the adaptability across object\ncategories and scenes. 
Addressing this, we introduce PreAfford, a novel\npre-grasping planning framework that utilizes a point-level affordance\nrepresentation and a relay training approach to enhance adaptability across a\nbroad range of environments and object types, including those previously\nunseen. Demonstrated on the ShapeNet-v2 dataset, PreAfford significantly\nimproves grasping success rates by 69% and validates its practicality through\nreal-world experiments. This work offers a robust and adaptable solution for\nmanipulating ungraspable objects.\n","authors":["Kairui Ding","Boyuan Chen","Ruihai Wu","Yuyang Li","Zongzheng Zhang","Huan-ang Gao","Siqi Li","Yixin Zhu","Guyue Zhou","Hao Dong","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.03634v1.pdf","comment":"Project Page: https://air-discover.github.io/PreAfford/"},{"id":"http://arxiv.org/abs/2404.03632v1","updated":"2024-04-04T17:53:33Z","published":"2024-04-04T17:53:33Z","title":"Reference-Based 3D-Aware Image Editing with Triplane","summary":" Generative Adversarial Networks (GANs) have emerged as powerful tools not\nonly for high-quality image generation but also for real image editing through\nmanipulation of their interpretable latent spaces. Recent advancements in GANs\ninclude the development of 3D-aware models such as EG3D, characterized by\nefficient triplane-based architectures enabling the reconstruction of 3D\ngeometry from single images. However, scant attention has been devoted to\nproviding an integrated framework for high-quality reference-based 3D-aware\nimage editing within this domain. This study addresses this gap by exploring\nand demonstrating the effectiveness of EG3D's triplane space for achieving\nadvanced reference-based edits, presenting a unique perspective on 3D-aware\nimage editing through our novel pipeline. Our approach integrates the encoding\nof triplane features, spatial disentanglement and automatic localization of\nfeatures in the triplane domain, and fusion learning for desired image editing.\nMoreover, our framework demonstrates versatility across domains, extending its\neffectiveness to animal face edits and partial stylization of cartoon\nportraits. The method shows significant improvements over relevant 3D-aware\nlatent editing and 2D reference-based editing methods, both qualitatively and\nquantitatively. Project page: https://three-bee.github.io/triplane_edit\n","authors":["Bahri Batuhan Bilecen","Yigit Yalin","Ning Yu","Aysegul Dundar"],"pdf_url":"https://arxiv.org/pdf/2404.03632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03631v1","updated":"2024-04-04T17:52:13Z","published":"2024-04-04T17:52:13Z","title":"Robust Concept Erasure Using Task Vectors","summary":" With the rapid growth of text-to-image models, a variety of techniques have\nbeen suggested to prevent undesirable image generations. Yet, these methods\noften only protect against specific user prompts and have been shown to allow\nunsafe generations with other inputs. Here we focus on unconditionally erasing\na concept from a text-to-image model rather than conditioning the erasure on\nthe user's prompt. We first show that compared to input-dependent erasure\nmethods, concept erasure that uses Task Vectors (TV) is more robust to\nunexpected user inputs, not seen during training. However, TV-based erasure can\nalso affect the core performance of the edited model, particularly when the\nrequired edit strength is unknown. 
To this end, we propose a method called\nDiverse Inversion, which we use to estimate the required strength of the TV\nedit. Diverse Inversion finds within the model input space a large set of word\nembeddings, each of which induces the generation of the target concept. We find\nthat encouraging diversity in the set makes our estimation more robust to\nunexpected prompts. Finally, we show that Diverse Inversion enables us to apply\na TV edit only to a subset of the model weights, enhancing the erasure\ncapabilities while better maintaining the core functionality of the model.\n","authors":["Minh Pham","Kelly O. Marshall","Chinmay Hegde","Niv Cohen"],"pdf_url":"https://arxiv.org/pdf/2404.03631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03620v1","updated":"2024-04-04T17:43:06Z","published":"2024-04-04T17:43:06Z","title":"LCM-Lookahead for Encoder-based Text-to-Image Personalization","summary":" Recent advancements in diffusion models have introduced fast sampling methods\nthat can effectively produce high-quality images in just one or a few denoising\nsteps. Interestingly, when these are distilled from existing diffusion models,\nthey often maintain alignment with the original model, retaining similar\noutputs for similar prompts and seeds. These properties present opportunities\nto leverage fast sampling methods as a shortcut-mechanism, using them to create\na preview of denoised outputs through which we can backpropagate image-space\nlosses. In this work, we explore the potential of using such\nshortcut-mechanisms to guide the personalization of text-to-image models to\nspecific facial identities. We focus on encoder-based personalization\napproaches, and demonstrate that by tuning them with a lookahead identity loss,\nwe can achieve higher identity fidelity, without sacrificing layout diversity\nor prompt alignment. We further explore the use of attention sharing mechanisms\nand consistent data generation for the task of personalization, and find that\nencoder training can benefit from both.\n","authors":["Rinon Gal","Or Lichter","Elad Richardson","Or Patashnik","Amit H. Bermano","Gal Chechik","Daniel Cohen-Or"],"pdf_url":"https://arxiv.org/pdf/2404.03620v1.pdf","comment":"Project page at https://lcm-lookahead.github.io/"},{"id":"http://arxiv.org/abs/2404.03618v1","updated":"2024-04-04T17:40:06Z","published":"2024-04-04T17:40:06Z","title":"DeViDe: Faceted medical knowledge for improved medical vision-language\n pre-training","summary":" Vision-language pre-training for chest X-rays has made significant strides,\nprimarily by utilizing paired radiographs and radiology reports. However,\nexisting approaches often face challenges in encoding medical knowledge\neffectively. While radiology reports provide insights into the current disease\nmanifestation, medical definitions (as used by contemporary methods) tend to be\noverly abstract, creating a gap in knowledge. To address this, we propose\nDeViDe, a novel transformer-based method that leverages radiographic\ndescriptions from the open web. These descriptions outline general visual\ncharacteristics of diseases in radiographs, and when combined with abstract\ndefinitions and radiology reports, provide a holistic snapshot of knowledge.\nDeViDe incorporates three key features for knowledge-augmented vision language\nalignment: First, a large-language model-based augmentation is employed to\nhomogenise medical knowledge from diverse sources. Second, this knowledge is\naligned with image information at various levels of granularity. 
Third, a novel\nprojection layer is proposed to handle the complexity of aligning each image\nwith multiple descriptions arising in a multi-label setting. In zero-shot\nsettings, DeViDe performs comparably to fully supervised models on external\ndatasets and achieves state-of-the-art results on three large-scale datasets.\nAdditionally, fine-tuning DeViDe on four downstream tasks and six segmentation\ntasks showcases its superior performance across data from diverse\ndistributions.\n","authors":["Haozhe Luo","Ziyu Zhou","Corentin Royer","Anjany Sekuboyina","Bjoern Menze"],"pdf_url":"https://arxiv.org/pdf/2404.03618v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2208.04060 by other authors"},{"id":"http://arxiv.org/abs/2404.03617v1","updated":"2024-04-04T17:39:41Z","published":"2024-04-04T17:39:41Z","title":"On the Efficiency of Convolutional Neural Networks","summary":" Since the breakthrough performance of AlexNet in 2012, convolutional neural\nnetworks (convnets) have grown into extremely powerful vision models. Deep\nlearning researchers have used convnets to produce accurate results that were\nunachievable a decade ago. Yet computer scientists make computational\nefficiency their primary objective. Accuracy with exorbitant cost is not\nacceptable; an algorithm must also minimize its computational requirements.\nConfronted with the daunting computation that convnets use, deep learning\nresearchers also became interested in efficiency. Researchers applied\ntremendous effort to find the convnet architectures that have the greatest\nefficiency. However, skepticism grew among researchers and engineers alike\nabout the relevance of arithmetic complexity. Contrary to the prevailing view\nthat latency and arithmetic complexity are irreconcilable, a simple formula\nrelates both through computational efficiency. This insight enabled us to\nco-optimize the separate factors that determine latency. We observed that the\ndegenerate conv2d layers that produce the best accuracy-complexity trade-off\nalso have low operational intensity. Therefore, kernels that implement these\nlayers use significant memory resources. We solved this optimization problem\nwith block-fusion kernels that implement all layers of a residual block,\nthereby creating temporal locality, avoiding communication, and reducing\nworkspace size. Our ConvFirst model with block-fusion kernels ran approximately\nfour times as fast as the ConvNeXt baseline with PyTorch Inductor, at equal\naccuracy on the ImageNet-1K classification task. Our unified approach to\nconvnet efficiency envisions a new era of models and kernels that achieve\ngreater accuracy at lower cost.\n","authors":["Andrew Lavin"],"pdf_url":"https://arxiv.org/pdf/2404.03617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03613v1","updated":"2024-04-04T17:34:41Z","published":"2024-04-04T17:34:41Z","title":"Per-Gaussian Embedding-Based Deformation for Deformable 3D Gaussian\n Splatting","summary":" As 3D Gaussian Splatting (3DGS) provides fast and high-quality novel view\nsynthesis, it is a natural extension to deform a canonical 3DGS to multiple\nframes. However, previous works fail to accurately reconstruct dynamic scenes,\nespecially 1) static parts moving along nearby dynamic parts, and 2) some\ndynamic areas are blurry. We attribute the failure to the wrong design of the\ndeformation field, which is built as a coordinate-based function. 
This approach\nis problematic because 3DGS is a mixture of multiple fields centered at the\nGaussians, not just a single coordinate-based framework. To resolve this\nproblem, we define the deformation as a function of per-Gaussian embeddings and\ntemporal embeddings. Moreover, we decompose deformations as coarse and fine\ndeformations to model slow and fast movements, respectively. Also, we introduce\nan efficient training strategy for faster convergence and higher quality.\nProject page: https://jeongminb.github.io/e-d3dgs/\n","authors":["Jeongmin Bae","Seoha Kim","Youngsik Yun","Hahyun Lee","Gun Bang","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2404.03613v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2404.03611v1","updated":"2024-04-04T17:34:21Z","published":"2024-04-04T17:34:21Z","title":"InsectMamba: Insect Pest Classification with State Space Model","summary":" The classification of insect pests is a critical task in agricultural\ntechnology, vital for ensuring food security and environmental sustainability.\nHowever, the complexity of pest identification, due to factors like high\ncamouflage and species diversity, poses significant obstacles. Existing methods\nstruggle with the fine-grained feature extraction needed to distinguish between\nclosely related pest species. Although recent advancements have utilized\nmodified network structures and combined deep learning approaches to improve\naccuracy, challenges persist due to the similarity between pests and their\nsurroundings. To address this problem, we introduce InsectMamba, a novel\napproach that integrates State Space Models (SSMs), Convolutional Neural\nNetworks (CNNs), Multi-Head Self-Attention mechanism (MSA), and Multilayer\nPerceptrons (MLPs) within Mix-SSM blocks. This integration facilitates the\nextraction of comprehensive visual features by leveraging the strengths of each\nencoding strategy. A selective module is also proposed to adaptively aggregate\nthese features, enhancing the model's ability to discern pest characteristics.\nInsectMamba was evaluated against strong competitors across five insect pest\nclassification datasets. The results demonstrate its superior performance and\nverify the significance of each model component by an ablation study.\n","authors":["Qianning Wang","Chenglin Wang","Zhixin Lai","Yucheng Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.03611v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.03590v1","updated":"2024-04-04T16:58:26Z","published":"2024-04-04T16:58:26Z","title":"SemGrasp: Semantic Grasp Generation via Language Aligned Discretization","summary":" Generating natural human grasps necessitates consideration of not just object\ngeometry but also semantic information. Solely depending on object shape for\ngrasp generation confines the applications of prior methods in downstream\ntasks. This paper presents a novel semantic-based grasp generation method,\ntermed SemGrasp, which generates a static human grasp pose by incorporating\nsemantic information into the grasp representation. We introduce a discrete\nrepresentation that aligns the grasp space with semantic space, enabling the\ngeneration of grasp postures in accordance with language instructions. A\nMultimodal Large Language Model (MLLM) is subsequently fine-tuned, integrating\nobject, grasp, and language within a unified semantic space. 
To facilitate the\ntraining of SemGrasp, we have compiled a large-scale, grasp-text-aligned\ndataset named CapGrasp, featuring about 260k detailed captions and 50k diverse\ngrasps. Experimental findings demonstrate that SemGrasp efficiently generates\nnatural human grasps in alignment with linguistic intentions. Our code, models,\nand dataset are available publicly at: https://kailinli.github.io/SemGrasp.\n","authors":["Kailin Li","Jingbo Wang","Lixin Yang","Cewu Lu","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2404.03590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03584v1","updated":"2024-04-04T16:48:40Z","published":"2024-04-04T16:48:40Z","title":"Towards more realistic human motion prediction with attention to motion\n coordination","summary":" Joint relation modeling is a crucial component in human motion prediction.\nMost existing methods rely on skeletal-based graphs to build the joint\nrelations, where local interactive relations between joint pairs are well\nlearned. However, the motion coordination, a global joint relation reflecting\nthe simultaneous cooperation of all joints, is usually weakened because it is\nlearned from part to whole progressively and asynchronously. Thus, the final\npredicted motions usually appear unrealistic. To tackle this issue, we learn a\nmedium, called coordination attractor (CA), from the spatiotemporal features of\nmotion to characterize the global motion features, which is subsequently used\nto build new relative joint relations. Through the CA, all joints are related\nsimultaneously, and thus the motion coordination of all joints can be better\nlearned. Based on this, we further propose a novel joint relation modeling\nmodule, Comprehensive Joint Relation Extractor (CJRE), to combine this motion\ncoordination with the local interactions between joint pairs in a unified\nmanner. Additionally, we also present a Multi-timescale Dynamics Extractor\n(MTDE) to extract enriched dynamics from the raw position information for\neffective prediction. Extensive experiments show that the proposed framework\noutperforms state-of-the-art methods in both short- and long-term predictions\non H3.6M, CMU-Mocap, and 3DPW.\n","authors":["Pengxiang Ding","Jianqin Yin"],"pdf_url":"https://arxiv.org/pdf/2404.03584v1.pdf","comment":"Accepted by TCSVT"},{"id":"http://arxiv.org/abs/2312.11972v2","updated":"2024-04-04T16:41:22Z","published":"2023-12-19T09:09:46Z","title":"Expressive Forecasting of 3D Whole-body Human Motions","summary":" Human motion forecasting, with the goal of estimating future human behavior\nover a period of time, is a fundamental task in many real-world applications.\nHowever, existing works typically concentrate on predicting the major joints of\nthe human body without considering the delicate movements of the human hands.\nIn practical applications, hand gesture plays an important role in human\ncommunication with the real world, and expresses the primary intention of human\nbeings. In this work, we are the first to formulate a whole-body human pose\nforecasting task, which jointly predicts the future body and hand activities.\nCorrespondingly, we propose a novel Encoding-Alignment-Interaction (EAI)\nframework that aims to predict both coarse (body joints) and fine-grained\n(gestures) activities collaboratively, enabling expressive and\ncross-facilitated forecasting of 3D whole-body human motions. Specifically, our\nmodel involves two key constituents: cross-context alignment (XCA) and\ncross-context interaction (XCI). 
Considering the heterogeneous information\nwithin the whole-body, XCA aims to align the latent features of various human\ncomponents, while XCI focuses on effectively capturing the context interaction\namong the human components. We conduct extensive experiments on a\nnewly-introduced large-scale benchmark and achieve state-of-the-art\nperformance. The code is public for research purposes at\nhttps://github.com/Dingpx/EAI.\n","authors":["Pengxiang Ding","Qiongjie Cui","Min Zhang","Mengyuan Liu","Haofan Wang","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.11972v2.pdf","comment":"Accepted by AAAI24"},{"id":"http://arxiv.org/abs/2404.03575v1","updated":"2024-04-04T16:38:57Z","published":"2024-04-04T16:38:57Z","title":"DreamScene: 3D Gaussian-based Text-to-3D Scene Generation via Formation\n Pattern Sampling","summary":" Text-to-3D scene generation holds immense potential for the gaming, film, and\narchitecture sectors. Despite significant progress, existing methods struggle\nwith maintaining high quality, consistency, and editing flexibility. In this\npaper, we propose DreamScene, a 3D Gaussian-based novel text-to-3D scene\ngeneration framework, to tackle the aforementioned three challenges mainly via\ntwo strategies. First, DreamScene employs Formation Pattern Sampling (FPS), a\nmulti-timestep sampling strategy guided by the formation patterns of 3D\nobjects, to form fast, semantically rich, and high-quality representations. FPS\nuses 3D Gaussian filtering for optimization stability, and leverages\nreconstruction techniques to generate plausible textures. Second, DreamScene\nemploys a progressive three-stage camera sampling strategy, specifically\ndesigned for both indoor and outdoor settings, to effectively ensure\nobject-environment integration and scene-wide 3D consistency. Last, DreamScene\nenhances scene editing flexibility by integrating objects and environments,\nenabling targeted adjustments. Extensive experiments validate DreamScene's\nsuperiority over current state-of-the-art techniques, heralding its\nwide-ranging potential for diverse applications. Code and demos will be\nreleased at https://dreamscene-project.github.io .\n","authors":["Haoran Li","Haolin Shi","Wenli Zhang","Wenjun Wu","Yong Liao","Lin Wang","Lik-hang Lee","Pengyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.03575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03574v1","updated":"2024-04-04T16:38:49Z","published":"2024-04-04T16:38:49Z","title":"TinyVQA: Compact Multimodal Deep Neural Network for Visual Question\n Answering on Resource-Constrained Devices","summary":" Traditional machine learning models often require powerful hardware, making\nthem unsuitable for deployment on resource-limited devices. Tiny Machine\nLearning (tinyML) has emerged as a promising approach for running machine\nlearning models on these devices, but integrating multiple data modalities into\ntinyML models still remains a challenge due to increased complexity, latency,\nand power consumption. This paper proposes TinyVQA, a novel multimodal deep\nneural network for visual question answering tasks that can be deployed on\nresource-constrained tinyML hardware. TinyVQA leverages a supervised\nattention-based model to learn how to answer questions about images using both\nvision and language modalities. 
Distilled knowledge from the supervised\nattention-based VQA model trains the memory aware compact TinyVQA model and low\nbit-width quantization technique is employed to further compress the model for\ndeployment on tinyML devices. The TinyVQA model was evaluated on the FloodNet\ndataset, which is used for post-disaster damage assessment. The compact model\nachieved an accuracy of 79.5%, demonstrating the effectiveness of TinyVQA for\nreal-world applications. Additionally, the model was deployed on a Crazyflie\n2.0 drone, equipped with an AI deck and GAP8 microprocessor. The TinyVQA model\nachieved low latencies of 56 ms and consumes 693 mW power while deployed on the\ntiny drone, showcasing its suitability for resource-constrained embedded\nsystems.\n","authors":["Hasib-Al Rashid","Argho Sarkar","Aryya Gangopadhyay","Maryam Rahnemoonfar","Tinoosh Mohsenin"],"pdf_url":"https://arxiv.org/pdf/2404.03574v1.pdf","comment":"Accepted as a full paper by the tinyML Research Symposium 2024"},{"id":"http://arxiv.org/abs/2404.03572v1","updated":"2024-04-04T16:37:42Z","published":"2024-04-04T16:37:42Z","title":"Terrain Point Cloud Inpainting via Signal Decomposition","summary":" The rapid development of 3D acquisition technology has made it possible to\nobtain point clouds of real-world terrains. However, due to limitations in\nsensor acquisition technology or specific requirements, point clouds often\ncontain defects such as holes with missing data. Inpainting algorithms are\nwidely used to patch these holes. However, existing traditional inpainting\nalgorithms rely on precise hole boundaries, which limits their ability to\nhandle cases where the boundaries are not well-defined. On the other hand,\nlearning-based completion methods often prioritize reconstructing the entire\npoint cloud instead of solely focusing on hole filling. Based on the fact that\nreal-world terrain exhibits both global smoothness and rich local detail, we\npropose a novel representation for terrain point clouds. This representation\ncan help to repair the holes without clear boundaries. Specifically, it\ndecomposes terrains into low-frequency and high-frequency components, which are\nrepresented by B-spline surfaces and relative height maps respectively. In this\nway, the terrain point cloud inpainting problem is transformed into a B-spline\nsurface fitting and 2D image inpainting problem. By solving the two problems,\nthe highly complex and irregular holes on the terrain point clouds can be\nwell-filled, which not only satisfies the global terrain undulation but also\nexhibits rich geometric details. The experimental results also demonstrate the\neffectiveness of our method.\n","authors":["Yizhou Xie","Xiangning Xie","Yuran Wang","Yanci Zhang","Zejun Lv"],"pdf_url":"https://arxiv.org/pdf/2404.03572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14817v3","updated":"2024-04-04T16:27:06Z","published":"2024-02-22T18:59:56Z","title":"Cameras as Rays: Pose Estimation via Ray Diffusion","summary":" Estimating camera poses is a fundamental task for 3D reconstruction and\nremains challenging given sparsely sampled views (<10). In contrast to existing\napproaches that pursue top-down prediction of global parametrizations of camera\nextrinsics, we propose a distributed representation of camera pose that treats\na camera as a bundle of rays. This representation allows for a tight coupling\nwith spatial image features improving pose precision. 
We observe that this\nrepresentation is naturally suited for set-level transformers and develop a\nregression-based approach that maps image patches to corresponding rays. To\ncapture the inherent uncertainties in sparse-view pose inference, we adapt this\napproach to learn a denoising diffusion model which allows us to sample\nplausible modes while improving performance. Our proposed methods, both\nregression- and diffusion-based, demonstrate state-of-the-art performance on\ncamera pose estimation on CO3D while generalizing to unseen object categories\nand in-the-wild captures.\n","authors":["Jason Y. Zhang","Amy Lin","Moneish Kumar","Tzu-Hsuan Yang","Deva Ramanan","Shubham Tulsiani"],"pdf_url":"https://arxiv.org/pdf/2402.14817v3.pdf","comment":"In ICLR 2024 (oral). v2-3: updated references. Project webpage:\n https://jasonyzhang.com/RayDiffusion"},{"id":"http://arxiv.org/abs/2404.03566v1","updated":"2024-04-04T16:24:32Z","published":"2024-04-04T16:24:32Z","title":"PointInfinity: Resolution-Invariant Point Diffusion Models","summary":" We present PointInfinity, an efficient family of point cloud diffusion\nmodels. Our core idea is to use a transformer-based architecture with a\nfixed-size, resolution-invariant latent representation. This enables efficient\ntraining with low-resolution point clouds, while allowing high-resolution point\nclouds to be generated during inference. More importantly, we show that scaling\nthe test-time resolution beyond the training resolution improves the fidelity\nof generated point clouds and surfaces. We analyze this phenomenon and draw a\nlink to classifier-free guidance commonly used in diffusion models,\ndemonstrating that both allow trading off fidelity and variability during\ninference. Experiments on CO3D show that PointInfinity can efficiently generate\nhigh-resolution point clouds (up to 131k points, 31 times more than Point-E)\nwith state-of-the-art quality.\n","authors":["Zixuan Huang","Justin Johnson","Shoubhik Debnath","James M. Rehg","Chao-Yuan Wu"],"pdf_url":"https://arxiv.org/pdf/2404.03566v1.pdf","comment":"Accepted to CVPR 2024, project website at\n https://zixuanh.com/projects/pointinfinity"},{"id":"http://arxiv.org/abs/2403.01598v2","updated":"2024-04-04T16:12:51Z","published":"2024-03-03T19:52:43Z","title":"APISR: Anime Production Inspired Real-World Anime Super-Resolution","summary":" While real-world anime super-resolution (SR) has gained increasing attention\nin the SR community, existing methods still adopt techniques from the\nphotorealistic domain. In this paper, we analyze the anime production workflow\nand rethink how to use its characteristics for real-world\nanime SR. First, we argue that video networks and datasets are not necessary\nfor anime SR due to the repeated use of hand-drawn frames. Instead, we\npropose an anime image collection pipeline by choosing the least compressed and\nthe most informative frames from the video sources. Based on this pipeline, we\nintroduce the Anime Production-oriented Image (API) dataset. In addition, we\nidentify two anime-specific challenges of distorted and faint hand-drawn lines\nand unwanted color artifacts. We address the first issue by introducing a\nprediction-oriented compression module in the image degradation model and a\npseudo-ground truth preparation with enhanced hand-drawn lines. 
In addition, we\nintroduce the balanced twin perceptual loss combining both anime and\nphotorealistic high-level features to mitigate unwanted color artifacts and\nincrease visual clarity. We evaluate our method through extensive experiments\non the public benchmark, showing our method outperforms state-of-the-art anime\ndataset-trained approaches.\n","authors":["Boyang Wang","Fengyu Yang","Xihang Yu","Chao Zhang","Hanbin Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.01598v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03541v1","updated":"2024-04-04T15:49:01Z","published":"2024-04-04T15:49:01Z","title":"Segmentation-Guided Knee Radiograph Generation using Conditional\n Diffusion Models","summary":" Deep learning-based medical image processing algorithms require\nrepresentative data during development. In particular, surgical data might be\ndifficult to obtain, and high-quality public datasets are limited. To overcome\nthis limitation and augment datasets, a widely adopted solution is the\ngeneration of synthetic images. In this work, we employ conditional diffusion\nmodels to generate knee radiographs from contour and bone segmentations.\nRemarkably, two distinct strategies are presented by incorporating the\nsegmentation as a condition into the sampling and training process, namely,\nconditional sampling and conditional training. The results demonstrate that\nboth methods can generate realistic images while adhering to the conditioning\nsegmentation. The conditional training method outperforms the conditional\nsampling method and the conventional U-Net.\n","authors":["Siyuan Mei","Fuxin Fan","Fabian Wagner","Mareike Thies","Mingxuan Gu","Yipeng Sun","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2404.03541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03539v1","updated":"2024-04-04T15:47:30Z","published":"2024-04-04T15:47:30Z","title":"Is CLIP the main roadblock for fine-grained open-world perception?","summary":" Modern applications increasingly demand flexible computer vision models that\nadapt to novel concepts not encountered during training. This necessity is\npivotal in emerging domains like extended reality, robotics, and autonomous\ndriving, which require the ability to respond to open-world stimuli. A key\ningredient is the ability to identify objects based on free-form textual\nqueries defined at inference time - a task known as open-vocabulary object\ndetection. Multimodal backbones like CLIP are the main enabling technology for\ncurrent open-world perception solutions. Despite performing well on generic\nqueries, recent studies highlighted limitations on the fine-grained recognition\ncapabilities in open-vocabulary settings - i.e., for distinguishing subtle\nobject features like color, shape, and material. In this paper, we perform a\ndetailed examination of these open-vocabulary object recognition limitations to\nfind the root cause. We evaluate the performance of CLIP, the most commonly\nused vision-language backbone, against a fine-grained object-matching\nbenchmark, revealing interesting analogies between the limitations of\nopen-vocabulary object detectors and their backbones. Experiments suggest that\nthe lack of fine-grained understanding is caused by the poor separability of\nobject characteristics in the CLIP latent space. 
Therefore, we try to\nunderstand whether fine-grained knowledge is present in CLIP embeddings but not\nexploited at inference time due, for example, to the unsuitability of the\ncosine similarity matching function, which may discard important object\ncharacteristics. Our preliminary experiments show that simple CLIP latent-space\nre-projections help separate fine-grained concepts, paving the way towards the\ndevelopment of backbones inherently able to process fine-grained details. The\ncode for reproducing these experiments is available at\nhttps://github.com/lorebianchi98/FG-CLIP.\n","authors":["Lorenzo Bianchi","Fabio Carrara","Nicola Messina","Fabrizio Falchi"],"pdf_url":"https://arxiv.org/pdf/2404.03539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03537v1","updated":"2024-04-04T15:45:25Z","published":"2024-04-04T15:45:25Z","title":"If It's Not Enough, Make It So: Reducing Authentic Data Demand in Face\n Recognition through Synthetic Faces","summary":" Recent advances in deep face recognition have spurred a growing demand for\nlarge, diverse, and manually annotated face datasets. Acquiring authentic,\nhigh-quality data for face recognition has proven to be a challenge, primarily\ndue to privacy concerns. Large face datasets are primarily sourced from\nweb-based images, lacking explicit user consent. In this paper, we examine\nwhether and how synthetic face data can be used to train effective face\nrecognition models with reduced reliance on authentic images, thereby\nmitigating data collection concerns. First, we explored the performance gap\namong recent state-of-the-art face recognition models, trained with synthetic\ndata only and authentic (scarce) data only. Then, we deepened our analysis by\ntraining a state-of-the-art backbone with various combinations of synthetic and\nauthentic data, gaining insights into optimizing the limited use of the latter\nfor verification accuracy. Finally, we assessed the effectiveness of data\naugmentation approaches on synthetic and authentic data, with the same goal in\nmind. Our results highlighted the effectiveness of FR trained on combined\ndatasets, particularly when combined with appropriate augmentation techniques.\n","authors":["Andrea Atzori","Fadi Boutros","Naser Damer","Gianni Fenu","Mirko Marras"],"pdf_url":"https://arxiv.org/pdf/2404.03537v1.pdf","comment":"Accepted as a full paper at FG 2024 main track"},{"id":"http://arxiv.org/abs/2404.03531v1","updated":"2024-04-04T15:35:43Z","published":"2024-04-04T15:35:43Z","title":"COMO: Compact Mapping and Odometry","summary":" We present COMO, a real-time monocular mapping and odometry system that\nencodes dense geometry via a compact set of 3D anchor points. Decoding anchor\npoint projections into dense geometry via per-keyframe depth covariance\nfunctions guarantees that depth maps are joined together at visible anchor\npoints. The representation enables joint optimization of camera poses and dense\ngeometry, intrinsic 3D consistency, and efficient second-order inference. To\nmaintain a compact yet expressive map, we introduce a frontend that leverages\nthe covariance function for tracking and initializing potentially visually\nindistinct 3D points across frames. Altogether, we introduce a real-time system\ncapable of estimating accurate poses and consistent geometry.\n","authors":["Eric Dexheimer","Andrew J. 
Davison"],"pdf_url":"https://arxiv.org/pdf/2404.03531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03527v1","updated":"2024-04-04T15:31:11Z","published":"2024-04-04T15:31:11Z","title":"HAPNet: Toward Superior RGB-Thermal Scene Parsing via Hybrid,\n Asymmetric, and Progressive Heterogeneous Feature Fusion","summary":" Data-fusion networks have shown significant promise for RGB-thermal scene\nparsing. However, the majority of existing studies have relied on symmetric\nduplex encoders for heterogeneous feature extraction and fusion, paying\ninadequate attention to the inherent differences between RGB and thermal\nmodalities. Recent progress in vision foundation models (VFMs) trained through\nself-supervision on vast amounts of unlabeled data has proven their ability to\nextract informative, general-purpose features. However, this potential has yet\nto be fully leveraged in the domain. In this study, we take one step toward\nthis new research area by exploring a feasible strategy to fully exploit VFM\nfeatures for RGB-thermal scene parsing. Specifically, we delve deeper into the\nunique characteristics of RGB and thermal modalities, thereby designing a\nhybrid, asymmetric encoder that incorporates both a VFM and a convolutional\nneural network. This design allows for more effective extraction of\ncomplementary heterogeneous features, which are subsequently fused in a\ndual-path, progressive manner. Moreover, we introduce an auxiliary task to\nfurther enrich the local semantics of the fused features, thereby improving the\noverall performance of RGB-thermal scene parsing. Our proposed HAPNet, equipped\nwith all these components, demonstrates superior performance compared to all\nother state-of-the-art RGB-thermal scene parsing networks, achieving top ranks\nacross three widely used public RGB-thermal scene parsing datasets. We believe\nthis new paradigm has opened up new opportunities for future developments in\ndata-fusion scene parsing approaches.\n","authors":["Jiahang Li","Peng Yun","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2404.03527v1.pdf","comment":"12 pages, 4figures"},{"id":"http://arxiv.org/abs/2404.03518v1","updated":"2024-04-04T15:23:14Z","published":"2024-04-04T15:23:14Z","title":"SDPose: Tokenized Pose Estimation via Circulation-Guide\n Self-Distillation","summary":" Recently, transformer-based methods have achieved state-of-the-art prediction\nquality on human pose estimation(HPE). Nonetheless, most of these\ntop-performing transformer-based models are too computation-consuming and\nstorage-demanding to deploy on edge computing platforms. Those\ntransformer-based models that require fewer resources are prone to\nunder-fitting due to their smaller scale and thus perform notably worse than\ntheir larger counterparts. Given this conundrum, we introduce SDPose, a new\nself-distillation method for improving the performance of small\ntransformer-based models. To mitigate the problem of under-fitting, we design a\ntransformer module named Multi-Cycled Transformer(MCT) based on multiple-cycled\nforwards to more fully exploit the potential of small model parameters.\nFurther, in order to prevent the additional inference compute-consuming brought\nby MCT, we introduce a self-distillation scheme, extracting the knowledge from\nthe MCT module to a naive forward model. 
Specifically, on the MSCOCO validation\ndataset, SDPose-T obtains 69.7% mAP with 4.4M parameters and 1.8 GFLOPs.\nFurthermore, SDPose-S-V2 obtains 73.5% mAP on the MSCOCO validation dataset\nwith 6.2M parameters and 4.7 GFLOPs, achieving a new state-of-the-art among\npredominant tiny neural network methods. Our code is available at\nhttps://github.com/MartyrPenink/SDPose.\n","authors":["Sichen Chen","Yingyi Zhang","Siming Huang","Ran Yi","Ke Fan","Ruixin Zhang","Peixian Chen","Jun Wang","Shouhong Ding","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2404.03518v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03507v1","updated":"2024-04-04T15:10:24Z","published":"2024-04-04T15:10:24Z","title":"DQ-DETR: DETR with Dynamic Query for Tiny Object Detection","summary":" Despite previous DETR-like methods having performed successfully in generic\nobject detection, tiny object detection is still a challenging task for them\nsince the positional information of object queries is not customized for\ndetecting tiny objects, whose scale is extraordinarily smaller than general\nobjects. Also, DETR-like methods using a fixed number of queries make them\nunsuitable for aerial datasets, which only contain tiny objects, and the\nnumbers of instances are imbalanced between different images. Thus, we present\na simple yet effective model, named DQ-DETR, which consists of three different\ncomponents: categorical counting module, counting-guided feature enhancement,\nand dynamic query selection to solve the above-mentioned problems. DQ-DETR uses\nthe prediction and density maps from the categorical counting module to\ndynamically adjust the number of object queries and improve the positional\ninformation of queries. Our model DQ-DETR outperforms previous CNN-based and\nDETR-like methods, achieving state-of-the-art mAP 30.2% on the AI-TOD-V2\ndataset, which mostly consists of tiny objects.\n","authors":["Yi-Xin Huang","Hou-I Liu","Hong-Han Shuai","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.03507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11963v2","updated":"2024-04-04T15:10:23Z","published":"2023-03-21T15:50:08Z","title":"NEMTO: Neural Environment Matting for Novel View and Relighting\n Synthesis of Transparent Objects","summary":" We propose NEMTO, the first end-to-end neural rendering pipeline to model 3D\ntransparent objects with complex geometry and unknown indices of refraction.\nCommonly used appearance modeling such as the Disney BSDF model cannot\naccurately address this challenging problem due to the complex light paths\nbending through refractions and the strong dependency of surface appearance on\nillumination. With 2D images of the transparent object as input, our method is\ncapable of high-quality novel view and relighting synthesis. We leverage\nimplicit Signed Distance Functions (SDF) to model the object geometry and\npropose a refraction-aware ray bending network to model the effects of light\nrefraction within the object. Our ray bending network is more tolerant to\ngeometric inaccuracies than traditional physically-based methods for rendering\ntransparent objects. 
We provide extensive evaluations on both synthetic and\nreal-world datasets to demonstrate our high-quality synthesis and the\napplicability of our method.\n","authors":["Dongqing Wang","Tong Zhang","Sabine Süsstrunk"],"pdf_url":"https://arxiv.org/pdf/2303.11963v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2312.09228v3","updated":"2024-04-04T15:06:02Z","published":"2023-12-14T18:54:32Z","title":"3DGS-Avatar: Animatable Avatars via Deformable 3D Gaussian Splatting","summary":" We introduce an approach that creates animatable human avatars from monocular\nvideos using 3D Gaussian Splatting (3DGS). Existing methods based on neural\nradiance fields (NeRFs) achieve high-quality novel-view/novel-pose image\nsynthesis but often require days of training, and are extremely slow at\ninference time. Recently, the community has explored fast grid structures for\nefficient training of clothed avatars. Albeit being extremely fast at training,\nthese methods can barely achieve an interactive rendering frame rate with\naround 15 FPS. In this paper, we use 3D Gaussian Splatting and learn a\nnon-rigid deformation network to reconstruct animatable clothed human avatars\nthat can be trained within 30 minutes and rendered at real-time frame rates\n(50+ FPS). Given the explicit nature of our representation, we further\nintroduce as-isometric-as-possible regularizations on both the Gaussian mean\nvectors and the covariance matrices, enhancing the generalization of our model\non highly articulated unseen poses. Experimental results show that our method\nachieves comparable and even better performance compared to state-of-the-art\napproaches on animatable avatar creation from a monocular input, while being\n400x and 250x faster in training and inference, respectively.\n","authors":["Zhiyin Qian","Shaofei Wang","Marko Mihajlovic","Andreas Geiger","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2312.09228v3.pdf","comment":"Project page: https://neuralbodies.github.io/3DGS-Avatar"},{"id":"http://arxiv.org/abs/2403.19612v2","updated":"2024-04-04T14:44:23Z","published":"2024-03-28T17:32:01Z","title":"ILPO-NET: Network for the invariant recognition of arbitrary volumetric\n patterns in 3D","summary":" Effective recognition of spatial patterns and learning their hierarchy is\ncrucial in modern spatial data analysis. Volumetric data applications seek\ntechniques ensuring invariance not only to shifts but also to pattern\nrotations. While traditional methods can readily achieve translational\ninvariance, rotational invariance possesses multiple challenges and remains an\nactive area of research. Here, we present ILPO-Net (Invariant to Local Patterns\nOrientation Network), a novel approach that handles arbitrarily shaped patterns\nwith the convolutional operation inherently invariant to local spatial pattern\norientations using the Wigner matrix expansions. Our architecture seamlessly\nintegrates the new convolution operator and, when benchmarked on diverse\nvolumetric datasets such as MedMNIST and CATH, demonstrates superior\nperformance over the baselines with significantly reduced parameter counts - up\nto 1000 times fewer in the case of MedMNIST. Beyond these demonstrations,\nILPO-Net's rotational invariance paves the way for other applications across\nmultiple disciplines. 
Our code is publicly available at\nhttps://gricad-gitlab.univ-grenoble-alpes.fr/GruLab/ILPONet.\n","authors":["Dmitrii Zhemchuzhnikov","Sergei Grudinin"],"pdf_url":"https://arxiv.org/pdf/2403.19612v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01987v2","updated":"2024-04-04T14:40:21Z","published":"2023-12-04T16:04:41Z","title":"Bootstrapping SparseFormers from Vision Foundation Models","summary":" The recently proposed SparseFormer architecture provides an alternative\napproach to visual understanding by utilizing a significantly lower number of\nvisual tokens via adjusting RoIs, greatly reducing computational costs while\nstill achieving promising performance. However, training SparseFormers from\nscratch is still expensive, and scaling up the number of parameters can be\nchallenging. In this paper, we propose to bootstrap SparseFormers from\nViT-based vision foundation models in a simple and efficient way. Since the\nmajority of SparseFormer blocks are the standard transformer ones, we can\ninherit weights from large-scale pre-trained vision transformers and freeze\nthem as much as possible. Therefore, we only need to train the\nSparseFormer-specific lightweight focusing transformer to adjust token RoIs and\nfine-tune a few early pre-trained blocks to align the final token\nrepresentation. In such a way, we can bootstrap SparseFormer architectures from\nvarious large-scale pre-trained models (e.g., IN-21K pre-trained AugRegs or\nCLIPs) using a rather smaller amount of training samples (e.g., IN-1K) and\nwithout labels or captions within just a few hours. As a result, the\nbootstrapped unimodal SparseFormer (from AugReg-ViT-L/16-384) can reach 84.9%\naccuracy on IN-1K with only 49 tokens, and the multimodal SparseFormer from\nCLIPs also demonstrates notable zero-shot performance with highly reduced\ncomputational cost without seeing any caption during the bootstrapping\nprocedure. In addition, CLIP-bootstrapped SparseFormers, which align the output\nspace with language without seeing a word, can serve as efficient vision\nencoders in multimodal large language models. Code and models are available at\nhttps://github.com/showlab/sparseformer\n","authors":["Ziteng Gao","Zhan Tong","Kevin Qinghong Lin","Joya Chen","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.01987v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03482v1","updated":"2024-04-04T14:35:49Z","published":"2024-04-04T14:35:49Z","title":"AdaGlimpse: Active Visual Exploration with Arbitrary Glimpse Position\n and Scale","summary":" Active Visual Exploration (AVE) is a task that involves dynamically selecting\nobservations (glimpses), which is critical to facilitate comprehension and\nnavigation within an environment. While modern AVE methods have demonstrated\nimpressive performance, they are constrained to fixed-scale glimpses from rigid\ngrids. In contrast, existing mobile platforms equipped with optical zoom\ncapabilities can capture glimpses of arbitrary positions and scales. To address\nthis gap between software and hardware capabilities, we introduce AdaGlimpse.\nIt uses Soft Actor-Critic, a reinforcement learning algorithm tailored for\nexploration tasks, to select glimpses of arbitrary position and scale. This\napproach enables our model to rapidly establish a general awareness of the\nenvironment before zooming in for detailed analysis. 
Experimental results\ndemonstrate that AdaGlimpse surpasses previous methods across various visual\ntasks while maintaining greater applicability in realistic AVE scenarios.\n","authors":["Adam Pardyl","Michał Wronka","Maciej Wołczyk","Kamil Adamczewski","Tomasz Trzciński","Bartosz Zieliński"],"pdf_url":"https://arxiv.org/pdf/2404.03482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03477v1","updated":"2024-04-04T14:28:34Z","published":"2024-04-04T14:28:34Z","title":"Towards Automated Movie Trailer Generation","summary":" Movie trailers are an essential tool for promoting films and attracting\naudiences. However, the process of creating trailers can be time-consuming and\nexpensive. To streamline this process, we propose an automatic trailer\ngeneration framework that generates plausible trailers from a full movie by\nautomating shot selection and composition. Our approach draws inspiration from\nmachine translation techniques and models the movies and trailers as sequences\nof shots, thus formulating the trailer generation problem as a\nsequence-to-sequence task. We introduce Trailer Generation Transformer (TGT), a\ndeep-learning framework utilizing an encoder-decoder architecture. TGT movie\nencoder is tasked with contextualizing each movie shot representation via\nself-attention, while the autoregressive trailer decoder predicts the feature\nrepresentation of the next trailer shot, accounting for the relevance of shots'\ntemporal order in trailers. Our TGT significantly outperforms previous methods\non a comprehensive suite of metrics.\n","authors":["Dawit Mureja Argaw","Mattia Soldan","Alejandro Pardo","Chen Zhao","Fabian Caba Heilbron","Joon Son Chung","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2404.03477v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03474v1","updated":"2024-04-04T14:26:58Z","published":"2024-04-04T14:26:58Z","title":"Performance of computer vision algorithms for fine-grained\n classification using crowdsourced insect images","summary":" With fine-grained classification, we identify unique characteristics to\ndistinguish among classes of the same super-class. We are focusing on species\nrecognition in Insecta, as they are critical for biodiversity monitoring and at\nthe base of many ecosystems. With citizen science campaigns, billions of images\nare collected in the wild. Once these are labelled, experts can use them to\ncreate distribution maps. However, the labelling process is time-consuming,\nwhich is where computer vision comes in. The field of computer vision offers a\nwide range of algorithms, each with its strengths and weaknesses; how do we\nidentify the algorithm that is in line with our application? To answer this\nquestion, we provide a full and detailed evaluation of nine algorithms among\ndeep convolutional networks (CNN), vision transformers (ViT), and\nlocality-based vision transformers (LBVT) on 4 different aspects:\nclassification performance, embedding quality, computational cost, and gradient\nactivity. We offer insights that we haven't yet had in this domain proving to\nwhich extent these algorithms solve the fine-grained tasks in Insecta. We found\nthat the ViT performs the best on inference speed and computational cost while\nthe LBVT outperforms the others on performance and embedding quality; the CNN\nprovide a trade-off among the metrics.\n","authors":["Rita Pucci","Vincent J. 
Kalkman","Dan Stowell"],"pdf_url":"https://arxiv.org/pdf/2404.03474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03462v1","updated":"2024-04-04T14:13:56Z","published":"2024-04-04T14:13:56Z","title":"You Only Scan Once: A Dynamic Scene Reconstruction Pipeline for 6-DoF\n Robotic Grasping of Novel Objects","summary":" In the realm of robotic grasping, achieving accurate and reliable\ninteractions with the environment is a pivotal challenge. Traditional methods\nof grasp planning methods utilizing partial point clouds derived from depth\nimage often suffer from reduced scene understanding due to occlusion,\nultimately impeding their grasping accuracy. Furthermore, scene reconstruction\nmethods have primarily relied upon static techniques, which are susceptible to\nenvironment change during manipulation process limits their efficacy in\nreal-time grasping tasks. To address these limitations, this paper introduces a\nnovel two-stage pipeline for dynamic scene reconstruction. In the first stage,\nour approach takes scene scanning as input to register each target object with\nmesh reconstruction and novel object pose tracking. In the second stage, pose\ntracking is still performed to provide object poses in real-time, enabling our\napproach to transform the reconstructed object point clouds back into the\nscene. Unlike conventional methodologies, which rely on static scene snapshots,\nour method continuously captures the evolving scene geometry, resulting in a\ncomprehensive and up-to-date point cloud representation. By circumventing the\nconstraints posed by occlusion, our method enhances the overall grasp planning\nprocess and empowers state-of-the-art 6-DoF robotic grasping algorithms to\nexhibit markedly improved accuracy.\n","authors":["Lei Zhou","Haozhe Wang","Zhengshen Zhang","Zhiyang Liu","Francis EH Tay","adn Marcelo H. Ang. Jr"],"pdf_url":"https://arxiv.org/pdf/2404.03462v1.pdf","comment":"ICRA 2024"},{"id":"http://arxiv.org/abs/2404.03451v1","updated":"2024-04-04T13:55:06Z","published":"2024-04-04T13:55:06Z","title":"How Much Data are Enough? Investigating Dataset Requirements for\n Patch-Based Brain MRI Segmentation Tasks","summary":" Training deep neural networks reliably requires access to large-scale\ndatasets. However, obtaining such datasets can be challenging, especially in\nthe context of neuroimaging analysis tasks, where the cost associated with\nimage acquisition and annotation can be prohibitive. To mitigate both the time\nand financial costs associated with model development, a clear understanding of\nthe amount of data required to train a satisfactory model is crucial. This\npaper focuses on an early stage phase of deep learning research, prior to model\ndevelopment, and proposes a strategic framework for estimating the amount of\nannotated data required to train patch-based segmentation networks. This\nframework includes the establishment of performance expectations using a novel\nMinor Boundary Adjustment for Threshold (MinBAT) method, and standardizing\npatch selection through the ROI-based Expanded Patch Selection (REPS) method.\nOur experiments demonstrate that tasks involving regions of interest (ROIs)\nwith different sizes or shapes may yield variably acceptable Dice Similarity\nCoefficient (DSC) scores. By setting an acceptable DSC as the target, the\nrequired amount of training data can be estimated and even predicted as data\naccumulates. 
This approach could assist researchers and engineers in estimating\nthe cost associated with data collection and annotation when defining a new\nsegmentation task based on deep neural networks, ultimately contributing to\ntheir efficient translation to real-world applications.\n","authors":["Dongang Wang","Peilin Liu","Hengrui Wang","Heidi Beadnall","Kain Kyle","Linda Ly","Mariano Cabezas","Geng Zhan","Ryan Sullivan","Weidong Cai","Wanli Ouyang","Fernando Calamante","Michael Barnett","Chenyu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.03451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13785v2","updated":"2024-04-04T13:52:17Z","published":"2024-01-24T20:06:59Z","title":"Unified Spatio-Temporal Tri-Perspective View Representation for 3D\n Semantic Occupancy Prediction","summary":" Holistic understanding and reasoning in 3D scenes play a vital role in the\nsuccess of autonomous driving systems. The evolution of 3D semantic occupancy\nprediction as a pretraining task for autonomous driving and robotic downstream\ntasks capture finer 3D details compared to methods like 3D detection. Existing\napproaches predominantly focus on spatial cues such as tri-perspective view\nembeddings (TPV), often overlooking temporal cues. This study introduces a\nspatiotemporal transformer architecture S2TPVFormer for temporally coherent 3D\nsemantic occupancy prediction. We enrich the prior process by including\ntemporal cues using a novel temporal cross-view hybrid attention mechanism\n(TCVHA) and generate spatiotemporal TPV embeddings (i.e. S2TPV embeddings).\nExperimental evaluations on the nuScenes dataset demonstrate a substantial 4.1%\nimprovement in mean Intersection over Union (mIoU) for 3D Semantic Occupancy\ncompared to TPVFormer, confirming the effectiveness of the proposed S2TPVFormer\nin enhancing 3D scene perception.\n","authors":["Sathira Silva","Savindu Bhashitha Wannigama","Gihan Jayatilaka","Muhammad Haris Khan","Roshan Ragel"],"pdf_url":"https://arxiv.org/pdf/2401.13785v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03446v1","updated":"2024-04-04T13:46:52Z","published":"2024-04-04T13:46:52Z","title":"SP$^2$OT: Semantic-Regularized Progressive Partial Optimal Transport for\n Imbalanced Clustering","summary":" Deep clustering, which learns representation and semantic clustering without\nlabels information, poses a great challenge for deep learning-based approaches.\nDespite significant progress in recent years, most existing methods focus on\nuniformly distributed datasets, significantly limiting the practical\napplicability of their methods. In this paper, we propose a more practical\nproblem setting named deep imbalanced clustering, where the underlying classes\nexhibit an imbalance distribution. To address this challenge, we introduce a\nnovel optimal transport-based pseudo-label learning framework. Our framework\nformulates pseudo-label generation as a Semantic-regularized Progressive\nPartial Optimal Transport (SP$^2$OT) problem, which progressively transports\neach sample to imbalanced clusters under several prior distribution and\nsemantic relation constraints, thus generating high-quality and imbalance-aware\npseudo-labels. To solve SP$^2$OT, we develop a Majorization-Minimization-based\noptimization algorithm. 
To be more precise, we employ the strategy of\nmajorization to reformulate the SP$^2$OT problem into a Progressive Partial\nOptimal Transport problem, which can be transformed into an unbalanced optimal\ntransport problem with augmented constraints and can be solved efficiently by a\nfast matrix scaling algorithm. Experiments on various datasets, including a\nhuman-curated long-tailed CIFAR100, challenging ImageNet-R, and large-scale\nsubsets of fine-grained iNaturalist2018 datasets, demonstrate the superiority\nof our method.\n","authors":["Chuyu Zhang","Hui Ren","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2404.03446v1.pdf","comment":"under review. arXiv admin note: substantial text overlap with\n arXiv:2401.09266"},{"id":"http://arxiv.org/abs/2404.03443v1","updated":"2024-04-04T13:43:11Z","published":"2024-04-04T13:43:11Z","title":"Part-Attention Based Model Make Occluded Person Re-Identification\n Stronger","summary":" The goal of occluded person re-identification (ReID) is to retrieve specific\npedestrians in occluded situations. However, occluded person ReID still suffers\nfrom background clutter and low-quality local feature representations, which\nlimits model performance. In our research, we introduce a new framework called\nPAB-ReID, which is a novel ReID model incorporating part-attention mechanisms\nto tackle the aforementioned issues effectively. Firstly, we introduce the\nhuman parsing label to guide the generation of more accurate human part\nattention maps. In addition, we propose a fine-grained feature focuser for\ngenerating fine-grained human local feature representations while suppressing\nbackground interference. Moreover, We also design a part triplet loss to\nsupervise the learning of human local features, which optimizes\nintra/inter-class distance. We conducted extensive experiments on specialized\nocclusion and regular ReID datasets, showcasing that our approach outperforms\nthe existing state-of-the-art methods.\n","authors":["Zhihao Chen","Yiyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2404.03443v1.pdf","comment":"Accepted By International Joint Conference on Neural Networks"},{"id":"http://arxiv.org/abs/2312.12080v2","updated":"2024-04-04T13:36:21Z","published":"2023-12-19T11:57:54Z","title":"Learning Subject-Aware Cropping by Outpainting Professional Photos","summary":" How to frame (or crop) a photo often depends on the image subject and its\ncontext; e.g., a human portrait. Recent works have defined the subject-aware\nimage cropping task as a nuanced and practical version of image cropping. We\npropose a weakly-supervised approach (GenCrop) to learn what makes a\nhigh-quality, subject-aware crop from professional stock images. Unlike\nsupervised prior work, GenCrop requires no new manual annotations beyond the\nexisting stock image collection. The key challenge in learning from this data,\nhowever, is that the images are already cropped and we do not know what regions\nwere removed. Our insight is to combine a library of stock images with a\nmodern, pre-trained text-to-image diffusion model. The stock image collection\nprovides diversity and its images serve as pseudo-labels for a good crop, while\nthe text-image diffusion model is used to out-paint (i.e., outward inpainting)\nrealistic uncropped images. Using this procedure, we are able to automatically\ngenerate a large dataset of cropped-uncropped training pairs to train a\ncropping model. 
Despite being weakly-supervised, GenCrop is competitive with\nstate-of-the-art supervised methods and significantly better than comparable\nweakly-supervised baselines on quantitative and qualitative evaluation metrics.\n","authors":["James Hong","Lu Yuan","Michaël Gharbi","Matthew Fisher","Kayvon Fatahalian"],"pdf_url":"https://arxiv.org/pdf/2312.12080v2.pdf","comment":"AAAI 24. Extended version with supplemental materials"},{"id":"http://arxiv.org/abs/2404.02656v2","updated":"2024-04-04T13:30:59Z","published":"2024-04-03T11:37:03Z","title":"Non-negative Subspace Feature Representation for Few-shot Learning in\n Medical Imaging","summary":" Unlike typical visual scene recognition domains, in which massive datasets\nare accessible to deep neural networks, medical image interpretations are often\nobstructed by the paucity of data. In this paper, we investigate the\neffectiveness of data-based few-shot learning in medical imaging by exploring\ndifferent data attribute representations in a low-dimensional space. We\nintroduce different types of non-negative matrix factorization (NMF) in\nfew-shot learning, addressing the data scarcity issue in medical image\nclassification. Extensive empirical studies are conducted in terms of\nvalidating the effectiveness of NMF, especially its supervised variants (e.g.,\ndiscriminative NMF, and supervised and constrained NMF with sparseness), and\nthe comparison with principal component analysis (PCA), i.e., the collaborative\nrepresentation-based dimensionality reduction technique derived from\neigenvectors. With 14 different datasets covering 11 distinct illness\ncategories, thorough experimental results and comparison with related\ntechniques demonstrate that NMF is a competitive alternative to PCA for\nfew-shot learning in medical imaging, and the supervised NMF algorithms are\nmore discriminative in the subspace with greater effectiveness. Furthermore, we\nshow that the part-based representation of NMF, especially its supervised\nvariants, is dramatically impactful in detecting lesion areas in medical\nimaging with limited samples.\n","authors":["Keqiang Fan","Xiaohao Cai","Mahesan Niranjan"],"pdf_url":"https://arxiv.org/pdf/2404.02656v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14162v3","updated":"2024-04-04T13:29:25Z","published":"2023-09-25T14:13:26Z","title":"Data Upcycling Knowledge Distillation for Image Super-Resolution","summary":" Knowledge distillation (KD) compresses deep neural networks by transferring\ntask-related knowledge from cumbersome pre-trained teacher models to compact\nstudent models. However, current KD methods for super-resolution (SR) networks\noverlook the nature of SR task that the outputs of the teacher model are noisy\napproximations to the ground-truth distribution of high-quality images (GT),\nwhich shades the teacher model's knowledge to result in limited KD effects. To\nutilize the teacher model beyond the GT upper-bound, we present the Data\nUpcycling Knowledge Distillation (DUKD), to transfer the teacher model's\nknowledge to the student model through the upcycled in-domain data derived from\ntraining data. Besides, we impose label consistency regularization to KD for SR\nby the paired invertible augmentations to improve the student model's\nperformance and robustness. 
Comprehensive experiments demonstrate that the DUKD\nmethod significantly outperforms previous arts on several SR tasks.\n","authors":["Yun Zhang","Wei Li","Simiao Li","Hanting Chen","Zhijun Tu","Wenjia Wang","Bingyi Jing","Shaohui Lin","Jie Hu"],"pdf_url":"https://arxiv.org/pdf/2309.14162v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03425v1","updated":"2024-04-04T13:06:25Z","published":"2024-04-04T13:06:25Z","title":"ChangeMamba: Remote Sensing Change Detection with Spatio-Temporal State\n Space Model","summary":" Convolutional neural networks (CNN) and Transformers have made impressive\nprogress in the field of remote sensing change detection (CD). However, both\narchitectures have their inherent shortcomings. Recently, the Mamba\narchitecture, based on spatial state models, has shown remarkable performance\nin a series of natural language processing tasks, which can effectively\ncompensate for the shortcomings of the above two architectures. In this paper,\nwe explore for the first time the potential of the Mamba architecture for\nremote sensing change detection tasks. We tailor the corresponding frameworks,\ncalled MambaBCD, MambaSCD, and MambaBDA, for binary change detection (BCD),\nsemantic change detection (SCD), and building damage assessment (BDA),\nrespectively. All three frameworks adopt the cutting-edge visual Mamba\narchitecture as the encoder, which allows full learning of global spatial\ncontextual information from the input images. For the change decoder, which is\navailable in all three architectures, we propose three spatio-temporal\nrelationship modeling mechanisms, which can be naturally combined with the\nMamba architecture and fully utilize its attribute to achieve spatio-temporal\ninteraction of multi-temporal features and obtain accurate change information.\nOn five benchmark datasets, our proposed frameworks outperform current CNN- and\nTransformer-based approaches without using any complex strategies or tricks,\nfully demonstrating the potential of the Mamba architecture. Specifically, we\nobtained 83.11%, 88.39% and 94.19% F1 scores on the three BCD datasets SYSU,\nLEVIR-CD+, and WHU-CD; on the SCD dataset SECOND, we obtained 24.04% SeK; and\non the xBD dataset, we obtained 81.41% overall F1 score. The source code will\nbe available in https://github.com/ChenHongruixuan/MambaCD\n","authors":["Hongruixuan Chen","Jian Song","Chengxi Han","Junshi Xia","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2404.03425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00778v2","updated":"2024-04-04T13:00:20Z","published":"2023-12-01T18:55:53Z","title":"MorpheuS: Neural Dynamic 360° Surface Reconstruction from Monocular\n RGB-D Video","summary":" Neural rendering has demonstrated remarkable success in dynamic scene\nreconstruction. Thanks to the expressiveness of neural representations, prior\nworks can accurately capture the motion and achieve high-fidelity\nreconstruction of the target object. Despite this, real-world video scenarios\noften feature large unobserved regions where neural representations struggle to\nachieve realistic completion. To tackle this challenge, we introduce MorpheuS,\na framework for dynamic 360{\\deg} surface reconstruction from a casually\ncaptured RGB-D video. Our approach models the target scene as a canonical field\nthat encodes its geometry and appearance, in conjunction with a deformation\nfield that warps points from the current frame to the canonical space. 
We\nleverage a view-dependent diffusion prior and distill knowledge from it to\nachieve realistic completion of unobserved regions. Experimental results on\nvarious real-world and synthetic datasets show that our method can achieve\nhigh-fidelity 360{\\deg} surface reconstruction of a deformable object from a\nmonocular RGB-D video.\n","authors":["Hengyi Wang","Jingwen Wang","Lourdes Agapito"],"pdf_url":"https://arxiv.org/pdf/2312.00778v2.pdf","comment":"CVPR2024. Project page:\n https://hengyiwang.github.io/projects/morpheus"},{"id":"http://arxiv.org/abs/2404.03421v1","updated":"2024-04-04T12:58:46Z","published":"2024-04-04T12:58:46Z","title":"Generalizable 3D Scene Reconstruction via Divide and Conquer from a\n Single View","summary":" Single-view 3D reconstruction is currently approached from two dominant\nperspectives: reconstruction of scenes with limited diversity using 3D data\nsupervision or reconstruction of diverse singular objects using large image\npriors. However, real-world scenarios are far more complex and exceed the\ncapabilities of these methods. We therefore propose a hybrid method following a\ndivide-and-conquer strategy. We first process the scene holistically,\nextracting depth and semantic information, and then leverage a single-shot\nobject-level method for the detailed reconstruction of individual components.\nBy following a compositional processing approach, the overall framework\nachieves full reconstruction of complex 3D scenes from a single image. We\npurposely design our pipeline to be highly modular by carefully integrating\nspecific procedures for each processing step, without requiring an end-to-end\ntraining of the whole system. This enables the pipeline to naturally improve as\nfuture methods can replace the individual modules. We demonstrate the\nreconstruction performance of our approach on both synthetic and real-world\nscenes, comparing favorable against prior works. Project page:\nhttps://andreeadogaru.github.io/Gen3DSR.\n","authors":["Andreea Dogaru","Mert Özer","Bernhard Egger"],"pdf_url":"https://arxiv.org/pdf/2404.03421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03417v1","updated":"2024-04-04T12:50:51Z","published":"2024-04-04T12:50:51Z","title":"NMF-Based Analysis of Mobile Eye-Tracking Data","summary":" The depiction of scanpaths from mobile eye-tracking recordings by thumbnails\nfrom the stimulus allows the application of visual computing to detect areas of\ninterest in an unsupervised way. We suggest using nonnegative matrix\nfactorization (NMF) to identify such areas in stimuli. For a user-defined\ninteger k, NMF produces an explainable decomposition into k components, each\nconsisting of a spatial representation associated with a temporal indicator. In\nthe context of multiple eye-tracking recordings, this leads to k spatial\nrepresentations, where the temporal indicator highlights the appearance within\nrecordings. The choice of k provides an opportunity to control the refinement\nof the decomposition, i.e., the number of areas to detect. We combine our\nNMF-based approach with visualization techniques to enable an exploratory\nanalysis of multiple recordings. 
Finally, we demonstrate the usefulness of our\napproach with mobile eye-tracking data of an art gallery.\n","authors":["Daniel Klötzl","Tim Krake","Frank Heyen","Michael Becher","Maurice Koch","Daniel Weiskopf","Kuno Kurzhals"],"pdf_url":"https://arxiv.org/pdf/2404.03417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03415v1","updated":"2024-04-04T12:49:42Z","published":"2024-04-04T12:49:42Z","title":"Future Predictive Success-or-Failure Classification for Long-Horizon\n Robotic Tasks","summary":" Automating long-horizon tasks with a robotic arm has been a central research\ntopic in robotics. Optimization-based action planning is an efficient approach\nfor creating an action plan to complete a given task. Construction of a\nreliable planning method requires a design process of conditions, e.g., to\navoid collision between objects. The design process, however, has two critical\nissues: 1) iterative trials--the design process is time-consuming due to the\ntrial-and-error process of modifying conditions, and 2) manual redesign--it is\ndifficult to cover all the necessary conditions manually. To tackle these\nissues, this paper proposes a future-predictive\nsuccess-or-failure-classification method to obtain conditions automatically.\nThe key idea behind the proposed method is an end-to-end approach for\ndetermining whether the action plan can complete a given task instead of\nmanually redesigning the conditions. The proposed method uses a long-horizon\nfuture-prediction method to enable success-or-failure classification without\nthe execution of an action plan. This paper also proposes a regularization term\ncalled transition consistency regularization to provide easy-to-predict feature\ndistribution. The regularization term improves future prediction and\nclassification performance. The effectiveness of our method is demonstrated\nthrough classification and robotic-manipulation experiments.\n","authors":["Naoya Sogi","Hiroyuki Oyama","Takashi Shibata","Makoto Terao"],"pdf_url":"https://arxiv.org/pdf/2404.03415v1.pdf","comment":"IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.03413v1","updated":"2024-04-04T12:46:01Z","published":"2024-04-04T12:46:01Z","title":"MiniGPT4-Video: Advancing Multimodal LLMs for Video Understanding with\n Interleaved Visual-Textual Tokens","summary":" This paper introduces MiniGPT4-Video, a multimodal Large Language Model (LLM)\ndesigned specifically for video understanding. The model is capable of\nprocessing both temporal visual and textual data, making it adept at\nunderstanding the complexities of videos. Building upon the success of\nMiniGPT-v2, which excelled in translating visual features into the LLM space\nfor single images and achieved impressive results on various image-text\nbenchmarks, this paper extends the model's capabilities to process a sequence\nof frames, enabling it to comprehend videos. MiniGPT4-video does not only\nconsider visual content but also incorporates textual conversations, allowing\nthe model to effectively answer queries involving both visual and text\ncomponents. The proposed model outperforms existing state-of-the-art methods,\nregistering gains of 4.22%, 1.13%, 20.82%, and 13.1% on the MSVD, MSRVTT, TGIF,\nand TVQA benchmarks respectively. 
Our models and code have been made publicly\navailable here https://vision-cair.github.io/MiniGPT4-video/\n","authors":["Kirolos Ataallah","Xiaoqian Shen","Eslam Abdelrahman","Essam Sleiman","Deyao Zhu","Jian Ding","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2404.03413v1.pdf","comment":"6 pages,8 figures"},{"id":"http://arxiv.org/abs/2403.16612v2","updated":"2024-04-04T12:35:33Z","published":"2024-03-25T10:42:48Z","title":"Calibrating Bayesian UNet++ for Sub-Seasonal Forecasting","summary":" Seasonal forecasting is a crucial task when it comes to detecting the extreme\nheat and colds that occur due to climate change. Confidence in the predictions\nshould be reliable since a small increase in the temperatures in a year has a\nbig impact on the world. Calibration of the neural networks provides a way to\nensure our confidence in the predictions. However, calibrating regression\nmodels is an under-researched topic, especially in forecasters. We calibrate a\nUNet++ based architecture, which was shown to outperform physics-based models\nin temperature anomalies. We show that with a slight trade-off between\nprediction error and calibration error, it is possible to get more reliable and\nsharper forecasts. We believe that calibration should be an important part of\nsafety-critical machine learning applications such as weather forecasters.\n","authors":["Busra Asan","Abdullah Akgül","Alper Unal","Melih Kandemir","Gozde Unal"],"pdf_url":"https://arxiv.org/pdf/2403.16612v2.pdf","comment":"Accepted as a workshop paper at \"ICLR 2024 Tackling Climate Change\n with Machine Learning\""},{"id":"http://arxiv.org/abs/2404.03407v1","updated":"2024-04-04T12:12:24Z","published":"2024-04-04T12:12:24Z","title":"AIGIQA-20K: A Large Database for AI-Generated Image Quality Assessment","summary":" With the rapid advancements in AI-Generated Content (AIGC), AI-Generated\nImages (AIGIs) have been widely applied in entertainment, education, and social\nmedia. However, due to the significant variance in quality among different\nAIGIs, there is an urgent need for models that consistently match human\nsubjective ratings. To address this issue, we organized a challenge towards\nAIGC quality assessment on NTIRE 2024 that extensively considers 15 popular\ngenerative models, utilizing dynamic hyper-parameters (including\nclassifier-free guidance, iteration epochs, and output image resolution), and\ngather subjective scores that consider perceptual quality and text-to-image\nalignment altogether comprehensively involving 21 subjects. This approach\nculminates in the creation of the largest fine-grained AIGI subjective quality\ndatabase to date with 20,000 AIGIs and 420,000 subjective ratings, known as\nAIGIQA-20K. Furthermore, we conduct benchmark experiments on this database to\nassess the correspondence between 16 mainstream AIGI quality models and human\nperception. We anticipate that this large-scale quality database will inspire\nrobust quality indicators for AIGIs and propel the evolution of AIGC for\nvision. 
The database is released on\nhttps://www.modelscope.cn/datasets/lcysyzxdxc/AIGCQA-30K-Image.\n","authors":["Chunyi Li","Tengchuan Kou","Yixuan Gao","Yuqin Cao","Wei Sun","Zicheng Zhang","Yingjie Zhou","Zhichao Zhang","Weixia Zhang","Haoning Wu","Xiaohong Liu","Xiongkuo Min","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2404.03407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03398v1","updated":"2024-04-04T11:59:06Z","published":"2024-04-04T11:59:06Z","title":"Scaling Up Video Summarization Pretraining with Large Language Models","summary":" Long-form video content constitutes a significant portion of internet\ntraffic, making automated video summarization an essential research problem.\nHowever, existing video summarization datasets are notably limited in their\nsize, constraining the effectiveness of state-of-the-art methods for\ngeneralization. Our work aims to overcome this limitation by capitalizing on\nthe abundance of long-form videos with dense speech-to-video alignment and the\nremarkable capabilities of recent large language models (LLMs) in summarizing\nlong text. We introduce an automated and scalable pipeline for generating a\nlarge-scale video summarization dataset using LLMs as Oracle summarizers. By\nleveraging the generated dataset, we analyze the limitations of existing\napproaches and propose a new video summarization model that effectively\naddresses them. To facilitate further research in the field, our work also\npresents a new benchmark dataset that contains 1200 long videos each with\nhigh-quality summaries annotated by professionals. Extensive experiments\nclearly indicate that our proposed approach sets a new state-of-the-art in\nvideo summarization across several benchmarks.\n","authors":["Dawit Mureja Argaw","Seunghyun Yoon","Fabian Caba Heilbron","Hanieh Deilamsalehy","Trung Bui","Zhaowen Wang","Franck Dernoncourt","Joon Son Chung"],"pdf_url":"https://arxiv.org/pdf/2404.03398v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03394v1","updated":"2024-04-04T11:53:37Z","published":"2024-04-04T11:53:37Z","title":"Background Noise Reduction of Attention Map for Weakly Supervised\n Semantic Segmentation","summary":" In weakly-supervised semantic segmentation (WSSS) using only image-level\nclass labels, a problem with CNN-based Class Activation Maps (CAM) is that they\ntend to activate the most discriminative local regions of objects. On the other\nhand, methods based on Transformers learn global features but suffer from the\nissue of background noise contamination. This paper focuses on addressing the\nissue of background noise in attention weights within the existing WSSS method\nbased on Conformer, known as TransCAM. The proposed method successfully reduces\nbackground noise, leading to improved accuracy of pseudo labels. Experimental\nresults demonstrate that our model achieves segmentation performance of 70.5%\non the PASCAL VOC 2012 validation data, 71.1% on the test data, and 45.9% on MS\nCOCO 2014 data, outperforming TransCAM in terms of segmentation performance.\n","authors":["Izumi Fujimori","Masaki Oono","Masami Shishibori"],"pdf_url":"https://arxiv.org/pdf/2404.03394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03392v1","updated":"2024-04-04T11:49:56Z","published":"2024-04-04T11:49:56Z","title":"Two Tricks to Improve Unsupervised Segmentation Learning","summary":" We present two practical improvement techniques for unsupervised segmentation\nlearning. 
These techniques address limitations in the resolution and accuracy\nof predicted segmentation maps of recent state-of-the-art methods. Firstly, we\nleverage image post-processing techniques such as guided filtering to refine\nthe output masks, improving accuracy while avoiding substantial computational\ncosts. Secondly, we introduce a multi-scale consistency criterion, based on a\nteacher-student training scheme. This criterion matches segmentation masks\npredicted from regions of the input image extracted at different resolutions to\neach other. Experimental results on several benchmarks used in unsupervised\nsegmentation learning demonstrate the effectiveness of our proposed techniques.\n","authors":["Alp Eren Sari","Francesco Locatello","Paolo Favar"],"pdf_url":"https://arxiv.org/pdf/2404.03392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03384v1","updated":"2024-04-04T11:33:29Z","published":"2024-04-04T11:33:29Z","title":"LongVLM: Efficient Long Video Understanding via Large Language Models","summary":" Empowered by Large Language Models (LLMs), recent advancements in VideoLLMs\nhave driven progress in various video understanding tasks. These models encode\nvideo representations through pooling or query aggregation over a vast number\nof visual tokens, making computational and memory costs affordable. Despite\nsuccessfully providing an overall comprehension of video content, existing\nVideoLLMs still face challenges in achieving detailed understanding in videos\ndue to overlooking local information in long-term videos. To tackle this\nchallenge, we introduce LongVLM, a straightforward yet powerful VideoLLM for\nlong video understanding, building upon the observation that long videos often\nconsist of sequential key events, complex actions, and camera movements. Our\napproach proposes to decompose long videos into multiple short-term segments\nand encode local features for each local segment via a hierarchical token\nmerging module. These features are concatenated in temporal order to maintain\nthe storyline across sequential short-term segments. Additionally, we propose\nto integrate global semantics into each local feature to enhance context\nunderstanding. In this way, we encode video representations that incorporate\nboth local and global information, enabling the LLM to generate comprehensive\nresponses for long-term videos. Experimental results on the VideoChatGPT\nbenchmark and zero-shot video question-answering datasets demonstrate the\nsuperior capabilities of our model over the previous state-of-the-art methods.\nQualitative examples demonstrate that our model produces more precise responses\nfor long videos understanding. Code is available at\n\\url{https://github.com/ziplab/LongVLM}.\n","authors":["Yuetian Weng","Mingfei Han","Haoyu He","Xiaojun Chang","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2404.03384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03349v1","updated":"2024-04-04T10:30:28Z","published":"2024-04-04T10:30:28Z","title":"VF-NeRF: Viewshed Fields for Rigid NeRF Registration","summary":" 3D scene registration is a fundamental problem in computer vision that seeks\nthe best 6-DoF alignment between two scenes. This problem was extensively\ninvestigated in the case of point clouds and meshes, but there has been\nrelatively limited work regarding Neural Radiance Fields (NeRF). In this paper,\nwe consider the problem of rigid registration between two NeRFs when the\nposition of the original cameras is not given. 
Our key novelty is the\nintroduction of Viewshed Fields (VF), an implicit function that determines, for\neach 3D point, how likely it is to be viewed by the original cameras. We\ndemonstrate how VF can help in the various stages of NeRF registration, with an\nextensive evaluation showing that VF-NeRF achieves SOTA results on various\ndatasets with different capturing approaches such as LLFF and Objaverese.\n","authors":["Leo Segre","Shai Avidan"],"pdf_url":"https://arxiv.org/pdf/2404.03349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03340v1","updated":"2024-04-04T10:10:38Z","published":"2024-04-04T10:10:38Z","title":"Meta Invariance Defense Towards Generalizable Robustness to Unknown\n Adversarial Attacks","summary":" Despite providing high-performance solutions for computer vision tasks, the\ndeep neural network (DNN) model has been proved to be extremely vulnerable to\nadversarial attacks. Current defense mainly focuses on the known attacks, but\nthe adversarial robustness to the unknown attacks is seriously overlooked.\nBesides, commonly used adaptive learning and fine-tuning technique is\nunsuitable for adversarial defense since it is essentially a zero-shot problem\nwhen deployed. Thus, to tackle this challenge, we propose an attack-agnostic\ndefense method named Meta Invariance Defense (MID). Specifically, various\ncombinations of adversarial attacks are randomly sampled from a manually\nconstructed Attacker Pool to constitute different defense tasks against unknown\nattacks, in which a student encoder is supervised by multi-consistency\ndistillation to learn the attack-invariant features via a meta principle. The\nproposed MID has two merits: 1) Full distillation from pixel-, feature- and\nprediction-level between benign and adversarial samples facilitates the\ndiscovery of attack-invariance. 2) The model simultaneously achieves robustness\nto the imperceptible adversarial perturbations in high-level image\nclassification and attack-suppression in low-level robust image regeneration.\nTheoretical and empirical studies on numerous benchmarks such as ImageNet\nverify the generalizable robustness and superiority of MID under various\nattacks.\n","authors":["Lei Zhang","Yuhang Zhou","Yi Yang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2404.03340v1.pdf","comment":"Accepted by IEEE TPAMI in 2024"},{"id":"http://arxiv.org/abs/2404.03327v1","updated":"2024-04-04T09:53:00Z","published":"2024-04-04T09:53:00Z","title":"DI-Retinex: Digital-Imaging Retinex Theory for Low-Light Image\n Enhancement","summary":" Many existing methods for low-light image enhancement (LLIE) based on Retinex\ntheory ignore important factors that affect the validity of this theory in\ndigital imaging, such as noise, quantization error, non-linearity, and dynamic\nrange overflow. In this paper, we propose a new expression called\nDigital-Imaging Retinex theory (DI-Retinex) through theoretical and\nexperimental analysis of Retinex theory in digital imaging. Our new expression\nincludes an offset term in the enhancement model, which allows for pixel-wise\nbrightness contrast adjustment with a non-linear mapping function. In addition,\nto solve the lowlight enhancement problem in an unsupervised manner, we propose\nan image-adaptive masked reverse degradation loss in Gamma space. 
We also\ndesign a variance suppression loss for regulating the additional offset term.\nExtensive experiments show that our proposed method outperforms all existing\nunsupervised methods in terms of visual quality, model size, and speed. Our\nalgorithm can also assist downstream face detectors in low-light, as it shows\nthe most performance gain after the low-light enhancement compared to other\nmethods.\n","authors":["Shangquan Sun","Wenqi Ren","Jingyang Peng","Fenglong Song","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2404.03327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01064v2","updated":"2024-04-04T09:48:30Z","published":"2024-04-01T11:57:34Z","title":"Roadside Monocular 3D Detection via 2D Detection Prompting","summary":" The problem of roadside monocular 3D detection requires detecting objects of\ninterested classes in a 2D RGB frame and predicting their 3D information such\nas locations in bird's-eye-view (BEV). It has broad applications in traffic\ncontrol, vehicle-vehicle communication, and vehicle-infrastructure cooperative\nperception. To approach this problem, we present a novel and simple method by\nprompting the 3D detector using 2D detections. Our method builds on a key\ninsight that, compared with 3D detectors, a 2D detector is much easier to train\nand performs significantly better w.r.t detections on the 2D image plane. That\nsaid, one can exploit 2D detections of a well-trained 2D detector as prompts to\na 3D detector, being trained in a way of inflating such 2D detections to 3D\ntowards 3D detection. To construct better prompts using the 2D detector, we\nexplore three techniques: (a) concatenating both 2D and 3D detectors' features,\n(b) attentively fusing 2D and 3D detectors' features, and (c) encoding\npredicted 2D boxes x, y, width, height, label and attentively fusing such with\nthe 3D detector's features. Surprisingly, the third performs the best.\nMoreover, we present a yaw tuning tactic and a class-grouping strategy that\nmerges classes based on their functionality; these techniques improve 3D\ndetection performance further. Comprehensive ablation studies and extensive\nexperiments demonstrate that our method resoundingly outperforms prior works,\nachieving the state-of-the-art on two large-scale roadside 3D detection\nbenchmarks.\n","authors":["Yechi Ma","Shuoquan Wei","Churun Zhang","Wei Hua","Yanan Li","Shu Kong"],"pdf_url":"https://arxiv.org/pdf/2404.01064v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03323v1","updated":"2024-04-04T09:43:43Z","published":"2024-04-04T09:43:43Z","title":"Sparse Concept Bottleneck Models: Gumbel Tricks in Contrastive Learning","summary":" We propose a novel architecture and method of explainable classification with\nConcept Bottleneck Models (CBMs). While SOTA approaches to Image Classification\ntask work as a black box, there is a growing demand for models that would\nprovide interpreted results. Such a models often learn to predict the\ndistribution over class labels using additional description of this target\ninstances, called concepts. However, existing Bottleneck methods have a number\nof limitations: their accuracy is lower than that of a standard model and CBMs\nrequire an additional set of concepts to leverage. We provide a framework for\ncreating Concept Bottleneck Model from pre-trained multi-modal encoder and new\nCLIP-like architectures. 
By introducing a new type of layers known as Concept\nBottleneck Layers, we outline three methods for training them: with\n$\\ell_1$-loss, contrastive loss and loss function based on Gumbel-Softmax\ndistribution (Sparse-CBM), while final FC layer is still trained with\nCross-Entropy. We show a significant increase in accuracy using sparse hidden\nlayers in CLIP-based bottleneck models. Which means that sparse representation\nof concepts activation vector is meaningful in Concept Bottleneck Models.\nMoreover, with our Concept Matrix Search algorithm we can improve CLIP\npredictions on complex datasets without any additional training or fine-tuning.\nThe code is available at: https://github.com/Andron00e/SparseCBM.\n","authors":["Andrei Semenov","Vladimir Ivanov","Aleksandr Beznosikov","Alexander Gasnikov"],"pdf_url":"https://arxiv.org/pdf/2404.03323v1.pdf","comment":"23 pages, 1 algorithm, 36 figures"},{"id":"http://arxiv.org/abs/2310.00615v3","updated":"2024-04-04T09:18:50Z","published":"2023-10-01T08:32:46Z","title":"Scene-aware Human Motion Forecasting via Mutual Distance Prediction","summary":" In this paper, we tackle the problem of scene-aware 3D human motion\nforecasting. A key challenge of this task is to predict future human motions\nthat are consistent with the scene by modeling the human-scene interactions.\nWhile recent works have demonstrated that explicit constraints on human-scene\ninteractions can prevent the occurrence of ghost motion, they only provide\nconstraints on partial human motion e.g., the global motion of the human or a\nfew joints contacting the scene, leaving the rest of the motion unconstrained.\nTo address this limitation, we propose to model the human-scene interaction\nwith the mutual distance between the human body and the scene. Such mutual\ndistances constrain both the local and global human motion, resulting in a\nwhole-body motion constrained prediction. In particular, mutual distance\nconstraints consist of two components, the signed distance of each vertex on\nthe human mesh to the scene surface and the distance of basis scene points to\nthe human mesh. We further introduce a global scene representation learned from\na signed distance function (SDF) volume to ensure coherence between the global\nscene representation and the explicit constraint from the mutual distance. We\ndevelop a pipeline with two sequential steps: predicting the future mutual\ndistances first, followed by forecasting future human motion. During training,\nwe explicitly encourage consistency between predicted poses and mutual\ndistances. Extensive evaluations on the existing synthetic and real datasets\ndemonstrate that our approach consistently outperforms the state-of-the-art\nmethods.\n","authors":["Chaoyue Xing","Wei Mao","Miaomiao Liu"],"pdf_url":"https://arxiv.org/pdf/2310.00615v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19773v2","updated":"2024-04-04T09:05:49Z","published":"2024-03-28T18:50:19Z","title":"ShapeFusion: A 3D diffusion model for localized shape editing","summary":" In the realm of 3D computer vision, parametric models have emerged as a\nground-breaking methodology for the creation of realistic and expressive 3D\navatars. Traditionally, they rely on Principal Component Analysis (PCA), given\nits ability to decompose data to an orthonormal space that maximally captures\nshape variations. 
However, due to the orthogonality constraints and the global\nnature of PCA's decomposition, these models struggle to perform localized and\ndisentangled editing of 3D shapes, which severely affects their use in\napplications requiring fine control such as face sculpting. In this paper, we\nleverage diffusion models to enable diverse and fully localized edits on 3D\nmeshes, while completely preserving the un-edited regions. We propose an\neffective diffusion masking training strategy that, by design, facilitates\nlocalized manipulation of any shape region, without being limited to predefined\nregions or to sparse sets of predefined control vertices. Following our\nframework, a user can explicitly set their manipulation region of choice and\ndefine an arbitrary set of vertices as handles to edit a 3D mesh. Compared to\nthe current state-of-the-art our method leads to more interpretable shape\nmanipulations than methods relying on latent code state, greater localization\nand generation diversity while offering faster inference than optimization\nbased approaches. Project page: https://rolpotamias.github.io/Shapefusion/\n","authors":["Rolandos Alexandros Potamias","Michail Tarasiou","Stylianos Ploumpis","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2403.19773v2.pdf","comment":"Project Page: https://rolpotamias.github.io/Shapefusion/"},{"id":"http://arxiv.org/abs/2404.02614v2","updated":"2024-04-04T08:57:00Z","published":"2024-04-03T10:01:23Z","title":"Vestibular schwannoma growth prediction from longitudinal MRI by time\n conditioned neural fields","summary":" Vestibular schwannomas (VS) are benign tumors that are generally managed by\nactive surveillance with MRI examination. To further assist clinical\ndecision-making and avoid overtreatment, an accurate prediction of tumor growth\nbased on longitudinal imaging is highly desirable. In this paper, we introduce\nDeepGrowth, a deep learning method that incorporates neural fields and\nrecurrent neural networks for prospective tumor growth prediction. In the\nproposed method, each tumor is represented as a signed distance function (SDF)\nconditioned on a low-dimensional latent code. Unlike previous studies that\nperform tumor shape prediction directly in the image space, we predict the\nlatent codes instead and then reconstruct future shapes from it. To deal with\nirregular time intervals, we introduce a time-conditioned recurrent module\nbased on a ConvLSTM and a novel temporal encoding strategy, which enables the\nproposed model to output varying tumor shapes over time. The experiments on an\nin-house longitudinal VS dataset showed that the proposed model significantly\nimproved the performance ($\\ge 1.6\\%$ Dice score and $\\ge0.20$ mm 95\\%\nHausdorff distance), in particular for top 20\\% tumors that grow or shrink the\nmost ($\\ge 4.6\\%$ Dice score and $\\ge 0.73$ mm 95\\% Hausdorff distance). Our\ncode is available at ~\\burl{https://github.com/cyjdswx/DeepGrowth}\n","authors":["Yunjie Chen","Jelmer M. Wolterink","Olaf M. Neve","Stephan R. Romeijn","Berit M. Verbist","Erik F. 
Hensen","Qian Tao","Marius Staring"],"pdf_url":"https://arxiv.org/pdf/2404.02614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02282v2","updated":"2024-04-04T08:38:17Z","published":"2024-04-02T20:15:43Z","title":"Smooth Deep Saliency","summary":" In this work, we investigate methods to reduce the noise in deep saliency\nmaps coming from convolutional downsampling, with the purpose of explaining how\na deep learning model detects tumors in scanned histological tissue samples.\nThose methods make the investigated models more interpretable for\ngradient-based saliency maps, computed in hidden layers. We test our approach\non different models trained for image classification on ImageNet1K, and models\ntrained for tumor detection on Camelyon16 and in-house real-world digital\npathology scans of stained tissue samples. Our results show that the\ncheckerboard noise in the gradient gets reduced, resulting in smoother and\ntherefore easier to interpret saliency maps.\n","authors":["Rudolf Herdt","Maximilian Schmidt","Daniel Otero Baguer","Peter Maaß"],"pdf_url":"https://arxiv.org/pdf/2404.02282v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03296v1","updated":"2024-04-04T08:37:27Z","published":"2024-04-04T08:37:27Z","title":"AdaBM: On-the-Fly Adaptive Bit Mapping for Image Super-Resolution","summary":" Although image super-resolution (SR) problem has experienced unprecedented\nrestoration accuracy with deep neural networks, it has yet limited versatile\napplications due to the substantial computational costs. Since different input\nimages for SR face different restoration difficulties, adapting computational\ncosts based on the input image, referred to as adaptive inference, has emerged\nas a promising solution to compress SR networks. Specifically, adapting the\nquantization bit-widths has successfully reduced the inference and memory cost\nwithout sacrificing the accuracy. However, despite the benefits of the\nresultant adaptive network, existing works rely on time-intensive\nquantization-aware training with full access to the original training pairs to\nlearn the appropriate bit allocation policies, which limits its ubiquitous\nusage. To this end, we introduce the first on-the-fly adaptive quantization\nframework that accelerates the processing time from hours to seconds. We\nformulate the bit allocation problem with only two bit mapping modules: one to\nmap the input image to the image-wise bit adaptation factor and one to obtain\nthe layer-wise adaptation factors. These bit mappings are calibrated and\nfine-tuned using only a small number of calibration images. We achieve\ncompetitive performance with the previous adaptive quantization methods, while\nthe processing time is accelerated by x2000. Codes are available at\nhttps://github.com/Cheeun/AdaBM.\n","authors":["Cheeun Hong","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2404.03296v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2211.07459v2","updated":"2024-04-04T08:24:54Z","published":"2022-11-14T15:37:27Z","title":"Self-Aligning Depth-regularized Radiance Fields for Asynchronous RGB-D\n Sequences","summary":" It has been shown that learning radiance fields with depth rendering and\ndepth supervision can effectively promote the quality and convergence of view\nsynthesis. However, this paradigm requires input RGB-D sequences to be\nsynchronized, hindering its usage in the UAV city modeling scenario. 
As there\nexists asynchrony between RGB images and depth images due to high-speed flight,\nwe propose a novel time-pose function, which is an implicit network that maps\ntimestamps to $\\rm SE(3)$ elements. To simplify the training process, we also\ndesign a joint optimization scheme to jointly learn the large-scale\ndepth-regularized radiance fields and the time-pose function. Our algorithm\nconsists of three steps: (1) time-pose function fitting, (2) radiance field\nbootstrapping, (3) joint pose error compensation and radiance field refinement.\nIn addition, we propose a large synthetic dataset with diverse controlled\nmismatches and ground truth to evaluate this new problem setting\nsystematically. Through extensive experiments, we demonstrate that our method\noutperforms baselines without regularization. We also show qualitatively\nimproved results on a real-world asynchronous RGB-D sequence captured by drone.\nCodes, data, and models will be made publicly available.\n","authors":["Yuxin Huang","Andong Yang","Zirui Wu","Yuantao Chen","Runyi Yang","Zhenxin Zhu","Chao Hou","Hao Zhao","Guyue Zhou"],"pdf_url":"https://arxiv.org/pdf/2211.07459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01692v2","updated":"2024-04-04T08:07:22Z","published":"2024-04-02T06:52:31Z","title":"Beyond Image Super-Resolution for Image Recognition with Task-Driven\n Perceptual Loss","summary":" In real-world scenarios, image recognition tasks, such as semantic\nsegmentation and object detection, often pose greater challenges due to the\nlack of information available within low-resolution (LR) content. Image\nsuper-resolution (SR) is one of the promising solutions for addressing the\nchallenges. However, due to the ill-posed property of SR, it is challenging for\ntypical SR methods to restore task-relevant high-frequency contents, which may\ndilute the advantage of utilizing the SR method. Therefore, in this paper, we\npropose Super-Resolution for Image Recognition (SR4IR) that effectively guides\nthe generation of SR images beneficial to achieving satisfactory image\nrecognition performance when processing LR images. The critical component of\nour SR4IR is the task-driven perceptual (TDP) loss that enables the SR network\nto acquire task-specific knowledge from a network tailored for a specific task.\nMoreover, we propose a cross-quality patch mix and an alternate training\nframework that significantly enhances the efficacy of the TDP loss by\naddressing potential problems when employing the TDP loss. Through extensive\nexperiments, we demonstrate that our SR4IR achieves outstanding task\nperformance by generating SR images useful for a specific image recognition\ntask, including semantic segmentation, object detection, and image\nclassification. The implementation code is available at\nhttps://github.com/JaehaKim97/SR4IR.\n","authors":["Jaeha Kim","Junghun Oh","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2404.01692v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.17369v2","updated":"2024-04-04T08:05:06Z","published":"2024-03-26T04:09:08Z","title":"CoDA: Instructive Chain-of-Domain Adaptation with Severity-Aware Visual\n Prompt Tuning","summary":" Unsupervised Domain Adaptation (UDA) aims to adapt models from labeled source\ndomains to unlabeled target domains. When adapting to adverse scenes, existing\nUDA methods fail to perform well due to the lack of instructions, leading their\nmodels to overlook discrepancies within all adverse scenes. 
To tackle this, we\npropose CoDA which instructs models to distinguish, focus, and learn from these\ndiscrepancies at scene and image levels. Specifically, CoDA consists of a\nChain-of-Domain (CoD) strategy and a Severity-Aware Visual Prompt Tuning\n(SAVPT) mechanism. CoD focuses on scene-level instructions to divide all\nadverse scenes into easy and hard scenes, guiding models to adapt from source\nto easy domains with easy scene images, and then to hard domains with hard\nscene images, thereby laying a solid foundation for whole adaptations. Building\nupon this foundation, we employ SAVPT to dive into more detailed image-level\ninstructions to boost performance. SAVPT features a novel metric Severity that\ndivides all adverse scene images into low-severity and high-severity images.\nThen Severity directs visual prompts and adapters, instructing models to\nconcentrate on unified severity features instead of scene-specific features,\nwithout adding complexity to the model architecture. CoDA achieves SOTA\nperformances on widely-used benchmarks under all adverse scenes. Notably, CoDA\noutperforms the existing ones by 4.6%, and 10.3% mIoU on the Foggy Driving, and\nFoggy Zurich benchmarks, respectively. Our code is available at\nhttps://github.com/Cuzyoung/CoDA\n","authors":["Ziyang Gong","Fuhao Li","Yupeng Deng","Deblina Bhattacharjee","Xiangwei Zhu","Zhenming Ji"],"pdf_url":"https://arxiv.org/pdf/2403.17369v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03277v1","updated":"2024-04-04T08:04:00Z","published":"2024-04-04T08:04:00Z","title":"Design and Development of a Framework For Stroke-Based Handwritten\n Gujarati Font Generation","summary":" Handwritten font generation is important for preserving cultural heritage and\ncreating personalized designs. It adds an authentic and expressive touch to\nprinted materials, making them visually appealing and establishing a stronger\nconnection with the audience. This paper aims to design a framework for\ngenerating handwritten fonts in the Gujarati script, mimicking the variation of\nhuman handwriting. The proposed font generation model consists of a learning\nphase and a generation phase. In the learning phase, Gujarati scripts are\nanalyzed, and rules for designing each character are formulated. This ruleset\ninvolves the concatenation of strokes in a stroke-based manner, ensuring visual\nconsistency in the resulting glyphs. The generation phase involves the user\nproviding a small subset of characters, and the system automatically generates\nthe remaining character glyphs based on extracted strokes and learned rules,\nresulting in handwritten Gujarati fonts. The resulting character glyphs are\nconverted into an open-type font using the FontForge tool, making them\ncompatible with any Gujarati editor. Both subjective and objective evaluations\nare conducted to assess the synthesized images and fonts. Subjective evaluation\nthrough user studies provides feedback on quality and visual appeal, achieving\nan overall accuracy of 84.84%. Notably, eleven characters demonstrated a\nsuccess ratio above 90%. Objective evaluation using an existing recognition\nsystem achieves an overall accuracy of 84.28% in OCR evaluation. Notably,\nfifteen characters had a success ratio of 80% or higher.\n","authors":["Preeti P. Bhatt","Jitendra V. Nasriwala","Rakesh R. 
Savant"],"pdf_url":"https://arxiv.org/pdf/2404.03277v1.pdf","comment":"13 pages, 2 column, 12 figures"},{"id":"http://arxiv.org/abs/2404.01758v2","updated":"2024-04-04T08:03:04Z","published":"2024-04-02T09:18:52Z","title":"GEARS: Local Geometry-aware Hand-object Interaction Synthesis","summary":" Generating realistic hand motion sequences in interaction with objects has\ngained increasing attention with the growing interest in digital humans. Prior\nwork has illustrated the effectiveness of employing occupancy-based or\ndistance-based virtual sensors to extract hand-object interaction features.\nNonetheless, these methods show limited generalizability across object\ncategories, shapes and sizes. We hypothesize that this is due to two reasons:\n1) the limited expressiveness of employed virtual sensors, and 2) scarcity of\navailable training data. To tackle this challenge, we introduce a novel\njoint-centered sensor designed to reason about local object geometry near\npotential interaction regions. The sensor queries for object surface points in\nthe neighbourhood of each hand joint. As an important step towards mitigating\nthe learning complexity, we transform the points from global frame to hand\ntemplate frame and use a shared module to process sensor features of each\nindividual joint. This is followed by a spatio-temporal transformer network\naimed at capturing correlation among the joints in different dimensions.\nMoreover, we devise simple heuristic rules to augment the limited training\nsequences with vast static hand grasping samples. This leads to a broader\nspectrum of grasping types observed during training, in turn enhancing our\nmodel's generalization capability. We evaluate on two public datasets, GRAB and\nInterCap, where our method shows superiority over baselines both quantitatively\nand perceptually.\n","authors":["Keyang Zhou","Bharat Lal Bhatnagar","Jan Eric Lenssen","Gerard Pons-moll"],"pdf_url":"https://arxiv.org/pdf/2404.01758v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02935v3","updated":"2024-04-04T07:56:59Z","published":"2023-08-05T18:32:49Z","title":"Bias Behind the Wheel: Fairness Analysis of Autonomous Driving Systems","summary":" This paper analyzes fairness in automated pedestrian detection, a crucial but\nunder-explored issue in autonomous driving systems. We evaluate eight\nstate-of-the-art deep learning-based pedestrian detectors across demographic\ngroups on large-scale real-world datasets. To enable thorough fairness testing,\nwe provide extensive annotations for the datasets, resulting in 8,311 images\nwith 16,070 gender labels, 20,115 age labels, and 3,513 skin tone labels. Our\nfindings reveal significant fairness issues, particularly related to age. The\nundetected proportions for children are 20.14% higher compared to adults.\nFurthermore, we explore how various driving scenarios affect the fairness of\npedestrian detectors. We find that pedestrian detectors demonstrate significant\ngender biases during night time, potentially exacerbating the prevalent\nsocietal issue of female safety concerns during nighttime out. Moreover, we\nobserve that pedestrian detectors can demonstrate both enhanced fairness and\nsuperior performance under specific driving conditions, which challenges the\nfairness-performance trade-off theory widely acknowledged in the fairness\nliterature. We publicly release the code, data, and results to support future\nresearch on fairness in autonomous driving.\n","authors":["Xinyue Li","Zhenpeng Chen","Jie M. 
Zhang","Federica Sarro","Ying Zhang","Xuanzhe Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02935v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03256v1","updated":"2024-04-04T07:26:26Z","published":"2024-04-04T07:26:26Z","title":"Multi Positive Contrastive Learning with Pose-Consistent Generated\n Images","summary":" Model pre-training has become essential in various recognition tasks.\nMeanwhile, with the remarkable advancements in image generation models,\npre-training methods utilizing generated images have also emerged given their\nability to produce unlimited training data. However, while existing methods\nutilizing generated images excel in classification, they fall short in more\npractical tasks, such as human pose estimation. In this paper, we have\nexperimentally demonstrated it and propose the generation of visually distinct\nimages with identical human poses. We then propose a novel multi-positive\ncontrastive learning, which optimally utilize the previously generated images\nto learn structural features of the human body. We term the entire learning\npipeline as GenPoCCL. Despite using only less than 1% amount of data compared\nto current state-of-the-art method, GenPoCCL captures structural features of\nthe human body more effectively, surpassing existing methods in a variety of\nhuman-centric perception tasks.\n","authors":["Sho Inayoshi","Aji Resindra Widya","Satoshi Ozaki","Junji Otsuka","Takeshi Ohashi"],"pdf_url":"https://arxiv.org/pdf/2404.03256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03253v1","updated":"2024-04-04T07:19:31Z","published":"2024-04-04T07:19:31Z","title":"A dataset of primary nasopharyngeal carcinoma MRI with multi-modalities\n segmentation","summary":" Multi-modality magnetic resonance imaging data with various sequences\nfacilitate the early diagnosis, tumor segmentation, and disease staging in the\nmanagement of nasopharyngeal carcinoma (NPC). The lack of publicly available,\ncomprehensive datasets limits advancements in diagnosis, treatment planning,\nand the development of machine learning algorithms for NPC. Addressing this\ncritical need, we introduce the first comprehensive NPC MRI dataset,\nencompassing MR axial imaging of 277 primary NPC patients. This dataset\nincludes T1-weighted, T2-weighted, and contrast-enhanced T1-weighted sequences,\ntotaling 831 scans. In addition to the corresponding clinical data, manually\nannotated and labeled segmentations by experienced radiologists offer\nhigh-quality data resources from untreated primary NPC.\n","authors":["Yin Li","Qi Chen","Kai Wang","Meige Li","Liping Si","Yingwei Guo","Yu Xiong","Qixing Wang","Yang Qin","Ling Xu","Patrick van der Smagt","Jun Tang","Nutan Chen"],"pdf_url":"https://arxiv.org/pdf/2404.03253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03251v1","updated":"2024-04-04T07:14:12Z","published":"2024-04-04T07:14:12Z","title":"Real-time Noise Source Estimation of a Camera System from an Image and\n Metadata","summary":" Autonomous machines must self-maintain proper functionality to ensure the\nsafety of humans and themselves. This pertains particularly to its cameras as\npredominant sensors to perceive the environment and support actions. A\nfundamental camera problem addressed in this study is noise. Solutions often\nfocus on denoising images a posteriori, that is, fighting symptoms rather than\nroot causes. However, tackling root causes requires identifying the noise\nsources, considering the limitations of mobile platforms. 
This work\ninvestigates a real-time, memory-efficient and reliable noise source estimator\nthat combines data- and physically-based models. To this end, a DNN that\nexamines an image with camera metadata for major camera noise sources is built\nand trained. In addition, it quantifies unexpected factors that impact image\nnoise or metadata. This study investigates seven different estimators on six\ndatasets that include synthetic noise, real-world noise from two camera\nsystems, and real field campaigns. For these, only the model with most metadata\nis capable to accurately and robustly quantify all individual noise\ncontributions. This method outperforms total image noise estimators and can be\nplug-and-play deployed. It also serves as a basis to include more advanced\nnoise sources, or as part of an automatic countermeasure feedback-loop to\napproach fully reliable machines.\n","authors":["Maik Wischow","Patrick Irmisch","Anko Boerner","Guillermo Gallego"],"pdf_url":"https://arxiv.org/pdf/2404.03251v1.pdf","comment":"16 pages, 16 figures, 12 tables, Project page:\n https://github.com/MaikWischow/Noise-Source-Estimation"},{"id":"http://arxiv.org/abs/2404.03248v1","updated":"2024-04-04T07:07:34Z","published":"2024-04-04T07:07:34Z","title":"Learning Transferable Negative Prompts for Out-of-Distribution Detection","summary":" Existing prompt learning methods have shown certain capabilities in\nOut-of-Distribution (OOD) detection, but the lack of OOD images in the target\ndataset in their training can lead to mismatches between OOD images and\nIn-Distribution (ID) categories, resulting in a high false positive rate. To\naddress this issue, we introduce a novel OOD detection method, named\n'NegPrompt', to learn a set of negative prompts, each representing a negative\nconnotation of a given class label, for delineating the boundaries between ID\nand OOD images. It learns such negative prompts with ID data only, without any\nreliance on external outlier data. Further, current methods assume the\navailability of samples of all ID classes, rendering them ineffective in\nopen-vocabulary learning scenarios where the inference stage can contain novel\nID classes not present during training. In contrast, our learned negative\nprompts are transferable to novel class labels. Experiments on various ImageNet\nbenchmarks show that NegPrompt surpasses state-of-the-art prompt-learning-based\nOOD detection methods and maintains a consistent lead in hard OOD detection in\nclosed- and open-vocabulary classification scenarios. Code is available at\nhttps://github.com/mala-lab/negprompt.\n","authors":["Tianqi Li","Guansong Pang","Xiao Bai","Wenjun Miao","Jin Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.03248v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2306.10482v2","updated":"2024-04-04T07:00:56Z","published":"2023-06-18T05:37:38Z","title":"Weighted structure tensor total variation for image denoising","summary":" For image denoising problems, the structure tensor total variation\n(STV)-based models show good performances when compared with other competing\nregularization approaches. However, the STV regularizer does not couple the\nlocal information of the image and may not maintain the image details.\nTherefore, we employ the anisotropic weighted matrix introduced in the\nanisotropic total variation (ATV) model to improve the STV model. 
By applying\nthe weighted matrix to the discrete gradient of the patch-based Jacobian\noperator in STV, our proposed weighted STV (WSTV) model can effectively capture\nlocal information from images and maintain their details during the denoising\nprocess. The optimization problem in the model is solved by a fast first-order\ngradient projection algorithm with a complexity result of $O(1 / i^2)$. For\nimages with different Gaussian noise levels, the experimental results\ndemonstrate that the WSTV model can effectively improve the quality of restored\nimages compared to other TV and STV-based models.\n","authors":["Xiuhan Sheng","Lijuan Yang","Jingya Chang"],"pdf_url":"https://arxiv.org/pdf/2306.10482v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03242v1","updated":"2024-04-04T06:58:39Z","published":"2024-04-04T06:58:39Z","title":"Would Deep Generative Models Amplify Bias in Future Models?","summary":" We investigate the impact of deep generative models on potential social\nbiases in upcoming computer vision models. As the internet witnesses an\nincreasing influx of AI-generated images, concerns arise regarding inherent\nbiases that may accompany them, potentially leading to the dissemination of\nharmful content. This paper explores whether a detrimental feedback loop,\nresulting in bias amplification, would occur if generated images were used as\nthe training data for future models. We conduct simulations by progressively\nsubstituting original images in COCO and CC3M datasets with images generated\nthrough Stable Diffusion. The modified datasets are used to train OpenCLIP and\nimage captioning models, which we evaluate in terms of quality and bias.\nContrary to expectations, our findings indicate that introducing generated\nimages during training does not uniformly amplify bias. Instead, instances of\nbias mitigation across specific tasks are observed. We further explore the\nfactors that may influence these phenomena, such as artifacts in image\ngeneration (e.g., blurry faces) or pre-existing biases in the original\ndatasets.\n","authors":["Tianwei Chen","Yusuke Hirota","Mayu Otani","Noa Garcia","Yuta Nakashima"],"pdf_url":"https://arxiv.org/pdf/2404.03242v1.pdf","comment":"This paper has been accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2401.12433v2","updated":"2024-04-04T06:57:39Z","published":"2024-01-23T01:52:49Z","title":"A Novel Garment Transfer Method Supervised by Distilled Knowledge of\n Virtual Try-on Model","summary":" This paper proposes a novel garment transfer method supervised with knowledge\ndistillation from virtual try-on. Our method first reasons the transfer parsing\nto provide shape prior to downstream tasks. We employ a multi-phase teaching\nstrategy to supervise the training of the transfer parsing reasoning model,\nlearning the response and feature knowledge from the try-on parsing reasoning\nmodel. To correct the teaching error, it transfers the garment back to its\nowner to absorb the hard knowledge in the self-study phase. Guided by the\ntransfer parsing, we adjust the position of the transferred garment via STN to\nprevent distortion. Afterward, we estimate a progressive flow to precisely warp\nthe garment with shape and content correspondences. To ensure warping\nrationality, we supervise the training of the garment warping model using\ntarget shape and warping knowledge from virtual try-on. 
To better preserve body\nfeatures in the transfer result, we propose a well-designed training strategy\nfor the arm regrowth task to infer new exposure skin. Experiments demonstrate\nthat our method has state-of-the-art performance compared with other virtual\ntry-on and garment transfer methods in garment transfer, especially for\npreserving garment texture and body features.\n","authors":["Naiyu Fang","Lemiao Qiu","Shuyou Zhang","Zili Wang","Kerui Hu","Jianrong Tan"],"pdf_url":"https://arxiv.org/pdf/2401.12433v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06395v2","updated":"2024-04-04T06:46:42Z","published":"2024-01-12T06:28:54Z","title":"ModaVerse: Efficiently Transforming Modalities with LLMs","summary":" Humans possess the capability to comprehend diverse modalities and seamlessly\ntransfer information between them. In this work, we introduce ModaVerse, a\nMulti-modal Large Language Model (MLLM) capable of comprehending and\ntransforming content across various modalities including images, videos, and\naudio. Predominant MLLM frameworks have largely relied on the alignment of\nlatent spaces of textual and non-textual features. This alignment process,\nwhich synchronizes a language model trained on textual data with encoders and\ndecoders trained on multi-modal data, often necessitates extensive training of\nseveral projection layers in multiple stages. Inspired by LLM-as-agent\nmethodologies, we propose a novel Input/Output (I/O) alignment mechanism that\noperates directly at the level of natural language. It aligns the LLM's output\nwith the input of generative models, avoiding the complexities associated with\nlatent feature alignments, and simplifying the multiple training stages of\nexisting MLLMs into a single, efficient process. This conceptual advancement\nleads to significant reductions in both data and computational costs. By\nconducting experiments on several benchmarks, we demonstrate that our approach\nattains comparable performance with the state of the art while achieving\nconsiderable efficiencies in data usage and training duration.\n","authors":["Xinyu Wang","Bohan Zhuang","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2401.06395v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.03225v1","updated":"2024-04-04T06:20:22Z","published":"2024-04-04T06:20:22Z","title":"FACTUAL: A Novel Framework for Contrastive Learning Based Robust SAR\n Image Classification","summary":" Deep Learning (DL) Models for Synthetic Aperture Radar (SAR) Automatic Target\nRecognition (ATR), while delivering improved performance, have been shown to be\nquite vulnerable to adversarial attacks. Existing works improve robustness by\ntraining models on adversarial samples. However, by focusing mostly on attacks\nthat manipulate images randomly, they neglect the real-world feasibility of\nsuch attacks. In this paper, we propose FACTUAL, a novel Contrastive Learning\nframework for Adversarial Training and robust SAR classification. FACTUAL\nconsists of two components: (1) Differing from existing works, a novel\nperturbation scheme that incorporates realistic physical adversarial attacks\n(such as OTSA) to build a supervised adversarial pre-training network. This\nnetwork utilizes class labels for clustering clean and perturbed images\ntogether into a more informative feature space. (2) A linear classifier\ncascaded after the encoder to use the computed representations to predict the\ntarget labels. 
By pre-training and fine-tuning our model on both clean and\nadversarial samples, we show that our model achieves high prediction accuracy\non both cases. Our model achieves 99.7% accuracy on clean samples, and 89.6% on\nperturbed samples, both outperforming previous state-of-the-art methods.\n","authors":["Xu Wang","Tian Ye","Rajgopal Kannan","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/2404.03225v1.pdf","comment":"2024 IEEE Radar Conference"},{"id":"http://arxiv.org/abs/2404.03219v1","updated":"2024-04-04T05:54:19Z","published":"2024-04-04T05:54:19Z","title":"iSeg: Interactive 3D Segmentation via Interactive Attention","summary":" We present iSeg, a new interactive technique for segmenting 3D shapes.\nPrevious works have focused mainly on leveraging pre-trained 2D foundation\nmodels for 3D segmentation based on text. However, text may be insufficient for\naccurately describing fine-grained spatial segmentations. Moreover, achieving a\nconsistent 3D segmentation using a 2D model is challenging since occluded areas\nof the same semantic region may not be visible together from any 2D view. Thus,\nwe design a segmentation method conditioned on fine user clicks, which operates\nentirely in 3D. Our system accepts user clicks directly on the shape's surface,\nindicating the inclusion or exclusion of regions from the desired shape\npartition. To accommodate various click settings, we propose a novel\ninteractive attention module capable of processing different numbers and types\nof clicks, enabling the training of a single unified interactive segmentation\nmodel. We apply iSeg to a myriad of shapes from different domains,\ndemonstrating its versatility and faithfulness to the user's specifications.\nOur project page is at https://threedle.github.io/iSeg/.\n","authors":["Itai Lang","Fei Xu","Dale Decatur","Sudarshan Babu","Rana Hanocka"],"pdf_url":"https://arxiv.org/pdf/2404.03219v1.pdf","comment":"Project page: https://threedle.github.io/iSeg/"},{"id":"http://arxiv.org/abs/2404.03214v1","updated":"2024-04-04T05:39:09Z","published":"2024-04-04T05:39:09Z","title":"LeGrad: An Explainability Method for Vision Transformers via Feature\n Formation Sensitivity","summary":" Vision Transformers (ViTs), with their ability to model long-range\ndependencies through self-attention mechanisms, have become a standard\narchitecture in computer vision. However, the interpretability of these models\nremains a challenge. To address this, we propose LeGrad, an explainability\nmethod specifically designed for ViTs. LeGrad computes the gradient with\nrespect to the attention maps of ViT layers, considering the gradient itself as\nthe explainability signal. We aggregate the signal over all layers, combining\nthe activations of the last as well as intermediate tokens to produce the\nmerged explainability map. This makes LeGrad a conceptually simple and an\neasy-to-implement tool for enhancing the transparency of ViTs. We evaluate\nLeGrad in challenging segmentation, perturbation, and open-vocabulary settings,\nshowcasing its versatility compared to other SotA explainability methods\ndemonstrating its superior spatial fidelity and robustness to perturbations. 
A\ndemo and the code is available at https://github.com/WalBouss/LeGrad.\n","authors":["Walid Bousselham","Angie Boggust","Sofian Chaybouti","Hendrik Strobelt","Hilde Kuehne"],"pdf_url":"https://arxiv.org/pdf/2404.03214v1.pdf","comment":"Code available at https://github.com/WalBouss/LeGrad"},{"id":"http://arxiv.org/abs/2404.03210v1","updated":"2024-04-04T05:33:06Z","published":"2024-04-04T05:33:06Z","title":"HDR Imaging for Dynamic Scenes with Events","summary":" High dynamic range imaging (HDRI) for real-world dynamic scenes is\nchallenging because moving objects may lead to hybrid degradation of low\ndynamic range and motion blur. Existing event-based approaches only focus on a\nseparate task, while cascading HDRI and motion deblurring would lead to\nsub-optimal solutions, and unavailable ground-truth sharp HDR images aggravate\nthe predicament. To address these challenges, we propose an Event-based HDRI\nframework within a Self-supervised learning paradigm, i.e., Self-EHDRI, which\ngeneralizes HDRI performance in real-world dynamic scenarios. Specifically, a\nself-supervised learning strategy is carried out by learning cross-domain\nconversions from blurry LDR images to sharp LDR images, which enables sharp HDR\nimages to be accessible in the intermediate process even though ground-truth\nsharp HDR images are missing. Then, we formulate the event-based HDRI and\nmotion deblurring model and conduct a unified network to recover the\nintermediate sharp HDR results, where both the high dynamic range and high\ntemporal resolution of events are leveraged simultaneously for compensation. We\nconstruct large-scale synthetic and real-world datasets to evaluate the\neffectiveness of our method. Comprehensive experiments demonstrate that the\nproposed Self-EHDRI outperforms state-of-the-art approaches by a large margin.\nThe codes, datasets, and results are available at\nhttps://lxp-whu.github.io/Self-EHDRI.\n","authors":["Li Xiaopeng","Zeng Zhaoyuan","Fan Cien","Zhao Chen","Deng Lei","Yu Lei"],"pdf_url":"https://arxiv.org/pdf/2404.03210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03202v1","updated":"2024-04-04T05:10:26Z","published":"2024-04-04T05:10:26Z","title":"OmniGS: Omnidirectional Gaussian Splatting for Fast Radiance Field\n Reconstruction using Omnidirectional Images","summary":" Photorealistic reconstruction relying on 3D Gaussian Splatting has shown\npromising potential in robotics. However, the current 3D Gaussian Splatting\nsystem only supports radiance field reconstruction using undistorted\nperspective images. In this paper, we present OmniGS, a novel omnidirectional\nGaussian splatting system, to take advantage of omnidirectional images for fast\nradiance field reconstruction. Specifically, we conduct a theoretical analysis\nof spherical camera model derivatives in 3D Gaussian Splatting. According to\nthe derivatives, we then implement a new GPU-accelerated omnidirectional\nrasterizer that directly splats 3D Gaussians onto the equirectangular screen\nspace for omnidirectional image rendering. As a result, we realize\ndifferentiable optimization of the radiance field without the requirement of\ncube-map rectification or tangent-plane approximation. Extensive experiments\nconducted in egocentric and roaming scenarios demonstrate that our method\nachieves state-of-the-art reconstruction quality and high rendering speed using\nomnidirectional images. 
To benefit the research community, the code will be\nmade publicly available once the paper is published.\n","authors":["Longwei Li","Huajian Huang","Sai-Kit Yeung","Hui Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.03202v1.pdf","comment":"IROS 2024 submission, 7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.03200v1","updated":"2024-04-04T05:08:51Z","published":"2024-04-04T05:08:51Z","title":"Future-Proofing Class Incremental Learning","summary":" Exemplar-Free Class Incremental Learning is a highly challenging setting\nwhere replay memory is unavailable. Methods relying on frozen feature\nextractors have drawn attention recently in this setting due to their\nimpressive performances and lower computational costs. However, those methods\nare highly dependent on the data used to train the feature extractor and may\nstruggle when an insufficient amount of classes are available during the first\nincremental step. To overcome this limitation, we propose to use a pre-trained\ntext-to-image diffusion model in order to generate synthetic images of future\nclasses and use them to train the feature extractor. Experiments on the\nstandard benchmarks CIFAR100 and ImageNet-Subset demonstrate that our proposed\nmethod can be used to improve state-of-the-art methods for exemplar-free class\nincremental learning, especially in the most difficult settings where the first\nincremental step only contains few classes. Moreover, we show that using\nsynthetic samples of future classes achieves higher performance than using real\ndata from different classes, paving the way for better and less costly\npre-training methods for incremental learning.\n","authors":["Quentin Jodelet","Xin Liu","Yin Jun Phua","Tsuyoshi Murata"],"pdf_url":"https://arxiv.org/pdf/2404.03200v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.09934v7","updated":"2024-04-04T04:52:43Z","published":"2022-07-20T14:20:35Z","title":"DeepIPC: Deeply Integrated Perception and Control for an Autonomous\n Vehicle in Real Environments","summary":" In this work, we introduce DeepIPC, a novel end-to-end model tailored for\nautonomous driving, which seamlessly integrates perception and control tasks.\nUnlike traditional models that handle these tasks separately, DeepIPC\ninnovatively combines a perception module, which processes RGBD images for\nsemantic segmentation and generates bird's eye view (BEV) mappings, with a\ncontroller module that utilizes these insights along with GNSS and angular\nspeed measurements to accurately predict navigational waypoints. This\nintegration allows DeepIPC to efficiently translate complex environmental data\ninto actionable driving commands. Our comprehensive evaluation demonstrates\nDeepIPC's superior performance in terms of drivability and multi-task\nefficiency across diverse real-world scenarios, setting a new benchmark for\nend-to-end autonomous driving systems with a leaner model architecture. The\nexperimental results underscore DeepIPC's potential to significantly enhance\nautonomous vehicular navigation, promising a step forward in the development of\nautonomous driving technologies. 
For further insights and replication, we will\nmake our code and datasets available at https://github.com/oskarnatan/DeepIPC.\n","authors":["Oskar Natan","Jun Miura"],"pdf_url":"https://arxiv.org/pdf/2207.09934v7.pdf","comment":"Accepted for Publication in IEEE Access"},{"id":"http://arxiv.org/abs/2404.02388v2","updated":"2024-04-04T04:23:10Z","published":"2024-04-03T01:13:05Z","title":"CAPE: CAM as a Probabilistic Ensemble for Enhanced DNN Interpretation","summary":" Deep Neural Networks (DNNs) are widely used for visual classification tasks,\nbut their complex computation process and black-box nature hinder decision\ntransparency and interpretability. Class activation maps (CAMs) and recent\nvariants provide ways to visually explain the DNN decision-making process by\ndisplaying 'attention' heatmaps of the DNNs. Nevertheless, the CAM explanation\nonly offers relative attention information, that is, on an attention heatmap,\nwe can interpret which image region is more or less important than the others.\nHowever, these regions cannot be meaningfully compared across classes, and the\ncontribution of each region to the model's class prediction is not revealed. To\naddress these challenges that ultimately lead to better DNN Interpretation, in\nthis paper, we propose CAPE, a novel reformulation of CAM that provides a\nunified and probabilistically meaningful assessment of the contributions of\nimage regions. We quantitatively and qualitatively compare CAPE with\nstate-of-the-art CAM methods on CUB and ImageNet benchmark datasets to\ndemonstrate enhanced interpretability. We also test on a cytology imaging\ndataset depicting a challenging Chronic Myelomonocytic Leukemia (CMML)\ndiagnosis problem. Code is available at: https://github.com/AIML-MED/CAPE.\n","authors":["Townim Faisal Chowdhury","Kewen Liao","Vu Minh Hieu Phan","Minh-Son To","Yutong Xie","Kevin Hung","David Ross","Anton van den Hengel","Johan W. Verjans","Zhibin Liao"],"pdf_url":"https://arxiv.org/pdf/2404.02388v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03191v1","updated":"2024-04-04T04:22:50Z","published":"2024-04-04T04:22:50Z","title":"CORP: A Multi-Modal Dataset for Campus-Oriented Roadside Perception\n Tasks","summary":" Numerous roadside perception datasets have been introduced to propel\nadvancements in autonomous driving and intelligent transportation systems\nresearch and development. However, it has been observed that the majority of\ntheir concentrates is on urban arterial roads, inadvertently overlooking\nresidential areas such as parks and campuses that exhibit entirely distinct\ncharacteristics. In light of this gap, we propose CORP, which stands as the\nfirst public benchmark dataset tailored for multi-modal roadside perception\ntasks under campus scenarios. Collected in a university campus, CORP consists\nof over 205k images plus 102k point clouds captured from 18 cameras and 9 LiDAR\nsensors. These sensors with different configurations are mounted on roadside\nutility poles to provide diverse viewpoints within the campus region. The\nannotations of CORP encompass multi-dimensional information beyond 2D and 3D\nbounding boxes, providing extra support for 3D seamless tracking and instance\nsegmentation with unique IDs and pixel masks for identifying targets, to\nenhance the understanding of objects and their behaviors distributed across the\ncampus premises. 
Unlike other roadside datasets about urban traffic, CORP\nextends the spectrum to highlight the challenges for multi-modal perception in\ncampuses and other residential areas.\n","authors":["Beibei Wang","Lu Zhang","Shuang Meng","Chenjie Wang","Jingjing Huang","Yao Li","Haojie Ren","Yuxuan Xiao","Yuru Peng","Jianmin Ji","Yu Zhang","Yanyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.03191v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03190v1","updated":"2024-04-04T04:22:25Z","published":"2024-04-04T04:22:25Z","title":"Adaptive Discrete Disparity Volume for Self-supervised Monocular Depth\n Estimation","summary":" In self-supervised monocular depth estimation tasks, discrete disparity\nprediction has been proven to attain higher quality depth maps than common\ncontinuous methods. However, current discretization strategies often divide\ndepth ranges of scenes into bins in a handcrafted and rigid manner, limiting\nmodel performance. In this paper, we propose a learnable module, Adaptive\nDiscrete Disparity Volume (ADDV), which is capable of dynamically sensing depth\ndistributions in different RGB images and generating adaptive bins for them.\nWithout any extra supervision, this module can be integrated into existing CNN\narchitectures, allowing networks to produce representative values for bins and\na probability volume over them. Furthermore, we introduce novel training\nstrategies - uniformizing and sharpening - through a loss term and temperature\nparameter, respectively, to provide regularizations under self-supervised\nconditions, preventing model degradation or collapse. Empirical results\ndemonstrate that ADDV effectively processes global information, generating\nappropriate bins for various scenes and producing higher quality depth maps\ncompared to handcrafted methods.\n","authors":["Jianwei Ren"],"pdf_url":"https://arxiv.org/pdf/2404.03190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03188v1","updated":"2024-04-04T04:16:31Z","published":"2024-04-04T04:16:31Z","title":"Classification of Nasopharyngeal Cases using DenseNet Deep Learning\n Architecture","summary":" Nasopharyngeal carcinoma (NPC) is one of the understudied yet deadliest\ncancers in South East Asia. In Malaysia, the prevalence is identified mainly in\nSarawak, among the ethnic of Bidayuh. NPC is often late-diagnosed because it is\nasymptomatic at the early stage. There are several tissue representations from\nthe nasopharynx biopsy, such as nasopharyngeal inflammation (NPI), lymphoid\nhyperplasia (LHP), nasopharyngeal carcinoma (NPC) and normal tissue. This paper\nis our first initiative to identify the difference between NPC, NPI and normal\ncases. Seven whole slide images (WSIs) with gigapixel resolutions from seven\ndifferent patients and two hospitals were experimented with using two test\nsetups, consisting of a different set of images. The tissue regions are patched\ninto smaller blocks and classified using DenseNet architecture with 21 dense\nlayers. Two tests are carried out, each for proof of concept (Test 1) and\nreal-test scenario (Test 2). The accuracy achieved for NPC class is 94.8% for\nTest 1 and 67.0% for Test 2.\n","authors":["W. S. H. M. W. Ahmad","M. F. A. Fauzi","M. K. Abdullahi","Jenny T. H. Lee","N. S. A. Basry","A Yahaya","A. M. Ismail","A. Adam","Elaine W. L. Chan","F. S. 
Abas"],"pdf_url":"https://arxiv.org/pdf/2404.03188v1.pdf","comment":"This article has been accepted in the Journal of Engineering Science\n and Technology (JESTEC) and awaiting publication"},{"id":"http://arxiv.org/abs/2404.03187v1","updated":"2024-04-04T04:12:30Z","published":"2024-04-04T04:12:30Z","title":"AGL-NET: Aerial-Ground Cross-Modal Global Localization with Varying\n Scales","summary":" We present AGL-NET, a novel learning-based method for global localization\nusing LiDAR point clouds and satellite maps. AGL-NET tackles two critical\nchallenges: bridging the representation gap between image and points modalities\nfor robust feature matching, and handling inherent scale discrepancies between\nglobal view and local view. To address these challenges, AGL-NET leverages a\nunified network architecture with a novel two-stage matching design. The first\nstage extracts informative neural features directly from raw sensor data and\nperforms initial feature matching. The second stage refines this matching\nprocess by extracting informative skeleton features and incorporating a novel\nscale alignment step to rectify scale variations between LiDAR and map data.\nFurthermore, a novel scale and skeleton loss function guides the network toward\nlearning scale-invariant feature representations, eliminating the need for\npre-processing satellite maps. This significantly improves real-world\napplicability in scenarios with unknown map scales. To facilitate rigorous\nperformance evaluation, we introduce a meticulously designed dataset within the\nCARLA simulator specifically tailored for metric localization training and\nassessment. The code and dataset will be made publicly available.\n","authors":["Tianrui Guan","Ruiqi Xian","Xijun Wang","Xiyang Wu","Mohamed Elnoor","Daeun Song","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2404.03187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12301v3","updated":"2024-04-04T04:11:05Z","published":"2023-07-23T11:50:27Z","title":"Image Outlier Detection Without Training using RANSAC","summary":" Image outlier detection (OD) is an essential tool to ensure the quality of\nimages used in computer vision tasks. Existing algorithms often involve\ntraining a model to represent the inlier distribution, and outliers are\ndetermined by some deviation measure. Although existing methods proved\neffective when trained on strictly inlier samples, their performance remains\nquestionable when undesired outliers are included during training. As a result\nof this limitation, it is necessary to carefully examine the data when\ndeveloping OD models for new domains. In this work, we present a novel image OD\nalgorithm called RANSAC-NN that eliminates the need of data examination and\nmodel training altogether. Unlike existing approaches, RANSAC-NN can be\ndirectly applied on datasets containing outliers by sampling and comparing\nsubsets of the data. Our algorithm maintains favorable performance compared to\nexisting methods on a range of benchmarks. 
Furthermore, we show that RANSAC-NN\ncan enhance the robustness of existing methods by incorporating our algorithm\nas part of the data preparation process.\n","authors":["Chen-Han Tsai","Yu-Shao Peng"],"pdf_url":"https://arxiv.org/pdf/2307.12301v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06647v3","updated":"2024-04-04T04:07:48Z","published":"2023-07-13T09:23:21Z","title":"DeepIPCv2: LiDAR-powered Robust Environmental Perception and\n Navigational Control for Autonomous Vehicle","summary":" We present DeepIPCv2, an autonomous driving model that perceives the\nenvironment using a LiDAR sensor for more robust drivability, especially when\ndriving under poor illumination conditions where everything is not clearly\nvisible. DeepIPCv2 takes a set of LiDAR point clouds as the main perception\ninput. Since point clouds are not affected by illumination changes, they can\nprovide a clear observation of the surroundings no matter what the condition\nis. This results in a better scene understanding and stable features provided\nby the perception module to support the controller module in estimating\nnavigational control properly. To evaluate its performance, we conduct several\ntests by deploying the model to predict a set of driving records and perform\nreal automated driving under three different conditions. We also conduct\nablation and comparative studies with some recent models to justify its\nperformance. Based on the experimental results, DeepIPCv2 shows a robust\nperformance by achieving the best drivability in all driving scenarios.\nFurthermore, to support future research, we will upload the codes and data to\nhttps://github.com/oskarnatan/DeepIPCv2.\n","authors":["Oskar Natan","Jun Miura"],"pdf_url":"https://arxiv.org/pdf/2307.06647v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03183v1","updated":"2024-04-04T03:45:17Z","published":"2024-04-04T03:45:17Z","title":"BodyMAP -- Jointly Predicting Body Mesh and 3D Applied Pressure Map for\n People in Bed","summary":" Accurately predicting the 3D human posture and the pressure exerted on the\nbody for people resting in bed, visualized as a body mesh (3D pose & shape)\nwith a 3D pressure map, holds significant promise for healthcare applications,\nparticularly, in the prevention of pressure ulcers. Current methods focus on\nsingular facets of the problem -- predicting only 2D/3D poses, generating 2D\npressure images, predicting pressure only for certain body regions instead of\nthe full body, or forming indirect approximations to the 3D pressure map. In\ncontrast, we introduce BodyMAP, which jointly predicts the human body mesh and\n3D applied pressure map across the entire human body. Our network leverages\nmultiple visual modalities, incorporating both a depth image of a person in bed\nand its corresponding 2D pressure image acquired from a pressure-sensing\nmattress. The 3D pressure map is represented as a pressure value at each mesh\nvertex and thus allows for precise localization of high-pressure regions on the\nbody. Additionally, we present BodyMAP-WS, a new formulation of pressure\nprediction in which we implicitly learn pressure in 3D by aligning sensed 2D\npressure images with a differentiable 2D projection of the predicted 3D\npressure maps. In evaluations with real-world human data, our method\noutperforms the current state-of-the-art technique by 25% on both body mesh and\n3D applied pressure map prediction tasks for people in bed.\n","authors":["Abhishek Tandon","Anujraaj Goyal","Henry M. 
Clever","Zackory Erickson"],"pdf_url":"https://arxiv.org/pdf/2404.03183v1.pdf","comment":"Accepted at CVPR 2024 Project Website: https://bodymap3d.github.io/\n Code: https://github.com/RCHI-Lab/BodyMAP"},{"id":"http://arxiv.org/abs/2404.03181v1","updated":"2024-04-04T03:30:49Z","published":"2024-04-04T03:30:49Z","title":"MonoCD: Monocular 3D Object Detection with Complementary Depths","summary":" Monocular 3D object detection has attracted widespread attention due to its\npotential to accurately obtain object 3D localization from a single image at a\nlow cost. Depth estimation is an essential but challenging subtask of monocular\n3D object detection due to the ill-posedness of 2D to 3D mapping. Many methods\nexplore multiple local depth clues such as object heights and keypoints and\nthen formulate the object depth estimation as an ensemble of multiple depth\npredictions to mitigate the insufficiency of single-depth information. However,\nthe errors of existing multiple depths tend to have the same sign, which\nhinders them from neutralizing each other and limits the overall accuracy of\ncombined depth. To alleviate this problem, we propose to increase the\ncomplementarity of depths with two novel designs. First, we add a new depth\nprediction branch named complementary depth that utilizes global and efficient\ndepth clues from the entire image rather than the local clues to reduce the\ncorrelation of depth predictions. Second, we propose to fully exploit the\ngeometric relations between multiple depth clues to achieve complementarity in\nform. Benefiting from these designs, our method achieves higher\ncomplementarity. Experiments on the KITTI benchmark demonstrate that our method\nachieves state-of-the-art performance without introducing extra data. In\naddition, complementary depth can also be a lightweight and plug-and-play\nmodule to boost multiple existing monocular 3d object detectors. Code is\navailable at https://github.com/elvintanhust/MonoCD.\n","authors":["Longfei Yan","Pei Yan","Shengzhou Xiong","Xuanyu Xiang","Yihua Tan"],"pdf_url":"https://arxiv.org/pdf/2404.03181v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03179v1","updated":"2024-04-04T03:28:57Z","published":"2024-04-04T03:28:57Z","title":"UniAV: Unified Audio-Visual Perception for Multi-Task Video Localization","summary":" Video localization tasks aim to temporally locate specific instances in\nvideos, including temporal action localization (TAL), sound event detection\n(SED) and audio-visual event localization (AVEL). Existing methods\nover-specialize on each task, overlooking the fact that these instances often\noccur in the same video to form the complete video content. In this work, we\npresent UniAV, a Unified Audio-Visual perception network, to achieve joint\nlearning of TAL, SED and AVEL tasks for the first time. UniAV can leverage\ndiverse data available in task-specific datasets, allowing the model to learn\nand share mutually beneficial knowledge across tasks and modalities. To tackle\nthe challenges posed by substantial variations in datasets\n(size/domain/duration) and distinct task characteristics, we propose to\nuniformly encode visual and audio modalities of all videos to derive generic\nrepresentations, while also designing task-specific experts to capture unique\nknowledge for each task. 
Besides, we develop a unified language-aware\nclassifier by utilizing a pre-trained text encoder, enabling the model to\nflexibly detect various types of instances and previously unseen ones by simply\nchanging prompts during inference. UniAV outperforms its single-task\ncounterparts by a large margin with fewer parameters, achieving on-par or\nsuperior performances compared to state-of-the-art task-specific methods across\nActivityNet 1.3, DESED and UnAV-100 benchmarks.\n","authors":["Tiantian Geng","Teng Wang","Yanfu Zhang","Jinming Duan","Weili Guan","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.03179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02232v2","updated":"2024-04-04T03:10:58Z","published":"2023-12-04T06:37:11Z","title":"HumanNeRF-SE: A Simple yet Effective Approach to Animate HumanNeRF with\n Diverse Poses","summary":" We present HumanNeRF-SE, a simple yet effective method that synthesizes\ndiverse novel pose images with simple input. Previous HumanNeRF works require a\nlarge number of optimizable parameters to fit the human images. Instead, we\nreload these approaches by combining explicit and implicit human\nrepresentations to design both generalized rigid deformation and specific\nnon-rigid deformation. Our key insight is that explicit shape can reduce the\nsampling points used to fit implicit representation, and frozen blending\nweights from SMPL constructing a generalized rigid deformation can effectively\navoid overfitting and improve pose generalization performance. Our architecture\ninvolving both explicit and implicit representation is simple yet effective.\nExperiments demonstrate our model can synthesize images under arbitrary poses\nwith few-shot input and increase the speed of synthesizing images by 15 times\nthrough a reduction in computational complexity without using any existing\nacceleration modules. Compared to the state-of-the-art HumanNeRF studies,\nHumanNeRF-SE achieves better performance with fewer learnable parameters and\nless training time.\n","authors":["Caoyuan Ma","Yu-Lun Liu","Zhixiang Wang","Wu Liu","Xinchen Liu","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02232v2.pdf","comment":"16pages, 17 figures, 10 tables"},{"id":"http://arxiv.org/abs/2404.02405v2","updated":"2024-04-04T02:56:00Z","published":"2024-04-03T02:16:30Z","title":"TE-TAD: Towards Full End-to-End Temporal Action Detection via\n Time-Aligned Coordinate Expression","summary":" In this paper, we investigate that the normalized coordinate expression is a\nkey factor as reliance on hand-crafted components in query-based detectors for\ntemporal action detection (TAD). Despite significant advancements towards an\nend-to-end framework in object detection, query-based detectors have been\nlimited in achieving full end-to-end modeling in TAD. To address this issue, we\npropose \\modelname{}, a full end-to-end temporal action detection transformer\nthat integrates time-aligned coordinate expression. We reformulate coordinate\nexpression utilizing actual timeline values, ensuring length-invariant\nrepresentations from the extremely diverse video duration environment.\nFurthermore, our proposed adaptive query selection dynamically adjusts the\nnumber of queries based on video length, providing a suitable solution for\nvarying video durations compared to a fixed query set. Our approach not only\nsimplifies the TAD process by eliminating the need for hand-crafted components\nbut also significantly improves the performance of query-based detectors. 
Our\nTE-TAD outperforms the previous query-based detectors and achieves competitive\nperformance compared to state-of-the-art methods on popular benchmark datasets.\nCode is available at: https://github.com/Dotori-HJ/TE-TAD\n","authors":["Ho-Joong Kim","Jung-Ho Hong","Heejo Kong","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2404.02405v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2010.13187v2","updated":"2024-04-04T02:47:09Z","published":"2020-10-25T18:51:15Z","title":"Improving the Reconstruction of Disentangled Representation Learners via\n Multi-Stage Modeling","summary":" Current autoencoder-based disentangled representation learning methods\nachieve disentanglement by penalizing the (aggregate) posterior to encourage\nstatistical independence of the latent factors. This approach introduces a\ntrade-off between disentangled representation learning and reconstruction\nquality since the model does not have enough capacity to learn correlated\nlatent variables that capture detail information present in most image data. To\novercome this trade-off, we present a novel multi-stage modeling approach where\nthe disentangled factors are first learned using a penalty-based disentangled\nrepresentation learning method; then, the low-quality reconstruction is\nimproved with another deep generative model that is trained to model the\nmissing correlated latent variables, adding detail information while\nmaintaining conditioning on the previously learned disentangled factors. Taken\ntogether, our multi-stage modelling approach results in a single, coherent\nprobabilistic model that is theoretically justified by the principal of\nD-separation and can be realized with a variety of model classes including\nlikelihood-based models such as variational autoencoders, implicit models such\nas generative adversarial networks, and tractable models like normalizing flows\nor mixtures of Gaussians. We demonstrate that our multi-stage model has higher\nreconstruction quality than current state-of-the-art methods with equivalent\ndisentanglement performance across multiple standard benchmarks. In addition,\nwe apply the multi-stage model to generate synthetic tabular datasets,\nshowcasing an enhanced performance over benchmark models across a variety of\nmetrics. The interpretability analysis further indicates that the multi-stage\nmodel can effectively uncover distinct and meaningful features of variations\nfrom which the original distribution can be recovered.\n","authors":["Akash Srivastava","Yamini Bansal","Yukun Ding","Cole Lincoln Hurwitz","Kai Xu","Bernhard Egger","Prasanna Sattigeri","Joshua B. Tenenbaum","Phuong Le","Arun Prakash R","Nengfeng Zhou","Joel Vaughan","Yaquan Wang","Anwesha Bhattacharyya","Kristjan Greenewald","David D. Cox","Dan Gutfreund"],"pdf_url":"https://arxiv.org/pdf/2010.13187v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13739v2","updated":"2024-04-04T02:36:44Z","published":"2023-03-24T01:46:25Z","title":"WM-MoE: Weather-aware Multi-scale Mixture-of-Experts for Blind Adverse\n Weather Removal","summary":" Adverse weather removal tasks like deraining, desnowing, and dehazing are\nusually treated as separate tasks. 
However, in practical autonomous driving\nscenarios, the type, intensity,and mixing degree of weather are unknown, so\nhandling each task separately cannot deal with the complex practical scenarios.\nIn this paper, we study the blind adverse weather removal problem.\nMixture-of-Experts (MoE) is a popular model that adopts a learnable gate to\nroute the input to different expert networks. The principle of MoE involves\nusing adaptive networks to process different types of unknown inputs.\nTherefore, MoE has great potential for blind adverse weather removal. However,\nthe original MoE module is inadequate for coupled multiple weather types and\nfails to utilize multi-scale features for better performance. To this end, we\npropose a method called Weather-aware Multi-scale MoE (WM-MoE) based on\nTransformer for blind weather removal. WM-MoE includes two key designs:\nWEather-Aware Router (WEAR) and Multi-Scale Experts (MSE). WEAR assigns experts\nfor each image token based on decoupled content and weather features, which\nenhances the model's capability to process multiple adverse weathers. To obtain\ndiscriminative weather features from images, we propose Weather Guidance\nFine-grained Contrastive Learning (WGF-CL), which utilizes weather cluster\ninformation to guide the assignment of positive and negative samples for each\nimage token. Since processing different weather types requires different\nreceptive fields, MSE leverages multi-scale features to enhance the spatial\nrelationship modeling capability, facilitating the high-quality restoration of\ndiverse weather types and intensities. Our method achieves state-of-the-art\nperformance in blind adverse weather removal on two public datasets and our\ndataset. We also demonstrate the advantage of our method on downstream\nsegmentation tasks.\n","authors":["Yulin Luo","Rui Zhao","Xiaobao Wei","Jinwei Chen","Yijie Lu","Shenghao Xie","Tianyu Wang","Ruiqin Xiong","Ming Lu","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.13739v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03161v1","updated":"2024-04-04T02:22:37Z","published":"2024-04-04T02:22:37Z","title":"BioVL-QR: Egocentric Biochemical Video-and-Language Dataset Using Micro\n QR Codes","summary":" This paper introduces a biochemical vision-and-language dataset, which\nconsists of 24 egocentric experiment videos, corresponding protocols, and\nvideo-and-language alignments. The key challenge in the wet-lab domain is\ndetecting equipment, reagents, and containers is difficult because the lab\nenvironment is scattered by filling objects on the table and some objects are\nindistinguishable. Therefore, previous studies assume that objects are manually\nannotated and given for downstream tasks, but this is costly and\ntime-consuming. To address this issue, this study focuses on Micro QR Codes to\ndetect objects automatically. From our preliminary study, we found that\ndetecting objects only using Micro QR Codes is still difficult because the\nresearchers manipulate objects, causing blur and occlusion frequently. To\naddress this, we also propose a novel object labeling method by combining a\nMicro QR Code detector and an off-the-shelf hand object detector. 
As one of the\napplications of our dataset, we conduct the task of generating protocols from\nexperiment videos and find that our approach can generate accurate protocols.\n","authors":["Taichi Nishimura","Koki Yamamoto","Yuto Haneji","Keiya Kajimura","Chihiro Nishiwaki","Eriko Daikoku","Natsuko Okuda","Fumihito Ono","Hirotaka Kameko","Shinsuke Mori"],"pdf_url":"https://arxiv.org/pdf/2404.03161v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2404.03159v1","updated":"2024-04-04T02:15:16Z","published":"2024-04-04T02:15:16Z","title":"HandDiff: 3D Hand Pose Estimation with Diffusion on Image-Point Cloud","summary":" Extracting keypoint locations from input hand frames, known as 3D hand pose\nestimation, is a critical task in various human-computer interaction\napplications. Essentially, the 3D hand pose estimation can be regarded as a 3D\npoint subset generative problem conditioned on input frames. Thanks to the\nrecent significant progress on diffusion-based generative models, hand pose\nestimation can also benefit from the diffusion model to estimate keypoint\nlocations with high quality. However, directly deploying the existing diffusion\nmodels to solve hand pose estimation is non-trivial, since they cannot achieve\nthe complex permutation mapping and precise localization. Based on this\nmotivation, this paper proposes HandDiff, a diffusion-based hand pose\nestimation model that iteratively denoises accurate hand pose conditioned on\nhand-shaped image-point clouds. In order to recover keypoint permutation and\naccurate location, we further introduce joint-wise condition and local detail\ncondition. Experimental results demonstrate that the proposed HandDiff\nsignificantly outperforms the existing approaches on four challenging hand pose\nbenchmark datasets. Codes and pre-trained models are publicly available at\nhttps://github.com/cwc1260/HandDiff.\n","authors":["Wencan Cheng","Hao Tang","Luc Van Gool","Jong Hwan Ko"],"pdf_url":"https://arxiv.org/pdf/2404.03159v1.pdf","comment":"Accepted as a conference paper to the Conference on Computer Vision\n and Pattern Recognition (2024)"},{"id":"http://arxiv.org/abs/2404.01518v2","updated":"2024-04-04T02:06:15Z","published":"2024-04-01T22:53:47Z","title":"Temporally Consistent Unbalanced Optimal Transport for Unsupervised\n Action Segmentation","summary":" We propose a novel approach to the action segmentation task for long,\nuntrimmed videos, based on solving an optimal transport problem. By encoding a\ntemporal consistency prior into a Gromov-Wasserstein problem, we are able to\ndecode a temporally consistent segmentation from a noisy affinity/matching cost\nmatrix between video frames and action classes. Unlike previous approaches, our\nmethod does not require knowing the action order for a video to attain temporal\nconsistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can\nbe efficiently solved on GPUs using a few iterations of projected mirror\ndescent. We demonstrate the effectiveness of our method in an unsupervised\nlearning setting, where our method is used to generate pseudo-labels for\nself-training. 
We evaluate our segmentation approach and unsupervised learning\npipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly\ndatasets, yielding state-of-the-art results for the unsupervised video action\nsegmentation task.\n","authors":["Ming Xu","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2404.01518v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03145v1","updated":"2024-04-04T01:39:01Z","published":"2024-04-04T01:39:01Z","title":"DreamWalk: Style Space Exploration using Diffusion Guidance","summary":" Text-conditioned diffusion models can generate impressive images, but fall\nshort when it comes to fine-grained control. Unlike direct-editing tools like\nPhotoshop, text conditioned models require the artist to perform \"prompt\nengineering,\" constructing special text sentences to control the style or\namount of a particular subject present in the output image. Our goal is to\nprovide fine-grained control over the style and substance specified by the\nprompt, for example to adjust the intensity of styles in different regions of\nthe image (Figure 1). Our approach is to decompose the text prompt into\nconceptual elements, and apply a separate guidance term for each element in a\nsingle diffusion process. We introduce guidance scale functions to control when\nin the diffusion process and \\emph{where} in the image to intervene. Since the\nmethod is based solely on adjusting diffusion guidance, it does not require\nfine-tuning or manipulating the internal layers of the diffusion model's neural\nnetwork, and can be used in conjunction with LoRA- or DreamBooth-trained models\n(Figure2). Project page: https://mshu1.github.io/dreamwalk.github.io/\n","authors":["Michelle Shu","Charles Herrmann","Richard Strong Bowen","Forrester Cole","Ramin Zabih"],"pdf_url":"https://arxiv.org/pdf/2404.03145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03144v1","updated":"2024-04-04T01:34:36Z","published":"2024-04-04T01:34:36Z","title":"Diverse and Tailored Image Generation for Zero-shot Multi-label\n Classification","summary":" Recently, zero-shot multi-label classification has garnered considerable\nattention for its capacity to operate predictions on unseen labels without\nhuman annotations. Nevertheless, prevailing approaches often use seen classes\nas imperfect proxies for unseen ones, resulting in suboptimal performance.\nDrawing inspiration from the success of text-to-image generation models in\nproducing realistic images, we propose an innovative solution: generating\nsynthetic data to construct a training set explicitly tailored for proxyless\ntraining on unseen labels. Our approach introduces a novel image generation\nframework that produces multi-label synthetic images of unseen classes for\nclassifier training. To enhance diversity in the generated images, we leverage\na pre-trained large language model to generate diverse prompts. Employing a\npre-trained multi-modal CLIP model as a discriminator, we assess whether the\ngenerated images accurately represent the target classes. This enables\nautomatic filtering of inaccurately generated images, preserving classifier\naccuracy. To refine text prompts for more precise and effective multi-label\nobject generation, we introduce a CLIP score-based discriminative loss to\nfine-tune the text encoder in the diffusion model. 
Additionally, to enhance\nvisual features on the target task while maintaining the generalization of\noriginal features and mitigating catastrophic forgetting resulting from\nfine-tuning the entire visual encoder, we propose a feature fusion module\ninspired by transformer attention mechanisms. This module aids in capturing\nglobal dependencies between multiple objects more effectively. Extensive\nexperimental results validate the effectiveness of our approach, demonstrating\nsignificant improvements over state-of-the-art methods.\n","authors":["Kaixin Zhang","Zhixiang Yuan","Tao Huang"],"pdf_url":"https://arxiv.org/pdf/2404.03144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03138v1","updated":"2024-04-04T01:22:23Z","published":"2024-04-04T01:22:23Z","title":"Discontinuity-preserving Normal Integration with Auxiliary Edges","summary":" Many surface reconstruction methods incorporate normal integration, which is\na process to obtain a depth map from surface gradients. In this process, the\ninput may represent a surface with discontinuities, e.g., due to\nself-occlusion. To reconstruct an accurate depth map from the input normal map,\nhidden surface gradients occurring from the jumps must be handled. To model\nthese jumps correctly, we design a novel discretization scheme for the domain\nof normal integration. Our key idea is to introduce auxiliary edges, which\nbridge between piecewise-smooth patches in the domain so that the magnitude of\nhidden jumps can be explicitly expressed. Using the auxiliary edges, we design\na novel algorithm to optimize the discontinuity and the depth map from the\ninput normal map. Our method optimizes discontinuities by using a combination\nof iterative re-weighted least squares and iterative filtering of the jump\nmagnitudes on auxiliary edges to provide strong sparsity regularization.\nCompared to previous discontinuity-preserving normal integration methods, which\nmodel the magnitudes of jumps only implicitly, our method reconstructs subtle\ndiscontinuities accurately thanks to our explicit representation of jumps\nallowing for strong sparsity regularization.\n","authors":["Hyomin Kim","Yucheol Jung","Seungyong Lee"],"pdf_url":"https://arxiv.org/pdf/2404.03138v1.pdf","comment":"To appear at CVPR 2024. For supplementary video, see\n https://youtu.be/MTTcW5kAOFE"},{"id":"http://arxiv.org/abs/2404.02072v2","updated":"2024-04-04T00:59:51Z","published":"2024-04-02T16:20:02Z","title":"EGTR: Extracting Graph from Transformer for Scene Graph Generation","summary":" Scene Graph Generation (SGG) is a challenging task of detecting objects and\npredicting relationships between objects. After DETR was developed, one-stage\nSGG models based on a one-stage object detector have been actively studied.\nHowever, complex modeling is used to predict the relationship between objects,\nand the inherent relationship between object queries learned in the multi-head\nself-attention of the object detector has been neglected. We propose a\nlightweight one-stage SGG model that extracts the relation graph from the\nvarious relationships learned in the multi-head self-attention layers of the\nDETR decoder. 
By fully utilizing the self-attention by-products, the relation\ngraph can be extracted effectively with a shallow relation extraction head.\nConsidering the dependency of the relation extraction task on the object\ndetection task, we propose a novel relation smoothing technique that adjusts\nthe relation label adaptively according to the quality of the detected objects.\nBy the relation smoothing, the model is trained according to the continuous\ncurriculum that focuses on object detection task at the beginning of training\nand performs multi-task learning as the object detection performance gradually\nimproves. Furthermore, we propose a connectivity prediction task that predicts\nwhether a relation exists between object pairs as an auxiliary task of the\nrelation extraction. We demonstrate the effectiveness and efficiency of our\nmethod for the Visual Genome and Open Image V6 datasets. Our code is publicly\navailable at https://github.com/naver-ai/egtr.\n","authors":["Jinbae Im","JeongYeon Nam","Nokyung Park","Hyungmin Lee","Seunghyun Park"],"pdf_url":"https://arxiv.org/pdf/2404.02072v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03126v1","updated":"2024-04-04T00:28:50Z","published":"2024-04-04T00:28:50Z","title":"GaSpCT: Gaussian Splatting for Novel CT Projection View Synthesis","summary":" We present GaSpCT, a novel view synthesis and 3D scene representation method\nused to generate novel projection views for Computer Tomography (CT) scans. We\nadapt the Gaussian Splatting framework to enable novel view synthesis in CT\nbased on limited sets of 2D image projections and without the need for\nStructure from Motion (SfM) methodologies. Therefore, we reduce the total\nscanning duration and the amount of radiation dose the patient receives during\nthe scan. We adapted the loss function to our use-case by encouraging a\nstronger background and foreground distinction using two sparsity promoting\nregularizers: a beta loss and a total variation (TV) loss. Finally, we\ninitialize the Gaussian locations across the 3D space using a uniform prior\ndistribution of where the brain's positioning would be expected to be within\nthe field of view. We evaluate the performance of our model using brain CT\nscans from the Parkinson's Progression Markers Initiative (PPMI) dataset and\ndemonstrate that the rendered novel views closely match the original projection\nviews of the simulated scan, and have better performance than other implicit 3D\nscene representations methodologies. Furthermore, we empirically observe\nreduced training time compared to neural network based image synthesis for\nsparse-view CT image reconstruction. Finally, the memory requirements of the\nGaussian Splatting representations are reduced by 17% compared to the\nequivalent voxel grid image representations.\n","authors":["Emmanouil Nikolakakis","Utkarsh Gupta","Jonathan Vengosh","Justin Bui","Razvan Marinescu"],"pdf_url":"https://arxiv.org/pdf/2404.03126v1.pdf","comment":"Under Review Process for MICCAI 2024"},{"id":"http://arxiv.org/abs/2203.13856v2","updated":"2024-04-04T00:13:42Z","published":"2022-03-25T18:42:20Z","title":"Robust deep learning for eye fundus images: Bridging real and synthetic\n data for enhancing generalization","summary":" Deep learning applications for assessing medical images are limited because\nthe datasets are often small and imbalanced. 
The use of synthetic data has been\nproposed in the literature, but neither a robust comparison of the different\nmethods nor generalizability has been reported. Our approach integrates a\nretinal image quality assessment model and StyleGAN2 architecture to enhance\nAge-related Macular Degeneration (AMD) detection capabilities and improve\ngeneralizability. This work compares ten different Generative Adversarial\nNetwork (GAN) architectures to generate synthetic eye-fundus images with and\nwithout AMD. We combined subsets of three public databases (iChallenge-AMD,\nODIR-2019, and RIADD) to form a single training and test set. We employed the\nSTARE dataset for external validation, ensuring a comprehensive assessment of\nthe proposed approach. The results show that StyleGAN2 reached the lowest\nFrechet Inception Distance (166.17), and clinicians could not accurately\ndifferentiate between real and synthetic images. ResNet-18 architecture\nobtained the best performance with 85% accuracy and outperformed the two human\nexperts (80% and 75%) in detecting AMD fundus images. The accuracy rates were\n82.8% for the test set and 81.3% for the STARE dataset, demonstrating the\nmodel's generalizability. The proposed methodology for synthetic medical image\ngeneration has been validated for robustness and accuracy, with free access to\nits code for further research and development in this field.\n","authors":["Guilherme C. Oliveira","Gustavo H. Rosa","Daniel C. G. Pedronette","João P. Papa","Himeesh Kumar","Leandro A. Passos","Dinesh Kumar"],"pdf_url":"https://arxiv.org/pdf/2203.13856v2.pdf","comment":"Accepted by the Biomedical Signal Processing and Control"},{"id":"http://arxiv.org/abs/2009.04650v2","updated":"2024-04-04T15:25:22Z","published":"2020-09-10T02:55:27Z","title":"Towards Fine-grained Large Object Segmentation 1st Place Solution to 3D\n AI Challenge 2020 -- Instance Segmentation Track","summary":" This technical report introduces our solutions of Team 'FineGrainedSeg' for\nInstance Segmentation track in 3D AI Challenge 2020. In order to handle\nextremely large objects in 3D-FUTURE, we adopt PointRend as our basic\nframework, which outputs more fine-grained masks compared to HTC and SOLOv2.\nOur final submission is an ensemble of 5 PointRend models, which achieves the\n1st place on both validation and test leaderboards. The code is available at\nhttps://github.com/zehuichen123/3DFuture_ins_seg.\n","authors":["Zehui Chen","Qiaofei Li","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2009.04650v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/1902.11122v5","updated":"2024-04-04T11:34:52Z","published":"2019-02-22T10:09:11Z","title":"Deep Learning in Cardiology","summary":" The medical field is creating large amount of data that physicians are unable\nto decipher and use efficiently. Moreover, rule-based expert systems are\ninefficient in solving complicated medical tasks or for creating insights using\nbig data. Deep learning has emerged as a more accurate and effective technology\nin a wide range of medical problems such as diagnosis, prediction and\nintervention. Deep learning is a representation learning method that consists\nof layers that transform the data non-linearly, thus, revealing hierarchical\nrelationships and structures. In this review we survey deep learning\napplication papers that use structured data, signal and imaging modalities from\ncardiology. 
We discuss the advantages and limitations of applying deep learning\nin cardiology that also apply in medicine in general, while proposing certain\ndirections as the most viable for clinical use.\n","authors":["Paschalis Bizopoulos","Dimitrios Koutsouris"],"pdf_url":"https://arxiv.org/pdf/1902.11122v5.pdf","comment":"27 pages, 2 figures, 10 tables"},{"id":"http://arxiv.org/abs/2404.03836v1","updated":"2024-04-04T23:38:45Z","published":"2024-04-04T23:38:45Z","title":"PARIS3D: Reasoning-based 3D Part Segmentation Using Large Multimodal\n Model","summary":" Recent advancements in 3D perception systems have significantly improved\ntheir ability to perform visual recognition tasks such as segmentation.\nHowever, these systems still heavily rely on explicit human instruction to\nidentify target objects or categories, lacking the capability to actively\nreason and comprehend implicit user intentions. We introduce a novel\nsegmentation task known as reasoning part segmentation for 3D objects, aiming\nto output a segmentation mask based on complex and implicit textual queries\nabout specific parts of a 3D object. To facilitate evaluation and benchmarking,\nwe present a large 3D dataset comprising over 60k instructions paired with\ncorresponding ground-truth part segmentation annotations specifically curated\nfor reasoning-based 3D part segmentation. We propose a model that is capable of\nsegmenting parts of 3D objects based on implicit textual queries and generating\nnatural language explanations corresponding to 3D object segmentation requests.\nExperiments show that our method achieves competitive performance to models\nthat use explicit queries, with the additional abilities to identify part\nconcepts, reason about them, and complement them with world knowledge. Our\nsource code, dataset, and trained models are available at\nhttps://github.com/AmrinKareem/PARIS3D.\n","authors":["Amrin Kareem","Jean Lahoud","Hisham Cholakkal"],"pdf_url":"https://arxiv.org/pdf/2404.03836v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2404.03831v1","updated":"2024-04-04T23:24:14Z","published":"2024-04-04T23:24:14Z","title":"SleepVST: Sleep Staging from Near-Infrared Video Signals using\n Pre-Trained Transformers","summary":" Advances in camera-based physiological monitoring have enabled the robust,\nnon-contact measurement of respiration and the cardiac pulse, which are known\nto be indicative of the sleep stage. This has led to research into camera-based\nsleep monitoring as a promising alternative to \"gold-standard\" polysomnography,\nwhich is cumbersome, expensive to administer, and hence unsuitable for\nlonger-term clinical studies. In this paper, we introduce SleepVST, a\ntransformer model which enables state-of-the-art performance in camera-based\nsleep stage classification (sleep staging). After pre-training on contact\nsensor data, SleepVST outperforms existing methods for cardio-respiratory sleep\nstaging on the SHHS and MESA datasets, achieving total Cohen's kappa scores of\n0.75 and 0.77 respectively. We then show that SleepVST can be successfully\ntransferred to cardio-respiratory waveforms extracted from video, enabling\nfully contact-free sleep staging. Using a video dataset of 50 nights, we\nachieve a total accuracy of 78.8\\% and a Cohen's $\\kappa$ of 0.71 in four-class\nvideo-based sleep staging, setting a new state-of-the-art in the domain.\n","authors":["Jonathan F. 
Carter","João Jorge","Oliver Gibson","Lionel Tarassenko"],"pdf_url":"https://arxiv.org/pdf/2404.03831v1.pdf","comment":"CVPR 2024 Highlight Paper"},{"id":"http://arxiv.org/abs/2305.05006v2","updated":"2024-04-04T22:51:42Z","published":"2023-05-08T19:25:50Z","title":"Synthesis of Annotated Colorectal Cancer Tissue Images from Gland Layout","summary":" Generating realistic tissue images with annotations is a challenging task\nthat is important in many computational histopathology applications.\nSynthetically generated images and annotations are valuable for training and\nevaluating algorithms in this domain. To address this, we propose an\ninteractive framework generating pairs of realistic colorectal cancer histology\nimages with corresponding glandular masks from glandular structure layouts. The\nframework accurately captures vital features like stroma, goblet cells, and\nglandular lumen. Users can control gland appearance by adjusting parameters\nsuch as the number of glands, their locations, and sizes. The generated images\nexhibit good Frechet Inception Distance (FID) scores compared to the\nstate-of-the-art image-to-image translation model. Additionally, we demonstrate\nthe utility of our synthetic annotations for evaluating gland segmentation\nalgorithms. Furthermore, we present a methodology for constructing glandular\nmasks using advanced deep generative models, such as latent diffusion models.\nThese masks enable tissue image generation through a residual encoder-decoder\nnetwork.\n","authors":["Srijay Deshpande","Fayyaz Minhas","Nasir Rajpoot"],"pdf_url":"https://arxiv.org/pdf/2305.05006v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16812v2","updated":"2024-04-04T22:31:18Z","published":"2023-12-28T04:14:55Z","title":"Spacetime Gaussian Feature Splatting for Real-Time Dynamic View\n Synthesis","summary":" Novel view synthesis of dynamic scenes has been an intriguing yet challenging\nproblem. Despite recent advancements, simultaneously achieving high-resolution\nphotorealistic results, real-time rendering, and compact storage remains a\nformidable task. To address these challenges, we propose Spacetime Gaussian\nFeature Splatting as a novel dynamic scene representation, composed of three\npivotal components. First, we formulate expressive Spacetime Gaussians by\nenhancing 3D Gaussians with temporal opacity and parametric motion/rotation.\nThis enables Spacetime Gaussians to capture static, dynamic, as well as\ntransient content within a scene. Second, we introduce splatted feature\nrendering, which replaces spherical harmonics with neural features. These\nfeatures facilitate the modeling of view- and time-dependent appearance while\nmaintaining small size. Third, we leverage the guidance of training error and\ncoarse depth to sample new Gaussians in areas that are challenging to converge\nwith existing pipelines. Experiments on several established real-world datasets\ndemonstrate that our method achieves state-of-the-art rendering quality and\nspeed, while retaining compact storage. At 8K resolution, our lite-version\nmodel can render at 60 FPS on an Nvidia RTX 4090 GPU. Our code is available at\nhttps://github.com/oppo-us-research/SpacetimeGaussians.\n","authors":["Zhan Li","Zhang Chen","Zhong Li","Yi Xu"],"pdf_url":"https://arxiv.org/pdf/2312.16812v2.pdf","comment":"Accepted to CVPR 2024. 
Project page:\n https://oppo-us-research.github.io/SpacetimeGaussians-website/"},{"id":"http://arxiv.org/abs/2404.03819v1","updated":"2024-04-04T22:31:15Z","published":"2024-04-04T22:31:15Z","title":"Effective Lymph Nodes Detection in CT Scans Using Location Debiased\n Query Selection and Contrastive Query Representation in Transformer","summary":" Lymph node (LN) assessment is a critical, indispensable yet very challenging\ntask in the routine clinical workflow of radiology and oncology. Accurate LN\nanalysis is essential for cancer diagnosis, staging, and treatment planning.\nFinding scatteredly distributed, low-contrast clinically relevant LNs in 3D CT\nis difficult even for experienced physicians under high inter-observer\nvariations. Previous automatic LN detection works typically yield limited\nrecall and high false positives (FPs) due to adjacent anatomies with similar\nimage intensities, shapes, or textures (vessels, muscles, esophagus, etc). In\nthis work, we propose a new LN DEtection TRansformer, named LN-DETR, to achieve\nmore accurate performance. By enhancing the 2D backbone with a multi-scale 2.5D\nfeature fusion to incorporate 3D context explicitly, more importantly, we make\ntwo main contributions to improve the representation quality of LN queries. 1)\nConsidering that LN boundaries are often unclear, an IoU prediction head and a\nlocation debiased query selection are proposed to select LN queries of higher\nlocalization accuracy as the decoder query's initialization. 2) To reduce FPs,\nquery contrastive learning is employed to explicitly reinforce LN queries\ntowards their best-matched ground-truth queries over unmatched query\npredictions. Trained and tested on 3D CT scans of 1067 patients (with 10,000+\nlabeled LNs) via combining seven LN datasets from different body parts (neck,\nchest, and abdomen) and pathologies/cancers, our method significantly improves\nthe performance of previous leading methods by > 4-5% average recall at the\nsame FP rates in both internal and external testing. We further evaluate on the\nuniversal lesion detection task using NIH DeepLesion benchmark, and our method\nachieves the top performance of 88.46% averaged recall across 0.5 to 4 FPs per\nimage, compared with other leading reported results.\n","authors":["Qinji Yu","Yirui Wang","Ke Yan","Haoshen Li","Dazhou Guo","Li Zhang","Le Lu","Na Shen","Qifeng Wang","Xiaowei Ding","Xianghua Ye","Dakai Jin"],"pdf_url":"https://arxiv.org/pdf/2404.03819v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2404.03799v1","updated":"2024-04-04T20:42:49Z","published":"2024-04-04T20:42:49Z","title":"Language-Guided Instance-Aware Domain-Adaptive Panoptic Segmentation","summary":" The increasing relevance of panoptic segmentation is tied to the advancements\nin autonomous driving and AR/VR applications. However, the deployment of such\nmodels has been limited due to the expensive nature of dense data annotation,\ngiving rise to unsupervised domain adaptation (UDA). A key challenge in\npanoptic UDA is reducing the domain gap between a labeled source and an\nunlabeled target domain while harmonizing the subtasks of semantic and instance\nsegmentation to limit catastrophic interference. While considerable progress\nhas been achieved, existing approaches mainly focus on the adaptation of\nsemantic segmentation. In this work, we focus on incorporating instance-level\nadaptation via a novel instance-aware cross-domain mixing strategy IMix. 
IMix\nsignificantly enhances the panoptic quality by improving instance segmentation\nperformance. Specifically, we propose inserting high-confidence predicted\ninstances from the target domain onto source images, retaining the\nexhaustiveness of the resulting pseudo-labels while reducing the injected\nconfirmation bias. Nevertheless, such an enhancement comes at the cost of\ndegraded semantic performance, attributed to catastrophic forgetting. To\nmitigate this issue, we regularize our semantic branch by employing CLIP-based\ndomain alignment (CDA), exploiting the domain-robustness of natural language\nprompts. Finally, we present an end-to-end model incorporating these two\nmechanisms called LIDAPS, achieving state-of-the-art results on all popular\npanoptic UDA benchmarks.\n","authors":["Elham Amin Mansour","Ozan Unal","Suman Saha","Benjamin Bejar","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2404.03799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03789v1","updated":"2024-04-04T20:04:12Z","published":"2024-04-04T20:04:12Z","title":"Quantifying Uncertainty in Motion Prediction with Variational Bayesian\n Mixture","summary":" Safety and robustness are crucial factors in developing trustworthy\nautonomous vehicles. One essential aspect of addressing these factors is to\nequip vehicles with the capability to predict future trajectories for all\nmoving objects in the surroundings and quantify prediction uncertainties. In\nthis paper, we propose the Sequential Neural Variational Agent (SeNeVA), a\ngenerative model that describes the distribution of future trajectories for a\nsingle moving object. Our approach can distinguish Out-of-Distribution data\nwhile quantifying uncertainty and achieving competitive performance compared to\nstate-of-the-art methods on the Argoverse 2 and INTERACTION datasets.\nSpecifically, a 0.446 meters minimum Final Displacement Error, a 0.203 meters\nminimum Average Displacement Error, and a 5.35% Miss Rate are achieved on the\nINTERACTION test set. Extensive qualitative and quantitative analysis is also\nprovided to evaluate the proposed model. Our open-source code is available at\nhttps://github.com/PurdueDigitalTwin/seneva.\n","authors":["Juanwu Lu","Can Cui","Yunsheng Ma","Aniket Bera","Ziran Wang"],"pdf_url":"https://arxiv.org/pdf/2404.03789v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03784v1","updated":"2024-04-04T19:55:11Z","published":"2024-04-04T19:55:11Z","title":"Layerwise Early Stopping for Test Time Adaptation","summary":" Test Time Adaptation (TTA) addresses the problem of distribution shift by\nenabling pretrained models to learn new features on an unseen domain at test\ntime. However, it poses a significant challenge to maintain a balance between\nlearning new features and retaining useful pretrained features. In this paper,\nwe propose Layerwise EArly STopping (LEAST) for TTA to address this problem.\nThe key idea is to stop adapting individual layers during TTA if the features\nbeing learned do not appear beneficial for the new domain. For that purpose, we\npropose using a novel gradient-based metric to measure the relevance of the\ncurrent learnt features to the new domain without the need for supervised\nlabels. More specifically, we propose to use this metric to determine\ndynamically when to stop updating each layer during TTA. This enables a more\nbalanced adaptation, restricted to layers benefiting from it, and only for a\ncertain number of steps. 
Such an approach also has the added effect of limiting\nthe forgetting of pretrained features useful for dealing with new domains.\nThrough extensive experiments, we demonstrate that Layerwise Early Stopping\nimproves the performance of existing TTA approaches across multiple datasets,\ndomain shifts, model architectures, and TTA losses.\n","authors":["Sabyasachi Sahoo","Mostafa ElAraby","Jonas Ngnawe","Yann Pequignot","Frederic Precioso","Christian Gagne"],"pdf_url":"https://arxiv.org/pdf/2404.03784v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.03778v1","updated":"2024-04-04T19:50:57Z","published":"2024-04-04T19:50:57Z","title":"Flattening the Parent Bias: Hierarchical Semantic Segmentation in the\n Poincaré Ball","summary":" Hierarchy is a natural representation of semantic taxonomies, including the\nones routinely used in image segmentation. Indeed, recent work on semantic\nsegmentation reports improved accuracy from supervised training leveraging\nhierarchical label structures. Encouraged by these results, we revisit the\nfundamental assumptions behind that work. We postulate and then empirically\nverify that the reasons for the observed improvement in segmentation accuracy\nmay be entirely unrelated to the use of the semantic hierarchy. To demonstrate\nthis, we design a range of cross-domain experiments with a representative\nhierarchical approach. We find that on the new testing domains, a flat\n(non-hierarchical) segmentation network, in which the parents are inferred from\nthe children, has superior segmentation accuracy to the hierarchical approach\nacross the board. Complementing these findings and inspired by the intrinsic\nproperties of hyperbolic spaces, we study a more principled approach to\nhierarchical segmentation using the Poincar\\'e ball model. The hyperbolic\nrepresentation largely outperforms the previous (Euclidean) hierarchical\napproach as well and is on par with our flat Euclidean baseline in terms of\nsegmentation accuracy. However, it additionally exhibits surprisingly strong\ncalibration quality of the parent nodes in the semantic hierarchy, especially\non the more challenging domains. Our combined analysis suggests that the\nestablished practice of hierarchical segmentation may be limited to in-domain\nsettings, whereas flat classifiers generalize substantially better, especially\nif they are modeled in the hyperbolic space.\n","authors":["Simon Weber","Barış Zöngür","Nikita Araslanov","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2404.03778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02733v2","updated":"2024-04-04T19:42:32Z","published":"2024-04-03T13:34:09Z","title":"InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image\n Generation","summary":" Tuning-free diffusion-based models have demonstrated significant potential in\nthe realm of image personalization and customization. However, despite this\nnotable progress, current models continue to grapple with several complex\nchallenges in producing style-consistent image generation. Firstly, the concept\nof style is inherently underdetermined, encompassing a multitude of elements\nsuch as color, material, atmosphere, design, and structure, among others.\nSecondly, inversion-based methods are prone to style degradation, often\nresulting in the loss of fine-grained details. Lastly, adapter-based approaches\nfrequently require meticulous weight tuning for each reference image to achieve\na balance between style intensity and text controllability. 
In this paper, we\ncommence by examining several compelling yet frequently overlooked\nobservations. We then proceed to introduce InstantStyle, a framework designed\nto address these issues through the implementation of two key strategies: 1) A\nstraightforward mechanism that decouples style and content from reference\nimages within the feature space, predicated on the assumption that features\nwithin the same space can be either added to or subtracted from one another. 2)\nThe injection of reference image features exclusively into style-specific\nblocks, thereby preventing style leaks and eschewing the need for cumbersome\nweight tuning, which often characterizes more parameter-heavy designs. Our work\ndemonstrates superior visual stylization outcomes, striking an optimal balance\nbetween the intensity of style and the controllability of textual elements. Our\ncodes will be available at https://github.com/InstantStyle/InstantStyle.\n","authors":["Haofan Wang","Matteo Spinelli","Qixun Wang","Xu Bai","Zekui Qin","Anthony Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02733v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2307.00040v3","updated":"2024-04-04T19:41:09Z","published":"2023-06-30T17:37:48Z","title":"DisCo: Disentangled Control for Realistic Human Dance Generation","summary":" Generative AI has made significant strides in computer vision, particularly\nin text-driven image/video synthesis (T2I/T2V). Despite the notable\nadvancements, it remains challenging in human-centric content synthesis such as\nrealistic dance generation. Current methodologies, primarily tailored for human\nmotion transfer, encounter difficulties when confronted with real-world dance\nscenarios (e.g., social media dance), which require generalizing across a wide\nspectrum of poses and intricate human details. In this paper, we depart from\nthe traditional paradigm of human motion transfer and emphasize two additional\ncritical attributes for the synthesis of human dance content in social media\ncontexts: (i) Generalizability: the model should be able to generalize beyond\ngeneric human viewpoints as well as unseen human subjects, backgrounds, and\nposes; (ii) Compositionality: it should allow for the seamless composition of\nseen/unseen subjects, backgrounds, and poses from different sources. To address\nthese challenges, we introduce DISCO, which includes a novel model architecture\nwith disentangled control to improve the compositionality of dance synthesis,\nand an effective human attribute pre-training for better generalizability to\nunseen humans. Extensive qualitative and quantitative results demonstrate that\nDisCo can generate high-quality human dance images and videos with diverse\nappearances and flexible motions. 
Code is available at\nhttps://disco-dance.github.io/.\n","authors":["Tan Wang","Linjie Li","Kevin Lin","Yuanhao Zhai","Chung-Ching Lin","Zhengyuan Yang","Hanwang Zhang","Zicheng Liu","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2307.00040v3.pdf","comment":"Accepted by CVPR24"},{"id":"http://arxiv.org/abs/2312.12337v4","updated":"2024-04-04T19:04:55Z","published":"2023-12-19T17:03:50Z","title":"pixelSplat: 3D Gaussian Splats from Image Pairs for Scalable\n Generalizable 3D Reconstruction","summary":" We introduce pixelSplat, a feed-forward model that learns to reconstruct 3D\nradiance fields parameterized by 3D Gaussian primitives from pairs of images.\nOur model features real-time and memory-efficient rendering for scalable\ntraining as well as fast 3D reconstruction at inference time. To overcome local\nminima inherent to sparse and locally supported representations, we predict a\ndense probability distribution over 3D and sample Gaussian means from that\nprobability distribution. We make this sampling operation differentiable via a\nreparameterization trick, allowing us to back-propagate gradients through the\nGaussian splatting representation. We benchmark our method on wide-baseline\nnovel view synthesis on the real-world RealEstate10k and ACID datasets, where\nwe outperform state-of-the-art light field transformers and accelerate\nrendering by 2.5 orders of magnitude while reconstructing an interpretable and\neditable 3D radiance field.\n","authors":["David Charatan","Sizhe Li","Andrea Tagliasacchi","Vincent Sitzmann"],"pdf_url":"https://arxiv.org/pdf/2312.12337v4.pdf","comment":"Project page: https://dcharatan.github.io/pixelsplat"},{"id":"http://arxiv.org/abs/2305.07490v6","updated":"2024-04-04T18:55:18Z","published":"2023-05-12T14:04:30Z","title":"ArtGPT-4: Towards Artistic-understanding Large Vision-Language Models\n with Enhanced Adapter","summary":" The success of large language models (LLMs) has inspired an emerging research\nfield of multimodal learning. However, a grand challenge of exploiting LLMs for\nmultimodal learning is the size of pre-trained LLMs which are always with\nbillions of parameters. To tackle this challenge, models such as MiniGPT-4 and\nLLaVA have been developed to fine-tune the pre-trained models using fewer\nparameters. Despite their promising performance, these models remain limited in\ntheir understanding of artistic imagery. To facilitate better\nartistic-understanding, in this paper, we propose ArtGPT-4, a pioneering large\nvision-language model tailored to address the limitations of existing models in\nartistic comprehension. The key innovation of ArtGPT-4 lies in its craft for\nthe sophisticated challenge of artistic image comprehension, setting it apart\nfrom other models that overlook fine details for broader themes. Specifically,\nit works by integrating some specialized adapter layers into the LLM, enabling\nthe model to more efficiently and effectively parse and interpret complex\nvisual tokens, instead of fine-tuning the whole LLM as in the existing method.\nArtGPT-4 has demonstrated its outstanding performance on the efficiency:\nutilizing a Tesla A100 device, its training can be completed in mere 2 hours\nwith an image-text pair dataset comprising approximately 0.52M entries.\nAdditionally, ArtGPT-4 has also achieved state-of-the-art performance on the\nArtEmis and ArtEmis-v2.0 datasets as well as the benchmarks established in this\nwork, lagging behind professional artists' descriptions by a negligible 0.15\npoints on a 6-point scale. 
The outstanding performance of ArtGPT-4 shows that\nit can render images with an artistic-understanding and convey the emotions\nthey inspire, mirroring human interpretation. The code and the pre-trained\nmodel are accessible in \\url{https://github.com/DLYuanGod/ArtGPT-4}.\n","authors":["Zhengqing Yuan","Yunhong He","Kun Wang","Yanfang Ye","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2305.07490v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16862v2","updated":"2024-04-04T18:53:58Z","published":"2023-12-28T07:11:41Z","title":"TinyGPT-V: Efficient Multimodal Large Language Model via Small Backbones","summary":" In recent years, multimodal large language models (MLLMs) such as GPT-4V have\ndemonstrated remarkable advancements, excelling in a variety of vision-language\ntasks. Despite their prowess, the closed-source nature and computational\ndemands of such models limit their accessibility and applicability. This study\nintroduces TinyGPT-V, a novel open-source MLLM, designed for efficient training\nand inference across various vision-language tasks, including image captioning\n(IC) and visual question answering (VQA). Leveraging a compact yet powerful\narchitecture, TinyGPT-V integrates the Phi-2 language model with pre-trained\nvision encoders, utilizing a unique mapping module for visual and linguistic\ninformation fusion. With a training regimen optimized for small backbones and\nemploying a diverse dataset amalgam, TinyGPT-V requires significantly lower\ncomputational resources (24GB for training and as little as 8GB for inference)\nwithout compromising on performance. Our experiments demonstrate that\nTinyGPT-V, with its 2.8 billion parameter language model, achieves comparable\nresults in VQA and image inference tasks to its larger counterparts while being\nuniquely suited for deployment on resource-constrained devices through\ninnovative quantization techniques. This work not only paves the way for more\naccessible and efficient MLLMs but also underscores the potential of smaller,\noptimized models in bridging the gap between high performance and computational\nefficiency in real-world applications. Additionally, this paper introduces a\nnew approach to multimodal large language models using smaller backbones. Our\ncode and training weights are available in\n\\url{https://github.com/DLYuanGod/TinyGPT-V}.\n","authors":["Zhengqing Yuan","Zhaoxu Li","Weiran Huang","Yanfang Ye","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2312.16862v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03754v1","updated":"2024-04-04T18:50:58Z","published":"2024-04-04T18:50:58Z","title":"Data Science for Geographic Information Systems","summary":" The integration of data science into Geographic Information Systems (GIS) has\nfacilitated the evolution of these tools into complete spatial analysis\nplatforms. The adoption of machine learning and big data techniques has\nequipped these platforms with the capacity to handle larger amounts of\nincreasingly complex data, transcending the limitations of more traditional\napproaches. This work traces the historical and technical evolution of data\nscience and GIS as fields of study, highlighting the critical points of\nconvergence between domains, and underlining the many sectors that rely on this\nintegration. A GIS application is presented as a case study in the disaster\nmanagement sector where we utilize aerial data from Tr\'oia, Portugal, to\nemphasize the process of insight extraction from raw data. 
We conclude by\noutlining prospects for future research in integration of these fields in\ngeneral, and the developed application in particular.\n","authors":["Afonso Oliveira","Nuno Fachada","João P. Matos-Carvalho"],"pdf_url":"https://arxiv.org/pdf/2404.03754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03743v1","updated":"2024-04-04T18:31:24Z","published":"2024-04-04T18:31:24Z","title":"Test Time Training for Industrial Anomaly Segmentation","summary":" Anomaly Detection and Segmentation (AD&S) is crucial for industrial quality\ncontrol. While existing methods excel in generating anomaly scores for each\npixel, practical applications require producing a binary segmentation to\nidentify anomalies. Due to the absence of labeled anomalies in many real\nscenarios, standard practices binarize these maps based on some statistics\nderived from a validation set containing only nominal samples, resulting in\npoor segmentation performance. This paper addresses this problem by proposing a\ntest time training strategy to improve the segmentation performance. Indeed, at\ntest time, we can extract rich features directly from anomalous samples to\ntrain a classifier that can discriminate defects effectively. Our general\napproach can work downstream to any AD&S method that provides an anomaly score\nmap as output, even in multimodal settings. We demonstrate the effectiveness of\nour approach over baselines through extensive experimentation and evaluation on\nMVTec AD and MVTec 3D-AD.\n","authors":["Alex Costanzino","Pierluigi Zama Ramirez","Mirko Del Moro","Agostino Aiezzo","Giuseppe Lisanti","Samuele Salti","Luigi Di Stefano"],"pdf_url":"https://arxiv.org/pdf/2404.03743v1.pdf","comment":"Accepted at VAND 2.0, CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.01112v3","updated":"2024-04-04T18:31:05Z","published":"2024-04-01T13:38:16Z","title":"Few-shot point cloud reconstruction and denoising via learned Guassian\n splats renderings and fine-tuned diffusion features","summary":" Existing deep learning methods for the reconstruction and denoising of point\nclouds rely on small datasets of 3D shapes. We circumvent the problem by\nleveraging deep learning methods trained on billions of images. We propose a\nmethod to reconstruct point clouds from few images and to denoise point clouds\nfrom their rendering by exploiting prior knowledge distilled from image-based\ndeep learning models. To improve reconstruction in constraint settings, we\nregularize the training of a differentiable renderer with hybrid surface and\nappearance by introducing semantic consistency supervision. In addition, we\npropose a pipeline to finetune Stable Diffusion to denoise renderings of noisy\npoint clouds and we demonstrate how these learned filters can be used to remove\npoint cloud noise coming without 3D supervision. We compare our method with DSS\nand PointRadiance and achieved higher quality 3D reconstruction on the\nSketchfab Testset and SCUT Dataset.\n","authors":["Pietro Bonazzi"],"pdf_url":"https://arxiv.org/pdf/2404.01112v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01887v2","updated":"2024-04-04T18:29:00Z","published":"2024-04-02T12:26:17Z","title":"3D scene generation from scene graphs and self-attention","summary":" Synthesizing realistic and diverse indoor 3D scene layouts in a controllable\nfashion opens up applications in simulated navigation and virtual reality. 
As\nconcise and robust representations of a scene, scene graphs have proven to be\nwell-suited as the semantic control on the generated layout. We present a\nvariant of the conditional variational autoencoder (cVAE) model to synthesize\n3D scenes from scene graphs and floor plans. We exploit the properties of\nself-attention layers to capture high-level relationships between objects in a\nscene, and use these as the building blocks of our model. Our model leverages\ngraph transformers to estimate the size, dimension and orientation of the\nobjects in a room while satisfying relationships in the given scene graph. Our\nexperiments show that self-attention layers lead to sparser (7.9x compared to\nGraphto3D) and more diverse scenes (16%).\n","authors":["Pietro Bonazzi"],"pdf_url":"https://arxiv.org/pdf/2404.01887v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03736v1","updated":"2024-04-04T18:05:18Z","published":"2024-04-04T18:05:18Z","title":"SC4D: Sparse-Controlled Video-to-4D Generation and Motion Transfer","summary":" Recent advances in 2D/3D generative models enable the generation of dynamic\n3D objects from a single-view video. Existing approaches utilize score\ndistillation sampling to form the dynamic scene as dynamic NeRF or dense 3D\nGaussians. However, these methods struggle to strike a balance among reference\nview alignment, spatio-temporal consistency, and motion fidelity under\nsingle-view conditions due to the implicit nature of NeRF or the intricate\ndense Gaussian motion prediction. To address these issues, this paper proposes\nan efficient, sparse-controlled video-to-4D framework named SC4D that\ndecouples motion and appearance to achieve superior video-to-4D generation.\nMoreover, we introduce Adaptive Gaussian (AG) initialization and Gaussian\nAlignment (GA) loss to mitigate the shape degeneration issue, ensuring the fidelity\nof the learned motion and shape. Comprehensive experimental results demonstrate\nthat our method surpasses existing methods in both quality and efficiency. In\naddition, facilitated by the disentangled modeling of motion and appearance of\nSC4D, we devise a novel application that seamlessly transfers the learned\nmotion onto a diverse array of 4D entities according to textual descriptions.\n","authors":["Zijie Wu","Chaohui Yu","Yanqin Jiang","Chenjie Cao","Fan Wang","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2404.03736v1.pdf","comment":"Project Page: https://sc4d.github.io/"},{"id":"http://arxiv.org/abs/2304.01834v4","updated":"2024-04-04T18:01:47Z","published":"2023-04-04T14:39:44Z","title":"Neural Field Convolutions by Repeated Differentiation","summary":" Neural fields are evolving towards a general-purpose continuous\nrepresentation for visual computing. Yet, despite their numerous appealing\nproperties, they are hardly amenable to signal processing. As a remedy, we\npresent a method to perform general continuous convolutions with general\ncontinuous signals such as neural fields. Observing that piecewise polynomial\nkernels reduce to a sparse set of Dirac deltas after repeated differentiation,\nwe leverage convolution identities and train a repeated integral field to\nefficiently execute large-scale convolutions. 
We demonstrate our approach on a\nvariety of data modalities and spatially-varying kernels.\n","authors":["Ntumba Elie Nsampi","Adarsh Djeacoumar","Hans-Peter Seidel","Tobias Ritschel","Thomas Leimkühler"],"pdf_url":"https://arxiv.org/pdf/2304.01834v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04125v1","updated":"2024-04-04T17:58:02Z","published":"2024-04-04T17:58:02Z","title":"No \"Zero-Shot\" Without Exponential Data: Pretraining Concept Frequency\n Determines Multimodal Model Performance","summary":" Web-crawled pretraining datasets underlie the impressive \"zero-shot\"\nevaluation performance of multimodal models, such as CLIP for\nclassification/retrieval and Stable-Diffusion for image generation. However, it\nis unclear how meaningful the notion of \"zero-shot\" generalization is for such\nmultimodal models, as it is not known to what extent their pretraining datasets\nencompass the downstream concepts targeted for during \"zero-shot\" evaluation.\nIn this work, we ask: How is the performance of multimodal models on downstream\nconcepts influenced by the frequency of these concepts in their pretraining\ndatasets? We comprehensively investigate this question across 34 models and\nfive standard pretraining datasets (CC-3M, CC-12M, YFCC-15M, LAION-400M,\nLAION-Aesthetics), generating over 300GB of data artifacts. We consistently\nfind that, far from exhibiting \"zero-shot\" generalization, multimodal models\nrequire exponentially more data to achieve linear improvements in downstream\n\"zero-shot\" performance, following a sample inefficient log-linear scaling\ntrend. This trend persists even when controlling for sample-level similarity\nbetween pretraining and downstream datasets, and testing on purely synthetic\ndata distributions. Furthermore, upon benchmarking models on long-tailed data\nsampled based on our analysis, we demonstrate that multimodal models across the\nboard perform poorly. We contribute this long-tail test set as the \"Let it\nWag!\" benchmark to further research in this direction. Taken together, our\nstudy reveals an exponential need for training data which implies that the key\nto \"zero-shot\" generalization capabilities under large-scale training paradigms\nremains to be found.\n","authors":["Vishaal Udandarao","Ameya Prabhu","Adhiraj Ghosh","Yash Sharma","Philip H. S. Torr","Adel Bibi","Samuel Albanie","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2404.04125v1.pdf","comment":"Extended version of the short paper accepted at DPFM, ICLR'24"},{"id":"http://arxiv.org/abs/2404.03713v1","updated":"2024-04-04T17:46:20Z","published":"2024-04-04T17:46:20Z","title":"Explaining Explainability: Understanding Concept Activation Vectors","summary":" Recent interpretability methods propose using concept-based explanations to\ntranslate the internal representations of deep learning models into a language\nthat humans are familiar with: concepts. This requires understanding which\nconcepts are present in the representation space of a neural network. One\npopular method for finding concepts is Concept Activation Vectors (CAVs), which\nare learnt using a probe dataset of concept exemplars. In this work, we\ninvestigate three properties of CAVs. CAVs may be: (1) inconsistent between\nlayers, (2) entangled with different concepts, and (3) spatially dependent.\nEach property provides both challenges and opportunities in interpreting\nmodels. 
We introduce tools designed to detect the presence of these properties,\nprovide insight into how they affect the derived explanations, and provide\nrecommendations to minimise their impact. Understanding these properties can be\nused to our advantage. For example, we introduce spatially dependent CAVs to\ntest if a model is translation invariant with respect to a specific concept and\nclass. Our experiments are performed on ImageNet and a new synthetic dataset,\nElements. Elements is designed to capture a known ground truth relationship\nbetween concepts and classes. We release this dataset to facilitate further\nresearch in understanding and evaluating interpretability methods.\n","authors":["Angus Nicolson","Lisa Schut","J. Alison Noble","Yarin Gal"],"pdf_url":"https://arxiv.org/pdf/2404.03713v1.pdf","comment":"(54 pages, 39 figures)"},{"id":"http://arxiv.org/abs/2404.04120v1","updated":"2024-04-04T10:12:55Z","published":"2024-04-04T10:12:55Z","title":"Cross-Modality Gait Recognition: Bridging LiDAR and Camera Modalities\n for Human Identification","summary":" Current gait recognition research mainly focuses on identifying pedestrians\ncaptured by the same type of sensor, neglecting the fact that individuals may\nbe captured by different sensors in order to adapt to various environments. A\nmore practical approach should involve cross-modality matching across different\nsensors. Hence, this paper focuses on investigating the problem of\ncross-modality gait recognition, with the objective of accurately identifying\npedestrians across diverse vision sensors. We present CrossGait inspired by the\nfeature alignment strategy, capable of cross retrieving diverse data\nmodalities. Specifically, we investigate the cross-modality recognition task by\ninitially extracting features within each modality and subsequently aligning\nthese features across modalities. To further enhance the cross-modality\nperformance, we propose a Prototypical Modality-shared Attention Module that\nlearns modality-shared features from two modality-specific features.\nAdditionally, we design a Cross-modality Feature Adapter that transforms the\nlearned modality-specific features into a unified feature space. Extensive\nexperiments conducted on the SUSTech1K dataset demonstrate the effectiveness of\nCrossGait: (1) it exhibits promising cross-modality ability in retrieving\npedestrians across various modalities from different sensors in diverse scenes,\nand (2) CrossGait not only learns modality-shared features for cross-modality\ngait recognition but also maintains modality-specific features for\nsingle-modality recognition.\n","authors":["Rui Wang","Chuanfu Shen","Manuel J. Marin-Jimenez","George Q. Huang","Shiqi Yu"],"pdf_url":"https://arxiv.org/pdf/2404.04120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03703v1","updated":"2024-04-04T07:49:39Z","published":"2024-04-04T07:49:39Z","title":"Mitigating analytical variability in fMRI results with style transfer","summary":" We propose a novel approach to improve the reproducibility of neuroimaging\nresults by converting statistic maps across different functional MRI pipelines.\nWe make the assumption that pipelines can be considered as a style component of\ndata and propose to use different generative models, among which, Diffusion\nModels (DM) to convert data between pipelines. 
We design a new DM-based\nunsupervised multi-domain image-to-image transition framework and constrain the\ngeneration of 3D fMRI statistic maps using the latent space of an auxiliary\nclassifier that distinguishes statistic maps from different pipelines. We\nextend traditional sampling techniques used in DM to improve the transition\nperformance. Our experiments demonstrate that our proposed methods are\nsuccessful: pipelines can indeed be transferred, providing an important source\nof data augmentation for future medical studies.\n","authors":["Elodie Germani","Elisa Fromont","Camille Maumet"],"pdf_url":"https://arxiv.org/pdf/2404.03703v1.pdf","comment":null}]},"2024-04-05T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.00086v2","updated":"2024-04-05T17:59:50Z","published":"2024-03-29T17:58:50Z","title":"DVIS-DAQ: Improving Video Segmentation via Dynamic Anchor Queries","summary":" Modern video segmentation methods adopt object queries to perform inter-frame\nassociation and demonstrate satisfactory performance in tracking continuously\nappearing objects despite large-scale motion and transient occlusion. However,\nthey all underperform on newly emerging and disappearing objects that are\ncommon in the real world because they attempt to model object emergence and\ndisappearance through feature transitions between background and foreground\nqueries that have significant feature gaps. We introduce Dynamic Anchor Queries\n(DAQ) to shorten the transition gap between the anchor and target queries by\ndynamically generating anchor queries based on the features of potential\ncandidates. Furthermore, we introduce a query-level object Emergence and\nDisappearance Simulation (EDS) strategy, which unleashes DAQ's potential\nwithout any additional cost. Finally, we combine our proposed DAQ and EDS with\nDVIS to obtain DVIS-DAQ. Extensive experiments demonstrate that DVIS-DAQ\nachieves a new state-of-the-art (SOTA) performance on five mainstream video\nsegmentation benchmarks. Code and models are available at\n\\url{https://github.com/SkyworkAI/DAQ-VS}.\n","authors":["Yikang Zhou","Tao Zhang","Shunping Ji","Shuicheng Yan","Xiangtai Li"],"pdf_url":"https://arxiv.org/pdf/2404.00086v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04256v1","updated":"2024-04-05T17:59:44Z","published":"2024-04-05T17:59:44Z","title":"Sigma: Siamese Mamba Network for Multi-Modal Semantic Segmentation","summary":" Multi-modal semantic segmentation significantly enhances AI agents'\nperception and scene understanding, especially under adverse conditions like\nlow-light or overexposed environments. Leveraging additional modalities\n(X-modality) like thermal and depth alongside traditional RGB provides\ncomplementary information, enabling more robust and reliable segmentation. In\nthis work, we introduce Sigma, a Siamese Mamba network for multi-modal semantic\nsegmentation, utilizing the Selective Structured State Space Model, Mamba.\nUnlike conventional methods that rely on CNNs, with their limited local\nreceptive fields, or Vision Transformers (ViTs), which offer global receptive\nfields at the cost of quadratic complexity, our model achieves global receptive\nfields coverage with linear complexity. By employing a Siamese encoder and\ninnovating a Mamba fusion mechanism, we effectively select essential\ninformation from different modalities. A decoder is then developed to enhance\nthe channel-wise modeling ability of the model. 
Our method, Sigma, is\nrigorously evaluated on both RGB-Thermal and RGB-Depth segmentation tasks,\ndemonstrating its superiority and marking the first successful application of\nState Space Models (SSMs) in multi-modal perception tasks. Code is available at\nhttps://github.com/zifuwan/Sigma.\n","authors":["Zifu Wan","Yuhao Wang","Silong Yong","Pingping Zhang","Simon Stepputtis","Katia Sycara","Yaqi Xie"],"pdf_url":"https://arxiv.org/pdf/2404.04256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04254v1","updated":"2024-04-05T17:58:52Z","published":"2024-04-05T17:58:52Z","title":"Watermark-based Detection and Attribution of AI-Generated Content","summary":" Several companies--such as Google, Microsoft, and OpenAI--have deployed\ntechniques to watermark AI-generated content to enable proactive detection.\nHowever, existing literature mainly focuses on user-agnostic detection.\nAttribution aims to further trace back the user of a generative-AI service who\ngenerated a given piece of content detected as AI-generated. Despite its growing\nimportance, attribution is largely unexplored. In this work, we aim to bridge\nthis gap by providing the first systematic study on watermark-based, user-aware\ndetection and attribution of AI-generated content. Specifically, we\ntheoretically study the detection and attribution performance via rigorous\nprobabilistic analysis. Moreover, we develop an efficient algorithm to select\nwatermarks for the users to enhance attribution performance. Both our\ntheoretical and empirical results show that watermark-based detection and\nattribution inherit the accuracy and (non-)robustness properties of the\nwatermarking method.\n","authors":["Zhengyuan Jiang","Moyang Guo","Yuepeng Hu","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2404.04254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04251v1","updated":"2024-04-05T17:57:16Z","published":"2024-04-05T17:57:16Z","title":"Who Evaluates the Evaluations? Objectively Scoring Text-to-Image Prompt\n Coherence Metrics with T2IScoreScore (TS2)","summary":" With advances in the quality of text-to-image (T2I) models has come interest\nin benchmarking their prompt faithfulness-the semantic coherence of generated\nimages to the prompts they were conditioned on. A variety of T2I faithfulness\nmetrics have been proposed, leveraging advances in cross-modal embeddings and\nvision-language models (VLMs). However, these metrics are not rigorously\ncompared and benchmarked, instead being presented against a few weak baselines by\ncorrelation to human Likert scores over a set of easy-to-discriminate images.\n We introduce T2IScoreScore (TS2), a curated set of semantic error graphs\ncontaining a prompt and a set of increasingly erroneous images. These allow us to\nrigorously judge whether a given prompt faithfulness metric can correctly order\nimages with respect to their objective error count and significantly\ndiscriminate between different error nodes, using meta-metric scores derived\nfrom established statistical tests. Surprisingly, we find that the\nstate-of-the-art VLM-based metrics (e.g., TIFA, DSG, LLMScore, VIEScore) we\ntested fail to significantly outperform simple feature-based metrics like\nCLIPScore, particularly on a hard subset of naturally-occurring T2I model\nerrors. 
TS2 will enable the development of better T2I prompt faithfulness\nmetrics through more rigorous comparison of their conformity to expected\norderings and separations under objective criteria.\n","authors":["Michael Saxon","Fatima Jahara","Mahsa Khoshnoodi","Yujie Lu","Aditya Sharma","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04251v1.pdf","comment":"15 pages main, 9 pages appendices, 16 figures, 3 tables"},{"id":"http://arxiv.org/abs/2312.08240v2","updated":"2024-04-05T17:56:12Z","published":"2023-12-13T16:01:50Z","title":"CenterGrasp: Object-Aware Implicit Representation Learning for\n Simultaneous Shape Reconstruction and 6-DoF Grasp Estimation","summary":" Reliable object grasping is a crucial capability for autonomous robots.\nHowever, many existing grasping approaches focus on general clutter removal\nwithout explicitly modeling objects and thus only relying on the visible local\ngeometry. We introduce CenterGrasp, a novel framework that combines object\nawareness and holistic grasping. CenterGrasp learns a general object prior by\nencoding shapes and valid grasps in a continuous latent space. It consists of\nan RGB-D image encoder that leverages recent advances to detect objects and\ninfer their pose and latent code, and a decoder to predict shape and grasps for\neach object in the scene. We perform extensive experiments on simulated as well\nas real-world cluttered scenes and demonstrate strong scene reconstruction and\n6-DoF grasp-pose estimation performance. Compared to the state of the art,\nCenterGrasp achieves an improvement of 38.5 mm in shape reconstruction and 33\npercentage points on average in grasp success. We make the code and trained\nmodels publicly available at http://centergrasp.cs.uni-freiburg.de.\n","authors":["Eugenio Chisari","Nick Heppert","Tim Welschehold","Wolfram Burgard","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2312.08240v2.pdf","comment":"Accepted at RA-L. Video, code and models available at\n http://centergrasp.cs.uni-freiburg.de"},{"id":"http://arxiv.org/abs/2102.05984v2","updated":"2024-04-05T17:55:28Z","published":"2021-02-11T13:04:49Z","title":"Modeling 3D Surface Manifolds with a Locally Conditioned Atlas","summary":" Recently proposed 3D object reconstruction methods represent a mesh with an\natlas - a set of planar patches approximating the surface. However, their\napplication in a real-world scenario is limited since the surfaces of\nreconstructed objects contain discontinuities, which degrades the quality of\nthe final mesh. This is mainly caused by independent processing of individual\npatches, and in this work, we postulate to mitigate this limitation by\npreserving local consistency around patch vertices. To that end, we introduce a\nLocally Conditioned Atlas (LoCondA), a framework for representing a 3D object\nhierarchically in a generative model. Firstly, the model maps a point cloud of\nan object into a sphere. Secondly, by leveraging a spherical prior, we enforce\nthe mapping to be locally consistent on the sphere and on the target object.\nThis way, we can sample a mesh quad on that sphere and project it back onto the\nobject's manifold. With LoCondA, we can produce topologically diverse objects\nwhile maintaining quads to be stitched together. 
We show that the proposed\napproach provides structurally coherent reconstructions while producing meshes\nof quality comparable to those of the competitors.\n","authors":["Przemysław Spurek","Sebastian Winczowski","Maciej Zięba","Tomasz Trzciński","Kacper Kania","Marcin Mazur"],"pdf_url":"https://arxiv.org/pdf/2102.05984v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04245v1","updated":"2024-04-05T17:51:58Z","published":"2024-04-05T17:51:58Z","title":"Evaluating Adversarial Robustness: A Comparison Of FGSM, Carlini-Wagner\n Attacks, And The Role of Distillation as Defense Mechanism","summary":" This technical report delves into an in-depth exploration of adversarial\nattacks specifically targeted at Deep Neural Networks (DNNs) utilized for image\nclassification. The study also investigates defense mechanisms aimed at\nbolstering the robustness of machine learning models. The research focuses on\ncomprehending the ramifications of two prominent attack methodologies: the Fast\nGradient Sign Method (FGSM) and the Carlini-Wagner (CW) approach. These attacks\nare examined concerning three pre-trained image classifiers: Resnext50_32x4d,\nDenseNet-201, and VGG-19, utilizing the Tiny-ImageNet dataset. Furthermore, the\nstudy proposes defensive distillation as a defense mechanism\nto counter FGSM and CW attacks. This defense mechanism is evaluated using the\nCIFAR-10 dataset, where CNN models, specifically resnet101 and Resnext50_32x4d,\nserve as the teacher and student models, respectively. The proposed defensive\ndistillation model exhibits effectiveness in thwarting attacks such as FGSM.\nHowever, it is noted to remain susceptible to more sophisticated techniques\nlike the CW attack. The document presents a meticulous validation of the\nproposed scheme. It provides detailed and comprehensive results, elucidating\nthe efficacy and limitations of the defense mechanisms employed. Through\nrigorous experimentation and analysis, the study offers insights into the\ndynamics of adversarial attacks on DNNs, as well as the effectiveness of\ndefensive strategies in mitigating their impact.\n","authors":["Trilokesh Ranjan Sarkar","Nilanjan Das","Pralay Sankar Maitra","Bijoy Some","Ritwik Saha","Orijita Adhikary","Bishal Bose","Jaydip Sen"],"pdf_url":"https://arxiv.org/pdf/2404.04245v1.pdf","comment":"This report pertains to the Capstone Project done by Group 1 of the\n Fall batch of 2023 students at Praxis Tech School, Kolkata, India. The\n report consists of 35 pages and includes 15 figures and 10 tables. This\n is the preprint which will be submitted to an IEEE international\n conference for review"},{"id":"http://arxiv.org/abs/2404.04244v1","updated":"2024-04-05T17:46:38Z","published":"2024-04-05T17:46:38Z","title":"DiffOp-net: A Differential Operator-based Fully Convolutional Network\n for Unsupervised Deformable Image Registration","summary":" Existing unsupervised deformable image registration methods usually rely on\nmetrics applied to the gradients of predicted displacement or velocity fields\nas a regularization term to ensure transformation smoothness, which potentially\nlimits registration accuracy. In this study, we propose a novel approach to\nenhance unsupervised deformable image registration by introducing a new\ndifferential operator into the registration framework. This operator, acting on\nthe velocity field and mapping it to a dual space, ensures the smoothness of\nthe velocity field during optimization, facilitating accurate deformable\nregistration. 
In addition, to tackle the challenge of capturing large\ndeformations inside image pairs, we introduce a Cross-Coordinate Attention\nmodule (CCA) and embed it into a proposed Fully Convolutional Networks\n(FCNs)-based multi-resolution registration architecture. Evaluation experiments\nare conducted on two magnetic resonance imaging (MRI) datasets. Compared to\nvarious state-of-the-art registration approaches, including a traditional\nalgorithm and three representative unsupervised learning-based methods, our\nmethod achieves superior accuracies, maintaining desirable diffeomorphic\nproperties, and exhibiting promising registration speed.\n","authors":["Jiong Wu"],"pdf_url":"https://arxiv.org/pdf/2404.04244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04243v1","updated":"2024-04-05T17:45:22Z","published":"2024-04-05T17:45:22Z","title":"Identity Decoupling for Multi-Subject Personalization of Text-to-Image\n Models","summary":" Text-to-image diffusion models have shown remarkable success in generating a\npersonalized subject based on a few reference images. However, current methods\nstruggle with handling multiple subjects simultaneously, often resulting in\nmixed identities with combined attributes from different subjects. In this\nwork, we present MuDI, a novel framework that enables multi-subject\npersonalization by effectively decoupling identities from multiple subjects.\nOur main idea is to utilize segmented subjects generated by the Segment\nAnything Model for both training and inference, as a form of data augmentation\nfor training and initialization for the generation process. Our experiments\ndemonstrate that MuDI can produce high-quality personalized images without\nidentity mixing, even for highly similar subjects as shown in Figure 1. In\nhuman evaluation, MuDI shows twice as many successes for personalizing multiple\nsubjects without identity mixing over existing baselines and is preferred over\n70% compared to the strongest baseline. More results are available at\nhttps://mudi-t2i.github.io/.\n","authors":["Sangwon Jang","Jaehyeong Jo","Kimin Lee","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2404.04243v1.pdf","comment":"Preprint. Project page: https://mudi-t2i.github.io/"},{"id":"http://arxiv.org/abs/2404.04242v1","updated":"2024-04-05T17:45:07Z","published":"2024-04-05T17:45:07Z","title":"Physical Property Understanding from Language-Embedded Feature Fields","summary":" Can computers perceive the physical properties of objects solely through\nvision? Research in cognitive science and vision science has shown that humans\nexcel at identifying materials and estimating their physical properties based\npurely on visual appearance. In this paper, we present a novel approach for\ndense prediction of the physical properties of objects using a collection of\nimages. Inspired by how humans reason about physics through vision, we leverage\nlarge language models to propose candidate materials for each object. We then\nconstruct a language-embedded point cloud and estimate the physical properties\nof each 3D point using a zero-shot kernel regression approach. Our method is\naccurate, annotation-free, and applicable to any object in the open world.\nExperiments demonstrate the effectiveness of the proposed approach in various\nphysical property reasoning tasks, such as estimating the mass of common\nobjects, as well as other properties like friction and hardness.\n","authors":["Albert J. Zhai","Yuan Shen","Emily Y. Chen","Gloria X. 
Wang","Xinlei Wang","Sheng Wang","Kaiyu Guan","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04242v1.pdf","comment":"CVPR 2024. Project page (with code):\n https://ajzhai.github.io/NeRF2Physics/"},{"id":"http://arxiv.org/abs/2311.08577v3","updated":"2024-04-05T17:37:36Z","published":"2023-11-14T22:46:01Z","title":"Finding AI-Generated Faces in the Wild","summary":" AI-based image generation has continued to rapidly improve, producing\nincreasingly more realistic images with fewer obvious visual flaws.\nAI-generated images are being used to create fake online profiles which in turn\nare being used for spam, fraud, and disinformation campaigns. As the general\nproblem of detecting any type of manipulated or synthesized content is\nreceiving increasing attention, here we focus on a more narrow task of\ndistinguishing a real face from an AI-generated face. This is particularly\napplicable when tackling inauthentic online accounts with a fake user profile\nphoto. We show that by focusing on only faces, a more resilient and\ngeneral-purpose artifact can be detected that allows for the detection of\nAI-generated faces from a variety of GAN- and diffusion-based synthesis\nengines, and across image resolutions (as low as 128 x 128 pixels) and\nqualities.\n","authors":["Gonzalo J. Aniano Porcile","Jack Gindi","Shivansh Mundra","James R. Verbus","Hany Farid"],"pdf_url":"https://arxiv.org/pdf/2311.08577v3.pdf","comment":"to be published as: G.J.A. Porcile, J. Gindi, S. Mundra, J.R. Verbus,\n and H. Farid, Finding AI-Generated Faces in the Wild, Workshop on Media\n Forensics at CVPR, 2024"},{"id":"http://arxiv.org/abs/2404.03635v2","updated":"2024-04-05T17:27:34Z","published":"2024-04-04T17:54:33Z","title":"WorDepth: Variational Language Prior for Monocular Depth Estimation","summary":" Three-dimensional (3D) reconstruction from a single image is an ill-posed\nproblem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text\ndescription(s) is similarly ill-posed, i.e. spatial arrangements of objects\ndescribed. We investigate the question of whether two inherently ambiguous\nmodalities can be used in conjunction to produce metric-scaled reconstructions.\nTo test this, we focus on monocular depth estimation, the problem of predicting\na dense depth map from a single image, but with an additional text caption\ndescribing the scene. To this end, we begin by encoding the text caption as a\nmean and standard deviation; using a variational framework, we learn the\ndistribution of the plausible metric reconstructions of 3D scenes corresponding\nto the text captions as a prior. To \"select\" a specific reconstruction or depth\nmap, we encode the given image through a conditional sampler that samples from\nthe latent space of the variational text encoder, which is then decoded to the\noutput depth map. Our approach is trained alternatingly between the text and\nimage branches: in one optimization step, we predict the mean and standard\ndeviation from the text description and sample from a standard Gaussian, and in\nthe other, we sample using a (image) conditional sampler. Once trained, we\ndirectly predict depth from the encoded text using the conditional sampler. 
We\ndemonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where\nwe show that language can consistently improve performance in both.\n","authors":["Ziyao Zeng","Daniel Wang","Fengyu Yang","Hyoungseob Park","Yangchao Wu","Stefano Soatto","Byung-Woo Hong","Dong Lao","Alex Wong"],"pdf_url":"https://arxiv.org/pdf/2404.03635v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04231v1","updated":"2024-04-05T17:25:17Z","published":"2024-04-05T17:25:17Z","title":"Image-Text Co-Decomposition for Text-Supervised Semantic Segmentation","summary":" This paper addresses text-supervised semantic segmentation, aiming to learn a\nmodel capable of segmenting arbitrary visual concepts within images by using\nonly image-text pairs without dense annotations. Existing methods have\ndemonstrated that contrastive learning on image-text pairs effectively aligns\nvisual segments with the meanings of texts. We notice that there is a\ndiscrepancy between text alignment and semantic segmentation: A text often\nconsists of multiple semantic concepts, whereas semantic segmentation strives\nto create semantically homogeneous segments. To address this issue, we propose\na novel framework, Image-Text Co-Decomposition (CoDe), where the paired image\nand text are jointly decomposed into a set of image regions and a set of word\nsegments, respectively, and contrastive learning is developed to enforce\nregion-word alignment. To work with a vision-language model, we present a\nprompt learning mechanism that derives an extra representation to highlight an\nimage segment or a word segment of interest, with which more effective features\ncan be extracted from that segment. Comprehensive experimental results\ndemonstrate that our method performs favorably against existing text-supervised\nsemantic segmentation methods on six benchmark datasets.\n","authors":["Ji-Jia Wu","Andy Chia-Hao Chang","Chieh-Yu Chuang","Chun-Pei Chen","Yu-Lun Liu","Min-Hung Chen","Hou-Ning Hu","Yung-Yu Chuang","Yen-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2404.04231v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02257v2","updated":"2024-04-05T17:02:31Z","published":"2024-04-02T19:25:04Z","title":"SnAG: Scalable and Accurate Video Grounding","summary":" Temporal grounding of text descriptions in videos is a central problem in\nvision-language learning and video understanding. Existing methods often\nprioritize accuracy over scalability -- they have been optimized for grounding\nonly a few text queries within short videos, and fail to scale up to long\nvideos with hundreds of queries. In this paper, we study the effect of\ncross-modal fusion on the scalability of video grounding models. Our analysis\nestablishes late fusion as a more cost-effective fusion scheme for long-form\nvideos with many text queries. Moreover, it leads us to a novel, video-centric\nsampling scheme for efficient training. Based on these findings, we present\nSnAG, a simple baseline for scalable and accurate video grounding. Without\nbells and whistles, SnAG is 43% more accurate and 1.5x faster than CONE, a\nstate of the art for long-form video grounding on the challenging MAD dataset,\nwhile achieving highly competitive results on short videos.\n","authors":["Fangzhou Mu","Sicheng Mo","Yin Li"],"pdf_url":"https://arxiv.org/pdf/2404.02257v2.pdf","comment":"Accepted to CVPR 2024. 
Code available at\n https://github.com/fmu2/snag_release"},{"id":"http://arxiv.org/abs/2402.15584v2","updated":"2024-04-05T17:01:34Z","published":"2024-02-23T19:51:55Z","title":"State Space Models for Event Cameras","summary":" Today, state-of-the-art deep neural networks that process event-camera data\nfirst convert a temporal window of events into dense, grid-like input\nrepresentations. As such, they exhibit poor generalizability when deployed at\nhigher inference frequencies (i.e., smaller temporal windows) than the ones\nthey were trained on. We address this challenge by introducing state-space\nmodels (SSMs) with learnable timescale parameters to event-based vision. This\ndesign adapts to varying frequencies without the need to retrain the network at\ndifferent frequencies. Additionally, we investigate two strategies to\ncounteract aliasing effects when deploying the model at higher frequencies. We\ncomprehensively evaluate our approach against existing methods based on RNN and\nTransformer architectures across various benchmarks, including Gen1 and 1 Mpx\nevent camera datasets. Our results demonstrate that SSM-based models train 33%\nfaster and also exhibit minimal performance degradation when tested at higher\nfrequencies than the training input. Traditional RNN and Transformer models\nexhibit performance drops of more than 20 mAP, with SSMs having a drop of 3.31\nmAP, highlighting the effectiveness of SSMs in event-based vision tasks.\n","authors":["Nikola Zubić","Mathias Gehrig","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2402.15584v2.pdf","comment":"18 pages, 5 figures, 6 tables, CVPR 2024 Camera Ready paper"},{"id":"http://arxiv.org/abs/2301.07002v3","updated":"2024-04-05T16:50:13Z","published":"2023-01-17T16:44:48Z","title":"Opti-CAM: Optimizing saliency maps for interpretability","summary":" Methods based on class activation maps (CAM) provide a simple mechanism to\ninterpret predictions of convolutional neural networks by using linear\ncombinations of feature maps as saliency maps. By contrast, masking-based\nmethods optimize a saliency map directly in the image space or learn it by\ntraining another network on additional data.\n In this work we introduce Opti-CAM, combining ideas from CAM-based and\nmasking-based approaches. Our saliency map is a linear combination of feature\nmaps, where weights are optimized per image such that the logit of the masked\nimage for a given class is maximized. We also fix a fundamental flaw in two of\nthe most common evaluation metrics of attribution methods. On several datasets,\nOpti-CAM largely outperforms other CAM-based approaches according to the most\nrelevant classification metrics. We provide empirical evidence supporting that\nlocalization and classifier interpretability are not necessarily aligned.\n","authors":["Hanwei Zhang","Felipe Torres","Ronan Sicre","Yannis Avrithis","Stephane Ayache"],"pdf_url":"https://arxiv.org/pdf/2301.07002v3.pdf","comment":"This work is under consideration at \"Computer Vision and Image\n Understanding\""},{"id":"http://arxiv.org/abs/2404.04211v1","updated":"2024-04-05T16:42:16Z","published":"2024-04-05T16:42:16Z","title":"Robust Gaussian Splatting","summary":" In this paper, we address common error sources for 3D Gaussian Splatting\n(3DGS) including blur, imperfect camera poses, and color inconsistencies, with\nthe goal of improving its robustness for practical applications like\nreconstructions from handheld phone captures. 
Our main contribution involves\nmodeling motion blur as a Gaussian distribution over camera poses, allowing us\nto address both camera pose refinement and motion blur correction in a unified\nway. Additionally, we propose mechanisms for defocus blur compensation and for\naddressing color in-consistencies caused by ambient light, shadows, or due to\ncamera-related factors like varying white balancing settings. Our proposed\nsolutions integrate in a seamless way with the 3DGS formulation while\nmaintaining its benefits in terms of training efficiency and rendering speed.\nWe experimentally validate our contributions on relevant benchmark datasets\nincluding Scannet++ and Deblur-NeRF, obtaining state-of-the-art results and\nthus consistent improvements over relevant baselines.\n","authors":["François Darmon","Lorenzo Porzi","Samuel Rota-Bulò","Peter Kontschieder"],"pdf_url":"https://arxiv.org/pdf/2404.04211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04202v1","updated":"2024-04-05T16:25:39Z","published":"2024-04-05T16:25:39Z","title":"Deep-learning Segmentation of Small Volumes in CT images for\n Radiotherapy Treatment Planning","summary":" Our understanding of organs at risk is progressing to include physical small\ntissues such as coronary arteries and the radiosensitivities of many small\norgans and tissues are high. Therefore, the accurate segmentation of small\nvolumes in external radiotherapy is crucial to protect them from\nover-irradiation. Moreover, with the development of the particle therapy and\non-board imaging, the treatment becomes more accurate and precise. The purpose\nof this work is to optimize organ segmentation algorithms for small organs. We\nused 50 three-dimensional (3-D) computed tomography (CT) head and neck images\nfrom StructSeg2019 challenge to develop a general-purpose V-Net model to\nsegment 20 organs in the head and neck region. We applied specific strategies\nto improve the segmentation accuracy of the small volumes in this anatomical\nregion, i.e., the lens of the eye. Then, we used 17 additional head images from\nOSF healthcare to validate the robustness of the V Net model optimized for\nsmall-volume segmentation. With the study of the StructSeg2019 images, we found\nthat the optimization of the image normalization range and classification\nthreshold yielded a segmentation improvement of the lens of the eye of\napproximately 50%, compared to the use of the V-Net not optimized for small\nvolumes. We used the optimized model to segment 17 images acquired using\nheterogeneous protocols. We obtained comparable Dice coefficient values for the\nclinical and StructSeg2019 images (0.61 plus/minus 0.07 and 0.58 plus/minus\n0.10 for the left and right lens of the eye, respectively)\n","authors":["Jianxin Zhou","Kadishe Fejza","Massimiliano Salvatori","Daniele Della Latta","Gregory M. Hermann","Angela Di Fulvio"],"pdf_url":"https://arxiv.org/pdf/2404.04202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01482v4","updated":"2024-04-05T16:11:19Z","published":"2024-03-03T11:24:16Z","title":"EAGLE: Eigen Aggregation Learning for Object-Centric Unsupervised\n Semantic Segmentation","summary":" Semantic segmentation has innately relied on extensive pixel-level annotated\ndata, leading to the emergence of unsupervised methodologies. Among them,\nleveraging self-supervised Vision Transformers for unsupervised semantic\nsegmentation (USS) has been making steady progress with expressive deep\nfeatures. 
Yet, for semantically segmenting images with complex objects, a\npredominant challenge remains: the lack of explicit object-level semantic\nencoding in patch-level features. This technical limitation often leads to\ninadequate segmentation of complex objects with diverse structures. To address\nthis gap, we present a novel approach, EAGLE, which emphasizes object-centric\nrepresentation learning for unsupervised semantic segmentation. Specifically,\nwe introduce EiCue, a spectral technique providing semantic and structural cues\nthrough an eigenbasis derived from the semantic similarity matrix of deep image\nfeatures and color affinity from an image. Further, by incorporating our\nobject-centric contrastive loss with EiCue, we guide our model to learn\nobject-level representations with intra- and inter-image object-feature\nconsistency, thereby enhancing semantic accuracy. Extensive experiments on\nCOCO-Stuff, Cityscapes, and Potsdam-3 datasets demonstrate the state-of-the-art\nUSS results of EAGLE with accurate and consistent semantic segmentation across\ncomplex scenes.\n","authors":["Chanyoung Kim","Woojung Han","Dayun Ju","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.01482v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00185v2","updated":"2024-04-05T16:10:44Z","published":"2024-03-29T22:51:45Z","title":"On Inherent Adversarial Robustness of Active Vision Systems","summary":" Current Deep Neural Networks are vulnerable to adversarial examples, which\nalter their predictions by adding carefully crafted noise. Since human eyes are\nrobust to such inputs, it is possible that the vulnerability stems from the\nstandard way of processing inputs in one shot by processing every pixel with\nthe same importance. In contrast, neuroscience suggests that the human vision\nsystem can differentiate salient features by (1) switching between multiple\nfixation points (saccades) and (2) processing the surrounding with a\nnon-uniform external resolution (foveation). In this work, we advocate that the\nintegration of such active vision mechanisms into current deep learning systems\ncan offer robustness benefits. Specifically, we empirically demonstrate the\ninherent robustness of two active vision methods - GFNet and FALcon - under a\nblack box threat model. By learning and inferencing based on downsampled\nglimpses obtained from multiple distinct fixation points within an input, we\nshow that these active methods achieve (2-3) times greater robustness compared\nto a standard passive convolutional network under state-of-the-art adversarial\nattacks. More importantly, we provide illustrative and interpretable\nvisualization analysis that demonstrates how performing inference from distinct\nfixation points makes active vision methods less vulnerable to malicious\ninputs.\n","authors":["Amitangshu Mukherjee","Timur Ibrayev","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2404.00185v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10427v2","updated":"2024-04-05T16:04:40Z","published":"2024-03-15T16:00:04Z","title":"SWAG: Splatting in the Wild images with Appearance-conditioned Gaussians","summary":" Implicit neural representation methods have shown impressive advancements in\nlearning 3D scenes from unstructured in-the-wild photo collections but are\nstill limited by the large computational cost of volumetric rendering. 
More\nrecently, 3D Gaussian Splatting emerged as a much faster alternative with\nsuperior rendering quality and training efficiency, especially for small-scale\nand object-centric scenarios. Nevertheless, this technique suffers from poor\nperformance on unstructured in-the-wild data. To tackle this, we extend over 3D\nGaussian Splatting to handle unstructured image collections. We achieve this by\nmodeling appearance to seize photometric variations in the rendered images.\nAdditionally, we introduce a new mechanism to train transient Gaussians to\nhandle the presence of scene occluders in an unsupervised manner. Experiments\non diverse photo collection scenes and multi-pass acquisition of outdoor\nlandmarks show the effectiveness of our method over prior works achieving\nstate-of-the-art results with improved efficiency.\n","authors":["Hiba Dahmani","Moussab Bennehar","Nathan Piasco","Luis Roldao","Dzmitry Tsishkou"],"pdf_url":"https://arxiv.org/pdf/2403.10427v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04179v1","updated":"2024-04-05T15:48:36Z","published":"2024-04-05T15:48:36Z","title":"SCAResNet: A ResNet Variant Optimized for Tiny Object Detection in\n Transmission and Distribution Towers","summary":" Traditional deep learning-based object detection networks often resize images\nduring the data preprocessing stage to achieve a uniform size and scale in the\nfeature map. Resizing is done to facilitate model propagation and fully\nconnected classification. However, resizing inevitably leads to object\ndeformation and loss of valuable information in the images. This drawback\nbecomes particularly pronounced for tiny objects like distribution towers with\nlinear shapes and few pixels. To address this issue, we propose abandoning the\nresizing operation. Instead, we introduce Positional-Encoding Multi-head\nCriss-Cross Attention. This allows the model to capture contextual information\nand learn from multiple representation subspaces, effectively enriching the\nsemantics of distribution towers. Additionally, we enhance Spatial Pyramid\nPooling by reshaping three pooled feature maps into a new unified one while\nalso reducing the computational burden. This approach allows images of\ndifferent sizes and scales to generate feature maps with uniform dimensions and\ncan be employed in feature map propagation. Our SCAResNet incorporates these\naforementioned improvements into the backbone network ResNet. We evaluated our\nSCAResNet using the Electric Transmission and Distribution Infrastructure\nImagery dataset from Duke University. Without any additional tricks, we\nemployed various object detection models with Gaussian Receptive Field based\nLabel Assignment as the baseline. When incorporating the SCAResNet into the\nbaseline model, we achieved a 2.1% improvement in mAPs. This demonstrates the\nadvantages of our SCAResNet in detecting transmission and distribution towers\nand its value in tiny object detection. The source code is available at\nhttps://github.com/LisavilaLee/SCAResNet_mmdet.\n","authors":["Weile Li","Muqing Shi","Zhonghua Hong"],"pdf_url":"https://arxiv.org/pdf/2404.04179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09915v2","updated":"2024-04-05T15:45:48Z","published":"2023-07-19T11:35:21Z","title":"Embedded Heterogeneous Attention Transformer for Cross-lingual Image\n Captioning","summary":" Cross-lingual image captioning is a challenging task that requires addressing\nboth cross-lingual and cross-modal obstacles in multimedia analysis. 
The\ncrucial issue in this task is to model the global and the local matching\nbetween the image and different languages. Existing cross-modal embedding\nmethods based on the transformer architecture oversee the local matching\nbetween the image region and monolingual words, especially when dealing with\ndiverse languages. To overcome these limitations, we propose an Embedded\nHeterogeneous Attention Transformer (EHAT) to establish cross-domain\nrelationships and local correspondences between images and different languages\nby using a heterogeneous network. EHAT comprises Masked Heterogeneous\nCross-attention (MHCA), Heterogeneous Attention Reasoning Network (HARN), and\nHeterogeneous Co-attention (HCA). The HARN serves as the core network and it\ncaptures cross-domain relationships by leveraging visual bounding box\nrepresentation features to connect word features from two languages and to\nlearn heterogeneous maps. MHCA and HCA facilitate cross-domain integration in\nthe encoder through specialized heterogeneous attention mechanisms, enabling a\nsingle model to generate captions in two languages. We evaluate our approach on\nthe MSCOCO dataset to generate captions in English and Chinese, two languages\nthat exhibit significant differences in their language families. The\nexperimental results demonstrate the superior performance of our method\ncompared to existing advanced monolingual methods. Our proposed EHAT framework\neffectively addresses the challenges of cross-lingual image captioning, paving\nthe way for improved multilingual image analysis and understanding.\n","authors":["Zijie Song","Zhenzhen Hu","Yuanen Zhou","Ye Zhao","Richang Hong","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2307.09915v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07087v2","updated":"2024-04-05T15:42:13Z","published":"2024-02-11T02:34:42Z","title":"Self-Correcting Self-Consuming Loops for Generative Model Training","summary":" As synthetic data becomes higher quality and proliferates on the internet,\nmachine learning models are increasingly trained on a mix of human- and\nmachine-generated data. Despite the successful stories of using synthetic data\nfor representation learning, using synthetic data for generative model training\ncreates \"self-consuming loops\" which may lead to training instability or even\ncollapse, unless certain conditions are met. Our paper aims to stabilize\nself-consuming generative model training. Our theoretical results demonstrate\nthat by introducing an idealized correction function, which maps a data point\nto be more likely under the true data distribution, self-consuming loops can be\nmade exponentially more stable. We then propose self-correction functions,\nwhich rely on expert knowledge (e.g. the laws of physics programmed in a\nsimulator), and aim to approximate the idealized corrector automatically and at\nscale. We empirically validate the effectiveness of self-correcting\nself-consuming loops on the challenging human motion synthesis task, and\nobserve that it successfully avoids model collapse, even when the ratio of\nsynthetic data to real data is as high as 100%.\n","authors":["Nate Gillman","Michael Freeman","Daksh Aggarwal","Chia-Hong Hsu","Calvin Luo","Yonglong Tian","Chen Sun"],"pdf_url":"https://arxiv.org/pdf/2402.07087v2.pdf","comment":"This new version contains updated mathematical results (c.f. 
Remark\n 4.4), as well as experiments for an additional generative modeling task.\n Paper under submission; code is available at\n https://nategillman.com/sc-sc.html"},{"id":"http://arxiv.org/abs/2311.08046v3","updated":"2024-04-05T15:21:09Z","published":"2023-11-14T10:11:36Z","title":"Chat-UniVi: Unified Visual Representation Empowers Large Language Models\n with Image and Video Understanding","summary":" Large language models have demonstrated impressive universal capabilities\nacross a wide range of open-ended tasks and have extended their utility to\nencompass multimodal conversations. However, existing methods encounter\nchallenges in effectively handling both image and video understanding,\nparticularly with limited visual tokens. In this work, we introduce Chat-UniVi,\na Unified Vision-language model capable of comprehending and engaging in\nconversations involving images and videos through a unified visual\nrepresentation. Specifically, we employ a set of dynamic visual tokens to\nuniformly represent images and videos. This representation framework empowers\nthe model to efficiently utilize a limited number of visual tokens to\nsimultaneously capture the spatial details necessary for images and the\ncomprehensive temporal relationship required for videos. Moreover, we leverage\na multi-scale representation, enabling the model to perceive both high-level\nsemantic concepts and low-level visual details. Notably, Chat-UniVi is trained\non a mixed dataset containing both images and videos, allowing direct\napplication to tasks involving both mediums without requiring any\nmodifications. Extensive experimental results demonstrate that Chat-UniVi\nconsistently outperforms even existing methods exclusively designed for either\nimages or videos. Code is available at\nhttps://github.com/PKU-YuanGroup/Chat-UniVi.\n","authors":["Peng Jin","Ryuichi Takanobu","Wancai Zhang","Xiaochun Cao","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2311.08046v3.pdf","comment":"Accepted by CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2404.04159v1","updated":"2024-04-05T15:11:09Z","published":"2024-04-05T15:11:09Z","title":"Noisy Label Processing for Classification: A Survey","summary":" In recent years, deep neural networks (DNNs) have gained remarkable\nachievement in computer vision tasks, and the success of DNNs often depends\ngreatly on the richness of data. However, the acquisition process of data and\nhigh-quality ground truth requires a lot of manpower and money. In the long,\ntedious process of data annotation, annotators are prone to make mistakes,\nresulting in incorrect labels of images, i.e., noisy labels. The emergence of\nnoisy labels is inevitable. Moreover, since research shows that DNNs can easily\nfit noisy labels, the existence of noisy labels will cause significant damage\nto the model training process. Therefore, it is crucial to combat noisy labels\nfor computer vision tasks, especially for classification tasks. In this survey,\nwe first comprehensively review the evolution of different deep learning\napproaches for noisy label combating in the image classification task. In\naddition, we also review different noise patterns that have been proposed to\ndesign robust algorithms. Furthermore, we explore the inner pattern of\nreal-world label noise and propose an algorithm to generate a synthetic label\nnoise pattern guided by real-world data. 
We test the algorithm on the\nwell-known real-world dataset CIFAR-10N to form a new real-world data-guided\nsynthetic benchmark and evaluate some typical noise-robust methods on the\nbenchmark.\n","authors":["Mengting Li","Chuang Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.04159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04155v1","updated":"2024-04-05T15:04:57Z","published":"2024-04-05T15:04:57Z","title":"MarsSeg: Mars Surface Semantic Segmentation with Multi-level Extractor\n and Connector","summary":" The segmentation and interpretation of the Martian surface play a pivotal\nrole in Mars exploration, providing essential data for the trajectory planning\nand obstacle avoidance of rovers. However, the complex topography, similar\nsurface features, and the lack of extensive annotated data pose significant\nchallenges to the high-precision semantic segmentation of the Martian surface.\nTo address these challenges, we propose a novel encoder-decoder based Mars\nsegmentation network, termed MarsSeg. Specifically, we employ an\nencoder-decoder structure with a minimized number of down-sampling layers to\npreserve local details. To facilitate a high-level semantic understanding\nacross the shadow multi-level feature maps, we introduce a feature enhancement\nconnection layer situated between the encoder and decoder. This layer\nincorporates Mini Atrous Spatial Pyramid Pooling (Mini-ASPP), Polarized\nSelf-Attention (PSA), and Strip Pyramid Pooling Module (SPPM). The Mini-ASPP\nand PSA are specifically designed for shadow feature enhancement, thereby\nenabling the expression of local details and small objects. Conversely, the\nSPPM is employed for deep feature enhancement, facilitating the extraction of\nhigh-level semantic category-related information. Experimental results derived\nfrom the Mars-Seg and AI4Mars datasets substantiate that the proposed MarsSeg\noutperforms other state-of-the-art methods in segmentation performance,\nvalidating the efficacy of each proposed component.\n","authors":["Junbo Li","Keyan Chen","Gengju Tian","Lu Li","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2404.04155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14239v2","updated":"2024-04-05T15:00:58Z","published":"2023-12-21T18:59:53Z","title":"PlatoNeRF: 3D Reconstruction in Plato's Cave via Single-View Two-Bounce\n Lidar","summary":" 3D reconstruction from a single-view is challenging because of the ambiguity\nfrom monocular cues and lack of information about occluded regions. Neural\nradiance fields (NeRF), while popular for view synthesis and 3D reconstruction,\nare typically reliant on multi-view images. Existing methods for single-view 3D\nreconstruction with NeRF rely on either data priors to hallucinate views of\noccluded regions, which may not be physically accurate, or shadows observed by\nRGB cameras, which are difficult to detect in ambient light and low albedo\nbackgrounds. We propose using time-of-flight data captured by a single-photon\navalanche diode to overcome these limitations. Our method models two-bounce\noptical paths with NeRF, using lidar transient data for supervision. By\nleveraging the advantages of both NeRF and two-bounce light measured by lidar,\nwe demonstrate that we can reconstruct visible and occluded geometry without\ndata priors or reliance on controlled ambient lighting or scene albedo. In\naddition, we demonstrate improved generalization under practical constraints on\nsensor spatial- and temporal-resolution. 
We believe our method is a promising\ndirection as single-photon lidars become ubiquitous on consumer devices, such\nas phones, tablets, and headsets.\n","authors":["Tzofi Klinghoffer","Xiaoyu Xiang","Siddharth Somasundaram","Yuchen Fan","Christian Richardt","Ramesh Raskar","Rakesh Ranjan"],"pdf_url":"https://arxiv.org/pdf/2312.14239v2.pdf","comment":"CVPR 2024. Project Page: https://platonerf.github.io/"},{"id":"http://arxiv.org/abs/2402.01779v2","updated":"2024-04-05T14:57:56Z","published":"2024-02-01T18:05:47Z","title":"Plug-and-Play image restoration with Stochastic deNOising REgularization","summary":" Plug-and-Play (PnP) algorithms are a class of iterative algorithms that\naddress image inverse problems by combining a physical model and a deep neural\nnetwork for regularization. Even if they produce impressive image restoration\nresults, these algorithms rely on a non-standard use of a denoiser on images\nthat are less and less noisy along the iterations, which contrasts with recent\nalgorithms based on Diffusion Models (DM), where the denoiser is applied only\non re-noised images. We propose a new PnP framework, called Stochastic\ndeNOising REgularization (SNORE), which applies the denoiser only on images\nwith noise of the adequate level. It is based on an explicit stochastic\nregularization, which leads to a stochastic gradient descent algorithm to solve\nill-posed inverse problems. A convergence analysis of this algorithm and its\nannealing extension is provided. Experimentally, we prove that SNORE is\ncompetitive with respect to state-of-the-art methods on deblurring and\ninpainting tasks, both quantitatively and qualitatively.\n","authors":["Marien Renaud","Jean Prost","Arthur Leclaire","Nicolas Papadakis"],"pdf_url":"https://arxiv.org/pdf/2402.01779v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02072v3","updated":"2024-04-05T14:48:43Z","published":"2024-04-02T16:20:02Z","title":"EGTR: Extracting Graph from Transformer for Scene Graph Generation","summary":" Scene Graph Generation (SGG) is a challenging task of detecting objects and\npredicting relationships between objects. After DETR was developed, one-stage\nSGG models based on a one-stage object detector have been actively studied.\nHowever, complex modeling is used to predict the relationship between objects,\nand the inherent relationship between object queries learned in the multi-head\nself-attention of the object detector has been neglected. We propose a\nlightweight one-stage SGG model that extracts the relation graph from the\nvarious relationships learned in the multi-head self-attention layers of the\nDETR decoder. By fully utilizing the self-attention by-products, the relation\ngraph can be extracted effectively with a shallow relation extraction head.\nConsidering the dependency of the relation extraction task on the object\ndetection task, we propose a novel relation smoothing technique that adjusts\nthe relation label adaptively according to the quality of the detected objects.\nBy the relation smoothing, the model is trained according to the continuous\ncurriculum that focuses on object detection task at the beginning of training\nand performs multi-task learning as the object detection performance gradually\nimproves. Furthermore, we propose a connectivity prediction task that predicts\nwhether a relation exists between object pairs as an auxiliary task of the\nrelation extraction. We demonstrate the effectiveness and efficiency of our\nmethod for the Visual Genome and Open Image V6 datasets. 
Our code is publicly\navailable at https://github.com/naver-ai/egtr.\n","authors":["Jinbae Im","JeongYeon Nam","Nokyung Park","Hyungmin Lee","Seunghyun Park"],"pdf_url":"https://arxiv.org/pdf/2404.02072v3.pdf","comment":"CVPR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2312.00690v3","updated":"2024-04-05T14:44:27Z","published":"2023-12-01T16:17:16Z","title":"Open-vocabulary object 6D pose estimation","summary":" We introduce the new setting of open-vocabulary object 6D pose estimation, in\nwhich a textual prompt is used to specify the object of interest. In contrast\nto existing approaches, in our setting (i) the object of interest is specified\nsolely through the textual prompt, (ii) no object model (e.g., CAD or video\nsequence) is required at inference, and (iii) the object is imaged from two\nRGBD viewpoints of different scenes. To operate in this setting, we introduce a\nnovel approach that leverages a Vision-Language Model to segment the object of\ninterest from the scenes and to estimate its relative 6D pose. The key of our\napproach is a carefully devised strategy to fuse object-level information\nprovided by the prompt with local image features, resulting in a feature space\nthat can generalize to novel concepts. We validate our approach on a new\nbenchmark based on two popular datasets, REAL275 and Toyota-Light, which\ncollectively encompass 34 object instances appearing in four thousand image\npairs. The results demonstrate that our approach outperforms both a\nwell-established hand-crafted method and a recent deep learning-based baseline\nin estimating the relative 6D pose of objects in different scenes. Code and\ndataset are available at https://jcorsetti.github.io/oryon.\n","authors":["Jaime Corsetti","Davide Boscaini","Changjae Oh","Andrea Cavallaro","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2312.00690v3.pdf","comment":"Camera ready version (CVPR 2024, poster highlight). 21 pages, 15\n figures, 6 tables"},{"id":"http://arxiv.org/abs/2404.04140v1","updated":"2024-04-05T14:39:13Z","published":"2024-04-05T14:39:13Z","title":"Improving Detection in Aerial Images by Capturing Inter-Object\n Relationships","summary":" In many image domains, the spatial distribution of objects in a scene\nexhibits meaningful patterns governed by their semantic relationships. In most\nmodern detection pipelines, however, the detection proposals are processed\nindependently, overlooking the underlying relationships between objects. In\nthis work, we introduce a transformer-based approach to capture these\ninter-object relationships to refine classification and regression outcomes for\ndetected objects. Building on two-stage detectors, we tokenize the region of\ninterest (RoI) proposals to be processed by a transformer encoder. Specific\nspatial and geometric relations are incorporated into the attention weights and\nadaptively modulated and regularized. Experimental results demonstrate that the\nproposed method achieves consistent performance improvement on three benchmarks\nincluding DOTA-v1.0, DOTA-v1.5, and HRSC 2016, especially ranking first on both\nDOTA-v1.5 and HRSC 2016. 
Specifically, our new method has an increase of 1.59\nmAP on DOTA-v1.0, 4.88 mAP on DOTA-v1.5, and 2.1 mAP on HRSC 2016,\nrespectively, compared to the baselines.\n","authors":["Botao Ren","Botian Xu","Yifan Pu","Jingyi Wang","Zhidong Deng"],"pdf_url":"https://arxiv.org/pdf/2404.04140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20092v4","updated":"2024-04-05T14:35:26Z","published":"2023-10-31T00:12:14Z","title":"The Missing U for Efficient Diffusion Models","summary":" Diffusion Probabilistic Models stand as a critical tool in generative\nmodelling, enabling the generation of complex data distributions. This family\nof generative models yields record-breaking performance in tasks such as image\nsynthesis, video generation, and molecule design. Despite their capabilities,\ntheir efficiency, especially in the reverse process, remains a challenge due to\nslow convergence rates and high computational costs. In this paper, we\nintroduce an approach that leverages continuous dynamical systems to design a\nnovel denoising network for diffusion models that is more parameter-efficient,\nexhibits faster convergence, and demonstrates increased noise robustness.\nExperimenting with Denoising Diffusion Probabilistic Models (DDPMs), our\nframework operates with approximately a quarter of the parameters, and $\\sim$\n30\\% of the Floating Point Operations (FLOPs) compared to standard U-Nets in\nDDPMs. Furthermore, our model is notably faster in inference than the baseline\nwhen measured in fair and equal conditions. We also provide a mathematical\nintuition as to why our proposed reverse process is faster as well as a\nmathematical discussion of the empirical tradeoffs in the denoising downstream\ntask. Finally, we argue that our method is compatible with existing performance\nenhancement techniques, enabling further improvements in efficiency, quality,\nand speed.\n","authors":["Sergio Calvo-Ordonez","Chun-Wun Cheng","Jiahao Huang","Lipei Zhang","Guang Yang","Carola-Bibiane Schonlieb","Angelica I Aviles-Rivero"],"pdf_url":"https://arxiv.org/pdf/2310.20092v4.pdf","comment":"23 pages, 14 figures, Accepted at Transactions of Machine Learning\n Research (04/2024)"},{"id":"http://arxiv.org/abs/2304.03560v2","updated":"2024-04-05T14:07:25Z","published":"2023-04-07T09:46:29Z","title":"DualRefine: Self-Supervised Depth and Pose Estimation Through Iterative\n Epipolar Sampling and Refinement Toward Equilibrium","summary":" Self-supervised multi-frame depth estimation achieves high accuracy by\ncomputing matching costs of pixel correspondences between adjacent frames,\ninjecting geometric information into the network. These pixel-correspondence\ncandidates are computed based on the relative pose estimates between the\nframes. Accurate pose predictions are essential for precise matching cost\ncomputation as they influence the epipolar geometry. Furthermore, improved\ndepth estimates can, in turn, be used to align pose estimates.\n Inspired by traditional structure-from-motion (SfM) principles, we propose\nthe DualRefine model, which tightly couples depth and pose estimation through a\nfeedback loop. Our novel update pipeline uses a deep equilibrium model\nframework to iteratively refine depth estimates and a hidden state of feature\nmaps by computing local matching costs based on epipolar geometry. Importantly,\nwe used the refined depth estimates and feature maps to compute pose updates at\neach step. This update in the pose estimates slowly alters the epipolar\ngeometry during the refinement process. 
Experimental results on the KITTI\ndataset demonstrate competitive depth prediction and odometry prediction\nperformance surpassing published self-supervised baselines.\n","authors":["Antyanta Bangunharcana","Ahmed Magd","Kyung-Soo Kim"],"pdf_url":"https://arxiv.org/pdf/2304.03560v2.pdf","comment":"CVPR 2023. Project page:\n https://antabangun.github.io/projects/DualRefine/ Code:\n https://github.com/antabangun/DualRefine"},{"id":"http://arxiv.org/abs/2404.04104v1","updated":"2024-04-05T14:00:07Z","published":"2024-04-05T14:00:07Z","title":"3D Facial Expressions through Analysis-by-Neural-Synthesis","summary":" While existing methods for 3D face reconstruction from in-the-wild images\nexcel at recovering the overall face shape, they commonly miss subtle, extreme,\nasymmetric, or rarely observed expressions. We improve upon these methods with\nSMIRK (Spatial Modeling for Image-based Reconstruction of Kinesics), which\nfaithfully reconstructs expressive 3D faces from images. We identify two key\nlimitations in existing methods: shortcomings in their self-supervised training\nformulation, and a lack of expression diversity in the training images. For\ntraining, most methods employ differentiable rendering to compare a predicted\nface mesh with the input image, along with a plethora of additional loss\nfunctions. This differentiable rendering loss not only has to provide\nsupervision to optimize for 3D face geometry, camera, albedo, and lighting,\nwhich is an ill-posed optimization problem, but the domain gap between\nrendering and input image further hinders the learning process. Instead, SMIRK\nreplaces the differentiable rendering with a neural rendering module that,\ngiven the rendered predicted mesh geometry, and sparsely sampled pixels of the\ninput image, generates a face image. As the neural rendering gets color\ninformation from sampled image pixels, supervising with neural rendering-based\nreconstruction loss can focus solely on the geometry. Further, it enables us to\ngenerate images of the input identity with varying expressions while training.\nThese are then utilized as input to the reconstruction model and used as\nsupervision with ground truth geometry. This effectively augments the training\ndata and enhances the generalization for diverse expressions. Our qualitative,\nquantitative and particularly our perceptual evaluations demonstrate that SMIRK\nachieves the new state-of-the art performance on accurate expression\nreconstruction. Project webpage: https://georgeretsi.github.io/smirk/.\n","authors":["George Retsinas","Panagiotis P. Filntisis","Radek Danecek","Victoria F. Abrevaya","Anastasios Roussos","Timo Bolkart","Petros Maragos"],"pdf_url":"https://arxiv.org/pdf/2404.04104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02702v2","updated":"2024-04-05T13:49:17Z","published":"2023-12-05T12:04:34Z","title":"Neural Sign Actors: A diffusion model for 3D sign language production\n from text","summary":" Sign Languages (SL) serve as the primary mode of communication for the Deaf\nand Hard of Hearing communities. Deep learning methods for SL recognition and\ntranslation have achieved promising results. However, Sign Language Production\n(SLP) poses a challenge as the generated motions must be realistic and have\nprecise semantic meaning. Most SLP methods rely on 2D data, which hinders their\nrealism. In this work, a diffusion-based SLP model is trained on a curated\nlarge-scale dataset of 4D signing avatars and their corresponding text\ntranscripts. 
The proposed method can generate dynamic sequences of 3D avatars\nfrom an unconstrained domain of discourse using a diffusion process formed on a\nnovel and anatomically informed graph neural network defined on the SMPL-X body\nskeleton. Through quantitative and qualitative experiments, we show that the\nproposed method considerably outperforms previous methods of SLP. This work\nmakes an important step towards realistic neural sign avatars, bridging the\ncommunication gap between Deaf and hearing communities.\n","authors":["Vasileios Baltatzis","Rolandos Alexandros Potamias","Evangelos Ververas","Guanxiong Sun","Jiankang Deng","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2312.02702v2.pdf","comment":"Accepted at CVPR 2024, Project page:\n https://baltatzisv.github.io/neural-sign-actors/"},{"id":"http://arxiv.org/abs/2312.06420v2","updated":"2024-04-05T13:45:11Z","published":"2023-12-11T14:43:23Z","title":"Localization Is All You Evaluate: Data Leakage in Online Mapping\n Datasets and How to Fix It","summary":" The task of online mapping is to predict a local map using current sensor\nobservations, e.g. from lidar and camera, without relying on a pre-built map.\nState-of-the-art methods are based on supervised learning and are trained\npredominantly using two datasets: nuScenes and Argoverse 2. However, these\ndatasets revisit the same geographic locations across training, validation, and\ntest sets. Specifically, over $80$% of nuScenes and $40$% of Argoverse 2\nvalidation and test samples are less than $5$ m from a training sample. At test\ntime, the methods are thus evaluated more on how well they localize within a\nmemorized implicit map built from the training data than on extrapolating to\nunseen locations. Naturally, this data leakage causes inflated performance\nnumbers and we propose geographically disjoint data splits to reveal the true\nperformance in unseen environments. Experimental results show that methods\nperform considerably worse, some dropping more than $45$ mAP, when trained and\nevaluated on proper data splits. Additionally, a reassessment of prior design\nchoices reveals diverging conclusions from those based on the original split.\nNotably, the impact of lifting methods and the support from auxiliary tasks\n(e.g., depth supervision) on performance appears less substantial or follows a\ndifferent trajectory than previously perceived. Splits can be found at\nhttps://github.com/LiljaAdam/geographical-splits\n","authors":["Adam Lilja","Junsheng Fu","Erik Stenborg","Lars Hammarstrand"],"pdf_url":"https://arxiv.org/pdf/2312.06420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04095v1","updated":"2024-04-05T13:44:39Z","published":"2024-04-05T13:44:39Z","title":"Dynamic Prompt Optimizing for Text-to-Image Generation","summary":" Text-to-image generative models, specifically those based on diffusion models\nlike Imagen and Stable Diffusion, have made substantial advancements. Recently,\nthere has been a surge of interest in the delicate refinement of text prompts.\nUsers assign weights or alter the injection time steps of certain words in the\ntext prompts to improve the quality of generated images. However, the success\nof fine-control prompts depends on the accuracy of the text prompts and the\ncareful selection of weights and time steps, which requires significant manual\nintervention. To address this, we introduce the \\textbf{P}rompt\n\\textbf{A}uto-\\textbf{E}diting (PAE) method. 
Besides refining the original\nprompts for image generation, we further employ an online reinforcement\nlearning strategy to explore the weights and injection time steps of each word,\nleading to the dynamic fine-control prompts. The reward function during\ntraining encourages the model to consider aesthetic score, semantic\nconsistency, and user preferences. Experimental results demonstrate that our\nproposed method effectively improves the original prompts, generating visually\nmore appealing images while maintaining semantic alignment. Code is available\nat https://github.com/Mowenyii/PAE.\n","authors":["Wenyi Mo","Tianyu Zhang","Yalong Bai","Bing Su","Ji-Rong Wen","Qing Yang"],"pdf_url":"https://arxiv.org/pdf/2404.04095v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/1902.06634v4","updated":"2024-04-05T13:03:08Z","published":"2019-02-18T16:15:25Z","title":"Contextual Encoder-Decoder Network for Visual Saliency Prediction","summary":" Predicting salient regions in natural images requires the detection of\nobjects that are present in a scene. To develop robust representations for this\nchallenging task, high-level visual features at multiple spatial scales must be\nextracted and augmented with contextual information. However, existing models\naimed at explaining human fixation maps do not incorporate such a mechanism\nexplicitly. Here we propose an approach based on a convolutional neural network\npre-trained on a large-scale image classification task. The architecture forms\nan encoder-decoder structure and includes a module with multiple convolutional\nlayers at different dilation rates to capture multi-scale features in parallel.\nMoreover, we combine the resulting representations with global scene\ninformation for accurately predicting visual saliency. Our model achieves\ncompetitive and consistent results across multiple evaluation metrics on two\npublic saliency benchmarks and we demonstrate the effectiveness of the\nsuggested approach on five datasets and selected examples. Compared to state of\nthe art approaches, the network is based on a lightweight image classification\nbackbone and hence presents a suitable choice for applications with limited\ncomputational resources, such as (virtual) robotic systems, to estimate human\nfixations across complex natural scenes.\n","authors":["Alexander Kroner","Mario Senden","Kurt Driessens","Rainer Goebel"],"pdf_url":"https://arxiv.org/pdf/1902.06634v4.pdf","comment":"Updated contact information"},{"id":"http://arxiv.org/abs/2404.04072v1","updated":"2024-04-05T12:58:07Z","published":"2024-04-05T12:58:07Z","title":"Label Propagation for Zero-shot Classification with Vision-Language\n Models","summary":" Vision-Language Models (VLMs) have demonstrated impressive performance on\nzero-shot classification, i.e. classification when provided merely with a list\nof class names. In this paper, we tackle the case of zero-shot classification\nin the presence of unlabeled data. We leverage the graph structure of the\nunlabeled data and introduce ZLaP, a method based on label propagation (LP)\nthat utilizes geodesic distances for classification. We tailor LP to graphs\ncontaining both text and image features and further propose an efficient method\nfor performing inductive inference based on a dual solution and a\nsparsification step. We perform extensive experiments to evaluate the\neffectiveness of our method on 14 common datasets and show that ZLaP\noutperforms the latest related works. 
Code:\nhttps://github.com/vladan-stojnic/ZLaP\n","authors":["Vladan Stojnić","Yannis Kalantidis","Giorgos Tolias"],"pdf_url":"https://arxiv.org/pdf/2404.04072v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.09124v2","updated":"2024-04-05T12:56:31Z","published":"2024-03-14T06:16:21Z","title":"Single Domain Generalization for Crowd Counting","summary":" Due to its promising results, density map regression has been widely employed\nfor image-based crowd counting. The approach, however, often suffers from\nsevere performance degradation when tested on data from unseen scenarios, the\nso-called \"domain shift\" problem. To address the problem, we investigate in\nthis work single domain generalization (SDG) for crowd counting. The existing\nSDG approaches are mainly for image classification and segmentation, and can\nhardly be extended to our case due to its regression nature and label ambiguity\n(i.e., ambiguous pixel-level ground truths). We propose MPCount, a novel\neffective SDG approach even for narrow source distribution. MPCount stores\ndiverse density values for density map regression and reconstructs\ndomain-invariant features by means of only one memory bank, a content error\nmask and attention consistency loss. By partitioning the image into grids, it\nemploys patch-wise classification as an auxiliary task to mitigate label\nambiguity. Through extensive experiments on different datasets, MPCount is\nshown to significantly improve counting accuracy compared to the state of the\nart under diverse scenarios unobserved in the training data characterized by\nnarrow source distribution. Code is available at\nhttps://github.com/Shimmer93/MPCount.\n","authors":["Zhuoxuan Peng","S. -H. Gary Chan"],"pdf_url":"https://arxiv.org/pdf/2403.09124v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2402.19340v2","updated":"2024-04-05T12:49:38Z","published":"2024-02-29T16:46:49Z","title":"One model to use them all: Training a segmentation model with\n complementary datasets","summary":" Understanding a surgical scene is crucial for computer-assisted surgery\nsystems to provide any intelligent assistance functionality. One way of\nachieving this scene understanding is via scene segmentation, where every pixel\nof a frame is classified and therefore identifies the visible structures and\ntissues. Progress on fully segmenting surgical scenes has been made using\nmachine learning. However, such models require large amounts of annotated\ntraining data, containing examples of all relevant object classes. Such fully\nannotated datasets are hard to create, as every pixel in a frame needs to be\nannotated by medical experts and, therefore, are rarely available. In this\nwork, we propose a method to combine multiple partially annotated datasets,\nwhich provide complementary annotations, into one model, enabling better scene\nsegmentation and the use of multiple readily available datasets. Our method\naims to combine available data with complementary labels by leveraging mutual\nexclusive properties to maximize information. Specifically, we propose to use\npositive annotations of other classes as negative samples and to exclude\nbackground pixels of binary annotations, as we cannot tell if they contain a\nclass not annotated but predicted by the model. We evaluate our method by\ntraining a DeepLabV3 on the publicly available Dresden Surgical Anatomy\nDataset, which provides multiple subsets of binary segmented anatomical\nstructures. 
Our approach successfully combines 6 classes into one model,\nincreasing the overall Dice Score by 4.4% compared to an ensemble of models\ntrained on the classes individually. By including information on multiple\nclasses, we were able to reduce confusion between stomach and colon by 24%. Our\nresults demonstrate the feasibility of training a model on multiple datasets.\nThis paves the way for future work further alleviating the need for one large,\nfully segmented datasets.\n","authors":["Alexander C. Jenke","Sebastian Bodenstedt","Fiona R. Kolbinger","Marius Distler","Jürgen Weitz","Stefanie Speidel"],"pdf_url":"https://arxiv.org/pdf/2402.19340v2.pdf","comment":"Accepted at IPCAI 2024; submitted to IJCARS (under revision)"},{"id":"http://arxiv.org/abs/2404.03443v2","updated":"2024-04-05T12:44:39Z","published":"2024-04-04T13:43:11Z","title":"Part-Attention Based Model Make Occluded Person Re-Identification\n Stronger","summary":" The goal of occluded person re-identification (ReID) is to retrieve specific\npedestrians in occluded situations. However, occluded person ReID still suffers\nfrom background clutter and low-quality local feature representations, which\nlimits model performance. In our research, we introduce a new framework called\nPAB-ReID, which is a novel ReID model incorporating part-attention mechanisms\nto tackle the aforementioned issues effectively. Firstly, we introduce the\nhuman parsing label to guide the generation of more accurate human part\nattention maps. In addition, we propose a fine-grained feature focuser for\ngenerating fine-grained human local feature representations while suppressing\nbackground interference. Moreover, We also design a part triplet loss to\nsupervise the learning of human local features, which optimizes\nintra/inter-class distance. We conducted extensive experiments on specialized\nocclusion and regular ReID datasets, showcasing that our approach outperforms\nthe existing state-of-the-art methods.\n","authors":["Zhihao Chen","Yiyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2404.03443v2.pdf","comment":"Accepted By International Joint Conference on Neural Networks 2024"},{"id":"http://arxiv.org/abs/2403.06546v2","updated":"2024-04-05T12:35:06Z","published":"2024-03-11T09:46:41Z","title":"OMH: Structured Sparsity via Optimally Matched Hierarchy for\n Unsupervised Semantic Segmentation","summary":" Unsupervised Semantic Segmentation (USS) involves segmenting images without\nrelying on predefined labels, aiming to alleviate the burden of extensive human\nlabeling. Existing methods utilize features generated by self-supervised models\nand specific priors for clustering. However, their clustering objectives are\nnot involved in the optimization of the features during training. Additionally,\ndue to the lack of clear class definitions in USS, the resulting segments may\nnot align well with the clustering objective. In this paper, we introduce a\nnovel approach called Optimally Matched Hierarchy (OMH) to simultaneously\naddress the above issues. The core of our method lies in imposing structured\nsparsity on the feature space, which allows the features to encode information\nwith different levels of granularity. The structure of this sparsity stems from\nour hierarchy (OMH). To achieve this, we learn a soft but sparse hierarchy\namong parallel clusters through Optimal Transport. Our OMH yields better\nunsupervised segmentation performance compared to existing USS methods. 
Our\nextensive experiments demonstrate the benefits of OMH when utilizing our\ndifferentiable paradigm. We will make our code publicly available.\n","authors":["Baran Ozaydin","Tong Zhang","Deblina Bhattacharjee","Sabine Süsstrunk","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2403.06546v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2404.04057v1","updated":"2024-04-05T12:30:19Z","published":"2024-04-05T12:30:19Z","title":"Score identity Distillation: Exponentially Fast Distillation of\n Pretrained Diffusion Models for One-Step Generation","summary":" We introduce Score identity Distillation (SiD), an innovative data-free\nmethod that distills the generative capabilities of pretrained diffusion models\ninto a single-step generator. SiD not only facilitates an exponentially fast\nreduction in Fr\\'echet inception distance (FID) during distillation but also\napproaches or even exceeds the FID performance of the original teacher\ndiffusion models. By reformulating forward diffusion processes as semi-implicit\ndistributions, we leverage three score-related identities to create an\ninnovative loss mechanism. This mechanism achieves rapid FID reduction by\ntraining the generator using its own synthesized images, eliminating the need\nfor real data or reverse-diffusion-based generation, all accomplished within\nsignificantly shortened generation time. Upon evaluation across four benchmark\ndatasets, the SiD algorithm demonstrates high iteration efficiency during\ndistillation and surpasses competing distillation approaches, whether they are\none-step or few-step, data-free, or dependent on training data, in terms of\ngeneration quality. This achievement not only redefines the benchmarks for\nefficiency and effectiveness in diffusion distillation but also in the broader\nfield of diffusion-based generation. Our PyTorch implementation will be\npublicly accessible on GitHub.\n","authors":["Mingyuan Zhou","Huangjie Zheng","Zhendong Wang","Mingzhang Yin","Hai Huang"],"pdf_url":"https://arxiv.org/pdf/2404.04057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13377v2","updated":"2024-04-05T12:14:50Z","published":"2023-12-20T19:08:49Z","title":"SADA: Semantic adversarial unsupervised domain adaptation for Temporal\n Action Localization","summary":" Temporal Action Localization (TAL) is a complex task that poses relevant\nchallenges, particularly when attempting to generalize on new -- unseen --\ndomains in real-world applications. These scenarios, despite realistic, are\noften neglected in the literature, exposing these solutions to important\nperformance degradation. In this work, we tackle this issue by introducing, for\nthe first time, an approach for Unsupervised Domain Adaptation (UDA) in sparse\nTAL, which we refer to as Semantic Adversarial unsupervised Domain Adaptation\n(SADA). Our contributions are threefold: (1) we pioneer the development of a\ndomain adaptation model that operates on realistic sparse action detection\nbenchmarks; (2) we tackle the limitations of global-distribution alignment\ntechniques by introducing a novel adversarial loss that is sensitive to local\nclass distributions, ensuring finer-grained adaptation; and (3) we present a\nnovel set of benchmarks based on EpicKitchens100 and CharadesEgo, that evaluate\nmultiple domain shifts in a comprehensive manner. 
Our experiments indicate that\nSADA improves the adaptation across domains when compared to fully supervised\nstate-of-the-art and alternative UDA methods, attaining a performance boost of\nup to 6.14% mAP.\n","authors":["David Pujol-Perich","Albert Clapés","Sergio Escalera"],"pdf_url":"https://arxiv.org/pdf/2312.13377v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10344v2","updated":"2024-04-05T12:14:15Z","published":"2024-03-15T14:31:17Z","title":"SCILLA: SurfaCe Implicit Learning for Large Urban Area, a volumetric\n hybrid solution","summary":" Neural implicit surface representation methods have recently shown impressive\n3D reconstruction results. However, existing solutions struggle to reconstruct\nurban outdoor scenes due to their large, unbounded, and highly detailed nature.\nHence, to achieve accurate reconstructions, additional supervision data such as\nLiDAR, strong geometric priors, and long training times are required. To tackle\nsuch issues, we present SCILLA, a new hybrid implicit surface learning method\nto reconstruct large driving scenes from 2D images. SCILLA's hybrid\narchitecture models two separate implicit fields: one for the volumetric\ndensity and another for the signed distance to the surface. To accurately\nrepresent urban outdoor scenarios, we introduce a novel volume-rendering\nstrategy that relies on self-supervised probabilistic density estimation to\nsample points near the surface and transition progressively from volumetric to\nsurface representation. Our solution permits a proper and fast initialization\nof the signed distance field without relying on any geometric prior on the\nscene, compared to concurrent methods. By conducting extensive experiments on\nfour outdoor driving datasets, we show that SCILLA can learn an accurate and\ndetailed 3D surface scene representation in various urban scenarios while being\ntwo times faster to train compared to previous state-of-the-art solutions.\n","authors":["Hala Djeghim","Nathan Piasco","Moussab Bennehar","Luis Roldão","Dzmitry Tsishkou","Désiré Sidibé"],"pdf_url":"https://arxiv.org/pdf/2403.10344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06587v2","updated":"2024-04-05T12:10:30Z","published":"2023-12-11T18:19:36Z","title":"QuickQuakeBuildings: Post-earthquake SAR-Optical Dataset for Quick\n Damaged-building Detection","summary":" Quick and automated earthquake-damaged building detection from post-event\nsatellite imagery is crucial, yet it is challenging due to the scarcity of\ntraining data required to develop robust algorithms. This letter presents the\nfirst dataset dedicated to detecting earthquake-damaged buildings from\npost-event very high resolution (VHR) Synthetic Aperture Radar (SAR) and\noptical imagery. Utilizing open satellite imagery and annotations acquired\nafter the 2023 Turkey-Syria earthquakes, we deliver a dataset of coregistered\nbuilding footprints and satellite image patches of both SAR and optical data,\nencompassing more than four thousand buildings. The task of damaged building\ndetection is formulated as a binary image classification problem, that can also\nbe treated as an anomaly detection problem due to extreme class imbalance. We\nprovide baseline methods and results to serve as references for comparison.\nResearchers can utilize this dataset to expedite algorithm development,\nfacilitating the rapid detection of damaged buildings in response to future\nevents. 
The dataset and codes together with detailed explanations and\nvisualization are made publicly available at\n\\url{https://github.com/ya0-sun/PostEQ-SARopt-BuildingDamage}.\n","authors":["Yao Sun","Yi Wang","Michael Eineder"],"pdf_url":"https://arxiv.org/pdf/2312.06587v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04050v1","updated":"2024-04-05T12:09:36Z","published":"2024-04-05T12:09:36Z","title":"No Time to Train: Empowering Non-Parametric Networks for Few-shot 3D\n Scene Segmentation","summary":" To reduce the reliance on large-scale datasets, recent works in 3D\nsegmentation resort to few-shot learning. Current 3D few-shot segmentation\nmethods first pre-train models on 'seen' classes, and then evaluate their\ngeneralization performance on 'unseen' classes. However, the prior pre-training\nstage not only introduces excessive time overhead but also incurs a significant\ndomain gap on 'unseen' classes. To tackle these issues, we propose a\nNon-parametric Network for few-shot 3D Segmentation, Seg-NN, and its Parametric\nvariant, Seg-PN. Without training, Seg-NN extracts dense representations by\nhand-crafted filters and achieves comparable performance to existing parametric\nmodels. Due to the elimination of pre-training, Seg-NN can alleviate the domain\ngap issue and save a substantial amount of time. Based on Seg-NN, Seg-PN only\nrequires training a lightweight QUEry-Support Transferring (QUEST) module,\nwhich enhances the interaction between the support set and query set.\nExperiments suggest that Seg-PN outperforms previous state-of-the-art method by\n+4.19% and +7.71% mIoU on S3DIS and ScanNet datasets respectively, while\nreducing training time by -90%, indicating its effectiveness and efficiency.\n","authors":["Xiangyang Zhu","Renrui Zhang","Bowei He","Ziyu Guo","Jiaming Liu","Han Xiao","Chaoyou Fu","Hao Dong","Peng Gao"],"pdf_url":"https://arxiv.org/pdf/2404.04050v1.pdf","comment":"CVPR Highlight. Code is available at\n https://github.com/yangyangyang127/Seg-NN. arXiv admin note: text overlap\n with arXiv:2308.12961"},{"id":"http://arxiv.org/abs/2404.04040v1","updated":"2024-04-05T11:49:29Z","published":"2024-04-05T11:49:29Z","title":"Dynamic Risk Assessment Methodology with an LDM-based System for Parking\n Scenarios","summary":" This paper describes the methodology for building a dynamic risk assessment\nfor ADAS (Advanced Driving Assistance Systems) algorithms in parking scenarios,\nfusing exterior and interior perception for a better understanding of the scene\nand a more comprehensive risk estimation. This includes the definition of a\ndynamic risk methodology that depends on the situation from inside and outside\nthe vehicle, the creation of a multi-sensor dataset of risk assessment for ADAS\nbenchmarking purposes, and a Local Dynamic Map (LDM) that fuses data from the\nexterior and interior of the car to build an LDM-based Dynamic Risk Assessment\nSystem (DRAS).\n","authors":["Paola Natalia Cañas","Mikel García","Nerea Aranjuelo","Marcos Nieto","Aitor Iglesias","Igor Rodríguez"],"pdf_url":"https://arxiv.org/pdf/2404.04040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04037v1","updated":"2024-04-05T11:45:03Z","published":"2024-04-05T11:45:03Z","title":"InstructHumans: Editing Animated 3D Human Textures with Instructions","summary":" We present InstructHumans, a novel framework for instruction-driven 3D human\ntexture editing. Existing text-based editing methods use Score Distillation\nSampling (SDS) to distill guidance from generative models. 
This work shows that\nnaively using such scores is harmful to editing as they destroy consistency\nwith the source avatar. Instead, we propose an alternate SDS for Editing\n(SDS-E) that selectively incorporates subterms of SDS across diffusion\ntimesteps. We further enhance SDS-E with spatial smoothness regularization and\ngradient-based viewpoint sampling to achieve high-quality edits with sharp and\nhigh-fidelity detailing. InstructHumans significantly outperforms existing 3D\nediting methods, consistent with the initial avatar while faithful to the\ntextual instructions. Project page: https://jyzhu.top/instruct-humans .\n","authors":["Jiayin Zhu","Linlin Yang","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2404.04037v1.pdf","comment":"Project Page: https://jyzhu.top/instruct-humans"},{"id":"http://arxiv.org/abs/2312.00648v3","updated":"2024-04-05T11:31:12Z","published":"2023-12-01T15:20:58Z","title":"SPOT: Self-Training with Patch-Order Permutation for Object-Centric\n Learning with Autoregressive Transformers","summary":" Unsupervised object-centric learning aims to decompose scenes into\ninterpretable object entities, termed slots. Slot-based auto-encoders stand out\nas a prominent method for this task. Within them, crucial aspects include\nguiding the encoder to generate object-specific slots and ensuring the decoder\nutilizes them during reconstruction. This work introduces two novel techniques,\n(i) an attention-based self-training approach, which distills superior\nslot-based attention masks from the decoder to the encoder, enhancing object\nsegmentation, and (ii) an innovative patch-order permutation strategy for\nautoregressive transformers that strengthens the role of slot vectors in\nreconstruction. The effectiveness of these strategies is showcased\nexperimentally. The combined approach significantly surpasses prior slot-based\nautoencoder methods in unsupervised object segmentation, especially with\ncomplex real-world images. We provide the implementation code at\nhttps://github.com/gkakogeorgiou/spot .\n","authors":["Ioannis Kakogeorgiou","Spyros Gidaris","Konstantinos Karantzalos","Nikos Komodakis"],"pdf_url":"https://arxiv.org/pdf/2312.00648v3.pdf","comment":"CVPR 2024 (Highlight). Code: https://github.com/gkakogeorgiou/spot"},{"id":"http://arxiv.org/abs/2404.04026v1","updated":"2024-04-05T11:14:19Z","published":"2024-04-05T11:14:19Z","title":"MM-Gaussian: 3D Gaussian-based Multi-modal Fusion for Localization and\n Reconstruction in Unbounded Scenes","summary":" Localization and mapping are critical tasks for various applications such as\nautonomous vehicles and robotics. The challenges posed by outdoor environments\npresent particular complexities due to their unbounded characteristics. In this\nwork, we present MM-Gaussian, a LiDAR-camera multi-modal fusion system for\nlocalization and mapping in unbounded scenes. Our approach is inspired by the\nrecently developed 3D Gaussians, which demonstrate remarkable capabilities in\nachieving high rendering quality and fast rendering speed. Specifically, our\nsystem fully utilizes the geometric structure information provided by\nsolid-state LiDAR to address the problem of inaccurate depth encountered when\nrelying solely on visual solutions in unbounded, outdoor scenarios.\nAdditionally, we utilize 3D Gaussian point clouds, with the assistance of\npixel-level gradient descent, to fully exploit the color information in photos,\nthereby achieving realistic rendering effects. 
To further bolster the\nrobustness of our system, we designed a relocalization module, which assists in\nreturning to the correct trajectory in the event of a localization failure.\nExperiments conducted in multiple scenarios demonstrate the effectiveness of\nour method.\n","authors":["Chenyang Wu","Yifan Duan","Xinran Zhang","Yu Sheng","Jianmin Ji","Yanyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04026v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.04025v1","updated":"2024-04-05T11:13:59Z","published":"2024-04-05T11:13:59Z","title":"Framework to generate perfusion map from CT and CTA images in patients\n with acute ischemic stroke: A longitudinal and cross-sectional study","summary":" Stroke is a leading cause of disability and death. Effective treatment\ndecisions require early and informative vascular imaging. 4D perfusion imaging\nis ideal but rarely available within the first hour after stroke, whereas plain\nCT and CTA usually are. Hence, we propose a framework to extract a predicted\nperfusion map (PPM) derived from CT and CTA images. In all eighteen patients,\nwe found significantly high spatial similarity (with average Spearman's\ncorrelation = 0.7893) between our predicted perfusion map (PPM) and the T-max\nmap derived from 4D-CTP. Voxelwise correlations between the PPM and National\nInstitutes of Health Stroke Scale (NIHSS) subscores for L/R hand motor, gaze,\nand language on a large cohort of 2,110 subjects reliably mapped symptoms to\nexpected infarct locations. Therefore our PPM could serve as an alternative for\n4D perfusion imaging, if the latter is unavailable, to investigate blood\nperfusion in the first hours after hospital admission.\n","authors":["Chayanin Tangwiriyasakul","Pedro Borges","Stefano Moriconi","Paul Wright","Yee-Haur Mah","James Teo","Parashkev Nachev","Sebastien Ourselin","M. Jorge Cardoso"],"pdf_url":"https://arxiv.org/pdf/2404.04025v1.pdf","comment":"Accepted and presented in SWITCH2023: Stroke Workshop on Imaging and\n Treatment CHallenges (MICCAI 2023, Vancouver Canada)"},{"id":"http://arxiv.org/abs/2404.04007v1","updated":"2024-04-05T10:30:38Z","published":"2024-04-05T10:30:38Z","title":"Neural-Symbolic VideoQA: Learning Compositional Spatio-Temporal\n Reasoning for Real-world Video Question Answering","summary":" Compositional spatio-temporal reasoning poses a significant challenge in the\nfield of video question answering (VideoQA). Existing approaches struggle to\nestablish effective symbolic reasoning structures, which are crucial for\nanswering compositional spatio-temporal questions. To address this challenge,\nwe propose a neural-symbolic framework called Neural-Symbolic VideoQA\n(NS-VideoQA), specifically designed for real-world VideoQA tasks. The\nuniqueness and superiority of NS-VideoQA are two-fold: 1) It proposes a Scene\nParser Network (SPN) to transform static-dynamic video scenes into Symbolic\nRepresentation (SR), structuralizing persons, objects, relations, and action\nchronologies. 2) A Symbolic Reasoning Machine (SRM) is designed for top-down\nquestion decompositions and bottom-up compositional reasonings. Specifically, a\npolymorphic program executor is constructed for internally consistent reasoning\nfrom SR to the final answer. 
As a result, Our NS-VideoQA not only improves the\ncompositional spatio-temporal reasoning in real-world VideoQA task, but also\nenables step-by-step error analysis by tracing the intermediate results.\nExperimental evaluations on the AGQA Decomp benchmark demonstrate the\neffectiveness of the proposed NS-VideoQA framework. Empirical studies further\nconfirm that NS-VideoQA exhibits internal consistency in answering\ncompositional questions and significantly improves the capability of\nspatio-temporal and logical inference for VideoQA tasks.\n","authors":["Lili Liang","Guanglu Sun","Jin Qiu","Lizhong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04007v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20106v2","updated":"2024-04-05T10:29:00Z","published":"2024-03-29T10:40:41Z","title":"Learning Enriched Features via Selective State Spaces Model for\n Efficient Image Deblurring","summary":" Image deblurring aims to restore a high-quality image from its corresponding\nblurred. The emergence of CNNs and Transformers has enabled significant\nprogress. However, these methods often face the dilemma between eliminating\nlong-range degradation perturbations and maintaining computational efficiency.\nWhile the selective state space model (SSM) shows promise in modeling\nlong-range dependencies with linear complexity, it also encounters challenges\nsuch as local pixel forgetting and channel redundancy. To address this issue,\nwe propose an efficient image deblurring network that leverages selective state\nspaces model to aggregate enriched and accurate features. Specifically, we\nintroduce an aggregate local and global information block (ALGBlock) designed\nto effectively capture and integrate both local invariant properties and\nnon-local information. The ALGBlock comprises two primary modules: a module for\ncapturing local and global features (CLGF), and a feature aggregation module\n(FA). The CLGF module is composed of two branches: the global branch captures\nlong-range dependency features via a selective state spaces model, while the\nlocal branch employs simplified channel attention to model local connectivity,\nthereby reducing local pixel forgetting and channel redundancy. In addition, we\ndesign a FA module to accentuate the local part by recalibrating the weight\nduring the aggregation of the two branches for restoration. Experimental\nresults demonstrate that the proposed method outperforms state-of-the-art\napproaches on widely used benchmarks.\n","authors":["Hu Gao","Depeng Dang"],"pdf_url":"https://arxiv.org/pdf/2403.20106v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03999v1","updated":"2024-04-05T10:23:20Z","published":"2024-04-05T10:23:20Z","title":"Finsler-Laplace-Beltrami Operators with Application to Shape Analysis","summary":" The Laplace-Beltrami operator (LBO) emerges from studying manifolds equipped\nwith a Riemannian metric. It is often called the Swiss army knife of geometry\nprocessing as it allows to capture intrinsic shape information and gives rise\nto heat diffusion, geodesic distances, and a multitude of shape descriptors. It\nalso plays a central role in geometric deep learning. In this work, we explore\nFinsler manifolds as a generalization of Riemannian manifolds. We revisit the\nFinsler heat equation and derive a Finsler heat kernel and a\nFinsler-Laplace-Beltrami Operator (FLBO): a novel theoretically justified\nanisotropic Laplace-Beltrami operator (ALBO). 
In experimental evaluations we\ndemonstrate that the proposed FLBO is a valuable alternative to the traditional\nRiemannian-based LBO and ALBOs for spatial filtering and shape correspondence\nestimation. We hope that the proposed Finsler heat kernel and the FLBO will\ninspire further exploration of Finsler geometry in the computer vision\ncommunity.\n","authors":["Simon Weber","Thomas Dagès","Maolin Gao","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2404.03999v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03998v1","updated":"2024-04-05T10:23:10Z","published":"2024-04-05T10:23:10Z","title":"Physics-Inspired Synthesized Underwater Image Dataset","summary":" This paper introduces the physics-inspired synthesized underwater image\ndataset (PHISWID), a dataset tailored for enhancing underwater image processing\nthrough physics-inspired image synthesis. Deep learning approaches to\nunderwater image enhancement typically demand extensive datasets, yet acquiring\npaired clean and degraded underwater ones poses significant challenges. While\nseveral underwater image datasets have been proposed using physics-based\nsynthesis, a publicly accessible collection has been lacking. Additionally,\nmost underwater image synthesis approaches do not intend to reproduce\natmospheric scenes, resulting in incomplete enhancement. PHISWID addresses this\ngap by offering a set of paired ground-truth (atmospheric) and synthetically\ndegraded underwater images, showcasing not only color degradation but also the\noften-neglected effects of marine snow, a composite of organic matter and sand\nparticles that considerably impairs underwater image clarity. The dataset\napplies these degradations to atmospheric RGB-D images, enhancing the dataset's\nrealism and applicability. PHISWID is particularly valuable for training deep\nneural networks in a supervised learning setting and for objectively assessing\nimage quality in benchmark analyses. Our results reveal that even a basic U-Net\narchitecture, when trained with PHISWID, substantially outperforms existing\nmethods in underwater image enhancement. We intend to release PHISWID publicly,\ncontributing a significant resource to the advancement of underwater imaging\ntechnology.\n","authors":["Reina Kaneko","Hiroshi Higashi","Yuichi Tanaka"],"pdf_url":"https://arxiv.org/pdf/2404.03998v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01734v2","updated":"2024-04-05T10:11:27Z","published":"2023-11-03T06:05:36Z","title":"Sculpting Holistic 3D Representation in Contrastive Language-Image-3D\n Pre-training","summary":" Contrastive learning has emerged as a promising paradigm for 3D open-world\nunderstanding, i.e., aligning point cloud representation to image and text\nembedding space individually. In this paper, we introduce MixCon3D, a simple\nyet effective method aiming to sculpt holistic 3D representation in contrastive\nlanguage-image-3D pre-training. In contrast to point cloud only, we develop the\n3D object-level representation from complementary perspectives, e.g.,\nmulti-view rendered images with the point cloud. Then, MixCon3D performs\nlanguage-3D contrastive learning, comprehensively depicting real-world 3D\nobjects and bolstering text alignment. 
Additionally, we pioneer the first\nthorough investigation of various training recipes for the 3D contrastive\nlearning paradigm, building a solid baseline with improved performance.\nExtensive experiments conducted on three representative benchmarks reveal that\nour method significantly improves over the baseline, surpassing the previous\nstate-of-the-art performance on the challenging 1,156-category Objaverse-LVIS\ndataset by 5.7%. The versatility of MixCon3D is showcased in applications such\nas text-to-3D retrieval and point cloud captioning, further evidencing its\nefficacy in diverse scenarios. The code is available at\nhttps://github.com/UCSC-VLAA/MixCon3D.\n","authors":["Yipeng Gao","Zeyu Wang","Wei-Shi Zheng","Cihang Xie","Yuyin Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.01734v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2310.17170v3","updated":"2024-04-05T10:07:24Z","published":"2023-10-26T05:49:44Z","title":"MO-YOLO: End-to-End Multiple-Object Tracking Method with YOLO and\n Decoder","summary":" In the field of multi-object tracking (MOT), recent Transformer based\nend-to-end models like MOTR have demonstrated exceptional performance on\ndatasets such as DanceTracker. However, the computational demands of these\nmodels present challenges in training and deployment. Drawing inspiration from\nsuccessful models like GPT, we present MO-YOLO, an efficient and\ncomputationally frugal end-to-end MOT model. MO-YOLO integrates principles from\nYou Only Look Once (YOLO) and RT-DETR, adopting a decoder-only approach. By\nleveraging the decoder from RT-DETR and architectural components from YOLOv8,\nMO-YOLO achieves high speed, shorter training times, and proficient MOT\nperformance. On the Dancetrack, MO-YOLO not only matches MOTR's performance but\nalso surpasses it, achieving over twice the frames per second (MOTR 9.5 FPS,\nMO-YOLO 19.6 FPS). Furthermore, MO-YOLO demonstrates significantly reduced\ntraining times and lower hardware requirements compared to MOTR. This research\nintroduces a promising paradigm for efficient end-to-end MOT, emphasizing\nenhanced performance and resource efficiency.\n","authors":["Liao Pan","Yang Feng","Wu Di","Liu Bo","Zhang Xingle"],"pdf_url":"https://arxiv.org/pdf/2310.17170v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03992v1","updated":"2024-04-05T10:02:32Z","published":"2024-04-05T10:02:32Z","title":"Rolling the dice for better deep learning performance: A study of\n randomness techniques in deep neural networks","summary":" This paper investigates how various randomization techniques impact Deep\nNeural Networks (DNNs). Randomization, like weight noise and dropout, aids in\nreducing overfitting and enhancing generalization, but their interactions are\npoorly understood. The study categorizes randomness techniques into four types\nand proposes new methods: adding noise to the loss function and random masking\nof gradient updates. Using Particle Swarm Optimizer (PSO) for hyperparameter\noptimization, it explores optimal configurations across MNIST, FASHION-MNIST,\nCIFAR10, and CIFAR100 datasets. Over 30,000 configurations are evaluated,\nrevealing data augmentation and weight initialization randomness as main\nperformance contributors. Correlation analysis shows different optimizers\nprefer distinct randomization types. 
The complete implementation and dataset\nare available on GitHub.\n","authors":["Mohammed Ghaith Altarabichi","Sławomir Nowaczyk","Sepideh Pashami","Peyman Sheikholharam Mashhadi","Julia Handl"],"pdf_url":"https://arxiv.org/pdf/2404.03992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03991v1","updated":"2024-04-05T10:01:31Z","published":"2024-04-05T10:01:31Z","title":"Towards Efficient and Accurate CT Segmentation via Edge-Preserving\n Probabilistic Downsampling","summary":" Downsampling images and labels, often necessitated by limited resources or to\nexpedite network training, leads to the loss of small objects and thin\nboundaries. This undermines the segmentation network's capacity to interpret\nimages accurately and predict detailed labels, resulting in diminished\nperformance compared to processing at original resolutions. This situation\nexemplifies the trade-off between efficiency and accuracy, with higher\ndownsampling factors further impairing segmentation outcomes. Preserving\ninformation during downsampling is especially critical for medical image\nsegmentation tasks. To tackle this challenge, we introduce a novel method named\nEdge-preserving Probabilistic Downsampling (EPD). It utilizes class uncertainty\nwithin a local window to produce soft labels, with the window size dictating\nthe downsampling factor. This enables a network to produce quality predictions\nat low resolutions. Beyond preserving edge details more effectively than\nconventional nearest-neighbor downsampling, employing a similar algorithm for\nimages, it surpasses bilinear interpolation in image downsampling, enhancing\noverall performance. Our method significantly improved Intersection over Union\n(IoU) to 2.85%, 8.65%, and 11.89% when downsampling data to 1/2, 1/4, and 1/8,\nrespectively, compared to conventional interpolation methods.\n","authors":["Shahzad Ali","Yu Rim Lee","Soo Young Park","Won Young Tak","Soon Ki Jung"],"pdf_url":"https://arxiv.org/pdf/2404.03991v1.pdf","comment":"5 pages (4 figures, 1 table); This work has been submitted to the\n IEEE Signal Processing Letters. Copyright may be transferred without notice,\n after which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2403.19655v2","updated":"2024-04-05T09:35:37Z","published":"2024-03-28T17:59:50Z","title":"GaussianCube: Structuring Gaussian Splatting using Optimal Transport for\n 3D Generative Modeling","summary":" 3D Gaussian Splatting (GS) have achieved considerable improvement over Neural\nRadiance Fields in terms of 3D fitting fidelity and rendering speed. However,\nthis unstructured representation with scattered Gaussians poses a significant\nchallenge for generative modeling. To address the problem, we introduce\nGaussianCube, a structured GS representation that is both powerful and\nefficient for generative modeling. We achieve this by first proposing a\nmodified densification-constrained GS fitting algorithm which can yield\nhigh-quality fitting results using a fixed number of free Gaussians, and then\nre-arranging the Gaussians into a predefined voxel grid via Optimal Transport.\nThe structured grid representation allows us to use standard 3D U-Net as our\nbackbone in diffusion generative modeling without elaborate designs. 
Extensive\nexperiments conducted on ShapeNet and OmniObject3D show that our model achieves\nstate-of-the-art generation results both qualitatively and quantitatively,\nunderscoring the potential of GaussianCube as a powerful and versatile 3D\nrepresentation.\n","authors":["Bowen Zhang","Yiji Cheng","Jiaolong Yang","Chunyu Wang","Feng Zhao","Yansong Tang","Dong Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2403.19655v2.pdf","comment":"Fix typo in Eq.2; Project Page: https://gaussiancube.github.io/"},{"id":"http://arxiv.org/abs/2402.12891v2","updated":"2024-04-05T09:26:07Z","published":"2024-02-20T10:35:51Z","title":"Mind the Exit Pupil Gap: Revisiting the Intrinsics of a Standard\n Plenoptic Camera","summary":" Among the common applications of plenoptic cameras are depth reconstruction\nand post-shot refocusing. These require a calibration relating the camera-side\nlight field to that of the scene. Numerous methods with this goal have been\ndeveloped based on thin lens models for the plenoptic camera's main lens and\nmicrolenses. Our work addresses the often-overlooked role of the main lens exit\npupil in these models and specifically in the decoding process of standard\nplenoptic camera (SPC) images. We formally deduce the connection between the\nrefocusing distance and the resampling parameter for the decoded light field\nand provide an analysis of the errors that arise when the exit pupil is not\nconsidered. In addition, previous work is revisited with respect to the exit\npupil's role and all theoretical results are validated through a\nray-tracing-based simulation. With the public release of the evaluated SPC\ndesigns alongside our simulation and experimental data we aim to contribute to\na more accurate and nuanced understanding of plenoptic camera optics.\n","authors":["Tim Michels","Daniel Mäckelmann","Reinhard Koch"],"pdf_url":"https://arxiv.org/pdf/2402.12891v2.pdf","comment":"29 pages, 16 figures, Accepted for publication in MDPI Sensors,\n Special Issue 'Short-Range Optical 3D Scanning and 3D Data Processing '"},{"id":"http://arxiv.org/abs/2111.05778v2","updated":"2024-04-05T09:19:41Z","published":"2021-11-10T16:31:27Z","title":"Theoretical and Empirical Analysis of a Fast Algorithm for Extracting\n Polygons from Signed Distance Bounds","summary":" Recently there has been renewed interest in signed distance bound\nrepresentations due to their unique properties for 3D shape modelling. This is\nespecially the case for deep learning-based bounds. However, it is beneficial\nto work with polygons in most computer-graphics applications. Thus, in this\npaper we introduce and investigate an asymptotically fast method for\ntransforming signed distance bounds into polygon meshes. This is achieved by\ncombining the principles of sphere tracing (or ray marching) with traditional\npolygonization techniques, such as Marching Cubes. We provide theoretical and\nexperimental evidence that this approach is of the $O(N^2\\log N)$ computational\ncomplexity for a polygonization grid with $N^3$ cells. 
The algorithm is tested\non both a set of primitive shapes as well as signed distance bounds generated\nfrom point clouds by machine learning (and represented as neural networks).\nGiven its speed, implementation simplicity and portability, we argue that it\ncould prove useful during the modelling stage as well as in shape compression\nfor storage.\n The code is available here: https://github.com/nenadmarkus/gridhopping\n","authors":["Nenad Markuš","Mirko Sužnjević"],"pdf_url":"https://arxiv.org/pdf/2111.05778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04868v3","updated":"2024-04-05T09:03:48Z","published":"2023-08-09T11:02:00Z","title":"InstantAvatar: Efficient 3D Head Reconstruction via Surface Rendering","summary":" Recent advances in full-head reconstruction have been obtained by optimizing\na neural field through differentiable surface or volume rendering to represent\na single scene. While these techniques achieve an unprecedented accuracy, they\ntake several minutes, or even hours, due to the expensive optimization process\nrequired. In this work, we introduce InstantAvatar, a method that recovers\nfull-head avatars from few images (down to just one) in a few seconds on\ncommodity hardware. In order to speed up the reconstruction process, we propose\na system that combines, for the first time, a voxel-grid neural field\nrepresentation with a surface renderer. Notably, a naive combination of these\ntwo techniques leads to unstable optimizations that do not converge to valid\nsolutions. In order to overcome this limitation, we present a novel statistical\nmodel that learns a prior distribution over 3D head signed distance functions\nusing a voxel-grid based architecture. The use of this prior model, in\ncombination with other design choices, results into a system that achieves 3D\nhead reconstructions with comparable accuracy as the state-of-the-art with a\n100x speed-up.\n","authors":["Antonio Canela","Pol Caselles","Ibrar Malik","Eduard Ramon","Jaime García","Jordi Sánchez-Riera","Gil Triginer","Francesc Moreno-Noguer"],"pdf_url":"https://arxiv.org/pdf/2308.04868v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.00434v2","updated":"2024-04-05T08:53:28Z","published":"2023-04-30T09:28:38Z","title":"EVREAL: Towards a Comprehensive Benchmark and Analysis Suite for\n Event-based Video Reconstruction","summary":" Event cameras are a new type of vision sensor that incorporates asynchronous\nand independent pixels, offering advantages over traditional frame-based\ncameras such as high dynamic range and minimal motion blur. However, their\noutput is not easily understandable by humans, making the reconstruction of\nintensity images from event streams a fundamental task in event-based vision.\nWhile recent deep learning-based methods have shown promise in video\nreconstruction from events, this problem is not completely solved yet. To\nfacilitate comparison between different approaches, standardized evaluation\nprotocols and diverse test datasets are essential. This paper proposes a\nunified evaluation methodology and introduces an open-source framework called\nEVREAL to comprehensively benchmark and analyze various event-based video\nreconstruction methods from the literature. 
Using EVREAL, we give a detailed\nanalysis of the state-of-the-art methods for event-based video reconstruction,\nand provide valuable insights into the performance of these methods under\nvarying settings, challenging scenarios, and downstream tasks.\n","authors":["Burak Ercan","Onur Eker","Aykut Erdem","Erkut Erdem"],"pdf_url":"https://arxiv.org/pdf/2305.00434v2.pdf","comment":"19 pages, 9 figures. Has been accepted for publication at the IEEE\n Conference on Computer Vision and Pattern Recognition Workshops (CVPRW),\n Vancouver, 2023. The project page can be found at\n https://ercanburak.github.io/evreal.html"},{"id":"http://arxiv.org/abs/2404.03962v1","updated":"2024-04-05T08:52:32Z","published":"2024-04-05T08:52:32Z","title":"RaSim: A Range-aware High-fidelity RGB-D Data Simulation Pipeline for\n Real-world Applications","summary":" In robotic vision, a de-facto paradigm is to learn in simulated environments\nand then transfer to real-world applications, which poses an essential\nchallenge in bridging the sim-to-real domain gap. While mainstream works tackle\nthis problem in the RGB domain, we focus on depth data synthesis and develop a\nrange-aware RGB-D data simulation pipeline (RaSim). In particular,\nhigh-fidelity depth data is generated by imitating the imaging principle of\nreal-world sensors. A range-aware rendering strategy is further introduced to\nenrich data diversity. Extensive experiments show that models trained with\nRaSim can be directly applied to real-world scenarios without any finetuning\nand excel at downstream RGB-D perception tasks.\n","authors":["Xingyu Liu","Chenyangguang Zhang","Gu Wang","Ruida Zhang","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2404.03962v1.pdf","comment":"accepted by ICRA'24"},{"id":"http://arxiv.org/abs/2403.01300v2","updated":"2024-04-05T08:42:02Z","published":"2024-03-02T19:54:53Z","title":"Causal Mode Multiplexer: A Novel Framework for Unbiased Multispectral\n Pedestrian Detection","summary":" RGBT multispectral pedestrian detection has emerged as a promising solution\nfor safety-critical applications that require day/night operations. However,\nthe modality bias problem remains unsolved as multispectral pedestrian\ndetectors learn the statistical bias in datasets. Specifically, datasets in\nmultispectral pedestrian detection mainly distribute between ROTO (day) and\nRXTO (night) data; the majority of the pedestrian labels statistically co-occur\nwith their thermal features. As a result, multispectral pedestrian detectors\nshow poor generalization ability on examples beyond this statistical\ncorrelation, such as ROTX data. To address this problem, we propose a novel\nCausal Mode Multiplexer (CMM) framework that effectively learns the causalities\nbetween multispectral inputs and predictions. Moreover, we construct a new\ndataset (ROTX-MP) to evaluate modality bias in multispectral pedestrian\ndetection. ROTX-MP mainly includes ROTX examples not presented in previous\ndatasets. Extensive experiments demonstrate that our proposed CMM framework\ngeneralizes well on existing datasets (KAIST, CVC-14, FLIR) and the new\nROTX-MP. 
We will release our new dataset to the public for future research.\n","authors":["Taeheon Kim","Sebin Shin","Youngjoon Yu","Hak Gu Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2403.01300v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2401.01598v2","updated":"2024-04-05T08:23:29Z","published":"2024-01-03T07:59:17Z","title":"Learning Prompt with Distribution-Based Feature Replay for Few-Shot\n Class-Incremental Learning","summary":" Few-shot Class-Incremental Learning (FSCIL) aims to continuously learn new\nclasses based on very limited training data without forgetting the old ones\nencountered. Existing studies solely relied on pure visual networks, while in\nthis paper we solved FSCIL by leveraging the Vision-Language model (e.g., CLIP)\nand propose a simple yet effective framework, named Learning Prompt with\nDistribution-based Feature Replay (LP-DiF). We observe that simply using CLIP\nfor zero-shot evaluation can substantially outperform the most influential\nmethods. Then, prompt tuning technique is involved to further improve its\nadaptation ability, allowing the model to continually capture specific\nknowledge from each session. To prevent the learnable prompt from forgetting\nold knowledge in the new session, we propose a pseudo-feature replay approach.\nSpecifically, we preserve the old knowledge of each class by maintaining a\nfeature-level Gaussian distribution with a diagonal covariance matrix, which is\nestimated by the image features of training images and synthesized features\ngenerated from a VAE. When progressing to a new session, pseudo-features are\nsampled from old-class distributions combined with training images of the\ncurrent session to optimize the prompt, thus enabling the model to learn new\nknowledge while retaining old knowledge. Experiments on three prevalent\nbenchmarks, i.e., CIFAR100, mini-ImageNet, CUB-200, and two more challenging\nbenchmarks, i.e., SUN-397 and CUB-200$^*$ proposed in this paper showcase the\nsuperiority of LP-DiF, achieving new state-of-the-art (SOTA) in FSCIL. Code is\npublicly available at https://github.com/1170300714/LP-DiF.\n","authors":["Zitong Huang","Ze Chen","Zhixing Chen","Erjin Zhou","Xinxing Xu","Rick Siow Mong Goh","Yong Liu","Wangmeng Zuo","Chunmei Feng"],"pdf_url":"https://arxiv.org/pdf/2401.01598v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10835v4","updated":"2024-04-05T08:20:32Z","published":"2023-12-17T22:40:38Z","title":"Your Student is Better Than Expected: Adaptive Teacher-Student\n Collaboration for Text-Conditional Diffusion Models","summary":" Knowledge distillation methods have recently shown to be a promising\ndirection to speedup the synthesis of large-scale diffusion models by requiring\nonly a few inference steps. While several powerful distillation methods were\nrecently proposed, the overall quality of student samples is typically lower\ncompared to the teacher ones, which hinders their practical usage. In this\nwork, we investigate the relative quality of samples produced by the teacher\ntext-to-image diffusion model and its distilled student version. As our main\nempirical finding, we discover that a noticeable portion of student samples\nexhibit superior fidelity compared to the teacher ones, despite the\n\"approximate\" nature of the student. Based on this finding, we propose an\nadaptive collaboration between student and teacher diffusion models for\neffective text-to-image synthesis. 
Specifically, the distilled model produces\nthe initial sample, and then an oracle decides whether it needs further\nimprovements with a slow teacher model. Extensive experiments demonstrate that\nthe designed pipeline surpasses state-of-the-art text-to-image alternatives for\nvarious inference budgets in terms of human preference. Furthermore, the\nproposed approach can be naturally used in popular applications such as\ntext-guided image editing and controllable generation.\n","authors":["Nikita Starodubcev","Artem Fedorov","Artem Babenko","Dmitry Baranchuk"],"pdf_url":"https://arxiv.org/pdf/2312.10835v4.pdf","comment":"CVPR2024 camera ready v2"},{"id":"http://arxiv.org/abs/2404.03936v1","updated":"2024-04-05T07:44:17Z","published":"2024-04-05T07:44:17Z","title":"Deep Learning for Satellite Image Time Series Analysis: A Review","summary":" Earth observation (EO) satellite missions have been providing detailed images\nabout the state of the Earth and its land cover for over 50 years. Long term\nmissions, such as NASA's Landsat, Terra, and Aqua satellites, and more\nrecently, the ESA's Sentinel missions, record images of the entire world every\nfew days. Although single images provide point-in-time data, repeated images of\nthe same area, or satellite image time series (SITS) provide information about\nthe changing state of vegetation and land use. These SITS are useful for\nmodeling dynamic processes and seasonal changes such as plant phenology. They\nhave potential benefits for many aspects of land and natural resource\nmanagement, including applications in agricultural, forest, water, and disaster\nmanagement, urban planning, and mining. However, the resulting satellite image\ntime series (SITS) are complex, incorporating information from the temporal,\nspatial, and spectral dimensions. Therefore, deep learning methods are often\ndeployed as they can analyze these complex relationships. This review presents\na summary of the state-of-the-art methods of modelling environmental,\nagricultural, and other Earth observation variables from SITS data using deep\nlearning methods. We aim to provide a resource for remote sensing experts\ninterested in using deep learning techniques to enhance Earth observation\nmodels with temporal information.\n","authors":["Lynn Miller","Charlotte Pelletier","Geoffrey I. Webb"],"pdf_url":"https://arxiv.org/pdf/2404.03936v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2404.03930v1","updated":"2024-04-05T07:24:10Z","published":"2024-04-05T07:24:10Z","title":"Real-GDSR: Real-World Guided DSM Super-Resolution via Edge-Enhancing\n Residual Network","summary":" A low-resolution digital surface model (DSM) features distinctive attributes\nimpacted by noise, sensor limitations and data acquisition conditions, which\nfailed to be replicated using simple interpolation methods like bicubic. This\ncauses super-resolution models trained on synthetic data does not perform\neffectively on real ones. Training a model on real low and high resolution DSMs\npairs is also a challenge because of the lack of information. On the other\nhand, the existence of other imaging modalities of the same scene can be used\nto enrich the information needed for large-scale super-resolution. 
In this\nwork, we introduce a novel methodology to address the intricacies of real-world\nDSM super-resolution, named REAL-GDSR, breaking down this ill-posed problem\ninto two steps. The first step involves the utilization of a residual local\nrefinement network. This strategic approach departs from conventional methods\nthat trained to directly predict height values instead of the differences\n(residuals) and utilize large receptive fields in their networks. The second\nstep introduces a diffusion-based technique that enhances the results on a\nglobal scale, with a primary focus on smoothing and edge preservation. Our\nexperiments underscore the effectiveness of the proposed method. We conduct a\ncomprehensive evaluation, comparing it to recent state-of-the-art techniques in\nthe domain of real-world DSM super-resolution (SR). Our approach consistently\noutperforms these existing methods, as evidenced through qualitative and\nquantitative assessments.\n","authors":["Daniel Panangian","Ksenia Bittner"],"pdf_url":"https://arxiv.org/pdf/2404.03930v1.pdf","comment":"Accepted for publication in the ISPRS Annals of Photogrammetry,\n Remote Sensing, and Spatial Information Sciences"},{"id":"http://arxiv.org/abs/2404.03925v1","updated":"2024-04-05T07:15:06Z","published":"2024-04-05T07:15:06Z","title":"LightOctree: Lightweight 3D Spatially-Coherent Indoor Lighting\n Estimation","summary":" We present a lightweight solution for estimating spatially-coherent indoor\nlighting from a single RGB image. Previous methods for estimating illumination\nusing volumetric representations have overlooked the sparse distribution of\nlight sources in space, necessitating substantial memory and computational\nresources for achieving high-quality results. We introduce a unified, voxel\noctree-based illumination estimation framework to produce 3D spatially-coherent\nlighting. Additionally, a differentiable voxel octree cone tracing rendering\nlayer is proposed to eliminate regular volumetric representation throughout the\nentire process and ensure the retention of features across different frequency\ndomains. This reduction significantly decreases spatial usage and required\nfloating-point operations without substantially compromising precision.\nExperimental results demonstrate that our approach achieves high-quality\ncoherent estimation with minimal cost compared to previous methods.\n","authors":["Xuecan Wang","Shibang Xiao","Xiaohui Liang"],"pdf_url":"https://arxiv.org/pdf/2404.03925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03924v1","updated":"2024-04-05T07:13:28Z","published":"2024-04-05T07:13:28Z","title":"Learning Correlation Structures for Vision Transformers","summary":" We introduce a new attention mechanism, dubbed structural self-attention\n(StructSA), that leverages rich correlation patterns naturally emerging in\nkey-query interactions of attention. StructSA generates attention maps by\nrecognizing space-time structures of key-query correlations via convolution and\nuses them to dynamically aggregate local contexts of value features. This\neffectively leverages rich structural patterns in images and videos such as\nscene layouts, object motion, and inter-object relations. 
Using StructSA as a\nmain building block, we develop the structural vision transformer (StructViT)\nand evaluate its effectiveness on both image and video classification tasks,\nachieving state-of-the-art results on ImageNet-1K, Kinetics-400,\nSomething-Something V1 & V2, Diving-48, and FineGym.\n","authors":["Manjin Kim","Paul Hongsuck Seo","Cordelia Schmid","Minsu Cho"],"pdf_url":"https://arxiv.org/pdf/2404.03924v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2401.02723v2","updated":"2024-04-05T07:12:16Z","published":"2024-01-05T09:36:42Z","title":"Predicting Traffic Flow with Federated Learning and Graph Neural with\n Asynchronous Computations Network","summary":" Real-time traffic flow prediction holds significant importance within the\ndomain of Intelligent Transportation Systems (ITS). The task of achieving a\nbalance between prediction precision and computational efficiency presents a\nsignificant challenge. In this article, we present a novel deep-learning method\ncalled Federated Learning and Asynchronous Graph Convolutional Network\n(FLAGCN). Our framework incorporates the principles of asynchronous graph\nconvolutional networks with federated learning to enhance the accuracy and\nefficiency of real-time traffic flow prediction. The FLAGCN model employs a\nspatial-temporal graph convolution technique to asynchronously address\nspatio-temporal dependencies within traffic data effectively. To efficiently\nhandle the computational requirements associated with this deep learning model,\nthis study used a graph federated learning technique known as GraphFL. This\napproach is designed to facilitate the training process. The experimental\nresults obtained from conducting tests on two distinct traffic datasets\ndemonstrate that the utilization of FLAGCN leads to the optimization of both\ntraining and inference durations while maintaining a high level of prediction\naccuracy. FLAGCN outperforms existing models with significant improvements by\nachieving up to approximately 6.85% reduction in RMSE, 20.45% reduction in\nMAPE, compared to the best-performing existing models.\n","authors":["Muhammad Yaqub","Shahzad Ahmad","Malik Abdul Manan","Imran Shabir Chuhan"],"pdf_url":"https://arxiv.org/pdf/2401.02723v2.pdf","comment":"I request to withdraw my paper from arXiv due to significant updates\n and improvements identified post-submission. These enhancements will\n substantially elevate the work's quality and impact. I plan to resubmit the\n revised paper upon completion of these updates. Thank you for accommodating\n this request"},{"id":"http://arxiv.org/abs/2402.19326v2","updated":"2024-04-05T06:56:08Z","published":"2024-02-29T16:29:53Z","title":"Generalizable Whole Slide Image Classification with Fine-Grained\n Visual-Semantic Interaction","summary":" Whole Slide Image (WSI) classification is often formulated as a Multiple\nInstance Learning (MIL) problem. Recently, Vision-Language Models (VLMs) have\ndemonstrated remarkable performance in WSI classification. However, existing\nmethods leverage coarse-grained pathogenetic descriptions for visual\nrepresentation supervision, which are insufficient to capture the complex\nvisual appearance of pathogenetic images, hindering the generalizability of\nmodels on diverse downstream tasks. Additionally, processing high-resolution\nWSIs can be computationally expensive. In this paper, we propose a novel\n\"Fine-grained Visual-Semantic Interaction\" (FiVE) framework for WSI\nclassification. 
It is designed to enhance the model's generalizability by\nleveraging the interaction between localized visual patterns and fine-grained\npathological semantics. Specifically, with meticulously designed queries, we\nstart by utilizing a large language model to extract fine-grained pathological\ndescriptions from various non-standardized raw reports. The output descriptions\nare then reconstructed into fine-grained labels used for training. By\nintroducing a Task-specific Fine-grained Semantics (TFS) module, we enable\nprompts to capture crucial visual information in WSIs, which enhances\nrepresentation learning and augments generalization capabilities significantly.\nFurthermore, given that pathological visual patterns are redundantly\ndistributed across tissue slices, we sample a subset of visual instances during\ntraining. Our method demonstrates robust generalizability and strong\ntransferability, dominantly outperforming the counterparts on the TCGA Lung\nCancer dataset with at least 9.19% higher accuracy in few-shot experiments. The\ncode is available at: https://github.com/ls1rius/WSI_FiVE.\n","authors":["Hao Li","Ying Chen","Yifei Chen","Wenxian Yang","Bowen Ding","Yuchen Han","Liansheng Wang","Rongshan Yu"],"pdf_url":"https://arxiv.org/pdf/2402.19326v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03913v1","updated":"2024-04-05T06:41:27Z","published":"2024-04-05T06:41:27Z","title":"Concept Weaver: Enabling Multi-Concept Fusion in Text-to-Image Models","summary":" While there has been significant progress in customizing text-to-image\ngeneration models, generating images that combine multiple personalized\nconcepts remains challenging. In this work, we introduce Concept Weaver, a\nmethod for composing customized text-to-image diffusion models at inference\ntime. Specifically, the method breaks the process into two steps: creating a\ntemplate image aligned with the semantics of input prompts, and then\npersonalizing the template using a concept fusion strategy. The fusion strategy\nincorporates the appearance of the target concepts into the template image\nwhile retaining its structural details. The results indicate that our method\ncan generate multiple custom concepts with higher identity fidelity compared to\nalternative approaches. Furthermore, the method is shown to seamlessly handle\nmore than two concepts and closely follow the semantic meaning of the input\nprompt without blending appearances across different subjects.\n","authors":["Gihyun Kwon","Simon Jenni","Dingzeyu Li","Joon-Young Lee","Jong Chul Ye","Fabian Caba Heilbron"],"pdf_url":"https://arxiv.org/pdf/2404.03913v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03906v1","updated":"2024-04-05T05:58:40Z","published":"2024-04-05T05:58:40Z","title":"Deep Phase Coded Image Prior","summary":" Phase-coded imaging is a computational imaging method designed to tackle\ntasks such as passive depth estimation and extended depth of field (EDOF) using\ndepth cues inserted during image capture. Most of the current deep\nlearning-based methods for depth estimation or all-in-focus imaging require a\ntraining dataset with high-quality depth maps and an optimal focus point at\ninfinity for all-in-focus images. Such datasets are difficult to create,\nusually synthetic, and require external graphic programs. 
We propose a new\nmethod named \"Deep Phase Coded Image Prior\" (DPCIP) for jointly recovering the\ndepth map and all-in-focus image from a coded-phase image using solely the\ncaptured image and the optical information of the imaging system. Our approach\ndoes not depend on any specific dataset and surpasses prior supervised\ntechniques utilizing the same imaging system. This improvement is achieved\nthrough the utilization of a problem formulation based on implicit neural\nrepresentation (INR) and deep image prior (DIP). Due to our zero-shot method,\nwe overcome the barrier of acquiring accurate ground-truth data of depth maps\nand all-in-focus images for each new phase-coded system introduced. This allows\nfocusing mainly on developing the imaging system, and not on ground-truth data\ncollection.\n","authors":["Nimrod Shabtay","Eli Schwartz","Raja Giryes"],"pdf_url":"https://arxiv.org/pdf/2404.03906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01439v2","updated":"2024-04-05T05:46:59Z","published":"2024-03-03T08:25:04Z","title":"Dynamic Adapter Meets Prompt Tuning: Parameter-Efficient Transfer\n Learning for Point Cloud Analysis","summary":" Point cloud analysis has achieved outstanding performance by transferring\npoint cloud pre-trained models. However, existing methods for model adaptation\nusually update all model parameters, i.e., full fine-tuning paradigm, which is\ninefficient as it relies on high computational costs (e.g., training GPU\nmemory) and massive storage space. In this paper, we aim to study\nparameter-efficient transfer learning for point cloud analysis with an ideal\ntrade-off between task performance and parameter efficiency. To achieve this\ngoal, we freeze the parameters of the default pre-trained models and then\npropose the Dynamic Adapter, which generates a dynamic scale for each token,\nconsidering the token significance to the downstream task. We further\nseamlessly integrate Dynamic Adapter with Prompt Tuning (DAPT) by constructing\nInternal Prompts, capturing the instance-specific features for interaction.\nExtensive experiments conducted on five challenging datasets demonstrate that\nthe proposed DAPT achieves superior performance compared to the full\nfine-tuning counterparts while significantly reducing the trainable parameters\nand training GPU memory by 95% and 35%, respectively. Code is available at\nhttps://github.com/LMD0311/DAPT.\n","authors":["Xin Zhou","Dingkang Liang","Wei Xu","Xingkui Zhu","Yihan Xu","Zhikang Zou","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2403.01439v2.pdf","comment":"Accepted to CVPR 2024. Code is available at\n https://github.com/LMD0311/DAPT"},{"id":"http://arxiv.org/abs/2404.03898v1","updated":"2024-04-05T05:42:23Z","published":"2024-04-05T05:42:23Z","title":"VoltaVision: A Transfer Learning model for electronic component\n classification","summary":" In this paper, we analyze the effectiveness of transfer learning on\nclassifying electronic components. Transfer learning reuses pre-trained models\nto save time and resources in building a robust classifier rather than learning\nfrom scratch. Our work introduces a lightweight CNN, coined as VoltaVision, and\ncompares its performance against more complex models. We test the hypothesis\nthat transferring knowledge from a similar task to our target domain yields\nbetter results than state-of-the-art models trained on general datasets. 
Our\ndataset and code for this work are available at\nhttps://github.com/AnasIshfaque/VoltaVision.\n","authors":["Anas Mohammad Ishfaqul Muktadir Osmani","Taimur Rahman","Salekul Islam"],"pdf_url":"https://arxiv.org/pdf/2404.03898v1.pdf","comment":"Tiny Paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2310.20550v3","updated":"2024-04-05T05:29:29Z","published":"2023-10-31T15:31:39Z","title":"CapsFusion: Rethinking Image-Text Data at Scale","summary":" Large multimodal models demonstrate remarkable generalist ability to perform\ndiverse multimodal tasks in a zero-shot manner. Large-scale web-based\nimage-text pairs contribute fundamentally to this success, but suffer from\nexcessive noise. Recent studies use alternative captions synthesized by\ncaptioning models and have achieved notable benchmark performance. However, our\nexperiments reveal significant Scalability Deficiency and World Knowledge Loss\nissues in models trained with synthetic captions, which have been largely\nobscured by their initial benchmark success. Upon closer examination, we\nidentify the root cause as the overly-simplified language structure and lack of\nknowledge details in existing synthetic captions. To provide higher-quality and\nmore scalable multimodal pretraining data, we propose CapsFusion, an advanced\nframework that leverages large language models to consolidate and refine\ninformation from both web-based image-text pairs and synthetic captions.\nExtensive experiments show that CapsFusion captions exhibit remarkable\nall-round superiority over existing captions in terms of model performance\n(e.g., 18.8 and 18.3 improvements in CIDEr score on COCO and NoCaps), sample\nefficiency (requiring 11-16 times less computation than baselines), world\nknowledge depth, and scalability. These effectiveness, efficiency and\nscalability advantages position CapsFusion as a promising candidate for future\nscaling of LMM training.\n","authors":["Qiying Yu","Quan Sun","Xiaosong Zhang","Yufeng Cui","Fan Zhang","Yue Cao","Xinlong Wang","Jingjing Liu"],"pdf_url":"https://arxiv.org/pdf/2310.20550v3.pdf","comment":"CVPR 2024. Code & Dataset: https://github.com/baaivision/CapsFusion"},{"id":"http://arxiv.org/abs/2404.03892v1","updated":"2024-04-05T05:00:21Z","published":"2024-04-05T05:00:21Z","title":"Enhancing Breast Cancer Diagnosis in Mammography: Evaluation and\n Integration of Convolutional Neural Networks and Explainable AI","summary":" The study introduces an integrated framework combining Convolutional Neural\nNetworks (CNNs) and Explainable Artificial Intelligence (XAI) for the enhanced\ndiagnosis of breast cancer using the CBIS-DDSM dataset. Utilizing a fine-tuned\nResNet50 architecture, our investigation not only provides effective\ndifferentiation of mammographic images into benign and malignant categories but\nalso addresses the opaque \"black-box\" nature of deep learning models by\nemploying XAI methodologies, namely Grad-CAM, LIME, and SHAP, to interpret CNN\ndecision-making processes for healthcare professionals. Our methodology\nencompasses an elaborate data preprocessing pipeline and advanced data\naugmentation techniques to counteract dataset limitations, and transfer\nlearning using pre-trained networks, such as VGG-16, DenseNet and ResNet was\nemployed. A focal point of our study is the evaluation of XAI's effectiveness\nin interpreting model predictions, highlighted by utilising the Hausdorff\nmeasure to assess the alignment between AI-generated explanations and expert\nannotations quantitatively. 
This approach plays a critical role for XAI in\npromoting trustworthiness and ethical fairness in AI-assisted diagnostics. The\nfindings from our research illustrate the effective collaboration between CNNs\nand XAI in advancing diagnostic methods for breast cancer, thereby facilitating\na more seamless integration of advanced AI technologies within clinical\nsettings. By enhancing the interpretability of AI-driven decisions, this work\nlays the groundwork for improved collaboration between AI systems and medical\npractitioners, ultimately enriching patient care. Furthermore, the implications\nof our research extend well beyond the current methodologies, advocating for\nsubsequent inquiries into the integration of multimodal data and the refinement\nof AI explanations to satisfy the needs of clinical practice.\n","authors":["Maryam Ahmed","Tooba Bibi","Rizwan Ahmed Khan","Sidra Nasir"],"pdf_url":"https://arxiv.org/pdf/2404.03892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03052v2","updated":"2024-04-05T04:33:23Z","published":"2023-12-05T18:58:37Z","title":"Visual Program Distillation: Distilling Tools and Programmatic Reasoning\n into Vision-Language Models","summary":" Solving complex visual tasks such as \"Who invented the musical instrument on\nthe right?\" involves a composition of skills: understanding space, recognizing\ninstruments, and also retrieving prior knowledge. Recent work shows promise by\ndecomposing such tasks using a large language model (LLM) into an executable\nprogram that invokes specialized vision models. However, generated programs are\nerror-prone: they omit necessary steps, include spurious ones, and are unable\nto recover when the specialized models give incorrect outputs. Moreover, they\nrequire loading multiple models, incurring high latency and computation costs.\nWe propose Visual Program Distillation (VPD), an instruction tuning framework\nthat produces a vision-language model (VLM) capable of solving complex visual\ntasks with a single forward pass. VPD distills the reasoning ability of LLMs by\nusing them to sample multiple candidate programs, which are then executed and\nverified to identify a correct one. It translates each correct program into a\nlanguage description of the reasoning steps, which are then distilled into a\nVLM. Extensive experiments show that VPD improves the VLM's ability to count,\nunderstand spatial relations, and reason compositionally. Our VPD-trained\nPaLI-X outperforms all prior VLMs, achieving state-of-the-art performance\nacross complex vision tasks, including MMBench, OK-VQA, A-OKVQA, TallyQA, POPE,\nand Hateful Memes. An evaluation with human annotators also confirms that VPD\nimproves model response factuality and consistency. 
Finally, experiments on\ncontent moderation demonstrate that VPD is also helpful for adaptation to\nreal-world applications with limited data.\n","authors":["Yushi Hu","Otilia Stretcu","Chun-Ta Lu","Krishnamurthy Viswanathan","Kenji Hata","Enming Luo","Ranjay Krishna","Ariel Fuxman"],"pdf_url":"https://arxiv.org/pdf/2312.03052v2.pdf","comment":"CVPR 2024 Oral"},{"id":"http://arxiv.org/abs/2404.03883v1","updated":"2024-04-05T04:11:31Z","published":"2024-04-05T04:11:31Z","title":"LiDAR-Guided Cross-Attention Fusion for Hyperspectral Band Selection and\n Image Classification","summary":" The fusion of hyperspectral and LiDAR data has been an active research topic.\nExisting fusion methods have ignored the high-dimensionality and redundancy\nchallenges in hyperspectral images, even though band selection methods have\nbeen intensively studied for hyperspectral image (HSI) processing. This paper\naddresses this significant gap by introducing a cross-attention mechanism from\nthe transformer architecture for the selection of HSI bands guided by LiDAR\ndata. LiDAR provides high-resolution vertical structural information, which can\nbe useful in distinguishing different types of land cover that may have similar\nspectral signatures but different structural profiles. In our approach, the\nLiDAR data are used as the \"query\" to search and identify the \"key\" from the\nHSI to choose the most pertinent bands for LiDAR. This method ensures that the\nselected HSI bands drastically reduce redundancy and computational requirements\nwhile working optimally with the LiDAR data. Extensive experiments have been\nundertaken on three paired HSI and LiDAR data sets: Houston 2013, Trento and\nMUUFL. The results highlight the superiority of the cross-attention mechanism,\nunderlining the enhanced classification accuracy of the identified HSI bands\nwhen fused with the LiDAR features. The results also show that the use of fewer\nbands combined with LiDAR surpasses the performance of state-of-the-art fusion\nmodels.\n","authors":["Judy X Yang","Jun Zhou","Jing Wang","Hui Tian","Wee Chung Liew"],"pdf_url":"https://arxiv.org/pdf/2404.03883v1.pdf","comment":"15 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.03876v1","updated":"2024-04-05T03:51:19Z","published":"2024-04-05T03:51:19Z","title":"Increasing Fairness in Classification of Out of Distribution Data for\n Facial Recognition","summary":" Standard classification theory assumes that the distributions of images in the\ntest and training sets are identical. Unfortunately, real-life scenarios\ntypically feature unseen data (\"out-of-distribution data\") which is different\nfrom data in the training distribution (\"in-distribution\"). This issue is most\nprevalent in social justice problems where data from under-represented groups\nmay appear in the test data without representing an equal proportion of the\ntraining data. This may result in a model returning confidently wrong decisions\nand predictions. We are interested in the following question: Can the\nperformance of a neural network improve on facial images of out-of-distribution\ndata when it is trained simultaneously on multiple datasets of in-distribution\ndata? We approach this problem by incorporating the Outlier Exposure model and\ninvestigate how the model's performance changes when other datasets of facial\nimages are incorporated. 
We observe that the accuracy and other metrics of the\nmodel can be increased by applying Outlier Exposure, incorporating a trainable\nweight parameter to increase the machine's emphasis on outlier images, and by\nre-weighting the importance of different class labels. We also experimented\nwith whether sorting the images and determining outliers via image features\nwould have more of an effect on the metrics than sorting by average pixel\nvalue. Our goal was to make models not only more accurate but also more fair by\nscanning a more expanded range of images. We also tested the datasets in\nreverse order to see whether a more fair dataset with balanced features has an\neffect on the model's accuracy.\n","authors":["Gianluca Barone","Aashrit Cunchala","Rudy Nunez"],"pdf_url":"https://arxiv.org/pdf/2404.03876v1.pdf","comment":"18 pages, 6 tables, 6 figures"},{"id":"http://arxiv.org/abs/2306.00003v3","updated":"2024-04-05T03:25:04Z","published":"2023-05-25T18:22:12Z","title":"Detecting Heart Disease from Multi-View Ultrasound Images via Supervised\n Attention Multiple Instance Learning","summary":" Aortic stenosis (AS) is a degenerative valve condition that causes\nsubstantial morbidity and mortality. This condition is under-diagnosed and\nunder-treated. In clinical practice, AS is diagnosed with expert review of\ntransthoracic echocardiography, which produces dozens of ultrasound images of\nthe heart. Only some of these views show the aortic valve. To automate\nscreening for AS, deep networks must learn to mimic a human expert's ability to\nidentify views of the aortic valve then aggregate across these relevant images\nto produce a study-level diagnosis. We find previous approaches to AS detection\nyield insufficient accuracy due to relying on inflexible averages across\nimages. We further find that off-the-shelf attention-based multiple instance\nlearning (MIL) performs poorly. We contribute a new end-to-end MIL approach\nwith two key methodological innovations. First, a supervised attention\ntechnique guides the learned attention mechanism to favor relevant views.\nSecond, a novel self-supervised pretraining strategy applies contrastive\nlearning on the representation of the whole study instead of individual images\nas commonly done in prior literature. Experiments on an open-access dataset and\nan external validation set show that our approach yields higher accuracy while\nreducing model size.\n","authors":["Zhe Huang","Benjamin S. Wessler","Michael C. Hughes"],"pdf_url":"https://arxiv.org/pdf/2306.00003v3.pdf","comment":"Echocardiogram; multiple-instance learning; self-supervised learning;\n semi-supervised learning; medical imaging"},{"id":"http://arxiv.org/abs/2404.01655v2","updated":"2024-04-05T03:15:11Z","published":"2024-04-02T05:56:17Z","title":"FashionEngine: Interactive Generation and Editing of 3D Clothed Humans","summary":" We present FashionEngine, an interactive 3D human generation and editing\nsystem that allows us to design 3D digital humans in a way that aligns with how\nhumans interact with the world, such as natural languages, visual perceptions,\nand hand-drawing. FashionEngine automates the 3D human production with three\nkey components: 1) A pre-trained 3D human diffusion model that learns to model\n3D humans in a semantic UV latent space from 2D image training data, which\nprovides strong priors for diverse generation and editing tasks. 
2)\nMultimodality-UV Space encoding the texture appearance, shape topology, and\ntextual semantics of human clothing in a canonical UV-aligned space, which\nfaithfully aligns the user multimodal inputs with the implicit UV latent space\nfor controllable 3D human editing. The multimodality-UV space is shared across\ndifferent user inputs, such as texts, images, and sketches, which enables\nvarious joint multimodal editing tasks. 3) Multimodality-UV Aligned Sampler\nlearns to sample high-quality and diverse 3D humans from the diffusion prior\nfor multimodal user inputs. Extensive experiments validate FashionEngine's\nstate-of-the-art performance for conditional generation/editing tasks. In\naddition, we present an interactive user interface for our FashionEngine that\nenables both conditional and unconditional generation tasks, and editing tasks\nincluding pose/view/shape control, text-, image-, and sketch-driven 3D human\nediting and 3D virtual try-on, in a unified framework. Our project page is at:\nhttps://taohuumd.github.io/projects/FashionEngine.\n","authors":["Tao Hu","Fangzhou Hong","Zhaoxi Chen","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2404.01655v2.pdf","comment":"Project Page: https://taohuumd.github.io/projects/FashionEngine"},{"id":"http://arxiv.org/abs/2403.12686v3","updated":"2024-04-05T02:34:01Z","published":"2024-03-19T12:45:18Z","title":"WaterVG: Waterway Visual Grounding based on Text-Guided Vision and\n mmWave Radar","summary":" The perception of waterways based on human intent is significant for\nautonomous navigation and operations of Unmanned Surface Vehicles (USVs) in\nwater environments. Inspired by visual grounding, we introduce WaterVG, the\nfirst visual grounding dataset designed for USV-based waterway perception based\non human prompts. WaterVG encompasses prompts describing multiple targets, with\nannotations at the instance level including bounding boxes and masks. Notably,\nWaterVG includes 11,568 samples with 34,987 referred targets, whose prompts\nintegrate both visual and radar characteristics. This text-guided, two-sensor\ndesign pairs fine-grained text prompts with the visual and radar features of\nreferred targets. Moreover, we propose a low-power visual grounding\nmodel, Potamoi, which is a multi-task model with a well-designed Phased\nHeterogeneous Modality Fusion (PHMF) mode, including Adaptive Radar Weighting\n(ARW) and Multi-Head Slim Cross Attention (MHSCA). Specifically, ARW extracts\nrequired radar features to fuse with vision for prompt alignment. MHSCA is an\nefficient fusion module with a remarkably small parameter count and FLOPs,\nelegantly fusing scenario context captured by two sensors with linguistic\nfeatures, which performs impressively on visual grounding tasks. 
Comprehensive\nexperiments and evaluations have been conducted on WaterVG, where our Potamoi\nachieves state-of-the-art performance compared with counterparts.\n","authors":["Runwei Guan","Liye Jia","Fengyufan Yang","Shanliang Yao","Erick Purwanto","Xiaohui Zhu","Eng Gee Lim","Jeremy Smith","Ka Lok Man","Xuming Hu","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2403.12686v3.pdf","comment":"10 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.03854v1","updated":"2024-04-05T01:17:25Z","published":"2024-04-05T01:17:25Z","title":"Mitigating Heterogeneity in Federated Multimodal Learning with\n Biomedical Vision-Language Pre-training","summary":" Vision-language pre-training (VLP) has emerged as an efficient scheme for\nmultimodal representation learning, but it requires large-scale multimodal data\nfor pre-training, which is an obstacle especially for biomedical applications.\nTo overcome the data limitation, federated learning (FL) can be a promising\nstrategy to scale up the dataset for biomedical VLP while protecting data\nprivacy. However, client data are often heterogeneous in real-world scenarios,\nand we observe that local training on heterogeneous client data would distort\nthe multimodal representation learning and lead to biased cross-modal\nalignment. To address this challenge, we propose the Federated distributional\nRobust Guidance-Based (FedRGB) learning framework for federated VLP with\nrobustness to data heterogeneity. Specifically, we utilize a guidance-based\nlocal training scheme to reduce feature distortions, and employ a\ndistribution-based min-max optimization to learn unbiased cross-modal\nalignment. The experiments on real-world datasets show our method successfully\npromotes efficient federated multimodal learning for biomedical VLP with data\nheterogeneity.\n","authors":["Zitao Shuai","Liyue Shen"],"pdf_url":"https://arxiv.org/pdf/2404.03854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05061v2","updated":"2024-04-05T00:43:16Z","published":"2024-03-08T05:15:48Z","title":"RadarDistill: Boosting Radar-based Object Detection Performance via\n Knowledge Distillation from LiDAR Features","summary":" The inherent noisy and sparse characteristics of radar data pose challenges\nin finding effective representations for 3D object detection. In this paper, we\npropose RadarDistill, a novel knowledge distillation (KD) method, which can\nimprove the representation of radar data by leveraging LiDAR data. RadarDistill\nsuccessfully transfers desirable characteristics of LiDAR features into radar\nfeatures using three key components: Cross-Modality Alignment (CMA),\nActivation-based Feature Distillation (AFD), and Proposal-based Feature\nDistillation (PFD). CMA enhances the density of radar features by employing\nmultiple layers of dilation operations, effectively addressing the challenge of\ninefficient knowledge transfer from LiDAR to radar. AFD selectively transfers\nknowledge based on regions of the LiDAR features, with a specific focus on\nareas where activation intensity exceeds a predefined threshold. PFD similarly\nguides the radar network to selectively mimic features from the LiDAR network\nwithin the object proposals. Our comparative analyses conducted on the nuScenes\ndatasets demonstrate that RadarDistill achieves state-of-the-art (SOTA)\nperformance for the radar-only object detection task, recording 20.5% in mAP and\n43.7% in NDS. 
Also, RadarDistill significantly improves the performance of the\ncamera-radar fusion model.\n","authors":["Geonho Bang","Kwangjin Choi","Jisong Kim","Dongsuk Kum","Jun Won Choi"],"pdf_url":"https://arxiv.org/pdf/2403.05061v2.pdf","comment":"Accepted to IEEE/CVF Conference on Computer Vision and Pattern\n Recognition (CVPR) 2024, 10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.00498v2","updated":"2024-04-05T00:09:00Z","published":"2024-03-30T23:42:23Z","title":"94% on CIFAR-10 in 3.29 Seconds on a Single GPU","summary":" CIFAR-10 is among the most widely used datasets in machine learning,\nfacilitating thousands of research projects per year. To accelerate research\nand reduce the cost of experiments, we introduce training methods for CIFAR-10\nwhich reach 94% accuracy in 3.29 seconds, 95% in 10.4 seconds, and 96% in 46.3\nseconds, when run on a single NVIDIA A100 GPU. As one factor contributing to\nthese training speeds, we propose a derandomized variant of horizontal flipping\naugmentation, which we show improves over the standard method in every case\nwhere flipping is beneficial over no flipping at all. Our code is released at\nhttps://github.com/KellerJordan/cifar10-airbench.\n","authors":["Keller Jordan"],"pdf_url":"https://arxiv.org/pdf/2404.00498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09555v2","updated":"2024-04-05T23:58:40Z","published":"2023-07-14T15:17:04Z","title":"Transient Neural Radiance Fields for Lidar View Synthesis and 3D\n Reconstruction","summary":" Neural radiance fields (NeRFs) have become a ubiquitous tool for modeling\nscene appearance and geometry from multiview imagery. Recent work has also\nbegun to explore how to use additional supervision from lidar or depth sensor\nmeasurements in the NeRF framework. However, previous lidar-supervised NeRFs\nfocus on rendering conventional camera imagery and use lidar-derived point\ncloud data as auxiliary supervision; thus, they fail to incorporate the\nunderlying image formation model of the lidar. Here, we propose a novel method\nfor rendering transient NeRFs that take as input the raw, time-resolved photon\ncount histograms measured by a single-photon lidar system, and we seek to\nrender such histograms from novel views. Different from conventional NeRFs, the\napproach relies on a time-resolved version of the volume rendering equation to\nrender the lidar measurements and capture transient light transport phenomena\nat picosecond timescales. We evaluate our method on a first-of-its-kind dataset\nof simulated and captured transient multiview scans from a prototype\nsingle-photon lidar. Overall, our work brings NeRFs to a new dimension of\nimaging at transient timescales, newly enabling rendering of transient imagery\nfrom novel views. Additionally, we show that our approach recovers improved\ngeometry and conventional appearance compared to point cloud-based supervision\nwhen training on few input viewpoints. Transient NeRFs may be especially useful\nfor applications which seek to simulate raw lidar measurements for downstream\ntasks in autonomous driving, robotics, and remote sensing.\n","authors":["Anagh Malik","Parsa Mirdehghan","Sotiris Nousias","Kiriakos N. Kutulakos","David B. 
Lindell"],"pdf_url":"https://arxiv.org/pdf/2307.09555v2.pdf","comment":"NeurIPS 2023, Project Page: https://anaghmalik.com/TransientNeRF/"},{"id":"http://arxiv.org/abs/2404.04452v1","updated":"2024-04-05T23:38:57Z","published":"2024-04-05T23:38:57Z","title":"Vision Transformers in Domain Adaptation and Generalization: A Study of\n Robustness","summary":" Deep learning models are often evaluated in scenarios where the data\ndistribution is different from those used in the training and validation\nphases. The discrepancy presents a challenge for accurately predicting the\nperformance of models once deployed on the target distribution. Domain\nadaptation and generalization are widely recognized as effective strategies for\naddressing such shifts, thereby ensuring reliable performance. The recent\npromising results in applying vision transformers in computer vision tasks,\ncoupled with advancements in self-attention mechanisms, have demonstrated their\nsignificant potential for robustness and generalization in handling\ndistribution shifts. Motivated by the increased interest from the research\ncommunity, our paper investigates the deployment of vision transformers in\ndomain adaptation and domain generalization scenarios. For domain adaptation\nmethods, we categorize research into feature-level, instance-level, model-level\nadaptations, and hybrid approaches, along with other categorizations with\nrespect to diverse strategies for enhancing domain adaptation. Similarly, for\ndomain generalization, we categorize research into multi-domain learning,\nmeta-learning, regularization techniques, and data augmentation strategies. We\nfurther classify diverse strategies in research, underscoring the various\napproaches researchers have taken to address distribution shifts by integrating\nvision transformers. The inclusion of comprehensive tables summarizing these\ncategories is a distinct feature of our work, offering valuable insights for\nresearchers. These findings highlight the versatility of vision transformers in\nmanaging distribution shifts, crucial for real-world applications, especially\nin critical safety and decision-making scenarios.\n","authors":["Shadi Alijani","Jamil Fayyad","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2404.04452v1.pdf","comment":"28 pages, 5 figures, Preprint submitted to Elsevier"},{"id":"http://arxiv.org/abs/2404.04434v1","updated":"2024-04-05T22:21:49Z","published":"2024-04-05T22:21:49Z","title":"Robust Few-Shot Ensemble Learning with Focal Diversity-Based Pruning","summary":" This paper presents FusionShot, a focal diversity optimized few-shot ensemble\nlearning approach for boosting the robustness and generalization performance of\npre-trained few-shot models. The paper makes three original contributions.\nFirst, we explore the unique characteristics of few-shot learning to ensemble\nmultiple few-shot (FS) models by creating three alternative fusion channels.\nSecond, we introduce the concept of focal error diversity to learn the most\nefficient ensemble teaming strategy, rather than assuming that an ensemble of a\nlarger number of base models will outperform those sub-ensembles of smaller\nsize. We develop a focal-diversity ensemble pruning method to effectively prune\nout the candidate ensembles with low ensemble error diversity and recommend\ntop-$K$ FS ensembles with the highest focal error diversity. 
Finally, we\ncapture the complex non-linear patterns of ensemble few-shot predictions by\ndesigning the learn-to-combine algorithm, which can learn the diverse weight\nassignments for robust ensemble fusion over different member models. Extensive\nexperiments on representative few-shot benchmarks show that the top-K ensembles\nrecommended by FusionShot can outperform the representative SOTA few-shot\nmodels on novel tasks (different distributions and unknown at training), and\ncan prevail over existing few-shot learners in both cross-domain settings and\nadversarial settings. For reproducibility purposes, FusionShot trained models,\nresults, and code are made available at https://github.com/sftekin/fusionshot\n","authors":["Selim Furkan Tekin","Fatih Ilhan","Tiansheng Huang","Sihao Hu","Ka-Ho Chow","Margaret L. Loper","Ling Liu"],"pdf_url":"https://arxiv.org/pdf/2404.04434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01545v4","updated":"2024-04-05T22:17:46Z","published":"2023-10-02T18:41:23Z","title":"RF-ULM: Ultrasound Localization Microscopy Learned from Radio-Frequency\n Wavefronts","summary":" In Ultrasound Localization Microscopy (ULM), achieving high-resolution images\nrelies on the precise localization of contrast agent particles across a series\nof beamformed frames. However, our study uncovers an enormous potential: The\nprocess of delay-and-sum beamforming leads to an irreversible reduction of\nRadio-Frequency (RF) channel data, while its implications for localization\nremain largely unexplored. The rich contextual information embedded within RF\nwavefronts, including their hyperbolic shape and phase, offers great promise\nfor guiding Deep Neural Networks (DNNs) in challenging localization scenarios.\nTo fully exploit this data, we propose to directly localize scatterers in RF\nchannel data. Our approach involves a custom super-resolution DNN using learned\nfeature channel shuffling, non-maximum suppression, and a semi-global\nconvolutional block for reliable and accurate wavefront localization.\nAdditionally, we introduce a geometric point transformation that facilitates\nseamless mapping to the B-mode coordinate space. To understand the impact of\nbeamforming on ULM, we validate the effectiveness of our method by conducting\nan extensive comparison with State-Of-The-Art (SOTA) techniques. We present the\ninaugural in vivo results from a wavefront-localizing DNN, highlighting its\nreal-world practicality. Our findings show that RF-ULM bridges the domain shift\nbetween synthetic and real datasets, offering a considerable advantage in terms\nof precision and complexity. To enable the broader research community to\nbenefit from our findings, our code and the associated SOTA methods are made\navailable at https://github.com/hahnec/rf-ulm.\n","authors":["Christopher Hahne","Georges Chabouh","Arthur Chavignon","Olivier Couture","Raphael Sznitman"],"pdf_url":"https://arxiv.org/pdf/2310.01545v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04430v1","updated":"2024-04-05T22:07:25Z","published":"2024-04-05T22:07:25Z","title":"PhysPT: Physics-aware Pretrained Transformer for Estimating Human\n Dynamics from Monocular Videos","summary":" While current methods have shown promising progress on estimating 3D human\nmotion from monocular videos, their motion estimates are often physically\nunrealistic because they mainly consider kinematics. 
In this paper, we\nintroduce Physics-aware Pretrained Transformer (PhysPT), which improves\nkinematics-based motion estimates and infers motion forces. PhysPT exploits a\nTransformer encoder-decoder backbone to effectively learn human dynamics in a\nself-supervised manner. Moreover, it incorporates physics principles governing\nhuman motion. Specifically, we build a physics-based body representation and\ncontact force model. We leverage them to impose novel physics-inspired training\nlosses (i.e., force loss, contact loss, and Euler-Lagrange loss), enabling\nPhysPT to capture physical properties of the human body and the forces it\nexperiences. Experiments demonstrate that, once trained, PhysPT can be directly\napplied to kinematics-based estimates to significantly enhance their physical\nplausibility and generate favourable motion forces. Furthermore, we show that\nthese physically meaningful quantities translate into improved accuracy of an\nimportant downstream task: human action recognition.\n","authors":["Yufei Zhang","Jeffrey O. Kephart","Zijun Cui","Qiang Ji"],"pdf_url":"https://arxiv.org/pdf/2404.04430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00938v2","updated":"2024-04-05T21:55:47Z","published":"2024-04-01T05:50:56Z","title":"How Can Large Language Models Enable Better Socially Assistive\n Human-Robot Interaction: A Brief Survey","summary":" Socially assistive robots (SARs) have shown great success in providing\npersonalized cognitive-affective support for user populations with special\nneeds such as older adults, children with autism spectrum disorder (ASD), and\nindividuals with mental health challenges. The large body of work on SAR\ndemonstrates its potential to provide at-home support that complements\nclinic-based interventions delivered by mental health professionals, making\nthese interventions more effective and accessible. However, there are still\nseveral major technical challenges that hinder SAR-mediated interactions and\ninterventions from reaching human-level social intelligence and efficacy. With\nthe recent advances in large language models (LLMs), there is an increased\npotential for novel applications within the field of SAR that can significantly\nexpand the current capabilities of SARs. However, incorporating LLMs introduces\nnew risks and ethical concerns that have not yet been encountered, and must be\ncarefully addressed to safely deploy these more advanced systems. In this\nwork, we aim to conduct a brief survey on the use of LLMs in SAR technologies,\nand discuss the potentials and risks of applying LLMs to the following three\nmajor technical challenges of SAR: 1) natural language dialog; 2) multimodal\nunderstanding; 3) LLMs as robot policies.\n","authors":["Zhonghao Shi","Ellen Landrum","Amy O' Connell","Mina Kian","Leticia Pinto-Alva","Kaleen Shrestha","Xiaoyuan Zhu","Maja J Matarić"],"pdf_url":"https://arxiv.org/pdf/2404.00938v2.pdf","comment":"2 pages, accepted to the Proceedings of the AAAI Symposium Series,\n 2024"},{"id":"http://arxiv.org/abs/2404.04421v1","updated":"2024-04-05T21:44:57Z","published":"2024-04-05T21:44:57Z","title":"PhysAvatar: Learning the Physics of Dressed 3D Avatars from Visual\n Observations","summary":" Modeling and rendering photorealistic avatars is of crucial importance in\nmany applications. Existing methods that build a 3D avatar from visual\nobservations, however, struggle to reconstruct clothed humans. 
We introduce\nPhysAvatar, a novel framework that combines inverse rendering with inverse\nphysics to automatically estimate the shape and appearance of a human from\nmulti-view video data along with the physical parameters of the fabric of their\nclothes. For this purpose, we adopt a mesh-aligned 4D Gaussian technique for\nspatio-temporal mesh tracking as well as a physically based inverse renderer to\nestimate the intrinsic material properties. PhysAvatar integrates a physics\nsimulator to estimate the physical parameters of the garments using\ngradient-based optimization in a principled manner. These novel capabilities\nenable PhysAvatar to create high-quality novel-view renderings of avatars\ndressed in loose-fitting clothes under motions and lighting conditions not seen\nin the training data. This marks a significant advancement towards modeling\nphotorealistic digital humans using physically based inverse rendering with\nphysics in the loop. Our project website is at:\nhttps://qingqing-zhao.github.io/PhysAvatar\n","authors":["Yang Zheng","Qingqing Zhao","Guandao Yang","Wang Yifan","Donglai Xiang","Florian Dubost","Dmitry Lagun","Thabo Beeler","Federico Tombari","Leonidas Guibas","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2404.04421v1.pdf","comment":"Yang Zheng and Qingqing Zhao are project co-leads"},{"id":"http://arxiv.org/abs/2308.02958v2","updated":"2024-04-05T21:39:23Z","published":"2023-08-05T22:07:37Z","title":"K-band: Self-supervised MRI Reconstruction via Stochastic Gradient\n Descent over K-space Subsets","summary":" Although deep learning (DL) methods are powerful for solving inverse\nproblems, their reliance on high-quality training data is a major hurdle. This\nis significant in high-dimensional (dynamic/volumetric) magnetic resonance\nimaging (MRI), where acquisition of high-resolution fully sampled k-space data\nis impractical. We introduce a novel mathematical framework, dubbed k-band,\nthat enables training DL models using only partial, limited-resolution k-space\ndata. Specifically, we introduce training with stochastic gradient descent\n(SGD) over k-space subsets. In each training iteration, rather than using the\nfully sampled k-space for computing gradients, we use only a small k-space\nportion. This concept is compatible with different sampling strategies; here we\ndemonstrate the method for k-space \"bands\", which have limited resolution in\none dimension and can hence be acquired rapidly. We prove analytically that our\nmethod stochastically approximates the gradients computed in a fully-supervised\nsetup, when two simple conditions are met: (i) the limited-resolution axis is\nchosen randomly-uniformly for every new scan, hence k-space is fully covered\nacross the entire training set, and (ii) the loss function is weighed with a\nmask, derived here analytically, which facilitates accurate reconstruction of\nhigh-resolution details. Numerical experiments with raw MRI data indicate that\nk-band outperforms two other methods trained on limited-resolution data and\nperforms comparably to state-of-the-art (SoTA) methods trained on\nhigh-resolution data. k-band hence obtains SoTA performance, with the advantage\nof training using only limited-resolution data. 
This work hence introduces a\npractical, easy-to-implement, self-supervised training framework, which\ninvolves fast acquisition and self-supervised reconstruction and offers\ntheoretical guarantees.\n","authors":["Frederic Wang","Han Qi","Alfredo De Goyeneche","Reinhard Heckel","Michael Lustig","Efrat Shimron"],"pdf_url":"https://arxiv.org/pdf/2308.02958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04818v3","updated":"2024-04-05T20:58:34Z","published":"2023-11-08T16:42:14Z","title":"Cross-Silo Federated Learning Across Divergent Domains with Iterative\n Parameter Alignment","summary":" Learning from the collective knowledge of data dispersed across private\nsources can provide neural networks with enhanced generalization capabilities.\nFederated learning, a method for collaboratively training a machine learning\nmodel across remote clients, achieves this by combining client models via the\norchestration of a central server. However, current approaches face two\ncritical limitations: i) they struggle to converge when client domains are\nsufficiently different, and ii) current aggregation techniques produce an\nidentical global model for each client. In this work, we address these issues\nby reformulating the typical federated learning setup: rather than learning a\nsingle global model, we learn N models each optimized for a common objective.\nTo achieve this, we apply a weighted distance minimization to model parameters\nshared in a peer-to-peer topology. The resulting framework, Iterative Parameter\nAlignment, applies naturally to the cross-silo setting, and has the following\nproperties: (i) a unique solution for each participant, with the option to\nglobally converge each model in the federation, and (ii) an optional\nearly-stopping mechanism to elicit fairness among peers in collaborative\nlearning settings. These characteristics jointly provide a flexible new\nframework for iteratively learning from peer models trained on disparate\ndatasets. We find that the technique achieves competitive results on a variety\nof data partitions compared to state-of-the-art approaches. Further, we show\nthat the method is robust to divergent domains (i.e. disjoint classes across\npeers) where existing approaches struggle.\n","authors":["Matt Gorbett","Hossein Shirazi","Indrakshi Ray"],"pdf_url":"https://arxiv.org/pdf/2311.04818v3.pdf","comment":"Published at IEEE Big Data 2023"},{"id":"http://arxiv.org/abs/2404.04394v1","updated":"2024-04-05T20:39:16Z","published":"2024-04-05T20:39:16Z","title":"Analyzing Participants' Engagement during Online Meetings Using\n Unsupervised Remote Photoplethysmography with Behavioral Features","summary":" Engagement measurement finds application in healthcare, education,\nadvertisement, and services. The use of physiological and behavioral features\nis viable, but the impracticality of traditional physiological measurement\narises due to the need for contact sensors. We demonstrate the feasibility of\nunsupervised remote photoplethysmography (rPPG) as an alternative for contact\nsensors in deriving heart rate variability (HRV) features, then fusing these\nwith behavioral features to measure engagement in online group meetings.\nFirstly, a unique Engagement Dataset of online interactions among social\nworkers is collected with granular engagement labels, offering insight into\nvirtual meeting dynamics. 
Secondly, a pre-trained rPPG model is customized to\nreconstruct accurate rPPG signals from video meetings in an unsupervised\nmanner, enabling the calculation of HRV features. Thirdly, the feasibility of\nestimating engagement from HRV features using short observation windows, with a\nnotable enhancement when using longer observation windows of two to four\nminutes, is demonstrated. Fourthly, the effectiveness of behavioral cues is\nevaluated and fused with physiological data, which further enhances engagement\nestimation performance. An accuracy of 94% is achieved when only HRV features\nare used, eliminating the need for contact sensors or ground truth signals. The\nincorporation of behavioral cues raises the accuracy to 96%. Facial video\nanalysis offers precise engagement measurement, beneficial for future\napplications.\n","authors":["Alexander Vedernikov","Zhaodong Sun","Virpi-Liisa Kykyri","Mikko Pohjola","Miriam Nokia","Xiaobai Li"],"pdf_url":"https://arxiv.org/pdf/2404.04394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19964v3","updated":"2024-04-05T20:33:14Z","published":"2024-03-29T03:56:19Z","title":"FairRAG: Fair Human Generation via Fair Retrieval Augmentation","summary":" Existing text-to-image generative models reflect or even amplify societal\nbiases ingrained in their training data. This is especially concerning for\nhuman image generation where models are biased against certain demographic\ngroups. Existing attempts to rectify this issue are hindered by the inherent\nlimitations of the pre-trained models and fail to substantially improve\ndemographic diversity. In this work, we introduce Fair Retrieval Augmented\nGeneration (FairRAG), a novel framework that conditions pre-trained generative\nmodels on reference images retrieved from an external image database to improve\nfairness in human generation. FairRAG enables conditioning through a\nlightweight linear module that projects reference images into the textual\nspace. To enhance fairness, FairRAG applies simple-yet-effective debiasing\nstrategies, providing images from diverse demographic groups during the\ngenerative process. Extensive experiments demonstrate that FairRAG outperforms\nexisting methods in terms of demographic diversity, image-text alignment, and\nimage fidelity while incurring minimal computational overhead during inference.\n","authors":["Robik Shrestha","Yang Zou","Qiuyu Chen","Zhiheng Li","Yusheng Xie","Siqi Deng"],"pdf_url":"https://arxiv.org/pdf/2403.19964v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19949v2","updated":"2024-04-05T20:08:16Z","published":"2024-03-29T03:15:31Z","title":"FairCLIP: Harnessing Fairness in Vision-Language Learning","summary":" Fairness is a critical concern in deep learning, especially in healthcare,\nwhere these models influence diagnoses and treatment decisions. Although\nfairness has been investigated in the vision-only domain, the fairness of\nmedical vision-language (VL) models remains unexplored due to the scarcity of\nmedical VL datasets for studying fairness. To bridge this research gap, we\nintroduce the first fair vision-language medical dataset Harvard-FairVLMed that\nprovides detailed demographic attributes, ground-truth labels, and clinical\nnotes to facilitate an in-depth examination of fairness within VL foundation\nmodels. 
Using Harvard-FairVLMed, we conduct a comprehensive fairness analysis\nof two widely-used VL models (CLIP and BLIP2), pre-trained on both natural and\nmedical domains, across four different protected attributes. Our results\nhighlight significant biases in all VL models, with Asian, Male, Non-Hispanic,\nand Spanish being the preferred subgroups across the protected attributes of\nrace, gender, ethnicity, and language, respectively. In order to alleviate\nthese biases, we propose FairCLIP, an optimal-transport-based approach that\nachieves a favorable trade-off between performance and fairness by reducing the\nSinkhorn distance between the overall sample distribution and the distributions\ncorresponding to each demographic group. As the first VL dataset of its kind,\nHarvard-FairVLMed holds the potential to catalyze advancements in the\ndevelopment of machine learning models that are both ethically aware and\nclinically effective. Our dataset and code are available at\nhttps://ophai.hms.harvard.edu/datasets/harvard-fairvlmed10k.\n","authors":["Yan Luo","Min Shi","Muhammad Osama Khan","Muhammad Muneeb Afzal","Hao Huang","Shuaihang Yuan","Yu Tian","Luo Song","Ava Kouhana","Tobias Elze","Yi Fang","Mengyu Wang"],"pdf_url":"https://arxiv.org/pdf/2403.19949v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04377v1","updated":"2024-04-05T19:42:55Z","published":"2024-04-05T19:42:55Z","title":"LOSS-SLAM: Lightweight Open-Set Semantic Simultaneous Localization and\n Mapping","summary":" Enabling robots to understand the world in terms of objects is a critical\nbuilding block towards higher level autonomy. The success of foundation models\nin vision has created the ability to segment and identify nearly all objects in\nthe world. However, utilizing such objects to localize the robot and build an\nopen-set semantic map of the world remains an open research question. In this\nwork, a system of identifying, localizing, and encoding objects is tightly\ncoupled with probabilistic graphical models for performing open-set semantic\nsimultaneous localization and mapping (SLAM). Results are presented\ndemonstrating that the proposed lightweight object encoding can be used to\nperform more accurate object-based SLAM than existing open-set methods,\nclosed-set methods, and geometric methods while incurring a lower computational\noverhead than existing open-set mapping methods.\n","authors":["Kurran Singh","Tim Magoun","John J. Leonard"],"pdf_url":"https://arxiv.org/pdf/2404.04377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04376v1","updated":"2024-04-05T19:38:18Z","published":"2024-04-05T19:38:18Z","title":"ClickDiffusion: Harnessing LLMs for Interactive Precise Image Editing","summary":" Recently, researchers have proposed powerful systems for generating and\nmanipulating images using natural language instructions. However, it is\ndifficult to precisely specify many common classes of image transformations\nwith text alone. For example, a user may wish to change the location and breed\nof a particular dog in an image with several similar dogs. This task is quite\ndifficult with natural language alone, and would require a user to write a\nlaboriously complex prompt that both disambiguates the target dog and describes\nthe destination. 
We propose ClickDiffusion, a system for precise image\nmanipulation and generation that combines natural language instructions with\nvisual feedback provided by the user through a direct manipulation interface.\nWe demonstrate that by serializing both an image and a multi-modal instruction\ninto a textual representation it is possible to leverage LLMs to perform\nprecise transformations of the layout and appearance of an image. Code\navailable at https://github.com/poloclub/ClickDiffusion.\n","authors":["Alec Helbling","Seongmin Lee","Polo Chau"],"pdf_url":"https://arxiv.org/pdf/2404.04376v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2402.07925"},{"id":"http://arxiv.org/abs/2311.17518v2","updated":"2024-04-05T19:18:30Z","published":"2023-11-29T10:40:52Z","title":"The devil is in the fine-grained details: Evaluating open-vocabulary\n object detectors for fine-grained understanding","summary":" Recent advancements in large vision-language models enabled visual object\ndetection in open-vocabulary scenarios, where object classes are defined in\nfree-text formats during inference. In this paper, we aim to probe the\nstate-of-the-art methods for open-vocabulary object detection to determine to\nwhat extent they understand fine-grained properties of objects and their parts.\nTo this end, we introduce an evaluation protocol based on dynamic vocabulary\ngeneration to test whether models detect, discern, and assign the correct\nfine-grained description to objects in the presence of hard-negative classes.\nWe contribute with a benchmark suite of increasing difficulty and probing\ndifferent properties like color, pattern, and material. We further enhance our\ninvestigation by evaluating several state-of-the-art open-vocabulary object\ndetectors using the proposed protocol and find that most existing solutions,\nwhich shine in standard open-vocabulary benchmarks, struggle to accurately\ncapture and distinguish finer object details. We conclude the paper by\nhighlighting the limitations of current methodologies and exploring promising\nresearch directions to overcome the discovered drawbacks. Data and code are\navailable at https://lorebianchi98.github.io/FG-OVD/.\n","authors":["Lorenzo Bianchi","Fabio Carrara","Nicola Messina","Claudio Gennaro","Fabrizio Falchi"],"pdf_url":"https://arxiv.org/pdf/2311.17518v2.pdf","comment":"Accepted as Highlight at CVPR2024"},{"id":"http://arxiv.org/abs/2404.04363v1","updated":"2024-04-05T19:16:30Z","published":"2024-04-05T19:16:30Z","title":"Idea-2-3D: Collaborative LMM Agents Enable 3D Model Generation from\n Interleaved Multimodal Inputs","summary":" In this paper, we pursue a novel 3D AIGC setting: generating 3D content from\nIDEAs. The definition of an IDEA is the composition of multimodal inputs\nincluding text, image, and 3D models. To our knowledge, this challenging and\nappealing 3D AIGC setting has not been studied before. We propose the novel\nframework called Idea-2-3D to achieve this goal, which consists of three agents\nbased upon large multimodel models (LMMs) and several existing algorithmic\ntools for them to invoke. Specifically, these three LMM-based agents are\nprompted to do the jobs of prompt generation, model selection and feedback\nreflection. They work in a cycle that involves both mutual collaboration and\ncriticism. Note that this cycle is done in a fully automatic manner, without\nany human intervention. The framework then outputs a text prompt to generate 3D\nmodels that well align with input IDEAs. 
We show impressive 3D AIGC results\nthat are beyond what any previous method can achieve. For quantitative comparisons,\nwe construct caption-based baselines using a broad set of state-of-the-art 3D\nAIGC models and demonstrate that Idea-2-3D outperforms them significantly. In 94.2% of\ncases, Idea-2-3D meets users' requirements, marking a degree of match between\nIDEA and 3D models that is 2.3 times higher than baselines. Moreover, in 93.5%\nof the cases, users agreed that Idea-2-3D was better than baselines. Codes,\ndata and models will be made publicly available.\n","authors":["Junhao Chen","Xiang Li","Xiaojun Ye","Chao Li","Zhaoxin Fan","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.04363v1.pdf","comment":"Project Page: https://air-discover.github.io/Idea-2-3D/ Code:\n https://github.com/yisuanwang/Idea23D"},{"id":"http://arxiv.org/abs/2404.04356v1","updated":"2024-04-05T18:56:00Z","published":"2024-04-05T18:56:00Z","title":"Pixel-wise RL on Diffusion Models: Reinforcement Learning from Rich\n Feedback","summary":" Latent diffusion models are the state-of-the-art for synthetic image\ngeneration. To align these models with human preferences, training the models\nusing reinforcement learning on human feedback is crucial. Black et al. (2024)\nintroduced denoising diffusion policy optimisation (DDPO), which accounts for\nthe iterative denoising nature of the generation by modelling it as a Markov\nchain with a final reward. As the reward is a single value that determines the\nmodel's performance on the entire image, the model has to navigate a very\nsparse reward landscape and so requires a large sample count. In this work, we\nextend DDPO by presenting the Pixel-wise Policy Optimisation (PXPO)\nalgorithm, which can take feedback for each pixel, providing a more nuanced\nreward to the model.\n","authors":["Mo Kordzanganeh","Danial Keshvary","Nariman Arian"],"pdf_url":"https://arxiv.org/pdf/2404.04356v1.pdf","comment":"6 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.04346v1","updated":"2024-04-05T18:33:04Z","published":"2024-04-05T18:33:04Z","title":"Koala: Key frame-conditioned long video-LLM","summary":" Long video question answering is a challenging task that involves recognizing\nshort-term activities and reasoning about their fine-grained relationships.\nState-of-the-art video Large Language Models (vLLMs) hold promise as a viable\nsolution due to their demonstrated emergent capabilities on new tasks. However,\ndespite being trained on millions of short seconds-long videos, vLLMs are\nunable to understand minutes-long videos and accurately answer questions about\nthem. To address this limitation, we propose a lightweight and self-supervised\napproach, Key frame-conditioned long video-LLM (Koala), that introduces\nlearnable spatiotemporal queries to adapt pretrained vLLMs for generalizing to\nlonger videos. Our approach introduces two new tokenizers that condition on\nvisual tokens computed from sparse video key frames for understanding short and\nlong video moments. We train our proposed approach on HowTo100M and demonstrate\nits effectiveness on zero-shot long video understanding benchmarks, where it\noutperforms state-of-the-art large models by 3 - 6% in absolute accuracy across\nall tasks. Surprisingly, we also empirically show that our approach not only\nhelps a pretrained vLLM to understand long videos but also improves its\naccuracy on short-term action recognition.\n","authors":["Reuben Tan","Ximeng Sun","Ping Hu","Jui-hsien Wang","Hanieh Deilamsalehy","Bryan A. 
Plummer","Bryan Russell","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2404.04346v1.pdf","comment":"Accepted at CVPR 2024 as a poster highlight"},{"id":"http://arxiv.org/abs/2403.13808v2","updated":"2024-04-05T18:22:02Z","published":"2024-03-20T17:59:58Z","title":"On Pretraining Data Diversity for Self-Supervised Learning","summary":" We explore the impact of training with more diverse datasets, characterized\nby the number of unique samples, on the performance of self-supervised learning\n(SSL) under a fixed computational budget. Our findings consistently demonstrate\nthat increasing pretraining data diversity enhances SSL performance, albeit\nonly when the distribution distance to the downstream data is minimal. Notably,\neven with an exceptionally large pretraining data diversity achieved through\nmethods like web crawling or diffusion-generated data, among other ways, the\ndistribution shift remains a challenge. Our experiments are comprehensive with\nseven SSL methods using large-scale datasets such as ImageNet and YFCC100M\namounting to over 200 GPU days. Code and trained models will be available at\nhttps://github.com/hammoudhasan/DiversitySSL .\n","authors":["Hasan Abed Al Kader Hammoud","Tuhin Das","Fabio Pizzati","Philip Torr","Adel Bibi","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2403.13808v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2403.16271v3","updated":"2024-04-05T18:11:20Z","published":"2024-03-24T19:32:39Z","title":"Object Detectors in the Open Environment: Challenges, Solutions, and\n Outlook","summary":" With the emergence of foundation models, deep learning-based object detectors\nhave shown practical usability in closed set scenarios. However, for real-world\ntasks, object detectors often operate in open environments, where crucial\nfactors (e.g., data distribution, objective) that influence model learning are\noften changing. The dynamic and intricate nature of the open environment poses\nnovel and formidable challenges to object detectors. Unfortunately, current\nresearch on object detectors in open environments lacks a comprehensive\nanalysis of their distinctive characteristics, challenges, and corresponding\nsolutions, which hinders their secure deployment in critical real-world\nscenarios. This paper aims to bridge this gap by conducting a comprehensive\nreview and analysis of object detectors in open environments. We initially\nidentified limitations of key structural components within the existing\ndetection pipeline and propose the open environment object detector challenge\nframework that includes four quadrants (i.e., out-of-domain, out-of-category,\nrobust learning, and incremental learning) based on the dimensions of the data\n/ target changes. For each quadrant of challenges in the proposed framework, we\npresent a detailed description and systematic analysis of the overarching goals\nand core difficulties, systematically review the corresponding solutions, and\nbenchmark their performance over multiple widely adopted datasets. In addition,\nwe engage in a discussion of open problems and potential avenues for future\nresearch. This paper aims to provide a fresh, comprehensive, and systematic\nunderstanding of the challenges and solutions associated with open-environment\nobject detectors, thus catalyzing the development of more solid applications in\nreal-world scenarios. 
A project related to this survey can be found at\nhttps://github.com/LiangSiyuan21/OEOD_Survey.\n","authors":["Siyuan Liang","Wei Wang","Ruoyu Chen","Aishan Liu","Boxi Wu","Ee-Chien Chang","Xiaochun Cao","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.16271v3.pdf","comment":"37 pages, 17 figures"},{"id":"http://arxiv.org/abs/2404.04319v1","updated":"2024-04-05T17:59:25Z","published":"2024-04-05T17:59:25Z","title":"SpatialTracker: Tracking Any 2D Pixels in 3D Space","summary":" Recovering dense and long-range pixel motion in videos is a challenging\nproblem. Part of the difficulty arises from the 3D-to-2D projection process,\nleading to occlusions and discontinuities in the 2D motion domain. While 2D\nmotion can be intricate, we posit that the underlying 3D motion can often be\nsimple and low-dimensional. In this work, we propose to estimate point\ntrajectories in 3D space to mitigate the issues caused by image projection. Our\nmethod, named SpatialTracker, lifts 2D pixels to 3D using monocular depth\nestimators, represents the 3D content of each frame efficiently using a\ntriplane representation, and performs iterative updates using a transformer to\nestimate 3D trajectories. Tracking in 3D allows us to leverage\nas-rigid-as-possible (ARAP) constraints while simultaneously learning a\nrigidity embedding that clusters pixels into different rigid parts. Extensive\nevaluation shows that our approach achieves state-of-the-art tracking\nperformance both qualitatively and quantitatively, particularly in challenging\nscenarios such as out-of-plane rotation.\n","authors":["Yuxi Xiao","Qianqian Wang","Shangzhan Zhang","Nan Xue","Sida Peng","Yujun Shen","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.04319v1.pdf","comment":"Accepted to CVPR 2024 (selected as highlight paper). Project page:\n https://henry123-boy.github.io/SpaTracker/"},{"id":"http://arxiv.org/abs/2404.04318v1","updated":"2024-04-05T17:55:33Z","published":"2024-04-05T17:55:33Z","title":"Robust Depth Enhancement via Polarization Prompt Fusion Tuning","summary":" Existing depth sensors are imperfect and may provide inaccurate depth values\nin challenging scenarios, such as in the presence of transparent or reflective\nobjects. In this work, we present a general framework that leverages\npolarization imaging to improve inaccurate depth measurements from various\ndepth sensors. Previous polarization-based depth enhancement methods focus on\nutilizing pure physics-based formulas for a single sensor. In contrast, our\nmethod first adopts a learning-based strategy where a neural network is trained\nto estimate a dense and complete depth map from polarization data and a sensor\ndepth map from different sensors. To further improve the performance, we\npropose a Polarization Prompt Fusion Tuning (PPFT) strategy to effectively\nutilize RGB-based models pre-trained on large-scale datasets, as the size of\nthe polarization dataset is limited to train a strong model from scratch. We\nconducted extensive experiments on a public dataset, and the results\ndemonstrate that the proposed method performs favorably compared to existing\ndepth enhancement baselines. Code and demos are available at\nhttps://lastbasket.github.io/PPFT/.\n","authors":["Kei Ikemura","Yiming Huang","Felix Heide","Zhaoxiang Zhang","Qifeng Chen","Chenyang Lei"],"pdf_url":"https://arxiv.org/pdf/2404.04318v1.pdf","comment":"CVPR 2024. Project page: https://lastbasket.github.io/PPFT/. 
The\n first two authors contribute equally"},{"id":"http://arxiv.org/abs/2404.04308v1","updated":"2024-04-05T07:31:24Z","published":"2024-04-05T07:31:24Z","title":"Visual Knowledge in the Big Model Era: Retrospect and Prospect","summary":" Visual knowledge is a new form of knowledge representation that can\nencapsulate visual concepts and their relations in a succinct, comprehensive,\nand interpretable manner, with a deep root in cognitive psychology. As the\nknowledge about the visual world has been identified as an indispensable\ncomponent of human cognition and intelligence, visual knowledge is poised to\nhave a pivotal role in establishing machine intelligence. With the recent\nadvance of Artificial Intelligence (AI) techniques, large AI models (or\nfoundation models) have emerged as a potent tool capable of extracting\nversatile patterns from broad data as implicit knowledge, and abstracting them\ninto an outrageous amount of numeric parameters. To pave the way for creating\nvisual knowledge empowered AI machines in this coming wave, we present a timely\nreview that investigates the origins and development of visual knowledge in the\npre-big model era, and accentuates the opportunities and unique role of visual\nknowledge in the big model era.\n","authors":["Wenguan Wang","Yi Yang","Yunhe Pan"],"pdf_url":"https://arxiv.org/pdf/2404.04308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05758v1","updated":"2024-04-05T21:28:56Z","published":"2024-04-05T21:28:56Z","title":"Implicit Assimilation of Sparse In Situ Data for Dense & Global Storm\n Surge Forecasting","summary":" Hurricanes and coastal floods are among the most disastrous natural hazards.\nBoth are intimately related to storm surges, as their causes and effects,\nrespectively. However, the short-term forecasting of storm surges has proven\nchallenging, especially when targeting previously unseen locations or sites\nwithout tidal gauges. Furthermore, recent work improved short and medium-term\nweather forecasting but the handling of raw unassimilated data remains\nnon-trivial. In this paper, we tackle both challenges and demonstrate that\nneural networks can implicitly assimilate sparse in situ tide gauge data with\ncoarse ocean state reanalysis in order to forecast storm surges. We curate a\nglobal dataset to learn and validate the dense prediction of storm surges,\nbuilding on preceding efforts. Other than prior work limited to known gauges,\nour approach extends to ungauged sites, paving the way for global storm surge\nforecasting.\n","authors":["Patrick Ebel","Brandon Victor","Peter Naylor","Gabriele Meoni","Federico Serva","Rochelle Schneider"],"pdf_url":"https://arxiv.org/pdf/2404.05758v1.pdf","comment":"Accepted at CVPR EarthVision 2024"}]},"2024-04-08T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.05729v1","updated":"2024-04-08T17:59:46Z","published":"2024-04-08T17:59:46Z","title":"Finding Visual Task Vectors","summary":" Visual Prompting is a technique for teaching models to perform a visual task\nvia in-context examples, without any additional training. In this work, we\nanalyze the activations of MAE-VQGAN, a recent Visual Prompting model, and find\ntask vectors, activations that encode task-specific information. Equipped with\nthis insight, we demonstrate that it is possible to identify the task vectors\nand use them to guide the network towards performing different tasks without\nproviding any input-output examples. 
To find task vectors, we compute the\naverage intermediate activations per task and use the REINFORCE algorithm to\nsearch for the subset of task vectors. The resulting task vectors guide the\nmodel towards performing a task better than the original model without the need\nfor input-output examples.\n","authors":["Alberto Hojel","Yutong Bai","Trevor Darrell","Amir Globerson","Amir Bar"],"pdf_url":"https://arxiv.org/pdf/2404.05729v1.pdf","comment":"https://github.com/alhojel/visual_task_vectors"},{"id":"http://arxiv.org/abs/2404.05726v1","updated":"2024-04-08T17:59:24Z","published":"2024-04-08T17:59:24Z","title":"MA-LMM: Memory-Augmented Large Multimodal Model for Long-Term Video\n Understanding","summary":" With the success of large language models (LLMs), integrating the vision\nmodel into LLMs to build vision-language foundation models has gained much more\ninterest recently. However, existing LLM-based large multimodal models (e.g.,\nVideo-LLaMA, VideoChat) can only take in a limited number of frames for short\nvideo understanding. In this study, we mainly focus on designing an efficient\nand effective model for long-term video understanding. Instead of trying to\nprocess more frames simultaneously like most existing work, we propose to\nprocess videos in an online manner and store past video information in a memory\nbank. This allows our model to reference historical video content for long-term\nanalysis without exceeding LLMs' context length constraints or GPU memory\nlimits. Our memory bank can be seamlessly integrated into current multimodal\nLLMs in an off-the-shelf manner. We conduct extensive experiments on various\nvideo understanding tasks, such as long-video understanding, video question\nanswering, and video captioning, and our model can achieve state-of-the-art\nperformances across multiple datasets. Code available at\nhttps://boheumd.github.io/MA-LMM/.\n","authors":["Bo He","Hengduo Li","Young Kyun Jang","Menglin Jia","Xuefei Cao","Ashish Shah","Abhinav Shrivastava","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2404.05726v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05719v1","updated":"2024-04-08T17:55:44Z","published":"2024-04-08T17:55:44Z","title":"Ferret-UI: Grounded Mobile UI Understanding with Multimodal LLMs","summary":" Recent advancements in multimodal large language models (MLLMs) have been\nnoteworthy, yet, these general-domain MLLMs often fall short in their ability\nto comprehend and interact effectively with user interface (UI) screens. In\nthis paper, we present Ferret-UI, a new MLLM tailored for enhanced\nunderstanding of mobile UI screens, equipped with referring, grounding, and\nreasoning capabilities. Given that UI screens typically exhibit a more\nelongated aspect ratio and contain smaller objects of interest (e.g., icons,\ntexts) than natural images, we incorporate \"any resolution\" on top of Ferret to\nmagnify details and leverage enhanced visual features. Specifically, each\nscreen is divided into 2 sub-images based on the original aspect ratio (i.e.,\nhorizontal division for portrait screens and vertical division for landscape\nscreens). Both sub-images are encoded separately before being sent to LLMs. We\nmeticulously gather training samples from an extensive range of elementary UI\ntasks, such as icon recognition, find text, and widget listing. These samples\nare formatted for instruction-following with region annotations to facilitate\nprecise referring and grounding. 
To augment the model's reasoning ability, we\nfurther compile a dataset for advanced tasks, including detailed description,\nperception/interaction conversations, and function inference. After training on\nthe curated datasets, Ferret-UI exhibits outstanding comprehension of UI\nscreens and the capability to execute open-ended instructions. For model\nevaluation, we establish a comprehensive benchmark encompassing all the\naforementioned tasks. Ferret-UI excels not only beyond most open-source UI\nMLLMs, but also surpasses GPT-4V on all the elementary UI tasks.\n","authors":["Keen You","Haotian Zhang","Eldon Schoop","Floris Weers","Amanda Swearngin","Jeffrey Nichols","Yinfei Yang","Zhe Gan"],"pdf_url":"https://arxiv.org/pdf/2404.05719v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05717v1","updated":"2024-04-08T17:52:29Z","published":"2024-04-08T17:52:29Z","title":"SwapAnything: Enabling Arbitrary Object Swapping in Personalized Visual\n Editing","summary":" Effective editing of personal content holds a pivotal role in enabling\nindividuals to express their creativity, weaving captivating narratives within\ntheir visual stories, and elevate the overall quality and impact of their\nvisual content. Therefore, in this work, we introduce SwapAnything, a novel\nframework that can swap any objects in an image with personalized concepts\ngiven by the reference, while keeping the context unchanged. Compared with\nexisting methods for personalized subject swapping, SwapAnything has three\nunique advantages: (1) precise control of arbitrary objects and parts rather\nthan the main subject, (2) more faithful preservation of context pixels, (3)\nbetter adaptation of the personalized concept to the image. First, we propose\ntargeted variable swapping to apply region control over latent feature maps and\nswap masked variables for faithful context preservation and initial semantic\nconcept swapping. Then, we introduce appearance adaptation, to seamlessly adapt\nthe semantic concept into the original image in terms of target location,\nshape, style, and content during the image generation process. Extensive\nresults on both human and automatic evaluation demonstrate significant\nimprovements of our approach over baseline methods on personalized swapping.\nFurthermore, SwapAnything shows its precise and faithful swapping abilities\nacross single object, multiple objects, partial object, and cross-domain\nswapping tasks. SwapAnything also achieves great performance on text-based\nswapping and tasks beyond swapping such as object insertion.\n","authors":["Jing Gu","Yilin Wang","Nanxuan Zhao","Wei Xiong","Qing Liu","Zhifei Zhang","He Zhang","Jianming Zhang","HyunJoon Jung","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05717v1.pdf","comment":"18 pages, 16 figures, 3 tables"},{"id":"http://arxiv.org/abs/2311.04071v4","updated":"2024-04-08T17:49:58Z","published":"2023-11-07T15:35:56Z","title":"Energy-Calibrated VAE with Test Time Free Lunch","summary":" In this paper, we propose a novel generative model that utilizes a\nconditional Energy-Based Model (EBM) for enhancing Variational Autoencoder\n(VAE), termed Energy-Calibrated VAE (EC-VAE). Specifically, VAEs often suffer\nfrom blurry generated samples due to the lack of a tailored training on the\nsamples generated in the generative direction. On the other hand, EBMs can\ngenerate high-quality samples but require expensive Markov Chain Monte Carlo\n(MCMC) sampling. 
To address these issues, we introduce a conditional EBM for\ncalibrating the generative direction of VAE during training, without requiring\nit for the generation at test time. In particular, we train EC-VAE upon both\nthe input data and the calibrated samples with adaptive weight to enhance\nefficacy while avoiding MCMC sampling at test time. Furthermore, we extend the\ncalibration idea of EC-VAE to variational learning and normalizing flows, and\napply EC-VAE to an additional application of zero-shot image restoration via\nneural transport prior and range-null theory. We evaluate the proposed method\nwith two applications, including image generation and zero-shot image\nrestoration, and the experimental results show that our method achieves\ncompetitive performance over single-step non-adversarial generation. Our code\nis available at https://github.com/DJ-LYH/EC-VAE.\n","authors":["Yihong Luo","Siya Qiu","Xingjian Tao","Yujun Cai","Jing Tang"],"pdf_url":"https://arxiv.org/pdf/2311.04071v4.pdf","comment":"Revision. Code is available at https://github.com/DJ-LYH/EC-VAE"},{"id":"http://arxiv.org/abs/2404.05705v1","updated":"2024-04-08T17:42:08Z","published":"2024-04-08T17:42:08Z","title":"Learning 3D-Aware GANs from Unposed Images with Template Feature Field","summary":" Collecting accurate camera poses of training images has been shown to well\nserve the learning of 3D-aware generative adversarial networks (GANs) yet can\nbe quite expensive in practice. This work targets learning 3D-aware GANs from\nunposed images, for which we propose to perform on-the-fly pose estimation of\ntraining images with a learned template feature field (TeFF). Concretely, in\naddition to a generative radiance field as in previous approaches, we ask the\ngenerator to also learn a field from 2D semantic features while sharing the\ndensity from the radiance field. Such a framework allows us to acquire a\ncanonical 3D feature template leveraging the dataset mean discovered by the\ngenerative model, and further efficiently estimate the pose parameters on real\ndata. Experimental results on various challenging datasets demonstrate the\nsuperiority of our approach over state-of-the-art alternatives from both the\nqualitative and the quantitative perspectives.\n","authors":["Xinya Chen","Hanlei Guo","Yanrui Bin","Shangzhan Zhang","Yuanbo Yang","Yue Wang","Yujun Shen","Yiyi Liao"],"pdf_url":"https://arxiv.org/pdf/2404.05705v1.pdf","comment":"https://XDimlab.github.io/TeFF"},{"id":"http://arxiv.org/abs/2404.05693v1","updated":"2024-04-08T17:18:30Z","published":"2024-04-08T17:18:30Z","title":"Evaluating the Efficacy of Cut-and-Paste Data Augmentation in Semantic\n Segmentation for Satellite Imagery","summary":" Satellite imagery is crucial for tasks like environmental monitoring and\nurban planning. Typically, it relies on semantic segmentation or Land Use Land\nCover (LULC) classification to categorize each pixel. Despite the advancements\nbrought about by Deep Neural Networks (DNNs), their performance in segmentation\ntasks is hindered by challenges such as limited availability of labeled data,\nclass imbalance and the inherent variability and complexity of satellite\nimages. In order to mitigate those issues, our study explores the effectiveness\nof a Cut-and-Paste augmentation technique for semantic segmentation in\nsatellite images. We adapt this augmentation, which usually requires labeled\ninstances, to the case of semantic segmentation. 
By leveraging the connected\ncomponents in the semantic segmentation labels, we extract instances that are\nthen randomly pasted during training. Using the DynamicEarthNet dataset and a\nU-Net model for evaluation, we found that this augmentation significantly\nenhances the mIoU score on the test set from 37.9 to 44.1. This finding\nhighlights the potential of the Cut-and-Paste augmentation to improve the\ngeneralization capabilities of semantic segmentation models in satellite\nimagery.\n","authors":["Ionut M. Motoi","Leonardo Saraceni","Daniele Nardi","Thomas A. Ciarfuglia"],"pdf_url":"https://arxiv.org/pdf/2404.05693v1.pdf","comment":"Accepted for publication in IEEE 2024 International Geoscience &\n Remote Sensing Symposium (IGARSS 2024)"},{"id":"http://arxiv.org/abs/2404.05687v1","updated":"2024-04-08T17:10:45Z","published":"2024-04-08T17:10:45Z","title":"Retrieval-Augmented Open-Vocabulary Object Detection","summary":" Open-vocabulary object detection (OVD) has been studied with Vision-Language\nModels (VLMs) to detect novel objects beyond the pre-trained categories.\nPrevious approaches improve the generalization ability to expand the knowledge\nof the detector, using 'positive' pseudo-labels with additional 'class' names,\ne.g., sock, iPod, and alligator. To extend the previous methods in two aspects,\nwe propose Retrieval-Augmented Losses and visual Features (RALF). Our method\nretrieves related 'negative' classes and augments loss functions. Also, visual\nfeatures are augmented with 'verbalized concepts' of classes, e.g., worn on the\nfeet, handheld music player, and sharp teeth. Specifically, RALF consists of\ntwo modules: Retrieval Augmented Losses (RAL) and Retrieval-Augmented visual\nFeatures (RAF). RAL constitutes two losses reflecting the semantic similarity\nwith negative vocabularies. In addition, RAF augments visual features with the\nverbalized concepts from a large language model (LLM). Our experiments\ndemonstrate the effectiveness of RALF on COCO and LVIS benchmark datasets. We\nachieve improvement up to 3.4 box AP$_{50}^{\\text{N}}$ on novel categories of\nthe COCO dataset and 3.6 mask AP$_{\\text{r}}$ gains on the LVIS dataset. Code\nis available at https://github.com/mlvlab/RALF .\n","authors":["Jooyeon Kim","Eulrang Cho","Sehyung Kim","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2404.05687v1.pdf","comment":"Accepted paper at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05680v1","updated":"2024-04-08T16:58:31Z","published":"2024-04-08T16:58:31Z","title":"SphereHead: Stable 3D Full-head Synthesis with Spherical Tri-plane\n Representation","summary":" While recent advances in 3D-aware Generative Adversarial Networks (GANs) have\naided the development of near-frontal view human face synthesis, the challenge\nof comprehensively synthesizing a full 3D head viewable from all angles still\npersists. Although PanoHead proves the possibilities of using a large-scale\ndataset with images of both frontal and back views for full-head synthesis, it\noften causes artifacts for back views. Based on our in-depth analysis, we found\nthe reasons are mainly twofold. First, from network architecture perspective,\nwe found each plane in the utilized tri-plane/tri-grid representation space\ntends to confuse the features from both sides, causing \"mirroring\" artifacts\n(e.g., the glasses appear in the back). 
Second, from data supervision aspect,\nwe found that existing discriminator training in 3D GANs mainly focuses on the\nquality of the rendered image itself, and does not care much about its\nplausibility with the perspective from which it was rendered. This makes it\npossible to generate \"face\" in non-frontal views, due to its easiness to fool\nthe discriminator. In response, we propose SphereHead, a novel tri-plane\nrepresentation in the spherical coordinate system that fits the human head's\ngeometric characteristics and efficiently mitigates many of the generated\nartifacts. We further introduce a view-image consistency loss for the\ndiscriminator to emphasize the correspondence of the camera parameters and the\nimages. The combination of these efforts results in visually superior outcomes\nwith significantly fewer artifacts. Our code and dataset are publicly available\nat https://lhyfst.github.io/spherehead.\n","authors":["Heyuan Li","Ce Chen","Tianhao Shi","Yuda Qiu","Sizhe An","Guanying Chen","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2404.05680v1.pdf","comment":"project page: https://lhyfst.github.io/spherehead"},{"id":"http://arxiv.org/abs/2312.07425v2","updated":"2024-04-08T16:56:17Z","published":"2023-12-12T16:48:53Z","title":"Deep Internal Learning: Deep Learning from a Single Input","summary":" Deep learning, in general, focuses on training a neural network from large\nlabeled datasets. Yet, in many cases there is value in training a network just\nfrom the input at hand. This is particularly relevant in many signal and image\nprocessing problems where training data is scarce and diversity is large on the\none hand, and on the other, there is a lot of structure in the data that can be\nexploited. Using this information is the key to deep internal-learning\nstrategies, which may involve training a network from scratch using a single\ninput or adapting an already trained network to a provided input example at\ninference time. This survey paper aims at covering deep internal-learning\ntechniques that have been proposed in the past few years for these two\nimportant directions. While our main focus will be on image processing\nproblems, most of the approaches that we survey are derived for general signals\n(vectors with recurring patterns that can be distinguished from noise) and are\ntherefore applicable to other modalities.\n","authors":["Tom Tirer","Raja Giryes","Se Young Chun","Yonina C. Eldar"],"pdf_url":"https://arxiv.org/pdf/2312.07425v2.pdf","comment":"Accepted to IEEE Signal Processing Magazine"},{"id":"http://arxiv.org/abs/2404.05675v1","updated":"2024-04-08T16:56:05Z","published":"2024-04-08T16:56:05Z","title":"Normalizing Flows on the Product Space of SO(3) Manifolds for\n Probabilistic Human Pose Modeling","summary":" Normalizing flows have proven their efficacy for density estimation in\nEuclidean space, but their application to rotational representations, crucial\nin various domains such as robotics or human pose modeling, remains\nunderexplored. Probabilistic models of the human pose can benefit from\napproaches that rigorously consider the rotational nature of human joints. For\nthis purpose, we introduce HuProSO3, a normalizing flow model that operates on\na high-dimensional product space of SO(3) manifolds, modeling the joint\ndistribution for human joints with three degrees of freedom. 
HuProSO3's\nadvantage over state-of-the-art approaches is demonstrated through its superior\nmodeling accuracy in three different applications and its capability to\nevaluate the exact likelihood. This work not only addresses the technical\nchallenge of learning densities on SO(3) manifolds, but it also has broader\nimplications for domains where the probabilistic regression of correlated 3D\nrotations is of importance.\n","authors":["Olaf Dünkel","Tim Salzmann","Florian Pfaff"],"pdf_url":"https://arxiv.org/pdf/2404.05675v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05674v1","updated":"2024-04-08T16:55:49Z","published":"2024-04-08T16:55:49Z","title":"MoMA: Multimodal LLM Adapter for Fast Personalized Image Generation","summary":" In this paper, we present MoMA: an open-vocabulary, training-free\npersonalized image model that boasts flexible zero-shot capabilities. As\nfoundational text-to-image models rapidly evolve, the demand for robust\nimage-to-image translation grows. Addressing this need, MoMA specializes in\nsubject-driven personalized image generation. Utilizing an open-source,\nMultimodal Large Language Model (MLLM), we train MoMA to serve a dual role as\nboth a feature extractor and a generator. This approach effectively synergizes\nreference image and text prompt information to produce valuable image features,\nfacilitating an image diffusion model. To better leverage the generated\nfeatures, we further introduce a novel self-attention shortcut method that\nefficiently transfers image features to an image diffusion model, improving the\nresemblance of the target object in generated images. Remarkably, as a\ntuning-free plug-and-play module, our model requires only a single reference\nimage and outperforms existing methods in generating images with high detail\nfidelity, enhanced identity-preservation and prompt faithfulness. Our work is\nopen-source, thereby providing universal access to these advancements.\n","authors":["Kunpeng Song","Yizhe Zhu","Bingchen Liu","Qing Yan","Ahmed Elgammal","Xiao Yang"],"pdf_url":"https://arxiv.org/pdf/2404.05674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05673v1","updated":"2024-04-08T16:55:39Z","published":"2024-04-08T16:55:39Z","title":"CoReS: Orchestrating the Dance of Reasoning and Segmentation","summary":" The reasoning segmentation task, which demands a nuanced comprehension of\nintricate queries to accurately pinpoint object regions, is attracting\nincreasing attention. However, Multi-modal Large Language Models (MLLM) often\nfind it difficult to accurately localize the objects described in complex\nreasoning contexts. We believe that the act of reasoning segmentation should\nmirror the cognitive stages of human visual search, where each step is a\nprogressive refinement of thought toward the final object. Thus we introduce\nthe Chains of Reasoning and Segmenting (CoReS) and find this top-down visual\nhierarchy indeed enhances the visual search process. Specifically, we propose a\ndual-chain structure that generates multi-modal, chain-like outputs to aid the\nsegmentation process. Furthermore, to steer the MLLM's outputs into this\nintended hierarchy, we incorporate in-context inputs as guidance. Extensive\nexperiments demonstrate the superior performance of our CoReS, which surpasses\nthe state-of-the-art method by 7.1\\% on the ReasonSeg dataset. 
The code will be\nreleased at https://github.com/baoxiaoyi/CoReS.\n","authors":["Xiaoyi Bao","Siyang Sun","Shuailei Ma","Kecheng Zheng","Yuxin Guo","Guosheng Zhao","Yun Zheng","Xingang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05673v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05669v1","updated":"2024-04-08T16:52:21Z","published":"2024-04-08T16:52:21Z","title":"NAF-DPM: A Nonlinear Activation-Free Diffusion Probabilistic Model for\n Document Enhancement","summary":" Real-world documents may suffer various forms of degradation, often resulting\nin lower accuracy in optical character recognition (OCR) systems. Therefore, a\ncrucial preprocessing step is essential to eliminate noise while preserving\ntext and key features of documents. In this paper, we propose NAF-DPM, a novel\ngenerative framework based on a diffusion probabilistic model (DPM) designed to\nrestore the original quality of degraded documents. While DPMs are recognized\nfor their high-quality generated images, they are also known for their large\ninference time. To mitigate this problem we provide the DPM with an efficient\nnonlinear activation-free (NAF) network and we employ as a sampler a fast\nsolver of ordinary differential equations, which can converge in a few\niterations. To better preserve text characters, we introduce an additional\ndifferentiable module based on convolutional recurrent neural networks,\nsimulating the behavior of an OCR system during training. Experiments conducted\non various datasets showcase the superiority of our approach, achieving\nstate-of-the-art performance in terms of pixel-level and perceptual similarity\nmetrics. Furthermore, the results demonstrate a notable character error\nreduction made by OCR systems when transcribing real-world document images\nenhanced by our framework. Code and pre-trained models are available at\nhttps://github.com/ispamm/NAF-DPM.\n","authors":["Giordano Cicchetti","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2404.05669v1.pdf","comment":"Under review at IEEE Transactions on Pattern Analysis and Machine\n Intelligence"},{"id":"http://arxiv.org/abs/2404.05667v1","updated":"2024-04-08T16:51:33Z","published":"2024-04-08T16:51:33Z","title":"AlignZeg: Mitigating Objective Misalignment for Zero-shot Semantic\n Segmentation","summary":" A serious issue that harms the performance of zero-shot visual recognition is\nnamed objective misalignment, i.e., the learning objective prioritizes\nimproving the recognition accuracy of seen classes rather than unseen classes,\nwhile the latter is the true target to pursue. This issue becomes more\nsignificant in zero-shot image segmentation because the stronger (i.e.,\npixel-level) supervision brings a larger gap between seen and unseen classes.\nTo mitigate it, we propose a novel architecture named AlignZeg, which embodies\na comprehensive improvement of the segmentation pipeline, including proposal\nextraction, classification, and correction, to better fit the goal of zero-shot\nsegmentation. (1) Mutually-Refined Proposal Extraction. AlignZeg harnesses a\nmutual interaction between mask queries and visual features, facilitating\ndetailed class-agnostic mask proposal extraction. (2) Generalization-Enhanced\nProposal Classification. AlignZeg introduces synthetic data and incorporates\nmultiple background prototypes to allocate a more generalizable feature space.\n(3) Predictive Bias Correction. 
During the inference stage, AlignZeg uses a\nclass indicator to find potential unseen class proposals followed by a\nprediction postprocess to correct the prediction bias. Experiments demonstrate\nthat AlignZeg markedly enhances zero-shot semantic segmentation, as shown by an\naverage 3.8% increase in hIoU, primarily attributed to a 7.1% improvement in\nidentifying unseen classes, and we further validate that the improvement comes\nfrom alleviating the objective misalignment issue.\n","authors":["Jiannan Ge","Lingxi Xie","Hongtao Xie","Pandeng Li","Xiaopeng Zhang","Yongdong Zhang","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2404.05667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05666v1","updated":"2024-04-08T16:51:19Z","published":"2024-04-08T16:51:19Z","title":"YaART: Yet Another ART Rendering Technology","summary":" In the rapidly progressing field of generative models, the development of\nefficient and high-fidelity text-to-image diffusion systems represents a\nsignificant frontier. This study introduces YaART, a novel production-grade\ntext-to-image cascaded diffusion model aligned to human preferences using\nReinforcement Learning from Human Feedback (RLHF). During the development of\nYaART, we especially focus on the choices of the model and training dataset\nsizes, the aspects that were not systematically investigated for text-to-image\ncascaded diffusion models before. In particular, we comprehensively analyze how\nthese choices affect both the efficiency of the training process and the\nquality of the generated images, which are highly important in practice.\nFurthermore, we demonstrate that models trained on smaller datasets of\nhigher-quality images can successfully compete with those trained on larger\ndatasets, establishing a more efficient scenario of diffusion models training.\nFrom the quality perspective, YaART is consistently preferred by users over\nmany existing state-of-the-art models.\n","authors":["Sergey Kastryulin","Artem Konev","Alexander Shishenya","Eugene Lyapustin","Artem Khurshudov","Alexander Tselousov","Nikita Vinokurov","Denis Kuznedelev","Alexander Markovich","Grigoriy Livshits","Alexey Kirillov","Anastasiia Tabisheva","Liubov Chubarova","Marina Kaminskaia","Alexander Ustyuzhanin","Artemii Shvetsov","Daniil Shlenskii","Valerii Startsev","Dmitrii Kornilov","Mikhail Romanov","Artem Babenko","Sergei Ovcharenko","Valentin Khrulkov"],"pdf_url":"https://arxiv.org/pdf/2404.05666v1.pdf","comment":"Prompts and additional information are available on the project page,\n see https://ya.ru/ai/art/paper-yaart-v1"},{"id":"http://arxiv.org/abs/2404.05662v1","updated":"2024-04-08T16:46:25Z","published":"2024-04-08T16:46:25Z","title":"BinaryDM: Towards Accurate Binarization of Diffusion Model","summary":" With the advancement of diffusion models (DMs) and the substantially\nincreased computational requirements, quantization emerges as a practical\nsolution to obtain compact and efficient low-bit DMs. However, the highly\ndiscrete representation leads to severe accuracy degradation, hindering the\nquantization of diffusion models to ultra-low bit-widths. In this paper, we\npropose BinaryDM, a novel accurate quantization-aware training approach to push\nthe weights of diffusion models towards the limit of 1-bit. Firstly, we present\na Learnable Multi-basis Binarizer (LMB) to recover the representations\ngenerated by the binarized DM, which improves the information in details of\nrepresentations crucial to the DM. 
Secondly, a Low-rank Representation\nMimicking (LRM) is applied to enhance the binarization-aware optimization of\nthe DM, alleviating the optimization direction ambiguity caused by fine-grained\nalignment. Moreover, a progressive initialization strategy is applied to\ntraining DMs to avoid convergence difficulties. Comprehensive experiments\ndemonstrate that BinaryDM achieves significant accuracy and efficiency gains\ncompared to SOTA quantization methods of DMs under ultra-low bit-widths. As the\nfirst binarization method for diffusion models, BinaryDM achieves impressive\n16.0 times FLOPs and 27.1 times storage savings with 1-bit weight and 4-bit\nactivation, showcasing its substantial advantages and potential for deploying\nDMs on resource-limited scenarios.\n","authors":["Xingyu Zheng","Haotong Qin","Xudong Ma","Mingyuan Zhang","Haojie Hao","Jiakai Wang","Zixiang Zhao","Jinyang Guo","Xianglong Liu"],"pdf_url":"https://arxiv.org/pdf/2404.05662v1.pdf","comment":"The code will soon be available at\n https://github.com/Xingyu-Zheng/BinaryDM"},{"id":"http://arxiv.org/abs/2404.05661v1","updated":"2024-04-08T16:46:07Z","published":"2024-04-08T16:46:07Z","title":"Automatic Controllable Colorization via Imagination","summary":" We propose a framework for automatic colorization that allows for iterative\nediting and modifications. The core of our framework lies in an imagination\nmodule: by understanding the content within a grayscale image, we utilize a\npre-trained image generation model to generate multiple images that contain the\nsame content. These images serve as references for coloring, mimicking the\nprocess of human experts. As the synthesized images can be imperfect or\ndifferent from the original grayscale image, we propose a Reference Refinement\nModule to select the optimal reference composition. Unlike most previous\nend-to-end automatic colorization algorithms, our framework allows for\niterative and localized modifications of the colorization results because we\nexplicitly model the coloring samples. Extensive experiments demonstrate the\nsuperiority of our framework over existing automatic colorization algorithms in\neditability and flexibility. Project page:\nhttps://xy-cong.github.io/imagine-colorization.\n","authors":["Xiaoyan Cong","Yue Wu","Qifeng Chen","Chenyang Lei"],"pdf_url":"https://arxiv.org/pdf/2404.05661v1.pdf","comment":"CVPR 2024. Project page:\n https://xy-cong.github.io/imagine-colorization"},{"id":"http://arxiv.org/abs/2404.05657v1","updated":"2024-04-08T16:40:15Z","published":"2024-04-08T16:40:15Z","title":"MLP Can Be A Good Transformer Learner","summary":" Self-attention mechanism is the key of the Transformer but often criticized\nfor its computation demands. Previous token pruning works motivate their\nmethods from the view of computation redundancy but still need to load the full\nnetwork and require same memory costs. This paper introduces a novel strategy\nthat simplifies vision transformers and reduces computational load through the\nselective removal of non-essential attention layers, guided by entropy\nconsiderations. We identify that regarding the attention layer in bottom\nblocks, their subsequent MLP layers, i.e. two feed-forward layers, can elicit\nthe same entropy quantity. Meanwhile, the accompanied MLPs are under-exploited\nsince they exhibit smaller feature entropy compared to those MLPs in the top\nblocks. 
Therefore, we propose to integrate the uninformative attention layers\ninto their subsequent counterparts by degenerating them into identical mapping,\nyielding only MLP in certain transformer blocks. Experimental results on\nImageNet-1k show that the proposed method can remove 40% attention layer of\nDeiT-B, improving throughput and memory bound without performance compromise.\nCode is available at https://github.com/sihaoevery/lambda_vit.\n","authors":["Sihao Lin","Pumeng Lyu","Dongrui Liu","Tao Tang","Xiaodan Liang","Andy Song","Xiaojun Chang"],"pdf_url":"https://arxiv.org/pdf/2404.05657v1.pdf","comment":"efficient transformer"},{"id":"http://arxiv.org/abs/2404.05641v1","updated":"2024-04-08T16:21:22Z","published":"2024-04-08T16:21:22Z","title":"3D-COCO: extension of MS-COCO dataset for image detection and 3D\n reconstruction modules","summary":" We introduce 3D-COCO, an extension of the original MS-COCO dataset providing\n3D models and 2D-3D alignment annotations. 3D-COCO was designed to achieve\ncomputer vision tasks such as 3D reconstruction or image detection configurable\nwith textual, 2D image, and 3D CAD model queries. We complete the existing\nMS-COCO dataset with 28K 3D models collected on ShapeNet and Objaverse. By\nusing an IoU-based method, we match each MS-COCO annotation with the best 3D\nmodels to provide a 2D-3D alignment. The open-source nature of 3D-COCO is a\npremiere that should pave the way for new research on 3D-related topics. The\ndataset and its source codes is available at\nhttps://kalisteo.cea.fr/index.php/coco3d-object-detection-and-reconstruction/\n","authors":["Maxence Bideaux","Alice Phe","Mohamed Chaouch","Bertrand Luvison","Quoc-Cuong Pham"],"pdf_url":"https://arxiv.org/pdf/2404.05641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06908v2","updated":"2024-04-08T16:16:56Z","published":"2024-03-11T17:00:27Z","title":"FreGS: 3D Gaussian Splatting with Progressive Frequency Regularization","summary":" 3D Gaussian splatting has achieved very impressive performance in real-time\nnovel view synthesis. However, it often suffers from over-reconstruction during\nGaussian densification where high-variance image regions are covered by a few\nlarge Gaussians only, leading to blur and artifacts in the rendered images. We\ndesign a progressive frequency regularization (FreGS) technique to tackle the\nover-reconstruction issue within the frequency space. Specifically, FreGS\nperforms coarse-to-fine Gaussian densification by exploiting low-to-high\nfrequency components that can be easily extracted with low-pass and high-pass\nfilters in the Fourier space. By minimizing the discrepancy between the\nfrequency spectrum of the rendered image and the corresponding ground truth, it\nachieves high-quality Gaussian densification and alleviates the\nover-reconstruction of Gaussian splatting effectively. Experiments over\nmultiple widely adopted benchmarks (e.g., Mip-NeRF360, Tanks-and-Temples and\nDeep Blending) show that FreGS achieves superior novel view synthesis and\noutperforms the state-of-the-art consistently.\n","authors":["Jiahui Zhang","Fangneng Zhan","Muyu Xu","Shijian Lu","Eric Xing"],"pdf_url":"https://arxiv.org/pdf/2403.06908v2.pdf","comment":"Accepted by CVPR 2024. 
Project website:\n https://rogeraigc.github.io/FreGS-Page/"},{"id":"http://arxiv.org/abs/2403.15238v2","updated":"2024-04-08T16:14:45Z","published":"2024-03-22T14:32:02Z","title":"WEEP: A method for spatial interpretation of weakly supervised CNN\n models in computational pathology","summary":" Deep learning enables the modelling of high-resolution histopathology\nwhole-slide images (WSI). Weakly supervised learning of tile-level data is\ntypically applied for tasks where labels only exist on the patient or WSI level\n(e.g. patient outcomes or histological grading). In this context, there is a\nneed for improved spatial interpretability of predictions from such models. We\npropose a novel method, Wsi rEgion sElection aPproach (WEEP), for model\ninterpretation. It provides a principled yet straightforward way to establish\nthe spatial area of WSI required for assigning a particular prediction label.\nWe demonstrate WEEP on a binary classification task in the area of breast\ncancer computational pathology. WEEP is easy to implement, is directly\nconnected to the model-based decision process, and offers information relevant\nto both research and diagnostic applications.\n","authors":["Abhinav Sharma","Bojing Liu","Mattias Rantalainen"],"pdf_url":"https://arxiv.org/pdf/2403.15238v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05626v1","updated":"2024-04-08T15:59:29Z","published":"2024-04-08T15:59:29Z","title":"Learning a Category-level Object Pose Estimator without Pose Annotations","summary":" 3D object pose estimation is a challenging task. Previous works always\nrequire thousands of object images with annotated poses for learning the 3D\npose correspondence, which is laborious and time-consuming for labeling. In\nthis paper, we propose to learn a category-level 3D object pose estimator\nwithout pose annotations. Instead of using manually annotated images, we\nleverage diffusion models (e.g., Zero-1-to-3) to generate a set of images under\ncontrolled pose differences and propose to learn our object pose estimator with\nthose images. Directly using the original diffusion model leads to images with\nnoisy poses and artifacts. To tackle this issue, firstly, we exploit an image\nencoder, which is learned from a specially designed contrastive pose learning,\nto filter the unreasonable details and extract image feature maps.\nAdditionally, we propose a novel learning strategy that allows the model to\nlearn object poses from those generated image sets without knowing the\nalignment of their canonical poses. Experimental results show that our method\nhas the capability of category-level object pose estimation from a single shot\nsetting (as pose definition), while significantly outperforming other\nstate-of-the-art methods on the few-shot category-level object pose estimation\nbenchmarks.\n","authors":["Fengrui Tian","Yaoyao Liu","Adam Kortylewski","Yueqi Duan","Shaoyi Du","Alan Yuille","Angtian Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05321v3","updated":"2024-04-08T15:59:11Z","published":"2022-09-12T15:26:13Z","title":"Deep Feature Statistics Mapping for Generalized Screen Content Image\n Quality Assessment","summary":" The statistical regularities of natural images, referred to as natural scene\nstatistics, play an important role in no-reference image quality assessment.\nHowever, it has been widely acknowledged that screen content images (SCIs),\nwhich are typically computer generated, do not hold such statistics. 
Here we\nmake the first attempt to learn the statistics of SCIs, based upon which the\nquality of SCIs can be effectively determined. The underlying mechanism of the\nproposed approach is based upon the mild assumption that the SCIs, which are\nnot physically acquired, still obey certain statistics that could be understood\nin a learning fashion. We empirically show that the statistics deviation could\nbe effectively leveraged in quality assessment, and the proposed method is\nsuperior when evaluated in different settings. Extensive experimental results\ndemonstrate the Deep Feature Statistics based SCI Quality Assessment (DFSS-IQA)\nmodel delivers promising performance compared with existing NR-IQA models and\nshows a high generalization capability in the cross-dataset settings. The\nimplementation of our method is publicly available at\nhttps://github.com/Baoliang93/DFSS-IQA.\n","authors":["Baoliang Chen","Hanwei Zhu","Lingyu Zhu","Shiqi Wang","Sam Kwong"],"pdf_url":"https://arxiv.org/pdf/2209.05321v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.14466v2","updated":"2024-04-08T15:51:37Z","published":"2022-07-29T04:10:22Z","title":"Towards Domain-agnostic Depth Completion","summary":" Existing depth completion methods are often targeted at a specific sparse\ndepth type and generalize poorly across task domains. We present a method to\ncomplete sparse/semi-dense, noisy, and potentially low-resolution depth maps\nobtained by various range sensors, including those in modern mobile phones, or\nby multi-view reconstruction algorithms. Our method leverages a data-driven\nprior in the form of a single image depth prediction network trained on\nlarge-scale datasets, the output of which is used as an input to our model. We\npropose an effective training scheme where we simulate various sparsity\npatterns in typical task domains. In addition, we design two new benchmarks to\nevaluate the generalizability and the robustness of depth completion methods.\nOur simple method shows superior cross-domain generalization ability against\nstate-of-the-art depth completion methods, introducing a practical solution to\nhigh-quality depth capture on a mobile device. The code is available at:\nhttps://github.com/YvanYin/FillDepth.\n","authors":["Guangkai Xu","Wei Yin","Jianming Zhang","Oliver Wang","Simon Niklaus","Simon Chen","Jia-Wang Bian"],"pdf_url":"https://arxiv.org/pdf/2207.14466v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05621v1","updated":"2024-04-08T15:51:21Z","published":"2024-04-08T15:51:21Z","title":"MULTIFLOW: Shifting Towards Task-Agnostic Vision-Language Pruning","summary":" While excellent in transfer learning, Vision-Language models (VLMs) come with\nhigh computational costs due to their large number of parameters. To address\nthis issue, removing parameters via model pruning is a viable solution.\nHowever, existing techniques for VLMs are task-specific, and thus require\npruning the network from scratch for each new task of interest. In this work,\nwe explore a new direction: Task-Agnostic Vision-Language Pruning (TA-VLP).\nGiven a pretrained VLM, the goal is to find a unique pruned counterpart\ntransferable to multiple unknown downstream tasks. In this challenging setting,\nthe transferable representations already encoded in the pretrained model are a\nkey aspect to preserve. 
Thus, we propose Multimodal Flow Pruning (MULTIFLOW), a\nfirst, gradient-free, pruning framework for TA-VLP where: (i) the importance of\na parameter is expressed in terms of its magnitude and its information flow, by\nincorporating the saliency of the neurons it connects; and (ii) pruning is\ndriven by the emergent (multimodal) distribution of the VLM parameters after\npretraining. We benchmark eight state-of-the-art pruning algorithms in the\ncontext of TA-VLP, experimenting with two VLMs, three vision-language tasks,\nand three pruning ratios. Our experimental results show that MULTIFLOW\noutperforms recent sophisticated, combinatorial competitors in the vast\nmajority of the cases, paving the way towards addressing TA-VLP. The code is\npublicly available at https://github.com/FarinaMatteo/multiflow.\n","authors":["Matteo Farina","Massimiliano Mancini","Elia Cunegatti","Gaowen Liu","Giovanni Iacca","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2404.05621v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2207.12080v4","updated":"2024-04-08T15:50:13Z","published":"2022-07-25T11:57:01Z","title":"Intention-Conditioned Long-Term Human Egocentric Action Forecasting","summary":" To anticipate how a human would act in the future, it is essential to\nunderstand the human intention since it guides the human towards a certain\ngoal. In this paper, we propose a hierarchical architecture which assumes a\nsequence of human action (low-level) can be driven from the human intention\n(high-level). Based on this, we deal with Long-Term Action Anticipation task in\negocentric videos. Our framework first extracts two level of human information\nover the N observed videos human actions through a Hierarchical Multi-task MLP\nMixer (H3M). Then, we condition the uncertainty of the future through an\nIntention-Conditioned Variational Auto-Encoder (I-CVAE) that generates K stable\npredictions of the next Z=20 actions that the observed human might perform. By\nleveraging human intention as high-level information, we claim that our model\nis able to anticipate more time-consistent actions in the long-term, thus\nimproving the results over baseline methods in EGO4D Challenge. This work\nranked first in both CVPR@2022 and ECVV@2022 EGO4D LTA Challenge by providing\nmore plausible anticipated sequences, improving the anticipation of nouns and\noverall actions. Webpage: https://evm7.github.io/icvae-page/\n","authors":["Esteve Valls Mascaro","Hyemin Ahn","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2207.12080v4.pdf","comment":"Winner of CVPR@2022 and ECCV@2022 EGO4D LTA Challenge. Accepted in\n WACV2023. Webpage: https://evm7.github.io/icvae-page/"},{"id":"http://arxiv.org/abs/2302.08274v3","updated":"2024-04-08T15:48:50Z","published":"2023-02-16T13:06:39Z","title":"Robust Human Motion Forecasting using Transformer-based Model","summary":" Comprehending human motion is a fundamental challenge for developing\nHuman-Robot Collaborative applications. Computer vision researchers have\naddressed this field by only focusing on reducing error in predictions, but not\ntaking into account the requirements to facilitate its implementation in\nrobots. In this paper, we propose a new model based on Transformer that\nsimultaneously deals with the real time 3D human motion forecasting in the\nshort and long term. 
Our 2-Channel Transformer (2CH-TR) is able to efficiently\nexploit the spatio-temporal information of a shortly observed sequence (400ms)\nand generates a competitive accuracy against the current state-of-the-art.\n2CH-TR stands out for the efficient performance of the Transformer, being\nlighter and faster than its competitors. In addition, our model is tested in\nconditions where the human motion is severely occluded, demonstrating its\nrobustness in reconstructing and predicting 3D human motion in a highly noisy\nenvironment. Our experiment results show that the proposed 2CH-TR outperforms\nthe ST-Transformer, which is another state-of-the-art model based on the\nTransformer, in terms of reconstruction and prediction under the same\nconditions of input prefix. Our model reduces in 8.89% the mean squared error\nof ST-Transformer in short-term prediction, and 2.57% in long-term prediction\nin Human3.6M dataset with 400ms input prefix. Webpage:\nhttps://evm7.github.io/2CHTR-page/\n","authors":["Esteve Valls Mascaro","Shuo Ma","Hyemin Ahn","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2302.08274v3.pdf","comment":"Accepted to IROS2022. Webpage: https://evm7.github.io/2CHTR-page/"},{"id":"http://arxiv.org/abs/2308.07301v2","updated":"2024-04-08T15:47:20Z","published":"2023-08-14T17:39:44Z","title":"A Unified Masked Autoencoder with Patchified Skeletons for Motion\n Synthesis","summary":" The synthesis of human motion has traditionally been addressed through\ntask-dependent models that focus on specific challenges, such as predicting\nfuture motions or filling in intermediate poses conditioned on known key-poses.\nIn this paper, we present a novel task-independent model called UNIMASK-M,\nwhich can effectively address these challenges using a unified architecture.\nOur model obtains comparable or better performance than the state-of-the-art in\neach field. Inspired by Vision Transformers (ViTs), our UNIMASK-M model\ndecomposes a human pose into body parts to leverage the spatio-temporal\nrelationships existing in human motion. Moreover, we reformulate various\npose-conditioned motion synthesis tasks as a reconstruction problem with\ndifferent masking patterns given as input. By explicitly informing our model\nabout the masked joints, our UNIMASK-M becomes more robust to occlusions.\nExperimental results show that our model successfully forecasts human motion on\nthe Human3.6M dataset. Moreover, it achieves state-of-the-art results in motion\ninbetweening on the LaFAN1 dataset, particularly in long transition periods.\nMore information can be found on the project website\nhttps://evm7.github.io/UNIMASKM-page/\n","authors":["Esteve Valls Mascaro","Hyemin Ahn","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2308.07301v2.pdf","comment":"Accepted to AAAI2024. Webpage: https://evm7.github.io/UNIMASKM-page/"},{"id":"http://arxiv.org/abs/2309.16524v2","updated":"2024-04-08T15:46:09Z","published":"2023-09-28T15:34:49Z","title":"HOI4ABOT: Human-Object Interaction Anticipation for Human Intention\n Reading Collaborative roBOTs","summary":" Robots are becoming increasingly integrated into our lives, assisting us in\nvarious tasks. To ensure effective collaboration between humans and robots, it\nis essential that they understand our intentions and anticipate our actions. In\nthis paper, we propose a Human-Object Interaction (HOI) anticipation framework\nfor collaborative robots. We propose an efficient and robust transformer-based\nmodel to detect and anticipate HOIs from videos. 
This enhanced anticipation\nempowers robots to proactively assist humans, resulting in more efficient and\nintuitive collaborations. Our model outperforms state-of-the-art results in HOI\ndetection and anticipation in VidHOI dataset with an increase of 1.76% and\n1.04% in mAP respectively while being 15.4 times faster. We showcase the\neffectiveness of our approach through experimental results in a real robot,\ndemonstrating that the robot's ability to anticipate HOIs is key for better\nHuman-Robot Interaction. More information can be found on our project webpage:\nhttps://evm7.github.io/HOI4ABOT_page/\n","authors":["Esteve Valls Mascaro","Daniel Sliwowski","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2309.16524v2.pdf","comment":"Proceedings in Conference on Robot Learning 2023. Webpage:\n https://evm7.github.io/HOI4ABOT_page/"},{"id":"http://arxiv.org/abs/2402.04768v2","updated":"2024-04-08T15:43:14Z","published":"2024-02-07T11:37:14Z","title":"Robot Interaction Behavior Generation based on Social Motion Forecasting\n for Human-Robot Interaction","summary":" Integrating robots into populated environments is a complex challenge that\nrequires an understanding of human social dynamics. In this work, we propose to\nmodel social motion forecasting in a shared human-robot representation space,\nwhich facilitates us to synthesize robot motions that interact with humans in\nsocial scenarios despite not observing any robot in the motion training. We\ndevelop a transformer-based architecture called ECHO, which operates in the\naforementioned shared space to predict the future motions of the agents\nencountered in social scenarios. Contrary to prior works, we reformulate the\nsocial motion problem as the refinement of the predicted individual motions\nbased on the surrounding agents, which facilitates the training while allowing\nfor single-motion forecasting when only one human is in the scene. We evaluate\nour model in multi-person and human-robot motion forecasting tasks and obtain\nstate-of-the-art performance by a large margin while being efficient and\nperforming in real-time. Additionally, our qualitative results showcase the\neffectiveness of our approach in generating human-robot interaction behaviors\nthat can be controlled via text commands. Webpage: https://evm7.github.io/ECHO/\n","authors":["Esteve Valls Mascaro","Yashuai Yan","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2402.04768v2.pdf","comment":"Accepted at ICRA 2024. Webpage: https://evm7.github.io/ECHO/"},{"id":"http://arxiv.org/abs/2404.05607v1","updated":"2024-04-08T15:29:46Z","published":"2024-04-08T15:29:46Z","title":"A Training-Free Plug-and-Play Watermark Framework for Stable Diffusion","summary":" Nowadays, the family of Stable Diffusion (SD) models has gained prominence\nfor its high quality outputs and scalability. This has also raised security\nconcerns on social media, as malicious users can create and disseminate harmful\ncontent. Existing approaches involve training components or entire SDs to embed\na watermark in generated images for traceability and responsibility\nattribution. However, in the era of AI-generated content (AIGC), the rapid\niteration of SDs renders retraining with watermark models costly. To address\nthis, we propose a training-free plug-and-play watermark framework for SDs.\nWithout modifying any components of SDs, we embed diverse watermarks in the\nlatent space, adapting to the denoising process. 
Our experimental findings\nreveal that our method effectively harmonizes image quality and watermark\ninvisibility. Furthermore, it performs robustly under various attacks. We also\nhave validated that our method is generalized to multiple versions of SDs, even\nwithout retraining the watermark model.\n","authors":["Guokai Zhang","Lanjun Wang","Yuting Su","An-An Liu"],"pdf_url":"https://arxiv.org/pdf/2404.05607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05606v1","updated":"2024-04-08T15:25:50Z","published":"2024-04-08T15:25:50Z","title":"Learning Topology Uniformed Face Mesh by Volume Rendering for Multi-view\n Reconstruction","summary":" Face meshes in consistent topology serve as the foundation for many\nface-related applications, such as 3DMM constrained face reconstruction and\nexpression retargeting. Traditional methods commonly acquire topology uniformed\nface meshes by two separate steps: multi-view stereo (MVS) to reconstruct\nshapes followed by non-rigid registration to align topology, but struggles with\nhandling noise and non-lambertian surfaces. Recently neural volume rendering\ntechniques have been rapidly evolved and shown great advantages in 3D\nreconstruction or novel view synthesis. Our goal is to leverage the superiority\nof neural volume rendering into multi-view reconstruction of face mesh with\nconsistent topology. We propose a mesh volume rendering method that enables\ndirectly optimizing mesh geometry while preserving topology, and learning\nimplicit features to model complex facial appearance from multi-view images.\nThe key innovation lies in spreading sparse mesh features into the surrounding\nspace to simulate radiance field required for volume rendering, which\nfacilitates backpropagation of gradients from images to mesh geometry and\nimplicit appearance features. Our proposed feature spreading module exhibits\ndeformation invariance, enabling photorealistic rendering seamlessly after mesh\nediting. We conduct experiments on multi-view face image dataset to evaluate\nthe reconstruction and implement an application for photorealistic rendering of\nanimated face mesh.\n","authors":["Yating Wang","Ran Yi","Ke Fan","Jinkun Hao","Jiangbo Lu","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2404.05606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05603v1","updated":"2024-04-08T15:22:38Z","published":"2024-04-08T15:22:38Z","title":"Self-Explainable Affordance Learning with Embodied Caption","summary":" In the field of visual affordance learning, previous methods mainly used\nabundant images or videos that delineate human behavior patterns to identify\naction possibility regions for object manipulation, with a variety of\napplications in robotic tasks. However, they encounter a main challenge of\naction ambiguity, illustrated by the vagueness like whether to beat or carry a\ndrum, and the complexities involved in processing intricate scenes. Moreover,\nit is important for human intervention to rectify robot errors in time. To\naddress these issues, we introduce Self-Explainable Affordance learning (SEA)\nwith embodied caption. This innovation enables robots to articulate their\nintentions and bridge the gap between explainable vision-language caption and\nvisual affordance learning. Due to a lack of appropriate dataset, we unveil a\npioneering dataset and metrics tailored for this task, which integrates images,\nheatmaps, and embodied captions. 
Furthermore, we propose a novel model to\neffectively combine affordance grounding with self-explanation in a simple but\nefficient manner. Extensive quantitative and qualitative experiments\ndemonstrate our method's effectiveness.\n","authors":["Zhipeng Zhang","Zhimin Wei","Guolei Sun","Peng Wang","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2404.05603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00722v3","updated":"2024-04-08T15:15:56Z","published":"2024-03-31T15:34:45Z","title":"DRCT: Saving Image Super-resolution away from Information Bottleneck","summary":" In recent years, Vision Transformer-based applications to low-level vision\ntasks have achieved widespread success. Unlike CNN-based models, Transformers\nare more adept at capturing long-range dependencies, enabling the\nreconstruction of images utilizing information from non-local areas. In the\ndomain of super-resolution, Swin-transformer-based approaches have become\nmainstream due to their capacity to capture global spatial information and\ntheir shifting-window attention mechanism that facilitates the interchange of\ninformation between different windows. Many researchers have enhanced image\nquality and network efficiency by expanding the receptive field or designing\ncomplex networks, yielding commendable results. However, we observed that\nspatial information tends to diminish during the forward propagation process\ndue to increased depth, leading to a loss of spatial information and,\nconsequently, limiting the model's potential. To address this, we propose the\nDense-residual-connected Transformer (DRCT), aimed at mitigating the loss of\nspatial information through dense-residual connections between layers, thereby\nunleashing the model's potential and enhancing performance. Experiment results\nindicate that our approach is not only straightforward but also achieves\nremarkable efficiency, surpassing state-of-the-art methods and performing\ncommendably at NTIRE2024.\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Yi-Shiuan Chou"],"pdf_url":"https://arxiv.org/pdf/2404.00722v3.pdf","comment":"NTIRE 2024 Image Super-resolution (x4)"},{"id":"http://arxiv.org/abs/2404.05595v1","updated":"2024-04-08T15:14:20Z","published":"2024-04-08T15:14:20Z","title":"UniFL: Improve Stable Diffusion via Unified Feedback Learning","summary":" Diffusion models have revolutionized the field of image generation, leading\nto the proliferation of high-quality models and diverse downstream\napplications. However, despite these significant advancements, the current\ncompetitive solutions still suffer from several limitations, including inferior\nvisual quality, a lack of aesthetic appeal, and inefficient inference, without\na comprehensive solution in sight. To address these challenges, we present\nUniFL, a unified framework that leverages feedback learning to enhance\ndiffusion models comprehensively. UniFL stands out as a universal, effective,\nand generalizable solution applicable to various diffusion models, such as\nSD1.5 and SDXL. Notably, UniFL incorporates three key components: perceptual\nfeedback learning, which enhances visual quality; decoupled feedback learning,\nwhich improves aesthetic appeal; and adversarial feedback learning, which\noptimizes inference speed. In-depth experiments and extensive user studies\nvalidate the superior performance of our proposed method in enhancing both the\nquality of generated models and their acceleration. 
For instance, UniFL\nsurpasses ImageReward by 17% user preference in terms of generation quality and\noutperforms LCM and SDXL Turbo by 57% and 20% in 4-step inference. Moreover, we\nhave verified the efficacy of our approach in downstream tasks, including Lora,\nControlNet, and AnimateDiff.\n","authors":["Jiacheng Zhang","Jie Wu","Yuxi Ren","Xin Xia","Huafeng Kuang","Pan Xie","Jiashi Li","Xuefeng Xiao","Weilin Huang","Min Zheng","Lean Fu","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2404.05595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05584v1","updated":"2024-04-08T14:59:53Z","published":"2024-04-08T14:59:53Z","title":"Neural Cellular Automata for Lightweight, Robust and Explainable\n Classification of White Blood Cell Images","summary":" Diagnosis of hematological malignancies depends on accurate identification of\nwhite blood cells in peripheral blood smears. Deep learning techniques are\nemerging as a viable solution to scale and optimize this process by automatic\nidentification of cells in laboratories. However, these techniques face several\nchallenges such as limited generalizability, sensitivity to domain shifts and\nlack of explainability. Here, we introduce a novel approach based on\nneural cellular automata (NCA) for white blood cell classification. We test our\napproach on three datasets of white blood cell images and show that we achieve\ncompetitive performance compared to conventional methods. Our NCA-based method\nis significantly smaller in terms of parameters and exhibits robustness to\ndomain shifts. Furthermore, the architecture is inherently explainable,\nproviding insights into the decision process for each classification, helping\nexperts understand and validate model predictions. Results demonstrate that NCA\nnot only can be used for image classification, but also addresses key\nchallenges of conventional methods, indicating a high potential for\napplicability in clinical practice.\n","authors":["Michael Deutges","Ario Sadafi","Nassir Navab","Carsten Marr"],"pdf_url":"https://arxiv.org/pdf/2404.05584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05583v1","updated":"2024-04-08T14:58:52Z","published":"2024-04-08T14:58:52Z","title":"Towards More General Video-based Deepfake Detection through Facial\n Feature Guided Adaptation for Foundation Model","summary":" With the rise of deep learning, generative models have enabled the creation\nof highly realistic synthetic images, presenting challenges due to their\npotential misuse. While research in Deepfake detection has grown rapidly in\nresponse, many detection methods struggle with unseen Deepfakes generated by\nnew synthesis techniques. To address this generalisation challenge, we propose\na novel Deepfake detection approach that adapts the rich information encoded\ninside Foundation Models, specifically using the image encoder from CLIP, which\nhas demonstrated strong zero-shot capability for downstream tasks. 
Inspired by the recent advances of parameter-efficient\nfine-tuning, we propose a novel side-network-based decoder to extract spatial\nand temporal cues from the given video clip, with the Facial Component Guidance\n(FCG) encouraging the spatial features to include features of key facial parts\nfor more robust and general Deepfake detection.\nThrough extensive cross-dataset evaluations, our approach exhibits superior\neffectiveness in identifying unseen Deepfake samples, achieving notable\nperformance improvements even with limited training samples and\nmanipulation types. Our model secures an average performance enhancement of\n0.9% AUROC in cross-dataset assessments compared with state-of-the-art\nmethods, notably establishing a significant lead with a 4.4% improvement\non the challenging DFDC dataset.\n","authors":["Yue-Hua Han","Tai-Ming Huang","Shu-Tzu Lo","Po-Han Huang","Kai-Lung Hua","Jun-Cheng Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05580v1","updated":"2024-04-08T14:56:26Z","published":"2024-04-08T14:56:26Z","title":"Responsible Visual Editing","summary":" With recent advancements in visual synthesis, there is a growing risk of\nencountering images with detrimental effects, such as hate, discrimination, or\nprivacy violations. The research on transforming harmful images into\nresponsible ones remains unexplored. In this paper, we formulate a new task,\nresponsible visual editing, which entails modifying specific concepts within an\nimage to render it more responsible while minimizing changes. However, the\nconcept that needs to be edited is often abstract, making it challenging to\nlocate what needs to be modified and plan how to modify it. To tackle these\nchallenges, we propose a Cognitive Editor (CoEditor) that harnesses a large\nmultimodal model through a two-stage cognitive process: (1) a perceptual\ncognitive process to focus on what needs to be modified and (2) a behavioral\ncognitive process to strategize how to modify. To mitigate the negative\nimplications of harmful images on research, we create a transparent and public\ndataset, AltBear, which expresses harmful information using teddy bears instead\nof humans. Experiments demonstrate that CoEditor can effectively comprehend\nabstract concepts within complex scenes and significantly surpass the\nperformance of baseline models for responsible visual editing. We find that the\nAltBear dataset corresponds well to the harmful content found in real images,\noffering a consistent experimental evaluation, thereby providing a safer\nbenchmark for future research. Moreover, CoEditor also shows great results in\ngeneral editing. We release our code and dataset at\nhttps://github.com/kodenii/Responsible-Visual-Editing.\n","authors":["Minheng Ni","Yeli Shen","Lei Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.05580v1.pdf","comment":"24 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.05579v1","updated":"2024-04-08T14:55:35Z","published":"2024-04-08T14:55:35Z","title":"Robust Data Pruning: Uncovering and Overcoming Implicit Bias","summary":" In the era of exceptionally data-hungry models, careful selection of the\ntraining data is essential to mitigate the extensive costs of deep learning.\nData pruning offers a solution by removing redundant or uninformative samples\nfrom the dataset, which yields faster convergence and improved neural scaling\nlaws. 
However, little is known about its impact on classification bias of the\ntrained models. We conduct the first systematic study of this effect and reveal\nthat existing data pruning algorithms can produce highly biased classifiers. At\nthe same time, we argue that random data pruning with appropriate class ratios\nhas potential to improve the worst-class performance. We propose a\n\"fairness-aware\" approach to pruning and empirically demonstrate its\nperformance on standard computer vision benchmarks. In sharp contrast to\nexisting algorithms, our proposed method continues improving robustness at a\ntolerable drop of average performance as we prune more from the datasets. We\npresent theoretical analysis of the classification risk in a mixture of\nGaussians to further motivate our algorithm and support our findings.\n","authors":["Artem Vysogorets","Kartik Ahuja","Julia Kempe"],"pdf_url":"https://arxiv.org/pdf/2404.05579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05578v1","updated":"2024-04-08T14:54:54Z","published":"2024-04-08T14:54:54Z","title":"Social-MAE: Social Masked Autoencoder for Multi-person Motion\n Representation Learning","summary":" For a complete comprehension of multi-person scenes, it is essential to go\nbeyond basic tasks like detection and tracking. Higher-level tasks, such as\nunderstanding the interactions and social activities among individuals, are\nalso crucial. Progress towards models that can fully understand scenes\ninvolving multiple people is hindered by a lack of sufficient annotated data\nfor such high-level tasks. To address this challenge, we introduce Social-MAE,\na simple yet effective transformer-based masked autoencoder framework for\nmulti-person human motion data. The framework uses masked modeling to pre-train\nthe encoder to reconstruct masked human joint trajectories, enabling it to\nlearn generalizable and data efficient representations of motion in human\ncrowded scenes. Social-MAE comprises a transformer as the MAE encoder and a\nlighter-weight transformer as the MAE decoder which operates on multi-person\njoints' trajectory in the frequency domain. After the reconstruction task, the\nMAE decoder is replaced with a task-specific decoder and the model is\nfine-tuned end-to-end for a variety of high-level social tasks. Our proposed\nmodel combined with our pre-training approach achieves the state-of-the-art\nresults on various high-level social tasks, including multi-person pose\nforecasting, social grouping, and social action understanding. These\nimprovements are demonstrated across four popular multi-person datasets\nencompassing both human 2D and 3D body pose.\n","authors":["Mahsa Ehsanpour","Ian Reid","Hamid Rezatofighi"],"pdf_url":"https://arxiv.org/pdf/2404.05578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16741v2","updated":"2024-04-08T14:42:15Z","published":"2024-01-30T04:39:32Z","title":"MESA: Matching Everything by Segmenting Anything","summary":" Feature matching is a crucial task in the field of computer vision, which\ninvolves finding correspondences between images. Previous studies achieve\nremarkable performance using learning-based feature comparison. However, the\npervasive presence of matching redundancy between images gives rise to\nunnecessary and error-prone computations in these methods, imposing limitations\non their accuracy. To address this issue, we propose MESA, a novel approach to\nestablish precise area (or region) matches for efficient matching redundancy\nreduction. 
MESA first leverages the advanced image understanding capability of\nSAM, a state-of-the-art foundation model for image segmentation, to obtain\nimage areas with implicit semantic. Then, a multi-relational graph is proposed\nto model the spatial structure of these areas and construct their scale\nhierarchy. Based on graphical models derived from the graph, the area matching\nis reformulated as an energy minimization task and effectively resolved.\nExtensive experiments demonstrate that MESA yields substantial precision\nimprovement for multiple point matchers in indoor and outdoor downstream tasks,\ne.g. +13.61% for DKM in indoor pose estimation.\n","authors":["Yesheng Zhang","Xu Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.16741v2.pdf","comment":"CVPR24"},{"id":"http://arxiv.org/abs/2312.01068v2","updated":"2024-04-08T14:33:12Z","published":"2023-12-02T08:34:22Z","title":"DPHMs: Diffusion Parametric Head Models for Depth-based Tracking","summary":" We introduce Diffusion Parametric Head Models (DPHMs), a generative model\nthat enables robust volumetric head reconstruction and tracking from monocular\ndepth sequences. While recent volumetric head models, such as NPHMs, can now\nexcel in representing high-fidelity head geometries, tracking and\nreconstructing heads from real-world single-view depth sequences remains very\nchallenging, as the fitting to partial and noisy observations is\nunderconstrained. To tackle these challenges, we propose a latent\ndiffusion-based prior to regularize volumetric head reconstruction and\ntracking. This prior-based regularizer effectively constrains the identity and\nexpression codes to lie on the underlying latent manifold which represents\nplausible head shapes. To evaluate the effectiveness of the diffusion-based\nprior, we collect a dataset of monocular Kinect sequences consisting of various\ncomplex facial expression motions and rapid transitions. We compare our method\nto state-of-the-art tracking methods and demonstrate improved head identity\nreconstruction as well as robust expression tracking.\n","authors":["Jiapeng Tang","Angela Dai","Yinyu Nie","Lev Markhasin","Justus Thies","Matthias Niessner"],"pdf_url":"https://arxiv.org/pdf/2312.01068v2.pdf","comment":"CVPR 2024; homepage: https://tangjiapeng.github.io/projects/DPHMs/"},{"id":"http://arxiv.org/abs/2404.05559v1","updated":"2024-04-08T14:30:42Z","published":"2024-04-08T14:30:42Z","title":"TIM: A Time Interval Machine for Audio-Visual Action Recognition","summary":" Diverse actions give rise to rich audio-visual signals in long videos. Recent\nworks showcase that the two modalities of audio and video exhibit different\ntemporal extents of events and distinct labels. We address the interplay\nbetween the two modalities in long videos by explicitly modelling the temporal\nextents of audio and visual events. We propose the Time Interval Machine (TIM)\nwhere a modality-specific time interval poses as a query to a transformer\nencoder that ingests a long video input. The encoder then attends to the\nspecified interval, as well as the surrounding context in both modalities, in\norder to recognise the ongoing action.\n We test TIM on three long audio-visual video datasets: EPIC-KITCHENS,\nPerception Test, and AVE, reporting state-of-the-art (SOTA) for recognition. On\nEPIC-KITCHENS, we beat previous SOTA that utilises LLMs and significantly\nlarger pre-training by 2.9% top-1 action recognition accuracy. 
Additionally, we\nshow that TIM can be adapted for action detection, using dense multi-scale\ninterval queries, outperforming SOTA on EPIC-KITCHENS-100 for most metrics, and\nshowing strong performance on the Perception Test. Our ablations show the\ncritical role of integrating the two modalities and modelling their time\nintervals in achieving this performance. Code and models at:\nhttps://github.com/JacobChalk/TIM\n","authors":["Jacob Chalk","Jaesung Huh","Evangelos Kazakos","Andrew Zisserman","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2404.05559v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2307.06206v2","updated":"2024-04-08T14:26:52Z","published":"2023-07-12T14:52:21Z","title":"SepVAE: a contrastive VAE to separate pathological patterns from healthy\n ones","summary":" Contrastive Analysis VAE (CA-VAEs) is a family of Variational auto-encoders\n(VAEs) that aims at separating the common factors of variation between a\nbackground dataset (BG) (i.e., healthy subjects) and a target dataset (TG)\n(i.e., patients) from the ones that only exist in the target dataset. To do so,\nthese methods separate the latent space into a set of salient features (i.e.,\nproper to the target dataset) and a set of common features (i.e., exist in both\ndatasets). Currently, all models fail to prevent the sharing of information\nbetween latent spaces effectively and to capture all salient factors of\nvariation. To this end, we introduce two crucial regularization losses: a\ndisentangling term between common and salient representations and a\nclassification term between background and target samples in the salient space.\nWe show a better performance than previous CA-VAEs methods on three medical\napplications and a natural images dataset (CelebA). Code and datasets are\navailable on GitHub https://github.com/neurospin-projects/2023_rlouiset_sepvae.\n","authors":["Robin Louiset","Edouard Duchesnay","Antoine Grigis","Benoit Dufumier","Pietro Gori"],"pdf_url":"https://arxiv.org/pdf/2307.06206v2.pdf","comment":"Workshop on Interpretable ML in Healthcare at International\n Conference on Machine Learning (ICML), Honolulu, Hawaii, USA. 2023"},{"id":"http://arxiv.org/abs/2308.16018v4","updated":"2024-04-08T14:09:27Z","published":"2023-08-30T13:20:54Z","title":"SiT-MLP: A Simple MLP with Point-wise Topology Feature Learning for\n Skeleton-based Action Recognition","summary":" Graph convolution networks (GCNs) have achieved remarkable performance in\nskeleton-based action recognition. However, previous GCN-based methods rely on\nelaborate human priors excessively and construct complex feature aggregation\nmechanisms, which limits the generalizability and effectiveness of networks. To\nsolve these problems, we propose a novel Spatial Topology Gating Unit (STGU),\nan MLP-based variant without extra priors, to capture the co-occurrence\ntopology features that encode the spatial dependency across all joints. In\nSTGU, to learn the point-wise topology features, a new gate-based feature\ninteraction mechanism is introduced to activate the features point-to-point by\nthe attention map generated from the input sample. Based on the STGU, we\npropose the first MLP-based model, SiT-MLP, for skeleton-based action\nrecognition in this work. Compared with previous methods on three large-scale\ndatasets, SiT-MLP achieves competitive performance. In addition, SiT-MLP\nreduces the parameters significantly with favorable results. 
The code will be\navailable at https://github.com/BUPTSJZhang/SiT?MLP.\n","authors":["Shaojie Zhang","Jianqin Yin","Yonghao Dang","Jiajun Fu"],"pdf_url":"https://arxiv.org/pdf/2308.16018v4.pdf","comment":"Accepted by IEEE TCSVT 2024"},{"id":"http://arxiv.org/abs/2312.07526v2","updated":"2024-04-08T13:40:43Z","published":"2023-12-12T18:55:29Z","title":"RTMO: Towards High-Performance One-Stage Real-Time Multi-Person Pose\n Estimation","summary":" Real-time multi-person pose estimation presents significant challenges in\nbalancing speed and precision. While two-stage top-down methods slow down as\nthe number of people in the image increases, existing one-stage methods often\nfail to simultaneously deliver high accuracy and real-time performance. This\npaper introduces RTMO, a one-stage pose estimation framework that seamlessly\nintegrates coordinate classification by representing keypoints using dual 1-D\nheatmaps within the YOLO architecture, achieving accuracy comparable to\ntop-down methods while maintaining high speed. We propose a dynamic coordinate\nclassifier and a tailored loss function for heatmap learning, specifically\ndesigned to address the incompatibilities between coordinate classification and\ndense prediction models. RTMO outperforms state-of-the-art one-stage pose\nestimators, achieving 1.1% higher AP on COCO while operating about 9 times\nfaster with the same backbone. Our largest model, RTMO-l, attains 74.8% AP on\nCOCO val2017 and 141 FPS on a single V100 GPU, demonstrating its efficiency and\naccuracy. The code and models are available at\nhttps://github.com/open-mmlab/mmpose/tree/main/projects/rtmo.\n","authors":["Peng Lu","Tao Jiang","Yining Li","Xiangtai Li","Kai Chen","Wenming Yang"],"pdf_url":"https://arxiv.org/pdf/2312.07526v2.pdf","comment":"Accepted at CVPR 2024. Project page:\n https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo"},{"id":"http://arxiv.org/abs/2404.05519v1","updated":"2024-04-08T13:40:01Z","published":"2024-04-08T13:40:01Z","title":"Investigating the Effectiveness of Cross-Attention to Unlock Zero-Shot\n Editing of Text-to-Video Diffusion Models","summary":" With recent advances in image and video diffusion models for content\ncreation, a plethora of techniques have been proposed for customizing their\ngenerated content. In particular, manipulating the cross-attention layers of\nText-to-Image (T2I) diffusion models has shown great promise in controlling the\nshape and location of objects in the scene. Transferring image-editing\ntechniques to the video domain, however, is extremely challenging as object\nmotion and temporal consistency are difficult to capture accurately. In this\nwork, we take a first look at the role of cross-attention in Text-to-Video\n(T2V) diffusion models for zero-shot video editing. While one-shot models have\nshown potential in controlling motion and camera movement, we demonstrate\nzero-shot control over object shape, position and movement in T2V models. 
We\nshow that despite the limitations of current T2V models, cross-attention\nguidance can be a promising approach for editing videos.\n","authors":["Saman Motamed","Wouter Van Gansbeke","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2404.05519v1.pdf","comment":"Generative Models for Computer Vision CVPR 2024 Workshop"},{"id":"http://arxiv.org/abs/2404.05518v1","updated":"2024-04-08T13:39:12Z","published":"2024-04-08T13:39:12Z","title":"DepthMOT: Depth Cues Lead to a Strong Multi-Object Tracker","summary":" Accurately distinguishing each object is a fundamental goal of Multi-object\ntracking (MOT) algorithms. However, achieving this goal remains\nchallenging, primarily due to: (i) For crowded scenes with occluded objects,\nthe high overlap of object bounding boxes leads to confusion among closely\nlocated objects. Nevertheless, humans naturally perceive the depth of elements\nin a scene when observing 2D videos. Inspired by this, even though the bounding\nboxes of objects are close on the camera plane, we can differentiate them in\nthe depth dimension, thereby establishing a 3D perception of the objects. (ii)\nFor videos with rapid and irregular camera motion, abrupt changes in object\npositions can result in ID switches. However, if the camera pose is known, we\ncan compensate for the errors in linear motion models. In this paper, we\npropose \\textit{DepthMOT}, which (i) detects and estimates the scene depth map\n\\textit{end-to-end}, and (ii) compensates for irregular camera motion via\ncamera pose estimation. Extensive experiments demonstrate the superior\nperformance of DepthMOT on the VisDrone-MOT and UAVDT datasets. The code will be\navailable at \\url{https://github.com/JackWoo0831/DepthMOT}.\n","authors":["Jiapeng Wu","Yichen Liu"],"pdf_url":"https://arxiv.org/pdf/2404.05518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05512v1","updated":"2024-04-08T13:35:14Z","published":"2024-04-08T13:35:14Z","title":"Impact of LiDAR visualisations on semantic segmentation of\n archaeological objects","summary":" Deep learning methods in LiDAR-based archaeological research often leverage\nvisualisation techniques derived from Digital Elevation Models to enhance\ncharacteristics of archaeological objects present in the images. This paper\ninvestigates the impact of visualisations on deep learning performance through\na comprehensive testing framework. The study involves the use of eight semantic\nsegmentation models to evaluate seven diverse visualisations across two study\nareas, encompassing five archaeological classes. Experimental results reveal\nthat the choice of appropriate visualisations can influence performance by up\nto 8%. Yet, pinpointing one visualisation that outperforms the others in\nsegmenting all archaeological classes proves challenging. 
The observed\nperformance variation, reaching up to 25% across different model\nconfigurations, underscores the importance of thoughtfully selecting model\nconfigurations and LiDAR visualisations for successfully segmenting\narchaeological objects.\n","authors":["Raveerat Jaturapitpornchai","Giulio Poggi","Gregory Sech","Ziga Kokalj","Marco Fiorucci","Arianna Traviglia"],"pdf_url":"https://arxiv.org/pdf/2404.05512v1.pdf","comment":"Accepted to IEEE International Geoscience and Remote Sensing\n Symposium 2024 (IGARSS 2024) @IEEE copyright"},{"id":"http://arxiv.org/abs/2404.05505v1","updated":"2024-04-08T13:27:07Z","published":"2024-04-08T13:27:07Z","title":"Taming Transformers for Realistic Lidar Point Cloud Generation","summary":" Diffusion Models (DMs) have achieved State-Of-The-Art (SOTA) results in the\nLidar point cloud generation task, benefiting from their stable training and\niterative refinement during sampling. However, DMs often fail to realistically\nmodel Lidar raydrop noise due to their inherent denoising process. To retain\nthe strength of iterative sampling while enhancing the generation of raydrop\nnoise, we introduce LidarGRIT, a generative model that uses auto-regressive\ntransformers to iteratively sample the range images in the latent space rather\nthan image space. Furthermore, LidarGRIT utilises VQ-VAE to separately decode\nrange images and raydrop masks. Our results show that LidarGRIT achieves\nsuperior performance compared to SOTA models on KITTI-360 and KITTI odometry\ndatasets. Code available at:https://github.com/hamedhaghighi/LidarGRIT.\n","authors":["Hamed Haghighi","Amir Samadi","Mehrdad Dianati","Valentina Donzella","Kurt Debattista"],"pdf_url":"https://arxiv.org/pdf/2404.05505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08077v2","updated":"2024-04-08T13:23:47Z","published":"2023-11-14T11:05:08Z","title":"Zero-Shot Segmentation of Eye Features Using the Segment Anything Model\n (SAM)","summary":" The advent of foundation models signals a new era in artificial intelligence.\nThe Segment Anything Model (SAM) is the first foundation model for image\nsegmentation. In this study, we evaluate SAM's ability to segment features from\neye images recorded in virtual reality setups. The increasing requirement for\nannotated eye-image datasets presents a significant opportunity for SAM to\nredefine the landscape of data annotation in gaze estimation. Our investigation\ncenters on SAM's zero-shot learning abilities and the effectiveness of prompts\nlike bounding boxes or point clicks. Our results are consistent with studies in\nother domains, demonstrating that SAM's segmentation effectiveness can be\non-par with specialized models depending on the feature, with prompts improving\nits performance, evidenced by an IoU of 93.34% for pupil segmentation in one\ndataset. Foundation models like SAM could revolutionize gaze estimation by\nenabling quick and easy image segmentation, reducing reliance on specialized\nmodels and extensive manual annotation.\n","authors":["Virmarie Maquiling","Sean Anthony Byrne","Diederick C. 
Niehorster","Marcus Nyström","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2311.08077v2.pdf","comment":"14 pages, 8 figures, 1 table, Accepted to ETRA 2024: ACM Symposium on\n Eye Tracking Research & Applications"},{"id":"http://arxiv.org/abs/2311.16728v2","updated":"2024-04-08T13:17:05Z","published":"2023-11-28T12:19:00Z","title":"Photo-SLAM: Real-time Simultaneous Localization and Photorealistic\n Mapping for Monocular, Stereo, and RGB-D Cameras","summary":" The integration of neural rendering and the SLAM system recently showed\npromising results in joint localization and photorealistic view reconstruction.\nHowever, existing methods, fully relying on implicit representations, are so\nresource-hungry that they cannot run on portable devices, which deviates from\nthe original intention of SLAM. In this paper, we present Photo-SLAM, a novel\nSLAM framework with a hyper primitives map. Specifically, we simultaneously\nexploit explicit geometric features for localization and learn implicit\nphotometric features to represent the texture information of the observed\nenvironment. In addition to actively densifying hyper primitives based on\ngeometric features, we further introduce a Gaussian-Pyramid-based training\nmethod to progressively learn multi-level features, enhancing photorealistic\nmapping performance. The extensive experiments with monocular, stereo, and\nRGB-D datasets prove that our proposed system Photo-SLAM significantly\noutperforms current state-of-the-art SLAM systems for online photorealistic\nmapping, e.g., PSNR is 30% higher and rendering speed is hundreds of times\nfaster in the Replica dataset. Moreover, the Photo-SLAM can run at real-time\nspeed using an embedded platform such as Jetson AGX Orin, showing the potential\nof robotics applications.\n","authors":["Huajian Huang","Longwei Li","Hui Cheng","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2311.16728v2.pdf","comment":"CVPR 2024. Code: https://github.com/HuajianUP/Photo-SLAM - Project\n Page: https://huajianup.github.io/research/Photo-SLAM/"},{"id":"http://arxiv.org/abs/2311.17389v2","updated":"2024-04-08T13:15:03Z","published":"2023-11-29T06:42:12Z","title":"360Loc: A Dataset and Benchmark for Omnidirectional Visual Localization\n with Cross-device Queries","summary":" Portable 360$^\\circ$ cameras are becoming a cheap and efficient tool to\nestablish large visual databases. By capturing omnidirectional views of a\nscene, these cameras could expedite building environment models that are\nessential for visual localization. However, such an advantage is often\noverlooked due to the lack of valuable datasets. This paper introduces a new\nbenchmark dataset, 360Loc, composed of 360$^\\circ$ images with ground truth\nposes for visual localization. We present a practical implementation of\n360$^\\circ$ mapping combining 360$^\\circ$ images with lidar data to generate\nthe ground truth 6DoF poses. 360Loc is the first dataset and benchmark that\nexplores the challenge of cross-device visual positioning, involving\n360$^\\circ$ reference frames, and query frames from pinhole, ultra-wide FoV\nfisheye, and 360$^\\circ$ cameras. We propose a virtual camera approach to\ngenerate lower-FoV query frames from 360$^\\circ$ images, which ensures a fair\ncomparison of performance among different query types in visual localization\ntasks. 
We also extend this virtual camera approach to feature matching-based\nand pose regression-based methods to alleviate the performance loss caused by\nthe cross-device domain gap, and evaluate its effectiveness against\nstate-of-the-art baselines. We demonstrate that omnidirectional visual\nlocalization is more robust in challenging large-scale scenes with symmetries\nand repetitive structures. These results provide new insights into 360-camera\nmapping and omnidirectional visual localization with cross-device queries.\n","authors":["Huajian Huang","Changkun Liu","Yipeng Zhu","Hui Cheng","Tristan Braud","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2311.17389v2.pdf","comment":"CVPR 2024. Project Page: https://huajianup.github.io/research/360Loc/"},{"id":"http://arxiv.org/abs/2404.05490v1","updated":"2024-04-08T13:11:57Z","published":"2024-04-08T13:11:57Z","title":"Two-Person Interaction Augmentation with Skeleton Priors","summary":" Close and continuous interaction with rich contacts is a crucial aspect of\nhuman activities (e.g. hugging, dancing) and of interest in many domains like\nactivity recognition, motion prediction, character animation, etc. However,\nacquiring such skeletal motion is challenging. While direct motion capture is\nexpensive and slow, motion editing/generation is also non-trivial, as complex\ncontact patterns with topological and geometric constraints have to be\nretained. To this end, we propose a new deep learning method for two-body\nskeletal interaction motion augmentation, which can generate variations of\ncontact-rich interactions with varying body sizes and proportions while\nretaining the key geometric/topological relations between two bodies. Our\nsystem can learn effectively from a relatively small amount of data and\ngeneralize to drastically different skeleton sizes. Through exhaustive\nevaluation and comparison, we show it can generate high-quality motions, has\nstrong generalizability and outperforms traditional optimization-based methods\nand alternative deep learning solutions.\n","authors":["Baiyi Li","Edmond S. L. Ho","Hubert P. H. Shum","He Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00226v2","updated":"2024-04-08T13:05:11Z","published":"2024-03-30T02:56:54Z","title":"Design as Desired: Utilizing Visual Question Answering for Multimodal\n Pre-training","summary":" Multimodal pre-training demonstrates its potential in the medical domain,\nwhich learns medical visual representations from paired medical reports.\nHowever, many pre-training tasks require extra annotations from clinicians, and\nmost of them fail to explicitly guide the model to learn the desired features\nof different pathologies. To the best of our knowledge, we are the first to\nutilize Visual Question Answering (VQA) for multimodal pre-training to guide\nthe framework focusing on targeted pathological features. In this work, we\nleverage descriptions in medical reports to design multi-granular\nquestion-answer pairs associated with different diseases, which assist the\nframework in pre-training without requiring extra annotations from experts. We\nalso propose a novel pre-training framework with a quasi-textual feature\ntransformer, a module designed to transform visual features into a\nquasi-textual space closer to the textual domain via a contrastive learning\nstrategy. This narrows the vision-language gap and facilitates modality\nalignment. 
Our framework is applied to four downstream tasks: report\ngeneration, classification, segmentation, and detection across five datasets.\nExtensive experiments demonstrate the superiority of our framework compared to\nother state-of-the-art methods. Our code will be released upon acceptance.\n","authors":["Tongkun Su","Jun Li","Xi Zhang","Haibo Jin","Hao Chen","Qiong Wang","Faqin Lv","Baoliang Zhao","Yin Hu"],"pdf_url":"https://arxiv.org/pdf/2404.00226v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01941v3","updated":"2024-04-08T12:51:35Z","published":"2024-04-02T13:33:31Z","title":"LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging","summary":" Human pose and shape (HPS) estimation with lensless imaging is not only\nbeneficial to privacy protection but also can be used in covert surveillance\nscenarios due to the small size and simple structure of this device. However,\nthis task presents significant challenges due to the inherent ambiguity of the\ncaptured measurements and lacks effective methods for directly estimating human\npose and shape from lensless data. In this paper, we propose the first\nend-to-end framework to recover 3D human poses and shapes from lensless\nmeasurements to our knowledge. We specifically design a multi-scale lensless\nfeature decoder to decode the lensless measurements through the optically\nencoded mask for efficient feature extraction. We also propose a double-head\nauxiliary supervision mechanism to improve the estimation accuracy of human\nlimb ends. Besides, we establish a lensless imaging system and verify the\neffectiveness of our method on various datasets acquired by our lensless\nimaging system.\n","authors":["Haoyang Ge","Qiao Feng","Hailong Jia","Xiongzheng Li","Xiangjun Yin","You Zhou","Jingyu Yang","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2404.01941v3.pdf","comment":"Accepted to CVPR 2024. More results available at\n https://cic.tju.edu.cn/faculty/likun/projects/LPSNet"},{"id":"http://arxiv.org/abs/2306.14227v2","updated":"2024-04-08T12:50:51Z","published":"2023-06-25T12:15:44Z","title":"A ground-based dataset and a diffusion model for on-orbit low-light\n image enhancement","summary":" On-orbit service is important for maintaining the sustainability of space\nenvironment. Space-based visible camera is an economical and lightweight sensor\nfor situation awareness during on-orbit service. However, it can be easily\naffected by the low illumination environment. Recently, deep learning has\nachieved remarkable success in image enhancement of natural images, but seldom\napplied in space due to the data bottleneck. In this article, we first propose\na dataset of the Beidou Navigation Satellite for on-orbit low-light image\nenhancement (LLIE). In the automatic data collection scheme, we focus on\nreducing domain gap and improving the diversity of the dataset. we collect\nhardware in-the-loop images based on a robotic simulation testbed imitating\nspace lighting conditions. To evenly sample poses of different orientation and\ndistance without collision, a collision-free working space and pose stratified\nsampling is proposed. Afterwards, a novel diffusion model is proposed. To\nenhance the image contrast without over-exposure and blurring details, we\ndesign a fused attention to highlight the structure and dark region. 
Finally,\nwe compare our method with previous methods using our dataset, which indicates\nthat our method has a better capacity in on-orbit LLIE.\n","authors":["Yiman Zhu","Lu Wang","Jingyi Yuan","Yu Guo"],"pdf_url":"https://arxiv.org/pdf/2306.14227v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05468v1","updated":"2024-04-08T12:46:39Z","published":"2024-04-08T12:46:39Z","title":"Mind-to-Image: Projecting Visual Mental Imagination of the Brain from\n fMRI","summary":" The reconstruction of images observed by subjects from fMRI data collected\nduring visual stimuli has made significant strides in the past decade, thanks\nto the availability of extensive fMRI datasets and advancements in generative\nmodels for image generation. However, the application of visual reconstruction\nhas remained limited. Reconstructing visual imagination presents a greater\nchallenge, with potentially revolutionary applications ranging from aiding\nindividuals with disabilities to verifying witness accounts in court. The\nprimary hurdles in this field are the absence of data collection protocols for\nvisual imagery and the lack of datasets on the subject. Traditionally,\nfMRI-to-image relies on data collected from subjects exposed to visual stimuli,\nwhich poses issues for generating visual imagery based on the difference of\nbrain activity between visual stimulation and visual imagery. For the first\ntime, we have compiled a substantial dataset (around 6h of scans) on visual\nimagery along with a proposed data collection protocol. We then train a\nmodified version of an fMRI-to-image model and demonstrate the feasibility of\nreconstructing images from two modes of imagination: from memory and from pure\nimagination. This marks an important step towards creating a technology that\nallow direct reconstruction of visual imagery.\n","authors":["Hugo Caselles-Dupré","Charles Mellerio","Paul Hérent","Alizée Lopez-Persem","Benoit Béranger","Mathieu Soularue","Pierre Fautrel","Gauthier Vernier","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2404.05468v1.pdf","comment":"Pre-print to be updated"},{"id":"http://arxiv.org/abs/2404.05466v1","updated":"2024-04-08T12:44:24Z","published":"2024-04-08T12:44:24Z","title":"Enhancing Lip Reading with Multi-Scale Video and Multi-Encoder","summary":" Automatic lip-reading (ALR) aims to automatically transcribe spoken content\nfrom a speaker's silent lip motion captured in video. Current mainstream\nlip-reading approaches only use a single visual encoder to model input videos\nof a single scale. In this paper, we propose to enhance lipreading by\nincorporating multi-scale video data and multi-encoder. Specifically, we first\npropose a novel multi-scale lip extraction algorithm based on the size of the\nspeaker's face and an enhanced ResNet3D visual front-end (VFE) to extract lip\nfeatures at different scales. For the multi-encoder, in addition to the\nmainstream Transformer and Conformer, we also incorporate the recently proposed\nBranchformer and EBranchformer as visual encoders. In the experiments, we\nexplore the influence of different video data scales and encoders on ALR system\nperformance and fuse the texts transcribed by all ALR systems using recognizer\noutput voting error reduction (ROVER). 
Finally, our proposed approach placed\nsecond in the ICME 2024 ChatCLR Challenge Task 2, with a 21.52% reduction in\ncharacter error rate (CER) compared to the official baseline on the evaluation\nset.\n","authors":["He Wang","Pengcheng Guo","Xucheng Wan","Huan Zhou","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2404.05466v1.pdf","comment":"6 pages, 3 figures, submitted to ICME2024 GC-ChatCLR"},{"id":"http://arxiv.org/abs/2404.05465v1","updated":"2024-04-08T12:43:32Z","published":"2024-04-08T12:43:32Z","title":"HAMMR: HierArchical MultiModal React agents for generic VQA","summary":" Combining Large Language Models (LLMs) with external specialized tools\n(LLMs+tools) is a recent paradigm to solve multimodal tasks such as Visual\nQuestion Answering (VQA). While this approach was demonstrated to work well\nwhen optimized and evaluated for each individual benchmark, in practice it is\ncrucial for the next generation of real-world AI systems to handle a broad\nrange of multimodal problems. Therefore we pose the VQA problem from a unified\nperspective and evaluate a single system on a varied suite of VQA tasks\nincluding counting, spatial reasoning, OCR-based reasoning, visual pointing,\nexternal knowledge, and more. In this setting, we demonstrate that naively\napplying the LLM+tools approach using the combined set of all tools leads to\npoor results. This motivates us to introduce HAMMR: HierArchical MultiModal\nReact. We start from a multimodal ReAct-based system and make it hierarchical\nby enabling our HAMMR agents to call upon other specialized agents. This\nenhances the compositionality of the LLM+tools approach, which we show to be\ncritical for obtaining high accuracy on generic VQA. Concretely, on our generic\nVQA suite, HAMMR outperforms the naive LLM+tools approach by 19.5%.\nAdditionally, HAMMR achieves state-of-the-art results on this task,\noutperforming the generic standalone PaLI-X VQA model by 5.0%.\n","authors":["Lluis Castrejon","Thomas Mensink","Howard Zhou","Vittorio Ferrari","Andre Araujo","Jasper Uijlings"],"pdf_url":"https://arxiv.org/pdf/2404.05465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05447v1","updated":"2024-04-08T12:29:46Z","published":"2024-04-08T12:29:46Z","title":"Pansharpening of PRISMA products for archaeological prospection","summary":" Hyperspectral data recorded from satellite platforms are often ill-suited for\ngeo-archaeological prospection due to low spatial resolution. The established\npotential of hyperspectral data from airborne sensors in identifying\narchaeological features has, on the other side, generated increased interest in\nenhancing hyperspectral data to achieve higher spatial resolution. This\nimprovement is crucial for detecting traces linked to sub-surface\ngeo-archaeological features and can make satellite hyperspectral acquisitions\nmore suitable for archaeological research. This research assesses the usability\nof pansharpened PRISMA satellite products in geo-archaeological prospections.\nThree pan-sharpening methods (GSA, MTF-GLP and HySure) are compared\nquantitatively and qualitatively and tested over the archaeological landscape\nof Aquileia (Italy). 
The results suggest that the application of pansharpening\ntechniques makes hyperspectral satellite imagery highly suitable, under certain\nconditions, to the identification of sub-surface archaeological features of\nsmall and large size.\n","authors":["Gregory Sech","Giulio Poggi","Marina Ljubenovic","Marco Fiorucci","Arianna Traviglia"],"pdf_url":"https://arxiv.org/pdf/2404.05447v1.pdf","comment":"Accepted to IEEE International Geoscience and Remote Sensing\n Symposium 2024 (IGARSS 2024) @IEEE copyright"},{"id":"http://arxiv.org/abs/2301.07409v2","updated":"2024-04-08T12:25:10Z","published":"2023-01-18T10:13:29Z","title":"Representing Noisy Image Without Denoising","summary":" A long-standing topic in artificial intelligence is the effective recognition\nof patterns from noisy images. In this regard, the recent data-driven paradigm\nconsiders 1) improving the representation robustness by adding noisy samples in\ntraining phase (i.e., data augmentation) or 2) pre-processing the noisy image\nby learning to solve the inverse problem (i.e., image denoising). However, such\nmethods generally exhibit inefficient process and unstable result, limiting\ntheir practical applications. In this paper, we explore a non-learning paradigm\nthat aims to derive robust representation directly from noisy images, without\nthe denoising as pre-processing. Here, the noise-robust representation is\ndesigned as Fractional-order Moments in Radon space (FMR), with also beneficial\nproperties of orthogonality and rotation invariance. Unlike earlier\ninteger-order methods, our work is a more generic design taking such classical\nmethods as special cases, and the introduced fractional-order parameter offers\ntime-frequency analysis capability that is not available in classical methods.\nFormally, both implicit and explicit paths for constructing the FMR are\ndiscussed in detail. Extensive simulation experiments and an image security\napplication are provided to demonstrate the uniqueness and usefulness of our\nFMR, especially for noise robustness, rotation invariance, and time-frequency\ndiscriminability.\n","authors":["Shuren Qi","Yushu Zhang","Chao Wang","Tao Xiang","Xiaochun Cao","Yong Xiang"],"pdf_url":"https://arxiv.org/pdf/2301.07409v2.pdf","comment":"Accepted by IEEE Transactions on Pattern Analysis and Machine\n Intelligence, 2024"},{"id":"http://arxiv.org/abs/2404.05439v1","updated":"2024-04-08T12:18:01Z","published":"2024-04-08T12:18:01Z","title":"Action-conditioned video data improves predictability","summary":" Long-term video generation and prediction remain challenging tasks in\ncomputer vision, particularly in partially observable scenarios where cameras\nare mounted on moving platforms. The interaction between observed image frames\nand the motion of the recording agent introduces additional complexities. To\naddress these issues, we introduce the Action-Conditioned Video Generation\n(ACVG) framework, a novel approach that investigates the relationship between\nactions and generated image frames through a deep dual Generator-Actor\narchitecture. ACVG generates video sequences conditioned on the actions of\nrobots, enabling exploration and analysis of how vision and action mutually\ninfluence one another in dynamic environments. 
We evaluate the framework's\neffectiveness on an indoor robot motion dataset which consists of sequences of\nimage frames along with the sequences of actions taken by the robotic agent,\nconducting a comprehensive empirical study comparing ACVG to other\nstate-of-the-art frameworks along with a detailed ablation study.\n","authors":["Meenakshi Sarkar","Debasish Ghose"],"pdf_url":"https://arxiv.org/pdf/2404.05439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05297v2","updated":"2024-04-08T12:17:24Z","published":"2024-03-08T13:24:46Z","title":"PEEB: Part-based Image Classifiers with an Explainable and Editable\n Language Bottleneck","summary":" CLIP-based classifiers rely on the prompt containing a {class name} that is\nknown to the text encoder. Therefore, they perform poorly on new classes or the\nclasses whose names rarely appear on the Internet (e.g., scientific names of\nbirds). For fine-grained classification, we propose PEEB - an explainable and\neditable classifier to (1) express the class name into a set of text\ndescriptors that describe the visual parts of that class; and (2) match the\nembeddings of the detected parts to their textual descriptors in each class to\ncompute a logit score for classification. In a zero-shot setting where the\nclass names are unknown, PEEB outperforms CLIP by a huge margin (~10x in top-1\naccuracy). Compared to part-based classifiers, PEEB is not only the\nstate-of-the-art (SOTA) on the supervised-learning setting (88.80% and 92.20%\naccuracy on CUB-200 and Dogs-120, respectively) but also the first to enable\nusers to edit the text descriptors to form a new classifier without any\nre-training. Compared to concept bottleneck models, PEEB is also the SOTA in\nboth zero-shot and supervised-learning settings.\n","authors":["Thang M. Pham","Peijie Chen","Tin Nguyen","Seunghyun Yoon","Trung Bui","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2403.05297v2.pdf","comment":"Findings of NAACL 2024 (long paper)"},{"id":"http://arxiv.org/abs/2305.10874v3","updated":"2024-04-08T12:17:01Z","published":"2023-05-18T11:06:15Z","title":"Swap Attention in Spatiotemporal Diffusions for Text-to-Video Generation","summary":" With the explosive popularity of AI-generated content (AIGC), video\ngeneration has recently received a lot of attention. Generating videos guided\nby text instructions poses significant challenges, such as modeling the complex\nrelationship between space and time, and the lack of large-scale text-video\npaired data. Existing text-video datasets suffer from limitations in both\ncontent quality and scale, or they are not open-source, rendering them\ninaccessible for study and use. For model design, previous approaches extend\npretrained text-to-image generation models by adding temporal 1D\nconvolution/attention modules for video generation. However, these approaches\noverlook the importance of jointly modeling space and time, inevitably leading\nto temporal distortions and misalignment between texts and videos. In this\npaper, we propose a novel approach that strengthens the interaction between\nspatial and temporal perceptions. In particular, we utilize a swapped\ncross-attention mechanism in 3D windows that alternates the ``query'' role\nbetween spatial and temporal blocks, enabling mutual reinforcement for each\nother. Moreover, to fully unlock model capabilities for high-quality video\ngeneration and promote the development of the field, we curate a large-scale\nand open-source video dataset called HD-VG-130M. 
This dataset comprises 130\nmillion text-video pairs from the open-domain, ensuring high-definition,\nwidescreen and watermark-free characters. A smaller-scale yet more meticulously\ncleaned subset further enhances the data quality, aiding models in achieving\nsuperior performance. Experimental quantitative and qualitative results\ndemonstrate the superiority of our approach in terms of per-frame quality,\ntemporal correlation, and text-video alignment, with clear margins.\n","authors":["Wenjing Wang","Huan Yang","Zixi Tuo","Huiguo He","Junchen Zhu","Jianlong Fu","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2305.10874v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05426v1","updated":"2024-04-08T11:54:49Z","published":"2024-04-08T11:54:49Z","title":"Test-Time Zero-Shot Temporal Action Localization","summary":" Zero-Shot Temporal Action Localization (ZS-TAL) seeks to identify and locate\nactions in untrimmed videos unseen during training. Existing ZS-TAL methods\ninvolve fine-tuning a model on a large amount of annotated training data. While\neffective, training-based ZS-TAL approaches assume the availability of labeled\ndata for supervised learning, which can be impractical in some applications.\nFurthermore, the training process naturally induces a domain bias into the\nlearned model, which may adversely affect the model's generalization ability to\narbitrary videos. These considerations prompt us to approach the ZS-TAL problem\nfrom a radically novel perspective, relaxing the requirement for training data.\nTo this aim, we introduce a novel method that performs Test-Time adaptation for\nTemporal Action Localization (T3AL). In a nutshell, T3AL adapts a pre-trained\nVision and Language Model (VLM). T3AL operates in three steps. First, a\nvideo-level pseudo-label of the action category is computed by aggregating\ninformation from the entire video. Then, action localization is performed\nadopting a novel procedure inspired by self-supervised learning. Finally,\nframe-level textual descriptions extracted with a state-of-the-art captioning\nmodel are employed for refining the action region proposals. We validate the\neffectiveness of T3AL by conducting experiments on the THUMOS14 and the\nActivityNet-v1.3 datasets. Our results demonstrate that T3AL significantly\noutperforms zero-shot baselines based on state-of-the-art VLMs, confirming the\nbenefit of a test-time adaptation approach.\n","authors":["Benedetta Liberatori","Alessandro Conti","Paolo Rota","Yiming Wang","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2404.05426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05937v3","updated":"2024-04-08T11:46:07Z","published":"2024-02-08T18:59:53Z","title":"InstaGen: Enhancing Object Detection by Training on Synthetic Dataset","summary":" In this paper, we present a novel paradigm to enhance the ability of object\ndetector, e.g., expanding categories or improving detection performance, by\ntraining on synthetic dataset generated from diffusion models. Specifically, we\nintegrate an instance-level grounding head into a pre-trained, generative\ndiffusion model, to augment it with the ability of localising instances in the\ngenerated images. The grounding head is trained to align the text embedding of\ncategory names with the regional visual feature of the diffusion model, using\nsupervision from an off-the-shelf object detector, and a novel self-training\nscheme on (novel) categories not covered by the detector. 
We conduct thorough\nexperiments to show that, this enhanced version of diffusion model, termed as\nInstaGen, can serve as a data synthesizer, to enhance object detectors by\ntraining on its generated samples, demonstrating superior performance over\nexisting state-of-the-art methods in open-vocabulary (+4.5 AP) and data-sparse\n(+1.2 to 5.2 AP) scenarios. Project page with code:\nhttps://fcjian.github.io/InstaGen.\n","authors":["Chengjian Feng","Yujie Zhong","Zequn Jie","Weidi Xie","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2402.05937v3.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.05414v1","updated":"2024-04-08T11:32:26Z","published":"2024-04-08T11:32:26Z","title":"Two Hands Are Better Than One: Resolving Hand to Hand Intersections via\n Occupancy Networks","summary":" 3D hand pose estimation from images has seen considerable interest from the\nliterature, with new methods improving overall 3D accuracy. One current\nchallenge is to address hand-to-hand interaction where self-occlusions and\nfinger articulation pose a significant problem to estimation. Little work has\napplied physical constraints that minimize the hand intersections that occur as\na result of noisy estimation. This work addresses the intersection of hands by\nexploiting an occupancy network that represents the hand's volume as a\ncontinuous manifold. This allows us to model the probability distribution of\npoints being inside a hand. We designed an intersection loss function to\nminimize the likelihood of hand-to-point intersections. Moreover, we propose a\nnew hand mesh parameterization that is superior to the commonly used MANO model\nin many respects including lower mesh complexity, underlying 3D skeleton\nextraction, watertightness, etc. On the benchmark InterHand2.6M dataset, the\nmodels trained using our intersection loss achieve better results than the\nstate-of-the-art by significantly decreasing the number of hand intersections\nwhile lowering the mean per-joint positional error. Additionally, we\ndemonstrate superior performance for 3D hand uplift on Re:InterHand and SMILE\ndatasets and show reduced hand-to-hand intersections for complex domains such\nas sign-language pose estimation.\n","authors":["Maksym Ivashechkin","Oscar Mendez","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2404.05414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06704v3","updated":"2024-04-08T11:24:30Z","published":"2023-12-10T11:45:45Z","title":"SIFU: Side-view Conditioned Implicit Function for Real-world Usable\n Clothed Human Reconstruction","summary":" Creating high-quality 3D models of clothed humans from single images for\nreal-world applications is crucial. Despite recent advancements, accurately\nreconstructing humans in complex poses or with loose clothing from in-the-wild\nimages, along with predicting textures for unseen areas, remains a significant\nchallenge. A key limitation of previous methods is their insufficient prior\nguidance in transitioning from 2D to 3D and in texture prediction. In response,\nwe introduce SIFU (Side-view Conditioned Implicit Function for Real-world\nUsable Clothed Human Reconstruction), a novel approach combining a Side-view\nDecoupling Transformer with a 3D Consistent Texture Refinement pipeline.SIFU\nemploys a cross-attention mechanism within the transformer, using SMPL-X\nnormals as queries to effectively decouple side-view features in the process of\nmapping 2D features to 3D. 
This method not only improves the precision of the\n3D models but also their robustness, especially when SMPL-X estimates are not\nperfect. Our texture refinement process leverages text-to-image diffusion-based\nprior to generate realistic and consistent textures for invisible views.\nThrough extensive experiments, SIFU surpasses SOTA methods in both geometry and\ntexture reconstruction, showcasing enhanced robustness in complex scenarios and\nachieving an unprecedented Chamfer and P2S measurement. Our approach extends to\npractical applications such as 3D printing and scene building, demonstrating\nits broad utility in real-world scenarios. Project page\nhttps://river-zhang.github.io/SIFU-projectpage/ .\n","authors":["Zechuan Zhang","Zongxin Yang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2312.06704v3.pdf","comment":"Accepted by CVPR 2024; Project page\n https://river-zhang.github.io/SIFU-projectpage/"},{"id":"http://arxiv.org/abs/2303.13514v3","updated":"2024-04-08T11:22:05Z","published":"2023-03-23T17:59:35Z","title":"SAOR: Single-View Articulated Object Reconstruction","summary":" We introduce SAOR, a novel approach for estimating the 3D shape, texture, and\nviewpoint of an articulated object from a single image captured in the wild.\nUnlike prior approaches that rely on pre-defined category-specific 3D templates\nor tailored 3D skeletons, SAOR learns to articulate shapes from single-view\nimage collections with a skeleton-free part-based model without requiring any\n3D object shape priors. To prevent ill-posed solutions, we propose a\ncross-instance consistency loss that exploits disentangled object shape\ndeformation and articulation. This is helped by a new silhouette-based sampling\nmechanism to enhance viewpoint diversity during training. Our method only\nrequires estimated object silhouettes and relative depth maps from\noff-the-shelf pre-trained networks during training. At inference time, given a\nsingle-view image, it efficiently outputs an explicit mesh representation. We\nobtain improved qualitative and quantitative results on challenging quadruped\nanimals compared to relevant existing work.\n","authors":["Mehmet Aygün","Oisin Mac Aodha"],"pdf_url":"https://arxiv.org/pdf/2303.13514v3.pdf","comment":"Accepted to CVPR 2024, website: https://mehmetaygun.github.io/saor"},{"id":"http://arxiv.org/abs/2404.05409v1","updated":"2024-04-08T11:20:28Z","published":"2024-04-08T11:20:28Z","title":"Anatomical Conditioning for Contrastive Unpaired Image-to-Image\n Translation of Optical Coherence Tomography Images","summary":" For a unified analysis of medical images from different modalities, data\nharmonization using image-to-image (I2I) translation is desired. We study this\nproblem employing an optical coherence tomography (OCT) data set of\nSpectralis-OCT and Home-OCT images. I2I translation is challenging because the\nimages are unpaired, and a bijective mapping does not exist due to the\ninformation discrepancy between both domains. This problem has been addressed\nby the Contrastive Learning for Unpaired I2I Translation (CUT) approach, but it\nreduces semantic consistency. To restore the semantic consistency, we support\nthe style decoder using an additional segmentation decoder. Our approach\nincreases the similarity between the style-translated images and the target\ndistribution. Importantly, we improve the segmentation of biomarkers in\nHome-OCT images in an unsupervised domain adaptation scenario. 
Our data\nharmonization approach provides potential for the monitoring of diseases, e.g.,\nage related macular disease, using different OCT devices.\n","authors":["Marc S. Seibel","Hristina Uzunova","Timo Kepp","Heinz Handels"],"pdf_url":"https://arxiv.org/pdf/2404.05409v1.pdf","comment":"Accepted at ISBI 2024"},{"id":"http://arxiv.org/abs/2311.10605v2","updated":"2024-04-08T10:59:06Z","published":"2023-11-17T16:01:06Z","title":"CA-Jaccard: Camera-aware Jaccard Distance for Person Re-identification","summary":" Person re-identification (re-ID) is a challenging task that aims to learn\ndiscriminative features for person retrieval. In person re-ID, Jaccard distance\nis a widely used distance metric, especially in re-ranking and clustering\nscenarios. However, we discover that camera variation has a significant\nnegative impact on the reliability of Jaccard distance. In particular, Jaccard\ndistance calculates the distance based on the overlap of relevant neighbors.\nDue to camera variation, intra-camera samples dominate the relevant neighbors,\nwhich reduces the reliability of the neighbors by introducing intra-camera\nnegative samples and excluding inter-camera positive samples. To overcome this\nproblem, we propose a novel camera-aware Jaccard (CA-Jaccard) distance that\nleverages camera information to enhance the reliability of Jaccard distance.\nSpecifically, we design camera-aware k-reciprocal nearest neighbors (CKRNNs) to\nfind k-reciprocal nearest neighbors on the intra-camera and inter-camera\nranking lists, which improves the reliability of relevant neighbors and\nguarantees the contribution of inter-camera samples in the overlap. Moreover,\nwe propose a camera-aware local query expansion (CLQE) to mine reliable samples\nin relevant neighbors by exploiting camera variation as a strong constraint and\nassign these samples higher weights in overlap, further improving the\nreliability. Our CA-Jaccard distance is simple yet effective and can serve as a\ngeneral distance metric for person re-ID methods with high reliability and low\ncomputational cost. Extensive experiments demonstrate the effectiveness of our\nmethod.\n","authors":["Yiyu Chen","Zheyi Fan","Zhaoru Chen","Yixuan Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.10605v2.pdf","comment":"This paper is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2309.04190v4","updated":"2024-04-08T10:57:42Z","published":"2023-09-08T08:03:42Z","title":"SegmentAnything helps microscopy images based automatic and quantitative\n organoid detection and analysis","summary":" Organoids are self-organized 3D cell clusters that closely mimic the\narchitecture and function of in vivo tissues and organs. Quantification of\norganoid morphology helps in studying organ development, drug discovery, and\ntoxicity assessment. Recent microscopy techniques provide a potent tool to\nacquire organoid morphology features, but manual image analysis remains a labor\nand time-intensive process. Thus, this paper proposes a comprehensive pipeline\nfor microscopy analysis that leverages the SegmentAnything to precisely\ndemarcate individual organoids. Additionally, we introduce a set of\nmorphological properties, including perimeter, area, radius, non-smoothness,\nand non-circularity, allowing researchers to analyze the organoid structures\nquantitatively and automatically. To validate the effectiveness of our\napproach, we conducted tests on bright-field images of human induced\npluripotent stem cells (iPSCs) derived neural-epithelial (NE) organoids. 
The\nresults obtained from our automatic pipeline closely align with manual organoid\ndetection and measurement, showcasing the capability of our proposed method in\naccelerating organoids morphology analysis.\n","authors":["Xiaodan Xing","Chunling Tang","Yunzhe Guo","Nicholas Kurniawan","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2309.04190v4.pdf","comment":"Replace Figure 4 with the correct version. The original version is\n wrong due to a column name mismatch"},{"id":"http://arxiv.org/abs/2404.05393v1","updated":"2024-04-08T10:52:29Z","published":"2024-04-08T10:52:29Z","title":"PAT: Pixel-wise Adaptive Training for Long-tailed Segmentation","summary":" Beyond class frequency, we recognize the impact of class-wise relationships\namong various class-specific predictions and the imbalance in label masks on\nlong-tailed segmentation learning. To address these challenges, we propose an\ninnovative Pixel-wise Adaptive Training (PAT) technique tailored for\nlong-tailed segmentation. PAT has two key features: 1) class-wise gradient\nmagnitude homogenization, and 2) pixel-wise class-specific loss adaptation\n(PCLA). First, the class-wise gradient magnitude homogenization helps alleviate\nthe imbalance among label masks by ensuring equal consideration of the\nclass-wise impact on model updates. Second, PCLA tackles the detrimental impact\nof both rare classes within the long-tailed distribution and inaccurate\npredictions from previous training stages by encouraging learning classes with\nlow prediction confidence and guarding against forgetting classes with high\nconfidence. This combined approach fosters robust learning while preventing the\nmodel from forgetting previously learned knowledge. PAT exhibits significant\nperformance improvements, surpassing the current state-of-the-art by 2.2% in\nthe NyU dataset. Moreover, it enhances overall pixel-wise accuracy by 2.85% and\nintersection over union value by 2.07%, with a particularly notable declination\nof 0.39% in detecting rare classes compared to Balance Logits Variation, as\ndemonstrated on the three popular datasets, i.e., OxfordPetIII, CityScape, and\nNYU.\n","authors":["Khoi Do","Duong Nguyen","Nguyen H. Tran","Viet Dung Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05392v1","updated":"2024-04-08T10:51:29Z","published":"2024-04-08T10:51:29Z","title":"T-DEED: Temporal-Discriminability Enhancer Encoder-Decoder for Precise\n Event Spotting in Sports Videos","summary":" In this paper, we introduce T-DEED, a Temporal-Discriminability Enhancer\nEncoder-Decoder for Precise Event Spotting in sports videos. T-DEED addresses\nmultiple challenges in the task, including the need for discriminability among\nframe representations, high output temporal resolution to maintain prediction\nprecision, and the necessity to capture information at different temporal\nscales to handle events with varying dynamics. It tackles these challenges\nthrough its specifically designed architecture, featuring an encoder-decoder\nfor leveraging multiple temporal scales and achieving high output temporal\nresolution, along with temporal modules designed to increase token\ndiscriminability. Leveraging these characteristics, T-DEED achieves SOTA\nperformance on the FigureSkating and FineDiving datasets.\n","authors":["Artur Xarles","Sergio Escalera","Thomas B. 
Moeslund","Albert Clapés"],"pdf_url":"https://arxiv.org/pdf/2404.05392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15288v2","updated":"2024-04-08T10:48:22Z","published":"2023-12-23T16:05:47Z","title":"Understanding normalization in contrastive representation learning and\n out-of-distribution detection","summary":" Contrastive representation learning has emerged as an outstanding approach\nfor anomaly detection. In this work, we explore the $\\ell_2$-norm of\ncontrastive features and its applications in out-of-distribution detection. We\npropose a simple method based on contrastive learning, which incorporates\nout-of-distribution data by discriminating against normal samples in the\ncontrastive layer space. Our approach can be applied flexibly as an outlier\nexposure (OE) approach, where the out-of-distribution data is a huge collective\nof random images, or as a fully self-supervised learning approach, where the\nout-of-distribution data is self-generated by applying distribution-shifting\ntransformations. The ability to incorporate additional out-of-distribution\nsamples enables a feasible solution for datasets where AD methods based on\ncontrastive learning generally underperform, such as aerial images or\nmicroscopy images. Furthermore, the high-quality features learned through\ncontrastive learning consistently enhance performance in OE scenarios, even\nwhen the available out-of-distribution dataset is not diverse enough. Our\nextensive experiments demonstrate the superiority of our proposed method under\nvarious scenarios, including unimodal and multimodal settings, with various\nimage datasets.\n","authors":["Tai Le-Gia","Jaehyun Ahn"],"pdf_url":"https://arxiv.org/pdf/2312.15288v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05384v1","updated":"2024-04-08T10:45:29Z","published":"2024-04-08T10:45:29Z","title":"Rethinking the Spatial Inconsistency in Classifier-Free Diffusion\n Guidance","summary":" Classifier-Free Guidance (CFG) has been widely used in text-to-image\ndiffusion models, where the CFG scale is introduced to control the strength of\ntext guidance on the whole image space. However, we argue that a global CFG\nscale results in spatial inconsistency on varying semantic strengths and\nsuboptimal image quality. To address this problem, we present a novel approach,\nSemantic-aware Classifier-Free Guidance (S-CFG), to customize the guidance\ndegrees for different semantic units in text-to-image diffusion models.\nSpecifically, we first design a training-free semantic segmentation method to\npartition the latent image into relatively independent semantic regions at each\ndenoising step. In particular, the cross-attention map in the denoising U-net\nbackbone is renormalized for assigning each patch to the corresponding token,\nwhile the self-attention map is used to complete the semantic regions. Then, to\nbalance the amplification of diverse semantic units, we adaptively adjust the\nCFG scales across different semantic regions to rescale the text guidance\ndegrees into a uniform level. Finally, extensive experiments demonstrate the\nsuperiority of S-CFG over the original CFG strategy on various text-to-image\ndiffusion models, without requiring any extra training cost. 
Our codes are\navailable at https://github.com/SmilesDZgk/S-CFG.\n","authors":["Dazhong Shen","Guanglu Song","Zeyue Xue","Fu-Yun Wang","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.05384v1.pdf","comment":"accepted by CVPR-2024"},{"id":"http://arxiv.org/abs/2305.15873v2","updated":"2024-04-08T10:28:38Z","published":"2023-05-25T09:09:32Z","title":"Confronting Ambiguity in 6D Object Pose Estimation via Score-Based\n Diffusion on SE(3)","summary":" Addressing pose ambiguity in 6D object pose estimation from single RGB images\npresents a significant challenge, particularly due to object symmetries or\nocclusions. In response, we introduce a novel score-based diffusion method\napplied to the $SE(3)$ group, marking the first application of diffusion models\nto $SE(3)$ within the image domain, specifically tailored for pose estimation\ntasks. Extensive evaluations demonstrate the method's efficacy in handling pose\nambiguity, mitigating perspective-induced ambiguity, and showcasing the\nrobustness of our surrogate Stein score formulation on $SE(3)$. This\nformulation not only improves the convergence of the denoising process but also\nenhances computational efficiency. Thus, we pioneer a promising strategy for 6D\nobject pose estimation.\n","authors":["Tsu-Ching Hsiao","Hao-Wei Chen","Hsuan-Kung Yang","Chun-Yi Lee"],"pdf_url":"https://arxiv.org/pdf/2305.15873v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.05366v1","updated":"2024-04-08T10:05:24Z","published":"2024-04-08T10:05:24Z","title":"CDAD-Net: Bridging Domain Gaps in Generalized Category Discovery","summary":" In Generalized Category Discovery (GCD), we cluster unlabeled samples of\nknown and novel classes, leveraging a training dataset of known classes. A\nsalient challenge arises due to domain shifts between these datasets. To\naddress this, we present a novel setting: Across Domain Generalized Category\nDiscovery (AD-GCD) and bring forth CDAD-NET (Class Discoverer Across Domains)\nas a remedy. CDAD-NET is architected to synchronize potential known class\nsamples across both the labeled (source) and unlabeled (target) datasets, while\nemphasizing the distinct categorization of the target data. To facilitate this,\nwe propose an entropy-driven adversarial learning strategy that accounts for\nthe distance distributions of target samples relative to source-domain class\nprototypes. Parallelly, the discriminative nature of the shared space is upheld\nthrough a fusion of three metric learning objectives. In the source domain, our\nfocus is on refining the proximity between samples and their affiliated class\nprototypes, while in the target domain, we integrate a neighborhood-centric\ncontrastive learning mechanism, enriched with an adept neighbors-mining\napproach. 
To further accentuate the nuanced feature interrelation among\nsemantically aligned images, we champion the concept of conditional image\ninpainting, underscoring the premise that semantically analogous images prove\nmore efficacious to the task than their disjointed counterparts.\nExperimentally, CDAD-NET eclipses existing literature with a performance\nincrement of 8-15% on three AD-GCD benchmarks we present.\n","authors":["Sai Bhargav Rongali","Sarthak Mehrotra","Ankit Jha","Mohamad Hassan N C","Shirsha Bose","Tanisha Gupta","Mainak Singha","Biplab Banerjee"],"pdf_url":"https://arxiv.org/pdf/2404.05366v1.pdf","comment":"Accepted in L3D-IVU, CVPR Workshop, 2024"},{"id":"http://arxiv.org/abs/2308.13888v3","updated":"2024-04-08T10:04:29Z","published":"2023-08-26T14:12:19Z","title":"Neural Implicit Morphing of Face Images","summary":" Face morphing is a problem in computer graphics with numerous artistic and\nforensic applications. It is challenging due to variations in pose, lighting,\ngender, and ethnicity. This task consists of a warping for feature alignment\nand a blending for a seamless transition between the warped images. We propose\nto leverage coord-based neural networks to represent such warpings and\nblendings of face images. During training, we exploit the smoothness and\nflexibility of such networks by combining energy functionals employed in\nclassical approaches without discretizations. Additionally, our method is\ntime-dependent, allowing a continuous warping/blending of the images. During\nmorphing inference, we need both direct and inverse transformations of the\ntime-dependent warping. The first (second) is responsible for warping the\ntarget (source) image into the source (target) image. Our neural warping stores\nthose maps in a single network dismissing the need for inverting them. The\nresults of our experiments indicate that our method is competitive with both\nclassical and generative models under the lens of image quality and\nface-morphing detectors. Aesthetically, the resulting images present a seamless\nblending of diverse faces not yet usual in the literature.\n","authors":["Guilherme Schardong","Tiago Novello","Hallison Paz","Iurii Medvedev","Vinícius da Silva","Luiz Velho","Nuno Gonçalves"],"pdf_url":"https://arxiv.org/pdf/2308.13888v3.pdf","comment":"14 pages, 20 figures, accepted for CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05362v1","updated":"2024-04-08T09:54:28Z","published":"2024-04-08T09:54:28Z","title":"Multi-head Attention-based Deep Multiple Instance Learning","summary":" This paper introduces MAD-MIL, a Multi-head Attention-based Deep Multiple\nInstance Learning model, designed for weakly supervised Whole Slide Images\n(WSIs) classification in digital pathology. Inspired by the multi-head\nattention mechanism of the Transformer, MAD-MIL simplifies model complexity\nwhile achieving competitive results against advanced models like CLAM and\nDS-MIL. Evaluated on the MNIST-BAGS and public datasets, including TUPAC16,\nTCGA BRCA, TCGA LUNG, and TCGA KIDNEY, MAD-MIL consistently outperforms ABMIL.\nThis demonstrates enhanced information diversity, interpretability, and\nefficiency in slide representation. The model's effectiveness, coupled with\nfewer trainable parameters and lower computational complexity makes it a\npromising solution for automated pathology workflows. 
Our code is available at\nhttps://github.com/tueimage/MAD-MIL.\n","authors":["Hassan Keshvarikhojasteh","Josien Pluim","Mitko Veta"],"pdf_url":"https://arxiv.org/pdf/2404.05362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.01585v3","updated":"2024-04-08T09:53:27Z","published":"2023-02-03T07:35:53Z","title":"SegForestNet: Spatial-Partitioning-Based Aerial Image Segmentation","summary":" Aerial image segmentation is the basis for applications such as automatically\ncreating maps or tracking deforestation. In true orthophotos, which are often\nused in these applications, many objects and regions can be approximated well\nby polygons. However, this fact is rarely exploited by state-of-the-art\nsemantic segmentation models. Instead, most models allow unnecessary degrees of\nfreedom in their predictions by allowing arbitrary region shapes. We therefore\npresent a refinement of our deep learning model which predicts binary space\npartitioning trees, an efficient polygon representation. The refinements\ninclude a new feature decoder architecture and a new differentiable BSP tree\nrenderer which both avoid vanishing gradients. Additionally, we designed a\nnovel loss function specifically designed to improve the spatial partitioning\ndefined by the predicted trees. Furthermore, our expanded model can predict\nmultiple trees at once and thus can predict class-specific segmentations. As an\nadditional contribution, we investigate the impact of a non-optimal training\nprocess in comparison to an optimized training process. While model\narchitectures optimized for aerial images, such as PFNet or our own model, show\nan advantage under non-optimal conditions, this advantage disappears under\noptimal training conditions. Despite this observation, our model still makes\nbetter predictions for small rectangular objects, e.g., cars.\n","authors":["Daniel Gritzner","Jörn Ostermann"],"pdf_url":"https://arxiv.org/pdf/2302.01585v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05357v1","updated":"2024-04-08T09:48:02Z","published":"2024-04-08T09:48:02Z","title":"CNN-based Game State Detection for a Foosball Table","summary":" The automation of games using Deep Reinforcement Learning Strategies (DRL) is\na well-known challenge in AI research. While for feature extraction in a video\ngame typically the whole image is used, this is hardly practical for many real\nworld games. Instead, using a smaller game state reducing the dimension of the\nparameter space to include essential parameters only seems to be a promising\napproach. In the game of Foosball, a compact and comprehensive game state\ndescription consists of the positional shifts and rotations of the figures and\nthe position of the ball over time. In particular, velocities and accelerations\ncan be derived from consecutive time samples of the game state. In this paper,\na figure detection system to determine the game state in Foosball is presented.\nWe capture a dataset containing the rotations of the rods which were measured\nusing accelerometers and the positional shifts were derived using traditional\nComputer Vision techniques (in a laboratory setting). This dataset is utilized\nto train Convolutional Neural Network (CNN) based end-to-end regression models\nto predict the rotations and shifts of each rod. We present an evaluation of\nour system using different state-of-the-art CNNs as base architectures for the\nregression model. We show that our system is able to predict the game state\nwith high accuracy. 
By providing data for both black and white teams, the\npresented system is intended to provide the required data for future\ndevelopments of Imitation Learning techniques w.r.t. to observing human\nplayers.\n","authors":["David Hagens","Jan Knaup","Elke Hergenröther","Andreas Weinmann"],"pdf_url":"https://arxiv.org/pdf/2404.05357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05348v1","updated":"2024-04-08T09:33:40Z","published":"2024-04-08T09:33:40Z","title":"Iterative Refinement Strategy for Automated Data Labeling: Facial\n Landmark Diagnosis in Medical Imaging","summary":" Automated data labeling techniques are crucial for accelerating the\ndevelopment of deep learning models, particularly in complex medical imaging\napplications. However, ensuring accuracy and efficiency remains challenging.\nThis paper presents iterative refinement strategies for automated data labeling\nin facial landmark diagnosis to enhance accuracy and efficiency for deep\nlearning models in medical applications, including dermatology, plastic\nsurgery, and ophthalmology. Leveraging feedback mechanisms and advanced\nalgorithms, our approach iteratively refines initial labels, reducing reliance\non manual intervention while improving label quality. Through empirical\nevaluation and case studies, we demonstrate the effectiveness of our proposed\nstrategies in deep learning tasks across medical imaging domains. Our results\nhighlight the importance of iterative refinement in automated data labeling to\nenhance the capabilities of deep learning systems in medical imaging\napplications.\n","authors":["Yu-Hsi Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13263v2","updated":"2024-04-08T09:31:33Z","published":"2023-06-23T02:19:52Z","title":"Synthetic data shuffling accelerates the convergence of federated\n learning under data heterogeneity","summary":" In federated learning, data heterogeneity is a critical challenge. A\nstraightforward solution is to shuffle the clients' data to homogenize the\ndistribution. However, this may violate data access rights, and how and when\nshuffling can accelerate the convergence of a federated optimization algorithm\nis not theoretically well understood. In this paper, we establish a precise and\nquantifiable correspondence between data heterogeneity and parameters in the\nconvergence rate when a fraction of data is shuffled across clients. We prove\nthat shuffling can quadratically reduce the gradient dissimilarity with respect\nto the shuffling percentage, accelerating convergence. Inspired by the theory,\nwe propose a practical approach that addresses the data access rights issue by\nshuffling locally generated synthetic data. The experimental results show that\nshuffling synthetic data improves the performance of multiple existing\nfederated learning algorithms by a large margin.\n","authors":["Bo Li","Yasin Esfandiari","Mikkel N. Schmidt","Tommy S. Alstrøm","Sebastian U. 
Stich"],"pdf_url":"https://arxiv.org/pdf/2306.13263v2.pdf","comment":"Accepted at TMLR"},{"id":"http://arxiv.org/abs/2404.05341v1","updated":"2024-04-08T09:27:42Z","published":"2024-04-08T09:27:42Z","title":"Comparative Analysis of Image Enhancement Techniques for Brain Tumor\n Segmentation: Contrast, Histogram, and Hybrid Approaches","summary":" This study systematically investigates the impact of image enhancement\ntechniques on Convolutional Neural Network (CNN)-based Brain Tumor\nSegmentation, focusing on Histogram Equalization (HE), Contrast Limited\nAdaptive Histogram Equalization (CLAHE), and their hybrid variations. Employing\nthe U-Net architecture on a dataset of 3064 Brain MRI images, the research\ndelves into preprocessing steps, including resizing and enhancement, to\noptimize segmentation accuracy. A detailed analysis of the CNN-based U-Net\narchitecture, training, and validation processes is provided. The comparative\nanalysis, utilizing metrics such as Accuracy, Loss, MSE, IoU, and DSC, reveals\nthat the hybrid approach CLAHE-HE consistently outperforms others. Results\nhighlight its superior accuracy (0.9982, 0.9939, 0.9936 for training, testing,\nand validation, respectively) and robust segmentation overlap, with Jaccard\nvalues of 0.9862, 0.9847, and 0.9864, and Dice values of 0.993, 0.9923, and\n0.9932 for the same phases, emphasizing its potential in neuro-oncological\napplications. The study concludes with a call for refinement in segmentation\nmethodologies to further enhance diagnostic precision and treatment planning in\nneuro-oncology.\n","authors":["Shoffan Saifullah","Andri Pranolo","Rafał Dreżewski"],"pdf_url":"https://arxiv.org/pdf/2404.05341v1.pdf","comment":"9 Pages, & Figures, 2 Tables, International Conference on Computer\n Science Electronics and Information (ICCSEI 2023)"},{"id":"http://arxiv.org/abs/2404.05331v1","updated":"2024-04-08T09:18:32Z","published":"2024-04-08T09:18:32Z","title":"Mask-ControlNet: Higher-Quality Image Generation with An Additional Mask\n Prompt","summary":" Text-to-image generation has witnessed great progress, especially with the\nrecent advancements in diffusion models. Since texts cannot provide detailed\nconditions like object appearance, reference images are usually leveraged for\nthe control of objects in the generated images. However, existing methods still\nsuffer limited accuracy when the relationship between the foreground and\nbackground is complicated. To address this issue, we develop a framework termed\nMask-ControlNet by introducing an additional mask prompt. Specifically, we\nfirst employ large vision models to obtain masks to segment the objects of\ninterest in the reference image. Then, the object images are employed as\nadditional prompts to facilitate the diffusion model to better understand the\nrelationship between foreground and background regions during image generation.\nExperiments show that the mask prompts enhance the controllability of the\ndiffusion model to maintain higher fidelity to the reference image while\nachieving better image quality. 
Comparison with previous text-to-image\ngeneration methods demonstrates our method's superior quantitative and\nqualitative performance on the benchmark datasets.\n","authors":["Zhiqi Huang","Huixin Xiong","Haoyu Wang","Longguang Wang","Zhiheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.05331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05317v1","updated":"2024-04-08T09:08:43Z","published":"2024-04-08T09:08:43Z","title":"WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A\n Conceptual Architecture","summary":" This work proposes a WebXR-based cross-platform conceptual architecture,\nleveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate\nthe development of an open, accessible, and interoperable metaverse. By\nintroducing the concept of spatial web app, this research contributes to the\ndiscourse on the metaverse, offering an architecture that democratizes access\nto virtual environments and extended reality through the web, and aligns with\nTim Berners-Lee's original vision of the World Wide Web as an open platform in\nthe digital realm.\n","authors":["Giuseppe Macario"],"pdf_url":"https://arxiv.org/pdf/2404.05317v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2303.12017v2","updated":"2024-04-08T09:02:40Z","published":"2023-03-21T16:54:01Z","title":"Learning Optical Flow and Scene Flow with Bidirectional Camera-LiDAR\n Fusion","summary":" In this paper, we study the problem of jointly estimating the optical flow\nand scene flow from synchronized 2D and 3D data. Previous methods either employ\na complex pipeline that splits the joint task into independent stages, or fuse\n2D and 3D information in an ``early-fusion'' or ``late-fusion'' manner. Such\none-size-fits-all approaches suffer from a dilemma of failing to fully utilize\nthe characteristic of each modality or to maximize the inter-modality\ncomplementarity. To address the problem, we propose a novel end-to-end\nframework, which consists of 2D and 3D branches with multiple bidirectional\nfusion connections between them in specific layers. Different from previous\nwork, we apply a point-based 3D branch to extract the LiDAR features, as it\npreserves the geometric structure of point clouds. To fuse dense image features\nand sparse point features, we propose a learnable operator named bidirectional\ncamera-LiDAR fusion module (Bi-CLFM). We instantiate two types of the\nbidirectional fusion pipeline, one based on the pyramidal coarse-to-fine\narchitecture (dubbed CamLiPWC), and the other one based on the recurrent\nall-pairs field transforms (dubbed CamLiRAFT). On FlyingThings3D, both CamLiPWC\nand CamLiRAFT surpass all existing methods and achieve up to a 47.9\\% reduction\nin 3D end-point-error from the best published result. Our best-performing\nmodel, CamLiRAFT, achieves an error of 4.26\\% on the KITTI Scene Flow\nbenchmark, ranking 1st among all submissions with much fewer parameters.\nBesides, our methods have strong generalization performance and the ability to\nhandle non-rigid motion. 
Code is available at\nhttps://github.com/MCG-NJU/CamLiFlow.\n","authors":["Haisong Liu","Tao Lu","Yihui Xu","Jia Liu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2303.12017v2.pdf","comment":"Accepted to TPAMI 2023"},{"id":"http://arxiv.org/abs/2404.05309v1","updated":"2024-04-08T08:57:32Z","published":"2024-04-08T08:57:32Z","title":"CLIPping the Limits: Finding the Sweet Spot for Relevant Images in\n Automated Driving Systems Perception Testing","summary":" Perception systems, especially cameras, are the eyes of automated driving\nsystems. Ensuring that they function reliably and robustly is therefore an\nimportant building block in the automation of vehicles. There are various\napproaches to test the perception of automated driving systems. Ultimately,\nhowever, it always comes down to the investigation of the behavior of\nperception systems under specific input data. Camera images are a crucial part\nof the input data. Image data sets are therefore collected for the testing of\nautomated driving systems, but it is non-trivial to find specific images in\nthese data sets. Thanks to recent developments in neural networks, there are\nnow methods for sorting the images in a data set according to their similarity\nto a prompt in natural language. In order to further automate the provision of\nsearch results, we make a contribution by automating the threshold definition\nin these sorted results and returning only the images relevant to the prompt as\na result. Our focus is on preventing false positives and false negatives\nequally. It is also important that our method is robust and in the case that\nour assumptions are not fulfilled, we provide a fallback solution.\n","authors":["Philipp Rigoll","Laurenz Adolph","Lennart Ries","Eric Sax"],"pdf_url":"https://arxiv.org/pdf/2404.05309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05307v1","updated":"2024-04-08T08:53:54Z","published":"2024-04-08T08:53:54Z","title":"Human Detection from 4D Radar Data in Low-Visibility Field Conditions","summary":" Autonomous driving technology is increasingly being used on public roads and\nin industrial settings such as mines. While it is essential to detect\npedestrians, vehicles, or other obstacles, adverse field conditions negatively\naffect the performance of classical sensors such as cameras or lidars. Radar,\non the other hand, is a promising modality that is less affected by, e.g.,\ndust, smoke, water mist or fog. In particular, modern 4D imaging radars provide\ntarget responses across the range, vertical angle, horizontal angle and Doppler\nvelocity dimensions. We propose TMVA4D, a CNN architecture that leverages this\n4D radar modality for semantic segmentation. The CNN is trained to distinguish\nbetween the background and person classes based on a series of 2D projections\nof the 4D radar data that include the elevation, azimuth, range, and Doppler\nvelocity dimensions. We also outline the process of compiling a novel dataset\nconsisting of data collected in industrial settings with a car-mounted 4D radar\nand describe how the ground-truth labels were generated from reference thermal\nimages. 
Using TMVA4D on this dataset, we achieve an mIoU score of 78.2% and an\nmDice score of 86.1%, evaluated on the two classes background and person.\n","authors":["Mikael Skog","Oleksandr Kotlyar","Vladimír Kubelka","Martin Magnusson"],"pdf_url":"https://arxiv.org/pdf/2404.05307v1.pdf","comment":"Submitted to Radar in Robotics workshop at ICRA 2024"},{"id":"http://arxiv.org/abs/2404.05300v1","updated":"2024-04-08T08:42:47Z","published":"2024-04-08T08:42:47Z","title":"Texture Classification Network Integrating Adaptive Wavelet Transform","summary":" Graves' disease is a common condition that is diagnosed clinically by\ndetermining the smoothness of the thyroid texture and its morphology in\nultrasound images. Currently, the most widely used approach for the automated\ndiagnosis of Graves' disease utilizes Convolutional Neural Networks (CNNs) for\nboth feature extraction and classification. However, these methods demonstrate\nlimited efficacy in capturing texture features. Given the high capacity of\nwavelets in describing texture features, this research integrates learnable\nwavelet modules utilizing the Lifting Scheme into CNNs and incorporates a\nparallel wavelet branch into the ResNet18 model to enhance texture feature\nextraction. Our model can analyze texture features in spatial and frequency\ndomains simultaneously, leading to optimized classification accuracy. We\nconducted experiments on collected ultrasound datasets and publicly available\nnatural image texture datasets; our proposed network achieved 97.27% accuracy\nand 95.60% recall on ultrasound datasets, 60.765% accuracy on natural image\ntexture datasets, surpassing the accuracy of ResNet and confirming the\neffectiveness of our approach.\n","authors":["Su-Xi Yu","Jing-Yuan He","Yi Wang","Yu-Jiao Cai","Jun Yang","Bo Lin","Wei-Bin Yang","Jian Ruan"],"pdf_url":"https://arxiv.org/pdf/2404.05300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05290v1","updated":"2024-04-08T08:28:19Z","published":"2024-04-08T08:28:19Z","title":"MindSet: Vision. A toolbox for testing DNNs on key psychological\n experiments","summary":" Multiple benchmarks have been developed to assess the alignment between deep\nneural networks (DNNs) and human vision. In almost all cases these benchmarks\nare observational in the sense they are composed of behavioural and brain\nresponses to naturalistic images that have not been manipulated to test\nhypotheses regarding how DNNs or humans perceive and identify objects. Here we\nintroduce the toolbox MindSet: Vision, consisting of a collection of image\ndatasets and related scripts designed to test DNNs on 30 psychological\nfindings. In all experimental conditions, the stimuli are systematically\nmanipulated to test specific hypotheses regarding human visual perception and\nobject recognition. In addition to providing pre-generated datasets of images,\nwe provide code to regenerate these datasets, offering many configurable\nparameters which greatly extend the dataset versatility for different research\ncontexts, and code to facilitate the testing of DNNs on these image datasets\nusing three different methods (similarity judgments, out-of-distribution\nclassification, and decoder method), accessible at\nhttps://github.com/MindSetVision/mindset-vision. We test ResNet-152 on each of\nthese methods as an example of how the toolbox can be used.\n","authors":["Valerio Biscione","Dong Yin","Gaurav Malhotra","Marin Dujmovic","Milton L. Montero","Guillermo Puebla","Federico Adolfi","Rachel F. Heaton","John E. 
Hummel","Benjamin D. Evans","Karim Habashy","Jeffrey S. Bowers"],"pdf_url":"https://arxiv.org/pdf/2404.05290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05285v1","updated":"2024-04-08T08:20:53Z","published":"2024-04-08T08:20:53Z","title":"Detecting Every Object from Events","summary":" Object detection is critical in autonomous driving, and it is more practical\nyet challenging to localize objects of unknown categories: an endeavour known\nas Class-Agnostic Object Detection (CAOD). Existing studies on CAOD\npredominantly rely on ordinary cameras, but these frame-based sensors usually\nhave high latency and limited dynamic range, leading to safety risks in\nreal-world scenarios. In this study, we turn to a new modality enabled by the\nso-called event camera, featured by its sub-millisecond latency and high\ndynamic range, for robust CAOD. We propose Detecting Every Object in Events\n(DEOE), an approach tailored for achieving high-speed, class-agnostic\nopen-world object detection in event-based vision. Built upon the fast\nevent-based backbone: recurrent vision transformer, we jointly consider the\nspatial and temporal consistencies to identify potential objects. The\ndiscovered potential objects are assimilated as soft positive samples to avoid\nbeing suppressed as background. Moreover, we introduce a disentangled\nobjectness head to separate the foreground-background classification and novel\nobject discovery tasks, enhancing the model's generalization in localizing\nnovel objects while maintaining a strong ability to filter out the background.\nExtensive experiments confirm the superiority of our proposed DEOE in\ncomparison with three strong baseline methods that integrate the\nstate-of-the-art event-based object detector with advancements in RGB-based\nCAOD. Our code is available at https://github.com/Hatins/DEOE.\n","authors":["Haitian Zhang","Chang Xu","Xinya Wang","Bingde Liu","Guang Hua","Lei Yu","Wen Yang"],"pdf_url":"https://arxiv.org/pdf/2404.05285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19428v3","updated":"2024-04-08T08:18:33Z","published":"2024-03-28T13:58:05Z","title":"Burst Super-Resolution with Diffusion Models for Improving Perceptual\n Quality","summary":" While burst LR images are useful for improving the SR image quality compared\nwith a single LR image, prior SR networks accepting the burst LR images are\ntrained in a deterministic manner, which is known to produce a blurry SR image.\nIn addition, it is difficult to perfectly align the burst LR images, making the\nSR image more blurry. Since such blurry images are perceptually degraded, we\naim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity\nimages can be reconstructed by diffusion models. However, prior SR methods\nusing the diffusion model are not properly optimized for the burst SR task.\nSpecifically, the reverse process starting from a random sample is not\noptimized for image enhancement and restoration methods, including burst SR. In\nour proposed method, on the other hand, burst LR features are used to\nreconstruct the initial burst SR image that is fed into an intermediate step in\nthe diffusion model. This reverse process from the intermediate step 1) skips\ndiffusion steps for reconstructing the global structure of the image and 2)\nfocuses on steps for refining detailed textures. Our experimental results\ndemonstrate that our method can improve the scores of the perceptual quality\nmetrics. 
Code: https://github.com/placerkyo/BSRD\n","authors":["Kyotaro Tokoro","Kazutoshi Akita","Norimichi Ukita"],"pdf_url":"https://arxiv.org/pdf/2403.19428v3.pdf","comment":"Accepted to IJCNN 2024 (International Joint Conference on Neural\n Networks)"},{"id":"http://arxiv.org/abs/2404.05280v1","updated":"2024-04-08T08:11:56Z","published":"2024-04-08T08:11:56Z","title":"MOSE: Boosting Vision-based Roadside 3D Object Detection with Scene Cues","summary":" 3D object detection based on roadside cameras is an additional way for\nautonomous driving to alleviate the challenges of occlusion and short\nperception range from vehicle cameras. Previous methods for roadside 3D object\ndetection mainly focus on modeling the depth or height of objects, neglecting\nthe stationary of cameras and the characteristic of inter-frame consistency. In\nthis work, we propose a novel framework, namely MOSE, for MOnocular 3D object\ndetection with Scene cuEs. The scene cues are the frame-invariant\nscene-specific features, which are crucial for object localization and can be\nintuitively regarded as the height between the surface of the real road and the\nvirtual ground plane. In the proposed framework, a scene cue bank is designed\nto aggregate scene cues from multiple frames of the same scene with a carefully\ndesigned extrinsic augmentation strategy. Then, a transformer-based decoder\nlifts the aggregated scene cues as well as the 3D position embeddings for 3D\nobject location, which boosts generalization ability in heterologous scenes.\nThe extensive experiment results on two public benchmarks demonstrate the\nstate-of-the-art performance of the proposed method, which surpasses the\nexisting methods by a large margin.\n","authors":["Xiahan Chen","Mingjian Chen","Sanli Tang","Yi Niu","Jiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.05280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00916v2","updated":"2024-04-08T08:08:43Z","published":"2024-04-01T04:43:45Z","title":"Gyro-based Neural Single Image Deblurring","summary":" In this paper, we present GyroDeblurNet, a novel single image deblurring\nmethod that utilizes a gyro sensor to effectively resolve the ill-posedness of\nimage deblurring. The gyro sensor provides valuable information about camera\nmotion during exposure time that can significantly improve deblurring quality.\nHowever, effectively exploiting real-world gyro data is challenging due to\nsignificant errors from various sources including sensor noise, the disparity\nbetween the positions of a camera module and a gyro sensor, the absence of\ntranslational motion information, and moving objects whose motions cannot be\ncaptured by a gyro sensor. To handle gyro error, GyroDeblurNet is equipped with\ntwo novel neural network blocks: a gyro refinement block and a gyro deblurring\nblock. The gyro refinement block refines the error-ridden gyro data using the\nblur information from the input image. On the other hand, the gyro deblurring\nblock removes blur from the input image using the refined gyro data and further\ncompensates for gyro error by leveraging the blur information from the input\nimage. For training a neural network with erroneous gyro data, we propose a\ntraining strategy based on the curriculum learning. We also introduce a novel\ngyro data embedding scheme to represent real-world intricate camera shakes.\nFinally, we present a synthetic dataset and a real dataset for the training and\nevaluation of gyro-based single image deblurring. 
Our experiments demonstrate\nthat our approach achieves state-of-the-art deblurring quality by effectively\nutilizing erroneous gyro data.\n","authors":["Heemin Yang","Jaesung Rim","Seungyong Lee","Seung-Hwan Baek","Sunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2404.00916v2.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.05274v1","updated":"2024-04-08T08:04:44Z","published":"2024-04-08T08:04:44Z","title":"Deep Optics for Video Snapshot Compressive Imaging","summary":" Video snapshot compressive imaging (SCI) aims to capture a sequence of video\nframes with only a single shot of a 2D detector, whose backbones rest in\noptical modulation patterns (also known as masks) and a computational\nreconstruction algorithm. Advanced deep learning algorithms and mature hardware\nare putting video SCI into practical applications. Yet, there are two clouds in\nthe sunshine of SCI: i) low dynamic range as a victim of high temporal\nmultiplexing, and ii) existing deep learning algorithms' degradation on real\nsystem. To address these challenges, this paper presents a deep optics\nframework to jointly optimize masks and a reconstruction network. Specifically,\nwe first propose a new type of structural mask to realize motion-aware and\nfull-dynamic-range measurement. Considering the motion awareness property in\nmeasurement domain, we develop an efficient network for video SCI\nreconstruction using Transformer to capture long-term temporal dependencies,\ndubbed Res2former. Moreover, sensor response is introduced into the forward\nmodel of video SCI to guarantee end-to-end model training close to real system.\nFinally, we implement the learned structural masks on a digital micro-mirror\ndevice. Experimental results on synthetic and real data validate the\neffectiveness of the proposed framework. We believe this is a milestone for\nreal-world video SCI. The source code and data are available at\nhttps://github.com/pwangcs/DeepOpticsSCI.\n","authors":["Ping Wang","Lishun Wang","Xin Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.05274v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2404.05268v1","updated":"2024-04-08T07:59:04Z","published":"2024-04-08T07:59:04Z","title":"MC$^2$: Multi-concept Guidance for Customized Multi-concept Generation","summary":" Customized text-to-image generation aims to synthesize instantiations of\nuser-specified concepts and has achieved unprecedented progress in handling\nindividual concept. However, when extending to multiple customized concepts,\nexisting methods exhibit limitations in terms of flexibility and fidelity, only\naccommodating the combination of limited types of models and potentially\nresulting in a mix of characteristics from different concepts. In this paper,\nwe introduce the Multi-concept guidance for Multi-concept customization, termed\nMC$^2$, for improved flexibility and fidelity. MC$^2$ decouples the\nrequirements for model architecture via inference time optimization, allowing\nthe integration of various heterogeneous single-concept customized models. It\nadaptively refines the attention weights between visual and textual tokens,\ndirecting image regions to focus on their associated words while diminishing\nthe impact of irrelevant ones. Extensive experiments demonstrate that MC$^2$\neven surpasses previous methods that require additional training in terms of\nconsistency with input prompt and reference images. 
Moreover, MC$^2$ can be\nextended to elevate the compositional capabilities of text-to-image generation,\nyielding appealing results. Code will be publicly available at\nhttps://github.com/JIANGJiaXiu/MC-2.\n","authors":["Jiaxiu Jiang","Yabo Zhang","Kailai Feng","Xiaohe Wu","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.05268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05264v1","updated":"2024-04-08T07:54:18Z","published":"2024-04-08T07:54:18Z","title":"Unbridled Icarus: A Survey of the Potential Perils of Image Inputs in\n Multimodal Large Language Model Security","summary":" Multimodal Large Language Models (MLLMs) demonstrate remarkable capabilities\nthat increasingly influence various aspects of our daily lives, constantly\ndefining the new boundary of Artificial General Intelligence (AGI). Image\nmodalities, enriched with profound semantic information and a more continuous\nmathematical nature compared to other modalities, greatly enhance the\nfunctionalities of MLLMs when integrated. However, this integration serves as a\ndouble-edged sword, providing attackers with expansive vulnerabilities to\nexploit for highly covert and harmful attacks. The pursuit of reliable AI\nsystems like powerful MLLMs has emerged as a pivotal area of contemporary\nresearch. In this paper, we endeavor to demonstrate the multifaceted risks\nassociated with the incorporation of image modalities into MLLMs. Initially, we\ndelineate the foundational components and training processes of MLLMs.\nSubsequently, we construct a threat model, outlining the security\nvulnerabilities intrinsic to MLLMs. Moreover, we analyze and summarize existing\nscholarly discourses on MLLMs' attack and defense mechanisms, culminating in\nsuggestions for future research on MLLM security. Through this\ncomprehensive analysis, we aim to deepen the academic understanding of MLLM\nsecurity challenges and propel forward the development of trustworthy MLLM\nsystems.\n","authors":["Yihe Fan","Yuxin Cao","Ziyu Zhao","Ziyao Liu","Shaofeng Li"],"pdf_url":"https://arxiv.org/pdf/2404.05264v1.pdf","comment":"8 pages, 1 figure"},{"id":"http://arxiv.org/abs/2404.00936v3","updated":"2024-04-08T07:52:38Z","published":"2024-04-01T05:46:15Z","title":"A Comprehensive Review of Knowledge Distillation in Computer Vision","summary":" Deep learning techniques have been demonstrated to surpass preceding\ncutting-edge machine learning techniques in recent years, with computer vision\nbeing one of the most prominent examples. However, deep learning models suffer\nfrom significant drawbacks when deployed in resource-constrained environments\ndue to their large model size and high complexity. Knowledge Distillation is\none of the prominent solutions to overcome this challenge. This review paper\nexamines the current state of research on knowledge distillation, a technique\nfor compressing complex models into smaller and simpler ones. The paper\nprovides an overview of the major principles and techniques associated with\nknowledge distillation and reviews the applications of knowledge distillation\nin the domain of computer vision. 
The review focuses on the benefits of\nknowledge distillation, as well as the problems that must be overcome to\nimprove its effectiveness.\n","authors":["Sheikh Musa Kaleem","Tufail Rouf","Gousia Habib","Tausifa jan Saleem","Brejesh Lall"],"pdf_url":"https://arxiv.org/pdf/2404.00936v3.pdf","comment":"36 pages, 10 figures"},{"id":"http://arxiv.org/abs/2309.03467v2","updated":"2024-04-08T07:49:47Z","published":"2023-09-07T03:22:59Z","title":"Autoregressive Omni-Aware Outpainting for Open-Vocabulary 360-Degree\n Image Generation","summary":" A 360-degree (omni-directional) image provides an all-encompassing spherical\nview of a scene. Recently, there has been an increasing interest in\nsynthesising 360-degree images from conventional narrow field of view (NFoV)\nimages captured by digital cameras and smartphones, for providing immersive\nexperiences in various scenarios such as virtual reality. Yet, existing methods\ntypically fall short in synthesizing intricate visual details or ensuring that the\ngenerated images align consistently with user-provided prompts. In this study,\nautoregressive omni-aware generative network (AOG-Net) is proposed for\n360-degree image generation by out-painting an incomplete 360-degree image\nprogressively with NFoV and text guidances jointly or individually. This\nautoregressive scheme not only allows for deriving finer-grained and\ntext-consistent patterns by dynamically generating and adjusting the process\nbut also offers users greater flexibility to edit their conditions throughout\nthe generation process. A global-local conditioning mechanism is devised to\ncomprehensively formulate the outpainting guidance in each autoregressive step.\nText guidances, omni-visual cues, NFoV inputs and omni-geometry are encoded and\nfurther formulated with cross-attention based transformers into a global stream\nand a local stream into a conditioned generative backbone model. As AOG-Net is\ncompatible to leverage large-scale models for the conditional encoder and the\ngenerative prior, it enables the generation to use extensive open-vocabulary\ntext guidances. Comprehensive experiments on two commonly used 360-degree image\ndatasets for both indoor and outdoor settings demonstrate the state-of-the-art\nperformance of our proposed method. Our code will be made publicly available.\n","authors":["Zhuqiang Lu","Kun Hu","Chaoyue Wang","Lei Bai","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03467v2.pdf","comment":"Accepted by AAAI 24"},{"id":"http://arxiv.org/abs/2404.05258v1","updated":"2024-04-08T07:47:28Z","published":"2024-04-08T07:47:28Z","title":"Unsupervised Band Selection Using Fused HSI and LiDAR Attention\n Integrating With Autoencoder","summary":" Band selection in hyperspectral imaging (HSI) is critical for optimising data\nprocessing and enhancing analytical accuracy. Traditional approaches have\npredominantly concentrated on analysing spectral and pixel characteristics\nwithin individual bands independently. These approaches overlook the potential\nbenefits of integrating multiple data sources, such as Light Detection and\nRanging (LiDAR), and are further challenged by the limited availability of\nlabeled data in HSI processing, which represents a significant obstacle. To\naddress these challenges, this paper introduces a novel unsupervised band\nselection framework that incorporates attention mechanisms and an Autoencoder\nfor reconstruction-based band selection. 
Our methodology distinctively\nintegrates HSI with LiDAR data through an attention score, using a\nconvolutional Autoencoder to process the combined feature mask. This fusion\neffectively captures essential spatial and spectral features and reduces\nredundancy in hyperspectral datasets. A comprehensive comparative analysis of\nour innovative fused band selection approach is performed against existing\nunsupervised band selection and fusion models. We used data sets such as\nHouston 2013, Trento, and MUUFLE for our experiments. The results demonstrate\nthat our method achieves superior classification accuracy and significantly\noutperforms existing models. This enhancement in HSI band selection,\nfacilitated by the incorporation of LiDAR features, underscores the\nconsiderable advantages of integrating features from different sources.\n","authors":["Judy X Yang","Jun Zhou","Jing Wang","Hui Tian","Alan Wee Chung Liew"],"pdf_url":"https://arxiv.org/pdf/2404.05258v1.pdf","comment":"13 pages, 13figures, 6 tables"},{"id":"http://arxiv.org/abs/2404.05256v1","updated":"2024-04-08T07:43:23Z","published":"2024-04-08T07:43:23Z","title":"Text-to-Image Synthesis for Any Artistic Styles: Advancements in\n Personalized Artistic Image Generation via Subdivision and Dual Binding","summary":" Recent advancements in text-to-image models, such as Stable Diffusion, have\ndemonstrated their ability to synthesize visual images through natural language\nprompts. One approach of personalizing text-to-image models, exemplified by\nDreamBooth, fine-tunes the pre-trained model by binding unique text identifiers\nwith a few images of a specific subject. Although existing fine-tuning methods\nhave demonstrated competence in rendering images according to the styles of\nfamous painters, it is still challenging to learn to produce images\nencapsulating distinct art styles due to abstract and broad visual perceptions\nof stylistic attributes such as lines, shapes, textures, and colors. In this\npaper, we introduce a new method, Single-StyleForge, for personalization. It\nfine-tunes pre-trained text-to-image diffusion models to generate diverse\nimages in specified styles from text prompts. By using around 15-20 images of\nthe target style, the approach establishes a foundational binding of a unique\ntoken identifier with a broad range of the target style. It also utilizes\nauxiliary images to strengthen this binding, resulting in offering specific\nguidance on representing elements such as persons in a target style-consistent\nmanner. In addition, we present ways to improve the quality of style and\ntext-image alignment through a method called Multi-StyleForge, which inherits\nthe strategy used in StyleForge and learns tokens in multiple. 
Experimental\nevaluation conducted on six distinct artistic styles demonstrates substantial\nimprovements in both the quality of generated images and the perceptual\nfidelity metrics, such as FID, KID, and CLIP scores.\n","authors":["Junseo Park","Beomseok Ko","Hyeryung Jang"],"pdf_url":"https://arxiv.org/pdf/2404.05256v1.pdf","comment":"20 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.05253v1","updated":"2024-04-08T07:34:39Z","published":"2024-04-08T07:34:39Z","title":"CodeEnhance: A Codebook-Driven Approach for Low-Light Image Enhancement","summary":" Low-light image enhancement (LLIE) aims to improve low-illumination images.\nHowever, existing methods face two challenges: (1) uncertainty in restoration\nfrom diverse brightness degradations; (2) loss of texture and color information\ncaused by noise suppression and light enhancement. In this paper, we propose a\nnovel enhancement approach, CodeEnhance, by leveraging quantized priors and\nimage refinement to address these challenges. In particular, we reframe LLIE as\nlearning an image-to-code mapping from low-light images to a discrete codebook,\nwhich has been learned from high-quality images. To enhance this process, a\nSemantic Embedding Module (SEM) is introduced to integrate semantic information\nwith low-level features, and a Codebook Shift (CS) mechanism is designed to adapt\nthe pre-learned codebook to better suit the distinct characteristics of our\nlow-light dataset. Additionally, we present an Interactive Feature\nTransformation (IFT) module to refine texture and color information during\nimage reconstruction, allowing for interactive enhancement based on user\npreferences. Extensive experiments on both real-world and synthetic benchmarks\ndemonstrate that the incorporation of prior knowledge and controllable\ninformation transfer significantly enhances LLIE performance in terms of\nquality and fidelity. The proposed CodeEnhance exhibits superior robustness to\nvarious degradations, including uneven illumination, noise, and color\ndistortion.\n","authors":["Xu Wu","XianXu Hou","Zhihui Lai","Jie Zhou","Ya-nan Zhang","Witold Pedrycz","Linlin Shen"],"pdf_url":"https://arxiv.org/pdf/2404.05253v1.pdf","comment":"10 pages, 13 figures"},{"id":"http://arxiv.org/abs/2312.03203v3","updated":"2024-04-08T07:19:52Z","published":"2023-12-06T00:46:30Z","title":"Feature 3DGS: Supercharging 3D Gaussian Splatting to Enable Distilled\n Feature Fields","summary":" 3D scene representations have gained immense popularity in recent years.\nMethods that use Neural Radiance Fields are versatile for traditional tasks\nsuch as novel view synthesis. In recent times, some work has emerged that aims\nto extend the functionality of NeRF beyond view synthesis, for semantically\naware tasks such as editing and segmentation using 3D feature field\ndistillation from 2D foundation models. However, these methods have two major\nlimitations: (a) they are limited by the rendering speed of NeRF pipelines, and\n(b) implicitly represented feature fields suffer from continuity artifacts\nreducing feature quality. Recently, 3D Gaussian Splatting has shown\nstate-of-the-art performance on real-time radiance field rendering. In this\nwork, we go one step further: in addition to radiance field rendering, we\nenable 3D Gaussian splatting on arbitrary-dimension semantic features via 2D\nfoundation model distillation. 
This translation is not straightforward: naively\nincorporating feature fields in the 3DGS framework encounters significant\nchallenges, notably the disparities in spatial resolution and channel\nconsistency between RGB images and feature maps. We propose architectural and\ntraining changes to efficiently avert this problem. Our proposed method is\ngeneral, and our experiments showcase novel view semantic segmentation,\nlanguage-guided editing and segment anything through learning feature fields\nfrom state-of-the-art 2D foundation models such as SAM and CLIP-LSeg. Across\nexperiments, our distillation method is able to provide comparable or better\nresults, while being significantly faster to both train and render.\nAdditionally, to the best of our knowledge, we are the first method to enable\npoint and bounding-box prompting for radiance field manipulation, by leveraging\nthe SAM model. Project website at: https://feature-3dgs.github.io/\n","authors":["Shijie Zhou","Haoran Chang","Sicheng Jiang","Zhiwen Fan","Zehao Zhu","Dejia Xu","Pradyumna Chari","Suya You","Zhangyang Wang","Achuta Kadambi"],"pdf_url":"https://arxiv.org/pdf/2312.03203v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05238v1","updated":"2024-04-08T07:09:15Z","published":"2024-04-08T07:09:15Z","title":"Allowing humans to interactively guide machines where to look does not\n always improve a human-AI team's classification accuracy","summary":" Via thousands of papers in Explainable AI (XAI), attention maps\n\\cite{vaswani2017attention} and feature attribution maps \\cite{bansal2020sam}\nhave been established as a common means for explaining the input features that\nare important to AI's decisions. It is an interesting but unexplored question\nwhether allowing users to edit the importance scores of input features at test\ntime would improve the human-AI team's accuracy on downstream tasks. In this\npaper, we address this question by taking CHM-Corr, a state-of-the-art,\nante-hoc explanation method \\cite{taesiri2022visual} that first predicts\npatch-wise correspondences between the input and the training-set images, and\nthen uses them to make classification decisions. We build an interactive\ninterface on top of CHM-Corr, enabling users to directly edit the initial\nfeature attribution map provided by CHM-Corr. Via our CHM-Corr++ interface,\nusers gain insights into if, when, and how the model changes its outputs,\nenhancing understanding beyond static explanations. Our user study with 18\nmachine learning researchers who performed $\\sim$1,400 decisions shows that our\ninteractive approach does not improve user accuracy on CUB-200 bird image\nclassification over static explanations. This challenges the belief that\ninteractivity inherently boosts XAI\neffectiveness~\\cite{sokol2020one,sun2022exploring,shen2024towards,singh2024rethinking,mindlin2024beyond,lakkaraju2022rethinking,cheng2019explaining,liu2021understanding}\nand raises needs for future research. Our work contributes to the field by\nopen-sourcing an interactive tool for manipulating model attention, and it lays\nthe groundwork for future research to enable effective human-AI interaction in\ncomputer vision. We release code and data on\n\\href{https://anonymous.4open.science/r/CHMCorrPlusPlus/}{github}. Our\ninterface are available \\href{http://137.184.82.109:7080/}{here}.\n","authors":["Giang Nguyen","Mohammad Reza Taesiri","Sunnie S. Y. 
Kim","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05238v1.pdf","comment":"Accepted for presentation at the XAI4CV Workshop, part of the CVPR\n 2024 proceedings"},{"id":"http://arxiv.org/abs/2312.07246v2","updated":"2024-04-08T07:07:02Z","published":"2023-12-12T13:22:44Z","title":"Unifying Correspondence, Pose and NeRF for Pose-Free Novel View\n Synthesis from Stereo Pairs","summary":" This work delves into the task of pose-free novel view synthesis from stereo\npairs, a challenging and pioneering task in 3D vision. Our innovative\nframework, unlike any before, seamlessly integrates 2D correspondence matching,\ncamera pose estimation, and NeRF rendering, fostering a synergistic enhancement\nof these tasks. We achieve this through designing an architecture that utilizes\na shared representation, which serves as a foundation for enhanced 3D geometry\nunderstanding. Capitalizing on the inherent interplay between the tasks, our\nunified framework is trained end-to-end with the proposed training strategy to\nimprove overall model accuracy. Through extensive evaluations across diverse\nindoor and outdoor scenes from two real-world datasets, we demonstrate that our\napproach achieves substantial improvement over previous methodologies,\nespecially in scenarios characterized by extreme viewpoint changes and the\nabsence of accurate camera poses.\n","authors":["Sunghwan Hong","Jaewoo Jung","Heeseong Shin","Jiaolong Yang","Seungryong Kim","Chong Luo"],"pdf_url":"https://arxiv.org/pdf/2312.07246v2.pdf","comment":"Project page: https://ku-cvlab.github.io/CoPoNeRF/ CVPR2024 camera\n ready version (Highlight)"},{"id":"http://arxiv.org/abs/2404.05236v1","updated":"2024-04-08T07:01:42Z","published":"2024-04-08T07:01:42Z","title":"Stylizing Sparse-View 3D Scenes with Hierarchical Neural Representation","summary":" Recently, a surge of 3D style transfer methods has been proposed that\nleverage the scene reconstruction power of a pre-trained neural radiance field\n(NeRF). To successfully stylize a scene this way, one must first reconstruct a\nphoto-realistic radiance field from collected images of the scene. However,\nwhen only sparse input views are available, pre-trained few-shot NeRFs often\nsuffer from high-frequency artifacts, which are generated as a by-product of\nhigh-frequency details for improving reconstruction quality. Is it possible to\ngenerate more faithful stylized scenes from sparse inputs by directly\noptimizing encoding-based scene representation with target style? In this\npaper, we consider the stylization of sparse-view scenes in terms of\ndisentangling content semantics and style textures. We propose a coarse-to-fine\nsparse-view scene stylization framework, where a novel hierarchical\nencoding-based neural representation is designed to generate high-quality\nstylized scenes directly from implicit scene representations. We also propose a\nnew optimization strategy with content strength annealing to achieve realistic\nstylization and better content preservation. Extensive experiments demonstrate\nthat our method can achieve high-quality stylization of sparse-view scenes and\noutperforms fine-tuning-based baselines in terms of stylization quality and\nefficiency.\n","authors":["Y. Wang","A. Gao","Y. Gong","Y. 
Zeng"],"pdf_url":"https://arxiv.org/pdf/2404.05236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05231v1","updated":"2024-04-08T06:53:30Z","published":"2024-04-08T06:53:30Z","title":"PromptAD: Learning Prompts with only Normal Samples for Few-Shot Anomaly\n Detection","summary":" Vision-language models have brought great improvement to few-shot\nindustrial anomaly detection, which usually requires designing hundreds of\nprompts through prompt engineering. For automated scenarios, we first use\nconventional prompt learning with the many-class paradigm as the baseline to\nautomatically learn prompts, but find that it does not work well in one-class\nanomaly detection. To address the above problem, this paper proposes a\none-class prompt learning method for few-shot anomaly detection, termed\nPromptAD. First, we propose semantic concatenation, which can transpose normal\nprompts into anomaly prompts by concatenating normal prompts with anomaly\nsuffixes, thus constructing a large number of negative samples used to guide\nprompt learning in the one-class setting. Furthermore, to mitigate the training\nchallenge caused by the absence of anomaly images, we introduce the concept of\nexplicit anomaly margin, which is used to explicitly control the margin between\nnormal prompt features and anomaly prompt features through a hyper-parameter.\nFor image-level/pixel-level anomaly detection, PromptAD achieves first place in\n11/12 few-shot settings on MVTec and VisA.\n","authors":["Xiaofan Li","Zhizhong Zhang","Xin Tan","Chengwei Chen","Yanyun Qu","Yuan Xie","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2404.05231v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.05225v1","updated":"2024-04-08T06:40:28Z","published":"2024-04-08T06:40:28Z","title":"LayoutLLM: Layout Instruction Tuning with Large Language Models for\n Document Understanding","summary":" Recently, leveraging large language models (LLMs) or multimodal large\nlanguage models (MLLMs) for document understanding has been proven very\npromising. However, previous works that employ LLMs/MLLMs for document\nunderstanding have not fully explored and utilized the document layout\ninformation, which is vital for precise document understanding. In this paper,\nwe propose LayoutLLM, an LLM/MLLM based method for document understanding. The\ncore of LayoutLLM is a layout instruction tuning strategy, which is specially\ndesigned to enhance the comprehension and utilization of document layouts. The\nproposed layout instruction tuning strategy consists of two components:\nLayout-aware Pre-training and Layout-aware Supervised Fine-tuning. To capture\nthe characteristics of document layout in Layout-aware Pre-training, three\ngroups of pre-training tasks, corresponding to document-level, region-level and\nsegment-level information, are introduced. Furthermore, a novel module called\nlayout chain-of-thought (LayoutCoT) is devised to enable LayoutLLM to focus on\nregions relevant to the question and generate accurate answers. LayoutCoT is\neffective for boosting the performance of document understanding. Meanwhile, it\nbrings a certain degree of interpretability, which could facilitate manual\ninspection and correction. Experiments on standard benchmarks show that the\nproposed LayoutLLM significantly outperforms existing methods that adopt\nopen-source 7B LLMs/MLLMs for document understanding. 
The training data of the\nLayoutLLM is publicly available at\nhttps://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/DocumentUnderstanding/LayoutLLM\n","authors":["Chuwei Luo","Yufan Shen","Zhaoqing Zhu","Qi Zheng","Zhi Yu","Cong Yao"],"pdf_url":"https://arxiv.org/pdf/2404.05225v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05220v1","updated":"2024-04-08T06:32:11Z","published":"2024-04-08T06:32:11Z","title":"StylizedGS: Controllable Stylization for 3D Gaussian Splatting","summary":" With the rapid development of XR, 3D generation and editing are becoming more\nand more important, among which stylization is an important tool for 3D\nappearance editing. It can achieve consistent 3D artistic stylization given a\nsingle reference style image and is thus a user-friendly way to edit. However,\nrecent NeRF-based 3D stylization methods face efficiency issues that affect the\nactual user experience, and their implicit nature limits their ability to transfer\ngeometric pattern styles. Additionally, the ability for artists to exert\nflexible control over stylized scenes is considered highly desirable, fostering\nan environment conducive to creative exploration. In this paper, we introduce\nStylizedGS, a 3D neural style transfer framework with adaptable control over\nperceptual factors based on the 3D Gaussian Splatting (3DGS) representation, which\nbrings the benefits of high efficiency. We propose a GS filter to\neliminate floaters in the reconstruction, which would otherwise affect the\nstylization effects, before stylization. Then a nearest neighbor-based style loss is introduced to\nachieve stylization by fine-tuning the geometry and color parameters of 3DGS,\nwhile a depth preservation loss with other regularizations is proposed to\nprevent tampering with the geometric content. Moreover, facilitated by specially\ndesigned losses, StylizedGS enables users to control color, stylization scale and\nregions during stylization, providing customized capabilities. Our method\ncan attain high-quality stylization results characterized by faithful\nbrushstrokes and geometric consistency with flexible controls. Extensive\nexperiments across various scenes and styles demonstrate the effectiveness and\nefficiency of our method concerning both stylization quality and inference FPS.\n","authors":["Dingxi Zhang","Zhuoxun Chen","Yu-Jie Yuan","Fang-Lue Zhang","Zhenliang He","Shiguang Shan","Lin Gao"],"pdf_url":"https://arxiv.org/pdf/2404.05220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05773v2","updated":"2024-04-08T06:28:13Z","published":"2024-02-08T16:00:25Z","title":"UAV-Rain1k: A Benchmark for Raindrop Removal from UAV Aerial Imagery","summary":" Raindrops adhering to the lens of UAVs can obstruct visibility of the\nbackground scene and degrade image quality. Despite recent progress in image\nderaining methods and datasets, there is a lack of focus on raindrop removal\nfrom UAV aerial imagery due to the unique challenges posed by varying angles\nand rapid movement during drone flight. To fill the gap in this research, we\nfirst construct a new benchmark dataset for removing raindrops from UAV images,\ncalled UAV-Rain1k. In this letter, we provide a dataset generation pipeline,\nwhich includes modeling raindrop shapes using Blender, collecting background\nimages from various UAV angles, random sampling of rain masks, etc. 
Based on\nthe proposed benchmark, we further present a comprehensive evaluation of\nexisting representative image deraining algorithms, and reveal future research\nopportunities worth exploring. The proposed dataset is publicly available at\nhttps://github.com/cschenxiang/UAV-Rain1k.\n","authors":["Wenhui Chang","Hongming Chen","Xin He","Xiang Chen","Liangduo Shen"],"pdf_url":"https://arxiv.org/pdf/2402.05773v2.pdf","comment":"Accepted by IEEE/CVF Conference on Computer Vision and Pattern\n Recognition Workshops (CVPRW) 2024"},{"id":"http://arxiv.org/abs/2312.17118v3","updated":"2024-04-08T06:23:12Z","published":"2023-12-28T16:54:53Z","title":"Fully Sparse 3D Occupancy Prediction","summary":" Occupancy prediction plays a pivotal role in autonomous driving. Previous\nmethods typically construct dense 3D volumes, neglecting the inherent sparsity\nof the scene and suffering high computational costs. To bridge the gap, we\nintroduce a novel fully sparse occupancy network, termed SparseOcc. SparseOcc\ninitially reconstructs a sparse 3D representation from visual inputs and\nsubsequently predicts semantic/instance occupancy from the 3D sparse\nrepresentation by sparse queries. A mask-guided sparse sampling is designed to\nenable sparse queries to interact with 2D features in a fully sparse manner,\nthereby circumventing costly dense features or global attention. Additionally,\nwe design a thoughtful ray-based evaluation metric, namely RayIoU, to solve the\ninconsistency penalty along depths raised in traditional voxel-level mIoU\ncriteria. SparseOcc demonstrates its effectiveness by achieving a RayIoU of\n34.0, while maintaining a real-time inference speed of 17.3 FPS, with 7 history\nframes as input. By increasing the number of preceding frames to 15, SparseOcc\nfurther improves its performance to 35.1 RayIoU without bells and\nwhistles. Code is available at https://github.com/MCG-NJU/SparseOcc.\n","authors":["Haisong Liu","Yang Chen","Haiguang Wang","Zetong Yang","Tianyu Li","Jia Zeng","Li Chen","Hongyang Li","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.17118v3.pdf","comment":"Add new metric: RayIoU"},{"id":"http://arxiv.org/abs/2404.05218v1","updated":"2024-04-08T06:15:13Z","published":"2024-04-08T06:15:13Z","title":"Multi-agent Long-term 3D Human Pose Forecasting via Interaction-aware\n Trajectory Conditioning","summary":" Human pose forecasting garners attention for its diverse applications.\nHowever, challenges in modeling the multi-modal nature of human motion and\nintricate interactions among agents persist, particularly with longer\ntimescales and more agents. In this paper, we propose an interaction-aware\ntrajectory-conditioned long-term multi-agent human pose forecasting model,\nutilizing a coarse-to-fine prediction approach: multi-modal global trajectories\nare initially forecasted, followed by respective local pose forecasts\nconditioned on each mode. In doing so, our Trajectory2Pose model introduces a\ngraph-based agent-wise interaction module for a reciprocal forecast of local\nmotion-conditioned global trajectory and trajectory-conditioned local pose. Our\nmodel effectively handles the multi-modality of human motion and the complexity\nof long-term multi-agent interactions, improving performance in complex\nenvironments. 
Furthermore, we address the lack of long-term (6s+) multi-agent\n(5+) datasets by constructing a new dataset from real-world images and 2D\nannotations, enabling a comprehensive evaluation of our proposed model.\nState-of-the-art prediction performance on both complex and simpler datasets\nconfirms the generalized effectiveness of our method. The code is available at\nhttps://github.com/Jaewoo97/T2P.\n","authors":["Jaewoo Jeong","Daehee Park","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2404.05218v1.pdf","comment":"2024 CVPR Highlight"},{"id":"http://arxiv.org/abs/2404.02135v3","updated":"2024-04-08T06:11:48Z","published":"2024-04-02T17:48:46Z","title":"Enhancing Ship Classification in Optical Satellite Imagery: Integrating\n Convolutional Block Attention Module with ResNet for Improved Performance","summary":" This study presents an advanced Convolutional Neural Network (CNN)\narchitecture for ship classification from optical satellite imagery,\nsignificantly enhancing performance through the integration of the\nConvolutional Block Attention Module (CBAM) and additional architectural\ninnovations. Building upon the foundational ResNet50 model, we first\nincorporated a standard CBAM to direct the model's focus towards more\ninformative features, achieving an accuracy of 87% compared to the baseline\nResNet50's 85%. Further augmentations involved multi-scale feature integration,\ndepthwise separable convolutions, and dilated convolutions, culminating in the\nEnhanced ResNet Model with Improved CBAM. This model demonstrated a remarkable\naccuracy of 95%, with precision, recall, and f1-scores all witnessing\nsubstantial improvements across various ship classes. The bulk carrier and oil\ntanker classes, in particular, showcased nearly perfect precision and recall\nrates, underscoring the model's enhanced capability in accurately identifying\nand classifying ships. Attention heatmap analyses further validated the\nimproved model's efficacy, revealing a more focused attention on relevant ship\nfeatures, regardless of background complexities. These findings underscore the\npotential of integrating attention mechanisms and architectural innovations in\nCNNs for high-resolution satellite imagery classification. The study navigates\nthrough the challenges of class imbalance and computational costs, proposing\nfuture directions towards scalability and adaptability in new or rare ship type\nrecognition. This research lays a groundwork for the application of advanced\ndeep learning techniques in the domain of remote sensing, offering insights\ninto scalable and efficient satellite image classification.\n","authors":["Ryan Donghan Kwon","Gangjoo Robin Nam","Jisoo Tak","Junseob Shin","Hyerin Cha","Yeom Hyeok","Seung Won Lee"],"pdf_url":"https://arxiv.org/pdf/2404.02135v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05215v1","updated":"2024-04-08T06:07:32Z","published":"2024-04-08T06:07:32Z","title":"Spatio-Temporal Attention and Gaussian Processes for Personalized Video\n Gaze Estimation","summary":" Gaze is an essential prompt for analyzing human behavior and attention.\nRecently, there has been an increasing interest in determining gaze direction\nfrom facial videos. However, video gaze estimation faces significant\nchallenges, such as understanding the dynamic evolution of gaze in video\nsequences, dealing with static backgrounds, and adapting to variations in\nillumination. 
To address these challenges, we propose a simple and novel deep\nlearning model designed to estimate gaze from videos, incorporating a\nspecialized attention module. Our method employs a spatial attention mechanism\nthat tracks spatial dynamics within videos. This technique enables accurate\ngaze direction prediction through a temporal sequence model, adeptly\ntransforming spatial observations into temporal insights, thereby significantly\nimproving gaze estimation accuracy. Additionally, our approach integrates\nGaussian processes to include individual-specific traits, facilitating the\npersonalization of our model with just a few labeled samples. Experimental\nresults confirm the efficacy of the proposed approach, demonstrating its\nsuccess in both within-dataset and cross-dataset settings. Specifically, our\nproposed approach achieves state-of-the-art performance on the Gaze360 dataset,\nimproving by $2.5^\\circ$ without personalization. Further, by personalizing the\nmodel with just three samples, we achieved an additional improvement of\n$0.8^\\circ$. The code and pre-trained models are available at\n\\url{https://github.com/jswati31/stage}.\n","authors":["Swati Jindal","Mohit Yadav","Roberto Manduchi"],"pdf_url":"https://arxiv.org/pdf/2404.05215v1.pdf","comment":"Accepted at CVPR 2024 Gaze workshop"},{"id":"http://arxiv.org/abs/2404.05212v1","updated":"2024-04-08T05:58:07Z","published":"2024-04-08T05:58:07Z","title":"DiffCJK: Conditional Diffusion Model for High-Quality and Wide-coverage\n CJK Character Generation","summary":" Chinese, Japanese, and Korean (CJK), with a vast number of native speakers,\nhas profound influence on society and culture. The typesetting of CJK languages\ncarries a wide range of requirements due to the complexity of their scripts and\nunique literary traditions. A critical aspect of this typesetting process is\nthat CJK fonts need to provide a set of consistent-looking glyphs for\napproximately one hundred thousand characters. However, creating such a font is\ninherently labor-intensive and expensive, which significantly hampers the\ndevelopment of new CJK fonts for typesetting, historical, aesthetic, or\nartistic purposes.\n To bridge this gap, we are motivated by recent advancements in\ndiffusion-based generative models and propose a novel diffusion method for\ngenerating glyphs in a targeted style from a \\emph{single} conditioned,\nstandard glyph form. Our experiments show that our method is capable of\ngenerating fonts of both printed and hand-written styles, the latter of which\npresents a greater challenge. Moreover, our approach shows remarkable zero-shot\ngeneralization capabilities for non-CJK but Chinese-inspired scripts. We also\nshow our method facilitates smooth style interpolation and generates bitmap\nimages suitable for vectorization, which is crucial in the font creation\nprocess. In summary, our proposed method opens the door to high-quality,\ngenerative model-assisted font creation for CJK characters, for both\ntypesetting and artistic endeavors.\n","authors":["Yingtao Tian"],"pdf_url":"https://arxiv.org/pdf/2404.05212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05211v1","updated":"2024-04-08T05:50:46Z","published":"2024-04-08T05:50:46Z","title":"Multi-level Graph Subspace Contrastive Learning for Hyperspectral Image\n Clustering","summary":" Hyperspectral image (HSI) clustering is a challenging task due to its high\ncomplexity. 
Although subspace clustering shows impressive performance for HSI,\ntraditional methods tend to ignore the global-local interaction in HSI data. In\nthis study, we propose a multi-level graph subspace contrastive learning\n(MLGSC) framework for HSI clustering. The model is divided into the following main parts.\nGraph convolution subspace construction: utilizing spectral and texture\nfeatures to construct two graph convolution views. Local-global graph\nrepresentation: local graph representations are obtained by step-by-step\nconvolutions and a more representative global graph representation is obtained\nusing an attention-based pooling strategy. Multi-level graph subspace\ncontrastive learning: multi-level contrastive learning is conducted to obtain\nlocal-global joint graph representations, to improve the consistency of the\npositive samples between views, and to obtain more robust graph embeddings.\nSpecifically, graph-level contrastive learning is used to better learn global\nrepresentations of HSI data. Node-level intra-view and inter-view contrastive\nlearning is designed to learn joint representations of local regions of HSI.\nThe proposed model is evaluated on four popular HSI datasets: Indian Pines,\nPavia University, Houston, and Xu Zhou. The overall accuracies are 97.75%,\n99.96%, 92.28%, and 95.73%, which significantly outperform the current\nstate-of-the-art clustering methods.\n","authors":["Jingxin Wang","Renxiang Guan","Kainan Gao","Zihao Li","Hao Li","Xianju Li","Chang Tang"],"pdf_url":"https://arxiv.org/pdf/2404.05211v1.pdf","comment":"IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.05210v1","updated":"2024-04-08T05:45:03Z","published":"2024-04-08T05:45:03Z","title":"Bidirectional Long-Range Parser for Sequential Data Understanding","summary":" The transformer is a powerful data modelling framework responsible for\nremarkable performance on a wide range of tasks. However, transformers are limited in\nterms of scalability, as they are suboptimal and inefficient at processing\nlong-sequence data. To this end, we introduce BLRP (Bidirectional Long-Range\nParser), a novel and versatile attention mechanism designed to increase\nperformance and efficiency on long-sequence tasks. It leverages short and long\nrange heuristics in the form of a local sliding window approach combined with a\nglobal bidirectional latent space synthesis technique. We show the benefits and\nversatility of our approach on vision and language domains by demonstrating\ncompetitive results against state-of-the-art methods on the Long-Range-Arena\nand CIFAR benchmarks together with ablations demonstrating the computational\nefficiency.\n","authors":["George Leotescu","Daniel Voinea","Alin-Ionut Popa"],"pdf_url":"https://arxiv.org/pdf/2404.05210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05207v1","updated":"2024-04-08T05:23:12Z","published":"2024-04-08T05:23:12Z","title":"iVPT: Improving Task-relevant Information Sharing in Visual Prompt\n Tuning by Cross-layer Dynamic Connection","summary":" Recent progress has shown great potential of visual prompt tuning (VPT) when\nadapting pre-trained vision transformers to various downstream tasks. However,\nmost existing solutions independently optimize prompts at each layer, thereby\nneglecting the usage of task-relevant information encoded in prompt tokens\nacross layers. Additionally, existing prompt structures are prone to\ninterference from task-irrelevant noise in input images, which can harm the\nsharing of task-relevant information. 
In this paper, we propose a novel VPT\napproach, \\textbf{iVPT}. It innovatively incorporates a cross-layer dynamic\nconnection (CDC) for input prompt tokens from adjacent layers, enabling\neffective sharing of task-relevant information. Furthermore, we design a\ndynamic aggregation (DA) module that facilitates selective sharing of\ninformation between layers. The combination of CDC and DA enhances the\nflexibility of the attention process within the VPT framework. Building upon\nthese foundations, iVPT introduces an attentive reinforcement (AR) mechanism,\nby automatically identifying salient image tokens, which are further enhanced\nby prompt tokens in an additive manner. Extensive experiments on 24 image\nclassification and semantic segmentation benchmarks clearly demonstrate the\nadvantage of the proposed iVPT, compared to the state-of-the-art counterparts.\n","authors":["Nan Zhou","Jiaxin Chen","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2404.05207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05206v1","updated":"2024-04-08T05:19:28Z","published":"2024-04-08T05:19:28Z","title":"SoundingActions: Learning How Actions Sound from Narrated Egocentric\n Videos","summary":" We propose a novel self-supervised embedding to learn how actions sound from\nnarrated in-the-wild egocentric videos. Whereas existing methods rely on\ncurated data with known audio-visual correspondence, our multimodal\ncontrastive-consensus coding (MC3) embedding reinforces the associations\nbetween audio, language, and vision when all modality pairs agree, while\ndiminishing those associations when any one pair does not. We show our approach\ncan successfully discover how the long tail of human actions sound from\negocentric video, outperforming an array of recent multimodal embedding\ntechniques on two datasets (Ego4D and EPIC-Sounds) and multiple cross-modal\ntasks.\n","authors":["Changan Chen","Kumar Ashutosh","Rohit Girdhar","David Harwath","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2404.05206v1.pdf","comment":"Accepted at CVPR 2024. Project page:\n https://vision.cs.utexas.edu/projects/soundingactions"},{"id":"http://arxiv.org/abs/2404.05205v1","updated":"2024-04-08T05:18:39Z","published":"2024-04-08T05:18:39Z","title":"A secure and private ensemble matcher using multi-vault obfuscated\n templates","summary":" Given the irrevocability of biometric samples and mounting privacy concerns,\nbiometric template security and secure matching are among the essential\nfeatures of any well-designed modern biometric system. In this paper, we\npropose an obfuscation method that hides the biometric template information\nwith just enough chaff. The main idea is to reduce the number of chaff points\nto a practical level by creating n sub-templates from the original template and\nhiding each sub-template with m chaff points. During verification, s closest\nvectors to the biometric query are retrieved from each vault and then combined\nto generate hash values that are compared with the stored hash value. We\ndemonstrate the effectiveness of synthetic facial images, generated by a\nGenerative Adversarial Network (GAN), as ``random chaff points'' within a\nsecure-vault authorization system. This approach safeguards user identities\nduring training and deployment. We tested our protocol using the AT&T, GT, and\nLFW face datasets, with the ROC areas under the curve being 0.99, 0.99, and\n0.90, respectively. 
These numbers were close to those of the unprotected\ntemplates, showing that our method does not adversely affect accuracy.\n","authors":["Babak Poorebrahim Gilkalaye","Shubhabrata Mukherjee","Reza Derakhshani"],"pdf_url":"https://arxiv.org/pdf/2404.05205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11825v2","updated":"2024-04-08T05:11:47Z","published":"2023-11-20T15:03:56Z","title":"Holistic Inverse Rendering of Complex Facade via Aerial 3D Scanning","summary":" In this work, we use multi-view aerial images to reconstruct the geometry,\nlighting, and material of facades using neural signed distance fields (SDFs).\nWithout the requirement of complex equipment, our method only takes simple RGB\nimages captured by a drone as inputs to enable physically based and\nphotorealistic novel-view rendering, relighting, and editing. However, a\nreal-world facade usually has complex appearances ranging from diffuse rocks\nwith subtle details to large-area glass windows with specular reflections,\nmaking it hard to attend to everything. As a result, previous methods can\npreserve the geometry details but fail to reconstruct smooth glass windows, or\nvice versa. In order to address this challenge, we introduce three spatial- and\nsemantic-adaptive optimization strategies, including a semantic regularization\napproach based on zero-shot segmentation techniques to improve material\nconsistency, a frequency-aware geometry regularization to balance surface\nsmoothness and details in different surfaces, and a visibility probe-based\nscheme to enable efficient modeling of the local lighting in large-scale\noutdoor environments. In addition, we capture a real-world facade aerial 3D\nscanning image set and corresponding point clouds for training and\nbenchmarking. The experiments demonstrate the superior quality of our method on\nfacade holistic inverse rendering, novel view synthesis, and scene editing\ncompared to state-of-the-art baselines.\n","authors":["Zixuan Xie","Rengan Xie","Rong Li","Kai Huang","Pengju Qiao","Jingsen Zhu","Xu Yin","Qi Ye","Wei Hua","Yuchi Huo","Hujun Bao"],"pdf_url":"https://arxiv.org/pdf/2311.11825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01518v3","updated":"2024-04-08T05:09:19Z","published":"2024-04-01T22:53:47Z","title":"Temporally Consistent Unbalanced Optimal Transport for Unsupervised\n Action Segmentation","summary":" We propose a novel approach to the action segmentation task for long,\nuntrimmed videos, based on solving an optimal transport problem. By encoding a\ntemporal consistency prior into a Gromov-Wasserstein problem, we are able to\ndecode a temporally consistent segmentation from a noisy affinity/matching cost\nmatrix between video frames and action classes. Unlike previous approaches, our\nmethod does not require knowing the action order for a video to attain temporal\nconsistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can\nbe efficiently solved on GPUs using a few iterations of projected mirror\ndescent. We demonstrate the effectiveness of our method in an unsupervised\nlearning setting, where our method is used to generate pseudo-labels for\nself-training. 
We evaluate our segmentation approach and unsupervised learning\npipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly\ndatasets, yielding state-of-the-art results for the unsupervised video action\nsegmentation task.\n","authors":["Ming Xu","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2404.01518v3.pdf","comment":"Accepted to CVPR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2404.05196v1","updated":"2024-04-08T04:53:29Z","published":"2024-04-08T04:53:29Z","title":"HSViT: Horizontally Scalable Vision Transformer","summary":" While the Vision Transformer (ViT) architecture gains prominence in computer\nvision and attracts significant attention from multimedia communities, its\ndeficiency in prior knowledge (inductive bias) regarding shift, scale, and\nrotational invariance necessitates pre-training on large-scale datasets.\nFurthermore, the growing layers and parameters in both ViT and convolutional\nneural networks (CNNs) impede their applicability to mobile multimedia\nservices, primarily owing to the constrained computational resources on edge\ndevices. To mitigate the aforementioned challenges, this paper introduces a\nnovel horizontally scalable vision transformer (HSViT). Specifically, a novel\nimage-level feature embedding allows ViT to better leverage the inductive bias\ninherent in the convolutional layers. Based on this, an innovative horizontally\nscalable architecture is designed, which reduces the number of layers and\nparameters of the models while facilitating collaborative training and\ninference of ViT models across multiple nodes. The experimental results depict\nthat, without pre-training on large-scale datasets, HSViT achieves up to 10%\nhigher top-1 accuracy than state-of-the-art schemes, ascertaining its superior\npreservation of inductive bias. The code is available at\nhttps://github.com/xuchenhao001/HSViT.\n","authors":["Chenhao Xu","Chang-Tsun Li","Chee Peng Lim","Douglas Creighton"],"pdf_url":"https://arxiv.org/pdf/2404.05196v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05187v1","updated":"2024-04-08T04:27:36Z","published":"2024-04-08T04:27:36Z","title":"LGSDF: Continual Global Learning of Signed Distance Fields Aided by\n Local Updating","summary":" Implicit reconstruction of ESDF (Euclidean Signed Distance Field) involves\ntraining a neural network to regress the signed distance from any point to the\nnearest obstacle, which has the advantages of lightweight storage and\ncontinuous querying. However, existing algorithms usually rely on conflicting\nraw observations as training data, resulting in poor map performance. In this\npaper, we propose LGSDF, an ESDF continual Global learning algorithm aided by\nLocal updating. At the front end, axis-aligned grids are dynamically updated by\npre-processed sensor observations, where incremental fusion alleviates\nestimation error caused by limited viewing directions. At the back end, a\nrandomly initialized implicit ESDF neural network performs continual\nself-supervised learning guided by these grids to generate smooth and\ncontinuous maps. The results on multiple scenes show that LGSDF can construct\nmore accurate ESDF maps and meshes compared with SOTA (State Of The Art)\nexplicit and implicit mapping algorithms. 
The source code of LGSDF is publicly\navailable at https://github.com/BIT-DYN/LGSDF.\n","authors":["Yufeng Yue","Yinan Deng","Jiahui Wang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2404.05187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05183v1","updated":"2024-04-08T04:17:27Z","published":"2024-04-08T04:17:27Z","title":"Progressive Alignment with VLM-LLM Feature to Augment Defect\n Classification for the ASE Dataset","summary":" Traditional defect classification approaches face two barriers.\n(1) Insufficient training data and unstable data quality. Collecting sufficient\ndefective samples is expensive and time-consuming, consequently leading to dataset\nvariance. This introduces difficulties in recognition and learning. (2)\nOver-dependence on the visual modality. When the image pattern and texture are\nmonotonic for all defect classes in a given dataset, the performance of\nconventional AOI systems cannot be guaranteed. In scenarios where image quality\nis compromised due to mechanical failures or when defect information is\ninherently difficult to discern, the performance of deep models cannot be\nguaranteed. The main question is, \"how can these two problems be solved when they\noccur at the same time?\" A feasible strategy is to explore other features\nwithin the dataset and combine an eminent vision-language model (VLM) and\nlarge language model (LLM) with their astonishing zero-shot capability. In this\nwork, we propose the special ASE dataset for defect classification, which includes rich data\ndescriptions recorded with each image, although the defect features are difficult\nto learn directly. Second, we present VLM-LLM prompting for\ndefect classification on the proposed ASE dataset to activate extra-modality\nfeatures from images and enhance performance. Then, we design a novel\nprogressive feature alignment (PFA) block to refine image-text features and\nalleviate the difficulty of alignment under the few-shot scenario. Finally, the\nproposed cross-modality attention fusion (CMAF) module can effectively fuse\nfeatures from different modalities. Experimental results demonstrate our method's\neffectiveness over several defect classification methods on the ASE dataset.\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Chun-Hung Sun","Kuang-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2404.05183v1.pdf","comment":"MULA 2024"},{"id":"http://arxiv.org/abs/2404.05181v1","updated":"2024-04-08T04:13:35Z","published":"2024-04-08T04:13:35Z","title":"Adaptive Learning for Multi-view Stereo Reconstruction","summary":" Deep learning has recently demonstrated its excellent performance on the task\nof multi-view stereo (MVS). However, loss functions applied for deep MVS are\nrarely studied. In this paper, we first analyze existing loss functions'\nproperties for deep depth-based MVS approaches. Regression-based loss leads to\ninaccurate continuous results by computing mathematical expectation, while\nclassification-based loss outputs discretized depth values. To this end, we\nthen propose a novel loss function, named adaptive Wasserstein loss, which is\nable to narrow down the difference between the true and predicted probability\ndistributions of depth. Besides, a simple but effective offset module is\nintroduced to better achieve sub-pixel prediction accuracy. 
Extensive\nexperiments on different benchmarks, including DTU, Tanks and Temples and\nBlendedMVS, show that the proposed method with the adaptive Wasserstein loss\nand the offset module achieves state-of-the-art performance.\n","authors":["Qinglu Min","Jie Zhao","Zhihao Zhang","Chen Min"],"pdf_url":"https://arxiv.org/pdf/2404.05181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05180v1","updated":"2024-04-08T04:10:50Z","published":"2024-04-08T04:10:50Z","title":"GloSoFarID: Global multispectral dataset for Solar Farm IDentification\n in satellite imagery","summary":" Solar Photovoltaic (PV) technology is increasingly recognized as a pivotal\nsolution in the global pursuit of clean and renewable energy. This technology\naddresses the urgent need for sustainable energy alternatives by converting\nsolar power into electricity without greenhouse gas emissions. It not only\ncurtails global carbon emissions but also reduces reliance on finite,\nnon-renewable energy sources. In this context, monitoring solar panel farms\nbecomes essential for understanding and facilitating the worldwide shift toward\nclean energy. This study contributes to this effort by developing the first\ncomprehensive global dataset of multispectral satellite imagery of solar panel\nfarms. This dataset is intended to form the basis for training robust machine\nlearning models, which can accurately map and analyze the expansion and\ndistribution of solar panel farms globally. The insights gained from this\nendeavor will be instrumental in guiding informed decision-making for a\nsustainable energy future. https://github.com/yzyly1992/GloSoFarID\n","authors":["Zhiyuan Yang","Ryan Rad"],"pdf_url":"https://arxiv.org/pdf/2404.05180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05169v1","updated":"2024-04-08T03:33:01Z","published":"2024-04-08T03:33:01Z","title":"QMix: Quality-aware Learning with Mixed Noise for Robust Retinal Disease\n Diagnosis","summary":" Due to the complexity of medical image acquisition and the difficulty of\nannotation, medical image datasets inevitably contain noise. Noisy data with\nwrong labels affects the robustness and generalization ability of deep neural\nnetworks. Previous noise learning methods mainly considered noise arising from\nimages being mislabeled, i.e. label noise, assuming that all mislabeled images\nare of high image quality. However, medical images are prone to suffering\nextreme quality issues, i.e. data noise, where discriminative visual features\nare missing for disease diagnosis. In this paper, we propose a noise learning\nframework, termed as QMix, that learns a robust disease diagnosis model under\nmixed noise. QMix alternates between sample separation and quality-aware\nsemisupervised training in each training epoch. In the sample separation phase,\nwe design a joint uncertainty-loss criterion to effectively separate (1)\ncorrectly labeled images; (2) mislabeled images with high quality and (3)\nmislabeled images with low quality. In the semi-supervised training phase, we\ntrain a disease diagnosis model to learn robust feature representation from the\nseparated samples. Specifically, we devise a sample-reweighing loss to mitigate\nthe effect of mislabeled images with low quality during training. Meanwhile, a\ncontrastive enhancement loss is proposed to further distinguish mislabeled\nimages with low quality from correctly labeled images. 
QMix achieved\nstate-of-the-art disease diagnosis performance on five public retinal image\ndatasets and exhibited substantial improvement on robustness against mixed\nnoise.\n","authors":["Junlin Hou","Jilan Xu","Rui Feng","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05163v1","updated":"2024-04-08T03:06:19Z","published":"2024-04-08T03:06:19Z","title":"Semantic Flow: Learning Semantic Field of Dynamic Scenes from Monocular\n Videos","summary":" In this work, we pioneer Semantic Flow, a neural semantic representation of\ndynamic scenes from monocular videos. In contrast to previous NeRF methods that\nreconstruct dynamic scenes from the colors and volume densities of individual\npoints, Semantic Flow learns semantics from continuous flows that contain rich\n3D motion information. As there is 2D-to-3D ambiguity problem in the viewing\ndirection when extracting 3D flow features from 2D video frames, we consider\nthe volume densities as opacity priors that describe the contributions of flow\nfeatures to the semantics on the frames. More specifically, we first learn a\nflow network to predict flows in the dynamic scene, and propose a flow feature\naggregation module to extract flow features from video frames. Then, we propose\na flow attention module to extract motion information from flow features, which\nis followed by a semantic network to output semantic logits of flows. We\nintegrate the logits with volume densities in the viewing direction to\nsupervise the flow features with semantic labels on video frames. Experimental\nresults show that our model is able to learn from multiple dynamic scenes and\nsupports a series of new tasks such as instance-level scene editing, semantic\ncompletions, dynamic scene tracking and semantic adaption on novel scenes.\nCodes are available at https://github.com/tianfr/Semantic-Flow/.\n","authors":["Fengrui Tian","Yueqi Duan","Angtian Wang","Jianfei Guo","Shaoyi Du"],"pdf_url":"https://arxiv.org/pdf/2404.05163v1.pdf","comment":"Accepted by ICLR 2024, Codes are available at\n https://github.com/tianfr/Semantic-Flow/"},{"id":"http://arxiv.org/abs/2311.08393v3","updated":"2024-04-08T02:57:55Z","published":"2023-11-14T18:53:28Z","title":"MVSA-Net: Multi-View State-Action Recognition for Robust and Deployable\n Trajectory Generation","summary":" The learn-from-observation (LfO) paradigm is a human-inspired mode for a\nrobot to learn to perform a task simply by watching it being performed. LfO can\nfacilitate robot integration on factory floors by minimizing disruption and\nreducing tedious programming. A key component of the LfO pipeline is a\ntransformation of the depth camera frames to the corresponding task state and\naction pairs, which are then relayed to learning techniques such as imitation\nor inverse reinforcement learning for understanding the task parameters. While\nseveral existing computer vision models analyze videos for activity\nrecognition, SA-Net specifically targets robotic LfO from RGB-D data. However,\nSA-Net and many other models analyze frame data captured from a single\nviewpoint. Their analysis is therefore highly sensitive to occlusions of the\nobserved task, which are frequent in deployments. An obvious way of reducing\nocclusions is to simultaneously observe the task from multiple viewpoints and\nsynchronously fuse the multiple streams in the model. 
Toward this, we present\nmulti-view SA-Net, which generalizes the SA-Net model to allow the perception\nof multiple viewpoints of the task activity, integrate them, and better\nrecognize the state and action in each frame. Performance evaluations on two\ndistinct domains establish that MVSA-Net recognizes the state-action pairs\nunder occlusion more accurately compared to single-view MVSA-Net and other\nbaselines. Our ablation studies further evaluate its performance under\ndifferent ambient conditions and establish the contribution of the architecture\ncomponents. As such, MVSA-Net offers a significantly more robust and deployable\nstate-action trajectory generation compared to previous methods.\n","authors":["Ehsan Asali","Prashant Doshi","Jin Sun"],"pdf_url":"https://arxiv.org/pdf/2311.08393v3.pdf","comment":"Presented at Deployable AI Workshop at AAAI-2024 and 'Towards\n Reliable and Deployable Learning-Based Robotic Systems' Workshop at CoRL2023"},{"id":"http://arxiv.org/abs/2403.05805v2","updated":"2024-04-08T02:47:54Z","published":"2024-03-09T05:50:32Z","title":"And Then the Hammer Broke: Reflections on Machine Ethics from Feminist\n Philosophy of Science","summary":" Vision is an important metaphor in ethical and political questions of\nknowledge. The feminist philosopher Donna Haraway points out the ``perverse''\nnature of an intrusive, alienating, all-seeing vision (to which we might cry\nout ``stop looking at me!''), but also encourages us to embrace the embodied\nnature of sight and its promises for genuinely situated knowledge. Current\ntechnologies of machine vision -- surveillance cameras, drones (for war or\nrecreation), iPhone cameras -- are usually construed as instances of the former\nrather than the latter, and for good reasons. However, although in no way\nattempting to diminish the real suffering these technologies have brought about\nin the world, I make the case for understanding technologies of computer vision\nas material instances of embodied seeing and situated knowing. Furthermore,\nborrowing from Iris Murdoch's concept of moral vision, I suggest that these\ntechnologies direct our labor towards self-reflection in ethically significant\nways. My approach draws upon paradigms in computer vision research,\nphenomenology, and feminist epistemology. Ultimately, this essay is an argument\nfor directing more philosophical attention from merely criticizing technologies\nof vision as ethically deficient towards embracing them as complex,\nmethodologically and epistemologically important objects.\n","authors":["Andre Ye"],"pdf_url":"https://arxiv.org/pdf/2403.05805v2.pdf","comment":"Pacific University Philosophy Conference"},{"id":"http://arxiv.org/abs/2403.03954v3","updated":"2024-04-08T02:46:38Z","published":"2024-03-06T18:58:49Z","title":"3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple\n 3D Representations","summary":" Imitation learning provides an efficient way to teach robots dexterous\nskills; however, learning complex skills robustly and generalizablely usually\nconsumes large amounts of human demonstrations. To tackle this challenging\nproblem, we present 3D Diffusion Policy (DP3), a novel visual imitation\nlearning approach that incorporates the power of 3D visual representations into\ndiffusion policies, a class of conditional action generative models. The core\ndesign of DP3 is the utilization of a compact 3D visual representation,\nextracted from sparse point clouds with an efficient point encoder. 
In our\nexperiments involving 72 simulation tasks, DP3 successfully handles most tasks\nwith just 10 demonstrations and surpasses baselines with a 24.2% relative\nimprovement. In 4 real robot tasks, DP3 demonstrates precise control with a\nhigh success rate of 85%, given only 40 demonstrations of each task, and shows\nexcellent generalization abilities in diverse aspects, including space,\nviewpoint, appearance, and instance. Interestingly, in real robot experiments,\nDP3 rarely violates safety requirements, in contrast to baseline methods which\nfrequently do, necessitating human intervention. Our extensive evaluation\nhighlights the critical importance of 3D representations in real-world robot\nlearning. Videos, code, and data are available on\nhttps://3d-diffusion-policy.github.io .\n","authors":["Yanjie Ze","Gu Zhang","Kangning Zhang","Chenyuan Hu","Muhan Wang","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2403.03954v3.pdf","comment":"Videos, code, and data: https://3d-diffusion-policy.github.io"},{"id":"http://arxiv.org/abs/2404.00989v2","updated":"2024-04-08T02:37:25Z","published":"2024-04-01T08:34:42Z","title":"360+x: A Panoptic Multi-modal Scene Understanding Dataset","summary":" Human perception of the world is shaped by a multitude of viewpoints and\nmodalities. While many existing datasets focus on scene understanding from a\ncertain perspective (e.g. egocentric or third-person views), our dataset offers\na panoptic perspective (i.e. multiple viewpoints with multiple data\nmodalities). Specifically, we encapsulate third-person panoramic and front\nviews, as well as egocentric monocular/binocular views with rich modalities\nincluding video, multi-channel audio, directional binaural delay, location data\nand textual scene descriptions within each scene captured, presenting\ncomprehensive observation of the world. Figure 1 offers a glimpse of all 28\nscene categories of our 360+x dataset. To the best of our knowledge, this is\nthe first database that covers multiple viewpoints with multiple data\nmodalities to mimic how daily information is accessed in the real world.\nThrough our benchmark analysis, we presented 5 different scene understanding\ntasks on the proposed 360+x dataset to evaluate the impact and benefit of each\ndata modality and perspective in panoptic scene understanding. We hope this\nunique dataset could broaden the scope of comprehensive scene understanding and\nencourage the community to approach these problems from more diverse\nperspectives.\n","authors":["Hao Chen","Yuqi Hou","Chenyuan Qu","Irene Testini","Xiaohan Hong","Jianbo Jiao"],"pdf_url":"https://arxiv.org/pdf/2404.00989v2.pdf","comment":"CVPR 2024 (Oral Presentation), Project page:\n https://x360dataset.github.io/"},{"id":"http://arxiv.org/abs/2402.07819v2","updated":"2024-04-08T02:36:23Z","published":"2024-02-12T17:24:35Z","title":"A Benchmark Grocery Dataset of Realworld Point Clouds From Single View","summary":" Fine-grained grocery object recognition is an important computer vision\nproblem with broad applications in automatic checkout, in-store robotic\nnavigation, and assistive technologies for the visually impaired. Existing\ndatasets on groceries are mainly 2D images. Models trained on these datasets\nare limited to learning features from the regular 2D grids. 
While portable 3D\nsensors such as Kinect were commonly available for mobile phones, sensors such\nas LiDAR and TrueDepth, have recently been integrated into mobile phones.\nDespite the availability of mobile 3D sensors, there are currently no dedicated\nreal-world large-scale benchmark 3D datasets for grocery. In addition, existing\n3D datasets lack fine-grained grocery categories and have limited training\nsamples. Furthermore, collecting data by going around the object versus the\ntraditional photo capture makes data collection cumbersome. Thus, we introduce\na large-scale grocery dataset called 3DGrocery100. It constitutes 100 classes,\nwith a total of 87,898 3D point clouds created from 10,755 RGB-D single-view\nimages. We benchmark our dataset on six recent state-of-the-art 3D point cloud\nclassification models. Additionally, we also benchmark the dataset on few-shot\nand continual learning point cloud classification tasks. Project Page:\nhttps://bigdatavision.org/3DGrocery100/.\n","authors":["Shivanand Venkanna Sheshappanavar","Tejas Anvekar","Shivanand Kundargi","Yufan Wang","Chandra Kambhamettu"],"pdf_url":"https://arxiv.org/pdf/2402.07819v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02241v2","updated":"2024-04-08T02:06:37Z","published":"2024-04-02T18:59:39Z","title":"Linear Combination of Saved Checkpoints Makes Consistency and Diffusion\n Models Better","summary":" Diffusion Models (DM) and Consistency Models (CM) are two types of popular\ngenerative models with good generation quality on various tasks. When training\nDM and CM, intermediate weight checkpoints are not fully utilized and only the\nlast converged checkpoint is used. In this work, we find that high-quality\nmodel weights often lie in a basin which cannot be reached by SGD but can be\nobtained by proper checkpoint averaging. Based on these observations, we\npropose LCSC, a simple but effective and efficient method to enhance the\nperformance of DM and CM, by combining checkpoints along the training\ntrajectory with coefficients deduced from evolutionary search. We demonstrate\nthe value of LCSC through two use cases: $\\textbf{(a) Reducing training cost.}$\nWith LCSC, we only need to train DM/CM with fewer number of iterations and/or\nlower batch sizes to obtain comparable sample quality with the fully trained\nmodel. For example, LCSC achieves considerable training speedups for CM\n(23$\\times$ on CIFAR-10 and 15$\\times$ on ImageNet-64). $\\textbf{(b) Enhancing\npre-trained models.}$ Assuming full training is already done, LCSC can further\nimprove the generation quality or speed of the final converged models. For\nexample, LCSC achieves better performance using 1 number of function evaluation\n(NFE) than the base model with 2 NFE on consistency distillation, and decreases\nthe NFE of DM from 15 to 9 while maintaining the generation quality on\nCIFAR-10. Our code is available at\nhttps://github.com/imagination-research/LCSC.\n","authors":["Enshu Liu","Junyi Zhu","Zinan Lin","Xuefei Ning","Matthew B. Blaschko","Sergey Yekhanin","Shengen Yan","Guohao Dai","Huazhong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.02241v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05145v1","updated":"2024-04-08T02:02:15Z","published":"2024-04-08T02:02:15Z","title":"UniMix: Towards Domain Adaptive and Generalizable LiDAR Semantic\n Segmentation in Adverse Weather","summary":" LiDAR semantic segmentation (LSS) is a critical task in autonomous driving\nand has achieved promising progress. 
However, prior LSS methods are\nconventionally investigated and evaluated on datasets within the same domain in\nclear weather. The robustness of LSS models in unseen scenes and all weather\nconditions is crucial for ensuring safety and reliability in real applications.\nTo this end, we propose UniMix, a universal method that enhances the\nadaptability and generalizability of LSS models. UniMix first leverages\nphysically valid adverse weather simulation to construct a Bridge Domain, which\nserves to bridge the domain gap between the clear weather scenes and the\nadverse weather scenes. Then, a Universal Mixing operator is defined regarding\nspatial, intensity, and semantic distributions to create the intermediate\ndomain with mixed samples from given domains. Integrating the proposed two\ntechniques into a teacher-student framework, UniMix efficiently mitigates the\ndomain gap and enables LSS models to learn weather-robust and domain-invariant\nrepresentations. We devote UniMix to two main setups: 1) unsupervised domain\nadaption, adapting the model from the clear weather source domain to the\nadverse weather target domain; 2) domain generalization, learning a model that\ngeneralizes well to unseen scenes in adverse weather. Extensive experiments\nvalidate the effectiveness of UniMix across different tasks and datasets, all\nachieving superior performance over state-of-the-art methods. The code will be\nreleased.\n","authors":["Haimei Zhao","Jing Zhang","Zhuo Chen","Shanshan Zhao","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2404.05145v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05144v1","updated":"2024-04-08T01:55:28Z","published":"2024-04-08T01:55:28Z","title":"Enhancing Clinical Efficiency through LLM: Discharge Note Generation for\n Cardiac Patients","summary":" Medical documentation, including discharge notes, is crucial for ensuring\npatient care quality, continuity, and effective medical communication. However,\nthe manual creation of these documents is not only time-consuming but also\nprone to inconsistencies and potential errors. The automation of this\ndocumentation process using artificial intelligence (AI) represents a promising\narea of innovation in healthcare. This study directly addresses the\ninefficiencies and inaccuracies in creating discharge notes manually,\nparticularly for cardiac patients, by employing AI techniques, specifically\nlarge language model (LLM). Utilizing a substantial dataset from a cardiology\ncenter, encompassing wide-ranging medical records and physician assessments,\nour research evaluates the capability of LLM to enhance the documentation\nprocess. Among the various models assessed, Mistral-7B distinguished itself by\naccurately generating discharge notes that significantly improve both\ndocumentation efficiency and the continuity of care for patients. These notes\nunderwent rigorous qualitative evaluation by medical expert, receiving high\nmarks for their clinical relevance, completeness, readability, and contribution\nto informed decision-making and care planning. Coupled with quantitative\nanalyses, these results confirm Mistral-7B's efficacy in distilling complex\nmedical information into concise, coherent summaries. Overall, our findings\nilluminate the considerable promise of specialized LLM, such as Mistral-7B, in\nrefining healthcare documentation workflows and advancing patient care. 
This\nstudy lays the groundwork for further integrating advanced AI technologies in\nhealthcare, demonstrating their potential to revolutionize patient\ndocumentation and support better care outcomes.\n","authors":["HyoJe Jung","Yunha Kim","Heejung Choi","Hyeram Seo","Minkyoung Kim","JiYe Han","Gaeun Kee","Seohyun Park","Soyoung Ko","Byeolhee Kim","Suyeon Kim","Tae Joon Jun","Young-Hak Kim"],"pdf_url":"https://arxiv.org/pdf/2404.05144v1.pdf","comment":"10 pages, 1 figure, 3 tables, conference"},{"id":"http://arxiv.org/abs/2404.05139v1","updated":"2024-04-08T01:38:43Z","published":"2024-04-08T01:38:43Z","title":"Better Monocular 3D Detectors with LiDAR from the Past","summary":" Accurate 3D object detection is crucial to autonomous driving. Though\nLiDAR-based detectors have achieved impressive performance, the high cost of\nLiDAR sensors precludes their widespread adoption in affordable vehicles.\nCamera-based detectors are cheaper alternatives but often suffer inferior\nperformance compared to their LiDAR-based counterparts due to inherent depth\nambiguities in images. In this work, we seek to improve monocular 3D detectors\nby leveraging unlabeled historical LiDAR data. Specifically, at inference time,\nwe assume that the camera-based detectors have access to multiple unlabeled\nLiDAR scans from past traversals at locations of interest (potentially from\nother high-end vehicles equipped with LiDAR sensors). Under this setup, we\npropose a novel, simple, and end-to-end trainable framework, termed\nAsyncDepth, to effectively extract relevant features from asynchronous LiDAR\ntraversals of the same location for monocular 3D detectors. We show consistent\nand significant performance gain (up to 9 AP) across multiple state-of-the-art\nmodels and datasets with a negligible additional latency of 9.66 ms and a small\nstorage cost.\n","authors":["Yurong You","Cheng Perng Phoo","Carlos Andres Diaz-Ruiz","Katie Z Luo","Wei-Lun Chao","Mark Campbell","Bharath Hariharan","Kilian Q Weinberger"],"pdf_url":"https://arxiv.org/pdf/2404.05139v1.pdf","comment":"Accepted by ICRA 2022. The code can be found at\n https://github.com/YurongYou/AsyncDepth"},{"id":"http://arxiv.org/abs/2404.05136v1","updated":"2024-04-08T01:29:10Z","published":"2024-04-08T01:29:10Z","title":"Self-Supervised Multi-Object Tracking with Path Consistency","summary":" In this paper, we propose a novel concept of path consistency to learn robust\nobject matching without using manual object identity supervision. Our key idea\nis that, to track an object through frames, we can obtain multiple different\nassociation results from a model by varying the frames it can observe, i.e.,\nskipping frames in observation. As the differences in observations do not alter\nthe identities of objects, the obtained association results should be\nconsistent. Based on this rationale, we generate multiple observation paths,\neach specifying a different set of frames to be skipped, and formulate the Path\nConsistency Loss that enforces that the association results are consistent across\ndifferent observation paths. We use the proposed loss to train our object\nmatching model with only self-supervision. 
By extensive experiments on three\ntracking datasets (MOT17, PersonPath22, KITTI), we demonstrate that our method\noutperforms existing unsupervised methods with consistent margins on various\nevaluation metrics, and even achieves performance close to supervised methods.\n","authors":["Zijia Lu","Bing Shuai","Yanbei Chen","Zhenlin Xu","Davide Modolo"],"pdf_url":"https://arxiv.org/pdf/2404.05136v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05129v1","updated":"2024-04-08T01:14:09Z","published":"2024-04-08T01:14:09Z","title":"Image-based Agarwood Resinous Area Segmentation using Deep Learning","summary":" The manual extraction method of Agarwood resinous compound is laborious work,\nrequires skilled workers, and is subject to human errors. Commercial Agarwood\nindustries have been actively exploring using Computer Numerical Control (CNC)\nmachines to replace human effort for this particular task. The CNC machine\naccepts a G-code script produced from a binary image in which the wood region\nthat needs to be chiselled off is marked with (0, 0, 0) as its RGB value.\nRather than requiring a human expert to perform the region marking, we propose\nusing a Deep learning image segmentation method instead. Our setup involves a\ncamera that captures the cross-section image and then passes the image file to\na computer. The computer performs the automated image segmentation and feeds\nthe CNC machine with a G-code script. In this article, we report the initial\nsegmentation results achieved using a state-of-the-art Deep learning\nsegmentation method and discuss potential improvements to refine the\nsegmentation accuracy.\n","authors":["Irwandi Hipiny","Johari Abdullah","Noor Alamshah Bolhassan"],"pdf_url":"https://arxiv.org/pdf/2404.05129v1.pdf","comment":"15 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2207.01200v4","updated":"2024-04-08T01:11:22Z","published":"2022-07-04T05:03:10Z","title":"S$^{5}$Mars: Semi-Supervised Learning for Mars Semantic Segmentation","summary":" Deep learning has become a powerful tool for Mars exploration. Mars terrain\nsemantic segmentation is an important Martian vision task, which is the base of\nrover autonomous planning and safe driving. However, there is a lack of\nsufficient detailed and high-confidence data annotations, which are exactly\nrequired by most deep learning methods to obtain a good model. To address this\nproblem, we propose our solution from the perspective of joint data and method\ndesign. We first present a new dataset S5Mars for Semi-SuperviSed learning on\nMars Semantic Segmentation, which contains 6K high-resolution images and is\nsparsely annotated based on confidence, ensuring the high quality of labels.\nThen to learn from this sparse data, we propose a semi-supervised learning\n(SSL) framework for Mars image semantic segmentation, to learn representations\nfrom limited labeled data. Different from the existing SSL methods which are\nmostly targeted at the Earth image data, our method takes into account Mars\ndata characteristics. Specifically, we first investigate the impact of current\nwidely used natural image augmentations on Mars images. Based on the analysis,\nwe then propose two novel and effective augmentations for SSL of Mars\nsegmentation, AugIN and SAM-Mix, which serve as strong augmentations to boost\nthe model performance. 
Meanwhile, to fully leverage the unlabeled data, we\nintroduce a soft-to-hard consistency learning strategy, learning from different\ntargets based on prediction confidence. Experimental results show that our\nmethod can outperform state-of-the-art SSL approaches remarkably. Our proposed\ndataset is available at https://jhang2020.github.io/S5Mars.github.io/.\n","authors":["Jiahang Zhang","Lilang Lin","Zejia Fan","Wenjing Wang","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2207.01200v4.pdf","comment":"IEEE TGRS 2024"},{"id":"http://arxiv.org/abs/2404.05128v1","updated":"2024-04-08T01:08:41Z","published":"2024-04-08T01:08:41Z","title":"Improving Deep Learning Predictions with Simulated Images, and Vice\n Versa","summary":" Artificial neural networks are often used to identify features of crop\nplants. However, training their models requires many annotated images, which\ncan be expensive and time-consuming to acquire. Procedural models of plants,\nsuch as those developed with Lindenmayer-systems (L-systems) can be created to\nproduce visually realistic simulations, and hence images of plant simulations,\nwhere annotations are implicitly known. These synthetic images can either\naugment or completely replace real images in training neural networks for\nphenotyping tasks. In this paper, we systematically vary amounts of real and\nsynthetic images used for training in both maize and canola to better\nunderstand situations where synthetic images generated from L-systems can help\nprediction on real images. This work also explores the degree to which realism\nin the synthetic images improves prediction. Furthermore, we see how neural\nnetwork predictions can be used to help calibrate L-systems themselves,\ncreating a feedback loop.\n","authors":["Nazifa Azam Khan","Mikolaj Cieslak","Ian McQuillan"],"pdf_url":"https://arxiv.org/pdf/2404.05128v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03202v2","updated":"2024-04-08T01:05:57Z","published":"2024-04-04T05:10:26Z","title":"OmniGS: Omnidirectional Gaussian Splatting for Fast Radiance Field\n Reconstruction using Omnidirectional Images","summary":" Photorealistic reconstruction relying on 3D Gaussian Splatting has shown\npromising potential in robotics. However, the current 3D Gaussian Splatting\nsystem only supports radiance field reconstruction using undistorted\nperspective images. In this paper, we present OmniGS, a novel omnidirectional\nGaussian splatting system, to take advantage of omnidirectional images for fast\nradiance field reconstruction. Specifically, we conduct a theoretical analysis\nof spherical camera model derivatives in 3D Gaussian Splatting. According to\nthe derivatives, we then implement a new GPU-accelerated omnidirectional\nrasterizer that directly splats 3D Gaussians onto the equirectangular screen\nspace for omnidirectional image rendering. As a result, we realize\ndifferentiable optimization of the radiance field without the requirement of\ncube-map rectification or tangent-plane approximation. Extensive experiments\nconducted in egocentric and roaming scenarios demonstrate that our method\nachieves state-of-the-art reconstruction quality and high rendering speed using\nomnidirectional images. 
To benefit the research community, the code will be\nmade publicly available once the paper is published.\n","authors":["Longwei Li","Huajian Huang","Sai-Kit Yeung","Hui Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.03202v2.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.05111v1","updated":"2024-04-08T00:13:05Z","published":"2024-04-08T00:13:05Z","title":"Class Similarity Transition: Decoupling Class Similarities and Imbalance\n from Generalized Few-shot Segmentation","summary":" In Generalized Few-shot Segmentation (GFSS), a model is trained with a large\ncorpus of base class samples and then adapted on limited samples of novel\nclasses. This paper focuses on the relevance between base and novel classes,\nand improves GFSS in two aspects: 1) mining the similarity between base and\nnovel classes to promote the learning of novel classes, and 2) mitigating the\nclass imbalance issue caused by the volume difference between the support set\nand the training set. Specifically, we first propose a similarity transition\nmatrix to guide the learning of novel classes with base class knowledge. Then,\nwe leverage the Label-Distribution-Aware Margin (LDAM) loss and Transductive\nInference to the GFSS task to address the problem of class imbalance as well as\noverfitting the support set. In addition, by extending the probability\ntransition matrix, the proposed method can mitigate the catastrophic forgetting\nof base classes when learning novel classes. With a simple training phase, our\nproposed method can be applied to any segmentation network trained on base\nclasses. We validated our methods on the adapted version of OpenEarthMap.\nCompared to existing GFSS baselines, our method excels them all from 3% to 7%\nand ranks second in the OpenEarthMap Land Cover Mapping Few-Shot Challenge at\nthe completion of this paper. Code:\nhttps://github.com/earth-insights/ClassTrans\n","authors":["Shihong Wang","Ruixun Liu","Kaiyu Li","Jiawei Jiang","Xiangyong Cao"],"pdf_url":"https://arxiv.org/pdf/2404.05111v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.09250v2","updated":"2024-04-08T22:40:01Z","published":"2023-12-14T18:59:36Z","title":"Single Mesh Diffusion Models with Field Latents for Texture Generation","summary":" We introduce a framework for intrinsic latent diffusion models operating\ndirectly on the surfaces of 3D shapes, with the goal of synthesizing\nhigh-quality textures. Our approach is underpinned by two contributions: field\nlatents, a latent representation encoding textures as discrete vector fields on\nthe mesh vertices, and field latent diffusion models, which learn to denoise a\ndiffusion process in the learned latent space on the surface. We consider a\nsingle-textured-mesh paradigm, where our models are trained to generate\nvariations of a given texture on a mesh. We show the synthesized textures are\nof superior fidelity compared those from existing single-textured-mesh\ngenerative models. Our models can also be adapted for user-controlled editing\ntasks such as inpainting and label-guided generation. The efficacy of our\napproach is due in part to the equivariance of our proposed framework under\nisometries, allowing our models to seamlessly reproduce details across locally\nsimilar regions and opening the door to a notion of generative texture\ntransfer.\n","authors":["Thomas W. Mitchel","Carlos Esteves","Ameesh Makadia"],"pdf_url":"https://arxiv.org/pdf/2312.09250v2.pdf","comment":"CVPR 2024. 
Code and additional visualizations available:\n https://single-mesh-diffusion.github.io/"},{"id":"http://arxiv.org/abs/2311.12539v2","updated":"2024-04-08T22:19:23Z","published":"2023-11-21T11:33:15Z","title":"GMISeg: General Medical Image Segmentation without Re-Training","summary":" Although deep learning models have become the main method for medical image\nsegmentation, they often cannot be extended to unknown segmentation tasks\ninvolving new anatomical structures, image shapes, or labels. For new\nsegmentation tasks, researchers often have to retrain or fine-tune the model,\nwhich is time-consuming and poses a significant obstacle to clinical\nresearchers, who often lack the resources and professional knowledge to train\nneural networks. Therefore, we proposed a general method that can solve unknown\nmedical image segmentation tasks without requiring additional training. Given\nan example set of images and prompts for defining new segmentation tasks,\nGMISeg applies a novel low-rank fine-tuning strategy based on the proposed\napproach to the SAM (Segment Anything Model) image encoder, and works with the\nprompt encoder and mask decoder to fine-tune the labeled dataset without the\nneed for additional training. To achieve generalization of new tasks, we used\nmedical image datasets with different imaging modes for different parts. We\ntrained and generalized GMISeg on a different set of anatomical and imaging\nmodes using cardiac images on other site datasets. We have demonstrated that\nGMISeg outperforms the latest methods on unknown tasks and have conducted a\ncomprehensive analysis and summary of the important performance of the proposed\nmethod.\n","authors":["Jing Xu"],"pdf_url":"https://arxiv.org/pdf/2311.12539v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.11470v2","updated":"2024-04-08T22:01:32Z","published":"2022-10-20T17:59:54Z","title":"i-MAE: Are Latent Representations in Masked Autoencoders Linearly\n Separable?","summary":" Masked image modeling (MIM) has been recognized as a strong self-supervised\npre-training approach in the vision domain. However, the mechanism and\nproperties of the learned representations by such a scheme, as well as how to\nfurther enhance the representations are so far not well-explored. In this\npaper, we aim to explore an interactive Masked Autoencoders (i-MAE) framework\nto enhance the representation capability from two aspects: (1) employing a\ntwo-way image reconstruction and a latent feature reconstruction with\ndistillation loss to learn better features; (2) proposing a semantics-enhanced\nsampling strategy to boost the learned semantics in MAE. Upon the proposed\ni-MAE architecture, we can address two critical questions to explore the\nbehaviors of the learned representations in MAE: (1) Whether the separability\nof latent representations in Masked Autoencoders is helpful for model\nperformance? We study it by forcing the input as a mixture of two images\ninstead of one. (2) Whether we can enhance the representations in the latent\nfeature space by controlling the degree of semantics during sampling on Masked\nAutoencoders? To this end, we propose a sampling strategy within a mini-batch\nbased on the semantics of training samples to examine this aspect. Extensive\nexperiments are conducted on CIFAR-10/100, Tiny-ImageNet and ImageNet-1K to\nverify the observations we discovered. 
Furthermore, in addition to\nqualitatively analyzing the characteristics of the latent representations, we\nexamine the existence of linear separability and the degree of semantics in the\nlatent space by proposing two evaluation schemes. The surprising and consistent\nresults demonstrate that i-MAE is a superior framework design for understanding\nMAE frameworks, as well as achieving better representational ability. Code is\navailable at https://github.com/vision-learning-acceleration-lab/i-mae.\n","authors":["Kevin Zhang","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2210.11470v2.pdf","comment":"Project page: https://zhiqiangshen.com/projects/i-mae/"},{"id":"http://arxiv.org/abs/2404.03392v2","updated":"2024-04-08T21:26:47Z","published":"2024-04-04T11:49:56Z","title":"Two Tricks to Improve Unsupervised Segmentation Learning","summary":" We present two practical improvement techniques for unsupervised segmentation\nlearning. These techniques address limitations in the resolution and accuracy\nof predicted segmentation maps of recent state-of-the-art methods. Firstly, we\nleverage image post-processing techniques such as guided filtering to refine\nthe output masks, improving accuracy while avoiding substantial computational\ncosts. Secondly, we introduce a multi-scale consistency criterion, based on a\nteacher-student training scheme. This criterion matches segmentation masks\npredicted from regions of the input image extracted at different resolutions to\neach other. Experimental results on several benchmarks used in unsupervised\nsegmentation learning demonstrate the effectiveness of our proposed techniques.\n","authors":["Alp Eren Sari","Francesco Locatello","Paolo Favaro"],"pdf_url":"https://arxiv.org/pdf/2404.03392v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05872v1","updated":"2024-04-08T21:09:59Z","published":"2024-04-08T21:09:59Z","title":"TabConv: Low-Computation CNN Inference via Table Lookups","summary":" Convolutional Neural Networks (CNNs) have demonstrated remarkable ability\nthroughout the field of computer vision. However, CNN inference requires a\nlarge number of arithmetic operations, making them expensive to deploy in\nhardware. Current approaches alleviate this issue by developing\nhardware-supported, algorithmic processes to simplify spatial convolution\nfunctions. However, these methods still heavily rely on matrix multiplication,\nleading to significant computational overhead. To bridge the gap between\nhardware, algorithmic acceleration, and approximate matrix multiplication, we\npropose TabConv, a novel, table-based approximation for convolution to\nsignificantly reduce arithmetic operations during inference. Additionally, we\nintroduce a priority masking technique based on cosine similarity to select\nlayers for table-based approximation, thereby maintaining the model\nperformance. We evaluate our approach on popular CNNs: ResNet-18, ResNet-34,\nand NetworkInNetwork (NIN). 
TabConv preserves over 93% of the original model's\nperformance while reducing arithmetic operations by 36.5%, 25.8%, and 99.4% for\nResNet-18 on CIFAR-10, CIFAR-100, and MNIST, respectively, 35.6% and 99.3% for\nResNet-34 on CIFAR-10 and MNIST, and 98.9% for NIN on MNIST, achieving\nlow-computation inference.\n","authors":["Neelesh Gupta","Narayanan Kannan","Pengmiao Zhang","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/2404.05872v1.pdf","comment":"8 pages, Accepted at CF '24"},{"id":"http://arxiv.org/abs/2404.05862v1","updated":"2024-04-08T20:51:30Z","published":"2024-04-08T20:51:30Z","title":"Towards Improved Semiconductor Defect Inspection for high-NA EUVL based\n on SEMI-SuperYOLO-NAS","summary":" Due to potential pitch reduction, the semiconductor industry is adopting\nHigh-NA EUVL technology. However, its low depth of focus presents challenges\nfor High Volume Manufacturing. To address this, suppliers are exploring thinner\nphotoresists and new underlayers/hardmasks. These may suffer from poor SNR,\ncomplicating defect detection. Vision-based ML algorithms offer a promising\nsolution for semiconductor defect inspection. However, developing a robust ML\nmodel across various image resolutions without explicit training remains a\nchallenge for nano-scale defect inspection. This research's goal is to propose\na scale-invariant ADCD framework capable to upscale images, addressing this\nissue. We propose an improvised ADCD framework as SEMI-SuperYOLO-NAS, which\nbuilds upon the baseline YOLO-NAS architecture. This framework integrates a SR\nassisted branch to aid in learning HR features by the defect detection\nbackbone, particularly for detecting nano-scale defect instances from LR\nimages. Additionally, the SR-assisted branch can recursively generate upscaled\nimages from their corresponding downscaled counterparts, enabling defect\ndetection inference across various image resolutions without requiring explicit\ntraining. Moreover, we investigate improved data augmentation strategy aimed at\ngenerating diverse and realistic training datasets to enhance model\nperformance. We have evaluated our proposed approach using two original FAB\ndatasets obtained from two distinct processes and captured using two different\nimaging tools. Finally, we demonstrate zero-shot inference for our model on a\nnew, originating from a process condition distinct from the training dataset\nand possessing different Pitch characteristics. Experimental validation\ndemonstrates that our proposed ADCD framework aids in increasing the throughput\nof imaging tools for defect inspection by reducing the required image pixel\nresolutions.\n","authors":["Ying-Lin Chen","Jacob Deforce","Vic De Ridder","Bappaditya Dey","Victor Blanco","Sandip Halder","Philippe Leray"],"pdf_url":"https://arxiv.org/pdf/2404.05862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05849v1","updated":"2024-04-08T20:31:27Z","published":"2024-04-08T20:31:27Z","title":"Localizing Moments of Actions in Untrimmed Videos of Infants with Autism\n Spectrum Disorder","summary":" Autism Spectrum Disorder (ASD) presents significant challenges in early\ndiagnosis and intervention, impacting children and their families. With\nprevalence rates rising, there is a critical need for accessible and efficient\nscreening tools. 
Leveraging machine learning (ML) techniques, in particular\nTemporal Action Localization (TAL), holds promise for automating ASD screening.\nThis paper introduces a self-attention based TAL model designed to identify\nASD-related behaviors in infant videos. Unlike existing methods, our approach\nsimplifies complex modeling and emphasizes efficiency, which is essential for\npractical deployment in real-world scenarios. Importantly, this work\nunderscores the importance of developing computer vision methods capable of\noperating in naturilistic environments with little equipment control,\naddressing key challenges in ASD screening. This study is the first to conduct\nend-to-end temporal action localization in untrimmed videos of infants with\nASD, offering promising avenues for early intervention and support. We report\nbaseline results of behavior detection using our TAL model. We achieve 70%\naccuracy for look face, 79% accuracy for look object, 72% for smile and 65% for\nvocalization.\n","authors":["Halil Ismail Helvaci","Sen-ching Samson Cheung","Chen-Nee Chuah","Sally Ozonoff"],"pdf_url":"https://arxiv.org/pdf/2404.05849v1.pdf","comment":"7 pages, 2 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.05828v1","updated":"2024-04-08T19:46:20Z","published":"2024-04-08T19:46:20Z","title":"Privacy-Preserving Deep Learning Using Deformable Operators for Secure\n Task Learning","summary":" In the era of cloud computing and data-driven applications, it is crucial to\nprotect sensitive information to maintain data privacy, ensuring truly reliable\nsystems. As a result, preserving privacy in deep learning systems has become a\ncritical concern. Existing methods for privacy preservation rely on image\nencryption or perceptual transformation approaches. However, they often suffer\nfrom reduced task performance and high computational costs. To address these\nchallenges, we propose a novel Privacy-Preserving framework that uses a set of\ndeformable operators for secure task learning. Our method involves shuffling\npixels during the analog-to-digital conversion process to generate visually\nprotected data. Those are then fed into a well-known network enhanced with\ndeformable operators. Using our approach, users can achieve equivalent\nperformance to original images without additional training using a secret key.\nMoreover, our method enables access control against unauthorized users.\nExperimental results demonstrate the efficacy of our approach, showcasing its\npotential in cloud-based scenarios and privacy-sensitive applications.\n","authors":["Fabian Perez","Jhon Lopez","Henry Arguello"],"pdf_url":"https://arxiv.org/pdf/2404.05828v1.pdf","comment":"copyright 2024 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2403.04932v2","updated":"2024-04-08T19:45:32Z","published":"2024-03-07T22:39:02Z","title":"Divide and Conquer: High-Resolution Industrial Anomaly Detection via\n Memory Efficient Tiled Ensemble","summary":" Industrial anomaly detection is an important task within computer vision with\na wide range of practical use cases. 
The small size of anomalous regions in\nmany real-world datasets necessitates processing the images at a high\nresolution. This frequently poses significant challenges concerning memory\nconsumption during the model training and inference stages, leaving some\nexisting methods impractical for widespread adoption. To overcome this\nchallenge, we present the tiled ensemble approach, which reduces memory\nconsumption by dividing the input images into a grid of tiles and training a\ndedicated model for each tile location. The tiled ensemble is compatible with\nany existing anomaly detection model without the need for any modification of\nthe underlying architecture. By introducing overlapping tiles, we utilize the\nbenefits of traditional stacking ensembles, leading to further improvements in\nanomaly detection capabilities beyond high resolution alone. We perform a\ncomprehensive analysis using diverse underlying architectures, including Padim,\nPatchCore, FastFlow, and Reverse Distillation, on two standard anomaly\ndetection datasets: MVTec and VisA. Our method demonstrates a notable\nimprovement across setups while remaining within GPU memory constraints,\nconsuming only as much GPU memory as a single model needs to process a single\ntile.\n","authors":["Blaž Rolih","Dick Ameln","Ashwin Vaidya","Samet Akcay"],"pdf_url":"https://arxiv.org/pdf/2403.04932v2.pdf","comment":"To appear at CVPR 24 Visual Anomaly Detection Workshop. Research\n conducted during Google Summer of Code 2023 at OpenVINO (Intel). GSoC 2023\n page: https://summerofcode.withgoogle.com/archive/2023/projects/WUSjdxGl"},{"id":"http://arxiv.org/abs/2401.00896v2","updated":"2024-04-08T18:40:31Z","published":"2023-12-31T10:51:52Z","title":"TrailBlazer: Trajectory Control for Diffusion-Based Video Generation","summary":" Within recent approaches to text-to-video (T2V) generation, achieving\ncontrollability in the synthesized video is often a challenge. Typically, this\nissue is addressed by providing low-level per-frame guidance in the form of\nedge maps, depth maps, or an existing video to be altered. However, the process\nof obtaining such guidance can be labor-intensive. This paper focuses on\nenhancing controllability in video synthesis by employing straightforward\nbounding boxes to guide the subject in various ways, all without the need for\nneural network training, finetuning, optimization at inference time, or the use\nof pre-existing videos. Our algorithm, TrailBlazer, is constructed upon a\npre-trained (T2V) model, and easy to implement. The subject is directed by a\nbounding box through the proposed spatial and temporal attention map editing.\nMoreover, we introduce the concept of keyframing, allowing the subject\ntrajectory and overall appearance to be guided by both a moving bounding box\nand corresponding prompts, without the need to provide a detailed mask. The\nmethod is efficient, with negligible additional computation relative to the\nunderlying pre-trained model. Despite the simplicity of the bounding box\nguidance, the resulting motion is surprisingly natural, with emergent effects\nincluding perspective and movement toward the virtual camera as the box size\nincreases.\n","authors":["Wan-Duo Kurt Ma","J. P. Lewis","W. 
Bastiaan Kleijn"],"pdf_url":"https://arxiv.org/pdf/2401.00896v2.pdf","comment":"14 pages, 18 figures, Project Page:\n https://hohonu-vicml.github.io/Trailblazer.Page/"},{"id":"http://arxiv.org/abs/2404.05814v1","updated":"2024-04-08T18:36:18Z","published":"2024-04-08T18:36:18Z","title":"Towards Explainable Automated Neuroanatomy","summary":" We present a novel method for quantifying the microscopic structure of brain\ntissue. It is based on the automated recognition of interpretable features\nobtained by analyzing the shapes of cells. This contrasts with prevailing\nmethods of brain anatomical analysis in two ways. First, contemporary methods\nuse gray-scale values derived from smoothed version of the anatomical images,\nwhich dissipated valuable information from the texture of the images. Second,\ncontemporary analysis uses the output of black-box Convolutional Neural\nNetworks, while our system makes decisions based on interpretable features\nobtained by analyzing the shapes of individual cells. An important benefit of\nthis open-box approach is that the anatomist can understand and correct the\ndecisions made by the computer. Our proposed system can accurately localize and\nidentify existing brain structures. This can be used to align and coregistar\nbrains and will facilitate connectomic studies for reverse engineering of brain\ncircuitry.\n","authors":["Kui Qian","Litao Qiao","Beth Friedman","Edward O'Donnell","David Kleinfeld","Yoav Freund"],"pdf_url":"https://arxiv.org/pdf/2404.05814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05802v1","updated":"2024-04-08T18:05:24Z","published":"2024-04-08T18:05:24Z","title":"BatSort: Enhanced Battery Classification with Transfer Learning for\n Battery Sorting and Recycling","summary":" Battery recycling is a critical process for minimizing environmental harm and\nresource waste for used batteries. However, it is challenging, largely because\nsorting batteries is costly and hardly automated to group batteries based on\nbattery types. In this paper, we introduce a machine learning-based approach\nfor battery-type classification and address the daunting problem of data\nscarcity for the application. We propose BatSort which applies transfer\nlearning to utilize the existing knowledge optimized with large-scale datasets\nand customizes ResNet to be specialized for classifying battery types. We\ncollected our in-house battery-type dataset of small-scale to guide the\nknowledge transfer as a case study and evaluate the system performance. We\nconducted an experimental study and the results show that BatSort can achieve\noutstanding accuracy of 92.1% on average and up to 96.2% and the performance is\nstable for battery-type classification. Our solution helps realize fast and\nautomated battery sorting with minimized cost and can be transferred to related\nindustry applications with insufficient data.\n","authors":["Yunyi Zhao","Wei Zhang","Erhai Hu","Qingyu Yan","Cheng Xiang","King Jet Tseng","Dusit Niyato"],"pdf_url":"https://arxiv.org/pdf/2404.05802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05783v1","updated":"2024-04-08T17:53:21Z","published":"2024-04-08T17:53:21Z","title":"Responsible Generative AI: What to Generate and What Not","summary":" In recent years, generative AI (GenAI), like large language models and\ntext-to-image models, has received significant attention across various\ndomains. However, ensuring the responsible generation of content by these\nmodels is crucial for their real-world applicability. 
This raises an\ninteresting question: \\textit{What should responsible GenAI generate, and what\nshould it not?} To answer the question, this paper investigates the practical\nresponsible requirements of both textual and visual generative models,\noutlining five key considerations: generating truthful content, avoiding toxic\ncontent, refusing harmful instruction, leaking no training data-related\ncontent, and ensuring generated content identifiable. Specifically, we review\nrecent advancements and challenges in addressing these requirements. Besides,\nwe discuss and emphasize the importance of responsible GenAI across healthcare,\neducation, finance, and artificial general intelligence domains. Through a\nunified perspective on both textual and visual generative models, this paper\naims to provide insights into practical safety-related issues and further\nbenefit the community in building responsible GenAI.\n","authors":["Jindong Gu"],"pdf_url":"https://arxiv.org/pdf/2404.05783v1.pdf","comment":"74 pages, 10 figures"},{"id":"http://arxiv.org/abs/2205.10793v2","updated":"2024-04-08T16:59:24Z","published":"2022-05-22T10:26:54Z","title":"Knowledge Distillation via the Target-aware Transformer","summary":" Knowledge distillation becomes a de facto standard to improve the performance\nof small neural networks. Most of the previous works propose to regress the\nrepresentational features from the teacher to the student in a one-to-one\nspatial matching fashion. However, people tend to overlook the fact that, due\nto the architecture differences, the semantic information on the same spatial\nlocation usually vary. This greatly undermines the underlying assumption of the\none-to-one distillation approach. To this end, we propose a novel one-to-all\nspatial matching knowledge distillation approach. Specifically, we allow each\npixel of the teacher feature to be distilled to all spatial locations of the\nstudent features given its similarity, which is generated from a target-aware\ntransformer. Our approach surpasses the state-of-the-art methods by a\nsignificant margin on various computer vision benchmarks, such as ImageNet,\nPascal VOC and COCOStuff10k. Code is available at\nhttps://github.com/sihaoevery/TaT.\n","authors":["Sihao Lin","Hongwei Xie","Bing Wang","Kaicheng Yu","Xiaojun Chang","Xiaodan Liang","Gang Wang"],"pdf_url":"https://arxiv.org/pdf/2205.10793v2.pdf","comment":"CVPR2022(Oral)"},{"id":"http://arxiv.org/abs/2303.17546v3","updated":"2024-04-08T16:49:16Z","published":"2023-03-30T17:13:56Z","title":"PAIR-Diffusion: A Comprehensive Multimodal Object-Level Image Editor","summary":" Generative image editing has recently witnessed extremely fast-paced growth.\nSome works use high-level conditioning such as text, while others use low-level\nconditioning. Nevertheless, most of them lack fine-grained control over the\nproperties of the different objects present in the image, i.e. object-level\nimage editing. In this work, we tackle the task by perceiving the images as an\namalgamation of various objects and aim to control the properties of each\nobject in a fine-grained manner. Out of these properties, we identify structure\nand appearance as the most intuitive to understand and useful for editing\npurposes. We propose PAIR Diffusion, a generic framework that can enable a\ndiffusion model to control the structure and appearance properties of each\nobject in the image. We show that having control over the properties of each\nobject in an image leads to comprehensive editing capabilities. 
Our framework\nallows for various object-level editing operations on real images such as\nreference image-based appearance editing, free-form shape editing, adding\nobjects, and variations. Thanks to our design, we do not require any inversion\nstep. Additionally, we propose multimodal classifier-free guidance which\nenables editing images using both reference images and text when using our\napproach with foundational diffusion models. We validate the above claims by\nextensively evaluating our framework on both unconditional and foundational\ndiffusion models. Please refer to\nhttps://vidit98.github.io/publication/conference-paper/pair_diff.html for code\nand model release.\n","authors":["Vidit Goel","Elia Peruzzo","Yifan Jiang","Dejia Xu","Xingqian Xu","Nicu Sebe","Trevor Darrell","Zhangyang Wang","Humphrey Shi"],"pdf_url":"https://arxiv.org/pdf/2303.17546v3.pdf","comment":"Accepted in CVPR 2024, Project page\n https://vidit98.github.io/publication/conference-paper/pair_diff.html"},{"id":"http://arxiv.org/abs/2312.03048v2","updated":"2024-04-08T08:59:24Z","published":"2023-12-05T18:34:12Z","title":"DGInStyle: Domain-Generalizable Semantic Segmentation with Image\n Diffusion Models and Stylized Semantic Control","summary":" Large, pretrained latent diffusion models (LDMs) have demonstrated an\nextraordinary ability to generate creative content, specialize to user data\nthrough few-shot fine-tuning, and condition their output on other modalities,\nsuch as semantic maps. However, are they usable as large-scale data generators,\ne.g., to improve tasks in the perception stack, like semantic segmentation? We\ninvestigate this question in the context of autonomous driving, and answer it\nwith a resounding \"yes\". We propose an efficient data generation pipeline\ntermed DGInStyle. First, we examine the problem of specializing a pretrained\nLDM to semantically-controlled generation within a narrow domain. Second, we\npropose a Style Swap technique to endow the rich generative prior with the\nlearned semantic control. Third, we design a Multi-resolution Latent Fusion\ntechnique to overcome the bias of LDMs towards dominant objects. Using\nDGInStyle, we generate a diverse dataset of street scenes, train a\ndomain-agnostic semantic segmentation model on it, and evaluate the model on\nmultiple popular autonomous driving datasets. Our approach consistently\nincreases the performance of several domain generalization methods compared to\nthe previous state-of-the-art methods. Source code and dataset are available at\nhttps://dginstyle.github.io.\n","authors":["Yuru Jia","Lukas Hoyer","Shengyu Huang","Tianfu Wang","Luc Van Gool","Konrad Schindler","Anton Obukhov"],"pdf_url":"https://arxiv.org/pdf/2312.03048v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05776v1","updated":"2024-04-08T06:47:03Z","published":"2024-04-08T06:47:03Z","title":"Forecasting Electric Vehicle Battery Output Voltage: A Predictive\n Modeling Approach","summary":" The battery management system plays a vital role in ensuring the safety and\ndependability of electric and hybrid vehicles. It is responsible for various\nfunctions, including state evaluation, monitoring, charge control, and cell\nbalancing, all integrated within the BMS. Nonetheless, due to the uncertainties\nsurrounding battery performance, implementing these functionalities poses\nsignificant challenges. 
In this study, we explore the latest approaches for\nassessing battery states, highlight notable advancements in battery management\nsystems (BMS), address existing issues with current BMS technology, and put\nforth possible solutions for predicting battery charging voltage.\n","authors":["Narayana Darapaneni","Ashish K","Ullas M S","Anwesh Reddy Paduri"],"pdf_url":"https://arxiv.org/pdf/2404.05776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04125v2","updated":"2024-04-08T21:14:43Z","published":"2024-04-04T17:58:02Z","title":"No \"Zero-Shot\" Without Exponential Data: Pretraining Concept Frequency\n Determines Multimodal Model Performance","summary":" Web-crawled pretraining datasets underlie the impressive \"zero-shot\"\nevaluation performance of multimodal models, such as CLIP for\nclassification/retrieval and Stable-Diffusion for image generation. However, it\nis unclear how meaningful the notion of \"zero-shot\" generalization is for such\nmultimodal models, as it is not known to what extent their pretraining datasets\nencompass the downstream concepts targeted for during \"zero-shot\" evaluation.\nIn this work, we ask: How is the performance of multimodal models on downstream\nconcepts influenced by the frequency of these concepts in their pretraining\ndatasets? We comprehensively investigate this question across 34 models and\nfive standard pretraining datasets (CC-3M, CC-12M, YFCC-15M, LAION-400M,\nLAION-Aesthetics), generating over 300GB of data artifacts. We consistently\nfind that, far from exhibiting \"zero-shot\" generalization, multimodal models\nrequire exponentially more data to achieve linear improvements in downstream\n\"zero-shot\" performance, following a sample inefficient log-linear scaling\ntrend. This trend persists even when controlling for sample-level similarity\nbetween pretraining and downstream datasets, and testing on purely synthetic\ndata distributions. Furthermore, upon benchmarking models on long-tailed data\nsampled based on our analysis, we demonstrate that multimodal models across the\nboard perform poorly. We contribute this long-tail test set as the \"Let it\nWag!\" benchmark to further research in this direction. Taken together, our\nstudy reveals an exponential need for training data which implies that the key\nto \"zero-shot\" generalization capabilities under large-scale training paradigms\nremains to be found.\n","authors":["Vishaal Udandarao","Ameya Prabhu","Adhiraj Ghosh","Yash Sharma","Philip H. S. Torr","Adel Bibi","Samuel Albanie","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2404.04125v2.pdf","comment":"Extended version of the short paper accepted at DPFM, ICLR'24"},{"id":"http://arxiv.org/abs/2404.07236v1","updated":"2024-04-08T08:50:09Z","published":"2024-04-08T08:50:09Z","title":"Lightweight Deep Learning for Resource-Constrained Environments: A\n Survey","summary":" Over the past decade, the dominance of deep learning has prevailed across\nvarious domains of artificial intelligence, including natural language\nprocessing, computer vision, and biomedical signal processing. While there have\nbeen remarkable improvements in model accuracy, deploying these models on\nlightweight devices, such as mobile phones and microcontrollers, is constrained\nby limited resources. In this survey, we provide comprehensive design guidance\ntailored for these devices, detailing the meticulous design of lightweight\nmodels, compression methods, and hardware acceleration strategies. 
The\nprincipal goal of this work is to explore methods and concepts for getting\naround hardware constraints without compromising the model's accuracy.\nAdditionally, we explore two notable paths for lightweight deep learning in the\nfuture: deployment techniques for TinyML and Large Language Models. Although\nthese paths undoubtedly have potential, they also present significant\nchallenges, encouraging research into unexplored areas.\n","authors":["Hou-I Liu","Marco Galindo","Hongxia Xie","Lai-Kuan Wong","Hong-Han Shuai","Yung-Yui Li","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.07236v1.pdf","comment":"40 pages"}]},"2024-04-07T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.05107v1","updated":"2024-04-07T23:31:37Z","published":"2024-04-07T23:31:37Z","title":"Reconstructing Retinal Visual Images from 3T fMRI Data Enhanced by\n Unsupervised Learning","summary":" The reconstruction of human visual inputs from brain activity, particularly\nthrough functional Magnetic Resonance Imaging (fMRI), holds promising avenues\nfor unraveling the mechanisms of the human visual system. Despite the\nsignificant strides made by deep learning methods in improving the quality and\ninterpretability of visual reconstruction, there remains a substantial demand\nfor high-quality, long-duration, subject-specific 7-Tesla fMRI experiments. The\nchallenge arises in integrating diverse smaller 3-Tesla datasets or\naccommodating new subjects with brief and low-quality fMRI scans. In response\nto these constraints, we propose a novel framework that generates enhanced 3T\nfMRI data through an unsupervised Generative Adversarial Network (GAN),\nleveraging unpaired training across two distinct fMRI datasets in 7T and 3T,\nrespectively. This approach aims to overcome the limitations of the scarcity of\nhigh-quality 7-Tesla data and the challenges associated with brief and\nlow-quality scans in 3-Tesla experiments. In this paper, we demonstrate the\nreconstruction capabilities of the enhanced 3T fMRI data, highlighting its\nproficiency in generating superior input visual images compared to\ndata-intensive methods trained and tested on a single subject.\n","authors":["Yujian Xiong","Wenhui Zhu","Zhong-Lin Lu","Yalin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05107v1.pdf","comment":"Accepted by ISBI 2024"},{"id":"http://arxiv.org/abs/2307.05845v5","updated":"2024-04-07T23:27:06Z","published":"2023-07-11T23:36:49Z","title":"PIGEON: Predicting Image Geolocations","summary":" Planet-scale image geolocalization remains a challenging problem due to the\ndiversity of images originating from anywhere in the world. Although approaches\nbased on vision transformers have made significant progress in geolocalization\naccuracy, success in prior literature is constrained to narrow distributions of\nimages of landmarks, and performance has not generalized to unseen places. We\npresent a new geolocalization system that combines semantic geocell creation,\nmulti-task contrastive pretraining, and a novel loss function. Additionally,\nour work is the first to perform retrieval over location clusters for guess\nrefinements. We train two models for evaluations on street-level data and\ngeneral-purpose image geolocalization; the first model, PIGEON, is trained on\ndata from the game of Geoguessr and is capable of placing over 40% of its\nguesses within 25 kilometers of the target location globally. 
We also develop a\nbot and deploy PIGEON in a blind experiment against humans, ranking in the top\n0.01% of players. We further challenge one of the world's foremost professional\nGeoguessr players to a series of six matches with millions of viewers, winning\nall six games. Our second model, PIGEOTTO, differs in that it is trained on a\ndataset of images from Flickr and Wikipedia, achieving state-of-the-art results\non a wide range of image geolocalization benchmarks, outperforming the previous\nSOTA by up to 7.7 percentage points on the city accuracy level and up to 38.8\npercentage points on the country level. Our findings suggest that PIGEOTTO is\nthe first image geolocalization model that effectively generalizes to unseen\nplaces and that our approach can pave the way for highly accurate, planet-scale\nimage geolocalization systems. Our code is available on GitHub.\n","authors":["Lukas Haas","Michal Skreta","Silas Alberti","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2307.05845v5.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2404.05105v1","updated":"2024-04-07T23:10:26Z","published":"2024-04-07T23:10:26Z","title":"VMambaMorph: a Visual Mamba-based Framework with Cross-Scan Module for\n Deformable 3D Image Registration","summary":" Image registration, a critical process in medical imaging, involves aligning\ndifferent sets of medical imaging data into a single unified coordinate system.\nDeep learning networks, such as the Convolutional Neural Network (CNN)-based\nVoxelMorph, Vision Transformer (ViT)-based TransMorph, and State Space Model\n(SSM)-based MambaMorph, have demonstrated effective performance in this domain.\nThe recent Visual State Space Model (VMamba), which incorporates a cross-scan\nmodule with SSM, has exhibited promising improvements in modeling global-range\ndependencies with efficient computational cost in computer vision tasks. This\npaper hereby introduces an exploration of VMamba with image registration, named\nVMambaMorph. This novel hybrid VMamba-CNN network is designed specifically for\n3D image registration. Utilizing a U-shaped network architecture, VMambaMorph\ncomputes the deformation field based on target and source volumes. The\nVMamba-based block with 2D cross-scan module is redesigned for 3D volumetric\nfeature processing, and a fine-grained feature extraction module is proposed\nfor high-dimensional feature learning. We validate VMambaMorph using a public\nbenchmark brain MR-CT registration dataset, comparing its performance against\ncurrent state-of-the-art methods. The results indicate that VMambaMorph\nachieves competitive registration quality. The code for VMambaMorph is\navailable on GitHub.\n","authors":["Ziyang Wang","Jian-Qing Zheng","Chao Ma","Tao Guo"],"pdf_url":"https://arxiv.org/pdf/2404.05105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05102v1","updated":"2024-04-07T22:58:18Z","published":"2024-04-07T22:58:18Z","title":"LHU-Net: A Light Hybrid U-Net for Cost-Efficient, High-Performance\n Volumetric Medical Image Segmentation","summary":" As a result of the rise of Transformer architectures in medical image\nanalysis, specifically in the domain of medical image segmentation, a multitude\nof hybrid models have been created that merge the advantages of Convolutional\nNeural Networks (CNNs) and Transformers. These hybrid models have achieved\nnotable success by significantly improving segmentation accuracy. 
Yet, this\nprogress often comes at the cost of increased model complexity, both in terms\nof parameters and computational demand. Moreover, many of these models fail to\nconsider the crucial interplay between spatial and channel features, which\ncould further refine and improve segmentation outcomes. To address this, we\nintroduce LHU-Net, a Light Hybrid U-Net architecture optimized for volumetric\nmedical image segmentation. LHU-Net is meticulously designed to prioritize\nspatial feature analysis in its initial layers before shifting focus to\nchannel-based features in its deeper layers, ensuring a comprehensive feature\nextraction process. Rigorous evaluation across five benchmark datasets -\nSynapse, LA, Pancreas, ACDC, and BRaTS 2018 - underscores LHU-Net's superior\nperformance, showcasing its dual capacity for efficiency and accuracy. Notably,\nLHU-Net sets new performance benchmarks, such as attaining a Dice score of\n92.66 on the ACDC dataset, while simultaneously reducing parameters by 85% and\nquartering the computational load compared to existing state-of-the-art models.\nAchieved without any reliance on pre-training, additional data, or model\nensemble, LHU-Net's effectiveness is further evidenced by its state-of-the-art\nperformance across all evaluated datasets, utilizing fewer than 11 million\nparameters. This achievement highlights that balancing computational efficiency\nwith high accuracy in medical image segmentation is feasible. Our\nimplementation of LHU-Net is freely accessible to the research community on\nGitHub.\n","authors":["Yousef Sadegheih","Afshin Bozorgpour","Pratibha Kumari","Reza Azad","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2404.05102v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04001v4","updated":"2024-04-07T22:46:13Z","published":"2023-09-07T20:07:57Z","title":"MMSFormer: Multimodal Transformer for Material and Semantic Segmentation","summary":" Leveraging information across diverse modalities is known to enhance\nperformance on multimodal segmentation tasks. However, effectively fusing\ninformation from different modalities remains challenging due to the unique\ncharacteristics of each modality. In this paper, we propose a novel fusion\nstrategy that can effectively fuse information from different modality\ncombinations. We also propose a new model named Multi-Modal Segmentation\nTransFormer (MMSFormer) that incorporates the proposed fusion strategy to\nperform multimodal material and semantic segmentation tasks. MMSFormer\noutperforms current state-of-the-art models on three different datasets. As we\nbegin with only one input modality, performance improves progressively as\nadditional modalities are incorporated, showcasing the effectiveness of the\nfusion block in combining useful information from diverse input modalities.\nAblation studies show that different modules in the fusion block are crucial\nfor overall model performance. Furthermore, our ablation studies also highlight\nthe capacity of different input modalities to improve performance in the\nidentification of different types of materials. The code and pretrained models\nwill be made available at https://github.com/csiplab/MMSFormer.\n","authors":["Md Kaykobad Reza","Ashley Prater-Bennette","M. Salman Asif"],"pdf_url":"https://arxiv.org/pdf/2309.04001v4.pdf","comment":"Accepted by IEEE Open Journal of Signal Processing. 
15 pages, 3\n figures, 9 tables"},{"id":"http://arxiv.org/abs/2401.02634v2","updated":"2024-04-07T22:18:52Z","published":"2024-01-05T04:53:33Z","title":"AG-ReID.v2: Bridging Aerial and Ground Views for Person\n Re-identification","summary":" Aerial-ground person re-identification (Re-ID) presents unique challenges in\ncomputer vision, stemming from the distinct differences in viewpoints, poses,\nand resolutions between high-altitude aerial and ground-based cameras. Existing\nresearch predominantly focuses on ground-to-ground matching, with aerial\nmatching less explored due to a dearth of comprehensive datasets. To address\nthis, we introduce AG-ReID.v2, a dataset specifically designed for person Re-ID\nin mixed aerial and ground scenarios. This dataset comprises 100,502 images of\n1,615 unique individuals, each annotated with matching IDs and 15 soft\nattribute labels. Data were collected from diverse perspectives using a UAV,\nstationary CCTV, and smart glasses-integrated camera, providing a rich variety\nof intra-identity variations. Additionally, we have developed an explainable\nattention network tailored for this dataset. This network features a\nthree-stream architecture that efficiently processes pairwise image distances,\nemphasizes key top-down features, and adapts to variations in appearance due to\naltitude differences. Comparative evaluations demonstrate the superiority of\nour approach over existing baselines. We plan to release the dataset and\nalgorithm source code publicly, aiming to advance research in this specialized\nfield of computer vision. For access, please visit\nhttps://github.com/huynguyen792/AG-ReID.v2.\n","authors":["Huy Nguyen","Kien Nguyen","Sridha Sridharan","Clinton Fookes"],"pdf_url":"https://arxiv.org/pdf/2401.02634v2.pdf","comment":"13 pages, Accepted by TIFS 2023"},{"id":"http://arxiv.org/abs/2404.05083v1","updated":"2024-04-07T21:46:47Z","published":"2024-04-07T21:46:47Z","title":"HaVTR: Improving Video-Text Retrieval Through Augmentation Using Large\n Foundation Models","summary":" While recent progress in video-text retrieval has been driven by the\nexploration of powerful model architectures and training strategies, the\nrepresentation learning ability of video-text retrieval models is still limited\ndue to low-quality and scarce training data annotations. To address this issue,\nwe present a novel video-text learning paradigm, HaVTR, which augments video\nand text data to learn more generalized features. Specifically, we first adopt\na simple augmentation method, which generates self-similar data by randomly\nduplicating or dropping subwords and frames. In addition, inspired by the\nrecent advancement in visual and language generative models, we propose a more\npowerful augmentation method through textual paraphrasing and video stylization\nusing large language models (LLMs) and visual generative models (VGMs).\nFurther, to bring richer information into video and text, we propose a\nhallucination-based augmentation method, where we use LLMs and VGMs to generate\nand add new relevant information to the original data. 
Benefiting from the\nenriched data, extensive experiments on several video-text retrieval benchmarks\ndemonstrate the superiority of HaVTR over existing methods.\n","authors":["Yimu Wang","Shuai Yuan","Xiangru Jian","Wei Pang","Mushi Wang","Ning Yu"],"pdf_url":"https://arxiv.org/pdf/2404.05083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06886v2","updated":"2024-04-07T21:41:05Z","published":"2023-12-11T23:20:31Z","title":"Relightful Harmonization: Lighting-aware Portrait Background Replacement","summary":" Portrait harmonization aims to composite a subject into a new background,\nadjusting its lighting and color to ensure harmony with the background scene.\nExisting harmonization techniques often only focus on adjusting the global\ncolor and brightness of the foreground and ignore crucial illumination cues\nfrom the background such as apparent lighting direction, leading to unrealistic\ncompositions. We introduce Relightful Harmonization, a lighting-aware diffusion\nmodel designed to seamlessly harmonize sophisticated lighting effect for the\nforeground portrait using any background image. Our approach unfolds in three\nstages. First, we introduce a lighting representation module that allows our\ndiffusion model to encode lighting information from target image background.\nSecond, we introduce an alignment network that aligns lighting features learned\nfrom image background with lighting features learned from panorama environment\nmaps, which is a complete representation for scene illumination. Last, to\nfurther boost the photorealism of the proposed method, we introduce a novel\ndata simulation pipeline that generates synthetic training pairs from a diverse\nrange of natural images, which are used to refine the model. Our method\noutperforms existing benchmarks in visual fidelity and lighting coherence,\nshowing superior generalization in real-world testing scenarios, highlighting\nits versatility and practicality.\n","authors":["Mengwei Ren","Wei Xiong","Jae Shin Yoon","Zhixin Shu","Jianming Zhang","HyunJoon Jung","Guido Gerig","He Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.06886v2.pdf","comment":"CVPR 2024 camera ready"},{"id":"http://arxiv.org/abs/2404.05072v1","updated":"2024-04-07T21:00:14Z","published":"2024-04-07T21:00:14Z","title":"Spatial Cognition from Egocentric Video: Out of Sight, Not Out of Mind","summary":" As humans move around, performing their daily tasks, they are able to recall\nwhere they have positioned objects in their environment, even if these objects\nare currently out of sight. In this paper, we aim to mimic this spatial\ncognition ability. We thus formulate the task of Out of Sight, Not Out of Mind\n- 3D tracking active objects using observations captured through an egocentric\ncamera. We introduce Lift, Match and Keep (LMK), a method which lifts partial\n2D observations to 3D world coordinates, matches them over time using visual\nappearance, 3D location and interactions to form object tracks, and keeps these\nobject tracks even when they go out-of-view of the camera - hence keeping in\nmind what is out of sight. We test LMK on 100 long videos from EPIC-KITCHENS.\nOur results demonstrate that spatial cognition is critical for correctly\nlocating objects over short and long time scales. E.g., for one long egocentric\nvideo, we estimate the 3D location of 50 active objects. 
Of these, 60% can be\ncorrectly positioned in 3D after 2 minutes of leaving the camera view.\n","authors":["Chiara Plizzari","Shubham Goel","Toby Perrett","Jacob Chalk","Angjoo Kanazawa","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2404.05072v1.pdf","comment":"21 pages including references and appendix. Project Webpage:\n http://dimadamen.github.io/OSNOM/"},{"id":"http://arxiv.org/abs/2404.05069v1","updated":"2024-04-07T20:39:31Z","published":"2024-04-07T20:39:31Z","title":"AirShot: Efficient Few-Shot Detection for Autonomous Exploration","summary":" Few-shot object detection has drawn increasing attention in the field of\nrobotic exploration, where robots are required to find unseen objects with a\nfew online provided examples. Despite recent efforts have been made to yield\nonline processing capabilities, slow inference speeds of low-powered robots\nfail to meet the demands of real-time detection-making them impractical for\nautonomous exploration. Existing methods still face performance and efficiency\nchallenges, mainly due to unreliable features and exhaustive class loops. In\nthis work, we propose a new paradigm AirShot, and discover that, by fully\nexploiting the valuable correlation map, AirShot can result in a more robust\nand faster few-shot object detection system, which is more applicable to\nrobotics community. The core module Top Prediction Filter (TPF) can operate on\nmulti-scale correlation maps in both the training and inference stages. During\ntraining, TPF supervises the generation of a more representative correlation\nmap, while during inference, it reduces looping iterations by selecting\ntop-ranked classes, thus cutting down on computational costs with better\nperformance. Surprisingly, this dual functionality exhibits general\neffectiveness and efficiency on various off-the-shelf models. Exhaustive\nexperiments on COCO2017, VOC2014, and SubT datasets demonstrate that TPF can\nsignificantly boost the efficacy and efficiency of most off-the-shelf models,\nachieving up to 36.4% precision improvements along with 56.3% faster inference\nspeed. Code and Data are at: https://github.com/ImNotPrepared/AirShot.\n","authors":["Zihan Wang","Bowen Li","Chen Wang","Sebastian Scherer"],"pdf_url":"https://arxiv.org/pdf/2404.05069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17328v3","updated":"2024-04-07T20:20:09Z","published":"2023-05-27T02:08:51Z","title":"Zero-TPrune: Zero-Shot Token Pruning through Leveraging of the Attention\n Graph in Pre-Trained Transformers","summary":" Deployment of Transformer models on edge devices is becoming increasingly\nchallenging due to the exponentially growing inference cost that scales\nquadratically with the number of tokens in the input sequence. Token pruning is\nan emerging solution to address this challenge due to its ease of deployment on\nvarious Transformer backbones. However, most token pruning methods require\ncomputationally expensive fine-tuning, which is undesirable in many edge\ndeployment cases. In this work, we propose Zero-TPrune, the first zero-shot\nmethod that considers both the importance and similarity of tokens in\nperforming token pruning. It leverages the attention graph of pre-trained\nTransformer models to produce an importance distribution for tokens via our\nproposed Weighted Page Rank (WPR) algorithm. This distribution further guides\ntoken partitioning for efficient similarity-based pruning. 
Due to the\nelimination of the fine-tuning overhead, Zero-TPrune can prune large models at\nnegligible computational cost, switch between different pruning configurations\nat no computational cost, and perform hyperparameter tuning efficiently. We\nevaluate the performance of Zero-TPrune on vision tasks by applying it to\nvarious vision Transformer backbones and testing them on ImageNet. Without any\nfine-tuning, Zero-TPrune reduces the FLOPs cost of DeiT-S by 34.7% and improves\nits throughput by 45.3% with only 0.4% accuracy loss. Compared with\nstate-of-the-art pruning methods that require fine-tuning, Zero-TPrune not only\neliminates the need for fine-tuning after pruning but also does so with only\n0.1% accuracy loss. Compared with state-of-the-art fine-tuning-free pruning\nmethods, Zero-TPrune reduces accuracy loss by up to 49% with similar FLOPs\nbudgets. Project webpage: https://jha-lab.github.io/zerotprune.\n","authors":["Hongjie Wang","Bhishma Dedhia","Niraj K. Jha"],"pdf_url":"https://arxiv.org/pdf/2305.17328v3.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)\n 2024"},{"id":"http://arxiv.org/abs/2404.05063v1","updated":"2024-04-07T20:19:04Z","published":"2024-04-07T20:19:04Z","title":"AUEditNet: Dual-Branch Facial Action Unit Intensity Manipulation with\n Implicit Disentanglement","summary":" Facial action unit (AU) intensity plays a pivotal role in quantifying\nfine-grained expression behaviors, which is an effective condition for facial\nexpression manipulation. However, publicly available datasets containing\nintensity annotations for multiple AUs remain severely limited, often featuring\na restricted number of subjects. This limitation places challenges to the AU\nintensity manipulation in images due to disentanglement issues, leading\nresearchers to resort to other large datasets with pretrained AU intensity\nestimators for pseudo labels. In addressing this constraint and fully\nleveraging manual annotations of AU intensities for precise manipulation, we\nintroduce AUEditNet. Our proposed model achieves impressive intensity\nmanipulation across 12 AUs, trained effectively with only 18 subjects.\nUtilizing a dual-branch architecture, our approach achieves comprehensive\ndisentanglement of facial attributes and identity without necessitating\nadditional loss functions or implementing with large batch sizes. This approach\noffers a potential solution to achieve desired facial attribute editing despite\nthe dataset's limited subject count. Our experiments demonstrate AUEditNet's\nsuperior accuracy in editing AU intensities, affirming its capability in\ndisentangling facial attributes and identity within a limited subject pool.\nAUEditNet allows conditioning by either intensity values or target images,\neliminating the need for constructing AU combinations for specific facial\nexpression synthesis. 
Moreover, AU intensity estimation, as a downstream task,\nvalidates the consistency between real and edited images, confirming the\neffectiveness of our proposed AU intensity manipulation method.\n","authors":["Shiwei Jin","Peng Liu","Zhen Wang","Lei Wang","Ning Bi","Truong Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05063v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05061v1","updated":"2024-04-07T20:15:40Z","published":"2024-04-07T20:15:40Z","title":"Automated Prediction of Breast Cancer Response to Neoadjuvant\n Chemotherapy from DWI Data","summary":" Effective surgical planning for breast cancer hinges on accurately predicting\npathological complete response (pCR) to neoadjuvant chemotherapy (NAC).\nDiffusion-weighted MRI (DWI) and machine learning offer a non-invasive approach\nfor early pCR assessment. However, most machine-learning models require manual\ntumor segmentation, a cumbersome and error-prone task. We propose a deep\nlearning model employing \"Size-Adaptive Lesion Weighting\" for automatic DWI\ntumor segmentation to enhance pCR prediction accuracy. Despite\nhistopathological changes during NAC complicating DWI image segmentation, our\nmodel demonstrates robust performance. Utilizing the BMMR2 challenge dataset,\nit matches human experts in pCR prediction pre-NAC with an area under the curve\n(AUC) of 0.76 vs. 0.796, and surpasses standard automated methods mid-NAC, with\nan AUC of 0.729 vs. 0.654 and 0.576. Our approach represents a significant\nadvancement in automating breast cancer treatment planning, enabling more\nreliable pCR predictions without manual segmentation.\n","authors":["Shir Nitzan","Maya Gilad","Moti Freiman"],"pdf_url":"https://arxiv.org/pdf/2404.05061v1.pdf","comment":"Accepted for presentation at the IEEE International Symposium on\n Biomedical Imaging (ISBI)"},{"id":"http://arxiv.org/abs/2401.04244v2","updated":"2024-04-07T20:13:45Z","published":"2024-01-08T21:35:05Z","title":"Spatio-Temporal Turbulence Mitigation: A Translational Perspective","summary":" Recovering images distorted by atmospheric turbulence is a challenging\ninverse problem due to the stochastic nature of turbulence. Although numerous\nturbulence mitigation (TM) algorithms have been proposed, their efficiency and\ngeneralization to real-world dynamic scenarios remain severely limited.\nBuilding upon the intuitions of classical TM algorithms, we present the Deep\nAtmospheric TUrbulence Mitigation network (DATUM). DATUM aims to overcome major\nchallenges when transitioning from classical to deep learning approaches. By\ncarefully integrating the merits of classical multi-frame TM methods into a\ndeep network structure, we demonstrate that DATUM can efficiently perform\nlong-range temporal aggregation using a recurrent fashion, while deformable\nattention and temporal-channel attention seamlessly facilitate pixel\nregistration and lucky imaging. With additional supervision, tilt and blur\ndegradation can be jointly mitigated. These inductive biases empower DATUM to\nsignificantly outperform existing methods while delivering a tenfold increase\nin processing speed. A large-scale training dataset, ATSyn, is presented as a\nco-invention to enable generalization in real turbulence. Our code and datasets\nare available at https://xg416.github.io/DATUM.\n","authors":["Xingguang Zhang","Nicholas Chimitt","Yiheng Chi","Zhiyuan Mao","Stanley H. 
Chan"],"pdf_url":"https://arxiv.org/pdf/2401.04244v2.pdf","comment":"Accepted by CVPR 2024, project page https://xg416.github.io/DATUM/"},{"id":"http://arxiv.org/abs/2312.15719v2","updated":"2024-04-07T19:59:00Z","published":"2023-12-25T13:12:36Z","title":"Get a Grip: Reconstructing Hand-Object Stable Grasps in Egocentric\n Videos","summary":" We propose the task of Hand-Object Stable Grasp Reconstruction (HO-SGR), the\nreconstruction of frames during which the hand is stably holding the object. We\nfirst develop the stable grasp definition based on the intuition that the\nin-contact area between the hand and object should remain stable. By analysing\nthe 3D ARCTIC dataset, we identify stable grasp durations and showcase that\nobjects in stable grasps move within a single degree of freedom (1-DoF). We\nthereby propose a method to jointly optimise all frames within a stable grasp,\nminimising object motions to a latent 1-DoF. Finally, we extend the knowledge\nto in-the-wild videos by labelling 2.4K clips of stable grasps. Our proposed\nEPIC-Grasps dataset includes 390 object instances of 9 categories, featuring\nstable grasps from videos of daily interactions in 141 environments. Without 3D\nground truth, we use stable contact areas and 2D projection masks to assess the\nHO-SGR task in the wild. We evaluate relevant methods and our approach\npreserves significantly higher stable contact area, on both EPIC-Grasps and\nstable grasp sub-sequences from the ARCTIC dataset.\n","authors":["Zhifan Zhu","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2312.15719v2.pdf","comment":"webpage: https://zhifanzhu.github.io/getagrip"},{"id":"http://arxiv.org/abs/2404.05052v1","updated":"2024-04-07T19:23:28Z","published":"2024-04-07T19:23:28Z","title":"Facial Affective Behavior Analysis with Instruction Tuning","summary":" Facial affective behavior analysis (FABA) is crucial for understanding human\nmental states from images. However, traditional approaches primarily deploy\nmodels to discriminate among discrete emotion categories, and lack the fine\ngranularity and reasoning capability for complex facial behaviors. The advent\nof Multi-modal Large Language Models (MLLMs) has been proven successful in\ngeneral visual understanding tasks. However, directly harnessing MLLMs for FABA\nis challenging due to the scarcity of datasets and benchmarks, neglecting\nfacial prior knowledge, and low training efficiency. To address these\nchallenges, we introduce (i) an instruction-following dataset for two FABA\ntasks, e.g., emotion and action unit recognition, (ii) a benchmark FABA-Bench\nwith a new metric considering both recognition and generation ability, and\n(iii) a new MLLM \"EmoLA\" as a strong baseline to the community. Our initiative\non the dataset and benchmarks reveal the nature and rationale of facial\naffective behaviors, i.e., fine-grained facial movement, interpretability, and\nreasoning. Moreover, to build an effective and efficient FABA MLLM, we\nintroduce a facial prior expert module with face structure knowledge and a\nlow-rank adaptation module into pre-trained MLLM. We conduct extensive\nexperiments on FABA-Bench and four commonly-used FABA datasets. The results\ndemonstrate that the proposed facial prior expert can boost the performance and\nEmoLA achieves the best results on our FABA-Bench. 
On commonly-used FABA\ndatasets, EmoLA is competitive rivaling task-specific state-of-the-art models.\n","authors":["Yifan Li","Anh Dao","Wentao Bao","Zhen Tan","Tianlong Chen","Huan Liu","Yu Kong"],"pdf_url":"https://arxiv.org/pdf/2404.05052v1.pdf","comment":"V1.0"},{"id":"http://arxiv.org/abs/2404.05049v1","updated":"2024-04-07T19:10:02Z","published":"2024-04-07T19:10:02Z","title":"PlateSegFL: A Privacy-Preserving License Plate Detection Using Federated\n Segmentation Learning","summary":" Automatic License Plate Recognition (ALPR) is an integral component of an\nintelligent transport system with extensive applications in secure\ntransportation, vehicle-to-vehicle communication, stolen vehicles detection,\ntraffic violations, and traffic flow management. The existing license plate\ndetection system focuses on one-shot learners or pre-trained models that\noperate with a geometric bounding box, limiting the model's performance.\nFurthermore, continuous video data streams uploaded to the central server\nresult in network and complexity issues. To combat this, PlateSegFL was\nintroduced, which implements U-Net-based segmentation along with Federated\nLearning (FL). U-Net is well-suited for multi-class image segmentation tasks\nbecause it can analyze a large number of classes and generate a pixel-level\nsegmentation map for each class. Federated Learning is used to reduce the\nquantity of data required while safeguarding the user's privacy. Different\ncomputing platforms, such as mobile phones, are able to collaborate on the\ndevelopment of a standard prediction model where it makes efficient use of\none's time; incorporates more diverse data; delivers projections in real-time;\nand requires no physical effort from the user; resulting around 95% F1 score.\n","authors":["Md. Shahriar Rahman Anuvab","Mishkat Sultana","Md. Atif Hossain","Shashwata Das","Suvarthi Chowdhury","Rafeed Rahman","Dibyo Fabian Dofadar","Shahriar Rahman Rana"],"pdf_url":"https://arxiv.org/pdf/2404.05049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05046v1","updated":"2024-04-07T19:00:45Z","published":"2024-04-07T19:00:45Z","title":"FGAIF: Aligning Large Vision-Language Models with Fine-grained AI\n Feedback","summary":" Large Vision-Language Models (LVLMs) have demonstrated proficiency in\ntackling a variety of visual-language tasks. However, current LVLMs suffer from\nmisalignment between text and image modalities which causes three kinds of\nhallucination problems, i.e., object existence, object attribute, and object\nrelationship. To tackle this issue, existing methods mainly utilize\nReinforcement Learning (RL) to align modalities in LVLMs. However, they still\nsuffer from three main limitations: (1) General feedback can not indicate the\nhallucination type contained in the response; (2) Sparse rewards only give the\nsequence-level reward for the whole response; and (3)Annotation cost is\ntime-consuming and labor-intensive. To handle these limitations, we propose an\ninnovative method to align modalities in LVLMs through Fine-Grained Artificial\nIntelligence Feedback (FGAIF), which mainly consists of three steps: AI-based\nFeedback Collection, Fine-grained Reward Model Training, and Reinforcement\nLearning with Fine-grained Reward. Specifically, We first utilize AI tools to\npredict the types of hallucination for each segment in the response and obtain\na collection of fine-grained feedback. 
Then, based on the collected reward\ndata, three specialized reward models are trained to produce dense rewards.\nFinally, a novel fine-grained feedback module is integrated into the Proximal\nPolicy Optimization (PPO) algorithm. Extensive experiments are conducted on\nhallucination and general benchmarks, demonstrating the superior performance of\nour proposed method. Notably, compared with previous models trained with the\nRL-based aligning method, our proposed method is effective even with fewer\nparameters.\n","authors":["Liqiang Jing","Xinya Du"],"pdf_url":"https://arxiv.org/pdf/2404.05046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14435v6","updated":"2024-04-07T18:04:04Z","published":"2023-06-26T06:04:09Z","title":"DragDiffusion: Harnessing Diffusion Models for Interactive Point-based\n Image Editing","summary":" Accurate and controllable image editing is a challenging task that has\nattracted significant attention recently. Notably, DragGAN is an interactive\npoint-based image editing framework that achieves impressive editing results\nwith pixel-level precision. However, due to its reliance on generative\nadversarial networks (GANs), its generality is limited by the capacity of\npretrained GAN models. In this work, we extend this editing framework to\ndiffusion models and propose a novel approach DragDiffusion. By harnessing\nlarge-scale pretrained diffusion models, we greatly enhance the applicability\nof interactive point-based editing on both real and diffusion-generated images.\nOur approach involves optimizing the diffusion latents to achieve precise\nspatial control. The supervision signal of this optimization process is from\nthe diffusion model's UNet features, which are known to contain rich semantic\nand geometric information. Moreover, we introduce two additional techniques,\nnamely LoRA fine-tuning and latent-MasaCtrl, to further preserve the identity\nof the original image. Lastly, we present a challenging benchmark dataset\ncalled DragBench -- the first benchmark to evaluate the performance of\ninteractive point-based image editing methods. Experiments across a wide range\nof challenging cases (e.g., images with multiple objects, diverse object\ncategories, various styles, etc.) demonstrate the versatility and generality of\nDragDiffusion. Code: https://github.com/Yujun-Shi/DragDiffusion.\n","authors":["Yujun Shi","Chuhui Xue","Jun Hao Liew","Jiachun Pan","Hanshu Yan","Wenqing Zhang","Vincent Y. F. Tan","Song Bai"],"pdf_url":"https://arxiv.org/pdf/2306.14435v6.pdf","comment":"Code is released at https://github.com/Yujun-Shi/DragDiffusion"},{"id":"http://arxiv.org/abs/2404.05029v1","updated":"2024-04-07T17:51:53Z","published":"2024-04-07T17:51:53Z","title":"LOGO: A Long-Form Video Dataset for Group Action Quality Assessment","summary":" Action quality assessment (AQA) has become an emerging topic since it can be\nextensively applied in numerous scenarios. However, most existing methods and\ndatasets focus on single-person short-sequence scenes, hindering the\napplication of AQA in more complex situations. To address this issue, we\nconstruct a new multi-person long-form video dataset for action quality\nassessment named LOGO. Distinguished in scenario complexity, our dataset\ncontains 200 videos from 26 artistic swimming events with 8 athletes in each\nsample along with an average duration of 204.2 seconds. 
As for richness in\nannotations, LOGO includes formation labels to depict group information of\nmultiple athletes and detailed annotations on action procedures. Furthermore,\nwe propose a simple yet effective method to model relations among athletes and\nreason about the potential temporal logic in long-form videos. Specifically, we\ndesign a group-aware attention module, which can be easily plugged into\nexisting AQA methods, to enrich the clip-wise representations based on\ncontextual group information. To benchmark LOGO, we systematically conduct\ninvestigations on the performance of several popular methods in AQA and action\nsegmentation. The results reveal the challenges our dataset brings. Extensive\nexperiments also show that our approach achieves state-of-the-art on the LOGO\ndataset. The dataset and code will be released at\n\\url{https://github.com/shiyi-zh0408/LOGO }.\n","authors":["Shiyi Zhang","Wenxun Dai","Sujia Wang","Xiangwei Shen","Jiwen Lu","Jie Zhou","Yansong Tang"],"pdf_url":"https://arxiv.org/pdf/2404.05029v1.pdf","comment":"Accepted by CVPR 2023"},{"id":"http://arxiv.org/abs/2404.05024v1","updated":"2024-04-07T17:31:53Z","published":"2024-04-07T17:31:53Z","title":"PathFinder: Attention-Driven Dynamic Non-Line-of-Sight Tracking with a\n Mobile Robot","summary":" The study of non-line-of-sight (NLOS) imaging is growing due to its many\npotential applications, including rescue operations and pedestrian detection by\nself-driving cars. However, implementing NLOS imaging on a moving camera\nremains an open area of research. Existing NLOS imaging methods rely on\ntime-resolved detectors and laser configurations that require precise optical\nalignment, making it difficult to deploy them in dynamic environments. This\nwork proposes a data-driven approach to NLOS imaging, PathFinder, that can be\nused with a standard RGB camera mounted on a small, power-constrained mobile\nrobot, such as an aerial drone. Our experimental pipeline is designed to\naccurately estimate the 2D trajectory of a person who moves in a\nManhattan-world environment while remaining hidden from the camera's\nfield-of-view. We introduce a novel approach to process a sequence of dynamic\nsuccessive frames in a line-of-sight (LOS) video using an attention-based\nneural network that performs inference in real-time. The method also includes a\npreprocessing selection metric that analyzes images from a moving camera which\ncontain multiple vertical planar surfaces, such as walls and building facades,\nand extracts planes that return maximum NLOS information. We validate the\napproach on in-the-wild scenes using a drone for video capture, thus\ndemonstrating low-cost NLOS imaging in dynamic capture environments.\n","authors":["Shenbagaraj Kannapiran","Sreenithy Chandran","Suren Jayasuriya","Spring Berman"],"pdf_url":"https://arxiv.org/pdf/2404.05024v1.pdf","comment":"First two authors have equal contribution"},{"id":"http://arxiv.org/abs/2404.05023v1","updated":"2024-04-07T17:30:57Z","published":"2024-04-07T17:30:57Z","title":"Scalable and Efficient Hierarchical Visual Topological Mapping","summary":" Hierarchical topological representations can significantly reduce search\ntimes within mapping and localization algorithms. Although recent research has\nshown the potential for such approaches, limited consideration has been given\nto the suitability and comparative performance of different global feature\nrepresentations within this context. 
In this work, we evaluate state-of-the-art\nhand-crafted and learned global descriptors using a hierarchical topological\nmapping technique on benchmark datasets and present results of a comprehensive\nevaluation of the impact of the global descriptor used. Although learned\ndescriptors have been incorporated into place recognition methods to improve\nretrieval accuracy and enhance overall recall, the problem of scalability and\nefficiency when applied to longer trajectories has not been adequately\naddressed in a majority of research studies. Based on our empirical analysis of\nmultiple runs, we identify that continuity and distinctiveness are crucial\ncharacteristics for an optimal global descriptor that enable efficient and\nscalable hierarchical mapping, and present a methodology for quantifying and\ncontrasting these characteristics across different global descriptors. Our\nstudy demonstrates that the use of global descriptors based on an unsupervised\nlearned Variational Autoencoder (VAE) excels in these characteristics and\nachieves significantly lower runtime. It runs on a consumer grade desktop, up\nto 2.3x faster than the second best global descriptor, NetVLAD, and up to 9.5x\nfaster than the hand-crafted descriptor, PHOG, on the longest track evaluated\n(St Lucia, 17.6 km), without sacrificing overall recall performance.\n","authors":["Saravanabalagi Ramachandran","Jonathan Horgan","Ganesh Sistu","John McDonald"],"pdf_url":"https://arxiv.org/pdf/2404.05023v1.pdf","comment":"Published in the 21st International Conference on Advanced Robotics\n (ICAR 2023)"},{"id":"http://arxiv.org/abs/2404.05022v1","updated":"2024-04-07T17:25:52Z","published":"2024-04-07T17:25:52Z","title":"DinoBloom: A Foundation Model for Generalizable Cell Embeddings in\n Hematology","summary":" In hematology, computational models offer significant potential to improve\ndiagnostic accuracy, streamline workflows, and reduce the tedious work of\nanalyzing single cells in peripheral blood or bone marrow smears. However,\nclinical adoption of computational models has been hampered by the lack of\ngeneralization due to large batch effects, small dataset sizes, and poor\nperformance in transfer learning from natural images. To address these\nchallenges, we introduce DinoBloom, the first foundation model for single cell\nimages in hematology, utilizing a tailored DINOv2 pipeline. Our model is built\nupon an extensive collection of 13 diverse, publicly available datasets of\nperipheral blood and bone marrow smears, the most substantial open-source\ncohort in hematology so far, comprising over 380,000 white blood cell images.\nTo assess its generalization capability, we evaluate it on an external dataset\nwith a challenging domain shift. We show that our model outperforms existing\nmedical and non-medical vision models in (i) linear probing and k-nearest\nneighbor evaluations for cell-type classification on blood and bone marrow\nsmears and (ii) weakly supervised multiple instance learning for acute myeloid\nleukemia subtyping by a large margin. A family of four DinoBloom models (small,\nbase, large, and giant) can be adapted for a wide range of downstream\napplications, be a strong baseline for classification problems, and facilitate\nthe assessment of batch effects in new datasets. All models are available at\ngithub.com/marrlab/DinoBloom.\n","authors":["Valentin Koch","Sophia J. 
Wagner","Salome Kazeminia","Ece Sancar","Matthias Hehr","Julia Schnabel","Tingying Peng","Carsten Marr"],"pdf_url":"https://arxiv.org/pdf/2404.05022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16671v4","updated":"2024-04-07T17:22:46Z","published":"2023-09-28T17:59:56Z","title":"Demystifying CLIP Data","summary":" Contrastive Language-Image Pre-training (CLIP) is an approach that has\nadvanced research and applications in computer vision, fueling modern\nrecognition systems and generative models. We believe that the main ingredient\nto the success of CLIP is its data and not the model architecture or\npre-training objective. However, CLIP only provides very limited information\nabout its data and how it has been collected, leading to works that aim to\nreproduce CLIP's data by filtering with its model parameters. In this work, we\nintend to reveal CLIP's data curation approach and in our pursuit of making it\nopen to the community introduce Metadata-Curated Language-Image Pre-training\n(MetaCLIP). MetaCLIP takes a raw data pool and metadata (derived from CLIP's\nconcepts) and yields a balanced subset over the metadata distribution. Our\nexperimental study rigorously isolates the model and training settings,\nconcentrating solely on data. MetaCLIP applied to CommonCrawl with 400M\nimage-text data pairs outperforms CLIP's data on multiple standard benchmarks.\nIn zero-shot ImageNet classification, MetaCLIP achieves 70.8% accuracy,\nsurpassing CLIP's 68.3% on ViT-B models. Scaling to 1B data, while maintaining\nthe same training budget, attains 72.4%. Our observations hold across various\nmodel sizes, exemplified by ViT-H achieving 80.5%, without any\nbells-and-whistles. Curation code and training data distribution on metadata is\nmade available at https://github.com/facebookresearch/MetaCLIP.\n","authors":["Hu Xu","Saining Xie","Xiaoqing Ellen Tan","Po-Yao Huang","Russell Howes","Vasu Sharma","Shang-Wen Li","Gargi Ghosh","Luke Zettlemoyer","Christoph Feichtenhofer"],"pdf_url":"https://arxiv.org/pdf/2309.16671v4.pdf","comment":"17 pages. arXiv admin note: text overlap with arXiv:2103.00020 by\n other authors"},{"id":"http://arxiv.org/abs/2112.04731v5","updated":"2024-04-07T17:09:58Z","published":"2021-12-09T07:20:32Z","title":"Mimicking the Oracle: An Initial Phase Decorrelation Approach for Class\n Incremental Learning","summary":" Class Incremental Learning (CIL) aims at learning a multi-class classifier in\na phase-by-phase manner, in which only data of a subset of the classes are\nprovided at each phase. Previous works mainly focus on mitigating forgetting in\nphases after the initial one. However, we find that improving CIL at its\ninitial phase is also a promising direction. Specifically, we experimentally\nshow that directly encouraging CIL Learner at the initial phase to output\nsimilar representations as the model jointly trained on all classes can greatly\nboost the CIL performance. Motivated by this, we study the difference between a\nna\\\"ively-trained initial-phase model and the oracle model. Specifically, since\none major difference between these two models is the number of training\nclasses, we investigate how such difference affects the model representations.\nWe find that, with fewer training classes, the data representations of each\nclass lie in a long and narrow region; with more training classes, the\nrepresentations of each class scatter more uniformly. 
Inspired by this\nobservation, we propose Class-wise Decorrelation (CwD) that effectively\nregularizes representations of each class to scatter more uniformly, thus\nmimicking the model jointly trained with all classes (i.e., the oracle model).\nOur CwD is simple to implement and easy to plug into existing methods.\nExtensive experiments on various benchmark datasets show that CwD consistently\nand significantly improves the performance of existing state-of-the-art methods\nby around 1\\% to 3\\%. Code will be released.\n","authors":["Yujun Shi","Kuangqi Zhou","Jian Liang","Zihang Jiang","Jiashi Feng","Philip Torr","Song Bai","Vincent Y. F. Tan"],"pdf_url":"https://arxiv.org/pdf/2112.04731v5.pdf","comment":"CVPR 2022 Camera-Ready Version"},{"id":"http://arxiv.org/abs/2404.05016v1","updated":"2024-04-07T17:06:22Z","published":"2024-04-07T17:06:22Z","title":"Hyperbolic Learning with Synthetic Captions for Open-World Detection","summary":" Open-world detection poses significant challenges, as it requires the\ndetection of any object using either object class labels or free-form texts.\nExisting related works often use large-scale manual annotated caption datasets\nfor training, which are extremely expensive to collect. Instead, we propose to\ntransfer knowledge from vision-language models (VLMs) to enrich the\nopen-vocabulary descriptions automatically. Specifically, we bootstrap dense\nsynthetic captions using pre-trained VLMs to provide rich descriptions on\ndifferent regions in images, and incorporate these captions to train a novel\ndetector that generalizes to novel concepts. To mitigate the noise caused by\nhallucination in synthetic captions, we also propose a novel hyperbolic\nvision-language learning approach to impose a hierarchy between visual and\ncaption embeddings. We call our detector ``HyperLearner''. We conduct extensive\nexperiments on a wide variety of open-world detection benchmarks (COCO, LVIS,\nObject Detection in the Wild, RefCOCO) and our results show that our model\nconsistently outperforms existing state-of-the-art methods, such as GLIP,\nGLIPv2 and Grounding DINO, when using the same backbone.\n","authors":["Fanjie Kong","Yanbei Chen","Jiarui Cai","Davide Modolo"],"pdf_url":"https://arxiv.org/pdf/2404.05016v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2402.05713v3","updated":"2024-04-07T16:59:41Z","published":"2024-02-08T14:40:32Z","title":"Hidden in Plain Sight: Undetectable Adversarial Bias Attacks on\n Vulnerable Patient Populations","summary":" The proliferation of artificial intelligence (AI) in radiology has shed light\non the risk of deep learning (DL) models exacerbating clinical biases towards\nvulnerable patient populations. While prior literature has focused on\nquantifying biases exhibited by trained DL models, demographically targeted\nadversarial bias attacks on DL models and its implication in the clinical\nenvironment remains an underexplored field of research in medical imaging. In\nthis work, we demonstrate that demographically targeted label poisoning attacks\ncan introduce undetectable underdiagnosis bias in DL models. Our results across\nmultiple performance metrics and demographic groups like sex, age, and their\nintersectional subgroups show that adversarial bias attacks demonstrate\nhigh-selectivity for bias in the targeted group by degrading group model\nperformance without impacting overall model performance. 
Furthermore, our\nresults indicate that adversarial bias attacks result in biased DL models that\npropagate prediction bias even when evaluated with external datasets.\n","authors":["Pranav Kulkarni","Andrew Chan","Nithya Navarathna","Skylar Chan","Paul H. Yi","Vishwa S. Parekh"],"pdf_url":"https://arxiv.org/pdf/2402.05713v3.pdf","comment":"29 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.05014v1","updated":"2024-04-07T16:49:07Z","published":"2024-04-07T16:49:07Z","title":"MagicTime: Time-lapse Video Generation Models as Metamorphic Simulators","summary":" Recent advances in Text-to-Video generation (T2V) have achieved remarkable\nsuccess in synthesizing high-quality general videos from textual descriptions.\nA largely overlooked problem in T2V is that existing models have not adequately\nencoded physical knowledge of the real world, thus generated videos tend to\nhave limited motion and poor variations. In this paper, we propose\n\\textbf{MagicTime}, a metamorphic time-lapse video generation model, which\nlearns real-world physics knowledge from time-lapse videos and implements\nmetamorphic generation. First, we design a MagicAdapter scheme to decouple\nspatial and temporal training, encode more physical knowledge from metamorphic\nvideos, and transform pre-trained T2V models to generate metamorphic videos.\nSecond, we introduce a Dynamic Frames Extraction strategy to adapt to\nmetamorphic time-lapse videos, which have a wider variation range and cover\ndramatic object metamorphic processes, thus embodying more physical knowledge\nthan general videos. Finally, we introduce a Magic Text-Encoder to improve the\nunderstanding of metamorphic video prompts. Furthermore, we create a time-lapse\nvideo-text dataset called \\textbf{ChronoMagic}, specifically curated to unlock\nthe metamorphic video generation ability. Extensive experiments demonstrate the\nsuperiority and effectiveness of MagicTime for generating high-quality and\ndynamic metamorphic videos, suggesting time-lapse video generation is a\npromising path toward building metamorphic simulators of the physical world.\n","authors":["Shenghai Yuan","Jinfa Huang","Yujun Shi","Yongqi Xu","Ruijie Zhu","Bin Lin","Xinhua Cheng","Li Yuan","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2404.05014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18172v2","updated":"2024-04-07T16:43:51Z","published":"2024-02-28T09:02:33Z","title":"NiteDR: Nighttime Image De-Raining with Cross-View Sensor Cooperative\n Learning for Dynamic Driving Scenes","summary":" In real-world environments, outdoor imaging systems are often affected by\ndisturbances such as rain degradation. Especially, in nighttime driving scenes,\ninsufficient and uneven lighting shrouds the scenes in darkness, resulting\ndegradation of both the image quality and visibility. Particularly, in the\nfield of autonomous driving, the visual perception ability of RGB sensors\nexperiences a sharp decline in such harsh scenarios. Additionally, driving\nassistance systems suffer from reduced capabilities in capturing and discerning\nthe surrounding environment, posing a threat to driving safety. Single-view\ninformation captured by single-modal sensors cannot comprehensively depict the\nentire scene. To address these challenges, we developed an image de-raining\nframework tailored for rainy nighttime driving scenes. 
It aims to remove rain\nartifacts, enrich scene representation, and restore useful information.\nSpecifically, we introduce cooperative learning between visible and infrared\nimages captured by different sensors. By cross-view fusion of these\nmulti-source data, the scene within the images gains richer texture details and\nenhanced contrast. We constructed an information cleaning module called\nCleanNet as the first stage of our framework. Moreover, we designed an\ninformation fusion module called FusionNet as the second stage to fuse the\nclean visible images with infrared images. Using this stage-by-stage learning\nstrategy, we obtain de-rained fusion images with higher quality and better\nvisual perception. Extensive experiments demonstrate the effectiveness of our\nproposed Cross-View Cooperative Learning (CVCL) in adverse driving scenarios in\nlow-light rainy environments. The proposed approach addresses the gap in the\nutilization of existing rain removal algorithms in specific low-light\nconditions.\n","authors":["Cidan Shi","Lihuang Fang","Han Wu","Xiaoyu Xian","Yukai Shi","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2402.18172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12588v3","updated":"2024-04-07T16:05:03Z","published":"2023-11-21T13:21:22Z","title":"HiPose: Hierarchical Binary Surface Encoding and Correspondence Pruning\n for RGB-D 6DoF Object Pose Estimation","summary":" In this work, we present a novel dense-correspondence method for 6DoF object\npose estimation from a single RGB-D image. While many existing data-driven\nmethods achieve impressive performance, they tend to be time-consuming due to\ntheir reliance on rendering-based refinement approaches. To circumvent this\nlimitation, we present HiPose, which establishes 3D-3D correspondences in a\ncoarse-to-fine manner with a hierarchical binary surface encoding. Unlike\nprevious dense-correspondence methods, we estimate the correspondence surface\nby employing point-to-surface matching and iteratively constricting the surface\nuntil it becomes a correspondence point while gradually removing outliers.\nExtensive experiments on public benchmarks LM-O, YCB-V, and T-Less demonstrate\nthat our method surpasses all refinement-free methods and is even on par with\nexpensive refinement-based approaches. Crucially, our approach is\ncomputationally efficient and enables real-time critical applications with high\naccuracy requirements.\n","authors":["Yongliang Lin","Yongzhi Su","Praveen Nathan","Sandeep Inuganti","Yan Di","Martin Sundermeyer","Fabian Manhardt","Didier Stricker","Jason Rambach","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12588v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05003v1","updated":"2024-04-07T15:58:25Z","published":"2024-04-07T15:58:25Z","title":"Camera-Based Remote Physiology Sensing for Hundreds of Subjects Across\n Skin Tones","summary":" Remote photoplethysmography (rPPG) emerges as a promising method for\nnon-invasive, convenient measurement of vital signs, utilizing the widespread\npresence of cameras. Despite advancements, existing datasets fall short in\nterms of size and diversity, limiting comprehensive evaluation under diverse\nconditions. This paper presents an in-depth analysis of the VitalVideo dataset,\nthe largest real-world rPPG dataset to date, encompassing 893 subjects and 6\nFitzpatrick skin tones. 
Our experimentation with six unsupervised methods and\nthree supervised models demonstrates that datasets comprising a few hundred\nsubjects(i.e., 300 for UBFC-rPPG, 500 for PURE, and 700 for MMPD-Simple) are\nsufficient for effective rPPG model training. Our findings highlight the\nimportance of diversity and consistency in skin tones for precise performance\nevaluation across different datasets.\n","authors":["Jiankai Tang","Xinyi Li","Jiacheng Liu","Xiyuxing Zhang","Zeyu Wang","Yuntao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05003v1.pdf","comment":"11 pages, 5 figures, CHI24 Workshop PhysioCHI"},{"id":"http://arxiv.org/abs/2404.05001v1","updated":"2024-04-07T15:53:21Z","published":"2024-04-07T15:53:21Z","title":"Dual-Scale Transformer for Large-Scale Single-Pixel Imaging","summary":" Single-pixel imaging (SPI) is a potential computational imaging technique\nwhich produces image by solving an illposed reconstruction problem from few\nmeasurements captured by a single-pixel detector. Deep learning has achieved\nimpressive success on SPI reconstruction. However, previous poor reconstruction\nperformance and impractical imaging model limit its real-world applications. In\nthis paper, we propose a deep unfolding network with hybrid-attention\nTransformer on Kronecker SPI model, dubbed HATNet, to improve the imaging\nquality of real SPI cameras. Specifically, we unfold the computation graph of\nthe iterative shrinkagethresholding algorithm (ISTA) into two alternative\nmodules: efficient tensor gradient descent and hybrid-attention multiscale\ndenoising. By virtue of Kronecker SPI, the gradient descent module can avoid\nhigh computational overheads rooted in previous gradient descent modules based\non vectorized SPI. The denoising module is an encoder-decoder architecture\npowered by dual-scale spatial attention for high- and low-frequency aggregation\nand channel attention for global information recalibration. Moreover, we build\na SPI prototype to verify the effectiveness of the proposed method. Extensive\nexperiments on synthetic and real data demonstrate that our method achieves the\nstate-of-the-art performance. The source code and pre-trained models are\navailable at https://github.com/Gang-Qu/HATNet-SPI.\n","authors":["Gang Qu","Ping Wang","Xin Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.05001v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04998v1","updated":"2024-04-07T15:48:33Z","published":"2024-04-07T15:48:33Z","title":"Weakly Supervised Deep Hyperspherical Quantization for Image Retrieval","summary":" Deep quantization methods have shown high efficiency on large-scale image\nretrieval. However, current models heavily rely on ground-truth information,\nhindering the application of quantization in label-hungry scenarios. A more\nrealistic demand is to learn from inexhaustible uploaded images that are\nassociated with informal tags provided by amateur users. Though such sketchy\ntags do not obviously reveal the labels, they actually contain useful semantic\ninformation for supervising deep quantization. To this end, we propose\nWeakly-Supervised Deep Hyperspherical Quantization (WSDHQ), which is the first\nwork to learn deep quantization from weakly tagged images. Specifically, 1) we\nuse word embeddings to represent the tags and enhance their semantic\ninformation based on a tag correlation graph. 
2) To better preserve semantic\ninformation in quantization codes and reduce quantization error, we jointly\nlearn semantics-preserving embeddings and supervised quantizer on hypersphere\nby employing a well-designed fusion layer and tailor-made loss functions.\nExtensive experiments show that WSDHQ can achieve state-of-art performance on\nweakly-supervised compact coding. Code is available at\nhttps://github.com/gimpong/AAAI21-WSDHQ.\n","authors":["Jinpeng Wang","Bin Chen","Qiang Zhang","Zaiqiao Meng","Shangsong Liang","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2404.04998v1.pdf","comment":"In proceedings of AAAI 2021. Code and data are available"},{"id":"http://arxiv.org/abs/2404.04996v1","updated":"2024-04-07T15:34:40Z","published":"2024-04-07T15:34:40Z","title":"Fantastic Animals and Where to Find Them: Segment Any Marine Animal with\n Dual SAM","summary":" As an important pillar of underwater intelligence, Marine Animal Segmentation\n(MAS) involves segmenting animals within marine environments. Previous methods\ndon't excel in extracting long-range contextual features and overlook the\nconnectivity between discrete pixels. Recently, Segment Anything Model (SAM)\noffers a universal framework for general segmentation tasks. Unfortunately,\ntrained with natural images, SAM does not obtain the prior knowledge from\nmarine images. In addition, the single-position prompt of SAM is very\ninsufficient for prior guidance. To address these issues, we propose a novel\nfeature learning framework, named Dual-SAM for high-performance MAS. To this\nend, we first introduce a dual structure with SAM's paradigm to enhance feature\nlearning of marine images. Then, we propose a Multi-level Coupled Prompt (MCP)\nstrategy to instruct comprehensive underwater prior information, and enhance\nthe multi-level features of SAM's encoder with adapters. Subsequently, we\ndesign a Dilated Fusion Attention Module (DFAM) to progressively integrate\nmulti-level features from SAM's encoder. Finally, instead of directly\npredicting the masks of marine animals, we propose a Criss-Cross Connectivity\nPrediction (C$^3$P) paradigm to capture the inter-connectivity between discrete\npixels. With dual decoders, it generates pseudo-labels and achieves mutual\nsupervision for complementary feature representations, resulting in\nconsiderable improvements over previous techniques. Extensive experiments\nverify that our proposed method achieves state-of-the-art performances on five\nwidely-used MAS datasets. The code is available at\nhttps://github.com/Drchip61/Dual_SAM.\n","authors":["Pingping Zhang","Tianyu Yan","Yang Liu","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2404.04996v1.pdf","comment":"Accepted by CVPR2024 as Poster(Highlight)"},{"id":"http://arxiv.org/abs/2404.04992v1","updated":"2024-04-07T15:27:35Z","published":"2024-04-07T15:27:35Z","title":"Efficient Surgical Tool Recognition via HMM-Stabilized Deep Learning","summary":" Recognizing various surgical tools, actions and phases from surgery videos is\nan important problem in computer vision with exciting clinical applications.\nExisting deep-learning-based methods for this problem either process each\nsurgical video as a series of independent images without considering their\ndependence, or rely on complicated deep learning models to count for dependence\nof video frames. 
In this study, we revealed from exploratory data analysis that\nsurgical videos enjoy relatively simple semantic structure, where the presence\nof surgical phases and tools can be well modeled by a compact hidden Markov\nmodel (HMM). Based on this observation, we propose an HMM-stabilized deep\nlearning method for tool presence detection. A wide range of experiments\nconfirm that the proposed approaches achieve better performance with lower\ntraining and running costs, and support more flexible ways to construct and\nutilize training data in scenarios where not all surgery videos of interest are\nextensively labelled. These results suggest that popular deep learning\napproaches with over-complicated model structures may suffer from inefficient\nutilization of data, and integrating ingredients of deep learning and\nstatistical learning wisely may lead to more powerful algorithms that enjoy\ncompetitive performance, transparent interpretation and convenient model\ntraining simultaneously.\n","authors":["Haifeng Wang","Hao Xu","Jun Wang","Jian Zhou","Ke Deng"],"pdf_url":"https://arxiv.org/pdf/2404.04992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04986v1","updated":"2024-04-07T15:06:48Z","published":"2024-04-07T15:06:48Z","title":"Dynamic Distinction Learning: Adaptive Pseudo Anomalies for Video\n Anomaly Detection","summary":" We introduce Dynamic Distinction Learning (DDL) for Video Anomaly Detection,\na novel video anomaly detection methodology that combines pseudo-anomalies,\ndynamic anomaly weighting, and a distinction loss function to improve detection\naccuracy. By training on pseudo-anomalies, our approach adapts to the\nvariability of normal and anomalous behaviors without fixed anomaly thresholds.\nOur model showcases superior performance on the Ped2, Avenue and ShanghaiTech\ndatasets, where individual models are tailored for each scene. These\nachievements highlight DDL's effectiveness in advancing anomaly detection,\noffering a scalable and adaptable solution for video surveillance challenges.\n","authors":["Demetris Lappas","Vasileios Argyriou","Dimitrios Makris"],"pdf_url":"https://arxiv.org/pdf/2404.04986v1.pdf","comment":"To be published in the CVPR2024 Workshop"},{"id":"http://arxiv.org/abs/2404.00521v3","updated":"2024-04-07T15:04:47Z","published":"2024-03-31T01:41:36Z","title":"CHAIN: Enhancing Generalization in Data-Efficient GANs via lipsCHitz\n continuity constrAIned Normalization","summary":" Generative Adversarial Networks (GANs) significantly advanced image\ngeneration but their performance heavily depends on abundant training data. In\nscenarios with limited data, GANs often struggle with discriminator overfitting\nand unstable training. Batch Normalization (BN), despite being known for\nenhancing generalization and training stability, has rarely been used in the\ndiscriminator of Data-Efficient GANs. Our work addresses this gap by\nidentifying a critical flaw in BN: the tendency for gradient explosion during\nthe centering and scaling steps. To tackle this issue, we present CHAIN\n(lipsCHitz continuity constrAIned Normalization), which replaces the\nconventional centering step with zero-mean regularization and integrates a\nLipschitz continuity constraint in the scaling step. CHAIN further enhances GAN\ntraining by adaptively interpolating the normalized and unnormalized features,\neffectively avoiding discriminator overfitting. 
Our theoretical analyses firmly\nestablishes CHAIN's effectiveness in reducing gradients in latent features and\nweights, improving stability and generalization in GAN training. Empirical\nevidence supports our theory. CHAIN achieves state-of-the-art results in\ndata-limited scenarios on CIFAR-10/100, ImageNet, five low-shot and seven\nhigh-resolution few-shot image datasets. Code:\nhttps://github.com/MaxwellYaoNi/CHAIN\n","authors":["Yao Ni","Piotr Koniusz"],"pdf_url":"https://arxiv.org/pdf/2404.00521v3.pdf","comment":"Accepted by CVPR2024. 26 pages full version. Code:\n https://github.com/MaxwellYaoNi/CHAIN"},{"id":"http://arxiv.org/abs/2404.04983v1","updated":"2024-04-07T15:03:46Z","published":"2024-04-07T15:03:46Z","title":"Primary liver cancer classification from routine tumour biopsy using\n weakly supervised deep learning","summary":" The diagnosis of primary liver cancers (PLCs) can be challenging, especially\non biopsies and for combined hepatocellular-cholangiocarcinoma (cHCC-CCA). We\nautomatically classified PLCs on routine-stained biopsies using a weakly\nsupervised learning method. Weak tumour/non-tumour annotations served as labels\nfor training a Resnet18 neural network, and the network's last convolutional\nlayer was used to extract new tumour tile features. Without knowledge of the\nprecise labels of the malignancies, we then applied an unsupervised clustering\nalgorithm. Our model identified specific features of hepatocellular carcinoma\n(HCC) and intrahepatic cholangiocarcinoma (iCCA). Despite no specific features\nof cHCC-CCA being recognized, the identification of HCC and iCCA tiles within a\nslide could facilitate the diagnosis of primary liver cancers, particularly\ncHCC-CCA.\n Method and results: 166 PLC biopsies were divided into training, internal and\nexternal validation sets: 90, 29 and 47 samples. Two liver pathologists\nreviewed each whole-slide hematein eosin saffron (HES)-stained image (WSI).\nAfter annotating the tumour/non-tumour areas, 256x256 pixel tiles were\nextracted from the WSIs and used to train a ResNet18. The network was used to\nextract new tile features. An unsupervised clustering algorithm was then\napplied to the new tile features. In a two-cluster model, Clusters 0 and 1\ncontained mainly HCC and iCCA histological features. The diagnostic agreement\nbetween the pathological diagnosis and the model predictions in the internal\nand external validation sets was 100% (11/11) and 96% (25/26) for HCC and 78%\n(7/9) and 87% (13/15) for iCCA, respectively. For cHCC-CCA, we observed a\nhighly variable proportion of tiles from each cluster (Cluster 0: 5-97%;\nCluster 1: 2-94%).\n","authors":["Aurélie Beaufrère","Nora Ouzir","Paul Emile Zafar","Astrid Laurent-Bellue","Miguel Albuquerque","Gwladys Lubuela","Jules Grégory","Catherine Guettier","Kévin Mondet","Jean-Christophe Pesquet","Valérie Paradis"],"pdf_url":"https://arxiv.org/pdf/2404.04983v1.pdf","comment":"https://www.sciencedirect.com/science/article/pii/S2589555924000090"},{"id":"http://arxiv.org/abs/2311.15879v2","updated":"2024-04-07T14:43:38Z","published":"2023-11-27T14:51:37Z","title":"EVCap: Retrieval-Augmented Image Captioning with External Visual-Name\n Memory for Open-World Comprehension","summary":" Large language models (LLMs)-based image captioning has the capability of\ndescribing objects not explicitly observed in training data; yet novel objects\noccur frequently, necessitating the requirement of sustaining up-to-date object\nknowledge for open-world comprehension. 
Instead of relying on large amounts of\ndata and/or scaling up network parameters, we introduce a highly effective\nretrieval-augmented image captioning method that prompts LLMs with object names\nretrieved from External Visual--name memory (EVCap). We build ever-changing\nobject knowledge memory using objects' visuals and names, enabling us to (i)\nupdate the memory at a minimal cost and (ii) effortlessly augment LLMs with\nretrieved object names by utilizing a lightweight and fast-to-train model. Our\nmodel, which was trained only on the COCO dataset, can adapt to out-of-domain\nwithout requiring additional fine-tuning or re-training. Our experiments\nconducted on benchmarks and synthetic commonsense-violating data show that\nEVCap, with only 3.97M trainable parameters, exhibits superior performance\ncompared to other methods based on frozen pre-trained LLMs. Its performance is\nalso competitive to specialist SOTAs that require extensive training.\n","authors":["Jiaxuan Li","Duc Minh Vo","Akihiro Sugimoto","Hideki Nakayama"],"pdf_url":"https://arxiv.org/pdf/2311.15879v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04971v1","updated":"2024-04-07T14:21:37Z","published":"2024-04-07T14:21:37Z","title":"FPL+: Filtered Pseudo Label-based Unsupervised Cross-Modality Adaptation\n for 3D Medical Image Segmentation","summary":" Adapting a medical image segmentation model to a new domain is important for\nimproving its cross-domain transferability, and due to the expensive annotation\nprocess, Unsupervised Domain Adaptation (UDA) is appealing where only unlabeled\nimages are needed for the adaptation. Existing UDA methods are mainly based on\nimage or feature alignment with adversarial training for regularization, and\nthey are limited by insufficient supervision in the target domain. In this\npaper, we propose an enhanced Filtered Pseudo Label (FPL+)-based UDA method for\n3D medical image segmentation. It first uses cross-domain data augmentation to\ntranslate labeled images in the source domain to a dual-domain training set\nconsisting of a pseudo source-domain set and a pseudo target-domain set. To\nleverage the dual-domain augmented images to train a pseudo label generator,\ndomain-specific batch normalization layers are used to deal with the domain\nshift while learning the domain-invariant structure features, generating\nhigh-quality pseudo labels for target-domain images. We then combine labeled\nsource-domain images and target-domain images with pseudo labels to train a\nfinal segmentor, where image-level weighting based on uncertainty estimation\nand pixel-level weighting based on dual-domain consensus are proposed to\nmitigate the adverse effect of noisy pseudo labels. 
Experiments on three public\nmulti-modal datasets for Vestibular Schwannoma, brain tumor and whole heart\nsegmentation show that our method surpassed ten state-of-the-art UDA methods,\nand it even achieved better results than fully supervised learning in the\ntarget domain in some cases.\n","authors":["Jianghao Wu","Dong Guo","Guotai Wang","Qiang Yue","Huijun Yu","Kang Li","Shaoting Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04971v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.06462v2","updated":"2024-04-07T14:05:53Z","published":"2023-12-11T15:51:38Z","title":"Cooperation Does Matter: Exploring Multi-Order Bilateral Relations for\n Audio-Visual Segmentation","summary":" Recently, an audio-visual segmentation (AVS) task has been introduced, aiming\nto group pixels with sounding objects within a given video. This task\nnecessitates a first-ever audio-driven pixel-level understanding of the scene,\nposing significant challenges. In this paper, we propose an innovative\naudio-visual transformer framework, termed COMBO, an acronym for COoperation of\nMulti-order Bilateral relatiOns. For the first time, our framework explores\nthree types of bilateral entanglements within AVS: pixel entanglement, modality\nentanglement, and temporal entanglement. Regarding pixel entanglement, we\nemploy a Siam-Encoder Module (SEM) that leverages prior knowledge to generate\nmore precise visual features from the foundational model. For modality\nentanglement, we design a Bilateral-Fusion Module (BFM), enabling COMBO to\nalign corresponding visual and auditory signals bi-directionally. As for\ntemporal entanglement, we introduce an innovative adaptive inter-frame\nconsistency loss according to the inherent rules of temporal. Comprehensive\nexperiments and ablation studies on AVSBench-object (84.7 mIoU on S4, 59.2 mIou\non MS3) and AVSBench-semantic (42.1 mIoU on AVSS) datasets demonstrate that\nCOMBO surpasses previous state-of-the-art methods. Code and more results will\nbe publicly available at https://yannqi.github.io/AVS-COMBO/.\n","authors":["Qi Yang","Xing Nie","Tong Li","Pengfei Gao","Ying Guo","Cheng Zhen","Pengfei Yan","Shiming Xiang"],"pdf_url":"https://arxiv.org/pdf/2312.06462v2.pdf","comment":"CVPR 2024 Highlight. 13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.04960v1","updated":"2024-04-07T13:40:29Z","published":"2024-04-07T13:40:29Z","title":"PairAug: What Can Augmented Image-Text Pairs Do for Radiology?","summary":" Current vision-language pre-training (VLP) methodologies predominantly depend\non paired image-text datasets, a resource that is challenging to acquire in\nradiology due to privacy considerations and labelling complexities. Data\naugmentation provides a practical solution to overcome the issue of data\nscarcity, however, most augmentation methods exhibit a limited focus,\nprioritising either image or text augmentation exclusively. Acknowledging this\nlimitation, our objective is to devise a framework capable of concurrently\naugmenting medical image and text data. We design a Pairwise Augmentation\n(PairAug) approach that contains an Inter-patient Augmentation (InterAug)\nbranch and an Intra-patient Augmentation (IntraAug) branch. Specifically, the\nInterAug branch of our approach generates radiology images using synthesised\nyet plausible reports derived from a Large Language Model (LLM). The generated\npairs can be considered a collection of new patient cases since they are\nartificially created and may not exist in the original dataset. 
In contrast,\nthe IntraAug branch uses newly generated reports to manipulate images. This\nprocess allows us to create new paired data for each individual with diverse\nmedical conditions. Our extensive experiments on various downstream tasks\ncovering medical image classification zero-shot and fine-tuning analysis\ndemonstrate that our PairAug, concurrently expanding both image and text data,\nsubstantially outperforms image-/text-only expansion baselines and advanced\nmedical VLP baselines. Our code is released at\n\\url{https://github.com/YtongXie/PairAug}.\n","authors":["Yutong Xie","Qi Chen","Sinuo Wang","Minh-Son To","Iris Lee","Ee Win Khoo","Kerolos Hendy","Daniel Koh","Yong Xia","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2404.04960v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2311.16514v2","updated":"2024-04-07T13:33:56Z","published":"2023-11-27T13:14:06Z","title":"Video Anomaly Detection via Spatio-Temporal Pseudo-Anomaly Generation :\n A Unified Approach","summary":" Video Anomaly Detection (VAD) is an open-set recognition task, which is\nusually formulated as a one-class classification (OCC) problem, where training\ndata is comprised of videos with normal instances while test data contains both\nnormal and anomalous instances. Recent works have investigated the creation of\npseudo-anomalies (PAs) using only the normal data and making strong assumptions\nabout real-world anomalies with regards to abnormality of objects and speed of\nmotion to inject prior information about anomalies in an autoencoder (AE) based\nreconstruction model during training. This work proposes a novel method for\ngenerating generic spatio-temporal PAs by inpainting a masked out region of an\nimage using a pre-trained Latent Diffusion Model and further perturbing the\noptical flow using mixup to emulate spatio-temporal distortions in the data. In\naddition, we present a simple unified framework to detect real-world anomalies\nunder the OCC setting by learning three types of anomaly indicators, namely\nreconstruction quality, temporal irregularity and semantic inconsistency.\nExtensive experiments on four VAD benchmark datasets namely Ped2, Avenue,\nShanghaiTech and UBnormal demonstrate that our method performs on par with\nother existing state-of-the-art PAs generation and reconstruction based methods\nunder the OCC setting. Our analysis also examines the transferability and\ngeneralisation of PAs across these datasets, offering valuable insights by\nidentifying real-world anomalies through PAs.\n","authors":["Ayush K. Rai","Tarun Krishna","Feiyan Hu","Alexandru Drimbarean","Kevin McGuinness","Alan F. Smeaton","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2311.16514v2.pdf","comment":"Accepted in CVPRW 2024 - VAND Workshop"},{"id":"http://arxiv.org/abs/2404.04956v1","updated":"2024-04-07T13:30:10Z","published":"2024-04-07T13:30:10Z","title":"Gaussian Shading: Provable Performance-Lossless Image Watermarking for\n Diffusion Models","summary":" Ethical concerns surrounding copyright protection and inappropriate content\ngeneration pose challenges for the practical implementation of diffusion\nmodels. One effective solution involves watermarking the generated images.\nHowever, existing methods often compromise the model performance or require\nadditional training, which is undesirable for operators and users. 
To address\nthis issue, we propose Gaussian Shading, a diffusion model watermarking\ntechnique that is both performance-lossless and training-free, while serving\nthe dual purpose of copyright protection and tracing of offending content. Our\nwatermark embedding is free of model parameter modifications and thus is\nplug-and-play. We map the watermark to latent representations following a\nstandard Gaussian distribution, which is indistinguishable from latent\nrepresentations obtained from the non-watermarked diffusion model. Therefore we\ncan achieve watermark embedding with lossless performance, for which we also\nprovide theoretical proof. Furthermore, since the watermark is intricately\nlinked with image semantics, it exhibits resilience to lossy processing and\nerasure attempts. The watermark can be extracted by Denoising Diffusion\nImplicit Models (DDIM) inversion and inverse sampling. We evaluate Gaussian\nShading on multiple versions of Stable Diffusion, and the results demonstrate\nthat Gaussian Shading not only is performance-lossless but also outperforms\nexisting methods in terms of robustness.\n","authors":["Zijin Yang","Kai Zeng","Kejiang Chen","Han Fang","Weiming Zhang","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2404.04956v1.pdf","comment":"17 pages, 11 figures, accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04953v1","updated":"2024-04-07T13:17:47Z","published":"2024-04-07T13:17:47Z","title":"High-Discriminative Attribute Feature Learning for Generalized Zero-Shot\n Learning","summary":" Zero-shot learning(ZSL) aims to recognize new classes without prior exposure\nto their samples, relying on semantic knowledge from observed classes. However,\ncurrent attention-based models may overlook the transferability of visual\nfeatures and the distinctiveness of attribute localization when learning\nregional features in images. Additionally, they often overlook shared\nattributes among different objects. Highly discriminative attribute features\nare crucial for identifying and distinguishing unseen classes. To address these\nissues, we propose an innovative approach called High-Discriminative Attribute\nFeature Learning for Generalized Zero-Shot Learning (HDAFL). HDAFL optimizes\nvisual features by learning attribute features to obtain discriminative visual\nembeddings. Specifically, HDAFL utilizes multiple convolutional kernels to\nautomatically learn discriminative regions highly correlated with attributes in\nimages, eliminating irrelevant interference in image features. Furthermore, we\nintroduce a Transformer-based attribute discrimination encoder to enhance the\ndiscriminative capability among attributes. Simultaneously, the method employs\ncontrastive loss to alleviate dataset biases and enhance the transferability of\nvisual features, facilitating better semantic transfer between seen and unseen\nclasses. Experimental results demonstrate the effectiveness of HDAFL across\nthree widely used datasets.\n","authors":["Yu Lei","Guoshuai Sheng","Fangfang Li","Quanxue Gao","Cheng Deng","Qin Li"],"pdf_url":"https://arxiv.org/pdf/2404.04953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09434v2","updated":"2024-04-07T13:05:24Z","published":"2024-03-14T14:25:10Z","title":"Reconstruction and Simulation of Elastic Objects with Spring-Mass 3D\n Gaussians","summary":" Reconstructing and simulating elastic objects from visual observations is\ncrucial for applications in computer vision and robotics. 
Existing methods,\nsuch as 3D Gaussians, model 3D appearance and geometry, but lack the ability to\nestimate physical properties for objects and simulate them. The core challenge\nlies in integrating an expressive yet efficient physical dynamics model. We\npropose Spring-Gaus, a 3D physical object representation for reconstructing and\nsimulating elastic objects from videos of the object from multiple viewpoints.\nIn particular, we develop and integrate a 3D Spring-Mass model into 3D Gaussian\nkernels, enabling the reconstruction of the visual appearance, shape, and\nphysical dynamics of the object. Our approach enables future prediction and\nsimulation under various initial states and environmental properties. We\nevaluate Spring-Gaus on both synthetic and real-world datasets, demonstrating\naccurate reconstruction and simulation of elastic objects. Project page:\nhttps://zlicheng.com/spring_gaus.\n","authors":["Licheng Zhong","Hong-Xing Yu","Jiajun Wu","Yunzhu Li"],"pdf_url":"https://arxiv.org/pdf/2403.09434v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05964v2","updated":"2024-04-07T13:03:58Z","published":"2024-02-05T12:16:28Z","title":"A Survey on Transformer Compression","summary":" Transformer plays a vital role in the realms of natural language processing\n(NLP) and computer vision (CV), specially for constructing large language\nmodels (LLM) and large vision models (LVM). Model compression methods reduce\nthe memory and computational cost of Transformer, which is a necessary step to\nimplement large language/vision models on practical devices. Given the unique\narchitecture of Transformer, featuring alternative attention and feedforward\nneural network (FFN) modules, specific compression techniques are usually\nrequired. The efficiency of these compression methods is also paramount, as\nretraining large models on the entire training dataset is usually impractical.\nThis survey provides a comprehensive review of recent compression methods, with\na specific focus on their application to Transformer-based models. The\ncompression methods are primarily categorized into pruning, quantization,\nknowledge distillation, and efficient architecture design (Mamba, RetNet, RWKV,\netc.). In each category, we discuss compression methods for both language and\nvision tasks, highlighting common underlying principles. Finally, we delve into\nthe relation between various compression methods, and discuss further\ndirections in this domain.\n","authors":["Yehui Tang","Yunhe Wang","Jianyuan Guo","Zhijun Tu","Kai Han","Hailin Hu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2402.05964v2.pdf","comment":"Model Compression, Transformer, Large Language Model, Large Vision\n Model, LLM"},{"id":"http://arxiv.org/abs/2404.04946v1","updated":"2024-04-07T12:57:41Z","published":"2024-04-07T12:57:41Z","title":"AnimateZoo: Zero-shot Video Generation of Cross-Species Animation via\n Subject Alignment","summary":" Recent video editing advancements rely on accurate pose sequences to animate\nsubjects. However, these efforts are not suitable for cross-species animation\ndue to pose misalignment between species (for example, the poses of a cat\ndiffers greatly from that of a pig due to differences in body structure). In\nthis paper, we present AnimateZoo, a zero-shot diffusion-based video generator\nto address this challenging cross-species animation issue, aiming to accurately\nproduce animal animations while preserving the background. 
The key technique\nused in our AnimateZoo is subject alignment, which includes two steps. First,\nwe improve appearance feature extraction by integrating a Laplacian detail\nbooster and a prompt-tuning identity extractor. These components are\nspecifically designed to capture essential appearance information, including\nidentity and fine details. Second, we align shape features and address\nconflicts from differing subjects by introducing a scale-information remover.\nThis ensures accurate cross-species animation. Moreover, we introduce two\nhigh-quality animal video datasets featuring a wide variety of species. Trained\non these extensive datasets, our model is capable of generating videos\ncharacterized by accurate movements, consistent appearance, and high-fidelity\nframes, without the need for the pre-inference fine-tuning that prior arts\nrequired. Extensive experiments showcase the outstanding performance of our\nmethod in cross-species action following tasks, demonstrating exceptional shape\nadaptation capability. The project page is available at\nhttps://justinxu0.github.io/AnimateZoo/.\n","authors":["Yuanfeng Xu","Yuhao Chen","Zhongzhan Huang","Zijian He","Guangrun Wang","Philip Torr","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2404.04946v1.pdf","comment":"Technical report,15 pages"},{"id":"http://arxiv.org/abs/2404.03043v2","updated":"2024-04-07T12:37:04Z","published":"2024-04-03T20:05:00Z","title":"Linear Anchored Gaussian Mixture Model for Location and Width\n Computation of Objects in Thick Line Shape","summary":" An accurate detection of the centerlines of linear objects is a challenging\ntopic in many sensitive real-world applications such X-ray imaging, remote\nsensing and lane marking detection in road traffic. Model-based approaches\nusing Hough and Radon transforms are often used but, are not recommended for\nthick line detection, whereas approaches based on image derivatives need\nfurther step-by-step processing, making their efficiency dependent on each step\noutcomes. In this paper, we aim to detect linear structures found in images by\nconsidering the 3D representation of the image gray levels as a finite mixture\nmodel of statistical distribution. The latter, which we named linear anchored\nGaussian distribution could be parametrized by a scale value ${\\sigma}$\ndescribing the linear structure thickness and a line equation, parametrized, in\nturn, by a radius ${\\rho}$ and an orientation angle ${\\theta}$, describing the\nlinear structure centerline location. Expectation-Maximization (EM) algorithm\nis used for the mixture model parameter estimation, where a new paradigm, using\nthe background subtraction for the likelihood function computation, is\nproposed. For the EM algorithm, two ${\\theta}$ parameter initialization schemes\nare used: the first one is based on a random choice of the first component of\n${\\theta}$ vector, whereas the second is based on the image Hessian with a\nsimultaneous computation of the mixture model components number. 
Experiments on\nreal world images and synthetic images corrupted by blur and additive noise\nshow the good performance of the proposed methods, where the algorithm using\nbackground subtraction and Hessian-based ${\\theta}$ initialization provides an\noutstanding accuracy of the linear structure detection despite irregular image\nbackground and presence of blur and noise.\n","authors":["Nafaa Nacereddine","Aicha Baya Goumeidane","Djemel Ziou"],"pdf_url":"https://arxiv.org/pdf/2404.03043v2.pdf","comment":"13 pages, 13 figures"},{"id":"http://arxiv.org/abs/2305.13799v2","updated":"2024-04-07T12:33:08Z","published":"2023-05-23T08:13:09Z","title":"UPNet: Uncertainty-based Picking Deep Learning Network for Robust First\n Break Picking","summary":" In seismic exploration, first break (FB) picking is a crucial aspect in the\ndetermination of subsurface velocity models, significantly influencing the\nplacement of wells. Many deep neural networks (DNNs)-based automatic picking\nmethods have been proposed to accelerate this processing. Significantly, the\nsegmentation-based DNN methods provide a segmentation map and then estimate FB\nfrom the map using a picking threshold. However, the uncertainty of the results\npicked by DNNs still needs to be analyzed. Thus, the automatic picking methods\napplied in field datasets can not ensure robustness, especially in the case of\na low signal-to-noise ratio (SNR). In this paper, we introduce uncertainty\nquantification into the FB picking task and propose a novel uncertainty-based\npicking deep learning network called UPNet. UPNet not only estimates the\nuncertainty of network output but also can filter the pickings with low\nconfidence. Many experiments evaluate that UPNet exhibits higher accuracy and\nrobustness than the deterministic DNN-based model, achieving State-of-the-Art\n(SOTA) performance in field surveys. In addition, we verify that the\nmeasurement uncertainty is meaningful, which can provide a reference for human\ndecision-making.\n","authors":["Hongtao Wang","Jiangshe Zhang","Xiaoli Wei","Li Long","Chunxia Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13799v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04936v1","updated":"2024-04-07T12:17:40Z","published":"2024-04-07T12:17:40Z","title":"Bootstrapping Chest CT Image Understanding by Distilling Knowledge from\n X-ray Expert Models","summary":" Radiologists highly desire fully automated versatile AI for medical imaging\ninterpretation. However, the lack of extensively annotated large-scale\nmulti-disease datasets has hindered the achievement of this goal. In this\npaper, we explore the feasibility of leveraging language as a naturally\nhigh-quality supervision for chest CT imaging. In light of the limited\navailability of image-report pairs, we bootstrap the understanding of 3D chest\nCT images by distilling chest-related diagnostic knowledge from an extensively\npre-trained 2D X-ray expert model. Specifically, we propose a language-guided\nretrieval method to match each 3D CT image with its semantically closest 2D\nX-ray image, and perform pair-wise and semantic relation knowledge\ndistillation. Subsequently, we use contrastive learning to align images and\nreports within the same patient while distinguishing them from the other\npatients. However, the challenge arises when patients have similar semantic\ndiagnoses, such as healthy patients, potentially confusing if treated as\nnegatives. We introduce a robust contrastive learning that identifies and\ncorrects these false negatives. 
We train our model with over 12,000 pairs of\nchest CT images and radiology reports. Extensive experiments across multiple\nscenarios, including zero-shot learning, report generation, and fine-tuning\nprocesses, demonstrate the model's feasibility in interpreting chest CT images.\n","authors":["Weiwei Cao","Jianpeng Zhang","Yingda Xia","Tony C. W. Mok","Zi Li","Xianghua Ye","Le Lu","Jian Zheng","Yuxing Tang","Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04936v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04935v1","updated":"2024-04-07T12:15:53Z","published":"2024-04-07T12:15:53Z","title":"Anomaly Detection in Electrocardiograms: Advancing Clinical Diagnosis\n Through Self-Supervised Learning","summary":" The electrocardiogram (ECG) is an essential tool for diagnosing heart\ndisease, with computer-aided systems improving diagnostic accuracy and reducing\nhealthcare costs. Despite advancements, existing systems often miss rare\ncardiac anomalies that could be precursors to serious, life-threatening issues\nor alterations in the cardiac macro/microstructure. We address this gap by\nfocusing on self-supervised anomaly detection (AD), training exclusively on\nnormal ECGs to recognize deviations indicating anomalies. We introduce a novel\nself-supervised learning framework for ECG AD, utilizing a vast dataset of\nnormal ECGs to autonomously detect and localize cardiac anomalies. It proposes\na novel masking and restoration technique alongside a multi-scale\ncross-attention module, enhancing the model's ability to integrate global and\nlocal signal features. The framework emphasizes accurate localization of\nanomalies within ECG signals, ensuring the method's clinical relevance and\nreliability. To reduce the impact of individual variability, the approach\nfurther incorporates crucial patient-specific information from ECG reports,\nsuch as age and gender, thus enabling accurate identification of a broad\nspectrum of cardiac anomalies, including rare ones. Utilizing an extensive\ndataset of 478,803 ECG graphic reports from real-world clinical practice, our\nmethod has demonstrated exceptional effectiveness in AD across all tested\nconditions, regardless of their frequency of occurrence, significantly\noutperforming existing models. It achieved superior performance metrics,\nincluding an AUROC of 91.2%, an F1 score of 83.7%, a sensitivity rate of 84.2%,\na specificity of 83.0%, and a precision of 75.6% with a fixed recall rate of\n90%. It has also demonstrated robust localization capabilities, with an AUROC\nof 76.5% and a Dice coefficient of 65.3% for anomaly localization.\n","authors":["Aofan Jiang","Chaoqin Huang","Qing Cao","Yuchen Xu","Zi Zeng","Kang Chen","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04935v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04933v1","updated":"2024-04-07T12:14:42Z","published":"2024-04-07T12:14:42Z","title":"UniMD: Towards Unifying Moment Retrieval and Temporal Action Detection","summary":" Temporal Action Detection (TAD) focuses on detecting pre-defined actions,\nwhile Moment Retrieval (MR) aims to identify the events described by open-ended\nnatural language within untrimmed videos. Despite that they focus on different\nevents, we observe they have a significant connection. For instance, most\ndescriptions in MR involve multiple actions from TAD. In this paper, we aim to\ninvestigate the potential synergy between TAD and MR. 
Firstly, we propose a\nunified architecture, termed Unified Moment Detection (UniMD), for both TAD and\nMR. It transforms the inputs of the two tasks, namely actions for TAD or events\nfor MR, into a common embedding space, and utilizes two novel query-dependent\ndecoders to generate a uniform output of classification score and temporal\nsegments. Secondly, we explore the efficacy of two task fusion learning\napproaches, pre-training and co-training, in order to enhance the mutual\nbenefits between TAD and MR. Extensive experiments demonstrate that the\nproposed task fusion learning scheme enables the two tasks to help each other\nand outperform the separately trained counterparts. Impressively, UniMD\nachieves state-of-the-art results on three paired datasets Ego4D, Charades-STA,\nand ActivityNet. Our code will be released at\nhttps://github.com/yingsen1/UniMD.\n","authors":["Yingsen Zeng","Yujie Zhong","Chengjian Feng","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2404.04933v1.pdf","comment":"Tech report"},{"id":"http://arxiv.org/abs/2402.13185v4","updated":"2024-04-07T12:11:28Z","published":"2024-02-20T17:52:12Z","title":"UniEdit: A Unified Tuning-Free Framework for Video Motion and Appearance\n Editing","summary":" Recent advances in text-guided video editing have showcased promising results\nin appearance editing (e.g., stylization). However, video motion editing in the\ntemporal dimension (e.g., from eating to waving), which distinguishes video\nediting from image editing, is underexplored. In this work, we present UniEdit,\na tuning-free framework that supports both video motion and appearance editing\nby harnessing the power of a pre-trained text-to-video generator within an\ninversion-then-generation framework. To realize motion editing while preserving\nsource video content, based on the insights that temporal and spatial\nself-attention layers encode inter-frame and intra-frame dependency\nrespectively, we introduce auxiliary motion-reference and reconstruction\nbranches to produce text-guided motion and source features respectively. The\nobtained features are then injected into the main editing path via temporal and\nspatial self-attention layers. Extensive experiments demonstrate that UniEdit\ncovers video motion editing and various appearance editing scenarios, and\nsurpasses the state-of-the-art methods. Our code will be publicly available.\n","authors":["Jianhong Bai","Tianyu He","Yuchi Wang","Junliang Guo","Haoji Hu","Zuozhu Liu","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2402.13185v4.pdf","comment":"Project page: https://jianhongbai.github.io/UniEdit/"},{"id":"http://arxiv.org/abs/2305.13600v2","updated":"2024-04-07T11:50:34Z","published":"2023-05-23T02:02:36Z","title":"SiCL: Silhouette-Driven Contrastive Learning for Unsupervised Person\n Re-Identification with Clothes Change","summary":" In this paper, we address a highly challenging yet critical task:\nunsupervised long-term person re-identification with clothes change. Existing\nunsupervised person re-id methods are mainly designed for short-term scenarios\nand usually rely on RGB cues so that fail to perceive feature patterns that are\nindependent of the clothes. To crack this bottleneck, we propose a\nsilhouette-driven contrastive learning (SiCL) method, which is designed to\nlearn cross-clothes invariance by integrating both the RGB cues and the\nsilhouette information within a contrastive learning framework. 
To our\nknowledge, this is the first tailor-made framework for unsupervised long-term\nclothes change \\reid{}, with superior performance on six benchmark datasets. We\nconduct extensive experiments to evaluate our proposed SiCL compared to the\nstate-of-the-art unsupervised person reid methods across all the representative\ndatasets. Experimental results demonstrate that our proposed SiCL significantly\noutperforms other unsupervised re-id methods.\n","authors":["Mingkun Li","Peng Xu","Chun-Guang Li","Jun Guo"],"pdf_url":"https://arxiv.org/pdf/2305.13600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04924v1","updated":"2024-04-07T11:48:07Z","published":"2024-04-07T11:48:07Z","title":"GvT: A Graph-based Vision Transformer with Talking-Heads Utilizing\n Sparsity, Trained from Scratch on Small Datasets","summary":" Vision Transformers (ViTs) have achieved impressive results in large-scale\nimage classification. However, when training from scratch on small datasets,\nthere is still a significant performance gap between ViTs and Convolutional\nNeural Networks (CNNs), which is attributed to the lack of inductive bias. To\naddress this issue, we propose a Graph-based Vision Transformer (GvT) that\nutilizes graph convolutional projection and graph-pooling. In each block,\nqueries and keys are calculated through graph convolutional projection based on\nthe spatial adjacency matrix, while dot-product attention is used in another\ngraph convolution to generate values. When using more attention heads, the\nqueries and keys become lower-dimensional, making their dot product an\nuninformative matching function. To overcome this low-rank bottleneck in\nattention heads, we employ talking-heads technology based on bilinear pooled\nfeatures and sparse selection of attention tensors. This allows interaction\namong filtered attention scores and enables each attention mechanism to depend\non all queries and keys. Additionally, we apply graph-pooling between two\nintermediate blocks to reduce the number of tokens and aggregate semantic\ninformation more effectively. Our experimental results show that GvT produces\ncomparable or superior outcomes to deep convolutional networks and surpasses\nvision transformers without pre-training on large datasets. The code for our\nproposed model is publicly available on the website.\n","authors":["Dongjing Shan","guiqiang chen"],"pdf_url":"https://arxiv.org/pdf/2404.04924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12488v2","updated":"2024-04-07T11:38:48Z","published":"2024-03-19T06:54:33Z","title":"DetToolChain: A New Prompting Paradigm to Unleash Detection Ability of\n MLLM","summary":" We present DetToolChain, a novel prompting paradigm, to unleash the zero-shot\nobject detection ability of multimodal large language models (MLLMs), such as\nGPT-4V and Gemini. Our approach consists of a detection prompting toolkit\ninspired by high-precision detection priors and a new Chain-of-Thought to\nimplement these prompts. Specifically, the prompts in the toolkit are designed\nto guide the MLLM to focus on regional information (e.g., zooming in), read\ncoordinates according to measure standards (e.g., overlaying rulers and\ncompasses), and infer from the contextual information (e.g., overlaying scene\ngraphs). Building upon these tools, the new detection chain-of-thought can\nautomatically decompose the task into simple subtasks, diagnose the\npredictions, and plan for progressive box refinements. 
The effectiveness of our\nframework is demonstrated across a spectrum of detection tasks, especially hard\ncases. Compared to existing state-of-the-art methods, GPT-4V with our\nDetToolChain improves state-of-the-art object detectors by +21.5% AP50 on MS\nCOCO Novel class set for open-vocabulary detection, +24.23% Acc on RefCOCO val\nset for zero-shot referring expression comprehension, +14.5% AP on D-cube\ndescribe object detection FULL setting.\n","authors":["Yixuan Wu","Yizhou Wang","Shixiang Tang","Wenhao Wu","Tong He","Wanli Ouyang","Jian Wu","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2403.12488v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04922v1","updated":"2024-04-07T11:25:04Z","published":"2024-04-07T11:25:04Z","title":"Efficient Learnable Collaborative Attention for Single Image\n Super-Resolution","summary":" Non-Local Attention (NLA) is a powerful technique for capturing long-range\nfeature correlations in deep single image super-resolution (SR). However, NLA\nsuffers from high computational complexity and memory consumption, as it\nrequires aggregating all non-local feature information for each query response\nand recalculating the similarity weight distribution for different abstraction\nlevels of features. To address these challenges, we propose a novel Learnable\nCollaborative Attention (LCoA) that introduces inductive bias into non-local\nmodeling. Our LCoA consists of two components: Learnable Sparse Pattern (LSP)\nand Collaborative Attention (CoA). LSP uses the k-means clustering algorithm to\ndynamically adjust the sparse attention pattern of deep features, which reduces\nthe number of non-local modeling rounds compared with existing sparse\nsolutions. CoA leverages the sparse attention pattern and weights learned by\nLSP, and co-optimizes the similarity matrix across different abstraction\nlevels, which avoids redundant similarity matrix calculations. The experimental\nresults show that our LCoA can reduce the non-local modeling time by about 83%\nin the inference stage. In addition, we integrate our LCoA into a deep\nLearnable Collaborative Attention Network (LCoAN), which achieves competitive\nperformance in terms of inference time, memory consumption, and reconstruction\nquality compared with other state-of-the-art SR methods.\n","authors":["Yigang Zhao Chaowei Zheng","Jiannan Su"," GuangyongChen"," MinGan"],"pdf_url":"https://arxiv.org/pdf/2404.04922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16499v2","updated":"2024-04-07T11:16:15Z","published":"2024-03-25T07:34:06Z","title":"Self-Supervised Learning for Medical Image Data with Anatomy-Oriented\n Imaging Planes","summary":" Self-supervised learning has emerged as a powerful tool for pretraining deep\nnetworks on unlabeled data, prior to transfer learning of target tasks with\nlimited annotation. The relevance between the pretraining pretext and target\ntasks is crucial to the success of transfer learning. Various pretext tasks\nhave been proposed to utilize properties of medical image data (e.g., three\ndimensionality), which are more relevant to medical image analysis than generic\nones for natural images. However, previous work rarely paid attention to data\nwith anatomy-oriented imaging planes, e.g., standard cardiac magnetic resonance\nimaging views. As these imaging planes are defined according to the anatomy of\nthe imaged organ, pretext tasks effectively exploiting this information can\npretrain the networks to gain knowledge on the organ of interest. 
In this work,\nwe propose two complementary pretext tasks for this group of medical image data\nbased on the spatial relationship of the imaging planes. The first is to learn\nthe relative orientation between the imaging planes and implemented as\nregressing their intersecting lines. The second exploits parallel imaging\nplanes to regress their relative slice locations within a stack. Both pretext\ntasks are conceptually straightforward and easy to implement, and can be\ncombined in multitask learning for better representation learning. Thorough\nexperiments on two anatomical structures (heart and knee) and representative\ntarget tasks (semantic segmentation and classification) demonstrate that the\nproposed pretext tasks are effective in pretraining deep networks for\nremarkably boosted performance on the target tasks, and superior to other\nrecent approaches.\n","authors":["Tianwei Zhang","Dong Wei","Mengmeng Zhu","Shi Gu","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2403.16499v2.pdf","comment":"Medical Image Analysis"},{"id":"http://arxiv.org/abs/2404.04916v1","updated":"2024-04-07T10:57:54Z","published":"2024-04-07T10:57:54Z","title":"Correcting Diffusion-Based Perceptual Image Compression with Privileged\n End-to-End Decoder","summary":" The images produced by diffusion models can attain excellent perceptual\nquality. However, it is challenging for diffusion models to guarantee\ndistortion, hence the integration of diffusion models and image compression\nmodels still needs more comprehensive explorations. This paper presents a\ndiffusion-based image compression method that employs a privileged end-to-end\ndecoder model as correction, which achieves better perceptual quality while\nguaranteeing the distortion to an extent. We build a diffusion model and design\na novel paradigm that combines the diffusion model and an end-to-end decoder,\nand the latter is responsible for transmitting the privileged information\nextracted at the encoder side. Specifically, we theoretically analyze the\nreconstruction process of the diffusion models at the encoder side with the\noriginal images being visible. Based on the analysis, we introduce an\nend-to-end convolutional decoder to provide a better approximation of the score\nfunction $\\nabla_{\\mathbf{x}_t}\\log p(\\mathbf{x}_t)$ at the encoder side and\neffectively transmit the combination. Experiments demonstrate the superiority\nof our method in both distortion and perception compared with previous\nperceptual compression methods.\n","authors":["Yiyang Ma","Wenhan Yang","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2404.04916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04913v1","updated":"2024-04-07T10:49:59Z","published":"2024-04-07T10:49:59Z","title":"CodecNeRF: Toward Fast Encoding and Decoding, Compact, and High-quality\n Novel-view Synthesis","summary":" Neural Radiance Fields (NeRF) have achieved huge success in effectively\ncapturing and representing 3D objects and scenes. However, several factors have\nimpeded its further proliferation as next-generation 3D media. To establish a\nubiquitous presence in everyday media formats, such as images and videos, it is\nimperative to devise a solution that effectively fulfills three key objectives:\nfast encoding and decoding time, compact model sizes, and high-quality\nrenderings. Despite significant advancements, a comprehensive algorithm that\nadequately addresses all objectives has yet to be fully realized. 
In this work,\nwe present CodecNeRF, a neural codec for NeRF representations, consisting of a\nnovel encoder and decoder architecture that can generate a NeRF representation\nin a single forward pass. Furthermore, inspired by the recent\nparameter-efficient finetuning approaches, we develop a novel finetuning method\nto efficiently adapt the generated NeRF representations to a new test instance,\nleading to high-quality image renderings and compact code sizes. The proposed\nCodecNeRF, a newly suggested encoding-decoding-finetuning pipeline for NeRF,\nachieved unprecedented compression performance of more than 150x and 20x\nreduction in encoding time while maintaining (or improving) the image quality\non widely used 3D object datasets, such as ShapeNet and Objaverse.\n","authors":["Gyeongjin Kang","Younggeun Lee","Eunbyung Park"],"pdf_url":"https://arxiv.org/pdf/2404.04913v1.pdf","comment":"34 pages, 22 figures, Project page:\n https://gynjn.github.io/Codec-NeRF/"},{"id":"http://arxiv.org/abs/2404.04910v1","updated":"2024-04-07T10:39:04Z","published":"2024-04-07T10:39:04Z","title":"MonoTAKD: Teaching Assistant Knowledge Distillation for Monocular 3D\n Object Detection","summary":" Monocular 3D object detection (Mono3D) is an indispensable research topic in\nautonomous driving, thanks to the cost-effective monocular camera sensors and\nits wide range of applications. Since the image perspective has depth\nambiguity, the challenges of Mono3D lie in understanding 3D scene geometry and\nreconstructing 3D object information from a single image. Previous methods\nattempted to transfer 3D information directly from the LiDAR-based teacher to\nthe camera-based student. However, a considerable gap in feature representation\nmakes direct cross-modal distillation inefficient, resulting in a significant\nperformance deterioration between the LiDAR-based teacher and the camera-based\nstudent. To address this issue, we propose the Teaching Assistant Knowledge\nDistillation (MonoTAKD) to break down the learning objective by integrating\nintra-modal distillation with cross-modal residual distillation. In particular,\nwe employ a strong camera-based teaching assistant model to distill powerful\nvisual knowledge effectively through intra-modal distillation. Subsequently, we\nintroduce the cross-modal residual distillation to transfer the 3D spatial\ncues. By acquiring both visual knowledge and 3D spatial cues, the predictions\nof our approach are rigorously evaluated on the KITTI 3D object detection\nbenchmark and achieve state-of-the-art performance in Mono3D.\n","authors":["Hou-I Liu","Christine Wu","Jen-Hao Cheng","Wenhao Chai","Shian-Yun Wang","Gaowen Liu","Jenq-Neng Hwang","Hong-Han Shuai","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.04910v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2404.04908v1","updated":"2024-04-07T10:28:01Z","published":"2024-04-07T10:28:01Z","title":"Dual-Camera Smooth Zoom on Mobile Phones","summary":" When zooming between dual cameras on a mobile, noticeable jumps in geometric\ncontent and image color occur in the preview, inevitably affecting the user's\nzoom experience. In this work, we introduce a new task, ie, dual-camera smooth\nzoom (DCSZ) to achieve a smooth zoom preview. 
The frame interpolation (FI)\ntechnique is a potential solution but struggles with ground-truth collection.\nTo address the issue, we suggest a data factory solution where continuous\nvirtual cameras are assembled to generate DCSZ data by rendering reconstructed\n3D models of the scene. In particular, we propose a novel dual-camera smooth\nzoom Gaussian Splatting (ZoomGS), where a camera-specific encoding is\nintroduced to construct a specific 3D model for each virtual camera. With the\nproposed data factory, we construct a synthetic dataset for DCSZ, and we\nutilize it to fine-tune FI models. In addition, we collect real-world dual-zoom\nimages without ground-truth for evaluation. Extensive experiments are conducted\nwith multiple FI methods. The results show that the fine-tuned FI models\nachieve a significant performance improvement over the original ones on DCSZ\ntask. The datasets, codes, and pre-trained models will be publicly available.\n","authors":["Renlong Wu","Zhilu Zhang","Yu Yang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.04908v1.pdf","comment":"24"},{"id":"http://arxiv.org/abs/2403.16834v2","updated":"2024-04-07T09:56:54Z","published":"2024-03-25T14:57:29Z","title":"From Two-Stream to One-Stream: Efficient RGB-T Tracking via Mutual\n Prompt Learning and Knowledge Distillation","summary":" Due to the complementary nature of visible light and thermal infrared\nmodalities, object tracking based on the fusion of visible light images and\nthermal images (referred to as RGB-T tracking) has received increasing\nattention from researchers in recent years. How to achieve more comprehensive\nfusion of information from the two modalities at a lower cost has been an issue\nthat researchers have been exploring. Inspired by visual prompt learning, we\ndesigned a novel two-stream RGB-T tracking architecture based on cross-modal\nmutual prompt learning, and used this model as a teacher to guide a one-stream\nstudent model for rapid learning through knowledge distillation techniques.\nExtensive experiments have shown that, compared to similar RGB-T trackers, our\ndesigned teacher model achieved the highest precision rate, while the student\nmodel, with comparable precision rate to the teacher model, realized an\ninference speed more than three times faster than the teacher model.(Codes will\nbe available if accepted.)\n","authors":["Yang Luo","Xiqing Guo","Hao Li"],"pdf_url":"https://arxiv.org/pdf/2403.16834v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11700v4","updated":"2024-04-07T09:17:34Z","published":"2023-11-20T12:08:23Z","title":"GS-SLAM: Dense Visual SLAM with 3D Gaussian Splatting","summary":" In this paper, we introduce \\textbf{GS-SLAM} that first utilizes 3D Gaussian\nrepresentation in the Simultaneous Localization and Mapping (SLAM) system. It\nfacilitates a better balance between efficiency and accuracy. Compared to\nrecent SLAM methods employing neural implicit representations, our method\nutilizes a real-time differentiable splatting rendering pipeline that offers\nsignificant speedup to map optimization and RGB-D rendering. Specifically, we\npropose an adaptive expansion strategy that adds new or deletes noisy 3D\nGaussians in order to efficiently reconstruct new observed scene geometry and\nimprove the mapping of previously observed areas. This strategy is essential to\nextend 3D Gaussian representation to reconstruct the whole scene rather than\nsynthesize a static object in existing methods. 
Moreover, in the pose tracking\nprocess, an effective coarse-to-fine technique is designed to select reliable\n3D Gaussian representations to optimize camera pose, resulting in runtime\nreduction and robust estimation. Our method achieves competitive performance\ncompared with existing state-of-the-art real-time methods on the Replica,\nTUM-RGBD datasets. Project page: https://gs-slam.github.io/.\n","authors":["Chi Yan","Delin Qu","Dan Xu","Bin Zhao","Zhigang Wang","Dong Wang","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2311.11700v4.pdf","comment":"Accepted to CVPR 2024(highlight). Project Page:\n https://gs-slam.github.io/"},{"id":"http://arxiv.org/abs/2404.04891v1","updated":"2024-04-07T09:17:00Z","published":"2024-04-07T09:17:00Z","title":"DL-EWF: Deep Learning Empowering Women's Fashion with\n Grounded-Segment-Anything Segmentation for Body Shape Classification","summary":" The global fashion industry plays a pivotal role in the global economy, and\naddressing fundamental issues within the industry is crucial for developing\ninnovative solutions. One of the most pressing challenges in the fashion\nindustry is the mismatch between body shapes and the garments of individuals\nthey purchase. This issue is particularly prevalent among individuals with\nnon-ideal body shapes, exacerbating the challenges faced. Considering\ninter-individual variability in body shapes is essential for designing and\nproducing garments that are widely accepted by consumers. Traditional methods\nfor determining human body shape are limited due to their low accuracy, high\ncosts, and time-consuming nature. New approaches, utilizing digital imaging and\ndeep neural networks (DNN), have been introduced to identify human body shape.\nIn this study, the Style4BodyShape dataset is used for classifying body shapes\ninto five categories: Rectangle, Triangle, Inverted Triangle, Hourglass, and\nApple. In this paper, the body shape segmentation of a person is extracted from\nthe image, disregarding the surroundings and background. Then, Various\npre-trained models, such as ResNet18, ResNet34, ResNet50, VGG16, VGG19, and\nInception v3, are used to classify the segmentation results. Among these\npre-trained models, the Inception V3 model demonstrates superior performance\nregarding f1-score evaluation metric and accuracy compared to the other models.\n","authors":["Fatemeh Asghari","Mohammad Reza Soheili","Faezeh Gholamrezaie"],"pdf_url":"https://arxiv.org/pdf/2404.04891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04890v1","updated":"2024-04-07T09:15:45Z","published":"2024-04-07T09:15:45Z","title":"A Unified Diffusion Framework for Scene-aware Human Motion Estimation\n from Sparse Signals","summary":" Estimating full-body human motion via sparse tracking signals from\nhead-mounted displays and hand controllers in 3D scenes is crucial to\napplications in AR/VR. One of the biggest challenges to this task is the\none-to-many mapping from sparse observations to dense full-body motions, which\nendowed inherent ambiguities. To help resolve this ambiguous problem, we\nintroduce a new framework to combine rich contextual information provided by\nscenes to benefit full-body motion tracking from sparse observations. 
To\nestimate plausible human motions given sparse tracking signals and 3D scenes,\nwe develop $\\text{S}^2$Fusion, a unified framework fusing \\underline{S}cene and\nsparse \\underline{S}ignals with a conditional dif\\underline{Fusion} model.\n$\\text{S}^2$Fusion first extracts the spatial-temporal relations residing in\nthe sparse signals via a periodic autoencoder, and then produces time-alignment\nfeature embedding as additional inputs. Subsequently, by drawing initial noisy\nmotion from a pre-trained prior, $\\text{S}^2$Fusion utilizes conditional\ndiffusion to fuse scene geometry and sparse tracking signals to generate\nfull-body scene-aware motions. The sampling procedure of $\\text{S}^2$Fusion is\nfurther guided by a specially designed scene-penetration loss and\nphase-matching loss, which effectively regularizes the motion of the lower body\neven in the absence of any tracking signals, making the generated motion much\nmore plausible and coherent. Extensive experimental results have demonstrated\nthat our $\\text{S}^2$Fusion outperforms the state-of-the-art in terms of\nestimation quality and smoothness.\n","authors":["Jiangnan Tang","Jingya Wang","Kaiyang Ji","Lan Xu","Jingyi Yu","Ye Shi"],"pdf_url":"https://arxiv.org/pdf/2404.04890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04887v1","updated":"2024-04-07T09:08:14Z","published":"2024-04-07T09:08:14Z","title":"A Clinical-oriented Multi-level Contrastive Learning Method for Disease\n Diagnosis in Low-quality Medical Images","summary":" Representation learning offers a conduit to elucidate distinctive features\nwithin the latent space and interpret the deep models. However, the randomness\nof lesion distribution and the complexity of low-quality factors in medical\nimages pose great challenges for models to extract key lesion features. Disease\ndiagnosis methods guided by contrastive learning (CL) have shown significant\nadvantages in lesion feature representation. Nevertheless, the effectiveness of\nCL is highly dependent on the quality of the positive and negative sample\npairs. In this work, we propose a clinical-oriented multi-level CL framework\nthat aims to enhance the model's capacity to extract lesion features and\ndiscriminate between lesion and low-quality factors, thereby enabling more\naccurate disease diagnosis from low-quality medical images. Specifically, we\nfirst construct multi-level positive and negative pairs to enhance the model's\ncomprehensive recognition capability of lesion features by integrating\ninformation from different levels and qualities of medical images. Moreover, to\nimprove the quality of the learned lesion embeddings, we introduce a dynamic\nhard sample mining method based on self-paced learning. The proposed CL\nframework is validated on two public medical image datasets, EyeQ and Chest\nX-ray, demonstrating superior performance compared to other state-of-the-art\ndisease diagnostic methods.\n","authors":["Qingshan Hou","Shuai Cheng","Peng Cao","Jinzhu Yang","Xiaoli Liu","Osmar R. Zaiane","Yih Chung Tham"],"pdf_url":"https://arxiv.org/pdf/2404.04887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04884v1","updated":"2024-04-07T09:05:04Z","published":"2024-04-07T09:05:04Z","title":"LRNet: Change detection of high-resolution remote sensing imagery via\n strategy of localization-then-refinement","summary":" Change detection, as a research hotspot in the field of remote sensing, has\nwitnessed continuous development and progress. 
However, the discrimination of\nboundary details remains a significant bottleneck due to the complexity of\nsurrounding elements between change areas and backgrounds. Discriminating the\nboundaries of large change areas results in misalignment, while connecting\nboundaries occurs for small change targets. To address the above issues, a\nnovel network based on the localization-then-refinement strategy is proposed in\nthis paper, namely LRNet. LRNet consists of two stages: localization and\nrefinement. In the localization stage, a three-branch encoder simultaneously\nextracts original image features and their differential features for\ninteractive localization of the position of each change area. To minimize\ninformation loss during feature extraction, learnable optimal pooling (LOP) is\nproposed to replace the widely used max-pooling. Additionally, this process is\ntrainable and contributes to the overall optimization of the network. To\neffectively interact features from different branches and accurately locate\nchange areas of various sizes, change alignment attention (C2A) and\nhierarchical change alignment module (HCA) are proposed. In the refinement\nstage, the localization results from the localization stage are corrected by\nconstraining the change areas and change edges through the edge-area alignment\nmodule (E2A). Subsequently, the decoder, combined with the difference features\nstrengthened by C2A in the localization phase, refines change areas of\ndifferent sizes, ultimately achieving accurate boundary discrimination of\nchange areas. The proposed LRNet outperforms 13 other state-of-the-art methods\nin terms of comprehensive evaluation metrics and provides the most precise\nboundary discrimination results on the LEVIR-CD and WHU-CD datasets.\n","authors":["Huan Zhong","Chen Wu","Ziqi Xiao"],"pdf_url":"https://arxiv.org/pdf/2404.04884v1.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.04883v1","updated":"2024-04-07T09:01:50Z","published":"2024-04-07T09:01:50Z","title":"Mixture of Low-rank Experts for Transferable AI-Generated Image\n Detection","summary":" Generative models have shown a giant leap in synthesizing photo-realistic\nimages with minimal expertise, sparking concerns about the authenticity of\nonline information. This study aims to develop a universal AI-generated image\ndetector capable of identifying images from diverse sources. Existing methods\nstruggle to generalize across unseen generative models when provided with\nlimited sample sources. Inspired by the zero-shot transferability of\npre-trained vision-language models, we seek to harness the nontrivial\nvisual-world knowledge and descriptive proficiency of CLIP-ViT to generalize\nover unknown domains. This paper presents a novel parameter-efficient\nfine-tuning approach, mixture of low-rank experts, to fully exploit CLIP-ViT's\npotential while preserving knowledge and expanding capacity for transferable\ndetection. We adapt only the MLP layers of deeper ViT blocks via an integration\nof shared and separate LoRAs within an MoE-based structure. Extensive\nexperiments on public benchmarks show that our method achieves superiority over\nstate-of-the-art approaches in cross-generator generalization and robustness to\nperturbations. Remarkably, our best-performing ViT-L/14 variant requires\ntraining only 0.08% of its parameters to surpass the leading baseline by +3.64%\nmAP and +12.72% avg.Acc across unseen diffusion and autoregressive models. 
This\neven outperforms the baseline with just 0.28% of the training data. Our code\nand pre-trained models will be available at\nhttps://github.com/zhliuworks/CLIPMoLE.\n","authors":["Zihan Liu","Hanyi Wang","Yaoyu Kang","Shilin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04880v1","updated":"2024-04-07T08:51:31Z","published":"2024-04-07T08:51:31Z","title":"GauU-Scene V2: Expanse Lidar Image Dataset Shows Unreliable Geometric\n Reconstruction Using Gaussian Splatting and NeRF","summary":" We introduce a novel large-scale scene reconstruction benchmark that utilizes\nnewly developed 3D representation approaches: Gaussian Splatting and Neural\nRadiance Fields, on our expansive GauU-Scene V2 dataset. GauU-Scene V2\nencompasses over 6.5 square kilometers and features a comprehensive RGB dataset\ncoupled with LiDAR ground truth. This dataset offers a unique blend of urban\nand academic environments for advanced spatial analysis, covering more than 6.5\nkm2. We also provide detailed supplementary information on data collection\nprotocols. Furthermore, we present an easy-to-follow pipeline to align the\nCOLMAP sparse point cloud with the detailed LiDAR dataset. Our evaluation of\nU-Scene, which includes a detailed analysis across various novel viewpoints\nusing image-based metrics such as SSIM, LPIPS, and PSNR, shows contradictory\nresults when applying geometric-based metrics, such as Chamfer distance. This\nleads to doubts about the reliability of current image-based measurement\nmatrices and geometric extraction methods on Gaussian Splatting. We also make\nthe dataset available on the following anonymous project page\n","authors":["Butian Xiong","Nanjun Zheng","Zhen Li"],"pdf_url":"https://arxiv.org/pdf/2404.04880v1.pdf","comment":"8 pages(No reference) 6 figures 4 tabs"},{"id":"http://arxiv.org/abs/2404.04878v1","updated":"2024-04-07T08:48:01Z","published":"2024-04-07T08:48:01Z","title":"CycleINR: Cycle Implicit Neural Representation for Arbitrary-Scale\n Volumetric Super-Resolution of Medical Data","summary":" In the realm of medical 3D data, such as CT and MRI images, prevalent\nanisotropic resolution is characterized by high intra-slice but diminished\ninter-slice resolution. The lowered resolution between adjacent slices poses\nchallenges, hindering optimal viewing experiences and impeding the development\nof robust downstream analysis algorithms. Various volumetric super-resolution\nalgorithms aim to surmount these challenges, enhancing inter-slice resolution\nand overall 3D medical imaging quality. However, existing approaches confront\ninherent challenges: 1) often tailored to specific upsampling factors, lacking\nflexibility for diverse clinical scenarios; 2) newly generated slices\nfrequently suffer from over-smoothing, degrading fine details, and leading to\ninter-slice inconsistency. In response, this study presents CycleINR, a novel\nenhanced Implicit Neural Representation model for 3D medical data volumetric\nsuper-resolution. Leveraging the continuity of the learned implicit function,\nthe CycleINR model can achieve results with arbitrary up-sampling rates,\neliminating the need for separate training. Additionally, we enhance the grid\nsampling in CycleINR with a local attention mechanism and mitigate\nover-smoothing by integrating cycle-consistent loss. We introduce a new metric,\nSlice-wise Noise Level Inconsistency (SNLI), to quantitatively assess\ninter-slice noise level inconsistency. 
The effectiveness of our approach is\ndemonstrated through image quality evaluations on an in-house dataset and a\ndownstream task analysis on the Medical Segmentation Decathlon liver tumor\ndataset.\n","authors":["Wei Fang","Yuxing Tang","Heng Guo","Mingze Yuan","Tony C. W. Mok","Ke Yan","Jiawen Yao","Xin Chen","Zaiyi Liu","Le Lu","Ling Zhang","Minfeng Xu"],"pdf_url":"https://arxiv.org/pdf/2404.04878v1.pdf","comment":"CVPR accepted paper"},{"id":"http://arxiv.org/abs/2404.04876v1","updated":"2024-04-07T08:46:06Z","published":"2024-04-07T08:46:06Z","title":"HiLo: Detailed and Robust 3D Clothed Human Reconstruction with High-and\n Low-Frequency Information of Parametric Models","summary":" Reconstructing 3D clothed human involves creating a detailed geometry of\nindividuals in clothing, with applications ranging from virtual try-on, movies,\nto games. To enable practical and widespread applications, recent advances\npropose to generate a clothed human from an RGB image. However, they struggle\nto reconstruct detailed and robust avatars simultaneously. We empirically find\nthat the high-frequency (HF) and low-frequency (LF) information from a\nparametric model has the potential to enhance geometry details and improve\nrobustness to noise, respectively. Based on this, we propose HiLo, namely\nclothed human reconstruction with high- and low-frequency information, which\ncontains two components. 1) To recover detailed geometry using HF information,\nwe propose a progressive HF Signed Distance Function to enhance the detailed 3D\ngeometry of a clothed human. We analyze that our progressive learning manner\nalleviates large gradients that hinder model convergence. 2) To achieve robust\nreconstruction against inaccurate estimation of the parametric model by using\nLF information, we propose a spatial interaction implicit function. This\nfunction effectively exploits the complementary spatial information from a\nlow-resolution voxel grid of the parametric model. Experimental results\ndemonstrate that HiLo outperforms the state-of-the-art methods by 10.43% and\n9.54% in terms of Chamfer distance on the Thuman2.0 and CAPE datasets,\nrespectively. Additionally, HiLo demonstrates robustness to noise from the\nparametric model, challenging poses, and various clothing styles.\n","authors":["Yifan Yang","Dong Liu","Shuhai Zhang","Zeshuai Deng","Zixiong Huang","Mingkui Tan"],"pdf_url":"https://arxiv.org/pdf/2404.04876v1.pdf","comment":"CVPR 2024 Accepted Paper"},{"id":"http://arxiv.org/abs/2404.04875v1","updated":"2024-04-07T08:42:38Z","published":"2024-04-07T08:42:38Z","title":"NeRF2Points: Large-Scale Point Cloud Generation From Street Views'\n Radiance Field Optimization","summary":" Neural Radiance Fields (NeRF) have emerged as a paradigm-shifting methodology\nfor the photorealistic rendering of objects and environments, enabling the\nsynthesis of novel viewpoints with remarkable fidelity. This is accomplished\nthrough the strategic utilization of object-centric camera poses characterized\nby significant inter-frame overlap. This paper explores a compelling,\nalternative utility of NeRF: the derivation of point clouds from aggregated\nurban landscape imagery. The transmutation of street-view data into point\nclouds is fraught with complexities, attributable to a nexus of interdependent\nvariables. First, high-quality point cloud generation hinges on precise camera\nposes, yet many datasets suffer from inaccuracies in pose metadata. 
Also, the\nstandard approach of NeRF is ill-suited for the distinct characteristics of\nstreet-view data from autonomous vehicles in vast, open settings. Autonomous\nvehicle cameras often record with limited overlap, leading to blurring,\nartifacts, and compromised pavement representation in NeRF-based point clouds.\nIn this paper, we present NeRF2Points, a tailored NeRF variant for urban point\ncloud synthesis, notable for its high-quality output from RGB inputs alone. Our\npaper is supported by a bespoke, high-resolution 20-kilometer urban street\ndataset, designed for point cloud generation and evaluation. NeRF2Points\nadeptly navigates the inherent challenges of NeRF-based point cloud synthesis\nthrough the implementation of the following strategic innovations: (1)\nIntegration of Weighted Iterative Geometric Optimization (WIGO) and Structure\nfrom Motion (SfM) for enhanced camera pose accuracy, elevating street-view data\nprecision. (2) Layered Perception and Integrated Modeling (LPiM) is designed\nfor distinct radiance field modeling in urban environments, resulting in\ncoherent point cloud representations.\n","authors":["Peng Tu","Xun Zhou","Mingming Wang","Xiaojun Yang","Bo Peng","Ping Chen","Xiu Su","Yawen Huang","Yefeng Zheng","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2404.04875v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2404.04871v1","updated":"2024-04-07T08:32:16Z","published":"2024-04-07T08:32:16Z","title":"Data Stream Sampling with Fuzzy Task Boundaries and Noisy Labels","summary":" In the realm of continual learning, the presence of noisy labels within data\nstreams represents a notable obstacle to model reliability and fairness. We\nfocus on the data stream scenario outlined in pertinent literature,\ncharacterized by fuzzy task boundaries and noisy labels. To address this\nchallenge, we introduce a novel and intuitive sampling method called Noisy Test\nDebiasing (NTD) to mitigate noisy labels in evolving data streams and establish\na fair and robust continual learning algorithm. NTD is straightforward to\nimplement, making it feasible across various scenarios. Our experiments\nbenchmark four datasets, including two synthetic noise datasets (CIFAR10 and\nCIFAR100) and real-world noise datasets (mini-WebVision and Food-101N). The\nresults validate the efficacy of NTD for online continual learning in scenarios\nwith noisy labels in data streams. Compared to the previous leading approach,\nNTD achieves a training speedup enhancement over two times while maintaining or\nsurpassing accuracy levels. Moreover, NTD utilizes less than one-fifth of the\nGPU memory resources compared to previous leading methods.\n","authors":["Yu-Hsi Chen"],"pdf_url":"https://arxiv.org/pdf/2404.04871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04865v1","updated":"2024-04-07T08:17:48Z","published":"2024-04-07T08:17:48Z","title":"On the Learnability of Out-of-distribution Detection","summary":" Supervised learning aims to train a classifier under the assumption that\ntraining and test data are from the same distribution. To ease the above\nassumption, researchers have studied a more realistic setting:\nout-of-distribution (OOD) detection, where test data may come from classes that\nare unknown during training (i.e., OOD data). Due to the unavailability and\ndiversity of OOD data, good generalization ability is crucial for effective OOD\ndetection algorithms, and corresponding learning theory is still an open\nproblem. 
To study the generalization of OOD detection, this paper investigates\nthe probably approximately correct (PAC) learning theory of OOD detection that\nfits the commonly used evaluation metrics in the literature. First, we find a\nnecessary condition for the learnability of OOD detection. Then, using this\ncondition, we prove several impossibility theorems for the learnability of OOD\ndetection under some scenarios. Although the impossibility theorems are\nfrustrating, we find that some conditions of these impossibility theorems may\nnot hold in some practical scenarios. Based on this observation, we next give\nseveral necessary and sufficient conditions to characterize the learnability of\nOOD detection in some practical scenarios. Lastly, we offer theoretical support\nfor representative OOD detection works based on our OOD theory.\n","authors":["Zhen Fang","Yixuan Li","Feng Liu","Bo Han","Jie Lu"],"pdf_url":"https://arxiv.org/pdf/2404.04865v1.pdf","comment":"Accepted by JMLR in 7th of April, 2024. This is a journal extension\n of the previous NeurIPS 2022 Outstanding Paper \"Is Out-of-distribution\n Detection Learnable?\" [arXiv:2210.14707]"},{"id":"http://arxiv.org/abs/2308.06791v5","updated":"2024-04-07T08:13:38Z","published":"2023-08-13T15:30:02Z","title":"PV-SSD: A Multi-Modal Point Cloud Feature Fusion Method for Projection\n Features and Variable Receptive Field Voxel Features","summary":" LiDAR-based 3D object detection and classification is crucial for autonomous\ndriving. However, real-time inference from extremely sparse 3D data is a\nformidable challenge. To address this problem, a typical class of approaches\ntransforms the point cloud cast into a regular data representation (voxels or\nprojection maps). Then, it performs feature extraction with convolutional\nneural networks. However, such methods often result in a certain degree of\ninformation loss due to down-sampling or over-compression of feature\ninformation. This paper proposes a multi-modal point cloud feature fusion\nmethod for projection features and variable receptive field voxel features\n(PV-SSD) based on projection and variable voxelization to solve the information\nloss problem. We design a two-branch feature extraction structure with a 2D\nconvolutional neural network to extract the point cloud's projection features\nin bird's-eye view to focus on the correlation between local features. A voxel\nfeature extraction branch is used to extract local fine-grained features.\nMeanwhile, we propose a voxel feature extraction method with variable sensory\nfields to reduce the information loss of voxel branches due to downsampling. It\navoids missing critical point information by selecting more useful feature\npoints based on feature point weights for the detection task. In addition, we\npropose a multi-modal feature fusion module for point clouds. To validate the\neffectiveness of our method, we tested it on the KITTI dataset and ONCE\ndataset.\n","authors":["Yongxin Shao","Aihong Tan","Zhetao Sun","Enhui Zheng","Tianhong Yan","Peng Liao"],"pdf_url":"https://arxiv.org/pdf/2308.06791v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04860v1","updated":"2024-04-07T08:07:14Z","published":"2024-04-07T08:07:14Z","title":"ByteEdit: Boost, Comply and Accelerate Generative Image Editing","summary":" Recent advancements in diffusion-based generative image editing have sparked\na profound revolution, reshaping the landscape of image outpainting and\ninpainting tasks. 
Despite these strides, the field grapples with inherent\nchallenges, including: i) inferior quality; ii) poor consistency; iii)\ninsufficient instrcution adherence; iv) suboptimal generation efficiency. To\naddress these obstacles, we present ByteEdit, an innovative feedback learning\nframework meticulously designed to Boost, Comply, and Accelerate Generative\nImage Editing tasks. ByteEdit seamlessly integrates image reward models\ndedicated to enhancing aesthetics and image-text alignment, while also\nintroducing a dense, pixel-level reward model tailored to foster coherence in\nthe output. Furthermore, we propose a pioneering adversarial and progressive\nfeedback learning strategy to expedite the model's inference speed. Through\nextensive large-scale user evaluations, we demonstrate that ByteEdit surpasses\nleading generative image editing products, including Adobe, Canva, and MeiTu,\nin both generation quality and consistency. ByteEdit-Outpainting exhibits a\nremarkable enhancement of 388% and 135% in quality and consistency,\nrespectively, when compared to the baseline model. Experiments also verfied\nthat our acceleration models maintains excellent performance results in terms\nof quality and consistency.\n","authors":["Yuxi Ren","Jie Wu","Yanzuo Lu","Huafeng Kuang","Xin Xia","Xionghui Wang","Qianqian Wang","Yixing Zhu","Pan Xie","Shiyin Wang","Xuefeng Xiao","Yitong Wang","Min Zheng","Lean Fu"],"pdf_url":"https://arxiv.org/pdf/2404.04860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04856v1","updated":"2024-04-07T08:03:42Z","published":"2024-04-07T08:03:42Z","title":"Msmsfnet: a multi-stream and multi-scale fusion net for edge detection","summary":" Edge detection is a long standing problem in computer vision. Recent deep\nlearning based algorithms achieve state of-the-art performance in publicly\navailable datasets. Despite the efficiency of these algorithms, their\nperformance, however, relies heavily on the pretrained weights of the backbone\nnetwork on the ImageNet dataset. This limits heavily the design space of deep\nlearning based edge detectors. Whenever we want to devise a new model, we have\nto train this new model on the ImageNet dataset first, and then fine tune the\nmodel using the edge detection datasets. The comparison would be unfair\notherwise. However, it is usually not feasible for many researchers to train a\nmodel on the ImageNet dataset due to the limited computation resources. In this\nwork, we study the performance that can be achieved by state-of-the-art deep\nlearning based edge detectors in publicly available datasets when they are\ntrained from scratch, and devise a new network architecture, the multi-stream\nand multi scale fusion net (msmsfnet), for edge detection. We show in our\nexperiments that by training all models from scratch to ensure the fairness of\ncomparison, out model outperforms state-of-the art deep learning based edge\ndetectors in three publicly available datasets.\n","authors":["Chenguang Liu","Chisheng Wang","Feifei Dong","Xin Su","Chuanhua Zhu","Dejin Zhang","Qingquan Li"],"pdf_url":"https://arxiv.org/pdf/2404.04856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00292v2","updated":"2024-04-07T07:55:51Z","published":"2024-03-30T08:51:23Z","title":"LAKE-RED: Camouflaged Images Generation by Latent Background Knowledge\n Retrieval-Augmented Diffusion","summary":" Camouflaged vision perception is an important vision task with numerous\npractical applications. 
Due to the expensive collection and labeling costs,\nthis community struggles with a major bottleneck that the species category of\nits datasets is limited to a small number of object species. However, the\nexisting camouflaged generation methods require specifying the background\nmanually, thus failing to extend the camouflaged sample diversity in a low-cost\nmanner. In this paper, we propose a Latent Background Knowledge\nRetrieval-Augmented Diffusion (LAKE-RED) for camouflaged image generation. To\nour knowledge, our contributions mainly include: (1) For the first time, we\npropose a camouflaged generation paradigm that does not need to receive any\nbackground inputs. (2) Our LAKE-RED is the first knowledge retrieval-augmented\nmethod with interpretability for camouflaged generation, in which we propose an\nidea that knowledge retrieval and reasoning enhancement are separated\nexplicitly, to alleviate the task-specific challenges. Moreover, our method is\nnot restricted to specific foreground targets or backgrounds, offering a\npotential for extending camouflaged vision perception to more diverse domains.\n(3) Experimental results demonstrate that our method outperforms the existing\napproaches, generating more realistic camouflage images.\n","authors":["Pancheng Zhao","Peng Xu","Pengda Qin","Deng-Ping Fan","Zhicheng Zhang","Guoli Jia","Bowen Zhou","Jufeng Yang"],"pdf_url":"https://arxiv.org/pdf/2404.00292v2.pdf","comment":"Accepted by CVPR 2024, Fig.3 revised"},{"id":"http://arxiv.org/abs/2306.08498v2","updated":"2024-04-07T07:50:37Z","published":"2023-06-14T13:27:28Z","title":"Extending CLIP's Image-Text Alignment to Referring Image Segmentation","summary":" Referring Image Segmentation (RIS) is a cross-modal task that aims to segment\nan instance described by a natural language expression. Recent methods leverage\nlarge-scale pretrained unimodal models as backbones along with fusion\ntechniques for joint reasoning across modalities. However, the inherent\ncross-modal nature of RIS raises questions about the effectiveness of unimodal\nbackbones. We propose RISCLIP, a novel framework that effectively leverages the\ncross-modal nature of CLIP for RIS. Observing CLIP's inherent alignment between\nimage and text features, we capitalize on this starting point and introduce\nsimple but strong modules that enhance unimodal feature extraction and leverage\nrich alignment knowledge in CLIP's image-text shared-embedding space. RISCLIP\nexhibits outstanding results on all three major RIS benchmarks and also\noutperforms previous CLIP-based methods, demonstrating the efficacy of our\nstrategy in extending CLIP's image-text alignment to RIS.\n","authors":["Seoyeon Kim","Minguk Kang","Dongwon Kim","Jaesik Park","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2306.08498v2.pdf","comment":"NAACL 2024"},{"id":"http://arxiv.org/abs/2404.04848v1","updated":"2024-04-07T07:42:04Z","published":"2024-04-07T07:42:04Z","title":"Task-Aware Encoder Control for Deep Video Compression","summary":" Prior research on deep video compression (DVC) for machine tasks typically\nnecessitates training a unique codec for each specific task, mandating a\ndedicated decoder per task. In contrast, traditional video codecs employ a\nflexible encoder controller, enabling the adaptation of a single codec to\ndifferent tasks through mechanisms like mode prediction. Drawing inspiration\nfrom this, we introduce an innovative encoder controller for deep video\ncompression for machines. 
This controller features a mode prediction and a\nGroup of Pictures (GoP) selection module. Our approach centralizes control at\nthe encoding stage, allowing for adaptable encoder adjustments across different\ntasks, such as detection and tracking, while maintaining compatibility with a\nstandard pre-trained DVC decoder. Empirical evidence demonstrates that our\nmethod is applicable across multiple tasks with various existing pre-trained\nDVCs. Moreover, extensive experiments demonstrate that our method outperforms\nprevious DVC by about 25% bitrate for different tasks, with only one\npre-trained decoder.\n","authors":["Xingtong Ge","Jixiang Luo","Xinjie Zhang","Tongda Xu","Guo Lu","Dailan He","Jing Geng","Yan Wang","Jun Zhang","Hongwei Qin"],"pdf_url":"https://arxiv.org/pdf/2404.04848v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2403.12434v3","updated":"2024-04-07T07:37:59Z","published":"2024-03-19T04:47:56Z","title":"Human Mesh Recovery from Arbitrary Multi-view Images","summary":" Human mesh recovery from arbitrary multi-view images involves two\ncharacteristics: the arbitrary camera poses and arbitrary number of camera\nviews. Because of the variability, designing a unified framework to tackle this\ntask is challenging. The challenges can be summarized as the dilemma of being\nable to simultaneously estimate arbitrary camera poses and recover human mesh\nfrom arbitrary multi-view images while maintaining flexibility. To solve this\ndilemma, we propose a divide and conquer framework for Unified Human Mesh\nRecovery (U-HMR) from arbitrary multi-view images. In particular, U-HMR\nconsists of a decoupled structure and two main components: camera and body\ndecoupling (CBD), camera pose estimation (CPE), and arbitrary view fusion\n(AVF). As camera poses and human body mesh are independent of each other, CBD\nsplits the estimation of them into two sub-tasks for two individual\nsub-networks (ie, CPE and AVF) to handle respectively, thus the two sub-tasks\nare disentangled. In CPE, since each camera pose is unrelated to the others, we\nadopt a shared MLP to process all views in a parallel way. In AVF, in order to\nfuse multi-view information and make the fusion operation independent of the\nnumber of views, we introduce a transformer decoder with a SMPL parameters\nquery token to extract cross-view features for mesh recovery. To demonstrate\nthe efficacy and flexibility of the proposed framework and effect of each\ncomponent, we conduct extensive experiments on three public datasets:\nHuman3.6M, MPI-INF-3DHP, and TotalCapture.\n","authors":["Xiaoben Li","Mancheng Meng","Ziyan Wu","Terrence Chen","Fan Yang","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2403.12434v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11863v2","updated":"2024-04-07T07:37:15Z","published":"2023-11-20T15:59:41Z","title":"GP-NeRF: Generalized Perception NeRF for Context-Aware 3D Scene\n Understanding","summary":" Applying NeRF to downstream perception tasks for scene understanding and\nrepresentation is becoming increasingly popular. Most existing methods treat\nsemantic prediction as an additional rendering task, \\textit{i.e.}, the \"label\nrendering\" task, to build semantic NeRFs. However, by rendering\nsemantic/instance labels per pixel without considering the contextual\ninformation of the rendered image, these methods usually suffer from unclear\nboundary segmentation and abnormal segmentation of pixels within an object. 
To\nsolve this problem, we propose Generalized Perception NeRF (GP-NeRF), a novel\npipeline that makes the widely used segmentation model and NeRF work compatibly\nunder a unified framework, for facilitating context-aware 3D scene perception.\nTo accomplish this goal, we introduce transformers to aggregate radiance as\nwell as semantic embedding fields jointly for novel views and facilitate the\njoint volumetric rendering of both fields. In addition, we propose two\nself-distillation mechanisms, i.e., the Semantic Distill Loss and the\nDepth-Guided Semantic Distill Loss, to enhance the discrimination and quality\nof the semantic field and the maintenance of geometric consistency. In\nevaluation, we conduct experimental comparisons under two perception tasks\n(\\textit{i.e.} semantic and instance segmentation) using both synthetic and\nreal-world datasets. Notably, our method outperforms SOTA approaches by 6.94\\%,\n11.76\\%, and 8.47\\% on generalized semantic segmentation, finetuning semantic\nsegmentation, and instance segmentation, respectively.\n","authors":["Hao Li","Dingwen Zhang","Yalun Dai","Nian Liu","Lechao Cheng","Jingfeng Li","Jingdong Wang","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2311.11863v2.pdf","comment":"CVPR 2024 (Highlight). Project Page:\n https://lifuguan.github.io/gpnerf-pages/"},{"id":"http://arxiv.org/abs/2404.03654v2","updated":"2024-04-07T07:20:31Z","published":"2024-04-04T17:59:50Z","title":"RaFE: Generative Radiance Fields Restoration","summary":" NeRF (Neural Radiance Fields) has demonstrated tremendous potential in novel\nview synthesis and 3D reconstruction, but its performance is sensitive to input\nimage quality, which struggles to achieve high-fidelity rendering when provided\nwith low-quality sparse input viewpoints. Previous methods for NeRF restoration\nare tailored for specific degradation type, ignoring the generality of\nrestoration. To overcome this limitation, we propose a generic radiance fields\nrestoration pipeline, named RaFE, which applies to various types of\ndegradations, such as low resolution, blurriness, noise, compression artifacts,\nor their combinations. Our approach leverages the success of off-the-shelf 2D\nrestoration methods to recover the multi-view images individually. Instead of\nreconstructing a blurred NeRF by averaging inconsistencies, we introduce a\nnovel approach using Generative Adversarial Networks (GANs) for NeRF generation\nto better accommodate the geometric and appearance inconsistencies present in\nthe multi-view images. Specifically, we adopt a two-level tri-plane\narchitecture, where the coarse level remains fixed to represent the low-quality\nNeRF, and a fine-level residual tri-plane to be added to the coarse level is\nmodeled as a distribution with GAN to capture potential variations in\nrestoration. We validate RaFE on both synthetic and real cases for various\nrestoration tasks, demonstrating superior performance in both quantitative and\nqualitative evaluations, surpassing other 3D restoration methods specific to\nsingle task. 
Please see our project website\nhttps://zkaiwu.github.io/RaFE-Project/.\n","authors":["Zhongkai Wu","Ziyu Wan","Jing Zhang","Jing Liao","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2404.03654v2.pdf","comment":"Project Page: https://zkaiwu.github.io/RaFE"},{"id":"http://arxiv.org/abs/2305.03238v4","updated":"2024-04-07T07:07:49Z","published":"2023-05-05T01:40:00Z","title":"Reduction of Class Activation Uncertainty with Background Information","summary":" Multitask learning is a popular approach to training high-performing neural\nnetworks with improved generalization. In this paper, we propose a background\nclass to achieve improved generalization at a lower computation compared to\nmultitask learning to help researchers and organizations with limited\ncomputation power. We also present a methodology for selecting background\nimages and discuss potential future improvements. We apply our approach to\nseveral datasets and achieve improved generalization with much lower\ncomputation. Through the class activation mappings (CAMs) of the trained\nmodels, we observed the tendency towards looking at a bigger picture with the\nproposed model training methodology. Applying the vision transformer with the\nproposed background class, we receive state-of-the-art (SOTA) performance on\nSTL-10, Caltech-101, and CINIC-10 datasets. Example scripts are available in\nthe 'CAM' folder of the following GitHub Repository: github.com/dipuk0506/UQ\n","authors":["H M Dipu Kabir"],"pdf_url":"https://arxiv.org/pdf/2305.03238v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04833v1","updated":"2024-04-07T06:56:51Z","published":"2024-04-07T06:56:51Z","title":"ShoeModel: Learning to Wear on the User-specified Shoes via Diffusion\n Model","summary":" With the development of the large-scale diffusion model, Artificial\nIntelligence Generated Content (AIGC) techniques are popular recently. However,\nhow to truly make it serve our daily lives remains an open question. To this\nend, in this paper, we focus on employing AIGC techniques in one filed of\nE-commerce marketing, i.e., generating hyper-realistic advertising images for\ndisplaying user-specified shoes by human. Specifically, we propose a\nshoe-wearing system, called Shoe-Model, to generate plausible images of human\nlegs interacting with the given shoes. It consists of three modules: (1) shoe\nwearable-area detection module (WD), (2) leg-pose synthesis module (LpS) and\nthe final (3) shoe-wearing image generation module (SW). Them three are\nperformed in ordered stages. Compared to baselines, our ShoeModel is shown to\ngeneralize better to different type of shoes and has ability of keeping the\nID-consistency of the given shoes, as well as automatically producing\nreasonable interactions with human. Extensive experiments show the\neffectiveness of our proposed shoe-wearing system. Figure 1 shows the input and\noutput examples of our ShoeModel.\n","authors":["Binghui Chen","Wenyu Li","Yifeng Geng","Xuansong Xie","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.04833v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2401.10891v2","updated":"2024-04-07T06:52:21Z","published":"2024-01-19T18:59:52Z","title":"Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data","summary":" This work presents Depth Anything, a highly practical solution for robust\nmonocular depth estimation. Without pursuing novel technical modules, we aim to\nbuild a simple yet powerful foundation model dealing with any images under any\ncircumstances. 
To this end, we scale up the dataset by designing a data engine\nto collect and automatically annotate large-scale unlabeled data (~62M), which\nsignificantly enlarges the data coverage and thus is able to reduce the\ngeneralization error. We investigate two simple yet effective strategies that\nmake data scaling-up promising. First, a more challenging optimization target\nis created by leveraging data augmentation tools. It compels the model to\nactively seek extra visual knowledge and acquire robust representations.\nSecond, an auxiliary supervision is developed to enforce the model to inherit\nrich semantic priors from pre-trained encoders. We evaluate its zero-shot\ncapabilities extensively, including six public datasets and randomly captured\nphotos. It demonstrates impressive generalization ability. Further, through\nfine-tuning it with metric depth information from NYUv2 and KITTI, new SOTAs\nare set. Our better depth model also results in a better depth-conditioned\nControlNet. Our models are released at\nhttps://github.com/LiheYoung/Depth-Anything.\n","authors":["Lihe Yang","Bingyi Kang","Zilong Huang","Xiaogang Xu","Jiashi Feng","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.10891v2.pdf","comment":"Accepted by CVPR 2024. Project page: https://depth-anything.github.io"},{"id":"http://arxiv.org/abs/2212.12857v2","updated":"2024-04-07T06:34:37Z","published":"2022-12-25T05:24:08Z","title":"StepNet: Spatial-temporal Part-aware Network for Isolated Sign Language\n Recognition","summary":" The goal of sign language recognition (SLR) is to help those who are hard of\nhearing or deaf overcome the communication barrier. Most existing approaches\ncan be typically divided into two lines, i.e., Skeleton-based and RGB-based\nmethods, but both the two lines of methods have their limitations.\nSkeleton-based methods do not consider facial expressions, while RGB-based\napproaches usually ignore the fine-grained hand structure. To overcome both\nlimitations, we propose a new framework called Spatial-temporal Part-aware\nnetwork~(StepNet), based on RGB parts. As its name suggests, it is made up of\ntwo modules: Part-level Spatial Modeling and Part-level Temporal Modeling.\nPart-level Spatial Modeling, in particular, automatically captures the\nappearance-based properties, such as hands and faces, in the feature space\nwithout the use of any keypoint-level annotations. On the other hand,\nPart-level Temporal Modeling implicitly mines the long-short term context to\ncapture the relevant attributes over time. Extensive experiments demonstrate\nthat our StepNet, thanks to spatial-temporal modules, achieves competitive\nTop-1 Per-instance accuracy on three commonly-used SLR benchmarks, i.e., 56.89%\non WLASL, 77.2% on NMFs-CSL, and 77.1% on BOBSL. Additionally, the proposed\nmethod is compatible with the optical flow input and can produce superior\nperformance if fused. For those who are hard of hearing, we hope that our work\ncan act as a preliminary step.\n","authors":["Xiaolong Shen","Zhedong Zheng","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2212.12857v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01134v2","updated":"2024-04-07T06:30:39Z","published":"2024-02-02T04:17:02Z","title":"DeepAAT: Deep Automated Aerial Triangulation for Fast UAV-based Mapping","summary":" Automated Aerial Triangulation (AAT), aiming to restore image pose and\nreconstruct sparse points simultaneously, plays a pivotal role in earth\nobservation. 
With its rich research heritage spanning several decades in\nphotogrammetry, AAT has evolved into a fundamental process widely applied in\nlarge-scale Unmanned Aerial Vehicle (UAV) based mapping. Despite its\nadvancements, classic AAT methods still face challenges like low efficiency and\nlimited robustness. This paper introduces DeepAAT, a deep learning network\ndesigned specifically for AAT of UAV imagery. DeepAAT considers both spatial\nand spectral characteristics of imagery, enhancing its capability to resolve\nerroneous matching pairs and accurately predict image poses. DeepAAT marks a\nsignificant leap in AAT's efficiency, ensuring thorough scene coverage and\nprecision. Its processing speed outpaces incremental AAT methods by hundreds of\ntimes and global AAT methods by tens of times while maintaining a comparable\nlevel of reconstruction accuracy. Additionally, DeepAAT's scene clustering and\nmerging strategy facilitate rapid localization and pose determination for\nlarge-scale UAV images, even under constrained computing resources. The\nexperimental results demonstrate DeepAAT's substantial improvements over\nconventional AAT methods, highlighting its potential in the efficiency and\naccuracy of UAV-based 3D reconstruction tasks. To benefit the photogrammetry\nsociety, the code of DeepAAT will be released at:\nhttps://github.com/WHU-USI3DV/DeepAAT.\n","authors":["Zequan Chen","Jianping Li","Qusheng Li","Bisheng Yang","Zhen Dong"],"pdf_url":"https://arxiv.org/pdf/2402.01134v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04828v1","updated":"2024-04-07T06:28:53Z","published":"2024-04-07T06:28:53Z","title":"Strictly-ID-Preserved and Controllable Accessory Advertising Image\n Generation","summary":" Customized generative text-to-image models have the ability to produce images\nthat closely resemble a given subject. However, in the context of generating\nadvertising images for e-commerce scenarios, it is crucial that the generated\nsubject's identity aligns perfectly with the product being advertised. In order\nto address the need for strictly-ID preserved advertising image generation, we\nhave developed a Control-Net based customized image generation pipeline and\nhave taken earring model advertising as an example. Our approach facilitates a\nseamless interaction between the earrings and the model's face, while ensuring\nthat the identity of the earrings remains intact. Furthermore, to achieve a\ndiverse and controllable display, we have proposed a multi-branch\ncross-attention architecture, which allows for control over the scale, pose,\nand appearance of the model, going beyond the limitations of text prompts. Our\nmethod manages to achieve fine-grained control of the generated model's face,\nresulting in controllable and captivating advertising effects.\n","authors":["Youze Xue","Binghui Chen","Yifeng Geng","Xuansong Xie","Jiansheng Chen","Hongbing Ma"],"pdf_url":"https://arxiv.org/pdf/2404.04828v1.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2310.08370v2","updated":"2024-04-07T06:21:21Z","published":"2023-10-12T14:39:58Z","title":"UniPAD: A Universal Pre-training Paradigm for Autonomous Driving","summary":" In the context of autonomous driving, the significance of effective feature\nlearning is widely acknowledged. While conventional 3D self-supervised\npre-training methods have shown widespread success, most methods follow the\nideas originally designed for 2D images. 
In this paper, we present UniPAD, a\nnovel self-supervised learning paradigm applying 3D volumetric differentiable\nrendering. UniPAD implicitly encodes 3D space, facilitating the reconstruction\nof continuous 3D shape structures and the intricate appearance characteristics\nof their 2D projections. The flexibility of our method enables seamless\nintegration into both 2D and 3D frameworks, enabling a more holistic\ncomprehension of the scenes. We manifest the feasibility and effectiveness of\nUniPAD by conducting extensive experiments on various downstream 3D tasks. Our\nmethod significantly improves lidar-, camera-, and lidar-camera-based baseline\nby 9.1, 7.7, and 6.9 NDS, respectively. Notably, our pre-training pipeline\nachieves 73.2 NDS for 3D object detection and 79.4 mIoU for 3D semantic\nsegmentation on the nuScenes validation set, achieving state-of-the-art results\nin comparison with previous methods. The code will be available at\nhttps://github.com/Nightmare-n/UniPAD.\n","authors":["Honghui Yang","Sha Zhang","Di Huang","Xiaoyang Wu","Haoyi Zhu","Tong He","Shixiang Tang","Hengshuang Zhao","Qibo Qiu","Binbin Lin","Xiaofei He","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2310.08370v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.04823v1","updated":"2024-04-07T06:17:10Z","published":"2024-04-07T06:17:10Z","title":"3D Building Reconstruction from Monocular Remote Sensing Images with\n Multi-level Supervisions","summary":" 3D building reconstruction from monocular remote sensing images is an\nimportant and challenging research problem that has received increasing\nattention in recent years, owing to its low cost of data acquisition and\navailability for large-scale applications. However, existing methods rely on\nexpensive 3D-annotated samples for fully-supervised training, restricting their\napplication to large-scale cross-city scenarios. In this work, we propose\nMLS-BRN, a multi-level supervised building reconstruction network that can\nflexibly utilize training samples with different annotation levels to achieve\nbetter reconstruction results in an end-to-end manner. To alleviate the demand\non full 3D supervision, we design two new modules, Pseudo Building Bbox\nCalculator and Roof-Offset guided Footprint Extractor, as well as new tasks and\ntraining strategies for different types of samples. Experimental results on\nseveral public and new datasets demonstrate that our proposed MLS-BRN achieves\ncompetitive performance using much fewer 3D-annotated samples, and\nsignificantly improves the footprint extraction and 3D reconstruction\nperformance compared with current state-of-the-art. The code and datasets of\nthis work will be released at https://github.com/opendatalab/MLS-BRN.git.\n","authors":["Weijia Li","Haote Yang","Zhenghao Hu","Juepeng Zheng","Gui-Song Xia","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2404.04823v1.pdf","comment":"accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01133v2","updated":"2024-04-07T06:17:07Z","published":"2024-04-01T14:24:40Z","title":"CityGaussian: Real-time High-quality Large-Scale Scene Rendering with\n Gaussians","summary":" The advancement of real-time 3D scene reconstruction and novel view synthesis\nhas been significantly propelled by 3D Gaussian Splatting (3DGS). However,\neffectively training large-scale 3DGS and rendering it in real-time across\nvarious scales remains challenging. 
This paper introduces CityGaussian\n(CityGS), which employs a novel divide-and-conquer training approach and\nLevel-of-Detail (LoD) strategy for efficient large-scale 3DGS training and\nrendering. Specifically, the global scene prior and adaptive training data\nselection enables efficient training and seamless fusion. Based on fused\nGaussian primitives, we generate different detail levels through compression,\nand realize fast rendering across various scales through the proposed\nblock-wise detail levels selection and aggregation strategy. Extensive\nexperimental results on large-scale scenes demonstrate that our approach\nattains state-of-theart rendering quality, enabling consistent real-time\nrendering of largescale scenes across vastly different scales. Our project page\nis available at https://dekuliutesla.github.io/citygs/.\n","authors":["Yang Liu","He Guan","Chuanchen Luo","Lue Fan","Junran Peng","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.01133v2.pdf","comment":"Project Page: https://dekuliutesla.github.io/citygs/"},{"id":"http://arxiv.org/abs/2404.04819v1","updated":"2024-04-07T06:01:49Z","published":"2024-04-07T06:01:49Z","title":"Joint Reconstruction of 3D Human and Object via Contact-Based Refinement\n Transformer","summary":" Human-object contact serves as a strong cue to understand how humans\nphysically interact with objects. Nevertheless, it is not widely explored to\nutilize human-object contact information for the joint reconstruction of 3D\nhuman and object from a single image. In this work, we present a novel joint 3D\nhuman-object reconstruction method (CONTHO) that effectively exploits contact\ninformation between humans and objects. There are two core designs in our\nsystem: 1) 3D-guided contact estimation and 2) contact-based 3D human and\nobject refinement. First, for accurate human-object contact estimation, CONTHO\ninitially reconstructs 3D humans and objects and utilizes them as explicit 3D\nguidance for contact estimation. Second, to refine the initial reconstructions\nof 3D human and object, we propose a novel contact-based refinement Transformer\nthat effectively aggregates human features and object features based on the\nestimated human-object contact. The proposed contact-based refinement prevents\nthe learning of erroneous correlation between human and object, which enables\naccurate 3D reconstruction. As a result, our CONTHO achieves state-of-the-art\nperformance in both human-object contact estimation and joint reconstruction of\n3D human and object. The code is publicly available at\nhttps://github.com/dqj5182/CONTHO_RELEASE.\n","authors":["Hyeongjin Nam","Daniel Sungho Jung","Gyeongsik Moon","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2404.04819v1.pdf","comment":"Published at CVPR 2024, 19 pages including the supplementary material"},{"id":"http://arxiv.org/abs/2305.00510v3","updated":"2024-04-07T05:59:05Z","published":"2023-04-30T15:38:36Z","title":"Towards AI-Architecture Liberty: A Comprehensive Survey on Designing and\n Collaborating Virtual Architecture by Deep Learning in the Metaverse","summary":" 3D shape generation techniques leveraging deep learning have garnered\nsignificant interest from both the computer vision and architectural design\ncommunities, promising to enrich the content of the future metaverse. However,\nresearch on virtual architectural design remains limited, particularly\nregarding human-AI collaboration and deep learning-assisted design. 
We first\nilluminate the principles, generation techniques, and current literature of\nvirtual architecture, focusing on challenges such as datasets, multimodality,\ndesign intuition, and generative frameworks. In our survey, we reviewed 187\nrelated articles (80.7\\% of articles published between 2018 and 2022) covering\narchitectural research, virtual environments, and technical approaches. This\nsurvey investigates the latest approaches to 3D object generation with deep\ngenerative models (DGMs) and summarizes four characteristics of deep-learning\ngeneration approaches for virtual architecture. According to our analysis of\nthe survey, we expound on four research agendas, including agency,\ncommunication, user consideration, and integrating tools, and highlight three\nimportant enablers of ubiquitous interaction with immersive systems in deep\nlearning-assisted architectural generation. Our work contributes to fostering\nunderstanding between designers and deep learning techniques, broadening access\nto human-AI collaboration. We advocate for interdisciplinary efforts to address\nthis timely research topic, facilitating content designing and generation in\nthe metaverse.\n","authors":["Anqi Wang","Jiahua Dong","Lik-Hang Lee","Jiachuan Shen","Pan Hui"],"pdf_url":"https://arxiv.org/pdf/2305.00510v3.pdf","comment":"37 pages, 9 figures, and 5 tables"},{"id":"http://arxiv.org/abs/2404.04818v1","updated":"2024-04-07T05:56:42Z","published":"2024-04-07T05:56:42Z","title":"DWE+: Dual-Way Matching Enhanced Framework for Multimodal Entity Linking","summary":" Multimodal entity linking (MEL) aims to utilize multimodal information\n(usually textual and visual information) to link ambiguous mentions to\nunambiguous entities in knowledge base. Current methods facing main issues:\n(1)treating the entire image as input may contain redundant information. (2)the\ninsufficient utilization of entity-related information, such as attributes in\nimages. (3)semantic inconsistency between the entity in knowledge base and its\nrepresentation. To this end, we propose DWE+ for multimodal entity linking.\nDWE+ could capture finer semantics and dynamically maintain semantic\nconsistency with entities. This is achieved by three aspects: (a)we introduce a\nmethod for extracting fine-grained image features by partitioning the image\ninto multiple local objects. Then, hierarchical contrastive learning is used to\nfurther align semantics between coarse-grained information(text and image) and\nfine-grained (mention and visual objects). (b)we explore ways to extract visual\nattributes from images to enhance fusion feature such as facial features and\nidentity. (c)we leverage Wikipedia and ChatGPT to capture the entity\nrepresentation, achieving semantic enrichment from both static and dynamic\nperspectives, which better reflects the real-world entity semantics.\nExperiments on Wikimel, Richpedia, and Wikidiverse datasets demonstrate the\neffectiveness of DWE+ in improving MEL performance. Specifically, we optimize\nthese datasets and achieve state-of-the-art performance on the enhanced\ndatasets. 
The code and enhanced datasets are released on\nhttps://github.com/season1blue/DWET\n","authors":["Shezheng Song","Shasha Li","Shan Zhao","Xiaopeng Li","Chengyu Wang","Jie Yu","Jun Ma","Tianwei Yan","Bin Ji","Xiaoguang Mao"],"pdf_url":"https://arxiv.org/pdf/2404.04818v1.pdf","comment":"under review on TOIS"},{"id":"http://arxiv.org/abs/2303.04989v3","updated":"2024-04-07T05:50:18Z","published":"2023-03-09T02:20:56Z","title":"ARS-DETR: Aspect Ratio-Sensitive Detection Transformer for Aerial\n Oriented Object Detection","summary":" Existing oriented object detection methods commonly use metric AP$_{50}$ to\nmeasure the performance of the model. We argue that AP$_{50}$ is inherently\nunsuitable for oriented object detection due to its large tolerance in angle\ndeviation. Therefore, we advocate using high-precision metric, e.g. AP$_{75}$,\nto measure the performance of models. In this paper, we propose an Aspect Ratio\nSensitive Oriented Object Detector with Transformer, termed ARS-DETR, which\nexhibits a competitive performance in high-precision oriented object detection.\nSpecifically, a new angle classification method, calling Aspect Ratio aware\nCircle Smooth Label (AR-CSL), is proposed to smooth the angle label in a more\nreasonable way and discard the hyperparameter that introduced by previous work\n(e.g. CSL). Then, a rotated deformable attention module is designed to rotate\nthe sampling points with the corresponding angles and eliminate the\nmisalignment between region features and sampling points. Moreover, a dynamic\nweight coefficient according to the aspect ratio is adopted to calculate the\nangle loss. Comprehensive experiments on several challenging datasets show that\nour method achieves competitive performance on the high-precision oriented\nobject detection task.\n","authors":["Ying Zeng","Yushi Chen","Xue Yang","Qingyun Li","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2303.04989v3.pdf","comment":"15 pages, 13 figures, 13 tables, the source code is available at\n https://github.com/httle/ARS-DETR"},{"id":"http://arxiv.org/abs/2404.01959v2","updated":"2024-04-07T05:26:08Z","published":"2024-04-02T13:54:22Z","title":"Bi-LORA: A Vision-Language Approach for Synthetic Image Detection","summary":" Advancements in deep image synthesis techniques, such as generative\nadversarial networks (GANs) and diffusion models (DMs), have ushered in an era\nof generating highly realistic images. While this technological progress has\ncaptured significant interest, it has also raised concerns about the potential\ndifficulty in distinguishing real images from their synthetic counterparts.\nThis paper takes inspiration from the potent convergence capabilities between\nvision and language, coupled with the zero-shot nature of vision-language\nmodels (VLMs). We introduce an innovative method called Bi-LORA that leverages\nVLMs, combined with low-rank adaptation (LORA) tuning techniques, to enhance\nthe precision of synthetic image detection for unseen model-generated images.\nThe pivotal conceptual shift in our methodology revolves around reframing\nbinary classification as an image captioning task, leveraging the distinctive\ncapabilities of cutting-edge VLM, notably bootstrapping language image\npre-training (BLIP2). 
Rigorous and comprehensive experiments are conducted to\nvalidate the effectiveness of our proposed approach, particularly in detecting\nunseen diffusion-generated images from unknown diffusion-based generative\nmodels during training, showcasing robustness to noise, and demonstrating\ngeneralization capabilities to GANs. The obtained results showcase an\nimpressive average accuracy of 93.41% in synthetic image detection on unseen\ngeneration models. The code and models associated with this research can be\npublicly accessed at https://github.com/Mamadou-Keita/VLM-DETECT.\n","authors":["Mamadou Keita","Wassim Hamidouche","Hessen Bougueffa Eutamene","Abdenour Hadid","Abdelmalik Taleb-Ahmed"],"pdf_url":"https://arxiv.org/pdf/2404.01959v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04808v1","updated":"2024-04-07T04:56:58Z","published":"2024-04-07T04:56:58Z","title":"MemFlow: Optical Flow Estimation and Prediction with Memory","summary":" Optical flow is a classical task that is important to the vision community.\nClassical optical flow estimation uses two frames as input, whilst some recent\nmethods consider multiple frames to explicitly model long-range information.\nThe former ones limit their ability to fully leverage temporal coherence along\nthe video sequence; and the latter ones incur heavy computational overhead,\ntypically not possible for real-time flow estimation. Some multi-frame-based\napproaches even necessitate unseen future frames for current estimation,\ncompromising real-time applicability in safety-critical scenarios. To this end,\nwe present MemFlow, a real-time method for optical flow estimation and\nprediction with memory. Our method enables memory read-out and update modules\nfor aggregating historical motion information in real-time. Furthermore, we\nintegrate resolution-adaptive re-scaling to accommodate diverse video\nresolutions. Besides, our approach seamlessly extends to the future prediction\nof optical flow based on past observations. Leveraging effective historical\nmotion aggregation, our method outperforms VideoFlow with fewer parameters and\nfaster inference speed on Sintel and KITTI-15 datasets in terms of\ngeneralization performance. At the time of submission, MemFlow also leads in\nperformance on the 1080p Spring dataset. Codes and models will be available at:\nhttps://dqiaole.github.io/MemFlow/.\n","authors":["Qiaole Dong","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2404.04808v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04807v1","updated":"2024-04-07T04:55:58Z","published":"2024-04-07T04:55:58Z","title":"D2SL: Decouple Defogging and Semantic Learning for Foggy Domain-Adaptive\n Segmentation","summary":" We investigated domain adaptive semantic segmentation in foggy weather\nscenarios, which aims to enhance the utilization of unlabeled foggy data and\nimprove the model's adaptability to foggy conditions. Current methods rely on\nclear images as references, jointly learning defogging and segmentation for\nfoggy images. Despite making some progress, there are still two main drawbacks:\n(1) the coupling of segmentation and defogging feature representations,\nresulting in a decrease in semantic representation capability, and (2) the\nfailure to leverage real fog priors in unlabeled foggy data, leading to\ninsufficient model generalization ability. 
To address these issues, we propose\na novel training framework, Decouple Defogging and Semantic learning, called\nD2SL, aiming to alleviate the adverse impact of defogging tasks on the final\nsegmentation task. In this framework, we introduce a domain-consistent transfer\nstrategy to establish a connection between defogging and segmentation tasks.\nFurthermore, we design a real fog transfer strategy to improve defogging\neffects by fully leveraging the fog priors from real foggy images. Our approach\nenhances the semantic representations required for segmentation during the\ndefogging learning process and maximizes the representation capability of fog\ninvariance by effectively utilizing real fog data. Comprehensive experiments\nvalidate the effectiveness of the proposed method.\n","authors":["Xuan Sun","Zhanfu An","Yuyu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.04807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01843v2","updated":"2024-04-07T04:17:32Z","published":"2024-04-02T11:03:24Z","title":"Sketch3D: Style-Consistent Guidance for Sketch-to-3D Generation","summary":" Recently, image-to-3D approaches have achieved significant results with a\nnatural image as input. However, it is not always possible to access these\nenriched color input samples in practical applications, where only sketches are\navailable. Existing sketch-to-3D researches suffer from limitations in broad\napplications due to the challenges of lacking color information and multi-view\ncontent. To overcome them, this paper proposes a novel generation paradigm\nSketch3D to generate realistic 3D assets with shape aligned with the input\nsketch and color matching the textual description. Concretely, Sketch3D first\ninstantiates the given sketch in the reference image through the\nshape-preserving generation process. Second, the reference image is leveraged\nto deduce a coarse 3D Gaussian prior, and multi-view style-consistent guidance\nimages are generated based on the renderings of the 3D Gaussians. Finally,\nthree strategies are designed to optimize 3D Gaussians, i.e., structural\noptimization via a distribution transfer mechanism, color optimization with a\nstraightforward MSE loss and sketch similarity optimization with a CLIP-based\ngeometric similarity loss. Extensive visual comparisons and quantitative\nanalysis illustrate the advantage of our Sketch3D in generating realistic 3D\nassets while preserving consistency with the input.\n","authors":["Wangguandong Zheng","Haifeng Xia","Rui Chen","Ming Shao","Siyu Xia","Zhengming Ding"],"pdf_url":"https://arxiv.org/pdf/2404.01843v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04804v1","updated":"2024-04-07T04:10:06Z","published":"2024-04-07T04:10:06Z","title":"Light the Night: A Multi-Condition Diffusion Framework for Unpaired\n Low-Light Enhancement in Autonomous Driving","summary":" Vision-centric perception systems for autonomous driving have gained\nconsiderable attention recently due to their cost-effectiveness and\nscalability, especially compared to LiDAR-based systems. However, these systems\noften struggle in low-light conditions, potentially compromising their\nperformance and safety. To address this, our paper introduces LightDiff, a\ndomain-tailored framework designed to enhance the low-light image quality for\nautonomous driving applications. Specifically, we employ a multi-condition\ncontrolled diffusion model. LightDiff works without any human-collected paired\ndata, leveraging a dynamic data degradation process instead. 
It incorporates a\nnovel multi-condition adapter that adaptively controls the input weights from\ndifferent modalities, including depth maps, RGB images, and text captions, to\neffectively illuminate dark scenes while maintaining context consistency.\nFurthermore, to align the enhanced images with the detection model's knowledge,\nLightDiff employs perception-specific scores as rewards to guide the diffusion\ntraining process through reinforcement learning. Extensive experiments on the\nnuScenes datasets demonstrate that LightDiff can significantly improve the\nperformance of several state-of-the-art 3D detectors in night-time conditions\nwhile achieving high visual quality scores, highlighting its potential to\nsafeguard autonomous driving.\n","authors":["Jinlong Li","Baolu Li","Zhengzhong Tu","Xinyu Liu","Qing Guo","Felix Juefei-Xu","Runsheng Xu","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2404.04804v1.pdf","comment":"This paper is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2306.02416v3","updated":"2024-04-07T03:53:33Z","published":"2023-06-04T17:39:08Z","title":"Training Like a Medical Resident: Context-Prior Learning Toward\n Universal Medical Image Segmentation","summary":" A major focus of clinical imaging workflow is disease diagnosis and\nmanagement, leading to medical imaging datasets strongly tied to specific\nclinical objectives. This scenario has led to the prevailing practice of\ndeveloping task-specific segmentation models, without gaining insights from\nwidespread imaging cohorts. Inspired by the training program of medical\nradiology residents, we propose a shift towards universal medical image\nsegmentation, a paradigm aiming to build medical image understanding foundation\nmodels by leveraging the diversity and commonality across clinical targets,\nbody regions, and imaging modalities. Towards this goal, we develop Hermes, a\nnovel context-prior learning approach to address the challenges of data\nheterogeneity and annotation differences in medical image segmentation. In a\nlarge collection of eleven diverse datasets (2,438 3D images) across five\nmodalities (CT, PET, T1, T2 and cine MRI) and multiple body regions, we\ndemonstrate the merit of the universal paradigm over the traditional paradigm\non addressing multiple tasks within a single model. By exploiting the synergy\nacross tasks, Hermes achieves state-of-the-art performance on all testing\ndatasets and shows superior model scalability. Results on two additional\ndatasets reveals Hermes' strong performance for transfer learning, incremental\nlearning, and generalization to downstream tasks. Hermes's learned priors\ndemonstrate an appealing trait to reflect the intricate relations among tasks\nand modalities, which aligns with the established anatomical and imaging\nprinciples in radiology. The code is available:\nhttps://github.com/yhygao/universal-medical-image-segmentation.\n","authors":["Yunhe Gao","Zhuowei Li","Di Liu","Mu Zhou","Shaoting Zhang","Dimitris N. Metaxas"],"pdf_url":"https://arxiv.org/pdf/2306.02416v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2310.08129v3","updated":"2024-04-07T03:53:29Z","published":"2023-10-12T08:36:25Z","title":"Tailored Visions: Enhancing Text-to-Image Generation with Personalized\n Prompt Rewriting","summary":" Despite significant progress in the field, it is still challenging to create\npersonalized visual representations that align closely with the desires and\npreferences of individual users. 
This process requires users to articulate\ntheir ideas in words that are both comprehensible to the models and accurately\ncapture their vision, posing difficulties for many users. In this paper, we\ntackle this challenge by leveraging historical user interactions with the\nsystem to enhance user prompts. We propose a novel approach that involves\nrewriting user prompts based on a newly collected large-scale text-to-image\ndataset with over 300k prompts from 3115 users. Our rewriting model enhances\nthe expressiveness and alignment of user prompts with their intended visual\noutputs. Experimental results demonstrate the superiority of our methods over\nbaseline approaches, as evidenced in our new offline evaluation method and\nonline tests. Our code and dataset are available at\nhttps://github.com/zzjchen/Tailored-Visions.\n","authors":["Zijie Chen","Lichao Zhang","Fangsheng Weng","Lili Pan","Zhenzhong Lan"],"pdf_url":"https://arxiv.org/pdf/2310.08129v3.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.19098v2","updated":"2024-04-07T03:49:39Z","published":"2024-03-28T02:22:28Z","title":"GraphAD: Interaction Scene Graph for End-to-end Autonomous Driving","summary":" Modeling complicated interactions among the ego-vehicle, road agents, and map\nelements has been a crucial part for safety-critical autonomous driving.\nPrevious works on end-to-end autonomous driving rely on the attention mechanism\nfor handling heterogeneous interactions, which fails to capture the geometric\npriors and is also computationally intensive. In this paper, we propose the\nInteraction Scene Graph (ISG) as a unified method to model the interactions\namong the ego-vehicle, road agents, and map elements. With the representation\nof the ISG, the driving agents aggregate essential information from the most\ninfluential elements, including the road agents with potential collisions and\nthe map elements to follow. Since a mass of unnecessary interactions are\nomitted, the more efficient scene-graph-based framework is able to focus on\nindispensable connections and leads to better performance. We evaluate the\nproposed method for end-to-end autonomous driving on the nuScenes dataset.\nCompared with strong baselines, our method significantly outperforms in the\nfull-stack driving tasks, including perception, prediction, and planning. Code\nwill be released at https://github.com/zhangyp15/GraphAD.\n","authors":["Yunpeng Zhang","Deheng Qian","Ding Li","Yifeng Pan","Yong Chen","Zhenbao Liang","Zhiyao Zhang","Shurui Zhang","Hongxu Li","Maolei Fu","Yun Ye","Zhujin Liang","Yi Shan","Dalong Du"],"pdf_url":"https://arxiv.org/pdf/2403.19098v2.pdf","comment":"project page: https://github.com/zhangyp15/GraphAD"},{"id":"http://arxiv.org/abs/2401.01207v2","updated":"2024-04-07T03:44:59Z","published":"2024-01-02T13:28:39Z","title":"Towards a Simultaneous and Granular Identity-Expression Control in\n Personalized Face Generation","summary":" In human-centric content generation, the pre-trained text-to-image models\nstruggle to produce user-wanted portrait images, which retain the identity of\nindividuals while exhibiting diverse expressions. This paper introduces our\nefforts towards personalized face generation. To this end, we propose a novel\nmulti-modal face generation framework, capable of simultaneous\nidentity-expression control and more fine-grained expression synthesis. Our\nexpression control is so sophisticated that it can be specialized by the\nfine-grained emotional vocabulary. 
We devise a novel diffusion model that can\nundertake the task of simultaneously face swapping and reenactment. Due to the\nentanglement of identity and expression, it's nontrivial to separately and\nprecisely control them in one framework, thus has not been explored yet. To\novercome this, we propose several innovative designs in the conditional\ndiffusion model, including balancing identity and expression encoder, improved\nmidpoint sampling, and explicitly background conditioning. Extensive\nexperiments have demonstrated the controllability and scalability of the\nproposed framework, in comparison with state-of-the-art text-to-image, face\nswapping, and face reenactment methods.\n","authors":["Renshuai Liu","Bowen Ma","Wei Zhang","Zhipeng Hu","Changjie Fan","Tangjie Lv","Yu Ding","Xuan Cheng"],"pdf_url":"https://arxiv.org/pdf/2401.01207v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04800v1","updated":"2024-04-07T03:41:45Z","published":"2024-04-07T03:41:45Z","title":"Coordinated Sparse Recovery of Label Noise","summary":" Label noise is a common issue in real-world datasets that inevitably impacts\nthe generalization of models. This study focuses on robust classification tasks\nwhere the label noise is instance-dependent. Estimating the transition matrix\naccurately in this task is challenging, and methods based on sample selection\noften exhibit confirmation bias to varying degrees. Sparse over-parameterized\ntraining (SOP) has been theoretically effective in estimating and recovering\nlabel noise, offering a novel solution for noise-label learning. However, this\nstudy empirically observes and verifies a technical flaw of SOP: the lack of\ncoordination between model predictions and noise recovery leads to increased\ngeneralization error. To address this, we propose a method called Coordinated\nSparse Recovery (CSR). CSR introduces a collaboration matrix and confidence\nweights to coordinate model predictions and noise recovery, reducing error\nleakage. Based on CSR, this study designs a joint sample selection strategy and\nconstructs a comprehensive and powerful learning framework called CSR+. CSR+\nsignificantly reduces confirmation bias, especially for datasets with more\nclasses and a high proportion of instance-specific noise. Experimental results\non simulated and real-world noisy datasets demonstrate that both CSR and CSR+\nachieve outstanding performance compared to methods at the same level.\n","authors":["Yukun Yang","Naihao Wang","Haixin Yang","Ruirui Li"],"pdf_url":"https://arxiv.org/pdf/2404.04800v1.pdf","comment":"Pre-print prior to submission to journal"},{"id":"http://arxiv.org/abs/2404.04799v1","updated":"2024-04-07T03:37:29Z","published":"2024-04-07T03:37:29Z","title":"Few-Shot Object Detection: Research Advances and Challenges","summary":" Object detection as a subfield within computer vision has achieved remarkable\nprogress, which aims to accurately identify and locate a specific object from\nimages or videos. Such methods rely on large-scale labeled training samples for\neach object category to ensure accurate detection, but obtaining extensive\nannotated data is a labor-intensive and expensive process in many real-world\nscenarios. 
To tackle this challenge, researchers have explored few-shot object\ndetection (FSOD) that combines few-shot learning and object detection\ntechniques to rapidly adapt to novel objects with limited annotated samples.\nThis paper presents a comprehensive survey to review the significant\nadvancements in the field of FSOD in recent years and summarize the existing\nchallenges and solutions. Specifically, we first introduce the background and\ndefinition of FSOD to emphasize potential value in advancing the field of\ncomputer vision. We then propose a novel FSOD taxonomy method and survey the\nplentifully remarkable FSOD algorithms based on this fact to report a\ncomprehensive overview that facilitates a deeper understanding of the FSOD\nproblem and the development of innovative solutions. Finally, we discuss the\nadvantages and limitations of these algorithms to summarize the challenges,\npotential research direction, and development trend of object detection in the\ndata scarcity scenario.\n","authors":["Zhimeng Xin","Shiming Chen","Tianxu Wu","Yuanjie Shao","Weiping Ding","Xinge You"],"pdf_url":"https://arxiv.org/pdf/2404.04799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18331v2","updated":"2024-04-07T02:51:02Z","published":"2024-02-28T13:50:46Z","title":"FineDiffusion: Scaling up Diffusion Models for Fine-grained Image\n Generation with 10,000 Classes","summary":" The class-conditional image generation based on diffusion models is renowned\nfor generating high-quality and diverse images. However, most prior efforts\nfocus on generating images for general categories, e.g., 1000 classes in\nImageNet-1k. A more challenging task, large-scale fine-grained image\ngeneration, remains the boundary to explore. In this work, we present a\nparameter-efficient strategy, called FineDiffusion, to fine-tune large\npre-trained diffusion models scaling to large-scale fine-grained image\ngeneration with 10,000 categories. FineDiffusion significantly accelerates\ntraining and reduces storage overhead by only fine-tuning tiered class\nembedder, bias terms, and normalization layers' parameters. To further improve\nthe image generation quality of fine-grained categories, we propose a novel\nsampling method for fine-grained image generation, which utilizes\nsuperclass-conditioned guidance, specifically tailored for fine-grained\ncategories, to replace the conventional classifier-free guidance sampling.\nCompared to full fine-tuning, FineDiffusion achieves a remarkable 1.56x\ntraining speed-up and requires storing merely 1.77% of the total model\nparameters, while achieving state-of-the-art FID of 9.776 on image generation\nof 10,000 classes. Extensive qualitative and quantitative experiments\ndemonstrate the superiority of our method compared to other parameter-efficient\nfine-tuning methods. The code and more generated results are available at our\nproject website: https://finediffusion.github.io/.\n","authors":["Ziying Pan","Kun Wang","Gang Li","Feihong He","Xiwang Li","Yongxuan Lai"],"pdf_url":"https://arxiv.org/pdf/2402.18331v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17228v2","updated":"2024-04-07T02:43:54Z","published":"2024-02-27T05:42:38Z","title":"Feature Re-Embedding: Towards Foundation Model-Level Performance in\n Computational Pathology","summary":" Multiple instance learning (MIL) is the most widely used framework in\ncomputational pathology, encompassing sub-typing, diagnosis, prognosis, and\nmore. 
However, the existing MIL paradigm typically requires an offline instance\nfeature extractor, such as a pre-trained ResNet or a foundation model. This\napproach lacks the capability for feature fine-tuning within the specific\ndownstream tasks, limiting its adaptability and performance. To address this\nissue, we propose a Re-embedded Regional Transformer (R$^2$T) for re-embedding\nthe instance features online, which captures fine-grained local features and\nestablishes connections across different regions. Unlike existing works that\nfocus on pre-training powerful feature extractor or designing sophisticated\ninstance aggregator, R$^2$T is tailored to re-embed instance features online.\nIt serves as a portable module that can seamlessly integrate into mainstream\nMIL models. Extensive experimental results on common computational pathology\ntasks validate that: 1) feature re-embedding improves the performance of MIL\nmodels based on ResNet-50 features to the level of foundation model features,\nand further enhances the performance of foundation model features; 2) the\nR$^2$T can introduce more significant performance improvements to various MIL\nmodels; 3) R$^2$T-MIL, as an R$^2$T-enhanced AB-MIL, outperforms other latest\nmethods by a large margin.The code is available at:\nhttps://github.com/DearCaat/RRT-MIL.\n","authors":["Wenhao Tang","Fengtao Zhou","Sheng Huang","Xiang Zhu","Yi Zhang","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2402.17228v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2402.11502v3","updated":"2024-04-07T02:42:27Z","published":"2024-02-18T08:21:05Z","title":"GenAD: Generative End-to-End Autonomous Driving","summary":" Directly producing planning results from raw sensors has been a long-desired\nsolution for autonomous driving and has attracted increasing attention\nrecently. Most existing end-to-end autonomous driving methods factorize this\nproblem into perception, motion prediction, and planning. However, we argue\nthat the conventional progressive pipeline still cannot comprehensively model\nthe entire traffic evolution process, e.g., the future interaction between the\nego car and other traffic participants and the structural trajectory prior. In\nthis paper, we explore a new paradigm for end-to-end autonomous driving, where\nthe key is to predict how the ego car and the surroundings evolve given past\nscenes. We propose GenAD, a generative framework that casts autonomous driving\ninto a generative modeling problem. We propose an instance-centric scene\ntokenizer that first transforms the surrounding scenes into map-aware instance\ntokens. We then employ a variational autoencoder to learn the future trajectory\ndistribution in a structural latent space for trajectory prior modeling. We\nfurther adopt a temporal model to capture the agent and ego movements in the\nlatent space to generate more effective future trajectories. GenAD finally\nsimultaneously performs motion prediction and planning by sampling\ndistributions in the learned structural latent space conditioned on the\ninstance tokens and using the learned temporal model to generate futures.\nExtensive experiments on the widely used nuScenes benchmark show that the\nproposed GenAD achieves state-of-the-art performance on vision-centric\nend-to-end autonomous driving with high efficiency. 
Code:\nhttps://github.com/wzzheng/GenAD.\n","authors":["Wenzhao Zheng","Ruiqi Song","Xianda Guo","Chenming Zhang","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2402.11502v3.pdf","comment":"Code is available at: https://github.com/wzzheng/GenAD"},{"id":"http://arxiv.org/abs/2309.16496v3","updated":"2024-04-07T02:39:31Z","published":"2023-09-28T15:03:44Z","title":"CCEdit: Creative and Controllable Video Editing via Diffusion Models","summary":" In this paper, we present CCEdit, a versatile generative video editing\nframework based on diffusion models. Our approach employs a novel trident\nnetwork structure that separates structure and appearance control, ensuring\nprecise and creative editing capabilities. Utilizing the foundational\nControlNet architecture, we maintain the structural integrity of the video\nduring editing. The incorporation of an additional appearance branch enables\nusers to exert fine-grained control over the edited key frame. These two side\nbranches seamlessly integrate into the main branch, which is constructed upon\nexisting text-to-image (T2I) generation models, through learnable temporal\nlayers. The versatility of our framework is demonstrated through a diverse\nrange of choices in both structure representations and personalized T2I models,\nas well as the option to provide the edited key frame. To facilitate\ncomprehensive evaluation, we introduce the BalanceCC benchmark dataset,\ncomprising 100 videos and 4 target prompts for each video. Our extensive user\nstudies compare CCEdit with eight state-of-the-art video editing methods. The\noutcomes demonstrate CCEdit's substantial superiority over all other methods.\n","authors":["Ruoyu Feng","Wenming Weng","Yanhui Wang","Yuhui Yuan","Jianmin Bao","Chong Luo","Zhibo Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2309.16496v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12850v2","updated":"2024-04-07T02:18:23Z","published":"2023-10-19T14:04:53Z","title":"PrivImage: Differentially Private Synthetic Image Generation using\n Diffusion Models with Semantic-Aware Pretraining","summary":" Differential Privacy (DP) image data synthesis, which leverages the DP\ntechnique to generate synthetic data to replace the sensitive data, allowing\norganizations to share and utilize synthetic images without privacy concerns.\nPrevious methods incorporate the advanced techniques of generative models and\npre-training on a public dataset to produce exceptional DP image data, but\nsuffer from problems of unstable training and massive computational resource\ndemands. This paper proposes a novel DP image synthesis method, termed\nPRIVIMAGE, which meticulously selects pre-training data, promoting the\nefficient creation of DP datasets with high fidelity and utility. PRIVIMAGE\nfirst establishes a semantic query function using a public dataset. Then, this\nfunction assists in querying the semantic distribution of the sensitive\ndataset, facilitating the selection of data from the public dataset with\nanalogous semantics for pre-training. Finally, we pre-train an image generative\nmodel using the selected data and then fine-tune this model on the sensitive\ndataset using Differentially Private Stochastic Gradient Descent (DP-SGD).\nPRIVIMAGE allows us to train a lightly parameterized generative model, reducing\nthe noise in the gradient during DP-SGD training and enhancing training\nstability. 
Extensive experiments demonstrate that PRIVIMAGE uses only 1% of the\npublic dataset for pre-training and 7.6% of the parameters in the generative\nmodel compared to the state-of-the-art method, whereas achieves superior\nsynthetic performance and conserves more computational resources. On average,\nPRIVIMAGE achieves 30.1% lower FID and 12.6% higher Classification Accuracy\nthan the state-of-the-art method. The replication package and datasets can be\naccessed online.\n","authors":["Kecen Li","Chen Gong","Zhixiang Li","Yuzhong Zhao","Xinwen Hou","Tianhao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.12850v2.pdf","comment":"Accepted at USENIX Security 2024"},{"id":"http://arxiv.org/abs/2404.04785v1","updated":"2024-04-07T02:15:43Z","published":"2024-04-07T02:15:43Z","title":"Rethinking Diffusion Model for Multi-Contrast MRI Super-Resolution","summary":" Recently, diffusion models (DM) have been applied in magnetic resonance\nimaging (MRI) super-resolution (SR) reconstruction, exhibiting impressive\nperformance, especially with regard to detailed reconstruction. However, the\ncurrent DM-based SR reconstruction methods still face the following issues: (1)\nThey require a large number of iterations to reconstruct the final image, which\nis inefficient and consumes a significant amount of computational resources.\n(2) The results reconstructed by these methods are often misaligned with the\nreal high-resolution images, leading to remarkable distortion in the\nreconstructed MR images. To address the aforementioned issues, we propose an\nefficient diffusion model for multi-contrast MRI SR, named as DiffMSR.\nSpecifically, we apply DM in a highly compact low-dimensional latent space to\ngenerate prior knowledge with high-frequency detail information. The highly\ncompact latent space ensures that DM requires only a few simple iterations to\nproduce accurate prior knowledge. In addition, we design the Prior-Guide Large\nWindow Transformer (PLWformer) as the decoder for DM, which can extend the\nreceptive field while fully utilizing the prior knowledge generated by DM to\nensure that the reconstructed MR image remains undistorted. Extensive\nexperiments on public and clinical datasets demonstrate that our DiffMSR\noutperforms state-of-the-art methods.\n","authors":["Guangyuan Li","Chen Rao","Juncheng Mo","Zhanjie Zhang","Wei Xing","Lei Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.04785v1.pdf","comment":"14 pages, 12 figures, Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.00674v2","updated":"2024-04-07T01:56:15Z","published":"2024-03-31T12:45:23Z","title":"Knowledge NeRF: Few-shot Novel View Synthesis for Dynamic Articulated\n Objects","summary":" We present Knowledge NeRF to synthesize novel views for dynamic scenes.\nReconstructing dynamic 3D scenes from few sparse views and rendering them from\narbitrary perspectives is a challenging problem with applications in various\ndomains. Previous dynamic NeRF methods learn the deformation of articulated\nobjects from monocular videos. However, qualities of their reconstructed scenes\nare limited. To clearly reconstruct dynamic scenes, we propose a new framework\nby considering two frames at a time.We pretrain a NeRF model for an articulated\nobject.When articulated objects moves, Knowledge NeRF learns to generate novel\nviews at the new state by incorporating past knowledge in the pretrained NeRF\nmodel with minimal observations in the present state. 
We propose a projection\nmodule to adapt NeRF for dynamic scenes, learning the correspondence between\npretrained knowledge base and current states. Experimental results demonstrate\nthe effectiveness of our method in reconstructing dynamic 3D scenes with 5\ninput images in one state. Knowledge NeRF is a new pipeline and promising\nsolution for novel view synthesis in dynamic articulated objects. The data and\nimplementation are publicly available at\nhttps://github.com/RussRobin/Knowledge_NeRF.\n","authors":["Wenxiao Cai","Xinyue Lei","Xinyu He","Junming Leo Chen","Yangang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.00674v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02449v2","updated":"2024-04-07T01:55:40Z","published":"2024-03-04T20:05:28Z","title":"Optimizing Illuminant Estimation in Dual-Exposure HDR Imaging","summary":" High dynamic range (HDR) imaging involves capturing a series of frames of the\nsame scene, each with different exposure settings, to broaden the dynamic range\nof light. This can be achieved through burst capturing or using staggered HDR\nsensors that capture long and short exposures simultaneously in the camera\nimage signal processor (ISP). Within camera ISP pipeline, illuminant estimation\nis a crucial step aiming to estimate the color of the global illuminant in the\nscene. This estimation is used in camera ISP white-balance module to remove\nundesirable color cast in the final image. Despite the multiple frames captured\nin the HDR pipeline, conventional illuminant estimation methods often rely only\non a single frame of the scene. In this paper, we explore leveraging\ninformation from frames captured with different exposure times. Specifically,\nwe introduce a simple feature extracted from dual-exposure images to guide\nilluminant estimators, referred to as the dual-exposure feature (DEF). To\nvalidate the efficiency of DEF, we employed two illuminant estimators using the\nproposed DEF: 1) a multilayer perceptron network (MLP), referred to as\nexposure-based MLP (EMLP), and 2) a modified version of the convolutional color\nconstancy (CCC) to integrate our DEF, that we call ECCC. Both EMLP and ECCC\nachieve promising results, in some cases surpassing prior methods that require\nhundreds of thousands or millions of parameters, with only a few hundred\nparameters for EMLP and a few thousand parameters for ECCC.\n","authors":["Mahmoud Afifi","Zhenhua Hu","Liang Liang"],"pdf_url":"https://arxiv.org/pdf/2403.02449v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20210v3","updated":"2024-04-07T01:25:09Z","published":"2023-10-31T06:19:09Z","title":"UWFormer: Underwater Image Enhancement via a Semi-Supervised Multi-Scale\n Transformer","summary":" Underwater images often exhibit poor quality, distorted color balance and low\ncontrast due to the complex and intricate interplay of light, water, and\nobjects. Despite the significant contributions of previous underwater\nenhancement techniques, there exist several problems that demand further\nimprovement: (i) The current deep learning methods rely on Convolutional Neural\nNetworks (CNNs) that lack the multi-scale enhancement, and global perception\nfield is also limited. (ii) The scarcity of paired real-world underwater\ndatasets poses a significant challenge, and the utilization of synthetic image\npairs could lead to overfitting. 
To address the aforementioned problems, this\npaper introduces a Multi-scale Transformer-based Network called UWFormer for\nenhancing images at multiple frequencies via semi-supervised learning, in which\nwe propose a Nonlinear Frequency-aware Attention mechanism and a Multi-Scale\nFusion Feed-forward Network for low-frequency enhancement. Besides, we\nintroduce a special underwater semi-supervised training strategy, where we\npropose a Subaqueous Perceptual Loss function to generate reliable pseudo\nlabels. Experiments using full-reference and non-reference underwater\nbenchmarks demonstrate that our method outperforms state-of-the-art methods in\nterms of both quantity and visual quality.\n","authors":["Weiwen Chen","Yingtie Lei","Shenghong Luo","Ziyang Zhou","Mingxian Li","Chi-Man Pun"],"pdf_url":"https://arxiv.org/pdf/2310.20210v3.pdf","comment":"Accepted by IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.04763v1","updated":"2024-04-07T00:28:13Z","published":"2024-04-07T00:28:13Z","title":"GenEARL: A Training-Free Generative Framework for Multimodal Event\n Argument Role Labeling","summary":" Multimodal event argument role labeling (EARL), a task that assigns a role\nfor each event participant (object) in an image is a complex challenge. It\nrequires reasoning over the entire image, the depicted event, and the\ninteractions between various objects participating in the event. Existing\nmodels heavily rely on high-quality event-annotated training data to understand\nthe event semantics and structures, and they fail to generalize to new event\ntypes and domains. In this paper, we propose GenEARL, a training-free\ngenerative framework that harness the power of the modern generative models to\nunderstand event task descriptions given image contexts to perform the EARL\ntask. Specifically, GenEARL comprises two stages of generative prompting with a\nfrozen vision-language model (VLM) and a frozen large language model (LLM).\nFirst, a generative VLM learns the semantics of the event argument roles and\ngenerates event-centric object descriptions based on the image. Subsequently, a\nLLM is prompted with the generated object descriptions with a predefined\ntemplate for EARL (i.e., assign an object with an event argument role). We show\nthat GenEARL outperforms the contrastive pretraining (CLIP) baseline by 9.4%\nand 14.2% accuracy for zero-shot EARL on the M2E2 and SwiG datasets,\nrespectively. In addition, we outperform CLIP-Event by 22% precision on M2E2\ndataset. The framework also allows flexible adaptation and generalization to\nunseen domains.\n","authors":["Hritik Bansal","Po-Nien Kung","P. Jeffrey Brantingham","Kai-Wei Chang","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2404.04763v1.pdf","comment":"20 pages, 15 Figures, 13 figures"},{"id":"http://arxiv.org/abs/2404.06332v1","updated":"2024-04-07T12:42:02Z","published":"2024-04-07T12:42:02Z","title":"X-VARS: Introducing Explainability in Football Refereeing with\n Multi-Modal Large Language Model","summary":" The rapid advancement of artificial intelligence has led to significant\nimprovements in automated decision-making. However, the increased performance\nof models often comes at the cost of explainability and transparency of their\ndecision-making processes. In this paper, we investigate the capabilities of\nlarge language models to explain decisions, using football refereeing as a\ntesting ground, given its decision complexity and subjectivity. 
We introduce\nthe Explainable Video Assistant Referee System, X-VARS, a multi-modal large\nlanguage model designed for understanding football videos from the point of\nview of a referee. X-VARS can perform a multitude of tasks, including video\ndescription, question answering, action recognition, and conducting meaningful\nconversations based on video content and in accordance with the Laws of the\nGame for football referees. We validate X-VARS on our novel dataset,\nSoccerNet-XFoul, which consists of more than 22k video-question-answer triplets\nannotated by over 70 experienced football referees. Our experiments and human\nstudy illustrate the impressive capabilities of X-VARS in interpreting complex\nfootball clips. Furthermore, we highlight the potential of X-VARS to reach\nhuman performance and support football referees in the future.\n","authors":["Jan Held","Hani Itani","Anthony Cioppa","Silvio Giancola","Bernard Ghanem","Marc Van Droogenbroeck"],"pdf_url":"https://arxiv.org/pdf/2404.06332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04818v1","updated":"2024-04-07T05:56:42Z","published":"2024-04-07T05:56:42Z","title":"DWE+: Dual-Way Matching Enhanced Framework for Multimodal Entity Linking","summary":" Multimodal entity linking (MEL) aims to utilize multimodal information\n(usually textual and visual information) to link ambiguous mentions to\nunambiguous entities in knowledge base. Current methods facing main issues:\n(1)treating the entire image as input may contain redundant information. (2)the\ninsufficient utilization of entity-related information, such as attributes in\nimages. (3)semantic inconsistency between the entity in knowledge base and its\nrepresentation. To this end, we propose DWE+ for multimodal entity linking.\nDWE+ could capture finer semantics and dynamically maintain semantic\nconsistency with entities. This is achieved by three aspects: (a)we introduce a\nmethod for extracting fine-grained image features by partitioning the image\ninto multiple local objects. Then, hierarchical contrastive learning is used to\nfurther align semantics between coarse-grained information(text and image) and\nfine-grained (mention and visual objects). (b)we explore ways to extract visual\nattributes from images to enhance fusion feature such as facial features and\nidentity. (c)we leverage Wikipedia and ChatGPT to capture the entity\nrepresentation, achieving semantic enrichment from both static and dynamic\nperspectives, which better reflects the real-world entity semantics.\nExperiments on Wikimel, Richpedia, and Wikidiverse datasets demonstrate the\neffectiveness of DWE+ in improving MEL performance. Specifically, we optimize\nthese datasets and achieve state-of-the-art performance on the enhanced\ndatasets. The code and enhanced datasets are released on\nhttps://github.com/season1blue/DWET\n","authors":["Shezheng Song","Shasha Li","Shan Zhao","Xiaopeng Li","Chengyu Wang","Jie Yu","Jun Ma","Tianwei Yan","Bin Ji","Xiaoguang Mao"],"pdf_url":"https://arxiv.org/pdf/2404.04818v1.pdf","comment":"under review on TOIS. arXiv admin note: substantial text overlap with\n arXiv:2312.11816"}]},"2024-04-06T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2311.06694v3","updated":"2024-04-06T22:14:25Z","published":"2023-11-12T00:21:58Z","title":"Which One? 
Leveraging Context Between Objects and Multiple Views for\n Language Grounding","summary":" When connecting objects and their language referents in an embodied 3D\nenvironment, it is important to note that: (1) an object can be better\ncharacterized by leveraging comparative information between itself and other\nobjects, and (2) an object's appearance can vary with camera position. As such,\nwe present the Multi-view Approach to Grounding in Context (MAGiC), which\nselects an object referent based on language that distinguishes between two\nsimilar objects. By pragmatically reasoning over both objects and across\nmultiple views of those objects, MAGiC improves over the state-of-the-art model\non the SNARE object reference task with a relative error reduction of 12.9\\%\n(representing an absolute improvement of 2.7\\%). Ablation studies show that\nreasoning jointly over object referent candidates and multiple views of each\nobject both contribute to improved accuracy. Code:\nhttps://github.com/rcorona/magic_snare/\n","authors":["Chancharik Mitra","Abrar Anwar","Rodolfo Corona","Dan Klein","Trevor Darrell","Jesse Thomason"],"pdf_url":"https://arxiv.org/pdf/2311.06694v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04745v1","updated":"2024-04-06T22:08:20Z","published":"2024-04-06T22:08:20Z","title":"Collaborative Feedback Discriminative Propagation for Video\n Super-Resolution","summary":" The key success of existing video super-resolution (VSR) methods stems mainly\nfrom exploring spatial and temporal information, which is usually achieved by a\nrecurrent propagation module with an alignment module. However, inaccurate\nalignment usually leads to aligned features with significant artifacts, which\nwill be accumulated during propagation and thus affect video restoration.\nMoreover, propagation modules only propagate the same timestep features forward\nor backward that may fail in case of complex motion or occlusion, limiting\ntheir performance for high-quality frame restoration. To address these issues,\nwe propose a collaborative feedback discriminative (CFD) method to correct\ninaccurate aligned features and model long -range spatial and temporal\ninformation for better video reconstruction. In detail, we develop a\ndiscriminative alignment correction (DAC) method to adaptively explore\ninformation and reduce the influences of the artifacts caused by inaccurate\nalignment. Then, we propose a collaborative feedback propagation (CFP) module\nthat employs feedback and gating mechanisms to better explore spatial and\ntemporal information of different timestep features from forward and backward\npropagation simultaneously. Finally, we embed the proposed DAC and CFP into\ncommonly used VSR networks to verify the effectiveness of our method.\nQuantitative and qualitative experiments on several benchmarks demonstrate that\nour method can improve the performance of existing VSR models while maintaining\na lower model complexity. 
The source code and pre-trained models will be\navailable at \\url{https://github.com/House-Leo/CFDVSR}.\n","authors":["Hao Li","Xiang Chen","Jiangxin Dong","Jinhui Tang","Jinshan Pan"],"pdf_url":"https://arxiv.org/pdf/2404.04745v1.pdf","comment":"Project website: https://github.com/House-Leo/CFDVSR"},{"id":"http://arxiv.org/abs/2404.04736v1","updated":"2024-04-06T21:39:49Z","published":"2024-04-06T21:39:49Z","title":"ProtoAL: Interpretable Deep Active Learning with prototypes for medical\n imaging","summary":" The adoption of Deep Learning algorithms in the medical imaging field is a\nprominent area of research, with high potential for advancing AI-based\nComputer-aided diagnosis (AI-CAD) solutions. However, current solutions face\nchallenges due to a lack of interpretability features and high data demands,\nprompting recent efforts to address these issues. In this study, we propose the\nProtoAL method, where we integrate an interpretable DL model into the Deep\nActive Learning (DAL) framework. This approach aims to address both challenges\nby focusing on the medical imaging context and utilizing an inherently\ninterpretable model based on prototypes. We evaluated ProtoAL on the Messidor\ndataset, achieving an area under the precision-recall curve of 0.79 while\nutilizing only 76.54\\% of the available labeled data. These capabilities can\nenhances the practical usability of a DL model in the medical field, providing\na means of trust calibration in domain experts and a suitable solution for\nlearning in the data scarcity context often found.\n","authors":["Iury B. de A. Santos","André C. P. L. F. de Carvalho"],"pdf_url":"https://arxiv.org/pdf/2404.04736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04734v1","updated":"2024-04-06T21:33:39Z","published":"2024-04-06T21:33:39Z","title":"Towards Generalized Entropic Sparsification for Convolutional Neural\n Networks","summary":" Convolutional neural networks (CNNs) are reported to be overparametrized. The\nsearch for optimal (minimal) and sufficient architecture is an NP-hard problem\nas the hyperparameter space for possible network configurations is vast. Here,\nwe introduce a layer-by-layer data-driven pruning method based on the\nmathematical idea aiming at a computationally-scalable entropic relaxation of\nthe pruning problem. The sparse subnetwork is found from the pre-trained (full)\nCNN using the network entropy minimization as a sparsity constraint. This\nallows deploying a numerically scalable algorithm with a sublinear scaling\ncost. The method is validated on several benchmarks (architectures): (i) MNIST\n(LeNet) with sparsity 55%-84% and loss in accuracy 0.1%-0.5%, and (ii) CIFAR-10\n(VGG-16, ResNet18) with sparsity 73-89% and loss in accuracy 0.1%-0.5%.\n","authors":["Tin Barisin","Illia Horenko"],"pdf_url":"https://arxiv.org/pdf/2404.04734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.07540v2","updated":"2024-04-06T19:56:42Z","published":"2023-03-14T00:05:08Z","title":"Tensor-based Multimodal Learning for Prediction of Pulmonary Arterial\n Wedge Pressure from Cardiac MRI","summary":" Heart failure is a serious and life-threatening condition that can lead to\nelevated pressure in the left ventricle. Pulmonary Arterial Wedge Pressure\n(PAWP) is an important surrogate marker indicating high pressure in the left\nventricle. PAWP is determined by Right Heart Catheterization (RHC) but it is an\ninvasive procedure. A non-invasive method is useful in quickly identifying\nhigh-risk patients from a large population. 
In this work, we develop a tensor\nlearning-based pipeline for identifying PAWP from multimodal cardiac Magnetic\nResonance Imaging (MRI). This pipeline extracts spatial and temporal features\nfrom high-dimensional scans. For quality control, we incorporate an epistemic\nuncertainty-based binning strategy to identify poor-quality training samples.\nTo improve the performance, we learn complementary information by integrating\nfeatures from multimodal data: cardiac MRI with short-axis and four-chamber\nviews, and Electronic Health Records. The experimental analysis on a large\ncohort of $1346$ subjects who underwent the RHC procedure for PAWP estimation\nindicates that the proposed pipeline has a diagnostic value and can produce\npromising performance with significant improvement over the baseline in\nclinical practice (i.e., $\\Delta$AUC $=0.10$, $\\Delta$Accuracy $=0.06$, and\n$\\Delta$MCC $=0.39$). The decision curve analysis further confirms the clinical\nutility of our method.\n","authors":["Prasun C. Tripathi","Mohammod N. I. Suvon","Lawrence Schobs","Shuo Zhou","Samer Alabed","Andrew J. Swift","Haiping Lu"],"pdf_url":"https://arxiv.org/pdf/2303.07540v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04720v1","updated":"2024-04-06T19:50:48Z","published":"2024-04-06T19:50:48Z","title":"On Exploring PDE Modeling for Point Cloud Video Representation Learning","summary":" Point cloud video representation learning is challenging due to complex\nstructures and unordered spatial arrangement. Traditional methods struggle with\nframe-to-frame correlations and point-wise correspondence tracking. Recently,\npartial differential equations (PDE) have provided a new perspective in\nuniformly solving spatial-temporal data information within certain constraints.\nWhile tracking tangible point correspondence remains challenging, we propose to\nformalize point cloud video representation learning as a PDE-solving problem.\nInspired by fluid analysis, where PDEs are used to solve the deformation of\nspatial shape over time, we employ PDE to solve the variations of spatial\npoints affected by temporal information. By modeling spatial-temporal\ncorrelations, we aim to regularize spatial variations with temporal features,\nthereby enhancing representation learning in point cloud videos. We introduce\nMotion PointNet composed of a PointNet-like encoder and a PDE-solving module.\nInitially, we construct a lightweight yet effective encoder to model an initial\nstate of the spatial variations. Subsequently, we develop our PDE-solving\nmodule in a parameterized latent space, tailored to address the spatio-temporal\ncorrelations inherent in point cloud video. The process of solving PDE is\nguided and refined by a contrastive learning structure, which is pivotal in\nreshaping the feature distribution, thereby optimizing the feature\nrepresentation within point cloud video data. 
Remarkably, our Motion PointNet\nachieves an impressive accuracy of 97.52% on the MSRAction-3D dataset,\nsurpassing the current state-of-the-art in all aspects while consuming minimal\nresources (only 0.72M parameters and 0.82G FLOPs).\n","authors":["Zhuoxu Huang","Zhenkun Fan","Tao Xu","Jungong Han"],"pdf_url":"https://arxiv.org/pdf/2404.04720v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04718v1","updated":"2024-04-06T19:42:25Z","published":"2024-04-06T19:42:25Z","title":"Interpretable Multimodal Learning for Cardiovascular Hemodynamics\n Assessment","summary":" Pulmonary Arterial Wedge Pressure (PAWP) is an essential cardiovascular\nhemodynamics marker to detect heart failure. In clinical practice, Right Heart\nCatheterization is considered a gold standard for assessing cardiac\nhemodynamics while non-invasive methods are often needed to screen high-risk\npatients from a large population. In this paper, we propose a multimodal\nlearning pipeline to predict PAWP marker. We utilize complementary information\nfrom Cardiac Magnetic Resonance Imaging (CMR) scans (short-axis and\nfour-chamber) and Electronic Health Records (EHRs). We extract spatio-temporal\nfeatures from CMR scans using tensor-based learning. We propose a graph\nattention network to select important EHR features for prediction, where we\nmodel subjects as graph nodes and feature relationships as graph edges using\nthe attention mechanism. We design four feature fusion strategies: early,\nintermediate, late, and hybrid fusion. With a linear classifier and linear\nfusion strategies, our pipeline is interpretable. We validate our pipeline on a\nlarge dataset of $2,641$ subjects from our ASPIRE registry. The comparative\nstudy against state-of-the-art methods confirms the superiority of our\npipeline. The decision curve analysis further validates that our pipeline can\nbe applied to screen a large population. The code is available at\nhttps://github.com/prasunc/hemodynamics.\n","authors":["Prasun C Tripathi","Sina Tabakhi","Mohammod N I Suvon","Lawrence Schöb","Samer Alabed","Andrew J Swift","Shuo Zhou","Haiping Lu"],"pdf_url":"https://arxiv.org/pdf/2404.04718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02494v3","updated":"2024-04-06T18:10:26Z","published":"2023-07-16T19:36:19Z","title":"Adaptively Placed Multi-Grid Scene Representation Networks for\n Large-Scale Data Visualization","summary":" Scene representation networks (SRNs) have been recently proposed for\ncompression and visualization of scientific data. However, state-of-the-art\nSRNs do not adapt the allocation of available network parameters to the complex\nfeatures found in scientific data, leading to a loss in reconstruction quality.\nWe address this shortcoming with an adaptively placed multi-grid SRN (APMGSRN)\nand propose a domain decomposition training and inference technique for\naccelerated parallel training on multi-GPU systems. We also release an\nopen-source neural volume rendering application that allows plug-and-play\nrendering with any PyTorch-based SRN. Our proposed APMGSRN architecture uses\nmultiple spatially adaptive feature grids that learn where to be placed within\nthe domain to dynamically allocate more neural network resources where error is\nhigh in the volume, improving state-of-the-art reconstruction accuracy of SRNs\nfor scientific data without requiring expensive octree refining, pruning, and\ntraversal like previous adaptive models. 
In our domain decomposition approach\nfor representing large-scale data, we train an set of APMGSRNs in parallel on\nseparate bricks of the volume to reduce training time while avoiding overhead\nnecessary for an out-of-core solution for volumes too large to fit in GPU\nmemory. After training, the lightweight SRNs are used for realtime neural\nvolume rendering in our open-source renderer, where arbitrary view angles and\ntransfer functions can be explored. A copy of this paper, all code, all models\nused in our experiments, and all supplemental materials and videos are\navailable at https://github.com/skywolf829/APMGSRN.\n","authors":["Skylar Wolfgang Wurster","Tianyu Xiong","Han-Wei Shen","Hanqi Guo","Tom Peterka"],"pdf_url":"https://arxiv.org/pdf/2308.02494v3.pdf","comment":"Accepted to IEEE VIS 2023.\n https://www.computer.org/csdl/journal/tg/2024/01/10297599/1RyYguiNBLO"},{"id":"http://arxiv.org/abs/2404.04693v1","updated":"2024-04-06T17:41:36Z","published":"2024-04-06T17:41:36Z","title":"OmniColor: A Global Camera Pose Optimization Approach of LiDAR-360Camera\n Fusion for Colorizing Point Clouds","summary":" A Colored point cloud, as a simple and efficient 3D representation, has many\nadvantages in various fields, including robotic navigation and scene\nreconstruction. This representation is now commonly used in 3D reconstruction\ntasks relying on cameras and LiDARs. However, fusing data from these two types\nof sensors is poorly performed in many existing frameworks, leading to\nunsatisfactory mapping results, mainly due to inaccurate camera poses. This\npaper presents OmniColor, a novel and efficient algorithm to colorize point\nclouds using an independent 360-degree camera. Given a LiDAR-based point cloud\nand a sequence of panorama images with initial coarse camera poses, our\nobjective is to jointly optimize the poses of all frames for mapping images\nonto geometric reconstructions. Our pipeline works in an off-the-shelf manner\nthat does not require any feature extraction or matching process. Instead, we\nfind optimal poses by directly maximizing the photometric consistency of LiDAR\nmaps. In experiments, we show that our method can overcome the severe visual\ndistortion of omnidirectional images and greatly benefit from the wide field of\nview (FOV) of 360-degree cameras to reconstruct various scenarios with accuracy\nand stability. The code will be released at\nhttps://github.com/liubonan123/OmniColor/.\n","authors":["Bonan Liu","Guoyang Zhao","Jianhao Jiao","Guang Cai","Chengyang Li","Handi Yin","Yuyang Wang","Ming Liu","Pan Hui"],"pdf_url":"https://arxiv.org/pdf/2404.04693v1.pdf","comment":"2024 IEEE International Conference on Robotics and Automation"},{"id":"http://arxiv.org/abs/2404.04687v1","updated":"2024-04-06T17:23:43Z","published":"2024-04-06T17:23:43Z","title":"Z-Splat: Z-Axis Gaussian Splatting for Camera-Sonar Fusion","summary":" Differentiable 3D-Gaussian splatting (GS) is emerging as a prominent\ntechnique in computer vision and graphics for reconstructing 3D scenes. GS\nrepresents a scene as a set of 3D Gaussians with varying opacities and employs\na computationally efficient splatting operation along with analytical\nderivatives to compute the 3D Gaussian parameters given scene images captured\nfrom various viewpoints. Unfortunately, capturing surround view ($360^{\\circ}$\nviewpoint) images is impossible or impractical in many real-world imaging\nscenarios, including underwater imaging, rooms inside a building, and\nautonomous navigation. 
In these restricted baseline imaging scenarios, the GS\nalgorithm suffers from a well-known 'missing cone' problem, which results in\npoor reconstruction along the depth axis. In this manuscript, we demonstrate\nthat using transient data (from sonars) allows us to address the missing cone\nproblem by sampling high-frequency data along the depth axis. We extend the\nGaussian splatting algorithms for two commonly used sonars and propose fusion\nalgorithms that simultaneously utilize RGB camera data and sonar data. Through\nsimulations, emulations, and hardware experiments across various imaging\nscenarios, we show that the proposed fusion algorithms lead to significantly\nbetter novel view synthesis (5 dB improvement in PSNR) and 3D geometry\nreconstruction (60% lower Chamfer distance).\n","authors":["Ziyuan Qu","Omkar Vengurlekar","Mohamad Qadri","Kevin Zhang","Michael Kaess","Christopher Metzler","Suren Jayasuriya","Adithya Pediredla"],"pdf_url":"https://arxiv.org/pdf/2404.04687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04686v1","updated":"2024-04-06T17:23:21Z","published":"2024-04-06T17:23:21Z","title":"Predictive Modeling for Breast Cancer Classification in the Context of\n Bangladeshi Patients: A Supervised Machine Learning Approach with Explainable\n AI","summary":" Breast cancer has rapidly increased in prevalence in recent years, making it\none of the leading causes of mortality worldwide. Among all cancers, it is by\nfar the most common. Diagnosing this illness manually requires significant time\nand expertise. Since detecting breast cancer is a time-consuming process,\npreventing its further spread can be aided by creating machine-based forecasts.\nMachine learning and Explainable AI are crucial in classification as they not\nonly provide accurate predictions but also offer insights into how the model\narrives at its decisions, aiding in the understanding and trustworthiness of\nthe classification results. In this study, we evaluate and compare the\nclassification accuracy, precision, recall, and F-1 scores of five different\nmachine learning methods using a primary dataset (500 patients from Dhaka\nMedical College Hospital). Five different supervised machine learning\ntechniques, including decision tree, random forest, logistic regression, naive\nbayes, and XGBoost, have been used to achieve optimal results on our dataset.\nAdditionally, this study applied SHAP analysis to the XGBoost model to\ninterpret the model's predictions and understand the impact of each feature on\nthe model's output. We compared the accuracy with which several algorithms\nclassified the data, as well as contrasted with other literature in this field.\nAfter final evaluation, this study found that XGBoost achieved the best model\naccuracy, which is 97%.\n","authors":["Taminul Islam","Md. Alif Sheakh","Mst. Sazia Tahosin","Most. Hasna Hena","Shopnil Akash","Yousef A. Bin Jardan","Gezahign Fentahun Wondmie","Hiba-Allah Nafidi","Mohammed Bourhia"],"pdf_url":"https://arxiv.org/pdf/2404.04686v1.pdf","comment":"Accepted for the Scientific Reports (Nature) journal. 
32 pages, 12\n figures"},{"id":"http://arxiv.org/abs/2404.04677v1","updated":"2024-04-06T16:48:08Z","published":"2024-04-06T16:48:08Z","title":"Salient Sparse Visual Odometry With Pose-Only Supervision","summary":" Visual Odometry (VO) is vital for the navigation of autonomous systems,\nproviding accurate position and orientation estimates at reasonable costs.\nWhile traditional VO methods excel in some conditions, they struggle with\nchallenges like variable lighting and motion blur. Deep learning-based VO,\nthough more adaptable, can face generalization problems in new environments.\nAddressing these drawbacks, this paper presents a novel hybrid visual odometry\n(VO) framework that leverages pose-only supervision, offering a balanced\nsolution between robustness and the need for extensive labeling. We propose two\ncost-effective and innovative designs: a self-supervised homographic\npre-training for enhancing optical flow learning from pose-only labels and a\nrandom patch-based salient point detection strategy for more accurate optical\nflow patch extraction. These designs eliminate the need for dense optical flow\nlabels for training and significantly improve the generalization capability of\nthe system in diverse and challenging environments. Our pose-only supervised\nmethod achieves competitive performance on standard datasets and greater\nrobustness and generalization ability in extreme and unseen scenarios, even\ncompared to dense optical flow-supervised state-of-the-art methods.\n","authors":["Siyu Chen","Kangcheng Liu","Chen Wang","Shenghai Yuan","Jianfei Yang","Lihua Xie"],"pdf_url":"https://arxiv.org/pdf/2404.04677v1.pdf","comment":"Accepted by IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2312.08591v2","updated":"2024-04-06T16:38:38Z","published":"2023-12-14T01:24:22Z","title":"Joint2Human: High-quality 3D Human Generation via Compact Spherical\n Embedding of 3D Joints","summary":" 3D human generation is increasingly significant in various applications.\nHowever, the direct use of 2D generative methods in 3D generation often results\nin losing local details, while methods that reconstruct geometry from generated\nimages struggle with global view consistency. In this work, we introduce\nJoint2Human, a novel method that leverages 2D diffusion models to generate\ndetailed 3D human geometry directly, ensuring both global structure and local\ndetails. To achieve this, we employ the Fourier occupancy field (FOF)\nrepresentation, enabling the direct generation of 3D shapes as preliminary\nresults with 2D generative models. With the proposed high-frequency enhancer\nand the multi-view recarving strategy, our method can seamlessly integrate the\ndetails from different views into a uniform global shape. To better utilize the\n3D human prior and enhance control over the generated geometry, we introduce a\ncompact spherical embedding of 3D joints. This allows for an effective guidance\nof pose during the generation process. Additionally, our method can generate 3D\nhumans guided by textual inputs. Our experimental results demonstrate the\ncapability of our method to ensure global structure, local details, high\nresolution, and low computational cost simultaneously. 
More results and the\ncode can be found on our project page at\nhttp://cic.tju.edu.cn/faculty/likun/projects/Joint2Human.\n","authors":["Muxin Zhang","Qiao Feng","Zhuo Su","Chao Wen","Zhou Xue","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2312.08591v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04673v1","updated":"2024-04-06T16:29:10Z","published":"2024-04-06T16:29:10Z","title":"Neural-ABC: Neural Parametric Models for Articulated Body with Clothes","summary":" In this paper, we introduce Neural-ABC, a novel parametric model based on\nneural implicit functions that can represent clothed human bodies with\ndisentangled latent spaces for identity, clothing, shape, and pose. Traditional\nmesh-based representations struggle to represent articulated bodies with\nclothes due to the diversity of human body shapes and clothing styles, as well\nas the complexity of poses. Our proposed model provides a unified framework for\nparametric modeling, which can represent the identity, clothing, shape and pose\nof the clothed human body. Our proposed approach utilizes the power of neural\nimplicit functions as the underlying representation and integrates\nwell-designed structures to meet the necessary requirements. Specifically, we\nrepresent the underlying body as a signed distance function and clothing as an\nunsigned distance function, and they can be uniformly represented as unsigned\ndistance fields. Different types of clothing do not require predefined\ntopological structures or classifications, and can follow changes in the\nunderlying body to fit the body. Additionally, we construct poses using a\ncontrollable articulated structure. The model is trained on both open and newly\nconstructed datasets, and our decoupling strategy is carefully designed to\nensure optimal performance. Our model excels at disentangling clothing and\nidentity in different shape and poses while preserving the style of the\nclothing. We demonstrate that Neural-ABC fits new observations of different\ntypes of clothing. Compared to other state-of-the-art parametric models,\nNeural-ABC demonstrates powerful advantages in the reconstruction of clothed\nhuman bodies, as evidenced by fitting raw scans, depth maps and images. We show\nthat the attributes of the fitted results can be further edited by adjusting\ntheir identities, clothing, shape and pose codes.\n","authors":["Honghu Chen","Yuxin Yao","Juyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04673v1.pdf","comment":"Accepted by IEEE Transactions on Visualization and Computer Graphics.\n Project page: https://ustc3dv.github.io/NeuralABC/"},{"id":"http://arxiv.org/abs/2402.15756v3","updated":"2024-04-06T16:04:51Z","published":"2024-02-24T08:07:48Z","title":"Detection Is Tracking: Point Cloud Multi-Sweep Deep Learning Models\n Revisited","summary":" Conventional tracking paradigm takes in instantaneous measurements such as\nrange and bearing, and produces object tracks across time. In applications such\nas autonomous driving, lidar measurements in the form of point clouds are\nusually passed through a \"virtual sensor\" realized by a deep learning model, to\nproduce \"measurements\" such as bounding boxes, which are in turn ingested by a\ntracking module to produce object tracks. Very often multiple lidar sweeps are\naccumulated in a buffer to merge and become the input to the virtual sensor. 
We\nargue in this paper that such an input already contains temporal information,\nand therefore the virtual sensor output should also contain temporal\ninformation, not just instantaneous values for the time corresponding to the\nend of the buffer. In particular, we present the deep learning model called\nMULti-Sweep PAired Detector (MULSPAD) that produces, for each detected object,\na pair of bounding boxes at both the end time and the beginning time of the\ninput buffer. This is achieved with fairly straightforward changes in commonly\nused lidar detection models, and with only marginal extra processing, but the\nresulting symmetry is satisfying. Such paired detections make it possible not\nonly to construct rudimentary trackers fairly easily, but also to construct\nmore sophisticated trackers that can exploit the extra information conveyed by\nthe pair and be robust to choices of motion models and object birth/death\nmodels. We have conducted preliminary training and experimentation using Waymo\nOpen Dataset, which shows the efficacy of our proposed method.\n","authors":["Lingji Chen"],"pdf_url":"https://arxiv.org/pdf/2402.15756v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04665v1","updated":"2024-04-06T15:48:14Z","published":"2024-04-06T15:48:14Z","title":"Adaptive Intra-Class Variation Contrastive Learning for Unsupervised\n Person Re-Identification","summary":" The memory dictionary-based contrastive learning method has achieved\nremarkable results in the field of unsupervised person Re-ID. However, the\nmethod of updating memory based on all samples does not fully utilize the\nhardest sample to improve the generalization ability of the model, and the\nmethod based on hardest sample mining will inevitably introduce false-positive\nsamples that are incorrectly clustered in the early stages of the model.\nClustering-based methods usually discard a significant number of outliers,\nleading to the loss of valuable information. In order to address the issues\nmentioned before, we propose an adaptive intra-class variation contrastive\nlearning algorithm for unsupervised Re-ID, called AdaInCV. The algorithm\nquantitatively evaluates the learning ability of the model for each class by\nconsidering the intra-class variations after clustering, which helps in\nselecting appropriate samples during the training process of the model. To be\nmore specific, two new strategies are proposed: Adaptive Sample Mining (AdaSaM)\nand Adaptive Outlier Filter (AdaOF). The first one gradually creates more\nreliable clusters to dynamically refine the memory, while the second can\nidentify and filter out valuable outliers as negative samples.\n","authors":["Lingzhi Liu","Haiyang Zhang","Chengwei Tang","Tiantian Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04663v1","updated":"2024-04-06T15:31:57Z","published":"2024-04-06T15:31:57Z","title":"Focused Active Learning for Histopathological Image Classification","summary":" Active Learning (AL) has the potential to solve a major problem of digital\npathology: the efficient acquisition of labeled data for machine learning\nalgorithms. However, existing AL methods often struggle in realistic settings\nwith artifacts, ambiguities, and class imbalances, as commonly seen in the\nmedical field. The lack of precise uncertainty estimations leads to the\nacquisition of images with a low informative value. 
To address these\nchallenges, we propose Focused Active Learning (FocAL), which combines a\nBayesian Neural Network with Out-of-Distribution detection to estimate\ndifferent uncertainties for the acquisition function. Specifically, the\nweighted epistemic uncertainty accounts for the class imbalance, aleatoric\nuncertainty for ambiguous images, and an OoD score for artifacts. We perform\nextensive experiments to validate our method on MNIST and the real-world Panda\ndataset for the classification of prostate cancer. The results confirm that\nother AL methods are 'distracted' by ambiguities and artifacts which harm the\nperformance. FocAL effectively focuses on the most informative images, avoiding\nambiguities and artifacts during acquisition. For both experiments, FocAL\noutperforms existing AL approaches, reaching a Cohen's kappa of 0.764 with only\n0.69% of the labeled Panda data.\n","authors":["Arne Schmidt","Pablo Morales-Álvarez","Lee A. D. Cooper","Lee A. Newberg","Andinet Enquobahrie","Aggelos K. Katsaggelos","Rafael Molina"],"pdf_url":"https://arxiv.org/pdf/2404.04663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00335v2","updated":"2024-04-06T15:25:51Z","published":"2024-03-30T12:10:34Z","title":"Learning Trimaps via Clicks for Image Matting","summary":" Despite significant advancements in image matting, existing models heavily\ndepend on manually-drawn trimaps for accurate results in natural image\nscenarios. However, the process of obtaining trimaps is time-consuming, lacking\nuser-friendliness and device compatibility. This reliance greatly limits the\npractical application of all trimap-based matting methods. To address this\nissue, we introduce Click2Trimap, an interactive model capable of predicting\nhigh-quality trimaps and alpha mattes with minimal user click inputs. Through\nanalyzing real users' behavioral logic and characteristics of trimaps, we\nsuccessfully propose a powerful iterative three-class training strategy and a\ndedicated simulation function, making Click2Trimap exhibit versatility across\nvarious scenarios. Quantitative and qualitative assessments on synthetic and\nreal-world matting datasets demonstrate Click2Trimap's superior performance\ncompared to all existing trimap-free matting methods. Especially, in the user\nstudy, Click2Trimap achieves high-quality trimap and matting predictions in\njust an average of 5 seconds per image, demonstrating its substantial practical\nvalue in real-world applications.\n","authors":["Chenyi Zhang","Yihan Hu","Henghui Ding","Humphrey Shi","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2404.00335v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04654v1","updated":"2024-04-06T15:14:25Z","published":"2024-04-06T15:14:25Z","title":"Music Recommendation Based on Facial Emotion Recognition","summary":" Introduction: Music provides an incredible avenue for individuals to express\ntheir thoughts and emotions, while also serving as a delightful mode of\nentertainment for enthusiasts and music lovers. Objectives: This paper presents\na comprehensive approach to enhancing the user experience through the\nintegration of emotion recognition, music recommendation, and explainable AI\nusing GRAD-CAM. Methods: The proposed methodology utilizes a ResNet50 model\ntrained on the Facial Expression Recognition (FER) dataset, consisting of real\nimages of individuals expressing various emotions. Results: The system achieves\nan accuracy of 82% in emotion classification. 
By leveraging GRAD-CAM, the model\nprovides explanations for its predictions, allowing users to understand the\nreasoning behind the system's recommendations. The model is trained on both FER\nand real user datasets, which include labelled facial expressions, and real\nimages of individuals expressing various emotions. The training process\ninvolves pre-processing the input images, extracting features through\nconvolutional layers, reasoning with dense layers, and generating emotion\npredictions through the output layer. Conclusion: The proposed methodology,\nleveraging the ResNet50 model with ROI-based analysis and explainable AI\ntechniques, offers a robust and interpretable solution for facial emotion\ndetection.\n","authors":["Rajesh B","Keerthana V","Narayana Darapaneni","Anwesh Reddy P"],"pdf_url":"https://arxiv.org/pdf/2404.04654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.09760v2","updated":"2024-04-06T15:11:41Z","published":"2021-08-22T15:44:37Z","title":"Image Inpainting via Conditional Texture and Structure Dual Generation","summary":" Deep generative approaches have recently made considerable progress in image\ninpainting by introducing structure priors. Due to the lack of proper\ninteraction with image texture during structure reconstruction, however,\ncurrent solutions are incompetent in handling the cases with large corruptions,\nand they generally suffer from distorted results. In this paper, we propose a\nnovel two-stream network for image inpainting, which models the\nstructure-constrained texture synthesis and texture-guided structure\nreconstruction in a coupled manner so that they better leverage each other for\nmore plausible generation. Furthermore, to enhance the global consistency, a\nBi-directional Gated Feature Fusion (Bi-GFF) module is designed to exchange and\ncombine the structure and texture information and a Contextual Feature\nAggregation (CFA) module is developed to refine the generated contents by\nregion affinity learning and multi-scale feature aggregation. Qualitative and\nquantitative experiments on the CelebA, Paris StreetView and Places2 datasets\ndemonstrate the superiority of the proposed method. Our code is available at\nhttps://github.com/Xiefan-Guo/CTSDG.\n","authors":["Xiefan Guo","Hongyu Yang","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2108.09760v2.pdf","comment":"Accepted by ICCV 2021"},{"id":"http://arxiv.org/abs/2404.04653v1","updated":"2024-04-06T15:10:29Z","published":"2024-04-06T15:10:29Z","title":"HawkDrive: A Transformer-driven Visual Perception System for Autonomous\n Driving in Night Scene","summary":" Many established vision perception systems for autonomous driving scenarios\nignore the influence of light conditions, one of the key elements for driving\nsafety. To address this problem, we present HawkDrive, a novel perception\nsystem with hardware and software solutions. Hardware that utilizes stereo\nvision perception, which has been demonstrated to be a more reliable way of\nestimating depth information than monocular vision, is partnered with the edge\ncomputing device Nvidia Jetson Xavier AGX. Our software for low light\nenhancement, depth estimation, and semantic segmentation tasks is a\ntransformer-based neural network. Our software stack, which enables fast\ninference and noise reduction, is packaged into system modules in Robot\nOperating System 2 (ROS2). 
Our experimental results have shown that the\nproposed end-to-end system is effective in improving the depth estimation and\nsemantic segmentation performance. Our dataset and codes will be released at\nhttps://github.com/ZionGo6/HawkDrive.\n","authors":["Ziang Guo","Stepan Perminov","Mikhail Konenkov","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2404.04653v1.pdf","comment":"Accepted by IEEE IV 2024"},{"id":"http://arxiv.org/abs/2404.04650v1","updated":"2024-04-06T14:56:59Z","published":"2024-04-06T14:56:59Z","title":"InitNO: Boosting Text-to-Image Diffusion Models via Initial Noise\n Optimization","summary":" Recent strides in the development of diffusion models, exemplified by\nadvancements such as Stable Diffusion, have underscored their remarkable\nprowess in generating visually compelling images. However, the imperative of\nachieving a seamless alignment between the generated image and the provided\nprompt persists as a formidable challenge. This paper traces the root of these\ndifficulties to invalid initial noise, and proposes a solution in the form of\nInitial Noise Optimization (InitNO), a paradigm that refines this noise.\nConsidering text prompts, not all random noises are effective in synthesizing\nsemantically-faithful images. We design the cross-attention response score and\nthe self-attention conflict score to evaluate the initial noise, bifurcating\nthe initial latent space into valid and invalid sectors. A strategically\ncrafted noise optimization pipeline is developed to guide the initial noise\ntowards valid regions. Our method, validated through rigorous experimentation,\nshows a commendable proficiency in generating images in strict accordance with\ntext prompts. Our code is available at https://github.com/xiefan-guo/initno.\n","authors":["Xiefan Guo","Jinlin Liu","Miaomiao Cui","Jiankai Li","Hongyu Yang","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2404.04650v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2402.00863v3","updated":"2024-04-06T14:55:42Z","published":"2024-02-01T18:58:44Z","title":"Geometry Transfer for Stylizing Radiance Fields","summary":" Shape and geometric patterns are essential in defining stylistic identity.\nHowever, current 3D style transfer methods predominantly focus on transferring\ncolors and textures, often overlooking geometric aspects. In this paper, we\nintroduce Geometry Transfer, a novel method that leverages geometric\ndeformation for 3D style transfer. This technique employs depth maps to extract\na style guide, subsequently applied to stylize the geometry of radiance fields.\nMoreover, we propose new techniques that utilize geometric cues from the 3D\nscene, thereby enhancing aesthetic expressiveness and more accurately\nreflecting intended styles. Our extensive experiments show that Geometry\nTransfer enables a broader and more expressive range of stylizations, thereby\nsignificantly expanding the scope of 3D style transfer.\n","authors":["Hyunyoung Jung","Seonghyeon Nam","Nikolaos Sarafianos","Sungjoo Yoo","Alexander Sorkine-Hornung","Rakesh Ranjan"],"pdf_url":"https://arxiv.org/pdf/2402.00863v3.pdf","comment":"CVPR 2024. 
Project page: https://hyblue.github.io/geo-srf/"},{"id":"http://arxiv.org/abs/2401.04747v2","updated":"2024-04-06T14:53:51Z","published":"2024-01-09T11:38:18Z","title":"DiffSHEG: A Diffusion-Based Approach for Real-Time Speech-driven\n Holistic 3D Expression and Gesture Generation","summary":" We propose DiffSHEG, a Diffusion-based approach for Speech-driven Holistic 3D\nExpression and Gesture generation with arbitrary length. While previous works\nfocused on co-speech gesture or expression generation individually, the joint\ngeneration of synchronized expressions and gestures remains barely explored. To\naddress this, our diffusion-based co-speech motion generation transformer\nenables uni-directional information flow from expression to gesture,\nfacilitating improved matching of joint expression-gesture distributions.\nFurthermore, we introduce an outpainting-based sampling strategy for arbitrary\nlong sequence generation in diffusion models, offering flexibility and\ncomputational efficiency. Our method provides a practical solution that\nproduces high-quality synchronized expression and gesture generation driven by\nspeech. Evaluated on two public datasets, our approach achieves\nstate-of-the-art performance both quantitatively and qualitatively.\nAdditionally, a user study confirms the superiority of DiffSHEG over prior\napproaches. By enabling the real-time generation of expressive and synchronized\nmotions, DiffSHEG showcases its potential for various applications in the\ndevelopment of digital humans and embodied agents.\n","authors":["Junming Chen","Yunfei Liu","Jianan Wang","Ailing Zeng","Yu Li","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2401.04747v2.pdf","comment":"Accepted by CVPR 2024. Project page:\n https://jeremycjm.github.io/proj/DiffSHEG"},{"id":"http://arxiv.org/abs/2404.04647v1","updated":"2024-04-06T14:49:36Z","published":"2024-04-06T14:49:36Z","title":"Structured Gradient-based Interpretations via Norm-Regularized\n Adversarial Training","summary":" Gradient-based saliency maps have been widely used to explain the decisions\nof deep neural network classifiers. However, standard gradient-based\ninterpretation maps, including the simple gradient and integrated gradient\nalgorithms, often lack desired structures such as sparsity and connectedness in\ntheir application to real-world computer vision models. A frequently used\napproach to inducing sparsity structures into gradient-based saliency maps is\nto alter the simple gradient scheme using sparsification or norm-based\nregularization. A drawback with such post-processing methods is their\nfrequently-observed significant loss in fidelity to the original simple\ngradient map. In this work, we propose to apply adversarial training as an\nin-processing scheme to train neural networks with structured simple gradient\nmaps. We show a duality relation between the regularized norms of the\nadversarial perturbations and gradient-based maps, based on which we design\nadversarial training loss functions promoting sparsity and group-sparsity\nproperties in simple gradient maps. 
We present several numerical results to\nshow the influence of our proposed norm-based adversarial training methods on\nthe standard gradient-based maps of standard neural network architectures on\nbenchmark image datasets.\n","authors":["Shizhan Gong","Qi Dou","Farzan Farnia"],"pdf_url":"https://arxiv.org/pdf/2404.04647v1.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04643v1","updated":"2024-04-06T14:28:01Z","published":"2024-04-06T14:28:01Z","title":"Constrained 6-DoF Grasp Generation on Complex Shapes for Improved\n Dual-Arm Manipulation","summary":" Efficiently generating grasp poses tailored to specific regions of an object\nis vital for various robotic manipulation tasks, especially in a dual-arm\nsetup. This scenario presents a significant challenge due to the complex\ngeometries involved, requiring a deep understanding of the local geometry to\ngenerate grasps efficiently on the specified constrained regions. Existing\nmethods only explore settings involving table-top/small objects and require\naugmented datasets to train, limiting their performance on complex objects. We\npropose CGDF: Constrained Grasp Diffusion Fields, a diffusion-based grasp\ngenerative model that generalizes to objects with arbitrary geometries, as well\nas generates dense grasps on the target regions. CGDF uses a part-guided\ndiffusion approach that enables it to get high sample efficiency in constrained\ngrasping without explicitly training on massive constraint-augmented datasets.\nWe provide qualitative and quantitative comparisons using analytical metrics\nand in simulation, in both unconstrained and constrained settings to show that\nour method can generalize to generate stable grasps on complex objects,\nespecially useful for dual-arm manipulation settings, while existing methods\nstruggle to do so.\n","authors":["Gaurav Singh","Sanket Kalwar","Md Faizal Karim","Bipasha Sen","Nagamanikandan Govindan","Srinath Sridhar","K Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.04643v1.pdf","comment":"Project Page: https://constrained-grasp-diffusion.github.io/"},{"id":"http://arxiv.org/abs/2404.04635v1","updated":"2024-04-06T13:59:41Z","published":"2024-04-06T13:59:41Z","title":"A Deep Look Into -- Automated Lung X-Ray Abnormality Detection System","summary":" Introduction: The Automated Lung X-Ray Abnormality Detection System is an\napplication that distinguishes normal X-ray images from infected X-ray images\nand highlights the area considered for the prediction. With the recent\npandemic, there is a need for non-conventional methods that detect diseases\nfaster, a purpose that X-ray imaging serves. Objectives: In the current\nsituation, any infectious viral disease is a potential pandemic, so there is a\nneed for a cheap and early detection system. Methods: This research helps to\nease the work of experts in further analysis. The accuracy of three preexisting\nmodels, DenseNet, MobileNet and VGG16, was high, but the models over-fitted,\nprimarily due to the black-and-white images. Results: This led to building a\nnew method, V-BreathNet, which gave more than 96% accuracy. Conclusion: Thus,\nit can be stated that not all state-of-the-art CNN models can be used on B/W\nimages.\n","authors":["Nagullas KS","Vivekanand. 
V","Narayana Darapaneni","Anwesh R P"],"pdf_url":"https://arxiv.org/pdf/2404.04635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04629v1","updated":"2024-04-06T13:25:29Z","published":"2024-04-06T13:25:29Z","title":"DifFUSER: Diffusion Model for Robust Multi-Sensor Fusion in 3D Object\n Detection and BEV Segmentation","summary":" Diffusion models have recently gained prominence as powerful deep generative\nmodels, demonstrating unmatched performance across various domains. However,\ntheir potential in multi-sensor fusion remains largely unexplored. In this\nwork, we introduce DifFUSER, a novel approach that leverages diffusion models\nfor multi-modal fusion in 3D object detection and BEV map segmentation.\nBenefiting from the inherent denoising property of diffusion, DifFUSER is able\nto refine or even synthesize sensor features in case of sensor malfunction,\nthereby improving the quality of the fused output. In terms of architecture,\nour DifFUSER blocks are chained together in a hierarchical BiFPN fashion,\ntermed cMini-BiFPN, offering an alternative architecture for latent diffusion.\nWe further introduce a Gated Self-conditioned Modulated (GSM) latent diffusion\nmodule together with a Progressive Sensor Dropout Training (PSDT) paradigm,\ndesigned to add stronger conditioning to the diffusion process and robustness\nto sensor failures. Our extensive evaluations on the Nuscenes dataset reveal\nthat DifFUSER not only achieves state-of-the-art performance with a 69.1% mIOU\nin BEV map segmentation tasks but also competes effectively with leading\ntransformer-based fusion techniques in 3D object detection.\n","authors":["Duy-Tho Le","Hengcan Shi","Jianfei Cai","Hamid Rezatofighi"],"pdf_url":"https://arxiv.org/pdf/2404.04629v1.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2404.04627v1","updated":"2024-04-06T13:25:00Z","published":"2024-04-06T13:25:00Z","title":"Self-Training Large Language Models for Improved Visual Program\n Synthesis With Visual Reinforcement","summary":" Visual program synthesis is a promising approach to exploit the reasoning\nabilities of large language models for compositional computer vision tasks.\nPrevious work has used few-shot prompting with frozen LLMs to synthesize visual\nprograms. Training an LLM to write better visual programs is an attractive\nprospect, but it is unclear how to accomplish this. No dataset of visual\nprograms for training exists, and acquisition of a visual program dataset\ncannot be easily crowdsourced due to the need for expert annotators. To get\naround the lack of direct supervision, we explore improving the program\nsynthesis abilities of an LLM using feedback from interactive experience. We\npropose a method where we exploit existing annotations for a vision-language\ntask to improvise a coarse reward signal for that task, treat the LLM as a\npolicy, and apply reinforced self-training to improve the visual program\nsynthesis ability of the LLM for that task. We describe a series of experiments\non object detection, compositional visual question answering, and image-text\nretrieval, and show that in each case, the self-trained LLM outperforms or\nperforms on par with few-shot frozen LLMs that are an order of magnitude\nlarger. 
Website: https://zaidkhan.me/ViReP\n","authors":["Zaid Khan","Vijay Kumar BG","Samuel Schulter","Yun Fu","Manmohan Chandraker"],"pdf_url":"https://arxiv.org/pdf/2404.04627v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04624v1","updated":"2024-04-06T13:14:04Z","published":"2024-04-06T13:14:04Z","title":"Bridging the Gap Between End-to-End and Two-Step Text Spotting","summary":" Modularity plays a crucial role in the development and maintenance of complex\nsystems. While end-to-end text spotting efficiently mitigates the issues of\nerror accumulation and sub-optimal performance seen in traditional two-step\nmethodologies, the two-step methods continue to be favored in many competitions\nand practical settings due to their superior modularity. In this paper, we\nintroduce Bridging Text Spotting, a novel approach that resolves the error\naccumulation and suboptimal performance issues in two-step methods while\nretaining modularity. To achieve this, we adopt a well-trained detector and\nrecognizer that are developed and trained independently and then lock their\nparameters to preserve their already acquired capabilities. Subsequently, we\nintroduce a Bridge that connects the locked detector and recognizer through a\nzero-initialized neural network. This zero-initialized neural network,\ninitialized with weights set to zeros, ensures seamless integration of the\nlarge receptive field features in detection into the locked recognizer.\nFurthermore, since the fixed detector and recognizer cannot naturally acquire\nend-to-end optimization features, we adopt the Adapter to facilitate their\nefficient learning of these features. We demonstrate the effectiveness of the\nproposed method through extensive experiments: Connecting the latest detector\nand recognizer through Bridging Text Spotting, we achieved an accuracy of 83.3%\non Total-Text, 69.8% on CTW1500, and 89.5% on ICDAR 2015. The code is available\nat https://github.com/mxin262/Bridging-Text-Spotting.\n","authors":["Mingxin Huang","Hongliang Li","Yuliang Liu","Xiang Bai","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2404.04624v1.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.04619v1","updated":"2024-04-06T12:51:00Z","published":"2024-04-06T12:51:00Z","title":"Do We Really Need a Complex Agent System? Distill Embodied Agent into a\n Single Model","summary":" With the power of large language models (LLMs), open-ended embodied agents\ncan flexibly understand human instructions, generate interpretable guidance\nstrategies, and output executable actions. Nowadays, Multi-modal Language\nModels~(MLMs) integrate multi-modal signals into LLMs, further bringing richer\nperception to entity agents and allowing embodied agents to perceive\nworld-understanding tasks more delicately. However, existing works: 1) operate\nindependently by agents, each containing multiple LLMs, from perception to\naction, resulting in gaps between complex tasks and execution; 2) train MLMs on\nstatic data, struggling with dynamics in open-ended scenarios; 3) input prior\nknowledge directly as prompts, suppressing application flexibility. We propose\nSTEVE-2, a hierarchical knowledge distillation framework for open-ended\nembodied tasks, characterized by 1) a hierarchical system for multi-granular\ntask division, 2) a mirrored distillation method for parallel simulation data,\nand 3) an extra expert model for bringing additional knowledge into parallel\nsimulation. 
After distillation, embodied agents can complete complex,\nopen-ended tasks without additional expert guidance, utilizing the performance\nand knowledge of a versatile MLM. Extensive evaluations on navigation and\ncreation tasks highlight the superior performance of STEVE-2 in open-ended\ntasks, with $1.4 \\times$ - $7.3 \\times$ in performance.\n","authors":["Zhonghan Zhao","Ke Ma","Wenhao Chai","Xuan Wang","Kewei Chen","Dongxu Guo","Yanting Zhang","Hongwei Wang","Gaoang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04619v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2403.08282"},{"id":"http://arxiv.org/abs/2404.04617v1","updated":"2024-04-06T12:50:08Z","published":"2024-04-06T12:50:08Z","title":"Empowering Image Recovery_ A Multi-Attention Approach","summary":" We propose Diverse Restormer (DART), a novel image restoration method that\neffectively integrates information from various sources (long sequences, local\nand global regions, feature dimensions, and positional dimensions) to address\nrestoration challenges. While Transformer models have demonstrated excellent\nperformance in image restoration due to their self-attention mechanism, they\nface limitations in complex scenarios. Leveraging recent advancements in\nTransformers and various attention mechanisms, our method utilizes customized\nattention mechanisms to enhance overall performance. DART, our novel network\narchitecture, employs windowed attention to mimic the selective focusing\nmechanism of human eyes. By dynamically adjusting receptive fields, it\noptimally captures the fundamental features crucial for image resolution\nreconstruction. Efficiency and performance balance are achieved through the\nLongIR attention mechanism for long sequence image restoration. Integration of\nattention mechanisms across feature and positional dimensions further enhances\nthe recovery of fine details. Evaluation across five restoration tasks\nconsistently positions DART at the forefront. Upon acceptance, we commit to\nproviding publicly accessible code and models to ensure reproducibility and\nfacilitate further research.\n","authors":["Juan Wen","Yawei Li","Chao Zhang","Weiyan Hou","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2404.04617v1.pdf","comment":"12 pages, 10 figures, 12 tables"},{"id":"http://arxiv.org/abs/2402.07710v3","updated":"2024-04-06T12:49:43Z","published":"2024-02-12T15:23:19Z","title":"Optimizing Sparse Convolution on GPUs with CUDA for 3D Point Cloud\n Processing in Embedded Systems","summary":" In recent years, there has been a significant increase in the utilization of\ndeep learning methods, particularly convolutional neural networks (CNNs), which\nhave emerged as the dominant approach in various domains that involve\nstructured grid data, such as picture analysis and processing. Nevertheless,\nthe exponential growth in the utilization of LiDAR and 3D sensors across many\ndomains has resulted in an increased need for the analysis of 3D point clouds.\nThe utilization of 3D point clouds is crucial in various applications,\nincluding object recognition and segmentation, as they offer a spatial\ndepiction of things within a three-dimensional environment. 
In contrast to\nphotos, point clouds exhibit sparsity and lack a regular grid, hence posing\ndistinct processing and computational issues.\n","authors":["Chester Luo","Kevin Lai"],"pdf_url":"https://arxiv.org/pdf/2402.07710v3.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2402.09329v3","updated":"2024-04-06T12:46:27Z","published":"2024-02-14T17:18:15Z","title":"YOLOv8-AM: YOLOv8 with Attention Mechanisms for Pediatric Wrist Fracture\n Detection","summary":" Wrist trauma and even fractures occur frequently in daily life, particularly\namong children who account for a significant proportion of fracture cases.\nBefore performing surgery, surgeons often request patients to undergo X-ray\nimaging first and prepare for it based on the analysis of the radiologist. With\nthe development of neural networks, You Only Look Once (YOLO) series models\nhave been widely used in fracture detection as computer-assisted diagnosis\n(CAD). In 2023, Ultralytics presented the latest version of the YOLO models,\nwhich has been employed for detecting fractures across various parts of the\nbody. Attention mechanism is one of the hottest methods to improve the model\nperformance. This research work proposes YOLOv8-AM, which incorporates the\nattention mechanism into the original YOLOv8 architecture. Specifically, we\nrespectively employ four attention modules, Convolutional Block Attention\nModule (CBAM), Global Attention Mechanism (GAM), Efficient Channel Attention\n(ECA), and Shuffle Attention (SA), to design the improved models and train them\non GRAZPEDWRI-DX dataset. Experimental results demonstrate that the mean\nAverage Precision at IoU 50 (mAP 50) of the YOLOv8-AM model based on ResBlock +\nCBAM (ResCBAM) increased from 63.6% to 65.8%, which achieves the\nstate-of-the-art (SOTA) performance. Conversely, YOLOv8-AM model incorporating\nGAM obtains the mAP 50 value of 64.2%, which is not a satisfactory enhancement.\nTherefore, we combine ResBlock and GAM, introducing ResGAM to design another\nnew YOLOv8-AM model, whose mAP 50 value is increased to 65.0%. The\nimplementation code for this study is available on GitHub at\nhttps://github.com/RuiyangJu/Fracture_Detection_Improved_YOLOv8.\n","authors":["Chun-Tse Chien","Rui-Yang Ju","Kuang-Yi Chou","Enkaer Xieerke","Jen-Shiun Chiang"],"pdf_url":"https://arxiv.org/pdf/2402.09329v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07063v3","updated":"2024-04-06T12:36:36Z","published":"2023-12-12T08:32:55Z","title":"Template Free Reconstruction of Human-object Interaction with Procedural\n Interaction Generation","summary":" Reconstructing human-object interaction in 3D from a single RGB image is a\nchallenging task and existing data driven methods do not generalize beyond the\nobjects present in the carefully curated 3D interaction datasets. Capturing\nlarge-scale real data to learn strong interaction and 3D shape priors is very\nexpensive due to the combinatorial nature of human-object interactions. In this\npaper, we propose ProciGen (Procedural interaction Generation), a method to\nprocedurally generate datasets with both, plausible interaction and diverse\nobject variation. We generate 1M+ human-object interaction pairs in 3D and\nleverage this large-scale data to train our HDM (Hierarchical Diffusion Model),\na novel method to reconstruct interacting human and unseen objects, without any\ntemplates. Our HDM is an image-conditioned diffusion model that learns both\nrealistic interaction and highly accurate human and object shapes. 
Experiments\nshow that our HDM trained with ProciGen significantly outperforms prior methods\nthat requires template meshes and that our dataset allows training methods with\nstrong generalization ability to unseen object instances. Our code and data are\nreleased.\n","authors":["Xianghui Xie","Bharat Lal Bhatnagar","Jan Eric Lenssen","Gerard Pons-Moll"],"pdf_url":"https://arxiv.org/pdf/2312.07063v3.pdf","comment":"CVPR'24 camera ready version. 25 pages, 20 figures. Project page:\n https://virtualhumans.mpi-inf.mpg.de/procigen-hdm"},{"id":"http://arxiv.org/abs/2404.04608v1","updated":"2024-04-06T12:27:21Z","published":"2024-04-06T12:27:21Z","title":"Panoptic Perception: A Novel Task and Fine-grained Dataset for Universal\n Remote Sensing Image Interpretation","summary":" Current remote-sensing interpretation models often focus on a single task\nsuch as detection, segmentation, or caption. However, the task-specific\ndesigned models are unattainable to achieve the comprehensive multi-level\ninterpretation of images. The field also lacks support for multi-task joint\ninterpretation datasets. In this paper, we propose Panoptic Perception, a novel\ntask and a new fine-grained dataset (FineGrip) to achieve a more thorough and\nuniversal interpretation for RSIs. The new task, 1) integrates pixel-level,\ninstance-level, and image-level information for universal image perception, 2)\ncaptures image information from coarse to fine granularity, achieving deeper\nscene understanding and description, and 3) enables various independent tasks\nto complement and enhance each other through multi-task learning. By\nemphasizing multi-task interactions and the consistency of perception results,\nthis task enables the simultaneous processing of fine-grained foreground\ninstance segmentation, background semantic segmentation, and global\nfine-grained image captioning. Concretely, the FineGrip dataset includes 2,649\nremote sensing images, 12,054 fine-grained instance segmentation masks\nbelonging to 20 foreground things categories, 7,599 background semantic masks\nfor 5 stuff classes and 13,245 captioning sentences. Furthermore, we propose a\njoint optimization-based panoptic perception model. Experimental results on\nFineGrip demonstrate the feasibility of the panoptic perception task and the\nbeneficial effect of multi-task joint optimization on individual tasks. The\ndataset will be publicly available.\n","authors":["Danpei Zhao","Bo Yuan","Ziqiang Chen","Tian Li","Zhuoran Liu","Wentao Li","Yue Gao"],"pdf_url":"https://arxiv.org/pdf/2404.04608v1.pdf","comment":"Undergoing Review"},{"id":"http://arxiv.org/abs/2403.20002v2","updated":"2024-04-06T11:44:36Z","published":"2024-03-29T06:33:13Z","title":"Grounding and Enhancing Grid-based Models for Neural Fields","summary":" Many contemporary studies utilize grid-based models for neural field\nrepresentation, but a systematic analysis of grid-based models is still\nmissing, hindering the improvement of those models. Therefore, this paper\nintroduces a theoretical framework for grid-based models. This framework points\nout that these models' approximation and generalization behaviors are\ndetermined by grid tangent kernels (GTK), which are intrinsic properties of\ngrid-based models. The proposed framework facilitates a consistent and\nsystematic analysis of diverse grid-based models. Furthermore, the introduced\nframework motivates the development of a novel grid-based model named the\nMultiplicative Fourier Adaptive Grid (MulFAGrid). 
The numerical analysis\ndemonstrates that MulFAGrid exhibits a lower generalization bound than its\npredecessors, indicating its robust generalization performance. Empirical\nstudies reveal that MulFAGrid achieves state-of-the-art performance in various\ntasks, including 2D image fitting, 3D signed distance field (SDF)\nreconstruction, and novel view synthesis, demonstrating superior representation\nability. The project website is available at\nhttps://sites.google.com/view/cvpr24-2034-submission/home.\n","authors":["Zelin Zhao","Fenglei Fan","Wenlong Liao","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2403.20002v2.pdf","comment":"Accepted in CVPR24 as an oral presentation. Pre-rebuttal scores: 555.\n Post-rebuttal scores: 555"},{"id":"http://arxiv.org/abs/2404.04586v1","updated":"2024-04-06T10:50:02Z","published":"2024-04-06T10:50:02Z","title":"PIE: Physics-inspired Low-light Enhancement","summary":" In this paper, we propose a physics-inspired contrastive learning paradigm\nfor low-light enhancement, called PIE. PIE primarily addresses three issues:\n(i) To resolve the problem of existing learning-based methods often training a\nLLE model with strict pixel-correspondence image pairs, we eliminate the need\nfor pixel-correspondence paired training data and instead train with unpaired\nimages. (ii) To address the disregard for negative samples and the inadequacy\nof their generation in existing methods, we incorporate physics-inspired\ncontrastive learning for LLE and design the Bag of Curves (BoC) method to\ngenerate more reasonable negative samples that closely adhere to the underlying\nphysical imaging principle. (iii) To overcome the reliance on semantic ground\ntruths in existing methods, we propose an unsupervised regional segmentation\nmodule, ensuring regional brightness consistency while eliminating the\ndependency on semantic ground truths. Overall, the proposed PIE can effectively\nlearn from unpaired positive/negative samples and smoothly realize non-semantic\nregional enhancement, which is clearly different from existing LLE efforts.\nBesides the novel architecture of PIE, we explore the gain of PIE on downstream\ntasks such as semantic segmentation and face detection. Training on readily\navailable open data and extensive experiments demonstrate that our method\nsurpasses the state-of-the-art LLE models over six independent cross-scenes\ndatasets. PIE runs fast with reasonable GFLOPs in test time, making it easy to\nuse on mobile devices.\n","authors":["Dong Liang","Zhengyan Xu","Ling Li","Mingqiang Wei","Songcan Chen"],"pdf_url":"https://arxiv.org/pdf/2404.04586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04584v1","updated":"2024-04-06T10:45:02Z","published":"2024-04-06T10:45:02Z","title":"D$^3$: Scaling Up Deepfake Detection by Learning from Discrepancy","summary":" The boom of Generative AI brings opportunities entangled with risks and\nconcerns. In this work, we seek a step toward a universal deepfake detection\nsystem with better generalization and robustness, to accommodate the\nresponsible deployment of diverse image generative models. We do so by first\nscaling up the existing detection task setup from the one-generator to\nmultiple-generators in training, during which we disclose two challenges\npresented in prior methodological designs. 
Specifically, we reveal that the\ncurrent methods tailored for training on one specific generator either struggle\nto learn comprehensive artifacts from multiple generators or tend to sacrifice\ntheir ability to identify fake images from seen generators (i.e., In-Domain\nperformance) to exchange the generalization for unseen generators (i.e.,\nOut-Of-Domain performance). To tackle the above challenges, we propose our\nDiscrepancy Deepfake Detector (D$^3$) framework, whose core idea is to learn\nthe universal artifacts from multiple generators by introducing a parallel\nnetwork branch that takes a distorted image as extra discrepancy signal to\nsupplement its original counterpart. Extensive scaled-up experiments on the\nmerged UFD and GenImage datasets with six detection models demonstrate the\neffectiveness of our framework, achieving a 5.3% accuracy improvement in the\nOOD testing compared to the current SOTA methods while maintaining the ID\nperformance.\n","authors":["Yongqi Yang","Zhihao Qian","Ye Zhu","Yu Wu"],"pdf_url":"https://arxiv.org/pdf/2404.04584v1.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.04580v1","updated":"2024-04-06T10:30:31Z","published":"2024-04-06T10:30:31Z","title":"SDFR: Synthetic Data for Face Recognition Competition","summary":" Large-scale face recognition datasets are collected by crawling the Internet\nand without individuals' consent, raising legal, ethical, and privacy concerns.\nWith the recent advances in generative models, recently several works proposed\ngenerating synthetic face recognition datasets to mitigate concerns in\nweb-crawled face recognition datasets. This paper presents the summary of the\nSynthetic Data for Face Recognition (SDFR) Competition held in conjunction with\nthe 18th IEEE International Conference on Automatic Face and Gesture\nRecognition (FG 2024) and established to investigate the use of synthetic data\nfor training face recognition models. The SDFR competition was split into two\ntasks, allowing participants to train face recognition systems using new\nsynthetic datasets and/or existing ones. In the first task, the face\nrecognition backbone was fixed and the dataset size was limited, while the\nsecond task provided almost complete freedom on the model backbone, the\ndataset, and the training pipeline. The submitted models were trained on\nexisting and also new synthetic datasets and used clever methods to improve\ntraining with synthetic data. The submissions were evaluated and ranked on a\ndiverse set of seven benchmarking datasets. The paper gives an overview of the\nsubmitted face recognition models and reports achieved performance compared to\nbaseline models trained on real and synthetic datasets. Furthermore, the\nevaluation of submissions is extended to bias assessment across different\ndemography groups. 
Lastly, an outlook on the current state of the research in\ntraining face recognition models using synthetic data is presented, and\nexisting problems as well as potential future directions are also discussed.\n","authors":["Hatef Otroshi Shahreza","Christophe Ecabert","Anjith George","Alexander Unnervik","Sébastien Marcel","Nicolò Di Domenico","Guido Borghi","Davide Maltoni","Fadi Boutros","Julia Vogel","Naser Damer","Ángela Sánchez-Pérez"," EnriqueMas-Candela","Jorge Calvo-Zaragoza","Bernardo Biesseck","Pedro Vidal","Roger Granada","David Menotti","Ivan DeAndres-Tame","Simone Maurizio La Cava","Sara Concas","Pietro Melzi","Ruben Tolosana","Ruben Vera-Rodriguez","Gianpaolo Perelli","Giulia Orrù","Gian Luca Marcialis","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2404.04580v1.pdf","comment":"The 18th IEEE International Conference on Automatic Face and Gesture\n Recognition (FG 2024)"},{"id":"http://arxiv.org/abs/2404.04578v1","updated":"2024-04-06T10:16:33Z","published":"2024-04-06T10:16:33Z","title":"GLCM-Based Feature Combination for Extraction Model Optimization in\n Object Detection Using Machine Learning","summary":" In the era of modern technology, object detection using the Gray Level\nCo-occurrence Matrix (GLCM) extraction method plays a crucial role in object\nrecognition processes. It finds applications in real-time scenarios such as\nsecurity surveillance and autonomous vehicle navigation, among others.\nComputational efficiency becomes a critical factor in achieving real-time\nobject detection. Hence, there is a need for a detection model with low\ncomplexity and satisfactory accuracy. This research aims to enhance\ncomputational efficiency by selecting appropriate features within the GLCM\nframework. Two classification models, namely K-Nearest Neighbours (K-NN) and\nSupport Vector Machine (SVM), were employed, with the results indicating that\nK-Nearest Neighbours (K-NN) outperforms SVM in terms of computational\ncomplexity. Specifically, K-NN, when utilizing a combination of Correlation,\nEnergy, and Homogeneity features, achieves a 100% accuracy rate with low\ncomplexity. Moreover, when using a combination of Energy and Homogeneity\nfeatures, K-NN attains an almost perfect accuracy level of 99.9889%, while\nmaintaining low complexity. On the other hand, despite SVM achieving 100%\naccuracy in certain feature combinations, its high or very high complexity can\npose challenges, particularly in real-time applications. Therefore, based on\nthe trade-off between accuracy and complexity, the K-NN model with a\ncombination of Correlation, Energy, and Homogeneity features emerges as a more\nsuitable choice for real-time applications that demand high accuracy and low\ncomplexity. This research provides valuable insights for optimizing object\ndetection in various applications requiring both high accuracy and rapid\nresponsiveness.\n","authors":["Florentina Tatrin Kurniati","Daniel HF Manongga","Eko Sediyono","Sri Yulianto Joko Prasetyo","Roy Rudolf Huizen"],"pdf_url":"https://arxiv.org/pdf/2404.04578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00644v2","updated":"2024-04-06T10:10:33Z","published":"2024-03-01T16:25:17Z","title":"Diff-Plugin: Revitalizing Details for Diffusion-based Low-level Tasks","summary":" Diffusion models trained on large-scale datasets have achieved remarkable\nprogress in image synthesis. However, due to the randomness in the diffusion\nprocess, they often struggle with handling diverse low-level tasks that require\ndetails preservation. 
To overcome this limitation, we present a new Diff-Plugin\nframework to enable a single pre-trained diffusion model to generate\nhigh-fidelity results across a variety of low-level tasks. Specifically, we\nfirst propose a lightweight Task-Plugin module with a dual branch design to\nprovide task-specific priors, guiding the diffusion process in preserving image\ncontent. We then propose a Plugin-Selector that can automatically select\ndifferent Task-Plugins based on the text instruction, allowing users to edit\nimages by indicating multiple low-level tasks with natural language. We conduct\nextensive experiments on 8 low-level vision tasks. The results demonstrate the\nsuperiority of Diff-Plugin over existing methods, particularly in real-world\nscenarios. Our ablations further validate that Diff-Plugin is stable,\nschedulable, and supports robust training across different dataset sizes.\n","authors":["Yuhao Liu","Zhanghan Ke","Fang Liu","Nanxuan Zhao","Rynson W. H. Lau"],"pdf_url":"https://arxiv.org/pdf/2403.00644v2.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2212.06872v4","updated":"2024-04-06T09:27:45Z","published":"2022-12-13T19:38:13Z","title":"Comparing the Decision-Making Mechanisms by Transformers and CNNs via\n Explanation Methods","summary":" In order to gain insights about the decision-making of different visual\nrecognition backbones, we propose two methodologies, sub-explanation counting\nand cross-testing, that systematically applies deep explanation algorithms on a\ndataset-wide basis, and compares the statistics generated from the amount and\nnature of the explanations. These methodologies reveal the difference among\nnetworks in terms of two properties called compositionality and disjunctivism.\nTransformers and ConvNeXt are found to be more compositional, in the sense that\nthey jointly consider multiple parts of the image in building their decisions,\nwhereas traditional CNNs and distilled transformers are less compositional and\nmore disjunctive, which means that they use multiple diverse but smaller set of\nparts to achieve a confident prediction. Through further experiments, we\npinpointed the choice of normalization to be especially important in the\ncompositionality of a model, in that batch normalization leads to less\ncompositionality while group and layer normalization lead to more. Finally, we\nalso analyze the features shared by different backbones and plot a landscape of\ndifferent models based on their feature-use similarity.\n","authors":["Mingqi Jiang","Saeed Khorram","Li Fuxin"],"pdf_url":"https://arxiv.org/pdf/2212.06872v4.pdf","comment":"25 pages with 37 figures, to be published in CVPR24"},{"id":"http://arxiv.org/abs/2404.04565v1","updated":"2024-04-06T09:13:03Z","published":"2024-04-06T09:13:03Z","title":"SportsHHI: A Dataset for Human-Human Interaction Detection in Sports\n Videos","summary":" Video-based visual relation detection tasks, such as video scene graph\ngeneration, play important roles in fine-grained video understanding. However,\ncurrent video visual relation detection datasets have two main limitations that\nhinder the progress of research in this area. First, they do not explore\ncomplex human-human interactions in multi-person scenarios. Second, the\nrelation types of existing datasets have relatively low-level semantics and can\nbe often recognized by appearance or simple prior information, without the need\nfor detailed spatio-temporal context reasoning. 
Nevertheless, comprehending\nhigh-level interactions between humans is crucial for understanding complex\nmulti-person videos, such as sports and surveillance videos. To address this\nissue, we propose a new video visual relation detection task: video human-human\ninteraction detection, and build a dataset named SportsHHI for it. SportsHHI\ncontains 34 high-level interaction classes from basketball and volleyball\nsports. 118,075 human bounding boxes and 50,649 interaction instances are\nannotated on 11,398 keyframes. To benchmark this, we propose a two-stage\nbaseline method and conduct extensive experiments to reveal the key factors for\na successful human-human interaction detector. We hope that SportsHHI can\nstimulate research on human interaction understanding in videos and promote the\ndevelopment of spatio-temporal context modeling techniques in video visual\nrelation detection.\n","authors":["Tao Wu","Runyu He","Gangshan Wu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04565v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04564v1","updated":"2024-04-06T09:08:34Z","published":"2024-04-06T09:08:34Z","title":"Enhancing Video Summarization with Context Awareness","summary":" Video summarization is a crucial research area that aims to efficiently\nbrowse and retrieve relevant information from the vast amount of video content\navailable today. With the exponential growth of multimedia data, the ability to\nextract meaningful representations from videos has become essential. Video\nsummarization techniques automatically generate concise summaries by selecting\nkeyframes, shots, or segments that capture the video's essence. This process\nimproves the efficiency and accuracy of various applications, including video\nsurveillance, education, entertainment, and social media. Despite the\nimportance of video summarization, there is a lack of diverse and\nrepresentative datasets, hindering comprehensive evaluation and benchmarking of\nalgorithms. Existing evaluation metrics also fail to fully capture the\ncomplexities of video summarization, limiting accurate algorithm assessment and\nhindering the field's progress. To overcome data scarcity challenges and\nimprove evaluation, we propose an unsupervised approach that leverages video\ndata structure and information for generating informative summaries. By moving\naway from fixed annotations, our framework can produce representative summaries\neffectively. Moreover, we introduce an innovative evaluation pipeline tailored\nspecifically for video summarization. Human participants are involved in the\nevaluation, comparing our generated summaries to ground truth summaries and\nassessing their informativeness. This human-centric approach provides valuable\ninsights into the effectiveness of our proposed techniques. 
Experimental\nresults demonstrate that our training-free framework outperforms existing\nunsupervised approaches and achieves competitive results compared to\nstate-of-the-art supervised methods.\n","authors":["Hai-Dang Huynh-Lam","Ngoc-Phuong Ho-Thi","Minh-Triet Tran","Trung-Nghia Le"],"pdf_url":"https://arxiv.org/pdf/2404.04564v1.pdf","comment":"115 pages, 1 supplementary paper, undergraduate thesis report at\n US-VNUHCM"},{"id":"http://arxiv.org/abs/2404.04562v1","updated":"2024-04-06T09:03:18Z","published":"2024-04-06T09:03:18Z","title":"Diffusion Time-step Curriculum for One Image to 3D Generation","summary":" Score distillation sampling~(SDS) has been widely adopted to overcome the\nabsence of unseen views in reconstructing 3D objects from a \\textbf{single}\nimage. It leverages pre-trained 2D diffusion models as teacher to guide the\nreconstruction of student 3D models. Despite their remarkable success,\nSDS-based methods often encounter geometric artifacts and texture saturation.\nWe find out the crux is the overlooked indiscriminate treatment of diffusion\ntime-steps during optimization: it unreasonably treats the student-teacher\nknowledge distillation to be equal at all time-steps and thus entangles\ncoarse-grained and fine-grained modeling. Therefore, we propose the Diffusion\nTime-step Curriculum one-image-to-3D pipeline (DTC123), which involves both the\nteacher and student models collaborating with the time-step curriculum in a\ncoarse-to-fine manner. Extensive experiments on NeRF4, RealFusion15, GSO and\nLevel50 benchmark demonstrate that DTC123 can produce multi-view consistent,\nhigh-quality, and diverse 3D assets. Codes and more generation demos will be\nreleased in https://github.com/yxymessi/DTC123.\n","authors":["Xuanyu Yi","Zike Wu","Qingshan Xu","Pan Zhou","Joo-Hwee Lim","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04561v1","updated":"2024-04-06T09:01:19Z","published":"2024-04-06T09:01:19Z","title":"Co-Occ: Coupling Explicit Feature Fusion with Volume Rendering\n Regularization for Multi-Modal 3D Semantic Occupancy Prediction","summary":" 3D semantic occupancy prediction is a pivotal task in the field of autonomous\ndriving. Recent approaches have made great advances in 3D semantic occupancy\npredictions on a single modality. However, multi-modal semantic occupancy\nprediction approaches have encountered difficulties in dealing with the\nmodality heterogeneity, modality misalignment, and insufficient modality\ninteractions that arise during the fusion of different modalities data, which\nmay result in the loss of important geometric and semantic information. This\nletter presents a novel multi-modal, i.e., LiDAR-camera 3D semantic occupancy\nprediction framework, dubbed Co-Occ, which couples explicit LiDAR-camera\nfeature fusion with implicit volume rendering regularization. The key insight\nis that volume rendering in the feature space can proficiently bridge the gap\nbetween 3D LiDAR sweeps and 2D images while serving as a physical\nregularization to enhance LiDAR-camera fused volumetric representation.\nSpecifically, we first propose a Geometric- and Semantic-aware Fusion\n(GSFusion) module to explicitly enhance LiDAR features by incorporating\nneighboring camera features through a K-nearest neighbors (KNN) search. Then,\nwe employ volume rendering to project the fused feature back to the image\nplanes for reconstructing color and depth maps. 
These maps are then supervised\nby input images from the camera and depth estimations derived from LiDAR,\nrespectively. Extensive experiments on the popular nuScenes and SemanticKITTI\nbenchmarks verify the effectiveness of our Co-Occ for 3D semantic occupancy\nprediction. The project page is available at\nhttps://rorisis.github.io/Co-Occ_project-page/.\n","authors":["Jingyi Pan","Zipeng Wang","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00906v2","updated":"2024-04-06T08:52:55Z","published":"2024-04-01T04:21:01Z","title":"From Pixels to Graphs: Open-Vocabulary Scene Graph Generation with\n Vision-Language Models","summary":" Scene graph generation (SGG) aims to parse a visual scene into an\nintermediate graph representation for downstream reasoning tasks. Despite\nrecent advancements, existing methods struggle to generate scene graphs with\nnovel visual relation concepts. To address this challenge, we introduce a new\nopen-vocabulary SGG framework based on sequence generation. Our framework\nleverages vision-language pre-trained models (VLM) by incorporating an\nimage-to-graph generation paradigm. Specifically, we generate scene graph\nsequences via image-to-text generation with VLM and then construct scene graphs\nfrom these sequences. By doing so, we harness the strong capabilities of VLM\nfor open-vocabulary SGG and seamlessly integrate explicit relational modeling\nfor enhancing the VL tasks. Experimental results demonstrate that our design\nnot only achieves superior performance with an open vocabulary but also\nenhances downstream vision-language task performance through explicit relation\nmodeling knowledge.\n","authors":["Rongjie Li","Songyang Zhang","Dahua Lin","Kai Chen","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2404.00906v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04557v1","updated":"2024-04-06T08:51:07Z","published":"2024-04-06T08:51:07Z","title":"Learning Instance-Aware Correspondences for Robust Multi-Instance Point\n Cloud Registration in Cluttered Scenes","summary":" Multi-instance point cloud registration estimates the poses of multiple\ninstances of a model point cloud in a scene point cloud. Extracting accurate\npoint correspondence is to the center of the problem. Existing approaches\nusually treat the scene point cloud as a whole, overlooking the separation of\ninstances. Therefore, point features could be easily polluted by other points\nfrom the background or different instances, leading to inaccurate\ncorrespondences oblivious to separate instances, especially in cluttered\nscenes. In this work, we propose MIRETR, Multi-Instance REgistration\nTRansformer, a coarse-to-fine approach to the extraction of instance-aware\ncorrespondences. At the coarse level, it jointly learns instance-aware\nsuperpoint features and predicts per-instance masks. With instance masks, the\ninfluence from outside of the instance being concerned is minimized, such that\nhighly reliable superpoint correspondences can be extracted. The superpoint\ncorrespondences are then extended to instance candidates at the fine level\naccording to the instance masks. At last, an efficient candidate selection and\nrefinement algorithm is devised to obtain the final registrations. Extensive\nexperiments on three public benchmarks demonstrate the efficacy of our\napproach. In particular, MIRETR outperforms the state of the arts by 16.6\npoints on F1 score on the challenging ROBI benchmark. 
Code and models are\navailable at https://github.com/zhiyuanYU134/MIRETR.\n","authors":["Zhiyuan Yu","Zheng Qin","Lintao Zheng","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2404.04557v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04556v1","updated":"2024-04-06T08:45:07Z","published":"2024-04-06T08:45:07Z","title":"Rethinking Self-training for Semi-supervised Landmark Detection: A\n Selection-free Approach","summary":" Self-training is a simple yet effective method for semi-supervised learning,\nduring which pseudo-label selection plays an important role for handling\nconfirmation bias. Despite its popularity, applying self-training to landmark\ndetection faces three problems: 1) The selected confident pseudo-labels often\ncontain data bias, which may hurt model performance; 2) It is not easy to\ndecide a proper threshold for sample selection as the localization task can be\nsensitive to noisy pseudo-labels; 3) coordinate regression does not output\nconfidence, making selection-based self-training infeasible. To address the\nabove issues, we propose Self-Training for Landmark Detection (STLD), a method\nthat does not require explicit pseudo-label selection. Instead, STLD constructs\na task curriculum to deal with confirmation bias, which progressively\ntransitions from more confident to less confident tasks over the rounds of\nself-training. Pseudo pretraining and shrink regression are two essential\ncomponents for such a curriculum, where the former is the first task of the\ncurriculum for providing a better model initialization and the latter is\nfurther added in the later rounds to directly leverage the pseudo-labels in a\ncoarse-to-fine manner. Experiments on three facial and one medical landmark\ndetection benchmark show that STLD outperforms the existing methods\nconsistently in both semi- and omni-supervised settings.\n","authors":["Haibo Jin","Haoxuan Che","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.04556v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.01673v2","updated":"2024-04-06T08:41:46Z","published":"2024-04-02T06:24:21Z","title":"A Universal Knowledge Embedded Contrastive Learning Framework for\n Hyperspectral Image Classification","summary":" Hyperspectral image (HSI) classification techniques have been intensively\nstudied and a variety of models have been developed. However, these HSI\nclassification models are confined to pocket models and unrealistic ways of\ndatasets partitioning. The former limits the generalization performance of the\nmodel and the latter is partitioned leads to inflated model evaluation metrics,\nwhich results in plummeting model performance in the real world. Therefore, we\npropose a universal knowledge embedded contrastive learning framework (KnowCL)\nfor supervised, unsupervised, and semisupervised HSI classification, which\nlargely closes the gap of HSI classification models between pocket models and\nstandard vision backbones. We present a new HSI processing pipeline in\nconjunction with a range of data transformation and augmentation techniques\nthat provide diverse data representations and realistic data partitioning. The\nproposed framework based on this pipeline is compatible with all kinds of\nbackbones and can fully exploit labeled and unlabeled samples with expected\ntraining time. Furthermore, we design a new loss function, which can adaptively\nfuse the supervised loss and unsupervised loss, enhancing the learning\nperformance. 
This proposed new classification paradigm shows great potentials\nin exploring for HSI classification technology. The code can be accessed at\nhttps://github.com/quanweiliu/KnowCL.\n","authors":["Quanwei Liu","Yanni Dong","Tao Huang","Lefei Zhang","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2404.01673v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04550v1","updated":"2024-04-06T08:25:33Z","published":"2024-04-06T08:25:33Z","title":"NPB-REC: A Non-parametric Bayesian Deep-learning Approach for\n Undersampled MRI Reconstruction with Uncertainty Estimation","summary":" The ability to reconstruct high-quality images from undersampled MRI data is\nvital in improving MRI temporal resolution and reducing acquisition times. Deep\nlearning methods have been proposed for this task, but the lack of verified\nmethods to quantify the uncertainty in the reconstructed images hampered\nclinical applicability. We introduce \"NPB-REC\", a non-parametric fully Bayesian\nframework, for MRI reconstruction from undersampled data with uncertainty\nestimation. We use Stochastic Gradient Langevin Dynamics during training to\ncharacterize the posterior distribution of the network parameters. This enables\nus to both improve the quality of the reconstructed images and quantify the\nuncertainty in the reconstructed images. We demonstrate the efficacy of our\napproach on a multi-coil MRI dataset from the fastMRI challenge and compare it\nto the baseline End-to-End Variational Network (E2E-VarNet). Our approach\noutperforms the baseline in terms of reconstruction accuracy by means of PSNR\nand SSIM ($34.55$, $0.908$ vs. $33.08$, $0.897$, $p<0.01$, acceleration rate\n$R=8$) and provides uncertainty measures that correlate better with the\nreconstruction error (Pearson correlation, $R=0.94$ vs. $R=0.91$).\nAdditionally, our approach exhibits better generalization capabilities against\nanatomical distribution shifts (PSNR and SSIM of $32.38$, $0.849$ vs. $31.63$,\n$0.836$, $p<0.01$, training on brain data, inference on knee data, acceleration\nrate $R=8$). NPB-REC has the potential to facilitate the safe utilization of\ndeep learning-based methods for MRI reconstruction from undersampled data. Code\nand trained models are available at \\url{https://github.com/samahkh/NPB-REC}.\n","authors":["Samah Khawaled","Moti Freiman"],"pdf_url":"https://arxiv.org/pdf/2404.04550v1.pdf","comment":"Published in Artificial Intelligence in Medicine, DOI:\n https://doi.org/10.1016/j.artmed.2024.102798 This is an extension\n representing a more comprehensive work extending preliminary work presented\n at arXiv:2208.03966"},{"id":"http://arxiv.org/abs/2404.04546v1","updated":"2024-04-06T08:02:18Z","published":"2024-04-06T08:02:18Z","title":"A self-attention model for robust rigid slice-to-volume registration of\n functional MRI","summary":" Functional Magnetic Resonance Imaging (fMRI) is vital in neuroscience,\nenabling investigations into brain disorders, treatment monitoring, and brain\nfunction mapping. However, head motion during fMRI scans, occurring between\nshots of slice acquisition, can result in distortion, biased analyses, and\nincreased costs due to the need for scan repetitions. Therefore, retrospective\nslice-level motion correction through slice-to-volume registration (SVR) is\ncrucial. Previous studies have utilized deep learning (DL) based models to\naddress the SVR task; however, they overlooked the uncertainty stemming from\nthe input stack of slices and did not assign weighting or scoring to each\nslice. 
In this work, we introduce an end-to-end SVR model for aligning 2D fMRI\nslices with a 3D reference volume, incorporating a self-attention mechanism to\nenhance robustness against input data variations and uncertainties. It utilizes\nindependent slice and volume encoders and a self-attention module to assign\npixel-wise scores for each slice. We conducted evaluation experiments on 200\nimages involving synthetic rigid motion generated from 27 subjects belonging to\nthe test set, from the publicly available Healthy Brain Network (HBN) dataset.\nOur experimental results demonstrate that our model achieves competitive\nperformance in terms of alignment accuracy compared to state-of-the-art deep\nlearning-based methods (Euclidean distance of $0.93$ [mm] vs. $1.86$ [mm]).\nFurthermore, our approach exhibits significantly faster registration speed\ncompared to conventional iterative methods ($0.096$ sec. vs. $1.17$ sec.). Our\nend-to-end SVR model facilitates real-time head motion tracking during fMRI\nacquisition, ensuring reliability and robustness against uncertainties in\ninputs. source code, which includes the training and evaluations, will be\navailable soon.\n","authors":["Samah Khawaled","Simon K. Warfield","Moti Freiman"],"pdf_url":"https://arxiv.org/pdf/2404.04546v1.pdf","comment":"Currently under review"},{"id":"http://arxiv.org/abs/2404.04544v1","updated":"2024-04-06T07:53:49Z","published":"2024-04-06T07:53:49Z","title":"BeyondScene: Higher-Resolution Human-Centric Scene Generation With\n Pretrained Diffusion","summary":" Generating higher-resolution human-centric scenes with details and controls\nremains a challenge for existing text-to-image diffusion models. This challenge\nstems from limited training image size, text encoder capacity (limited tokens),\nand the inherent difficulty of generating complex scenes involving multiple\nhumans. While current methods attempted to address training size limit only,\nthey often yielded human-centric scenes with severe artifacts. We propose\nBeyondScene, a novel framework that overcomes prior limitations, generating\nexquisite higher-resolution (over 8K) human-centric scenes with exceptional\ntext-image correspondence and naturalness using existing pretrained diffusion\nmodels. BeyondScene employs a staged and hierarchical approach to initially\ngenerate a detailed base image focusing on crucial elements in instance\ncreation for multiple humans and detailed descriptions beyond token limit of\ndiffusion model, and then to seamlessly convert the base image to a\nhigher-resolution output, exceeding training image size and incorporating\ndetails aware of text and instances via our novel instance-aware hierarchical\nenlargement process that consists of our proposed high-frequency injected\nforward diffusion and adaptive joint diffusion. BeyondScene surpasses existing\nmethods in terms of correspondence with detailed text descriptions and\nnaturalness, paving the way for advanced applications in higher-resolution\nhuman-centric scene creation beyond the capacity of pretrained diffusion models\nwithout costly retraining. 
Project page:\nhttps://janeyeon.github.io/beyond-scene.\n","authors":["Gwanghyun Kim","Hayeon Kim","Hoigi Seo","Dong Un Kang","Se Young Chun"],"pdf_url":"https://arxiv.org/pdf/2404.04544v1.pdf","comment":"Project page: https://janeyeon.github.io/beyond-scene"},{"id":"http://arxiv.org/abs/2404.03527v2","updated":"2024-04-06T07:49:14Z","published":"2024-04-04T15:31:11Z","title":"HAPNet: Toward Superior RGB-Thermal Scene Parsing via Hybrid,\n Asymmetric, and Progressive Heterogeneous Feature Fusion","summary":" Data-fusion networks have shown significant promise for RGB-thermal scene\nparsing. However, the majority of existing studies have relied on symmetric\nduplex encoders for heterogeneous feature extraction and fusion, paying\ninadequate attention to the inherent differences between RGB and thermal\nmodalities. Recent progress in vision foundation models (VFMs) trained through\nself-supervision on vast amounts of unlabeled data has proven their ability to\nextract informative, general-purpose features. However, this potential has yet\nto be fully leveraged in the domain. In this study, we take one step toward\nthis new research area by exploring a feasible strategy to fully exploit VFM\nfeatures for RGB-thermal scene parsing. Specifically, we delve deeper into the\nunique characteristics of RGB and thermal modalities, thereby designing a\nhybrid, asymmetric encoder that incorporates both a VFM and a convolutional\nneural network. This design allows for more effective extraction of\ncomplementary heterogeneous features, which are subsequently fused in a\ndual-path, progressive manner. Moreover, we introduce an auxiliary task to\nfurther enrich the local semantics of the fused features, thereby improving the\noverall performance of RGB-thermal scene parsing. Our proposed HAPNet, equipped\nwith all these components, demonstrates superior performance compared to all\nother state-of-the-art RGB-thermal scene parsing networks, achieving top ranks\nacross three widely used public RGB-thermal scene parsing datasets. We believe\nthis new paradigm has opened up new opportunities for future developments in\ndata-fusion scene parsing approaches.\n","authors":["Jiahang Li","Peng Yun","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2404.03527v2.pdf","comment":"12 pages, 4figures"},{"id":"http://arxiv.org/abs/2404.04531v1","updated":"2024-04-06T07:13:49Z","published":"2024-04-06T07:13:49Z","title":"Frequency Decomposition-Driven Unsupervised Domain Adaptation for Remote\n Sensing Image Semantic Segmentation","summary":" Cross-domain semantic segmentation of remote sensing (RS) imagery based on\nunsupervised domain adaptation (UDA) techniques has significantly advanced\ndeep-learning applications in the geosciences. Recently, with its ingenious and\nversatile architecture, the Transformer model has been successfully applied in\nRS-UDA tasks. However, existing UDA methods mainly focus on domain alignment in\nthe high-level feature space. It is still challenging to retain cross-domain\nlocal spatial details and global contextual semantics simultaneously, which is\ncrucial for the RS image semantic segmentation task. To address these problems,\nwe propose novel high/low-frequency decomposition (HLFD) techniques to guide\nrepresentation alignment in cross-domain semantic segmentation. Specifically,\nHLFD attempts to decompose the feature maps into high- and low-frequency\ncomponents before performing the domain alignment in the corresponding\nsubspaces. 
Secondly, to further facilitate the alignment of decomposed\nfeatures, we propose a fully global-local generative adversarial network,\nnamely GLGAN, to learn domain-invariant detailed and semantic features across\ndomains by leveraging global-local transformer blocks (GLTBs). By integrating\nHLFD techniques and the GLGAN, a novel UDA framework called FD-GLGAN is\ndeveloped to improve the cross-domain transferability and generalization\ncapability of semantic segmentation models. Extensive experiments on two\nfine-resolution benchmark datasets, namely ISPRS Potsdam and ISPRS Vaihingen,\nhighlight the effectiveness and superiority of the proposed approach as\ncompared to the state-of-the-art UDA methods. The source code for this work\nwill be accessible at https://github.com/sstary/SSRS.\n","authors":["Xianping Ma","Xiaokang Zhang","Xingchen Ding","Man-On Pun","Siwei Ma"],"pdf_url":"https://arxiv.org/pdf/2404.04531v1.pdf","comment":"28 pages, 13 figures"},{"id":"http://arxiv.org/abs/2312.01531v2","updated":"2024-04-06T07:04:29Z","published":"2023-12-03T23:09:38Z","title":"SANeRF-HQ: Segment Anything for NeRF in High Quality","summary":" Recently, the Segment Anything Model (SAM) has showcased remarkable\ncapabilities of zero-shot segmentation, while NeRF (Neural Radiance Fields) has\ngained popularity as a method for various 3D problems beyond novel view\nsynthesis. Though there exist initial attempts to incorporate these two methods\ninto 3D segmentation, they face the challenge of accurately and consistently\nsegmenting objects in complex scenarios. In this paper, we introduce the\nSegment Anything for NeRF in High Quality (SANeRF-HQ) to achieve high-quality\n3D segmentation of any target object in a given scene. SANeRF-HQ utilizes SAM\nfor open-world object segmentation guided by user-supplied prompts, while\nleveraging NeRF to aggregate information from different viewpoints. To overcome\nthe aforementioned challenges, we employ density field and RGB similarity to\nenhance the accuracy of segmentation boundary during the aggregation.\nEmphasizing on segmentation accuracy, we evaluate our method on multiple NeRF\ndatasets where high-quality ground-truths are available or manually annotated.\nSANeRF-HQ shows a significant quality improvement over state-of-the-art methods\nin NeRF object segmentation, provides higher flexibility for object\nlocalization, and enables more consistent object segmentation across multiple\nviews. Results and code are available at the project site:\nhttps://lyclyc52.github.io/SANeRF-HQ/.\n","authors":["Yichen Liu","Benran Hu","Chi-Keung Tang","Yu-Wing Tai"],"pdf_url":"https://arxiv.org/pdf/2312.01531v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04527v1","updated":"2024-04-06T06:49:55Z","published":"2024-04-06T06:49:55Z","title":"VTR: An Optimized Vision Transformer for SAR ATR Acceleration on FPGA","summary":" Synthetic Aperture Radar (SAR) Automatic Target Recognition (ATR) is a key\ntechnique used in military applications like remote-sensing image recognition.\nVision Transformers (ViTs) are the current state-of-the-art in various computer\nvision applications, outperforming their CNN counterparts. 
However, using ViTs\nfor SAR ATR applications is challenging due to (1) standard ViTs require\nextensive training data to generalize well due to their low locality; the\nstandard SAR datasets, however, have a limited number of labeled training data\nwhich reduces the learning capability of ViTs; (2) ViTs have a high parameter\ncount and are computation intensive which makes their deployment on\nresource-constrained SAR platforms difficult. In this work, we develop a\nlightweight ViT model that can be trained directly on small datasets without\nany pre-training by utilizing the Shifted Patch Tokenization (SPT) and Locality\nSelf-Attention (LSA) modules. We directly train this model on SAR datasets\nwhich have limited training samples to evaluate its effectiveness for SAR ATR\napplications. We evaluate our proposed model, that we call VTR (ViT for SAR\nATR), on three widely used SAR datasets: MSTAR, SynthWakeSAR, and GBSAR.\nFurther, we propose a novel FPGA accelerator for VTR, in order to enable\ndeployment for real-time SAR ATR applications.\n","authors":["Sachini Wickramasinghe","Dhruv Parikh","Bingyi Zhang","Rajgopal Kannan","Viktor Prasanna","Carl Busart"],"pdf_url":"https://arxiv.org/pdf/2404.04527v1.pdf","comment":"SPIE DCS 2024"},{"id":"http://arxiv.org/abs/2404.04526v1","updated":"2024-04-06T06:48:16Z","published":"2024-04-06T06:48:16Z","title":"DATENeRF: Depth-Aware Text-based Editing of NeRFs","summary":" Recent advancements in diffusion models have shown remarkable proficiency in\nediting 2D images based on text prompts. However, extending these techniques to\nedit scenes in Neural Radiance Fields (NeRF) is complex, as editing individual\n2D frames can result in inconsistencies across multiple views. Our crucial\ninsight is that a NeRF scene's geometry can serve as a bridge to integrate\nthese 2D edits. Utilizing this geometry, we employ a depth-conditioned\nControlNet to enhance the coherence of each 2D image modification. Moreover, we\nintroduce an inpainting approach that leverages the depth information of NeRF\nscenes to distribute 2D edits across different images, ensuring robustness\nagainst errors and resampling challenges. Our results reveal that this\nmethodology achieves more consistent, lifelike, and detailed edits than\nexisting leading methods for text-driven NeRF scene editing.\n","authors":["Sara Rojas","Julien Philip","Kai Zhang","Sai Bi","Fujun Luan","Bernard Ghanem","Kalyan Sunkavall"],"pdf_url":"https://arxiv.org/pdf/2404.04526v1.pdf","comment":"14 pages, Conference paper, 3D Scene Editing, Neural Rendering,\n Diffusion Models"},{"id":"http://arxiv.org/abs/2404.04518v1","updated":"2024-04-06T06:18:11Z","published":"2024-04-06T06:18:11Z","title":"MedIAnomaly: A comparative study of anomaly detection in medical images","summary":" Anomaly detection (AD) aims at detecting abnormal samples that deviate from\nthe expected normal patterns. Generally, it can be trained on merely normal\ndata without the requirement for abnormal samples, and thereby plays an\nimportant role in the recognition of rare diseases and health screening in the\nmedical domain. Despite numerous related studies, we observe a lack of a fair\nand comprehensive evaluation, which causes some ambiguous conclusions and\nhinders the development of this field. 
This paper focuses on building a\nbenchmark with unified implementation and comparison to address this problem.\nIn particular, seven medical datasets with five image modalities, including\nchest X-rays, brain MRIs, retinal fundus images, dermatoscopic images, and\nhistopathology whole slide images are organized for extensive evaluation.\nTwenty-seven typical AD methods, including reconstruction and self-supervised\nlearning-based methods, are involved in comparison of image-level anomaly\nclassification and pixel-level anomaly segmentation. Furthermore, we for the\nfirst time formally explore the effect of key components in existing methods,\nclearly revealing unresolved challenges and potential future directions. The\ndatasets and code are available at\n\\url{https://github.com/caiyu6666/MedIAnomaly}.\n","authors":["Yu Cai","Weiwen Zhang","Hao Chen","Kwang-Ting Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.04518v1.pdf","comment":"Under submission"},{"id":"http://arxiv.org/abs/2404.04517v1","updated":"2024-04-06T06:15:07Z","published":"2024-04-06T06:15:07Z","title":"Latent-based Diffusion Model for Long-tailed Recognition","summary":" Long-tailed imbalance distribution is a common issue in practical computer\nvision applications. Previous works proposed methods to address this problem,\nwhich can be categorized into several classes: re-sampling, re-weighting,\ntransfer learning, and feature augmentation. In recent years, diffusion models\nhave shown an impressive generation ability in many sub-problems of deep\ncomputer vision. However, its powerful generation has not been explored in\nlong-tailed problems. We propose a new approach, the Latent-based Diffusion\nModel for Long-tailed Recognition (LDMLR), as a feature augmentation method to\ntackle the issue. First, we encode the imbalanced dataset into features using\nthe baseline model. Then, we train a Denoising Diffusion Implicit Model (DDIM)\nusing these encoded features to generate pseudo-features. Finally, we train the\nclassifier using the encoded and pseudo-features from the previous two steps.\nThe model's accuracy shows an improvement on the CIFAR-LT and ImageNet-LT\ndatasets by using the proposed method.\n","authors":["Pengxiao Han","Changkun Ye","Jieming Zhou","Jing Zhang","Jie Hong","Xuesong Li"],"pdf_url":"https://arxiv.org/pdf/2404.04517v1.pdf","comment":"8 pages, 3 figures, accepted by L3DIVU-CVPR2024"},{"id":"http://arxiv.org/abs/2404.04511v1","updated":"2024-04-06T05:55:14Z","published":"2024-04-06T05:55:14Z","title":"Cluster-based Video Summarization with Temporal Context Awareness","summary":" In this paper, we present TAC-SUM, a novel and efficient training-free\napproach for video summarization that addresses the limitations of existing\ncluster-based models by incorporating temporal context. Our method partitions\nthe input video into temporally consecutive segments with clustering\ninformation, enabling the injection of temporal awareness into the clustering\nprocess, setting it apart from prior cluster-based summarization methods. The\nresulting temporal-aware clusters are then utilized to compute the final\nsummary, using simple rules for keyframe selection and frame importance\nscoring. Experimental results on the SumMe dataset demonstrate the\neffectiveness of our proposed approach, outperforming existing unsupervised\nmethods and achieving comparable performance to state-of-the-art supervised\nsummarization techniques. 
Our source code is available for reference at\n\\url{https://github.com/hcmus-thesis-gulu/TAC-SUM}.\n","authors":["Hai-Dang Huynh-Lam","Ngoc-Phuong Ho-Thi","Minh-Triet Tran","Trung-Nghia Le"],"pdf_url":"https://arxiv.org/pdf/2404.04511v1.pdf","comment":"14 pages, 6 figures, accepted in PSIVT 2023"},{"id":"http://arxiv.org/abs/2212.02190v3","updated":"2024-04-06T04:59:12Z","published":"2022-12-05T11:54:12Z","title":"L2SR: Learning to Sample and Reconstruct for Accelerated MRI via\n Reinforcement Learning","summary":" Magnetic Resonance Imaging (MRI) is a widely used medical imaging technique,\nbut its long acquisition time can be a limiting factor in clinical settings. To\naddress this issue, researchers have been exploring ways to reduce the\nacquisition time while maintaining the reconstruction quality. Previous works\nhave focused on finding either sparse samplers with a fixed reconstructor or\nfinding reconstructors with a fixed sampler. However, these approaches do not\nfully utilize the potential of joint learning of samplers and reconstructors.\nIn this paper, we propose an alternating training framework for jointly\nlearning a good pair of samplers and reconstructors via deep reinforcement\nlearning (RL). In particular, we consider the process of MRI sampling as a\nsampling trajectory controlled by a sampler, and introduce a novel\nsparse-reward Partially Observed Markov Decision Process (POMDP) to formulate\nthe MRI sampling trajectory. Compared to the dense-reward POMDP used in\nexisting works, the proposed sparse-reward POMDP is more computationally\nefficient and has a provable advantage. Moreover, the proposed framework,\ncalled L2SR (Learning to Sample and Reconstruct), overcomes the training\nmismatch problem that arises in previous methods that use dense-reward POMDP.\nBy alternately updating samplers and reconstructors, L2SR learns a pair of\nsamplers and reconstructors that achieve state-of-the-art reconstruction\nperformances on the fastMRI dataset. Codes are available at\n\\url{https://github.com/yangpuPKU/L2SR-Learning-to-Sample-and-Reconstruct}.\n","authors":["Pu Yang","Bin Dong"],"pdf_url":"https://arxiv.org/pdf/2212.02190v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04492v1","updated":"2024-04-06T03:48:29Z","published":"2024-04-06T03:48:29Z","title":"Automated Lane Change Behavior Prediction and Environmental Perception\n Based on SLAM Technology","summary":" In addition to environmental perception sensors such as cameras, radars, etc.\nin the automatic driving system, the external environment of the vehicle is\nperceived, in fact, there is also a perception sensor that has been silently\ndedicated in the system, that is, the positioning module. This paper explores\nthe application of SLAM (Simultaneous Localization and Mapping) technology in\nthe context of automatic lane change behavior prediction and environment\nperception for autonomous vehicles. It discusses the limitations of traditional\npositioning methods, introduces SLAM technology, and compares LIDAR SLAM with\nvisual SLAM. Real-world examples from companies like Tesla, Waymo, and Mobileye\nshowcase the integration of AI-driven technologies, sensor fusion, and SLAM in\nautonomous driving systems. The paper then delves into the specifics of SLAM\nalgorithms, sensor technologies, and the importance of automatic lane changes\nin driving safety and efficiency. It highlights Tesla's recent update to its\nAutopilot system, which incorporates automatic lane change functionality using\nSLAM technology. 
The paper concludes by emphasizing the crucial role of SLAM in\nenabling accurate environment perception, positioning, and decision-making for\nautonomous vehicles, ultimately enhancing safety and driving experience.\n","authors":["Han Lei","Baoming Wang","Zuwei Shui","Peiyuan Yang","Penghao Liang"],"pdf_url":"https://arxiv.org/pdf/2404.04492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04483v1","updated":"2024-04-06T03:25:24Z","published":"2024-04-06T03:25:24Z","title":"FastHDRNet: A new efficient method for SDR-to-HDR Translation","summary":" Modern displays nowadays possess the capability to render video content with\na high dynamic range (HDR) and an extensive color gamut (WCG).However, the\nmajority of available resources are still in standard dynamic range(SDR).\nTherefore, we need to identify an effective methodology for this objective.The\nexisting deep neural network (DNN) based SDR(Standard dynamic range) to HDR\n(High dynamic range) conversion methods outperform conventional methods, but\nthey are either too large to implement or generate some terrible artifacts. We\npropose a neural network for SDRTV to HDRTV conversion, termed \"FastHDRNet\".\nThis network includes two parts, Adaptive Universal Color Transformation and\nLocal Enhancement.The architecture is designed as a lightweight network that\nutilizes global statistics and local information with super high efficiency.\nAfter the experiment, we find that our proposed method achieve state-of-the-art\nperformance in both quantitative comparisons and visual quality with a\nlightweight structure and a enhanced infer speed.\n","authors":["Siyuan Tian","Hao Wang","Yiren Rong","Junhao Wang","Renjie Dai","Zhengxiao He"],"pdf_url":"https://arxiv.org/pdf/2404.04483v1.pdf","comment":"16 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.00241v4","updated":"2024-04-06T03:24:26Z","published":"2023-12-30T14:11:08Z","title":"Image Super-resolution Reconstruction Network based on Enhanced Swin\n Transformer via Alternating Aggregation of Local-Global Features","summary":" The Swin Transformer image super-resolution reconstruction network only\nrelies on the long-range relationship of window attention and shifted window\nattention to explore features. This mechanism has two limitations. On the one\nhand, it only focuses on global features while ignoring local features. On the\nother hand, it is only concerned with spatial feature interactions while\nignoring channel features and channel interactions, thus limiting its\nnon-linear mapping ability. To address the above limitations, this paper\nproposes enhanced Swin Transformer modules via alternating aggregation of\nlocal-global features. In the local feature aggregation stage, we introduce a\nshift convolution to realize the interaction between local spatial information\nand channel information. Then, a block sparse global perception module is\nintroduced in the global feature aggregation stage. In this module, we\nreorganize the spatial information first, then send the recombination\ninformation into a dense layer to implement the global perception. After that,\na multi-scale self-attention module and a low-parameter residual channel\nattention module are introduced to realize information aggregation at different\nscales. Finally, the proposed network is validated on five publicly available\ndatasets. 
The experimental results show that the proposed network outperforms\nthe other state-of-the-art super-resolution networks.\n","authors":["Yuming Huang","Yingpin Chen","Changhui Wu","Hanrong Xie","Binhui Song","Hui Wang"],"pdf_url":"https://arxiv.org/pdf/2401.00241v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09560v2","updated":"2024-04-06T03:17:33Z","published":"2023-10-14T11:03:04Z","title":"You Only Train Once: A Unified Framework for Both Full-Reference and\n No-Reference Image Quality Assessment","summary":" Although recent efforts in image quality assessment (IQA) have achieved\npromising performance, there still exists a considerable gap compared to the\nhuman visual system (HVS). One significant disparity lies in humans' seamless\ntransition between full reference (FR) and no reference (NR) tasks, whereas\nexisting models are constrained to either FR or NR tasks. This disparity\nimplies the necessity of designing two distinct systems, thereby greatly\ndiminishing the model's versatility. Therefore, our focus lies in unifying FR\nand NR IQA under a single framework. Specifically, we first employ an encoder\nto extract multi-level features from input images. Then a Hierarchical\nAttention (HA) module is proposed as a universal adapter for both FR and NR\ninputs to model the spatial distortion at each encoder stage. Furthermore,\nconsidering that different distortions contaminate encoder stages and damage\nimage semantic meaning differently, a Semantic Distortion Aware (SDA) module is\nproposed to examine feature correlations between shallow and deep layers of the\nencoder. By adopting HA and SDA, the proposed network can effectively perform\nboth FR and NR IQA. When our proposed model is independently trained on NR or\nFR IQA tasks, it outperforms existing models and achieves state-of-the-art\nperformance. Moreover, when trained jointly on NR and FR IQA tasks, it further\nenhances the performance of NR IQA while achieving on-par performance in the\nstate-of-the-art FR IQA. You only train once to perform both IQA tasks. Code\nwill be released at: https://github.com/BarCodeReader/YOTO.\n","authors":["Yi Ke Yun","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2310.09560v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04478v1","updated":"2024-04-06T02:54:35Z","published":"2024-04-06T02:54:35Z","title":"Diffusion-RWKV: Scaling RWKV-Like Architectures for Diffusion Models","summary":" Transformers have catalyzed advancements in computer vision and natural\nlanguage processing (NLP) fields. However, substantial computational complexity\nposes limitations for their application in long-context tasks, such as\nhigh-resolution image generation. This paper introduces a series of\narchitectures adapted from the RWKV model used in the NLP, with requisite\nmodifications tailored for diffusion model applied to image generation tasks,\nreferred to as Diffusion-RWKV. Similar to the diffusion with Transformers, our\nmodel is designed to efficiently handle patchnified inputs in a sequence with\nextra conditions, while also scaling up effectively, accommodating both\nlarge-scale parameters and extensive datasets. Its distinctive advantage\nmanifests in its reduced spatial aggregation complexity, rendering it\nexceptionally adept at processing high-resolution images, thereby eliminating\nthe necessity for windowing or group cached operations. 
Experimental results on\nboth condition and unconditional image generation tasks demonstrate that\nDiffison-RWKV achieves performance on par with or surpasses existing CNN or\nTransformer-based diffusion models in FID and IS metrics while significantly\nreducing total computation FLOP usage.\n","authors":["Zhengcong Fei","Mingyuan Fan","Changqian Yu","Debang Li","Junshi Huang"],"pdf_url":"https://arxiv.org/pdf/2404.04478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05357v2","updated":"2024-04-06T02:42:44Z","published":"2023-12-08T20:34:37Z","title":"Filtering Pixel Latent Variables for Unmixing Noisy and Undersampled\n Volumetric Images","summary":" The development of robust signal unmixing algorithms is essential for\nleveraging multimodal datasets acquired through a wide array of scientific\nimaging technologies, including hyperspectral or time-resolved acquisitions. In\nexperimental physics, enhancing the spatio-temporal resolution or expanding the\nnumber of detection channels often leads to diminished sampling rate and\nsignal-to-noise ratio, significantly affecting the efficacy of signal unmixing\nalgorithms. We propose applying band-pass filters to the latent space of a\nmulti-dimensional convolutional neural network to disentangle overlapping\nsignal components, enabling the isolation and quantification of their\nindividual contributions. Using multi-dimensional convolution kernels to\nprocess all dimensions simultaneously enhances the network's ability to extract\ninformation from adjacent pixels, time- or spectral-bins. This approach enables\nmore effective separation of components in cases where individual pixels do not\nprovide clear, well-resolved information. We showcase the method's practical\nuse in experimental physics through two test cases that highlight the\nversatility of our approach: fluorescence lifetime microscopy and mode\ndecomposition in optical fibers. The latent unmixing method extracts valuable\ninformation from complex signals that cannot be resolved by standard methods.\nApplication of latent unmixing to real FLIM experiments will increase the\nnumber of distinguishable fluorescent markers. It will also open new\npossibilities in optics and photonics for multichannel separations at increased\nsampling rate.\n","authors":["Catherine Bouchard","Andréanne Deschênes","Vincent Boulanger","Jean-Michel Bellavance","Flavie Lavoie-Cardinal","Christian Gagné"],"pdf_url":"https://arxiv.org/pdf/2312.05357v2.pdf","comment":"16 pages, 8 figures (main paper) + 18 pages, 9 figures (supplementary\n material)"},{"id":"http://arxiv.org/abs/2404.04476v1","updated":"2024-04-06T02:33:04Z","published":"2024-04-06T02:33:04Z","title":"DELTA: Decoupling Long-Tailed Online Continual Learning","summary":" A significant challenge in achieving ubiquitous Artificial Intelligence is\nthe limited ability of models to rapidly learn new information in real-world\nscenarios where data follows long-tailed distributions, all while avoiding\nforgetting previously acquired knowledge. In this work, we study the\nunder-explored problem of Long-Tailed Online Continual Learning (LTOCL), which\naims to learn new tasks from sequentially arriving class-imbalanced data\nstreams. Each data is observed only once for training without knowing the task\ndata distribution. We present DELTA, a decoupled learning approach designed to\nenhance learning representations and address the substantial imbalance in\nLTOCL. 
We enhance the learning process by adapting supervised contrastive\nlearning to attract similar samples and repel dissimilar (out-of-class)\nsamples. Further, by balancing gradients during training using an equalization\nloss, DELTA significantly enhances learning outcomes and successfully mitigates\ncatastrophic forgetting. Through extensive evaluation, we demonstrate that\nDELTA improves the capacity for incremental learning, surpassing existing OCL\nmethods. Our results suggest considerable promise for applying OCL in\nreal-world applications.\n","authors":["Siddeshwar Raghavan","Jiangpeng He","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.04476v1.pdf","comment":"CVPR Workshop acceptance archival track"},{"id":"http://arxiv.org/abs/2404.04474v1","updated":"2024-04-06T02:08:40Z","published":"2024-04-06T02:08:40Z","title":"RoNet: Rotation-oriented Continuous Image Translation","summary":" The generation of smooth and continuous images between domains has recently\ndrawn much attention in image-to-image (I2I) translation. Linear relationship\nacts as the basic assumption in most existing approaches, while applied to\ndifferent aspects including features, models or labels. However, the linear\nassumption is hard to conform with the element dimension increases and suffers\nfrom the limit that having to obtain both ends of the line. In this paper, we\npropose a novel rotation-oriented solution and model the continuous generation\nwith an in-plane rotation over the style representation of an image, achieving\na network named RoNet. A rotation module is implanted in the generation network\nto automatically learn the proper plane while disentangling the content and the\nstyle of an image. To encourage realistic texture, we also design a patch-based\nsemantic style loss that learns the different styles of the similar object in\ndifferent domains. We conduct experiments on forest scenes (where the complex\ntexture makes the generation very challenging), faces, streetscapes and the\niphone2dslr task. The results validate the superiority of our method in terms\nof visual quality and continuity.\n","authors":["Yi Li","Xin Xie","Lina Lei","Haiyan Fu","Yanqing Guo"],"pdf_url":"https://arxiv.org/pdf/2404.04474v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2404.04469v1","updated":"2024-04-06T01:54:17Z","published":"2024-04-06T01:54:17Z","title":"Mixed-Query Transformer: A Unified Image Segmentation Architecture","summary":" Existing unified image segmentation models either employ a unified\narchitecture across multiple tasks but use separate weights tailored to each\ndataset, or apply a single set of weights to multiple datasets but are limited\nto a single task. In this paper, we introduce the Mixed-Query Transformer\n(MQ-Former), a unified architecture for multi-task and multi-dataset image\nsegmentation using a single set of weights. To enable this, we propose a mixed\nquery strategy, which can effectively and dynamically accommodate different\ntypes of objects without heuristic designs. In addition, the unified\narchitecture allows us to use data augmentation with synthetic masks and\ncaptions to further improve model generalization. 
Experiments demonstrate that\nMQ-Former can not only effectively handle multiple segmentation datasets and\ntasks compared to specialized state-of-the-art models with competitive\nperformance, but also generalize better to open-set segmentation tasks,\nevidenced by over 7 points higher performance than the prior art on the\nopen-vocabulary SeginW benchmark.\n","authors":["Pei Wang","Zhaowei Cai","Hao Yang","Ashwin Swaminathan","R. Manmatha","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2404.04469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10671v3","updated":"2024-04-06T01:45:45Z","published":"2023-12-17T10:07:03Z","title":"Open3DIS: Open-Vocabulary 3D Instance Segmentation with 2D Mask Guidance","summary":" We introduce Open3DIS, a novel solution designed to tackle the problem of\nOpen-Vocabulary Instance Segmentation within 3D scenes. Objects within 3D\nenvironments exhibit diverse shapes, scales, and colors, making precise\ninstance-level identification a challenging task. Recent advancements in\nOpen-Vocabulary scene understanding have made significant strides in this area\nby employing class-agnostic 3D instance proposal networks for object\nlocalization and learning queryable features for each 3D mask. While these\nmethods produce high-quality instance proposals, they struggle with identifying\nsmall-scale and geometrically ambiguous objects. The key idea of our method is\na new module that aggregates 2D instance masks across frames and maps them to\ngeometrically coherent point cloud regions as high-quality object proposals\naddressing the above limitations. These are then combined with 3D\nclass-agnostic instance proposals to include a wide range of objects in the\nreal world. To validate our approach, we conducted experiments on three\nprominent datasets, including ScanNet200, S3DIS, and Replica, demonstrating\nsignificant performance gains in segmenting objects with diverse categories\nover the state-of-the-art approaches.\n","authors":["Phuc D. A. Nguyen","Tuan Duc Ngo","Evangelos Kalogerakis","Chuang Gan","Anh Tran","Cuong Pham","Khoi Nguyen"],"pdf_url":"https://arxiv.org/pdf/2312.10671v3.pdf","comment":"CVPR 2024. Project page: https://open3dis.github.io/"},{"id":"http://arxiv.org/abs/2404.04465v1","updated":"2024-04-06T01:23:23Z","published":"2024-04-06T01:23:23Z","title":"Aligning Diffusion Models by Optimizing Human Utility","summary":" We present Diffusion-KTO, a novel approach for aligning text-to-image\ndiffusion models by formulating the alignment objective as the maximization of\nexpected human utility. Since this objective applies to each generation\nindependently, Diffusion-KTO does not require collecting costly pairwise\npreference data nor training a complex reward model. Instead, our objective\nrequires simple per-image binary feedback signals, e.g. likes or dislikes,\nwhich are abundantly available. After fine-tuning using Diffusion-KTO,\ntext-to-image diffusion models exhibit superior performance compared to\nexisting techniques, including supervised fine-tuning and Diffusion-DPO, both\nin terms of human judgment and automatic evaluation metrics such as PickScore\nand ImageReward. 
Overall, Diffusion-KTO unlocks the potential of leveraging\nreadily available per-image binary signals and broadens the applicability of\naligning text-to-image diffusion models with human preferences.\n","authors":["Shufan Li","Konstantinos Kallidromitis","Akash Gokul","Yusuke Kato","Kazuki Kozuka"],"pdf_url":"https://arxiv.org/pdf/2404.04465v1.pdf","comment":"27 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.04461v1","updated":"2024-04-06T01:07:38Z","published":"2024-04-06T01:07:38Z","title":"Automated Polyp Segmentation in Colonoscopy Images","summary":" It is important to find the polyps in a human system that helps to prevent\ncancer during medical diagnosis. This research discusses using a dilated\nconvolution module along with a criss cross attention-based network to segment\npolyps from the endoscopic images of the colon. To gather the context\ninformation of all pixels in an image more efficiently, criss-cross attention\nmodule has played a vital role. In order to extract maximum information from\ndataset, data augmentation techniques are employed in the dataset. Rotations,\nflips, scaling, and contrast along with varying learning rates were implemented\nto make a better model. Global average pooling was applied over ResNet50 that\nhelped to store the important details of encoder. In our experiment, the\nproposed architecture's performance was compared with existing models like\nU-Net, DeepLabV3, PraNet. This architecture outperformed other models on the\nsubset of dataset which has irregular polyp shapes. The combination of dilated\nconvolution module, RCCA, and global average pooling was found to be effective\nfor irregular shapes. Our architecture demonstrates an enhancement, with an\naverage improvement of 3.75% across all metrics when compared to existing\nmodels.\n","authors":["Swagat Ranjit","Jian Zhang","Bijaya B. Karki"],"pdf_url":"https://arxiv.org/pdf/2404.04461v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2404.04458v1","updated":"2024-04-06T00:33:39Z","published":"2024-04-06T00:33:39Z","title":"JRDB-Social: A Multifaceted Robotic Dataset for Understanding of Context\n and Dynamics of Human Interactions Within Social Groups","summary":" Understanding human social behaviour is crucial in computer vision and\nrobotics. Micro-level observations like individual actions fall short,\nnecessitating a comprehensive approach that considers individual behaviour,\nintra-group dynamics, and social group levels for a thorough understanding. To\naddress dataset limitations, this paper introduces JRDB-Social, an extension of\nJRDB. Designed to fill gaps in human understanding across diverse indoor and\noutdoor social contexts, JRDB-Social provides annotations at three levels:\nindividual attributes, intra-group interactions, and social group context. This\ndataset aims to enhance our grasp of human social dynamics for robotic\napplications. Utilizing the recent cutting-edge multi-modal large language\nmodels, we evaluated our benchmark to explore their capacity to decipher social\nhuman behaviour.\n","authors":["Simindokht Jahangard","Zhixi Cai","Shiki Wen","Hamid Rezatofighi"],"pdf_url":"https://arxiv.org/pdf/2404.04458v1.pdf","comment":"Accepted by CVPR 2024. 
Project page:\n https://jrdb.erc.monash.edu/dataset/social"},{"id":"http://arxiv.org/abs/2404.04456v1","updated":"2024-04-06T00:04:19Z","published":"2024-04-06T00:04:19Z","title":"Beyond the Known: Adversarial Autoencoders in Novelty Detection","summary":" In novelty detection, the goal is to decide if a new data point should be\ncategorized as an inlier or an outlier, given a training dataset that primarily\ncaptures the inlier distribution. Recent approaches typically use deep encoder\nand decoder network frameworks to derive a reconstruction error, and employ\nthis error either to determine a novelty score, or as the basis for a one-class\nclassifier. In this research, we use a similar framework but with a lightweight\ndeep network, and we adopt a probabilistic score with reconstruction error. Our\nmethodology calculates the probability of whether the sample comes from the\ninlier distribution or not. This work makes two key contributions. The first is\nthat we compute the novelty probability by linearizing the manifold that holds\nthe structure of the inlier distribution. This allows us to interpret how the\nprobability is distributed and can be determined in relation to the local\ncoordinates of the manifold tangent space. The second contribution is that we\nimprove the training protocol for the network. Our results indicate that our\napproach is effective at learning the target class, and it outperforms recent\nstate-of-the-art methods on several benchmark datasets.\n","authors":["Muhammad Asad","Ihsan Ullah","Ganesh Sistu","Michael G. Madden"],"pdf_url":"https://arxiv.org/pdf/2404.04456v1.pdf","comment":"Accepted at the VISAAP 2024"},{"id":"http://arxiv.org/abs/2404.05764v1","updated":"2024-04-06T16:10:48Z","published":"2024-04-06T16:10:48Z","title":"Study of the effect of Sharpness on Blind Video Quality Assessment","summary":" Introduction: Video Quality Assessment (VQA) is one of the important areas of\nstudy in this modern era, where video is a crucial component of communication\nwith applications in every field. Rapid technology developments in mobile\ntechnology enabled anyone to create videos resulting in a varied range of video\nquality scenarios. Objectives: Though VQA was present for some time with the\nclassical metrices like SSIM and PSNR, the advent of machine learning has\nbrought in new techniques of VQAs which are built upon Convolutional Neural\nNetworks (CNNs) or Deep Neural Networks (DNNs). Methods: Over the past years\nvarious research studies such as the BVQA which performed video quality\nassessment of nature-based videos using DNNs exposed the powerful capabilities\nof machine learning algorithms. BVQA using DNNs explored human visual system\neffects such as content dependency and time-related factors normally known as\ntemporal effects. Results: This study explores the sharpness effect on models\nlike BVQA. Sharpness is the measure of the clarity and details of the video\nimage. Sharpness typically involves analyzing the edges and contrast of the\nimage to determine the overall level of detail and sharpness. Conclusion: This\nstudy uses the existing video quality databases such as CVD2014. 
A comparative\nstudy of the various machine learning parameters such as SRCC and PLCC during\nthe training and testing are presented along with the conclusion.\n","authors":["Anantha Prabhu","David Pratap","Narayana Darapeni","Anwesh P R"],"pdf_url":"https://arxiv.org/pdf/2404.05764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04654v1","updated":"2024-04-06T15:14:25Z","published":"2024-04-06T15:14:25Z","title":"Music Recommendation Based on Facial Emotion Recognition","summary":" Introduction: Music provides an incredible avenue for individuals to express\ntheir thoughts and emotions, while also serving as a delightful mode of\nentertainment for enthusiasts and music lovers. Objectives: This paper presents\na comprehensive approach to enhancing the user experience through the\nintegration of emotion recognition, music recommendation, and explainable AI\nusing GRAD-CAM. Methods: The proposed methodology utilizes a ResNet50 model\ntrained on the Facial Expression Recognition (FER) dataset, consisting of real\nimages of individuals expressing various emotions. Results: The system achieves\nan accuracy of 82% in emotion classification. By leveraging GRAD-CAM, the model\nprovides explanations for its predictions, allowing users to understand the\nreasoning behind the system's recommendations. The model is trained on both FER\nand real user datasets, which include labelled facial expressions, and real\nimages of individuals expressing various emotions. The training process\ninvolves pre-processing the input images, extracting features through\nconvolutional layers, reasoning with dense layers, and generating emotion\npredictions through the output layer. Conclusion: The proposed methodology,\nleveraging the Resnet50 model with ROI-based analysis and explainable AI\ntechniques, offers a robust and interpretable solution for facial emotion\ndetection paper.\n","authors":["Rajesh B","Keerthana V","Narayana Darapaneni","Anwesh Reddy P"],"pdf_url":"https://arxiv.org/pdf/2404.04654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05763v1","updated":"2024-04-06T15:09:49Z","published":"2024-04-06T15:09:49Z","title":"Deep Learning-Based Brain Image Segmentation for Automated Tumour\n Detection","summary":" Introduction: The present study on the development and evaluation of an\nautomated brain tumor segmentation technique based on deep learning using the\n3D U-Net model. Objectives: The objective is to leverage state-of-the-art\nconvolutional neural networks (CNNs) on a large dataset of brain MRI scans for\nsegmentation. Methods: The proposed methodology applies pre-processing\ntechniques for enhanced performance and generalizability. Results: Extensive\nvalidation on an independent dataset confirms the model's robustness and\npotential for integration into clinical workflows. The study emphasizes the\nimportance of data pre-processing and explores various hyperparameters to\noptimize the model's performance. The 3D U-Net, has given IoUs for training and\nvalidation dataset have been 0.8181 and 0.66 respectively. 
Conclusion:\nUltimately, this comprehensive framework showcases the efficacy of deep\nlearning in automating brain tumour detection, offering valuable support in\nclinical practice.\n","authors":["Suman Sourabh","Murugappan Valliappan","Narayana Darapaneni","Anwesh R P"],"pdf_url":"https://arxiv.org/pdf/2404.05763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04586v1","updated":"2024-04-06T10:50:02Z","published":"2024-04-06T10:50:02Z","title":"PIE: Physics-inspired Low-light Enhancement","summary":" In this paper, we propose a physics-inspired contrastive learning paradigm\nfor low-light enhancement, called PIE. PIE primarily addresses three issues:\n(i) To resolve the problem of existing learning-based methods often training a\nLLE model with strict pixel-correspondence image pairs, we eliminate the need\nfor pixel-correspondence paired training data and instead train with unpaired\nimages. (ii) To address the disregard for negative samples and the inadequacy\nof their generation in existing methods, we incorporate physics-inspired\ncontrastive learning for LLE and design the Bag of Curves (BoC) method to\ngenerate more reasonable negative samples that closely adhere to the underlying\nphysical imaging principle. (iii) To overcome the reliance on semantic ground\ntruths in existing methods, we propose an unsupervised regional segmentation\nmodule, ensuring regional brightness consistency while eliminating the\ndependency on semantic ground truths. Overall, the proposed PIE can effectively\nlearn from unpaired positive/negative samples and smoothly realize non-semantic\nregional enhancement, which is clearly different from existing LLE efforts.\nBesides the novel architecture of PIE, we explore the gain of PIE on downstream\ntasks such as semantic segmentation and face detection. Training on readily\navailable open data and extensive experiments demonstrate that our method\nsurpasses the state-of-the-art LLE models over six independent cross-scenes\ndatasets. PIE runs fast with reasonable GFLOPs in test time, making it easy to\nuse on mobile devices.\n","authors":["Dong Liang","Zhengyan Xu","Ling Li","Mingqiang Wei","Songcan Chen"],"pdf_url":"https://arxiv.org/pdf/2404.04586v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2112.06451"}]},"2024-04-09T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.06512v1","updated":"2024-04-09T17:59:32Z","published":"2024-04-09T17:59:32Z","title":"InternLM-XComposer2-4KHD: A Pioneering Large Vision-Language Model\n Handling Resolutions from 336 Pixels to 4K HD","summary":" The Large Vision-Language Model (LVLM) field has seen significant\nadvancements, yet its progression has been hindered by challenges in\ncomprehending fine-grained visual content due to limited resolution. Recent\nefforts have aimed to enhance the high-resolution understanding capabilities of\nLVLMs, yet they remain capped at approximately 1500 x 1500 pixels and\nconstrained to a relatively narrow resolution range. This paper represents\nInternLM-XComposer2-4KHD, a groundbreaking exploration into elevating LVLM\nresolution capabilities up to 4K HD (3840 x 1600) and beyond. Concurrently,\nconsidering the ultra-high resolution may not be necessary in all scenarios, it\nsupports a wide range of diverse resolutions from 336 pixels to 4K standard,\nsignificantly broadening its scope of applicability. 
Specifically, this\nresearch advances the patch division paradigm by introducing a novel extension:\ndynamic resolution with automatic patch configuration. It maintains the\ntraining image aspect ratios while automatically varying patch counts and\nconfiguring layouts based on a pre-trained Vision Transformer (ViT) (336 x\n336), leading to dynamic training resolution from 336 pixels to 4K standard.\nOur research demonstrates that scaling training resolution up to 4K HD leads to\nconsistent performance enhancements without hitting the ceiling of potential\nimprovements. InternLM-XComposer2-4KHD shows superb capability that matches or\neven surpasses GPT-4V and Gemini Pro in 10 of the 16 benchmarks. The\nInternLM-XComposer2-4KHD model series with 7B parameters are publicly available\nat https://github.com/InternLM/InternLM-XComposer.\n","authors":["Xiaoyi Dong","Pan Zhang","Yuhang Zang","Yuhang Cao","Bin Wang","Linke Ouyang","Songyang Zhang","Haodong Duan","Wenwei Zhang","Yining Li","Hang Yan","Yang Gao","Zhe Chen","Xinyue Zhang","Wei Li","Jingwen Li","Wenhai Wang","Kai Chen","Conghui He","Xingcheng Zhang","Jifeng Dai","Yu Qiao","Dahua Lin","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06512v1.pdf","comment":"Code and models are publicly available at\n https://github.com/InternLM/InternLM-XComposer"},{"id":"http://arxiv.org/abs/2404.06511v1","updated":"2024-04-09T17:59:31Z","published":"2024-04-09T17:59:31Z","title":"MoReVQA: Exploring Modular Reasoning Models for Video Question Answering","summary":" This paper addresses the task of video question answering (videoQA) via a\ndecomposed multi-stage, modular reasoning framework. Previous modular methods\nhave shown promise with a single planning stage ungrounded in visual content.\nHowever, through a simple and effective baseline, we find that such systems can\nlead to brittle behavior in practice for challenging videoQA settings. Thus,\nunlike traditional single-stage planning methods, we propose a multi-stage\nsystem consisting of an event parser, a grounding stage, and a final reasoning\nstage in conjunction with an external memory. All stages are training-free, and\nperformed using few-shot prompting of large models, creating interpretable\nintermediate outputs at each stage. By decomposing the underlying planning and\ntask complexity, our method, MoReVQA, improves over prior work on standard\nvideoQA benchmarks (NExT-QA, iVQA, EgoSchema, ActivityNet-QA) with\nstate-of-the-art results, and extensions to related tasks (grounded videoQA,\nparagraph captioning).\n","authors":["Juhong Min","Shyamal Buch","Arsha Nagrani","Minsu Cho","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2404.06511v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.06510v1","updated":"2024-04-09T17:59:04Z","published":"2024-04-09T17:59:04Z","title":"Can Feedback Enhance Semantic Grounding in Large Vision-Language Models?","summary":" Enhancing semantic grounding abilities in Vision-Language Models (VLMs) often\ninvolves collecting domain-specific training data, refining the network\narchitectures, or modifying the training recipes. In this work, we venture into\nan orthogonal direction and explore whether VLMs can improve their semantic\ngrounding by \"receiving\" feedback, without requiring in-domain data,\nfine-tuning, or modifications to the network architectures. 
We systematically\nanalyze this hypothesis using a feedback mechanism composed of a binary signal.\nWe find that if prompted appropriately, VLMs can utilize feedback both in a\nsingle step and iteratively, showcasing the potential of feedback as an\nalternative technique to improve grounding in internet-scale VLMs. Furthermore,\nVLMs, like LLMs, struggle to self-correct errors out-of-the-box. However, we\nfind that this issue can be mitigated via a binary verification mechanism.\nFinally, we explore the potential and limitations of amalgamating these\nfindings and applying them iteratively to automatically enhance VLMs' grounding\nperformance, showing grounding accuracy consistently improves using automated\nfeedback across all models in all settings investigated. Overall, our iterative\nframework improves semantic grounding in VLMs by more than 15 accuracy points\nunder noise-free feedback and up to 5 accuracy points under a simple automated\nbinary verification mechanism. The project website is hosted at\nhttps://andrewliao11.github.io/vlms_feedback\n","authors":["Yuan-Hong Liao","Rafid Mahmood","Sanja Fidler","David Acuna"],"pdf_url":"https://arxiv.org/pdf/2404.06510v1.pdf","comment":"31 pages, 15 figures"},{"id":"http://arxiv.org/abs/2404.06507v1","updated":"2024-04-09T17:55:41Z","published":"2024-04-09T17:55:41Z","title":"Reconstructing Hand-Held Objects in 3D","summary":" Objects manipulated by the hand (i.e., manipulanda) are particularly\nchallenging to reconstruct from in-the-wild RGB images or videos. Not only does\nthe hand occlude much of the object, but also the object is often only visible\nin a small number of image pixels. At the same time, two strong anchors emerge\nin this setting: (1) estimated 3D hands help disambiguate the location and\nscale of the object, and (2) the set of manipulanda is small relative to all\npossible objects. With these insights in mind, we present a scalable paradigm\nfor handheld object reconstruction that builds on recent breakthroughs in large\nlanguage/vision models and 3D object datasets. Our model, MCC-Hand-Object\n(MCC-HO), jointly reconstructs hand and object geometry given a single RGB\nimage and inferred 3D hand as inputs. Subsequently, we use GPT-4(V) to retrieve\na 3D object model that matches the object in the image and rigidly align the\nmodel to the network-inferred geometry; we call this alignment\nRetrieval-Augmented Reconstruction (RAR). Experiments demonstrate that MCC-HO\nachieves state-of-the-art performance on lab and Internet datasets, and we show\nhow RAR can be used to automatically obtain 3D labels for in-the-wild images of\nhand-object interactions.\n","authors":["Jane Wu","Georgios Pavlakos","Georgia Gkioxari","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2404.06507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17048v3","updated":"2024-04-09T17:54:12Z","published":"2023-11-28T18:55:37Z","title":"Zero-shot Referring Expression Comprehension via Structural Similarity\n Between Images and Captions","summary":" Zero-shot referring expression comprehension aims at localizing bounding\nboxes in an image corresponding to provided textual prompts, which requires:\n(i) a fine-grained disentanglement of complex visual scene and textual context,\nand (ii) a capacity to understand relationships among disentangled entities.\nUnfortunately, existing large vision-language alignment (VLA) models, e.g.,\nCLIP, struggle with both aspects so cannot be directly used for this task. 
To\nmitigate this gap, we leverage large foundation models to disentangle both\nimages and texts into triplets in the format of (subject, predicate, object).\nAfter that, grounding is accomplished by calculating the structural similarity\nmatrix between visual and textual triplets with a VLA model, and subsequently\npropagate it to an instance-level similarity matrix. Furthermore, to equip VLA\nmodels with the ability of relationship understanding, we design a\ntriplet-matching objective to fine-tune the VLA models on a collection of\ncurated dataset containing abundant entity relationships. Experiments\ndemonstrate that our visual grounding performance increase of up to 19.5% over\nthe SOTA zero-shot model on RefCOCO/+/g. On the more challenging Who's Waldo\ndataset, our zero-shot approach achieves comparable accuracy to the fully\nsupervised model. Code is available at\nhttps://github.com/Show-han/Zeroshot_REC.\n","authors":["Zeyu Han","Fangrui Zhu","Qianru Lao","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.17048v3.pdf","comment":"CVPR 2024, Code available at https://github.com/Show-han/Zeroshot_REC"},{"id":"http://arxiv.org/abs/2212.08731v3","updated":"2024-04-09T17:52:49Z","published":"2022-12-16T22:03:37Z","title":"Multi-person 3D pose estimation from unlabelled data","summary":" Its numerous applications make multi-human 3D pose estimation a remarkably\nimpactful area of research. Nevertheless, assuming a multiple-view system\ncomposed of several regular RGB cameras, 3D multi-pose estimation presents\nseveral challenges. First of all, each person must be uniquely identified in\nthe different views to separate the 2D information provided by the cameras.\nSecondly, the 3D pose estimation process from the multi-view 2D information of\neach person must be robust against noise and potential occlusions in the\nscenario. In this work, we address these two challenges with the help of deep\nlearning. Specifically, we present a model based on Graph Neural Networks\ncapable of predicting the cross-view correspondence of the people in the\nscenario along with a Multilayer Perceptron that takes the 2D points to yield\nthe 3D poses of each person. These two models are trained in a self-supervised\nmanner, thus avoiding the need for large datasets with 3D annotations.\n","authors":["Daniel Rodriguez-Criado","Pilar Bachiller","George Vogiatzis","Luis J. Manso"],"pdf_url":"https://arxiv.org/pdf/2212.08731v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06493v1","updated":"2024-04-09T17:48:52Z","published":"2024-04-09T17:48:52Z","title":"Flying With Photons: Rendering Novel Views of Propagating Light","summary":" We present an imaging and neural rendering technique that seeks to synthesize\nvideos of light propagating through a scene from novel, moving camera\nviewpoints. Our approach relies on a new ultrafast imaging setup to capture a\nfirst-of-its kind, multi-viewpoint video dataset with picosecond-level temporal\nresolution. Combined with this dataset, we introduce an efficient neural volume\nrendering framework based on the transient field. This field is defined as a\nmapping from a 3D point and 2D direction to a high-dimensional, discrete-time\nsignal that represents time-varying radiance at ultrafast timescales. Rendering\nwith transient fields naturally accounts for effects due to the finite speed of\nlight, including viewpoint-dependent appearance changes caused by light\npropagation delays to the camera. 
We render a range of complex effects,\nincluding scattering, specular reflection, refraction, and diffraction.\nAdditionally, we demonstrate removing viewpoint-dependent propagation delays\nusing a time warping procedure, rendering of relativistic effects, and video\nsynthesis of direct and global components of light transport.\n","authors":["Anagh Malik","Noah Juravsky","Ryan Po","Gordon Wetzstein","Kiriakos N. Kutulakos","David B. Lindell"],"pdf_url":"https://arxiv.org/pdf/2404.06493v1.pdf","comment":"Project page: https://anaghmalik.com/FlyingWithPhotons/"},{"id":"http://arxiv.org/abs/2303.12054v4","updated":"2024-04-09T17:44:24Z","published":"2023-03-21T17:45:38Z","title":"Influencer Backdoor Attack on Semantic Segmentation","summary":" When a small number of poisoned samples are injected into the training\ndataset of a deep neural network, the network can be induced to exhibit\nmalicious behavior during inferences, which poses potential threats to\nreal-world applications. While they have been intensively studied in\nclassification, backdoor attacks on semantic segmentation have been largely\noverlooked. Unlike classification, semantic segmentation aims to classify every\npixel within a given image. In this work, we explore backdoor attacks on\nsegmentation models to misclassify all pixels of a victim class by injecting a\nspecific trigger on non-victim pixels during inferences, which is dubbed\nInfluencer Backdoor Attack (IBA). IBA is expected to maintain the\nclassification accuracy of non-victim pixels and mislead classifications of all\nvictim pixels in every single inference and could be easily applied to\nreal-world scenes. Based on the context aggregation ability of segmentation\nmodels, we proposed a simple, yet effective, Nearest-Neighbor trigger injection\nstrategy. We also introduce an innovative Pixel Random Labeling strategy which\nmaintains optimal performance even when the trigger is placed far from the\nvictim pixels. Our extensive experiments reveal that current segmentation\nmodels do suffer from backdoor attacks, demonstrate IBA real-world\napplicability, and show that our proposed techniques can further increase\nattack performance.\n","authors":["Haoheng Lan","Jindong Gu","Philip Torr","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2303.12054v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06483v1","updated":"2024-04-09T17:34:19Z","published":"2024-04-09T17:34:19Z","title":"RhythmMamba: Fast Remote Physiological Measurement with Arbitrary Length\n Videos","summary":" Remote photoplethysmography (rPPG) is a non-contact method for detecting\nphysiological signals from facial videos, holding great potential in various\napplications such as healthcare, affective computing, and anti-spoofing.\nExisting deep learning methods struggle to address two core issues of rPPG\nsimultaneously: extracting weak rPPG signals from video segments with large\nspatiotemporal redundancy and understanding the periodic patterns of rPPG among\nlong contexts. This represents a trade-off between computational complexity and\nthe ability to capture long-range dependencies, posing a challenge for rPPG\nthat is suitable for deployment on mobile devices. 
Based on the in-depth\nexploration of Mamba's comprehension of spatial and temporal information, this\npaper introduces RhythmMamba, an end-to-end Mamba-based method that employs\nmulti-temporal Mamba to constrain both periodic patterns and short-term trends,\ncoupled with frequency domain feed-forward to enable Mamba to robustly\nunderstand the quasi-periodic patterns of rPPG. Extensive experiments show that\nRhythmMamba achieves state-of-the-art performance with reduced parameters and\nlower computational complexity. The proposed RhythmMamba can be applied to\nvideo segments of any length without performance degradation. The codes are\navailable at https://github.com/zizheng-guo/RhythmMamba.\n","authors":["Bochao Zou","Zizheng Guo","Xiaocheng Hu","Huimin Ma"],"pdf_url":"https://arxiv.org/pdf/2404.06483v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2402.12788"},{"id":"http://arxiv.org/abs/2404.06479v1","updated":"2024-04-09T17:30:18Z","published":"2024-04-09T17:30:18Z","title":"Text-Based Reasoning About Vector Graphics","summary":" While large multimodal models excel in broad vision-language benchmarks, they\noften struggle with tasks requiring precise perception of low-level visual\ndetails, such as comparing line lengths or solving simple mazes. In particular,\nthis failure mode persists in question-answering tasks about vector graphics --\nimages composed purely of 2D objects and shapes. To address this challenge, we\npropose the Visually Descriptive Language Model (VDLM), which performs\ntext-based reasoning about vector graphics. VDLM leverages Scalable Vector\nGraphics (SVG) for a more precise visual description and first uses an\noff-the-shelf raster-to-SVG algorithm for encoding. Since existing language\nmodels cannot understand raw SVGs in a zero-shot setting, VDLM then bridges SVG\nwith pretrained language models through a newly introduced intermediate\nsymbolic representation, Primal Visual Description (PVD), comprising primitive\nattributes (e.g., shape, position, measurement) with their corresponding\npredicted values. PVD is task-agnostic and represents visual primitives that\nare universal across all vector graphics. It can be learned with procedurally\ngenerated (SVG, PVD) pairs and also enables the direct use of LLMs for\ngeneralization to complex reasoning tasks. By casting an image to a text-based\nrepresentation, we can leverage the power of language models to learn alignment\nfrom SVG to visual primitives and generalize to unseen question-answering\ntasks. Empirical results show that VDLM achieves stronger zero-shot performance\ncompared to state-of-the-art LMMs, such as GPT-4V, in various low-level\nmultimodal perception and reasoning tasks on vector graphics. We additionally\npresent extensive analyses on VDLM's performance, demonstrating that our\nframework offers better interpretability due to its disentangled perception and\nreasoning processes. 
Project page: https://mikewangwzhl.github.io/VDLM/\n","authors":["Zhenhailong Wang","Joy Hsu","Xingyao Wang","Kuan-Hao Huang","Manling Li","Jiajun Wu","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2404.06479v1.pdf","comment":"Project page: https://mikewangwzhl.github.io/VDLM/"},{"id":"http://arxiv.org/abs/2404.06470v1","updated":"2024-04-09T17:17:48Z","published":"2024-04-09T17:17:48Z","title":"Learning State-Invariant Representations of Objects from Image\n Collections with State, Pose, and Viewpoint Changes","summary":" We add one more invariance - state invariance - to the more commonly used\nother invariances for learning object representations for recognition and\nretrieval. By state invariance, we mean robust with respect to changes in the\nstructural form of the object, such as when an umbrella is folded, or when an\nitem of clothing is tossed on the floor. Since humans generally have no\ndifficulty in recognizing objects despite such state changes, we are naturally\nfaced with the question of whether it is possible to devise a neural\narchitecture with similar abilities. To that end, we present a novel dataset,\nObjectsWithStateChange, that captures state and pose variations in the object\nimages recorded from arbitrary viewpoints. We believe that this dataset will\nfacilitate research in fine-grained object recognition and retrieval of objects\nthat are capable of state changes. The goal of such research would be to train\nmodels capable of generating object embeddings that remain invariant to state\nchanges while also staying invariant to transformations induced by changes in\nviewpoint, pose, illumination, etc. To demonstrate the usefulness of the\nObjectsWithStateChange dataset, we also propose a curriculum learning strategy\nthat uses the similarity relationships in the learned embedding space after\neach epoch to guide the training process. The model learns discriminative\nfeatures by comparing visually similar objects within and across different\ncategories, encouraging it to differentiate between objects that may be\nchallenging to distinguish due to changes in their state. We believe that this\nstrategy enhances the model's ability to capture discriminative features for\nfine-grained tasks that may involve objects with state changes, leading to\nperformance improvements on object-level tasks not only on our new dataset, but\nalso on two other challenging multi-view datasets such as ModelNet40 and\nObjectPI.\n","authors":["Rohan Sarkar","Avinash Kak"],"pdf_url":"https://arxiv.org/pdf/2404.06470v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2304.06140v3","updated":"2024-04-09T17:09:03Z","published":"2023-04-12T19:47:13Z","title":"An Edit Friendly DDPM Noise Space: Inversion and Manipulations","summary":" Denoising diffusion probabilistic models (DDPMs) employ a sequence of white\nGaussian noise samples to generate an image. In analogy with GANs, those noise\nmaps could be considered as the latent code associated with the generated\nimage. However, this native noise space does not possess a convenient\nstructure, and is thus challenging to work with in editing tasks. 
Here, we\npropose an alternative latent noise space for DDPM that enables a wide range of\nediting operations via simple means, and present an inversion method for\nextracting these edit-friendly noise maps for any given image (real or\nsynthetically generated). As opposed to the native DDPM noise space, the\nedit-friendly noise maps do not have a standard normal distribution and are not\nstatistically independent across timesteps. However, they allow perfect\nreconstruction of any desired image, and simple transformations on them\ntranslate into meaningful manipulations of the output image (e.g. shifting,\ncolor edits). Moreover, in text-conditional models, fixing those noise maps\nwhile changing the text prompt, modifies semantics while retaining structure.\nWe illustrate how this property enables text-based editing of real images via\nthe diverse DDPM sampling scheme (in contrast to the popular non-diverse DDIM\ninversion). We also show how it can be used within existing diffusion-based\nediting methods to improve their quality and diversity. Webpage:\nhttps://inbarhub.github.io/DDPM_inversion\n","authors":["Inbar Huberman-Spiegelglas","Vladimir Kulikov","Tomer Michaeli"],"pdf_url":"https://arxiv.org/pdf/2304.06140v3.pdf","comment":"CVPR 2024. Code and examples are available at\n https://github.com/inbarhub/DDPM_inversion"},{"id":"http://arxiv.org/abs/2404.06455v1","updated":"2024-04-09T16:55:23Z","published":"2024-04-09T16:55:23Z","title":"A comparative analysis of deep learning models for lung segmentation on\n X-ray images","summary":" Robust and highly accurate lung segmentation in X-rays is crucial in medical\nimaging. This study evaluates deep learning solutions for this task, ranking\nexisting methods and analyzing their performance under diverse image\nmodifications. Out of 61 analyzed papers, only nine offered implementation or\npre-trained models, enabling assessment of three prominent methods: Lung VAE,\nTransResUNet, and CE-Net. The analysis revealed that CE-Net performs best,\ndemonstrating the highest values in dice similarity coefficient and\nintersection over union metric.\n","authors":["Weronika Hryniewska-Guzik","Jakub Bilski","Bartosz Chrostowski","Jakub Drak Sbahi","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2404.06455v1.pdf","comment":"published at the Polish Conference on Artificial Intelligence\n (PP-RAI), 2024"},{"id":"http://arxiv.org/abs/2404.06453v1","updated":"2024-04-09T16:54:19Z","published":"2024-04-09T16:54:19Z","title":"PURE: Turning Polysemantic Neurons Into Pure Features by Identifying\n Relevant Circuits","summary":" The field of mechanistic interpretability aims to study the role of\nindividual neurons in Deep Neural Networks. Single neurons, however, have the\ncapability to act polysemantically and encode for multiple (unrelated)\nfeatures, which renders their interpretation difficult. We present a method for\ndisentangling polysemanticity of any Deep Neural Network by decomposing a\npolysemantic neuron into multiple monosemantic \"virtual\" neurons. This is\nachieved by identifying the relevant sub-graph (\"circuit\") for each \"pure\"\nfeature. We demonstrate how our approach allows us to find and disentangle\nvarious polysemantic units of ResNet models trained on ImageNet. While\nevaluating feature visualizations using CLIP, our method effectively\ndisentangles representations, improving upon methods based on neuron\nactivations. 
Our code is available at https://github.com/maxdreyer/PURE.\n","authors":["Maximilian Dreyer","Erblina Purelku","Johanna Vielhaben","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2404.06453v1.pdf","comment":"14 pages (4 pages manuscript, 2 pages references, 8 pages appendix)"},{"id":"http://arxiv.org/abs/2404.06451v1","updated":"2024-04-09T16:53:43Z","published":"2024-04-09T16:53:43Z","title":"SmartControl: Enhancing ControlNet for Handling Rough Visual Conditions","summary":" Human visual imagination usually begins with analogies or rough sketches. For\nexample, given an image with a girl playing guitar before a building, one may\nanalogously imagine how it seems like if Iron Man playing guitar before Pyramid\nin Egypt. Nonetheless, visual condition may not be precisely aligned with the\nimaginary result indicated by text prompt, and existing layout-controllable\ntext-to-image (T2I) generation models is prone to producing degraded generated\nresults with obvious artifacts. To address this issue, we present a novel T2I\ngeneration method dubbed SmartControl, which is designed to modify the rough\nvisual conditions for adapting to text prompt. The key idea of our SmartControl\nis to relax the visual condition on the areas that are conflicted with text\nprompts. In specific, a Control Scale Predictor (CSP) is designed to identify\nthe conflict regions and predict the local control scales, while a dataset with\ntext prompts and rough visual conditions is constructed for training CSP. It is\nworth noting that, even with a limited number (e.g., 1,000~2,000) of training\nsamples, our SmartControl can generalize well to unseen objects. Extensive\nexperiments on four typical visual condition types clearly show the efficacy of\nour SmartControl against state-of-the-arts. Source code, pre-trained models,\nand datasets are available at https://github.com/liuxiaoyu1104/SmartControl.\n","authors":["Xiaoyu Liu","Yuxiang Wei","Ming Liu","Xianhui Lin","Peiran Ren","Xuansong Xie","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.06451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06447v1","updated":"2024-04-09T16:49:42Z","published":"2024-04-09T16:49:42Z","title":"The Central Spanning Tree Problem","summary":" Spanning trees are an important primitive in many data analysis tasks, when a\ndata set needs to be summarized in terms of its \"skeleton\", or when a\ntree-shaped graph over all observations is required for downstream processing.\nPopular definitions of spanning trees include the minimum spanning tree and the\noptimum distance spanning tree, a.k.a. the minimum routing cost tree. When\nsearching for the shortest spanning tree but admitting additional branching\npoints, even shorter spanning trees can be realized: Steiner trees.\nUnfortunately, both minimum spanning and Steiner trees are not robust with\nrespect to noise in the observations; that is, small perturbations of the\noriginal data set often lead to drastic changes in the associated spanning\ntrees. In response, we make two contributions when the data lies in a Euclidean\nspace: on the theoretical side, we introduce a new optimization problem, the\n\"(branched) central spanning tree\", which subsumes all previously mentioned\ndefinitions as special cases. On the practical side, we show empirically that\nthe (branched) central spanning tree is more robust to noise in the data, and\nas such is better suited to summarize a data set in terms of its skeleton. 
We\nalso propose a heuristic to address the NP-hard optimization problem, and\nillustrate its use on single cell RNA expression data from biology and 3D point\nclouds of plants.\n","authors":["Enrique Fita Sanmartín","Christoph Schnörr","Fred A. Hamprecht"],"pdf_url":"https://arxiv.org/pdf/2404.06447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06443v1","updated":"2024-04-09T16:45:34Z","published":"2024-04-09T16:45:34Z","title":"Multi-scale Dynamic and Hierarchical Relationship Modeling for Facial\n Action Units Recognition","summary":" Human facial action units (AUs) are mutually related in a hierarchical\nmanner, as not only they are associated with each other in both spatial and\ntemporal domains but also AUs located in the same/close facial regions show\nstronger relationships than those of different facial regions. While none of\nexisting approach thoroughly model such hierarchical inter-dependencies among\nAUs, this paper proposes to comprehensively model multi-scale AU-related\ndynamic and hierarchical spatio-temporal relationship among AUs for their\noccurrences recognition. Specifically, we first propose a novel multi-scale\ntemporal differencing network with an adaptive weighting block to explicitly\ncapture facial dynamics across frames at different spatial scales, which\nspecifically considers the heterogeneity of range and magnitude in different\nAUs' activation. Then, a two-stage strategy is introduced to hierarchically\nmodel the relationship among AUs based on their spatial distribution (i.e.,\nlocal and cross-region AU relationship modelling). Experimental results\nachieved on BP4D and DISFA show that our approach is the new state-of-the-art\nin the field of AU occurrence recognition. Our code is publicly available at\nhttps://github.com/CVI-SZU/MDHR.\n","authors":["Zihan Wang","Siyang Song","Cheng Luo","Songhe Deng","Weicheng Xie","Linlin Shen"],"pdf_url":"https://arxiv.org/pdf/2404.06443v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2404.06442v1","updated":"2024-04-09T16:42:54Z","published":"2024-04-09T16:42:54Z","title":"QueSTMaps: Queryable Semantic Topological Maps for 3D Scene\n Understanding","summary":" Understanding the structural organisation of 3D indoor scenes in terms of\nrooms is often accomplished via floorplan extraction. Robotic tasks such as\nplanning and navigation require a semantic understanding of the scene as well.\nThis is typically achieved via object-level semantic segmentation. However,\nsuch methods struggle to segment out topological regions like \"kitchen\" in the\nscene. In this work, we introduce a two-step pipeline. First, we extract a\ntopological map, i.e., floorplan of the indoor scene using a novel\nmulti-channel occupancy representation. Then, we generate CLIP-aligned features\nand semantic labels for every room instance based on the objects it contains\nusing a self-attention transformer. Our language-topology alignment supports\nnatural language querying, e.g., a \"place to cook\" locates the \"kitchen\". We\noutperform the current state-of-the-art on room segmentation by ~20% and room\nclassification by ~12%. 
Our detailed qualitative analysis and ablation studies\nprovide insights into the problem of joint structural and semantic 3D scene\nunderstanding.\n","authors":["Yash Mehan","Kumaraditya Gupta","Rohit Jayanti","Anirudh Govil","Sourav Garg","Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.06442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.12962v2","updated":"2024-04-09T16:39:00Z","published":"2021-10-25T13:56:00Z","title":"Event Data Association via Robust Model Fitting for Event-based Object\n Tracking","summary":" Event-based approaches, which are based on bio-inspired asynchronous event\ncameras, have achieved promising performance on various computer vision tasks.\nHowever, the study of the fundamental event data association problem is still\nin its infancy. In this paper, we propose a novel Event Data Association\n(called EDA) approach to explicitly address the event association and fusion\nproblem. The proposed EDA seeks for event trajectories that best fit the event\ndata, in order to perform unifying data association and information fusion. In\nEDA, we first asynchronously fuse the event data based on its information\nentropy. Then, we introduce a deterministic model hypothesis generation\nstrategy, which effectively generates model hypotheses from the fused events,\nto represent the corresponding event trajectories. After that, we present a\ntwo-stage weighting algorithm, which robustly weighs and selects true models\nfrom the generated model hypotheses, through multi-structural geometric model\nfitting. Meanwhile, we also propose an adaptive model selection strategy to\nautomatically determine the number of the true models. Finally, we use the\nselected true models to associate and fuse the event data, without being\naffected by sensor noise and irrelevant structures. We evaluate the performance\nof the proposed EDA on the object tracking task. The experimental results show\nthe effectiveness of EDA under challenging scenarios, such as high speed,\nmotion blur, and high dynamic range conditions.\n","authors":["Haosheng Chen","Shuyuan Lin","Yan Yan","Hanzi Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2110.12962v2.pdf","comment":"32 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.02408v2","updated":"2024-04-09T16:35:41Z","published":"2024-03-04T19:06:13Z","title":"A Spatio-temporal Aligned SUNet Model for Low-light Video Enhancement","summary":" Distortions caused by low-light conditions are not only visually unpleasant\nbut also degrade the performance of computer vision tasks. The restoration and\nenhancement have proven to be highly beneficial. However, there are only a\nlimited number of enhancement methods explicitly designed for videos acquired\nin low-light conditions. We propose a Spatio-Temporal Aligned SUNet (STA-SUNet)\nmodel using a Swin Transformer as a backbone to capture low light video\nfeatures and exploit their spatio-temporal correlations. The STA-SUNet model is\ntrained on a novel, fully registered dataset (BVI), which comprises dynamic\nscenes captured under varying light conditions. It is further analysed\ncomparatively against various other models over three test datasets. The model\ndemonstrates superior adaptivity across all datasets, obtaining the highest\nPSNR and SSIM values. 
It is particularly effective in extreme low-light\nconditions, yielding fairly good visualisation results.\n","authors":["Ruirui Lin","Nantheera Anantrasirichai","Alexandra Malyugina","David Bull"],"pdf_url":"https://arxiv.org/pdf/2403.02408v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03108v3","updated":"2024-04-09T16:31:33Z","published":"2023-07-06T16:27:39Z","title":"DIAGNOSIS: Detecting Unauthorized Data Usages in Text-to-image Diffusion\n Models","summary":" Recent text-to-image diffusion models have shown surprising performance in\ngenerating high-quality images. However, concerns have arisen regarding the\nunauthorized data usage during the training or fine-tuning process. One example\nis when a model trainer collects a set of images created by a particular artist\nand attempts to train a model capable of generating similar images without\nobtaining permission and giving credit to the artist. To address this issue, we\npropose a method for detecting such unauthorized data usage by planting the\ninjected memorization into the text-to-image diffusion models trained on the\nprotected dataset. Specifically, we modify the protected images by adding\nunique contents on these images using stealthy image warping functions that are\nnearly imperceptible to humans but can be captured and memorized by diffusion\nmodels. By analyzing whether the model has memorized the injected content\n(i.e., whether the generated images are processed by the injected\npost-processing function), we can detect models that had illegally utilized the\nunauthorized data. Experiments on Stable Diffusion and VQ Diffusion with\ndifferent model training or fine-tuning methods (i.e, LoRA, DreamBooth, and\nstandard training) demonstrate the effectiveness of our proposed method in\ndetecting unauthorized data usages. Code:\nhttps://github.com/ZhentingWang/DIAGNOSIS.\n","authors":["Zhenting Wang","Chen Chen","Lingjuan Lyu","Dimitris N. Metaxas","Shiqing Ma"],"pdf_url":"https://arxiv.org/pdf/2307.03108v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.06437v1","updated":"2024-04-09T16:28:54Z","published":"2024-04-09T16:28:54Z","title":"Seasonal Fire Prediction using Spatio-Temporal Deep Neural Networks","summary":" With climate change expected to exacerbate fire weather conditions, the\naccurate anticipation of wildfires on a global scale becomes increasingly\ncrucial for disaster mitigation. In this study, we utilize SeasFire, a\ncomprehensive global wildfire dataset with climate, vegetation, oceanic\nindices, and human-related variables, to enable seasonal wildfire forecasting\nwith machine learning. For the predictive analysis, we train deep learning\nmodels with different architectures that capture the spatio-temporal context\nleading to wildfires. Our investigation focuses on assessing the effectiveness\nof these models in predicting the presence of burned areas at varying\nforecasting time horizons globally, extending up to six months into the future,\nand on how different spatial or/and temporal context affects the performance of\nthe models. Our findings demonstrate the great potential of deep learning\nmodels in seasonal fire forecasting; longer input time-series leads to more\nrobust predictions across varying forecasting horizons, while integrating\nspatial information to capture wildfire spatio-temporal dynamics boosts\nperformance. 
Finally, our results hint that in order to enhance performance at\nlonger forecasting horizons, a larger receptive field spatially needs to be\nconsidered.\n","authors":["Dimitrios Michail","Lefki-Ioanna Panagiotou","Charalampos Davalas","Ioannis Prapas","Spyros Kondylatos","Nikolaos Ioannis Bountos","Ioannis Papoutsis"],"pdf_url":"https://arxiv.org/pdf/2404.06437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06430v1","updated":"2024-04-09T16:23:01Z","published":"2024-04-09T16:23:01Z","title":"pfl-research: simulation framework for accelerating research in Private\n Federated Learning","summary":" Federated learning (FL) is an emerging machine learning (ML) training\nparadigm where clients own their data and collaborate to train a global model,\nwithout revealing any data to the server and other participants. Researchers\ncommonly perform experiments in a simulation environment to quickly iterate on\nideas. However, existing open-source tools do not offer the efficiency required\nto simulate FL on larger and more realistic FL datasets. We introduce\npfl-research, a fast, modular, and easy-to-use Python framework for simulating\nFL. It supports TensorFlow, PyTorch, and non-neural network models, and is\ntightly integrated with state-of-the-art privacy algorithms. We study the speed\nof open-source FL frameworks and show that pfl-research is 7-72$\\times$ faster\nthan alternative open-source frameworks on common cross-device setups. Such\nspeedup will significantly boost the productivity of the FL research community\nand enable testing hypotheses on realistic FL datasets that were previously too\nresource intensive. We release a suite of benchmarks that evaluates an\nalgorithm's overall performance on a diverse set of realistic scenarios. The\ncode is available on GitHub at https://github.com/apple/pfl-research.\n","authors":["Filip Granqvist","Congzheng Song","Áine Cahill","Rogier van Dalen","Martin Pelikan","Yi Sheng Chan","Xiaojun Feng","Natarajan Krishnaswami","Vojta Jina","Mona Chitnis"],"pdf_url":"https://arxiv.org/pdf/2404.06430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06429v1","updated":"2024-04-09T16:20:03Z","published":"2024-04-09T16:20:03Z","title":"Magic-Boost: Boost 3D Generation with Mutli-View Conditioned Diffusion","summary":" Benefiting from the rapid development of 2D diffusion models, 3D content\ncreation has made significant progress recently. One promising solution\ninvolves the fine-tuning of pre-trained 2D diffusion models to harness their\ncapacity for producing multi-view images, which are then lifted into accurate\n3D models via methods like fast-NeRFs or large reconstruction models. However,\nas inconsistency still exists and limited generated resolution, the generation\nresults of such methods still lack intricate textures and complex geometries.\nTo solve this problem, we propose Magic-Boost, a multi-view conditioned\ndiffusion model that significantly refines coarse generative results through a\nbrief period of SDS optimization ($\\sim15$min). Compared to the previous text\nor single image based diffusion models, Magic-Boost exhibits a robust\ncapability to generate images with high consistency from pseudo synthesized\nmulti-view images. It provides precise SDS guidance that well aligns with the\nidentity of the input images, enriching the local detail in both geometry and\ntexture of the initial generative results. 
Extensive experiments show\nMagic-Boost greatly enhances the coarse inputs and generates high-quality 3D\nassets with rich geometric and textural details. (Project Page:\nhttps://magic-research.github.io/magic-boost/)\n","authors":["Fan Yang","Jianfeng Zhang","Yichun Shi","Bowen Chen","Chenxu Zhang","Huichao Zhang","Xiaofeng Yang","Jiashi Feng","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2404.06429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06425v1","updated":"2024-04-09T16:15:03Z","published":"2024-04-09T16:15:03Z","title":"ZeST: Zero-Shot Material Transfer from a Single Image","summary":" We propose ZeST, a method for zero-shot material transfer to an object in the\ninput image given a material exemplar image. ZeST leverages existing diffusion\nadapters to extract implicit material representation from the exemplar image.\nThis representation is used to transfer the material using pre-trained\ninpainting diffusion model on the object in the input image using depth\nestimates as geometry cue and grayscale object shading as illumination cues.\nThe method works on real images without any training resulting a zero-shot\napproach. Both qualitative and quantitative results on real and synthetic\ndatasets demonstrate that ZeST outputs photorealistic images with transferred\nmaterials. We also show the application of ZeST to perform multiple edits and\nrobust material assignment under different illuminations. Project Page:\nhttps://ttchengab.github.io/zest\n","authors":["Ta-Ying Cheng","Prafull Sharma","Andrew Markham","Niki Trigoni","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2404.06425v1.pdf","comment":"Project Page: https://ttchengab.github.io/zest"},{"id":"http://arxiv.org/abs/2404.06406v1","updated":"2024-04-09T15:54:03Z","published":"2024-04-09T15:54:03Z","title":"Emergent Dynamics in Neural Cellular Automata","summary":" Neural Cellular Automata (NCA) models are trainable variations of traditional\nCellular Automata (CA). Emergent motion in the patterns created by NCA has been\nsuccessfully applied to synthesize dynamic textures. However, the conditions\nrequired for an NCA to display dynamic patterns remain unexplored. Here, we\ninvestigate the relationship between the NCA architecture and the emergent\ndynamics of the trained models. Specifically, we vary the number of channels in\nthe cell state and the number of hidden neurons in the MultiLayer Perceptron\n(MLP), and draw a relationship between the combination of these two variables\nand the motion strength between successive frames. Our analysis reveals that\nthe disparity and proportionality between these two variables have a strong\ncorrelation with the emergent dynamics in the NCA output. We thus propose a\ndesign principle for creating dynamic NCA.\n","authors":["Yitao Xu","Ehsan Pajouheshgar","Sabine Süsstrunk"],"pdf_url":"https://arxiv.org/pdf/2404.06406v1.pdf","comment":"2 pages"},{"id":"http://arxiv.org/abs/2312.09168v3","updated":"2024-04-09T15:47:56Z","published":"2023-12-14T17:34:53Z","title":"DiffusionLight: Light Probes for Free by Painting a Chrome Ball","summary":" We present a simple yet effective technique to estimate lighting in a single\ninput image. Current techniques rely heavily on HDR panorama datasets to train\nneural networks to regress an input with limited field-of-view to a full\nenvironment map. 
However, these approaches often struggle with real-world,\nuncontrolled settings due to the limited diversity and size of their datasets.\nTo address this problem, we leverage diffusion models trained on billions of\nstandard images to render a chrome ball into the input image. Despite its\nsimplicity, this task remains challenging: the diffusion models often insert\nincorrect or inconsistent objects and cannot readily generate images in HDR\nformat. Our research uncovers a surprising relationship between the appearance\nof chrome balls and the initial diffusion noise map, which we utilize to\nconsistently generate high-quality chrome balls. We further fine-tune an LDR\ndiffusion model (Stable Diffusion XL) with LoRA, enabling it to perform\nexposure bracketing for HDR light estimation. Our method produces convincing\nlight estimates across diverse settings and demonstrates superior\ngeneralization to in-the-wild scenarios.\n","authors":["Pakkapon Phongthawee","Worameth Chinchuthakun","Nontaphat Sinsunthithet","Amit Raj","Varun Jampani","Pramook Khungurn","Supasorn Suwajanakorn"],"pdf_url":"https://arxiv.org/pdf/2312.09168v3.pdf","comment":"CVPR 2024 Oral. For more information and code, please visit our\n website https://diffusionlight.github.io/"},{"id":"http://arxiv.org/abs/2204.03330v2","updated":"2024-04-09T15:44:05Z","published":"2022-04-07T09:56:36Z","title":"Learning Local and Global Temporal Contexts for Video Semantic\n Segmentation","summary":" Contextual information plays a core role for video semantic segmentation\n(VSS). This paper summarizes contexts for VSS in two-fold: local temporal\ncontexts (LTC) which define the contexts from neighboring frames, and global\ntemporal contexts (GTC) which represent the contexts from the whole video. As\nfor LTC, it includes static and motional contexts, corresponding to static and\nmoving content in neighboring frames, respectively. Previously, both static and\nmotional contexts have been studied. However, there is no research about\nsimultaneously learning static and motional contexts (highly complementary).\nHence, we propose a Coarse-to-Fine Feature Mining (CFFM) technique to learn a\nunified presentation of LTC. CFFM contains two parts: Coarse-to-Fine Feature\nAssembling (CFFA) and Cross-frame Feature Mining (CFM). CFFA abstracts static\nand motional contexts, and CFM mines useful information from nearby frames to\nenhance target features. To further exploit more temporal contexts, we propose\nCFFM++ by additionally learning GTC from the whole video. Specifically, we\nuniformly sample certain frames from the video and extract global contextual\nprototypes by k-means. The information within those prototypes is mined by CFM\nto refine target features. Experimental results on popular benchmarks\ndemonstrate that CFFM and CFFM++ perform favorably against state-of-the-art\nmethods. Our code is available at https://github.com/GuoleiSun/VSS-CFFM\n","authors":["Guolei Sun","Yun Liu","Henghui Ding","Min Wu","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2204.03330v2.pdf","comment":"Accepted to TPAMI, an extended version of a paper published in CVPR\n 2022"},{"id":"http://arxiv.org/abs/2401.16110v2","updated":"2024-04-09T15:33:10Z","published":"2024-01-29T12:31:13Z","title":"SGV3D:Towards Scenario Generalization for Vision-based Roadside 3D\n Object Detection","summary":" Roadside perception can greatly increase the safety of autonomous vehicles by\nextending their perception ability beyond the visual range and addressing blind\nspots. 
However, current state-of-the-art vision-based roadside detection\nmethods possess high accuracy on labeled scenes but have inferior performance\non new scenes. This is because roadside cameras remain stationary after\ninstallation and can only collect data from a single scene, resulting in the\nalgorithm overfitting these roadside backgrounds and camera poses. To address\nthis issue, in this paper, we propose an innovative Scenario Generalization\nFramework for Vision-based Roadside 3D Object Detection, dubbed SGV3D.\nSpecifically, we employ a Background-suppressed Module (BSM) to mitigate\nbackground overfitting in vision-centric pipelines by attenuating background\nfeatures during the 2D to bird's-eye-view projection. Furthermore, by\nintroducing the Semi-supervised Data Generation Pipeline (SSDG) using unlabeled\nimages from new scenes, diverse instance foregrounds with varying camera poses\nare generated, addressing the risk of overfitting specific camera poses. We\nevaluate our method on two large-scale roadside benchmarks. Our method\nsurpasses all previous methods by a significant margin in new scenes, including\n+42.57% for vehicle, +5.87% for pedestrian, and +14.89% for cyclist compared to\nBEVHeight on the DAIR-V2X-I heterologous benchmark. On the larger-scale Rope3D\nheterologous benchmark, we achieve notable gains of 14.48% for car and 12.41%\nfor large vehicle. We aspire to contribute insights on the exploration of\nroadside perception techniques, emphasizing their capability for scenario\ngeneralization. The code will be available at\nhttps://github.com/yanglei18/SGV3D\n","authors":["Lei Yang","Xinyu Zhang","Jun Li","Li Wang","Chuang Zhang","Li Ju","Zhiwei Li","Yang Shen"],"pdf_url":"https://arxiv.org/pdf/2401.16110v2.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.06389v1","updated":"2024-04-09T15:31:48Z","published":"2024-04-09T15:31:48Z","title":"Raster Forge: Interactive Raster Manipulation Library and GUI for Python","summary":" Raster Forge is a Python library and graphical user interface for raster data\nmanipulation and analysis. The tool is focused on remote sensing applications,\nparticularly in wildfire management. It allows users to import, visualize, and\nprocess raster layers for tasks such as image compositing or topographical\nanalysis. For wildfire management, it generates fuel maps using predefined\nmodels. Its impact extends from disaster management to hydrological modeling,\nagriculture, and environmental monitoring. Raster Forge can be a valuable asset\nfor geoscientists and researchers who rely on raster data analysis, enhancing\ngeospatial data processing and visualization across various disciplines.\n","authors":["Afonso Oliveira","Nuno Fachada","João P. Matos-Carvalho"],"pdf_url":"https://arxiv.org/pdf/2404.06389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20330v2","updated":"2024-04-09T15:17:50Z","published":"2024-03-29T17:59:34Z","title":"Are We on the Right Way for Evaluating Large Vision-Language Models?","summary":" Large vision-language models (LVLMs) have recently achieved rapid progress,\nsparking numerous studies to evaluate their multi-modal capabilities. However,\nwe dig into current evaluation works and identify two primary issues: 1) Visual\ncontent is unnecessary for many samples. The answers can be directly inferred\nfrom the questions and options, or the world knowledge embedded in LLMs. This\nphenomenon is prevalent across current benchmarks. 
For instance, GeminiPro\nachieves 42.9% on the MMMU benchmark without any visual input, and outperforms\nthe random choice baseline across six benchmarks over 24% on average. 2)\nUnintentional data leakage exists in LLM and LVLM training. LLM and LVLM could\nstill answer some visual-necessary questions without visual content, indicating\nthe memorizing of these samples within large-scale training data. For example,\nSphinx-X-MoE gets 43.6% on MMMU without accessing images, surpassing its LLM\nbackbone with 17.9%. Both problems lead to misjudgments of actual multi-modal\ngains and potentially misguide the study of LVLM. To this end, we present\nMMStar, an elite vision-indispensable multi-modal benchmark comprising 1,500\nsamples meticulously selected by humans. MMStar benchmarks 6 core capabilities\nand 18 detailed axes, aiming to evaluate LVLMs' multi-modal capacities with\ncarefully balanced and purified samples. These samples are first roughly\nselected from current benchmarks with an automated pipeline, human review is\nthen involved to ensure each curated sample exhibits visual dependency, minimal\ndata leakage, and requires advanced multi-modal capabilities. Moreover, two\nmetrics are developed to measure data leakage and actual performance gain in\nmulti-modal training. We evaluate 16 leading LVLMs on MMStar to assess their\nmulti-modal capabilities, and on 7 benchmarks with the proposed metrics to\ninvestigate their data leakage and actual multi-modal gain.\n","authors":["Lin Chen","Jinsong Li","Xiaoyi Dong","Pan Zhang","Yuhang Zang","Zehui Chen","Haodong Duan","Jiaqi Wang","Yu Qiao","Dahua Lin","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.20330v2.pdf","comment":"Project page: https://mmstar-benchmark.github.io/"},{"id":"http://arxiv.org/abs/2403.04198v2","updated":"2024-04-09T15:07:08Z","published":"2024-03-07T03:59:47Z","title":"CN-RMA: Combined Network with Ray Marching Aggregation for 3D Indoors\n Object Detection from Multi-view Images","summary":" This paper introduces CN-RMA, a novel approach for 3D indoor object detection\nfrom multi-view images. We observe the key challenge as the ambiguity of image\nand 3D correspondence without explicit geometry to provide occlusion\ninformation. To address this issue, CN-RMA leverages the synergy of 3D\nreconstruction networks and 3D object detection networks, where the\nreconstruction network provides a rough Truncated Signed Distance Function\n(TSDF) and guides image features to vote to 3D space correctly in an end-to-end\nmanner. Specifically, we associate weights to sampled points of each ray\nthrough ray marching, representing the contribution of a pixel in an image to\ncorresponding 3D locations. Such weights are determined by the predicted signed\ndistances so that image features vote only to regions near the reconstructed\nsurface. Our method achieves state-of-the-art performance in 3D object\ndetection from multi-view images, as measured by mAP@0.25 and mAP@0.5 on the\nScanNet and ARKitScenes datasets. 
The code and models are released at\nhttps://github.com/SerCharles/CN-RMA.\n","authors":["Guanlin Shen","Jingwei Huang","Zhihua Hu","Bin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.04198v2.pdf","comment":"CVPR2024 poster paper, 8 pages of main part, and 4 pages of\n supplementary material"},{"id":"http://arxiv.org/abs/2311.06798v2","updated":"2024-04-09T15:07:02Z","published":"2023-11-12T10:21:04Z","title":"MetaMix: Meta-state Precision Searcher for Mixed-precision Activation\n Quantization","summary":" Mixed-precision quantization of efficient networks often suffer from\nactivation instability encountered in the exploration of bit selections. To\naddress this problem, we propose a novel method called MetaMix which consists\nof bit selection and weight training phases. The bit selection phase iterates\ntwo steps, (1) the mixed-precision-aware weight update, and (2) the bit-search\ntraining with the fixed mixed-precision-aware weights, both of which combined\nreduce activation instability in mixed-precision quantization and contribute to\nfast and high-quality bit selection. The weight training phase exploits the\nweights and step sizes trained in the bit selection phase and fine-tunes them\nthereby offering fast training. Our experiments with efficient and\nhard-to-quantize networks, i.e., MobileNet v2 and v3, and ResNet-18 on ImageNet\nshow that our proposed method pushes the boundary of mixed-precision\nquantization, in terms of accuracy vs. operations, by outperforming both mixed-\nand single-precision SOTA methods.\n","authors":["Han-Byul Kim","Joo Hyung Lee","Sungjoo Yoo","Hong-Seok Kim"],"pdf_url":"https://arxiv.org/pdf/2311.06798v2.pdf","comment":"Proc. The 38th Annual AAAI Conference on Artificial Intelligence\n (AAAI)"},{"id":"http://arxiv.org/abs/2404.06369v1","updated":"2024-04-09T15:05:48Z","published":"2024-04-09T15:05:48Z","title":"VISION2UI: A Real-World Dataset with Layout for Code Generation from UI\n Designs","summary":" Automatically generating UI code from webpage design visions can\nsignificantly alleviate the burden of developers, enabling beginner developers\nor designers to directly generate Web pages from design diagrams. Currently,\nprior research has accomplished the objective of generating UI code from\nrudimentary design visions or sketches through designing deep neural networks.\nInspired by the groundbreaking advancements achieved by Multimodal Large\nLanguage Models (MLLMs), the automatic generation of UI code from high-fidelity\ndesign images is now emerging as a viable possibility. Nevertheless, our\ninvestigation reveals that existing MLLMs are hampered by the scarcity of\nauthentic, high-quality, and large-scale datasets, leading to unsatisfactory\nperformance in automated UI code generation. To mitigate this gap, we present a\nnovel dataset, termed VISION2UI, extracted from real-world scenarios, augmented\nwith comprehensive layout information, tailored specifically for finetuning\nMLLMs in UI code generation. Specifically, this dataset is derived through a\nseries of operations, encompassing collecting, cleaning, and filtering of the\nopen-source Common Crawl dataset. In order to uphold its quality, a neural\nscorer trained on labeled samples is utilized to refine the data, retaining\nhigher-quality instances. Ultimately, this process yields a dataset comprising\n2,000 (Much more is coming soon) parallel samples encompassing design visions\nand UI code. 
The dataset is available at\nhttps://huggingface.co/datasets/xcodemind/vision2ui.\n","authors":["Yi Gui","Zhen Li","Yao Wan","Yemin Shi","Hongyu Zhang","Yi Su","Shaoling Dong","Xing Zhou","Wenbin Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.06369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06365v1","updated":"2024-04-09T15:02:01Z","published":"2024-04-09T15:02:01Z","title":"Dynamic Resolution Guidance for Facial Expression Recognition","summary":" Facial expression recognition (FER) is vital for human-computer interaction\nand emotion analysis, yet recognizing expressions in low-resolution images\nremains challenging. This paper introduces a practical method called Dynamic\nResolution Guidance for Facial Expression Recognition (DRGFER) to effectively\nrecognize facial expressions in images with varying resolutions without\ncompromising FER model accuracy. Our framework comprises two main components:\nthe Resolution Recognition Network (RRN) and the Multi-Resolution Adaptation\nFacial Expression Recognition Network (MRAFER). The RRN determines image\nresolution, outputs a binary vector, and the MRAFER assigns images to suitable\nfacial expression recognition networks based on resolution. We evaluated DRGFER\non widely-used datasets RAFDB and FERPlus, demonstrating that our method\nretains optimal model performance at each resolution and outperforms\nalternative resolution approaches. The proposed framework exhibits robustness\nagainst resolution variations and facial expressions, offering a promising\nsolution for real-world applications.\n","authors":["Jie Ou","Xu Li","Tianxiang Jiang","Yuanlun Xie"],"pdf_url":"https://arxiv.org/pdf/2404.06365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06362v1","updated":"2024-04-09T14:56:34Z","published":"2024-04-09T14:56:34Z","title":"Test-Time Adaptation with SaLIP: A Cascade of SAM and CLIP for Zero shot\n Medical Image Segmentation","summary":" The Segment Anything Model (SAM) and CLIP are remarkable vision foundation\nmodels (VFMs). SAM, a prompt driven segmentation model, excels in segmentation\ntasks across diverse domains, while CLIP is renowned for its zero shot\nrecognition capabilities. However, their unified potential has not yet been\nexplored in medical image segmentation. To adapt SAM to medical imaging,\nexisting methods primarily rely on tuning strategies that require extensive\ndata or prior prompts tailored to the specific task, making it particularly\nchallenging when only a limited number of data samples are available. This work\npresents an in depth exploration of integrating SAM and CLIP into a unified\nframework for medical image segmentation. Specifically, we propose a simple\nunified framework, SaLIP, for organ segmentation. Initially, SAM is used for\npart based segmentation within the image, followed by CLIP to retrieve the mask\ncorresponding to the region of interest (ROI) from the pool of SAM generated\nmasks. Finally, SAM is prompted by the retrieved ROI to segment a specific\norgan. Thus, SaLIP is training and fine tuning free and does not rely on domain\nexpertise or labeled data for prompt engineering. Our method shows substantial\nenhancements in zero shot segmentation, showcasing notable improvements in DICE\nscores across diverse segmentation tasks like brain (63.46%), lung (50.11%),\nand fetal head (30.82%), when compared to un prompted SAM. 
Code and text\nprompts will be available online.\n","authors":["Sidra Aleem","Fangyijie Wang","Mayug Maniparambil","Eric Arazo","Julia Dietlmeier","Kathleen Curran","Noel E. O'Connor","Suzanne Little"],"pdf_url":"https://arxiv.org/pdf/2404.06362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06353v1","updated":"2024-04-09T14:44:12Z","published":"2024-04-09T14:44:12Z","title":"High Noise Scheduling is a Must","summary":" Consistency models possess high capabilities for image generation, advancing\nsampling steps to a single step through their advanced techniques. Current\nadvancements move one step forward consistency training techniques and\neliminates the limitation of distillation training. Even though the proposed\ncurriculum and noise scheduling in improved training techniques yield better\nresults than basic consistency models, it lacks well balanced noise\ndistribution and its consistency between curriculum. In this study, it is\ninvestigated the balance between high and low noise levels in noise\ndistribution and offered polynomial noise distribution to maintain the\nstability. This proposed polynomial noise distribution is also supported with a\npredefined Karras noises to prevent unique noise levels arises with Karras\nnoise generation algorithm. Furthermore, by elimination of learned noisy steps\nwith a curriculum based on sinusoidal function increase the performance of the\nmodel in denoising. To make a fair comparison with the latest released\nconsistency model training techniques, experiments are conducted with same\nhyper-parameters except curriculum and noise distribution. The models utilized\nduring experiments are determined with low depth to prove the robustness of our\nproposed technique. The results show that the polynomial noise distribution\noutperforms the model trained with log-normal noise distribution, yielding a\n33.54 FID score after 100,000 training steps with constant discretization\nsteps. Additionally, the implementation of a sinusoidal-based curriculum\nenhances denoising performance, resulting in a FID score of 30.48.\n","authors":["Mahmut S. Gokmen","Cody Bumgardner","Jie Zhang","Ge Wang","Jin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06352v1","updated":"2024-04-09T14:43:19Z","published":"2024-04-09T14:43:19Z","title":"DaF-BEVSeg: Distortion-aware Fisheye Camera based Bird's Eye View\n Segmentation with Occlusion Reasoning","summary":" Semantic segmentation is an effective way to perform scene understanding.\nRecently, segmentation in 3D Bird's Eye View (BEV) space has become popular as\nits directly used by drive policy. However, there is limited work on BEV\nsegmentation for surround-view fisheye cameras, commonly used in commercial\nvehicles. As this task has no real-world public dataset and existing synthetic\ndatasets do not handle amodal regions due to occlusion, we create a synthetic\ndataset using the Cognata simulator comprising diverse road types, weather, and\nlighting conditions. We generalize the BEV segmentation to work with any camera\nmodel; this is useful for mixing diverse cameras. We implement a baseline by\napplying cylindrical rectification on the fisheye images and using a standard\nLSS-based BEV segmentation model. 
We demonstrate that we can achieve better\nperformance without undistortion, which has the adverse effects of increased\nruntime due to pre-processing, reduced field-of-view, and resampling artifacts.\nFurther, we introduce a distortion-aware learnable BEV pooling strategy that is\nmore effective for the fisheye cameras. We extend the model with an occlusion\nreasoning module, which is critical for estimating in BEV space. Qualitative\nperformance of DaF-BEVSeg is showcased in the video at\nhttps://streamable.com/ge4v51.\n","authors":["Senthil Yogamani","David Unger","Venkatraman Narayanan","Varun Ravi Kumar"],"pdf_url":"https://arxiv.org/pdf/2404.06352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06351v1","updated":"2024-04-09T14:42:31Z","published":"2024-04-09T14:42:31Z","title":"HPNet: Dynamic Trajectory Forecasting with Historical Prediction\n Attention","summary":" Predicting the trajectories of road agents is essential for autonomous\ndriving systems. The recent mainstream methods follow a static paradigm, which\npredicts the future trajectory by using a fixed duration of historical frames.\nThese methods make the predictions independently even at adjacent time steps,\nwhich leads to potential instability and temporal inconsistency. As successive\ntime steps have largely overlapping historical frames, their forecasting should\nhave intrinsic correlation, such as overlapping predicted trajectories should\nbe consistent, or be different but share the same motion goal depending on the\nroad situation. Motivated by this, in this work, we introduce HPNet, a novel\ndynamic trajectory forecasting method. Aiming for stable and accurate\ntrajectory forecasting, our method leverages not only historical frames\nincluding maps and agent states, but also historical predictions. Specifically,\nwe newly design a Historical Prediction Attention module to automatically\nencode the dynamic relationship between successive predictions. Besides, it\nalso extends the attention range beyond the currently visible window\nbenefitting from the use of historical predictions. The proposed Historical\nPrediction Attention together with the Agent Attention and Mode Attention is\nfurther formulated as the Triple Factorized Attention module, serving as the\ncore design of HPNet.Experiments on the Argoverse and INTERACTION datasets show\nthat HPNet achieves state-of-the-art performance, and generates accurate and\nstable future trajectories. Our code are available at\nhttps://github.com/XiaolongTang23/HPNet.\n","authors":["Xiaolong Tang","Meina Kan","Shiguang Shan","Zhilong Ji","Jinfeng Bai","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06351v1.pdf","comment":"accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.06350v1","updated":"2024-04-09T14:40:54Z","published":"2024-04-09T14:40:54Z","title":"Rolling Shutter Correction with Intermediate Distortion Flow Estimation","summary":" This paper proposes to correct the rolling shutter (RS) distorted images by\nestimating the distortion flow from the global shutter (GS) to RS directly.\nExisting methods usually perform correction using the undistortion flow from\nthe RS to GS. They initially predict the flow from consecutive RS frames,\nsubsequently rescaling it as the displacement fields from the RS frame to the\nunderlying GS image using time-dependent scaling factors. Following this,\nRS-aware forward warping is employed to convert the RS image into its GS\ncounterpart. Nevertheless, this strategy is prone to two shortcomings. 
First,\nthe undistortion flow estimation is rendered inaccurate by merely linear\nscaling the flow, due to the complex non-linear motion nature. Second, RS-aware\nforward warping often results in unavoidable artifacts. To address these\nlimitations, we introduce a new framework that directly estimates the\ndistortion flow and rectifies the RS image with the backward warping operation.\nMore specifically, we first propose a global correlation-based flow attention\nmechanism to estimate the initial distortion flow and GS feature jointly, which\nare then refined by the following coarse-to-fine decoder layers. Additionally,\na multi-distortion flow prediction strategy is integrated to mitigate the issue\nof inaccurate flow estimation further. Experimental results validate the\neffectiveness of the proposed method, which outperforms state-of-the-art\napproaches on various benchmarks while maintaining high efficiency. The project\nis available at \\url{https://github.com/ljzycmd/DFRSC}.\n","authors":["Mingdeng Cao","Sidi Yang","Yujiu Yang","Yinqiang Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.06350v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2403.20035v2","updated":"2024-04-09T14:29:10Z","published":"2024-03-29T08:03:42Z","title":"UltraLight VM-UNet: Parallel Vision Mamba Significantly Reduces\n Parameters for Skin Lesion Segmentation","summary":" Traditionally for improving the segmentation performance of models, most\napproaches prefer to use adding more complex modules. And this is not suitable\nfor the medical field, especially for mobile medical devices, where\ncomputationally loaded models are not suitable for real clinical environments\ndue to computational resource constraints. Recently, state-space models (SSMs),\nrepresented by Mamba, have become a strong competitor to traditional CNNs and\nTransformers. In this paper, we deeply explore the key elements of parameter\ninfluence in Mamba and propose an UltraLight Vision Mamba UNet (UltraLight\nVM-UNet) based on this. Specifically, we propose a method for processing\nfeatures in parallel Vision Mamba, named PVM Layer, which achieves excellent\nperformance with the lowest computational load while keeping the overall number\nof processing channels constant. We conducted comparisons and ablation\nexperiments with several state-of-the-art lightweight models on three skin\nlesion public datasets and demonstrated that the UltraLight VM-UNet exhibits\nthe same strong performance competitiveness with parameters of only 0.049M and\nGFLOPs of 0.060. In addition, this study deeply explores the key elements of\nparameter influence in Mamba, which will lay a theoretical foundation for Mamba\nto possibly become a new mainstream module for lightweighting in the future.\nThe code is available from https://github.com/wurenkai/UltraLight-VM-UNet .\n","authors":["Renkai Wu","Yinghao Liu","Pengchen Liang","Qing Chang"],"pdf_url":"https://arxiv.org/pdf/2403.20035v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06337v1","updated":"2024-04-09T14:22:50Z","published":"2024-04-09T14:22:50Z","title":"Matching 2D Images in 3D: Metric Relative Pose from Metric\n Correspondences","summary":" Given two images, we can estimate the relative camera pose between them by\nestablishing image-to-image correspondences. Usually, correspondences are\n2D-to-2D and the pose we estimate is defined only up to scale. 
Some\napplications, aiming at instant augmented reality anywhere, require\nscale-metric pose estimates, and hence, they rely on external depth estimators\nto recover the scale. We present MicKey, a keypoint matching pipeline that is\nable to predict metric correspondences in 3D camera space. By learning to match\n3D coordinates across images, we are able to infer the metric relative pose\nwithout depth measurements. Depth measurements are also not required for\ntraining, nor are scene reconstructions or image overlap information. MicKey is\nsupervised only by pairs of images and their relative poses. MicKey achieves\nstate-of-the-art performance on the Map-Free Relocalisation benchmark while\nrequiring less supervision than competing approaches.\n","authors":["Axel Barroso-Laguna","Sowmya Munukutla","Victor Adrian Prisacariu","Eric Brachmann"],"pdf_url":"https://arxiv.org/pdf/2404.06337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04818v4","updated":"2024-04-09T14:15:32Z","published":"2023-11-08T16:42:14Z","title":"Cross-Silo Federated Learning Across Divergent Domains with Iterative\n Parameter Alignment","summary":" Learning from the collective knowledge of data dispersed across private\nsources can provide neural networks with enhanced generalization capabilities.\nFederated learning, a method for collaboratively training a machine learning\nmodel across remote clients, achieves this by combining client models via the\norchestration of a central server. However, current approaches face two\ncritical limitations: i) they struggle to converge when client domains are\nsufficiently different, and ii) current aggregation techniques produce an\nidentical global model for each client. In this work, we address these issues\nby reformulating the typical federated learning setup: rather than learning a\nsingle global model, we learn N models each optimized for a common objective.\nTo achieve this, we apply a weighted distance minimization to model parameters\nshared in a peer-to-peer topology. The resulting framework, Iterative Parameter\nAlignment, applies naturally to the cross-silo setting, and has the following\nproperties: (i) a unique solution for each participant, with the option to\nglobally converge each model in the federation, and (ii) an optional\nearly-stopping mechanism to elicit fairness among peers in collaborative\nlearning settings. These characteristics jointly provide a flexible new\nframework for iteratively learning from peer models trained on disparate\ndatasets. We find that the technique achieves competitive results on a variety\nof data partitions compared to state-of-the-art approaches. Further, we show\nthat the method is robust to divergent domains (i.e. disjoint classes across\npeers) where existing approaches struggle.\n","authors":["Matt Gorbett","Hossein Shirazi","Indrakshi Ray"],"pdf_url":"https://arxiv.org/pdf/2311.04818v4.pdf","comment":"Published at IEEE Big Data 2023"},{"id":"http://arxiv.org/abs/2402.18078v2","updated":"2024-04-09T14:12:02Z","published":"2024-02-28T06:07:07Z","title":"Coarse-to-Fine Latent Diffusion for Pose-Guided Person Image Synthesis","summary":" Diffusion model is a promising approach to image generation and has been\nemployed for Pose-Guided Person Image Synthesis (PGPIS) with competitive\nperformance. While existing methods simply align the person appearance to the\ntarget pose, they are prone to overfitting due to the lack of a high-level\nsemantic understanding on the source person image. 
In this paper, we propose a\nnovel Coarse-to-Fine Latent Diffusion (CFLD) method for PGPIS. In the absence\nof image-caption pairs and textual prompts, we develop a novel training\nparadigm purely based on images to control the generation process of a\npre-trained text-to-image diffusion model. A perception-refined decoder is\ndesigned to progressively refine a set of learnable queries and extract\nsemantic understanding of person images as a coarse-grained prompt. This allows\nfor the decoupling of fine-grained appearance and pose information controls at\ndifferent stages, and thus circumventing the potential overfitting problem. To\ngenerate more realistic texture details, a hybrid-granularity attention module\nis proposed to encode multi-scale fine-grained appearance features as bias\nterms to augment the coarse-grained prompt. Both quantitative and qualitative\nexperimental results on the DeepFashion benchmark demonstrate the superiority\nof our method over the state of the arts for PGPIS. Code is available at\nhttps://github.com/YanzuoLu/CFLD.\n","authors":["Yanzuo Lu","Manlin Zhang","Andy J Ma","Xiaohua Xie","Jian-Huang Lai"],"pdf_url":"https://arxiv.org/pdf/2402.18078v2.pdf","comment":"Accepted by CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2401.01558v2","updated":"2024-04-09T13:59:18Z","published":"2024-01-03T06:18:30Z","title":"One-Step Late Fusion Multi-view Clustering with Compressed Subspace","summary":" Late fusion multi-view clustering (LFMVC) has become a rapidly growing class\nof methods in the multi-view clustering (MVC) field, owing to its excellent\ncomputational speed and clustering performance. One bottleneck faced by\nexisting late fusion methods is that they are usually aligned to the average\nkernel function, which makes the clustering performance highly dependent on the\nquality of datasets. Another problem is that they require subsequent k-means\nclustering after obtaining the consensus partition matrix to get the final\ndiscrete labels, and the resulting separation of the label learning and cluster\nstructure optimization processes limits the integrity of these models. To\naddress the above issues, we propose an integrated framework named One-Step\nLate Fusion Multi-view Clustering with Compressed Subspace (OS-LFMVC-CS).\nSpecifically, we use the consensus subspace to align the partition matrix while\noptimizing the partition fusion, and utilize the fused partition matrix to\nguide the learning of discrete labels. A six-step iterative optimization\napproach with verified convergence is proposed. Sufficient experiments on\nmultiple datasets validate the effectiveness and efficiency of our proposed\nmethod.\n","authors":["Qiyuan Ou","Pei Zhang","Sihang Zhou","En Zhu"],"pdf_url":"https://arxiv.org/pdf/2401.01558v2.pdf","comment":"Accepted by ICASSP2024"},{"id":"http://arxiv.org/abs/2403.17881v2","updated":"2024-04-09T13:56:06Z","published":"2024-03-26T17:12:34Z","title":"Deepfake Generation and Detection: A Benchmark and Survey","summary":" In addition to the advancements in deepfake generation, corresponding\ndetection technologies need to continuously evolve to regulate the potential\nmisuse of deepfakes, such as for privacy invasion and phishing attacks. This\nsurvey comprehensively reviews the latest developments in deepfake generation\nand detection, summarizing and analyzing the current state of the art in this\nrapidly evolving field. 
We first unify task definitions, comprehensively\nintroduce datasets and metrics, and discuss the development of generation and\ndetection technology frameworks. Then, we discuss the development of several\nrelated sub-fields and focus on researching four mainstream deepfake fields:\npopular face swap, face reenactment, talking face generation, and facial\nattribute editing, as well as foreign detection. Subsequently, we\ncomprehensively benchmark representative methods on popular datasets for each\nfield, fully evaluating the latest and influential works published in top\nconferences/journals. Finally, we analyze the challenges and future research\ndirections of the discussed fields. We closely follow the latest developments\nin https://github.com/flyingby/Awesome-Deepfake-Generation-and-Detection.\n","authors":["Gan Pei","Jiangning Zhang","Menghan Hu","Zhenyu Zhang","Chengjie Wang","Yunsheng Wu","Guangtao Zhai","Jian Yang","Chunhua Shen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.17881v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05334v3","updated":"2024-04-09T13:54:48Z","published":"2023-09-11T09:32:45Z","title":"MultIOD: Rehearsal-free Multihead Incremental Object Detector","summary":" Class-Incremental learning (CIL) refers to the ability of artificial agents\nto integrate new classes as they appear in a stream. It is particularly\ninteresting in evolving environments where agents have limited access to memory\nand computational resources. The main challenge of incremental learning is\ncatastrophic forgetting, the inability of neural networks to retain past\nknowledge when learning a new one. Unfortunately, most existing\nclass-incremental methods for object detection are applied to two-stage\nalgorithms such as Faster-RCNN, and rely on rehearsal memory to retain past\nknowledge. We argue that those are not suitable in resource-limited\nenvironments, and more effort should be dedicated to anchor-free and\nrehearsal-free object detection. In this paper, we propose MultIOD, a\nclass-incremental object detector based on CenterNet. Our contributions are:\n(1) we propose a multihead feature pyramid and multihead detection architecture\nto efficiently separate class representations, (2) we employ transfer learning\nbetween classes learned initially and those learned incrementally to tackle\ncatastrophic forgetting, and (3) we use a class-wise non-max-suppression as a\npost-processing technique to remove redundant boxes. Results show that our\nmethod outperforms state-of-the-art methods on two Pascal VOC datasets, while\nonly saving the model in its current state, contrary to other\ndistillation-based counterparts.\n","authors":["Eden Belouadah","Arnaud Dapogny","Kevin Bailly"],"pdf_url":"https://arxiv.org/pdf/2309.05334v3.pdf","comment":"Accepted at the archival track of the Workshop on Continual Learning\n in Computer Vision (CVPR 2024)"},{"id":"http://arxiv.org/abs/2401.17053v3","updated":"2024-04-09T13:47:18Z","published":"2024-01-30T14:34:19Z","title":"BlockFusion: Expandable 3D Scene Generation using Latent Tri-plane\n Extrapolation","summary":" We present BlockFusion, a diffusion-based model that generates 3D scenes as\nunit blocks and seamlessly incorporates new blocks to extend the scene.\nBlockFusion is trained using datasets of 3D blocks that are randomly cropped\nfrom complete 3D scene meshes. 
Through per-block fitting, all training blocks\nare converted into the hybrid neural fields: with a tri-plane containing the\ngeometry features, followed by a Multi-layer Perceptron (MLP) for decoding the\nsigned distance values. A variational auto-encoder is employed to compress the\ntri-planes into the latent tri-plane space, on which the denoising diffusion\nprocess is performed. Diffusion applied to the latent representations allows\nfor high-quality and diverse 3D scene generation. To expand a scene during\ngeneration, one needs only to append empty blocks to overlap with the current\nscene and extrapolate existing latent tri-planes to populate new blocks. The\nextrapolation is done by conditioning the generation process with the feature\nsamples from the overlapping tri-planes during the denoising iterations. Latent\ntri-plane extrapolation produces semantically and geometrically meaningful\ntransitions that harmoniously blend with the existing scene. A 2D layout\nconditioning mechanism is used to control the placement and arrangement of\nscene elements. Experimental results indicate that BlockFusion is capable of\ngenerating diverse, geometrically consistent and unbounded large 3D scenes with\nunprecedented high-quality shapes in both indoor and outdoor scenarios.\n","authors":["Zhennan Wu","Yang Li","Han Yan","Taizhang Shang","Weixuan Sun","Senbo Wang","Ruikai Cui","Weizhe Liu","Hiroyuki Sato","Hongdong Li","Pan Ji"],"pdf_url":"https://arxiv.org/pdf/2401.17053v3.pdf","comment":"Video: https://www.youtube.com/watch?v=PxIBtd6G0mA"},{"id":"http://arxiv.org/abs/2403.03309v4","updated":"2024-04-09T13:44:54Z","published":"2024-03-05T20:21:49Z","title":"Learning Zero-Shot Material States Segmentation, by Implanting Natural\n Image Patterns in Synthetic Data","summary":" Visual understanding and segmentation of materials and their states is\nfundamental to understanding the physical world. The myriad textures, shapes,\nand often blurry boundaries formed by materials make this task particularly\nhard to generalize. Whether it's identifying wet regions of a surface, minerals\nin rocks, infected regions in plants, or pollution in water, each material\nstate has its own unique form. For neural nets to learn general class-agnostic\nmaterial segmentation, it is necessary to first collect and annotate data that\ncaptures this complexity. Collecting and manually annotating real-world images\nis limited by the cost and precision of manual labor. In contrast, synthetic\nCGI data is highly accurate and almost cost-free, but fails to replicate the\nvast diversity of the material world. This work offers a method to bridge this\ncrucial gap by implanting patterns extracted from real-world images in\nsynthetic data. Hence, patterns automatically collected from natural images are\nused to map materials into synthetic scenes. This unsupervised approach allows\nthe generated data to capture the vast complexity of the real world while\nmaintaining the precision and scale of synthetic data. We also present the\nfirst general benchmark for zero-shot material state segmentation. The\nbenchmark contains a wide range of real-world images of material states, like\nfood, rocks, construction, plants, liquids, and many others, each in various\nstates (wet/dry/stained/cooked/burned/worn/rusted/sediment/foam, etc.). The\nannotation includes both partial similarity between regions with similar but\nnot identical materials, and hard segmentation of only points in the exact same\nmaterial state. 
We show that net trains on MatSeg significantly outperform\nexisting state-of-the-art methods on this task. The dataset, code, and trained\nmodel are available\n","authors":["Sagi Eppel","Jolina Li","Manuel Drehwald","Alan Aspuru-Guzik"],"pdf_url":"https://arxiv.org/pdf/2403.03309v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18171v5","updated":"2024-04-09T13:42:07Z","published":"2023-05-29T16:02:09Z","title":"Improved Probabilistic Image-Text Representations","summary":" Image-Text Matching (ITM) task, a fundamental vision-language (VL) task,\nsuffers from the inherent ambiguity arising from multiplicity and imperfect\nannotations. Deterministic functions are not sufficiently powerful to capture\nambiguity, prompting the exploration of probabilistic embeddings to tackle the\nchallenge. However, the existing probabilistic ITM approach encounters two key\nshortcomings; the burden of heavy computations due to the Monte Carlo\napproximation, and the loss saturation issue in the face of abundant false\nnegatives. To overcome the issues, this paper presents an improved\nProbabilistic Cross-Modal Embeddings (named PCME++) by introducing a new\nprobabilistic distance with a closed-form solution. In addition, two\noptimization techniques are proposed to enhance PCME++ further: first, the\nincorporation of pseudo-positives to prevent the negative effect under massive\nfalse negatives; second, mixed sample data augmentation for probabilistic\nmatching. Experimental results on MS-COCO Caption and two extended benchmarks,\nCxC and ECCV Caption, demonstrate the effectiveness of PCME++ compared to\nstate-of-the-art ITM methods. The robustness of PCME++ is also evaluated under\nnoisy image-text correspondences. In addition, the potential applicability of\nPCME++ in automatic prompt-filtering for zero-shot classification is shown. The\ncode is available at https://github.com/naver-ai/pcmepp\n","authors":["Sanghyuk Chun"],"pdf_url":"https://arxiv.org/pdf/2305.18171v5.pdf","comment":"ICLR 2024 camera-ready; Code: https://github.com/naver-ai/pcmepp.\n Project page: https://naver-ai.github.io/pcmepp/. 30 pages, 2.2 MB"},{"id":"http://arxiv.org/abs/2404.06309v1","updated":"2024-04-09T13:39:37Z","published":"2024-04-09T13:39:37Z","title":"Audio-Visual Generalized Zero-Shot Learning using Pre-Trained Large\n Multi-Modal Models","summary":" Audio-visual zero-shot learning methods commonly build on features extracted\nfrom pre-trained models, e.g. video or audio classification models. However,\nexisting benchmarks predate the popularization of large multi-modal models,\nsuch as CLIP and CLAP. In this work, we explore such large pre-trained models\nto obtain features, i.e. CLIP for visual features, and CLAP for audio features.\nFurthermore, the CLIP and CLAP text encoders provide class label embeddings\nwhich are combined to boost the performance of the system. We propose a simple\nyet effective model that only relies on feed-forward neural networks,\nexploiting the strong generalization capabilities of the new audio, visual and\ntextual features. Our framework achieves state-of-the-art performance on\nVGGSound-GZSL, UCF-GZSL, and ActivityNet-GZSL with our new features. Code and\ndata available at: https://github.com/dkurzend/ClipClap-GZSL.\n","authors":["David Kurzendörfer","Otniel-Bogdan Mercea","A. 
Sophia Koepke","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2404.06309v1.pdf","comment":"CVPRw 2024 (L3D-IVU)"},{"id":"http://arxiv.org/abs/2309.14265v2","updated":"2024-04-09T13:33:30Z","published":"2023-09-25T16:23:49Z","title":"Industrial Application of 6D Pose Estimation for Robotic Manipulation in\n Automotive Internal Logistics","summary":" Despite the advances in robotics a large proportion of the of parts handling\ntasks in the automotive industry's internal logistics are not automated but\nstill performed by humans. A key component to competitively automate these\nprocesses is a 6D pose estimation that can handle a large number of different\nparts, is adaptable to new parts with little manual effort, and is sufficiently\naccurate and robust with respect to industry requirements. In this context, the\nquestion arises as to the current status quo with respect to these measures. To\naddress this we built a representative 6D pose estimation pipeline with\nstate-of-the-art components from economically scalable real to synthetic data\ngeneration to pose estimators and evaluated it on automotive parts with regards\nto a realistic sequencing process. We found that using the data generation\napproaches, the performance of the trained 6D pose estimators are promising,\nbut do not meet industry requirements. We reveal that the reason for this is\nthe inability of the estimators to provide reliable uncertainties for their\nposes, rather than the ability of to provide sufficiently accurate poses. In\nthis context we further analyzed how RGB- and RGB-D-based approaches compare\nagainst this background and show that they are differently vulnerable to the\ndomain gap induced by synthetic data.\n","authors":["Philipp Quentin","Dino Knoll","Daniel Goehring"],"pdf_url":"https://arxiv.org/pdf/2309.14265v2.pdf","comment":"Accepted for publication at IEEE International Conference on\n Automation Science and Engineering (CASE 2023)"},{"id":"http://arxiv.org/abs/2212.04227v2","updated":"2024-04-09T13:30:15Z","published":"2022-12-08T12:20:35Z","title":"Self-training via Metric Learning for Source-Free Domain Adaptation of\n Semantic Segmentation","summary":" Unsupervised source-free domain adaptation methods aim to train a model for\nthe target domain utilizing a pretrained source-domain model and unlabeled\ntarget-domain data, particularly when accessibility to source data is\nrestricted due to intellectual property or privacy concerns. Traditional\nmethods usually use self-training with pseudo-labeling, which is often\nsubjected to thresholding based on prediction confidence. However, such\nthresholding limits the effectiveness of self-training due to insufficient\nsupervision. This issue becomes more severe in a source-free setting, where\nsupervision comes solely from the predictions of the pre-trained source model.\nIn this study, we propose a novel approach by incorporating a mean-teacher\nmodel, wherein the student network is trained using all predictions from the\nteacher network. Instead of employing thresholding on predictions, we introduce\na method to weight the gradients calculated from pseudo-labels based on the\nreliability of the teacher's predictions. To assess reliability, we introduce a\nnovel approach using proxy-based metric learning. 
Our method is evaluated in\nsynthetic-to-real and cross-city scenarios, demonstrating superior performance\ncompared to existing state-of-the-art methods.\n","authors":["Ibrahim Batuhan Akkaya","Ugur Halici"],"pdf_url":"https://arxiv.org/pdf/2212.04227v2.pdf","comment":"This paper is under consideration at Computer Vision and Image\n Understanding"},{"id":"http://arxiv.org/abs/2404.06294v1","updated":"2024-04-09T13:19:43Z","published":"2024-04-09T13:19:43Z","title":"Fortifying Fully Convolutional Generative Adversarial Networks for Image\n Super-Resolution Using Divergence Measures","summary":" Super-Resolution (SR) is a time-hallowed image processing problem that aims\nto improve the quality of a Low-Resolution (LR) sample up to the standard of\nits High-Resolution (HR) counterpart. We aim to address this by introducing\nSuper-Resolution Generator (SuRGe), a fully-convolutional Generative\nAdversarial Network (GAN)-based architecture for SR. We show that distinct\nconvolutional features obtained at increasing depths of a GAN generator can be\noptimally combined by a set of learnable convex weights to improve the quality\nof generated SR samples. In the process, we employ the Jensen-Shannon and the\nGromov-Wasserstein losses respectively between the SR-HR and LR-SR pairs of\ndistributions to further aid the generator of SuRGe to better exploit the\navailable information in an attempt to improve SR. Moreover, we train the\ndiscriminator of SuRGe with the Wasserstein loss with gradient penalty, to\nprimarily prevent mode collapse. The proposed SuRGe, as an end-to-end GAN\nworkflow tailor-made for super-resolution, offers improved performance while\nmaintaining low inference time. The efficacy of SuRGe is substantiated by its\nsuperior performance compared to 18 state-of-the-art contenders on 10 benchmark\ndatasets.\n","authors":["Arkaprabha Basu","Kushal Bose","Sankha Subhra Mullick","Anish Chakrabarty","Swagatam Das"],"pdf_url":"https://arxiv.org/pdf/2404.06294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02730v3","updated":"2024-04-09T13:18:22Z","published":"2023-07-06T02:30:56Z","title":"Fine-grained Action Analysis: A Multi-modality and Multi-task Dataset of\n Figure Skating","summary":" The fine-grained action analysis of the existing action datasets is\nchallenged by insufficient action categories, low fine granularities, limited\nmodalities, and tasks. In this paper, we propose a Multi-modality and\nMulti-task dataset of Figure Skating (MMFS) which was collected from the World\nFigure Skating Championships. MMFS, which possesses action recognition and\naction quality assessment, captures RGB, skeleton, and is collected the score\nof actions from 11671 clips with 256 categories including spatial and temporal\nlabels. The key contributions of our dataset fall into three aspects as\nfollows. (1) Independently spatial and temporal categories are first proposed\nto further explore fine-grained action recognition and quality assessment. (2)\nMMFS first introduces the skeleton modality for complex fine-grained action\nquality assessment. (3) Our multi-modality and multi-task dataset encourage\nmore action analysis models. 
To benchmark our dataset, we adopt RGB-based and\nskeleton-based baseline methods for action recognition and action quality\nassessment.\n","authors":["Sheng-Lan Liu","Yu-Ning Ding","Gang Yan","Si-Fan Zhang","Jin-Rong Zhang","Wen-Yue Chen","Xue-Hai Xu"],"pdf_url":"https://arxiv.org/pdf/2307.02730v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06287v1","updated":"2024-04-09T13:13:24Z","published":"2024-04-09T13:13:24Z","title":"Counterfactual Reasoning for Multi-Label Image Classification via\n Patching-Based Training","summary":" The key to multi-label image classification (MLC) is to improve model\nperformance by leveraging label correlations. Unfortunately, it has been shown\nthat overemphasizing co-occurrence relationships can cause the overfitting\nissue of the model, ultimately leading to performance degradation. In this\npaper, we provide a causal inference framework to show that the correlative\nfeatures caused by the target object and its co-occurring objects can be\nregarded as a mediator, which has both positive and negative impacts on model\npredictions. On the positive side, the mediator enhances the recognition\nperformance of the model by capturing co-occurrence relationships; on the\nnegative side, it has the harmful causal effect that causes the model to make\nan incorrect prediction for the target object, even when only co-occurring\nobjects are present in an image. To address this problem, we propose a\ncounterfactual reasoning method to measure the total direct effect, achieved by\nenhancing the direct effect caused only by the target object. Due to the\nunknown location of the target object, we propose patching-based training and\ninference to accomplish this goal, which divides an image into multiple patches\nand identifies the pivot patch that contains the target object. Experimental\nresults on multiple benchmark datasets with diverse configurations validate\nthat the proposed method can achieve state-of-the-art performance.\n","authors":["Ming-Kun Xie","Jia-Hao Xiao","Pei Peng","Gang Niu","Masashi Sugiyama","Sheng-Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2404.06287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06279v1","updated":"2024-04-09T13:02:33Z","published":"2024-04-09T13:02:33Z","title":"NoiseNCA: Noisy Seed Improves Spatio-Temporal Continuity of Neural\n Cellular Automata","summary":" Neural Cellular Automata (NCA) is a class of Cellular Automata where the\nupdate rule is parameterized by a neural network that can be trained using\ngradient descent. In this paper, we focus on NCA models used for texture\nsynthesis, where the update rule is inspired by partial differential equations\n(PDEs) describing reaction-diffusion systems. To train the NCA model, the\nspatio-termporal domain is discretized, and Euler integration is used to\nnumerically simulate the PDE. However, whether a trained NCA truly learns the\ncontinuous dynamic described by the corresponding PDE or merely overfits the\ndiscretization used in training remains an open question. We study NCA models\nat the limit where space-time discretization approaches continuity. We find\nthat existing NCA models tend to overfit the training discretization,\nespecially in the proximity of the initial condition, also called \"seed\". To\naddress this, we propose a solution that utilizes uniform noise as the initial\ncondition. We demonstrate the effectiveness of our approach in preserving the\nconsistency of NCA dynamics across a wide range of spatio-temporal\ngranularities. 
Our improved NCA model enables two new test-time interactions by\nallowing continuous control over the speed of pattern formation and the scale\nof the synthesized patterns. We demonstrate this new NCA feature in our\ninteractive online demo. Our work reveals that NCA models can learn continuous\ndynamics and opens new venues for NCA research from a dynamical systems'\nperspective.\n","authors":["Ehsan Pajouheshgar","Yitao Xu","Sabine Süsstrunk"],"pdf_url":"https://arxiv.org/pdf/2404.06279v1.pdf","comment":"9 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.06277v1","updated":"2024-04-09T13:01:26Z","published":"2024-04-09T13:01:26Z","title":"Learning Embeddings with Centroid Triplet Loss for Object Identification\n in Robotic Grasping","summary":" Foundation models are a strong trend in deep learning and computer vision.\nThese models serve as a base for applications as they require minor or no\nfurther fine-tuning by developers to integrate into their applications.\nFoundation models for zero-shot object segmentation such as Segment Anything\n(SAM) output segmentation masks from images without any further object\ninformation. When they are followed in a pipeline by an object identification\nmodel, they can perform object detection without training. Here, we focus on\ntraining such an object identification model. A crucial practical aspect for an\nobject identification model is to be flexible in input size. As object\nidentification is an image retrieval problem, a suitable method should handle\nmulti-query multi-gallery situations without constraining the number of input\nimages (e.g. by having fixed-size aggregation layers). The key solution to\ntrain such a model is the centroid triplet loss (CTL), which aggregates image\nfeatures to their centroids. CTL yields high accuracy, avoids misleading\ntraining signals and keeps the model input size flexible. In our experiments,\nwe establish a new state of the art on the ArmBench object identification task,\nwhich shows general applicability of our model. We furthermore demonstrate an\nintegrated unseen object detection pipeline on the challenging HOPE dataset,\nwhich requires fine-grained detection. There, our pipeline matches and\nsurpasses related methods which have been trained on dataset-specific data.\n","authors":["Anas Gouda","Max Schwarz","Christopher Reining","Sven Behnke","Alice Kirchheim"],"pdf_url":"https://arxiv.org/pdf/2404.06277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04561v2","updated":"2024-04-09T12:50:16Z","published":"2024-04-06T09:01:19Z","title":"Co-Occ: Coupling Explicit Feature Fusion with Volume Rendering\n Regularization for Multi-Modal 3D Semantic Occupancy Prediction","summary":" 3D semantic occupancy prediction is a pivotal task in the field of autonomous\ndriving. Recent approaches have made great advances in 3D semantic occupancy\npredictions on a single modality. However, multi-modal semantic occupancy\nprediction approaches have encountered difficulties in dealing with the\nmodality heterogeneity, modality misalignment, and insufficient modality\ninteractions that arise during the fusion of different modalities data, which\nmay result in the loss of important geometric and semantic information. This\nletter presents a novel multi-modal, i.e., LiDAR-camera 3D semantic occupancy\nprediction framework, dubbed Co-Occ, which couples explicit LiDAR-camera\nfeature fusion with implicit volume rendering regularization. 
The key insight\nis that volume rendering in the feature space can proficiently bridge the gap\nbetween 3D LiDAR sweeps and 2D images while serving as a physical\nregularization to enhance LiDAR-camera fused volumetric representation.\nSpecifically, we first propose a Geometric- and Semantic-aware Fusion\n(GSFusion) module to explicitly enhance LiDAR features by incorporating\nneighboring camera features through a K-nearest neighbors (KNN) search. Then,\nwe employ volume rendering to project the fused feature back to the image\nplanes for reconstructing color and depth maps. These maps are then supervised\nby input images from the camera and depth estimations derived from LiDAR,\nrespectively. Extensive experiments on the popular nuScenes and SemanticKITTI\nbenchmarks verify the effectiveness of our Co-Occ for 3D semantic occupancy\nprediction. The project page is available at\nhttps://rorisis.github.io/Co-Occ_project-page/.\n","authors":["Jingyi Pan","Zipeng Wang","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.04561v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06273v1","updated":"2024-04-09T12:48:24Z","published":"2024-04-09T12:48:24Z","title":"Robust Confidence Intervals in Stereo Matching using Possibility Theory","summary":" We propose a method for estimating disparity confidence intervals in stereo\nmatching problems. Confidence intervals provide complementary information to\nusual confidence measures. To the best of our knowledge, this is the first\nmethod creating disparity confidence intervals based on the cost volume. This\nmethod relies on possibility distributions to interpret the epistemic\nuncertainty of the cost volume. Our method has the benefit of having a\nwhite-box nature, differing in this respect from current state-of-the-art deep\nneural networks approaches. The accuracy and size of confidence intervals are\nvalidated using the Middlebury stereo datasets as well as a dataset of\nsatellite images. This contribution is freely available on GitHub.\n","authors":["Roman Malinowski","Emmanuelle Sarrazin","Loïc Dumas","Emmanuel Dubois","Sébastien Destercke"],"pdf_url":"https://arxiv.org/pdf/2404.06273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06270v1","updated":"2024-04-09T12:47:30Z","published":"2024-04-09T12:47:30Z","title":"3D Geometry-aware Deformable Gaussian Splatting for Dynamic View\n Synthesis","summary":" In this paper, we propose a 3D geometry-aware deformable Gaussian Splatting\nmethod for dynamic view synthesis. Existing neural radiance fields (NeRF) based\nsolutions learn the deformation in an implicit manner, which cannot incorporate\n3D scene geometry. Therefore, the learned deformation is not necessarily\ngeometrically coherent, which results in unsatisfactory dynamic view synthesis\nand 3D dynamic reconstruction. Recently, 3D Gaussian Splatting provides a new\nrepresentation of the 3D scene, building upon which the 3D geometry could be\nexploited in learning the complex 3D deformation. Specifically, the scenes are\nrepresented as a collection of 3D Gaussian, where each 3D Gaussian is optimized\nto move and rotate over time to model the deformation. To enforce the 3D scene\ngeometry constraint during deformation, we explicitly extract 3D geometry\nfeatures and integrate them in learning the 3D deformation. In this way, our\nsolution achieves 3D geometry-aware deformation modeling, which enables\nimproved dynamic view synthesis and 3D dynamic reconstruction. 
Extensive\nexperimental results on both synthetic and real datasets prove the superiority\nof our solution, which achieves new state-of-the-art performance.\n The project is available at https://npucvr.github.io/GaGS/\n","authors":["Zhicheng Lu","Xiang Guo","Le Hui","Tianrui Chen","Min Yang","Xiao Tang","Feng Zhu","Yuchao Dai"],"pdf_url":"https://arxiv.org/pdf/2404.06270v1.pdf","comment":"Accepted by CVPR 2024. Project page: https://npucvr.github.io/GaGS/"},{"id":"http://arxiv.org/abs/2404.06265v1","updated":"2024-04-09T12:44:34Z","published":"2024-04-09T12:44:34Z","title":"Spatial-Temporal Multi-level Association for Video Object Segmentation","summary":" Existing semi-supervised video object segmentation methods either focus on\ntemporal feature matching or spatial-temporal feature modeling. However, they\ndo not address the issues of sufficient target interaction and efficient\nparallel processing simultaneously, thereby constraining the learning of\ndynamic, target-aware features. To tackle these limitations, this paper\nproposes a spatial-temporal multi-level association framework, which jointly\nassociates reference frame, test frame, and object features to achieve\nsufficient interaction and parallel target ID association with a\nspatial-temporal memory bank for efficient video object segmentation.\nSpecifically, we construct a spatial-temporal multi-level feature association\nmodule to learn better target-aware features, which formulates feature\nextraction and interaction as the efficient operations of object\nself-attention, reference object enhancement, and test reference correlation.\nIn addition, we propose a spatial-temporal memory to assist feature association\nand temporal ID assignment and correlation. We evaluate the proposed method by\nconducting extensive experiments on numerous video object segmentation\ndatasets, including DAVIS 2016/2017 val, DAVIS 2017 test-dev, and YouTube-VOS\n2018/2019 val. The favorable performance against the state-of-the-art methods\ndemonstrates the effectiveness of our approach. All source code and trained\nmodels will be made publicly available.\n","authors":["Deshui Miao","Xin Li","Zhenyu He","Huchuan Lu","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.06265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07166v2","updated":"2024-04-09T12:40:18Z","published":"2023-10-11T03:29:13Z","title":"Anchor-based Multi-view Subspace Clustering with Hierarchical Feature\n Descent","summary":" Multi-view clustering has attracted growing attention owing to its\ncapabilities of aggregating information from various sources and its promising\nhorizons in public affairs. Up till now, many advanced approaches have been\nproposed in recent literature. However, there are several ongoing difficulties\nto be tackled. One common dilemma occurs while attempting to align the features\nof different views. {Moreover, due to the fact that many existing multi-view\nclustering algorithms stem from spectral clustering, this results to cubic time\ncomplexity w.r.t. the number of dataset. However, we propose Anchor-based\nMulti-view Subspace Clustering with Hierarchical Feature Descent(MVSC-HFD) to\ntackle the discrepancy among views through hierarchical feature descent and\nproject to a common subspace( STAGE 1), which reveals dependency of different\nviews. 
We further reduce the computational complexity to linear time cost\nthrough a unified sampling strategy in the common subspace( STAGE 2), followed\nby anchor-based subspace clustering to learn the bipartite graph collectively(\nSTAGE 3). }Extensive experimental results on public benchmark datasets\ndemonstrate that our proposed model consistently outperforms the\nstate-of-the-art techniques.\n","authors":["Qiyuan Ou","Siwei Wang","Pei Zhang","Sihang Zhou","En Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.07166v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06261v1","updated":"2024-04-09T12:34:28Z","published":"2024-04-09T12:34:28Z","title":"Playing to Vision Foundation Model's Strengths in Stereo Matching","summary":" Stereo matching has become a key technique for 3D environment perception in\nintelligent vehicles. For a considerable time, convolutional neural networks\n(CNNs) have remained the mainstream choice for feature extraction in this\ndomain. Nonetheless, there is a growing consensus that the existing paradigm\nshould evolve towards vision foundation models (VFM), particularly those\ndeveloped based on vision Transformers (ViTs) and pre-trained through\nself-supervision on extensive, unlabeled datasets. While VFMs are adept at\nextracting informative, general-purpose visual features, specifically for dense\nprediction tasks, their performance often lacks in geometric vision tasks. This\nstudy serves as the first exploration of a viable approach for adapting VFMs to\nstereo matching. Our ViT adapter, referred to as ViTAS, is constructed upon\nthree types of modules: spatial differentiation, patch attention fusion, and\ncross-attention. The first module initializes feature pyramids, while the\nlatter two aggregate stereo and multi-scale contextual information into\nfine-grained features, respectively. ViTAStereo, which combines ViTAS with cost\nvolume-based stereo matching back-end processes, achieves the top rank on the\nKITTI Stereo 2012 dataset and outperforms the second-best network StereoBase by\napproximately 7.9% in terms of the percentage of error pixels, with a tolerance\nof 3 pixels. Additional experiments across diverse scenarios further\ndemonstrate its superior generalizability compared to all other\nstate-of-the-art approaches. We believe this new paradigm will pave the way for\nthe next generation of stereo matching networks.\n","authors":["Chuang-Wei Liu","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2404.06261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06258v1","updated":"2024-04-09T12:32:10Z","published":"2024-04-09T12:32:10Z","title":"Robust feature knowledge distillation for enhanced performance of\n lightweight crack segmentation models","summary":" Vision-based crack detection faces deployment challenges due to the size of\nrobust models and edge device limitations. These can be addressed with\nlightweight models trained with knowledge distillation (KD). However,\nstate-of-the-art (SOTA) KD methods compromise anti-noise robustness. This paper\ndevelops Robust Feature Knowledge Distillation (RFKD), a framework to improve\nrobustness while retaining the precision of light models for crack\nsegmentation. RFKD distils knowledge from a teacher model's logit layers and\nintermediate feature maps while leveraging mixed clean and noisy images to\ntransfer robust patterns to the student model, improving its precision,\ngeneralisation, and anti-noise performance. 
To validate the proposed RFKD, a\nlightweight crack segmentation model, PoolingCrack Tiny (PCT), with only 0.5 M\nparameters, is also designed and used as the student to run the framework. The\nresults show a significant enhancement in noisy images, with RFKD reaching a\n62% enhanced mean Dice score (mDS) compared to SOTA KD methods.\n","authors":["Zhaohui Chen","Elyas Asadi Shamsabadi","Sheng Jiang","Luming Shen","Daniel Dias-da-Costa"],"pdf_url":"https://arxiv.org/pdf/2404.06258v1.pdf","comment":"24 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.06256v1","updated":"2024-04-09T12:29:16Z","published":"2024-04-09T12:29:16Z","title":"Label-Efficient 3D Object Detection For Road-Side Units","summary":" Occlusion presents a significant challenge for safety-critical applications\nsuch as autonomous driving. Collaborative perception has recently attracted a\nlarge research interest thanks to the ability to enhance the perception of\nautonomous vehicles via deep information fusion with intelligent roadside units\n(RSU), thus minimizing the impact of occlusion. While significant advancement\nhas been made, the data-hungry nature of these methods creates a major hurdle\nfor their real-world deployment, particularly due to the need for annotated RSU\ndata. Manually annotating the vast amount of RSU data required for training is\nprohibitively expensive, given the sheer number of intersections and the effort\ninvolved in annotating point clouds. We address this challenge by devising a\nlabel-efficient object detection method for RSU based on unsupervised object\ndiscovery. Our paper introduces two new modules: one for object discovery based\non a spatial-temporal aggregation of point clouds, and another for refinement.\nFurthermore, we demonstrate that fine-tuning on a small portion of annotated\ndata allows our object discovery models to narrow the performance gap with, or\neven surpass, fully supervised models. Extensive experiments are carried out in\nsimulated and real-world datasets to evaluate our method.\n","authors":["Minh-Quan Dao","Holger Caesar","Julie Stephany Berrio","Mao Shan","Stewart Worrall","Vincent Frémont","Ezio Malis"],"pdf_url":"https://arxiv.org/pdf/2404.06256v1.pdf","comment":"IV 2024"},{"id":"http://arxiv.org/abs/2404.06253v1","updated":"2024-04-09T12:25:06Z","published":"2024-04-09T12:25:06Z","title":"From Barlow Twins to Triplet Training: Differentiating Dementia with\n Limited Data","summary":" Differential diagnosis of dementia is challenging due to overlapping\nsymptoms, with structural magnetic resonance imaging (MRI) being the primary\nmethod for diagnosis. Despite the clinical value of computer-aided differential\ndiagnosis, research has been limited, mainly due to the absence of public\ndatasets that contain diverse types of dementia. This leaves researchers with\nsmall in-house datasets that are insufficient for training deep neural networks\n(DNNs). Self-supervised learning shows promise for utilizing unlabeled MRI\nscans in training, but small batch sizes for volumetric brain scans make its\napplication challenging. To address these issues, we propose Triplet Training\nfor differential diagnosis with limited target data. It consists of three key\nstages: (i) self-supervised pre-training on unlabeled data with Barlow Twins,\n(ii) self-distillation on task-related data, and (iii) fine-tuning on the\ntarget dataset. Our approach significantly outperforms traditional training\nstrategies, achieving a balanced accuracy of 75.6%. 
We further provide insights\ninto the training process by visualizing changes in the latent space after each\nstep. Finally, we validate the robustness of Triplet Training in terms of its\nindividual components in a comprehensive ablation study. Our code is available\nat https://github.com/ai-med/TripletTraining.\n","authors":["Yitong Li","Tom Nuno Wolf","Sebastian Pölsterl","Igor Yakushev","Dennis M. Hedderich","Christian Wachinger"],"pdf_url":"https://arxiv.org/pdf/2404.06253v1.pdf","comment":"Accepted for presentation at MIDL 2024"},{"id":"http://arxiv.org/abs/2404.06251v1","updated":"2024-04-09T12:23:30Z","published":"2024-04-09T12:23:30Z","title":"ColorMNet: A Memory-based Deep Spatial-Temporal Feature Propagation\n Network for Video Colorization","summary":" How to effectively explore spatial-temporal features is important for video\ncolorization. Instead of stacking multiple frames along the temporal dimension\nor recurrently propagating estimated features that will accumulate errors or\ncannot explore information from far-apart frames, we develop a memory-based\nfeature propagation module that can establish reliable connections with\nfeatures from far-apart frames and alleviate the influence of inaccurately\nestimated features. To extract better features from each frame for the\nabove-mentioned feature propagation, we explore the features from\nlarge-pretrained visual models to guide the feature estimation of each frame so\nthat the estimated features can model complex scenarios. In addition, we note\nthat adjacent frames usually contain similar contents. To explore this property\nfor better spatial and temporal feature utilization, we develop a local\nattention module to aggregate the features from adjacent frames in a\nspatial-temporal neighborhood. We formulate our memory-based feature\npropagation module, large-pretrained visual model guided feature estimation\nmodule, and local attention module into an end-to-end trainable network (named\nColorMNet) and show that it performs favorably against state-of-the-art methods\non both the benchmark datasets and real-world scenarios. The source code and\npre-trained models will be available at\n\\url{https://github.com/yyang181/colormnet}.\n","authors":["Yixin Yang","Jiangxin Dong","Jinhui Tang","Jinshan Pan"],"pdf_url":"https://arxiv.org/pdf/2404.06251v1.pdf","comment":"Project website: \\url{https://github.com/yyang181/colormnet}"},{"id":"http://arxiv.org/abs/2404.06247v1","updated":"2024-04-09T12:13:40Z","published":"2024-04-09T12:13:40Z","title":"LRR: Language-Driven Resamplable Continuous Representation against\n Adversarial Tracking Attacks","summary":" Visual object tracking plays a critical role in visual-based autonomous\nsystems, as it aims to estimate the position and size of the object of interest\nwithin a live video. Despite significant progress made in this field,\nstate-of-the-art (SOTA) trackers often fail when faced with adversarial\nperturbations in the incoming frames. This can lead to significant robustness\nand security issues when these trackers are deployed in the real world. To\nachieve high accuracy on both clean and adversarial data, we propose building a\nspatial-temporal continuous representation using the semantic text guidance of\nthe object of interest. This novel continuous representation enables us to\nreconstruct incoming frames to maintain semantic and appearance consistency\nwith the object of interest and its clean counterparts. 
As a result, our\nproposed method successfully defends against different SOTA adversarial\ntracking attacks while maintaining high accuracy on clean data. In particular,\nour method significantly increases tracking accuracy under adversarial attacks\nwith around 90% relative improvement on UAV123, which is even higher than the\naccuracy on clean data.\n","authors":["Jianlang Chen","Xuhong Ren","Qing Guo","Felix Juefei-Xu","Di Lin","Wei Feng","Lei Ma","Jianjun Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.06247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06246v1","updated":"2024-04-09T12:11:25Z","published":"2024-04-09T12:11:25Z","title":"GHNeRF: Learning Generalizable Human Features with Efficient Neural\n Radiance Fields","summary":" Recent advances in Neural Radiance Fields (NeRF) have demonstrated promising\nresults in 3D scene representations, including 3D human representations.\nHowever, these representations often lack crucial information on the underlying\nhuman pose and structure, which is crucial for AR/VR applications and games. In\nthis paper, we introduce a novel approach, termed GHNeRF, designed to address\nthese limitations by learning 2D/3D joint locations of human subjects with NeRF\nrepresentation. GHNeRF uses a pre-trained 2D encoder streamlined to extract\nessential human features from 2D images, which are then incorporated into the\nNeRF framework in order to encode human biomechanic features. This allows our\nnetwork to simultaneously learn biomechanic features, such as joint locations,\nalong with human geometry and texture. To assess the effectiveness of our\nmethod, we conduct a comprehensive comparison with state-of-the-art human NeRF\ntechniques and joint estimation algorithms. Our results show that GHNeRF can\nachieve state-of-the-art results in near real-time.\n","authors":["Arnab Dey","Di Yang","Rohith Agaram","Antitza Dantcheva","Andrew I. Comport","Srinath Sridhar","Jean Martinet"],"pdf_url":"https://arxiv.org/pdf/2404.06246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06244v1","updated":"2024-04-09T12:10:54Z","published":"2024-04-09T12:10:54Z","title":"Anchor-based Robust Finetuning of Vision-Language Models","summary":" We aim at finetuning a vision-language model without hurting its\nout-of-distribution (OOD) generalization. We address two types of OOD\ngeneralization, i.e., i) domain shift such as natural to sketch images, and ii)\nzero-shot capability to recognize the category that was not contained in the\nfinetune data. Arguably, the diminished OOD generalization after finetuning\nstems from the excessively simplified finetuning target, which only provides\nthe class information, such as ``a photo of a [CLASS]''. This is distinct from\nthe process in that CLIP was pretrained, where there is abundant text\nsupervision with rich semantic information. Therefore, we propose to compensate\nfor the finetune process using auxiliary supervision with rich semantic\ninformation, which acts as anchors to preserve the OOD generalization.\nSpecifically, two types of anchors are elaborated in our method, including i)\ntext-compensated anchor which uses the images from the finetune set but\nenriches the text supervision from a pretrained captioner, ii) image-text-pair\nanchor which is retrieved from the dataset similar to pretraining data of CLIP\naccording to the downstream task, associating with the original CLIP text with\nrich semantics. 
Those anchors are utilized as auxiliary semantic information to\nmaintain the original feature space of CLIP, thereby preserving the OOD\ngeneralization capabilities. Comprehensive experiments demonstrate that our\nmethod achieves in-distribution performance akin to conventional finetuning\nwhile attaining new state-of-the-art results on domain shift and zero-shot\nlearning benchmarks.\n","authors":["Jinwei Han","Zhiwen Lin","Zhongyisun Sun","Yingguo Gao","Ke Yan","Shouhong Ding","Yuan Gao","Gui-Song Xia"],"pdf_url":"https://arxiv.org/pdf/2404.06244v1.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.06243v1","updated":"2024-04-09T12:09:56Z","published":"2024-04-09T12:09:56Z","title":"ActNetFormer: Transformer-ResNet Hybrid Method for Semi-Supervised\n Action Recognition in Videos","summary":" Human action or activity recognition in videos is a fundamental task in\ncomputer vision with applications in surveillance and monitoring, self-driving\ncars, sports analytics, human-robot interaction and many more. Traditional\nsupervised methods require large annotated datasets for training, which are\nexpensive and time-consuming to acquire. This work proposes a novel approach\nusing Cross-Architecture Pseudo-Labeling with contrastive learning for\nsemi-supervised action recognition. Our framework leverages both labeled and\nunlabelled data to robustly learn action representations in videos, combining\npseudo-labeling with contrastive learning for effective learning from both\ntypes of samples. We introduce a novel cross-architecture approach where 3D\nConvolutional Neural Networks (3D CNNs) and video transformers (VIT) are\nutilised to capture different aspects of action representations; hence we call\nit ActNetFormer. The 3D CNNs excel at capturing spatial features and local\ndependencies in the temporal domain, while VIT excels at capturing long-range\ndependencies across frames. By integrating these complementary architectures\nwithin the ActNetFormer framework, our approach can effectively capture both\nlocal and global contextual information of an action. This comprehensive\nrepresentation learning enables the model to achieve better performance in\nsemi-supervised action recognition tasks by leveraging the strengths of each of\nthese architectures. Experimental results on standard action recognition\ndatasets demonstrate that our approach performs better than the existing\nmethods, achieving state-of-the-art performance with only a fraction of labeled\ndata. The official website of this work is available at:\nhttps://github.com/rana2149/ActNetFormer.\n","authors":["Sharana Dharshikgan Suresh Dass","Hrishav Bakul Barua","Ganesh Krishnasamy","Raveendran Paramesran","Raphael C. -W. Phan"],"pdf_url":"https://arxiv.org/pdf/2404.06243v1.pdf","comment":"Submitted for peer review"},{"id":"http://arxiv.org/abs/2404.06240v1","updated":"2024-04-09T12:06:21Z","published":"2024-04-09T12:06:21Z","title":"Hyperparameter-Free Medical Image Synthesis for Sharing Data and\n Improving Site-Specific Segmentation","summary":" Sharing synthetic medical images is a promising alternative to sharing real\nimages that can improve patient privacy and data security. To get good results,\nexisting methods for medical image synthesis must be manually adjusted when\nthey are applied to unseen data. To remove this manual burden, we introduce a\nHyperparameter-Free distributed learning method for automatic medical image\nSynthesis, Sharing, and Segmentation called HyFree-S3. 
For three diverse\nsegmentation settings (pelvic MRIs, lung X-rays, polyp photos), the use of\nHyFree-S3 results in improved performance over training only with site-specific\ndata (in the majority of cases). The hyperparameter-free nature of the method\nshould make data synthesis and sharing easier, potentially leading to an\nincrease in the quantity of available data and consequently the quality of the\nmodels trained that may ultimately be applied in the clinic. Our code is\navailable at https://github.com/AwesomeLemon/HyFree-S3\n","authors":["Alexander Chebykin","Peter A. N. Bosman","Tanja Alderliesten"],"pdf_url":"https://arxiv.org/pdf/2404.06240v1.pdf","comment":"Accepted at MIDL 2024"},{"id":"http://arxiv.org/abs/2311.18649v3","updated":"2024-04-09T11:55:20Z","published":"2023-11-30T15:57:34Z","title":"Simple Semantic-Aided Few-Shot Learning","summary":" Learning from a limited amount of data, namely Few-Shot Learning, stands out\nas a challenging computer vision task. Several works exploit semantics and\ndesign complicated semantic fusion mechanisms to compensate for rare\nrepresentative features within restricted data. However, relying on naive\nsemantics such as class names introduces biases due to their brevity, while\nacquiring extensive semantics from external knowledge takes a huge time and\neffort. This limitation severely constrains the potential of semantics in\nFew-Shot Learning. In this paper, we design an automatic way called Semantic\nEvolution to generate high-quality semantics. The incorporation of high-quality\nsemantics alleviates the need for complex network structures and learning\nalgorithms used in previous works. Hence, we employ a simple two-layer network\ntermed Semantic Alignment Network to transform semantics and visual features\ninto robust class prototypes with rich discriminative features for few-shot\nclassification. The experimental results show our framework outperforms all\nprevious methods on six benchmarks, demonstrating a simple network with\nhigh-quality semantics can beat intricate multi-modal modules on few-shot\nclassification tasks. Code is available at\nhttps://github.com/zhangdoudou123/SemFew.\n","authors":["Hai Zhang","Junzhe Xu","Shanlin Jiang","Zhenan He"],"pdf_url":"https://arxiv.org/pdf/2311.18649v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2307.10974v3","updated":"2024-04-09T11:23:10Z","published":"2023-07-20T16:00:19Z","title":"Deep Multi-Threshold Spiking-UNet for Image Processing","summary":" U-Net, known for its simple yet efficient architecture, is widely utilized\nfor image processing tasks and is particularly suitable for deployment on\nneuromorphic chips. This paper introduces the novel concept of Spiking-UNet for\nimage processing, which combines the power of Spiking Neural Networks (SNNs)\nwith the U-Net architecture. To achieve an efficient Spiking-UNet, we face two\nprimary challenges: ensuring high-fidelity information propagation through the\nnetwork via spikes and formulating an effective training strategy. To address\nthe issue of information loss, we introduce multi-threshold spiking neurons,\nwhich improve the efficiency of information transmission within the\nSpiking-UNet. For the training strategy, we adopt a conversion and fine-tuning\npipeline that leverage pre-trained U-Net models. During the conversion process,\nsignificant variability in data distribution across different parts is observed\nwhen utilizing skip connections. 
Therefore, we propose a connection-wise\nnormalization method to prevent inaccurate firing rates. Furthermore, we adopt\na flow-based training method to fine-tune the converted models, reducing time\nsteps while preserving performance. Experimental results show that, on image\nsegmentation and denoising, our Spiking-UNet achieves comparable performance to\nits non-spiking counterpart, surpassing existing SNN methods. Compared with the\nconverted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference\ntime by approximately 90\\%. This research broadens the application scope of\nSNNs in image processing and is expected to inspire further exploration in the\nfield of neuromorphic engineering. The code for our Spiking-UNet implementation\nis available at https://github.com/SNNresearch/Spiking-UNet.\n","authors":["Hebei Li","Yueyi Zhang","Zhiwei Xiong","Zheng-jun Zha","Xiaoyan Sun"],"pdf_url":"https://arxiv.org/pdf/2307.10974v3.pdf","comment":"Accepted in NeuroComputing"},{"id":"http://arxiv.org/abs/2404.06219v1","updated":"2024-04-09T11:13:36Z","published":"2024-04-09T11:13:36Z","title":"Automatic Defect Detection in Sewer Network Using Deep Learning Based\n Object Detector","summary":" Maintaining sewer systems in large cities is important, but also time and\neffort consuming, because visual inspections are currently done manually. To\nreduce the amount of aforementioned manual work, defects within sewer pipes\nshould be located and classified automatically. In the past, multiple works\nhave attempted solving this problem using classical image processing, machine\nlearning, or a combination of those. However, each provided solution only focus\non detecting a limited set of defect/structure types, such as fissure, root,\nand/or connection. Furthermore, due to the use of hand-crafted features and\nsmall training datasets, generalization is also problematic. In order to\novercome these deficits, a sizable dataset with 14.7 km of various sewer pipes\nwere annotated by sewer maintenance experts in the scope of this work. On top\nof that, an object detector (EfficientDet-D0) was trained for automatic defect\ndetection. From the result of several expermients, peculiar natures of defects\nin the context of object detection, which greatly effect annotation and\ntraining process, are found and discussed. At the end, the final detector was\nable to detect 83% of defects in the test set; out of the missing 17%, only\n0.77% are very severe defects. This work provides an example of applying deep\nlearning-based object detection into an important but quiet engineering field.\nIt also gives some practical pointers on how to annotate peculiar \"object\",\nsuch as defects.\n","authors":["Bach Ha","Birgit Schalter","Laura White","Joachim Koehler"],"pdf_url":"https://arxiv.org/pdf/2404.06219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06212v1","updated":"2024-04-09T11:00:19Z","published":"2024-04-09T11:00:19Z","title":"OmniFusion Technical Report","summary":" Last year, multimodal architectures served up a revolution in AI-based\napproaches and solutions, extending the capabilities of large language models\n(LLM). We propose an \\textit{OmniFusion} model based on a pretrained LLM and\nadapters for visual modality. 
We evaluated and compared several architecture\ndesign principles for better text and visual data coupling: MLP and transformer\nadapters, various CLIP ViT-based encoders (SigLIP, InternVIT, etc.), and their\nfusing approach, image encoding method (whole image or tiles encoding) and two\n7B LLMs (the proprietary one and open-source Mistral). Experiments on 8\nvisual-language benchmarks show the top score for the best OmniFusion setup in\nterms of different VQA tasks in comparison with open-source LLaVA-like\nsolutions: VizWiz, Pope, MM-Vet, ScienceQA, MMBench, TextVQA, VQAv2, MMMU. We\nalso propose a variety of situations, where OmniFusion provides highly-detailed\nanswers in different domains: housekeeping, sightseeing, culture, medicine,\nhandwritten and scanned equations recognition, etc. Mistral-based OmniFusion\nmodel is an open-source solution with weights, training and inference scripts\navailable at https://github.com/AIRI-Institute/OmniFusion.\n","authors":["Elizaveta Goncharova","Anton Razzhigaev","Matvey Mikhalchuk","Maxim Kurkin","Irina Abdullaeva","Matvey Skripkin","Ivan Oseledets","Denis Dimitrov","Andrey Kuznetsov"],"pdf_url":"https://arxiv.org/pdf/2404.06212v1.pdf","comment":"17 pages, 4 figures, 9 tables, 2 appendices"},{"id":"http://arxiv.org/abs/2404.06211v1","updated":"2024-04-09T11:00:11Z","published":"2024-04-09T11:00:11Z","title":"Unified Physical-Digital Attack Detection Challenge","summary":" Face Anti-Spoofing (FAS) is crucial to safeguard Face Recognition (FR)\nSystems. In real-world scenarios, FRs are confronted with both physical and\ndigital attacks. However, existing algorithms often address only one type of\nattack at a time, which poses significant limitations in real-world scenarios\nwhere FR systems face hybrid physical-digital threats. To facilitate the\nresearch of Unified Attack Detection (UAD) algorithms, a large-scale\nUniAttackData dataset has been collected. UniAttackData is the largest public\ndataset for Unified Attack Detection, with a total of 28,706 videos, where each\nunique identity encompasses all advanced attack types. Based on this dataset,\nwe organized a Unified Physical-Digital Face Attack Detection Challenge to\nboost the research in Unified Attack Detections. It attracted 136 teams for the\ndevelopment phase, with 13 qualifying for the final round. The results\nre-verified by the organizing team were used for the final ranking. This paper\ncomprehensively reviews the challenge, detailing the dataset introduction,\nprotocol definition, evaluation criteria, and a summary of published results.\nFinally, we focus on the detailed analysis of the highest-performing algorithms\nand offer potential directions for unified physical-digital attack detection\ninspired by this competition. Challenge Website:\nhttps://sites.google.com/view/face-anti-spoofing-challenge/welcome/challengecvpr2024.\n","authors":["Haocheng Yuan","Ajian Liu","Junze Zheng","Jun Wan","Jiankang Deng","Sergio Escalera","Hugo Jair Escalante","Isabelle Guyon","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2404.06211v1.pdf","comment":"11 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.06207v1","updated":"2024-04-09T10:56:46Z","published":"2024-04-09T10:56:46Z","title":"Leveraging edge detection and neural networks for better UAV\n localization","summary":" We propose a novel method for geolocalizing Unmanned Aerial Vehicles (UAVs)\nin environments lacking Global Navigation Satellite Systems (GNSS). 
Current\nstate-of-the-art techniques employ an offline-trained encoder to generate a\nvector representation (embedding) of the UAV's current view, which is then\ncompared with pre-computed embeddings of geo-referenced images to determine the\nUAV's position. Here, we demonstrate that the performance of these methods can\nbe significantly enhanced by preprocessing the images to extract their edges,\nwhich exhibit robustness to seasonal and illumination variations. Furthermore,\nwe establish that utilizing edges enhances resilience to orientation and\naltitude inaccuracies. Additionally, we introduce a confidence criterion for\nlocalization. Our findings are substantiated through synthetic experiments.\n","authors":["Theo Di Piazza","Enric Meinhardt-Llopis","Gabriele Facciolo","Benedicte Bascle","Corentin Abgrall","Jean-Clement Devaux"],"pdf_url":"https://arxiv.org/pdf/2404.06207v1.pdf","comment":"Accepted for publication in IGARSS2024. 4 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.06202v1","updated":"2024-04-09T10:47:43Z","published":"2024-04-09T10:47:43Z","title":"Automated National Urban Map Extraction","summary":" Developing countries usually lack the proper governance means to generate and\nregularly update a national rooftop map. Using traditional photogrammetry and\nsurveying methods to produce a building map at the federal level is costly and\ntime consuming. Using earth observation and deep learning methods, we can\nbridge this gap and propose an automated pipeline to fetch such national urban\nmaps. This paper aims to exploit the power of fully convolutional neural\nnetworks for multi-class buildings' instance segmentation to leverage high\nobject-wise accuracy results. Buildings' instance segmentation from sub-meter\nhigh-resolution satellite images can be achieved with relatively high\npixel-wise metric scores. We detail all engineering steps to replicate this\nwork and ensure highly accurate results in dense and slum areas witnessed in\nregions that lack proper urban planning in the Global South. We applied a case\nstudy of the proposed pipeline to Lebanon and successfully produced the first\ncomprehensive national building footprint map with approximately 1 Million\nunits with an 84% accuracy. The proposed architecture relies on advanced\naugmentation techniques to overcome dataset scarcity, which is often the case\nin developing countries.\n","authors":["Hasan Nasrallah","Abed Ellatif Samhat","Cristiano Nattero","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2404.06202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06194v1","updated":"2024-04-09T10:27:22Z","published":"2024-04-09T10:27:22Z","title":"Exploring the Potential of Large Foundation Models for Open-Vocabulary\n HOI Detection","summary":" Open-vocabulary human-object interaction (HOI) detection, which is concerned\nwith the problem of detecting novel HOIs guided by natural language, is crucial\nfor understanding human-centric scenes. However, prior zero-shot HOI detectors\noften employ the same levels of feature maps to model HOIs with varying\ndistances, leading to suboptimal performance in scenes containing human-object\npairs with a wide range of distances. In addition, these detectors primarily\nrely on category names and overlook the rich contextual information that\nlanguage can provide, which is essential for capturing open vocabulary concepts\nthat are typically rare and not well-represented by category names alone. 
In\nthis paper, we introduce a novel end-to-end open vocabulary HOI detection\nframework with conditional multi-level decoding and fine-grained semantic\nenhancement (CMD-SE), harnessing the potential of Visual-Language Models\n(VLMs). Specifically, we propose to model human-object pairs with different\ndistances with different levels of feature maps by incorporating a soft\nconstraint during the bipartite matching process. Furthermore, by leveraging\nlarge language models (LLMs) such as GPT models, we exploit their extensive\nworld knowledge to generate descriptions of human body part states for various\ninteractions. Then we integrate the generalizable and fine-grained semantics of\nhuman body parts to improve interaction recognition. Experimental results on\ntwo datasets, SWIG-HOI and HICO-DET, demonstrate that our proposed method\nachieves state-of-the-art results in open vocabulary HOI detection. The code\nand models are available at https://github.com/ltttpku/CMD-SE-release.\n","authors":["Ting Lei","Shaofeng Yin","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06181v1","updated":"2024-04-09T10:04:06Z","published":"2024-04-09T10:04:06Z","title":"EPL: Evidential Prototype Learning for Semi-supervised Medical Image\n Segmentation","summary":" Although current semi-supervised medical segmentation methods can achieve\ndecent performance, they are still affected by the uncertainty in unlabeled\ndata and model predictions, and there is currently a lack of effective\nstrategies that can explore the uncertain aspects of both simultaneously. To\naddress the aforementioned issues, we propose Evidential Prototype Learning\n(EPL), which utilizes an extended probabilistic framework to effectively fuse\nvoxel probability predictions from different sources and achieves prototype\nfusion utilization of labeled and unlabeled data under a generalized evidential\nframework, leveraging voxel-level dual uncertainty masking. The uncertainty not\nonly enables the model to self-correct predictions but also improves the guided\nlearning process with pseudo-labels and is able to feed back into the\nconstruction of hidden features. The method proposed in this paper has been\nexperimented on LA, Pancreas-CT and TBAD datasets, achieving the\nstate-of-the-art performance in three different labeled ratios, which strongly\ndemonstrates the effectiveness of our strategy.\n","authors":["Yuanpeng He"],"pdf_url":"https://arxiv.org/pdf/2404.06181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06180v1","updated":"2024-04-09T10:03:44Z","published":"2024-04-09T10:03:44Z","title":"YOLC: You Only Look Clusters for Tiny Object Detection in Aerial Images","summary":" Detecting objects from aerial images poses significant challenges due to the\nfollowing factors: 1) Aerial images typically have very large sizes, generally\nwith millions or even hundreds of millions of pixels, while computational\nresources are limited. 2) Small object size leads to insufficient information\nfor effective detection. 3) Non-uniform object distribution leads to\ncomputational resource wastage. To address these issues, we propose YOLC (You\nOnly Look Clusters), an efficient and effective framework that builds on an\nanchor-free object detector, CenterNet. To overcome the challenges posed by\nlarge-scale images and non-uniform object distribution, we introduce a Local\nScale Module (LSM) that adaptively searches cluster regions for zooming in for\naccurate detection. 
Additionally, we modify the regression loss using Gaussian\nWasserstein distance (GWD) to obtain high-quality bounding boxes. Deformable\nconvolution and refinement methods are employed in the detection head to\nenhance the detection of small objects. We perform extensive experiments on two\naerial image datasets, including Visdrone2019 and UAVDT, to demonstrate the\neffectiveness and superiority of our proposed approach.\n","authors":["Chenguang Liu","Guangshuai Gao","Ziyue Huang","Zhenghui Hu","Qingjie Liu","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06180v1.pdf","comment":"accepted to TITS"},{"id":"http://arxiv.org/abs/2404.06177v1","updated":"2024-04-09T09:58:10Z","published":"2024-04-09T09:58:10Z","title":"Uncertainty-aware Evidential Fusion-based Learning for Semi-supervised\n Medical Image Segmentation","summary":" Although the existing uncertainty-based semi-supervised medical segmentation\nmethods have achieved excellent performance, they usually only consider a\nsingle uncertainty evaluation, which often fails to solve the problem related\nto credibility completely. Therefore, based on the framework of evidential deep\nlearning, this paper integrates the evidential predictive results in the\ncross-region of mixed and original samples to reallocate the confidence degree\nand uncertainty measure of each voxel, which is realized by emphasizing\nuncertain information of probability assignments fusion rule of traditional\nevidence theory. Furthermore, we design a voxel-level asymptotic learning\nstrategy by introducing information entropy to combine with the fused\nuncertainty measure to estimate voxel prediction more precisely. The model will\ngradually pay attention to the prediction results with high uncertainty in the\nlearning process, to learn the features that are difficult to master. The\nexperimental results on LA, Pancreas-CT, ACDC and TBAD datasets demonstrate the\nsuperior performance of our proposed method in comparison with the existing\nstate of the arts.\n","authors":["Yuanpeng He","Lijian Li"],"pdf_url":"https://arxiv.org/pdf/2404.06177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06173v1","updated":"2024-04-09T09:54:21Z","published":"2024-04-09T09:54:21Z","title":"Improving Interpretable Embeddings for Ad-hoc Video Search with\n Generative Captions and Multi-word Concept Bank","summary":" Aligning a user query and video clips in cross-modal latent space and that\nwith semantic concepts are two mainstream approaches for ad-hoc video search\n(AVS). However, the effectiveness of existing approaches is bottlenecked by the\nsmall sizes of available video-text datasets and the low quality of concept\nbanks, which results in the failures of unseen queries and the\nout-of-vocabulary problem. This paper addresses these two problems by\nconstructing a new dataset and developing a multi-word concept bank.\nSpecifically, capitalizing on a generative model, we construct a new dataset\nconsisting of 7 million generated text and video pairs for pre-training. To\ntackle the out-of-vocabulary problem, we develop a multi-word concept bank\nbased on syntax analysis to enhance the capability of a state-of-the-art\ninterpretable AVS method in modeling relationships between query words. We also\nstudy the impact of current advanced features on the method. 
Experimental\nresults show that the integration of the above-proposed elements doubles the\nR@1 performance of the AVS method on the MSRVTT dataset and improves the xinfAP\non the TRECVid AVS query sets for 2016-2023 (eight years) by a margin from 2%\nto 77%, with an average about 20%.\n","authors":["Jiaxin Wu","Chong-Wah Ngo","Wing-Kwong Chan"],"pdf_url":"https://arxiv.org/pdf/2404.06173v1.pdf","comment":"Accepted in ICMR2024"},{"id":"http://arxiv.org/abs/2403.10376v2","updated":"2024-04-09T09:52:54Z","published":"2024-03-15T15:05:29Z","title":"PASTA: Towards Flexible and Efficient HDR Imaging Via Progressively\n Aggregated Spatio-Temporal Alignment","summary":" Leveraging Transformer attention has led to great advancements in HDR\ndeghosting. However, the intricate nature of self-attention introduces\npractical challenges, as existing state-of-the-art methods often demand\nhigh-end GPUs or exhibit slow inference speeds, especially for high-resolution\nimages like 2K. Striking an optimal balance between performance and latency\nremains a critical concern. In response, this work presents PASTA, a novel\nProgressively Aggregated Spatio-Temporal Alignment framework for HDR\ndeghosting. Our approach achieves effectiveness and efficiency by harnessing\nhierarchical representation during feature distanglement. Through the\nutilization of diverse granularities within the hierarchical structure, our\nmethod substantially boosts computational speed and optimizes the HDR imaging\nworkflow. In addition, we explore within-scale feature modeling with local and\nglobal attention, gradually merging and refining them in a coarse-to-fine\nfashion. Experimental results showcase PASTA's superiority over current SOTA\nmethods in both visual quality and performance metrics, accompanied by a\nsubstantial 3-fold (x3) increase in inference speed.\n","authors":["Xiaoning Liu","Ao Li","Zongwei Wu","Yapeng Du","Le Zhang","Yulun Zhang","Radu Timofte","Ce Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.10376v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05393v2","updated":"2024-04-09T09:52:32Z","published":"2024-04-08T10:52:29Z","title":"PAT: Pixel-wise Adaptive Training for Long-tailed Segmentation","summary":" Beyond class frequency, we recognize the impact of class-wise relationships\namong various class-specific predictions and the imbalance in label masks on\nlong-tailed segmentation learning. To address these challenges, we propose an\ninnovative Pixel-wise Adaptive Training (PAT) technique tailored for\nlong-tailed segmentation. PAT has two key features: 1) class-wise gradient\nmagnitude homogenization, and 2) pixel-wise class-specific loss adaptation\n(PCLA). First, the class-wise gradient magnitude homogenization helps alleviate\nthe imbalance among label masks by ensuring equal consideration of the\nclass-wise impact on model updates. Second, PCLA tackles the detrimental impact\nof both rare classes within the long-tailed distribution and inaccurate\npredictions from previous training stages by encouraging learning classes with\nlow prediction confidence and guarding against forgetting classes with high\nconfidence. This combined approach fosters robust learning while preventing the\nmodel from forgetting previously learned knowledge. PAT exhibits significant\nperformance improvements, surpassing the current state-of-the-art by 2.2% in\nthe NyU dataset. 
Moreover, it enhances overall pixel-wise accuracy by 2.85% and\nintersection over union value by 2.07%, with a particularly notable declination\nof 0.39% in detecting rare classes compared to Balance Logits Variation, as\ndemonstrated on the three popular datasets, i.e., OxfordPetIII, CityScape, and\nNYU.\n","authors":["Khoi Do","Duong Nguyen","Nguyen H. Tran","Viet Dung Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05393v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06165v1","updated":"2024-04-09T09:42:18Z","published":"2024-04-09T09:42:18Z","title":"Enhanced Radar Perception via Multi-Task Learning: Towards Refined Data\n for Sensor Fusion Applications","summary":" Radar and camera fusion yields robustness in perception tasks by leveraging\nthe strength of both sensors. The typical extracted radar point cloud is 2D\nwithout height information due to insufficient antennas along the elevation\naxis, which challenges the network performance. This work introduces a\nlearning-based approach to infer the height of radar points associated with 3D\nobjects. A novel robust regression loss is introduced to address the sparse\ntarget challenge. In addition, a multi-task training strategy is employed,\nemphasizing important features. The average radar absolute height error\ndecreases from 1.69 to 0.25 meters compared to the state-of-the-art height\nextension method. The estimated target height values are used to preprocess and\nenrich radar data for downstream perception tasks. Integrating this refined\nradar information further enhances the performance of existing radar camera\nfusion models for object detection and depth estimation tasks.\n","authors":["Huawei Sun","Hao Feng","Gianfranco Mauro","Julius Ott","Georg Stettinger","Lorenzo Servadei","Robert Wille"],"pdf_url":"https://arxiv.org/pdf/2404.06165v1.pdf","comment":"Accepted by IEEE Intelligent Vehicles Symposium (IV 2024)"},{"id":"http://arxiv.org/abs/2404.06155v1","updated":"2024-04-09T09:28:05Z","published":"2024-04-09T09:28:05Z","title":"Efficient and Robust Point Cloud Registration via Heuristics-guided\n Parameter Search","summary":" Estimating the rigid transformation with 6 degrees of freedom based on a\nputative 3D correspondence set is a crucial procedure in point cloud\nregistration. Existing correspondence identification methods usually lead to\nlarge outlier ratios ($>$ 95 $\\%$ is common), underscoring the significance of\nrobust registration methods. Many researchers turn to parameter search-based\nstrategies (e.g., Branch-and-Bround) for robust registration. Although related\nmethods show high robustness, their efficiency is limited to the\nhigh-dimensional search space. This paper proposes a heuristics-guided\nparameter search strategy to accelerate the search while maintaining high\nrobustness. We first sample some correspondences (i.e., heuristics) and then\njust need to sequentially search the feasible regions that make each sample an\ninlier. Our strategy largely reduces the search space and can guarantee\naccuracy with only a few inlier samples, therefore enjoying an excellent\ntrade-off between efficiency and robustness. Since directly parameterizing the\n6-dimensional nonlinear feasible region for efficient search is intractable, we\nconstruct a three-stage decomposition pipeline to reparameterize the feasible\nregion, resulting in three lower-dimensional sub-problems that are easily\nsolvable via our strategy. 
Besides reducing the searching dimension, our\ndecomposition enables the leverage of 1-dimensional interval stabbing at all\nthree stages for searching acceleration. Moreover, we propose a valid sampling\nstrategy to guarantee our sampling effectiveness, and a compatibility\nverification setup to further accelerate our search. Extensive experiments on\nboth simulated and real-world datasets demonstrate that our approach exhibits\ncomparable robustness with state-of-the-art methods while achieving a\nsignificant efficiency boost.\n","authors":["Tianyu Huang","Haoang Li","Liangzu Peng","Yinlong Liu","Yun-Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06155v1.pdf","comment":"21 pages, 16 figures. Accepted to IEEE Transactions on Pattern\n Analysis and Machine Intelligence, 2024"},{"id":"http://arxiv.org/abs/2404.06154v1","updated":"2024-04-09T09:27:54Z","published":"2024-04-09T09:27:54Z","title":"Concise Plane Arrangements for Low-Poly Surface and Volume Modelling","summary":" Plane arrangements are a useful tool for surface and volume modelling.\nHowever, their main drawback is poor scalability. We introduce two key\nnovelties that enable the construction of plane arrangements for complex\nobjects and entire scenes: an ordering scheme for the plane insertion and the\ndirect use of input points during arrangement construction. Both ingredients\nreduce the number of unwanted splits, resulting in improved scalability of the\nconstruction mechanism by up to two orders of magnitude compared to existing\nalgorithms. We further introduce a remeshing and simplification technique that\nallows us to extract low-polygon surface meshes and lightweight convex\ndecompositions of volumes from the arrangement. We show that our approach leads\nto state-of-the-art results for the aforementioned tasks by comparing it to\nlearning-based and traditional approaches on various different datasets. Our\nimplementation is available at https://github.com/raphaelsulzer/compod .\n","authors":["Raphael Sulzer","Florent Lafarge"],"pdf_url":"https://arxiv.org/pdf/2404.06154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06152v1","updated":"2024-04-09T09:23:04Z","published":"2024-04-09T09:23:04Z","title":"HFNeRF: Learning Human Biomechanic Features with Neural Radiance Fields","summary":" In recent advancements in novel view synthesis, generalizable Neural Radiance\nFields (NeRF) based methods applied to human subjects have shown remarkable\nresults in generating novel views from few images. However, this generalization\nability cannot capture the underlying structural features of the skeleton\nshared across all instances. Building upon this, we introduce HFNeRF: a novel\ngeneralizable human feature NeRF aimed at generating human biomechanic features\nusing a pre-trained image encoder. While previous human NeRF methods have shown\npromising results in the generation of photorealistic virtual avatars, such\nmethods lack underlying human structure or biomechanic features such as\nskeleton or joint information that are crucial for downstream applications\nincluding Augmented Reality (AR)/Virtual Reality (VR). HFNeRF leverages 2D\npre-trained foundation models toward learning human features in 3D using neural\nrendering, and then volume rendering towards generating 2D feature maps. We\nevaluate HFNeRF in the skeleton estimation task by predicting heatmaps as\nfeatures. The proposed method is fully differentiable, allowing to successfully\nlearn color, geometry, and human skeleton in a simultaneous manner. 
This paper\npresents preliminary results of HFNeRF, illustrating its potential in\ngenerating realistic virtual avatars with biomechanic features using NeRF.\n","authors":["Arnab Dey","Di Yang","Antitza Dantcheva","Jean Martinet"],"pdf_url":"https://arxiv.org/pdf/2404.06152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10634v2","updated":"2024-04-09T09:18:26Z","published":"2023-12-17T07:33:06Z","title":"Anomaly Score: Evaluating Generative Models and Individual Generated\n Images based on Complexity and Vulnerability","summary":" With the advancement of generative models, the assessment of generated images\nbecomes more and more important. Previous methods measure distances between\nfeatures of reference and generated images from trained vision models. In this\npaper, we conduct an extensive investigation into the relationship between the\nrepresentation space and input space around generated images. We first propose\ntwo measures related to the presence of unnatural elements within images:\ncomplexity, which indicates how non-linear the representation space is, and\nvulnerability, which is related to how easily the extracted feature changes by\nadversarial input changes. Based on these, we introduce a new metric to\nevaluating image-generative models called anomaly score (AS). Moreover, we\npropose AS-i (anomaly score for individual images) that can effectively\nevaluate generated images individually. Experimental results demonstrate the\nvalidity of the proposed approach.\n","authors":["Jaehui Hwang","Junghyuk Lee","Jong-Seok Lee"],"pdf_url":"https://arxiv.org/pdf/2312.10634v2.pdf","comment":"Accepted in CVPR 2024"},{"id":"http://arxiv.org/abs/2404.00915v2","updated":"2024-04-09T09:16:29Z","published":"2024-04-01T04:43:39Z","title":"Scalable 3D Registration via Truncated Entry-wise Absolute Residuals","summary":" Given an input set of $3$D point pairs, the goal of outlier-robust $3$D\nregistration is to compute some rotation and translation that align as many\npoint pairs as possible. This is an important problem in computer vision, for\nwhich many highly accurate approaches have been recently proposed. Despite\ntheir impressive performance, these approaches lack scalability, often\noverflowing the $16$GB of memory of a standard laptop to handle roughly\n$30,000$ point pairs. In this paper, we propose a $3$D registration approach\nthat can process more than ten million ($10^7$) point pairs with over $99\\%$\nrandom outliers. Moreover, our method is efficient, entails low memory costs,\nand maintains high accuracy at the same time. We call our method TEAR, as it\ninvolves minimizing an outlier-robust loss that computes Truncated Entry-wise\nAbsolute Residuals. To minimize this loss, we decompose the original\n$6$-dimensional problem into two subproblems of dimensions $3$ and $2$,\nrespectively, solved in succession to global optimality via a customized\nbranch-and-bound method. While branch-and-bound is often slow and unscalable,\nthis does not apply to TEAR as we propose novel bounding functions that are\ntight and computationally efficient. Experiments on various datasets are\nconducted to validate the scalability and efficiency of our method.\n","authors":["Tianyu Huang","Liangzu Peng","René Vidal","Yun-Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2404.00915v2.pdf","comment":"24 pages, 12 figures. 
Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2403.08801v2","updated":"2024-04-09T09:13:01Z","published":"2024-02-05T12:33:37Z","title":"CoBra: Complementary Branch Fusing Class and Semantic Knowledge for\n Robust Weakly Supervised Semantic Segmentation","summary":" Leveraging semantically precise pseudo masks derived from image-level class\nknowledge for segmentation, namely image-level Weakly Supervised Semantic\nSegmentation (WSSS), still remains challenging. While Class Activation Maps\n(CAMs) using CNNs have steadily been contributing to the success of WSSS, the\nresulting activation maps often narrowly focus on class-specific parts (e.g.,\nonly face of human). On the other hand, recent works based on vision\ntransformers (ViT) have shown promising results based on their self-attention\nmechanism to capture the semantic parts but fail in capturing complete\nclass-specific details (e.g., entire body parts of human but also with a dog\nnearby). In this work, we propose Complementary Branch (CoBra), a novel dual\nbranch framework consisting of two distinct architectures which provide\nvaluable complementary knowledge of class (from CNN) and semantic (from ViT) to\neach branch. In particular, we learn Class-Aware Projection (CAP) for the CNN\nbranch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly\nfuse their complementary knowledge and facilitate a new type of extra\npatch-level supervision. Our model, through CoBra, fuses CNN and ViT's\ncomplementary outputs to create robust pseudo masks that integrate both class\nand semantic information effectively. Extensive experiments qualitatively and\nquantitatively investigate how CNN and ViT complement each other on the PASCAL\nVOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not\nonly the masks generated by our model, but also the segmentation results\nderived from utilizing these masks as pseudo labels.\n","authors":["Woojung Han","Seil Kang","Kyobin Choo","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.08801v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02813v2","updated":"2024-04-09T09:12:58Z","published":"2023-12-05T14:56:55Z","title":"BIVDiff: A Training-Free Framework for General-Purpose Video Synthesis\n via Bridging Image and Video Diffusion Models","summary":" Diffusion models have made tremendous progress in text-driven image and video\ngeneration. Now text-to-image foundation models are widely applied to various\ndownstream image synthesis tasks, such as controllable image generation and\nimage editing, while downstream video synthesis tasks are less explored for\nseveral reasons. First, it requires huge memory and computation overhead to\ntrain a video generation foundation model. Even with video foundation models,\nadditional costly training is still required for downstream video synthesis\ntasks. Second, although some works extend image diffusion models into videos in\na training-free manner, temporal consistency cannot be well preserved. Finally,\nthese adaption methods are specifically designed for one task and fail to\ngeneralize to different tasks. To mitigate these issues, we propose a\ntraining-free general-purpose video synthesis framework, coined as {\\bf\nBIVDiff}, via bridging specific image diffusion models and general\ntext-to-video foundation diffusion models. 
Specifically, we first use a\nspecific image diffusion model (e.g., ControlNet and Instruct Pix2Pix) for\nframe-wise video generation, then perform Mixed Inversion on the generated\nvideo, and finally input the inverted latents into the video diffusion models\n(e.g., VidRD and ZeroScope) for temporal smoothing. This decoupled framework\nenables flexible image model selection for different purposes with strong task\ngeneralization and high efficiency. To validate the effectiveness and general\nuse of BIVDiff, we perform a wide range of video synthesis tasks, including\ncontrollable video generation, video editing, video inpainting, and\noutpainting.\n","authors":["Fengyuan Shi","Jiaxi Gu","Hang Xu","Songcen Xu","Wei Zhang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02813v2.pdf","comment":"Accepted by CVPR 2024. Project page: https://bivdiff.github.io;\n GitHub repository: https://github.com/MCG-NJU/BIVDiff"},{"id":"http://arxiv.org/abs/2404.06139v1","updated":"2024-04-09T09:05:23Z","published":"2024-04-09T09:05:23Z","title":"DiffHarmony: Latent Diffusion Model Meets Image Harmonization","summary":" Image harmonization, which involves adjusting the foreground of a composite\nimage to attain a unified visual consistency with the background, can be\nconceptualized as an image-to-image translation task. Diffusion models have\nrecently promoted the rapid development of image-to-image translation tasks .\nHowever, training diffusion models from scratch is computationally intensive.\nFine-tuning pre-trained latent diffusion models entails dealing with the\nreconstruction error induced by the image compression autoencoder, making it\nunsuitable for image generation tasks that involve pixel-level evaluation\nmetrics. To deal with these issues, in this paper, we first adapt a pre-trained\nlatent diffusion model to the image harmonization task to generate the\nharmonious but potentially blurry initial images. Then we implement two\nstrategies: utilizing higher-resolution images during inference and\nincorporating an additional refinement stage, to further enhance the clarity of\nthe initially harmonized images. Extensive experiments on iHarmony4 datasets\ndemonstrate the superiority of our proposed method. The code and model will be\nmade publicly available at https://github.com/nicecv/DiffHarmony .\n","authors":["Pengfei Zhou","Fangxiang Feng","Xiaojie Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06139v1.pdf","comment":"Accepted by ICMR 2024"},{"id":"http://arxiv.org/abs/2404.06135v1","updated":"2024-04-09T09:02:21Z","published":"2024-04-09T09:02:21Z","title":"Mansformer: Efficient Transformer of Mixed Attention for Image\n Deblurring and Beyond","summary":" Transformer has made an enormous success in natural language processing and\nhigh-level vision over the past few years. However, the complexity of\nself-attention is quadratic to the image size, which makes it infeasible for\nhigh-resolution vision tasks. In this paper, we propose the Mansformer, a\nTransformer of mixed attention that combines multiple self-attentions, gate,\nand multi-layer perceptions (MLPs), to explore and employ more possibilities of\nself-attention. Taking efficiency into account, we design four kinds of\nself-attention, whose complexities are all linear. By elaborate adjustment of\nthe tensor shapes and dimensions for the dot product, we split the typical\nself-attention of quadratic complexity into four operations of linear\ncomplexity. 
To adaptively merge these different kinds of self-attention, we\ntake advantage of an architecture similar to Squeeze-and-Excitation Networks.\nFurthermore, we make it to merge the two-staged Transformer design into one\nstage by the proposed gated-dconv MLP. Image deblurring is our main target,\nwhile extensive quantitative and qualitative evaluations show that this method\nperforms favorably against the state-of-the-art methods far more than simply\ndeblurring. The source codes and trained models will be made available to the\npublic.\n","authors":["Pin-Hung Kuo","Jinshan Pan","Shao-Yi Chien","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.06135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06128v1","updated":"2024-04-09T08:51:44Z","published":"2024-04-09T08:51:44Z","title":"Gaussian Pancakes: Geometrically-Regularized 3D Gaussian Splatting for\n Realistic Endoscopic Reconstruction","summary":" Within colorectal cancer diagnostics, conventional colonoscopy techniques\nface critical limitations, including a limited field of view and a lack of\ndepth information, which can impede the detection of precancerous lesions.\nCurrent methods struggle to provide comprehensive and accurate 3D\nreconstructions of the colonic surface which can help minimize the missing\nregions and reinspection for pre-cancerous polyps. Addressing this, we\nintroduce 'Gaussian Pancakes', a method that leverages 3D Gaussian Splatting\n(3D GS) combined with a Recurrent Neural Network-based Simultaneous\nLocalization and Mapping (RNNSLAM) system. By introducing geometric and depth\nregularization into the 3D GS framework, our approach ensures more accurate\nalignment of Gaussians with the colon surface, resulting in smoother 3D\nreconstructions with novel viewing of detailed textures and structures.\nEvaluations across three diverse datasets show that Gaussian Pancakes enhances\nnovel view synthesis quality, surpassing current leading methods with a 18%\nboost in PSNR and a 16% improvement in SSIM. It also delivers over 100X faster\nrendering and more than 10X shorter training times, making it a practical tool\nfor real-time applications. Hence, this holds promise for achieving clinical\ntranslation for better detection and diagnosis of colorectal cancer.\n","authors":["Sierra Bonilla","Shuai Zhang","Dimitrios Psychogyios","Danail Stoyanov","Francisco Vasconcelos","Sophia Bano"],"pdf_url":"https://arxiv.org/pdf/2404.06128v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.06124v1","updated":"2024-04-09T08:49:01Z","published":"2024-04-09T08:49:01Z","title":"Hierarchical Insights: Exploiting Structural Similarities for Reliable\n 3D Semantic Segmentation","summary":" Safety-critical applications like autonomous driving call for robust 3D\nenvironment perception algorithms which can withstand highly diverse and\nambiguous surroundings. The predictive performance of any classification model\nstrongly depends on the underlying dataset and the prior knowledge conveyed by\nthe annotated labels. While the labels provide a basis for the learning\nprocess, they usually fail to represent inherent relations between the classes\n- representations, which are a natural element of the human perception system.\nWe propose a training strategy which enables a 3D LiDAR semantic segmentation\nmodel to learn structural relationships between the different classes through\nabstraction. 
We achieve this by implicitly modeling those relationships through\na learning rule for hierarchical multi-label classification (HMC). With a\ndetailed analysis we show, how this training strategy not only improves the\nmodel's confidence calibration, but also preserves additional information for\ndownstream tasks like fusion, prediction and planning.\n","authors":["Mariella Dreissig","Florian Piewak","Joschka Boedecker"],"pdf_url":"https://arxiv.org/pdf/2404.06124v1.pdf","comment":"submitted to IROS 2024"},{"id":"http://arxiv.org/abs/2404.06119v1","updated":"2024-04-09T08:41:13Z","published":"2024-04-09T08:41:13Z","title":"DreamView: Injecting View-specific Text Guidance into Text-to-3D\n Generation","summary":" Text-to-3D generation, which synthesizes 3D assets according to an overall\ntext description, has significantly progressed. However, a challenge arises\nwhen the specific appearances need customizing at designated viewpoints but\nreferring solely to the overall description for generating 3D objects. For\ninstance, ambiguity easily occurs when producing a T-shirt with distinct\npatterns on its front and back using a single overall text guidance. In this\nwork, we propose DreamView, a text-to-image approach enabling multi-view\ncustomization while maintaining overall consistency by adaptively injecting the\nview-specific and overall text guidance through a collaborative text guidance\ninjection module, which can also be lifted to 3D generation via score\ndistillation sampling. DreamView is trained with large-scale rendered\nmulti-view images and their corresponding view-specific texts to learn to\nbalance the separate content manipulation in each view and the global\nconsistency of the overall object, resulting in a dual achievement of\ncustomization and consistency. Consequently, DreamView empowers artists to\ndesign 3D objects creatively, fostering the creation of more innovative and\ndiverse 3D assets. Code and model will be released at\nhttps://github.com/iSEE-Laboratory/DreamView.\n","authors":["Junkai Yan","Yipeng Gao","Qize Yang","Xihan Wei","Xuansong Xie","Ancong Wu","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.06119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06109v1","updated":"2024-04-09T08:20:37Z","published":"2024-04-09T08:20:37Z","title":"Revising Densification in Gaussian Splatting","summary":" In this paper, we address the limitations of Adaptive Density Control (ADC)\nin 3D Gaussian Splatting (3DGS), a scene representation method achieving\nhigh-quality, photorealistic results for novel view synthesis. ADC has been\nintroduced for automatic 3D point primitive management, controlling\ndensification and pruning, however, with certain limitations in the\ndensification logic. Our main contribution is a more principled, pixel-error\ndriven formulation for density control in 3DGS, leveraging an auxiliary,\nper-pixel error function as the criterion for densification. We further\nintroduce a mechanism to control the total number of primitives generated per\nscene and correct a bias in the current opacity handling strategy of ADC during\ncloning operations. 
Our approach leads to consistent quality improvements\nacross a variety of benchmark scenes, without sacrificing the method's\nefficiency.\n","authors":["Samuel Rota Bulò","Lorenzo Porzi","Peter Kontschieder"],"pdf_url":"https://arxiv.org/pdf/2404.06109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04617v2","updated":"2024-04-09T08:20:08Z","published":"2024-04-06T12:50:08Z","title":"Empowering Image Recovery_ A Multi-Attention Approach","summary":" We propose Diverse Restormer (DART), a novel image restoration method that\neffectively integrates information from various sources (long sequences, local\nand global regions, feature dimensions, and positional dimensions) to address\nrestoration challenges. While Transformer models have demonstrated excellent\nperformance in image restoration due to their self-attention mechanism, they\nface limitations in complex scenarios. Leveraging recent advancements in\nTransformers and various attention mechanisms, our method utilizes customized\nattention mechanisms to enhance overall performance. DART, our novel network\narchitecture, employs windowed attention to mimic the selective focusing\nmechanism of human eyes. By dynamically adjusting receptive fields, it\noptimally captures the fundamental features crucial for image resolution\nreconstruction. Efficiency and performance balance are achieved through the\nLongIR attention mechanism for long sequence image restoration. Integration of\nattention mechanisms across feature and positional dimensions further enhances\nthe recovery of fine details. Evaluation across five restoration tasks\nconsistently positions DART at the forefront. Upon acceptance, we commit to\nproviding publicly accessible code and models to ensure reproducibility and\nfacilitate further research.\n","authors":["Juan Wen","Yawei Li","Chao Zhang","Weiyan Hou","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2404.04617v2.pdf","comment":"12 pages, 10 figures, 12 tables"},{"id":"http://arxiv.org/abs/2401.13961v2","updated":"2024-04-09T08:07:48Z","published":"2024-01-25T05:50:48Z","title":"TriSAM: Tri-Plane SAM for zero-shot cortical blood vessel segmentation\n in VEM images","summary":" While imaging techniques at macro and mesoscales have garnered substantial\nattention and resources, microscale VEM imaging, capable of revealing intricate\nvascular details, has lacked the necessary benchmarking infrastructure. In this\npaper, we address a significant gap in the field of neuroimaging by introducing\nthe largest-to-date public benchmark, \\textbf{BvEM}, designed specifically for\ncortical blood vessel segmentation in volume electron microscopy (VEM) images.\nOur BvEM benchmark is based on VEM image volumes from three mammal species:\nadult mouse, macaque, and human. We standardized the resolution, addressed\nimaging variations, and meticulously annotated blood vessels through\nsemi-automatic, manual, and quality control processes, ensuring high-quality 3D\nsegmentation. Furthermore, we developed a zero-shot cortical blood vessel\nsegmentation method named TriSAM, which leverages the powerful segmentation\nmodel SAM for 3D segmentation. To extend SAM from 2D to 3D volume segmentation,\nTriSAM employs a multi-seed tracking framework, leveraging the reliability of\ncertain image planes for tracking while using others to identify potential\nturning points. This approach effectively achieves long-term 3D blood vessel\nsegmentation without model training or fine-tuning. 
Experimental results show\nthat TriSAM achieved superior performances on the BvEM benchmark across three\nspecies.\n","authors":["Jia Wan","Wanhua Li","Jason Ken Adhinarta","Atmadeep Banerjee","Evelina Sjostedt","Jingpeng Wu","Jeff Lichtman","Hanspeter Pfister","Donglai Wei"],"pdf_url":"https://arxiv.org/pdf/2401.13961v2.pdf","comment":"BvEM-Mouse can be visualized at: https://tinyurl.com/yc2s38x9"},{"id":"http://arxiv.org/abs/2403.13358v2","updated":"2024-04-09T07:55:41Z","published":"2024-03-20T07:36:43Z","title":"GeRM: A Generalist Robotic Model with Mixture-of-experts for Quadruped\n Robot","summary":" Multi-task robot learning holds significant importance in tackling diverse\nand complex scenarios. However, current approaches are hindered by performance\nissues and difficulties in collecting training datasets. In this paper, we\npropose GeRM (Generalist Robotic Model). We utilize offline reinforcement\nlearning to optimize data utilization strategies to learn from both\ndemonstrations and sub-optimal data, thus surpassing the limitations of human\ndemonstrations. Thereafter, we employ a transformer-based VLA network to\nprocess multi-modal inputs and output actions. By introducing the\nMixture-of-Experts structure, GeRM allows faster inference speed with higher\nwhole model capacity, and thus resolves the issue of limited RL parameters,\nenhancing model performance in multi-task learning while controlling\ncomputational costs. Through a series of experiments, we demonstrate that GeRM\noutperforms other methods across all tasks, while also validating its\nefficiency in both training and inference processes. Additionally, we uncover\nits potential to acquire emergent skills. Additionally, we contribute the\nQUARD-Auto dataset, collected automatically to support our training approach\nand foster advancements in multi-task quadruped robot learning. This work\npresents a new paradigm for reducing the cost of collecting robot data and\ndriving progress in the multi-task learning community. You can reach our\nproject and video through the link: https://songwxuan.github.io/GeRM/ .\n","authors":["Wenxuan Song","Han Zhao","Pengxiang Ding","Can Cui","Shangke Lyu","Yaning Fan","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.13358v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05970v3","updated":"2024-04-09T07:49:55Z","published":"2023-03-10T15:01:51Z","title":"Exploring Recurrent Long-term Temporal Fusion for Multi-view 3D\n Perception","summary":" Long-term temporal fusion is a crucial but often overlooked technique in\ncamera-based Bird's-Eye-View (BEV) 3D perception. Existing methods are mostly\nin a parallel manner. While parallel fusion can benefit from long-term\ninformation, it suffers from increasing computational and memory overheads as\nthe fusion window size grows. Alternatively, BEVFormer adopts a recurrent\nfusion pipeline so that history information can be efficiently integrated, yet\nit fails to benefit from longer temporal frames. In this paper, we explore an\nembarrassingly simple long-term recurrent fusion strategy built upon the\nLSS-based methods and find it already able to enjoy the merits from both sides,\ni.e., rich long-term information and efficient fusion pipeline. A temporal\nembedding module is further proposed to improve the model's robustness against\noccasionally missed frames in practical scenarios. We name this simple but\neffective fusing pipeline VideoBEV. 
Experimental results on the nuScenes\nbenchmark show that VideoBEV obtains strong performance on various camera-based\n3D perception tasks, including object detection (55.4\\% mAP and 62.9\\% NDS),\nsegmentation (48.6\\% vehicle mIoU), tracking (54.8\\% AMOTA), and motion\nprediction (0.80m minADE and 0.463 EPA).\n","authors":["Chunrui Han","Jinrong Yang","Jianjian Sun","Zheng Ge","Runpei Dong","Hongyu Zhou","Weixin Mao","Yuang Peng","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.05970v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06091v1","updated":"2024-04-09T07:49:30Z","published":"2024-04-09T07:49:30Z","title":"Hash3D: Training-free Acceleration for 3D Generation","summary":" The evolution of 3D generative modeling has been notably propelled by the\nadoption of 2D diffusion models. Despite this progress, the cumbersome\noptimization process per se presents a critical hurdle to efficiency. In this\npaper, we introduce Hash3D, a universal acceleration for 3D generation without\nmodel training. Central to Hash3D is the insight that feature-map redundancy is\nprevalent in images rendered from camera positions and diffusion time-steps in\nclose proximity. By effectively hashing and reusing these feature maps across\nneighboring timesteps and camera angles, Hash3D substantially prevents\nredundant calculations, thus accelerating the diffusion model's inference in 3D\ngeneration tasks. We achieve this through an adaptive grid-based hashing.\nSurprisingly, this feature-sharing mechanism not only speed up the generation\nbut also enhances the smoothness and view consistency of the synthesized 3D\nobjects. Our experiments covering 5 text-to-3D and 3 image-to-3D models,\ndemonstrate Hash3D's versatility to speed up optimization, enhancing efficiency\nby 1.3 to 4 times. Additionally, Hash3D's integration with 3D Gaussian\nsplatting largely speeds up 3D model creation, reducing text-to-3D processing\nto about 10 minutes and image-to-3D conversion to roughly 30 seconds. The\nproject page is at https://adamdad.github.io/hash3D/.\n","authors":["Xingyi Yang","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06091v1.pdf","comment":"https://adamdad.github.io/hash3D/"},{"id":"http://arxiv.org/abs/2311.17002v3","updated":"2024-04-09T07:46:43Z","published":"2023-11-28T17:57:44Z","title":"Ranni: Taming Text-to-Image Diffusion for Accurate Instruction Following","summary":" Existing text-to-image (T2I) diffusion models usually struggle in\ninterpreting complex prompts, especially those with quantity, object-attribute\nbinding, and multi-subject descriptions. In this work, we introduce a semantic\npanel as the middleware in decoding texts to images, supporting the generator\nto better follow instructions. The panel is obtained through arranging the\nvisual concepts parsed from the input text by the aid of large language models,\nand then injected into the denoising network as a detailed control signal to\ncomplement the text condition. To facilitate text-to-panel learning, we come up\nwith a carefully designed semantic formatting protocol, accompanied by a\nfully-automatic data preparation pipeline. Thanks to such a design, our\napproach, which we call Ranni, manages to enhance a pre-trained T2I generator\nregarding its textual controllability. 
More importantly, the introduction of\nthe generative middleware brings a more convenient form of interaction (i.e.,\ndirectly adjusting the elements in the panel or using language instructions)\nand further allows users to finely customize their generation, based on which\nwe develop a practical system and showcase its potential in continuous\ngeneration and chatting-based editing. Our project page is at\nhttps://ranni-t2i.github.io/Ranni.\n","authors":["Yutong Feng","Biao Gong","Di Chen","Yujun Shen","Yu Liu","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.17002v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05559v2","updated":"2024-04-09T07:43:29Z","published":"2024-04-08T14:30:42Z","title":"TIM: A Time Interval Machine for Audio-Visual Action Recognition","summary":" Diverse actions give rise to rich audio-visual signals in long videos. Recent\nworks showcase that the two modalities of audio and video exhibit different\ntemporal extents of events and distinct labels. We address the interplay\nbetween the two modalities in long videos by explicitly modelling the temporal\nextents of audio and visual events. We propose the Time Interval Machine (TIM)\nwhere a modality-specific time interval poses as a query to a transformer\nencoder that ingests a long video input. The encoder then attends to the\nspecified interval, as well as the surrounding context in both modalities, in\norder to recognise the ongoing action.\n We test TIM on three long audio-visual video datasets: EPIC-KITCHENS,\nPerception Test, and AVE, reporting state-of-the-art (SOTA) for recognition. On\nEPIC-KITCHENS, we beat previous SOTA that utilises LLMs and significantly\nlarger pre-training by 2.9% top-1 action recognition accuracy. Additionally, we\nshow that TIM can be adapted for action detection, using dense multi-scale\ninterval queries, outperforming SOTA on EPIC-KITCHENS-100 for most metrics, and\nshowing strong performance on the Perception Test. Our ablations show the\ncritical role of integrating the two modalities and modelling their time\nintervals in achieving this performance. Code and models at:\nhttps://github.com/JacobChalk/TIM\n","authors":["Jacob Chalk","Jaesung Huh","Evangelos Kazakos","Andrew Zisserman","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2404.05559v2.pdf","comment":"Accepted to CVPR 2024. Project Webpage:\n https://jacobchalk.github.io/TIM-Project"},{"id":"http://arxiv.org/abs/2404.06080v1","updated":"2024-04-09T07:39:21Z","published":"2024-04-09T07:39:21Z","title":"Using Few-Shot Learning to Classify Primary Lung Cancer and Other\n Malignancy with Lung Metastasis in Cytological Imaging via Endobronchial\n Ultrasound Procedures","summary":" This study aims to establish a computer-aided diagnosis system for\nendobronchial ultrasound (EBUS) surgery to assist physicians in the preliminary\ndiagnosis of metastatic cancer. This involves arranging immediate examinations\nfor other sites of metastatic cancer after EBUS surgery, eliminating the need\nto wait for reports, thereby shortening the waiting time by more than half and\nenabling patients to detect other cancers earlier, allowing for early planning\nand implementation of treatment plans. Unlike previous studies on cell image\nclassification, which have abundant datasets for training, this study must also\nbe able to make effective classifications despite the limited amount of case\ndata for lung metastatic cancer. 
In the realm of small data set classification\nmethods, Few-shot learning (FSL) has become mainstream in recent years. Through\nits ability to train on small datasets and its strong generalization\ncapabilities, FSL shows potential in this task of lung metastatic cell image\nclassification. This study will adopt the approach of Few-shot learning,\nreferencing existing proposed models, and designing a model architecture for\nclassifying lung metastases cell images. Batch Spectral Regularization (BSR)\nwill be incorporated as a loss update parameter, and the Finetune method of PMF\nwill be modified. In terms of test results, the addition of BSR and the\nmodified Finetune method further increases the accuracy by 8.89% to 65.60%,\noutperforming other FSL methods. This study confirms that FSL is superior to\nsupervised and transfer learning in classifying metastatic cancer and\ndemonstrates that using BSR as a loss function and modifying Finetune can\nenhance the model's capabilities.\n","authors":["Ching-Kai Lin","Di-Chun Wei","Yun-Chien Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.06080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07937v4","updated":"2024-04-09T07:31:25Z","published":"2023-12-13T07:30:19Z","title":"BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics","summary":" The recently emerging text-to-motion advances have spired numerous attempts\nfor convenient and interactive human motion generation. Yet, existing methods\nare largely limited to generating body motions only without considering the\nrich two-hand motions, let alone handling various conditions like body dynamics\nor texts. To break the data bottleneck, we propose BOTH57M, a novel multi-modal\ndataset for two-hand motion generation. Our dataset includes accurate motion\ntracking for the human body and hands and provides pair-wised finger-level hand\nannotations and body descriptions. We further provide a strong baseline method,\nBOTH2Hands, for the novel task: generating vivid two-hand motions from both\nimplicit body dynamics and explicit text prompts. We first warm up two parallel\nbody-to-hand and text-to-hand diffusion models and then utilize the\ncross-attention transformer for motion blending. Extensive experiments and\ncross-validations demonstrate the effectiveness of our approach and dataset for\ngenerating convincing two-hand motions from the hybrid body-and-textual\nconditions. Our dataset and code will be disseminated to the community for\nfuture research.\n","authors":["Wenqian Zhang","Molin Huang","Yuxuan Zhou","Juze Zhang","Jingyi Yu","Jingya Wang","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.07937v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06075v1","updated":"2024-04-09T07:25:30Z","published":"2024-04-09T07:25:30Z","title":"LIPT: Latency-aware Image Processing Transformer","summary":" Transformer is leading a trend in the field of image processing. Despite the\ngreat success that existing lightweight image processing transformers have\nachieved, they are tailored to FLOPs or parameters reduction, rather than\npractical inference acceleration. In this paper, we present a latency-aware\nimage processing transformer, termed LIPT. We devise the low-latency proportion\nLIPT block that substitutes memory-intensive operators with the combination of\nself-attention and convolutions to achieve practical speedup. 
Specifically, we\npropose a novel non-volatile sparse masking self-attention (NVSM-SA) that\nutilizes a pre-computing sparse mask to capture contextual information from a\nlarger window with no extra computation overload. Besides, a high-frequency\nreparameterization module (HRM) is proposed to make LIPT block\nreparameterization friendly, which improves the model's detail reconstruction\ncapability. Extensive experiments on multiple image processing tasks (e.g.,\nimage super-resolution (SR), JPEG artifact reduction, and image denoising)\ndemonstrate the superiority of LIPT on both latency and PSNR. LIPT achieves\nreal-time GPU inference with state-of-the-art performance on multiple image SR\nbenchmarks.\n","authors":["Junbo Qiao","Wei Li","Haizhen Xie","Hanting Chen","Yunshuai Zhou","Zhijun Tu","Jie Hu","Shaohui Lin"],"pdf_url":"https://arxiv.org/pdf/2404.06075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03892v2","updated":"2024-04-09T07:21:32Z","published":"2024-04-05T05:00:21Z","title":"Enhancing Breast Cancer Diagnosis in Mammography: Evaluation and\n Integration of Convolutional Neural Networks and Explainable AI","summary":" The study introduces an integrated framework combining Convolutional Neural\nNetworks (CNNs) and Explainable Artificial Intelligence (XAI) for the enhanced\ndiagnosis of breast cancer using the CBIS-DDSM dataset. Utilizing a fine-tuned\nResNet50 architecture, our investigation not only provides effective\ndifferentiation of mammographic images into benign and malignant categories but\nalso addresses the opaque \"black-box\" nature of deep learning models by\nemploying XAI methodologies, namely Grad-CAM, LIME, and SHAP, to interpret CNN\ndecision-making processes for healthcare professionals. Our methodology\nencompasses an elaborate data preprocessing pipeline and advanced data\naugmentation techniques to counteract dataset limitations, and transfer\nlearning using pre-trained networks, such as VGG-16, DenseNet and ResNet was\nemployed. A focal point of our study is the evaluation of XAI's effectiveness\nin interpreting model predictions, highlighted by utilising the Hausdorff\nmeasure to assess the alignment between AI-generated explanations and expert\nannotations quantitatively. This approach plays a critical role for XAI in\npromoting trustworthiness and ethical fairness in AI-assisted diagnostics. The\nfindings from our research illustrate the effective collaboration between CNNs\nand XAI in advancing diagnostic methods for breast cancer, thereby facilitating\na more seamless integration of advanced AI technologies within clinical\nsettings. By enhancing the interpretability of AI-driven decisions, this work\nlays the groundwork for improved collaboration between AI systems and medical\npractitioners, ultimately enriching patient care. Furthermore, the implications\nof our research extend well beyond the current methodologies, advocating for\nsubsequent inquiries into the integration of multimodal data and the refinement\nof AI explanations to satisfy the needs of clinical practice.\n","authors":["Maryam Ahmed","Tooba Bibi","Rizwan Ahmed Khan","Sidra Nasir"],"pdf_url":"https://arxiv.org/pdf/2404.03892v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18201v2","updated":"2024-04-09T07:18:41Z","published":"2024-02-28T09:46:56Z","title":"Learning Invariant Inter-pixel Correlations for Superpixel Generation","summary":" Deep superpixel algorithms have made remarkable strides by substituting\nhand-crafted features with learnable ones. 
Nevertheless, we observe that\nexisting deep superpixel methods, serving as mid-level representation\noperations, remain sensitive to the statistical properties (e.g., color\ndistribution, high-level semantics) embedded within the training dataset.\nConsequently, learnable features exhibit constrained discriminative capability,\nresulting in unsatisfactory pixel grouping performance, particularly in\nuntrainable application scenarios. To address this issue, we propose the\nContent Disentangle Superpixel (CDS) algorithm to selectively separate the\ninvariant inter-pixel correlations and statistical properties, i.e., style\nnoise. Specifically, We first construct auxiliary modalities that are\nhomologous to the original RGB image but have substantial stylistic variations.\nThen, driven by mutual information, we propose the local-grid correlation\nalignment across modalities to reduce the distribution discrepancy of\nadaptively selected features and learn invariant inter-pixel correlations.\nAfterwards, we perform global-style mutual information minimization to enforce\nthe separation of invariant content and train data styles. The experimental\nresults on four benchmark datasets demonstrate the superiority of our approach\nto existing state-of-the-art methods, regarding boundary adherence,\ngeneralization, and efficiency. Code and pre-trained model are available at\nhttps://github.com/rookiie/CDSpixel.\n","authors":["Sen Xu","Shikui Wei","Tao Ruan","Lixin Liao"],"pdf_url":"https://arxiv.org/pdf/2402.18201v2.pdf","comment":"Accepted by AAAI24"},{"id":"http://arxiv.org/abs/2404.06065v1","updated":"2024-04-09T07:08:00Z","published":"2024-04-09T07:08:00Z","title":"Unified Entropy Optimization for Open-Set Test-Time Adaptation","summary":" Test-time adaptation (TTA) aims at adapting a model pre-trained on the\nlabeled source domain to the unlabeled target domain. Existing methods usually\nfocus on improving TTA performance under covariate shifts, while neglecting\nsemantic shifts. In this paper, we delve into a realistic open-set TTA setting\nwhere the target domain may contain samples from unknown classes. Many\nstate-of-the-art closed-set TTA methods perform poorly when applied to open-set\nscenarios, which can be attributed to the inaccurate estimation of data\ndistribution and model confidence. To address these issues, we propose a simple\nbut effective framework called unified entropy optimization (UniEnt), which is\ncapable of simultaneously adapting to covariate-shifted in-distribution (csID)\ndata and detecting covariate-shifted out-of-distribution (csOOD) data.\nSpecifically, UniEnt first mines pseudo-csID and pseudo-csOOD samples from test\ndata, followed by entropy minimization on the pseudo-csID data and entropy\nmaximization on the pseudo-csOOD data. Furthermore, we introduce UniEnt+ to\nalleviate the noise caused by hard data partition leveraging sample-level\nconfidence. Extensive experiments on CIFAR benchmarks and Tiny-ImageNet-C show\nthe superiority of our framework. 
The code is available at\nhttps://github.com/gaozhengqing/UniEnt\n","authors":["Zhengqing Gao","Xu-Yao Zhang","Cheng-Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06065v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.04580v2","updated":"2024-04-09T06:56:02Z","published":"2024-04-06T10:30:31Z","title":"SDFR: Synthetic Data for Face Recognition Competition","summary":" Large-scale face recognition datasets are collected by crawling the Internet\nand without individuals' consent, raising legal, ethical, and privacy concerns.\nWith the recent advances in generative models, recently several works proposed\ngenerating synthetic face recognition datasets to mitigate concerns in\nweb-crawled face recognition datasets. This paper presents the summary of the\nSynthetic Data for Face Recognition (SDFR) Competition held in conjunction with\nthe 18th IEEE International Conference on Automatic Face and Gesture\nRecognition (FG 2024) and established to investigate the use of synthetic data\nfor training face recognition models. The SDFR competition was split into two\ntasks, allowing participants to train face recognition systems using new\nsynthetic datasets and/or existing ones. In the first task, the face\nrecognition backbone was fixed and the dataset size was limited, while the\nsecond task provided almost complete freedom on the model backbone, the\ndataset, and the training pipeline. The submitted models were trained on\nexisting and also new synthetic datasets and used clever methods to improve\ntraining with synthetic data. The submissions were evaluated and ranked on a\ndiverse set of seven benchmarking datasets. The paper gives an overview of the\nsubmitted face recognition models and reports achieved performance compared to\nbaseline models trained on real and synthetic datasets. Furthermore, the\nevaluation of submissions is extended to bias assessment across different\ndemography groups. Lastly, an outlook on the current state of the research in\ntraining face recognition models using synthetic data is presented, and\nexisting problems as well as potential future directions are also discussed.\n","authors":["Hatef Otroshi Shahreza","Christophe Ecabert","Anjith George","Alexander Unnervik","Sébastien Marcel","Nicolò Di Domenico","Guido Borghi","Davide Maltoni","Fadi Boutros","Julia Vogel","Naser Damer","Ángela Sánchez-Pérez"," EnriqueMas-Candela","Jorge Calvo-Zaragoza","Bernardo Biesseck","Pedro Vidal","Roger Granada","David Menotti","Ivan DeAndres-Tame","Simone Maurizio La Cava","Sara Concas","Pietro Melzi","Ruben Tolosana","Ruben Vera-Rodriguez","Gianpaolo Perelli","Giulia Orrù","Gian Luca Marcialis","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2404.04580v2.pdf","comment":"The 18th IEEE International Conference on Automatic Face and Gesture\n Recognition (FG 2024)"},{"id":"http://arxiv.org/abs/2404.06057v1","updated":"2024-04-09T06:47:44Z","published":"2024-04-09T06:47:44Z","title":"Unified Multi-modal Diagnostic Framework with Reconstruction\n Pre-training and Heterogeneity-combat Tuning","summary":" Medical multi-modal pre-training has revealed promise in computer-aided\ndiagnosis by leveraging large-scale unlabeled datasets. However, existing\nmethods based on masked autoencoders mainly rely on data-level reconstruction\ntasks, but lack high-level semantic information. 
Furthermore, two significant\nheterogeneity challenges hinder the transfer of pre-trained knowledge to\ndownstream tasks, \\textit{i.e.}, the distribution heterogeneity between\npre-training data and downstream data, and the modality heterogeneity within\ndownstream data. To address these challenges, we propose a Unified Medical\nMulti-modal Diagnostic (UMD) framework with tailored pre-training and\ndownstream tuning strategies. Specifically, to enhance the representation\nabilities of vision and language encoders, we propose the Multi-level\nReconstruction Pre-training (MR-Pretrain) strategy, including a feature-level\nand data-level reconstruction, which guides models to capture the semantic\ninformation from masked inputs of different modalities. Moreover, to tackle two\nkinds of heterogeneities during the downstream tuning, we present the\nheterogeneity-combat downstream tuning strategy, which consists of a\nTask-oriented Distribution Calibration (TD-Calib) and a Gradient-guided\nModality Coordination (GM-Coord). In particular, TD-Calib fine-tunes the\npre-trained model regarding the distribution of downstream datasets, and\nGM-Coord adjusts the gradient weights according to the dynamic optimization\nstatus of different modalities. Extensive experiments on five public medical\ndatasets demonstrate the effectiveness of our UMD framework, which remarkably\noutperforms existing approaches on three kinds of downstream tasks.\n","authors":["Yupei Zhang","Li Pan","Qiushi Yang","Tan Li","Zhen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06057v1.pdf","comment":"to be published in IEEE JBHI; Code available at\n https://github.com/helenypzhang/UMD"},{"id":"http://arxiv.org/abs/2404.06050v1","updated":"2024-04-09T06:27:35Z","published":"2024-04-09T06:27:35Z","title":"Incremental Joint Learning of Depth, Pose and Implicit Scene\n Representation on Monocular Camera in Large-scale Scenes","summary":" Dense scene reconstruction for photo-realistic view synthesis has various\napplications, such as VR/AR, autonomous vehicles. However, most existing\nmethods have difficulties in large-scale scenes due to three core challenges:\n\\textit{(a) inaccurate depth input.} Accurate depth input is impossible to get\nin real-world large-scale scenes. \\textit{(b) inaccurate pose estimation.} Most\nexisting approaches rely on accurate pre-estimated camera poses. \\textit{(c)\ninsufficient scene representation capability.} A single global radiance field\nlacks the capacity to effectively scale to large-scale scenes. To this end, we\npropose an incremental joint learning framework, which can achieve accurate\ndepth, pose estimation, and large-scale scene reconstruction. A vision\ntransformer-based network is adopted as the backbone to enhance performance in\nscale information estimation. For pose estimation, a feature-metric bundle\nadjustment (FBA) method is designed for accurate and robust camera tracking in\nlarge-scale scenes. In terms of implicit scene representation, we propose an\nincremental scene representation method to construct the entire large-scale\nscene as multiple local radiance fields to enhance the scalability of 3D scene\nrepresentation. 
Extended experiments have been conducted to demonstrate the\neffectiveness and accuracy of our method in depth estimation, pose estimation,\nand large-scale scene reconstruction.\n","authors":["Tianchen Deng","Nailin Wang","Chongdi Wang","Shenghai Yuan","Jingchuan Wang","Danwei Wang","Weidong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04421v2","updated":"2024-04-09T06:23:35Z","published":"2024-04-05T21:44:57Z","title":"PhysAvatar: Learning the Physics of Dressed 3D Avatars from Visual\n Observations","summary":" Modeling and rendering photorealistic avatars is of crucial importance in\nmany applications. Existing methods that build a 3D avatar from visual\nobservations, however, struggle to reconstruct clothed humans. We introduce\nPhysAvatar, a novel framework that combines inverse rendering with inverse\nphysics to automatically estimate the shape and appearance of a human from\nmulti-view video data along with the physical parameters of the fabric of their\nclothes. For this purpose, we adopt a mesh-aligned 4D Gaussian technique for\nspatio-temporal mesh tracking as well as a physically based inverse renderer to\nestimate the intrinsic material properties. PhysAvatar integrates a physics\nsimulator to estimate the physical parameters of the garments using\ngradient-based optimization in a principled manner. These novel capabilities\nenable PhysAvatar to create high-quality novel-view renderings of avatars\ndressed in loose-fitting clothes under motions and lighting conditions not seen\nin the training data. This marks a significant advancement towards modeling\nphotorealistic digital humans using physically based inverse rendering with\nphysics in the loop. Our project website is at:\nhttps://qingqing-zhao.github.io/PhysAvatar\n","authors":["Yang Zheng","Qingqing Zhao","Guandao Yang","Wang Yifan","Donglai Xiang","Florian Dubost","Dmitry Lagun","Thabo Beeler","Federico Tombari","Leonidas Guibas","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2404.04421v2.pdf","comment":"Project Page: https://qingqing-zhao.github.io/PhysAvatar"},{"id":"http://arxiv.org/abs/2404.06044v1","updated":"2024-04-09T06:10:15Z","published":"2024-04-09T06:10:15Z","title":"Object Dynamics Modeling with Hierarchical Point Cloud-based\n Representations","summary":" Modeling object dynamics with a neural network is an important problem with\nnumerous applications. Most recent work has been based on graph neural\nnetworks. However, physics happens in 3D space, where geometric information\npotentially plays an important role in modeling physical phenomena. In this\nwork, we propose a novel U-net architecture based on continuous point\nconvolution which naturally embeds information from 3D coordinates and allows\nfor multi-scale feature representations with established downsampling and\nupsampling procedures. Bottleneck layers in the downsampled point clouds lead\nto better long-range interaction modeling. Besides, the flexibility of point\nconvolutions allows our approach to generalize to sparsely sampled points from\nmesh vertices and dynamically generate features on important interaction points\non mesh faces. 
Experimental results demonstrate that our approach significantly\nimproves the state-of-the-art, especially in scenarios that require accurate\ngravity or collision reasoning.\n","authors":["Chanho Kim","Li Fuxin"],"pdf_url":"https://arxiv.org/pdf/2404.06044v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2306.11729v2","updated":"2024-04-09T05:57:18Z","published":"2023-06-20T17:57:23Z","title":"Dense Video Object Captioning from Disjoint Supervision","summary":" We propose a new task and model for dense video object captioning --\ndetecting, tracking and captioning trajectories of objects in a video. This\ntask unifies spatial and temporal localization in video, whilst also requiring\nfine-grained visual understanding that is best described by natural language.\nWe propose a unified model, and demonstrate how our end-to-end approach is more\naccurate and temporally coherent than a multi-stage pipeline combining\nstate-of-the-art detection, tracking, and captioning models. Moreover, we\npropose a training strategy based on a mixture of disjoint tasks, which allows\nus to leverage diverse, large-scale datasets which supervise different parts of\nour model. Although each pretraining task only provides weak supervision, they\nare complementary and, when combined, result in noteworthy zero-shot ability\nand serve as strong initialization for additional finetuning to further improve\naccuracy. We carefully design new metrics capturing all components of our task,\nand show how we can repurpose existing video grounding datasets (e.g. VidSTG\nand VLN) for our new task. We show that our model improves upon a number of\nstrong baselines for this new task. Furthermore, we can apply our model to the\ntask of spatial grounding, outperforming prior state-of-the-art on VidSTG and\nVLN, without explicitly training for it. Code is available at\nhttps://github.com/google-research/scenic/tree/main/scenic/projects/densevoc.\n","authors":["Xingyi Zhou","Anurag Arnab","Chen Sun","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2306.11729v2.pdf","comment":"Code is available at\n https://github.com/google-research/scenic/tree/main/scenic/projects/densevoc"},{"id":"http://arxiv.org/abs/2404.06036v1","updated":"2024-04-09T05:49:04Z","published":"2024-04-09T05:49:04Z","title":"Space-Time Video Super-resolution with Neural Operator","summary":" This paper addresses the task of space-time video super-resolution (ST-VSR).\nExisting methods generally suffer from inaccurate motion estimation and motion\ncompensation (MEMC) problems for large motions. Inspired by recent progress in\nphysics-informed neural networks, we model the challenges of MEMC in ST-VSR as\na mapping between two continuous function spaces. Specifically, our approach\ntransforms independent low-resolution representations in the coarse-grained\ncontinuous function space into refined representations with enriched\nspatiotemporal details in the fine-grained continuous function space. To\nachieve efficient and accurate MEMC, we design a Galerkin-type attention\nfunction to perform frame alignment and temporal interpolation. Due to the\nlinear complexity of the Galerkin-type attention mechanism, our model avoids\npatch partitioning and offers global receptive fields, enabling precise\nestimation of large motions. 
The experimental results show that the proposed\nmethod surpasses state-of-the-art techniques in both fixed-size and continuous\nspace-time video super-resolution tasks.\n","authors":["Yuantong Zhang","Hanyou Zheng","Daiqin Yang","Zhenzhong Chen","Haichuan Ma","Wenpeng Ding"],"pdf_url":"https://arxiv.org/pdf/2404.06036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10473v4","updated":"2024-04-09T05:47:57Z","published":"2023-02-21T06:31:53Z","title":"Oriented Object Detection in Optical Remote Sensing Images using Deep\n Learning: A Survey","summary":" Oriented object detection is one of the most fundamental and challenging\ntasks in remote sensing, aiming to locate and classify objects with arbitrary\norientations. Recent years have witnessed remarkable progress in oriented\nobject detection using deep learning techniques. Given the rapid development of\nthis field, this paper aims to provide a comprehensive survey of recent\nadvances in oriented object detection. To be specific, we first review the\ntechnical evolution from horizontal object detection to oriented object\ndetection and summarize the specific challenges, including feature\nmisalignment, spatial misalignment, and periodicity of angle. Subsequently, we\nfurther categorize existing methods into detection framework, oriented bounding\nbox (OBB) regression, and feature representations, and discuss how these\nmethods address the above challenges in detail. In addition, we cover several\npublicly available datasets and performance evaluation protocols. Furthermore,\nwe provide a comprehensive comparison and analysis of state-of-the-art oriented\nobject detection methods. Toward the end of this paper, we discuss several\nfuture directions for oriented object detection.\n","authors":["Kun Wang","Zi Wang","Zhang Li","Ang Su","Xichao Teng","Minhao Liu","Qifeng Yu"],"pdf_url":"https://arxiv.org/pdf/2302.10473v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06668v2","updated":"2024-04-09T05:47:39Z","published":"2024-03-11T12:36:14Z","title":"PeerAiD: Improving Adversarial Distillation from a Specialized Peer\n Tutor","summary":" Adversarial robustness of the neural network is a significant concern when it\nis applied to security-critical domains. In this situation, adversarial\ndistillation is a promising option which aims to distill the robustness of the\nteacher network to improve the robustness of a small student network. Previous\nworks pretrain the teacher network to make it robust to the adversarial\nexamples aimed at itself. However, the adversarial examples are dependent on\nthe parameters of the target network. The fixed teacher network inevitably\ndegrades its robustness against the unseen transferred adversarial examples\nwhich targets the parameters of the student network in the adversarial\ndistillation process. We propose PeerAiD to make a peer network learn the\nadversarial examples of the student network instead of adversarial examples\naimed at itself. PeerAiD is an adversarial distillation that trains the peer\nnetwork and the student network simultaneously in order to make the peer\nnetwork specialized for defending the student network. We observe that such\npeer networks surpass the robustness of pretrained robust teacher network\nagainst student-attacked adversarial samples. 
With this peer network and\nadversarial distillation, PeerAiD achieves significantly higher robustness of\nthe student network with AutoAttack (AA) accuracy up to 1.66%p and improves the\nnatural accuracy of the student network up to 4.72%p with ResNet-18 and\nTinyImageNet dataset.\n","authors":["Jaewon Jung","Hongsun Jang","Jaeyong Song","Jinho Lee"],"pdf_url":"https://arxiv.org/pdf/2403.06668v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.06033v1","updated":"2024-04-09T05:44:00Z","published":"2024-04-09T05:44:00Z","title":"Little Strokes Fell Great Oaks: Boosting the Hierarchical Features for\n Multi-exposure Image Fusion","summary":" In recent years, deep learning networks have made remarkable strides in the\ndomain of multi-exposure image fusion. Nonetheless, prevailing approaches often\ninvolve directly feeding over-exposed and under-exposed images into the\nnetwork, which leads to the under-utilization of inherent information present\nin the source images. Additionally, unsupervised techniques predominantly\nemploy rudimentary weighted summation for color channel processing, culminating\nin an overall desaturated final image tone. To partially mitigate these issues,\nthis study proposes a gamma correction module specifically designed to fully\nleverage latent information embedded within source images. Furthermore, a\nmodified transformer block, embracing self-attention mechanisms, is\nintroduced to optimize the fusion process. Ultimately, a novel color\nenhancement algorithm is presented to augment image saturation while preserving\nintricate details. The source code is available at https://github.com/ZhiyingDu/BHFMEF.\n","authors":["Pan Mu","Zhiying Du","Jinyuan Liu","Cong Bai"],"pdf_url":"https://arxiv.org/pdf/2404.06033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06029v1","updated":"2024-04-09T05:30:58Z","published":"2024-04-09T05:30:58Z","title":"Improving Facial Landmark Detection Accuracy and Efficiency with\n Knowledge Distillation","summary":" The domain of computer vision has experienced significant advancements in\nfacial-landmark detection, becoming increasingly essential across various\napplications such as augmented reality, facial recognition, and emotion\nanalysis. Unlike object detection or semantic segmentation, which focus on\nidentifying objects and outlining boundaries, facial-landmark detection aims to\nprecisely locate and track critical facial features. However, deploying deep\nlearning-based facial-landmark detection models on embedded systems with\nlimited computational resources poses challenges due to the complexity of\nfacial features, especially in dynamic settings. Additionally, ensuring\nrobustness across diverse ethnicities and expressions presents further\nobstacles. Existing datasets often lack comprehensive representation of facial\nnuances, particularly within populations like those in Taiwan. This paper\nintroduces a novel approach to address these challenges through the development\nof a knowledge distillation method. By transferring knowledge from larger\nmodels to smaller ones, we aim to create lightweight yet powerful deep learning\nmodels tailored specifically for facial-landmark detection tasks. Our goal is\nto design models capable of accurately locating facial landmarks under varying\nconditions, including diverse expressions, orientations, and lighting\nenvironments. 
The ultimate objective is to achieve high accuracy and real-time\nperformance suitable for deployment on embedded systems. This method was\nsuccessfully implemented and achieved a top 6th place finish out of 165\nparticipants in the IEEE ICME 2024 PAIR competition.\n","authors":["Zong-Wei Hong","Yu-Chen Lin"],"pdf_url":"https://arxiv.org/pdf/2404.06029v1.pdf","comment":"technical report. 6th/165 in IEEE ICME 2024 PAIR competition"},{"id":"http://arxiv.org/abs/2404.06025v1","updated":"2024-04-09T05:21:32Z","published":"2024-04-09T05:21:32Z","title":"Greedy-DiM: Greedy Algorithms for Unreasonably Effective Face Morphs","summary":" Morphing attacks are an emerging threat to state-of-the-art Face Recognition\n(FR) systems, which aim to create a single image that contains the biometric\ninformation of multiple identities. Diffusion Morphs (DiM) are a recently\nproposed morphing attack that has achieved state-of-the-art performance for\nrepresentation-based morphing attacks. However, none of the existing research\non DiMs has leveraged the iterative nature of DiMs and left the DiM model as a\nblack box, treating it no differently than one would a Generative Adversarial\nNetwork (GAN) or Variational AutoEncoder (VAE). We propose a greedy strategy on\nthe iterative sampling process of DiM models which searches for an optimal step\nguided by an identity-based heuristic function. We compare our proposed\nalgorithm against ten other state-of-the-art morphing algorithms using the\nopen-source SYN-MAD 2022 competition dataset. We find that our proposed\nalgorithm is unreasonably effective, fooling all of the tested FR systems with\nan MMPMR of 100%, outperforming all other morphing algorithms compared.\n","authors":["Zander W. Blasingame","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06025v1.pdf","comment":"Initial preprint. Under review"},{"id":"http://arxiv.org/abs/2404.06022v1","updated":"2024-04-09T05:11:28Z","published":"2024-04-09T05:11:28Z","title":"Band-Attention Modulated RetNet for Face Forgery Detection","summary":" The transformer networks are extensively utilized in face forgery detection\ndue to their scalability across large datasets. Despite their success,\ntransformers face challenges in balancing the capture of global context, which\nis crucial for unveiling forgery clues, with computational complexity. To\nmitigate this issue, we introduce Band-Attention modulated RetNet (BAR-Net), a\nlightweight network designed to efficiently process extensive visual contexts\nwhile avoiding catastrophic forgetting. Our approach empowers the target token\nto perceive global information by assigning differential attention levels to\ntokens at varying distances. 
We implement self-attention along both spatial\naxes, thereby maintaining spatial priors and easing the computational\nburden. Moreover, we present the adaptive frequency Band-Attention Modulation\nmechanism, which treats the entire Discrete Cosine Transform spectrogram as a\nseries of frequency bands with learnable weights. Together, BAR-Net achieves\nfavorable performance on several face forgery datasets, outperforming current\nstate-of-the-art methods.\n","authors":["Zhida Zhang","Jie Cao","Wenkui Yang","Qihang Fan","Kai Zhou","Ran He"],"pdf_url":"https://arxiv.org/pdf/2404.06022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16271v4","updated":"2024-04-09T05:09:56Z","published":"2024-03-24T19:32:39Z","title":"Object Detectors in the Open Environment: Challenges, Solutions, and\n Outlook","summary":" With the emergence of foundation models, deep learning-based object detectors\nhave shown practical usability in closed set scenarios. However, for real-world\ntasks, object detectors often operate in open environments, where crucial\nfactors (e.g., data distribution, objective) that influence model learning are\noften changing. The dynamic and intricate nature of the open environment poses\nnovel and formidable challenges to object detectors. Unfortunately, current\nresearch on object detectors in open environments lacks a comprehensive\nanalysis of their distinctive characteristics, challenges, and corresponding\nsolutions, which hinders their secure deployment in critical real-world\nscenarios. This paper aims to bridge this gap by conducting a comprehensive\nreview and analysis of object detectors in open environments. We initially\nidentified limitations of key structural components within the existing\ndetection pipeline and propose the open environment object detector challenge\nframework that includes four quadrants (i.e., out-of-domain, out-of-category,\nrobust learning, and incremental learning) based on the dimensions of the data\n/ target changes. For each quadrant of challenges in the proposed framework, we\npresent a detailed description and systematic analysis of the overarching goals\nand core difficulties, systematically review the corresponding solutions, and\nbenchmark their performance over multiple widely adopted datasets. In addition,\nwe engage in a discussion of open problems and potential avenues for future\nresearch. This paper aims to provide a fresh, comprehensive, and systematic\nunderstanding of the challenges and solutions associated with open-environment\nobject detectors, thus catalyzing the development of more solid applications in\nreal-world scenarios. A project related to this survey can be found at\nhttps://github.com/LiangSiyuan21/OEOD_Survey.\n","authors":["Siyuan Liang","Wei Wang","Ruoyu Chen","Aishan Liu","Boxi Wu","Ee-Chien Chang","Xiaochun Cao","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.16271v4.pdf","comment":"37 pages, 17 figures"},{"id":"http://arxiv.org/abs/2312.13980v2","updated":"2024-04-09T04:41:53Z","published":"2023-12-21T16:10:33Z","title":"Carve3D: Improving Multi-view Reconstruction Consistency for Diffusion\n Models with RL Finetuning","summary":" Multi-view diffusion models, obtained by applying Supervised Finetuning (SFT)\nto text-to-image diffusion models, have driven recent breakthroughs in\ntext-to-3D research. However, due to the limited size and quality of existing\n3D datasets, they still suffer from multi-view inconsistencies and Neural\nRadiance Field (NeRF) reconstruction artifacts. 
We argue that multi-view\ndiffusion models can benefit from further Reinforcement Learning Finetuning\n(RLFT), which allows models to learn from the data generated by themselves and\nimprove beyond their dataset limitations during SFT. To this end, we introduce\nCarve3D, an improved RLFT algorithm coupled with a novel Multi-view\nReconstruction Consistency (MRC) metric, to enhance the consistency of\nmulti-view diffusion models. To measure the MRC metric on a set of multi-view\nimages, we compare them with their corresponding NeRF renderings at the same\ncamera viewpoints. The resulting model, which we denote as Carve3DM,\ndemonstrates superior multi-view consistency and NeRF reconstruction quality\nthan existing models. Our results suggest that pairing SFT with Carve3D's RLFT\nis essential for developing multi-view-consistent diffusion models, mirroring\nthe standard Large Language Model (LLM) alignment pipeline. Our code, training\nand testing data, and video results are available at:\nhttps://desaixie.github.io/carve-3d.\n","authors":["Desai Xie","Jiahao Li","Hao Tan","Xin Sun","Zhixin Shu","Yi Zhou","Sai Bi","Sören Pirk","Arie E. Kaufman"],"pdf_url":"https://arxiv.org/pdf/2312.13980v2.pdf","comment":"22 pages, 16 figures. Our code, training and testing data, and video\n results are available at: https://desaixie.github.io/carve-3d. This paper has\n been accepted to CVPR 2024. v2: incorporated changes from the CVPR 2024\n camera-ready version"},{"id":"http://arxiv.org/abs/2404.06012v1","updated":"2024-04-09T04:41:05Z","published":"2024-04-09T04:41:05Z","title":"Diffusion-Based Point Cloud Super-Resolution for mmWave Radar Data","summary":" The millimeter-wave radar sensor maintains stable performance under adverse\nenvironmental conditions, making it a promising solution for all-weather\nperception tasks, such as outdoor mobile robotics. However, the radar point\nclouds are relatively sparse and contain massive ghost points, which greatly\nlimits the development of mmWave radar technology. In this paper, we propose a\nnovel point cloud super-resolution approach for 3D mmWave radar data, named\nRadar-diffusion. Our approach employs the diffusion model defined by\nmean-reverting stochastic differential equations(SDE). Using our proposed new\nobjective function with supervision from corresponding LiDAR point clouds, our\napproach efficiently handles radar ghost points and enhances the sparse mmWave\nradar point clouds to dense LiDAR-like point clouds. We evaluate our approach\non two different datasets, and the experimental results show that our method\noutperforms the state-of-the-art baseline methods in 3D radar super-resolution\ntasks. 
Furthermore, we demonstrate that our enhanced radar point cloud is\ncapable of downstream radar point-based registration tasks.\n","authors":["Kai Luan","Chenghao Shi","Neng Wang","Yuwei Cheng","Huimin Lu","Xieyuanli Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06012v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05997v1","updated":"2024-04-09T04:04:50Z","published":"2024-04-09T04:04:50Z","title":"Concept-Attention Whitening for Interpretable Skin Lesion Diagnosis","summary":" The black-box nature of deep learning models has raised concerns about their\ninterpretability for successful deployment in real-world clinical applications.\nTo address the concerns, eXplainable Artificial Intelligence (XAI) aims to\nprovide clear and understandable explanations of the decision-making process.\nIn the medical domain, concepts such as attributes of lesions or abnormalities\nserve as key evidence for deriving diagnostic results. However, existing\nconcept-based models mainly depend on concepts that appear independently and\nrequire fine-grained concept annotations such as bounding boxes. A medical\nimage usually contains multiple concepts and the fine-grained concept\nannotations are difficult to acquire. In this paper, we propose a novel\nConcept-Attention Whitening (CAW) framework for interpretable skin lesion\ndiagnosis. CAW is comprised of a disease diagnosis branch and a concept\nalignment branch. In the former branch, we train the CNN with a CAW layer\ninserted to perform skin lesion diagnosis. The CAW layer decorrelates features\nand aligns image features to conceptual meanings via an orthogonal matrix. In\nthe latter branch, we calculate the orthogonal matrix under the guidance of the\nconcept attention mask. We particularly introduce a weakly-supervised concept\nmask generator that only leverages coarse concept labels for filtering local\nregions that are relevant to certain concepts, improving the optimization of\nthe orthogonal matrix. Extensive experiments on two public skin lesion\ndiagnosis datasets demonstrated that CAW not only enhanced interpretability but\nalso maintained a state-of-the-art diagnostic performance.\n","authors":["Junlin Hou","Jilan Xu","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05981v1","updated":"2024-04-09T03:27:09Z","published":"2024-04-09T03:27:09Z","title":"A Lightweight Measure of Classification Difficulty from Application\n Dataset Characteristics","summary":" Despite accuracy and computation benchmarks being widely available to help\nchoose among neural network models, these are usually trained on datasets with\nmany classes, and do not give a precise idea of performance for applications of\nfew (< 10) classes. The conventional procedure to predict performance is to\ntrain and test repeatedly on the different models and dataset variations of\ninterest. However, this is computationally expensive. We propose an efficient\nclassification difficulty measure that is calculated from the number of classes\nand intra- and inter-class similarity metrics of the dataset. After a single\nstage of training and testing per model family, relative performance for\ndifferent datasets and models of the same family can be predicted by comparing\ndifficulty measures - without further training and testing. We show how this\nmeasure can help a practitioner select a computationally efficient model for a\nsmall dataset 6 to 29x faster than through repeated training and testing. 
We\ngive an example of use of the measure for an industrial application in which\noptions are identified to select a model 42% smaller than the baseline\nYOLOv5-nano model, and if class merging from 3 to 2 classes meets requirements,\n85% smaller.\n","authors":["Bryan Bo Cao","Abhinav Sharma","Lawrence O'Gorman","Michael Coss","Shubham Jain"],"pdf_url":"https://arxiv.org/pdf/2404.05981v1.pdf","comment":"13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.05980v1","updated":"2024-04-09T03:24:10Z","published":"2024-04-09T03:24:10Z","title":"Tackling Structural Hallucination in Image Translation with Local\n Diffusion","summary":" Recent developments in diffusion models have advanced conditioned image\ngeneration, yet they struggle with reconstructing out-of-distribution (OOD)\nimages, such as unseen tumors in medical images, causing ``image\nhallucination'' and risking misdiagnosis. We hypothesize such hallucinations\nresult from local OOD regions in the conditional images. We verify that\npartitioning the OOD region and conducting separate image generations\nalleviates hallucinations in several applications. From this, we propose a\ntraining-free diffusion framework that reduces hallucination with multiple\nLocal Diffusion processes. Our approach involves OOD estimation followed by two\nmodules: a ``branching'' module generates locally both within and outside OOD\nregions, and a ``fusion'' module integrates these predictions into one. Our\nevaluation shows our method mitigates hallucination over baseline models\nquantitatively and qualitatively, reducing misdiagnosis by 40% and 25% in the\nreal-world medical and natural image datasets, respectively. It also\ndemonstrates compatibility with various pre-trained diffusion models.\n","authors":["Seunghoi Kim","Chen Jin","Tom Diethe","Matteo Figini","Henry F. J. Tregidgo","Asher Mullokandov","Philip Teare","Daniel C. Alexander"],"pdf_url":"https://arxiv.org/pdf/2404.05980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05979v1","updated":"2024-04-09T03:22:36Z","published":"2024-04-09T03:22:36Z","title":"StoryImager: A Unified and Efficient Framework for Coherent Story\n Visualization and Completion","summary":" Story visualization aims to generate a series of realistic and coherent\nimages based on a storyline. Current models adopt a frame-by-frame architecture\nby transforming the pre-trained text-to-image model into an auto-regressive\nmanner. Although these models have shown notable progress, there are still\nthree flaws. 1) The unidirectional generation of auto-regressive manner\nrestricts the usability in many scenarios. 2) The additional introduced story\nhistory encoders bring an extremely high computational cost. 3) The story\nvisualization and continuation models are trained and inferred independently,\nwhich is not user-friendly. To these ends, we propose a bidirectional, unified,\nand efficient framework, namely StoryImager. The StoryImager enhances the\nstoryboard generative ability inherited from the pre-trained text-to-image\nmodel for a bidirectional generation. Specifically, we introduce a Target Frame\nMasking Strategy to extend and unify different story image generation tasks.\nFurthermore, we propose a Frame-Story Cross Attention Module that decomposes\nthe cross attention for local fidelity and global coherence. Moreover, we\ndesign a Contextual Feature Extractor to extract contextual information from\nthe whole storyline. The extensive experimental results demonstrate the\nexcellent performance of our StoryImager. 
The code is available at\nhttps://github.com/tobran/StoryImager.\n","authors":["Ming Tao","Bing-Kun Bao","Hao Tang","Yaowei Wang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2404.05979v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2403.14085v2","updated":"2024-04-09T02:59:41Z","published":"2024-03-21T02:31:17Z","title":"Surface Reconstruction from Point Clouds via Grid-based Intersection\n Prediction","summary":" Surface reconstruction from point clouds is a crucial task in the fields of\ncomputer vision and computer graphics. SDF-based methods excel at\nreconstructing smooth meshes with minimal error and artefacts but struggle with\nrepresenting open surfaces. On the other hand, UDF-based methods can\neffectively represent open surfaces but often introduce noise, leading to\nartefacts in the mesh. In this work, we propose a novel approach that directly\npredicts the intersection points between line segment of point pairs and\nimplicit surfaces. To achieve it, we propose two modules named Relative\nIntersection Module and Sign Module respectively with the feature of point pair\nas input. To preserve the continuity of the surface, we also integrate symmetry\ninto the two modules, which means the position of predicted intersection will\nnot change even if the input order of the point pair changes. This method not\nonly preserves the ability to represent open surfaces but also eliminates most\nartefacts on the mesh. Our approach demonstrates state-of-the-art performance\non three datasets: ShapeNet, MGN, and ScanNet. The code will be made available\nupon acceptance.\n","authors":["Hui Tian","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2403.14085v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03394v2","updated":"2024-04-09T02:56:27Z","published":"2024-04-04T11:53:37Z","title":"Background Noise Reduction of Attention Map for Weakly Supervised\n Semantic Segmentation","summary":" In weakly-supervised semantic segmentation (WSSS) using only image-level\nclass labels, a problem with CNN-based Class Activation Maps (CAM) is that they\ntend to activate the most discriminative local regions of objects. On the other\nhand, methods based on Transformers learn global features but suffer from the\nissue of background noise contamination. This paper focuses on addressing the\nissue of background noise in attention weights within the existing WSSS method\nbased on Conformer, known as TransCAM. The proposed method successfully reduces\nbackground noise, leading to improved accuracy of pseudo labels. Experimental\nresults demonstrate that our model achieves segmentation performance of 70.5%\non the PASCAL VOC 2012 validation data, 71.1% on the test data, and 45.9% on MS\nCOCO 2014 data, outperforming TransCAM in terms of segmentation performance.\n","authors":["Izumi Fujimori","Masaki Oono","Masami Shishibori"],"pdf_url":"https://arxiv.org/pdf/2404.03394v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05967v1","updated":"2024-04-09T02:55:12Z","published":"2024-04-09T02:55:12Z","title":"JSTR: Judgment Improves Scene Text Recognition","summary":" In this paper, we present a method for enhancing the accuracy of scene text\nrecognition tasks by judging whether the image and text match each other. 
While\nprevious studies focused on generating the recognition results from input\nimages, our approach also considers the model's misrecognition results to\nunderstand its error tendencies, thus improving the text recognition pipeline.\nThis method boosts text recognition accuracy by providing explicit feedback on\nthe data that the model is likely to misrecognize by predicting whether the\nimage and text match. The experimental results on publicly\navailable datasets demonstrate that our proposed method outperforms the\nbaseline and state-of-the-art methods in scene text recognition.\n","authors":["Masato Fujitake"],"pdf_url":"https://arxiv.org/pdf/2404.05967v1.pdf","comment":"IntelliSys 2024"},{"id":"http://arxiv.org/abs/2404.05960v1","updated":"2024-04-09T02:47:52Z","published":"2024-04-09T02:47:52Z","title":"EasyTrack: Efficient and Compact One-stream 3D Point Clouds Tracker","summary":" Most 3D single object trackers (SOT) in point clouds follow the two-stream\nmulti-stage 3D Siamese or motion tracking paradigms, which process the template\nand search area point clouds with two parallel branches, built on supervised\npoint cloud backbones. In this work, beyond typical 3D Siamese or motion\ntracking, we propose a neat and compact one-stream transformer 3D SOT paradigm\nfrom a novel perspective, termed as \\textbf{EasyTrack}, which consists of\nthree special designs: 1) A 3D point clouds tracking feature pre-training\nmodule is developed to exploit masked autoencoding for learning 3D point\nclouds tracking representations. 2) A unified 3D tracking feature learning and\nfusion network is proposed to simultaneously learn target-aware 3D features\nand extensively capture mutual correlation through the flexible self-attention\nmechanism. 3) A target location network in the dense bird's eye view (BEV)\nfeature space is constructed for target classification and regression.\nMoreover, we develop an enhanced version named EasyTrack++, which designs the\ncenter points interaction (CPI) strategy to reduce the ambiguous targets caused\nby noisy point cloud background information. The proposed EasyTrack and\nEasyTrack++ set a new state-of-the-art performance ($\\textbf{18\\%}$,\n$\\textbf{40\\%}$ and $\\textbf{3\\%}$ success gains) on KITTI, NuScenes, and Waymo\nwhile running at \\textbf{52.6fps} with few parameters (\\textbf{1.3M}). The code\nwill be available at https://github.com/KnightApple427/Easytrack.\n","authors":["Baojie Fan","Wuyang Zhou","Kai Wang","Shijun Zhou","Fengyu Xu","Jiandong Tian"],"pdf_url":"https://arxiv.org/pdf/2404.05960v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.12554v4","updated":"2024-04-09T02:42:28Z","published":"2023-01-29T22:05:28Z","title":"Improving the Accuracy-Robustness Trade-Off of Classifiers via Adaptive\n Smoothing","summary":" While prior research has proposed a plethora of methods that build neural\nclassifiers robust against adversarial attacks, practitioners are still\nreluctant to adopt them due to their unacceptably severe clean accuracy\npenalties. This paper significantly alleviates this accuracy-robustness\ntrade-off by mixing the output probabilities of a standard classifier and a\nrobust classifier, where the standard network is optimized for clean accuracy\nand is not robust in general. We show that the robust base classifier's\nconfidence difference for correct and incorrect examples is the key to this\nimprovement.
In addition to providing intuitions and empirical evidence, we\ntheoretically certify the robustness of the mixed classifier under realistic\nassumptions. Furthermore, we adapt an adversarial input detector into a mixing\nnetwork that adaptively adjusts the mixture of the two base models, further\nreducing the accuracy penalty of achieving robustness. The proposed flexible\nmethod, termed \"adaptive smoothing\", can work in conjunction with existing or\neven future methods that improve clean accuracy, robustness, or adversary\ndetection. Our empirical evaluation considers strong attack methods, including\nAutoAttack and adaptive attack. On the CIFAR-100 dataset, our method achieves\nan 85.21% clean accuracy while maintaining a 38.72% $\\ell_\\infty$-AutoAttacked\n($\\epsilon = 8/255$) accuracy, becoming the second most robust method on the\nRobustBench CIFAR-100 benchmark as of submission, while improving the clean\naccuracy by ten percentage points compared with all listed models. The code\nthat implements our method is available at\nhttps://github.com/Bai-YT/AdaptiveSmoothing.\n","authors":["Yatong Bai","Brendon G. Anderson","Aerin Kim","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2301.12554v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06136v3","updated":"2024-04-09T02:38:16Z","published":"2024-02-09T01:48:44Z","title":"SIR: Multi-view Inverse Rendering with Decomposable Shadow for Indoor\n Scenes","summary":" We propose SIR, an efficient method to decompose differentiable shadows for\ninverse rendering on indoor scenes using multi-view data, addressing the\nchallenges in accurately decomposing the materials and lighting conditions.\nUnlike previous methods that struggle with shadow fidelity in complex lighting\nenvironments, our approach explicitly learns shadows for enhanced realism in\nmaterial estimation under unknown light positions. Utilizing posed HDR images\nas input, SIR employs an SDF-based neural radiance field for comprehensive\nscene representation. Then, SIR integrates a shadow term with a three-stage\nmaterial estimation approach to improve SVBRDF quality. Specifically, SIR is\ndesigned to learn a differentiable shadow, complemented by BRDF regularization,\nto optimize inverse rendering accuracy. Extensive experiments on both synthetic\nand real-world indoor scenes demonstrate the superior performance of SIR over\nexisting methods in both quantitative metrics and qualitative analysis. The\nsignificant decomposing ability of SIR enables sophisticated editing\ncapabilities like free-view relighting, object insertion, and material\nreplacement. The code and data are available at\nhttps://xiaokangwei.github.io/SIR/.\n","authors":["Xiaokang Wei","Zhuoman Liu","Yan Luximon"],"pdf_url":"https://arxiv.org/pdf/2402.06136v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15033v2","updated":"2024-04-09T02:29:32Z","published":"2024-03-22T08:32:30Z","title":"Toward Tiny and High-quality Facial Makeup with Data Amplify Learning","summary":" Contemporary makeup approaches primarily hinge on unpaired learning\nparadigms, yet they grapple with the challenges of inaccurate supervision\n(e.g., face misalignment) and sophisticated facial prompts (including face\nparsing, and landmark detection). These challenges prohibit low-cost deployment\nof facial makeup models, especially on mobile devices. 
To solve above problems,\nwe propose a brand-new learning paradigm, termed \"Data Amplify Learning (DAL),\"\nalongside a compact makeup model named \"TinyBeauty.\" The core idea of DAL lies\nin employing a Diffusion-based Data Amplifier (DDA) to \"amplify\" limited images\nfor the model training, thereby enabling accurate pixel-to-pixel supervision\nwith merely a handful of annotations. Two pivotal innovations in DDA facilitate\nthe above training approach: (1) A Residual Diffusion Model (RDM) is designed\nto generate high-fidelity detail and circumvent the detail vanishing problem in\nthe vanilla diffusion models; (2) A Fine-Grained Makeup Module (FGMM) is\nproposed to achieve precise makeup control and combination while retaining face\nidentity. Coupled with DAL, TinyBeauty necessitates merely 80K parameters to\nachieve a state-of-the-art performance without intricate face prompts.\nMeanwhile, TinyBeauty achieves a remarkable inference speed of up to 460 fps on\nthe iPhone 13. Extensive experiments show that DAL can produce highly\ncompetitive makeup models using only 5 image pairs.\n","authors":["Qiaoqiao Jin","Xuanhong Chen","Meiguang Jin","Ying Chen","Rui Shi","Yucheng Zheng","Yupeng Zhu","Bingbing Ni"],"pdf_url":"https://arxiv.org/pdf/2403.15033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03662v2","updated":"2024-04-09T01:43:11Z","published":"2024-03-06T12:31:02Z","title":"Harnessing Meta-Learning for Improving Full-Frame Video Stabilization","summary":" Video stabilization is a longstanding computer vision problem, particularly\npixel-level synthesis solutions for video stabilization which synthesize full\nframes add to the complexity of this task. These techniques aim to stabilize\nvideos by synthesizing full frames while enhancing the stability of the\nconsidered video. This intensifies the complexity of the task due to the\ndistinct mix of unique motion profiles and visual content present in each video\nsequence, making robust generalization with fixed parameters difficult. In our\nstudy, we introduce a novel approach to enhance the performance of pixel-level\nsynthesis solutions for video stabilization by adapting these models to\nindividual input video sequences. The proposed adaptation exploits low-level\nvisual cues accessible during test-time to improve both the stability and\nquality of resulting videos. We highlight the efficacy of our methodology of\n\"test-time adaptation\" through simple fine-tuning of one of these models,\nfollowed by significant stability gain via the integration of meta-learning\ntechniques. Notably, significant improvement is achieved with only a single\nadaptation step. The versatility of the proposed algorithm is demonstrated by\nconsistently improving the performance of various pixel-level synthesis models\nfor video stabilization in real-world scenarios.\n","authors":["Muhammad Kashif Ali","Eun Woo Im","Dongjin Kim","Tae Hyun Kim"],"pdf_url":"https://arxiv.org/pdf/2403.03662v2.pdf","comment":"CVPR 2024, Code will be made availble on:\n http://github.com/MKashifAli/MetaVideoStab"},{"id":"http://arxiv.org/abs/2309.13475v3","updated":"2024-04-09T01:26:58Z","published":"2023-09-23T20:33:38Z","title":"Detecting and Mitigating System-Level Anomalies of Vision-Based\n Controllers","summary":" Autonomous systems, such as self-driving cars and drones, have made\nsignificant strides in recent years by leveraging visual inputs and machine\nlearning for decision-making and control. 
Despite their impressive performance,\nthese vision-based controllers can make erroneous predictions when faced with\nnovel or out-of-distribution inputs. Such errors can cascade to catastrophic\nsystem failures and compromise system safety. In this work, we introduce a\nrun-time anomaly monitor to detect and mitigate such closed-loop, system-level\nfailures. Specifically, we leverage a reachability-based framework to\nstress-test the vision-based controller offline and mine its system-level\nfailures. This data is then used to train a classifier that is leveraged online\nto flag inputs that might cause system breakdowns. The anomaly detector\nhighlights issues that transcend individual modules and pertain to the safety\nof the overall system. We also design a fallback controller that robustly\nhandles these detected anomalies to preserve system safety. We validate the\nproposed approach on an autonomous aircraft taxiing system that uses a\nvision-based controller for taxiing. Our results show the efficacy of the\nproposed approach in identifying and handling system-level anomalies,\noutperforming methods such as prediction error-based detection, and ensembling,\nthereby enhancing the overall safety and robustness of autonomous systems.\n","authors":["Aryaman Gupta","Kaustav Chakraborty","Somil Bansal"],"pdf_url":"https://arxiv.org/pdf/2309.13475v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10240v2","updated":"2024-04-09T01:16:07Z","published":"2023-12-15T22:18:38Z","title":"Rich Human Feedback for Text-to-Image Generation","summary":" Recent Text-to-Image (T2I) generation models such as Stable Diffusion and\nImagen have made significant progress in generating high-resolution images\nbased on text descriptions. However, many generated images still suffer from\nissues such as artifacts/implausibility, misalignment with text descriptions,\nand low aesthetic quality. Inspired by the success of Reinforcement Learning\nwith Human Feedback (RLHF) for large language models, prior works collected\nhuman-provided scores as feedback on generated images and trained a reward\nmodel to improve the T2I generation. In this paper, we enrich the feedback\nsignal by (i) marking image regions that are implausible or misaligned with the\ntext, and (ii) annotating which words in the text prompt are misrepresented or\nmissing on the image. We collect such rich human feedback on 18K generated\nimages (RichHF-18K) and train a multimodal transformer to predict the rich\nfeedback automatically. We show that the predicted rich human feedback can be\nleveraged to improve image generation, for example, by selecting high-quality\ntraining data to finetune and improve the generative models, or by creating\nmasks with predicted heatmaps to inpaint the problematic regions. 
Notably, the\nimprovements generalize to models (Muse) beyond those used to generate the\nimages on which human feedback data were collected (Stable Diffusion variants).\nThe RichHF-18K data set will be released in our GitHub repository:\nhttps://github.com/google-research/google-research/tree/master/richhf_18k.\n","authors":["Youwei Liang","Junfeng He","Gang Li","Peizhao Li","Arseniy Klimovskiy","Nicholas Carolan","Jiao Sun","Jordi Pont-Tuset","Sarah Young","Feng Yang","Junjie Ke","Krishnamurthy Dj Dvijotham","Katie Collins","Yiwen Luo","Yang Li","Kai J Kohlhoff","Deepak Ramachandran","Vidhya Navalpakkam"],"pdf_url":"https://arxiv.org/pdf/2312.10240v2.pdf","comment":"CVPR'24"},{"id":"http://arxiv.org/abs/2402.17228v3","updated":"2024-04-09T01:10:15Z","published":"2024-02-27T05:42:38Z","title":"Feature Re-Embedding: Towards Foundation Model-Level Performance in\n Computational Pathology","summary":" Multiple instance learning (MIL) is the most widely used framework in\ncomputational pathology, encompassing sub-typing, diagnosis, prognosis, and\nmore. However, the existing MIL paradigm typically requires an offline instance\nfeature extractor, such as a pre-trained ResNet or a foundation model. This\napproach lacks the capability for feature fine-tuning within the specific\ndownstream tasks, limiting its adaptability and performance. To address this\nissue, we propose a Re-embedded Regional Transformer (R$^2$T) for re-embedding\nthe instance features online, which captures fine-grained local features and\nestablishes connections across different regions. Unlike existing works that\nfocus on pre-training powerful feature extractor or designing sophisticated\ninstance aggregator, R$^2$T is tailored to re-embed instance features online.\nIt serves as a portable module that can seamlessly integrate into mainstream\nMIL models. Extensive experimental results on common computational pathology\ntasks validate that: 1) feature re-embedding improves the performance of MIL\nmodels based on ResNet-50 features to the level of foundation model features,\nand further enhances the performance of foundation model features; 2) the\nR$^2$T can introduce more significant performance improvements to various MIL\nmodels; 3) R$^2$T-MIL, as an R$^2$T-enhanced AB-MIL, outperforms other latest\nmethods by a large margin.The code is available at:\nhttps://github.com/DearCaat/RRT-MIL.\n","authors":["Wenhao Tang","Fengtao Zhou","Sheng Huang","Xiang Zhu","Yi Zhang","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2402.17228v3.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2308.13072v2","updated":"2024-04-09T01:09:41Z","published":"2023-08-24T20:29:09Z","title":"Full-dose Whole-body PET Synthesis from Low-dose PET Using\n High-efficiency Denoising Diffusion Probabilistic Model: PET Consistency\n Model","summary":" Objective: Positron Emission Tomography (PET) has been a commonly used\nimaging modality in broad clinical applications. One of the most important\ntradeoffs in PET imaging is between image quality and radiation dose: high\nimage quality comes with high radiation exposure. Improving image quality is\ndesirable for all clinical applications while minimizing radiation exposure is\nneeded to reduce risk to patients. Approach: We introduce PET Consistency Model\n(PET-CM), an efficient diffusion-based method for generating high-quality\nfull-dose PET images from low-dose PET images. 
It employs a two-step process,\nadding Gaussian noise to full-dose PET images in the forward diffusion, and\nthen denoising them using a PET Shifted-window Vision Transformer (PET-VIT)\nnetwork in the reverse diffusion. The PET-VIT network learns a consistency\nfunction that enables direct denoising of Gaussian noise into clean full-dose\nPET images. PET-CM achieves state-of-the-art image quality while requiring\nsignificantly less computation time than other methods. Results: In experiments\ncomparing eighth-dose to full-dose images, PET-CM demonstrated impressive\nperformance with NMAE of 1.278+/-0.122%, PSNR of 33.783+/-0.824dB, SSIM of\n0.964+/-0.009, NCC of 0.968+/-0.011, HRS of 4.543, and SUV Error of\n0.255+/-0.318%, with an average generation time of 62 seconds per patient. This\nis a significant improvement compared to the state-of-the-art diffusion-based\nmodel with PET-CM reaching this result 12x faster. Similarly, in the\nquarter-dose to full-dose image experiments, PET-CM delivered competitive\noutcomes, achieving an NMAE of 0.973+/-0.066%, PSNR of 36.172+/-0.801dB, SSIM\nof 0.984+/-0.004, NCC of 0.990+/-0.005, HRS of 4.428, and SUV Error of\n0.151+/-0.192% using the same generation process, which underlining its high\nquantitative and clinical precision in both denoising scenario.\n","authors":["Shaoyan Pan","Elham Abouei","Junbo Peng","Joshua Qian","Jacob F Wynne","Tonghe Wang","Chih-Wei Chang","Justin Roper","Jonathon A Nye","Hui Mao","Xiaofeng Yang"],"pdf_url":"https://arxiv.org/pdf/2308.13072v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05916v1","updated":"2024-04-09T00:30:16Z","published":"2024-04-09T00:30:16Z","title":"Prompt-driven Universal Model for View-Agnostic Echocardiography\n Analysis","summary":" Echocardiography segmentation for cardiac analysis is time-consuming and\nresource-intensive due to the variability in image quality and the necessity to\nprocess scans from various standard views. While current automated segmentation\nmethods in echocardiography show promising performance, they are trained on\nspecific scan views to analyze corresponding data. However, this solution has a\nlimitation as the number of required models increases with the number of\nstandard views. To address this, in this paper, we present a prompt-driven\nuniversal method for view-agnostic echocardiography analysis. Considering the\ndomain shift between standard views, we first introduce a method called prompt\nmatching, aimed at learning prompts specific to different views by matching\nprompts and querying input embeddings using a pre-trained vision model. Then,\nwe utilized a pre-trained medical language model to align textual information\nwith pixel data for accurate segmentation. 
Extensive experiments on three\nstandard views showed that our approach significantly outperforms the\nstate-of-the-art universal methods and achieves comparable or even better\nperformances over the segmentation model trained and tested on same views.\n","authors":["Sekeun Kim","Hui Ren","Peng Guo","Abder-Rahman Ali","Patrick Zhang","Kyungsang Kim","Xiang Li","Quanzheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.05916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05911v1","updated":"2024-04-09T00:05:45Z","published":"2024-04-09T00:05:45Z","title":"LATUP-Net: A Lightweight 3D Attention U-Net with Parallel Convolutions\n for Brain Tumor Segmentation","summary":" Early-stage 3D brain tumor segmentation from magnetic resonance imaging (MRI)\nscans is crucial for prompt and effective treatment. However, this process\nfaces the challenge of precise delineation due to the tumors' complex\nheterogeneity. Moreover, energy sustainability targets and resource\nlimitations, especially in developing countries, require efficient and\naccessible medical imaging solutions. The proposed architecture, a Lightweight\n3D ATtention U-Net with Parallel convolutions, LATUP-Net, addresses these\nissues. It is specifically designed to reduce computational requirements\nsignificantly while maintaining high segmentation performance. By incorporating\nparallel convolutions, it enhances feature representation by capturing\nmulti-scale information. It further integrates an attention mechanism to refine\nsegmentation through selective feature recalibration. LATUP-Net achieves\npromising segmentation performance: the average Dice scores for the whole\ntumor, tumor core, and enhancing tumor on the BraTS2020 dataset are 88.41%,\n83.82%, and 73.67%, and on the BraTS2021 dataset, they are 90.29%, 89.54%, and\n83.92%, respectively. Hausdorff distance metrics further indicate its improved\nability to delineate tumor boundaries. With its significantly reduced\ncomputational demand using only 3.07 M parameters, about 59 times fewer than\nother state-of-the-art models, and running on a single V100 GPU, LATUP-Net\nstands out as a promising solution for real-world clinical applications,\nparticularly in settings with limited resources. Investigations into the\nmodel's interpretability, utilizing gradient-weighted class activation mapping\nand confusion matrices, reveal that while attention mechanisms enhance the\nsegmentation of small regions, their impact is nuanced. Achieving the most\naccurate tumor delineation requires carefully balancing local and global\nfeatures.\n","authors":["Ebtihal J. Alwadee","Xianfang Sun","Yipeng Qin","Frank C. Langbein"],"pdf_url":"https://arxiv.org/pdf/2404.05911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06657v1","updated":"2024-04-09T23:47:53Z","published":"2024-04-09T23:47:53Z","title":"Res-U2Net: Untrained Deep Learning for Phase Retrieval and Image\n Reconstruction","summary":" Conventional deep learning-based image reconstruction methods require a large\namount of training data which can be hard to obtain in practice. Untrained deep\nlearning methods overcome this limitation by training a network to invert a\nphysical model of the image formation process. Here we present a novel\nuntrained Res-U2Net model for phase retrieval. We use the extracted phase\ninformation to determine changes in an object's surface and generate a mesh\nrepresentation of its 3D structure. 
We compare the performance of Res-U2Net\nphase retrieval against UNet and U2Net using images from the GDXRAY dataset.\n","authors":["Carlos Osorio Quero","Daniel Leykam","Irving Rondon Ojeda"],"pdf_url":"https://arxiv.org/pdf/2404.06657v1.pdf","comment":"16 pages, 8 figures, 4 Tables"},{"id":"http://arxiv.org/abs/2312.00825v2","updated":"2024-04-09T23:28:49Z","published":"2023-11-30T18:32:14Z","title":"SocialCounterfactuals: Probing and Mitigating Intersectional Social\n Biases in Vision-Language Models with Counterfactual Examples","summary":" While vision-language models (VLMs) have achieved remarkable performance\nimprovements recently, there is growing evidence that these models also possess\nharmful biases with respect to social attributes such as gender and race. Prior\nstudies have primarily focused on probing such bias attributes individually\nwhile ignoring biases associated with intersections between social attributes.\nThis could be due to the difficulty of collecting an exhaustive set of\nimage-text pairs for various combinations of social attributes. To address this\nchallenge, we employ text-to-image diffusion models to produce counterfactual\nexamples for probing intersectional social biases at scale. Our approach\nutilizes Stable Diffusion with cross attention control to produce sets of\ncounterfactual image-text pairs that are highly similar in their depiction of a\nsubject (e.g., a given occupation) while differing only in their depiction of\nintersectional social attributes (e.g., race & gender). Through our\nover-generate-then-filter methodology, we produce SocialCounterfactuals, a\nhigh-quality dataset containing 171k image-text pairs for probing\nintersectional biases related to gender, race, and physical characteristics. We\nconduct extensive experiments to demonstrate the usefulness of our generated\ndataset for probing and mitigating intersectional social biases in\nstate-of-the-art VLMs.\n","authors":["Phillip Howard","Avinash Madasu","Tiep Le","Gustavo Lujan Moreno","Anahita Bhiwandiwalla","Vasudev Lal"],"pdf_url":"https://arxiv.org/pdf/2312.00825v2.pdf","comment":"Accepted to CVPR 2024. arXiv admin note: text overlap with\n arXiv:2310.02988"},{"id":"http://arxiv.org/abs/2404.06653v1","updated":"2024-04-09T23:24:19Z","published":"2024-04-09T23:24:19Z","title":"FlameFinder: Illuminating Obscured Fire through Smoke with Attentive\n Deep Metric Learning","summary":" FlameFinder is a deep metric learning (DML) framework designed to accurately\ndetect flames, even when obscured by smoke, using thermal images from\nfirefighter drones during wildfire monitoring. Traditional RGB cameras struggle\nin such conditions, but thermal cameras can capture smoke-obscured flame\nfeatures. However, they lack absolute thermal reference points, leading to\nfalse positives. To address this issue, FlameFinder utilizes paired thermal-RGB\nimages for training. By learning latent flame features from smoke-free samples,\nthe model becomes less biased towards relative thermal gradients. In testing,\nit identifies flames in smoky patches by analyzing their equivalent\nthermal-domain distribution. This method improves performance using both\nsupervised and distance-based clustering metrics. The framework incorporates a\nflame segmentation method and a DML-aided detection framework. This includes\nutilizing center loss (CL), triplet center loss (TCL), and triplet cosine\ncenter loss (TCCL) to identify optimal cluster representatives for\nclassification.
However, the dominance of center loss over the other losses\nleads to the model missing features sensitive to them. To address this\nlimitation, an attention mechanism is proposed. This mechanism allows for\nnon-uniform feature contribution, amplifying the critical role of cosine and\ntriplet loss in the DML framework. Additionally, it improves interpretability,\nclass discrimination, and decreases intra-class variance. As a result, the\nproposed model surpasses the baseline by 4.4% in the FLAME2 dataset and 7% in\nthe FLAME3 dataset for unobscured flame detection accuracy. Moreover, it\ndemonstrates enhanced class separation in obscured scenarios compared to VGG19,\nResNet18, and three backbone models tailored for flame detection.\n","authors":["Hossein Rajoli","Sahand Khoshdel","Fatemeh Afghah","Xiaolong Ma"],"pdf_url":"https://arxiv.org/pdf/2404.06653v1.pdf","comment":"Submitted as a Journal Paper to IEEE Transactions on Geoscience and\n Remote Sensing"},{"id":"http://arxiv.org/abs/2404.05139v2","updated":"2024-04-09T23:17:07Z","published":"2024-04-08T01:38:43Z","title":"Better Monocular 3D Detectors with LiDAR from the Past","summary":" Accurate 3D object detection is crucial to autonomous driving. Though\nLiDAR-based detectors have achieved impressive performance, the high cost of\nLiDAR sensors precludes their widespread adoption in affordable vehicles.\nCamera-based detectors are cheaper alternatives but often suffer inferior\nperformance compared to their LiDAR-based counterparts due to inherent depth\nambiguities in images. In this work, we seek to improve monocular 3D detectors\nby leveraging unlabeled historical LiDAR data. Specifically, at inference time,\nwe assume that the camera-based detectors have access to multiple unlabeled\nLiDAR scans from past traversals at locations of interest (potentially from\nother high-end vehicles equipped with LiDAR sensors). Under this setup, we\nproposed a novel, simple, and end-to-end trainable framework, termed\nAsyncDepth, to effectively extract relevant features from asynchronous LiDAR\ntraversals of the same location for monocular 3D detectors. We show consistent\nand significant performance gain (up to 9 AP) across multiple state-of-the-art\nmodels and datasets with a negligible additional latency of 9.66 ms and a small\nstorage cost.\n","authors":["Yurong You","Cheng Perng Phoo","Carlos Andres Diaz-Ruiz","Katie Z Luo","Wei-Lun Chao","Mark Campbell","Bharath Hariharan","Kilian Q Weinberger"],"pdf_url":"https://arxiv.org/pdf/2404.05139v2.pdf","comment":"Accepted by ICRA 2024. The code can be found at\n https://github.com/YurongYou/AsyncDepth"},{"id":"http://arxiv.org/abs/2404.06638v1","updated":"2024-04-09T22:17:20Z","published":"2024-04-09T22:17:20Z","title":"SAM-I-Am: Semantic Boosting for Zero-shot Atomic-Scale Electron\n Micrograph Segmentation","summary":" Image segmentation is a critical enabler for tasks ranging from medical\ndiagnostics to autonomous driving. However, the correct segmentation semantics\n- where are boundaries located? what segments are logically similar? - change\ndepending on the domain, such that state-of-the-art foundation models can\ngenerate meaningless and incorrect results. Moreover, in certain domains,\nfine-tuning and retraining techniques are infeasible: obtaining labels is\ncostly and time-consuming; domain images (micrographs) can be exponentially\ndiverse; and data sharing (for third-party retraining) is restricted. 
To enable\nrapid adaptation of the best segmentation technology, we propose the concept of\nsemantic boosting: given a zero-shot foundation model, guide its segmentation\nand adjust results to match domain expectations. We apply semantic boosting to\nthe Segment Anything Model (SAM) to obtain microstructure segmentation for\ntransmission electron microscopy. Our booster, SAM-I-Am, extracts geometric and\ntextural features of various intermediate masks to perform mask removal and\nmask merging operations. We demonstrate a zero-shot performance increase of\n(absolute) +21.35%, +12.6%, +5.27% in mean IoU, and a -9.91%, -18.42%, -4.06%\ndrop in mean false positive masks across images of three difficulty classes\nover vanilla SAM (ViT-L).\n","authors":["Waqwoya Abebe","Jan Strube","Luanzheng Guo","Nathan R. Tallent","Oceane Bel","Steven Spurgeon","Christina Doty","Ali Jannesari"],"pdf_url":"https://arxiv.org/pdf/2404.06638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06637v1","updated":"2024-04-09T22:16:34Z","published":"2024-04-09T22:16:34Z","title":"GeoSynth: Contextually-Aware High-Resolution Satellite Image Synthesis","summary":" We present GeoSynth, a model for synthesizing satellite images with global\nstyle and image-driven layout control. The global style control is via textual\nprompts or geographic location. These enable the specification of scene\nsemantics or regional appearance respectively, and can be used together. We\ntrain our model on a large dataset of paired satellite imagery, with\nautomatically generated captions, and OpenStreetMap data. We evaluate various\ncombinations of control inputs, including different types of layout controls.\nResults demonstrate that our model can generate diverse, high-quality images\nand exhibits excellent zero-shot generalization. The code and model checkpoints\nare available at https://github.com/mvrl/GeoSynth.\n","authors":["Srikumar Sastry","Subash Khanal","Aayush Dhakal","Nathan Jacobs"],"pdf_url":"https://arxiv.org/pdf/2404.06637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05195v2","updated":"2024-04-09T22:14:37Z","published":"2024-02-07T19:07:10Z","title":"$λ$-ECLIPSE: Multi-Concept Personalized Text-to-Image Diffusion\n Models by Leveraging CLIP Latent Space","summary":" Despite the recent advances in personalized text-to-image (P-T2I) generative\nmodels, it remains challenging to perform finetuning-free multi-subject-driven\nT2I in a resource-efficient manner. Predominantly, contemporary approaches,\ninvolving the training of Hypernetworks and Multimodal Large Language Models\n(MLLMs), require heavy computing resources that range from 600 to 12300 GPU\nhours of training. These subject-driven T2I methods hinge on Latent Diffusion\nModels (LDMs), which facilitate T2I mapping through cross-attention layers.\nWhile LDMs offer distinct advantages, P-T2I methods' reliance on the latent\nspace of these diffusion models significantly escalates resource demands,\nleading to inconsistent results and necessitating numerous iterations for a\nsingle desired image. In this paper, we present $\\lambda$-ECLIPSE, an\nalternative prior-training strategy that works in the latent space of a\npre-trained CLIP model without relying on the diffusion UNet models.\n$\\lambda$-ECLIPSE leverages the image-text interleaved pre-training for fast\nand effective multi-subject-driven P-T2I. 
Through extensive experiments, we\nestablish that $\\lambda$-ECLIPSE surpasses existing baselines in composition\nalignment while preserving concept alignment performance, even with\nsignificantly lower resource utilization. $\\lambda$-ECLIPSE performs\nmulti-subject driven P-T2I with just 34M parameters and is trained on a mere 74\nGPU hours. Additionally, $\\lambda$-ECLIPSE demonstrates the unique ability to\nperform multi-concept interpolations.\n","authors":["Maitreya Patel","Sangmin Jung","Chitta Baral","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2402.05195v2.pdf","comment":"Project page: https://eclipse-t2i.github.io/Lambda-ECLIPSE/"},{"id":"http://arxiv.org/abs/2312.04746v2","updated":"2024-04-09T21:48:42Z","published":"2023-12-07T23:16:37Z","title":"Quilt-LLaVA: Visual Instruction Tuning by Extracting Localized\n Narratives from Open-Source Histopathology Videos","summary":" Diagnosis in histopathology requires a global whole slide images (WSIs)\nanalysis, requiring pathologists to compound evidence from different WSI\npatches. The gigapixel scale of WSIs poses a challenge for histopathology\nmulti-modal models. Training multi-model models for histopathology requires\ninstruction tuning datasets, which currently contain information for individual\nimage patches, without a spatial grounding of the concepts within each patch\nand without a wider view of the WSI. Therefore, they lack sufficient diagnostic\ncapacity for histopathology. To bridge this gap, we introduce Quilt-Instruct, a\nlarge-scale dataset of 107,131 histopathology-specific instruction\nquestion/answer pairs, grounded within diagnostically relevant image patches\nthat make up the WSI. Our dataset is collected by leveraging educational\nhistopathology videos from YouTube, which provides spatial localization of\nnarrations by automatically extracting the narrators' cursor positions.\nQuilt-Instruct supports contextual reasoning by extracting diagnosis and\nsupporting facts from the entire WSI. Using Quilt-Instruct, we train\nQuilt-LLaVA, which can reason beyond the given single image patch, enabling\ndiagnostic reasoning across patches. To evaluate Quilt-LLaVA, we propose a\ncomprehensive evaluation dataset created from 985 images and 1283\nhuman-generated question-answers. We also thoroughly evaluate Quilt-LLaVA using\npublic histopathology datasets, where Quilt-LLaVA significantly outperforms\nSOTA by over 10% on relative GPT-4 score and 4% and 9% on open and closed set\nVQA. Our code, data, and model are publicly accessible at\nquilt-llava.github.io.\n","authors":["Mehmet Saygin Seyfioglu","Wisdom O. Ikezogwo","Fatemeh Ghezloo","Ranjay Krishna","Linda Shapiro"],"pdf_url":"https://arxiv.org/pdf/2312.04746v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06622v1","updated":"2024-04-09T21:12:31Z","published":"2024-04-09T21:12:31Z","title":"Calibrating Higher-Order Statistics for Few-Shot Class-Incremental\n Learning with Pre-trained Vision Transformers","summary":" Few-shot class-incremental learning (FSCIL) aims to adapt the model to new\nclasses from very few data (5 samples) without forgetting the previously\nlearned classes. Recent works in many-shot CIL (MSCIL) (using all available\ntraining data) exploited pre-trained models to reduce forgetting and achieve\nbetter plasticity. In a similar fashion, we use ViT models pre-trained on\nlarge-scale datasets for few-shot settings, which face the critical issue of\nlow plasticity. 
FSCIL methods start with a many-shot first task to learn a very\ngood feature extractor and then move to the few-shot setting from the second\ntask onwards. While the focus of most recent studies is on how to learn the\nmany-shot first task so that the model generalizes to all future few-shot\ntasks, we explore in this work how to better model the few-shot data using\npre-trained models, irrespective of how the first task is trained. Inspired by\nrecent works in MSCIL, we explore how using higher-order feature statistics can\ninfluence the classification of few-shot classes. We identify the main\nchallenge of obtaining a good covariance matrix from few-shot data and propose\nto calibrate the covariance matrix for new classes based on semantic similarity\nto the many-shot base classes. Using the calibrated feature statistics in\ncombination with existing methods significantly improves few-shot continual\nclassification on several FSCIL benchmarks. Code is available at\nhttps://github.com/dipamgoswami/FSCIL-Calibration.\n","authors":["Dipam Goswami","Bartłomiej Twardowski","Joost van de Weijer"],"pdf_url":"https://arxiv.org/pdf/2404.06622v1.pdf","comment":"Accepted at CLVision workshop (CVPR 2024)"},{"id":"http://arxiv.org/abs/2403.08092v2","updated":"2024-04-09T20:55:01Z","published":"2024-03-12T22:03:19Z","title":"Mitigating the Impact of Attribute Editing on Face Recognition","summary":" Through a large-scale study over diverse face images, we show that facial\nattribute editing using modern generative AI models can severely degrade\nautomated face recognition systems. This degradation persists even with\nidentity-preserving generative models. To mitigate this issue, we propose two\nnovel techniques for local and global attribute editing. We empirically ablate\ntwenty-six facial semantic, demographic and expression-based attributes that\nhave been edited using state-of-the-art generative models, and evaluate them\nusing ArcFace and AdaFace matchers on CelebA, CelebAMaskHQ and LFW datasets.\nFinally, we use LLaVA, an emerging visual question-answering framework for\nattribute prediction to validate our editing techniques. Our methods outperform\nthe current state-of-the-art at facial editing (BLIP, InstantID) while\nimproving identity retention by a significant extent.\n","authors":["Sudipta Banerjee","Sai Pranaswi Mullangi","Shruti Wagle","Chinmay Hegde","Nasir Memon"],"pdf_url":"https://arxiv.org/pdf/2403.08092v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.06605v1","updated":"2024-04-09T20:24:29Z","published":"2024-04-09T20:24:29Z","title":"RoadBEV: Road Surface Reconstruction in Bird's Eye View","summary":" Road surface conditions, especially geometry profiles, enormously affect\ndriving performance of autonomous vehicles. Vision-based online road\nreconstruction promisingly captures road information in advance. Existing\nsolutions like monocular depth estimation and stereo matching suffer from\nmodest performance. The recent technique of Bird's-Eye-View (BEV) perception\nprovides immense potential to more reliable and accurate reconstruction. This\npaper uniformly proposes two simple yet effective models for road elevation\nreconstruction in BEV named RoadBEV-mono and RoadBEV-stereo, which estimate\nroad elevation with monocular and stereo images, respectively. 
The former\ndirectly fits elevation values based on voxel features queried from image view,\nwhile the latter efficiently recognizes road elevation patterns based on BEV\nvolume representing discrepancy between left and right voxel features.\nInsightful analyses reveal their consistence and difference with perspective\nview. Experiments on real-world dataset verify the models' effectiveness and\nsuperiority. Elevation errors of RoadBEV-mono and RoadBEV-stereo achieve 1.83cm\nand 0.56cm, respectively. The estimation performance improves by 50\\% in BEV\nbased on monocular image. Our models are promising for practical applications,\nproviding valuable references for vision-based BEV perception in autonomous\ndriving. The code is released at https://github.com/ztsrxh/RoadBEV.\n","authors":["Tong Zhao","Lei Yang","Yichen Xie","Mingyu Ding","Masayoshi Tomizuka","Yintao Wei"],"pdf_url":"https://arxiv.org/pdf/2404.06605v1.pdf","comment":"Dataset page: https://thu-rsxd.com/rsrd Code:\n https://github.com/ztsrxh/RoadBEV"},{"id":"http://arxiv.org/abs/2404.06593v1","updated":"2024-04-09T19:49:01Z","published":"2024-04-09T19:49:01Z","title":"Spatially Optimized Compact Deep Metric Learning Model for Similarity\n Search","summary":" Spatial optimization is often overlooked in many computer vision tasks.\nFilters should be able to recognize the features of an object regardless of\nwhere it is in the image. Similarity search is a crucial task where spatial\nfeatures decide an important output. The capacity of convolution to capture\nvisual patterns across various locations is limited. In contrast to\nconvolution, the involution kernel is dynamically created at each pixel based\non the pixel value and parameters that have been learned. This study\ndemonstrates that utilizing a single layer of involution feature extractor\nalongside a compact convolution model significantly enhances the performance of\nsimilarity search. Additionally, we improve predictions by using the GELU\nactivation function rather than the ReLU. The negligible amount of weight\nparameters in involution with a compact model with better performance makes the\nmodel very useful in real-world implementations. Our proposed model is below 1\nmegabyte in size. We have experimented with our proposed methodology and other\nmodels on CIFAR-10, FashionMNIST, and MNIST datasets. Our proposed method\noutperforms across all three datasets.\n","authors":["Md. Farhadul Islam","Md. Tanzim Reza","Meem Arafat Manab","Mohammad Rakibul Hasan Mahin","Sarah Zabeen","Jannatun Noor"],"pdf_url":"https://arxiv.org/pdf/2404.06593v1.pdf","comment":"5 pages, 3 figures,"},{"id":"http://arxiv.org/abs/2404.06589v1","updated":"2024-04-09T19:33:05Z","published":"2024-04-09T19:33:05Z","title":"Leveraging Latents for Efficient Thermography Classification and\n Segmentation","summary":" Breast cancer is a prominent health concern worldwide, currently being the\nsecondmost common and second-deadliest type of cancer in women. While current\nbreast cancer diagnosis mainly relies on mammography imaging, in recent years\nthe use of thermography for breast cancer imaging has been garnering growing\npopularity. Thermographic imaging relies on infrared cameras to capture\nbody-emitted heat distributions. 
While these heat signatures have proven useful\nfor computer-vision systems for accurate breast cancer segmentation and\nclassification, prior work often relies on handcrafted feature engineering or\ncomplex architectures, potentially limiting the comparability and applicability\nof these methods. In this work, we present a novel algorithm for both breast\ncancer classification and segmentation. Rather than focusing efforts on manual\nfeature and architecture engineering, our algorithm focuses on leveraging an\ninformative, learned feature space, thus making our solution simpler to use and\nextend to other frameworks and downstream tasks, as well as more applicable to\ndata-scarce settings. Our classification produces SOTA results, while we are\nthe first work to produce segmentation regions studied in this paper.\n","authors":["Tamir Shor","Chaim Baskin","Alex Bronstein"],"pdf_url":"https://arxiv.org/pdf/2404.06589v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01102v2","updated":"2024-04-09T19:26:36Z","published":"2024-04-01T13:23:04Z","title":"Diffusion based Zero-shot Medical Image-to-Image Translation for Cross\n Modality Segmentation","summary":" Cross-modality image segmentation aims to segment the target modalities using\na method designed in the source modality. Deep generative models can translate\nthe target modality images into the source modality, thus enabling\ncross-modality segmentation. However, a vast body of existing cross-modality\nimage translation methods relies on supervised learning. In this work, we aim\nto address the challenge of zero-shot learning-based image translation tasks\n(extreme scenarios in the target modality is unseen in the training phase). To\nleverage generative learning for zero-shot cross-modality image segmentation,\nwe propose a novel unsupervised image translation method. The framework learns\nto translate the unseen source image to the target modality for image\nsegmentation by leveraging the inherent statistical consistency between\ndifferent modalities for diffusion guidance. Our framework captures identical\ncross-modality features in the statistical domain, offering diffusion guidance\nwithout relying on direct mappings between the source and target domains. This\nadvantage allows our method to adapt to changing source domains without the\nneed for retraining, making it highly practical when sufficient labeled source\ndomain data is not available. The proposed framework is validated in zero-shot\ncross-modality image segmentation tasks through empirical comparisons with\ninfluential generative models, including adversarial-based and diffusion-based\nmodels.\n","authors":["Zihao Wang","Yingyu Yang","Yuzhou Chen","Tingting Yuan","Maxime Sermesant","Herve Delingette","Ona Wu"],"pdf_url":"https://arxiv.org/pdf/2404.01102v2.pdf","comment":"Neurips 2023 Diffusion Workshop"},{"id":"http://arxiv.org/abs/2212.05140v2","updated":"2024-04-09T19:17:07Z","published":"2022-12-09T22:53:40Z","title":"Local Neighborhood Features for 3D Classification","summary":" With advances in deep learning model training strategies, the training of\nPoint cloud classification methods is significantly improving. For example,\nPointNeXt, which adopts prominent training techniques and InvResNet layers into\nPointNet++, achieves over 7% improvement on the real-world ScanObjectNN\ndataset. 
However, most of these models use point coordinates features of\nneighborhood points mapped to higher dimensional space while ignoring the\nneighborhood point features computed before feeding to the network layers. In\nthis paper, we revisit the PointNeXt model to study the usage and benefit of\nsuch neighborhood point features. We train and evaluate PointNeXt on ModelNet40\n(synthetic), ScanObjectNN (real-world), and a recent large-scale, real-world\ngrocery dataset, i.e., 3DGrocery100. In addition, we provide an additional\ninference strategy of weight averaging the top two checkpoints of PointNeXt to\nimprove classification accuracy. Together with the abovementioned ideas, we\ngain 0.5%, 1%, 4.8%, 3.4%, and 1.6% overall accuracy on the PointNeXt model\nwith real-world datasets, ScanObjectNN (hardest variant), 3DGrocery100's\nApple10, Fruits, Vegetables, and Packages subsets, respectively. We also\nachieve a comparable 0.2% accuracy gain on ModelNet40.\n","authors":["Shivanand Venkanna Sheshappanavar","Chandra Kambhamettu"],"pdf_url":"https://arxiv.org/pdf/2212.05140v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05490v2","updated":"2024-04-09T18:55:43Z","published":"2024-04-08T13:11:57Z","title":"Two-Person Interaction Augmentation with Skeleton Priors","summary":" Close and continuous interaction with rich contacts is a crucial aspect of\nhuman activities (e.g. hugging, dancing) and of interest in many domains like\nactivity recognition, motion prediction, character animation, etc. However,\nacquiring such skeletal motion is challenging. While direct motion capture is\nexpensive and slow, motion editing/generation is also non-trivial, as complex\ncontact patterns with topological and geometric constraints have to be\nretained. To this end, we propose a new deep learning method for two-body\nskeletal interaction motion augmentation, which can generate variations of\ncontact-rich interactions with varying body sizes and proportions while\nretaining the key geometric/topological relations between two bodies. Our\nsystem can learn effectively from a relatively small amount of data and\ngeneralize to drastically different skeleton sizes. Through exhaustive\nevaluation and comparison, we show it can generate high-quality motions, has\nstrong generalizability and outperforms traditional optimization-based methods\nand alternative deep learning solutions.\n","authors":["Baiyi Li","Edmond S. L. Ho","Hubert P. H. Shum","He Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05490v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02527v3","updated":"2024-04-09T18:26:27Z","published":"2024-03-04T22:42:17Z","title":"A dataset of over one thousand computed tomography scans of battery\n cells","summary":" Battery technology is increasingly important for global electrification\nefforts. However, batteries are highly sensitive to small manufacturing\nvariations that can induce reliability or safety issues. An important\ntechnology for battery quality control is computed tomography (CT) scanning,\nwhich is widely used for non-destructive 3D inspection across a variety of\nclinical and industrial applications. Historically, however, the utility of CT\nscanning for high-volume manufacturing has been limited by its low throughput\nas well as the difficulty of handling its large file sizes. In this work, we\npresent a dataset of over one thousand CT scans of as-produced commercially\navailable batteries. 
The dataset spans various chemistries (lithium-ion and\nsodium-ion) as well as various battery form factors (cylindrical, pouch, and\nprismatic). We evaluate seven different battery types in total. The\nmanufacturing variability and the presence of battery defects can be observed\nvia this dataset. This dataset may be of interest to scientists and engineers\nworking on battery technology, computer vision, or both.\n","authors":["Amariah Condon","Bailey Buscarino","Eric Moch","William J. Sehnert","Owen Miles","Patrick K. Herring","Peter M. Attia"],"pdf_url":"https://arxiv.org/pdf/2403.02527v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08514v2","updated":"2024-04-09T18:23:39Z","published":"2023-12-13T21:02:03Z","title":"TAM-VT: Transformation-Aware Multi-scale Video Transformer for\n Segmentation and Tracking","summary":" Video Object Segmentation (VOS) has emerged as an increasingly important\nproblem with availability of larger datasets and more complex and realistic\nsettings, which involve long videos with global motion (e.g, in egocentric\nsettings), depicting small objects undergoing both rigid and non-rigid\n(including state) deformations. While a number of recent approaches have been\nexplored for this task, these data characteristics still present challenges. In\nthis work we propose a novel, clip-based DETR-style encoder-decoder\narchitecture, which focuses on systematically analyzing and addressing\naforementioned challenges. Specifically, we propose a novel\ntransformation-aware loss that focuses learning on portions of the video where\nan object undergoes significant deformations -- a form of \"soft\" hard examples\nmining. Further, we propose a multiplicative time-coded memory, beyond vanilla\nadditive positional encoding, which helps propagate context across long videos.\nFinally, we incorporate these in our proposed holistic multi-scale video\ntransformer for tracking via multi-scale memory matching and decoding to ensure\nsensitivity and accuracy for long videos and small objects. Our model enables\non-line inference with long videos in a windowed fashion, by breaking the video\ninto clips and propagating context among them. We illustrate that short clip\nlength and longer memory with learned time-coding are important design choices\nfor improved performance. Collectively, these technical contributions enable\nour model to achieve new state-of-the-art (SoTA) performance on two complex\negocentric datasets -- VISOR and VOST, while achieving comparable to SoTA\nresults on the conventional VOS benchmark, DAVIS'17. A series of detailed\nablations validate our design choices as well as provide insights into the\nimportance of parameter choices and their impact on performance.\n","authors":["Raghav Goyal","Wan-Cyuan Fan","Mennatullah Siam","Leonid Sigal"],"pdf_url":"https://arxiv.org/pdf/2312.08514v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06559v1","updated":"2024-04-09T18:23:34Z","published":"2024-04-09T18:23:34Z","title":"The Impact of Print-and-Scan in Heterogeneous Morph Evaluation Scenarios","summary":" Face morphing attacks present an emerging threat to the face recognition\nsystem. On top of that, printing and scanning the morphed images could obscure\nthe artifacts generated during the morphing process, which makes morphed image\ndetection even harder. 
In this work, we investigate the impact that printing\nand scanning has on morphing attacks through a series of heterogeneous tests.\nOur experiments show that we can increase the possibility of a false match by\nup to 5.64% for DiM and 16.00% for StyleGAN2 when providing an image that has\nbeen printed and scanned, regardless it is morphed or bona fide, to a Face\nRecognition (FR) system. Likewise, using Frechet Inception Distance (FID)\nmetric, strictly print-scanned morph attacks performed on average 9.185%\nstronger than non-print-scanned digital morphs.\n","authors":["Richard E. Neddo","Zander W. Blasingame","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06559v1.pdf","comment":"Initial preprint. Under review"},{"id":"http://arxiv.org/abs/2404.06542v1","updated":"2024-04-09T18:00:25Z","published":"2024-04-09T18:00:25Z","title":"Training-Free Open-Vocabulary Segmentation with Offline\n Diffusion-Augmented Prototype Generation","summary":" Open-vocabulary semantic segmentation aims at segmenting arbitrary categories\nexpressed in textual form. Previous works have trained over large amounts of\nimage-caption pairs to enforce pixel-level multimodal alignments. However,\ncaptions provide global information about the semantics of a given image but\nlack direct localization of individual concepts. Further, training on\nlarge-scale datasets inevitably brings significant computational costs. In this\npaper, we propose FreeDA, a training-free diffusion-augmented method for\nopen-vocabulary semantic segmentation, which leverages the ability of diffusion\nmodels to visually localize generated concepts and local-global similarities to\nmatch class-agnostic regions with semantic classes. Our approach involves an\noffline stage in which textual-visual reference embeddings are collected,\nstarting from a large set of captions and leveraging visual and semantic\ncontexts. At test time, these are queried to support the visual matching\nprocess, which is carried out by jointly considering class-agnostic regions and\nglobal semantic similarities. Extensive analyses demonstrate that FreeDA\nachieves state-of-the-art performance on five datasets, surpassing previous\nmethods by more than 7.0 average points in terms of mIoU and without requiring\nany training.\n","authors":["Luca Barsellotti","Roberto Amoroso","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2404.06542v1.pdf","comment":"CVPR 2024. Project page: https://aimagelab.github.io/freeda/"},{"id":"http://arxiv.org/abs/2208.11650v3","updated":"2024-04-09T17:59:34Z","published":"2022-08-24T16:40:27Z","title":"Lane Change Classification and Prediction with Action Recognition\n Networks","summary":" Anticipating lane change intentions of surrounding vehicles is crucial for\nefficient and safe driving decision making in an autonomous driving system.\nPrevious works often adopt physical variables such as driving speed,\nacceleration and so forth for lane change classification. However, physical\nvariables do not contain semantic information. Although 3D CNNs have been\ndeveloping rapidly, the number of methods utilising action recognition models\nand appearance feature for lane change recognition is low, and they all require\nadditional information to pre-process data. In this work, we propose an\nend-to-end framework including two action recognition methods for lane change\nrecognition, using video data collected by cameras. 
Our method achieves the\nbest lane change classification results using only the RGB video data of the\nPREVENTION dataset. Class activation maps demonstrate that action recognition\nmodels can efficiently extract lane change motions. A method to better extract\nmotion clues is also proposed in this paper.\n","authors":["Kai Liang","Jun Wang","Abhir Bhalerao"],"pdf_url":"https://arxiv.org/pdf/2208.11650v3.pdf","comment":"Accepted to ECCV2022 AVVISION"}]},"2024-04-10T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.07206v1","updated":"2024-04-10T17:59:59Z","published":"2024-04-10T17:59:59Z","title":"GoodDrag: Towards Good Practices for Drag Editing with Diffusion Models","summary":" In this paper, we introduce GoodDrag, a novel approach to improve the\nstability and image quality of drag editing. Unlike existing methods that\nstruggle with accumulated perturbations and often result in distortions,\nGoodDrag introduces an AlDD framework that alternates between drag and\ndenoising operations within the diffusion process, effectively improving the\nfidelity of the result. We also propose an information-preserving motion\nsupervision operation that maintains the original features of the starting\npoint for precise manipulation and artifact reduction. In addition, we\ncontribute to the benchmarking of drag editing by introducing a new dataset,\nDrag100, and developing dedicated quality assessment metrics, Dragging Accuracy\nIndex and Gemini Score, utilizing Large Multimodal Models. Extensive\nexperiments demonstrate that the proposed GoodDrag compares favorably against\nthe state-of-the-art approaches both qualitatively and quantitatively. The\nproject page is https://gooddrag.github.io.\n","authors":["Zewei Zhang","Huan Liu","Jun Chen","Xiangyu Xu"],"pdf_url":"https://arxiv.org/pdf/2404.07206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07204v1","updated":"2024-04-10T17:59:45Z","published":"2024-04-10T17:59:45Z","title":"BRAVE: Broadening the visual encoding of vision-language models","summary":" Vision-language models (VLMs) are typically composed of a vision encoder,\ne.g. CLIP, and a language model (LM) that interprets the encoded features to\nsolve downstream tasks. Despite remarkable progress, VLMs are subject to\nseveral shortcomings due to the limited capabilities of vision encoders, e.g.\n\"blindness\" to certain image features, visual hallucination, etc. To address\nthese issues, we study broadening the visual encoding capabilities of VLMs. We\nfirst comprehensively benchmark several vision encoders with different\ninductive biases for solving VLM tasks. We observe that there is no single\nencoding configuration that consistently achieves top performance across\ndifferent tasks, and encoders with different biases can perform surprisingly\nsimilarly. Motivated by this, we introduce a method, named BRAVE, that\nconsolidates features from multiple frozen encoders into a more versatile\nrepresentation that can be directly fed as the input to a frozen LM. BRAVE\nachieves state-of-the-art performance on a broad range of captioning and VQA\nbenchmarks and significantly reduces the aforementioned issues of VLMs, while\nrequiring a smaller number of trainable parameters than existing methods and\nhaving a more compressed representation. 
Our results highlight the potential of\nincorporating different visual biases for a more broad and contextualized\nvisual understanding of VLMs.\n","authors":["Oğuzhan Fatih Kar","Alessio Tonioni","Petra Poklukar","Achin Kulshrestha","Amir Zamir","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2404.07204v1.pdf","comment":"Project page at https://brave-vlms.epfl.ch/"},{"id":"http://arxiv.org/abs/2404.07202v1","updated":"2024-04-10T17:59:20Z","published":"2024-04-10T17:59:20Z","title":"UMBRAE: Unified Multimodal Decoding of Brain Signals","summary":" We address prevailing challenges of the brain-powered research, departing\nfrom the observation that the literature hardly recover accurate spatial\ninformation and require subject-specific models. To address these challenges,\nwe propose UMBRAE, a unified multimodal decoding of brain signals. First, to\nextract instance-level conceptual and spatial details from neural signals, we\nintroduce an efficient universal brain encoder for multimodal-brain alignment\nand recover object descriptions at multiple levels of granularity from\nsubsequent multimodal large language model (MLLM). Second, we introduce a\ncross-subject training strategy mapping subject-specific features to a common\nfeature space. This allows a model to be trained on multiple subjects without\nextra resources, even yielding superior results compared to subject-specific\nmodels. Further, we demonstrate this supports weakly-supervised adaptation to\nnew subjects, with only a fraction of the total training data. Experiments\ndemonstrate that UMBRAE not only achieves superior results in the newly\nintroduced tasks but also outperforms methods in well established tasks. To\nassess our method, we construct and share with the community a comprehensive\nbrain understanding benchmark BrainHub. Our code and benchmark are available at\nhttps://weihaox.github.io/UMBRAE.\n","authors":["Weihao Xia","Raoul de Charette","Cengiz Öztireli","Jing-Hao Xue"],"pdf_url":"https://arxiv.org/pdf/2404.07202v1.pdf","comment":"Project Page: https://weihaox.github.io/UMBRAE"},{"id":"http://arxiv.org/abs/2404.07199v1","updated":"2024-04-10T17:57:41Z","published":"2024-04-10T17:57:41Z","title":"RealmDreamer: Text-Driven 3D Scene Generation with Inpainting and Depth\n Diffusion","summary":" We introduce RealmDreamer, a technique for generation of general\nforward-facing 3D scenes from text descriptions. Our technique optimizes a 3D\nGaussian Splatting representation to match complex text prompts. We initialize\nthese splats by utilizing the state-of-the-art text-to-image generators,\nlifting their samples into 3D, and computing the occlusion volume. We then\noptimize this representation across multiple views as a 3D inpainting task with\nimage-conditional diffusion models. To learn correct geometric structure, we\nincorporate a depth diffusion model by conditioning on the samples from the\ninpainting model, giving rich geometric structure. Finally, we finetune the\nmodel using sharpened samples from image generators. Notably, our technique\ndoes not require video or multi-view data and can synthesize a variety of\nhigh-quality 3D scenes in different styles, consisting of multiple objects. 
Its\ngenerality additionally allows 3D synthesis from a single image.\n","authors":["Jaidev Shriram","Alex Trevithick","Lingjie Liu","Ravi Ramamoorthi"],"pdf_url":"https://arxiv.org/pdf/2404.07199v1.pdf","comment":"Project Page: https://realmdreamer.github.io/"},{"id":"http://arxiv.org/abs/2404.07191v1","updated":"2024-04-10T17:48:37Z","published":"2024-04-10T17:48:37Z","title":"InstantMesh: Efficient 3D Mesh Generation from a Single Image with\n Sparse-view Large Reconstruction Models","summary":" We present InstantMesh, a feed-forward framework for instant 3D mesh\ngeneration from a single image, featuring state-of-the-art generation quality\nand significant training scalability. By synergizing the strengths of an\noff-the-shelf multiview diffusion model and a sparse-view reconstruction model\nbased on the LRM architecture, InstantMesh is able to create diverse 3D assets\nwithin 10 seconds. To enhance the training efficiency and exploit more\ngeometric supervisions, e.g, depths and normals, we integrate a differentiable\niso-surface extraction module into our framework and directly optimize on the\nmesh representation. Experimental results on public datasets demonstrate that\nInstantMesh significantly outperforms other latest image-to-3D baselines, both\nqualitatively and quantitatively. We release all the code, weights, and demo of\nInstantMesh, with the intention that it can make substantial contributions to\nthe community of 3D generative AI and empower both researchers and content\ncreators.\n","authors":["Jiale Xu","Weihao Cheng","Yiming Gao","Xintao Wang","Shenghua Gao","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2404.07191v1.pdf","comment":"Technical report. Project: https://github.com/TencentARC/InstantMesh"},{"id":"http://arxiv.org/abs/2404.07188v1","updated":"2024-04-10T17:41:41Z","published":"2024-04-10T17:41:41Z","title":"GCV-Turbo: End-to-end Acceleration of GNN-based Computer Vision Tasks on\n FPGA","summary":" Graph neural networks (GNNs) have recently empowered various novel computer\nvision (CV) tasks. In GNN-based CV tasks, a combination of CNN layers and GNN\nlayers or only GNN layers are employed. This paper introduces GCV-Turbo, a\ndomain-specific accelerator on FPGA for end-to-end acceleration of GNN-based CV\ntasks. GCV-Turbo consists of two key components: (1) a \\emph{novel} hardware\narchitecture optimized for the computation kernels in both CNNs and GNNs using\nthe same set of computation resources. (2) a PyTorch-compatible compiler that\ntakes a user-defined model as input, performs end-to-end optimization for the\ncomputation graph of a given GNN-based CV task, and produces optimized code for\nhardware execution. The hardware architecture and the compiler work\nsynergistically to support a variety of GNN-based CV tasks. We implement\nGCV-Turbo on a state-of-the-art FPGA and evaluate its performance across six\nrepresentative GNN-based CV tasks with diverse input data modalities (e.g.,\nimage, human skeleton, point cloud). Compared with state-of-the-art CPU (GPU)\nimplementations, GCV-Turbo achieves an average latency reduction of\n$68.4\\times$ ($4.1\\times$) on these six GNN-based CV tasks. 
Moreover, GCV-Turbo\nsupports the execution of the standalone CNNs or GNNs, achieving performance\ncomparable to that of state-of-the-art CNN (GNN) accelerators for widely used\nCNN-only (GNN-only) models.\n","authors":["Bingyi Zhang","Rajgopal Kannan","Carl Busart","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/2404.07188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.14855v2","updated":"2024-04-10T17:35:16Z","published":"2022-12-30T18:04:25Z","title":"Disentangled Explanations of Neural Network Predictions by Finding\n Relevant Subspaces","summary":" Explainable AI aims to overcome the black-box nature of complex ML models\nlike neural networks by generating explanations for their predictions.\nExplanations often take the form of a heatmap identifying input features (e.g.\npixels) that are relevant to the model's decision. These explanations, however,\nentangle the potentially multiple factors that enter into the overall complex\ndecision strategy. We propose to disentangle explanations by extracting at some\nintermediate layer of a neural network, subspaces that capture the multiple and\ndistinct activation patterns (e.g. visual concepts) that are relevant to the\nprediction. To automatically extract these subspaces, we propose two new\nanalyses, extending principles found in PCA or ICA to explanations. These novel\nanalyses, which we call principal relevant component analysis (PRCA) and\ndisentangled relevant subspace analysis (DRSA), maximize relevance instead of\ne.g. variance or kurtosis. This allows for a much stronger focus of the\nanalysis on what the ML model actually uses for predicting, ignoring\nactivations or concepts to which the model is invariant. Our approach is\ngeneral enough to work alongside common attribution techniques such as Shapley\nValue, Integrated Gradients, or LRP. Our proposed methods show to be\npractically useful and compare favorably to the state of the art as\ndemonstrated on benchmarks and three use cases.\n","authors":["Pattarawat Chormai","Jan Herrmann","Klaus-Robert Müller","Grégoire Montavon"],"pdf_url":"https://arxiv.org/pdf/2212.14855v2.pdf","comment":"17 pages + supplement"},{"id":"http://arxiv.org/abs/2404.07178v1","updated":"2024-04-10T17:28:16Z","published":"2024-04-10T17:28:16Z","title":"Move Anything with Layered Scene Diffusion","summary":" Diffusion models generate images with an unprecedented level of quality, but\nhow can we freely rearrange image layouts? Recent works generate controllable\nscenes via learning spatially disentangled latent codes, but these methods do\nnot apply to diffusion models due to their fixed forward process. In this work,\nwe propose SceneDiffusion to optimize a layered scene representation during the\ndiffusion sampling process. Our key insight is that spatial disentanglement can\nbe obtained by jointly denoising scene renderings at different spatial layouts.\nOur generated scenes support a wide range of spatial editing operations,\nincluding moving, resizing, cloning, and layer-wise appearance editing\noperations, including object restyling and replacing. Moreover, a scene can be\ngenerated conditioned on a reference image, thus enabling object moving for\nin-the-wild images. 
Notably, this approach is training-free, compatible with\ngeneral text-to-image diffusion models, and responsive in less than a second.\n","authors":["Jiawei Ren","Mengmeng Xu","Jui-Chieh Wu","Ziwei Liu","Tao Xiang","Antoine Toisoul"],"pdf_url":"https://arxiv.org/pdf/2404.07178v1.pdf","comment":"CVPR 2024 camera-ready"},{"id":"http://arxiv.org/abs/2404.07176v1","updated":"2024-04-10T17:25:42Z","published":"2024-04-10T17:25:42Z","title":"Self-supervised Monocular Depth Estimation on Water Scenes via Specular\n Reflection Prior","summary":" Monocular depth estimation from a single image is an ill-posed problem for\ncomputer vision due to insufficient reliable cues as the prior knowledge.\nBesides the inter-frame supervision, namely stereo and adjacent frames,\nextensive prior information is available in the same frame. Reflections from\nspecular surfaces, informative intra-frame priors, enable us to reformulate the\nill-posed depth estimation task as a multi-view synthesis. This paper proposes\nthe first self-supervision for deep-learning depth estimation on water scenes\nvia intra-frame priors, known as reflection supervision and geometrical\nconstraints. In the first stage, a water segmentation network is performed to\nseparate the reflection components from the entire image. Next, we construct a\nself-supervised framework to predict the target appearance from reflections,\nperceived as other perspectives. The photometric re-projection error,\nincorporating SmoothL1 and a novel photometric adaptive SSIM, is formulated to\noptimize pose and depth estimation by aligning the transformed virtual depths\nand source ones. As a supplement, the water surface is determined from real and\nvirtual camera positions, which complement the depth of the water area.\nFurthermore, to alleviate these laborious ground truth annotations, we\nintroduce a large-scale water reflection scene (WRS) dataset rendered from\nUnreal Engine 4. Extensive experiments on the WRS dataset prove the feasibility\nof the proposed method compared to state-of-the-art depth estimation\ntechniques.\n","authors":["Zhengyang Lu","Ying Chen"],"pdf_url":"https://arxiv.org/pdf/2404.07176v1.pdf","comment":"16 pages, 8 figures"},{"id":"http://arxiv.org/abs/2212.11120v2","updated":"2024-04-10T17:15:23Z","published":"2022-12-10T07:50:29Z","title":"Deep Learning for Inertial Sensor Alignment","summary":" Accurate alignment of a fixed mobile device equipped with inertial sensors\ninside a moving vehicle is important for navigation, activity recognition, and\nother applications. Accurate estimation of the device mounting angle is\nrequired to rotate the inertial measurement from the sensor frame to the moving\nplatform frame to standardize measurements and improve the performance of the\ntarget task. In this work, a data-driven approach using deep neural networks\n(DNNs) is proposed to learn the yaw mounting angle of a smartphone equipped\nwith an inertial measurement unit (IMU) and strapped to a car. The proposed\nmodel uses only the accelerometer and gyroscope readings from an IMU as input\nand, in contrast to existing solutions, does not require global position inputs\nfrom global navigation satellite systems (GNSS). To train the model in a\nsupervised manner, IMU data is collected for training and validation with the\nsensor mounted at a known yaw mounting angle, and a range of ground truth\nlabels is generated by applying a random rotation in a bounded range to the\nmeasurements. 
The trained model is tested on data with real rotations showing\nsimilar performance as with synthetic rotations. The trained model is deployed\non an Android device and evaluated in real-time to test the accuracy of the\nestimated yaw mounting angle. The model is shown to find the mounting angle at\nan accuracy of 8 degrees within 5 seconds, and 4 degrees within 27 seconds. An\nexperiment is conducted to compare the proposed model with an existing\noff-the-shelf solution.\n","authors":["Maxim Freydin","Niv Sfaradi","Nimrod Segol","Areej Eweida","Barak Or"],"pdf_url":"https://arxiv.org/pdf/2212.11120v2.pdf","comment":"9 Pages, Preprint. Accepted IEEE"},{"id":"http://arxiv.org/abs/2404.07155v1","updated":"2024-04-10T16:44:11Z","published":"2024-04-10T16:44:11Z","title":"Unified Language-driven Zero-shot Domain Adaptation","summary":" This paper introduces Unified Language-driven Zero-shot Domain Adaptation\n(ULDA), a novel task setting that enables a single model to adapt to diverse\ntarget domains without explicit domain-ID knowledge. We identify the\nconstraints in the existing language-driven zero-shot domain adaptation task,\nparticularly the requirement for domain IDs and domain-specific models, which\nmay restrict flexibility and scalability. To overcome these issues, we propose\na new framework for ULDA, consisting of Hierarchical Context Alignment (HCA),\nDomain Consistent Representation Learning (DCRL), and Text-Driven Rectifier\n(TDR). These components work synergistically to align simulated features with\ntarget text across multiple visual levels, retain semantic correlations between\ndifferent regional representations, and rectify biases between simulated and\nreal target visual features, respectively. Our extensive empirical evaluations\ndemonstrate that this framework achieves competitive performance in both\nsettings, surpassing even the model that requires domain-ID, showcasing its\nsuperiority and generalization ability. The proposed method is not only\neffective but also maintains practicality and efficiency, as it does not\nintroduce additional computational costs during inference. Our project page is\nhttps://senqiaoyang.com/project/ULDA .\n","authors":["Senqiao Yang","Zhuotao Tian","Li Jiang","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2404.07155v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.07153v1","updated":"2024-04-10T16:39:50Z","published":"2024-04-10T16:39:50Z","title":"Lost in Translation: Modern Neural Networks Still Struggle With Small\n Realistic Image Transformations","summary":" Deep neural networks that achieve remarkable performance in image\nclassification have previously been shown to be easily fooled by tiny\ntransformations such as a one pixel translation of the input image. In order to\naddress this problem, two approaches have been proposed in recent years. The\nfirst approach suggests using huge datasets together with data augmentation in\nthe hope that a highly varied training set will teach the network to learn to\nbe invariant. The second approach suggests using architectural modifications\nbased on sampling theory to deal explicitly with image translations. In this\npaper, we show that these approaches still fall short in robustly handling\n'natural' image translations that simulate a subtle change in camera\norientation. 
Our findings reveal that a mere one-pixel translation can result\nin a significant change in the predicted image representation for approximately\n40% of the test images in state-of-the-art models (e.g. open-CLIP trained on\nLAION-2B or DINO-v2) , while models that are explicitly constructed to be\nrobust to cyclic translations can still be fooled with 1 pixel realistic\n(non-cyclic) translations 11% of the time. We present Robust Inference by Crop\nSelection: a simple method that can be proven to achieve any desired level of\nconsistency, although with a modest tradeoff with the model's accuracy.\nImportantly, we demonstrate how employing this method reduces the ability to\nfool state-of-the-art models with a 1 pixel translation to less than 5% while\nsuffering from only a 1% drop in classification accuracy. Additionally, we show\nthat our method can be easy adjusted to deal with circular shifts as well. In\nsuch case we achieve 100% robustness to integer shifts with state-of-the-art\naccuracy, and with no need for any further training.\n","authors":["Ofir Shifman","Yair Weiss"],"pdf_url":"https://arxiv.org/pdf/2404.07153v1.pdf","comment":"14 pages, 6 appendices, 17 figures"},{"id":"http://arxiv.org/abs/2312.00068v2","updated":"2024-04-10T16:04:48Z","published":"2023-11-29T20:59:00Z","title":"GLiDR: Topologically Regularized Graph Generative Network for Sparse\n LiDAR Point Clouds","summary":" Sparse LiDAR point clouds cause severe loss of detail of static structures\nand reduce the density of static points available for navigation. Reduced\ndensity can be detrimental to navigation under several scenarios. We observe\nthat despite high sparsity, in most cases, the global topology of LiDAR\noutlining the static structures can be inferred. We utilize this property to\nobtain a backbone skeleton of a LiDAR scan in the form of a single connected\ncomponent that is a proxy to its global topology. We utilize the backbone to\naugment new points along static structures to overcome sparsity. Newly\nintroduced points could correspond to existing static structures or to static\npoints that were earlier obstructed by dynamic objects. To the best of our\nknowledge, we are the first to use such a strategy for sparse LiDAR point\nclouds. Existing solutions close to our approach fail to identify and preserve\nthe global static LiDAR topology and generate sub-optimal points. We propose\nGLiDR, a Graph Generative network that is topologically regularized using\n0-dimensional Persistent Homology ($\\mathcal{PH}$) constraints. This enables\nGLiDR to introduce newer static points along a topologically consistent global\nstatic LiDAR backbone. GLiDR generates precise static points using $32\\times$\nsparser dynamic scans and performs better than the baselines across three\ndatasets. GLiDR generates a valuable byproduct - an accurate binary\nsegmentation mask of static and dynamic objects that are helpful for navigation\nplanning and safety in constrained environments. The newly introduced static\npoints allow GLiDR to outperform LiDAR-based navigation using SLAM in several\nsettings. 
Source code is available at\n$\\texttt{https://github.com/GLiDR-CVPR2024/GLiDR}$.\n","authors":["Prashant Kumar","Kshitij Madhav Bhat","Vedang Bhupesh Shenvi Nadkarni","Prem Kalra"],"pdf_url":"https://arxiv.org/pdf/2312.00068v2.pdf","comment":"IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR)"},{"id":"http://arxiv.org/abs/2404.07124v1","updated":"2024-04-10T16:04:21Z","published":"2024-04-10T16:04:21Z","title":"Measuring proximity to standard planes during fetal brain ultrasound\n scanning","summary":" This paper introduces a novel pipeline designed to bring ultrasound (US)\nplane pose estimation closer to clinical use for more effective navigation to\nthe standard planes (SPs) in the fetal brain. We propose a semi-supervised\nsegmentation model utilizing both labeled SPs and unlabeled 3D US volume\nslices. Our model enables reliable segmentation across a diverse set of fetal\nbrain images. Furthermore, the model incorporates a classification mechanism to\nidentify the fetal brain precisely. Our model not only filters out frames\nlacking the brain but also generates masks for those containing it, enhancing\nthe relevance of plane pose regression in clinical settings. We focus on fetal\nbrain navigation from 2D ultrasound (US) video analysis and combine this model\nwith a US plane pose regression network to provide sensorless proximity\ndetection to SPs and non-SPs planes; we emphasize the importance of proximity\ndetection to SPs for guiding sonographers, offering a substantial advantage\nover traditional methods by allowing earlier and more precise adjustments\nduring scanning. We demonstrate the practical applicability of our approach\nthrough validation on real fetal scan videos obtained from sonographers of\nvarying expertise levels. Our findings demonstrate the potential of our\napproach to complement existing fetal US technologies and advance prenatal\ndiagnostic practices.\n","authors":["Chiara Di Vece","Antonio Cirigliano","Meala Le Lous","Raffaele Napolitano","Anna L. David","Donald Peebles","Pierre Jannin","Francisco Vasconcelos","Danail Stoyanov"],"pdf_url":"https://arxiv.org/pdf/2404.07124v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.07122v1","updated":"2024-04-10T16:01:37Z","published":"2024-04-10T16:01:37Z","title":"Driver Attention Tracking and Analysis","summary":" We propose a novel method to estimate a driver's points-of-gaze using a pair\nof ordinary cameras mounted on the windshield and dashboard of a car. This is a\nchallenging problem due to the dynamics of traffic environments with 3D scenes\nof unknown depths. This problem is further complicated by the volatile distance\nbetween the driver and the camera system. To tackle these challenges, we\ndevelop a novel convolutional network that simultaneously analyzes the image of\nthe scene and the image of the driver's face. This network has a camera\ncalibration module that can compute an embedding vector that represents the\nspatial configuration between the driver and the camera system. This\ncalibration module improves the overall network's performance, which can be\njointly trained end to end.\n We also address the lack of annotated data for training and evaluation by\nintroducing a large-scale driving dataset with point-of-gaze annotations. This\nis an in situ dataset of real driving sessions in an urban city, containing\nsynchronized images of the driving scene as well as the face and gaze of the\ndriver. 
Experiments on this dataset show that the proposed method outperforms\nvarious baseline methods, having the mean prediction error of 29.69 pixels,\nwhich is relatively small compared to the $1280{\\times}720$ resolution of the\nscene camera.\n","authors":["Dat Viet Thanh Nguyen","Anh Tran","Nam Vu","Cuong Pham","Minh Hoai"],"pdf_url":"https://arxiv.org/pdf/2404.07122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10908v3","updated":"2024-04-10T15:59:31Z","published":"2023-12-18T03:34:07Z","title":"CLOVA: A Closed-Loop Visual Assistant with Tool Usage and Update","summary":" Utilizing large language models (LLMs) to compose off-the-shelf visual tools\nrepresents a promising avenue of research for developing robust visual\nassistants capable of addressing diverse visual tasks. However, these methods\noften overlook the potential for continual learning, typically by freezing the\nutilized tools, thus limiting their adaptation to environments requiring new\nknowledge. To tackle this challenge, we propose CLOVA, a Closed-Loop Visual\nAssistant, which operates within a framework encompassing inference,\nreflection, and learning phases. During the inference phase, LLMs generate\nprograms and execute corresponding tools to complete assigned tasks. In the\nreflection phase, a multimodal global-local reflection scheme analyzes human\nfeedback to determine which tools require updating. Lastly, the learning phase\nemploys three flexible approaches to automatically gather training data and\nintroduces a novel prompt tuning scheme to update the tools, allowing CLOVA to\nefficiently acquire new knowledge. Experimental findings demonstrate that CLOVA\nsurpasses existing tool-usage methods by 5% in visual question answering and\nmultiple-image reasoning, by 10% in knowledge tagging, and by 20% in image\nediting. These results underscore the significance of the continual learning\ncapability in general visual assistants.\n","authors":["Zhi Gao","Yuntao Du","Xintong Zhang","Xiaojian Ma","Wenjuan Han","Song-Chun Zhu","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2312.10908v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.11468v3","updated":"2024-04-10T15:58:09Z","published":"2023-11-13T20:41:48Z","title":"Bias-Reduced Neural Networks for Parameter Estimation in Quantitative\n MRI","summary":" Purpose: To develop neural network (NN)-based quantitative MRI parameter\nestimators with minimal bias and a variance close to the Cram\\'er-Rao bound.\n Theory and Methods: We generalize the mean squared error loss to control the\nbias and variance of the NN's estimates, which involves averaging over multiple\nnoise realizations of the same measurements during training. Bias and variance\nproperties of the resulting NNs are studied for two neuroimaging applications.\n Results: In simulations, the proposed strategy reduces the estimates' bias\nthroughout parameter space and achieves a variance close to the Cram\\'er-Rao\nbound. 
In vivo, we observe good concordance between parameter maps estimated\nwith the proposed NNs and traditional estimators, such as non-linear\nleast-squares fitting, while state-of-the-art NNs show larger deviations.\n Conclusion: The proposed NNs have greatly reduced bias compared to those\ntrained using the mean squared error and offer significantly improved\ncomputational efficiency over traditional estimators with comparable or better\naccuracy.\n","authors":["Andrew Mao","Sebastian Flassbeck","Jakob Assländer"],"pdf_url":"https://arxiv.org/pdf/2312.11468v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07112v1","updated":"2024-04-10T15:51:46Z","published":"2024-04-10T15:51:46Z","title":"Unfolding ADMM for Enhanced Subspace Clustering of Hyperspectral Images","summary":" Deep subspace clustering methods are now prominent in clustering, typically\nusing fully connected networks and a self-representation loss function.\nHowever, these methods often struggle with overfitting and lack\ninterpretability. In this paper, we explore an alternative clustering approach\nbased on deep unfolding. By unfolding iterative optimization methods into\nneural networks, this approach offers enhanced interpretability and reliability\ncompared to data-driven deep learning methods, and greater adaptability and\ngeneralization than model-based approaches. Hence, unfolding has become widely\nused in inverse imaging problems, such as image restoration, reconstruction,\nand super-resolution, but has not been sufficiently explored yet in the context\nof clustering. In this work, we introduce an innovative clustering architecture\nfor hyperspectral images (HSI) by unfolding an iterative solver based on the\nAlternating Direction Method of Multipliers (ADMM) for sparse subspace\nclustering. To our knowledge, this is the first attempt to apply unfolding ADMM\nfor computing the self-representation matrix in subspace clustering. Moreover,\nour approach captures well the structural characteristics of HSI data by\nemploying the K nearest neighbors algorithm as part of a structure preservation\nmodule. Experimental evaluation of three established HSI datasets shows clearly\nthe potential of the unfolding approach in HSI clustering and even demonstrates\nsuperior performance compared to state-of-the-art techniques.\n","authors":["Xianlu Li","Nicolas Nadisic","Shaoguang Huang","Aleksandra Pižurica"],"pdf_url":"https://arxiv.org/pdf/2404.07112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07110v1","updated":"2024-04-10T15:47:35Z","published":"2024-04-10T15:47:35Z","title":"Wild Visual Navigation: Fast Traversability Learning via Pre-Trained\n Models and Online Self-Supervision","summary":" Natural environments such as forests and grasslands are challenging for\nrobotic navigation because of the false perception of rigid obstacles from high\ngrass, twigs, or bushes. In this work, we present Wild Visual Navigation (WVN),\nan online self-supervised learning system for visual traversability estimation.\nThe system is able to continuously adapt from a short human demonstration in\nthe field, only using onboard sensing and computing. One of the key ideas to\nachieve this is the use of high-dimensional features from pre-trained\nself-supervised models, which implicitly encode semantic information that\nmassively simplifies the learning task. Further, the development of an online\nscheme for supervision generator enables concurrent training and inference of\nthe learned model in the wild. 
We demonstrate our approach through diverse\nreal-world deployments in forests, parks, and grasslands. Our system is able to\nbootstrap the traversable terrain segmentation in less than 5 min of in-field\ntraining time, enabling the robot to navigate in complex, previously unseen\noutdoor terrains. Code: https://bit.ly/498b0CV - Project\npage:https://bit.ly/3M6nMHH\n","authors":["Matías Mattamala","Jonas Frey","Piotr Libera","Nived Chebrolu","Georg Martius","Cesar Cadena","Marco Hutter","Maurice Fallon"],"pdf_url":"https://arxiv.org/pdf/2404.07110v1.pdf","comment":"Extended version of arXiv:2305.08510"},{"id":"http://arxiv.org/abs/2404.07106v1","updated":"2024-04-10T15:45:03Z","published":"2024-04-10T15:45:03Z","title":"3DMambaComplete: Exploring Structured State Space Model for Point Cloud\n Completion","summary":" Point cloud completion aims to generate a complete and high-fidelity point\ncloud from an initially incomplete and low-quality input. A prevalent strategy\ninvolves leveraging Transformer-based models to encode global features and\nfacilitate the reconstruction process. However, the adoption of pooling\noperations to obtain global feature representations often results in the loss\nof local details within the point cloud. Moreover, the attention mechanism\ninherent in Transformers introduces additional computational complexity,\nrendering it challenging to handle long sequences effectively. To address these\nissues, we propose 3DMambaComplete, a point cloud completion network built on\nthe novel Mamba framework. It comprises three modules: HyperPoint Generation\nencodes point cloud features using Mamba's selection mechanism and predicts a\nset of Hyperpoints. A specific offset is estimated, and the down-sampled points\nbecome HyperPoints. The HyperPoint Spread module disperses these HyperPoints\nacross different spatial locations to avoid concentration. Finally, a\ndeformation method transforms the 2D mesh representation of HyperPoints into a\nfine-grained 3D structure for point cloud reconstruction. Extensive experiments\nconducted on various established benchmarks demonstrate that 3DMambaComplete\nsurpasses state-of-the-art point cloud completion methods, as confirmed by\nqualitative and quantitative analyses.\n","authors":["Yixuan Li","Weidong Yang","Ben Fei"],"pdf_url":"https://arxiv.org/pdf/2404.07106v1.pdf","comment":"10 pages, 8 figures, 7 tables"},{"id":"http://arxiv.org/abs/2404.07097v1","updated":"2024-04-10T15:37:00Z","published":"2024-04-10T15:37:00Z","title":"Learning Priors for Non Rigid SfM from Casual Videos","summary":" We tackle the long-standing challenge of reconstructing 3D structures and\ncamera positions from videos. The problem is particularly hard when objects are\ntransformed in a non-rigid way. Current approaches to this problem make\nunrealistic assumptions or require a long optimization time.\n We present TracksTo4D, a novel deep learning-based approach that enables\ninferring 3D structure and camera positions from dynamic content originating\nfrom in-the-wild videos using a single feed-forward pass on a sparse point\ntrack matrix. To achieve this, we leverage recent advances in 2D point tracking\nand design an equivariant neural architecture tailored for directly processing\n2D point tracks by leveraging their symmetries. TracksTo4D is trained on a\ndataset of in-the-wild videos utilizing only the 2D point tracks extracted from\nthe videos, without any 3D supervision. 
Our experiments demonstrate that\nTracksTo4D generalizes well to unseen videos of unseen semantic categories at\ninference time, producing equivalent results to state-of-the-art methods while\nsignificantly reducing the runtime compared to other baselines.\n","authors":["Yoni Kasten","Wuyue Lu","Haggai Maron"],"pdf_url":"https://arxiv.org/pdf/2404.07097v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07094v1","updated":"2024-04-10T15:34:10Z","published":"2024-04-10T15:34:10Z","title":"MoCap-to-Visual Domain Adaptation for Efficient Human Mesh Estimation\n from 2D Keypoints","summary":" This paper presents Key2Mesh, a model that takes a set of 2D human pose\nkeypoints as input and estimates the corresponding body mesh. Since this\nprocess does not involve any visual (i.e. RGB image) data, the model can be\ntrained on large-scale motion capture (MoCap) datasets, thereby overcoming the\nscarcity of image datasets with 3D labels. To enable the model's application on\nRGB images, we first run an off-the-shelf 2D pose estimator to obtain the 2D\nkeypoints, and then feed these 2D keypoints to Key2Mesh. To improve the\nperformance of our model on RGB images, we apply an adversarial domain\nadaptation (DA) method to bridge the gap between the MoCap and visual domains.\nCrucially, our DA method does not require 3D labels for visual data, which\nenables adaptation to target sets without the need for costly labels. We\nevaluate Key2Mesh for the task of estimating 3D human meshes from 2D keypoints,\nin the absence of RGB and mesh label pairs. Our results on widely used H3.6M\nand 3DPW datasets show that Key2Mesh sets the new state-of-the-art by\noutperforming other models in PA-MPJPE for both datasets, and in MPJPE and PVE\nfor the 3DPW dataset. Thanks to our model's simple architecture, it operates at\nleast 12x faster than the prior state-of-the-art model, LGD. Additional\nqualitative samples and code are available on the project website:\nhttps://key2mesh.github.io/.\n","authors":["Bedirhan Uguz","Ozhan Suat","Batuhan Karagoz","Emre Akbas"],"pdf_url":"https://arxiv.org/pdf/2404.07094v1.pdf","comment":"accepted to CVPRW 2024"},{"id":"http://arxiv.org/abs/2401.07745v2","updated":"2024-04-10T15:30:23Z","published":"2024-01-15T14:56:15Z","title":"MaskClustering: View Consensus based Mask Graph Clustering for\n Open-Vocabulary 3D Instance Segmentation","summary":" Open-vocabulary 3D instance segmentation is cutting-edge for its ability to\nsegment 3D instances without predefined categories. However, progress in 3D\nlags behind its 2D counterpart due to limited annotated 3D data. To address\nthis, recent works first generate 2D open-vocabulary masks through 2D models\nand then merge them into 3D instances based on metrics calculated between two\nneighboring frames. In contrast to these local metrics, we propose a novel\nmetric, view consensus rate, to enhance the utilization of multi-view\nobservations. The key insight is that two 2D masks should be deemed part of the\nsame 3D instance if a significant number of other 2D masks from different views\ncontain both these two masks. Using this metric as edge weight, we construct a\nglobal mask graph where each mask is a node. Through iterative clustering of\nmasks showing high view consensus, we generate a series of clusters, each\nrepresenting a distinct 3D instance. 
Notably, our model is training-free.\nThrough extensive experiments on publicly available datasets, including\nScanNet++, ScanNet200 and MatterPort3D, we demonstrate that our method achieves\nstate-of-the-art performance in open-vocabulary 3D instance segmentation. Our\nproject page is at https://pku-epic.github.io/MaskClustering.\n","authors":["Mi Yan","Jiazhao Zhang","Yan Zhu","He Wang"],"pdf_url":"https://arxiv.org/pdf/2401.07745v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02233v2","updated":"2024-04-10T15:22:05Z","published":"2024-04-02T18:40:55Z","title":"Visual Concept Connectome (VCC): Open World Concept Discovery and their\n Interlayer Connections in Deep Models","summary":" Understanding what deep network models capture in their learned\nrepresentations is a fundamental challenge in computer vision. We present a new\nmethodology to understanding such vision models, the Visual Concept Connectome\n(VCC), which discovers human interpretable concepts and their interlayer\nconnections in a fully unsupervised manner. Our approach simultaneously reveals\nfine-grained concepts at a layer, connection weightings across all layers and\nis amendable to global analysis of network structure (e.g., branching pattern\nof hierarchical concept assemblies). Previous work yielded ways to extract\ninterpretable concepts from single layers and examine their impact on\nclassification, but did not afford multilayer concept analysis across an entire\nnetwork architecture. Quantitative and qualitative empirical results show the\neffectiveness of VCCs in the domain of image classification. Also, we leverage\nVCCs for the application of failure mode debugging to reveal where mistakes\narise in deep networks.\n","authors":["Matthew Kowal","Richard P. Wildes","Konstantinos G. Derpanis"],"pdf_url":"https://arxiv.org/pdf/2404.02233v2.pdf","comment":"CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2401.10831v3","updated":"2024-04-10T15:19:07Z","published":"2024-01-19T17:27:21Z","title":"Understanding Video Transformers via Universal Concept Discovery","summary":" This paper studies the problem of concept-based interpretability of\ntransformer representations for videos. Concretely, we seek to explain the\ndecision-making process of video transformers based on high-level,\nspatiotemporal concepts that are automatically discovered. Prior research on\nconcept-based interpretability has concentrated solely on image-level tasks.\nComparatively, video models deal with the added temporal dimension, increasing\ncomplexity and posing challenges in identifying dynamic concepts over time. In\nthis work, we systematically address these challenges by introducing the first\nVideo Transformer Concept Discovery (VTCD) algorithm. To this end, we propose\nan efficient approach for unsupervised identification of units of video\ntransformer representations - concepts, and ranking their importance to the\noutput of a model. The resulting concepts are highly interpretable, revealing\nspatio-temporal reasoning mechanisms and object-centric representations in\nunstructured video models. Performing this analysis jointly over a diverse set\nof supervised and self-supervised representations, we discover that some of\nthese mechanism are universal in video transformers. Finally, we show that VTCD\ncan be used for fine-grained action recognition and video object segmentation.\n","authors":["Matthew Kowal","Achal Dave","Rares Ambrus","Adrien Gaidon","Konstantinos G. 
Derpanis","Pavel Tokmakov"],"pdf_url":"https://arxiv.org/pdf/2401.10831v3.pdf","comment":"CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2402.18320v2","updated":"2024-04-10T15:09:22Z","published":"2024-02-28T13:33:43Z","title":"Location-guided Head Pose Estimation for Fisheye Image","summary":" Camera with a fisheye or ultra-wide lens covers a wide field of view that\ncannot be modeled by the perspective projection. Serious fisheye lens\ndistortion in the peripheral region of the image leads to degraded performance\nof the existing head pose estimation models trained on undistorted images. This\npaper presents a new approach for head pose estimation that uses the knowledge\nof head location in the image to reduce the negative effect of fisheye\ndistortion. We develop an end-to-end convolutional neural network to estimate\nthe head pose with the multi-task learning of head pose and head location. Our\nproposed network estimates the head pose directly from the fisheye image\nwithout the operation of rectification or calibration. We also created a\nfisheye-distorted version of the three popular head pose estimation datasets,\nBIWI, 300W-LP, and AFLW2000 for our experiments. Experiments results show that\nour network remarkably improves the accuracy of head pose estimation compared\nwith other state-of-the-art one-stage and two-stage methods.\n","authors":["Bing Li","Dong Zhang","Cheng Huang","Yun Xian","Ming Li","Dah-Jye Lee"],"pdf_url":"https://arxiv.org/pdf/2402.18320v2.pdf","comment":"Revised Introduction and Related Work; Submitted to lEEE Transactions\n on Cognitive and Developmental Systems for review"},{"id":"http://arxiv.org/abs/2404.07078v1","updated":"2024-04-10T15:09:15Z","published":"2024-04-10T15:09:15Z","title":"VLLMs Provide Better Context for Emotion Understanding Through Common\n Sense Reasoning","summary":" Recognising emotions in context involves identifying the apparent emotions of\nan individual, taking into account contextual cues from the surrounding scene.\nPrevious approaches to this task have involved the design of explicit\nscene-encoding architectures or the incorporation of external scene-related\ninformation, such as captions. However, these methods often utilise limited\ncontextual information or rely on intricate training pipelines. In this work,\nwe leverage the groundbreaking capabilities of Vision-and-Large-Language Models\n(VLLMs) to enhance in-context emotion classification without introducing\ncomplexity to the training process in a two-stage approach. In the first stage,\nwe propose prompting VLLMs to generate descriptions in natural language of the\nsubject's apparent emotion relative to the visual context. In the second stage,\nthe descriptions are used as contextual information and, along with the image\ninput, are used to train a transformer-based architecture that fuses text and\nvisual features before the final classification task. Our experimental results\nshow that the text and image features have complementary information, and our\nfused architecture significantly outperforms the individual modalities without\nany complex training methods. We evaluate our approach on three different\ndatasets, namely, EMOTIC, CAER-S, and BoLD, and achieve state-of-the-art or\ncomparable accuracy across all datasets and metrics compared to much more\ncomplex approaches. 
The code will be made publicly available on github:\nhttps://github.com/NickyFot/EmoCommonSense.git\n","authors":["Alexandros Xenos","Niki Maria Foteinopoulou","Ioanna Ntinou","Ioannis Patras","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2404.07078v1.pdf","comment":"A. Xenos, N. Foteinopoulou and I. Ntinou contributed equally to this\n work; 14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.07072v1","updated":"2024-04-10T15:02:26Z","published":"2024-04-10T15:02:26Z","title":"Implicit Multi-Spectral Transformer: An Lightweight and Effective\n Visible to Infrared Image Translation Model","summary":" In the field of computer vision, visible light images often exhibit low\ncontrast in low-light conditions, presenting a significant challenge. While\ninfrared imagery provides a potential solution, its utilization entails high\ncosts and practical limitations. Recent advancements in deep learning,\nparticularly the deployment of Generative Adversarial Networks (GANs), have\nfacilitated the transformation of visible light images to infrared images.\nHowever, these methods often experience unstable training phases and may\nproduce suboptimal outputs. To address these issues, we propose a novel\nend-to-end Transformer-based model that efficiently converts visible light\nimages into high-fidelity infrared images. Initially, the Texture Mapping\nModule and Color Perception Adapter collaborate to extract texture and color\nfeatures from the visible light image. The Dynamic Fusion Aggregation Module\nsubsequently integrates these features. Finally, the transformation into an\ninfrared image is refined through the synergistic action of the Color\nPerception Adapter and the Enhanced Perception Attention mechanism.\nComprehensive benchmarking experiments confirm that our model outperforms\nexisting methods, producing infrared images of markedly superior quality, both\nqualitatively and quantitatively. Furthermore, the proposed model enables more\neffective downstream applications for infrared images than other methods.\n","authors":["Yijia Chen","Pinghua Chen","Xiangxin Zhou","Yingtie Lei","Ziyang Zhou","Mingxian Li"],"pdf_url":"https://arxiv.org/pdf/2404.07072v1.pdf","comment":"Accepted by IJCNN 2024"},{"id":"http://arxiv.org/abs/2404.07045v1","updated":"2024-04-10T14:35:22Z","published":"2024-04-10T14:35:22Z","title":"Identification of Fine-grained Systematic Errors via Controlled Scene\n Generation","summary":" Many safety-critical applications, especially in autonomous driving, require\nreliable object detectors. They can be very effectively assisted by a method to\nsearch for and identify potential failures and systematic errors before these\ndetectors are deployed. Systematic errors are characterized by combinations of\nattributes such as object location, scale, orientation, and color, as well as\nthe composition of their respective backgrounds. To identify them, one must\nrely on something other than real images from a test set because they do not\naccount for very rare but possible combinations of attributes. To overcome this\nlimitation, we propose a pipeline for generating realistic synthetic scenes\nwith fine-grained control, allowing the creation of complex scenes with\nmultiple objects. Our approach, BEV2EGO, allows for a realistic generation of\nthe complete scene with road-contingent control that maps 2D bird's-eye view\n(BEV) scene configurations to a first-person view (EGO). 
In addition, we\npropose a benchmark for controlled scene generation to select the most\nappropriate generative outpainting model for BEV2EGO. We further use it to\nperform a systematic analysis of multiple state-of-the-art object detection\nmodels and discover differences between them.\n","authors":["Valentyn Boreiko","Matthias Hein","Jan Hendrik Metzen"],"pdf_url":"https://arxiv.org/pdf/2404.07045v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07032v1","updated":"2024-04-10T14:25:23Z","published":"2024-04-10T14:25:23Z","title":"An Evidential-enhanced Tri-Branch Consistency Learning Method for\n Semi-supervised Medical Image Segmentation","summary":" Semi-supervised segmentation presents a promising approach for large-scale\nmedical image analysis, effectively reducing annotation burdens while achieving\ncomparable performance. This methodology holds substantial potential for\nstreamlining the segmentation process and enhancing its feasibility within\nclinical settings for translational investigations. While cross-supervised\ntraining, based on distinct co-training sub-networks, has become a prevalent\nparadigm for this task, addressing critical issues such as predication\ndisagreement and label-noise suppression requires further attention and\nprogress in cross-supervised training. In this paper, we introduce an\nEvidential Tri-Branch Consistency learning framework (ETC-Net) for\nsemi-supervised medical image segmentation. ETC-Net employs three branches: an\nevidential conservative branch, an evidential progressive branch, and an\nevidential fusion branch. The first two branches exhibit complementary\ncharacteristics, allowing them to address prediction diversity and enhance\ntraining stability. We also integrate uncertainty estimation from the\nevidential learning into cross-supervised training, mitigating the negative\nimpact of erroneous supervision signals. Additionally, the evidential fusion\nbranch capitalizes on the complementary attributes of the first two branches\nand leverages an evidence-based Dempster-Shafer fusion strategy, supervised by\nmore reliable and accurate pseudo-labels of unlabeled data. Extensive\nexperiments conducted on LA, Pancreas-CT, and ACDC datasets demonstrate that\nETC-Net surpasses other state-of-the-art methods for semi-supervised\nsegmentation. The code will be made available in the near future at\nhttps://github.com/Medsemiseg.\n","authors":["Zhenxi Zhang","Heng Zhou","Xiaoran Shi","Ran Ran","Chunna Tian","Feng Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.07032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10166v2","updated":"2024-04-10T14:25:12Z","published":"2024-01-18T17:55:39Z","title":"VMamba: Visual State Space Model","summary":" Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs) have long\nbeen the predominant backbone networks for visual representation learning.\nWhile ViTs have recently gained prominence over CNNs due to their superior\nfitting capabilities, their scalability is largely constrained by the quadratic\ncomplexity of attention computation. Inspired by the capability of Mamba in\nefficiently modeling long sequences, we propose VMamba, a generic vision\nbackbone model aiming to reduce the computational complexity to linear while\nretaining ViTs' advantageous features. 
To enhance VMamba's adaptability in\nprocessing vision data, we introduce the Cross-Scan Module (CSM) to enable 1D\nselective scanning in 2D image space with global receptive fields.\nAdditionally, we make further improvements in implementation details and\narchitectural designs to enhance VMamba's performance and boost its inference\nspeed. Extensive experimental results demonstrate VMamba's promising\nperformance across various visual perception tasks, highlighting its pronounced\nadvantages in input scaling efficiency compared to existing benchmark models.\nSource code is available at https://github.com/MzeroMiko/VMamba.\n","authors":["Yue Liu","Yunjie Tian","Yuzhong Zhao","Hongtian Yu","Lingxi Xie","Yaowei Wang","Qixiang Ye","Yunfan Liu"],"pdf_url":"https://arxiv.org/pdf/2401.10166v2.pdf","comment":"21 pages, 12 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.07031v1","updated":"2024-04-10T14:24:10Z","published":"2024-04-10T14:24:10Z","title":"ORacle: Large Vision-Language Models for Knowledge-Guided Holistic OR\n Domain Modeling","summary":" Every day, countless surgeries are performed worldwide, each within the\ndistinct settings of operating rooms (ORs) that vary not only in their setups\nbut also in the personnel, tools, and equipment used. This inherent diversity\nposes a substantial challenge for achieving a holistic understanding of the OR,\nas it requires models to generalize beyond their initial training datasets. To\nreduce this gap, we introduce ORacle, an advanced vision-language model\ndesigned for holistic OR domain modeling, which incorporates multi-view and\ntemporal capabilities and can leverage external knowledge during inference,\nenabling it to adapt to previously unseen surgical scenarios. This capability\nis further enhanced by our novel data augmentation framework, which\nsignificantly diversifies the training dataset, ensuring ORacle's proficiency\nin applying the provided knowledge effectively. In rigorous testing, in scene\ngraph generation, and downstream tasks on the 4D-OR dataset, ORacle not only\ndemonstrates state-of-the-art performance but does so requiring less data than\nexisting models. Furthermore, its adaptability is displayed through its ability\nto interpret unseen views, actions, and appearances of tools and equipment.\nThis demonstrates ORacle's potential to significantly enhance the scalability\nand affordability of OR domain modeling and opens a pathway for future\nadvancements in surgical data science. We will release our code and data upon\nacceptance.\n","authors":["Ege Özsoy","Chantal Pellegrini","Matthias Keicher","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2404.07031v1.pdf","comment":"11 pages, 3 figures, 7 tables"},{"id":"http://arxiv.org/abs/2404.07029v1","updated":"2024-04-10T14:22:16Z","published":"2024-04-10T14:22:16Z","title":"Diffusion-based inpainting of incomplete Euclidean distance matrices of\n trajectories generated by a fractional Brownian motion","summary":" Fractional Brownian trajectories (fBm) feature both randomness and strong\nscale-free correlations, challenging generative models to reproduce the\nintrinsic memory characterizing the underlying process. Here we test a\ndiffusion probabilistic model on a specific dataset of corrupted images\ncorresponding to incomplete Euclidean distance matrices of fBm at various\nmemory exponents $H$. 
Our dataset implies uniqueness of the data imputation in\nthe regime of low missing ratio, where the remaining partial graph is rigid,\nproviding the ground truth for the inpainting. We find that the conditional\ndiffusion generation stably reproduces the statistics of missing\nfBm-distributed distances for different values of $H$ exponent. Furthermore,\nwhile diffusion models have been recently shown to remember samples from the\ntraining database, we show that diffusion-based inpainting behaves\nqualitatively different from the database search with the increasing database\nsize. Finally, we apply our fBm-trained diffusion model with $H=1/3$ for\ncompletion of chromosome distance matrices obtained in single-cell microscopy\nexperiments, showing its superiority over the standard bioinformatics\nalgorithms. Our source code is available on GitHub at\nhttps://github.com/alobashev/diffusion_fbm.\n","authors":["Alexander Lobashev","Kirill Polovnikov"],"pdf_url":"https://arxiv.org/pdf/2404.07029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10144v4","updated":"2024-04-10T13:58:08Z","published":"2023-12-15T19:00:07Z","title":"Data-Efficient Multimodal Fusion on a Single GPU","summary":" The goal of multimodal alignment is to learn a single latent space that is\nshared between multimodal inputs. The most powerful models in this space have\nbeen trained using massive datasets of paired inputs and large-scale\ncomputational resources, making them prohibitively expensive to train in many\npractical scenarios. We surmise that existing unimodal encoders pre-trained on\nlarge amounts of unimodal data should provide an effective bootstrap to create\nmultimodal models from unimodal ones at much lower costs. We therefore propose\nFuseMix, a multimodal augmentation scheme that operates on the latent spaces of\narbitrary pre-trained unimodal encoders. Using FuseMix for multimodal\nalignment, we achieve competitive performance -- and in certain cases\noutperform state-of-the art methods -- in both image-text and audio-text\nretrieval, with orders of magnitude less compute and data: for example, we\noutperform CLIP on the Flickr30K text-to-image retrieval task with $\\sim \\!\n600\\times$ fewer GPU days and $\\sim \\! 80\\times$ fewer image-text pairs.\nAdditionally, we show how our method can be applied to convert pre-trained\ntext-to-image generative models into audio-to-image ones. Code is available at:\nhttps://github.com/layer6ai-labs/fusemix.\n","authors":["Noël Vouitsis","Zhaoyan Liu","Satya Krishna Gorti","Valentin Villecroze","Jesse C. Cresswell","Guangwei Yu","Gabriel Loaiza-Ganem","Maksims Volkovs"],"pdf_url":"https://arxiv.org/pdf/2312.10144v4.pdf","comment":"CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2307.12256v2","updated":"2024-04-10T13:43:54Z","published":"2023-07-23T08:02:37Z","title":"Building-road Collaborative Extraction from Remotely Sensed Images via\n Cross-Interaction","summary":" Buildings are the basic carrier of social production and human life; roads\nare the links that interconnect social networks. Building and road information\nhas important application value in the frontier fields of regional coordinated\ndevelopment, disaster prevention, auto-driving, etc. Mapping buildings and\nroads from very high-resolution (VHR) remote sensing images have become a hot\nresearch topic. However, the existing methods often ignore the strong spatial\ncorrelation between roads and buildings and extract them in isolation. 
To fully\nutilize the complementary advantages between buildings and roads, we propose a\nbuilding-road collaborative extraction method based on multi-task and\ncross-scale feature interaction to improve the accuracy of both tasks in a\ncomplementary way. A multi-task interaction module is proposed to interact\ninformation across tasks and preserve the unique information of each task,\nwhich tackle the seesaw phenomenon in multitask learning. By considering the\nvariation in appearance and structure between buildings and roads, a\ncross-scale interaction module is designed to automatically learn the optimal\nreception field for different tasks. Compared with many existing methods that\ntrain each task individually, the proposed collaborative extraction method can\nutilize the complementary advantages between buildings and roads by the\nproposed inter-task and inter-scale feature interactions, and automatically\nselect the optimal reception field for different tasks. Experiments on a wide\nrange of urban and rural scenarios show that the proposed algorithm can achieve\nbuilding-road extraction with outstanding performance and efficiency.\n","authors":["Haonan Guo","Xin Su","Chen Wu","Bo Du","Liangpei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12256v2.pdf","comment":"IEEE Transactions on Geoscience and Remote Sensing"},{"id":"http://arxiv.org/abs/2312.07937v5","updated":"2024-04-10T13:35:51Z","published":"2023-12-13T07:30:19Z","title":"BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics","summary":" The recently emerging text-to-motion advances have spired numerous attempts\nfor convenient and interactive human motion generation. Yet, existing methods\nare largely limited to generating body motions only without considering the\nrich two-hand motions, let alone handling various conditions like body dynamics\nor texts. To break the data bottleneck, we propose BOTH57M, a novel multi-modal\ndataset for two-hand motion generation. Our dataset includes accurate motion\ntracking for the human body and hands and provides pair-wised finger-level hand\nannotations and body descriptions. We further provide a strong baseline method,\nBOTH2Hands, for the novel task: generating vivid two-hand motions from both\nimplicit body dynamics and explicit text prompts. We first warm up two parallel\nbody-to-hand and text-to-hand diffusion models and then utilize the\ncross-attention transformer for motion blending. Extensive experiments and\ncross-validations demonstrate the effectiveness of our approach and dataset for\ngenerating convincing two-hand motions from the hybrid body-and-textual\nconditions. Our dataset and code will be disseminated to the community for\nfuture research.\n","authors":["Wenqian Zhang","Molin Huang","Yuxuan Zhou","Juze Zhang","Jingyi Yu","Jingya Wang","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.07937v5.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.05317v2","updated":"2024-04-10T13:30:09Z","published":"2024-04-08T09:08:43Z","title":"WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A\n Conceptual Architecture","summary":" This work proposes a WebXR-based cross-platform conceptual architecture,\nleveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate\nthe development of an open, accessible, and interoperable metaverse. 
By\nintroducing the concept of spatial web app, this research contributes to the\ndiscourse on the metaverse, offering an architecture that democratizes access\nto virtual environments and extended reality through the web, and aligns with\nTim Berners-Lee's original vision of the World Wide Web as an open platform in\nthe digital realm.\n","authors":["Giuseppe Macario"],"pdf_url":"https://arxiv.org/pdf/2404.05317v2.pdf","comment":"minor fixes (typos, URLs etc.)"},{"id":"http://arxiv.org/abs/2309.06067v6","updated":"2024-04-10T13:17:52Z","published":"2023-09-12T09:07:03Z","title":"Implicit Neural Representation for MRI Parallel Imaging Reconstruction","summary":" Magnetic resonance imaging (MRI) usually faces lengthy acquisition times,\nprompting the exploration of strategies such as parallel imaging (PI) to\nalleviate this problem by periodically skipping specific K-space lines and\nsubsequently reconstructing high-quality images from the undersampled K-space.\nImplicit neural representation (INR) has recently emerged as a promising deep\nlearning technique, characterizing objects as continuous functions of spatial\ncoordinates typically parameterized by a multilayer perceptron (MLP). In this\nstudy, we propose a novel MRI PI reconstruction method that uses INR. Our\napproach represents reconstructed fully-sampled images as functions of voxel\ncoordinates and prior feature vectors from undersampled images, addressing the\ngeneralization challenges of INR. Specifically, we introduce a scale-embedded\nencoder to generate scale-independent, voxel-specific features from MR images\nacross various undersampling scales. These features are then concatenated with\ncoordinate vectors to reconstruct fully-sampled MR images, facilitating\nmultiple-scale reconstructions. To evaluate our method's performance, we\nconducted experiments using publicly available MRI datasets, comparing it with\nalternative reconstruction techniques. Our quantitative assessment demonstrates\nthe superiority of our proposed method.\n","authors":["Hao Li","Yusheng Zhou","Jianan Liu","Xiling Liu","Tao Huang","Zhihan Lv","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2309.06067v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12220v2","updated":"2024-04-10T13:15:41Z","published":"2023-07-23T03:55:13Z","title":"Expediting Building Footprint Extraction from High-resolution Remote\n Sensing Images via progressive lenient supervision","summary":" The efficacy of building footprint segmentation from remotely sensed images\nhas been hindered by model transfer effectiveness. Many existing building\nsegmentation methods were developed upon the encoder-decoder architecture of\nU-Net, in which the encoder is finetuned from the newly developed backbone\nnetworks that are pre-trained on ImageNet. However, the heavy computational\nburden of the existing decoder designs hampers the successful transfer of these\nmodern encoder networks to remote sensing tasks. Even the widely-adopted deep\nsupervision strategy fails to mitigate these challenges due to its invalid loss\nin hybrid regions where foreground and background pixels are intermixed. 
In\nthis paper, we conduct a comprehensive evaluation of existing decoder network\ndesigns for building footprint segmentation and propose an efficient framework\ndenoted as BFSeg to enhance learning efficiency and effectiveness.\nSpecifically, a densely-connected coarse-to-fine feature fusion decoder network\nthat facilitates easy and fast feature fusion across scales is proposed.\nMoreover, considering the invalidity of hybrid regions in the down-sampled\nground truth during the deep supervision process, we present a lenient deep\nsupervision and distillation strategy that enables the network to learn proper\nknowledge from deep supervision. Building upon these advancements, we have\ndeveloped a new family of building segmentation networks, which consistently\nsurpass prior works with outstanding performance and efficiency across a wide\nrange of newly developed encoder networks.\n","authors":["Haonan Guo","Bo Du","Chen Wu","Xin Su","Liangpei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12220v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06991v1","updated":"2024-04-10T13:10:52Z","published":"2024-04-10T13:10:52Z","title":"Ray-driven Spectral CT Reconstruction Based on Neural Base-Material\n Fields","summary":" In spectral CT reconstruction, the basis materials decomposition involves\nsolving a large-scale nonlinear system of integral equations, which is highly\nill-posed mathematically. This paper proposes a model that parameterizes the\nattenuation coefficients of the object using a neural field representation,\nthereby avoiding the complex calculations of pixel-driven projection\ncoefficient matrices during the discretization process of line integrals. It\nintroduces a lightweight discretization method for line integrals based on a\nray-driven neural field, enhancing the accuracy of the integral approximation\nduring the discretization process. The basis materials are represented as\ncontinuous vector-valued implicit functions to establish a neural field\nparameterization model for the basis materials. The auto-differentiation\nframework of deep learning is then used to solve the implicit continuous\nfunction of the neural base-material fields. This method is not limited by the\nspatial resolution of reconstructed images, and the network has compact and\nregular properties. Experimental validation shows that our method performs\nexceptionally well in addressing the spectral CT reconstruction. Additionally,\nit fulfils the requirements for the generation of high-resolution\nreconstruction images.\n","authors":["Ligen Shi","Chang Liu","Ping Yang","Jun Qiu","Xing Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.06991v1.pdf","comment":"14 pages,16 figures"},{"id":"http://arxiv.org/abs/2404.01563v2","updated":"2024-04-10T13:02:59Z","published":"2024-04-02T01:57:08Z","title":"Two-Phase Multi-Dose-Level PET Image Reconstruction with Dose Level\n Awareness","summary":" To obtain high-quality positron emission tomography (PET) while minimizing\nradiation exposure, a range of methods have been designed to reconstruct\nstandard-dose PET (SPET) from corresponding low-dose PET (LPET) images.\nHowever, most current methods merely learn the mapping between\nsingle-dose-level LPET and SPET images, but omit the dose disparity of LPET\nimages in clinical scenarios. 
In this paper, to reconstruct high-quality SPET\nimages from multi-dose-level LPET images, we design a novel two-phase\nmulti-dose-level PET reconstruction algorithm with dose level awareness,\ncontaining a pre-training phase and a SPET prediction phase. Specifically, the\npre-training phase is devised to explore both fine-grained discriminative\nfeatures and effective semantic representation. The SPET prediction phase\nadopts a coarse prediction network utilizing pre-learned dose level prior to\ngenerate preliminary result, and a refinement network to precisely preserve the\ndetails. Experiments on MICCAI 2022 Ultra-low Dose PET Imaging Challenge\nDataset have demonstrated the superiority of our method.\n","authors":["Yuchen Fei","Yanmei Luo","Yan Wang","Jiaqi Cui","Yuanyuan Xu","Jiliu Zhou","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2404.01563v2.pdf","comment":"Accepted by ISBI2024"},{"id":"http://arxiv.org/abs/2404.06033v2","updated":"2024-04-10T12:55:49Z","published":"2024-04-09T05:44:00Z","title":"Little Strokes Fell Great Oaks: Boosting the Hierarchical Features for\n Multi-exposure Image Fusion","summary":" In recent years, deep learning networks have made remarkable strides in the\ndomain of multi-exposure image fusion. Nonetheless, prevailing approaches often\ninvolve directly feeding over-exposed and under-exposed images into the\nnetwork, which leads to the under-utilization of inherent information present\nin the source images. Additionally, unsupervised techniques predominantly\nemploy rudimentary weighted summation for color channel processing, culminating\nin an overall desaturated final image tone. To partially mitigate these issues,\nthis study proposes a gamma correction module specifically designed to fully\nleverage latent information embedded within source images. Furthermore, a\nmodified transformer block, embracing with self-attention mechanisms, is\nintroduced to optimize the fusion process. Ultimately, a novel color\nenhancement algorithm is presented to augment image saturation while preserving\nintricate details. The source code is available at\nhttps://github.com/ZhiyingDu/BHFMEF.\n","authors":["Pan Mu","Zhiying Du","Jinyuan Liu","Cong Bai"],"pdf_url":"https://arxiv.org/pdf/2404.06033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02265v2","updated":"2024-04-10T12:54:12Z","published":"2023-10-03T17:59:58Z","title":"DREAM: Visual Decoding from Reversing Human Visual System","summary":" In this work we present DREAM, an fMRI-to-image method for reconstructing\nviewed images from brain activities, grounded on fundamental knowledge of the\nhuman visual system. We craft reverse pathways that emulate the hierarchical\nand parallel nature of how humans perceive the visual world. These tailored\npathways are specialized to decipher semantics, color, and depth cues from fMRI\ndata, mirroring the forward pathways from visual stimuli to fMRI recordings. To\ndo so, two components mimic the inverse processes within the human visual\nsystem: the Reverse Visual Association Cortex (R-VAC) which reverses pathways\nof this brain region, extracting semantics from fMRI data; the Reverse Parallel\nPKM (R-PKM) component simultaneously predicting color and depth from fMRI\nsignals. The experiments indicate that our method outperforms the current\nstate-of-the-art models in terms of the consistency of appearance, structure,\nand semantics. 
Code will be made publicly available to facilitate further\nresearch in this field.\n","authors":["Weihao Xia","Raoul de Charette","Cengiz Öztireli","Jing-Hao Xue"],"pdf_url":"https://arxiv.org/pdf/2310.02265v2.pdf","comment":"Project Page: https://weihaox.github.io/DREAM"},{"id":"http://arxiv.org/abs/2404.06977v1","updated":"2024-04-10T12:45:27Z","published":"2024-04-10T12:45:27Z","title":"Accurate Tennis Court Line Detection on Amateur Recorded Matches","summary":" Typically, tennis court line detection is done by running\nHough-Line-Detection to find straight lines in the image, and then computing a\ntransformation matrix from the detected lines to create the final court\nstructure. We propose numerous improvements and enhancements to this algorithm,\nincluding using pretrained State-of-the-Art shadow-removal and object-detection\nML models to make our line-detection more robust. Compared to the original\nalgorithm, our method can accurately detect lines on amateur, dirty courts.\nWhen combined with a robust ball-tracking system, our method will enable\naccurate, automatic refereeing for amateur and professional tennis matches\nalike.\n","authors":["Sameer Agrawal","Ragoth Sundararajan","Vishak Sagar"],"pdf_url":"https://arxiv.org/pdf/2404.06977v1.pdf","comment":"Accepted to 5th International conference on Image, Video Processing\n and Artificial Intelligence"},{"id":"http://arxiv.org/abs/2404.06971v1","updated":"2024-04-10T12:31:43Z","published":"2024-04-10T12:31:43Z","title":"TrajPRed: Trajectory Prediction with Region-based Relation Learning","summary":" Forecasting human trajectories in traffic scenes is critical for safety\nwithin mixed or fully autonomous systems. Human future trajectories are driven\nby two major stimuli, social interactions, and stochastic goals. Thus, reliable\nforecasting needs to capture these two stimuli. Edge-based relation modeling\nrepresents social interactions using pairwise correlations from precise\nindividual states. Nevertheless, edge-based relations can be vulnerable under\nperturbations. To alleviate these issues, we propose a region-based relation\nlearning paradigm that models social interactions via region-wise dynamics of\njoint states, i.e., the changes in the density of crowds. In particular,\nregion-wise agent joint information is encoded within convolutional feature\ngrids. Social relations are modeled by relating the temporal changes of local\njoint information from a global perspective. We show that region-based\nrelations are less susceptible to perturbations. In order to account for the\nstochastic individual goals, we exploit a conditional variational autoencoder\nto realize multi-goal estimation and diverse future prediction. Specifically,\nwe perform variational inference via the latent distribution, which is\nconditioned on the correlation between input states and associated target\ngoals. Sampling from the latent distribution enables the framework to reliably\ncapture the stochastic behavior in test data. We integrate multi-goal\nestimation and region-based relation learning to model the two stimuli, social\ninteractions, and stochastic goals, in a prediction framework. We evaluate our\nframework on the ETH-UCY dataset and Stanford Drone Dataset (SDD). We show that\nthe diverse prediction better fits the ground truth when incorporating the\nrelation module. 
Our framework outperforms the state-of-the-art models on SDD\nby $27.61\\%$/$18.20\\%$ of ADE/FDE metrics.\n","authors":["Chen Zhou","Ghassan AlRegib","Armin Parchami","Kunjan Singh"],"pdf_url":"https://arxiv.org/pdf/2404.06971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06963v1","updated":"2024-04-10T12:22:19Z","published":"2024-04-10T12:22:19Z","title":"V-MAD: Video-based Morphing Attack Detection in Operational Scenarios","summary":" In response to the rising threat of the face morphing attack, this paper\nintroduces and explores the potential of Video-based Morphing Attack Detection\n(V-MAD) systems in real-world operational scenarios. While current morphing\nattack detection methods primarily focus on a single or a pair of images, V-MAD\nis based on video sequences, exploiting the video streams often acquired by\nface verification tools available, for instance, at airport gates. Through this\nstudy, we show for the first time the advantages that the availability of\nmultiple probe frames can bring to the morphing attack detection task,\nespecially in scenarios where the quality of probe images is varied and might\nbe affected, for instance, by pose or illumination variations. Experimental\nresults on a real operational database demonstrate that video sequences\nrepresent valuable information for increasing the robustness and performance of\nmorphing attack detection systems.\n","authors":["Guido Borghi","Annalisa Franco","Nicolò Di Domenico","Matteo Ferrara","Davide Maltoni"],"pdf_url":"https://arxiv.org/pdf/2404.06963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06957v1","updated":"2024-04-10T12:17:25Z","published":"2024-04-10T12:17:25Z","title":"Adversarial purification for no-reference image-quality metrics:\n applicability study and new methods","summary":" Recently, the area of adversarial attacks on image quality metrics has begun\nto be explored, whereas the area of defences remains under-researched. In this\nstudy, we aim to cover that case and check the transferability of adversarial\npurification defences from image classifiers to IQA methods. In this paper, we\napply several widespread attacks on IQA models and examine the success of the\ndefences against them. The purification methodologies covered different\npreprocessing techniques, including geometrical transformations, compression,\ndenoising, and modern neural network-based methods. Also, we address the\nchallenge of assessing the efficacy of a defensive methodology by proposing\nways to estimate output visual quality and the success of neutralizing attacks.\nDefences were tested against attack on three IQA metrics -- Linearity, MetaIQA\nand SPAQ. The code for attacks and defences is available at: (link is hidden\nfor a blind review).\n","authors":["Aleksandr Gushchin","Anna Chistyakova","Vladislav Minashkin","Anastasia Antsiferova","Dmitriy Vatolin"],"pdf_url":"https://arxiv.org/pdf/2404.06957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04350v3","updated":"2024-04-10T11:58:24Z","published":"2024-01-09T04:33:03Z","title":"Pre-trained Model Guided Fine-Tuning for Zero-Shot Adversarial\n Robustness","summary":" Large-scale pre-trained vision-language models like CLIP have demonstrated\nimpressive performance across various tasks, and exhibit remarkable zero-shot\ngeneralization capability, while they are also vulnerable to imperceptible\nadversarial examples. Existing works typically employ adversarial training\n(fine-tuning) as a defense method against adversarial examples. 
However, direct\napplication to the CLIP model may result in overfitting, compromising the\nmodel's capacity for generalization. In this paper, we propose Pre-trained\nModel Guided Adversarial Fine-Tuning (PMG-AFT) method, which leverages\nsupervision from the original pre-trained model by carefully designing an\nauxiliary branch, to enhance the model's zero-shot adversarial robustness.\nSpecifically, PMG-AFT minimizes the distance between the features of\nadversarial examples in the target model and those in the pre-trained model,\naiming to preserve the generalization features already captured by the\npre-trained model. Extensive Experiments on 15 zero-shot datasets demonstrate\nthat PMG-AFT significantly outperforms the state-of-the-art method, improving\nthe top-1 robust accuracy by an average of 4.99%. Furthermore, our approach\nconsistently improves clean accuracy by an average of 8.72%. Our code is\navailable at\nhttps://github.com/serendipity1122/Pre-trained-Model-Guided-Fine-Tuning-for-Zero-Shot-Adversarial-Robustness.\n","authors":["Sibo Wang","Jie Zhang","Zheng Yuan","Shiguang Shan"],"pdf_url":"https://arxiv.org/pdf/2401.04350v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2312.06275v3","updated":"2024-04-10T11:49:05Z","published":"2023-12-11T10:26:21Z","title":"DG-TTA: Out-of-domain medical image segmentation through Domain\n Generalization and Test-Time Adaptation","summary":" Applying pre-trained medical segmentation models on out-of-domain images\noften yields predictions of insufficient quality. Several strategies have been\nproposed to maintain model performance, such as finetuning or unsupervised- and\nsource-free domain adaptation. These strategies set restrictive requirements\nfor data availability. In this study, we propose to combine domain\ngeneralization and test-time adaptation to create a highly effective approach\nfor reusing pre-trained models in unseen target domains. Domain-generalized\npre-training on source data is used to obtain the best initial performance in\nthe target domain. We introduce the MIND descriptor previously used in image\nregistration tasks as a further technique to achieve generalization and present\nsuperior performance for small-scale datasets compared to existing approaches.\nAt test-time, high-quality segmentation for every single unseen scan is ensured\nby optimizing the model weights for consistency given different image\naugmentations. That way, our method enables separate use of source and target\ndata and thus removes current data availability barriers. Moreover, the\npresented method is highly modular as it does not require specific model\narchitectures or prior knowledge of involved domains and labels. We demonstrate\nthis by integrating it into the nnUNet, which is currently the most popular and\naccurate framework for medical image segmentation. We employ multiple datasets\ncovering abdominal, cardiac, and lumbar spine scans and compose several\nout-of-domain scenarios in this study. We demonstrate that our method, combined\nwith pre-trained whole-body CT models, can effectively segment MR images with\nhigh accuracy in all of the aforementioned scenarios. Open-source code can be\nfound here: https://github.com/multimodallearning/DG-TTA\n","authors":["Christian Weihsbach","Christian N. Kruse","Alexander Bigalke","Mattias P. 
Heinrich"],"pdf_url":"https://arxiv.org/pdf/2312.06275v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06941v1","updated":"2024-04-10T11:47:51Z","published":"2024-04-10T11:47:51Z","title":"Accelerating Cardiac MRI Reconstruction with CMRatt: An Attention-Driven\n Approach","summary":" Cine cardiac magnetic resonance (CMR) imaging is recognised as the benchmark\nmodality for the comprehensive assessment of cardiac function. Nevertheless,\nthe acquisition process of cine CMR is considered as an impediment due to its\nprolonged scanning time. One commonly used strategy to expedite the acquisition\nprocess is through k-space undersampling, though it comes with a drawback of\nintroducing aliasing effects in the reconstructed image. Lately, deep\nlearning-based methods have shown remarkable results over traditional\napproaches in rapidly achieving precise CMR reconstructed images. This study\naims to explore the untapped potential of attention mechanisms incorporated\nwith a deep learning model within the context of the CMR reconstruction\nproblem. We are motivated by the fact that attention has proven beneficial in\ndownstream tasks such as image classification and segmentation, but has not\nbeen systematically analysed in the context of CMR reconstruction. Our primary\ngoal is to identify the strengths and potential limitations of attention\nalgorithms when integrated with a convolutional backbone model such as a U-Net.\nTo achieve this, we benchmark different state-of-the-art spatial and channel\nattention mechanisms on the CMRxRecon dataset and quantitatively evaluate the\nquality of reconstruction using objective metrics. Furthermore, inspired by the\nbest performing attention mechanism, we propose a new, simple yet effective,\nattention pipeline specifically optimised for the task of cardiac image\nreconstruction that outperforms other state-of-the-art attention methods. The\nlayer and model code will be made publicly available.\n","authors":["Anam Hashmi","Julia Dietlmeier","Kathleen M. Curran","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2404.06941v1.pdf","comment":"This paper has been submitted for the 32nd European Signal Processing\n Conference EUSIPCO 2024 in Lyon"},{"id":"http://arxiv.org/abs/2306.10798v3","updated":"2024-04-10T11:42:22Z","published":"2023-06-19T09:38:21Z","title":"ExpPoint-MAE: Better interpretability and performance for\n self-supervised point cloud transformers","summary":" In this paper we delve into the properties of transformers, attained through\nself-supervision, in the point cloud domain. Specifically, we evaluate the\neffectiveness of Masked Autoencoding as a pretraining scheme, and explore\nMomentum Contrast as an alternative. In our study we investigate the impact of\ndata quantity on the learned features, and uncover similarities in the\ntransformer's behavior across domains. Through comprehensive visualiations, we\nobserve that the transformer learns to attend to semantically meaningful\nregions, indicating that pretraining leads to a better understanding of the\nunderlying geometry. Moreover, we examine the finetuning process and its effect\non the learned representations. 
Based on that, we devise an unfreezing strategy\nwhich consistently outperforms our baseline without introducing any other\nmodifications to the model or the training pipeline, and achieve\nstate-of-the-art results in the classification task among transformer models.\n","authors":["Ioannis Romanelis","Vlassis Fotis","Konstantinos Moustakas","Adrian Munteanu"],"pdf_url":"https://arxiv.org/pdf/2306.10798v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06936v1","updated":"2024-04-10T11:40:02Z","published":"2024-04-10T11:40:02Z","title":"Efficient and Generic Point Model for Lossless Point Cloud Attribute\n Compression","summary":" The past several years have witnessed the emergence of learned point cloud\ncompression (PCC) techniques. However, current learning-based lossless point\ncloud attribute compression (PCAC) methods either suffer from high\ncomputational complexity or deteriorated compression performance. Moreover, the\nsignificant variations in point cloud scale and sparsity encountered in\nreal-world applications make developing an all-in-one neural model a\nchallenging task. In this paper, we propose PoLoPCAC, an efficient and generic\nlossless PCAC method that achieves high compression efficiency and strong\ngeneralizability simultaneously. We formulate lossless PCAC as the task of\ninferring explicit distributions of attributes from group-wise autoregressive\npriors. A progressive random grouping strategy is first devised to efficiently\nresolve the point cloud into groups, and then the attributes of each group are\nmodeled sequentially from accumulated antecedents. A locality-aware attention\nmechanism is utilized to exploit prior knowledge from context windows in\nparallel. Since our method directly operates on points, it can naturally avoids\ndistortion caused by voxelization, and can be executed on point clouds with\narbitrary scale and density. Experiments show that our method can be instantly\ndeployed once trained on a Synthetic 2k-ShapeNet dataset while enjoying\ncontinuous bit-rate reduction over the latest G-PCCv23 on various datasets\n(ShapeNet, ScanNet, MVUB, 8iVFB). Meanwhile, our method reports shorter coding\ntime than G-PCCv23 on the majority of sequences with a lightweight model size\n(2.6MB), which is highly attractive for practical applications. Dataset, code\nand trained model are available at\nhttps://github.com/I2-Multimedia-Lab/PoLoPCAC.\n","authors":["Kang You","Pan Gao","Zhan Ma"],"pdf_url":"https://arxiv.org/pdf/2404.06936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06918v1","updated":"2024-04-10T11:10:50Z","published":"2024-04-10T11:10:50Z","title":"HRVDA: High-Resolution Visual Document Assistant","summary":" Leveraging vast training data, multimodal large language models (MLLMs) have\ndemonstrated formidable general visual comprehension capabilities and achieved\nremarkable performance across various tasks. However, their performance in\nvisual document understanding still leaves much room for improvement. This\ndiscrepancy is primarily attributed to the fact that visual document\nunderstanding is a fine-grained prediction task. In natural scenes, MLLMs\ntypically use low-resolution images, leading to a substantial loss of visual\ninformation. Furthermore, general-purpose MLLMs do not excel in handling\ndocument-oriented instructions. In this paper, we propose a High-Resolution\nVisual Document Assistant (HRVDA), which bridges the gap between MLLMs and\nvisual document understanding. 
This model employs a content filtering mechanism\nand an instruction filtering module to separately filter out the\ncontent-agnostic visual tokens and instruction-agnostic visual tokens, thereby\nachieving efficient model training and inference for high-resolution images. In\naddition, we construct a document-oriented visual instruction tuning dataset\nand apply a multi-stage training strategy to enhance the model's document\nmodeling capabilities. Extensive experiments demonstrate that our model\nachieves state-of-the-art performance across multiple document understanding\ndatasets, while maintaining training efficiency and inference speed comparable\nto low-resolution models.\n","authors":["Chaohu Liu","Kun Yin","Haoyu Cao","Xinghua Jiang","Xin Li","Yinsong Liu","Deqiang Jiang","Xing Sun","Linli Xu"],"pdf_url":"https://arxiv.org/pdf/2404.06918v1.pdf","comment":"Accepted to CVPR 2024 main conference"},{"id":"http://arxiv.org/abs/2404.06913v1","updated":"2024-04-10T11:06:29Z","published":"2024-04-10T11:06:29Z","title":"Sparse Global Matching for Video Frame Interpolation with Large Motion","summary":" Large motion poses a critical challenge in Video Frame Interpolation (VFI)\ntask. Existing methods are often constrained by limited receptive fields,\nresulting in sub-optimal performance when handling scenarios with large motion.\nIn this paper, we introduce a new pipeline for VFI, which can effectively\nintegrate global-level information to alleviate issues associated with large\nmotion. Specifically, we first estimate a pair of initial intermediate flows\nusing a high-resolution feature map for extracting local details. Then, we\nincorporate a sparse global matching branch to compensate for flow estimation,\nwhich consists of identifying flaws in initial flows and generating sparse flow\ncompensation with a global receptive field. Finally, we adaptively merge the\ninitial flow estimation with global flow compensation, yielding a more accurate\nintermediate flow. To evaluate the effectiveness of our method in handling\nlarge motion, we carefully curate a more challenging subset from commonly used\nbenchmarks. Our method demonstrates the state-of-the-art performance on these\nVFI subsets with large motion.\n","authors":["Chunxu Liu","Guozhen Zhang","Rui Zhao","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06913v1.pdf","comment":"Accepted by CVPR 2024. Project page: https://sgm-vfi.github.io/"},{"id":"http://arxiv.org/abs/2306.00977v4","updated":"2024-04-10T10:56:00Z","published":"2023-06-01T17:59:10Z","title":"AGILE3D: Attention Guided Interactive Multi-object 3D Segmentation","summary":" During interactive segmentation, a model and a user work together to\ndelineate objects of interest in a 3D point cloud. In an iterative process, the\nmodel assigns each data point to an object (or the background), while the user\ncorrects errors in the resulting segmentation and feeds them back into the\nmodel. The current best practice formulates the problem as binary\nclassification and segments objects one at a time. The model expects the user\nto provide positive clicks to indicate regions wrongly assigned to the\nbackground and negative clicks on regions wrongly assigned to the object.\nSequentially visiting objects is wasteful since it disregards synergies between\nobjects: a positive click for a given object can, by definition, serve as a\nnegative click for nearby objects. Moreover, a direct competition between\nadjacent objects can speed up the identification of their common boundary. 
We\nintroduce AGILE3D, an efficient, attention-based model that (1) supports\nsimultaneous segmentation of multiple 3D objects, (2) yields more accurate\nsegmentation masks with fewer user clicks, and (3) offers faster inference. Our\ncore idea is to encode user clicks as spatial-temporal queries and enable\nexplicit interactions between click queries as well as between them and the 3D\nscene through a click attention module. Every time new clicks are added, we\nonly need to run a lightweight decoder that produces updated segmentation\nmasks. In experiments with four different 3D point cloud datasets, AGILE3D sets\na new state-of-the-art. Moreover, we also verify its practicality in real-world\nsetups with real user studies.\n","authors":["Yuanwen Yue","Sabarinath Mahadevan","Jonas Schult","Francis Engelmann","Bastian Leibe","Konrad Schindler","Theodora Kontogianni"],"pdf_url":"https://arxiv.org/pdf/2306.00977v4.pdf","comment":"ICLR 2024 camera-ready. Project page: https://ywyue.github.io/AGILE3D"},{"id":"http://arxiv.org/abs/2404.06903v1","updated":"2024-04-10T10:46:59Z","published":"2024-04-10T10:46:59Z","title":"DreamScene360: Unconstrained Text-to-3D Scene Generation with Panoramic\n Gaussian Splatting","summary":" The increasing demand for virtual reality applications has highlighted the\nsignificance of crafting immersive 3D assets. We present a text-to-3D\n360$^{\\circ}$ scene generation pipeline that facilitates the creation of\ncomprehensive 360$^{\\circ}$ scenes for in-the-wild environments in a matter of\nminutes. Our approach utilizes the generative power of a 2D diffusion model and\nprompt self-refinement to create a high-quality and globally coherent panoramic\nimage. This image acts as a preliminary \"flat\" (2D) scene representation.\nSubsequently, it is lifted into 3D Gaussians, employing splatting techniques to\nenable real-time exploration. To produce consistent 3D geometry, our pipeline\nconstructs a spatially coherent structure by aligning the 2D monocular depth\ninto a globally optimized point cloud. This point cloud serves as the initial\nstate for the centroids of 3D Gaussians. In order to address invisible issues\ninherent in single-view inputs, we impose semantic and geometric constraints on\nboth synthesized and input camera views as regularizations. These guide the\noptimization of Gaussians, aiding in the reconstruction of unseen regions. In\nsummary, our method offers a globally consistent 3D scene within a\n360$^{\\circ}$ perspective, providing an enhanced immersive experience over\nexisting techniques. 
Project website at: http://dreamscene360.github.io/\n","authors":["Shijie Zhou","Zhiwen Fan","Dejia Xu","Haoran Chang","Pradyumna Chari","Tejas Bharadwaj","Suya You","Zhangyang Wang","Achuta Kadambi"],"pdf_url":"https://arxiv.org/pdf/2404.06903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12796v2","updated":"2024-04-10T10:37:22Z","published":"2023-11-21T18:59:58Z","title":"Physics-guided Shape-from-Template: Monocular Video Perception through\n Neural Surrogate Models","summary":" 3D reconstruction of dynamic scenes is a long-standing problem in computer\ngraphics and increasingly difficult the less information is available.\nShape-from-Template (SfT) methods aim to reconstruct a template-based geometry\nfrom RGB images or video sequences, often leveraging just a single monocular\ncamera without depth information, such as regular smartphone recordings.\nUnfortunately, existing reconstruction methods are either unphysical and noisy\nor slow in optimization. To solve this problem, we propose a novel SfT\nreconstruction algorithm for cloth using a pre-trained neural surrogate model\nthat is fast to evaluate, stable, and produces smooth reconstructions due to a\nregularizing physics simulation. Differentiable rendering of the simulated mesh\nenables pixel-wise comparisons between the reconstruction and a target video\nsequence that can be used for a gradient-based optimization procedure to\nextract not only shape information but also physical parameters such as\nstretching, shearing, or bending stiffness of the cloth. This allows to retain\na precise, stable, and smooth reconstructed geometry while reducing the runtime\nby a factor of 400-500 compared to $\\phi$-SfT, a state-of-the-art physics-based\nSfT approach.\n","authors":["David Stotko","Nils Wandel","Reinhard Klein"],"pdf_url":"https://arxiv.org/pdf/2311.12796v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06894v1","updated":"2024-04-10T10:36:15Z","published":"2024-04-10T10:36:15Z","title":"O-TALC: Steps Towards Combating Oversegmentation within Online Action\n Segmentation","summary":" Online temporal action segmentation shows a strong potential to facilitate\nmany HRI tasks where extended human action sequences must be tracked and\nunderstood in real time. Traditional action segmentation approaches, however,\noperate in an offline two stage approach, relying on computationally expensive\nvideo wide features for segmentation, rendering them unsuitable for online HRI\napplications. In order to facilitate online action segmentation on a stream of\nincoming video data, we introduce two methods for improved training and\ninference of backbone action recognition models, allowing them to be deployed\ndirectly for online frame level classification. Firstly, we introduce surround\ndense sampling whilst training to facilitate training vs. inference clip\nmatching and improve segment boundary predictions. Secondly, we introduce an\nOnline Temporally Aware Label Cleaning (O-TALC) strategy to explicitly reduce\noversegmentation during online inference. As our methods are backbone\ninvariant, they can be deployed with computationally efficient spatio-temporal\naction recognition models capable of operating in real time with a small\nsegmentation latency. We show our method outperforms similar online action\nsegmentation work as well as matches the performance of many offline models\nwith access to full temporal resolution when operating on challenging\nfine-grained datasets.\n","authors":["Matthew Kent Myers","Nick Wright","A. 
Stephen McGough","Nicholas Martin"],"pdf_url":"https://arxiv.org/pdf/2404.06894v1.pdf","comment":"5 pages, 3 figures. Accepted as a short (unindexed) paper at the\n TAHRI conference"},{"id":"http://arxiv.org/abs/2404.06892v1","updated":"2024-04-10T10:34:34Z","published":"2024-04-10T10:34:34Z","title":"SparseAD: Sparse Query-Centric Paradigm for Efficient End-to-End\n Autonomous Driving","summary":" End-to-End paradigms use a unified framework to implement multi-tasks in an\nautonomous driving system. Despite simplicity and clarity, the performance of\nend-to-end autonomous driving methods on sub-tasks is still far behind the\nsingle-task methods. Meanwhile, the widely used dense BEV features in previous\nend-to-end methods make it costly to extend to more modalities or tasks. In\nthis paper, we propose a Sparse query-centric paradigm for end-to-end\nAutonomous Driving (SparseAD), where the sparse queries completely represent\nthe whole driving scenario across space, time and tasks without any dense BEV\nrepresentation. Concretely, we design a unified sparse architecture for\nperception tasks including detection, tracking, and online mapping. Moreover,\nwe revisit motion prediction and planning, and devise a more justifiable motion\nplanner framework. On the challenging nuScenes dataset, SparseAD achieves SOTA\nfull-task performance among end-to-end methods and significantly narrows the\nperformance gap between end-to-end paradigms and single-task methods. Codes\nwill be released soon.\n","authors":["Diankun Zhang","Guoan Wang","Runwen Zhu","Jianbo Zhao","Xiwu Chen","Siyu Zhang","Jiahao Gong","Qibin Zhou","Wenyuan Zhang","Ningzi Wang","Feiyang Tan","Hangning Zhou","Ziyao Xu","Haotian Yao","Chi Zhang","Xiaojun Liu","Xiaoguang Di","Bin Li"],"pdf_url":"https://arxiv.org/pdf/2404.06892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06883v1","updated":"2024-04-10T10:13:37Z","published":"2024-04-10T10:13:37Z","title":"Research on Detection of Floating Objects in River and Lake Based on AI\n Intelligent Image Recognition","summary":" With the rapid advancement of artificial intelligence technology, AI-enabled\nimage recognition has emerged as a potent tool for addressing challenges in\ntraditional environmental monitoring. This study focuses on the detection of\nfloating objects in river and lake environments, exploring an innovative\napproach based on deep learning. By intricately analyzing the technical\npathways for detecting static and dynamic features and considering the\ncharacteristics of river and lake debris, a comprehensive image acquisition and\nprocessing workflow has been developed. The study highlights the application\nand performance comparison of three mainstream deep learning models -SSD,\nFaster-RCNN, and YOLOv5- in debris identification. Additionally, a detection\nsystem for floating objects has been designed and implemented, encompassing\nboth hardware platform construction and software framework development. 
Through\nrigorous experimental validation, the proposed system has demonstrated its\nability to significantly enhance the accuracy and efficiency of debris\ndetection, thus offering a new technological avenue for water quality\nmonitoring in rivers and lakes\n","authors":["Jingyu Zhang","Ao Xiang","Yu Cheng","Qin Yang","Liyang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07887v2","updated":"2024-04-10T10:06:46Z","published":"2023-10-11T20:48:20Z","title":"Unsupervised Denoising for Signal-Dependent and Row-Correlated Imaging\n Noise","summary":" Accurate analysis of microscopy images is hindered by the presence of noise.\nThis noise is usually signal-dependent and often additionally correlated along\nrows or columns of pixels. Current self- and unsupervised denoisers can address\nsignal-dependent noise, but none can reliably remove noise that is also row- or\ncolumn-correlated. Here, we present the first fully unsupervised deep\nlearning-based denoiser capable of handling imaging noise that is\nrow-correlated as well as signal-dependent. Our approach uses a Variational\nAutoencoder (VAE) with a specially designed autoregressive decoder. This\ndecoder is capable of modeling row-correlated and signal-dependent noise but is\nincapable of independently modeling underlying clean signal. The VAE therefore\nproduces latent variables containing only clean signal information, and these\nare mapped back into image space using a proposed second decoder network. Our\nmethod does not require a pre-trained noise model and can be trained from\nscratch using unpaired noisy data. We show that our approach achieves\ncompetitive results when applied to a range of different sensor types and\nimaging modalities.\n","authors":["Benjamin Salmon","Alexander Krull"],"pdf_url":"https://arxiv.org/pdf/2310.07887v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03190v5","updated":"2024-04-10T09:51:11Z","published":"2024-03-05T18:29:17Z","title":"Triple-CFN: Restructuring Conceptual Spaces for Enhancing Abstract\n Reasoning process","summary":" Abstract reasoning problems pose significant challenges to artificial\nintelligence algorithms, demanding cognitive capabilities beyond those required\nfor perception tasks. This study introduces the Triple-CFN approach to tackle\nthe Bongard-Logo problem, achieving notable reasoning accuracy by implicitly\nreorganizing the concept space of conflicting instances. Additionally, the\nTriple-CFN paradigm proves effective for the RPM problem with necessary\nmodifications, yielding competitive results. To further enhance performance on\nthe RPM issue, we develop the Meta Triple-CFN network, which explicitly\nstructures the problem space while maintaining interpretability on progressive\npatterns. The success of Meta Triple-CFN is attributed to its paradigm of\nmodeling the conceptual space, equivalent to normalizing reasoning information.\nBased on this ideology, we introduce the Re-space layer, enhancing the\nperformance of both Meta Triple-CFN and Triple-CFN. 
This paper aims to\ncontribute to advancements in machine intelligence by exploring innovative\nnetwork designs for addressing abstract reasoning problems, paving the way for\nfurther breakthroughs in this domain.\n","authors":["Ruizhuo Song","Beiming Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.03190v5.pdf","comment":"14 pages, 14 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.06865v1","updated":"2024-04-10T09:45:02Z","published":"2024-04-10T09:45:02Z","title":"Fine color guidance in diffusion models and its application to image\n compression at extremely low bitrates","summary":" This study addresses the challenge of, without training or fine-tuning,\ncontrolling the global color aspect of images generated with a diffusion model.\nWe rewrite the guidance equations to ensure that the outputs are closer to a\nknown color map, and this without hindering the quality of the generation. Our\nmethod leads to new guidance equations. We show in the color guidance context\nthat, the scaling of the guidance should not decrease but remains high\nthroughout the diffusion process. In a second contribution, our guidance is\napplied in a compression framework, we combine both semantic and general color\ninformation on the image to decode the images at low cost. We show that our\nmethod is effective at improving fidelity and realism of compressed images at\nextremely low bit rates, when compared to other classical or more semantic\noriented approaches.\n","authors":["Tom Bordin","Thomas Maugey"],"pdf_url":"https://arxiv.org/pdf/2404.06865v1.pdf","comment":"Submitted to IEEE Transactions on Image Processing (TIP)"},{"id":"http://arxiv.org/abs/2404.06863v1","updated":"2024-04-10T09:40:56Z","published":"2024-04-10T09:40:56Z","title":"RESSCAL3D: Resolution Scalable 3D Semantic Segmentation of Point Clouds","summary":" While deep learning-based methods have demonstrated outstanding results in\nnumerous domains, some important functionalities are missing. Resolution\nscalability is one of them. In this work, we introduce a novel architecture,\ndubbed RESSCAL3D, providing resolution-scalable 3D semantic segmentation of\npoint clouds. In contrast to existing works, the proposed method does not\nrequire the whole point cloud to be available to start inference. Once a\nlow-resolution version of the input point cloud is available, first semantic\npredictions can be generated in an extremely fast manner. This enables early\ndecision-making in subsequent processing steps. As additional points become\navailable, these are processed in parallel. To improve performance, features\nfrom previously computed scales are employed as prior knowledge at the current\nscale. Our experiments show that RESSCAL3D is 31-62% faster than the\nnon-scalable baseline while keeping a limited impact on performance. 
To the\nbest of our knowledge, the proposed method is the first to propose a\nresolution-scalable approach for 3D semantic segmentation of point clouds based\non deep learning.\n","authors":["Remco Royen","Adrian Munteanu"],"pdf_url":"https://arxiv.org/pdf/2404.06863v1.pdf","comment":"Published at 2023 IEEE International Conference on Image Processing\n (ICIP)"},{"id":"http://arxiv.org/abs/2404.06860v1","updated":"2024-04-10T09:35:50Z","published":"2024-04-10T09:35:50Z","title":"Monocular 3D lane detection for Autonomous Driving: Recent Achievements,\n Challenges, and Outlooks","summary":" 3D lane detection plays a crucial role in autonomous driving by extracting\nstructural and traffic information from the road in 3D space to assist the\nself-driving car in rational, safe, and comfortable path planning and motion\ncontrol. Due to the consideration of sensor costs and the advantages of visual\ndata in color information, in practical applications, 3D lane detection based\non monocular vision is one of the important research directions in the field of\nautonomous driving, which has attracted more and more attention in both\nindustry and academia. Unfortunately, recent progress in visual perception\nseems insufficient to develop completely reliable 3D lane detection algorithms,\nwhich also hinders the development of vision-based fully autonomous\nself-driving cars, i.e., achieving level 5 autonomous driving, driving like\nhuman-controlled cars. This is one of the conclusions drawn from this review\npaper: there is still a lot of room for improvement and significant\nimprovements are still needed in the 3D lane detection algorithm for autonomous\ndriving cars using visual sensors. Motivated by this, this review defines,\nanalyzes, and reviews the current achievements in the field of 3D lane\ndetection research, and the vast majority of the current progress relies\nheavily on computationally complex deep learning models. In addition, this\nreview covers the 3D lane detection pipeline, investigates the performance of\nstate-of-the-art algorithms, analyzes the time complexity of cutting-edge\nmodeling choices, and highlights the main achievements and limitations of\ncurrent research efforts. The survey also includes a comprehensive discussion\nof available 3D lane detection datasets and the challenges that researchers\nhave faced but have not yet resolved. Finally, our work outlines future\nresearch directions and welcomes researchers and practitioners to enter this\nexciting field.\n","authors":["Fulong Ma","Weiqing Qi","Guoyang Zhao","Linwei Zheng","Sheng Wang","Ming Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06859v1","updated":"2024-04-10T09:35:36Z","published":"2024-04-10T09:35:36Z","title":"Multi-Label Continual Learning for the Medical Domain: A Novel Benchmark","summary":" Multi-label image classification in dynamic environments is a problem that\nposes significant challenges. Previous studies have primarily focused on\nscenarios such as Domain Incremental Learning and Class Incremental Learning,\nwhich do not fully capture the complexity of real-world applications. In this\npaper, we study the problem of classification of medical imaging in the\nscenario termed New Instances \\& New Classes, which combines the challenges of\nboth new class arrivals and domain shifts in a single framework. 
Unlike\ntraditional scenarios, it reflects the realistic nature of CL in domains such\nas medical imaging, where updates may introduce both new classes and changes in\ndomain characteristics. To address the unique challenges posed by this complex\nscenario, we introduce a novel approach called Pseudo-Label Replay. This method\naims to mitigate forgetting while adapting to new classes and domain shifts by\ncombining the advantages of the Replay and Pseudo-Label methods and solving\ntheir limitations in the proposed scenario. We evaluate our proposed\napproach on a challenging benchmark consisting of two datasets, seven tasks,\nand nineteen classes, modeling a realistic Continual Learning scenario. Our\nexperimental findings demonstrate the effectiveness of Pseudo-Label Replay in\naddressing the challenges posed by the complex scenario proposed. Our method\nsurpasses existing approaches, exhibiting superior performance while showing\nminimal forgetting.\n","authors":["Marina Ceccon","Davide Dalle Pezze","Alessandro Fabris","Gian Antonio Susto"],"pdf_url":"https://arxiv.org/pdf/2404.06859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10035v3","updated":"2024-04-10T09:34:03Z","published":"2023-02-20T15:34:03Z","title":"Large-scale Multi-Modal Pre-trained Models: A Comprehensive Survey","summary":" With the urgent demand for generalized deep models, many pre-trained big\nmodels are proposed, such as BERT, ViT, GPT, etc. Inspired by the success of\nthese models in single domains (like computer vision and natural language\nprocessing), the multi-modal pre-trained big models have also drawn more and\nmore attention in recent years. In this work, we give a comprehensive survey of\nthese models and hope this paper could provide new insights and helps fresh\nresearchers to track the most cutting-edge works. Specifically, we firstly\nintroduce the background of multi-modal pre-training by reviewing the\nconventional deep learning, pre-training works in natural language process,\ncomputer vision, and speech. Then, we introduce the task definition, key\nchallenges, and advantages of multi-modal pre-training models (MM-PTMs), and\ndiscuss the MM-PTMs with a focus on data, objectives, network architectures,\nand knowledge enhanced pre-training. After that, we introduce the downstream\ntasks used for the validation of large-scale MM-PTMs, including generative,\nclassification, and regression tasks. We also give visualization and analysis\nof the model parameters and results on representative downstream tasks.\nFinally, we point out possible research directions for this topic that may\nbenefit future works. In addition, we maintain a continuously updated paper\nlist for large-scale pre-trained multi-modal big models:\nhttps://github.com/wangxiao5791509/MultiModal_BigModels_Survey. This paper has\nbeen published by the journal Machine Intelligence Research (MIR),\nhttps://link.springer.com/article/10.1007/s11633-022-1410-8, DOI:\n10.1007/s11633-022-1410-8, vol. 20, no. 4, pp. 
447-482, 2023.\n","authors":["Xiao Wang","Guangyao Chen","Guangwu Qian","Pengcheng Gao","Xiao-Yong Wei","Yaowei Wang","Yonghong Tian","Wen Gao"],"pdf_url":"https://arxiv.org/pdf/2302.10035v3.pdf","comment":"Accepted by Machine Intelligence Research (MIR)"},{"id":"http://arxiv.org/abs/2404.06851v1","updated":"2024-04-10T09:24:54Z","published":"2024-04-10T09:24:54Z","title":"UDiFF: Generating Conditional Unsigned Distance Fields with Optimal\n Wavelet Diffusion","summary":" Diffusion models have shown remarkable results for image generation, editing\nand inpainting. Recent works explore diffusion models for 3D shape generation\nwith neural implicit functions, i.e., signed distance function and occupancy\nfunction. However, they are limited to shapes with closed surfaces, which\nprevents them from generating diverse 3D real-world contents containing open\nsurfaces. In this work, we present UDiFF, a 3D diffusion model for unsigned\ndistance fields (UDFs) which is capable to generate textured 3D shapes with\nopen surfaces from text conditions or unconditionally. Our key idea is to\ngenerate UDFs in spatial-frequency domain with an optimal wavelet\ntransformation, which produces a compact representation space for UDF\ngeneration. Specifically, instead of selecting an appropriate wavelet\ntransformation which requires expensive manual efforts and still leads to large\ninformation loss, we propose a data-driven approach to learn the optimal\nwavelet transformation for UDFs. We evaluate UDiFF to show our advantages by\nnumerical and visual comparisons with the latest methods on widely used\nbenchmarks. Page: https://weiqi-zhang.github.io/UDiFF.\n","authors":["Junsheng Zhou","Weiqi Zhang","Baorui Ma","Kanle Shi","Yu-Shen Liu","Zhizhong Han"],"pdf_url":"https://arxiv.org/pdf/2404.06851v1.pdf","comment":"To appear at CVPR2024. Project page:\n https://weiqi-zhang.github.io/UDiFF"},{"id":"http://arxiv.org/abs/2404.06842v1","updated":"2024-04-10T09:14:28Z","published":"2024-04-10T09:14:28Z","title":"MoCha-Stereo: Motif Channel Attention Network for Stereo Matching","summary":" Learning-based stereo matching techniques have made significant progress.\nHowever, existing methods inevitably lose geometrical structure information\nduring the feature channel generation process, resulting in edge detail\nmismatches. In this paper, the Motif Channel Attention Stereo Matching Network\n(MoCha-Stereo) is designed to address this problem. We provide the Motif\nChannel Correlation Volume (MCCV) to determine more accurate edge matching\ncosts. MCCV is achieved by projecting motif channels, which capture common\ngeometric structures in feature channels, onto feature maps and cost volumes.\nIn addition, edge variations in potential feature channels of the\nreconstruction error map also affect details matching, we propose the\nReconstruction Error Motif Penalty (REMP) module to further refine the\nfull-resolution disparity estimation. REMP integrates the frequency information\nof typical channel features from the reconstruction error. MoCha-Stereo ranks\n1st on the KITTI-2015 and KITTI-2012 Reflective leaderboards. Our structure\nalso shows excellent performance in Multi-View Stereo. 
Code is available at\nhttps://github.com/ZYangChen/MoCha-Stereo.\n","authors":["Ziyang Chen","Wei Long","He Yao","Yongjun Zhang","Bingshu Wang","Yongbin Qin","Jia Wu"],"pdf_url":"https://arxiv.org/pdf/2404.06842v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2402.02263v2","updated":"2024-04-10T09:00:44Z","published":"2024-02-03T21:12:36Z","title":"MixedNUTS: Training-Free Accuracy-Robustness Balance via Nonlinearly\n Mixed Classifiers","summary":" Adversarial robustness often comes at the cost of degraded accuracy, impeding\nthe real-life application of robust classification models. Training-based\nsolutions for better trade-offs are limited by incompatibilities with\nalready-trained high-performance large models, necessitating the exploration of\ntraining-free ensemble approaches. Observing that robust models are more\nconfident in correct predictions than in incorrect ones on clean and\nadversarial data alike, we speculate amplifying this \"benign confidence\nproperty\" can reconcile accuracy and robustness in an ensemble setting. To\nachieve so, we propose \"MixedNUTS\", a training-free method where the output\nlogits of a robust classifier and a standard non-robust classifier are\nprocessed by nonlinear transformations with only three parameters, which are\noptimized through an efficient algorithm. MixedNUTS then converts the\ntransformed logits into probabilities and mixes them as the overall output. On\nCIFAR-10, CIFAR-100, and ImageNet datasets, experimental results with custom\nstrong adaptive attacks demonstrate MixedNUTS's vastly improved accuracy and\nnear-SOTA robustness -- it boosts CIFAR-100 clean accuracy by 7.86 points,\nsacrificing merely 0.87 points in robust accuracy.\n","authors":["Yatong Bai","Mo Zhou","Vishal M. Patel","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2402.02263v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06836v1","updated":"2024-04-10T08:54:43Z","published":"2024-04-10T08:54:43Z","title":"O2V-Mapping: Online Open-Vocabulary Mapping with Neural Implicit\n Representation","summary":" Online construction of open-ended language scenes is crucial for robotic\napplications, where open-vocabulary interactive scene understanding is\nrequired. Recently, neural implicit representation has provided a promising\ndirection for online interactive mapping. However, implementing open-vocabulary\nscene understanding capability into online neural implicit mapping still faces\nthree challenges: lack of local scene updating ability, blurry spatial\nhierarchical semantic segmentation and difficulty in maintaining multi-view\nconsistency. To this end, we propose O2V-mapping, which utilizes voxel-based\nlanguage and geometric features to create an open-vocabulary field, thus\nallowing for local updates during online training process. Additionally, we\nleverage a foundational model for image segmentation to extract language\nfeatures on object-level entities, achieving clear segmentation boundaries and\nhierarchical semantic features. For the purpose of preserving consistency in 3D\nobject properties across different viewpoints, we propose a spatial adaptive\nvoxel adjustment mechanism and a multi-view weight selection method. 
Extensive\nexperiments on open-vocabulary object localization and semantic segmentation\ndemonstrate that O2V-mapping achieves online construction of language scenes\nwhile enhancing accuracy, outperforming the previous SOTA method.\n","authors":["Muer Tie","Julong Wei","Zhengjun Wang","Ke Wu","Shansuai Yuan","Kaizhao Zhang","Jie Jia","Jieru Zhao","Zhongxue Gan","Wenchao Ding"],"pdf_url":"https://arxiv.org/pdf/2404.06836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06835v1","updated":"2024-04-10T08:54:00Z","published":"2024-04-10T08:54:00Z","title":"Tuning-Free Adaptive Style Incorporation for Structure-Consistent\n Text-Driven Style Transfer","summary":" In this work, we target the task of text-driven style transfer in the context\nof text-to-image (T2I) diffusion models. The main challenge is consistent\nstructure preservation while enabling effective style transfer effects. The\npast approaches in this field directly concatenate the content and style\nprompts for a prompt-level style injection, leading to unavoidable structure\ndistortions. In this work, we propose a novel solution to the text-driven style\ntransfer task, namely, Adaptive Style Incorporation~(ASI), to achieve\nfine-grained feature-level style incorporation. It consists of the Siamese\nCross-Attention~(SiCA) to decouple the single-track cross-attention to a\ndual-track structure to obtain separate content and style features, and the\nAdaptive Content-Style Blending (AdaBlending) module to couple the content and\nstyle information from a structure-consistent manner. Experimentally, our\nmethod exhibits much better performance in both structure preservation and\nstylized effects.\n","authors":["Yanqi Ge","Jiaqi Liu","Qingnan Fan","Xi Jiang","Ye Huang","Shuai Qin","Hong Gu","Wen Li","Lixin Duan"],"pdf_url":"https://arxiv.org/pdf/2404.06835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06832v1","updated":"2024-04-10T08:48:09Z","published":"2024-04-10T08:48:09Z","title":"SplatPose & Detect: Pose-Agnostic 3D Anomaly Detection","summary":" Detecting anomalies in images has become a well-explored problem in both\nacademia and industry. State-of-the-art algorithms are able to detect defects\nin increasingly difficult settings and data modalities. However, most current\nmethods are not suited to address 3D objects captured from differing poses.\nWhile solutions using Neural Radiance Fields (NeRFs) have been proposed, they\nsuffer from excessive computation requirements, which hinder real-world\nusability. For this reason, we propose the novel 3D Gaussian splatting-based\nframework SplatPose which, given multi-view images of a 3D object, accurately\nestimates the pose of unseen views in a differentiable manner, and detects\nanomalies in them. We achieve state-of-the-art results in both training and\ninference speed, and detection performance, even when using less training data\nthan competing methods. 
We thoroughly evaluate our framework using the recently\nproposed Pose-agnostic Anomaly Detection benchmark and its multi-pose anomaly\ndetection (MAD) data set.\n","authors":["Mathis Kruse","Marco Rudolph","Dominik Woiwode","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2404.06832v1.pdf","comment":"Visual Anomaly and Novelty Detection 2.0 Workshop at CVPR 2024"},{"id":"http://arxiv.org/abs/2404.02668v2","updated":"2024-04-10T08:47:32Z","published":"2024-04-03T12:06:01Z","title":"RS-Mamba for Large Remote Sensing Image Dense Prediction","summary":" Context modeling is critical for remote sensing image dense prediction tasks.\nNowadays, the growing size of very-high-resolution (VHR) remote sensing images\nposes challenges in effectively modeling context. While transformer-based\nmodels possess global modeling capabilities, they encounter computational\nchallenges when applied to large VHR images due to their quadratic complexity.\nThe conventional practice of cropping large images into smaller patches results\nin a notable loss of contextual information. To address these issues, we\npropose the Remote Sensing Mamba (RSM) for dense prediction tasks in large VHR\nremote sensing images. RSM is specifically designed to capture the global\ncontext of remote sensing images with linear complexity, facilitating the\neffective processing of large VHR images. Considering that the land covers in\nremote sensing images are distributed in arbitrary spatial directions due to\ncharacteristics of remote sensing over-head imaging, the RSM incorporates an\nomnidirectional selective scan module to globally model the context of images\nin multiple directions, capturing large spatial features from various\ndirections. Extensive experiments on semantic segmentation and change detection\ntasks across various land covers demonstrate the effectiveness of the proposed\nRSM. We designed simple yet effective models based on RSM, achieving\nstate-of-the-art performance on dense prediction tasks in VHR remote sensing\nimages without fancy training strategies. Leveraging the linear complexity and\nglobal modeling capabilities, RSM achieves better efficiency and accuracy than\ntransformer-based models on large remote sensing images. Interestingly, we also\ndemonstrated that our model generally performs better with a larger image size\non dense prediction tasks. Our code is available at\nhttps://github.com/walking-shadow/Official_Remote_Sensing_Mamba.\n","authors":["Sijie Zhao","Hao Chen","Xueliang Zhang","Pengfeng Xiao","Lei Bai","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2404.02668v2.pdf","comment":"15 pages,8 figures"},{"id":"http://arxiv.org/abs/2312.03502v2","updated":"2024-04-10T08:29:23Z","published":"2023-12-06T13:59:22Z","title":"Improving the Generalization of Segmentation Foundation Model under\n Distribution Shift via Weakly Supervised Adaptation","summary":" The success of large language models has inspired the computer vision\ncommunity to explore image segmentation foundation model that is able to\nzero/few-shot generalize through prompt engineering. Segment-Anything(SAM),\namong others, is the state-of-the-art image segmentation foundation model\ndemonstrating strong zero/few-shot generalization. Despite the success, recent\nstudies reveal the weakness of SAM under strong distribution shift. In\nparticular, SAM performs awkwardly on corrupted natural images, camouflaged\nimages, medical images, etc. 
Motivated by the observations, we aim to develop a\nself-training based strategy to adapt SAM to target distribution. Given the\nunique challenges of large source dataset, high computation cost and incorrect\npseudo label, we propose a weakly supervised self-training architecture with\nanchor regularization and low-rank finetuning to improve the robustness and\ncomputation efficiency of adaptation. We validate the effectiveness on 5 types\nof downstream segmentation tasks including natural clean/corrupted images,\nmedical images, camouflaged images and robotic images. Our proposed method is\ntask-agnostic in nature and outperforms pre-trained SAM and state-of-the-art\ndomain adaptation methods on almost all downstream tasks with the same testing\nprompt inputs.\n","authors":["Haojie Zhang","Yongyi Su","Xun Xu","Kui Jia"],"pdf_url":"https://arxiv.org/pdf/2312.03502v2.pdf","comment":"20 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.10610v4","updated":"2024-04-10T08:16:18Z","published":"2023-08-21T10:20:46Z","title":"Ear-Keeper: Real-time Diagnosis of Ear Lesions Utilizing\n Ultralight-Ultrafast ConvNet and Large-scale Ear Endoscopic Dataset","summary":" Deep learning-based ear disease diagnosis technology has proven effective and\naffordable. However, due to the lack of ear endoscope datasets with diversity,\nthe practical potential of the deep learning model has not been thoroughly\nstudied. Moreover, existing research failed to achieve a good trade-off between\nmodel inference speed and parameter size, rendering models inapplicable in\nreal-world settings. To address these challenges, we constructed the first\nlarge-scale ear endoscopic dataset comprising eight types of ear diseases and\ndisease-free samples from two institutions. Inspired by ShuffleNetV2, we\nproposed Best-EarNet, an ultrafast and ultralight network enabling real-time\near disease diagnosis. Best-EarNet incorporates a novel Local-Global Spatial\nFeature Fusion Module and multi-scale supervision strategy, which facilitates\nthe model focusing on global-local information within feature maps at various\nlevels. Utilizing transfer learning, the accuracy of Best-EarNet with only\n0.77M parameters achieves 95.23% (internal 22,581 images) and 92.14% (external\n1,652 images), respectively. In particular, it achieves an average frame per\nsecond of 80 on the CPU. From the perspective of model practicality, the\nproposed Best-EarNet is superior to state-of-the-art backbone models in ear\nlesion detection tasks. Most importantly, Ear-keeper, an intelligent diagnosis\nsystem based Best-EarNet, was developed successfully and deployed on common\nelectronic devices (smartphone, tablet computer and personal computer). In the\nfuture, Ear-Keeper has the potential to assist the public and healthcare\nproviders in performing comprehensive scanning and diagnosis of the ear canal\nin real-time video, thereby promptly detecting ear lesions.\n","authors":["Yubiao Yue","Xinyu Zeng","Xiaoqiang Shi","Meiping Zhang","Fan Zhang","Yunxin Liang","Yan Liu","Zhenzhang Li","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2308.10610v4.pdf","comment":"18 pages,8 figures"},{"id":"http://arxiv.org/abs/2404.06814v1","updated":"2024-04-10T08:02:17Z","published":"2024-04-10T08:02:17Z","title":"Zero-shot Point Cloud Completion Via 2D Priors","summary":" 3D point cloud completion is designed to recover complete shapes from\npartially observed point clouds. 
Conventional completion methods typically\ndepend on extensive point cloud data for training %, with their effectiveness\noften constrained to object categories similar to those seen during training.\nIn contrast, we propose a zero-shot framework aimed at completing partially\nobserved point clouds across any unseen categories. Leveraging point rendering\nvia Gaussian Splatting, we develop techniques of Point Cloud Colorization and\nZero-shot Fractal Completion that utilize 2D priors from pre-trained diffusion\nmodels to infer missing regions. Experimental results on both synthetic and\nreal-world scanned point clouds demonstrate that our approach outperforms\nexisting methods in completing a variety of objects without any requirement for\nspecific training data.\n","authors":["Tianxin Huang","Zhiwen Yan","Yuyang Zhao","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2404.06814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05916v2","updated":"2024-04-10T07:58:44Z","published":"2024-03-09T13:56:25Z","title":"GPT as Psychologist? Preliminary Evaluations for GPT-4V on Visual\n Affective Computing","summary":" Multimodal large language models (MLLMs) are designed to process and\nintegrate information from multiple sources, such as text, speech, images, and\nvideos. Despite its success in language understanding, it is critical to\nevaluate the performance of downstream tasks for better human-centric\napplications. This paper assesses the application of MLLMs with 5 crucial\nabilities for affective computing, spanning from visual affective tasks and\nreasoning tasks. The results show that \\gpt has high accuracy in facial action\nunit recognition and micro-expression detection while its general facial\nexpression recognition performance is not accurate. We also highlight the\nchallenges of achieving fine-grained micro-expression recognition and the\npotential for further study and demonstrate the versatility and potential of\n\\gpt for handling advanced tasks in emotion recognition and related fields by\nintegrating with task-related agents for more complex tasks, such as heart rate\nestimation through signal processing. In conclusion, this paper provides\nvaluable insights into the potential applications and challenges of MLLMs in\nhuman-centric computing. Our interesting examples are at\nhttps://github.com/EnVision-Research/GPT4Affectivity.\n","authors":["Hao Lu","Xuesong Niu","Jiyao Wang","Yin Wang","Qingyong Hu","Jiaqi Tang","Yuting Zhang","Kaishen Yuan","Bin Huang","Zitong Yu","Dengbo He","Shuiguang Deng","Hao Chen","Yingcong Chen","Shiguang Shan"],"pdf_url":"https://arxiv.org/pdf/2403.05916v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08551v3","updated":"2024-04-10T07:58:04Z","published":"2024-03-13T14:02:54Z","title":"GaussianImage: 1000 FPS Image Representation and Compression by 2D\n Gaussian Splatting","summary":" Implicit neural representations (INRs) recently achieved great success in\nimage representation and compression, offering high visual quality and fast\nrendering speeds with 10-1000 FPS, assuming sufficient GPU resources are\navailable. However, this requirement often hinders their use on low-end devices\nwith limited memory. In response, we propose a groundbreaking paradigm of image\nrepresentation and compression by 2D Gaussian Splatting, named GaussianImage.\nWe first introduce 2D Gaussian to represent the image, where each Gaussian has\n8 parameters including position, covariance and color. 
Subsequently, we unveil\na novel rendering algorithm based on accumulated summation. Remarkably, our\nmethod with a minimum of 3$\\times$ lower GPU memory usage and 5$\\times$ faster\nfitting time not only rivals INRs (e.g., WIRE, I-NGP) in representation\nperformance, but also delivers a faster rendering speed of 1500-2000 FPS\nregardless of parameter size. Furthermore, we integrate existing vector\nquantization technique to build an image codec. Experimental results\ndemonstrate that our codec attains rate-distortion performance comparable to\ncompression-based INRs such as COIN and COIN++, while facilitating decoding\nspeeds of approximately 1000 FPS. Additionally, preliminary proof of concept\nshows that our codec surpasses COIN and COIN++ in performance when using\npartial bits-back coding. Code will be available at\nhttps://github.com/Xinjie-Q/GaussianImage.\n","authors":["Xinjie Zhang","Xingtong Ge","Tongda Xu","Dailan He","Yan Wang","Hongwei Qin","Guo Lu","Jing Geng","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.08551v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07354v4","updated":"2024-04-10T07:54:14Z","published":"2024-02-12T01:03:39Z","title":"Re-DiffiNet: Modeling discrepancies in tumor segmentation using\n diffusion models","summary":" Identification of tumor margins is essential for surgical decision-making for\nglioblastoma patients and provides reliable assistance for neurosurgeons.\nDespite improvements in deep learning architectures for tumor segmentation over\nthe years, creating a fully autonomous system suitable for clinical floors\nremains a formidable challenge because the model predictions have not yet\nreached the desired level of accuracy and generalizability for clinical\napplications. Generative modeling techniques have seen significant improvements\nin recent times. Specifically, Generative Adversarial Networks (GANs) and\nDenoising-diffusion-based models (DDPMs) have been used to generate\nhigher-quality images with fewer artifacts and finer attributes. In this work,\nwe introduce a framework called Re-Diffinet for modeling the discrepancy\nbetween the outputs of a segmentation model like U-Net and the ground truth,\nusing DDPMs. By explicitly modeling the discrepancy, the results show an\naverage improvement of 0.55\\% in the Dice score and 16.28\\% in HD95 from\ncross-validation over 5-folds, compared to the state-of-the-art U-Net\nsegmentation model.\n","authors":["Tianyi Ren","Abhishek Sharma","Juampablo Heras Rivera","Harshitha Rebala","Ethan Honey","Agamdeep Chopra","Jacob Ruzevick","Mehmet Kurt"],"pdf_url":"https://arxiv.org/pdf/2402.07354v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05063v2","updated":"2024-04-10T07:44:40Z","published":"2024-04-07T20:19:04Z","title":"AUEditNet: Dual-Branch Facial Action Unit Intensity Manipulation with\n Implicit Disentanglement","summary":" Facial action unit (AU) intensity plays a pivotal role in quantifying\nfine-grained expression behaviors, which is an effective condition for facial\nexpression manipulation. However, publicly available datasets containing\nintensity annotations for multiple AUs remain severely limited, often featuring\na restricted number of subjects. This limitation places challenges to the AU\nintensity manipulation in images due to disentanglement issues, leading\nresearchers to resort to other large datasets with pretrained AU intensity\nestimators for pseudo labels. 
In addressing this constraint and fully\nleveraging manual annotations of AU intensities for precise manipulation, we\nintroduce AUEditNet. Our proposed model achieves impressive intensity\nmanipulation across 12 AUs, trained effectively with only 18 subjects.\nUtilizing a dual-branch architecture, our approach achieves comprehensive\ndisentanglement of facial attributes and identity without necessitating\nadditional loss functions or implementing with large batch sizes. This approach\noffers a potential solution to achieve desired facial attribute editing despite\nthe dataset's limited subject count. Our experiments demonstrate AUEditNet's\nsuperior accuracy in editing AU intensities, affirming its capability in\ndisentangling facial attributes and identity within a limited subject pool.\nAUEditNet allows conditioning by either intensity values or target images,\neliminating the need for constructing AU combinations for specific facial\nexpression synthesis. Moreover, AU intensity estimation, as a downstream task,\nvalidates the consistency between real and edited images, confirming the\neffectiveness of our proposed AU intensity manipulation method.\n","authors":["Shiwei Jin","Zhen Wang","Lei Wang","Peng Liu","Ning Bi","Truong Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.05063v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06798v1","updated":"2024-04-10T07:41:35Z","published":"2024-04-10T07:41:35Z","title":"MedRG: Medical Report Grounding with Multi-modal Large Language Model","summary":" Medical Report Grounding is pivotal in identifying the most relevant regions\nin medical images based on a given phrase query, a critical aspect in medical\nimage analysis and radiological diagnosis. However, prevailing visual grounding\napproaches necessitate the manual extraction of key phrases from medical\nreports, imposing substantial burdens on both system efficiency and physicians.\nIn this paper, we introduce a novel framework, Medical Report Grounding\n(MedRG), an end-to-end solution for utilizing a multi-modal Large Language\nModel to predict key phrase by incorporating a unique token, BOX, into the\nvocabulary to serve as an embedding for unlocking detection capabilities.\nSubsequently, the vision encoder-decoder jointly decodes the hidden embedding\nand the input medical image, generating the corresponding grounding box. The\nexperimental results validate the effectiveness of MedRG, surpassing the\nperformance of the existing state-of-the-art medical phrase grounding methods.\nThis study represents a pioneering exploration of the medical report grounding\ntask, marking the first-ever endeavor in this domain.\n","authors":["Ke Zou","Yang Bai","Zhihao Chen","Yang Zhou","Yidi Chen","Kai Ren","Meng Wang","Xuedong Yuan","Xiaojing Shen","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2404.06798v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.15361v2","updated":"2024-04-10T06:46:08Z","published":"2023-11-26T17:27:26Z","title":"Ultra-Range Gesture Recognition using a Web-Camera in Human-Robot\n Interaction","summary":" Hand gestures play a significant role in human interactions where non-verbal\nintentions, thoughts and commands are conveyed. In Human-Robot Interaction\n(HRI), hand gestures offer a similar and efficient medium for conveying clear\nand rapid directives to a robotic agent. However, state-of-the-art vision-based\nmethods for gesture recognition have been shown to be effective only up to a\nuser-camera distance of seven meters. 
Such a short distance range limits\npractical HRI with, for example, service robots, search and rescue robots and\ndrones. In this work, we address the Ultra-Range Gesture Recognition (URGR)\nproblem by aiming for a recognition distance of up to 25 meters and in the\ncontext of HRI. We propose the URGR framework, a novel deep-learning approach using\nsolely a simple RGB camera. Gesture inference is based on a single image.\nFirst, a novel super-resolution model termed High-Quality Network (HQ-Net) uses\na set of self-attention and convolutional layers to enhance the low-resolution\nimage of the user. Then, we propose a novel URGR classifier termed Graph Vision\nTransformer (GViT) which takes the enhanced image as input. GViT combines the\nbenefits of a Graph Convolutional Network (GCN) and a modified Vision\nTransformer (ViT). Evaluation of the proposed framework over diverse test data\nyields a high recognition rate of 98.1%. The framework has also exhibited\nsuperior performance compared to human recognition in ultra-range distances.\nWith the framework, we analyze and demonstrate the performance of an autonomous\nquadruped robot directed by human gestures in complex ultra-range indoor and\noutdoor environments, acquiring 96% recognition rate on average.\n","authors":["Eran Bamani","Eden Nissinman","Inbar Meir","Lisa Koenigsberg","Avishai Sintov"],"pdf_url":"https://arxiv.org/pdf/2311.15361v2.pdf","comment":"Engineering Applications of Artificial Intelligence, In press"},{"id":"http://arxiv.org/abs/2404.06780v1","updated":"2024-04-10T06:41:30Z","published":"2024-04-10T06:41:30Z","title":"Urban Architect: Steerable 3D Urban Scene Generation with Layout Prior","summary":" Text-to-3D generation has achieved remarkable success via large-scale\ntext-to-image diffusion models. Nevertheless, there is no paradigm for scaling\nup the methodology to urban scale. Urban scenes, characterized by numerous\nelements, intricate arrangement relationships, and vast scale, present a\nformidable barrier to the interpretability of ambiguous textual descriptions\nfor effective model optimization. In this work, we surmount the limitations by\nintroducing a compositional 3D layout representation into text-to-3D paradigm,\nserving as an additional prior. It comprises a set of semantic primitives with\nsimple geometric structures and explicit arrangement relationships,\ncomplementing textual descriptions and enabling steerable generation. Upon\nthis, we propose two modifications -- (1) We introduce Layout-Guided\nVariational Score Distillation to address model optimization inadequacies. It\nconditions the score distillation sampling process with geometric and semantic\nconstraints of 3D layouts. (2) To handle the unbounded nature of urban scenes,\nwe represent 3D scene with a Scalable Hash Grid structure, incrementally\nadapting to the growing scale of urban scenes. Extensive experiments\nsubstantiate the capability of our framework to scale text-to-3D generation to\nlarge-scale urban scenes that cover over 1000m driving distance for the first\ntime. We also present various scene editing demonstrations, showing the powers\nof steerable urban scene generation. 
Website: https://urbanarchitect.github.io.\n","authors":["Fan Lu","Kwan-Yee Lin","Yan Xu","Hongsheng Li","Guang Chen","Changjun Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.06780v1.pdf","comment":"Project page: https://urbanarchitect.github.io/"},{"id":"http://arxiv.org/abs/2404.06779v1","updated":"2024-04-10T06:39:18Z","published":"2024-04-10T06:39:18Z","title":"Efficient and Scalable Chinese Vector Font Generation via Component\n Composition","summary":" Chinese vector font generation is challenging due to the complex structure\nand huge amount of Chinese characters. Recent advances remain limited to\ngenerating a small set of characters with simple structure. In this work, we\nfirst observe that most Chinese characters can be disassembled into\nfrequently-reused components. Therefore, we introduce the first efficient and\nscalable Chinese vector font generation approach via component composition,\nallowing generating numerous vector characters from a small set of components.\nTo achieve this, we collect a large-scale dataset that contains over\n\\textit{90K} Chinese characters with their components and layout information.\nUpon the dataset, we propose a simple yet effective framework based on spatial\ntransformer networks (STN) and multiple losses tailored to font characteristics\nto learn the affine transformation of the components, which can be directly\napplied to the B\\'ezier curves, resulting in Chinese characters in vector\nformat. Our qualitative and quantitative experiments have demonstrated that our\nmethod significantly surpasses the state-of-the-art vector font generation\nmethods in generating large-scale complex Chinese characters in both font\ngeneration and zero-shot font extension.\n","authors":["Jinyu Song","Weitao You","Shuhui Shi","Shuxuan Guo","Lingyun Sun","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06779v1.pdf","comment":"15 pages, 23 figures"},{"id":"http://arxiv.org/abs/2403.19837v3","updated":"2024-04-10T23:47:34Z","published":"2024-03-28T21:15:38Z","title":"Concept-based Analysis of Neural Networks via Vision-Language Models","summary":" The analysis of vision-based deep neural networks (DNNs) is highly desirable\nbut it is very challenging due to the difficulty of expressing formal\nspecifications for vision tasks and the lack of efficient verification\nprocedures. In this paper, we propose to leverage emerging multimodal,\nvision-language, foundation models (VLMs) as a lens through which we can reason\nabout vision models. VLMs have been trained on a large body of images\naccompanied by their textual description, and are thus implicitly aware of\nhigh-level, human-understandable concepts describing the images. We describe a\nlogical specification language $\\texttt{Con}_{\\texttt{spec}}$ designed to\nfacilitate writing specifications in terms of these concepts. To define and\nformally check $\\texttt{Con}_{\\texttt{spec}}$ specifications, we build a map\nbetween the internal representations of a given vision model and a VLM, leading\nto an efficient verification procedure of natural-language properties for\nvision models. 
We demonstrate our techniques on a ResNet-based classifier\ntrained on the RIVAL-10 dataset using CLIP as the multimodal model.\n","authors":["Ravi Mangal","Nina Narodytska","Divya Gopinath","Boyue Caroline Hu","Anirban Roy","Susmit Jha","Corina Pasareanu"],"pdf_url":"https://arxiv.org/pdf/2403.19837v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14666v2","updated":"2024-04-10T23:39:38Z","published":"2023-08-24T17:47:32Z","title":"Learning to Predict 3D Rotational Dynamics from Images of a Rigid Body\n with Unknown Mass Distribution","summary":" In many real-world settings, image observations of freely rotating 3D rigid\nbodies may be available when low-dimensional measurements are not. However, the\nhigh-dimensionality of image data precludes the use of classical estimation\ntechniques to learn the dynamics. The usefulness of standard deep learning\nmethods is also limited, because an image of a rigid body reveals nothing about\nthe distribution of mass inside the body, which, together with initial angular\nvelocity, is what determines how the body will rotate. We present a\nphysics-based neural network model to estimate and predict 3D rotational\ndynamics from image sequences. We achieve this using a multi-stage prediction\npipeline that maps individual images to a latent representation homeomorphic to\n$\\mathbf{SO}(3)$, computes angular velocities from latent pairs, and predicts\nfuture latent states using the Hamiltonian equations of motion. We demonstrate\nthe efficacy of our approach on new rotating rigid-body datasets of sequences\nof synthetic images of rotating objects, including cubes, prisms and\nsatellites, with unknown uniform and non-uniform mass distributions. Our model\noutperforms competing baselines on our datasets, producing better qualitative\npredictions and reducing the error observed for the state-of-the-art\nHamiltonian Generative Network by a factor of 2.\n","authors":["Justice Mason","Christine Allen-Blanchette","Nicholas Zolman","Elizabeth Davison","Naomi Ehrich Leonard"],"pdf_url":"https://arxiv.org/pdf/2308.14666v2.pdf","comment":"Previously appeared as arXiv:2209.11355v2, which was submitted as a\n replacement by accident. arXiv admin note: text overlap with arXiv:2209.11355"},{"id":"http://arxiv.org/abs/2404.07389v1","updated":"2024-04-10T23:30:54Z","published":"2024-04-10T23:30:54Z","title":"Object-Conditioned Energy-Based Attention Map Alignment in Text-to-Image\n Diffusion Models","summary":" Text-to-image diffusion models have shown great success in generating\nhigh-quality text-guided images. Yet, these models may still fail to\nsemantically align generated images with the provided text prompts, leading to\nproblems like incorrect attribute binding and/or catastrophic object neglect.\nGiven the pervasive object-oriented structure underlying text prompts, we\nintroduce a novel object-conditioned Energy-Based Attention Map Alignment\n(EBAMA) method to address the aforementioned problems. We show that an\nobject-centric attribute binding loss naturally emerges by approximately\nmaximizing the log-likelihood of a $z$-parameterized energy-based model with\nthe help of the negative sampling technique. We further propose an\nobject-centric intensity regularizer to prevent excessive shifts of objects\nattention towards their attributes. Extensive qualitative and quantitative\nexperiments, including human evaluation, on several challenging benchmarks\ndemonstrate the superior performance of our method over previous strong\ncounterparts. 
With better aligned attention maps, our approach shows great\npromise in further enhancing the text-controlled image editing ability of\ndiffusion models.\n","authors":["Yasi Zhang","Peiyu Yu","Ying Nian Wu"],"pdf_url":"https://arxiv.org/pdf/2404.07389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07377v1","updated":"2024-04-10T22:35:06Z","published":"2024-04-10T22:35:06Z","title":"Deep Generative Sampling in the Dual Divergence Space: A Data-efficient\n & Interpretative Approach for Generative AI","summary":" Building on the remarkable achievements in generative sampling of natural\nimages, we propose an innovative challenge, potentially overly ambitious, which\ninvolves generating samples of entire multivariate time series that resemble\nimages. However, the statistical challenge lies in the small sample size,\nsometimes consisting of a few hundred subjects. This issue is especially\nproblematic for deep generative models that follow the conventional approach of\ngenerating samples from a canonical distribution and then decoding or denoising\nthem to match the true data distribution. In contrast, our method is grounded\nin information theory and aims to implicitly characterize the distribution of\nimages, particularly the (global and local) dependency structure between\npixels. We achieve this by empirically estimating its KL-divergence in the dual\nform with respect to the respective marginal distribution. This enables us to\nperform generative sampling directly in the optimized 1-D dual divergence\nspace. Specifically, in the dual space, training samples representing the data\ndistribution are embedded in the form of various clusters between two end\npoints. In theory, any sample embedded between those two end points is\nin-distribution w.r.t. the data distribution. Our key idea for generating novel\nsamples of images is to interpolate between the clusters via a walk as per\ngradients of the dual function w.r.t. the data dimensions. In addition to the\ndata efficiency gained from direct sampling, we propose an algorithm that\noffers a significant reduction in sample complexity for estimating the\ndivergence of the data distribution with respect to the marginal distribution.\nWe provide strong theoretical guarantees along with an extensive empirical\nevaluation using many real-world datasets from diverse domains, establishing\nthe superiority of our approach w.r.t. state-of-the-art deep learning methods.\n","authors":["Sahil Garg","Anderson Schneider","Anant Raj","Kashif Rasul","Yuriy Nevmyvaka","Sneihil Gopal","Amit Dhurandhar","Guillermo Cecchi","Irina Rish"],"pdf_url":"https://arxiv.org/pdf/2404.07377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07374v1","updated":"2024-04-10T22:16:20Z","published":"2024-04-10T22:16:20Z","title":"Improving Multi-Center Generalizability of GAN-Based Fat Suppression\n using Federated Learning","summary":" Generative Adversarial Network (GAN)-based synthesis of fat suppressed (FS)\nMRIs from non-FS proton density sequences has the potential to accelerate\nacquisition of knee MRIs. However, GANs trained on single-site data have poor\ngeneralizability to external data. We show that federated learning can improve\nmulti-center generalizability of GANs for synthesizing FS MRIs, while\nfacilitating privacy-preserving multi-institutional collaborations.\n","authors":["Pranav Kulkarni","Adway Kanhere","Harshita Kukreja","Vivian Zhang","Paul H. Yi","Vishwa S. 
Parekh"],"pdf_url":"https://arxiv.org/pdf/2404.07374v1.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.07356v1","updated":"2024-04-10T21:23:13Z","published":"2024-04-10T21:23:13Z","title":"GANsemble for Small and Imbalanced Data Sets: A Baseline for Synthetic\n Microplastics Data","summary":" Microplastic particle ingestion or inhalation by humans is a problem of\ngrowing concern. Unfortunately, current research methods that use machine\nlearning to understand their potential harms are obstructed by a lack of\navailable data. Deep learning techniques in particular are challenged by such\ndomains where only small or imbalanced data sets are available. Overcoming this\nchallenge often involves oversampling underrepresented classes or augmenting\nthe existing data to improve model performance. This paper proposes GANsemble:\na two-module framework connecting data augmentation with conditional generative\nadversarial networks (cGANs) to generate class-conditioned synthetic data.\nFirst, the data chooser module automates augmentation strategy selection by\nsearching for the best data augmentation strategy. Next, the cGAN module uses\nthis strategy to train a cGAN for generating enhanced synthetic data. We\nexperiment with the GANsemble framework on a small and imbalanced microplastics\ndata set. A Microplastic-cGAN (MPcGAN) algorithm is introduced, and baselines\nfor synthetic microplastics (SYMP) data are established in terms of Frechet\nInception Distance (FID) and Inception Scores (IS). We also provide a synthetic\nmicroplastics filter (SYMP-Filter) algorithm to increase the quality of\ngenerated SYMP. Additionally, we show the best amount of oversampling with\naugmentation to fix class imbalance in small microplastics data sets. To our\nknowledge, this study is the first application of generative AI to\nsynthetically create microplastics data.\n","authors":["Daniel Platnick","Sourena Khanzadeh","Alireza Sadeghian","Richard Anthony Valenzano"],"pdf_url":"https://arxiv.org/pdf/2404.07356v1.pdf","comment":"Accepted to the 37th Canadian Artificial Intelligence Conference\n (2024), 12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.16133v2","updated":"2024-04-10T21:19:33Z","published":"2023-09-28T03:30:50Z","title":"Mask4Former: Mask Transformer for 4D Panoptic Segmentation","summary":" Accurately perceiving and tracking instances over time is essential for the\ndecision-making processes of autonomous agents interacting safely in dynamic\nenvironments. With this intention, we propose Mask4Former for the challenging\ntask of 4D panoptic segmentation of LiDAR point clouds. Mask4Former is the\nfirst transformer-based approach unifying semantic instance segmentation and\ntracking of sparse and irregular sequences of 3D point clouds into a single\njoint model. Our model directly predicts semantic instances and their temporal\nassociations without relying on hand-crafted non-learned association strategies\nsuch as probabilistic clustering or voting-based center prediction. Instead,\nMask4Former introduces spatio-temporal instance queries that encode the\nsemantic and geometric properties of each semantic tracklet in the sequence. In\nan in-depth study, we find that promoting spatially compact instance\npredictions is critical as spatio-temporal instance queries tend to merge\nmultiple semantically similar instances, even if they are spatially distant. 
To\nthis end, we regress 6-DOF bounding box parameters from spatio-temporal\ninstance queries, which are used as an auxiliary task to foster spatially\ncompact predictions. Mask4Former achieves a new state-of-the-art on the\nSemanticKITTI test set with a score of 68.4 LSTQ.\n","authors":["Kadir Yilmaz","Jonas Schult","Alexey Nekrasov","Bastian Leibe"],"pdf_url":"https://arxiv.org/pdf/2309.16133v2.pdf","comment":"Renamed from MASK4D to Mask4Former. ICRA 2024. Project page:\n https://vision.rwth-aachen.de/Mask4Former"},{"id":"http://arxiv.org/abs/2404.07351v1","updated":"2024-04-10T21:14:33Z","published":"2024-04-10T21:14:33Z","title":"A Transformer-Based Model for the Prediction of Human Gaze Behavior on\n Videos","summary":" Eye-tracking applications that utilize the human gaze in video understanding\ntasks have become increasingly important. To effectively automate the process\nof video analysis based on eye-tracking data, it is important to accurately\nreplicate human gaze behavior. However, this task presents significant\nchallenges due to the inherent complexity and ambiguity of human gaze patterns.\nIn this work, we introduce a novel method for simulating human gaze behavior.\nOur approach uses a transformer-based reinforcement learning algorithm to train\nan agent that acts as a human observer, with the primary role of watching\nvideos and simulating human gaze behavior. We employed an eye-tracking dataset\ngathered from videos generated by the VirtualHome simulator, with a primary\nfocus on activity recognition. Our experimental results demonstrate the\neffectiveness of our gaze prediction method by highlighting its capability to\nreplicate human gaze behavior and its applicability for downstream tasks where\nreal human-gaze is used as input.\n","authors":["Suleyman Ozdel","Yao Rong","Berat Mert Albaba","Yen-Ling Kuo","Xi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07351v1.pdf","comment":"2024 Symposium on Eye Tracking Research and Applications (ETRA24),\n Glasgow, United Kingdom"},{"id":"http://arxiv.org/abs/2309.04071v2","updated":"2024-04-10T21:09:15Z","published":"2023-09-08T02:05:03Z","title":"Enhancing Hierarchical Transformers for Whole Brain Segmentation with\n Intracranial Measurements Integration","summary":" Whole brain segmentation with magnetic resonance imaging (MRI) enables the\nnon-invasive measurement of brain regions, including total intracranial volume\n(TICV) and posterior fossa volume (PFV). Enhancing the existing whole brain\nsegmentation methodology to incorporate intracranial measurements offers a\nheightened level of comprehensiveness in the analysis of brain structures.\nDespite its potential, the task of generalizing deep learning techniques for\nintracranial measurements faces data availability constraints due to limited\nmanually annotated atlases encompassing whole brain and TICV/PFV labels. In\nthis paper, we enhance the hierarchical transformer UNesT for whole brain\nsegmentation to achieve segmenting whole brain with 133 classes and TICV/PFV\nsimultaneously. To address the problem of data scarcity, the model is first\npretrained on 4859 T1-weighted (T1w) 3D volumes sourced from 8 different sites.\nThese volumes are processed through a multi-atlas segmentation pipeline for\nlabel generation, while TICV/PFV labels are unavailable. Subsequently, the\nmodel is finetuned with 45 T1w 3D volumes from Open Access Series Imaging\nStudies (OASIS) where both 133 whole brain classes and TICV/PFV labels are\navailable. 
We evaluate our method with Dice similarity coefficients(DSC). We\nshow that our model is able to conduct precise TICV/PFV estimation while\nmaintaining the 132 brain regions performance at a comparable level. Code and\ntrained model are available at:\nhttps://github.com/MASILab/UNesT/tree/main/wholebrainSeg.\n","authors":["Xin Yu","Yucheng Tang","Qi Yang","Ho Hin Lee","Shunxing Bao","Yuankai Huo","Bennett A. Landman"],"pdf_url":"https://arxiv.org/pdf/2309.04071v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07347v1","updated":"2024-04-10T21:03:23Z","published":"2024-04-10T21:03:23Z","title":"Gaze-Guided Graph Neural Network for Action Anticipation Conditioned on\n Intention","summary":" Humans utilize their gaze to concentrate on essential information while\nperceiving and interpreting intentions in videos. Incorporating human gaze into\ncomputational algorithms can significantly enhance model performance in video\nunderstanding tasks. In this work, we address a challenging and innovative task\nin video understanding: predicting the actions of an agent in a video based on\na partial video. We introduce the Gaze-guided Action Anticipation algorithm,\nwhich establishes a visual-semantic graph from the video input. Our method\nutilizes a Graph Neural Network to recognize the agent's intention and predict\nthe action sequence to fulfill this intention. To assess the efficiency of our\napproach, we collect a dataset containing household activities generated in the\nVirtualHome environment, accompanied by human gaze data of viewing videos. Our\nmethod outperforms state-of-the-art techniques, achieving a 7\\% improvement in\naccuracy for 18-class intention recognition. This highlights the efficiency of\nour method in learning important features from human gaze data.\n","authors":["Suleyman Ozdel","Yao Rong","Berat Mert Albaba","Yen-Ling Kuo","Xi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07347v1.pdf","comment":"2024 Symposium on Eye Tracking Research and Applications (ETRA24),\n Glasgow, United Kingdom"},{"id":"http://arxiv.org/abs/2404.07336v1","updated":"2024-04-10T20:32:24Z","published":"2024-04-10T20:32:24Z","title":"PEAVS: Perceptual Evaluation of Audio-Visual Synchrony Grounded in\n Viewers' Opinion Scores","summary":" Recent advancements in audio-visual generative modeling have been propelled\nby progress in deep learning and the availability of data-rich benchmarks.\nHowever, the growth is not attributed solely to models and benchmarks.\nUniversally accepted evaluation metrics also play an important role in\nadvancing the field. While there are many metrics available to evaluate audio\nand visual content separately, there is a lack of metrics that offer a\nquantitative and interpretable measure of audio-visual synchronization for\nvideos \"in the wild\". To address this gap, we first created a large scale human\nannotated dataset (100+ hrs) representing nine types of synchronization errors\nin audio-visual content and how human perceive them. We then developed a PEAVS\n(Perceptual Evaluation of Audio-Visual Synchrony) score, a novel automatic\nmetric with a 5-point scale that evaluates the quality of audio-visual\nsynchronization. We validate PEAVS using a newly generated dataset, achieving a\nPearson correlation of 0.79 at the set level and 0.54 at the clip level when\ncompared to human labels. 
In our experiments, we observe a relative gain 50%\nover a natural extension of Fr\\'echet based metrics for Audio-Visual synchrony,\nconfirming PEAVS efficacy in objectively modeling subjective perceptions of\naudio-visual synchronization for videos \"in the wild\".\n","authors":["Lucas Goncalves","Prashant Mathur","Chandrashekhar Lavania","Metehan Cekic","Marcello Federico","Kyu J. Han"],"pdf_url":"https://arxiv.org/pdf/2404.07336v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2403.19653v2","updated":"2024-04-10T20:03:05Z","published":"2024-03-28T17:59:42Z","title":"Detecting Image Attribution for Text-to-Image Diffusion Models in RGB\n and Beyond","summary":" Modern text-to-image (T2I) diffusion models can generate images with\nremarkable realism and creativity. These advancements have sparked research in\nfake image detection and attribution, yet prior studies have not fully explored\nthe practical and scientific dimensions of this task. In addition to\nattributing images to 12 state-of-the-art T2I generators, we provide extensive\nanalyses on what inference stage hyperparameters and image modifications are\ndiscernible. Our experiments reveal that initialization seeds are highly\ndetectable, along with other subtle variations in the image generation process\nto some extent. We further investigate what visual traces are leveraged in\nimage attribution by perturbing high-frequency details and employing mid-level\nrepresentations of image style and structure. Notably, altering high-frequency\ninformation causes only slight reductions in accuracy, and training an\nattributor on style representations outperforms training on RGB images. Our\nanalyses underscore that fake images are detectable and attributable at various\nlevels of visual granularity than previously explored.\n","authors":["Katherine Xu","Lingzhi Zhang","Jianbo Shi"],"pdf_url":"https://arxiv.org/pdf/2403.19653v2.pdf","comment":"Code available at https://github.com/k8xu/ImageAttribution"},{"id":"http://arxiv.org/abs/2404.07318v1","updated":"2024-04-10T19:39:43Z","published":"2024-04-10T19:39:43Z","title":"Rethinking Perceptual Metrics for Medical Image Translation","summary":" Modern medical image translation methods use generative models for tasks such\nas the conversion of CT images to MRI. Evaluating these methods typically\nrelies on some chosen downstream task in the target domain, such as\nsegmentation. On the other hand, task-agnostic metrics are attractive, such as\nthe network feature-based perceptual metrics (e.g., FID) that are common to\nimage translation in general computer vision. In this paper, we investigate\nevaluation metrics for medical image translation on two medical image\ntranslation tasks (GE breast MRI to Siemens breast MRI and lumbar spine MRI to\nCT), tested on various state-of-the-art translation methods. We show that\nperceptual metrics do not generally correlate with segmentation metrics due to\nthem extending poorly to the anatomical constraints of this sub-field, with FID\nbeing especially inconsistent. However, we find that the lesser-used\npixel-level SWD metric may be useful for subtle intra-modality translation. Our\nresults demonstrate the need for further research into helpful metrics for\nmedical image translation.\n","authors":["Nicholas Konz","Yuwen Chen","Hanxue Gu","Haoyu Dong","Maciej A. 
Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2404.07318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07306v1","updated":"2024-04-10T18:58:05Z","published":"2024-04-10T18:58:05Z","title":"AI-Guided Defect Detection Techniques to Model Single Crystal Diamond\n Growth","summary":" From a process development perspective, diamond growth via chemical vapor\ndeposition has made significant strides. However, challenges persist in\nachieving high quality and large-area material production. These difficulties\ninclude controlling conditions to maintain uniform growth rates for the entire\ngrowth surface. As growth progresses, various factors or defect states emerge,\naltering the uniform conditions. These changes affect the growth rate and\nresult in the formation of crystalline defects at the microscale. However,\nthere is a distinct lack of methods to identify these defect states and their\ngeometry using images taken during the growth process. This paper details\nseminal work on defect segmentation pipeline using in-situ optical images to\nidentify features that indicate defective states that are visible at the\nmacroscale. Using a semantic segmentation approach as applied in our previous\nwork, these defect states and corresponding derivative features are isolated\nand classified by their pixel masks. Using an annotation focused\nhuman-in-the-loop software architecture to produce training datasets, with\nmodules for selective data labeling using active learning, data augmentations,\nand model-assisted labeling, our approach achieves effective annotation\naccuracy and drastically reduces the time and cost of labeling by orders of\nmagnitude. On the model development front, we found that deep learning-based\nalgorithms are the most efficient. They can accurately learn complex\nrepresentations from feature-rich datasets. Our best-performing model, based on\nthe YOLOV3 and DeeplabV3plus architectures, achieved excellent accuracy for\nspecific features of interest. Specifically, it reached 93.35% accuracy for\ncenter defects, 92.83% for polycrystalline defects, and 91.98% for edge\ndefects.\n","authors":["Rohan Reddy Mekala","Elias Garratt","Matthias Muehle","Arjun Srinivasan","Adam Porter","Mikael Lindvall"],"pdf_url":"https://arxiv.org/pdf/2404.07306v1.pdf","comment":"12 pages,4 figures,ACMME 2024"},{"id":"http://arxiv.org/abs/2404.07292v1","updated":"2024-04-10T18:40:23Z","published":"2024-04-10T18:40:23Z","title":"Solving Masked Jigsaw Puzzles with Diffusion Vision Transformers","summary":" Solving image and video jigsaw puzzles poses the challenging task of\nrearranging image fragments or video frames from unordered sequences to restore\nmeaningful images and video sequences. Existing approaches often hinge on\ndiscriminative models tasked with predicting either the absolute positions of\npuzzle elements or the permutation actions applied to the original data.\nUnfortunately, these methods face limitations in effectively solving puzzles\nwith a large number of elements. In this paper, we propose JPDVT, an innovative\napproach that harnesses diffusion transformers to address this challenge.\nSpecifically, we generate positional information for image patches or video\nframes, conditioned on their underlying visual content. This information is\nthen employed to accurately assemble the puzzle pieces in their correct\npositions, even in scenarios involving missing pieces. 
Our method achieves\nstate-of-the-art performance on several datasets.\n","authors":["Jinyang Liu","Wondmgezahu Teshome","Sandesh Ghimire","Mario Sznaier","Octavia Camps"],"pdf_url":"https://arxiv.org/pdf/2404.07292v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2401.06287v2","updated":"2024-04-10T18:16:32Z","published":"2024-01-11T23:00:24Z","title":"Hierarchical Augmentation and Distillation for Class Incremental\n Audio-Visual Video Recognition","summary":" Audio-visual video recognition (AVVR) aims to integrate audio and visual\nclues to categorize videos accurately. While existing methods train AVVR models\nusing provided datasets and achieve satisfactory results, they struggle to\nretain historical class knowledge when confronted with new classes in\nreal-world situations. Currently, there are no dedicated methods for addressing\nthis problem, so this paper concentrates on exploring Class Incremental\nAudio-Visual Video Recognition (CIAVVR). For CIAVVR, since both stored data and\nlearned model of past classes contain historical knowledge, the core challenge\nis how to capture past data knowledge and past model knowledge to prevent\ncatastrophic forgetting. We introduce Hierarchical Augmentation and\nDistillation (HAD), which comprises the Hierarchical Augmentation Module (HAM)\nand Hierarchical Distillation Module (HDM) to efficiently utilize the\nhierarchical structure of data and models, respectively. Specifically, HAM\nimplements a novel augmentation strategy, segmental feature augmentation, to\npreserve hierarchical model knowledge. Meanwhile, HDM introduces newly designed\nhierarchical (video-distribution) logical distillation and hierarchical\n(snippet-video) correlative distillation to capture and maintain the\nhierarchical intra-sample knowledge of each data and the hierarchical\ninter-sample knowledge between data, respectively. Evaluations on four\nbenchmarks (AVE, AVK-100, AVK-200, and AVK-400) demonstrate that the proposed\nHAD effectively captures hierarchical information in both data and models,\nresulting in better preservation of historical class knowledge and improved\nperformance. Furthermore, we provide a theoretical analysis to support the\nnecessity of the segmental feature augmentation strategy.\n","authors":["Yukun Zuo","Hantao Yao","Liansheng Zhuang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2401.06287v2.pdf","comment":"Submitted to TPAMI"},{"id":"http://arxiv.org/abs/2308.15321v6","updated":"2024-04-10T18:13:00Z","published":"2023-08-29T14:16:09Z","title":"Elucidating the Exposure Bias in Diffusion Models","summary":" Diffusion models have demonstrated impressive generative capabilities, but\ntheir \\textit{exposure bias} problem, described as the input mismatch between\ntraining and sampling, lacks in-depth exploration. In this paper, we\nsystematically investigate the exposure bias problem in diffusion models by\nfirst analytically modelling the sampling distribution, based on which we then\nattribute the prediction error at each sampling step as the root cause of the\nexposure bias issue. Furthermore, we discuss potential solutions to this issue\nand propose an intuitive metric for it. Along with the elucidation of exposure\nbias, we propose a simple, yet effective, training-free method called Epsilon\nScaling to alleviate the exposure bias. 
We show that Epsilon Scaling explicitly\nmoves the sampling trajectory closer to the vector field learned in the\ntraining phase by scaling down the network output, mitigating the input\nmismatch between training and sampling. Experiments on various diffusion\nframeworks (ADM, DDIM, EDM, LDM, DiT, PFGM++) verify the effectiveness of our\nmethod. Remarkably, our ADM-ES, as a state-of-the-art stochastic sampler,\nobtains 2.17 FID on CIFAR-10 under 100-step unconditional generation. The code\nis available at \\url{https://github.com/forever208/ADM-ES} and\n\\url{https://github.com/forever208/EDM-ES}.\n","authors":["Mang Ning","Mingxiao Li","Jianlin Su","Albert Ali Salah","Itir Onal Ertugrul"],"pdf_url":"https://arxiv.org/pdf/2308.15321v6.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.06776v1","updated":"2024-04-10T06:35:25Z","published":"2024-04-10T06:35:25Z","title":"Logit Calibration and Feature Contrast for Robust Federated Learning on\n Non-IID Data","summary":" Federated learning (FL) is a privacy-preserving distributed framework for\ncollaborative model training on devices in edge networks. However, challenges\narise due to vulnerability to adversarial examples (AEs) and the\nnon-independent and identically distributed (non-IID) nature of data\ndistribution among devices, hindering the deployment of adversarially robust\nand accurate learning models at the edge. While adversarial training (AT) is\ncommonly acknowledged as an effective defense strategy against adversarial\nattacks in centralized training, we shed light on the adverse effects of\ndirectly applying AT in FL that can severely compromise accuracy, especially in\nnon-IID challenges. Given this limitation, this paper proposes FatCC, which\nincorporates local logit \\underline{C}alibration and global feature\n\\underline{C}ontrast into the vanilla federated adversarial training\n(\\underline{FAT}) process from both logit and feature perspectives. This\napproach can effectively enhance the federated system's robust accuracy (RA)\nand clean accuracy (CA). First, we propose logit calibration, where the logits\nare calibrated during local adversarial updates, thereby improving adversarial\nrobustness. Second, FatCC introduces feature contrast, which involves a global\nalignment term that aligns each local representation with unbiased global\nfeatures, thus further enhancing robustness and accuracy in federated\nadversarial environments. Extensive experiments across multiple datasets\ndemonstrate that FatCC achieves comparable or superior performance gains in\nboth CA and RA compared to other baselines.\n","authors":["Yu Qiao","Chaoning Zhang","Apurba Adhikary","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2404.06776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06773v1","updated":"2024-04-10T06:30:08Z","published":"2024-04-10T06:30:08Z","title":"Adapting LLaMA Decoder to Vision Transformer","summary":" This work examines whether decoder-only Transformers such as LLaMA, which\nwere originally designed for large language models (LLMs), can be adapted to\nthe computer vision field. We first \"LLaMAfy\" a standard ViT step-by-step to\nalign with LLaMA's architecture, and find that directly applying a causal mask\nto the self-attention brings an attention collapse issue, resulting in the\nfailure of network training. 
We suggest repositioning the class token\nbehind the image tokens with a post-sequence class token technique to overcome\nthis challenge, enabling causal self-attention to efficiently capture the\nentire image's information. Additionally, we develop a soft mask strategy that\ngradually introduces a causal mask to the self-attention at the onset of\ntraining to facilitate the optimization behavior. The tailored model, dubbed\nimage LLaMA (iLLaMA), is akin to LLaMA in architecture and enables direct\nsupervised learning. Its causal self-attention boosts computational efficiency\nand learns complex representations by elevating attention map ranks. iLLaMA\nrivals the performance of its encoder-only counterparts, achieving 75.1%\nImageNet top-1 accuracy with only 5.7M parameters. Scaling the model to ~310M\nand pre-training on ImageNet-21K further enhances the accuracy to 86.0%.\nExtensive experiments demonstrate iLLaMA's reliable properties: calibration,\nshape-texture bias, quantization compatibility, ADE20K segmentation and CIFAR\ntransfer learning. We hope our study can kindle fresh views on visual model\ndesign in the wave of LLMs. Pre-trained models and codes are available here.\n","authors":["Jiahao Wang","Wenqi Shao","Mengzhao Chen","Chengyue Wu","Yong Liu","Kaipeng Zhang","Songyang Zhang","Kai Chen","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2404.06773v1.pdf","comment":"22 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.06753v1","updated":"2024-04-10T05:41:05Z","published":"2024-04-10T05:41:05Z","title":"MonoSelfRecon: Purely Self-Supervised Explicit Generalizable 3D\n Reconstruction of Indoor Scenes from Monocular RGB Views","summary":" Current monocular 3D scene reconstruction (3DR) works are either\nfully-supervised, not generalizable, or implicit in 3D representation. We\npropose a novel framework, MonoSelfRecon, that for the first time achieves\nexplicit 3D mesh reconstruction for generalizable indoor scenes with monocular\nRGB views by pure self-supervision on voxel-SDF (signed distance function).\nMonoSelfRecon follows an Autoencoder-based architecture and decodes voxel-SDF\nand a generalizable Neural Radiance Field (NeRF), which is used to guide\nvoxel-SDF in self-supervision. We propose novel self-supervised losses, which not only\nsupport pure self-supervision, but can be used together with supervised signals\nto further boost supervised training. Our experiments show that \"MonoSelfRecon\"\ntrained in pure self-supervision outperforms current best self-supervised\nindoor depth estimation models and is comparable to 3DR models trained in full\nsupervision with depth annotations. MonoSelfRecon is not restricted to a specific\nmodel design and can be applied to any model with voxel-SDF in a purely\nself-supervised manner.\n","authors":["Runfa Li","Upal Mahbub","Vasudev Bhaskaran","Truong Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.06753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06744v1","updated":"2024-04-10T05:10:05Z","published":"2024-04-10T05:10:05Z","title":"YOLO based Ocean Eddy Localization with AWS SageMaker","summary":" Ocean eddies play a significant role both on the sea surface and beneath it,\ncontributing to the sustainability of marine life dependent on oceanic\nbehaviors. Therefore, it is crucial to investigate ocean eddies to monitor\nchanges in the Earth, particularly in the oceans, and their impact on climate.\nThis study aims to pinpoint ocean eddies using AWS cloud services, specifically\nSageMaker. 
The primary objective is to detect small-scale (<20km) ocean eddies\nfrom satellite remote images and assess the feasibility of utilizing SageMaker,\nwhich offers tools for deploying AI applications. Moreover, this research not\nonly explores the deployment of cloud-based services for remote sensing of\nEarth data but also evaluates several YOLO (You Only Look Once) models using\nsingle and multi-GPU-based services in the cloud. Furthermore, this study\nunderscores the potential of these services, their limitations, challenges\nrelated to deployment and resource management, and their user-friendliness for\nEarth science projects.\n","authors":["Seraj Al Mahmud Mostafa","Jinbo Wang","Benjamin Holt","Jianwu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06744v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.06741v1","updated":"2024-04-10T04:59:51Z","published":"2024-04-10T04:59:51Z","title":"An Animation-based Augmentation Approach for Action Recognition from\n Discontinuous Video","summary":" The study of action recognition has attracted considerable attention recently\ndue to its broad applications in multiple areas. However, the issue of\ndiscontinuous training video, which not only decreases the performance of\naction recognition models but also complicates the data augmentation process,\nremains under-explored. In this study, we introduce the 4A\n(Action Animation-based Augmentation Approach), an innovative pipeline for data\naugmentation to address the problem. The main contributions of our work\ninclude: (1) we investigate the severe performance decrease of action\nrecognition models trained on discontinuous video, and the limitations of\nexisting augmentation methods in solving this problem. (2) we propose a novel\naugmentation pipeline, 4A, to address the problem of discontinuous video for\ntraining, while achieving a smoother and more natural-looking action\nrepresentation than the latest data augmentation methodology. (3) We achieve\nthe same performance with only 10% of the original data for training as with\nall of the original data from the real-world dataset, and better performance on\nin-the-wild videos, by employing our data augmentation techniques.\n","authors":["Xingyu Song","Zhan Li","Shi Chen","Xin-Qiang Cai","Kazuyuki Demachi"],"pdf_url":"https://arxiv.org/pdf/2404.06741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.02736v4","updated":"2024-04-10T04:51:33Z","published":"2022-11-04T20:22:58Z","title":"Discovering Closed-Loop Failures of Vision-Based Controllers via\n Reachability Analysis","summary":" Machine learning driven image-based controllers allow robotic systems to take\nintelligent actions based on the visual feedback from their environment.\nUnderstanding when these controllers might lead to system safety violations is\nimportant for their integration in safety-critical applications and engineering\ncorrective safety measures for the system. Existing methods leverage\nsimulation-based testing (or falsification) to find the failures of\nvision-based controllers, i.e., the visual inputs that lead to closed-loop\nsafety violations. However, these techniques do not scale well to the scenarios\ninvolving high-dimensional and complex visual inputs, such as RGB images. In\nthis work, we cast the problem of finding closed-loop vision failures as a\nHamilton-Jacobi (HJ) reachability problem. 
Our approach blends simulation-based\nanalysis with HJ reachability methods to compute an approximation of the\nbackward reachable tube (BRT) of the system, i.e., the set of unsafe states for\nthe system under vision-based controllers. Utilizing the BRT, we can tractably\nand systematically find the system states and corresponding visual inputs that\nlead to closed-loop failures. These visual inputs can be subsequently analyzed\nto find the input characteristics that might have caused the failure. Besides\nits scalability to high-dimensional visual inputs, an explicit computation of\nBRT allows the proposed approach to capture non-trivial system failures that\nare difficult to expose via random simulations. We demonstrate our framework on\ntwo case studies involving an RGB image-based neural network controller for (a)\nautonomous indoor navigation, and (b) autonomous aircraft taxiing.\n","authors":["Kaustav Chakraborty","Somil Bansal"],"pdf_url":"https://arxiv.org/pdf/2211.02736v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01289v3","updated":"2024-04-10T04:42:10Z","published":"2023-06-02T06:15:36Z","title":"nnMobileNe: Rethinking CNN for Retinopathy Research","summary":" Over the past few decades, convolutional neural networks (CNNs) have been at\nthe forefront of the detection and tracking of various retinal diseases (RD).\nDespite their success, the emergence of vision transformers (ViT) in the 2020s\nhas shifted the trajectory of RD model development. The leading-edge\nperformance of ViT-based models in RD can be largely credited to their\nscalability-their ability to improve as more parameters are added. As a result,\nViT-based models tend to outshine traditional CNNs in RD applications, albeit\nat the cost of increased data and computational demands. ViTs also differ from\nCNNs in their approach to processing images, working with patches rather than\nlocal regions, which can complicate the precise localization of small, variably\npresented lesions in RD. In our study, we revisited and updated the\narchitecture of a CNN model, specifically MobileNet, to enhance its utility in\nRD diagnostics. We found that an optimized MobileNet, through selective\nmodifications, can surpass ViT-based models in various RD benchmarks, including\ndiabetic retinopathy grading, detection of multiple fundus diseases, and\nclassification of diabetic macular edema. The code is available at\nhttps://github.com/Retinal-Research/NN-MOBILENET\n","authors":["Wenhui Zhu","Peijie Qiu","Xiwen Chen","Xin Li","Natasha Lepore","Oana M. Dumitrascu","Yalin Wang"],"pdf_url":"https://arxiv.org/pdf/2306.01289v3.pdf","comment":"Accepted as a conference paper to 2024 CVPRW"},{"id":"http://arxiv.org/abs/2404.06727v1","updated":"2024-04-10T04:24:42Z","published":"2024-04-10T04:24:42Z","title":"Bayesian NeRF: Quantifying Uncertainty with Volume Density in Neural\n Radiance Fields","summary":" We present the Bayesian Neural Radiance Field (NeRF), which explicitly\nquantifies uncertainty in geometric volume structures without the need for\nadditional networks, making it adept for challenging observations and\nuncontrolled images. NeRF diverges from traditional geometric methods by\noffering an enriched scene representation, rendering color and density in 3D\nspace from various viewpoints. However, NeRF encounters limitations in relaxing\nuncertainties by using geometric structure information, leading to inaccuracies\nin interpretation under insufficient real-world observations. 
Recent research\nefforts aimed at addressing this issue have primarily relied on empirical\nmethods or auxiliary networks. To fundamentally address this issue, we propose\na series of formulational extensions to NeRF. By introducing generalized\napproximations and defining density-related uncertainty, our method seamlessly\nextends to manage uncertainty not only for RGB but also for depth, without the\nneed for additional networks or empirical assumptions. In experiments we show\nthat our method significantly enhances performance on RGB and depth images in\nthe comprehensive dataset, demonstrating the reliability of the Bayesian NeRF\napproach to quantifying uncertainty based on the geometric structure.\n","authors":["Sibeak Lee","Kyeongsu Kang","Hyeonwoo Yu"],"pdf_url":"https://arxiv.org/pdf/2404.06727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03384v2","updated":"2024-04-10T04:24:36Z","published":"2024-04-04T11:33:29Z","title":"LongVLM: Efficient Long Video Understanding via Large Language Models","summary":" Empowered by Large Language Models (LLMs), recent advancements in VideoLLMs\nhave driven progress in various video understanding tasks. These models encode\nvideo representations through pooling or query aggregation over a vast number\nof visual tokens, making computational and memory costs affordable. Despite\nsuccessfully providing an overall comprehension of video content, existing\nVideoLLMs still face challenges in achieving detailed understanding in videos\ndue to overlooking local information in long-term videos. To tackle this\nchallenge, we introduce LongVLM, a straightforward yet powerful VideoLLM for\nlong video understanding, building upon the observation that long videos often\nconsist of sequential key events, complex actions, and camera movements. Our\napproach proposes to decompose long videos into multiple short-term segments\nand encode local features for each local segment via a hierarchical token\nmerging module. These features are concatenated in temporal order to maintain\nthe storyline across sequential short-term segments. Additionally, we propose\nto integrate global semantics into each local feature to enhance context\nunderstanding. In this way, we encode video representations that incorporate\nboth local and global information, enabling the LLM to generate comprehensive\nresponses for long-term videos. Experimental results on the VideoChatGPT\nbenchmark and zero-shot video question-answering datasets demonstrate the\nsuperior capabilities of our model over the previous state-of-the-art methods.\nQualitative examples demonstrate that our model produces more precise responses\nfor long videos understanding. Code will be available at\nhttps://github.com/ziplab/LongVLM.\n","authors":["Yuetian Weng","Mingfei Han","Haoyu He","Xiaojun Chang","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2404.03384v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11848v2","updated":"2024-04-10T04:05:24Z","published":"2024-03-18T15:00:38Z","title":"GraphBEV: Towards Robust BEV Feature Alignment for Multi-Modal 3D Object\n Detection","summary":" Integrating LiDAR and camera information into Bird's-Eye-View (BEV)\nrepresentation has emerged as a crucial aspect of 3D object detection in\nautonomous driving. However, existing methods are susceptible to the inaccurate\ncalibration relationship between LiDAR and the camera sensor. 
Such inaccuracies\nresult in errors in depth estimation for the camera branch, ultimately causing\nmisalignment between LiDAR and camera BEV features. In this work, we propose a\nrobust fusion framework called Graph BEV. Addressing errors caused by\ninaccurate point cloud projection, we introduce a Local Align module that\nemploys neighbor-aware depth features via Graph matching. Additionally, we\npropose a Global Align module to rectify the misalignment between LiDAR and\ncamera BEV features. Our Graph BEV framework achieves state-of-the-art\nperformance, with an mAP of 70.1\\%, surpassing BEV Fusion by 1.6\\% on the\nnuscenes validation set. Importantly, our Graph BEV outperforms BEV Fusion by\n8.3\\% under conditions with misalignment noise.\n","authors":["Ziying Song","Lei Yang","Shaoqing Xu","Lin Liu","Dongyang Xu","Caiyan Jia","Feiyang Jia","Li Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11848v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06194v2","updated":"2024-04-10T04:01:43Z","published":"2024-04-09T10:27:22Z","title":"Exploring the Potential of Large Foundation Models for Open-Vocabulary\n HOI Detection","summary":" Open-vocabulary human-object interaction (HOI) detection, which is concerned\nwith the problem of detecting novel HOIs guided by natural language, is crucial\nfor understanding human-centric scenes. However, prior zero-shot HOI detectors\noften employ the same levels of feature maps to model HOIs with varying\ndistances, leading to suboptimal performance in scenes containing human-object\npairs with a wide range of distances. In addition, these detectors primarily\nrely on category names and overlook the rich contextual information that\nlanguage can provide, which is essential for capturing open vocabulary concepts\nthat are typically rare and not well-represented by category names alone. In\nthis paper, we introduce a novel end-to-end open vocabulary HOI detection\nframework with conditional multi-level decoding and fine-grained semantic\nenhancement (CMD-SE), harnessing the potential of Visual-Language Models\n(VLMs). Specifically, we propose to model human-object pairs with different\ndistances with different levels of feature maps by incorporating a soft\nconstraint during the bipartite matching process. Furthermore, by leveraging\nlarge language models (LLMs) such as GPT models, we exploit their extensive\nworld knowledge to generate descriptions of human body part states for various\ninteractions. Then we integrate the generalizable and fine-grained semantics of\nhuman body parts to improve interaction recognition. Experimental results on\ntwo datasets, SWIG-HOI and HICO-DET, demonstrate that our proposed method\nachieves state-of-the-art results in open vocabulary HOI detection. The code\nand models are available at https://github.com/ltttpku/CMD-SE-release.\n","authors":["Ting Lei","Shaofeng Yin","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06194v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06715v1","updated":"2024-04-10T03:54:53Z","published":"2024-04-10T03:54:53Z","title":"Sparse Points to Dense Clouds: Enhancing 3D Detection with Limited LiDAR\n Data","summary":" 3D detection is a critical task that enables machines to identify and locate\nobjects in three-dimensional space. 
It has a broad range of applications in\nseveral fields, including autonomous driving, robotics and augmented reality.\nMonocular 3D detection is attractive as it requires only a single camera,\nhowever, it lacks the accuracy and robustness required for real world\napplications. High resolution LiDAR on the other hand, can be expensive and\nlead to interference problems in heavy traffic given their active\ntransmissions. We propose a balanced approach that combines the advantages of\nmonocular and point cloud-based 3D detection. Our method requires only a small\nnumber of 3D points, that can be obtained from a low-cost, low-resolution\nsensor. Specifically, we use only 512 points, which is just 1% of a full LiDAR\nframe in the KITTI dataset. Our method reconstructs a complete 3D point cloud\nfrom this limited 3D information combined with a single image. The\nreconstructed 3D point cloud and corresponding image can be used by any\nmulti-modal off-the-shelf detector for 3D object detection. By using the\nproposed network architecture with an off-the-shelf multi-modal 3D detector,\nthe accuracy of 3D detection improves by 20% compared to the state-of-the-art\nmonocular detection methods and 6% to 9% compare to the baseline multi-modal\nmethods on KITTI and JackRabbot datasets.\n","authors":["Aakash Kumar","Chen Chen","Ajmal Mian","Neils Lobo","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2404.06715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01929v2","updated":"2024-04-10T03:36:33Z","published":"2024-04-02T13:23:21Z","title":"Towards Enhanced Analysis of Lung Cancer Lesions in EBUS-TBNA -- A\n Semi-Supervised Video Object Detection Method","summary":" This study aims to establish a computer-aided diagnostic system for lung\nlesions using bronchoscope endobronchial ultrasound (EBUS) to assist physicians\nin identifying lesion areas. During EBUS-transbronchial needle aspiration\n(EBUS-TBNA) procedures, physicians rely on grayscale ultrasound images to\ndetermine the location of lesions. However, these images often contain\nsignificant noise and can be influenced by surrounding tissues or blood\nvessels, making interpretation challenging. Previous research has lacked the\napplication of object detection models to EBUS-TBNA, and there has been no\nwell-defined solution for annotating the EBUS-TBNA dataset. In related studies\non ultrasound images, although models have been successful in capturing target\nregions for their respective tasks, their training and predictions have been\nbased on two-dimensional images, limiting their ability to leverage temporal\nfeatures for improved predictions. This study introduces a three-dimensional\nimage-based object detection model. It utilizes an attention mechanism to\ncapture temporal correlations and we will implements a filtering mechanism to\nselect relevant information from previous frames. Subsequently, a\nteacher-student model training approach is employed to optimize the model\nfurther, leveraging unlabeled data. 
To mitigate the impact of poor-quality\npseudo-labels on the student model, we will add a special Gaussian Mixture\nModel (GMM) to ensure the quality of pseudo-labels.\n","authors":["Jyun-An Lin","Yun-Chien Cheng","Ching-Kai Lin"],"pdf_url":"https://arxiv.org/pdf/2404.01929v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06080v2","updated":"2024-04-10T03:35:35Z","published":"2024-04-09T07:39:21Z","title":"Using Few-Shot Learning to Classify Primary Lung Cancer and Other\n Malignancy with Lung Metastasis in Cytological Imaging via Endobronchial\n Ultrasound Procedures","summary":" This study aims to establish a computer-aided diagnosis system for\nendobronchial ultrasound (EBUS) surgery to assist physicians in the preliminary\ndiagnosis of metastatic cancer. This involves arranging immediate examinations\nfor other sites of metastatic cancer after EBUS surgery, eliminating the need\nto wait for reports, thereby shortening the waiting time by more than half and\nenabling patients to detect other cancers earlier, allowing for early planning\nand implementation of treatment plans. Unlike previous studies on cell image\nclassification, which have abundant datasets for training, this study must also\nbe able to make effective classifications despite the limited amount of case\ndata for lung metastatic cancer. In the realm of small data set classification\nmethods, Few-shot learning (FSL) has become mainstream in recent years. Through\nits ability to train on small datasets and its strong generalization\ncapabilities, FSL shows potential in this task of lung metastatic cell image\nclassification. This study will adopt the approach of Few-shot learning,\nreferencing existing proposed models, and designing a model architecture for\nclassifying lung metastases cell images. Batch Spectral Regularization (BSR)\nwill be incorporated as a loss update parameter, and the Finetune method of PMF\nwill be modified. In terms of test results, the addition of BSR and the\nmodified Finetune method further increases the accuracy by 8.89% to 65.60%,\noutperforming other FSL methods. This study confirms that FSL is superior to\nsupervised and transfer learning in classifying metastatic cancer and\ndemonstrates that using BSR as a loss function and modifying Finetune can\nenhance the model's capabilities.\n","authors":["Ching-Kai Lin","Di-Chun Wei","Yun-Chien Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.06080v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06038v2","updated":"2024-04-10T03:27:04Z","published":"2023-07-12T09:33:21Z","title":"Pyramid Deep Fusion Network for Two-Hand Reconstruction from RGB-D\n Images","summary":" Accurately recovering the dense 3D mesh of both hands from monocular images\nposes considerable challenges due to occlusions and projection ambiguity. Most\nof the existing methods extract features from color images to estimate the\nroot-aligned hand meshes, which neglect the crucial depth and scale information\nin the real world. Given the noisy sensor measurements with limited resolution,\ndepth-based methods predict 3D keypoints rather than a dense mesh. These\nlimitations motivate us to take advantage of these two complementary inputs to\nacquire dense hand meshes on a real-world scale. In this work, we propose an\nend-to-end framework for recovering dense meshes for both hands, which employ\nsingle-view RGB-D image pairs as input. 
The primary challenge lies in\neffectively utilizing two different input modalities to mitigate the blurring\neffects in RGB images and noises in depth images. Instead of directly treating\ndepth maps as additional channels for RGB images, we encode the depth\ninformation into the unordered point cloud to preserve more geometric details.\nSpecifically, our framework employs ResNet50 and PointNet++ to derive features\nfrom RGB and point cloud, respectively. Additionally, we introduce a novel\npyramid deep fusion network (PDFNet) to aggregate features at different scales,\nwhich demonstrates superior efficacy compared to previous fusion strategies.\nFurthermore, we employ a GCN-based decoder to process the fused features and\nrecover the corresponding 3D pose and dense mesh. Through comprehensive\nablation experiments, we have not only demonstrated the effectiveness of our\nproposed fusion algorithm but also outperformed the state-of-the-art approaches\non publicly available datasets. To reproduce the results, we will make our\nsource code and models publicly available at\n{https://github.com/zijinxuxu/PDFNet}.\n","authors":["Jinwei Ren","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.06038v2.pdf","comment":"Accepted by TCSVT"},{"id":"http://arxiv.org/abs/2404.06704v1","updated":"2024-04-10T03:20:33Z","published":"2024-04-10T03:20:33Z","title":"Convolution-based Probability Gradient Loss for Semantic Segmentation","summary":" In this paper, we introduce a novel Convolution-based Probability Gradient\n(CPG) loss for semantic segmentation. It employs convolution kernels similar to\nthe Sobel operator, capable of computing the gradient of pixel intensity in an\nimage. This enables the computation of gradients for both ground-truth and\npredicted category-wise probabilities. It enhances network performance by\nmaximizing the similarity between these two probability gradients. Moreover, to\nspecifically enhance accuracy near the object's boundary, we extract the object\nboundary based on the ground-truth probability gradient and exclusively apply\nthe CPG loss to pixels belonging to boundaries. CPG loss proves to be highly\nconvenient and effective. It establishes pixel relationships through\nconvolution, calculating errors from a distinct dimension compared to\npixel-wise loss functions such as cross-entropy loss. We conduct qualitative\nand quantitative analyses to evaluate the impact of the CPG loss on three\nwell-established networks (DeepLabv3-Resnet50, HRNetV2-OCR, and\nLRASPP_MobileNet_V3_Large) across three standard segmentation datasets\n(Cityscapes, COCO-Stuff, ADE20K). Our extensive experimental results\nconsistently and significantly demonstrate that the CPG loss enhances the mean\nIntersection over Union.\n","authors":["Guohang Shan","Shuangcheng Jia"],"pdf_url":"https://arxiv.org/pdf/2404.06704v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.06700v1","updated":"2024-04-10T03:11:10Z","published":"2024-04-10T03:11:10Z","title":"Scaling Multi-Camera 3D Object Detection through Weak-to-Strong\n Eliciting","summary":" The emergence of Multi-Camera 3D Object Detection (MC3D-Det), facilitated by\nbird's-eye view (BEV) representation, signifies a notable progression in 3D\nobject detection. Scaling MC3D-Det training effectively accommodates varied\ncamera parameters and urban landscapes, paving the way for the MC3D-Det\nfoundation model. 
However, the multi-view fusion stage of the MC3D-Det method\nrelies on the ill-posed monocular perception during training rather than\nsurround refinement ability, leading to what we term \"surround refinement\ndegradation\". To this end, our study presents a weak-to-strong eliciting\nframework aimed at enhancing surround refinement while maintaining robust\nmonocular perception. Specifically, our framework employs weakly tuned experts\ntrained on distinct subsets, and each is inherently biased toward specific\ncamera configurations and scenarios. These biased experts can learn the\nperception of monocular degeneration, which can help the multi-view fusion\nstage to enhance surround refinement abilities. Moreover, a composite\ndistillation strategy is proposed to integrate the universal knowledge of 2D\nfoundation models and task-specific information. Finally, for MC3D-Det joint\ntraining, the elaborate dataset merge strategy is designed to solve the problem\nof inconsistent camera numbers and camera parameters. We set up a multiple\ndataset joint training benchmark for MC3D-Det and adequately evaluated existing\nmethods. Further, we demonstrate the proposed framework brings a generalized\nand significant boost over multiple baselines. Our code is at\n\\url{https://github.com/EnVision-Research/Scale-BEV}.\n","authors":["Hao Lu","Jiaqi Tang","Xinli Xu","Xu Cao","Yunpeng Zhang","Guoqing Wang","Dalong Du","Hao Chen","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05645v2","updated":"2024-04-10T03:05:04Z","published":"2023-09-11T17:37:08Z","title":"CitDet: A Benchmark Dataset for Citrus Fruit Detection","summary":" In this letter, we present a new dataset to advance the state of the art in\ndetecting citrus fruit and accurately estimate yield on trees affected by the\nHuanglongbing (HLB) disease in orchard environments via imaging. Despite the\nfact that significant progress has been made in solving the fruit detection\nproblem, the lack of publicly available datasets has complicated direct\ncomparison of results. For instance, citrus detection has long been of interest\nto the agricultural research community, yet there is an absence of work,\nparticularly involving public datasets of citrus affected by HLB. To address\nthis issue, we enhance state-of-the-art object detection methods for use in\ntypical orchard settings. Concretely, we provide high-resolution images of\ncitrus trees located in an area known to be highly affected by HLB, along with\nhigh-quality bounding box annotations of citrus fruit. Fruit on both the trees\nand the ground are labeled to allow for identification of fruit location, which\ncontributes to advancements in yield estimation and potential measure of HLB\nimpact via fruit drop. The dataset consists of over 32,000 bounding box\nannotations for fruit instances contained in 579 high-resolution images. In\nsummary, our contributions are the following: (i) we introduce a novel dataset\nalong with baseline performance benchmarks on multiple contemporary object\ndetection algorithms, (ii) we show the ability to accurately capture fruit\nlocation on tree or on ground, and finally (ii) we present a correlation of our\nresults with yield estimations.\n","authors":["Jordan A. James","Heather K. Manching","Matthew R. Mattia","Kim D. Bowman","Amanda M. Hulse-Kemp","William J. 
Beksi"],"pdf_url":"https://arxiv.org/pdf/2309.05645v2.pdf","comment":"Submitted to IEEE Robotics and Automation Letters (RA-L)"},{"id":"http://arxiv.org/abs/2404.06693v1","updated":"2024-04-10T02:47:05Z","published":"2024-04-10T02:47:05Z","title":"Binomial Self-compensation for Motion Error in Dynamic 3D Scanning","summary":" Phase shifting profilometry (PSP) is favored in high-precision 3D scanning\ndue to its high accuracy, robustness, and pixel-wise property. However, a\nfundamental assumption of PSP that the object should remain static is violated\nin dynamic measurement, making PSP susceptible to object moving, resulting in\nripple-like errors in the point clouds. We propose a pixel-wise and frame-wise\nloopable binomial self-compensation (BSC) algorithm to effectively and flexibly\neliminate motion error in the four-step PSP. Our mathematical model\ndemonstrates that by summing successive motion-affected phase frames weighted\nby binomial coefficients, motion error exponentially diminishes as the binomial\norder increases, accomplishing automatic error compensation through the\nmotion-affected phase sequence, without the assistance of any intermediate\nvariable. Extensive experiments show that our BSC outperforms the existing\nmethods in reducing motion error, while achieving a depth map frame rate equal\nto the camera's acquisition rate (90 fps), enabling high-accuracy 3D\nreconstruction with a quasi-single-shot frame rate.\n","authors":["Geyou Zhang","Ce Zhu","Kai Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06692v1","updated":"2024-04-10T02:40:17Z","published":"2024-04-10T02:40:17Z","title":"Perception-Oriented Video Frame Interpolation via Asymmetric Blending","summary":" Previous methods for Video Frame Interpolation (VFI) have encountered\nchallenges, notably the manifestation of blur and ghosting effects. These\nissues can be traced back to two pivotal factors: unavoidable motion errors and\nmisalignment in supervision. In practice, motion estimates often prove to be\nerror-prone, resulting in misaligned features. Furthermore, the reconstruction\nloss tends to bring blurry results, particularly in misaligned regions. To\nmitigate these challenges, we propose a new paradigm called PerVFI\n(Perception-oriented Video Frame Interpolation). Our approach incorporates an\nAsymmetric Synergistic Blending module (ASB) that utilizes features from both\nsides to synergistically blend intermediate features. One reference frame\nemphasizes primary content, while the other contributes complementary\ninformation. To impose a stringent constraint on the blending process, we\nintroduce a self-learned sparse quasi-binary mask which effectively mitigates\nghosting and blur artifacts in the output. Additionally, we employ a\nnormalizing flow-based generator and utilize the negative log-likelihood loss\nto learn the conditional distribution of the output, which further facilitates\nthe generation of clear and fine details. Experimental results validate the\nsuperiority of PerVFI, demonstrating significant improvements in perceptual\nquality compared to existing methods. 
Codes are available at\n\\url{https://github.com/mulns/PerVFI}\n","authors":["Guangyang Wu","Xin Tao","Changlin Li","Wenyi Wang","Xiaohong Liu","Qingqing Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.06692v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2210.16101v2","updated":"2024-04-10T02:33:57Z","published":"2022-10-27T13:24:08Z","title":"A Generic Shared Attention Mechanism for Various Backbone Neural\n Networks","summary":" The self-attention mechanism has emerged as a critical component for\nimproving the performance of various backbone neural networks. However, current\nmainstream approaches individually incorporate newly designed self-attention\nmodules (SAMs) into each layer of the network for granted without fully\nexploiting their parameters' potential. This leads to suboptimal performance\nand increased parameter consumption as the network depth increases. To improve\nthis paradigm, in this paper, we first present a counterintuitive but inherent\nphenomenon: SAMs tend to produce strongly correlated attention maps across\ndifferent layers, with an average Pearson correlation coefficient of up to\n0.85. Inspired by this inherent observation, we propose Dense-and-Implicit\nAttention (DIA), which directly shares SAMs across layers and employs a long\nshort-term memory module to calibrate and bridge the highly correlated\nattention maps of different layers, thus improving the parameter utilization\nefficiency of SAMs. This design of DIA is also consistent with the neural\nnetwork's dynamical system perspective. Through extensive experiments, we\ndemonstrate that our simple yet effective DIA can consistently enhance various\nnetwork backbones, including ResNet, Transformer, and UNet, across tasks such\nas image classification, object detection, and image generation using diffusion\nmodels.\n","authors":["Zhongzhan Huang","Senwei Liang","Mingfu Liang","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2210.16101v2.pdf","comment":"Work in progress. arXiv admin note: text overlap with\n arXiv:1905.10671"},{"id":"http://arxiv.org/abs/2404.06493v2","updated":"2024-04-10T02:24:58Z","published":"2024-04-09T17:48:52Z","title":"Flying with Photons: Rendering Novel Views of Propagating Light","summary":" We present an imaging and neural rendering technique that seeks to synthesize\nvideos of light propagating through a scene from novel, moving camera\nviewpoints. Our approach relies on a new ultrafast imaging setup to capture a\nfirst-of-its kind, multi-viewpoint video dataset with picosecond-level temporal\nresolution. Combined with this dataset, we introduce an efficient neural volume\nrendering framework based on the transient field. This field is defined as a\nmapping from a 3D point and 2D direction to a high-dimensional, discrete-time\nsignal that represents time-varying radiance at ultrafast timescales. Rendering\nwith transient fields naturally accounts for effects due to the finite speed of\nlight, including viewpoint-dependent appearance changes caused by light\npropagation delays to the camera. We render a range of complex effects,\nincluding scattering, specular reflection, refraction, and diffraction.\nAdditionally, we demonstrate removing viewpoint-dependent propagation delays\nusing a time warping procedure, rendering of relativistic effects, and video\nsynthesis of direct and global components of light transport.\n","authors":["Anagh Malik","Noah Juravsky","Ryan Po","Gordon Wetzstein","Kiriakos N. Kutulakos","David B. 
Lindell"],"pdf_url":"https://arxiv.org/pdf/2404.06493v2.pdf","comment":"Project page: https://anaghmalik.com/FlyingWithPhotons/"},{"id":"http://arxiv.org/abs/2404.06507v2","updated":"2024-04-10T02:23:09Z","published":"2024-04-09T17:55:41Z","title":"Reconstructing Hand-Held Objects in 3D","summary":" Objects manipulated by the hand (i.e., manipulanda) are particularly\nchallenging to reconstruct from in-the-wild RGB images or videos. Not only does\nthe hand occlude much of the object, but also the object is often only visible\nin a small number of image pixels. At the same time, two strong anchors emerge\nin this setting: (1) estimated 3D hands help disambiguate the location and\nscale of the object, and (2) the set of manipulanda is small relative to all\npossible objects. With these insights in mind, we present a scalable paradigm\nfor handheld object reconstruction that builds on recent breakthroughs in large\nlanguage/vision models and 3D object datasets. Our model, MCC-Hand-Object\n(MCC-HO), jointly reconstructs hand and object geometry given a single RGB\nimage and inferred 3D hand as inputs. Subsequently, we use GPT-4(V) to retrieve\na 3D object model that matches the object in the image and rigidly align the\nmodel to the network-inferred geometry; we call this alignment\nRetrieval-Augmented Reconstruction (RAR). Experiments demonstrate that MCC-HO\nachieves state-of-the-art performance on lab and Internet datasets, and we show\nhow RAR can be used to automatically obtain 3D labels for in-the-wild images of\nhand-object interactions.\n","authors":["Jane Wu","Georgios Pavlakos","Georgia Gkioxari","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2404.06507v2.pdf","comment":"Project page: https://janehwu.github.io/mcc-ho"},{"id":"http://arxiv.org/abs/2311.10568v2","updated":"2024-04-10T02:19:19Z","published":"2023-11-17T15:08:15Z","title":"Phase Guided Light Field for Spatial-Depth High Resolution 3D Imaging","summary":" In 3D imaging, light field cameras are typically single-shot; however,\nthey heavily suffer from low spatial resolution and depth accuracy. In this\npaper, by employing an optical projector to project a single group of\nhigh-frequency phase-shifted sinusoid patterns, we propose a phase guided light\nfield algorithm to significantly improve both the spatial and depth resolutions\nfor off-the-shelf light field cameras. First, for correcting the axial\naberrations caused by the main lens of our light field camera, we propose a\ndeformed cone model to calibrate our structured light field system. Second,\nover wrapped phases computed from patterned images, we propose a stereo\nmatching algorithm, i.e., phase guided sum of absolute difference, to robustly\nobtain the correspondence for each pair of neighboring lenslets. Finally, by\nintroducing a virtual camera according to the basic geometrical optics of light\nfield imaging, we propose a reorganization strategy to reconstruct 3D point\nclouds with spatial-depth high resolution. 
Experimental results show that,\ncompared with state-of-the-art active light field methods, the proposed method\nreconstructs 3D point clouds at a spatial resolution of 1280$\\times$720, a\n10$\\times$ increase, while maintaining the same high depth resolution\nand needing merely a single group of high-frequency patterns.\n","authors":["Geyou Zhang","Ce Zhu","Kai Liu","Yipeng Liu"],"pdf_url":"https://arxiv.org/pdf/2311.10568v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06479v2","updated":"2024-04-10T02:12:27Z","published":"2024-04-09T17:30:18Z","title":"Text-Based Reasoning About Vector Graphics","summary":" While large multimodal models excel in broad vision-language benchmarks, they\noften struggle with tasks requiring precise perception of low-level visual\ndetails, such as comparing line lengths or solving simple mazes. In particular,\nthis failure mode persists in question-answering tasks about vector graphics --\nimages composed purely of 2D objects and shapes. To address this challenge, we\npropose the Visually Descriptive Language Model (VDLM), which performs\ntext-based reasoning about vector graphics. VDLM leverages Scalable Vector\nGraphics (SVG) for a more precise visual description and first uses an\noff-the-shelf raster-to-SVG algorithm for encoding. Since existing language\nmodels cannot understand raw SVGs in a zero-shot setting, VDLM then bridges SVG\nwith pretrained language models through a newly introduced intermediate\nsymbolic representation, Primal Visual Description (PVD), comprising primitive\nattributes (e.g., shape, position, measurement) with their corresponding\npredicted values. PVD is task-agnostic and represents visual primitives that\nare universal across all vector graphics. It can be learned with procedurally\ngenerated (SVG, PVD) pairs and also enables the direct use of LLMs for\ngeneralization to complex reasoning tasks. By casting an image to a text-based\nrepresentation, we can leverage the power of language models to learn alignment\nfrom SVG to visual primitives and generalize to unseen question-answering\ntasks. Empirical results show that VDLM achieves stronger zero-shot performance\ncompared to state-of-the-art LMMs, such as GPT-4V, in various low-level\nmultimodal perception and reasoning tasks on vector graphics. We additionally\npresent extensive analyses on VDLM's performance, demonstrating that our\nframework offers better interpretability due to its disentangled perception and\nreasoning processes. Project page: https://mikewangwzhl.github.io/VDLM/\n","authors":["Zhenhailong Wang","Joy Hsu","Xingyao Wang","Kuan-Hao Huang","Manling Li","Jiajun Wu","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2404.06479v2.pdf","comment":"Project page: https://mikewangwzhl.github.io/VDLM/"},{"id":"http://arxiv.org/abs/2404.06683v1","updated":"2024-04-10T02:03:14Z","published":"2024-04-10T02:03:14Z","title":"Unsupervised Visible-Infrared ReID via Pseudo-label Correction and\n Modality-level Alignment","summary":" Unsupervised visible-infrared person re-identification (UVI-ReID) has\nrecently gained great attention due to its potential for enhancing human\ndetection in diverse environments without labeling. Previous methods utilize\nintra-modality clustering and cross-modality feature matching to achieve\nUVI-ReID. 
However, there exist two challenges: 1) noisy pseudo labels might be\ngenerated in the clustering process, and 2) the cross-modality feature\nalignment via matching the marginal distribution of visible and infrared\nmodalities may misalign the different identities from two modalities. In this\npaper, we first conduct a theoretical analysis where an interpretable\ngeneralization upper bound is introduced. Based on the analysis, we then\npropose a novel unsupervised cross-modality person re-identification framework\n(PRAISE). Specifically, to address the first challenge, we propose a\npseudo-label correction strategy that utilizes a Beta Mixture Model to predict\nthe probability of mis-clustering based on the network's memory effect and\nrectifies the correspondence by adding a perceptual term to contrastive learning. Next,\nwe introduce a modality-level alignment strategy that generates paired\nvisible-infrared latent features and reduces the modality gap by aligning the\nlabeling function of visible and infrared features to learn identity\ndiscriminative and modality-invariant features. Experimental results on two\nbenchmark datasets demonstrate that our method achieves state-of-the-art\nperformance compared with the unsupervised visible-ReID methods.\n","authors":["Yexin Liu","Weiming Zhang","Athanasios V. Vasilakos","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06683v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.02065v2","updated":"2024-04-10T01:53:17Z","published":"2024-04-02T16:06:20Z","title":"Multi-Level Label Correction by Distilling Proximate Patterns for\n Semi-supervised Semantic Segmentation","summary":" Semi-supervised semantic segmentation relieves the reliance on large-scale\nlabeled data by leveraging unlabeled data. Recent semi-supervised semantic\nsegmentation approaches mainly resort to pseudo-labeling methods to exploit\nunlabeled data. However, unreliable pseudo-labeling can undermine the\nsemi-supervision processes. In this paper, we propose an algorithm called\nMulti-Level Label Correction (MLLC), which aims to use graph neural networks to\ncapture structural relationships in Semantic-Level Graphs (SLGs) and\nClass-Level Graphs (CLGs) to rectify erroneous pseudo-labels. Specifically,\nSLGs represent semantic affinities between pairs of pixel features, and CLGs\ndescribe classification consistencies between pairs of pixel labels. With the\nsupport of proximate pattern information from graphs, MLLC can rectify\nincorrectly predicted pseudo-labels and can facilitate discriminative feature\nrepresentations. We design an end-to-end network to train and perform this\neffective label correction mechanism. Experiments demonstrate that MLLC can\nsignificantly improve supervised baselines and outperform state-of-the-art\napproaches in different scenarios on Cityscapes and PASCAL VOC 2012 datasets.\nSpecifically, MLLC improves the supervised baseline by at least 5% and 2% with\nDeepLabV2 and DeepLabV3+ respectively under different partition protocols.\n","authors":["Hui Xiao","Yuting Hong","Li Dong","Diqun Yan","Jiayan Zhuang","Junjie Xiong","Dongtai Liang","Chengbin Peng"],"pdf_url":"https://arxiv.org/pdf/2404.02065v2.pdf","comment":"12 pages, 8 figures. 
IEEE Transactions on Multimedia, 2024"},{"id":"http://arxiv.org/abs/2301.04218v4","updated":"2024-04-10T01:11:15Z","published":"2023-01-10T21:50:26Z","title":"Leveraging Diffusion For Strong and High Quality Face Morphing Attacks","summary":" Face morphing attacks seek to deceive a Face Recognition (FR) system by\npresenting a morphed image consisting of the biometric qualities from two\ndifferent identities with the aim of triggering a false acceptance with one of\nthe two identities, thereby presenting a significant threat to biometric\nsystems. The success of a morphing attack is dependent on the ability of the\nmorphed image to represent the biometric characteristics of both identities\nthat were used to create the image. We present a novel morphing attack that\nuses a Diffusion-based architecture to improve the visual fidelity of the image\nand the ability of the morphing attack to represent characteristics from both\nidentities. We demonstrate the effectiveness of the proposed attack by\nevaluating its visual fidelity via the Frechet Inception Distance (FID). Also,\nextensive experiments are conducted to measure the vulnerability of FR systems\nto the proposed attack. The ability of a morphing attack detector to detect the\nproposed attack is measured and compared against two state-of-the-art GAN-based\nmorphing attacks along with two Landmark-based attacks. Additionally, a novel\nmetric to measure the relative strength between different morphing attacks is\nintroduced and evaluated.\n","authors":["Zander W. Blasingame","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2301.04218v4.pdf","comment":"Diffusion Morphs (DiM) paper. Accepted in IEEE TBIOM"},{"id":"http://arxiv.org/abs/2404.05215v2","updated":"2024-04-10T00:49:11Z","published":"2024-04-08T06:07:32Z","title":"Spatio-Temporal Attention and Gaussian Processes for Personalized Video\n Gaze Estimation","summary":" Gaze is an essential prompt for analyzing human behavior and attention.\nRecently, there has been an increasing interest in determining gaze direction\nfrom facial videos. However, video gaze estimation faces significant\nchallenges, such as understanding the dynamic evolution of gaze in video\nsequences, dealing with static backgrounds, and adapting to variations in\nillumination. To address these challenges, we propose a simple and novel deep\nlearning model designed to estimate gaze from videos, incorporating a\nspecialized attention module. Our method employs a spatial attention mechanism\nthat tracks spatial dynamics within videos. This technique enables accurate\ngaze direction prediction through a temporal sequence model, adeptly\ntransforming spatial observations into temporal insights, thereby significantly\nimproving gaze estimation accuracy. Additionally, our approach integrates\nGaussian processes to include individual-specific traits, facilitating the\npersonalization of our model with just a few labeled samples. Experimental\nresults confirm the efficacy of the proposed approach, demonstrating its\nsuccess in both within-dataset and cross-dataset settings. Specifically, our\nproposed approach achieves state-of-the-art performance on the Gaze360 dataset,\nimproving by $2.5^\\circ$ without personalization. Further, by personalizing the\nmodel with just three samples, we achieved an additional improvement of\n$0.8^\\circ$. 
The code and pre-trained models are available at\n\\url{https://github.com/jswati31/stage}.\n","authors":["Swati Jindal","Mohit Yadav","Roberto Manduchi"],"pdf_url":"https://arxiv.org/pdf/2404.05215v2.pdf","comment":"Accepted at CVPR 2024 Gaze workshop"},{"id":"http://arxiv.org/abs/2404.06666v1","updated":"2024-04-10T00:26:08Z","published":"2024-04-10T00:26:08Z","title":"SafeGen: Mitigating Unsafe Content Generation in Text-to-Image Models","summary":" Text-to-image (T2I) models, such as Stable Diffusion, have exhibited\nremarkable performance in generating high-quality images from text descriptions\nin recent years. However, text-to-image models may be tricked into generating\nnot-safe-for-work (NSFW) content, particularly in sexual scenarios. Existing\ncountermeasures mostly focus on filtering inappropriate inputs and outputs, or\nsuppressing improper text embeddings, which can block explicit NSFW-related\ncontent (e.g., naked or sexy) but may still be vulnerable to adversarial\nprompts inputs that appear innocent but are ill-intended. In this paper, we\npresent SafeGen, a framework to mitigate unsafe content generation by\ntext-to-image models in a text-agnostic manner. The key idea is to eliminate\nunsafe visual representations from the model regardless of the text input. In\nthis way, the text-to-image model is resistant to adversarial prompts since\nunsafe visual representations are obstructed from within. Extensive experiments\nconducted on four datasets demonstrate SafeGen's effectiveness in mitigating\nunsafe content generation while preserving the high-fidelity of benign images.\nSafeGen outperforms eight state-of-the-art baseline methods and achieves 99.1%\nsexual content removal performance. Furthermore, our constructed benchmark of\nadversarial prompts provides a basis for future development and evaluation of\nanti-NSFW-generation methods.\n","authors":["Xinfeng Li","Yuchen Yang","Jiangyi Deng","Chen Yan","Yanjiao Chen","Xiaoyu Ji","Wenyuan Xu"],"pdf_url":"https://arxiv.org/pdf/2404.06666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06665v1","updated":"2024-04-10T00:25:09Z","published":"2024-04-10T00:25:09Z","title":"Deep Generative Data Assimilation in Multimodal Setting","summary":" Robust integration of physical knowledge and data is key to improve\ncomputational simulations, such as Earth system models. Data assimilation is\ncrucial for achieving this goal because it provides a systematic framework to\ncalibrate model outputs with observations, which can include remote sensing\nimagery and ground station measurements, with uncertainty quantification.\nConventional methods, including Kalman filters and variational approaches,\ninherently rely on simplifying linear and Gaussian assumptions, and can be\ncomputationally expensive. Nevertheless, with the rapid adoption of data-driven\nmethods in many areas of computational sciences, we see the potential of\nemulating traditional data assimilation with deep learning, especially\ngenerative models. In particular, the diffusion-based probabilistic framework\nhas large overlaps with data assimilation principles: both allows for\nconditional generation of samples with a Bayesian inverse framework. These\nmodels have shown remarkable success in text-conditioned image generation or\nimage-controlled video synthesis. Likewise, one can frame data assimilation as\nobservation-conditioned state calibration. In this work, we propose SLAMS:\nScore-based Latent Assimilation in Multimodal Setting. 
Specifically, we\nassimilate in-situ weather station data and ex-situ satellite imagery to\ncalibrate the vertical temperature profiles, globally. Through extensive\nablation, we demonstrate that SLAMS is robust even in low-resolution, noisy,\nand sparse data settings. To our knowledge, our work is the first to apply deep\ngenerative framework for multimodal data assimilation using real-world\ndatasets; an important step for building robust computational simulators,\nincluding the next-generation Earth system models. Our code is available at:\nhttps://github.com/yongquan-qu/SLAMS\n","authors":["Yongquan Qu","Juan Nathaniel","Shuolin Li","Pierre Gentine"],"pdf_url":"https://arxiv.org/pdf/2404.06665v1.pdf","comment":"Accepted to CVPR2024 EarthVision"},{"id":"http://arxiv.org/abs/2404.06663v1","updated":"2024-04-10T00:11:03Z","published":"2024-04-10T00:11:03Z","title":"Multi-modal Document Presentation Attack Detection With Forensics Trace\n Disentanglement","summary":" Document Presentation Attack Detection (DPAD) is an important measure in\nprotecting the authenticity of a document image. However, recent DPAD methods\ndemand additional resources, such as manual effort in collecting additional\ndata or knowing the parameters of acquisition devices. This work proposes a\nDPAD method based on multi-modal disentangled traces (MMDT) without the above\ndrawbacks. We first disentangle the recaptured traces by a self-supervised\ndisentanglement and synthesis network to enhance the generalization capacity in\ndocument images with different contents and layouts. Then, unlike the existing\nDPAD approaches that rely only on data in the RGB domain, we propose to\nexplicitly employ the disentangled recaptured traces as new modalities in the\ntransformer backbone through adaptive multi-modal adapters to fuse RGB/trace\nfeatures efficiently. Visualization of the disentangled traces confirms the\neffectiveness of the proposed method in different document contents. Extensive\nexperiments on three benchmark datasets demonstrate the superiority of our MMDT\nmethod on representing forensic traces of recapturing distortion.\n","authors":["Changsheng Chen","Yongyi Deng","Liangwei Lin","Zitong Yu","Zhimao Lai"],"pdf_url":"https://arxiv.org/pdf/2404.06663v1.pdf","comment":"Accepted to ICME 2024"},{"id":"http://arxiv.org/abs/2404.06661v1","updated":"2024-04-10T00:05:55Z","published":"2024-04-10T00:05:55Z","title":"Efficient Denoising using Score Embedding in Score-based Diffusion\n Models","summary":" It is well known that training a denoising score-based diffusion models\nrequires tens of thousands of epochs and a substantial number of image data to\ntrain the model. In this paper, we propose to increase the efficiency in\ntraining score-based diffusion models. Our method allows us to decrease the\nnumber of epochs needed to train the diffusion model. We accomplish this by\nsolving the log-density Fokker-Planck (FP) Equation numerically to compute the\nscore \\textit{before} training. The pre-computed score is embedded into the\nimage to encourage faster training under slice Wasserstein distance.\nConsequently, it also allows us to decrease the number of images we need to\ntrain the neural network to learn an accurate score. We demonstrate through our\nnumerical experiments the improved performance of our proposed method compared\nto standard score-based diffusion models. Our proposed method achieves a\nsimilar quality to the standard method meaningfully faster.\n","authors":["Andrew S. Na","William Gao","Justin W. 
L. Wan"],"pdf_url":"https://arxiv.org/pdf/2404.06661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08017v1","updated":"2024-04-10T19:16:08Z","published":"2024-04-10T19:16:08Z","title":"AI-Guided Feature Segmentation Techniques to Model Features from Single\n Crystal Diamond Growth","summary":" Process refinement to consistently produce high-quality material over a large\narea of the grown crystal, enabling various applications from optics crystals\nto quantum detectors, has long been a goal for diamond growth. Machine learning\noffers a promising path toward this goal, but faces challenges such as the\ncomplexity of features within datasets, their time-dependency, and the volume\nof data produced per growth run. Accurate spatial feature extraction from image\nto image for real-time monitoring of diamond growth is crucial yet complicated\ndue to the low-volume and high feature complexity nature of the datasets. This\npaper compares various traditional and machine learning-driven approaches for\nfeature extraction in the diamond growth domain, proposing a novel deep\nlearning-driven semantic segmentation approach to isolate and classify accurate\npixel masks of geometric features like diamond, pocket holder, and background,\nalong with their derivative features based on shape and size. Using an\nannotation-focused human-in-the-loop software architecture for training\ndatasets, with modules for selective data labeling using active learning, data\naugmentations, and model-assisted labeling, our approach achieves effective\nannotation accuracy and drastically reduces labeling time and cost. Deep\nlearning algorithms prove highly efficient in accurately learning complex\nrepresentations from datasets with many features. Our top-performing model,\nbased on the DeeplabV3plus architecture, achieves outstanding accuracy in\nclassifying features of interest, with accuracies of 96.31% for pocket holder,\n98.60% for diamond top, and 91.64% for diamond side features.\n","authors":["Rohan Reddy Mekala","Elias Garratt","Matthias Muehle","Arjun Srinivasan","Adam Porter","Mikael Lindvall"],"pdf_url":"https://arxiv.org/pdf/2404.08017v1.pdf","comment":"12 pages,4 figures,ACMME 2024. arXiv admin note: substantial text\n overlap with arXiv:2404.07306"},{"id":"http://arxiv.org/abs/2404.08013v1","updated":"2024-04-10T15:37:15Z","published":"2024-04-10T15:37:15Z","title":"Enhanced Cooperative Perception for Autonomous Vehicles Using Imperfect\n Communication","summary":" Sharing and joint processing of camera feeds and sensor measurements, known\nas Cooperative Perception (CP), has emerged as a new technique to achieve\nhigher perception qualities. CP can enhance the safety of Autonomous Vehicles\n(AVs) where their individual visual perception quality is compromised by\nadverse weather conditions (haze as foggy weather), low illumination, winding\nroads, and crowded traffic. To cover the limitations of former methods, in this\npaper, we propose a novel approach to realize an optimized CP under constrained\ncommunications. At the core of our approach is recruiting the best helper from\nthe available list of front vehicles to augment the visual range and enhance\nthe Object Detection (OD) accuracy of the ego vehicle. In this two-step\nprocess, we first select the helper vehicles that contribute the most to CP\nbased on their visual range and lowest motion blur. Next, we implement a radio\nblock optimization among the candidate vehicles to further improve\ncommunication efficiency. 
We specifically focus on pedestrian detection as an\nexemplary scenario. To validate our approach, we used the CARLA simulator to\ncreate a dataset of annotated videos for different driving scenarios where\npedestrian detection is challenging for an AV with compromised vision. Our\nresults demonstrate the efficacy of our two-step optimization process in\nimproving the overall performance of cooperative perception in challenging\nscenarios, substantially improving driving safety under adverse conditions.\nFinally, we note that the networking assumptions are adopted from LTE Release\n14 Mode 4 side-link communication, commonly used for Vehicle-to-Vehicle (V2V)\ncommunication. Nonetheless, our method is flexible and applicable to arbitrary\nV2V communications.\n","authors":["Ahmad Sarlak","Hazim Alzorgan","Sayed Pedram Haeri Boroujeni","Abolfazl Razi","Rahul Amin"],"pdf_url":"https://arxiv.org/pdf/2404.08013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08011v1","updated":"2024-04-10T06:30:33Z","published":"2024-04-10T06:30:33Z","title":"An inclusive review on deep learning techniques and their scope in\n handwriting recognition","summary":" Deep learning denotes a category of machine learning algorithms that can\ncombine raw inputs into intermediate feature layers. These\ndeep learning algorithms have demonstrated great results in different fields.\nDeep learning has, in particular, achieved human-level\nperformance across a number of domains in computer vision and pattern\nrecognition. To achieve state-of-the-art performance in diverse\ndomains, deep learning uses different architectures, and these architectures\nuse activation functions to perform various computations between the hidden and\noutput layers of any architecture. This paper presents a survey of the existing\nstudies of deep learning in the handwriting recognition field. Even though\nrecent progress indicates that deep learning methods have provided valuable\nmeans for speeding up or providing accurate results in handwriting recognition,\nfollowing from the extensive literature survey, the present study finds\nthat deep learning has yet to revolutionize this field and still has to resolve\nmany of its most pressing challenges, although promising advances have been\nmade on the prior state of the art. Additionally, the inadequate availability of\nlabelled training data presents problems in this domain. Nevertheless, the\npresent handwriting recognition survey foresees deep learning enabling changes\nat both bench and bedside with the potential to transform several domains such as\nimage processing, speech recognition, computer vision, machine translation,\nrobotics and control, medical imaging, medical information processing,\nbio-informatics, natural language processing, cyber security, and many others.\n","authors":["Sukhdeep Singh","Sudhir Rohilla","Anuj Sharma"],"pdf_url":"https://arxiv.org/pdf/2404.08011v1.pdf","comment":null}]},"2024-04-11T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.07992v1","updated":"2024-04-11T17:59:59Z","published":"2024-04-11T17:59:59Z","title":"GoMVS: Geometrically Consistent Cost Aggregation for Multi-View Stereo","summary":" Matching cost aggregation plays a fundamental role in learning-based\nmulti-view stereo networks. However, directly aggregating adjacent costs can\nlead to suboptimal results due to local geometric inconsistency. 
Related\nmethods either seek selective aggregation or improve aggregated depth in the 2D\nspace, both are unable to handle geometric inconsistency in the cost volume\neffectively. In this paper, we propose GoMVS to aggregate geometrically\nconsistent costs, yielding better utilization of adjacent geometries. More\nspecifically, we correspond and propagate adjacent costs to the reference pixel\nby leveraging the local geometric smoothness in conjunction with surface\nnormals. We achieve this by the geometric consistent propagation (GCP) module.\nIt computes the correspondence from the adjacent depth hypothesis space to the\nreference depth space using surface normals, then uses the correspondence to\npropagate adjacent costs to the reference geometry, followed by a convolution\nfor aggregation. Our method achieves new state-of-the-art performance on DTU,\nTanks & Temple, and ETH3D datasets. Notably, our method ranks 1st on the Tanks\n& Temple Advanced benchmark.\n","authors":["Jiang Wu","Rui Li","Haofei Xu","Wenxun Zhao","Yu Zhu","Jinqiu Sun","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.07992v1.pdf","comment":"CVPR 2024. Project page: https://wuuu3511.github.io/gomvs/ Code:\n https://github.com/Wuuu3511/GoMVS"},{"id":"http://arxiv.org/abs/2404.07993v1","updated":"2024-04-11T17:59:59Z","published":"2024-04-11T17:59:59Z","title":"Connecting NeRFs, Images, and Text","summary":" Neural Radiance Fields (NeRFs) have emerged as a standard framework for\nrepresenting 3D scenes and objects, introducing a novel data type for\ninformation exchange and storage. Concurrently, significant progress has been\nmade in multimodal representation learning for text and image data. This paper\nexplores a novel research direction that aims to connect the NeRF modality with\nother modalities, similar to established methodologies for images and text. To\nthis end, we propose a simple framework that exploits pre-trained models for\nNeRF representations alongside multimodal models for text and image processing.\nOur framework learns a bidirectional mapping between NeRF embeddings and those\nobtained from corresponding images and text. This mapping unlocks several novel\nand useful applications, including NeRF zero-shot classification and NeRF\nretrieval from images or text.\n","authors":["Francesco Ballerini","Pierluigi Zama Ramirez","Roberto Mirabella","Samuele Salti","Luigi Di Stefano"],"pdf_url":"https://arxiv.org/pdf/2404.07993v1.pdf","comment":"Accepted at CVPRW-INRV 2024"},{"id":"http://arxiv.org/abs/2404.07991v1","updated":"2024-04-11T17:59:57Z","published":"2024-04-11T17:59:57Z","title":"GoMAvatar: Efficient Animatable Human Modeling from Monocular Video\n Using Gaussians-on-Mesh","summary":" We introduce GoMAvatar, a novel approach for real-time, memory-efficient,\nhigh-quality animatable human modeling. GoMAvatar takes as input a single\nmonocular video to create a digital avatar capable of re-articulation in new\nposes and real-time rendering from novel viewpoints, while seamlessly\nintegrating with rasterization-based graphics pipelines. Central to our method\nis the Gaussians-on-Mesh representation, a hybrid 3D model combining rendering\nquality and speed of Gaussian splatting with geometry modeling and\ncompatibility of deformable meshes. We assess GoMAvatar on ZJU-MoCap data and\nvarious YouTube videos. 
GoMAvatar matches or surpasses current monocular human\nmodeling algorithms in rendering quality and significantly outperforms them in\ncomputational efficiency (43 FPS) while being memory-efficient (3.63 MB per\nsubject).\n","authors":["Jing Wen","Xiaoming Zhao","Zhongzheng Ren","Alexander G. Schwing","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07991v1.pdf","comment":"CVPR 2024; project page: https://wenj.github.io/GoMAvatar/"},{"id":"http://arxiv.org/abs/2404.07990v1","updated":"2024-04-11T17:59:56Z","published":"2024-04-11T17:59:56Z","title":"OpenBias: Open-set Bias Detection in Text-to-Image Generative Models","summary":" Text-to-image generative models are becoming increasingly popular and\naccessible to the general public. As these models see large-scale deployments,\nit is necessary to deeply investigate their safety and fairness to not\ndisseminate and perpetuate any kind of biases. However, existing works focus on\ndetecting closed sets of biases defined a priori, limiting the studies to\nwell-known concepts. In this paper, we tackle the challenge of open-set bias\ndetection in text-to-image generative models presenting OpenBias, a new\npipeline that identifies and quantifies the severity of biases agnostically,\nwithout access to any precompiled set. OpenBias has three stages. In the first\nphase, we leverage a Large Language Model (LLM) to propose biases given a set\nof captions. Secondly, the target generative model produces images using the\nsame set of captions. Lastly, a Vision Question Answering model recognizes the\npresence and extent of the previously proposed biases. We study the behavior of\nStable Diffusion 1.5, 2, and XL emphasizing new biases, never investigated\nbefore. Via quantitative experiments, we demonstrate that OpenBias agrees with\ncurrent closed-set bias detection methods and human judgement.\n","authors":["Moreno D'Incà","Elia Peruzzo","Massimiliano Mancini","Dejia Xu","Vidit Goel","Xingqian Xu","Zhangyang Wang","Humphrey Shi","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2404.07990v1.pdf","comment":"CVPR 2024 Highlight - Code:\n https://github.com/Picsart-AI-Research/OpenBias"},{"id":"http://arxiv.org/abs/2404.07989v1","updated":"2024-04-11T17:59:45Z","published":"2024-04-11T17:59:45Z","title":"Any2Point: Empowering Any-modality Large Models for Efficient 3D\n Understanding","summary":" Large foundation models have recently emerged as a prominent focus of\ninterest, attaining superior performance in widespread scenarios. Due to the\nscarcity of 3D data, many efforts have been made to adapt pre-trained\ntransformers from vision to 3D domains. However, such 2D-to-3D approaches are\nstill limited, due to the potential loss of spatial geometries and high\ncomputation cost. More importantly, their frameworks are mainly designed for 2D\nmodels, lacking a general any-to-3D paradigm. In this paper, we introduce\nAny2Point, a parameter-efficient method to empower any-modality large models\n(vision, language, audio) for 3D understanding. Given a frozen transformer from\nany source modality, we propose a 3D-to-any (1D or 2D) virtual projection\nstrategy that correlates the input 3D points to the original 1D or 2D positions\nwithin the source modality. This mechanism enables us to assign each 3D token\nwith a positional encoding paired with the pre-trained model, which avoids 3D\ngeometry loss caused by the true projection and better motivates the\ntransformer for 3D learning with 1D/2D positional priors. 
Then, within each\ntransformer block, we insert an any-to-3D guided adapter module for\nparameter-efficient fine-tuning. The adapter incorporates prior spatial\nknowledge from the source modality to guide the local feature aggregation of 3D\ntokens, compelling the semantic adaption of any-modality transformers. We\nconduct extensive experiments to showcase the effectiveness and efficiency of\nour method. Code and models are released at\nhttps://github.com/Ivan-Tang-3D/Any2Point.\n","authors":["Yiwen Tang","Jiaming Liu","Dong Wang","Zhigang Wang","Shanghang Zhang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.07989v1.pdf","comment":"Code and models are released at\n https://github.com/Ivan-Tang-3D/Any2Point"},{"id":"http://arxiv.org/abs/2401.10222v2","updated":"2024-04-11T17:59:42Z","published":"2024-01-18T18:58:54Z","title":"Supervised Fine-tuning in turn Improves Visual Foundation Models","summary":" Image-text training like CLIP has dominated the pretraining of vision\nfoundation models in recent years. Subsequent efforts have been made to\nintroduce region-level visual learning into CLIP's pretraining but face\nscalability challenges due to the lack of large-scale region-level datasets.\nDrawing inspiration from supervised fine-tuning (SFT) in natural language\nprocessing such as instruction tuning, we explore the potential of fine-grained\nSFT in enhancing the generation of vision foundation models after their\npretraining. Thus a two-stage method ViSFT (Vision SFT) is proposed to unleash\nthe fine-grained knowledge of vision foundation models. In ViSFT, the vision\nfoundation model is enhanced by performing visual joint learning on some\nin-domain tasks and then tested on out-of-domain benchmarks. With updating\nusing ViSFT on 8 V100 GPUs in less than 2 days, a vision transformer with over\n4.4B parameters shows improvements across various out-of-domain benchmarks\nincluding vision and vision-linguistic scenarios.\n","authors":["Xiaohu Jiang","Yixiao Ge","Yuying Ge","Dachuan Shi","Chun Yuan","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2401.10222v2.pdf","comment":"23 pages, 3 figures, Project page:\n https://github.com/TencentARC/ViSFT/tree/main"},{"id":"http://arxiv.org/abs/2404.07988v1","updated":"2024-04-11T17:59:40Z","published":"2024-04-11T17:59:40Z","title":"QuasiSim: Parameterized Quasi-Physical Simulators for Dexterous\n Manipulations Transfer","summary":" We explore the dexterous manipulation transfer problem by designing\nsimulators. The task wishes to transfer human manipulations to dexterous robot\nhand simulations and is inherently difficult due to its intricate,\nhighly-constrained, and discontinuous dynamics and the need to control a\ndexterous hand with a DoF to accurately replicate human manipulations. Previous\napproaches that optimize in high-fidelity black-box simulators or a modified\none with relaxed constraints only demonstrate limited capabilities or are\nrestricted by insufficient simulation fidelity. We introduce parameterized\nquasi-physical simulators and a physics curriculum to overcome these\nlimitations. The key ideas are 1) balancing between fidelity and optimizability\nof the simulation via a curriculum of parameterized simulators, and 2) solving\nthe problem in each of the simulators from the curriculum, with properties\nranging from high task optimizability to high fidelity. 
We successfully enable\na dexterous hand to track complex and diverse manipulations in high-fidelity\nsimulated environments, boosting the success rate by 11\\%+ from the\nbest-performed baseline. The project website is available at\nhttps://meowuu7.github.io/QuasiSim/.\n","authors":["Xueyi Liu","Kangbo Lyu","Jieqiong Zhang","Tao Du","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2404.07988v1.pdf","comment":"Project website: https://meowuu7.github.io/QuasiSim/ Code:\n https://github.com/Meowuu7/QuasiSim Hugging Face Demo:\n https://huggingface.co/spaces/xymeow7/quasi-physical-sims"},{"id":"http://arxiv.org/abs/2404.07987v1","updated":"2024-04-11T17:59:09Z","published":"2024-04-11T17:59:09Z","title":"ControlNet++: Improving Conditional Controls with Efficient Consistency\n Feedback","summary":" To enhance the controllability of text-to-image diffusion models, existing\nefforts like ControlNet incorporated image-based conditional controls. In this\npaper, we reveal that existing methods still face significant challenges in\ngenerating images that align with the image conditional controls. To this end,\nwe propose ControlNet++, a novel approach that improves controllable generation\nby explicitly optimizing pixel-level cycle consistency between generated images\nand conditional controls. Specifically, for an input conditional control, we\nuse a pre-trained discriminative reward model to extract the corresponding\ncondition of the generated images, and then optimize the consistency loss\nbetween the input conditional control and extracted condition. A\nstraightforward implementation would be generating images from random noises\nand then calculating the consistency loss, but such an approach requires\nstoring gradients for multiple sampling timesteps, leading to considerable time\nand memory costs. To address this, we introduce an efficient reward strategy\nthat deliberately disturbs the input images by adding noise, and then uses the\nsingle-step denoised images for reward fine-tuning. This avoids the extensive\ncosts associated with image sampling, allowing for more efficient reward\nfine-tuning. Extensive experiments show that ControlNet++ significantly\nimproves controllability under various conditional controls. For example, it\nachieves improvements over ControlNet by 7.9% mIoU, 13.4% SSIM, and 7.6% RMSE,\nrespectively, for segmentation mask, line-art edge, and depth conditions.\n","authors":["Ming Li","Taojiannan Yang","Huafeng Kuang","Jie Wu","Zhaoning Wang","Xuefeng Xiao","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.07987v1.pdf","comment":"Project Page: https://liming-ai.github.io/ControlNet_Plus_Plus"},{"id":"http://arxiv.org/abs/2404.07985v1","updated":"2024-04-11T17:58:44Z","published":"2024-04-11T17:58:44Z","title":"WaveMo: Learning Wavefront Modulations to See Through Scattering","summary":" Imaging through scattering media is a fundamental and pervasive challenge in\nfields ranging from medical diagnostics to astronomy. A promising strategy to\novercome this challenge is wavefront modulation, which induces measurement\ndiversity during image acquisition. Despite its importance, designing optimal\nwavefront modulations to image through scattering remains under-explored. This\npaper introduces a novel learning-based framework to address the gap. Our\napproach jointly optimizes wavefront modulations and a computationally\nlightweight feedforward \"proxy\" reconstruction network. 
This network is trained\nto recover scenes obscured by scattering, using measurements that are modified\nby these modulations. The learned modulations produced by our framework\ngeneralize effectively to unseen scattering scenarios and exhibit remarkable\nversatility. During deployment, the learned modulations can be decoupled from\nthe proxy network to augment other more computationally expensive restoration\nalgorithms. Through extensive experiments, we demonstrate our approach\nsignificantly advances the state of the art in imaging through scattering\nmedia. Our project webpage is at https://wavemo-2024.github.io/.\n","authors":["Mingyang Xie","Haiyun Guo","Brandon Y. Feng","Lingbo Jin","Ashok Veeraraghavan","Christopher A. Metzler"],"pdf_url":"https://arxiv.org/pdf/2404.07985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07984v1","updated":"2024-04-11T17:58:11Z","published":"2024-04-11T17:58:11Z","title":"View Selection for 3D Captioning via Diffusion Ranking","summary":" Scalable annotation approaches are crucial for constructing extensive 3D-text\ndatasets, facilitating a broader range of applications. However, existing\nmethods sometimes lead to the generation of hallucinated captions, compromising\ncaption quality. This paper explores the issue of hallucination in 3D object\ncaptioning, with a focus on Cap3D method, which renders 3D objects into 2D\nviews for captioning using pre-trained models. We pinpoint a major challenge:\ncertain rendered views of 3D objects are atypical, deviating from the training\ndata of standard image captioning models and causing hallucinations. To tackle\nthis, we present DiffuRank, a method that leverages a pre-trained text-to-3D\nmodel to assess the alignment between 3D objects and their 2D rendered views,\nwhere the view with high alignment closely represent the object's\ncharacteristics. By ranking all rendered views and feeding the top-ranked ones\ninto GPT4-Vision, we enhance the accuracy and detail of captions, enabling the\ncorrection of 200k captions in the Cap3D dataset and extending it to 1 million\ncaptions across Objaverse and Objaverse-XL datasets. Additionally, we showcase\nthe adaptability of DiffuRank by applying it to pre-trained text-to-image\nmodels for a Visual Question Answering task, where it outperforms the CLIP\nmodel.\n","authors":["Tiange Luo","Justin Johnson","Honglak Lee"],"pdf_url":"https://arxiv.org/pdf/2404.07984v1.pdf","comment":"Dataset link: https://huggingface.co/datasets/tiange/Cap3D"},{"id":"http://arxiv.org/abs/2404.07983v1","updated":"2024-04-11T17:58:06Z","published":"2024-04-11T17:58:06Z","title":"Two Effects, One Trigger: On the Modality Gap, Object Bias, and\n Information Imbalance in Contrastive Vision-Language Representation Learning","summary":" Contrastive vision-language models like CLIP have gained popularity for their\nversatile applicable learned representations in various downstream tasks.\nDespite their successes in some tasks, like zero-shot image recognition, they\nalso perform surprisingly poor on other tasks, like attribute detection.\nPrevious work has attributed these challenges to the modality gap, a separation\nof image and text in the shared representation space, and a bias towards\nobjects over other factors, such as attributes. In this work we investigate\nboth phenomena. We find that only a few embedding dimensions drive the modality\ngap. 
Further, we propose a measure for object bias and find that object bias\ndoes not lead to worse performance on other concepts, such as attributes. But\nwhat leads to the emergence of the modality gap and object bias? To answer this\nquestion we carefully designed an experimental setting which allows us to\ncontrol the amount of shared information between the modalities. This revealed\nthat the driving factor behind both, the modality gap and the object bias, is\nthe information imbalance between images and captions.\n","authors":["Simon Schrodi","David T. Hoffmann","Max Argus","Volker Fischer","Thomas Brox"],"pdf_url":"https://arxiv.org/pdf/2404.07983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07977v1","updated":"2024-04-11T17:57:19Z","published":"2024-04-11T17:57:19Z","title":"Gaga: Group Any Gaussians via 3D-aware Memory Bank","summary":" We introduce Gaga, a framework that reconstructs and segments open-world 3D\nscenes by leveraging inconsistent 2D masks predicted by zero-shot segmentation\nmodels. Contrasted to prior 3D scene segmentation approaches that heavily rely\non video object tracking, Gaga utilizes spatial information and effectively\nassociates object masks across diverse camera poses. By eliminating the\nassumption of continuous view changes in training images, Gaga demonstrates\nrobustness to variations in camera poses, particularly beneficial for sparsely\nsampled images, ensuring precise mask label consistency. Furthermore, Gaga\naccommodates 2D segmentation masks from diverse sources and demonstrates robust\nperformance with different open-world zero-shot segmentation models, enhancing\nits versatility. Extensive qualitative and quantitative evaluations demonstrate\nthat Gaga performs favorably against state-of-the-art methods, emphasizing its\npotential for real-world applications such as scene understanding and\nmanipulation.\n","authors":["Weijie Lyu","Xueting Li","Abhijit Kundu","Yi-Hsuan Tsai","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.07977v1.pdf","comment":"Project Page: https://www.gaga.gallery"},{"id":"http://arxiv.org/abs/2404.07976v1","updated":"2024-04-11T17:56:40Z","published":"2024-04-11T17:56:40Z","title":"Self-supervised Dataset Distillation: A Good Compression Is All You Need","summary":" Dataset distillation aims to compress information from a large-scale original\ndataset to a new compact dataset while striving to preserve the utmost degree\nof the original data informational essence. Previous studies have predominantly\nconcentrated on aligning the intermediate statistics between the original and\ndistilled data, such as weight trajectory, features, gradient, BatchNorm, etc.\nIn this work, we consider addressing this task through the new lens of model\ninformativeness in the compression stage on the original dataset pretraining.\nWe observe that with the prior state-of-the-art SRe$^2$L, as model sizes\nincrease, it becomes increasingly challenging for supervised pretrained models\nto recover learned information during data synthesis, as the channel-wise mean\nand variance inside the model are flatting and less informative. We further\nnotice that larger variances in BN statistics from self-supervised models\nenable larger loss signals to update the recovered data by gradients, enjoying\nmore informativeness during synthesis. 
Building on this observation, we\nintroduce SC-DD, a simple yet effective Self-supervised Compression framework\nfor Dataset Distillation that facilitates diverse information compression and\nrecovery compared to traditional supervised learning schemes, further reaps the\npotential of large pretrained models with enhanced capabilities. Extensive\nexperiments are conducted on CIFAR-100, Tiny-ImageNet and ImageNet-1K datasets\nto demonstrate the superiority of our proposed approach. The proposed SC-DD\noutperforms all previous state-of-the-art supervised dataset distillation\nmethods when employing larger models, such as SRe$^2$L, MTT, TESLA, DC, CAFE,\netc., by large margins under the same recovery and post-training budgets. Code\nis available at https://github.com/VILA-Lab/SRe2L/tree/main/SCDD/.\n","authors":["Muxin Zhou","Zeyuan Yin","Shitong Shao","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2404.07976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07973v1","updated":"2024-04-11T17:56:05Z","published":"2024-04-11T17:56:05Z","title":"Ferret-v2: An Improved Baseline for Referring and Grounding with Large\n Language Models","summary":" While Ferret seamlessly integrates regional understanding into the Large\nLanguage Model (LLM) to facilitate its referring and grounding capability, it\nposes certain limitations: constrained by the pre-trained fixed visual encoder\nand failed to perform well on broader tasks. In this work, we unveil Ferret-v2,\na significant upgrade to Ferret, with three key designs. (1) Any resolution\ngrounding and referring: A flexible approach that effortlessly handles higher\nimage resolution, improving the model's ability to process and understand\nimages in greater detail. (2) Multi-granularity visual encoding: By integrating\nthe additional DINOv2 encoder, the model learns better and diverse underlying\ncontexts for global and fine-grained visual information. (3) A three-stage\ntraining paradigm: Besides image-caption alignment, an additional stage is\nproposed for high-resolution dense alignment before the final instruction\ntuning. Experiments show that Ferret-v2 provides substantial improvements over\nFerret and other state-of-the-art methods, thanks to its high-resolution\nscaling and fine-grained visual processing.\n","authors":["Haotian Zhang","Haoxuan You","Philipp Dufter","Bowen Zhang","Chen Chen","Hong-You Chen","Tsu-Jui Fu","William Yang Wang","Shih-Fu Chang","Zhe Gan","Yinfei Yang"],"pdf_url":"https://arxiv.org/pdf/2404.07973v1.pdf","comment":"Preprint. 14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.07949v1","updated":"2024-04-11T17:46:14Z","published":"2024-04-11T17:46:14Z","title":"Taming Stable Diffusion for Text to 360° Panorama Image Generation","summary":" Generative models, e.g., Stable Diffusion, have enabled the creation of\nphotorealistic images from text prompts. Yet, the generation of 360-degree\npanorama images from text remains a challenge, particularly due to the dearth\nof paired text-panorama data and the domain gap between panorama and\nperspective images. In this paper, we introduce a novel dual-branch diffusion\nmodel named PanFusion to generate a 360-degree image from a text prompt. We\nleverage the stable diffusion model as one branch to provide prior knowledge in\nnatural image generation and register it to another panorama branch for\nholistic image generation. We propose a unique cross-attention mechanism with\nprojection awareness to minimize distortion during the collaborative denoising\nprocess. 
Our experiments validate that PanFusion surpasses existing methods\nand, thanks to its dual-branch structure, can integrate additional constraints\nlike room layout for customized panorama outputs. Code is available at\nhttps://chengzhag.github.io/publication/panfusion.\n","authors":["Cheng Zhang","Qianyi Wu","Camilo Cruz Gambardella","Xiaoshui Huang","Dinh Phung","Wanli Ouyang","Jianfei Cai"],"pdf_url":"https://arxiv.org/pdf/2404.07949v1.pdf","comment":"CVPR 2024. Project Page:\n https://chengzhag.github.io/publication/panfusion Code:\n https://github.com/chengzhag/PanFusion"},{"id":"http://arxiv.org/abs/2404.07933v1","updated":"2024-04-11T17:30:24Z","published":"2024-04-11T17:30:24Z","title":"Boosting Self-Supervision for Single-View Scene Completion via Knowledge\n Distillation","summary":" Inferring scene geometry from images via Structure from Motion is a\nlong-standing and fundamental problem in computer vision. While classical\napproaches and, more recently, depth map predictions only focus on the visible\nparts of a scene, the task of scene completion aims to reason about geometry\neven in occluded regions. With the popularity of neural radiance fields\n(NeRFs), implicit representations also became popular for scene completion by\npredicting so-called density fields. Unlike explicit approaches. e.g.\nvoxel-based methods, density fields also allow for accurate depth prediction\nand novel-view synthesis via image-based rendering. In this work, we propose to\nfuse the scene reconstruction from multiple images and distill this knowledge\ninto a more accurate single-view scene reconstruction. To this end, we propose\nMulti-View Behind the Scenes (MVBTS) to fuse density fields from multiple posed\nimages, trained fully self-supervised only from image data. Using knowledge\ndistillation, we use MVBTS to train a single-view scene completion network via\ndirect supervision called KDBTS. It achieves state-of-the-art performance on\noccupancy prediction, especially in occluded regions.\n","authors":["Keonhee Han","Dominik Muhle","Felix Wimbauer","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2404.07933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07932v1","updated":"2024-04-11T17:29:56Z","published":"2024-04-11T17:29:56Z","title":"FusionMamba: Efficient Image Fusion with State Space Model","summary":" Image fusion aims to generate a high-resolution multi/hyper-spectral image by\ncombining a high-resolution image with limited spectral information and a\nlow-resolution image with abundant spectral data. Current deep learning\n(DL)-based methods for image fusion primarily rely on CNNs or Transformers to\nextract features and merge different types of data. While CNNs are efficient,\ntheir receptive fields are limited, restricting their capacity to capture\nglobal context. Conversely, Transformers excel at learning global information\nbut are hindered by their quadratic complexity. Fortunately, recent\nadvancements in the State Space Model (SSM), particularly Mamba, offer a\npromising solution to this issue by enabling global awareness with linear\ncomplexity. However, there have been few attempts to explore the potential of\nSSM in information fusion, which is a crucial ability in domains like image\nfusion. Therefore, we propose FusionMamba, an innovative method for efficient\nimage fusion. Our contributions mainly focus on two aspects. 
Firstly,\nrecognizing that images from different sources possess distinct properties, we\nincorporate Mamba blocks into two U-shaped networks, presenting a novel\narchitecture that extracts spatial and spectral features in an efficient,\nindependent, and hierarchical manner. Secondly, to effectively combine spatial\nand spectral information, we extend the Mamba block to accommodate dual inputs.\nThis expansion leads to the creation of a new module called the FusionMamba\nblock, which outperforms existing fusion techniques such as concatenation and\ncross-attention. To validate FusionMamba's effectiveness, we conduct a series\nof experiments on five datasets related to three image fusion tasks. The\nquantitative and qualitative evaluation results demonstrate that our method\nachieves state-of-the-art (SOTA) performance, underscoring the superiority of\nFusionMamba.\n","authors":["Siran Peng","Xiangyu Zhu","Haoyu Deng","Zhen Lei","Liang-Jian Deng"],"pdf_url":"https://arxiv.org/pdf/2404.07932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07930v1","updated":"2024-04-11T17:27:39Z","published":"2024-04-11T17:27:39Z","title":"Parameter Hierarchical Optimization for Visible-Infrared Person\n Re-Identification","summary":" Visible-infrared person re-identification (VI-reID) aims at matching\ncross-modality pedestrian images captured by disjoint visible or infrared\ncameras. Existing methods alleviate the cross-modality discrepancies via\ndesigning different kinds of network architectures. Different from available\nmethods, in this paper, we propose a novel parameter optimizing paradigm,\nparameter hierarchical optimization (PHO) method, for the task of VI-ReID. It\nallows part of parameters to be directly optimized without any training, which\nnarrows the search space of parameters and makes the whole network more easier\nto be trained. Specifically, we first divide the parameters into different\ntypes, and then introduce a self-adaptive alignment strategy (SAS) to\nautomatically align the visible and infrared images through transformation.\nConsidering that features in different dimension have varying importance, we\ndevelop an auto-weighted alignment learning (AAL) module that can automatically\nweight features according to their importance. Importantly, in the alignment\nprocess of SAS and AAL, all the parameters are immediately optimized with\noptimization principles rather than training the whole network, which yields a\nbetter parameter training manner. Furthermore, we establish the cross-modality\nconsistent learning (CCL) loss to extract discriminative person representations\nwith translation consistency. We provide both theoretical justification and\nempirical evidence that our proposed PHO method outperform existing VI-reID\napproaches.\n","authors":["Zeng YU","Yunxiao Shi"],"pdf_url":"https://arxiv.org/pdf/2404.07930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07922v1","updated":"2024-04-11T17:09:28Z","published":"2024-04-11T17:09:28Z","title":"LaVy: Vietnamese Multimodal Large Language Model","summary":" Large Language Models (LLMs) and Multimodal Large language models (MLLMs)\nhave taken the world by storm with impressive abilities in complex reasoning\nand linguistic comprehension. Meanwhile there are plethora of works related to\nVietnamese Large Language Models, the lack of high-quality resources in\nmultimodality limits the progress of Vietnamese MLLMs. 
In this paper, we\npioneer in address this by introducing LaVy, a state-of-the-art Vietnamese\nMLLM, and we also introduce LaVy-Bench benchmark designated for evaluating\nMLLMs's understanding on Vietnamese visual language tasks. All code and model\nweights are public at https://github.com/baochi0212/LaVy\n","authors":["Chi Tran","Huong Le Thanh"],"pdf_url":"https://arxiv.org/pdf/2404.07922v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2401.04716v3","updated":"2024-04-11T16:46:52Z","published":"2024-01-09T18:40:52Z","title":"Low-Resource Vision Challenges for Foundation Models","summary":" Low-resource settings are well-established in natural language processing,\nwhere many languages lack sufficient data for deep learning at scale. However,\nlow-resource problems are under-explored in computer vision. In this paper, we\naddress this gap and explore the challenges of low-resource image tasks with\nvision foundation models. We first collect a benchmark of genuinely\nlow-resource image data, covering historic maps, circuit diagrams, and\nmechanical drawings. These low-resource settings all share three challenges:\ndata scarcity, fine-grained differences, and the distribution shift from\nnatural images to the specialized domain of interest. While existing foundation\nmodels have shown impressive generalizability, we find they cannot transfer\nwell to our low-resource tasks. To begin to tackle the challenges of\nlow-resource vision, we introduce one simple baseline per challenge.\nSpecifically, we i) enlarge the data space by generative models, ii) adopt the\nbest sub-kernels to encode local regions for fine-grained difference discovery\nand iii) learn attention for specialized domains. Experiments on our three\nlow-resource tasks demonstrate our proposals already provide a better baseline\nthan transfer learning, data augmentation, and fine-grained methods. This\nhighlights the unique characteristics and challenges of low-resource vision for\nfoundation models that warrant further investigation. Project page:\nhttps://xiaobai1217.github.io/Low-Resource-Vision/.\n","authors":["Yunhua Zhang","Hazel Doughty","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2401.04716v3.pdf","comment":"Accepted at CVPR2024"},{"id":"http://arxiv.org/abs/2401.08739v2","updated":"2024-04-11T16:35:22Z","published":"2024-01-16T18:55:22Z","title":"EgoGen: An Egocentric Synthetic Data Generator","summary":" Understanding the world in first-person view is fundamental in Augmented\nReality (AR). This immersive perspective brings dramatic visual changes and\nunique challenges compared to third-person views. Synthetic data has empowered\nthird-person-view vision models, but its application to embodied egocentric\nperception tasks remains largely unexplored. A critical challenge lies in\nsimulating natural human movements and behaviors that effectively steer the\nembodied cameras to capture a faithful egocentric representation of the 3D\nworld. To address this challenge, we introduce EgoGen, a new synthetic data\ngenerator that can produce accurate and rich ground-truth training data for\negocentric perception tasks. At the heart of EgoGen is a novel human motion\nsynthesis model that directly leverages egocentric visual inputs of a virtual\nhuman to sense the 3D environment. 
Combined with collision-avoiding motion\nprimitives and a two-stage reinforcement learning approach, our motion\nsynthesis model offers a closed-loop solution where the embodied perception and\nmovement of the virtual human are seamlessly coupled. Compared to previous\nworks, our model eliminates the need for a pre-defined global path, and is\ndirectly applicable to dynamic environments. Combined with our easy-to-use and\nscalable data generation pipeline, we demonstrate EgoGen's efficacy in three\ntasks: mapping and localization for head-mounted cameras, egocentric camera\ntracking, and human mesh recovery from egocentric views. EgoGen will be fully\nopen-sourced, offering a practical solution for creating realistic egocentric\ntraining data and aiming to serve as a useful tool for egocentric computer\nvision research. Refer to our project page: https://ego-gen.github.io/.\n","authors":["Gen Li","Kaifeng Zhao","Siwei Zhang","Xiaozhong Lyu","Mihai Dusmanu","Yan Zhang","Marc Pollefeys","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2401.08739v2.pdf","comment":"Accepted by CVPR 2024 (Oral). 23 pages, 17 figures. Project page:\n https://ego-gen.github.io/"},{"id":"http://arxiv.org/abs/2404.07887v1","updated":"2024-04-11T16:17:36Z","published":"2024-04-11T16:17:36Z","title":"Context-aware Video Anomaly Detection in Long-Term Datasets","summary":" Video anomaly detection research is generally evaluated on short, isolated\nbenchmark videos only a few minutes long. However, in real-world environments,\nsecurity cameras observe the same scene for months or years at a time, and the\nnotion of anomalous behavior critically depends on context, such as the time of\nday, day of week, or schedule of events. Here, we propose a context-aware video\nanomaly detection algorithm, Trinity, specifically targeted to these scenarios.\nTrinity is especially well-suited to crowded scenes in which individuals cannot\nbe easily tracked, and anomalies are due to speed, direction, or absence of\ngroup motion. Trinity is a contrastive learning framework that aims to learn\nalignments between context, appearance, and motion, and uses alignment quality\nto classify videos as normal or anomalous. We evaluate our algorithm on both\nconventional benchmarks and a public webcam-based dataset we collected that\nspans more than three months of activity.\n","authors":["Zhengye Yang","Richard Radke"],"pdf_url":"https://arxiv.org/pdf/2404.07887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06564v2","updated":"2024-04-11T16:06:39Z","published":"2024-04-09T18:28:55Z","title":"MambaAD: Exploring State Space Models for Multi-class Unsupervised\n Anomaly Detection","summary":" Recent advancements in anomaly detection have seen the efficacy of CNN- and\ntransformer-based approaches. However, CNNs struggle with long-range\ndependencies, while transformers are burdened by quadratic computational\ncomplexity. Mamba-based models, with their superior long-range modeling and\nlinear efficiency, have garnered substantial attention. This study pioneers the\napplication of Mamba to multi-class unsupervised anomaly detection, presenting\nMambaAD, which consists of a pre-trained encoder and a Mamba decoder featuring\n(Locality-Enhanced State Space) LSS modules at multi-scales. The proposed LSS\nmodule, integrating parallel cascaded (Hybrid State Space) HSS blocks and\nmulti-kernel convolutions operations, effectively captures both long-range and\nlocal information. 
The HSS block, utilizing (Hybrid Scanning) HS encoders,\nencodes feature maps into five scanning methods and eight directions, thereby\nstrengthening global connections through the (State Space Model) SSM. The use\nof Hilbert scanning and eight directions significantly improves feature\nsequence modeling. Comprehensive experiments on six diverse anomaly detection\ndatasets and seven metrics demonstrate state-of-the-art performance,\nsubstantiating the method's effectiveness.\n","authors":["Haoyang He","Yuhu Bai","Jiangning Zhang","Qingdong He","Hongxu Chen","Zhenye Gan","Chengjie Wang","Xiangtai Li","Guanzhong Tian","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2404.06564v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07867v1","updated":"2024-04-11T16:01:00Z","published":"2024-04-11T16:01:00Z","title":"The Power of Properties: Uncovering the Influential Factors in Emotion\n Classification","summary":" Facial expression-based human emotion recognition is a critical research area\nin psychology and medicine. State-of-the-art classification performance is only\nreached by end-to-end trained neural networks. Nevertheless, such black-box\nmodels lack transparency in their decision-making processes, prompting efforts\nto ascertain the rules that underlie classifiers' decisions. Analyzing single\ninputs alone fails to expose systematic learned biases. These biases can be\ncharacterized as facial properties summarizing abstract information like age or\nmedical conditions. Therefore, understanding a model's prediction behavior\nrequires an analysis rooted in causality along such selected properties. We\ndemonstrate that up to 91.25% of classifier output behavior changes are\nstatistically significant concerning basic properties. Among those are age,\ngender, and facial symmetry. Furthermore, the medical usage of surface\nelectromyography significantly influences emotion prediction. We introduce a\nworkflow to evaluate explicit properties and their impact. These insights might\nhelp medical professionals select and apply classifiers regarding their\nspecialized data and properties.\n","authors":["Tim Büchner","Niklas Penzel","Orlando Guntinas-Lichius","Joachim Denzler"],"pdf_url":"https://arxiv.org/pdf/2404.07867v1.pdf","comment":"8 pages, 3 tables, 1 figure, accepted at ICPRAI 2024"},{"id":"http://arxiv.org/abs/2404.06177v2","updated":"2024-04-11T15:57:52Z","published":"2024-04-09T09:58:10Z","title":"Uncertainty-aware Evidential Fusion-based Learning for Semi-supervised\n Medical Image Segmentation","summary":" Although the existing uncertainty-based semi-supervised medical segmentation\nmethods have achieved excellent performance, they usually only consider a\nsingle uncertainty evaluation, which often fails to solve the problem related\nto credibility completely. Therefore, based on the framework of evidential deep\nlearning, this paper integrates the evidential predictive results in the\ncross-region of mixed and original samples to reallocate the confidence degree\nand uncertainty measure of each voxel, which is realized by emphasizing\nuncertain information of probability assignments fusion rule of traditional\nevidence theory. Furthermore, we design a voxel-level asymptotic learning\nstrategy by introducing information entropy to combine with the fused\nuncertainty measure to estimate voxel prediction more precisely. The model will\ngradually pay attention to the prediction results with high uncertainty in the\nlearning process, to learn the features that are difficult to master. 
The\nexperimental results on LA, Pancreas-CT, ACDC and TBAD datasets demonstrate the\nsuperior performance of our proposed method in comparison with the existing\nstate of the arts.\n","authors":["Yuanpeng He","Lijian Li"],"pdf_url":"https://arxiv.org/pdf/2404.06177v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07855v1","updated":"2024-04-11T15:51:52Z","published":"2024-04-11T15:51:52Z","title":"Resolve Domain Conflicts for Generalizable Remote Physiological\n Measurement","summary":" Remote photoplethysmography (rPPG) technology has become increasingly popular\ndue to its non-invasive monitoring of various physiological indicators, making\nit widely applicable in multimedia interaction, healthcare, and emotion\nanalysis. Existing rPPG methods utilize multiple datasets for training to\nenhance the generalizability of models. However, they often overlook the\nunderlying conflict issues across different datasets, such as (1) label\nconflict resulting from different phase delays between physiological signal\nlabels and face videos at the instance level, and (2) attribute conflict\nstemming from distribution shifts caused by head movements, illumination\nchanges, skin types, etc. To address this, we introduce the DOmain-HArmonious\nframework (DOHA). Specifically, we first propose a harmonious phase strategy to\neliminate uncertain phase delays and preserve the temporal variation of\nphysiological signals. Next, we design a harmonious hyperplane optimization\nthat reduces irrelevant attribute shifts and encourages the model's\noptimization towards a global solution that fits more valid scenarios. Our\nexperiments demonstrate that DOHA significantly improves the performance of\nexisting methods under multiple protocols. Our code is available at\nhttps://github.com/SWY666/rPPG-DOHA.\n","authors":["Weiyu Sun","Xinyu Zhang","Hao Lu","Ying Chen","Yun Ge","Xiaolin Huang","Jie Yuan","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.07855v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2404.07850v1","updated":"2024-04-11T15:46:42Z","published":"2024-04-11T15:46:42Z","title":"MindBridge: A Cross-Subject Brain Decoding Framework","summary":" Brain decoding, a pivotal field in neuroscience, aims to reconstruct stimuli\nfrom acquired brain signals, primarily utilizing functional magnetic resonance\nimaging (fMRI). Currently, brain decoding is confined to a\nper-subject-per-model paradigm, limiting its applicability to the same\nindividual for whom the decoding model is trained. This constraint stems from\nthree key challenges: 1) the inherent variability in input dimensions across\nsubjects due to differences in brain size; 2) the unique intrinsic neural\npatterns, influencing how different individuals perceive and process sensory\ninformation; 3) limited data availability for new subjects in real-world\nscenarios hampers the performance of decoding models. In this paper, we present\na novel approach, MindBridge, that achieves cross-subject brain decoding by\nemploying only one model. Our proposed framework establishes a generic paradigm\ncapable of addressing these challenges by introducing biological-inspired\naggregation function and novel cyclic fMRI reconstruction mechanism for\nsubject-invariant representation learning. Notably, by cycle reconstruction of\nfMRI, MindBridge can enable novel fMRI synthesis, which also can serve as\npseudo data augmentation. 
Within the framework, we also devise a novel\nreset-tuning method for adapting a pretrained model to a new subject.\nExperimental results demonstrate MindBridge's ability to reconstruct images for\nmultiple subjects, which is competitive with dedicated subject-specific models.\nFurthermore, with limited data for a new subject, we achieve a high level of\ndecoding accuracy, surpassing that of subject-specific models. This advancement\nin cross-subject brain decoding suggests promising directions for wider\napplications in neuroscience and indicates potential for more efficient\nutilization of limited fMRI data in real-world scenarios. Project page:\nhttps://littlepure2333.github.io/MindBridge\n","authors":["Shizun Wang","Songhua Liu","Zhenxiong Tan","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07850v1.pdf","comment":"CVPR 2024 highlight. Code is available at\n https://github.com/littlepure2333/MindBridge"},{"id":"http://arxiv.org/abs/2404.07847v1","updated":"2024-04-11T15:42:53Z","published":"2024-04-11T15:42:53Z","title":"Fuss-Free Network: A Simplified and Efficient Neural Network for Crowd\n Counting","summary":" In the field of crowd-counting research, many recent deep learning based\nmethods have demonstrated robust capabilities for accurately estimating crowd\nsizes. However, the enhancement in their performance often arises from an\nincrease in the complexity of the model structure. This paper introduces the\nFuss-Free Network (FFNet), a crowd counting deep learning model that is\ncharacterized by its simplicity and efficiency in terms of its structure. The\nmodel comprises only a backbone of a neural network and a multi-scale feature\nfusion structure. The multi-scale feature fusion structure is a simple\narchitecture consisting of three branches, each only equipped with a focus\ntransition module, and combines the features from these branches through the\nconcatenation operation. Our proposed crowd counting model is trained and\nevaluated on four widely used public datasets, and it achieves accuracy that is\ncomparable to that of existing complex models. The experimental results further\nindicate that excellent performance in crowd counting tasks can also be\nachieved by utilizing a simple, low-parameter, and computationally efficient\nneural network structure.\n","authors":["Lei Chen","Xingen Gao"],"pdf_url":"https://arxiv.org/pdf/2404.07847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07846v1","updated":"2024-04-11T15:39:10Z","published":"2024-04-11T15:39:10Z","title":"TBSN: Transformer-Based Blind-Spot Network for Self-Supervised Image\n Denoising","summary":" Blind-spot networks (BSN) have been prevalent network architectures in\nself-supervised image denoising (SSID). Existing BSNs are mostly conducted with\nconvolution layers. Although transformers offer potential solutions to the\nlimitations of convolutions and have demonstrated success in various image\nrestoration tasks, their attention mechanisms may violate the blind-spot\nrequirement, thus restricting their applicability in SSID. In this paper, we\npresent a transformer-based blind-spot network (TBSN) by analyzing and\nredesigning the transformer operators that meet the blind-spot requirement.\nSpecifically, TBSN follows the architectural principles of dilated BSNs, and\nincorporates spatial as well as channel self-attention layers to enhance the\nnetwork capability. 
For spatial self-attention, an elaborate mask is applied to\nthe attention matrix to restrict its receptive field, thus mimicking the\ndilated convolution. For channel self-attention, we observe that it may leak\nthe blind-spot information when the channel number is greater than spatial size\nin the deep layers of multi-scale architectures. To eliminate this effect, we\ndivide the channel into several groups and perform channel attention\nseparately. Furthermore, we introduce a knowledge distillation strategy that\ndistills TBSN into smaller denoisers to improve computational efficiency while\nmaintaining performance. Extensive experiments on real-world image denoising\ndatasets show that TBSN largely extends the receptive field and exhibits\nfavorable performance against state-of-the-art SSID methods. The code and\npre-trained models will be publicly available at\nhttps://github.com/nagejacob/TBSN.\n","authors":["Junyi Li","Zhilu Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.07846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08890v3","updated":"2024-04-11T15:34:46Z","published":"2023-02-17T14:19:28Z","title":"Deep Learning for Event-based Vision: A Comprehensive Survey and\n Benchmarks","summary":" Event cameras are bio-inspired sensors that capture the per-pixel intensity\nchanges asynchronously and produce event streams encoding the time, pixel\nposition, and polarity (sign) of the intensity changes. Event cameras possess a\nmyriad of advantages over canonical frame-based cameras, such as high temporal\nresolution, high dynamic range, low latency, etc. Being capable of capturing\ninformation in challenging visual conditions, event cameras have the potential\nto overcome the limitations of frame-based cameras in the computer vision and\nrobotics community. In very recent years, deep learning (DL) has been brought\nto this emerging field and inspired active research endeavors in mining its\npotential. However, there is still a lack of taxonomies in DL techniques for\nevent-based vision. We first scrutinize the typical event representations with\nquality enhancement methods as they play a pivotal role as inputs to the DL\nmodels. We then provide a comprehensive survey of existing DL-based methods by\nstructurally grouping them into two major categories: 1) image/video\nreconstruction and restoration; 2) event-based scene understanding and 3D\nvision. We conduct benchmark experiments for the existing methods in some\nrepresentative research directions, i.e., image reconstruction, deblurring, and\nobject recognition, to identify some critical insights and problems. Finally,\nwe have discussions regarding the challenges and provide new perspectives for\ninspiring more research studies.\n","authors":["Xu Zheng","Yexin Liu","Yunfan Lu","Tongyan Hua","Tianbo Pan","Weiming Zhang","Dacheng Tao","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2302.08890v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06842v2","updated":"2024-04-11T15:28:36Z","published":"2024-04-10T09:14:28Z","title":"MoCha-Stereo: Motif Channel Attention Network for Stereo Matching","summary":" Learning-based stereo matching techniques have made significant progress.\nHowever, existing methods inevitably lose geometrical structure information\nduring the feature channel generation process, resulting in edge detail\nmismatches. In this paper, the Motif Channel Attention Stereo Matching Network\n(MoCha-Stereo) is designed to address this problem. 
We provide the Motif\nChannel Correlation Volume (MCCV) to determine more accurate edge matching\ncosts. MCCV is achieved by projecting motif channels, which capture common\ngeometric structures in feature channels, onto feature maps and cost volumes.\nIn addition, edge variations in potential feature channels of the\nreconstruction error map also affect detail matching, so we propose the\nReconstruction Error Motif Penalty (REMP) module to further refine the\nfull-resolution disparity estimation. REMP integrates the frequency information\nof typical channel features from the reconstruction error. MoCha-Stereo ranks\n1st on the KITTI-2015 and KITTI-2012 Reflective leaderboards. Our structure\nalso shows excellent performance in Multi-View Stereo. Code is available at\nhttps://github.com/ZYangChen/MoCha-Stereo.\n","authors":["Ziyang Chen","Wei Long","He Yao","Yongjun Zhang","Bingshu Wang","Yongbin Qin","Jia Wu"],"pdf_url":"https://arxiv.org/pdf/2404.06842v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2404.07833v1","updated":"2024-04-11T15:18:34Z","published":"2024-04-11T15:18:34Z","title":"Streamlined Photoacoustic Image Processing with Foundation Models: A\n Training-Free Solution","summary":" Foundation models have rapidly evolved and have achieved significant\naccomplishments in computer vision tasks. Specifically, the prompt mechanism\nconveniently allows users to integrate image prior information into the model,\nmaking it possible to apply models without any training. Therefore, we propose\na method based on foundation models and zero training to solve the tasks of\nphotoacoustic (PA) image segmentation. We employed the segment anything model\n(SAM) by setting simple prompts and integrating the model's outputs with prior\nknowledge of the imaged objects to accomplish various tasks, including: (1)\nremoving the skin signal in three-dimensional PA image rendering; (2) dual\nspeed-of-sound reconstruction, and (3) segmentation of finger blood vessels.\nThrough these demonstrations, we have concluded that deep learning can be\ndirectly applied in PA imaging without the requirement for network design and\ntraining. This potentially allows for a hands-on, convenient approach to\nachieving efficient and accurate segmentation of PA images. This letter serves\nas a comprehensive tutorial, facilitating the mastery of the technique through\nthe provision of code and sample datasets.\n","authors":["Handi Deng","Yucheng Zhou","Jiaxuan Xiang","Liujie Gu","Yan Luo","Hai Feng","Mingyuan Liu","Cheng Ma"],"pdf_url":"https://arxiv.org/pdf/2404.07833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07824v1","updated":"2024-04-11T15:09:22Z","published":"2024-04-11T15:09:22Z","title":"Heron-Bench: A Benchmark for Evaluating Vision Language Models in\n Japanese","summary":" Vision Language Models (VLMs) have undergone a rapid evolution, giving rise\nto significant advancements in the realm of multimodal understanding tasks.\nHowever, the majority of these models are trained and evaluated on\nEnglish-centric datasets, leaving a gap in the development and evaluation of\nVLMs for other languages, such as Japanese. This gap can be attributed to the\nlack of methodologies for constructing VLMs and the absence of benchmarks to\naccurately measure their performance. To address this issue, we introduce a\nnovel benchmark, Japanese Heron-Bench, for evaluating Japanese capabilities of\nVLMs. 
The Japanese Heron-Bench consists of a variety of image-question answer\npairs tailored to the Japanese context. Additionally, we present a baseline\nJapanese VLM that has been trained with Japanese visual instruction tuning\ndatasets. Our Heron-Bench reveals the strengths and limitations of the proposed\nVLM across various ability dimensions. Furthermore, we clarify the capability\ngap between strong closed models like GPT-4V and the baseline model, providing\nvaluable insights for future research in this domain. We release the benchmark\ndataset and training code to facilitate further developments in Japanese VLM\nresearch.\n","authors":["Yuichi Inoue","Kento Sasaki","Yuma Ochi","Kazuki Fujii","Kotaro Tanahashi","Yu Yamaguchi"],"pdf_url":"https://arxiv.org/pdf/2404.07824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07821v1","updated":"2024-04-11T15:00:55Z","published":"2024-04-11T15:00:55Z","title":"Sparse Laneformer","summary":" Lane detection is a fundamental task in autonomous driving, and has achieved\ngreat progress as deep learning emerges. Previous anchor-based methods often\ndesign dense anchors, which highly depend on the training dataset and remain\nfixed during inference. We analyze that dense anchors are not necessary for\nlane detection, and propose a transformer-based lane detection framework based\non a sparse anchor mechanism. To this end, we generate sparse anchors with\nposition-aware lane queries and angle queries instead of traditional explicit\nanchors. We adopt Horizontal Perceptual Attention (HPA) to aggregate the lane\nfeatures along the horizontal direction, and adopt Lane-Angle Cross Attention\n(LACA) to perform interactions between lane queries and angle queries. We also\npropose Lane Perceptual Attention (LPA) based on deformable cross attention to\nfurther refine the lane predictions. Our method, named Sparse Laneformer, is\neasy-to-implement and end-to-end trainable. Extensive experiments demonstrate\nthat Sparse Laneformer performs favorably against the state-of-the-art methods,\ne.g., surpassing Laneformer by 3.0% F1 score and O2SFormer by 0.7% F1 score\nwith fewer MACs on CULane with the same ResNet-34 backbone.\n","authors":["Ji Liu","Zifeng Zhang","Mingjie Lu","Hongyang Wei","Dong Li","Yile Xie","Jinzhang Peng","Lu Tian","Ashish Sirasao","Emad Barsoum"],"pdf_url":"https://arxiv.org/pdf/2404.07821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07807v1","updated":"2024-04-11T14:51:12Z","published":"2024-04-11T14:51:12Z","title":"Voice-Assisted Real-Time Traffic Sign Recognition System Using\n Convolutional Neural Network","summary":" Traffic signs are important in communicating information to drivers. Thus,\ncomprehension of traffic signs is essential for road safety and ignorance may\nresult in road accidents. Traffic sign detection has been a research spotlight\nover the past few decades. Real-time and accurate detections are the\npreliminaries of robust traffic sign detection system which is yet to be\nachieved. This study presents a voice-assisted real-time traffic sign\nrecognition system which is capable of assisting drivers. This system functions\nunder two subsystems. Initially, the detection and recognition of the traffic\nsigns are carried out using a trained Convolutional Neural Network (CNN). After\nrecognizing the specific traffic sign, it is narrated to the driver as a voice\nmessage using a text-to-speech engine. 
An efficient CNN model for a benchmark\ndataset is developed for real-time detection and recognition using Deep\nLearning techniques. The advantage of this system is that even if the driver\nmisses a traffic sign, or does not look at the traffic sign, or is unable to\ncomprehend the sign, the system detects it and narrates it to the driver. A\nsystem of this type is also important in the development of autonomous\nvehicles.\n","authors":["Mayura Manawadu","Udaya Wijenayake"],"pdf_url":"https://arxiv.org/pdf/2404.07807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07794v1","updated":"2024-04-11T14:35:59Z","published":"2024-04-11T14:35:59Z","title":"DGMamba: Domain Generalization via Generalized State Space Model","summary":" Domain generalization (DG) aims at solving distribution shift problems in\nvarious scenes. Existing approaches are based on Convolution Neural Networks\n(CNNs) or Vision Transformers (ViTs), which suffer from limited receptive\nfields or quadratic complexities issues. Mamba, as an emerging state space\nmodel (SSM), possesses superior linear complexity and global receptive fields.\nDespite this, it can hardly be applied to DG to address distribution shifts,\ndue to the hidden state issues and inappropriate scan mechanisms. In this\npaper, we propose a novel framework for DG, named DGMamba, that excels in\nstrong generalizability toward unseen domains and meanwhile has the advantages\nof global receptive fields, and efficient linear complexity. Our DGMamba\ncomprises two core components: Hidden State Suppressing (HSS) and\nSemantic-aware Patch refining (SPR). In particular, HSS is introduced to\nmitigate the influence of hidden states associated with domain-specific\nfeatures during output prediction. SPR strives to encourage the model to\nconcentrate more on objects rather than context, consisting of two designs:\nPrior-Free Scanning (PFS), and Domain Context Interchange (DCI). Concretely,\nPFS aims to shuffle the non-semantic patches within images, creating more\nflexible and effective sequences from images, and DCI is designed to regularize\nMamba with the combination of mismatched non-semantic and semantic information\nby fusing patches among domains. Extensive experiments on four commonly used DG\nbenchmarks demonstrate that the proposed DGMamba achieves remarkably superior\nresults to state-of-the-art models. The code will be made publicly available.\n","authors":["Shaocong Long","Qianyu Zhou","Xiangtai Li","Xuequan Lu","Chenhao Ying","Yuan Luo","Lizhuang Ma","Shuicheng Yan"],"pdf_url":"https://arxiv.org/pdf/2404.07794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07790v1","updated":"2024-04-11T14:31:11Z","published":"2024-04-11T14:31:11Z","title":"VIFNet: An End-to-end Visible-Infrared Fusion Network for Image Dehazing","summary":" Image dehazing poses significant challenges in environmental perception.\nRecent research mainly focuses on deep learning-based methods with single\nmodality, while they may result in severe information loss especially in\ndense-haze scenarios. The infrared image exhibits robustness to the haze,\nhowever, existing methods have primarily treated the infrared modality as\nauxiliary information, failing to fully explore its rich information in\ndehazing. To address this challenge, the key insight of this study is to design\na visible-infrared fusion network for image dehazing. 
In particular, we propose\na multi-scale Deep Structure Feature Extraction (DSFE) module, which\nincorporates the Channel-Pixel Attention Block (CPAB) to restore more spatial\nand marginal information within the deep structural features. Additionally, we\nintroduce an inconsistency weighted fusion strategy to merge the two modalities\nby leveraging the more reliable information. To validate this, we construct a\nvisible-infrared multimodal dataset called AirSim-VID based on the AirSim\nsimulation platform. Extensive experiments performed on challenging real and\nsimulated image datasets demonstrate that VIFNet can outperform many\nstate-of-the-art competing methods. The code and dataset are available at\nhttps://github.com/mengyu212/VIFNet_dehazing.\n","authors":["Meng Yu","Te Cui","Haoyang Lu","Yufeng Yue"],"pdf_url":"https://arxiv.org/pdf/2404.07790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07788v1","updated":"2024-04-11T14:29:30Z","published":"2024-04-11T14:29:30Z","title":"AUG: A New Dataset and An Efficient Model for Aerial Image Urban Scene\n Graph Generation","summary":" Scene graph generation (SGG) aims to understand the visual objects and their\nsemantic relationships from one given image. Until now, lots of SGG datasets\nwith the eyelevel view are released but the SGG dataset with the overhead view\nis scarcely studied. By contrast to the object occlusion problem in the\neyelevel view, which impedes the SGG, the overhead view provides a new\nperspective that helps to promote the SGG by providing a clear perception of\nthe spatial relationships of objects in the ground scene. To fill in the gap of\nthe overhead view dataset, this paper constructs and releases an aerial image\nurban scene graph generation (AUG) dataset. Images from the AUG dataset are\ncaptured with the low-attitude overhead view. In the AUG dataset, 25,594\nobjects, 16,970 relationships, and 27,175 attributes are manually annotated. To\navoid the local context being overwhelmed in the complex aerial urban scene,\nthis paper proposes one new locality-preserving graph convolutional network\n(LPG). Different from the traditional graph convolutional network, which has\nthe natural advantage of capturing the global context for SGG, the\nconvolutional layer in the LPG integrates the non-destructive initial features\nof the objects with dynamically updated neighborhood information to preserve\nthe local context under the premise of mining the global context. 
To address\nthe problem that there exists an extra-large number of potential object\nrelationship pairs but only a small part of them is meaningful in AUG, we\npropose the adaptive bounding box scaling factor for potential relationship\ndetection (ABS-PRD) to intelligently prune the meaningless relationship pairs.\nExtensive experiments on the AUG dataset show that our LPG can significantly\noutperform the state-of-the-art methods and the effectiveness of the proposed\nlocality-preserving strategy.\n","authors":["Yansheng Li","Kun Li","Yongjun Zhang","Linlin Wang","Dingwen Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.07788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07785v1","updated":"2024-04-11T14:28:04Z","published":"2024-04-11T14:28:04Z","title":"PRAM: Place Recognition Anywhere Model for Efficient Visual Localization","summary":" Humans localize themselves efficiently in known environments by first\nrecognizing landmarks defined on certain objects and their spatial\nrelationships, and then verifying the location by aligning detailed structures\nof recognized objects with those in the memory. Inspired by this, we propose\nthe place recognition anywhere model (PRAM) to perform visual localization as\nefficiently as humans do. PRAM consists of two main components - recognition\nand registration. In detail, first of all, a self-supervised map-centric\nlandmark definition strategy is adopted, making places in either indoor or\noutdoor scenes act as unique landmarks. Then, sparse keypoints extracted from\nimages, are utilized as the input to a transformer-based deep neural network\nfor landmark recognition; these keypoints enable PRAM to recognize hundreds of\nlandmarks with high time and memory efficiency. Keypoints along with recognized\nlandmark labels are further used for registration between query images and the\n3D landmark map. Different from previous hierarchical methods, PRAM discards\nglobal and local descriptors, and reduces over 90% storage. Since PRAM utilizes\nrecognition and landmark-wise verification to replace global reference search\nand exhaustive matching respectively, it runs 2.4 times faster than prior\nstate-of-the-art approaches. Moreover, PRAM opens new directions for visual\nlocalization including multi-modality localization, map-centric feature\nlearning, and hierarchical scene coordinate regression.\n","authors":["Fei Xue","Ignas Budvytis","Roberto Cipolla"],"pdf_url":"https://arxiv.org/pdf/2404.07785v1.pdf","comment":"project page: https://feixue94.github.io/pram-project/"},{"id":"http://arxiv.org/abs/2404.04562v2","updated":"2024-04-11T14:28:00Z","published":"2024-04-06T09:03:18Z","title":"Diffusion Time-step Curriculum for One Image to 3D Generation","summary":" Score distillation sampling (SDS) has been widely adopted to overcome the\nabsence of unseen views in reconstructing 3D objects from a single\nimage. It leverages pre-trained 2D diffusion models as teacher to guide the\nreconstruction of student 3D models. Despite their remarkable success,\nSDS-based methods often encounter geometric artifacts and texture saturation.\nWe find out the crux is the overlooked indiscriminate treatment of diffusion\ntime-steps during optimization: it unreasonably treats the student-teacher\nknowledge distillation to be equal at all time-steps and thus entangles\ncoarse-grained and fine-grained modeling. 
Therefore, we propose the Diffusion\nTime-step Curriculum one-image-to-3D pipeline (DTC123), which involves both the\nteacher and student models collaborating with the time-step curriculum in a\ncoarse-to-fine manner. Extensive experiments on NeRF4, RealFusion15, GSO and\nLevel50 benchmark demonstrate that DTC123 can produce multi-view consistent,\nhigh-quality, and diverse 3D assets. Codes and more generation demos will be\nreleased in https://github.com/yxymessi/DTC123.\n","authors":["Xuanyu Yi","Zike Wu","Qingshan Xu","Pan Zhou","Joo-Hwee Lim","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04562v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2401.07782v2","updated":"2024-04-11T14:27:27Z","published":"2024-01-15T15:43:56Z","title":"Exploring Masked Autoencoders for Sensor-Agnostic Image Retrieval in\n Remote Sensing","summary":" Self-supervised learning through masked autoencoders (MAEs) has recently\nattracted great attention for remote sensing (RS) image representation\nlearning, and thus embodies a significant potential for content-based image\nretrieval (CBIR) from ever-growing RS image archives. However, the existing\nstudies on MAEs in RS assume that the considered RS images are acquired by a\nsingle image sensor, and thus are only suitable for uni-modal CBIR problems.\nThe effectiveness of MAEs for cross-sensor CBIR, which aims to search\nsemantically similar images across different image modalities, has not been\nexplored yet. In this paper, we take the first step to explore the\neffectiveness of MAEs for sensor-agnostic CBIR in RS. To this end, we present a\nsystematic overview on the possible adaptations of the vanilla MAE to exploit\nmasked image modeling on multi-sensor RS image archives (denoted as\ncross-sensor masked autoencoders [CSMAEs]). Based on different adjustments\napplied to the vanilla MAE, we introduce different CSMAE models. We also\nprovide an extensive experimental analysis of these CSMAE models. We finally\nderive a guideline to exploit masked image modeling for uni-modal and\ncross-modal CBIR problems in RS. The code of this work is publicly available at\nhttps://github.com/jakhac/CSMAE.\n","authors":["Jakob Hackstein","Gencer Sumbul","Kai Norman Clasen","Begüm Demir"],"pdf_url":"https://arxiv.org/pdf/2401.07782v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Our code is available at https://github.com/jakhac/CSMAE"},{"id":"http://arxiv.org/abs/2309.09590v2","updated":"2024-04-11T14:24:09Z","published":"2023-09-18T08:54:29Z","title":"An Autonomous Vision-Based Algorithm for Interplanetary Navigation","summary":" The surge of deep-space probes makes it unsustainable to navigate them with\nstandard radiometric tracking. Self-driving interplanetary satellites represent\na solution to this problem. In this work, a full vision-based navigation\nalgorithm is built by combining an orbit determination method with an image\nprocessing pipeline suitable for interplanetary transfers of autonomous\nplatforms. To increase the computational efficiency of the algorithm, a\nnon-dimensional extended Kalman filter is selected as state estimator, fed by\nthe positions of the planets extracted from deep-space images. An enhancement\nof the estimation accuracy is performed by applying an optimal strategy to\nselect the best pair of planets to track. 
Moreover, a novel analytical\nmeasurement model for deep-space navigation is developed providing a\nfirst-order approximation of the light-aberration and light-time effects.\nAlgorithm performance is tested on a high-fidelity, Earth--Mars interplanetary\ntransfer, showing the algorithm applicability for deep-space navigation.\n","authors":["Eleonora Andreis","Paolo Panicucci","Francesco Topputo"],"pdf_url":"https://arxiv.org/pdf/2309.09590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18551v2","updated":"2024-04-11T14:10:43Z","published":"2024-03-27T13:31:39Z","title":"Attention Calibration for Disentangled Text-to-Image Personalization","summary":" Recent thrilling progress in large-scale text-to-image (T2I) models has\nunlocked unprecedented synthesis quality of AI-generated content (AIGC)\nincluding image generation, 3D and video composition. Further, personalized\ntechniques enable appealing customized production of a novel concept given only\nseveral images as reference. However, an intriguing problem persists: Is it\npossible to capture multiple, novel concepts from one single reference image?\nIn this paper, we identify that existing approaches fail to preserve visual\nconsistency with the reference image and eliminate cross-influence from\nconcepts. To alleviate this, we propose an attention calibration mechanism to\nimprove the concept-level understanding of the T2I model. Specifically, we\nfirst introduce new learnable modifiers bound with classes to capture\nattributes of multiple concepts. Then, the classes are separated and\nstrengthened following the activation of the cross-attention operation,\nensuring comprehensive and self-contained concepts. Additionally, we suppress\nthe attention activation of different classes to mitigate mutual influence\namong concepts. Together, our proposed method, dubbed DisenDiff, can learn\ndisentangled multiple concepts from one single image and produce novel\ncustomized images with learned concepts. We demonstrate that our method\noutperforms the current state of the art in both qualitative and quantitative\nevaluations. More importantly, our proposed techniques are compatible with LoRA\nand inpainting pipelines, enabling more interactive experiences.\n","authors":["Yanbing Zhang","Mengping Yang","Qin Zhou","Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18551v2.pdf","comment":"CVPR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2404.07773v1","updated":"2024-04-11T14:08:45Z","published":"2024-04-11T14:08:45Z","title":"ConsistencyDet: Robust Object Detector with Denoising Paradigm of\n Consistency Model","summary":" Object detection, a quintessential task in the realm of perceptual computing,\ncan be tackled using a generative methodology. In the present study, we\nintroduce a novel framework designed to articulate object detection as a\ndenoising diffusion process, which operates on perturbed bounding boxes of\nannotated entities. This framework, termed ConsistencyDet, leverages an\ninnovative denoising concept known as the Consistency Model. The hallmark of\nthis model is its self-consistency feature, which empowers the model to map\ndistorted information from any temporal stage back to its pristine state,\nthereby realizing a ``one-step denoising'' mechanism. Such an attribute\nmarkedly elevates the operational efficiency of the model, setting it apart\nfrom the conventional Diffusion Model. 
Throughout the training phase,\nConsistencyDet initiates the diffusion sequence with noise-infused boxes\nderived from the ground-truth annotations and conditions the model to perform\nthe denoising task. Subsequently, in the inference stage, the model employs a\ndenoising sampling strategy that commences with bounding boxes randomly sampled\nfrom a normal distribution. Through iterative refinement, the model transforms\nan assortment of arbitrarily generated boxes into the definitive detections.\nComprehensive evaluations employing standard benchmarks, such as MS-COCO and\nLVIS, corroborate that ConsistencyDet surpasses other leading-edge detectors in\nperformance metrics.\n","authors":["Lifan Jiang","Zhihui Wang","Changmiao Wang","Ming Li","Jiaxu Leng","Xindong Wu"],"pdf_url":"https://arxiv.org/pdf/2404.07773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07770v1","updated":"2024-04-11T14:07:16Z","published":"2024-04-11T14:07:16Z","title":"Joint Conditional Diffusion Model for Image Restoration with Mixed\n Degradations","summary":" Image restoration is rather challenging in adverse weather conditions,\nespecially when multiple degradations occur simultaneously. Blind image\ndecomposition was proposed to tackle this issue, however, its effectiveness\nheavily relies on the accurate estimation of each component. Although\ndiffusion-based models exhibit strong generative abilities in image restoration\ntasks, they may generate irrelevant contents when the degraded images are\nseverely corrupted. To address these issues, we leverage physical constraints\nto guide the whole restoration process, where a mixed degradation model based\non atmosphere scattering model is constructed. Then we formulate our Joint\nConditional Diffusion Model (JCDM) by incorporating the degraded image and\ndegradation mask to provide precise guidance. To achieve better color and\ndetail recovery results, we further integrate a refinement network to\nreconstruct the restored image, where Uncertainty Estimation Block (UEB) is\nemployed to enhance the features. Extensive experiments performed on both\nmulti-weather and weather-specific datasets demonstrate the superiority of our\nmethod over state-of-the-art competing methods.\n","authors":["Yufeng Yue","Meng Yu","Luojie Yang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2404.07770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07766v1","updated":"2024-04-11T14:05:37Z","published":"2024-04-11T14:05:37Z","title":"RMAFF-PSN: A Residual Multi-Scale Attention Feature Fusion Photometric\n Stereo Network","summary":" Predicting accurate normal maps of objects from two-dimensional images in\nregions of complex structure and spatial material variations is challenging\nusing photometric stereo methods due to the influence of surface reflection\nproperties caused by variations in object geometry and surface materials. To\naddress this issue, we propose a photometric stereo network called a RMAFF-PSN\nthat uses residual multiscale attentional feature fusion to handle the\n``difficult'' regions of the object. Unlike previous approaches that only use\nstacked convolutional layers to extract deep features from the input image, our\nmethod integrates feature information from different resolution stages and\nscales of the image. This approach preserves more physical information, such as\ntexture and geometry of the object in complex regions, through shallow-deep\nstage feature extraction, double branching enhancement, and attention\noptimization. 
To test the network structure under real-world conditions, we\npropose a new real dataset called Simple PS data, which contains multiple\nobjects with varying structures and materials. Experimental results on a\npublicly available benchmark dataset demonstrate that our method outperforms\nmost existing calibrated photometric stereo methods for the same number of\ninput images, especially in the case of highly non-convex object structures.\nOur method also obtains good results under sparse lighting conditions.\n","authors":["Kai Luo","Yakun Ju","Lin Qi","Kaixuan Wang","Junyu Dong"],"pdf_url":"https://arxiv.org/pdf/2404.07766v1.pdf","comment":"17 pages,12 figures"},{"id":"http://arxiv.org/abs/2404.07762v1","updated":"2024-04-11T14:03:16Z","published":"2024-04-11T14:03:16Z","title":"NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous\n Driving","summary":" We present a versatile NeRF-based simulator for testing autonomous driving\n(AD) software systems, designed with a focus on sensor-realistic closed-loop\nevaluation and the creation of safety-critical scenarios. The simulator learns\nfrom sequences of real-world driving sensor data and enables reconfigurations\nand renderings of new, unseen scenarios. In this work, we use our simulator to\ntest the responses of AD models to safety-critical scenarios inspired by the\nEuropean New Car Assessment Programme (Euro NCAP). Our evaluation reveals that,\nwhile state-of-the-art end-to-end planners excel in nominal driving scenarios\nin an open-loop setting, they exhibit critical flaws when navigating our\nsafety-critical scenarios in a closed-loop setting. This highlights the need\nfor advancements in the safety and real-world usability of end-to-end planners.\nBy publicly releasing our simulator and scenarios as an easy-to-run evaluation\nsuite, we invite the research community to explore, refine, and validate their\nAD models in controlled, yet highly configurable and challenging\nsensor-realistic environments. Code and instructions can be found at\nhttps://github.com/wljungbergh/NeuroNCAP\n","authors":["William Ljungbergh","Adam Tonderski","Joakim Johnander","Holger Caesar","Kalle Åström","Michael Felsberg","Christoffer Petersson"],"pdf_url":"https://arxiv.org/pdf/2404.07762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07754v1","updated":"2024-04-11T14:00:20Z","published":"2024-04-11T14:00:20Z","title":"Generating Synthetic Satellite Imagery With Deep-Learning Text-to-Image\n Models -- Technical Challenges and Implications for Monitoring and\n Verification","summary":" Novel deep-learning (DL) architectures have reached a level where they can\ngenerate digital media, including photorealistic images, that are difficult to\ndistinguish from real data. These technologies have already been used to\ngenerate training data for Machine Learning (ML) models, and large\ntext-to-image models like DALL-E 2, Imagen, and Stable Diffusion are achieving\nremarkable results in realistic high-resolution image generation. Given these\ndevelopments, issues of data authentication in monitoring and verification\ndeserve a careful and systematic analysis: How realistic are synthetic images?\nHow easily can they be generated? How useful are they for ML researchers, and\nwhat is their potential for Open Science? In this work, we use novel DL models\nto explore how synthetic satellite images can be created using conditioning\nmechanisms. 
We investigate the challenges of synthetic satellite image\ngeneration and evaluate the results based on authenticity and state-of-the-art\nmetrics. Furthermore, we investigate how synthetic data can alleviate the lack\nof data in the context of ML methods for remote-sensing. Finally we discuss\nimplications of synthetic satellite imagery in the context of monitoring and\nverification.\n","authors":["Tuong Vy Nguyen","Alexander Glaser","Felix Biessmann"],"pdf_url":"https://arxiv.org/pdf/2404.07754v1.pdf","comment":"https://resources.inmm.org/annual-meeting-proceedings/generating-synthetic-satellite-imagery-deep-learning-text-image-models"},{"id":"http://arxiv.org/abs/2404.07748v1","updated":"2024-04-11T13:46:05Z","published":"2024-04-11T13:46:05Z","title":"3D-CSAD: Untrained 3D Anomaly Detection for Complex Manufacturing\n Surfaces","summary":" The surface quality inspection of manufacturing parts based on 3D point cloud\ndata has attracted increasing attention in recent years. The reason is that the\n3D point cloud can capture the entire surface of manufacturing parts, unlike\nthe previous practices that focus on some key product characteristics. However,\nachieving accurate 3D anomaly detection is challenging, due to the complex\nsurfaces of manufacturing parts and the difficulty of collecting sufficient\nanomaly samples. To address these challenges, we propose a novel untrained\nanomaly detection method based on 3D point cloud data for complex manufacturing\nparts, which can achieve accurate anomaly detection in a single sample without\ntraining data. In the proposed framework, we transform an input sample into two\nsets of profiles along different directions. Based on one set of the profiles,\na novel segmentation module is devised to segment the complex surface into\nmultiple basic and simple components. In each component, another set of\nprofiles, which have the nature of similar shapes, can be modeled as a low-rank\nmatrix. Thus, accurate 3D anomaly detection can be achieved by using Robust\nPrincipal Component Analysis (RPCA) on these low-rank matrices. Extensive\nnumerical experiments on different types of parts show that our method achieves\npromising results compared with the benchmark methods.\n","authors":["Xuanming Cao","Chengyu Tao","Juan Du"],"pdf_url":"https://arxiv.org/pdf/2404.07748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05539v2","updated":"2024-04-11T13:39:18Z","published":"2023-11-09T17:34:57Z","title":"A Deep Learning Method for Simultaneous Denoising and Missing Wedge\n Reconstruction in Cryogenic Electron Tomography","summary":" Cryogenic electron tomography is a technique for imaging biological samples\nin 3D. A microscope collects a series of 2D projections of the sample, and the\ngoal is to reconstruct the 3D density of the sample called the tomogram.\nReconstruction is difficult as the 2D projections are noisy and can not be\nrecorded from all directions, resulting in a missing wedge of information.\nTomograms conventionally reconstructed with filtered back-projection suffer\nfrom noise and strong artifacts due to the missing wedge. Here, we propose a\ndeep-learning approach for simultaneous denoising and missing wedge\nreconstruction called DeepDeWedge. The algorithm requires no ground truth data\nand is based on fitting a neural network to the 2D projections using a\nself-supervised loss. 
DeepDeWedge performs better than CryoCARE and IsoNet,\nwhich are state-of-the-art methods for denoising and missing wedge\nreconstruction, and similarly and, in some cases, better than the combination\nof the two methods. At the same time, DeepDeWedge is simpler than this two-step\napproach, as it does denoising and missing wedge reconstruction simultaneously\nrather than sequentially.\n","authors":["Simon Wiedemann","Reinhard Heckel"],"pdf_url":"https://arxiv.org/pdf/2311.05539v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07739v1","updated":"2024-04-11T13:37:51Z","published":"2024-04-11T13:37:51Z","title":"Exploiting Object-based and Segmentation-based Semantic Features for\n Deep Learning-based Indoor Scene Classification","summary":" Indoor scenes are usually characterized by scattered objects and their\nrelationships, which turns the indoor scene classification task into a\nchallenging computer vision task. Despite the significant performance boost in\nclassification tasks achieved in recent years, provided by the use of\ndeep-learning-based methods, limitations such as inter-category ambiguity and\nintra-category variation have been holding back their performance. To overcome\nsuch issues, gathering semantic information has been shown to be a promising\nsource of information towards a more complete and discriminative feature\nrepresentation of indoor scenes. Therefore, the work described in this paper\nuses both semantic information, obtained from object detection, and semantic\nsegmentation techniques. While object detection techniques provide the 2D\nlocation of objects allowing to obtain spatial distributions between objects,\nsemantic segmentation techniques provide pixel-level information that allows to\nobtain, at a pixel-level, a spatial distribution and shape-related features of\nthe segmentation categories. Hence, a novel approach that uses a semantic\nsegmentation mask to provide Hu-moments-based segmentation categories' shape\ncharacterization, designated by Segmentation-based Hu-Moments Features (SHMFs),\nis proposed. Moreover, a three-main-branch network, designated by\nGOS$^2$F$^2$App, that exploits deep-learning-based global features,\nobject-based features, and semantic segmentation-based features is also\nproposed. GOS$^2$F$^2$App was evaluated in two indoor scene benchmark datasets:\nSUN RGB-D and NYU Depth V2, where, to the best of our knowledge,\nstate-of-the-art results were achieved on both datasets, which present\nevidences of the effectiveness of the proposed approach.\n","authors":["Ricardo Pereira","Luís Garrote","Tiago Barros","Ana Lopes","Urbano J. Nunes"],"pdf_url":"https://arxiv.org/pdf/2404.07739v1.pdf","comment":"This preprint was submitted at IEEE Transactions on Image Processing"},{"id":"http://arxiv.org/abs/2404.05392v2","updated":"2024-04-11T13:36:58Z","published":"2024-04-08T10:51:29Z","title":"T-DEED: Temporal-Discriminability Enhancer Encoder-Decoder for Precise\n Event Spotting in Sports Videos","summary":" In this paper, we introduce T-DEED, a Temporal-Discriminability Enhancer\nEncoder-Decoder for Precise Event Spotting in sports videos. T-DEED addresses\nmultiple challenges in the task, including the need for discriminability among\nframe representations, high output temporal resolution to maintain prediction\nprecision, and the necessity to capture information at different temporal\nscales to handle events with varying dynamics. 
It tackles these challenges\nthrough its specifically designed architecture, featuring an encoder-decoder\nfor leveraging multiple temporal scales and achieving high output temporal\nresolution, along with temporal modules designed to increase token\ndiscriminability. Leveraging these characteristics, T-DEED achieves SOTA\nperformance on the FigureSkating and FineDiving datasets. Code is available at\nhttps://github.com/arturxe2/T-DEED.\n","authors":["Artur Xarles","Sergio Escalera","Thomas B. Moeslund","Albert Clapés"],"pdf_url":"https://arxiv.org/pdf/2404.05392v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07729v1","updated":"2024-04-11T13:19:46Z","published":"2024-04-11T13:19:46Z","title":"Realistic Continual Learning Approach using Pre-trained Models","summary":" Continual learning (CL) is crucial for evaluating adaptability in learning\nsolutions to retain knowledge. Our research addresses the challenge of\ncatastrophic forgetting, where models lose proficiency in previously learned\ntasks as they acquire new ones. While numerous solutions have been proposed,\nexisting experimental setups often rely on idealized class-incremental learning\nscenarios. We introduce Realistic Continual Learning (RealCL), a novel CL\nparadigm where class distributions across tasks are random, departing from\nstructured setups.\n We also present CLARE (Continual Learning Approach with pRE-trained models\nfor RealCL scenarios), a pre-trained model-based solution designed to integrate\nnew knowledge while preserving past learning. Our contributions include\npioneering RealCL as a generalization of traditional CL setups, proposing CLARE\nas an adaptable approach for RealCL tasks, and conducting extensive experiments\ndemonstrating its effectiveness across various RealCL scenarios. Notably, CLARE\noutperforms existing models on RealCL benchmarks, highlighting its versatility\nand robustness in unpredictable learning environments.\n","authors":["Nadia Nasri","Carlos Gutiérrez-Álvarez","Sergio Lafuente-Arroyo","Saturnino Maldonado-Bascón","Roberto J. López-Sastre"],"pdf_url":"https://arxiv.org/pdf/2404.07729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07724v1","updated":"2024-04-11T13:16:47Z","published":"2024-04-11T13:16:47Z","title":"Applying Guidance in a Limited Interval Improves Sample and Distribution\n Quality in Diffusion Models","summary":" Guidance is a crucial technique for extracting the best performance out of\nimage-generating diffusion models. Traditionally, a constant guidance weight\nhas been applied throughout the sampling chain of an image. We show that\nguidance is clearly harmful toward the beginning of the chain (high noise\nlevels), largely unnecessary toward the end (low noise levels), and only\nbeneficial in the middle. We thus restrict it to a specific range of noise\nlevels, improving both the inference speed and result quality. This limited\nguidance interval improves the record FID in ImageNet-512 significantly, from\n1.81 to 1.40. We show that it is quantitatively and qualitatively beneficial\nacross different sampler parameters, network architectures, and datasets,\nincluding the large-scale setting of Stable Diffusion XL. 
We thus suggest\nexposing the guidance interval as a hyperparameter in all diffusion models that\nuse guidance.\n","authors":["Tuomas Kynkäänniemi","Miika Aittala","Tero Karras","Samuli Laine","Timo Aila","Jaakko Lehtinen"],"pdf_url":"https://arxiv.org/pdf/2404.07724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03778v2","updated":"2024-04-11T13:12:48Z","published":"2024-04-04T19:50:57Z","title":"Flattening the Parent Bias: Hierarchical Semantic Segmentation in the\n Poincaré Ball","summary":" Hierarchy is a natural representation of semantic taxonomies, including the\nones routinely used in image segmentation. Indeed, recent work on semantic\nsegmentation reports improved accuracy from supervised training leveraging\nhierarchical label structures. Encouraged by these results, we revisit the\nfundamental assumptions behind that work. We postulate and then empirically\nverify that the reasons for the observed improvement in segmentation accuracy\nmay be entirely unrelated to the use of the semantic hierarchy. To demonstrate\nthis, we design a range of cross-domain experiments with a representative\nhierarchical approach. We find that on the new testing domains, a flat\n(non-hierarchical) segmentation network, in which the parents are inferred from\nthe children, has superior segmentation accuracy to the hierarchical approach\nacross the board. Complementing these findings and inspired by the intrinsic\nproperties of hyperbolic spaces, we study a more principled approach to\nhierarchical segmentation using the Poincaré ball model. The hyperbolic\nrepresentation largely outperforms the previous (Euclidean) hierarchical\napproach as well and is on par with our flat Euclidean baseline in terms of\nsegmentation accuracy. However, it additionally exhibits surprisingly strong\ncalibration quality of the parent nodes in the semantic hierarchy, especially\non the more challenging domains. Our combined analysis suggests that the\nestablished practice of hierarchical segmentation may be limited to in-domain\nsettings, whereas flat classifiers generalize substantially better, especially\nif they are modeled in the hyperbolic space.\n","authors":["Simon Weber","Barış Zöngür","Nikita Araslanov","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2404.03778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16783v2","updated":"2024-04-11T13:07:43Z","published":"2023-03-29T15:19:01Z","title":"Exploring Efficient Asymmetric Blind-Spots for Self-Supervised Denoising\n in Real-World Scenarios","summary":" Self-supervised denoising has attracted widespread attention due to its\nability to train without clean images. However, noise in real-world scenarios\nis often spatially correlated, which causes many self-supervised algorithms\nthat assume pixel-wise independent noise to perform poorly. Recent works have\nattempted to break noise correlation with downsampling or neighborhood masking.\nHowever, denoising on downsampled subgraphs can lead to aliasing effects and\nloss of details due to a lower sampling rate. Furthermore, the neighborhood\nmasking methods either come with high computational complexity or do not\nconsider local spatial preservation during inference. Through the analysis of\nexisting methods, we point out that the key to obtaining high-quality and\ntexture-rich results in real-world self-supervised denoising tasks is to train\nat the original input resolution structure and use asymmetric operations during\ntraining and inference. 
Based on this, we propose Asymmetric Tunable Blind-Spot\nNetwork (AT-BSN), where the blind-spot size can be freely adjusted, thus better\nbalancing noise correlation suppression and image local spatial destruction\nduring training and inference. In addition, we regard the pre-trained AT-BSN as\na meta-teacher network capable of generating various teacher networks by\nsampling different blind-spots. We propose a blind-spot based multi-teacher\ndistillation strategy to distill a lightweight network, significantly improving\nperformance. Experimental results on multiple datasets prove that our method\nachieves state-of-the-art, and is superior to other self-supervised algorithms\nin terms of computational overhead and visual effects.\n","authors":["Shiyan Chen","Jiyuan Zhang","Zhaofei Yu","Tiejun Huang"],"pdf_url":"https://arxiv.org/pdf/2303.16783v2.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.03936v2","updated":"2024-04-11T13:02:58Z","published":"2024-04-05T07:44:17Z","title":"Deep Learning for Satellite Image Time Series Analysis: A Review","summary":" Earth observation (EO) satellite missions have been providing detailed images\nabout the state of the Earth and its land cover for over 50 years. Long term\nmissions, such as NASA's Landsat, Terra, and Aqua satellites, and more\nrecently, the ESA's Sentinel missions, record images of the entire world every\nfew days. Although single images provide point-in-time data, repeated images of\nthe same area, or satellite image time series (SITS) provide information about\nthe changing state of vegetation and land use. These SITS are useful for\nmodeling dynamic processes and seasonal changes such as plant phenology. They\nhave potential benefits for many aspects of land and natural resource\nmanagement, including applications in agricultural, forest, water, and disaster\nmanagement, urban planning, and mining. However, the resulting satellite image\ntime series (SITS) are complex, incorporating information from the temporal,\nspatial, and spectral dimensions. Therefore, deep learning methods are often\ndeployed as they can analyze these complex relationships. This review presents\na summary of the state-of-the-art methods of modelling environmental,\nagricultural, and other Earth observation variables from SITS data using deep\nlearning methods. We aim to provide a resource for remote sensing experts\ninterested in using deep learning techniques to enhance Earth observation\nmodels with temporal information.\n","authors":["Lynn Miller","Charlotte Pelletier","Geoffrey I. Webb"],"pdf_url":"https://arxiv.org/pdf/2404.03936v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2404.07713v1","updated":"2024-04-11T12:59:38Z","published":"2024-04-11T12:59:38Z","title":"Progressive Semantic-Guided Vision Transformer for Zero-Shot Learning","summary":" Zero-shot learning (ZSL) recognizes the unseen classes by conducting\nvisual-semantic interactions to transfer semantic knowledge from seen classes\nto unseen ones, supported by semantic information (e.g., attributes). 
However,\nexisting ZSL methods simply extract visual features using a pre-trained network\nbackbone (i.e., CNN or ViT), which fail to learn matched visual-semantic\ncorrespondences for representing semantic-related visual features as lacking of\nthe guidance of semantic information, resulting in undesirable visual-semantic\ninteractions. To tackle this issue, we propose a progressive semantic-guided\nvision transformer for zero-shot learning (dubbed ZSLViT). ZSLViT mainly\nconsiders two properties in the whole network: i) discover the semantic-related\nvisual representations explicitly, and ii) discard the semantic-unrelated\nvisual information. Specifically, we first introduce semantic-embedded token\nlearning to improve the visual-semantic correspondences via semantic\nenhancement and discover the semantic-related visual tokens explicitly with\nsemantic-guided token attention. Then, we fuse low semantic-visual\ncorrespondence visual tokens to discard the semantic-unrelated visual\ninformation for visual enhancement. These two operations are integrated into\nvarious encoders to progressively learn semantic-related visual representations\nfor accurate visual-semantic interactions in ZSL. The extensive experiments\nshow that our ZSLViT achieves significant performance gains on three popular\nbenchmark datasets, i.e., CUB, SUN, and AWA2.\n","authors":["Shiming Chen","Wenjin Hou","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2404.07713v1.pdf","comment":"Accepted to CVPR'24"},{"id":"http://arxiv.org/abs/2404.07711v1","updated":"2024-04-11T12:58:12Z","published":"2024-04-11T12:58:12Z","title":"OpenTrench3D: A Photogrammetric 3D Point Cloud Dataset for Semantic\n Segmentation of Underground Utilities","summary":" Identifying and classifying underground utilities is an important task for\nefficient and effective urban planning and infrastructure maintenance. We\npresent OpenTrench3D, a novel and comprehensive 3D Semantic Segmentation point\ncloud dataset, designed to advance research and development in underground\nutility surveying and mapping. OpenTrench3D covers a completely novel domain\nfor public 3D point cloud datasets and is unique in its focus, scope, and\ncost-effective capturing method. The dataset consists of 310 point clouds\ncollected across 7 distinct areas. These include 5 water utility areas and 2\ndistrict heating utility areas. The inclusion of different geographical areas\nand main utilities (water and district heating utilities) makes OpenTrench3D\nparticularly valuable for inter-domain transfer learning experiments. We\nprovide benchmark results for the dataset using three state-of-the-art semantic\nsegmentation models, PointNeXt, PointVector and PointMetaBase. Benchmarks are\nconducted by training on data from water areas, fine-tuning on district heating\narea 1 and evaluating on district heating area 2. The dataset is publicly\navailable. With OpenTrench3D, we seek to foster innovation and progress in the\nfield of 3D semantic segmentation in applications related to detection and\ndocumentation of underground utilities as well as in transfer learning methods\nin general.\n","authors":["Lasse H. Hansen","Simon B. Jensen","Mark P. Philipsen","Andreas Møgelmose","Lars Bodum","Thomas B. 
Moeslund"],"pdf_url":"https://arxiv.org/pdf/2404.07711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07705v1","updated":"2024-04-11T12:49:56Z","published":"2024-04-11T12:49:56Z","title":"ViM-UNet: Vision Mamba for Biomedical Segmentation","summary":" CNNs, most notably the UNet, are the default architecture for biomedical\nsegmentation. Transformer-based approaches, such as UNETR, have been proposed\nto replace them, benefiting from a global field of view, but suffering from\nlarger runtimes and higher parameter counts. The recent Vision Mamba\narchitecture offers a compelling alternative to transformers, also providing a\nglobal field of view, but at higher efficiency. Here, we introduce ViM-UNet, a\nnovel segmentation architecture based on it and compare it to UNet and UNETR\nfor two challenging microscopy instance segmentation tasks. We find that it\nperforms similarly or better than UNet, depending on the task, and outperforms\nUNETR while being more efficient. Our code is open source and documented at\nhttps://github.com/constantinpape/torch-em/blob/main/vimunet.md.\n","authors":["Anwai Archit","Constantin Pape"],"pdf_url":"https://arxiv.org/pdf/2404.07705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07698v1","updated":"2024-04-11T12:44:15Z","published":"2024-04-11T12:44:15Z","title":"Point Cloud Geometry Scalable Coding with a Quality-Conditioned Latents\n Probability Estimator","summary":" The widespread usage of point clouds (PC) for immersive visual applications\nhas resulted in the use of very heterogeneous receiving conditions and devices,\nnotably in terms of network, hardware, and display capabilities. In this\nscenario, quality scalability, i.e., the ability to reconstruct a signal at\ndifferent qualities by progressively decoding a single bitstream, is a major\nrequirement that has yet to be conveniently addressed, notably in most\nlearning-based PC coding solutions. This paper proposes a quality scalability\nscheme, named Scalable Quality Hyperprior (SQH), adaptable to learning-based\nstatic point cloud geometry codecs, which uses a Quality-conditioned Latents\nProbability Estimator (QuLPE) to decode a high-quality version of a PC\nlearning-based representation, based on an available lower quality base layer.\nSQH is integrated in the future JPEG PC coding standard, allowing to create a\nlayered bitstream that can be used to progressively decode the PC geometry with\nincreasing quality and fidelity. Experimental results show that SQH offers the\nquality scalability feature with very limited or no compression performance\npenalty at all when compared with the corresponding non-scalable solution, thus\npreserving the significant compression gains over other state-of-the-art PC\ncodecs.\n","authors":["Daniele Mari","André F. R. Guarda","Nuno M. M. Rodrigues","Simone Milani","Fernando Pereira"],"pdf_url":"https://arxiv.org/pdf/2404.07698v1.pdf","comment":"Submitted at ICIP 2024"},{"id":"http://arxiv.org/abs/2404.07696v1","updated":"2024-04-11T12:42:18Z","published":"2024-04-11T12:42:18Z","title":"Flatness Improves Backbone Generalisation in Few-shot Classification","summary":" Deployment of deep neural networks in real-world settings typically requires\nadaptation to new tasks with few examples. Few-shot classification (FSC)\nprovides a solution to this problem by leveraging pre-trained backbones for\nfast adaptation to new classes. 
Surprisingly, most efforts have only focused on\ndeveloping architectures for easing the adaptation to the target domain without\nconsidering the importance of backbone training for good generalisation. We\nshow that flatness-aware backbone training with vanilla fine-tuning results in\na simpler yet competitive baseline compared to the state-of-the-art. Our\nresults indicate that for in- and cross-domain FSC, backbone training is\ncrucial to achieving good generalisation across different adaptation methods.\nWe advocate more care should be taken when training these models.\n","authors":["Rui Li","Martin Trapp","Marcus Klasson","Arno Solin"],"pdf_url":"https://arxiv.org/pdf/2404.07696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07687v1","updated":"2024-04-11T12:26:10Z","published":"2024-04-11T12:26:10Z","title":"Chaos in Motion: Unveiling Robustness in Remote Heart Rate Measurement\n through Brain-Inspired Skin Tracking","summary":" Heart rate is an important physiological indicator of human health status.\nExisting remote heart rate measurement methods typically involve facial\ndetection followed by signal extraction from the region of interest (ROI).\nThese SOTA methods have three serious problems: (a) inaccuracies or even failures\nin detection caused by environmental influences or subject movement; (b)\nfailures for special patients such as infants and burn victims; (c) privacy\nleakage issues resulting from collecting face video. To address these issues,\nwe regard the remote heart rate measurement as the process of analyzing the\nspatiotemporal characteristics of the optical flow signal in the video. We\napply chaos theory to computer vision tasks for the first time, thus designing\na brain-inspired framework. Firstly, an artificial primary visual cortex\nmodel is used to extract the skin in the videos, and then the heart rate is calculated by\ntime-frequency analysis on all pixels. Our method achieves Robust Skin Tracking\nfor Heart Rate measurement, called HR-RST. The experimental results show that\nHR-RST overcomes the difficulty of environmental influences and effectively\ntracks the subject movement. Moreover, the method could extend to other body\nparts. Consequently, the method can be applied to special patients and\neffectively protect individual privacy, offering an innovative solution.\n","authors":["Jie Wang","Jing Lian","Minjie Ma","Junqiang Lei","Chunbiao Li","Bin Li","Jizhao Liu"],"pdf_url":"https://arxiv.org/pdf/2404.07687v1.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.07686v1","updated":"2024-04-11T12:25:54Z","published":"2024-04-11T12:25:54Z","title":"Depth Estimation using Weighted-loss and Transfer Learning","summary":" Depth estimation from 2D images is a common computer vision task that has\napplications in many fields including autonomous vehicles, scene understanding\nand robotics. The accuracy of a supervised depth estimation method mainly\nrelies on the chosen loss function, the model architecture, quality of data and\nperformance metrics. In this study, we propose a simplified and adaptable\napproach to improve depth estimation accuracy using transfer learning and an\noptimized loss function. The optimized loss function is a combination of\nweighted losses which enhance robustness and generalization: Mean Absolute\nError (MAE), Edge Loss and Structural Similarity Index (SSIM). We use a grid\nsearch and a random search method to find optimized weights for the losses,\nwhich leads to an improved model. 
We explore multiple encoder-decoder-based\nmodels including DenseNet121, DenseNet169, DenseNet201, and EfficientNet for\nthe supervised depth estimation model on NYU Depth Dataset v2. We observe that\nthe EfficientNet model, pre-trained on ImageNet for classification when used as\nan encoder, with a simple upsampling decoder, gives the best results in terms\nof RSME, REL and log10: 0.386, 0.113 and 0.049, respectively. We also perform a\nqualitative analysis which illustrates that our model produces depth maps that\nclosely resemble ground truth, even in cases where the ground truth is flawed.\nThe results indicate significant improvements in accuracy and robustness, with\nEfficientNet being the most successful architecture.\n","authors":["Muhammad Adeel Hafeez","Michael G. Madden","Ganesh Sistu","Ihsan Ullah"],"pdf_url":"https://arxiv.org/pdf/2404.07686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.09657v2","updated":"2024-04-11T12:25:45Z","published":"2022-08-20T10:59:33Z","title":"Is Medieval Distant Viewing Possible? : Extending and Enriching\n Annotation of Legacy Image Collections using Visual Analytics","summary":" Distant viewing approaches have typically used image datasets close to the\ncontemporary image data used to train machine learning models. To work with\nimages from other historical periods requires expert annotated data, and the\nquality of labels is crucial for the quality of results. Especially when\nworking with cultural heritage collections that contain myriad uncertainties,\nannotating data, or re-annotating, legacy data is an arduous task. In this\npaper, we describe working with two pre-annotated sets of medieval manuscript\nimages that exhibit conflicting and overlapping metadata. Since a manual\nreconciliation of the two legacy ontologies would be very expensive, we aim (1)\nto create a more uniform set of descriptive labels to serve as a \"bridge\" in\nthe combined dataset, and (2) to establish a high quality hierarchical\nclassification that can be used as a valuable input for subsequent supervised\nmachine learning. To achieve these goals, we developed visualization and\ninteraction mechanisms, enabling medievalists to combine, regularize and extend\nthe vocabulary used to describe these, and other cognate, image datasets. The\nvisual interfaces provide experts an overview of relationships in the data\ngoing beyond the sum total of the metadata. Word and image embeddings as well\nas co-occurrences of labels across the datasets, enable batch re-annotation of\nimages, recommendation of label candidates and support composing a hierarchical\nclassification of labels.\n","authors":["Christofer Meinecke","Estelle Guéville","David Joseph Wrisley","Stefan Jänicke"],"pdf_url":"https://arxiv.org/pdf/2208.09657v2.pdf","comment":"Revision after DSH Peer Review. 
Paper is now accepted at DSH"},{"id":"http://arxiv.org/abs/2404.07685v1","updated":"2024-04-11T12:24:47Z","published":"2024-04-11T12:24:47Z","title":"Run-time Monitoring of 3D Object Detection in Automated Driving Systems\n Using Early Layer Neural Activation Patterns","summary":" Monitoring the integrity of object detection for errors within the perception\nmodule of automated driving systems (ADS) is paramount for ensuring safety.\nDespite recent advancements in deep neural network (DNN)-based object\ndetectors, their susceptibility to detection errors, particularly in the\nless-explored realm of 3D object detection, remains a significant concern.\nState-of-the-art integrity monitoring (also known as introspection) mechanisms\nin 2D object detection mainly utilise the activation patterns in the final\nlayer of the DNN-based detector's backbone. However, that may not sufficiently\naddress the complexities and sparsity of data in 3D object detection. To this\nend, we conduct, in this article, an extensive investigation into the effects\nof activation patterns extracted from various layers of the backbone network\nfor introspecting the operation of 3D object detectors. Through a comparative\nanalysis using Kitti and NuScenes datasets with PointPillars and CenterPoint\ndetectors, we demonstrate that using earlier layers' activation patterns\nenhances the error detection performance of the integrity monitoring system,\nyet increases computational complexity. To address the real-time operation\nrequirements in ADS, we also introduce a novel introspection method that\ncombines activation patterns from multiple layers of the detector's backbone\nand report its performance.\n","authors":["Hakan Yekta Yatbaz","Mehrdad Dianati","Konstantinos Koufos","Roger Woodman"],"pdf_url":"https://arxiv.org/pdf/2404.07685v1.pdf","comment":"Accepted by CVPR 2024 Workshop on Safe Autonomy for All Domains\n (SAIAD)"},{"id":"http://arxiv.org/abs/2404.07676v1","updated":"2024-04-11T12:14:48Z","published":"2024-04-11T12:14:48Z","title":"Model-based Cleaning of the QUILT-1M Pathology Dataset for\n Text-Conditional Image Synthesis","summary":" The QUILT-1M dataset is the first openly available dataset containing images\nharvested from various online sources. While it provides a huge data variety,\nthe image quality and composition is highly heterogeneous, impacting its\nutility for text-conditional image synthesis. We propose an automatic pipeline\nthat provides predictions of the most common impurities within the images,\ne.g., visibility of narrators, desktop environment and pathology software, or\ntext within the image. Additionally, we propose to use semantic alignment\nfiltering of the image-text pairs. Our findings demonstrate that by rigorously\nfiltering the dataset, there is a substantial enhancement of image fidelity in\ntext-to-image tasks.\n","authors":["Marc Aubreville","Jonathan Ganz","Jonas Ammeling","Christopher C. Kaltenecker","Christof A. Bertram"],"pdf_url":"https://arxiv.org/pdf/2404.07676v1.pdf","comment":"4 pages (short paper)"},{"id":"http://arxiv.org/abs/2402.13255v2","updated":"2024-04-11T12:13:27Z","published":"2024-02-20T18:59:57Z","title":"How NeRFs and 3D Gaussian Splatting are Reshaping SLAM: a Survey","summary":" Over the past two decades, research in the field of Simultaneous Localization\nand Mapping (SLAM) has undergone a significant evolution, highlighting its\ncritical role in enabling autonomous exploration of unknown environments. 
This\nevolution ranges from hand-crafted methods, through the era of deep learning,\nto more recent developments focused on Neural Radiance Fields (NeRFs) and 3D\nGaussian Splatting (3DGS) representations. Recognizing the growing body of\nresearch and the absence of a comprehensive survey on the topic, this paper\naims to provide the first comprehensive overview of SLAM progress through the\nlens of the latest advancements in radiance fields. It sheds light on the\nbackground, evolutionary path, inherent strengths and limitations, and serves\nas a fundamental reference to highlight the dynamic progress and specific\nchallenges.\n","authors":["Fabio Tosi","Youmin Zhang","Ziren Gong","Erik Sandström","Stefano Mattoccia","Martin R. Oswald","Matteo Poggi"],"pdf_url":"https://arxiv.org/pdf/2402.13255v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07671v1","updated":"2024-04-11T12:06:50Z","published":"2024-04-11T12:06:50Z","title":"Deep learning-driven pulmonary arteries and veins segmentation reveals\n demography-associated pulmonary vasculature anatomy","summary":" Pulmonary artery-vein segmentation is crucial for diagnosing pulmonary\ndiseases and surgical planning, and is traditionally achieved by Computed\nTomography Pulmonary Angiography (CTPA). However, concerns regarding adverse\nhealth effects from contrast agents used in CTPA have constrained its clinical\nutility. In contrast, identifying arteries and veins using non-contrast CT, a\nconventional and low-cost clinical examination routine, has long been\nconsidered impossible. Here we propose a High-abundant Pulmonary Artery-vein\nSegmentation (HiPaS) framework achieving accurate artery-vein segmentation on\nboth non-contrast CT and CTPA across various spatial resolutions. HiPaS first\nperforms spatial normalization on raw CT scans via a super-resolution module,\nand then iteratively achieves segmentation results at different branch levels\nby utilizing the low-level vessel segmentation as a prior for high-level vessel\nsegmentation. We trained and validated HiPaS on our established multi-centric\ndataset comprising 1,073 CT volumes with meticulous manual annotation. Both\nquantitative experiments and clinical evaluation demonstrated the superior\nperformance of HiPaS, achieving a dice score of 91.8% and a sensitivity of\n98.0%. Further experiments demonstrated the non-inferiority of HiPaS\nsegmentation on non-contrast CT compared to segmentation on CTPA. Employing\nHiPaS, we have conducted an anatomical study of pulmonary vasculature on 10,613\nparticipants in China (five sites), discovering a new association between\npulmonary vessel abundance and sex and age: vessel abundance is significantly\nhigher in females than in males, and slightly decreases with age, under the\ncontrolling of lung volumes (p < 0.0001). 
HiPaS realizing accurate artery-vein\nsegmentation delineates a promising avenue for clinical diagnosis and\nunderstanding pulmonary physiology in a non-invasive manner.\n","authors":["Yuetan Chu","Gongning Luo","Longxi Zhou","Shaodong Cao","Guolin Ma","Xianglin Meng","Juexiao Zhou","Changchun Yang","Dexuan Xie","Ricardo Henao","Xigang Xiao","Lianming Wu","Zhaowen Qiu","Xin Gao"],"pdf_url":"https://arxiv.org/pdf/2404.07671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11111v2","updated":"2024-04-11T12:01:34Z","published":"2024-03-17T06:31:16Z","title":"3D Human Reconstruction in the Wild with Synthetic Data Using Generative\n Models","summary":" In this work, we show that synthetic data created by generative models is\ncomplementary to computer graphics (CG) rendered data for achieving remarkable\ngeneralization performance on diverse real-world scenes for 3D human pose and\nshape estimation (HPS). Specifically, we propose an effective approach based on\nrecent diffusion models, termed HumanWild, which can effortlessly generate\nhuman images and corresponding 3D mesh annotations. We first collect a\nlarge-scale human-centric dataset with comprehensive annotations, e.g., text\ncaptions and surface normal images. Then, we train a customized ControlNet\nmodel upon this dataset to generate diverse human images and initial\nground-truth labels. At the core of this step is that we can easily obtain\nnumerous surface normal images from a 3D human parametric model, e.g., SMPL-X,\nby rendering the 3D mesh onto the image plane. As there exists inevitable noise\nin the initial labels, we then apply an off-the-shelf foundation segmentation\nmodel, i.e., SAM, to filter negative data samples. Our data generation pipeline\nis flexible and customizable to facilitate different real-world tasks, e.g.,\nego-centric scenes and perspective-distortion scenes. The generated dataset\ncomprises 0.79M images with corresponding 3D annotations, covering versatile\nviewpoints, scenes, and human identities. We train various HPS regressors on\ntop of the generated data and evaluate them on a wide range of benchmarks\n(3DPW, RICH, EgoBody, AGORA, SSP-3D) to verify the effectiveness of the\ngenerated data. By exclusively employing generative models, we generate\nlarge-scale in-the-wild human images and high-quality annotations, eliminating\nthe need for real-world data collection.\n","authors":["Yongtao Ge","Wenjia Wang","Yongfan Chen","Hao Chen","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2403.11111v2.pdf","comment":"project page: https://yongtaoge.github.io/projects/humanwild"},{"id":"http://arxiv.org/abs/2404.07668v1","updated":"2024-04-11T12:00:13Z","published":"2024-04-11T12:00:13Z","title":"Shape Completion in the Dark: Completing Vertebrae Morphology from 3D\n Ultrasound","summary":" Purpose: Ultrasound (US) imaging, while advantageous for its radiation-free\nnature, is challenging to interpret due to only partially visible organs and a\nlack of complete 3D information. While performing US-based diagnosis or\ninvestigation, medical professionals therefore create a mental map of the 3D\nanatomy. In this work, we aim to replicate this process and enhance the visual\nrepresentation of anatomical structures.\n Methods: We introduce a point-cloud-based probabilistic DL method to complete\noccluded anatomical structures through 3D shape completion and choose US-based\nspine examinations as our application. 
To enable training, we generate\nsynthetic 3D representations of partially occluded spinal views by mimicking US\nphysics and accounting for inherent artifacts.\n Results: The proposed model performs consistently on synthetic and patient\ndata, with mean and median differences of 2.02 and 0.03 in CD, respectively.\nOur ablation study demonstrates the importance of US physics-based data\ngeneration, reflected in the large mean and median difference of 11.8 CD and\n9.55 CD, respectively. Additionally, we demonstrate that anatomic landmarks,\nsuch as the spinous process (with reconstruction CD of 4.73) and the facet\njoints (mean distance to GT of 4.96mm) are preserved in the 3D completion.\n Conclusion: Our work establishes the feasibility of 3D shape completion for\nlumbar vertebrae, ensuring the preservation of level-wise characteristics and\nsuccessful generalization from synthetic to real data. The incorporation of US\nphysics contributes to more accurate patient data completions. Notably, our\nmethod preserves essential anatomic landmarks and reconstructs crucial\ninjections sites at their correct locations. The generated data and source code\nwill be made publicly available\n(https://github.com/miruna20/Shape-Completion-in-the-Dark).\n","authors":["Miruna-Alexandra Gafencu","Yordanka Velikova","Mahdi Saleh","Tamas Ungi","Nassir Navab","Thomas Wendler","Mohammad Farid Azampour"],"pdf_url":"https://arxiv.org/pdf/2404.07668v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07667v1","updated":"2024-04-11T12:00:06Z","published":"2024-04-11T12:00:06Z","title":"Dealing with Subject Similarity in Differential Morphing Attack\n Detection","summary":" The advent of morphing attacks has posed significant security concerns for\nautomated Face Recognition systems, raising the pressing need for robust and\neffective Morphing Attack Detection (MAD) methods able to effectively address\nthis issue. In this paper, we focus on Differential MAD (D-MAD), where a\ntrusted live capture, usually representing the criminal, is compared with the\ndocument image to classify it as morphed or bona fide. We show these approaches\nbased on identity features are effective when the morphed image and the live\none are sufficiently diverse; unfortunately, the effectiveness is significantly\nreduced when the same approaches are applied to look-alike subjects or in all\nthose cases when the similarity between the two compared images is high (e.g.\ncomparison between the morphed image and the accomplice). Therefore, in this\npaper, we propose ACIdA, a modular D-MAD system, consisting of a module for the\nattempt type classification, and two modules for the identity and artifacts\nanalysis on input images. Successfully addressing this task would allow\nbroadening the D-MAD applications including, for instance, the document\nenrollment stage, which currently relies entirely on human evaluation, thus\nlimiting the possibility of releasing ID documents with manipulated images, as\nwell as the automated gates to detect both accomplices and criminals. 
An\nextensive cross-dataset experimental evaluation conducted on the introduced\nscenario shows that ACIdA achieves state-of-the-art results, outperforming\nliterature competitors, while maintaining good performance in traditional D-MAD\nbenchmarks.\n","authors":["Nicolò Di Domenico","Guido Borghi","Annalisa Franco","Davide Maltoni"],"pdf_url":"https://arxiv.org/pdf/2404.07667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07664v1","updated":"2024-04-11T11:55:42Z","published":"2024-04-11T11:55:42Z","title":"Finding Dino: A plug-and-play framework for unsupervised detection of\n out-of-distribution objects using prototypes","summary":" Detecting and localising unknown or Out-of-distribution (OOD) objects in any\nscene can be a challenging task in vision. Particularly, in safety-critical\ncases involving autonomous systems like automated vehicles or trains.\nSupervised anomaly segmentation or open-world object detection models depend on\ntraining on exhaustively annotated datasets for every domain and still struggle\nin distinguishing between background and OOD objects. In this work, we present\na plug-and-play generalised framework - PRototype-based zero-shot OOD detection\nWithout Labels (PROWL). It is an inference-based method that does not require\ntraining on the domain dataset and relies on extracting relevant features from\nself-supervised pre-trained models. PROWL can be easily adapted to detect OOD\nobjects in any operational design domain by specifying a list of known classes\nfrom this domain. PROWL, as an unsupervised method, outperforms other\nsupervised methods trained without auxiliary OOD data on the RoadAnomaly and\nRoadObstacle datasets provided in SegmentMeIfYouCan (SMIYC) benchmark. We also\ndemonstrate its suitability for other domains such as rail and maritime scenes.\n","authors":["Poulami Sinhamahapatra","Franziska Schwaiger","Shirsha Bose","Huiyu Wang","Karsten Roscher","Stephan Guennemann"],"pdf_url":"https://arxiv.org/pdf/2404.07664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03122v2","updated":"2024-04-11T11:42:13Z","published":"2024-03-05T17:07:29Z","title":"NRDF: Neural Riemannian Distance Fields for Learning Articulated Pose\n Priors","summary":" Faithfully modeling the space of articulations is a crucial task that allows\nrecovery and generation of realistic poses, and remains a notorious challenge.\nTo this end, we introduce Neural Riemannian Distance Fields (NRDFs),\ndata-driven priors modeling the space of plausible articulations, represented\nas the zero-level-set of a neural field in a high-dimensional\nproduct-quaternion space. To train NRDFs only on positive examples, we\nintroduce a new sampling algorithm, ensuring that the geodesic distances follow\na desired distribution, yielding a principled distance field learning paradigm.\nWe then devise a projection algorithm to map any random pose onto the level-set\nby an adaptive-step Riemannian optimizer, adhering to the product manifold of\njoint rotations at all times. NRDFs can compute the Riemannian gradient via\nbackpropagation and by mathematical analogy, are related to Riemannian flow\nmatching, a recent generative model. We conduct a comprehensive evaluation of\nNRDF against other pose priors in various downstream tasks, i.e., pose\ngeneration, image-based pose estimation, and solving inverse kinematics,\nhighlighting NRDF's superior performance. 
Besides humans, NRDF's versatility\nextends to hand and animal poses, as it can effectively represent any\narticulation.\n","authors":["Yannan He","Garvita Tiwari","Tolga Birdal","Jan Eric Lenssen","Gerard Pons-Moll"],"pdf_url":"https://arxiv.org/pdf/2403.03122v2.pdf","comment":"Accepted by CVPR 2024. Project page:\n https://virtualhumans.mpi-inf.mpg.de/nrdf"},{"id":"http://arxiv.org/abs/2404.07649v1","updated":"2024-04-11T11:12:06Z","published":"2024-04-11T11:12:06Z","title":"Separated Attention: An Improved Cycle GAN Based Under Water Image\n Enhancement Method","summary":" In this paper we present an improved Cycle GAN based model for under\nwater image enhancement. We have utilized the cycle consistent learning\ntechnique of the state-of-the-art Cycle GAN model with modification in the loss\nfunction in terms of depth-oriented attention which enhances the contrast of the\noverall image, keeping global content, color, local texture, and style\ninformation intact. We trained the Cycle GAN model with the modified loss\nfunctions on the benchmarked Enhancing Underwater Visual Perception (EUPV)\ndataset, a large dataset including paired and unpaired sets of underwater images\n(poor and good quality) taken with seven distinct cameras in a range of\nvisibility situations during research on ocean exploration and human-robot\ncooperation. In addition, we perform qualitative and quantitative evaluation\nwhich supports the applied technique and provides a better contrast\nenhancement model of underwater imagery. More significantly, the upgraded\nimages provide better results than conventional models and further benefit under\nwater navigation, pose estimation, saliency prediction, object detection and\ntracking. The results validate the appropriateness of the model for autonomous\nunderwater vehicles (AUV) in visual navigation.\n","authors":["Tashmoy Ghosh"],"pdf_url":"https://arxiv.org/pdf/2404.07649v1.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.07645v1","updated":"2024-04-11T11:07:57Z","published":"2024-04-11T11:07:57Z","title":"Simba: Mamba augmented U-ShiftGCN for Skeletal Action Recognition in\n Videos","summary":" Skeleton Action Recognition (SAR) involves identifying human actions using\nskeletal joint coordinates and their interconnections. While plain Transformers\nhave been attempted for this task, they still fall short compared to the\ncurrent leading methods, which are rooted in Graph Convolutional Networks\n(GCNs) due to the absence of structural priors. Recently, a novel selective\nstate space model, Mamba, has surfaced as a compelling alternative to the\nattention mechanism in Transformers, offering efficient modeling of long\nsequences. In this work, to the best of our knowledge, we present the\nfirst SAR framework incorporating Mamba. Each fundamental block of our model\nadopts a novel U-ShiftGCN architecture with Mamba as its core component. The\nencoder segment of the U-ShiftGCN is devised to extract spatial features from\nthe skeletal data using downsampling vanilla Shift S-GCN blocks. These spatial\nfeatures then undergo intermediate temporal modeling facilitated by the Mamba\nblock before progressing to the decoder section, which comprises vanilla\nupsampling Shift S-GCN blocks. Additionally, a Shift T-GCN (ShiftTCN) temporal\nmodeling unit is employed before the exit of each fundamental block to refine\ntemporal representations. 
This particular integration of downsampling spatial,\nintermediate temporal, upsampling spatial, and ultimate temporal subunits\nyields promising results for skeleton action recognition. We dub the resulting\nmodel \\textbf{Simba}, which attains state-of-the-art performance across three\nwell-known benchmark skeleton action recognition datasets: NTU RGB+D, NTU RGB+D\n120, and Northwestern-UCLA. Interestingly, U-ShiftGCN (Simba without\nIntermediate Mamba Block) by itself is capable of performing reasonably well\nand surpasses our baseline.\n","authors":["Soumyabrata Chaudhuri","Saumik Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2404.07645v1.pdf","comment":"20 pages, 6 tables, 1 figure"},{"id":"http://arxiv.org/abs/2404.03425v2","updated":"2024-04-11T10:51:34Z","published":"2024-04-04T13:06:25Z","title":"ChangeMamba: Remote Sensing Change Detection with Spatio-Temporal State\n Space Model","summary":" Convolutional neural networks (CNN) and Transformers have made impressive\nprogress in the field of remote sensing change detection (CD). However, both\narchitectures have inherent shortcomings. Recently, the Mamba architecture,\nbased on state space models, has shown remarkable performance in a series of\nnatural language processing tasks, which can effectively compensate for the\nshortcomings of the above two architectures. In this paper, we explore for the\nfirst time the potential of the Mamba architecture for remote sensing CD tasks.\nWe tailor the corresponding frameworks, called MambaBCD, MambaSCD, and\nMambaBDA, for binary change detection (BCD), semantic change detection (SCD),\nand building damage assessment (BDA), respectively. All three frameworks adopt\nthe cutting-edge Visual Mamba architecture as the encoder, which allows full\nlearning of global spatial contextual information from the input images. For\nthe change decoder, which is available in all three architectures, we propose\nthree spatio-temporal relationship modeling mechanisms, which can be naturally\ncombined with the Mamba architecture and fully utilize its attribute to achieve\nspatio-temporal interaction of multi-temporal features, thereby obtaining\naccurate change information. On five benchmark datasets, our proposed\nframeworks outperform current CNN- and Transformer-based approaches without\nusing any complex training strategies or tricks, fully demonstrating the\npotential of the Mamba architecture in CD tasks. Specifically, we obtained\n83.11%, 88.39% and 94.19% F1 scores on the three BCD datasets SYSU, LEVIR-CD+,\nand WHU-CD; on the SCD dataset SECOND, we obtained 24.11% SeK; and on the BDA\ndataset xBD, we obtained 81.41% overall F1 score. Further experiments show that\nour architecture is quite robust to degraded data. The source code will be\navailable in https://github.com/ChenHongruixuan/MambaCD\n","authors":["Hongruixuan Chen","Jian Song","Chengxi Han","Junshi Xia","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2404.03425v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16074v2","updated":"2024-04-11T10:45:05Z","published":"2023-10-24T15:16:19Z","title":"RePoseDM: Recurrent Pose Alignment and Gradient Guidance for Pose Guided\n Image Synthesis","summary":" Pose-guided person image synthesis task requires re-rendering a reference\nimage, which should have a photorealistic appearance and flawless pose\ntransfer. 
Since person images are highly structured, existing approaches\nrequire dense connections for complex deformations and occlusions because these\nare generally handled through multi-level warping and masking in latent space.\nThe feature maps generated by convolutional neural networks do not have\nequivariance, and hence multi-level warping is required to perform pose\nalignment. Inspired by the ability of the diffusion model to generate\nphotorealistic images from the given conditional guidance, we propose recurrent\npose alignment to provide pose-aligned texture features as conditional\nguidance. Due to the leakage of the source pose in conditional guidance, we\npropose gradient guidance from pose interaction fields, which output the\ndistance from the valid pose manifold given a predicted pose as input. This\nhelps in learning plausible pose transfer trajectories that result in\nphotorealism and undistorted texture details. Extensive results on two\nlarge-scale benchmarks and a user study demonstrate the ability of our proposed\napproach to generate photorealistic pose transfer under challenging scenarios.\nAdditionally, we demonstrate the efficiency of gradient guidance in pose-guided\nimage generation on the HumanArt dataset with fine-tuned stable diffusion.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2310.16074v2.pdf","comment":"Accepted at CVPR 2024 SyntaGen Workshop, 13 pages, 4 tables, 7\n figures"},{"id":"http://arxiv.org/abs/2312.01919v2","updated":"2024-04-11T10:38:33Z","published":"2023-12-04T14:23:18Z","title":"COTR: Compact Occupancy TRansformer for Vision-based 3D Occupancy\n Prediction","summary":" The autonomous driving community has shown significant interest in 3D\noccupancy prediction, driven by its exceptional geometric perception and\ngeneral object recognition capabilities. To achieve this, current works try to\nconstruct a Tri-Perspective View (TPV) or Occupancy (OCC) representation\nextending from the Bird-Eye-View perception. However, compressed views like TPV\nrepresentation lose 3D geometry information while raw and sparse OCC\nrepresentation requires heavy but redundant computational costs. To address the\nabove limitations, we propose Compact Occupancy TRansformer (COTR), with a\ngeometry-aware occupancy encoder and a semantic-aware group decoder to\nreconstruct a compact 3D OCC representation. The occupancy encoder first\ngenerates a compact geometrical OCC feature through efficient explicit-implicit\nview transformation. Then, the occupancy decoder further enhances the semantic\ndiscriminability of the compact OCC representation by a coarse-to-fine semantic\ngrouping strategy. Empirical experiments show that there are evident\nperformance gains across multiple baselines, e.g., COTR outperforms baselines\nwith a relative improvement of 8%-15%, demonstrating the superiority of our\nmethod.\n","authors":["Qihang Ma","Xin Tan","Yanyun Qu","Lizhuang Ma","Zhizhong Zhang","Yuan Xie"],"pdf_url":"https://arxiv.org/pdf/2312.01919v2.pdf","comment":"CVPR2024. Code is available at https://github.com/NotACracker/COTR"},{"id":"http://arxiv.org/abs/2404.07626v1","updated":"2024-04-11T10:26:40Z","published":"2024-04-11T10:26:40Z","title":"Homography Guided Temporal Fusion for Road Line and Marking Segmentation","summary":" Reliable segmentation of road lines and markings is critical to autonomous\ndriving. 
Our work is motivated by the observations that road lines and markings\nare (1) frequently occluded in the presence of moving vehicles, shadow, and\nglare and (2) highly structured with low intra-class shape variance and overall\nhigh appearance consistency. To solve these issues, we propose a Homography\nGuided Fusion (HomoFusion) module to exploit temporally-adjacent video frames\nfor complementary cues facilitating the correct classification of the partially\noccluded road lines or markings. To reduce computational complexity, a novel\nsurface normal estimator is proposed to establish spatial correspondences\nbetween the sampled frames, allowing the HomoFusion module to perform a\npixel-to-pixel attention mechanism in updating the representation of the\noccluded road lines or markings. Experiments on ApolloScape, a large-scale lane\nmark segmentation dataset, and ApolloScape Night with artificial simulated\nnight-time road conditions, demonstrate that our method outperforms other\nexisting SOTA lane mark segmentation models with less than 9\\% of their\nparameters and computational complexity. We show that exploiting available\ncamera intrinsic data and ground plane assumption for cross-frame\ncorrespondence can lead to a light-weight network with significantly improved\nperformances in speed and accuracy. We also prove the versatility of our\nHomoFusion approach by applying it to the problem of water puddle segmentation\nand achieving SOTA performance.\n","authors":["Shan Wang","Chuong Nguyen","Jiawei Liu","Kaihao Zhang","Wenhan Luo","Yanhao Zhang","Sundaram Muthu","Fahira Afzal Maken","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2404.07626v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15855v2","updated":"2024-04-11T10:19:41Z","published":"2023-08-30T08:44:21Z","title":"IIDM: Inter and Intra-domain Mixing for Semi-supervised Domain\n Adaptation in Semantic Segmentation","summary":" Despite recent advances in semantic segmentation, an inevitable challenge is\nthe performance degradation caused by the domain shift in real applications.\nCurrent dominant approach to solve this problem is unsupervised domain\nadaptation (UDA). However, the absence of labeled target data in UDA is overly\nrestrictive and limits performance. To overcome this limitation, a more\npractical scenario called semi-supervised domain adaptation (SSDA) has been\nproposed. Existing SSDA methods are derived from the UDA paradigm and primarily\nfocus on leveraging the unlabeled target data and source data. In this paper,\nwe highlight the significance of exploiting the intra-domain information\nbetween the labeled target data and unlabeled target data. Instead of solely\nusing the scarce labeled target data for supervision, we propose a novel SSDA\nframework that incorporates both Inter and Intra Domain Mixing (IIDM), where\ninter-domain mixing mitigates the source-target domain gap and intra-domain\nmixing enriches the available target domain information, and the network can\ncapture more domain-invariant features. We also explore different domain mixing\nstrategies to better exploit the target domain information. 
Comprehensive\nexperiments conducted on the GTA5 to Cityscapes and SYNTHIA to Cityscapes\nbenchmarks demonstrate the effectiveness of IIDM, surpassing previous methods\nby a large margin.\n","authors":["Weifu Fu","Qiang Nie","Jialin Li","Yuhuan Lin","Kai Wu","Jian Li","Yabiao Wang","Yong Liu","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15855v2.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.07622v1","updated":"2024-04-11T10:16:44Z","published":"2024-04-11T10:16:44Z","title":"Multi-Image Visual Question Answering for Unsupervised Anomaly Detection","summary":" Unsupervised anomaly detection enables the identification of potential\npathological areas by juxtaposing original images with their pseudo-healthy\nreconstructions generated by models trained exclusively on normal images.\nHowever, the clinical interpretation of resultant anomaly maps presents a\nchallenge due to a lack of detailed, understandable explanations. Recent\nadvancements in language models have shown the capability of mimicking\nhuman-like understanding and providing detailed descriptions. This raises an\ninteresting question: \\textit{How can language models be employed to make the\nanomaly maps more explainable?} To the best of our knowledge, we are the first\nto leverage a language model for unsupervised anomaly detection, for which we\nconstruct a dataset with different questions and answers. Additionally, we\npresent a novel multi-image visual question answering framework tailored for\nanomaly detection, incorporating diverse feature fusion strategies to enhance\nvisual knowledge extraction. Our experiments reveal that the framework,\naugmented by our new Knowledge Q-Former module, adeptly answers questions on\nthe anomaly detection dataset. Besides, integrating anomaly maps as inputs\ndistinctly aids in improving the detection of unseen pathologies.\n","authors":["Jun Li","Cosmin I. Bercea","Philip Müller","Lina Felsner","Suhwan Kim","Daniel Rueckert","Benedikt Wiestler","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2404.07622v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.07620v1","updated":"2024-04-11T10:14:56Z","published":"2024-04-11T10:14:56Z","title":"Diffusion Probabilistic Multi-cue Level Set for Reducing Edge\n Uncertainty in Pancreas Segmentation","summary":" Accurately segmenting the pancreas remains a huge challenge. Traditional\nmethods encounter difficulties in semantic localization due to the small volume\nand distorted structure of the pancreas, while deep learning methods encounter\nchallenges in obtaining accurate edges because of low contrast and organ\noverlapping. To overcome these issues, we propose a multi-cue level set method\nbased on the diffusion probabilistic model, namely Diff-mcs. Our method adopts\na coarse-to-fine segmentation strategy. We use the diffusion probabilistic\nmodel in the coarse segmentation stage, with the obtained probability\ndistribution serving as both the initial localization and prior cues for the\nlevel set method. In the fine segmentation stage, we combine the prior cues\nwith grayscale cues and texture cues to refine the edge by maximizing the\ndifference between probability distributions of the cues inside and outside the\nlevel set curve. The method is validated on three public datasets and achieves\nstate-of-the-art performance, which can obtain more accurate segmentation\nresults with lower uncertainty segmentation edges. 
In addition, we conduct\nablation studies and uncertainty analysis to verify that the diffusion\nprobability model provides a more appropriate initialization for the level set\nmethod. Furthermore, when combined with multiple cues, the level set method can\nbetter obtain edges and improve the overall accuracy. Our code is available at\nhttps://github.com/GOUYUEE/Diff-mcs.\n","authors":["Yue Gou","Yuming Xing","Shengzhu Shi","Zhichang Guo"],"pdf_url":"https://arxiv.org/pdf/2404.07620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18956v2","updated":"2024-04-11T10:06:10Z","published":"2024-02-29T08:51:51Z","title":"WWW: A Unified Framework for Explaining What, Where and Why of Neural\n Networks by Interpretation of Neuron Concepts","summary":" Recent advancements in neural networks have showcased their remarkable\ncapabilities across various domains. Despite these successes, the \"black box\"\nproblem still remains. Addressing this, we propose a novel framework, WWW, that\noffers the 'what', 'where', and 'why' of the neural network decisions in\nhuman-understandable terms. Specifically, WWW utilizes adaptive selection for\nconcept discovery, employing adaptive cosine similarity and thresholding\ntechniques to effectively explain 'what'. To address the 'where' and 'why', we\nproposed a novel combination of neuron activation maps (NAMs) with Shapley\nvalues, generating localized concept maps and heatmaps for individual inputs.\nFurthermore, WWW introduces a method for predicting uncertainty, leveraging\nheatmap similarities to estimate 'how' reliable the prediction is. Experimental\nevaluations of WWW demonstrate superior performance in both quantitative and\nqualitative metrics, outperforming existing methods in interpretability. WWW\nprovides a unified solution for explaining 'what', 'where', and 'why',\nintroducing a method for localized explanations from global interpretations and\noffering a plug-and-play solution adaptable to various architectures.\n","authors":["Yong Hyun Ahn","Hyeon Bae Kim","Seong Tae Kim"],"pdf_url":"https://arxiv.org/pdf/2402.18956v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.01705v2","updated":"2024-04-11T10:05:12Z","published":"2024-04-02T07:38:16Z","title":"Samba: Semantic Segmentation of Remotely Sensed Images with State Space\n Model","summary":" High-resolution remotely sensed images pose a challenge for commonly used\nsemantic segmentation methods such as Convolutional Neural Network (CNN) and\nVision Transformer (ViT). CNN-based methods struggle with handling such\nhigh-resolution images due to their limited receptive field, while ViT faces\nchallenges in handling long sequences. Inspired by Mamba, which adopts a State\nSpace Model (SSM) to efficiently capture global semantic information, we\npropose a semantic segmentation framework for high-resolution remotely sensed\nimages, named Samba. Samba utilizes an encoder-decoder architecture, with Samba\nblocks serving as the encoder for efficient multi-level semantic information\nextraction, and UperNet functioning as the decoder. We evaluate Samba on the\nLoveDA, ISPRS Vaihingen, and ISPRS Potsdam datasets, comparing its performance\nagainst top-performing CNN and ViT methods. The results reveal that Samba\nachieved unparalleled performance on commonly used remote sensing datasets for\nsemantic segmentation. 
Our proposed Samba demonstrates for the first time the\neffectiveness of SSM in semantic segmentation of remotely sensed images,\nsetting a new benchmark in performance for Mamba-based techniques in this\nspecific application. The source code and baseline implementations are\navailable at https://github.com/zhuqinfeng1999/Samba.\n","authors":["Qinfeng Zhu","Yuanzhi Cai","Yuan Fang","Yihan Yang","Cheng Chen","Lei Fan","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.01705v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07610v1","updated":"2024-04-11T09:58:23Z","published":"2024-04-11T09:58:23Z","title":"Do You Remember? Dense Video Captioning with Cross-Modal Memory\n Retrieval","summary":" There has been significant attention to the research on dense video\ncaptioning, which aims to automatically localize and caption all events within\nuntrimmed video. Several studies introduce methods by designing dense video\ncaptioning as a multitasking problem of event localization and event captioning\nto consider inter-task relations. However, addressing both tasks using only\nvisual input is challenging due to the lack of semantic content. In this study,\nwe address this by proposing a novel framework inspired by the cognitive\ninformation processing of humans. Our model utilizes external memory to\nincorporate prior knowledge. The memory retrieval method is proposed with\ncross-modal video-to-text matching. To effectively incorporate retrieved text\nfeatures, the versatile encoder and the decoder with visual and textual\ncross-attention modules are designed. Comparative experiments have been\nconducted to show the effectiveness of the proposed method on ActivityNet\nCaptions and YouCook2 datasets. Experimental results show promising performance\nof our model without extensive pretraining from a large video dataset.\n","authors":["Minkuk Kim","Hyeon Bae Kim","Jinyoung Moon","Jinwoo Choi","Seong Tae Kim"],"pdf_url":"https://arxiv.org/pdf/2404.07610v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.07607v1","updated":"2024-04-11T09:50:05Z","published":"2024-04-11T09:50:05Z","title":"Automatic Detection of Dark Ship-to-Ship Transfers using Deep Learning\n and Satellite Imagery","summary":" Despite extensive research into ship detection via remote sensing, no studies\nidentify ship-to-ship transfers in satellite imagery. Given the importance of\ntransshipment in illicit shipping practices, this is a significant gap. In what\nfollows, I train a convolutional neural network to accurately detect 4\ndifferent types of cargo vessel and two different types of Ship-to-Ship\ntransfer in PlanetScope satellite imagery. I then elaborate a pipeline for the\nautomatic detection of suspected illicit ship-to-ship transfers by\ncross-referencing satellite detections with vessel borne GPS data. Finally, I\napply this method to the Kerch Strait between Ukraine and Russia to identify\nover 400 dark transshipment events since 2022.\n","authors":["Ollie Ballinger"],"pdf_url":"https://arxiv.org/pdf/2404.07607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07605v1","updated":"2024-04-11T09:47:52Z","published":"2024-04-11T09:47:52Z","title":"Contrastive-Based Deep Embeddings for Label Noise-Resilient\n Histopathology Image Classification","summary":" Recent advancements in deep learning have proven highly effective in medical\nimage classification, notably within histopathology. 
However, noisy labels\nrepresent a critical challenge in histopathology image classification, where\naccurate annotations are vital for training robust deep learning models.\nIndeed, deep neural networks can easily overfit label noise, leading to severe\ndegradations in model performance. While numerous public pathology foundation\nmodels have emerged recently, none have evaluated their resilience to label\nnoise. Through thorough empirical analyses across multiple datasets, we exhibit\nthe label noise resilience property of embeddings extracted from foundation\nmodels trained in a self-supervised contrastive manner. We demonstrate that\ntraining with such embeddings substantially enhances label noise robustness\nwhen compared to non-contrastive-based ones as well as commonly used\nnoise-resilient methods. Our results unequivocally underline the superiority of\ncontrastive learning in effectively mitigating the label noise challenge. Code\nis publicly available at\nhttps://github.com/LucasDedieu/NoiseResilientHistopathology.\n","authors":["Lucas Dedieu","Nicolas Nerrienet","Adrien Nivaggioli","Clara Simmat","Marceau Clavel","Arnaud Gauthier","Stéphane Sockeel","Rémy Peyret"],"pdf_url":"https://arxiv.org/pdf/2404.07605v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2404.07603v1","updated":"2024-04-11T09:43:07Z","published":"2024-04-11T09:43:07Z","title":"GLID: Pre-training a Generalist Encoder-Decoder Vision Model","summary":" This paper proposes a GeneraLIst encoder-Decoder (GLID) pre-training method\nfor better handling various downstream computer vision tasks. While\nself-supervised pre-training approaches, e.g., Masked Autoencoder, have shown\nsuccess in transfer learning, task-specific sub-architectures are still\nrequired to be appended for different downstream tasks, which cannot enjoy the\nbenefits of large-scale pre-training. GLID overcomes this challenge by allowing\nthe pre-trained generalist encoder-decoder to be fine-tuned on various vision\ntasks with minimal task-specific architecture modifications. In the GLID\ntraining scheme, pre-training pretext task and other downstream tasks are\nmodeled as \"query-to-answer\" problems, including the pre-training pretext task\nand other downstream tasks. We pre-train a task-agnostic encoder-decoder with\nquery-mask pairs. During fine-tuning, GLID maintains the pre-trained\nencoder-decoder and queries, only replacing the topmost linear transformation\nlayer with task-specific linear heads. This minimizes the pretrain-finetune\narchitecture inconsistency and enables the pre-trained model to better adapt to\ndownstream tasks. GLID achieves competitive performance on various vision\ntasks, including object detection, image segmentation, pose estimation, and\ndepth estimation, outperforming or matching specialist models such as\nMask2Former, DETR, ViTPose, and BinsFormer.\n","authors":["Jihao Liu","Jinliang Zheng","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.07603v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.07602v1","updated":"2024-04-11T09:41:14Z","published":"2024-04-11T09:41:14Z","title":"Attention based End to end network for Offline Writer Identification on\n Word level data","summary":" Writer identification due to its widespread application in various fields has\ngained popularity over the years. 
In scenarios where optimum handwriting\nsamples are available, whether they be in the form of a single line, a\nsentence, or an entire page, writer identification algorithms have demonstrated\nnoteworthy levels of accuracy. However, in scenarios where only a limited\nnumber of handwritten samples are available, particularly in the form of word\nimages, there is a significant scope for improvement.\n In this paper, we propose a writer identification system based on an\nattention-driven Convolutional Neural Network (CNN). The system is trained\nutilizing image segments, known as fragments, extracted from word images,\nemploying a pyramid-based strategy. This methodology enables the system to\ncapture a comprehensive representation of the data, encompassing both\nfine-grained details and coarse features across various levels of abstraction.\nThese extracted fragments serve as the training data for the convolutional\nnetwork, enabling it to learn a more robust representation compared to\ntraditional convolution-based networks trained on word images. Additionally,\nthe paper explores the integration of an attention mechanism to enhance the\nrepresentational power of the learned features. The efficacy of the proposed\nalgorithm is evaluated on three benchmark databases, demonstrating its\nproficiency in writer identification tasks, particularly in scenarios with\nlimited access to handwriting data.\n","authors":["Vineet Kumar","Suresh Sundaram"],"pdf_url":"https://arxiv.org/pdf/2404.07602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07600v1","updated":"2024-04-11T09:39:58Z","published":"2024-04-11T09:39:58Z","title":"Implicit and Explicit Language Guidance for Diffusion-based Visual\n Perception","summary":" Text-to-image diffusion models have shown powerful ability on conditional\nimage synthesis. With large-scale vision-language pre-training, diffusion\nmodels are able to generate high-quality images with rich texture and\nreasonable structure under different text prompts. However, it is an open\nproblem to adapt the pre-trained diffusion model for visual perception. In this\npaper, we propose an implicit and explicit language guidance framework for\ndiffusion-based perception, named IEDP. Our IEDP comprises of an implicit\nlanguage guidance branch and an explicit language guidance branch. The implicit\nbranch employs frozen CLIP image encoder to directly generate implicit text\nembeddings that are fed to diffusion model, without using explicit text\nprompts. The explicit branch utilizes the ground-truth labels of corresponding\nimages as text prompts to condition feature extraction of diffusion model.\nDuring training, we jointly train diffusion model by sharing the model weights\nof these two branches. As a result, implicit and explicit branches can jointly\nguide feature learning. During inference, we only employ implicit branch for\nfinal prediction, which does not require any ground-truth labels. Experiments\nare performed on two typical perception tasks, including semantic segmentation\nand depth estimation. Our IEDP achieves promising performance on both tasks.\nFor semantic segmentation, our IEDP has the mIoU score of 55.9% on AD20K\nvalidation set, which outperforms the baseline method VPD by 2.2%. 
For depth\nestimation, our IEDP outperforms the baseline method VPD with a relative gain\nof 10.2%.\n","authors":["Hefeng Wang","Jiale Cao","Jin Xie","Aiping Yang","Yanwei Pang"],"pdf_url":"https://arxiv.org/pdf/2404.07600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07594v1","updated":"2024-04-11T09:23:44Z","published":"2024-04-11T09:23:44Z","title":"Weakly-Supervised Learning via Multi-Lateral Decoder Branching for\n Guidewire Segmentation in Robot-Assisted Cardiovascular Catheterization","summary":" Although robot-assisted cardiovascular catheterization is commonly performed\nfor intervention of cardiovascular diseases, more studies are needed to support\nthe procedure with automated tool segmentation. This can aid surgeons on tool\ntracking and visualization during intervention. Learning-based segmentation has\nrecently offered state-of-the-art segmentation performances however, generating\nground-truth signals for fully-supervised methods is labor-intensive and time\nconsuming for the interventionists. In this study, a weakly-supervised learning\nmethod with multi-lateral pseudo labeling is proposed for tool segmentation in\ncardiac angiograms. The method includes a modified U-Net model with one encoder\nand multiple lateral-branched decoders that produce pseudo labels as\nsupervision signals under different perturbation. The pseudo labels are\nself-generated through a mixed loss function and shared consistency in the\ndecoders. We trained the model end-to-end with weakly-annotated data obtained\nduring robotic cardiac catheterization. Experiments with the proposed model\nshows weakly annotated data has closer performance to when fully annotated data\nis used. Compared to three existing weakly-supervised methods, our approach\nyielded higher segmentation performance across three different cardiac\nangiogram data. With ablation study, we showed consistent performance under\ndifferent parameters. Thus, we offer a less expensive method for real-time tool\nsegmentation and tracking during robot-assisted cardiac catheterization.\n","authors":["Olatunji Mumini Omisore","Toluwanimi Akinyemi","Anh Nguyen","Lei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07580v1","updated":"2024-04-11T09:13:50Z","published":"2024-04-11T09:13:50Z","title":"Multi-rater Prompting for Ambiguous Medical Image Segmentation","summary":" Multi-rater annotations commonly occur when medical images are independently\nannotated by multiple experts (raters). In this paper, we tackle two challenges\narisen in multi-rater annotations for medical image segmentation (called\nambiguous medical image segmentation): (1) How to train a deep learning model\nwhen a group of raters produces a set of diverse but plausible annotations, and\n(2) how to fine-tune the model efficiently when computation resources are not\navailable for re-training the entire model on a different dataset domain. We\npropose a multi-rater prompt-based approach to address these two challenges\naltogether. Specifically, we introduce a series of rater-aware prompts that can\nbe plugged into the U-Net model for uncertainty estimation to handle\nmulti-annotation cases. During the prompt-based fine-tuning process, only 0.3%\nof learnable parameters are required to be updated comparing to training the\nentire model. 
Further, in order to integrate expert consensus and disagreement,\nwe explore different multi-rater incorporation strategies and design a\nmix-training strategy for comprehensive insight learning. Extensive experiments\nverify the effectiveness of our new approach for ambiguous medical image\nsegmentation on two public datasets while alleviating the heavy burden of model\nre-training.\n","authors":["Jinhong Wang","Yi Cheng","Jintai Chen","Hongxia Xu","Danny Chen","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2404.07580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07122v2","updated":"2024-04-11T09:10:21Z","published":"2024-04-10T16:01:37Z","title":"Driver Attention Tracking and Analysis","summary":" We propose a novel method to estimate a driver's points-of-gaze using a pair\nof ordinary cameras mounted on the windshield and dashboard of a car. This is a\nchallenging problem due to the dynamics of traffic environments with 3D scenes\nof unknown depths. This problem is further complicated by the volatile distance\nbetween the driver and the camera system. To tackle these challenges, we\ndevelop a novel convolutional network that simultaneously analyzes the image of\nthe scene and the image of the driver's face. This network has a camera\ncalibration module that can compute an embedding vector that represents the\nspatial configuration between the driver and the camera system. This\ncalibration module improves the overall network's performance, which can be\njointly trained end to end.\n We also address the lack of annotated data for training and evaluation by\nintroducing a large-scale driving dataset with point-of-gaze annotations. This\nis an in situ dataset of real driving sessions in an urban city, containing\nsynchronized images of the driving scene as well as the face and gaze of the\ndriver. Experiments on this dataset show that the proposed method outperforms\nvarious baseline methods, having the mean prediction error of 29.69 pixels,\nwhich is relatively small compared to the $1280{\\times}720$ resolution of the\nscene camera.\n","authors":["Dat Viet Thanh Nguyen","Anh Tran","Hoai Nam Vu","Cuong Pham","Minh Hoai"],"pdf_url":"https://arxiv.org/pdf/2404.07122v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07564v1","updated":"2024-04-11T08:50:12Z","published":"2024-04-11T08:50:12Z","title":"ObjBlur: A Curriculum Learning Approach With Progressive Object-Level\n Blurring for Improved Layout-to-Image Generation","summary":" We present ObjBlur, a novel curriculum learning approach to improve\nlayout-to-image generation models, where the task is to produce realistic\nimages from layouts composed of boxes and labels. Our method is based on\nprogressive object-level blurring, which effectively stabilizes training and\nenhances the quality of generated images. This curriculum learning strategy\nsystematically applies varying degrees of blurring to individual objects or the\nbackground during training, starting from strong blurring to progressively\ncleaner images. Our findings reveal that this approach yields significant\nperformance improvements, stabilized training, smoother convergence, and\nreduced variance between multiple runs. Moreover, our technique demonstrates\nits versatility by being compatible with generative adversarial networks and\ndiffusion models, underlining its applicability across various generative\nmodeling paradigms. 
With ObjBlur, we reach new state-of-the-art results on the\ncomplex COCO and Visual Genome datasets.\n","authors":["Stanislav Frolov","Brian B. Moser","Sebastian Palacio","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2404.07564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06710v2","updated":"2024-04-11T08:40:42Z","published":"2024-04-10T03:31:32Z","title":"SpikeNVS: Enhancing Novel View Synthesis from Blurry Images via Spike\n Camera","summary":" One of the most critical factors in achieving sharp Novel View Synthesis\n(NVS) using neural field methods like Neural Radiance Fields (NeRF) and 3D\nGaussian Splatting (3DGS) is the quality of the training images. However,\nConventional RGB cameras are susceptible to motion blur. In contrast,\nneuromorphic cameras like event and spike cameras inherently capture more\ncomprehensive temporal information, which can provide a sharp representation of\nthe scene as additional training data. Recent methods have explored the\nintegration of event cameras to improve the quality of NVS. The event-RGB\napproaches have some limitations, such as high training costs and the inability\nto work effectively in the background. Instead, our study introduces a new\nmethod that uses the spike camera to overcome these limitations. By considering\ntexture reconstruction from spike streams as ground truth, we design the\nTexture from Spike (TfS) loss. Since the spike camera relies on temporal\nintegration instead of temporal differentiation used by event cameras, our\nproposed TfS loss maintains manageable training costs. It handles foreground\nobjects with backgrounds simultaneously. We also provide a real-world dataset\ncaptured with our spike-RGB camera system to facilitate future research\nendeavors. We conduct extensive experiments using synthetic and real-world\ndatasets to demonstrate that our design can enhance novel view synthesis across\nNeRF and 3DGS. The code and dataset will be made available for public access.\n","authors":["Gaole Dai","Zhenyu Wang","Qinwen Xu","Ming Lu","Wen Cheng","Baixin Shi","Shanghang Zhang","Tiejun Huang"],"pdf_url":"https://arxiv.org/pdf/2404.06710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07556v1","updated":"2024-04-11T08:36:36Z","published":"2024-04-11T08:36:36Z","title":"Attention-Aware Laparoscopic Image Desmoking Network with Lightness\n Embedding and Hybrid Guided Embedding","summary":" This paper presents a novel method of smoke removal from the laparoscopic\nimages. Due to the heterogeneous nature of surgical smoke, a two-stage network\nis proposed to estimate the smoke distribution and reconstruct a clear,\nsmoke-free surgical scene. The utilization of the lightness channel plays a\npivotal role in providing vital information pertaining to smoke density. The\nreconstruction of smoke-free image is guided by a hybrid embedding, which\ncombines the estimated smoke mask with the initial image. Experimental results\ndemonstrate that the proposed method boasts a Peak Signal to Noise Ratio that\nis $2.79\\%$ higher than the state-of-the-art methods, while also exhibits a\nremarkable $38.2\\%$ reduction in run-time. Overall, the proposed method offers\ncomparable or even superior performance in terms of both smoke removal quality\nand computational efficiency when compared to existing state-of-the-art\nmethods. 
This work will be publicly available on\nhttp://homepage.hit.edu.cn/wpgao\n","authors":["Ziteng Liu","Jiahua Zhu","Bainan Liu","Hao Liu","Wenpeng Gao","Yili Fu"],"pdf_url":"https://arxiv.org/pdf/2404.07556v1.pdf","comment":"ISBI2024"},{"id":"http://arxiv.org/abs/2404.07554v1","updated":"2024-04-11T08:36:13Z","published":"2024-04-11T08:36:13Z","title":"CAT: Contrastive Adapter Training for Personalized Image Generation","summary":" The emergence of various adapters, including Low-Rank Adaptation (LoRA)\napplied from the field of natural language processing, has allowed diffusion\nmodels to personalize image generation at a low cost. However, due to the\nvarious challenges including limited datasets and shortage of regularization\nand computation resources, adapter training often results in unsatisfactory\noutcomes, leading to the corruption of the backbone model's prior knowledge.\nOne of the well known phenomena is the loss of diversity in object generation,\nespecially within the same class which leads to generating almost identical\nobjects with minor variations. This poses challenges in generation\ncapabilities. To solve this issue, we present Contrastive Adapter Training\n(CAT), a simple yet effective strategy to enhance adapter training through the\napplication of CAT loss. Our approach facilitates the preservation of the base\nmodel's original knowledge when the model initiates adapters. Furthermore, we\nintroduce the Knowledge Preservation Score (KPS) to evaluate CAT's ability to\nkeep the former information. We qualitatively and quantitatively compare CAT's\nimprovement. Finally, we mention the possibility of CAT in the aspects of\nmulti-concept adapter and optimization.\n","authors":["Jae Wan Park","Sang Hyun Park","Jun Young Koh","Junha Lee","Min Song"],"pdf_url":"https://arxiv.org/pdf/2404.07554v1.pdf","comment":"CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.07553v1","updated":"2024-04-11T08:35:24Z","published":"2024-04-11T08:35:24Z","title":"SFSORT: Scene Features-based Simple Online Real-Time Tracker","summary":" This paper introduces SFSORT, the world's fastest multi-object tracking\nsystem based on experiments conducted on MOT Challenge datasets. To achieve an\naccurate and computationally efficient tracker, this paper employs a\ntracking-by-detection method, following the online real-time tracking approach\nestablished in prior literature. By introducing a novel cost function called\nthe Bounding Box Similarity Index, this work eliminates the Kalman Filter,\nleading to reduced computational requirements. Additionally, this paper\ndemonstrates the impact of scene features on enhancing object-track association\nand improving track post-processing. Using a 2.2 GHz Intel Xeon CPU, the\nproposed method achieves an HOTA of 61.7\\% with a processing speed of 2242 Hz\non the MOT17 dataset and an HOTA of 60.9\\% with a processing speed of 304 Hz on\nthe MOT20 dataset. The tracker's source code, fine-tuned object detection\nmodel, and tutorials are available at\n\\url{https://github.com/gitmehrdad/SFSORT}.\n","authors":["M. M. Morsali","Z. Sharifi","F. Fallah","S. Hashembeiki","H. Mohammadzade","S. 
Bagheri Shouraki"],"pdf_url":"https://arxiv.org/pdf/2404.07553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07551v1","updated":"2024-04-11T08:34:10Z","published":"2024-04-11T08:34:10Z","title":"Event-Enhanced Snapshot Compressive Videography at 10K FPS","summary":" Video snapshot compressive imaging (SCI) encodes the target dynamic scene\ncompactly into a snapshot and reconstructs its high-speed frame sequence\nafterward, greatly reducing the required data footprint and transmission\nbandwidth as well as enabling high-speed imaging with a low frame rate\nintensity camera. In implementation, high-speed dynamics are encoded via\ntemporally varying patterns, and only frames at corresponding temporal\nintervals can be reconstructed, while the dynamics occurring between\nconsecutive frames are lost. To unlock the potential of conventional snapshot\ncompressive videography, we propose a novel hybrid \"intensity+event\" imaging\nscheme by incorporating an event camera into a video SCI setup. Our proposed\nsystem consists of a dual-path optical setup to record the coded intensity\nmeasurement and intermediate event signals simultaneously, which is compact and\nphoton-efficient by collecting the half photons discarded in conventional video\nSCI. Correspondingly, we developed a dual-branch Transformer utilizing the\nreciprocal relationship between two data modes to decode dense video frames.\nExtensive experiments on both simulated and real-captured data demonstrate our\nsuperiority to state-of-the-art video SCI and video frame interpolation (VFI)\nmethods. Benefiting from the new hybrid design leveraging both intrinsic\nredundancy in videos and the unique feature of event cameras, we achieve\nhigh-quality videography at 0.1ms time intervals with a low-cost CMOS image\nsensor working at 24 FPS.\n","authors":["Bo Zhang","Jinli Suo","Qionghai Dai"],"pdf_url":"https://arxiv.org/pdf/2404.07551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10372v2","updated":"2024-04-11T08:21:09Z","published":"2023-10-16T13:11:35Z","title":"Learning Object Permanence from Videos via Latent Imaginations","summary":" While human infants exhibit knowledge about object permanence from two months\nof age onwards, deep-learning approaches still largely fail to recognize\nobjects' continued existence. We introduce a slot-based autoregressive deep\nlearning system, the looped location and identity tracking model Loci-Looped,\nwhich learns to adaptively fuse latent imaginations with pixel-space\nobservations into consistent latent object-specific what and where encodings\nover time. The novel loop empowers Loci-Looped to learn the physical concepts\nof object permanence, directional inertia, and object solidity through\nobservation alone. As a result, Loci-Looped tracks objects through occlusions,\nanticipates their reappearance, and shows signs of surprise and internal\nrevisions when observing implausible object behavior. Notably, Loci-Looped\noutperforms state-of-the-art baseline models in handling object occlusions and\ntemporary sensory interruptions while exhibiting more compositional,\ninterpretable internal activity patterns. Our work thus introduces the first\nself-supervised interpretable learning model that learns about object\npermanence directly from video data without supervision.\n","authors":["Manuel Traub","Frederic Becker","Sebastian Otte","Martin V. 
Butz"],"pdf_url":"https://arxiv.org/pdf/2310.10372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15011v3","updated":"2024-04-11T08:16:53Z","published":"2023-11-25T12:34:02Z","title":"VSCode: General Visual Salient and Camouflaged Object Detection with 2D\n Prompt Learning","summary":" Salient object detection (SOD) and camouflaged object detection (COD) are\nrelated yet distinct binary mapping tasks. These tasks involve multiple\nmodalities, sharing commonalities and unique cues. Existing research often\nemploys intricate task-specific specialist models, potentially leading to\nredundancy and suboptimal results. We introduce VSCode, a generalist model with\nnovel 2D prompt learning, to jointly address four SOD tasks and three COD\ntasks. We utilize VST as the foundation model and introduce 2D prompts within\nthe encoder-decoder architecture to learn domain and task-specific knowledge on\ntwo separate dimensions. A prompt discrimination loss helps disentangle\npeculiarities to benefit model optimization. VSCode outperforms\nstate-of-the-art methods across six tasks on 26 datasets and exhibits zero-shot\ngeneralization to unseen tasks by combining 2D prompts, such as RGB-D COD.\nSource code has been available at https://github.com/Sssssuperior/VSCode.\n","authors":["Ziyang Luo","Nian Liu","Wangbo Zhao","Xuguang Yang","Dingwen Zhang","Deng-Ping Fan","Fahad Khan","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2311.15011v3.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2204.01348v2","updated":"2024-04-11T08:12:50Z","published":"2022-04-04T09:46:30Z","title":"Extended Reality for Mental Health Evaluation -A Scoping Review","summary":" Mental health disorders are the leading cause of health-related problems\nglobally. It is projected that mental health disorders will be the leading\ncause of morbidity among adults as the incidence rates of anxiety and\ndepression grows globally. Recently, extended reality (XR), a general term\ncovering virtual reality (VR), augmented reality (AR) and mixed reality (MR),\nis paving a new way to deliver mental health care. In this paper, we conduct a\nscoping review on the development and application of XR in the area of mental\ndisorders. We performed a scoping database search to identify the relevant\nstudies indexed in Google Scholar, PubMed, and the ACM Digital Library. A\nsearch period between August 2016 and December 2023 was defined to select\narticles related to the usage of VR, AR, and MR in a mental health context. We\nidentified a total of 85 studies from 27 countries across the globe. By\nperforming data analysis, we found that most of the studies focused on\ndeveloped countries such as the US (16.47%) and Germany (12.94%). None of the\nstudies were for African countries. The majority of the articles reported that\nXR techniques led to a significant reduction in symptoms of anxiety or\ndepression. More studies were published in the year 2021, i.e., 31.76% (n =\n31). This could indicate that mental disorder intervention received a higher\nattention when COVID-19 emerged. Most studies (n = 65) focused on a population\nbetween 18 and 65 years old, only a few studies focused on teenagers (n = 2).\nAlso, more studies were done experimentally (n = 67, 78.82%) rather than by\nanalytical and modeling approaches (n = 8, 9.41%). This shows that there is a\nrapid development of XR technology for mental health care. 
Furthermore, these\nstudies showed that XR technology can effectively be used for evaluating mental\ndisorders in similar or better way as the conventional approaches.\n","authors":["Omisore Olatunji","Ifeanyi Odenigbo","Joseph Orji","Amelia Beltran","Nilufar Baghaei","Meier Sandra","Rita Orji"],"pdf_url":"https://arxiv.org/pdf/2204.01348v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07545v1","updated":"2024-04-11T08:12:48Z","published":"2024-04-11T08:12:48Z","title":"Stereo-LiDAR Depth Estimation with Deformable Propagation and Learned\n Disparity-Depth Conversion","summary":" Accurate and dense depth estimation with stereo cameras and LiDAR is an\nimportant task for automatic driving and robotic perception. While sparse hints\nfrom LiDAR points have improved cost aggregation in stereo matching, their\neffectiveness is limited by the low density and non-uniform distribution. To\naddress this issue, we propose a novel stereo-LiDAR depth estimation network\nwith Semi-Dense hint Guidance, named SDG-Depth. Our network includes a\ndeformable propagation module for generating a semi-dense hint map and a\nconfidence map by propagating sparse hints using a learned deformable window.\nThese maps then guide cost aggregation in stereo matching. To reduce the\ntriangulation error in depth recovery from disparity, especially in distant\nregions, we introduce a disparity-depth conversion module. Our method is both\naccurate and efficient. The experimental results on benchmark tests show its\nsuperior performance. Our code is available at\nhttps://github.com/SJTU-ViSYS/SDG-Depth.\n","authors":["Ang Li","Anning Hu","Wei Xi","Wenxian Yu","Danping Zou"],"pdf_url":"https://arxiv.org/pdf/2404.07545v1.pdf","comment":"Accepted in ICRA 2024. 8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.07543v1","updated":"2024-04-11T08:11:36Z","published":"2024-04-11T08:11:36Z","title":"Content-Adaptive Non-Local Convolution for Remote Sensing Pansharpening","summary":" Currently, machine learning-based methods for remote sensing pansharpening\nhave progressed rapidly. However, existing pansharpening methods often do not\nfully exploit differentiating regional information in non-local spaces, thereby\nlimiting the effectiveness of the methods and resulting in redundant learning\nparameters. In this paper, we introduce a so-called content-adaptive non-local\nconvolution (CANConv), a novel method tailored for remote sensing image\npansharpening. Specifically, CANConv employs adaptive convolution, ensuring\nspatial adaptability, and incorporates non-local self-similarity through the\nsimilarity relationship partition (SRP) and the partition-wise adaptive\nconvolution (PWAC) sub-modules. Furthermore, we also propose a corresponding\nnetwork architecture, called CANNet, which mainly utilizes the multi-scale\nself-similarity. Extensive experiments demonstrate the superior performance of\nCANConv, compared with recent promising fusion methods. Besides, we\nsubstantiate the method's effectiveness through visualization, ablation\nexperiments, and comparison with existing methods on multiple test sets. 
The\nsource code is publicly available at https://github.com/duanyll/CANConv.\n","authors":["Yule Duan","Xiao Wu","Haoyu Deng","Liang-Jian Deng"],"pdf_url":"https://arxiv.org/pdf/2404.07543v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2310.11725v2","updated":"2024-04-11T08:11:20Z","published":"2023-10-18T05:44:49Z","title":"VST++: Efficient and Stronger Visual Saliency Transformer","summary":" While previous CNN-based models have exhibited promising results for salient\nobject detection (SOD), their ability to explore global long-range dependencies\nis restricted. Our previous work, the Visual Saliency Transformer (VST),\naddressed this constraint from a transformer-based sequence-to-sequence\nperspective, to unify RGB and RGB-D SOD. In VST, we developed a multi-task\ntransformer decoder that concurrently predicts saliency and boundary outcomes\nin a pure transformer architecture. Moreover, we introduced a novel token\nupsampling method called reverse T2T for predicting a high-resolution saliency\nmap effortlessly within transformer-based structures. Building upon the VST\nmodel, we further propose an efficient and stronger VST version in this work,\ni.e. VST++. To mitigate the computational costs of the VST model, we propose a\nSelect-Integrate Attention (SIA) module, partitioning foreground into\nfine-grained segments and aggregating background information into a single\ncoarse-grained token. To incorporate 3D depth information with low cost, we\ndesign a novel depth position encoding method tailored for depth maps.\nFurthermore, we introduce a token-supervised prediction loss to provide\nstraightforward guidance for the task-related tokens. We evaluate our VST++\nmodel across various transformer-based backbones on RGB, RGB-D, and RGB-T SOD\nbenchmark datasets. Experimental results show that our model outperforms\nexisting methods while achieving a 25% reduction in computational costs without\nsignificant performance compromise. The demonstrated strong ability for\ngeneralization, enhanced performance, and heightened efficiency of our VST++\nmodel highlight its potential.\n","authors":["Nian Liu","Ziyang Luo","Ni Zhang","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2310.11725v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.00349v2","updated":"2024-04-11T08:08:10Z","published":"2023-01-01T05:02:46Z","title":"Towards Reliable Medical Image Segmentation by utilizing Evidential\n Calibrated Uncertainty","summary":" Medical image segmentation is critical for disease diagnosis and treatment\nassessment. However, concerns regarding the reliability of segmentation regions\npersist among clinicians, mainly attributed to the absence of confidence\nassessment, robustness, and calibration to accuracy. To address this, we\nintroduce DEviS, an easily implementable foundational model that seamlessly\nintegrates into various medical image segmentation networks. DEviS not only\nenhances the calibration and robustness of baseline segmentation accuracy but\nalso provides high-efficiency uncertainty estimation for reliable predictions.\nBy leveraging subjective logic theory, we explicitly model probability and\nuncertainty for the problem of medical image segmentation. Here, the Dirichlet\ndistribution parameterizes the distribution of probabilities for different\nclasses of the segmentation results. 
To generate calibrated predictions and\nuncertainty, we develop a trainable calibrated uncertainty penalty.\nFurthermore, DEviS incorporates an uncertainty-aware filtering module, which\nutilizes the metric of uncertainty-calibrated error to filter reliable data\nwithin the dataset. We conducted validation studies to assess both the accuracy\nand robustness of DEviS segmentation, along with evaluating the efficiency and\nreliability of uncertainty estimation. These evaluations were performed using\npublicly available datasets including ISIC2018, LiTS2017, and BraTS2019.\nAdditionally, two potential clinical trials are being conducted at Johns\nHopkins OCT, Duke-OCT-DME, and FIVES datasets to demonstrate their efficacy in\nfiltering high-quality or out-of-distribution data. Our code has been released\nin https://github.com/Cocofeat/DEviS.\n","authors":["Ke Zou","Yidi Chen","Ling Huang","Xuedong Yuan","Xiaojing Shen","Meng Wang","Rick Siow Mong Goh","Yong Liu","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2301.00349v2.pdf","comment":"34 pages, 11 figures"},{"id":"http://arxiv.org/abs/2306.00696v2","updated":"2024-04-11T08:03:25Z","published":"2023-06-01T14:06:48Z","title":"Analyzing the Internals of Neural Radiance Fields","summary":" Modern Neural Radiance Fields (NeRFs) learn a mapping from position to\nvolumetric density leveraging proposal network samplers. In contrast to the\ncoarse-to-fine sampling approach with two NeRFs, this offers significant\npotential for acceleration using lower network capacity. Given that NeRFs\nutilize most of their network capacity to estimate radiance, they could store\nvaluable density information in their parameters or their deep features. To\ninvestigate this proposition, we take one step back and analyze large, trained\nReLU-MLPs used in coarse-to-fine sampling. Building on our novel activation\nvisualization method, we find that trained NeRFs, Mip-NeRFs and proposal\nnetwork samplers map samples with high density to local minima along a ray in\nactivation feature space. We show how these large MLPs can be accelerated by\ntransforming intermediate activations to a weight estimate, without any\nmodifications to the training protocol or the network architecture. With our\napproach, we can reduce the computational requirements of trained NeRFs by up\nto 50% with only a slight hit in rendering quality. Extensive experimental\nevaluation on a variety of datasets and architectures demonstrates the\neffectiveness of our approach. Consequently, our methodology provides valuable\ninsight into the inner workings of NeRFs.\n","authors":["Lukas Radl","Andreas Kurz","Michael Steiner","Markus Steinberger"],"pdf_url":"https://arxiv.org/pdf/2306.00696v2.pdf","comment":"Accepted to CVPRW'24! Project Page:\n https://r4dl.github.io/nerfinternals/"},{"id":"http://arxiv.org/abs/2404.07537v1","updated":"2024-04-11T08:03:23Z","published":"2024-04-11T08:03:23Z","title":"How is Visual Attention Influenced by Text Guidance? Database and Model","summary":" The analysis and prediction of visual attention have long been crucial tasks\nin the fields of computer vision and image processing. In practical\napplications, images are generally accompanied by various text descriptions,\nhowever, few studies have explored the influence of text descriptions on visual\nattention, let alone developed visual saliency prediction models considering\ntext guidance. 
In this paper, we conduct a comprehensive study on text-guided\nimage saliency (TIS) from both subjective and objective perspectives.\nSpecifically, we construct a TIS database named SJTU-TIS, which includes 1200\ntext-image pairs and the corresponding collected eye-tracking data. Based on\nthe established SJTU-TIS database, we analyze the influence of various text\ndescriptions on visual attention. Then, to facilitate the development of\nsaliency prediction models considering text influence, we construct a benchmark\nfor the established SJTU-TIS database using state-of-the-art saliency models.\nFinally, considering the effect of text descriptions on visual attention, while\nmost existing saliency models ignore this impact, we further propose a\ntext-guided saliency (TGSal) prediction model, which extracts and integrates\nboth image features and text features to predict the image saliency under\nvarious text-description conditions. Our proposed model significantly\noutperforms the state-of-the-art saliency models on both the SJTU-TIS database\nand the pure image saliency databases in terms of various evaluation metrics.\nThe SJTU-TIS database and the code of the proposed TGSal model will be released\nat: https://github.com/IntMeGroup/TGSal.\n","authors":["Yinan Sun","Xiongkuo Min","Huiyu Duan","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2404.07537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09107v2","updated":"2024-04-11T07:42:43Z","published":"2024-03-14T05:00:29Z","title":"S^2MVTC: a Simple yet Efficient Scalable Multi-View Tensor Clustering","summary":" Anchor-based large-scale multi-view clustering has attracted considerable\nattention for its effectiveness in handling massive datasets. However, current\nmethods mainly seek the consensus embedding feature for clustering by exploring\nglobal correlations between anchor graphs or projection matrices.In this paper,\nwe propose a simple yet efficient scalable multi-view tensor clustering\n(S^2MVTC) approach, where our focus is on learning correlations of embedding\nfeatures within and across views. Specifically, we first construct the\nembedding feature tensor by stacking the embedding features of different views\ninto a tensor and rotating it. Additionally, we build a novel tensor\nlow-frequency approximation (TLFA) operator, which incorporates graph\nsimilarity into embedding feature learning, efficiently achieving smooth\nrepresentation of embedding features within different views. Furthermore,\nconsensus constraints are applied to embedding features to ensure inter-view\nsemantic consistency. Experimental results on six large-scale multi-view\ndatasets demonstrate that S^2MVTC significantly outperforms state-of-the-art\nalgorithms in terms of clustering performance and CPU execution time,\nespecially when handling massive data. The code of S^2MVTC is publicly\navailable at https://github.com/longzhen520/S2MVTC.\n","authors":["Zhen Long","Qiyuan Wang","Yazhou Ren","Yipeng Liu","Ce Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.09107v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.07520v1","updated":"2024-04-11T07:26:00Z","published":"2024-04-11T07:26:00Z","title":"PromptSync: Bridging Domain Gaps in Vision-Language Models through\n Class-Aware Prototype Alignment and Discrimination","summary":" The potential for zero-shot generalization in vision-language (V-L) models\nsuch as CLIP has spurred their widespread adoption in addressing numerous\ndownstream tasks. 
Previous methods have employed test-time prompt tuning to\nadapt the model to unseen domains, but they overlooked the issue of imbalanced\nclass distributions. In this study, we explicitly address this problem by\nemploying class-aware prototype alignment weighted by mean class probabilities\nobtained for the test sample and filtered augmented views. Additionally, we\nensure that the class probabilities are as accurate as possible by performing\nprototype discrimination using contrastive learning. The combination of\nalignment and discriminative loss serves as a geometric regularizer, preventing\nthe prompt representation from collapsing onto a single class and effectively\nbridging the distribution gap between the source and test domains. Our method,\nnamed PromptSync, synchronizes the prompts for each test sample on both the\ntext and vision branches of the V-L model. In empirical evaluations on the\ndomain generalization benchmark, our method outperforms previous best methods\nby 2.33\\% in overall performance, by 1\\% in base-to-novel generalization, and\nby 2.84\\% in cross-dataset transfer tasks.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2404.07520v1.pdf","comment":"Accepted at CVPR 2024 LIMIT, 12 pages, 8 Tables, 2 Figures"},{"id":"http://arxiv.org/abs/2404.06859v2","updated":"2024-04-11T07:24:59Z","published":"2024-04-10T09:35:36Z","title":"Multi-Label Continual Learning for the Medical Domain: A Novel Benchmark","summary":" Multi-label image classification in dynamic environments is a problem that\nposes significant challenges. Previous studies have primarily focused on\nscenarios such as Domain Incremental Learning and Class Incremental Learning,\nwhich do not fully capture the complexity of real-world applications. In this\npaper, we study the problem of classification of medical imaging in the\nscenario termed New Instances and New Classes, which combines the challenges of\nboth new class arrivals and domain shifts in a single framework. Unlike\ntraditional scenarios, it reflects the realistic nature of CL in domains such\nas medical imaging, where updates may introduce both new classes and changes in\ndomain characteristics. To address the unique challenges posed by this complex\nscenario, we introduce a novel approach called Pseudo-Label Replay. This method\naims to mitigate forgetting while adapting to new classes and domain shifts by\ncombining the advantages of the Replay and Pseudo-Label methods and solving\ntheir limitations in the proposed scenario. We evaluate our proposed approach\non a challenging benchmark consisting of two datasets, seven tasks, and\nnineteen classes, modeling a realistic Continual Learning scenario. Our\nexperimental findings demonstrate the effectiveness of Pseudo-Label Replay in\naddressing the challenges posed by the complex scenario proposed. Our method\nsurpasses existing approaches, exhibiting superior performance while showing\nminimal forgetting.\n","authors":["Marina Ceccon","Davide Dalle Pezze","Alessandro Fabris","Gian Antonio Susto"],"pdf_url":"https://arxiv.org/pdf/2404.06859v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07518v1","updated":"2024-04-11T07:22:14Z","published":"2024-04-11T07:22:14Z","title":"Remembering Transformer for Continual Learning","summary":" Neural networks encounter the challenge of Catastrophic Forgetting (CF) in\ncontinual learning, where new task knowledge interferes with previously learned\nknowledge. 
We propose Remembering Transformer, inspired by the brain's\nComplementary Learning Systems (CLS), to tackle this issue. Remembering\nTransformer employs a mixture-of-adapters and a generative model-based routing\nmechanism to alleviate CF by dynamically routing task data to relevant\nadapters. Our approach demonstrated a new SOTA performance in various vision\ncontinual learning tasks and great parameter efficiency.\n","authors":["Yuwei Sun","Jun Sakuma","Ryota Kanai"],"pdf_url":"https://arxiv.org/pdf/2404.07518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16073v2","updated":"2024-04-11T07:20:52Z","published":"2023-10-24T14:59:51Z","title":"FloCoDe: Unbiased Dynamic Scene Graph Generation with Temporal\n Consistency and Correlation Debiasing","summary":" Dynamic scene graph generation (SGG) from videos requires not only a\ncomprehensive understanding of objects across scenes but also a method to\ncapture the temporal motions and interactions with different objects. Moreover,\nthe long-tailed distribution of visual relationships is a crucial bottleneck\nfor most dynamic SGG methods. This is because many of them focus on capturing\nspatio-temporal context using complex architectures, leading to the generation\nof biased scene graphs. To address these challenges, we propose\n\\textsc{FloCoDe}: \\textbf{Flo}w-aware Temporal Consistency and\n\\textbf{Co}rrelation \\textbf{De}biasing with uncertainty attenuation for\nunbiased dynamic scene graphs. \\textsc{FloCoDe} employs feature warping using\nflow to detect temporally consistent objects across frames. To address the\nlong-tail issue of visual relationships, we propose correlation debiasing and a\nlabel correlation-based loss to learn unbiased relation representations for\nlong-tailed classes. Specifically, we propose to incorporate label correlations\nusing contrastive loss to capture commonly co-occurring relations, which aids\nin learning robust representations for long-tailed classes. Further, we adopt\nthe uncertainty attenuation-based classifier framework to handle noisy\nannotations in the SGG data. Extensive experimental evaluation shows a\nperformance gain as high as 4.1\\%, demonstrating the superiority of generating\nmore unbiased scene graphs.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2310.16073v2.pdf","comment":"Accepted at CVPR 2024 SG2RL, 11 pages, 5 tables, 4 figures"},{"id":"http://arxiv.org/abs/2404.05426v2","updated":"2024-04-11T07:12:35Z","published":"2024-04-08T11:54:49Z","title":"Test-Time Zero-Shot Temporal Action Localization","summary":" Zero-Shot Temporal Action Localization (ZS-TAL) seeks to identify and locate\nactions in untrimmed videos unseen during training. Existing ZS-TAL methods\ninvolve fine-tuning a model on a large amount of annotated training data. While\neffective, training-based ZS-TAL approaches assume the availability of labeled\ndata for supervised learning, which can be impractical in some applications.\nFurthermore, the training process naturally induces a domain bias into the\nlearned model, which may adversely affect the model's generalization ability to\narbitrary videos. These considerations prompt us to approach the ZS-TAL problem\nfrom a radically novel perspective, relaxing the requirement for training data.\nTo this aim, we introduce a novel method that performs Test-Time adaptation for\nTemporal Action Localization (T3AL). In a nutshell, T3AL adapts a pre-trained\nVision and Language Model (VLM). T3AL operates in three steps. 
First, a\nvideo-level pseudo-label of the action category is computed by aggregating\ninformation from the entire video. Then, action localization is performed\nadopting a novel procedure inspired by self-supervised learning. Finally,\nframe-level textual descriptions extracted with a state-of-the-art captioning\nmodel are employed for refining the action region proposals. We validate the\neffectiveness of T3AL by conducting experiments on the THUMOS14 and the\nActivityNet-v1.3 datasets. Our results demonstrate that T3AL significantly\noutperforms zero-shot baselines based on state-of-the-art VLMs, confirming the\nbenefit of a test-time adaptation approach.\n","authors":["Benedetta Liberatori","Alessandro Conti","Paolo Rota","Yiming Wang","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2404.05426v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.07514v1","updated":"2024-04-11T07:11:43Z","published":"2024-04-11T07:11:43Z","title":"Generalization Gap in Data Augmentation: Insights from Illumination","summary":" In the field of computer vision, data augmentation is widely used to enrich\nthe feature complexity of training datasets with deep learning techniques.\nHowever, regarding the generalization capabilities of models, the difference in\nartificial features generated by data augmentation and natural visual features\nhas not been fully revealed. This study focuses on the visual representation\nvariable 'illumination', by simulating its distribution degradation and\nexamining how data augmentation techniques enhance model performance on a\nclassification task. Our goal is to investigate the differences in\ngeneralization between models trained with augmented data and those trained\nunder real-world illumination conditions. Results indicate that after\nundergoing various data augmentation methods, model performance has been\nsignificantly improved. Yet, a noticeable generalization gap still exists after\nutilizing various data augmentation methods, emphasizing the critical role of\nfeature diversity in the training set for enhancing model generalization.\n","authors":["Jianqiang Xiao","Weiwen Guo","Junfeng Liu","Mengze Li"],"pdf_url":"https://arxiv.org/pdf/2404.07514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01446v2","updated":"2024-04-11T06:58:18Z","published":"2024-04-01T19:33:41Z","title":"Finding Regions of Interest in Whole Slide Images Using Multiple\n Instance Learning","summary":" Whole Slide Images (WSI), obtained by high-resolution digital scanning of\nmicroscope slides at multiple scales, are the cornerstone of modern Digital\nPathology. However, they represent a particular challenge to\nAI-based/AI-mediated analysis because pathology labeling is typically done at\nslide-level, instead of tile-level. It is not just that medical diagnostics is\nrecorded at the specimen level, the detection of oncogene mutation is also\nexperimentally obtained, and recorded by initiatives like The Cancer Genome\nAtlas (TCGA), at the slide level. This configures a dual challenge: a)\naccurately predicting the overall cancer phenotype and b) finding out what\ncellular morphologies are associated with it at the tile level. To address\nthese challenges, a weakly supervised Multiple Instance Learning (MIL) approach\nwas explored for two prevalent cancer types, Invasive Breast Carcinoma\n(TCGA-BRCA) and Lung Squamous Cell Carcinoma (TCGA-LUSC). This approach was\nexplored for tumor detection at low magnification levels and TP53 mutations at\nvarious levels. 
Our results show that a novel additive implementation of MIL\nmatched the performance of reference implementation (AUC 0.96), and was only\nslightly outperformed by Attention MIL (AUC 0.97). More interestingly from the\nperspective of the molecular pathologist, these different AI architectures\nidentify distinct sensitivities to morphological features (through the\ndetection of Regions of Interest, RoI) at different amplification levels.\nTellingly, TP53 mutation was most sensitive to features at the higher\napplications where cellular morphology is resolved.\n","authors":["Martim Afonso","Praphulla M. S. Bhawsar","Monjoy Saha","Jonas S. Almeida","Arlindo L. Oliveira"],"pdf_url":"https://arxiv.org/pdf/2404.01446v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07507v1","updated":"2024-04-11T06:55:44Z","published":"2024-04-11T06:55:44Z","title":"Learning to Classify New Foods Incrementally Via Compressed Exemplars","summary":" Food image classification systems play a crucial role in health monitoring\nand diet tracking through image-based dietary assessment techniques. However,\nexisting food recognition systems rely on static datasets characterized by a\npre-defined fixed number of food classes. This contrasts drastically with the\nreality of food consumption, which features constantly changing data.\nTherefore, food image classification systems should adapt to and manage data\nthat continuously evolves. This is where continual learning plays an important\nrole. A challenge in continual learning is catastrophic forgetting, where ML\nmodels tend to discard old knowledge upon learning new information. While\nmemory-replay algorithms have shown promise in mitigating this problem by\nstoring old data as exemplars, they are hampered by the limited capacity of\nmemory buffers, leading to an imbalance between new and previously learned\ndata. To address this, our work explores the use of neural image compression to\nextend buffer size and enhance data diversity. We introduced the concept of\ncontinuously learning a neural compression model to adaptively improve the\nquality of compressed data and optimize the bitrates per pixel (bpp) to store\nmore exemplars. Our extensive experiments, including evaluations on\nfood-specific datasets including Food-101 and VFN-74, as well as the general\ndataset ImageNet-100, demonstrate improvements in classification accuracy. This\nprogress is pivotal in advancing more realistic food recognition systems that\nare capable of adapting to continually evolving data. Moreover, the principles\nand methodologies we've developed hold promise for broader applications,\nextending their benefits to other domains of continual machine learning\nsystems.\n","authors":["Justin Yang","Zhihao Duan","Jiangpeng He","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.07507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15430v2","updated":"2024-04-11T06:40:12Z","published":"2024-02-23T16:50:07Z","title":"Hierarchical Invariance for Robust and Interpretable Vision Tasks at\n Larger Scales","summary":" Developing robust and interpretable vision systems is a crucial step towards\ntrustworthy artificial intelligence. In this regard, a promising paradigm\nconsiders embedding task-required invariant structures, e.g., geometric\ninvariance, in the fundamental image representation. However, such invariant\nrepresentations typically exhibit limited discriminability, limiting their\napplications in larger-scale trustworthy vision tasks. 
For this open problem,\nwe conduct a systematic investigation of hierarchical invariance, exploring\nthis topic from theoretical, practical, and application perspectives. At the\ntheoretical level, we show how to construct over-complete invariants with a\nConvolutional Neural Networks (CNN)-like hierarchical architecture yet in a\nfully interpretable manner. The general blueprint, specific definitions,\ninvariant properties, and numerical implementations are provided. At the\npractical level, we discuss how to customize this theoretical framework into a\ngiven task. With the over-completeness, discriminative features w.r.t. the task\ncan be adaptively formed in a Neural Architecture Search (NAS)-like manner. We\ndemonstrate the above arguments with accuracy, invariance, and efficiency\nresults on texture, digit, and parasite classification experiments.\nFurthermore, at the application level, our representations are explored in\nreal-world forensics tasks on adversarial perturbations and Artificial\nIntelligence Generated Content (AIGC). Such applications reveal that the\nproposed strategy not only realizes the theoretically promised invariance, but\nalso exhibits competitive discriminability even in the era of deep learning.\nFor robust and interpretable vision tasks at larger scales, hierarchical\ninvariant representation can be considered as an effective alternative to\ntraditional CNN and invariants.\n","authors":["Shuren Qi","Yushu Zhang","Chao Wang","Zhihua Xia","Xiaochun Cao","Jian Weng"],"pdf_url":"https://arxiv.org/pdf/2402.15430v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07504v1","updated":"2024-04-11T06:39:53Z","published":"2024-04-11T06:39:53Z","title":"Mitigating Object Dependencies: Improving Point Cloud Self-Supervised\n Learning through Object Exchange","summary":" In the realm of point cloud scene understanding, particularly in indoor\nscenes, objects are arranged following human habits, resulting in objects of\ncertain semantics being closely positioned and displaying notable inter-object\ncorrelations. This can create a tendency for neural networks to exploit these\nstrong dependencies, bypassing the individual object patterns. To address this\nchallenge, we introduce a novel self-supervised learning (SSL) strategy. Our\napproach leverages both object patterns and contextual cues to produce robust\nfeatures. It begins with the formulation of an object-exchanging strategy,\nwhere pairs of objects with comparable sizes are exchanged across different\nscenes, effectively disentangling the strong contextual dependencies.\nSubsequently, we introduce a context-aware feature learning strategy, which\nencodes object patterns without relying on their specific context by\naggregating object features across various scenes. Our extensive experiments\ndemonstrate the superiority of our method over existing SSL techniques, further\nshowing its better robustness to environmental changes. 
Moreover, we showcase\nthe applicability of our approach by transferring pre-trained models to diverse\npoint cloud datasets.\n","authors":["Yanhao Wu","Tong Zhang","Wei Ke","Congpei Qiu","Sabine Susstrunk","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2404.07504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08801v3","updated":"2024-04-11T06:25:41Z","published":"2024-02-05T12:33:37Z","title":"CoBra: Complementary Branch Fusing Class and Semantic Knowledge for\n Robust Weakly Supervised Semantic Segmentation","summary":" Leveraging semantically precise pseudo masks derived from image-level class\nknowledge for segmentation, namely image-level Weakly Supervised Semantic\nSegmentation (WSSS), still remains challenging. While Class Activation Maps\n(CAMs) using CNNs have steadily been contributing to the success of WSSS, the\nresulting activation maps often narrowly focus on class-specific parts (e.g.,\nonly face of human). On the other hand, recent works based on vision\ntransformers (ViT) have shown promising results based on their self-attention\nmechanism to capture the semantic parts but fail in capturing complete\nclass-specific details (e.g., entire body parts of human but also with a dog\nnearby). In this work, we propose Complementary Branch (CoBra), a novel dual\nbranch framework consisting of two distinct architectures which provide\nvaluable complementary knowledge of class (from CNN) and semantic (from ViT) to\neach branch. In particular, we learn Class-Aware Projection (CAP) for the CNN\nbranch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly\nfuse their complementary knowledge and facilitate a new type of extra\npatch-level supervision. Our model, through CoBra, fuses CNN and ViT's\ncomplementary outputs to create robust pseudo masks that integrate both class\nand semantic information effectively. Extensive experiments qualitatively and\nquantitatively investigate how CNN and ViT complement each other on the PASCAL\nVOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not\nonly the masks generated by our model, but also the segmentation results\nderived from utilizing these masks as pseudo labels.\n","authors":["Woojung Han","Seil Kang","Kyobin Choo","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.08801v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.07868v2","updated":"2024-04-11T06:21:29Z","published":"2023-01-19T03:42:56Z","title":"MV-Adapter: Multimodal Video Transfer Learning for Video Text Retrieval","summary":" State-of-the-art video-text retrieval (VTR) methods typically involve fully\nfine-tuning a pre-trained model (e.g. CLIP) on specific datasets. However, this\ncan result in significant storage costs in practical applications as a separate\nmodel per task must be stored. To address this issue, we present our pioneering\nwork that enables parameter-efficient VTR using a pre-trained model, with only\na small number of tunable parameters during training. Towards this goal, we\npropose a new method dubbed Multimodal Video Adapter (MV-Adapter) for\nefficiently transferring the knowledge in the pre-trained CLIP from image-text\nto video-text. Specifically, MV-Adapter utilizes bottleneck structures in both\nvideo and text branches, along with two novel components. The first is a\nTemporal Adaptation Module that is incorporated in the video branch to\nintroduce global and local temporal contexts. We also train weights\ncalibrations to adjust to dynamic variations across frames. 
The second is Cross\nModality Tying that generates weights for video/text branches through sharing\ncross modality factors, for better aligning between modalities. Thanks to above\ninnovations, MV-Adapter can achieve comparable or better performance than\nstandard full fine-tuning with negligible parameters overhead. Notably,\nMV-Adapter consistently outperforms various competing methods in V2T/T2V tasks\nwith large margins on five widely used VTR benchmarks (MSR-VTT, MSVD, LSMDC,\nDiDemo, and ActivityNet).\n","authors":["Xiaojie Jin","Bowen Zhang","Weibo Gong","Kai Xu","XueQing Deng","Peng Wang","Zhao Zhang","Xiaohui Shen","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2301.07868v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07495v1","updated":"2024-04-11T06:06:56Z","published":"2024-04-11T06:06:56Z","title":"PillarTrack: Redesigning Pillar-based Transformer Network for Single\n Object Tracking on Point Clouds","summary":" LiDAR-based 3D single object tracking (3D SOT) is a critical issue in\nrobotics and autonomous driving. It aims to obtain accurate 3D BBox from the\nsearch area based on similarity or motion. However, existing 3D SOT methods\nusually follow the point-based pipeline, where the sampling operation\ninevitably leads to redundant or lost information, resulting in unexpected\nperformance. To address these issues, we propose PillarTrack, a pillar-based 3D\nsingle object tracking framework. Firstly, we transform sparse point clouds\ninto dense pillars to preserve the local and global geometrics. Secondly, we\nintroduce a Pyramid-type Encoding Pillar Feature Encoder (PE-PFE) design to\nhelp the feature representation of each pillar. Thirdly, we present an\nefficient Transformer-based backbone from the perspective of modality\ndifferences. Finally, we construct our PillarTrack tracker based above designs.\nExtensive experiments on the KITTI and nuScenes dataset demonstrate the\nsuperiority of our proposed method. Notably, our method achieves\nstate-of-the-art performance on the KITTI and nuScenes dataset and enables\nreal-time tracking speed. We hope our work could encourage the community to\nrethink existing 3D SOT tracker designs.We will open source our code to the\nresearch community in https://github.com/StiphyJay/PillarTrack.\n","authors":["Weisheng Xu","Sifan Zhou","Zhihang Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.07495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07487v1","updated":"2024-04-11T05:51:06Z","published":"2024-04-11T05:51:06Z","title":"Fine-Grained Side Information Guided Dual-Prompts for Zero-Shot Skeleton\n Action Recognition","summary":" Skeleton-based zero-shot action recognition aims to recognize unknown human\nactions based on the learned priors of the known skeleton-based actions and a\nsemantic descriptor space shared by both known and unknown categories. However,\nprevious works focus on establishing the bridges between the known skeleton\nrepresentation space and semantic descriptions space at the coarse-grained\nlevel for recognizing unknown action categories, ignoring the fine-grained\nalignment of these two spaces, resulting in suboptimal performance in\ndistinguishing high-similarity action categories. 
To address these challenges,\nwe propose a novel method via Side information and dual-prompts learning for\nskeleton-based zero-shot action recognition (STAR) at the fine-grained level.\nSpecifically, 1) we decompose the skeleton into several parts based on its\ntopology structure and introduce the side information concerning multi-part\ndescriptions of human body movements for alignment between the skeleton and the\nsemantic space at the fine-grained level; 2) we design the visual-attribute and\nsemantic-part prompts to improve the intra-class compactness within the\nskeleton space and inter-class separability within the semantic space,\nrespectively, to distinguish the high-similarity actions. Extensive experiments\nshow that our method achieves state-of-the-art performance in ZSL and GZSL\nsettings on NTU RGB+D, NTU RGB+D 120, and PKU-MMD datasets.\n","authors":["Yang Chen","Jingcai Guo","Tian He","Ling Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07487v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.00644v3","updated":"2024-04-11T05:48:36Z","published":"2024-03-01T16:25:17Z","title":"Diff-Plugin: Revitalizing Details for Diffusion-based Low-level Tasks","summary":" Diffusion models trained on large-scale datasets have achieved remarkable\nprogress in image synthesis. However, due to the randomness in the diffusion\nprocess, they often struggle with handling diverse low-level tasks that require\ndetails preservation. To overcome this limitation, we present a new Diff-Plugin\nframework to enable a single pre-trained diffusion model to generate\nhigh-fidelity results across a variety of low-level tasks. Specifically, we\nfirst propose a lightweight Task-Plugin module with a dual branch design to\nprovide task-specific priors, guiding the diffusion process in preserving image\ncontent. We then propose a Plugin-Selector that can automatically select\ndifferent Task-Plugins based on the text instruction, allowing users to edit\nimages by indicating multiple low-level tasks with natural language. We conduct\nextensive experiments on 8 low-level vision tasks. The results demonstrate the\nsuperiority of Diff-Plugin over existing methods, particularly in real-world\nscenarios. Our ablations further validate that Diff-Plugin is stable,\nschedulable, and supports robust training across different dataset sizes.\n","authors":["Yuhao Liu","Zhanghan Ke","Fang Liu","Nanxuan Zhao","Rynson W. H. Lau"],"pdf_url":"https://arxiv.org/pdf/2403.00644v3.pdf","comment":"Accepted to CVPR2024. Replaced some celebrity images to avoid\n copyright disputes"},{"id":"http://arxiv.org/abs/2404.06351v2","updated":"2024-04-11T05:17:44Z","published":"2024-04-09T14:42:31Z","title":"HPNet: Dynamic Trajectory Forecasting with Historical Prediction\n Attention","summary":" Predicting the trajectories of road agents is essential for autonomous\ndriving systems. The recent mainstream methods follow a static paradigm, which\npredicts the future trajectory by using a fixed duration of historical frames.\nThese methods make the predictions independently even at adjacent time steps,\nwhich leads to potential instability and temporal inconsistency. As successive\ntime steps have largely overlapping historical frames, their forecasting should\nhave intrinsic correlation, such as overlapping predicted trajectories should\nbe consistent, or be different but share the same motion goal depending on the\nroad situation. 
Motivated by this, in this work, we introduce HPNet, a novel\ndynamic trajectory forecasting method. Aiming for stable and accurate\ntrajectory forecasting, our method leverages not only historical frames\nincluding maps and agent states, but also historical predictions. Specifically,\nwe newly design a Historical Prediction Attention module to automatically\nencode the dynamic relationship between successive predictions. Besides, it\nalso extends the attention range beyond the currently visible window\nbenefitting from the use of historical predictions. The proposed Historical\nPrediction Attention together with the Agent Attention and Mode Attention is\nfurther formulated as the Triple Factorized Attention module, serving as the\ncore design of HPNet.Experiments on the Argoverse and INTERACTION datasets show\nthat HPNet achieves state-of-the-art performance, and generates accurate and\nstable future trajectories. Our code are available at\nhttps://github.com/XiaolongTang23/HPNet.\n","authors":["Xiaolong Tang","Meina Kan","Shiguang Shan","Zhilong Ji","Jinfeng Bai","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.06351v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2404.00511v3","updated":"2024-04-11T05:14:35Z","published":"2024-03-31T01:16:02Z","title":"MIPS at SemEval-2024 Task 3: Multimodal Emotion-Cause Pair Extraction in\n Conversations with Multimodal Language Models","summary":" This paper presents our winning submission to Subtask 2 of SemEval 2024 Task\n3 on multimodal emotion cause analysis in conversations. We propose a novel\nMultimodal Emotion Recognition and Multimodal Emotion Cause Extraction\n(MER-MCE) framework that integrates text, audio, and visual modalities using\nspecialized emotion encoders. Our approach sets itself apart from\ntop-performing teams by leveraging modality-specific features for enhanced\nemotion understanding and causality inference. Experimental evaluation\ndemonstrates the advantages of our multimodal approach, with our submission\nachieving a competitive weighted F1 score of 0.3435, ranking third with a\nmargin of only 0.0339 behind the 1st team and 0.0025 behind the 2nd team.\nProject: https://github.com/MIPS-COLT/MER-MCE.git\n","authors":["Zebang Cheng","Fuqiang Niu","Yuxiang Lin","Zhi-Qi Cheng","Bowen Zhang","Xiaojiang Peng"],"pdf_url":"https://arxiv.org/pdf/2404.00511v3.pdf","comment":"Ranked 3rd in SemEval '24 Task 3 with F1 of 0.3435, close to 1st &\n 2nd by 0.0339 & 0.0025"},{"id":"http://arxiv.org/abs/2404.07474v1","updated":"2024-04-11T04:58:18Z","published":"2024-04-11T04:58:18Z","title":"G-NeRF: Geometry-enhanced Novel View Synthesis from Single-View Images","summary":" Novel view synthesis aims to generate new view images of a given view image\ncollection. Recent attempts address this problem relying on 3D geometry priors\n(e.g., shapes, sizes, and positions) learned from multi-view images. However,\nsuch methods encounter the following limitations: 1) they require a set of\nmulti-view images as training data for a specific scene (e.g., face, car or\nchair), which is often unavailable in many real-world scenarios; 2) they fail\nto extract the geometry priors from single-view images due to the lack of\nmulti-view supervision. In this paper, we propose a Geometry-enhanced NeRF\n(G-NeRF), which seeks to enhance the geometry priors by a geometry-guided\nmulti-view synthesis approach, followed by a depth-aware training. 
In the\nsynthesis process, inspired that existing 3D GAN models can unconditionally\nsynthesize high-fidelity multi-view images, we seek to adopt off-the-shelf 3D\nGAN models, such as EG3D, as a free source to provide geometry priors through\nsynthesizing multi-view data. Simultaneously, to further improve the geometry\nquality of the synthetic data, we introduce a truncation method to effectively\nsample latent codes within 3D GAN models. To tackle the absence of multi-view\nsupervision for single-view images, we design the depth-aware training\napproach, incorporating a depth-aware discriminator to guide geometry priors\nthrough depth maps. Experiments demonstrate the effectiveness of our method in\nterms of both qualitative and quantitative results.\n","authors":["Zixiong Huang","Qi Chen","Libo Sun","Yifan Yang","Naizhou Wang","Mingkui Tan","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2404.07474v1.pdf","comment":"CVPR 2024 Accepted Paper"},{"id":"http://arxiv.org/abs/2404.07473v1","updated":"2024-04-11T04:54:42Z","published":"2024-04-11T04:54:42Z","title":"LUCF-Net: Lightweight U-shaped Cascade Fusion Network for Medical Image\n Segmentation","summary":" In this study, the performance of existing U-shaped neural network\narchitectures was enhanced for medical image segmentation by adding\nTransformer. Although Transformer architectures are powerful at extracting\nglobal information, its ability to capture local information is limited due to\nits high complexity. To address this challenge, we proposed a new lightweight\nU-shaped cascade fusion network (LUCF-Net) for medical image segmentation. It\nutilized an asymmetrical structural design and incorporated both local and\nglobal modules to enhance its capacity for local and global modeling.\nAdditionally, a multi-layer cascade fusion decoding network was designed to\nfurther bolster the network's information fusion capabilities. Validation\nresults achieved on multi-organ datasets in CT format, cardiac segmentation\ndatasets in MRI format, and dermatology datasets in image format demonstrated\nthat the proposed model outperformed other state-of-the-art methods in handling\nlocal-global information, achieving an improvement of 1.54% in Dice coefficient\nand 2.6 mm in Hausdorff distance on multi-organ segmentation. Furthermore, as a\nnetwork that combines Convolutional Neural Network and Transformer\narchitectures, it achieves competitive segmentation performance with only 6.93\nmillion parameters and 6.6 gigabytes of floating point operations, without the\nneed of pre-training. In summary, the proposed method demonstrated enhanced\nperformance while retaining a simpler model design compared to other\nTransformer-based segmentation networks.\n","authors":["Songkai Sun","Qingshan She","Yuliang Ma","Rihui Li","Yingchun Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.07473v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06845v2","updated":"2024-04-11T04:17:13Z","published":"2024-03-11T16:03:35Z","title":"DriveDreamer-2: LLM-Enhanced World Models for Diverse Driving Video\n Generation","summary":" World models have demonstrated superiority in autonomous driving,\nparticularly in the generation of multi-view driving videos. However,\nsignificant challenges still exist in generating customized driving videos. In\nthis paper, we propose DriveDreamer-2, which builds upon the framework of\nDriveDreamer and incorporates a Large Language Model (LLM) to generate\nuser-defined driving videos. 
Specifically, an LLM interface is initially\nincorporated to convert a user's query into agent trajectories. Subsequently, a\nHDMap, adhering to traffic regulations, is generated based on the trajectories.\nUltimately, we propose the Unified Multi-View Model to enhance temporal and\nspatial coherence in the generated driving videos. DriveDreamer-2 is the first\nworld model to generate customized driving videos, it can generate uncommon\ndriving videos (e.g., vehicles abruptly cut in) in a user-friendly manner.\nBesides, experimental results demonstrate that the generated videos enhance the\ntraining of driving perception methods (e.g., 3D detection and tracking).\nFurthermore, video generation quality of DriveDreamer-2 surpasses other\nstate-of-the-art methods, showcasing FID and FVD scores of 11.2 and 55.7,\nrepresenting relative improvements of 30% and 50%.\n","authors":["Guosheng Zhao","Xiaofeng Wang","Zheng Zhu","Xinze Chen","Guan Huang","Xiaoyi Bao","Xingang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.06845v2.pdf","comment":"Project Page: https://drivedreamer2.github.io"},{"id":"http://arxiv.org/abs/2404.07467v1","updated":"2024-04-11T04:14:48Z","published":"2024-04-11T04:14:48Z","title":"Trashbusters: Deep Learning Approach for Litter Detection and Tracking","summary":" The illegal disposal of trash is a major public health and environmental\nconcern. Disposing of trash in unplanned places poses serious health and\nenvironmental risks. We should try to restrict public trash cans as much as\npossible. This research focuses on automating the penalization of litterbugs,\naddressing the persistent problem of littering in public places. Traditional\napproaches relying on manual intervention and witness reporting suffer from\ndelays, inaccuracies, and anonymity issues. To overcome these challenges, this\npaper proposes a fully automated system that utilizes surveillance cameras and\nadvanced computer vision algorithms for litter detection, object tracking, and\nface recognition. The system accurately identifies and tracks individuals\nengaged in littering activities, attaches their identities through face\nrecognition, and enables efficient enforcement of anti-littering policies. By\nreducing reliance on manual intervention, minimizing human error, and providing\nprompt identification, the proposed system offers significant advantages in\naddressing littering incidents. The primary contribution of this research lies\nin the implementation of the proposed system, leveraging advanced technologies\nto enhance surveillance operations and automate the penalization of litterbugs.\n","authors":["Kashish Jain","Manthan Juthani","Jash Jain","Anant V. Nimkar"],"pdf_url":"https://arxiv.org/pdf/2404.07467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10974v4","updated":"2024-04-11T04:14:33Z","published":"2023-07-20T16:00:19Z","title":"Deep Multi-Threshold Spiking-UNet for Image Processing","summary":" U-Net, known for its simple yet efficient architecture, is widely utilized\nfor image processing tasks and is particularly suitable for deployment on\nneuromorphic chips. This paper introduces the novel concept of Spiking-UNet for\nimage processing, which combines the power of Spiking Neural Networks (SNNs)\nwith the U-Net architecture. To achieve an efficient Spiking-UNet, we face two\nprimary challenges: ensuring high-fidelity information propagation through the\nnetwork via spikes and formulating an effective training strategy. 
To address\nthe issue of information loss, we introduce multi-threshold spiking neurons,\nwhich improve the efficiency of information transmission within the\nSpiking-UNet. For the training strategy, we adopt a conversion and fine-tuning\npipeline that leverage pre-trained U-Net models. During the conversion process,\nsignificant variability in data distribution across different parts is observed\nwhen utilizing skip connections. Therefore, we propose a connection-wise\nnormalization method to prevent inaccurate firing rates. Furthermore, we adopt\na flow-based training method to fine-tune the converted models, reducing time\nsteps while preserving performance. Experimental results show that, on image\nsegmentation and denoising, our Spiking-UNet achieves comparable performance to\nits non-spiking counterpart, surpassing existing SNN methods. Compared with the\nconverted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference\ntime by approximately 90\\%. This research broadens the application scope of\nSNNs in image processing and is expected to inspire further exploration in the\nfield of neuromorphic engineering. The code for our Spiking-UNet implementation\nis available at https://github.com/SNNresearch/Spiking-UNet.\n","authors":["Hebei Li","Yueyi Zhang","Zhiwei Xiong","Xiaoyan Sun"],"pdf_url":"https://arxiv.org/pdf/2307.10974v4.pdf","comment":"Accepted in NeuroComputing"},{"id":"http://arxiv.org/abs/2402.16994v2","updated":"2024-04-11T03:44:49Z","published":"2024-02-26T20:00:57Z","title":"GEM3D: GEnerative Medial Abstractions for 3D Shape Synthesis","summary":" We introduce GEM3D -- a new deep, topology-aware generative model of 3D\nshapes. The key ingredient of our method is a neural skeleton-based\nrepresentation encoding information on both shape topology and geometry.\nThrough a denoising diffusion probabilistic model, our method first generates\nskeleton-based representations following the Medial Axis Transform (MAT), then\ngenerates surfaces through a skeleton-driven neural implicit formulation. The\nneural implicit takes into account the topological and geometric information\nstored in the generated skeleton representations to yield surfaces that are\nmore topologically and geometrically accurate compared to previous neural field\nformulations. We discuss applications of our method in shape synthesis and\npoint cloud reconstruction tasks, and evaluate our method both qualitatively\nand quantitatively. We demonstrate significantly more faithful surface\nreconstruction and diverse shape generation results compared to the\nstate-of-the-art, also involving challenging scenarios of reconstructing and\nsynthesizing structurally complex, high-genus shape surfaces from Thingi10K and\nShapeNet.\n","authors":["Dmitry Petrov","Pradyumn Goyal","Vikas Thamizharasan","Vladimir G. Kim","Matheus Gadelha","Melinos Averkiou","Siddhartha Chaudhuri","Evangelos Kalogerakis"],"pdf_url":"https://arxiv.org/pdf/2402.16994v2.pdf","comment":"Webpage: https://lodurality.github.io/GEM3D/ -- Cond. accept. to\n SIGGRAPH 2024 (conf. track) -- Changes (based on reviews): changed style to\n sigconf; rearranged figures for readability; added missing citations; fixed\n misaligned centers in Fig. 3; added failure cases (Fig. 10); rewrote\n discussion; added categories averages to Tab. 8; added Tab. 
10 with model\n capacities"},{"id":"http://arxiv.org/abs/2404.07449v1","updated":"2024-04-11T03:09:34Z","published":"2024-04-11T03:09:34Z","title":"Learning to Localize Objects Improves Spatial Reasoning in Visual-LLMs","summary":" Integration of Large Language Models (LLMs) into visual domain tasks,\nresulting in visual-LLMs (V-LLMs), has enabled exceptional performance in\nvision-language tasks, particularly for visual question answering (VQA).\nHowever, existing V-LLMs (e.g. BLIP-2, LLaVA) demonstrate weak spatial\nreasoning and localization awareness. Despite generating highly descriptive and\nelaborate textual answers, these models fail at simple tasks like\ndistinguishing a left vs right location. In this work, we explore how\nimage-space coordinate based instruction fine-tuning objectives could inject\nspatial awareness into V-LLMs. We discover optimal coordinate representations,\ndata-efficient instruction fine-tuning objectives, and pseudo-data generation\nstrategies that lead to improved spatial awareness in V-LLMs. Additionally, our\nresulting model improves VQA across image and video domains, reduces undesired\nhallucination, and generates better contextual object descriptions. Experiments\nacross 5 vision-language tasks involving 14 different datasets establish the\nclear performance improvements achieved by our proposed framework.\n","authors":["Kanchana Ranasinghe","Satya Narayan Shukla","Omid Poursaeed","Michael S. Ryoo","Tsung-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2404.07449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07448v1","updated":"2024-04-11T03:08:53Z","published":"2024-04-11T03:08:53Z","title":"Transferable and Principled Efficiency for Open-Vocabulary Segmentation","summary":" Recent success of pre-trained foundation vision-language models makes\nOpen-Vocabulary Segmentation (OVS) possible. Despite the promising performance,\nthis approach introduces heavy computational overheads for two challenges: 1)\nlarge model sizes of the backbone; 2) expensive costs during the fine-tuning.\nThese challenges hinder this OVS strategy from being widely applicable and\naffordable in real-world scenarios. Although traditional methods such as model\ncompression and efficient fine-tuning can address these challenges, they often\nrely on heuristics. This means that their solutions cannot be easily\ntransferred and necessitate re-training on different models, which comes at a\ncost. In the context of efficient OVS, we target achieving performance that is\ncomparable to or even better than prior OVS works based on large\nvision-language foundation models, by utilizing smaller models that incur lower\ntraining costs. The core strategy is to make our efficiency principled and thus\nseamlessly transferable from one OVS framework to others without further\ncustomization. Comprehensive experiments on diverse OVS benchmarks demonstrate\nour superior trade-off between segmentation accuracy and computation costs over\nprevious works. 
Our code is available on https://github.com/Xujxyang/OpenTrans\n","authors":["Jingxuan Xu","Wuyang Chen","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2404.07448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16923v2","updated":"2024-04-11T03:01:41Z","published":"2024-01-30T11:46:27Z","title":"Fourier Prompt Tuning for Modality-Incomplete Scene Segmentation","summary":" Integrating information from multiple modalities enhances the robustness of\nscene perception systems in autonomous vehicles, providing a more comprehensive\nand reliable sensory framework. However, the modality incompleteness in\nmulti-modal segmentation remains under-explored. In this work, we establish a\ntask called Modality-Incomplete Scene Segmentation (MISS), which encompasses\nboth system-level modality absence and sensor-level modality errors. To avoid\nthe predominant modality reliance in multi-modal fusion, we introduce a\nMissing-aware Modal Switch (MMS) strategy to proactively manage missing\nmodalities during training. Utilizing bit-level batch-wise sampling enhances\nthe model's performance in both complete and incomplete testing scenarios.\nFurthermore, we introduce the Fourier Prompt Tuning (FPT) method to incorporate\nrepresentative spectral information into a limited number of learnable prompts\nthat maintain robustness against all MISS scenarios. Akin to fine-tuning\neffects but with fewer tunable parameters (1.1%). Extensive experiments prove\nthe efficacy of our proposed approach, showcasing an improvement of 5.84% mIoU\nover the prior state-of-the-art parameter-efficient methods in modality\nmissing. The source code is publicly available at\nhttps://github.com/RuipingL/MISS.\n","authors":["Ruiping Liu","Jiaming Zhang","Kunyu Peng","Yufan Chen","Ke Cao","Junwei Zheng","M. Saquib Sarfraz","Kailun Yang","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2401.16923v2.pdf","comment":"Accepted to IEEE IV 2024. The source code is publicly available at\n https://github.com/RuipingL/MISS"},{"id":"http://arxiv.org/abs/2404.07445v1","updated":"2024-04-11T03:00:00Z","published":"2024-04-11T03:00:00Z","title":"Multi-view Aggregation Network for Dichotomous Image Segmentation","summary":" Dichotomous Image Segmentation (DIS) has recently emerged towards\nhigh-precision object segmentation from high-resolution natural images.\n When designing an effective DIS model, the main challenge is how to balance\nthe semantic dispersion of high-resolution targets in the small receptive field\nand the loss of high-precision details in the large receptive field. Existing\nmethods rely on tedious multiple encoder-decoder streams and stages to\ngradually complete the global localization and local refinement.\n Human visual system captures regions of interest by observing them from\nmultiple views. Inspired by it, we model DIS as a multi-view object perception\nproblem and provide a parsimonious multi-view aggregation network (MVANet),\nwhich unifies the feature fusion of the distant view and close-up view into a\nsingle stream with one encoder-decoder structure. 
With the help of the proposed\nmulti-view complementary localization and refinement modules, our approach\nestablished long-range, profound visual interactions across multiple views,\nallowing the features of the detailed close-up view to focus on highly slender\nstructures.Experiments on the popular DIS-5K dataset show that our MVANet\nsignificantly outperforms state-of-the-art methods in both accuracy and speed.\nThe source code and datasets will be publicly available at\n\\href{https://github.com/qianyu-dlut/MVANet}{MVANet}.\n","authors":["Qian Yu","Xiaoqi Zhao","Youwei Pang","Lihe Zhang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2404.07445v1.pdf","comment":"Accepted by CVPR2024 as Highlight"},{"id":"http://arxiv.org/abs/2310.14576v2","updated":"2024-04-11T02:57:21Z","published":"2023-10-23T05:25:49Z","title":"Tensor Decomposition Based Attention Module for Spiking Neural Networks","summary":" The attention mechanism has been proven to be an effective way to improve\nspiking neural network (SNN). However, based on the fact that the current SNN\ninput data flow is split into tensors to process on GPUs, none of the previous\nworks consider the properties of tensors to implement an attention module. This\ninspires us to rethink current SNN from the perspective of tensor-relevant\ntheories. Using tensor decomposition, we design the \\textit{projected full\nattention} (PFA) module, which demonstrates excellent results with linearly\ngrowing parameters. Specifically, PFA is composed by the \\textit{linear\nprojection of spike tensor} (LPST) module and \\textit{attention map composing}\n(AMC) module. In LPST, we start by compressing the original spike tensor into\nthree projected tensors using a single property-preserving strategy with\nlearnable parameters for each dimension. Then, in AMC, we exploit the inverse\nprocedure of the tensor decomposition process to combine the three tensors into\nthe attention map using a so-called connecting factor. To validate the\neffectiveness of the proposed PFA module, we integrate it into the widely used\nVGG and ResNet architectures for classification tasks. Our method achieves\nstate-of-the-art performance on both static and dynamic benchmark datasets,\nsurpassing the existing SNN models with Transformer-based and CNN-based\nbackbones.\n","authors":["Haoyu Deng","Ruijie Zhu","Xuerui Qiu","Yule Duan","Malu Zhang","Liangjian Deng"],"pdf_url":"https://arxiv.org/pdf/2310.14576v2.pdf","comment":"Accepted by Knowledge-Based Systems"},{"id":"http://arxiv.org/abs/2403.17920v2","updated":"2024-04-11T02:42:59Z","published":"2024-03-26T17:55:11Z","title":"TC4D: Trajectory-Conditioned Text-to-4D Generation","summary":" Recent techniques for text-to-4D generation synthesize dynamic 3D scenes\nusing supervision from pre-trained text-to-video models. However, existing\nrepresentations for motion, such as deformation models or time-dependent neural\nrepresentations, are limited in the amount of motion they can generate-they\ncannot synthesize motion extending far beyond the bounding box used for volume\nrendering. The lack of a more flexible motion model contributes to the gap in\nrealism between 4D generation methods and recent, near-photorealistic video\ngeneration models. Here, we propose TC4D: trajectory-conditioned text-to-4D\ngeneration, which factors motion into global and local components. We represent\nthe global motion of a scene's bounding box using rigid transformation along a\ntrajectory parameterized by a spline. 
We learn local deformations that conform\nto the global trajectory using supervision from a text-to-video model. Our\napproach enables the synthesis of scenes animated along arbitrary trajectories,\ncompositional scene generation, and significant improvements to the realism and\namount of generated motion, which we evaluate qualitatively and through a user\nstudy. Video results can be viewed on our website:\nhttps://sherwinbahmani.github.io/tc4d.\n","authors":["Sherwin Bahmani","Xian Liu","Yifan Wang","Ivan Skorokhodov","Victor Rong","Ziwei Liu","Xihui Liu","Jeong Joon Park","Sergey Tulyakov","Gordon Wetzstein","Andrea Tagliasacchi","David B. Lindell"],"pdf_url":"https://arxiv.org/pdf/2403.17920v2.pdf","comment":"Project Page: https://sherwinbahmani.github.io/tc4d"},{"id":"http://arxiv.org/abs/2404.07435v1","updated":"2024-04-11T02:29:08Z","published":"2024-04-11T02:29:08Z","title":"Encoding Urban Ecologies: Automated Building Archetype Generation\n through Self-Supervised Learning for Energy Modeling","summary":" As the global population and urbanization expand, the building sector has\nemerged as the predominant energy consumer and carbon emission contributor. The\nneed for innovative Urban Building Energy Modeling grows, yet existing building\narchetypes often fail to capture the unique attributes of local buildings and\nthe nuanced distinctions between different cities, jeopardizing the precision\nof energy modeling. This paper presents an alternative tool employing\nself-supervised learning to distill complex geometric data into representative,\nlocale-specific archetypes. This study attempts to foster a new paradigm of\ninteraction with built environments, incorporating local parameters to conduct\nbespoke energy simulations at the community level. The catered archetypes can\naugment the precision and applicability of energy consumption modeling at\ndifferent scales across diverse building inventories. This tool provides a\npotential solution that encourages the exploration of emerging local ecologies.\nBy integrating building envelope characteristics and cultural granularity into\nthe building archetype generation process, we seek a future where architecture\nand urban design are intricately interwoven with the energy sector in shaping\nour built environments.\n","authors":["Xinwei Zhuang","Zixun Huang","Wentao Zeng","Luisa Caldas"],"pdf_url":"https://arxiv.org/pdf/2404.07435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10853v4","updated":"2024-04-11T01:56:38Z","published":"2023-07-20T13:16:10Z","title":"Exploring Effective Priors and Efficient Models for Weakly-Supervised\n Change Detection","summary":" Weakly-supervised change detection (WSCD) aims to detect pixel-level changes\nwith only image-level annotations. Owing to its label efficiency, WSCD is\ndrawing increasing attention recently. However, current WSCD methods often\nencounter the challenge of change missing and fabricating, i.e., the\ninconsistency between image-level annotations and pixel-level predictions.\nSpecifically, change missing refer to the situation that the WSCD model fails\nto predict any changed pixels, even though the image-level label indicates\nchanged, and vice versa for change fabricating. 
To address this challenge, in\nthis work, we leverage global-scale and local-scale priors in WSCD and propose\ntwo components: a Dilated Prior (DP) decoder and a Label Gated (LG) constraint.\nThe DP decoder decodes samples with the changed image-level label, skips\nsamples with the unchanged label, and replaces them with an all-unchanged\npixel-level label. The LG constraint is derived from the correspondence between\nchanged representations and image-level labels, penalizing the model when it\nmispredicts the change status. Additionally, we develop TransWCD, a simple yet\npowerful transformer-based model, showcasing the potential of weakly-supervised\nlearning in change detection. By integrating the DP decoder and LG constraint\ninto TransWCD, we form TransWCD-DL. Our proposed TransWCD and TransWCD-DL\nachieve significant +6.33% and +9.55% F1 score improvements over the\nstate-of-the-art methods on the WHU-CD dataset, respectively. Some performance\nmetrics even exceed several fully-supervised change detection (FSCD)\ncompetitors. Code will be available at\nhttps://github.com/zhenghuizhao/TransWCD.\n","authors":["Zhenghui Zhao","Lixiang Ru","Chen Wu"],"pdf_url":"https://arxiv.org/pdf/2307.10853v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07424v1","updated":"2024-04-11T01:33:45Z","published":"2024-04-11T01:33:45Z","title":"CopilotCAD: Empowering Radiologists with Report Completion Models and\n Quantitative Evidence from Medical Image Foundation Models","summary":" Computer-aided diagnosis systems hold great promise to aid radiologists and\nclinicians in radiological clinical practice and enhance diagnostic accuracy\nand efficiency. However, the conventional systems primarily focus on delivering\ndiagnostic results through text report generation or medical image\nclassification, positioning them as standalone decision-makers rather than\nhelpers and ignoring radiologists' expertise. This study introduces an\ninnovative paradigm to create an assistive co-pilot system for empowering\nradiologists by leveraging Large Language Models (LLMs) and medical image\nanalysis tools. Specifically, we develop a collaborative framework to integrate\nLLMs and quantitative medical image analysis results generated by foundation\nmodels with radiologists in the loop, achieving efficient and safe generation\nof radiology reports and effective utilization of computational power of AI and\nthe expertise of medical professionals. This approach empowers radiologists to\ngenerate more precise and detailed diagnostic reports, enhancing patient\noutcomes while reducing the burnout of clinicians. Our methodology underscores\nthe potential of AI as a supportive tool in medical diagnostics, promoting a\nharmonious integration of technology and human expertise to advance the field\nof radiology.\n","authors":["Sheng Wang","Tianming Du","Katherine Fischer","Gregory E Tasian","Justin Ziemba","Joanie M Garratt","Hersh Sagreiya","Yong Fan"],"pdf_url":"https://arxiv.org/pdf/2404.07424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07410v1","updated":"2024-04-11T00:49:38Z","published":"2024-04-11T00:49:38Z","title":"Improving Shift Invariance in Convolutional Neural Networks with\n Translation Invariant Polyphase Sampling","summary":" Downsampling operators break the shift invariance of convolutional neural\nnetworks (CNNs) and this affects the robustness of features learned by CNNs\nwhen dealing with even small pixel-level shift. 
Through a large-scale\ncorrelation analysis framework, we study shift invariance of CNNs by inspecting\nexisting downsampling operators in terms of their maximum-sampling bias (MSB),\nand find that MSB is negatively correlated with shift invariance. Based on this\ncrucial insight, we propose a learnable pooling operator called Translation\nInvariant Polyphase Sampling (TIPS) and two regularizations on the intermediate\nfeature maps of TIPS to reduce MSB and learn translation-invariant\nrepresentations. TIPS can be integrated into any CNN and can be trained\nend-to-end with marginal computational overhead. Our experiments demonstrate\nthat TIPS results in consistent performance gains in terms of accuracy, shift\nconsistency, and shift fidelity on multiple benchmarks for image classification\nand semantic segmentation compared to previous methods and also leads to\nimprovements in adversarial and distributional robustness. TIPS results in the\nlowest MSB compared to all previous methods, thus explaining our strong\nempirical results.\n","authors":["Sourajit Saha","Tejas Gokhale"],"pdf_url":"https://arxiv.org/pdf/2404.07410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07405v1","updated":"2024-04-11T00:45:10Z","published":"2024-04-11T00:45:10Z","title":"Simplifying Two-Stage Detectors for On-Device Inference in Remote\n Sensing","summary":" Deep learning has been successfully applied to object detection from remotely\nsensed images. Images are typically processed on the ground rather than\non-board due to the computation power of the ground system. Such offloaded\nprocessing causes delays in acquiring target mission information, which hinders\nits application to real-time use cases. For on-device object detection,\nresearch has been conducted on designing efficient detectors or model\ncompression to reduce inference latency. However, highly accurate two-stage\ndetectors still need further exploitation for acceleration. In this paper, we\npropose a model simplification method for two-stage object detectors. Instead\nof constructing a general feature pyramid, we utilize only one feature\nextraction in the two-stage detector. To compensate for the accuracy drop, we\napply a high pass filter to the RPN's score map. Our approach is applicable to\nany two-stage detector using a feature pyramid network. In the experiments with\nstate-of-the-art two-stage detectors such as ReDet, Oriented-RCNN, and LSKNet,\nour method reduced computation costs by up to 61.2% with an accuracy loss within\n2.1% on the DOTAv1.5 dataset. Source code will be released.\n","authors":["Jaemin Kang","Hoeseok Yang","Hyungshin Kim"],"pdf_url":"https://arxiv.org/pdf/2404.07405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10300v4","updated":"2024-04-11T00:35:04Z","published":"2023-05-17T15:37:47Z","title":"One-Prompt to Segment All Medical Images","summary":" Large foundation models, known for their strong zero-shot generalization,\nhave excelled in visual and language applications. However, applying them to\nmedical image segmentation, a domain with diverse imaging types and target\nlabels, remains an open challenge. Current approaches, such as adapting\ninteractive segmentation models like Segment Anything Model (SAM), require user\nprompts for each sample during inference. Alternatively, transfer learning\nmethods like few/one-shot models demand labeled samples, leading to high costs.\nThis paper introduces a new paradigm toward universal medical image\nsegmentation, termed 'One-Prompt Segmentation.' 
One-Prompt Segmentation\ncombines the strengths of one-shot and interactive methods. In the inference\nstage, with just \\textbf{one prompted sample}, it can adeptly handle the unseen\ntask in a single forward pass. We train One-Prompt Model on 64 open-source\nmedical datasets, accompanied by the collection of over 3,000 clinician-labeled\nprompts. Tested on 14 previously unseen datasets, the One-Prompt Model\nshowcases superior zero-shot segmentation capabilities, outperforming a wide\nrange of related methods. The code and data is released as\n\\url{https://github.com/KidsWithTokens/one-prompt}.\n","authors":["Junde Wu","Jiayuan Zhu","Yuanpei Liu","Yueming Jin","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2305.10300v4.pdf","comment":"arXiv admin note: text overlap with arXiv:2304.12620"},{"id":"http://arxiv.org/abs/2404.07399v1","updated":"2024-04-11T00:23:28Z","published":"2024-04-11T00:23:28Z","title":"Post-hurricane building damage assessment using street-view imagery and\n structured data: A multi-modal deep learning approach","summary":" Accurately assessing building damage is critical for disaster response and\nrecovery. However, many existing models for detecting building damage have poor\nprediction accuracy due to their limited capabilities of identifying detailed,\ncomprehensive structural and/or non-structural damage from the street-view\nimage. Additionally, these models mainly rely on the imagery data for damage\nclassification, failing to account for other critical information, such as wind\nspeed, building characteristics, evacuation zones, and distance of the building\nto the hurricane track. To address these limitations, in this study, we propose\na novel multi-modal (i.e., imagery and structured data) approach for\npost-hurricane building damage classification, named the Multi-Modal Swin\nTransformer (MMST). We empirically train and evaluate the proposed MMST using\ndata collected from the 2022 Hurricane Ian in Florida, USA. Results show that\nMMST outperforms all selected state-of-the-art benchmark models and can achieve\nan accuracy of 92.67%, which are 7.71% improvement in accuracy compared to\nVisual Geometry Group 16 (VGG-16). In addition to the street-view imagery data,\nbuilding value, building age, and wind speed are the most important predictors\nfor damage level classification. The proposed MMST can be deployed to assist in\nrapid damage assessment and guide reconnaissance efforts in future hurricanes.\n","authors":["Zhuoqun Xue","Xiaojian Zhang","David O. Prevatt","Jennifer Bridge","Susu Xu","Xilei Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.07399v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07395v1","updated":"2024-04-11T00:02:57Z","published":"2024-04-11T00:02:57Z","title":"Global versus Local: Evaluating AlexNet Architectures for Tropical\n Cyclone Intensity Estimation","summary":" Given the destructive impacts of tropical cyclones, it is critical to have a\nreliable system for cyclone intensity detection. Various techniques are\navailable for this purpose, each with differing levels of accuracy. In this\npaper, we introduce two ensemble-based models based on AlexNet architecture to\nestimate tropical cyclone intensity using visible satellite images. The first\nmodel, trained on the entire dataset, is called the global AlexNet model. 
The\nsecond model is a distributed version of AlexNet in which multiple AlexNets are\ntrained separately on subsets of the training data categorized according to the\nSaffir-Simpson wind speed scale prescribed by the meteorologists. We evaluated\nthe performance of both models against a deep learning benchmark model called\n\\textit{Deepti} using a publicly available cyclone image dataset. Results\nindicate that both the global model (with a root mean square error (RMSE) of\n9.03 knots) and the distributed model (with an RMSE of 9.3 knots) outperform the\nbenchmark model (with an RMSE of 13.62 knots). We provide a thorough discussion\nof our solution approach, including an explanation of the AlexNet's\nperformance using gradient class activation maps (grad-CAM). Our proposed\nsolution strategy allows future experimentation with various deep learning\nmodels in both single and multi-channel settings.\n","authors":["Vikas Dwivedi"],"pdf_url":"https://arxiv.org/pdf/2404.07395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04526v2","updated":"2024-04-11T23:50:32Z","published":"2023-08-08T18:41:38Z","title":"Large-Scale Multi-Hypotheses Cell Tracking Using Ultrametric Contours\n Maps","summary":" In this work, we describe a method for large-scale 3D cell-tracking through a\nsegmentation selection approach. The proposed method is effective at tracking\ncells across large microscopy datasets on two fronts: (i) It can solve problems\ncontaining millions of segmentation instances in terabyte-scale 3D+t datasets;\n(ii) It achieves competitive results with or without deep learning, which\nrequires 3D annotated data that is scarce in the fluorescence microscopy\nfield. The proposed method computes cell tracks and segments using a hierarchy\nof segmentation hypotheses and selects disjoint segments by maximizing the\noverlap between adjacent frames. We show that this method achieves\nstate-of-the-art results in 3D images from the cell tracking challenge and has\na faster integer linear programming formulation. Moreover, our framework is\nflexible and supports segmentations from off-the-shelf cell segmentation models\nand can combine them into an ensemble that improves tracking. The code is\navailable at https://github.com/royerlab/ultrack.\n","authors":["Jordão Bragantini","Merlin Lange","Loïc Royer"],"pdf_url":"https://arxiv.org/pdf/2308.04526v2.pdf","comment":"13 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2403.16400v2","updated":"2024-04-11T23:38:06Z","published":"2024-03-25T03:30:37Z","title":"ASDF: Assembly State Detection Utilizing Late Fusion by Integrating 6D\n Pose Estimation","summary":" In medical and industrial domains, providing guidance for assembly processes\nis critical to ensure efficiency and safety. Errors in assembly can lead to\nsignificant consequences such as extended surgery times, and prolonged\nmanufacturing or maintenance times in industry. Assembly scenarios can benefit\nfrom in-situ AR visualization to provide guidance, reduce assembly times and\nminimize errors. To enable in-situ visualization, 6D pose estimation can be\nleveraged. Existing 6D pose estimation techniques primarily focus on individual\nobjects and static captures. However, assembly scenarios have various dynamics\nincluding occlusion during assembly and dynamics in the assembly objects'\nappearance. 
Existing work, combining object detection/6D pose estimation and\nassembly state detection focuses either on pure deep learning-based approaches,\nor limit the assembly state detection to building blocks. To address the\nchallenges of 6D pose estimation in combination with assembly state detection,\nour approach ASDF builds upon the strengths of YOLOv8, a real-time capable\nobject detection framework. We extend this framework, refine the object pose\nand fuse pose knowledge with network-detected pose information. Utilizing our\nlate fusion in our Pose2State module results in refined 6D pose estimation and\nassembly state detection. By combining both pose and state information, our\nPose2State module predicts the final assembly state with precision. Our\nevaluation on our ASDF dataset shows that our Pose2State module leads to an\nimproved assembly state detection and that the improvement of the assembly\nstate further leads to a more robust 6D pose estimation. Moreover, on the GBOT\ndataset, we outperform the pure deep learning-based network, and even\noutperform the hybrid and pure tracking-based approaches.\n","authors":["Hannah Schieber","Shiyu Li","Niklas Corell","Philipp Beckerle","Julian Kreimeier","Daniel Roth"],"pdf_url":"https://arxiv.org/pdf/2403.16400v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01492v2","updated":"2024-04-11T23:09:25Z","published":"2024-04-01T21:28:50Z","title":"Modality Translation for Object Detection Adaptation Without Forgetting\n Prior Knowledge","summary":" A common practice in deep learning consists of training large neural networks\non massive datasets to perform accurately for different domains and tasks.\nWhile this methodology may work well in numerous application areas, it only\napplies across modalities due to a larger distribution shift in data captured\nusing different sensors. This paper focuses on the problem of adapting a large\nobject detection model to one or multiple modalities while being efficient. To\ndo so, we propose ModTr as an alternative to the common approach of fine-tuning\nlarge models. ModTr consists of adapting the input with a small transformation\nnetwork trained to minimize the detection loss directly. The original model can\ntherefore work on the translated inputs without any further change or\nfine-tuning to its parameters. Experimental results on translating from IR to\nRGB images on two well-known datasets show that this simple ModTr approach\nprovides detectors that can perform comparably or better than the standard\nfine-tuning without forgetting the original knowledge. This opens the doors to\na more flexible and efficient service-based detection pipeline in which,\ninstead of using a different detector for each modality, a unique and unaltered\nserver is constantly running, where multiple modalities with the corresponding\ntranslations can query it. Code: https://github.com/heitorrapela/ModTr.\n","authors":["Heitor Rapela Medeiros","Masih Aminbeidokhti","Fidel Guerrero Pena","David Latortue","Eric Granger","Marco Pedersoli"],"pdf_url":"https://arxiv.org/pdf/2404.01492v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12982v2","updated":"2024-04-11T22:47:39Z","published":"2023-10-19T17:59:56Z","title":"Putting the Object Back into Video Object Segmentation","summary":" We present Cutie, a video object segmentation (VOS) network with object-level\nmemory reading, which puts the object representation from memory back into the\nvideo object segmentation result. 
Recent works on VOS employ bottom-up\npixel-level memory reading which struggles due to matching noise, especially in\nthe presence of distractors, resulting in lower performance in more challenging\ndata. In contrast, Cutie performs top-down object-level memory reading by\nadapting a small set of object queries. Via those, it interacts with the\nbottom-up pixel features iteratively with a query-based object transformer (qt,\nhence Cutie). The object queries act as a high-level summary of the target\nobject, while high-resolution feature maps are retained for accurate\nsegmentation. Together with foreground-background masked attention, Cutie\ncleanly separates the semantics of the foreground object from the background.\nOn the challenging MOSE dataset, Cutie improves by 8.7 J&F over XMem with a\nsimilar running time and improves by 4.2 J&F over DeAOT while being three times\nfaster. Code is available at: https://hkchengrex.github.io/Cutie\n","authors":["Ho Kei Cheng","Seoung Wug Oh","Brian Price","Joon-Young Lee","Alexander Schwing"],"pdf_url":"https://arxiv.org/pdf/2310.12982v2.pdf","comment":"CVPR 2024 Highlight. Project page: https://hkchengrex.github.io/Cutie"},{"id":"http://arxiv.org/abs/2307.15904v2","updated":"2024-04-11T22:39:15Z","published":"2023-07-29T06:23:51Z","title":"Sat2Cap: Mapping Fine-Grained Textual Descriptions from Satellite Images","summary":" We propose a weakly supervised approach for creating maps using free-form\ntextual descriptions. We refer to this work of creating textual maps as\nzero-shot mapping. Prior works have approached mapping tasks by developing\nmodels that predict a fixed set of attributes using overhead imagery. However,\nthese models are very restrictive as they can only solve highly specific tasks\nfor which they were trained. Mapping text, on the other hand, allows us to\nsolve a large variety of mapping problems with minimal restrictions. To achieve\nthis, we train a contrastive learning framework called Sat2Cap on a new\nlarge-scale dataset with 6.1M pairs of overhead and ground-level images. For a\ngiven location and overhead image, our model predicts the expected CLIP\nembeddings of the ground-level scenery. The predicted CLIP embeddings are then\nused to learn about the textual space associated with that location. Sat2Cap is\nalso conditioned on date-time information, allowing it to model temporally\nvarying concepts over a location. Our experimental results demonstrate that our\nmodels successfully capture ground-level concepts and allow large-scale mapping\nof fine-grained textual queries. Our approach does not require any text-labeled\ndata, making the training easily scalable. The code, dataset, and models will\nbe made publicly available.\n","authors":["Aayush Dhakal","Adeel Ahmad","Subash Khanal","Srikumar Sastry","Hannah Kerner","Nathan Jacobs"],"pdf_url":"https://arxiv.org/pdf/2307.15904v2.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2404.08135v1","updated":"2024-04-11T21:41:55Z","published":"2024-04-11T21:41:55Z","title":"SciFlow: Empowering Lightweight Optical Flow Models with Self-Cleaning\n Iterations","summary":" Optical flow estimation is crucial to a variety of vision tasks. Despite\nsubstantial recent advancements, achieving real-time on-device optical flow\nestimation remains a complex challenge. First, an optical flow model must be\nsufficiently lightweight to meet computation and memory constraints to ensure\nreal-time performance on devices. 
Second, the necessity for real-time on-device\noperation imposes constraints that weaken the model's capacity to adequately\nhandle ambiguities in flow estimation, thereby intensifying the difficulty of\npreserving flow accuracy. This paper introduces two synergistic techniques,\nSelf-Cleaning Iteration (SCI) and Regression Focal Loss (RFL), designed to\nenhance the capabilities of optical flow models, with a focus on addressing\noptical flow regression ambiguities. These techniques prove particularly\neffective in mitigating error propagation, a prevalent issue in optical flow\nmodels that employ iterative refinement. Notably, these techniques add\nnegligible to zero overhead in model parameters and inference latency, thereby\npreserving real-time on-device efficiency. The effectiveness of our proposed\nSCI and RFL techniques, collectively referred to as SciFlow for brevity, is\ndemonstrated across two distinct lightweight optical flow model architectures\nin our experiments. Remarkably, SciFlow enables substantial reduction in error\nmetrics (EPE and Fl-all) over the baseline models by up to 6.3% and 10.5% for\nin-domain scenarios and by up to 6.2% and 13.5% for cross-domain scenarios on\nthe Sintel and KITTI 2015 datasets, respectively.\n","authors":["Jamie Menjay Lin","Jisoo Jeong","Hong Cai","Risheek Garrepalli","Kai Wang","Fatih Porikli"],"pdf_url":"https://arxiv.org/pdf/2404.08135v1.pdf","comment":"CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.08127v1","updated":"2024-04-11T21:07:38Z","published":"2024-04-11T21:07:38Z","title":"Self-Supervised Learning of Color Constancy","summary":" Color constancy (CC) describes the ability of the visual system to perceive\nan object as having a relatively constant color despite changes in lighting\nconditions. While CC and its limitations have been carefully characterized in\nhumans, it is still unclear how the visual system acquires this ability during\ndevelopment. Here, we present a first study showing that CC develops in a\nneural network trained in a self-supervised manner through an invariance\nlearning objective. During learning, objects are presented under changing\nilluminations, while the network aims to map subsequent views of the same\nobject onto close-by latent representations. This gives rise to representations\nthat are largely invariant to the illumination conditions, offering a plausible\nexample of how CC could emerge during human cognitive development via a form of\nself-supervised learning.\n","authors":["Markus R. Ernst","Francisco M. López","Arthur Aubret","Roland W. Fleming","Jochen Triesch"],"pdf_url":"https://arxiv.org/pdf/2404.08127v1.pdf","comment":"7 pages, 5 figures, submitted to the IEEE International Conference on\n Development and Learning (ICDL 2024)"},{"id":"http://arxiv.org/abs/2404.08111v1","updated":"2024-04-11T20:25:26Z","published":"2024-04-11T20:25:26Z","title":"S3Editor: A Sparse Semantic-Disentangled Self-Training Framework for\n Face Video Editing","summary":" Face attribute editing plays a pivotal role in various applications. However,\nexisting methods encounter challenges in achieving high-quality results while\npreserving identity, editing faithfulness, and temporal consistency. These\nchallenges are rooted in issues related to the training pipeline, including\nlimited supervision, architecture design, and optimization strategy. In this\nwork, we introduce S3Editor, a Sparse Semantic-disentangled Self-training\nframework for face video editing. 
S3Editor is a generic solution that\ncomprehensively addresses these challenges with three key contributions.\nFirstly, S3Editor adopts a self-training paradigm to enhance the training\nprocess through semi-supervision. Secondly, we propose a semantic disentangled\narchitecture with a dynamic routing mechanism that accommodates diverse editing\nrequirements. Thirdly, we present a structured sparse optimization schema that\nidentifies and deactivates malicious neurons to further disentangle impacts\nfrom untarget attributes. S3Editor is model-agnostic and compatible with\nvarious editing approaches. Our extensive qualitative and quantitative results\naffirm that our approach significantly enhances identity preservation, editing\nfidelity, as well as temporal consistency.\n","authors":["Guangzhi Wang","Tianyi Chen","Kamran Ghasedi","HsiangTao Wu","Tianyu Ding","Chris Nuesmeyer","Ilya Zharkov","Mohan Kankanhalli","Luming Liang"],"pdf_url":"https://arxiv.org/pdf/2404.08111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01001v2","updated":"2024-04-11T20:07:20Z","published":"2023-12-02T02:09:31Z","title":"Learning county from pixels: Corn yield prediction with\n attention-weighted multiple instance learning","summary":" Remote sensing technology has become a promising tool in yield prediction.\nMost prior work employs satellite imagery for county-level corn yield\nprediction by spatially aggregating all pixels within a county into a single\nvalue, potentially overlooking the detailed information and valuable insights\noffered by more granular data. To this end, this research examines each county\nat the pixel level and applies multiple instance learning to leverage detailed\ninformation within a county. In addition, our method addresses the \"mixed\npixel\" issue caused by the inconsistent resolution between feature datasets and\ncrop mask, which may introduce noise into the model and therefore hinder\naccurate yield prediction. Specifically, the attention mechanism is employed to\nautomatically assign weights to different pixels, which can mitigate the\ninfluence of mixed pixels. The experimental results show that the developed\nmodel outperforms four other machine learning models over the past five years\nin the U.S. corn belt and demonstrates its best performance in 2022, achieving\na coefficient of determination (R2) value of 0.84 and a root mean square error\n(RMSE) of 0.83. This paper demonstrates the advantages of our approach from\nboth spatial and temporal perspectives. Furthermore, through an in-depth study\nof the relationship between mixed pixels and attention, it is verified that our\napproach can capture critical feature information while filtering out noise\nfrom mixed pixels.\n","authors":["Xiaoyu Wang","Yuchi Ma","Qunying Huang","Zhengwei Yang","Zhou Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.01001v2.pdf","comment":"I am writing to request the withdrawal of my paper submitted to\n arXiv. Upon further review, I have identified an error in the paper that\n significantly affects the results and conclusions. 
To maintain the integrity\n of the scientific record and prevent the dissemination of incorrect\n information, I believe it is necessary to withdraw the paper from the archive"},{"id":"http://arxiv.org/abs/2307.03798v2","updated":"2024-04-11T19:24:50Z","published":"2023-07-07T18:54:11Z","title":"Fooling Contrastive Language-Image Pre-trained Models with\n CLIPMasterPrints","summary":" Models leveraging both visual and textual data such as Contrastive\nLanguage-Image Pre-training (CLIP), are the backbone of many recent advances in\nartificial intelligence. In this work, we show that despite their versatility,\nsuch models are vulnerable to what we refer to as fooling master images.\nFooling master images are capable of maximizing the confidence score of a CLIP\nmodel for a significant number of widely varying prompts, while being either\nunrecognizable or unrelated to the attacked prompts for humans. The existence\nof such images is problematic as it could be used by bad actors to maliciously\ninterfere with CLIP-trained image retrieval models in production with\ncomparably small effort as a single image can attack many different prompts. We\ndemonstrate how fooling master images for CLIP (CLIPMasterPrints) can be mined\nusing stochastic gradient descent, projected gradient descent, or blackbox\noptimization. Contrary to many common adversarial attacks, the blackbox\noptimization approach allows us to mine CLIPMasterPrints even when the weights\nof the model are not accessible. We investigate the properties of the mined\nimages, and find that images trained on a small number of image captions\ngeneralize to a much larger number of semantically related captions. We\nevaluate possible mitigation strategies, where we increase the robustness of\nthe model and introduce an approach to automatically detect CLIPMasterPrints to\nsanitize the input of vulnerable models. Finally, we find that vulnerability to\nCLIPMasterPrints is related to a modality gap in contrastive pre-trained\nmulti-modal networks. Code available at\nhttps://github.com/matfrei/CLIPMasterPrints.\n","authors":["Matthias Freiberger","Peter Kun","Christian Igel","Anders Sundnes Løvlie","Sebastian Risi"],"pdf_url":"https://arxiv.org/pdf/2307.03798v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.13004v3","updated":"2024-04-11T19:22:41Z","published":"2022-10-24T07:50:02Z","title":"Efficient Representation of Natural Image Patches","summary":" Utilizing an abstract information processing model based on minimal yet\nrealistic assumptions inspired by biological systems, we study how to achieve\nthe early visual system's two ultimate objectives: efficient information\ntransmission and accurate sensor probability distribution modeling. We prove\nthat optimizing for information transmission does not guarantee optimal\nprobability distribution modeling in general. We illustrate, using a two-pixel\n(2D) system and image patches, that an efficient representation can be realized\nthrough a nonlinear population code driven by two types of biologically\nplausible loss functions that depend solely on output. After unsupervised\nlearning, our abstract information processing model bears remarkable\nresemblances to biological systems, despite not mimicking many features of real\nneurons, such as spiking activity. A preliminary comparison with a contemporary\ndeep learning model suggests that our model offers a significant efficiency\nadvantage. 
Our model provides novel insights into the computational theory of\nearly visual systems as well as a potential new approach to enhance the\nefficiency of deep learning models.\n","authors":["Cheng Guo"],"pdf_url":"https://arxiv.org/pdf/2210.13004v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08088v1","updated":"2024-04-11T19:06:36Z","published":"2024-04-11T19:06:36Z","title":"Visual Context-Aware Person Fall Detection","summary":" As the global population ages, the number of fall-related incidents is on the\nrise. Effective fall detection systems, specifically in healthcare sector, are\ncrucial to mitigate the risks associated with such events. This study evaluates\nthe role of visual context, including background objects, on the accuracy of\nfall detection classifiers. We present a segmentation pipeline to\nsemi-automatically separate individuals and objects in images. Well-established\nmodels like ResNet-18, EfficientNetV2-S, and Swin-Small are trained and\nevaluated. During training, pixel-based transformations are applied to\nsegmented objects, and the models are then evaluated on raw images without\nsegmentation. Our findings highlight the significant influence of visual\ncontext on fall detection. The application of Gaussian blur to the image\nbackground notably improves the performance and generalization capabilities of\nall models. Background objects such as beds, chairs, or wheelchairs can\nchallenge fall detection systems, leading to false positive alarms. However, we\ndemonstrate that object-specific contextual transformations during training\neffectively mitigate this challenge. Further analysis using saliency maps\nsupports our observation that visual context is crucial in classification\ntasks. We create both dataset processing API and segmentation pipeline,\navailable at https://github.com/A-NGJ/image-segmentation-cli.\n","authors":["Aleksander Nagaj","Zenjie Li","Dim P. Papadopoulos","Kamal Nasrollahi"],"pdf_url":"https://arxiv.org/pdf/2404.08088v1.pdf","comment":"10 pages, 6 figures, KES IDT-24 conference"},{"id":"http://arxiv.org/abs/2404.03507v2","updated":"2024-04-11T18:54:24Z","published":"2024-04-04T15:10:24Z","title":"DQ-DETR: DETR with Dynamic Query for Tiny Object Detection","summary":" Despite previous DETR-like methods having performed successfully in generic\nobject detection, tiny object detection is still a challenging task for them\nsince the positional information of object queries is not customized for\ndetecting tiny objects, whose scale is extraordinarily smaller than general\nobjects. Also, DETR-like methods using a fixed number of queries make them\nunsuitable for aerial datasets, which only contain tiny objects, and the\nnumbers of instances are imbalanced between different images. Thus, we present\na simple yet effective model, named DQ-DETR, which consists of three different\ncomponents: categorical counting module, counting-guided feature enhancement,\nand dynamic query selection to solve the above-mentioned problems. DQ-DETR uses\nthe prediction and density maps from the categorical counting module to\ndynamically adjust the number of object queries and improve the positional\ninformation of queries. 
Our model DQ-DETR outperforms previous CNN-based and\nDETR-like methods, achieving state-of-the-art mAP 30.2% on the AI-TOD-V2\ndataset, which mostly consists of tiny objects.\n","authors":["Yi-Xin Huang","Hou-I Liu","Hong-Han Shuai","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.03507v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17205v4","updated":"2024-04-11T18:48:04Z","published":"2023-12-28T18:40:31Z","title":"EFHQ: Multi-purpose ExtremePose-Face-HQ dataset","summary":" The existing facial datasets, while having plentiful images at near frontal\nviews, lack images with extreme head poses, leading to the downgraded\nperformance of deep learning models when dealing with profile or pitched faces.\nThis work aims to address this gap by introducing a novel dataset named Extreme\nPose Face High-Quality Dataset (EFHQ), which includes a maximum of 450k\nhigh-quality images of faces at extreme poses. To produce such a massive\ndataset, we utilize a novel and meticulous dataset processing pipeline to\ncurate two publicly available datasets, VFHQ and CelebV-HQ, which contain many\nhigh-resolution face videos captured in various settings. Our dataset can\ncomplement existing datasets on various facial-related tasks, such as facial\nsynthesis with 2D/3D-aware GAN, diffusion-based text-to-image face generation,\nand face reenactment. Specifically, training with EFHQ helps models generalize\nwell across diverse poses, significantly improving performance in scenarios\ninvolving extreme views, confirmed by extensive experiments. Additionally, we\nutilize EFHQ to define a challenging cross-view face verification benchmark, in\nwhich the performance of SOTA face recognition models drops 5-37% compared to\nfrontal-to-frontal scenarios, aiming to stimulate studies on face recognition\nunder severe pose conditions in the wild.\n","authors":["Trung Tuan Dao","Duc Hong Vu","Cuong Pham","Anh Tran"],"pdf_url":"https://arxiv.org/pdf/2312.17205v4.pdf","comment":"Project Page: https://bomcon123456.github.io/efhq/"},{"id":"http://arxiv.org/abs/2404.08081v1","updated":"2024-04-11T18:42:14Z","published":"2024-04-11T18:42:14Z","title":"Real-Time Detection and Analysis of Vehicles and Pedestrians using Deep\n Learning","summary":" Computer vision, particularly vehicle and pedestrian identification is\ncritical to the evolution of autonomous driving, artificial intelligence, and\nvideo surveillance. Current traffic monitoring systems confront major\ndifficulty in recognizing small objects and pedestrians effectively in\nreal-time, posing a serious risk to public safety and contributing to traffic\ninefficiency. Recognizing these difficulties, our project focuses on the\ncreation and validation of an advanced deep-learning framework capable of\nprocessing complex visual input for precise, real-time recognition of cars and\npeople in a variety of environmental situations. On a dataset representing\ncomplicated urban settings, we trained and evaluated different versions of the\nYOLOv8 and RT-DETR models. The YOLOv8 Large version proved to be the most\neffective, especially in pedestrian recognition, with great precision and\nrobustness. The results, which include Mean Average Precision and recall rates,\ndemonstrate the model's ability to dramatically improve traffic monitoring and\nsafety. 
This study makes an important addition to real-time, reliable detection\nin computer vision, establishing new benchmarks for traffic management systems.\n","authors":["Md Nahid Sadik","Tahmim Hossain","Faisal Sayeed"],"pdf_url":"https://arxiv.org/pdf/2404.08081v1.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.08079v1","updated":"2024-04-11T18:34:29Z","published":"2024-04-11T18:34:29Z","title":"DIMAT: Decentralized Iterative Merging-And-Training for Deep Learning\n Models","summary":" Recent advances in decentralized deep learning algorithms have demonstrated\ncutting-edge performance on various tasks with large pre-trained models.\nHowever, a pivotal prerequisite for achieving this level of competitiveness is\nthe significant communication and computation overheads when updating these\nmodels, which prohibits the applications of them to real-world scenarios. To\naddress this issue, drawing inspiration from advanced model merging techniques\nwithout requiring additional training, we introduce the Decentralized Iterative\nMerging-And-Training (DIMAT) paradigm--a novel decentralized deep learning\nframework. Within DIMAT, each agent is trained on their local data and\nperiodically merged with their neighboring agents using advanced model merging\ntechniques like activation matching until convergence is achieved. DIMAT\nprovably converges with the best available rate for nonconvex functions with\nvarious first-order methods, while yielding tighter error bounds compared to\nthe popular existing approaches. We conduct a comprehensive empirical analysis\nto validate DIMAT's superiority over baselines across diverse computer vision\ntasks sourced from multiple datasets. Empirical results validate our\ntheoretical claims by showing that DIMAT attains faster and higher initial gain\nin accuracy with independent and identically distributed (IID) and non-IID\ndata, incurring lower communication overhead. This DIMAT paradigm presents a\nnew opportunity for the future decentralized learning, enhancing its\nadaptability to real-world with sparse and light-weight communication and\ncomputation.\n","authors":["Nastaran Saadati","Minh Pham","Nasla Saleem","Joshua R. Waite","Aditya Balu","Zhanhong Jiang","Chinmay Hegde","Soumik Sarkar"],"pdf_url":"https://arxiv.org/pdf/2404.08079v1.pdf","comment":"CVPR 2024 accepted paper, 22 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.02059v2","updated":"2024-04-11T18:29:01Z","published":"2024-04-02T15:58:36Z","title":"IISAN: Efficiently Adapting Multimodal Representation for Sequential\n Recommendation with Decoupled PEFT","summary":" Multimodal foundation models are transformative in sequential recommender\nsystems, leveraging powerful representation learning capabilities. While\nParameter-efficient Fine-tuning (PEFT) is commonly used to adapt foundation\nmodels for recommendation tasks, most research prioritizes parameter\nefficiency, often overlooking critical factors like GPU memory efficiency and\ntraining speed. Addressing this gap, our paper introduces IISAN (Intra- and\nInter-modal Side Adapted Network for Multimodal Representation), a simple\nplug-and-play architecture using a Decoupled PEFT structure and exploiting both\nintra- and inter-modal adaptation.\n IISAN matches the performance of full fine-tuning (FFT) and state-of-the-art\nPEFT. More importantly, it significantly reduces GPU memory usage - from 47GB\nto just 3GB for multimodal sequential recommendation tasks. 
Additionally, it\naccelerates training time per epoch from 443s to 22s compared to FFT. This is\nalso a notable improvement over the Adapter and LoRA, which require 37-39 GB\nGPU memory and 350-380 seconds per epoch for training.\n Furthermore, we propose a new composite efficiency metric, TPME\n(Training-time, Parameter, and GPU Memory Efficiency) to alleviate the\nprevalent misconception that \"parameter efficiency represents overall\nefficiency\". TPME provides more comprehensive insights into practical\nefficiency comparisons between different methods. Besides, we give an\naccessible efficiency analysis of all PEFT and FFT approaches, which\ndemonstrate the superiority of IISAN. We release our codes and other materials\nat https://github.com/GAIR-Lab/IISAN.\n","authors":["Junchen Fu","Xuri Ge","Xin Xin","Alexandros Karatzoglou","Ioannis Arapakis","Jie Wang","Joemon M. Jose"],"pdf_url":"https://arxiv.org/pdf/2404.02059v2.pdf","comment":"Accepted by SIGIR2024"},{"id":"http://arxiv.org/abs/2404.08031v1","updated":"2024-04-11T17:59:52Z","published":"2024-04-11T17:59:52Z","title":"Latent Guard: a Safety Framework for Text-to-image Generation","summary":" With the ability to generate high-quality images, text-to-image (T2I) models\ncan be exploited for creating inappropriate content. To prevent misuse,\nexisting safety measures are either based on text blacklists, which can be\neasily circumvented, or harmful content classification, requiring large\ndatasets for training and offering low flexibility. Hence, we propose Latent\nGuard, a framework designed to improve safety measures in text-to-image\ngeneration. Inspired by blacklist-based approaches, Latent Guard learns a\nlatent space on top of the T2I model's text encoder, where it is possible to\ncheck the presence of harmful concepts in the input text embeddings. Our\nproposed framework is composed of a data generation pipeline specific to the\ntask using large language models, ad-hoc architectural components, and a\ncontrastive learning strategy to benefit from the generated data. The\neffectiveness of our method is verified on three datasets and against four\nbaselines. Code and data will be shared at\nhttps://github.com/rt219/LatentGuard.\n","authors":["Runtao Liu","Ashkan Khakzar","Jindong Gu","Qifeng Chen","Philip Torr","Fabio Pizzati"],"pdf_url":"https://arxiv.org/pdf/2404.08031v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2404.08030v1","updated":"2024-04-11T17:59:43Z","published":"2024-04-11T17:59:43Z","title":"Rethinking Artistic Copyright Infringements in the Era of Text-to-Image\n Generative Models","summary":" Recent text-to-image generative models such as Stable Diffusion are extremely\nadept at mimicking and generating copyrighted content, raising concerns amongst\nartists that their unique styles may be improperly copied. Understanding how\ngenerative models copy \"artistic style\" is more complex than duplicating a\nsingle image, as style is comprised by a set of elements (or signature) that\nfrequently co-occurs across a body of work, where each individual work may vary\nsignificantly. In our paper, we first reformulate the problem of \"artistic\ncopyright infringement\" to a classification problem over image sets, instead of\nprobing image-wise similarities. 
We then introduce ArtSavant, a practical\n(i.e., efficient and easy to understand) tool to (i) determine the unique style\nof an artist by comparing it to a reference dataset of works from 372 artists\ncurated from WikiArt, and (ii) recognize if the identified style reappears in\ngenerated images. We leverage two complementary methods to perform artistic\nstyle classification over image sets, including TagMatch, which is a novel\ninherently interpretable and attributable method, making it more suitable for\nbroader use by non-technical stakeholders (artists, lawyers, judges, etc.).\nLeveraging ArtSavant, we then perform a large-scale empirical study to provide\nquantitative insight on the prevalence of artistic style copying across 3\npopular text-to-image generative models. Namely, amongst a dataset of prolific\nartists (including many famous ones), only 20% of them appear to have their\nstyles be at a risk of copying via simple prompting of today's popular\ntext-to-image generative models.\n","authors":["Mazda Moayeri","Samyadeep Basu","Sriram Balasubramanian","Priyatham Kattakinda","Atoosa Chengini","Robert Brauneis","Soheil Feizi"],"pdf_url":"https://arxiv.org/pdf/2404.08030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08027v1","updated":"2024-04-11T15:58:12Z","published":"2024-04-11T15:58:12Z","title":"SurvMamba: State Space Model with Multi-grained Multi-modal Interaction\n for Survival Prediction","summary":" Multi-modal learning that combines pathological images with genomic data has\nsignificantly enhanced the accuracy of survival prediction. Nevertheless,\nexisting methods have not fully utilized the inherent hierarchical structure\nwithin both whole slide images (WSIs) and transcriptomic data, from which\nbetter intra-modal representations and inter-modal integration could be\nderived. Moreover, many existing studies attempt to improve multi-modal\nrepresentations through attention mechanisms, which inevitably lead to high\ncomplexity when processing high-dimensional WSIs and transcriptomic data.\nRecently, a structured state space model named Mamba emerged as a promising\napproach for its superior performance in modeling long sequences with low\ncomplexity. In this study, we propose Mamba with multi-grained multi-modal\ninteraction (SurvMamba) for survival prediction. SurvMamba is implemented with\na Hierarchical Interaction Mamba (HIM) module that facilitates efficient\nintra-modal interactions at different granularities, thereby capturing more\ndetailed local features as well as rich global representations. In addition, an\nInteraction Fusion Mamba (IFM) module is used for cascaded inter-modal\ninteractive fusion, yielding more comprehensive features for survival\nprediction. Comprehensive evaluations on five TCGA datasets demonstrate that\nSurvMamba outperforms other existing methods in terms of performance and\ncomputational cost.\n","authors":["Ying Chen","Jiajing Xie","Yuxiang Lin","Yuhang Song","Wenxian Yang","Rongshan Yu"],"pdf_url":"https://arxiv.org/pdf/2404.08027v1.pdf","comment":null}]},"2024-04-12T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.08640v1","updated":"2024-04-12T17:59:47Z","published":"2024-04-12T17:59:47Z","title":"EventEgo3D: 3D Human Motion Capture from Egocentric Event Streams","summary":" Monocular egocentric 3D human motion capture is a challenging and actively\nresearched problem. Existing methods use synchronously operating visual sensors\n(e.g. 
RGB cameras) and often fail under low lighting and fast motions, which\ncan be restricting in many applications involving head-mounted devices. In\nresponse to the existing limitations, this paper 1) introduces a new problem,\ni.e., 3D human motion capture from an egocentric monocular event camera with a\nfisheye lens, and 2) proposes the first approach to it called EventEgo3D\n(EE3D). Event streams have high temporal resolution and provide reliable cues\nfor 3D human motion capture under high-speed human motions and rapidly changing\nillumination. The proposed EE3D framework is specifically tailored for learning\nwith event streams in the LNES representation, enabling high 3D reconstruction\naccuracy. We also design a prototype of a mobile head-mounted device with an\nevent camera and record a real dataset with event observations and the\nground-truth 3D human poses (in addition to the synthetic dataset). Our EE3D\ndemonstrates robustness and superior 3D accuracy compared to existing solutions\nacross various challenging experiments while supporting real-time 3D pose\nupdate rates of 140Hz.\n","authors":["Christen Millerdurai","Hiroyasu Akada","Jian Wang","Diogo Luvizon","Christian Theobalt","Vladislav Golyanik"],"pdf_url":"https://arxiv.org/pdf/2404.08640v1.pdf","comment":"14 pages, 11 figures and 6 tables; project page:\n https://4dqv.mpi-inf.mpg.de/EventEgo3D/; Computer Vision and Pattern\n Recognition (CVPR) 2024"},{"id":"http://arxiv.org/abs/2404.08639v1","updated":"2024-04-12T17:59:40Z","published":"2024-04-12T17:59:40Z","title":"COCONut: Modernizing COCO Segmentation","summary":" In recent decades, the vision community has witnessed remarkable progress in\nvisual recognition, partially owing to advancements in dataset benchmarks.\nNotably, the established COCO benchmark has propelled the development of modern\ndetection and segmentation systems. However, the COCO segmentation benchmark\nhas seen comparatively slow improvement over the last decade. Originally\nequipped with coarse polygon annotations for thing instances, it gradually\nincorporated coarse superpixel annotations for stuff regions, which were\nsubsequently heuristically amalgamated to yield panoptic segmentation\nannotations. These annotations, executed by different groups of raters, have\nresulted not only in coarse segmentation masks but also in inconsistencies\nbetween segmentation types. In this study, we undertake a comprehensive\nreevaluation of the COCO segmentation annotations. By enhancing the annotation\nquality and expanding the dataset to encompass 383K images with more than 5.18M\npanoptic masks, we introduce COCONut, the COCO Next Universal segmenTation\ndataset. COCONut harmonizes segmentation annotations across semantic, instance,\nand panoptic segmentation with meticulously crafted high-quality masks, and\nestablishes a robust benchmark for all segmentation tasks. To our knowledge,\nCOCONut stands as the inaugural large-scale universal segmentation dataset,\nverified by human raters. 
We anticipate that the release of COCONut will\nsignificantly contribute to the community's ability to assess the progress of\nnovel neural networks.\n","authors":["Xueqing Deng","Qihang Yu","Peng Wang","Xiaohui Shen","Liang-Chieh Chen"],"pdf_url":"https://arxiv.org/pdf/2404.08639v1.pdf","comment":"Accepted at CVPR2024, data available at\n https://xdeng7.github.io/coconut.github.io/"},{"id":"http://arxiv.org/abs/2404.08636v1","updated":"2024-04-12T17:58:04Z","published":"2024-04-12T17:58:04Z","title":"Probing the 3D Awareness of Visual Foundation Models","summary":" Recent advances in large-scale pretraining have yielded visual foundation\nmodels with strong capabilities. Not only can recent models generalize to\narbitrary images for their training task, their intermediate representations\nare useful for other visual tasks such as detection and segmentation. Given\nthat such models can classify, delineate, and localize objects in 2D, we ask\nwhether they also represent their 3D structure? In this work, we analyze the 3D\nawareness of visual foundation models. We posit that 3D awareness implies that\nrepresentations (1) encode the 3D structure of the scene and (2) consistently\nrepresent the surface across views. We conduct a series of experiments using\ntask-specific probes and zero-shot inference procedures on frozen features. Our\nexperiments reveal several limitations of the current models. Our code and\nanalysis can be found at https://github.com/mbanani/probe3d.\n","authors":["Mohamed El Banani","Amit Raj","Kevis-Kokitsi Maninis","Abhishek Kar","Yuanzhen Li","Michael Rubinstein","Deqing Sun","Leonidas Guibas","Justin Johnson","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2404.08636v1.pdf","comment":"Accepted to CVPR 2024. Project page:\n https://github.com/mbanani/probe3d"},{"id":"http://arxiv.org/abs/2403.15388v4","updated":"2024-04-12T17:34:29Z","published":"2024-03-22T17:59:52Z","title":"LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal\n Models","summary":" Large Multimodal Models (LMMs) have shown significant reasoning capabilities\nby connecting a visual encoder and a large language model. LMMs typically use a\nfixed amount of visual tokens, such as the penultimate layer features in the\nCLIP visual encoder, as the prefix content. Recent LMMs incorporate more\ncomplex visual inputs, such as high-resolution images and videos, which\nincrease the number of visual tokens significantly. However, due to the design\nof the Transformer architecture, computational costs associated with these\nmodels tend to increase quadratically with the number of input tokens. To\ntackle this problem, we explore a token reduction mechanism and find, similar\nto prior work, that many visual tokens are spatially redundant. Based on this,\nwe propose PruMerge, a novel adaptive visual token reduction approach, which\nlargely reduces the number of visual tokens while maintaining comparable model\nperformance. We first select the unpruned visual tokens based on their\nsimilarity to class tokens and spatial tokens. We then cluster the pruned\ntokens based on key similarity and merge the clustered tokens with the unpruned\ntokens to supplement their information. Empirically, when applied to LLaVA-1.5,\nour approach can compress the visual tokens by 18 times on average, and achieve\ncomparable performance across diverse visual question-answering and reasoning\ntasks. 
Code and checkpoints are at https://llava-prumerge.github.io/.\n","authors":["Yuzhang Shang","Mu Cai","Bingxin Xu","Yong Jae Lee","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2403.15388v4.pdf","comment":"Project page: https://llava-prumerge.github.io/"},{"id":"http://arxiv.org/abs/2404.08611v1","updated":"2024-04-12T17:20:57Z","published":"2024-04-12T17:20:57Z","title":"Automatic Quantification of Serial PET/CT Images for Pediatric Hodgkin\n Lymphoma Patients Using a Longitudinally-Aware Segmentation Network","summary":" $\\textbf{Purpose}$: Automatic quantification of longitudinal changes in PET\nscans for lymphoma patients has proven challenging, as residual disease in\ninterim-therapy scans is often subtle and difficult to detect. Our goal was to\ndevelop a longitudinally-aware segmentation network (LAS-Net) that can quantify\nserial PET/CT images for pediatric Hodgkin lymphoma patients.\n$\\textbf{Materials and Methods}$: This retrospective study included baseline\n(PET1) and interim (PET2) PET/CT images from 297 patients enrolled in two\nChildren's Oncology Group clinical trials (AHOD1331 and AHOD0831). LAS-Net\nincorporates longitudinal cross-attention, allowing relevant features from PET1\nto inform the analysis of PET2. Model performance was evaluated using Dice\ncoefficients for PET1 and detection F1 scores for PET2. Additionally, we\nextracted and compared quantitative PET metrics, including metabolic tumor\nvolume (MTV) and total lesion glycolysis (TLG) in PET1, as well as qPET and\n$\\Delta$SUVmax in PET2, against physician measurements. We quantified their\nagreement using Spearman's $\\rho$ correlations and employed bootstrap\nresampling for statistical analysis. $\\textbf{Results}$: LAS-Net detected\nresidual lymphoma in PET2 with an F1 score of 0.606 (precision/recall:\n0.615/0.600), outperforming all comparator methods (P<0.01). For baseline\nsegmentation, LAS-Net achieved a mean Dice score of 0.772. In PET\nquantification, LAS-Net's measurements of qPET, $\\Delta$SUVmax, MTV and TLG\nwere strongly correlated with physician measurements, with Spearman's $\\rho$ of\n0.78, 0.80, 0.93 and 0.96, respectively. The performance remained high, with a\nslight decrease, in an external testing cohort. $\\textbf{Conclusion}$: LAS-Net\nachieved high performance in quantifying PET metrics across serial scans,\nhighlighting the value of longitudinal awareness in evaluating multi-time-point\nimaging datasets.\n","authors":["Xin Tie","Muheon Shin","Changhee Lee","Scott B. Perlman","Zachary Huemann","Amy J. Weisman","Sharon M. Castellino","Kara M. Kelly","Kathleen M. McCarten","Adina L. Alazraki","Junjie Hu","Steve Y. Cho","Tyler J. Bradshaw"],"pdf_url":"https://arxiv.org/pdf/2404.08611v1.pdf","comment":"6 figures, 4 tables in the main text"},{"id":"http://arxiv.org/abs/2310.16073v3","updated":"2024-04-12T17:04:15Z","published":"2023-10-24T14:59:51Z","title":"FloCoDe: Unbiased Dynamic Scene Graph Generation with Temporal\n Consistency and Correlation Debiasing","summary":" Dynamic scene graph generation (SGG) from videos requires not only a\ncomprehensive understanding of objects across scenes but also a method to\ncapture the temporal motions and interactions with different objects. Moreover,\nthe long-tailed distribution of visual relationships is a crucial bottleneck\nfor most dynamic SGG methods. This is because many of them focus on capturing\nspatio-temporal context using complex architectures, leading to the generation\nof biased scene graphs. 
To address these challenges, we propose FloCoDe:\nFlow-aware Temporal Consistency and Correlation Debiasing with uncertainty\nattenuation for unbiased dynamic scene graphs. FloCoDe employs feature warping\nusing flow to detect temporally consistent objects across frames. To address\nthe long-tail issue of visual relationships, we propose correlation debiasing\nand a label correlation-based loss to learn unbiased relation representations\nfor long-tailed classes. Specifically, we propose to incorporate label\ncorrelations using contrastive loss to capture commonly co-occurring relations,\nwhich aids in learning robust representations for long-tailed classes. Further,\nwe adopt the uncertainty attenuation-based classifier framework to handle noisy\nannotations in the SGG data. Extensive experimental evaluation shows a\nperformance gain as high as 4.1%, demonstrating the superiority of generating\nmore unbiased scene graphs.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2310.16073v3.pdf","comment":"Accepted at CVPR 2024 SG2RL, 11 pages, 5 tables, 4 figures"},{"id":"http://arxiv.org/abs/2404.08603v1","updated":"2024-04-12T17:02:56Z","published":"2024-04-12T17:02:56Z","title":"Training-free Boost for Open-Vocabulary Object Detection with Confidence\n Aggregation","summary":" Open-vocabulary object detection (OVOD) aims at localizing and recognizing\nvisual objects from novel classes unseen at the training time. Whereas,\nempirical studies reveal that advanced detectors generally assign lower scores\nto those novel instances, which are inadvertently suppressed during inference\nby commonly adopted greedy strategies like Non-Maximum Suppression (NMS),\nleading to sub-optimal detection performance for novel classes. This paper\nsystematically investigates this problem with the commonly-adopted two-stage\nOVOD paradigm. Specifically, in the region-proposal stage, proposals that\ncontain novel instances showcase lower objectness scores, since they are\ntreated as background proposals during the training phase. Meanwhile, in the\nobject-classification stage, novel objects share lower region-text similarities\n(i.e., classification scores) due to the biased visual-language alignment by\nseen training samples. To alleviate this problem, this paper introduces two\nadvanced measures to adjust confidence scores and conserve erroneously\ndismissed objects: (1) a class-agnostic localization quality estimate via\noverlap degree of region/object proposals, and (2) a text-guided visual\nsimilarity estimate with proxy prototypes for novel classes. Integrated with\nadjusting techniques specifically designed for the region-proposal and\nobject-classification stages, this paper derives the aggregated confidence\nestimate for the open-vocabulary object detection paradigm (AggDet). Our AggDet\nis a generic and training-free post-processing scheme, which consistently\nbolsters open-vocabulary detectors across model scales and architecture\ndesigns. 
For instance, AggDet receives 3.3% and 1.5% gains on OV-COCO and\nOV-LVIS benchmarks respectively, without any training cost.\n","authors":["Yanhao Zheng","Kai Liu"],"pdf_url":"https://arxiv.org/pdf/2404.08603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07520v2","updated":"2024-04-12T17:01:04Z","published":"2024-04-11T07:26:00Z","title":"PromptSync: Bridging Domain Gaps in Vision-Language Models through\n Class-Aware Prototype Alignment and Discrimination","summary":" The potential for zero-shot generalization in vision-language (V-L) models\nsuch as CLIP has spurred their widespread adoption in addressing numerous\ndownstream tasks. Previous methods have employed test-time prompt tuning to\nadapt the model to unseen domains, but they overlooked the issue of imbalanced\nclass distributions. In this study, we explicitly address this problem by\nemploying class-aware prototype alignment weighted by mean class probabilities\nobtained for the test sample and filtered augmented views. Additionally, we\nensure that the class probabilities are as accurate as possible by performing\nprototype discrimination using contrastive learning. The combination of\nalignment and discriminative loss serves as a geometric regularizer, preventing\nthe prompt representation from collapsing onto a single class and effectively\nbridging the distribution gap between the source and test domains. Our method,\nnamed PromptSync, synchronizes the prompts for each test sample on both the\ntext and vision branches of the V-L model. In empirical evaluations on the\ndomain generalization benchmark, our method outperforms previous best methods\nby 2.33% in overall performance, by 1% in base-to-novel generalization, and by\n2.84% in cross-dataset transfer tasks.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2404.07520v2.pdf","comment":"Accepted at CVPR 2024 LIMIT, 12 pages, 8 Tables, 2 Figures"},{"id":"http://arxiv.org/abs/2312.03884v2","updated":"2024-04-12T16:47:05Z","published":"2023-12-06T20:22:32Z","title":"WonderJourney: Going from Anywhere to Everywhere","summary":" We introduce WonderJourney, a modularized framework for perpetual 3D scene\ngeneration. Unlike prior work on view generation that focuses on a single type\nof scenes, we start at any user-provided location (by a text description or an\nimage) and generate a journey through a long sequence of diverse yet coherently\nconnected 3D scenes. We leverage an LLM to generate textual descriptions of the\nscenes in this journey, a text-driven point cloud generation pipeline to make a\ncompelling and coherent sequence of 3D scenes, and a large VLM to verify the\ngenerated scenes. We show compelling, diverse visual results across various\nscene types and styles, forming imaginary \"wonderjourneys\". Project website:\nhttps://kovenyu.com/WonderJourney/\n","authors":["Hong-Xing Yu","Haoyi Duan","Junhwa Hur","Kyle Sargent","Michael Rubinstein","William T. Freeman","Forrester Cole","Deqing Sun","Noah Snavely","Jiajun Wu","Charles Herrmann"],"pdf_url":"https://arxiv.org/pdf/2312.03884v2.pdf","comment":"Project website with video results:\n https://kovenyu.com/WonderJourney/"},{"id":"http://arxiv.org/abs/2404.08590v1","updated":"2024-04-12T16:38:48Z","published":"2024-04-12T16:38:48Z","title":"Improving Referring Image Segmentation using Vision-Aware Text Features","summary":" Referring image segmentation is a challenging task that involves generating\npixel-wise segmentation masks based on natural language descriptions. 
Existing\nmethods have relied mostly on visual features to generate the segmentation\nmasks while treating text features as supporting components. This over-reliance\non visual features can lead to suboptimal results, especially in complex\nscenarios where text prompts are ambiguous or context-dependent. To overcome\nthese challenges, we present a novel framework VATEX to improve referring image\nsegmentation by enhancing object and context understanding with Vision-Aware\nText Feature. Our method involves using CLIP to derive a CLIP Prior that\nintegrates an object-centric visual heatmap with text description, which can be\nused as the initial query in DETR-based architecture for the segmentation task.\nFurthermore, by observing that there are multiple ways to describe an instance\nin an image, we enforce feature similarity between text variations referring to\nthe same visual input by two components: a novel Contextual Multimodal Decoder\nthat turns text embeddings into vision-aware text features, and a Meaning\nConsistency Constraint to ensure further the coherent and consistent\ninterpretation of language expressions with the context understanding obtained\nfrom the image. Our method achieves a significant performance improvement on\nthree benchmark datasets RefCOCO, RefCOCO+ and G-Ref. Code is available at:\nhttps://nero1342.github.io/VATEX\\_RIS.\n","authors":["Hai Nguyen-Truong","E-Ro Nguyen","Tuan-Anh Vu","Minh-Triet Tran","Binh-Son Hua","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2404.08590v1.pdf","comment":"30 pages including supplementary"},{"id":"http://arxiv.org/abs/2401.01448v2","updated":"2024-04-12T16:37:46Z","published":"2024-01-02T22:15:20Z","title":"ProbMCL: Simple Probabilistic Contrastive Learning for Multi-label\n Visual Classification","summary":" Multi-label image classification presents a challenging task in many domains,\nincluding computer vision and medical imaging. Recent advancements have\nintroduced graph-based and transformer-based methods to improve performance and\ncapture label dependencies. However, these methods often include complex\nmodules that entail heavy computation and lack interpretability. In this paper,\nwe propose Probabilistic Multi-label Contrastive Learning (ProbMCL), a novel\nframework to address these challenges in multi-label image classification\ntasks. Our simple yet effective approach employs supervised contrastive\nlearning, in which samples that share enough labels with an anchor image based\non a decision threshold are introduced as a positive set. This structure\ncaptures label dependencies by pulling positive pair embeddings together and\npushing away negative samples that fall below the threshold. We enhance\nrepresentation learning by incorporating a mixture density network into\ncontrastive learning and generating Gaussian mixture distributions to explore\nthe epistemic uncertainty of the feature encoder. We validate the effectiveness\nof our framework through experimentation with datasets from the computer vision\nand medical imaging domains. Our method outperforms the existing\nstate-of-the-art methods while achieving a low computational footprint on both\ndatasets. Visualization analyses also demonstrate that ProbMCL-learned\nclassifiers maintain a meaningful semantic topology.\n","authors":["Ahmad Sajedi","Samir Khaki","Yuri A. Lawryshyn","Konstantinos N. 
Plataniotis"],"pdf_url":"https://arxiv.org/pdf/2401.01448v2.pdf","comment":"This paper has been accepted for the ICASSP 2024 - 2024 IEEE\n International Conference on Acoustics, Speech and Signal Processing (ICASSP)"},{"id":"http://arxiv.org/abs/2404.08589v1","updated":"2024-04-12T16:35:23Z","published":"2024-04-12T16:35:23Z","title":"Enhancing Visual Question Answering through Question-Driven Image\n Captions as Prompts","summary":" Visual question answering (VQA) is known as an AI-complete task as it\nrequires understanding, reasoning, and inferring about the vision and the\nlanguage content. Over the past few years, numerous neural architectures have\nbeen suggested for the VQA problem. However, achieving success in zero-shot VQA\nremains a challenge due to its requirement for advanced generalization and\nreasoning skills. This study explores the impact of incorporating image\ncaptioning as an intermediary process within the VQA pipeline. Specifically, we\nexplore the efficacy of utilizing image captions instead of images and\nleveraging large language models (LLMs) to establish a zero-shot setting. Since\nimage captioning is the most crucial step in this process, we compare the\nimpact of state-of-the-art image captioning models on VQA performance across\nvarious question types in terms of structure and semantics. We propose a\nstraightforward and efficient question-driven image captioning approach within\nthis pipeline to transfer contextual information into the question-answering\n(QA) model. This method involves extracting keywords from the question,\ngenerating a caption for each image-question pair using the keywords, and\nincorporating the question-driven caption into the LLM prompt. We evaluate the\nefficacy of using general-purpose and question-driven image captions in the VQA\npipeline. Our study highlights the potential of employing image captions and\nharnessing the capabilities of LLMs to achieve competitive performance on GQA\nunder the zero-shot setting. Our code is available at\n\\url{https://github.com/ovguyo/captions-in-VQA}.\n","authors":["Övgü Özdemir","Erdem Akagündüz"],"pdf_url":"https://arxiv.org/pdf/2404.08589v1.pdf","comment":"The paper has been accepted for presentation at CVPR 2024 Workshop on\n Prompting in Vision"},{"id":"http://arxiv.org/abs/2404.08585v1","updated":"2024-04-12T16:30:15Z","published":"2024-04-12T16:30:15Z","title":"Advanced wood species identification based on multiple anatomical\n sections and using deep feature transfer and fusion","summary":" In recent years, we have seen many advancements in wood species\nidentification. Methods like DNA analysis, Near Infrared (NIR) spectroscopy,\nand Direct Analysis in Real Time (DART) mass spectrometry complement the\nlong-established wood anatomical assessment of cell and tissue morphology.\nHowever, most of these methods have some limitations such as high costs, the\nneed for skilled experts for data interpretation, and the lack of good datasets\nfor professional reference. Therefore, most of these methods, and certainly the\nwood anatomical assessment, may benefit from tools based on Artificial\nIntelligence. In this paper, we apply two transfer learning techniques with\nConvolutional Neural Networks (CNNs) to a multi-view Congolese wood species\ndataset including sections from different orientations and viewed at different\nmicroscopic magnifications. 
We explore two feature extraction methods in\ndetail, namely Global Average Pooling (GAP) and Random Encoding of Aggregated\nDeep Activation Maps (RADAM), for efficient and accurate wood species\nidentification. Our results indicate superior accuracy on diverse datasets and\nanatomical sections, surpassing the results of other methods. Our proposal\nrepresents a significant advancement in wood species identification, offering a\nrobust tool to support the conservation of forest ecosystems and promote\nsustainable forestry practices.\n","authors":["Kallil M. Zielinski","Leonardo Scabini","Lucas C. Ribas","Núbia R. da Silva","Hans Beeckman","Jan Verwaeren","Odemir M. Bruno","Bernard De Baets"],"pdf_url":"https://arxiv.org/pdf/2404.08585v1.pdf","comment":"33 pages, 7 tables, 9 figures"},{"id":"http://arxiv.org/abs/2404.08584v1","updated":"2024-04-12T16:29:49Z","published":"2024-04-12T16:29:49Z","title":"Pathological Primitive Segmentation Based on Visual Foundation Model\n with Zero-Shot Mask Generation","summary":" Medical image processing usually requires a model trained with carefully\ncrafted datasets due to unique image characteristics and domain-specific\nchallenges, especially in pathology. Primitive detection and segmentation in\ndigitized tissue samples are essential for objective and automated diagnosis\nand prognosis of cancer. SAM (Segment Anything Model) has recently been\ndeveloped to segment general objects from natural images with high accuracy,\nbut it requires human prompts to generate masks. In this work, we present a\nnovel approach that adapts pre-trained natural image encoders of SAM for\ndetection-based region proposals. Regions proposed by a pre-trained encoder are\nsent to cascaded feature propagation layers for projection. Then, local\nsemantic and global context is aggregated from multi-scale for bounding box\nlocalization and classification. Finally, the SAM decoder uses the identified\nbounding boxes as essential prompts to generate a comprehensive primitive\nsegmentation map. The entire base framework, SAM, requires no additional\ntraining or fine-tuning but could produce an end-to-end result for two\nfundamental segmentation tasks in pathology. Our method compares with\nstate-of-the-art models in F1 score for nuclei detection and binary/multiclass\npanoptic(bPQ/mPQ) and mask quality(dice) for segmentation quality on the\nPanNuke dataset while offering end-to-end efficiency. Our model also achieves\nremarkable Average Precision (+4.5%) on the secondary dataset (HuBMAP Kidney)\ncompared to Faster RCNN. The code is publicly available at\nhttps://github.com/learner-codec/autoprom_sam.\n","authors":["Abu Bakor Hayat Arnob","Xiangxue Wang","Yiping Jiao","Xiao Gan","Wenlong Ming","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2404.08584v1.pdf","comment":"2024 IEEE International Symposium on Biomedical Imaging"},{"id":"http://arxiv.org/abs/2404.08582v1","updated":"2024-04-12T16:28:30Z","published":"2024-04-12T16:28:30Z","title":"FashionFail: Addressing Failure Cases in Fashion Object Detection and\n Segmentation","summary":" In the realm of fashion object detection and segmentation for online shopping\nimages, existing state-of-the-art fashion parsing models encounter limitations,\nparticularly when exposed to non-model-worn apparel and close-up shots. To\naddress these failures, we introduce FashionFail; a new fashion dataset with\ne-commerce images for object detection and segmentation. 
The dataset is\nefficiently curated using our novel annotation tool that leverages recent\nfoundation models. The primary objective of FashionFail is to serve as a test\nbed for evaluating the robustness of models. Our analysis reveals the\nshortcomings of leading models, such as Attribute-Mask R-CNN and Fashionformer.\nAdditionally, we propose a baseline approach using naive data augmentation to\nmitigate common failure cases and improve model robustness. Through this work,\nwe aim to inspire and support further research in fashion item detection and\nsegmentation for industrial applications. The dataset, annotation tool, code,\nand models are available at \\url{https://rizavelioglu.github.io/fashionfail/}.\n","authors":["Riza Velioglu","Robin Chan","Barbara Hammer"],"pdf_url":"https://arxiv.org/pdf/2404.08582v1.pdf","comment":"to be published in 2024 International Joint Conference on Neural\n Networks (IJCNN)"},{"id":"http://arxiv.org/abs/2404.08580v1","updated":"2024-04-12T16:23:42Z","published":"2024-04-12T16:23:42Z","title":"Lossy Image Compression with Foundation Diffusion Models","summary":" Incorporating diffusion models in the image compression domain has the\npotential to produce realistic and detailed reconstructions, especially at\nextremely low bitrates. Previous methods focus on using diffusion models as\nexpressive decoders robust to quantization errors in the conditioning signals,\nyet achieving competitive results in this manner requires costly training of\nthe diffusion model and long inference times due to the iterative generative\nprocess. In this work we formulate the removal of quantization error as a\ndenoising task, using diffusion to recover lost information in the transmitted\nimage latent. Our approach allows us to perform less than 10\\% of the full\ndiffusion generative process and requires no architectural changes to the\ndiffusion model, enabling the use of foundation models as a strong prior\nwithout additional fine tuning of the backbone. Our proposed codec outperforms\nprevious methods in quantitative realism metrics, and we verify that our\nreconstructions are qualitatively preferred by end users, even when other\nmethods use twice the bitrate.\n","authors":["Lucas Relic","Roberto Azevedo","Markus Gross","Christopher Schroers"],"pdf_url":"https://arxiv.org/pdf/2404.08580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06994v2","updated":"2024-04-12T16:07:55Z","published":"2024-02-10T17:02:53Z","title":"A Change Detection Reality Check","summary":" In recent years, there has been an explosion of proposed change detection\ndeep learning architectures in the remote sensing literature. These approaches\nclaim to offer state-of-the-art performance on different standard benchmark\ndatasets. However, has the field truly made significant progress? 
In this paper\nwe perform experiments which conclude a simple U-Net segmentation baseline\nwithout training tricks or complicated architectural changes is still a top\nperformer for the task of change detection.\n","authors":["Isaac Corley","Caleb Robinson","Anthony Ortiz"],"pdf_url":"https://arxiv.org/pdf/2402.06994v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08561v1","updated":"2024-04-12T16:00:03Z","published":"2024-04-12T16:00:03Z","title":"IDD-X: A Multi-View Dataset for Ego-relative Important Object\n Localization and Explanation in Dense and Unstructured Traffic","summary":" Intelligent vehicle systems require a deep understanding of the interplay\nbetween road conditions, surrounding entities, and the ego vehicle's driving\nbehavior for safe and efficient navigation. This is particularly critical in\ndeveloping countries where traffic situations are often dense and unstructured\nwith heterogeneous road occupants. Existing datasets, predominantly geared\ntowards structured and sparse traffic scenarios, fall short of capturing the\ncomplexity of driving in such environments. To fill this gap, we present IDD-X,\na large-scale dual-view driving video dataset. With 697K bounding boxes, 9K\nimportant object tracks, and 1-12 objects per video, IDD-X offers comprehensive\nego-relative annotations for multiple important road objects covering 10\ncategories and 19 explanation label categories. The dataset also incorporates\nrearview information to provide a more complete representation of the driving\nenvironment. We also introduce custom-designed deep networks aimed at multiple\nimportant object localization and per-object explanation prediction. Overall,\nour dataset and introduced prediction models form the foundation for studying\nhow road conditions and surrounding entities affect driving behavior in complex\ntraffic situations.\n","authors":["Chirag Parikh","Rohit Saluja","C. V. Jawahar","Ravi Kiran Sarvadevabhatla"],"pdf_url":"https://arxiv.org/pdf/2404.08561v1.pdf","comment":"Accepted at ICRA 2024"},{"id":"http://arxiv.org/abs/2404.08557v1","updated":"2024-04-12T15:54:48Z","published":"2024-04-12T15:54:48Z","title":"Scalability in Building Component Data Annotation: Enhancing Facade\n Material Classification with Synthetic Data","summary":" Computer vision models trained on Google Street View images can create\nmaterial cadastres. However, current approaches need manually annotated\ndatasets that are difficult to obtain and often have class imbalance. To\naddress these challenges, this paper fine-tuned a Swin Transformer model on a\nsynthetic dataset generated with DALL-E and compared the performance to a\nsimilar manually annotated dataset. Although manual annotation remains the gold\nstandard, the synthetic dataset performance demonstrates a reasonable\nalternative. 
The findings will ease annotation needed to develop material\ncadastres, offering architects insights into opportunities for material reuse,\nthus contributing to the reduction of demolition waste.\n","authors":["Josie Harrison","Alexander Hollberg","Yinan Yu"],"pdf_url":"https://arxiv.org/pdf/2404.08557v1.pdf","comment":"10 pages, 6 figures, submitted to 2024 European Conference of\n Computing in Construction"},{"id":"http://arxiv.org/abs/2310.02557v3","updated":"2024-04-12T15:48:47Z","published":"2023-10-04T03:30:32Z","title":"Generalization in diffusion models arises from geometry-adaptive\n harmonic representations","summary":" Deep neural networks (DNNs) trained for image denoising are able to generate\nhigh-quality samples with score-based reverse diffusion algorithms. These\nimpressive capabilities seem to imply an escape from the curse of\ndimensionality, but recent reports of memorization of the training set raise\nthe question of whether these networks are learning the \"true\" continuous\ndensity of the data. Here, we show that two DNNs trained on non-overlapping\nsubsets of a dataset learn nearly the same score function, and thus the same\ndensity, when the number of training images is large enough. In this regime of\nstrong generalization, diffusion-generated images are distinct from the\ntraining set, and are of high visual quality, suggesting that the inductive\nbiases of the DNNs are well-aligned with the data density. We analyze the\nlearned denoising functions and show that the inductive biases give rise to a\nshrinkage operation in a basis adapted to the underlying image. Examination of\nthese bases reveals oscillating harmonic structures along contours and in\nhomogeneous regions. We demonstrate that trained denoisers are inductively\nbiased towards these geometry-adaptive harmonic bases since they arise not only\nwhen the network is trained on photographic images, but also when it is trained\non image classes supported on low-dimensional manifolds for which the harmonic\nbasis is suboptimal. Finally, we show that when trained on regular image\nclasses for which the optimal basis is known to be geometry-adaptive and\nharmonic, the denoising performance of the networks is near-optimal.\n","authors":["Zahra Kadkhodaie","Florentin Guth","Eero P. Simoncelli","Stéphane Mallat"],"pdf_url":"https://arxiv.org/pdf/2310.02557v3.pdf","comment":"Accepted for oral presentation at ICLR, Vienna, May 2024"},{"id":"http://arxiv.org/abs/2404.08549v1","updated":"2024-04-12T15:45:26Z","published":"2024-04-12T15:45:26Z","title":"Benchmarking the Cell Image Segmentation Models Robustness under the\n Microscope Optical Aberrations","summary":" Cell segmentation is essential in biomedical research for analyzing cellular\nmorphology and behavior. Deep learning methods, particularly convolutional\nneural networks (CNNs), have revolutionized cell segmentation by extracting\nintricate features from images. However, the robustness of these methods under\nmicroscope optical aberrations remains a critical challenge. This study\ncomprehensively evaluates the performance of cell instance segmentation models\nunder simulated aberration conditions using the DynamicNuclearNet (DNN) and\nLIVECell datasets. Aberrations, including Astigmatism, Coma, Spherical, and\nTrefoil, were simulated using Zernike polynomial equations. Various\nsegmentation models, such as Mask R-CNN with different network heads (FPN, C3)\nand backbones (ResNet, VGG19, SwinS), were trained and tested under aberrated\nconditions. 
Results indicate that FPN combined with SwinS demonstrates superior\nrobustness in handling simple cell images affected by minor aberrations.\nConversely, Cellpose2.0 proves effective for complex cell images under similar\nconditions. Our findings provide insights into selecting appropriate\nsegmentation models based on cell morphology and aberration severity, enhancing\nthe reliability of cell segmentation in biomedical applications. Further\nresearch is warranted to validate these methods with diverse aberration types\nand emerging segmentation models. Overall, this research aims to guide\nresearchers in effectively utilizing cell segmentation models in the presence\nof minor optical aberrations.\n","authors":["Boyuan Peng","Jiaju Chen","Qihui Ye","Minjiang Chen","Peiwu Qin","Chenggang Yan","Dongmei Yu","Zhenglin Chen"],"pdf_url":"https://arxiv.org/pdf/2404.08549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08544v1","updated":"2024-04-12T15:37:53Z","published":"2024-04-12T15:37:53Z","title":"Analyzing Decades-Long Environmental Changes in Namibia Using Archival\n Aerial Photography and Deep Learning","summary":" This study explores object detection in historical aerial photographs of\nNamibia to identify long-term environmental changes. Specifically, we aim to\nidentify key objects -- \\textit{Waterholes}, \\textit{Omuti homesteads}, and\n\\textit{Big trees} -- around Oshikango in Namibia using sub-meter gray-scale\naerial imagery from 1943 and 1972. In this work, we propose a workflow for\nanalyzing historical aerial imagery using a deep semantic segmentation model on\nsparse hand-labels. To this end, we employ a number of strategies including\nclass-weighting, pseudo-labeling and empirical p-value-based filtering to\nbalance skewed and sparse representations of objects in the ground truth data.\nResults demonstrate the benefits of these different training strategies\nresulting in an average $F_1=0.661$ and $F_1=0.755$ over the three objects of\ninterest for the 1943 and 1972 imagery, respectively. We also identified that\nthe average size of Waterhole and Big trees increased while the average size of\nOmutis decreased between 1943 and 1972 reflecting some of the local effects of\nthe massive post-Second World War economic, agricultural, demographic, and\nenvironmental changes. This work also highlights the untapped potential of\nhistorical aerial photographs in understanding long-term environmental changes\nbeyond Namibia (and Africa). With the lack of adequate satellite technology in\nthe past, archival aerial photography offers a great alternative to uncover\ndecades-long environmental changes.\n","authors":["Girmaw Abebe Tadesse","Caleb Robinson","Gilles Quentin Hacheme","Akram Zaytar","Rahul Dodhia","Tsering Wangyal Shawa","Juan M. Lavista Ferres","Emmanuel H. Kreike"],"pdf_url":"https://arxiv.org/pdf/2404.08544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08540v1","updated":"2024-04-12T15:35:20Z","published":"2024-04-12T15:35:20Z","title":"On the Robustness of Language Guidance for Low-Level Vision Tasks:\n Findings from Depth Estimation","summary":" Recent advances in monocular depth estimation have been made by incorporating\nnatural language as additional guidance. Although yielding impressive results,\nthe impact of the language prior, particularly in terms of generalization and\nrobustness, remains unexplored. 
In this paper, we address this gap by\nquantifying the impact of this prior and introduce methods to benchmark its\neffectiveness across various settings. We generate \"low-level\" sentences that\nconvey object-centric, three-dimensional spatial relationships, incorporate\nthem as additional language priors and evaluate their downstream impact on\ndepth estimation. Our key finding is that current language-guided depth\nestimators perform optimally only with scene-level descriptions and\ncounter-intuitively fare worse with low level descriptions. Despite leveraging\nadditional data, these methods are not robust to directed adversarial attacks\nand decline in performance with an increase in distribution shift. Finally, to\nprovide a foundation for future research, we identify points of failures and\noffer insights to better understand these shortcomings. With an increasing\nnumber of methods using language for depth estimation, our findings highlight\nthe opportunities and pitfalls that require careful consideration for effective\ndeployment in real-world settings\n","authors":["Agneet Chatterjee","Tejas Gokhale","Chitta Baral","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2404.08540v1.pdf","comment":"Accepted to CVPR 2024. Project webpage:\n https://agneetchatterjee.com/robustness_depth_lang/"},{"id":"http://arxiv.org/abs/2404.08535v1","updated":"2024-04-12T15:30:03Z","published":"2024-04-12T15:30:03Z","title":"Generalized Contrastive Learning for Multi-Modal Retrieval and Ranking","summary":" Contrastive learning has gained widespread adoption for retrieval tasks due\nto its minimal requirement for manual annotations. However, popular contrastive\nframeworks typically learn from binary relevance, making them ineffective at\nincorporating direct fine-grained rankings. In this paper, we curate a\nlarge-scale dataset featuring detailed relevance scores for each query-document\npair to facilitate future research and evaluation. Subsequently, we propose\nGeneralized Contrastive Learning for Multi-Modal Retrieval and Ranking (GCL),\nwhich is designed to learn from fine-grained rankings beyond binary relevance\nscores. Our results show that GCL achieves a 94.5% increase in NDCG@10 for\nin-domain and 26.3 to 48.8% increases for cold-start evaluations, all relative\nto the CLIP baseline and involving ground truth rankings.\n","authors":["Tianyu Zhu","Myong Chol Jung","Jesse Clark"],"pdf_url":"https://arxiv.org/pdf/2404.08535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08531v1","updated":"2024-04-12T15:18:25Z","published":"2024-04-12T15:18:25Z","title":"Text Prompt with Normality Guidance for Weakly Supervised Video Anomaly\n Detection","summary":" Weakly supervised video anomaly detection (WSVAD) is a challenging task.\nGenerating fine-grained pseudo-labels based on weak-label and then\nself-training a classifier is currently a promising solution. However, since\nthe existing methods use only RGB visual modality and the utilization of\ncategory text information is neglected, thus limiting the generation of more\naccurate pseudo-labels and affecting the performance of self-training. Inspired\nby the manual labeling process based on the event description, in this paper,\nwe propose a novel pseudo-label generation and self-training framework based on\nText Prompt with Normality Guidance (TPWNG) for WSVAD. 
Our idea is to transfer\nthe rich language-visual knowledge of the contrastive language-image\npre-training (CLIP) model for aligning the video event description text and\ncorresponding video frames to generate pseudo-labels. Specifically, we first\nfine-tune the CLIP for domain adaptation by designing two ranking losses and a\ndistributional inconsistency loss. Further, we propose a learnable text prompt\nmechanism with the assistance of a normality visual prompt to further improve the\nmatching accuracy of video event description text and video frames. Then, we\ndesign a pseudo-label generation module based on the normality guidance to\ninfer reliable frame-level pseudo-labels. Finally, we introduce a temporal\ncontext self-adaptive learning module to learn the temporal dependencies of\ndifferent video events more flexibly and accurately. Extensive experiments show\nthat our method achieves state-of-the-art performance on two benchmark\ndatasets, UCF-Crime and XD-Violence.\n","authors":["Zhiwei Yang","Jing Liu","Peng Wu"],"pdf_url":"https://arxiv.org/pdf/2404.08531v1.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2402.11568v2","updated":"2024-04-12T15:17:17Z","published":"2024-02-18T12:31:29Z","title":"A novel Fourier neural operator framework for classification of\n multi-sized images: Application to three dimensional digital porous media","summary":" Fourier neural operators (FNOs) are invariant with respect to the size of\ninput images, and thus images with any size can be fed into FNO-based\nframeworks without any modification of network architectures, in contrast to\ntraditional convolutional neural networks (CNNs). Leveraging the advantage of\nFNOs, we propose a novel deep-learning framework for classifying images with\nvarying sizes. Particularly, we simultaneously train the proposed network on\nmulti-sized images. As a practical application, we consider the problem of\npredicting the label (e.g., permeability) of three-dimensional digital porous\nmedia. To construct the framework, an intuitive approach is to connect FNO\nlayers to a classifier using adaptive max pooling. First, we show that this\napproach is only effective for porous media with fixed sizes, whereas it fails\nfor porous media of varying sizes. To overcome this limitation, we introduce\nour approach: instead of using adaptive max pooling, we use static max pooling\nwith the size of channel width of FNO layers. Since the channel width of the\nFNO layers is independent of input image size, the introduced framework can\nhandle multi-sized images during training. We show the effectiveness of the\nintroduced framework and compare its performance with the intuitive approach\nthrough the example of the classification of three-dimensional digital porous\nmedia of varying sizes.\n","authors":["Ali Kashefi","Tapan Mukerji"],"pdf_url":"https://arxiv.org/pdf/2402.11568v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08526v1","updated":"2024-04-12T15:15:39Z","published":"2024-04-12T15:15:39Z","title":"Masked Image Modeling as a Framework for Self-Supervised Learning across\n Eye Movements","summary":" To make sense of their surroundings, intelligent systems must transform\ncomplex sensory inputs to structured codes that are reduced to task-relevant\ninformation such as object category. Biological agents achieve this in a\nlargely autonomous manner, presumably via self-supervised learning. 
Whereas previous attempts to model the underlying mechanisms\nwere largely discriminative in nature, there is ample evidence that the brain\nemploys a generative model of the world. Here, we propose that eye movements,\nin combination with the focused nature of primate vision, constitute a\ngenerative, self-supervised task of predicting and revealing visual\ninformation. We construct a proof-of-principle model starting from the\nframework of masked image modeling (MIM), a common approach in deep\nrepresentation learning. To do so, we analyze how core components of MIM such\nas masking technique and data augmentation influence the formation of\ncategory-specific representations. This allows us not only to better understand\nthe principles behind MIM, but to then reassemble a MIM more in line with the\nfocused nature of biological perception. From a theoretical angle, we find that\nMIM disentangles neurons in latent space, a property that has been suggested to\nstructure visual representations in primates, without explicit regulation.\nTogether with previous findings of invariance learning, this highlights an\ninteresting connection of MIM to latent regularization approaches for\nself-supervised learning. The source code is available under\nhttps://github.com/RobinWeiler/FocusMIM\n","authors":["Robin Weiler","Matthias Brucklacher","Cyriel M. A. Pennartz","Sander M. Bohté"],"pdf_url":"https://arxiv.org/pdf/2404.08526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11868v3","updated":"2024-04-12T15:01:10Z","published":"2024-03-18T15:22:09Z","title":"View-Consistent 3D Editing with Gaussian Splatting","summary":" The advent of 3D Gaussian Splatting (3DGS) has revolutionized 3D editing,\noffering efficient, high-fidelity rendering and enabling precise local\nmanipulations. Currently, diffusion-based 2D editing models are harnessed to\nmodify multi-view rendered images, which then guide the editing of 3DGS models.\nHowever, this approach faces a critical issue of multi-view inconsistency,\nwhere the guidance images exhibit significant discrepancies across views,\nleading to mode collapse and visual artifacts of 3DGS. To this end, we\nintroduce View-consistent Editing (VcEdit), a novel framework that seamlessly\nincorporates 3DGS into image editing processes, ensuring multi-view consistency\nin edited guidance images and effectively mitigating mode collapse issues.\nVcEdit employs two innovative consistency modules: the Cross-attention\nConsistency Module and the Editing Consistency Module, both designed to reduce\ninconsistencies in edited images. By incorporating these consistency modules\ninto an iterative pattern, VcEdit proficiently resolves the issue of multi-view\ninconsistency, facilitating high-quality 3DGS editing across a diverse range of\nscenes.\n","authors":["Yuxuan Wang","Xuanyu Yi","Zike Wu","Na Zhao","Long Chen","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11868v3.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2404.06710v3","updated":"2024-04-12T14:58:21Z","published":"2024-04-10T03:31:32Z","title":"SpikeNVS: Enhancing Novel View Synthesis from Blurry Images via Spike\n Camera","summary":" One of the most critical factors in achieving sharp Novel View Synthesis\n(NVS) using neural field methods like Neural Radiance Fields (NeRF) and 3D\nGaussian Splatting (3DGS) is the quality of the training images. However,\nConventional RGB cameras are susceptible to motion blur. 
In contrast,\nneuromorphic cameras like event and spike cameras inherently capture more\ncomprehensive temporal information, which can provide a sharp representation of\nthe scene as additional training data. Recent methods have explored the\nintegration of event cameras to improve the quality of NVS. The event-RGB\napproaches have some limitations, such as high training costs and the inability\nto work effectively in the background. Instead, our study introduces a new\nmethod that uses the spike camera to overcome these limitations. By considering\ntexture reconstruction from spike streams as ground truth, we design the\nTexture from Spike (TfS) loss. Since the spike camera relies on temporal\nintegration instead of temporal differentiation used by event cameras, our\nproposed TfS loss maintains manageable training costs. It handles foreground\nobjects with backgrounds simultaneously. We also provide a real-world dataset\ncaptured with our spike-RGB camera system to facilitate future research\nendeavors. We conduct extensive experiments using synthetic and real-world\ndatasets to demonstrate that our design can enhance novel view synthesis across\nNeRF and 3DGS. The code and dataset will be made available for public access.\n","authors":["Gaole Dai","Zhenyu Wang","Qinwen Xu","Ming Lu","Wen Chen","Boxin Shi","Shanghang Zhang","Tiejun Huang"],"pdf_url":"https://arxiv.org/pdf/2404.06710v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08515v1","updated":"2024-04-12T14:54:34Z","published":"2024-04-12T14:54:34Z","title":"ChatGPT and general-purpose AI count fruits in pictures surprisingly\n well","summary":" Object counting is a popular task in deep learning applications in various\ndomains, including agriculture. A conventional deep learning approach requires\na large amount of training data, often a logistic problem in a real-world\napplication. To address this issue, we examined how well ChatGPT (GPT4V) and a\ngeneral-purpose AI (foundation model for object counting, T-Rex) can count the\nnumber of fruit bodies (coffee cherries) in 100 images. The foundation model\nwith few-shot learning outperformed the trained YOLOv8 model (R2 = 0.923 and\n0.900, respectively). ChatGPT also showed some interesting potential,\nespecially when few-shot learning with human feedback was applied (R2 = 0.360\nand 0.460, respectively). Moreover, we examined the time required for\nimplementation as a practical question. Obtaining the results with the\nfoundation model and ChatGPT were much shorter than the YOLOv8 model (0.83 hrs,\n1.75 hrs, and 161 hrs). We interpret these results as two surprises for deep\nlearning users in applied domains: a foundation model with few-shot\ndomain-specific learning can drastically save time and effort compared to the\nconventional approach, and ChatGPT can reveal a relatively good performance.\nBoth approaches do not need coding skills, which can foster AI education and\ndissemination.\n","authors":["Konlavach Mengsuwan","Juan Camilo Rivera Palacio","Masahiro Ryo"],"pdf_url":"https://arxiv.org/pdf/2404.08515v1.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.08514v1","updated":"2024-04-12T14:54:26Z","published":"2024-04-12T14:54:26Z","title":"NIR-Assisted Image Denoising: A Selective Fusion Approach and A\n Real-World Benchmark Dataset","summary":" Despite the significant progress in image denoising, it is still challenging\nto restore fine-scale details while removing noise, especially in extremely\nlow-light environments. 
Leveraging near-infrared (NIR) images to assist visible\nRGB image denoising shows the potential to address this issue, becoming a\npromising technology. Nonetheless, existing works still struggle with taking\nadvantage of NIR information effectively for real-world image denoising, due to\nthe content inconsistency between NIR-RGB images and the scarcity of real-world\npaired datasets. To alleviate the problem, we propose an efficient Selective\nFusion Module (SFM), which can be plug-and-played into the advanced denoising\nnetworks to merge the deep NIR-RGB features. Specifically, we sequentially\nperform the global and local modulation for NIR and RGB features, and then\nintegrate the two modulated features. Furthermore, we present a Real-world\nNIR-Assisted Image Denoising (Real-NAID) dataset, which covers diverse\nscenarios as well as various noise levels. Extensive experiments on both\nsynthetic and our real-world datasets demonstrate that the proposed method\nachieves better results than state-of-the-art ones. The dataset, codes, and\npre-trained models will be publicly available at\nhttps://github.com/ronjonxu/NAID.\n","authors":["Rongjian Xu","Zhilu Zhang","Renlong Wu","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.08514v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2401.03785v2","updated":"2024-04-12T14:44:04Z","published":"2024-01-08T10:06:52Z","title":"Identifying Important Group of Pixels using Interactions","summary":" To better understand the behavior of image classifiers, it is useful to\nvisualize the contribution of individual pixels to the model prediction. In\nthis study, we propose a method, MoXI ($\\textbf{Mo}$del e$\\textbf{X}$planation\nby $\\textbf{I}$nteractions), that efficiently and accurately identifies a group\nof pixels with high prediction confidence. The proposed method employs\ngame-theoretic concepts, Shapley values and interactions, taking into account\nthe effects of individual pixels and the cooperative influence of pixels on\nmodel confidence. Theoretical analysis and experiments demonstrate that our\nmethod better identifies the pixels that are highly contributing to the model\noutputs than widely-used visualization by Grad-CAM, Attention rollout, and\nShapley value. While prior studies have suffered from the exponential\ncomputational cost in the computation of Shapley value and interactions, we\nshow that this can be reduced to quadratic cost for our task. The code is\navailable at https://github.com/KosukeSumiyasu/MoXI.\n","authors":["Kosuke Sumiyasu","Kazuhiko Kawamoto","Hiroshi Kera"],"pdf_url":"https://arxiv.org/pdf/2401.03785v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2404.08506v1","updated":"2024-04-12T14:40:45Z","published":"2024-04-12T14:40:45Z","title":"LaSagnA: Language-based Segmentation Assistant for Complex Queries","summary":" Recent advancements have empowered Large Language Models for Vision (vLLMs)\nto generate detailed perceptual outcomes, including bounding boxes and masks.\nNonetheless, there are two constraints that restrict the further application of\nthese vLLMs: the incapability of handling multiple targets per query and the\nfailure to identify the absence of query objects in the image. In this study,\nwe acknowledge that the main cause of these problems is the insufficient\ncomplexity of training queries. Consequently, we define the general sequence\nformat for complex queries. 
Then we incorporate a semantic segmentation task in\nthe current pipeline to fulfill the requirements of training data. Furthermore,\nwe present three novel strategies to effectively handle the challenges arising\nfrom the direct integration of the proposed format. The effectiveness of our\nmodel in processing complex queries is validated by the comparable results with\nconventional methods on both close-set and open-set semantic segmentation\ndatasets. Additionally, we outperform a series of vLLMs in reasoning and\nreferring segmentation, showcasing our model's remarkable capabilities. We\nrelease the code at https://github.com/congvvc/LaSagnA.\n","authors":["Cong Wei","Haoxian Tan","Yujie Zhong","Yujiu Yang","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2404.08506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08504v1","updated":"2024-04-12T14:34:24Z","published":"2024-04-12T14:34:24Z","title":"3D Human Scan With A Moving Event Camera","summary":" Capturing the 3D human body is one of the important tasks in computer vision\nwith a wide range of applications such as virtual reality and sports analysis.\nHowever, conventional frame cameras are limited by their temporal resolution\nand dynamic range, which imposes constraints in real-world application setups.\nEvent cameras have the advantages of high temporal resolution and high dynamic\nrange (HDR), but the development of event-based methods is necessary to handle\ndata with different characteristics. This paper proposes a novel event-based\nmethod for 3D pose estimation and human mesh recovery. Prior work on\nevent-based human mesh recovery require frames (images) as well as event data.\nThe proposed method solely relies on events; it carves 3D voxels by moving the\nevent camera around a stationary body, reconstructs the human pose and mesh by\nattenuated rays, and fit statistical body models, preserving high-frequency\ndetails. The experimental results show that the proposed method outperforms\nconventional frame-based methods in the estimation accuracy of both pose and\nbody mesh. We also demonstrate results in challenging situations where a\nconventional camera has motion blur. This is the first to demonstrate\nevent-only human mesh recovery, and we hope that it is the first step toward\nachieving robust and accurate 3D human body scanning from vision sensors.\n","authors":["Kai Kohyama","Shintaro Shiba","Yoshimitsu Aoki"],"pdf_url":"https://arxiv.org/pdf/2404.08504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14991v2","updated":"2024-04-12T14:21:20Z","published":"2023-12-22T11:56:22Z","title":"FoodLMM: A Versatile Food Assistant using Large Multi-modal Model","summary":" Large Multi-modal Models (LMMs) have made impressive progress in many\nvision-language tasks. Nevertheless, the performance of general LMMs in\nspecific domains is still far from satisfactory. This paper proposes FoodLMM, a\nversatile food assistant based on LMMs with various capabilities, including\nfood recognition, ingredient recognition, recipe generation, nutrition\nestimation, food segmentation and multi-round conversation. To facilitate\nFoodLMM to deal with tasks beyond pure text output, we introduce a series of\nnovel task-specific tokens and heads, enabling the model to predict food\nnutritional values and multiple segmentation masks. We adopt a two-stage\ntraining strategy. In the first stage, we utilize multiple public food\nbenchmarks for multi-task learning by leveraging the instruct-following\nparadigm. 
In the second stage, we construct a multi-round conversation dataset\nand a reasoning segmentation dataset to fine-tune the model, enabling it to\nconduct professional dialogues and generate segmentation masks based on complex\nreasoning in the food domain. Our fine-tuned FoodLMM achieves state-of-the-art\nresults across several food benchmarks. We will make our code, models and\ndatasets publicly available.\n","authors":["Yuehao Yin","Huiyan Qi","Bin Zhu","Jingjing Chen","Yu-Gang Jiang","Chong-Wah Ngo"],"pdf_url":"https://arxiv.org/pdf/2312.14991v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08489v1","updated":"2024-04-12T14:12:03Z","published":"2024-04-12T14:12:03Z","title":"SpectralMamba: Efficient Mamba for Hyperspectral Image Classification","summary":" Recurrent neural networks and Transformers have recently dominated most\napplications in hyperspectral (HS) imaging, owing to their capability to\ncapture long-range dependencies from spectrum sequences. However, despite the\nsuccess of these sequential architectures, the non-ignorable inefficiency\ncaused by either difficulty in parallelization or computationally prohibitive\nattention still hinders their practicality, especially for large-scale\nobservation in remote sensing scenarios. To address this issue, we herein\npropose SpectralMamba -- a novel state space model incorporated efficient deep\nlearning framework for HS image classification. SpectralMamba features the\nsimplified but adequate modeling of HS data dynamics at two levels. First, in\nspatial-spectral space, a dynamical mask is learned by efficient convolutions\nto simultaneously encode spatial regularity and spectral peculiarity, thus\nattenuating the spectral variability and confusion in discriminative\nrepresentation learning. Second, the merged spectrum can then be efficiently\noperated in the hidden state space with all parameters learned input-dependent,\nyielding selectively focused responses without reliance on redundant attention\nor imparallelizable recurrence. To explore the room for further computational\ndownsizing, a piece-wise scanning mechanism is employed in-between,\ntransferring approximately continuous spectrum into sequences with squeezed\nlength while maintaining short- and long-term contextual profiles among\nhundreds of bands. Through extensive experiments on four benchmark HS datasets\nacquired by satellite-, aircraft-, and UAV-borne imagers, SpectralMamba\nsurprisingly creates promising win-wins from both performance and efficiency\nperspectives.\n","authors":["Jing Yao","Danfeng Hong","Chenyu Li","Jocelyn Chanussot"],"pdf_url":"https://arxiv.org/pdf/2404.08489v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00513v2","updated":"2024-04-12T13:58:33Z","published":"2024-03-31T01:20:16Z","title":"Transformer based Pluralistic Image Completion with Reduced Information\n Loss","summary":" Transformer based methods have achieved great success in image inpainting\nrecently. However, we find that these solutions regard each pixel as a token,\nthus suffering from an information loss issue from two aspects: 1) They\ndownsample the input image into much lower resolutions for efficiency\nconsideration. 2) They quantize $256^3$ RGB values to a small number (such as\n512) of quantized color values. The indices of quantized pixels are used as\ntokens for the inputs and prediction targets of the transformer. 
To mitigate\nthese issues, we propose a new transformer based framework called \"PUT\".\nSpecifically, to avoid input downsampling while maintaining computation\nefficiency, we design a patch-based auto-encoder P-VQVAE. The encoder converts\nthe masked image into non-overlapped patch tokens and the decoder recovers the\nmasked regions from the inpainted tokens while keeping the unmasked regions\nunchanged. To eliminate the information loss caused by input quantization, an\nUn-quantized Transformer is applied. It directly takes features from the\nP-VQVAE encoder as input without any quantization and only regards the\nquantized tokens as prediction targets. Furthermore, to make the inpainting\nprocess more controllable, we introduce semantic and structural conditions as\nextra guidance. Extensive experiments show that our method greatly outperforms\nexisting transformer based methods on image fidelity and achieves much higher\ndiversity and better fidelity than state-of-the-art pluralistic inpainting\nmethods on complex large-scale datasets (e.g., ImageNet). Codes are available\nat https://github.com/liuqk3/PUT.\n","authors":["Qiankun Liu","Yuqi Jiang","Zhentao Tan","Dongdong Chen","Ying Fu","Qi Chu","Gang Hua","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2404.00513v2.pdf","comment":"Accepted by TPAMI (2024). arXiv admin note: text overlap with\n arXiv:2205.05076"},{"id":"http://arxiv.org/abs/2404.08477v1","updated":"2024-04-12T13:55:05Z","published":"2024-04-12T13:55:05Z","title":"New Efficient Visual OILU Markers","summary":" Basic patterns are the source of a wide range of more or less complex\ngeometric structures. We will exploit such patterns to develop new efficient\nvisual markers. Besides being projective invariants, the proposed markers allow\nproducing rich panel of unique identifiers, highly required for\nresource-intensive navigation and augmented reality applications. The spiral\ntopology of our markers permits the validation of an accurate identification\nscheme, which is based on level set methods. The robustness of the markers\nagainst acquisition and geometric distortions is validated by extensive\nexperimental tests.\n","authors":["Youssef Chahir","Messaoud Mostefai","Hamza Saida"],"pdf_url":"https://arxiv.org/pdf/2404.08477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13570v2","updated":"2024-04-12T13:44:44Z","published":"2023-11-22T18:25:51Z","title":"WildFusion: Learning 3D-Aware Latent Diffusion Models in View Space","summary":" Modern learning-based approaches to 3D-aware image synthesis achieve high\nphotorealism and 3D-consistent viewpoint changes for the generated images.\nExisting approaches represent instances in a shared canonical space. However,\nfor in-the-wild datasets a shared canonical system can be difficult to define\nor might not even exist. In this work, we instead model instances in view\nspace, alleviating the need for posed images and learned camera distributions.\nWe find that in this setting, existing GAN-based methods are prone to\ngenerating flat geometry and struggle with distribution coverage. We hence\npropose WildFusion, a new approach to 3D-aware image synthesis based on latent\ndiffusion models (LDMs). We first train an autoencoder that infers a compressed\nlatent representation, which additionally captures the images' underlying 3D\nstructure and enables not only reconstruction but also novel view synthesis. To\nlearn a faithful 3D representation, we leverage cues from monocular depth\nprediction. 
Then, we train a diffusion model in the 3D-aware latent space,\nthereby enabling synthesis of high-quality 3D-consistent image samples,\noutperforming recent state-of-the-art GAN-based methods. Importantly, our\n3D-aware LDM is trained without any direct supervision from multiview images or\n3D geometry and does not require posed images or learned pose or camera\ndistributions. It directly learns a 3D representation without relying on\ncanonical camera coordinates. This opens up promising research avenues for\nscalable 3D-aware image synthesis and 3D content creation from in-the-wild\nimage data. See https://katjaschwarz.github.io/wildfusion for videos of our 3D\nresults.\n","authors":["Katja Schwarz","Seung Wook Kim","Jun Gao","Sanja Fidler","Andreas Geiger","Karsten Kreis"],"pdf_url":"https://arxiv.org/pdf/2311.13570v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08452v1","updated":"2024-04-12T13:02:08Z","published":"2024-04-12T13:02:08Z","title":"MoE-FFD: Mixture of Experts for Generalized and Parameter-Efficient Face\n Forgery Detection","summary":" Deepfakes have recently raised significant trust issues and security concerns\namong the public. Compared to CNN face forgery detectors, ViT-based methods\ntake advantage of the expressivity of transformers, achieving superior\ndetection performance. However, these approaches still exhibit the following\nlimitations: (1). Fully fine-tuning ViT-based models from ImageNet weights\ndemands substantial computational and storage resources; (2). ViT-based methods\nstruggle to capture local forgery clues, leading to model bias and limited\ngeneralizability. To tackle these challenges, this work introduces\nMixture-of-Experts modules for Face Forgery Detection (MoE-FFD), a generalized\nyet parameter-efficient ViT-based approach. MoE-FFD only updates lightweight\nLow-Rank Adaptation (LoRA) and Adapter layers while keeping the ViT backbone\nfrozen, thereby achieving parameter-efficient training. Moreover, MoE-FFD\nleverages the expressivity of transformers and local priors of CNNs to\nsimultaneously extract global and local forgery clues. Additionally, novel MoE\nmodules are designed to scale the model's capacity and select optimal forgery\nexperts, further enhancing forgery detection performance. The proposed MoE\nlearning scheme can be seamlessly adapted to various transformer backbones in a\nplug-and-play manner. Extensive experimental results demonstrate that the\nproposed method achieves state-of-the-art face forgery detection performance\nwith reduced parameter overhead. The code will be released upon acceptance.\n","authors":["Chenqi Kong","Anwei Luo","Song Xia","Yi Yu","Haoliang Li","Alex C. Kot"],"pdf_url":"https://arxiv.org/pdf/2404.08452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08450v1","updated":"2024-04-12T13:01:22Z","published":"2024-04-12T13:01:22Z","title":"Joint Physical-Digital Facial Attack Detection Via Simulating Spoofing\n Clues","summary":" Face recognition systems are frequently subjected to a variety of physical\nand digital attacks of different types. Previous methods have achieved\nsatisfactory performance in scenarios that address physical attacks and digital\nattacks, respectively. However, few methods are considered to integrate a model\nthat simultaneously addresses both physical and digital attacks, implying the\nnecessity to develop and maintain multiple models. 
To jointly detect physical\nand digital attacks within a single model, we propose an innovative approach\nthat can adapt to any network architecture. Our approach mainly contains two\ntypes of data augmentation, which we call Simulated Physical Spoofing Clues\naugmentation (SPSC) and Simulated Digital Spoofing Clues augmentation (SDSC).\nSPSC and SDSC augment live samples into simulated attack samples by simulating\nspoofing clues of physical and digital attacks, respectively, which\nsignificantly improve the capability of the model to detect \"unseen\" attack\ntypes. Extensive experiments show that SPSC and SDSC can achieve\nstate-of-the-art generalization in Protocols 2.1 and 2.2 of the UniAttackData\ndataset, respectively. Our method won first place in \"Unified Physical-Digital\nFace Attack Detection\" of the 5th Face Anti-spoofing Challenge@CVPR2024. Our\nfinal submission obtains 3.75% APCER, 0.93% BPCER, and 2.34% ACER,\nrespectively. Our code is available at\nhttps://github.com/Xianhua-He/cvpr2024-face-anti-spoofing-challenge.\n","authors":["Xianhua He","Dashuang Liang","Song Yang","Zhanlong Hao","Hui Ma","Binjie Mao","Xi Li","Yao Wang","Pengfei Yan","Ajian Liu"],"pdf_url":"https://arxiv.org/pdf/2404.08450v1.pdf","comment":"10 pages with 6 figures, Accepted by CVPRW 2024"},{"id":"http://arxiv.org/abs/2404.08449v1","updated":"2024-04-12T13:00:06Z","published":"2024-04-12T13:00:06Z","title":"OccGaussian: 3D Gaussian Splatting for Occluded Human Rendering","summary":" Rendering dynamic 3D human from monocular videos is crucial for various\napplications such as virtual reality and digital entertainment. Most methods\nassume the people is in an unobstructed scene, while various objects may cause\nthe occlusion of body parts in real-life scenarios. Previous method utilizing\nNeRF for surface rendering to recover the occluded areas, but it requiring more\nthan one day to train and several seconds to render, failing to meet the\nrequirements of real-time interactive applications. To address these issues, we\npropose OccGaussian based on 3D Gaussian Splatting, which can be trained within\n6 minutes and produces high-quality human renderings up to 160 FPS with\noccluded input. OccGaussian initializes 3D Gaussian distributions in the\ncanonical space, and we perform occlusion feature query at occluded regions,\nthe aggregated pixel-align feature is extracted to compensate for the missing\ninformation. Then we use Gaussian Feature MLP to further process the feature\nalong with the occlusion-aware loss functions to better perceive the occluded\narea. Extensive experiments both in simulated and real-world occlusions,\ndemonstrate that our method achieves comparable or even superior performance\ncompared to the state-of-the-art method. And we improving training and\ninference speeds by 250x and 800x, respectively. Our code will be available for\nresearch purposes.\n","authors":["Jingrui Ye","Zongkai Zhang","Yujiao Jiang","Qingmin Liao","Wenming Yang","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2404.08449v1.pdf","comment":"12 April, 2024; originally announced April 2024"},{"id":"http://arxiv.org/abs/2404.08433v1","updated":"2024-04-12T12:30:48Z","published":"2024-04-12T12:30:48Z","title":"MSSTNet: A Multi-Scale Spatio-Temporal CNN-Transformer Network for\n Dynamic Facial Expression Recognition","summary":" Unlike typical video action recognition, Dynamic Facial Expression\nRecognition (DFER) does not involve distinct moving targets but relies on\nlocalized changes in facial muscles. 
Addressing this distinctive attribute, we\npropose a Multi-Scale Spatio-temporal CNN-Transformer network (MSSTNet). Our\napproach takes spatial features of different scales extracted by CNN and feeds\nthem into a Multi-scale Embedding Layer (MELayer). The MELayer extracts\nmulti-scale spatial information and encodes these features before sending them\ninto a Temporal Transformer (T-Former). The T-Former simultaneously extracts\ntemporal information while continually integrating multi-scale spatial\ninformation. This process culminates in the generation of multi-scale\nspatio-temporal features that are utilized for the final classification. Our\nmethod achieves state-of-the-art results on two in-the-wild datasets.\nFurthermore, a series of ablation experiments and visualizations provide\nfurther validation of our approach's proficiency in leveraging spatio-temporal\ninformation within DFER.\n","authors":["Linhuang Wang","Xin Kang","Fei Ding","Satoshi Nakagawa","Fuji Ren"],"pdf_url":"https://arxiv.org/pdf/2404.08433v1.pdf","comment":"Accepted to 2024 IEEE International Conference on Acoustics, Speech,\n and Signal Processing (ICASSP 2024)"},{"id":"http://arxiv.org/abs/2404.08421v1","updated":"2024-04-12T12:10:53Z","published":"2024-04-12T12:10:53Z","title":"Adapting the Segment Anything Model During Usage in Novel Situations","summary":" The interactive segmentation task consists in the creation of object\nsegmentation masks based on user interactions. The most common way to guide a\nmodel towards producing a correct segmentation consists in clicks on the object\nand background. The recently published Segment Anything Model (SAM) supports a\ngeneralized version of the interactive segmentation problem and has been\ntrained on an object segmentation dataset which contains 1.1B masks. Though\nbeing trained extensively and with the explicit purpose of serving as a\nfoundation model, we show significant limitations of SAM when being applied for\ninteractive segmentation on novel domains or object types. On the used\ndatasets, SAM displays a failure rate $\\text{FR}_{30}@90$ of up to $72.6 \\%$.\nSince we still want such foundation models to be immediately applicable, we\npresent a framework that can adapt SAM during immediate usage. For this we will\nleverage the user interactions and masks, which are constructed during the\ninteractive segmentation process. We use this information to generate\npseudo-labels, which we use to compute a loss function and optimize a part of\nthe SAM model. The presented method causes a relative reduction of up to $48.1\n\\%$ in the $\\text{FR}_{20}@85$ and $46.6 \\%$ in the $\\text{FR}_{30}@90$\nmetrics.\n","authors":["Robin Schön","Julian Lorenz","Katja Ludwig","Rainer Lienhart"],"pdf_url":"https://arxiv.org/pdf/2404.08421v1.pdf","comment":"11 pages, 2 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.08419v1","updated":"2024-04-12T12:08:06Z","published":"2024-04-12T12:08:06Z","title":"Direct May Not Be the Best: An Incremental Evolution View of Pose\n Generation","summary":" Pose diversity is an inherent representative characteristic of 2D images. Due\nto the 3D to 2D projection mechanism, there is evident content discrepancy\namong distinct pose images. This is the main obstacle bothering pose\ntransformation related researches. To deal with this challenge, we propose a\nfine-grained incremental evolution centered pose generation framework, rather\nthan traditional direct one-to-one in a rush. 
Since the proposed approach actually\nbypasses the theoretical difficulty of directly modeling dramatic non-linear\nvariation, the incurred content distortion and blurring could be effectively\nconstrained, at the same time the various individual pose details, especially\nclothes texture, could be precisely maintained. In order to systematically\nguide the evolution course, both global and incremental evolution constraints\nare elaborately designed and merged into the overall framework. And a novel\ntriple-path knowledge fusion structure is worked out to take full advantage of\nall available valuable knowledge to conduct high-quality pose synthesis. In\naddition, our framework could generate a series of valuable byproducts, namely\nthe various intermediate poses. Extensive experiments have been conducted to\nverify the effectiveness of the proposed approach. Code is available at\nhttps://github.com/Xiaofei-CN/Incremental-Evolution-Pose-Generation.\n","authors":["Yuelong Li","Tengfei Xiao","Lei Geng","Jianming Wang"],"pdf_url":"https://arxiv.org/pdf/2404.08419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06707v2","updated":"2024-04-12T11:56:18Z","published":"2023-04-13T17:56:08Z","title":"Toward Reliable Human Pose Forecasting with Uncertainty","summary":" Recently, there has been an arms race of pose forecasting methods aimed at\nsolving the spatio-temporal task of predicting a sequence of future 3D poses of\na person given a sequence of past observed ones. However, the lack of unified\nbenchmarks and limited uncertainty analysis have hindered progress in the\nfield. To address this, we first develop an open-source library for human pose\nforecasting, including multiple models, supporting several datasets, and\nemploying standardized evaluation metrics, with the aim of promoting research\nand moving toward a unified and consistent evaluation. Second, we devise two\ntypes of uncertainty in the problem to increase performance and convey better\ntrust: 1) we propose a method for modeling aleatoric uncertainty by using\nuncertainty priors to inject knowledge about the pattern of uncertainty. This\nfocuses the capacity of the model in the direction of more meaningful\nsupervision while reducing the number of learned parameters and improving\nstability; 2) we introduce a novel approach for quantifying the epistemic\nuncertainty of any model through clustering and measuring the entropy of its\nassignments. Our experiments demonstrate up to $25\\%$ improvements in\nforecasting at short horizons, with no loss on longer horizons on Human3.6M,\nAMSS, and 3DPW datasets, and better performance in uncertainty estimation. The\ncode is available online at https://github.com/vita-epfl/UnPOSed.\n","authors":["Saeed Saadatnejad","Mehrshad Mirmohammadi","Matin Daghyani","Parham Saremi","Yashar Zoroofchi Benisi","Amirhossein Alimohammadi","Zahra Tehraninasab","Taylor Mordan","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2304.06707v2.pdf","comment":"Published in RA-L 2024"},{"id":"http://arxiv.org/abs/2404.08406v1","updated":"2024-04-12T11:33:26Z","published":"2024-04-12T11:33:26Z","title":"MambaDFuse: A Mamba-based Dual-phase Model for Multi-modality Image\n Fusion","summary":" Multi-modality image fusion (MMIF) aims to integrate complementary\ninformation from different modalities into a single fused image to represent\nthe imaging scene and facilitate downstream visual tasks comprehensively. 
In\nrecent years, significant progress has been made in MMIF tasks due to advances\nin deep neural networks. However, existing methods cannot effectively and\nefficiently extract modality-specific and modality-fused features constrained\nby the inherent local reductive bias (CNN) or quadratic computational\ncomplexity (Transformers). To overcome this issue, we propose a Mamba-based\nDual-phase Fusion (MambaDFuse) model. Firstly, a dual-level feature extractor\nis designed to capture long-range features from single-modality images by\nextracting low and high-level features from CNN and Mamba blocks. Then, a\ndual-phase feature fusion module is proposed to obtain fusion features that\ncombine complementary information from different modalities. It uses the\nchannel exchange method for shallow fusion and the enhanced Multi-modal Mamba\n(M3) blocks for deep fusion. Finally, the fused image reconstruction module\nutilizes the inverse transformation of the feature extraction to generate the\nfused result. Through extensive experiments, our approach achieves promising\nfusion results in infrared-visible image fusion and medical image fusion.\nAdditionally, in a unified benchmark, MambaDFuse has also demonstrated improved\nperformance in downstream tasks such as object detection. Code with checkpoints\nwill be available after the peer-review process.\n","authors":["Zhe Li","Haiwei Pan","Kejia Zhang","Yuhua Wang","Fengming Yu"],"pdf_url":"https://arxiv.org/pdf/2404.08406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08401v1","updated":"2024-04-12T11:15:15Z","published":"2024-04-12T11:15:15Z","title":"No Bells, Just Whistles: Sports Field Registration by Leveraging\n Geometric Properties","summary":" Broadcast sports field registration is traditionally addressed as a\nhomography estimation task, mapping the visible image area to a planar field\nmodel, predominantly focusing on the main camera shot. Addressing the\nshortcomings of previous approaches, we propose a novel calibration pipeline\nenabling camera calibration using a 3D soccer field model and extending the\nprocess to assess the multiple-view nature of broadcast videos. Our approach\nbegins with a keypoint generation pipeline derived from SoccerNet dataset\nannotations, leveraging the geometric properties of the court. Subsequently, we\nexecute classical camera calibration through DLT algorithm in a minimalist\nfashion, without further refinement. Through extensive experimentation on\nreal-world soccer broadcast datasets such as SoccerNet-Calibration, WorldCup\n2014 and TS- WorldCup, our method demonstrates superior performance in both\nmultiple- and single-view 3D camera calibration while maintaining competitive\nresults in homography estimation compared to state-of-the-art techniques.\n","authors":["Marc Gutiérrez-Pérez","Antonio Agudo"],"pdf_url":"https://arxiv.org/pdf/2404.08401v1.pdf","comment":"Accepted in CVPRW 2024"},{"id":"http://arxiv.org/abs/2105.03026v2","updated":"2024-04-12T11:14:04Z","published":"2021-05-07T01:32:37Z","title":"Efficient Masked Face Recognition Method during the COVID-19 Pandemic","summary":" The coronavirus disease (COVID-19) is an unparalleled crisis leading to a\nhuge number of casualties and security problems. In order to reduce the spread\nof coronavirus, people often wear masks to protect themselves. This makes face\nrecognition a very difficult task since certain parts of the face are hidden. 
A\nprimary focus of researchers during the ongoing coronavirus pandemic is to come\nup with suggestions to handle this problem through rapid and efficient\nsolutions. In this paper, we propose a reliable method based on occlusion\nremoval and deep learning-based features in order to address the problem of the\nmasked face recognition process. The first step is to remove the masked face\nregion. Next, we apply three pre-trained deep Convolutional Neural Networks\n(CNN) namely, VGG-16, AlexNet, and ResNet-50, and use them to extract deep\nfeatures from the obtained regions (mostly eyes and forehead regions). The\nBag-of-features paradigm is then applied to the feature maps of the last\nconvolutional layer in order to quantize them and to get a slight\nrepresentation comparing to the fully connected layer of classical CNN.\nFinally, Multilayer Perceptron (MLP) is applied for the classification process.\nExperimental results on Real-World-Masked-Face-Dataset show high recognition\nperformance compared to other state-of-the-art methods.\n","authors":["Walid Hariri"],"pdf_url":"https://arxiv.org/pdf/2105.03026v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08399v1","updated":"2024-04-12T11:08:26Z","published":"2024-04-12T11:08:26Z","title":"Mitigating Challenges of the Space Environment for Onboard Artificial\n Intelligence: Design Overview of the Imaging Payload on SpIRIT","summary":" Artificial intelligence (AI) and autonomous edge computing in space are\nemerging areas of interest to augment capabilities of nanosatellites, where\nmodern sensors generate orders of magnitude more data than can typically be\ntransmitted to mission control. Here, we present the hardware and software\ndesign of an onboard AI subsystem hosted on SpIRIT. The system is optimised for\non-board computer vision experiments based on visible light and long wave\ninfrared cameras. This paper highlights the key design choices made to maximise\nthe robustness of the system in harsh space conditions, and their motivation\nrelative to key mission requirements, such as limited compute resources,\nresilience to cosmic radiation, extreme temperature variations, distribution\nshifts, and very low transmission bandwidths. The payload, called Loris,\nconsists of six visible light cameras, three infrared cameras, a camera control\nboard and a Graphics Processing Unit (GPU) system-on-module. Loris enables the\nexecution of AI models with on-orbit fine-tuning as well as a next-generation\nimage compression algorithm, including progressive coding. This innovative\napproach not only enhances the data processing capabilities of nanosatellites\nbut also lays the groundwork for broader applications to remote sensing from\nspace.\n","authors":["Miguel Ortiz del Castillo","Jonathan Morgan","Jack McRobbie","Clint Therakam","Zaher Joukhadar","Robert Mearns","Simon Barraclough","Richard Sinnott","Andrew Woods","Chris Bayliss","Kris Ehinger","Ben Rubinstein","James Bailey","Airlie Chapman","Michele Trenti"],"pdf_url":"https://arxiv.org/pdf/2404.08399v1.pdf","comment":"AI4Space 2024, 3rd Workshop on AI for Space, CVPR 2024"},{"id":"http://arxiv.org/abs/2404.08392v1","updated":"2024-04-12T10:54:11Z","published":"2024-04-12T10:54:11Z","title":"NC-TTT: A Noise Contrastive Approach for Test-Time Training","summary":" Despite their exceptional performance in vision tasks, deep learning models\noften struggle when faced with domain shifts during testing. 
Test-Time Training\n(TTT) methods have recently gained popularity by their ability to enhance the\nrobustness of models through the addition of an auxiliary objective that is\njointly optimized with the main task. Being strictly unsupervised, this\nauxiliary objective is used at test time to adapt the model without any access\nto labels. In this work, we propose Noise-Contrastive Test-Time Training\n(NC-TTT), a novel unsupervised TTT technique based on the discrimination of\nnoisy feature maps. By learning to classify noisy views of projected feature\nmaps, and then adapting the model accordingly on new domains, classification\nperformance can be recovered by an important margin. Experiments on several\npopular test-time adaptation baselines demonstrate the advantages of our method\ncompared to recent approaches for this task. The code can be found\nat:https://github.com/GustavoVargasHakim/NCTTT.git\n","authors":["David Osowiechi","Gustavo A. Vargas Hakim","Mehrdad Noori","Milad Cheraghalikhani","Ali Bahri","Moslem Yazdanpanah","Ismail Ben Ayed","Christian Desrosiers"],"pdf_url":"https://arxiv.org/pdf/2404.08392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04385v2","updated":"2024-04-12T10:15:45Z","published":"2024-03-07T10:25:23Z","title":"Impacts of Color and Texture Distortions on Earth Observation Data in\n Deep Learning","summary":" Land cover classification and change detection are two important applications\nof remote sensing and Earth observation (EO) that have benefited greatly from\nthe advances of deep learning. Convolutional and transformer-based U-net models\nare the state-of-the-art architectures for these tasks, and their performances\nhave been boosted by an increased availability of large-scale annotated EO\ndatasets. However, the influence of different visual characteristics of the\ninput EO data on a model's predictions is not well understood. In this work we\nsystematically examine model sensitivities with respect to several color- and\ntexture-based distortions on the input EO data during inference, given models\nthat have been trained without such distortions. We conduct experiments with\nmultiple state-of-the-art segmentation networks for land cover classification\nand show that they are in general more sensitive to texture than to color\ndistortions. Beyond revealing intriguing characteristics of widely used land\ncover classification models, our results can also be used to guide the\ndevelopment of more robust models within the EO domain.\n","authors":["Martin Willbo","Aleksis Pirinen","John Martinsson","Edvin Listo Zec","Olof Mogren","Mikael Nilsson"],"pdf_url":"https://arxiv.org/pdf/2403.04385v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08363v1","updated":"2024-04-12T10:04:03Z","published":"2024-04-12T10:04:03Z","title":"Let It Flow: Simultaneous Optimization of 3D Flow and Object Clustering","summary":" We study the problem of self-supervised 3D scene flow estimation from real\nlarge-scale raw point cloud sequences, which is crucial to various tasks like\ntrajectory prediction or instance segmentation. In the absence of ground truth\nscene flow labels, contemporary approaches concentrate on deducing optimizing\nflow across sequential pairs of point clouds by incorporating structure based\nregularization on flow and object rigidity. The rigid objects are estimated by\na variety of 3D spatial clustering methods. 
While state-of-the-art methods\nsuccessfully capture overall scene motion using the Neural Prior structure,\nthey encounter challenges in discerning multi-object motions. We identified the\nstructural constraints and the use of large and strict rigid clusters as the\nmain pitfall of the current approaches and we propose a novel clustering\napproach that allows for combination of overlapping soft clusters as well as\nnon-overlapping rigid clusters representation. Flow is then jointly estimated\nwith progressively growing non-overlapping rigid clusters together with fixed\nsize overlapping soft clusters. We evaluate our method on multiple datasets\nwith LiDAR point clouds, demonstrating the superior performance over the\nself-supervised baselines reaching new state of the art results. Our method\nespecially excels in resolving flow in complicated dynamic scenes with multiple\nindependently moving objects close to each other which includes pedestrians,\ncyclists and other vulnerable road users. Our codes will be publicly available.\n","authors":["Patrik Vacek","David Hurych","Tomáš Svoboda","Karel Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2404.08363v1.pdf","comment":"ECCV submission"},{"id":"http://arxiv.org/abs/2404.08353v1","updated":"2024-04-12T09:44:18Z","published":"2024-04-12T09:44:18Z","title":"TDANet: Target-Directed Attention Network For Object-Goal Visual\n Navigation With Zero-Shot Ability","summary":" The generalization of the end-to-end deep reinforcement learning (DRL) for\nobject-goal visual navigation is a long-standing challenge since object classes\nand placements vary in new test environments. Learning domain-independent\nvisual representation is critical for enabling the trained DRL agent with the\nability to generalize to unseen scenes and objects. In this letter, a\ntarget-directed attention network (TDANet) is proposed to learn the end-to-end\nobject-goal visual navigation policy with zero-shot ability. TDANet features a\nnovel target attention (TA) module that learns both the spatial and semantic\nrelationships among objects to help TDANet focus on the most relevant observed\nobjects to the target. With the Siamese architecture (SA) design, TDANet\ndistinguishes the difference between the current and target states and\ngenerates the domain-independent visual representation. To evaluate the\nnavigation performance of TDANet, extensive experiments are conducted in the\nAI2-THOR embodied AI environment. The simulation results demonstrate a strong\ngeneralization ability of TDANet to unseen scenes and target objects, with\nhigher navigation success rate (SR) and success weighted by length (SPL) than\nother state-of-the-art models.\n","authors":["Shiwei Lian","Feitian Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.08353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16588v2","updated":"2024-04-12T09:38:33Z","published":"2023-09-28T16:45:46Z","title":"Vision Transformers Need Registers","summary":" Transformers have recently emerged as a powerful tool for learning visual\nrepresentations. In this paper, we identify and characterize artifacts in\nfeature maps of both supervised and self-supervised ViT networks. The artifacts\ncorrespond to high-norm tokens appearing during inference primarily in\nlow-informative background areas of images, that are repurposed for internal\ncomputations. We propose a simple yet effective solution based on providing\nadditional tokens to the input sequence of the Vision Transformer to fill that\nrole. 
We show that this solution fixes that problem entirely for both\nsupervised and self-supervised models, sets a new state of the art for\nself-supervised visual models on dense visual prediction tasks, enables object\ndiscovery methods with larger models, and most importantly leads to smoother\nfeature maps and attention maps for downstream visual processing.\n","authors":["Timothée Darcet","Maxime Oquab","Julien Mairal","Piotr Bojanowski"],"pdf_url":"https://arxiv.org/pdf/2309.16588v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16254v2","updated":"2024-04-12T09:37:37Z","published":"2023-11-27T19:02:17Z","title":"Safe-CLIP: Removing NSFW Concepts from Vision-and-Language Models","summary":" Large-scale vision-and-language models, such as CLIP, are typically trained\non web-scale data, which can introduce inappropriate content and lead to the\ndevelopment of unsafe and biased behavior. This, in turn, hampers their\napplicability in sensitive and trustworthy contexts and could raise significant\nconcerns in their adoption. Our research introduces a novel approach to\nenhancing the safety of vision-and-language models by diminishing their\nsensitivity to NSFW (not safe for work) inputs. In particular, our methodology\nseeks to sever \"toxic\" linguistic and visual concepts, unlearning the linkage\nbetween unsafe linguistic or visual items and unsafe regions of the embedding\nspace. We show how this can be done by fine-tuning a CLIP model on synthetic\ndata obtained from a large language model trained to convert between safe and\nunsafe sentences, and a text-to-image generator. We conduct extensive\nexperiments on the resulting embedding space for cross-modal retrieval,\ntext-to-image, and image-to-text generation, where we show that our model can\nbe remarkably employed with pre-trained generative models. Our source code and\ntrained models are available at: https://github.com/aimagelab/safe-clip.\n","authors":["Samuele Poppi","Tobia Poppi","Federico Cocchi","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2311.16254v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07236v2","updated":"2024-04-12T09:34:38Z","published":"2024-04-08T08:50:09Z","title":"Lightweight Deep Learning for Resource-Constrained Environments: A\n Survey","summary":" Over the past decade, the dominance of deep learning has prevailed across\nvarious domains of artificial intelligence, including natural language\nprocessing, computer vision, and biomedical signal processing. While there have\nbeen remarkable improvements in model accuracy, deploying these models on\nlightweight devices, such as mobile phones and microcontrollers, is constrained\nby limited resources. In this survey, we provide comprehensive design guidance\ntailored for these devices, detailing the meticulous design of lightweight\nmodels, compression methods, and hardware acceleration strategies. The\nprincipal goal of this work is to explore methods and concepts for getting\naround hardware constraints without compromising the model's accuracy.\nAdditionally, we explore two notable paths for lightweight deep learning in the\nfuture: deployment techniques for TinyML and Large Language Models. 
Although\nthese paths undoubtedly have potential, they also present significant\nchallenges, encouraging research into unexplored areas.\n","authors":["Hou-I Liu","Marco Galindo","Hongxia Xie","Lai-Kuan Wong","Hong-Han Shuai","Yung-Hui Li","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.07236v2.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2404.08351v1","updated":"2024-04-12T09:31:55Z","published":"2024-04-12T09:31:55Z","title":"OmniSat: Self-Supervised Modality Fusion for Earth Observation","summary":" The field of Earth Observations (EO) offers a wealth of data from diverse\nsensors, presenting a great opportunity for advancing self-supervised\nmultimodal learning. However, current multimodal EO datasets and models focus\non a single data type, either mono-date images or time series, which limits\ntheir expressivity. We introduce OmniSat, a novel architecture that exploits\nthe spatial alignment between multiple EO modalities to learn expressive\nmultimodal representations without labels. To demonstrate the advantages of\ncombining modalities of different natures, we augment two existing datasets\nwith new modalities. As demonstrated on three downstream tasks (forestry, land\ncover classification, and crop mapping), OmniSat can learn rich representations\nin an unsupervised manner, leading to improved performance in the semi- and\nfully-supervised settings, even when only one modality is available for\ninference. The code and dataset are available at github.com/gastruc/OmniSat.\n","authors":["Guillaume Astruc","Nicolas Gonthier","Clement Mallet","Loic Landrieu"],"pdf_url":"https://arxiv.org/pdf/2404.08351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08350v1","updated":"2024-04-12T09:31:11Z","published":"2024-04-12T09:31:11Z","title":"Self-Supervised k-Space Regularization for Motion-Resolved Abdominal MRI\n Using Neural Implicit k-Space Representation","summary":" Neural implicit k-space representations have shown promising results for\ndynamic MRI at high temporal resolutions. Yet, their exclusive training in\nk-space limits the application of common image regularization methods to\nimprove the final reconstruction. In this work, we introduce the concept of\nparallel imaging-inspired self-consistency (PISCO), which we incorporate as\nnovel self-supervised k-space regularization enforcing a consistent\nneighborhood relationship. At no additional data cost, the proposed\nregularization significantly improves neural implicit k-space reconstructions\non simulated data. Abdominal in-vivo reconstructions using PISCO result in\nenhanced spatio-temporal image quality compared to state-of-the-art methods.\nCode is available at https://github.com/vjspi/PISCO-NIK.\n","authors":["Veronika Spieker","Hannah Eichhorn","Jonathan K. Stelter","Wenqi Huang","Rickmer F. Braren","Daniel Rückert","Francisco Sahli Costabal","Kerstin Hammernik","Claudia Prieto","Dimitrios C. Karampinos","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2404.08350v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2404.08347v1","updated":"2024-04-12T09:22:24Z","published":"2024-04-12T09:22:24Z","title":"Learning to Rebalance Multi-Modal Optimization by Adaptively Masking\n Subnetworks","summary":" Multi-modal learning aims to enhance performance by unifying models from\nvarious modalities but often faces the \"modality imbalance\" problem in real\ndata, leading to a bias towards dominant modalities and neglecting others,\nthereby limiting its overall effectiveness. 
To address this challenge, the core\nidea is to balance the optimization of each modality to achieve a joint\noptimum. Existing approaches often employ a modal-level control mechanism for\nadjusting the update of each modal parameter. However, such a global-wise\nupdating mechanism ignores the different importance of each parameter. Inspired\nby subnetwork optimization, we explore a uniform sampling-based optimization\nstrategy and find it more effective than global-wise updating. According to the\nfindings, we further propose a novel importance sampling-based, element-wise\njoint optimization method, called Adaptively Mask Subnetworks Considering Modal\nSignificance(AMSS). Specifically, we incorporate mutual information rates to\ndetermine the modal significance and employ non-uniform adaptive sampling to\nselect foreground subnetworks from each modality for parameter updates, thereby\nrebalancing multi-modal learning. Additionally, we demonstrate the reliability\nof the AMSS strategy through convergence analysis. Building upon theoretical\ninsights, we further enhance the multi-modal mask subnetwork strategy using\nunbiased estimation, referred to as AMSS+. Extensive experiments reveal the\nsuperiority of our approach over comparison methods.\n","authors":["Yang Yang","Hongpeng Pan","Qing-Yuan Jiang","Yi Xu","Jinghui Tang"],"pdf_url":"https://arxiv.org/pdf/2404.08347v1.pdf","comment":"17 pages;6 figures"},{"id":"http://arxiv.org/abs/2308.09372v2","updated":"2024-04-12T09:21:33Z","published":"2023-08-18T08:06:49Z","title":"Which Transformer to Favor: A Comparative Analysis of Efficiency in\n Vision Transformers","summary":" Transformers come with a high computational cost, yet their effectiveness in\naddressing problems in language and vision has sparked extensive research aimed\nat enhancing their efficiency. However, diverse experimental conditions,\nspanning multiple input domains, prevent a fair comparison based solely on\nreported results, posing challenges for model selection. To address this gap in\ncomparability, we design a comprehensive benchmark of more than 30 models for\nimage classification, evaluating key efficiency aspects, including accuracy,\nspeed, and memory usage. This benchmark provides a standardized baseline across\nthe landscape of efficiency-oriented transformers and our framework of\nanalysis, based on Pareto optimality, reveals surprising insights. Despite\nclaims of other models being more efficient, ViT remains Pareto optimal across\nmultiple metrics. We observe that hybrid attention-CNN models exhibit\nremarkable inference memory- and parameter-efficiency. Moreover, our benchmark\nshows that using a larger model in general is more efficient than using higher\nresolution images. Thanks to our holistic evaluation, we provide a centralized\nresource for practitioners and researchers, facilitating informed decisions\nwhen selecting transformers or measuring progress of the development of\nefficient transformers.\n","authors":["Tobias Christian Nauen","Sebastian Palacio","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2308.09372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08341v1","updated":"2024-04-12T09:13:37Z","published":"2024-04-12T09:13:37Z","title":"Counterfactual Explanations for Face Forgery Detection via Adversarial\n Removal of Artifacts","summary":" Highly realistic AI generated face forgeries known as deepfakes have raised\nserious social concerns. 
Although DNN-based face forgery detection models have\nachieved good performance, they are vulnerable to latest generative methods\nthat have less forgery traces and adversarial attacks. This limitation of\ngeneralization and robustness hinders the credibility of detection results and\nrequires more explanations. In this work, we provide counterfactual\nexplanations for face forgery detection from an artifact removal perspective.\nSpecifically, we first invert the forgery images into the StyleGAN latent\nspace, and then adversarially optimize their latent representations with the\ndiscrimination supervision from the target detection model. We verify the\neffectiveness of the proposed explanations from two aspects: (1) Counterfactual\nTrace Visualization: the enhanced forgery images are useful to reveal artifacts\nby visually contrasting the original images and two different visualization\nmethods; (2) Transferable Adversarial Attacks: the adversarial forgery images\ngenerated by attacking the detection model are able to mislead other detection\nmodels, implying the removed artifacts are general. Extensive experiments\ndemonstrate that our method achieves over 90% attack success rate and superior\nattack transferability. Compared with naive adversarial noise methods, our\nmethod adopts both generative and discriminative model priors, and optimize the\nlatent representations in a synthesis-by-analysis way, which forces the search\nof counterfactual explanations on the natural face manifold. Thus, more general\ncounterfactual traces can be found and better adversarial attack\ntransferability can be achieved.\n","authors":["Yang Li","Songlin Yang","Wei Wang","Ziwen He","Bo Peng","Jing Dong"],"pdf_url":"https://arxiv.org/pdf/2404.08341v1.pdf","comment":"Accepted to ICME2024"},{"id":"http://arxiv.org/abs/2404.07762v2","updated":"2024-04-12T09:13:29Z","published":"2024-04-11T14:03:16Z","title":"NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous\n Driving","summary":" We present a versatile NeRF-based simulator for testing autonomous driving\n(AD) software systems, designed with a focus on sensor-realistic closed-loop\nevaluation and the creation of safety-critical scenarios. The simulator learns\nfrom sequences of real-world driving sensor data and enables reconfigurations\nand renderings of new, unseen scenarios. In this work, we use our simulator to\ntest the responses of AD models to safety-critical scenarios inspired by the\nEuropean New Car Assessment Programme (Euro NCAP). Our evaluation reveals that,\nwhile state-of-the-art end-to-end planners excel in nominal driving scenarios\nin an open-loop setting, they exhibit critical flaws when navigating our\nsafety-critical scenarios in a closed-loop setting. This highlights the need\nfor advancements in the safety and real-world usability of end-to-end planners.\nBy publicly releasing our simulator and scenarios as an easy-to-run evaluation\nsuite, we invite the research community to explore, refine, and validate their\nAD models in controlled, yet highly configurable and challenging\nsensor-realistic environments. 
Code and instructions can be found at\nhttps://github.com/wljungbergh/NeuroNCAP\n","authors":["William Ljungbergh","Adam Tonderski","Joakim Johnander","Holger Caesar","Kalle Åström","Michael Felsberg","Christoffer Petersson"],"pdf_url":"https://arxiv.org/pdf/2404.07762v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16794v2","updated":"2024-04-12T09:04:05Z","published":"2023-12-28T02:54:34Z","title":"ZONE: Zero-Shot Instruction-Guided Local Editing","summary":" Recent advances in vision-language models like Stable Diffusion have shown\nremarkable power in creative image synthesis and editing.However, most existing\ntext-to-image editing methods encounter two obstacles: First, the text prompt\nneeds to be carefully crafted to achieve good results, which is not intuitive\nor user-friendly. Second, they are insensitive to local edits and can\nirreversibly affect non-edited regions, leaving obvious editing traces. To\ntackle these problems, we propose a Zero-shot instructiON-guided local image\nEditing approach, termed ZONE. We first convert the editing intent from the\nuser-provided instruction (e.g., \"make his tie blue\") into specific image\nediting regions through InstructPix2Pix. We then propose a Region-IoU scheme\nfor precise image layer extraction from an off-the-shelf segment model. We\nfurther develop an edge smoother based on FFT for seamless blending between the\nlayer and the image.Our method allows for arbitrary manipulation of a specific\nregion with a single instruction while preserving the rest. Extensive\nexperiments demonstrate that our ZONE achieves remarkable local editing results\nand user-friendliness, outperforming state-of-the-art methods. Code is\navailable at https://github.com/lsl001006/ZONE.\n","authors":["Shanglin Li","Bohan Zeng","Yutang Feng","Sicheng Gao","Xuhui Liu","Jiaming Liu","Li Lin","Xu Tang","Yao Hu","Jianzhuang Liu","Baochang Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.16794v2.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2403.06567v2","updated":"2024-04-12T08:52:24Z","published":"2024-03-11T10:06:45Z","title":"Leveraging Foundation Models for Content-Based Medical Image Retrieval\n in Radiology","summary":" Content-based image retrieval (CBIR) has the potential to significantly\nimprove diagnostic aid and medical research in radiology. Current CBIR systems\nface limitations due to their specialization to certain pathologies, limiting\ntheir utility. In response, we propose using vision foundation models as\npowerful and versatile off-the-shelf feature extractors for content-based\nmedical image retrieval. By benchmarking these models on a comprehensive\ndataset of 1.6 million 2D radiological images spanning four modalities and 161\npathologies, we identify weakly-supervised models as superior, achieving a P@1\nof up to 0.594. This performance not only competes with a specialized model but\ndoes so without the need for fine-tuning. Our analysis further explores the\nchallenges in retrieving pathological versus anatomical structures, indicating\nthat accurate retrieval of pathological features presents greater difficulty.\nDespite these challenges, our research underscores the vast potential of\nfoundation models for CBIR in radiology, proposing a shift towards versatile,\ngeneral-purpose medical image retrieval systems that do not require specific\ntuning.\n","authors":["Stefan Denner","David Zimmerer","Dimitrios Bounias","Markus Bujotzek","Shuhan Xiao","Lisa Kausch","Philipp Schader","Tobias Penzkofer","Paul F. 
Jäger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2403.06567v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08330v1","updated":"2024-04-12T08:46:53Z","published":"2024-04-12T08:46:53Z","title":"Emerging Property of Masked Token for Effective Pre-training","summary":" Driven by the success of Masked Language Modeling (MLM), the realm of\nself-supervised learning for computer vision has been invigorated by the\ncentral role of Masked Image Modeling (MIM) in driving recent breakthroughs.\nNotwithstanding the achievements of MIM across various downstream tasks, its\noverall efficiency is occasionally hampered by the lengthy duration of the\npre-training phase. This paper presents a perspective that the optimization of\nmasked tokens as a means of addressing the prevailing issue. Initially, we\ndelve into an exploration of the inherent properties that a masked token ought\nto possess. Within the properties, we principally dedicated to articulating and\nemphasizing the `data singularity' attribute inherent in masked tokens. Through\na comprehensive analysis of the heterogeneity between masked tokens and visible\ntokens within pre-trained models, we propose a novel approach termed masked\ntoken optimization (MTO), specifically designed to improve model efficiency\nthrough weight recalibration and the enhancement of the key property of masked\ntokens. The proposed method serves as an adaptable solution that seamlessly\nintegrates into any MIM approach that leverages masked tokens. As a result, MTO\nachieves a considerable improvement in pre-training efficiency, resulting in an\napproximately 50% reduction in pre-training epochs required to attain converged\nperformance of the recent approaches.\n","authors":["Hyesong Choi","Hunsang Lee","Seyoung Joung","Hyejin Park","Jiyeong Kim","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2404.08330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01449v2","updated":"2024-04-12T08:40:55Z","published":"2024-03-03T09:07:16Z","title":"DUFOMap: Efficient Dynamic Awareness Mapping","summary":" The dynamic nature of the real world is one of the main challenges in\nrobotics. The first step in dealing with it is to detect which parts of the\nworld are dynamic. A typical benchmark task is to create a map that contains\nonly the static part of the world to support, for example, localization and\nplanning. Current solutions are often applied in post-processing, where\nparameter tuning allows the user to adjust the setting for a specific dataset.\nIn this paper, we propose DUFOMap, a novel dynamic awareness mapping framework\ndesigned for efficient online processing. Despite having the same parameter\nsettings for all scenarios, it performs better or is on par with\nstate-of-the-art methods. Ray casting is utilized to identify and classify\nfully observed empty regions. Since these regions have been observed empty, it\nfollows that anything inside them at another time must be dynamic. Evaluation\nis carried out in various scenarios, including outdoor environments in KITTI\nand Argoverse 2, open areas on the KTH campus, and with different sensor types.\nDUFOMap outperforms the state of the art in terms of accuracy and computational\nefficiency. The source code, benchmarks, and links to the datasets utilized are\nprovided. 
See https://kth-rpl.github.io/dufomap for more details.\n","authors":["Daniel Duberg","Qingwen Zhang","MingKai Jia","Patric Jensfelt"],"pdf_url":"https://arxiv.org/pdf/2403.01449v2.pdf","comment":"The first two authors hold equal contribution. 8 pages, 7 figures,\n project page https://kth-rpl.github.io/dufomap"},{"id":"http://arxiv.org/abs/2404.08327v1","updated":"2024-04-12T08:38:51Z","published":"2024-04-12T08:38:51Z","title":"Salience-Based Adaptive Masking: Revisiting Token Dynamics for Enhanced\n Pre-training","summary":" In this paper, we introduce Saliency-Based Adaptive Masking (SBAM), a novel\nand cost-effective approach that significantly enhances the pre-training\nperformance of Masked Image Modeling (MIM) approaches by prioritizing token\nsalience. Our method provides robustness against variations in masking ratios,\neffectively mitigating the performance instability issues common in existing\nmethods. This relaxes the sensitivity of MIM-based pre-training to masking\nratios, which in turn allows us to propose an adaptive strategy for `tailored'\nmasking ratios for each data sample, which no existing method can provide.\nToward this goal, we propose an Adaptive Masking Ratio (AMR) strategy that\ndynamically adjusts the proportion of masking for the unique content of each\nimage based on token salience. We show that our method significantly improves\nover the state-of-the-art in mask-based pre-training on the ImageNet-1K\ndataset.\n","authors":["Hyesong Choi","Hyejin Park","Kwang Moo Yi","Sungmin Cha","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2404.08327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.14335v2","updated":"2024-04-12T08:37:47Z","published":"2021-09-29T10:41:41Z","title":"A Systematic Survey of Deep Learning-based Single-Image Super-Resolution","summary":" Single-image super-resolution (SISR) is an important task in image\nprocessing, which aims to enhance the resolution of imaging systems. Recently,\nSISR has made a huge leap and has achieved promising results with the help of\ndeep learning (DL). In this survey, we give an overview of DL-based SISR\nmethods and group them according to their design targets. Specifically, we\nfirst introduce the problem definition, research background, and the\nsignificance of SISR. Secondly, we introduce some related works, including\nbenchmark datasets, upsampling methods, optimization objectives, and image\nquality assessment methods. Thirdly, we provide a detailed investigation of\nSISR and give some domain-specific applications of it. Fourthly, we present the\nreconstruction results of some classic SISR methods to intuitively know their\nperformance. Finally, we discuss some issues that still exist in SISR and\nsummarize some new trends and future directions. This is an exhaustive survey\nof SISR, which can help researchers better understand SISR and inspire more\nexciting research in this field. An investigation project for SISR is provided\nat https://github.com/CV-JunchengLi/SISR-Survey.\n","authors":["Juncheng Li","Zehua Pei","Wenjie Li","Guangwei Gao","Longguang Wang","Yingqian Wang","Tieyong Zeng"],"pdf_url":"https://arxiv.org/pdf/2109.14335v2.pdf","comment":"40 pages, 12 figures"},{"id":"http://arxiv.org/abs/2404.07537v2","updated":"2024-04-12T08:18:44Z","published":"2024-04-11T08:03:23Z","title":"How is Visual Attention Influenced by Text Guidance? 
Database and Model","summary":" The analysis and prediction of visual attention have long been crucial tasks\nin the fields of computer vision and image processing. In practical\napplications, images are generally accompanied by various text descriptions,\nhowever, few studies have explored the influence of text descriptions on visual\nattention, let alone developed visual saliency prediction models considering\ntext guidance. In this paper, we conduct a comprehensive study on text-guided\nimage saliency (TIS) from both subjective and objective perspectives.\nSpecifically, we construct a TIS database named SJTU-TIS, which includes 1200\ntext-image pairs and the corresponding collected eye-tracking data. Based on\nthe established SJTU-TIS database, we analyze the influence of various text\ndescriptions on visual attention. Then, to facilitate the development of\nsaliency prediction models considering text influence, we construct a benchmark\nfor the established SJTU-TIS database using state-of-the-art saliency models.\nFinally, considering the effect of text descriptions on visual attention, while\nmost existing saliency models ignore this impact, we further propose a\ntext-guided saliency (TGSal) prediction model, which extracts and integrates\nboth image features and text features to predict the image saliency under\nvarious text-description conditions. Our proposed model significantly\noutperforms the state-of-the-art saliency models on both the SJTU-TIS database\nand the pure image saliency databases in terms of various evaluation metrics.\nThe SJTU-TIS database and the code of the proposed TGSal model will be released\nat: https://github.com/IntMeGroup/TGSal.\n","authors":["Yinan Sun","Xiongkuo Min","Huiyu Duan","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2404.07537v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08312v1","updated":"2024-04-12T08:14:17Z","published":"2024-04-12T08:14:17Z","title":"GPN: Generative Point-based NeRF","summary":" Scanning real-life scenes with modern registration devices typically gives\nincomplete point cloud representations, primarily due to the limitations of\npartial scanning, 3D occlusions, and dynamic light conditions. Recent works on\nprocessing incomplete point clouds have always focused on point cloud\ncompletion. However, these approaches do not ensure consistency between the\ncompleted point cloud and the captured images regarding color and geometry. We\npropose using Generative Point-based NeRF (GPN) to reconstruct and repair a\npartial cloud by fully utilizing the scanning images and the corresponding\nreconstructed cloud. The repaired point cloud can achieve multi-view\nconsistency with the captured images at high spatial resolution. For the\nfinetunes of a single scene, we optimize the global latent condition by\nincorporating an Auto-Decoder architecture while retaining multi-view\nconsistency. As a result, the generated point clouds are smooth, plausible, and\ngeometrically consistent with the partial scanning images. 
Extensive\nexperiments on ShapeNet demonstrate that our works achieve competitive\nperformances to the other state-of-the-art point cloud-based neural scene\nrendering and editing performances.\n","authors":["Haipeng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.08312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08801v4","updated":"2024-04-12T07:48:45Z","published":"2024-02-05T12:33:37Z","title":"CoBra: Complementary Branch Fusing Class and Semantic Knowledge for\n Robust Weakly Supervised Semantic Segmentation","summary":" Leveraging semantically precise pseudo masks derived from image-level class\nknowledge for segmentation, namely image-level Weakly Supervised Semantic\nSegmentation (WSSS), still remains challenging. While Class Activation Maps\n(CAMs) using CNNs have steadily been contributing to the success of WSSS, the\nresulting activation maps often narrowly focus on class-specific parts (e.g.,\nonly face of human). On the other hand, recent works based on vision\ntransformers (ViT) have shown promising results based on their self-attention\nmechanism to capture the semantic parts but fail in capturing complete\nclass-specific details (e.g., entire body parts of human but also with a dog\nnearby). In this work, we propose Complementary Branch (CoBra), a novel dual\nbranch framework consisting of two distinct architectures which provide\nvaluable complementary knowledge of class (from CNN) and semantic (from ViT) to\neach branch. In particular, we learn Class-Aware Projection (CAP) for the CNN\nbranch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly\nfuse their complementary knowledge and facilitate a new type of extra\npatch-level supervision. Our model, through CoBra, fuses CNN and ViT's\ncomplementary outputs to create robust pseudo masks that integrate both class\nand semantic information effectively. Extensive experiments qualitatively and\nquantitatively investigate how CNN and ViT complement each other on the PASCAL\nVOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not\nonly the masks generated by our model, but also the segmentation results\nderived from utilizing these masks as pseudo labels.\n","authors":["Woojung Han","Seil Kang","Kyobin Choo","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.08801v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17759v4","updated":"2024-04-12T07:44:25Z","published":"2024-01-31T11:36:12Z","title":"Rapid post-disaster infrastructure damage characterisation enabled by\n remote sensing and deep learning technologies -- a tiered approach","summary":" Critical infrastructure, such as transport networks and bridges, are\nsystematically targeted during wars and suffer damage during extensive natural\ndisasters because it is vital for enabling connectivity and transportation of\npeople and goods, and hence, underpins national and international economic\ngrowth. Mass destruction of transport assets, in conjunction with minimal or no\naccessibility in the wake of natural and anthropogenic disasters, prevents us\nfrom delivering rapid recovery and adaptation. As a result, systemic\noperability is drastically reduced, leading to low levels of resilience. Thus,\nthere is a need for rapid assessment of its condition to allow for informed\ndecision-making for restoration prioritisation. A solution to this challenge is\nto use technology that enables stand-off observations. 
Nevertheless, no methods\nexist for automated characterisation of damage at multiple scales, i.e.\nregional (e.g., network), asset (e.g., bridges), and structural (e.g., road\npavement) scales. We propose a methodology based on an integrated, multi-scale\ntiered approach to fill this capability gap. In doing so, we demonstrate how\nautomated damage characterisation can be enabled by fit-for-purpose digital\ntechnologies. Next, the methodology is applied and validated to a case study in\nUkraine that includes 17 bridges, damaged by human targeted interventions. From\nregional to component scale, we deploy technology to integrate assessments\nusing Sentinel-1 SAR images, crowdsourced information, and high-resolution\nimages for deep learning to facilitate automatic damage detection and\ncharacterisation. For the first time, the interferometric coherence difference\nand semantic segmentation of images were deployed in a tiered multi-scale\napproach to improve the reliability of damage characterisations at different\nscales.\n","authors":["Nadiia Kopiika","Andreas Karavias","Pavlos Krassakis","Zehao Ye","Jelena Ninic","Nataliya Shakhovska","Nikolaos Koukouzas","Sotirios Argyroudis","Stergios-Aristoteles Mitoulis"],"pdf_url":"https://arxiv.org/pdf/2401.17759v4.pdf","comment":"43 pages; 20 figures"},{"id":"http://arxiv.org/abs/2310.12877v4","updated":"2024-04-12T07:43:35Z","published":"2023-10-19T16:32:18Z","title":"Perceptual Assessment and Optimization of High Dynamic Range Image\n Rendering","summary":" High dynamic range (HDR) rendering has the ability to faithfully reproduce\nthe wide luminance ranges in natural scenes, but how to accurately assess the\nrendering quality is relatively underexplored. Existing quality models are\nmostly designed for low dynamic range (LDR) images, and do not align well with\nhuman perception of HDR image quality. To fill this gap, we propose a family of\nHDR quality metrics, in which the key step is employing a simple inverse\ndisplay model to decompose an HDR image into a stack of LDR images with varying\nexposures. Subsequently, these decomposed images are assessed through\nwell-established LDR quality metrics. Our HDR quality models present three\ndistinct benefits. First, they directly inherit the recent advancements of LDR\nquality metrics. Second, they do not rely on human perceptual data of HDR image\nquality for re-calibration. Third, they facilitate the alignment and\nprioritization of specific luminance ranges for more accurate and detailed\nquality assessment. Experimental results show that our HDR quality metrics\nconsistently outperform existing models in terms of quality assessment on four\nHDR image quality datasets and perceptual optimization of HDR novel view\nsynthesis.\n","authors":["Peibei Cao","Rafal K. Mantiuk","Kede Ma"],"pdf_url":"https://arxiv.org/pdf/2310.12877v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08298v1","updated":"2024-04-12T07:41:17Z","published":"2024-04-12T07:41:17Z","title":"Interference Motion Removal for Doppler Radar Vital Sign Detection Using\n Variational Encoder-Decoder Neural Network","summary":" The treatment of interfering motion contributions remains one of the key\nchallenges in the domain of radar-based vital sign monitoring. Removal of the\ninterference to extract the vital sign contributions is demanding due to\noverlapping Doppler bands, the complex structure of the interference motions\nand significant variations in the power levels of their contributions. 
A novel\napproach to the removal of interference through the use of a probabilistic deep\nlearning model is presented. Results show that a convolutional encoder-decoder\nneural network with a variational objective is capable of learning a meaningful\nrepresentation space of vital sign Doppler-time distributions, facilitating their\nextraction from a mixture signal. The approach is tested on semi-experimental\ndata containing real vital sign signatures and simulated returns from\ninterfering body motions. It is demonstrated that the application of the proposed network enhances the\nextraction of the micro-Doppler frequency corresponding to the respiration\nrate.\n","authors":["Mikolaj Czerkawski","Christos Ilioudis","Carmine Clemente","Craig Michie","Ivan Andonovic","Christos Tachtatzis"],"pdf_url":"https://arxiv.org/pdf/2404.08298v1.pdf","comment":"Presented at 2021 IEEE Radar Conference (RadarConf21)"},{"id":"http://arxiv.org/abs/2404.08293v1","updated":"2024-04-12T07:30:52Z","published":"2024-04-12T07:30:52Z","title":"Overcoming Scene Context Constraints for Object Detection in wild using\n Defilters","summary":" This paper focuses on improving object detection performance by addressing\nthe issue of image distortions, commonly encountered in uncontrolled\nacquisition environments. High-level computer vision tasks such as object\ndetection, recognition, and segmentation are particularly sensitive to image\ndistortion. To address this issue, we propose a novel approach employing an\nimage defilter to rectify image distortion prior to object detection. This\nmethod enhances object detection accuracy, as models perform optimally when\ntrained on non-distorted images. Our experiments demonstrate that utilizing\ndefiltered images significantly improves mean average precision compared to\ntraining object detection models on distorted images. Consequently, our\nproposed method offers considerable benefits for real-world applications\nplagued by image distortion. To our knowledge, the contribution lies in\nemploying a distortion-removal paradigm for object detection on images captured\nin natural settings. We achieved improvements of 0.562 and 0.564 in mean\naverage precision on the validation and test data, respectively.\n","authors":["Vamshi Krishna Kancharla","Neelam sinha"],"pdf_url":"https://arxiv.org/pdf/2404.08293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08292v1","updated":"2024-04-12T07:30:24Z","published":"2024-04-12T07:30:24Z","title":"AdaContour: Adaptive Contour Descriptor with Hierarchical Representation","summary":" Existing angle-based contour descriptors suffer from lossy representation for\nnon-starconvex shapes. By and large, this is the result of the shape being\nregistered with a single global inner center and a set of radii corresponding\nto a polar coordinate parameterization. In this paper, we propose AdaContour,\nan adaptive contour descriptor that uses multiple local representations to\ndesirably characterize complex shapes. After hierarchically encoding object\nshapes in a training set and constructing a contour matrix of all subdivided\nregions, we compute a robust low-rank subspace and approximate each\nlocal contour by linearly combining the shared basis vectors to represent an\nobject. Experiments show that AdaContour is able to represent shapes more\naccurately and robustly than other descriptors while retaining effectiveness.\nWe validate AdaContour by integrating it into off-the-shelf detectors to enable\ninstance segmentation, which demonstrates faithful performance. 
The code is\navailable at https://github.com/tding1/AdaContour.\n","authors":["Tianyu Ding","Jinxin Zhou","Tianyi Chen","Zhihui Zhu","Ilya Zharkov","Luming Liang"],"pdf_url":"https://arxiv.org/pdf/2404.08292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08291v1","updated":"2024-04-12T07:30:08Z","published":"2024-04-12T07:30:08Z","title":"On Input Formats for Radar Micro-Doppler Signature Processing by\n Convolutional Neural Networks","summary":" Convolutional neural networks have often been proposed for processing radar\nMicro-Doppler signatures, most commonly with the goal of classifying the\nsignals. The majority of works tend to disregard phase information from the\ncomplex time-frequency representation. Here, the utility of the phase\ninformation, as well as the optimal format of the Doppler-time input for a\nconvolutional neural network, is analysed. It is found that the performance\nachieved by convolutional neural network classifiers is heavily influenced by\nthe type of input representation, even across formats with equivalent\ninformation. Furthermore, it is demonstrated that the phase component of the\nDoppler-time representation contains rich information useful for classification\nand that unwrapping the phase in the temporal dimension can improve the results\ncompared to a magnitude-only solution, improving accuracy from 0.920 to 0.938\non the tested human activity dataset. A further improvement to 0.947 is achieved\nby training a linear classifier on embeddings from multiple formats.\n","authors":["Mikolaj Czerkawski","Carmine Clemente","Craig Michie","Christos Tachtatzis"],"pdf_url":"https://arxiv.org/pdf/2404.08291v1.pdf","comment":"Presented at International Conference on Radar Systems (RADAR 2022)"},{"id":"http://arxiv.org/abs/2404.08285v1","updated":"2024-04-12T07:19:16Z","published":"2024-04-12T07:19:16Z","title":"A Survey of Neural Network Robustness Assessment in Image Recognition","summary":" In recent years, there has been significant attention given to the robustness\nassessment of neural networks. Robustness plays a critical role in ensuring\nreliable operation of artificial intelligence (AI) systems in complex and\nuncertain environments. Deep learning's robustness problem is particularly\nsignificant, highlighted by the discovery of adversarial attacks on image\nclassification models. Researchers have dedicated efforts to evaluate\nrobustness in diverse perturbation conditions for image recognition tasks.\nRobustness assessment encompasses two main techniques: robustness verification/\ncertification for deliberate adversarial attacks and robustness testing for\nrandom data corruptions. In this survey, we present a detailed examination of\nboth adversarial robustness (AR) and corruption robustness (CR) in neural\nnetwork assessment. Analyzing current research papers and standards, we provide\nan extensive overview of robustness assessment in image recognition. Three\nessential aspects are analyzed: concepts, metrics, and assessment methods. We\ninvestigate the perturbation metrics and range representations used to measure\nthe degree of perturbations on images, as well as the robustness metrics\nspecifically for the robustness conditions of classification models. 
The\nstrengths and limitations of the existing methods are also discussed, and some\npotential directions for future research are provided.\n","authors":["Jie Wang","Jun Ai","Minyan Lu","Haoran Su","Dan Yu","Yutao Zhang","Junda Zhu","Jingyu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.08285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08281v1","updated":"2024-04-12T07:13:32Z","published":"2024-04-12T07:13:32Z","title":"Calibration & Reconstruction: Deep Integrated Language for Referring\n Image Segmentation","summary":" Referring image segmentation aims to segment an object referred to by a natural\nlanguage expression from an image. The primary challenge lies in the efficient\npropagation of fine-grained semantic information from textual features to\nvisual features. Many recent works utilize a Transformer to address this\nchallenge. However, conventional transformer decoders can distort linguistic\ninformation with deeper layers, leading to suboptimal results. In this paper,\nwe introduce CRFormer, a model that iteratively calibrates multi-modal features\nin the transformer decoder. We start by generating language queries using\nvision features, emphasizing different aspects of the input language. Then, we\npropose a novel Calibration Decoder (CDec) wherein the multi-modal features can be\niteratively calibrated by the input language features. In the Calibration\nDecoder, we use the output of each decoder layer and the original language\nfeatures to generate new queries for continuous calibration, which gradually\nupdates the language features. Based on CDec, we introduce a Language\nReconstruction Module and a reconstruction loss. This module leverages queries\nfrom the final layer of the decoder to reconstruct the input language and\ncompute the reconstruction loss. This can further prevent the language\ninformation from being lost or distorted. Our experiments consistently show the\nsuperior performance of our approach across RefCOCO, RefCOCO+, and G-Ref\ndatasets compared to state-of-the-art methods.\n","authors":["Yichen Yan","Xingjian He","Sihan Chen","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2404.08281v1.pdf","comment":"9 pages, 8 figures ICMR2024. arXiv admin note: text overlap with\n arXiv:2305.14969"},{"id":"http://arxiv.org/abs/2404.08279v1","updated":"2024-04-12T07:08:05Z","published":"2024-04-12T07:08:05Z","title":"Convolutional neural network classification of cancer cytopathology\n images: taking breast cancer as an example","summary":" Breast cancer is a relatively common cancer among gynecological cancers. Its\ndiagnosis often relies on the pathology of cells in the lesion. The\npathological diagnosis of breast cancer not only requires professionals and\ntime, but also sometimes involves subjective judgment. To address the\nchallenges of dependence on pathologists' expertise and the time-consuming\nnature of achieving accurate breast pathological image classification, this\npaper introduces an approach utilizing convolutional neural networks (CNNs) for\nthe rapid categorization of pathological images, aiming to enhance the\nefficiency of breast pathological image detection. The approach enables the\nrapid and automatic classification of pathological images into benign and\nmalignant groups. The methodology involves utilizing a convolutional neural\nnetwork (CNN) model leveraging the Inceptionv3 architecture and a transfer\nlearning algorithm for extracting features from pathological images. 
A neural network with fully connected layers and the SoftMax function is then\nemployed for image classification. Additionally, the concept of image partitioning is\nintroduced to handle high-resolution images. To achieve the ultimate\nclassification outcome, the classification probabilities of each image block\nare aggregated using three algorithms: summation, product, and maximum.\nExperimental validation was conducted on the BreaKHis public dataset, resulting\nin accuracy rates surpassing 0.92 across all four magnification coefficients\n(40X, 100X, 200X, and 400X). This demonstrates that the proposed method\neffectively enhances the accuracy in classifying pathological images of breast\ncancer.\n","authors":["MingXuan Xiao","Yufeng Li","Xu Yan","Min Gao","Weimin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.08279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02492v3","updated":"2024-04-12T07:06:52Z","published":"2023-10-03T23:44:35Z","title":"FairVision: Equitable Deep Learning for Eye Disease Screening via Fair\n Identity Scaling","summary":" Equity in AI for healthcare is crucial due to its direct impact on human\nwell-being. Despite advancements in 2D medical imaging fairness, the fairness\nof 3D models remains underexplored, hindered by the small sizes of 3D fairness\ndatasets. Since 3D imaging surpasses 2D imaging in SOTA clinical care, it is\ncritical to understand the fairness of these 3D models. To address this\nresearch gap, we conduct the first comprehensive study on the fairness of 3D\nmedical imaging models across multiple protected attributes. Our investigation\nspans both 2D and 3D models and evaluates fairness across five architectures on\nthree common eye diseases, revealing significant biases across race, gender,\nand ethnicity. To alleviate these biases, we propose a novel fair identity\nscaling (FIS) method that improves both overall performance and fairness,\noutperforming various SOTA fairness methods. Moreover, we release\nHarvard-FairVision, the first large-scale medical fairness dataset with 30,000\nsubjects featuring both 2D and 3D imaging data and six demographic identity\nattributes. Harvard-FairVision provides labels for three major eye disorders\naffecting about 380 million people worldwide, serving as a valuable resource\nfor both 2D and 3D fairness learning. Our code and dataset are publicly\naccessible at\n\\url{https://ophai.hms.harvard.edu/datasets/harvard-fairvision30k}.\n","authors":["Yan Luo","Muhammad Osama Khan","Yu Tian","Min Shi","Zehao Dou","Tobias Elze","Yi Fang","Mengyu Wang"],"pdf_url":"https://arxiv.org/pdf/2310.02492v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08277v1","updated":"2024-04-12T07:04:56Z","published":"2024-04-12T07:04:56Z","title":"FaceFilterSense: A Filter-Resistant Face Recognition and Facial\n Attribute Analysis Framework","summary":" With the advent of social media, fun selfie filters have come into tremendous\nmainstream use, affecting the functioning of facial biometric systems as well as\nimage recognition systems. These filters vary from beautification filters and\nAugmented Reality (AR)-based filters to filters that modify facial landmarks.\nHence, there is a need to assess the impact of such filters on the performance\nof existing face recognition systems. 
The limitation associated with existing\nsolutions is that these solutions focus more on the beautification filters.\nHowever, the current AR-based filters and filters which distort facial key\npoints are in vogue recently and make the faces highly unrecognizable even to\nthe naked eye. Also, the filters considered are mostly obsolete with limited\nvariations. To mitigate these limitations, we aim to perform a holistic impact\nanalysis of the latest filters and propose an user recognition model with the\nfiltered images. We have utilized a benchmark dataset for baseline images, and\napplied the latest filters over them to generate a beautified/filtered dataset.\nNext, we have introduced a model FaceFilterNet for beautified user recognition.\nIn this framework, we also utilize our model to comment on various attributes\nof the person including age, gender, and ethnicity. In addition, we have also\npresented a filter-wise impact analysis on face recognition, age estimation,\ngender, and ethnicity prediction. The proposed method affirms the efficacy of\nour dataset with an accuracy of 87.25% and an optimal accuracy for facial\nattribute analysis.\n","authors":["Shubham Tiwari","Yash Sethia","Ritesh Kumar","Ashwani Tanwar","Rudresh Dwivedi"],"pdf_url":"https://arxiv.org/pdf/2404.08277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08273v1","updated":"2024-04-12T06:52:40Z","published":"2024-04-12T06:52:40Z","title":"Struggle with Adversarial Defense? Try Diffusion","summary":" Adversarial attacks induce misclassification by introducing subtle\nperturbations. Recently, diffusion models are applied to the image classifiers\nto improve adversarial robustness through adversarial training or by purifying\nadversarial noise. However, diffusion-based adversarial training often\nencounters convergence challenges and high computational expenses.\nAdditionally, diffusion-based purification inevitably causes data shift and is\ndeemed susceptible to stronger adaptive attacks. To tackle these issues, we\npropose the Truth Maximization Diffusion Classifier (TMDC), a generative\nBayesian classifier that builds upon pre-trained diffusion models and the\nBayesian theorem. Unlike data-driven classifiers, TMDC, guided by Bayesian\nprinciples, utilizes the conditional likelihood from diffusion models to\ndetermine the class probabilities of input images, thereby insulating against\nthe influences of data shift and the limitations of adversarial training.\nMoreover, to enhance TMDC's resilience against more potent adversarial attacks,\nwe propose an optimization strategy for diffusion classifiers. 
This strategy\ninvolves post-training the diffusion model on perturbed datasets with\nground-truth labels as conditions, guiding the diffusion model to learn the\ndata distribution and maximizing the likelihood under the ground-truth labels.\nThe proposed method achieves state-of-the-art performance on the CIFAR10\ndataset against heavy white-box attacks and strong adaptive attacks.\nSpecifically, TMDC achieves robust accuracies of 82.81% against $l_{\\infty}$\nnorm-bounded perturbations and 86.05% against $l_{2}$ norm-bounded\nperturbations, respectively, with $\\epsilon=0.05$.\n","authors":["Yujie Li","Yanbin Wang","Haitao xu","Bin Liu","Jianguo Sun","Zhenhao Guo","Wenrui Ma"],"pdf_url":"https://arxiv.org/pdf/2404.08273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.05516v2","updated":"2024-04-12T06:51:06Z","published":"2022-06-11T12:39:37Z","title":"Deep Learning-Based MR Image Re-parameterization","summary":" Magnetic resonance (MR) image re-parameterization refers to the process of\ngenerating via simulations of an MR image with a new set of MRI scanning\nparameters. Different parameter values generate distinct contrast between\ndifferent tissues, helping identify pathologic tissue. Typically, more than one\nscan is required for diagnosis; however, acquiring repeated scans can be\ncostly, time-consuming, and difficult for patients. Thus, using MR image\nre-parameterization to predict and estimate the contrast in these imaging scans\ncan be an effective alternative. In this work, we propose a novel deep learning\n(DL) based convolutional model for MRI re-parameterization. Based on our\npreliminary results, DL-based techniques hold the potential to learn the\nnon-linearities that govern the re-parameterization.\n","authors":["Abhijeet Narang","Abhigyan Raj","Mihaela Pop","Mehran Ebrahimi"],"pdf_url":"https://arxiv.org/pdf/2206.05516v2.pdf","comment":"A. Narang, A. Raj, M. Pop and M. Ebrahimi, \"Deep Learning-Based MR\n Image Re-parameterization,\" 2023 Congress in Computer Science, Computer\n Engineering, & Applied Computing (CSCE), Las Vegas, NV, USA, 2023, pp.\n 536-541, doi: 10.1109/CSCE60160.2023.00094"},{"id":"http://arxiv.org/abs/2303.03761v2","updated":"2024-04-12T06:42:47Z","published":"2023-03-07T09:56:23Z","title":"Graph Neural Networks in Vision-Language Image Understanding: A Survey","summary":" 2D image understanding is a complex problem within computer vision, but it\nholds the key to providing human-level scene comprehension. It goes further\nthan identifying the objects in an image, and instead, it attempts to\nunderstand the scene. Solutions to this problem form the underpinning of a\nrange of tasks, including image captioning, visual question answering (VQA),\nand image retrieval. Graphs provide a natural way to represent the relational\narrangement between objects in an image, and thus, in recent years graph neural\nnetworks (GNNs) have become a standard component of many 2D image understanding\npipelines, becoming a core architectural component, especially in the VQA group\nof tasks. In this survey, we review this rapidly evolving field and we provide\na taxonomy of graph types used in 2D image understanding approaches, a\ncomprehensive list of the GNN models used in this domain, and a roadmap of\nfuture potential developments. 
To the best of our knowledge, this is the first\ncomprehensive survey that covers image captioning, visual question answering,\nand image retrieval techniques that focus on using GNNs as the main part of\ntheir architecture.\n","authors":["Henry Senior","Gregory Slabaugh","Shanxin Yuan","Luca Rossi"],"pdf_url":"https://arxiv.org/pdf/2303.03761v2.pdf","comment":"20 pages, 5 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.08264v1","updated":"2024-04-12T06:23:48Z","published":"2024-04-12T06:23:48Z","title":"Guided Masked Self-Distillation Modeling for Distributed Multimedia\n Sensor Event Analysis","summary":" Observations with distributed sensors are essential in analyzing a series of\nhuman and machine activities (referred to as 'events' in this paper) in complex\nand extensive real-world environments. This is because the information obtained\nfrom a single sensor is often missing or fragmented in such an environment;\nobservations from multiple locations and modalities should be integrated to\nanalyze events comprehensively. However, a learning method has yet to be\nestablished to extract joint representations that effectively combine such\ndistributed observations. Therefore, we propose Guided Masked sELf-Distillation\nmodeling (Guided-MELD) for inter-sensor relationship modeling. The basic idea\nof Guided-MELD is to learn to supplement the information from the masked sensor\nwith information from other sensors needed to detect the event. Guided-MELD is\nexpected to enable the system to effectively distill the fragmented or\nredundant target event information obtained by the sensors without being overly\ndependent on any specific sensors. To validate the effectiveness of the\nproposed method in novel tasks of distributed multimedia sensor event analysis,\nwe recorded two new datasets that fit the problem setting: MM-Store and\nMM-Office. These datasets consist of human activities in a convenience store\nand an office, recorded using distributed cameras and microphones. Experimental\nresults on these datasets show that the proposed Guided-MELD improves event\ntagging and detection performance and outperforms conventional inter-sensor\nrelationship modeling methods. Furthermore, the proposed method performed\nrobustly even when sensors were reduced.\n","authors":["Masahiro Yasuda","Noboru Harada","Yasunori Ohishi","Shoichiro Saito","Akira Nakayama","Nobutaka Ono"],"pdf_url":"https://arxiv.org/pdf/2404.08264v1.pdf","comment":"13page, 7figure, under review"},{"id":"http://arxiv.org/abs/2312.16837v3","updated":"2024-04-12T06:23:45Z","published":"2023-12-28T05:46:26Z","title":"DiffusionGAN3D: Boosting Text-guided 3D Generation and Domain Adaptation\n by Combining 3D GANs and Diffusion Priors","summary":" Text-guided domain adaptation and generation of 3D-aware portraits find many\napplications in various fields. However, due to the lack of training data and\nthe challenges in handling the high variety of geometry and appearance, the\nexisting methods for these tasks suffer from issues like inflexibility,\ninstability, and low fidelity. In this paper, we propose a novel framework\nDiffusionGAN3D, which boosts text-guided 3D domain adaptation and generation by\ncombining 3D GANs and diffusion priors. Specifically, we integrate the\npre-trained 3D generative models (e.g., EG3D) and text-to-image diffusion\nmodels. The former provides a strong foundation for stable and high-quality\navatar generation from text. 
And the diffusion models in turn offer powerful\npriors and guide the 3D generator finetuning with informative direction to\nachieve flexible and efficient text-guided domain adaptation. To enhance the\ndiversity in domain adaptation and the generation capability in text-to-avatar,\nwe introduce the relative distance loss and case-specific learnable triplane\nrespectively. Besides, we design a progressive texture refinement module to\nimprove the texture quality for both tasks above. Extensive experiments\ndemonstrate that the proposed framework achieves excellent results in both\ndomain adaptation and text-to-avatar tasks, outperforming existing methods in\nterms of generation quality and efficiency. The project homepage is at\nhttps://younglbw.github.io/DiffusionGAN3D-homepage/.\n","authors":["Biwen Lei","Kai Yu","Mengyang Feng","Miaomiao Cui","Xuansong Xie"],"pdf_url":"https://arxiv.org/pdf/2312.16837v3.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2404.05268v2","updated":"2024-04-12T06:20:49Z","published":"2024-04-08T07:59:04Z","title":"MC$^2$: Multi-concept Guidance for Customized Multi-concept Generation","summary":" Customized text-to-image generation aims to synthesize instantiations of\nuser-specified concepts and has achieved unprecedented progress in handling\nindividual concept. However, when extending to multiple customized concepts,\nexisting methods exhibit limitations in terms of flexibility and fidelity, only\naccommodating the combination of limited types of models and potentially\nresulting in a mix of characteristics from different concepts. In this paper,\nwe introduce the Multi-concept guidance for Multi-concept customization, termed\nMC$^2$, for improved flexibility and fidelity. MC$^2$ decouples the\nrequirements for model architecture via inference time optimization, allowing\nthe integration of various heterogeneous single-concept customized models. It\nadaptively refines the attention weights between visual and textual tokens,\ndirecting image regions to focus on their associated words while diminishing\nthe impact of irrelevant ones. Extensive experiments demonstrate that MC$^2$\neven surpasses previous methods that require additional training in terms of\nconsistency with input prompt and reference images. Moreover, MC$^2$ can be\nextended to elevate the compositional capabilities of text-to-image generation,\nyielding appealing results. Code will be publicly available at\nhttps://github.com/JIANGJiaXiu/MC-2.\n","authors":["Jiaxiu Jiang","Yabo Zhang","Kailai Feng","Xiaohe Wu","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.05268v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08255v1","updated":"2024-04-12T06:09:24Z","published":"2024-04-12T06:09:24Z","title":"Practical Region-level Attack against Segment Anything Models","summary":" Segment Anything Models (SAM) have made significant advancements in image\nsegmentation, allowing users to segment target portions of an image with a\nsingle click (i.e., user prompt). Given its broad applications, the robustness\nof SAM against adversarial attacks is a critical concern. While recent works\nhave explored adversarial attacks against a pre-defined prompt/click, their\nthreat model is not yet realistic: (1) they often assume the user-click\nposition is known to the attacker (point-based attack), and (2) they often\noperate under a white-box setting with limited transferability. 
In this paper,\nwe propose a more practical region-level attack where attackers do not need to\nknow the precise user prompt. The attack remains effective as the user clicks\non any point on the target object in the image, hiding the object from SAM.\nAlso, by adapting a spectrum transformation method, we make the attack more\ntransferable under a black-box setting. Both control experiments and testing\nagainst real-world SAM services confirm its effectiveness.\n","authors":["Yifan Shen","Zhengyuan Li","Gang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.08255v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08252v1","updated":"2024-04-12T05:43:10Z","published":"2024-04-12T05:43:10Z","title":"MonoPatchNeRF: Improving Neural Radiance Fields with Patch-based\n Monocular Guidance","summary":" The latest regularized Neural Radiance Field (NeRF) approaches produce poor\ngeometry and view extrapolation for multiview stereo (MVS) benchmarks such as\nETH3D. In this paper, we aim to create 3D models that provide accurate geometry\nand view synthesis, partially closing the large geometric performance gap\nbetween NeRF and traditional MVS methods. We propose a patch-based approach\nthat effectively leverages monocular surface normal and relative depth\npredictions. The patch-based ray sampling also enables the appearance\nregularization of normalized cross-correlation (NCC) and structural similarity\n(SSIM) between randomly sampled virtual and training views. We further show\nthat \"density restrictions\" based on sparse structure-from-motion points can\nhelp greatly improve geometric accuracy with a slight drop in novel view\nsynthesis metrics. Our experiments show 4x the performance of RegNeRF and 8x\nthat of FreeNeRF on average F1@2cm for ETH3D MVS benchmark, suggesting a\nfruitful research direction to improve the geometric accuracy of NeRF-based\nmodels, and sheds light on a potential future approach to enable NeRF-based\noptimization to eventually outperform traditional MVS.\n","authors":["Yuqun Wu","Jae Yong Lee","Chuhang Zou","Shenlong Wang","Derek Hoiem"],"pdf_url":"https://arxiv.org/pdf/2404.08252v1.pdf","comment":"26 pages, 15 figures"},{"id":"http://arxiv.org/abs/2309.08966v2","updated":"2024-04-12T05:34:02Z","published":"2023-09-16T11:42:41Z","title":"FF-LOGO: Cross-Modality Point Cloud Registration with Feature Filtering\n and Local to Global Optimization","summary":" Cross-modality point cloud registration is confronted with significant\nchallenges due to inherent differences in modalities between different sensors.\nWe propose a cross-modality point cloud registration framework FF-LOGO: a\ncross-modality point cloud registration method with feature filtering and\nlocal-global optimization. The cross-modality feature correlation filtering\nmodule extracts geometric transformation-invariant features from cross-modality\npoint clouds and achieves point selection by feature matching. We also\nintroduce a cross-modality optimization process, including a local adaptive key\nregion aggregation module and a global modality consistency fusion optimization\nmodule. Experimental results demonstrate that our two-stage optimization\nsignificantly improves the registration accuracy of the feature association and\nselection module. Our method achieves a substantial increase in recall rate\ncompared to the current state-of-the-art methods on the 3DCSR dataset,\nimproving from 40.59% to 75.74%. 
Our code will be available at\nhttps://github.com/wangmohan17/FFLOGO.\n","authors":["Nan Ma","Mohan Wang","Yiheng Han","Yong-Jin Liu"],"pdf_url":"https://arxiv.org/pdf/2309.08966v2.pdf","comment":"Accepted by 2024 IEEE International Conference on Robotics and\n Automation (ICRA),7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2308.15070v3","updated":"2024-04-12T05:26:59Z","published":"2023-08-29T07:11:52Z","title":"DiffBIR: Towards Blind Image Restoration with Generative Diffusion Prior","summary":" We present DiffBIR, a general restoration pipeline that could handle\ndifferent blind image restoration tasks in a unified framework. DiffBIR\ndecouples blind image restoration problem into two stages: 1) degradation\nremoval: removing image-independent content; 2) information regeneration:\ngenerating the lost image content. Each stage is developed independently but\nthey work seamlessly in a cascaded manner. In the first stage, we use\nrestoration modules to remove degradations and obtain high-fidelity restored\nresults. For the second stage, we propose IRControlNet that leverages the\ngenerative ability of latent diffusion models to generate realistic details.\nSpecifically, IRControlNet is trained based on specially produced condition\nimages without distracting noisy content for stable generation performance.\nMoreover, we design a region-adaptive restoration guidance that can modify the\ndenoising process during inference without model re-training, allowing users to\nbalance realness and fidelity through a tunable guidance scale. Extensive\nexperiments have demonstrated DiffBIR's superiority over state-of-the-art\napproaches for blind image super-resolution, blind face restoration and blind\nimage denoising tasks on both synthetic and real-world datasets. The code is\navailable at https://github.com/XPixelGroup/DiffBIR.\n","authors":["Xinqi Lin","Jingwen He","Ziyan Chen","Zhaoyang Lyu","Bo Dai","Fanghua Yu","Wanli Ouyang","Yu Qiao","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2308.15070v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10356v3","updated":"2024-04-12T05:07:28Z","published":"2023-09-19T06:32:19Z","title":"RoadFormer: Duplex Transformer for RGB-Normal Semantic Road Scene\n Parsing","summary":" The recent advancements in deep convolutional neural networks have shown\nsignificant promise in the domain of road scene parsing. Nevertheless, the\nexisting works focus primarily on freespace detection, with little attention\ngiven to hazardous road defects that could compromise both driving safety and\ncomfort. In this paper, we introduce RoadFormer, a novel Transformer-based\ndata-fusion network developed for road scene parsing. RoadFormer utilizes a\nduplex encoder architecture to extract heterogeneous features from both RGB\nimages and surface normal information. The encoded features are subsequently\nfed into a novel heterogeneous feature synergy block for effective feature\nfusion and recalibration. The pixel decoder then learns multi-scale long-range\ndependencies from the fused and recalibrated heterogeneous features, which are\nsubsequently processed by a Transformer decoder to produce the final semantic\nprediction. Additionally, we release SYN-UDTIRI, the first large-scale road\nscene parsing dataset that contains over 10,407 RGB images, dense depth images,\nand the corresponding pixel-level annotations for both freespace and road\ndefects of different shapes and sizes. 
Extensive experimental evaluations\nconducted on our SYN-UDTIRI dataset, as well as on three public datasets,\nincluding KITTI road, CityScapes, and ORFD, demonstrate that RoadFormer\noutperforms all other state-of-the-art networks for road scene parsing.\nSpecifically, RoadFormer ranks first on the KITTI road benchmark. Our source\ncode, created dataset, and demo video are publicly available at\nmias.group/RoadFormer.\n","authors":["Jiahang Li","Yikang Zhang","Peng Yun","Guangliang Zhou","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2309.10356v3.pdf","comment":"9 pages 7 figures. Accepted by Transactions on Intelligent Vehicles"},{"id":"http://arxiv.org/abs/2403.14047v2","updated":"2024-04-12T05:07:27Z","published":"2024-03-21T00:09:04Z","title":"Accelerating ViT Inference on FPGA through Static and Dynamic Pruning","summary":" Vision Transformers (ViTs) have achieved state-of-the-art accuracy on various\ncomputer vision tasks. However, their high computational complexity prevents\nthem from being applied to many real-world applications. Weight and token\npruning are two well-known methods for reducing complexity: weight pruning\nreduces the model size and associated computational demands, while token\npruning further dynamically reduces the computation based on the input.\nCombining these two techniques should significantly reduce computation\ncomplexity and model size; however, naively integrating them results in\nirregular computation patterns, leading to significant accuracy drops and\ndifficulties in hardware acceleration.\n Addressing the above challenges, we propose a comprehensive\nalgorithm-hardware codesign for accelerating ViT on FPGA through simultaneous\npruning -combining static weight pruning and dynamic token pruning. For\nalgorithm design, we systematically combine a hardware-aware structured\nblock-pruning method for pruning model parameters and a dynamic token pruning\nmethod for removing unimportant token vectors. Moreover, we design a novel\ntraining algorithm to recover the model's accuracy. For hardware design, we\ndevelop a novel hardware accelerator for executing the pruned model. The\nproposed hardware design employs multi-level parallelism with load balancing\nstrategy to efficiently deal with the irregular computation pattern led by the\ntwo pruning approaches. Moreover, we develop an efficient hardware mechanism\nfor efficiently executing the on-the-fly token pruning.\n","authors":["Dhruv Parikh","Shouyi Li","Bingyi Zhang","Rajgopal Kannan","Carl Busart","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/2403.14047v2.pdf","comment":"FCCM 2024"},{"id":"http://arxiv.org/abs/2208.07463v4","updated":"2024-04-12T04:48:48Z","published":"2022-08-15T22:51:23Z","title":"Conv-Adapter: Exploring Parameter Efficient Transfer Learning for\n ConvNets","summary":" While parameter efficient tuning (PET) methods have shown great potential\nwith transformer architecture on Natural Language Processing (NLP) tasks, their\neffectiveness with large-scale ConvNets is still under-studied on Computer\nVision (CV) tasks. This paper proposes Conv-Adapter, a PET module designed for\nConvNets. Conv-Adapter is light-weight, domain-transferable, and\narchitecture-agnostic with generalized performance on different tasks. When\ntransferring on downstream tasks, Conv-Adapter learns tasks-specific feature\nmodulation to the intermediate representations of backbones while keeping the\npre-trained parameters frozen. 
By introducing only a tiny amount of learnable\nparameters, e.g., only 3.5% full fine-tuning parameters of ResNet50. It can\nalso be applied for transformer-based backbones. Conv-Adapter outperforms\nprevious PET baseline methods and achieves comparable or surpasses the\nperformance of full fine-tuning on 23 classification tasks of various domains.\nIt also presents superior performance on the few-shot classification with an\naverage margin of 3.39%. Beyond classification, Conv-Adapter can generalize to\ndetection and segmentation tasks with more than 50% reduction of parameters but\ncomparable performance to the traditional full fine-tuning.\n","authors":["Hao Chen","Ran Tao","Han Zhang","Yidong Wang","Xiang Li","Wei Ye","Jindong Wang","Guosheng Hu","Marios Savvides"],"pdf_url":"https://arxiv.org/pdf/2208.07463v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08238v1","updated":"2024-04-12T04:45:51Z","published":"2024-04-12T04:45:51Z","title":"Simulation of a Vision Correction Display System","summary":" Eyes serve as our primary sensory organs, responsible for processing up to\n80\\% of our sensory input. However, common visual aberrations like myopia and\nhyperopia affect a significant portion of the global population. This paper\nfocuses on simulating a Vision Correction Display (VCD) to enhance the visual\nexperience of individuals with various visual impairments. Utilising Blender,\nwe digitally model the functionality of a VCD in correcting refractive errors\nsuch as myopia and hyperopia. With these simulations we can see potential\nimprovements in visual acuity and comfort. These simulations provide valuable\ninsights for the design and development of future VCD technologies, ultimately\nadvancing accessibility and usability for individuals with visual challenges.\n","authors":["Vidya Sunil","Renu M Rameshan"],"pdf_url":"https://arxiv.org/pdf/2404.08238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08237v1","updated":"2024-04-12T04:44:11Z","published":"2024-04-12T04:44:11Z","title":"IFViT: Interpretable Fixed-Length Representation for Fingerprint\n Matching via Vision Transformer","summary":" Determining dense feature points on fingerprints used in constructing deep\nfixed-length representations for accurate matching, particularly at the pixel\nlevel, is of significant interest. To explore the interpretability of\nfingerprint matching, we propose a multi-stage interpretable fingerprint\nmatching network, namely Interpretable Fixed-length Representation for\nFingerprint Matching via Vision Transformer (IFViT), which consists of two\nprimary modules. The first module, an interpretable dense registration module,\nestablishes a Vision Transformer (ViT)-based Siamese Network to capture\nlong-range dependencies and the global context in fingerprint pairs. It\nprovides interpretable dense pixel-wise correspondences of feature points for\nfingerprint alignment and enhances the interpretability in the subsequent\nmatching stage. The second module takes into account both local and global\nrepresentations of the aligned fingerprint pair to achieve an interpretable\nfixed-length representation extraction and matching. It employs the ViTs\ntrained in the first module with the additional fully connected layer and\nretrains them to simultaneously produce the discriminative fixed-length\nrepresentation and interpretable dense pixel-wise correspondences of feature\npoints. 
Extensive experimental results on diverse publicly available\nfingerprint databases demonstrate that the proposed framework not only exhibits\nsuperior performance on dense registration and matching but also significantly\npromotes the interpretability in deep fixed-length representations-based\nfingerprint matching.\n","authors":["Yuhang Qiu","Honghui Chen","Xingbo Dong","Zheng Lin","Iman Yi Liao","Massimo Tistarelli","Zhe Jin"],"pdf_url":"https://arxiv.org/pdf/2404.08237v1.pdf","comment":"ready to submit to IEEE Transactions on Information Forensics and\n Security (TIFS)"},{"id":"http://arxiv.org/abs/2302.06874v2","updated":"2024-04-12T04:42:29Z","published":"2023-02-14T07:39:37Z","title":"Robust Representation Learning with Self-Distillation for Domain\n Generalization","summary":" Despite the recent success of deep neural networks, there remains a need for\neffective methods to enhance domain generalization using vision transformers.\nIn this paper, we propose a novel domain generalization technique called Robust\nRepresentation Learning with Self-Distillation (RRLD) comprising i)\nintermediate-block self-distillation and ii) augmentation-guided\nself-distillation to improve the generalization capabilities of\ntransformer-based models on unseen domains. This approach enables the network\nto learn robust and general features that are invariant to different\naugmentations and domain shifts while effectively mitigating overfitting to\nsource domains. To evaluate the effectiveness of our proposed method, we\nperform extensive experiments on PACS and OfficeHome benchmark datasets, as\nwell as an industrial wafer semiconductor defect dataset. The results\ndemonstrate that RRLD achieves robust and accurate generalization performance.\nWe observe an average accuracy improvement in the range of 1.2% to 2.3% over\nthe state-of-the-art on the three datasets.\n","authors":["Ankur Singh","Senthilnath Jayavelu"],"pdf_url":"https://arxiv.org/pdf/2302.06874v2.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2404.05960v2","updated":"2024-04-12T04:23:12Z","published":"2024-04-09T02:47:52Z","title":"EasyTrack: Efficient and Compact One-stream 3D Point Clouds Tracker","summary":" Most of 3D single object trackers (SOT) in point clouds follow the two-stream\nmulti-stage 3D Siamese or motion tracking paradigms, which process the template\nand search area point clouds with two parallel branches, built on supervised\npoint cloud backbones. In this work, beyond typical 3D Siamese or motion\ntracking, we propose a neat and compact one-stream transformer 3D SOT paradigm\nfrom the novel perspective, termed as \\textbf{EasyTrack}, which consists of\nthree special designs: 1) A 3D point clouds tracking feature pre-training\nmodule is developed to exploit the masked autoencoding for learning 3D point\nclouds tracking representations. 2) A unified 3D tracking feature learning and\nfusion network is proposed to simultaneously learns target-aware 3D features,\nand extensively captures mutual correlation through the flexible self-attention\nmechanism. 3) A target location network in the dense bird's eye view (BEV)\nfeature space is constructed for target classification and regression.\nMoreover, we develop an enhanced version named EasyTrack++, which designs the\ncenter points interaction (CPI) strategy to reduce the ambiguous targets caused\nby the noise point cloud background information. 
The proposed EasyTrack and\nEasyTrack++ set a new state-of-the-art performance ($\\textbf{18\\%}$,\n$\\textbf{40\\%}$ and $\\textbf{3\\%}$ success gains) in KITTI, NuScenes, and Waymo\nwhile runing at \\textbf{52.6fps} with few parameters (\\textbf{1.3M}). The code\nwill be available at https://github.com/KnightApple427/Easytrack.\n","authors":["Baojie Fan","Wuyang Zhou","Kai Wang","Shijun Zhou","Fengyu Xu","Jiandong Tian"],"pdf_url":"https://arxiv.org/pdf/2404.05960v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08229v1","updated":"2024-04-12T04:08:21Z","published":"2024-04-12T04:08:21Z","title":"Enhancing Traffic Safety with Parallel Dense Video Captioning for\n End-to-End Event Analysis","summary":" This paper introduces our solution for Track 2 in AI City Challenge 2024. The\ntask aims to solve traffic safety description and analysis with the dataset of\nWoven Traffic Safety (WTS), a real-world Pedestrian-Centric Traffic Video\nDataset for Fine-grained Spatial-Temporal Understanding. Our solution mainly\nfocuses on the following points: 1) To solve dense video captioning, we\nleverage the framework of dense video captioning with parallel decoding (PDVC)\nto model visual-language sequences and generate dense caption by chapters for\nvideo. 2) Our work leverages CLIP to extract visual features to more\nefficiently perform cross-modality training between visual and textual\nrepresentations. 3) We conduct domain-specific model adaptation to mitigate\ndomain shift problem that poses recognition challenge in video understanding.\n4) Moreover, we leverage BDD-5K captioned videos to conduct knowledge transfer\nfor better understanding WTS videos and more accurate captioning. Our solution\nhas yielded on the test set, achieving 6th place in the competition. The open\nsource code will be available at https://github.com/UCF-SST-Lab/AICity2024CVPRW\n","authors":["Maged Shoman","Dongdong Wang","Armstrong Aboah","Mohamed Abdel-Aty"],"pdf_url":"https://arxiv.org/pdf/2404.08229v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08226v1","updated":"2024-04-12T03:43:37Z","published":"2024-04-12T03:43:37Z","title":"Improving Continuous Sign Language Recognition with Adapted Image Models","summary":" The increase of web-scale weakly labelled image-text pairs have greatly\nfacilitated the development of large-scale vision-language models (e.g., CLIP),\nwhich have shown impressive generalization performance over a series of\ndownstream tasks. However, the massive model size and scarcity of available\ndata limit their applications to fine-tune the whole model in downstream tasks.\nBesides, fully fine-tuning the model easily forgets the generic essential\nknowledge acquired in the pretraining stage and overfits the downstream data.\nTo enable high efficiency when adapting these large vision-language models\n(e.g., CLIP) to performing continuous sign language recognition (CSLR) while\npreserving their generalizability, we propose a novel strategy (AdaptSign).\nEspecially, CLIP is adopted as the visual backbone to extract frame-wise\nfeatures whose parameters are fixed, and a set of learnable modules are\nintroduced to model spatial sign variations or capture temporal sign movements.\nThe introduced additional modules are quite lightweight, only owning 3.2% extra\ncomputations with high efficiency. The generic knowledge acquired in the\npretraining stage is well-preserved in the frozen CLIP backbone in this\nprocess. 
Extensive experiments show that despite being efficient, AdaptSign is\nable to demonstrate superior performance across a series of CSLR benchmarks\nincluding PHOENIX14, PHOENIX14-T, CSL-Daily and CSL compared to existing\nmethods. Visualizations show that AdaptSign could learn to dynamically pay\nmajor attention to the informative spatial regions and cross-frame trajectories\nin sign videos.\n","authors":["Lianyu Hu","Tongkai Shi","Liqing Gao","Zekang Liu","Wei Feng"],"pdf_url":"https://arxiv.org/pdf/2404.08226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04582v2","updated":"2024-04-12T03:33:31Z","published":"2023-10-06T20:48:43Z","title":"Universal Humanoid Motion Representations for Physics-Based Control","summary":" We present a universal motion representation that encompasses a comprehensive\nrange of motor skills for physics-based humanoid control. Due to the high\ndimensionality of humanoids and the inherent difficulties in reinforcement\nlearning, prior methods have focused on learning skill embeddings for a narrow\nrange of movement styles (e.g. locomotion, game characters) from specialized\nmotion datasets. This limited scope hampers their applicability in complex\ntasks. We close this gap by significantly increasing the coverage of our motion\nrepresentation space. To achieve this, we first learn a motion imitator that\ncan imitate all of human motion from a large, unstructured motion dataset. We\nthen create our motion representation by distilling skills directly from the\nimitator. This is achieved by using an encoder-decoder structure with a\nvariational information bottleneck. Additionally, we jointly learn a prior\nconditioned on proprioception (humanoid's own pose and velocities) to improve\nmodel expressiveness and sampling efficiency for downstream tasks. By sampling\nfrom the prior, we can generate long, stable, and diverse human motions. Using\nthis latent space for hierarchical RL, we show that our policies solve tasks\nusing human-like behavior. We demonstrate the effectiveness of our motion\nrepresentation by solving generative tasks (e.g. strike, terrain traversal) and\nmotion tracking using VR controllers.\n","authors":["Zhengyi Luo","Jinkun Cao","Josh Merel","Alexander Winkler","Jing Huang","Kris Kitani","Weipeng Xu"],"pdf_url":"https://arxiv.org/pdf/2310.04582v2.pdf","comment":"ICLR 2024 Spotlight. Project page:\n https://zhengyiluo.github.io/PULSE/"},{"id":"http://arxiv.org/abs/2403.12416v2","updated":"2024-04-12T03:15:26Z","published":"2024-03-19T03:59:14Z","title":"Eye-gaze Guided Multi-modal Alignment Framework for Radiology","summary":" In multi-modal frameworks, the alignment of cross-modal features presents a\nsignificant challenge. The predominant approach in multi-modal pre-training\nemphasizes either global or local alignment between modalities, utilizing\nextensive datasets. This bottom-up driven method often suffers from a lack of\ninterpretability, a critical concern in radiology. Previous studies have\nintegrated high-level labels in medical images or text, but these still rely on\nmanual annotation, a costly and labor-intensive process. Our work introduces a\nnovel approach by using eye-gaze data, collected synchronously by radiologists\nduring diagnostic evaluations. This data, indicating radiologists' focus areas,\nnaturally links chest X-rays to diagnostic texts. 
We propose the Eye-gaze\nGuided Multi-modal Alignment (EGMA) framework to harness eye-gaze data for\nbetter alignment of image and text features, aiming to reduce reliance on\nmanual annotations and thus cut training costs. Our model demonstrates robust\nperformance, outperforming other state-of-the-art methods in zero-shot\nclassification and retrieval tasks. The incorporation of easily-obtained\neye-gaze data during routine radiological diagnoses signifies a step towards\nminimizing manual annotation dependency. Additionally, we explore the impact of\nvarying amounts of eye-gaze data on model performance, highlighting the\nfeasibility and utility of integrating this auxiliary data into multi-modal\npre-training.\n","authors":["Chong Ma","Hanqi Jiang","Wenting Chen","Zihao Wu","Xiaowei Yu","Fang Zeng","Lei Guo","Dajiang Zhu","Tuo Zhang","Dinggang Shen","Tianming Liu","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2403.12416v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.15036v3","updated":"2024-04-12T03:14:34Z","published":"2023-10-23T15:34:03Z","title":"A Technique for Classifying Static Gestures Using UWB Radar","summary":" Our paper presents a robust framework for UWB-based static gesture\nrecognition, leveraging proprietary UWB radar sensor technology. Extensive data\ncollection efforts were undertaken to compile datasets containing five commonly\nused gestures. Our approach involves a comprehensive data pre-processing\npipeline that encompasses outlier handling, aspect ratio-preserving resizing,\nand false-color image transformation. Both CNN and MobileNet models were\ntrained on the processed images. Remarkably, our best-performing model achieved\nan accuracy of 96.78%. Additionally, we developed a user-friendly GUI framework\nto assess the model's system resource usage and processing times, which\nrevealed low memory utilization and real-time task completion in under one\nsecond. This research marks a significant step towards enhancing static gesture\nrecognition using UWB technology, promising practical applications in various\ndomains.\n","authors":["Abhishek Sebastian","Pragna R"],"pdf_url":"https://arxiv.org/pdf/2310.15036v3.pdf","comment":"This is not a technical research paper, but an excerpt of what was\n applied during a funded project for the promotion of Open Science"},{"id":"http://arxiv.org/abs/2312.17428v2","updated":"2024-04-12T03:06:07Z","published":"2023-12-29T01:42:20Z","title":"ChangeNet: Multi-Temporal Asymmetric Change Detection Dataset","summary":" Change Detection (CD) has been attracting extensive interests with the\navailability of bi-temporal datasets. However, due to the huge cost of\nmulti-temporal images acquisition and labeling, existing change detection\ndatasets are small in quantity, short in temporal, and low in practicability.\nTherefore, a large-scale practical-oriented dataset covering wide temporal\nphases is urgently needed to facilitate the community. To this end, the\nChangeNet dataset is presented especially for multi-temporal change detection,\nalong with the new task of \"Asymmetric Change Detection\". Specifically,\nChangeNet consists of 31,000 multi-temporal images pairs, a wide range of\ncomplex scenes from 100 cities, and 6 pixel-level annotated categories, which\nis far superior to all the existing change detection datasets including\nLEVIR-CD, WHU Building CD, etc.. 
In addition, ChangeNet contains amounts of\nreal-world perspective distortions in different temporal phases on the same\nareas, which is able to promote the practical application of change detection\nalgorithms. The ChangeNet dataset is suitable for both binary change detection\n(BCD) and semantic change detection (SCD) tasks. Accordingly, we benchmark the\nChangeNet dataset on six BCD methods and two SCD methods, and extensive\nexperiments demonstrate its challenges and great significance. The dataset is\navailable at https://github.com/jankyee/ChangeNet.\n","authors":["Deyi Ji","Siqi Gao","Mingyuan Tao","Hongtao Lu","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.17428v2.pdf","comment":"Accepted to ICASSP 2024 Oral/Lecture"},{"id":"http://arxiv.org/abs/2402.09055v2","updated":"2024-04-12T02:51:45Z","published":"2024-02-14T10:05:19Z","title":"Comment-aided Video-Language Alignment via Contrastive Pre-training for\n Short-form Video Humor Detection","summary":" The growing importance of multi-modal humor detection within affective\ncomputing correlates with the expanding influence of short-form video sharing\non social media platforms. In this paper, we propose a novel two-branch\nhierarchical model for short-form video humor detection (SVHD), named\nComment-aided Video-Language Alignment (CVLA) via data-augmented multi-modal\ncontrastive pre-training. Notably, our CVLA not only operates on raw signals\nacross various modal channels but also yields an appropriate multi-modal\nrepresentation by aligning the video and language components within a\nconsistent semantic space. The experimental results on two humor detection\ndatasets, including DY11k and UR-FUNNY, demonstrate that CVLA dramatically\noutperforms state-of-the-art and several competitive baseline approaches. Our\ndataset, code and model release at https://github.com/yliu-cs/CVLA.\n","authors":["Yang Liu","Tongfei Shen","Dong Zhang","Qingying Sun","Shoushan Li","Guodong Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.09055v2.pdf","comment":"Accepted by ICMR 2024"},{"id":"http://arxiv.org/abs/2403.18554v2","updated":"2024-04-12T02:27:09Z","published":"2024-03-27T13:33:14Z","title":"CosalPure: Learning Concept from Group Images for Robust Co-Saliency\n Detection","summary":" Co-salient object detection (CoSOD) aims to identify the common and salient\n(usually in the foreground) regions across a given group of images. Although\nachieving significant progress, state-of-the-art CoSODs could be easily\naffected by some adversarial perturbations, leading to substantial accuracy\nreduction. The adversarial perturbations can mislead CoSODs but do not change\nthe high-level semantic information (e.g., concept) of the co-salient objects.\nIn this paper, we propose a novel robustness enhancement framework by first\nlearning the concept of the co-salient objects based on the input group images\nand then leveraging this concept to purify adversarial perturbations, which are\nsubsequently fed to CoSODs for robustness enhancement. Specifically, we propose\nCosalPure containing two modules, i.e., group-image concept learning and\nconcept-guided diffusion purification. For the first module, we adopt a\npre-trained text-to-image diffusion model to learn the concept of co-salient\nobjects within group images where the learned concept is robust to adversarial\nexamples. 
For the second module, we map the adversarial image to the latent\nspace and then perform diffusion generation by embedding the learned concept\ninto the noise prediction function as an extra condition. Our method can\neffectively alleviate the influence of the SOTA adversarial attack containing\ndifferent adversarial patterns, including exposure and noise. The extensive\nresults demonstrate that our method could enhance the robustness of CoSODs\nsignificantly.\n","authors":["Jiayi Zhu","Qing Guo","Felix Juefei-Xu","Yihao Huang","Yang Liu","Geguang Pu"],"pdf_url":"https://arxiv.org/pdf/2403.18554v2.pdf","comment":"This paper is accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2404.08201v1","updated":"2024-04-12T02:14:35Z","published":"2024-04-12T02:14:35Z","title":"A Mutual Inclusion Mechanism for Precise Boundary Segmentation in\n Medical Images","summary":" In medical imaging, accurate image segmentation is crucial for quantifying\ndiseases, assessing prognosis, and evaluating treatment outcomes. However,\nexisting methods lack an in-depth integration of global and local features,\nfailing to pay special attention to abnormal regions and boundary details in\nmedical images. To this end, we present a novel deep learning-based approach,\nMIPC-Net, for precise boundary segmentation in medical images. Our approach,\ninspired by radiologists' working patterns, features two distinct modules: (i)\n\\textbf{Mutual Inclusion of Position and Channel Attention (MIPC) module}: To\nenhance the precision of boundary segmentation in medical images, we introduce\nthe MIPC module, which enhances the focus on channel information when\nextracting position features and vice versa; (ii) \\textbf{GL-MIPC-Residue}: To\nimprove the restoration of medical images, we propose the GL-MIPC-Residue, a\nglobal residual connection that enhances the integration of the encoder and\ndecoder by filtering out invalid information and restoring the most effective\ninformation lost during the feature extraction process. We evaluate the\nperformance of the proposed model using metrics such as Dice coefficient (DSC)\nand Hausdorff Distance (HD) on three publicly accessible datasets: Synapse,\nISIC2018-Task, and Segpc. Our ablation study shows that each module contributes\nto improving the quality of segmentation results. Furthermore, with the\nassistance of both modules, our approach outperforms state-of-the-art methods\nacross all metrics on the benchmark datasets, notably achieving a 2.23mm\nreduction in HD on the Synapse dataset, strongly evidencing our model's\nenhanced capability for precise image boundary segmentation. Codes will be\navailable at https://github.com/SUN-1024/MIPC-Net.\n","authors":["Yizhi Pan","Junyi Xin","Tianhua Yang","Teeradaj Racharak","Le-Minh Nguyen","Guanqun Sun"],"pdf_url":"https://arxiv.org/pdf/2404.08201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08197v1","updated":"2024-04-12T02:04:34Z","published":"2024-04-12T02:04:34Z","title":"Scaling (Down) CLIP: A Comprehensive Analysis of Data, Architecture, and\n Training Strategies","summary":" This paper investigates the performance of the Contrastive Language-Image\nPre-training (CLIP) when scaled down to limited computation budgets. We explore\nCLIP along three dimensions: data, architecture, and training strategies. With\nregards to data, we demonstrate the significance of high-quality training data\nand show that a smaller dataset of high-quality data can outperform a larger\ndataset with lower quality. 
We also examine how model performance varies with\ndifferent dataset sizes, suggesting that smaller ViT models are better suited\nfor smaller datasets, while larger models perform better on larger datasets\nwith fixed compute. Additionally, we provide guidance on when to choose a\nCNN-based architecture or a ViT-based architecture for CLIP training. We\ncompare four CLIP training strategies - SLIP, FLIP, CLIP, and CLIP+Data\nAugmentation - and show that the choice of training strategy depends on the\navailable compute resource. Our analysis reveals that CLIP+Data Augmentation\ncan achieve comparable performance to CLIP using only half of the training\ndata. This work provides practical insights into how to effectively train and\ndeploy CLIP models, making them more accessible and affordable for practical\nuse in various applications.\n","authors":["Zichao Li","Cihang Xie","Ekin Dogus Cubuk"],"pdf_url":"https://arxiv.org/pdf/2404.08197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08195v1","updated":"2024-04-12T01:54:59Z","published":"2024-04-12T01:54:59Z","title":"Tackling Ambiguity from Perspective of Uncertainty Inference and\n Affinity Diversification for Weakly Supervised Semantic Segmentation","summary":" Weakly supervised semantic segmentation (WSSS) with image-level labels\nintends to achieve dense tasks without laborious annotations. However, due to\nthe ambiguous contexts and fuzzy regions, the performance of WSSS, especially\nthe stages of generating Class Activation Maps (CAMs) and refining pseudo\nmasks, widely suffers from ambiguity while being barely noticed by previous\nliterature. In this work, we propose UniA, a unified single-staged WSSS\nframework, to efficiently tackle this issue from the perspective of uncertainty\ninference and affinity diversification, respectively. When activating class\nobjects, we argue that the false activation stems from the bias to the\nambiguous regions during the feature extraction. Therefore, we design a more\nrobust feature representation with a probabilistic Gaussian distribution and\nintroduce the uncertainty estimation to avoid the bias. A distribution loss is\nparticularly proposed to supervise the process, which effectively captures the\nambiguity and models the complex dependencies among features. When refining\npseudo labels, we observe that the affinity from the prevailing refinement\nmethods intends to be similar among ambiguities. To this end, an affinity\ndiversification module is proposed to promote diversity among semantics. A\nmutual complementing refinement is proposed to initially rectify the ambiguous\naffinity with multiple inferred pseudo labels. More importantly, a contrastive\naffinity loss is further designed to diversify the relations among unrelated\nsemantics, which reliably propagates the diversity into the whole feature\nrepresentations and helps generate better pseudo masks. 
Extensive experiments\nare conducted on PASCAL VOC, MS COCO, and medical ACDC datasets, which validate\nthe efficiency of UniA tackling ambiguity and the superiority over recent\nsingle-staged or even most multi-staged competitors.\n","authors":["Zhiwei Yang","Yucong Meng","Kexue Fu","Shuo Wang","Zhijian Song"],"pdf_url":"https://arxiv.org/pdf/2404.08195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08187v1","updated":"2024-04-12T01:36:00Z","published":"2024-04-12T01:36:00Z","title":"Adapting CNNs for Fisheye Cameras without Retraining","summary":" The majority of image processing approaches assume images are in or can be\nrectified to a perspective projection. However, in many applications it is\nbeneficial to use non conventional cameras, such as fisheye cameras, that have\na larger field of view (FOV). The issue arises that these large-FOV images\ncan't be rectified to a perspective projection without significant cropping of\nthe original image. To address this issue we propose Rectified Convolutions\n(RectConv); a new approach for adapting pre-trained convolutional networks to\noperate with new non-perspective images, without any retraining. Replacing the\nconvolutional layers of the network with RectConv layers allows the network to\nsee both rectified patches and the entire FOV. We demonstrate RectConv adapting\nmultiple pre-trained networks to perform segmentation and detection on fisheye\nimagery from two publicly available datasets. Our approach requires no\nadditional data or training, and operates directly on the native image as\ncaptured from the camera. We believe this work is a step toward adapting the\nvast resources available for perspective images to operate across a broad range\nof camera geometries.\n","authors":["Ryan Griffiths","Donald G. Dansereau"],"pdf_url":"https://arxiv.org/pdf/2404.08187v1.pdf","comment":"Project page: https://roboticimaging.org/Projects/RectConv/"},{"id":"http://arxiv.org/abs/2404.08184v1","updated":"2024-04-12T01:13:23Z","published":"2024-04-12T01:13:23Z","title":"Measuring Domain Shifts using Deep Learning Remote Photoplethysmography\n Model Similarity","summary":" Domain shift differences between training data for deep learning models and\nthe deployment context can result in severe performance issues for models which\nfail to generalize. We study the domain shift problem under the context of\nremote photoplethysmography (rPPG), a technique for video-based heart rate\ninference. We propose metrics based on model similarity which may be used as a\nmeasure of domain shift, and we demonstrate high correlation between these\nmetrics and empirical performance. One of the proposed metrics with viable\ncorrelations, DS-diff, does not assume access to the ground truth of the target\ndomain, i.e. it may be applied to in-the-wild data. To that end, we investigate\na model selection problem in which ground truth results for the evaluation\ndomain is not known, demonstrating a 13.9% performance improvement over the\naverage case baseline.\n","authors":["Nathan Vance","Patrick Flynn"],"pdf_url":"https://arxiv.org/pdf/2404.08184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08181v1","updated":"2024-04-12T01:08:04Z","published":"2024-04-12T01:08:04Z","title":"Pay Attention to Your Neighbours: Training-Free Open-Vocabulary Semantic\n Segmentation","summary":" Despite the significant progress in deep learning for dense visual\nrecognition problems, such as semantic segmentation, traditional methods are\nconstrained by fixed class sets. 
Meanwhile, vision-language foundation models,\nsuch as CLIP, have showcased remarkable effectiveness in numerous zero-shot\nimage-level tasks, owing to their robust generalizability. Recently, a body of\nwork has investigated utilizing these models in open-vocabulary semantic\nsegmentation (OVSS). However, existing approaches often rely on impractical\nsupervised pre-training or access to additional pre-trained networks. In this\nwork, we propose a strong baseline for training-free OVSS, termed\nNeighbour-Aware CLIP (NACLIP), representing a straightforward adaptation of\nCLIP tailored for this scenario. Our method enforces localization of patches in\nthe self-attention of CLIP's vision transformer which, despite being crucial\nfor dense prediction tasks, has been overlooked in the OVSS literature. By\nincorporating design choices favouring segmentation, our approach significantly\nimproves performance without requiring additional data, auxiliary pre-trained\nnetworks, or extensive hyperparameter tuning, making it highly practical for\nreal-world applications. Experiments are performed on 8 popular semantic\nsegmentation benchmarks, yielding state-of-the-art performance on most\nscenarios. Our code is publicly available at https://github.com/sinahmr/NACLIP .\n","authors":["Sina Hajimiri","Ismail Ben Ayed","Jose Dolz"],"pdf_url":"https://arxiv.org/pdf/2404.08181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03584v2","updated":"2024-04-12T00:52:35Z","published":"2023-06-06T11:03:05Z","title":"RDFC-GAN: RGB-Depth Fusion CycleGAN for Indoor Depth Completion","summary":" Raw depth images captured in indoor scenarios frequently exhibit extensive\nmissing values due to the inherent limitations of the sensors and environments.\nFor example, transparent materials frequently elude detection by depth sensors;\nsurfaces may introduce measurement inaccuracies due to their polished textures,\nextended distances, and oblique incidence angles from the sensor. The presence\nof incomplete depth maps imposes significant challenges for subsequent vision\napplications, prompting the development of numerous depth completion techniques\nto mitigate this problem. Numerous methods excel at reconstructing dense depth\nmaps from sparse samples, but they often falter when faced with extensive\ncontiguous regions of missing depth values, a prevalent and critical challenge\nin indoor environments. To overcome these challenges, we design a novel\ntwo-branch end-to-end fusion network named RDFC-GAN, which takes a pair of RGB\nand incomplete depth images as input to predict a dense and completed depth\nmap. The first branch employs an encoder-decoder structure, by adhering to the\nManhattan world assumption and utilizing normal maps from RGB-D information as\nguidance, to regress the local dense depth values from the raw depth map. The\nother branch applies an RGB-depth fusion CycleGAN, adept at translating RGB\nimagery into detailed, textured depth maps while ensuring high fidelity through\ncycle consistency. We fuse the two branches via adaptive fusion modules named\nW-AdaIN and train the model with the help of pseudo depth maps. 
Comprehensive\nevaluations on NYU-Depth V2 and SUN RGB-D datasets show that our method\nsignificantly enhances depth completion performance particularly in realistic\nindoor settings.\n","authors":["Haowen Wang","Zhengping Che","Yufan Yang","Mingyuan Wang","Zhiyuan Xu","Xiuquan Qiao","Mengshi Qi","Feifei Feng","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2306.03584v2.pdf","comment":"Haowen Wang and Zhengping Che are with equal contributions. Paper\n accepted by IEEE Transactions on Pattern Analysis and Machine Intelligence\n (TPAMI). An earlier version has been accepted by CVPR 2022\n (arXiv:2203.10856). arXiv admin note: text overlap with arXiv:2203.10856"},{"id":"http://arxiv.org/abs/2305.09948v5","updated":"2024-04-12T00:46:26Z","published":"2023-05-17T05:03:46Z","title":"HICO-DET-SG and V-COCO-SG: New Data Splits for Evaluating the Systematic\n Generalization Performance of Human-Object Interaction Detection Models","summary":" Human-Object Interaction (HOI) detection is a task to localize humans and\nobjects in an image and predict the interactions in human-object pairs. In\nreal-world scenarios, HOI detection models need systematic generalization,\ni.e., generalization to novel combinations of objects and interactions, because\nthe train data are expected to cover a limited portion of all possible\ncombinations. To evaluate the systematic generalization performance of HOI\ndetection models, we created two new sets of HOI detection data splits named\nHICO-DET-SG and V-COCO-SG based on the HICO-DET and V-COCO datasets,\nrespectively. When evaluated on the new data splits, HOI detection models with\nvarious characteristics performed much more poorly than when evaluated on the\noriginal splits. This shows that systematic generalization is a challenging\ngoal in HOI detection. By analyzing the evaluation results, we also gain\ninsights for improving the systematic generalization performance and identify\nfour possible future research directions. 
We hope that our new data splits and\npresented analysis will encourage further research on systematic generalization\nin HOI detection.\n","authors":["Kentaro Takemoto","Moyuru Yamada","Tomotake Sasaki","Hisanao Akima"],"pdf_url":"https://arxiv.org/pdf/2305.09948v5.pdf","comment":"19 pages, 3 figures, 4 tables"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000..7f5166c Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 0000000..9ded9d9 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: var(--color-primary-l); + + 
--summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + font-size: var(--font-size-m); 
+} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 0000000..d2d5063 --- /dev/null +++ b/index.html @@ -0,0 +1,192083 @@ + + + + + Yibo's arxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 116 + +
+
+
+ + ☆ EventEgo3D: 3D Human Motion Capture from Egocentric Event Streams CVPR + + +
+ Monocular egocentric 3D human motion capture is a challenging and actively +researched problem. Existing methods use synchronously operating visual sensors +(e.g. RGB cameras) and often fail under low lighting and fast motions, which +can be restricting in many applications involving head-mounted devices. In +response to the existing limitations, this paper 1) introduces a new problem, +i.e., 3D human motion capture from an egocentric monocular event camera with a +fisheye lens, and 2) proposes the first approach to it called EventEgo3D +(EE3D). Event streams have high temporal resolution and provide reliable cues +for 3D human motion capture under high-speed human motions and rapidly changing +illumination. The proposed EE3D framework is specifically tailored for learning +with event streams in the LNES representation, enabling high 3D reconstruction +accuracy. We also design a prototype of a mobile head-mounted device with an +event camera and record a real dataset with event observations and the +ground-truth 3D human poses (in addition to the synthetic dataset). Our EE3D +demonstrates robustness and superior 3D accuracy compared to existing solutions +across various challenging experiments while supporting real-time 3D pose +update rates of 140Hz. + +
+
+ comment: 14 pages, 11 figures and 6 tables; project page: + https://4dqv.mpi-inf.mpg.de/EventEgo3D/; Computer Vision and Pattern + Recognition (CVPR) 2024 +
+
+
+
+
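Editor's note: the EventEgo3D abstract above describes converting egocentric event streams into an image-like representation (LNES) before 3D pose estimation. As a rough illustration of the general idea only — not the paper's exact LNES formulation — the sketch below accumulates a time-sorted (t, x, y, polarity) event array into a two-channel frame of normalized timestamps. The array layout and microsecond timestamps are assumptions.

import numpy as np

def events_to_frame(events, height, width, window_ms=10.0):
    # events: (N, 4) array of (timestamp_us, x, y, polarity in {0, 1}),
    # assumed sorted by timestamp. Illustrative only; the LNES used by
    # EventEgo3D may normalize and window events differently.
    frame = np.zeros((2, height, width), dtype=np.float32)
    if len(events) == 0:
        return frame
    t = events[:, 0]
    x = events[:, 1].astype(int)
    y = events[:, 2].astype(int)
    p = events[:, 3].astype(int)
    window_us = window_ms * 1e3
    t_start = t.max() - window_us
    keep = t >= t_start
    # Later events overwrite earlier ones at the same pixel with their
    # timestamp normalized to [0, 1] within the window.
    frame[p[keep], y[keep], x[keep]] = (t[keep] - t_start) / window_us
    return frame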
+ + ☆ COCONut: Modernizing COCO Segmentation CVPR2024 + + +
+ In recent decades, the vision community has witnessed remarkable progress in +visual recognition, partially owing to advancements in dataset benchmarks. +Notably, the established COCO benchmark has propelled the development of modern +detection and segmentation systems. However, the COCO segmentation benchmark +has seen comparatively slow improvement over the last decade. Originally +equipped with coarse polygon annotations for thing instances, it gradually +incorporated coarse superpixel annotations for stuff regions, which were +subsequently heuristically amalgamated to yield panoptic segmentation +annotations. These annotations, executed by different groups of raters, have +resulted not only in coarse segmentation masks but also in inconsistencies +between segmentation types. In this study, we undertake a comprehensive +reevaluation of the COCO segmentation annotations. By enhancing the annotation +quality and expanding the dataset to encompass 383K images with more than 5.18M +panoptic masks, we introduce COCONut, the COCO Next Universal segmenTation +dataset. COCONut harmonizes segmentation annotations across semantic, instance, +and panoptic segmentation with meticulously crafted high-quality masks, and +establishes a robust benchmark for all segmentation tasks. To our knowledge, +COCONut stands as the inaugural large-scale universal segmentation dataset, +verified by human raters. We anticipate that the release of COCONut will +significantly contribute to the community's ability to assess the progress of +novel neural networks. + +
+
+ comment: Accepted at CVPR2024, data available at + https://xdeng7.github.io/coconut.github.io/ +
+
+
+
+
+ + ☆ Probing the 3D Awareness of Visual Foundation Models CVPR 2024 + + +
+ Recent advances in large-scale pretraining have yielded visual foundation +models with strong capabilities. Not only can recent models generalize to +arbitrary images for their training task, their intermediate representations +are useful for other visual tasks such as detection and segmentation. Given +that such models can classify, delineate, and localize objects in 2D, we ask +whether they also represent their 3D structure? In this work, we analyze the 3D +awareness of visual foundation models. We posit that 3D awareness implies that +representations (1) encode the 3D structure of the scene and (2) consistently +represent the surface across views. We conduct a series of experiments using +task-specific probes and zero-shot inference procedures on frozen features. Our +experiments reveal several limitations of the current models. Our code and +analysis can be found at https://github.com/mbanani/probe3d. + +
+
+ comment: Accepted to CVPR 2024. Project page: + https://github.com/mbanani/probe3d +
+
+
+
+
+ + ☆ Automatic Quantification of Serial PET/CT Images for Pediatric Hodgkin + Lymphoma Patients Using a Longitudinally-Aware Segmentation Network + + +
+ $\textbf{Purpose}$: Automatic quantification of longitudinal changes in PET +scans for lymphoma patients has proven challenging, as residual disease in +interim-therapy scans is often subtle and difficult to detect. Our goal was to +develop a longitudinally-aware segmentation network (LAS-Net) that can quantify +serial PET/CT images for pediatric Hodgkin lymphoma patients. +$\textbf{Materials and Methods}$: This retrospective study included baseline +(PET1) and interim (PET2) PET/CT images from 297 patients enrolled in two +Children's Oncology Group clinical trials (AHOD1331 and AHOD0831). LAS-Net +incorporates longitudinal cross-attention, allowing relevant features from PET1 +to inform the analysis of PET2. Model performance was evaluated using Dice +coefficients for PET1 and detection F1 scores for PET2. Additionally, we +extracted and compared quantitative PET metrics, including metabolic tumor +volume (MTV) and total lesion glycolysis (TLG) in PET1, as well as qPET and +$\Delta$SUVmax in PET2, against physician measurements. We quantified their +agreement using Spearman's $\rho$ correlations and employed bootstrap +resampling for statistical analysis. $\textbf{Results}$: LAS-Net detected +residual lymphoma in PET2 with an F1 score of 0.606 (precision/recall: +0.615/0.600), outperforming all comparator methods (P<0.01). For baseline +segmentation, LAS-Net achieved a mean Dice score of 0.772. In PET +quantification, LAS-Net's measurements of qPET, $\Delta$SUVmax, MTV and TLG +were strongly correlated with physician measurements, with Spearman's $\rho$ of +0.78, 0.80, 0.93 and 0.96, respectively. The performance remained high, with a +slight decrease, in an external testing cohort. $\textbf{Conclusion}$: LAS-Net +achieved high performance in quantifying PET metrics across serial scans, +highlighting the value of longitudinal awareness in evaluating multi-time-point +imaging datasets. + +
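+ The two headline metrics above are standard and easy to reproduce; the minimal Python sketch below (not from the paper; the example values are invented) shows how a Dice overlap for PET1 segmentation and a Spearman correlation between model and physician PET measurements would be computed.
+ import numpy as np
+ from scipy.stats import spearmanr
+
+ def dice_coefficient(pred_mask, gt_mask, eps=1e-8):
+     """Dice overlap between two binary segmentation masks."""
+     pred, gt = pred_mask.astype(bool), gt_mask.astype(bool)
+     intersection = np.logical_and(pred, gt).sum()
+     return (2.0 * intersection + eps) / (pred.sum() + gt.sum() + eps)
+
+ # Hypothetical per-patient MTV values (model vs. physician), for illustration only.
+ model_mtv = np.array([120.5, 80.2, 60.1, 200.7, 35.4])
+ physician_mtv = np.array([118.0, 85.4, 55.9, 210.3, 33.1])
+ rho, p_value = spearmanr(model_mtv, physician_mtv)
+ print(f"Spearman rho = {rho:.2f} (p = {p_value:.3f})")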
+
+ comment: 6 figures, 4 tables in the main text +
+
+
+
+
+ + ☆ Training-free Boost for Open-Vocabulary Object Detection with Confidence + Aggregation + + +
+ Open-vocabulary object detection (OVOD) aims at localizing and recognizing visual objects from novel classes unseen at training time. However, empirical studies reveal that advanced detectors generally assign lower scores to those novel instances, which are inadvertently suppressed during inference by commonly adopted greedy strategies like Non-Maximum Suppression (NMS), leading to sub-optimal detection performance for novel classes. This paper systematically investigates this problem with the commonly-adopted two-stage OVOD paradigm. Specifically, in the region-proposal stage, proposals that contain novel instances showcase lower objectness scores, since they are treated as background proposals during the training phase. Meanwhile, in the object-classification stage, novel objects share lower region-text similarities (i.e., classification scores) due to the biased visual-language alignment by seen training samples. To alleviate this problem, this paper introduces two advanced measures to adjust confidence scores and conserve erroneously dismissed objects: (1) a class-agnostic localization quality estimate via the overlap degree of region/object proposals, and (2) a text-guided visual similarity estimate with proxy prototypes for novel classes. Integrated with adjusting techniques specifically designed for the region-proposal and object-classification stages, this paper derives the aggregated confidence estimate for the open-vocabulary object detection paradigm (AggDet). Our AggDet is a generic and training-free post-processing scheme, which consistently bolsters open-vocabulary detectors across model scales and architecture designs. For instance, AggDet achieves 3.3% and 1.5% gains on the OV-COCO and OV-LVIS benchmarks respectively, without any training cost.
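+ As a rough illustration of the score-adjustment idea (my own simplification, not the AggDet implementation), a training-free post-processing step could fuse the classification score with a class-agnostic quality estimate before NMS, e.g. via a weighted geometric mean, so that well-localized novel-class boxes are less likely to be suppressed:
+ import numpy as np
+
+ def aggregate_confidence(cls_scores, quality_scores, alpha=0.5):
+     """Fuse per-box classification scores with a localization-quality estimate;
+     alpha balances the two terms (alpha=1 keeps the original scores)."""
+     cls_scores = np.clip(cls_scores, 1e-6, 1.0)
+     quality_scores = np.clip(quality_scores, 1e-6, 1.0)
+     return cls_scores ** alpha * quality_scores ** (1.0 - alpha)
+
+ # A low-scoring but well-localized box is boosted relative to a poorly localized one.
+ print(aggregate_confidence(np.array([0.30, 0.80]), np.array([0.90, 0.40])))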
+
+
+
+
+ + ☆ Improving Referring Image Segmentation using Vision-Aware Text Features + + +
+ Referring image segmentation is a challenging task that involves generating pixel-wise segmentation masks based on natural language descriptions. Existing methods have relied mostly on visual features to generate the segmentation masks while treating text features as supporting components. This over-reliance on visual features can lead to suboptimal results, especially in complex scenarios where text prompts are ambiguous or context-dependent. To overcome these challenges, we present a novel framework VATEX to improve referring image segmentation by enhancing object and context understanding with Vision-Aware Text Features. Our method involves using CLIP to derive a CLIP Prior that integrates an object-centric visual heatmap with the text description, which can be used as the initial query in a DETR-based architecture for the segmentation task. Furthermore, by observing that there are multiple ways to describe an instance in an image, we enforce feature similarity between text variations referring to the same visual input by two components: a novel Contextual Multimodal Decoder that turns text embeddings into vision-aware text features, and a Meaning Consistency Constraint to further ensure the coherent and consistent interpretation of language expressions with the context understanding obtained from the image. Our method achieves a significant performance improvement on three benchmark datasets: RefCOCO, RefCOCO+ and G-Ref. Code is available at: https://nero1342.github.io/VATEX_RIS.
+
+ comment: 30 pages including supplementary +
+
+
+
+
+ + ☆ Enhancing Visual Question Answering through Question-Driven Image + Captions as Prompts CVPR 2024 + + +
+ Visual question answering (VQA) is known as an AI-complete task as it +requires understanding, reasoning, and inferring about the vision and the +language content. Over the past few years, numerous neural architectures have +been suggested for the VQA problem. However, achieving success in zero-shot VQA +remains a challenge due to its requirement for advanced generalization and +reasoning skills. This study explores the impact of incorporating image +captioning as an intermediary process within the VQA pipeline. Specifically, we +explore the efficacy of utilizing image captions instead of images and +leveraging large language models (LLMs) to establish a zero-shot setting. Since +image captioning is the most crucial step in this process, we compare the +impact of state-of-the-art image captioning models on VQA performance across +various question types in terms of structure and semantics. We propose a +straightforward and efficient question-driven image captioning approach within +this pipeline to transfer contextual information into the question-answering +(QA) model. This method involves extracting keywords from the question, +generating a caption for each image-question pair using the keywords, and +incorporating the question-driven caption into the LLM prompt. We evaluate the +efficacy of using general-purpose and question-driven image captions in the VQA +pipeline. Our study highlights the potential of employing image captions and +harnessing the capabilities of LLMs to achieve competitive performance on GQA +under the zero-shot setting. Our code is available at +\url{https://github.com/ovguyo/captions-in-VQA}. + +
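+ A toy sketch of the question-driven prompting idea follows (the helper names and stop-word list are mine, not the released code): keywords pulled from the question steer the captioner, and the resulting caption is placed in the LLM prompt together with the question.
+ STOP_WORDS = {"what", "is", "are", "the", "a", "an", "of", "in", "on", "this", "that"}
+
+ def question_keywords(question):
+     """Very simple keyword extraction: drop punctuation and stop words."""
+     words = [w.strip("?.,!").lower() for w in question.split()]
+     return [w for w in words if w and w not in STOP_WORDS]
+
+ def build_prompt(question, caption):
+     return f"Image description: {caption}\nQuestion: {question}\nAnswer briefly:"
+
+ print(question_keywords("What color is the dog's collar?"))  # ['color', "dog's", 'collar']
+ print(build_prompt("What color is the dog's collar?", "A dog wearing a red collar sits on grass."))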
+
+ comment: The paper has been accepted for presentation at CVPR 2024 Workshop on + Prompting in Vision +
+
+
+
+
+ + ☆ Advanced wood species identification based on multiple anatomical + sections and using deep feature transfer and fusion + + +
+ In recent years, we have seen many advancements in wood species +identification. Methods like DNA analysis, Near Infrared (NIR) spectroscopy, +and Direct Analysis in Real Time (DART) mass spectrometry complement the +long-established wood anatomical assessment of cell and tissue morphology. +However, most of these methods have some limitations such as high costs, the +need for skilled experts for data interpretation, and the lack of good datasets +for professional reference. Therefore, most of these methods, and certainly the +wood anatomical assessment, may benefit from tools based on Artificial +Intelligence. In this paper, we apply two transfer learning techniques with +Convolutional Neural Networks (CNNs) to a multi-view Congolese wood species +dataset including sections from different orientations and viewed at different +microscopic magnifications. We explore two feature extraction methods in +detail, namely Global Average Pooling (GAP) and Random Encoding of Aggregated +Deep Activation Maps (RADAM), for efficient and accurate wood species +identification. Our results indicate superior accuracy on diverse datasets and +anatomical sections, surpassing the results of other methods. Our proposal +represents a significant advancement in wood species identification, offering a +robust tool to support the conservation of forest ecosystems and promote +sustainable forestry practices. + +
+
+ comment: 33 pages, 7 tables, 9 figures +
+
+
+
+
+ + ☆ Pathological Primitive Segmentation Based on Visual Foundation Model + with Zero-Shot Mask Generation + + +
+ Medical image processing usually requires a model trained with carefully crafted datasets due to unique image characteristics and domain-specific challenges, especially in pathology. Primitive detection and segmentation in digitized tissue samples are essential for objective and automated diagnosis and prognosis of cancer. SAM (Segment Anything Model) has recently been developed to segment general objects from natural images with high accuracy, but it requires human prompts to generate masks. In this work, we present a novel approach that adapts pre-trained natural image encoders of SAM for detection-based region proposals. Regions proposed by a pre-trained encoder are sent to cascaded feature propagation layers for projection. Then, local semantic and global context is aggregated from multiple scales for bounding box localization and classification. Finally, the SAM decoder uses the identified bounding boxes as essential prompts to generate a comprehensive primitive segmentation map. The entire base framework, SAM, requires no additional training or fine-tuning but could produce an end-to-end result for two fundamental segmentation tasks in pathology. Our method is compared with state-of-the-art models in F1 score for nuclei detection and in binary/multiclass panoptic quality (bPQ/mPQ) and mask quality (Dice) for segmentation on the PanNuke dataset, while offering end-to-end efficiency. Our model also achieves remarkable Average Precision (+4.5%) on the secondary dataset (HuBMAP Kidney) compared to Faster R-CNN. The code is publicly available at https://github.com/learner-codec/autoprom_sam.
+
+ comment: 2024 IEEE International Symposium on Biomedical Imaging +
+
+
+
+
+ + ☆ FashionFail: Addressing Failure Cases in Fashion Object Detection and + Segmentation IJCNN + + +
+ In the realm of fashion object detection and segmentation for online shopping +images, existing state-of-the-art fashion parsing models encounter limitations, +particularly when exposed to non-model-worn apparel and close-up shots. To +address these failures, we introduce FashionFail; a new fashion dataset with +e-commerce images for object detection and segmentation. The dataset is +efficiently curated using our novel annotation tool that leverages recent +foundation models. The primary objective of FashionFail is to serve as a test +bed for evaluating the robustness of models. Our analysis reveals the +shortcomings of leading models, such as Attribute-Mask R-CNN and Fashionformer. +Additionally, we propose a baseline approach using naive data augmentation to +mitigate common failure cases and improve model robustness. Through this work, +we aim to inspire and support further research in fashion item detection and +segmentation for industrial applications. The dataset, annotation tool, code, +and models are available at \url{https://rizavelioglu.github.io/fashionfail/}. + +
+
+ comment: to be published in 2024 International Joint Conference on Neural + Networks (IJCNN) +
+
+
+
+
+ + ☆ Lossy Image Compression with Foundation Diffusion Models + + +
+ Incorporating diffusion models in the image compression domain has the +potential to produce realistic and detailed reconstructions, especially at +extremely low bitrates. Previous methods focus on using diffusion models as +expressive decoders robust to quantization errors in the conditioning signals, +yet achieving competitive results in this manner requires costly training of +the diffusion model and long inference times due to the iterative generative +process. In this work we formulate the removal of quantization error as a +denoising task, using diffusion to recover lost information in the transmitted +image latent. Our approach allows us to perform less than 10\% of the full +diffusion generative process and requires no architectural changes to the +diffusion model, enabling the use of foundation models as a strong prior +without additional fine tuning of the backbone. Our proposed codec outperforms +previous methods in quantitative realism metrics, and we verify that our +reconstructions are qualitatively preferred by end users, even when other +methods use twice the bitrate. + +
+
+
+
+
+ + ☆ IDD-X: A Multi-View Dataset for Ego-relative Important Object + Localization and Explanation in Dense and Unstructured Traffic ICRA 2024 + + +
+ Intelligent vehicle systems require a deep understanding of the interplay +between road conditions, surrounding entities, and the ego vehicle's driving +behavior for safe and efficient navigation. This is particularly critical in +developing countries where traffic situations are often dense and unstructured +with heterogeneous road occupants. Existing datasets, predominantly geared +towards structured and sparse traffic scenarios, fall short of capturing the +complexity of driving in such environments. To fill this gap, we present IDD-X, +a large-scale dual-view driving video dataset. With 697K bounding boxes, 9K +important object tracks, and 1-12 objects per video, IDD-X offers comprehensive +ego-relative annotations for multiple important road objects covering 10 +categories and 19 explanation label categories. The dataset also incorporates +rearview information to provide a more complete representation of the driving +environment. We also introduce custom-designed deep networks aimed at multiple +important object localization and per-object explanation prediction. Overall, +our dataset and introduced prediction models form the foundation for studying +how road conditions and surrounding entities affect driving behavior in complex +traffic situations. + +
+
+ comment: Accepted at ICRA 2024 +
+
+
+
+
+ + ☆ Scalability in Building Component Data Annotation: Enhancing Facade + Material Classification with Synthetic Data + + +
+ Computer vision models trained on Google Street View images can create +material cadastres. However, current approaches need manually annotated +datasets that are difficult to obtain and often have class imbalance. To +address these challenges, this paper fine-tuned a Swin Transformer model on a +synthetic dataset generated with DALL-E and compared the performance to a +similar manually annotated dataset. Although manual annotation remains the gold +standard, the synthetic dataset performance demonstrates a reasonable +alternative. The findings will ease annotation needed to develop material +cadastres, offering architects insights into opportunities for material reuse, +thus contributing to the reduction of demolition waste. + +
+
+ comment: 10 pages, 6 figures, submitted to 2024 European Conference of + Computing in Construction +
+
+
+
+
+ + ☆ Benchmarking the Cell Image Segmentation Models Robustness under the + Microscope Optical Aberrations + + +
+ Cell segmentation is essential in biomedical research for analyzing cellular +morphology and behavior. Deep learning methods, particularly convolutional +neural networks (CNNs), have revolutionized cell segmentation by extracting +intricate features from images. However, the robustness of these methods under +microscope optical aberrations remains a critical challenge. This study +comprehensively evaluates the performance of cell instance segmentation models +under simulated aberration conditions using the DynamicNuclearNet (DNN) and +LIVECell datasets. Aberrations, including Astigmatism, Coma, Spherical, and +Trefoil, were simulated using Zernike polynomial equations. Various +segmentation models, such as Mask R-CNN with different network heads (FPN, C3) +and backbones (ResNet, VGG19, SwinS), were trained and tested under aberrated +conditions. Results indicate that FPN combined with SwinS demonstrates superior +robustness in handling simple cell images affected by minor aberrations. +Conversely, Cellpose2.0 proves effective for complex cell images under similar +conditions. Our findings provide insights into selecting appropriate +segmentation models based on cell morphology and aberration severity, enhancing +the reliability of cell segmentation in biomedical applications. Further +research is warranted to validate these methods with diverse aberration types +and emerging segmentation models. Overall, this research aims to guide +researchers in effectively utilizing cell segmentation models in the presence +of minor optical aberrations. + +
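+ To make the simulation step concrete, the sketch below (a simplified scalar-diffraction model I am assuming, not the study's exact pipeline) builds a pupil with a Zernike astigmatism phase term, derives the corresponding PSF, and blurs an image with it.
+ import numpy as np
+ from scipy.signal import fftconvolve
+
+ def aberrated_psf(size=64, coeff=0.5):
+     """PSF of a circular pupil carrying vertical astigmatism (Zernike Z_2^2)."""
+     y, x = np.mgrid[-1:1:size * 1j, -1:1:size * 1j]
+     pupil_amp = (np.hypot(x, y) <= 1.0).astype(float)
+     zernike_astig = np.sqrt(6.0) * (x ** 2 - y ** 2)   # Z_2^2 on the unit disk
+     pupil = pupil_amp * np.exp(2j * np.pi * coeff * zernike_astig)
+     psf = np.abs(np.fft.fftshift(np.fft.fft2(pupil))) ** 2
+     return psf / psf.sum()
+
+ image = np.random.rand(256, 256)                        # stand-in for a cell image
+ blurred = fftconvolve(image, aberrated_psf(), mode="same")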
+
+
+
+
+ + ☆ Analyzing Decades-Long Environmental Changes in Namibia Using Archival + Aerial Photography and Deep Learning + + +
+ This study explores object detection in historical aerial photographs of +Namibia to identify long-term environmental changes. Specifically, we aim to +identify key objects -- \textit{Waterholes}, \textit{Omuti homesteads}, and +\textit{Big trees} -- around Oshikango in Namibia using sub-meter gray-scale +aerial imagery from 1943 and 1972. In this work, we propose a workflow for +analyzing historical aerial imagery using a deep semantic segmentation model on +sparse hand-labels. To this end, we employ a number of strategies including +class-weighting, pseudo-labeling and empirical p-value-based filtering to +balance skewed and sparse representations of objects in the ground truth data. +Results demonstrate the benefits of these different training strategies +resulting in an average $F_1=0.661$ and $F_1=0.755$ over the three objects of +interest for the 1943 and 1972 imagery, respectively. We also identified that +the average size of Waterhole and Big trees increased while the average size of +Omutis decreased between 1943 and 1972 reflecting some of the local effects of +the massive post-Second World War economic, agricultural, demographic, and +environmental changes. This work also highlights the untapped potential of +historical aerial photographs in understanding long-term environmental changes +beyond Namibia (and Africa). With the lack of adequate satellite technology in +the past, archival aerial photography offers a great alternative to uncover +decades-long environmental changes. + +
+
+
+
+
+ + ☆ On the Robustness of Language Guidance for Low-Level Vision Tasks: + Findings from Depth Estimation CVPR 2024 + + +
+ Recent advances in monocular depth estimation have been made by incorporating +natural language as additional guidance. Although yielding impressive results, +the impact of the language prior, particularly in terms of generalization and +robustness, remains unexplored. In this paper, we address this gap by +quantifying the impact of this prior and introduce methods to benchmark its +effectiveness across various settings. We generate "low-level" sentences that +convey object-centric, three-dimensional spatial relationships, incorporate +them as additional language priors and evaluate their downstream impact on +depth estimation. Our key finding is that current language-guided depth +estimators perform optimally only with scene-level descriptions and +counter-intuitively fare worse with low level descriptions. Despite leveraging +additional data, these methods are not robust to directed adversarial attacks +and decline in performance with an increase in distribution shift. Finally, to +provide a foundation for future research, we identify points of failures and +offer insights to better understand these shortcomings. With an increasing +number of methods using language for depth estimation, our findings highlight +the opportunities and pitfalls that require careful consideration for effective +deployment in real-world settings + +
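+ A minimal example of what such a "low-level" sentence generator might look like (the function and thresholding are my own illustration, not the authors' exact templates): given two detected objects and their depths, emit an object-centric spatial-relationship sentence that can be appended to the language prior.
+ def low_level_sentence(name_a, depth_a, name_b, depth_b, margin=0.2):
+     """Describe the depth ordering of two objects in plain language."""
+     if abs(depth_a - depth_b) < margin:
+         relation = "at roughly the same distance as"
+     elif depth_a < depth_b:
+         relation = "in front of"
+     else:
+         relation = "behind"
+     return f"The {name_a} is {relation} the {name_b}."
+
+ print(low_level_sentence("chair", 1.2, "sofa", 2.5))  # The chair is in front of the sofa.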
+
+ comment: Accepted to CVPR 2024. Project webpage: + https://agneetchatterjee.com/robustness_depth_lang/ +
+
+
+
+
+ + ☆ Generalized Contrastive Learning for Multi-Modal Retrieval and Ranking + + +
+ Contrastive learning has gained widespread adoption for retrieval tasks due +to its minimal requirement for manual annotations. However, popular contrastive +frameworks typically learn from binary relevance, making them ineffective at +incorporating direct fine-grained rankings. In this paper, we curate a +large-scale dataset featuring detailed relevance scores for each query-document +pair to facilitate future research and evaluation. Subsequently, we propose +Generalized Contrastive Learning for Multi-Modal Retrieval and Ranking (GCL), +which is designed to learn from fine-grained rankings beyond binary relevance +scores. Our results show that GCL achieves a 94.5% increase in NDCG@10 for +in-domain and 26.3 to 48.8% increases for cold-start evaluations, all relative +to the CLIP baseline and involving ground truth rankings. + +
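+ For reference, the NDCG@10 figure quoted above follows the standard definition; a small sketch with a linear gain (the relevance grades below are invented for the example) is:
+ import numpy as np
+
+ def ndcg_at_k(relevances_in_ranked_order, k=10):
+     """NDCG@k with linear gain; input is the graded relevance of each ranked result."""
+     rel = np.asarray(relevances_in_ranked_order, dtype=float)
+     discounts = 1.0 / np.log2(np.arange(2, rel.size + 2))
+     dcg = np.sum((rel * discounts)[:k])
+     ideal = np.sort(rel)[::-1]
+     idcg = np.sum((ideal * discounts)[:k])
+     return dcg / idcg if idcg > 0 else 0.0
+
+ print(ndcg_at_k([3, 2, 3, 0, 1, 2]))  # 1.0 would indicate a perfect ranking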
+
+
+
+
+ + ☆ Text Prompt with Normality Guidance for Weakly Supervised Video Anomaly + Detection CVPR2024 + + +
+ Weakly supervised video anomaly detection (WSVAD) is a challenging task. Generating fine-grained pseudo-labels based on weak labels and then self-training a classifier is currently a promising solution. However, existing methods use only the RGB visual modality and neglect category text information, which limits the generation of more accurate pseudo-labels and affects the performance of self-training. Inspired by the manual labeling process based on event descriptions, in this paper we propose a novel pseudo-label generation and self-training framework based on Text Prompt with Normality Guidance (TPWNG) for WSVAD. Our idea is to transfer the rich language-visual knowledge of the contrastive language-image pre-training (CLIP) model to align the video event description text with the corresponding video frames to generate pseudo-labels. Specifically, we first fine-tune CLIP for domain adaptation by designing two ranking losses and a distributional inconsistency loss. Further, we propose a learnable text prompt mechanism with the assistance of a normality visual prompt to further improve the matching accuracy between video event description text and video frames. Then, we design a pseudo-label generation module based on the normality guidance to infer reliable frame-level pseudo-labels. Finally, we introduce a temporal context self-adaptive learning module to learn the temporal dependencies of different video events more flexibly and accurately. Extensive experiments show that our method achieves state-of-the-art performance on two benchmark datasets, UCF-Crime and XD-Violence.
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ Masked Image Modeling as a Framework for Self-Supervised Learning across + Eye Movements + + +
+ To make sense of their surroundings, intelligent systems must transform complex sensory inputs to structured codes that are reduced to task-relevant information such as object category. Biological agents achieve this in a largely autonomous manner, presumably via self-supervised learning. Whereas previous attempts to model the underlying mechanisms were largely discriminative in nature, there is ample evidence that the brain employs a generative model of the world. Here, we propose that eye movements, in combination with the focused nature of primate vision, constitute a generative, self-supervised task of predicting and revealing visual information. We construct a proof-of-principle model starting from the framework of masked image modeling (MIM), a common approach in deep representation learning. To do so, we analyze how core components of MIM such as masking technique and data augmentation influence the formation of category-specific representations. This allows us not only to better understand the principles behind MIM, but to then reassemble a MIM more in line with the focused nature of biological perception. From a theoretical angle, we find that MIM disentangles neurons in latent space, a property that has been suggested to structure visual representations in primates, without explicit regulation. Together with previous findings of invariance learning, this highlights an interesting connection of MIM to latent regularization approaches for self-supervised learning. The source code is available under https://github.com/RobinWeiler/FocusMIM
+
+
+
+
+ + ☆ ChatGPT and general-purpose AI count fruits in pictures surprisingly + well + + +
+ Object counting is a popular task in deep learning applications in various domains, including agriculture. A conventional deep learning approach requires a large amount of training data, which is often a logistical problem in real-world applications. To address this issue, we examined how well ChatGPT (GPT4V) and a general-purpose AI (foundation model for object counting, T-Rex) can count the number of fruit bodies (coffee cherries) in 100 images. The foundation model with few-shot learning outperformed the trained YOLOv8 model (R2 = 0.923 and 0.900, respectively). ChatGPT also showed some interesting potential, especially when few-shot learning with human feedback was applied (R2 = 0.360 and 0.460, respectively). Moreover, we examined the time required for implementation as a practical question. Obtaining results with the foundation model and ChatGPT took much less time than training and applying the YOLOv8 model (0.83 hrs, 1.75 hrs, and 161 hrs, respectively). We interpret these results as two surprises for deep learning users in applied domains: a foundation model with few-shot domain-specific learning can drastically save time and effort compared to the conventional approach, and ChatGPT can deliver relatively good performance. Both approaches do not require coding skills, which can foster AI education and dissemination.
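+ For readers who want to reproduce the agreement metric, the R2 values above are plain coefficients of determination between predicted and manually counted fruits; a short sketch (with made-up counts) is:
+ import numpy as np
+
+ def r_squared(y_true, y_pred):
+     """Coefficient of determination between ground-truth and predicted counts."""
+     y_true, y_pred = np.asarray(y_true, float), np.asarray(y_pred, float)
+     ss_res = np.sum((y_true - y_pred) ** 2)
+     ss_tot = np.sum((y_true - y_true.mean()) ** 2)
+     return 1.0 - ss_res / ss_tot
+
+ print(r_squared([12, 30, 45, 51, 27], [14, 28, 47, 49, 25]))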
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ ☆ NIR-Assisted Image Denoising: A Selective Fusion Approach and A Real-World Benchmark Dataset
+ Despite the significant progress in image denoising, it is still challenging +to restore fine-scale details while removing noise, especially in extremely +low-light environments. Leveraging near-infrared (NIR) images to assist visible +RGB image denoising shows the potential to address this issue, becoming a +promising technology. Nonetheless, existing works still struggle with taking +advantage of NIR information effectively for real-world image denoising, due to +the content inconsistency between NIR-RGB images and the scarcity of real-world +paired datasets. To alleviate the problem, we propose an efficient Selective +Fusion Module (SFM), which can be plug-and-played into the advanced denoising +networks to merge the deep NIR-RGB features. Specifically, we sequentially +perform the global and local modulation for NIR and RGB features, and then +integrate the two modulated features. Furthermore, we present a Real-world +NIR-Assisted Image Denoising (Real-NAID) dataset, which covers diverse +scenarios as well as various noise levels. Extensive experiments on both +synthetic and our real-world datasets demonstrate that the proposed method +achieves better results than state-of-the-art ones. The dataset, codes, and +pre-trained models will be publicly available at +https://github.com/ronjonxu/NAID. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ LaSagnA: Language-based Segmentation Assistant for Complex Queries + + +
+ Recent advancements have empowered Large Language Models for Vision (vLLMs) +to generate detailed perceptual outcomes, including bounding boxes and masks. +Nonetheless, there are two constraints that restrict the further application of +these vLLMs: the incapability of handling multiple targets per query and the +failure to identify the absence of query objects in the image. In this study, +we acknowledge that the main cause of these problems is the insufficient +complexity of training queries. Consequently, we define the general sequence +format for complex queries. Then we incorporate a semantic segmentation task in +the current pipeline to fulfill the requirements of training data. Furthermore, +we present three novel strategies to effectively handle the challenges arising +from the direct integration of the proposed format. The effectiveness of our +model in processing complex queries is validated by the comparable results with +conventional methods on both close-set and open-set semantic segmentation +datasets. Additionally, we outperform a series of vLLMs in reasoning and +referring segmentation, showcasing our model's remarkable capabilities. We +release the code at https://github.com/congvvc/LaSagnA. + +
+
+
+
+
+ + ☆ 3D Human Scan With A Moving Event Camera + + +
+ Capturing the 3D human body is one of the important tasks in computer vision with a wide range of applications such as virtual reality and sports analysis. However, conventional frame cameras are limited by their temporal resolution and dynamic range, which imposes constraints in real-world application setups. Event cameras have the advantages of high temporal resolution and high dynamic range (HDR), but the development of event-based methods is necessary to handle data with different characteristics. This paper proposes a novel event-based method for 3D pose estimation and human mesh recovery. Prior work on event-based human mesh recovery requires frames (images) as well as event data. The proposed method solely relies on events; it carves 3D voxels by moving the event camera around a stationary body, reconstructs the human pose and mesh by attenuated rays, and fits statistical body models, preserving high-frequency details. The experimental results show that the proposed method outperforms conventional frame-based methods in the estimation accuracy of both pose and body mesh. We also demonstrate results in challenging situations where a conventional camera has motion blur. This is the first work to demonstrate event-only human mesh recovery, and we hope that it is the first step toward achieving robust and accurate 3D human body scanning from vision sensors.
+
+
+
+
+ + ☆ SpectralMamba: Efficient Mamba for Hyperspectral Image Classification + + +
+ Recurrent neural networks and Transformers have recently dominated most +applications in hyperspectral (HS) imaging, owing to their capability to +capture long-range dependencies from spectrum sequences. However, despite the +success of these sequential architectures, the non-ignorable inefficiency +caused by either difficulty in parallelization or computationally prohibitive +attention still hinders their practicality, especially for large-scale +observation in remote sensing scenarios. To address this issue, we herein +propose SpectralMamba -- a novel state space model incorporated efficient deep +learning framework for HS image classification. SpectralMamba features the +simplified but adequate modeling of HS data dynamics at two levels. First, in +spatial-spectral space, a dynamical mask is learned by efficient convolutions +to simultaneously encode spatial regularity and spectral peculiarity, thus +attenuating the spectral variability and confusion in discriminative +representation learning. Second, the merged spectrum can then be efficiently +operated in the hidden state space with all parameters learned input-dependent, +yielding selectively focused responses without reliance on redundant attention +or imparallelizable recurrence. To explore the room for further computational +downsizing, a piece-wise scanning mechanism is employed in-between, +transferring approximately continuous spectrum into sequences with squeezed +length while maintaining short- and long-term contextual profiles among +hundreds of bands. Through extensive experiments on four benchmark HS datasets +acquired by satellite-, aircraft-, and UAV-borne imagers, SpectralMamba +surprisingly creates promising win-wins from both performance and efficiency +perspectives. + +
+
+
+
+
+ + ☆ New Efficient Visual OILU Markers + + +
+ Basic patterns are the source of a wide range of more or less complex geometric structures. We exploit such patterns to develop new efficient visual markers. Besides being projective invariants, the proposed markers allow producing a rich panel of unique identifiers, which is highly required for resource-intensive navigation and augmented reality applications. The spiral topology of our markers permits the validation of an accurate identification scheme based on level set methods. The robustness of the markers against acquisition and geometric distortions is validated by extensive experimental tests.
+
+
+
+
+ + ☆ MoE-FFD: Mixture of Experts for Generalized and Parameter-Efficient Face + Forgery Detection + + +
+ Deepfakes have recently raised significant trust issues and security concerns +among the public. Compared to CNN face forgery detectors, ViT-based methods +take advantage of the expressivity of transformers, achieving superior +detection performance. However, these approaches still exhibit the following +limitations: (1). Fully fine-tuning ViT-based models from ImageNet weights +demands substantial computational and storage resources; (2). ViT-based methods +struggle to capture local forgery clues, leading to model bias and limited +generalizability. To tackle these challenges, this work introduces +Mixture-of-Experts modules for Face Forgery Detection (MoE-FFD), a generalized +yet parameter-efficient ViT-based approach. MoE-FFD only updates lightweight +Low-Rank Adaptation (LoRA) and Adapter layers while keeping the ViT backbone +frozen, thereby achieving parameter-efficient training. Moreover, MoE-FFD +leverages the expressivity of transformers and local priors of CNNs to +simultaneously extract global and local forgery clues. Additionally, novel MoE +modules are designed to scale the model's capacity and select optimal forgery +experts, further enhancing forgery detection performance. The proposed MoE +learning scheme can be seamlessly adapted to various transformer backbones in a +plug-and-play manner. Extensive experimental results demonstrate that the +proposed method achieves state-of-the-art face forgery detection performance +with reduced parameter overhead. The code will be released upon acceptance. + +
+
+
+
+
+ + ☆ Joint Physical-Digital Facial Attack Detection Via Simulating Spoofing + Clues CVPR + + +
+ Face recognition systems are frequently subjected to a variety of physical +and digital attacks of different types. Previous methods have achieved +satisfactory performance in scenarios that address physical attacks and digital +attacks, respectively. However, few methods are considered to integrate a model +that simultaneously addresses both physical and digital attacks, implying the +necessity to develop and maintain multiple models. To jointly detect physical +and digital attacks within a single model, we propose an innovative approach +that can adapt to any network architecture. Our approach mainly contains two +types of data augmentation, which we call Simulated Physical Spoofing Clues +augmentation (SPSC) and Simulated Digital Spoofing Clues augmentation (SDSC). +SPSC and SDSC augment live samples into simulated attack samples by simulating +spoofing clues of physical and digital attacks, respectively, which +significantly improve the capability of the model to detect "unseen" attack +types. Extensive experiments show that SPSC and SDSC can achieve +state-of-the-art generalization in Protocols 2.1 and 2.2 of the UniAttackData +dataset, respectively. Our method won first place in "Unified Physical-Digital +Face Attack Detection" of the 5th Face Anti-spoofing Challenge@CVPR2024. Our +final submission obtains 3.75% APCER, 0.93% BPCER, and 2.34% ACER, +respectively. Our code is available at +https://github.com/Xianhua-He/cvpr2024-face-anti-spoofing-challenge. + +
+
+ comment: 10 pages with 6 figures, Accepted by CVPRW 2024 +
+
+
+
+
+ + ☆ OccGaussian: 3D Gaussian Splatting for Occluded Human Rendering + + +
+ Rendering dynamic 3D humans from monocular videos is crucial for various applications such as virtual reality and digital entertainment. Most methods assume the person is in an unobstructed scene, while various objects may cause the occlusion of body parts in real-life scenarios. Previous methods utilize NeRF for surface rendering to recover the occluded areas, but they require more than one day to train and several seconds to render, failing to meet the requirements of real-time interactive applications. To address these issues, we propose OccGaussian based on 3D Gaussian Splatting, which can be trained within 6 minutes and produces high-quality human renderings up to 160 FPS with occluded input. OccGaussian initializes 3D Gaussian distributions in the canonical space, and we perform occlusion feature queries at occluded regions; the aggregated pixel-aligned feature is extracted to compensate for the missing information. Then we use a Gaussian Feature MLP to further process the feature along with occlusion-aware loss functions to better perceive the occluded area. Extensive experiments in both simulated and real-world occlusions demonstrate that our method achieves comparable or even superior performance compared to the state-of-the-art method. We also improve training and inference speeds by 250x and 800x, respectively. Our code will be available for research purposes.
+
+ comment: 12 April, 2024; originally announced April 2024 +
+
+
+
+
+ + ☆ MSSTNet: A Multi-Scale Spatio-Temporal CNN-Transformer Network for + Dynamic Facial Expression Recognition ICASSP 2024 + + +
+ Unlike typical video action recognition, Dynamic Facial Expression +Recognition (DFER) does not involve distinct moving targets but relies on +localized changes in facial muscles. Addressing this distinctive attribute, we +propose a Multi-Scale Spatio-temporal CNN-Transformer network (MSSTNet). Our +approach takes spatial features of different scales extracted by CNN and feeds +them into a Multi-scale Embedding Layer (MELayer). The MELayer extracts +multi-scale spatial information and encodes these features before sending them +into a Temporal Transformer (T-Former). The T-Former simultaneously extracts +temporal information while continually integrating multi-scale spatial +information. This process culminates in the generation of multi-scale +spatio-temporal features that are utilized for the final classification. Our +method achieves state-of-the-art results on two in-the-wild datasets. +Furthermore, a series of ablation experiments and visualizations provide +further validation of our approach's proficiency in leveraging spatio-temporal +information within DFER. + +
+
+ comment: Accepted to 2024 IEEE International Conference on Acoustics, Speech, + and Signal Processing (ICASSP 2024) +
+
+
+
+
+ + ☆ Adapting the Segment Anything Model During Usage in Novel Situations + + +
+ The interactive segmentation task consists in the creation of object +segmentation masks based on user interactions. The most common way to guide a +model towards producing a correct segmentation consists in clicks on the object +and background. The recently published Segment Anything Model (SAM) supports a +generalized version of the interactive segmentation problem and has been +trained on an object segmentation dataset which contains 1.1B masks. Though +being trained extensively and with the explicit purpose of serving as a +foundation model, we show significant limitations of SAM when being applied for +interactive segmentation on novel domains or object types. On the used +datasets, SAM displays a failure rate $\text{FR}_{30}@90$ of up to $72.6 \%$. +Since we still want such foundation models to be immediately applicable, we +present a framework that can adapt SAM during immediate usage. For this we will +leverage the user interactions and masks, which are constructed during the +interactive segmentation process. We use this information to generate +pseudo-labels, which we use to compute a loss function and optimize a part of +the SAM model. The presented method causes a relative reduction of up to $48.1 +\%$ in the $\text{FR}_{20}@85$ and $46.6 \%$ in the $\text{FR}_{30}@90$ +metrics. + +
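+ The failure-rate notation used above is the usual interactive-segmentation metric: FR_k@q is the percentage of samples that never reach IoU >= q within the first k clicks. A small sketch of that computation (assuming per-click IoU curves are already available) is:
+ import numpy as np
+
+ def failure_rate(iou_per_click, k=30, iou_threshold=0.90):
+     """iou_per_click: list of 1-D arrays, one per sample, with the IoU after each click."""
+     failures = [np.max(np.asarray(ious)[:k]) < iou_threshold for ious in iou_per_click]
+     return 100.0 * np.mean(failures)
+
+ curves = [np.array([0.5, 0.7, 0.92]), np.array([0.4, 0.6, 0.8])]  # toy example
+ print(failure_rate(curves, k=3))  # 50.0: the second sample never reaches IoU 0.90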
+
+ comment: 11 pages, 2 figures, 4 tables +
+
+
+
+
+ + ☆ Direct May Not Be the Best: An Incremental Evolution View of Pose + Generation + + +
+ Pose diversity is an inherent representative characteristic of 2D images. Due to the 3D-to-2D projection mechanism, there is evident content discrepancy among distinct pose images. This is the main obstacle in pose transformation related research. To deal with this challenge, we propose a fine-grained incremental evolution centered pose generation framework, rather than the traditional direct one-to-one approach. Since the proposed approach bypasses the theoretical difficulty of directly modeling dramatic non-linear variation, the incurred content distortion and blurring can be effectively constrained, while the various individual pose details, especially clothes texture, can be precisely maintained. In order to systematically guide the evolution course, both global and incremental evolution constraints are elaborately designed and merged into the overall framework. A novel triple-path knowledge fusion structure is worked out to take full advantage of all available valuable knowledge to conduct high-quality pose synthesis. In addition, our framework can generate a series of valuable byproducts, namely the various intermediate poses. Extensive experiments have been conducted to verify the effectiveness of the proposed approach. Code is available at https://github.com/Xiaofei-CN/Incremental-Evolution-Pose-Generation.
+
+
+
+
+ + ☆ MambaDFuse: A Mamba-based Dual-phase Model for Multi-modality Image + Fusion + + +
+ Multi-modality image fusion (MMIF) aims to integrate complementary +information from different modalities into a single fused image to represent +the imaging scene and facilitate downstream visual tasks comprehensively. In +recent years, significant progress has been made in MMIF tasks due to advances +in deep neural networks. However, existing methods cannot effectively and +efficiently extract modality-specific and modality-fused features constrained +by the inherent local reductive bias (CNN) or quadratic computational +complexity (Transformers). To overcome this issue, we propose a Mamba-based +Dual-phase Fusion (MambaDFuse) model. Firstly, a dual-level feature extractor +is designed to capture long-range features from single-modality images by +extracting low and high-level features from CNN and Mamba blocks. Then, a +dual-phase feature fusion module is proposed to obtain fusion features that +combine complementary information from different modalities. It uses the +channel exchange method for shallow fusion and the enhanced Multi-modal Mamba +(M3) blocks for deep fusion. Finally, the fused image reconstruction module +utilizes the inverse transformation of the feature extraction to generate the +fused result. Through extensive experiments, our approach achieves promising +fusion results in infrared-visible image fusion and medical image fusion. +Additionally, in a unified benchmark, MambaDFuse has also demonstrated improved +performance in downstream tasks such as object detection. Code with checkpoints +will be available after the peer-review process. + +
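+ The channel-exchange step used for shallow fusion can be pictured with the following loose sketch (my own simplification, not the authors' code): a fixed fraction of channels is swapped between the infrared and visible feature maps before the deeper Mamba-based fusion.
+ import torch
+
+ def channel_exchange(feat_ir, feat_vis, ratio=0.5):
+     """feat_ir, feat_vis: (B, C, H, W) features from the two modalities."""
+     k = int(feat_ir.shape[1] * ratio)
+     out_ir = torch.cat([feat_vis[:, :k], feat_ir[:, k:]], dim=1)
+     out_vis = torch.cat([feat_ir[:, :k], feat_vis[:, k:]], dim=1)
+     return out_ir, out_vis
+
+ a, b = torch.randn(2, 64, 32, 32), torch.randn(2, 64, 32, 32)
+ fa, fb = channel_exchange(a, b)  # each output now mixes both modalities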
+
+
+
+
+ + ☆ No Bells, Just Whistles: Sports Field Registration by Leveraging + Geometric Properties CVPR + + +
+ Broadcast sports field registration is traditionally addressed as a homography estimation task, mapping the visible image area to a planar field model, predominantly focusing on the main camera shot. Addressing the shortcomings of previous approaches, we propose a novel calibration pipeline enabling camera calibration using a 3D soccer field model and extending the process to assess the multiple-view nature of broadcast videos. Our approach begins with a keypoint generation pipeline derived from SoccerNet dataset annotations, leveraging the geometric properties of the court. Subsequently, we execute classical camera calibration through the DLT algorithm in a minimalist fashion, without further refinement. Through extensive experimentation on real-world soccer broadcast datasets such as SoccerNet-Calibration, WorldCup 2014 and TS-WorldCup, our method demonstrates superior performance in both multiple- and single-view 3D camera calibration while maintaining competitive results in homography estimation compared to state-of-the-art techniques.
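+ The DLT step mentioned above is the classical textbook algorithm; a minimalist homography version (generic, not the authors' calibration code) can be written in a few lines:
+ import numpy as np
+
+ def dlt_homography(src_pts, dst_pts):
+     """Estimate the 3x3 homography mapping src_pts to dst_pts (each (N, 2), N >= 4)."""
+     rows = []
+     for (x, y), (u, v) in zip(src_pts, dst_pts):
+         rows.append([-x, -y, -1, 0, 0, 0, u * x, u * y, u])
+         rows.append([0, 0, 0, -x, -y, -1, v * x, v * y, v])
+     _, _, vt = np.linalg.svd(np.asarray(rows, dtype=float))
+     h = vt[-1].reshape(3, 3)
+     return h / h[2, 2]
+
+ # Four or more field keypoints (e.g. penalty-box corners) and their image locations would go here.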
+
+ comment: Accepted in CVPRW 2024 +
+
+
+
+
+ + ☆ Mitigating Challenges of the Space Environment for Onboard Artificial + Intelligence: Design Overview of the Imaging Payload on SpIRIT CVPR 2024 + + +
+ Artificial intelligence (AI) and autonomous edge computing in space are +emerging areas of interest to augment capabilities of nanosatellites, where +modern sensors generate orders of magnitude more data than can typically be +transmitted to mission control. Here, we present the hardware and software +design of an onboard AI subsystem hosted on SpIRIT. The system is optimised for +on-board computer vision experiments based on visible light and long wave +infrared cameras. This paper highlights the key design choices made to maximise +the robustness of the system in harsh space conditions, and their motivation +relative to key mission requirements, such as limited compute resources, +resilience to cosmic radiation, extreme temperature variations, distribution +shifts, and very low transmission bandwidths. The payload, called Loris, +consists of six visible light cameras, three infrared cameras, a camera control +board and a Graphics Processing Unit (GPU) system-on-module. Loris enables the +execution of AI models with on-orbit fine-tuning as well as a next-generation +image compression algorithm, including progressive coding. This innovative +approach not only enhances the data processing capabilities of nanosatellites +but also lays the groundwork for broader applications to remote sensing from +space. + +
+
+ comment: AI4Space 2024, 3rd Workshop on AI for Space, CVPR 2024 +
+
+
+
+
+ + ☆ NC-TTT: A Noise Contrastive Approach for Test-Time Training + + +
+ Despite their exceptional performance in vision tasks, deep learning models often struggle when faced with domain shifts during testing. Test-Time Training (TTT) methods have recently gained popularity thanks to their ability to enhance the robustness of models through the addition of an auxiliary objective that is jointly optimized with the main task. Being strictly unsupervised, this auxiliary objective is used at test time to adapt the model without any access to labels. In this work, we propose Noise-Contrastive Test-Time Training (NC-TTT), a novel unsupervised TTT technique based on the discrimination of noisy feature maps. By learning to classify noisy views of projected feature maps, and then adapting the model accordingly on new domains, classification performance can be recovered by an important margin. Experiments on several popular test-time adaptation baselines demonstrate the advantages of our method compared to recent approaches for this task. The code can be found at: https://github.com/GustavoVargasHakim/NCTTT.git
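+ A much-reduced sketch of a noise-contrastive auxiliary objective in this spirit (my own reading, not the released NC-TTT code): a small head is trained to separate clean projected feature maps from noisy ones, and minimizing the same loss at test time adapts the model without labels.
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class NoiseDiscriminator(nn.Module):
+     """Tiny head that scores whether a feature map is clean (1) or noisy (0)."""
+     def __init__(self, channels):
+         super().__init__()
+         self.head = nn.Sequential(
+             nn.Conv2d(channels, 32, 3, padding=1), nn.ReLU(),
+             nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(32, 1))
+
+     def forward(self, feats):
+         return self.head(feats).squeeze(-1)
+
+ def noise_contrastive_loss(discriminator, feats, sigma=0.1):
+     noisy = feats + sigma * torch.randn_like(feats)
+     logits = discriminator(torch.cat([feats, noisy], dim=0))
+     labels = torch.cat([torch.ones(feats.shape[0]), torch.zeros(feats.shape[0])])
+     return F.binary_cross_entropy_with_logits(logits, labels)
+
+ disc = NoiseDiscriminator(channels=64)
+ loss = noise_contrastive_loss(disc, torch.randn(8, 64, 16, 16))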
+
+
+
+
+ + ☆ Let It Flow: Simultaneous Optimization of 3D Flow and Object Clustering ECCV + + +
+ We study the problem of self-supervised 3D scene flow estimation from real large-scale raw point cloud sequences, which is crucial for various tasks like trajectory prediction or instance segmentation. In the absence of ground truth scene flow labels, contemporary approaches concentrate on optimizing flow across sequential pairs of point clouds by incorporating structure-based regularization on flow and object rigidity. The rigid objects are estimated by a variety of 3D spatial clustering methods. While state-of-the-art methods successfully capture overall scene motion using the Neural Prior structure, they encounter challenges in discerning multi-object motions. We identified the structural constraints and the use of large and strict rigid clusters as the main pitfall of the current approaches, and we propose a novel clustering approach that allows for a combination of overlapping soft clusters as well as non-overlapping rigid clusters. Flow is then jointly estimated with progressively growing non-overlapping rigid clusters together with fixed-size overlapping soft clusters. We evaluate our method on multiple datasets with LiDAR point clouds, demonstrating superior performance over the self-supervised baselines and reaching new state-of-the-art results. Our method especially excels in resolving flow in complicated dynamic scenes with multiple independently moving objects close to each other, including pedestrians, cyclists and other vulnerable road users. Our code will be publicly available.
+
+ comment: ECCV submission +
+
+
+
+
+ + ☆ TDANet: Target-Directed Attention Network For Object-Goal Visual + Navigation With Zero-Shot Ability + + +
+ The generalization of the end-to-end deep reinforcement learning (DRL) for +object-goal visual navigation is a long-standing challenge since object classes +and placements vary in new test environments. Learning domain-independent +visual representation is critical for enabling the trained DRL agent with the +ability to generalize to unseen scenes and objects. In this letter, a +target-directed attention network (TDANet) is proposed to learn the end-to-end +object-goal visual navigation policy with zero-shot ability. TDANet features a +novel target attention (TA) module that learns both the spatial and semantic +relationships among objects to help TDANet focus on the most relevant observed +objects to the target. With the Siamese architecture (SA) design, TDANet +distinguishes the difference between the current and target states and +generates the domain-independent visual representation. To evaluate the +navigation performance of TDANet, extensive experiments are conducted in the +AI2-THOR embodied AI environment. The simulation results demonstrate a strong +generalization ability of TDANet to unseen scenes and target objects, with +higher navigation success rate (SR) and success weighted by length (SPL) than +other state-of-the-art models. + +
+
+
+
+
+ + ☆ OmniSat: Self-Supervised Modality Fusion for Earth Observation + + +
+ The field of Earth Observation (EO) offers a wealth of data from diverse sensors, presenting a great opportunity for advancing self-supervised multimodal learning. However, current multimodal EO datasets and models focus on a single data type, either mono-date images or time series, which limits their expressivity. We introduce OmniSat, a novel architecture that exploits the spatial alignment between multiple EO modalities to learn expressive multimodal representations without labels. To demonstrate the advantages of combining modalities of different natures, we augment two existing datasets with new modalities. As demonstrated on three downstream tasks (forestry, land cover classification, and crop mapping), OmniSat can learn rich representations in an unsupervised manner, leading to improved performance in the semi- and fully-supervised settings, even when only one modality is available for inference. The code and dataset are available at github.com/gastruc/OmniSat.
+
+
+
+
+ + ☆ Self-Supervised k-Space Regularization for Motion-Resolved Abdominal MRI + Using Neural Implicit k-Space Representation + + +
+ Neural implicit k-space representations have shown promising results for +dynamic MRI at high temporal resolutions. Yet, their exclusive training in +k-space limits the application of common image regularization methods to +improve the final reconstruction. In this work, we introduce the concept of +parallel imaging-inspired self-consistency (PISCO), which we incorporate as +novel self-supervised k-space regularization enforcing a consistent +neighborhood relationship. At no additional data cost, the proposed +regularization significantly improves neural implicit k-space reconstructions +on simulated data. Abdominal in-vivo reconstructions using PISCO result in +enhanced spatio-temporal image quality compared to state-of-the-art methods. +Code is available at https://github.com/vjspi/PISCO-NIK. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Learning to Rebalance Multi-Modal Optimization by Adaptively Masking + Subnetworks + + +
+ Multi-modal learning aims to enhance performance by unifying models from +various modalities but often faces the "modality imbalance" problem in real +data, leading to a bias towards dominant modalities and neglecting others, +thereby limiting its overall effectiveness. To address this challenge, the core +idea is to balance the optimization of each modality to achieve a joint +optimum. Existing approaches often employ a modal-level control mechanism for +adjusting the update of each modal parameter. However, such a global-wise +updating mechanism ignores the different importance of each parameter. Inspired +by subnetwork optimization, we explore a uniform sampling-based optimization +strategy and find it more effective than global-wise updating. According to the +findings, we further propose a novel importance sampling-based, element-wise +joint optimization method, called Adaptively Mask Subnetworks Considering Modal +Significance(AMSS). Specifically, we incorporate mutual information rates to +determine the modal significance and employ non-uniform adaptive sampling to +select foreground subnetworks from each modality for parameter updates, thereby +rebalancing multi-modal learning. Additionally, we demonstrate the reliability +of the AMSS strategy through convergence analysis. Building upon theoretical +insights, we further enhance the multi-modal mask subnetwork strategy using +unbiased estimation, referred to as AMSS+. Extensive experiments reveal the +superiority of our approach over comparison methods. + +
+
+ comment: 17 pages;6 figures +
+
+
+
+
+ + ☆ Counterfactual Explanations for Face Forgery Detection via Adversarial + Removal of Artifacts ICME2024 + + +
+ Highly realistic AI generated face forgeries known as deepfakes have raised +serious social concerns. Although DNN-based face forgery detection models have +achieved good performance, they are vulnerable to latest generative methods +that have less forgery traces and adversarial attacks. This limitation of +generalization and robustness hinders the credibility of detection results and +requires more explanations. In this work, we provide counterfactual +explanations for face forgery detection from an artifact removal perspective. +Specifically, we first invert the forgery images into the StyleGAN latent +space, and then adversarially optimize their latent representations with the +discrimination supervision from the target detection model. We verify the +effectiveness of the proposed explanations from two aspects: (1) Counterfactual +Trace Visualization: the enhanced forgery images are useful to reveal artifacts +by visually contrasting the original images and two different visualization +methods; (2) Transferable Adversarial Attacks: the adversarial forgery images +generated by attacking the detection model are able to mislead other detection +models, implying the removed artifacts are general. Extensive experiments +demonstrate that our method achieves over 90% attack success rate and superior +attack transferability. Compared with naive adversarial noise methods, our +method adopts both generative and discriminative model priors, and optimize the +latent representations in a synthesis-by-analysis way, which forces the search +of counterfactual explanations on the natural face manifold. Thus, more general +counterfactual traces can be found and better adversarial attack +transferability can be achieved. + +
+
+ comment: Accepted to ICME2024 +
+
+
+
+
+ + ☆ Emerging Property of Masked Token for Effective Pre-training + + +
+ Driven by the success of Masked Language Modeling (MLM), the realm of
+self-supervised learning for computer vision has been invigorated by the
+central role of Masked Image Modeling (MIM) in driving recent breakthroughs.
+Notwithstanding the achievements of MIM across various downstream tasks, its
+overall efficiency is occasionally hampered by the lengthy duration of the
+pre-training phase. This paper presents the perspective that optimizing masked
+tokens can address this issue. Initially, we delve into an exploration of the
+inherent properties that a masked token ought to possess. Among these
+properties, we principally focus on articulating and emphasizing the `data
+singularity' attribute inherent in masked tokens. Through a comprehensive
+analysis of the heterogeneity between masked tokens and visible tokens within
+pre-trained models, we propose a novel approach termed masked token
+optimization (MTO), specifically designed to improve model efficiency through
+weight recalibration and the enhancement of the key property of masked tokens.
+The proposed method serves as an adaptable solution that seamlessly integrates
+into any MIM approach that leverages masked tokens. As a result, MTO achieves a
+considerable improvement in pre-training efficiency, resulting in an
+approximately 50% reduction in the pre-training epochs required for recent
+approaches to attain converged performance.
+
+
+
+
+
+ + ☆ Salience-Based Adaptive Masking: Revisiting Token Dynamics for Enhanced + Pre-training + + +
+ In this paper, we introduce Saliency-Based Adaptive Masking (SBAM), a novel +and cost-effective approach that significantly enhances the pre-training +performance of Masked Image Modeling (MIM) approaches by prioritizing token +salience. Our method provides robustness against variations in masking ratios, +effectively mitigating the performance instability issues common in existing +methods. This relaxes the sensitivity of MIM-based pre-training to masking +ratios, which in turn allows us to propose an adaptive strategy for `tailored' +masking ratios for each data sample, which no existing method can provide. +Toward this goal, we propose an Adaptive Masking Ratio (AMR) strategy that +dynamically adjusts the proportion of masking for the unique content of each +image based on token salience. We show that our method significantly improves +over the state-of-the-art in mask-based pre-training on the ImageNet-1K +dataset. + +
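+
+ The per-sample masking-ratio idea can be sketched roughly as follows (the
+ embedding-norm salience proxy and the ratio rule are assumptions for
+ illustration; the paper's actual salience measure and AMR schedule may
+ differ):
+
+ import torch
+
+ def salience_adaptive_mask(tokens, base_ratio=0.6, spread=0.15):
+     """Per-sample masking: the masking ratio adapts to how concentrated the
+     token salience is, and salient tokens are masked preferentially.
+     tokens: (B, N, D) patch embeddings; salience here is the embedding norm,
+     a stand-in for whatever salience score is actually used."""
+     B, N, _ = tokens.shape
+     salience = tokens.norm(dim=-1)                      # (B, N)
+     salience = salience / salience.sum(dim=1, keepdim=True)
+     # Peakier salience -> mask slightly more of the image.
+     peakiness = (salience.max(dim=1).values * N).clamp(max=4.0) / 4.0
+     ratios = base_ratio + spread * (peakiness - 0.5)    # (B,)
+     masks = torch.zeros(B, N, dtype=torch.bool)
+     for b in range(B):
+         k = int(ratios[b].item() * N)
+         idx = torch.multinomial(salience[b], k, replacement=False)
+         masks[b, idx] = True                            # True = masked token
+     return masks
+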
+
+
+
+
+ + ☆ GPN: Generative Point-based NeRF + + +
+ Scanning real-life scenes with modern registration devices typically gives
+incomplete point cloud representations, primarily due to the limitations of
+partial scanning, 3D occlusions, and dynamic light conditions. Recent works on
+processing incomplete point clouds have largely focused on point cloud
+completion. However, these approaches do not ensure consistency between the
+completed point cloud and the captured images regarding color and geometry. We
+propose using Generative Point-based NeRF (GPN) to reconstruct and repair a
+partial cloud by fully utilizing the scanning images and the corresponding
+reconstructed cloud. The repaired point cloud can achieve multi-view
+consistency with the captured images at high spatial resolution. For
+fine-tuning on a single scene, we optimize the global latent condition by
+incorporating an Auto-Decoder architecture while retaining multi-view
+consistency. As a result, the generated point clouds are smooth, plausible, and
+geometrically consistent with the partial scanning images. Extensive
+experiments on ShapeNet demonstrate that our approach achieves performance
+competitive with other state-of-the-art point cloud-based neural scene
+rendering and editing methods.
+
+
+
+
+
+ + ☆ Interference Motion Removal for Doppler Radar Vital Sign Detection Using + Variational Encoder-Decoder Neural Network + + +
+ The treatment of interfering motion contributions remains one of the key
+challenges in the domain of radar-based vital sign monitoring. Removal of the
+interference to extract the vital sign contributions is demanding due to
+overlapping Doppler bands, the complex structure of the interference motions,
+and significant variations in the power levels of their contributions. A novel
+approach to the removal of interference through the use of a probabilistic deep
+learning model is presented. Results show that a convolutional encoder-decoder
+neural network with a variational objective is capable of learning a meaningful
+representation space of the vital sign Doppler-time distribution, facilitating
+their extraction from a mixture signal. The approach is tested on
+semi-experimental data containing real vital sign signatures and simulated
+returns from interfering body motions. It is demonstrated that applying the
+proposed network enhances the extraction of the micro-Doppler frequency
+corresponding to the respiration rate.
+
+
+ comment: Presented at 2021 IEEE Radar Conference (RadarConf21) +
+
+
+
+
+ + ☆ Overcoming Scene Context Constraints for Object Detection in wild using + Defilters + + +
+ This paper focuses on improving object detection performance by addressing
+the issue of image distortions, commonly encountered in uncontrolled
+acquisition environments. High-level computer vision tasks such as object
+detection, recognition, and segmentation are particularly sensitive to image
+distortion. To address this issue, we propose a novel approach employing an
+image defilter to rectify image distortion prior to object detection. This
+method enhances object detection accuracy, as models perform optimally when
+trained on non-distorted images. Our experiments demonstrate that utilizing
+defiltered images significantly improves mean average precision compared to
+training object detection models on distorted images. Consequently, our
+proposed method offers considerable benefits for real-world applications
+plagued by image distortion. To our knowledge, the contribution lies in
+employing a distortion-removal paradigm for object detection on images captured
+in natural settings. We achieved improvements of 0.562 and 0.564 in mean
+average precision on the validation and test data, respectively.
+
+
+
+
+
+ + ☆ AdaContour: Adaptive Contour Descriptor with Hierarchical Representation + + +
+ Existing angle-based contour descriptors suffer from lossy representation for
+non-starconvex shapes. By and large, this is the result of the shape being
+registered with a single global inner center and a set of radii corresponding
+to a polar coordinate parameterization. In this paper, we propose AdaContour,
+an adaptive contour descriptor that uses multiple local representations to
+desirably characterize complex shapes. After hierarchically encoding object
+shapes in a training set and constructing a contour matrix of all subdivided
+regions, we compute a robust low-rank subspace and approximate each local
+contour by linearly combining the shared basis vectors to represent an object.
+Experiments show that AdaContour is able to represent shapes more accurately
+and robustly than other descriptors while retaining effectiveness. We validate
+AdaContour by integrating it into off-the-shelf detectors to enable instance
+segmentation, which demonstrates faithful performance. The code is available at
+https://github.com/tding1/AdaContour.
+
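+
+ A compact sketch of the shared low-rank contour basis idea (plain truncated
+ SVD here; the paper's robust low-rank recovery and hierarchical subdivision
+ are omitted, and the array shapes are assumptions):
+
+ import numpy as np
+
+ def fit_contour_basis(contours, rank=8):
+     """contours: (M, K) matrix, each row the K sampled radii of one local
+     contour. Returns the mean contour and a rank-r orthonormal basis."""
+     mean = contours.mean(axis=0)
+     _, _, vt = np.linalg.svd(contours - mean, full_matrices=False)
+     return mean, vt[:rank]                      # shapes (K,), (rank, K)
+
+ def encode_contour(radii, mean, basis):
+     """Approximate one local contour as a linear combination of the basis."""
+     coeffs = basis @ (radii - mean)             # projection onto the basis
+     return coeffs, mean + basis.T @ coeffs      # coefficients, reconstruction
+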
+
+
+
+
+ + ☆ On Input Formats for Radar Micro-Doppler Signature Processing by + Convolutional Neural Networks + + +
+ Convolutional neural networks have often been proposed for processing radar
+Micro-Doppler signatures, most commonly with the goal of classifying the
+signals. The majority of works tend to disregard phase information from the
+complex time-frequency representation. Here, the utility of the phase
+information, as well as the optimal format of the Doppler-time input for a
+convolutional neural network, is analysed. It is found that the performance
+achieved by convolutional neural network classifiers is heavily influenced by
+the type of input representation, even across formats with equivalent
+information. Furthermore, it is demonstrated that the phase component of the
+Doppler-time representation contains rich information useful for classification
+and that unwrapping the phase in the temporal dimension can improve the results
+compared to a magnitude-only solution, improving accuracy from 0.920 to 0.938
+on the tested human activity dataset. A further improvement to 0.947 is
+achieved by training a linear classifier on embeddings from multiple formats.
+
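+
+ The magnitude-plus-unwrapped-phase input format discussed above can be
+ assembled along these lines (a NumPy sketch; the channel ordering and the
+ assumption that axis 1 is the time axis are illustrative, not from the paper):
+
+ import numpy as np
+
+ def doppler_input_channels(stft):
+     """stft: complex (doppler_bins, time_frames) time-frequency map.
+     Returns a 3-channel real input: magnitude, raw phase, and phase
+     unwrapped along the temporal axis."""
+     mag = np.abs(stft)
+     phase = np.angle(stft)
+     phase_unwrapped = np.unwrap(phase, axis=1)   # unwrap over time frames
+     return np.stack([mag, phase, phase_unwrapped], axis=0)
+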
+
+ comment: Presented at International Conference on Radar Systems (RADAR 2022) +
+
+
+
+
+ + ☆ A Survey of Neural Network Robustness Assessment in Image Recognition + + +
+ In recent years, there has been significant attention given to the robustness +assessment of neural networks. Robustness plays a critical role in ensuring +reliable operation of artificial intelligence (AI) systems in complex and +uncertain environments. Deep learning's robustness problem is particularly +significant, highlighted by the discovery of adversarial attacks on image +classification models. Researchers have dedicated efforts to evaluate +robustness in diverse perturbation conditions for image recognition tasks. +Robustness assessment encompasses two main techniques: robustness verification/ +certification for deliberate adversarial attacks and robustness testing for +random data corruptions. In this survey, we present a detailed examination of +both adversarial robustness (AR) and corruption robustness (CR) in neural +network assessment. Analyzing current research papers and standards, we provide +an extensive overview of robustness assessment in image recognition. Three +essential aspects are analyzed: concepts, metrics, and assessment methods. We +investigate the perturbation metrics and range representations used to measure +the degree of perturbations on images, as well as the robustness metrics +specifically for the robustness conditions of classification models. The +strengths and limitations of the existing methods are also discussed, and some +potential directions for future research are provided. + +
+
+
+
+
+ + ☆ Calibration & Reconstruction: Deep Integrated Language for Referring + Image Segmentation ICMR2024 + + +
+ Referring image segmentation aims to segment an object referred to by a
+natural language expression from an image. The primary challenge lies in the
+efficient propagation of fine-grained semantic information from textual
+features to visual features. Many recent works utilize a Transformer to address
+this challenge. However, conventional transformer decoders can distort
+linguistic information with deeper layers, leading to suboptimal results. In
+this paper, we introduce CRFormer, a model that iteratively calibrates
+multi-modal features in the transformer decoder. We start by generating
+language queries using vision features, emphasizing different aspects of the
+input language. Then, we propose a novel Calibration Decoder (CDec) wherein the
+multi-modal features can be iteratively calibrated by the input language
+features. In the Calibration Decoder, we use the output of each decoder layer
+and the original language features to generate new queries for continuous
+calibration, which gradually updates the language features. Based on CDec, we
+introduce a Language Reconstruction Module and a reconstruction loss. This
+module leverages queries from the final layer of the decoder to reconstruct the
+input language and compute the reconstruction loss. This can further prevent
+the language information from being lost or distorted. Our experiments
+consistently show the superior performance of our approach across RefCOCO,
+RefCOCO+, and G-Ref datasets compared to state-of-the-art methods.
+
+
+ comment: 9 pages, 8 figures ICMR2024. arXiv admin note: text overlap with + arXiv:2305.14969 +
+
+
+
+
+ + ☆ Convolutional neural network classification of cancer cytopathology + images: taking breast cancer as an example + + +
+ Breast cancer is a relatively common cancer among gynecological cancers. Its
+diagnosis often relies on the pathology of cells in the lesion. The
+pathological diagnosis of breast cancer not only requires professionals and
+time, but also sometimes involves subjective judgment. To address the
+challenges of dependence on pathologists' expertise and the time-consuming
+nature of achieving accurate breast pathological image classification, this
+paper introduces an approach utilizing convolutional neural networks (CNNs) for
+the rapid categorization of pathological images, aiming to enhance the
+efficiency of breast pathological image detection. The approach enables the
+rapid and automatic classification of pathological images into benign and
+malignant groups. The methodology involves utilizing a convolutional neural
+network (CNN) model leveraging the Inceptionv3 architecture and a transfer
+learning algorithm for extracting features from pathological images. A neural
+network with fully connected layers and a SoftMax function is then employed for
+image classification. Additionally, the concept of image partitioning is
+introduced to handle high-resolution images. To achieve the ultimate
+classification outcome, the classification probabilities of each image block
+are aggregated using three algorithms: summation, product, and maximum.
+Experimental validation was conducted on the BreaKHis public dataset, resulting
+in accuracy rates surpassing 0.92 across all four magnification coefficients
+(40X, 100X, 200X, and 400X). This demonstrates that the proposed method
+effectively enhances the accuracy in classifying pathological images of breast
+cancer.
+
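+
+ The block-wise aggregation step (summation, product, maximum) can be sketched
+ as below; the class ordering and the numerical-stability epsilon are
+ assumptions, not part of the paper:
+
+ import numpy as np
+
+ def aggregate_patch_probs(patch_probs, rule="product"):
+     """patch_probs: (num_patches, num_classes) softmax outputs for the blocks
+     of one high-resolution image; returns the aggregated class index."""
+     if rule == "sum":
+         scores = patch_probs.sum(axis=0)
+     elif rule == "product":
+         scores = np.exp(np.log(patch_probs + 1e-12).sum(axis=0))  # stable product
+     elif rule == "max":
+         scores = patch_probs.max(axis=0)
+     else:
+         raise ValueError(rule)
+     return int(scores.argmax())      # e.g. 0 = benign, 1 = malignant (assumed)
+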
+
+
+
+
+ + ☆ FaceFilterSense: A Filter-Resistant Face Recognition and Facial + Attribute Analysis Framework + + +
+ With the advent of social media, fun selfie filters have come into tremendous
+mainstream use, affecting the functioning of facial biometric systems as well
+as image recognition systems. These filters vary from beautification filters
+and Augmented Reality (AR)-based filters to filters that modify facial
+landmarks. Hence, there is a need to assess the impact of such filters on the
+performance of existing face recognition systems. The limitation associated
+with existing solutions is that they focus more on the beautification filters.
+However, current AR-based filters and filters that distort facial key points
+have recently come into vogue and make faces highly unrecognizable even to the
+naked eye. Also, the filters considered are mostly obsolete, with limited
+variations. To mitigate these limitations, we aim to perform a holistic impact
+analysis of the latest filters and propose a user recognition model with the
+filtered images. We have utilized a benchmark dataset for baseline images, and
+applied the latest filters over them to generate a beautified/filtered dataset.
+Next, we have introduced a model, FaceFilterNet, for beautified user
+recognition. In this framework, we also utilize our model to comment on various
+attributes of the person including age, gender, and ethnicity. In addition, we
+have also presented a filter-wise impact analysis on face recognition, age
+estimation, gender, and ethnicity prediction. The proposed method affirms the
+efficacy of our dataset with an accuracy of 87.25% and an optimal accuracy for
+facial attribute analysis.
+
+
+
+
+
+ + ☆ Struggle with Adversarial Defense? Try Diffusion + + +
+ Adversarial attacks induce misclassification by introducing subtle
+perturbations. Recently, diffusion models have been applied to image
+classifiers to improve adversarial robustness, either through adversarial
+training or by purifying adversarial noise. However, diffusion-based
+adversarial training often encounters convergence challenges and high
+computational expenses. Additionally, diffusion-based purification inevitably
+causes data shift and is deemed susceptible to stronger adaptive attacks. To
+tackle these issues, we propose the Truth Maximization Diffusion Classifier
+(TMDC), a generative Bayesian classifier that builds upon pre-trained diffusion
+models and Bayes' theorem. Unlike data-driven classifiers, TMDC, guided by
+Bayesian principles, utilizes the conditional likelihood from diffusion models
+to determine the class probabilities of input images, thereby insulating
+against the influences of data shift and the limitations of adversarial
+training. Moreover, to enhance TMDC's resilience against more potent
+adversarial attacks, we propose an optimization strategy for diffusion
+classifiers. This strategy involves post-training the diffusion model on
+perturbed datasets with ground-truth labels as conditions, guiding the
+diffusion model to learn the data distribution and maximizing the likelihood
+under the ground-truth labels. The proposed method achieves state-of-the-art
+performance on the CIFAR10 dataset against heavy white-box attacks and strong
+adaptive attacks. Specifically, TMDC achieves robust accuracies of 82.81%
+against $l_{\infty}$ norm-bounded perturbations and 86.05% against $l_{2}$
+norm-bounded perturbations, respectively, with $\epsilon=0.05$.
+
+
+
+
+
+ + ☆ Guided Masked Self-Distillation Modeling for Distributed Multimedia + Sensor Event Analysis + + +
+ Observations with distributed sensors are essential in analyzing a series of +human and machine activities (referred to as 'events' in this paper) in complex +and extensive real-world environments. This is because the information obtained +from a single sensor is often missing or fragmented in such an environment; +observations from multiple locations and modalities should be integrated to +analyze events comprehensively. However, a learning method has yet to be +established to extract joint representations that effectively combine such +distributed observations. Therefore, we propose Guided Masked sELf-Distillation +modeling (Guided-MELD) for inter-sensor relationship modeling. The basic idea +of Guided-MELD is to learn to supplement the information from the masked sensor +with information from other sensors needed to detect the event. Guided-MELD is +expected to enable the system to effectively distill the fragmented or +redundant target event information obtained by the sensors without being overly +dependent on any specific sensors. To validate the effectiveness of the +proposed method in novel tasks of distributed multimedia sensor event analysis, +we recorded two new datasets that fit the problem setting: MM-Store and +MM-Office. These datasets consist of human activities in a convenience store +and an office, recorded using distributed cameras and microphones. Experimental +results on these datasets show that the proposed Guided-MELD improves event +tagging and detection performance and outperforms conventional inter-sensor +relationship modeling methods. Furthermore, the proposed method performed +robustly even when sensors were reduced. + +
+
+ comment: 13 pages, 7 figures, under review
+
+
+
+
+
+ + ☆ Practical Region-level Attack against Segment Anything Models + + +
+ Segment Anything Models (SAM) have made significant advancements in image +segmentation, allowing users to segment target portions of an image with a +single click (i.e., user prompt). Given its broad applications, the robustness +of SAM against adversarial attacks is a critical concern. While recent works +have explored adversarial attacks against a pre-defined prompt/click, their +threat model is not yet realistic: (1) they often assume the user-click +position is known to the attacker (point-based attack), and (2) they often +operate under a white-box setting with limited transferability. In this paper, +we propose a more practical region-level attack where attackers do not need to +know the precise user prompt. The attack remains effective as the user clicks +on any point on the target object in the image, hiding the object from SAM. +Also, by adapting a spectrum transformation method, we make the attack more +transferable under a black-box setting. Both control experiments and testing +against real-world SAM services confirm its effectiveness. + +
+
+
+
+
+ + ☆ MonoPatchNeRF: Improving Neural Radiance Fields with Patch-based + Monocular Guidance + + +
+ The latest regularized Neural Radiance Field (NeRF) approaches produce poor
+geometry and view extrapolation for multiview stereo (MVS) benchmarks such as
+ETH3D. In this paper, we aim to create 3D models that provide accurate geometry
+and view synthesis, partially closing the large geometric performance gap
+between NeRF and traditional MVS methods. We propose a patch-based approach
+that effectively leverages monocular surface normal and relative depth
+predictions. The patch-based ray sampling also enables the appearance
+regularization of normalized cross-correlation (NCC) and structural similarity
+(SSIM) between randomly sampled virtual and training views. We further show
+that "density restrictions" based on sparse structure-from-motion points can
+help greatly improve geometric accuracy with a slight drop in novel view
+synthesis metrics. Our experiments show 4x the performance of RegNeRF and 8x
+that of FreeNeRF on average F1@2cm for the ETH3D MVS benchmark, suggesting a
+fruitful research direction for improving the geometric accuracy of NeRF-based
+models and shedding light on a potential future approach to enable NeRF-based
+optimization to eventually outperform traditional MVS.
+
+
+ comment: 26 pages, 15 figures +
+
+
+
+
+ + ☆ Simulation of a Vision Correction Display System + + +
+ Eyes serve as our primary sensory organs, responsible for processing up to
+80% of our sensory input. However, common visual aberrations like myopia and
+hyperopia affect a significant portion of the global population. This paper
+focuses on simulating a Vision Correction Display (VCD) to enhance the visual
+experience of individuals with various visual impairments. Utilising Blender,
+we digitally model the functionality of a VCD in correcting refractive errors
+such as myopia and hyperopia. With these simulations we can see potential
+improvements in visual acuity and comfort. These simulations provide valuable
+insights for the design and development of future VCD technologies, ultimately
+advancing accessibility and usability for individuals with visual challenges.
+
+
+
+
+
+ + ☆ IFViT: Interpretable Fixed-Length Representation for Fingerprint + Matching via Vision Transformer + + +
+ Determining dense feature points on fingerprints used in constructing deep +fixed-length representations for accurate matching, particularly at the pixel +level, is of significant interest. To explore the interpretability of +fingerprint matching, we propose a multi-stage interpretable fingerprint +matching network, namely Interpretable Fixed-length Representation for +Fingerprint Matching via Vision Transformer (IFViT), which consists of two +primary modules. The first module, an interpretable dense registration module, +establishes a Vision Transformer (ViT)-based Siamese Network to capture +long-range dependencies and the global context in fingerprint pairs. It +provides interpretable dense pixel-wise correspondences of feature points for +fingerprint alignment and enhances the interpretability in the subsequent +matching stage. The second module takes into account both local and global +representations of the aligned fingerprint pair to achieve an interpretable +fixed-length representation extraction and matching. It employs the ViTs +trained in the first module with the additional fully connected layer and +retrains them to simultaneously produce the discriminative fixed-length +representation and interpretable dense pixel-wise correspondences of feature +points. Extensive experimental results on diverse publicly available +fingerprint databases demonstrate that the proposed framework not only exhibits +superior performance on dense registration and matching but also significantly +promotes the interpretability in deep fixed-length representations-based +fingerprint matching. + +
+
+ comment: ready to submit to IEEE Transactions on Information Forensics and + Security (TIFS) +
+
+
+
+
+ + ☆ Enhancing Traffic Safety with Parallel Dense Video Captioning for + End-to-End Event Analysis + + +
+ This paper introduces our solution for Track 2 in AI City Challenge 2024. The
+task aims to solve traffic safety description and analysis with the dataset of
+Woven Traffic Safety (WTS), a real-world Pedestrian-Centric Traffic Video
+Dataset for Fine-grained Spatial-Temporal Understanding. Our solution mainly
+focuses on the following points: 1) To solve dense video captioning, we
+leverage the framework of dense video captioning with parallel decoding (PDVC)
+to model visual-language sequences and generate dense captions by chapter for
+each video. 2) Our work leverages CLIP to extract visual features to more
+efficiently perform cross-modality training between visual and textual
+representations. 3) We conduct domain-specific model adaptation to mitigate the
+domain shift problem that poses a recognition challenge in video understanding.
+4) Moreover, we leverage BDD-5K captioned videos to conduct knowledge transfer
+for better understanding of WTS videos and more accurate captioning. Our
+solution achieved 6th place in the competition on the test set. The open-source
+code will be available at https://github.com/UCF-SST-Lab/AICity2024CVPRW
+
+
+
+
+
+ + ☆ Improving Continuous Sign Language Recognition with Adapted Image Models + + +
+ The increase of web-scale weakly labelled image-text pairs has greatly
+facilitated the development of large-scale vision-language models (e.g., CLIP),
+which have shown impressive generalization performance over a series of
+downstream tasks. However, the massive model size and scarcity of available
+data limit the practicality of fine-tuning the whole model in downstream tasks.
+Besides, fully fine-tuning the model tends to forget the generic essential
+knowledge acquired in the pretraining stage and to overfit the downstream data.
+To enable high efficiency when adapting these large vision-language models
+(e.g., CLIP) to continuous sign language recognition (CSLR) while preserving
+their generalizability, we propose a novel strategy (AdaptSign). Specifically,
+CLIP is adopted as the visual backbone to extract frame-wise features; its
+parameters are kept fixed, and a set of learnable modules is introduced to
+model spatial sign variations or capture temporal sign movements. The
+introduced additional modules are quite lightweight, adding only 3.2% extra
+computation. The generic knowledge acquired in the pretraining stage is
+well-preserved in the frozen CLIP backbone in this process. Extensive
+experiments show that despite being efficient, AdaptSign is able to demonstrate
+superior performance across a series of CSLR benchmarks including PHOENIX14,
+PHOENIX14-T, CSL-Daily and CSL compared to existing methods. Visualizations
+show that AdaptSign learns to dynamically pay major attention to the
+informative spatial regions and cross-frame trajectories in sign videos.
+
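+
+ A stripped-down sketch of the frozen-backbone-plus-lightweight-adapter pattern
+ described above (the linear stand-in for the CLIP image encoder, the feature
+ size, and the bottleneck width are assumptions; AdaptSign's actual
+ spatial/temporal modules are more elaborate):
+
+ import torch
+ import torch.nn as nn
+
+ class FrameAdapter(nn.Module):
+     """Lightweight bottleneck adapter applied on top of frozen frame features."""
+     def __init__(self, dim=512, hidden=64):
+         super().__init__()
+         self.down, self.up = nn.Linear(dim, hidden), nn.Linear(hidden, dim)
+     def forward(self, x):                       # x: (B, T, dim) frame features
+         return x + self.up(torch.relu(self.down(x)))   # residual adaptation
+
+ backbone = nn.Sequential(nn.Linear(768, 512))   # stand-in for a CLIP image encoder
+ for p in backbone.parameters():
+     p.requires_grad = False                     # generic knowledge stays frozen
+ adapter = FrameAdapter()
+ trainable = sum(p.numel() for p in adapter.parameters() if p.requires_grad)
+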
+
+
+
+
+ + ☆ A Mutual Inclusion Mechanism for Precise Boundary Segmentation in + Medical Images + + +
+ In medical imaging, accurate image segmentation is crucial for quantifying +diseases, assessing prognosis, and evaluating treatment outcomes. However, +existing methods lack an in-depth integration of global and local features, +failing to pay special attention to abnormal regions and boundary details in +medical images. To this end, we present a novel deep learning-based approach, +MIPC-Net, for precise boundary segmentation in medical images. Our approach, +inspired by radiologists' working patterns, features two distinct modules: (i) +\textbf{Mutual Inclusion of Position and Channel Attention (MIPC) module}: To +enhance the precision of boundary segmentation in medical images, we introduce +the MIPC module, which enhances the focus on channel information when +extracting position features and vice versa; (ii) \textbf{GL-MIPC-Residue}: To +improve the restoration of medical images, we propose the GL-MIPC-Residue, a +global residual connection that enhances the integration of the encoder and +decoder by filtering out invalid information and restoring the most effective +information lost during the feature extraction process. We evaluate the +performance of the proposed model using metrics such as Dice coefficient (DSC) +and Hausdorff Distance (HD) on three publicly accessible datasets: Synapse, +ISIC2018-Task, and Segpc. Our ablation study shows that each module contributes +to improving the quality of segmentation results. Furthermore, with the +assistance of both modules, our approach outperforms state-of-the-art methods +across all metrics on the benchmark datasets, notably achieving a 2.23mm +reduction in HD on the Synapse dataset, strongly evidencing our model's +enhanced capability for precise image boundary segmentation. Codes will be +available at https://github.com/SUN-1024/MIPC-Net. + +
+
+
+
+
+ + ☆ Scaling (Down) CLIP: A Comprehensive Analysis of Data, Architecture, and + Training Strategies + + +
+ This paper investigates the performance of the Contrastive Language-Image +Pre-training (CLIP) when scaled down to limited computation budgets. We explore +CLIP along three dimensions: data, architecture, and training strategies. With +regards to data, we demonstrate the significance of high-quality training data +and show that a smaller dataset of high-quality data can outperform a larger +dataset with lower quality. We also examine how model performance varies with +different dataset sizes, suggesting that smaller ViT models are better suited +for smaller datasets, while larger models perform better on larger datasets +with fixed compute. Additionally, we provide guidance on when to choose a +CNN-based architecture or a ViT-based architecture for CLIP training. We +compare four CLIP training strategies - SLIP, FLIP, CLIP, and CLIP+Data +Augmentation - and show that the choice of training strategy depends on the +available compute resource. Our analysis reveals that CLIP+Data Augmentation +can achieve comparable performance to CLIP using only half of the training +data. This work provides practical insights into how to effectively train and +deploy CLIP models, making them more accessible and affordable for practical +use in various applications. + +
+
+
+
+
+ + ☆ Tackling Ambiguity from Perspective of Uncertainty Inference and + Affinity Diversification for Weakly Supervised Semantic Segmentation + + +
+ Weakly supervised semantic segmentation (WSSS) with image-level labels aims
+to achieve dense prediction tasks without laborious annotations. However, due
+to the ambiguous contexts and fuzzy regions, the performance of WSSS,
+especially the stages of generating Class Activation Maps (CAMs) and refining
+pseudo masks, widely suffers from ambiguity while being barely noticed in
+previous literature. In this work, we propose UniA, a unified single-staged
+WSSS framework, to efficiently tackle this issue from the perspectives of
+uncertainty inference and affinity diversification, respectively. When
+activating class objects, we argue that the false activation stems from the
+bias to the ambiguous regions during the feature extraction. Therefore, we
+design a more robust feature representation with a probabilistic Gaussian
+distribution and introduce uncertainty estimation to avoid the bias. A
+distribution loss is particularly proposed to supervise the process, which
+effectively captures the ambiguity and models the complex dependencies among
+features. When refining pseudo labels, we observe that the affinity from the
+prevailing refinement methods tends to be similar among ambiguous regions. To
+this end, an affinity diversification module is proposed to promote diversity
+among semantics. A mutual complementing refinement is proposed to initially
+rectify the ambiguous affinity with multiple inferred pseudo labels. More
+importantly, a contrastive affinity loss is further designed to diversify the
+relations among unrelated semantics, which reliably propagates the diversity
+into the whole feature representations and helps generate better pseudo masks.
+Extensive experiments are conducted on PASCAL VOC, MS COCO, and medical ACDC
+datasets, which validate the efficiency of UniA in tackling ambiguity and its
+superiority over recent single-staged or even most multi-staged competitors.
+
+
+
+
+
+ + ☆ Adapting CNNs for Fisheye Cameras without Retraining + + +
+ The majority of image processing approaches assume images are in or can be
+rectified to a perspective projection. However, in many applications it is
+beneficial to use non-conventional cameras, such as fisheye cameras, that have
+a larger field of view (FOV). The issue arises that these large-FOV images
+cannot be rectified to a perspective projection without significant cropping of
+the original image. To address this issue we propose Rectified Convolutions
+(RectConv), a new approach for adapting pre-trained convolutional networks to
+operate with new non-perspective images, without any retraining. Replacing the
+convolutional layers of the network with RectConv layers allows the network to
+see both rectified patches and the entire FOV. We demonstrate RectConv adapting
+multiple pre-trained networks to perform segmentation and detection on fisheye
+imagery from two publicly available datasets. Our approach requires no
+additional data or training, and operates directly on the native image as
+captured from the camera. We believe this work is a step toward adapting the
+vast resources available for perspective images to operate across a broad range
+of camera geometries.
+
+
+ comment: Project page: https://roboticimaging.org/Projects/RectConv/ +
+
+
+
+
+ + ☆ Measuring Domain Shifts using Deep Learning Remote Photoplethysmography + Model Similarity + + +
+ Domain shift differences between training data for deep learning models and
+the deployment context can result in severe performance issues for models which
+fail to generalize. We study the domain shift problem under the context of
+remote photoplethysmography (rPPG), a technique for video-based heart rate
+inference. We propose metrics based on model similarity which may be used as a
+measure of domain shift, and we demonstrate high correlation between these
+metrics and empirical performance. One of the proposed metrics with viable
+correlations, DS-diff, does not assume access to the ground truth of the target
+domain, i.e. it may be applied to in-the-wild data. To that end, we investigate
+a model selection problem in which ground truth results for the evaluation
+domain are not known, demonstrating a 13.9% performance improvement over the
+average case baseline.
+
+
+
+
+
+ + ☆ Pay Attention to Your Neighbours: Training-Free Open-Vocabulary Semantic + Segmentation + + +
+ Despite the significant progress in deep learning for dense visual +recognition problems, such as semantic segmentation, traditional methods are +constrained by fixed class sets. Meanwhile, vision-language foundation models, +such as CLIP, have showcased remarkable effectiveness in numerous zero-shot +image-level tasks, owing to their robust generalizability. Recently, a body of +work has investigated utilizing these models in open-vocabulary semantic +segmentation (OVSS). However, existing approaches often rely on impractical +supervised pre-training or access to additional pre-trained networks. In this +work, we propose a strong baseline for training-free OVSS, termed +Neighbour-Aware CLIP (NACLIP), representing a straightforward adaptation of +CLIP tailored for this scenario. Our method enforces localization of patches in +the self-attention of CLIP's vision transformer which, despite being crucial +for dense prediction tasks, has been overlooked in the OVSS literature. By +incorporating design choices favouring segmentation, our approach significantly +improves performance without requiring additional data, auxiliary pre-trained +networks, or extensive hyperparameter tuning, making it highly practical for +real-world applications. Experiments are performed on 8 popular semantic +segmentation benchmarks, yielding state-of-the-art performance on most +scenarios. Our code is publicly available at https://github.com/sinahmr/NACLIP . + +
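+
+ The neighbour-aware idea can be illustrated with a toy self-attention that
+ adds a Gaussian spatial prior over the patch grid (a sketch only; NACLIP
+ modifies CLIP's existing attention rather than defining a new layer, and the
+ scaling and sigma values here are assumptions):
+
+ import torch
+
+ def neighbourhood_attention(q, k, v, grid_hw, sigma=1.0):
+     """Self-attention with an additive spatial prior favouring neighbouring
+     patches. q, k, v: (N, D) patch tokens arranged on an h x w grid."""
+     h, w = grid_hw
+     ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
+     coords = torch.stack([ys.flatten(), xs.flatten()], dim=1).float()  # (N, 2)
+     dist2 = torch.cdist(coords, coords).pow(2)
+     prior = -dist2 / (2 * sigma ** 2)                 # Gaussian log-prior
+     logits = q @ k.t() / q.shape[-1] ** 0.5 + prior
+     return torch.softmax(logits, dim=-1) @ v
+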
+
+
+
+
+ + ♻ ☆ LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal + Models + + +
+ Large Multimodal Models (LMMs) have shown significant reasoning capabilities +by connecting a visual encoder and a large language model. LMMs typically use a +fixed amount of visual tokens, such as the penultimate layer features in the +CLIP visual encoder, as the prefix content. Recent LMMs incorporate more +complex visual inputs, such as high-resolution images and videos, which +increase the number of visual tokens significantly. However, due to the design +of the Transformer architecture, computational costs associated with these +models tend to increase quadratically with the number of input tokens. To +tackle this problem, we explore a token reduction mechanism and find, similar +to prior work, that many visual tokens are spatially redundant. Based on this, +we propose PruMerge, a novel adaptive visual token reduction approach, which +largely reduces the number of visual tokens while maintaining comparable model +performance. We first select the unpruned visual tokens based on their +similarity to class tokens and spatial tokens. We then cluster the pruned +tokens based on key similarity and merge the clustered tokens with the unpruned +tokens to supplement their information. Empirically, when applied to LLaVA-1.5, +our approach can compress the visual tokens by 18 times on average, and achieve +comparable performance across diverse visual question-answering and reasoning +tasks. Code and checkpoints are at https://llava-prumerge.github.io/. + +
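+
+ A simplified sketch of the prune-then-merge idea (similarity to the class
+ token stands in for the paper's key-similarity criterion, the fixed `keep`
+ budget replaces its adaptive selection, and the averaging rule is an
+ assumption):
+
+ import torch
+
+ def prune_and_merge(tokens, cls_token, keep=32):
+     """tokens: (N, D) visual tokens, cls_token: (D,). Keep the tokens most
+     similar to the class token and fold each pruned token into its nearest
+     kept token by averaging."""
+     sim = torch.nn.functional.cosine_similarity(tokens, cls_token[None, :], dim=-1)
+     keep_idx = sim.topk(keep).indices
+     kept = tokens[keep_idx].clone()
+     pruned_mask = torch.ones(tokens.shape[0], dtype=torch.bool)
+     pruned_mask[keep_idx] = False
+     pruned = tokens[pruned_mask]
+     if pruned.numel():
+         assign = torch.cdist(pruned, kept).argmin(dim=1)    # nearest kept token
+         for j in range(keep):
+             members = pruned[assign == j]
+             if members.numel():
+                 kept[j] = (kept[j] + members.mean(0)) / 2   # merge information
+     return kept
+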
+
+ comment: Project page: https://llava-prumerge.github.io/ +
+
+
+
+
+ + ♻ ☆ FloCoDe: Unbiased Dynamic Scene Graph Generation with Temporal + Consistency and Correlation Debiasing CVPR 2024 + + +
+ Dynamic scene graph generation (SGG) from videos requires not only a +comprehensive understanding of objects across scenes but also a method to +capture the temporal motions and interactions with different objects. Moreover, +the long-tailed distribution of visual relationships is a crucial bottleneck +for most dynamic SGG methods. This is because many of them focus on capturing +spatio-temporal context using complex architectures, leading to the generation +of biased scene graphs. To address these challenges, we propose FloCoDe: +Flow-aware Temporal Consistency and Correlation Debiasing with uncertainty +attenuation for unbiased dynamic scene graphs. FloCoDe employs feature warping +using flow to detect temporally consistent objects across frames. To address +the long-tail issue of visual relationships, we propose correlation debiasing +and a label correlation-based loss to learn unbiased relation representations +for long-tailed classes. Specifically, we propose to incorporate label +correlations using contrastive loss to capture commonly co-occurring relations, +which aids in learning robust representations for long-tailed classes. Further, +we adopt the uncertainty attenuation-based classifier framework to handle noisy +annotations in the SGG data. Extensive experimental evaluation shows a +performance gain as high as 4.1%, demonstrating the superiority of generating +more unbiased scene graphs. + +
+
+ comment: Accepted at CVPR 2024 SG2RL, 11 pages, 5 tables, 4 figures +
+
+
+
+
+ + ♻ ☆ PromptSync: Bridging Domain Gaps in Vision-Language Models through + Class-Aware Prototype Alignment and Discrimination CVPR 2024 + + +
+ The potential for zero-shot generalization in vision-language (V-L) models +such as CLIP has spurred their widespread adoption in addressing numerous +downstream tasks. Previous methods have employed test-time prompt tuning to +adapt the model to unseen domains, but they overlooked the issue of imbalanced +class distributions. In this study, we explicitly address this problem by +employing class-aware prototype alignment weighted by mean class probabilities +obtained for the test sample and filtered augmented views. Additionally, we +ensure that the class probabilities are as accurate as possible by performing +prototype discrimination using contrastive learning. The combination of +alignment and discriminative loss serves as a geometric regularizer, preventing +the prompt representation from collapsing onto a single class and effectively +bridging the distribution gap between the source and test domains. Our method, +named PromptSync, synchronizes the prompts for each test sample on both the +text and vision branches of the V-L model. In empirical evaluations on the +domain generalization benchmark, our method outperforms previous best methods +by 2.33% in overall performance, by 1% in base-to-novel generalization, and by +2.84% in cross-dataset transfer tasks. + +
+
+ comment: Accepted at CVPR 2024 LIMIT, 12 pages, 8 Tables, 2 Figures +
+
+
+
+
+ + ♻ ☆ WonderJourney: Going from Anywhere to Everywhere + + +
+ We introduce WonderJourney, a modularized framework for perpetual 3D scene +generation. Unlike prior work on view generation that focuses on a single type +of scenes, we start at any user-provided location (by a text description or an +image) and generate a journey through a long sequence of diverse yet coherently +connected 3D scenes. We leverage an LLM to generate textual descriptions of the +scenes in this journey, a text-driven point cloud generation pipeline to make a +compelling and coherent sequence of 3D scenes, and a large VLM to verify the +generated scenes. We show compelling, diverse visual results across various +scene types and styles, forming imaginary "wonderjourneys". Project website: +https://kovenyu.com/WonderJourney/ + +
+
+ comment: Project website with video results: + https://kovenyu.com/WonderJourney/ +
+
+
+
+
+ + ♻ ☆ ProbMCL: Simple Probabilistic Contrastive Learning for Multi-label + Visual Classification ICASSP 2024 + + +
+ Multi-label image classification presents a challenging task in many domains, +including computer vision and medical imaging. Recent advancements have +introduced graph-based and transformer-based methods to improve performance and +capture label dependencies. However, these methods often include complex +modules that entail heavy computation and lack interpretability. In this paper, +we propose Probabilistic Multi-label Contrastive Learning (ProbMCL), a novel +framework to address these challenges in multi-label image classification +tasks. Our simple yet effective approach employs supervised contrastive +learning, in which samples that share enough labels with an anchor image based +on a decision threshold are introduced as a positive set. This structure +captures label dependencies by pulling positive pair embeddings together and +pushing away negative samples that fall below the threshold. We enhance +representation learning by incorporating a mixture density network into +contrastive learning and generating Gaussian mixture distributions to explore +the epistemic uncertainty of the feature encoder. We validate the effectiveness +of our framework through experimentation with datasets from the computer vision +and medical imaging domains. Our method outperforms the existing +state-of-the-art methods while achieving a low computational footprint on both +datasets. Visualization analyses also demonstrate that ProbMCL-learned +classifiers maintain a meaningful semantic topology. + +
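+
+ The label-overlap positive set and the contrastive pull/push can be sketched
+ as below (the overlap-threshold rule is a simplification of the paper's
+ decision threshold, and the mixture-density / uncertainty component is
+ omitted):
+
+ import torch
+ import torch.nn.functional as F
+
+ def label_overlap_contrastive(z, labels, thresh=0.5, tau=0.1):
+     """z: (B, D) embeddings, labels: (B, C) multi-hot label matrix.
+     Samples sharing at least `thresh` of the anchor's labels are positives."""
+     z = F.normalize(z, dim=-1)
+     labels = labels.float()
+     shared = labels @ labels.t()                          # counts of shared labels
+     overlap = shared / labels.sum(dim=1, keepdim=True).clamp(min=1)
+     pos = (overlap >= thresh).float().fill_diagonal_(0)   # positive-pair mask
+     logits = (z @ z.t() / tau).fill_diagonal_(-1e9)       # exclude self-pairs
+     log_prob = logits - torch.logsumexp(logits, dim=1, keepdim=True)
+     return -(pos * log_prob).sum(1).div(pos.sum(1).clamp(min=1)).mean()
+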
+
+ comment: This paper has been accepted for the ICASSP 2024 - 2024 IEEE + International Conference on Acoustics, Speech and Signal Processing (ICASSP) +
+
+
+
+
+ + ♻ ☆ A Change Detection Reality Check + + +
+ In recent years, there has been an explosion of proposed change detection
+deep learning architectures in the remote sensing literature. These approaches
+claim to offer state-of-the-art performance on different standard benchmark
+datasets. However, has the field truly made significant progress? In this paper
+we perform experiments which show that a simple U-Net segmentation baseline,
+without training tricks or complicated architectural changes, is still a top
+performer for the task of change detection.
+
+
+
+
+
+ + ♻ ☆ Generalization in diffusion models arises from geometry-adaptive + harmonic representations ICLR + + +
+ Deep neural networks (DNNs) trained for image denoising are able to generate +high-quality samples with score-based reverse diffusion algorithms. These +impressive capabilities seem to imply an escape from the curse of +dimensionality, but recent reports of memorization of the training set raise +the question of whether these networks are learning the "true" continuous +density of the data. Here, we show that two DNNs trained on non-overlapping +subsets of a dataset learn nearly the same score function, and thus the same +density, when the number of training images is large enough. In this regime of +strong generalization, diffusion-generated images are distinct from the +training set, and are of high visual quality, suggesting that the inductive +biases of the DNNs are well-aligned with the data density. We analyze the +learned denoising functions and show that the inductive biases give rise to a +shrinkage operation in a basis adapted to the underlying image. Examination of +these bases reveals oscillating harmonic structures along contours and in +homogeneous regions. We demonstrate that trained denoisers are inductively +biased towards these geometry-adaptive harmonic bases since they arise not only +when the network is trained on photographic images, but also when it is trained +on image classes supported on low-dimensional manifolds for which the harmonic +basis is suboptimal. Finally, we show that when trained on regular image +classes for which the optimal basis is known to be geometry-adaptive and +harmonic, the denoising performance of the networks is near-optimal. + +
+
+ comment: Accepted for oral presentation at ICLR, Vienna, May 2024 +
+
+
+
+
+ + ♻ ☆ A novel Fourier neural operator framework for classification of + multi-sized images: Application to three dimensional digital porous media + + +
+ Fourier neural operators (FNOs) are invariant with respect to the size of +input images, and thus images with any size can be fed into FNO-based +frameworks without any modification of network architectures, in contrast to +traditional convolutional neural networks (CNNs). Leveraging the advantage of +FNOs, we propose a novel deep-learning framework for classifying images with +varying sizes. Particularly, we simultaneously train the proposed network on +multi-sized images. As a practical application, we consider the problem of +predicting the label (e.g., permeability) of three-dimensional digital porous +media. To construct the framework, an intuitive approach is to connect FNO +layers to a classifier using adaptive max pooling. First, we show that this +approach is only effective for porous media with fixed sizes, whereas it fails +for porous media of varying sizes. To overcome this limitation, we introduce +our approach: instead of using adaptive max pooling, we use static max pooling +with the size of channel width of FNO layers. Since the channel width of the +FNO layers is independent of input image size, the introduced framework can +handle multi-sized images during training. We show the effectiveness of the +introduced framework and compare its performance with the intuitive approach +through the example of the classification of three-dimensional digital porous +media of varying sizes. + +
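+
+ The size-independent classification head described above boils down to
+ pooling every spatial dimension away before the classifier, as in this sketch
+ (the channel count, the 3D input shape, and the class count are assumptions;
+ the FNO layers themselves are omitted):
+
+ import torch
+ import torch.nn as nn
+
+ class SizeInvariantHead(nn.Module):
+     """Collapse variable-sized feature maps to a fixed-length vector by
+     pooling over all spatial dimensions, so the classifier never sees the
+     input resolution."""
+     def __init__(self, channels=32, num_classes=2):
+         super().__init__()
+         self.fc = nn.Linear(channels, num_classes)
+     def forward(self, feats):                 # feats: (B, C, D, H, W), any D/H/W
+         pooled = feats.amax(dim=(2, 3, 4))    # (B, C): one value per channel
+         return self.fc(pooled)
+
+ # head(torch.randn(4, 32, 64, 64, 64)) and head(torch.randn(4, 32, 80, 80, 80))
+ # both yield (4, num_classes) logits, so multi-sized samples can share one head.
+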
+
+
+
+
+ + ♻ ☆ View-Consistent 3D Editing with Gaussian Splatting + + +
+ The advent of 3D Gaussian Splatting (3DGS) has revolutionized 3D editing, +offering efficient, high-fidelity rendering and enabling precise local +manipulations. Currently, diffusion-based 2D editing models are harnessed to +modify multi-view rendered images, which then guide the editing of 3DGS models. +However, this approach faces a critical issue of multi-view inconsistency, +where the guidance images exhibit significant discrepancies across views, +leading to mode collapse and visual artifacts of 3DGS. To this end, we +introduce View-consistent Editing (VcEdit), a novel framework that seamlessly +incorporates 3DGS into image editing processes, ensuring multi-view consistency +in edited guidance images and effectively mitigating mode collapse issues. +VcEdit employs two innovative consistency modules: the Cross-attention +Consistency Module and the Editing Consistency Module, both designed to reduce +inconsistencies in edited images. By incorporating these consistency modules +into an iterative pattern, VcEdit proficiently resolves the issue of multi-view +inconsistency, facilitating high-quality 3DGS editing across a diverse range of +scenes. + +
+
+ comment: 25 pages +
+
+
+
+
+ + ♻ ☆ SpikeNVS: Enhancing Novel View Synthesis from Blurry Images via Spike + Camera + + +
+ One of the most critical factors in achieving sharp Novel View Synthesis
+(NVS) using neural field methods like Neural Radiance Fields (NeRF) and 3D
+Gaussian Splatting (3DGS) is the quality of the training images. However,
+conventional RGB cameras are susceptible to motion blur. In contrast,
+neuromorphic cameras like event and spike cameras inherently capture more
+comprehensive temporal information, which can provide a sharp representation of
+the scene as additional training data. Recent methods have explored the
+integration of event cameras to improve the quality of NVS. The event-RGB
+approaches have some limitations, such as high training costs and the inability
+to work effectively in the background. Instead, our study introduces a new
+method that uses the spike camera to overcome these limitations. By considering
+texture reconstruction from spike streams as ground truth, we design the
+Texture from Spike (TfS) loss. Since the spike camera relies on temporal
+integration instead of the temporal differentiation used by event cameras, our
+proposed TfS loss maintains manageable training costs. It handles foreground
+objects and backgrounds simultaneously. We also provide a real-world dataset
+captured with our spike-RGB camera system to facilitate future research
+endeavors. We conduct extensive experiments using synthetic and real-world
+datasets to demonstrate that our design can enhance novel view synthesis across
+NeRF and 3DGS. The code and dataset will be made available for public access.
+
+
+
+
+
+ + ♻ ☆ Identifying Important Group of Pixels using Interactions CVPR 2024 + + +
+ To better understand the behavior of image classifiers, it is useful to +visualize the contribution of individual pixels to the model prediction. In +this study, we propose a method, MoXI ($\textbf{Mo}$del e$\textbf{X}$planation +by $\textbf{I}$nteractions), that efficiently and accurately identifies a group +of pixels with high prediction confidence. The proposed method employs +game-theoretic concepts, Shapley values and interactions, taking into account +the effects of individual pixels and the cooperative influence of pixels on +model confidence. Theoretical analysis and experiments demonstrate that our +method better identifies the pixels that are highly contributing to the model +outputs than widely-used visualization by Grad-CAM, Attention rollout, and +Shapley value. While prior studies have suffered from the exponential +computational cost in the computation of Shapley value and interactions, we +show that this can be reduced to quadratic cost for our task. The code is +available at https://github.com/KosukeSumiyasu/MoXI. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ FoodLMM: A Versatile Food Assistant using Large Multi-modal Model + + +
+ Large Multi-modal Models (LMMs) have made impressive progress in many +vision-language tasks. Nevertheless, the performance of general LMMs in +specific domains is still far from satisfactory. This paper proposes FoodLMM, a +versatile food assistant based on LMMs with various capabilities, including +food recognition, ingredient recognition, recipe generation, nutrition +estimation, food segmentation and multi-round conversation. To facilitate +FoodLMM to deal with tasks beyond pure text output, we introduce a series of +novel task-specific tokens and heads, enabling the model to predict food +nutritional values and multiple segmentation masks. We adopt a two-stage +training strategy. In the first stage, we utilize multiple public food +benchmarks for multi-task learning by leveraging the instruct-following +paradigm. In the second stage, we construct a multi-round conversation dataset +and a reasoning segmentation dataset to fine-tune the model, enabling it to +conduct professional dialogues and generate segmentation masks based on complex +reasoning in the food domain. Our fine-tuned FoodLMM achieves state-of-the-art +results across several food benchmarks. We will make our code, models and +datasets publicly available. + +
+
+
+
+
+ + ♻ ☆ Transformer based Pluralistic Image Completion with Reduced Information + Loss + + +
+ Transformer based methods have achieved great success in image inpainting +recently. However, we find that these solutions regard each pixel as a token, +thus suffering from an information loss issue from two aspects: 1) They +downsample the input image into much lower resolutions for efficiency +consideration. 2) They quantize $256^3$ RGB values to a small number (such as +512) of quantized color values. The indices of quantized pixels are used as +tokens for the inputs and prediction targets of the transformer. To mitigate +these issues, we propose a new transformer based framework called "PUT". +Specifically, to avoid input downsampling while maintaining computation +efficiency, we design a patch-based auto-encoder P-VQVAE. The encoder converts +the masked image into non-overlapped patch tokens and the decoder recovers the +masked regions from the inpainted tokens while keeping the unmasked regions +unchanged. To eliminate the information loss caused by input quantization, an +Un-quantized Transformer is applied. It directly takes features from the +P-VQVAE encoder as input without any quantization and only regards the +quantized tokens as prediction targets. Furthermore, to make the inpainting +process more controllable, we introduce semantic and structural conditions as +extra guidance. Extensive experiments show that our method greatly outperforms +existing transformer based methods on image fidelity and achieves much higher +diversity and better fidelity than state-of-the-art pluralistic inpainting +methods on complex large-scale datasets (e.g., ImageNet). Codes are available +at https://github.com/liuqk3/PUT. + +
+
+ comment: Accepted by TPAMI (2024). arXiv admin note: text overlap with + arXiv:2205.05076 +
+
+
+
+
+ + ♻ ☆ WildFusion: Learning 3D-Aware Latent Diffusion Models in View Space + + +
+ Modern learning-based approaches to 3D-aware image synthesis achieve high +photorealism and 3D-consistent viewpoint changes for the generated images. +Existing approaches represent instances in a shared canonical space. However, +for in-the-wild datasets a shared canonical system can be difficult to define +or might not even exist. In this work, we instead model instances in view +space, alleviating the need for posed images and learned camera distributions. +We find that in this setting, existing GAN-based methods are prone to +generating flat geometry and struggle with distribution coverage. We hence +propose WildFusion, a new approach to 3D-aware image synthesis based on latent +diffusion models (LDMs). We first train an autoencoder that infers a compressed +latent representation, which additionally captures the images' underlying 3D +structure and enables not only reconstruction but also novel view synthesis. To +learn a faithful 3D representation, we leverage cues from monocular depth +prediction. Then, we train a diffusion model in the 3D-aware latent space, +thereby enabling synthesis of high-quality 3D-consistent image samples, +outperforming recent state-of-the-art GAN-based methods. Importantly, our +3D-aware LDM is trained without any direct supervision from multiview images or +3D geometry and does not require posed images or learned pose or camera +distributions. It directly learns a 3D representation without relying on +canonical camera coordinates. This opens up promising research avenues for +scalable 3D-aware image synthesis and 3D content creation from in-the-wild +image data. See https://katjaschwarz.github.io/wildfusion for videos of our 3D +results. + +
+
+
+
+
+ + ♻ ☆ Toward Reliable Human Pose Forecasting with Uncertainty + + +
+ Recently, there has been an arms race of pose forecasting methods aimed at solving the spatio-temporal task of predicting a sequence of future 3D poses of a person given a sequence of past observed ones. However, the lack of unified benchmarks and limited uncertainty analysis have hindered progress in the field. To address this, we first develop an open-source library for human pose forecasting, including multiple models, supporting several datasets, and employing standardized evaluation metrics, with the aim of promoting research and moving toward a unified and consistent evaluation. Second, we devise two types of uncertainty in the problem to increase performance and convey better trust: 1) we propose a method for modeling aleatoric uncertainty by using uncertainty priors to inject knowledge about the pattern of uncertainty. This focuses the capacity of the model in the direction of more meaningful supervision while reducing the number of learned parameters and improving stability; 2) we introduce a novel approach for quantifying the epistemic uncertainty of any model through clustering and measuring the entropy of its assignments. Our experiments demonstrate up to $25\%$ improvements in forecasting at short horizons, with no loss on longer horizons, on the Human3.6M, AMASS, and 3DPW datasets, and better performance in uncertainty estimation. The code is available online at https://github.com/vita-epfl/UnPOSed.
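+ The clustering-based epistemic score is easy to prototype; a minimal sketch (not the UnPOSed implementation, names are illustrative) that clusters a set of sampled forecasts and returns the entropy of the cluster-assignment distribution:

```python
import numpy as np
from sklearn.cluster import KMeans

def epistemic_uncertainty(forecasts, n_clusters=8):
    """forecasts: (S, T, J, 3) sampled future pose sequences for one input."""
    flat = forecasts.reshape(len(forecasts), -1)
    labels = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(flat)
    p = np.bincount(labels, minlength=n_clusters) / len(labels)
    p = p[p > 0]
    return float(-(p * np.log(p)).sum())   # high entropy ~ samples spread over many modes

samples = np.random.randn(64, 25, 22, 3)   # toy forecasts (S=64 samples)
print(epistemic_uncertainty(samples))
```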
+
+ comment: Published in RA-L 2024 +
+
+
+
+
+ + ♻ ☆ Efficient Masked Face Recognition Method during the COVID-19 Pandemic + + +
+ The coronavirus disease (COVID-19) is an unparalleled crisis leading to a huge number of casualties and security problems. In order to reduce the spread of coronavirus, people often wear masks to protect themselves. This makes face recognition a very difficult task since certain parts of the face are hidden. A primary focus of researchers during the ongoing coronavirus pandemic is to come up with suggestions to handle this problem through rapid and efficient solutions. In this paper, we propose a reliable method based on occlusion removal and deep learning-based features in order to address the problem of the masked face recognition process. The first step is to remove the masked face region. Next, we apply three pre-trained deep Convolutional Neural Networks (CNNs), namely VGG-16, AlexNet, and ResNet-50, and use them to extract deep features from the obtained regions (mostly eyes and forehead regions). The Bag-of-features paradigm is then applied to the feature maps of the last convolutional layer in order to quantize them and obtain a compact representation compared to the fully connected layer of a classical CNN. Finally, a Multilayer Perceptron (MLP) is applied for the classification process. Experimental results on the Real-World-Masked-Face-Dataset show high recognition performance compared to other state-of-the-art methods.
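+ The Bag-of-features step amounts to assigning local conv descriptors to visual words and pooling a histogram; a minimal sketch assuming a pre-computed k-means codebook (not the authors' code):

```python
import numpy as np

def bag_of_features(feature_map, codebook):
    """feature_map: (H, W, C) last-conv activations; codebook: (K, C) visual words."""
    desc = feature_map.reshape(-1, feature_map.shape[-1])
    d2 = ((desc[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)
    words = d2.argmin(axis=1)
    hist = np.bincount(words, minlength=len(codebook)).astype(float)
    return hist / hist.sum()               # compact vector fed to the MLP classifier

fmap = np.random.rand(7, 7, 512)           # e.g. VGG-16 / ResNet-50 conv output
codebook = np.random.rand(100, 512)        # visual words learned with k-means
print(bag_of_features(fmap, codebook).shape)   # (100,)
```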
+
+
+
+
+ + ♻ ☆ Impacts of Color and Texture Distortions on Earth Observation Data in + Deep Learning + + +
+ Land cover classification and change detection are two important applications +of remote sensing and Earth observation (EO) that have benefited greatly from +the advances of deep learning. Convolutional and transformer-based U-net models +are the state-of-the-art architectures for these tasks, and their performances +have been boosted by an increased availability of large-scale annotated EO +datasets. However, the influence of different visual characteristics of the +input EO data on a model's predictions is not well understood. In this work we +systematically examine model sensitivities with respect to several color- and +texture-based distortions on the input EO data during inference, given models +that have been trained without such distortions. We conduct experiments with +multiple state-of-the-art segmentation networks for land cover classification +and show that they are in general more sensitive to texture than to color +distortions. Beyond revealing intriguing characteristics of widely used land +cover classification models, our results can also be used to guide the +development of more robust models within the EO domain. + +
+
+
+
+
+ + ♻ ☆ Vision Transformers Need Registers + + +
+ Transformers have recently emerged as a powerful tool for learning visual +representations. In this paper, we identify and characterize artifacts in +feature maps of both supervised and self-supervised ViT networks. The artifacts +correspond to high-norm tokens appearing during inference primarily in +low-informative background areas of images, that are repurposed for internal +computations. We propose a simple yet effective solution based on providing +additional tokens to the input sequence of the Vision Transformer to fill that +role. We show that this solution fixes that problem entirely for both +supervised and self-supervised models, sets a new state of the art for +self-supervised visual models on dense visual prediction tasks, enables object +discovery methods with larger models, and most importantly leads to smoother +feature maps and attention maps for downstream visual processing. + +
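+ The proposed fix is architecturally tiny; a hedged sketch of appending learnable register tokens to the patch sequence and discarding them at the output (illustrative module, not the paper's exact code):

```python
import torch
import torch.nn as nn

class RegisterTokens(nn.Module):
    """Extra learnable tokens that can absorb global/internal computation."""
    def __init__(self, dim, num_registers=4):
        super().__init__()
        self.registers = nn.Parameter(torch.zeros(1, num_registers, dim))

    def add(self, tokens):                       # tokens: (B, N, D)
        reg = self.registers.expand(tokens.size(0), -1, -1)
        return torch.cat([tokens, reg], dim=1)   # (B, N + R, D)

    def remove(self, tokens):                    # after the transformer blocks
        return tokens[:, : -self.registers.size(1)]

reg = RegisterTokens(dim=768, num_registers=4)
x = torch.randn(2, 196, 768)                     # ViT patch tokens
y = reg.remove(reg.add(x))                       # registers are dropped at the output
print(y.shape)                                   # torch.Size([2, 196, 768])
```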
+
+
+
+
+ + ♻ ☆ Safe-CLIP: Removing NSFW Concepts from Vision-and-Language Models + + +
+ Large-scale vision-and-language models, such as CLIP, are typically trained +on web-scale data, which can introduce inappropriate content and lead to the +development of unsafe and biased behavior. This, in turn, hampers their +applicability in sensitive and trustworthy contexts and could raise significant +concerns in their adoption. Our research introduces a novel approach to +enhancing the safety of vision-and-language models by diminishing their +sensitivity to NSFW (not safe for work) inputs. In particular, our methodology +seeks to sever "toxic" linguistic and visual concepts, unlearning the linkage +between unsafe linguistic or visual items and unsafe regions of the embedding +space. We show how this can be done by fine-tuning a CLIP model on synthetic +data obtained from a large language model trained to convert between safe and +unsafe sentences, and a text-to-image generator. We conduct extensive +experiments on the resulting embedding space for cross-modal retrieval, +text-to-image, and image-to-text generation, where we show that our model can +be remarkably employed with pre-trained generative models. Our source code and +trained models are available at: https://github.com/aimagelab/safe-clip. + +
+
+
+
+
+ + ♻ ☆ Lightweight Deep Learning for Resource-Constrained Environments: A + Survey + + +
+ Over the past decade, the dominance of deep learning has prevailed across +various domains of artificial intelligence, including natural language +processing, computer vision, and biomedical signal processing. While there have +been remarkable improvements in model accuracy, deploying these models on +lightweight devices, such as mobile phones and microcontrollers, is constrained +by limited resources. In this survey, we provide comprehensive design guidance +tailored for these devices, detailing the meticulous design of lightweight +models, compression methods, and hardware acceleration strategies. The +principal goal of this work is to explore methods and concepts for getting +around hardware constraints without compromising the model's accuracy. +Additionally, we explore two notable paths for lightweight deep learning in the +future: deployment techniques for TinyML and Large Language Models. Although +these paths undoubtedly have potential, they also present significant +challenges, encouraging research into unexplored areas. + +
+
+ comment: 40 pages +
+
+
+
+
+ + ♻ ☆ Which Transformer to Favor: A Comparative Analysis of Efficiency in + Vision Transformers + + +
+ Transformers come with a high computational cost, yet their effectiveness in +addressing problems in language and vision has sparked extensive research aimed +at enhancing their efficiency. However, diverse experimental conditions, +spanning multiple input domains, prevent a fair comparison based solely on +reported results, posing challenges for model selection. To address this gap in +comparability, we design a comprehensive benchmark of more than 30 models for +image classification, evaluating key efficiency aspects, including accuracy, +speed, and memory usage. This benchmark provides a standardized baseline across +the landscape of efficiency-oriented transformers and our framework of +analysis, based on Pareto optimality, reveals surprising insights. Despite +claims of other models being more efficient, ViT remains Pareto optimal across +multiple metrics. We observe that hybrid attention-CNN models exhibit +remarkable inference memory- and parameter-efficiency. Moreover, our benchmark +shows that using a larger model in general is more efficient than using higher +resolution images. Thanks to our holistic evaluation, we provide a centralized +resource for practitioners and researchers, facilitating informed decisions +when selecting transformers or measuring progress of the development of +efficient transformers. + +
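+ The Pareto-optimality framing can be reproduced in a few lines (toy numbers and names, not benchmark results): a model stays on the front if no other model is at least as accurate while also being no slower and no more memory-hungry, with at least one strict difference.

```python
def pareto_front(models):
    """models: name -> (top1_acc, latency_ms, peak_mem_mb)."""
    def dominates(a, b):
        return a != b and a[0] >= b[0] and a[1] <= b[1] and a[2] <= b[2]
    return [name for name, m in models.items()
            if not any(dominates(other, m) for other in models.values())]

models = {                        # illustrative numbers only
    "ViT-B/16":    (81.8, 4.9, 1400),
    "Hybrid-CNN":  (81.2, 5.3, 900),
    "Efficient-X": (80.1, 3.1, 700),
}
print(pareto_front(models))       # all three survive: none dominates the others
```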
+
+
+
+
+ + ♻ ☆ NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous + Driving + + +
+ We present a versatile NeRF-based simulator for testing autonomous driving +(AD) software systems, designed with a focus on sensor-realistic closed-loop +evaluation and the creation of safety-critical scenarios. The simulator learns +from sequences of real-world driving sensor data and enables reconfigurations +and renderings of new, unseen scenarios. In this work, we use our simulator to +test the responses of AD models to safety-critical scenarios inspired by the +European New Car Assessment Programme (Euro NCAP). Our evaluation reveals that, +while state-of-the-art end-to-end planners excel in nominal driving scenarios +in an open-loop setting, they exhibit critical flaws when navigating our +safety-critical scenarios in a closed-loop setting. This highlights the need +for advancements in the safety and real-world usability of end-to-end planners. +By publicly releasing our simulator and scenarios as an easy-to-run evaluation +suite, we invite the research community to explore, refine, and validate their +AD models in controlled, yet highly configurable and challenging +sensor-realistic environments. Code and instructions can be found at +https://github.com/wljungbergh/NeuroNCAP + +
+
+
+
+
+ + ♻ ☆ ZONE: Zero-Shot Instruction-Guided Local Editing CVPR 2024 + + +
+ Recent advances in vision-language models like Stable Diffusion have shown remarkable power in creative image synthesis and editing. However, most existing text-to-image editing methods encounter two obstacles: First, the text prompt needs to be carefully crafted to achieve good results, which is not intuitive or user-friendly. Second, they are insensitive to local edits and can irreversibly affect non-edited regions, leaving obvious editing traces. To tackle these problems, we propose a Zero-shot instructiON-guided local image Editing approach, termed ZONE. We first convert the editing intent from the user-provided instruction (e.g., "make his tie blue") into specific image editing regions through InstructPix2Pix. We then propose a Region-IoU scheme for precise image layer extraction from an off-the-shelf segment model. We further develop an edge smoother based on FFT for seamless blending between the layer and the image. Our method allows for arbitrary manipulation of a specific region with a single instruction while preserving the rest. Extensive experiments demonstrate that our ZONE achieves remarkable local editing results and user-friendliness, outperforming state-of-the-art methods. Code is available at https://github.com/lsl001006/ZONE.
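+ The FFT-based edge smoother can be approximated by low-pass filtering the layer mask in the frequency domain before alpha-blending (a generic sketch in the spirit of the description, not the authors' implementation):

```python
import numpy as np

def fft_soften(mask, keep=0.05):
    """Low-pass a binary mask in the Fourier domain to soften its edges."""
    F = np.fft.fftshift(np.fft.fft2(mask.astype(float)))
    h, w = mask.shape
    yy, xx = np.ogrid[:h, :w]
    radius = np.hypot(yy - h / 2, xx - w / 2)
    F[radius > keep * max(h, w)] = 0            # keep only low frequencies
    soft = np.real(np.fft.ifft2(np.fft.ifftshift(F)))
    return np.clip(soft, 0.0, 1.0)

def blend(edited_layer, image, mask):
    alpha = fft_soften(mask)[..., None]
    return alpha * edited_layer + (1 - alpha) * image

image, layer = np.random.rand(64, 64, 3), np.random.rand(64, 64, 3)
mask = np.zeros((64, 64)); mask[20:40, 20:40] = 1.0
print(blend(layer, image, mask).shape)          # (64, 64, 3)
```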
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Leveraging Foundation Models for Content-Based Medical Image Retrieval + in Radiology + + +
+ Content-based image retrieval (CBIR) has the potential to significantly +improve diagnostic aid and medical research in radiology. Current CBIR systems +face limitations due to their specialization to certain pathologies, limiting +their utility. In response, we propose using vision foundation models as +powerful and versatile off-the-shelf feature extractors for content-based +medical image retrieval. By benchmarking these models on a comprehensive +dataset of 1.6 million 2D radiological images spanning four modalities and 161 +pathologies, we identify weakly-supervised models as superior, achieving a P@1 +of up to 0.594. This performance not only competes with a specialized model but +does so without the need for fine-tuning. Our analysis further explores the +challenges in retrieving pathological versus anatomical structures, indicating +that accurate retrieval of pathological features presents greater difficulty. +Despite these challenges, our research underscores the vast potential of +foundation models for CBIR in radiology, proposing a shift towards versatile, +general-purpose medical image retrieval systems that do not require specific +tuning. + +
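+ The retrieval protocol behind a P@1 figure is a plain nearest-neighbour search over frozen embeddings; a schematic sketch with random stand-in features (any frozen foundation-model encoder would slot in):

```python
import numpy as np

def precision_at_1(query_emb, gallery_emb, query_labels, gallery_labels):
    """P@1: fraction of queries whose cosine top-1 hit shares the query's label."""
    q = query_emb / np.linalg.norm(query_emb, axis=1, keepdims=True)
    g = gallery_emb / np.linalg.norm(gallery_emb, axis=1, keepdims=True)
    top1 = (q @ g.T).argmax(axis=1)
    return float((gallery_labels[top1] == query_labels).mean())

rng = np.random.default_rng(0)
queries, gallery = rng.normal(size=(50, 768)), rng.normal(size=(500, 768))
q_labels, g_labels = rng.integers(0, 10, 50), rng.integers(0, 10, 500)
print(precision_at_1(queries, gallery, q_labels, g_labels))
```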
+
+
+
+
+ + ♻ ☆ DUFOMap: Efficient Dynamic Awareness Mapping + + +
+ The dynamic nature of the real world is one of the main challenges in +robotics. The first step in dealing with it is to detect which parts of the +world are dynamic. A typical benchmark task is to create a map that contains +only the static part of the world to support, for example, localization and +planning. Current solutions are often applied in post-processing, where +parameter tuning allows the user to adjust the setting for a specific dataset. +In this paper, we propose DUFOMap, a novel dynamic awareness mapping framework +designed for efficient online processing. Despite having the same parameter +settings for all scenarios, it performs better or is on par with +state-of-the-art methods. Ray casting is utilized to identify and classify +fully observed empty regions. Since these regions have been observed empty, it +follows that anything inside them at another time must be dynamic. Evaluation +is carried out in various scenarios, including outdoor environments in KITTI +and Argoverse 2, open areas on the KTH campus, and with different sensor types. +DUFOMap outperforms the state of the art in terms of accuracy and computational +efficiency. The source code, benchmarks, and links to the datasets utilized are +provided. See https://kth-rpl.github.io/dufomap for more details. + +
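+ The core rule ("a region once observed empty cannot later contain static structure") can be illustrated on a toy voxel grid; this is a heavy simplification of the actual DUFOMap pipeline:

```python
import numpy as np

VOXEL = 0.5  # voxel size in metres

def ray_voxels(origin, endpoint, step=0.1):
    """Voxels traversed strictly before the endpoint (observed empty space)."""
    d = endpoint - origin
    n = max(int(np.linalg.norm(d) / step), 1)
    pts = origin + d * np.linspace(0, 1, n, endpoint=False)[:, None]
    return {tuple(np.floor(p / VOXEL).astype(int)) for p in pts}

# Scan 1: ray casting marks free space between the sensor and each hit point.
origin = np.zeros(3)
hits = np.array([[5.0, 0.0, 0.0], [0.0, 4.0, 0.0]])
free = set().union(*(ray_voxels(origin, h) for h in hits))

# Scan 2: any point that now lies inside previously-free voxels must be dynamic.
later_points = np.array([[2.5, 0.0, 0.0], [6.0, 6.0, 0.0]])
dynamic = [tuple(np.floor(p / VOXEL).astype(int)) in free for p in later_points]
print(dynamic)   # [True, False]
```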
+
+ comment: The first two authors hold equal contribution. 8 pages, 7 figures, + project page https://kth-rpl.github.io/dufomap +
+
+
+
+
+ + ♻ ☆ A Systematic Survey of Deep Learning-based Single-Image Super-Resolution + + +
+ Single-image super-resolution (SISR) is an important task in image +processing, which aims to enhance the resolution of imaging systems. Recently, +SISR has made a huge leap and has achieved promising results with the help of +deep learning (DL). In this survey, we give an overview of DL-based SISR +methods and group them according to their design targets. Specifically, we +first introduce the problem definition, research background, and the +significance of SISR. Secondly, we introduce some related works, including +benchmark datasets, upsampling methods, optimization objectives, and image +quality assessment methods. Thirdly, we provide a detailed investigation of +SISR and give some domain-specific applications of it. Fourthly, we present the +reconstruction results of some classic SISR methods to intuitively know their +performance. Finally, we discuss some issues that still exist in SISR and +summarize some new trends and future directions. This is an exhaustive survey +of SISR, which can help researchers better understand SISR and inspire more +exciting research in this field. An investigation project for SISR is provided +at https://github.com/CV-JunchengLi/SISR-Survey. + +
+
+ comment: 40 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ How is Visual Attention Influenced by Text Guidance? Database and Model + + +
+ The analysis and prediction of visual attention have long been crucial tasks in the fields of computer vision and image processing. In practical applications, images are generally accompanied by various text descriptions; however, few studies have explored the influence of text descriptions on visual attention, let alone developed visual saliency prediction models considering text guidance. In this paper, we conduct a comprehensive study on text-guided image saliency (TIS) from both subjective and objective perspectives. Specifically, we construct a TIS database named SJTU-TIS, which includes 1200 text-image pairs and the corresponding collected eye-tracking data. Based on the established SJTU-TIS database, we analyze the influence of various text descriptions on visual attention. Then, to facilitate the development of saliency prediction models considering text influence, we construct a benchmark for the established SJTU-TIS database using state-of-the-art saliency models. Finally, considering the effect of text descriptions on visual attention, which most existing saliency models ignore, we further propose a text-guided saliency (TGSal) prediction model, which extracts and integrates both image features and text features to predict the image saliency under various text-description conditions. Our proposed model significantly outperforms the state-of-the-art saliency models on both the SJTU-TIS database and the pure image saliency databases in terms of various evaluation metrics. The SJTU-TIS database and the code of the proposed TGSal model will be released at: https://github.com/IntMeGroup/TGSal.
+
+
+
+
+ + ♻ ☆ CoBra: Complementary Branch Fusing Class and Semantic Knowledge for + Robust Weakly Supervised Semantic Segmentation + + +
+ Leveraging semantically precise pseudo masks derived from image-level class knowledge for segmentation, namely image-level Weakly Supervised Semantic Segmentation (WSSS), remains challenging. While Class Activation Maps (CAMs) using CNNs have steadily been contributing to the success of WSSS, the resulting activation maps often narrowly focus on class-specific parts (e.g., only the face of a person). On the other hand, recent works based on vision transformers (ViT) have shown promising results based on their self-attention mechanism to capture the semantic parts but fail to capture complete class-specific details (e.g., the entire body of a person, but also a nearby dog). In this work, we propose Complementary Branch (CoBra), a novel dual-branch framework consisting of two distinct architectures that provide each branch with valuable complementary class knowledge (from the CNN) and semantic knowledge (from the ViT). In particular, we learn Class-Aware Projection (CAP) for the CNN branch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly fuse their complementary knowledge and facilitate a new type of extra patch-level supervision. Our model, through CoBra, fuses the complementary outputs of the CNN and ViT to create robust pseudo masks that integrate both class and semantic information effectively. Extensive experiments qualitatively and quantitatively investigate how the CNN and ViT complement each other on the PASCAL VOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not only the masks generated by our model, but also the segmentation results derived from utilizing these masks as pseudo labels.
+
+
+
+
+ + ♻ ☆ Rapid post-disaster infrastructure damage characterisation enabled by + remote sensing and deep learning technologies -- a tiered approach + + +
+ Critical infrastructure, such as transport networks and bridges, is systematically targeted during wars and suffers damage during extensive natural disasters because it is vital for enabling connectivity and transportation of people and goods, and hence, underpins national and international economic growth. Mass destruction of transport assets, in conjunction with minimal or no accessibility in the wake of natural and anthropogenic disasters, prevents us from delivering rapid recovery and adaptation. As a result, systemic operability is drastically reduced, leading to low levels of resilience. Thus, there is a need for rapid assessment of its condition to allow for informed decision-making for restoration prioritisation. A solution to this challenge is to use technology that enables stand-off observations. Nevertheless, no methods exist for automated characterisation of damage at multiple scales, i.e. regional (e.g., network), asset (e.g., bridges), and structural (e.g., road pavement) scales. We propose a methodology based on an integrated, multi-scale tiered approach to fill this capability gap. In doing so, we demonstrate how automated damage characterisation can be enabled by fit-for-purpose digital technologies. Next, the methodology is applied to and validated on a case study in Ukraine that includes 17 bridges damaged by human-targeted interventions. From regional to component scale, we deploy technology to integrate assessments using Sentinel-1 SAR images, crowdsourced information, and high-resolution images for deep learning to facilitate automatic damage detection and characterisation. For the first time, the interferometric coherence difference and semantic segmentation of images were deployed in a tiered multi-scale approach to improve the reliability of damage characterisations at different scales.
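+ For the regional tier, the interferometric coherence difference can be estimated with a standard windowed estimator (a sketch with synthetic complex tiles; the paper's exact Sentinel-1 processing chain is more involved):

```python
import numpy as np
from scipy.ndimage import uniform_filter

def coherence(s1, s2, win=5):
    """Local interferometric coherence between two coregistered complex SLC images."""
    interf = s1 * np.conj(s2)
    num = uniform_filter(interf.real, win) + 1j * uniform_filter(interf.imag, win)
    den = np.sqrt(uniform_filter(np.abs(s1) ** 2, win)
                  * uniform_filter(np.abs(s2) ** 2, win))
    return np.abs(num) / (den + 1e-9)

# A drop in coherence between a pre-event pair and a co-event pair flags damage.
shape = (64, 64)
pre_a, pre_b, post_a, post_b = (np.random.randn(*shape) + 1j * np.random.randn(*shape)
                                for _ in range(4))
damage_proxy = coherence(pre_a, pre_b) - coherence(post_a, post_b)
print(damage_proxy.shape)
```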
+
+ comment: 43 pages; 20 figures +
+
+
+
+
+ + ♻ ☆ Perceptual Assessment and Optimization of High Dynamic Range Image + Rendering + + +
+ High dynamic range (HDR) rendering has the ability to faithfully reproduce +the wide luminance ranges in natural scenes, but how to accurately assess the +rendering quality is relatively underexplored. Existing quality models are +mostly designed for low dynamic range (LDR) images, and do not align well with +human perception of HDR image quality. To fill this gap, we propose a family of +HDR quality metrics, in which the key step is employing a simple inverse +display model to decompose an HDR image into a stack of LDR images with varying +exposures. Subsequently, these decomposed images are assessed through +well-established LDR quality metrics. Our HDR quality models present three +distinct benefits. First, they directly inherit the recent advancements of LDR +quality metrics. Second, they do not rely on human perceptual data of HDR image +quality for re-calibration. Third, they facilitate the alignment and +prioritization of specific luminance ranges for more accurate and detailed +quality assessment. Experimental results show that our HDR quality metrics +consistently outperform existing models in terms of quality assessment on four +HDR image quality datasets and perceptual optimization of HDR novel view +synthesis. + +
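+ The decomposition step is easy to sketch (illustrative only, with a generic gamma display model rather than the paper's inverse display model): the HDR image is re-exposed at several levels, clipped to LDR, and each slice is scored by any off-the-shelf LDR metric before pooling.

```python
import numpy as np

def hdr_to_ldr_stack(hdr, exposures=(-2, 0, 2), gamma=2.2):
    """hdr: linear-luminance image with values >= 0; returns a list of LDR slices."""
    return [np.clip(hdr * (2.0 ** ev), 0.0, 1.0) ** (1.0 / gamma) for ev in exposures]

def hdr_quality(hdr_ref, hdr_test, ldr_metric):
    scores = [ldr_metric(a, b) for a, b in
              zip(hdr_to_ldr_stack(hdr_ref), hdr_to_ldr_stack(hdr_test))]
    return float(np.mean(scores))   # per-exposure scores could also be weighted

mse = lambda a, b: float(((a - b) ** 2).mean())   # stand-in for a real LDR metric
ref = np.random.rand(32, 32) * 8.0
print(hdr_quality(ref, ref + 0.01, mse))
```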
+
+
+
+
+ + ♻ ☆ FairVision: Equitable Deep Learning for Eye Disease Screening via Fair + Identity Scaling + + +
+ Equity in AI for healthcare is crucial due to its direct impact on human +well-being. Despite advancements in 2D medical imaging fairness, the fairness +of 3D models remains underexplored, hindered by the small sizes of 3D fairness +datasets. Since 3D imaging surpasses 2D imaging in SOTA clinical care, it is +critical to understand the fairness of these 3D models. To address this +research gap, we conduct the first comprehensive study on the fairness of 3D +medical imaging models across multiple protected attributes. Our investigation +spans both 2D and 3D models and evaluates fairness across five architectures on +three common eye diseases, revealing significant biases across race, gender, +and ethnicity. To alleviate these biases, we propose a novel fair identity +scaling (FIS) method that improves both overall performance and fairness, +outperforming various SOTA fairness methods. Moreover, we release +Harvard-FairVision, the first large-scale medical fairness dataset with 30,000 +subjects featuring both 2D and 3D imaging data and six demographic identity +attributes. Harvard-FairVision provides labels for three major eye disorders +affecting about 380 million people worldwide, serving as a valuable resource +for both 2D and 3D fairness learning. Our code and dataset are publicly +accessible at +\url{https://ophai.hms.harvard.edu/datasets/harvard-fairvision30k}. + +
+
+
+
+
+ + ♻ ☆ Deep Learning-Based MR Image Re-parameterization SC + + +
+ Magnetic resonance (MR) image re-parameterization refers to the process of generating, via simulation, an MR image with a new set of MRI scanning parameters. Different parameter values generate distinct contrast between different tissues, helping identify pathologic tissue. Typically, more than one scan is required for diagnosis; however, acquiring repeated scans can be costly, time-consuming, and difficult for patients. Thus, using MR image re-parameterization to predict and estimate the contrast in these imaging scans can be an effective alternative. In this work, we propose a novel deep learning (DL) based convolutional model for MRI re-parameterization. Based on our preliminary results, DL-based techniques hold the potential to learn the non-linearities that govern the re-parameterization.
+
+ comment: A. Narang, A. Raj, M. Pop and M. Ebrahimi, "Deep Learning-Based MR + Image Re-parameterization," 2023 Congress in Computer Science, Computer + Engineering, & Applied Computing (CSCE), Las Vegas, NV, USA, 2023, pp. + 536-541, doi: 10.1109/CSCE60160.2023.00094 +
+
+
+
+
+ + ♻ ☆ Graph Neural Networks in Vision-Language Image Understanding: A Survey + + +
+ 2D image understanding is a complex problem within computer vision, but it +holds the key to providing human-level scene comprehension. It goes further +than identifying the objects in an image, and instead, it attempts to +understand the scene. Solutions to this problem form the underpinning of a +range of tasks, including image captioning, visual question answering (VQA), +and image retrieval. Graphs provide a natural way to represent the relational +arrangement between objects in an image, and thus, in recent years graph neural +networks (GNNs) have become a standard component of many 2D image understanding +pipelines, becoming a core architectural component, especially in the VQA group +of tasks. In this survey, we review this rapidly evolving field and we provide +a taxonomy of graph types used in 2D image understanding approaches, a +comprehensive list of the GNN models used in this domain, and a roadmap of +future potential developments. To the best of our knowledge, this is the first +comprehensive survey that covers image captioning, visual question answering, +and image retrieval techniques that focus on using GNNs as the main part of +their architecture. + +
+
+ comment: 20 pages, 5 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ DiffusionGAN3D: Boosting Text-guided 3D Generation and Domain Adaptation + by Combining 3D GANs and Diffusion Priors CVPR2024 + + +
+ Text-guided domain adaptation and generation of 3D-aware portraits find many +applications in various fields. However, due to the lack of training data and +the challenges in handling the high variety of geometry and appearance, the +existing methods for these tasks suffer from issues like inflexibility, +instability, and low fidelity. In this paper, we propose a novel framework +DiffusionGAN3D, which boosts text-guided 3D domain adaptation and generation by +combining 3D GANs and diffusion priors. Specifically, we integrate the +pre-trained 3D generative models (e.g., EG3D) and text-to-image diffusion +models. The former provides a strong foundation for stable and high-quality +avatar generation from text. And the diffusion models in turn offer powerful +priors and guide the 3D generator finetuning with informative direction to +achieve flexible and efficient text-guided domain adaptation. To enhance the +diversity in domain adaptation and the generation capability in text-to-avatar, +we introduce the relative distance loss and case-specific learnable triplane +respectively. Besides, we design a progressive texture refinement module to +improve the texture quality for both tasks above. Extensive experiments +demonstrate that the proposed framework achieves excellent results in both +domain adaptation and text-to-avatar tasks, outperforming existing methods in +terms of generation quality and efficiency. The project homepage is at +https://younglbw.github.io/DiffusionGAN3D-homepage/. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ MC$^2$: Multi-concept Guidance for Customized Multi-concept Generation + + +
+ Customized text-to-image generation aims to synthesize instantiations of user-specified concepts and has achieved unprecedented progress in handling individual concepts. However, when extending to multiple customized concepts, existing methods exhibit limitations in terms of flexibility and fidelity, only accommodating the combination of limited types of models and potentially resulting in a mix of characteristics from different concepts. In this paper, we introduce the Multi-concept guidance for Multi-concept customization, termed MC$^2$, for improved flexibility and fidelity. MC$^2$ decouples the requirements for model architecture via inference time optimization, allowing the integration of various heterogeneous single-concept customized models. It adaptively refines the attention weights between visual and textual tokens, directing image regions to focus on their associated words while diminishing the impact of irrelevant ones. Extensive experiments demonstrate that MC$^2$ even surpasses previous methods that require additional training in terms of consistency with the input prompt and reference images. Moreover, MC$^2$ can be extended to elevate the compositional capabilities of text-to-image generation, yielding appealing results. Code will be publicly available at https://github.com/JIANGJiaXiu/MC-2.
+
+
+
+
+ + ♻ ☆ FF-LOGO: Cross-Modality Point Cloud Registration with Feature Filtering + and Local to Global Optimization ICRA + + +
+ Cross-modality point cloud registration is confronted with significant challenges due to inherent differences in modalities between different sensors. We propose FF-LOGO, a cross-modality point cloud registration framework with feature filtering and local-to-global optimization. The cross-modality feature correlation filtering module extracts geometric transformation-invariant features from cross-modality point clouds and achieves point selection by feature matching. We also introduce a cross-modality optimization process, including a local adaptive key region aggregation module and a global modality consistency fusion optimization module. Experimental results demonstrate that our two-stage optimization significantly improves the registration accuracy of the feature association and selection module. Our method achieves a substantial increase in recall rate compared to the current state-of-the-art methods on the 3DCSR dataset, improving from 40.59% to 75.74%. Our code will be available at https://github.com/wangmohan17/FFLOGO.
+
+ comment: Accepted by 2024 IEEE International Conference on Robotics and + Automation (ICRA),7 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ DiffBIR: Towards Blind Image Restoration with Generative Diffusion Prior + + +
+ We present DiffBIR, a general restoration pipeline that could handle +different blind image restoration tasks in a unified framework. DiffBIR +decouples blind image restoration problem into two stages: 1) degradation +removal: removing image-independent content; 2) information regeneration: +generating the lost image content. Each stage is developed independently but +they work seamlessly in a cascaded manner. In the first stage, we use +restoration modules to remove degradations and obtain high-fidelity restored +results. For the second stage, we propose IRControlNet that leverages the +generative ability of latent diffusion models to generate realistic details. +Specifically, IRControlNet is trained based on specially produced condition +images without distracting noisy content for stable generation performance. +Moreover, we design a region-adaptive restoration guidance that can modify the +denoising process during inference without model re-training, allowing users to +balance realness and fidelity through a tunable guidance scale. Extensive +experiments have demonstrated DiffBIR's superiority over state-of-the-art +approaches for blind image super-resolution, blind face restoration and blind +image denoising tasks on both synthetic and real-world datasets. The code is +available at https://github.com/XPixelGroup/DiffBIR. + +
+
+
+
+
+ + ♻ ☆ RoadFormer: Duplex Transformer for RGB-Normal Semantic Road Scene + Parsing + + +
+ The recent advancements in deep convolutional neural networks have shown +significant promise in the domain of road scene parsing. Nevertheless, the +existing works focus primarily on freespace detection, with little attention +given to hazardous road defects that could compromise both driving safety and +comfort. In this paper, we introduce RoadFormer, a novel Transformer-based +data-fusion network developed for road scene parsing. RoadFormer utilizes a +duplex encoder architecture to extract heterogeneous features from both RGB +images and surface normal information. The encoded features are subsequently +fed into a novel heterogeneous feature synergy block for effective feature +fusion and recalibration. The pixel decoder then learns multi-scale long-range +dependencies from the fused and recalibrated heterogeneous features, which are +subsequently processed by a Transformer decoder to produce the final semantic +prediction. Additionally, we release SYN-UDTIRI, the first large-scale road +scene parsing dataset that contains over 10,407 RGB images, dense depth images, +and the corresponding pixel-level annotations for both freespace and road +defects of different shapes and sizes. Extensive experimental evaluations +conducted on our SYN-UDTIRI dataset, as well as on three public datasets, +including KITTI road, CityScapes, and ORFD, demonstrate that RoadFormer +outperforms all other state-of-the-art networks for road scene parsing. +Specifically, RoadFormer ranks first on the KITTI road benchmark. Our source +code, created dataset, and demo video are publicly available at +mias.group/RoadFormer. + +
+
+ comment: 9 pages 7 figures. Accepted by Transactions on Intelligent Vehicles +
+
+
+
+
+ + ♻ ☆ Accelerating ViT Inference on FPGA through Static and Dynamic Pruning + + +
+ Vision Transformers (ViTs) have achieved state-of-the-art accuracy on various computer vision tasks. However, their high computational complexity prevents them from being applied to many real-world applications. Weight and token pruning are two well-known methods for reducing complexity: weight pruning reduces the model size and associated computational demands, while token pruning further dynamically reduces the computation based on the input. Combining these two techniques should significantly reduce computation complexity and model size; however, naively integrating them results in irregular computation patterns, leading to significant accuracy drops and difficulties in hardware acceleration. Addressing the above challenges, we propose a comprehensive algorithm-hardware codesign for accelerating ViT on FPGA through simultaneous pruning, combining static weight pruning and dynamic token pruning. For algorithm design, we systematically combine a hardware-aware structured block-pruning method for pruning model parameters and a dynamic token pruning method for removing unimportant token vectors. Moreover, we design a novel training algorithm to recover the model's accuracy. For hardware design, we develop a novel hardware accelerator for executing the pruned model. The proposed hardware design employs multi-level parallelism with a load balancing strategy to efficiently deal with the irregular computation pattern introduced by the two pruning approaches. Moreover, we develop a hardware mechanism for efficiently executing on-the-fly token pruning.
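+ The dynamic half of the codesign can be sketched with a generic attention-score top-k rule (the paper's exact pruning criterion and its FPGA mapping are more elaborate):

```python
import torch

def prune_tokens(tokens, cls_attn, keep_ratio=0.5):
    """tokens:   (B, N, D) patch tokens (CLS excluded)
       cls_attn: (B, N) attention the CLS token pays to each patch."""
    k = max(int(tokens.size(1) * keep_ratio), 1)
    idx = cls_attn.topk(k, dim=1).indices                     # (B, k) kept tokens
    idx = idx.unsqueeze(-1).expand(-1, -1, tokens.size(-1))   # (B, k, D)
    return torch.gather(tokens, 1, idx)

x = torch.randn(2, 196, 384)
attn = torch.rand(2, 196)
print(prune_tokens(x, attn).shape)   # torch.Size([2, 98, 384])
```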
+
+ comment: FCCM 2024 +
+
+
+
+
+ + ♻ ☆ Conv-Adapter: Exploring Parameter Efficient Transfer Learning for + ConvNets + + +
+ While parameter efficient tuning (PET) methods have shown great potential with transformer architectures on Natural Language Processing (NLP) tasks, their effectiveness with large-scale ConvNets is still under-studied on Computer Vision (CV) tasks. This paper proposes Conv-Adapter, a PET module designed for ConvNets. Conv-Adapter is light-weight, domain-transferable, and architecture-agnostic with generalized performance on different tasks. When transferring to downstream tasks, Conv-Adapter learns task-specific feature modulation to the intermediate representations of backbones while keeping the pre-trained parameters frozen. It introduces only a tiny number of learnable parameters, e.g., only 3.5% of the full fine-tuning parameters of ResNet50, and can also be applied to transformer-based backbones. Conv-Adapter outperforms previous PET baseline methods and achieves performance comparable to or surpassing that of full fine-tuning on 23 classification tasks of various domains. It also presents superior performance on few-shot classification with an average margin of 3.39%. Beyond classification, Conv-Adapter can generalize to detection and segmentation tasks with more than 50% reduction of parameters but comparable performance to traditional full fine-tuning.
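+ A hedged sketch of an adapter-style module for ConvNets in the spirit of Conv-Adapter (approximate structure with hypothetical hyper-parameters, not the paper's exact design): a small bottleneck produces a residual, task-specific modulation on top of a frozen backbone block.

```python
import torch
import torch.nn as nn

class ConvAdapter(nn.Module):
    """Lightweight bottleneck whose output modulates a frozen block's features."""
    def __init__(self, channels, reduction=4):
        super().__init__()
        hidden = max(channels // reduction, 8)
        self.adapter = nn.Sequential(
            nn.Conv2d(channels, hidden, 1),
            nn.Conv2d(hidden, hidden, 3, padding=1, groups=hidden),  # depthwise
            nn.GELU(),
            nn.Conv2d(hidden, channels, 1),
        )

    def forward(self, x, frozen_block):
        return frozen_block(x) + self.adapter(x)   # residual feature modulation

block = nn.Conv2d(64, 64, 3, padding=1)            # stand-in for a backbone block
block.requires_grad_(False)                        # pre-trained weights stay frozen
adapter = ConvAdapter(64)                          # only the adapter is trained
print(adapter(torch.randn(1, 64, 56, 56), block).shape)
```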
+
+
+
+
+ + ♻ ☆ Robust Representation Learning with Self-Distillation for Domain + Generalization + + +
+ Despite the recent success of deep neural networks, there remains a need for +effective methods to enhance domain generalization using vision transformers. +In this paper, we propose a novel domain generalization technique called Robust +Representation Learning with Self-Distillation (RRLD) comprising i) +intermediate-block self-distillation and ii) augmentation-guided +self-distillation to improve the generalization capabilities of +transformer-based models on unseen domains. This approach enables the network +to learn robust and general features that are invariant to different +augmentations and domain shifts while effectively mitigating overfitting to +source domains. To evaluate the effectiveness of our proposed method, we +perform extensive experiments on PACS and OfficeHome benchmark datasets, as +well as an industrial wafer semiconductor defect dataset. The results +demonstrate that RRLD achieves robust and accurate generalization performance. +We observe an average accuracy improvement in the range of 1.2% to 2.3% over +the state-of-the-art on the three datasets. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ♻ ☆ EasyTrack: Efficient and Compact One-stream 3D Point Clouds Tracker + + +
+ Most 3D single object trackers (SOT) for point clouds follow the two-stream multi-stage 3D Siamese or motion tracking paradigms, which process the template and search area point clouds with two parallel branches, built on supervised point cloud backbones. In this work, beyond typical 3D Siamese or motion tracking, we propose a neat and compact one-stream transformer 3D SOT paradigm from a novel perspective, termed \textbf{EasyTrack}, which consists of three special designs: 1) A 3D point clouds tracking feature pre-training module is developed to exploit the masked autoencoding for learning 3D point clouds tracking representations. 2) A unified 3D tracking feature learning and fusion network is proposed to simultaneously learn target-aware 3D features and extensively capture mutual correlation through the flexible self-attention mechanism. 3) A target location network in the dense bird's eye view (BEV) feature space is constructed for target classification and regression. Moreover, we develop an enhanced version named EasyTrack++, which designs the center points interaction (CPI) strategy to reduce the ambiguous targets caused by noisy point cloud background information. The proposed EasyTrack and EasyTrack++ set a new state-of-the-art performance ($\textbf{18\%}$, $\textbf{40\%}$ and $\textbf{3\%}$ success gains) on KITTI, NuScenes, and Waymo while running at \textbf{52.6fps} with few parameters (\textbf{1.3M}). The code will be available at https://github.com/KnightApple427/Easytrack.
+
+
+
+
+ + ♻ ☆ Universal Humanoid Motion Representations for Physics-Based Control ICLR 2024 + + +
+ We present a universal motion representation that encompasses a comprehensive +range of motor skills for physics-based humanoid control. Due to the high +dimensionality of humanoids and the inherent difficulties in reinforcement +learning, prior methods have focused on learning skill embeddings for a narrow +range of movement styles (e.g. locomotion, game characters) from specialized +motion datasets. This limited scope hampers their applicability in complex +tasks. We close this gap by significantly increasing the coverage of our motion +representation space. To achieve this, we first learn a motion imitator that +can imitate all of human motion from a large, unstructured motion dataset. We +then create our motion representation by distilling skills directly from the +imitator. This is achieved by using an encoder-decoder structure with a +variational information bottleneck. Additionally, we jointly learn a prior +conditioned on proprioception (humanoid's own pose and velocities) to improve +model expressiveness and sampling efficiency for downstream tasks. By sampling +from the prior, we can generate long, stable, and diverse human motions. Using +this latent space for hierarchical RL, we show that our policies solve tasks +using human-like behavior. We demonstrate the effectiveness of our motion +representation by solving generative tasks (e.g. strike, terrain traversal) and +motion tracking using VR controllers. + +
+
+ comment: ICLR 2024 Spotlight. Project page: + https://zhengyiluo.github.io/PULSE/ +
+
+
+
+
+ + ♻ ☆ Eye-gaze Guided Multi-modal Alignment Framework for Radiology + + +
+ In multi-modal frameworks, the alignment of cross-modal features presents a +significant challenge. The predominant approach in multi-modal pre-training +emphasizes either global or local alignment between modalities, utilizing +extensive datasets. This bottom-up driven method often suffers from a lack of +interpretability, a critical concern in radiology. Previous studies have +integrated high-level labels in medical images or text, but these still rely on +manual annotation, a costly and labor-intensive process. Our work introduces a +novel approach by using eye-gaze data, collected synchronously by radiologists +during diagnostic evaluations. This data, indicating radiologists' focus areas, +naturally links chest X-rays to diagnostic texts. We propose the Eye-gaze +Guided Multi-modal Alignment (EGMA) framework to harness eye-gaze data for +better alignment of image and text features, aiming to reduce reliance on +manual annotations and thus cut training costs. Our model demonstrates robust +performance, outperforming other state-of-the-art methods in zero-shot +classification and retrieval tasks. The incorporation of easily-obtained +eye-gaze data during routine radiological diagnoses signifies a step towards +minimizing manual annotation dependency. Additionally, we explore the impact of +varying amounts of eye-gaze data on model performance, highlighting the +feasibility and utility of integrating this auxiliary data into multi-modal +pre-training. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ A Technique for Classifying Static Gestures Using UWB Radar + + +
+ Our paper presents a robust framework for UWB-based static gesture +recognition, leveraging proprietary UWB radar sensor technology. Extensive data +collection efforts were undertaken to compile datasets containing five commonly +used gestures. Our approach involves a comprehensive data pre-processing +pipeline that encompasses outlier handling, aspect ratio-preserving resizing, +and false-color image transformation. Both CNN and MobileNet models were +trained on the processed images. Remarkably, our best-performing model achieved +an accuracy of 96.78%. Additionally, we developed a user-friendly GUI framework +to assess the model's system resource usage and processing times, which +revealed low memory utilization and real-time task completion in under one +second. This research marks a significant step towards enhancing static gesture +recognition using UWB technology, promising practical applications in various +domains. + +
+
+ comment: This is not a technical research paper, but an excerpt of what was + applied during a funded project for the promotion of Open Science +
+
+
+
+
+ + ♻ ☆ ChangeNet: Multi-Temporal Asymmetric Change Detection Dataset ICASSP 2024 + + +
+ Change Detection (CD) has been attracting extensive interest with the availability of bi-temporal datasets. However, due to the huge cost of multi-temporal image acquisition and labeling, existing change detection datasets are small in quantity, short in temporal coverage, and low in practicability. Therefore, a large-scale practical-oriented dataset covering wide temporal phases is urgently needed to facilitate the community. To this end, the ChangeNet dataset is presented especially for multi-temporal change detection, along with the new task of "Asymmetric Change Detection". Specifically, ChangeNet consists of 31,000 multi-temporal image pairs, a wide range of complex scenes from 100 cities, and 6 pixel-level annotated categories, which is far superior to all the existing change detection datasets including LEVIR-CD, WHU Building CD, etc. In addition, ChangeNet contains amounts of real-world perspective distortions in different temporal phases on the same areas, which is able to promote the practical application of change detection algorithms. The ChangeNet dataset is suitable for both binary change detection (BCD) and semantic change detection (SCD) tasks. Accordingly, we benchmark the ChangeNet dataset on six BCD methods and two SCD methods, and extensive experiments demonstrate its challenges and great significance. The dataset is available at https://github.com/jankyee/ChangeNet.
+
+ comment: Accepted to ICASSP 2024 Oral/Lecture +
+
+
+
+
+ + ♻ ☆ Comment-aided Video-Language Alignment via Contrastive Pre-training for + Short-form Video Humor Detection ICMR 2024 + + +
+ The growing importance of multi-modal humor detection within affective computing correlates with the expanding influence of short-form video sharing on social media platforms. In this paper, we propose a novel two-branch hierarchical model for short-form video humor detection (SVHD), named Comment-aided Video-Language Alignment (CVLA) via data-augmented multi-modal contrastive pre-training. Notably, our CVLA not only operates on raw signals across various modal channels but also yields an appropriate multi-modal representation by aligning the video and language components within a consistent semantic space. The experimental results on two humor detection datasets, including DY11k and UR-FUNNY, demonstrate that CVLA dramatically outperforms state-of-the-art and several competitive baseline approaches. Our dataset, code, and model are released at https://github.com/yliu-cs/CVLA.
+
+ comment: Accepted by ICMR 2024 +
+
+
+
+
+ + ♻ ☆ CosalPure: Learning Concept from Group Images for Robust Co-Saliency + Detection CVPR 2024 + + +
+ Co-salient object detection (CoSOD) aims to identify the common and salient +(usually in the foreground) regions across a given group of images. Although +achieving significant progress, state-of-the-art CoSODs could be easily +affected by some adversarial perturbations, leading to substantial accuracy +reduction. The adversarial perturbations can mislead CoSODs but do not change +the high-level semantic information (e.g., concept) of the co-salient objects. +In this paper, we propose a novel robustness enhancement framework by first +learning the concept of the co-salient objects based on the input group images +and then leveraging this concept to purify adversarial perturbations, which are +subsequently fed to CoSODs for robustness enhancement. Specifically, we propose +CosalPure containing two modules, i.e., group-image concept learning and +concept-guided diffusion purification. For the first module, we adopt a +pre-trained text-to-image diffusion model to learn the concept of co-salient +objects within group images where the learned concept is robust to adversarial +examples. For the second module, we map the adversarial image to the latent +space and then perform diffusion generation by embedding the learned concept +into the noise prediction function as an extra condition. Our method can +effectively alleviate the influence of the SOTA adversarial attack containing +different adversarial patterns, including exposure and noise. The extensive +results demonstrate that our method could enhance the robustness of CoSODs +significantly. + +
+
+ comment: This paper is accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ RDFC-GAN: RGB-Depth Fusion CycleGAN for Indoor Depth Completion CVPR 2022 + + +
+ Raw depth images captured in indoor scenarios frequently exhibit extensive +missing values due to the inherent limitations of the sensors and environments. +For example, transparent materials frequently elude detection by depth sensors; +surfaces may introduce measurement inaccuracies due to their polished textures, +extended distances, and oblique incidence angles from the sensor. The presence +of incomplete depth maps imposes significant challenges for subsequent vision +applications, prompting the development of numerous depth completion techniques +to mitigate this problem. Numerous methods excel at reconstructing dense depth +maps from sparse samples, but they often falter when faced with extensive +contiguous regions of missing depth values, a prevalent and critical challenge +in indoor environments. To overcome these challenges, we design a novel +two-branch end-to-end fusion network named RDFC-GAN, which takes a pair of RGB +and incomplete depth images as input to predict a dense and completed depth +map. The first branch employs an encoder-decoder structure, by adhering to the +Manhattan world assumption and utilizing normal maps from RGB-D information as +guidance, to regress the local dense depth values from the raw depth map. The +other branch applies an RGB-depth fusion CycleGAN, adept at translating RGB +imagery into detailed, textured depth maps while ensuring high fidelity through +cycle consistency. We fuse the two branches via adaptive fusion modules named +W-AdaIN and train the model with the help of pseudo depth maps. Comprehensive +evaluations on NYU-Depth V2 and SUN RGB-D datasets show that our method +significantly enhances depth completion performance particularly in realistic +indoor settings. + +
+
+ comment: Haowen Wang and Zhengping Che are with equal contributions. Paper + accepted by IEEE Transactions on Pattern Analysis and Machine Intelligence + (TPAMI). An earlier version has been accepted by CVPR 2022 + (arXiv:2203.10856). arXiv admin note: text overlap with arXiv:2203.10856 +
+
+
+
+
+ + ♻ ☆ HICO-DET-SG and V-COCO-SG: New Data Splits for Evaluating the Systematic + Generalization Performance of Human-Object Interaction Detection Models + + +
+ Human-Object Interaction (HOI) detection is a task to localize humans and +objects in an image and predict the interactions in human-object pairs. In +real-world scenarios, HOI detection models need systematic generalization, +i.e., generalization to novel combinations of objects and interactions, because +the train data are expected to cover a limited portion of all possible +combinations. To evaluate the systematic generalization performance of HOI +detection models, we created two new sets of HOI detection data splits named +HICO-DET-SG and V-COCO-SG based on the HICO-DET and V-COCO datasets, +respectively. When evaluated on the new data splits, HOI detection models with +various characteristics performed much more poorly than when evaluated on the +original splits. This shows that systematic generalization is a challenging +goal in HOI detection. By analyzing the evaluation results, we also gain +insights for improving the systematic generalization performance and identify +four possible future research directions. We hope that our new data splits and +presented analysis will encourage further research on systematic generalization +in HOI detection. + +
+
+ comment: 19 pages, 3 figures, 4 tables +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 167 + +
+
+
+ + ☆ GoMVS: Geometrically Consistent Cost Aggregation for Multi-View Stereo CVPR 2024 + + +
+ Matching cost aggregation plays a fundamental role in learning-based +multi-view stereo networks. However, directly aggregating adjacent costs can +lead to suboptimal results due to local geometric inconsistency. Related +methods either seek selective aggregation or improve aggregated depth in the 2D +space, both are unable to handle geometric inconsistency in the cost volume +effectively. In this paper, we propose GoMVS to aggregate geometrically +consistent costs, yielding better utilization of adjacent geometries. More +specifically, we correspond and propagate adjacent costs to the reference pixel +by leveraging the local geometric smoothness in conjunction with surface +normals. We achieve this by the geometric consistent propagation (GCP) module. +It computes the correspondence from the adjacent depth hypothesis space to the +reference depth space using surface normals, then uses the correspondence to +propagate adjacent costs to the reference geometry, followed by a convolution +for aggregation. Our method achieves new state-of-the-art performance on DTU, +Tanks & Temple, and ETH3D datasets. Notably, our method ranks 1st on the Tanks +& Temple Advanced benchmark. + +
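+ As a rough illustration of normal-guided propagation (the standard plane-induced depth transfer; not necessarily the exact GCP formulation), a depth hypothesis $d_q$ at a neighbouring pixel $q$ with surface normal $\mathbf{n}$ defines a local plane, and the depth it implies at the reference pixel $p$ is
+ $$ d_{q\to p} = \frac{d_q\,\mathbf{n}^{\top} K^{-1}\tilde{q}}{\mathbf{n}^{\top} K^{-1}\tilde{p}}, $$
+ where $K$ is the camera intrinsic matrix and $\tilde{p},\tilde{q}$ are homogeneous pixel coordinates; the cost stored at $(q, d_q)$ can then be aggregated at $p$ under the geometrically consistent hypothesis $d_{q\to p}$ rather than at the raw depth $d_q$.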
+
+ comment: CVPR 2024. Project page: https://wuuu3511.github.io/gomvs/ Code: + https://github.com/Wuuu3511/GoMVS +
+
+
+
+
+ + ☆ Connecting NeRFs, Images, and Text CVPR + + +
+ Neural Radiance Fields (NeRFs) have emerged as a standard framework for +representing 3D scenes and objects, introducing a novel data type for +information exchange and storage. Concurrently, significant progress has been +made in multimodal representation learning for text and image data. This paper +explores a novel research direction that aims to connect the NeRF modality with +other modalities, similar to established methodologies for images and text. To +this end, we propose a simple framework that exploits pre-trained models for +NeRF representations alongside multimodal models for text and image processing. +Our framework learns a bidirectional mapping between NeRF embeddings and those +obtained from corresponding images and text. This mapping unlocks several novel +and useful applications, including NeRF zero-shot classification and NeRF +retrieval from images or text. + +
+
+ comment: Accepted at CVPRW-INRV 2024 +
+
+
+
+
+ + ☆ GoMAvatar: Efficient Animatable Human Modeling from Monocular Video + Using Gaussians-on-Mesh CVPR 2024 + + +
+ We introduce GoMAvatar, a novel approach for real-time, memory-efficient, +high-quality animatable human modeling. GoMAvatar takes as input a single +monocular video to create a digital avatar capable of re-articulation in new +poses and real-time rendering from novel viewpoints, while seamlessly +integrating with rasterization-based graphics pipelines. Central to our method +is the Gaussians-on-Mesh representation, a hybrid 3D model combining rendering +quality and speed of Gaussian splatting with geometry modeling and +compatibility of deformable meshes. We assess GoMAvatar on ZJU-MoCap data and +various YouTube videos. GoMAvatar matches or surpasses current monocular human +modeling algorithms in rendering quality and significantly outperforms them in +computational efficiency (43 FPS) while being memory-efficient (3.63 MB per +subject). + +
+
+ comment: CVPR 2024; project page: https://wenj.github.io/GoMAvatar/ +
+
+
+
+
+ + ☆ OpenBias: Open-set Bias Detection in Text-to-Image Generative Models CVPR 2024 + + +
+ Text-to-image generative models are becoming increasingly popular and +accessible to the general public. As these models see large-scale deployments, +it is necessary to deeply investigate their safety and fairness to not +disseminate and perpetuate any kind of biases. However, existing works focus on +detecting closed sets of biases defined a priori, limiting the studies to +well-known concepts. In this paper, we tackle the challenge of open-set bias +detection in text-to-image generative models presenting OpenBias, a new +pipeline that identifies and quantifies the severity of biases agnostically, +without access to any precompiled set. OpenBias has three stages. In the first +phase, we leverage a Large Language Model (LLM) to propose biases given a set +of captions. Secondly, the target generative model produces images using the +same set of captions. Lastly, a Vision Question Answering model recognizes the +presence and extent of the previously proposed biases. We study the behavior of +Stable Diffusion 1.5, 2, and XL emphasizing new biases, never investigated +before. Via quantitative experiments, we demonstrate that OpenBias agrees with +current closed-set bias detection methods and human judgement. + +
+
+ comment: CVPR 2024 Highlight - Code: + https://github.com/Picsart-AI-Research/OpenBias +
+
+
+
+
+ + ☆ Any2Point: Empowering Any-modality Large Models for Efficient 3D + Understanding + + +
+ Large foundation models have recently emerged as a prominent focus of +interest, attaining superior performance in widespread scenarios. Due to the +scarcity of 3D data, many efforts have been made to adapt pre-trained +transformers from vision to 3D domains. However, such 2D-to-3D approaches are +still limited, due to the potential loss of spatial geometries and high +computation cost. More importantly, their frameworks are mainly designed for 2D +models, lacking a general any-to-3D paradigm. In this paper, we introduce +Any2Point, a parameter-efficient method to empower any-modality large models +(vision, language, audio) for 3D understanding. Given a frozen transformer from +any source modality, we propose a 3D-to-any (1D or 2D) virtual projection +strategy that correlates the input 3D points to the original 1D or 2D positions +within the source modality. This mechanism enables us to assign each 3D token +with a positional encoding paired with the pre-trained model, which avoids 3D +geometry loss caused by the true projection and better motivates the +transformer for 3D learning with 1D/2D positional priors. Then, within each +transformer block, we insert an any-to-3D guided adapter module for +parameter-efficient fine-tuning. The adapter incorporates prior spatial +knowledge from the source modality to guide the local feature aggregation of 3D +tokens, compelling the semantic adaption of any-modality transformers. We +conduct extensive experiments to showcase the effectiveness and efficiency of +our method. Code and models are released at +https://github.com/Ivan-Tang-3D/Any2Point. + +
+
+ comment: Code and models are released at + https://github.com/Ivan-Tang-3D/Any2Point +
+
+
+
+
+ + ☆ QuasiSim: Parameterized Quasi-Physical Simulators for Dexterous + Manipulations Transfer + + +
+ We explore the dexterous manipulation transfer problem by designing
+simulators. The task aims to transfer human manipulations to dexterous robot
+hand simulations and is inherently difficult due to its intricate,
+highly-constrained, and discontinuous dynamics and the need to control a
+dexterous hand with a high number of degrees of freedom (DoFs) to accurately
+replicate human manipulations. Previous approaches that optimize in high-fidelity
+black-box simulators or modified ones with relaxed constraints only demonstrate
+limited capabilities or are restricted by insufficient simulation fidelity. We
+introduce parameterized quasi-physical simulators and a physics curriculum to
+overcome these limitations. The key ideas are 1) balancing between fidelity and
+optimizability of the simulation via a curriculum of parameterized simulators, and
+2) solving the problem in each of the simulators from the curriculum, with
+properties ranging from high task optimizability to high fidelity. We successfully
+enable a dexterous hand to track complex and diverse manipulations in high-fidelity
+simulated environments, boosting the success rate by 11%+ over the
+best-performing baseline. The project website is available at
+https://meowuu7.github.io/QuasiSim/.
+
+
+
+ comment: Project website: https://meowuu7.github.io/QuasiSim/ Code: + https://github.com/Meowuu7/QuasiSim Hugging Face Demo: + https://huggingface.co/spaces/xymeow7/quasi-physical-sims +
+
+
+
+
+ + ☆ ControlNet++: Improving Conditional Controls with Efficient Consistency + Feedback + + +
+ To enhance the controllability of text-to-image diffusion models, existing +efforts like ControlNet incorporated image-based conditional controls. In this +paper, we reveal that existing methods still face significant challenges in +generating images that align with the image conditional controls. To this end, +we propose ControlNet++, a novel approach that improves controllable generation +by explicitly optimizing pixel-level cycle consistency between generated images +and conditional controls. Specifically, for an input conditional control, we +use a pre-trained discriminative reward model to extract the corresponding +condition of the generated images, and then optimize the consistency loss +between the input conditional control and extracted condition. A +straightforward implementation would be generating images from random noises +and then calculating the consistency loss, but such an approach requires +storing gradients for multiple sampling timesteps, leading to considerable time +and memory costs. To address this, we introduce an efficient reward strategy +that deliberately disturbs the input images by adding noise, and then uses the +single-step denoised images for reward fine-tuning. This avoids the extensive +costs associated with image sampling, allowing for more efficient reward +fine-tuning. Extensive experiments show that ControlNet++ significantly +improves controllability under various conditional controls. For example, it +achieves improvements over ControlNet by 7.9% mIoU, 13.4% SSIM, and 7.6% RMSE, +respectively, for segmentation mask, line-art edge, and depth conditions. + +
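+
+ A compact sketch of the efficient reward strategy described above: add noise to the
+real input image, take a single denoising step, run a frozen reward model on the
+result, and penalize disagreement with the input control. The function signatures
+(denoise_one_step, reward_model) and the DDPM-style noising are assumptions.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def reward_consistency_loss(denoise_one_step, reward_model, image, control,
+                            t, alphas_cumprod):
+    """Single-step reward fine-tuning sketch (hypothetical interfaces, not the official code)."""
+    noise = torch.randn_like(image)
+    a_t = alphas_cumprod[t].view(-1, 1, 1, 1)
+    # Disturb the real image instead of sampling from pure noise,
+    # so gradients are needed for only one denoising step.
+    noisy = a_t.sqrt() * image + (1 - a_t).sqrt() * noise
+    denoised = denoise_one_step(noisy, t, control)    # one-step estimate of the clean image
+    predicted_control = reward_model(denoised)        # e.g. a frozen segmentation network
+    # Pixel-level cycle consistency between the input control and the extracted condition.
+    return F.mse_loss(predicted_control, control)
+```
+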
+
+ comment: Project Page: https://liming-ai.github.io/ControlNet_Plus_Plus +
+
+
+
+
+ + ☆ WaveMo: Learning Wavefront Modulations to See Through Scattering + + +
+ Imaging through scattering media is a fundamental and pervasive challenge in +fields ranging from medical diagnostics to astronomy. A promising strategy to +overcome this challenge is wavefront modulation, which induces measurement +diversity during image acquisition. Despite its importance, designing optimal +wavefront modulations to image through scattering remains under-explored. This +paper introduces a novel learning-based framework to address the gap. Our +approach jointly optimizes wavefront modulations and a computationally +lightweight feedforward "proxy" reconstruction network. This network is trained +to recover scenes obscured by scattering, using measurements that are modified +by these modulations. The learned modulations produced by our framework +generalize effectively to unseen scattering scenarios and exhibit remarkable +versatility. During deployment, the learned modulations can be decoupled from +the proxy network to augment other more computationally expensive restoration +algorithms. Through extensive experiments, we demonstrate our approach +significantly advances the state of the art in imaging through scattering +media. Our project webpage is at https://wavemo-2024.github.io/. + +
+
+
+
+
+ + ☆ View Selection for 3D Captioning via Diffusion Ranking + + +
+ Scalable annotation approaches are crucial for constructing extensive 3D-text
+datasets, facilitating a broader range of applications. However, existing
+methods sometimes lead to the generation of hallucinated captions, compromising
+caption quality. This paper explores the issue of hallucination in 3D object
+captioning, with a focus on the Cap3D method, which renders 3D objects into 2D
+views for captioning using pre-trained models. We pinpoint a major challenge:
+certain rendered views of 3D objects are atypical, deviating from the training
+data of standard image captioning models and causing hallucinations. To tackle
+this, we present DiffuRank, a method that leverages a pre-trained text-to-3D
+model to assess the alignment between 3D objects and their 2D rendered views,
+where views with high alignment closely represent the object's
+characteristics. By ranking all rendered views and feeding the top-ranked ones
+into GPT4-Vision, we enhance the accuracy and detail of captions, enabling the
+correction of 200k captions in the Cap3D dataset and extending it to 1 million
+captions across the Objaverse and Objaverse-XL datasets. Additionally, we showcase
+the adaptability of DiffuRank by applying it to pre-trained text-to-image
+models for a Visual Question Answering task, where it outperforms the CLIP
+model.
+
+
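+
+ The ranking step reduces to scoring each rendered view with an alignment model and
+keeping the top-k for captioning. The sketch below uses placeholder alignment_score
+and caption_model callables; the actual scorer in the paper is a pre-trained
+text-to-3D diffusion model.
+
+```python
+def caption_with_top_views(views, alignment_score, caption_model, k=6):
+    """Rank rendered views by 3D-2D alignment and caption only the best ones (sketch)."""
+    # alignment_score(view) is assumed to return a scalar alignment estimate.
+    ranked = sorted(views, key=alignment_score, reverse=True)
+    top_views = ranked[:k]           # atypical, hallucination-prone views are dropped
+    return caption_model(top_views)  # e.g. a multi-image captioner such as GPT4-Vision
+```
+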
+
+ comment: Dataset link: https://huggingface.co/datasets/tiange/Cap3D +
+
+
+
+
+ + ☆ Two Effects, One Trigger: On the Modality Gap, Object Bias, and + Information Imbalance in Contrastive Vision-Language Representation Learning + + +
+ Contrastive vision-language models like CLIP have gained popularity for their
+versatile, widely applicable learned representations in various downstream tasks.
+Despite their successes in some tasks, like zero-shot image recognition, they
+also perform surprisingly poorly on other tasks, like attribute detection.
+Previous work has attributed these challenges to the modality gap, a separation
+of image and text in the shared representation space, and a bias towards
+objects over other factors, such as attributes. In this work, we investigate
+both phenomena. We find that only a few embedding dimensions drive the modality
+gap. Further, we propose a measure for object bias and find that object bias
+does not lead to worse performance on other concepts, such as attributes. But
+what leads to the emergence of the modality gap and object bias? To answer this
+question, we carefully designed an experimental setting which allows us to
+control the amount of shared information between the modalities. This revealed
+that the driving factor behind both the modality gap and the object bias is
+the information imbalance between images and captions.
+
+
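+
+ A small sketch of how one can measure the modality gap and identify the few
+embedding dimensions driving it, using the common centroid-distance definition; the
+paper's exact measures may differ, so treat this as an illustrative assumption.
+
+```python
+import torch
+
+def modality_gap(image_emb: torch.Tensor, text_emb: torch.Tensor, top_k: int = 5):
+    """Centroid-based modality gap and the dimensions contributing most to it (sketch)."""
+    img = torch.nn.functional.normalize(image_emb, dim=-1)
+    txt = torch.nn.functional.normalize(text_emb, dim=-1)
+    diff = img.mean(dim=0) - txt.mean(dim=0)      # gap vector between modality centroids
+    gap = diff.norm().item()
+    # Dimensions with the largest per-dimension contribution to the gap.
+    top_dims = diff.abs().topk(top_k).indices.tolist()
+    return gap, top_dims
+
+gap, dims = modality_gap(torch.randn(1000, 512), torch.randn(1000, 512))
+print(f"gap={gap:.3f}, dominant dims={dims}")
+```
+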
+
+
+
+
+ + ☆ Gaga: Group Any Gaussians via 3D-aware Memory Bank + + +
+ We introduce Gaga, a framework that reconstructs and segments open-world 3D +scenes by leveraging inconsistent 2D masks predicted by zero-shot segmentation +models. Contrasted to prior 3D scene segmentation approaches that heavily rely +on video object tracking, Gaga utilizes spatial information and effectively +associates object masks across diverse camera poses. By eliminating the +assumption of continuous view changes in training images, Gaga demonstrates +robustness to variations in camera poses, particularly beneficial for sparsely +sampled images, ensuring precise mask label consistency. Furthermore, Gaga +accommodates 2D segmentation masks from diverse sources and demonstrates robust +performance with different open-world zero-shot segmentation models, enhancing +its versatility. Extensive qualitative and quantitative evaluations demonstrate +that Gaga performs favorably against state-of-the-art methods, emphasizing its +potential for real-world applications such as scene understanding and +manipulation. + +
+
+ comment: Project Page: https://www.gaga.gallery +
+
+
+
+
+ + ☆ Self-supervised Dataset Distillation: A Good Compression Is All You Need + + +
+ Dataset distillation aims to compress information from a large-scale original
+dataset to a new compact dataset while striving to preserve the utmost degree
+of the original data's informational essence. Previous studies have predominantly
+concentrated on aligning the intermediate statistics between the original and
+distilled data, such as weight trajectory, features, gradient, BatchNorm, etc.
+In this work, we consider addressing this task through the new lens of model
+informativeness in the compression stage on the original dataset pretraining.
+We observe that with the prior state-of-the-art SRe$^2$L, as model sizes
+increase, it becomes increasingly challenging for supervised pretrained models
+to recover learned information during data synthesis, as the channel-wise mean
+and variance inside the model become flatter and less informative. We further
+notice that larger variances in BN statistics from self-supervised models
+enable larger loss signals to update the recovered data by gradients, enjoying
+more informativeness during synthesis. Building on this observation, we
+introduce SC-DD, a simple yet effective Self-supervised Compression framework
+for Dataset Distillation that facilitates diverse information compression and
+recovery compared to traditional supervised learning schemes, and further reaps
+the potential of large pretrained models with enhanced capabilities. Extensive
+experiments are conducted on CIFAR-100, Tiny-ImageNet and ImageNet-1K datasets
+to demonstrate the superiority of our proposed approach. The proposed SC-DD
+outperforms all previous state-of-the-art supervised dataset distillation
+methods when employing larger models, such as SRe$^2$L, MTT, TESLA, DC, CAFE,
+etc., by large margins under the same recovery and post-training budgets. Code
+is available at https://github.com/VILA-Lab/SRe2L/tree/main/SCDD/.
+
+
+
+
+
+
+ + ☆ Ferret-v2: An Improved Baseline for Referring and Grounding with Large + Language Models + + +
+ While Ferret seamlessly integrates regional understanding into the Large +Language Model (LLM) to facilitate its referring and grounding capability, it +poses certain limitations: constrained by the pre-trained fixed visual encoder +and failed to perform well on broader tasks. In this work, we unveil Ferret-v2, +a significant upgrade to Ferret, with three key designs. (1) Any resolution +grounding and referring: A flexible approach that effortlessly handles higher +image resolution, improving the model's ability to process and understand +images in greater detail. (2) Multi-granularity visual encoding: By integrating +the additional DINOv2 encoder, the model learns better and diverse underlying +contexts for global and fine-grained visual information. (3) A three-stage +training paradigm: Besides image-caption alignment, an additional stage is +proposed for high-resolution dense alignment before the final instruction +tuning. Experiments show that Ferret-v2 provides substantial improvements over +Ferret and other state-of-the-art methods, thanks to its high-resolution +scaling and fine-grained visual processing. + +
+
+ comment: Preprint. 14 pages, 4 figures +
+
+
+
+
+ + ☆ Taming Stable Diffusion for Text to 360° Panorama Image Generation CVPR 2024 + + +
+ Generative models, e.g., Stable Diffusion, have enabled the creation of +photorealistic images from text prompts. Yet, the generation of 360-degree +panorama images from text remains a challenge, particularly due to the dearth +of paired text-panorama data and the domain gap between panorama and +perspective images. In this paper, we introduce a novel dual-branch diffusion +model named PanFusion to generate a 360-degree image from a text prompt. We +leverage the stable diffusion model as one branch to provide prior knowledge in +natural image generation and register it to another panorama branch for +holistic image generation. We propose a unique cross-attention mechanism with +projection awareness to minimize distortion during the collaborative denoising +process. Our experiments validate that PanFusion surpasses existing methods +and, thanks to its dual-branch structure, can integrate additional constraints +like room layout for customized panorama outputs. Code is available at +https://chengzhag.github.io/publication/panfusion. + +
+
+ comment: CVPR 2024. Project Page: + https://chengzhag.github.io/publication/panfusion Code: + https://github.com/chengzhag/PanFusion +
+
+
+
+
+ + ☆ Boosting Self-Supervision for Single-View Scene Completion via Knowledge + Distillation + + +
+ Inferring scene geometry from images via Structure from Motion is a
+long-standing and fundamental problem in computer vision. While classical
+approaches and, more recently, depth map predictions only focus on the visible
+parts of a scene, the task of scene completion aims to reason about geometry
+even in occluded regions. With the popularity of neural radiance fields
+(NeRFs), implicit representations also became popular for scene completion by
+predicting so-called density fields. Unlike explicit approaches, e.g.,
+voxel-based methods, density fields also allow for accurate depth prediction
+and novel-view synthesis via image-based rendering. In this work, we propose to
+fuse the scene reconstruction from multiple images and distill this knowledge
+into a more accurate single-view scene reconstruction. To this end, we propose
+Multi-View Behind the Scenes (MVBTS) to fuse density fields from multiple posed
+images, trained fully self-supervised only from image data. Using knowledge
+distillation, we use MVBTS to train a single-view scene completion network,
+called KDBTS, via direct supervision. It achieves state-of-the-art performance
+on occupancy prediction, especially in occluded regions.
+
+
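+
+ The distillation step amounts to supervising a single-view density-field network
+with the fused multi-view prediction. A minimal sketch follows; the mvbts and
+student callables and the MSE objective are assumptions for illustration.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def distillation_loss(student, mvbts, single_image, multi_view_images, query_points):
+    """Distill fused multi-view density into a single-view model (illustrative only)."""
+    with torch.no_grad():
+        # Teacher: density fused from several posed images (MVBTS in the abstract).
+        teacher_density = mvbts(multi_view_images, query_points)
+    # Student: density predicted from one image only (KDBTS in the abstract).
+    student_density = student(single_image, query_points)
+    return F.mse_loss(student_density, teacher_density)
+```
+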
+
+
+
+
+ + ☆ FusionMamba: Efficient Image Fusion with State Space Model + + +
+ Image fusion aims to generate a high-resolution multi/hyper-spectral image by +combining a high-resolution image with limited spectral information and a +low-resolution image with abundant spectral data. Current deep learning +(DL)-based methods for image fusion primarily rely on CNNs or Transformers to +extract features and merge different types of data. While CNNs are efficient, +their receptive fields are limited, restricting their capacity to capture +global context. Conversely, Transformers excel at learning global information +but are hindered by their quadratic complexity. Fortunately, recent +advancements in the State Space Model (SSM), particularly Mamba, offer a +promising solution to this issue by enabling global awareness with linear +complexity. However, there have been few attempts to explore the potential of +SSM in information fusion, which is a crucial ability in domains like image +fusion. Therefore, we propose FusionMamba, an innovative method for efficient +image fusion. Our contributions mainly focus on two aspects. Firstly, +recognizing that images from different sources possess distinct properties, we +incorporate Mamba blocks into two U-shaped networks, presenting a novel +architecture that extracts spatial and spectral features in an efficient, +independent, and hierarchical manner. Secondly, to effectively combine spatial +and spectral information, we extend the Mamba block to accommodate dual inputs. +This expansion leads to the creation of a new module called the FusionMamba +block, which outperforms existing fusion techniques such as concatenation and +cross-attention. To validate FusionMamba's effectiveness, we conduct a series +of experiments on five datasets related to three image fusion tasks. The +quantitative and qualitative evaluation results demonstrate that our method +achieves state-of-the-art (SOTA) performance, underscoring the superiority of +FusionMamba. + +
+
+
+
+
+ + ☆ Parameter Hierarchical Optimization for Visible-Infrared Person + Re-Identification + + +
+ Visible-infrared person re-identification (VI-ReID) aims at matching
+cross-modality pedestrian images captured by disjoint visible or infrared
+cameras. Existing methods alleviate the cross-modality discrepancies by
+designing different kinds of network architectures. Different from available
+methods, in this paper, we propose a novel parameter optimizing paradigm, the
+parameter hierarchical optimization (PHO) method, for the task of VI-ReID. It
+allows part of the parameters to be directly optimized without any training, which
+narrows the search space of parameters and makes the whole network easier
+to train. Specifically, we first divide the parameters into different
+types, and then introduce a self-adaptive alignment strategy (SAS) to
+automatically align the visible and infrared images through transformation.
+Considering that features in different dimensions have varying importance, we
+develop an auto-weighted alignment learning (AAL) module that can automatically
+weight features according to their importance. Importantly, in the alignment
+process of SAS and AAL, all the parameters are immediately optimized with
+optimization principles rather than by training the whole network, which yields a
+better parameter training manner. Furthermore, we establish the cross-modality
+consistent learning (CCL) loss to extract discriminative person representations
+with translation consistency. We provide both theoretical justification and
+empirical evidence that our proposed PHO method outperforms existing VI-ReID
+approaches.
+
+
+
+
+
+
+ + ☆ LaVy: Vietnamese Multimodal Large Language Model + + +
+ Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs)
+have taken the world by storm with impressive abilities in complex reasoning
+and linguistic comprehension. While there is a plethora of works related to
+Vietnamese Large Language Models, the lack of high-quality multimodal
+resources limits the progress of Vietnamese MLLMs. In this paper, we
+pioneer in addressing this by introducing LaVy, a state-of-the-art Vietnamese
+MLLM, and we also introduce the LaVy-Bench benchmark designed for evaluating
+MLLMs' understanding of Vietnamese visual language tasks. All code and model
+weights are public at https://github.com/baochi0212/LaVy
+
+
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Context-aware Video Anomaly Detection in Long-Term Datasets + + +
+ Video anomaly detection research is generally evaluated on short, isolated +benchmark videos only a few minutes long. However, in real-world environments, +security cameras observe the same scene for months or years at a time, and the +notion of anomalous behavior critically depends on context, such as the time of +day, day of week, or schedule of events. Here, we propose a context-aware video +anomaly detection algorithm, Trinity, specifically targeted to these scenarios. +Trinity is especially well-suited to crowded scenes in which individuals cannot +be easily tracked, and anomalies are due to speed, direction, or absence of +group motion. Trinity is a contrastive learning framework that aims to learn +alignments between context, appearance, and motion, and uses alignment quality +to classify videos as normal or anomalous. We evaluate our algorithm on both +conventional benchmarks and a public webcam-based dataset we collected that +spans more than three months of activity. + +
+
+
+
+
+ + ☆ The Power of Properties: Uncovering the Influential Factors in Emotion + Classification ICPR + + +
+ Facial expression-based human emotion recognition is a critical research area +in psychology and medicine. State-of-the-art classification performance is only +reached by end-to-end trained neural networks. Nevertheless, such black-box +models lack transparency in their decision-making processes, prompting efforts +to ascertain the rules that underlie classifiers' decisions. Analyzing single +inputs alone fails to expose systematic learned biases. These biases can be +characterized as facial properties summarizing abstract information like age or +medical conditions. Therefore, understanding a model's prediction behavior +requires an analysis rooted in causality along such selected properties. We +demonstrate that up to 91.25% of classifier output behavior changes are +statistically significant concerning basic properties. Among those are age, +gender, and facial symmetry. Furthermore, the medical usage of surface +electromyography significantly influences emotion prediction. We introduce a +workflow to evaluate explicit properties and their impact. These insights might +help medical professionals select and apply classifiers regarding their +specialized data and properties. + +
+
+ comment: 8 pages, 3 tables, 1 figure, accepted at ICPRAI 2024 +
+
+
+
+
+ + ☆ Resolve Domain Conflicts for Generalizable Remote Physiological + Measurement ACM MM 2023 + + +
+ Remote photoplethysmography (rPPG) technology has become increasingly popular +due to its non-invasive monitoring of various physiological indicators, making +it widely applicable in multimedia interaction, healthcare, and emotion +analysis. Existing rPPG methods utilize multiple datasets for training to +enhance the generalizability of models. However, they often overlook the +underlying conflict issues across different datasets, such as (1) label +conflict resulting from different phase delays between physiological signal +labels and face videos at the instance level, and (2) attribute conflict +stemming from distribution shifts caused by head movements, illumination +changes, skin types, etc. To address this, we introduce the DOmain-HArmonious +framework (DOHA). Specifically, we first propose a harmonious phase strategy to +eliminate uncertain phase delays and preserve the temporal variation of +physiological signals. Next, we design a harmonious hyperplane optimization +that reduces irrelevant attribute shifts and encourages the model's +optimization towards a global solution that fits more valid scenarios. Our +experiments demonstrate that DOHA significantly improves the performance of +existing methods under multiple protocols. Our code is available at +https://github.com/SWY666/rPPG-DOHA. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ MindBridge: A Cross-Subject Brain Decoding Framework CVPR 2024 + + +
+ Brain decoding, a pivotal field in neuroscience, aims to reconstruct stimuli +from acquired brain signals, primarily utilizing functional magnetic resonance +imaging (fMRI). Currently, brain decoding is confined to a +per-subject-per-model paradigm, limiting its applicability to the same +individual for whom the decoding model is trained. This constraint stems from +three key challenges: 1) the inherent variability in input dimensions across +subjects due to differences in brain size; 2) the unique intrinsic neural +patterns, influencing how different individuals perceive and process sensory +information; 3) limited data availability for new subjects in real-world +scenarios hampers the performance of decoding models. In this paper, we present +a novel approach, MindBridge, that achieves cross-subject brain decoding by +employing only one model. Our proposed framework establishes a generic paradigm +capable of addressing these challenges by introducing biological-inspired +aggregation function and novel cyclic fMRI reconstruction mechanism for +subject-invariant representation learning. Notably, by cycle reconstruction of +fMRI, MindBridge can enable novel fMRI synthesis, which also can serve as +pseudo data augmentation. Within the framework, we also devise a novel +reset-tuning method for adapting a pretrained model to a new subject. +Experimental results demonstrate MindBridge's ability to reconstruct images for +multiple subjects, which is competitive with dedicated subject-specific models. +Furthermore, with limited data for a new subject, we achieve a high level of +decoding accuracy, surpassing that of subject-specific models. This advancement +in cross-subject brain decoding suggests promising directions for wider +applications in neuroscience and indicates potential for more efficient +utilization of limited fMRI data in real-world scenarios. Project page: +https://littlepure2333.github.io/MindBridge + +
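+
+ One way to realize a subject-agnostic input layer for fMRI vectors of varying
+length is adaptive pooling to a fixed size before a shared encoder. The sketch below
+is an assumption about the "aggregation function" idea in general terms, not the
+authors' implementation; dimensions are illustrative.
+
+```python
+import torch
+import torch.nn as nn
+
+class SubjectAgnosticAggregator(nn.Module):
+    """Pools variable-length fMRI signals to a fixed dimension (illustrative)."""
+    def __init__(self, target_dim: int = 8192, out_dim: int = 1024):
+        super().__init__()
+        self.pool = nn.AdaptiveAvgPool1d(target_dim)   # handles different voxel counts
+        self.proj = nn.Linear(target_dim, out_dim)
+
+    def forward(self, fmri: torch.Tensor) -> torch.Tensor:
+        # fmri: (batch, num_voxels), where num_voxels differs across subjects.
+        pooled = self.pool(fmri.unsqueeze(1)).squeeze(1)
+        return self.proj(pooled)
+
+agg = SubjectAgnosticAggregator()
+print(agg(torch.randn(2, 15724)).shape)   # subject A
+print(agg(torch.randn(2, 9841)).shape)    # subject B -> same output shape
+```
+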
+
+ comment: CVPR 2024 highlight. Code is available at + https://github.com/littlepure2333/MindBridge +
+
+
+
+
+ + ☆ Fuss-Free Network: A Simplified and Efficient Neural Network for Crowd + Counting + + +
+ In the field of crowd-counting research, many recent deep learning based
+methods have demonstrated robust capabilities for accurately estimating crowd
+sizes. However, the enhancement in their performance often arises from an
+increase in the complexity of the model structure. This paper introduces the
+Fuss-Free Network (FFNet), a crowd counting deep learning model that is
+characterized by its simplicity and efficiency in terms of its structure. The
+model comprises only a backbone of a neural network and a multi-scale feature
+fusion structure. The multi-scale feature fusion structure is a simple
+architecture consisting of three branches, each only equipped with a focus
+transition module, and combines the features from these branches through the
+concatenation operation. Our proposed crowd counting model is trained and
+evaluated on four widely used public datasets, and it achieves accuracy that is
+comparable to that of existing complex models. The experimental results further
+indicate that excellent performance in crowd counting tasks can also be
+achieved by utilizing a simple, low-parameter, and computationally efficient
+neural network structure.
+
+
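+
+ The described structure (a backbone, three branches each with a focus transition
+module, and concatenation-based fusion) maps to a very small PyTorch model. The
+layer choices in the sketch below are assumptions for illustration, not the FFNet
+reference implementation.
+
+```python
+import torch
+import torch.nn as nn
+
+class FocusTransition(nn.Module):
+    """Illustrative stand-in for the focus transition module in each branch."""
+    def __init__(self, in_ch, out_ch):
+        super().__init__()
+        self.block = nn.Sequential(
+            nn.Conv2d(in_ch, out_ch, 3, padding=1),
+            nn.BatchNorm2d(out_ch),
+            nn.ReLU(inplace=True),
+        )
+
+    def forward(self, x):
+        return self.block(x)
+
+class SimpleCrowdCounter(nn.Module):
+    """Backbone features at three scales, concatenated and regressed to a density map."""
+    def __init__(self, channels=(128, 256, 512), mid=64):
+        super().__init__()
+        self.branches = nn.ModuleList(FocusTransition(c, mid) for c in channels)
+        self.head = nn.Conv2d(mid * len(channels), 1, 1)
+
+    def forward(self, feats):
+        # feats: list of backbone feature maps; resize to a common resolution and fuse.
+        target = feats[0].shape[-2:]
+        outs = [nn.functional.interpolate(b(f), size=target, mode="bilinear",
+                                          align_corners=False)
+                for b, f in zip(self.branches, feats)]
+        return self.head(torch.cat(outs, dim=1))
+
+model = SimpleCrowdCounter()
+f = [torch.randn(1, 128, 64, 64), torch.randn(1, 256, 32, 32), torch.randn(1, 512, 16, 16)]
+print(model(f).shape)  # (1, 1, 64, 64); the predicted count is the density map's sum
+```
+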
+
+
+
+
+ + ☆ TBSN: Transformer-Based Blind-Spot Network for Self-Supervised Image + Denoising + + +
+ Blind-spot networks (BSN) have been prevalent network architectures in +self-supervised image denoising (SSID). Existing BSNs are mostly conducted with +convolution layers. Although transformers offer potential solutions to the +limitations of convolutions and have demonstrated success in various image +restoration tasks, their attention mechanisms may violate the blind-spot +requirement, thus restricting their applicability in SSID. In this paper, we +present a transformer-based blind-spot network (TBSN) by analyzing and +redesigning the transformer operators that meet the blind-spot requirement. +Specifically, TBSN follows the architectural principles of dilated BSNs, and +incorporates spatial as well as channel self-attention layers to enhance the +network capability. For spatial self-attention, an elaborate mask is applied to +the attention matrix to restrict its receptive field, thus mimicking the +dilated convolution. For channel self-attention, we observe that it may leak +the blind-spot information when the channel number is greater than spatial size +in the deep layers of multi-scale architectures. To eliminate this effect, we +divide the channel into several groups and perform channel attention +separately. Furthermore, we introduce a knowledge distillation strategy that +distills TBSN into smaller denoisers to improve computational efficiency while +maintaining performance. Extensive experiments on real-world image denoising +datasets show that TBSN largely extends the receptive field and exhibits +favorable performance against state-of-the-art SSID methods. The code and +pre-trained models will be publicly available at +https://github.com/nagejacob/TBSN. + +
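+
+ The key constraint is that spatial attention must not let a pixel attend to its own
+blind-spot neighbourhood. The sketch below masks a single-head attention matrix so
+each query only sees positions outside a small window around it, roughly mimicking
+the dilated-convolution receptive field described above; the details are my
+assumption, not the TBSN code.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def blind_spot_attention(x: torch.Tensor, blind: int = 1) -> torch.Tensor:
+    """Spatial self-attention where each pixel cannot attend to its own
+    (2*blind+1)^2 neighbourhood, approximating a blind-spot receptive field (sketch)."""
+    b, c, h, w = x.shape
+    q = k = v = x.flatten(2).transpose(1, 2)                 # (b, h*w, c)
+    attn = (q @ k.transpose(1, 2)) / c ** 0.5                # (b, hw, hw)
+    ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
+    coords = torch.stack([ys.flatten(), xs.flatten()], dim=1)        # (hw, 2)
+    dist = (coords[:, None, :] - coords[None, :, :]).abs().amax(-1)  # Chebyshev distance
+    attn = attn.masked_fill(dist <= blind, float("-inf"))    # forbid the blind-spot window
+    out = F.softmax(attn, dim=-1) @ v                         # (b, hw, c)
+    return out.transpose(1, 2).reshape(b, c, h, w)
+
+print(blind_spot_attention(torch.randn(1, 16, 8, 8)).shape)
+```
+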
+
+
+
+
+ + ☆ Streamlined Photoacoustic Image Processing with Foundation Models: A + Training-Free Solution + + +
+ Foundation models have rapidly evolved and have achieved significant +accomplishments in computer vision tasks. Specifically, the prompt mechanism +conveniently allows users to integrate image prior information into the model, +making it possible to apply models without any training. Therefore, we propose +a method based on foundation models and zero training to solve the tasks of +photoacoustic (PA) image segmentation. We employed the segment anything model +(SAM) by setting simple prompts and integrating the model's outputs with prior +knowledge of the imaged objects to accomplish various tasks, including: (1) +removing the skin signal in three-dimensional PA image rendering; (2) dual +speed-of-sound reconstruction, and (3) segmentation of finger blood vessels. +Through these demonstrations, we have concluded that deep learning can be +directly applied in PA imaging without the requirement for network design and +training. This potentially allows for a hands-on, convenient approach to +achieving efficient and accurate segmentation of PA images. This letter serves +as a comprehensive tutorial, facilitating the mastery of the technique through +the provision of code and sample datasets. + +
+
+
+
+
+ + ☆ Heron-Bench: A Benchmark for Evaluating Vision Language Models in + Japanese + + +
+ Vision Language Models (VLMs) have undergone a rapid evolution, giving rise
+to significant advancements in the realm of multimodal understanding tasks.
+However, the majority of these models are trained and evaluated on
+English-centric datasets, leaving a gap in the development and evaluation of
+VLMs for other languages, such as Japanese. This gap can be attributed to the
+lack of methodologies for constructing VLMs and the absence of benchmarks to
+accurately measure their performance. To address this issue, we introduce a
+novel benchmark, Japanese Heron-Bench, for evaluating Japanese capabilities of
+VLMs. The Japanese Heron-Bench consists of a variety of image-question-answer
+pairs tailored to the Japanese context. Additionally, we present a baseline
+Japanese VLM that has been trained with Japanese visual instruction tuning
+datasets. Our Heron-Bench reveals the strengths and limitations of the proposed
+VLM across various ability dimensions. Furthermore, we clarify the capability
+gap between strong closed models like GPT-4V and the baseline model, providing
+valuable insights for future research in this domain. We release the benchmark
+dataset and training code to facilitate further developments in Japanese VLM
+research.
+
+
+
+
+
+
+ + ☆ Sparse Laneformer + + +
+ Lane detection is a fundamental task in autonomous driving, and has achieved +great progress as deep learning emerges. Previous anchor-based methods often +design dense anchors, which highly depend on the training dataset and remain +fixed during inference. We analyze that dense anchors are not necessary for +lane detection, and propose a transformer-based lane detection framework based +on a sparse anchor mechanism. To this end, we generate sparse anchors with +position-aware lane queries and angle queries instead of traditional explicit +anchors. We adopt Horizontal Perceptual Attention (HPA) to aggregate the lane +features along the horizontal direction, and adopt Lane-Angle Cross Attention +(LACA) to perform interactions between lane queries and angle queries. We also +propose Lane Perceptual Attention (LPA) based on deformable cross attention to +further refine the lane predictions. Our method, named Sparse Laneformer, is +easy-to-implement and end-to-end trainable. Extensive experiments demonstrate +that Sparse Laneformer performs favorably against the state-of-the-art methods, +e.g., surpassing Laneformer by 3.0% F1 score and O2SFormer by 0.7% F1 score +with fewer MACs on CULane with the same ResNet-34 backbone. + +
+
+
+
+
+ + ☆ Voice-Assisted Real-Time Traffic Sign Recognition System Using + Convolutional Neural Network + + +
+ Traffic signs are important in communicating information to drivers. Thus, +comprehension of traffic signs is essential for road safety and ignorance may +result in road accidents. Traffic sign detection has been a research spotlight +over the past few decades. Real-time and accurate detections are the +preliminaries of robust traffic sign detection system which is yet to be +achieved. This study presents a voice-assisted real-time traffic sign +recognition system which is capable of assisting drivers. This system functions +under two subsystems. Initially, the detection and recognition of the traffic +signs are carried out using a trained Convolutional Neural Network (CNN). After +recognizing the specific traffic sign, it is narrated to the driver as a voice +message using a text-to-speech engine. An efficient CNN model for a benchmark +dataset is developed for real-time detection and recognition using Deep +Learning techniques. The advantage of this system is that even if the driver +misses a traffic sign, or does not look at the traffic sign, or is unable to +comprehend the sign, the system detects it and narrates it to the driver. A +system of this type is also important in the development of autonomous +vehicles. + +
+
+
+
+
+ + ☆ DGMamba: Domain Generalization via Generalized State Space Model + + +
+ Domain generalization (DG) aims at solving distribution shift problems in
+various scenes. Existing approaches are based on Convolution Neural Networks
+(CNNs) or Vision Transformers (ViTs), which suffer from limited receptive
+fields or quadratic complexity issues. Mamba, as an emerging state space
+model (SSM), possesses superior linear complexity and global receptive fields.
+Despite this, it can hardly be applied to DG to address distribution shifts,
+due to the hidden state issues and inappropriate scan mechanisms. In this
+paper, we propose a novel framework for DG, named DGMamba, that excels in
+strong generalizability toward unseen domains and meanwhile has the advantages
+of global receptive fields and efficient linear complexity. Our DGMamba
+comprises two core components: Hidden State Suppressing (HSS) and
+Semantic-aware Patch refining (SPR). In particular, HSS is introduced to
+mitigate the influence of hidden states associated with domain-specific
+features during output prediction. SPR strives to encourage the model to
+concentrate more on objects rather than context, consisting of two designs:
+Prior-Free Scanning (PFS) and Domain Context Interchange (DCI). Concretely,
+PFS aims to shuffle the non-semantic patches within images, creating more
+flexible and effective sequences from images, and DCI is designed to regularize
+Mamba with the combination of mismatched non-semantic and semantic information
+by fusing patches among domains. Extensive experiments on four commonly used DG
+benchmarks demonstrate that the proposed DGMamba achieves remarkably superior
+results to state-of-the-art models. The code will be made publicly available.
+
+
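+
+ Prior-Free Scanning, as described, boils down to shuffling non-semantic patch
+tokens before they enter the sequence model. The sketch below shuffles tokens
+outside a given "semantic" mask; how that mask is obtained, and the exact scan
+construction, are left to the paper, so this is only an illustrative assumption.
+
+```python
+import torch
+
+def prior_free_scan(tokens: torch.Tensor, semantic_mask: torch.Tensor) -> torch.Tensor:
+    """Shuffle non-semantic patch tokens within each sample (illustrative sketch).
+
+    tokens: (batch, num_patches, dim); semantic_mask: (batch, num_patches) bool,
+    True where a patch is considered semantic (object) and keeps its position.
+    """
+    out = tokens.clone()
+    for b in range(tokens.shape[0]):
+        idx = torch.nonzero(~semantic_mask[b], as_tuple=False).squeeze(1)
+        if idx.numel() > 1:
+            perm = idx[torch.randperm(idx.numel())]
+            out[b, idx] = tokens[b, perm]       # permute only the context patches
+    return out
+
+tok = torch.randn(2, 196, 384)
+mask = torch.rand(2, 196) > 0.5
+print(prior_free_scan(tok, mask).shape)
+```
+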
+
+
+
+
+ + ☆ VIFNet: An End-to-end Visible-Infrared Fusion Network for Image Dehazing + + +
+ Image dehazing poses significant challenges in environmental perception.
+Recent research mainly focuses on single-modality deep learning-based methods,
+which may result in severe information loss, especially in
+dense-haze scenarios. The infrared image exhibits robustness to haze;
+however, existing methods have primarily treated the infrared modality as
+auxiliary information, failing to fully explore its rich information for
+dehazing. To address this challenge, the key insight of this study is to design
+a visible-infrared fusion network for image dehazing. In particular, we propose
+a multi-scale Deep Structure Feature Extraction (DSFE) module, which
+incorporates the Channel-Pixel Attention Block (CPAB) to restore more spatial
+and marginal information within the deep structural features. Additionally, we
+introduce an inconsistency weighted fusion strategy to merge the two modalities
+by leveraging the more reliable information. To validate this, we construct a
+visible-infrared multimodal dataset called AirSim-VID based on the AirSim
+simulation platform. Extensive experiments performed on challenging real and
+simulated image datasets demonstrate that VIFNet can outperform many
+state-of-the-art competing methods. The code and dataset are available at
+https://github.com/mengyu212/VIFNet_dehazing.
+
+
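+
+ One plausible reading of an inconsistency-weighted fusion is a per-pixel blend of
+visible and infrared features in which the auxiliary branch is down-weighted where
+the two modalities disagree. The sketch below is that reading, not the exact VIFNet
+formulation.
+
+```python
+import torch
+
+def inconsistency_weighted_fusion(vis_feat, ir_feat, scale: float = 1.0):
+    """Fuse visible and infrared features, reducing the infrared contribution where
+    the two modalities disagree (illustrative sketch)."""
+    # Per-pixel inconsistency between the two modalities.
+    inconsistency = (vis_feat - ir_feat).abs().mean(dim=1, keepdim=True)
+    # High inconsistency -> small weight for the auxiliary (infrared) branch.
+    w_ir = torch.exp(-scale * inconsistency)
+    return (1.0 - 0.5 * w_ir) * vis_feat + 0.5 * w_ir * ir_feat
+
+print(inconsistency_weighted_fusion(torch.randn(1, 64, 32, 32),
+                                    torch.randn(1, 64, 32, 32)).shape)
+```
+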
+
+
+
+
+ + ☆ AUG: A New Dataset and An Efficient Model for Aerial Image Urban Scene + Graph Generation + + +
+ Scene graph generation (SGG) aims to understand the visual objects and their
+semantic relationships from one given image. Until now, many SGG datasets
+with the eye-level view have been released, but SGG datasets with the overhead view
+are scarcely studied. In contrast to the object occlusion problem in the
+eye-level view, which impedes SGG, the overhead view provides a new
+perspective that helps to promote SGG by providing a clear perception of
+the spatial relationships of objects in the ground scene. To fill in the gap of
+the overhead view dataset, this paper constructs and releases an aerial image
+urban scene graph generation (AUG) dataset. Images from the AUG dataset are
+captured with a low-altitude overhead view. In the AUG dataset, 25,594
+objects, 16,970 relationships, and 27,175 attributes are manually annotated. To
+avoid the local context being overwhelmed in the complex aerial urban scene,
+this paper proposes a new locality-preserving graph convolutional network
+(LPG). Different from the traditional graph convolutional network, which has
+the natural advantage of capturing the global context for SGG, the
+convolutional layer in the LPG integrates the non-destructive initial features
+of the objects with dynamically updated neighborhood information to preserve
+the local context under the premise of mining the global context. To address
+the problem that there exists an extra-large number of potential object
+relationship pairs but only a small part of them is meaningful in AUG, we
+propose the adaptive bounding box scaling factor for potential relationship
+detection (ABS-PRD) to intelligently prune the meaningless relationship pairs.
+Extensive experiments on the AUG dataset show that our LPG can significantly
+outperform the state-of-the-art methods and confirm the effectiveness of the
+proposed locality-preserving strategy.
+
+
+
+
+
+
+ + ☆ PRAM: Place Recognition Anywhere Model for Efficient Visual Localization + + +
+ Humans localize themselves efficiently in known environments by first +recognizing landmarks defined on certain objects and their spatial +relationships, and then verifying the location by aligning detailed structures +of recognized objects with those in the memory. Inspired by this, we propose +the place recognition anywhere model (PRAM) to perform visual localization as +efficiently as humans do. PRAM consists of two main components - recognition +and registration. In detail, first of all, a self-supervised map-centric +landmark definition strategy is adopted, making places in either indoor or +outdoor scenes act as unique landmarks. Then, sparse keypoints extracted from +images, are utilized as the input to a transformer-based deep neural network +for landmark recognition; these keypoints enable PRAM to recognize hundreds of +landmarks with high time and memory efficiency. Keypoints along with recognized +landmark labels are further used for registration between query images and the +3D landmark map. Different from previous hierarchical methods, PRAM discards +global and local descriptors, and reduces over 90% storage. Since PRAM utilizes +recognition and landmark-wise verification to replace global reference search +and exhaustive matching respectively, it runs 2.4 times faster than prior +state-of-the-art approaches. Moreover, PRAM opens new directions for visual +localization including multi-modality localization, map-centric feature +learning, and hierarchical scene coordinate regression. + +
+
+ comment: project page: https://feixue94.github.io/pram-project/ +
+
+
+
+
+ + ☆ ConsistencyDet: Robust Object Detector with Denoising Paradigm of + Consistency Model + + +
+ Object detection, a quintessential task in the realm of perceptual computing, +can be tackled using a generative methodology. In the present study, we +introduce a novel framework designed to articulate object detection as a +denoising diffusion process, which operates on perturbed bounding boxes of +annotated entities. This framework, termed ConsistencyDet, leverages an +innovative denoising concept known as the Consistency Model. The hallmark of +this model is its self-consistency feature, which empowers the model to map +distorted information from any temporal stage back to its pristine state, +thereby realizing a ``one-step denoising'' mechanism. Such an attribute +markedly elevates the operational efficiency of the model, setting it apart +from the conventional Diffusion Model. Throughout the training phase, +ConsistencyDet initiates the diffusion sequence with noise-infused boxes +derived from the ground-truth annotations and conditions the model to perform +the denoising task. Subsequently, in the inference stage, the model employs a +denoising sampling strategy that commences with bounding boxes randomly sampled +from a normal distribution. Through iterative refinement, the model transforms +an assortment of arbitrarily generated boxes into the definitive detections. +Comprehensive evaluations employing standard benchmarks, such as MS-COCO and +LVIS, corroborate that ConsistencyDet surpasses other leading-edge detectors in +performance metrics. + +
+
+
+
+
+ + ☆ Joint Conditional Diffusion Model for Image Restoration with Mixed + Degradations + + +
+ Image restoration is rather challenging in adverse weather conditions, +especially when multiple degradations occur simultaneously. Blind image +decomposition was proposed to tackle this issue, however, its effectiveness +heavily relies on the accurate estimation of each component. Although +diffusion-based models exhibit strong generative abilities in image restoration +tasks, they may generate irrelevant contents when the degraded images are +severely corrupted. To address these issues, we leverage physical constraints +to guide the whole restoration process, where a mixed degradation model based +on atmosphere scattering model is constructed. Then we formulate our Joint +Conditional Diffusion Model (JCDM) by incorporating the degraded image and +degradation mask to provide precise guidance. To achieve better color and +detail recovery results, we further integrate a refinement network to +reconstruct the restored image, where Uncertainty Estimation Block (UEB) is +employed to enhance the features. Extensive experiments performed on both +multi-weather and weather-specific datasets demonstrate the superiority of our +method over state-of-the-art competing methods. + +
+
+
+
+
+ + ☆ RMAFF-PSN: A Residual Multi-Scale Attention Feature Fusion Photometric + Stereo Network + + +
+ Predicting accurate normal maps of objects from two-dimensional images in +regions of complex structure and spatial material variations is challenging +using photometric stereo methods due to the influence of surface reflection +properties caused by variations in object geometry and surface materials. To +address this issue, we propose a photometric stereo network called a RMAFF-PSN +that uses residual multiscale attentional feature fusion to handle the +``difficult'' regions of the object. Unlike previous approaches that only use +stacked convolutional layers to extract deep features from the input image, our +method integrates feature information from different resolution stages and +scales of the image. This approach preserves more physical information, such as +texture and geometry of the object in complex regions, through shallow-deep +stage feature extraction, double branching enhancement, and attention +optimization. To test the network structure under real-world conditions, we +propose a new real dataset called Simple PS data, which contains multiple +objects with varying structures and materials. Experimental results on a +publicly available benchmark dataset demonstrate that our method outperforms +most existing calibrated photometric stereo methods for the same number of +input images, especially in the case of highly non-convex object structures. +Our method also obtains good results under sparse lighting conditions. + +
+
+ comment: 17 pages,12 figures +
+
+
+
+
+ + ☆ NeuroNCAP: Photorealistic Closed-loop Safety Testing for Autonomous + Driving + + +
+ We present a versatile NeRF-based simulator for testing autonomous driving +(AD) software systems, designed with a focus on sensor-realistic closed-loop +evaluation and the creation of safety-critical scenarios. The simulator learns +from sequences of real-world driving sensor data and enables reconfigurations +and renderings of new, unseen scenarios. In this work, we use our simulator to +test the responses of AD models to safety-critical scenarios inspired by the +European New Car Assessment Programme (Euro NCAP). Our evaluation reveals that, +while state-of-the-art end-to-end planners excel in nominal driving scenarios +in an open-loop setting, they exhibit critical flaws when navigating our +safety-critical scenarios in a closed-loop setting. This highlights the need +for advancements in the safety and real-world usability of end-to-end planners. +By publicly releasing our simulator and scenarios as an easy-to-run evaluation +suite, we invite the research community to explore, refine, and validate their +AD models in controlled, yet highly configurable and challenging +sensor-realistic environments. Code and instructions can be found at +https://github.com/wljungbergh/NeuroNCAP + +
+
+
+
+
+ + ☆ Generating Synthetic Satellite Imagery With Deep-Learning Text-to-Image + Models -- Technical Challenges and Implications for Monitoring and + Verification + + +
+ Novel deep-learning (DL) architectures have reached a level where they can +generate digital media, including photorealistic images, that are difficult to +distinguish from real data. These technologies have already been used to +generate training data for Machine Learning (ML) models, and large +text-to-image models like DALL-E 2, Imagen, and Stable Diffusion are achieving +remarkable results in realistic high-resolution image generation. Given these +developments, issues of data authentication in monitoring and verification +deserve a careful and systematic analysis: How realistic are synthetic images? +How easily can they be generated? How useful are they for ML researchers, and +what is their potential for Open Science? In this work, we use novel DL models +to explore how synthetic satellite images can be created using conditioning +mechanisms. We investigate the challenges of synthetic satellite image +generation and evaluate the results based on authenticity and state-of-the-art +metrics. Furthermore, we investigate how synthetic data can alleviate the lack +of data in the context of ML methods for remote-sensing. Finally we discuss +implications of synthetic satellite imagery in the context of monitoring and +verification. + +
+
+ comment: https://resources.inmm.org/annual-meeting-proceedings/generating-synthetic-satellite-imagery-deep-learning-text-image-models +
+
+
+
+
+ + ☆ 3D-CSAD: Untrained 3D Anomaly Detection for Complex Manufacturing + Surfaces + + +
+ The surface quality inspection of manufacturing parts based on 3D point cloud +data has attracted increasing attention in recent years. The reason is that the +3D point cloud can capture the entire surface of manufacturing parts, unlike +the previous practices that focus on some key product characteristics. However, +achieving accurate 3D anomaly detection is challenging, due to the complex +surfaces of manufacturing parts and the difficulty of collecting sufficient +anomaly samples. To address these challenges, we propose a novel untrained +anomaly detection method based on 3D point cloud data for complex manufacturing +parts, which can achieve accurate anomaly detection in a single sample without +training data. In the proposed framework, we transform an input sample into two +sets of profiles along different directions. Based on one set of the profiles, +a novel segmentation module is devised to segment the complex surface into +multiple basic and simple components. In each component, another set of +profiles, which have the nature of similar shapes, can be modeled as a low-rank +matrix. Thus, accurate 3D anomaly detection can be achieved by using Robust +Principal Component Analysis (RPCA) on these low-rank matrices. Extensive +numerical experiments on different types of parts show that our method achieves +promising results compared with the benchmark methods. + +
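+
+ The core of the method is modeling the stacked profiles of a surface component as a
+low-rank matrix and flagging whatever does not fit as anomalous. The sketch below
+replaces full Robust PCA with a truncated-SVD reconstruction residual, which is a
+deliberate simplification of the RPCA step named in the abstract.
+
+```python
+import numpy as np
+
+def profile_anomaly_scores(profiles: np.ndarray, rank: int = 3) -> np.ndarray:
+    """Anomaly score per profile point via low-rank reconstruction residual.
+
+    profiles: (num_profiles, points_per_profile) matrix of similar-shaped cross sections.
+    A truncated SVD stands in here for the RPCA decomposition used in the paper.
+    """
+    mean = profiles.mean(axis=0, keepdims=True)
+    u, s, vt = np.linalg.svd(profiles - mean, full_matrices=False)
+    low_rank = (u[:, :rank] * s[:rank]) @ vt[:rank] + mean
+    return np.abs(profiles - low_rank)      # large residual -> candidate surface defect
+
+# Toy example: smooth sinusoidal profiles with one simulated dent.
+x = np.linspace(0, 2 * np.pi, 200)
+profiles = np.sin(x)[None, :] + 0.01 * np.random.randn(50, 200)
+profiles[25, 90:100] += 0.5
+scores = profile_anomaly_scores(profiles)
+print(np.unravel_index(scores.argmax(), scores.shape))  # points near row 25, cols 90-99
+```
+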
+
+
+
+
+ + ☆ Exploiting Object-based and Segmentation-based Semantic Features for + Deep Learning-based Indoor Scene Classification + + +
+ Indoor scenes are usually characterized by scattered objects and their
+relationships, which turns the indoor scene classification task into a
+challenging computer vision task. Despite the significant performance boost in
+classification tasks achieved in recent years, provided by the use of
+deep-learning-based methods, limitations such as inter-category ambiguity and
+intra-category variation have been holding back their performance. To overcome
+such issues, gathering semantic information has been shown to be a promising
+source of information towards a more complete and discriminative feature
+representation of indoor scenes. Therefore, the work described in this paper
+uses both semantic information, obtained from object detection, and semantic
+segmentation techniques. While object detection techniques provide the 2D
+location of objects allowing to obtain spatial distributions between objects,
+semantic segmentation techniques provide pixel-level information that allows to
+obtain, at a pixel-level, a spatial distribution and shape-related features of
+the segmentation categories. Hence, a novel approach that uses a semantic
+segmentation mask to provide Hu-moments-based segmentation categories' shape
+characterization, designated by Segmentation-based Hu-Moments Features (SHMFs),
+is proposed. Moreover, a three-main-branch network, designated by
+GOS$^2$F$^2$App, that exploits deep-learning-based global features,
+object-based features, and semantic segmentation-based features is also
+proposed. GOS$^2$F$^2$App was evaluated in two indoor scene benchmark datasets:
+SUN RGB-D and NYU Depth V2, where, to the best of our knowledge,
+state-of-the-art results were achieved on both datasets, which provides
+evidence of the effectiveness of the proposed approach.
+
+
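+
+ Hu-moment shape descriptors per segmentation category can be computed directly with
+OpenCV, as sketched below. This covers only the descriptor-extraction building block;
+how the features are concatenated and fed to the three-branch network is out of scope
+here, and the log scaling is a common convention, not necessarily the paper's.
+
+```python
+import cv2
+import numpy as np
+
+def segmentation_hu_features(seg_mask: np.ndarray, num_classes: int) -> np.ndarray:
+    """Per-category Hu-moment shape descriptors from a semantic segmentation mask (sketch).
+
+    seg_mask: (H, W) integer label map; returns (num_classes, 7) log-scaled Hu moments.
+    """
+    feats = np.zeros((num_classes, 7), dtype=np.float64)
+    for c in range(num_classes):
+        binary = (seg_mask == c).astype(np.uint8)
+        if binary.any():
+            hu = cv2.HuMoments(cv2.moments(binary)).flatten()
+            # Log scaling is the usual trick to compress the moments' dynamic range.
+            feats[c] = -np.sign(hu) * np.log10(np.abs(hu) + 1e-30)
+    return feats
+
+mask = np.zeros((64, 64), dtype=np.int32)
+mask[10:30, 10:40] = 1                     # a rectangular "object" of class 1
+print(segmentation_hu_features(mask, num_classes=3).shape)   # (3, 7)
+```
+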
+
+ comment: This preprint was submitted at IEEE Transactions on Image Processing +
+
+
+
+
+ + ☆ Realistic Continual Learning Approach using Pre-trained Models + + +
+ Continual learning (CL) is crucial for evaluating adaptability in learning +solutions to retain knowledge. Our research addresses the challenge of +catastrophic forgetting, where models lose proficiency in previously learned +tasks as they acquire new ones. While numerous solutions have been proposed, +existing experimental setups often rely on idealized class-incremental learning +scenarios. We introduce Realistic Continual Learning (RealCL), a novel CL +paradigm where class distributions across tasks are random, departing from +structured setups. + We also present CLARE (Continual Learning Approach with pRE-trained models +for RealCL scenarios), a pre-trained model-based solution designed to integrate +new knowledge while preserving past learning. Our contributions include +pioneering RealCL as a generalization of traditional CL setups, proposing CLARE +as an adaptable approach for RealCL tasks, and conducting extensive experiments +demonstrating its effectiveness across various RealCL scenarios. Notably, CLARE +outperforms existing models on RealCL benchmarks, highlighting its versatility +and robustness in unpredictable learning environments. + +
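+
+ The RealCL setup, as described, replaces the usual ordered class-incremental splits
+with random class-to-task assignments. The helper below builds such random splits;
+the exact protocol (task sizes, overlaps) may differ from the paper's, so treat it as
+an illustrative assumption.
+
+```python
+import random
+
+def realcl_task_splits(num_classes: int, num_tasks: int, seed: int = 0):
+    """Randomly assign classes to tasks instead of using a structured incremental order."""
+    rng = random.Random(seed)
+    classes = list(range(num_classes))
+    rng.shuffle(classes)
+    # Random-sized tasks; structured setups would instead use fixed, ordered chunks.
+    cuts = sorted(rng.sample(range(1, num_classes), num_tasks - 1))
+    bounds = [0] + cuts + [num_classes]
+    return [classes[bounds[i]:bounds[i + 1]] for i in range(num_tasks)]
+
+print(realcl_task_splits(num_classes=20, num_tasks=5))
+```
+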
+
+
+
+
+ + ☆ Applying Guidance in a Limited Interval Improves Sample and Distribution + Quality in Diffusion Models + + +
+ Guidance is a crucial technique for extracting the best performance out of +image-generating diffusion models. Traditionally, a constant guidance weight +has been applied throughout the sampling chain of an image. We show that +guidance is clearly harmful toward the beginning of the chain (high noise +levels), largely unnecessary toward the end (low noise levels), and only +beneficial in the middle. We thus restrict it to a specific range of noise +levels, improving both the inference speed and result quality. This limited +guidance interval improves the record FID in ImageNet-512 significantly, from +1.81 to 1.40. We show that it is quantitatively and qualitatively beneficial +across different sampler parameters, network architectures, and datasets, +including the large-scale setting of Stable Diffusion XL. We thus suggest +exposing the guidance interval as a hyperparameter in all diffusion models that +use guidance. + +
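+
+ The proposed change is easy to express inside a sampler: apply classifier-free
+guidance only when the current noise level falls inside an interval, and fall back to
+the plain conditional prediction otherwise. The sketch assumes a sigma-parameterized
+denoiser; the interval bounds and guidance weight below are illustrative, not the
+paper's tuned values.
+
+```python
+import torch
+
+def guided_denoise(denoiser, x, sigma, cond, guidance_weight=2.0,
+                   sigma_lo=0.3, sigma_hi=3.0):
+    """Classifier-free guidance restricted to a noise-level interval (illustrative values)."""
+    d_cond = denoiser(x, sigma, cond)
+    if sigma_lo < sigma < sigma_hi:
+        d_uncond = denoiser(x, sigma, None)
+        # Standard CFG extrapolation, but only inside the interval.
+        return d_uncond + guidance_weight * (d_cond - d_uncond)
+    # Outside the interval: plain conditional prediction (guidance effectively off).
+    return d_cond
+
+# Toy usage with a dummy denoiser.
+dummy = lambda x, s, c: x * 0.9
+print(guided_denoise(dummy, torch.randn(1, 3, 8, 8), sigma=1.0, cond="a cat").shape)
+```
+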
+
+
+
+
+ + ☆ Progressive Semantic-Guided Vision Transformer for Zero-Shot Learning CVPR'24 + + +
+ Zero-shot learning (ZSL) recognizes the unseen classes by conducting +visual-semantic interactions to transfer semantic knowledge from seen classes +to unseen ones, supported by semantic information (e.g., attributes). However, +existing ZSL methods simply extract visual features using a pre-trained network +backbone (i.e., CNN or ViT), which fail to learn matched visual-semantic +correspondences for representing semantic-related visual features as lacking of +the guidance of semantic information, resulting in undesirable visual-semantic +interactions. To tackle this issue, we propose a progressive semantic-guided +vision transformer for zero-shot learning (dubbed ZSLViT). ZSLViT mainly +considers two properties in the whole network: i) discover the semantic-related +visual representations explicitly, and ii) discard the semantic-unrelated +visual information. Specifically, we first introduce semantic-embedded token +learning to improve the visual-semantic correspondences via semantic +enhancement and discover the semantic-related visual tokens explicitly with +semantic-guided token attention. Then, we fuse low semantic-visual +correspondence visual tokens to discard the semantic-unrelated visual +information for visual enhancement. These two operations are integrated into +various encoders to progressively learn semantic-related visual representations +for accurate visual-semantic interactions in ZSL. The extensive experiments +show that our ZSLViT achieves significant performance gains on three popular +benchmark datasets, i.e., CUB, SUN, and AWA2. + +
+
+ comment: Accepted to CVPR'24 +
+
+
+
+
+ + ☆ OpenTrench3D: A Photogrammetric 3D Point Cloud Dataset for Semantic + Segmentation of Underground Utilities + + +
+ Identifying and classifying underground utilities is an important task for +efficient and effective urban planning and infrastructure maintenance. We +present OpenTrench3D, a novel and comprehensive 3D Semantic Segmentation point +cloud dataset, designed to advance research and development in underground +utility surveying and mapping. OpenTrench3D covers a completely novel domain +for public 3D point cloud datasets and is unique in its focus, scope, and +cost-effective capturing method. The dataset consists of 310 point clouds +collected across 7 distinct areas. These include 5 water utility areas and 2 +district heating utility areas. The inclusion of different geographical areas +and main utilities (water and district heating utilities) makes OpenTrench3D +particularly valuable for inter-domain transfer learning experiments. We +provide benchmark results for the dataset using three state-of-the-art semantic +segmentation models, PointNeXt, PointVector and PointMetaBase. Benchmarks are +conducted by training on data from water areas, fine-tuning on district heating +area 1 and evaluating on district heating area 2. The dataset is publicly +available. With OpenTrench3D, we seek to foster innovation and progress in the +field of 3D semantic segmentation in applications related to detection and +documentation of underground utilities as well as in transfer learning methods +in general. + +
+
+
+
+
+ + ☆ ViM-UNet: Vision Mamba for Biomedical Segmentation + + +
+ CNNs, most notably the UNet, are the default architecture for biomedical +segmentation. Transformer-based approaches, such as UNETR, have been proposed +to replace them, benefiting from a global field of view, but suffering from +larger runtimes and higher parameter counts. The recent Vision Mamba +architecture offers a compelling alternative to transformers, also providing a +global field of view, but at higher efficiency. Here, we introduce ViM-UNet, a +novel segmentation architecture based on it and compare it to UNet and UNETR +for two challenging microscopy instance segmentation tasks. We find that it +performs similarly or better than UNet, depending on the task, and outperforms +UNETR while being more efficient. Our code is open source and documented at +https://github.com/constantinpape/torch-em/blob/main/vimunet.md. + +
+
+
+
+
+ + ☆ Point Cloud Geometry Scalable Coding with a Quality-Conditioned Latents + Probability Estimator ICIP 2024 + + +
+ The widespread usage of point clouds (PC) for immersive visual applications +has resulted in the use of very heterogeneous receiving conditions and devices, +notably in terms of network, hardware, and display capabilities. In this +scenario, quality scalability, i.e., the ability to reconstruct a signal at +different qualities by progressively decoding a single bitstream, is a major +requirement that has yet to be conveniently addressed, notably in most +learning-based PC coding solutions. This paper proposes a quality scalability +scheme, named Scalable Quality Hyperprior (SQH), adaptable to learning-based +static point cloud geometry codecs, which uses a Quality-conditioned Latents +Probability Estimator (QuLPE) to decode a high-quality version of a PC +learning-based representation, based on an available lower quality base layer. +SQH is integrated in the future JPEG PC coding standard, allowing to create a +layered bitstream that can be used to progressively decode the PC geometry with +increasing quality and fidelity. Experimental results show that SQH offers the +quality scalability feature with very limited or no compression performance +penalty at all when compared with the corresponding non-scalable solution, thus +preserving the significant compression gains over other state-of-the-art PC +codecs. + +
+
+ comment: Submitted at ICIP 2024 +
+
+
+
+
+ + ☆ Flatness Improves Backbone Generalisation in Few-shot Classification + + +
+ Deployment of deep neural networks in real-world settings typically requires +adaptation to new tasks with few examples. Few-shot classification (FSC) +provides a solution to this problem by leveraging pre-trained backbones for +fast adaptation to new classes. Surprisingly, most efforts have only focused on +developing architectures for easing the adaptation to the target domain without +considering the importance of backbone training for good generalisation. We +show that flatness-aware backbone training with vanilla fine-tuning results in +a simpler yet competitive baseline compared to the state-of-the-art. Our +results indicate that for in- and cross-domain FSC, backbone training is +crucial to achieving good generalisation across different adaptation methods. +We advocate more care should be taken when training these models. + +
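+
+ The abstract does not name a specific flatness-aware optimizer; sharpness-aware
+minimization (SAM) is one common choice, sketched below under that assumption.
+`model`, `loss_fn` and `base_optimizer` are ordinary PyTorch objects supplied by
+the caller; `rho` is an illustrative neighbourhood radius.
+
+import torch
+
+def sam_step(model, loss_fn, inputs, targets, base_optimizer, rho=0.05):
+    """One sharpness-aware minimization (SAM) step: perturb the weights toward
+    the local worst case, take the gradient there, restore the weights, and
+    update with the base optimizer (this favours flat minima)."""
+    loss = loss_fn(model(inputs), targets)
+    loss.backward()
+    grads = [p.grad for p in model.parameters() if p.grad is not None]
+    grad_norm = torch.norm(torch.stack([g.norm() for g in grads]))
+    perturbations = []
+    with torch.no_grad():
+        for p in model.parameters():
+            if p.grad is None:
+                perturbations.append(None)
+                continue
+            e = rho * p.grad / (grad_norm + 1e-12)
+            p.add_(e)                                 # ascend to the nearby worst case
+            perturbations.append(e)
+    model.zero_grad()
+    loss_fn(model(inputs), targets).backward()        # gradient at perturbed weights
+    with torch.no_grad():
+        for p, e in zip(model.parameters(), perturbations):
+            if e is not None:
+                p.sub_(e)                             # restore original weights
+    base_optimizer.step()                             # descend with the SAM gradient
+    base_optimizer.zero_grad()
+    return loss.item()
+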
+
+
+
+
+ + ☆ Chaos in Motion: Unveiling Robustness in Remote Heart Rate Measurement + through Brain-Inspired Skin Tracking + + +
+ Heart rate is an important physiological indicator of human health status.
+Existing remote heart rate measurement methods typically involve facial
+detection followed by signal extraction from the region of interest (ROI).
+These SOTA methods have three serious problems: (a) inaccurate or even failed
+detection caused by environmental influences or subject movement; (b) failure
+for special patients such as infants and burn victims; (c) privacy leakage
+resulting from collecting face video. To address these issues, we regard remote
+heart rate measurement as the process of analyzing the spatiotemporal
+characteristics of the optical flow signal in the video. We apply chaos theory
+to computer vision tasks for the first time and design a brain-inspired
+framework: an artificial primary visual cortex model first extracts the skin
+regions in the videos, and heart rate is then computed by time-frequency
+analysis over all skin pixels. Our method, called HR-RST, achieves Robust Skin
+Tracking for Heart Rate measurement. The experimental results show that HR-RST
+is robust to environmental influences and effectively tracks subject movement.
+Moreover, the method can be extended to other body parts. Consequently, it can
+be applied to special patients and effectively protects individual privacy,
+offering an innovative solution.
+
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ☆ Depth Estimation using Weighted-loss and Transfer Learning + + +
+ Depth estimation from 2D images is a common computer vision task that has
+applications in many fields including autonomous vehicles, scene understanding
+and robotics. The accuracy of a supervised depth estimation method mainly
+relies on the chosen loss function, the model architecture, the quality of the
+data and the performance metrics. In this study, we propose a simplified and
+adaptable approach to improve depth estimation accuracy using transfer learning
+and an optimized loss function. The optimized loss function is a weighted
+combination of losses that enhances robustness and generalization: Mean
+Absolute Error (MAE), Edge Loss and Structural Similarity Index (SSIM). We use
+grid search and random search to find optimized weights for the losses, which
+leads to an improved model. We explore multiple encoder-decoder-based models
+including DenseNet121, DenseNet169, DenseNet201, and EfficientNet for the
+supervised depth estimation model on NYU Depth Dataset v2. We observe that the
+EfficientNet model, pre-trained on ImageNet for classification and used as an
+encoder with a simple upsampling decoder, gives the best results in terms of
+RMSE, REL and log10: 0.386, 0.113 and 0.049, respectively. We also perform a
+qualitative analysis which illustrates that our model produces depth maps that
+closely resemble ground truth, even in cases where the ground truth is flawed.
+The results indicate significant improvements in accuracy and robustness, with
+EfficientNet being the most successful architecture.
+
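+
+ A rough sketch of such a weighted loss (assumed form, not the paper's exact
+implementation; the weights shown are placeholders for values a grid or random
+search would select, and the SSIM here uses a simplified uniform window):
+
+import torch.nn.functional as F
+
+def ssim(x, y, window=7, c1=0.01 ** 2, c2=0.03 ** 2):
+    """Simplified SSIM with a uniform window, sufficient for a loss term."""
+    mu_x = F.avg_pool2d(x, window, 1, window // 2)
+    mu_y = F.avg_pool2d(y, window, 1, window // 2)
+    var_x = F.avg_pool2d(x * x, window, 1, window // 2) - mu_x ** 2
+    var_y = F.avg_pool2d(y * y, window, 1, window // 2) - mu_y ** 2
+    cov = F.avg_pool2d(x * y, window, 1, window // 2) - mu_x * mu_y
+    num = (2 * mu_x * mu_y + c1) * (2 * cov + c2)
+    den = (mu_x ** 2 + mu_y ** 2 + c1) * (var_x + var_y + c2)
+    return (num / den).mean()
+
+def edge_loss(pred, gt):
+    """L1 distance between horizontal and vertical image gradients."""
+    dxp, dyp = pred[..., :, 1:] - pred[..., :, :-1], pred[..., 1:, :] - pred[..., :-1, :]
+    dxg, dyg = gt[..., :, 1:] - gt[..., :, :-1], gt[..., 1:, :] - gt[..., :-1, :]
+    return (dxp - dxg).abs().mean() + (dyp - dyg).abs().mean()
+
+def depth_loss(pred, gt, w_mae=1.0, w_edge=1.0, w_ssim=1.0):
+    """Weighted combination of MAE, edge and (1 - SSIM) terms."""
+    return (w_mae * F.l1_loss(pred, gt)
+            + w_edge * edge_loss(pred, gt)
+            + w_ssim * (1.0 - ssim(pred, gt)))
+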
+
+
+
+
+ + ☆ Run-time Monitoring of 3D Object Detection in Automated Driving Systems + Using Early Layer Neural Activation Patterns CVPR 2024 + + +
+ Monitoring the integrity of object detection for errors within the perception +module of automated driving systems (ADS) is paramount for ensuring safety. +Despite recent advancements in deep neural network (DNN)-based object +detectors, their susceptibility to detection errors, particularly in the +less-explored realm of 3D object detection, remains a significant concern. +State-of-the-art integrity monitoring (also known as introspection) mechanisms +in 2D object detection mainly utilise the activation patterns in the final +layer of the DNN-based detector's backbone. However, that may not sufficiently +address the complexities and sparsity of data in 3D object detection. To this +end, we conduct, in this article, an extensive investigation into the effects +of activation patterns extracted from various layers of the backbone network +for introspecting the operation of 3D object detectors. Through a comparative +analysis using Kitti and NuScenes datasets with PointPillars and CenterPoint +detectors, we demonstrate that using earlier layers' activation patterns +enhances the error detection performance of the integrity monitoring system, +yet increases computational complexity. To address the real-time operation +requirements in ADS, we also introduce a novel introspection method that +combines activation patterns from multiple layers of the detector's backbone +and report its performance. + +
+
+ comment: Accepted by CVPR 2024 Workshop on Safe Autonomy for All Domains + (SAIAD) +
+
+
+
+
+ + ☆ Model-based Cleaning of the QUILT-1M Pathology Dataset for + Text-Conditional Image Synthesis + + +
+ The QUILT-1M dataset is the first openly available dataset containing images
+harvested from various online sources. While it provides a huge data variety,
+the image quality and composition are highly heterogeneous, impacting its
+utility for text-conditional image synthesis. We propose an automatic pipeline
+that provides predictions of the most common impurities within the images,
+e.g., visibility of narrators, desktop environment and pathology software, or
+text within the image. Additionally, we propose to use semantic alignment
+filtering of the image-text pairs. Our findings demonstrate that rigorously
+filtering the dataset leads to a substantial enhancement of image fidelity in
+text-to-image tasks.
+
+
+ comment: 4 pages (short paper) +
+
+
+
+
+ + ☆ Deep learning-driven pulmonary arteries and veins segmentation reveals + demography-associated pulmonary vasculature anatomy + + +
+ Pulmonary artery-vein segmentation is crucial for diagnosing pulmonary
+diseases and surgical planning, and is traditionally achieved by Computed
+Tomography Pulmonary Angiography (CTPA). However, concerns regarding adverse
+health effects from contrast agents used in CTPA have constrained its clinical
+utility. In contrast, identifying arteries and veins using non-contrast CT, a
+conventional and low-cost clinical examination routine, has long been
+considered impossible. Here we propose a High-abundant Pulmonary Artery-vein
+Segmentation (HiPaS) framework achieving accurate artery-vein segmentation on
+both non-contrast CT and CTPA across various spatial resolutions. HiPaS first
+performs spatial normalization on raw CT scans via a super-resolution module,
+and then iteratively achieves segmentation results at different branch levels
+by utilizing the low-level vessel segmentation as a prior for high-level vessel
+segmentation. We trained and validated HiPaS on our established multi-centric
+dataset comprising 1,073 CT volumes with meticulous manual annotation. Both
+quantitative experiments and clinical evaluation demonstrated the superior
+performance of HiPaS, achieving a dice score of 91.8% and a sensitivity of
+98.0%. Further experiments demonstrated the non-inferiority of HiPaS
+segmentation on non-contrast CT compared to segmentation on CTPA. Employing
+HiPaS, we conducted an anatomical study of the pulmonary vasculature on 10,613
+participants in China (five sites), discovering a new association between
+pulmonary vessel abundance and sex and age: vessel abundance is significantly
+higher in females than in males, and slightly decreases with age, after
+controlling for lung volume (p < 0.0001). By realizing accurate artery-vein
+segmentation, HiPaS delineates a promising avenue for non-invasive clinical
+diagnosis and for understanding pulmonary physiology.
+
+
+
+
+
+ + ☆ Shape Completion in the Dark: Completing Vertebrae Morphology from 3D + Ultrasound + + +
+ Purpose: Ultrasound (US) imaging, while advantageous for its radiation-free +nature, is challenging to interpret due to only partially visible organs and a +lack of complete 3D information. While performing US-based diagnosis or +investigation, medical professionals therefore create a mental map of the 3D +anatomy. In this work, we aim to replicate this process and enhance the visual +representation of anatomical structures. + Methods: We introduce a point-cloud-based probabilistic DL method to complete +occluded anatomical structures through 3D shape completion and choose US-based +spine examinations as our application. To enable training, we generate +synthetic 3D representations of partially occluded spinal views by mimicking US +physics and accounting for inherent artifacts. + Results: The proposed model performs consistently on synthetic and patient +data, with mean and median differences of 2.02 and 0.03 in CD, respectively. +Our ablation study demonstrates the importance of US physics-based data +generation, reflected in the large mean and median difference of 11.8 CD and +9.55 CD, respectively. Additionally, we demonstrate that anatomic landmarks, +such as the spinous process (with reconstruction CD of 4.73) and the facet +joints (mean distance to GT of 4.96mm) are preserved in the 3D completion. + Conclusion: Our work establishes the feasibility of 3D shape completion for +lumbar vertebrae, ensuring the preservation of level-wise characteristics and +successful generalization from synthetic to real data. The incorporation of US +physics contributes to more accurate patient data completions. Notably, our +method preserves essential anatomic landmarks and reconstructs crucial +injections sites at their correct locations. The generated data and source code +will be made publicly available +(https://github.com/miruna20/Shape-Completion-in-the-Dark). + +
+
+
+
+
+ + ☆ Dealing with Subject Similarity in Differential Morphing Attack + Detection + + +
+ The advent of morphing attacks has posed significant security concerns for +automated Face Recognition systems, raising the pressing need for robust and +effective Morphing Attack Detection (MAD) methods able to effectively address +this issue. In this paper, we focus on Differential MAD (D-MAD), where a +trusted live capture, usually representing the criminal, is compared with the +document image to classify it as morphed or bona fide. We show these approaches +based on identity features are effective when the morphed image and the live +one are sufficiently diverse; unfortunately, the effectiveness is significantly +reduced when the same approaches are applied to look-alike subjects or in all +those cases when the similarity between the two compared images is high (e.g. +comparison between the morphed image and the accomplice). Therefore, in this +paper, we propose ACIdA, a modular D-MAD system, consisting of a module for the +attempt type classification, and two modules for the identity and artifacts +analysis on input images. Successfully addressing this task would allow +broadening the D-MAD applications including, for instance, the document +enrollment stage, which currently relies entirely on human evaluation, thus +limiting the possibility of releasing ID documents with manipulated images, as +well as the automated gates to detect both accomplices and criminals. An +extensive cross-dataset experimental evaluation conducted on the introduced +scenario shows that ACIdA achieves state-of-the-art results, outperforming +literature competitors, while maintaining good performance in traditional D-MAD +benchmarks. + +
+
+
+
+
+ + ☆ Finding Dino: A plug-and-play framework for unsupervised detection of + out-of-distribution objects using prototypes + + +
+ Detecting and localising unknown or out-of-distribution (OOD) objects in any
+scene can be a challenging task in vision, particularly in safety-critical
+cases involving autonomous systems like automated vehicles or trains.
+Supervised anomaly segmentation or open-world object detection models depend on
+training on exhaustively annotated datasets for every domain and still struggle
+to distinguish between background and OOD objects. In this work, we present a
+plug-and-play generalised framework - PRototype-based zero-shot OOD detection
+Without Labels (PROWL). It is an inference-based method that does not require
+training on the domain dataset and relies on extracting relevant features from
+self-supervised pre-trained models. PROWL can be easily adapted to detect OOD
+objects in any operational design domain by specifying a list of known classes
+from this domain. PROWL, as an unsupervised method, outperforms other
+supervised methods trained without auxiliary OOD data on the RoadAnomaly and
+RoadObstacle datasets provided in the SegmentMeIfYouCan (SMIYC) benchmark. We
+also demonstrate its suitability for other domains such as rail and maritime
+scenes.
+
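+
+ A minimal sketch of the prototype idea (an assumed form, not the authors'
+pipeline): class prototypes are averaged from frozen self-supervised features
+of the known classes, and patches far from every prototype are flagged as OOD.
+Feature extraction and the decision threshold are left to the caller.
+
+import torch
+import torch.nn.functional as F
+
+def build_prototypes(features, labels, num_classes):
+    """Average the frozen-backbone features of each known class into one
+    L2-normalised prototype vector."""
+    protos = torch.stack([features[labels == c].mean(dim=0)
+                          for c in range(num_classes)])
+    return F.normalize(protos, dim=-1)
+
+def ood_scores(patch_features, prototypes):
+    """Score = 1 - max cosine similarity to any known-class prototype;
+    patches scoring above a chosen threshold are flagged as OOD."""
+    sims = F.normalize(patch_features, dim=-1) @ prototypes.T
+    return 1.0 - sims.max(dim=-1).values
+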
+
+
+
+
+ + ☆ Separated Attention: An Improved Cycle GAN Based Under Water Image + Enhancement Method + + +
+ In this paper we present an improved Cycle GAN based model for underwater
+image enhancement. We utilize the cycle-consistent learning technique of the
+state-of-the-art Cycle GAN model with a modified loss function incorporating
+depth-oriented attention, which enhances the contrast of the overall image
+while keeping global content, color, local texture, and style information
+intact. We trained the Cycle GAN model with the modified loss functions on the
+benchmark Enhancing Underwater Visual Perception (EUVP) dataset, a large
+dataset including paired and unpaired sets of underwater images (poor and good
+quality) taken with seven distinct cameras in a range of visibility situations
+during research on ocean exploration and human-robot cooperation. In addition,
+we perform qualitative and quantitative evaluations which support the proposed
+technique and show that it provides better contrast enhancement of underwater
+imagery. More significantly, the enhanced images yield better results than
+those of conventional models for downstream tasks such as underwater
+navigation, pose estimation, saliency prediction, and object detection and
+tracking. The results validate the appropriateness of the model for autonomous
+underwater vehicles (AUVs) in visual navigation.
+
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ☆ Simba: Mamba augmented U-ShiftGCN for Skeletal Action Recognition in + Videos + + +
+ Skeleton Action Recognition (SAR) involves identifying human actions using
+skeletal joint coordinates and their interconnections. While plain Transformers
+have been attempted for this task, they still fall short compared to the
+current leading methods, which are rooted in Graph Convolutional Networks
+(GCNs), due to the absence of structural priors. Recently, a novel selective
+state space model, Mamba, has surfaced as a compelling alternative to the
+attention mechanism in Transformers, offering efficient modeling of long
+sequences. In this work, to the best of our knowledge, we present the first SAR
+framework incorporating Mamba. Each fundamental block of our model adopts a
+novel U-ShiftGCN architecture with Mamba as its core component. The encoder
+segment of the U-ShiftGCN is devised to extract spatial features from the
+skeletal data using downsampling vanilla Shift S-GCN blocks. These spatial
+features then undergo intermediate temporal modeling facilitated by the Mamba
+block before progressing to the decoder section, which comprises vanilla
+upsampling Shift S-GCN blocks. Additionally, a Shift T-GCN (ShiftTCN) temporal
+modeling unit is employed before the exit of each fundamental block to refine
+temporal representations. This particular integration of downsampling spatial,
+intermediate temporal, upsampling spatial, and ultimate temporal subunits
+yields promising results for skeleton action recognition. We dub the resulting
+model \textbf{Simba}, which attains state-of-the-art performance across three
+well-known benchmark skeleton action recognition datasets: NTU RGB+D, NTU RGB+D
+120, and Northwestern-UCLA. Interestingly, U-ShiftGCN (Simba without the
+intermediate Mamba block) is by itself capable of performing reasonably well
+and surpasses our baseline.
+
+
+ comment: 20 pages, 6 tables, 1 figure +
+
+
+
+
+ + ☆ Homography Guided Temporal Fusion for Road Line and Marking Segmentation ICCV 2023 + + +
+ Reliable segmentation of road lines and markings is critical to autonomous +driving. Our work is motivated by the observations that road lines and markings +are (1) frequently occluded in the presence of moving vehicles, shadow, and +glare and (2) highly structured with low intra-class shape variance and overall +high appearance consistency. To solve these issues, we propose a Homography +Guided Fusion (HomoFusion) module to exploit temporally-adjacent video frames +for complementary cues facilitating the correct classification of the partially +occluded road lines or markings. To reduce computational complexity, a novel +surface normal estimator is proposed to establish spatial correspondences +between the sampled frames, allowing the HomoFusion module to perform a +pixel-to-pixel attention mechanism in updating the representation of the +occluded road lines or markings. Experiments on ApolloScape, a large-scale lane +mark segmentation dataset, and ApolloScape Night with artificial simulated +night-time road conditions, demonstrate that our method outperforms other +existing SOTA lane mark segmentation models with less than 9\% of their +parameters and computational complexity. We show that exploiting available +camera intrinsic data and ground plane assumption for cross-frame +correspondence can lead to a light-weight network with significantly improved +performances in speed and accuracy. We also prove the versatility of our +HomoFusion approach by applying it to the problem of water puddle segmentation +and achieving SOTA performance. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Multi-Image Visual Question Answering for Unsupervised Anomaly Detection + + +
+ Unsupervised anomaly detection enables the identification of potential +pathological areas by juxtaposing original images with their pseudo-healthy +reconstructions generated by models trained exclusively on normal images. +However, the clinical interpretation of resultant anomaly maps presents a +challenge due to a lack of detailed, understandable explanations. Recent +advancements in language models have shown the capability of mimicking +human-like understanding and providing detailed descriptions. This raises an +interesting question: \textit{How can language models be employed to make the +anomaly maps more explainable?} To the best of our knowledge, we are the first +to leverage a language model for unsupervised anomaly detection, for which we +construct a dataset with different questions and answers. Additionally, we +present a novel multi-image visual question answering framework tailored for +anomaly detection, incorporating diverse feature fusion strategies to enhance +visual knowledge extraction. Our experiments reveal that the framework, +augmented by our new Knowledge Q-Former module, adeptly answers questions on +the anomaly detection dataset. Besides, integrating anomaly maps as inputs +distinctly aids in improving the detection of unseen pathologies. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ Diffusion Probabilistic Multi-cue Level Set for Reducing Edge + Uncertainty in Pancreas Segmentation + + +
+ Accurately segmenting the pancreas remains a huge challenge. Traditional +methods encounter difficulties in semantic localization due to the small volume +and distorted structure of the pancreas, while deep learning methods encounter +challenges in obtaining accurate edges because of low contrast and organ +overlapping. To overcome these issues, we propose a multi-cue level set method +based on the diffusion probabilistic model, namely Diff-mcs. Our method adopts +a coarse-to-fine segmentation strategy. We use the diffusion probabilistic +model in the coarse segmentation stage, with the obtained probability +distribution serving as both the initial localization and prior cues for the +level set method. In the fine segmentation stage, we combine the prior cues +with grayscale cues and texture cues to refine the edge by maximizing the +difference between probability distributions of the cues inside and outside the +level set curve. The method is validated on three public datasets and achieves +state-of-the-art performance, which can obtain more accurate segmentation +results with lower uncertainty segmentation edges. In addition, we conduct +ablation studies and uncertainty analysis to verify that the diffusion +probability model provides a more appropriate initialization for the level set +method. Furthermore, when combined with multiple cues, the level set method can +better obtain edges and improve the overall accuracy. Our code is available at +https://github.com/GOUYUEE/Diff-mcs. + +
+
+
+
+
+ + ☆ Do You Remember? Dense Video Captioning with Cross-Modal Memory + Retrieval CVPR 2024 + + +
+ There has been significant attention to the research on dense video +captioning, which aims to automatically localize and caption all events within +untrimmed video. Several studies introduce methods by designing dense video +captioning as a multitasking problem of event localization and event captioning +to consider inter-task relations. However, addressing both tasks using only +visual input is challenging due to the lack of semantic content. In this study, +we address this by proposing a novel framework inspired by the cognitive +information processing of humans. Our model utilizes external memory to +incorporate prior knowledge. The memory retrieval method is proposed with +cross-modal video-to-text matching. To effectively incorporate retrieved text +features, the versatile encoder and the decoder with visual and textual +cross-attention modules are designed. Comparative experiments have been +conducted to show the effectiveness of the proposed method on ActivityNet +Captions and YouCook2 datasets. Experimental results show promising performance +of our model without extensive pretraining from a large video dataset. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Automatic Detection of Dark Ship-to-Ship Transfers using Deep Learning + and Satellite Imagery + + +
+ Despite extensive research into ship detection via remote sensing, no studies +identify ship-to-ship transfers in satellite imagery. Given the importance of +transshipment in illicit shipping practices, this is a significant gap. In what +follows, I train a convolutional neural network to accurately detect 4 +different types of cargo vessel and two different types of Ship-to-Ship +transfer in PlanetScope satellite imagery. I then elaborate a pipeline for the +automatic detection of suspected illicit ship-to-ship transfers by +cross-referencing satellite detections with vessel borne GPS data. Finally, I +apply this method to the Kerch Strait between Ukraine and Russia to identify +over 400 dark transshipment events since 2022. + +
+
+
+
+
+ + ☆ Contrastive-Based Deep Embeddings for Label Noise-Resilient + Histopathology Image Classification + + +
+ Recent advancements in deep learning have proven highly effective in medical +image classification, notably within histopathology. However, noisy labels +represent a critical challenge in histopathology image classification, where +accurate annotations are vital for training robust deep learning models. +Indeed, deep neural networks can easily overfit label noise, leading to severe +degradations in model performance. While numerous public pathology foundation +models have emerged recently, none have evaluated their resilience to label +noise. Through thorough empirical analyses across multiple datasets, we exhibit +the label noise resilience property of embeddings extracted from foundation +models trained in a self-supervised contrastive manner. We demonstrate that +training with such embeddings substantially enhances label noise robustness +when compared to non-contrastive-based ones as well as commonly used +noise-resilient methods. Our results unequivocally underline the superiority of +contrastive learning in effectively mitigating the label noise challenge. Code +is publicly available at +https://github.com/LucasDedieu/NoiseResilientHistopathology. + +
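+
+ The training recipe being compared can be sketched roughly as a small head
+fitted on frozen, pre-extracted contrastive embeddings (assumed setup,
+full-batch for brevity); only this head ever sees the possibly noisy labels.
+
+import torch
+import torch.nn as nn
+
+def train_linear_probe(embeddings, labels, num_classes, epochs=100, lr=1e-3):
+    """Fit a linear classifier on frozen contrastive embeddings; the
+    backbone itself is never updated and so cannot memorise label noise."""
+    probe = nn.Linear(embeddings.shape[1], num_classes)
+    optimizer = torch.optim.AdamW(probe.parameters(), lr=lr)
+    criterion = nn.CrossEntropyLoss()
+    for _ in range(epochs):
+        optimizer.zero_grad()
+        loss = criterion(probe(embeddings), labels)
+        loss.backward()
+        optimizer.step()
+    return probe
+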
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ GLID: Pre-training a Generalist Encoder-Decoder Vision Model CVPR 2024 + + +
+ This paper proposes a GeneraLIst encoder-Decoder (GLID) pre-training method
+for better handling various downstream computer vision tasks. While
+self-supervised pre-training approaches, e.g., Masked Autoencoder, have shown
+success in transfer learning, task-specific sub-architectures still need to be
+appended for different downstream tasks, and these cannot enjoy the benefits of
+large-scale pre-training. GLID overcomes this challenge by allowing the
+pre-trained generalist encoder-decoder to be fine-tuned on various vision tasks
+with minimal task-specific architecture modifications. In the GLID training
+scheme, the pre-training pretext task and all downstream tasks are modeled as
+"query-to-answer" problems. We pre-train a task-agnostic encoder-decoder with
+query-mask pairs. During fine-tuning, GLID maintains the pre-trained
+encoder-decoder and queries, only replacing the topmost linear transformation
+layer with task-specific linear heads. This minimizes the pretrain-finetune
+architecture inconsistency and enables the pre-trained model to better adapt to
+downstream tasks. GLID achieves competitive performance on various vision
+tasks, including object detection, image segmentation, pose estimation, and
+depth estimation, outperforming or matching specialist models such as
+Mask2Former, DETR, ViTPose, and BinsFormer.
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Attention based End to end network for Offline Writer Identification on + Word level data + + +
+ Writer identification due to its widespread application in various fields has +gained popularity over the years. In scenarios where optimum handwriting +samples are available, whether they be in the form of a single line, a +sentence, or an entire page, writer identification algorithms have demonstrated +noteworthy levels of accuracy. However, in scenarios where only a limited +number of handwritten samples are available, particularly in the form of word +images, there is a significant scope for improvement. + In this paper, we propose a writer identification system based on an +attention-driven Convolutional Neural Network (CNN). The system is trained +utilizing image segments, known as fragments, extracted from word images, +employing a pyramid-based strategy. This methodology enables the system to +capture a comprehensive representation of the data, encompassing both +fine-grained details and coarse features across various levels of abstraction. +These extracted fragments serve as the training data for the convolutional +network, enabling it to learn a more robust representation compared to +traditional convolution-based networks trained on word images. Additionally, +the paper explores the integration of an attention mechanism to enhance the +representational power of the learned features. The efficacy of the proposed +algorithm is evaluated on three benchmark databases, demonstrating its +proficiency in writer identification tasks, particularly in scenarios with +limited access to handwriting data. + +
+
+
+
+
+ + ☆ Implicit and Explicit Language Guidance for Diffusion-based Visual + Perception + + +
+ Text-to-image diffusion models have shown powerful ability on conditional
+image synthesis. With large-scale vision-language pre-training, diffusion
+models are able to generate high-quality images with rich texture and
+reasonable structure under different text prompts. However, it is an open
+problem to adapt the pre-trained diffusion model for visual perception. In this
+paper, we propose an implicit and explicit language guidance framework for
+diffusion-based perception, named IEDP. Our IEDP comprises an implicit language
+guidance branch and an explicit language guidance branch. The implicit branch
+employs a frozen CLIP image encoder to directly generate implicit text
+embeddings that are fed to the diffusion model, without using explicit text
+prompts. The explicit branch utilizes the ground-truth labels of corresponding
+images as text prompts to condition feature extraction of the diffusion model.
+During training, we jointly train the diffusion model by sharing the model
+weights of these two branches. As a result, the implicit and explicit branches
+can jointly guide feature learning. During inference, we only employ the
+implicit branch for final prediction, which does not require any ground-truth
+labels. Experiments are performed on two typical perception tasks, including
+semantic segmentation and depth estimation. Our IEDP achieves promising
+performance on both tasks. For semantic segmentation, our IEDP achieves an mIoU
+score of 55.9% on the ADE20K validation set, which outperforms the baseline
+method VPD by 2.2%. For depth estimation, our IEDP outperforms the baseline
+method VPD with a relative gain of 10.2%.
+
+
+
+
+
+ + ☆ Weakly-Supervised Learning via Multi-Lateral Decoder Branching for + Guidewire Segmentation in Robot-Assisted Cardiovascular Catheterization + + +
+ Although robot-assisted cardiovascular catheterization is commonly performed
+for intervention of cardiovascular diseases, more studies are needed to support
+the procedure with automated tool segmentation. This can aid surgeons in tool
+tracking and visualization during intervention. Learning-based segmentation has
+recently offered state-of-the-art segmentation performance; however, generating
+ground-truth signals for fully-supervised methods is labor-intensive and
+time-consuming for the interventionists. In this study, a weakly-supervised
+learning method with multi-lateral pseudo labeling is proposed for tool
+segmentation in cardiac angiograms. The method includes a modified U-Net model
+with one encoder and multiple lateral-branched decoders that produce pseudo
+labels as supervision signals under different perturbations. The pseudo labels
+are self-generated through a mixed loss function and shared consistency in the
+decoders. We trained the model end-to-end with weakly-annotated data obtained
+during robotic cardiac catheterization. Experiments with the proposed model
+show that weakly annotated data achieves performance close to that of fully
+annotated data. Compared to three existing weakly-supervised methods, our
+approach yielded higher segmentation performance across three different cardiac
+angiogram datasets. In an ablation study, we showed consistent performance
+under different parameters. Thus, we offer a less expensive method for
+real-time tool segmentation and tracking during robot-assisted cardiac
+catheterization.
+
+
+
+
+
+ + ☆ Multi-rater Prompting for Ambiguous Medical Image Segmentation + + +
+ Multi-rater annotations commonly occur when medical images are independently
+annotated by multiple experts (raters). In this paper, we tackle two challenges
+that arise in multi-rater annotations for medical image segmentation (called
+ambiguous medical image segmentation): (1) how to train a deep learning model
+when a group of raters produces a set of diverse but plausible annotations, and
+(2) how to fine-tune the model efficiently when computation resources are not
+available for re-training the entire model on a different dataset domain. We
+propose a multi-rater prompt-based approach to address these two challenges
+together. Specifically, we introduce a series of rater-aware prompts that can
+be plugged into the U-Net model for uncertainty estimation to handle
+multi-annotation cases. During the prompt-based fine-tuning process, only 0.3%
+of the learnable parameters need to be updated compared to training the entire
+model. Further, in order to integrate expert consensus and disagreement, we
+explore different multi-rater incorporation strategies and design a
+mix-training strategy for comprehensive insight learning. Extensive experiments
+verify the effectiveness of our new approach for ambiguous medical image
+segmentation on two public datasets while alleviating the heavy burden of model
+re-training.
+
+
+
+
+
+ + ☆ ObjBlur: A Curriculum Learning Approach With Progressive Object-Level + Blurring for Improved Layout-to-Image Generation + + +
+ We present ObjBlur, a novel curriculum learning approach to improve +layout-to-image generation models, where the task is to produce realistic +images from layouts composed of boxes and labels. Our method is based on +progressive object-level blurring, which effectively stabilizes training and +enhances the quality of generated images. This curriculum learning strategy +systematically applies varying degrees of blurring to individual objects or the +background during training, starting from strong blurring to progressively +cleaner images. Our findings reveal that this approach yields significant +performance improvements, stabilized training, smoother convergence, and +reduced variance between multiple runs. Moreover, our technique demonstrates +its versatility by being compatible with generative adversarial networks and +diffusion models, underlining its applicability across various generative +modeling paradigms. With ObjBlur, we reach new state-of-the-art results on the +complex COCO and Visual Genome datasets. + +
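+
+ A rough sketch of a progressive object-level blurring schedule (an assumed
+form, not the authors' implementation): a linear decay of the blur strength is
+used here, and `masks` is assumed to be a list of boolean HxW object masks.
+
+from torchvision.transforms.functional import gaussian_blur
+
+def blur_objects(image, masks, step, total_steps, sigma_max=8.0):
+    """Blur object regions of a CxHxW image with a strength that decays
+    linearly from sigma_max at the start of training to (almost) zero."""
+    sigma = max(sigma_max * (1.0 - step / total_steps), 1e-3)
+    kernel = int(2 * round(3 * sigma) + 1)            # odd kernel covering ~3 sigma
+    blurred = gaussian_blur(image, [kernel, kernel], [sigma, sigma])
+    out = image.clone()
+    for mask in masks:                                # boolean HxW mask per object
+        out[:, mask] = blurred[:, mask]
+    return out
+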
+
+
+
+
+ + ☆ Attention-Aware Laparoscopic Image Desmoking Network with Lightness + Embedding and Hybrid Guided Embedding + + +
+ This paper presents a novel method for smoke removal from laparoscopic
+images. Due to the heterogeneous nature of surgical smoke, a two-stage network
+is proposed to estimate the smoke distribution and reconstruct a clear,
+smoke-free surgical scene. The utilization of the lightness channel plays a
+pivotal role in providing vital information pertaining to smoke density. The
+reconstruction of the smoke-free image is guided by a hybrid embedding, which
+combines the estimated smoke mask with the initial image. Experimental results
+demonstrate that the proposed method achieves a Peak Signal to Noise Ratio that
+is $2.79\%$ higher than that of state-of-the-art methods, while also exhibiting
+a remarkable $38.2\%$ reduction in run-time. Overall, the proposed method
+offers comparable or even superior performance in terms of both smoke removal
+quality and computational efficiency when compared to existing
+state-of-the-art methods. This work will be publicly available at
+http://homepage.hit.edu.cn/wpgao
+
+
+ comment: ISBI2024 +
+
+
+
+
+ + ☆ CAT: Contrastive Adapter Training for Personalized Image Generation CVPR + + +
+ The emergence of various adapters, including Low-Rank Adaptation (LoRA)
+applied from the field of natural language processing, has allowed diffusion
+models to personalize image generation at a low cost. However, due to various
+challenges including limited datasets and a shortage of regularization and
+computation resources, adapter training often results in unsatisfactory
+outcomes, leading to the corruption of the backbone model's prior knowledge. A
+well-known phenomenon is the loss of diversity in object generation, especially
+within the same class, which leads to generating almost identical objects with
+only minor variations and limits generation capabilities. To solve this issue,
+we present Contrastive Adapter Training (CAT), a simple yet effective strategy
+to enhance adapter training through the application of a CAT loss. Our approach
+facilitates the preservation of the base model's original knowledge when
+adapters are introduced. Furthermore, we introduce the Knowledge Preservation
+Score (KPS) to evaluate CAT's ability to retain prior information. We
+qualitatively and quantitatively demonstrate CAT's improvements. Finally, we
+discuss possible extensions of CAT to multi-concept adapters and further
+optimization.
+
+
+ comment: CVPRW 2024 +
+
+
+
+
+ + ☆ SFSORT: Scene Features-based Simple Online Real-Time Tracker + + +
+ This paper introduces SFSORT, the world's fastest multi-object tracking +system based on experiments conducted on MOT Challenge datasets. To achieve an +accurate and computationally efficient tracker, this paper employs a +tracking-by-detection method, following the online real-time tracking approach +established in prior literature. By introducing a novel cost function called +the Bounding Box Similarity Index, this work eliminates the Kalman Filter, +leading to reduced computational requirements. Additionally, this paper +demonstrates the impact of scene features on enhancing object-track association +and improving track post-processing. Using a 2.2 GHz Intel Xeon CPU, the +proposed method achieves an HOTA of 61.7\% with a processing speed of 2242 Hz +on the MOT17 dataset and an HOTA of 60.9\% with a processing speed of 304 Hz on +the MOT20 dataset. The tracker's source code, fine-tuned object detection +model, and tutorials are available at +\url{https://github.com/gitmehrdad/SFSORT}. + +
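+
+ The tracking-by-detection association step can be sketched as below. This is
+a simplified, assumed form: plain IoU stands in for the paper's Bounding Box
+Similarity Index, and a greedy matcher replaces the full SFSORT pipeline.
+
+import numpy as np
+
+def iou_matrix(tracks, dets):
+    """Pairwise IoU between track and detection boxes given as (x1, y1, x2, y2)."""
+    t = np.asarray(tracks, dtype=float)[:, None, :]   # (T, 1, 4)
+    d = np.asarray(dets, dtype=float)[None, :, :]     # (1, D, 4)
+    x1 = np.maximum(t[..., 0], d[..., 0])
+    y1 = np.maximum(t[..., 1], d[..., 1])
+    x2 = np.minimum(t[..., 2], d[..., 2])
+    y2 = np.minimum(t[..., 3], d[..., 3])
+    inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
+    area_t = (t[..., 2] - t[..., 0]) * (t[..., 3] - t[..., 1])
+    area_d = (d[..., 2] - d[..., 0]) * (d[..., 3] - d[..., 1])
+    return inter / (area_t + area_d - inter + 1e-9)
+
+def associate(tracks, dets, min_sim=0.3):
+    """Greedy association: repeatedly match the most similar (track, detection)
+    pair until no remaining pair exceeds the similarity threshold."""
+    if len(tracks) == 0 or len(dets) == 0:
+        return []
+    sim = iou_matrix(tracks, dets)
+    matches = []
+    while sim.max() >= min_sim:
+        ti, di = np.unravel_index(sim.argmax(), sim.shape)
+        matches.append((int(ti), int(di)))
+        sim[ti, :] = -1.0                             # remove matched track
+        sim[:, di] = -1.0                             # remove matched detection
+    return matches
+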
+
+
+
+
+ + ☆ Event-Enhanced Snapshot Compressive Videography at 10K FPS + + +
+ Video snapshot compressive imaging (SCI) encodes the target dynamic scene +compactly into a snapshot and reconstructs its high-speed frame sequence +afterward, greatly reducing the required data footprint and transmission +bandwidth as well as enabling high-speed imaging with a low frame rate +intensity camera. In implementation, high-speed dynamics are encoded via +temporally varying patterns, and only frames at corresponding temporal +intervals can be reconstructed, while the dynamics occurring between +consecutive frames are lost. To unlock the potential of conventional snapshot +compressive videography, we propose a novel hybrid "intensity+event" imaging +scheme by incorporating an event camera into a video SCI setup. Our proposed +system consists of a dual-path optical setup to record the coded intensity +measurement and intermediate event signals simultaneously, which is compact and +photon-efficient by collecting the half photons discarded in conventional video +SCI. Correspondingly, we developed a dual-branch Transformer utilizing the +reciprocal relationship between two data modes to decode dense video frames. +Extensive experiments on both simulated and real-captured data demonstrate our +superiority to state-of-the-art video SCI and video frame interpolation (VFI) +methods. Benefiting from the new hybrid design leveraging both intrinsic +redundancy in videos and the unique feature of event cameras, we achieve +high-quality videography at 0.1ms time intervals with a low-cost CMOS image +sensor working at 24 FPS. + +
+
+
+
+
+ + ☆ Stereo-LiDAR Depth Estimation with Deformable Propagation and Learned + Disparity-Depth Conversion ICRA 2024 + + +
+ Accurate and dense depth estimation with stereo cameras and LiDAR is an +important task for automatic driving and robotic perception. While sparse hints +from LiDAR points have improved cost aggregation in stereo matching, their +effectiveness is limited by the low density and non-uniform distribution. To +address this issue, we propose a novel stereo-LiDAR depth estimation network +with Semi-Dense hint Guidance, named SDG-Depth. Our network includes a +deformable propagation module for generating a semi-dense hint map and a +confidence map by propagating sparse hints using a learned deformable window. +These maps then guide cost aggregation in stereo matching. To reduce the +triangulation error in depth recovery from disparity, especially in distant +regions, we introduce a disparity-depth conversion module. Our method is both +accurate and efficient. The experimental results on benchmark tests show its +superior performance. Our code is available at +https://github.com/SJTU-ViSYS/SDG-Depth. + +
+
+ comment: Accepted in ICRA 2024. 8 pages, 6 figures +
+
+
+
+
+ + ☆ Content-Adaptive Non-Local Convolution for Remote Sensing Pansharpening CVPR 2024 + + +
+ Currently, machine learning-based methods for remote sensing pansharpening +have progressed rapidly. However, existing pansharpening methods often do not +fully exploit differentiating regional information in non-local spaces, thereby +limiting the effectiveness of the methods and resulting in redundant learning +parameters. In this paper, we introduce a so-called content-adaptive non-local +convolution (CANConv), a novel method tailored for remote sensing image +pansharpening. Specifically, CANConv employs adaptive convolution, ensuring +spatial adaptability, and incorporates non-local self-similarity through the +similarity relationship partition (SRP) and the partition-wise adaptive +convolution (PWAC) sub-modules. Furthermore, we also propose a corresponding +network architecture, called CANNet, which mainly utilizes the multi-scale +self-similarity. Extensive experiments demonstrate the superior performance of +CANConv, compared with recent promising fusion methods. Besides, we +substantiate the method's effectiveness through visualization, ablation +experiments, and comparison with existing methods on multiple test sets. The +source code is publicly available at https://github.com/duanyll/CANConv. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ How is Visual Attention Influenced by Text Guidance? Database and Model + + +
+ The analysis and prediction of visual attention have long been crucial tasks +in the fields of computer vision and image processing. In practical +applications, images are generally accompanied by various text descriptions, +however, few studies have explored the influence of text descriptions on visual +attention, let alone developed visual saliency prediction models considering +text guidance. In this paper, we conduct a comprehensive study on text-guided +image saliency (TIS) from both subjective and objective perspectives. +Specifically, we construct a TIS database named SJTU-TIS, which includes 1200 +text-image pairs and the corresponding collected eye-tracking data. Based on +the established SJTU-TIS database, we analyze the influence of various text +descriptions on visual attention. Then, to facilitate the development of +saliency prediction models considering text influence, we construct a benchmark +for the established SJTU-TIS database using state-of-the-art saliency models. +Finally, considering the effect of text descriptions on visual attention, while +most existing saliency models ignore this impact, we further propose a +text-guided saliency (TGSal) prediction model, which extracts and integrates +both image features and text features to predict the image saliency under +various text-description conditions. Our proposed model significantly +outperforms the state-of-the-art saliency models on both the SJTU-TIS database +and the pure image saliency databases in terms of various evaluation metrics. +The SJTU-TIS database and the code of the proposed TGSal model will be released +at: https://github.com/IntMeGroup/TGSal. + +
+
+
+
+
+ + ☆ PromptSync: Bridging Domain Gaps in Vision-Language Models through + Class-Aware Prototype Alignment and Discrimination CVPR 2024 + + +
+ The potential for zero-shot generalization in vision-language (V-L) models +such as CLIP has spurred their widespread adoption in addressing numerous +downstream tasks. Previous methods have employed test-time prompt tuning to +adapt the model to unseen domains, but they overlooked the issue of imbalanced +class distributions. In this study, we explicitly address this problem by +employing class-aware prototype alignment weighted by mean class probabilities +obtained for the test sample and filtered augmented views. Additionally, we +ensure that the class probabilities are as accurate as possible by performing +prototype discrimination using contrastive learning. The combination of +alignment and discriminative loss serves as a geometric regularizer, preventing +the prompt representation from collapsing onto a single class and effectively +bridging the distribution gap between the source and test domains. Our method, +named PromptSync, synchronizes the prompts for each test sample on both the +text and vision branches of the V-L model. In empirical evaluations on the +domain generalization benchmark, our method outperforms previous best methods +by 2.33\% in overall performance, by 1\% in base-to-novel generalization, and +by 2.84\% in cross-dataset transfer tasks. + +
+
+ comment: Accepted at CVPR 2024 LIMIT, 12 pages, 8 Tables, 2 Figures +
+
+
+
+
+ + ☆ Remembering Transformer for Continual Learning + + +
+ Neural networks encounter the challenge of Catastrophic Forgetting (CF) in +continual learning, where new task knowledge interferes with previously learned +knowledge. We propose Remembering Transformer, inspired by the brain's +Complementary Learning Systems (CLS), to tackle this issue. Remembering +Transformer employs a mixture-of-adapters and a generative model-based routing +mechanism to alleviate CF by dynamically routing task data to relevant +adapters. Our approach demonstrated a new SOTA performance in various vision +continual learning tasks and great parameter efficiency. + +
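+
+ A minimal sketch of generative routing over a mixture of adapters, as an
+illustrative reconstruction of the idea rather than the authors' architecture;
+the module shapes, hidden size, and per-task autoencoders are assumptions.
+
+import torch
+import torch.nn as nn
+
+class AdapterRouter(nn.Module):
+    """Route each sample to the adapter of the task whose autoencoder
+    reconstructs it best (lowest error), a simple generative routing rule."""
+    def __init__(self, dim, num_tasks, hidden=64):
+        super().__init__()
+        def mlp():
+            return nn.Sequential(nn.Linear(dim, hidden), nn.ReLU(),
+                                 nn.Linear(hidden, dim))
+        self.autoencoders = nn.ModuleList([mlp() for _ in range(num_tasks)])
+        self.adapters = nn.ModuleList([mlp() for _ in range(num_tasks)])
+
+    def forward(self, x):                             # x: (batch, dim) features
+        errors = torch.stack([(ae(x) - x).pow(2).mean(dim=-1)
+                              for ae in self.autoencoders], dim=-1)
+        choice = errors.argmin(dim=-1)                # best-reconstructing task id
+        out = x.clone()
+        for t, adapter in enumerate(self.adapters):
+            picked = choice == t
+            if picked.any():
+                out[picked] = x[picked] + adapter(x[picked])   # residual adapter
+        return out
+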
+
+
+
+
+ + ☆ Generalization Gap in Data Augmentation: Insights from Illumination + + +
+ In the field of computer vision, data augmentation is widely used to enrich +the feature complexity of training datasets with deep learning techniques. +However, regarding the generalization capabilities of models, the difference in +artificial features generated by data augmentation and natural visual features +has not been fully revealed. This study focuses on the visual representation +variable 'illumination', by simulating its distribution degradation and +examining how data augmentation techniques enhance model performance on a +classification task. Our goal is to investigate the differences in +generalization between models trained with augmented data and those trained +under real-world illumination conditions. Results indicate that after +undergoing various data augmentation methods, model performance has been +significantly improved. Yet, a noticeable generalization gap still exists after +utilizing various data augmentation methods, emphasizing the critical role of +feature diversity in the training set for enhancing model generalization. + +
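+
+ For concreteness, one way to simulate an illumination-style augmentation (a
+generic sketch, not necessarily the augmentations studied in the paper):
+
+import torch
+
+def random_illumination(img, brightness=(0.3, 1.7), gamma=(0.5, 2.0)):
+    """Randomly rescale brightness and apply a gamma curve to a float image
+    in [0, 1], artificially widening the illumination distribution."""
+    b = float(torch.empty(1).uniform_(*brightness))
+    g = float(torch.empty(1).uniform_(*gamma))
+    return (img.clamp(0, 1).pow(g) * b).clamp(0, 1)
+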
+
+
+
+
+ + ☆ Learning to Classify New Foods Incrementally Via Compressed Exemplars + + +
+ Food image classification systems play a crucial role in health monitoring +and diet tracking through image-based dietary assessment techniques. However, +existing food recognition systems rely on static datasets characterized by a +pre-defined fixed number of food classes. This contrasts drastically with the +reality of food consumption, which features constantly changing data. +Therefore, food image classification systems should adapt to and manage data +that continuously evolves. This is where continual learning plays an important +role. A challenge in continual learning is catastrophic forgetting, where ML +models tend to discard old knowledge upon learning new information. While +memory-replay algorithms have shown promise in mitigating this problem by +storing old data as exemplars, they are hampered by the limited capacity of +memory buffers, leading to an imbalance between new and previously learned +data. To address this, our work explores the use of neural image compression to +extend buffer size and enhance data diversity. We introduced the concept of +continuously learning a neural compression model to adaptively improve the +quality of compressed data and optimize the bitrates per pixel (bpp) to store +more exemplars. Our extensive experiments, including evaluations on +food-specific datasets including Food-101 and VFN-74, as well as the general +dataset ImageNet-100, demonstrate improvements in classification accuracy. This +progress is pivotal in advancing more realistic food recognition systems that +are capable of adapting to continually evolving data. Moreover, the principles +and methodologies we've developed hold promise for broader applications, +extending their benefits to other domains of continual machine learning +systems. + +
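+
+ The storage trade-off motivating compressed exemplars can be worked through
+with a small helper; the buffer size, resolution and bitrates below are
+illustrative numbers, not values from the paper.
+
+def exemplar_capacity(buffer_bytes, height, width, bpp):
+    """Number of exemplars that fit in a fixed replay buffer when each
+    height x width image is stored at `bpp` bits per pixel."""
+    bits_per_image = bpp * height * width
+    return int((buffer_bytes * 8) // bits_per_image)
+
+# Example: a 64 MiB buffer with 224x224 exemplars.
+# Raw 8-bit RGB is 24 bpp -> exemplar_capacity(64 * 2**20, 224, 224, 24)  == 445
+# Compressed at 0.5 bpp   -> exemplar_capacity(64 * 2**20, 224, 224, 0.5) == 21399
+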
+
+
+
+
+ + ☆ Mitigating Object Dependencies: Improving Point Cloud Self-Supervised + Learning through Object Exchange + + +
+ In the realm of point cloud scene understanding, particularly in indoor +scenes, objects are arranged following human habits, resulting in objects of +certain semantics being closely positioned and displaying notable inter-object +correlations. This can create a tendency for neural networks to exploit these +strong dependencies, bypassing the individual object patterns. To address this +challenge, we introduce a novel self-supervised learning (SSL) strategy. Our +approach leverages both object patterns and contextual cues to produce robust +features. It begins with the formulation of an object-exchanging strategy, +where pairs of objects with comparable sizes are exchanged across different +scenes, effectively disentangling the strong contextual dependencies. +Subsequently, we introduce a context-aware feature learning strategy, which +encodes object patterns without relying on their specific context by +aggregating object features across various scenes. Our extensive experiments +demonstrate the superiority of our method over existing SSL techniques, further +showing its better robustness to environmental changes. Moreover, we showcase +the applicability of our approach by transferring pre-trained models to diverse +point cloud datasets. + +
+
+
+
+
+ + ☆ PillarTrack: Redesigning Pillar-based Transformer Network for Single + Object Tracking on Point Clouds + + +
+ LiDAR-based 3D single object tracking (3D SOT) is a critical issue in
+robotics and autonomous driving. It aims to obtain an accurate 3D BBox from the
+search area based on similarity or motion. However, existing 3D SOT methods
+usually follow the point-based pipeline, where the sampling operation
+inevitably leads to redundant or lost information, resulting in suboptimal
+performance. To address these issues, we propose PillarTrack, a pillar-based 3D
+single object tracking framework. Firstly, we transform sparse point clouds
+into dense pillars to preserve the local and global geometry. Secondly, we
+introduce a Pyramid-type Encoding Pillar Feature Encoder (PE-PFE) design to
+help the feature representation of each pillar. Thirdly, we present an
+efficient Transformer-based backbone from the perspective of modality
+differences. Finally, we construct our PillarTrack tracker based on the above
+designs. Extensive experiments on the KITTI and nuScenes datasets demonstrate
+the superiority of our proposed method. Notably, our method achieves
+state-of-the-art performance on the KITTI and nuScenes datasets and enables
+real-time tracking speed. We hope our work encourages the community to rethink
+existing 3D SOT tracker designs. We will open source our code to the research
+community at https://github.com/StiphyJay/PillarTrack.
+
+
+
+
+
+ + ☆ Fine-Grained Side Information Guided Dual-Prompts for Zero-Shot Skeleton + Action Recognition + + +
+ Skeleton-based zero-shot action recognition aims to recognize unknown human +actions based on the learned priors of the known skeleton-based actions and a +semantic descriptor space shared by both known and unknown categories. However, +previous works focus on establishing the bridges between the known skeleton +representation space and semantic descriptions space at the coarse-grained +level for recognizing unknown action categories, ignoring the fine-grained +alignment of these two spaces, resulting in suboptimal performance in +distinguishing high-similarity action categories. To address these challenges, +we propose a novel method via Side information and dual-prompts learning for +skeleton-based zero-shot action recognition (STAR) at the fine-grained level. +Specifically, 1) we decompose the skeleton into several parts based on its +topology structure and introduce the side information concerning multi-part +descriptions of human body movements for alignment between the skeleton and the +semantic space at the fine-grained level; 2) we design the visual-attribute and +semantic-part prompts to improve the intra-class compactness within the +skeleton space and inter-class separability within the semantic space, +respectively, to distinguish the high-similarity actions. Extensive experiments +show that our method achieves state-of-the-art performance in ZSL and GZSL +settings on NTU RGB+D, NTU RGB+D 120, and PKU-MMD datasets. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ G-NeRF: Geometry-enhanced Novel View Synthesis from Single-View Images CVPR 2024 + + +
+ Novel view synthesis aims to generate new-view images from a given view image +collection. Recent attempts address this problem by relying on 3D geometry priors +(e.g., shapes, sizes, and positions) learned from multi-view images. However, +such methods encounter the following limitations: 1) they require a set of +multi-view images as training data for a specific scene (e.g., face, car, or +chair), which is often unavailable in many real-world scenarios; 2) they fail +to extract geometry priors from single-view images due to the lack of +multi-view supervision. In this paper, we propose a Geometry-enhanced NeRF +(G-NeRF), which seeks to enhance the geometry priors by a geometry-guided +multi-view synthesis approach, followed by depth-aware training. In the +synthesis process, inspired by the fact that existing 3D GAN models can unconditionally +synthesize high-fidelity multi-view images, we adopt off-the-shelf 3D +GAN models, such as EG3D, as a free source of geometry priors obtained by +synthesizing multi-view data. Simultaneously, to further improve the geometry +quality of the synthetic data, we introduce a truncation method to effectively +sample latent codes within 3D GAN models. To tackle the absence of multi-view +supervision for single-view images, we design a depth-aware training +approach, incorporating a depth-aware discriminator to guide the geometry priors +through depth maps. Experiments demonstrate the effectiveness of our method in +terms of both qualitative and quantitative results. + +
+
+ comment: CVPR 2024 Accepted Paper +
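As background for the latent-code truncation mentioned above, the sketch below shows the generic GAN "truncation trick", which pulls sampled latent codes toward the average latent to trade diversity for fidelity; the psi value, averaging size, and stand-in mapping network are assumptions rather than G-NeRF's exact procedure.

```python
# Generic GAN truncation-trick sketch (illustrative, not G-NeRF's exact method).
import torch

def truncated_latents(mapping_net, n, z_dim=512, psi=0.7, n_avg=10000):
    with torch.no_grad():
        w_avg = mapping_net(torch.randn(n_avg, z_dim)).mean(dim=0, keepdim=True)
        w = mapping_net(torch.randn(n, z_dim))
    return w_avg + psi * (w - w_avg)  # pull each sample toward the mean latent

# Stand-in mapping network, purely for demonstration
mapping = torch.nn.Sequential(torch.nn.Linear(512, 512), torch.nn.ReLU(), torch.nn.Linear(512, 512))
print(truncated_latents(mapping, n=4).shape)  # torch.Size([4, 512])
```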
+
+
+
+
+ + ☆ LUCF-Net: Lightweight U-shaped Cascade Fusion Network for Medical Image + Segmentation + + +
+ In this study, the performance of existing U-shaped neural network +architectures was enhanced for medical image segmentation by adding +Transformer modules. Although Transformer architectures are powerful at extracting +global information, their ability to capture local information is limited due to +their high complexity. To address this challenge, we proposed a new lightweight +U-shaped cascade fusion network (LUCF-Net) for medical image segmentation. It +utilized an asymmetrical structural design and incorporated both local and +global modules to enhance its capacity for local and global modeling. +Additionally, a multi-layer cascade fusion decoding network was designed to +further bolster the network's information fusion capabilities. Validation +results achieved on multi-organ datasets in CT format, cardiac segmentation +datasets in MRI format, and dermatology datasets in image format demonstrated +that the proposed model outperformed other state-of-the-art methods in handling +local-global information, achieving an improvement of 1.54% in Dice coefficient +and 2.6 mm in Hausdorff distance on multi-organ segmentation. Furthermore, as a +network that combines Convolutional Neural Network and Transformer +architectures, it achieves competitive segmentation performance with only 6.93 +million parameters and 6.6 giga floating-point operations (GFLOPs), without the +need for pre-training. In summary, the proposed method demonstrated enhanced +performance while retaining a simpler model design compared to other +Transformer-based segmentation networks. + +
+
+
+
+
+ + ☆ Trashbusters: Deep Learning Approach for Litter Detection and Tracking + + +
+ The illegal disposal of trash is a major public health and environmental +concern. Disposing of trash in unplanned places poses serious health and +environmental risks, and such unplanned disposal should be discouraged as much as +possible. This research focuses on automating the penalization of litterbugs, +addressing the persistent problem of littering in public places. Traditional +approaches relying on manual intervention and witness reporting suffer from +delays, inaccuracies, and anonymity issues. To overcome these challenges, this +paper proposes a fully automated system that utilizes surveillance cameras and +advanced computer vision algorithms for litter detection, object tracking, and +face recognition. The system accurately identifies and tracks individuals +engaged in littering activities, establishes their identities through face +recognition, and enables efficient enforcement of anti-littering policies. By +reducing reliance on manual intervention, minimizing human error, and providing +prompt identification, the proposed system offers significant advantages in +addressing littering incidents. The primary contribution of this research lies +in the implementation of the proposed system, leveraging advanced technologies +to enhance surveillance operations and automate the penalization of litterbugs. + +
+
+
+
+
+ + ☆ Learning to Localize Objects Improves Spatial Reasoning in Visual-LLMs + + +
+ Integration of Large Language Models (LLMs) into visual domain tasks, +resulting in visual-LLMs (V-LLMs), has enabled exceptional performance in +vision-language tasks, particularly for visual question answering (VQA). +However, existing V-LLMs (e.g., BLIP-2, LLaVA) demonstrate weak spatial +reasoning and localization awareness. Despite generating highly descriptive and +elaborate textual answers, these models fail at simple tasks like +distinguishing a left vs. right location. In this work, we explore how +image-space, coordinate-based instruction fine-tuning objectives could inject +spatial awareness into V-LLMs. We discover optimal coordinate representations, +data-efficient instruction fine-tuning objectives, and pseudo-data generation +strategies that lead to improved spatial awareness in V-LLMs. Additionally, our +resulting model improves VQA across image and video domains, reduces undesired +hallucination, and generates better contextual object descriptions. Experiments +across 5 vision-language tasks involving 14 different datasets establish the +clear performance improvements achieved by our proposed framework. + +
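To illustrate what a textual coordinate representation can look like, the sketch below converts a pixel bounding box into quantized, normalized coordinate text for instruction tuning; quantizing to a 0-100 integer grid is one common convention and is only an assumption here, not necessarily the representation the paper finds optimal.

```python
# Hedged sketch: serialize a bounding box as normalized integer coordinates,
# one plausible textual format for coordinate-based instruction tuning.
def box_to_text(box, img_w, img_h, bins=100):
    x1, y1, x2, y2 = box
    q = lambda v, size: max(0, min(bins, round(v / size * bins)))
    return f"[{q(x1, img_w)}, {q(y1, img_h)}, {q(x2, img_w)}, {q(y2, img_h)}]"

# e.g. a 640x480 frame with an object box on the left side of the image
print(box_to_text((32, 120, 208, 330), 640, 480))  # -> "[5, 25, 32, 69]"
```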
+
+
+
+
+ + ☆ Transferable and Principled Efficiency for Open-Vocabulary Segmentation + + +
+ The recent success of pre-trained foundation vision-language models makes +Open-Vocabulary Segmentation (OVS) possible. Despite the promising performance, +this approach introduces heavy computational overhead stemming from two challenges: 1) +the large model size of the backbone; 2) the expensive cost of fine-tuning. +These challenges hinder this OVS strategy from being widely applicable and +affordable in real-world scenarios. Although traditional methods such as model +compression and efficient fine-tuning can address these challenges, they often +rely on heuristics. This means that their solutions cannot be easily +transferred and necessitate re-training on different models, which comes at a +cost. In the context of efficient OVS, we aim to achieve performance that is +comparable to or even better than prior OVS works based on large +vision-language foundation models, while utilizing smaller models that incur lower +training costs. The core strategy is to make our efficiency principled and thus +seamlessly transferable from one OVS framework to others without further +customization. Comprehensive experiments on diverse OVS benchmarks demonstrate +our superior trade-off between segmentation accuracy and computation costs over +previous works. Our code is available at https://github.com/Xujxyang/OpenTrans + +
+
+
+
+
+ + ☆ Multi-view Aggregation Network for Dichotomous Image Segmentation CVPR2024 + + +
+ Dichotomous Image Segmentation (DIS) has recently emerged, targeting +high-precision object segmentation from high-resolution natural images. + When designing an effective DIS model, the main challenge is how to balance +the semantic dispersion of high-resolution targets in the small receptive field +and the loss of high-precision details in the large receptive field. Existing +methods rely on tedious multiple encoder-decoder streams and stages to +gradually complete the global localization and local refinement. + The human visual system captures regions of interest by observing them from +multiple views. Inspired by this, we model DIS as a multi-view object perception +problem and provide a parsimonious multi-view aggregation network (MVANet), +which unifies the feature fusion of the distant view and close-up view into a +single stream with one encoder-decoder structure. With the help of the proposed +multi-view complementary localization and refinement modules, our approach +establishes long-range, profound visual interactions across multiple views, +allowing the features of the detailed close-up view to focus on highly slender +structures. Experiments on the popular DIS-5K dataset show that our MVANet +significantly outperforms state-of-the-art methods in both accuracy and speed. +The source code and datasets will be publicly available at +https://github.com/qianyu-dlut/MVANet. + +
+
+ comment: Accepted by CVPR2024 as Highlight +
+
+
+
+
+ + ☆ Encoding Urban Ecologies: Automated Building Archetype Generation + through Self-Supervised Learning for Energy Modeling + + +
+ As the global population and urbanization expand, the building sector has +emerged as the predominant energy consumer and carbon emission contributor. The +need for innovative Urban Building Energy Modeling grows, yet existing building +archetypes often fail to capture the unique attributes of local buildings and +the nuanced distinctions between different cities, jeopardizing the precision +of energy modeling. This paper presents an alternative tool employing +self-supervised learning to distill complex geometric data into representative, +locale-specific archetypes. This study attempts to foster a new paradigm of +interaction with built environments, incorporating local parameters to conduct +bespoke energy simulations at the community level. The catered archetypes can +augment the precision and applicability of energy consumption modeling at +different scales across diverse building inventories. This tool provides a +potential solution that encourages the exploration of emerging local ecologies. +By integrating building envelope characteristics and cultural granularity into +the building archetype generation process, we seek a future where architecture +and urban design are intricately interwoven with the energy sector in shaping +our built environments. + +
+
+
+
+
+ + ☆ CopilotCAD: Empowering Radiologists with Report Completion Models and + Quantitative Evidence from Medical Image Foundation Models + + +
+ Computer-aided diagnosis systems hold great promise to aid radiologists and +clinicians in radiological clinical practice and enhance diagnostic accuracy +and efficiency. However, the conventional systems primarily focus on delivering +diagnostic results through text report generation or medical image +classification, positioning them as standalone decision-makers rather than +helpers and ignoring radiologists' expertise. This study introduces an +innovative paradigm to create an assistive co-pilot system for empowering +radiologists by leveraging Large Language Models (LLMs) and medical image +analysis tools. Specifically, we develop a collaborative framework to integrate +LLMs and quantitative medical image analysis results generated by foundation +models with radiologists in the loop, achieving efficient and safe generation +of radiology reports and effective utilization of computational power of AI and +the expertise of medical professionals. This approach empowers radiologists to +generate more precise and detailed diagnostic reports, enhancing patient +outcomes while reducing the burnout of clinicians. Our methodology underscores +the potential of AI as a supportive tool in medical diagnostics, promoting a +harmonious integration of technology and human expertise to advance the field +of radiology. + +
+
+
+
+
+ + ☆ Improving Shift Invariance in Convolutional Neural Networks with + Translation Invariant Polyphase Sampling + + +
+ Downsampling operators break the shift invariance of convolutional neural +networks (CNNs) and this affects the robustness of features learned by CNNs +when dealing with even small pixel-level shifts. Through a large-scale +correlation analysis framework, we study shift invariance of CNNs by inspecting +existing downsampling operators in terms of their maximum-sampling bias (MSB), +and find that MSB is negatively correlated with shift invariance. Based on this +crucial insight, we propose a learnable pooling operator called Translation +Invariant Polyphase Sampling (TIPS) and two regularizations on the intermediate +feature maps of TIPS to reduce MSB and learn translation-invariant +representations. TIPS can be integrated into any CNN and can be trained +end-to-end with marginal computational overhead. Our experiments demonstrate +that TIPS results in consistent performance gains in terms of accuracy, shift +consistency, and shift fidelity on multiple benchmarks for image classification +and semantic segmentation compared to previous methods and also leads to +improvements in adversarial and distributional robustness. TIPS results in the +lowest MSB compared to all previous methods, thus explaining our strong +empirical results. + +
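To ground the polyphase idea, the sketch below enumerates the four sampling phases of a stride-2 grid and keeps the one with the largest energy; this max-norm selection is a simplified stand-in for the learnable, regularized selection that TIPS proposes, so treat it as an illustration of the general mechanism only.

```python
# Simplified polyphase downsampling: pick the sampling phase with the largest
# energy instead of always phase (0, 0). TIPS learns this choice; the max-norm
# rule here is only an illustrative stand-in.
import torch

def polyphase_downsample(x, stride=2):
    # x: (B, C, H, W). Enumerate the stride*stride polyphase components.
    phases = [x[:, :, i::stride, j::stride] for i in range(stride) for j in range(stride)]
    energies = torch.stack([p.flatten(1).norm(dim=1) for p in phases], dim=1)  # (B, s*s)
    best = energies.argmax(dim=1)  # per-sample phase choice
    return torch.stack([phases[k][b] for b, k in enumerate(best.tolist())], dim=0)

x = torch.randn(2, 8, 16, 16)
print(polyphase_downsample(x).shape)  # torch.Size([2, 8, 8, 8])
```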
+
+
+
+
+ + ☆ Simplifying Two-Stage Detectors for On-Device Inference in Remote + Sensing + + +
+ Deep learning has been successfully applied to object detection from remotely +sensed images. Images are typically processed on the ground rather than +on-board due to the greater computation power of the ground system. Such offloaded +processing causes delays in acquiring target mission information, which hinders +its application to real-time use cases. For on-device object detection, +research has been conducted on designing efficient detectors and on model +compression to reduce inference latency. However, highly accurate two-stage +detectors still require further work to be accelerated. In this paper, we +propose a model simplification method for two-stage object detectors. Instead +of constructing a general feature pyramid, we utilize only a single feature +extraction level in the two-stage detector. To compensate for the accuracy drop, we +apply a high-pass filter to the RPN's score map. Our approach is applicable to +any two-stage detector using a feature pyramid network. In experiments with +state-of-the-art two-stage detectors such as ReDet, Oriented-RCNN, and LSKNet, +our method reduced computation costs by up to 61.2% with an accuracy loss within +2.1% on the DOTAv1.5 dataset. Source code will be released. + +
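The sketch below illustrates the kind of operation described for the RPN score map: a simple high-pass (Laplacian-style) filter that sharpens peaked objectness responses; the kernel and blending weight are assumptions, not the authors' exact filter.

```python
# Illustrative high-pass filtering of an RPN objectness score map; the kernel
# and alpha blending are assumptions, not the paper's exact design.
import torch
import torch.nn.functional as F

def high_pass(score_map, alpha=1.0):
    # score_map: (B, 1, H, W) RPN objectness scores.
    kernel = torch.tensor([[0., -1., 0.], [-1., 4., -1.], [0., -1., 0.]]).view(1, 1, 3, 3)
    hp = F.conv2d(score_map, kernel, padding=1)
    return score_map + alpha * hp  # emphasize high-frequency (peaked) responses

scores = torch.rand(1, 1, 64, 64)
print(high_pass(scores).shape)  # torch.Size([1, 1, 64, 64])
```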
+
+
+
+
+ + ☆ Post-hurricane building damage assessment using street-view imagery and + structured data: A multi-modal deep learning approach + + +
+ Accurately assessing building damage is critical for disaster response and +recovery. However, many existing models for detecting building damage have poor +prediction accuracy due to their limited capability to identify detailed, +comprehensive structural and/or non-structural damage from street-view +images. Additionally, these models mainly rely on the imagery data for damage +classification, failing to account for other critical information, such as wind +speed, building characteristics, evacuation zones, and the distance of the building +to the hurricane track. To address these limitations, in this study, we propose +a novel multi-modal (i.e., imagery and structured data) approach for +post-hurricane building damage classification, named the Multi-Modal Swin +Transformer (MMST). We empirically train and evaluate the proposed MMST using +data collected from the 2022 Hurricane Ian in Florida, USA. Results show that +MMST outperforms all selected state-of-the-art benchmark models and can achieve +an accuracy of 92.67%, which is a 7.71% improvement in accuracy compared to +Visual Geometry Group 16 (VGG-16). In addition to the street-view imagery data, +building value, building age, and wind speed are the most important predictors +for damage level classification. The proposed MMST can be deployed to assist in +rapid damage assessment and guide reconnaissance efforts in future hurricanes. + +
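The sketch below shows the general image-plus-tabular fusion pattern the abstract describes: a vision backbone for the street-view image and a small MLP for the structured predictors, concatenated before the damage-level classifier. The torchvision Swin-Tiny backbone, feature dimensions, and number of classes are assumptions, not MMST's actual architecture.

```python
# Hedged sketch of late fusion between image features and tabular features for
# damage-level classification; dimensions and backbone are illustrative only.
import torch
import torch.nn as nn
from torchvision.models import swin_t

class MultiModalDamageClassifier(nn.Module):
    def __init__(self, num_tabular=8, num_classes=4):
        super().__init__()
        self.backbone = swin_t(weights=None)
        self.backbone.head = nn.Identity()  # expose 768-d image features
        self.tabular = nn.Sequential(nn.Linear(num_tabular, 64), nn.ReLU(), nn.Linear(64, 64))
        self.classifier = nn.Linear(768 + 64, num_classes)

    def forward(self, image, tabular):
        feats = torch.cat([self.backbone(image), self.tabular(tabular)], dim=1)
        return self.classifier(feats)

model = MultiModalDamageClassifier()
logits = model(torch.randn(2, 3, 224, 224), torch.randn(2, 8))
print(logits.shape)  # torch.Size([2, 4])
```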
+
+
+
+
+ + ☆ Global versus Local: Evaluating AlexNet Architectures for Tropical + Cyclone Intensity Estimation + + +
+ Given the destructive impacts of tropical cyclones, it is critical to have a +reliable system for cyclone intensity detection. Various techniques are +available for this purpose, each with differing levels of accuracy. In this +paper, we introduce two ensemble-based models based on the AlexNet architecture to +estimate tropical cyclone intensity using visible satellite images. The first +model, trained on the entire dataset, is called the global AlexNet model. The +second model is a distributed version of AlexNet in which multiple AlexNets are +trained separately on subsets of the training data categorized according to the +Saffir-Simpson wind speed scale prescribed by meteorologists. We evaluated +the performance of both models against a deep learning benchmark model called +Deepti using a publicly available cyclone image dataset. Results +indicate that both the global model (with a root mean square error (RMSE) of +9.03 knots) and the distributed model (with an RMSE of 9.3 knots) outperform the +benchmark model (with an RMSE of 13.62 knots). We provide a thorough discussion +of our solution approach, including an explanation of AlexNet's +performance using gradient class activation maps (Grad-CAM). Our proposed +solution strategy allows future experimentation with various deep learning +models in both single- and multi-channel settings. + +
+
+
+
+
+ + ☆ SciFlow: Empowering Lightweight Optical Flow Models with Self-Cleaning + Iterations CVPR + + +
+ Optical flow estimation is crucial to a variety of vision tasks. Despite +substantial recent advancements, achieving real-time on-device optical flow +estimation remains a complex challenge. First, an optical flow model must be +sufficiently lightweight to meet computation and memory constraints to ensure +real-time performance on devices. Second, the necessity for real-time on-device +operation imposes constraints that weaken the model's capacity to adequately +handle ambiguities in flow estimation, thereby intensifying the difficulty of +preserving flow accuracy. This paper introduces two synergistic techniques, +Self-Cleaning Iteration (SCI) and Regression Focal Loss (RFL), designed to +enhance the capabilities of optical flow models, with a focus on addressing +optical flow regression ambiguities. These techniques prove particularly +effective in mitigating error propagation, a prevalent issue in optical flow +models that employ iterative refinement. Notably, these techniques add +negligible to zero overhead in model parameters and inference latency, thereby +preserving real-time on-device efficiency. The effectiveness of our proposed +SCI and RFL techniques, collectively referred to as SciFlow for brevity, is +demonstrated across two distinct lightweight optical flow model architectures +in our experiments. Remarkably, SciFlow enables substantial reduction in error +metrics (EPE and Fl-all) over the baseline models by up to 6.3% and 10.5% for +in-domain scenarios and by up to 6.2% and 13.5% for cross-domain scenarios on +the Sintel and KITTI 2015 datasets, respectively. + +
+
+ comment: CVPRW 2024 +
+
+
+
+
+ + ☆ Self-Supervised Learning of Color Constancy + + +
+ Color constancy (CC) describes the ability of the visual system to perceive +an object as having a relatively constant color despite changes in lighting +conditions. While CC and its limitations have been carefully characterized in +humans, it is still unclear how the visual system acquires this ability during +development. Here, we present a first study showing that CC develops in a +neural network trained in a self-supervised manner through an invariance +learning objective. During learning, objects are presented under changing +illuminations, while the network aims to map subsequent views of the same +object onto close-by latent representations. This gives rise to representations +that are largely invariant to the illumination conditions, offering a plausible +example of how CC could emerge during human cognitive development via a form of +self-supervised learning. + +
+
+ comment: 7 pages, 5 figures, submitted to the IEEE International Conference on + Development and Learning (ICDL 2024) +
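A minimal version of the invariance objective described above appears in the sketch below: embeddings of the same object under two illuminations are pulled together with a negative-cosine loss. This is a plain SimSiam/BYOL-style similarity term without predictor or stop-gradient machinery, used purely for illustration; the encoder is a stand-in.

```python
# Hedged sketch of an illumination-invariance objective: embeddings of the same
# object under two lighting conditions are pulled together.
import torch
import torch.nn.functional as F

def invariance_loss(encoder, view_illum_a, view_illum_b):
    za = F.normalize(encoder(view_illum_a), dim=1)
    zb = F.normalize(encoder(view_illum_b), dim=1)
    return -(za * zb).sum(dim=1).mean()  # maximize cosine similarity

# Stand-in encoder and random "views" purely for demonstration
encoder = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 128))
loss = invariance_loss(encoder, torch.rand(16, 3, 32, 32), torch.rand(16, 3, 32, 32))
print(loss.item())
```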
+
+
+
+
+ + ☆ S3Editor: A Sparse Semantic-Disentangled Self-Training Framework for + Face Video Editing + + +
+ Face attribute editing plays a pivotal role in various applications. However, +existing methods encounter challenges in achieving high-quality results while +preserving identity, editing faithfulness, and temporal consistency. These +challenges are rooted in issues related to the training pipeline, including +limited supervision, architecture design, and optimization strategy. In this +work, we introduce S3Editor, a Sparse Semantic-disentangled Self-training +framework for face video editing. S3Editor is a generic solution that +comprehensively addresses these challenges with three key contributions. +Firstly, S3Editor adopts a self-training paradigm to enhance the training +process through semi-supervision. Secondly, we propose a semantic-disentangled +architecture with a dynamic routing mechanism that accommodates diverse editing +requirements. Thirdly, we present a structured sparse optimization schema that +identifies and deactivates malicious neurons to further disentangle impacts +from untargeted attributes. S3Editor is model-agnostic and compatible with +various editing approaches. Our extensive qualitative and quantitative results +affirm that our approach significantly enhances identity preservation, editing +fidelity, as well as temporal consistency. + +
+
+
+
+
+ + ☆ Visual Context-Aware Person Fall Detection + + +
+ As the global population ages, the number of fall-related incidents is on the +rise. Effective fall detection systems, specifically in the healthcare sector, are +crucial to mitigating the risks associated with such events. This study evaluates +the role of visual context, including background objects, in the accuracy of +fall detection classifiers. We present a segmentation pipeline to +semi-automatically separate individuals and objects in images. Well-established +models like ResNet-18, EfficientNetV2-S, and Swin-Small are trained and +evaluated. During training, pixel-based transformations are applied to +segmented objects, and the models are then evaluated on raw images without +segmentation. Our findings highlight the significant influence of visual +context on fall detection. The application of Gaussian blur to the image +background notably improves the performance and generalization capabilities of +all models. Background objects such as beds, chairs, or wheelchairs can +challenge fall detection systems, leading to false positive alarms. However, we +demonstrate that object-specific contextual transformations during training +effectively mitigate this challenge. Further analysis using saliency maps +supports our observation that visual context is crucial in classification +tasks. We provide both a dataset processing API and the segmentation pipeline, +available at https://github.com/A-NGJ/image-segmentation-cli. + +
+
+ comment: 10 pages, 6 figures, KES IDT-24 conference +
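The background-blur transformation discussed above can be sketched as follows: blur the whole frame with a Gaussian kernel, then paste the unblurred person region back using a segmentation mask. The kernel size and the box-shaped mask are illustrative assumptions, not the study's exact pipeline.

```python
# Hedged sketch of blurring the background while keeping the segmented person
# sharp; kernel size and the toy mask are assumptions.
import torch
from torchvision.transforms.functional import gaussian_blur

def blur_background(image, person_mask, kernel_size=21):
    # image: (3, H, W) float tensor; person_mask: (1, H, W), 1 where the person is.
    blurred = gaussian_blur(image, kernel_size=[kernel_size, kernel_size])
    return person_mask * image + (1 - person_mask) * blurred

img = torch.rand(3, 240, 320)
mask = torch.zeros(1, 240, 320)
mask[:, 60:180, 100:200] = 1.0  # pretend this box is the segmented person
print(blur_background(img, mask).shape)  # torch.Size([3, 240, 320])
```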
+
+
+
+
+ + ☆ Real-Time Detection and Analysis of Vehicles and Pedestrians using Deep + Learning + + +
+ Computer vision, particularly vehicle and pedestrian identification, is +critical to the evolution of autonomous driving, artificial intelligence, and +video surveillance. Current traffic monitoring systems confront major +difficulties in recognizing small objects and pedestrians effectively in +real time, posing a serious risk to public safety and contributing to traffic +inefficiency. Recognizing these difficulties, our project focuses on the +creation and validation of an advanced deep-learning framework capable of +processing complex visual input for precise, real-time recognition of cars and +people in a variety of environmental situations. On a dataset representing +complicated urban settings, we trained and evaluated different versions of the +YOLOv8 and RT-DETR models. The YOLOv8 Large version proved to be the most +effective, especially in pedestrian recognition, with great precision and +robustness. The results, which include Mean Average Precision and recall rates, +demonstrate the model's ability to dramatically improve traffic monitoring and +safety. This study makes an important contribution to real-time, reliable detection +in computer vision, establishing new benchmarks for traffic management systems. + +
+
+ comment: 5 pages, 2 figures +
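For reference, running an off-the-shelf YOLOv8 Large detector with the Ultralytics package looks roughly like the sketch below; the image path and confidence threshold are placeholders, and this is inference only, not the authors' training or evaluation setup.

```python
# Minimal YOLOv8 inference sketch with the Ultralytics API (not the paper's
# training pipeline); the image path and threshold are placeholders.
from ultralytics import YOLO

model = YOLO("yolov8l.pt")                      # YOLOv8 Large, pretrained weights
results = model("street_scene.jpg", conf=0.25)  # run detection on one image
for box in results[0].boxes:
    print(int(box.cls), float(box.conf), box.xyxy.tolist())  # class id, score, corners
```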
+
+
+
+
+ + ☆ DIMAT: Decentralized Iterative Merging-And-Training for Deep Learning + Models CVPR 2024 + + +
+ Recent advances in decentralized deep learning algorithms have demonstrated +cutting-edge performance on various tasks with large pre-trained models. +However, achieving this level of competitiveness comes with +significant communication and computation overheads when updating these +models, which prohibits their application to real-world scenarios. To +address this issue, drawing inspiration from advanced model merging techniques +that require no additional training, we introduce the Decentralized Iterative +Merging-And-Training (DIMAT) paradigm--a novel decentralized deep learning +framework. Within DIMAT, each agent is trained on its local data and +periodically merged with its neighboring agents using advanced model merging +techniques like activation matching until convergence is achieved. DIMAT +provably converges with the best available rate for nonconvex functions with +various first-order methods, while yielding tighter error bounds compared to +popular existing approaches. We conduct a comprehensive empirical analysis +to validate DIMAT's superiority over baselines across diverse computer vision +tasks sourced from multiple datasets. Empirical results validate our +theoretical claims by showing that DIMAT attains a faster and higher initial gain +in accuracy with independent and identically distributed (IID) and non-IID +data, while incurring lower communication overhead. This DIMAT paradigm presents a +new opportunity for future decentralized learning, enhancing its +adaptability to real-world scenarios with sparse and lightweight communication and +computation. + +
+
+ comment: CVPR 2024 accepted paper, 22 pages, 12 figures +
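A heavily simplified version of the "train locally, then merge with neighbors" loop appears below. Plain parameter averaging stands in for the paper's activation-matching merge, purely to show the communication pattern; the topology and the stand-in models are assumptions.

```python
# Hedged sketch of periodic neighbor merging in decentralized training; plain
# parameter averaging replaces DIMAT's activation-matching merge.
import copy
import torch

def merge_with_neighbors(models, topology):
    """models: list of nn.Modules (one per agent); topology: dict agent -> neighbor ids."""
    snapshots = [copy.deepcopy(m.state_dict()) for m in models]
    for i, model in enumerate(models):
        group = [i] + list(topology[i])
        merged = {k: torch.stack([snapshots[j][k].float() for j in group]).mean(0)
                  for k in snapshots[i]}
        model.load_state_dict(merged)

agents = [torch.nn.Linear(10, 2) for _ in range(3)]
merge_with_neighbors(agents, topology={0: [1], 1: [0, 2], 2: [1]})  # small line graph
```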
+
+
+
+
+ + ☆ Latent Guard: a Safety Framework for Text-to-image Generation + + +
+ With the ability to generate high-quality images, text-to-image (T2I) models +can be exploited for creating inappropriate content. To prevent misuse, +existing safety measures are either based on text blacklists, which can be +easily circumvented, or harmful content classification, requiring large +datasets for training and offering low flexibility. Hence, we propose Latent +Guard, a framework designed to improve safety measures in text-to-image +generation. Inspired by blacklist-based approaches, Latent Guard learns a +latent space on top of the T2I model's text encoder, where it is possible to +check the presence of harmful concepts in the input text embeddings. Our +proposed framework is composed of a data generation pipeline specific to the +task using large language models, ad-hoc architectural components, and a +contrastive learning strategy to benefit from the generated data. The +effectiveness of our method is verified on three datasets and against four +baselines. Code and data will be shared at +https://github.com/rt219/LatentGuard. + +
+
+ comment: under review +
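The core check described above can be sketched as follows: project the prompt embedding into the learned latent space and flag it if it lies too close to any harmful-concept embedding. The projection head, concept bank, and threshold below are placeholders standing in for the components Latent Guard learns.

```python
# Hedged sketch of a concept-proximity check in a learned latent space; all
# components here are random placeholders, not Latent Guard's trained modules.
import torch
import torch.nn.functional as F

def is_blocked(prompt_embedding, concept_bank, projection, threshold=0.75):
    z = F.normalize(projection(prompt_embedding), dim=-1)     # (D,)
    concepts = F.normalize(projection(concept_bank), dim=-1)  # (K, D)
    return (concepts @ z).max().item() > threshold

projection = torch.nn.Linear(512, 128)  # stands in for the learned projection head
concept_bank = torch.randn(32, 512)     # stands in for harmful-concept embeddings
prompt_emb = torch.randn(512)           # stands in for the T2I text-encoder output
print(is_blocked(prompt_emb, concept_bank, projection))
```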
+
+
+
+
+ + ☆ Rethinking Artistic Copyright Infringements in the Era of Text-to-Image + Generative Models + + +
+ Recent text-to-image generative models such as Stable Diffusion are extremely +adept at mimicking and generating copyrighted content, raising concerns amongst +artists that their unique styles may be improperly copied. Understanding how +generative models copy "artistic style" is more complex than duplicating a +single image, as style comprises a set of elements (or a signature) that +frequently co-occur across a body of work, where each individual work may vary +significantly. In our paper, we first reformulate the problem of "artistic +copyright infringement" as a classification problem over image sets, instead of +probing image-wise similarities. We then introduce ArtSavant, a practical +(i.e., efficient and easy to understand) tool to (i) determine the unique style +of an artist by comparing it to a reference dataset of works from 372 artists +curated from WikiArt, and (ii) recognize if the identified style reappears in +generated images. We leverage two complementary methods to perform artistic +style classification over image sets, including TagMatch, which is a novel, +inherently interpretable and attributable method, making it more suitable for +broader use by non-technical stakeholders (artists, lawyers, judges, etc.). +Leveraging ArtSavant, we then perform a large-scale empirical study to provide +quantitative insight on the prevalence of artistic style copying across 3 +popular text-to-image generative models. Namely, amongst a dataset of prolific +artists (including many famous ones), only 20% of them appear to have their +styles at risk of copying via simple prompting of today's popular +text-to-image generative models. + +
+
+
+
+
+ + ☆ SurvMamba: State Space Model with Multi-grained Multi-modal Interaction + for Survival Prediction + + +
+ Multi-modal learning that combines pathological images with genomic data has +significantly enhanced the accuracy of survival prediction. Nevertheless, +existing methods have not fully utilized the inherent hierarchical structure +within both whole slide images (WSIs) and transcriptomic data, from which +better intra-modal representations and inter-modal integration could be +derived. Moreover, many existing studies attempt to improve multi-modal +representations through attention mechanisms, which inevitably lead to high +complexity when processing high-dimensional WSIs and transcriptomic data. +Recently, a structured state space model named Mamba emerged as a promising +approach for its superior performance in modeling long sequences with low +complexity. In this study, we propose Mamba with multi-grained multi-modal +interaction (SurvMamba) for survival prediction. SurvMamba is implemented with +a Hierarchical Interaction Mamba (HIM) module that facilitates efficient +intra-modal interactions at different granularities, thereby capturing more +detailed local features as well as rich global representations. In addition, an +Interaction Fusion Mamba (IFM) module is used for cascaded inter-modal +interactive fusion, yielding more comprehensive features for survival +prediction. Comprehensive evaluations on five TCGA datasets demonstrate that +SurvMamba outperforms other existing methods in terms of performance and +computational cost. + +
+
+
+
+
+ + ♻ ☆ Supervised Fine-tuning in turn Improves Visual Foundation Models + + +
+ Image-text training like CLIP has dominated the pretraining of vision +foundation models in recent years. Subsequent efforts have been made to +introduce region-level visual learning into CLIP's pretraining but face +scalability challenges due to the lack of large-scale region-level datasets. +Drawing inspiration from supervised fine-tuning (SFT) in natural language +processing, such as instruction tuning, we explore the potential of fine-grained +SFT in enhancing the generation of vision foundation models after their +pretraining. Thus, a two-stage method, ViSFT (Vision SFT), is proposed to unleash +the fine-grained knowledge of vision foundation models. In ViSFT, the vision +foundation model is enhanced by performing visual joint learning on some +in-domain tasks and is then tested on out-of-domain benchmarks. After updating +with ViSFT on 8 V100 GPUs in less than 2 days, a vision transformer with over +4.4B parameters shows improvements across various out-of-domain benchmarks, +including vision and vision-linguistic scenarios. + +
+
+ comment: 23 pages, 3 figures, Project page: + https://github.com/TencentARC/ViSFT/tree/main +
+
+
+
+
+ + ♻ ☆ Low-Resource Vision Challenges for Foundation Models CVPR2024 + + +
+ Low-resource settings are well-established in natural language processing, +where many languages lack sufficient data for deep learning at scale. However, +low-resource problems are under-explored in computer vision. In this paper, we +address this gap and explore the challenges of low-resource image tasks with +vision foundation models. We first collect a benchmark of genuinely +low-resource image data, covering historic maps, circuit diagrams, and +mechanical drawings. These low-resource settings all share three challenges: +data scarcity, fine-grained differences, and the distribution shift from +natural images to the specialized domain of interest. While existing foundation +models have shown impressive generalizability, we find they cannot transfer +well to our low-resource tasks. To begin to tackle the challenges of +low-resource vision, we introduce one simple baseline per challenge. +Specifically, we i) enlarge the data space by generative models, ii) adopt the +best sub-kernels to encode local regions for fine-grained difference discovery +and iii) learn attention for specialized domains. Experiments on our three +low-resource tasks demonstrate our proposals already provide a better baseline +than transfer learning, data augmentation, and fine-grained methods. This +highlights the unique characteristics and challenges of low-resource vision for +foundation models that warrant further investigation. Project page: +https://xiaobai1217.github.io/Low-Resource-Vision/. + +
+
+ comment: Accepted at CVPR2024 +
+
+
+
+
+ + ♻ ☆ EgoGen: An Egocentric Synthetic Data Generator CVPR 2024 + + +
+ Understanding the world in first-person view is fundamental in Augmented +Reality (AR). This immersive perspective brings dramatic visual changes and +unique challenges compared to third-person views. Synthetic data has empowered +third-person-view vision models, but its application to embodied egocentric +perception tasks remains largely unexplored. A critical challenge lies in +simulating natural human movements and behaviors that effectively steer the +embodied cameras to capture a faithful egocentric representation of the 3D +world. To address this challenge, we introduce EgoGen, a new synthetic data +generator that can produce accurate and rich ground-truth training data for +egocentric perception tasks. At the heart of EgoGen is a novel human motion +synthesis model that directly leverages egocentric visual inputs of a virtual +human to sense the 3D environment. Combined with collision-avoiding motion +primitives and a two-stage reinforcement learning approach, our motion +synthesis model offers a closed-loop solution where the embodied perception and +movement of the virtual human are seamlessly coupled. Compared to previous +works, our model eliminates the need for a pre-defined global path, and is +directly applicable to dynamic environments. Combined with our easy-to-use and +scalable data generation pipeline, we demonstrate EgoGen's efficacy in three +tasks: mapping and localization for head-mounted cameras, egocentric camera +tracking, and human mesh recovery from egocentric views. EgoGen will be fully +open-sourced, offering a practical solution for creating realistic egocentric +training data and aiming to serve as a useful tool for egocentric computer +vision research. Refer to our project page: https://ego-gen.github.io/. + +
+
+ comment: Accepted by CVPR 2024 (Oral). 23 pages, 17 figures. Project page: + https://ego-gen.github.io/ +
+
+
+
+
+ + ♻ ☆ MambaAD: Exploring State Space Models for Multi-class Unsupervised + Anomaly Detection + + +
+ Recent advancements in anomaly detection have seen the efficacy of CNN- and +transformer-based approaches. However, CNNs struggle with long-range +dependencies, while transformers are burdened by quadratic computational +complexity. Mamba-based models, with their superior long-range modeling and +linear efficiency, have garnered substantial attention. This study pioneers the +application of Mamba to multi-class unsupervised anomaly detection, presenting +MambaAD, which consists of a pre-trained encoder and a Mamba decoder featuring +Locality-Enhanced State Space (LSS) modules at multiple scales. The proposed LSS +module, integrating parallel cascaded Hybrid State Space (HSS) blocks and +multi-kernel convolution operations, effectively captures both long-range and +local information. The HSS block, utilizing Hybrid Scanning (HS) encoders, +encodes feature maps using five scanning methods and eight directions, thereby +strengthening global connections through the State Space Model (SSM). The use +of Hilbert scanning and eight directions significantly improves feature +sequence modeling. Comprehensive experiments on six diverse anomaly detection +datasets and seven metrics demonstrate state-of-the-art performance, +substantiating the method's effectiveness. + +
+
+
+
+
+ + ♻ ☆ Uncertainty-aware Evidential Fusion-based Learning for Semi-supervised + Medical Image Segmentation + + +
+ Although existing uncertainty-based semi-supervised medical segmentation +methods have achieved excellent performance, they usually only consider a +single uncertainty evaluation, which often fails to solve the credibility problem +completely. Therefore, based on the framework of evidential deep +learning, this paper integrates the evidential predictive results in the +cross-region of mixed and original samples to reallocate the confidence degree +and uncertainty measure of each voxel, which is realized by emphasizing +uncertain information in the probability-assignment fusion rule of traditional +evidence theory. Furthermore, we design a voxel-level asymptotic learning +strategy that introduces information entropy and combines it with the fused +uncertainty measure to estimate voxel predictions more precisely. The model +gradually pays attention to predictions with high uncertainty during the +learning process, in order to learn the features that are difficult to master. The +experimental results on the LA, Pancreas-CT, ACDC and TBAD datasets demonstrate the +superior performance of our proposed method in comparison with existing +state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ Deep Learning for Event-based Vision: A Comprehensive Survey and + Benchmarks + + +
+ Event cameras are bio-inspired sensors that capture the per-pixel intensity +changes asynchronously and produce event streams encoding the time, pixel +position, and polarity (sign) of the intensity changes. Event cameras possess a +myriad of advantages over canonical frame-based cameras, such as high temporal +resolution, high dynamic range, low latency, etc. Being capable of capturing +information in challenging visual conditions, event cameras have the potential +to overcome the limitations of frame-based cameras in the computer vision and +robotics community. In very recent years, deep learning (DL) has been brought +to this emerging field and inspired active research endeavors in mining its +potential. However, there is still a lack of taxonomies in DL techniques for +event-based vision. We first scrutinize the typical event representations with +quality enhancement methods as they play a pivotal role as inputs to the DL +models. We then provide a comprehensive survey of existing DL-based methods by +structurally grouping them into two major categories: 1) image/video +reconstruction and restoration; 2) event-based scene understanding and 3D +vision. We conduct benchmark experiments for the existing methods in some +representative research directions, i.e., image reconstruction, deblurring, and +object recognition, to identify some critical insights and problems. Finally, +we have discussions regarding the challenges and provide new perspectives for +inspiring more research studies. + +
+
+
+
+
+ + ♻ ☆ MoCha-Stereo: Motif Channel Attention Network for Stereo Matching CVPR 2024 + + +
+ Learning-based stereo matching techniques have made significant progress. +However, existing methods inevitably lose geometrical structure information +during the feature channel generation process, resulting in edge detail +mismatches. In this paper, the Motif Channel Attention Stereo Matching Network +(MoCha-Stereo) is designed to address this problem. We provide the Motif +Channel Correlation Volume (MCCV) to determine more accurate edge matching +costs. MCCV is achieved by projecting motif channels, which capture common +geometric structures in feature channels, onto feature maps and cost volumes. +In addition, since edge variations in potential feature channels of the +reconstruction error map also affect detail matching, we propose the +Reconstruction Error Motif Penalty (REMP) module to further refine the +full-resolution disparity estimation. REMP integrates the frequency information +of typical channel features from the reconstruction error. MoCha-Stereo ranks +1st on the KITTI-2015 and KITTI-2012 Reflective leaderboards. Our structure +also shows excellent performance in Multi-View Stereo. Code is available at +https://github.com/ZYangChen/MoCha-Stereo. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Diffusion Time-step Curriculum for One Image to 3D Generation CVPR 2024 + + +
+ Score distillation sampling (SDS) has been widely adopted to overcome the +absence of unseen views in reconstructing 3D objects from a single +image. It leverages pre-trained 2D diffusion models as a teacher to guide the +reconstruction of student 3D models. Despite their remarkable success, +SDS-based methods often encounter geometric artifacts and texture saturation. +We find that the crux is the overlooked, indiscriminate treatment of diffusion +time-steps during optimization: it unreasonably treats the student-teacher +knowledge distillation as equal at all time-steps and thus entangles +coarse-grained and fine-grained modeling. Therefore, we propose the Diffusion +Time-step Curriculum one-image-to-3D pipeline (DTC123), which involves both the +teacher and student models collaborating with the time-step curriculum in a +coarse-to-fine manner. Extensive experiments on the NeRF4, RealFusion15, GSO and +Level50 benchmarks demonstrate that DTC123 can produce multi-view consistent, +high-quality, and diverse 3D assets. Codes and more generation demos will be +released at https://github.com/yxymessi/DTC123. + +
+
+ comment: Accepted to CVPR 2024 +
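One way to picture a coarse-to-fine time-step curriculum is sketched below: early in optimization, large (noisy, coarse-geometry) diffusion time-steps are sampled, and the range is annealed toward small (fine-detail) time-steps later. The linear annealing and the bounds are assumptions for illustration, not DTC123's actual schedule.

```python
# Hedged sketch of a coarse-to-fine diffusion time-step curriculum; the linear
# annealing and bounds are illustrative, not the paper's schedule.
import random

def sample_timestep(iteration, max_iters, t_max=980, t_min=20):
    frac = iteration / max_iters            # 0 at the start, 1 at the end
    hi = int(t_max - frac * (t_max - 400))  # upper bound shrinks over training
    lo = int(t_min + frac * (200 - t_min))  # lower bound rises slightly
    return random.randint(lo, hi)

print([sample_timestep(i, 10000) for i in (0, 5000, 9999)])
```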
+
+
+
+
+ + ♻ ☆ Exploring Masked Autoencoders for Sensor-Agnostic Image Retrieval in + Remote Sensing + + +
+ Self-supervised learning through masked autoencoders (MAEs) has recently +attracted great attention for remote sensing (RS) image representation +learning, and thus embodies a significant potential for content-based image +retrieval (CBIR) from ever-growing RS image archives. However, the existing +studies on MAEs in RS assume that the considered RS images are acquired by a +single image sensor, and thus are only suitable for uni-modal CBIR problems. +The effectiveness of MAEs for cross-sensor CBIR, which aims to search +semantically similar images across different image modalities, has not been +explored yet. In this paper, we take the first step to explore the +effectiveness of MAEs for sensor-agnostic CBIR in RS. To this end, we present a +systematic overview on the possible adaptations of the vanilla MAE to exploit +masked image modeling on multi-sensor RS image archives (denoted as +cross-sensor masked autoencoders [CSMAEs]). Based on different adjustments +applied to the vanilla MAE, we introduce different CSMAE models. We also +provide an extensive experimental analysis of these CSMAE models. We finally +derive a guideline to exploit masked image modeling for uni-modal and +cross-modal CBIR problems in RS. The code of this work is publicly available at +https://github.com/jakhac/CSMAE. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Our code is available at https://github.com/jakhac/CSMAE +
+
+
+
+
+ + ♻ ☆ An Autonomous Vision-Based Algorithm for Interplanetary Navigation + + +
+ The surge of deep-space probes makes it unsustainable to navigate them with +standard radiometric tracking. Self-driving interplanetary satellites represent +a solution to this problem. In this work, a full vision-based navigation +algorithm is built by combining an orbit determination method with an image +processing pipeline suitable for interplanetary transfers of autonomous +platforms. To increase the computational efficiency of the algorithm, a +non-dimensional extended Kalman filter is selected as the state estimator, fed by +the positions of the planets extracted from deep-space images. An enhancement +of the estimation accuracy is achieved by applying an optimal strategy to +select the best pair of planets to track. Moreover, a novel analytical +measurement model for deep-space navigation is developed, providing a +first-order approximation of the light-aberration and light-time effects. +Algorithm performance is tested on a high-fidelity Earth-Mars interplanetary +transfer, showing the algorithm's applicability to deep-space navigation. + +
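For intuition on the light-time part of such a measurement model, the sketch below applies the standard first-order retardation correction: the planet is observed where it was one light-travel time ago, so to first order r_apparent ≈ r(t) - v(t)·(range/c). This is the textbook correction (without aberration), not the paper's exact analytical model, and the planet states are illustrative numbers.

```python
# Standard first-order light-time correction for a line-of-sight measurement;
# illustrative only, not the paper's exact measurement model.
import numpy as np

C = 299792.458  # speed of light, km/s

def light_time_corrected_direction(r_planet, v_planet, r_spacecraft):
    rho = np.linalg.norm(r_planet - r_spacecraft)    # range, km
    r_apparent = r_planet - v_planet * (rho / C)     # first-order retardation
    los = r_apparent - r_spacecraft
    return los / np.linalg.norm(los)                 # unit line-of-sight vector

r_mars = np.array([2.0e8, 1.0e8, 0.0])   # km, illustrative heliocentric state
v_mars = np.array([-10.0, 20.0, 0.0])    # km/s
r_sc = np.array([1.2e8, 0.5e8, 0.0])     # km, spacecraft position
print(light_time_corrected_direction(r_mars, v_mars, r_sc))
```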
+
+
+
+
+ + ♻ ☆ Attention Calibration for Disentangled Text-to-Image Personalization CVPR 2024 + + +
+ Recent thrilling progress in large-scale text-to-image (T2I) models has +unlocked unprecedented synthesis quality of AI-generated content (AIGC) +including image generation, 3D and video composition. Further, personalized +techniques enable appealing customized production of a novel concept given only +several images as reference. However, an intriguing problem persists: Is it +possible to capture multiple, novel concepts from one single reference image? +In this paper, we identify that existing approaches fail to preserve visual +consistency with the reference image and eliminate cross-influence from +concepts. To alleviate this, we propose an attention calibration mechanism to +improve the concept-level understanding of the T2I model. Specifically, we +first introduce new learnable modifiers bound with classes to capture +attributes of multiple concepts. Then, the classes are separated and +strengthened following the activation of the cross-attention operation, +ensuring comprehensive and self-contained concepts. Additionally, we suppress +the attention activation of different classes to mitigate mutual influence +among concepts. Together, our proposed method, dubbed DisenDiff, can learn +disentangled multiple concepts from one single image and produce novel +customized images with learned concepts. We demonstrate that our method +outperforms the current state of the art in both qualitative and quantitative +evaluations. More importantly, our proposed techniques are compatible with LoRA +and inpainting pipelines, enabling more interactive experiences. + +
+
+ comment: CVPR 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ A Deep Learning Method for Simultaneous Denoising and Missing Wedge + Reconstruction in Cryogenic Electron Tomography + + +
+ Cryogenic electron tomography is a technique for imaging biological samples +in 3D. A microscope collects a series of 2D projections of the sample, and the +goal is to reconstruct the 3D density of the sample called the tomogram. +Reconstruction is difficult as the 2D projections are noisy and can not be +recorded from all directions, resulting in a missing wedge of information. +Tomograms conventionally reconstructed with filtered back-projection suffer +from noise and strong artifacts due to the missing wedge. Here, we propose a +deep-learning approach for simultaneous denoising and missing wedge +reconstruction called DeepDeWedge. The algorithm requires no ground truth data +and is based on fitting a neural network to the 2D projections using a +self-supervised loss. DeepDeWedge performs better than CryoCARE and IsoNet, +which are state-of-the-art methods for denoising and missing wedge +reconstruction, and similarly and, in some cases, better than the combination +of the two methods. At the same time, DeepDeWedge is simpler than this two-step +approach, as it does denoising and missing wedge reconstruction simultaneously +rather than sequentially. + +
+
+
+
+
+ + ♻ ☆ T-DEED: Temporal-Discriminability Enhancer Encoder-Decoder for Precise + Event Spotting in Sports Videos + + +
+ In this paper, we introduce T-DEED, a Temporal-Discriminability Enhancer +Encoder-Decoder for Precise Event Spotting in sports videos. T-DEED addresses +multiple challenges in the task, including the need for discriminability among +frame representations, high output temporal resolution to maintain prediction +precision, and the necessity to capture information at different temporal +scales to handle events with varying dynamics. It tackles these challenges +through its specifically designed architecture, featuring an encoder-decoder +for leveraging multiple temporal scales and achieving high output temporal +resolution, along with temporal modules designed to increase token +discriminability. Leveraging these characteristics, T-DEED achieves SOTA +performance on the FigureSkating and FineDiving datasets. Code is available at +https://github.com/arturxe2/T-DEED. + +
+
+
+
+
+ + ♻ ☆ Flattening the Parent Bias: Hierarchical Semantic Segmentation in the + Poincaré Ball + + +
+ Hierarchy is a natural representation of semantic taxonomies, including the +ones routinely used in image segmentation. Indeed, recent work on semantic +segmentation reports improved accuracy from supervised training leveraging +hierarchical label structures. Encouraged by these results, we revisit the +fundamental assumptions behind that work. We postulate and then empirically +verify that the reasons for the observed improvement in segmentation accuracy +may be entirely unrelated to the use of the semantic hierarchy. To demonstrate +this, we design a range of cross-domain experiments with a representative +hierarchical approach. We find that on the new testing domains, a flat +(non-hierarchical) segmentation network, in which the parents are inferred from +the children, has superior segmentation accuracy to the hierarchical approach +across the board. Complementing these findings and inspired by the intrinsic +properties of hyperbolic spaces, we study a more principled approach to +hierarchical segmentation using the Poincaré ball model. The hyperbolic +representation largely outperforms the previous (Euclidean) hierarchical +approach as well and is on par with our flat Euclidean baseline in terms of +segmentation accuracy. However, it additionally exhibits surprisingly strong +calibration quality of the parent nodes in the semantic hierarchy, especially +on the more challenging domains. Our combined analysis suggests that the +established practice of hierarchical segmentation may be limited to in-domain +settings, whereas flat classifiers generalize substantially better, especially +if they are modeled in the hyperbolic space. + +
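For reference, the distance function of the Poincaré ball model used in hyperbolic representation learning is the standard formula d(u, v) = arccosh(1 + 2·||u − v||² / ((1 − ||u||²)(1 − ||v||²))); the sketch below evaluates it and is generic background, not code from the paper.

```python
# Standard Poincaré-ball distance (curvature -1); generic reference code.
import torch

def poincare_distance(u, v, eps=1e-6):
    sq = ((u - v) ** 2).sum(-1)
    denom = (1 - (u ** 2).sum(-1)).clamp_min(eps) * (1 - (v ** 2).sum(-1)).clamp_min(eps)
    return torch.acosh(1 + 2 * sq / denom)

u = torch.tensor([0.1, 0.2])
v = torch.tensor([0.4, -0.3])
print(poincare_distance(u, v))  # distances grow rapidly near the ball boundary
```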
+
+
+
+
+ + ♻ ☆ Exploring Efficient Asymmetric Blind-Spots for Self-Supervised Denoising + in Real-World Scenarios CVPR 2024 + + +
+ Self-supervised denoising has attracted widespread attention due to its +ability to train without clean images. However, noise in real-world scenarios +is often spatially correlated, which causes many self-supervised algorithms +that assume pixel-wise independent noise to perform poorly. Recent works have +attempted to break noise correlation with downsampling or neighborhood masking. +However, denoising on downsampled subgraphs can lead to aliasing effects and +loss of details due to a lower sampling rate. Furthermore, the neighborhood +masking methods either come with high computational complexity or do not +consider local spatial preservation during inference. Through the analysis of +existing methods, we point out that the key to obtaining high-quality and +texture-rich results in real-world self-supervised denoising tasks is to train +at the original input resolution structure and use asymmetric operations during +training and inference. Based on this, we propose Asymmetric Tunable Blind-Spot +Network (AT-BSN), where the blind-spot size can be freely adjusted, thus better +balancing noise correlation suppression and image local spatial destruction +during training and inference. In addition, we regard the pre-trained AT-BSN as +a meta-teacher network capable of generating various teacher networks by +sampling different blind-spots. We propose a blind-spot based multi-teacher +distillation strategy to distill a lightweight network, significantly improving +performance. Experimental results on multiple datasets prove that our method +achieves state-of-the-art, and is superior to other self-supervised algorithms +in terms of computational overhead and visual effects. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Deep Learning for Satellite Image Time Series Analysis: A Review + + +
+ Earth observation (EO) satellite missions have been providing detailed images +about the state of the Earth and its land cover for over 50 years. Long term +missions, such as NASA's Landsat, Terra, and Aqua satellites, and more +recently, the ESA's Sentinel missions, record images of the entire world every +few days. Although single images provide point-in-time data, repeated images of +the same area, or satellite image time series (SITS) provide information about +the changing state of vegetation and land use. These SITS are useful for +modeling dynamic processes and seasonal changes such as plant phenology. They +have potential benefits for many aspects of land and natural resource +management, including applications in agricultural, forest, water, and disaster +management, urban planning, and mining. However, the resulting satellite image +time series (SITS) are complex, incorporating information from the temporal, +spatial, and spectral dimensions. Therefore, deep learning methods are often +deployed as they can analyze these complex relationships. This review presents +a summary of the state-of-the-art methods of modelling environmental, +agricultural, and other Earth observation variables from SITS data using deep +learning methods. We aim to provide a resource for remote sensing experts +interested in using deep learning techniques to enhance Earth observation +models with temporal information. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Is Medieval Distant Viewing Possible? : Extending and Enriching + Annotation of Legacy Image Collections using Visual Analytics + + +
+ Distant viewing approaches have typically used image datasets close to the +contemporary image data used to train machine learning models. Working with +images from other historical periods requires expert-annotated data, and the +quality of labels is crucial for the quality of results. Especially when +working with cultural heritage collections that contain myriad uncertainties, +annotating, or re-annotating, legacy data is an arduous task. In this +paper, we describe working with two pre-annotated sets of medieval manuscript +images that exhibit conflicting and overlapping metadata. Since a manual +reconciliation of the two legacy ontologies would be very expensive, we aim (1) +to create a more uniform set of descriptive labels to serve as a "bridge" in +the combined dataset, and (2) to establish a high-quality hierarchical +classification that can be used as a valuable input for subsequent supervised +machine learning. To achieve these goals, we developed visualization and +interaction mechanisms, enabling medievalists to combine, regularize and extend +the vocabulary used to describe these, and other cognate, image datasets. The +visual interfaces provide experts with an overview of relationships in the data, +going beyond the sum total of the metadata. Word and image embeddings, as well +as co-occurrences of labels across the datasets, enable batch re-annotation of +images, recommendation of label candidates, and support for composing a hierarchical +classification of labels. + +
+
+ comment: Revision after DSH Peer Review. Paper is now accepted at DSH +
+
+
+
+
+ + ♻ ☆ How NeRFs and 3D Gaussian Splatting are Reshaping SLAM: a Survey + + +
+ Over the past two decades, research in the field of Simultaneous Localization +and Mapping (SLAM) has undergone a significant evolution, highlighting its +critical role in enabling autonomous exploration of unknown environments. This +evolution ranges from hand-crafted methods, through the era of deep learning, +to more recent developments focused on Neural Radiance Fields (NeRFs) and 3D +Gaussian Splatting (3DGS) representations. Recognizing the growing body of +research and the absence of a comprehensive survey on the topic, this paper +aims to provide the first comprehensive overview of SLAM progress through the +lens of the latest advancements in radiance fields. It sheds light on the +background, evolutionary path, inherent strengths and limitations, and serves +as a fundamental reference to highlight the dynamic progress and specific +challenges. + +
+
+
+
+
+ + ♻ ☆ 3D Human Reconstruction in the Wild with Synthetic Data Using Generative + Models + + +
+ In this work, we show that synthetic data created by generative models is +complementary to computer graphics (CG) rendered data for achieving remarkable +generalization performance on diverse real-world scenes for 3D human pose and +shape estimation (HPS). Specifically, we propose an effective approach based on +recent diffusion models, termed HumanWild, which can effortlessly generate +human images and corresponding 3D mesh annotations. We first collect a +large-scale human-centric dataset with comprehensive annotations, e.g., text +captions and surface normal images. Then, we train a customized ControlNet +model upon this dataset to generate diverse human images and initial +ground-truth labels. At the core of this step is that we can easily obtain +numerous surface normal images from a 3D human parametric model, e.g., SMPL-X, +by rendering the 3D mesh onto the image plane. As there exists inevitable noise +in the initial labels, we then apply an off-the-shelf foundation segmentation +model, i.e., SAM, to filter negative data samples. Our data generation pipeline +is flexible and customizable to facilitate different real-world tasks, e.g., +ego-centric scenes and perspective-distortion scenes. The generated dataset +comprises 0.79M images with corresponding 3D annotations, covering versatile +viewpoints, scenes, and human identities. We train various HPS regressors on +top of the generated data and evaluate them on a wide range of benchmarks +(3DPW, RICH, EgoBody, AGORA, SSP-3D) to verify the effectiveness of the +generated data. By exclusively employing generative models, we generate +large-scale in-the-wild human images and high-quality annotations, eliminating +the need for real-world data collection. + +
+
+ comment: project page: https://yongtaoge.github.io/projects/humanwild +
+
+
+
+
+ + ♻ ☆ NRDF: Neural Riemannian Distance Fields for Learning Articulated Pose + Priors CVPR 2024 + + +
+ Faithfully modeling the space of articulations is a crucial task that allows +recovery and generation of realistic poses, and remains a notorious challenge. +To this end, we introduce Neural Riemannian Distance Fields (NRDFs), +data-driven priors modeling the space of plausible articulations, represented +as the zero-level-set of a neural field in a high-dimensional +product-quaternion space. To train NRDFs only on positive examples, we +introduce a new sampling algorithm, ensuring that the geodesic distances follow +a desired distribution, yielding a principled distance field learning paradigm. +We then devise a projection algorithm to map any random pose onto the level-set +by an adaptive-step Riemannian optimizer, adhering to the product manifold of +joint rotations at all times. NRDFs can compute the Riemannian gradient via +backpropagation and by mathematical analogy, are related to Riemannian flow +matching, a recent generative model. We conduct a comprehensive evaluation of +NRDF against other pose priors in various downstream tasks, i.e., pose +generation, image-based pose estimation, and solving inverse kinematics, +highlighting NRDF's superior performance. Besides humans, NRDF's versatility +extends to hand and animal poses, as it can effectively represent any +articulation. + +
+
+ comment: Accepted by CVPR 2024. Project page: + https://virtualhumans.mpi-inf.mpg.de/nrdf +
+
+
+
+
+ + ♻ ☆ ChangeMamba: Remote Sensing Change Detection with Spatio-Temporal State + Space Model + + +
+ Convolutional neural networks (CNN) and Transformers have made impressive
+progress in the field of remote sensing change detection (CD). However, both
+architectures have inherent shortcomings. Recently, the Mamba architecture,
+based on state space models, has shown remarkable performance in a series of
+natural language processing tasks, which can effectively compensate for the
+shortcomings of the above two architectures. In this paper, we explore for the
+first time the potential of the Mamba architecture for remote sensing CD tasks.
+We tailor the corresponding frameworks, called MambaBCD, MambaSCD, and
+MambaBDA, for binary change detection (BCD), semantic change detection (SCD),
+and building damage assessment (BDA), respectively. All three frameworks adopt
+the cutting-edge Visual Mamba architecture as the encoder, which allows full
+learning of global spatial contextual information from the input images. For
+the change decoder, which is available in all three architectures, we propose
+three spatio-temporal relationship modeling mechanisms, which can be naturally
+combined with the Mamba architecture and fully utilize its attributes to
+achieve spatio-temporal interaction of multi-temporal features, thereby
+obtaining accurate change information. On five benchmark datasets, our proposed
+frameworks outperform current CNN- and Transformer-based approaches without
+using any complex training strategies or tricks, fully demonstrating the
+potential of the Mamba architecture in CD tasks. Specifically, we obtained
+83.11%, 88.39% and 94.19% F1 scores on the three BCD datasets SYSU, LEVIR-CD+,
+and WHU-CD; on the SCD dataset SECOND, we obtained 24.11% SeK; and on the BDA
+dataset xBD, we obtained 81.41% overall F1 score. Further experiments show that
+our architecture is quite robust to degraded data. The source code will be
+available at https://github.com/ChenHongruixuan/MambaCD.
+
+
+
+
+
+ + ♻ ☆ RePoseDM: Recurrent Pose Alignment and Gradient Guidance for Pose Guided + Image Synthesis CVPR 2024 + + +
+ Pose-guided person image synthesis task requires re-rendering a reference +image, which should have a photorealistic appearance and flawless pose +transfer. Since person images are highly structured, existing approaches +require dense connections for complex deformations and occlusions because these +are generally handled through multi-level warping and masking in latent space. +The feature maps generated by convolutional neural networks do not have +equivariance, and hence multi-level warping is required to perform pose +alignment. Inspired by the ability of the diffusion model to generate +photorealistic images from the given conditional guidance, we propose recurrent +pose alignment to provide pose-aligned texture features as conditional +guidance. Due to the leakage of the source pose in conditional guidance, we +propose gradient guidance from pose interaction fields, which output the +distance from the valid pose manifold given a predicted pose as input. This +helps in learning plausible pose transfer trajectories that result in +photorealism and undistorted texture details. Extensive results on two +large-scale benchmarks and a user study demonstrate the ability of our proposed +approach to generate photorealistic pose transfer under challenging scenarios. +Additionally, we demonstrate the efficiency of gradient guidance in pose-guided +image generation on the HumanArt dataset with fine-tuned stable diffusion. + +
+
+ comment: Accepted at CVPR 2024 SyntaGen Workshop, 13 pages, 4 tables, 7 + figures +
+
+
+
+
+ + ♻ ☆ COTR: Compact Occupancy TRansformer for Vision-based 3D Occupancy + Prediction CVPR2024 + + +
+ The autonomous driving community has shown significant interest in 3D +occupancy prediction, driven by its exceptional geometric perception and +general object recognition capabilities. To achieve this, current works try to +construct a Tri-Perspective View (TPV) or Occupancy (OCC) representation +extending from the Bird-Eye-View perception. However, compressed views like TPV +representation lose 3D geometry information while raw and sparse OCC +representation requires heavy but redundant computational costs. To address the +above limitations, we propose Compact Occupancy TRansformer (COTR), with a +geometry-aware occupancy encoder and a semantic-aware group decoder to +reconstruct a compact 3D OCC representation. The occupancy encoder first +generates a compact geometrical OCC feature through efficient explicit-implicit +view transformation. Then, the occupancy decoder further enhances the semantic +discriminability of the compact OCC representation by a coarse-to-fine semantic +grouping strategy. Empirical experiments show that there are evident +performance gains across multiple baselines, e.g., COTR outperforms baselines +with a relative improvement of 8%-15%, demonstrating the superiority of our +method. + +
+
+ comment: CVPR2024. Code is available at https://github.com/NotACracker/COTR +
+
+
+
+
+ + ♻ ☆ IIDM: Inter and Intra-domain Mixing for Semi-supervised Domain + Adaptation in Semantic Segmentation + + +
+ Despite recent advances in semantic segmentation, an inevitable challenge is
+the performance degradation caused by the domain shift in real applications.
+The current dominant approach to this problem is unsupervised domain
+adaptation (UDA). However, the absence of labeled target data in UDA is overly
+restrictive and limits performance. To overcome this limitation, a more
+practical scenario called semi-supervised domain adaptation (SSDA) has been
+proposed. Existing SSDA methods are derived from the UDA paradigm and primarily
+focus on leveraging the unlabeled target data and source data. In this paper,
+we highlight the significance of exploiting the intra-domain information
+between the labeled target data and unlabeled target data. Instead of solely
+using the scarce labeled target data for supervision, we propose a novel SSDA
+framework that incorporates both Inter and Intra Domain Mixing (IIDM), where
+inter-domain mixing mitigates the source-target domain gap and intra-domain
+mixing enriches the available target domain information, so that the network
+can capture more domain-invariant features. We also explore different domain
+mixing strategies to better exploit the target domain information.
+Comprehensive experiments conducted on the GTA5 to Cityscapes and SYNTHIA to
+Cityscapes benchmarks demonstrate the effectiveness of IIDM, surpassing
+previous methods by a large margin.
+
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ WWW: A Unified Framework for Explaining What, Where and Why of Neural + Networks by Interpretation of Neuron Concepts CVPR 2024 + + +
+ Recent advancements in neural networks have showcased their remarkable
+capabilities across various domains. Despite these successes, the "black box"
+problem still remains. Addressing this, we propose a novel framework, WWW, that
+offers the 'what', 'where', and 'why' of neural network decisions in
+human-understandable terms. Specifically, WWW utilizes adaptive selection for
+concept discovery, employing adaptive cosine similarity and thresholding
+techniques to effectively explain 'what'. To address the 'where' and 'why', we
+propose a novel combination of neuron activation maps (NAMs) with Shapley
+values, generating localized concept maps and heatmaps for individual inputs.
+Furthermore, WWW introduces a method for predicting uncertainty, leveraging
+heatmap similarities to estimate 'how' reliable the prediction is. Experimental
+evaluations of WWW demonstrate superior performance in both quantitative and
+qualitative metrics, outperforming existing methods in interpretability. WWW
+provides a unified solution for explaining 'what', 'where', and 'why',
+introducing a method for localized explanations from global interpretations and
+offering a plug-and-play solution adaptable to various architectures.
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Samba: Semantic Segmentation of Remotely Sensed Images with State Space + Model + + +
+ High-resolution remotely sensed images pose a challenge for commonly used +semantic segmentation methods such as Convolutional Neural Network (CNN) and +Vision Transformer (ViT). CNN-based methods struggle with handling such +high-resolution images due to their limited receptive field, while ViT faces +challenges in handling long sequences. Inspired by Mamba, which adopts a State +Space Model (SSM) to efficiently capture global semantic information, we +propose a semantic segmentation framework for high-resolution remotely sensed +images, named Samba. Samba utilizes an encoder-decoder architecture, with Samba +blocks serving as the encoder for efficient multi-level semantic information +extraction, and UperNet functioning as the decoder. We evaluate Samba on the +LoveDA, ISPRS Vaihingen, and ISPRS Potsdam datasets, comparing its performance +against top-performing CNN and ViT methods. The results reveal that Samba +achieved unparalleled performance on commonly used remote sensing datasets for +semantic segmentation. Our proposed Samba demonstrates for the first time the +effectiveness of SSM in semantic segmentation of remotely sensed images, +setting a new benchmark in performance for Mamba-based techniques in this +specific application. The source code and baseline implementations are +available at https://github.com/zhuqinfeng1999/Samba. + +
+
+
+
+
+ + ♻ ☆ Driver Attention Tracking and Analysis + + +
+ We propose a novel method to estimate a driver's points-of-gaze using a pair
+of ordinary cameras mounted on the windshield and dashboard of a car. This is a
+challenging problem due to the dynamics of traffic environments with 3D scenes
+of unknown depths. This problem is further complicated by the volatile distance
+between the driver and the camera system. To tackle these challenges, we
+develop a novel convolutional network that simultaneously analyzes the image of
+the scene and the image of the driver's face. This network has a camera
+calibration module that can compute an embedding vector that represents the
+spatial configuration between the driver and the camera system. This
+calibration module improves the performance of the overall network, which can
+be jointly trained end to end.
+ We also address the lack of annotated data for training and evaluation by
+introducing a large-scale driving dataset with point-of-gaze annotations. This
+is an in situ dataset of real driving sessions in an urban city, containing
+synchronized images of the driving scene as well as the face and gaze of the
+driver. Experiments on this dataset show that the proposed method outperforms
+various baseline methods, achieving a mean prediction error of 29.69 pixels,
+which is relatively small compared to the 1280×720 resolution of the scene
+camera.
+
+
+
+
+
+ + ♻ ☆ SpikeNVS: Enhancing Novel View Synthesis from Blurry Images via Spike + Camera + + +
+ One of the most critical factors in achieving sharp Novel View Synthesis
+(NVS) using neural field methods like Neural Radiance Fields (NeRF) and 3D
+Gaussian Splatting (3DGS) is the quality of the training images. However,
+conventional RGB cameras are susceptible to motion blur. In contrast,
+neuromorphic cameras like event and spike cameras inherently capture more
+comprehensive temporal information, which can provide a sharp representation of
+the scene as additional training data. Recent methods have explored the
+integration of event cameras to improve the quality of NVS. These event-RGB
+approaches have some limitations, such as high training costs and the inability
+to work effectively in the background. Instead, our study introduces a new
+method that uses the spike camera to overcome these limitations. By considering
+texture reconstruction from spike streams as ground truth, we design the
+Texture from Spike (TfS) loss. Since the spike camera relies on temporal
+integration instead of the temporal differentiation used by event cameras, our
+proposed TfS loss maintains manageable training costs. It handles foreground
+objects and backgrounds simultaneously. We also provide a real-world dataset
+captured with our spike-RGB camera system to facilitate future research
+endeavors. We conduct extensive experiments using synthetic and real-world
+datasets to demonstrate that our design can enhance novel view synthesis across
+NeRF and 3DGS. The code and dataset will be made available for public access.
+
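+ The exact TfS formulation is not spelled out in the abstract; the sketch
+below is one plausible reading under stated assumptions: integrate the binary
+spike stream over a time window to obtain a texture estimate (spike cameras
+accumulate incoming light), then penalize the rendered view against it with an
+L1 term. The window length and the L1 choice are assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def texture_from_spikes(spikes):
+    """spikes: (T, H, W) binary spike stream from a spike camera.
+    Crude texture estimate: firing rate over the window (rate ~ radiance)."""
+    return spikes.float().mean(dim=0)          # (H, W) in [0, 1]
+
+def tfs_loss(rendered_gray, spikes):
+    """Illustrative Texture-from-Spike loss: L1 between the rendered
+    grayscale view and the spike-derived texture (both in [0, 1])."""
+    return F.l1_loss(rendered_gray, texture_from_spikes(spikes))
+
+# Example with random data: 200 time steps of 64x64 spikes
+spikes = torch.rand(200, 64, 64) < 0.3
+rendered = torch.rand(64, 64)
+print(tfs_loss(rendered, spikes))
+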
+
+
+
+
+ + ♻ ☆ Learning Object Permanence from Videos via Latent Imaginations + + +
+ While human infants exhibit knowledge about object permanence from two months +of age onwards, deep-learning approaches still largely fail to recognize +objects' continued existence. We introduce a slot-based autoregressive deep +learning system, the looped location and identity tracking model Loci-Looped, +which learns to adaptively fuse latent imaginations with pixel-space +observations into consistent latent object-specific what and where encodings +over time. The novel loop empowers Loci-Looped to learn the physical concepts +of object permanence, directional inertia, and object solidity through +observation alone. As a result, Loci-Looped tracks objects through occlusions, +anticipates their reappearance, and shows signs of surprise and internal +revisions when observing implausible object behavior. Notably, Loci-Looped +outperforms state-of-the-art baseline models in handling object occlusions and +temporary sensory interruptions while exhibiting more compositional, +interpretable internal activity patterns. Our work thus introduces the first +self-supervised interpretable learning model that learns about object +permanence directly from video data without supervision. + +
+
+
+
+
+ + ♻ ☆ VSCode: General Visual Salient and Camouflaged Object Detection with 2D + Prompt Learning CVPR2024 + + +
+ Salient object detection (SOD) and camouflaged object detection (COD) are
+related yet distinct binary mapping tasks. These tasks involve multiple
+modalities, sharing commonalities and unique cues. Existing research often
+employs intricate task-specific specialist models, potentially leading to
+redundancy and suboptimal results. We introduce VSCode, a generalist model with
+novel 2D prompt learning, to jointly address four SOD tasks and three COD
+tasks. We utilize VST as the foundation model and introduce 2D prompts within
+the encoder-decoder architecture to learn domain and task-specific knowledge on
+two separate dimensions. A prompt discrimination loss helps disentangle
+peculiarities to benefit model optimization. VSCode outperforms
+state-of-the-art methods across six tasks on 26 datasets and exhibits zero-shot
+generalization to unseen tasks by combining 2D prompts, such as RGB-D COD.
+Source code is available at https://github.com/Sssssuperior/VSCode.
+
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Extended Reality for Mental Health Evaluation -A Scoping Review + + +
+ Mental health disorders are the leading cause of health-related problems
+globally. It is projected that mental health disorders will be the leading
+cause of morbidity among adults as the incidence rates of anxiety and
+depression grow globally. Recently, extended reality (XR), a general term
+covering virtual reality (VR), augmented reality (AR) and mixed reality (MR),
+is paving a new way to deliver mental health care. In this paper, we conduct a
+scoping review on the development and application of XR in the area of mental
+disorders. We performed a scoping database search to identify the relevant
+studies indexed in Google Scholar, PubMed, and the ACM Digital Library. A
+search period between August 2016 and December 2023 was defined to select
+articles related to the usage of VR, AR, and MR in a mental health context. We
+identified a total of 85 studies from 27 countries across the globe. By
+performing data analysis, we found that most of the studies focused on
+developed countries such as the US (16.47%) and Germany (12.94%). None of the
+studies focused on African countries. The majority of the articles reported
+that XR techniques led to a significant reduction in symptoms of anxiety or
+depression. The largest share of studies was published in 2021 (31.76%, n =
+31). This could indicate that mental disorder intervention received greater
+attention when COVID-19 emerged. Most studies (n = 65) focused on a population
+between 18 and 65 years old; only a few studies focused on teenagers (n = 2).
+Also, more studies were done experimentally (n = 67, 78.82%) rather than by
+analytical and modeling approaches (n = 8, 9.41%). This shows that there is a
+rapid development of XR technology for mental health care. Furthermore, these
+studies showed that XR technology can effectively be used for evaluating mental
+disorders as well as, or better than, the conventional approaches.
+
+
+
+
+
+ + ♻ ☆ VST++: Efficient and Stronger Visual Saliency Transformer + + +
+ While previous CNN-based models have exhibited promising results for salient +object detection (SOD), their ability to explore global long-range dependencies +is restricted. Our previous work, the Visual Saliency Transformer (VST), +addressed this constraint from a transformer-based sequence-to-sequence +perspective, to unify RGB and RGB-D SOD. In VST, we developed a multi-task +transformer decoder that concurrently predicts saliency and boundary outcomes +in a pure transformer architecture. Moreover, we introduced a novel token +upsampling method called reverse T2T for predicting a high-resolution saliency +map effortlessly within transformer-based structures. Building upon the VST +model, we further propose an efficient and stronger VST version in this work, +i.e. VST++. To mitigate the computational costs of the VST model, we propose a +Select-Integrate Attention (SIA) module, partitioning foreground into +fine-grained segments and aggregating background information into a single +coarse-grained token. To incorporate 3D depth information with low cost, we +design a novel depth position encoding method tailored for depth maps. +Furthermore, we introduce a token-supervised prediction loss to provide +straightforward guidance for the task-related tokens. We evaluate our VST++ +model across various transformer-based backbones on RGB, RGB-D, and RGB-T SOD +benchmark datasets. Experimental results show that our model outperforms +existing methods while achieving a 25% reduction in computational costs without +significant performance compromise. The demonstrated strong ability for +generalization, enhanced performance, and heightened efficiency of our VST++ +model highlight its potential. + +
+
+
+
+
+ + ♻ ☆ Towards Reliable Medical Image Segmentation by utilizing Evidential + Calibrated Uncertainty + + +
+ Medical image segmentation is critical for disease diagnosis and treatment
+assessment. However, concerns regarding the reliability of segmentation regions
+persist among clinicians, mainly attributed to the absence of confidence
+assessment, robustness, and calibration to accuracy. To address this, we
+introduce DEviS, an easily implementable foundational model that seamlessly
+integrates into various medical image segmentation networks. DEviS not only
+enhances the calibration and robustness of baseline segmentation accuracy but
+also provides high-efficiency uncertainty estimation for reliable predictions.
+By leveraging subjective logic theory, we explicitly model probability and
+uncertainty for the problem of medical image segmentation. Here, the Dirichlet
+distribution parameterizes the distribution of probabilities for different
+classes of the segmentation results. To generate calibrated predictions and
+uncertainty, we develop a trainable calibrated uncertainty penalty.
+Furthermore, DEviS incorporates an uncertainty-aware filtering module, which
+utilizes the metric of uncertainty-calibrated error to filter reliable data
+within the dataset. We conducted validation studies to assess both the accuracy
+and robustness of DEviS segmentation, along with evaluating the efficiency and
+reliability of uncertainty estimation. These evaluations were performed using
+publicly available datasets including ISIC2018, LiTS2017, and BraTS2019.
+Additionally, two potential clinical trials are being conducted on the Johns
+Hopkins OCT, Duke-OCT-DME, and FIVES datasets to demonstrate its efficacy in
+filtering high-quality or out-of-distribution data. Our code has been released
+at https://github.com/Cocofeat/DEviS.
+
+
+ comment: 34 pages, 11 figures +
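+ For readers unfamiliar with subjective logic, the sketch below shows the
+generic evidential recipe the abstract builds on, not the DEviS code itself:
+the network outputs non-negative evidence per class, the Dirichlet parameters
+are evidence + 1, and per-pixel uncertainty is the class count divided by the
+Dirichlet strength. Using softplus as the evidence function is an assumption.
+
+import torch
+import torch.nn.functional as F
+
+def dirichlet_from_logits(logits):
+    """logits: (B, K, H, W) raw segmentation scores."""
+    evidence = F.softplus(logits)               # non-negative evidence
+    alpha = evidence + 1.0                      # Dirichlet parameters
+    strength = alpha.sum(dim=1, keepdim=True)   # S = sum_k alpha_k
+    prob = alpha / strength                     # expected class probabilities
+    uncertainty = logits.shape[1] / strength    # u = K / S, in (0, 1]
+    return prob, uncertainty
+
+# Example: 4 classes on a batch of two 128x128 images
+prob, u = dirichlet_from_logits(torch.randn(2, 4, 128, 128))
+print(prob.shape, u.shape)                      # (2,4,128,128) (2,1,128,128)
+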
+
+
+
+
+ + ♻ ☆ Analyzing the Internals of Neural Radiance Fields CVPR + + +
+ Modern Neural Radiance Fields (NeRFs) learn a mapping from position to +volumetric density leveraging proposal network samplers. In contrast to the +coarse-to-fine sampling approach with two NeRFs, this offers significant +potential for acceleration using lower network capacity. Given that NeRFs +utilize most of their network capacity to estimate radiance, they could store +valuable density information in their parameters or their deep features. To +investigate this proposition, we take one step back and analyze large, trained +ReLU-MLPs used in coarse-to-fine sampling. Building on our novel activation +visualization method, we find that trained NeRFs, Mip-NeRFs and proposal +network samplers map samples with high density to local minima along a ray in +activation feature space. We show how these large MLPs can be accelerated by +transforming intermediate activations to a weight estimate, without any +modifications to the training protocol or the network architecture. With our +approach, we can reduce the computational requirements of trained NeRFs by up +to 50% with only a slight hit in rendering quality. Extensive experimental +evaluation on a variety of datasets and architectures demonstrates the +effectiveness of our approach. Consequently, our methodology provides valuable +insight into the inner workings of NeRFs. + +
+
+ comment: Accepted to CVPRW'24! Project Page: + https://r4dl.github.io/nerfinternals/ +
+
+
+
+
+ + ♻ ☆ S^2MVTC: a Simple yet Efficient Scalable Multi-View Tensor Clustering CVPR2024 + + +
+ Anchor-based large-scale multi-view clustering has attracted considerable
+attention for its effectiveness in handling massive datasets. However, current
+methods mainly seek the consensus embedding feature for clustering by exploring
+global correlations between anchor graphs or projection matrices. In this
+paper, we propose a simple yet efficient scalable multi-view tensor clustering
+(S^2MVTC) approach, where our focus is on learning correlations of embedding
+features within and across views. Specifically, we first construct the
+embedding feature tensor by stacking the embedding features of different views
+into a tensor and rotating it. Additionally, we build a novel tensor
+low-frequency approximation (TLFA) operator, which incorporates graph
+similarity into embedding feature learning, efficiently achieving smooth
+representation of embedding features within different views. Furthermore,
+consensus constraints are applied to embedding features to ensure inter-view
+semantic consistency. Experimental results on six large-scale multi-view
+datasets demonstrate that S^2MVTC significantly outperforms state-of-the-art
+algorithms in terms of clustering performance and CPU execution time,
+especially when handling massive data. The code of S^2MVTC is publicly
+available at https://github.com/longzhen520/S2MVTC.
+
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Multi-Label Continual Learning for the Medical Domain: A Novel Benchmark + + +
+ Multi-label image classification in dynamic environments is a problem that +poses significant challenges. Previous studies have primarily focused on +scenarios such as Domain Incremental Learning and Class Incremental Learning, +which do not fully capture the complexity of real-world applications. In this +paper, we study the problem of classification of medical imaging in the +scenario termed New Instances and New Classes, which combines the challenges of +both new class arrivals and domain shifts in a single framework. Unlike +traditional scenarios, it reflects the realistic nature of CL in domains such +as medical imaging, where updates may introduce both new classes and changes in +domain characteristics. To address the unique challenges posed by this complex +scenario, we introduce a novel approach called Pseudo-Label Replay. This method +aims to mitigate forgetting while adapting to new classes and domain shifts by +combining the advantages of the Replay and Pseudo-Label methods and solving +their limitations in the proposed scenario. We evaluate our proposed approach +on a challenging benchmark consisting of two datasets, seven tasks, and +nineteen classes, modeling a realistic Continual Learning scenario. Our +experimental findings demonstrate the effectiveness of Pseudo-Label Replay in +addressing the challenges posed by the complex scenario proposed. Our method +surpasses existing approaches, exhibiting superior performance while showing +minimal forgetting. + +
+
+
+
+
+ + ♻ ☆ FloCoDe: Unbiased Dynamic Scene Graph Generation with Temporal + Consistency and Correlation Debiasing CVPR 2024 + + +
+ Dynamic scene graph generation (SGG) from videos requires not only a
+comprehensive understanding of objects across scenes but also a method to
+capture the temporal motions and interactions with different objects. Moreover,
+the long-tailed distribution of visual relationships is a crucial bottleneck
+for most dynamic SGG methods. This is because many of them focus on capturing
+spatio-temporal context using complex architectures, leading to the generation
+of biased scene graphs. To address these challenges, we propose FloCoDe:
+Flow-aware Temporal Consistency and Correlation Debiasing with uncertainty
+attenuation for unbiased dynamic scene graphs. FloCoDe employs feature warping
+using flow to detect temporally consistent objects across frames. To address
+the long-tail issue of visual relationships, we propose correlation debiasing
+and a label correlation-based loss to learn unbiased relation representations
+for long-tailed classes. Specifically, we propose to incorporate label
+correlations using contrastive loss to capture commonly co-occurring relations,
+which aids in learning robust representations for long-tailed classes. Further,
+we adopt the uncertainty attenuation-based classifier framework to handle noisy
+annotations in the SGG data. Extensive experimental evaluation shows a
+performance gain as high as 4.1%, demonstrating the superiority of generating
+more unbiased scene graphs.
+
+
+ comment: Accepted at CVPR 2024 SG2RL, 11 pages, 5 tables, 4 figures +
+
+
+
+
+ + ♻ ☆ Test-Time Zero-Shot Temporal Action Localization CVPR 2024 + + +
+ Zero-Shot Temporal Action Localization (ZS-TAL) seeks to identify and locate +actions in untrimmed videos unseen during training. Existing ZS-TAL methods +involve fine-tuning a model on a large amount of annotated training data. While +effective, training-based ZS-TAL approaches assume the availability of labeled +data for supervised learning, which can be impractical in some applications. +Furthermore, the training process naturally induces a domain bias into the +learned model, which may adversely affect the model's generalization ability to +arbitrary videos. These considerations prompt us to approach the ZS-TAL problem +from a radically novel perspective, relaxing the requirement for training data. +To this aim, we introduce a novel method that performs Test-Time adaptation for +Temporal Action Localization (T3AL). In a nutshell, T3AL adapts a pre-trained +Vision and Language Model (VLM). T3AL operates in three steps. First, a +video-level pseudo-label of the action category is computed by aggregating +information from the entire video. Then, action localization is performed +adopting a novel procedure inspired by self-supervised learning. Finally, +frame-level textual descriptions extracted with a state-of-the-art captioning +model are employed for refining the action region proposals. We validate the +effectiveness of T3AL by conducting experiments on the THUMOS14 and the +ActivityNet-v1.3 datasets. Our results demonstrate that T3AL significantly +outperforms zero-shot baselines based on state-of-the-art VLMs, confirming the +benefit of a test-time adaptation approach. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Finding Regions of Interest in Whole Slide Images Using Multiple + Instance Learning + + +
+ Whole Slide Images (WSI), obtained by high-resolution digital scanning of
+microscope slides at multiple scales, are the cornerstone of modern Digital
+Pathology. However, they represent a particular challenge to
+AI-based/AI-mediated analysis because pathology labeling is typically done at
+slide-level, instead of tile-level. It is not just that medical diagnostics are
+recorded at the specimen level; the detection of oncogene mutation is also
+experimentally obtained, and recorded by initiatives like The Cancer Genome
+Atlas (TCGA), at the slide level. This configures a dual challenge: a)
+accurately predicting the overall cancer phenotype and b) finding out what
+cellular morphologies are associated with it at the tile level. To address
+these challenges, a weakly supervised Multiple Instance Learning (MIL) approach
+was explored for two prevalent cancer types, Invasive Breast Carcinoma
+(TCGA-BRCA) and Lung Squamous Cell Carcinoma (TCGA-LUSC). This approach was
+explored for tumor detection at low magnification levels and TP53 mutations at
+various levels. Our results show that a novel additive implementation of MIL
+matched the performance of the reference implementation (AUC 0.96), and was
+only slightly outperformed by Attention MIL (AUC 0.97). More interestingly from
+the perspective of the molecular pathologist, these different AI architectures
+identify distinct sensitivities to morphological features (through the
+detection of Regions of Interest, RoI) at different magnification levels.
+Tellingly, TP53 mutation was most sensitive to features at the higher
+magnification levels where cellular morphology is resolved.
+
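+ Attention MIL, the strongest baseline reported above, can be written in a few
+lines; the sketch below follows the standard attention-pooling formulation over
+tile embeddings, with the embedding and hidden sizes chosen arbitrarily. The
+attention weights double as tile-level Region-of-Interest scores.
+
+import torch
+import torch.nn as nn
+
+class AttentionMIL(nn.Module):
+    """Slide-level classifier over a bag of tile embeddings (N, D)."""
+    def __init__(self, d=512, hidden=128, n_classes=2):
+        super().__init__()
+        self.attn = nn.Sequential(nn.Linear(d, hidden), nn.Tanh(),
+                                  nn.Linear(hidden, 1))
+        self.head = nn.Linear(d, n_classes)
+
+    def forward(self, tiles):                        # tiles: (N, D), one slide
+        a = torch.softmax(self.attn(tiles), dim=0)   # (N, 1) tile weights
+        slide = (a * tiles).sum(dim=0)               # weighted bag embedding
+        return self.head(slide), a                   # slide logits, RoI scores
+
+logits, weights = AttentionMIL()(torch.randn(1000, 512))
+print(logits.shape, weights.shape)                   # (2,) and (1000, 1)
+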
+
+
+
+
+ + ♻ ☆ Hierarchical Invariance for Robust and Interpretable Vision Tasks at + Larger Scales + + +
+ Developing robust and interpretable vision systems is a crucial step towards +trustworthy artificial intelligence. In this regard, a promising paradigm +considers embedding task-required invariant structures, e.g., geometric +invariance, in the fundamental image representation. However, such invariant +representations typically exhibit limited discriminability, limiting their +applications in larger-scale trustworthy vision tasks. For this open problem, +we conduct a systematic investigation of hierarchical invariance, exploring +this topic from theoretical, practical, and application perspectives. At the +theoretical level, we show how to construct over-complete invariants with a +Convolutional Neural Networks (CNN)-like hierarchical architecture yet in a +fully interpretable manner. The general blueprint, specific definitions, +invariant properties, and numerical implementations are provided. At the +practical level, we discuss how to customize this theoretical framework into a +given task. With the over-completeness, discriminative features w.r.t. the task +can be adaptively formed in a Neural Architecture Search (NAS)-like manner. We +demonstrate the above arguments with accuracy, invariance, and efficiency +results on texture, digit, and parasite classification experiments. +Furthermore, at the application level, our representations are explored in +real-world forensics tasks on adversarial perturbations and Artificial +Intelligence Generated Content (AIGC). Such applications reveal that the +proposed strategy not only realizes the theoretically promised invariance, but +also exhibits competitive discriminability even in the era of deep learning. +For robust and interpretable vision tasks at larger scales, hierarchical +invariant representation can be considered as an effective alternative to +traditional CNN and invariants. + +
+
+
+
+
+ + ♻ ☆ CoBra: Complementary Branch Fusing Class and Semantic Knowledge for + Robust Weakly Supervised Semantic Segmentation + + +
+ Leveraging semantically precise pseudo masks derived from image-level class +knowledge for segmentation, namely image-level Weakly Supervised Semantic +Segmentation (WSSS), still remains challenging. While Class Activation Maps +(CAMs) using CNNs have steadily been contributing to the success of WSSS, the +resulting activation maps often narrowly focus on class-specific parts (e.g., +only face of human). On the other hand, recent works based on vision +transformers (ViT) have shown promising results based on their self-attention +mechanism to capture the semantic parts but fail in capturing complete +class-specific details (e.g., entire body parts of human but also with a dog +nearby). In this work, we propose Complementary Branch (CoBra), a novel dual +branch framework consisting of two distinct architectures which provide +valuable complementary knowledge of class (from CNN) and semantic (from ViT) to +each branch. In particular, we learn Class-Aware Projection (CAP) for the CNN +branch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly +fuse their complementary knowledge and facilitate a new type of extra +patch-level supervision. Our model, through CoBra, fuses CNN and ViT's +complementary outputs to create robust pseudo masks that integrate both class +and semantic information effectively. Extensive experiments qualitatively and +quantitatively investigate how CNN and ViT complement each other on the PASCAL +VOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not +only the masks generated by our model, but also the segmentation results +derived from utilizing these masks as pseudo labels. + +
+
+
+
+
+ + ♻ ☆ MV-Adapter: Multimodal Video Transfer Learning for Video Text Retrieval + + +
+ State-of-the-art video-text retrieval (VTR) methods typically involve fully
+fine-tuning a pre-trained model (e.g., CLIP) on specific datasets. However,
+this can result in significant storage costs in practical applications as a
+separate model per task must be stored. To address this issue, we present our
+pioneering work that enables parameter-efficient VTR using a pre-trained model,
+with only a small number of tunable parameters during training. Towards this
+goal, we propose a new method dubbed Multimodal Video Adapter (MV-Adapter) for
+efficiently transferring the knowledge in the pre-trained CLIP from image-text
+to video-text. Specifically, MV-Adapter utilizes bottleneck structures in both
+video and text branches, along with two novel components. The first is a
+Temporal Adaptation Module that is incorporated in the video branch to
+introduce global and local temporal contexts. We also train weight calibrations
+to adjust to dynamic variations across frames. The second is Cross Modality
+Tying that generates weights for the video/text branches through sharing
+cross-modality factors, for better alignment between modalities. Thanks to the
+above innovations, MV-Adapter can achieve comparable or better performance than
+standard full fine-tuning with negligible parameter overhead. Notably,
+MV-Adapter consistently outperforms various competing methods in V2T/T2V tasks
+with large margins on five widely used VTR benchmarks (MSR-VTT, MSVD, LSMDC,
+DiDemo, and ActivityNet).
+
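+ A bottleneck adapter of the kind MV-Adapter builds on can be sketched
+generically as below (not the released MV-Adapter code): a down-projection,
+nonlinearity, up-projection, and residual connection, so only a small number of
+parameters are trained while the CLIP backbone stays frozen. Dimensions are
+illustrative.
+
+import torch
+import torch.nn as nn
+
+class BottleneckAdapter(nn.Module):
+    """Generic parameter-efficient adapter inserted after a frozen block."""
+    def __init__(self, dim=768, bottleneck=64):
+        super().__init__()
+        self.down = nn.Linear(dim, bottleneck)
+        self.up = nn.Linear(bottleneck, dim)
+        self.act = nn.GELU()
+        nn.init.zeros_(self.up.weight)   # start as an identity mapping
+        nn.init.zeros_(self.up.bias)
+
+    def forward(self, x):                # x: (batch, tokens, dim)
+        return x + self.up(self.act(self.down(x)))
+
+x = torch.randn(2, 197, 768)
+print(BottleneckAdapter()(x).shape)      # torch.Size([2, 197, 768])
+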
+
+
+
+
+ + ♻ ☆ Diff-Plugin: Revitalizing Details for Diffusion-based Low-level Tasks CVPR2024 + + +
+ Diffusion models trained on large-scale datasets have achieved remarkable +progress in image synthesis. However, due to the randomness in the diffusion +process, they often struggle with handling diverse low-level tasks that require +details preservation. To overcome this limitation, we present a new Diff-Plugin +framework to enable a single pre-trained diffusion model to generate +high-fidelity results across a variety of low-level tasks. Specifically, we +first propose a lightweight Task-Plugin module with a dual branch design to +provide task-specific priors, guiding the diffusion process in preserving image +content. We then propose a Plugin-Selector that can automatically select +different Task-Plugins based on the text instruction, allowing users to edit +images by indicating multiple low-level tasks with natural language. We conduct +extensive experiments on 8 low-level vision tasks. The results demonstrate the +superiority of Diff-Plugin over existing methods, particularly in real-world +scenarios. Our ablations further validate that Diff-Plugin is stable, +schedulable, and supports robust training across different dataset sizes. + +
+
+ comment: Accepted to CVPR2024. Replaced some celebrity images to avoid + copyright disputes +
+
+
+
+
+ + ♻ ☆ HPNet: Dynamic Trajectory Forecasting with Historical Prediction + Attention CVPR2024 + + +
+ Predicting the trajectories of road agents is essential for autonomous
+driving systems. The recent mainstream methods follow a static paradigm, which
+predicts the future trajectory by using a fixed duration of historical frames.
+These methods make the predictions independently even at adjacent time steps,
+which leads to potential instability and temporal inconsistency. As successive
+time steps have largely overlapping historical frames, their forecasting should
+have intrinsic correlation: overlapping predicted trajectories should be
+consistent, or be different but share the same motion goal depending on the
+road situation. Motivated by this, in this work, we introduce HPNet, a novel
+dynamic trajectory forecasting method. Aiming for stable and accurate
+trajectory forecasting, our method leverages not only historical frames
+including maps and agent states, but also historical predictions. Specifically,
+we design a new Historical Prediction Attention module to automatically encode
+the dynamic relationship between successive predictions. Besides, it also
+extends the attention range beyond the currently visible window, benefiting
+from the use of historical predictions. The proposed Historical Prediction
+Attention together with the Agent Attention and Mode Attention is further
+formulated as the Triple Factorized Attention module, serving as the core
+design of HPNet. Experiments on the Argoverse and INTERACTION datasets show
+that HPNet achieves state-of-the-art performance, and generates accurate and
+stable future trajectories. Our code is available at
+https://github.com/XiaolongTang23/HPNet.
+
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ MIPS at SemEval-2024 Task 3: Multimodal Emotion-Cause Pair Extraction in + Conversations with Multimodal Language Models SemEval '24 + + +
+ This paper presents our winning submission to Subtask 2 of SemEval 2024 Task +3 on multimodal emotion cause analysis in conversations. We propose a novel +Multimodal Emotion Recognition and Multimodal Emotion Cause Extraction +(MER-MCE) framework that integrates text, audio, and visual modalities using +specialized emotion encoders. Our approach sets itself apart from +top-performing teams by leveraging modality-specific features for enhanced +emotion understanding and causality inference. Experimental evaluation +demonstrates the advantages of our multimodal approach, with our submission +achieving a competitive weighted F1 score of 0.3435, ranking third with a +margin of only 0.0339 behind the 1st team and 0.0025 behind the 2nd team. +Project: https://github.com/MIPS-COLT/MER-MCE.git + +
+
+ comment: Ranked 3rd in SemEval '24 Task 3 with F1 of 0.3435, close to 1st & + 2nd by 0.0339 & 0.0025 +
+
+
+
+
+ + ♻ ☆ DriveDreamer-2: LLM-Enhanced World Models for Diverse Driving Video + Generation + + +
+ World models have demonstrated superiority in autonomous driving,
+particularly in the generation of multi-view driving videos. However,
+significant challenges still exist in generating customized driving videos. In
+this paper, we propose DriveDreamer-2, which builds upon the framework of
+DriveDreamer and incorporates a Large Language Model (LLM) to generate
+user-defined driving videos. Specifically, an LLM interface is initially
+incorporated to convert a user's query into agent trajectories. Subsequently,
+an HDMap, adhering to traffic regulations, is generated based on the
+trajectories. Ultimately, we propose the Unified Multi-View Model to enhance
+temporal and spatial coherence in the generated driving videos. DriveDreamer-2
+is the first world model to generate customized driving videos; it can generate
+uncommon driving videos (e.g., vehicles abruptly cutting in) in a user-friendly
+manner. Besides, experimental results demonstrate that the generated videos
+enhance the training of driving perception methods (e.g., 3D detection and
+tracking). Furthermore, the video generation quality of DriveDreamer-2
+surpasses that of other state-of-the-art methods, showcasing FID and FVD scores
+of 11.2 and 55.7, representing relative improvements of 30% and 50%.
+
+
+ comment: Project Page: https://drivedreamer2.github.io +
+
+
+
+
+ + ♻ ☆ Deep Multi-Threshold Spiking-UNet for Image Processing + + +
+ U-Net, known for its simple yet efficient architecture, is widely utilized
+for image processing tasks and is particularly suitable for deployment on
+neuromorphic chips. This paper introduces the novel concept of Spiking-UNet for
+image processing, which combines the power of Spiking Neural Networks (SNNs)
+with the U-Net architecture. To achieve an efficient Spiking-UNet, we face two
+primary challenges: ensuring high-fidelity information propagation through the
+network via spikes and formulating an effective training strategy. To address
+the issue of information loss, we introduce multi-threshold spiking neurons,
+which improve the efficiency of information transmission within the
+Spiking-UNet. For the training strategy, we adopt a conversion and fine-tuning
+pipeline that leverages pre-trained U-Net models. During the conversion
+process, significant variability in data distribution across different parts is
+observed when utilizing skip connections. Therefore, we propose a
+connection-wise normalization method to prevent inaccurate firing rates.
+Furthermore, we adopt a flow-based training method to fine-tune the converted
+models, reducing time steps while preserving performance. Experimental results
+show that, on image segmentation and denoising, our Spiking-UNet achieves
+comparable performance to its non-spiking counterpart, surpassing existing SNN
+methods. Compared with the converted Spiking-UNet without fine-tuning, our
+Spiking-UNet reduces inference time by approximately 90%. This research
+broadens the application scope of SNNs in image processing and is expected to
+inspire further exploration in the field of neuromorphic engineering. The code
+for our Spiking-UNet implementation is available at
+https://github.com/SNNresearch/Spiking-UNet.
+
+
+ comment: Accepted in NeuroComputing +
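+ The multi-threshold neuron is not specified in detail in the abstract; the
+sketch below is one plausible reading under stated assumptions: the membrane
+potential is compared against several thresholds, the neuron emits a graded
+spike equal to the largest threshold it crosses, and the membrane is reset by
+subtraction.
+
+import torch
+import torch.nn as nn
+
+class MultiThresholdIF(nn.Module):
+    """Illustrative integrate-and-fire neuron with several firing thresholds,
+    emitting the largest crossed threshold instead of a single 0/1 spike."""
+    def __init__(self, thresholds=(0.5, 1.0, 2.0)):
+        super().__init__()
+        self.register_buffer("thr", torch.tensor(sorted(thresholds)))
+        self.mem = None
+
+    def forward(self, x):                          # x: input current
+        if self.mem is None or self.mem.shape != x.shape:
+            self.mem = torch.zeros_like(x)
+        self.mem = self.mem + x
+        crossed = (self.mem.unsqueeze(-1) >= self.thr).float()
+        spike = (crossed * self.thr).max(dim=-1).values  # 0 if none crossed
+        self.mem = self.mem - spike                # soft reset by subtraction
+        return spike
+
+neuron = MultiThresholdIF()
+outputs = [neuron(torch.rand(1, 8, 8)) for _ in range(4)]  # 4 time steps
+print(outputs[0].shape)                            # torch.Size([1, 8, 8])
+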
+
+
+
+
+ + ♻ ☆ GEM3D: GEnerative Medial Abstractions for 3D Shape Synthesis SIGGRAPH 2024 + + +
+ We introduce GEM3D -- a new deep, topology-aware generative model of 3D +shapes. The key ingredient of our method is a neural skeleton-based +representation encoding information on both shape topology and geometry. +Through a denoising diffusion probabilistic model, our method first generates +skeleton-based representations following the Medial Axis Transform (MAT), then +generates surfaces through a skeleton-driven neural implicit formulation. The +neural implicit takes into account the topological and geometric information +stored in the generated skeleton representations to yield surfaces that are +more topologically and geometrically accurate compared to previous neural field +formulations. We discuss applications of our method in shape synthesis and +point cloud reconstruction tasks, and evaluate our method both qualitatively +and quantitatively. We demonstrate significantly more faithful surface +reconstruction and diverse shape generation results compared to the +state-of-the-art, also involving challenging scenarios of reconstructing and +synthesizing structurally complex, high-genus shape surfaces from Thingi10K and +ShapeNet. + +
+
+ comment: Webpage: https://lodurality.github.io/GEM3D/ -- Cond. accept. to + SIGGRAPH 2024 (conf. track) -- Changes (based on reviews): changed style to + sigconf; rearranged figures for readability; added missing citations; fixed + misaligned centers in Fig. 3; added failure cases (Fig. 10); rewrote + discussion; added categories averages to Tab. 8; added Tab. 10 with model + capacities +
+
+
+
+
+ + ♻ ☆ Fourier Prompt Tuning for Modality-Incomplete Scene Segmentation + + +
+ Integrating information from multiple modalities enhances the robustness of
+scene perception systems in autonomous vehicles, providing a more comprehensive
+and reliable sensory framework. However, the modality incompleteness in
+multi-modal segmentation remains under-explored. In this work, we establish a
+task called Modality-Incomplete Scene Segmentation (MISS), which encompasses
+both system-level modality absence and sensor-level modality errors. To avoid
+the predominant modality reliance in multi-modal fusion, we introduce a
+Missing-aware Modal Switch (MMS) strategy to proactively manage missing
+modalities during training. Utilizing bit-level batch-wise sampling enhances
+the model's performance in both complete and incomplete testing scenarios.
+Furthermore, we introduce the Fourier Prompt Tuning (FPT) method to incorporate
+representative spectral information into a limited number of learnable prompts
+that maintain robustness against all MISS scenarios, achieving an effect akin
+to fine-tuning but with far fewer tunable parameters (1.1%). Extensive
+experiments prove the efficacy of our proposed approach, showcasing an
+improvement of 5.84% mIoU over the prior state-of-the-art parameter-efficient
+methods when modalities are missing. The source code is publicly available at
+https://github.com/RuipingL/MISS.
+
+
+ comment: Accepted to IEEE IV 2024. The source code is publicly available at + https://github.com/RuipingL/MISS +
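+ The spectral-prompt idea can be pictured roughly as follows (an
+interpretation, not the released FPT code): take the 2D FFT of a feature map,
+keep a small low-frequency crop as a compact spectral summary, and project it
+onto a handful of learnable prompt tokens. The crop size and projection are
+assumptions.
+
+import torch
+import torch.nn as nn
+
+class FourierPrompt(nn.Module):
+    """Illustrative spectral prompt: low-frequency FFT crop -> prompt tokens."""
+    def __init__(self, channels=256, n_prompts=8, crop=4):
+        super().__init__()
+        self.crop, self.n_prompts, self.channels = crop, n_prompts, channels
+        self.proj = nn.Linear(channels * crop * crop, n_prompts * channels)
+        self.prompts = nn.Parameter(torch.zeros(n_prompts, channels))
+
+    def forward(self, feat):                         # feat: (B, C, H, W)
+        spec = torch.fft.fft2(feat, norm="ortho")
+        low = spec[..., : self.crop, : self.crop]    # low-frequency corner
+        amp = low.abs().flatten(1)                   # (B, C*crop*crop)
+        delta = self.proj(amp).view(-1, self.n_prompts, self.channels)
+        return self.prompts + delta                  # (B, n_prompts, C) tokens
+
+tokens = FourierPrompt()(torch.randn(2, 256, 32, 32))
+print(tokens.shape)                                  # torch.Size([2, 8, 256])
+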
+
+
+
+
+ + ♻ ☆ Tensor Decomposition Based Attention Module for Spiking Neural Networks + + +
+ The attention mechanism has been proven to be an effective way to improve
+spiking neural networks (SNNs). However, based on the fact that the current SNN
+input data flow is split into tensors to process on GPUs, none of the previous
+works consider the properties of tensors to implement an attention module. This
+inspires us to rethink current SNNs from the perspective of tensor-relevant
+theories. Using tensor decomposition, we design the projected full attention
+(PFA) module, which demonstrates excellent results with linearly growing
+parameters. Specifically, PFA is composed of the linear projection of spike
+tensor (LPST) module and the attention map composing (AMC) module. In LPST, we
+start by compressing the original spike tensor into three projected tensors
+using a single property-preserving strategy with learnable parameters for each
+dimension. Then, in AMC, we exploit the inverse procedure of the tensor
+decomposition process to combine the three tensors into the attention map using
+a so-called connecting factor. To validate the effectiveness of the proposed
+PFA module, we integrate it into the widely used VGG and ResNet architectures
+for classification tasks. Our method achieves state-of-the-art performance on
+both static and dynamic benchmark datasets, surpassing the existing SNN models
+with Transformer-based and CNN-based backbones.
+
+
+ comment: Accepted by Knowledge-Based Systems +
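+ One way to picture an attention map assembled from per-dimension projections
+is the rank-1 (CP-style) sketch below; it is a rough illustration of the idea,
+not the paper's PFA module, and the sigmoid gating and pooling choices are
+assumptions.
+
+import torch
+import torch.nn as nn
+
+class Rank1Attention(nn.Module):
+    """One learnable projection per dimension (C, H, W) gives three vectors
+    whose outer product reconstructs a full-size attention map."""
+    def __init__(self, c, h, w):
+        super().__init__()
+        self.pc = nn.Linear(h * w, 1)    # spatial dims -> channel vector
+        self.ph = nn.Linear(c * w, 1)    # C, W -> height vector
+        self.pw = nn.Linear(c * h, 1)    # C, H -> width vector
+
+    def forward(self, x):                # x: (B, C, H, W)
+        vc = torch.sigmoid(self.pc(x.flatten(2)).squeeze(-1))
+        vh = torch.sigmoid(self.ph(x.permute(0, 2, 1, 3).flatten(2)).squeeze(-1))
+        vw = torch.sigmoid(self.pw(x.permute(0, 3, 1, 2).flatten(2)).squeeze(-1))
+        attn = torch.einsum("bc,bh,bw->bchw", vc, vh, vw)  # outer product
+        return x * attn
+
+y = Rank1Attention(64, 16, 16)(torch.randn(2, 64, 16, 16))
+print(y.shape)                           # torch.Size([2, 64, 16, 16])
+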
+
+
+
+
+ + ♻ ☆ TC4D: Trajectory-Conditioned Text-to-4D Generation + + +
+ Recent techniques for text-to-4D generation synthesize dynamic 3D scenes
+using supervision from pre-trained text-to-video models. However, existing
+representations for motion, such as deformation models or time-dependent neural
+representations, are limited in the amount of motion they can generate: they
+cannot synthesize motion extending far beyond the bounding box used for volume
+rendering. The lack of a more flexible motion model contributes to the gap in
+realism between 4D generation methods and recent, near-photorealistic video
+generation models. Here, we propose TC4D: trajectory-conditioned text-to-4D
+generation, which factors motion into global and local components. We represent
+the global motion of a scene's bounding box using rigid transformation along a
+trajectory parameterized by a spline. We learn local deformations that conform
+to the global trajectory using supervision from a text-to-video model. Our
+approach enables the synthesis of scenes animated along arbitrary trajectories,
+compositional scene generation, and significant improvements to the realism and
+amount of generated motion, which we evaluate qualitatively and through a user
+study. Video results can be viewed on our website:
+https://sherwinbahmani.github.io/tc4d.
+
+
+ comment: Project Page: https://sherwinbahmani.github.io/tc4d +
+
+
+
+
+ + ♻ ☆ Exploring Effective Priors and Efficient Models for Weakly-Supervised + Change Detection + + +
+ Weakly-supervised change detection (WSCD) aims to detect pixel-level changes
+with only image-level annotations. Owing to its label efficiency, WSCD is
+drawing increasing attention recently. However, current WSCD methods often
+encounter the challenge of change missing and fabricating, i.e., the
+inconsistency between image-level annotations and pixel-level predictions.
+Specifically, change missing refers to the situation in which the WSCD model
+fails to predict any changed pixels even though the image-level label indicates
+changed, and vice versa for change fabricating. To address this challenge, in
+this work, we leverage global-scale and local-scale priors in WSCD and propose
+two components: a Dilated Prior (DP) decoder and a Label Gated (LG) constraint.
+The DP decoder decodes samples with the changed image-level label, skips
+samples with the unchanged label, and replaces them with an all-unchanged
+pixel-level label. The LG constraint is derived from the correspondence between
+changed representations and image-level labels, penalizing the model when it
+mispredicts the change status. Additionally, we develop TransWCD, a simple yet
+powerful transformer-based model, showcasing the potential of weakly-supervised
+learning in change detection. By integrating the DP decoder and LG constraint
+into TransWCD, we form TransWCD-DL. Our proposed TransWCD and TransWCD-DL
+achieve significant +6.33% and +9.55% F1 score improvements over the
+state-of-the-art methods on the WHU-CD dataset, respectively. Some performance
+metrics even exceed those of several fully-supervised change detection (FSCD)
+competitors. Code will be available at
+https://github.com/zhenghuizhao/TransWCD.
+
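+ The Label Gated constraint can be pictured with the toy loss below (an
+interpretation under stated assumptions, not the released TransWCD-DL code): on
+images labelled unchanged, any predicted change pixel is penalized (change
+fabricating); on images labelled changed, the strongest change response must
+exceed a margin (change missing).
+
+import torch
+
+def label_gated_loss(change_prob, img_label, margin=0.5):
+    """change_prob: (B, H, W) predicted change probabilities in [0, 1].
+    img_label:   (B,) image-level labels, 1 = changed, 0 = unchanged."""
+    gate = img_label.float().view(-1, 1, 1)
+    # unchanged images: suppress every predicted change pixel
+    loss_unchanged = ((1 - gate) * change_prob).mean()
+    # changed images: the maximum response should exceed the margin
+    max_resp = change_prob.flatten(1).max(dim=1).values            # (B,)
+    loss_changed = (img_label.float() *
+                    torch.clamp(margin - max_resp, min=0)).mean()
+    return loss_unchanged + loss_changed
+
+probs = torch.rand(4, 64, 64)
+labels = torch.tensor([1, 0, 1, 0])
+print(label_gated_loss(probs, labels))
+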
+
+
+
+
+ + ♻ ☆ One-Prompt to Segment All Medical Images + + +
+ Large foundation models, known for their strong zero-shot generalization,
+have excelled in visual and language applications. However, applying them to
+medical image segmentation, a domain with diverse imaging types and target
+labels, remains an open challenge. Current approaches, such as adapting
+interactive segmentation models like the Segment Anything Model (SAM), require
+user prompts for each sample during inference. Alternatively, transfer learning
+methods like few/one-shot models demand labeled samples, leading to high costs.
+This paper introduces a new paradigm toward universal medical image
+segmentation, termed 'One-Prompt Segmentation.' One-Prompt Segmentation
+combines the strengths of one-shot and interactive methods. In the inference
+stage, with just one prompted sample, it can adeptly handle the unseen task in
+a single forward pass. We train the One-Prompt Model on 64 open-source medical
+datasets, accompanied by the collection of over 3,000 clinician-labeled
+prompts. Tested on 14 previously unseen datasets, the One-Prompt Model
+showcases superior zero-shot segmentation capabilities, outperforming a wide
+range of related methods. The code and data are released at
+https://github.com/KidsWithTokens/one-prompt.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2304.12620 +
+
+
+
+
+ + ♻ ☆ Large-Scale Multi-Hypotheses Cell Tracking Using Ultrametric Contours + Maps + + +
+ In this work, we describe a method for large-scale 3D cell-tracking through a
+segmentation selection approach. The proposed method is effective at tracking
+cells across large microscopy datasets on two fronts: (i) It can solve problems
+containing millions of segmentation instances in terabyte-scale 3D+t datasets;
+(ii) It achieves competitive results with or without deep learning, which
+requires 3D annotated data that is scarce in the fluorescence microscopy field.
+The proposed method computes cell tracks and segments using a hierarchy of
+segmentation hypotheses and selects disjoint segments by maximizing the overlap
+between adjacent frames. We show that this method achieves state-of-the-art
+results in 3D images from the cell tracking challenge and has a faster integer
+linear programming formulation. Moreover, our framework is flexible and
+supports segmentations from off-the-shelf cell segmentation models and can
+combine them into an ensemble that improves tracking. The code is available at
+https://github.com/royerlab/ultrack.
+
+
+ comment: 13 pages, 7 figures, 4 tables +
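As a rough illustration of the overlap-maximization idea (a drastic simplification of the paper's hierarchical hypothesis selection and ILP), adjacent frames can be linked by solving an assignment problem on segment IoU:

```python
# Simplified stand-in for overlap-based linking between two frames; segment masks are
# boolean arrays. The actual method selects disjoint segments from a hierarchy via ILP.
import numpy as np
from scipy.optimize import linear_sum_assignment

def iou(a: np.ndarray, b: np.ndarray) -> float:
    union = np.logical_or(a, b).sum()
    return np.logical_and(a, b).sum() / union if union else 0.0

def link_frames(masks_t, masks_t1, min_iou=0.1):
    cost = np.array([[-iou(a, b) for b in masks_t1] for a in masks_t])
    rows, cols = linear_sum_assignment(cost)            # maximizes total IoU
    return [(r, c) for r, c in zip(rows, cols) if -cost[r, c] > min_iou]
```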
+
+
+
+
+ + ♻ ☆ ASDF: Assembly State Detection Utilizing Late Fusion by Integrating 6D + Pose Estimation + + +
+ In medical and industrial domains, providing guidance for assembly processes +is critical to ensure efficiency and safety. Errors in assembly can lead to +significant consequences such as extended surgery times and prolonged +manufacturing or maintenance times in industry. Assembly scenarios can benefit +from in-situ AR visualization to provide guidance, reduce assembly times and +minimize errors. To enable in-situ visualization, 6D pose estimation can be +leveraged. Existing 6D pose estimation techniques primarily focus on individual +objects and static captures. However, assembly scenarios have various dynamics, +including occlusion during assembly and dynamics in the assembly objects' +appearance. Existing work combining object detection/6D pose estimation and +assembly state detection focuses either on pure deep learning-based approaches +or limits assembly state detection to building blocks. To address the +challenges of 6D pose estimation in combination with assembly state detection, +our approach ASDF builds upon the strengths of YOLOv8, a real-time capable +object detection framework. We extend this framework, refine the object pose +and fuse pose knowledge with network-detected pose information. Utilizing our +late fusion in our Pose2State module results in refined 6D pose estimation and +assembly state detection. By combining both pose and state information, our +Pose2State module predicts the final assembly state with precision. Our +evaluation on our ASDF dataset shows that our Pose2State module leads to an +improved assembly state detection and that the improvement of the assembly +state further leads to a more robust 6D pose estimation. Moreover, on the GBOT +dataset, we outperform the pure deep learning-based network, and even +outperform the hybrid and pure tracking-based approaches. + +
+
+
+
+
+ + ♻ ☆ Modality Translation for Object Detection Adaptation Without Forgetting + Prior Knowledge + + +
+ A common practice in deep learning consists of training large neural networks +on massive datasets to perform accurately for different domains and tasks. +While this methodology may work well in numerous application areas, it does not +readily apply across modalities due to the larger distribution shift in data +captured using different sensors. This paper focuses on the problem of adapting a large +object detection model to one or multiple modalities while being efficient. To +do so, we propose ModTr as an alternative to the common approach of fine-tuning +large models. ModTr consists of adapting the input with a small transformation +network trained to minimize the detection loss directly. The original model can +therefore work on the translated inputs without any further change or +fine-tuning to its parameters. Experimental results on translating from IR to +RGB images on two well-known datasets show that this simple ModTr approach +provides detectors that can perform comparably or better than the standard +fine-tuning without forgetting the original knowledge. This opens the doors to +a more flexible and efficient service-based detection pipeline in which, +instead of using a different detector for each modality, a single, unaltered +server is constantly running and can be queried by multiple modalities through +their corresponding translations. Code: https://github.com/heitorrapela/ModTr. + +
+
+
+
+
+ + ♻ ☆ Putting the Object Back into Video Object Segmentation CVPR 2024 + + +
+ We present Cutie, a video object segmentation (VOS) network with object-level +memory reading, which puts the object representation from memory back into the +video object segmentation result. Recent works on VOS employ bottom-up +pixel-level memory reading which struggles due to matching noise, especially in +the presence of distractors, resulting in lower performance in more challenging +data. In contrast, Cutie performs top-down object-level memory reading by +adapting a small set of object queries. Via those, it interacts with the +bottom-up pixel features iteratively with a query-based object transformer (qt, +hence Cutie). The object queries act as a high-level summary of the target +object, while high-resolution feature maps are retained for accurate +segmentation. Together with foreground-background masked attention, Cutie +cleanly separates the semantics of the foreground object from the background. +On the challenging MOSE dataset, Cutie improves by 8.7 J&F over XMem with a +similar running time and improves by 4.2 J&F over DeAOT while being three times +faster. Code is available at: https://hkchengrex.github.io/Cutie + +
+
+ comment: CVPR 2024 Highlight. Project page: https://hkchengrex.github.io/Cutie +
+
+
+
+
+ + ♻ ☆ Sat2Cap: Mapping Fine-Grained Textual Descriptions from Satellite Images + + +
+ We propose a weakly supervised approach for creating maps using free-form +textual descriptions. We refer to this work of creating textual maps as +zero-shot mapping. Prior works have approached mapping tasks by developing +models that predict a fixed set of attributes using overhead imagery. However, +these models are very restrictive as they can only solve highly specific tasks +for which they were trained. Mapping text, on the other hand, allows us to +solve a large variety of mapping problems with minimal restrictions. To achieve +this, we train a contrastive learning framework called Sat2Cap on a new +large-scale dataset with 6.1M pairs of overhead and ground-level images. For a +given location and overhead image, our model predicts the expected CLIP +embeddings of the ground-level scenery. The predicted CLIP embeddings are then +used to learn about the textual space associated with that location. Sat2Cap is +also conditioned on date-time information, allowing it to model temporally +varying concepts over a location. Our experimental results demonstrate that our +models successfully capture ground-level concepts and allow large-scale mapping +of fine-grained textual queries. Our approach does not require any text-labeled +data, making the training easily scalable. The code, dataset, and models will +be made publicly available. + +
+
+ comment: 16 pages +
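The training signal described above can be sketched as a symmetric contrastive loss between embeddings predicted from overhead imagery and frozen CLIP embeddings of the co-located ground-level images. The temperature and symmetric cross-entropy form are assumptions for illustration, not taken from the paper:

```python
import torch
import torch.nn.functional as F

def overhead_to_clip_loss(pred_embed, ground_clip_embed, temperature=0.07):
    """pred_embed: (B, D) from the overhead encoder; ground_clip_embed: (B, D) frozen CLIP features."""
    pred = F.normalize(pred_embed, dim=-1)
    target = F.normalize(ground_clip_embed, dim=-1)
    logits = pred @ target.t() / temperature            # in-batch negatives
    labels = torch.arange(pred.size(0), device=pred.device)
    return 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.t(), labels))
```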
+
+
+
+
+ + ♻ ☆ Learning county from pixels: Corn yield prediction with + attention-weighted multiple instance learning + + +
+ Remote sensing technology has become a promising tool in yield prediction. +Most prior work employs satellite imagery for county-level corn yield +prediction by spatially aggregating all pixels within a county into a single +value, potentially overlooking the detailed information and valuable insights +offered by more granular data. To this end, this research examines each county +at the pixel level and applies multiple instance learning to leverage detailed +information within a county. In addition, our method addresses the "mixed +pixel" issue caused by the inconsistent resolution between feature datasets and +crop mask, which may introduce noise into the model and therefore hinder +accurate yield prediction. Specifically, the attention mechanism is employed to +automatically assign weights to different pixels, which can mitigate the +influence of mixed pixels. The experimental results show that the developed +model outperforms four other machine learning models over the past five years +in the U.S. corn belt and demonstrates its best performance in 2022, achieving +a coefficient of determination (R2) value of 0.84 and a root mean square error +(RMSE) of 0.83. This paper demonstrates the advantages of our approach from +both spatial and temporal perspectives. Furthermore, through an in-depth study +of the relationship between mixed pixels and attention, it is verified that our +approach can capture critical feature information while filtering out noise +from mixed pixels. + +
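The attention-weighted multiple-instance idea maps naturally onto standard attention pooling; the sketch below (dimensions illustrative, not the paper's architecture) weights pixel-level instances and regresses a county-level yield from the pooled feature:

```python
import torch
import torch.nn as nn

class AttentionMILRegressor(nn.Module):
    def __init__(self, in_dim=16, hidden=32):
        super().__init__()
        self.attention = nn.Sequential(nn.Linear(in_dim, hidden), nn.Tanh(), nn.Linear(hidden, 1))
        self.head = nn.Linear(in_dim, 1)

    def forward(self, instances):                       # (num_pixels, in_dim) for one county
        weights = torch.softmax(self.attention(instances), dim=0)
        bag = (weights * instances).sum(dim=0)           # attention-pooled county feature
        return self.head(bag), weights                   # yield estimate and per-pixel weights

yield_pred, attn = AttentionMILRegressor()(torch.randn(500, 16))
```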
+
+ comment: I am writing to request the withdrawal of my paper submitted to + arXiv. Upon further review, I have identified an error in the paper that + significantly affects the results and conclusions. To maintain the integrity + of the scientific record and prevent the dissemination of incorrect + information, I believe it is necessary to withdraw the paper from the archive +
+
+
+
+
+ + ♻ ☆ Fooling Contrastive Language-Image Pre-trained Models with + CLIPMasterPrints + + +
+ Models leveraging both visual and textual data such as Contrastive +Language-Image Pre-training (CLIP), are the backbone of many recent advances in +artificial intelligence. In this work, we show that despite their versatility, +such models are vulnerable to what we refer to as fooling master images. +Fooling master images are capable of maximizing the confidence score of a CLIP +model for a significant number of widely varying prompts, while being either +unrecognizable or unrelated to the attacked prompts for humans. The existence +of such images is problematic as it could be used by bad actors to maliciously +interfere with CLIP-trained image retrieval models in production with +comparably small effort as a single image can attack many different prompts. We +demonstrate how fooling master images for CLIP (CLIPMasterPrints) can be mined +using stochastic gradient descent, projected gradient descent, or blackbox +optimization. Contrary to many common adversarial attacks, the blackbox +optimization approach allows us to mine CLIPMasterPrints even when the weights +of the model are not accessible. We investigate the properties of the mined +images, and find that images trained on a small number of image captions +generalize to a much larger number of semantically related captions. We +evaluate possible mitigation strategies, where we increase the robustness of +the model and introduce an approach to automatically detect CLIPMasterPrints to +sanitize the input of vulnerable models. Finally, we find that vulnerability to +CLIPMasterPrints is related to a modality gap in contrastive pre-trained +multi-modal networks. Code available at +https://github.com/matfrei/CLIPMasterPrints. + +
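The gradient-based mining described in the abstract can be pictured as maximizing the mean image-text similarity over many prompts at once. The sketch below uses Adam and placeholder encoder callables, so it only illustrates the objective, not the paper's exact SGD/PGD/blackbox procedures:

```python
import torch
import torch.nn.functional as F

def mine_master_print(image_encoder, text_embeds, shape=(1, 3, 224, 224), steps=200, lr=0.05):
    """text_embeds: (P, D) precomputed embeddings of the prompts to be attacked."""
    image = torch.rand(shape, requires_grad=True)
    opt = torch.optim.Adam([image], lr=lr)
    text = F.normalize(text_embeds, dim=-1)
    for _ in range(steps):
        feats = F.normalize(image_encoder(image.clamp(0, 1)), dim=-1)
        loss = -(feats @ text.t()).mean()               # raise similarity to every prompt
        opt.zero_grad()
        loss.backward()
        opt.step()
    return image.detach().clamp(0, 1)
```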
+
+
+
+
+ + ♻ ☆ Efficient Representation of Natural Image Patches + + +
+ Utilizing an abstract information processing model based on minimal yet +realistic assumptions inspired by biological systems, we study how to achieve +the early visual system's two ultimate objectives: efficient information +transmission and accurate sensor probability distribution modeling. We prove +that optimizing for information transmission does not guarantee optimal +probability distribution modeling in general. We illustrate, using a two-pixel +(2D) system and image patches, that an efficient representation can be realized +through a nonlinear population code driven by two types of biologically +plausible loss functions that depend solely on output. After unsupervised +learning, our abstract information processing model bears remarkable +resemblances to biological systems, despite not mimicking many features of real +neurons, such as spiking activity. A preliminary comparison with a contemporary +deep learning model suggests that our model offers a significant efficiency +advantage. Our model provides novel insights into the computational theory of +early visual systems as well as a potential new approach to enhance the +efficiency of deep learning models. + +
+
+
+
+
+ + ♻ ☆ DQ-DETR: DETR with Dynamic Query for Tiny Object Detection + + +
+ Although previous DETR-like methods have performed successfully in generic +object detection, tiny object detection is still a challenging task for them +since the positional information of object queries is not customized for +detecting tiny objects, whose scale is extraordinarily smaller than general +objects. Also, the fixed number of queries used by DETR-like methods makes them +unsuitable for aerial datasets, which contain only tiny objects and whose +numbers of instances are imbalanced across images. Thus, we present +a simple yet effective model, named DQ-DETR, which consists of three different +components: categorical counting module, counting-guided feature enhancement, +and dynamic query selection to solve the above-mentioned problems. DQ-DETR uses +the prediction and density maps from the categorical counting module to +dynamically adjust the number of object queries and improve the positional +information of queries. Our model DQ-DETR outperforms previous CNN-based and +DETR-like methods, achieving state-of-the-art mAP 30.2% on the AI-TOD-V2 +dataset, which mostly consists of tiny objects. + +
+
+
+
+
+ + ♻ ☆ EFHQ: Multi-purpose ExtremePose-Face-HQ dataset + + +
+ The existing facial datasets, while having plentiful images at near frontal +views, lack images with extreme head poses, leading to the downgraded +performance of deep learning models when dealing with profile or pitched faces. +This work aims to address this gap by introducing a novel dataset named Extreme +Pose Face High-Quality Dataset (EFHQ), which includes a maximum of 450k +high-quality images of faces at extreme poses. To produce such a massive +dataset, we utilize a novel and meticulous dataset processing pipeline to +curate two publicly available datasets, VFHQ and CelebV-HQ, which contain many +high-resolution face videos captured in various settings. Our dataset can +complement existing datasets on various facial-related tasks, such as facial +synthesis with 2D/3D-aware GAN, diffusion-based text-to-image face generation, +and face reenactment. Specifically, training with EFHQ helps models generalize +well across diverse poses, significantly improving performance in scenarios +involving extreme views, confirmed by extensive experiments. Additionally, we +utilize EFHQ to define a challenging cross-view face verification benchmark, in +which the performance of SOTA face recognition models drops 5-37% compared to +frontal-to-frontal scenarios, aiming to stimulate studies on face recognition +under severe pose conditions in the wild. + +
+
+ comment: Project Page: https://bomcon123456.github.io/efhq/ +
+
+
+
+
+ + ♻ ☆ IISAN: Efficiently Adapting Multimodal Representation for Sequential + Recommendation with Decoupled PEFT SIGIR2024 + + +
+ Multimodal foundation models are transformative in sequential recommender +systems, leveraging powerful representation learning capabilities. While +Parameter-efficient Fine-tuning (PEFT) is commonly used to adapt foundation +models for recommendation tasks, most research prioritizes parameter +efficiency, often overlooking critical factors like GPU memory efficiency and +training speed. Addressing this gap, our paper introduces IISAN (Intra- and +Inter-modal Side Adapted Network for Multimodal Representation), a simple +plug-and-play architecture using a Decoupled PEFT structure and exploiting both +intra- and inter-modal adaptation. + IISAN matches the performance of full fine-tuning (FFT) and state-of-the-art +PEFT. More importantly, it significantly reduces GPU memory usage - from 47GB +to just 3GB for multimodal sequential recommendation tasks. Additionally, it +accelerates training time per epoch from 443s to 22s compared to FFT. This is +also a notable improvement over the Adapter and LoRA, which require 37-39 GB +GPU memory and 350-380 seconds per epoch for training. + Furthermore, we propose a new composite efficiency metric, TPME +(Training-time, Parameter, and GPU Memory Efficiency) to alleviate the +prevalent misconception that "parameter efficiency represents overall +efficiency". TPME provides more comprehensive insights into practical +efficiency comparisons between different methods. Besides, we give an +accessible efficiency analysis of all PEFT and FFT approaches, which +demonstrate the superiority of IISAN. We release our codes and other materials +at https://github.com/GAIR-Lab/IISAN. + +
+
+ comment: Accepted by SIGIR2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 138 + +
+
+
+ + ☆ GoodDrag: Towards Good Practices for Drag Editing with Diffusion Models + + +
+ In this paper, we introduce GoodDrag, a novel approach to improve the +stability and image quality of drag editing. Unlike existing methods that +struggle with accumulated perturbations and often result in distortions, +GoodDrag introduces an AlDD framework that alternates between drag and +denoising operations within the diffusion process, effectively improving the +fidelity of the result. We also propose an information-preserving motion +supervision operation that maintains the original features of the starting +point for precise manipulation and artifact reduction. In addition, we +contribute to the benchmarking of drag editing by introducing a new dataset, +Drag100, and developing dedicated quality assessment metrics, Dragging Accuracy +Index and Gemini Score, utilizing Large Multimodal Models. Extensive +experiments demonstrate that the proposed GoodDrag compares favorably against +the state-of-the-art approaches both qualitatively and quantitatively. The +project page is https://gooddrag.github.io. + +
+
+
+
+
+ + ☆ BRAVE: Broadening the visual encoding of vision-language models + + +
+ Vision-language models (VLMs) are typically composed of a vision encoder, +e.g. CLIP, and a language model (LM) that interprets the encoded features to +solve downstream tasks. Despite remarkable progress, VLMs are subject to +several shortcomings due to the limited capabilities of vision encoders, e.g. +"blindness" to certain image features, visual hallucination, etc. To address +these issues, we study broadening the visual encoding capabilities of VLMs. We +first comprehensively benchmark several vision encoders with different +inductive biases for solving VLM tasks. We observe that there is no single +encoding configuration that consistently achieves top performance across +different tasks, and encoders with different biases can perform surprisingly +similarly. Motivated by this, we introduce a method, named BRAVE, that +consolidates features from multiple frozen encoders into a more versatile +representation that can be directly fed as the input to a frozen LM. BRAVE +achieves state-of-the-art performance on a broad range of captioning and VQA +benchmarks and significantly reduces the aforementioned issues of VLMs, while +requiring a smaller number of trainable parameters than existing methods and +having a more compressed representation. Our results highlight the potential of +incorporating different visual biases for a more broad and contextualized +visual understanding of VLMs. + +
+
+ comment: Project page at https://brave-vlms.epfl.ch/ +
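The consolidation step can be pictured as projecting token features from several frozen encoders into the language model's width and concatenating them. This is a generic bridge module written as an illustration, not BRAVE's actual architecture:

```python
import torch
import torch.nn as nn

class MultiEncoderBridge(nn.Module):
    def __init__(self, encoder_dims, lm_dim):
        super().__init__()
        self.proj = nn.ModuleList([nn.Linear(d, lm_dim) for d in encoder_dims])

    def forward(self, encoder_tokens):                  # list of (B, N_i, D_i) frozen features
        return torch.cat([p(t) for p, t in zip(self.proj, encoder_tokens)], dim=1)

bridge = MultiEncoderBridge([768, 1024], lm_dim=2048)
lm_input = bridge([torch.randn(2, 196, 768), torch.randn(2, 256, 1024)])   # (2, 452, 2048)
```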
+
+
+
+
+ + ☆ UMBRAE: Unified Multimodal Decoding of Brain Signals + + +
+ We address prevailing challenges of brain-powered research, departing +from the observation that existing methods hardly recover accurate spatial +information and require subject-specific models. To address these challenges, +we propose UMBRAE, a unified multimodal decoding of brain signals. First, to +extract instance-level conceptual and spatial details from neural signals, we +introduce an efficient universal brain encoder for multimodal-brain alignment +and recover object descriptions at multiple levels of granularity from a +subsequent multimodal large language model (MLLM). Second, we introduce a +cross-subject training strategy mapping subject-specific features to a common +feature space. This allows a model to be trained on multiple subjects without +extra resources, even yielding superior results compared to subject-specific +models. Further, we demonstrate this supports weakly-supervised adaptation to +new subjects, with only a fraction of the total training data. Experiments +demonstrate that UMBRAE not only achieves superior results in the newly +introduced tasks but also outperforms methods in well-established tasks. To +assess our method, we construct and share with the community a comprehensive +brain understanding benchmark BrainHub. Our code and benchmark are available at +https://weihaox.github.io/UMBRAE. + +
+
+ comment: Project Page: https://weihaox.github.io/UMBRAE +
+
+
+
+
+ + ☆ RealmDreamer: Text-Driven 3D Scene Generation with Inpainting and Depth + Diffusion + + +
+ We introduce RealmDreamer, a technique for generation of general +forward-facing 3D scenes from text descriptions. Our technique optimizes a 3D +Gaussian Splatting representation to match complex text prompts. We initialize +these splats by utilizing the state-of-the-art text-to-image generators, +lifting their samples into 3D, and computing the occlusion volume. We then +optimize this representation across multiple views as a 3D inpainting task with +image-conditional diffusion models. To learn correct geometric structure, we +incorporate a depth diffusion model by conditioning on the samples from the +inpainting model, giving rich geometric structure. Finally, we finetune the +model using sharpened samples from image generators. Notably, our technique +does not require video or multi-view data and can synthesize a variety of +high-quality 3D scenes in different styles, consisting of multiple objects. Its +generality additionally allows 3D synthesis from a single image. + +
+
+ comment: Project Page: https://realmdreamer.github.io/ +
+
+
+
+
+ + ☆ InstantMesh: Efficient 3D Mesh Generation from a Single Image with + Sparse-view Large Reconstruction Models + + +
+ We present InstantMesh, a feed-forward framework for instant 3D mesh +generation from a single image, featuring state-of-the-art generation quality +and significant training scalability. By synergizing the strengths of an +off-the-shelf multiview diffusion model and a sparse-view reconstruction model +based on the LRM architecture, InstantMesh is able to create diverse 3D assets +within 10 seconds. To enhance the training efficiency and exploit more +geometric supervisions, e.g, depths and normals, we integrate a differentiable +iso-surface extraction module into our framework and directly optimize on the +mesh representation. Experimental results on public datasets demonstrate that +InstantMesh significantly outperforms other latest image-to-3D baselines, both +qualitatively and quantitatively. We release all the code, weights, and demo of +InstantMesh, with the intention that it can make substantial contributions to +the community of 3D generative AI and empower both researchers and content +creators. + +
+
+ comment: Technical report. Project: https://github.com/TencentARC/InstantMesh +
+
+
+
+
+ + ☆ GCV-Turbo: End-to-end Acceleration of GNN-based Computer Vision Tasks on + FPGA + + +
+ Graph neural networks (GNNs) have recently empowered various novel computer +vision (CV) tasks. In GNN-based CV tasks, a combination of CNN layers and GNN +layers or only GNN layers are employed. This paper introduces GCV-Turbo, a +domain-specific accelerator on FPGA for end-to-end acceleration of GNN-based CV +tasks. GCV-Turbo consists of two key components: (1) a \emph{novel} hardware +architecture optimized for the computation kernels in both CNNs and GNNs using +the same set of computation resources. (2) a PyTorch-compatible compiler that +takes a user-defined model as input, performs end-to-end optimization for the +computation graph of a given GNN-based CV task, and produces optimized code for +hardware execution. The hardware architecture and the compiler work +synergistically to support a variety of GNN-based CV tasks. We implement +GCV-Turbo on a state-of-the-art FPGA and evaluate its performance across six +representative GNN-based CV tasks with diverse input data modalities (e.g., +image, human skeleton, point cloud). Compared with state-of-the-art CPU (GPU) +implementations, GCV-Turbo achieves an average latency reduction of +$68.4\times$ ($4.1\times$) on these six GNN-based CV tasks. Moreover, GCV-Turbo +supports the execution of the standalone CNNs or GNNs, achieving performance +comparable to that of state-of-the-art CNN (GNN) accelerators for widely used +CNN-only (GNN-only) models. + +
+
+
+
+
+ + ☆ Move Anything with Layered Scene Diffusion CVPR 2024 + + +
+ Diffusion models generate images with an unprecedented level of quality, but +how can we freely rearrange image layouts? Recent works generate controllable +scenes via learning spatially disentangled latent codes, but these methods do +not apply to diffusion models due to their fixed forward process. In this work, +we propose SceneDiffusion to optimize a layered scene representation during the +diffusion sampling process. Our key insight is that spatial disentanglement can +be obtained by jointly denoising scene renderings at different spatial layouts. +Our generated scenes support a wide range of spatial editing operations, +including moving, resizing, cloning, and layer-wise appearance editing +operations, including object restyling and replacing. Moreover, a scene can be +generated conditioned on a reference image, thus enabling object moving for +in-the-wild images. Notably, this approach is training-free, compatible with +general text-to-image diffusion models, and responsive in less than a second. + +
+
+ comment: CVPR 2024 camera-ready +
+
+
+
+
+ + ☆ Self-supervised Monocular Depth Estimation on Water Scenes via Specular + Reflection Prior + + +
+ Monocular depth estimation from a single image is an ill-posed problem for +computer vision due to a lack of reliable cues to serve as prior knowledge. +Besides the inter-frame supervision, namely stereo and adjacent frames, +extensive prior information is available in the same frame. Reflections from +specular surfaces, informative intra-frame priors, enable us to reformulate the +ill-posed depth estimation task as a multi-view synthesis. This paper proposes +the first self-supervision for deep-learning depth estimation on water scenes +via intra-frame priors, known as reflection supervision and geometrical +constraints. In the first stage, a water segmentation network is performed to +separate the reflection components from the entire image. Next, we construct a +self-supervised framework to predict the target appearance from reflections, +perceived as other perspectives. The photometric re-projection error, +incorporating SmoothL1 and a novel photometric adaptive SSIM, is formulated to +optimize pose and depth estimation by aligning the transformed virtual depths +and source ones. As a supplement, the water surface is determined from real and +virtual camera positions, which complements the depth of the water area. +Furthermore, to alleviate the need for laborious ground-truth annotations, we +introduce a large-scale water reflection scene (WRS) dataset rendered from +Unreal Engine 4. Extensive experiments on the WRS dataset prove the feasibility +of the proposed method compared to state-of-the-art depth estimation +techniques. + +
+
+ comment: 16 pages, 8 figures +
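A common way to realize such a photometric re-projection error is to mix a structural term with a robust per-pixel term; the weighting and the plain SSIM below are illustrative and do not reproduce the paper's adaptive variant:

```python
import torch
import torch.nn.functional as F

def ssim_dist(x, y, c1=0.01 ** 2, c2=0.03 ** 2):
    mu_x, mu_y = F.avg_pool2d(x, 3, 1, 1), F.avg_pool2d(y, 3, 1, 1)
    sigma_x = F.avg_pool2d(x * x, 3, 1, 1) - mu_x ** 2
    sigma_y = F.avg_pool2d(y * y, 3, 1, 1) - mu_y ** 2
    sigma_xy = F.avg_pool2d(x * y, 3, 1, 1) - mu_x * mu_y
    num = (2 * mu_x * mu_y + c1) * (2 * sigma_xy + c2)
    den = (mu_x ** 2 + mu_y ** 2 + c1) * (sigma_x + sigma_y + c2)
    return ((1 - num / den) / 2).clamp(0, 1)              # SSIM-based dissimilarity map

def photometric_loss(warped, target, alpha=0.85):         # (B, C, H, W) image tensors
    return alpha * ssim_dist(warped, target).mean() + (1 - alpha) * F.smooth_l1_loss(warped, target)
```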
+
+
+
+
+ + ☆ Unified Language-driven Zero-shot Domain Adaptation CVPR 2024 + + +
+ This paper introduces Unified Language-driven Zero-shot Domain Adaptation +(ULDA), a novel task setting that enables a single model to adapt to diverse +target domains without explicit domain-ID knowledge. We identify the +constraints in the existing language-driven zero-shot domain adaptation task, +particularly the requirement for domain IDs and domain-specific models, which +may restrict flexibility and scalability. To overcome these issues, we propose +a new framework for ULDA, consisting of Hierarchical Context Alignment (HCA), +Domain Consistent Representation Learning (DCRL), and Text-Driven Rectifier +(TDR). These components work synergistically to align simulated features with +target text across multiple visual levels, retain semantic correlations between +different regional representations, and rectify biases between simulated and +real target visual features, respectively. Our extensive empirical evaluations +demonstrate that this framework achieves competitive performance in both +settings, surpassing even the model that requires domain-ID, showcasing its +superiority and generalization ability. The proposed method is not only +effective but also maintains practicality and efficiency, as it does not +introduce additional computational costs during inference. Our project page is +https://senqiaoyang.com/project/ULDA . + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Lost in Translation: Modern Neural Networks Still Struggle With Small + Realistic Image Transformations + + +
+ Deep neural networks that achieve remarkable performance in image +classification have previously been shown to be easily fooled by tiny +transformations such as a one pixel translation of the input image. In order to +address this problem, two approaches have been proposed in recent years. The +first approach suggests using huge datasets together with data augmentation in +the hope that a highly varied training set will teach the network to learn to +be invariant. The second approach suggests using architectural modifications +based on sampling theory to deal explicitly with image translations. In this +paper, we show that these approaches still fall short in robustly handling +'natural' image translations that simulate a subtle change in camera +orientation. Our findings reveal that a mere one-pixel translation can result +in a significant change in the predicted image representation for approximately +40% of the test images in state-of-the-art models (e.g. open-CLIP trained on +LAION-2B or DINO-v2), while models that are explicitly constructed to be +robust to cyclic translations can still be fooled with 1 pixel realistic +(non-cyclic) translations 11% of the time. We present Robust Inference by Crop +Selection: a simple method that can be proven to achieve any desired level of +consistency, although with a modest tradeoff with the model's accuracy. +Importantly, we demonstrate how employing this method reduces the ability to +fool state-of-the-art models with a 1 pixel translation to less than 5% while +suffering from only a 1% drop in classification accuracy. Additionally, we show +that our method can be easily adjusted to deal with circular shifts as well. In +that case, we achieve 100% robustness to integer shifts with state-of-the-art +accuracy, and with no need for any further training. + +
+
+ comment: 14 pages, 6 appendices, 17 figures +
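The fragility being measured can be reproduced with a small diagnostic: compare top-1 predictions before and after a realistic (non-cyclic) one-pixel shift. The model is a placeholder classifier; this is not the paper's crop-selection method itself:

```python
import torch
import torch.nn.functional as F

def shift_one_pixel(images: torch.Tensor) -> torch.Tensor:
    """Non-cyclic 1-pixel horizontal shift with edge replication, input (B, C, H, W)."""
    return F.pad(images, (1, 0, 0, 0), mode="replicate")[..., :, :-1]

@torch.no_grad()
def translation_consistency(model, images):
    base = model(images).argmax(dim=1)
    shifted = model(shift_one_pixel(images)).argmax(dim=1)
    return (base == shifted).float().mean().item()       # fraction of unchanged predictions
```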
+
+
+
+
+ + ☆ Measuring proximity to standard planes during fetal brain ultrasound + scanning + + +
+ This paper introduces a novel pipeline designed to bring ultrasound (US) +plane pose estimation closer to clinical use for more effective navigation to +the standard planes (SPs) in the fetal brain. We propose a semi-supervised +segmentation model utilizing both labeled SPs and unlabeled 3D US volume +slices. Our model enables reliable segmentation across a diverse set of fetal +brain images. Furthermore, the model incorporates a classification mechanism to +identify the fetal brain precisely. Our model not only filters out frames +lacking the brain but also generates masks for those containing it, enhancing +the relevance of plane pose regression in clinical settings. We focus on fetal +brain navigation from 2D ultrasound (US) video analysis and combine this model +with a US plane pose regression network to provide sensorless proximity +detection to SPs and non-SPs planes; we emphasize the importance of proximity +detection to SPs for guiding sonographers, offering a substantial advantage +over traditional methods by allowing earlier and more precise adjustments +during scanning. We demonstrate the practical applicability of our approach +through validation on real fetal scan videos obtained from sonographers of +varying expertise levels. Our findings demonstrate the potential of our +approach to complement existing fetal US technologies and advance prenatal +diagnostic practices. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Driver Attention Tracking and Analysis + + +
+ We propose a novel method to estimate a driver's points-of-gaze using a pair +of ordinary cameras mounted on the windshield and dashboard of a car. This is a +challenging problem due to the dynamics of traffic environments with 3D scenes +of unknown depths. This problem is further complicated by the volatile distance +between the driver and the camera system. To tackle these challenges, we +develop a novel convolutional network that simultaneously analyzes the image of +the scene and the image of the driver's face. This network has a camera +calibration module that can compute an embedding vector that represents the +spatial configuration between the driver and the camera system. This +calibration module improves the overall network's performance, which can be +jointly trained end to end. + We also address the lack of annotated data for training and evaluation by +introducing a large-scale driving dataset with point-of-gaze annotations. This +is an in situ dataset of real driving sessions in an urban city, containing +synchronized images of the driving scene as well as the face and gaze of the +driver. Experiments on this dataset show that the proposed method outperforms +various baseline methods, having the mean prediction error of 29.69 pixels, +which is relatively small compared to the $1280{\times}720$ resolution of the +scene camera. + +
+
+
+
+
+ + ☆ Unfolding ADMM for Enhanced Subspace Clustering of Hyperspectral Images + + +
+ Deep subspace clustering methods are now prominent in clustering, typically +using fully connected networks and a self-representation loss function. +However, these methods often struggle with overfitting and lack +interpretability. In this paper, we explore an alternative clustering approach +based on deep unfolding. By unfolding iterative optimization methods into +neural networks, this approach offers enhanced interpretability and reliability +compared to data-driven deep learning methods, and greater adaptability and +generalization than model-based approaches. Hence, unfolding has become widely +used in inverse imaging problems, such as image restoration, reconstruction, +and super-resolution, but has not been sufficiently explored yet in the context +of clustering. In this work, we introduce an innovative clustering architecture +for hyperspectral images (HSI) by unfolding an iterative solver based on the +Alternating Direction Method of Multipliers (ADMM) for sparse subspace +clustering. To our knowledge, this is the first attempt to apply unfolding ADMM +for computing the self-representation matrix in subspace clustering. Moreover, +our approach captures well the structural characteristics of HSI data by +employing the K nearest neighbors algorithm as part of a structure preservation +module. Experimental evaluation on three established HSI datasets clearly shows +the potential of the unfolding approach in HSI clustering and even demonstrates +superior performance compared to state-of-the-art techniques. + +
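The unfolding idea can be sketched by turning each ADMM iteration for the sparse self-representation problem min ||C||_1 + (lam/2)||X - XC||_F^2 into a layer with learnable step sizes. This is an editorial illustration of unrolling, not the authors' network (their structure-preservation module is omitted):

```python
import torch
import torch.nn as nn

class UnrolledADMM(nn.Module):
    def __init__(self, num_layers=5):
        super().__init__()
        self.lam = nn.Parameter(torch.ones(num_layers))   # learnable per-layer data weight
        self.rho = nn.Parameter(torch.ones(num_layers))   # learnable per-layer penalty

    @staticmethod
    def soft_threshold(x, tau):
        return torch.sign(x) * torch.clamp(x.abs() - tau, min=0.0)

    def forward(self, X):                                 # X: (D, N), columns are pixels
        N = X.shape[1]
        G = X.t() @ X
        Z = torch.zeros(N, N)
        U = torch.zeros(N, N)
        for lam, rho in zip(self.lam, self.rho):
            C = torch.linalg.solve(lam * G + rho * torch.eye(N), lam * G + rho * (Z - U))
            Z = self.soft_threshold(C + U, 1.0 / rho)
            Z = Z - torch.diag(torch.diag(Z))             # zero diagonal, as in SSC
            U = U + C - Z
        return Z                                          # self-representation / affinity matrix
```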
+
+
+
+
+ + ☆ Wild Visual Navigation: Fast Traversability Learning via Pre-Trained + Models and Online Self-Supervision + + +
+ Natural environments such as forests and grasslands are challenging for +robotic navigation because of the false perception of rigid obstacles from high +grass, twigs, or bushes. In this work, we present Wild Visual Navigation (WVN), +an online self-supervised learning system for visual traversability estimation. +The system is able to continuously adapt from a short human demonstration in +the field, only using onboard sensing and computing. One of the key ideas to +achieve this is the use of high-dimensional features from pre-trained +self-supervised models, which implicitly encode semantic information that +massively simplifies the learning task. Further, the development of an online +supervision generation scheme enables concurrent training and inference of +the learned model in the wild. We demonstrate our approach through diverse +real-world deployments in forests, parks, and grasslands. Our system is able to +bootstrap the traversable terrain segmentation in less than 5 min of in-field +training time, enabling the robot to navigate in complex, previously unseen +outdoor terrains. Code: https://bit.ly/498b0CV - Project +page: https://bit.ly/3M6nMHH + +
+
+ comment: Extended version of arXiv:2305.08510 +
+
+
+
+
+ + ☆ 3DMambaComplete: Exploring Structured State Space Model for Point Cloud + Completion + + +
+ Point cloud completion aims to generate a complete and high-fidelity point +cloud from an initially incomplete and low-quality input. A prevalent strategy +involves leveraging Transformer-based models to encode global features and +facilitate the reconstruction process. However, the adoption of pooling +operations to obtain global feature representations often results in the loss +of local details within the point cloud. Moreover, the attention mechanism +inherent in Transformers introduces additional computational complexity, +rendering it challenging to handle long sequences effectively. To address these +issues, we propose 3DMambaComplete, a point cloud completion network built on +the novel Mamba framework. It comprises three modules: HyperPoint Generation +encodes point cloud features using Mamba's selection mechanism and predicts a +set of Hyperpoints. A specific offset is estimated, and the down-sampled points +become HyperPoints. The HyperPoint Spread module disperses these HyperPoints +across different spatial locations to avoid concentration. Finally, a +deformation method transforms the 2D mesh representation of HyperPoints into a +fine-grained 3D structure for point cloud reconstruction. Extensive experiments +conducted on various established benchmarks demonstrate that 3DMambaComplete +surpasses state-of-the-art point cloud completion methods, as confirmed by +qualitative and quantitative analyses. + +
+
+ comment: 10 pages, 8 figures, 7 tables +
+
+
+
+
+ + ☆ Learning Priors for Non Rigid SfM from Casual Videos + + +
+ We tackle the long-standing challenge of reconstructing 3D structures and +camera positions from videos. The problem is particularly hard when objects are +transformed in a non-rigid way. Current approaches to this problem make +unrealistic assumptions or require a long optimization time. + We present TracksTo4D, a novel deep learning-based approach that enables +inferring 3D structure and camera positions from dynamic content originating +from in-the-wild videos using a single feed-forward pass on a sparse point +track matrix. To achieve this, we leverage recent advances in 2D point tracking +and design an equivariant neural architecture tailored for directly processing +2D point tracks by leveraging their symmetries. TracksTo4D is trained on a +dataset of in-the-wild videos utilizing only the 2D point tracks extracted from +the videos, without any 3D supervision. Our experiments demonstrate that +TracksTo4D generalizes well to unseen videos of unseen semantic categories at +inference time, producing equivalent results to state-of-the-art methods while +significantly reducing the runtime compared to other baselines. + +
+
+
+
+
+ + ☆ MoCap-to-Visual Domain Adaptation for Efficient Human Mesh Estimation + from 2D Keypoints CVPR + + +
+ This paper presents Key2Mesh, a model that takes a set of 2D human pose +keypoints as input and estimates the corresponding body mesh. Since this +process does not involve any visual (i.e. RGB image) data, the model can be +trained on large-scale motion capture (MoCap) datasets, thereby overcoming the +scarcity of image datasets with 3D labels. To enable the model's application on +RGB images, we first run an off-the-shelf 2D pose estimator to obtain the 2D +keypoints, and then feed these 2D keypoints to Key2Mesh. To improve the +performance of our model on RGB images, we apply an adversarial domain +adaptation (DA) method to bridge the gap between the MoCap and visual domains. +Crucially, our DA method does not require 3D labels for visual data, which +enables adaptation to target sets without the need for costly labels. We +evaluate Key2Mesh for the task of estimating 3D human meshes from 2D keypoints, +in the absence of RGB and mesh label pairs. Our results on widely used H3.6M +and 3DPW datasets show that Key2Mesh sets the new state-of-the-art by +outperforming other models in PA-MPJPE for both datasets, and in MPJPE and PVE +for the 3DPW dataset. Thanks to our model's simple architecture, it operates at +least 12x faster than the prior state-of-the-art model, LGD. Additional +qualitative samples and code are available on the project website: +https://key2mesh.github.io/. + +
+
+ comment: accepted to CVPRW 2024 +
+
+
+
+
+ + ☆ VLLMs Provide Better Context for Emotion Understanding Through Common + Sense Reasoning + + +
+ Recognising emotions in context involves identifying the apparent emotions of +an individual, taking into account contextual cues from the surrounding scene. +Previous approaches to this task have involved the design of explicit +scene-encoding architectures or the incorporation of external scene-related +information, such as captions. However, these methods often utilise limited +contextual information or rely on intricate training pipelines. In this work, +we leverage the groundbreaking capabilities of Vision-and-Large-Language Models +(VLLMs) to enhance in-context emotion classification without introducing +complexity to the training process in a two-stage approach. In the first stage, +we propose prompting VLLMs to generate descriptions in natural language of the +subject's apparent emotion relative to the visual context. In the second stage, +the descriptions are used as contextual information and, along with the image +input, are used to train a transformer-based architecture that fuses text and +visual features before the final classification task. Our experimental results +show that the text and image features have complementary information, and our +fused architecture significantly outperforms the individual modalities without +any complex training methods. We evaluate our approach on three different +datasets, namely, EMOTIC, CAER-S, and BoLD, and achieve state-of-the-art or +comparable accuracy across all datasets and metrics compared to much more +complex approaches. The code will be made publicly available on github: +https://github.com/NickyFot/EmoCommonSense.git + +
+
+ comment: A. Xenos, N. Foteinopoulou and I. Ntinou contributed equally to this + work; 14 pages, 5 figures +
+
+
+
+
+ + ☆ Implicit Multi-Spectral Transformer: An Lightweight and Effective + Visible to Infrared Image Translation Model IJCNN 2024 + + +
+ In the field of computer vision, visible light images often exhibit low +contrast in low-light conditions, presenting a significant challenge. While +infrared imagery provides a potential solution, its utilization entails high +costs and practical limitations. Recent advancements in deep learning, +particularly the deployment of Generative Adversarial Networks (GANs), have +facilitated the transformation of visible light images to infrared images. +However, these methods often experience unstable training phases and may +produce suboptimal outputs. To address these issues, we propose a novel +end-to-end Transformer-based model that efficiently converts visible light +images into high-fidelity infrared images. Initially, the Texture Mapping +Module and Color Perception Adapter collaborate to extract texture and color +features from the visible light image. The Dynamic Fusion Aggregation Module +subsequently integrates these features. Finally, the transformation into an +infrared image is refined through the synergistic action of the Color +Perception Adapter and the Enhanced Perception Attention mechanism. +Comprehensive benchmarking experiments confirm that our model outperforms +existing methods, producing infrared images of markedly superior quality, both +qualitatively and quantitatively. Furthermore, the proposed model enables more +effective downstream applications for infrared images than other methods. + +
+
+ comment: Accepted by IJCNN 2024 +
+
+
+
+
+ + ☆ Identification of Fine-grained Systematic Errors via Controlled Scene + Generation + + +
+ Many safety-critical applications, especially in autonomous driving, require +reliable object detectors. They can be very effectively assisted by a method to +search for and identify potential failures and systematic errors before these +detectors are deployed. Systematic errors are characterized by combinations of +attributes such as object location, scale, orientation, and color, as well as +the composition of their respective backgrounds. To identify them, one must +rely on something other than real images from a test set because they do not +account for very rare but possible combinations of attributes. To overcome this +limitation, we propose a pipeline for generating realistic synthetic scenes +with fine-grained control, allowing the creation of complex scenes with +multiple objects. Our approach, BEV2EGO, allows for a realistic generation of +the complete scene with road-contingent control that maps 2D bird's-eye view +(BEV) scene configurations to a first-person view (EGO). In addition, we +propose a benchmark for controlled scene generation to select the most +appropriate generative outpainting model for BEV2EGO. We further use it to +perform a systematic analysis of multiple state-of-the-art object detection +models and discover differences between them. + +
+
+
+
+
+ + ☆ An Evidential-enhanced Tri-Branch Consistency Learning Method for + Semi-supervised Medical Image Segmentation + + +
+ Semi-supervised segmentation presents a promising approach for large-scale +medical image analysis, effectively reducing annotation burdens while achieving +comparable performance. This methodology holds substantial potential for +streamlining the segmentation process and enhancing its feasibility within +clinical settings for translational investigations. While cross-supervised +training, based on distinct co-training sub-networks, has become a prevalent +paradigm for this task, addressing critical issues such as prediction +disagreement and label-noise suppression requires further attention and +progress in cross-supervised training. In this paper, we introduce an +Evidential Tri-Branch Consistency learning framework (ETC-Net) for +semi-supervised medical image segmentation. ETC-Net employs three branches: an +evidential conservative branch, an evidential progressive branch, and an +evidential fusion branch. The first two branches exhibit complementary +characteristics, allowing them to address prediction diversity and enhance +training stability. We also integrate uncertainty estimation from the +evidential learning into cross-supervised training, mitigating the negative +impact of erroneous supervision signals. Additionally, the evidential fusion +branch capitalizes on the complementary attributes of the first two branches +and leverages an evidence-based Dempster-Shafer fusion strategy, supervised by +more reliable and accurate pseudo-labels of unlabeled data. Extensive +experiments conducted on LA, Pancreas-CT, and ACDC datasets demonstrate that +ETC-Net surpasses other state-of-the-art methods for semi-supervised +segmentation. The code will be made available in the near future at +https://github.com/Medsemiseg. + +
+
+
+
+
+ + ☆ ORacle: Large Vision-Language Models for Knowledge-Guided Holistic OR + Domain Modeling + + +
+ Every day, countless surgeries are performed worldwide, each within the +distinct settings of operating rooms (ORs) that vary not only in their setups +but also in the personnel, tools, and equipment used. This inherent diversity +poses a substantial challenge for achieving a holistic understanding of the OR, +as it requires models to generalize beyond their initial training datasets. To +reduce this gap, we introduce ORacle, an advanced vision-language model +designed for holistic OR domain modeling, which incorporates multi-view and +temporal capabilities and can leverage external knowledge during inference, +enabling it to adapt to previously unseen surgical scenarios. This capability +is further enhanced by our novel data augmentation framework, which +significantly diversifies the training dataset, ensuring ORacle's proficiency +in applying the provided knowledge effectively. In rigorous testing, in scene +graph generation, and downstream tasks on the 4D-OR dataset, ORacle not only +demonstrates state-of-the-art performance but does so requiring less data than +existing models. Furthermore, its adaptability is displayed through its ability +to interpret unseen views, actions, and appearances of tools and equipment. +This demonstrates ORacle's potential to significantly enhance the scalability +and affordability of OR domain modeling and opens a pathway for future +advancements in surgical data science. We will release our code and data upon +acceptance. + +
+
+ comment: 11 pages, 3 figures, 7 tables +
+
+
+
+
+ + ☆ Diffusion-based inpainting of incomplete Euclidean distance matrices of + trajectories generated by a fractional Brownian motion + + +
+ Fractional Brownian trajectories (fBm) feature both randomness and strong +scale-free correlations, challenging generative models to reproduce the +intrinsic memory characterizing the underlying process. Here we test a +diffusion probabilistic model on a specific dataset of corrupted images +corresponding to incomplete Euclidean distance matrices of fBm at various +memory exponents $H$. Our dataset implies uniqueness of the data imputation in +the regime of low missing ratio, where the remaining partial graph is rigid, +providing the ground truth for the inpainting. We find that the conditional +diffusion generation stably reproduces the statistics of missing +fBm-distributed distances for different values of $H$ exponent. Furthermore, +while diffusion models have been recently shown to remember samples from the +training database, we show that diffusion-based inpainting behaves +qualitatively different from the database search with the increasing database +size. Finally, we apply our fBm-trained diffusion model with $H=1/3$ for +completion of chromosome distance matrices obtained in single-cell microscopy +experiments, showing its superiority over the standard bioinformatics +algorithms. Our source code is available on GitHub at +https://github.com/alobashev/diffusion_fbm. + +
+
+
+
+
+ + ☆ Ray-driven Spectral CT Reconstruction Based on Neural Base-Material + Fields + + +
+ In spectral CT reconstruction, the basis materials decomposition involves +solving a large-scale nonlinear system of integral equations, which is highly +ill-posed mathematically. This paper proposes a model that parameterizes the +attenuation coefficients of the object using a neural field representation, +thereby avoiding the complex calculations of pixel-driven projection +coefficient matrices during the discretization process of line integrals. It +introduces a lightweight discretization method for line integrals based on a +ray-driven neural field, enhancing the accuracy of the integral approximation +during the discretization process. The basis materials are represented as +continuous vector-valued implicit functions to establish a neural field +parameterization model for the basis materials. The auto-differentiation +framework of deep learning is then used to solve the implicit continuous +function of the neural base-material fields. This method is not limited by the +spatial resolution of reconstructed images, and the network has compact and +regular properties. Experimental validation shows that our method performs +exceptionally well in addressing the spectral CT reconstruction. Additionally, +it fulfils the requirements for the generation of high-resolution +reconstruction images. + +
+
+ comment: 14 pages,16 figures +
+
+
+
+
+ + ☆ Accurate Tennis Court Line Detection on Amateur Recorded Matches + + +
+ Typically, tennis court line detection is done by running +Hough-Line-Detection to find straight lines in the image, and then computing a +transformation matrix from the detected lines to create the final court +structure. We propose numerous improvements and enhancements to this algorithm, +including using pretrained State-of-the-Art shadow-removal and object-detection +ML models to make our line-detection more robust. Compared to the original +algorithm, our method can accurately detect lines on amateur, dirty courts. +When combined with a robust ball-tracking system, our method will enable +accurate, automatic refereeing for amateur and professional tennis matches +alike. + +
+
+ comment: Accepted to 5th International conference on Image, Video Processing + and Artificial Intelligence +
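The classical baseline the paper improves on can be sketched with OpenCV: probabilistic Hough line detection followed by a homography from a reference court model; the pixel correspondences below are placeholders, not measured values:

```python
import cv2
import numpy as np

def detect_lines(image_bgr):
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150)
    return cv2.HoughLinesP(edges, 1, np.pi / 180, threshold=80,
                           minLineLength=60, maxLineGap=10)

# Doubles-court corners in metres and hypothetical image locations in pixels.
court_pts = np.float32([[0, 0], [10.97, 0], [10.97, 23.77], [0, 23.77]])
image_pts = np.float32([[210, 650], [1070, 655], [890, 180], [390, 175]])
H, _ = cv2.findHomography(court_pts, image_pts)           # maps court plane -> image
```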
+
+
+
+
+ + ☆ TrajPRed: Trajectory Prediction with Region-based Relation Learning + + +
+ Forecasting human trajectories in traffic scenes is critical for safety +within mixed or fully autonomous systems. Human future trajectories are driven +by two major stimuli, social interactions, and stochastic goals. Thus, reliable +forecasting needs to capture these two stimuli. Edge-based relation modeling +represents social interactions using pairwise correlations from precise +individual states. Nevertheless, edge-based relations can be vulnerable under +perturbations. To alleviate these issues, we propose a region-based relation +learning paradigm that models social interactions via region-wise dynamics of +joint states, i.e., the changes in the density of crowds. In particular, +region-wise agent joint information is encoded within convolutional feature +grids. Social relations are modeled by relating the temporal changes of local +joint information from a global perspective. We show that region-based +relations are less susceptible to perturbations. In order to account for the +stochastic individual goals, we exploit a conditional variational autoencoder +to realize multi-goal estimation and diverse future prediction. Specifically, +we perform variational inference via the latent distribution, which is +conditioned on the correlation between input states and associated target +goals. Sampling from the latent distribution enables the framework to reliably +capture the stochastic behavior in test data. We integrate multi-goal +estimation and region-based relation learning to model the two stimuli, social +interactions, and stochastic goals, in a prediction framework. We evaluate our +framework on the ETH-UCY dataset and Stanford Drone Dataset (SDD). We show that +the diverse prediction better fits the ground truth when incorporating the +relation module. Our framework outperforms the state-of-the-art models on SDD +by $27.61\%$/$18.20\%$ of ADE/FDE metrics. + +
+
+
+
+
+ + ☆ V-MAD: Video-based Morphing Attack Detection in Operational Scenarios + + +
+ In response to the rising threat of the face morphing attack, this paper +introduces and explores the potential of Video-based Morphing Attack Detection +(V-MAD) systems in real-world operational scenarios. While current morphing +attack detection methods primarily focus on a single or a pair of images, V-MAD +is based on video sequences, exploiting the video streams often acquired by +face verification tools available, for instance, at airport gates. Through this +study, we show for the first time the advantages that the availability of +multiple probe frames can bring to the morphing attack detection task, +especially in scenarios where the quality of probe images is varied and might +be affected, for instance, by pose or illumination variations. Experimental +results on a real operational database demonstrate that video sequences +represent valuable information for increasing the robustness and performance of +morphing attack detection systems. + +
+
+
+
+
+ + ☆ Adversarial purification for no-reference image-quality metrics: + applicability study and new methods + + +
+ Recently, the area of adversarial attacks on image quality metrics has begun +to be explored, whereas the area of defences remains under-researched. In this +study, we aim to address this gap and check the transferability of adversarial +purification defences from image classifiers to IQA methods. In this paper, we +apply several widespread attacks on IQA models and examine the success of the +defences against them. The purification methodologies covered different +preprocessing techniques, including geometrical transformations, compression, +denoising, and modern neural network-based methods. Also, we address the +challenge of assessing the efficacy of a defensive methodology by proposing +ways to estimate output visual quality and the success of neutralizing attacks. +Defences were tested against attacks on three IQA metrics -- Linearity, MetaIQA +and SPAQ. The code for attacks and defences is available at: (link is hidden +for a blind review). + +
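A minimal example of purification-by-preprocessing (one of the defence families evaluated): re-compress and mildly rescale the input before scoring it, which tends to wash out high-frequency adversarial perturbations. The IQA model here is a placeholder callable, and the quality/scale settings are illustrative:

```python
import io
from PIL import Image

def purify(image: Image.Image, quality: int = 60, scale: float = 0.9) -> Image.Image:
    image = image.convert("RGB")
    w, h = image.size
    small = image.resize((int(w * scale), int(h * scale)), Image.BICUBIC)
    buf = io.BytesIO()
    small.save(buf, format="JPEG", quality=quality)       # lossy re-encoding step
    buf.seek(0)
    return Image.open(buf).resize((w, h), Image.BICUBIC)

def defended_score(iqa_model, image):
    return iqa_model(purify(image))
```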
+
+
+
+
+ + ☆ Accelerating Cardiac MRI Reconstruction with CMRatt: An Attention-Driven + Approach + + +
+ Cine cardiac magnetic resonance (CMR) imaging is recognised as the benchmark
+modality for the comprehensive assessment of cardiac function. Nevertheless,
+the acquisition process of cine CMR is considered an impediment due to its
+prolonged scanning time. One commonly used strategy to expedite the
+acquisition process is k-space undersampling, though it comes with the
+drawback of introducing aliasing effects in the reconstructed image. Lately,
+deep learning-based methods have shown remarkable results over traditional
+approaches in rapidly achieving precise CMR reconstructed images. This study
+aims to explore the untapped potential of attention mechanisms incorporated
+with a deep learning model within the context of the CMR reconstruction
+problem. We are motivated by the fact that attention has proven beneficial in
+downstream tasks such as image classification and segmentation, but has not
+been systematically analysed in the context of CMR reconstruction. Our primary
+goal is to identify the strengths and potential limitations of attention
+algorithms when integrated with a convolutional backbone model such as a
+U-Net. To achieve this, we benchmark different state-of-the-art spatial and
+channel attention mechanisms on the CMRxRecon dataset and quantitatively
+evaluate the quality of reconstruction using objective metrics. Furthermore,
+inspired by the best-performing attention mechanism, we propose a new, simple
+yet effective, attention pipeline specifically optimised for the task of
+cardiac image reconstruction that outperforms other state-of-the-art attention
+methods. The layer and model code will be made publicly available.
+
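+ For readers unfamiliar with channel attention, here is a minimal
+squeeze-and-excitation style block in PyTorch of the kind that can be attached
+to U-Net feature maps; it is a generic illustration, not the CMRatt pipeline
+proposed in the paper.
+
+import torch
+import torch.nn as nn
+
+class ChannelAttention(nn.Module):
+    """Squeeze-and-excitation style channel attention for (B, C, H, W) feature maps."""
+    def __init__(self, channels, reduction=8):
+        super().__init__()
+        self.fc = nn.Sequential(
+            nn.Linear(channels, channels // reduction), nn.ReLU(inplace=True),
+            nn.Linear(channels // reduction, channels), nn.Sigmoid())
+
+    def forward(self, x):
+        w = x.mean(dim=(2, 3))                       # squeeze: global average pooling
+        w = self.fc(w).unsqueeze(-1).unsqueeze(-1)   # per-channel weights in (0, 1)
+        return x * w                                 # excite: reweight channels
+
+# usage: feats = ChannelAttention(64)(torch.randn(2, 64, 128, 128))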
+
+ comment: This paper has been submitted for the 32nd European Signal Processing + Conference EUSIPCO 2024 in Lyon +
+
+
+
+
+ + ☆ Efficient and Generic Point Model for Lossless Point Cloud Attribute + Compression + + +
+ The past several years have witnessed the emergence of learned point cloud
+compression (PCC) techniques. However, current learning-based lossless point
+cloud attribute compression (PCAC) methods either suffer from high
+computational complexity or degraded compression performance. Moreover, the
+significant variations in point cloud scale and sparsity encountered in
+real-world applications make developing an all-in-one neural model a
+challenging task. In this paper, we propose PoLoPCAC, an efficient and generic
+lossless PCAC method that achieves high compression efficiency and strong
+generalizability simultaneously. We formulate lossless PCAC as the task of
+inferring explicit distributions of attributes from group-wise autoregressive
+priors. A progressive random grouping strategy is first devised to efficiently
+resolve the point cloud into groups, and then the attributes of each group are
+modeled sequentially from accumulated antecedents. A locality-aware attention
+mechanism is utilized to exploit prior knowledge from context windows in
+parallel. Since our method directly operates on points, it naturally avoids
+distortion caused by voxelization and can be executed on point clouds with
+arbitrary scale and density. Experiments show that our method can be instantly
+deployed once trained on a Synthetic 2k-ShapeNet dataset while enjoying
+continuous bit-rate reduction over the latest G-PCCv23 on various datasets
+(ShapeNet, ScanNet, MVUB, 8iVFB). Meanwhile, our method reports shorter coding
+time than G-PCCv23 on the majority of sequences with a lightweight model size
+(2.6MB), which is highly attractive for practical applications. Dataset, code
+and trained model are available at
+https://github.com/I2-Multimedia-Lab/PoLoPCAC.
+
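+ The abstract does not spell out the grouping schedule, but the general idea
+of progressive random grouping can be illustrated with a short sketch: shuffle
+the point indices and split them into groups of growing size, so each group
+can be entropy-modeled from all previously coded points. The initial group
+size and growth ratio below are arbitrary placeholders, not the paper's
+settings.
+
+import numpy as np
+
+def progressive_random_groups(num_points, first=2048, ratio=2):
+    """Shuffle point indices and split them into groups of geometrically growing size."""
+    order = np.random.permutation(num_points)
+    groups, start, size = [], 0, first
+    while start < num_points:
+        groups.append(order[start:start + size])
+        start += size
+        size *= ratio
+    return groups
+
+# groups[0] is coded first; attributes of groups[k] are modeled from groups[0..k-1]
+print([len(g) for g in progressive_random_groups(100_000)])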
+
+
+
+
+ + ☆ HRVDA: High-Resolution Visual Document Assistant CVPR 2024 + + +
+ Leveraging vast training data, multimodal large language models (MLLMs) have +demonstrated formidable general visual comprehension capabilities and achieved +remarkable performance across various tasks. However, their performance in +visual document understanding still leaves much room for improvement. This +discrepancy is primarily attributed to the fact that visual document +understanding is a fine-grained prediction task. In natural scenes, MLLMs +typically use low-resolution images, leading to a substantial loss of visual +information. Furthermore, general-purpose MLLMs do not excel in handling +document-oriented instructions. In this paper, we propose a High-Resolution +Visual Document Assistant (HRVDA), which bridges the gap between MLLMs and +visual document understanding. This model employs a content filtering mechanism +and an instruction filtering module to separately filter out the +content-agnostic visual tokens and instruction-agnostic visual tokens, thereby +achieving efficient model training and inference for high-resolution images. In +addition, we construct a document-oriented visual instruction tuning dataset +and apply a multi-stage training strategy to enhance the model's document +modeling capabilities. Extensive experiments demonstrate that our model +achieves state-of-the-art performance across multiple document understanding +datasets, while maintaining training efficiency and inference speed comparable +to low-resolution models. + +
+
+ comment: Accepted to CVPR 2024 main conference +
+
+
+
+
+ + ☆ Sparse Global Matching for Video Frame Interpolation with Large Motion CVPR 2024 + + +
+ Large motion poses a critical challenge in Video Frame Interpolation (VFI) +task. Existing methods are often constrained by limited receptive fields, +resulting in sub-optimal performance when handling scenarios with large motion. +In this paper, we introduce a new pipeline for VFI, which can effectively +integrate global-level information to alleviate issues associated with large +motion. Specifically, we first estimate a pair of initial intermediate flows +using a high-resolution feature map for extracting local details. Then, we +incorporate a sparse global matching branch to compensate for flow estimation, +which consists of identifying flaws in initial flows and generating sparse flow +compensation with a global receptive field. Finally, we adaptively merge the +initial flow estimation with global flow compensation, yielding a more accurate +intermediate flow. To evaluate the effectiveness of our method in handling +large motion, we carefully curate a more challenging subset from commonly used +benchmarks. Our method demonstrates the state-of-the-art performance on these +VFI subsets with large motion. + +
+
+ comment: Accepted by CVPR 2024. Project page: https://sgm-vfi.github.io/ +
+
+
+
+
+ + ☆ DreamScene360: Unconstrained Text-to-3D Scene Generation with Panoramic + Gaussian Splatting + + +
+ The increasing demand for virtual reality applications has highlighted the +significance of crafting immersive 3D assets. We present a text-to-3D +360$^{\circ}$ scene generation pipeline that facilitates the creation of +comprehensive 360$^{\circ}$ scenes for in-the-wild environments in a matter of +minutes. Our approach utilizes the generative power of a 2D diffusion model and +prompt self-refinement to create a high-quality and globally coherent panoramic +image. This image acts as a preliminary "flat" (2D) scene representation. +Subsequently, it is lifted into 3D Gaussians, employing splatting techniques to +enable real-time exploration. To produce consistent 3D geometry, our pipeline +constructs a spatially coherent structure by aligning the 2D monocular depth +into a globally optimized point cloud. This point cloud serves as the initial +state for the centroids of 3D Gaussians. In order to address invisible issues +inherent in single-view inputs, we impose semantic and geometric constraints on +both synthesized and input camera views as regularizations. These guide the +optimization of Gaussians, aiding in the reconstruction of unseen regions. In +summary, our method offers a globally consistent 3D scene within a +360$^{\circ}$ perspective, providing an enhanced immersive experience over +existing techniques. Project website at: http://dreamscene360.github.io/ + +
+
+
+
+
+ + ☆ O-TALC: Steps Towards Combating Oversegmentation within Online Action + Segmentation + + +
+ Online temporal action segmentation shows strong potential to facilitate many
+HRI tasks where extended human action sequences must be tracked and understood
+in real time. Traditional action segmentation approaches, however, operate in
+an offline, two-stage manner, relying on computationally expensive video-wide
+features for segmentation, rendering them unsuitable for online HRI
+applications. In order to facilitate online action segmentation on a stream of
+incoming video data, we introduce two methods for improved training and
+inference of backbone action recognition models, allowing them to be deployed
+directly for online frame-level classification. First, we introduce surround
+dense sampling during training to facilitate training vs. inference clip
+matching and improve segment boundary predictions. Second, we introduce an
+Online Temporally Aware Label Cleaning (O-TALC) strategy to explicitly reduce
+oversegmentation during online inference. As our methods are backbone
+invariant, they can be deployed with computationally efficient spatio-temporal
+action recognition models capable of operating in real time with a small
+segmentation latency. We show that our method outperforms similar online
+action segmentation work and matches the performance of many offline models
+with access to full temporal resolution when operating on challenging
+fine-grained datasets.
+
+
+ comment: 5 pages, 3 figures. Accepted as a short (unindexed) paper at the + TAHRI conference +
+
+
+
+
+ + ☆ SparseAD: Sparse Query-Centric Paradigm for Efficient End-to-End + Autonomous Driving + + +
+ End-to-end paradigms use a unified framework to implement multiple tasks in
+an autonomous driving system. Despite their simplicity and clarity, the
+performance of end-to-end autonomous driving methods on sub-tasks is still far
+behind that of single-task methods. Meanwhile, the dense BEV features widely
+used in previous end-to-end methods make it costly to extend to more
+modalities or tasks. In this paper, we propose a Sparse query-centric paradigm
+for end-to-end Autonomous Driving (SparseAD), where sparse queries completely
+represent the whole driving scenario across space, time and tasks without any
+dense BEV representation. Concretely, we design a unified sparse architecture
+for perception tasks including detection, tracking, and online mapping.
+Moreover, we revisit motion prediction and planning, and devise a more
+justifiable motion planner framework. On the challenging nuScenes dataset,
+SparseAD achieves state-of-the-art full-task performance among end-to-end
+methods and significantly narrows the performance gap between end-to-end
+paradigms and single-task methods. Code will be released soon.
+
+
+
+
+
+ + ☆ Research on Detection of Floating Objects in River and Lake Based on AI + Intelligent Image Recognition + + +
+ With the rapid advancement of artificial intelligence technology, AI-enabled
+image recognition has emerged as a potent tool for addressing challenges in
+traditional environmental monitoring. This study focuses on the detection of
+floating objects in river and lake environments, exploring an innovative
+approach based on deep learning. By intricately analyzing the technical
+pathways for detecting static and dynamic features and considering the
+characteristics of river and lake debris, a comprehensive image acquisition
+and processing workflow has been developed. The study highlights the
+application and performance comparison of three mainstream deep learning
+models (SSD, Faster R-CNN, and YOLOv5) in debris identification. Additionally,
+a detection system for floating objects has been designed and implemented,
+encompassing both hardware platform construction and software framework
+development. Through rigorous experimental validation, the proposed system has
+demonstrated its ability to significantly enhance the accuracy and efficiency
+of debris detection, thus offering a new technological avenue for water
+quality monitoring in rivers and lakes.
+
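+ For orientation, the snippet below loads a pretrained YOLOv5 model through
+torch.hub and runs it on a single frame. The released checkpoint detects COCO
+classes, so a debris detector as described in this study would require
+fine-tuning on a custom floating-object dataset; "frame.jpg" and the 0.5
+confidence threshold are placeholders.
+
+import torch
+
+# load a pretrained YOLOv5 model from the Ultralytics hub (downloads weights on first run)
+model = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=True)
+
+# run inference on a single frame captured from a river/lake camera
+results = model("frame.jpg")            # accepts a path, URL, PIL image or numpy array
+detections = results.pandas().xyxy[0]   # columns: xmin, ymin, xmax, ymax, confidence, class, name
+confident = detections[detections["confidence"] > 0.5]
+print(confident)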
+
+
+
+
+ + ☆ Fine color guidance in diffusion models and its application to image + compression at extremely low bitrates + + +
+ This study addresses the challenge of controlling the global color aspect of
+images generated with a diffusion model, without any training or fine-tuning.
+We rewrite the guidance equations to ensure that the outputs are closer to a
+known color map, without hindering the quality of the generation. Our method
+leads to new guidance equations. In the color guidance context, we show that
+the scaling of the guidance should not decrease but should remain high
+throughout the diffusion process. As a second contribution, we apply our
+guidance in a compression framework, combining semantic and general color
+information about the image to decode images at low cost. We show that our
+method is effective at improving the fidelity and realism of compressed images
+at extremely low bit rates when compared to other classical or more
+semantics-oriented approaches.
+
+
+ comment: Submitted to IEEE Transactions on Image Processing (TIP) +
+
+
+
+
+ + ☆ RESSCAL3D: Resolution Scalable 3D Semantic Segmentation of Point Clouds ICIP + + +
+ While deep learning-based methods have demonstrated outstanding results in +numerous domains, some important functionalities are missing. Resolution +scalability is one of them. In this work, we introduce a novel architecture, +dubbed RESSCAL3D, providing resolution-scalable 3D semantic segmentation of +point clouds. In contrast to existing works, the proposed method does not +require the whole point cloud to be available to start inference. Once a +low-resolution version of the input point cloud is available, first semantic +predictions can be generated in an extremely fast manner. This enables early +decision-making in subsequent processing steps. As additional points become +available, these are processed in parallel. To improve performance, features +from previously computed scales are employed as prior knowledge at the current +scale. Our experiments show that RESSCAL3D is 31-62% faster than the +non-scalable baseline while keeping a limited impact on performance. To the +best of our knowledge, the proposed method is the first to propose a +resolution-scalable approach for 3D semantic segmentation of point clouds based +on deep learning. + +
+
+ comment: Published at 2023 IEEE International Conference on Image Processing + (ICIP) +
+
+
+
+
+ + ☆ Monocular 3D lane detection for Autonomous Driving: Recent Achievements, + Challenges, and Outlooks + + +
+ 3D lane detection plays a crucial role in autonomous driving by extracting +structural and traffic information from the road in 3D space to assist the +self-driving car in rational, safe, and comfortable path planning and motion +control. Due to the consideration of sensor costs and the advantages of visual +data in color information, in practical applications, 3D lane detection based +on monocular vision is one of the important research directions in the field of +autonomous driving, which has attracted more and more attention in both +industry and academia. Unfortunately, recent progress in visual perception +seems insufficient to develop completely reliable 3D lane detection algorithms, +which also hinders the development of vision-based fully autonomous +self-driving cars, i.e., achieving level 5 autonomous driving, driving like +human-controlled cars. This is one of the conclusions drawn from this review +paper: there is still a lot of room for improvement and significant +improvements are still needed in the 3D lane detection algorithm for autonomous +driving cars using visual sensors. Motivated by this, this review defines, +analyzes, and reviews the current achievements in the field of 3D lane +detection research, and the vast majority of the current progress relies +heavily on computationally complex deep learning models. In addition, this +review covers the 3D lane detection pipeline, investigates the performance of +state-of-the-art algorithms, analyzes the time complexity of cutting-edge +modeling choices, and highlights the main achievements and limitations of +current research efforts. The survey also includes a comprehensive discussion +of available 3D lane detection datasets and the challenges that researchers +have faced but have not yet resolved. Finally, our work outlines future +research directions and welcomes researchers and practitioners to enter this +exciting field. + +
+
+
+
+
+ + ☆ Multi-Label Continual Learning for the Medical Domain: A Novel Benchmark + + +
+ Multi-label image classification in dynamic environments is a problem that
+poses significant challenges. Previous studies have primarily focused on
+scenarios such as Domain Incremental Learning and Class Incremental Learning,
+which do not fully capture the complexity of real-world applications. In this
+paper, we study the problem of classification of medical imaging in the
+scenario termed New Instances & New Classes, which combines the challenges of
+both new class arrivals and domain shifts in a single framework. Unlike
+traditional scenarios, it reflects the realistic nature of CL in domains such
+as medical imaging, where updates may introduce both new classes and changes
+in domain characteristics. To address the unique challenges posed by this
+complex scenario, we introduce a novel approach called Pseudo-Label Replay.
+This method aims to mitigate forgetting while adapting to new classes and
+domain shifts by combining the advantages of the Replay and Pseudo-Label
+methods and solving their limitations in the proposed scenario. We evaluate
+our proposed approach on a challenging benchmark consisting of two datasets,
+seven tasks, and nineteen classes, modeling a realistic Continual Learning
+scenario. Our experimental findings demonstrate the effectiveness of
+Pseudo-Label Replay in addressing the challenges posed by the proposed
+scenario. Our method surpasses existing approaches, exhibiting superior
+performance while showing minimal forgetting.
+
+
+
+
+
+ + ☆ UDiFF: Generating Conditional Unsigned Distance Fields with Optimal + Wavelet Diffusion CVPR2024 + + +
+ Diffusion models have shown remarkable results for image generation, editing
+and inpainting. Recent works explore diffusion models for 3D shape generation
+with neural implicit functions, i.e., the signed distance function and the
+occupancy function. However, they are limited to shapes with closed surfaces,
+which prevents them from generating diverse 3D real-world content containing
+open surfaces. In this work, we present UDiFF, a 3D diffusion model for
+unsigned distance fields (UDFs) which is capable of generating textured 3D
+shapes with open surfaces from text conditions or unconditionally. Our key
+idea is to generate UDFs in the spatial-frequency domain with an optimal
+wavelet transformation, which produces a compact representation space for UDF
+generation. Specifically, instead of selecting an appropriate wavelet
+transformation, which requires expensive manual effort and still leads to
+large information loss, we propose a data-driven approach to learn the optimal
+wavelet transformation for UDFs. We evaluate UDiFF and show our advantages via
+numerical and visual comparisons with the latest methods on widely used
+benchmarks. Page: https://weiqi-zhang.github.io/UDiFF.
+
+
+ comment: To appear at CVPR2024. Project page: + https://weiqi-zhang.github.io/UDiFF +
+
+
+
+
+ + ☆ MoCha-Stereo: Motif Channel Attention Network for Stereo Matching CVPR 2024 + + +
+ Learning-based stereo matching techniques have made significant progress.
+However, existing methods inevitably lose geometrical structure information
+during the feature channel generation process, resulting in edge detail
+mismatches. In this paper, the Motif Channel Attention Stereo Matching Network
+(MoCha-Stereo) is designed to address this problem. We provide the Motif
+Channel Correlation Volume (MCCV) to determine more accurate edge matching
+costs. MCCV is achieved by projecting motif channels, which capture common
+geometric structures in feature channels, onto feature maps and cost volumes.
+In addition, since edge variations in the feature channels of the
+reconstruction error map also affect detail matching, we propose the
+Reconstruction Error Motif Penalty (REMP) module to further refine the
+full-resolution disparity estimation. REMP integrates the frequency
+information of typical channel features from the reconstruction error.
+MoCha-Stereo ranks 1st on the KITTI-2015 and KITTI-2012 Reflective
+leaderboards. Our structure also shows excellent performance in Multi-View
+Stereo. Code is available at https://github.com/ZYangChen/MoCha-Stereo.
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ O2V-Mapping: Online Open-Vocabulary Mapping with Neural Implicit + Representation + + +
+ Online construction of open-ended language scenes is crucial for robotic
+applications, where open-vocabulary interactive scene understanding is
+required. Recently, neural implicit representation has provided a promising
+direction for online interactive mapping. However, implementing
+open-vocabulary scene understanding capability into online neural implicit
+mapping still faces three challenges: lack of local scene updating ability,
+blurry spatial hierarchical semantic segmentation and difficulty in
+maintaining multi-view consistency. To this end, we propose O2V-mapping, which
+utilizes voxel-based language and geometric features to create an
+open-vocabulary field, thus allowing for local updates during the online
+training process. Additionally, we leverage a foundation model for image
+segmentation to extract language features on object-level entities, achieving
+clear segmentation boundaries and hierarchical semantic features. For the
+purpose of preserving consistency in 3D object properties across different
+viewpoints, we propose a spatial adaptive voxel adjustment mechanism and a
+multi-view weight selection method. Extensive experiments on open-vocabulary
+object localization and semantic segmentation demonstrate that O2V-mapping
+achieves online construction of language scenes while enhancing accuracy,
+outperforming the previous SOTA method.
+
+
+
+
+
+ + ☆ Tuning-Free Adaptive Style Incorporation for Structure-Consistent + Text-Driven Style Transfer + + +
+ In this work, we target the task of text-driven style transfer in the context
+of text-to-image (T2I) diffusion models. The main challenge is consistent
+structure preservation while enabling effective style transfer effects. Past
+approaches in this field directly concatenate the content and style prompts
+for a prompt-level style injection, leading to unavoidable structure
+distortions. In this work, we propose a novel solution to the text-driven
+style transfer task, namely Adaptive Style Incorporation (ASI), to achieve
+fine-grained feature-level style incorporation. It consists of the Siamese
+Cross-Attention (SiCA), which decouples the single-track cross-attention into
+a dual-track structure to obtain separate content and style features, and the
+Adaptive Content-Style Blending (AdaBlending) module, which couples the
+content and style information in a structure-consistent manner.
+Experimentally, our method exhibits much better performance in both structure
+preservation and stylized effects.
+
+
+
+
+
+ + ☆ SplatPose & Detect: Pose-Agnostic 3D Anomaly Detection CVPR 2024 + + +
+ Detecting anomalies in images has become a well-explored problem in both +academia and industry. State-of-the-art algorithms are able to detect defects +in increasingly difficult settings and data modalities. However, most current +methods are not suited to address 3D objects captured from differing poses. +While solutions using Neural Radiance Fields (NeRFs) have been proposed, they +suffer from excessive computation requirements, which hinder real-world +usability. For this reason, we propose the novel 3D Gaussian splatting-based +framework SplatPose which, given multi-view images of a 3D object, accurately +estimates the pose of unseen views in a differentiable manner, and detects +anomalies in them. We achieve state-of-the-art results in both training and +inference speed, and detection performance, even when using less training data +than competing methods. We thoroughly evaluate our framework using the recently +proposed Pose-agnostic Anomaly Detection benchmark and its multi-pose anomaly +detection (MAD) data set. + +
+
+ comment: Visual Anomaly and Novelty Detection 2.0 Workshop at CVPR 2024 +
+
+
+
+
+ + ☆ Zero-shot Point Cloud Completion Via 2D Priors + + +
+ 3D point cloud completion is designed to recover complete shapes from
+partially observed point clouds. Conventional completion methods typically
+depend on extensive point cloud data for training, with their effectiveness
+often constrained to object categories similar to those seen during training.
+In contrast, we propose a zero-shot framework aimed at completing partially
+observed point clouds across any unseen categories. Leveraging point rendering
+via Gaussian Splatting, we develop techniques of Point Cloud Colorization and
+Zero-shot Fractal Completion that utilize 2D priors from pre-trained diffusion
+models to infer missing regions. Experimental results on both synthetic and
+real-world scanned point clouds demonstrate that our approach outperforms
+existing methods in completing a variety of objects without any requirement
+for specific training data.
+
+
+
+
+
+ + ☆ MedRG: Medical Report Grounding with Multi-modal Large Language Model + + +
+ Medical Report Grounding is pivotal in identifying the most relevant regions
+in medical images based on a given phrase query, a critical aspect in medical
+image analysis and radiological diagnosis. However, prevailing visual
+grounding approaches necessitate the manual extraction of key phrases from
+medical reports, imposing substantial burdens on both system efficiency and
+physicians. In this paper, we introduce a novel framework, Medical Report
+Grounding (MedRG), an end-to-end solution that utilizes a multi-modal Large
+Language Model to predict key phrases by incorporating a unique token, BOX,
+into the vocabulary to serve as an embedding for unlocking detection
+capabilities. Subsequently, the vision encoder-decoder jointly decodes the
+hidden embedding and the input medical image, generating the corresponding
+grounding box. The experimental results validate the effectiveness of MedRG,
+surpassing the performance of the existing state-of-the-art medical phrase
+grounding methods. This study represents a pioneering exploration of the
+medical report grounding task, marking the first-ever endeavor in this domain.
+
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Urban Architect: Steerable 3D Urban Scene Generation with Layout Prior + + +
+ Text-to-3D generation has achieved remarkable success via large-scale +text-to-image diffusion models. Nevertheless, there is no paradigm for scaling +up the methodology to urban scale. Urban scenes, characterized by numerous +elements, intricate arrangement relationships, and vast scale, present a +formidable barrier to the interpretability of ambiguous textual descriptions +for effective model optimization. In this work, we surmount the limitations by +introducing a compositional 3D layout representation into text-to-3D paradigm, +serving as an additional prior. It comprises a set of semantic primitives with +simple geometric structures and explicit arrangement relationships, +complementing textual descriptions and enabling steerable generation. Upon +this, we propose two modifications -- (1) We introduce Layout-Guided +Variational Score Distillation to address model optimization inadequacies. It +conditions the score distillation sampling process with geometric and semantic +constraints of 3D layouts. (2) To handle the unbounded nature of urban scenes, +we represent 3D scene with a Scalable Hash Grid structure, incrementally +adapting to the growing scale of urban scenes. Extensive experiments +substantiate the capability of our framework to scale text-to-3D generation to +large-scale urban scenes that cover over 1000m driving distance for the first +time. We also present various scene editing demonstrations, showing the powers +of steerable urban scene generation. Website: https://urbanarchitect.github.io. + +
+
+ comment: Project page: https://urbanarchitect.github.io/ +
+
+
+
+
+ + ☆ Efficient and Scalable Chinese Vector Font Generation via Component + Composition + + +
+ Chinese vector font generation is challenging due to the complex structure
+and huge number of Chinese characters. Recent advances remain limited to
+generating a small set of characters with simple structure. In this work, we
+first observe that most Chinese characters can be disassembled into
+frequently-reused components. Therefore, we introduce the first efficient and
+scalable Chinese vector font generation approach via component composition,
+allowing numerous vector characters to be generated from a small set of
+components. To achieve this, we collect a large-scale dataset that contains
+over 90K Chinese characters with their components and layout information. Upon
+the dataset, we propose a simple yet effective framework based on spatial
+transformer networks (STN) and multiple losses tailored to font
+characteristics to learn the affine transformation of the components, which
+can be directly applied to the Bézier curves, resulting in Chinese characters
+in vector format. Our qualitative and quantitative experiments have
+demonstrated that our method significantly surpasses the state-of-the-art
+vector font generation methods in generating large-scale complex Chinese
+characters in both font generation and zero-shot font extension.
+
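+ To make the core mechanism concrete: an STN-predicted affine transform can be
+applied directly to Bézier control points, because affine maps preserve Bézier
+curves. The sketch below shows only that step; the 2x3 matrix and the control
+points are made-up example values, not outputs of the paper's network.
+
+import numpy as np
+
+def transform_component(control_points, theta):
+    """Apply a 2x3 affine matrix (e.g. predicted by an STN) to Bezier control points.
+    control_points: (N, 2) array; theta: (2, 3) affine parameters."""
+    homogeneous = np.hstack([control_points, np.ones((len(control_points), 1))])  # (N, 3)
+    return homogeneous @ theta.T                                                   # (N, 2)
+
+# example: place a component in the left part of the glyph box, slightly squeezed
+theta = np.array([[0.45, 0.0, 0.05],
+                  [0.0,  0.95, 0.02]])
+points = np.array([[0.0, 0.0], [0.3, 0.8], [0.7, 0.8], [1.0, 0.0]])  # one cubic curve
+print(transform_component(points, theta))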
+
+ comment: 15 pages, 23 figures +
+
+
+
+
+ + ☆ Object-Conditioned Energy-Based Attention Map Alignment in Text-to-Image + Diffusion Models + + +
+ Text-to-image diffusion models have shown great success in generating +high-quality text-guided images. Yet, these models may still fail to +semantically align generated images with the provided text prompts, leading to +problems like incorrect attribute binding and/or catastrophic object neglect. +Given the pervasive object-oriented structure underlying text prompts, we +introduce a novel object-conditioned Energy-Based Attention Map Alignment +(EBAMA) method to address the aforementioned problems. We show that an +object-centric attribute binding loss naturally emerges by approximately +maximizing the log-likelihood of a $z$-parameterized energy-based model with +the help of the negative sampling technique. We further propose an +object-centric intensity regularizer to prevent excessive shifts of objects +attention towards their attributes. Extensive qualitative and quantitative +experiments, including human evaluation, on several challenging benchmarks +demonstrate the superior performance of our method over previous strong +counterparts. With better aligned attention maps, our approach shows great +promise in further enhancing the text-controlled image editing ability of +diffusion models. + +
+
+
+
+
+ + ☆ Deep Generative Sampling in the Dual Divergence Space: A Data-efficient + & Interpretative Approach for Generative AI + + +
+ Building on the remarkable achievements in generative sampling of natural +images, we propose an innovative challenge, potentially overly ambitious, which +involves generating samples of entire multivariate time series that resemble +images. However, the statistical challenge lies in the small sample size, +sometimes consisting of a few hundred subjects. This issue is especially +problematic for deep generative models that follow the conventional approach of +generating samples from a canonical distribution and then decoding or denoising +them to match the true data distribution. In contrast, our method is grounded +in information theory and aims to implicitly characterize the distribution of +images, particularly the (global and local) dependency structure between +pixels. We achieve this by empirically estimating its KL-divergence in the dual +form with respect to the respective marginal distribution. This enables us to +perform generative sampling directly in the optimized 1-D dual divergence +space. Specifically, in the dual space, training samples representing the data +distribution are embedded in the form of various clusters between two end +points. In theory, any sample embedded between those two end points is +in-distribution w.r.t. the data distribution. Our key idea for generating novel +samples of images is to interpolate between the clusters via a walk as per +gradients of the dual function w.r.t. the data dimensions. In addition to the +data efficiency gained from direct sampling, we propose an algorithm that +offers a significant reduction in sample complexity for estimating the +divergence of the data distribution with respect to the marginal distribution. +We provide strong theoretical guarantees along with an extensive empirical +evaluation using many real-world datasets from diverse domains, establishing +the superiority of our approach w.r.t. state-of-the-art deep learning methods. + +
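+ For background only: the dual (Donsker-Varadhan) form of the KL divergence
+mentioned here, KL(P||Q) = sup_T E_P[T] - log E_Q[exp(T)], can be estimated
+from samples by training a small critic network. The snippet below is a
+generic illustration of that estimator on toy Gaussians, not the paper's
+embedding or sampling procedure; the critic architecture and sample sizes are
+arbitrary.
+
+import math
+import torch
+import torch.nn as nn
+
+# Donsker-Varadhan lower bound:  KL(P || Q) >= E_P[T] - log E_Q[exp(T)]
+critic = nn.Sequential(nn.Linear(2, 64), nn.ReLU(), nn.Linear(64, 1))
+opt = torch.optim.Adam(critic.parameters(), lr=1e-3)
+
+for step in range(2000):
+    p = torch.randn(512, 2) + 1.0                      # samples from P (shifted Gaussian)
+    q = torch.randn(512, 2)                            # samples from Q (reference/marginal)
+    t_p, t_q = critic(p).squeeze(-1), critic(q).squeeze(-1)
+    lower_bound = t_p.mean() - (torch.logsumexp(t_q, dim=0) - math.log(len(q)))
+    loss = -lower_bound                                # maximize the bound
+    opt.zero_grad(); loss.backward(); opt.step()
+
+print("estimated KL(P||Q):", lower_bound.item())       # analytic value here is 1.0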
+
+
+
+
+ + ☆ Improving Multi-Center Generalizability of GAN-Based Fat Suppression + using Federated Learning + + +
+ Generative Adversarial Network (GAN)-based synthesis of fat suppressed (FS) +MRIs from non-FS proton density sequences has the potential to accelerate +acquisition of knee MRIs. However, GANs trained on single-site data have poor +generalizability to external data. We show that federated learning can improve +multi-center generalizability of GANs for synthesizing FS MRIs, while +facilitating privacy-preserving multi-institutional collaborations. + +
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ☆ GANsemble for Small and Imbalanced Data Sets: A Baseline for Synthetic + Microplastics Data + + +
+ Microplastic particle ingestion or inhalation by humans is a problem of +growing concern. Unfortunately, current research methods that use machine +learning to understand their potential harms are obstructed by a lack of +available data. Deep learning techniques in particular are challenged by such +domains where only small or imbalanced data sets are available. Overcoming this +challenge often involves oversampling underrepresented classes or augmenting +the existing data to improve model performance. This paper proposes GANsemble: +a two-module framework connecting data augmentation with conditional generative +adversarial networks (cGANs) to generate class-conditioned synthetic data. +First, the data chooser module automates augmentation strategy selection by +searching for the best data augmentation strategy. Next, the cGAN module uses +this strategy to train a cGAN for generating enhanced synthetic data. We +experiment with the GANsemble framework on a small and imbalanced microplastics +data set. A Microplastic-cGAN (MPcGAN) algorithm is introduced, and baselines +for synthetic microplastics (SYMP) data are established in terms of Frechet +Inception Distance (FID) and Inception Scores (IS). We also provide a synthetic +microplastics filter (SYMP-Filter) algorithm to increase the quality of +generated SYMP. Additionally, we show the best amount of oversampling with +augmentation to fix class imbalance in small microplastics data sets. To our +knowledge, this study is the first application of generative AI to +synthetically create microplastics data. + +
+
+ comment: Accepted to the 37th Canadian Artificial Intelligence Conference + (2024), 12 pages, 4 figures +
+
+
+
+
+ + ☆ A Transformer-Based Model for the Prediction of Human Gaze Behavior on + Videos + + +
+ Eye-tracking applications that utilize the human gaze in video understanding +tasks have become increasingly important. To effectively automate the process +of video analysis based on eye-tracking data, it is important to accurately +replicate human gaze behavior. However, this task presents significant +challenges due to the inherent complexity and ambiguity of human gaze patterns. +In this work, we introduce a novel method for simulating human gaze behavior. +Our approach uses a transformer-based reinforcement learning algorithm to train +an agent that acts as a human observer, with the primary role of watching +videos and simulating human gaze behavior. We employed an eye-tracking dataset +gathered from videos generated by the VirtualHome simulator, with a primary +focus on activity recognition. Our experimental results demonstrate the +effectiveness of our gaze prediction method by highlighting its capability to +replicate human gaze behavior and its applicability for downstream tasks where +real human-gaze is used as input. + +
+
+ comment: 2024 Symposium on Eye Tracking Research and Applications (ETRA24), + Glasgow, United Kingdom +
+
+
+
+
+ + ☆ Gaze-Guided Graph Neural Network for Action Anticipation Conditioned on + Intention + + +
+ Humans utilize their gaze to concentrate on essential information while +perceiving and interpreting intentions in videos. Incorporating human gaze into +computational algorithms can significantly enhance model performance in video +understanding tasks. In this work, we address a challenging and innovative task +in video understanding: predicting the actions of an agent in a video based on +a partial video. We introduce the Gaze-guided Action Anticipation algorithm, +which establishes a visual-semantic graph from the video input. Our method +utilizes a Graph Neural Network to recognize the agent's intention and predict +the action sequence to fulfill this intention. To assess the efficiency of our +approach, we collect a dataset containing household activities generated in the +VirtualHome environment, accompanied by human gaze data of viewing videos. Our +method outperforms state-of-the-art techniques, achieving a 7\% improvement in +accuracy for 18-class intention recognition. This highlights the efficiency of +our method in learning important features from human gaze data. + +
+
+ comment: 2024 Symposium on Eye Tracking Research and Applications (ETRA24), + Glasgow, United Kingdom +
+
+
+
+
+ + ☆ PEAVS: Perceptual Evaluation of Audio-Visual Synchrony Grounded in + Viewers' Opinion Scores + + +
+ Recent advancements in audio-visual generative modeling have been propelled
+by progress in deep learning and the availability of data-rich benchmarks.
+However, the growth is not attributed solely to models and benchmarks.
+Universally accepted evaluation metrics also play an important role in
+advancing the field. While there are many metrics available to evaluate audio
+and visual content separately, there is a lack of metrics that offer a
+quantitative and interpretable measure of audio-visual synchronization for
+videos "in the wild". To address this gap, we first created a large-scale
+human-annotated dataset (100+ hrs) representing nine types of synchronization
+errors in audio-visual content and how humans perceive them. We then developed
+the PEAVS (Perceptual Evaluation of Audio-Visual Synchrony) score, a novel
+automatic metric with a 5-point scale that evaluates the quality of
+audio-visual synchronization. We validate PEAVS using a newly generated
+dataset, achieving a Pearson correlation of 0.79 at the set level and 0.54 at
+the clip level when compared to human labels. In our experiments, we observe a
+relative gain of 50% over a natural extension of Fréchet-based metrics for
+audio-visual synchrony, confirming the efficacy of PEAVS in objectively
+modeling subjective perceptions of audio-visual synchronization for videos "in
+the wild".
+
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ Rethinking Perceptual Metrics for Medical Image Translation + + +
+ Modern medical image translation methods use generative models for tasks such +as the conversion of CT images to MRI. Evaluating these methods typically +relies on some chosen downstream task in the target domain, such as +segmentation. On the other hand, task-agnostic metrics are attractive, such as +the network feature-based perceptual metrics (e.g., FID) that are common to +image translation in general computer vision. In this paper, we investigate +evaluation metrics for medical image translation on two medical image +translation tasks (GE breast MRI to Siemens breast MRI and lumbar spine MRI to +CT), tested on various state-of-the-art translation methods. We show that +perceptual metrics do not generally correlate with segmentation metrics due to +them extending poorly to the anatomical constraints of this sub-field, with FID +being especially inconsistent. However, we find that the lesser-used +pixel-level SWD metric may be useful for subtle intra-modality translation. Our +results demonstrate the need for further research into helpful metrics for +medical image translation. + +
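+ For readers unfamiliar with the SWD metric mentioned here, below is a generic
+Monte-Carlo sliced Wasserstein distance between two equally sized feature
+sets. The paper applies SWD at the pixel level, so treat the random features
+and sample sizes in this sketch as placeholders.
+
+import numpy as np
+
+def sliced_wasserstein(x, y, num_projections=256, seed=0):
+    """Monte-Carlo sliced 1-Wasserstein distance between two (N, D) point sets."""
+    rng = np.random.default_rng(seed)
+    proj = rng.normal(size=(x.shape[1], num_projections))
+    proj /= np.linalg.norm(proj, axis=0, keepdims=True)   # random unit directions
+    x_proj = np.sort(x @ proj, axis=0)                    # sorted 1-D projections
+    y_proj = np.sort(y @ proj, axis=0)
+    return np.mean(np.abs(x_proj - y_proj))               # average 1-D Wasserstein-1
+
+# example with random features: a shifted set yields a larger distance
+print(sliced_wasserstein(np.random.randn(1000, 64), np.random.randn(1000, 64) + 0.5))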
+
+
+
+
+ + ☆ AI-Guided Defect Detection Techniques to Model Single Crystal Diamond + Growth + + +
+ From a process development perspective, diamond growth via chemical vapor +deposition has made significant strides. However, challenges persist in +achieving high quality and large-area material production. These difficulties +include controlling conditions to maintain uniform growth rates for the entire +growth surface. As growth progresses, various factors or defect states emerge, +altering the uniform conditions. These changes affect the growth rate and +result in the formation of crystalline defects at the microscale. However, +there is a distinct lack of methods to identify these defect states and their +geometry using images taken during the growth process. This paper details +seminal work on defect segmentation pipeline using in-situ optical images to +identify features that indicate defective states that are visible at the +macroscale. Using a semantic segmentation approach as applied in our previous +work, these defect states and corresponding derivative features are isolated +and classified by their pixel masks. Using an annotation focused +human-in-the-loop software architecture to produce training datasets, with +modules for selective data labeling using active learning, data augmentations, +and model-assisted labeling, our approach achieves effective annotation +accuracy and drastically reduces the time and cost of labeling by orders of +magnitude. On the model development front, we found that deep learning-based +algorithms are the most efficient. They can accurately learn complex +representations from feature-rich datasets. Our best-performing model, based on +the YOLOV3 and DeeplabV3plus architectures, achieved excellent accuracy for +specific features of interest. Specifically, it reached 93.35% accuracy for +center defects, 92.83% for polycrystalline defects, and 91.98% for edge +defects. + +
+
+ comment: 12 pages,4 figures,ACMME 2024 +
+
+
+
+
+ + ☆ Solving Masked Jigsaw Puzzles with Diffusion Vision Transformers + + +
+ Solving image and video jigsaw puzzles poses the challenging task of +rearranging image fragments or video frames from unordered sequences to restore +meaningful images and video sequences. Existing approaches often hinge on +discriminative models tasked with predicting either the absolute positions of +puzzle elements or the permutation actions applied to the original data. +Unfortunately, these methods face limitations in effectively solving puzzles +with a large number of elements. In this paper, we propose JPDVT, an innovative +approach that harnesses diffusion transformers to address this challenge. +Specifically, we generate positional information for image patches or video +frames, conditioned on their underlying visual content. This information is +then employed to accurately assemble the puzzle pieces in their correct +positions, even in scenarios involving missing pieces. Our method achieves +state-of-the-art performance on several datasets. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ Logit Calibration and Feature Contrast for Robust Federated Learning on + Non-IID Data + + +
+ Federated learning (FL) is a privacy-preserving distributed framework for
+collaborative model training on devices in edge networks. However, challenges
+arise due to vulnerability to adversarial examples (AEs) and the
+non-independent and identically distributed (non-IID) nature of data
+distribution among devices, hindering the deployment of adversarially robust
+and accurate learning models at the edge. While adversarial training (AT) is
+commonly acknowledged as an effective defense strategy against adversarial
+attacks in centralized training, we shed light on the adverse effects of
+directly applying AT in FL that can severely compromise accuracy, especially
+in non-IID challenges. Given this limitation, this paper proposes FatCC, which
+incorporates local logit Calibration and global feature Contrast into the
+vanilla federated adversarial training (FAT) process from both logit and
+feature perspectives. This approach can effectively enhance the federated
+system's robust accuracy (RA) and clean accuracy (CA). First, we propose logit
+calibration, where the logits are calibrated during local adversarial updates,
+thereby improving adversarial robustness. Second, FatCC introduces feature
+contrast, which involves a global alignment term that aligns each local
+representation with unbiased global features, thus further enhancing
+robustness and accuracy in federated adversarial environments. Extensive
+experiments across multiple datasets demonstrate that FatCC achieves
+comparable or superior performance gains in both CA and RA compared to other
+baselines.
+
+
+
+
+
+ + ☆ Adapting LLaMA Decoder to Vision Transformer + + +
+ This work examines whether decoder-only Transformers such as LLaMA, which
+were originally designed for large language models (LLMs), can be adapted to
+the computer vision field. We first "LLaMAfy" a standard ViT step-by-step to
+align with LLaMA's architecture, and find that directly applying a causal mask
+to the self-attention brings an attention collapse issue, resulting in the
+failure of network training. We suggest repositioning the class token behind
+the image tokens with a post-sequence class token technique to overcome this
+challenge, enabling causal self-attention to efficiently capture the entire
+image's information. Additionally, we develop a soft mask strategy that
+gradually introduces a causal mask to the self-attention at the onset of
+training to facilitate the optimization behavior. The tailored model, dubbed
+image LLaMA (iLLaMA), is akin to LLaMA in architecture and enables direct
+supervised learning. Its causal self-attention boosts computational efficiency
+and learns complex representations by elevating attention map ranks. iLLaMA
+rivals the performance of its encoder-only counterparts, achieving 75.1%
+ImageNet top-1 accuracy with only 5.7M parameters. Scaling the model to ~310M
+parameters and pre-training on ImageNet-21K further enhances the accuracy to
+86.0%. Extensive experiments demonstrate iLLaMA's reliable properties:
+calibration, shape-texture bias, quantization compatibility, ADE20K
+segmentation and CIFAR transfer learning. We hope our study can kindle fresh
+views to visual model design in the wave of LLMs. Pre-trained models and codes
+are available here.
+
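+ To illustrate the post-sequence class token idea (a sketch of the masking
+geometry only; the soft-mask warm-up and the rest of the architecture are
+omitted), the snippet below builds a standard causal attention mask for a
+sequence in which the class token is the last position, so it can attend to
+every image token.
+
+import torch
+
+def causal_mask_with_post_cls(num_patches: int):
+    """Boolean attention mask (True = blocked) for image tokens followed by one class token."""
+    n = num_patches + 1                                  # +1 for the trailing class token
+    return torch.triu(torch.ones(n, n, dtype=torch.bool), diagonal=1)
+
+mask = causal_mask_with_post_cls(4)
+print(mask.int())
+# the last row is all zeros: the class token (last position) attends to every image token,
+# which would not be possible if it were prepended as in a standard ViT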
+
+ comment: 22 pages, 10 figures +
+
+
+
+
+ + ☆ MonoSelfRecon: Purely Self-Supervised Explicit Generalizable 3D + Reconstruction of Indoor Scenes from Monocular RGB Views + + +
+ Current monocular 3D scene reconstruction (3DR) works are either fully
+supervised, not generalizable, or implicit in their 3D representation. We
+propose a novel framework, MonoSelfRecon, that for the first time achieves
+explicit 3D mesh reconstruction for generalizable indoor scenes with monocular
+RGB views by pure self-supervision on voxel-SDF (signed distance function).
+MonoSelfRecon follows an autoencoder-based architecture and decodes voxel-SDF
+and a generalizable Neural Radiance Field (NeRF), which is used to guide the
+voxel-SDF in self-supervision. We propose novel self-supervised losses, which
+not only support pure self-supervision but can also be used together with
+supervised signals to further boost supervised training. Our experiments show
+that MonoSelfRecon trained with pure self-supervision outperforms the current
+best self-supervised indoor depth estimation models and is comparable to 3DR
+models trained with full supervision using depth annotations. MonoSelfRecon is
+not restricted by a specific model design and can be applied to any model with
+voxel-SDF in a purely self-supervised manner.
+
+
+
+
+
+ + ☆ YOLO based Ocean Eddy Localization with AWS SageMaker + + +
+ Ocean eddies play a significant role both on the sea surface and beneath it,
+contributing to the sustainability of marine life dependent on oceanic
+behaviors. Therefore, it is crucial to investigate ocean eddies to monitor
+changes in the Earth, particularly in the oceans, and their impact on climate.
+This study aims to pinpoint ocean eddies using AWS cloud services,
+specifically SageMaker. The primary objective is to detect small-scale (<20
+km) ocean eddies from satellite remote sensing images and assess the
+feasibility of utilizing SageMaker, which offers tools for deploying AI
+applications. Moreover, this research not only explores the deployment of
+cloud-based services for remote sensing of Earth data but also evaluates
+several YOLO (You Only Look Once) models using single and multi-GPU-based
+services in the cloud. Furthermore, this study underscores the potential of
+these services, their limitations, challenges related to deployment and
+resource management, and their user-friendliness for Earth science projects.
+
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ An Animation-based Augmentation Approach for Action Recognition from + Discontinuous Video + + +
+ The study of action recognition has attracted considerable attention recently
+due to its broad applications in multiple areas. However, the issue of
+discontinuous training video, which not only decreases the performance of
+action recognition models but also complicates the data augmentation process,
+remains under-explored. In this study, we introduce 4A (Action
+Animation-based Augmentation Approach), an innovative data augmentation
+pipeline that addresses this problem. The main contributions of our work
+include: (1) we investigate the severe performance drop of action recognition
+models trained on discontinuous video and the limitations of existing
+augmentation methods in solving this problem; (2) we propose a novel
+augmentation pipeline, 4A, to address the problem of discontinuous video for
+training, while achieving a smoother and more natural-looking action
+representation than the latest data augmentation methodology; (3) we achieve
+the same performance with only 10% of the original training data as with all
+of the original data from the real-world dataset, and better performance on
+in-the-wild videos, by employing our data augmentation techniques.
+
+
+
+
+
+ + ☆ Bayesian NeRF: Quantifying Uncertainty with Volume Density in Neural + Radiance Fields + + +
+ We present the Bayesian Neural Radiance Field (NeRF), which explicitly +quantifies uncertainty in geometric volume structures without the need for +additional networks, making it adept for challenging observations and +uncontrolled images. NeRF diverges from traditional geometric methods by +offering an enriched scene representation, rendering color and density in 3D +space from various viewpoints. However, NeRF encounters limitations in relaxing +uncertainties by using geometric structure information, leading to inaccuracies +in interpretation under insufficient real-world observations. Recent research +efforts aimed at addressing this issue have primarily relied on empirical +methods or auxiliary networks. To fundamentally address this issue, we propose +a series of formulational extensions to NeRF. By introducing generalized +approximations and defining density-related uncertainty, our method seamlessly +extends to manage uncertainty not only for RGB but also for depth, without the +need for additional networks or empirical assumptions. In experiments we show +that our method significantly enhances performance on RGB and depth images in +the comprehensive dataset, demonstrating the reliability of the Bayesian NeRF +approach to quantifying uncertainty based on the geometric structure. + +
+
+
+
+
+ + ☆ Sparse Points to Dense Clouds: Enhancing 3D Detection with Limited LiDAR + Data + + +
+ 3D detection is a critical task that enables machines to identify and locate
+objects in three-dimensional space. It has a broad range of applications in
+several fields, including autonomous driving, robotics and augmented reality.
+Monocular 3D detection is attractive as it requires only a single camera;
+however, it lacks the accuracy and robustness required for real-world
+applications. High-resolution LiDAR, on the other hand, can be expensive and
+lead to interference problems in heavy traffic given its active transmissions.
+We propose a balanced approach that combines the advantages of monocular and
+point cloud-based 3D detection. Our method requires only a small number of 3D
+points, which can be obtained from a low-cost, low-resolution sensor.
+Specifically, we use only 512 points, which is just 1% of a full LiDAR frame
+in the KITTI dataset. Our method reconstructs a complete 3D point cloud from
+this limited 3D information combined with a single image. The reconstructed 3D
+point cloud and corresponding image can be used by any multi-modal
+off-the-shelf detector for 3D object detection. By using the proposed network
+architecture with an off-the-shelf multi-modal 3D detector, the accuracy of 3D
+detection improves by 20% compared to the state-of-the-art monocular detection
+methods and by 6% to 9% compared to the baseline multi-modal methods on the
+KITTI and JackRabbot datasets.
+
+
+
+
+
+ + ☆ Convolution-based Probability Gradient Loss for Semantic Segmentation + + +
+ In this paper, we introduce a novel Convolution-based Probability Gradient +(CPG) loss for semantic segmentation. It employs convolution kernels similar to +the Sobel operator, capable of computing the gradient of pixel intensity in an +image. This enables the computation of gradients for both ground-truth and +predicted category-wise probabilities. It enhances network performance by +maximizing the similarity between these two probability gradients. Moreover, to +specifically enhance accuracy near the object's boundary, we extract the object +boundary based on the ground-truth probability gradient and exclusively apply +the CPG loss to pixels belonging to boundaries. CPG loss proves to be highly +convenient and effective. It establishes pixel relationships through +convolution, calculating errors from a distinct dimension compared to +pixel-wise loss functions such as cross-entropy loss. We conduct qualitative +and quantitative analyses to evaluate the impact of the CPG loss on three +well-established networks (DeepLabv3-Resnet50, HRNetV2-OCR, and +LRASPP_MobileNet_V3_Large) across three standard segmentation datasets +(Cityscapes, COCO-Stuff, ADE20K). Our extensive experimental results +consistently and significantly demonstrate that the CPG loss enhances the mean +Intersection over Union. + +
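+ A minimal sketch of the idea (not the authors' implementation): compute
+Sobel-style gradients of both the predicted class probabilities and the
+one-hot ground truth with a depthwise convolution and penalize their
+difference. The boundary-only restriction described above is omitted, and the
+L1 penalty is an assumption.
+
+import torch
+import torch.nn.functional as F
+
+def probability_gradient_loss(pred_logits, target, num_classes):
+    """Compare Sobel gradients of predicted class probabilities and one-hot ground truth."""
+    prob = pred_logits.softmax(dim=1)                                    # (B, C, H, W)
+    onehot = F.one_hot(target, num_classes).permute(0, 3, 1, 2).float()  # (B, C, H, W)
+    sobel_x = torch.tensor([[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]])
+    kernels = torch.stack([sobel_x, sobel_x.t()]).unsqueeze(1)           # (2, 1, 3, 3): x and y
+    kernels = kernels.repeat(num_classes, 1, 1, 1)                       # two directions per class
+    grad = lambda p: F.conv2d(p, kernels.to(p), padding=1, groups=num_classes)  # depthwise
+    return F.l1_loss(grad(prob), grad(onehot))
+
+loss = probability_gradient_loss(torch.randn(2, 19, 64, 64),
+                                 torch.randint(0, 19, (2, 64, 64)), num_classes=19)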
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ☆ Scaling Multi-Camera 3D Object Detection through Weak-to-Strong + Eliciting + + +
+ The emergence of Multi-Camera 3D Object Detection (MC3D-Det), facilitated by +bird's-eye view (BEV) representation, signifies a notable progression in 3D +object detection. Scaling MC3D-Det training effectively accommodates varied +camera parameters and urban landscapes, paving the way for the MC3D-Det +foundation model. However, the multi-view fusion stage of the MC3D-Det method +relies on the ill-posed monocular perception during training rather than +surround refinement ability, leading to what we term "surround refinement +degradation". To this end, our study presents a weak-to-strong eliciting +framework aimed at enhancing surround refinement while maintaining robust +monocular perception. Specifically, our framework employs weakly tuned experts +trained on distinct subsets, and each is inherently biased toward specific +camera configurations and scenarios. These biased experts can learn the +perception of monocular degeneration, which can help the multi-view fusion +stage to enhance surround refinement abilities. Moreover, a composite +distillation strategy is proposed to integrate the universal knowledge of 2D +foundation models and task-specific information. Finally, for MC3D-Det joint +training, the elaborate dataset merge strategy is designed to solve the problem +of inconsistent camera numbers and camera parameters. We set up a multiple +dataset joint training benchmark for MC3D-Det and adequately evaluated existing +methods. Further, we demonstrate the proposed framework brings a generalized +and significant boost over multiple baselines. Our code is at +\url{https://github.com/EnVision-Research/Scale-BEV}. + +
+
+
+
+
+ + ☆ Binomial Self-compensation for Motion Error in Dynamic 3D Scanning + + +
+ Phase shifting profilometry (PSP) is favored in high-precision 3D scanning +due to its high accuracy, robustness, and pixel-wise property. However, a +fundamental assumption of PSP that the object should remain static is violated +in dynamic measurement, making PSP susceptible to object moving, resulting in +ripple-like errors in the point clouds. We propose a pixel-wise and frame-wise +loopable binomial self-compensation (BSC) algorithm to effectively and flexibly +eliminate motion error in the four-step PSP. Our mathematical model +demonstrates that by summing successive motion-affected phase frames weighted +by binomial coefficients, motion error exponentially diminishes as the binomial +order increases, accomplishing automatic error compensation through the +motion-affected phase sequence, without the assistance of any intermediate +variable. Extensive experiments show that our BSC outperforms the existing +methods in reducing motion error, while achieving a depth map frame rate equal +to the camera's acquisition rate (90 fps), enabling high-accuracy 3D +reconstruction with a quasi-single-shot frame rate. + +
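+
+ As a rough sketch of the weighting idea only (the exact phase arithmetic,
+including handling of the 2*pi phase wrap, is not reproduced here), successive
+motion-affected phase frames can be combined with binomial-coefficient
+weights:
+```python
+from math import comb
+import numpy as np
+
+def binomial_self_compensation(phase_frames, order):
+    """Combine `order + 1` successive phase maps with binomial weights.
+
+    phase_frames: sequence of per-pixel phase maps (H, W), newest last.
+    Normalizing by 2**order makes the weights sum to 1; phase wrapping is
+    deliberately ignored in this illustrative sketch.
+    """
+    frames = np.asarray(phase_frames[-(order + 1):], dtype=np.float64)
+    weights = np.array([comb(order, k) for k in range(order + 1)], dtype=np.float64)
+    weights /= weights.sum()  # sum of C(order, k) over k equals 2**order
+    return np.tensordot(weights, frames, axes=1)  # weighted sum over frames
+```
+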
+
+
+
+
+ + ☆ Perception-Oriented Video Frame Interpolation via Asymmetric Blending CVPR 2024 + + +
+ Previous methods for Video Frame Interpolation (VFI) have encountered +challenges, notably the manifestation of blur and ghosting effects. These +issues can be traced back to two pivotal factors: unavoidable motion errors and +misalignment in supervision. In practice, motion estimates often prove to be +error-prone, resulting in misaligned features. Furthermore, the reconstruction +loss tends to bring blurry results, particularly in misaligned regions. To +mitigate these challenges, we propose a new paradigm called PerVFI +(Perception-oriented Video Frame Interpolation). Our approach incorporates an +Asymmetric Synergistic Blending module (ASB) that utilizes features from both +sides to synergistically blend intermediate features. One reference frame +emphasizes primary content, while the other contributes complementary +information. To impose a stringent constraint on the blending process, we +introduce a self-learned sparse quasi-binary mask which effectively mitigates +ghosting and blur artifacts in the output. Additionally, we employ a +normalizing flow-based generator and utilize the negative log-likelihood loss +to learn the conditional distribution of the output, which further facilitates +the generation of clear and fine details. Experimental results validate the +superiority of PerVFI, demonstrating significant improvements in perceptual +quality compared to existing methods. Codes are available at +\url{https://github.com/mulns/PerVFI} + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Unsupervised Visible-Infrared ReID via Pseudo-label Correction and + Modality-level Alignment + + +
+ Unsupervised visible-infrared person re-identification (UVI-ReID) has
+recently gained great attention due to its potential for enhancing human
+detection in diverse environments without labeling. Previous methods utilize
+intra-modality clustering and cross-modality feature matching to achieve
+UVI-ReID. However, there exist two challenges: 1) noisy pseudo labels might be
+generated in the clustering process, and 2) the cross-modality feature
+alignment via matching the marginal distribution of visible and infrared
+modalities may misalign different identities across the two modalities. In
+this paper, we first conduct a theoretical analysis in which an interpretable
+generalization upper bound is introduced. Based on the analysis, we then
+propose a novel unsupervised cross-modality person re-identification framework
+(PRAISE). Specifically, to address the first challenge, we propose a
+pseudo-label correction strategy that utilizes a Beta Mixture Model to predict
+the probability of mis-clustering based on the network's memory effect and
+rectifies the correspondence by adding a perceptual term to contrastive
+learning. Next, we introduce a modality-level alignment strategy that
+generates paired visible-infrared latent features and reduces the modality gap
+by aligning the labeling function of visible and infrared features to learn
+identity-discriminative and modality-invariant features. Experimental results
+on two benchmark datasets demonstrate that our method achieves
+state-of-the-art performance compared with existing unsupervised
+visible-infrared ReID methods.
+
+

+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ SafeGen: Mitigating Unsafe Content Generation in Text-to-Image Models + + +
+ Text-to-image (T2I) models, such as Stable Diffusion, have exhibited
+remarkable performance in generating high-quality images from text
+descriptions in recent years. However, text-to-image models may be tricked
+into generating not-safe-for-work (NSFW) content, particularly in sexual
+scenarios. Existing countermeasures mostly focus on filtering inappropriate
+inputs and outputs, or suppressing improper text embeddings, which can block
+explicit NSFW-related content (e.g., naked or sexy) but may still be
+vulnerable to adversarial prompt inputs that appear innocent but are
+ill-intended. In this paper, we present SafeGen, a framework to mitigate
+unsafe content generation by text-to-image models in a text-agnostic manner.
+The key idea is to eliminate unsafe visual representations from the model
+regardless of the text input. In this way, the text-to-image model is
+resistant to adversarial prompts since unsafe visual representations are
+obstructed from within. Extensive experiments conducted on four datasets
+demonstrate SafeGen's effectiveness in mitigating unsafe content generation
+while preserving the high fidelity of benign images. SafeGen outperforms eight
+state-of-the-art baseline methods and achieves 99.1% sexual content removal
+performance. Furthermore, our constructed benchmark of adversarial prompts
+provides a basis for future development and evaluation of
+anti-NSFW-generation methods.
+
+

+
+
+
+
+ + ☆ Deep Generative Data Assimilation in Multimodal Setting CVPR2024 + + +
+ Robust integration of physical knowledge and data is key to improving
+computational simulations, such as Earth system models. Data assimilation is
+crucial for achieving this goal because it provides a systematic framework to
+calibrate model outputs with observations, which can include remote sensing
+imagery and ground station measurements, with uncertainty quantification.
+Conventional methods, including Kalman filters and variational approaches,
+inherently rely on simplifying linear and Gaussian assumptions, and can be
+computationally expensive. Nevertheless, with the rapid adoption of
+data-driven methods in many areas of computational sciences, we see the
+potential of emulating traditional data assimilation with deep learning,
+especially generative models. In particular, the diffusion-based probabilistic
+framework has large overlaps with data assimilation principles: both allow for
+conditional generation of samples within a Bayesian inverse framework. These
+models have shown remarkable success in text-conditioned image generation or
+image-controlled video synthesis. Likewise, one can frame data assimilation as
+observation-conditioned state calibration. In this work, we propose SLAMS:
+Score-based Latent Assimilation in Multimodal Setting. Specifically, we
+assimilate in-situ weather station data and ex-situ satellite imagery to
+calibrate vertical temperature profiles globally. Through extensive ablation,
+we demonstrate that SLAMS is robust even in low-resolution, noisy, and sparse
+data settings. To our knowledge, our work is the first to apply a deep
+generative framework to multimodal data assimilation using real-world
+datasets; an important step for building robust computational simulators,
+including the next-generation Earth system models. Our code is available at:
+https://github.com/yongquan-qu/SLAMS
+
+

+
+ comment: Accepted to CVPR2024 EarthVision +
+
+
+
+
+ + ☆ Multi-modal Document Presentation Attack Detection With Forensics Trace + Disentanglement ICME 2024 + + +
+ Document Presentation Attack Detection (DPAD) is an important measure in +protecting the authenticity of a document image. However, recent DPAD methods +demand additional resources, such as manual effort in collecting additional +data or knowing the parameters of acquisition devices. This work proposes a +DPAD method based on multi-modal disentangled traces (MMDT) without the above +drawbacks. We first disentangle the recaptured traces by a self-supervised +disentanglement and synthesis network to enhance the generalization capacity in +document images with different contents and layouts. Then, unlike the existing +DPAD approaches that rely only on data in the RGB domain, we propose to +explicitly employ the disentangled recaptured traces as new modalities in the +transformer backbone through adaptive multi-modal adapters to fuse RGB/trace +features efficiently. Visualization of the disentangled traces confirms the +effectiveness of the proposed method in different document contents. Extensive +experiments on three benchmark datasets demonstrate the superiority of our MMDT +method on representing forensic traces of recapturing distortion. + +
+
+ comment: Accepted to ICME 2024 +
+
+
+
+
+ + ☆ Efficient Denoising using Score Embedding in Score-based Diffusion + Models + + +
+ It is well known that training denoising score-based diffusion models
+requires tens of thousands of epochs and a substantial amount of image data.
+In this paper, we propose a method to increase the efficiency of training
+score-based diffusion models. Our method allows us to decrease the number of
+epochs needed to train the diffusion model. We accomplish this by solving the
+log-density Fokker-Planck (FP) Equation numerically to compute the score
+\textit{before} training. The pre-computed score is embedded into the image to
+encourage faster training under the sliced Wasserstein distance. Consequently,
+it also allows us to decrease the number of images needed to train the neural
+network to learn an accurate score. We demonstrate through our numerical
+experiments the improved performance of our proposed method compared to
+standard score-based diffusion models. Our proposed method achieves quality
+similar to the standard method while training meaningfully faster.
+
+

+
+
+
+
+ + ☆ AI-Guided Feature Segmentation Techniques to Model Features from Single + Crystal Diamond Growth + + +
+ Process refinement to consistently produce high-quality material over a large +area of the grown crystal, enabling various applications from optics crystals +to quantum detectors, has long been a goal for diamond growth. Machine learning +offers a promising path toward this goal, but faces challenges such as the +complexity of features within datasets, their time-dependency, and the volume +of data produced per growth run. Accurate spatial feature extraction from image +to image for real-time monitoring of diamond growth is crucial yet complicated +due to the low-volume and high feature complexity nature of the datasets. This +paper compares various traditional and machine learning-driven approaches for +feature extraction in the diamond growth domain, proposing a novel deep +learning-driven semantic segmentation approach to isolate and classify accurate +pixel masks of geometric features like diamond, pocket holder, and background, +along with their derivative features based on shape and size. Using an +annotation-focused human-in-the-loop software architecture for training +datasets, with modules for selective data labeling using active learning, data +augmentations, and model-assisted labeling, our approach achieves effective +annotation accuracy and drastically reduces labeling time and cost. Deep +learning algorithms prove highly efficient in accurately learning complex +representations from datasets with many features. Our top-performing model, +based on the DeeplabV3plus architecture, achieves outstanding accuracy in +classifying features of interest, with accuracies of 96.31% for pocket holder, +98.60% for diamond top, and 91.64% for diamond side features. + +
+
+ comment: 12 pages,4 figures,ACMME 2024. arXiv admin note: substantial text + overlap with arXiv:2404.07306 +
+
+
+
+
+ + ☆ Enhanced Cooperative Perception for Autonomous Vehicles Using Imperfect + Communication + + +
+ Sharing and joint processing of camera feeds and sensor measurements, known
+as Cooperative Perception (CP), has emerged as a new technique to achieve
+higher perception qualities. CP can enhance the safety of Autonomous Vehicles
+(AVs) where their individual visual perception quality is compromised by
+adverse weather conditions (e.g., haze and fog), low illumination, winding
+roads, and crowded traffic. To address the limitations of prior methods, in
+this paper, we propose a novel approach to realize an optimized CP under
+constrained communications. At the core of our approach is recruiting the best
+helper from the available list of front vehicles to augment the visual range
+and enhance the Object Detection (OD) accuracy of the ego vehicle. In this
+two-step process, we first select the helper vehicles that contribute the most
+to CP based on their visual range and lowest motion blur. Next, we implement a
+radio block optimization among the candidate vehicles to further improve
+communication efficiency. We specifically focus on pedestrian detection as an
+exemplary scenario. To validate our approach, we used the CARLA simulator to
+create a dataset of annotated videos for different driving scenarios where
+pedestrian detection is challenging for an AV with compromised vision. Our
+results demonstrate the efficacy of our two-step optimization process in
+improving the overall performance of cooperative perception in challenging
+scenarios, substantially improving driving safety under adverse conditions.
+Finally, we note that the networking assumptions are adopted from LTE Release
+14 Mode 4 side-link communication, commonly used for Vehicle-to-Vehicle (V2V)
+communication. Nonetheless, our method is flexible and applicable to arbitrary
+V2V communications.
+
+

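+
+ A minimal sketch of the first, helper-selection step as described above:
+candidate front vehicles are ranked by a score that rewards visual range and
+penalizes motion blur. The linear score and its weights are illustrative
+assumptions; the radio-block optimization step is not shown.
+```python
+from dataclasses import dataclass
+
+@dataclass
+class Candidate:
+    vehicle_id: str
+    visual_range_m: float  # estimated visual range of the candidate's camera
+    motion_blur: float     # blur metric in [0, 1]; lower is better
+
+def select_helper(candidates, range_weight=1.0, blur_weight=100.0):
+    """Pick the front vehicle maximizing range while minimizing blur.
+    The linear score and the weights are assumptions for illustration only."""
+    return max(candidates, key=lambda c: range_weight * c.visual_range_m
+                                         - blur_weight * c.motion_blur)
+
+helpers = [Candidate("front_1", 60.0, 0.30), Candidate("front_2", 45.0, 0.05)]
+print(select_helper(helpers).vehicle_id)  # front_2: shorter range, much sharper view
+```
+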
+
+
+
+
+ + ☆ An inclusive review on deep learning techniques and their scope in + handwriting recognition + + +
+ Deep learning denotes a category of machine learning algorithms that can
+combine raw inputs into intermediate feature layers. These algorithms have
+demonstrated strong results in many fields. In particular, deep learning has
+achieved human-level performance across a number of domains in computer vision
+and pattern recognition. To reach state-of-the-art performance in these
+diverse domains, deep learning relies on different architectures, which in
+turn use activation functions to perform the computations between the hidden
+and output layers. This paper presents a survey of existing studies on deep
+learning in the field of handwriting recognition. Although recent progress
+indicates that deep learning methods provide valuable means for speeding up
+handwriting recognition or producing more accurate results, the extensive
+literature surveyed here shows that deep learning has yet to resolve many of
+the most pressing challenges in this field, even though promising advances
+have been made over the prior state of the art. Additionally, the limited
+availability of labelled training data remains a problem in this domain.
+Nevertheless, this handwriting recognition survey foresees deep learning
+enabling changes with the potential to transform several domains, such as
+image processing, speech recognition, computer vision, machine translation,
+robotics and control, medical imaging, medical information processing,
+bio-informatics, natural language processing, cyber security, and many others.
+
+

+
+
+
+
+ + ♻ ☆ Disentangled Explanations of Neural Network Predictions by Finding + Relevant Subspaces + + +
+ Explainable AI aims to overcome the black-box nature of complex ML models +like neural networks by generating explanations for their predictions. +Explanations often take the form of a heatmap identifying input features (e.g. +pixels) that are relevant to the model's decision. These explanations, however, +entangle the potentially multiple factors that enter into the overall complex +decision strategy. We propose to disentangle explanations by extracting at some +intermediate layer of a neural network, subspaces that capture the multiple and +distinct activation patterns (e.g. visual concepts) that are relevant to the +prediction. To automatically extract these subspaces, we propose two new +analyses, extending principles found in PCA or ICA to explanations. These novel +analyses, which we call principal relevant component analysis (PRCA) and +disentangled relevant subspace analysis (DRSA), maximize relevance instead of +e.g. variance or kurtosis. This allows for a much stronger focus of the +analysis on what the ML model actually uses for predicting, ignoring +activations or concepts to which the model is invariant. Our approach is +general enough to work alongside common attribution techniques such as Shapley +Value, Integrated Gradients, or LRP. Our proposed methods show to be +practically useful and compare favorably to the state of the art as +demonstrated on benchmarks and three use cases. + +
+
+ comment: 17 pages + supplement +
+
+
+
+
+ + ♻ ☆ Deep Learning for Inertial Sensor Alignment + + +
+ Accurate alignment of a fixed mobile device equipped with inertial sensors +inside a moving vehicle is important for navigation, activity recognition, and +other applications. Accurate estimation of the device mounting angle is +required to rotate the inertial measurement from the sensor frame to the moving +platform frame to standardize measurements and improve the performance of the +target task. In this work, a data-driven approach using deep neural networks +(DNNs) is proposed to learn the yaw mounting angle of a smartphone equipped +with an inertial measurement unit (IMU) and strapped to a car. The proposed +model uses only the accelerometer and gyroscope readings from an IMU as input +and, in contrast to existing solutions, does not require global position inputs +from global navigation satellite systems (GNSS). To train the model in a +supervised manner, IMU data is collected for training and validation with the +sensor mounted at a known yaw mounting angle, and a range of ground truth +labels is generated by applying a random rotation in a bounded range to the +measurements. The trained model is tested on data with real rotations showing +similar performance as with synthetic rotations. The trained model is deployed +on an Android device and evaluated in real-time to test the accuracy of the +estimated yaw mounting angle. The model is shown to find the mounting angle at +an accuracy of 8 degrees within 5 seconds, and 4 degrees within 27 seconds. An +experiment is conducted to compare the proposed model with an existing +off-the-shelf solution. + +
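+
+ A small sketch of the label-generation idea described above, assuming the
+synthetic rotation is a yaw about the sensor z axis applied to 3-axis
+accelerometer and gyroscope samples; the bounded range and the axis convention
+are assumptions.
+```python
+import numpy as np
+
+def rotate_yaw(imu_xyz, yaw_rad):
+    """Rotate 3-axis IMU samples of shape (N, 3) about the z axis."""
+    c, s = np.cos(yaw_rad), np.sin(yaw_rad)
+    rot = np.array([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])
+    return imu_xyz @ rot.T
+
+def make_training_pair(accel, gyro, max_yaw_deg=180.0, rng=None):
+    """From data recorded at a known mounting angle, apply a random yaw in a
+    bounded range to both sensors and use that yaw as the ground-truth label."""
+    rng = rng or np.random.default_rng()
+    yaw = np.deg2rad(rng.uniform(-max_yaw_deg, max_yaw_deg))
+    return (rotate_yaw(accel, yaw), rotate_yaw(gyro, yaw)), yaw
+```
+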
+
+ comment: 9 Pages, Preprint. Accepted IEEE +
+
+
+
+
+ + ♻ ☆ GLiDR: Topologically Regularized Graph Generative Network for Sparse + LiDAR Point Clouds CVPR + + +
+ Sparse LiDAR point clouds cause severe loss of detail of static structures +and reduce the density of static points available for navigation. Reduced +density can be detrimental to navigation under several scenarios. We observe +that despite high sparsity, in most cases, the global topology of LiDAR +outlining the static structures can be inferred. We utilize this property to +obtain a backbone skeleton of a LiDAR scan in the form of a single connected +component that is a proxy to its global topology. We utilize the backbone to +augment new points along static structures to overcome sparsity. Newly +introduced points could correspond to existing static structures or to static +points that were earlier obstructed by dynamic objects. To the best of our +knowledge, we are the first to use such a strategy for sparse LiDAR point +clouds. Existing solutions close to our approach fail to identify and preserve +the global static LiDAR topology and generate sub-optimal points. We propose +GLiDR, a Graph Generative network that is topologically regularized using +0-dimensional Persistent Homology ($\mathcal{PH}$) constraints. This enables +GLiDR to introduce newer static points along a topologically consistent global +static LiDAR backbone. GLiDR generates precise static points using $32\times$ +sparser dynamic scans and performs better than the baselines across three +datasets. GLiDR generates a valuable byproduct - an accurate binary +segmentation mask of static and dynamic objects that are helpful for navigation +planning and safety in constrained environments. The newly introduced static +points allow GLiDR to outperform LiDAR-based navigation using SLAM in several +settings. Source code is available at +$\texttt{https://github.com/GLiDR-CVPR2024/GLiDR}$. + +
+
+ comment: IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR) +
+
+
+
+
+ + ♻ ☆ CLOVA: A Closed-Loop Visual Assistant with Tool Usage and Update CVPR 2024 + + +
+ Utilizing large language models (LLMs) to compose off-the-shelf visual tools +represents a promising avenue of research for developing robust visual +assistants capable of addressing diverse visual tasks. However, these methods +often overlook the potential for continual learning, typically by freezing the +utilized tools, thus limiting their adaptation to environments requiring new +knowledge. To tackle this challenge, we propose CLOVA, a Closed-Loop Visual +Assistant, which operates within a framework encompassing inference, +reflection, and learning phases. During the inference phase, LLMs generate +programs and execute corresponding tools to complete assigned tasks. In the +reflection phase, a multimodal global-local reflection scheme analyzes human +feedback to determine which tools require updating. Lastly, the learning phase +employs three flexible approaches to automatically gather training data and +introduces a novel prompt tuning scheme to update the tools, allowing CLOVA to +efficiently acquire new knowledge. Experimental findings demonstrate that CLOVA +surpasses existing tool-usage methods by 5% in visual question answering and +multiple-image reasoning, by 10% in knowledge tagging, and by 20% in image +editing. These results underscore the significance of the continual learning +capability in general visual assistants. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Bias-Reduced Neural Networks for Parameter Estimation in Quantitative + MRI + + +
+ Purpose: To develop neural network (NN)-based quantitative MRI parameter +estimators with minimal bias and a variance close to the Cram\'er-Rao bound. + Theory and Methods: We generalize the mean squared error loss to control the +bias and variance of the NN's estimates, which involves averaging over multiple +noise realizations of the same measurements during training. Bias and variance +properties of the resulting NNs are studied for two neuroimaging applications. + Results: In simulations, the proposed strategy reduces the estimates' bias +throughout parameter space and achieves a variance close to the Cram\'er-Rao +bound. In vivo, we observe good concordance between parameter maps estimated +with the proposed NNs and traditional estimators, such as non-linear +least-squares fitting, while state-of-the-art NNs show larger deviations. + Conclusion: The proposed NNs have greatly reduced bias compared to those +trained using the mean squared error and offer significantly improved +computational efficiency over traditional estimators with comparable or better +accuracy. + +
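+
+ The abstract states only that the mean squared error is generalized to
+control bias and variance by averaging over multiple noise realizations of the
+same measurement; the PyTorch-style sketch below shows one plausible form (the
+bias^2 plus lambda-weighted variance decomposition and the Gaussian noise
+model are assumptions).
+```python
+import torch
+
+def bias_variance_loss(net, clean_signal, true_params, noise_std, k=8, lam=1.0):
+    """Draw k noise realizations per measurement, estimate parameters with
+    `net`, and penalize squared bias plus lam times the estimator variance.
+
+    clean_signal: (B, M) noiseless measurements; true_params: (B, P).
+    """
+    b = clean_signal.shape[0]
+    noisy = clean_signal.unsqueeze(0) + noise_std * torch.randn(
+        k, *clean_signal.shape, device=clean_signal.device)
+    est = net(noisy.reshape(k * b, -1)).reshape(k, b, -1)   # (k, B, P)
+    mean_est = est.mean(dim=0)
+    bias_sq = ((mean_est - true_params) ** 2).mean()
+    variance = est.var(dim=0, unbiased=False).mean()
+    return bias_sq + lam * variance
+```
+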
+
+
+
+
+ + ♻ ☆ MaskClustering: View Consensus based Mask Graph Clustering for + Open-Vocabulary 3D Instance Segmentation + + +
+ Open-vocabulary 3D instance segmentation is cutting-edge for its ability to +segment 3D instances without predefined categories. However, progress in 3D +lags behind its 2D counterpart due to limited annotated 3D data. To address +this, recent works first generate 2D open-vocabulary masks through 2D models +and then merge them into 3D instances based on metrics calculated between two +neighboring frames. In contrast to these local metrics, we propose a novel +metric, view consensus rate, to enhance the utilization of multi-view +observations. The key insight is that two 2D masks should be deemed part of the +same 3D instance if a significant number of other 2D masks from different views +contain both these two masks. Using this metric as edge weight, we construct a +global mask graph where each mask is a node. Through iterative clustering of +masks showing high view consensus, we generate a series of clusters, each +representing a distinct 3D instance. Notably, our model is training-free. +Through extensive experiments on publicly available datasets, including +ScanNet++, ScanNet200 and MatterPort3D, we demonstrate that our method achieves +state-of-the-art performance in open-vocabulary 3D instance segmentation. Our +project page is at https://pku-epic.github.io/MaskClustering. + +
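+
+ The view consensus rate is described only verbally above; the sketch below is
+one plausible formalization in which masks are sets of 3D point indices,
+"contains" means covering most of a mask's points, and the rate normalizes the
+count of observer masks containing both masks by the count containing either
+one (the containment threshold and the normalization are assumptions).
+```python
+def contains(observer, mask, thresh=0.8):
+    """`observer` contains `mask` if it covers at least `thresh` of its points."""
+    return len(observer & mask) >= thresh * max(len(mask), 1)
+
+def view_consensus_rate(mask_i, mask_j, other_view_masks, thresh=0.8):
+    """Fraction of other-view masks containing both mask_i and mask_j, among
+    those containing at least one of them. Masks are sets of 3D point ids."""
+    both = sum(1 for m in other_view_masks
+               if contains(m, mask_i, thresh) and contains(m, mask_j, thresh))
+    either = sum(1 for m in other_view_masks
+                 if contains(m, mask_i, thresh) or contains(m, mask_j, thresh))
+    return both / either if either else 0.0
+
+# Edge weight of the global mask graph between nodes i and j:
+# w_ij = view_consensus_rate(masks[i], masks[j], masks_from_other_views)
+```
+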
+
+
+
+
+ + ♻ ☆ Visual Concept Connectome (VCC): Open World Concept Discovery and their + Interlayer Connections in Deep Models CVPR 2024 + + +
+ Understanding what deep network models capture in their learned
+representations is a fundamental challenge in computer vision. We present a
+new methodology for understanding such vision models, the Visual Concept
+Connectome (VCC), which discovers human interpretable concepts and their
+interlayer connections in a fully unsupervised manner. Our approach
+simultaneously reveals fine-grained concepts at a layer, connection weightings
+across all layers, and is amenable to global analysis of network structure
+(e.g., branching pattern of hierarchical concept assemblies). Previous work
+yielded ways to extract interpretable concepts from single layers and examine
+their impact on classification, but did not afford multilayer concept analysis
+across an entire network architecture. Quantitative and qualitative empirical
+results show the effectiveness of VCCs in the domain of image classification.
+Also, we leverage VCCs for the application of failure mode debugging to reveal
+where mistakes arise in deep networks.
+
+

+
+ comment: CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ Understanding Video Transformers via Universal Concept Discovery CVPR 2024 + + +
+ This paper studies the problem of concept-based interpretability of
+transformer representations for videos. Concretely, we seek to explain the
+decision-making process of video transformers based on high-level,
+spatiotemporal concepts that are automatically discovered. Prior research on
+concept-based interpretability has concentrated solely on image-level tasks.
+Comparatively, video models deal with the added temporal dimension, increasing
+complexity and posing challenges in identifying dynamic concepts over time. In
+this work, we systematically address these challenges by introducing the first
+Video Transformer Concept Discovery (VTCD) algorithm. To this end, we propose
+an efficient approach for unsupervised identification of units of video
+transformer representations - concepts - and for ranking their importance to
+the output of a model. The resulting concepts are highly interpretable,
+revealing spatio-temporal reasoning mechanisms and object-centric
+representations in unstructured video models. Performing this analysis jointly
+over a diverse set of supervised and self-supervised representations, we
+discover that some of these mechanisms are universal in video transformers.
+Finally, we show that VTCD can be used for fine-grained action recognition and
+video object segmentation.
+
+

+
+ comment: CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ Location-guided Head Pose Estimation for Fisheye Image + + +
+ A camera with a fisheye or ultra-wide lens covers a wide field of view that
+cannot be modeled by the perspective projection. Serious fisheye lens
+distortion in the peripheral region of the image leads to degraded performance
+of existing head pose estimation models trained on undistorted images. This
+paper presents a new approach for head pose estimation that uses the knowledge
+of head location in the image to reduce the negative effect of fisheye
+distortion. We develop an end-to-end convolutional neural network to estimate
+the head pose with the multi-task learning of head pose and head location. Our
+proposed network estimates the head pose directly from the fisheye image
+without rectification or calibration. We also created a fisheye-distorted
+version of three popular head pose estimation datasets, BIWI, 300W-LP, and
+AFLW2000, for our experiments. Experimental results show that our network
+remarkably improves the accuracy of head pose estimation compared with other
+state-of-the-art one-stage and two-stage methods.
+
+

+
+ comment: Revised Introduction and Related Work; Submitted to IEEE Transactions
+ on Cognitive and Developmental Systems for review
+

+
+
+
+
+ + ♻ ☆ VMamba: Visual State Space Model + + +
+ Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs) have long +been the predominant backbone networks for visual representation learning. +While ViTs have recently gained prominence over CNNs due to their superior +fitting capabilities, their scalability is largely constrained by the quadratic +complexity of attention computation. Inspired by the capability of Mamba in +efficiently modeling long sequences, we propose VMamba, a generic vision +backbone model aiming to reduce the computational complexity to linear while +retaining ViTs' advantageous features. To enhance VMamba's adaptability in +processing vision data, we introduce the Cross-Scan Module (CSM) to enable 1D +selective scanning in 2D image space with global receptive fields. +Additionally, we make further improvements in implementation details and +architectural designs to enhance VMamba's performance and boost its inference +speed. Extensive experimental results demonstrate VMamba's promising +performance across various visual perception tasks, highlighting its pronounced +advantages in input scaling efficiency compared to existing benchmark models. +Source code is available at https://github.com/MzeroMiko/VMamba. + +
+
+ comment: 21 pages, 12 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Data-Efficient Multimodal Fusion on a Single GPU CVPR 2024 + + +
+ The goal of multimodal alignment is to learn a single latent space that is +shared between multimodal inputs. The most powerful models in this space have +been trained using massive datasets of paired inputs and large-scale +computational resources, making them prohibitively expensive to train in many +practical scenarios. We surmise that existing unimodal encoders pre-trained on +large amounts of unimodal data should provide an effective bootstrap to create +multimodal models from unimodal ones at much lower costs. We therefore propose +FuseMix, a multimodal augmentation scheme that operates on the latent spaces of +arbitrary pre-trained unimodal encoders. Using FuseMix for multimodal +alignment, we achieve competitive performance -- and in certain cases +outperform state-of-the art methods -- in both image-text and audio-text +retrieval, with orders of magnitude less compute and data: for example, we +outperform CLIP on the Flickr30K text-to-image retrieval task with $\sim \! +600\times$ fewer GPU days and $\sim \! 80\times$ fewer image-text pairs. +Additionally, we show how our method can be applied to convert pre-trained +text-to-image generative models into audio-to-image ones. Code is available at: +https://github.com/layer6ai-labs/fusemix. + +
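+
+ A minimal sketch of the augmentation as described: mixup-style convex
+combinations taken directly in the latent spaces of frozen unimodal encoders,
+with the same mixing coefficient shared across the two paired modalities so
+that mixed pairs stay aligned (the Beta-distributed coefficient follows
+standard mixup practice and is an assumption here).
+```python
+import torch
+
+def fusemix(latents_a, latents_b, alpha=1.0):
+    """Mixup in latent space for a batch of paired multimodal embeddings.
+
+    latents_a: (B, Da) e.g. image embeddings from a frozen image encoder.
+    latents_b: (B, Db) e.g. text embeddings from a frozen text encoder.
+    """
+    b = latents_a.shape[0]
+    perm = torch.randperm(b, device=latents_a.device)
+    lam = torch.distributions.Beta(alpha, alpha).sample((b, 1)).to(latents_a.device)
+    mixed_a = lam * latents_a + (1.0 - lam) * latents_a[perm]
+    mixed_b = lam * latents_b + (1.0 - lam) * latents_b[perm]  # shared lam keeps pairs aligned
+    return mixed_a, mixed_b
+```
+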
+
+ comment: CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ Building-road Collaborative Extraction from Remotely Sensed Images via + Cross-Interaction + + +
+ Buildings are the basic carrier of social production and human life; roads
+are the links that interconnect social networks. Building and road information
+has important application value in the frontier fields of regional coordinated
+development, disaster prevention, autonomous driving, etc. Mapping buildings
+and roads from very high-resolution (VHR) remote sensing images has become a
+hot research topic. However, the existing methods often ignore the strong
+spatial correlation between roads and buildings and extract them in isolation.
+To fully utilize the complementary advantages between buildings and roads, we
+propose a building-road collaborative extraction method based on multi-task
+and cross-scale feature interaction to improve the accuracy of both tasks in a
+complementary way. A multi-task interaction module is proposed to exchange
+information across tasks and preserve the unique information of each task,
+which tackles the seesaw phenomenon in multi-task learning. By considering the
+variation in appearance and structure between buildings and roads, a
+cross-scale interaction module is designed to automatically learn the optimal
+receptive field for different tasks. Compared with many existing methods that
+train each task individually, the proposed collaborative extraction method can
+utilize the complementary advantages between buildings and roads through the
+proposed inter-task and inter-scale feature interactions, and automatically
+selects the optimal receptive field for different tasks. Experiments on a wide
+range of urban and rural scenarios show that the proposed algorithm can
+achieve building-road extraction with outstanding performance and efficiency.
+
+

+
+ comment: IEEE Transactions on Geoscience and Remote Sensing +
+
+
+
+
+ + ♻ ☆ BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics CVPR 2024 + + +
+ The recently emerging text-to-motion advances have spurred numerous attempts
+at convenient and interactive human motion generation. Yet, existing methods
+are largely limited to generating body motions only, without considering the
+rich two-hand motions, let alone handling various conditions like body
+dynamics or texts. To break the data bottleneck, we propose BOTH57M, a novel
+multi-modal dataset for two-hand motion generation. Our dataset includes
+accurate motion tracking for the human body and hands and provides paired
+finger-level hand annotations and body descriptions. We further provide a
+strong baseline method, BOTH2Hands, for the novel task: generating vivid
+two-hand motions from both implicit body dynamics and explicit text prompts.
+We first warm up two parallel body-to-hand and text-to-hand diffusion models
+and then utilize the cross-attention transformer for motion blending.
+Extensive experiments and cross-validations demonstrate the effectiveness of
+our approach and dataset for generating convincing two-hand motions from the
+hybrid body-and-textual conditions. Our dataset and code will be disseminated
+to the community for future research.
+
+

+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A + Conceptual Architecture + + +
+ This work proposes a WebXR-based cross-platform conceptual architecture, +leveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate +the development of an open, accessible, and interoperable metaverse. By +introducing the concept of spatial web app, this research contributes to the +discourse on the metaverse, offering an architecture that democratizes access +to virtual environments and extended reality through the web, and aligns with +Tim Berners-Lee's original vision of the World Wide Web as an open platform in +the digital realm. + +
+
+ comment: minor fixes (typos, URLs etc.) +
+
+
+
+
+ + ♻ ☆ Implicit Neural Representation for MRI Parallel Imaging Reconstruction + + +
+ Magnetic resonance imaging (MRI) usually faces lengthy acquisition times, +prompting the exploration of strategies such as parallel imaging (PI) to +alleviate this problem by periodically skipping specific K-space lines and +subsequently reconstructing high-quality images from the undersampled K-space. +Implicit neural representation (INR) has recently emerged as a promising deep +learning technique, characterizing objects as continuous functions of spatial +coordinates typically parameterized by a multilayer perceptron (MLP). In this +study, we propose a novel MRI PI reconstruction method that uses INR. Our +approach represents reconstructed fully-sampled images as functions of voxel +coordinates and prior feature vectors from undersampled images, addressing the +generalization challenges of INR. Specifically, we introduce a scale-embedded +encoder to generate scale-independent, voxel-specific features from MR images +across various undersampling scales. These features are then concatenated with +coordinate vectors to reconstruct fully-sampled MR images, facilitating +multiple-scale reconstructions. To evaluate our method's performance, we +conducted experiments using publicly available MRI datasets, comparing it with +alternative reconstruction techniques. Our quantitative assessment demonstrates +the superiority of our proposed method. + +
+
+
+
+
+ + ♻ ☆ Expediting Building Footprint Extraction from High-resolution Remote + Sensing Images via progressive lenient supervision + + +
+ The efficacy of building footprint segmentation from remotely sensed images +has been hindered by model transfer effectiveness. Many existing building +segmentation methods were developed upon the encoder-decoder architecture of +U-Net, in which the encoder is finetuned from the newly developed backbone +networks that are pre-trained on ImageNet. However, the heavy computational +burden of the existing decoder designs hampers the successful transfer of these +modern encoder networks to remote sensing tasks. Even the widely-adopted deep +supervision strategy fails to mitigate these challenges due to its invalid loss +in hybrid regions where foreground and background pixels are intermixed. In +this paper, we conduct a comprehensive evaluation of existing decoder network +designs for building footprint segmentation and propose an efficient framework +denoted as BFSeg to enhance learning efficiency and effectiveness. +Specifically, a densely-connected coarse-to-fine feature fusion decoder network +that facilitates easy and fast feature fusion across scales is proposed. +Moreover, considering the invalidity of hybrid regions in the down-sampled +ground truth during the deep supervision process, we present a lenient deep +supervision and distillation strategy that enables the network to learn proper +knowledge from deep supervision. Building upon these advancements, we have +developed a new family of building segmentation networks, which consistently +surpass prior works with outstanding performance and efficiency across a wide +range of newly developed encoder networks. + +
+
+
+
+
+ + ♻ ☆ Two-Phase Multi-Dose-Level PET Image Reconstruction with Dose Level + Awareness + + +
+ To obtain high-quality positron emission tomography (PET) while minimizing
+radiation exposure, a range of methods have been designed to reconstruct
+standard-dose PET (SPET) from corresponding low-dose PET (LPET) images.
+However, most current methods merely learn the mapping between
+single-dose-level LPET and SPET images, but overlook the dose disparity of
+LPET images in clinical scenarios. In this paper, to reconstruct high-quality
+SPET images from multi-dose-level LPET images, we design a novel two-phase
+multi-dose-level PET reconstruction algorithm with dose level awareness,
+containing a pre-training phase and a SPET prediction phase. Specifically, the
+pre-training phase is devised to explore both fine-grained discriminative
+features and effective semantic representation. The SPET prediction phase
+adopts a coarse prediction network that utilizes the pre-learned dose-level
+prior to generate a preliminary result, and a refinement network to precisely
+preserve the details. Experiments on the MICCAI 2022 Ultra-low Dose PET
+Imaging Challenge dataset have demonstrated the superiority of our method.
+
+

+
+ comment: Accepted by ISBI2024 +
+
+
+
+
+ + ♻ ☆ Little Strokes Fell Great Oaks: Boosting the Hierarchical Features for + Multi-exposure Image Fusion + + +
+ In recent years, deep learning networks have made remarkable strides in the
+domain of multi-exposure image fusion. Nonetheless, prevailing approaches
+often involve directly feeding over-exposed and under-exposed images into the
+network, which leads to the under-utilization of inherent information present
+in the source images. Additionally, unsupervised techniques predominantly
+employ rudimentary weighted summation for color channel processing,
+culminating in an overall desaturated final image tone. To partially mitigate
+these issues, this study proposes a gamma correction module specifically
+designed to fully leverage latent information embedded within source images.
+Furthermore, a modified transformer block incorporating self-attention
+mechanisms is introduced to optimize the fusion process. Ultimately, a novel
+color enhancement algorithm is presented to augment image saturation while
+preserving intricate details. The source code is available at
+https://github.com/ZhiyingDu/BHFMEF.
+
+

+
+
+
+
+ + ♻ ☆ DREAM: Visual Decoding from Reversing Human Visual System + + +
+ In this work we present DREAM, an fMRI-to-image method for reconstructing +viewed images from brain activities, grounded on fundamental knowledge of the +human visual system. We craft reverse pathways that emulate the hierarchical +and parallel nature of how humans perceive the visual world. These tailored +pathways are specialized to decipher semantics, color, and depth cues from fMRI +data, mirroring the forward pathways from visual stimuli to fMRI recordings. To +do so, two components mimic the inverse processes within the human visual +system: the Reverse Visual Association Cortex (R-VAC) which reverses pathways +of this brain region, extracting semantics from fMRI data; the Reverse Parallel +PKM (R-PKM) component simultaneously predicting color and depth from fMRI +signals. The experiments indicate that our method outperforms the current +state-of-the-art models in terms of the consistency of appearance, structure, +and semantics. Code will be made publicly available to facilitate further +research in this field. + +
+
+ comment: Project Page: https://weihaox.github.io/DREAM +
+
+
+
+
+ + ♻ ☆ Pre-trained Model Guided Fine-Tuning for Zero-Shot Adversarial + Robustness CVPR 2024 + + +
+ Large-scale pre-trained vision-language models like CLIP have demonstrated +impressive performance across various tasks, and exhibit remarkable zero-shot +generalization capability, while they are also vulnerable to imperceptible +adversarial examples. Existing works typically employ adversarial training +(fine-tuning) as a defense method against adversarial examples. However, direct +application to the CLIP model may result in overfitting, compromising the +model's capacity for generalization. In this paper, we propose Pre-trained +Model Guided Adversarial Fine-Tuning (PMG-AFT) method, which leverages +supervision from the original pre-trained model by carefully designing an +auxiliary branch, to enhance the model's zero-shot adversarial robustness. +Specifically, PMG-AFT minimizes the distance between the features of +adversarial examples in the target model and those in the pre-trained model, +aiming to preserve the generalization features already captured by the +pre-trained model. Extensive Experiments on 15 zero-shot datasets demonstrate +that PMG-AFT significantly outperforms the state-of-the-art method, improving +the top-1 robust accuracy by an average of 4.99%. Furthermore, our approach +consistently improves clean accuracy by an average of 8.72%. Our code is +available at +https://github.com/serendipity1122/Pre-trained-Model-Guided-Fine-Tuning-for-Zero-Shot-Adversarial-Robustness. + +
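+
+ A sketch of the objective as we read it from the abstract: the usual
+adversarial fine-tuning term is augmented with an auxiliary term that pulls
+the adversarial-example features of the model being tuned toward those of a
+frozen copy of the original pre-trained model. The L2 feature distance, the
+CLIP-style logit scale, and the loss weight are assumptions.
+```python
+import torch
+import torch.nn.functional as F
+
+def pmg_aft_style_loss(tuned_enc, frozen_enc, text_feats, x_adv, labels, lam=1.0):
+    """tuned_enc / frozen_enc: image encoders returning (B, D) features.
+    text_feats: (num_classes, D) class embeddings, e.g. from a text tower."""
+    feat_adv = tuned_enc(x_adv)              # features being updated
+    with torch.no_grad():
+        feat_ref = frozen_enc(x_adv)         # frozen pre-trained reference
+
+    logits = F.normalize(feat_adv, dim=-1) @ F.normalize(text_feats, dim=-1).t()
+    task_loss = F.cross_entropy(logits * 100.0, labels)  # adversarial training term
+
+    guidance = F.mse_loss(feat_adv, feat_ref)  # stay close to pre-trained features
+    return task_loss + lam * guidance
+```
+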
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ DG-TTA: Out-of-domain medical image segmentation through Domain + Generalization and Test-Time Adaptation + + +
+ Applying pre-trained medical segmentation models on out-of-domain images +often yields predictions of insufficient quality. Several strategies have been +proposed to maintain model performance, such as finetuning or unsupervised- and +source-free domain adaptation. These strategies set restrictive requirements +for data availability. In this study, we propose to combine domain +generalization and test-time adaptation to create a highly effective approach +for reusing pre-trained models in unseen target domains. Domain-generalized +pre-training on source data is used to obtain the best initial performance in +the target domain. We introduce the MIND descriptor previously used in image +registration tasks as a further technique to achieve generalization and present +superior performance for small-scale datasets compared to existing approaches. +At test-time, high-quality segmentation for every single unseen scan is ensured +by optimizing the model weights for consistency given different image +augmentations. That way, our method enables separate use of source and target +data and thus removes current data availability barriers. Moreover, the +presented method is highly modular as it does not require specific model +architectures or prior knowledge of involved domains and labels. We demonstrate +this by integrating it into the nnUNet, which is currently the most popular and +accurate framework for medical image segmentation. We employ multiple datasets +covering abdominal, cardiac, and lumbar spine scans and compose several +out-of-domain scenarios in this study. We demonstrate that our method, combined +with pre-trained whole-body CT models, can effectively segment MR images with +high accuracy in all of the aforementioned scenarios. Open-source code can be +found here: https://github.com/multimodallearning/DG-TTA + +
+
+
+
+
+ + ♻ ☆ ExpPoint-MAE: Better interpretability and performance for + self-supervised point cloud transformers + + +
+ In this paper we delve into the properties of transformers, attained through
+self-supervision, in the point cloud domain. Specifically, we evaluate the
+effectiveness of Masked Autoencoding as a pretraining scheme, and explore
+Momentum Contrast as an alternative. In our study we investigate the impact of
+data quantity on the learned features, and uncover similarities in the
+transformer's behavior across domains. Through comprehensive visualizations,
+we observe that the transformer learns to attend to semantically meaningful
+regions, indicating that pretraining leads to a better understanding of the
+underlying geometry. Moreover, we examine the finetuning process and its
+effect on the learned representations. Based on that, we devise an unfreezing
+strategy which consistently outperforms our baseline without introducing any
+other modifications to the model or the training pipeline, and achieve
+state-of-the-art results in the classification task among transformer models.
+
+

+
+
+
+
+ + ♻ ☆ AGILE3D: Attention Guided Interactive Multi-object 3D Segmentation ICLR 2024 + + +
+ During interactive segmentation, a model and a user work together to +delineate objects of interest in a 3D point cloud. In an iterative process, the +model assigns each data point to an object (or the background), while the user +corrects errors in the resulting segmentation and feeds them back into the +model. The current best practice formulates the problem as binary +classification and segments objects one at a time. The model expects the user +to provide positive clicks to indicate regions wrongly assigned to the +background and negative clicks on regions wrongly assigned to the object. +Sequentially visiting objects is wasteful since it disregards synergies between +objects: a positive click for a given object can, by definition, serve as a +negative click for nearby objects. Moreover, a direct competition between +adjacent objects can speed up the identification of their common boundary. We +introduce AGILE3D, an efficient, attention-based model that (1) supports +simultaneous segmentation of multiple 3D objects, (2) yields more accurate +segmentation masks with fewer user clicks, and (3) offers faster inference. Our +core idea is to encode user clicks as spatial-temporal queries and enable +explicit interactions between click queries as well as between them and the 3D +scene through a click attention module. Every time new clicks are added, we +only need to run a lightweight decoder that produces updated segmentation +masks. In experiments with four different 3D point cloud datasets, AGILE3D sets +a new state-of-the-art. Moreover, we also verify its practicality in real-world +setups with real user studies. + +
+
+ comment: ICLR 2024 camera-ready. Project page: https://ywyue.github.io/AGILE3D +
+
+
+
+
+ + ♻ ☆ Physics-guided Shape-from-Template: Monocular Video Perception through + Neural Surrogate Models + + +
+ 3D reconstruction of dynamic scenes is a long-standing problem in computer +graphics and increasingly difficult the less information is available. +Shape-from-Template (SfT) methods aim to reconstruct a template-based geometry +from RGB images or video sequences, often leveraging just a single monocular +camera without depth information, such as regular smartphone recordings. +Unfortunately, existing reconstruction methods are either unphysical and noisy +or slow in optimization. To solve this problem, we propose a novel SfT +reconstruction algorithm for cloth using a pre-trained neural surrogate model +that is fast to evaluate, stable, and produces smooth reconstructions due to a +regularizing physics simulation. Differentiable rendering of the simulated mesh +enables pixel-wise comparisons between the reconstruction and a target video +sequence that can be used for a gradient-based optimization procedure to +extract not only shape information but also physical parameters such as +stretching, shearing, or bending stiffness of the cloth. This allows to retain +a precise, stable, and smooth reconstructed geometry while reducing the runtime +by a factor of 400-500 compared to $\phi$-SfT, a state-of-the-art physics-based +SfT approach. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Denoising for Signal-Dependent and Row-Correlated Imaging + Noise + + +
+ Accurate analysis of microscopy images is hindered by the presence of noise. +This noise is usually signal-dependent and often additionally correlated along +rows or columns of pixels. Current self- and unsupervised denoisers can address +signal-dependent noise, but none can reliably remove noise that is also row- or +column-correlated. Here, we present the first fully unsupervised deep +learning-based denoiser capable of handling imaging noise that is +row-correlated as well as signal-dependent. Our approach uses a Variational +Autoencoder (VAE) with a specially designed autoregressive decoder. This +decoder is capable of modeling row-correlated and signal-dependent noise but is +incapable of independently modeling underlying clean signal. The VAE therefore +produces latent variables containing only clean signal information, and these +are mapped back into image space using a proposed second decoder network. Our +method does not require a pre-trained noise model and can be trained from +scratch using unpaired noisy data. We show that our approach achieves +competitive results when applied to a range of different sensor types and +imaging modalities. + +
+
+
+
+
+ + ♻ ☆ Triple-CFN: Restructuring Conceptual Spaces for Enhancing Abstract + Reasoning process + + +
+ Abstract reasoning problems pose significant challenges to artificial +intelligence algorithms, demanding cognitive capabilities beyond those required +for perception tasks. This study introduces the Triple-CFN approach to tackle +the Bongard-Logo problem, achieving notable reasoning accuracy by implicitly +reorganizing the concept space of conflicting instances. Additionally, the +Triple-CFN paradigm proves effective for the RPM problem with necessary +modifications, yielding competitive results. To further enhance performance on +the RPM issue, we develop the Meta Triple-CFN network, which explicitly +structures the problem space while maintaining interpretability on progressive +patterns. The success of Meta Triple-CFN is attributed to its paradigm of +modeling the conceptual space, equivalent to normalizing reasoning information. +Based on this ideology, we introduce the Re-space layer, enhancing the +performance of both Meta Triple-CFN and Triple-CFN. This paper aims to +contribute to advancements in machine intelligence by exploring innovative +network designs for addressing abstract reasoning problems, paving the way for +further breakthroughs in this domain. + +
+
+ comment: 14 pages, 14 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Large-scale Multi-Modal Pre-trained Models: A Comprehensive Survey + + +
+ With the urgent demand for generalized deep models, many pre-trained big
+models have been proposed, such as BERT, ViT, GPT, etc. Inspired by the
+success of these models in single domains (like computer vision and natural
+language processing), the multi-modal pre-trained big models have also drawn
+more and more attention in recent years. In this work, we give a comprehensive
+survey of these models and hope this paper provides new insights and helps
+fresh researchers track the most cutting-edge works. Specifically, we first
+introduce the background of multi-modal pre-training by reviewing conventional
+deep learning and pre-training works in natural language processing, computer
+vision, and speech. Then, we introduce the task definition, key challenges,
+and advantages of multi-modal pre-training models (MM-PTMs), and discuss the
+MM-PTMs with a focus on data, objectives, network architectures, and knowledge
+enhanced pre-training. After that, we introduce the downstream tasks used for
+the validation of large-scale MM-PTMs, including generative, classification,
+and regression tasks. We also give visualization and analysis of the model
+parameters and results on representative downstream tasks. Finally, we point
+out possible research directions for this topic that may benefit future works.
+In addition, we maintain a continuously updated paper list for large-scale
+pre-trained multi-modal big models:
+https://github.com/wangxiao5791509/MultiModal_BigModels_Survey. This paper has
+been published by the journal Machine Intelligence Research (MIR),
+https://link.springer.com/article/10.1007/s11633-022-1410-8, DOI:
+10.1007/s11633-022-1410-8, vol. 20, no. 4, pp. 447-482, 2023.
+
+

+
+ comment: Accepted by Machine Intelligence Research (MIR) +
+
+
+
+
+ + ♻ ☆ MixedNUTS: Training-Free Accuracy-Robustness Balance via Nonlinearly + Mixed Classifiers + + +
+ Adversarial robustness often comes at the cost of degraded accuracy,
+impeding the real-life application of robust classification models.
+Training-based solutions for better trade-offs are limited by
+incompatibilities with already-trained high-performance large models,
+necessitating the exploration of training-free ensemble approaches. Observing
+that robust models are more confident in correct predictions than in incorrect
+ones on clean and adversarial data alike, we speculate that amplifying this
+"benign confidence property" can reconcile accuracy and robustness in an
+ensemble setting. To this end, we propose "MixedNUTS", a training-free method
+where the output logits of a robust classifier and a standard non-robust
+classifier are processed by nonlinear transformations with only three
+parameters, which are optimized through an efficient algorithm. MixedNUTS then
+converts the transformed logits into probabilities and mixes them as the
+overall output. On CIFAR-10, CIFAR-100, and ImageNet datasets, experimental
+results with custom strong adaptive attacks demonstrate MixedNUTS's vastly
+improved accuracy and near-SOTA robustness -- it boosts CIFAR-100 clean
+accuracy by 7.86 points, sacrificing merely 0.87 points in robust accuracy.
+
+

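+
+ The abstract specifies only that the robust classifier's logits pass through
+a nonlinear transformation with three parameters before the two models'
+probabilities are mixed; the concrete transform below (a scaled, shifted,
+exponentiated ReLU) and the fixed mixing weight are illustrative assumptions,
+not the paper's exact formulation.
+```python
+import torch
+import torch.nn.functional as F
+
+def mixednuts_style_predict(logits_std, logits_rob, s=1.0, p=1.0, c=0.0, mix=0.5):
+    """Nonlinearly transform the robust model's logits with three parameters
+    (s, p, c), convert both outputs to probabilities, and mix them."""
+    transformed = s * torch.clamp(logits_rob + c, min=0.0) ** p  # assumed transform
+    prob_rob = F.softmax(transformed, dim=-1)
+    prob_std = F.softmax(logits_std, dim=-1)
+    return mix * prob_rob + (1.0 - mix) * prob_std
+```
+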
+
+
+
+
+ + ♻ ☆ RS-Mamba for Large Remote Sensing Image Dense Prediction + + +
+ Context modeling is critical for remote sensing image dense prediction tasks. +Nowadays, the growing size of very-high-resolution (VHR) remote sensing images +poses challenges in effectively modeling context. While transformer-based +models possess global modeling capabilities, they encounter computational +challenges when applied to large VHR images due to their quadratic complexity. +The conventional practice of cropping large images into smaller patches results +in a notable loss of contextual information. To address these issues, we +propose the Remote Sensing Mamba (RSM) for dense prediction tasks in large VHR +remote sensing images. RSM is specifically designed to capture the global +context of remote sensing images with linear complexity, facilitating the +effective processing of large VHR images. Considering that the land covers in +remote sensing images are distributed in arbitrary spatial directions due to +characteristics of remote sensing over-head imaging, the RSM incorporates an +omnidirectional selective scan module to globally model the context of images +in multiple directions, capturing large spatial features from various +directions. Extensive experiments on semantic segmentation and change detection +tasks across various land covers demonstrate the effectiveness of the proposed +RSM. We designed simple yet effective models based on RSM, achieving +state-of-the-art performance on dense prediction tasks in VHR remote sensing +images without fancy training strategies. Leveraging the linear complexity and +global modeling capabilities, RSM achieves better efficiency and accuracy than +transformer-based models on large remote sensing images. Interestingly, we also +demonstrated that our model generally performs better with a larger image size +on dense prediction tasks. Our code is available at +https://github.com/walking-shadow/Official_Remote_Sensing_Mamba. + +
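+
+ A small sketch of the scanning idea only (not the selective state-space
+computation itself): the 2D feature map is flattened into 1D sequences along
+several directions so a linear-complexity sequence model can see context from
+multiple spatial directions; the exact set of directions used by RSM is an
+assumption.
+```python
+import torch
+
+def omnidirectional_scans(feat):
+    """feat: (B, C, H, W) -> list of (B, C, H*W) sequences in several directions."""
+    b, c, h, w = feat.shape
+    horizontal = feat.flatten(2)                    # row-major scan
+    vertical = feat.transpose(2, 3).flatten(2)      # column-major scan
+    # Diagonal scan: order pixels by anti-diagonal id, then by position.
+    diag_id = torch.arange(h).view(h, 1) + torch.arange(w).view(1, w)
+    key = diag_id * (h * w) + torch.arange(h * w).view(h, w)
+    diagonal = feat.flatten(2)[:, :, key.flatten().argsort()]
+    scans = [horizontal, vertical, diagonal]
+    return scans + [s.flip(-1) for s in scans]      # add reversed directions
+```
+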
+
+ comment: 15 pages,8 figures +
+
+
+
+
+ + ♻ ☆ Improving the Generalization of Segmentation Foundation Model under + Distribution Shift via Weakly Supervised Adaptation + + +
+ The success of large language models has inspired the computer vision community to explore image segmentation foundation models that are able to generalize in a zero/few-shot manner through prompt engineering. Segment-Anything (SAM), among others, is the state-of-the-art image segmentation foundation model demonstrating strong zero/few-shot generalization. Despite the success, recent studies reveal the weakness of SAM under strong distribution shift. In particular, SAM performs awkwardly on corrupted natural images, camouflaged images, medical images, etc. Motivated by these observations, we aim to develop a self-training based strategy to adapt SAM to the target distribution. Given the unique challenges of a large source dataset, high computation cost, and incorrect pseudo labels, we propose a weakly supervised self-training architecture with anchor regularization and low-rank finetuning to improve the robustness and computation efficiency of adaptation. We validate the effectiveness on 5 types of downstream segmentation tasks including natural clean/corrupted images, medical images, camouflaged images and robotic images. Our proposed method is task-agnostic in nature and outperforms pre-trained SAM and state-of-the-art domain adaptation methods on almost all downstream tasks with the same testing prompt inputs.
+
+ comment: 20 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Ear-Keeper: Real-time Diagnosis of Ear Lesions Utilizing + Ultralight-Ultrafast ConvNet and Large-scale Ear Endoscopic Dataset + + +
+ Deep learning-based ear disease diagnosis technology has proven effective and affordable. However, due to the lack of ear endoscope datasets with diversity, the practical potential of the deep learning model has not been thoroughly studied. Moreover, existing research failed to achieve a good trade-off between model inference speed and parameter size, rendering models inapplicable in real-world settings. To address these challenges, we constructed the first large-scale ear endoscopic dataset comprising eight types of ear diseases and disease-free samples from two institutions. Inspired by ShuffleNetV2, we proposed Best-EarNet, an ultrafast and ultralight network enabling real-time ear disease diagnosis. Best-EarNet incorporates a novel Local-Global Spatial Feature Fusion Module and a multi-scale supervision strategy, which facilitate the model focusing on global-local information within feature maps at various levels. Utilizing transfer learning, Best-EarNet, with only 0.77M parameters, achieves accuracies of 95.23% on an internal set of 22,581 images and 92.14% on an external set of 1,652 images. In particular, it achieves an average of 80 frames per second on a CPU. From the perspective of model practicality, the proposed Best-EarNet is superior to state-of-the-art backbone models in ear lesion detection tasks. Most importantly, Ear-Keeper, an intelligent diagnosis system based on Best-EarNet, was developed successfully and deployed on common electronic devices (smartphone, tablet computer and personal computer). In the future, Ear-Keeper has the potential to assist the public and healthcare providers in performing comprehensive scanning and diagnosis of the ear canal in real-time video, thereby promptly detecting ear lesions.
+
+ comment: 18 pages,8 figures +
+
+
+
+
+ + ♻ ☆ GPT as Psychologist? Preliminary Evaluations for GPT-4V on Visual + Affective Computing + + +
+ Multimodal large language models (MLLMs) are designed to process and integrate information from multiple sources, such as text, speech, images, and videos. Despite their success in language understanding, it is critical to evaluate their performance on downstream tasks for better human-centric applications. This paper assesses the application of MLLMs with 5 crucial abilities for affective computing, spanning visual affective tasks and reasoning tasks. The results show that GPT-4V has high accuracy in facial action unit recognition and micro-expression detection, while its general facial expression recognition performance is not accurate. We also highlight the challenges of achieving fine-grained micro-expression recognition and the potential for further study. In addition, we demonstrate the versatility and potential of GPT-4V for handling advanced tasks in emotion recognition and related fields by integrating it with task-related agents for more complex tasks, such as heart rate estimation through signal processing. In conclusion, this paper provides valuable insights into the potential applications and challenges of MLLMs in human-centric computing. Our interesting examples are at https://github.com/EnVision-Research/GPT4Affectivity.
+
+
+
+
+ + ♻ ☆ GaussianImage: 1000 FPS Image Representation and Compression by 2D + Gaussian Splatting + + +
+ Implicit neural representations (INRs) recently achieved great success in +image representation and compression, offering high visual quality and fast +rendering speeds with 10-1000 FPS, assuming sufficient GPU resources are +available. However, this requirement often hinders their use on low-end devices +with limited memory. In response, we propose a groundbreaking paradigm of image +representation and compression by 2D Gaussian Splatting, named GaussianImage. +We first introduce 2D Gaussian to represent the image, where each Gaussian has +8 parameters including position, covariance and color. Subsequently, we unveil +a novel rendering algorithm based on accumulated summation. Remarkably, our +method with a minimum of 3$\times$ lower GPU memory usage and 5$\times$ faster +fitting time not only rivals INRs (e.g., WIRE, I-NGP) in representation +performance, but also delivers a faster rendering speed of 1500-2000 FPS +regardless of parameter size. Furthermore, we integrate existing vector +quantization technique to build an image codec. Experimental results +demonstrate that our codec attains rate-distortion performance comparable to +compression-based INRs such as COIN and COIN++, while facilitating decoding +speeds of approximately 1000 FPS. Additionally, preliminary proof of concept +shows that our codec surpasses COIN and COIN++ in performance when using +partial bits-back coding. Code will be available at +https://github.com/Xinjie-Q/GaussianImage. + +
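+ Editor's sketch: the accumulated-summation rendering above can be illustrated with the minimal splatting routine below, which sums the weighted colors of N 2D Gaussians (position, covariance, color: 2+3+3 = 8 parameters each) onto a pixel grid. It ignores the codec, quantization, and any optimized rasterizer, so it is an illustration of the idea rather than the authors' implementation.
+ import torch
+
+ def render_gaussian_image(mu, cov, color, H, W):
+     # mu: (N, 2) pixel-space centers, cov: (N, 2, 2) symmetric covariances (3 free
+     # values each), color: (N, 3) RGB weights.
+     ys, xs = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
+     pix = torch.stack([xs, ys], dim=-1).reshape(-1, 2).float()          # (H*W, 2)
+     d = pix[None, :, :] - mu[:, None, :]                                # (N, H*W, 2)
+     maha = torch.einsum("npi,nij,npj->np", d, torch.linalg.inv(cov), d)
+     w = torch.exp(-0.5 * maha)                                          # Gaussian weights
+     img = torch.einsum("np,nc->pc", w, color)                           # accumulated summation
+     return img.reshape(H, W, 3)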
+
+
+
+
+ + ♻ ☆ Re-DiffiNet: Modeling discrepancies in tumor segmentation using + diffusion models + + +
+ Identification of tumor margins is essential for surgical decision-making for +glioblastoma patients and provides reliable assistance for neurosurgeons. +Despite improvements in deep learning architectures for tumor segmentation over +the years, creating a fully autonomous system suitable for clinical floors +remains a formidable challenge because the model predictions have not yet +reached the desired level of accuracy and generalizability for clinical +applications. Generative modeling techniques have seen significant improvements +in recent times. Specifically, Generative Adversarial Networks (GANs) and +Denoising-diffusion-based models (DDPMs) have been used to generate +higher-quality images with fewer artifacts and finer attributes. In this work, +we introduce a framework called Re-Diffinet for modeling the discrepancy +between the outputs of a segmentation model like U-Net and the ground truth, +using DDPMs. By explicitly modeling the discrepancy, the results show an +average improvement of 0.55\% in the Dice score and 16.28\% in HD95 from +cross-validation over 5-folds, compared to the state-of-the-art U-Net +segmentation model. + +
+
+
+
+
+ + ♻ ☆ AUEditNet: Dual-Branch Facial Action Unit Intensity Manipulation with + Implicit Disentanglement + + +
+ Facial action unit (AU) intensity plays a pivotal role in quantifying fine-grained expression behaviors, which is an effective condition for facial expression manipulation. However, publicly available datasets containing intensity annotations for multiple AUs remain severely limited, often featuring a restricted number of subjects. This limitation poses challenges for AU intensity manipulation in images due to disentanglement issues, leading researchers to resort to other large datasets with pretrained AU intensity estimators for pseudo labels. In addressing this constraint and fully leveraging manual annotations of AU intensities for precise manipulation, we introduce AUEditNet. Our proposed model achieves impressive intensity manipulation across 12 AUs, trained effectively with only 18 subjects. Utilizing a dual-branch architecture, our approach achieves comprehensive disentanglement of facial attributes and identity without necessitating additional loss functions or large batch sizes. This approach offers a potential solution to achieve desired facial attribute editing despite the dataset's limited subject count. Our experiments demonstrate AUEditNet's superior accuracy in editing AU intensities, affirming its capability in disentangling facial attributes and identity within a limited subject pool. AUEditNet allows conditioning by either intensity values or target images, eliminating the need for constructing AU combinations for specific facial expression synthesis. Moreover, AU intensity estimation, as a downstream task, validates the consistency between real and edited images, confirming the effectiveness of our proposed AU intensity manipulation method.
+
+
+
+
+ + ♻ ☆ Ultra-Range Gesture Recognition using a Web-Camera in Human-Robot + Interaction + + +
+ Hand gestures play a significant role in human interactions where non-verbal intentions, thoughts and commands are conveyed. In Human-Robot Interaction (HRI), hand gestures offer a similar and efficient medium for conveying clear and rapid directives to a robotic agent. However, state-of-the-art vision-based methods for gesture recognition have been shown to be effective only up to a user-camera distance of seven meters. Such a short distance range limits practical HRI with, for example, service robots, search and rescue robots and drones. In this work, we address the Ultra-Range Gesture Recognition (URGR) problem by aiming for a recognition distance of up to 25 meters and in the context of HRI. We propose the URGR framework, a novel deep-learning approach that uses solely a simple RGB camera. Gesture inference is based on a single image. First, a novel super-resolution model termed High-Quality Network (HQ-Net) uses a set of self-attention and convolutional layers to enhance the low-resolution image of the user. Then, we propose a novel URGR classifier termed Graph Vision Transformer (GViT), which takes the enhanced image as input. GViT combines the benefits of a Graph Convolutional Network (GCN) and a modified Vision Transformer (ViT). Evaluation of the proposed framework over diverse test data yields a high recognition rate of 98.1%. The framework has also exhibited superior performance compared to human recognition at ultra-range distances. With the framework, we analyze and demonstrate the performance of an autonomous quadruped robot directed by human gestures in complex ultra-range indoor and outdoor environments, achieving a 96% recognition rate on average.
+
+ comment: Engineering Applications of Artificial Intelligence, In press +
+
+
+
+
+ + ♻ ☆ Concept-based Analysis of Neural Networks via Vision-Language Models + + +
+ The analysis of vision-based deep neural networks (DNNs) is highly desirable +but it is very challenging due to the difficulty of expressing formal +specifications for vision tasks and the lack of efficient verification +procedures. In this paper, we propose to leverage emerging multimodal, +vision-language, foundation models (VLMs) as a lens through which we can reason +about vision models. VLMs have been trained on a large body of images +accompanied by their textual description, and are thus implicitly aware of +high-level, human-understandable concepts describing the images. We describe a +logical specification language $\texttt{Con}_{\texttt{spec}}$ designed to +facilitate writing specifications in terms of these concepts. To define and +formally check $\texttt{Con}_{\texttt{spec}}$ specifications, we build a map +between the internal representations of a given vision model and a VLM, leading +to an efficient verification procedure of natural-language properties for +vision models. We demonstrate our techniques on a ResNet-based classifier +trained on the RIVAL-10 dataset using CLIP as the multimodal model. + +
+
+
+
+
+ + ♻ ☆ Learning to Predict 3D Rotational Dynamics from Images of a Rigid Body + with Unknown Mass Distribution + + +
+ In many real-world settings, image observations of freely rotating 3D rigid +bodies may be available when low-dimensional measurements are not. However, the +high-dimensionality of image data precludes the use of classical estimation +techniques to learn the dynamics. The usefulness of standard deep learning +methods is also limited, because an image of a rigid body reveals nothing about +the distribution of mass inside the body, which, together with initial angular +velocity, is what determines how the body will rotate. We present a +physics-based neural network model to estimate and predict 3D rotational +dynamics from image sequences. We achieve this using a multi-stage prediction +pipeline that maps individual images to a latent representation homeomorphic to +$\mathbf{SO}(3)$, computes angular velocities from latent pairs, and predicts +future latent states using the Hamiltonian equations of motion. We demonstrate +the efficacy of our approach on new rotating rigid-body datasets of sequences +of synthetic images of rotating objects, including cubes, prisms and +satellites, with unknown uniform and non-uniform mass distributions. Our model +outperforms competing baselines on our datasets, producing better qualitative +predictions and reducing the error observed for the state-of-the-art +Hamiltonian Generative Network by a factor of 2. + +
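+ Editor's sketch: one standard way to realize a latent representation homeomorphic to SO(3), as mentioned above, is the 6D rotation parameterization built by Gram-Schmidt from two predicted 3-vectors. This is a common construction, not necessarily the one used in the paper.
+ import torch
+ import torch.nn.functional as F
+
+ def six_d_to_rotation(x):
+     # x: (..., 6) network output; returns (..., 3, 3) rotation matrices in SO(3).
+     a, b = x[..., :3], x[..., 3:]
+     r1 = F.normalize(a, dim=-1)
+     b = b - (r1 * b).sum(dim=-1, keepdim=True) * r1   # remove the component along r1
+     r2 = F.normalize(b, dim=-1)
+     r3 = torch.cross(r1, r2, dim=-1)                  # completes a right-handed frame
+     return torch.stack([r1, r2, r3], dim=-1)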
+
+ comment: Previously appeared as arXiv:2209.11355v2, which was submitted as a + replacement by accident. arXiv admin note: text overlap with arXiv:2209.11355 +
+
+
+
+
+ + ♻ ☆ Mask4Former: Mask Transformer for 4D Panoptic Segmentation ICRA 2024 + + +
+ Accurately perceiving and tracking instances over time is essential for the +decision-making processes of autonomous agents interacting safely in dynamic +environments. With this intention, we propose Mask4Former for the challenging +task of 4D panoptic segmentation of LiDAR point clouds. Mask4Former is the +first transformer-based approach unifying semantic instance segmentation and +tracking of sparse and irregular sequences of 3D point clouds into a single +joint model. Our model directly predicts semantic instances and their temporal +associations without relying on hand-crafted non-learned association strategies +such as probabilistic clustering or voting-based center prediction. Instead, +Mask4Former introduces spatio-temporal instance queries that encode the +semantic and geometric properties of each semantic tracklet in the sequence. In +an in-depth study, we find that promoting spatially compact instance +predictions is critical as spatio-temporal instance queries tend to merge +multiple semantically similar instances, even if they are spatially distant. To +this end, we regress 6-DOF bounding box parameters from spatio-temporal +instance queries, which are used as an auxiliary task to foster spatially +compact predictions. Mask4Former achieves a new state-of-the-art on the +SemanticKITTI test set with a score of 68.4 LSTQ. + +
+
+ comment: Renamed from MASK4D to Mask4Former. ICRA 2024. Project page: + https://vision.rwth-aachen.de/Mask4Former +
+
+
+
+
+ + ♻ ☆ Enhancing Hierarchical Transformers for Whole Brain Segmentation with + Intracranial Measurements Integration + + +
+ Whole brain segmentation with magnetic resonance imaging (MRI) enables the non-invasive measurement of brain regions, including total intracranial volume (TICV) and posterior fossa volume (PFV). Enhancing the existing whole brain segmentation methodology to incorporate intracranial measurements offers a heightened level of comprehensiveness in the analysis of brain structures. Despite its potential, the task of generalizing deep learning techniques for intracranial measurements faces data availability constraints due to limited manually annotated atlases encompassing whole brain and TICV/PFV labels. In this paper, we enhance the hierarchical transformer UNesT for whole brain segmentation so that it segments the whole brain into 133 classes and estimates TICV/PFV simultaneously. To address the problem of data scarcity, the model is first pretrained on 4859 T1-weighted (T1w) 3D volumes sourced from 8 different sites. These volumes are processed through a multi-atlas segmentation pipeline for label generation, while TICV/PFV labels are unavailable. Subsequently, the model is finetuned with 45 T1w 3D volumes from the Open Access Series of Imaging Studies (OASIS), where both 133 whole brain classes and TICV/PFV labels are available. We evaluate our method with Dice similarity coefficients (DSC). We show that our model is able to conduct precise TICV/PFV estimation while maintaining performance on the 132 brain regions at a comparable level. Code and trained model are available at: https://github.com/MASILab/UNesT/tree/main/wholebrainSeg.
+
+
+
+
+ + ♻ ☆ Detecting Image Attribution for Text-to-Image Diffusion Models in RGB + and Beyond + + +
+ Modern text-to-image (T2I) diffusion models can generate images with remarkable realism and creativity. These advancements have sparked research in fake image detection and attribution, yet prior studies have not fully explored the practical and scientific dimensions of this task. In addition to attributing images to 12 state-of-the-art T2I generators, we provide extensive analyses on which inference-stage hyperparameters and image modifications are discernible. Our experiments reveal that initialization seeds are highly detectable, along with other subtle variations in the image generation process, to some extent. We further investigate what visual traces are leveraged in image attribution by perturbing high-frequency details and employing mid-level representations of image style and structure. Notably, altering high-frequency information causes only slight reductions in accuracy, and training an attributor on style representations outperforms training on RGB images. Our analyses underscore that fake images are detectable and attributable at various levels of visual granularity beyond those previously explored.
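+ Editor's sketch: "style representations" of the kind mentioned above are often computed as Gram matrices of mid-level CNN features; the snippet below shows that standard construction as one plausible input to an attributor, without claiming it is the paper's exact feature choice.
+ import torch
+
+ def gram_style_features(feat):
+     # feat: (B, C, H, W) mid-level CNN features; returns (B, C*C) Gram-matrix style
+     # descriptors, one common way to train a classifier on "style" rather than raw RGB.
+     B, C, H, W = feat.shape
+     f = feat.reshape(B, C, H * W)
+     gram = torch.bmm(f, f.transpose(1, 2)) / (C * H * W)
+     return gram.reshape(B, -1)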
+
+ comment: Code available at https://github.com/k8xu/ImageAttribution +
+
+
+
+
+ + ♻ ☆ Hierarchical Augmentation and Distillation for Class Incremental + Audio-Visual Video Recognition + + +
+ Audio-visual video recognition (AVVR) aims to integrate audio and visual +clues to categorize videos accurately. While existing methods train AVVR models +using provided datasets and achieve satisfactory results, they struggle to +retain historical class knowledge when confronted with new classes in +real-world situations. Currently, there are no dedicated methods for addressing +this problem, so this paper concentrates on exploring Class Incremental +Audio-Visual Video Recognition (CIAVVR). For CIAVVR, since both stored data and +learned model of past classes contain historical knowledge, the core challenge +is how to capture past data knowledge and past model knowledge to prevent +catastrophic forgetting. We introduce Hierarchical Augmentation and +Distillation (HAD), which comprises the Hierarchical Augmentation Module (HAM) +and Hierarchical Distillation Module (HDM) to efficiently utilize the +hierarchical structure of data and models, respectively. Specifically, HAM +implements a novel augmentation strategy, segmental feature augmentation, to +preserve hierarchical model knowledge. Meanwhile, HDM introduces newly designed +hierarchical (video-distribution) logical distillation and hierarchical +(snippet-video) correlative distillation to capture and maintain the +hierarchical intra-sample knowledge of each data and the hierarchical +inter-sample knowledge between data, respectively. Evaluations on four +benchmarks (AVE, AVK-100, AVK-200, and AVK-400) demonstrate that the proposed +HAD effectively captures hierarchical information in both data and models, +resulting in better preservation of historical class knowledge and improved +performance. Furthermore, we provide a theoretical analysis to support the +necessity of the segmental feature augmentation strategy. + +
+
+ comment: Submitted to TPAMI +
+
+
+
+
+ + ♻ ☆ Elucidating the Exposure Bias in Diffusion Models ICLR 2024 + + +
+ Diffusion models have demonstrated impressive generative capabilities, but +their \textit{exposure bias} problem, described as the input mismatch between +training and sampling, lacks in-depth exploration. In this paper, we +systematically investigate the exposure bias problem in diffusion models by +first analytically modelling the sampling distribution, based on which we then +attribute the prediction error at each sampling step as the root cause of the +exposure bias issue. Furthermore, we discuss potential solutions to this issue +and propose an intuitive metric for it. Along with the elucidation of exposure +bias, we propose a simple, yet effective, training-free method called Epsilon +Scaling to alleviate the exposure bias. We show that Epsilon Scaling explicitly +moves the sampling trajectory closer to the vector field learned in the +training phase by scaling down the network output, mitigating the input +mismatch between training and sampling. Experiments on various diffusion +frameworks (ADM, DDIM, EDM, LDM, DiT, PFGM++) verify the effectiveness of our +method. Remarkably, our ADM-ES, as a state-of-the-art stochastic sampler, +obtains 2.17 FID on CIFAR-10 under 100-step unconditional generation. The code +is available at \url{https://github.com/forever208/ADM-ES} and +\url{https://github.com/forever208/EDM-ES}. + +
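+ Editor's sketch: a minimal reading of Epsilon Scaling is shown below inside a deterministic DDIM-style update, where the only change to a standard sampler is dividing the predicted noise by a factor lambda slightly above 1. The concrete value or per-step schedule of lambda is the authors' choice and is treated here as an assumption.
+ import torch
+
+ @torch.no_grad()
+ def ddim_step_eps_scaling(model, x_t, t, t_prev, alphas_cumprod, lam=1.005):
+     # Standard DDIM update, except the network output is scaled down (Epsilon Scaling).
+     a_t, a_prev = alphas_cumprod[t], alphas_cumprod[t_prev]
+     eps = model(x_t, t) / lam                          # shrink the predicted noise
+     x0 = (x_t - (1.0 - a_t).sqrt() * eps) / a_t.sqrt()
+     return a_prev.sqrt() * x0 + (1.0 - a_prev).sqrt() * eps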
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Discovering Closed-Loop Failures of Vision-Based Controllers via + Reachability Analysis + + +
+ Machine learning driven image-based controllers allow robotic systems to take +intelligent actions based on the visual feedback from their environment. +Understanding when these controllers might lead to system safety violations is +important for their integration in safety-critical applications and engineering +corrective safety measures for the system. Existing methods leverage +simulation-based testing (or falsification) to find the failures of +vision-based controllers, i.e., the visual inputs that lead to closed-loop +safety violations. However, these techniques do not scale well to the scenarios +involving high-dimensional and complex visual inputs, such as RGB images. In +this work, we cast the problem of finding closed-loop vision failures as a +Hamilton-Jacobi (HJ) reachability problem. Our approach blends simulation-based +analysis with HJ reachability methods to compute an approximation of the +backward reachable tube (BRT) of the system, i.e., the set of unsafe states for +the system under vision-based controllers. Utilizing the BRT, we can tractably +and systematically find the system states and corresponding visual inputs that +lead to closed-loop failures. These visual inputs can be subsequently analyzed +to find the input characteristics that might have caused the failure. Besides +its scalability to high-dimensional visual inputs, an explicit computation of +BRT allows the proposed approach to capture non-trivial system failures that +are difficult to expose via random simulations. We demonstrate our framework on +two case studies involving an RGB image-based neural network controller for (a) +autonomous indoor navigation, and (b) autonomous aircraft taxiing. + +
+
+
+
+
+ + ♻ ☆ nnMobileNe: Rethinking CNN for Retinopathy Research CVPR + + +
+ Over the past few decades, convolutional neural networks (CNNs) have been at the forefront of the detection and tracking of various retinal diseases (RD). Despite their success, the emergence of vision transformers (ViT) in the 2020s has shifted the trajectory of RD model development. The leading-edge performance of ViT-based models in RD can be largely credited to their scalability: their ability to improve as more parameters are added. As a result, ViT-based models tend to outshine traditional CNNs in RD applications, albeit at the cost of increased data and computational demands. ViTs also differ from CNNs in their approach to processing images, working with patches rather than local regions, which can complicate the precise localization of small, variably presented lesions in RD. In our study, we revisited and updated the architecture of a CNN model, specifically MobileNet, to enhance its utility in RD diagnostics. We found that an optimized MobileNet, through selective modifications, can surpass ViT-based models in various RD benchmarks, including diabetic retinopathy grading, detection of multiple fundus diseases, and classification of diabetic macular edema. The code is available at https://github.com/Retinal-Research/NN-MOBILENET
+
+ comment: Accepted as a conference paper to 2024 CVPRW +
+
+
+
+
+ + ♻ ☆ LongVLM: Efficient Long Video Understanding via Large Language Models + + +
+ Empowered by Large Language Models (LLMs), recent advancements in VideoLLMs +have driven progress in various video understanding tasks. These models encode +video representations through pooling or query aggregation over a vast number +of visual tokens, making computational and memory costs affordable. Despite +successfully providing an overall comprehension of video content, existing +VideoLLMs still face challenges in achieving detailed understanding in videos +due to overlooking local information in long-term videos. To tackle this +challenge, we introduce LongVLM, a straightforward yet powerful VideoLLM for +long video understanding, building upon the observation that long videos often +consist of sequential key events, complex actions, and camera movements. Our +approach proposes to decompose long videos into multiple short-term segments +and encode local features for each local segment via a hierarchical token +merging module. These features are concatenated in temporal order to maintain +the storyline across sequential short-term segments. Additionally, we propose +to integrate global semantics into each local feature to enhance context +understanding. In this way, we encode video representations that incorporate +both local and global information, enabling the LLM to generate comprehensive +responses for long-term videos. Experimental results on the VideoChatGPT +benchmark and zero-shot video question-answering datasets demonstrate the +superior capabilities of our model over the previous state-of-the-art methods. +Qualitative examples demonstrate that our model produces more precise responses +for long videos understanding. Code will be available at +https://github.com/ziplab/LongVLM. + +
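+ Editor's sketch: the segment-then-merge idea above can be pictured with the toy routine below, where average pooling stands in for the hierarchical token merging module and a single mean token stands in for the global semantics. Shapes, token budgets, and the pooling choice are illustrative assumptions, not LongVLM's actual design.
+ import torch
+
+ def build_video_tokens(frame_feats, num_segments=8, local_tokens=16):
+     # frame_feats: (T, N, D) per-frame visual tokens from a frozen encoder.
+     T, N, D = frame_feats.shape
+     segments = frame_feats.chunk(num_segments, dim=0)
+     local = [s.mean(dim=0)[:local_tokens] for s in segments]   # per-segment tokens
+     local = torch.cat(local, dim=0)                            # temporal order preserved
+     global_tok = frame_feats.mean(dim=(0, 1), keepdim=True).squeeze(0)  # (1, D)
+     return torch.cat([global_tok, local], dim=0)               # fed to the LLM projector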
+
+
+
+
+ + ♻ ☆ GraphBEV: Towards Robust BEV Feature Alignment for Multi-Modal 3D Object + Detection + + +
+ Integrating LiDAR and camera information into Bird's-Eye-View (BEV) +representation has emerged as a crucial aspect of 3D object detection in +autonomous driving. However, existing methods are susceptible to the inaccurate +calibration relationship between LiDAR and the camera sensor. Such inaccuracies +result in errors in depth estimation for the camera branch, ultimately causing +misalignment between LiDAR and camera BEV features. In this work, we propose a +robust fusion framework called Graph BEV. Addressing errors caused by +inaccurate point cloud projection, we introduce a Local Align module that +employs neighbor-aware depth features via Graph matching. Additionally, we +propose a Global Align module to rectify the misalignment between LiDAR and +camera BEV features. Our Graph BEV framework achieves state-of-the-art +performance, with an mAP of 70.1\%, surpassing BEV Fusion by 1.6\% on the +nuscenes validation set. Importantly, our Graph BEV outperforms BEV Fusion by +8.3\% under conditions with misalignment noise. + +
+
+
+
+
+ + ♻ ☆ Exploring the Potential of Large Foundation Models for Open-Vocabulary + HOI Detection + + +
+ Open-vocabulary human-object interaction (HOI) detection, which is concerned +with the problem of detecting novel HOIs guided by natural language, is crucial +for understanding human-centric scenes. However, prior zero-shot HOI detectors +often employ the same levels of feature maps to model HOIs with varying +distances, leading to suboptimal performance in scenes containing human-object +pairs with a wide range of distances. In addition, these detectors primarily +rely on category names and overlook the rich contextual information that +language can provide, which is essential for capturing open vocabulary concepts +that are typically rare and not well-represented by category names alone. In +this paper, we introduce a novel end-to-end open vocabulary HOI detection +framework with conditional multi-level decoding and fine-grained semantic +enhancement (CMD-SE), harnessing the potential of Visual-Language Models +(VLMs). Specifically, we propose to model human-object pairs with different +distances with different levels of feature maps by incorporating a soft +constraint during the bipartite matching process. Furthermore, by leveraging +large language models (LLMs) such as GPT models, we exploit their extensive +world knowledge to generate descriptions of human body part states for various +interactions. Then we integrate the generalizable and fine-grained semantics of +human body parts to improve interaction recognition. Experimental results on +two datasets, SWIG-HOI and HICO-DET, demonstrate that our proposed method +achieves state-of-the-art results in open vocabulary HOI detection. The code +and models are available at https://github.com/ltttpku/CMD-SE-release. + +
+
+
+
+
+ + ♻ ☆ Towards Enhanced Analysis of Lung Cancer Lesions in EBUS-TBNA -- A + Semi-Supervised Video Object Detection Method + + +
+ This study aims to establish a computer-aided diagnostic system for lung lesions using bronchoscope endobronchial ultrasound (EBUS) to assist physicians in identifying lesion areas. During EBUS-transbronchial needle aspiration (EBUS-TBNA) procedures, physicians rely on grayscale ultrasound images to determine the location of lesions. However, these images often contain significant noise and can be influenced by surrounding tissues or blood vessels, making interpretation challenging. Previous research has lacked the application of object detection models to EBUS-TBNA, and there has been no well-defined solution for annotating the EBUS-TBNA dataset. In related studies on ultrasound images, although models have been successful in capturing target regions for their respective tasks, their training and predictions have been based on two-dimensional images, limiting their ability to leverage temporal features for improved predictions. This study introduces a three-dimensional image-based object detection model. It utilizes an attention mechanism to capture temporal correlations and implements a filtering mechanism to select relevant information from previous frames. Subsequently, a teacher-student model training approach is employed to optimize the model further, leveraging unlabeled data. To mitigate the impact of poor-quality pseudo-labels on the student model, we add a Gaussian Mixture Model (GMM) to ensure the quality of pseudo-labels.
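+ Editor's sketch: a GMM-based pseudo-label quality gate of the kind described above is often implemented by fitting two components to the teacher's confidence scores and keeping detections from the higher-mean component; the snippet below shows that generic recipe, not the authors' exact criterion.
+ import numpy as np
+ from sklearn.mixture import GaussianMixture
+
+ def filter_pseudo_labels(confidences):
+     # confidences: (M,) teacher confidence scores for candidate pseudo-labeled boxes.
+     scores = confidences.reshape(-1, 1)
+     gmm = GaussianMixture(n_components=2, random_state=0).fit(scores)
+     reliable = int(np.argmax(gmm.means_.ravel()))      # the "high-quality" component
+     return gmm.predict(scores) == reliable             # boolean mask of boxes to keep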
+
+
+
+
+ + ♻ ☆ Using Few-Shot Learning to Classify Primary Lung Cancer and Other + Malignancy with Lung Metastasis in Cytological Imaging via Endobronchial + Ultrasound Procedures + + +
+ This study aims to establish a computer-aided diagnosis system for endobronchial ultrasound (EBUS) surgery to assist physicians in the preliminary diagnosis of metastatic cancer. This involves arranging immediate examinations for other sites of metastatic cancer after EBUS surgery, eliminating the need to wait for reports, thereby shortening the waiting time by more than half and enabling patients to detect other cancers earlier, allowing for early planning and implementation of treatment plans. Unlike previous studies on cell image classification, which have abundant datasets for training, this study must be able to make effective classifications despite the limited amount of case data for lung metastatic cancer. Among small-dataset classification methods, few-shot learning (FSL) has become mainstream in recent years. Through its ability to train on small datasets and its strong generalization capabilities, FSL shows potential in this task of lung metastatic cell image classification. This study adopts a few-shot learning approach, referencing existing models and designing a model architecture for classifying lung metastasis cell images. Batch Spectral Regularization (BSR) is incorporated as an additional loss term, and the Finetune method of PMF is modified. In terms of test results, adding BSR and the modified Finetune method further increases the accuracy by 8.89% to 65.60%, outperforming other FSL methods. This study confirms that FSL is superior to supervised and transfer learning in classifying metastatic cancer and demonstrates that using BSR as a loss function and modifying Finetune can enhance the model's capabilities.
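+ Editor's sketch: one common formulation of Batch Spectral Regularization penalizes the largest singular values of the batch feature matrix during fine-tuning; the snippet below shows that generic form, while the exact variant and weighting used in this study may differ.
+ import torch
+
+ def batch_spectral_regularization(features, k=1, weight=1e-3):
+     # features: (batch, dim) embeddings from the backbone.
+     # Penalize the top-k singular values of the batch feature matrix; the result is
+     # added to the classification loss during few-shot fine-tuning.
+     s = torch.linalg.svdvals(features)       # singular values, descending order
+     return weight * (s[:k] ** 2).sum()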
+
+
+
+
+ + ♻ ☆ Pyramid Deep Fusion Network for Two-Hand Reconstruction from RGB-D + Images + + +
+ Accurately recovering the dense 3D mesh of both hands from monocular images poses considerable challenges due to occlusions and projection ambiguity. Most of the existing methods extract features from color images to estimate the root-aligned hand meshes, which neglect the crucial depth and scale information in the real world. Given the noisy sensor measurements with limited resolution, depth-based methods predict 3D keypoints rather than a dense mesh. These limitations motivate us to take advantage of these two complementary inputs to acquire dense hand meshes on a real-world scale. In this work, we propose an end-to-end framework for recovering dense meshes for both hands, which employs single-view RGB-D image pairs as input. The primary challenge lies in effectively utilizing two different input modalities to mitigate the blurring effects in RGB images and noises in depth images. Instead of directly treating depth maps as additional channels for RGB images, we encode the depth information into the unordered point cloud to preserve more geometric details. Specifically, our framework employs ResNet50 and PointNet++ to derive features from RGB and point cloud, respectively. Additionally, we introduce a novel pyramid deep fusion network (PDFNet) to aggregate features at different scales, which demonstrates superior efficacy compared to previous fusion strategies. Furthermore, we employ a GCN-based decoder to process the fused features and recover the corresponding 3D pose and dense mesh. Through comprehensive ablation experiments, we have not only demonstrated the effectiveness of our proposed fusion algorithm but also outperformed the state-of-the-art approaches on publicly available datasets. To reproduce the results, we will make our source code and models publicly available at https://github.com/zijinxuxu/PDFNet.
+
+ comment: Accepted by TCSVT +
+
+
+
+
+ + ♻ ☆ CitDet: A Benchmark Dataset for Citrus Fruit Detection + + +
+ In this letter, we present a new dataset to advance the state of the art in detecting citrus fruit and accurately estimating yield on trees affected by the Huanglongbing (HLB) disease in orchard environments via imaging. Despite the fact that significant progress has been made in solving the fruit detection problem, the lack of publicly available datasets has complicated direct comparison of results. For instance, citrus detection has long been of interest to the agricultural research community, yet there is an absence of work, particularly involving public datasets of citrus affected by HLB. To address this issue, we enhance state-of-the-art object detection methods for use in typical orchard settings. Concretely, we provide high-resolution images of citrus trees located in an area known to be highly affected by HLB, along with high-quality bounding box annotations of citrus fruit. Fruit on both the trees and the ground are labeled to allow for identification of fruit location, which contributes to advancements in yield estimation and potential measure of HLB impact via fruit drop. The dataset consists of over 32,000 bounding box annotations for fruit instances contained in 579 high-resolution images. In summary, our contributions are the following: (i) we introduce a novel dataset along with baseline performance benchmarks on multiple contemporary object detection algorithms, (ii) we show the ability to accurately capture fruit location on tree or on ground, and finally (iii) we present a correlation of our results with yield estimations.
+
+ comment: Submitted to IEEE Robotics and Automation Letters (RA-L) +
+
+
+
+
+ + ♻ ☆ A Generic Shared Attention Mechanism for Various Backbone Neural + Networks + + +
+ The self-attention mechanism has emerged as a critical component for improving the performance of various backbone neural networks. However, current mainstream approaches individually incorporate newly designed self-attention modules (SAMs) into each layer of the network without fully exploiting their parameters' potential. This leads to suboptimal performance and increased parameter consumption as the network depth increases. To improve this paradigm, in this paper, we first present a counterintuitive but inherent phenomenon: SAMs tend to produce strongly correlated attention maps across different layers, with an average Pearson correlation coefficient of up to 0.85. Inspired by this inherent observation, we propose Dense-and-Implicit Attention (DIA), which directly shares SAMs across layers and employs a long short-term memory module to calibrate and bridge the highly correlated attention maps of different layers, thus improving the parameter utilization efficiency of SAMs. This design of DIA is also consistent with the dynamical systems perspective on neural networks. Through extensive experiments, we demonstrate that our simple yet effective DIA can consistently enhance various network backbones, including ResNet, Transformer, and UNet, across tasks such as image classification, object detection, and image generation using diffusion models.
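+ Editor's sketch: the layer-sharing idea can be pictured with the module below, where a single channel-attention block is reused by every layer and an LSTM cell carries a state across layers to calibrate the correlated attention maps. The squeeze-style descriptor, the gating choice, and the assumption that all layers share one channel width are illustrative simplifications, not the exact DIA design.
+ import torch
+ import torch.nn as nn
+
+ class SharedAttention(nn.Module):
+     # One attention module shared by all layers; an LSTM state links consecutive layers.
+     def __init__(self, channels):
+         super().__init__()
+         self.pool = nn.AdaptiveAvgPool2d(1)
+         self.lstm = nn.LSTMCell(channels, channels)
+         self.gate = nn.Linear(channels, channels)
+
+     def forward(self, x, state=None):            # x: (B, C, H, W) from any layer
+         ctx = self.pool(x).flatten(1)            # (B, C) layer descriptor
+         h, c = self.lstm(ctx, state)             # recurrent calibration across layers
+         a = torch.sigmoid(self.gate(h))[:, :, None, None]
+         return x * a, (h, c)                     # pass (h, c) into the next layer's call
+
+ # Usage inside a backbone: x1, st = attn(x1); x2, st = attn(x2, st); x3, st = attn(x3, st)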
+
+ comment: Work in progress. arXiv admin note: text overlap with + arXiv:1905.10671 +
+
+
+
+
+ + ♻ ☆ Flying with Photons: Rendering Novel Views of Propagating Light + + +
+ We present an imaging and neural rendering technique that seeks to synthesize videos of light propagating through a scene from novel, moving camera viewpoints. Our approach relies on a new ultrafast imaging setup to capture a first-of-its-kind, multi-viewpoint video dataset with picosecond-level temporal resolution. Combined with this dataset, we introduce an efficient neural volume rendering framework based on the transient field. This field is defined as a mapping from a 3D point and 2D direction to a high-dimensional, discrete-time signal that represents time-varying radiance at ultrafast timescales. Rendering with transient fields naturally accounts for effects due to the finite speed of light, including viewpoint-dependent appearance changes caused by light propagation delays to the camera. We render a range of complex effects, including scattering, specular reflection, refraction, and diffraction. Additionally, we demonstrate removing viewpoint-dependent propagation delays using a time warping procedure, rendering of relativistic effects, and video synthesis of direct and global components of light transport.
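+ Editor's sketch: the delayed accumulation that makes transient rendering viewpoint-dependent can be illustrated with the toy ray renderer below, which shifts each sample's time-resolved radiance by its light-travel delay to the camera before the usual volume-rendering sum. It is purely an illustration of the idea; the paper's transient-field renderer is more general.
+ import numpy as np
+
+ C = 299792458.0  # speed of light in m/s
+
+ def render_transient_pixel(sigma, tau, seg_len, dists, n_bins, bin_dt):
+     # sigma: (S,) densities, tau: (S, n_bins) time-resolved radiance at each sample,
+     # seg_len: (S,) segment lengths, dists: (S,) sample-to-camera distances.
+     alpha = 1.0 - np.exp(-sigma * seg_len)
+     trans = np.cumprod(np.concatenate(([1.0], 1.0 - alpha)))[:-1]
+     weights = trans * alpha                                  # classic volume-rendering weights
+     out = np.zeros(n_bins)
+     for w, signal, d in zip(weights, tau, dists):
+         shift = int(round((d / C) / bin_dt))                 # light-travel delay to the camera
+         if shift >= n_bins:
+             continue
+         out[shift:] += w * signal[: n_bins - shift]          # viewpoint-dependent delay
+     return out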
+
+ comment: Project page: https://anaghmalik.com/FlyingWithPhotons/ +
+
+
+
+
+ + ♻ ☆ Reconstructing Hand-Held Objects in 3D + + +
+ Objects manipulated by the hand (i.e., manipulanda) are particularly +challenging to reconstruct from in-the-wild RGB images or videos. Not only does +the hand occlude much of the object, but also the object is often only visible +in a small number of image pixels. At the same time, two strong anchors emerge +in this setting: (1) estimated 3D hands help disambiguate the location and +scale of the object, and (2) the set of manipulanda is small relative to all +possible objects. With these insights in mind, we present a scalable paradigm +for handheld object reconstruction that builds on recent breakthroughs in large +language/vision models and 3D object datasets. Our model, MCC-Hand-Object +(MCC-HO), jointly reconstructs hand and object geometry given a single RGB +image and inferred 3D hand as inputs. Subsequently, we use GPT-4(V) to retrieve +a 3D object model that matches the object in the image and rigidly align the +model to the network-inferred geometry; we call this alignment +Retrieval-Augmented Reconstruction (RAR). Experiments demonstrate that MCC-HO +achieves state-of-the-art performance on lab and Internet datasets, and we show +how RAR can be used to automatically obtain 3D labels for in-the-wild images of +hand-object interactions. + +
+
+ comment: Project page: https://janehwu.github.io/mcc-ho +
+
+
+
+
+ + ♻ ☆ Phase Guided Light Field for Spatial-Depth High Resolution 3D Imaging + + +
+ For 3D imaging, light field cameras are typically single-shot; however, they heavily suffer from low spatial resolution and depth accuracy. In this paper, by employing an optical projector to project a single group of high-frequency phase-shifted sinusoid patterns, we propose a phase guided light field algorithm to significantly improve both the spatial and depth resolutions for off-the-shelf light field cameras. First, for correcting the axial aberrations caused by the main lens of our light field camera, we propose a deformed cone model to calibrate our structured light field system. Second, over wrapped phases computed from patterned images, we propose a stereo matching algorithm, i.e. phase guided sum of absolute difference, to robustly obtain the correspondence for each pair of neighboring lenslets. Finally, by introducing a virtual camera according to the basic geometrical optics of light field imaging, we propose a reorganization strategy to reconstruct 3D point clouds with high spatial and depth resolution. Experimental results show that, compared with the state-of-the-art active light field methods, the proposed method reconstructs 3D point clouds at a spatial resolution of 1280$\times$720, a 10$\times$ increase, while maintaining the same high depth resolution and needing merely a single group of high-frequency patterns.
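+ Editor's sketch: the matching criterion named above (phase guided sum of absolute difference) can be illustrated with the simplified 1D-disparity search below, which compares wrapped-phase patches between two neighboring lenslet views. Window handling and the disparity model are illustrative assumptions.
+ import numpy as np
+
+ def phase_guided_sad(phase_ref, phase_tgt, x, y, disparities, win=3):
+     # Match pixel (x, y) between two neighboring lenslet views using wrapped phase.
+     ref = phase_ref[y - win:y + win + 1, x - win:x + win + 1]
+     best_d, best_cost = None, np.inf
+     for d in disparities:
+         tgt = phase_tgt[y - win:y + win + 1, x - d - win:x - d + win + 1]
+         diff = np.angle(np.exp(1j * (ref - tgt)))   # wrap-aware phase difference
+         cost = np.abs(diff).sum()                   # sum of absolute differences
+         if cost < best_cost:
+             best_cost, best_d = cost, d
+     return best_d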
+
+
+
+
+ + ♻ ☆ Text-Based Reasoning About Vector Graphics + + +
+ While large multimodal models excel in broad vision-language benchmarks, they +often struggle with tasks requiring precise perception of low-level visual +details, such as comparing line lengths or solving simple mazes. In particular, +this failure mode persists in question-answering tasks about vector graphics -- +images composed purely of 2D objects and shapes. To address this challenge, we +propose the Visually Descriptive Language Model (VDLM), which performs +text-based reasoning about vector graphics. VDLM leverages Scalable Vector +Graphics (SVG) for a more precise visual description and first uses an +off-the-shelf raster-to-SVG algorithm for encoding. Since existing language +models cannot understand raw SVGs in a zero-shot setting, VDLM then bridges SVG +with pretrained language models through a newly introduced intermediate +symbolic representation, Primal Visual Description (PVD), comprising primitive +attributes (e.g., shape, position, measurement) with their corresponding +predicted values. PVD is task-agnostic and represents visual primitives that +are universal across all vector graphics. It can be learned with procedurally +generated (SVG, PVD) pairs and also enables the direct use of LLMs for +generalization to complex reasoning tasks. By casting an image to a text-based +representation, we can leverage the power of language models to learn alignment +from SVG to visual primitives and generalize to unseen question-answering +tasks. Empirical results show that VDLM achieves stronger zero-shot performance +compared to state-of-the-art LMMs, such as GPT-4V, in various low-level +multimodal perception and reasoning tasks on vector graphics. We additionally +present extensive analyses on VDLM's performance, demonstrating that our +framework offers better interpretability due to its disentangled perception and +reasoning processes. Project page: https://mikewangwzhl.github.io/VDLM/ + +
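+ Editor's sketch: a Primal Visual Description of the kind described above might look like the hypothetical instance below; the actual PVD schema used by VDLM may differ, but the point is that an LLM can answer questions (for example, which line is longer) over this purely textual form.
+ # Hypothetical PVD for a simple vector graphic: a list of primitives with attributes.
+ pvd = [
+     {"type": "circle",  "center": [120, 80], "radius": 30, "color": "blue"},
+     {"type": "line",    "start": [10, 10],   "end": [200, 10], "width": 2},
+     {"type": "polygon", "points": [[40, 40], [90, 40], [65, 90]], "fill": "none"},
+ ]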
+
+ comment: Project page: https://mikewangwzhl.github.io/VDLM/ +
+
+
+
+
+ + ♻ ☆ Multi-Level Label Correction by Distilling Proximate Patterns for + Semi-supervised Semantic Segmentation + + +
+ Semi-supervised semantic segmentation relieves the reliance on large-scale labeled data by leveraging unlabeled data. Recent semi-supervised semantic segmentation approaches mainly resort to pseudo-labeling methods to exploit unlabeled data. However, unreliable pseudo-labeling can undermine the semi-supervision process. In this paper, we propose an algorithm called Multi-Level Label Correction (MLLC), which aims to use graph neural networks to capture structural relationships in Semantic-Level Graphs (SLGs) and Class-Level Graphs (CLGs) to rectify erroneous pseudo-labels. Specifically, SLGs represent semantic affinities between pairs of pixel features, and CLGs describe classification consistencies between pairs of pixel labels. With the support of proximate pattern information from graphs, MLLC can rectify incorrectly predicted pseudo-labels and can facilitate discriminative feature representations. We design an end-to-end network to train and perform this effective label correction mechanism. Experiments demonstrate that MLLC can significantly improve supervised baselines and outperforms state-of-the-art approaches in different scenarios on the Cityscapes and PASCAL VOC 2012 datasets. Specifically, MLLC improves the supervised baseline by at least 5% and 2% with DeepLabV2 and DeepLabV3+ respectively under different partition protocols.
+
+ comment: 12 pages, 8 figures. IEEE Transactions on Multimedia, 2024 +
+
+
+
+
+ + ♻ ☆ Leveraging Diffusion For Strong and High Quality Face Morphing Attacks + + +
+ Face morphing attacks seek to deceive a Face Recognition (FR) system by +presenting a morphed image consisting of the biometric qualities from two +different identities with the aim of triggering a false acceptance with one of +the two identities, thereby presenting a significant threat to biometric +systems. The success of a morphing attack is dependent on the ability of the +morphed image to represent the biometric characteristics of both identities +that were used to create the image. We present a novel morphing attack that +uses a Diffusion-based architecture to improve the visual fidelity of the image +and the ability of the morphing attack to represent characteristics from both +identities. We demonstrate the effectiveness of the proposed attack by +evaluating its visual fidelity via the Frechet Inception Distance (FID). Also, +extensive experiments are conducted to measure the vulnerability of FR systems +to the proposed attack. The ability of a morphing attack detector to detect the +proposed attack is measured and compared against two state-of-the-art GAN-based +morphing attacks along with two Landmark-based attacks. Additionally, a novel +metric to measure the relative strength between different morphing attacks is +introduced and evaluated. + +
+
+ comment: Diffusion Morphs (DiM) paper. Accepted in IEEE TBIOM +
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Attention and Gaussian Processes for Personalized Video + Gaze Estimation CVPR 2024 + + +
+ Gaze is an essential prompt for analyzing human behavior and attention. +Recently, there has been an increasing interest in determining gaze direction +from facial videos. However, video gaze estimation faces significant +challenges, such as understanding the dynamic evolution of gaze in video +sequences, dealing with static backgrounds, and adapting to variations in +illumination. To address these challenges, we propose a simple and novel deep +learning model designed to estimate gaze from videos, incorporating a +specialized attention module. Our method employs a spatial attention mechanism +that tracks spatial dynamics within videos. This technique enables accurate +gaze direction prediction through a temporal sequence model, adeptly +transforming spatial observations into temporal insights, thereby significantly +improving gaze estimation accuracy. Additionally, our approach integrates +Gaussian processes to include individual-specific traits, facilitating the +personalization of our model with just a few labeled samples. Experimental +results confirm the efficacy of the proposed approach, demonstrating its +success in both within-dataset and cross-dataset settings. Specifically, our +proposed approach achieves state-of-the-art performance on the Gaze360 dataset, +improving by $2.5^\circ$ without personalization. Further, by personalizing the +model with just three samples, we achieved an additional improvement of +$0.8^\circ$. The code and pre-trained models are available at +\url{https://github.com/jswati31/stage}. + +
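+ Editor's sketch: Gaussian-process personalization from a few labeled samples is often done by modeling the subject-specific residual of a base predictor; the snippet below shows that generic recipe as an illustration of the idea, not the paper's exact formulation.
+ import numpy as np
+ from sklearn.gaussian_process import GaussianProcessRegressor
+ from sklearn.gaussian_process.kernels import RBF, WhiteKernel
+
+ def personalize_gaze(feat_calib, residual_calib, feat_test, base_pred_test):
+     # residual_calib = ground-truth gaze - base-model prediction on a few calibration
+     # samples; a GP learns this subject-specific offset and corrects the test output.
+     gp = GaussianProcessRegressor(kernel=RBF() + WhiteKernel(), normalize_y=True)
+     gp.fit(feat_calib, residual_calib)
+     return base_pred_test + gp.predict(feat_test)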
+
+ comment: Accepted at CVPR 2024 Gaze workshop +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 169 + +
+
+
+ + ☆ InternLM-XComposer2-4KHD: A Pioneering Large Vision-Language Model + Handling Resolutions from 336 Pixels to 4K HD + + +
+ The Large Vision-Language Model (LVLM) field has seen significant advancements, yet its progression has been hindered by challenges in comprehending fine-grained visual content due to limited resolution. Recent efforts have aimed to enhance the high-resolution understanding capabilities of LVLMs, yet they remain capped at approximately 1500 x 1500 pixels and constrained to a relatively narrow resolution range. This paper presents InternLM-XComposer2-4KHD, a groundbreaking exploration into elevating LVLM resolution capabilities up to 4K HD (3840 x 1600) and beyond. Concurrently, considering that ultra-high resolution may not be necessary in all scenarios, it supports a wide range of diverse resolutions from 336 pixels to the 4K standard, significantly broadening its scope of applicability. Specifically, this research advances the patch division paradigm by introducing a novel extension: dynamic resolution with automatic patch configuration. It maintains the training image aspect ratios while automatically varying patch counts and configuring layouts based on a pre-trained Vision Transformer (ViT) (336 x 336), leading to dynamic training resolution from 336 pixels to the 4K standard. Our research demonstrates that scaling training resolution up to 4K HD leads to consistent performance enhancements without hitting the ceiling of potential improvements. InternLM-XComposer2-4KHD shows superb capability that matches or even surpasses GPT-4V and Gemini Pro in 10 of the 16 benchmarks. The InternLM-XComposer2-4KHD model series with 7B parameters are publicly available at https://github.com/InternLM/InternLM-XComposer.
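+ Editor's sketch: dynamic resolution with automatic patch configuration can be pictured as choosing a rows x cols grid of base-sized (336 px) crops whose aspect ratio best matches the input image under a patch budget, as in the helper below. The selection rule and the patch budget are illustrative assumptions, not the model's exact algorithm.
+ import math
+
+ def dynamic_patch_layout(img_w, img_h, base=336, max_patches=55):
+     # Return (rows, cols) of 336-px patches and the target resize resolution (W, H).
+     best, best_err = (1, 1), float("inf")
+     for rows in range(1, max_patches + 1):
+         for cols in range(1, max_patches // rows + 1):
+             # log-ratio between the grid aspect ratio and the image aspect ratio
+             err = abs(math.log((cols / rows) * (img_h / img_w)))
+             if err < best_err:
+                 best_err, best = err, (rows, cols)
+     rows, cols = best
+     return rows, cols, (cols * base, rows * base)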
+
+ comment: Code and models are publicly available at + https://github.com/InternLM/InternLM-XComposer +
+
+
+
+
+ + ☆ MoReVQA: Exploring Modular Reasoning Models for Video Question Answering CVPR 2024 + + +
+ This paper addresses the task of video question answering (videoQA) via a +decomposed multi-stage, modular reasoning framework. Previous modular methods +have shown promise with a single planning stage ungrounded in visual content. +However, through a simple and effective baseline, we find that such systems can +lead to brittle behavior in practice for challenging videoQA settings. Thus, +unlike traditional single-stage planning methods, we propose a multi-stage +system consisting of an event parser, a grounding stage, and a final reasoning +stage in conjunction with an external memory. All stages are training-free, and +performed using few-shot prompting of large models, creating interpretable +intermediate outputs at each stage. By decomposing the underlying planning and +task complexity, our method, MoReVQA, improves over prior work on standard +videoQA benchmarks (NExT-QA, iVQA, EgoSchema, ActivityNet-QA) with +state-of-the-art results, and extensions to related tasks (grounded videoQA, +paragraph captioning). + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Can Feedback Enhance Semantic Grounding in Large Vision-Language Models? + + +
+ Enhancing semantic grounding abilities in Vision-Language Models (VLMs) often +involves collecting domain-specific training data, refining the network +architectures, or modifying the training recipes. In this work, we venture into +an orthogonal direction and explore whether VLMs can improve their semantic +grounding by "receiving" feedback, without requiring in-domain data, +fine-tuning, or modifications to the network architectures. We systematically +analyze this hypothesis using a feedback mechanism composed of a binary signal. +We find that if prompted appropriately, VLMs can utilize feedback both in a +single step and iteratively, showcasing the potential of feedback as an +alternative technique to improve grounding in internet-scale VLMs. Furthermore, +VLMs, like LLMs, struggle to self-correct errors out-of-the-box. However, we +find that this issue can be mitigated via a binary verification mechanism. +Finally, we explore the potential and limitations of amalgamating these +findings and applying them iteratively to automatically enhance VLMs' grounding +performance, showing grounding accuracy consistently improves using automated +feedback across all models in all settings investigated. Overall, our iterative +framework improves semantic grounding in VLMs by more than 15 accuracy points +under noise-free feedback and up to 5 accuracy points under a simple automated +binary verification mechanism. The project website is hosted at +https://andrewliao11.github.io/vlms_feedback + +
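+ Editor's sketch: the iterative use of a binary feedback signal described above amounts to a simple query-verify-retry loop, as in the schematic below. The `vlm` and `verify` callables and the prompt wording are placeholders standing in for the model and the (oracle or automated) verifier, not a real API.
+ def ground_with_feedback(vlm, verify, image, query, max_rounds=3):
+     # Ask the VLM for a grounding (e.g., a bounding box) and refine it with binary feedback.
+     answer = vlm(image, f"Locate the {query} and return a bounding box.")
+     for _ in range(max_rounds):
+         if verify(image, query, answer):          # binary signal: grounding accepted
+             break
+         answer = vlm(image, f"The previous box {answer} for '{query}' was wrong. Try again.")
+     return answer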
+
+ comment: 31 pages, 15 figures +
+
+
+
+
+ + ☆ Reconstructing Hand-Held Objects in 3D + + +
+ Objects manipulated by the hand (i.e., manipulanda) are particularly +challenging to reconstruct from in-the-wild RGB images or videos. Not only does +the hand occlude much of the object, but also the object is often only visible +in a small number of image pixels. At the same time, two strong anchors emerge +in this setting: (1) estimated 3D hands help disambiguate the location and +scale of the object, and (2) the set of manipulanda is small relative to all +possible objects. With these insights in mind, we present a scalable paradigm +for handheld object reconstruction that builds on recent breakthroughs in large +language/vision models and 3D object datasets. Our model, MCC-Hand-Object +(MCC-HO), jointly reconstructs hand and object geometry given a single RGB +image and inferred 3D hand as inputs. Subsequently, we use GPT-4(V) to retrieve +a 3D object model that matches the object in the image and rigidly align the +model to the network-inferred geometry; we call this alignment +Retrieval-Augmented Reconstruction (RAR). Experiments demonstrate that MCC-HO +achieves state-of-the-art performance on lab and Internet datasets, and we show +how RAR can be used to automatically obtain 3D labels for in-the-wild images of +hand-object interactions. + +
+
+
+
+
+ + ☆ Flying With Photons: Rendering Novel Views of Propagating Light + + +
+ We present an imaging and neural rendering technique that seeks to synthesize videos of light propagating through a scene from novel, moving camera viewpoints. Our approach relies on a new ultrafast imaging setup to capture a first-of-its-kind, multi-viewpoint video dataset with picosecond-level temporal resolution. Combined with this dataset, we introduce an efficient neural volume rendering framework based on the transient field. This field is defined as a mapping from a 3D point and 2D direction to a high-dimensional, discrete-time signal that represents time-varying radiance at ultrafast timescales. Rendering with transient fields naturally accounts for effects due to the finite speed of light, including viewpoint-dependent appearance changes caused by light propagation delays to the camera. We render a range of complex effects, including scattering, specular reflection, refraction, and diffraction. Additionally, we demonstrate removing viewpoint-dependent propagation delays using a time warping procedure, rendering of relativistic effects, and video synthesis of direct and global components of light transport.
+
+ comment: Project page: https://anaghmalik.com/FlyingWithPhotons/ +
+
+
+
+
+ + ☆ RhythmMamba: Fast Remote Physiological Measurement with Arbitrary Length + Videos + + +
+ Remote photoplethysmography (rPPG) is a non-contact method for detecting
+physiological signals from facial videos, holding great potential in various
+applications such as healthcare, affective computing, and anti-spoofing.
+Existing deep learning methods struggle to address two core issues of rPPG
+simultaneously: extracting weak rPPG signals from video segments with large
+spatiotemporal redundancy and understanding the periodic patterns of rPPG over
+long contexts. This represents a trade-off between computational complexity and
+the ability to capture long-range dependencies, posing a challenge for rPPG
+methods intended for deployment on mobile devices. Based on an in-depth
+exploration of Mamba's comprehension of spatial and temporal information, this
+paper introduces RhythmMamba, an end-to-end Mamba-based method that employs
+multi-temporal Mamba to constrain both periodic patterns and short-term trends,
+coupled with a frequency-domain feed-forward module to enable Mamba to robustly
+understand the quasi-periodic patterns of rPPG. Extensive experiments show that
+RhythmMamba achieves state-of-the-art performance with reduced parameters and
+lower computational complexity. The proposed RhythmMamba can be applied to
+video segments of any length without performance degradation. The code is
+available at https://github.com/zizheng-guo/RhythmMamba.
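+
+ Independent of the RhythmMamba architecture, the quasi-periodicity of rPPG is
+typically summarized as a heart-rate estimate taken from the dominant spectral
+peak of the predicted signal. The snippet below shows only that standard
+post-processing step on a synthetic signal:
+
+import numpy as np
+
+def estimate_heart_rate(rppg: np.ndarray, fs: float = 30.0) -> float:
+    """Dominant-frequency heart rate (beats per minute) within the 0.7-3 Hz band."""
+    rppg = rppg - rppg.mean()
+    freqs = np.fft.rfftfreq(len(rppg), d=1.0 / fs)
+    power = np.abs(np.fft.rfft(rppg)) ** 2
+    band = (freqs >= 0.7) & (freqs <= 3.0)
+    return 60.0 * freqs[band][np.argmax(power[band])]
+
+t = np.arange(0, 20, 1 / 30.0)  # 20 s of video at 30 fps
+signal = np.sin(2 * np.pi * 1.2 * t) + 0.3 * np.random.default_rng(0).normal(size=t.size)
+print(estimate_heart_rate(signal))  # ~72 bpm for a 1.2 Hz pulse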
+
+ comment: arXiv admin note: text overlap with arXiv:2402.12788 +
+
+
+
+
+ + ☆ Text-Based Reasoning About Vector Graphics + + +
+ While large multimodal models excel in broad vision-language benchmarks, they +often struggle with tasks requiring precise perception of low-level visual +details, such as comparing line lengths or solving simple mazes. In particular, +this failure mode persists in question-answering tasks about vector graphics -- +images composed purely of 2D objects and shapes. To address this challenge, we +propose the Visually Descriptive Language Model (VDLM), which performs +text-based reasoning about vector graphics. VDLM leverages Scalable Vector +Graphics (SVG) for a more precise visual description and first uses an +off-the-shelf raster-to-SVG algorithm for encoding. Since existing language +models cannot understand raw SVGs in a zero-shot setting, VDLM then bridges SVG +with pretrained language models through a newly introduced intermediate +symbolic representation, Primal Visual Description (PVD), comprising primitive +attributes (e.g., shape, position, measurement) with their corresponding +predicted values. PVD is task-agnostic and represents visual primitives that +are universal across all vector graphics. It can be learned with procedurally +generated (SVG, PVD) pairs and also enables the direct use of LLMs for +generalization to complex reasoning tasks. By casting an image to a text-based +representation, we can leverage the power of language models to learn alignment +from SVG to visual primitives and generalize to unseen question-answering +tasks. Empirical results show that VDLM achieves stronger zero-shot performance +compared to state-of-the-art LMMs, such as GPT-4V, in various low-level +multimodal perception and reasoning tasks on vector graphics. We additionally +present extensive analyses on VDLM's performance, demonstrating that our +framework offers better interpretability due to its disentangled perception and +reasoning processes. Project page: https://mikewangwzhl.github.io/VDLM/ + +
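+
+ The key step is converting an image into a structured, text-friendly
+description of primitives that a language model can reason over. The record
+layout and prompt format below are illustrative assumptions, not the paper's
+exact PVD schema:
+
+from dataclasses import dataclass, asdict
+import json
+
+@dataclass
+class Primitive:
+    shape: str          # e.g. "circle", "line", "rectangle"
+    position: tuple     # (x, y) in pixel coordinates
+    measurement: float  # e.g. radius or length in pixels
+
+def pvd_to_prompt(primitives, question):
+    """Serialize PVD-style primitives into a text block for a language model."""
+    lines = [json.dumps(asdict(p)) for p in primitives]
+    return "Visual primitives:\n" + "\n".join(lines) + f"\nQuestion: {question}"
+
+print(pvd_to_prompt(
+    [Primitive("line", (10, 10), 42.0), Primitive("line", (10, 40), 87.5)],
+    "Which line is longer?"))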
+
+ comment: Project page: https://mikewangwzhl.github.io/VDLM/ +
+
+
+
+
+ + ☆ Learning State-Invariant Representations of Objects from Image + Collections with State, Pose, and Viewpoint Changes + + +
+ We add one more invariance - state invariance - to the other invariances more
+commonly used for learning object representations for recognition and
+retrieval. By state invariance, we mean robustness with respect to changes in
+the structural form of the object, such as when an umbrella is folded or when
+an item of clothing is tossed on the floor. Since humans generally have no
+difficulty in recognizing objects despite such state changes, we are naturally
+faced with the question of whether it is possible to devise a neural
+architecture with similar abilities. To that end, we present a novel dataset,
+ObjectsWithStateChange, that captures state and pose variations in object
+images recorded from arbitrary viewpoints. We believe that this dataset will
+facilitate research in fine-grained object recognition and retrieval of objects
+that are capable of state changes. The goal of such research is to train models
+capable of generating object embeddings that remain invariant to state changes
+while also staying invariant to transformations induced by changes in
+viewpoint, pose, illumination, etc. To demonstrate the usefulness of the
+ObjectsWithStateChange dataset, we also propose a curriculum learning strategy
+that uses the similarity relationships in the learned embedding space after
+each epoch to guide the training process. The model learns discriminative
+features by comparing visually similar objects within and across different
+categories, encouraging it to differentiate between objects that may be
+challenging to distinguish due to changes in their state. We believe that this
+strategy enhances the model's ability to capture discriminative features for
+fine-grained tasks that may involve objects with state changes, leading to
+performance improvements on object-level tasks not only on our new dataset but
+also on two other challenging multi-view datasets, ModelNet40 and ObjectPI.
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ A comparative analysis of deep learning models for lung segmentation on + X-ray images + + +
+ Robust and highly accurate lung segmentation in X-rays is crucial in medical
+imaging. This study evaluates deep learning solutions for this task, ranking
+existing methods and analyzing their performance under diverse image
+modifications. Out of 61 analyzed papers, only nine offered an implementation
+or pre-trained models, enabling assessment of three prominent methods: Lung
+VAE, TransResUNet, and CE-Net. The analysis revealed that CE-Net performs best,
+achieving the highest Dice similarity coefficient and intersection-over-union
+scores.
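+
+ For reference, the two reported metrics can be computed directly from binary
+masks; this is the standard definition, not code from the compared methods:
+
+import numpy as np
+
+def dice_and_iou(pred: np.ndarray, target: np.ndarray, eps: float = 1e-7):
+    """Dice similarity coefficient and intersection over union for binary masks."""
+    pred, target = pred.astype(bool), target.astype(bool)
+    inter = np.logical_and(pred, target).sum()
+    union = np.logical_or(pred, target).sum()
+    dice = (2.0 * inter + eps) / (pred.sum() + target.sum() + eps)
+    iou = (inter + eps) / (union + eps)
+    return dice, iou
+
+pred = np.zeros((8, 8), dtype=np.uint8); pred[2:6, 2:6] = 1
+gt = np.zeros((8, 8), dtype=np.uint8);   gt[3:7, 3:7] = 1
+print(dice_and_iou(pred, gt))  # approximately (0.56, 0.39)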
+
+ comment: published at the Polish Conference on Artificial Intelligence + (PP-RAI), 2024 +
+
+
+
+
+ + ☆ PURE: Turning Polysemantic Neurons Into Pure Features by Identifying + Relevant Circuits + + +
+ The field of mechanistic interpretability aims to study the role of +individual neurons in Deep Neural Networks. Single neurons, however, have the +capability to act polysemantically and encode for multiple (unrelated) +features, which renders their interpretation difficult. We present a method for +disentangling polysemanticity of any Deep Neural Network by decomposing a +polysemantic neuron into multiple monosemantic "virtual" neurons. This is +achieved by identifying the relevant sub-graph ("circuit") for each "pure" +feature. We demonstrate how our approach allows us to find and disentangle +various polysemantic units of ResNet models trained on ImageNet. While +evaluating feature visualizations using CLIP, our method effectively +disentangles representations, improving upon methods based on neuron +activations. Our code is available at https://github.com/maxdreyer/PURE. + +
+
+ comment: 14 pages (4 pages manuscript, 2 pages references, 8 pages appendix) +
+
+
+
+
+ + ☆ SmartControl: Enhancing ControlNet for Handling Rough Visual Conditions + + +
+ Human visual imagination usually begins with analogies or rough sketches. For
+example, given an image of a girl playing guitar in front of a building, one
+may analogously imagine how it would look if Iron Man were playing guitar in
+front of a pyramid in Egypt. Nonetheless, the visual condition may not be
+precisely aligned with the imaginary result indicated by the text prompt, and
+existing layout-controllable text-to-image (T2I) generation models are prone to
+producing degraded results with obvious artifacts. To address this issue, we
+present a novel T2I generation method dubbed SmartControl, which is designed to
+modify the rough visual conditions to adapt to the text prompt. The key idea of
+our SmartControl is to relax the visual condition in the areas that conflict
+with the text prompt. Specifically, a Control Scale Predictor (CSP) is designed
+to identify the conflict regions and predict the local control scales, while a
+dataset with text prompts and rough visual conditions is constructed for
+training the CSP. It is worth noting that, even with a limited number (e.g.,
+1,000~2,000) of training samples, our SmartControl can generalize well to
+unseen objects. Extensive experiments on four typical visual condition types
+clearly show the efficacy of our SmartControl against state-of-the-art methods.
+Source code, pre-trained models, and datasets are available at
+https://github.com/liuxiaoyu1104/SmartControl.
+
+
+
+
+ + ☆ The Central Spanning Tree Problem + + +
+ Spanning trees are an important primitive in many data analysis tasks, when a +data set needs to be summarized in terms of its "skeleton", or when a +tree-shaped graph over all observations is required for downstream processing. +Popular definitions of spanning trees include the minimum spanning tree and the +optimum distance spanning tree, a.k.a. the minimum routing cost tree. When +searching for the shortest spanning tree but admitting additional branching +points, even shorter spanning trees can be realized: Steiner trees. +Unfortunately, both minimum spanning and Steiner trees are not robust with +respect to noise in the observations; that is, small perturbations of the +original data set often lead to drastic changes in the associated spanning +trees. In response, we make two contributions when the data lies in a Euclidean +space: on the theoretical side, we introduce a new optimization problem, the +"(branched) central spanning tree", which subsumes all previously mentioned +definitions as special cases. On the practical side, we show empirically that +the (branched) central spanning tree is more robust to noise in the data, and +as such is better suited to summarize a data set in terms of its skeleton. We +also propose a heuristic to address the NP-hard optimization problem, and +illustrate its use on single cell RNA expression data from biology and 3D point +clouds of plants. + +
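+
+ There is no off-the-shelf solver for the (branched) central spanning tree, but
+the minimum spanning tree it subsumes as a special case is easy to compute and
+makes the robustness discussion concrete; a minimal baseline with SciPy:
+
+import numpy as np
+from scipy.spatial.distance import pdist, squareform
+from scipy.sparse.csgraph import minimum_spanning_tree
+
+points = np.random.default_rng(0).normal(size=(20, 3))  # a small 3D point cloud
+dist = squareform(pdist(points))                         # pairwise Euclidean distances
+mst = minimum_spanning_tree(dist)                        # sparse matrix of tree edges
+edges = np.argwhere(mst.toarray() > 0)
+print(len(edges), "edges, total length", mst.sum())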
+
+
+
+
+ + ☆ Multi-scale Dynamic and Hierarchical Relationship Modeling for Facial + Action Units Recognition CVPR2024 + + +
+ Human facial action units (AUs) are mutually related in a hierarchical manner,
+as not only are they associated with each other in both spatial and temporal
+domains, but AUs located in the same or nearby facial regions also show
+stronger relationships than those in different facial regions. Since no
+existing approach thoroughly models such hierarchical inter-dependencies among
+AUs, this paper proposes to comprehensively model multi-scale AU-related
+dynamic and hierarchical spatio-temporal relationships among AUs for
+recognizing their occurrences. Specifically, we first propose a novel
+multi-scale temporal differencing network with an adaptive weighting block to
+explicitly capture facial dynamics across frames at different spatial scales,
+which specifically considers the heterogeneity of range and magnitude in
+different AUs' activation. Then, a two-stage strategy is introduced to
+hierarchically model the relationship among AUs based on their spatial
+distribution (i.e., local and cross-region AU relationship modelling).
+Experimental results achieved on BP4D and DISFA show that our approach is the
+new state-of-the-art in the field of AU occurrence recognition. Our code is
+publicly available at https://github.com/CVI-SZU/MDHR.
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ QueSTMaps: Queryable Semantic Topological Maps for 3D Scene + Understanding + + +
+ Understanding the structural organisation of 3D indoor scenes in terms of +rooms is often accomplished via floorplan extraction. Robotic tasks such as +planning and navigation require a semantic understanding of the scene as well. +This is typically achieved via object-level semantic segmentation. However, +such methods struggle to segment out topological regions like "kitchen" in the +scene. In this work, we introduce a two-step pipeline. First, we extract a +topological map, i.e., floorplan of the indoor scene using a novel +multi-channel occupancy representation. Then, we generate CLIP-aligned features +and semantic labels for every room instance based on the objects it contains +using a self-attention transformer. Our language-topology alignment supports +natural language querying, e.g., a "place to cook" locates the "kitchen". We +outperform the current state-of-the-art on room segmentation by ~20% and room +classification by ~12%. Our detailed qualitative analysis and ablation studies +provide insights into the problem of joint structural and semantic 3D scene +understanding. + +
+
+
+
+
+ + ☆ Seasonal Fire Prediction using Spatio-Temporal Deep Neural Networks + + +
+ With climate change expected to exacerbate fire weather conditions, the
+accurate anticipation of wildfires on a global scale becomes increasingly
+crucial for disaster mitigation. In this study, we utilize SeasFire, a
+comprehensive global wildfire dataset with climate, vegetation, oceanic
+indices, and human-related variables, to enable seasonal wildfire forecasting
+with machine learning. For the predictive analysis, we train deep learning
+models with different architectures that capture the spatio-temporal context
+leading to wildfires. Our investigation focuses on assessing the effectiveness
+of these models in predicting the presence of burned areas at varying
+forecasting time horizons globally, extending up to six months into the future,
+and on how different spatial and/or temporal contexts affect the performance of
+the models. Our findings demonstrate the great potential of deep learning
+models in seasonal fire forecasting; longer input time series lead to more
+robust predictions across varying forecasting horizons, while integrating
+spatial information to capture wildfire spatio-temporal dynamics boosts
+performance. Finally, our results suggest that, to enhance performance at
+longer forecasting horizons, a larger spatial receptive field needs to be
+considered.
+
+
+
+
+ + ☆ pfl-research: simulation framework for accelerating research in Private + Federated Learning + + +
+ Federated learning (FL) is an emerging machine learning (ML) training +paradigm where clients own their data and collaborate to train a global model, +without revealing any data to the server and other participants. Researchers +commonly perform experiments in a simulation environment to quickly iterate on +ideas. However, existing open-source tools do not offer the efficiency required +to simulate FL on larger and more realistic FL datasets. We introduce +pfl-research, a fast, modular, and easy-to-use Python framework for simulating +FL. It supports TensorFlow, PyTorch, and non-neural network models, and is +tightly integrated with state-of-the-art privacy algorithms. We study the speed +of open-source FL frameworks and show that pfl-research is 7-72$\times$ faster +than alternative open-source frameworks on common cross-device setups. Such +speedup will significantly boost the productivity of the FL research community +and enable testing hypotheses on realistic FL datasets that were previously too +resource intensive. We release a suite of benchmarks that evaluates an +algorithm's overall performance on a diverse set of realistic scenarios. The +code is available on GitHub at https://github.com/apple/pfl-research. + +
+
+
+
+
+
+ ☆ Magic-Boost: Boost 3D Generation with Multi-View Conditioned Diffusion
+
+ Benefiting from the rapid development of 2D diffusion models, 3D content
+creation has made significant progress recently. One promising solution
+involves fine-tuning pre-trained 2D diffusion models to harness their capacity
+for producing multi-view images, which are then lifted into accurate 3D models
+via methods like fast-NeRFs or large reconstruction models. However, due to
+remaining inconsistencies and the limited resolution of the generated images,
+the results of such methods still lack intricate textures and complex
+geometries. To solve this problem, we propose Magic-Boost, a multi-view
+conditioned diffusion model that significantly refines coarse generative
+results through a brief period of SDS optimization ($\sim15$min). Compared to
+previous text- or single-image-based diffusion models, Magic-Boost exhibits a
+robust capability to generate images with high consistency from pseudo
+synthesized multi-view images. It provides precise SDS guidance that aligns
+well with the identity of the input images, enriching the local detail in both
+the geometry and texture of the initial generative results. Extensive
+experiments show that Magic-Boost greatly enhances the coarse inputs and
+generates high-quality 3D assets with rich geometric and textural details.
+(Project Page: https://magic-research.github.io/magic-boost/)
+
+
+
+
+ + ☆ ZeST: Zero-Shot Material Transfer from a Single Image + + +
+ We propose ZeST, a method for zero-shot material transfer to an object in the
+input image given a material exemplar image. ZeST leverages existing diffusion
+adapters to extract an implicit material representation from the exemplar
+image. This representation is used to transfer the material to the object in
+the input image using a pre-trained inpainting diffusion model, with depth
+estimates as a geometry cue and grayscale object shading as an illumination
+cue. The method works on real images without any training, resulting in a
+zero-shot approach. Both qualitative and quantitative results on real and
+synthetic datasets demonstrate that ZeST outputs photorealistic images with
+transferred materials. We also show the application of ZeST to perform multiple
+edits and robust material assignment under different illuminations. Project
+Page: https://ttchengab.github.io/zest
+
+ comment: Project Page: https://ttchengab.github.io/zest +
+
+
+
+
+ + ☆ Emergent Dynamics in Neural Cellular Automata + + +
+ Neural Cellular Automata (NCA) models are trainable variations of traditional +Cellular Automata (CA). Emergent motion in the patterns created by NCA has been +successfully applied to synthesize dynamic textures. However, the conditions +required for an NCA to display dynamic patterns remain unexplored. Here, we +investigate the relationship between the NCA architecture and the emergent +dynamics of the trained models. Specifically, we vary the number of channels in +the cell state and the number of hidden neurons in the MultiLayer Perceptron +(MLP), and draw a relationship between the combination of these two variables +and the motion strength between successive frames. Our analysis reveals that +the disparity and proportionality between these two variables have a strong +correlation with the emergent dynamics in the NCA output. We thus propose a +design principle for creating dynamic NCA. + +
+
+ comment: 2 pages +
+
+
+
+
+ + ☆ Raster Forge: Interactive Raster Manipulation Library and GUI for Python + + +
+ Raster Forge is a Python library and graphical user interface for raster data +manipulation and analysis. The tool is focused on remote sensing applications, +particularly in wildfire management. It allows users to import, visualize, and +process raster layers for tasks such as image compositing or topographical +analysis. For wildfire management, it generates fuel maps using predefined +models. Its impact extends from disaster management to hydrological modeling, +agriculture, and environmental monitoring. Raster Forge can be a valuable asset +for geoscientists and researchers who rely on raster data analysis, enhancing +geospatial data processing and visualization across various disciplines. + +
+
+
+
+
+ + ☆ VISION2UI: A Real-World Dataset with Layout for Code Generation from UI + Designs + + +
+ Automatically generating UI code from webpage design visions can +significantly alleviate the burden of developers, enabling beginner developers +or designers to directly generate Web pages from design diagrams. Currently, +prior research has accomplished the objective of generating UI code from +rudimentary design visions or sketches through designing deep neural networks. +Inspired by the groundbreaking advancements achieved by Multimodal Large +Language Models (MLLMs), the automatic generation of UI code from high-fidelity +design images is now emerging as a viable possibility. Nevertheless, our +investigation reveals that existing MLLMs are hampered by the scarcity of +authentic, high-quality, and large-scale datasets, leading to unsatisfactory +performance in automated UI code generation. To mitigate this gap, we present a +novel dataset, termed VISION2UI, extracted from real-world scenarios, augmented +with comprehensive layout information, tailored specifically for finetuning +MLLMs in UI code generation. Specifically, this dataset is derived through a +series of operations, encompassing collecting, cleaning, and filtering of the +open-source Common Crawl dataset. In order to uphold its quality, a neural +scorer trained on labeled samples is utilized to refine the data, retaining +higher-quality instances. Ultimately, this process yields a dataset comprising +2,000 (Much more is coming soon) parallel samples encompassing design visions +and UI code. The dataset is available at +https://huggingface.co/datasets/xcodemind/vision2ui. + +
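+
+ Since the dataset is hosted on the Hugging Face Hub, a typical way to inspect
+it is via the 'datasets' library; the split name below is an assumption, and
+the available columns should be checked from the printed dataset info rather
+than taken from this sketch:
+
+from datasets import load_dataset
+
+ds = load_dataset("xcodemind/vision2ui", split="train")  # split name is an assumption
+print(ds)            # prints the number of rows and the column names
+print(ds[0].keys())  # inspect one design-vision / UI-code sample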
+
+
+
+
+ + ☆ Dynamic Resolution Guidance for Facial Expression Recognition + + +
+ Facial expression recognition (FER) is vital for human-computer interaction +and emotion analysis, yet recognizing expressions in low-resolution images +remains challenging. This paper introduces a practical method called Dynamic +Resolution Guidance for Facial Expression Recognition (DRGFER) to effectively +recognize facial expressions in images with varying resolutions without +compromising FER model accuracy. Our framework comprises two main components: +the Resolution Recognition Network (RRN) and the Multi-Resolution Adaptation +Facial Expression Recognition Network (MRAFER). The RRN determines image +resolution, outputs a binary vector, and the MRAFER assigns images to suitable +facial expression recognition networks based on resolution. We evaluated DRGFER +on widely-used datasets RAFDB and FERPlus, demonstrating that our method +retains optimal model performance at each resolution and outperforms +alternative resolution approaches. The proposed framework exhibits robustness +against resolution variations and facial expressions, offering a promising +solution for real-world applications. + +
+
+
+
+
+ + ☆ Test-Time Adaptation with SaLIP: A Cascade of SAM and CLIP for Zero shot + Medical Image Segmentation + + +
+ The Segment Anything Model (SAM) and CLIP are remarkable vision foundation
+models (VFMs). SAM, a prompt-driven segmentation model, excels in segmentation
+tasks across diverse domains, while CLIP is renowned for its zero-shot
+recognition capabilities. However, their unified potential has not yet been
+explored in medical image segmentation. To adapt SAM to medical imaging,
+existing methods primarily rely on tuning strategies that require extensive
+data or prior prompts tailored to the specific task, making it particularly
+challenging when only a limited number of data samples are available. This work
+presents an in-depth exploration of integrating SAM and CLIP into a unified
+framework for medical image segmentation. Specifically, we propose a simple
+unified framework, SaLIP, for organ segmentation. Initially, SAM is used for
+part-based segmentation within the image, followed by CLIP to retrieve the mask
+corresponding to the region of interest (ROI) from the pool of SAM-generated
+masks. Finally, SAM is prompted by the retrieved ROI to segment a specific
+organ. Thus, SaLIP is training- and fine-tuning-free and does not rely on
+domain expertise or labeled data for prompt engineering. Our method shows
+substantial enhancements in zero-shot segmentation, showcasing notable
+improvements in DICE scores across diverse segmentation tasks like brain
+(63.46%), lung (50.11%), and fetal head (30.82%), when compared to unprompted
+SAM. Code and text prompts will be available online.
+
+
+
+
+ + ☆ High Noise Scheduling is a Must + + +
+ Consistency models possess strong capabilities for image generation, reducing
+sampling to a single step through their advanced techniques. Recent
+advancements push consistency training techniques one step further and
+eliminate the limitation of distillation training. Even though the curriculum
+and noise scheduling proposed in improved training techniques yield better
+results than basic consistency models, they lack a well-balanced noise
+distribution and consistency with the curriculum. In this study, we investigate
+the balance between high and low noise levels in the noise distribution and
+propose a polynomial noise distribution to maintain stability. The proposed
+polynomial noise distribution is also supported by predefined Karras noise
+levels to prevent the unique noise levels that arise with the Karras noise
+generation algorithm. Furthermore, eliminating learned noisy steps with a
+curriculum based on a sinusoidal function increases the denoising performance
+of the model. To make a fair comparison with the latest released consistency
+model training techniques, experiments are conducted with the same
+hyper-parameters except for the curriculum and noise distribution. The models
+used in the experiments have low depth to demonstrate the robustness of our
+proposed technique. The results show that the polynomial noise distribution
+outperforms the model trained with a log-normal noise distribution, yielding a
+33.54 FID score after 100,000 training steps with constant discretization
+steps. Additionally, the implementation of a sinusoidal-based curriculum
+enhances denoising performance, resulting in a FID score of 30.48.
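+
+ The predefined Karras noise levels mentioned above follow a standard closed
+form (Karras et al., 2022). The polynomial weighting shown here is only an
+illustrative stand-in, since the abstract does not specify the exact
+polynomial:
+
+import numpy as np
+
+def karras_sigmas(n, sigma_min=0.002, sigma_max=80.0, rho=7.0):
+    """Karras noise levels, descending from sigma_max to sigma_min."""
+    ramp = np.linspace(0.0, 1.0, n)
+    return (sigma_max ** (1 / rho) + ramp * (sigma_min ** (1 / rho) - sigma_max ** (1 / rho))) ** rho
+
+def polynomial_level_probs(n, degree=2.0):
+    """Illustrative polynomial weighting over the n noise levels (assumed form)."""
+    w = (np.arange(1, n + 1) / n) ** degree
+    return w / w.sum()
+
+sigmas = karras_sigmas(40)
+probs = polynomial_level_probs(40)
+batch_sigmas = np.random.default_rng(0).choice(sigmas, size=8, p=probs)  # levels for one batch
+print(sigmas[:3], batch_sigmas[:3])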
+
+
+
+
+ + ☆ DaF-BEVSeg: Distortion-aware Fisheye Camera based Bird's Eye View + Segmentation with Occlusion Reasoning + + +
+ Semantic segmentation is an effective way to perform scene understanding.
+Recently, segmentation in 3D Bird's Eye View (BEV) space has become popular, as
+it is directly used by the driving policy. However, there is limited work on
+BEV segmentation for surround-view fisheye cameras, which are commonly used in
+commercial vehicles. As this task has no real-world public dataset and existing
+synthetic datasets do not handle amodal regions due to occlusion, we create a
+synthetic dataset using the Cognata simulator comprising diverse road types,
+weather, and lighting conditions. We generalize BEV segmentation to work with
+any camera model; this is useful for mixing diverse cameras. We implement a
+baseline by applying cylindrical rectification on the fisheye images and using
+a standard LSS-based BEV segmentation model. We demonstrate that we can achieve
+better performance without undistortion, which has the adverse effects of
+increased runtime due to pre-processing, reduced field-of-view, and resampling
+artifacts. Further, we introduce a distortion-aware learnable BEV pooling
+strategy that is more effective for fisheye cameras. We extend the model with
+an occlusion reasoning module, which is critical for estimation in BEV space.
+The qualitative performance of DaF-BEVSeg is showcased in the video at
+https://streamable.com/ge4v51.
+
+
+
+
+ + ☆ HPNet: Dynamic Trajectory Forecasting with Historical Prediction + Attention CVPR2024 + + +
+ Predicting the trajectories of road agents is essential for autonomous driving
+systems. Recent mainstream methods follow a static paradigm, which predicts the
+future trajectory using a fixed duration of historical frames. These methods
+make predictions independently even at adjacent time steps, which leads to
+potential instability and temporal inconsistency. As successive time steps have
+largely overlapping historical frames, their forecasts should have intrinsic
+correlation: overlapping predicted trajectories should be consistent, or be
+different but share the same motion goal, depending on the road situation.
+Motivated by this, we introduce HPNet, a novel dynamic trajectory forecasting
+method. Aiming for stable and accurate trajectory forecasting, our method
+leverages not only historical frames, including maps and agent states, but also
+historical predictions. Specifically, we design a Historical Prediction
+Attention module to automatically encode the dynamic relationship between
+successive predictions. Besides, it also extends the attention range beyond the
+currently visible window, benefitting from the use of historical predictions.
+The proposed Historical Prediction Attention, together with the Agent Attention
+and Mode Attention, is further formulated as the Triple Factorized Attention
+module, serving as the core design of HPNet. Experiments on the Argoverse and
+INTERACTION datasets show that HPNet achieves state-of-the-art performance and
+generates accurate and stable future trajectories. Our code is available at
+https://github.com/XiaolongTang23/HPNet.
+
+ comment: accepted by CVPR2024 +
+
+
+
+
+ + ☆ Rolling Shutter Correction with Intermediate Distortion Flow Estimation CVPR2024 + + +
+ This paper proposes to correct rolling shutter (RS) distorted images by
+estimating the distortion flow from the global shutter (GS) image to the RS
+image directly. Existing methods usually perform correction using the
+undistortion flow from the RS to the GS. They initially predict the flow from
+consecutive RS frames and subsequently rescale it as the displacement fields
+from the RS frame to the underlying GS image using time-dependent scaling
+factors. Following this, RS-aware forward warping is employed to convert the RS
+image into its GS counterpart. Nevertheless, this strategy is prone to two
+shortcomings. First, the undistortion flow estimation is rendered inaccurate by
+merely linearly scaling the flow, due to the complex non-linear nature of the
+motion. Second, RS-aware forward warping often results in unavoidable
+artifacts. To address these limitations, we introduce a new framework that
+directly estimates the distortion flow and rectifies the RS image with a
+backward warping operation. More specifically, we first propose a global
+correlation-based flow attention mechanism to estimate the initial distortion
+flow and the GS feature jointly, which are then refined by the following
+coarse-to-fine decoder layers. Additionally, a multi-distortion flow prediction
+strategy is integrated to further mitigate the issue of inaccurate flow
+estimation. Experimental results validate the effectiveness of the proposed
+method, which outperforms state-of-the-art approaches on various benchmarks
+while maintaining high efficiency. The project is available at
+\url{https://github.com/ljzycmd/DFRSC}.
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ Matching 2D Images in 3D: Metric Relative Pose from Metric + Correspondences + + +
+ Given two images, we can estimate the relative camera pose between them by +establishing image-to-image correspondences. Usually, correspondences are +2D-to-2D and the pose we estimate is defined only up to scale. Some +applications, aiming at instant augmented reality anywhere, require +scale-metric pose estimates, and hence, they rely on external depth estimators +to recover the scale. We present MicKey, a keypoint matching pipeline that is +able to predict metric correspondences in 3D camera space. By learning to match +3D coordinates across images, we are able to infer the metric relative pose +without depth measurements. Depth measurements are also not required for +training, nor are scene reconstructions or image overlap information. MicKey is +supervised only by pairs of images and their relative poses. MicKey achieves +state-of-the-art performance on the Map-Free Relocalisation benchmark while +requiring less supervision than competing approaches. + +
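+
+ Once metric 3D-3D correspondences are available, the relative pose follows in
+closed form from the classical Kabsch/Procrustes solution; the sketch below
+shows only that final step (the learned matching is the paper's contribution
+and is not reproduced here):
+
+import numpy as np
+
+def kabsch(P, Q):
+    """Rigid transform (R, t) with Q_i ~= R @ P_i + t for matched 3D points of shape (N, 3)."""
+    cp, cq = P.mean(0), Q.mean(0)
+    H = (P - cp).T @ (Q - cq)
+    U, _, Vt = np.linalg.svd(H)
+    d = np.sign(np.linalg.det(Vt.T @ U.T))   # guard against reflections
+    R = Vt.T @ np.diag([1.0, 1.0, d]) @ U.T
+    return R, cq - R @ cp
+
+rng = np.random.default_rng(1)
+P = rng.normal(size=(100, 3))
+R_true = np.linalg.qr(rng.normal(size=(3, 3)))[0]
+if np.linalg.det(R_true) < 0: R_true[:, 0] *= -1   # make it a proper rotation
+t_true = rng.normal(size=3)
+R_est, t_est = kabsch(P, P @ R_true.T + t_true)
+print(np.allclose(R_est, R_true), np.allclose(t_est, t_true))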
+
+
+
+
+ + ☆ Audio-Visual Generalized Zero-Shot Learning using Pre-Trained Large + Multi-Modal Models CVPR + + +
+ Audio-visual zero-shot learning methods commonly build on features extracted +from pre-trained models, e.g. video or audio classification models. However, +existing benchmarks predate the popularization of large multi-modal models, +such as CLIP and CLAP. In this work, we explore such large pre-trained models +to obtain features, i.e. CLIP for visual features, and CLAP for audio features. +Furthermore, the CLIP and CLAP text encoders provide class label embeddings +which are combined to boost the performance of the system. We propose a simple +yet effective model that only relies on feed-forward neural networks, +exploiting the strong generalization capabilities of the new audio, visual and +textual features. Our framework achieves state-of-the-art performance on +VGGSound-GZSL, UCF-GZSL, and ActivityNet-GZSL with our new features. Code and +data available at: https://github.com/dkurzend/ClipClap-GZSL. + +
+
+ comment: CVPRw 2024 (L3D-IVU) +
+
+
+
+
+ + ☆ Fortifying Fully Convolutional Generative Adversarial Networks for Image + Super-Resolution Using Divergence Measures + + +
+ Super-Resolution (SR) is a time-hallowed image processing problem that aims +to improve the quality of a Low-Resolution (LR) sample up to the standard of +its High-Resolution (HR) counterpart. We aim to address this by introducing +Super-Resolution Generator (SuRGe), a fully-convolutional Generative +Adversarial Network (GAN)-based architecture for SR. We show that distinct +convolutional features obtained at increasing depths of a GAN generator can be +optimally combined by a set of learnable convex weights to improve the quality +of generated SR samples. In the process, we employ the Jensen-Shannon and the +Gromov-Wasserstein losses respectively between the SR-HR and LR-SR pairs of +distributions to further aid the generator of SuRGe to better exploit the +available information in an attempt to improve SR. Moreover, we train the +discriminator of SuRGe with the Wasserstein loss with gradient penalty, to +primarily prevent mode collapse. The proposed SuRGe, as an end-to-end GAN +workflow tailor-made for super-resolution, offers improved performance while +maintaining low inference time. The efficacy of SuRGe is substantiated by its +superior performance compared to 18 state-of-the-art contenders on 10 benchmark +datasets. + +
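+
+ The "learnable convex weights" idea can be expressed as a softmax over
+per-branch scalars, so the combination weights stay positive and sum to one.
+This toy module is a guess at the general mechanism, not the authors' exact
+generator block:
+
+import torch
+import torch.nn as nn
+
+class ConvexFeatureFusion(nn.Module):
+    """Combine k same-shaped feature maps with weights constrained to the simplex."""
+    def __init__(self, k: int):
+        super().__init__()
+        self.logits = nn.Parameter(torch.zeros(k))
+
+    def forward(self, feats):                  # feats: list of k tensors of shape (B, C, H, W)
+        w = torch.softmax(self.logits, dim=0)  # convex weights: w_i >= 0 and sum_i w_i = 1
+        return sum(wi * f for wi, f in zip(w, feats))
+
+fuse = ConvexFeatureFusion(k=3)
+feats = [torch.randn(2, 64, 32, 32) for _ in range(3)]
+print(fuse(feats).shape)  # torch.Size([2, 64, 32, 32])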
+
+
+
+
+ + ☆ Counterfactual Reasoning for Multi-Label Image Classification via + Patching-Based Training + + +
+ The key to multi-label image classification (MLC) is to improve model +performance by leveraging label correlations. Unfortunately, it has been shown +that overemphasizing co-occurrence relationships can cause the overfitting +issue of the model, ultimately leading to performance degradation. In this +paper, we provide a causal inference framework to show that the correlative +features caused by the target object and its co-occurring objects can be +regarded as a mediator, which has both positive and negative impacts on model +predictions. On the positive side, the mediator enhances the recognition +performance of the model by capturing co-occurrence relationships; on the +negative side, it has the harmful causal effect that causes the model to make +an incorrect prediction for the target object, even when only co-occurring +objects are present in an image. To address this problem, we propose a +counterfactual reasoning method to measure the total direct effect, achieved by +enhancing the direct effect caused only by the target object. Due to the +unknown location of the target object, we propose patching-based training and +inference to accomplish this goal, which divides an image into multiple patches +and identifies the pivot patch that contains the target object. Experimental +results on multiple benchmark datasets with diverse configurations validate +that the proposed method can achieve state-of-the-art performance. + +
+
+
+
+
+ + ☆ NoiseNCA: Noisy Seed Improves Spatio-Temporal Continuity of Neural + Cellular Automata + + +
+ Neural Cellular Automata (NCA) is a class of Cellular Automata where the
+update rule is parameterized by a neural network that can be trained using
+gradient descent. In this paper, we focus on NCA models used for texture
+synthesis, where the update rule is inspired by partial differential equations
+(PDEs) describing reaction-diffusion systems. To train the NCA model, the
+spatio-temporal domain is discretized, and Euler integration is used to
+numerically simulate the PDE. However, whether a trained NCA truly learns the
+continuous dynamics described by the corresponding PDE or merely overfits the
+discretization used in training remains an open question. We study NCA models
+at the limit where space-time discretization approaches continuity. We find
+that existing NCA models tend to overfit the training discretization,
+especially in the proximity of the initial condition, also called the "seed".
+To address this, we propose a solution that utilizes uniform noise as the
+initial condition. We demonstrate the effectiveness of our approach in
+preserving the consistency of NCA dynamics across a wide range of
+spatio-temporal granularities. Our improved NCA model enables two new test-time
+interactions by allowing continuous control over the speed of pattern formation
+and the scale of the synthesized patterns. We demonstrate this new NCA feature
+in our interactive online demo. Our work reveals that NCA models can learn
+continuous dynamics and opens new avenues for NCA research from a dynamical
+systems perspective.
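+
+ A stripped-down NCA update with explicit Euler integration and a uniform-noise
+seed makes the discretization discussion concrete. Channel counts, perception
+filters, and the step size below are arbitrary choices, not the paper's
+configuration:
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class TinyNCA(nn.Module):
+    def __init__(self, channels: int = 12, hidden: int = 96):
+        super().__init__()
+        # Perception: identity + Sobel filters applied depthwise to every channel.
+        ident = torch.tensor([[0, 0, 0], [0, 1, 0], [0, 0, 0]], dtype=torch.float32)
+        sobel_x = torch.tensor([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype=torch.float32) / 8
+        kernels = torch.stack([ident, sobel_x, sobel_x.T])
+        self.register_buffer("kernels", kernels.repeat(channels, 1, 1)[:, None])  # (3C, 1, 3, 3)
+        self.channels = channels
+        self.update = nn.Sequential(nn.Conv2d(3 * channels, hidden, 1), nn.ReLU(),
+                                    nn.Conv2d(hidden, channels, 1))
+
+    def forward(self, state, dt: float = 0.5):
+        percep = F.conv2d(state, self.kernels, padding=1, groups=self.channels)
+        return state + dt * self.update(percep)  # one explicit Euler step of the learned PDE
+
+nca = TinyNCA()
+state = torch.rand(1, 12, 64, 64)  # uniform-noise seed instead of a single "alive" pixel
+for _ in range(32):
+    state = nca(state)
+print(state.shape)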
+
+ comment: 9 pages, 12 figures +
+
+
+
+
+ + ☆ Learning Embeddings with Centroid Triplet Loss for Object Identification + in Robotic Grasping + + +
+ Foundation models are a strong trend in deep learning and computer vision. +These models serve as a base for applications as they require minor or no +further fine-tuning by developers to integrate into their applications. +Foundation models for zero-shot object segmentation such as Segment Anything +(SAM) output segmentation masks from images without any further object +information. When they are followed in a pipeline by an object identification +model, they can perform object detection without training. Here, we focus on +training such an object identification model. A crucial practical aspect for an +object identification model is to be flexible in input size. As object +identification is an image retrieval problem, a suitable method should handle +multi-query multi-gallery situations without constraining the number of input +images (e.g. by having fixed-size aggregation layers). The key solution to +train such a model is the centroid triplet loss (CTL), which aggregates image +features to their centroids. CTL yields high accuracy, avoids misleading +training signals and keeps the model input size flexible. In our experiments, +we establish a new state of the art on the ArmBench object identification task, +which shows general applicability of our model. We furthermore demonstrate an +integrated unseen object detection pipeline on the challenging HOPE dataset, +which requires fine-grained detection. There, our pipeline matches and +surpasses related methods which have been trained on dataset-specific data. + +
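+
+ A compact sketch of a centroid triplet loss: embeddings are aggregated into
+per-class centroids, and a margin is enforced between each sample's own-class
+centroid and the hardest other-class centroid. The mining strategy and margin
+here are placeholders rather than the paper's training setup:
+
+import torch
+import torch.nn.functional as F
+
+def centroid_triplet_loss(embeddings, labels, margin: float = 0.3):
+    """embeddings: (N, D) L2-normalized features; labels: (N,) integer class ids."""
+    classes = labels.unique()
+    centroids = F.normalize(torch.stack([embeddings[labels == c].mean(0) for c in classes]), dim=-1)
+    dists = torch.cdist(embeddings, centroids)            # (N, num_classes)
+    is_pos = (labels[:, None] == classes[None, :]).float()
+    d_pos = (dists * is_pos).sum(1)                       # distance to own-class centroid
+    d_neg = (dists + is_pos * 1e6).min(1).values          # hardest other-class centroid
+    return F.relu(d_pos - d_neg + margin).mean()
+
+emb = F.normalize(torch.randn(16, 128), dim=-1)
+labels = torch.randint(0, 4, (16,))
+print(centroid_triplet_loss(emb, labels))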
+
+
+
+
+ + ☆ Robust Confidence Intervals in Stereo Matching using Possibility Theory + + +
+ We propose a method for estimating disparity confidence intervals in stereo +matching problems. Confidence intervals provide complementary information to +usual confidence measures. To the best of our knowledge, this is the first +method creating disparity confidence intervals based on the cost volume. This +method relies on possibility distributions to interpret the epistemic +uncertainty of the cost volume. Our method has the benefit of having a +white-box nature, differing in this respect from current state-of-the-art deep +neural networks approaches. The accuracy and size of confidence intervals are +validated using the Middlebury stereo datasets as well as a dataset of +satellite images. This contribution is freely available on GitHub. + +
+
+
+
+
+ + ☆ 3D Geometry-aware Deformable Gaussian Splatting for Dynamic View + Synthesis CVPR 2024 + + +
+ In this paper, we propose a 3D geometry-aware deformable Gaussian Splatting
+method for dynamic view synthesis. Existing neural radiance fields (NeRF) based
+solutions learn the deformation in an implicit manner, which cannot incorporate
+3D scene geometry. Therefore, the learned deformation is not necessarily
+geometrically coherent, which results in unsatisfactory dynamic view synthesis
+and 3D dynamic reconstruction. Recently, 3D Gaussian Splatting provides a new
+representation of the 3D scene, building upon which the 3D geometry could be
+exploited in learning the complex 3D deformation. Specifically, the scenes are
+represented as a collection of 3D Gaussians, where each 3D Gaussian is
+optimized to move and rotate over time to model the deformation. To enforce the
+3D scene geometry constraint during deformation, we explicitly extract 3D
+geometry features and integrate them in learning the 3D deformation. In this
+way, our solution achieves 3D geometry-aware deformation modeling, which
+enables improved dynamic view synthesis and 3D dynamic reconstruction.
+Extensive experimental results on both synthetic and real datasets prove the
+superiority of our solution, which achieves new state-of-the-art performance.
+ The project is available at https://npucvr.github.io/GaGS/
+
+ comment: Accepted by CVPR 2024. Project page: https://npucvr.github.io/GaGS/ +
+
+
+
+
+ + ☆ Spatial-Temporal Multi-level Association for Video Object Segmentation + + +
+ Existing semi-supervised video object segmentation methods either focus on +temporal feature matching or spatial-temporal feature modeling. However, they +do not address the issues of sufficient target interaction and efficient +parallel processing simultaneously, thereby constraining the learning of +dynamic, target-aware features. To tackle these limitations, this paper +proposes a spatial-temporal multi-level association framework, which jointly +associates reference frame, test frame, and object features to achieve +sufficient interaction and parallel target ID association with a +spatial-temporal memory bank for efficient video object segmentation. +Specifically, we construct a spatial-temporal multi-level feature association +module to learn better target-aware features, which formulates feature +extraction and interaction as the efficient operations of object +self-attention, reference object enhancement, and test reference correlation. +In addition, we propose a spatial-temporal memory to assist feature association +and temporal ID assignment and correlation. We evaluate the proposed method by +conducting extensive experiments on numerous video object segmentation +datasets, including DAVIS 2016/2017 val, DAVIS 2017 test-dev, and YouTube-VOS +2018/2019 val. The favorable performance against the state-of-the-art methods +demonstrates the effectiveness of our approach. All source code and trained +models will be made publicly available. + +
+
+
+
+
+ + ☆ Playing to Vision Foundation Model's Strengths in Stereo Matching + + +
+ Stereo matching has become a key technique for 3D environment perception in +intelligent vehicles. For a considerable time, convolutional neural networks +(CNNs) have remained the mainstream choice for feature extraction in this +domain. Nonetheless, there is a growing consensus that the existing paradigm +should evolve towards vision foundation models (VFM), particularly those +developed based on vision Transformers (ViTs) and pre-trained through +self-supervision on extensive, unlabeled datasets. While VFMs are adept at +extracting informative, general-purpose visual features, specifically for dense +prediction tasks, their performance often lacks in geometric vision tasks. This +study serves as the first exploration of a viable approach for adapting VFMs to +stereo matching. Our ViT adapter, referred to as ViTAS, is constructed upon +three types of modules: spatial differentiation, patch attention fusion, and +cross-attention. The first module initializes feature pyramids, while the +latter two aggregate stereo and multi-scale contextual information into +fine-grained features, respectively. ViTAStereo, which combines ViTAS with cost +volume-based stereo matching back-end processes, achieves the top rank on the +KITTI Stereo 2012 dataset and outperforms the second-best network StereoBase by +approximately 7.9% in terms of the percentage of error pixels, with a tolerance +of 3 pixels. Additional experiments across diverse scenarios further +demonstrate its superior generalizability compared to all other +state-of-the-art approaches. We believe this new paradigm will pave the way for +the next generation of stereo matching networks. + +
+
+
+
+
+ + ☆ Robust feature knowledge distillation for enhanced performance of + lightweight crack segmentation models + + +
+ Vision-based crack detection faces deployment challenges due to the size of +robust models and edge device limitations. These can be addressed with +lightweight models trained with knowledge distillation (KD). However, +state-of-the-art (SOTA) KD methods compromise anti-noise robustness. This paper +develops Robust Feature Knowledge Distillation (RFKD), a framework to improve +robustness while retaining the precision of light models for crack +segmentation. RFKD distils knowledge from a teacher model's logit layers and +intermediate feature maps while leveraging mixed clean and noisy images to +transfer robust patterns to the student model, improving its precision, +generalisation, and anti-noise performance. To validate the proposed RFKD, a +lightweight crack segmentation model, PoolingCrack Tiny (PCT), with only 0.5 M +parameters, is also designed and used as the student to run the framework. The +results show a significant enhancement in noisy images, with RFKD reaching a +62% enhanced mean Dice score (mDS) compared to SOTA KD methods. + +
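+
+ The distillation objective described above can be approximated as a
+logit-level KL term plus a feature-map MSE term, evaluated on clean and
+noise-corrupted copies of each batch. The weights, noise model, and toy
+networks below are placeholders, not the paper's RFKD configuration:
+
+import torch
+import torch.nn.functional as F
+
+def kd_loss_clean_and_noisy(student, teacher, images, alpha=0.5, beta=0.5, noise_std=0.1, tau=4.0):
+    """Distil teacher logits and intermediate features; both models return (logits, feature_map)."""
+    total = 0.0
+    for x in (images, images + noise_std * torch.randn_like(images)):
+        s_logits, s_feat = student(x)
+        with torch.no_grad():
+            t_logits, t_feat = teacher(x)
+        kd = F.kl_div(F.log_softmax(s_logits / tau, dim=1),
+                      F.softmax(t_logits / tau, dim=1), reduction="batchmean") * tau * tau
+        total = total + alpha * kd + beta * F.mse_loss(s_feat, t_feat)
+    return total / 2
+
+class ToyNet(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(3, 8, 3, padding=1)
+        self.head = torch.nn.Linear(8, 2)
+    def forward(self, x):
+        f = F.relu(self.conv(x))
+        return self.head(f.mean(dim=(2, 3))), f
+
+print(kd_loss_clean_and_noisy(ToyNet(), ToyNet(), torch.randn(4, 3, 32, 32)))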
+
+ comment: 24 pages, 13 figures +
+
+
+
+
+ + ☆ Label-Efficient 3D Object Detection For Road-Side Units + + +
+ Occlusion presents a significant challenge for safety-critical applications +such as autonomous driving. Collaborative perception has recently attracted a +large research interest thanks to the ability to enhance the perception of +autonomous vehicles via deep information fusion with intelligent roadside units +(RSU), thus minimizing the impact of occlusion. While significant advancement +has been made, the data-hungry nature of these methods creates a major hurdle +for their real-world deployment, particularly due to the need for annotated RSU +data. Manually annotating the vast amount of RSU data required for training is +prohibitively expensive, given the sheer number of intersections and the effort +involved in annotating point clouds. We address this challenge by devising a +label-efficient object detection method for RSU based on unsupervised object +discovery. Our paper introduces two new modules: one for object discovery based +on a spatial-temporal aggregation of point clouds, and another for refinement. +Furthermore, we demonstrate that fine-tuning on a small portion of annotated +data allows our object discovery models to narrow the performance gap with, or +even surpass, fully supervised models. Extensive experiments are carried out in +simulated and real-world datasets to evaluate our method. + +
+
+ comment: IV 2024 +
+
+
+
+
+ + ☆ From Barlow Twins to Triplet Training: Differentiating Dementia with + Limited Data + + +
+ Differential diagnosis of dementia is challenging due to overlapping +symptoms, with structural magnetic resonance imaging (MRI) being the primary +method for diagnosis. Despite the clinical value of computer-aided differential +diagnosis, research has been limited, mainly due to the absence of public +datasets that contain diverse types of dementia. This leaves researchers with +small in-house datasets that are insufficient for training deep neural networks +(DNNs). Self-supervised learning shows promise for utilizing unlabeled MRI +scans in training, but small batch sizes for volumetric brain scans make its +application challenging. To address these issues, we propose Triplet Training +for differential diagnosis with limited target data. It consists of three key +stages: (i) self-supervised pre-training on unlabeled data with Barlow Twins, +(ii) self-distillation on task-related data, and (iii) fine-tuning on the +target dataset. Our approach significantly outperforms traditional training +strategies, achieving a balanced accuracy of 75.6%. We further provide insights +into the training process by visualizing changes in the latent space after each +step. Finally, we validate the robustness of Triplet Training in terms of its +individual components in a comprehensive ablation study. Our code is available +at https://github.com/ai-med/TripletTraining. + +
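+
+ Stage (i) relies on the Barlow Twins objective, which drives the
+cross-correlation matrix of two augmented views' embeddings toward the
+identity. A minimal version of that loss is shown below with the standard
+lambda from the original Barlow Twins paper, not values tuned for MRI:
+
+import torch
+
+def barlow_twins_loss(z1, z2, lam: float = 5e-3):
+    """z1, z2: (N, D) embeddings of two augmentations of the same batch."""
+    n, _ = z1.shape
+    z1 = (z1 - z1.mean(0)) / (z1.std(0) + 1e-6)
+    z2 = (z2 - z2.mean(0)) / (z2.std(0) + 1e-6)
+    c = (z1.T @ z2) / n                                          # (D, D) cross-correlation
+    on_diag = (torch.diagonal(c) - 1).pow(2).sum()               # invariance term
+    off_diag = (c - torch.diag(torch.diagonal(c))).pow(2).sum()  # redundancy-reduction term
+    return on_diag + lam * off_diag
+
+print(barlow_twins_loss(torch.randn(32, 64), torch.randn(32, 64)))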
+
+ comment: Accepted for presentation at MIDL 2024 +
+
+
+
+
+ + ☆ ColorMNet: A Memory-based Deep Spatial-Temporal Feature Propagation + Network for Video Colorization + + +
+ How to effectively explore spatial-temporal features is important for video +colorization. Instead of stacking multiple frames along the temporal dimension +or recurrently propagating estimated features that will accumulate errors or +cannot explore information from far-apart frames, we develop a memory-based +feature propagation module that can establish reliable connections with +features from far-apart frames and alleviate the influence of inaccurately +estimated features. To extract better features from each frame for the +above-mentioned feature propagation, we explore the features from +large-pretrained visual models to guide the feature estimation of each frame so +that the estimated features can model complex scenarios. In addition, we note +that adjacent frames usually contain similar contents. To explore this property +for better spatial and temporal feature utilization, we develop a local +attention module to aggregate the features from adjacent frames in a +spatial-temporal neighborhood. We formulate our memory-based feature +propagation module, large-pretrained visual model guided feature estimation +module, and local attention module into an end-to-end trainable network (named +ColorMNet) and show that it performs favorably against state-of-the-art methods +on both the benchmark datasets and real-world scenarios. The source code and +pre-trained models will be available at +\url{https://github.com/yyang181/colormnet}. + +
+
+ comment: Project website: \url{https://github.com/yyang181/colormnet} +
+
+
+
+
+ + ☆ LRR: Language-Driven Resamplable Continuous Representation against + Adversarial Tracking Attacks + + +
+ Visual object tracking plays a critical role in visual-based autonomous +systems, as it aims to estimate the position and size of the object of interest +within a live video. Despite significant progress made in this field, +state-of-the-art (SOTA) trackers often fail when faced with adversarial +perturbations in the incoming frames. This can lead to significant robustness +and security issues when these trackers are deployed in the real world. To +achieve high accuracy on both clean and adversarial data, we propose building a +spatial-temporal continuous representation using the semantic text guidance of +the object of interest. This novel continuous representation enables us to +reconstruct incoming frames to maintain semantic and appearance consistency +with the object of interest and its clean counterparts. As a result, our +proposed method successfully defends against different SOTA adversarial +tracking attacks while maintaining high accuracy on clean data. In particular, +our method significantly increases tracking accuracy under adversarial attacks +with around 90% relative improvement on UAV123, which is even higher than the +accuracy on clean data. + +
+
+
+
+
+ + ☆ GHNeRF: Learning Generalizable Human Features with Efficient Neural + Radiance Fields + + +
+ Recent advances in Neural Radiance Fields (NeRF) have demonstrated promising
+results in 3D scene representations, including 3D human representations.
+However, these representations often lack crucial information on the underlying
+human pose and structure, which is essential for AR/VR applications and games.
+In this paper, we introduce a novel approach, termed GHNeRF, designed to
+address these limitations by learning the 2D/3D joint locations of human
+subjects with a NeRF representation. GHNeRF uses a pre-trained 2D encoder
+streamlined to extract essential human features from 2D images, which are then
+incorporated into the NeRF framework in order to encode human biomechanic
+features. This allows our network to simultaneously learn biomechanic features,
+such as joint locations, along with human geometry and texture. To assess the
+effectiveness of our method, we conduct a comprehensive comparison with
+state-of-the-art human NeRF techniques and joint estimation algorithms. Our
+results show that GHNeRF can achieve state-of-the-art results in near
+real-time.
+
+
+
+
+ + ☆ Anchor-based Robust Finetuning of Vision-Language Models CVPR2024 + + +
+ We aim to finetune a vision-language model without hurting its
+out-of-distribution (OOD) generalization. We address two types of OOD
+generalization, i.e., i) domain shift, such as natural to sketch images, and
+ii) zero-shot capability to recognize categories that were not contained in the
+finetune data. Arguably, the diminished OOD generalization after finetuning
+stems from the excessively simplified finetuning target, which only provides
+the class information, such as ``a photo of a [CLASS]''. This is distinct from
+the process by which CLIP was pretrained, where there is abundant text
+supervision with rich semantic information. Therefore, we propose to compensate
+for the finetuning process using auxiliary supervision with rich semantic
+information, which acts as anchors to preserve the OOD generalization.
+Specifically, two types of anchors are elaborated in our method, including i) a
+text-compensated anchor, which uses the images from the finetune set but
+enriches the text supervision from a pretrained captioner, and ii) an
+image-text-pair anchor, which is retrieved from a dataset similar to the
+pretraining data of CLIP according to the downstream task and associated with
+the original CLIP text with rich semantics. Those anchors are utilized as
+auxiliary semantic information to maintain the original feature space of CLIP,
+thereby preserving the OOD generalization capabilities. Comprehensive
+experiments demonstrate that our method achieves in-distribution performance
+akin to conventional finetuning while attaining new state-of-the-art results on
+domain shift and zero-shot learning benchmarks.
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ ActNetFormer: Transformer-ResNet Hybrid Method for Semi-Supervised + Action Recognition in Videos + + +
+ Human action or activity recognition in videos is a fundamental task in +computer vision with applications in surveillance and monitoring, self-driving +cars, sports analytics, human-robot interaction and many more. Traditional +supervised methods require large annotated datasets for training, which are +expensive and time-consuming to acquire. This work proposes a novel approach +using Cross-Architecture Pseudo-Labeling with contrastive learning for +semi-supervised action recognition. Our framework leverages both labeled and +unlabelled data to robustly learn action representations in videos, combining +pseudo-labeling with contrastive learning for effective learning from both +types of samples. We introduce a novel cross-architecture approach where 3D +Convolutional Neural Networks (3D CNNs) and video transformers (VIT) are +utilised to capture different aspects of action representations; hence we call +it ActNetFormer. The 3D CNNs excel at capturing spatial features and local +dependencies in the temporal domain, while VIT excels at capturing long-range +dependencies across frames. By integrating these complementary architectures +within the ActNetFormer framework, our approach can effectively capture both +local and global contextual information of an action. This comprehensive +representation learning enables the model to achieve better performance in +semi-supervised action recognition tasks by leveraging the strengths of each of +these architectures. Experimental results on standard action recognition +datasets demonstrate that our approach performs better than the existing +methods, achieving state-of-the-art performance with only a fraction of labeled +data. The official website of this work is available at: +https://github.com/rana2149/ActNetFormer. + +
+
+ comment: Submitted for peer review +
+
+
+
+
+ + ☆ Hyperparameter-Free Medical Image Synthesis for Sharing Data and + Improving Site-Specific Segmentation + + +
+ Sharing synthetic medical images is a promising alternative to sharing real +images that can improve patient privacy and data security. To get good results, +existing methods for medical image synthesis must be manually adjusted when +they are applied to unseen data. To remove this manual burden, we introduce a +Hyperparameter-Free distributed learning method for automatic medical image +Synthesis, Sharing, and Segmentation called HyFree-S3. For three diverse +segmentation settings (pelvic MRIs, lung X-rays, polyp photos), the use of +HyFree-S3 results in improved performance over training only with site-specific +data (in the majority of cases). The hyperparameter-free nature of the method +should make data synthesis and sharing easier, potentially leading to an +increase in the quantity of available data and consequently the quality of the +models trained that may ultimately be applied in the clinic. Our code is +available at https://github.com/AwesomeLemon/HyFree-S3 + +
+
+ comment: Accepted at MIDL 2024 +
+
+
+
+
+ + ☆ Automatic Defect Detection in Sewer Network Using Deep Learning Based + Object Detector + + +
+ Maintaining sewer systems in large cities is important, but also time- and
+ effort-consuming, because visual inspections are currently done manually. To
+ reduce the amount of aforementioned manual work, defects within sewer pipes
+ should be located and classified automatically. In the past, multiple works
+ have attempted to solve this problem using classical image processing, machine
+ learning, or a combination of those. However, each provided solution only
+ focuses on detecting a limited set of defect/structure types, such as fissure,
+ root, and/or connection. Furthermore, due to the use of hand-crafted features
+ and small training datasets, generalization is also problematic. In order to
+ overcome these deficits, a sizable dataset covering 14.7 km of various sewer
+ pipes was annotated by sewer maintenance experts in the scope of this work. On
+ top of that, an object detector (EfficientDet-D0) was trained for automatic
+ defect detection. From the results of several experiments, peculiar
+ characteristics of defects in the context of object detection, which greatly
+ affect the annotation and training process, are identified and discussed. In
+ the end, the final detector was able to detect 83% of defects in the test set;
+ out of the missing 17%, only 0.77% are very severe defects. This work provides
+ an example of applying deep learning-based object detection to an important but
+ quiet engineering field. It also gives some practical pointers on how to
+ annotate peculiar "objects", such as defects.
+
+
+
+
+
+ + ☆ OmniFusion Technical Report + + +
+ Last year, multimodal architectures served up a revolution in AI-based
+ approaches and solutions, extending the capabilities of large language models
+ (LLMs). We propose \textit{OmniFusion}, a model based on a pretrained LLM and
+ adapters for the visual modality. We evaluated and compared several
+ architecture design principles for better text and visual data coupling: MLP
+ and transformer adapters, various CLIP ViT-based encoders (SigLIP, InternVIT,
+ etc.) and their fusion approach, the image encoding method (whole-image or tile
+ encoding), and two 7B LLMs (a proprietary one and the open-source Mistral).
+ Experiments on 8 visual-language benchmarks show the top score for the best
+ OmniFusion setup across different VQA tasks in comparison with open-source
+ LLaVA-like solutions: VizWiz, Pope, MM-Vet, ScienceQA, MMBench, TextVQA, VQAv2,
+ MMMU. We also present a variety of situations where OmniFusion provides
+ highly detailed answers in different domains: housekeeping, sightseeing,
+ culture, medicine, handwritten and scanned equation recognition, etc. The
+ Mistral-based OmniFusion model is an open-source solution with weights,
+ training and inference scripts available at
+ https://github.com/AIRI-Institute/OmniFusion.
+
+
+ comment: 17 pages, 4 figures, 9 tables, 2 appendices +
+
+
+
+
+ + ☆ Unified Physical-Digital Attack Detection Challenge + + +
+ Face Anti-Spoofing (FAS) is crucial to safeguard Face Recognition (FR)
+ systems. In real-world scenarios, FR systems are confronted with both physical
+ and digital attacks. However, existing algorithms often address only one type
+ of attack at a time, which poses significant limitations in real-world
+ scenarios where FR systems face hybrid physical-digital threats. To facilitate
+ the research of Unified Attack Detection (UAD) algorithms, a large-scale
+ UniAttackData dataset has been collected. UniAttackData is the largest public
+ dataset for Unified Attack Detection, with a total of 28,706 videos, where each
+ unique identity encompasses all advanced attack types. Based on this dataset,
+ we organized a Unified Physical-Digital Face Attack Detection Challenge to
+ boost research in Unified Attack Detection. It attracted 136 teams for the
+ development phase, with 13 qualifying for the final round. The results
+ re-verified by the organizing team were used for the final ranking. This paper
+ comprehensively reviews the challenge, detailing the dataset introduction,
+ protocol definition, evaluation criteria, and a summary of published results.
+ Finally, we focus on the detailed analysis of the highest-performing algorithms
+ and offer potential directions for unified physical-digital attack detection
+ inspired by this competition. Challenge Website:
+ https://sites.google.com/view/face-anti-spoofing-challenge/welcome/challengecvpr2024.
+
+
+ comment: 11 pages, 10 figures +
+
+
+
+
+ + ☆ Leveraging edge detection and neural networks for better UAV + localization + + +
+ We propose a novel method for geolocalizing Unmanned Aerial Vehicles (UAVs) +in environments lacking Global Navigation Satellite Systems (GNSS). Current +state-of-the-art techniques employ an offline-trained encoder to generate a +vector representation (embedding) of the UAV's current view, which is then +compared with pre-computed embeddings of geo-referenced images to determine the +UAV's position. Here, we demonstrate that the performance of these methods can +be significantly enhanced by preprocessing the images to extract their edges, +which exhibit robustness to seasonal and illumination variations. Furthermore, +we establish that utilizing edges enhances resilience to orientation and +altitude inaccuracies. Additionally, we introduce a confidence criterion for +localization. Our findings are substantiated through synthetic experiments. + +
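+
+ A minimal sketch of the edge-preprocessing and embedding-matching pipeline
+ described above (illustrative only: the encoder callable, Canny thresholds,
+ and confidence cutoff are assumptions, not values from the paper):
+
+ import cv2
+ import numpy as np
+
+ def edge_embedding(image_bgr, encoder):
+     # Expect an 8-bit BGR image; extract edges, then embed the edge map with
+     # a user-supplied encoder (a stand-in for an offline-trained encoder).
+     gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
+     edges = cv2.Canny(gray, 100, 200)  # illustrative thresholds
+     return encoder(edges.astype(np.float32) / 255.0)
+
+ def localize(query_emb, ref_embs, ref_coords, min_sim=0.6):
+     # Compare the query against pre-computed geo-referenced embeddings by
+     # cosine similarity; also report a simple confidence flag.
+     q = query_emb / (np.linalg.norm(query_emb) + 1e-8)
+     R = ref_embs / (np.linalg.norm(ref_embs, axis=1, keepdims=True) + 1e-8)
+     sims = R @ q
+     best = int(np.argmax(sims))
+     return ref_coords[best], float(sims[best]), bool(sims[best] >= min_sim)
+
+ # Toy usage with a random stand-in frame and a trivial stand-in encoder.
+ img = (np.random.rand(256, 256, 3) * 255).astype(np.uint8)
+ emb = edge_embedding(img, lambda e: e.mean(axis=0))
+ refs = np.random.rand(10, 256).astype(np.float32)
+ print(localize(emb, refs, [(i, i) for i in range(10)]))
+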
+
+ comment: Accepted for publication in IGARSS2024. 4 pages, 3 figures, 3 tables +
+
+
+
+
+ + ☆ Automated National Urban Map Extraction + + +
+ Developing countries usually lack the proper governance means to generate and +regularly update a national rooftop map. Using traditional photogrammetry and +surveying methods to produce a building map at the federal level is costly and +time consuming. Using earth observation and deep learning methods, we can +bridge this gap and propose an automated pipeline to fetch such national urban +maps. This paper aims to exploit the power of fully convolutional neural +networks for multi-class buildings' instance segmentation to leverage high +object-wise accuracy results. Buildings' instance segmentation from sub-meter +high-resolution satellite images can be achieved with relatively high +pixel-wise metric scores. We detail all engineering steps to replicate this +work and ensure highly accurate results in dense and slum areas witnessed in +regions that lack proper urban planning in the Global South. We applied a case +study of the proposed pipeline to Lebanon and successfully produced the first +comprehensive national building footprint map with approximately 1 Million +units with an 84% accuracy. The proposed architecture relies on advanced +augmentation techniques to overcome dataset scarcity, which is often the case +in developing countries. + +
+
+
+
+
+ + ☆ Exploring the Potential of Large Foundation Models for Open-Vocabulary + HOI Detection + + +
+ Open-vocabulary human-object interaction (HOI) detection, which is concerned +with the problem of detecting novel HOIs guided by natural language, is crucial +for understanding human-centric scenes. However, prior zero-shot HOI detectors +often employ the same levels of feature maps to model HOIs with varying +distances, leading to suboptimal performance in scenes containing human-object +pairs with a wide range of distances. In addition, these detectors primarily +rely on category names and overlook the rich contextual information that +language can provide, which is essential for capturing open vocabulary concepts +that are typically rare and not well-represented by category names alone. In +this paper, we introduce a novel end-to-end open vocabulary HOI detection +framework with conditional multi-level decoding and fine-grained semantic +enhancement (CMD-SE), harnessing the potential of Visual-Language Models +(VLMs). Specifically, we propose to model human-object pairs with different +distances with different levels of feature maps by incorporating a soft +constraint during the bipartite matching process. Furthermore, by leveraging +large language models (LLMs) such as GPT models, we exploit their extensive +world knowledge to generate descriptions of human body part states for various +interactions. Then we integrate the generalizable and fine-grained semantics of +human body parts to improve interaction recognition. Experimental results on +two datasets, SWIG-HOI and HICO-DET, demonstrate that our proposed method +achieves state-of-the-art results in open vocabulary HOI detection. The code +and models are available at https://github.com/ltttpku/CMD-SE-release. + +
+
+
+
+
+ + ☆ EPL: Evidential Prototype Learning for Semi-supervised Medical Image + Segmentation + + +
+ Although current semi-supervised medical segmentation methods can achieve
+ decent performance, they are still affected by the uncertainty in unlabeled
+ data and model predictions, and there is currently a lack of effective
+ strategies that can explore the uncertain aspects of both simultaneously. To
+ address the aforementioned issues, we propose Evidential Prototype Learning
+ (EPL), which utilizes an extended probabilistic framework to effectively fuse
+ voxel probability predictions from different sources and achieves prototype
+ fusion utilization of labeled and unlabeled data under a generalized evidential
+ framework, leveraging voxel-level dual uncertainty masking. The uncertainty not
+ only enables the model to self-correct predictions but also improves the guided
+ learning process with pseudo-labels and is able to feed back into the
+ construction of hidden features. The proposed method has been evaluated on the
+ LA, Pancreas-CT, and TBAD datasets, achieving state-of-the-art performance
+ under three different labeled ratios, which strongly demonstrates the
+ effectiveness of our strategy.
+
+
+
+
+
+ + ☆ YOLC: You Only Look Clusters for Tiny Object Detection in Aerial Images + + +
+ Detecting objects from aerial images poses significant challenges due to the +following factors: 1) Aerial images typically have very large sizes, generally +with millions or even hundreds of millions of pixels, while computational +resources are limited. 2) Small object size leads to insufficient information +for effective detection. 3) Non-uniform object distribution leads to +computational resource wastage. To address these issues, we propose YOLC (You +Only Look Clusters), an efficient and effective framework that builds on an +anchor-free object detector, CenterNet. To overcome the challenges posed by +large-scale images and non-uniform object distribution, we introduce a Local +Scale Module (LSM) that adaptively searches cluster regions for zooming in for +accurate detection. Additionally, we modify the regression loss using Gaussian +Wasserstein distance (GWD) to obtain high-quality bounding boxes. Deformable +convolution and refinement methods are employed in the detection head to +enhance the detection of small objects. We perform extensive experiments on two +aerial image datasets, including Visdrone2019 and UAVDT, to demonstrate the +effectiveness and superiority of our proposed approach. + +
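+
+ As a rough illustration of the Gaussian Wasserstein distance regression term
+ mentioned above (a sketch only: axis-aligned boxes are modeled as 2D
+ Gaussians, and the normalization below follows a common GWD-loss form rather
+ than YOLC's exact choice):
+
+ import numpy as np
+
+ def gwd2_axis_aligned(box1, box2):
+     # Squared 2-Wasserstein distance between two axis-aligned boxes
+     # (cx, cy, w, h), each viewed as N((cx, cy), diag((w/2)^2, (h/2)^2)).
+     # With diagonal covariances the distance has this closed form.
+     cx1, cy1, w1, h1 = box1
+     cx2, cy2, w2, h2 = box2
+     center_term = (cx1 - cx2) ** 2 + (cy1 - cy2) ** 2
+     shape_term = (w1 / 2 - w2 / 2) ** 2 + (h1 / 2 - h2 / 2) ** 2
+     return center_term + shape_term
+
+ def gwd_loss(box_pred, box_gt, tau=1.0):
+     # Map the unbounded distance to a bounded loss (a common normalization).
+     d = np.sqrt(gwd2_axis_aligned(box_pred, box_gt))
+     return 1.0 - 1.0 / (tau + np.log1p(d))
+
+ print(gwd_loss((10.0, 10.0, 4.0, 6.0), (11.0, 9.0, 4.0, 5.0)))
+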
+
+ comment: accepted to TITS +
+
+
+
+
+ + ☆ Uncertainty-aware Evidential Fusion-based Learning for Semi-supervised + Medical Image Segmentation + + +
+ Although the existing uncertainty-based semi-supervised medical segmentation
+ methods have achieved excellent performance, they usually only consider a
+ single uncertainty evaluation, which often fails to completely solve the
+ problem of credibility. Therefore, based on the framework of evidential deep
+ learning, this paper integrates the evidential predictive results in the
+ cross-region of mixed and original samples to reallocate the confidence degree
+ and uncertainty measure of each voxel, which is realized by emphasizing
+ uncertain information in the probability-assignment fusion rule of traditional
+ evidence theory. Furthermore, we design a voxel-level asymptotic learning
+ strategy that combines information entropy with the fused uncertainty measure
+ to estimate voxel predictions more precisely. During learning, the model
+ gradually pays more attention to predictions with high uncertainty, so as to
+ learn the features that are difficult to master. The experimental results on
+ the LA, Pancreas-CT, ACDC, and TBAD datasets demonstrate the superior
+ performance of our proposed method in comparison with existing
+ state-of-the-art methods.
+
+
+
+
+
+ + ☆ Improving Interpretable Embeddings for Ad-hoc Video Search with + Generative Captions and Multi-word Concept Bank ICMR2024 + + +
+ Aligning a user query with video clips in a cross-modal latent space, and
+ aligning it with semantic concepts, are two mainstream approaches for ad-hoc
+ video search (AVS). However, the effectiveness of existing approaches is
+ bottlenecked by the small sizes of available video-text datasets and the low
+ quality of concept banks, which results in failures on unseen queries and the
+ out-of-vocabulary problem. This paper addresses these two problems by
+ constructing a new dataset and developing a multi-word concept bank.
+ Specifically, capitalizing on a generative model, we construct a new dataset
+ consisting of 7 million generated text and video pairs for pre-training. To
+ tackle the out-of-vocabulary problem, we develop a multi-word concept bank
+ based on syntax analysis to enhance the capability of a state-of-the-art
+ interpretable AVS method in modeling relationships between query words. We also
+ study the impact of current advanced features on the method. Experimental
+ results show that the integration of the above-proposed elements doubles the
+ R@1 performance of the AVS method on the MSRVTT dataset and improves the xinfAP
+ on the TRECVid AVS query sets for 2016-2023 (eight years) by a margin of 2% to
+ 77%, with an average of about 20%.
+
+
+ comment: Accepted in ICMR2024 +
+
+
+
+
+ + ☆ Enhanced Radar Perception via Multi-Task Learning: Towards Refined Data + for Sensor Fusion Applications + + +
+ Radar and camera fusion yields robustness in perception tasks by leveraging +the strength of both sensors. The typical extracted radar point cloud is 2D +without height information due to insufficient antennas along the elevation +axis, which challenges the network performance. This work introduces a +learning-based approach to infer the height of radar points associated with 3D +objects. A novel robust regression loss is introduced to address the sparse +target challenge. In addition, a multi-task training strategy is employed, +emphasizing important features. The average radar absolute height error +decreases from 1.69 to 0.25 meters compared to the state-of-the-art height +extension method. The estimated target height values are used to preprocess and +enrich radar data for downstream perception tasks. Integrating this refined +radar information further enhances the performance of existing radar camera +fusion models for object detection and depth estimation tasks. + +
+
+ comment: Accepted by IEEE Intelligent Vehicles Symposium (IV 2024) +
+
+
+
+
+ + ☆ Efficient and Robust Point Cloud Registration via Heuristics-guided + Parameter Search + + +
+ Estimating the rigid transformation with 6 degrees of freedom based on a
+ putative 3D correspondence set is a crucial procedure in point cloud
+ registration. Existing correspondence identification methods usually lead to
+ large outlier ratios ($>$ 95 $\%$ is common), underscoring the significance of
+ robust registration methods. Many researchers turn to parameter search-based
+ strategies (e.g., Branch-and-Bound) for robust registration. Although related
+ methods show high robustness, their efficiency is limited by the
+ high-dimensional search space. This paper proposes a heuristics-guided
+ parameter search strategy to accelerate the search while maintaining high
+ robustness. We first sample some correspondences (i.e., heuristics) and then
+ only need to sequentially search the feasible regions that make each sample an
+ inlier. Our strategy largely reduces the search space and can guarantee
+ accuracy with only a few inlier samples, therefore enjoying an excellent
+ trade-off between efficiency and robustness. Since directly parameterizing the
+ 6-dimensional nonlinear feasible region for efficient search is intractable, we
+ construct a three-stage decomposition pipeline to reparameterize the feasible
+ region, resulting in three lower-dimensional sub-problems that are easily
+ solvable via our strategy. Besides reducing the search dimension, our
+ decomposition enables leveraging 1-dimensional interval stabbing at all three
+ stages to accelerate the search. Moreover, we propose a valid sampling strategy
+ to guarantee our sampling effectiveness, and a compatibility verification setup
+ to further accelerate our search. Extensive experiments on both simulated and
+ real-world datasets demonstrate that our approach exhibits comparable
+ robustness with state-of-the-art methods while achieving a significant
+ efficiency boost.
+
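+
+ The 1-dimensional interval stabbing subroutine leveraged above can be
+ sketched as follows (a generic implementation; the paper's three-stage
+ decomposition and compatibility checks are not reproduced here):
+
+ def interval_stabbing(intervals):
+     # Return a point covered by the maximum number of 1D intervals and that
+     # count, via a sort-and-sweep over endpoints (O(n log n)).
+     events = []
+     for lo, hi in intervals:
+         events.append((lo, +1))   # interval opens
+         events.append((hi, -1))   # interval closes
+     # Process openings before closings at equal coordinates so touching
+     # intervals count as overlapping.
+     events.sort(key=lambda e: (e[0], -e[1]))
+     best_point, best_count, count = None, 0, 0
+     for x, delta in events:
+         count += delta
+         if count > best_count:
+             best_count, best_point = count, x
+     return best_point, best_count
+
+ print(interval_stabbing([(0.0, 2.0), (1.0, 3.0), (2.5, 4.0)]))  # (1.0, 2)
+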
+
+ comment: 21 pages, 16 figures. Accepted to IEEE Transactions on Pattern + Analysis and Machine Intelligence, 2024 +
+
+
+
+
+ + ☆ Concise Plane Arrangements for Low-Poly Surface and Volume Modelling + + +
+ Plane arrangements are a useful tool for surface and volume modelling. +However, their main drawback is poor scalability. We introduce two key +novelties that enable the construction of plane arrangements for complex +objects and entire scenes: an ordering scheme for the plane insertion and the +direct use of input points during arrangement construction. Both ingredients +reduce the number of unwanted splits, resulting in improved scalability of the +construction mechanism by up to two orders of magnitude compared to existing +algorithms. We further introduce a remeshing and simplification technique that +allows us to extract low-polygon surface meshes and lightweight convex +decompositions of volumes from the arrangement. We show that our approach leads +to state-of-the-art results for the aforementioned tasks by comparing it to +learning-based and traditional approaches on various different datasets. Our +implementation is available at https://github.com/raphaelsulzer/compod . + +
+
+
+
+
+ + ☆ HFNeRF: Learning Human Biomechanic Features with Neural Radiance Fields + + +
+ In recent advancements in novel view synthesis, generalizable Neural Radiance
+ Fields (NeRF) based methods applied to human subjects have shown remarkable
+ results in generating novel views from few images. However, this generalization
+ ability cannot capture the underlying structural features of the skeleton
+ shared across all instances. Building upon this, we introduce HFNeRF: a novel
+ generalizable human feature NeRF aimed at generating human biomechanic features
+ using a pre-trained image encoder. While previous human NeRF methods have shown
+ promising results in the generation of photorealistic virtual avatars, such
+ methods lack underlying human structure or biomechanic features such as
+ skeleton or joint information that are crucial for downstream applications
+ including Augmented Reality (AR)/Virtual Reality (VR). HFNeRF leverages 2D
+ pre-trained foundation models to learn human features in 3D using neural
+ rendering, and then uses volume rendering to generate 2D feature maps. We
+ evaluate HFNeRF in the skeleton estimation task by predicting heatmaps as
+ features. The proposed method is fully differentiable, allowing it to learn
+ color, geometry, and the human skeleton simultaneously. This paper presents
+ preliminary results of HFNeRF, illustrating its potential in generating
+ realistic virtual avatars with biomechanic features using NeRF.
+
+
+
+
+
+ + ☆ DiffHarmony: Latent Diffusion Model Meets Image Harmonization ICMR 2024 + + +
+ Image harmonization, which involves adjusting the foreground of a composite
+ image to attain a unified visual consistency with the background, can be
+ conceptualized as an image-to-image translation task. Diffusion models have
+ recently promoted the rapid development of image-to-image translation tasks.
+ However, training diffusion models from scratch is computationally intensive.
+ Fine-tuning pre-trained latent diffusion models entails dealing with the
+ reconstruction error induced by the image compression autoencoder, making it
+ unsuitable for image generation tasks that involve pixel-level evaluation
+ metrics. To deal with these issues, in this paper, we first adapt a pre-trained
+ latent diffusion model to the image harmonization task to generate the
+ harmonious but potentially blurry initial images. Then we implement two
+ strategies: utilizing higher-resolution images during inference and
+ incorporating an additional refinement stage, to further enhance the clarity of
+ the initially harmonized images. Extensive experiments on iHarmony4 datasets
+ demonstrate the superiority of our proposed method. The code and model will be
+ made publicly available at https://github.com/nicecv/DiffHarmony.
+
+
+ comment: Accepted by ICMR 2024 +
+
+
+
+
+ + ☆ Mansformer: Efficient Transformer of Mixed Attention for Image + Deblurring and Beyond + + +
+ Transformers have achieved enormous success in natural language processing and
+ high-level vision over the past few years. However, the complexity of
+ self-attention is quadratic to the image size, which makes it infeasible for
+ high-resolution vision tasks. In this paper, we propose the Mansformer, a
+ Transformer of mixed attention that combines multiple self-attentions, gate,
+ and multi-layer perceptrons (MLPs), to explore and employ more possibilities of
+ self-attention. Taking efficiency into account, we design four kinds of
+ self-attention, whose complexities are all linear. By elaborately adjusting the
+ tensor shapes and dimensions for the dot product, we split the typical
+ self-attention of quadratic complexity into four operations of linear
+ complexity. To adaptively merge these different kinds of self-attention, we
+ take advantage of an architecture similar to Squeeze-and-Excitation Networks.
+ Furthermore, we merge the two-stage Transformer design into a single stage via
+ the proposed gated-dconv MLP. Image deblurring is our main target, while
+ extensive quantitative and qualitative evaluations show that this method
+ performs favorably against state-of-the-art methods on far more than simple
+ deblurring. The source codes and trained models will be made available to the
+ public.
+
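+
+ A toy sketch of how reassociating the attention dot product yields linear
+ complexity in sequence length (a generic kernelized-attention example with an
+ assumed ReLU feature map, not the paper's four specific self-attention
+ variants):
+
+ import numpy as np
+
+ def linear_attention(Q, K, V, eps=1e-6):
+     # Compute phi(Q) @ (phi(K)^T V) with a per-row normalizer, which costs
+     # O(n d^2) instead of the O(n^2 d) of softmax(Q K^T) V.
+     phi = lambda x: np.maximum(x, 0.0) + eps   # simple positive feature map
+     Qp, Kp = phi(Q), phi(K)
+     kv = Kp.T @ V                              # (d, d_v)
+     z = Qp @ Kp.sum(axis=0, keepdims=True).T   # (n, 1) normalizer
+     return (Qp @ kv) / z
+
+ n, d = 1024, 64
+ Q, K, V = (np.random.randn(n, d) for _ in range(3))
+ print(linear_attention(Q, K, V).shape)  # (1024, 64)
+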
+
+
+
+
+ + ☆ Gaussian Pancakes: Geometrically-Regularized 3D Gaussian Splatting for + Realistic Endoscopic Reconstruction + + +
+ Within colorectal cancer diagnostics, conventional colonoscopy techniques
+ face critical limitations, including a limited field of view and a lack of
+ depth information, which can impede the detection of precancerous lesions.
+ Current methods struggle to provide comprehensive and accurate 3D
+ reconstructions of the colonic surface which can help minimize the missing
+ regions and reinspection for pre-cancerous polyps. Addressing this, we
+ introduce 'Gaussian Pancakes', a method that leverages 3D Gaussian Splatting
+ (3D GS) combined with a Recurrent Neural Network-based Simultaneous
+ Localization and Mapping (RNNSLAM) system. By introducing geometric and depth
+ regularization into the 3D GS framework, our approach ensures more accurate
+ alignment of Gaussians with the colon surface, resulting in smoother 3D
+ reconstructions with novel viewing of detailed textures and structures.
+ Evaluations across three diverse datasets show that Gaussian Pancakes enhances
+ novel view synthesis quality, surpassing current leading methods with an 18%
+ boost in PSNR and a 16% improvement in SSIM. It also delivers over 100X faster
+ rendering and more than 10X shorter training times, making it a practical tool
+ for real-time applications. Hence, this holds promise for achieving clinical
+ translation for better detection and diagnosis of colorectal cancer.
+
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ Hierarchical Insights: Exploiting Structural Similarities for Reliable + 3D Semantic Segmentation IROS 2024 + + +
+ Safety-critical applications like autonomous driving call for robust 3D
+ environment perception algorithms which can withstand highly diverse and
+ ambiguous surroundings. The predictive performance of any classification model
+ strongly depends on the underlying dataset and the prior knowledge conveyed by
+ the annotated labels. While the labels provide a basis for the learning
+ process, they usually fail to represent inherent relations between the classes
+ - representations which are a natural element of the human perception system.
+ We propose a training strategy which enables a 3D LiDAR semantic segmentation
+ model to learn structural relationships between the different classes through
+ abstraction. We achieve this by implicitly modeling those relationships through
+ a learning rule for hierarchical multi-label classification (HMC). With a
+ detailed analysis we show how this training strategy not only improves the
+ model's confidence calibration, but also preserves additional information for
+ downstream tasks like fusion, prediction and planning.
+
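+
+ A minimal sketch of hierarchical multi-label classification in this spirit
+ (the toy class hierarchy and the plain binary cross-entropy learning rule
+ below are assumptions for illustration, not the paper's exact formulation):
+
+ import torch
+ import torch.nn.functional as F
+
+ # Toy hierarchy: leaf -> parent (e.g., "car" and "truck" -> "vehicle").
+ PARENT = {"car": "vehicle", "truck": "vehicle", "person": "dynamic",
+           "vehicle": "dynamic", "dynamic": None}
+ NODES = list(PARENT)
+
+ def hmc_targets(leaf_label):
+     # Multi-hot target: the leaf class and all of its ancestors are positive.
+     t = torch.zeros(len(NODES))
+     node = leaf_label
+     while node is not None:
+         t[NODES.index(node)] = 1.0
+         node = PARENT[node]
+     return t
+
+ def hmc_loss(logits, leaf_labels):
+     # Binary cross-entropy over every node of the hierarchy.
+     targets = torch.stack([hmc_targets(l) for l in leaf_labels])
+     return F.binary_cross_entropy_with_logits(logits, targets)
+
+ logits = torch.randn(4, len(NODES))
+ print(hmc_loss(logits, ["car", "truck", "person", "car"]))
+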
+
+ comment: submitted to IROS 2024 +
+
+
+
+
+ + ☆ DreamView: Injecting View-specific Text Guidance into Text-to-3D + Generation + + +
+ Text-to-3D generation, which synthesizes 3D assets according to an overall +text description, has significantly progressed. However, a challenge arises +when the specific appearances need customizing at designated viewpoints but +referring solely to the overall description for generating 3D objects. For +instance, ambiguity easily occurs when producing a T-shirt with distinct +patterns on its front and back using a single overall text guidance. In this +work, we propose DreamView, a text-to-image approach enabling multi-view +customization while maintaining overall consistency by adaptively injecting the +view-specific and overall text guidance through a collaborative text guidance +injection module, which can also be lifted to 3D generation via score +distillation sampling. DreamView is trained with large-scale rendered +multi-view images and their corresponding view-specific texts to learn to +balance the separate content manipulation in each view and the global +consistency of the overall object, resulting in a dual achievement of +customization and consistency. Consequently, DreamView empowers artists to +design 3D objects creatively, fostering the creation of more innovative and +diverse 3D assets. Code and model will be released at +https://github.com/iSEE-Laboratory/DreamView. + +
+
+
+
+
+ + ☆ Revising Densification in Gaussian Splatting + + +
+ In this paper, we address the limitations of Adaptive Density Control (ADC) +in 3D Gaussian Splatting (3DGS), a scene representation method achieving +high-quality, photorealistic results for novel view synthesis. ADC has been +introduced for automatic 3D point primitive management, controlling +densification and pruning, however, with certain limitations in the +densification logic. Our main contribution is a more principled, pixel-error +driven formulation for density control in 3DGS, leveraging an auxiliary, +per-pixel error function as the criterion for densification. We further +introduce a mechanism to control the total number of primitives generated per +scene and correct a bias in the current opacity handling strategy of ADC during +cloning operations. Our approach leads to consistent quality improvements +across a variety of benchmark scenes, without sacrificing the method's +efficiency. + +
+
+
+
+
+ + ☆ Hash3D: Training-free Acceleration for 3D Generation + + +
+ The evolution of 3D generative modeling has been notably propelled by the
+ adoption of 2D diffusion models. Despite this progress, the cumbersome
+ optimization process per se presents a critical hurdle to efficiency. In this
+ paper, we introduce Hash3D, a universal acceleration for 3D generation without
+ model training. Central to Hash3D is the insight that feature-map redundancy is
+ prevalent in images rendered from camera positions and diffusion time-steps in
+ close proximity. By effectively hashing and reusing these feature maps across
+ neighboring timesteps and camera angles, Hash3D substantially prevents
+ redundant calculations, thus accelerating the diffusion model's inference in 3D
+ generation tasks. We achieve this through an adaptive grid-based hashing.
+ Surprisingly, this feature-sharing mechanism not only speeds up the generation
+ but also enhances the smoothness and view consistency of the synthesized 3D
+ objects. Our experiments covering 5 text-to-3D and 3 image-to-3D models
+ demonstrate Hash3D's versatility in speeding up optimization, enhancing
+ efficiency by 1.3 to 4 times. Additionally, Hash3D's integration with 3D
+ Gaussian splatting largely speeds up 3D model creation, reducing text-to-3D
+ processing to about 10 minutes and image-to-3D conversion to roughly 30
+ seconds. The project page is at https://adamdad.github.io/hash3D/.
+
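+
+ A minimal sketch of grid-based hashing for feature reuse in this spirit (the
+ bin sizes, key construction, and compute_fn interface are assumptions, not
+ Hash3D's actual implementation):
+
+ import hashlib
+
+ class FeatureHash:
+     # Cache rendered feature maps keyed by a quantized (camera pose, timestep)
+     # grid cell, so nearby views and timesteps can reuse an earlier result.
+     def __init__(self, pose_bin=0.1, t_bin=10):
+         self.pose_bin, self.t_bin, self.store = pose_bin, t_bin, {}
+
+     def _key(self, camera_pose, timestep):
+         cell = tuple(round(p / self.pose_bin) for p in camera_pose)
+         raw = repr((cell, int(timestep) // self.t_bin)).encode()
+         return hashlib.sha1(raw).hexdigest()
+
+     def get_or_compute(self, camera_pose, timestep, compute_fn):
+         k = self._key(camera_pose, timestep)
+         if k not in self.store:                  # cache miss: run the model
+             self.store[k] = compute_fn(camera_pose, timestep)
+         return self.store[k]
+
+ # Usage: cache.get_or_compute(pose, t, lambda p, t: render_features(p, t)),
+ # where render_features stands in for the diffusion model's feature pass.
+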
+
+ comment: https://adamdad.github.io/hash3D/ +
+
+
+
+
+ + ☆ Using Few-Shot Learning to Classify Primary Lung Cancer and Other + Malignancy with Lung Metastasis in Cytological Imaging via Endobronchial + Ultrasound Procedures + + +
+ This study aims to establish a computer-aided diagnosis system for
+ endobronchial ultrasound (EBUS) surgery to assist physicians in the preliminary
+ diagnosis of metastatic cancer. This involves arranging immediate examinations
+ for other sites of metastatic cancer after EBUS surgery, eliminating the need
+ to wait for reports, thereby shortening the waiting time by more than half and
+ enabling patients to detect other cancers earlier, allowing for early planning
+ and implementation of treatment plans. Unlike previous studies on cell image
+ classification, which have abundant datasets for training, this study must also
+ be able to make effective classifications despite the limited amount of case
+ data for lung metastatic cancer. In the realm of small-dataset classification
+ methods, few-shot learning (FSL) has become mainstream in recent years. Through
+ its ability to train on small datasets and its strong generalization
+ capabilities, FSL shows potential in this task of lung metastatic cell image
+ classification. This study adopts a few-shot learning approach, referencing
+ existing models and designing a model architecture for classifying lung
+ metastasis cell images. Batch Spectral Regularization (BSR) is incorporated as
+ a loss update parameter, and the Finetune method of PMF is modified. In terms
+ of test results, adding BSR and the modified Finetune method further increases
+ the accuracy by 8.89% to 65.60%, outperforming other FSL methods. This study
+ confirms that FSL is superior to supervised and transfer learning in
+ classifying metastatic cancer and demonstrates that using BSR as a loss
+ function and modifying Finetune can enhance the model's capabilities.
+
+
+
+
+
+ + ☆ LIPT: Latency-aware Image Processing Transformer + + +
+ Transformer is leading a trend in the field of image processing. Despite the
+ great success that existing lightweight image processing transformers have
+ achieved, they are tailored to FLOPs or parameter reduction, rather than
+ practical inference acceleration. In this paper, we present a latency-aware
+ image processing transformer, termed LIPT. We devise the low-latency proportion
+ LIPT block that substitutes memory-intensive operators with the combination of
+ self-attention and convolutions to achieve practical speedup. Specifically, we
+ propose a novel non-volatile sparse masking self-attention (NVSM-SA) that
+ utilizes a pre-computed sparse mask to capture contextual information from a
+ larger window with no extra computation overhead. Besides, a high-frequency
+ reparameterization module (HRM) is proposed to make the LIPT block
+ reparameterization friendly, which improves the model's detail reconstruction
+ capability. Extensive experiments on multiple image processing tasks (e.g.,
+ image super-resolution (SR), JPEG artifact reduction, and image denoising)
+ demonstrate the superiority of LIPT on both latency and PSNR. LIPT achieves
+ real-time GPU inference with state-of-the-art performance on multiple image SR
+ benchmarks.
+
+
+
+
+
+ + ☆ Unified Entropy Optimization for Open-Set Test-Time Adaptation CVPR 2024 + + +
+ Test-time adaptation (TTA) aims at adapting a model pre-trained on the +labeled source domain to the unlabeled target domain. Existing methods usually +focus on improving TTA performance under covariate shifts, while neglecting +semantic shifts. In this paper, we delve into a realistic open-set TTA setting +where the target domain may contain samples from unknown classes. Many +state-of-the-art closed-set TTA methods perform poorly when applied to open-set +scenarios, which can be attributed to the inaccurate estimation of data +distribution and model confidence. To address these issues, we propose a simple +but effective framework called unified entropy optimization (UniEnt), which is +capable of simultaneously adapting to covariate-shifted in-distribution (csID) +data and detecting covariate-shifted out-of-distribution (csOOD) data. +Specifically, UniEnt first mines pseudo-csID and pseudo-csOOD samples from test +data, followed by entropy minimization on the pseudo-csID data and entropy +maximization on the pseudo-csOOD data. Furthermore, we introduce UniEnt+ to +alleviate the noise caused by hard data partition leveraging sample-level +confidence. Extensive experiments on CIFAR benchmarks and Tiny-ImageNet-C show +the superiority of our framework. The code is available at +https://github.com/gaozhengqing/UniEnt + +
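+
+ A minimal sketch of the entropy objective described above (assuming a
+ pseudo-OOD mask has already been mined; the mining step and the sample-level
+ confidence weighting of UniEnt+ are omitted):
+
+ import torch
+ import torch.nn.functional as F
+
+ def unified_entropy_loss(logits, ood_mask):
+     # Entropy minimization on pseudo-csID samples and entropy maximization on
+     # pseudo-csOOD samples; ood_mask is a boolean tensor marking pseudo-OOD rows.
+     probs = F.softmax(logits, dim=1)
+     entropy = -(probs * torch.log(probs + 1e-8)).sum(dim=1)  # per-sample entropy
+     id_loss = entropy[~ood_mask].mean() if (~ood_mask).any() else logits.new_zeros(())
+     ood_loss = -entropy[ood_mask].mean() if ood_mask.any() else logits.new_zeros(())
+     return id_loss + ood_loss
+
+ logits = torch.randn(8, 10)
+ ood_mask = torch.tensor([0, 0, 1, 0, 1, 0, 0, 1], dtype=torch.bool)
+ print(unified_entropy_loss(logits, ood_mask))
+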
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Unified Multi-modal Diagnostic Framework with Reconstruction + Pre-training and Heterogeneity-combat Tuning + + +
+ Medical multi-modal pre-training has revealed promise in computer-aided +diagnosis by leveraging large-scale unlabeled datasets. However, existing +methods based on masked autoencoders mainly rely on data-level reconstruction +tasks, but lack high-level semantic information. Furthermore, two significant +heterogeneity challenges hinder the transfer of pre-trained knowledge to +downstream tasks, \textit{i.e.}, the distribution heterogeneity between +pre-training data and downstream data, and the modality heterogeneity within +downstream data. To address these challenges, we propose a Unified Medical +Multi-modal Diagnostic (UMD) framework with tailored pre-training and +downstream tuning strategies. Specifically, to enhance the representation +abilities of vision and language encoders, we propose the Multi-level +Reconstruction Pre-training (MR-Pretrain) strategy, including a feature-level +and data-level reconstruction, which guides models to capture the semantic +information from masked inputs of different modalities. Moreover, to tackle two +kinds of heterogeneities during the downstream tuning, we present the +heterogeneity-combat downstream tuning strategy, which consists of a +Task-oriented Distribution Calibration (TD-Calib) and a Gradient-guided +Modality Coordination (GM-Coord). In particular, TD-Calib fine-tunes the +pre-trained model regarding the distribution of downstream datasets, and +GM-Coord adjusts the gradient weights according to the dynamic optimization +status of different modalities. Extensive experiments on five public medical +datasets demonstrate the effectiveness of our UMD framework, which remarkably +outperforms existing approaches on three kinds of downstream tasks. + +
+
+ comment: to be published in IEEE JBHI; Code available at + https://github.com/helenypzhang/UMD +
+
+
+
+
+ + ☆ Incremental Joint Learning of Depth, Pose and Implicit Scene + Representation on Monocular Camera in Large-scale Scenes + + +
+ Dense scene reconstruction for photo-realistic view synthesis has various
+ applications, such as VR/AR and autonomous vehicles. However, most existing
+ methods have difficulties in large-scale scenes due to three core challenges:
+ \textit{(a) inaccurate depth input.} Accurate depth input is impossible to
+ obtain in real-world large-scale scenes. \textit{(b) inaccurate pose
+ estimation.} Most existing approaches rely on accurate pre-estimated camera
+ poses. \textit{(c) insufficient scene representation capability.} A single
+ global radiance field lacks the capacity to effectively scale to large-scale
+ scenes. To this end, we propose an incremental joint learning framework, which
+ can achieve accurate depth, pose estimation, and large-scale scene
+ reconstruction. A vision transformer-based network is adopted as the backbone
+ to enhance performance in scale information estimation. For pose estimation, a
+ feature-metric bundle adjustment (FBA) method is designed for accurate and
+ robust camera tracking in large-scale scenes. In terms of implicit scene
+ representation, we propose an incremental scene representation method to
+ construct the entire large-scale scene as multiple local radiance fields to
+ enhance the scalability of 3D scene representation. Extensive experiments have
+ been conducted to demonstrate the effectiveness and accuracy of our method in
+ depth estimation, pose estimation, and large-scale scene reconstruction.
+
+
+
+
+
+ + ☆ Object Dynamics Modeling with Hierarchical Point Cloud-based + Representations CVPR 2024 + + +
+ Modeling object dynamics with a neural network is an important problem with +numerous applications. Most recent work has been based on graph neural +networks. However, physics happens in 3D space, where geometric information +potentially plays an important role in modeling physical phenomena. In this +work, we propose a novel U-net architecture based on continuous point +convolution which naturally embeds information from 3D coordinates and allows +for multi-scale feature representations with established downsampling and +upsampling procedures. Bottleneck layers in the downsampled point clouds lead +to better long-range interaction modeling. Besides, the flexibility of point +convolutions allows our approach to generalize to sparsely sampled points from +mesh vertices and dynamically generate features on important interaction points +on mesh faces. Experimental results demonstrate that our approach significantly +improves the state-of-the-art, especially in scenarios that require accurate +gravity or collision reasoning. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Space-Time Video Super-resolution with Neural Operator + + +
+ This paper addresses the task of space-time video super-resolution (ST-VSR). +Existing methods generally suffer from inaccurate motion estimation and motion +compensation (MEMC) problems for large motions. Inspired by recent progress in +physics-informed neural networks, we model the challenges of MEMC in ST-VSR as +a mapping between two continuous function spaces. Specifically, our approach +transforms independent low-resolution representations in the coarse-grained +continuous function space into refined representations with enriched +spatiotemporal details in the fine-grained continuous function space. To +achieve efficient and accurate MEMC, we design a Galerkin-type attention +function to perform frame alignment and temporal interpolation. Due to the +linear complexity of the Galerkin-type attention mechanism, our model avoids +patch partitioning and offers global receptive fields, enabling precise +estimation of large motions. The experimental results show that the proposed +method surpasses state-of-the-art techniques in both fixed-size and continuous +space-time video super-resolution tasks. + +
+
+
+
+
+ + ☆ Little Strokes Fell Great Oaks: Boosting the Hierarchical Features for + Multi-exposure Image Fusion + + +
+ In recent years, deep learning networks have made remarkable strides in the
+ domain of multi-exposure image fusion. Nonetheless, prevailing approaches often
+ involve directly feeding over-exposed and under-exposed images into the
+ network, which leads to the under-utilization of inherent information present
+ in the source images. Additionally, unsupervised techniques predominantly
+ employ rudimentary weighted summation for color channel processing, culminating
+ in an overall desaturated final image tone. To partially mitigate these issues,
+ this study proposes a gamma correction module specifically designed to fully
+ leverage latent information embedded within source images. Furthermore, a
+ modified transformer block equipped with self-attention mechanisms is
+ introduced to optimize the fusion process. Ultimately, a novel color
+ enhancement algorithm is presented to augment image saturation while preserving
+ intricate details. The source code is available at
+ https://github.com/ZhiyingDu/BHFMEF.
+
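+
+ A minimal sketch of gamma correction as a pre-fusion step (the fixed,
+ hand-picked gamma values are for illustration only; the paper's module
+ presumably derives them from the source images):
+
+ import numpy as np
+
+ def gamma_correct(image, gamma):
+     # Apply gamma correction to an image in [0, 1]; gamma < 1 brightens
+     # (reveals detail in under-exposed inputs), gamma > 1 darkens.
+     return np.clip(image, 0.0, 1.0) ** gamma
+
+ # Illustrative pre-processing of an exposure pair before fusion.
+ under = np.random.rand(64, 64, 3) * 0.3       # stand-in under-exposed image
+ over = 0.7 + np.random.rand(64, 64, 3) * 0.3  # stand-in over-exposed image
+ under_adj = gamma_correct(under, 0.5)         # brighten the dark input
+ over_adj = gamma_correct(over, 2.0)           # recover detail in the bright input
+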
+
+
+
+
+ + ☆ Improving Facial Landmark Detection Accuracy and Efficiency with + Knowledge Distillation ICME 2024 + + +
+ The domain of computer vision has experienced significant advancements in
+ facial-landmark detection, becoming increasingly essential across various
+ applications such as augmented reality, facial recognition, and emotion
+ analysis. Unlike object detection or semantic segmentation, which focus on
+ identifying objects and outlining boundaries, facial-landmark detection aims to
+ precisely locate and track critical facial features. However, deploying deep
+ learning-based facial-landmark detection models on embedded systems with
+ limited computational resources poses challenges due to the complexity of
+ facial features, especially in dynamic settings. Additionally, ensuring
+ robustness across diverse ethnicities and expressions presents further
+ obstacles. Existing datasets often lack comprehensive representation of facial
+ nuances, particularly within populations like those in Taiwan. This paper
+ introduces a novel approach to address these challenges through the development
+ of a knowledge distillation method. By transferring knowledge from larger
+ models to smaller ones, we aim to create lightweight yet powerful deep learning
+ models tailored specifically for facial-landmark detection tasks. Our goal is
+ to design models capable of accurately locating facial landmarks under varying
+ conditions, including diverse expressions, orientations, and lighting
+ environments. The ultimate objective is to achieve high accuracy and real-time
+ performance suitable for deployment on embedded systems. This method was
+ successfully implemented and achieved a top 6th place finish out of 165
+ participants in the IEEE ICME 2024 PAIR competition.
+
+
+ comment: technical report. 6th/165 in IEEE ICME 2024 PAIR competition +
+
+
+
+
+ + ☆ Greedy-DiM: Greedy Algorithms for Unreasonably Effective Face Morphs + + +
+ Morphing attacks, which aim to create a single image that contains the
+ biometric information of multiple identities, are an emerging threat to
+ state-of-the-art Face Recognition (FR) systems. Diffusion Morphs (DiM) are a
+ recently proposed morphing attack that has achieved state-of-the-art
+ performance for representation-based morphing attacks. However, none of the
+ existing research on DiMs has leveraged the iterative nature of DiMs, leaving
+ the DiM model as a black box and treating it no differently than one would a
+ Generative Adversarial Network (GAN) or Variational AutoEncoder (VAE). We
+ propose a greedy strategy on the iterative sampling process of DiM models which
+ searches for an optimal step guided by an identity-based heuristic function. We
+ compare our proposed algorithm against ten other state-of-the-art morphing
+ algorithms using the open-source SYN-MAD 2022 competition dataset. We find that
+ our proposed algorithm is unreasonably effective, fooling all of the tested FR
+ systems with an MMPMR of 100%, outperforming all other morphing algorithms
+ compared.
+
+
+ comment: Initial preprint. Under review +
+
+
+
+
+ + ☆ Band-Attention Modulated RetNet for Face Forgery Detection + + +
+ Transformer networks are extensively utilized in face forgery detection due to
+ their scalability across large datasets. Despite their success, transformers
+ face challenges in balancing the capture of global context, which is crucial
+ for unveiling forgery clues, with computational complexity. To mitigate this
+ issue, we introduce Band-Attention modulated RetNet (BAR-Net), a lightweight
+ network designed to efficiently process extensive visual contexts while
+ avoiding catastrophic forgetting. Our approach empowers the target token to
+ perceive global information by assigning differential attention levels to
+ tokens at varying distances. We implement self-attention along both spatial
+ axes, thereby maintaining spatial priors and easing the computational burden.
+ Moreover, we present the adaptive frequency Band-Attention Modulation
+ mechanism, which treats the entire Discrete Cosine Transform spectrogram as a
+ series of frequency bands with learnable weights. Together, BAR-Net achieves
+ favorable performance on several face forgery datasets, outperforming current
+ state-of-the-art methods.
+
+
+
+
+
+ + ☆ Diffusion-Based Point Cloud Super-Resolution for mmWave Radar Data + + +
+ The millimeter-wave radar sensor maintains stable performance under adverse
+ environmental conditions, making it a promising solution for all-weather
+ perception tasks, such as outdoor mobile robotics. However, the radar point
+ clouds are relatively sparse and contain massive ghost points, which greatly
+ limits the development of mmWave radar technology. In this paper, we propose a
+ novel point cloud super-resolution approach for 3D mmWave radar data, named
+ Radar-diffusion. Our approach employs the diffusion model defined by
+ mean-reverting stochastic differential equations (SDE). Using our proposed new
+ objective function with supervision from corresponding LiDAR point clouds, our
+ approach efficiently handles radar ghost points and enhances the sparse mmWave
+ radar point clouds to dense LiDAR-like point clouds. We evaluate our approach
+ on two different datasets, and the experimental results show that our method
+ outperforms the state-of-the-art baseline methods in 3D radar super-resolution
+ tasks. Furthermore, we demonstrate that our enhanced radar point cloud is
+ capable of downstream radar point-based registration tasks.
+
+
+
+
+
+ + ☆ Concept-Attention Whitening for Interpretable Skin Lesion Diagnosis + + +
+ The black-box nature of deep learning models has raised concerns about their +interpretability for successful deployment in real-world clinical applications. +To address the concerns, eXplainable Artificial Intelligence (XAI) aims to +provide clear and understandable explanations of the decision-making process. +In the medical domain, concepts such as attributes of lesions or abnormalities +serve as key evidence for deriving diagnostic results. However, existing +concept-based models mainly depend on concepts that appear independently and +require fine-grained concept annotations such as bounding boxes. A medical +image usually contains multiple concepts and the fine-grained concept +annotations are difficult to acquire. In this paper, we propose a novel +Concept-Attention Whitening (CAW) framework for interpretable skin lesion +diagnosis. CAW is comprised of a disease diagnosis branch and a concept +alignment branch. In the former branch, we train the CNN with a CAW layer +inserted to perform skin lesion diagnosis. The CAW layer decorrelates features +and aligns image features to conceptual meanings via an orthogonal matrix. In +the latter branch, we calculate the orthogonal matrix under the guidance of the +concept attention mask. We particularly introduce a weakly-supervised concept +mask generator that only leverages coarse concept labels for filtering local +regions that are relevant to certain concepts, improving the optimization of +the orthogonal matrix. Extensive experiments on two public skin lesion +diagnosis datasets demonstrated that CAW not only enhanced interpretability but +also maintained a state-of-the-art diagnostic performance. + +
+
+
+
+
+ + ☆ A Lightweight Measure of Classification Difficulty from Application + Dataset Characteristics + + +
+ Despite accuracy and computation benchmarks being widely available to help +choose among neural network models, these are usually trained on datasets with +many classes, and do not give a precise idea of performance for applications of +few (< 10) classes. The conventional procedure to predict performance is to +train and test repeatedly on the different models and dataset variations of +interest. However, this is computationally expensive. We propose an efficient +classification difficulty measure that is calculated from the number of classes +and intra- and inter-class similarity metrics of the dataset. After a single +stage of training and testing per model family, relative performance for +different datasets and models of the same family can be predicted by comparing +difficulty measures - without further training and testing. We show how this +measure can help a practitioner select a computationally efficient model for a +small dataset 6 to 29x faster than through repeated training and testing. We +give an example of use of the measure for an industrial application in which +options are identified to select a model 42% smaller than the baseline +YOLOv5-nano model, and if class merging from 3 to 2 classes meets requirements, +85% smaller. + +
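+
+ One plausible instantiation of such a difficulty measure (illustrative only:
+ the exact combination of class count and intra-/inter-class similarity used in
+ the paper is not reproduced here):
+
+ import numpy as np
+
+ def classification_difficulty(features, labels):
+     # Higher score when there are many classes and class means sit close
+     # together relative to the within-class spread.
+     classes = np.unique(labels)
+     means = np.stack([features[labels == c].mean(axis=0) for c in classes])
+     intra = np.mean([np.linalg.norm(features[labels == c] - means[i], axis=1).mean()
+                      for i, c in enumerate(classes)])
+     inter = np.mean([np.linalg.norm(means[i] - means[j])
+                      for i in range(len(classes))
+                      for j in range(i + 1, len(classes))])
+     return len(classes) * intra / (inter + 1e-8)
+
+ feats = np.random.randn(60, 16)
+ labs = np.repeat(np.arange(3), 20)
+ print(classification_difficulty(feats, labs))
+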
+
+ comment: 13 pages, 3 figures +
+
+
+
+
+ + ☆ Tackling Structural Hallucination in Image Translation with Local + Diffusion + + +
+ Recent developments in diffusion models have advanced conditioned image +generation, yet they struggle with reconstructing out-of-distribution (OOD) +images, such as unseen tumors in medical images, causing ``image +hallucination'' and risking misdiagnosis. We hypothesize such hallucinations +result from local OOD regions in the conditional images. We verify that +partitioning the OOD region and conducting separate image generations +alleviates hallucinations in several applications. From this, we propose a +training-free diffusion framework that reduces hallucination with multiple +Local Diffusion processes. Our approach involves OOD estimation followed by two +modules: a ``branching'' module generates locally both within and outside OOD +regions, and a ``fusion'' module integrates these predictions into one. Our +evaluation shows our method mitigates hallucination over baseline models +quantitatively and qualitatively, reducing misdiagnosis by 40% and 25% in the +real-world medical and natural image datasets, respectively. It also +demonstrates compatibility with various pre-trained diffusion models. + +
+
+
+
+
+ + ☆ StoryImager: A Unified and Efficient Framework for Coherent Story + Visualization and Completion + + +
+ Story visualization aims to generate a series of realistic and coherent +images based on a storyline. Current models adopt a frame-by-frame architecture +by transforming the pre-trained text-to-image model into an auto-regressive +manner. Although these models have shown notable progress, there are still +three flaws. 1) The unidirectional generation of auto-regressive manner +restricts the usability in many scenarios. 2) The additional introduced story +history encoders bring an extremely high computational cost. 3) The story +visualization and continuation models are trained and inferred independently, +which is not user-friendly. To these ends, we propose a bidirectional, unified, +and efficient framework, namely StoryImager. The StoryImager enhances the +storyboard generative ability inherited from the pre-trained text-to-image +model for a bidirectional generation. Specifically, we introduce a Target Frame +Masking Strategy to extend and unify different story image generation tasks. +Furthermore, we propose a Frame-Story Cross Attention Module that decomposes +the cross attention for local fidelity and global coherence. Moreover, we +design a Contextual Feature Extractor to extract contextual information from +the whole storyline. The extensive experimental results demonstrate the +excellent performance of our StoryImager. The code is available at +https://github.com/tobran/StoryImager. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ JSTR: Judgment Improves Scene Text Recognition + + +
+ In this paper, we present a method for enhancing the accuracy of scene text +recognition tasks by judging whether the image and text match each other. While +previous studies focused on generating the recognition results from input +images, our approach also considers the model's misrecognition results to +understand its error tendencies, thus improving the text recognition pipeline. +This method boosts text recognition accuracy by providing explicit feedback on +the data that the model is likely to misrecognize by predicting correct or +incorrect between the image and text. The experimental results on publicly +available datasets demonstrate that our proposed method outperforms the +baseline and state-of-the-art methods in scene text recognition. + +
+
+ comment: IntelliSys 2024 +
+
+
+
+
+ + ☆ EasyTrack: Efficient and Compact One-stream 3D Point Clouds Tracker + + +
+ Most 3D single object trackers (SOT) for point clouds follow the two-stream,
+ multi-stage 3D Siamese or motion tracking paradigms, which process the template
+ and search area point clouds with two parallel branches, built on supervised
+ point cloud backbones. In this work, beyond typical 3D Siamese or motion
+ tracking, we propose a neat and compact one-stream transformer 3D SOT paradigm
+ from a novel perspective, termed \textbf{EasyTrack}, which consists of
+ three special designs: 1) A 3D point clouds tracking feature pre-training
+ module is developed to exploit masked autoencoding for learning 3D point
+ clouds tracking representations. 2) A unified 3D tracking feature learning and
+ fusion network is proposed to simultaneously learn target-aware 3D features
+ and extensively capture mutual correlations through the flexible self-attention
+ mechanism. 3) A target location network in the dense bird's eye view (BEV)
+ feature space is constructed for target classification and regression.
+ Moreover, we develop an enhanced version named EasyTrack++, which designs the
+ center points interaction (CPI) strategy to reduce the ambiguous targets caused
+ by the noisy point cloud background information. The proposed EasyTrack and
+ EasyTrack++ set a new state-of-the-art performance ($\textbf{18\%}$,
+ $\textbf{40\%}$ and $\textbf{3\%}$ success gains) on KITTI, NuScenes, and Waymo
+ while running at \textbf{52.6fps} with few parameters (\textbf{1.3M}). The code
+ will be available at https://github.com/KnightApple427/Easytrack.
+
+
+
+
+
+ + ☆ Prompt-driven Universal Model for View-Agnostic Echocardiography + Analysis + + +
+ Echocardiography segmentation for cardiac analysis is time-consuming and
+ resource-intensive due to the variability in image quality and the necessity to
+ process scans from various standard views. While current automated segmentation
+ methods in echocardiography show promising performance, they are trained on
+ specific scan views to analyze corresponding data. However, this solution has a
+ limitation as the number of required models increases with the number of
+ standard views. To address this, in this paper, we present a prompt-driven
+ universal method for view-agnostic echocardiography analysis. Considering the
+ domain shift between standard views, we first introduce a method called prompt
+ matching, aimed at learning prompts specific to different views by matching
+ prompts and querying input embeddings using a pre-trained vision model. Then,
+ we utilize a pre-trained medical language model to align textual information
+ with pixel data for accurate segmentation. Extensive experiments on three
+ standard views showed that our approach significantly outperforms the
+ state-of-the-art universal methods and achieves comparable or even better
+ performance than segmentation models trained and tested on the same views.
+
+
+
+
+
+ + ☆ LATUP-Net: A Lightweight 3D Attention U-Net with Parallel Convolutions + for Brain Tumor Segmentation + + +
+ Early-stage 3D brain tumor segmentation from magnetic resonance imaging (MRI) +scans is crucial for prompt and effective treatment. However, this process +faces the challenge of precise delineation due to the tumors' complex +heterogeneity. Moreover, energy sustainability targets and resource +limitations, especially in developing countries, require efficient and +accessible medical imaging solutions. The proposed architecture, a Lightweight +3D ATtention U-Net with Parallel convolutions, LATUP-Net, addresses these +issues. It is specifically designed to reduce computational requirements +significantly while maintaining high segmentation performance. By incorporating +parallel convolutions, it enhances feature representation by capturing +multi-scale information. It further integrates an attention mechanism to refine +segmentation through selective feature recalibration. LATUP-Net achieves +promising segmentation performance: the average Dice scores for the whole +tumor, tumor core, and enhancing tumor on the BraTS2020 dataset are 88.41%, +83.82%, and 73.67%, and on the BraTS2021 dataset, they are 90.29%, 89.54%, and +83.92%, respectively. Hausdorff distance metrics further indicate its improved +ability to delineate tumor boundaries. With its significantly reduced +computational demand using only 3.07 M parameters, about 59 times fewer than +other state-of-the-art models, and running on a single V100 GPU, LATUP-Net +stands out as a promising solution for real-world clinical applications, +particularly in settings with limited resources. Investigations into the +model's interpretability, utilizing gradient-weighted class activation mapping +and confusion matrices, reveal that while attention mechanisms enhance the +segmentation of small regions, their impact is nuanced. Achieving the most +accurate tumor delineation requires carefully balancing local and global +features. + +
+
+
+
+
+ + ☆ Res-U2Net: Untrained Deep Learning for Phase Retrieval and Image + Reconstruction + + +
+ Conventional deep learning-based image reconstruction methods require a large +amount of training data which can be hard to obtain in practice. Untrained deep +learning methods overcome this limitation by training a network to invert a +physical model of the image formation process. Here we present a novel +untrained Res-U2Net model for phase retrieval. We use the extracted phase +information to determine changes in an object's surface and generate a mesh +representation of its 3D structure. We compare the performance of Res-U2Net +phase retrieval against UNet and U2Net using images from the GDXRAY dataset. + +
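+ The untrained-reconstruction idea above can be sketched in a few lines: a randomly initialized network is fitted to a single measurement through a differentiable forward model of image formation, so no training data is needed. The toy forward model, network, and sizes below are assumptions for illustration and do not reproduce Res-U2Net itself:

```python
# Deep-image-prior style fitting loop (illustrative, not the Res-U2Net model):
# the reconstruction is the output of a network optimized against one measurement.
import torch
import torch.nn as nn

def forward_model(phase):
    # Toy stand-in for the physical image-formation model (assumption):
    # far-field intensity of a unit-amplitude field carrying the predicted phase.
    field = torch.exp(1j * phase)
    return torch.abs(torch.fft.fft2(field)) ** 2

net = nn.Sequential(                      # tiny untrained generator (illustrative)
    nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(),
    nn.Conv2d(32, 32, 3, padding=1), nn.ReLU(),
    nn.Conv2d(32, 1, 3, padding=1))

z = torch.randn(1, 1, 64, 64)                            # fixed random input
measurement = forward_model(torch.full((64, 64), 0.3))   # pretend observed intensity
opt = torch.optim.Adam(net.parameters(), lr=1e-3)

for step in range(200):                   # fit the network to this single measurement
    phase = net(z)[0, 0]
    loss = torch.mean((forward_model(phase) - measurement) ** 2)
    opt.zero_grad(); loss.backward(); opt.step()

recovered_phase = net(z).detach()[0, 0]   # reconstruction = network output after fitting
```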
+
+ comment: 16 pages, 8 figures, 4 Tables +
+
+
+
+
+ + ☆ FlameFinder: Illuminating Obscured Fire through Smoke with Attentive + Deep Metric Learning + + +
+ FlameFinder is a deep metric learning (DML) framework designed to accurately detect flames, even when obscured by smoke, using thermal images from firefighter drones during wildfire monitoring. Traditional RGB cameras struggle in such conditions, but thermal cameras can capture smoke-obscured flame features. However, they lack absolute thermal reference points, leading to false positives. To address this issue, FlameFinder utilizes paired thermal-RGB images for training. By learning latent flame features from smoke-free samples, the model becomes less biased towards relative thermal gradients. In testing, it identifies flames in smoky patches by analyzing their equivalent thermal-domain distribution. This method improves performance on both supervised and distance-based clustering metrics. The framework incorporates a flame segmentation method and a DML-aided detection framework, utilizing center loss (CL), triplet center loss (TCL), and triplet cosine center loss (TCCL) to identify optimal cluster representatives for classification. However, the dominance of center loss over the other losses causes the model to miss features to which those losses are sensitive. To address this limitation, an attention mechanism is proposed that allows for non-uniform feature contribution, amplifying the critical role of the cosine and triplet losses in the DML framework. It also improves interpretability and class discrimination, and decreases intra-class variance. As a result, the proposed model surpasses the baseline by 4.4% on the FLAME2 dataset and 7% on the FLAME3 dataset in unobscured flame detection accuracy. Moreover, it demonstrates enhanced class separation in obscured scenarios compared to VGG19, ResNet18, and three backbone models tailored for flame detection. + +
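+ Of the three metric losses mentioned above, the plain center-loss term is the easiest to sketch; the version below uses the standard formulation with learnable per-class centers, while the triplet variants and the attention-based weighting that FlameFinder adds on top are not reproduced here:

```python
# Standard center loss (illustrative): pull each embedding toward its class center.
import torch
import torch.nn as nn

class CenterLoss(nn.Module):
    def __init__(self, num_classes, feat_dim):
        super().__init__()
        self.centers = nn.Parameter(torch.randn(num_classes, feat_dim))

    def forward(self, features, labels):
        # mean_i ||f_i - c_{y_i}||^2 ; the centers are trained jointly with the network.
        return ((features - self.centers[labels]) ** 2).sum(dim=1).mean()

feats = torch.randn(8, 128)                # embeddings from the thermal backbone (placeholder)
labels = torch.randint(0, 2, (8,))         # e.g. flame vs. background
loss = CenterLoss(num_classes=2, feat_dim=128)(feats, labels)
```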
+
+ comment: Submitted as a Journal Paper to IEEE Transactions on Geoscience and + Remote Sensing +
+
+
+
+
+ + ☆ SAM-I-Am: Semantic Boosting for Zero-shot Atomic-Scale Electron + Micrograph Segmentation + + +
+ Image segmentation is a critical enabler for tasks ranging from medical +diagnostics to autonomous driving. However, the correct segmentation semantics +- where are boundaries located? what segments are logically similar? - change +depending on the domain, such that state-of-the-art foundation models can +generate meaningless and incorrect results. Moreover, in certain domains, +fine-tuning and retraining techniques are infeasible: obtaining labels is +costly and time-consuming; domain images (micrographs) can be exponentially +diverse; and data sharing (for third-party retraining) is restricted. To enable +rapid adaptation of the best segmentation technology, we propose the concept of +semantic boosting: given a zero-shot foundation model, guide its segmentation +and adjust results to match domain expectations. We apply semantic boosting to +the Segment Anything Model (SAM) to obtain microstructure segmentation for +transmission electron microscopy. Our booster, SAM-I-Am, extracts geometric and +textural features of various intermediate masks to perform mask removal and +mask merging operations. We demonstrate a zero-shot performance increase of +(absolute) +21.35%, +12.6%, +5.27% in mean IoU, and a -9.91%, -18.42%, -4.06% +drop in mean false positive masks across images of three difficulty classes +over vanilla SAM (ViT-L). + +
+
+
+
+
+ + ☆ GeoSynth: Contextually-Aware High-Resolution Satellite Image Synthesis + + +
+ We present GeoSynth, a model for synthesizing satellite images with global +style and image-driven layout control. The global style control is via textual +prompts or geographic location. These enable the specification of scene +semantics or regional appearance respectively, and can be used together. We +train our model on a large dataset of paired satellite imagery, with +automatically generated captions, and OpenStreetMap data. We evaluate various +combinations of control inputs, including different types of layout controls. +Results demonstrate that our model can generate diverse, high-quality images +and exhibits excellent zero-shot generalization. The code and model checkpoints +are available at https://github.com/mvrl/GeoSynth. + +
+
+
+
+
+ + ☆ Calibrating Higher-Order Statistics for Few-Shot Class-Incremental + Learning with Pre-trained Vision Transformers CVPR 2024 + + +
+ Few-shot class-incremental learning (FSCIL) aims to adapt the model to new +classes from very few data (5 samples) without forgetting the previously +learned classes. Recent works in many-shot CIL (MSCIL) (using all available +training data) exploited pre-trained models to reduce forgetting and achieve +better plasticity. In a similar fashion, we use ViT models pre-trained on +large-scale datasets for few-shot settings, which face the critical issue of +low plasticity. FSCIL methods start with a many-shot first task to learn a very +good feature extractor and then move to the few-shot setting from the second +task onwards. While the focus of most recent studies is on how to learn the +many-shot first task so that the model generalizes to all future few-shot +tasks, we explore in this work how to better model the few-shot data using +pre-trained models, irrespective of how the first task is trained. Inspired by +recent works in MSCIL, we explore how using higher-order feature statistics can +influence the classification of few-shot classes. We identify the main +challenge of obtaining a good covariance matrix from few-shot data and propose +to calibrate the covariance matrix for new classes based on semantic similarity +to the many-shot base classes. Using the calibrated feature statistics in +combination with existing methods significantly improves few-shot continual +classification on several FSCIL benchmarks. Code is available at +https://github.com/dipamgoswami/FSCIL-Calibration. + +
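+ The covariance-calibration step described above can be sketched as borrowing second-order statistics from semantically similar base classes. The mixing rule below (a softmax-weighted average of base-class covariances blended with the noisy few-shot estimate) is an illustrative assumption, not the paper's exact formula:

```python
# Hedged sketch of covariance calibration for a few-shot class using base-class statistics.
import numpy as np

def calibrate_covariance(few_shot_feats, base_covs, base_protos, class_proto, alpha=0.5):
    sims = base_protos @ class_proto
    sims /= (np.linalg.norm(base_protos, axis=1) * np.linalg.norm(class_proto) + 1e-8)
    w = np.exp(sims) / np.exp(sims).sum()              # semantic-similarity weights
    borrowed = np.einsum('k,kij->ij', w, base_covs)    # weighted base-class covariance
    own = np.cov(few_shot_feats, rowvar=False)         # noisy estimate from few samples
    return alpha * own + (1 - alpha) * borrowed

d = 64
base_covs = np.stack([np.eye(d) * s for s in (0.5, 1.0, 2.0)])   # 3 base classes (toy)
base_protos = np.random.randn(3, d)                              # base-class prototypes
few_feats = np.random.randn(5, d)                                # a 5-shot novel class
cov_new = calibrate_covariance(few_feats, base_covs, base_protos, few_feats.mean(0))
```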
+
+ comment: Accepted at CLVision workshop (CVPR 2024) +
+
+
+
+
+ + ☆ RoadBEV: Road Surface Reconstruction in Bird's Eye View + + +
+ Road surface conditions, especially geometry profiles, enormously affect the driving performance of autonomous vehicles. Vision-based online road reconstruction promisingly captures road information in advance. Existing solutions like monocular depth estimation and stereo matching suffer from modest performance. The recent technique of Bird's-Eye-View (BEV) perception provides immense potential for more reliable and accurate reconstruction. This paper proposes two simple yet effective models for road elevation reconstruction in BEV, named RoadBEV-mono and RoadBEV-stereo, which estimate road elevation with monocular and stereo images, respectively. The former directly fits elevation values based on voxel features queried from the image view, while the latter efficiently recognizes road elevation patterns based on a BEV volume representing the discrepancy between left and right voxel features. Insightful analyses reveal their consistency with and differences from the perspective view. Experiments on a real-world dataset verify the models' effectiveness and superiority. The elevation errors of RoadBEV-mono and RoadBEV-stereo reach 1.83cm and 0.56cm, respectively. The estimation performance improves by 50\% in BEV with monocular images. Our models are promising for practical applications, providing valuable references for vision-based BEV perception in autonomous driving. The code is released at https://github.com/ztsrxh/RoadBEV. + +
+
+ comment: Dataset page: https://thu-rsxd.com/rsrd Code: + https://github.com/ztsrxh/RoadBEV +
+
+
+
+
+ + ☆ Spatially Optimized Compact Deep Metric Learning Model for Similarity + Search + + +
+ Spatial optimization is often overlooked in many computer vision tasks. Filters should be able to recognize the features of an object regardless of where it is in the image. Similarity search is a crucial task in which spatial features determine an important part of the output. The capacity of convolution to capture visual patterns across various locations is limited. In contrast to convolution, the involution kernel is dynamically created at each pixel based on the pixel value and learned parameters. This study demonstrates that utilizing a single involution feature-extractor layer alongside a compact convolution model significantly enhances the performance of similarity search. Additionally, we improve predictions by using the GELU activation function rather than ReLU. The negligible number of weight parameters added by involution, combined with the compact model's better performance, makes the model very useful in real-world implementations. Our proposed model is below 1 megabyte in size. We have experimented with our proposed methodology and other models on the CIFAR-10, FashionMNIST, and MNIST datasets. Our proposed method outperforms the alternatives across all three datasets. + +
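+ For reference, a single involution layer as described above can be written compactly: a small network generates a K x K kernel at every pixel from that pixel's features, and the kernel is applied to the pixel's neighbourhood. The sketch below uses stride 1, a single group, and illustrative hyper-parameters, following the general involution recipe rather than this paper's exact configuration:

```python
# Minimal single-group involution layer (illustrative hyper-parameters).
import torch
import torch.nn as nn

class Involution2d(nn.Module):
    def __init__(self, channels, kernel_size=3, reduction=4):
        super().__init__()
        self.k = kernel_size
        self.kernel_gen = nn.Sequential(              # per-pixel kernel from pixel features
            nn.Conv2d(channels, channels // reduction, 1), nn.ReLU(),
            nn.Conv2d(channels // reduction, kernel_size * kernel_size, 1))
        self.unfold = nn.Unfold(kernel_size, padding=kernel_size // 2)

    def forward(self, x):
        b, c, h, w = x.shape
        kernels = self.kernel_gen(x).view(b, 1, self.k * self.k, h, w)   # (B,1,K*K,H,W)
        patches = self.unfold(x).view(b, c, self.k * self.k, h, w)       # local neighbourhoods
        return (kernels * patches).sum(dim=2)                            # per-pixel weighted sum

x = torch.randn(2, 32, 28, 28)
y = Involution2d(32)(x)          # output keeps the input shape: (2, 32, 28, 28)
```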
+
+ comment: 5 pages, 3 figures, +
+
+
+
+
+ + ☆ Leveraging Latents for Efficient Thermography Classification and + Segmentation + + +
+ Breast cancer is a prominent health concern worldwide, currently being the second most common and second-deadliest type of cancer in women. While current breast cancer diagnosis mainly relies on mammography imaging, in recent years the use of thermography for breast cancer imaging has been garnering growing popularity. Thermographic imaging relies on infrared cameras to capture body-emitted heat distributions. While these heat signatures have proven useful for computer-vision systems for accurate breast cancer segmentation and classification, prior work often relies on handcrafted feature engineering or complex architectures, potentially limiting the comparability and applicability of these methods. In this work, we present a novel algorithm for both breast cancer classification and segmentation. Rather than focusing efforts on manual feature and architecture engineering, our algorithm leverages an informative, learned feature space, making our solution simpler to use, easier to extend to other frameworks and downstream tasks, and more applicable to data-scarce settings. Our classification produces SOTA results, and ours is the first work to produce segmentations for the regions studied in this paper. + +
+
+
+
+
+ + ☆ The Impact of Print-and-Scan in Heterogeneous Morph Evaluation Scenarios + + +
+ Face morphing attacks present an emerging threat to face recognition systems. On top of that, printing and scanning the morphed images could obscure the artifacts generated during the morphing process, which makes morphed image detection even harder. In this work, we investigate the impact that printing and scanning have on morphing attacks through a series of heterogeneous tests. Our experiments show that providing a printed-and-scanned image to a Face Recognition (FR) system, regardless of whether it is morphed or bona fide, can increase the possibility of a false match by up to 5.64% for DiM and 16.00% for StyleGAN2. Likewise, using the Frechet Inception Distance (FID) metric, strictly print-scanned morph attacks performed on average 9.185% stronger than non-print-scanned digital morphs. + +
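+ The Frechet Inception Distance used for the 9.185% comparison follows its standard definition over feature statistics; the snippet below computes it from (mean, covariance) pairs, with random placeholders standing in for Inception features of the two image sets:

```python
# Standard FID between two sets of features (feature extraction omitted).
import numpy as np
from scipy import linalg

def fid(mu1, sigma1, mu2, sigma2):
    diff = mu1 - mu2
    covmean, _ = linalg.sqrtm(sigma1 @ sigma2, disp=False)   # matrix square root
    covmean = covmean.real                                   # drop tiny imaginary parts
    return diff @ diff + np.trace(sigma1 + sigma2 - 2.0 * covmean)

feats_a = np.random.randn(500, 64)   # e.g. features of print-scanned morphs (placeholder)
feats_b = np.random.randn(500, 64)   # e.g. features of bona fide images (placeholder)
score = fid(feats_a.mean(0), np.cov(feats_a, rowvar=False),
            feats_b.mean(0), np.cov(feats_b, rowvar=False))
```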
+
+ comment: Initial preprint. Under review +
+
+
+
+
+ + ☆ Training-Free Open-Vocabulary Segmentation with Offline + Diffusion-Augmented Prototype Generation CVPR 2024 + + +
+ Open-vocabulary semantic segmentation aims at segmenting arbitrary categories +expressed in textual form. Previous works have trained over large amounts of +image-caption pairs to enforce pixel-level multimodal alignments. However, +captions provide global information about the semantics of a given image but +lack direct localization of individual concepts. Further, training on +large-scale datasets inevitably brings significant computational costs. In this +paper, we propose FreeDA, a training-free diffusion-augmented method for +open-vocabulary semantic segmentation, which leverages the ability of diffusion +models to visually localize generated concepts and local-global similarities to +match class-agnostic regions with semantic classes. Our approach involves an +offline stage in which textual-visual reference embeddings are collected, +starting from a large set of captions and leveraging visual and semantic +contexts. At test time, these are queried to support the visual matching +process, which is carried out by jointly considering class-agnostic regions and +global semantic similarities. Extensive analyses demonstrate that FreeDA +achieves state-of-the-art performance on five datasets, surpassing previous +methods by more than 7.0 average points in terms of mIoU and without requiring +any training. + +
+
+ comment: CVPR 2024. Project page: https://aimagelab.github.io/freeda/ +
+
+
+
+
+ + ♻ ☆ Zero-shot Referring Expression Comprehension via Structural Similarity + Between Images and Captions CVPR 2024 + + +
+ Zero-shot referring expression comprehension aims at localizing bounding boxes in an image corresponding to provided textual prompts, which requires: (i) a fine-grained disentanglement of a complex visual scene and textual context, and (ii) a capacity to understand relationships among disentangled entities. Unfortunately, existing large vision-language alignment (VLA) models, e.g., CLIP, struggle with both aspects and so cannot be directly used for this task. To mitigate this gap, we leverage large foundation models to disentangle both images and texts into triplets in the format of (subject, predicate, object). After that, grounding is accomplished by calculating the structural similarity matrix between visual and textual triplets with a VLA model and subsequently propagating it to an instance-level similarity matrix. Furthermore, to equip VLA models with the ability of relationship understanding, we design a triplet-matching objective to fine-tune the VLA models on a curated dataset containing abundant entity relationships. Experiments demonstrate that our visual grounding performance increases by up to 19.5% over the SOTA zero-shot model on RefCOCO/+/g. On the more challenging Who's Waldo dataset, our zero-shot approach achieves accuracy comparable to the fully supervised model. Code is available at https://github.com/Show-han/Zeroshot_REC. + +
+
+ comment: CVPR 2024, Code available at https://github.com/Show-han/Zeroshot_REC +
+
+
+
+
+ + ♻ ☆ Multi-person 3D pose estimation from unlabelled data + + +
+ Its numerous applications make multi-human 3D pose estimation a remarkably +impactful area of research. Nevertheless, assuming a multiple-view system +composed of several regular RGB cameras, 3D multi-pose estimation presents +several challenges. First of all, each person must be uniquely identified in +the different views to separate the 2D information provided by the cameras. +Secondly, the 3D pose estimation process from the multi-view 2D information of +each person must be robust against noise and potential occlusions in the +scenario. In this work, we address these two challenges with the help of deep +learning. Specifically, we present a model based on Graph Neural Networks +capable of predicting the cross-view correspondence of the people in the +scenario along with a Multilayer Perceptron that takes the 2D points to yield +the 3D poses of each person. These two models are trained in a self-supervised +manner, thus avoiding the need for large datasets with 3D annotations. + +
+
+
+
+
+ + ♻ ☆ Influencer Backdoor Attack on Semantic Segmentation + + +
+ When a small number of poisoned samples are injected into the training dataset of a deep neural network, the network can be induced to exhibit malicious behavior during inference, posing potential threats to real-world applications. While backdoor attacks have been intensively studied in classification, they have been largely overlooked for semantic segmentation. Unlike classification, semantic segmentation aims to classify every pixel within a given image. In this work, we explore backdoor attacks on segmentation models that misclassify all pixels of a victim class by injecting a specific trigger on non-victim pixels during inference, which we dub the Influencer Backdoor Attack (IBA). IBA is expected to maintain the classification accuracy of non-victim pixels and mislead the classification of all victim pixels in every single inference, and it can easily be applied to real-world scenes. Based on the context aggregation ability of segmentation models, we propose a simple yet effective Nearest-Neighbor trigger injection strategy. We also introduce an innovative Pixel Random Labeling strategy which maintains optimal performance even when the trigger is placed far from the victim pixels. Our extensive experiments reveal that current segmentation models do suffer from backdoor attacks, demonstrate IBA's real-world applicability, and show that our proposed techniques can further increase attack performance. + +
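+ The Nearest-Neighbor injection idea above can be sketched as pasting a small trigger patch on the non-victim pixel closest to the victim-class region, so the model's context aggregation links the trigger to the victim pixels. The patch content, margin handling, and placement rule below are illustrative assumptions rather than the paper's exact procedure:

```python
# Hedged sketch of nearest-neighbor trigger injection for a segmentation backdoor.
import numpy as np
from scipy import ndimage

def inject_trigger(image, seg_mask, victim_id, trigger, margin=2):
    dist = ndimage.distance_transform_edt(seg_mask != victim_id)  # distance to victim region
    dist[dist <= margin] = np.inf            # exclude victim pixels and a small safety margin
    y, x = np.unravel_index(np.argmin(dist), dist.shape)          # nearest non-victim pixel
    h, w = trigger.shape[:2]
    out = image.copy()
    out[y:y + h, x:x + w] = trigger[:out.shape[0] - y, :out.shape[1] - x]
    return out

img = np.zeros((64, 64, 3), dtype=np.uint8)
mask = np.zeros((64, 64), dtype=np.int64); mask[20:40, 20:40] = 1     # victim class = 1
poisoned = inject_trigger(img, mask, victim_id=1,
                          trigger=np.full((4, 4, 3), 255, dtype=np.uint8))
```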
+
+
+
+
+ + ♻ ☆ An Edit Friendly DDPM Noise Space: Inversion and Manipulations CVPR 2024 + + +
+ Denoising diffusion probabilistic models (DDPMs) employ a sequence of white +Gaussian noise samples to generate an image. In analogy with GANs, those noise +maps could be considered as the latent code associated with the generated +image. However, this native noise space does not possess a convenient +structure, and is thus challenging to work with in editing tasks. Here, we +propose an alternative latent noise space for DDPM that enables a wide range of +editing operations via simple means, and present an inversion method for +extracting these edit-friendly noise maps for any given image (real or +synthetically generated). As opposed to the native DDPM noise space, the +edit-friendly noise maps do not have a standard normal distribution and are not +statistically independent across timesteps. However, they allow perfect +reconstruction of any desired image, and simple transformations on them +translate into meaningful manipulations of the output image (e.g. shifting, +color edits). Moreover, in text-conditional models, fixing those noise maps +while changing the text prompt, modifies semantics while retaining structure. +We illustrate how this property enables text-based editing of real images via +the diverse DDPM sampling scheme (in contrast to the popular non-diverse DDIM +inversion). We also show how it can be used within existing diffusion-based +editing methods to improve their quality and diversity. Webpage: +https://inbarhub.github.io/DDPM_inversion + +
+
+ comment: CVPR 2024. Code and examples are available at + https://github.com/inbarhub/DDPM_inversion +
+
+
+
+
+ + ♻ ☆ Event Data Association via Robust Model Fitting for Event-based Object + Tracking + + +
+ Event-based approaches, which are based on bio-inspired asynchronous event cameras, have achieved promising performance on various computer vision tasks. However, the study of the fundamental event data association problem is still in its infancy. In this paper, we propose a novel Event Data Association approach (called EDA) to explicitly address the event association and fusion problem. The proposed EDA seeks event trajectories that best fit the event data in order to perform unified data association and information fusion. In EDA, we first asynchronously fuse the event data based on its information entropy. Then, we introduce a deterministic model hypothesis generation strategy, which effectively generates model hypotheses from the fused events to represent the corresponding event trajectories. After that, we present a two-stage weighting algorithm, which robustly weighs and selects true models from the generated model hypotheses through multi-structural geometric model fitting. Meanwhile, we also propose an adaptive model selection strategy to automatically determine the number of true models. Finally, we use the selected true models to associate and fuse the event data without being affected by sensor noise and irrelevant structures. We evaluate the performance of the proposed EDA on the object tracking task. The experimental results show the effectiveness of EDA under challenging scenarios, such as high-speed, motion-blur, and high-dynamic-range conditions. + +
+
+ comment: 32 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ A Spatio-temporal Aligned SUNet Model for Low-light Video Enhancement + + +
+ Distortions caused by low-light conditions are not only visually unpleasant +but also degrade the performance of computer vision tasks. The restoration and +enhancement have proven to be highly beneficial. However, there are only a +limited number of enhancement methods explicitly designed for videos acquired +in low-light conditions. We propose a Spatio-Temporal Aligned SUNet (STA-SUNet) +model using a Swin Transformer as a backbone to capture low light video +features and exploit their spatio-temporal correlations. The STA-SUNet model is +trained on a novel, fully registered dataset (BVI), which comprises dynamic +scenes captured under varying light conditions. It is further analysed +comparatively against various other models over three test datasets. The model +demonstrates superior adaptivity across all datasets, obtaining the highest +PSNR and SSIM values. It is particularly effective in extreme low-light +conditions, yielding fairly good visualisation results. + +
+
+
+
+
+ + ♻ ☆ DIAGNOSIS: Detecting Unauthorized Data Usages in Text-to-image Diffusion + Models ICLR 2024 + + +
+ Recent text-to-image diffusion models have shown surprising performance in generating high-quality images. However, concerns have arisen regarding unauthorized data usage during the training or fine-tuning process. One example is when a model trainer collects a set of images created by a particular artist and attempts to train a model capable of generating similar images without obtaining permission from and giving credit to the artist. To address this issue, we propose a method for detecting such unauthorized data usage by planting injected memorization into text-to-image diffusion models trained on the protected dataset. Specifically, we modify the protected images by adding unique content using stealthy image warping functions that are nearly imperceptible to humans but can be captured and memorized by diffusion models. By analyzing whether the model has memorized the injected content (i.e., whether the generated images are processed by the injected post-processing function), we can detect models that illegally utilized the unauthorized data. Experiments on Stable Diffusion and VQ Diffusion with different model training or fine-tuning methods (i.e., LoRA, DreamBooth, and standard training) demonstrate the effectiveness of our proposed method in detecting unauthorized data usages. Code: https://github.com/ZhentingWang/DIAGNOSIS. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ DiffusionLight: Light Probes for Free by Painting a Chrome Ball CVPR 2024 + + +
+ We present a simple yet effective technique to estimate lighting in a single +input image. Current techniques rely heavily on HDR panorama datasets to train +neural networks to regress an input with limited field-of-view to a full +environment map. However, these approaches often struggle with real-world, +uncontrolled settings due to the limited diversity and size of their datasets. +To address this problem, we leverage diffusion models trained on billions of +standard images to render a chrome ball into the input image. Despite its +simplicity, this task remains challenging: the diffusion models often insert +incorrect or inconsistent objects and cannot readily generate images in HDR +format. Our research uncovers a surprising relationship between the appearance +of chrome balls and the initial diffusion noise map, which we utilize to +consistently generate high-quality chrome balls. We further fine-tune an LDR +diffusion model (Stable Diffusion XL) with LoRA, enabling it to perform +exposure bracketing for HDR light estimation. Our method produces convincing +light estimates across diverse settings and demonstrates superior +generalization to in-the-wild scenarios. + +
+
+ comment: CVPR 2024 Oral. For more information and code, please visit our + website https://diffusionlight.github.io/ +
+
+
+
+
+ + ♻ ☆ Learning Local and Global Temporal Contexts for Video Semantic + Segmentation CVPR + 2022 + + +
+ Contextual information plays a core role in video semantic segmentation (VSS). This paper summarizes contexts for VSS in two aspects: local temporal contexts (LTC), which define the contexts from neighboring frames, and global temporal contexts (GTC), which represent the contexts from the whole video. As for LTC, it includes static and motional contexts, corresponding to static and moving content in neighboring frames, respectively. Previously, both static and motional contexts have been studied. However, there is no research on simultaneously learning static and motional contexts, which are highly complementary. Hence, we propose a Coarse-to-Fine Feature Mining (CFFM) technique to learn a unified representation of LTC. CFFM contains two parts: Coarse-to-Fine Feature Assembling (CFFA) and Cross-frame Feature Mining (CFM). CFFA abstracts static and motional contexts, and CFM mines useful information from nearby frames to enhance target features. To further exploit more temporal contexts, we propose CFFM++ by additionally learning GTC from the whole video. Specifically, we uniformly sample certain frames from the video and extract global contextual prototypes by k-means. The information within those prototypes is mined by CFM to refine target features. Experimental results on popular benchmarks demonstrate that CFFM and CFFM++ perform favorably against state-of-the-art methods. Our code is available at https://github.com/GuoleiSun/VSS-CFFM + +
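+ The GTC step above (uniform frame sampling followed by k-means prototypes) is simple enough to sketch directly; feature shapes and the cluster count are illustrative, and the cross-frame mining that consumes the prototypes is omitted:

```python
# Sketch: uniformly sample frames and cluster their spatial tokens into global prototypes.
import torch
from sklearn.cluster import KMeans

def global_prototypes(video_feats, num_frames=8, k=32):
    # video_feats: (T, C, H, W) per-frame feature maps.
    t, c = video_feats.shape[0], video_feats.shape[1]
    idx = torch.linspace(0, t - 1, steps=min(num_frames, t)).long()     # uniform sampling
    tokens = video_feats[idx].permute(0, 2, 3, 1).reshape(-1, c)        # all spatial tokens
    km = KMeans(n_clusters=k, n_init=4).fit(tokens.numpy())
    return torch.from_numpy(km.cluster_centers_).float()                # (k, C) prototypes

protos = global_prototypes(torch.randn(40, 64, 16, 16))   # -> shape (32, 64)
```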
+
+ comment: Accepted to TPAMI, an extended version of a paper published in CVPR + 2022 +
+
+
+
+
+ + ♻ ☆ SGV3D:Towards Scenario Generalization for Vision-based Roadside 3D + Object Detection + + +
+ Roadside perception can greatly increase the safety of autonomous vehicles by +extending their perception ability beyond the visual range and addressing blind +spots. However, current state-of-the-art vision-based roadside detection +methods possess high accuracy on labeled scenes but have inferior performance +on new scenes. This is because roadside cameras remain stationary after +installation and can only collect data from a single scene, resulting in the +algorithm overfitting these roadside backgrounds and camera poses. To address +this issue, in this paper, we propose an innovative Scenario Generalization +Framework for Vision-based Roadside 3D Object Detection, dubbed SGV3D. +Specifically, we employ a Background-suppressed Module (BSM) to mitigate +background overfitting in vision-centric pipelines by attenuating background +features during the 2D to bird's-eye-view projection. Furthermore, by +introducing the Semi-supervised Data Generation Pipeline (SSDG) using unlabeled +images from new scenes, diverse instance foregrounds with varying camera poses +are generated, addressing the risk of overfitting specific camera poses. We +evaluate our method on two large-scale roadside benchmarks. Our method +surpasses all previous methods by a significant margin in new scenes, including ++42.57% for vehicle, +5.87% for pedestrian, and +14.89% for cyclist compared to +BEVHeight on the DAIR-V2X-I heterologous benchmark. On the larger-scale Rope3D +heterologous benchmark, we achieve notable gains of 14.48% for car and 12.41% +for large vehicle. We aspire to contribute insights on the exploration of +roadside perception techniques, emphasizing their capability for scenario +generalization. The code will be available at +https://github.com/yanglei18/SGV3D + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Are We on the Right Way for Evaluating Large Vision-Language Models? + + +
+ Large vision-language models (LVLMs) have recently achieved rapid progress, sparking numerous studies to evaluate their multi-modal capabilities. However, we dig into current evaluation works and identify two primary issues: 1) Visual content is unnecessary for many samples. The answers can be directly inferred from the questions and options, or from the world knowledge embedded in LLMs. This phenomenon is prevalent across current benchmarks. For instance, GeminiPro achieves 42.9% on the MMMU benchmark without any visual input, and outperforms the random-choice baseline across six benchmarks by over 24% on average. 2) Unintentional data leakage exists in LLM and LVLM training. LLMs and LVLMs can still answer some visual-necessary questions without visual content, indicating that these samples were memorized from large-scale training data. For example, Sphinx-X-MoE gets 43.6% on MMMU without accessing images, surpassing its LLM backbone by 17.9%. Both problems lead to misjudgments of actual multi-modal gains and potentially misguide the study of LVLMs. To this end, we present MMStar, an elite vision-indispensable multi-modal benchmark comprising 1,500 samples meticulously selected by humans. MMStar benchmarks 6 core capabilities and 18 detailed axes, aiming to evaluate LVLMs' multi-modal capacities with carefully balanced and purified samples. These samples are first roughly selected from current benchmarks with an automated pipeline; human review is then involved to ensure each curated sample exhibits visual dependency, minimal data leakage, and requires advanced multi-modal capabilities. Moreover, two metrics are developed to measure data leakage and actual performance gain in multi-modal training. We evaluate 16 leading LVLMs on MMStar to assess their multi-modal capabilities, and on 7 benchmarks with the proposed metrics to investigate their data leakage and actual multi-modal gain. + +
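+ The two proposed metrics are described only at a high level above; one plausible formalization, used here purely for illustration, treats multi-modal gain as the score difference with and without images and leakage as how far the image-free score exceeds a blind reference. The with-image number in the example is a made-up placeholder, not a figure from the abstract:

```python
# Hedged reading of the two diagnostics (not necessarily MMStar's exact definitions).
def multimodal_gain(acc_with_image: float, acc_without_image: float) -> float:
    return acc_with_image - acc_without_image

def leakage_indicator(acc_without_image: float, acc_blind_reference: float) -> float:
    return max(0.0, acc_without_image - acc_blind_reference)

# Sphinx-X-MoE on MMMU, using the abstract's 43.6% image-free score, a
# hypothetical 44.8% with-image score, and its LLM backbone near 25.7%.
gain = multimodal_gain(acc_with_image=0.448, acc_without_image=0.436)         # ~0.012
leak = leakage_indicator(acc_without_image=0.436, acc_blind_reference=0.257)  # ~0.179
```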
+
+ comment: Project page: https://mmstar-benchmark.github.io/ +
+
+
+
+
+ + ♻ ☆ CN-RMA: Combined Network with Ray Marching Aggregation for 3D Indoors + Object Detection from Multi-view Images CVPR2024 + + +
+ This paper introduces CN-RMA, a novel approach for 3D indoor object detection +from multi-view images. We observe the key challenge as the ambiguity of image +and 3D correspondence without explicit geometry to provide occlusion +information. To address this issue, CN-RMA leverages the synergy of 3D +reconstruction networks and 3D object detection networks, where the +reconstruction network provides a rough Truncated Signed Distance Function +(TSDF) and guides image features to vote to 3D space correctly in an end-to-end +manner. Specifically, we associate weights to sampled points of each ray +through ray marching, representing the contribution of a pixel in an image to +corresponding 3D locations. Such weights are determined by the predicted signed +distances so that image features vote only to regions near the reconstructed +surface. Our method achieves state-of-the-art performance in 3D object +detection from multi-view images, as measured by mAP@0.25 and mAP@0.5 on the +ScanNet and ARKitScenes datasets. The code and models are released at +https://github.com/SerCharles/CN-RMA. + +
+
+ comment: CVPR2024 poster paper, 8 pages of main part, and 4 pages of + supplementary material +
+
+
+
+
+ + ♻ ☆ MetaMix: Meta-state Precision Searcher for Mixed-precision Activation + Quantization AAAI + + +
+ Mixed-precision quantization of efficient networks often suffers from activation instability encountered during the exploration of bit selections. To address this problem, we propose a novel method called MetaMix, which consists of bit selection and weight training phases. The bit selection phase iterates two steps, (1) the mixed-precision-aware weight update, and (2) the bit-search training with the fixed mixed-precision-aware weights, which together reduce activation instability in mixed-precision quantization and contribute to fast and high-quality bit selection. The weight training phase exploits the weights and step sizes trained in the bit selection phase and fine-tunes them, thereby offering fast training. Our experiments with efficient and hard-to-quantize networks, i.e., MobileNet v2 and v3 and ResNet-18, on ImageNet show that our proposed method pushes the boundary of mixed-precision quantization, in terms of accuracy vs. operations, by outperforming both mixed- and single-precision SOTA methods. + +
+
+ comment: Proc. The 38th Annual AAAI Conference on Artificial Intelligence + (AAAI) +
+
+
+
+
+ + ♻ ☆ UltraLight VM-UNet: Parallel Vision Mamba Significantly Reduces + Parameters for Skin Lesion Segmentation + + +
+ To improve segmentation performance, most approaches traditionally add more complex modules. This is not suitable for the medical field, especially for mobile medical devices, where computationally heavy models are unsuitable for real clinical environments due to resource constraints. Recently, state-space models (SSMs), represented by Mamba, have become a strong competitor to traditional CNNs and Transformers. In this paper, we deeply explore the key elements of parameter influence in Mamba and propose an UltraLight Vision Mamba UNet (UltraLight VM-UNet) based on them. Specifically, we propose a method for processing features in parallel in Vision Mamba, named the PVM Layer, which achieves excellent performance with the lowest computational load while keeping the overall number of processing channels constant. We conducted comparison and ablation experiments against several state-of-the-art lightweight models on three public skin lesion datasets and demonstrated that UltraLight VM-UNet remains highly competitive with only 0.049M parameters and 0.060 GFLOPs. In addition, this study deeply explores the key elements of parameter influence in Mamba, laying a theoretical foundation for Mamba to possibly become a new mainstream module for lightweight models in the future. The code is available from https://github.com/wurenkai/UltraLight-VM-UNet . + +
+
+
+
+
+ + ♻ ☆ Cross-Silo Federated Learning Across Divergent Domains with Iterative + Parameter Alignment + + +
+ Learning from the collective knowledge of data dispersed across private +sources can provide neural networks with enhanced generalization capabilities. +Federated learning, a method for collaboratively training a machine learning +model across remote clients, achieves this by combining client models via the +orchestration of a central server. However, current approaches face two +critical limitations: i) they struggle to converge when client domains are +sufficiently different, and ii) current aggregation techniques produce an +identical global model for each client. In this work, we address these issues +by reformulating the typical federated learning setup: rather than learning a +single global model, we learn N models each optimized for a common objective. +To achieve this, we apply a weighted distance minimization to model parameters +shared in a peer-to-peer topology. The resulting framework, Iterative Parameter +Alignment, applies naturally to the cross-silo setting, and has the following +properties: (i) a unique solution for each participant, with the option to +globally converge each model in the federation, and (ii) an optional +early-stopping mechanism to elicit fairness among peers in collaborative +learning settings. These characteristics jointly provide a flexible new +framework for iteratively learning from peer models trained on disparate +datasets. We find that the technique achieves competitive results on a variety +of data partitions compared to state-of-the-art approaches. Further, we show +that the method is robust to divergent domains (i.e. disjoint classes across +peers) where existing approaches struggle. + +
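+ The weighted distance minimization above amounts to adding a penalty that pulls each peer's parameters toward those of the other peers; the sketch below shows one such penalty with fixed peer weights, while the actual weighting scheme, schedule, and early-stopping rule of Iterative Parameter Alignment are not reproduced:

```python
# Hedged sketch of a weighted parameter-alignment penalty for one peer.
import torch
import torch.nn as nn

def alignment_penalty(own_params, peer_param_lists, peer_weights):
    penalty = torch.zeros(())
    for peer_params, w in zip(peer_param_lists, peer_weights):
        for p_own, p_peer in zip(own_params, peer_params):
            penalty = penalty + w * torch.sum((p_own - p_peer.detach()) ** 2)
    return penalty

model = nn.Linear(4, 2)                       # this peer's model (toy)
peers = [nn.Linear(4, 2), nn.Linear(4, 2)]    # parameters received from other peers
pen = alignment_penalty(list(model.parameters()),
                        [list(p.parameters()) for p in peers], peer_weights=[0.5, 0.5])
# During local training: loss = task_loss + lam * pen
```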
+
+ comment: Published at IEEE Big Data 2023 +
+
+
+
+
+ + ♻ ☆ Coarse-to-Fine Latent Diffusion for Pose-Guided Person Image Synthesis CVPR 2024 + + +
+ Diffusion model is a promising approach to image generation and has been +employed for Pose-Guided Person Image Synthesis (PGPIS) with competitive +performance. While existing methods simply align the person appearance to the +target pose, they are prone to overfitting due to the lack of a high-level +semantic understanding on the source person image. In this paper, we propose a +novel Coarse-to-Fine Latent Diffusion (CFLD) method for PGPIS. In the absence +of image-caption pairs and textual prompts, we develop a novel training +paradigm purely based on images to control the generation process of a +pre-trained text-to-image diffusion model. A perception-refined decoder is +designed to progressively refine a set of learnable queries and extract +semantic understanding of person images as a coarse-grained prompt. This allows +for the decoupling of fine-grained appearance and pose information controls at +different stages, and thus circumventing the potential overfitting problem. To +generate more realistic texture details, a hybrid-granularity attention module +is proposed to encode multi-scale fine-grained appearance features as bias +terms to augment the coarse-grained prompt. Both quantitative and qualitative +experimental results on the DeepFashion benchmark demonstrate the superiority +of our method over the state of the arts for PGPIS. Code is available at +https://github.com/YanzuoLu/CFLD. + +
+
+ comment: Accepted by CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ One-Step Late Fusion Multi-view Clustering with Compressed Subspace ICASSP2024 + + +
+ Late fusion multi-view clustering (LFMVC) has become a rapidly growing class +of methods in the multi-view clustering (MVC) field, owing to its excellent +computational speed and clustering performance. One bottleneck faced by +existing late fusion methods is that they are usually aligned to the average +kernel function, which makes the clustering performance highly dependent on the +quality of datasets. Another problem is that they require subsequent k-means +clustering after obtaining the consensus partition matrix to get the final +discrete labels, and the resulting separation of the label learning and cluster +structure optimization processes limits the integrity of these models. To +address the above issues, we propose an integrated framework named One-Step +Late Fusion Multi-view Clustering with Compressed Subspace (OS-LFMVC-CS). +Specifically, we use the consensus subspace to align the partition matrix while +optimizing the partition fusion, and utilize the fused partition matrix to +guide the learning of discrete labels. A six-step iterative optimization +approach with verified convergence is proposed. Sufficient experiments on +multiple datasets validate the effectiveness and efficiency of our proposed +method. + +
+
+ comment: Accepted by ICASSP2024 +
+
+
+
+
+ + ♻ ☆ Deepfake Generation and Detection: A Benchmark and Survey + + +
+ In addition to the advancements in deepfake generation, corresponding detection technologies need to continuously evolve to regulate the potential misuse of deepfakes, such as for privacy invasion and phishing attacks. This survey comprehensively reviews the latest developments in deepfake generation and detection, summarizing and analyzing the current state of the art in this rapidly evolving field. We first unify task definitions, comprehensively introduce datasets and metrics, and discuss the development of generation and detection technology frameworks. Then, we discuss the development of several related sub-fields and focus on four mainstream deepfake fields: popular face swap, face reenactment, talking face generation, and facial attribute editing, as well as forgery detection. Subsequently, we comprehensively benchmark representative methods on popular datasets for each field, fully evaluating the latest and most influential works published in top conferences/journals. Finally, we analyze the challenges and future research directions of the discussed fields. We closely follow the latest developments at https://github.com/flyingby/Awesome-Deepfake-Generation-and-Detection. + +
+
+
+
+
+ + ♻ ☆ MultIOD: Rehearsal-free Multihead Incremental Object Detector CVPR 2024 + + +
+ Class-Incremental learning (CIL) refers to the ability of artificial agents +to integrate new classes as they appear in a stream. It is particularly +interesting in evolving environments where agents have limited access to memory +and computational resources. The main challenge of incremental learning is +catastrophic forgetting, the inability of neural networks to retain past +knowledge when learning a new one. Unfortunately, most existing +class-incremental methods for object detection are applied to two-stage +algorithms such as Faster-RCNN, and rely on rehearsal memory to retain past +knowledge. We argue that those are not suitable in resource-limited +environments, and more effort should be dedicated to anchor-free and +rehearsal-free object detection. In this paper, we propose MultIOD, a +class-incremental object detector based on CenterNet. Our contributions are: +(1) we propose a multihead feature pyramid and multihead detection architecture +to efficiently separate class representations, (2) we employ transfer learning +between classes learned initially and those learned incrementally to tackle +catastrophic forgetting, and (3) we use a class-wise non-max-suppression as a +post-processing technique to remove redundant boxes. Results show that our +method outperforms state-of-the-art methods on two Pascal VOC datasets, while +only saving the model in its current state, contrary to other +distillation-based counterparts. + +
+
+ comment: Accepted at the archival track of the Workshop on Continual Learning + in Computer Vision (CVPR 2024) +
+
+
+
+
+ + ♻ ☆ BlockFusion: Expandable 3D Scene Generation using Latent Tri-plane + Extrapolation + + +
+ We present BlockFusion, a diffusion-based model that generates 3D scenes as +unit blocks and seamlessly incorporates new blocks to extend the scene. +BlockFusion is trained using datasets of 3D blocks that are randomly cropped +from complete 3D scene meshes. Through per-block fitting, all training blocks +are converted into the hybrid neural fields: with a tri-plane containing the +geometry features, followed by a Multi-layer Perceptron (MLP) for decoding the +signed distance values. A variational auto-encoder is employed to compress the +tri-planes into the latent tri-plane space, on which the denoising diffusion +process is performed. Diffusion applied to the latent representations allows +for high-quality and diverse 3D scene generation. To expand a scene during +generation, one needs only to append empty blocks to overlap with the current +scene and extrapolate existing latent tri-planes to populate new blocks. The +extrapolation is done by conditioning the generation process with the feature +samples from the overlapping tri-planes during the denoising iterations. Latent +tri-plane extrapolation produces semantically and geometrically meaningful +transitions that harmoniously blend with the existing scene. A 2D layout +conditioning mechanism is used to control the placement and arrangement of +scene elements. Experimental results indicate that BlockFusion is capable of +generating diverse, geometrically consistent and unbounded large 3D scenes with +unprecedented high-quality shapes in both indoor and outdoor scenarios. + +
+
+ comment: Video: https://www.youtube.com/watch?v=PxIBtd6G0mA +
+
+
+
+
+ + ♻ ☆ Learning Zero-Shot Material States Segmentation, by Implanting Natural + Image Patterns in Synthetic Data + + +
+ Visual understanding and segmentation of materials and their states is fundamental to understanding the physical world. The myriad textures, shapes, and often blurry boundaries formed by materials make this task particularly hard to generalize. Whether it's identifying wet regions of a surface, minerals in rocks, infected regions in plants, or pollution in water, each material state has its own unique form. For neural nets to learn general class-agnostic material segmentation, it is necessary to first collect and annotate data that captures this complexity. Collecting and manually annotating real-world images is limited by the cost and precision of manual labor. In contrast, synthetic CGI data is highly accurate and almost cost-free, but fails to replicate the vast diversity of the material world. This work offers a method to bridge this crucial gap by implanting patterns extracted from real-world images in synthetic data. Hence, patterns automatically collected from natural images are used to map materials into synthetic scenes. This unsupervised approach allows the generated data to capture the vast complexity of the real world while maintaining the precision and scale of synthetic data. We also present the first general benchmark for zero-shot material state segmentation. The benchmark contains a wide range of real-world images of material states, like food, rocks, construction, plants, liquids, and many others, each in various states (wet/dry/stained/cooked/burned/worn/rusted/sediment/foam, etc.). The annotation includes both partial similarity between regions with similar but not identical materials, and hard segmentation of only points in the exact same material state. We show that networks trained on MatSeg significantly outperform existing state-of-the-art methods on this task. The dataset, code, and trained model are available. + +
+
+
+
+
+ + ♻ ☆ Improved Probabilistic Image-Text Representations ICLR 2024 + + +
+ Image-Text Matching (ITM) task, a fundamental vision-language (VL) task, +suffers from the inherent ambiguity arising from multiplicity and imperfect +annotations. Deterministic functions are not sufficiently powerful to capture +ambiguity, prompting the exploration of probabilistic embeddings to tackle the +challenge. However, the existing probabilistic ITM approach encounters two key +shortcomings; the burden of heavy computations due to the Monte Carlo +approximation, and the loss saturation issue in the face of abundant false +negatives. To overcome the issues, this paper presents an improved +Probabilistic Cross-Modal Embeddings (named PCME++) by introducing a new +probabilistic distance with a closed-form solution. In addition, two +optimization techniques are proposed to enhance PCME++ further: first, the +incorporation of pseudo-positives to prevent the negative effect under massive +false negatives; second, mixed sample data augmentation for probabilistic +matching. Experimental results on MS-COCO Caption and two extended benchmarks, +CxC and ECCV Caption, demonstrate the effectiveness of PCME++ compared to +state-of-the-art ITM methods. The robustness of PCME++ is also evaluated under +noisy image-text correspondences. In addition, the potential applicability of +PCME++ in automatic prompt-filtering for zero-shot classification is shown. The +code is available at https://github.com/naver-ai/pcmepp + +
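+ The appeal of a closed-form probabilistic distance is easiest to see for diagonal Gaussian embeddings, where the expected squared distance between samples is available analytically and no Monte Carlo approximation is needed. The identity below is generic and is shown for intuition; it is not necessarily PCME++'s exact matching function:

```python
# E||z1 - z2||^2 for z_i ~ N(mu_i, diag(var_i)) has a closed form.
import torch

def closed_form_sq_dist(mu1, var1, mu2, var2):
    return ((mu1 - mu2) ** 2).sum(-1) + var1.sum(-1) + var2.sum(-1)

mu_img, var_img = torch.randn(128), torch.rand(128)
mu_txt, var_txt = torch.randn(128), torch.rand(128)

analytic = closed_form_sq_dist(mu_img, var_img, mu_txt, var_txt)
z_img = mu_img + var_img.sqrt() * torch.randn(10000, 128)   # Monte Carlo check
z_txt = mu_txt + var_txt.sqrt() * torch.randn(10000, 128)
mc = ((z_img - z_txt) ** 2).sum(-1).mean()                  # close to `analytic`
```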
+
+ comment: ICLR 2024 camera-ready; Code: https://github.com/naver-ai/pcmepp. + Project page: https://naver-ai.github.io/pcmepp/. 30 pages, 2.2 MB +
+
+
+
+
+ + ♻ ☆ Industrial Application of 6D Pose Estimation for Robotic Manipulation in + Automotive Internal Logistics + + +
+ Despite the advances in robotics, a large proportion of parts-handling tasks in the automotive industry's internal logistics are not automated but still performed by humans. A key component to competitively automate these processes is a 6D pose estimation that can handle a large number of different parts, is adaptable to new parts with little manual effort, and is sufficiently accurate and robust with respect to industry requirements. In this context, the question arises as to the current status quo with respect to these measures. To address this, we built a representative 6D pose estimation pipeline with state-of-the-art components, from economically scalable real-to-synthetic data generation to pose estimators, and evaluated it on automotive parts with regard to a realistic sequencing process. We found that with these data generation approaches, the performance of the trained 6D pose estimators is promising, but does not meet industry requirements. We reveal that the reason for this is the estimators' inability to provide reliable uncertainties for their poses, rather than their inability to provide sufficiently accurate poses. In this context, we further analyze how RGB- and RGB-D-based approaches compare against this background and show that they are differently vulnerable to the domain gap induced by synthetic data. + +
+
+ comment: Accepted for publication at IEEE International Conference on + Automation Science and Engineering (CASE 2023) +
+
+
+
+
+ + ♻ ☆ Self-training via Metric Learning for Source-Free Domain Adaptation of + Semantic Segmentation + + +
+ Unsupervised source-free domain adaptation methods aim to train a model for +the target domain utilizing a pretrained source-domain model and unlabeled +target-domain data, particularly when accessibility to source data is +restricted due to intellectual property or privacy concerns. Traditional +methods usually use self-training with pseudo-labeling, which is often +subjected to thresholding based on prediction confidence. However, such +thresholding limits the effectiveness of self-training due to insufficient +supervision. This issue becomes more severe in a source-free setting, where +supervision comes solely from the predictions of the pre-trained source model. +In this study, we propose a novel approach by incorporating a mean-teacher +model, wherein the student network is trained using all predictions from the +teacher network. Instead of employing thresholding on predictions, we introduce +a method to weight the gradients calculated from pseudo-labels based on the +reliability of the teacher's predictions. To assess reliability, we introduce a +novel approach using proxy-based metric learning. Our method is evaluated in +synthetic-to-real and cross-city scenarios, demonstrating superior performance +compared to existing state-of-the-art methods. + +
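+ The gradient-weighting idea above replaces hard confidence thresholds with a soft, per-pixel reliability weight on the pseudo-label loss. For brevity the sketch derives the weight from teacher softmax confidence; the paper instead obtains reliability from proxy-based metric learning:

```python
# Reliability-weighted pseudo-label loss (weight source simplified to teacher confidence).
import torch
import torch.nn.functional as F

def weighted_pseudo_label_loss(student_logits, teacher_logits):
    # logits: (B, num_classes, H, W)
    teacher_prob = teacher_logits.softmax(dim=1)
    reliability, pseudo_label = teacher_prob.max(dim=1)               # per-pixel weight & label
    per_pixel_ce = F.cross_entropy(student_logits, pseudo_label, reduction="none")
    return (reliability.detach() * per_pixel_ce).mean()               # weight, don't threshold

loss = weighted_pseudo_label_loss(torch.randn(2, 19, 32, 32), torch.randn(2, 19, 32, 32))
```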
+
+ comment: This paper is under consideration at Computer Vision and Image + Understanding +
+
+
+
+
+ + ♻ ☆ Fine-grained Action Analysis: A Multi-modality and Multi-task Dataset of + Figure Skating + + +
+ The fine-grained action analysis of existing action datasets is challenged by insufficient action categories, low fine granularity, and limited modalities and tasks. In this paper, we propose a Multi-modality and Multi-task dataset of Figure Skating (MMFS), collected from the World Figure Skating Championships. MMFS, which supports action recognition and action quality assessment, captures RGB and skeleton modalities and collects action scores for 11671 clips across 256 categories, including spatial and temporal labels. The key contributions of our dataset fall into three aspects. (1) Independent spatial and temporal categories are proposed for the first time to further explore fine-grained action recognition and quality assessment. (2) MMFS first introduces the skeleton modality for complex fine-grained action quality assessment. (3) Our multi-modality and multi-task dataset encourages more action analysis models. To benchmark our dataset, we adopt RGB-based and skeleton-based baseline methods for action recognition and action quality assessment. + +
+
+
+
+
+ + ♻ ☆ Co-Occ: Coupling Explicit Feature Fusion with Volume Rendering + Regularization for Multi-Modal 3D Semantic Occupancy Prediction + + +
+ 3D semantic occupancy prediction is a pivotal task in the field of autonomous +driving. Recent approaches have made great advances in 3D semantic occupancy +predictions on a single modality. However, multi-modal semantic occupancy +prediction approaches have encountered difficulties in dealing with the +modality heterogeneity, modality misalignment, and insufficient modality +interactions that arise during the fusion of different modalities data, which +may result in the loss of important geometric and semantic information. This +letter presents a novel multi-modal, i.e., LiDAR-camera 3D semantic occupancy +prediction framework, dubbed Co-Occ, which couples explicit LiDAR-camera +feature fusion with implicit volume rendering regularization. The key insight +is that volume rendering in the feature space can proficiently bridge the gap +between 3D LiDAR sweeps and 2D images while serving as a physical +regularization to enhance LiDAR-camera fused volumetric representation. +Specifically, we first propose a Geometric- and Semantic-aware Fusion +(GSFusion) module to explicitly enhance LiDAR features by incorporating +neighboring camera features through a K-nearest neighbors (KNN) search. Then, +we employ volume rendering to project the fused feature back to the image +planes for reconstructing color and depth maps. These maps are then supervised +by input images from the camera and depth estimations derived from LiDAR, +respectively. Extensive experiments on the popular nuScenes and SemanticKITTI +benchmarks verify the effectiveness of our Co-Occ for 3D semantic occupancy +prediction. The project page is available at +https://rorisis.github.io/Co-Occ_project-page/. + +
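+ The KNN-based enhancement in GSFusion can be pictured as each LiDAR feature pulling in an aggregate of its K nearest camera features in a shared coordinate space; the projection, normalization, and exact aggregation are omitted, and the additive fusion below is an illustrative simplification:

```python
# Hedged sketch of KNN-based LiDAR-camera feature enhancement.
import torch

def knn_enhance(lidar_xyz, lidar_feat, cam_xyz, cam_feat, k=4):
    # lidar_xyz: (N, 3), cam_xyz: (M, 3); features: (N, C), (M, C)
    dists = torch.cdist(lidar_xyz, cam_xyz)           # (N, M) pairwise distances
    knn_idx = dists.topk(k, largest=False).indices    # (N, k) nearest camera samples
    neighbours = cam_feat[knn_idx]                     # (N, k, C)
    return lidar_feat + neighbours.mean(dim=1)         # simple additive enhancement

lidar_xyz, lidar_feat = torch.randn(1000, 3), torch.randn(1000, 64)
cam_xyz, cam_feat = torch.randn(4000, 3), torch.randn(4000, 64)
fused = knn_enhance(lidar_xyz, lidar_feat, cam_xyz, cam_feat)   # (1000, 64)
```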
+
+
+
+
+ + ♻ ☆ Anchor-based Multi-view Subspace Clustering with Hierarchical Feature + Descent + + +
+ Multi-view clustering has attracted growing attention owing to its capability of aggregating information from various sources and its promising horizons in public affairs. Up till now, many advanced approaches have been proposed in the recent literature. However, several ongoing difficulties remain to be tackled. One common dilemma occurs while attempting to align the features of different views. Moreover, because many existing multi-view clustering algorithms stem from spectral clustering, they incur cubic time complexity w.r.t. the size of the dataset. We propose Anchor-based Multi-view Subspace Clustering with Hierarchical Feature Descent (MVSC-HFD) to tackle the discrepancy among views through hierarchical feature descent and projection to a common subspace (Stage 1), which reveals the dependency of different views. We further reduce the computational complexity to linear time cost through a unified sampling strategy in the common subspace (Stage 2), followed by anchor-based subspace clustering to learn the bipartite graph collectively (Stage 3). Extensive experimental results on public benchmark datasets demonstrate that our proposed model consistently outperforms state-of-the-art techniques. + +
+
+
+
+
+ + ♻ ☆ Simple Semantic-Aided Few-Shot Learning CVPR 2024 + + +
+ Learning from a limited amount of data, namely Few-Shot Learning, stands out +as a challenging computer vision task. Several works exploit semantics and +design complicated semantic fusion mechanisms to compensate for rare +representative features within restricted data. However, relying on naive +semantics such as class names introduces biases due to their brevity, while +acquiring extensive semantics from external knowledge requires substantial time and +effort. This limitation severely constrains the potential of semantics in +Few-Shot Learning. In this paper, we design an automatic way called Semantic +Evolution to generate high-quality semantics. The incorporation of high-quality +semantics alleviates the need for complex network structures and learning +algorithms used in previous works. Hence, we employ a simple two-layer network +termed Semantic Alignment Network to transform semantics and visual features +into robust class prototypes with rich discriminative features for few-shot +classification. The experimental results show our framework outperforms all +previous methods on six benchmarks, demonstrating that a simple network with +high-quality semantics can beat intricate multi-modal modules on few-shot +classification tasks. Code is available at +https://github.com/zhangdoudou123/SemFew. +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Deep Multi-Threshold Spiking-UNet for Image Processing + + +
+ U-Net, known for its simple yet efficient architecture, is widely utilized +for image processing tasks and is particularly suitable for deployment on +neuromorphic chips. This paper introduces the novel concept of Spiking-UNet for +image processing, which combines the power of Spiking Neural Networks (SNNs) +with the U-Net architecture. To achieve an efficient Spiking-UNet, we face two +primary challenges: ensuring high-fidelity information propagation through the +network via spikes and formulating an effective training strategy. To address +the issue of information loss, we introduce multi-threshold spiking neurons, +which improve the efficiency of information transmission within the +Spiking-UNet. For the training strategy, we adopt a conversion and fine-tuning +pipeline that leverages pre-trained U-Net models. During the conversion process, +significant variability in data distribution across different parts is observed +when utilizing skip connections. Therefore, we propose a connection-wise +normalization method to prevent inaccurate firing rates. Furthermore, we adopt +a flow-based training method to fine-tune the converted models, reducing time +steps while preserving performance. Experimental results show that, on image +segmentation and denoising, our Spiking-UNet achieves comparable performance to +its non-spiking counterpart, surpassing existing SNN methods. Compared with the +converted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference +time by approximately 90%. This research broadens the application scope of +SNNs in image processing and is expected to inspire further exploration in the +field of neuromorphic engineering. The code for our Spiking-UNet implementation +is available at https://github.com/SNNresearch/Spiking-UNet. +
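+ As a rough illustration of the multi-threshold idea (a sketch under assumptions, not the paper's exact neuron model): a neuron emits a graded spike equal to the largest threshold its membrane potential exceeds, carrying more information per time step than a binary spike.
+import torch
+
+def multi_threshold_spike(membrane, thresholds=(1.0, 2.0, 4.0)):
+    # membrane: accumulated membrane potential for this time step (any shape).
+    spikes = torch.zeros_like(membrane)
+    for th in sorted(thresholds):                        # ascending, so larger levels overwrite smaller ones
+        spikes = torch.where(membrane >= th, torch.full_like(membrane, th), spikes)
+    residual = membrane - spikes                         # soft reset: carry the remainder forward
+    return spikes, residual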
+
+ comment: Accepted in NeuroComputing +
+
+
+
+
+ + ♻ ☆ PASTA: Towards Flexible and Efficient HDR Imaging Via Progressively + Aggregated Spatio-Temporal Alignment + + +
+ Leveraging Transformer attention has led to great advancements in HDR +deghosting. However, the intricate nature of self-attention introduces +practical challenges, as existing state-of-the-art methods often demand +high-end GPUs or exhibit slow inference speeds, especially for high-resolution +images like 2K. Striking an optimal balance between performance and latency +remains a critical concern. In response, this work presents PASTA, a novel +Progressively Aggregated Spatio-Temporal Alignment framework for HDR +deghosting. Our approach achieves effectiveness and efficiency by harnessing +hierarchical representation during feature disentanglement. Through the +utilization of diverse granularities within the hierarchical structure, our +method substantially boosts computational speed and optimizes the HDR imaging +workflow. In addition, we explore within-scale feature modeling with local and +global attention, gradually merging and refining them in a coarse-to-fine +fashion. Experimental results showcase PASTA's superiority over current SOTA +methods in both visual quality and performance metrics, accompanied by a +substantial 3-fold (x3) increase in inference speed. +
+
+
+
+
+ + ♻ ☆ PAT: Pixel-wise Adaptive Training for Long-tailed Segmentation + + +
+ Beyond class frequency, we recognize the impact of class-wise relationships +among various class-specific predictions and the imbalance in label masks on +long-tailed segmentation learning. To address these challenges, we propose an +innovative Pixel-wise Adaptive Training (PAT) technique tailored for +long-tailed segmentation. PAT has two key features: 1) class-wise gradient +magnitude homogenization, and 2) pixel-wise class-specific loss adaptation +(PCLA). First, the class-wise gradient magnitude homogenization helps alleviate +the imbalance among label masks by ensuring equal consideration of the +class-wise impact on model updates. Second, PCLA tackles the detrimental impact +of both rare classes within the long-tailed distribution and inaccurate +predictions from previous training stages by encouraging learning classes with +low prediction confidence and guarding against forgetting classes with high +confidence. This combined approach fosters robust learning while preventing the +model from forgetting previously learned knowledge. PAT exhibits significant +performance improvements, surpassing the current state-of-the-art by 2.2% on +the NYU dataset. Moreover, it enhances overall pixel-wise accuracy by 2.85% and +intersection over union value by 2.07%, with a particularly notable decline +of 0.39% in detecting rare classes compared to Balance Logits Variation, as +demonstrated on the three popular datasets, i.e., OxfordPetIII, CityScape, and +NYU. +
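+ A minimal sketch of the pixel-wise, class-specific loss adaptation idea (a simplified reading, not the released implementation): each pixel's cross-entropy is re-weighted by how unconfident the model is about its ground-truth class, so rare or poorly learned classes are emphasized.
+import torch
+import torch.nn.functional as F
+
+def pixel_adaptive_ce(logits, target, gamma=1.0):
+    # logits: (B, C, H, W); target: (B, H, W) integer class map.
+    probs = logits.softmax(dim=1)
+    conf = probs.gather(1, target.unsqueeze(1)).squeeze(1)   # (B, H, W) confidence of the true class
+    weight = (1.0 - conf).pow(gamma)                         # low confidence -> larger weight
+    ce = F.cross_entropy(logits, target, reduction="none")   # (B, H, W) per-pixel loss
+    return (weight.detach() * ce).mean()
+ This reduces to a focal-style weighting; the full PCLA additionally guards high-confidence classes against forgetting and is paired with class-wise gradient homogenization.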
+
+
+
+
+ + ♻ ☆ Anomaly Score: Evaluating Generative Models and Individual Generated + Images based on Complexity and Vulnerability CVPR 2024 + + +
+ With the advancement of generative models, the assessment of generated images +becomes more and more important. Previous methods measure distances between +features of reference and generated images from trained vision models. In this +paper, we conduct an extensive investigation into the relationship between the +representation space and input space around generated images. We first propose +two measures related to the presence of unnatural elements within images: +complexity, which indicates how non-linear the representation space is, and +vulnerability, which is related to how easily the extracted feature changes under +adversarial input perturbations. Based on these, we introduce a new metric for +evaluating image-generative models, called anomaly score (AS). Moreover, we +propose AS-i (anomaly score for individual images) that can effectively +evaluate generated images individually. Experimental results demonstrate the +validity of the proposed approach. +
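+ The vulnerability measure can be pictured with a small sketch (illustrative only; the exact perturbation and normalization in the paper may differ): perturb the input adversarially by a tiny step and record how far the extracted feature moves.
+import torch
+
+def vulnerability(feature_extractor, x, eps=1e-2):
+    # x: (B, ...) input images; feature_extractor: any differentiable torch module.
+    x = x.clone().requires_grad_(True)
+    feat = feature_extractor(x)
+    grad = torch.autograd.grad(feat.norm(), x)[0]      # direction that most changes the feature
+    x_adv = x + eps * grad.sign()
+    with torch.no_grad():
+        feat_adv = feature_extractor(x_adv)
+    return (feat_adv - feat.detach()).norm(dim=-1)     # larger shift suggests a more "unnatural" input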
+
+ comment: Accepted in CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Scalable 3D Registration via Truncated Entry-wise Absolute Residuals CVPR 2024 + + +
+ Given an input set of $3$D point pairs, the goal of outlier-robust $3$D +registration is to compute some rotation and translation that align as many +point pairs as possible. This is an important problem in computer vision, for +which many highly accurate approaches have been recently proposed. Despite +their impressive performance, these approaches lack scalability, often +overflowing the $16$GB of memory of a standard laptop to handle roughly +$30,000$ point pairs. In this paper, we propose a $3$D registration approach +that can process more than ten million ($10^7$) point pairs with over $99\%$ +random outliers. Moreover, our method is efficient, entails low memory costs, +and maintains high accuracy at the same time. We call our method TEAR, as it +involves minimizing an outlier-robust loss that computes Truncated Entry-wise +Absolute Residuals. To minimize this loss, we decompose the original +$6$-dimensional problem into two subproblems of dimensions $3$ and $2$, +respectively, solved in succession to global optimality via a customized +branch-and-bound method. While branch-and-bound is often slow and unscalable, +this does not apply to TEAR as we propose novel bounding functions that are +tight and computationally efficient. Experiments on various datasets are +conducted to validate the scalability and efficiency of our method. + +
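+ The objective's name already describes the loss; here is a hedged NumPy sketch (the exact truncation and aggregation used in the paper may differ) for a candidate rotation R and translation t over matched pairs (p_i, q_i).
+import numpy as np
+
+def tear_loss(R, t, P, Q, c=0.1):
+    # R: (3, 3) rotation, t: (3,) translation, P, Q: (N, 3) matched point pairs, c: truncation level.
+    residuals = np.abs(P @ R.T + t - Q)        # entry-wise absolute residuals, shape (N, 3)
+    return np.minimum(residuals, c).sum()      # truncation caps the influence of outliers
+ The paper minimizes this kind of loss to global optimality with a customized branch-and-bound scheme rather than by gradient descent.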
+
+ comment: 24 pages, 12 figures. Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ CoBra: Complementary Branch Fusing Class and Semantic Knowledge for + Robust Weakly Supervised Semantic Segmentation + + +
+ Leveraging semantically precise pseudo masks derived from image-level class +knowledge for segmentation, namely image-level Weakly Supervised Semantic +Segmentation (WSSS), remains challenging. While Class Activation Maps +(CAMs) using CNNs have steadily been contributing to the success of WSSS, the +resulting activation maps often narrowly focus on class-specific parts (e.g., +only the face of a person). On the other hand, recent works based on vision +transformers (ViT) have shown promising results based on their self-attention +mechanism to capture the semantic parts but fail to capture complete +class-specific details (e.g., the entire body of a person, but also a nearby +dog). In this work, we propose Complementary Branch (CoBra), a novel dual +branch framework consisting of two distinct architectures which provide +valuable complementary knowledge of class (from CNN) and semantic (from ViT) to +each branch. In particular, we learn Class-Aware Projection (CAP) for the CNN +branch and Semantic-Aware Projection (SAP) for the ViT branch to explicitly +fuse their complementary knowledge and facilitate a new type of extra +patch-level supervision. Our model, through CoBra, fuses CNN and ViT's +complementary outputs to create robust pseudo masks that integrate both class +and semantic information effectively. Extensive experiments qualitatively and +quantitatively investigate how CNN and ViT complement each other on the PASCAL +VOC 2012 dataset, showing a state-of-the-art WSSS result. This includes not +only the masks generated by our model, but also the segmentation results +derived from utilizing these masks as pseudo labels. +
+
+
+
+
+ + ♻ ☆ BIVDiff: A Training-Free Framework for General-Purpose Video Synthesis + via Bridging Image and Video Diffusion Models CVPR 2024 + + +
+ Diffusion models have made tremendous progress in text-driven image and video +generation. Now text-to-image foundation models are widely applied to various +downstream image synthesis tasks, such as controllable image generation and +image editing, while downstream video synthesis tasks are less explored for +several reasons. First, it requires huge memory and computation overhead to +train a video generation foundation model. Even with video foundation models, +additional costly training is still required for downstream video synthesis +tasks. Second, although some works extend image diffusion models into videos in +a training-free manner, temporal consistency cannot be well preserved. Finally, +these adaptation methods are specifically designed for one task and fail to +generalize to different tasks. To mitigate these issues, we propose a +training-free general-purpose video synthesis framework, coined as +BIVDiff, via bridging specific image diffusion models and general +text-to-video foundation diffusion models. Specifically, we first use a +specific image diffusion model (e.g., ControlNet and Instruct Pix2Pix) for +frame-wise video generation, then perform Mixed Inversion on the generated +video, and finally input the inverted latents into the video diffusion models +(e.g., VidRD and ZeroScope) for temporal smoothing. This decoupled framework +enables flexible image model selection for different purposes with strong task +generalization and high efficiency. To validate the effectiveness and general +use of BIVDiff, we perform a wide range of video synthesis tasks, including +controllable video generation, video editing, video inpainting, and +outpainting. +
+
+ comment: Accepted by CVPR 2024. Project page: https://bivdiff.github.io; + GitHub repository: https://github.com/MCG-NJU/BIVDiff +
+
+
+
+
+ + ♻ ☆ Empowering Image Recovery: A Multi-Attention Approach + + +
+ We propose Diverse Restormer (DART), a novel image restoration method that +effectively integrates information from various sources (long sequences, local +and global regions, feature dimensions, and positional dimensions) to address +restoration challenges. While Transformer models have demonstrated excellent +performance in image restoration due to their self-attention mechanism, they +face limitations in complex scenarios. Leveraging recent advancements in +Transformers and various attention mechanisms, our method utilizes customized +attention mechanisms to enhance overall performance. DART, our novel network +architecture, employs windowed attention to mimic the selective focusing +mechanism of human eyes. By dynamically adjusting receptive fields, it +optimally captures the fundamental features crucial for image resolution +reconstruction. Efficiency and performance balance are achieved through the +LongIR attention mechanism for long sequence image restoration. Integration of +attention mechanisms across feature and positional dimensions further enhances +the recovery of fine details. Evaluation across five restoration tasks +consistently positions DART at the forefront. Upon acceptance, we commit to +providing publicly accessible code and models to ensure reproducibility and +facilitate further research. + +
+
+ comment: 12 pages, 10 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ TriSAM: Tri-Plane SAM for zero-shot cortical blood vessel segmentation + in VEM images + + +
+ While imaging techniques at macro and mesoscales have garnered substantial +attention and resources, microscale VEM imaging, capable of revealing intricate +vascular details, has lacked the necessary benchmarking infrastructure. In this +paper, we address a significant gap in the field of neuroimaging by introducing +the largest-to-date public benchmark, \textbf{BvEM}, designed specifically for +cortical blood vessel segmentation in volume electron microscopy (VEM) images. +Our BvEM benchmark is based on VEM image volumes from three mammal species: +adult mouse, macaque, and human. We standardized the resolution, addressed +imaging variations, and meticulously annotated blood vessels through +semi-automatic, manual, and quality control processes, ensuring high-quality 3D +segmentation. Furthermore, we developed a zero-shot cortical blood vessel +segmentation method named TriSAM, which leverages the powerful segmentation +model SAM for 3D segmentation. To extend SAM from 2D to 3D volume segmentation, +TriSAM employs a multi-seed tracking framework, leveraging the reliability of +certain image planes for tracking while using others to identify potential +turning points. This approach effectively achieves long-term 3D blood vessel +segmentation without model training or fine-tuning. Experimental results show +that TriSAM achieved superior performances on the BvEM benchmark across three +species. + +
+
+ comment: BvEM-Mouse can be visualized at: https://tinyurl.com/yc2s38x9 +
+
+
+
+
+ + ♻ ☆ GeRM: A Generalist Robotic Model with Mixture-of-experts for Quadruped + Robot + + +
+ Multi-task robot learning holds significant importance in tackling diverse +and complex scenarios. However, current approaches are hindered by performance +issues and difficulties in collecting training datasets. In this paper, we +propose GeRM (Generalist Robotic Model). We utilize offline reinforcement +learning to optimize data utilization strategies to learn from both +demonstrations and sub-optimal data, thus surpassing the limitations of human +demonstrations. Thereafter, we employ a transformer-based VLA network to +process multi-modal inputs and output actions. By introducing the +Mixture-of-Experts structure, GeRM allows faster inference speed with higher +overall model capacity, and thus resolves the issue of limited RL parameters, +enhancing model performance in multi-task learning while controlling +computational costs. Through a series of experiments, we demonstrate that GeRM +outperforms other methods across all tasks, while also validating its +efficiency in both training and inference processes. Additionally, we uncover +its potential to acquire emergent skills. We also contribute the +QUARD-Auto dataset, collected automatically to support our training approach +and foster advancements in multi-task quadruped robot learning. This work +presents a new paradigm for reducing the cost of collecting robot data and +driving progress in the multi-task learning community. You can reach our +project and video through the link: https://songwxuan.github.io/GeRM/ . +
+
+
+
+
+ + ♻ ☆ Exploring Recurrent Long-term Temporal Fusion for Multi-view 3D + Perception + + +
+ Long-term temporal fusion is a crucial but often overlooked technique in +camera-based Bird's-Eye-View (BEV) 3D perception. Existing methods mostly perform +fusion in a parallel manner. While parallel fusion can benefit from long-term +information, it suffers from increasing computational and memory overheads as +the fusion window size grows. Alternatively, BEVFormer adopts a recurrent +fusion pipeline so that history information can be efficiently integrated, yet +it fails to benefit from longer temporal frames. In this paper, we explore an +embarrassingly simple long-term recurrent fusion strategy built upon the +LSS-based methods and find that it already enjoys the merits of both sides, +i.e., rich long-term information and an efficient fusion pipeline. A temporal +embedding module is further proposed to improve the model's robustness against +occasionally missed frames in practical scenarios. We name this simple but +effective fusion pipeline VideoBEV. Experimental results on the nuScenes +benchmark show that VideoBEV obtains strong performance on various camera-based +3D perception tasks, including object detection (55.4% mAP and 62.9% NDS), +segmentation (48.6% vehicle mIoU), tracking (54.8% AMOTA), and motion +prediction (0.80m minADE and 0.463 EPA). +
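+ The recurrent fusion idea is simple enough to sketch (an assumption-laden illustration, not the released model): instead of concatenating a window of past BEV maps, a single running BEV state is fused with the current frame and carried forward.
+import torch
+import torch.nn as nn
+
+class RecurrentBEVFusion(nn.Module):
+    def __init__(self, channels):
+        super().__init__()
+        self.fuse = nn.Conv2d(2 * channels, channels, kernel_size=3, padding=1)
+
+    def forward(self, bev_t, bev_history=None):
+        # bev_t, bev_history: (B, C, H, W); history is assumed already warped to the current ego pose.
+        if bev_history is None:
+            return bev_t
+        return self.fuse(torch.cat([bev_t, bev_history], dim=1))  # becomes next frame's history
+ Memory and compute stay constant as the temporal horizon grows, which is the advantage over parallel, window-based fusion.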
+
+
+
+
+ + ♻ ☆ Ranni: Taming Text-to-Image Diffusion for Accurate Instruction Following + + +
+ Existing text-to-image (T2I) diffusion models usually struggle in +interpreting complex prompts, especially those with quantity, object-attribute +binding, and multi-subject descriptions. In this work, we introduce a semantic +panel as the middleware in decoding texts to images, supporting the generator +to better follow instructions. The panel is obtained through arranging the +visual concepts parsed from the input text by the aid of large language models, +and then injected into the denoising network as a detailed control signal to +complement the text condition. To facilitate text-to-panel learning, we come up +with a carefully designed semantic formatting protocol, accompanied by a +fully-automatic data preparation pipeline. Thanks to such a design, our +approach, which we call Ranni, manages to enhance a pre-trained T2I generator +regarding its textual controllability. More importantly, the introduction of +the generative middleware brings a more convenient form of interaction (i.e., +directly adjusting the elements in the panel or using language instructions) +and further allows users to finely customize their generation, based on which +we develop a practical system and showcase its potential in continuous +generation and chatting-based editing. Our project page is at +https://ranni-t2i.github.io/Ranni. + +
+
+
+
+
+ + ♻ ☆ TIM: A Time Interval Machine for Audio-Visual Action Recognition CVPR 2024 + + +
+ Diverse actions give rise to rich audio-visual signals in long videos. Recent +works showcase that the two modalities of audio and video exhibit different +temporal extents of events and distinct labels. We address the interplay +between the two modalities in long videos by explicitly modelling the temporal +extents of audio and visual events. We propose the Time Interval Machine (TIM) +where a modality-specific time interval poses as a query to a transformer +encoder that ingests a long video input. The encoder then attends to the +specified interval, as well as the surrounding context in both modalities, in +order to recognise the ongoing action. + We test TIM on three long audio-visual video datasets: EPIC-KITCHENS, +Perception Test, and AVE, reporting state-of-the-art (SOTA) for recognition. On +EPIC-KITCHENS, we beat previous SOTA that utilises LLMs and significantly +larger pre-training by 2.9% top-1 action recognition accuracy. Additionally, we +show that TIM can be adapted for action detection, using dense multi-scale +interval queries, outperforming SOTA on EPIC-KITCHENS-100 for most metrics, and +showing strong performance on the Perception Test. Our ablations show the +critical role of integrating the two modalities and modelling their time +intervals in achieving this performance. Code and models at: +https://github.com/JacobChalk/TIM + +
+
+ comment: Accepted to CVPR 2024. Project Webpage: + https://jacobchalk.github.io/TIM-Project +
+
+
+
+
+ + ♻ ☆ BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics + + +
+ The recently emerging text-to-motion advances have spurred numerous attempts +at convenient and interactive human motion generation. Yet, existing methods +are largely limited to generating body motions only without considering the +rich two-hand motions, let alone handling various conditions like body dynamics +or texts. To break the data bottleneck, we propose BOTH57M, a novel multi-modal +dataset for two-hand motion generation. Our dataset includes accurate motion +tracking for the human body and hands and provides pair-wise finger-level hand +annotations and body descriptions. We further provide a strong baseline method, +BOTH2Hands, for the novel task: generating vivid two-hand motions from both +implicit body dynamics and explicit text prompts. We first warm up two parallel +body-to-hand and text-to-hand diffusion models and then utilize the +cross-attention transformer for motion blending. Extensive experiments and +cross-validations demonstrate the effectiveness of our approach and dataset for +generating convincing two-hand motions from the hybrid body-and-textual +conditions. Our dataset and code will be disseminated to the community for +future research. +
+
+
+
+
+ + ♻ ☆ Enhancing Breast Cancer Diagnosis in Mammography: Evaluation and + Integration of Convolutional Neural Networks and Explainable AI + + +
+ The study introduces an integrated framework combining Convolutional Neural +Networks (CNNs) and Explainable Artificial Intelligence (XAI) for the enhanced +diagnosis of breast cancer using the CBIS-DDSM dataset. Utilizing a fine-tuned +ResNet50 architecture, our investigation not only provides effective +differentiation of mammographic images into benign and malignant categories but +also addresses the opaque "black-box" nature of deep learning models by +employing XAI methodologies, namely Grad-CAM, LIME, and SHAP, to interpret CNN +decision-making processes for healthcare professionals. Our methodology +encompasses an elaborate data preprocessing pipeline and advanced data +augmentation techniques to counteract dataset limitations; transfer +learning using pre-trained networks such as VGG-16, DenseNet, and ResNet was also +employed. A focal point of our study is the evaluation of XAI's effectiveness +in interpreting model predictions, highlighted by utilising the Hausdorff +measure to assess the alignment between AI-generated explanations and expert +annotations quantitatively. This approach plays a critical role for XAI in +promoting trustworthiness and ethical fairness in AI-assisted diagnostics. The +findings from our research illustrate the effective collaboration between CNNs +and XAI in advancing diagnostic methods for breast cancer, thereby facilitating +a more seamless integration of advanced AI technologies within clinical +settings. By enhancing the interpretability of AI-driven decisions, this work +lays the groundwork for improved collaboration between AI systems and medical +practitioners, ultimately enriching patient care. Furthermore, the implications +of our research extend well beyond the current methodologies, advocating for +subsequent inquiries into the integration of multimodal data and the refinement +of AI explanations to satisfy the needs of clinical practice. +
+
+
+
+
+ + ♻ ☆ Learning Invariant Inter-pixel Correlations for Superpixel Generation AAAI24 + + +
+ Deep superpixel algorithms have made remarkable strides by substituting +hand-crafted features with learnable ones. Nevertheless, we observe that +existing deep superpixel methods, serving as mid-level representation +operations, remain sensitive to the statistical properties (e.g., color +distribution, high-level semantics) embedded within the training dataset. +Consequently, learnable features exhibit constrained discriminative capability, +resulting in unsatisfactory pixel grouping performance, particularly in +untrainable application scenarios. To address this issue, we propose the +Content Disentangle Superpixel (CDS) algorithm to selectively separate the +invariant inter-pixel correlations and statistical properties, i.e., style +noise. Specifically, we first construct auxiliary modalities that are +homologous to the original RGB image but have substantial stylistic variations. +Then, driven by mutual information, we propose the local-grid correlation +alignment across modalities to reduce the distribution discrepancy of +adaptively selected features and learn invariant inter-pixel correlations. +Afterwards, we perform global-style mutual information minimization to enforce +the separation of invariant content and train data styles. The experimental +results on four benchmark datasets demonstrate the superiority of our approach +to existing state-of-the-art methods, regarding boundary adherence, +generalization, and efficiency. Code and pre-trained model are available at +https://github.com/rookiie/CDSpixel. +
+
+ comment: Accepted by AAAI24 +
+
+
+
+
+ + ♻ ☆ SDFR: Synthetic Data for Face Recognition Competition + + +
+ Large-scale face recognition datasets are collected by crawling the Internet +and without individuals' consent, raising legal, ethical, and privacy concerns. +With the recent advances in generative models, several works have proposed +generating synthetic face recognition datasets to mitigate the concerns +associated with web-crawled face recognition datasets. This paper presents the summary of the +Synthetic Data for Face Recognition (SDFR) Competition held in conjunction with +the 18th IEEE International Conference on Automatic Face and Gesture +Recognition (FG 2024) and established to investigate the use of synthetic data +for training face recognition models. The SDFR competition was split into two +tasks, allowing participants to train face recognition systems using new +synthetic datasets and/or existing ones. In the first task, the face +recognition backbone was fixed and the dataset size was limited, while the +second task provided almost complete freedom on the model backbone, the +dataset, and the training pipeline. The submitted models were trained on +existing and also new synthetic datasets and used clever methods to improve +training with synthetic data. The submissions were evaluated and ranked on a +diverse set of seven benchmarking datasets. The paper gives an overview of the +submitted face recognition models and reports achieved performance compared to +baseline models trained on real and synthetic datasets. Furthermore, the +evaluation of submissions is extended to bias assessment across different +demography groups. Lastly, an outlook on the current state of the research in +training face recognition models using synthetic data is presented, and +existing problems as well as potential future directions are also discussed. +
+
+ comment: The 18th IEEE International Conference on Automatic Face and Gesture + Recognition (FG 2024) +
+
+
+
+
+ + ♻ ☆ PhysAvatar: Learning the Physics of Dressed 3D Avatars from Visual + Observations + + +
+ Modeling and rendering photorealistic avatars is of crucial importance in +many applications. Existing methods that build a 3D avatar from visual +observations, however, struggle to reconstruct clothed humans. We introduce +PhysAvatar, a novel framework that combines inverse rendering with inverse +physics to automatically estimate the shape and appearance of a human from +multi-view video data along with the physical parameters of the fabric of their +clothes. For this purpose, we adopt a mesh-aligned 4D Gaussian technique for +spatio-temporal mesh tracking as well as a physically based inverse renderer to +estimate the intrinsic material properties. PhysAvatar integrates a physics +simulator to estimate the physical parameters of the garments using +gradient-based optimization in a principled manner. These novel capabilities +enable PhysAvatar to create high-quality novel-view renderings of avatars +dressed in loose-fitting clothes under motions and lighting conditions not seen +in the training data. This marks a significant advancement towards modeling +photorealistic digital humans using physically based inverse rendering with +physics in the loop. Our project website is at: +https://qingqing-zhao.github.io/PhysAvatar + +
+
+ comment: Project Page: https://qingqing-zhao.github.io/PhysAvatar +
+
+
+
+
+ + ♻ ☆ Dense Video Object Captioning from Disjoint Supervision + + +
+ We propose a new task and model for dense video object captioning -- +detecting, tracking and captioning trajectories of objects in a video. This +task unifies spatial and temporal localization in video, whilst also requiring +fine-grained visual understanding that is best described by natural language. +We propose a unified model, and demonstrate how our end-to-end approach is more +accurate and temporally coherent than a multi-stage pipeline combining +state-of-the-art detection, tracking, and captioning models. Moreover, we +propose a training strategy based on a mixture of disjoint tasks, which allows +us to leverage diverse, large-scale datasets which supervise different parts of +our model. Although each pretraining task only provides weak supervision, they +are complementary and, when combined, result in noteworthy zero-shot ability +and serve as strong initialization for additional finetuning to further improve +accuracy. We carefully design new metrics capturing all components of our task, +and show how we can repurpose existing video grounding datasets (e.g. VidSTG +and VLN) for our new task. We show that our model improves upon a number of +strong baselines for this new task. Furthermore, we can apply our model to the +task of spatial grounding, outperforming prior state-of-the-art on VidSTG and +VLN, without explicitly training for it. Code is available at +https://github.com/google-research/scenic/tree/main/scenic/projects/densevoc. + +
+
+ comment: Code is available at + https://github.com/google-research/scenic/tree/main/scenic/projects/densevoc +
+
+
+
+
+ + ♻ ☆ Oriented Object Detection in Optical Remote Sensing Images using Deep + Learning: A Survey + + +
+ Oriented object detection is one of the most fundamental and challenging +tasks in remote sensing, aiming to locate and classify objects with arbitrary +orientations. Recent years have witnessed remarkable progress in oriented +object detection using deep learning techniques. Given the rapid development of +this field, this paper aims to provide a comprehensive survey of recent +advances in oriented object detection. To be specific, we first review the +technical evolution from horizontal object detection to oriented object +detection and summarize the specific challenges, including feature +misalignment, spatial misalignment, and periodicity of angle. Subsequently, we +further categorize existing methods into detection framework, oriented bounding +box (OBB) regression, and feature representations, and discuss how these +methods address the above challenges in detail. In addition, we cover several +publicly available datasets and performance evaluation protocols. Furthermore, +we provide a comprehensive comparison and analysis of state-of-the-art oriented +object detection methods. Toward the end of this paper, we discuss several +future directions for oriented object detection. + +
+
+
+
+
+ + ♻ ☆ PeerAiD: Improving Adversarial Distillation from a Specialized Peer + Tutor CVPR 2024 + + +
+ Adversarial robustness of a neural network is a significant concern when it +is applied to security-critical domains. In this situation, adversarial +distillation is a promising option which aims to distill the robustness of the +teacher network to improve the robustness of a small student network. Previous +works pretrain the teacher network to make it robust to the adversarial +examples aimed at itself. However, the adversarial examples are dependent on +the parameters of the target network. The fixed teacher network inevitably +degrades its robustness against the unseen transferred adversarial examples +which target the parameters of the student network in the adversarial +distillation process. We propose PeerAiD to make a peer network learn the +adversarial examples of the student network instead of adversarial examples +aimed at itself. PeerAiD is an adversarial distillation method that trains the peer +network and the student network simultaneously in order to make the peer +network specialized for defending the student network. We observe that such +peer networks surpass the robustness of the pretrained robust teacher network +against student-attacked adversarial samples. With this peer network and +adversarial distillation, PeerAiD achieves significantly higher robustness of +the student network, with AutoAttack (AA) accuracy improved by up to 1.66%p, and improves the +natural accuracy of the student network by up to 4.72%p with ResNet-18 on the +TinyImageNet dataset. +
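+ A condensed sketch of one training step in this spirit (hypothetical hyperparameters and losses; the paper's objectives and schedules are more involved): adversarial examples are crafted against the student, the peer learns to classify them, and the student is then distilled from the peer.
+import torch
+import torch.nn.functional as F
+
+def peeraid_step(student, peer, x, y, opt_s, opt_p, eps=8/255, alpha=2/255, steps=10, T=1.0):
+    # PGD attack targeting the *student's* parameters.
+    x_adv = (x + torch.empty_like(x).uniform_(-eps, eps)).clamp(0, 1)
+    for _ in range(steps):
+        x_adv = x_adv.detach().requires_grad_(True)
+        grad = torch.autograd.grad(F.cross_entropy(student(x_adv), y), x_adv)[0]
+        x_adv = (x_adv + alpha * grad.sign()).clamp(x - eps, x + eps).clamp(0, 1)
+    x_adv = x_adv.detach()
+
+    peer_logits = peer(x_adv)                  # the peer specializes in the student's adversarial examples
+    loss_p = F.cross_entropy(peer_logits, y)
+    opt_p.zero_grad()
+    loss_p.backward()
+    opt_p.step()
+
+    loss_s = F.kl_div(F.log_softmax(student(x_adv) / T, dim=1),
+                      F.softmax(peer_logits.detach() / T, dim=1),
+                      reduction="batchmean")   # distill the peer's predictions into the student
+    opt_s.zero_grad()
+    loss_s.backward()
+    opt_s.step()
+    return loss_s.item(), loss_p.item()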
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Object Detectors in the Open Environment: Challenges, Solutions, and + Outlook + + +
+ With the emergence of foundation models, deep learning-based object detectors +have shown practical usability in closed set scenarios. However, for real-world +tasks, object detectors often operate in open environments, where crucial +factors (e.g., data distribution, objective) that influence model learning are +often changing. The dynamic and intricate nature of the open environment poses +novel and formidable challenges to object detectors. Unfortunately, current +research on object detectors in open environments lacks a comprehensive +analysis of their distinctive characteristics, challenges, and corresponding +solutions, which hinders their secure deployment in critical real-world +scenarios. This paper aims to bridge this gap by conducting a comprehensive +review and analysis of object detectors in open environments. We initially +identified limitations of key structural components within the existing +detection pipeline and propose the open environment object detector challenge +framework that includes four quadrants (i.e., out-of-domain, out-of-category, +robust learning, and incremental learning) based on the dimensions of the data +/ target changes. For each quadrant of challenges in the proposed framework, we +present a detailed description and systematic analysis of the overarching goals +and core difficulties, systematically review the corresponding solutions, and +benchmark their performance over multiple widely adopted datasets. In addition, +we engage in a discussion of open problems and potential avenues for future +research. This paper aims to provide a fresh, comprehensive, and systematic +understanding of the challenges and solutions associated with open-environment +object detectors, thus catalyzing the development of more solid applications in +real-world scenarios. A project related to this survey can be found at +https://github.com/LiangSiyuan21/OEOD_Survey. + +
+
+ comment: 37 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ Carve3D: Improving Multi-view Reconstruction Consistency for Diffusion + Models with RL Finetuning CVPR 2024 + + +
+ Multi-view diffusion models, obtained by applying Supervised Finetuning (SFT) +to text-to-image diffusion models, have driven recent breakthroughs in +text-to-3D research. However, due to the limited size and quality of existing +3D datasets, they still suffer from multi-view inconsistencies and Neural +Radiance Field (NeRF) reconstruction artifacts. We argue that multi-view +diffusion models can benefit from further Reinforcement Learning Finetuning +(RLFT), which allows models to learn from the data generated by themselves and +improve beyond their dataset limitations during SFT. To this end, we introduce +Carve3D, an improved RLFT algorithm coupled with a novel Multi-view +Reconstruction Consistency (MRC) metric, to enhance the consistency of +multi-view diffusion models. To measure the MRC metric on a set of multi-view +images, we compare them with their corresponding NeRF renderings at the same +camera viewpoints. The resulting model, which we denote as Carve3DM, +demonstrates superior multi-view consistency and NeRF reconstruction quality +than existing models. Our results suggest that pairing SFT with Carve3D's RLFT +is essential for developing multi-view-consistent diffusion models, mirroring +the standard Large Language Model (LLM) alignment pipeline. Our code, training +and testing data, and video results are available at: +https://desaixie.github.io/carve-3d. + +
+
+ comment: 22 pages, 16 figures. Our code, training and testing data, and video + results are available at: https://desaixie.github.io/carve-3d. This paper has + been accepted to CVPR 2024. v2: incorporated changes from the CVPR 2024 + camera-ready version +
+
+
+
+
+ + ♻ ☆ Surface Reconstruction from Point Clouds via Grid-based Intersection + Prediction + + +
+ Surface reconstruction from point clouds is a crucial task in the fields of +computer vision and computer graphics. SDF-based methods excel at +reconstructing smooth meshes with minimal error and artefacts but struggle with +representing open surfaces. On the other hand, UDF-based methods can +effectively represent open surfaces but often introduce noise, leading to +artefacts in the mesh. In this work, we propose a novel approach that directly +predicts the intersection points between the line segments of point pairs and +implicit surfaces. To achieve this, we propose two modules, the Relative +Intersection Module and the Sign Module, which take the feature of a point pair +as input. To preserve the continuity of the surface, we also integrate symmetry +into the two modules, which means the position of the predicted intersection will +not change even if the input order of the point pair changes. This method not +only preserves the ability to represent open surfaces but also eliminates most +artefacts on the mesh. Our approach demonstrates state-of-the-art performance +on three datasets: ShapeNet, MGN, and ScanNet. The code will be made available +upon acceptance. +
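+ The symmetry constraint can be illustrated with a small sketch (the module name and output head are assumptions, not the paper's architecture): combining the two point features with order-invariant operations guarantees the same prediction for (a, b) and (b, a).
+import torch
+import torch.nn as nn
+
+class SymmetricPairHead(nn.Module):
+    def __init__(self, in_dim, hidden=128, out_dim=1):
+        super().__init__()
+        self.mlp = nn.Sequential(nn.Linear(2 * in_dim, hidden), nn.ReLU(), nn.Linear(hidden, out_dim))
+
+    def forward(self, feat_a, feat_b):
+        # Sum and element-wise |difference| are unchanged when a and b are swapped,
+        # so the predicted intersection quantity is too.
+        pair = torch.cat([feat_a + feat_b, (feat_a - feat_b).abs()], dim=-1)
+        return self.mlp(pair)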
+
+
+
+
+ + ♻ ☆ Background Noise Reduction of Attention Map for Weakly Supervised + Semantic Segmentation + + +
+ In weakly-supervised semantic segmentation (WSSS) using only image-level +class labels, a problem with CNN-based Class Activation Maps (CAM) is that they +tend to activate the most discriminative local regions of objects. On the other +hand, methods based on Transformers learn global features but suffer from the +issue of background noise contamination. This paper focuses on addressing the +issue of background noise in attention weights within the existing WSSS method +based on Conformer, known as TransCAM. The proposed method successfully reduces +background noise, leading to improved accuracy of pseudo labels. Experimental +results demonstrate that our model achieves segmentation performance of 70.5% +on the PASCAL VOC 2012 validation data, 71.1% on the test data, and 45.9% on MS +COCO 2014 data, outperforming TransCAM in terms of segmentation performance. + +
+
+
+
+
+ + ♻ ☆ Improving the Accuracy-Robustness Trade-Off of Classifiers via Adaptive + Smoothing + + +
+ While prior research has proposed a plethora of methods that build neural +classifiers robust against adversarial perturbations, practitioners are still +reluctant to adopt them due to their unacceptably severe clean accuracy +penalties. This paper significantly alleviates this accuracy-robustness +trade-off by mixing the output probabilities of a standard classifier and a +robust classifier, where the standard network is optimized for clean accuracy +and is not robust in general. We show that the robust base classifier's +confidence difference for correct and incorrect examples is the key to this +improvement. In addition to providing intuitions and empirical evidence, we +theoretically certify the robustness of the mixed classifier under realistic +assumptions. Furthermore, we adapt an adversarial input detector into a mixing +network that adaptively adjusts the mixture of the two base models, further +reducing the accuracy penalty of achieving robustness. The proposed flexible +method, termed "adaptive smoothing", can work in conjunction with existing or +even future methods that improve clean accuracy, robustness, or adversary +detection. Our empirical evaluation considers strong attack methods, including +AutoAttack and adaptive attacks. On the CIFAR-100 dataset, our method achieves +an 85.21% clean accuracy while maintaining a 38.72% $\ell_\infty$-AutoAttacked +($\epsilon = 8/255$) accuracy, becoming the second most robust method on the +RobustBench CIFAR-100 benchmark as of submission, while improving the clean +accuracy by ten percentage points compared with all listed models. The code +that implements our method is available at +https://github.com/Bai-YT/AdaptiveSmoothing. +
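+ The core mixing operation is easy to sketch (a minimal illustration; the paper's mixing network, calibration, and certification are not shown): output probabilities of the accurate and the robust base classifier are blended with a per-input weight.
+import torch
+
+def adaptive_smooth(std_logits, rob_logits, alpha):
+    # alpha in [0, 1]: weight on the robust classifier, e.g. a (B, 1) tensor from a mixing network.
+    p_std = std_logits.softmax(dim=-1)
+    p_rob = rob_logits.softmax(dim=-1)
+    return (1 - alpha) * p_std + alpha * p_rob   # mixed class probabilities
+ Setting alpha adaptively, higher when an attack is suspected, is what lets the method keep most of the standard classifier's clean accuracy.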
+
+
+
+
+ + ♻ ☆ SIR: Multi-view Inverse Rendering with Decomposable Shadow for Indoor + Scenes + + +
+ We propose SIR, an efficient method to decompose differentiable shadows for +inverse rendering on indoor scenes using multi-view data, addressing the +challenges in accurately decomposing the materials and lighting conditions. +Unlike previous methods that struggle with shadow fidelity in complex lighting +environments, our approach explicitly learns shadows for enhanced realism in +material estimation under unknown light positions. Utilizing posed HDR images +as input, SIR employs an SDF-based neural radiance field for comprehensive +scene representation. Then, SIR integrates a shadow term with a three-stage +material estimation approach to improve SVBRDF quality. Specifically, SIR is +designed to learn a differentiable shadow, complemented by BRDF regularization, +to optimize inverse rendering accuracy. Extensive experiments on both synthetic +and real-world indoor scenes demonstrate the superior performance of SIR over +existing methods in both quantitative metrics and qualitative analysis. The +significant decomposing ability of SIR enables sophisticated editing +capabilities like free-view relighting, object insertion, and material +replacement. The code and data are available at +https://xiaokangwei.github.io/SIR/. + +
+
+
+
+
+ + ♻ ☆ Toward Tiny and High-quality Facial Makeup with Data Amplify Learning + + +
+ Contemporary makeup approaches primarily hinge on unpaired learning +paradigms, yet they grapple with the challenges of inaccurate supervision +(e.g., face misalignment) and sophisticated facial prompts (including face +parsing and landmark detection). These challenges prohibit low-cost deployment +of facial makeup models, especially on mobile devices. To solve the above problems, +we propose a brand-new learning paradigm, termed "Data Amplify Learning (DAL)," +alongside a compact makeup model named "TinyBeauty." The core idea of DAL lies +in employing a Diffusion-based Data Amplifier (DDA) to "amplify" limited images +for the model training, thereby enabling accurate pixel-to-pixel supervision +with merely a handful of annotations. Two pivotal innovations in DDA facilitate +the above training approach: (1) A Residual Diffusion Model (RDM) is designed +to generate high-fidelity detail and circumvent the detail vanishing problem in +the vanilla diffusion models; (2) A Fine-Grained Makeup Module (FGMM) is +proposed to achieve precise makeup control and combination while retaining face +identity. Coupled with DAL, TinyBeauty necessitates merely 80K parameters to +achieve a state-of-the-art performance without intricate face prompts. +Meanwhile, TinyBeauty achieves a remarkable inference speed of up to 460 fps on +the iPhone 13. Extensive experiments show that DAL can produce highly +competitive makeup models using only 5 image pairs. +
+
+
+
+
+ + ♻ ☆ Harnessing Meta-Learning for Improving Full-Frame Video Stabilization CVPR 2024 + + +
+ Video stabilization is a longstanding computer vision problem; pixel-level +synthesis solutions, which stabilize videos by synthesizing full frames, add to the +complexity of this task. The complexity is intensified by the +distinct mix of unique motion profiles and visual content present in each video +sequence, which makes robust generalization with fixed parameters difficult. In our +study, we introduce a novel approach to enhance the performance of pixel-level +synthesis solutions for video stabilization by adapting these models to +individual input video sequences. The proposed adaptation exploits low-level +visual cues accessible during test-time to improve both the stability and +quality of resulting videos. We highlight the efficacy of our methodology of +"test-time adaptation" through simple fine-tuning of one of these models, +followed by significant stability gain via the integration of meta-learning +techniques. Notably, significant improvement is achieved with only a single +adaptation step. The versatility of the proposed algorithm is demonstrated by +consistently improving the performance of various pixel-level synthesis models +for video stabilization in real-world scenarios. +
+
+ comment: CVPR 2024, Code will be made availble on: + http://github.com/MKashifAli/MetaVideoStab +
+
+
+
+
+ + ♻ ☆ Detecting and Mitigating System-Level Anomalies of Vision-Based + Controllers + + +
+ Autonomous systems, such as self-driving cars and drones, have made +significant strides in recent years by leveraging visual inputs and machine +learning for decision-making and control. Despite their impressive performance, +these vision-based controllers can make erroneous predictions when faced with +novel or out-of-distribution inputs. Such errors can cascade to catastrophic +system failures and compromise system safety. In this work, we introduce a +run-time anomaly monitor to detect and mitigate such closed-loop, system-level +failures. Specifically, we leverage a reachability-based framework to +stress-test the vision-based controller offline and mine its system-level +failures. This data is then used to train a classifier that is leveraged online +to flag inputs that might cause system breakdowns. The anomaly detector +highlights issues that transcend individual modules and pertain to the safety +of the overall system. We also design a fallback controller that robustly +handles these detected anomalies to preserve system safety. We validate the +proposed approach on an autonomous aircraft taxiing system that uses a +vision-based controller for taxiing. Our results show the efficacy of the +proposed approach in identifying and handling system-level anomalies, +outperforming methods such as prediction error-based detection, and ensembling, +thereby enhancing the overall safety and robustness of autonomous systems. + +
+
+
+
+
+ + ♻ ☆ Rich Human Feedback for Text-to-Image Generation CVPR'24 + + +
+ Recent Text-to-Image (T2I) generation models such as Stable Diffusion and +Imagen have made significant progress in generating high-resolution images +based on text descriptions. However, many generated images still suffer from +issues such as artifacts/implausibility, misalignment with text descriptions, +and low aesthetic quality. Inspired by the success of Reinforcement Learning +with Human Feedback (RLHF) for large language models, prior works collected +human-provided scores as feedback on generated images and trained a reward +model to improve the T2I generation. In this paper, we enrich the feedback +signal by (i) marking image regions that are implausible or misaligned with the +text, and (ii) annotating which words in the text prompt are misrepresented or +missing on the image. We collect such rich human feedback on 18K generated +images (RichHF-18K) and train a multimodal transformer to predict the rich +feedback automatically. We show that the predicted rich human feedback can be +leveraged to improve image generation, for example, by selecting high-quality +training data to finetune and improve the generative models, or by creating +masks with predicted heatmaps to inpaint the problematic regions. Notably, the +improvements generalize to models (Muse) beyond those used to generate the +images on which human feedback data were collected (Stable Diffusion variants). +The RichHF-18K data set will be released in our GitHub repository: +https://github.com/google-research/google-research/tree/master/richhf_18k. + +
+
+ comment: CVPR'24 +
+
+
+
+
+ + ♻ ☆ Feature Re-Embedding: Towards Foundation Model-Level Performance in + Computational Pathology CVPR2024 + + +
+ Multiple instance learning (MIL) is the most widely used framework in +computational pathology, encompassing sub-typing, diagnosis, prognosis, and +more. However, the existing MIL paradigm typically requires an offline instance +feature extractor, such as a pre-trained ResNet or a foundation model. This +approach lacks the capability for feature fine-tuning within the specific +downstream tasks, limiting its adaptability and performance. To address this +issue, we propose a Re-embedded Regional Transformer (R$^2$T) for re-embedding +the instance features online, which captures fine-grained local features and +establishes connections across different regions. Unlike existing works that +focus on pre-training a powerful feature extractor or designing a sophisticated +instance aggregator, R$^2$T is tailored to re-embed instance features online. +It serves as a portable module that can seamlessly integrate into mainstream +MIL models. Extensive experimental results on common computational pathology +tasks validate that: 1) feature re-embedding improves the performance of MIL +models based on ResNet-50 features to the level of foundation model features, +and further enhances the performance of foundation model features; 2) +R$^2$T can introduce more significant performance improvements to various MIL +models; 3) R$^2$T-MIL, as an R$^2$T-enhanced AB-MIL, outperforms other recent +methods by a large margin. The code is available at: +https://github.com/DearCaat/RRT-MIL. +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Full-dose Whole-body PET Synthesis from Low-dose PET Using + High-efficiency Denoising Diffusion Probabilistic Model: PET Consistency + Model + + +
+ Objective: Positron Emission Tomography (PET) has been a commonly used +imaging modality in broad clinical applications. One of the most important +tradeoffs in PET imaging is between image quality and radiation dose: high +image quality comes with high radiation exposure. Improving image quality is +desirable for all clinical applications, while minimizing radiation exposure is +needed to reduce risk to patients. Approach: We introduce PET Consistency Model +(PET-CM), an efficient diffusion-based method for generating high-quality +full-dose PET images from low-dose PET images. It employs a two-step process, +adding Gaussian noise to full-dose PET images in the forward diffusion, and +then denoising them using a PET Shifted-window Vision Transformer (PET-VIT) +network in the reverse diffusion. The PET-VIT network learns a consistency +function that enables direct denoising of Gaussian noise into clean full-dose +PET images. PET-CM achieves state-of-the-art image quality while requiring +significantly less computation time than other methods. Results: In experiments +comparing eighth-dose to full-dose images, PET-CM demonstrated impressive +performance with NMAE of 1.278+/-0.122%, PSNR of 33.783+/-0.824dB, SSIM of +0.964+/-0.009, NCC of 0.968+/-0.011, HRS of 4.543, and SUV Error of +0.255+/-0.318%, with an average generation time of 62 seconds per patient. This +is a significant improvement compared to the state-of-the-art diffusion-based +model, with PET-CM reaching this result 12x faster. Similarly, in the +quarter-dose to full-dose image experiments, PET-CM delivered competitive +outcomes, achieving an NMAE of 0.973+/-0.066%, PSNR of 36.172+/-0.801dB, SSIM +of 0.984+/-0.004, NCC of 0.990+/-0.005, HRS of 4.428, and SUV Error of +0.151+/-0.192% using the same generation process, underlining its high +quantitative and clinical precision in both denoising scenarios. +
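+ The two-step process can be summarized in a short, heavily simplified sketch (the conditioning interface and the network call signature are assumptions): Gaussian noise is added to the full-dose image in the forward direction, and a learned consistency function maps a noisy sample straight back to a clean full-dose estimate.
+import torch
+
+def forward_diffuse(x_full, sigma):
+    # Forward step: corrupt the full-dose PET image with Gaussian noise of scale sigma.
+    return x_full + sigma * torch.randn_like(x_full)
+
+def consistency_denoise(model, x_noisy, x_low, sigma):
+    # Reverse step: a consistency network (a Swin-style ViT in the paper), conditioned on the
+    # low-dose image, predicts the clean full-dose image in a single evaluation.
+    return model(torch.cat([x_noisy, x_low], dim=1), sigma)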
+
+
+
+
+ + ♻ ☆ SocialCounterfactuals: Probing and Mitigating Intersectional Social + Biases in Vision-Language Models with Counterfactual Examples CVPR 2024 + + +
+ While vision-language models (VLMs) have achieved remarkable performance +improvements recently, there is growing evidence that these models also possess +harmful biases with respect to social attributes such as gender and race. Prior +studies have primarily focused on probing such bias attributes individually +while ignoring biases associated with intersections between social attributes. +This could be due to the difficulty of collecting an exhaustive set of +image-text pairs for various combinations of social attributes. To address this +challenge, we employ text-to-image diffusion models to produce counterfactual +examples for probing intersectional social biases at scale. Our approach +utilizes Stable Diffusion with cross attention control to produce sets of +counterfactual image-text pairs that are highly similar in their depiction of a +subject (e.g., a given occupation) while differing only in their depiction of +intersectional social attributes (e.g., race & gender). Through our +over-generate-then-filter methodology, we produce SocialCounterfactuals, a +high-quality dataset containing 171k image-text pairs for probing +intersectional biases related to gender, race, and physical characteristics. We +conduct extensive experiments to demonstrate the usefulness of our generated +dataset for probing and mitigating intersectional social biases in +state-of-the-art VLMs. +
+
+ comment: Accepted to CVPR 2024. arXiv admin note: text overlap with + arXiv:2310.02988 +
+
+
+
+
+ + ♻ ☆ Better Monocular 3D Detectors with LiDAR from the Past ICRA 2024 + + +
+ Accurate 3D object detection is crucial to autonomous driving. Though +LiDAR-based detectors have achieved impressive performance, the high cost of +LiDAR sensors precludes their widespread adoption in affordable vehicles. +Camera-based detectors are cheaper alternatives but often suffer inferior +performance compared to their LiDAR-based counterparts due to inherent depth +ambiguities in images. In this work, we seek to improve monocular 3D detectors +by leveraging unlabeled historical LiDAR data. Specifically, at inference time, +we assume that the camera-based detectors have access to multiple unlabeled +LiDAR scans from past traversals at locations of interest (potentially from +other high-end vehicles equipped with LiDAR sensors). Under this setup, we +propose a novel, simple, and end-to-end trainable framework, termed +AsyncDepth, to effectively extract relevant features from asynchronous LiDAR +traversals of the same location for monocular 3D detectors. We show consistent +and significant performance gains (up to 9 AP) across multiple state-of-the-art +models and datasets with a negligible additional latency of 9.66 ms and a small +storage cost. +
+
+ comment: Accepted by ICRA 2024. The code can be found at + https://github.com/YurongYou/AsyncDepth +
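As a rough illustration of how unlabeled past LiDAR traversals could be turned into an extra input for a camera-only detector, here is a small sketch that rasterizes several past scans into an averaged bird's-eye-view occupancy grid. The grid resolution, pooling rule, and function names are assumptions for the example; the paper's AsyncDepth feature extraction is learned end-to-end and may differ substantially.

```python
import numpy as np

def bev_occupancy(points_xyz, x_range=(0.0, 70.0), y_range=(-40.0, 40.0), cell=0.5):
    """Rasterize one LiDAR scan (N x 3, ego frame) into a BEV occupancy grid."""
    nx = int((x_range[1] - x_range[0]) / cell)
    ny = int((y_range[1] - y_range[0]) / cell)
    grid = np.zeros((nx, ny), dtype=np.float32)
    ix = ((points_xyz[:, 0] - x_range[0]) / cell).astype(int)
    iy = ((points_xyz[:, 1] - y_range[0]) / cell).astype(int)
    valid = (ix >= 0) & (ix < nx) & (iy >= 0) & (iy < ny)
    np.add.at(grid, (ix[valid], iy[valid]), 1.0)
    return grid

def past_traversal_feature(past_scans):
    """Average occupancy over several unlabeled past traversals of the same place."""
    grids = [bev_occupancy(scan) for scan in past_scans]
    return np.mean(grids, axis=0)  # could be consumed alongside image features

rng = np.random.default_rng(0)
scans = [rng.uniform([0, -40, -2], [70, 40, 2], size=(5000, 3)) for _ in range(3)]
feature = past_traversal_feature(scans)
print(feature.shape)
```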
+
+
+
+
+ + ♻ ☆ $λ$-ECLIPSE: Multi-Concept Personalized Text-to-Image Diffusion + Models by Leveraging CLIP Latent Space + + +
+ Despite the recent advances in personalized text-to-image (P-T2I) generative +models, it remains challenging to perform finetuning-free multi-subject-driven +T2I in a resource-efficient manner. Predominantly, contemporary approaches, +involving the training of Hypernetworks and Multimodal Large Language Models +(MLLMs), require heavy computing resources that range from 600 to 12300 GPU +hours of training. These subject-driven T2I methods hinge on Latent Diffusion +Models (LDMs), which facilitate T2I mapping through cross-attention layers. +While LDMs offer distinct advantages, P-T2I methods' reliance on the latent +space of these diffusion models significantly escalates resource demands, +leading to inconsistent results and necessitating numerous iterations for a +single desired image. In this paper, we present $\lambda$-ECLIPSE, an +alternative prior-training strategy that works in the latent space of a +pre-trained CLIP model without relying on the diffusion UNet models. +$\lambda$-ECLIPSE leverages the image-text interleaved pre-training for fast +and effective multi-subject-driven P-T2I. Through extensive experiments, we +establish that $\lambda$-ECLIPSE surpasses existing baselines in composition +alignment while preserving concept alignment performance, even with +significantly lower resource utilization. $\lambda$-ECLIPSE performs +multi-subject driven P-T2I with just 34M parameters and is trained on a mere 74 +GPU hours. Additionally, $\lambda$-ECLIPSE demonstrates the unique ability to +perform multi-concept interpolations. + +
+
+ comment: Project page: https://eclipse-t2i.github.io/Lambda-ECLIPSE/ +
+
+
+
+
+ + ♻ ☆ Quilt-LLaVA: Visual Instruction Tuning by Extracting Localized + Narratives from Open-Source Histopathology Videos + + +
+ Diagnosis in histopathology requires global analysis of whole slide images +(WSIs), requiring pathologists to compound evidence from different WSI +patches. The gigapixel scale of WSIs poses a challenge for histopathology +multi-modal models. Training multi-modal models for histopathology requires +instruction tuning datasets, which currently contain information for individual +image patches, without a spatial grounding of the concepts within each patch +and without a wider view of the WSI. Therefore, they lack sufficient diagnostic +capacity for histopathology. To bridge this gap, we introduce Quilt-Instruct, a +large-scale dataset of 107,131 histopathology-specific instruction +question/answer pairs, grounded within diagnostically relevant image patches +that make up the WSI. Our dataset is collected by leveraging educational +histopathology videos from YouTube, which provide spatial localization of +narrations by automatically extracting the narrators' cursor positions. +Quilt-Instruct supports contextual reasoning by extracting diagnosis and +supporting facts from the entire WSI. Using Quilt-Instruct, we train +Quilt-LLaVA, which can reason beyond the given single image patch, enabling +diagnostic reasoning across patches. To evaluate Quilt-LLaVA, we propose a +comprehensive evaluation dataset created from 985 images and 1283 +human-generated question-answer pairs. We also thoroughly evaluate Quilt-LLaVA +using public histopathology datasets, where Quilt-LLaVA significantly +outperforms SOTA by over 10% on relative GPT-4 score and 4% and 9% on open and +closed set VQA. Our code, data, and model are publicly accessible at +quilt-llava.github.io. + 
+
+
+
+
+ + ♻ ☆ Mitigating the Impact of Attribute Editing on Face Recognition + + +
+ Through a large-scale study over diverse face images, we show that facial +attribute editing using modern generative AI models can severely degrade +automated face recognition systems. This degradation persists even with +identity-preserving generative models. To mitigate this issue, we propose two +novel techniques for local and global attribute editing. We empirically ablate +twenty-six facial semantic, demographic and expression-based attributes that +have been edited using state-of-the-art generative models, and evaluate them +using ArcFace and AdaFace matchers on CelebA, CelebAMaskHQ and LFW datasets. +Finally, we use LLaVA, an emerging visual question-answering framework for +attribute prediction to validate our editing techniques. Our methods outperform +the current state-of-the-art at facial editing (BLIP, InstantID) while +improving identity retention by a significant extent. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Diffusion based Zero-shot Medical Image-to-Image Translation for Cross + Modality Segmentation + + +
+ Cross-modality image segmentation aims to segment the target modalities using +a method designed in the source modality. Deep generative models can translate +the target modality images into the source modality, thus enabling +cross-modality segmentation. However, a vast body of existing cross-modality +image translation methods relies on supervised learning. In this work, we aim +to address the challenge of zero-shot learning-based image translation tasks +(an extreme scenario in which the target modality is unseen in the training +phase). To leverage generative learning for zero-shot cross-modality image +segmentation, we propose a novel unsupervised image translation method. The +framework learns to translate the unseen source image to the target modality +for image segmentation by leveraging the inherent statistical consistency +between different modalities for diffusion guidance. Our framework captures +identical cross-modality features in the statistical domain, offering diffusion +guidance without relying on direct mappings between the source and target +domains. This advantage allows our method to adapt to changing source domains +without the need for retraining, making it highly practical when sufficient +labeled source domain data is not available. The proposed framework is +validated in zero-shot cross-modality image segmentation tasks through +empirical comparisons with influential generative models, including +adversarial-based and diffusion-based models. + 
+
+ comment: NeurIPS 2023 Diffusion Workshop +
+
+
+
+
+ + ♻ ☆ Local Neighborhood Features for 3D Classification + + +
+ With advances in deep learning training strategies, point cloud +classification methods have improved significantly. For example, PointNeXt, +which adopts prominent training techniques and InvResNet layers into +PointNet++, achieves over 7% improvement on the real-world ScanObjectNN +dataset. However, most of these models use point coordinate features of +neighborhood points mapped to a higher-dimensional space while ignoring the +neighborhood point features computed before feeding to the network layers. In +this paper, we revisit the PointNeXt model to study the usage and benefit of +such neighborhood point features. We train and evaluate PointNeXt on ModelNet40 +(synthetic), ScanObjectNN (real-world), and a recent large-scale, real-world +grocery dataset, i.e., 3DGrocery100. In addition, we provide an additional +inference strategy of weight averaging the top two checkpoints of PointNeXt to +improve classification accuracy. Together with the abovementioned ideas, we +gain 0.5%, 1%, 4.8%, 3.4%, and 1.6% overall accuracy on the PointNeXt model +with real-world datasets, ScanObjectNN (hardest variant), 3DGrocery100's +Apple10, Fruits, Vegetables, and Packages subsets, respectively. We also +achieve a comparable 0.2% accuracy gain on ModelNet40. + 
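The "weight averaging of the top two checkpoints" inference trick mentioned in the abstract is simple to express in code. Below is a minimal sketch for generic parameter dictionaries (numpy arrays stand in for model tensors); how the paper selects the two checkpoints is not shown here.

```python
import numpy as np

def average_checkpoints(state_dict_a, state_dict_b):
    """Element-wise average of two checkpoints with identical parameter names/shapes."""
    assert state_dict_a.keys() == state_dict_b.keys()
    return {name: 0.5 * (state_dict_a[name] + state_dict_b[name])
            for name in state_dict_a}

# Toy example: two "checkpoints" of a single-layer model.
ckpt_best = {"linear.weight": np.ones((4, 8)), "linear.bias": np.zeros(4)}
ckpt_second = {"linear.weight": np.full((4, 8), 0.5), "linear.bias": np.ones(4)}
averaged = average_checkpoints(ckpt_best, ckpt_second)
print(averaged["linear.weight"][0, 0], averaged["linear.bias"][0])  # 0.75 0.5
```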
+
+
+
+
+ + ♻ ☆ Two-Person Interaction Augmentation with Skeleton Priors + + +
+ Close and continuous interaction with rich contacts is a crucial aspect of +human activities (e.g. hugging, dancing) and of interest in many domains like +activity recognition, motion prediction, character animation, etc. However, +acquiring such skeletal motion is challenging. While direct motion capture is +expensive and slow, motion editing/generation is also non-trivial, as complex +contact patterns with topological and geometric constraints have to be +retained. To this end, we propose a new deep learning method for two-body +skeletal interaction motion augmentation, which can generate variations of +contact-rich interactions with varying body sizes and proportions while +retaining the key geometric/topological relations between two bodies. Our +system can learn effectively from a relatively small amount of data and +generalize to drastically different skeleton sizes. Through exhaustive +evaluation and comparison, we show it can generate high-quality motions, has +strong generalizability and outperforms traditional optimization-based methods +and alternative deep learning solutions. + +
+
+
+
+
+ + ♻ ☆ A dataset of over one thousand computed tomography scans of battery + cells + + +
+ Battery technology is increasingly important for global electrification +efforts. However, batteries are highly sensitive to small manufacturing +variations that can induce reliability or safety issues. An important +technology for battery quality control is computed tomography (CT) scanning, +which is widely used for non-destructive 3D inspection across a variety of +clinical and industrial applications. Historically, however, the utility of CT +scanning for high-volume manufacturing has been limited by its low throughput +as well as the difficulty of handling its large file sizes. In this work, we +present a dataset of over one thousand CT scans of as-produced commercially +available batteries. The dataset spans various chemistries (lithium-ion and +sodium-ion) as well as various battery form factors (cylindrical, pouch, and +prismatic). We evaluate seven different battery types in total. The +manufacturing variability and the presence of battery defects can be observed +via this dataset. This dataset may be of interest to scientists and engineers +working on battery technology, computer vision, or both. + +
+
+
+
+
+ + ♻ ☆ TAM-VT: Transformation-Aware Multi-scale Video Transformer for + Segmentation and Tracking + + +
+ Video Object Segmentation (VOS) has emerged as an increasingly important +problem with availability of larger datasets and more complex and realistic +settings, which involve long videos with global motion (e.g, in egocentric +settings), depicting small objects undergoing both rigid and non-rigid +(including state) deformations. While a number of recent approaches have been +explored for this task, these data characteristics still present challenges. In +this work we propose a novel, clip-based DETR-style encoder-decoder +architecture, which focuses on systematically analyzing and addressing +aforementioned challenges. Specifically, we propose a novel +transformation-aware loss that focuses learning on portions of the video where +an object undergoes significant deformations -- a form of "soft" hard examples +mining. Further, we propose a multiplicative time-coded memory, beyond vanilla +additive positional encoding, which helps propagate context across long videos. +Finally, we incorporate these in our proposed holistic multi-scale video +transformer for tracking via multi-scale memory matching and decoding to ensure +sensitivity and accuracy for long videos and small objects. Our model enables +on-line inference with long videos in a windowed fashion, by breaking the video +into clips and propagating context among them. We illustrate that short clip +length and longer memory with learned time-coding are important design choices +for improved performance. Collectively, these technical contributions enable +our model to achieve new state-of-the-art (SoTA) performance on two complex +egocentric datasets -- VISOR and VOST, while achieving comparable to SoTA +results on the conventional VOS benchmark, DAVIS'17. A series of detailed +ablations validate our design choices as well as provide insights into the +importance of parameter choices and their impact on performance. + +
+
+
+
+
+ + ♻ ☆ Lane Change Classification and Prediction with Action Recognition + Networks ECCV2022 + + +
+ Anticipating lane change intentions of surrounding vehicles is crucial for +efficient and safe driving decision making in an autonomous driving system. +Previous works often adopt physical variables such as driving speed, +acceleration and so forth for lane change classification. However, physical +variables do not contain semantic information. Although 3D CNNs have been +developing rapidly, the number of methods utilising action recognition models +and appearance features for lane change recognition is low, and they all +require additional information to pre-process data. In this work, we propose an +end-to-end framework including two action recognition methods for lane change +recognition, using video data collected by cameras. Our method achieves the +best lane change classification results using only the RGB video data of the +PREVENTION dataset. Class activation maps demonstrate that action recognition +models can efficiently extract lane change motions. A method to better extract +motion clues is also proposed in this paper. + 
+
+ comment: Accepted to ECCV2022 AVVISION +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 167 + +
+
+
+ + ☆ Finding Visual Task Vectors + + +
+ Visual Prompting is a technique for teaching models to perform a visual task +via in-context examples, without any additional training. In this work, we +analyze the activations of MAE-VQGAN, a recent Visual Prompting model, and find +task vectors, activations that encode task-specific information. Equipped with +this insight, we demonstrate that it is possible to identify the task vectors +and use them to guide the network towards performing different tasks without +providing any input-output examples. To find task vectors, we compute the +average intermediate activations per task and use the REINFORCE algorithm to +search for the subset of task vectors. The resulting task vectors guide the +model towards performing a task better than the original model without the need +for input-output examples. + +
+
+ comment: https://github.com/alhojel/visual_task_vectors +
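To make the idea of a task vector concrete, the toy sketch below averages a layer's activations over examples of one task and injects the result into a forward pass at inference time. The feed-forward stack, the injection point, and the scaling factor are illustrative assumptions; the paper additionally uses REINFORCE to search for which activations to patch, which is omitted here.

```python
import numpy as np

def mean_task_activations(activations_per_example):
    """Average a layer's activations over many examples of the same task."""
    return np.mean(np.stack(activations_per_example, axis=0), axis=0)

def forward_with_patched_activation(x, layers, patch_layer_idx, task_vector, alpha=1.0):
    """Run a toy feed-forward stack, adding the task vector at one layer."""
    h = x
    for i, w in enumerate(layers):
        h = np.tanh(h @ w)
        if i == patch_layer_idx:
            h = h + alpha * task_vector  # steer the model toward the task
    return h

rng = np.random.default_rng(0)
layers = [rng.standard_normal((16, 16)) * 0.1 for _ in range(4)]
# Activations collected at layer 1 from prompted examples of one task (random here).
collected = [rng.standard_normal(16) for _ in range(32)]
task_vec = mean_task_activations(collected)
out = forward_with_patched_activation(rng.standard_normal(16), layers, 1, task_vec)
print(out.shape)
```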
+
+
+
+
+ + ☆ MA-LMM: Memory-Augmented Large Multimodal Model for Long-Term Video + Understanding CVPR 2024 + + +
+ With the success of large language models (LLMs), integrating the vision +model into LLMs to build vision-language foundation models has gained much more +interest recently. However, existing LLM-based large multimodal models (e.g., +Video-LLaMA, VideoChat) can only take in a limited number of frames for short +video understanding. In this study, we mainly focus on designing an efficient +and effective model for long-term video understanding. Instead of trying to +process more frames simultaneously like most existing work, we propose to +process videos in an online manner and store past video information in a memory +bank. This allows our model to reference historical video content for long-term +analysis without exceeding LLMs' context length constraints or GPU memory +limits. Our memory bank can be seamlessly integrated into current multimodal +LLMs in an off-the-shelf manner. We conduct extensive experiments on various +video understanding tasks, such as long-video understanding, video question +answering, and video captioning, and our model can achieve state-of-the-art +performances across multiple datasets. Code available at +https://boheumd.github.io/MA-LMM/. + +
+
+ comment: Accepted at CVPR 2024 +
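A fixed-size memory bank for streaming video can be sketched in a few lines. The version below merges the two most similar adjacent entries when capacity is exceeded, which is one common compression rule; whether it matches MA-LMM's exact memory-bank update is an assumption, and the class and method names are invented for the example.

```python
import numpy as np

class MemoryBank:
    """Fixed-capacity store of per-frame features for long-video processing."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.features = []  # 1-D feature vectors, kept in temporal order

    def add(self, feat):
        self.features.append(feat)
        if len(self.features) > self.capacity:
            self._merge_most_similar_adjacent()

    def _merge_most_similar_adjacent(self):
        sims = [self._cos(self.features[i], self.features[i + 1])
                for i in range(len(self.features) - 1)]
        i = int(np.argmax(sims))
        merged = 0.5 * (self.features[i] + self.features[i + 1])
        self.features[i:i + 2] = [merged]

    @staticmethod
    def _cos(a, b):
        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

rng = np.random.default_rng(0)
bank = MemoryBank(capacity=8)
for _ in range(100):                 # stream 100 frames, keep at most 8 entries
    bank.add(rng.standard_normal(32))
print(len(bank.features))            # 8
```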
+
+
+
+
+ + ☆ Ferret-UI: Grounded Mobile UI Understanding with Multimodal LLMs + + +
+ Recent advancements in multimodal large language models (MLLMs) have been +noteworthy, yet, these general-domain MLLMs often fall short in their ability +to comprehend and interact effectively with user interface (UI) screens. In +this paper, we present Ferret-UI, a new MLLM tailored for enhanced +understanding of mobile UI screens, equipped with referring, grounding, and +reasoning capabilities. Given that UI screens typically exhibit a more +elongated aspect ratio and contain smaller objects of interest (e.g., icons, +texts) than natural images, we incorporate "any resolution" on top of Ferret to +magnify details and leverage enhanced visual features. Specifically, each +screen is divided into 2 sub-images based on the original aspect ratio (i.e., +horizontal division for portrait screens and vertical division for landscape +screens). Both sub-images are encoded separately before being sent to LLMs. We +meticulously gather training samples from an extensive range of elementary UI +tasks, such as icon recognition, find text, and widget listing. These samples +are formatted for instruction-following with region annotations to facilitate +precise referring and grounding. To augment the model's reasoning ability, we +further compile a dataset for advanced tasks, including detailed description, +perception/interaction conversations, and function inference. After training on +the curated datasets, Ferret-UI exhibits outstanding comprehension of UI +screens and the capability to execute open-ended instructions. For model +evaluation, we establish a comprehensive benchmark encompassing all the +aforementioned tasks. Ferret-UI excels not only beyond most open-source UI +MLLMs, but also surpasses GPT-4V on all the elementary UI tasks. + +
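The "any resolution" pre-processing described above reduces to a simple aspect-ratio test. The sketch below shows one plausible reading of it (split along the longer side into two halves); the subsequent resizing and visual encoding steps are omitted, and the function name is an assumption.

```python
import numpy as np

def split_ui_screen(image):
    """Split a UI screenshot (H x W x C) into two sub-images along its longer side."""
    h, w = image.shape[:2]
    if h >= w:                                         # portrait: top / bottom halves
        return image[: h // 2], image[h // 2:]
    return image[:, : w // 2], image[:, w // 2:]       # landscape: left / right halves

portrait = np.zeros((2400, 1080, 3), dtype=np.uint8)
top, bottom = split_ui_screen(portrait)
print(top.shape, bottom.shape)   # (1200, 1080, 3) (1200, 1080, 3)
```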
+
+
+
+
+ + ☆ SwapAnything: Enabling Arbitrary Object Swapping in Personalized Visual + Editing + + +
+ Effective editing of personal content holds a pivotal role in enabling +individuals to express their creativity, weaving captivating narratives within +their visual stories, and elevate the overall quality and impact of their +visual content. Therefore, in this work, we introduce SwapAnything, a novel +framework that can swap any objects in an image with personalized concepts +given by the reference, while keeping the context unchanged. Compared with +existing methods for personalized subject swapping, SwapAnything has three +unique advantages: (1) precise control of arbitrary objects and parts rather +than the main subject, (2) more faithful preservation of context pixels, (3) +better adaptation of the personalized concept to the image. First, we propose +targeted variable swapping to apply region control over latent feature maps and +swap masked variables for faithful context preservation and initial semantic +concept swapping. Then, we introduce appearance adaptation, to seamlessly adapt +the semantic concept into the original image in terms of target location, +shape, style, and content during the image generation process. Extensive +results on both human and automatic evaluation demonstrate significant +improvements of our approach over baseline methods on personalized swapping. +Furthermore, SwapAnything shows its precise and faithful swapping abilities +across single object, multiple objects, partial object, and cross-domain +swapping tasks. SwapAnything also achieves great performance on text-based +swapping and tasks beyond swapping such as object insertion. + +
+
+ comment: 18 pages, 16 figures, 3 tables +
+
+
+
+
+ + ☆ Learning 3D-Aware GANs from Unposed Images with Template Feature Field + + +
+ Collecting accurate camera poses of training images has been shown to well +serve the learning of 3D-aware generative adversarial networks (GANs) yet can +be quite expensive in practice. This work targets learning 3D-aware GANs from +unposed images, for which we propose to perform on-the-fly pose estimation of +training images with a learned template feature field (TeFF). Concretely, in +addition to a generative radiance field as in previous approaches, we ask the +generator to also learn a field from 2D semantic features while sharing the +density from the radiance field. Such a framework allows us to acquire a +canonical 3D feature template leveraging the dataset mean discovered by the +generative model, and further efficiently estimate the pose parameters on real +data. Experimental results on various challenging datasets demonstrate the +superiority of our approach over state-of-the-art alternatives from both the +qualitative and the quantitative perspectives. + +
+
+ comment: https://XDimlab.github.io/TeFF +
+
+
+
+
+ + ☆ Evaluating the Efficacy of Cut-and-Paste Data Augmentation in Semantic + Segmentation for Satellite Imagery + + +
+ Satellite imagery is crucial for tasks like environmental monitoring and +urban planning. Typically, it relies on semantic segmentation or Land Use Land +Cover (LULC) classification to categorize each pixel. Despite the advancements +brought about by Deep Neural Networks (DNNs), their performance in segmentation +tasks is hindered by challenges such as limited availability of labeled data, +class imbalance and the inherent variability and complexity of satellite +images. In order to mitigate those issues, our study explores the effectiveness +of a Cut-and-Paste augmentation technique for semantic segmentation in +satellite images. We adapt this augmentation, which usually requires labeled +instances, to the case of semantic segmentation. By leveraging the connected +components in the semantic segmentation labels, we extract instances that are +then randomly pasted during training. Using the DynamicEarthNet dataset and a +U-Net model for evaluation, we found that this augmentation significantly +enhances the mIoU score on the test set from 37.9 to 44.1. This finding +highlights the potential of the Cut-and-Paste augmentation to improve the +generalization capabilities of semantic segmentation models in satellite +imagery. + +
+
+ comment: Accepted for publication in IEEE 2024 International Geoscience & + Remote Sensing Symposium (IGARSS 2024) +
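Adapting Cut-and-Paste to semantic segmentation, as described above, amounts to lifting instances out of the label map via connected components and pasting them into another training pair. The sketch below uses scipy.ndimage.label for the components and pastes at the original location for simplicity; random placement, blending, and class sampling used in practice are left out, and the function name is an assumption.

```python
import numpy as np
from scipy import ndimage

def paste_random_instance(src_img, src_mask, dst_img, dst_mask, class_id, rng):
    """Copy one connected component of `class_id` from the source pair into the target pair."""
    components, n = ndimage.label(src_mask == class_id)
    if n == 0:
        return dst_img, dst_mask
    comp = rng.integers(1, n + 1)
    inst = components == comp
    dst_img = dst_img.copy()
    dst_mask = dst_mask.copy()
    dst_img[inst] = src_img[inst]      # paste at the same spatial location
    dst_mask[inst] = class_id
    return dst_img, dst_mask

rng = np.random.default_rng(0)
src_img = rng.integers(0, 255, (128, 128, 3), dtype=np.uint8)
src_mask = np.zeros((128, 128), dtype=np.int64)
src_mask[10:30, 10:30] = 2
dst_img = rng.integers(0, 255, (128, 128, 3), dtype=np.uint8)
dst_mask = np.zeros((128, 128), dtype=np.int64)
aug_img, aug_mask = paste_random_instance(src_img, src_mask, dst_img, dst_mask, 2, rng)
print(int((aug_mask == 2).sum()))  # 400
```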
+
+
+
+
+ + ☆ Retrieval-Augmented Open-Vocabulary Object Detection CVPR 2024 + + +
+ Open-vocabulary object detection (OVD) has been studied with Vision-Language +Models (VLMs) to detect novel objects beyond the pre-trained categories. +Previous approaches improve the generalization ability to expand the knowledge +of the detector, using 'positive' pseudo-labels with additional 'class' names, +e.g., sock, iPod, and alligator. To extend the previous methods in two aspects, +we propose Retrieval-Augmented Losses and visual Features (RALF). Our method +retrieves related 'negative' classes and augments loss functions. Also, visual +features are augmented with 'verbalized concepts' of classes, e.g., worn on the +feet, handheld music player, and sharp teeth. Specifically, RALF consists of +two modules: Retrieval Augmented Losses (RAL) and Retrieval-Augmented visual +Features (RAF). RAL constitutes two losses reflecting the semantic similarity +with negative vocabularies. In addition, RAF augments visual features with the +verbalized concepts from a large language model (LLM). Our experiments +demonstrate the effectiveness of RALF on COCO and LVIS benchmark datasets. We +achieve improvement up to 3.4 box AP$_{50}^{\text{N}}$ on novel categories of +the COCO dataset and 3.6 mask AP$_{\text{r}}$ gains on the LVIS dataset. Code +is available at https://github.com/mlvlab/RALF . + +
+
+ comment: Accepted paper at CVPR 2024 +
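One way to picture a retrieval-augmented loss over "negative" vocabulary is a hinge term that keeps a region embedding closer to its ground-truth class text than to the retrieved negative class texts. The toy function below is only meant to convey that idea; it is not the RAL formulation from the paper, and the margin and embeddings are made up for the example.

```python
import numpy as np

def l2_normalize(x, axis=-1):
    return x / (np.linalg.norm(x, axis=axis, keepdims=True) + 1e-8)

def retrieval_augmented_loss(region_emb, pos_text_emb, neg_text_embs, margin=0.2):
    """Hinge-style loss: region should be closer to its class text than to retrieved negatives."""
    r = l2_normalize(region_emb)
    pos = float(r @ l2_normalize(pos_text_emb))
    negs = l2_normalize(neg_text_embs) @ r
    return float(np.mean(np.maximum(0.0, margin + negs - pos)))

rng = np.random.default_rng(0)
region = rng.standard_normal(512)
positive = region + 0.1 * rng.standard_normal(512)   # similar to the region
negatives = rng.standard_normal((5, 512))            # retrieved 'negative' vocabulary
print(retrieval_augmented_loss(region, positive, negatives))
```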
+
+
+
+
+ + ☆ SphereHead: Stable 3D Full-head Synthesis with Spherical Tri-plane + Representation + + +
+ While recent advances in 3D-aware Generative Adversarial Networks (GANs) have +aided the development of near-frontal view human face synthesis, the challenge +of comprehensively synthesizing a full 3D head viewable from all angles still +persists. Although PanoHead proves the possibilities of using a large-scale +dataset with images of both frontal and back views for full-head synthesis, it +often causes artifacts for back views. Based on our in-depth analysis, we found +the reasons are mainly twofold. First, from network architecture perspective, +we found each plane in the utilized tri-plane/tri-grid representation space +tends to confuse the features from both sides, causing "mirroring" artifacts +(e.g., the glasses appear in the back). Second, from data supervision aspect, +we found that existing discriminator training in 3D GANs mainly focuses on the +quality of the rendered image itself, and does not care much about its +plausibility with the perspective from which it was rendered. This makes it +possible to generate "face" in non-frontal views, due to its easiness to fool +the discriminator. In response, we propose SphereHead, a novel tri-plane +representation in the spherical coordinate system that fits the human head's +geometric characteristics and efficiently mitigates many of the generated +artifacts. We further introduce a view-image consistency loss for the +discriminator to emphasize the correspondence of the camera parameters and the +images. The combination of these efforts results in visually superior outcomes +with significantly fewer artifacts. Our code and dataset are publicly available +at https://lhyfst.github.io/spherehead. + +
+
+ comment: project page: https://lhyfst.github.io/spherehead +
+
+
+
+
+ + ☆ Normalizing Flows on the Product Space of SO(3) Manifolds for + Probabilistic Human Pose Modeling CVPR 2024 + + +
+ Normalizing flows have proven their efficacy for density estimation in +Euclidean space, but their application to rotational representations, crucial +in various domains such as robotics or human pose modeling, remains +underexplored. Probabilistic models of the human pose can benefit from +approaches that rigorously consider the rotational nature of human joints. For +this purpose, we introduce HuProSO3, a normalizing flow model that operates on +a high-dimensional product space of SO(3) manifolds, modeling the joint +distribution for human joints with three degrees of freedom. HuProSO3's +advantage over state-of-the-art approaches is demonstrated through its superior +modeling accuracy in three different applications and its capability to +evaluate the exact likelihood. This work not only addresses the technical +challenge of learning densities on SO(3) manifolds, but it also has broader +implications for domains where the probabilistic regression of correlated 3D +rotations is of importance. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ MoMA: Multimodal LLM Adapter for Fast Personalized Image Generation + + +
+ In this paper, we present MoMA: an open-vocabulary, training-free +personalized image model that boasts flexible zero-shot capabilities. As +foundational text-to-image models rapidly evolve, the demand for robust +image-to-image translation grows. Addressing this need, MoMA specializes in +subject-driven personalized image generation. Utilizing an open-source, +Multimodal Large Language Model (MLLM), we train MoMA to serve a dual role as +both a feature extractor and a generator. This approach effectively synergizes +reference image and text prompt information to produce valuable image features, +facilitating an image diffusion model. To better leverage the generated +features, we further introduce a novel self-attention shortcut method that +efficiently transfers image features to an image diffusion model, improving the +resemblance of the target object in generated images. Remarkably, as a +tuning-free plug-and-play module, our model requires only a single reference +image and outperforms existing methods in generating images with high detail +fidelity, enhanced identity-preservation and prompt faithfulness. Our work is +open-source, thereby providing universal access to these advancements. + +
+
+
+
+
+ + ☆ CoReS: Orchestrating the Dance of Reasoning and Segmentation + + +
+ The reasoning segmentation task, which demands a nuanced comprehension of +intricate queries to accurately pinpoint object regions, is attracting +increasing attention. However, Multi-modal Large Language Models (MLLM) often +find it difficult to accurately localize the objects described in complex +reasoning contexts. We believe that the act of reasoning segmentation should +mirror the cognitive stages of human visual search, where each step is a +progressive refinement of thought toward the final object. Thus we introduce +the Chains of Reasoning and Segmenting (CoReS) and find this top-down visual +hierarchy indeed enhances the visual search process. Specifically, we propose a +dual-chain structure that generates multi-modal, chain-like outputs to aid the +segmentation process. Furthermore, to steer the MLLM's outputs into this +intended hierarchy, we incorporate in-context inputs as guidance. Extensive +experiments demonstrate the superior performance of our CoReS, which surpasses +the state-of-the-art method by 7.1\% on the ReasonSeg dataset. The code will be +released at https://github.com/baoxiaoyi/CoReS. + +
+
+
+
+
+ + ☆ NAF-DPM: A Nonlinear Activation-Free Diffusion Probabilistic Model for + Document Enhancement + + +
+ Real-world documents may suffer various forms of degradation, often resulting +in lower accuracy in optical character recognition (OCR) systems. Therefore, a +crucial preprocessing step is essential to eliminate noise while preserving +text and key features of documents. In this paper, we propose NAF-DPM, a novel +generative framework based on a diffusion probabilistic model (DPM) designed to +restore the original quality of degraded documents. While DPMs are recognized +for their high-quality generated images, they are also known for their large +inference time. To mitigate this problem we provide the DPM with an efficient +nonlinear activation-free (NAF) network and we employ as a sampler a fast +solver of ordinary differential equations, which can converge in a few +iterations. To better preserve text characters, we introduce an additional +differentiable module based on convolutional recurrent neural networks, +simulating the behavior of an OCR system during training. Experiments conducted +on various datasets showcase the superiority of our approach, achieving +state-of-the-art performance in terms of pixel-level and perceptual similarity +metrics. Furthermore, the results demonstrate a notable character error +reduction made by OCR systems when transcribing real-world document images +enhanced by our framework. Code and pre-trained models are available at +https://github.com/ispamm/NAF-DPM. + +
+
+ comment: Under review at IEEE Transactions on Pattern Analysis and Machine + Intelligence +
+
+
+
+
+ + ☆ AlignZeg: Mitigating Objective Misalignment for Zero-shot Semantic + Segmentation + + +
+ A serious issue that harms the performance of zero-shot visual recognition is +named objective misalignment, i.e., the learning objective prioritizes +improving the recognition accuracy of seen classes rather than unseen classes, +while the latter is the true target to pursue. This issue becomes more +significant in zero-shot image segmentation because the stronger (i.e., +pixel-level) supervision brings a larger gap between seen and unseen classes. +To mitigate it, we propose a novel architecture named AlignZeg, which embodies +a comprehensive improvement of the segmentation pipeline, including proposal +extraction, classification, and correction, to better fit the goal of zero-shot +segmentation. (1) Mutually-Refined Proposal Extraction. AlignZeg harnesses a +mutual interaction between mask queries and visual features, facilitating +detailed class-agnostic mask proposal extraction. (2) Generalization-Enhanced +Proposal Classification. AlignZeg introduces synthetic data and incorporates +multiple background prototypes to allocate a more generalizable feature space. +(3) Predictive Bias Correction. During the inference stage, AlignZeg uses a +class indicator to find potential unseen class proposals followed by a +prediction postprocess to correct the prediction bias. Experiments demonstrate +that AlignZeg markedly enhances zero-shot semantic segmentation, as shown by an +average 3.8% increase in hIoU, primarily attributed to a 7.1% improvement in +identifying unseen classes, and we further validate that the improvement comes +from alleviating the objective misalignment issue. + +
+
+
+
+
+ + ☆ YaART: Yet Another ART Rendering Technology + + +
+ In the rapidly progressing field of generative models, the development of +efficient and high-fidelity text-to-image diffusion systems represents a +significant frontier. This study introduces YaART, a novel production-grade +text-to-image cascaded diffusion model aligned to human preferences using +Reinforcement Learning from Human Feedback (RLHF). During the development of +YaART, we especially focus on the choices of the model and training dataset +sizes, the aspects that were not systematically investigated for text-to-image +cascaded diffusion models before. In particular, we comprehensively analyze how +these choices affect both the efficiency of the training process and the +quality of the generated images, which are highly important in practice. +Furthermore, we demonstrate that models trained on smaller datasets of +higher-quality images can successfully compete with those trained on larger +datasets, establishing a more efficient scenario of diffusion models training. +From the quality perspective, YaART is consistently preferred by users over +many existing state-of-the-art models. + +
+
+ comment: Prompts and additional information are available on the project page, + see https://ya.ru/ai/art/paper-yaart-v1 +
+
+
+
+
+ + ☆ BinaryDM: Towards Accurate Binarization of Diffusion Model + + +
+ With the advancement of diffusion models (DMs) and the substantially +increased computational requirements, quantization emerges as a practical +solution to obtain compact and efficient low-bit DMs. However, the highly +discrete representation leads to severe accuracy degradation, hindering the +quantization of diffusion models to ultra-low bit-widths. In this paper, we +propose BinaryDM, a novel accurate quantization-aware training approach to push +the weights of diffusion models towards the limit of 1-bit. Firstly, we present +a Learnable Multi-basis Binarizer (LMB) to recover the representations +generated by the binarized DM, which improves the information in details of +representations crucial to the DM. Secondly, a Low-rank Representation +Mimicking (LRM) is applied to enhance the binarization-aware optimization of +the DM, alleviating the optimization direction ambiguity caused by fine-grained +alignment. Moreover, a progressive initialization strategy is applied to +training DMs to avoid convergence difficulties. Comprehensive experiments +demonstrate that BinaryDM achieves significant accuracy and efficiency gains +compared to SOTA quantization methods of DMs under ultra-low bit-widths. As the +first binarization method for diffusion models, BinaryDM achieves impressive +16.0 times FLOPs and 27.1 times storage savings with 1-bit weight and 4-bit +activation, showcasing its substantial advantages and potential for deploying +DMs on resource-limited scenarios. + +
+
+ comment: The code will soon be available at + https://github.com/Xingyu-Zheng/BinaryDM +
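A multi-basis binarizer can be pictured as residual binarization: the weight tensor is approximated by a sum of scaled sign matrices. The sketch below uses closed-form least-squares scales; in BinaryDM the scales (and the rest of the network) are learned during quantization-aware training, so this is only a conceptual stand-in.

```python
import numpy as np

def multi_basis_binarize(w, num_bases=2):
    """Approximate w as sum_k alpha_k * sign(residual_k) (residual binarization)."""
    residual = w.copy()
    bases, scales = [], []
    for _ in range(num_bases):
        b = np.sign(residual)
        b[b == 0] = 1.0
        alpha = np.abs(residual).mean()   # least-squares optimal scale for a sign basis
        bases.append(b)
        scales.append(alpha)
        residual = residual - alpha * b
    approx = sum(a * b for a, b in zip(scales, bases))
    return approx, scales

rng = np.random.default_rng(0)
w = rng.standard_normal((64, 64))
w_bin, alphas = multi_basis_binarize(w, num_bases=2)
print(alphas, float(np.mean((w - w_bin) ** 2)))  # scales and reconstruction error
```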
+
+
+
+
+ + ☆ Automatic Controllable Colorization via Imagination CVPR 2024 + + +
+ We propose a framework for automatic colorization that allows for iterative +editing and modifications. The core of our framework lies in an imagination +module: by understanding the content within a grayscale image, we utilize a +pre-trained image generation model to generate multiple images that contain the +same content. These images serve as references for coloring, mimicking the +process of human experts. As the synthesized images can be imperfect or +different from the original grayscale image, we propose a Reference Refinement +Module to select the optimal reference composition. Unlike most previous +end-to-end automatic colorization algorithms, our framework allows for +iterative and localized modifications of the colorization results because we +explicitly model the coloring samples. Extensive experiments demonstrate the +superiority of our framework over existing automatic colorization algorithms in +editability and flexibility. Project page: +https://xy-cong.github.io/imagine-colorization. + +
+
+ comment: CVPR 2024. Project page: + https://xy-cong.github.io/imagine-colorization +
+
+
+
+
+ + ☆ MLP Can Be A Good Transformer Learner + + +
+ The self-attention mechanism is the key to the Transformer but is often +criticized for its computation demands. Previous token pruning works motivate +their methods from the view of computation redundancy but still need to load +the full network and incur the same memory costs. This paper introduces a novel +strategy that simplifies vision transformers and reduces computational load +through the selective removal of non-essential attention layers, guided by +entropy considerations. We identify that, for the attention layers in the +bottom blocks, their subsequent MLP layers, i.e. two feed-forward layers, can +elicit the same entropy quantity. Meanwhile, the accompanying MLPs are +under-exploited since they exhibit smaller feature entropy compared to those +MLPs in the top blocks. Therefore, we propose to integrate the uninformative +attention layers into their subsequent counterparts by degenerating them into +identity mappings, yielding only the MLP in certain transformer blocks. +Experimental results on ImageNet-1k show that the proposed method can remove +40% of the attention layers of DeiT-B, improving throughput and reducing the +memory footprint without compromising performance. Code is available at +https://github.com/sihaoevery/lambda_vit. + 
+
+ comment: efficient transformer +
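The core surgery described above, degenerating selected attention sub-layers into identity mappings so that only the MLP remains in those blocks, can be sketched with a toy transformer block as follows. The block definition, the entropy scores, and the keep ratio are placeholders; the paper's entropy criterion and DeiT integration are more involved.

```python
import torch
import torch.nn as nn

class Block(nn.Module):
    """Simplified pre-norm transformer block: attention sub-layer + MLP sub-layer."""
    def __init__(self, dim, heads=4):
        super().__init__()
        self.norm1, self.norm2 = nn.LayerNorm(dim), nn.LayerNorm(dim)
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.mlp = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))
        self.use_attn = True

    def forward(self, x):
        if self.use_attn:
            h = self.norm1(x)
            x = x + self.attn(h, h, h)[0]
        # With use_attn=False the attention path degenerates to an identity mapping.
        return x + self.mlp(self.norm2(x))

def drop_low_entropy_attention(blocks, entropy_scores, keep_ratio=0.6):
    """Degenerate the attention of the lowest-scoring blocks into identity mappings."""
    n_keep = int(round(keep_ratio * len(blocks)))
    keep = set(sorted(range(len(blocks)), key=lambda i: -entropy_scores[i])[:n_keep])
    for i, blk in enumerate(blocks):
        blk.use_attn = i in keep

blocks = nn.ModuleList([Block(64) for _ in range(12)])
scores = [float(i) for i in range(12)]          # placeholder entropy estimates
drop_low_entropy_attention(blocks, scores, keep_ratio=0.6)
x = torch.randn(2, 16, 64)
for blk in blocks:
    x = blk(x)
print(x.shape, sum(b.use_attn for b in blocks))  # torch.Size([2, 16, 64]) 7
```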
+
+
+
+
+ + ☆ 3D-COCO: extension of MS-COCO dataset for image detection and 3D + reconstruction modules + + +
+ We introduce 3D-COCO, an extension of the original MS-COCO dataset providing +3D models and 2D-3D alignment annotations. 3D-COCO was designed to achieve +computer vision tasks such as 3D reconstruction or image detection configurable +with textual, 2D image, and 3D CAD model queries. We complete the existing +MS-COCO dataset with 28K 3D models collected on ShapeNet and Objaverse. By +using an IoU-based method, we match each MS-COCO annotation with the best 3D +models to provide a 2D-3D alignment. The open-source nature of 3D-COCO is a +first that should pave the way for new research on 3D-related topics. The +dataset and its source code are available at +https://kalisteo.cea.fr/index.php/coco3d-object-detection-and-reconstruction/ + 
+
+
+
+
+ + ☆ Learning a Category-level Object Pose Estimator without Pose Annotations + + +
+ 3D object pose estimation is a challenging task. Previous works always +require thousands of object images with annotated poses for learning the 3D +pose correspondence, which is laborious and time-consuming for labeling. In +this paper, we propose to learn a category-level 3D object pose estimator +without pose annotations. Instead of using manually annotated images, we +leverage diffusion models (e.g., Zero-1-to-3) to generate a set of images under +controlled pose differences and propose to learn our object pose estimator with +those images. Directly using the original diffusion model leads to images with +noisy poses and artifacts. To tackle this issue, firstly, we exploit an image +encoder, which is learned from a specially designed contrastive pose learning, +to filter the unreasonable details and extract image feature maps. +Additionally, we propose a novel learning strategy that allows the model to +learn object poses from those generated image sets without knowing the +alignment of their canonical poses. Experimental results show that our method +has the capability of category-level object pose estimation from a single shot +setting (as pose definition), while significantly outperforming other +state-of-the-art methods on the few-shot category-level object pose estimation +benchmarks. + +
+
+
+
+
+ + ☆ MULTIFLOW: Shifting Towards Task-Agnostic Vision-Language Pruning CVPR 2024 + + +
+ While excellent in transfer learning, Vision-Language models (VLMs) come with +high computational costs due to their large number of parameters. To address +this issue, removing parameters via model pruning is a viable solution. +However, existing techniques for VLMs are task-specific, and thus require +pruning the network from scratch for each new task of interest. In this work, +we explore a new direction: Task-Agnostic Vision-Language Pruning (TA-VLP). +Given a pretrained VLM, the goal is to find a unique pruned counterpart +transferable to multiple unknown downstream tasks. In this challenging setting, +the transferable representations already encoded in the pretrained model are a +key aspect to preserve. Thus, we propose Multimodal Flow Pruning (MULTIFLOW), a +first, gradient-free, pruning framework for TA-VLP where: (i) the importance of +a parameter is expressed in terms of its magnitude and its information flow, by +incorporating the saliency of the neurons it connects; and (ii) pruning is +driven by the emergent (multimodal) distribution of the VLM parameters after +pretraining. We benchmark eight state-of-the-art pruning algorithms in the +context of TA-VLP, experimenting with two VLMs, three vision-language tasks, +and three pruning ratios. Our experimental results show that MULTIFLOW +outperforms recent sophisticated, combinatorial competitors in the vast +majority of the cases, paving the way towards addressing TA-VLP. The code is +publicly available at https://github.com/FarinaMatteo/multiflow. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ A Training-Free Plug-and-Play Watermark Framework for Stable Diffusion + + +
+ Nowadays, the family of Stable Diffusion (SD) models has gained prominence +for its high quality outputs and scalability. This has also raised security +concerns on social media, as malicious users can create and disseminate harmful +content. Existing approaches involve training components or entire SDs to embed +a watermark in generated images for traceability and responsibility +attribution. However, in the era of AI-generated content (AIGC), the rapid +iteration of SDs renders retraining with watermark models costly. To address +this, we propose a training-free plug-and-play watermark framework for SDs. +Without modifying any components of SDs, we embed diverse watermarks in the +latent space, adapting to the denoising process. Our experimental findings +reveal that our method effectively harmonizes image quality and watermark +invisibility. Furthermore, it performs robustly under various attacks. We also +have validated that our method is generalized to multiple versions of SDs, even +without retraining the watermark model. + +
+
+
+
+
+ + ☆ Learning Topology Uniformed Face Mesh by Volume Rendering for Multi-view + Reconstruction + + +
+ Face meshes in consistent topology serve as the foundation for many +face-related applications, such as 3DMM constrained face reconstruction and +expression retargeting. Traditional methods commonly acquire topology uniformed +face meshes by two separate steps: multi-view stereo (MVS) to reconstruct +shapes followed by non-rigid registration to align topology, but they struggle +to handle noise and non-Lambertian surfaces. Recently, neural volume rendering +techniques have evolved rapidly and shown great advantages in 3D reconstruction +and novel view synthesis. Our goal is to leverage the superiority of neural +volume rendering for multi-view reconstruction of face meshes with consistent +topology. We propose a mesh volume rendering method that enables directly +optimizing mesh geometry while preserving topology, and learning implicit +features to model complex facial appearance from multi-view images. The key +innovation lies in spreading sparse mesh features into the surrounding space to +simulate the radiance field required for volume rendering, which facilitates +backpropagation of gradients from images to mesh geometry and implicit +appearance features. Our proposed feature spreading module exhibits deformation +invariance, enabling photorealistic rendering seamlessly after mesh editing. We +conduct experiments on a multi-view face image dataset to evaluate the +reconstruction and implement an application for photorealistic rendering of +animated face meshes. + 
+
+
+
+
+ + ☆ Self-Explainable Affordance Learning with Embodied Caption + + +
+ In the field of visual affordance learning, previous methods mainly used +abundant images or videos that delineate human behavior patterns to identify +action possibility regions for object manipulation, with a variety of +applications in robotic tasks. However, they encounter a main challenge of +action ambiguity, illustrated by the vagueness like whether to beat or carry a +drum, and the complexities involved in processing intricate scenes. Moreover, +it is important for human intervention to rectify robot errors in time. To +address these issues, we introduce Self-Explainable Affordance learning (SEA) +with embodied caption. This innovation enables robots to articulate their +intentions and bridge the gap between explainable vision-language caption and +visual affordance learning. Due to a lack of appropriate dataset, we unveil a +pioneering dataset and metrics tailored for this task, which integrates images, +heatmaps, and embodied captions. Furthermore, we propose a novel model to +effectively combine affordance grounding with self-explanation in a simple but +efficient manner. Extensive quantitative and qualitative experiments +demonstrate our method's effectiveness. + +
+
+
+
+
+ + ☆ UniFL: Improve Stable Diffusion via Unified Feedback Learning + + +
+ Diffusion models have revolutionized the field of image generation, leading +to the proliferation of high-quality models and diverse downstream +applications. However, despite these significant advancements, the current +competitive solutions still suffer from several limitations, including inferior +visual quality, a lack of aesthetic appeal, and inefficient inference, without +a comprehensive solution in sight. To address these challenges, we present +UniFL, a unified framework that leverages feedback learning to enhance +diffusion models comprehensively. UniFL stands out as a universal, effective, +and generalizable solution applicable to various diffusion models, such as +SD1.5 and SDXL. Notably, UniFL incorporates three key components: perceptual +feedback learning, which enhances visual quality; decoupled feedback learning, +which improves aesthetic appeal; and adversarial feedback learning, which +optimizes inference speed. In-depth experiments and extensive user studies +validate the superior performance of our proposed method in enhancing both the +quality of generated models and their acceleration. For instance, UniFL +surpasses ImageReward by 17% user preference in terms of generation quality and +outperforms LCM and SDXL Turbo by 57% and 20% in 4-step inference. Moreover, we +have verified the efficacy of our approach in downstream tasks, including Lora, +ControlNet, and AnimateDiff. + +
+
+
+
+
+ + ☆ Neural Cellular Automata for Lightweight, Robust and Explainable + Classification of White Blood Cell Images + + +
+ Diagnosis of hematological malignancies depends on accurate identification of +white blood cells in peripheral blood smears. Deep learning techniques are +emerging as a viable solution to scale and optimize this process by automatic +identification of cells in laboratories. However, these techniques face several +challenges such as limited generalizability, sensitivity to domain shifts and +lack of explainability. Here, we are introducing a novel approach based on +neural cellular automata (NCA) for white blood cell classification. We test our +approach on three datasets of white blood cell images and show that we achieve +competitive performance compared to conventional methods. Our NCA-based method +is significantly smaller in terms of parameters and exhibits robustness to +domain shifts. Furthermore, the architecture is inherently explainable, +providing insights into the decision process for each classification, helping +experts understand and validate model predictions. Results demonstrate that NCA +not only can be used for image classification, but also address key challenges +of conventional methods, indicating a high potential for applicability in +clinical practice. + +
+
+
+
+
+ + ☆ Towards More General Video-based Deepfake Detection through Facial + Feature Guided Adaptation for Foundation Model + + +
+ With the rise of deep learning, generative models have enabled the creation +of highly realistic synthetic images, presenting challenges due to their +potential misuse. While research in Deepfake detection has grown rapidly in +response, many detection methods struggle with unseen Deepfakes generated by +new synthesis techniques. To address this generalisation challenge, we propose +a novel Deepfake detection approach by adapting the rich information encoded +inside foundation models, specifically using the image encoder from CLIP, which +has demonstrated strong zero-shot capability for downstream tasks. Inspired by +the recent advances of parameter-efficient fine-tuning, we propose a novel +side-network-based decoder to extract spatial and temporal cues from the given +video clip, with Facial Component Guidance (FCG) to encourage the spatial +features to include features of key facial parts for more robust and general +Deepfake detection. Through extensive cross-dataset evaluations, our approach +exhibits superior effectiveness in identifying unseen Deepfake samples, +achieving notable performance improvements even with limited training samples +and manipulation types. Our model secures an average performance enhancement of +0.9% AUROC in cross-dataset assessments compared with state-of-the-art methods, +establishing a significant lead with a 4.4% improvement on the challenging DFDC +dataset. + 
+
+
+
+
+ + ☆ Responsible Visual Editing + + +
+ With recent advancements in visual synthesis, there is a growing risk of +encountering images with detrimental effects, such as hate, discrimination, or +privacy violations. The research on transforming harmful images into +responsible ones remains unexplored. In this paper, we formulate a new task, +responsible visual editing, which entails modifying specific concepts within an +image to render it more responsible while minimizing changes. However, the +concept that needs to be edited is often abstract, making it challenging to +locate what needs to be modified and plan how to modify it. To tackle these +challenges, we propose a Cognitive Editor (CoEditor) that harnesses the large +multimodal model through a two-stage cognitive process: (1) a perceptual +cognitive process to focus on what needs to be modified and (2) a behavioral +cognitive process to strategize how to modify. To mitigate the negative +implications of harmful images on research, we create a transparent and public +dataset, AltBear, which expresses harmful information using teddy bears instead +of humans. Experiments demonstrate that CoEditor can effectively comprehend +abstract concepts within complex scenes and significantly surpass the +performance of baseline models for responsible visual editing. We find that the +AltBear dataset corresponds well to the harmful content found in real images, +offering a consistent experimental evaluation, thereby providing a safer +benchmark for future research. Moreover, CoEditor also shows great results in +general editing. We release our code and dataset at +https://github.com/kodenii/Responsible-Visual-Editing. + +
+
+ comment: 24 pages, 12 figures +
+
+
+
+
+ + ☆ Robust Data Pruning: Uncovering and Overcoming Implicit Bias + + +
+ In the era of exceptionally data-hungry models, careful selection of the +training data is essential to mitigate the extensive costs of deep learning. +Data pruning offers a solution by removing redundant or uninformative samples +from the dataset, which yields faster convergence and improved neural scaling +laws. However, little is known about its impact on classification bias of the +trained models. We conduct the first systematic study of this effect and reveal +that existing data pruning algorithms can produce highly biased classifiers. At +the same time, we argue that random data pruning with appropriate class ratios +has potential to improve the worst-class performance. We propose a +"fairness-aware" approach to pruning and empirically demonstrate its +performance on standard computer vision benchmarks. In sharp contrast to +existing algorithms, our proposed method continues improving robustness at a +tolerable drop of average performance as we prune more from the datasets. We +present theoretical analysis of the classification risk in a mixture of +Gaussians to further motivate our algorithm and support our findings. + +
+
+
+
+
+ + ☆ Social-MAE: Social Masked Autoencoder for Multi-person Motion + Representation Learning + + +
+ For a complete comprehension of multi-person scenes, it is essential to go +beyond basic tasks like detection and tracking. Higher-level tasks, such as +understanding the interactions and social activities among individuals, are +also crucial. Progress towards models that can fully understand scenes +involving multiple people is hindered by a lack of sufficient annotated data +for such high-level tasks. To address this challenge, we introduce Social-MAE, +a simple yet effective transformer-based masked autoencoder framework for +multi-person human motion data. The framework uses masked modeling to pre-train +the encoder to reconstruct masked human joint trajectories, enabling it to +learn generalizable and data efficient representations of motion in human +crowded scenes. Social-MAE comprises a transformer as the MAE encoder and a +lighter-weight transformer as the MAE decoder which operates on multi-person +joints' trajectory in the frequency domain. After the reconstruction task, the +MAE decoder is replaced with a task-specific decoder and the model is +fine-tuned end-to-end for a variety of high-level social tasks. Our proposed +model combined with our pre-training approach achieves the state-of-the-art +results on various high-level social tasks, including multi-person pose +forecasting, social grouping, and social action understanding. These +improvements are demonstrated across four popular multi-person datasets +encompassing both human 2D and 3D body pose. + +
+
+
+
+
+ + ☆ TIM: A Time Interval Machine for Audio-Visual Action Recognition CVPR 2024 + + +
+ Diverse actions give rise to rich audio-visual signals in long videos. Recent +works showcase that the two modalities of audio and video exhibit different +temporal extents of events and distinct labels. We address the interplay +between the two modalities in long videos by explicitly modelling the temporal +extents of audio and visual events. We propose the Time Interval Machine (TIM) +where a modality-specific time interval poses as a query to a transformer +encoder that ingests a long video input. The encoder then attends to the +specified interval, as well as the surrounding context in both modalities, in +order to recognise the ongoing action. + We test TIM on three long audio-visual video datasets: EPIC-KITCHENS, +Perception Test, and AVE, reporting state-of-the-art (SOTA) for recognition. On +EPIC-KITCHENS, we beat previous SOTA that utilises LLMs and significantly +larger pre-training by 2.9% top-1 action recognition accuracy. Additionally, we +show that TIM can be adapted for action detection, using dense multi-scale +interval queries, outperforming SOTA on EPIC-KITCHENS-100 for most metrics, and +showing strong performance on the Perception Test. Our ablations show the +critical role of integrating the two modalities and modelling their time +intervals in achieving this performance. Code and models at: +https://github.com/JacobChalk/TIM + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Investigating the Effectiveness of Cross-Attention to Unlock Zero-Shot + Editing of Text-to-Video Diffusion Models CVPR 2024 + + +
+ With recent advances in image and video diffusion models for content +creation, a plethora of techniques have been proposed for customizing their +generated content. In particular, manipulating the cross-attention layers of +Text-to-Image (T2I) diffusion models has shown great promise in controlling the +shape and location of objects in the scene. Transferring image-editing +techniques to the video domain, however, is extremely challenging as object +motion and temporal consistency are difficult to capture accurately. In this +work, we take a first look at the role of cross-attention in Text-to-Video +(T2V) diffusion models for zero-shot video editing. While one-shot models have +shown potential in controlling motion and camera movement, we demonstrate +zero-shot control over object shape, position and movement in T2V models. We +show that despite the limitations of current T2V models, cross-attention +guidance can be a promising approach for editing videos. + +
+
+ comment: Generative Models for Computer Vision CVPR 2024 Workshop
+
+
+
+
+ + ☆ DepthMOT: Depth Cues Lead to a Strong Multi-Object Tracker + + +
+ Accurately distinguishing each object is a fundamental goal of Multi-object +tracking (MOT) algorithms. However, achieving this goal remains challenging, +primarily due to: (i) For crowded scenes with occluded objects, +the high overlap of object bounding boxes leads to confusion among closely +located objects. Nevertheless, humans naturally perceive the depth of elements +in a scene when observing 2D videos. Inspired by this, even though the bounding +boxes of objects are close on the camera plane, we can differentiate them in +the depth dimension, thereby establishing a 3D perception of the objects. (ii) +For videos with rapid, irregular camera motion, abrupt changes in object +positions can result in ID switches. However, if the camera pose is known, we +can compensate for the errors in linear motion models. In this paper, we +propose \textit{DepthMOT}, which achieves: (i) detecting and estimating the scene +depth map \textit{end-to-end}, (ii) compensating for irregular camera motion via +camera pose estimation. Extensive experiments demonstrate the superior +performance of DepthMOT on the VisDrone-MOT and UAVDT datasets. The code will be +available at \url{https://github.com/JackWoo0831/DepthMOT}. +
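The depth cue described above can be folded into a standard assignment step; the snippet below is a hypothetical illustration (not the paper's implementation; the weighting and normalisation are assumptions) of penalising matches whose estimated depths disagree:

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

def depth_aware_association(iou_cost, det_depth, trk_depth, w_depth=0.5):
    # Add a depth-disagreement penalty to an IoU-based cost matrix so that
    # detections and tracks that overlap on the image plane but lie at
    # different depths are less likely to be matched.
    depth_cost = np.abs(det_depth[:, None] - trk_depth[None, :])
    depth_cost = depth_cost / (depth_cost.max() + 1e-6)
    rows, cols = linear_sum_assignment(iou_cost + w_depth * depth_cost)
    return list(zip(rows.tolist(), cols.tolist()))

iou_cost = np.array([[0.1, 0.9], [0.8, 0.2]])            # lower = better overlap
matches = depth_aware_association(iou_cost, np.array([5.0, 12.0]), np.array([5.5, 11.0]))
```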
+
+
+
+
+ + ☆ Impact of LiDAR visualisations on semantic segmentation of + archaeological objects + + +
+ Deep learning methods in LiDAR-based archaeological research often leverage +visualisation techniques derived from Digital Elevation Models to enhance +characteristics of archaeological objects present in the images. This paper +investigates the impact of visualisations on deep learning performance through +a comprehensive testing framework. The study involves the use of eight semantic +segmentation models to evaluate seven diverse visualisations across two study +areas, encompassing five archaeological classes. Experimental results reveal +that the choice of appropriate visualisations can influence performance by up +to 8%. Yet, pinpointing one visualisation that outperforms the others in +segmenting all archaeological classes proves challenging. The observed +performance variation, reaching up to 25% across different model +configurations, underscores the importance of thoughtfully selecting model +configurations and LiDAR visualisations for successfully segmenting +archaeological objects. + +
+
+ comment: Accepted to IEEE International Geoscience and Remote Sensing + Symposium 2024 (IGARSS 2024) @IEEE copyright +
+
+
+
+
+ + ☆ Taming Transformers for Realistic Lidar Point Cloud Generation + + +
+ Diffusion Models (DMs) have achieved State-Of-The-Art (SOTA) results in the +Lidar point cloud generation task, benefiting from their stable training and +iterative refinement during sampling. However, DMs often fail to realistically +model Lidar raydrop noise due to their inherent denoising process. To retain +the strength of iterative sampling while enhancing the generation of raydrop +noise, we introduce LidarGRIT, a generative model that uses auto-regressive +transformers to iteratively sample the range images in the latent space rather +than image space. Furthermore, LidarGRIT utilises VQ-VAE to separately decode +range images and raydrop masks. Our results show that LidarGRIT achieves +superior performance compared to SOTA models on the KITTI-360 and KITTI odometry +datasets. Code available at: https://github.com/hamedhaghighi/LidarGRIT. +
+
+
+
+
+ + ☆ Two-Person Interaction Augmentation with Skeleton Priors + + +
+ Close and continuous interaction with rich contacts is a crucial aspect of +human activities (e.g. hugging, dancing) and of interest in many domains like +activity recognition, motion prediction, character animation, etc. However, +acquiring such skeletal motion is challenging. While direct motion capture is +expensive and slow, motion editing/generation is also non-trivial, as complex +contact patterns with topological and geometric constraints have to be +retained. To this end, we propose a new deep learning method for two-body +skeletal interaction motion augmentation, which can generate variations of +contact-rich interactions with varying body sizes and proportions while +retaining the key geometric/topological relations between two bodies. Our +system can learn effectively from a relatively small amount of data and +generalize to drastically different skeleton sizes. Through exhaustive +evaluation and comparison, we show it can generate high-quality motions, has +strong generalizability and outperforms traditional optimization-based methods +and alternative deep learning solutions. + +
+
+
+
+
+ + ☆ Mind-to-Image: Projecting Visual Mental Imagination of the Brain from + fMRI + + +
+ The reconstruction of images observed by subjects from fMRI data collected +during visual stimuli has made significant strides in the past decade, thanks +to the availability of extensive fMRI datasets and advancements in generative +models for image generation. However, the application of visual reconstruction +has remained limited. Reconstructing visual imagination presents a greater +challenge, with potentially revolutionary applications ranging from aiding +individuals with disabilities to verifying witness accounts in court. The +primary hurdles in this field are the absence of data collection protocols for +visual imagery and the lack of datasets on the subject. Traditionally, +fMRI-to-image relies on data collected from subjects exposed to visual stimuli, +which poses issues for generating visual imagery due to the difference in +brain activity between visual stimulation and visual imagery. For the first +time, we have compiled a substantial dataset (around 6h of scans) on visual +imagery along with a proposed data collection protocol. We then train a +modified version of an fMRI-to-image model and demonstrate the feasibility of +reconstructing images from two modes of imagination: from memory and from pure +imagination. This marks an important step towards creating a technology that +allows direct reconstruction of visual imagery. +
+
+ comment: Pre-print to be updated +
+
+
+
+
+ + ☆ Enhancing Lip Reading with Multi-Scale Video and Multi-Encoder ICME2024 + + +
+ Automatic lip-reading (ALR) aims to automatically transcribe spoken content +from a speaker's silent lip motion captured in video. Current mainstream +lip-reading approaches only use a single visual encoder to model input videos +of a single scale. In this paper, we propose to enhance lip-reading by +incorporating multi-scale video data and multiple encoders. Specifically, we first +propose a novel multi-scale lip extraction algorithm based on the size of the +speaker's face and an enhanced ResNet3D visual front-end (VFE) to extract lip +features at different scales. For the multi-encoder setup, in addition to the +mainstream Transformer and Conformer, we also incorporate the recently proposed +Branchformer and EBranchformer as visual encoders. In the experiments, we +explore the influence of different video data scales and encoders on ALR system +performance and fuse the texts transcribed by all ALR systems using recognizer +output voting error reduction (ROVER). Finally, our proposed approach placed +second in the ICME 2024 ChatCLR Challenge Task 2, with a 21.52% reduction in +character error rate (CER) compared to the official baseline on the evaluation +set. +
+
+ comment: 6 pages, 3 figures, submitted to ICME2024 GC-ChatCLR +
+
+
+
+
+ + ☆ HAMMR: HierArchical MultiModal React agents for generic VQA + + +
+ Combining Large Language Models (LLMs) with external specialized tools +(LLMs+tools) is a recent paradigm to solve multimodal tasks such as Visual +Question Answering (VQA). While this approach was demonstrated to work well +when optimized and evaluated for each individual benchmark, in practice it is +crucial for the next generation of real-world AI systems to handle a broad +range of multimodal problems. Therefore we pose the VQA problem from a unified +perspective and evaluate a single system on a varied suite of VQA tasks +including counting, spatial reasoning, OCR-based reasoning, visual pointing, +external knowledge, and more. In this setting, we demonstrate that naively +applying the LLM+tools approach using the combined set of all tools leads to +poor results. This motivates us to introduce HAMMR: HierArchical MultiModal +React. We start from a multimodal ReAct-based system and make it hierarchical +by enabling our HAMMR agents to call upon other specialized agents. This +enhances the compositionality of the LLM+tools approach, which we show to be +critical for obtaining high accuracy on generic VQA. Concretely, on our generic +VQA suite, HAMMR outperforms the naive LLM+tools approach by 19.5%. +Additionally, HAMMR achieves state-of-the-art results on this task, +outperforming the generic standalone PaLI-X VQA model by 5.0%. + +
+
+
+
+
+ + ☆ Pansharpening of PRISMA products for archaeological prospection + + +
+ Hyperspectral data recorded from satellite platforms are often ill-suited for +geo-archaeological prospection due to low spatial resolution. The established +potential of hyperspectral data from airborne sensors in identifying +archaeological features has, on the other hand, generated increased interest in +enhancing hyperspectral data to achieve higher spatial resolution. This +improvement is crucial for detecting traces linked to sub-surface +geo-archaeological features and can make satellite hyperspectral acquisitions +more suitable for archaeological research. This research assesses the usability +of pansharpened PRISMA satellite products in geo-archaeological prospections. +Three pan-sharpening methods (GSA, MTF-GLP and HySure) are compared +quantitatively and qualitatively and tested over the archaeological landscape +of Aquileia (Italy). The results suggest that the application of pansharpening +techniques makes hyperspectral satellite imagery highly suitable, under certain +conditions, for the identification of sub-surface archaeological features of +small and large size. +
+
+ comment: Accepted to IEEE International Geoscience and Remote Sensing + Symposium 2024 (IGARSS 2024) @IEEE copyright +
+
+
+
+
+ + ☆ Action-conditioned video data improves predictability + + +
+ Long-term video generation and prediction remain challenging tasks in +computer vision, particularly in partially observable scenarios where cameras +are mounted on moving platforms. The interaction between observed image frames +and the motion of the recording agent introduces additional complexities. To +address these issues, we introduce the Action-Conditioned Video Generation +(ACVG) framework, a novel approach that investigates the relationship between +actions and generated image frames through a deep dual Generator-Actor +architecture. ACVG generates video sequences conditioned on the actions of +robots, enabling exploration and analysis of how vision and action mutually +influence one another in dynamic environments. We evaluate the framework's +effectiveness on an indoor robot motion dataset which consists of sequences of +image frames along with the sequences of actions taken by the robotic agent, +conducting a comprehensive empirical study comparing ACVG to other +state-of-the-art frameworks along with a detailed ablation study. + +
+
+
+
+
+ + ☆ Test-Time Zero-Shot Temporal Action Localization + + +
+ Zero-Shot Temporal Action Localization (ZS-TAL) seeks to identify and locate +actions in untrimmed videos unseen during training. Existing ZS-TAL methods +involve fine-tuning a model on a large amount of annotated training data. While +effective, training-based ZS-TAL approaches assume the availability of labeled +data for supervised learning, which can be impractical in some applications. +Furthermore, the training process naturally induces a domain bias into the +learned model, which may adversely affect the model's generalization ability to +arbitrary videos. These considerations prompt us to approach the ZS-TAL problem +from a radically novel perspective, relaxing the requirement for training data. +To this aim, we introduce a novel method that performs Test-Time adaptation for +Temporal Action Localization (T3AL). In a nutshell, T3AL adapts a pre-trained +Vision and Language Model (VLM). T3AL operates in three steps. First, a +video-level pseudo-label of the action category is computed by aggregating +information from the entire video. Then, action localization is performed +adopting a novel procedure inspired by self-supervised learning. Finally, +frame-level textual descriptions extracted with a state-of-the-art captioning +model are employed for refining the action region proposals. We validate the +effectiveness of T3AL by conducting experiments on the THUMOS14 and the +ActivityNet-v1.3 datasets. Our results demonstrate that T3AL significantly +outperforms zero-shot baselines based on state-of-the-art VLMs, confirming the +benefit of a test-time adaptation approach. + +
+
+
+
+
+ + ☆ Two Hands Are Better Than One: Resolving Hand to Hand Intersections via + Occupancy Networks + + +
+ 3D hand pose estimation from images has seen considerable interest from the +literature, with new methods improving overall 3D accuracy. One current +challenge is to address hand-to-hand interaction where self-occlusions and +finger articulation pose a significant problem to estimation. Little work has +applied physical constraints that minimize the hand intersections that occur as +a result of noisy estimation. This work addresses the intersection of hands by +exploiting an occupancy network that represents the hand's volume as a +continuous manifold. This allows us to model the probability distribution of +points being inside a hand. We designed an intersection loss function to +minimize the likelihood of hand-to-point intersections. Moreover, we propose a +new hand mesh parameterization that is superior to the commonly used MANO model +in many respects including lower mesh complexity, underlying 3D skeleton +extraction, watertightness, etc. On the benchmark InterHand2.6M dataset, the +models trained using our intersection loss achieve better results than the +state-of-the-art by significantly decreasing the number of hand intersections +while lowering the mean per-joint positional error. Additionally, we +demonstrate superior performance for 3D hand uplift on Re:InterHand and SMILE +datasets and show reduced hand-to-hand intersections for complex domains such +as sign-language pose estimation. + +
+
+
+
+
+ + ☆ Anatomical Conditioning for Contrastive Unpaired Image-to-Image + Translation of Optical Coherence Tomography Images + + +
+ For a unified analysis of medical images from different modalities, data +harmonization using image-to-image (I2I) translation is desired. We study this +problem employing an optical coherence tomography (OCT) data set of +Spectralis-OCT and Home-OCT images. I2I translation is challenging because the +images are unpaired, and a bijective mapping does not exist due to the +information discrepancy between both domains. This problem has been addressed +by the Contrastive Learning for Unpaired I2I Translation (CUT) approach, but it +reduces semantic consistency. To restore the semantic consistency, we support +the style decoder using an additional segmentation decoder. Our approach +increases the similarity between the style-translated images and the target +distribution. Importantly, we improve the segmentation of biomarkers in +Home-OCT images in an unsupervised domain adaptation scenario. Our data +harmonization approach provides potential for the monitoring of diseases, e.g., +age-related macular disease, using different OCT devices. +
+
+ comment: Accepted at ISBI 2024 +
+
+
+
+
+ + ☆ PAT: Pixel-wise Adaptive Training for Long-tailed Segmentation + + +
+ Beyond class frequency, we recognize the impact of class-wise relationships +among various class-specific predictions and the imbalance in label masks on +long-tailed segmentation learning. To address these challenges, we propose an +innovative Pixel-wise Adaptive Training (PAT) technique tailored for +long-tailed segmentation. PAT has two key features: 1) class-wise gradient +magnitude homogenization, and 2) pixel-wise class-specific loss adaptation +(PCLA). First, the class-wise gradient magnitude homogenization helps alleviate +the imbalance among label masks by ensuring equal consideration of the +class-wise impact on model updates. Second, PCLA tackles the detrimental impact +of both rare classes within the long-tailed distribution and inaccurate +predictions from previous training stages by encouraging the learning of classes with +low prediction confidence and guarding against forgetting classes with high +confidence. This combined approach fosters robust learning while preventing the +model from forgetting previously learned knowledge. PAT exhibits significant +performance improvements, surpassing the current state-of-the-art by 2.2% on +the NYU dataset. Moreover, it enhances overall pixel-wise accuracy by 2.85% and +the intersection-over-union value by 2.07%, with a particularly notable decline +of only 0.39% in detecting rare classes compared to Balance Logits Variation, as +demonstrated on the three popular datasets, i.e., OxfordPetIII, CityScape, and +NYU. +
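The pixel-wise class-specific loss adaptation (PCLA) idea above, emphasising pixels whose true class is predicted with low confidence, can be sketched as a confidence-weighted cross-entropy; this is an illustrative, focal-loss-like stand-in, not the paper's exact rule:

```python
import torch
import torch.nn.functional as F

def confidence_weighted_ce(logits, target, gamma=1.0):
    # logits: (B, C, H, W); target: (B, H, W). Pixels whose true class is
    # predicted with low confidence receive a larger weight.
    ce = F.cross_entropy(logits, target, reduction='none')        # (B, H, W)
    p_true = F.softmax(logits, dim=1).gather(1, target.unsqueeze(1)).squeeze(1)
    weights = (1.0 - p_true) ** gamma
    return (weights * ce).mean()

logits = torch.randn(2, 5, 64, 64)
target = torch.randint(0, 5, (2, 64, 64))
loss = confidence_weighted_ce(logits, target)
```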
+
+
+
+
+ + ☆ T-DEED: Temporal-Discriminability Enhancer Encoder-Decoder for Precise + Event Spotting in Sports Videos + + +
+ In this paper, we introduce T-DEED, a Temporal-Discriminability Enhancer +Encoder-Decoder for Precise Event Spotting in sports videos. T-DEED addresses +multiple challenges in the task, including the need for discriminability among +frame representations, high output temporal resolution to maintain prediction +precision, and the necessity to capture information at different temporal +scales to handle events with varying dynamics. It tackles these challenges +through its specifically designed architecture, featuring an encoder-decoder +for leveraging multiple temporal scales and achieving high output temporal +resolution, along with temporal modules designed to increase token +discriminability. Leveraging these characteristics, T-DEED achieves SOTA +performance on the FigureSkating and FineDiving datasets. + +
+
+
+
+
+ + ☆ Rethinking the Spatial Inconsistency in Classifier-Free Diffusion + Guidance CVPR-2024 + + +
+ Classifier-Free Guidance (CFG) has been widely used in text-to-image +diffusion models, where the CFG scale is introduced to control the strength of +text guidance on the whole image space. However, we argue that a global CFG +scale results in spatial inconsistency across varying semantic strengths and +suboptimal image quality. To address this problem, we present a novel approach, +Semantic-aware Classifier-Free Guidance (S-CFG), to customize the guidance +degrees for different semantic units in text-to-image diffusion models. +Specifically, we first design a training-free semantic segmentation method to +partition the latent image into relatively independent semantic regions at each +denoising step. In particular, the cross-attention map in the denoising U-net +backbone is renormalized for assigning each patch to the corresponding token, +while the self-attention map is used to complete the semantic regions. Then, to +balance the amplification of diverse semantic units, we adaptively adjust the +CFG scales across different semantic regions to rescale the text guidance +degrees to a uniform level. Finally, extensive experiments demonstrate the +superiority of S-CFG over the original CFG strategy on various text-to-image +diffusion models, without requiring any extra training cost. Our code is +available at https://github.com/SmilesDZgk/S-CFG. +
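For reference, the standard classifier-free guidance update and the kind of per-region generalisation S-CFG argues for look roughly as follows (the region-wise scale map here is a placeholder; the paper derives it from renormalised cross-attention maps, which this sketch does not reproduce):

```python
import torch

def cfg(eps_uncond, eps_cond, scale):
    # Standard CFG: one global scale amplifies the text-conditional direction.
    return eps_uncond + scale * (eps_cond - eps_uncond)

def semantic_cfg(eps_uncond, eps_cond, scale_map):
    # S-CFG-style variant: scale_map holds one guidance scale per spatial
    # location (e.g. per semantic region), broadcast over the channel axis.
    return eps_uncond + scale_map * (eps_cond - eps_uncond)

eps_u, eps_c = torch.randn(1, 4, 64, 64), torch.randn(1, 4, 64, 64)
global_out = cfg(eps_u, eps_c, 7.5)
region_scales = torch.full((1, 1, 64, 64), 7.5)   # would vary per semantic region in S-CFG
regional_out = semantic_cfg(eps_u, eps_c, region_scales)
```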
+
+ comment: accepted by CVPR-2024 +
+
+
+
+
+ + ☆ CDAD-Net: Bridging Domain Gaps in Generalized Category Discovery CVPR + + +
+ In Generalized Category Discovery (GCD), we cluster unlabeled samples of +known and novel classes, leveraging a training dataset of known classes. A +salient challenge arises due to domain shifts between these datasets. To +address this, we present a novel setting: Across Domain Generalized Category +Discovery (AD-GCD) and bring forth CDAD-NET (Class Discoverer Across Domains) +as a remedy. CDAD-NET is architected to synchronize potential known class +samples across both the labeled (source) and unlabeled (target) datasets, while +emphasizing the distinct categorization of the target data. To facilitate this, +we propose an entropy-driven adversarial learning strategy that accounts for +the distance distributions of target samples relative to source-domain class +prototypes. In parallel, the discriminative nature of the shared space is upheld +through a fusion of three metric learning objectives. In the source domain, our +focus is on refining the proximity between samples and their affiliated class +prototypes, while in the target domain, we integrate a neighborhood-centric +contrastive learning mechanism, enriched with an adept neighbor-mining +approach. To further accentuate the nuanced feature interrelation among +semantically aligned images, we champion the concept of conditional image +inpainting, underscoring the premise that semantically analogous images prove +more efficacious for the task than their disjointed counterparts. +Experimentally, CDAD-NET eclipses existing literature with a performance +increment of 8-15% on three AD-GCD benchmarks we present. +
+
+ comment: Accepted in L3D-IVU, CVPR Workshop, 2024 +
+
+
+
+
+ + ☆ Multi-head Attention-based Deep Multiple Instance Learning + + +
+ This paper introduces MAD-MIL, a Multi-head Attention-based Deep Multiple +Instance Learning model, designed for weakly supervised Whole Slide Image +(WSI) classification in digital pathology. Inspired by the multi-head +attention mechanism of the Transformer, MAD-MIL simplifies model complexity +while achieving competitive results against advanced models like CLAM and +DS-MIL. Evaluated on MNIST-BAGS and public datasets, including TUPAC16, +TCGA BRCA, TCGA LUNG, and TCGA KIDNEY, MAD-MIL consistently outperforms ABMIL. +This demonstrates enhanced information diversity, interpretability, and +efficiency in slide representation. The model's effectiveness, coupled with +fewer trainable parameters and lower computational complexity, makes it a +promising solution for automated pathology workflows. Our code is available at +https://github.com/tueimage/MAD-MIL. +
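A minimal multi-head attention pooling over a bag of instance (patch) embeddings, in the spirit of MAD-MIL, might look like the following sketch; the dimensions, head count, and classifier layout are assumptions rather than the released architecture:

```python
import torch
import torch.nn as nn

class MultiHeadAttentionMIL(nn.Module):
    def __init__(self, in_dim=512, heads=4, n_classes=2):
        super().__init__()
        self.score = nn.Linear(in_dim, heads)              # one attention score per head
        self.classifier = nn.Linear(in_dim * heads, n_classes)

    def forward(self, bag):                                # bag: (instances, in_dim)
        attn = torch.softmax(self.score(bag), dim=0)       # (instances, heads)
        pooled = torch.einsum('ih,id->hd', attn, bag)      # head-wise weighted averages
        return self.classifier(pooled.flatten())           # slide-level logits

bag = torch.randn(200, 512)                                # 200 patch embeddings from one slide
logits = MultiHeadAttentionMIL()(bag)
```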
+
+
+
+
+ + ☆ CNN-based Game State Detection for a Foosball Table + + +
+ The automation of games using Deep Reinforcement Learning Strategies (DRL) is +a well-known challenge in AI research. While feature extraction in a video +game typically uses the whole image, this is hardly practical for many real-world +games. Instead, using a smaller game state that reduces the dimension of the +parameter space to essential parameters only seems to be a promising +approach. In the game of Foosball, a compact and comprehensive game state +description consists of the positional shifts and rotations of the figures and +the position of the ball over time. In particular, velocities and accelerations +can be derived from consecutive time samples of the game state. In this paper, +a figure detection system to determine the game state in Foosball is presented. +We capture a dataset in which the rotations of the rods were measured +using accelerometers and the positional shifts were derived using traditional +Computer Vision techniques (in a laboratory setting). This dataset is utilized +to train Convolutional Neural Network (CNN) based end-to-end regression models +to predict the rotations and shifts of each rod. We present an evaluation of +our system using different state-of-the-art CNNs as base architectures for the +regression model. We show that our system is able to predict the game state +with high accuracy. By providing data for both black and white teams, the +presented system is intended to provide the required data for future +developments of Imitation Learning techniques w.r.t. observing human +players. +
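An end-to-end CNN regressor of the kind described, mapping a frame to per-rod shifts and rotations, can be sketched as below (the rod count and output parameterisation are assumptions; a recent torchvision is assumed for the `weights=None` argument):

```python
import torch
import torch.nn as nn
from torchvision.models import resnet18

class RodStateRegressor(nn.Module):
    def __init__(self, n_rods=8):
        super().__init__()
        self.n_rods = n_rods
        self.backbone = resnet18(weights=None)              # any torchvision CNN could serve as backbone
        self.backbone.fc = nn.Linear(self.backbone.fc.in_features, n_rods * 2)

    def forward(self, frames):                              # frames: (B, 3, H, W)
        return self.backbone(frames).view(-1, self.n_rods, 2)   # per-rod (shift, rotation)

pred = RodStateRegressor()(torch.randn(4, 3, 224, 224))     # shape (4, 8, 2)
```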
+
+
+
+
+ + ☆ Iterative Refinement Strategy for Automated Data Labeling: Facial + Landmark Diagnosis in Medical Imaging + + +
+ Automated data labeling techniques are crucial for accelerating the +development of deep learning models, particularly in complex medical imaging +applications. However, ensuring accuracy and efficiency remains challenging. +This paper presents iterative refinement strategies for automated data labeling +in facial landmark diagnosis to enhance accuracy and efficiency for deep +learning models in medical applications, including dermatology, plastic +surgery, and ophthalmology. Leveraging feedback mechanisms and advanced +algorithms, our approach iteratively refines initial labels, reducing reliance +on manual intervention while improving label quality. Through empirical +evaluation and case studies, we demonstrate the effectiveness of our proposed +strategies in deep learning tasks across medical imaging domains. Our results +highlight the importance of iterative refinement in automated data labeling to +enhance the capabilities of deep learning systems in medical imaging +applications. + +
+
+
+
+
+ + ☆ Comparative Analysis of Image Enhancement Techniques for Brain Tumor + Segmentation: Contrast, Histogram, and Hybrid Approaches CCS + + +
+ This study systematically investigates the impact of image enhancement +techniques on Convolutional Neural Network (CNN)-based Brain Tumor +Segmentation, focusing on Histogram Equalization (HE), Contrast Limited +Adaptive Histogram Equalization (CLAHE), and their hybrid variations. Employing +the U-Net architecture on a dataset of 3064 Brain MRI images, the research +delves into preprocessing steps, including resizing and enhancement, to +optimize segmentation accuracy. A detailed analysis of the CNN-based U-Net +architecture, training, and validation processes is provided. The comparative +analysis, utilizing metrics such as Accuracy, Loss, MSE, IoU, and DSC, reveals +that the hybrid approach CLAHE-HE consistently outperforms others. Results +highlight its superior accuracy (0.9982, 0.9939, 0.9936 for training, testing, +and validation, respectively) and robust segmentation overlap, with Jaccard +values of 0.9862, 0.9847, and 0.9864, and Dice values of 0.993, 0.9923, and +0.9932 for the same phases, emphasizing its potential in neuro-oncological +applications. The study concludes with a call for refinement in segmentation +methodologies to further enhance diagnostic precision and treatment planning in +neuro-oncology. + +
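The CLAHE-HE hybrid can be reproduced in a few lines with OpenCV; the ordering and parameters below are plausible defaults, not necessarily those used in the study:

```python
import cv2
import numpy as np

def clahe_he(gray):
    # Apply CLAHE first, then global histogram equalisation, on an 8-bit slice.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    return cv2.equalizeHist(clahe.apply(gray))

slice_img = np.random.randint(0, 256, (256, 256), dtype=np.uint8)  # stand-in MRI slice
enhanced = clahe_he(slice_img)
```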
+
+ comment: 9 Pages, & Figures, 2 Tables, International Conference on Computer + Science Electronics and Information (ICCSEI 2023) +
+
+
+
+
+ + ☆ Mask-ControlNet: Higher-Quality Image Generation with An Additional Mask + Prompt + + +
+ Text-to-image generation has witnessed great progress, especially with the +recent advancements in diffusion models. Since texts cannot provide detailed +conditions like object appearance, reference images are usually leveraged for +the control of objects in the generated images. However, existing methods still +suffer from limited accuracy when the relationship between the foreground and +background is complicated. To address this issue, we develop a framework termed +Mask-ControlNet by introducing an additional mask prompt. Specifically, we +first employ large vision models to obtain masks to segment the objects of +interest in the reference image. Then, the object images are employed as +additional prompts to help the diffusion model better understand the +relationship between foreground and background regions during image generation. +Experiments show that the mask prompts enhance the controllability of the +diffusion model to maintain higher fidelity to the reference image while +achieving better image quality. Comparison with previous text-to-image +generation methods demonstrates our method's superior quantitative and +qualitative performance on the benchmark datasets. +
+
+
+
+
+ + ☆ WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A + Conceptual Architecture + + +
+ This work proposes a WebXR-based cross-platform conceptual architecture, +leveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate +the development of an open, accessible, and interoperable metaverse. By +introducing the concept of spatial web app, this research contributes to the +discourse on the metaverse, offering an architecture that democratizes access +to virtual environments and extended reality through the web, and aligns with +Tim Berners-Lee's original vision of the World Wide Web as an open platform in +the digital realm. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ CLIPping the Limits: Finding the Sweet Spot for Relevant Images in + Automated Driving Systems Perception Testing + + +
+ Perception systems, especially cameras, are the eyes of automated driving +systems. Ensuring that they function reliably and robustly is therefore an +important building block in the automation of vehicles. There are various +approaches to test the perception of automated driving systems. Ultimately, +however, it always comes down to the investigation of the behavior of +perception systems under specific input data. Camera images are a crucial part +of the input data. Image data sets are therefore collected for the testing of +automated driving systems, but it is non-trivial to find specific images in +these data sets. Thanks to recent developments in neural networks, there are +now methods for sorting the images in a data set according to their similarity +to a prompt in natural language. In order to further automate the provision of +search results, we make a contribution by automating the threshold definition +in these sorted results and returning only the images relevant to the prompt as +a result. Our focus is on preventing false positives and false negatives +equally. It is also important that our method is robust; in case our +assumptions are not fulfilled, we provide a fallback solution. +
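As a toy stand-in for the automated threshold definition described above (the paper's actual criterion is not reproduced here), one simple heuristic is to sort the image-prompt similarity scores and cut at the largest gap:

```python
import numpy as np

def select_relevant(similarities):
    # Sort CLIP-style similarities in descending order and cut at the largest
    # gap between consecutive scores; everything above the cut is "relevant".
    order = np.argsort(similarities)[::-1]
    sorted_sims = similarities[order]
    gaps = sorted_sims[:-1] - sorted_sims[1:]
    cut = int(np.argmax(gaps)) + 1
    return order[:cut]

sims = np.array([0.31, 0.30, 0.29, 0.12, 0.11, 0.10])
print(select_relevant(sims))        # -> indices of the first three images
```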
+
+
+
+
+ + ☆ Human Detection from 4D Radar Data in Low-Visibility Field Conditions ICRA 2024 + + +
+ Autonomous driving technology is increasingly being used on public roads and +in industrial settings such as mines. While it is essential to detect +pedestrians, vehicles, or other obstacles, adverse field conditions negatively +affect the performance of classical sensors such as cameras or lidars. Radar, +on the other hand, is a promising modality that is less affected by, e.g., +dust, smoke, water mist or fog. In particular, modern 4D imaging radars provide +target responses across the range, vertical angle, horizontal angle and Doppler +velocity dimensions. We propose TMVA4D, a CNN architecture that leverages this +4D radar modality for semantic segmentation. The CNN is trained to distinguish +between the background and person classes based on a series of 2D projections +of the 4D radar data that include the elevation, azimuth, range, and Doppler +velocity dimensions. We also outline the process of compiling a novel dataset +consisting of data collected in industrial settings with a car-mounted 4D radar +and describe how the ground-truth labels were generated from reference thermal +images. Using TMVA4D on this dataset, we achieve an mIoU score of 78.2% and an +mDice score of 86.1%, evaluated on the two classes, background and person. +
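The reported mIoU and mDice scores follow the standard per-class definitions; for reference, a straightforward computation on boolean masks looks like this (a generic metric sketch, not code from the paper):

```python
import numpy as np

def iou_and_dice(pred, gt):
    # pred, gt: boolean masks for one class (e.g. 'person').
    inter = np.logical_and(pred, gt).sum()
    union = np.logical_or(pred, gt).sum()
    total = pred.sum() + gt.sum()
    iou = inter / union if union else 1.0
    dice = 2 * inter / total if total else 1.0
    return iou, dice

pred = np.zeros((8, 8), dtype=bool); pred[2:6, 2:6] = True
gt = np.zeros((8, 8), dtype=bool); gt[3:7, 3:7] = True
print(iou_and_dice(pred, gt))
```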
+
+ comment: Submitted to Radar in Robotics workshop at ICRA 2024 +
+
+
+
+
+ + ☆ Texture Classification Network Integrating Adaptive Wavelet Transform + + +
+ Graves' disease is a common condition that is diagnosed clinically by +determining the smoothness of the thyroid texture and its morphology in +ultrasound images. Currently, the most widely used approach for the automated +diagnosis of Graves' disease utilizes Convolutional Neural Networks (CNNs) for +both feature extraction and classification. However, these methods demonstrate +limited efficacy in capturing texture features. Given the high capacity of +wavelets in describing texture features, this research integrates learnable +wavelet modules utilizing the Lifting Scheme into CNNs and incorporates a +parallel wavelet branch into the ResNet18 model to enhance texture feature +extraction. Our model can analyze texture features in spatial and frequency +domains simultaneously, leading to optimized classification accuracy. We +conducted experiments on collected ultrasound datasets and publicly available +natural image texture datasets; our proposed network achieved 97.27% accuracy +and 95.60% recall on ultrasound datasets and 60.765% accuracy on natural image +texture datasets, surpassing the accuracy of ResNet and confirming the +effectiveness of our approach. +
+
+
+
+
+ + ☆ MindSet: Vision. A toolbox for testing DNNs on key psychological + experiments + + +
+ Multiple benchmarks have been developed to assess the alignment between deep +neural networks (DNNs) and human vision. In almost all cases these benchmarks +are observational in the sense they are composed of behavioural and brain +responses to naturalistic images that have not been manipulated to test +hypotheses regarding how DNNs or humans perceive and identify objects. Here we +introduce the toolbox MindSet: Vision, consisting of a collection of image +datasets and related scripts designed to test DNNs on 30 psychological +findings. In all experimental conditions, the stimuli are systematically +manipulated to test specific hypotheses regarding human visual perception and +object recognition. In addition to providing pre-generated datasets of images, +we provide code to regenerate these datasets, offering many configurable +parameters which greatly extend the dataset versatility for different research +contexts, and code to facilitate the testing of DNNs on these image datasets +using three different methods (similarity judgments, out-of-distribution +classification, and decoder method), accessible at +https://github.com/MindSetVision/mindset-vision. We test ResNet-152 on each of +these methods as an example of how the toolbox can be used. + +
+
+
+
+
+ + ☆ Detecting Every Object from Events + + +
+ Object detection is critical in autonomous driving, and it is more practical +yet challenging to localize objects of unknown categories: an endeavour known +as Class-Agnostic Object Detection (CAOD). Existing studies on CAOD +predominantly rely on ordinary cameras, but these frame-based sensors usually +have high latency and limited dynamic range, leading to safety risks in +real-world scenarios. In this study, we turn to a new modality enabled by the +so-called event camera, featured by its sub-millisecond latency and high +dynamic range, for robust CAOD. We propose Detecting Every Object in Events +(DEOE), an approach tailored for achieving high-speed, class-agnostic +open-world object detection in event-based vision. Built upon the fast +event-based backbone: recurrent vision transformer, we jointly consider the +spatial and temporal consistencies to identify potential objects. The +discovered potential objects are assimilated as soft positive samples to avoid +being suppressed as background. Moreover, we introduce a disentangled +objectness head to separate the foreground-background classification and novel +object discovery tasks, enhancing the model's generalization in localizing +novel objects while maintaining a strong ability to filter out the background. +Extensive experiments confirm the superiority of our proposed DEOE in +comparison with three strong baseline methods that integrate the +state-of-the-art event-based object detector with advancements in RGB-based +CAOD. Our code is available at https://github.com/Hatins/DEOE. + +
+
+
+
+
+ + ☆ MOSE: Boosting Vision-based Roadside 3D Object Detection with Scene Cues + + +
+ 3D object detection based on roadside cameras is an additional way for +autonomous driving to alleviate the challenges of occlusion and short +perception range from vehicle cameras. Previous methods for roadside 3D object +detection mainly focus on modeling the depth or height of objects, neglecting +the stationarity of the cameras and the characteristic of inter-frame consistency. In +this work, we propose a novel framework, namely MOSE, for MOnocular 3D object +detection with Scene cuEs. The scene cues are the frame-invariant +scene-specific features, which are crucial for object localization and can be +intuitively regarded as the height between the surface of the real road and the +virtual ground plane. In the proposed framework, a scene cue bank is designed +to aggregate scene cues from multiple frames of the same scene with a carefully +designed extrinsic augmentation strategy. Then, a transformer-based decoder +lifts the aggregated scene cues as well as the 3D position embeddings for 3D +object location, which boosts generalization ability in heterologous scenes. +Extensive experimental results on two public benchmarks demonstrate the +state-of-the-art performance of the proposed method, which surpasses the +existing methods by a large margin. +
+
+
+
+
+ + ☆ Deep Optics for Video Snapshot Compressive Imaging ICCV 2023 + + +
+ Video snapshot compressive imaging (SCI) aims to capture a sequence of video +frames with only a single shot of a 2D detector, whose backbones rest in +optical modulation patterns (also known as masks) and a computational +reconstruction algorithm. Advanced deep learning algorithms and mature hardware +are putting video SCI into practical applications. Yet, there are two clouds in +the sunshine of SCI: i) low dynamic range as a victim of high temporal +multiplexing, and ii) existing deep learning algorithms' degradation on real +systems. To address these challenges, this paper presents a deep optics +framework to jointly optimize masks and a reconstruction network. Specifically, +we first propose a new type of structural mask to realize motion-aware and +full-dynamic-range measurement. Considering the motion-awareness property in +the measurement domain, we develop an efficient network for video SCI +reconstruction using a Transformer to capture long-term temporal dependencies, +dubbed Res2former. Moreover, sensor response is introduced into the forward +model of video SCI to guarantee end-to-end model training close to the real system. +Finally, we implement the learned structural masks on a digital micro-mirror +device. Experimental results on synthetic and real data validate the +effectiveness of the proposed framework. We believe this is a milestone for +real-world video SCI. The source code and data are available at +https://github.com/pwangcs/DeepOpticsSCI. +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ MC$^2$: Multi-concept Guidance for Customized Multi-concept Generation + + +
+ Customized text-to-image generation aims to synthesize instantiations of +user-specified concepts and has achieved unprecedented progress in handling +individual concepts. However, when extending to multiple customized concepts, +existing methods exhibit limitations in terms of flexibility and fidelity, only +accommodating the combination of limited types of models and potentially +resulting in a mix of characteristics from different concepts. In this paper, +we introduce Multi-concept guidance for Multi-concept customization, termed +MC$^2$, for improved flexibility and fidelity. MC$^2$ decouples the +requirements for model architecture via inference-time optimization, allowing +the integration of various heterogeneous single-concept customized models. It +adaptively refines the attention weights between visual and textual tokens, +directing image regions to focus on their associated words while diminishing +the impact of irrelevant ones. Extensive experiments demonstrate that MC$^2$ +even surpasses previous methods that require additional training in terms of +consistency with the input prompt and reference images. Moreover, MC$^2$ can be +extended to elevate the compositional capabilities of text-to-image generation, +yielding appealing results. Code will be publicly available at +https://github.com/JIANGJiaXiu/MC-2. +
+
+
+
+
+ + ☆ Unbridled Icarus: A Survey of the Potential Perils of Image Inputs in + Multimodal Large Language Model Security + + +
+ Multimodal Large Language Models (MLLMs) demonstrate remarkable capabilities +that increasingly influence various aspects of our daily lives, constantly +defining the new boundary of Artificial General Intelligence (AGI). Image +modalities, enriched with profound semantic information and a more continuous +mathematical nature compared to other modalities, greatly enhance the +functionalities of MLLMs when integrated. However, this integration serves as a +double-edged sword, providing attackers with expansive vulnerabilities to +exploit for highly covert and harmful attacks. The pursuit of reliable AI +systems like powerful MLLMs has emerged as a pivotal area of contemporary +research. In this paper, we endeavor to demonstrate the multifaceted risks +associated with the incorporation of image modalities into MLLMs. Initially, we +delineate the foundational components and training processes of MLLMs. +Subsequently, we construct a threat model, outlining the security +vulnerabilities intrinsic to MLLMs. Moreover, we analyze and summarize existing +scholarly discourses on MLLMs' attack and defense mechanisms, culminating in +suggestions for future research on MLLM security. Through this +comprehensive analysis, we aim to deepen the academic understanding of MLLM +security challenges and propel forward the development of trustworthy MLLM +systems. +
+
+ comment: 8 pages, 1 figure +
+
+
+
+
+ + ☆ Unsupervised Band Selection Using Fused HSI and LiDAR Attention + Integrating With Autoencoder + + +
+ Band selection in hyperspectral imaging (HSI) is critical for optimising data +processing and enhancing analytical accuracy. Traditional approaches have +predominantly concentrated on analysing spectral and pixel characteristics +within individual bands independently. These approaches overlook the potential +benefits of integrating multiple data sources, such as Light Detection and +Ranging (LiDAR), and are further challenged by the limited availability of +labeled data in HSI processing, which represents a significant obstacle. To +address these challenges, this paper introduces a novel unsupervised band +selection framework that incorporates attention mechanisms and an Autoencoder +for reconstruction-based band selection. Our methodology distinctively +integrates HSI with LiDAR data through an attention score, using a +convolutional Autoencoder to process the combined feature mask. This fusion +effectively captures essential spatial and spectral features and reduces +redundancy in hyperspectral datasets. A comprehensive comparative analysis of +our innovative fused band selection approach is performed against existing +unsupervised band selection and fusion models. We used data sets such as +Houston 2013, Trento, and MUUFLE for our experiments. The results demonstrate +that our method achieves superior classification accuracy and significantly +outperforms existing models. This enhancement in HSI band selection, +facilitated by the incorporation of LiDAR features, underscores the +considerable advantages of integrating features from different sources. +
+
+ comment: 13 pages, 13 figures, 6 tables
+
+
+
+
+ + ☆ Text-to-Image Synthesis for Any Artistic Styles: Advancements in + Personalized Artistic Image Generation via Subdivision and Dual Binding + + +
+ Recent advancements in text-to-image models, such as Stable Diffusion, have +demonstrated their ability to synthesize visual images through natural language +prompts. One approach to personalizing text-to-image models, exemplified by +DreamBooth, fine-tunes the pre-trained model by binding unique text identifiers +with a few images of a specific subject. Although existing fine-tuning methods +have demonstrated competence in rendering images according to the styles of +famous painters, it is still challenging to learn to produce images +encapsulating distinct art styles due to abstract and broad visual perceptions +of stylistic attributes such as lines, shapes, textures, and colors. In this +paper, we introduce a new method, Single-StyleForge, for personalization. It +fine-tunes pre-trained text-to-image diffusion models to generate diverse +images in specified styles from text prompts. By using around 15-20 images of +the target style, the approach establishes a foundational binding of a unique +token identifier with a broad range of the target style. It also utilizes +auxiliary images to strengthen this binding, offering specific +guidance on representing elements such as persons in a target style-consistent +manner. In addition, we present ways to improve the quality of style and +text-image alignment through a method called Multi-StyleForge, which inherits +the strategy used in StyleForge and learns multiple tokens. Experimental +evaluation conducted on six distinct artistic styles demonstrates substantial +improvements in both the quality of generated images and the perceptual +fidelity metrics, such as FID, KID, and CLIP scores. +
+
+ comment: 20 pages, 12 figures
+
+
+
+
+ + ☆ CodeEnhance: A Codebook-Driven Approach for Low-Light Image Enhancement + + +
+ Low-light image enhancement (LLIE) aims to improve low-illumination images. +However, existing methods face two challenges: (1) uncertainty in restoration +from diverse brightness degradations; (2) loss of texture and color information +caused by noise suppression and light enhancement. In this paper, we propose a +novel enhancement approach, CodeEnhance, by leveraging quantized priors and +image refinement to address these challenges. In particular, we reframe LLIE as +learning an image-to-code mapping from low-light images to a discrete codebook, +which has been learned from high-quality images. To enhance this process, a +Semantic Embedding Module (SEM) is introduced to integrate semantic information +with low-level features, and a Codebook Shift (CS) mechanism is designed to adapt +the pre-learned codebook to better suit the distinct characteristics of our +low-light dataset. Additionally, we present an Interactive Feature +Transformation (IFT) module to refine texture and color information during +image reconstruction, allowing for interactive enhancement based on user +preferences. Extensive experiments on both real-world and synthetic benchmarks +demonstrate that the incorporation of prior knowledge and controllable +information transfer significantly enhances LLIE performance in terms of +quality and fidelity. The proposed CodeEnhance exhibits superior robustness to +various degradations, including uneven illumination, noise, and color +distortion. +
+
+ comment: 10 pages, 13 figures +
+
+
+
+
+ + ☆ Allowing humans to interactively guide machines where to look does not + always improve a human-AI team's classification accuracy CVPR + 2024 + + +
+ Via thousands of papers in Explainable AI (XAI), attention maps +\cite{vaswani2017attention} and feature attribution maps \cite{bansal2020sam} +have been established as a common means for explaining the input features that +are important to AI's decisions. It is an interesting but unexplored question +whether allowing users to edit the importance scores of input features at test +time would improve the human-AI team's accuracy on downstream tasks. In this +paper, we address this question by taking CHM-Corr, a state-of-the-art, +ante-hoc explanation method \cite{taesiri2022visual} that first predicts +patch-wise correspondences between the input and the training-set images, and +then uses them to make classification decisions. We build an interactive +interface on top of CHM-Corr, enabling users to directly edit the initial +feature attribution map provided by CHM-Corr. Via our CHM-Corr++ interface, +users gain insights into if, when, and how the model changes its outputs, +enhancing understanding beyond static explanations. Our user study with 18 +machine learning researchers who performed $\sim$1,400 decisions shows that our +interactive approach does not improve user accuracy on CUB-200 bird image +classification over static explanations. This challenges the belief that +interactivity inherently boosts XAI +effectiveness~\cite{sokol2020one,sun2022exploring,shen2024towards,singh2024rethinking,mindlin2024beyond,lakkaraju2022rethinking,cheng2019explaining,liu2021understanding} +and raises the need for future research. Our work contributes to the field by +open-sourcing an interactive tool for manipulating model attention, and it lays +the groundwork for future research to enable effective human-AI interaction in +computer vision. We release code and data on +\href{https://anonymous.4open.science/r/CHMCorrPlusPlus/}{github}. Our +interface is available \href{http://137.184.82.109:7080/}{here}. +
+
+ comment: Accepted for presentation at the XAI4CV Workshop, part of the CVPR + 2024 proceedings +
+
+
+
+
+ + ☆ Stylizing Sparse-View 3D Scenes with Hierarchical Neural Representation + + +
+ Recently, a surge of 3D style transfer methods has been proposed that +leverage the scene reconstruction power of a pre-trained neural radiance field +(NeRF). To successfully stylize a scene this way, one must first reconstruct a +photo-realistic radiance field from collected images of the scene. However, +when only sparse input views are available, pre-trained few-shot NeRFs often +suffer from high-frequency artifacts, which are generated as a by-product of +high-frequency details for improving reconstruction quality. Is it possible to +generate more faithful stylized scenes from sparse inputs by directly +optimizing encoding-based scene representation with target style? In this +paper, we consider the stylization of sparse-view scenes in terms of +disentangling content semantics and style textures. We propose a coarse-to-fine +sparse-view scene stylization framework, where a novel hierarchical +encoding-based neural representation is designed to generate high-quality +stylized scenes directly from implicit scene representations. We also propose a +new optimization strategy with content strength annealing to achieve realistic +stylization and better content preservation. Extensive experiments demonstrate +that our method can achieve high-quality stylization of sparse-view scenes and +outperforms fine-tuning-based baselines in terms of stylization quality and +efficiency. + +
+
+
+
+
+ + ☆ PromptAD: Learning Prompts with only Normal Samples for Few-Shot Anomaly + Detection CVPR2024 + + +
+ Vision-language models have brought great improvements to few-shot +industrial anomaly detection, which usually requires the design of hundreds of +prompts through prompt engineering. For automated scenarios, we first use +conventional prompt learning with a many-class paradigm as the baseline to +automatically learn prompts but found that it cannot work well in one-class +anomaly detection. To address the above problem, this paper proposes a +one-class prompt learning method for few-shot anomaly detection, termed +PromptAD. First, we propose semantic concatenation, which can transpose normal +prompts into anomaly prompts by concatenating normal prompts with anomaly +suffixes, thus constructing a large number of negative samples used to guide +prompt learning in the one-class setting. Furthermore, to mitigate the training +challenge caused by the absence of anomaly images, we introduce the concept of +explicit anomaly margin, which is used to explicitly control the margin between +normal prompt features and anomaly prompt features through a hyper-parameter. +For image-level/pixel-level anomaly detection, PromptAD achieves first place in +11/12 few-shot settings on MVTec and VisA. +
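The semantic concatenation step, turning normal prompts into anomaly prompts by appending anomaly suffixes, is easy to illustrate; the templates and suffixes below are made-up examples, not the learned prompts from the paper:

```python
normal_templates = ["a photo of a flawless {}", "a cropped photo of a {}"]
anomaly_suffixes = ["with a scratch", "with a crack", "with contamination"]

def build_prompts(class_name):
    # Normal prompts describe the defect-free object; anomaly prompts are the
    # same prompts concatenated with anomaly suffixes, yielding negatives that
    # guide one-class prompt learning.
    normals = [t.format(class_name) for t in normal_templates]
    anomalies = [f"{n} {s}" for n in normals for s in anomaly_suffixes]
    return normals, anomalies

normals, anomalies = build_prompts("metal nut")
print(len(normals), len(anomalies))   # 2 normal prompts, 6 anomaly prompts
```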
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ LayoutLLM: Layout Instruction Tuning with Large Language Models for + Document Understanding CVPR 2024 + + +
+ Recently, leveraging large language models (LLMs) or multimodal large +language models (MLLMs) for document understanding has been proven very +promising. However, previous works that employ LLMs/MLLMs for document +understanding have not fully explored and utilized the document layout +information, which is vital for precise document understanding. In this paper, +we propose LayoutLLM, an LLM/MLLM based method for document understanding. The +core of LayoutLLM is a layout instruction tuning strategy, which is specially +designed to enhance the comprehension and utilization of document layouts. The +proposed layout instruction tuning strategy consists of two components: +Layout-aware Pre-training and Layout-aware Supervised Fine-tuning. To capture +the characteristics of document layout in Layout-aware Pre-training, three +groups of pre-training tasks, corresponding to document-level, region-level and +segment-level information, are introduced. Furthermore, a novel module called +layout chain-of-thought (LayoutCoT) is devised to enable LayoutLLM to focus on +regions relevant to the question and generate accurate answers. LayoutCoT is +effective for boosting the performance of document understanding. Meanwhile, it +brings a certain degree of interpretability, which could facilitate manual +inspection and correction. Experiments on standard benchmarks show that the +proposed LayoutLLM significantly outperforms existing methods that adopt +open-source 7B LLMs/MLLMs for document understanding. The training data of the +LayoutLLM is publicly available at +https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/DocumentUnderstanding/LayoutLLM + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ StylizedGS: Controllable Stylization for 3D Gaussian Splatting + + +
+ With the rapid development of XR, 3D generation and editing are becoming more +and more important, among which stylization is an important tool for 3D +appearance editing. It can achieve consistent 3D artistic stylization given a +single reference style image and thus is a user-friendly way of editing. However, +recent NeRF-based 3D stylization methods face efficiency issues that affect the +actual user experience, and their implicit nature limits their ability to transfer +geometric pattern styles. Additionally, the ability for artists to exert +flexible control over stylized scenes is considered highly desirable, fostering +an environment conducive to creative exploration. In this paper, we introduce +StylizedGS, a 3D neural style transfer framework with adaptable control over +perceptual factors based on the 3D Gaussian Splatting (3DGS) representation. The +3DGS brings the benefits of high efficiency. Before stylization, we propose a GS +filter to eliminate floaters in the reconstruction that would affect the +stylization effects. Then a nearest neighbor-based style loss is introduced to +achieve stylization by fine-tuning the geometry and color parameters of 3DGS, +while a depth preservation loss with other regularizations is proposed to +prevent the tampering of geometry content. Moreover, facilitated by specially +designed losses, StylizedGS enables users to control color, stylized scale and +regions during the stylization, enabling customized capabilities. Our method +can attain high-quality stylization results characterized by faithful +brushstrokes and geometric consistency with flexible controls. Extensive +experiments across various scenes and styles demonstrate the effectiveness and +efficiency of our method concerning both stylization quality and inference FPS. +
+
+
+
+
+ + ☆ Multi-agent Long-term 3D Human Pose Forecasting via Interaction-aware + Trajectory Conditioning CVPR + + +
+ Human pose forecasting garners attention for its diverse applications. +However, challenges in modeling the multi-modal nature of human motion and +intricate interactions among agents persist, particularly with longer +timescales and more agents. In this paper, we propose an interaction-aware +trajectory-conditioned long-term multi-agent human pose forecasting model, +utilizing a coarse-to-fine prediction approach: multi-modal global trajectories +are initially forecasted, followed by respective local pose forecasts +conditioned on each mode. In doing so, our Trajectory2Pose model introduces a +graph-based agent-wise interaction module for a reciprocal forecast of local +motion-conditioned global trajectory and trajectory-conditioned local pose. Our +model effectively handles the multi-modality of human motion and the complexity +of long-term multi-agent interactions, improving performance in complex +environments. Furthermore, we address the lack of long-term (6s+) multi-agent +(5+) datasets by constructing a new dataset from real-world images and 2D +annotations, enabling a comprehensive evaluation of our proposed model. +State-of-the-art prediction performance on both complex and simpler datasets +confirms the generalized effectiveness of our method. The code is available at +https://github.com/Jaewoo97/T2P. + +
+
+ comment: 2024 CVPR Highlight +
+
+
+
+
+ + ☆ Spatio-Temporal Attention and Gaussian Processes for Personalized Video + Gaze Estimation CVPR 2024 + + +
+ Gaze is an essential prompt for analyzing human behavior and attention. +Recently, there has been an increasing interest in determining gaze direction +from facial videos. However, video gaze estimation faces significant +challenges, such as understanding the dynamic evolution of gaze in video +sequences, dealing with static backgrounds, and adapting to variations in +illumination. To address these challenges, we propose a simple and novel deep +learning model designed to estimate gaze from videos, incorporating a +specialized attention module. Our method employs a spatial attention mechanism +that tracks spatial dynamics within videos. This technique enables accurate +gaze direction prediction through a temporal sequence model, adeptly +transforming spatial observations into temporal insights, thereby significantly +improving gaze estimation accuracy. Additionally, our approach integrates +Gaussian processes to include individual-specific traits, facilitating the +personalization of our model with just a few labeled samples. Experimental +results confirm the efficacy of the proposed approach, demonstrating its +success in both within-dataset and cross-dataset settings. Specifically, our +proposed approach achieves state-of-the-art performance on the Gaze360 dataset, +improving by $2.5^\circ$ without personalization. Further, by personalizing the +model with just three samples, we achieved an additional improvement of +$0.8^\circ$. The code and pre-trained models are available at +\url{https://github.com/jswati31/stage}. + +
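+ The Gaussian-process personalization step can be illustrated with a small
+regression sketch: a GP fitted on a handful of labeled frames learns a
+subject-specific correction that is added to the base model's gaze predictions. The
+snippet below is a minimal sketch with scikit-learn and synthetic numbers; the
+kernel choice and data are assumptions, not the paper's implementation.
+
+```python
+# Minimal sketch: Gaussian-process personalization of gaze predictions from a
+# few labeled samples. Data and kernel choice are illustrative assumptions.
+import numpy as np
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import RBF, WhiteKernel
+
+# Base-model gaze predictions (yaw, pitch in degrees) for 3 calibration frames
+# of one subject, plus the ground-truth labels for those frames.
+base_pred = np.array([[10.0, -5.0], [2.0, 1.0], [-8.0, 4.0]])
+ground_truth = np.array([[11.2, -4.1], [3.1, 1.8], [-6.9, 4.9]])
+
+# Fit a GP that maps base predictions to the subject-specific residual.
+gp = GaussianProcessRegressor(kernel=RBF(length_scale=10.0) + WhiteKernel(1e-2),
+                              normalize_y=True)
+gp.fit(base_pred, ground_truth - base_pred)
+
+# At test time, add the predicted residual to the base model's output.
+test_pred = np.array([[5.0, 0.0]])
+personalized = test_pred + gp.predict(test_pred)
+print(personalized)
+```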
+
+ comment: Accepted at CVPR 2024 Gaze workshop +
+
+
+
+
+ + ☆ DiffCJK: Conditional Diffusion Model for High-Quality and Wide-coverage + CJK Character Generation + + +
+ Chinese, Japanese, and Korean (CJK), with a vast number of native speakers, +has profound influence on society and culture. The typesetting of CJK languages +carries a wide range of requirements due to the complexity of their scripts and +unique literary traditions. A critical aspect of this typesetting process is +that CJK fonts need to provide a set of consistent-looking glyphs for +approximately one hundred thousand characters. However, creating such a font is +inherently labor-intensive and expensive, which significantly hampers the +development of new CJK fonts for typesetting, historical, aesthetic, or +artistic purposes. + To bridge this gap, we are motivated by recent advancements in +diffusion-based generative models and propose a novel diffusion method for +generating glyphs in a targeted style from a \emph{single} conditioned, +standard glyph form. Our experiments show that our method is capable of +generating fonts of both printed and hand-written styles, the latter of which +presents a greater challenge. Moreover, our approach shows remarkable zero-shot +generalization capabilities for non-CJK but Chinese-inspired scripts. We also +show our method facilitates smooth style interpolation and generates bitmap +images suitable for vectorization, which is crucial in the font creation +process. In summary, our proposed method opens the door to high-quality, +generative model-assisted font creation for CJK characters, for both +typesetting and artistic endeavors. + +
+
+
+
+
+ + ☆ Multi-level Graph Subspace Contrastive Learning for Hyperspectral Image + Clustering IJCNN 2024 + + +
+ Hyperspectral image (HSI) clustering is a challenging task due to its high
+complexity. Although subspace clustering shows impressive performance for HSI,
+traditional methods tend to ignore the global-local interaction in HSI data. In this
+study, we propose a multi-level graph subspace contrastive learning (MLGSC)
+framework for HSI clustering. The model consists of the following main parts. Graph
+convolution subspace construction: spectral and texture features are used to
+construct two graph convolution views. Local-global graph representation: local
+graph representations are obtained by step-by-step convolutions, and a more
+representative global graph representation is obtained using an attention-based
+pooling strategy. Multi-level graph subspace contrastive learning: multi-level
+contrastive learning is conducted to obtain local-global joint graph
+representations, to improve the consistency of positive samples between views, and
+to obtain more robust graph embeddings. Specifically, graph-level contrastive
+learning is used to better learn global representations of HSI data, while
+node-level intra-view and inter-view contrastive learning is designed to learn joint
+representations of local regions of HSI. The proposed model is evaluated on four
+popular HSI datasets: Indian Pines, Pavia University, Houston, and Xu Zhou. The
+overall accuracies are 97.75%, 99.96%, 92.28%, and 95.73%, respectively,
+significantly outperforming current state-of-the-art clustering methods.
+
+ comment: IJCNN 2024 +
+
+
+
+
+ + ☆ Bidirectional Long-Range Parser for Sequential Data Understanding + + +
+ The transformer is a powerful data modelling framework responsible for remarkable
+performance on a wide range of tasks. However, transformers are limited in terms of
+scalability, as processing long-sequence data with them is suboptimal and
+inefficient. To this end, we introduce BLRP (Bidirectional Long-Range Parser), a
+novel and versatile attention mechanism designed to increase performance and
+efficiency on long-sequence tasks. It leverages short- and long-range heuristics in
+the form of a local sliding-window approach combined with a global bidirectional
+latent-space synthesis technique. We show the benefits and versatility of our
+approach in the vision and language domains by demonstrating competitive results
+against state-of-the-art methods on the Long-Range-Arena and CIFAR benchmarks,
+together with ablations demonstrating its computational efficiency.
+
+
+
+
+ + ☆ iVPT: Improving Task-relevant Information Sharing in Visual Prompt + Tuning by Cross-layer Dynamic Connection + + +
+ Recent progress has shown great potential of visual prompt tuning (VPT) when +adapting pre-trained vision transformers to various downstream tasks. However, +most existing solutions independently optimize prompts at each layer, thereby +neglecting the usage of task-relevant information encoded in prompt tokens +across layers. Additionally, existing prompt structures are prone to +interference from task-irrelevant noise in input images, which can do harm to +the sharing of task-relevant information. In this paper, we propose a novel VPT +approach, \textbf{iVPT}. It innovatively incorporates a cross-layer dynamic +connection (CDC) for input prompt tokens from adjacent layers, enabling +effective sharing of task-relevant information. Furthermore, we design a +dynamic aggregation (DA) module that facilitates selective sharing of +information between layers. The combination of CDC and DA enhances the +flexibility of the attention process within the VPT framework. Building upon +these foundations, iVPT introduces an attentive reinforcement (AR) mechanism, +by automatically identifying salient image tokens, which are further enhanced +by prompt tokens in an additive manner. Extensive experiments on 24 image +classification and semantic segmentation benchmarks clearly demonstrate the +advantage of the proposed iVPT, compared to the state-of-the-art counterparts. + +
+
+
+
+
+ + ☆ SoundingActions: Learning How Actions Sound from Narrated Egocentric + Videos CVPR 2024 + + +
+ We propose a novel self-supervised embedding to learn how actions sound from +narrated in-the-wild egocentric videos. Whereas existing methods rely on +curated data with known audio-visual correspondence, our multimodal +contrastive-consensus coding (MC3) embedding reinforces the associations +between audio, language, and vision when all modality pairs agree, while +diminishing those associations when any one pair does not. We show our approach +can successfully discover how the long tail of human actions sound from +egocentric video, outperforming an array of recent multimodal embedding +techniques on two datasets (Ego4D and EPIC-Sounds) and multiple cross-modal +tasks. + +
+
+ comment: Accepted at CVPR 2024. Project page: + https://vision.cs.utexas.edu/projects/soundingactions +
+
+
+
+
+ + ☆ A secure and private ensemble matcher using multi-vault obfuscated + templates + + +
+ Given the irrevocability of biometric samples and mounting privacy concerns, +biometric template security and secure matching are among the essential +features of any well-designed modern biometric system. In this paper, we +propose an obfuscation method that hides the biometric template information +with just enough chaff. The main idea is to reduce the number of chaff points +to a practical level by creating n sub-templates from the original template and +hiding each sub-template with m chaff points. During verification, s closest +vectors to the biometric query are retrieved from each vault and then combined +to generate hash values that are compared with the stored hash value. We +demonstrate the effectiveness of synthetic facial images, generated by a +Generative Adversarial Network (GAN), as ``random chaff points'' within a +secure-vault authorization system. This approach safeguards user identities +during training and deployment. We tested our protocol using the AT&T, GT, and +LFW face datasets, with the ROC areas under the curve being 0.99, 0.99, and +0.90, respectively. These numbers were close to those of the unprotected +templates, showing that our method does not adversely affect accuracy. + +
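+ The multi-vault idea described above can be illustrated in a few lines: split the
+enrolled template into n sub-templates, hide each among m chaff vectors, and at
+verification retrieve the closest vector per vault and compare a hash of the
+retrieved set against the stored hash. The snippet below is a minimal sketch under
+assumed parameters, distance metric, and hashing; it is not the paper's protocol.
+
+```python
+# Minimal sketch of the multi-vault idea: split a template into n sub-templates,
+# hide each among m chaff vectors, and verify via nearest neighbours plus a hash.
+# Parameters, distance metric, and hashing details are illustrative assumptions.
+import hashlib
+import numpy as np
+
+rng = np.random.default_rng(0)
+n_vaults, m_chaff, dim = 4, 50, 32
+
+template = rng.normal(size=(n_vaults, dim))          # n sub-templates (enrolment)
+vaults = []
+for sub in template:
+    chaff = rng.normal(size=(m_chaff, dim))          # chaff points hide the genuine vector
+    vault = np.vstack([sub, chaff])
+    rng.shuffle(vault)
+    vaults.append(vault)
+
+def vault_hash(vectors):
+    return hashlib.sha256(np.round(vectors, 1).tobytes()).hexdigest()
+
+stored_hash = vault_hash(template)                   # stored at enrolment
+
+# Verification: for each vault, keep the vector closest to the query sub-feature
+# (s = 1 here), then compare the hash of the retrieved set with the stored hash.
+query = template + rng.normal(scale=0.01, size=template.shape)  # genuine probe
+retrieved = []
+for vault, q in zip(vaults, query):
+    dists = np.linalg.norm(vault - q, axis=1)
+    retrieved.append(vault[np.argmin(dists)])
+print(vault_hash(np.array(retrieved)) == stored_hash)
+```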
+
+
+
+
+ + ☆ HSViT: Horizontally Scalable Vision Transformer + + +
+ While the Vision Transformer (ViT) architecture gains prominence in computer +vision and attracts significant attention from multimedia communities, its +deficiency in prior knowledge (inductive bias) regarding shift, scale, and +rotational invariance necessitates pre-training on large-scale datasets. +Furthermore, the growing layers and parameters in both ViT and convolutional +neural networks (CNNs) impede their applicability to mobile multimedia +services, primarily owing to the constrained computational resources on edge +devices. To mitigate the aforementioned challenges, this paper introduces a +novel horizontally scalable vision transformer (HSViT). Specifically, a novel +image-level feature embedding allows ViT to better leverage the inductive bias +inherent in the convolutional layers. Based on this, an innovative horizontally +scalable architecture is designed, which reduces the number of layers and +parameters of the models while facilitating collaborative training and +inference of ViT models across multiple nodes. The experimental results depict +that, without pre-training on large-scale datasets, HSViT achieves up to 10% +higher top-1 accuracy than state-of-the-art schemes, ascertaining its superior +preservation of inductive bias. The code is available at +https://github.com/xuchenhao001/HSViT. + +
+
+
+
+
+ + ☆ LGSDF: Continual Global Learning of Signed Distance Fields Aided by + Local Updating + + +
+ Implicit reconstruction of ESDF (Euclidean Signed Distance Field) involves +training a neural network to regress the signed distance from any point to the +nearest obstacle, which has the advantages of lightweight storage and +continuous querying. However, existing algorithms usually rely on conflicting +raw observations as training data, resulting in poor map performance. In this +paper, we propose LGSDF, an ESDF continual Global learning algorithm aided by +Local updating. At the front end, axis-aligned grids are dynamically updated by +pre-processed sensor observations, where incremental fusion alleviates +estimation error caused by limited viewing directions. At the back end, a +randomly initialized implicit ESDF neural network performs continual +self-supervised learning guided by these grids to generate smooth and +continuous maps. The results on multiple scenes show that LGSDF can construct +more accurate ESDF maps and meshes compared with SOTA (State Of The Art) +explicit and implicit mapping algorithms. The source code of LGSDF is publicly +available at https://github.com/BIT-DYN/LGSDF. + +
+
+
+
+
+ + ☆ Progressive Alignment with VLM-LLM Feature to Augment Defect + Classification for the ASE Dataset + + +
+ Traditional defect classification approaches face two barriers. (1) Insufficient
+training data and unstable data quality. Collecting sufficient defective samples is
+expensive and time-consuming, leading to dataset variance and making recognition and
+learning difficult. (2) Over-dependence on the visual modality. When the image
+pattern and texture are monotonic across all defect classes in a given dataset, the
+performance of a conventional AOI system cannot be guaranteed. Likewise, when image
+quality is compromised by mechanical failures or when defect information is
+inherently difficult to discern, the performance of deep models cannot be
+guaranteed. The main question is how to solve these two problems when they occur at
+the same time. A feasible strategy is to exploit additional features within the
+dataset and to combine an eminent vision-language model (VLM) and large language
+model (LLM), leveraging their strong zero-shot capability. In this work, we first
+propose the ASE dataset for defect classification, which includes rich data
+descriptions recorded with each image, although its defect features are difficult to
+learn directly. Second, we present VLM-LLM prompting for defect classification on
+the proposed ASE dataset to activate extra-modality features from images and enhance
+performance. Then, we design a novel progressive feature alignment (PFA) block that
+refines image-text features to alleviate the difficulty of alignment under the
+few-shot scenario. Finally, the proposed cross-modality attention fusion (CMAF)
+module effectively fuses features from different modalities. Experimental results
+demonstrate our method's effectiveness over several defect classification methods on
+the ASE dataset.
+
+ comment: MULA 2024 +
+
+
+
+
+ + ☆ Adaptive Learning for Multi-view Stereo Reconstruction + + +
+ Deep learning has recently demonstrated its excellent performance on the task +of multi-view stereo (MVS). However, loss functions applied for deep MVS are +rarely studied. In this paper, we first analyze existing loss functions' +properties for deep depth based MVS approaches. Regression based loss leads to +inaccurate continuous results by computing mathematical expectation, while +classification based loss outputs discretized depth values. To this end, we +then propose a novel loss function, named adaptive Wasserstein loss, which is +able to narrow down the difference between the true and predicted probability +distributions of depth. Besides, a simple but effective offset module is +introduced to better achieve sub-pixel prediction accuracy. Extensive +experiments on different benchmarks, including DTU, Tanks and Temples and +BlendedMVS, show that the proposed method with the adaptive Wasserstein loss +and the offset module achieves state-of-the-art performance. + +
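+ The core of a Wasserstein-style depth loss is easy to state: for one-dimensional
+distributions over depth hypotheses, the Wasserstein-1 distance equals the L1
+distance between their cumulative distributions. The sketch below implements that
+generic form; the adaptive weighting and the offset module of the paper are not
+reproduced here.
+
+```python
+# Minimal sketch of a 1-D Wasserstein loss between a predicted depth probability
+# distribution (softmax over depth hypotheses) and a target distribution.
+import torch
+import torch.nn.functional as F
+
+def wasserstein_1d(pred_logits, target_probs):
+    """Both tensors: (batch, num_depth_bins); bins are assumed equally spaced."""
+    pred = F.softmax(pred_logits, dim=-1)
+    # W1 between two 1-D distributions equals the L1 distance between their CDFs.
+    cdf_diff = torch.cumsum(pred, dim=-1) - torch.cumsum(target_probs, dim=-1)
+    return cdf_diff.abs().sum(dim=-1).mean()
+
+if __name__ == "__main__":
+    logits = torch.randn(8, 64, requires_grad=True)
+    target = F.one_hot(torch.randint(0, 64, (8,)), 64).float()  # true depth bin
+    loss = wasserstein_1d(logits, target)
+    loss.backward()
+    print(loss.item())
+```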
+
+
+
+
+ + ☆ GloSoFarID: Global multispectral dataset for Solar Farm IDentification + in satellite imagery + + +
+ Solar Photovoltaic (PV) technology is increasingly recognized as a pivotal +solution in the global pursuit of clean and renewable energy. This technology +addresses the urgent need for sustainable energy alternatives by converting +solar power into electricity without greenhouse gas emissions. It not only +curtails global carbon emissions but also reduces reliance on finite, +non-renewable energy sources. In this context, monitoring solar panel farms +becomes essential for understanding and facilitating the worldwide shift toward +clean energy. This study contributes to this effort by developing the first +comprehensive global dataset of multispectral satellite imagery of solar panel +farms. This dataset is intended to form the basis for training robust machine +learning models, which can accurately map and analyze the expansion and +distribution of solar panel farms globally. The insights gained from this +endeavor will be instrumental in guiding informed decision-making for a +sustainable energy future. https://github.com/yzyly1992/GloSoFarID + +
+
+
+
+
+ + ☆ QMix: Quality-aware Learning with Mixed Noise for Robust Retinal Disease + Diagnosis + + +
+ Due to the complexity of medical image acquisition and the difficulty of +annotation, medical image datasets inevitably contain noise. Noisy data with +wrong labels affects the robustness and generalization ability of deep neural +networks. Previous noise learning methods mainly considered noise arising from +images being mislabeled, i.e. label noise, assuming that all mislabeled images +are of high image quality. However, medical images are prone to suffering +extreme quality issues, i.e. data noise, where discriminative visual features +are missing for disease diagnosis. In this paper, we propose a noise learning +framework, termed as QMix, that learns a robust disease diagnosis model under +mixed noise. QMix alternates between sample separation and quality-aware +semisupervised training in each training epoch. In the sample separation phase, +we design a joint uncertainty-loss criterion to effectively separate (1) +correctly labeled images; (2) mislabeled images with high quality and (3) +mislabeled images with low quality. In the semi-supervised training phase, we +train a disease diagnosis model to learn robust feature representation from the +separated samples. Specifically, we devise a sample-reweighing loss to mitigate +the effect of mislabeled images with low quality during training. Meanwhile, a +contrastive enhancement loss is proposed to further distinguish mislabeled +images with low quality from correctly labeled images. QMix achieved +state-of-the-art disease diagnosis performance on five public retinal image +datasets and exhibited substantial improvement on robustness against mixed +noise. + +
+
+
+
+
+ + ☆ Semantic Flow: Learning Semantic Field of Dynamic Scenes from Monocular + Videos ICLR 2024 + + +
+ In this work, we pioneer Semantic Flow, a neural semantic representation of +dynamic scenes from monocular videos. In contrast to previous NeRF methods that +reconstruct dynamic scenes from the colors and volume densities of individual +points, Semantic Flow learns semantics from continuous flows that contain rich +3D motion information. As there is 2D-to-3D ambiguity problem in the viewing +direction when extracting 3D flow features from 2D video frames, we consider +the volume densities as opacity priors that describe the contributions of flow +features to the semantics on the frames. More specifically, we first learn a +flow network to predict flows in the dynamic scene, and propose a flow feature +aggregation module to extract flow features from video frames. Then, we propose +a flow attention module to extract motion information from flow features, which +is followed by a semantic network to output semantic logits of flows. We +integrate the logits with volume densities in the viewing direction to +supervise the flow features with semantic labels on video frames. Experimental +results show that our model is able to learn from multiple dynamic scenes and +supports a series of new tasks such as instance-level scene editing, semantic +completions, dynamic scene tracking and semantic adaption on novel scenes. +Codes are available at https://github.com/tianfr/Semantic-Flow/. + +
+
+ comment: Accepted by ICLR 2024, Codes are available at + https://github.com/tianfr/Semantic-Flow/ +
+
+
+
+
+ + ☆ UniMix: Towards Domain Adaptive and Generalizable LiDAR Semantic + Segmentation in Adverse Weather CVPR 2024 + + +
+ LiDAR semantic segmentation (LSS) is a critical task in autonomous driving and has
+achieved promising progress. However, prior LSS methods are conventionally
+investigated and evaluated on datasets within the same domain in clear weather. The
+robustness of LSS models in unseen scenes and all weather conditions is crucial for
+ensuring safety and reliability in real applications. To this end, we propose
+UniMix, a universal method that enhances the adaptability and generalizability of
+LSS models. UniMix first leverages physically valid adverse weather simulation to
+construct a Bridge Domain, which serves to bridge the domain gap between clear
+weather scenes and adverse weather scenes. Then, a Universal Mixing operator is
+defined over spatial, intensity, and semantic distributions to create an
+intermediate domain with mixed samples from the given domains. Integrating these two
+techniques into a teacher-student framework, UniMix efficiently mitigates the domain
+gap and enables LSS models to learn weather-robust and domain-invariant
+representations. We apply UniMix to two main setups: 1) unsupervised domain
+adaptation, adapting the model from the clear weather source domain to the adverse
+weather target domain; and 2) domain generalization, learning a model that
+generalizes well to unseen scenes in adverse weather. Extensive experiments validate
+the effectiveness of UniMix across different tasks and datasets, in all cases
+achieving superior performance over state-of-the-art methods. The code will be
+released.
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Enhancing Clinical Efficiency through LLM: Discharge Note Generation for + Cardiac Patients + + +
+ Medical documentation, including discharge notes, is crucial for ensuring +patient care quality, continuity, and effective medical communication. However, +the manual creation of these documents is not only time-consuming but also +prone to inconsistencies and potential errors. The automation of this +documentation process using artificial intelligence (AI) represents a promising +area of innovation in healthcare. This study directly addresses the +inefficiencies and inaccuracies in creating discharge notes manually, +particularly for cardiac patients, by employing AI techniques, specifically +large language model (LLM). Utilizing a substantial dataset from a cardiology +center, encompassing wide-ranging medical records and physician assessments, +our research evaluates the capability of LLM to enhance the documentation +process. Among the various models assessed, Mistral-7B distinguished itself by +accurately generating discharge notes that significantly improve both +documentation efficiency and the continuity of care for patients. These notes +underwent rigorous qualitative evaluation by medical expert, receiving high +marks for their clinical relevance, completeness, readability, and contribution +to informed decision-making and care planning. Coupled with quantitative +analyses, these results confirm Mistral-7B's efficacy in distilling complex +medical information into concise, coherent summaries. Overall, our findings +illuminate the considerable promise of specialized LLM, such as Mistral-7B, in +refining healthcare documentation workflows and advancing patient care. This +study lays the groundwork for further integrating advanced AI technologies in +healthcare, demonstrating their potential to revolutionize patient +documentation and support better care outcomes. + +
+
+ comment: 10 pages, 1 figure, 3 tables, conference +
+
+
+
+
+ + ☆ Better Monocular 3D Detectors with LiDAR from the Past ICRA 2022 + + +
+ Accurate 3D object detection is crucial to autonomous driving. Though LiDAR-based
+detectors have achieved impressive performance, the high cost of LiDAR sensors
+precludes their widespread adoption in affordable vehicles. Camera-based detectors
+are cheaper alternatives but often suffer inferior performance compared to their
+LiDAR-based counterparts due to inherent depth ambiguities in images. In this work,
+we seek to improve monocular 3D detectors by leveraging unlabeled historical LiDAR
+data. Specifically, at inference time, we assume that the camera-based detectors
+have access to multiple unlabeled LiDAR scans from past traversals at locations of
+interest (potentially from other high-end vehicles equipped with LiDAR sensors).
+Under this setup, we propose a novel, simple, and end-to-end trainable framework,
+termed AsyncDepth, to effectively extract relevant features from asynchronous LiDAR
+traversals of the same location for monocular 3D detectors. We show consistent and
+significant performance gains (up to 9 AP) across multiple state-of-the-art models
+and datasets with a negligible additional latency of 9.66 ms and a small storage
+cost.
+
+ comment: Accepted by ICRA 2022. The code can be found at + https://github.com/YurongYou/AsyncDepth +
+
+
+
+
+ + ☆ Self-Supervised Multi-Object Tracking with Path Consistency CVPR 2024 + + +
+ In this paper, we propose the novel concept of path consistency to learn robust
+object matching without using manual object identity supervision. Our key idea is
+that, to track an object through frames, we can obtain multiple different
+association results from a model by varying the frames it can observe, i.e., by
+skipping frames in observation. As the differences in observations do not alter the
+identities of objects, the obtained association results should be consistent. Based
+on this rationale, we generate multiple observation paths, each specifying a
+different set of frames to be skipped, and formulate the Path Consistency Loss,
+which enforces that the association results are consistent across different
+observation paths. We use the proposed loss to train our object matching model with
+only self-supervision. Through extensive experiments on three tracking datasets
+(MOT17, PersonPath22, KITTI), we demonstrate that our method outperforms existing
+unsupervised methods by consistent margins on various evaluation metrics, and even
+achieves performance close to supervised methods.
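+ The path-consistency idea can be sketched as follows: compute association
+(matching) distributions from two observation paths that skip different frames and
+penalize their disagreement. The symmetric-KL form and the toy feature setup below
+are illustrative assumptions, not the paper's exact loss.
+
+```python
+# Minimal sketch of a path-consistency style loss: association distributions
+# obtained from two observation paths (different skipped frames) should agree.
+# The symmetric-KL form below is an illustrative choice, not the paper's exact loss.
+import torch
+import torch.nn.functional as F
+
+def association_probs(query_feats, gallery_feats, temperature=0.1):
+    """Softmax over similarities: each query object distributes mass over gallery objects."""
+    sim = query_feats @ gallery_feats.t() / temperature
+    return F.softmax(sim, dim=-1)
+
+def path_consistency(probs_path_a, probs_path_b, eps=1e-8):
+    kl_ab = (probs_path_a * (probs_path_a.add(eps).log() - probs_path_b.add(eps).log())).sum(-1)
+    kl_ba = (probs_path_b * (probs_path_b.add(eps).log() - probs_path_a.add(eps).log())).sum(-1)
+    return 0.5 * (kl_ab + kl_ba).mean()
+
+if __name__ == "__main__":
+    q = F.normalize(torch.randn(5, 128), dim=-1)     # objects in frame t
+    g = F.normalize(torch.randn(7, 128), dim=-1)     # objects in frame t+k
+    p_a = association_probs(q, g)                    # path that observes every frame
+    p_b = association_probs(q + 0.05 * torch.randn_like(q), g)  # path with skipped frames
+    print(path_consistency(p_a, p_b).item())
+```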
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Image-based Agarwood Resinous Area Segmentation using Deep Learning + + +
+ The manual extraction method of Agarwood resinous compound is laborious work, +requires skilled workers, and is subject to human errors. Commercial Agarwood +industries have been actively exploring using Computer Numerical Control (CNC) +machines to replace human effort for this particular task. The CNC machine +accepts a G-code script produced from a binary image in which the wood region +that needs to be chiselled off is marked with (0, 0, 0) as its RGB value. +Rather than requiring a human expert to perform the region marking, we propose +using a Deep learning image segmentation method instead. Our setup involves a +camera that captures the cross-section image and then passes the image file to +a computer. The computer performs the automated image segmentation and feeds +the CNC machine with a G-code script. In this article, we report the initial +segmentation results achieved using a state-of-the-art Deep learning +segmentation method and discuss potential improvements to refine the +segmentation accuracy. + +
+
+ comment: 15 pages, 6 figures, 3 tables +
+
+
+
+
+ + ☆ Improving Deep Learning Predictions with Simulated Images, and Vice + Versa + + +
+ Artificial neural networks are often used to identify features of crop +plants. However, training their models requires many annotated images, which +can be expensive and time-consuming to acquire. Procedural models of plants, +such as those developed with Lindenmayer-systems (L-systems) can be created to +produce visually realistic simulations, and hence images of plant simulations, +where annotations are implicitly known. These synthetic images can either +augment or completely replace real images in training neural networks for +phenotyping tasks. In this paper, we systematically vary amounts of real and +synthetic images used for training in both maize and canola to better +understand situations where synthetic images generated from L-systems can help +prediction on real images. This work also explores the degree to which realism +in the synthetic images improves prediction. Furthermore, we see how neural +network predictions can be used to help calibrate L-systems themselves, +creating a feedback loop. + +
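+ For readers unfamiliar with L-systems, the procedural models mentioned above boil
+down to iterative string rewriting that a turtle-graphics renderer can turn into
+plant images with implicitly known annotations. The sketch below uses a classic
+textbook branching grammar, not the calibrated crop models from the paper.
+
+```python
+# Minimal L-system sketch: iterative string rewriting with a classic branching
+# grammar. The rules here are a textbook example, not the paper's crop models.
+def lsystem(axiom, rules, iterations):
+    s = axiom
+    for _ in range(iterations):
+        s = "".join(rules.get(ch, ch) for ch in s)
+    return s
+
+if __name__ == "__main__":
+    # 'F' = draw forward, '+'/'-' = turn, '[' / ']' = push/pop turtle state.
+    rules = {"F": "F[+F]F[-F]F"}
+    derivation = lsystem("F", rules, 2)
+    print(derivation)  # the string a turtle-graphics renderer would interpret
+```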
+
+
+
+
+ + ☆ Class Similarity Transition: Decoupling Class Similarities and Imbalance + from Generalized Few-shot Segmentation + + +
+ In Generalized Few-shot Segmentation (GFSS), a model is trained with a large
+corpus of base class samples and then adapted on limited samples of novel classes.
+This paper focuses on the relevance between base and novel classes, and improves
+GFSS in two aspects: 1) mining the similarity between base and novel classes to
+promote the learning of novel classes, and 2) mitigating the class imbalance issue
+caused by the volume difference between the support set and the training set.
+Specifically, we first propose a similarity transition matrix to guide the learning
+of novel classes with base class knowledge. Then, we apply the
+Label-Distribution-Aware Margin (LDAM) loss and transductive inference to the GFSS
+task to address the problem of class imbalance as well as overfitting to the support
+set. In addition, by extending the probability transition matrix, the proposed
+method can mitigate the catastrophic forgetting of base classes when learning novel
+classes. With a simple training phase, our proposed method can be applied to any
+segmentation network trained on base classes. We validate our method on the adapted
+version of OpenEarthMap. Compared to existing GFSS baselines, our method outperforms
+them all by 3% to 7% and ranks second in the OpenEarthMap Land Cover Mapping
+Few-Shot Challenge as of the completion of this paper. Code:
+https://github.com/earth-insights/ClassTrans
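+ The LDAM loss referenced above assigns larger margins to rarer classes, with
+per-class margins proportional to n_c^(-1/4) subtracted from the true-class logit
+before cross-entropy. The sketch below implements that standard formulation; the
+similarity transition matrix and transductive inference of the paper are not
+reproduced.
+
+```python
+# Minimal sketch of the LDAM loss: per-class margins proportional to n_c^(-1/4),
+# subtracted from the true-class logit before cross-entropy.
+import torch
+import torch.nn.functional as F
+
+def ldam_loss(logits, targets, class_counts, max_margin=0.5, scale=30.0):
+    margins = class_counts.float().pow(-0.25)
+    margins = margins * (max_margin / margins.max())        # largest margin = max_margin
+    batch_margins = margins[targets]                        # (batch,)
+    one_hot = F.one_hot(targets, logits.size(1)).float()
+    adjusted = logits - one_hot * batch_margins.unsqueeze(1)  # subtract margin at true class
+    return F.cross_entropy(scale * adjusted, targets)
+
+if __name__ == "__main__":
+    counts = torch.tensor([5000, 500, 50])                  # imbalanced class sizes
+    logits = torch.randn(16, 3, requires_grad=True)
+    targets = torch.randint(0, 3, (16,))
+    loss = ldam_loss(logits, targets, counts)
+    loss.backward()
+    print(loss.item())
+```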
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ TabConv: Low-Computation CNN Inference via Table Lookups + + +
+ Convolutional Neural Networks (CNNs) have demonstrated remarkable ability +throughout the field of computer vision. However, CNN inference requires a +large number of arithmetic operations, making them expensive to deploy in +hardware. Current approaches alleviate this issue by developing +hardware-supported, algorithmic processes to simplify spatial convolution +functions. However, these methods still heavily rely on matrix multiplication, +leading to significant computational overhead. To bridge the gap between +hardware, algorithmic acceleration, and approximate matrix multiplication, we +propose TabConv, a novel, table-based approximation for convolution to +significantly reduce arithmetic operations during inference. Additionally, we +introduce a priority masking technique based on cosine similarity to select +layers for table-based approximation, thereby maintaining the model +performance. We evaluate our approach on popular CNNs: ResNet-18, ResNet-34, +and NetworkInNetwork (NIN). TabConv preserves over 93% of the original model's +performance while reducing arithmetic operations by 36.5%, 25.8%, and 99.4% for +ResNet-18 on CIFAR-10, CIFAR-100, and MNIST, respectively, 35.6% and 99.3% for +ResNet-34 on CIFAR-10 and MNIST, and 98.9% for NIN on MNIST, achieving +low-computation inference. + +
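+ The cosine-similarity-based priority masking can be illustrated with a simple
+selection routine: approximate a layer with table lookups only if the approximated
+output stays close to the exact output. The threshold and the stand-in
+"approximation" in the sketch below are illustrative assumptions, not the paper's
+procedure.
+
+```python
+# Minimal sketch of cosine-similarity-based layer selection: approximate a layer
+# with table lookups only if its approximated output stays close to the exact one.
+import torch
+import torch.nn.functional as F
+
+def cosine_layer_score(exact_out, approx_out):
+    return F.cosine_similarity(exact_out.flatten(1), approx_out.flatten(1), dim=1).mean()
+
+def select_layers(exact_outputs, approx_outputs, threshold=0.95):
+    """Return indices of layers whose table-based approximation is accurate enough."""
+    keep = []
+    for i, (e, a) in enumerate(zip(exact_outputs, approx_outputs)):
+        if cosine_layer_score(e, a) >= threshold:
+            keep.append(i)
+    return keep
+
+if __name__ == "__main__":
+    exact = [torch.randn(4, 16, 8, 8) for _ in range(3)]
+    # Stand-in for table-lookup outputs: exact outputs plus varying amounts of noise.
+    approx = [e + 0.05 * i * torch.randn_like(e) for i, e in enumerate(exact)]
+    print(select_layers(exact, approx))
+```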
+
+ comment: 8 pages, Accepted at CF '24 +
+
+
+
+
+ + ☆ Towards Improved Semiconductor Defect Inspection for high-NA EUVL based + on SEMI-SuperYOLO-NAS + + +
+ Due to potential pitch reduction, the semiconductor industry is adopting High-NA
+EUVL technology. However, its low depth of focus presents challenges for High Volume
+Manufacturing. To address this, suppliers are exploring thinner photoresists and new
+underlayers/hardmasks. These may suffer from poor SNR, complicating defect
+detection. Vision-based ML algorithms offer a promising solution for semiconductor
+defect inspection. However, developing a robust ML model that works across various
+image resolutions without explicit training remains a challenge for nano-scale
+defect inspection. The goal of this research is to propose a scale-invariant ADCD
+framework capable of upscaling images to address this issue. We propose an improved
+ADCD framework, SEMI-SuperYOLO-NAS, which builds upon the baseline YOLO-NAS
+architecture. This framework integrates an SR-assisted branch that helps the defect
+detection backbone learn HR features, particularly for detecting nano-scale defect
+instances from LR images. Additionally, the SR-assisted branch can recursively
+generate upscaled images from their corresponding downscaled counterparts, enabling
+defect detection inference across various image resolutions without requiring
+explicit training. Moreover, we investigate an improved data augmentation strategy
+aimed at generating diverse and realistic training datasets to enhance model
+performance. We have evaluated our proposed approach using two original FAB datasets
+obtained from two distinct processes and captured using two different imaging tools.
+Finally, we demonstrate zero-shot inference for our model on a new dataset
+originating from a process condition distinct from that of the training dataset and
+possessing different pitch characteristics. Experimental validation demonstrates
+that our proposed ADCD framework helps increase the throughput of imaging tools for
+defect inspection by reducing the required image pixel resolutions.
+
+
+
+
+ + ☆ Localizing Moments of Actions in Untrimmed Videos of Infants with Autism + Spectrum Disorder + + +
+ Autism Spectrum Disorder (ASD) presents significant challenges in early diagnosis
+and intervention, impacting children and their families. With prevalence rates
+rising, there is a critical need for accessible and efficient screening tools.
+Leveraging machine learning (ML) techniques, in particular Temporal Action
+Localization (TAL), holds promise for automating ASD screening. This paper
+introduces a self-attention based TAL model designed to identify ASD-related
+behaviors in infant videos. Unlike existing methods, our approach simplifies complex
+modeling and emphasizes efficiency, which is essential for practical deployment in
+real-world scenarios. Importantly, this work underscores the importance of
+developing computer vision methods capable of operating in naturalistic environments
+with little equipment control, addressing key challenges in ASD screening. This
+study is the first to conduct end-to-end temporal action localization in untrimmed
+videos of infants with ASD, offering promising avenues for early intervention and
+support. We report baseline results of behavior detection using our TAL model,
+achieving 70% accuracy for "look face", 79% for "look object", 72% for "smile", and
+65% for "vocalization".
+
+ comment: 7 pages, 2 figures, 3 tables +
+
+
+
+
+ + ☆ Privacy-Preserving Deep Learning Using Deformable Operators for Secure + Task Learning + + +
+ In the era of cloud computing and data-driven applications, it is crucial to +protect sensitive information to maintain data privacy, ensuring truly reliable +systems. As a result, preserving privacy in deep learning systems has become a +critical concern. Existing methods for privacy preservation rely on image +encryption or perceptual transformation approaches. However, they often suffer +from reduced task performance and high computational costs. To address these +challenges, we propose a novel Privacy-Preserving framework that uses a set of +deformable operators for secure task learning. Our method involves shuffling +pixels during the analog-to-digital conversion process to generate visually +protected data. Those are then fed into a well-known network enhanced with +deformable operators. Using our approach, users can achieve equivalent +performance to original images without additional training using a secret key. +Moreover, our method enables access control against unauthorized users. +Experimental results demonstrate the efficacy of our approach, showcasing its +potential in cloud-based scenarios and privacy-sensitive applications. + +
+
+ comment: copyright 2024 IEEE. Personal use of this material is permitted. + Permission from IEEE must be obtained for all other uses, in any current or + future media, including reprinting/republishing this material for advertising + or promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works +
+
+
+
+
+ + ☆ Towards Explainable Automated Neuroanatomy + + +
+ We present a novel method for quantifying the microscopic structure of brain
+tissue. It is based on the automated recognition of interpretable features obtained
+by analyzing the shapes of cells. This contrasts with prevailing methods of brain
+anatomical analysis in two ways. First, contemporary methods use gray-scale values
+derived from smoothed versions of the anatomical images, which discards valuable
+information contained in the texture of the images. Second, contemporary analysis
+uses the output of black-box Convolutional Neural Networks, whereas our system makes
+decisions based on interpretable features obtained by analyzing the shapes of
+individual cells. An important benefit of this open-box approach is that the
+anatomist can understand and correct the decisions made by the computer. Our
+proposed system can accurately localize and identify existing brain structures. This
+can be used to align and co-register brains and will facilitate connectomic studies
+for reverse engineering of brain circuitry.
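+ The kind of interpretable cell-shape features described above can be extracted
+with off-the-shelf tools; the sketch below uses scikit-image region properties on a
+binarized image. The thresholding step and the particular feature list are
+illustrative choices, not the paper's pipeline.
+
+```python
+# Minimal sketch: interpretable per-cell shape features from a segmented image
+# using scikit-image. Thresholding and the feature list are illustrative choices.
+import numpy as np
+from skimage.filters import threshold_otsu
+from skimage.measure import label, regionprops
+
+def cell_shape_features(gray_image):
+    mask = gray_image > threshold_otsu(gray_image)      # binarize cell bodies
+    labeled = label(mask)
+    feats = []
+    for region in regionprops(labeled):
+        feats.append({
+            "area": region.area,
+            "eccentricity": region.eccentricity,
+            "solidity": region.solidity,
+            "perimeter": region.perimeter,
+        })
+    return feats
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    img = rng.random((128, 128))
+    img[30:50, 40:70] += 2.0                             # a bright blob standing in for a cell
+    print(cell_shape_features(img)[:1])
+```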
+
+
+
+
+ + ☆ BatSort: Enhanced Battery Classification with Transfer Learning for + Battery Sorting and Recycling + + +
+ Battery recycling is a critical process for minimizing the environmental harm and
+resource waste of used batteries. However, it is challenging, largely because
+sorting batteries by type is costly and rarely automated. In this paper, we
+introduce a machine learning-based approach for battery-type classification and
+address the daunting problem of data scarcity for this application. We propose
+BatSort, which applies transfer learning to utilize existing knowledge optimized on
+large-scale datasets and customizes ResNet to be specialized for classifying battery
+types. We collected a small-scale in-house battery-type dataset to guide the
+knowledge transfer as a case study and to evaluate the system performance. We
+conducted an experimental study, and the results show that BatSort achieves an
+outstanding accuracy of 92.1% on average and up to 96.2%, with stable performance
+for battery-type classification. Our solution helps realize fast and automated
+battery sorting at minimized cost and can be transferred to related industry
+applications with insufficient data.
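+ The transfer-learning recipe behind BatSort can be illustrated with a standard
+torchvision pattern: start from an ImageNet-pretrained ResNet, freeze the backbone,
+and retrain only a new classification head for the battery classes. The class count,
+hyperparameters, and the torchvision >= 0.13 weights API used below are assumptions,
+not the paper's exact configuration.
+
+```python
+# Minimal sketch of the transfer-learning recipe: ImageNet-pretrained ResNet with a
+# frozen backbone and a new trainable head for battery-type classes.
+import torch
+import torch.nn as nn
+from torchvision import models
+
+NUM_BATTERY_TYPES = 8                                   # assumed number of classes
+
+model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
+for param in model.parameters():
+    param.requires_grad = False                         # keep pretrained features fixed
+model.fc = nn.Linear(model.fc.in_features, NUM_BATTERY_TYPES)  # new head (trainable)
+
+optimizer = torch.optim.Adam(model.fc.parameters(), lr=1e-3)
+criterion = nn.CrossEntropyLoss()
+
+# One illustrative training step on a dummy batch.
+images = torch.randn(4, 3, 224, 224)
+labels = torch.randint(0, NUM_BATTERY_TYPES, (4,))
+optimizer.zero_grad()
+loss = criterion(model(images), labels)
+loss.backward()
+optimizer.step()
+print(loss.item())
+```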
+
+
+
+
+ + ☆ Responsible Generative AI: What to Generate and What Not + + +
+ In recent years, generative AI (GenAI), like large language models and
+text-to-image models, has received significant attention across various domains.
+However, ensuring the responsible generation of content by these models is crucial
+for their real-world applicability. This raises an interesting question:
+\textit{What should responsible GenAI generate, and what should it not?} To answer
+this question, this paper investigates the practical responsibility requirements of
+both textual and visual generative models, outlining five key considerations:
+generating truthful content, avoiding toxic content, refusing harmful instructions,
+leaking no training-data-related content, and ensuring that generated content is
+identifiable. Specifically, we review recent advancements and challenges in
+addressing these requirements. In addition, we discuss and emphasize the importance
+of responsible GenAI across the healthcare, education, finance, and artificial
+general intelligence domains. Through a unified perspective on both textual and
+visual generative models, this paper aims to provide insights into practical
+safety-related issues and further benefit the community in building responsible
+GenAI.
+
+ comment: 74 pages, 10 figures +
+
+
+
+
+ + ☆ Forecasting Electric Vehicle Battery Output Voltage: A Predictive + Modeling Approach + + +
+ The battery management system plays a vital role in ensuring the safety and +dependability of electric and hybrid vehicles. It is responsible for various +functions, including state evaluation, monitoring, charge control, and cell +balancing, all integrated within the BMS. Nonetheless, due to the uncertainties +surrounding battery performance, implementing these functionalities poses +significant challenges. In this study, we explore the latest approaches for +assessing battery states, highlight notable advancements in battery management +systems (BMS), address existing issues with current BMS technology, and put +forth possible solutions for predicting battery charging voltage. + +
+
+
+
+
+ + ☆ Lightweight Deep Learning for Resource-Constrained Environments: A + Survey + + +
+ Over the past decade, the dominance of deep learning has prevailed across +various domains of artificial intelligence, including natural language +processing, computer vision, and biomedical signal processing. While there have +been remarkable improvements in model accuracy, deploying these models on +lightweight devices, such as mobile phones and microcontrollers, is constrained +by limited resources. In this survey, we provide comprehensive design guidance +tailored for these devices, detailing the meticulous design of lightweight +models, compression methods, and hardware acceleration strategies. The +principal goal of this work is to explore methods and concepts for getting +around hardware constraints without compromising the model's accuracy. +Additionally, we explore two notable paths for lightweight deep learning in the +future: deployment techniques for TinyML and Large Language Models. Although +these paths undoubtedly have potential, they also present significant +challenges, encouraging research into unexplored areas. + +
+
+ comment: 40 pages +
+
+
+
+
+ + ♻ ☆ Energy-Calibrated VAE with Test Time Free Lunch + + +
+ In this paper, we propose a novel generative model that utilizes a +conditional Energy-Based Model (EBM) for enhancing Variational Autoencoder +(VAE), termed Energy-Calibrated VAE (EC-VAE). Specifically, VAEs often suffer +from blurry generated samples due to the lack of a tailored training on the +samples generated in the generative direction. On the other hand, EBMs can +generate high-quality samples but require expensive Markov Chain Monte Carlo +(MCMC) sampling. To address these issues, we introduce a conditional EBM for +calibrating the generative direction of VAE during training, without requiring +it for the generation at test time. In particular, we train EC-VAE upon both +the input data and the calibrated samples with adaptive weight to enhance +efficacy while avoiding MCMC sampling at test time. Furthermore, we extend the +calibration idea of EC-VAE to variational learning and normalizing flows, and +apply EC-VAE to an additional application of zero-shot image restoration via +neural transport prior and range-null theory. We evaluate the proposed method +with two applications, including image generation and zero-shot image +restoration, and the experimental results show that our method achieves +competitive performance over single-step non-adversarial generation. Our code +is available at https://github.com/DJ-LYH/EC-VAE. + +
+
+ comment: Revision. Code is available at https://github.com/DJ-LYH/EC-VAE +
+
+
+
+
+ + ♻ ☆ Deep Internal Learning: Deep Learning from a Single Input + + +
+ Deep learning, in general, focuses on training a neural network from large +labeled datasets. Yet, in many cases there is value in training a network just +from the input at hand. This is particularly relevant in many signal and image +processing problems where training data is scarce and diversity is large on the +one hand, and on the other, there is a lot of structure in the data that can be +exploited. Using this information is the key to deep internal-learning +strategies, which may involve training a network from scratch using a single +input or adapting an already trained network to a provided input example at +inference time. This survey paper aims at covering deep internal-learning +techniques that have been proposed in the past few years for these two +important directions. While our main focus will be on image processing +problems, most of the approaches that we survey are derived for general signals +(vectors with recurring patterns that can be distinguished from noise) and are +therefore applicable to other modalities. + +
+
+ comment: Accepted to IEEE Signal Processing Magazine +
+
+
+
+
+ + ♻ ☆ FreGS: 3D Gaussian Splatting with Progressive Frequency Regularization CVPR 2024 + + +
+ 3D Gaussian splatting has achieved very impressive performance in real-time +novel view synthesis. However, it often suffers from over-reconstruction during +Gaussian densification where high-variance image regions are covered by a few +large Gaussians only, leading to blur and artifacts in the rendered images. We +design a progressive frequency regularization (FreGS) technique to tackle the +over-reconstruction issue within the frequency space. Specifically, FreGS +performs coarse-to-fine Gaussian densification by exploiting low-to-high +frequency components that can be easily extracted with low-pass and high-pass +filters in the Fourier space. By minimizing the discrepancy between the +frequency spectrum of the rendered image and the corresponding ground truth, it +achieves high-quality Gaussian densification and alleviates the +over-reconstruction of Gaussian splatting effectively. Experiments over +multiple widely adopted benchmarks (e.g., Mip-NeRF360, Tanks-and-Temples and +Deep Blending) show that FreGS achieves superior novel view synthesis and +outperforms the state-of-the-art consistently. + +
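+ The frequency-space regularization can be sketched with an FFT-based band mask and
+an amplitude-spectrum discrepancy between a rendered image and its ground truth. The
+snippet below shows that generic form; the progressive low-to-high schedule and
+exact formulation of FreGS are not reproduced.
+
+```python
+# Minimal sketch: low-/high-frequency band selection via an FFT mask and an
+# amplitude-spectrum discrepancy between a rendered and a ground-truth image.
+import torch
+
+def lowpass_mask(h, w, radius):
+    yy, xx = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
+    cy, cx = h // 2, w // 2
+    return (((yy - cy) ** 2 + (xx - cx) ** 2).float().sqrt() <= radius).float()
+
+def spectrum_loss(rendered, target, radius=16, low_frequency=True):
+    """rendered, target: (batch, channels, H, W). Compare amplitude spectra in one band."""
+    h, w = rendered.shape[-2:]
+    mask = lowpass_mask(h, w, radius).to(rendered.device)
+    if not low_frequency:
+        mask = 1.0 - mask
+    amp_r = torch.fft.fftshift(torch.fft.fft2(rendered), dim=(-2, -1)).abs()
+    amp_t = torch.fft.fftshift(torch.fft.fft2(target), dim=(-2, -1)).abs()
+    return ((amp_r - amp_t).abs() * mask).mean()
+
+if __name__ == "__main__":
+    render = torch.rand(1, 3, 64, 64, requires_grad=True)
+    gt = torch.rand(1, 3, 64, 64)
+    loss = spectrum_loss(render, gt, radius=8, low_frequency=True)
+    loss.backward()
+    print(loss.item())
+```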
+
+ comment: Accepted by CVPR 2024. Project website: + https://rogeraigc.github.io/FreGS-Page/ +
+
+
+
+
+ + ♻ ☆ WEEP: A method for spatial interpretation of weakly supervised CNN + models in computational pathology + + +
+ Deep learning enables the modelling of high-resolution histopathology +whole-slide images (WSI). Weakly supervised learning of tile-level data is +typically applied for tasks where labels only exist on the patient or WSI level +(e.g. patient outcomes or histological grading). In this context, there is a +need for improved spatial interpretability of predictions from such models. We +propose a novel method, Wsi rEgion sElection aPproach (WEEP), for model +interpretation. It provides a principled yet straightforward way to establish +the spatial area of WSI required for assigning a particular prediction label. +We demonstrate WEEP on a binary classification task in the area of breast +cancer computational pathology. WEEP is easy to implement, is directly +connected to the model-based decision process, and offers information relevant +to both research and diagnostic applications. + +
+
+
+
+
+ + ♻ ☆ Deep Feature Statistics Mapping for Generalized Screen Content Image + Quality Assessment + + +
+ The statistical regularities of natural images, referred to as natural scene +statistics, play an important role in no-reference image quality assessment. +However, it has been widely acknowledged that screen content images (SCIs), +which are typically computer generated, do not hold such statistics. Here we +make the first attempt to learn the statistics of SCIs, based upon which the +quality of SCIs can be effectively determined. The underlying mechanism of the +proposed approach is based upon the mild assumption that the SCIs, which are +not physically acquired, still obey certain statistics that could be understood +in a learning fashion. We empirically show that the statistics deviation could +be effectively leveraged in quality assessment, and the proposed method is +superior when evaluated in different settings. Extensive experimental results +demonstrate the Deep Feature Statistics based SCI Quality Assessment (DFSS-IQA) +model delivers promising performance compared with existing NR-IQA models and +shows a high generalization capability in the cross-dataset settings. The +implementation of our method is publicly available at +https://github.com/Baoliang93/DFSS-IQA. + +
+
+
+
+
+ + ♻ ☆ Towards Domain-agnostic Depth Completion + + +
+ Existing depth completion methods are often targeted at a specific sparse +depth type and generalize poorly across task domains. We present a method to +complete sparse/semi-dense, noisy, and potentially low-resolution depth maps +obtained by various range sensors, including those in modern mobile phones, or +by multi-view reconstruction algorithms. Our method leverages a data-driven +prior in the form of a single image depth prediction network trained on +large-scale datasets, the output of which is used as an input to our model. We +propose an effective training scheme where we simulate various sparsity +patterns in typical task domains. In addition, we design two new benchmarks to +evaluate the generalizability and the robustness of depth completion methods. +Our simple method shows superior cross-domain generalization ability against +state-of-the-art depth completion methods, introducing a practical solution to +high-quality depth capture on a mobile device. The code is available at: +https://github.com/YvanYin/FillDepth. + +
+
+
+
+
+ + ♻ ☆ Intention-Conditioned Long-Term Human Egocentric Action Forecasting CVPR + + +
+ To anticipate how a human will act in the future, it is essential to understand
+the human intention, since it guides the human towards a certain goal. In this
+paper, we propose a hierarchical architecture which assumes that a sequence of human
+actions (low-level) can be derived from the human intention (high-level). Based on
+this, we address the Long-Term Action Anticipation task in egocentric videos. Our
+framework first extracts two levels of human information from the N observed videos
+of human actions through a Hierarchical Multi-task MLP Mixer (H3M). Then, we handle
+the uncertainty of the future through an Intention-Conditioned Variational
+Auto-Encoder (I-CVAE) that generates K stable predictions of the next Z=20 actions
+that the observed human might perform. By leveraging human intention as high-level
+information, we claim that our model is able to anticipate more time-consistent
+actions in the long term, thus improving the results over baseline methods in the
+EGO4D Challenge. This work ranked first in both the CVPR@2022 and ECCV@2022 EGO4D
+LTA Challenges by providing more plausible anticipated sequences and improving the
+anticipation of nouns and overall actions. Webpage: https://evm7.github.io/icvae-page/
+
+ comment: Winner of CVPR@2022 and ECCV@2022 EGO4D LTA Challenge. Accepted in + WACV2023. Webpage: https://evm7.github.io/icvae-page/ +
+
+
+
+
+ + ♻ ☆ Robust Human Motion Forecasting using Transformer-based Model IROS2022 + + +
+ Comprehending human motion is a fundamental challenge for developing Human-Robot
+Collaborative applications. Computer vision researchers have addressed this field by
+focusing only on reducing prediction error, without taking into account the
+requirements for facilitating its implementation in robots. In this paper, we
+propose a new Transformer-based model that simultaneously handles real-time 3D human
+motion forecasting in the short and long term. Our 2-Channel Transformer (2CH-TR) is
+able to efficiently exploit the spatio-temporal information of a short observed
+sequence (400 ms) and achieves accuracy competitive with the current
+state-of-the-art. 2CH-TR stands out for the efficient performance of the
+Transformer, being lighter and faster than its competitors. In addition, our model
+is tested in conditions where the human motion is severely occluded, demonstrating
+its robustness in reconstructing and predicting 3D human motion in a highly noisy
+environment. Our experimental results show that the proposed 2CH-TR outperforms the
+ST-Transformer, another state-of-the-art Transformer-based model, in terms of
+reconstruction and prediction under the same input-prefix conditions. Our model
+reduces the mean squared error of the ST-Transformer by 8.89% in short-term
+prediction and by 2.57% in long-term prediction on the Human3.6M dataset with a
+400 ms input prefix. Webpage: https://evm7.github.io/2CHTR-page/
+
+ comment: Accepted to IROS2022. Webpage: https://evm7.github.io/2CHTR-page/ +
+
+
+
+
+ + ♻ ☆ A Unified Masked Autoencoder with Patchified Skeletons for Motion + Synthesis AAAI2024 + + +
+ The synthesis of human motion has traditionally been addressed through +task-dependent models that focus on specific challenges, such as predicting +future motions or filling in intermediate poses conditioned on known key-poses. +In this paper, we present a novel task-independent model called UNIMASK-M, +which can effectively address these challenges using a unified architecture. +Our model obtains comparable or better performance than the state-of-the-art in +each field. Inspired by Vision Transformers (ViTs), our UNIMASK-M model +decomposes a human pose into body parts to leverage the spatio-temporal +relationships existing in human motion. Moreover, we reformulate various +pose-conditioned motion synthesis tasks as a reconstruction problem with +different masking patterns given as input. By explicitly informing our model +about the masked joints, our UNIMASK-M becomes more robust to occlusions. +Experimental results show that our model successfully forecasts human motion on +the Human3.6M dataset. Moreover, it achieves state-of-the-art results in motion +inbetweening on the LaFAN1 dataset, particularly in long transition periods. +More information can be found on the project website +https://evm7.github.io/UNIMASKM-page/ + +
+
+ comment: Accepted to AAAI2024. Webpage: https://evm7.github.io/UNIMASKM-page/ +
+
+
+
+
+ + ♻ ☆ HOI4ABOT: Human-Object Interaction Anticipation for Human Intention + Reading Collaborative roBOTs + + +
+ Robots are becoming increasingly integrated into our lives, assisting us in +various tasks. To ensure effective collaboration between humans and robots, it +is essential that they understand our intentions and anticipate our actions. In +this paper, we propose a Human-Object Interaction (HOI) anticipation framework +for collaborative robots. We propose an efficient and robust transformer-based +model to detect and anticipate HOIs from videos. This enhanced anticipation +empowers robots to proactively assist humans, resulting in more efficient and +intuitive collaborations. Our model outperforms state-of-the-art results in HOI +detection and anticipation in VidHOI dataset with an increase of 1.76% and +1.04% in mAP respectively while being 15.4 times faster. We showcase the +effectiveness of our approach through experimental results in a real robot, +demonstrating that the robot's ability to anticipate HOIs is key for better +Human-Robot Interaction. More information can be found on our project webpage: +https://evm7.github.io/HOI4ABOT_page/ + +
+
+ comment: Proceedings in Conference on Robot Learning 2023. Webpage: + https://evm7.github.io/HOI4ABOT_page/ +
+
+
+
+
+ + ♻ ☆ Robot Interaction Behavior Generation based on Social Motion Forecasting + for Human-Robot Interaction ICRA 2024 + + +
+ Integrating robots into populated environments is a complex challenge that +requires an understanding of human social dynamics. In this work, we propose to +model social motion forecasting in a shared human-robot representation space, +which facilitates us to synthesize robot motions that interact with humans in +social scenarios despite not observing any robot in the motion training. We +develop a transformer-based architecture called ECHO, which operates in the +aforementioned shared space to predict the future motions of the agents +encountered in social scenarios. Contrary to prior works, we reformulate the +social motion problem as the refinement of the predicted individual motions +based on the surrounding agents, which facilitates the training while allowing +for single-motion forecasting when only one human is in the scene. We evaluate +our model in multi-person and human-robot motion forecasting tasks and obtain +state-of-the-art performance by a large margin while being efficient and +performing in real-time. Additionally, our qualitative results showcase the +effectiveness of our approach in generating human-robot interaction behaviors +that can be controlled via text commands. Webpage: https://evm7.github.io/ECHO/ + +
+
+ comment: Accepted at ICRA 2024. Webpage: https://evm7.github.io/ECHO/ +
+
+
+
+
+ + ♻ ☆ DRCT: Saving Image Super-resolution away from Information Bottleneck + + +
+ In recent years, Vision Transformer-based applications to low-level vision +tasks have achieved widespread success. Unlike CNN-based models, Transformers +are more adept at capturing long-range dependencies, enabling the +reconstruction of images utilizing information from non-local areas. In the +domain of super-resolution, Swin-transformer-based approaches have become +mainstream due to their capacity to capture global spatial information and +their shifting-window attention mechanism that facilitates the interchange of +information between different windows. Many researchers have enhanced image +quality and network efficiency by expanding the receptive field or designing +complex networks, yielding commendable results. However, we observed that +spatial information tends to diminish during the forward propagation process +as depth increases, consequently limiting the model's potential. To address +this, we propose the Dense-residual-connected Transformer (DRCT), aimed at +mitigating the loss of spatial information through dense-residual connections +between layers, thereby unleashing the model's potential and enhancing +performance. Experimental results indicate that our approach is not only +straightforward but also achieves remarkable efficiency, surpassing +state-of-the-art methods and performing commendably at NTIRE2024. + +
+
+ comment: NTIRE 2024 Image Super-resolution (x4) +
+
+
+
+
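The dense-residual connections mentioned in the DRCT abstract can be illustrated with a small PyTorch block in which every convolution sees all earlier features and a residual path preserves the input. The channel counts, growth rate, and block depth below are assumptions for illustration, not the paper's architecture.

```python
# Minimal sketch of a dense-residual block: dense connectivity plus a residual
# path, the general idea named in the DRCT abstract (hyperparameters assumed).
import torch
import torch.nn as nn

class DenseResidualBlock(nn.Module):
    def __init__(self, channels: int = 64, growth: int = 32, num_layers: int = 4):
        super().__init__()
        self.layers = nn.ModuleList()
        in_ch = channels
        for _ in range(num_layers):
            self.layers.append(nn.Sequential(
                nn.Conv2d(in_ch, growth, kernel_size=3, padding=1),
                nn.LeakyReLU(0.2, inplace=True),
            ))
            in_ch += growth                      # each layer sees all earlier features
        self.fuse = nn.Conv2d(in_ch, channels, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        feats = [x]
        for layer in self.layers:
            feats.append(layer(torch.cat(feats, dim=1)))   # dense connectivity
        return x + self.fuse(torch.cat(feats, dim=1))      # residual path

block = DenseResidualBlock()
print(block(torch.randn(1, 64, 32, 32)).shape)  # torch.Size([1, 64, 32, 32])
```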
+ + ♻ ☆ MESA: Matching Everything by Segmenting Anything CVPR24 + + +
+ Feature matching is a crucial task in the field of computer vision, which +involves finding correspondences between images. Previous studies achieve +remarkable performance using learning-based feature comparison. However, the +pervasive presence of matching redundancy between images gives rise to +unnecessary and error-prone computations in these methods, imposing limitations +on their accuracy. To address this issue, we propose MESA, a novel approach to +establish precise area (or region) matches for efficient matching redundancy +reduction. MESA first leverages the advanced image understanding capability of +SAM, a state-of-the-art foundation model for image segmentation, to obtain +image areas with implicit semantic. Then, a multi-relational graph is proposed +to model the spatial structure of these areas and construct their scale +hierarchy. Based on graphical models derived from the graph, the area matching +is reformulated as an energy minimization task and effectively resolved. +Extensive experiments demonstrate that MESA yields substantial precision +improvement for multiple point matchers in indoor and outdoor downstream tasks, +e.g. +13.61% for DKM in indoor pose estimation. + +
+
+ comment: CVPR24 +
+
+
+
+
+ + ♻ ☆ DPHMs: Diffusion Parametric Head Models for Depth-based Tracking CVPR 2024 + + +
+ We introduce Diffusion Parametric Head Models (DPHMs), a generative model +that enables robust volumetric head reconstruction and tracking from monocular +depth sequences. While recent volumetric head models, such as NPHMs, can now +excel in representing high-fidelity head geometries, tracking and +reconstructing heads from real-world single-view depth sequences remains very +challenging, as the fitting to partial and noisy observations is +underconstrained. To tackle these challenges, we propose a latent +diffusion-based prior to regularize volumetric head reconstruction and +tracking. This prior-based regularizer effectively constrains the identity and +expression codes to lie on the underlying latent manifold which represents +plausible head shapes. To evaluate the effectiveness of the diffusion-based +prior, we collect a dataset of monocular Kinect sequences consisting of various +complex facial expression motions and rapid transitions. We compare our method +to state-of-the-art tracking methods and demonstrate improved head identity +reconstruction as well as robust expression tracking. + +
+
+ comment: CVPR 2024; homepage: https://tangjiapeng.github.io/projects/DPHMs/ +
+
+
+
+
+ + ♻ ☆ SepVAE: a contrastive VAE to separate pathological patterns from healthy + ones ICML + + +
+ Contrastive Analysis VAEs (CA-VAEs) are a family of variational auto-encoders +(VAEs) that aim to separate the common factors of variation between a +background dataset (BG) (i.e., healthy subjects) and a target dataset (TG) +(i.e., patients) from the ones that only exist in the target dataset. To do so, +these methods separate the latent space into a set of salient features (i.e., +specific to the target dataset) and a set of common features (i.e., present in +both datasets). Currently, all models fail to effectively prevent the sharing +of information between latent spaces and to capture all salient factors of +variation. To this end, we introduce two crucial regularization losses: a +disentangling term between common and salient representations and a +classification term between background and target samples in the salient space. +We show better performance than previous CA-VAE methods on three medical +applications and a natural-image dataset (CelebA). Code and datasets are +available on GitHub https://github.com/neurospin-projects/2023_rlouiset_sepvae. + +
+
+ comment: Workshop on Interpretable ML in Healthcare at International + Conference on Machine Learning (ICML), Honolulu, Hawaii, USA. 2023 +
+
+
+
+
+ + ♻ ☆ SiT-MLP: A Simple MLP with Point-wise Topology Feature Learning for + Skeleton-based Action Recognition + + +
+ Graph convolution networks (GCNs) have achieved remarkable performance in +skeleton-based action recognition. However, previous GCN-based methods rely on +elaborate human priors excessively and construct complex feature aggregation +mechanisms, which limits the generalizability and effectiveness of networks. To +solve these problems, we propose a novel Spatial Topology Gating Unit (STGU), +an MLP-based variant without extra priors, to capture the co-occurrence +topology features that encode the spatial dependency across all joints. In +STGU, to learn the point-wise topology features, a new gate-based feature +interaction mechanism is introduced to activate the features point-to-point by +the attention map generated from the input sample. Based on the STGU, we +propose the first MLP-based model, SiT-MLP, for skeleton-based action +recognition in this work. Compared with previous methods on three large-scale +datasets, SiT-MLP achieves competitive performance. In addition, SiT-MLP +reduces the parameters significantly with favorable results. The code will be +available at https://github.com/BUPTSJZhang/SiT?MLP. + +
+
+ comment: Accepted by IEEE TCSVT 2024 +
+
+
+
+
+ + ♻ ☆ RTMO: Towards High-Performance One-Stage Real-Time Multi-Person Pose + Estimation CVPR 2024 + + +
+ Real-time multi-person pose estimation presents significant challenges in +balancing speed and precision. While two-stage top-down methods slow down as +the number of people in the image increases, existing one-stage methods often +fail to simultaneously deliver high accuracy and real-time performance. This +paper introduces RTMO, a one-stage pose estimation framework that seamlessly +integrates coordinate classification by representing keypoints using dual 1-D +heatmaps within the YOLO architecture, achieving accuracy comparable to +top-down methods while maintaining high speed. We propose a dynamic coordinate +classifier and a tailored loss function for heatmap learning, specifically +designed to address the incompatibilities between coordinate classification and +dense prediction models. RTMO outperforms state-of-the-art one-stage pose +estimators, achieving 1.1% higher AP on COCO while operating about 9 times +faster with the same backbone. Our largest model, RTMO-l, attains 74.8% AP on +COCO val2017 and 141 FPS on a single V100 GPU, demonstrating its efficiency and +accuracy. The code and models are available at +https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo. + +
+
+ comment: Accepted at CVPR 2024. Project page: + https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo +
+
+
+
+
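The dual 1-D heatmap representation mentioned in the RTMO abstract (coordinate classification) can be sketched as predicting, for each keypoint, one distribution over x bins and one over y bins and decoding a coordinate from each. The bin counts and the soft-argmax decoding below are illustrative assumptions rather than the paper's exact decoder.

```python
# Small sketch of keypoint decoding from dual 1-D heatmaps (coordinate
# classification); bin counts and soft-argmax decoding are assumptions.
import torch

def decode_keypoints(x_logits: torch.Tensor, y_logits: torch.Tensor,
                     img_w: int, img_h: int) -> torch.Tensor:
    """x_logits: (N, K, Wbins), y_logits: (N, K, Hbins) -> (N, K, 2) pixel coords."""
    px = x_logits.softmax(dim=-1)                 # per-keypoint distribution over x bins
    py = y_logits.softmax(dim=-1)
    xs = torch.linspace(0, img_w - 1, x_logits.shape[-1])
    ys = torch.linspace(0, img_h - 1, y_logits.shape[-1])
    x = (px * xs).sum(dim=-1)                     # expectation over bins (soft-argmax)
    y = (py * ys).sum(dim=-1)
    return torch.stack([x, y], dim=-1)

x_logits = torch.randn(2, 17, 192)   # 2 persons, 17 COCO keypoints, 192 x-bins
y_logits = torch.randn(2, 17, 256)
print(decode_keypoints(x_logits, y_logits, img_w=640, img_h=480).shape)  # (2, 17, 2)
```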
+ + ♻ ☆ Zero-Shot Segmentation of Eye Features Using the Segment Anything Model + (SAM) + + +
+ The advent of foundation models signals a new era in artificial intelligence. +The Segment Anything Model (SAM) is the first foundation model for image +segmentation. In this study, we evaluate SAM's ability to segment features from +eye images recorded in virtual reality setups. The increasing requirement for +annotated eye-image datasets presents a significant opportunity for SAM to +redefine the landscape of data annotation in gaze estimation. Our investigation +centers on SAM's zero-shot learning abilities and the effectiveness of prompts +like bounding boxes or point clicks. Our results are consistent with studies in +other domains, demonstrating that SAM's segmentation effectiveness can be +on-par with specialized models depending on the feature, with prompts improving +its performance, evidenced by an IoU of 93.34% for pupil segmentation in one +dataset. Foundation models like SAM could revolutionize gaze estimation by +enabling quick and easy image segmentation, reducing reliance on specialized +models and extensive manual annotation. + +
+
+ comment: 14 pages, 8 figures, 1 table, Accepted to ETRA 2024: ACM Symposium on + Eye Tracking Research & Applications +
+
+
+
+
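For the zero-shot eye-feature study above, the prompting-and-evaluation loop can be sketched with the segment-anything package: a single positive click near the pupil acts as the prompt, and the resulting mask is scored with IoU, the metric the abstract reports. The checkpoint path, prompt location, and placeholder ground-truth mask below are assumptions, not the study's setup.

```python
# Hedged sketch: zero-shot pupil segmentation with a point prompt via the
# segment-anything package, plus a simple IoU metric. Checkpoint file, prompt
# coordinates, and the ground-truth mask are placeholders.
import numpy as np
from segment_anything import sam_model_registry, SamPredictor

def iou(pred: np.ndarray, gt: np.ndarray) -> float:
    inter = np.logical_and(pred, gt).sum()
    union = np.logical_or(pred, gt).sum()
    return float(inter) / max(float(union), 1.0)

sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b_01ec64.pth")  # assumed local file
predictor = SamPredictor(sam)

eye_image = np.zeros((480, 640, 3), dtype=np.uint8)   # stand-in for a VR eye frame (RGB)
predictor.set_image(eye_image)

# One positive click roughly on the pupil center serves as the prompt.
masks, scores, _ = predictor.predict(
    point_coords=np.array([[320, 240]]),
    point_labels=np.array([1]),
    multimask_output=True,
)
best = masks[np.argmax(scores)]                       # SAM's highest-scoring mask
gt_pupil_mask = np.zeros((480, 640), dtype=bool)      # placeholder ground truth
print("pupil IoU:", iou(best, gt_pupil_mask))
```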
+ + ♻ ☆ Photo-SLAM: Real-time Simultaneous Localization and Photorealistic + Mapping for Monocular, Stereo, and RGB-D Cameras CVPR 2024 + + +
+ The integration of neural rendering and the SLAM system recently showed +promising results in joint localization and photorealistic view reconstruction. +However, existing methods, fully relying on implicit representations, are so +resource-hungry that they cannot run on portable devices, which deviates from +the original intention of SLAM. In this paper, we present Photo-SLAM, a novel +SLAM framework with a hyper primitives map. Specifically, we simultaneously +exploit explicit geometric features for localization and learn implicit +photometric features to represent the texture information of the observed +environment. In addition to actively densifying hyper primitives based on +geometric features, we further introduce a Gaussian-Pyramid-based training +method to progressively learn multi-level features, enhancing photorealistic +mapping performance. The extensive experiments with monocular, stereo, and +RGB-D datasets prove that our proposed system Photo-SLAM significantly +outperforms current state-of-the-art SLAM systems for online photorealistic +mapping, e.g., PSNR is 30% higher and rendering speed is hundreds of times +faster in the Replica dataset. Moreover, the Photo-SLAM can run at real-time +speed using an embedded platform such as Jetson AGX Orin, showing the potential +of robotics applications. + +
+
+ comment: CVPR 2024. Code: https://github.com/HuajianUP/Photo-SLAM - Project + Page: https://huajianup.github.io/research/Photo-SLAM/ +
+
+
+
+
+ + ♻ ☆ 360Loc: A Dataset and Benchmark for Omnidirectional Visual Localization + with Cross-device Queries CVPR 2024 + + +
+ Portable 360$^\circ$ cameras are becoming a cheap and efficient tool to +establish large visual databases. By capturing omnidirectional views of a +scene, these cameras could expedite building environment models that are +essential for visual localization. However, such an advantage is often +overlooked due to the lack of valuable datasets. This paper introduces a new +benchmark dataset, 360Loc, composed of 360$^\circ$ images with ground truth +poses for visual localization. We present a practical implementation of +360$^\circ$ mapping combining 360$^\circ$ images with lidar data to generate +the ground truth 6DoF poses. 360Loc is the first dataset and benchmark that +explores the challenge of cross-device visual positioning, involving +360$^\circ$ reference frames, and query frames from pinhole, ultra-wide FoV +fisheye, and 360$^\circ$ cameras. We propose a virtual camera approach to +generate lower-FoV query frames from 360$^\circ$ images, which ensures a fair +comparison of performance among different query types in visual localization +tasks. We also extend this virtual camera approach to feature matching-based +and pose regression-based methods to alleviate the performance loss caused by +the cross-device domain gap, and evaluate its effectiveness against +state-of-the-art baselines. We demonstrate that omnidirectional visual +localization is more robust in challenging large-scale scenes with symmetries +and repetitive structures. These results provide new insights into 360-camera +mapping and omnidirectional visual localization with cross-device queries. + +
+
+ comment: CVPR 2024. Project Page: https://huajianup.github.io/research/360Loc/ +
+
+
+
+
+ + ♻ ☆ Design as Desired: Utilizing Visual Question Answering for Multimodal + Pre-training + + +
+ Multimodal pre-training demonstrates its potential in the medical domain, +which learns medical visual representations from paired medical reports. +However, many pre-training tasks require extra annotations from clinicians, and +most of them fail to explicitly guide the model to learn the desired features +of different pathologies. To the best of our knowledge, we are the first to +utilize Visual Question Answering (VQA) for multimodal pre-training to guide +the framework focusing on targeted pathological features. In this work, we +leverage descriptions in medical reports to design multi-granular +question-answer pairs associated with different diseases, which assist the +framework in pre-training without requiring extra annotations from experts. We +also propose a novel pre-training framework with a quasi-textual feature +transformer, a module designed to transform visual features into a +quasi-textual space closer to the textual domain via a contrastive learning +strategy. This narrows the vision-language gap and facilitates modality +alignment. Our framework is applied to four downstream tasks: report +generation, classification, segmentation, and detection across five datasets. +Extensive experiments demonstrate the superiority of our framework compared to +other state-of-the-art methods. Our code will be released upon acceptance. + +
+
+
+
+
+ + ♻ ☆ LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging CVPR 2024 + + +
+ Human pose and shape (HPS) estimation with lensless imaging is not only +beneficial to privacy protection but can also be used in covert surveillance +scenarios due to the small size and simple structure of this device. However, +this task presents significant challenges due to the inherent ambiguity of the +captured measurements and the lack of effective methods for directly estimating +human pose and shape from lensless data. In this paper, we propose, to our +knowledge, the first end-to-end framework to recover 3D human poses and shapes +from lensless measurements. We specifically design a multi-scale lensless +feature decoder to decode the lensless measurements through the optically +encoded mask for efficient feature extraction. We also propose a double-head +auxiliary supervision mechanism to improve the estimation accuracy of human +limb ends. In addition, we establish a lensless imaging system and verify the +effectiveness of our method on various datasets acquired by our lensless +imaging system. + +
+
+ comment: Accepted to CVPR 2024. More results available at + https://cic.tju.edu.cn/faculty/likun/projects/LPSNet +
+
+
+
+
+ + ♻ ☆ A ground-based dataset and a diffusion model for on-orbit low-light + image enhancement + + +
+ On-orbit service is important for maintaining the sustainability of the space +environment. A space-based visible camera is an economical and lightweight +sensor for situational awareness during on-orbit service. However, it can be +easily affected by the low-illumination environment. Recently, deep learning +has achieved remarkable success in image enhancement of natural images, but is +seldom applied in space due to the data bottleneck. In this article, we first +propose a dataset of the Beidou Navigation Satellite for on-orbit low-light +image enhancement (LLIE). In the automatic data collection scheme, we focus on +reducing the domain gap and improving the diversity of the dataset. We collect +hardware-in-the-loop images based on a robotic simulation testbed imitating +space lighting conditions. To evenly sample poses of different orientations and +distances without collision, a collision-free working space and pose-stratified +sampling are proposed. Afterwards, a novel diffusion model is proposed. To +enhance the image contrast without over-exposure and blurring details, we +design a fused attention module to highlight the structure and dark regions. +Finally, we compare our method with previous methods using our dataset, which +indicates that our method has a better capacity for on-orbit LLIE. + +
+
+
+
+
+ + ♻ ☆ Representing Noisy Image Without Denoising + + +
+ A long-standing topic in artificial intelligence is the effective recognition +of patterns from noisy images. In this regard, the recent data-driven paradigm +considers 1) improving the representation robustness by adding noisy samples in +the training phase (i.e., data augmentation) or 2) pre-processing the noisy +image by learning to solve the inverse problem (i.e., image denoising). +However, such methods generally exhibit an inefficient process and unstable +results, limiting their practical applications. In this paper, we explore a +non-learning paradigm that aims to derive a robust representation directly from +noisy images, without denoising as pre-processing. Here, the noise-robust +representation is designed as Fractional-order Moments in Radon space (FMR), +which also has the beneficial properties of orthogonality and rotation +invariance. Unlike earlier integer-order methods, our work is a more generic +design that takes such classical methods as special cases, and the introduced +fractional-order parameter offers a time-frequency analysis capability that is +not available in classical methods. Formally, both implicit and explicit paths +for constructing the FMR are discussed in detail. Extensive simulation +experiments and an image security application are provided to demonstrate the +uniqueness and usefulness of our FMR, especially for noise robustness, rotation +invariance, and time-frequency discriminability. + +
+
+ comment: Accepted by IEEE Transactions on Pattern Analysis and Machine + Intelligence, 2024 +
+
+
+
+
+ + ♻ ☆ PEEB: Part-based Image Classifiers with an Explainable and Editable + Language Bottleneck NAACL 2024 + + +
+ CLIP-based classifiers rely on the prompt containing a {class name} that is +known to the text encoder. Therefore, they perform poorly on new classes or the +classes whose names rarely appear on the Internet (e.g., scientific names of +birds). For fine-grained classification, we propose PEEB - an explainable and +editable classifier to (1) express the class name into a set of text +descriptors that describe the visual parts of that class; and (2) match the +embeddings of the detected parts to their textual descriptors in each class to +compute a logit score for classification. In a zero-shot setting where the +class names are unknown, PEEB outperforms CLIP by a huge margin (~10x in top-1 +accuracy). Compared to part-based classifiers, PEEB is not only the +state-of-the-art (SOTA) on the supervised-learning setting (88.80% and 92.20% +accuracy on CUB-200 and Dogs-120, respectively) but also the first to enable +users to edit the text descriptors to form a new classifier without any +re-training. Compared to concept bottleneck models, PEEB is also the SOTA in +both zero-shot and supervised-learning settings. + +
+
+ comment: Findings of NAACL 2024 (long paper) +
+
+
+
+
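The part-to-descriptor scoring rule described in the PEEB abstract can be sketched as matching one embedding per detected part against per-class textual part descriptors and summing the similarities into a class logit. The shapes and the cosine-similarity aggregation below are assumptions for illustration, not the paper's code.

```python
# Toy sketch of a PEEB-style scoring rule: match detected-part embeddings to
# per-class textual descriptors and sum similarities into logits (shapes assumed).
import torch
import torch.nn.functional as F

def class_logits(part_embeds: torch.Tensor, descriptor_embeds: torch.Tensor) -> torch.Tensor:
    """part_embeds: (P, D) one embedding per detected part.
    descriptor_embeds: (C, P, D) one text embedding per class and part.
    Returns (C,) class logits."""
    parts = F.normalize(part_embeds, dim=-1)             # (P, D)
    descs = F.normalize(descriptor_embeds, dim=-1)       # (C, P, D)
    sims = (descs * parts.unsqueeze(0)).sum(-1)          # (C, P) cosine similarity per part
    return sims.sum(dim=-1)                              # aggregate parts into a class score

parts = torch.randn(12, 512)             # e.g. 12 bird parts (beak, wings, ...)
descriptors = torch.randn(200, 12, 512)  # e.g. CUB-200: 12 descriptors per class
logits = class_logits(parts, descriptors)
print(logits.shape, logits.argmax().item())  # (200,) and the predicted class index
```

In this formulation, editing a classifier amounts to swapping a row of descriptor embeddings, which is why no re-training is needed.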
+ + ♻ ☆ Swap Attention in Spatiotemporal Diffusions for Text-to-Video Generation + + +
+ With the explosive popularity of AI-generated content (AIGC), video +generation has recently received a lot of attention. Generating videos guided +by text instructions poses significant challenges, such as modeling the complex +relationship between space and time, and the lack of large-scale text-video +paired data. Existing text-video datasets suffer from limitations in both +content quality and scale, or they are not open-source, rendering them +inaccessible for study and use. For model design, previous approaches extend +pretrained text-to-image generation models by adding temporal 1D +convolution/attention modules for video generation. However, these approaches +overlook the importance of jointly modeling space and time, inevitably leading +to temporal distortions and misalignment between texts and videos. In this +paper, we propose a novel approach that strengthens the interaction between +spatial and temporal perceptions. In particular, we utilize a swapped +cross-attention mechanism in 3D windows that alternates the ``query'' role +between spatial and temporal blocks, enabling mutual reinforcement for each +other. Moreover, to fully unlock model capabilities for high-quality video +generation and promote the development of the field, we curate a large-scale +and open-source video dataset called HD-VG-130M. This dataset comprises 130 +million text-video pairs from the open-domain, ensuring high-definition, +widescreen and watermark-free characters. A smaller-scale yet more meticulously +cleaned subset further enhances the data quality, aiding models in achieving +superior performance. Experimental quantitative and qualitative results +demonstrate the superiority of our approach in terms of per-frame quality, +temporal correlation, and text-video alignment, with clear margins. + +
+
+
+
+
+ + ♻ ☆ InstaGen: Enhancing Object Detection by Training on Synthetic Dataset CVPR2024 + + +
+ In this paper, we present a novel paradigm to enhance the ability of object +detector, e.g., expanding categories or improving detection performance, by +training on synthetic dataset generated from diffusion models. Specifically, we +integrate an instance-level grounding head into a pre-trained, generative +diffusion model, to augment it with the ability of localising instances in the +generated images. The grounding head is trained to align the text embedding of +category names with the regional visual feature of the diffusion model, using +supervision from an off-the-shelf object detector, and a novel self-training +scheme on (novel) categories not covered by the detector. We conduct thorough +experiments to show that, this enhanced version of diffusion model, termed as +InstaGen, can serve as a data synthesizer, to enhance object detectors by +training on its generated samples, demonstrating superior performance over +existing state-of-the-art methods in open-vocabulary (+4.5 AP) and data-sparse +(+1.2 to 5.2 AP) scenarios. Project page with code: +https://fcjian.github.io/InstaGen. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ SIFU: Side-view Conditioned Implicit Function for Real-world Usable + Clothed Human Reconstruction CVPR 2024 + + +
+ Creating high-quality 3D models of clothed humans from single images for +real-world applications is crucial. Despite recent advancements, accurately +reconstructing humans in complex poses or with loose clothing from in-the-wild +images, along with predicting textures for unseen areas, remains a significant +challenge. A key limitation of previous methods is their insufficient prior +guidance in transitioning from 2D to 3D and in texture prediction. In response, +we introduce SIFU (Side-view Conditioned Implicit Function for Real-world +Usable Clothed Human Reconstruction), a novel approach combining a Side-view +Decoupling Transformer with a 3D Consistent Texture Refinement pipeline. SIFU +employs a cross-attention mechanism within the transformer, using SMPL-X +normals as queries to effectively decouple side-view features in the process of +mapping 2D features to 3D. This method not only improves the precision of the +3D models but also their robustness, especially when SMPL-X estimates are not +perfect. Our texture refinement process leverages a text-to-image +diffusion-based prior to generate realistic and consistent textures for +invisible views. Through extensive experiments, SIFU surpasses SOTA methods in +both geometry and texture reconstruction, showcasing enhanced robustness in +complex scenarios and achieving an unprecedented Chamfer and P2S measurement. +Our approach extends to practical applications such as 3D printing and scene +building, demonstrating its broad utility in real-world scenarios. Project page +https://river-zhang.github.io/SIFU-projectpage/ . + +
+
+ comment: Accepted by CVPR 2024; Project page + https://river-zhang.github.io/SIFU-projectpage/ +
+
+
+
+
+ + ♻ ☆ SAOR: Single-View Articulated Object Reconstruction CVPR 2024 + + +
+ We introduce SAOR, a novel approach for estimating the 3D shape, texture, and +viewpoint of an articulated object from a single image captured in the wild. +Unlike prior approaches that rely on pre-defined category-specific 3D templates +or tailored 3D skeletons, SAOR learns to articulate shapes from single-view +image collections with a skeleton-free part-based model without requiring any +3D object shape priors. To prevent ill-posed solutions, we propose a +cross-instance consistency loss that exploits disentangled object shape +deformation and articulation. This is helped by a new silhouette-based sampling +mechanism to enhance viewpoint diversity during training. Our method only +requires estimated object silhouettes and relative depth maps from +off-the-shelf pre-trained networks during training. At inference time, given a +single-view image, it efficiently outputs an explicit mesh representation. We +obtain improved qualitative and quantitative results on challenging quadruped +animals compared to relevant existing work. + +
+
+ comment: Accepted to CVPR 2024, website: https://mehmetaygun.github.io/saor +
+
+
+
+
+ + ♻ ☆ CA-Jaccard: Camera-aware Jaccard Distance for Person Re-identification CVPR 2024 + + +
+ Person re-identification (re-ID) is a challenging task that aims to learn +discriminative features for person retrieval. In person re-ID, Jaccard distance +is a widely used distance metric, especially in re-ranking and clustering +scenarios. However, we discover that camera variation has a significant +negative impact on the reliability of Jaccard distance. In particular, Jaccard +distance calculates the distance based on the overlap of relevant neighbors. +Due to camera variation, intra-camera samples dominate the relevant neighbors, +which reduces the reliability of the neighbors by introducing intra-camera +negative samples and excluding inter-camera positive samples. To overcome this +problem, we propose a novel camera-aware Jaccard (CA-Jaccard) distance that +leverages camera information to enhance the reliability of Jaccard distance. +Specifically, we design camera-aware k-reciprocal nearest neighbors (CKRNNs) to +find k-reciprocal nearest neighbors on the intra-camera and inter-camera +ranking lists, which improves the reliability of relevant neighbors and +guarantees the contribution of inter-camera samples in the overlap. Moreover, +we propose a camera-aware local query expansion (CLQE) to mine reliable samples +in relevant neighbors by exploiting camera variation as a strong constraint and +assign these samples higher weights in overlap, further improving the +reliability. Our CA-Jaccard distance is simple yet effective and can serve as a +general distance metric for person re-ID methods with high reliability and low +computational cost. Extensive experiments demonstrate the effectiveness of our +method. + +
+
+ comment: This paper is accepted by CVPR 2024 +
+
+
+
+
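For reference, the quantity CA-Jaccard builds on (a Jaccard distance over k-reciprocal nearest-neighbor sets) can be sketched in plain numpy as below. The camera-aware parts of the paper (CKRNNs and CLQE) are only summarized in a comment; this plain single-ranking-list version, and the choice of k, are assumptions for illustration.

```python
# Simplified numpy sketch of Jaccard distance from k-reciprocal nearest neighbors,
# the baseline quantity behind CA-Jaccard (camera-aware variant not implemented).
import numpy as np

def k_reciprocal_neighbors(dist: np.ndarray, i: int, k: int) -> set:
    """Indices j such that j is in i's top-k and i is in j's top-k."""
    topk_i = np.argsort(dist[i])[:k + 1]
    return {j for j in topk_i if i in np.argsort(dist[j])[:k + 1]}

def jaccard_distance(features: np.ndarray, k: int = 20) -> np.ndarray:
    dist = np.linalg.norm(features[:, None] - features[None, :], axis=-1)
    neighbors = [k_reciprocal_neighbors(dist, i, k) for i in range(len(features))]
    n = len(features)
    jac = np.zeros((n, n), dtype=np.float32)
    for i in range(n):
        for j in range(n):
            inter = len(neighbors[i] & neighbors[j])
            union = len(neighbors[i] | neighbors[j])
            jac[i, j] = 1.0 - inter / max(union, 1)
    return jac
    # CA-Jaccard would instead build neighbor sets from separate intra-camera and
    # inter-camera ranking lists and up-weight reliable inter-camera samples.

feats = np.random.randn(50, 128).astype(np.float32)   # toy re-ID features
print(jaccard_distance(feats).shape)                  # (50, 50)
```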
+ + ♻ ☆ SegmentAnything helps microscopy images based automatic and quantitative + organoid detection and analysis + + +
+ Organoids are self-organized 3D cell clusters that closely mimic the +architecture and function of in vivo tissues and organs. Quantification of +organoid morphology helps in studying organ development, drug discovery, and +toxicity assessment. Recent microscopy techniques provide a potent tool to +acquire organoid morphology features, but manual image analysis remains a +labor- and time-intensive process. Thus, this paper proposes a comprehensive +pipeline for microscopy analysis that leverages SegmentAnything to precisely +demarcate individual organoids. Additionally, we introduce a set of +morphological properties, including perimeter, area, radius, non-smoothness, +and non-circularity, allowing researchers to analyze the organoid structures +quantitatively and automatically. To validate the effectiveness of our +approach, we conducted tests on bright-field images of human induced +pluripotent stem cell (iPSC)-derived neural-epithelial (NE) organoids. The +results obtained from our automatic pipeline closely align with manual organoid +detection and measurement, showcasing the capability of our proposed method in +accelerating organoid morphology analysis. + +
+
+ comment: Replace Figure 4 with the correct version. The original version is + wrong due to a column name mismatch +
+
+
+
+
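The quantification step of such a pipeline (per-organoid area, perimeter, radius, and a circularity-based shape score computed from instance masks) can be sketched with scikit-image. The specific property names and the standard 4*pi*area/perimeter^2 circularity formula are common choices assumed here, not taken from the paper's code.

```python
# Sketch of per-organoid morphology statistics from binary masks (e.g. produced
# by SegmentAnything), using scikit-image regionprops; formulas are assumptions.
import numpy as np
from skimage.measure import label, regionprops

def organoid_stats(binary_masks: np.ndarray) -> list:
    """binary_masks: (H, W) boolean image with organoids as foreground."""
    stats = []
    for region in regionprops(label(binary_masks)):
        area, perim = region.area, region.perimeter
        circularity = 4.0 * np.pi * area / max(perim ** 2, 1e-6)  # 1.0 for a perfect disk
        stats.append({
            "area": float(area),
            "perimeter": float(perim),
            "equivalent_radius": float(np.sqrt(area / np.pi)),
            "non_circularity": float(1.0 - circularity),
        })
    return stats

toy = np.zeros((128, 128), dtype=bool)
rr, cc = np.ogrid[:128, :128]
toy[(rr - 64) ** 2 + (cc - 64) ** 2 < 20 ** 2] = True   # one synthetic round "organoid"
print(organoid_stats(toy)[0])
```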
+ + ♻ ☆ Understanding normalization in contrastive representation learning and + out-of-distribution detection + + +
+ Contrastive representation learning has emerged as an outstanding approach +for anomaly detection. In this work, we explore the $\ell_2$-norm of +contrastive features and its applications in out-of-distribution detection. We +propose a simple method based on contrastive learning, which incorporates +out-of-distribution data by discriminating against normal samples in the +contrastive layer space. Our approach can be applied flexibly as an outlier +exposure (OE) approach, where the out-of-distribution data is a huge collective +of random images, or as a fully self-supervised learning approach, where the +out-of-distribution data is self-generated by applying distribution-shifting +transformations. The ability to incorporate additional out-of-distribution +samples enables a feasible solution for datasets where AD methods based on +contrastive learning generally underperform, such as aerial images or +microscopy images. Furthermore, the high-quality features learned through +contrastive learning consistently enhance performance in OE scenarios, even +when the available out-of-distribution dataset is not diverse enough. Our +extensive experiments demonstrate the superiority of our proposed method under +various scenarios, including unimodal and multimodal settings, with various +image datasets. + +
+
+
+
+
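The central quantity in the abstract above, the L2-norm of contrastive features used as an out-of-distribution score, is easy to sketch. The convention that in-distribution samples receive larger norms (so the OOD score is the negative norm), and the 95%-quantile threshold, are illustrative assumptions.

```python
# Minimal sketch: feature-norm based OOD scoring with a quantile threshold.
import numpy as np

def ood_scores(features: np.ndarray) -> np.ndarray:
    """features: (N, D) contrastive-layer features. Higher score = more OOD."""
    return -np.linalg.norm(features, axis=1)

in_dist  = np.random.randn(100, 128) * 3.0   # toy "normal" features with larger norms
out_dist = np.random.randn(100, 128) * 1.0   # toy outlier features with smaller norms
scores = ood_scores(np.vstack([in_dist, out_dist]))
threshold = np.quantile(scores[:100], 0.95)  # e.g. keep ~95% of in-distribution data
print("flagged OOD among outliers:", (scores[100:] > threshold).mean())
```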
+ + ♻ ☆ Confronting Ambiguity in 6D Object Pose Estimation via Score-Based + Diffusion on SE(3) CVPR2024 + + +
+ Addressing pose ambiguity in 6D object pose estimation from single RGB images +presents a significant challenge, particularly due to object symmetries or +occlusions. In response, we introduce a novel score-based diffusion method +applied to the $SE(3)$ group, marking the first application of diffusion models +to $SE(3)$ within the image domain, specifically tailored for pose estimation +tasks. Extensive evaluations demonstrate the method's efficacy in handling pose +ambiguity, mitigating perspective-induced ambiguity, and showcasing the +robustness of our surrogate Stein score formulation on $SE(3)$. This +formulation not only improves the convergence of denoising process but also +enhances computational efficiency. Thus, we pioneer a promising strategy for 6D +object pose estimation. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ Neural Implicit Morphing of Face Images CVPR 2024 + + +
+ Face morphing is a problem in computer graphics with numerous artistic and +forensic applications. It is challenging due to variations in pose, lighting, +gender, and ethnicity. This task consists of a warping for feature alignment +and a blending for a seamless transition between the warped images. We propose +to leverage coord-based neural networks to represent such warpings and +blendings of face images. During training, we exploit the smoothness and +flexibility of such networks by combining energy functionals employed in +classical approaches without discretizations. Additionally, our method is +time-dependent, allowing a continuous warping/blending of the images. During +morphing inference, we need both direct and inverse transformations of the +time-dependent warping. The first (second) is responsible for warping the +target (source) image into the source (target) image. Our neural warping stores +those maps in a single network dismissing the need for inverting them. The +results of our experiments indicate that our method is competitive with both +classical and generative models under the lens of image quality and +face-morphing detectors. Aesthetically, the resulting images present a seamless +blending of diverse faces not yet usual in the literature. + +
+
+ comment: 14 pages, 20 figures, accepted for CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SegForestNet: Spatial-Partitioning-Based Aerial Image Segmentation + + +
+ Aerial image segmentation is the basis for applications such as automatically +creating maps or tracking deforestation. In true orthophotos, which are often +used in these applications, many objects and regions can be approximated well +by polygons. However, this fact is rarely exploited by state-of-the-art +semantic segmentation models. Instead, most models allow unnecessary degrees of +freedom in their predictions by allowing arbitrary region shapes. We therefore +present a refinement of our deep learning model which predicts binary space +partitioning trees, an efficient polygon representation. The refinements +include a new feature decoder architecture and a new differentiable BSP tree +renderer which both avoid vanishing gradients. Additionally, we introduce a +novel loss function specifically designed to improve the spatial partitioning +defined by the predicted trees. Furthermore, our expanded model can predict +multiple trees at once and thus can predict class-specific segmentations. As an +additional contribution, we investigate the impact of a non-optimal training +process in comparison to an optimized training process. While model +architectures optimized for aerial images, such as PFNet or our own model, show +an advantage under non-optimal conditions, this advantage disappears under +optimal training conditions. Despite this observation, our model still makes +better predictions for small rectangular objects, e.g., cars. + +
+
+
+
+
+ + ♻ ☆ Synthetic data shuffling accelerates the convergence of federated + learning under data heterogeneity + + +
+ In federated learning, data heterogeneity is a critical challenge. A +straightforward solution is to shuffle the clients' data to homogenize the +distribution. However, this may violate data access rights, and how and when +shuffling can accelerate the convergence of a federated optimization algorithm +is not theoretically well understood. In this paper, we establish a precise and +quantifiable correspondence between data heterogeneity and parameters in the +convergence rate when a fraction of data is shuffled across clients. We prove +that shuffling can quadratically reduce the gradient dissimilarity with respect +to the shuffling percentage, accelerating convergence. Inspired by the theory, +we propose a practical approach that addresses the data access rights issue by +shuffling locally generated synthetic data. The experimental results show that +shuffling synthetic data improves the performance of multiple existing +federated learning algorithms by a large margin. + +
+
+ comment: Accepted at TMLR +
+
+
+
+
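The practical recipe summarized in the shuffling abstract above (keep real data local, pool a fraction of locally generated synthetic samples, and redistribute that pooled share across clients) can be sketched as follows. The data shapes, the 20% shuffling fraction, and equal redistribution are illustrative assumptions.

```python
# Sketch of shuffling a fraction of per-client synthetic data across clients.
import numpy as np

def shuffle_synthetic(client_synth: list, fraction: float = 0.2, seed: int = 0) -> list:
    """client_synth: list of (X, y) synthetic datasets, one per client.
    Returns per-client arrays mixing local synthetic data with a shuffled share."""
    rng = np.random.default_rng(seed)
    pool_X, pool_y, kept = [], [], []
    for X, y in client_synth:
        n_share = int(fraction * len(X))
        idx = rng.permutation(len(X))
        pool_X.append(X[idx[:n_share]]); pool_y.append(y[idx[:n_share]])
        kept.append((X[idx[n_share:]], y[idx[n_share:]]))
    pool_X, pool_y = np.concatenate(pool_X), np.concatenate(pool_y)
    order = rng.permutation(len(pool_X))
    splits = np.array_split(order, len(client_synth))   # redistribute the pooled share
    return [(np.concatenate([kX, pool_X[s]]), np.concatenate([ky, pool_y[s]]))
            for (kX, ky), s in zip(kept, splits)]

clients = [(np.random.randn(100, 8), np.random.randint(0, 10, 100)) for _ in range(5)]
print([len(X) for X, _ in shuffle_synthetic(clients)])   # ~100 samples per client again
```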
+ + ♻ ☆ Learning Optical Flow and Scene Flow with Bidirectional Camera-LiDAR + Fusion + + +
+ In this paper, we study the problem of jointly estimating the optical flow +and scene flow from synchronized 2D and 3D data. Previous methods either employ +a complex pipeline that splits the joint task into independent stages, or fuse +2D and 3D information in an ``early-fusion'' or ``late-fusion'' manner. Such +one-size-fits-all approaches suffer from a dilemma of failing to fully utilize +the characteristic of each modality or to maximize the inter-modality +complementarity. To address the problem, we propose a novel end-to-end +framework, which consists of 2D and 3D branches with multiple bidirectional +fusion connections between them in specific layers. Different from previous +work, we apply a point-based 3D branch to extract the LiDAR features, as it +preserves the geometric structure of point clouds. To fuse dense image features +and sparse point features, we propose a learnable operator named bidirectional +camera-LiDAR fusion module (Bi-CLFM). We instantiate two types of the +bidirectional fusion pipeline, one based on the pyramidal coarse-to-fine +architecture (dubbed CamLiPWC), and the other one based on the recurrent +all-pairs field transforms (dubbed CamLiRAFT). On FlyingThings3D, both CamLiPWC +and CamLiRAFT surpass all existing methods and achieve up to a 47.9\% reduction +in 3D end-point-error from the best published result. Our best-performing +model, CamLiRAFT, achieves an error of 4.26\% on the KITTI Scene Flow +benchmark, ranking 1st among all submissions with much fewer parameters. +Besides, our methods have strong generalization performance and the ability to +handle non-rigid motion. Code is available at +https://github.com/MCG-NJU/CamLiFlow. + +
+
+ comment: Accepted to TPAMI 2023 +
+
+
+
+
+ + ♻ ☆ Burst Super-Resolution with Diffusion Models for Improving Perceptual + Quality IJCNN 2024 + + +
+ While burst LR images are useful for improving the SR image quality compared +with a single LR image, prior SR networks accepting the burst LR images are +trained in a deterministic manner, which is known to produce a blurry SR image. +In addition, it is difficult to perfectly align the burst LR images, making the +SR image more blurry. Since such blurry images are perceptually degraded, we +aim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity +images can be reconstructed by diffusion models. However, prior SR methods +using the diffusion model are not properly optimized for the burst SR task. +Specifically, the reverse process starting from a random sample is not +optimized for image enhancement and restoration methods, including burst SR. In +our proposed method, on the other hand, burst LR features are used to +reconstruct the initial burst SR image that is fed into an intermediate step in +the diffusion model. This reverse process from the intermediate step 1) skips +diffusion steps for reconstructing the global structure of the image and 2) +focuses on steps for refining detailed textures. Our experimental results +demonstrate that our method can improve the scores of the perceptual quality +metrics. Code: https://github.com/placerkyo/BSRD + +
+
+ comment: Accepted to IJCNN 2024 (International Joint Conference on Neural + Networks) +
+
+
+
+
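The core idea in the burst-SR abstract above (seed the reverse diffusion at an intermediate step with an initial SR estimate instead of pure noise) can be sketched with a generic DDPM-style sampler. The placeholder denoiser, the linear noise schedule, and the chosen intermediate step are assumptions; this is not the paper's implementation.

```python
# Generic sketch: run a DDPM reverse process from an intermediate step, seeded
# with an initial burst-SR estimate (schedule, step count, and denoiser assumed).
import torch

T = 1000
betas = torch.linspace(1e-4, 0.02, T)
alphas = 1.0 - betas
alpha_bar = torch.cumprod(alphas, dim=0)

def denoiser(x_t: torch.Tensor, t: int) -> torch.Tensor:
    """Placeholder for a trained noise-prediction network eps_theta(x_t, t)."""
    return torch.zeros_like(x_t)

def refine_from_intermediate(x_init: torch.Tensor, t0: int = 250) -> torch.Tensor:
    # Diffuse the initial estimate to step t0 instead of starting from pure noise.
    x = alpha_bar[t0].sqrt() * x_init + (1 - alpha_bar[t0]).sqrt() * torch.randn_like(x_init)
    for t in range(t0, -1, -1):
        eps = denoiser(x, t)
        mean = (x - betas[t] / (1 - alpha_bar[t]).sqrt() * eps) / alphas[t].sqrt()
        noise = torch.randn_like(x) if t > 0 else torch.zeros_like(x)
        x = mean + betas[t].sqrt() * noise    # later steps refine detailed textures
    return x

x_init = torch.rand(1, 3, 128, 128)  # stand-in for the initial SR image from burst features
print(refine_from_intermediate(x_init).shape)
```

Skipping the early, high-noise steps is what saves the global structure already present in the burst-based initial estimate.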
+ + ♻ ☆ Gyro-based Neural Single Image Deblurring + + +
+ In this paper, we present GyroDeblurNet, a novel single image deblurring +method that utilizes a gyro sensor to effectively resolve the ill-posedness of +image deblurring. The gyro sensor provides valuable information about camera +motion during exposure time that can significantly improve deblurring quality. +However, effectively exploiting real-world gyro data is challenging due to +significant errors from various sources including sensor noise, the disparity +between the positions of a camera module and a gyro sensor, the absence of +translational motion information, and moving objects whose motions cannot be +captured by a gyro sensor. To handle gyro error, GyroDeblurNet is equipped with +two novel neural network blocks: a gyro refinement block and a gyro deblurring +block. The gyro refinement block refines the error-ridden gyro data using the +blur information from the input image. On the other hand, the gyro deblurring +block removes blur from the input image using the refined gyro data and further +compensates for gyro error by leveraging the blur information from the input +image. For training a neural network with erroneous gyro data, we propose a +training strategy based on the curriculum learning. We also introduce a novel +gyro data embedding scheme to represent real-world intricate camera shakes. +Finally, we present a synthetic dataset and a real dataset for the training and +evaluation of gyro-based single image deblurring. Our experiments demonstrate +that our approach achieves state-of-the-art deblurring quality by effectively +utilizing erroneous gyro data. + +
+
+ comment: 14 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ A Comprehensive Review of Knowledge Distillation in Computer Vision + + +
+ Deep learning techniques have been demonstrated to surpass preceding +cutting-edge machine learning techniques in recent years, with computer vision +being one of the most prominent examples. However, deep learning models suffer +from significant drawbacks when deployed in resource-constrained environments +due to their large model size and high complexity. Knowledge Distillation is +one of the prominent solutions to overcome this challenge. This review paper +examines the current state of research on knowledge distillation, a technique +for compressing complex models into smaller and simpler ones. The paper +provides an overview of the major principles and techniques associated with +knowledge distillation and reviews the applications of knowledge distillation +in the domain of computer vision. The review focuses on the benefits of +knowledge distillation, as well as the problems that must be overcome to +improve its effectiveness. + +
+
+ comment: 36 pages ,10 figures +
+
+
+
+
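As a concrete reference point for the techniques this review surveys, the classic response-based distillation loss (softened teacher/student logits plus the usual cross-entropy) can be written in a few lines. The temperature and weighting values are common defaults assumed here, not values from the paper.

```python
# Compact sketch of the standard knowledge-distillation loss.
import torch
import torch.nn.functional as F

def kd_loss(student_logits: torch.Tensor, teacher_logits: torch.Tensor,
            labels: torch.Tensor, temperature: float = 4.0, alpha: float = 0.5) -> torch.Tensor:
    soft_targets = F.softmax(teacher_logits / temperature, dim=-1)
    log_student  = F.log_softmax(student_logits / temperature, dim=-1)
    distill = F.kl_div(log_student, soft_targets, reduction="batchmean") * temperature ** 2
    ce = F.cross_entropy(student_logits, labels)          # hard-label supervision
    return alpha * distill + (1.0 - alpha) * ce

student = torch.randn(8, 10, requires_grad=True)
teacher = torch.randn(8, 10)
labels = torch.randint(0, 10, (8,))
print(kd_loss(student, teacher, labels).item())
```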
+ + ♻ ☆ Autoregressive Omni-Aware Outpainting for Open-Vocabulary 360-Degree + Image Generation AAAI 24 + + +
+ A 360-degree (omni-directional) image provides an all-encompassing spherical +view of a scene. Recently, there has been an increasing interest in +synthesising 360-degree images from conventional narrow field of view (NFoV) +images captured by digital cameras and smartphones, for providing immersive +experiences in various scenarios such as virtual reality. Yet, existing methods +typically fall short in synthesizing intricate visual details or ensuring that +the generated images align consistently with user-provided prompts. In this +study, an autoregressive omni-aware generative network (AOG-Net) is proposed +for 360-degree image generation by out-painting an incomplete 360-degree image +progressively with NFoV and text guidances jointly or individually. This +autoregressive scheme not only allows for deriving finer-grained and +text-consistent patterns by dynamically generating and adjusting the process +but also offers users greater flexibility to edit their conditions throughout +the generation process. A global-local conditioning mechanism is devised to +comprehensively formulate the outpainting guidance in each autoregressive step. +Text guidances, omni-visual cues, NFoV inputs and omni-geometry are encoded and +further formulated with cross-attention based transformers into a global stream +and a local stream that condition a generative backbone model. As AOG-Net can +leverage large-scale models for the conditional encoder and the generative +prior, it enables the generation to use extensive open-vocabulary text +guidances. Comprehensive experiments on two commonly used 360-degree image +datasets for both indoor and outdoor settings demonstrate the state-of-the-art +performance of our proposed method. Our code will be made publicly available. + +
+
+ comment: Accepted by AAAI 24 +
+
+
+
+
+ + ♻ ☆ Feature 3DGS: Supercharging 3D Gaussian Splatting to Enable Distilled + Feature Fields + + +
+ 3D scene representations have gained immense popularity in recent years. +Methods that use Neural Radiance fields are versatile for traditional tasks +such as novel view synthesis. In recent times, some work has emerged that aims +to extend the functionality of NeRF beyond view synthesis, for semantically +aware tasks such as editing and segmentation using 3D feature field +distillation from 2D foundation models. However, these methods have two major +limitations: (a) they are limited by the rendering speed of NeRF pipelines, and +(b) implicitly represented feature fields suffer from continuity artifacts +reducing feature quality. Recently, 3D Gaussian Splatting has shown +state-of-the-art performance on real-time radiance field rendering. In this +work, we go one step further: in addition to radiance field rendering, we +enable 3D Gaussian splatting on arbitrary-dimension semantic features via 2D +foundation model distillation. This translation is not straightforward: naively +incorporating feature fields in the 3DGS framework encounters significant +challenges, notably the disparities in spatial resolution and channel +consistency between RGB images and feature maps. We propose architectural and +training changes to efficiently avert this problem. Our proposed method is +general, and our experiments showcase novel view semantic segmentation, +language-guided editing and segment anything through learning feature fields +from state-of-the-art 2D foundation models such as SAM and CLIP-LSeg. Across +experiments, our distillation method is able to provide comparable or better +results, while being significantly faster to both train and render. +Additionally, to the best of our knowledge, we are the first method to enable +point and bounding-box prompting for radiance field manipulation, by leveraging +the SAM model. Project website at: https://feature-3dgs.github.io/ + +
+
+
+
+
+ + ♻ ☆ Unifying Correspondence, Pose and NeRF for Pose-Free Novel View + Synthesis from Stereo Pairs CVPR2024 + + +
+ This work delves into the task of pose-free novel view synthesis from stereo +pairs, a challenging and pioneering task in 3D vision. Our innovative +framework, unlike any before, seamlessly integrates 2D correspondence matching, +camera pose estimation, and NeRF rendering, fostering a synergistic enhancement +of these tasks. We achieve this through designing an architecture that utilizes +a shared representation, which serves as a foundation for enhanced 3D geometry +understanding. Capitalizing on the inherent interplay between the tasks, our +unified framework is trained end-to-end with the proposed training strategy to +improve overall model accuracy. Through extensive evaluations across diverse +indoor and outdoor scenes from two real-world datasets, we demonstrate that our +approach achieves substantial improvement over previous methodologies, +especially in scenarios characterized by extreme viewpoint changes and the +absence of accurate camera poses. + +
+
+ comment: Project page: https://ku-cvlab.github.io/CoPoNeRF/ CVPR2024 camera + ready version (Highlight) +
+
+
+
+
+ + ♻ ☆ UAV-Rain1k: A Benchmark for Raindrop Removal from UAV Aerial Imagery CVPR + + +
+ Raindrops adhering to the lens of UAVs can obstruct visibility of the +background scene and degrade image quality. Despite recent progress in image +deraining methods and datasets, there is a lack of focus on raindrop removal +from UAV aerial imagery due to the unique challenges posed by varying angles +and rapid movement during drone flight. To fill the gap in this research, we +first construct a new benchmark dataset for removing raindrops from UAV images, +called UAV-Rain1k. In this letter, we provide a dataset generation pipeline, +which includes modeling raindrop shapes using Blender, collecting background +images from various UAV angles, and randomly sampling rain masks. Based on +the proposed benchmark, we further present a comprehensive evaluation of +existing representative image deraining algorithms, and reveal future research +opportunities worth exploring. The proposed dataset is publicly available at +https://github.com/cschenxiang/UAV-Rain1k. + +
+
+ comment: Accepted by IEEE/CVF Conference on Computer Vision and Pattern + Recognition Workshops (CVPRW) 2024 +
+
+
+
+
+ + ♻ ☆ Fully Sparse 3D Occupancy Prediction + + +
+ Occupancy prediction plays a pivotal role in autonomous driving. Previous +methods typically construct dense 3D volumes, neglecting the inherent sparsity +of the scene and suffering from high computational costs. To bridge the gap, we +introduce a novel fully sparse occupancy network, termed SparseOcc. SparseOcc +initially reconstructs a sparse 3D representation from visual inputs and +subsequently predicts semantic/instance occupancy from the 3D sparse +representation by sparse queries. A mask-guided sparse sampling scheme is +designed to enable sparse queries to interact with 2D features in a fully +sparse manner, thereby circumventing costly dense features or global attention. +Additionally, we design a thoughtful ray-based evaluation metric, namely +RayIoU, to resolve the inconsistent penalty along depth inherent in traditional +voxel-level mIoU criteria. SparseOcc demonstrates its effectiveness by +achieving a RayIoU of 34.0, while maintaining a real-time inference speed of +17.3 FPS, with 7 history frames as inputs. By incorporating more preceding +frames, up to 15, SparseOcc continuously improves its performance to 35.1 +RayIoU without bells and whistles. Code is available at +https://github.com/MCG-NJU/SparseOcc. + +
+
+ comment: Add new metric: RayIoU +
+
+
+
+
+ + ♻ ☆ Enhancing Ship Classification in Optical Satellite Imagery: Integrating + Convolutional Block Attention Module with ResNet for Improved Performance + + +
+ This study presents an advanced Convolutional Neural Network (CNN) +architecture for ship classification from optical satellite imagery, +significantly enhancing performance through the integration of the +Convolutional Block Attention Module (CBAM) and additional architectural +innovations. Building upon the foundational ResNet50 model, we first +incorporated a standard CBAM to direct the model's focus towards more +informative features, achieving an accuracy of 87% compared to the baseline +ResNet50's 85%. Further augmentations involved multi-scale feature integration, +depthwise separable convolutions, and dilated convolutions, culminating in the +Enhanced ResNet Model with Improved CBAM. This model demonstrated a remarkable +accuracy of 95%, with precision, recall, and f1-scores all witnessing +substantial improvements across various ship classes. The bulk carrier and oil +tanker classes, in particular, showcased nearly perfect precision and recall +rates, underscoring the model's enhanced capability in accurately identifying +and classifying ships. Attention heatmap analyses further validated the +improved model's efficacy, revealing a more focused attention on relevant ship +features, regardless of background complexities. These findings underscore the +potential of integrating attention mechanisms and architectural innovations in +CNNs for high-resolution satellite imagery classification. The study navigates +through the challenges of class imbalance and computational costs, proposing +future directions towards scalability and adaptability in new or rare ship type +recognition. This research lays a groundwork for the application of advanced +deep learning techniques in the domain of remote sensing, offering insights +into scalable and efficient satellite image classification. + +
+
+
+
+
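The Convolutional Block Attention Module central to the ship-classification abstract above is commonly defined as channel attention followed by spatial attention, as in the sketch below. The reduction ratio, kernel size, and where the module is inserted into ResNet50 are assumptions, not the paper's exact configuration.

```python
# Sketch of a CBAM block (channel attention then spatial attention) in PyTorch.
import torch
import torch.nn as nn

class CBAM(nn.Module):
    def __init__(self, channels: int, reduction: int = 16, spatial_kernel: int = 7):
        super().__init__()
        self.mlp = nn.Sequential(                      # shared MLP for channel attention
            nn.Linear(channels, channels // reduction), nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels))
        self.spatial = nn.Conv2d(2, 1, spatial_kernel, padding=spatial_kernel // 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, c, _, _ = x.shape
        avg = self.mlp(x.mean(dim=(2, 3)))             # channel attention from avg pooling
        mx  = self.mlp(x.amax(dim=(2, 3)))             # ... and from max pooling
        x = x * torch.sigmoid(avg + mx).view(b, c, 1, 1)
        avg_map = x.mean(dim=1, keepdim=True)          # spatial attention from channel stats
        max_map = x.amax(dim=1, keepdim=True)
        return x * torch.sigmoid(self.spatial(torch.cat([avg_map, max_map], dim=1)))

feat = torch.randn(2, 256, 28, 28)        # e.g. an intermediate ResNet50 feature map
print(CBAM(256)(feat).shape)              # torch.Size([2, 256, 28, 28])
```

Dropping such a block after selected residual stages is the usual way to steer a ResNet backbone toward the more informative ship features.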
+ + ♻ ☆ Holistic Inverse Rendering of Complex Facade via Aerial 3D Scanning + + +
+ In this work, we use multi-view aerial images to reconstruct the geometry, +lighting, and material of facades using neural signed distance fields (SDFs). +Without the requirement of complex equipment, our method only takes simple RGB +images captured by a drone as inputs to enable physically based and +photorealistic novel-view rendering, relighting, and editing. However, a +real-world facade usually has complex appearances ranging from diffuse rocks +with subtle details to large-area glass windows with specular reflections, +making it hard to attend to everything. As a result, previous methods can +preserve the geometry details but fail to reconstruct smooth glass windows, or +vice versa. In order to address this challenge, we introduce three spatial- and +semantic-adaptive optimization strategies, including a semantic regularization +approach based on zero-shot segmentation techniques to improve material +consistency, a frequency-aware geometry regularization to balance surface +smoothness and details in different surfaces, and a visibility probe-based +scheme to enable efficient modeling of the local lighting in large-scale +outdoor environments. In addition, we capture a real-world facade aerial 3D +scanning image set and corresponding point clouds for training and +benchmarking. The experiments demonstrate the superior quality of our method on +facade holistic inverse rendering, novel view synthesis, and scene editing +compared to state-of-the-art baselines. + +
+
+
+
+
+ + ♻ ☆ Temporally Consistent Unbalanced Optimal Transport for Unsupervised + Action Segmentation CVPR 2024 + + +
+ We propose a novel approach to the action segmentation task for long, +untrimmed videos, based on solving an optimal transport problem. By encoding a +temporal consistency prior into a Gromov-Wasserstein problem, we are able to +decode a temporally consistent segmentation from a noisy affinity/matching cost +matrix between video frames and action classes. Unlike previous approaches, our +method does not require knowing the action order for a video to attain temporal +consistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can +be efficiently solved on GPUs using a few iterations of projected mirror +descent. We demonstrate the effectiveness of our method in an unsupervised +learning setting, where our method is used to generate pseudo-labels for +self-training. We evaluate our segmentation approach and unsupervised learning +pipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly +datasets, yielding state-of-the-art results for the unsupervised video action +segmentation task. + +
+
+ comment: Accepted to CVPR 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ MVSA-Net: Multi-View State-Action Recognition for Robust and Deployable + Trajectory Generation AAAI-2024 + + +
+ The learn-from-observation (LfO) paradigm is a human-inspired mode for a +robot to learn to perform a task simply by watching it being performed. LfO can +facilitate robot integration on factory floors by minimizing disruption and +reducing tedious programming. A key component of the LfO pipeline is a +transformation of the depth camera frames to the corresponding task state and +action pairs, which are then relayed to learning techniques such as imitation +or inverse reinforcement learning for understanding the task parameters. While +several existing computer vision models analyze videos for activity +recognition, SA-Net specifically targets robotic LfO from RGB-D data. However, +SA-Net and many other models analyze frame data captured from a single +viewpoint. Their analysis is therefore highly sensitive to occlusions of the +observed task, which are frequent in deployments. An obvious way of reducing +occlusions is to simultaneously observe the task from multiple viewpoints and +synchronously fuse the multiple streams in the model. Toward this, we present +multi-view SA-Net, which generalizes the SA-Net model to allow the perception +of multiple viewpoints of the task activity, integrate them, and better +recognize the state and action in each frame. Performance evaluations on two +distinct domains establish that MVSA-Net recognizes the state-action pairs +under occlusion more accurately compared to single-view MVSA-Net and other +baselines. Our ablation studies further evaluate its performance under +different ambient conditions and establish the contribution of the architecture +components. As such, MVSA-Net offers a significantly more robust and deployable +state-action trajectory generation compared to previous methods. + +
+
+ comment: Presented at Deployable AI Workshop at AAAI-2024 and 'Towards + Reliable and Deployable Learning-Based Robotic Systems' Workshop at CoRL2023 +
+
+
+
+
+ + ♻ ☆ And Then the Hammer Broke: Reflections on Machine Ethics from Feminist + Philosophy of Science + + +
+ Vision is an important metaphor in ethical and political questions of +knowledge. The feminist philosopher Donna Haraway points out the ``perverse'' +nature of an intrusive, alienating, all-seeing vision (to which we might cry +out ``stop looking at me!''), but also encourages us to embrace the embodied +nature of sight and its promises for genuinely situated knowledge. Current +technologies of machine vision -- surveillance cameras, drones (for war or +recreation), iPhone cameras -- are usually construed as instances of the former +rather than the latter, and for good reasons. However, although in no way +attempting to diminish the real suffering these technologies have brought about +in the world, I make the case for understanding technologies of computer vision +as material instances of embodied seeing and situated knowing. Furthermore, +borrowing from Iris Murdoch's concept of moral vision, I suggest that these +technologies direct our labor towards self-reflection in ethically significant +ways. My approach draws upon paradigms in computer vision research, +phenomenology, and feminist epistemology. Ultimately, this essay is an argument +for directing more philosophical attention from merely criticizing technologies +of vision as ethically deficient towards embracing them as complex, +methodologically and epistemologically important objects. + +
+
+ comment: Pacific University Philosophy Conference +
+
+
+
+
+ + ♻ ☆ 3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple + 3D Representations + + +
+ Imitation learning provides an efficient way to teach robots dexterous skills; however, learning complex skills robustly and generalizably usually requires large amounts of human demonstrations. To tackle this challenging problem, we present 3D Diffusion Policy (DP3), a novel visual imitation learning approach that incorporates the power of 3D visual representations into diffusion policies, a class of conditional action generative models. The core design of DP3 is the utilization of a compact 3D visual representation, extracted from sparse point clouds with an efficient point encoder. In our experiments involving 72 simulation tasks, DP3 successfully handles most tasks with just 10 demonstrations and surpasses baselines with a 24.2% relative improvement. In 4 real robot tasks, DP3 demonstrates precise control with a high success rate of 85%, given only 40 demonstrations of each task, and shows excellent generalization abilities in diverse aspects, including space, viewpoint, appearance, and instance. Interestingly, in real robot experiments, DP3 rarely violates safety requirements, in contrast to baseline methods, which frequently do and necessitate human intervention. Our extensive evaluation highlights the critical importance of 3D representations in real-world robot learning. Videos, code, and data are available at https://3d-diffusion-policy.github.io .
+
+ comment: Videos, code, and data: https://3d-diffusion-policy.github.io +
+
+
+
+
+ + ♻ ☆ 360+x: A Panoptic Multi-modal Scene Understanding Dataset CVPR 2024 + + +
+ Human perception of the world is shaped by a multitude of viewpoints and +modalities. While many existing datasets focus on scene understanding from a +certain perspective (e.g. egocentric or third-person views), our dataset offers +a panoptic perspective (i.e. multiple viewpoints with multiple data +modalities). Specifically, we encapsulate third-person panoramic and front +views, as well as egocentric monocular/binocular views with rich modalities +including video, multi-channel audio, directional binaural delay, location data +and textual scene descriptions within each scene captured, presenting +comprehensive observation of the world. Figure 1 offers a glimpse of all 28 +scene categories of our 360+x dataset. To the best of our knowledge, this is +the first database that covers multiple viewpoints with multiple data +modalities to mimic how daily information is accessed in the real world. +Through our benchmark analysis, we presented 5 different scene understanding +tasks on the proposed 360+x dataset to evaluate the impact and benefit of each +data modality and perspective in panoptic scene understanding. We hope this +unique dataset could broaden the scope of comprehensive scene understanding and +encourage the community to approach these problems from more diverse +perspectives. + +
+
+ comment: CVPR 2024 (Oral Presentation), Project page: + https://x360dataset.github.io/ +
+
+
+
+
+ + ♻ ☆ A Benchmark Grocery Dataset of Realworld Point Clouds From Single View + + +
+ Fine-grained grocery object recognition is an important computer vision problem with broad applications in automatic checkout, in-store robotic navigation, and assistive technologies for the visually impaired. Existing grocery datasets are mainly 2D images, so models trained on them are limited to learning features from regular 2D grids. While portable 3D sensors such as the Kinect have been widely available for some time, sensors such as LiDAR and TrueDepth have only recently been integrated into mobile phones. Despite the availability of mobile 3D sensors, there are currently no dedicated real-world large-scale benchmark 3D datasets for groceries. In addition, existing 3D datasets lack fine-grained grocery categories and have limited training samples. Furthermore, collecting data by moving around an object, rather than capturing a single photo, makes data collection cumbersome. Thus, we introduce a large-scale grocery dataset called 3DGrocery100. It comprises 100 classes, with a total of 87,898 3D point clouds created from 10,755 RGB-D single-view images. We benchmark our dataset on six recent state-of-the-art 3D point cloud classification models. We also benchmark the dataset on few-shot and continual learning point cloud classification tasks. Project Page: https://bigdatavision.org/3DGrocery100/.
+
+
+
+
+ + ♻ ☆ Linear Combination of Saved Checkpoints Makes Consistency and Diffusion + Models Better + + +
+ Diffusion Models (DM) and Consistency Models (CM) are two types of popular generative models with good generation quality on various tasks. When training DM and CM, intermediate weight checkpoints are not fully utilized and only the last converged checkpoint is used. In this work, we find that high-quality model weights often lie in a basin which cannot be reached by SGD but can be obtained by proper checkpoint averaging. Based on these observations, we propose LCSC, a simple but effective and efficient method to enhance the performance of DM and CM by combining checkpoints along the training trajectory with coefficients deduced from evolutionary search. We demonstrate the value of LCSC through two use cases: $\textbf{(a) Reducing training cost.}$ With LCSC, we only need to train DM/CM with fewer iterations and/or smaller batch sizes to obtain sample quality comparable to the fully trained model. For example, LCSC achieves considerable training speedups for CM (23$\times$ on CIFAR-10 and 15$\times$ on ImageNet-64). $\textbf{(b) Enhancing pre-trained models.}$ Assuming full training is already done, LCSC can further improve the generation quality or speed of the final converged models. For example, LCSC achieves better performance using a single function evaluation (NFE) than the base model using 2 NFEs on consistency distillation, and decreases the NFE of DM from 15 to 9 while maintaining the generation quality on CIFAR-10. Our code is available at https://github.com/imagination-research/LCSC.
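+ As a rough illustration of the checkpoint-combination idea described above, the sketch below forms a weighted average of saved state dicts. The function name, paths, and equal weights are illustrative assumptions, and the paper's evolutionary search for the coefficients is not reproduced.
+
+ ```python
+ import torch
+
+ def combine_checkpoints(ckpt_paths, coeffs):
+     """Form a weighted combination of model state dicts.
+
+     Assumes each file stores a plain state_dict; the coefficient search
+     used in the paper is not shown here.
+     """
+     assert len(ckpt_paths) == len(coeffs)
+     combined = None
+     for path, w in zip(ckpt_paths, coeffs):
+         state = torch.load(path, map_location="cpu")
+         if combined is None:
+             combined = {k: w * v.float() for k, v in state.items()}
+         else:
+             for k, v in state.items():
+                 combined[k] += w * v.float()
+     return combined
+
+ # hypothetical usage: equal weights over the last four saved checkpoints
+ # model.load_state_dict(combine_checkpoints(paths, [0.25, 0.25, 0.25, 0.25]))
+ ```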
+
+
+
+
+ + ♻ ☆ S$^{5}$Mars: Semi-Supervised Learning for Mars Semantic Segmentation + + +
+ Deep learning has become a powerful tool for Mars exploration. Mars terrain semantic segmentation is an important Martian vision task and the basis of autonomous rover planning and safe driving. However, there is a lack of sufficiently detailed and high-confidence data annotations, which are exactly what most deep learning methods require to obtain a good model. To address this problem, we propose our solution from the perspective of joint data and method design. We first present a new dataset, S5Mars, for Semi-SuperviSed learning on Mars Semantic Segmentation, which contains 6K high-resolution images and is sparsely annotated based on confidence, ensuring the high quality of labels. Then, to learn from this sparse data, we propose a semi-supervised learning (SSL) framework for Mars image semantic segmentation, to learn representations from limited labeled data. Different from existing SSL methods, which are mostly targeted at Earth image data, our method takes Mars data characteristics into account. Specifically, we first investigate the impact of current widely used natural image augmentations on Mars images. Based on the analysis, we then propose two novel and effective augmentations for SSL of Mars segmentation, AugIN and SAM-Mix, which serve as strong augmentations to boost model performance. Meanwhile, to fully leverage the unlabeled data, we introduce a soft-to-hard consistency learning strategy, learning from different targets based on prediction confidence. Experimental results show that our method remarkably outperforms state-of-the-art SSL approaches. Our proposed dataset is available at https://jhang2020.github.io/S5Mars.github.io/.
+
+ comment: IEEE TGRS 2024 +
+
+
+
+
+ + ♻ ☆ OmniGS: Omnidirectional Gaussian Splatting for Fast Radiance Field + Reconstruction using Omnidirectional Images + + +
+ Photorealistic reconstruction relying on 3D Gaussian Splatting has shown +promising potential in robotics. However, the current 3D Gaussian Splatting +system only supports radiance field reconstruction using undistorted +perspective images. In this paper, we present OmniGS, a novel omnidirectional +Gaussian splatting system, to take advantage of omnidirectional images for fast +radiance field reconstruction. Specifically, we conduct a theoretical analysis +of spherical camera model derivatives in 3D Gaussian Splatting. According to +the derivatives, we then implement a new GPU-accelerated omnidirectional +rasterizer that directly splats 3D Gaussians onto the equirectangular screen +space for omnidirectional image rendering. As a result, we realize +differentiable optimization of the radiance field without the requirement of +cube-map rectification or tangent-plane approximation. Extensive experiments +conducted in egocentric and roaming scenarios demonstrate that our method +achieves state-of-the-art reconstruction quality and high rendering speed using +omnidirectional images. To benefit the research community, the code will be +made publicly available once the paper is published. + +
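+ For readers unfamiliar with the equirectangular screen space mentioned above, the sketch below shows the standard spherical projection of camera-frame points onto an equirectangular image. The axis conventions are assumptions, and the paper's GPU rasterizer and its analytic derivatives are not shown.
+
+ ```python
+ import numpy as np
+
+ def project_equirectangular(points, width, height):
+     """Project camera-frame 3D points onto an equirectangular image.
+
+     points: (N, 3) array. Uses a common convention (+z forward, +x right,
+     +y down); other conventions differ only by sign flips and offsets.
+     """
+     x, y, z = points[:, 0], points[:, 1], points[:, 2]
+     lon = np.arctan2(x, z)                                # azimuth in (-pi, pi]
+     lat = np.arcsin(y / np.linalg.norm(points, axis=1))   # elevation in [-pi/2, pi/2]
+     u = (lon / (2 * np.pi) + 0.5) * width
+     v = (lat / np.pi + 0.5) * height
+     return np.stack([u, v], axis=1)
+ ```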
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Single Mesh Diffusion Models with Field Latents for Texture Generation CVPR 2024 + + +
+ We introduce a framework for intrinsic latent diffusion models operating directly on the surfaces of 3D shapes, with the goal of synthesizing high-quality textures. Our approach is underpinned by two contributions: field latents, a latent representation encoding textures as discrete vector fields on the mesh vertices, and field latent diffusion models, which learn to denoise a diffusion process in the learned latent space on the surface. We consider a single-textured-mesh paradigm, where our models are trained to generate variations of a given texture on a mesh. We show the synthesized textures are of superior fidelity compared to those from existing single-textured-mesh generative models. Our models can also be adapted for user-controlled editing tasks such as inpainting and label-guided generation. The efficacy of our approach is due in part to the equivariance of our proposed framework under isometries, allowing our models to seamlessly reproduce details across locally similar regions and opening the door to a notion of generative texture transfer.
+
+ comment: CVPR 2024. Code and additional visualizations available: + https://single-mesh-diffusion.github.io/ +
+
+
+
+
+ + ♻ ☆ GMISeg: General Medical Image Segmentation without Re-Training + + +
+ Although deep learning models have become the main method for medical image segmentation, they often cannot be extended to unknown segmentation tasks involving new anatomical structures, image shapes, or labels. For new segmentation tasks, researchers often have to retrain or fine-tune the model, which is time-consuming and poses a significant obstacle to clinical researchers, who often lack the resources and professional knowledge to train neural networks. Therefore, we propose a general method that can solve unknown medical image segmentation tasks without requiring additional training. Given an example set of images and prompts defining a new segmentation task, GMISeg applies a novel low-rank fine-tuning strategy to the SAM (Segment Anything Model) image encoder and works with the prompt encoder and mask decoder on the labeled example set, without the need for task-specific retraining. To achieve generalization to new tasks, we used medical image datasets with different imaging modalities covering different anatomical regions. We trained GMISeg on one set of anatomical structures and imaging modalities and evaluated its generalization using cardiac images from other-site datasets. We demonstrate that GMISeg outperforms the latest methods on unknown tasks and provide a comprehensive analysis of the performance of the proposed method.
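+ The low-rank fine-tuning mentioned above is in the spirit of LoRA-style adapters. The sketch below shows a generic low-rank adapter wrapped around a frozen linear layer; the rank and scaling values are arbitrary choices, not values taken from the paper.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class LoRALinear(nn.Module):
+     """Low-rank adapter around a frozen linear layer (generic LoRA sketch)."""
+
+     def __init__(self, base: nn.Linear, rank: int = 4, alpha: float = 1.0):
+         super().__init__()
+         self.base = base
+         for p in self.base.parameters():
+             p.requires_grad = False          # keep the pretrained weights frozen
+         self.down = nn.Linear(base.in_features, rank, bias=False)
+         self.up = nn.Linear(rank, base.out_features, bias=False)
+         nn.init.zeros_(self.up.weight)       # start as an identity update
+         self.scale = alpha / rank
+
+     def forward(self, x):
+         return self.base(x) + self.scale * self.up(self.down(x))
+ ```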
+
+
+
+
+ + ♻ ☆ i-MAE: Are Latent Representations in Masked Autoencoders Linearly + Separable? + + +
+ Masked image modeling (MIM) has been recognized as a strong self-supervised +pre-training approach in the vision domain. However, the mechanism and +properties of the learned representations by such a scheme, as well as how to +further enhance the representations are so far not well-explored. In this +paper, we aim to explore an interactive Masked Autoencoders (i-MAE) framework +to enhance the representation capability from two aspects: (1) employing a +two-way image reconstruction and a latent feature reconstruction with +distillation loss to learn better features; (2) proposing a semantics-enhanced +sampling strategy to boost the learned semantics in MAE. Upon the proposed +i-MAE architecture, we can address two critical questions to explore the +behaviors of the learned representations in MAE: (1) Whether the separability +of latent representations in Masked Autoencoders is helpful for model +performance? We study it by forcing the input as a mixture of two images +instead of one. (2) Whether we can enhance the representations in the latent +feature space by controlling the degree of semantics during sampling on Masked +Autoencoders? To this end, we propose a sampling strategy within a mini-batch +based on the semantics of training samples to examine this aspect. Extensive +experiments are conducted on CIFAR-10/100, Tiny-ImageNet and ImageNet-1K to +verify the observations we discovered. Furthermore, in addition to +qualitatively analyzing the characteristics of the latent representations, we +examine the existence of linear separability and the degree of semantics in the +latent space by proposing two evaluation schemes. The surprising and consistent +results demonstrate that i-MAE is a superior framework design for understanding +MAE frameworks, as well as achieving better representational ability. Code is +available at https://github.com/vision-learning-acceleration-lab/i-mae. + +
+
+ comment: Project page: https://zhiqiangshen.com/projects/i-mae/ +
+
+
+
+
+ + ♻ ☆ Two Tricks to Improve Unsupervised Segmentation Learning + + +
+ We present two practical improvement techniques for unsupervised segmentation +learning. These techniques address limitations in the resolution and accuracy +of predicted segmentation maps of recent state-of-the-art methods. Firstly, we +leverage image post-processing techniques such as guided filtering to refine +the output masks, improving accuracy while avoiding substantial computational +costs. Secondly, we introduce a multi-scale consistency criterion, based on a +teacher-student training scheme. This criterion matches segmentation masks +predicted from regions of the input image extracted at different resolutions to +each other. Experimental results on several benchmarks used in unsupervised +segmentation learning demonstrate the effectiveness of our proposed techniques. + +
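+ A toy version of the multi-scale consistency criterion could look like the following, where a student prediction on a crop is matched to the teacher prediction restricted to the same region. The crop interface and loss choice are illustrative assumptions, not the authors' exact formulation.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def multiscale_consistency(student_logits, teacher_logits, crop_box):
+     """Match a student's prediction on a crop to the teacher's prediction there.
+
+     student_logits, teacher_logits: (B, K, H, W); crop_box = (top, left, h, w)
+     in the teacher's resolution.
+     """
+     top, left, h, w = crop_box
+     teacher_crop = teacher_logits[:, :, top:top + h, left:left + w]
+     teacher_crop = F.interpolate(teacher_crop, size=student_logits.shape[-2:],
+                                  mode="bilinear", align_corners=False)
+     return F.mse_loss(student_logits, teacher_crop.detach())
+ ```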
+
+
+
+
+ + ♻ ☆ Divide and Conquer: High-Resolution Industrial Anomaly Detection via + Memory Efficient Tiled Ensemble CVPR 24 + + +
+ Industrial anomaly detection is an important task within computer vision with +a wide range of practical use cases. The small size of anomalous regions in +many real-world datasets necessitates processing the images at a high +resolution. This frequently poses significant challenges concerning memory +consumption during the model training and inference stages, leaving some +existing methods impractical for widespread adoption. To overcome this +challenge, we present the tiled ensemble approach, which reduces memory +consumption by dividing the input images into a grid of tiles and training a +dedicated model for each tile location. The tiled ensemble is compatible with +any existing anomaly detection model without the need for any modification of +the underlying architecture. By introducing overlapping tiles, we utilize the +benefits of traditional stacking ensembles, leading to further improvements in +anomaly detection capabilities beyond high resolution alone. We perform a +comprehensive analysis using diverse underlying architectures, including Padim, +PatchCore, FastFlow, and Reverse Distillation, on two standard anomaly +detection datasets: MVTec and VisA. Our method demonstrates a notable +improvement across setups while remaining within GPU memory constraints, +consuming only as much GPU memory as a single model needs to process a single +tile. + +
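+ The tiling step itself is straightforward; a minimal sketch is shown below, assuming the image is larger than the tile size and using illustrative tile and stride values. The per-tile anomaly models and the merging of their outputs are omitted.
+
+ ```python
+ import torch
+
+ def extract_tiles(image, tile_size=256, stride=128):
+     """Split a (C, H, W) image into overlapping tiles and return their positions."""
+     c, h, w = image.shape
+     tiles, positions = [], []
+     for top in range(0, h - tile_size + 1, stride):
+         for left in range(0, w - tile_size + 1, stride):
+             tiles.append(image[:, top:top + tile_size, left:left + tile_size])
+             positions.append((top, left))
+     return torch.stack(tiles), positions
+ ```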
+
+ comment: To appear at CVPR 24 Visual Anomaly Detection Workshop. Research + conducted during Google Summer of Code 2023 at OpenVINO (Intel). GSoC 2023 + page: https://summerofcode.withgoogle.com/archive/2023/projects/WUSjdxGl +
+
+
+
+
+ + ♻ ☆ TrailBlazer: Trajectory Control for Diffusion-Based Video Generation + + +
+ Within recent approaches to text-to-video (T2V) generation, achieving +controllability in the synthesized video is often a challenge. Typically, this +issue is addressed by providing low-level per-frame guidance in the form of +edge maps, depth maps, or an existing video to be altered. However, the process +of obtaining such guidance can be labor-intensive. This paper focuses on +enhancing controllability in video synthesis by employing straightforward +bounding boxes to guide the subject in various ways, all without the need for +neural network training, finetuning, optimization at inference time, or the use +of pre-existing videos. Our algorithm, TrailBlazer, is constructed upon a +pre-trained (T2V) model, and easy to implement. The subject is directed by a +bounding box through the proposed spatial and temporal attention map editing. +Moreover, we introduce the concept of keyframing, allowing the subject +trajectory and overall appearance to be guided by both a moving bounding box +and corresponding prompts, without the need to provide a detailed mask. The +method is efficient, with negligible additional computation relative to the +underlying pre-trained model. Despite the simplicity of the bounding box +guidance, the resulting motion is surprisingly natural, with emergent effects +including perspective and movement toward the virtual camera as the box size +increases. + +
+
+ comment: 14 pages, 18 figures, Project Page: + https://hohonu-vicml.github.io/Trailblazer.Page/ +
+
+
+
+
+ + ♻ ☆ Knowledge Distillation via the Target-aware Transformer CVPR2022 + + +
+ Knowledge distillation has become a de facto standard to improve the performance of small neural networks. Most previous works propose to regress the representational features from the teacher to the student in a one-to-one spatial matching fashion. However, people tend to overlook the fact that, due to architecture differences, the semantic information at the same spatial location usually varies. This greatly undermines the underlying assumption of the one-to-one distillation approach. To this end, we propose a novel one-to-all spatial matching knowledge distillation approach. Specifically, we allow each pixel of the teacher feature to be distilled to all spatial locations of the student features given its similarity, which is generated from a target-aware transformer. Our approach surpasses the state-of-the-art methods by a significant margin on various computer vision benchmarks, such as ImageNet, Pascal VOC and COCOStuff10k. Code is available at https://github.com/sihaoevery/TaT.
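+ A much-simplified reading of one-to-all spatial matching, assuming teacher and student features share the same shape, is sketched below. The paper's target-aware transformer is replaced here by a plain similarity-weighted softmax purely for illustration.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def one_to_all_distillation(teacher_feat, student_feat):
+     """Toy one-to-all spatial matching loss for (B, C, H, W) features."""
+     b, c, h, w = teacher_feat.shape
+     t = teacher_feat.flatten(2).transpose(1, 2)   # (B, HW, C)
+     s = student_feat.flatten(2).transpose(1, 2)   # (B, HW, C)
+     sim = torch.softmax(t @ s.transpose(1, 2) / c ** 0.5, dim=-1)  # (B, HW, HW)
+     s_aligned = sim @ s                           # similarity-weighted mix of student locations
+     return F.mse_loss(s_aligned, t)
+ ```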
+
+ comment: CVPR2022(Oral) +
+
+
+
+
+ + ♻ ☆ PAIR-Diffusion: A Comprehensive Multimodal Object-Level Image Editor CVPR 2024 + + +
+ Generative image editing has recently witnessed extremely fast-paced growth. +Some works use high-level conditioning such as text, while others use low-level +conditioning. Nevertheless, most of them lack fine-grained control over the +properties of the different objects present in the image, i.e. object-level +image editing. In this work, we tackle the task by perceiving the images as an +amalgamation of various objects and aim to control the properties of each +object in a fine-grained manner. Out of these properties, we identify structure +and appearance as the most intuitive to understand and useful for editing +purposes. We propose PAIR Diffusion, a generic framework that can enable a +diffusion model to control the structure and appearance properties of each +object in the image. We show that having control over the properties of each +object in an image leads to comprehensive editing capabilities. Our framework +allows for various object-level editing operations on real images such as +reference image-based appearance editing, free-form shape editing, adding +objects, and variations. Thanks to our design, we do not require any inversion +step. Additionally, we propose multimodal classifier-free guidance which +enables editing images using both reference images and text when using our +approach with foundational diffusion models. We validate the above claims by +extensively evaluating our framework on both unconditional and foundational +diffusion models. Please refer to +https://vidit98.github.io/publication/conference-paper/pair_diff.html for code +and model release. + +
+
+ comment: Accepted in CVPR 2024, Project page + https://vidit98.github.io/publication/conference-paper/pair_diff.html +
+
+
+
+
+ + ♻ ☆ DGInStyle: Domain-Generalizable Semantic Segmentation with Image + Diffusion Models and Stylized Semantic Control + + +
+ Large, pretrained latent diffusion models (LDMs) have demonstrated an +extraordinary ability to generate creative content, specialize to user data +through few-shot fine-tuning, and condition their output on other modalities, +such as semantic maps. However, are they usable as large-scale data generators, +e.g., to improve tasks in the perception stack, like semantic segmentation? We +investigate this question in the context of autonomous driving, and answer it +with a resounding "yes". We propose an efficient data generation pipeline +termed DGInStyle. First, we examine the problem of specializing a pretrained +LDM to semantically-controlled generation within a narrow domain. Second, we +propose a Style Swap technique to endow the rich generative prior with the +learned semantic control. Third, we design a Multi-resolution Latent Fusion +technique to overcome the bias of LDMs towards dominant objects. Using +DGInStyle, we generate a diverse dataset of street scenes, train a +domain-agnostic semantic segmentation model on it, and evaluate the model on +multiple popular autonomous driving datasets. Our approach consistently +increases the performance of several domain generalization methods compared to +the previous state-of-the-art methods. Source code and dataset are available at +https://dginstyle.github.io. + +
+
+
+
+
+ + ♻ ☆ No "Zero-Shot" Without Exponential Data: Pretraining Concept Frequency + Determines Multimodal Model Performance ICLR'24 + + +
+ Web-crawled pretraining datasets underlie the impressive "zero-shot" +evaluation performance of multimodal models, such as CLIP for +classification/retrieval and Stable-Diffusion for image generation. However, it +is unclear how meaningful the notion of "zero-shot" generalization is for such +multimodal models, as it is not known to what extent their pretraining datasets +encompass the downstream concepts targeted for during "zero-shot" evaluation. +In this work, we ask: How is the performance of multimodal models on downstream +concepts influenced by the frequency of these concepts in their pretraining +datasets? We comprehensively investigate this question across 34 models and +five standard pretraining datasets (CC-3M, CC-12M, YFCC-15M, LAION-400M, +LAION-Aesthetics), generating over 300GB of data artifacts. We consistently +find that, far from exhibiting "zero-shot" generalization, multimodal models +require exponentially more data to achieve linear improvements in downstream +"zero-shot" performance, following a sample inefficient log-linear scaling +trend. This trend persists even when controlling for sample-level similarity +between pretraining and downstream datasets, and testing on purely synthetic +data distributions. Furthermore, upon benchmarking models on long-tailed data +sampled based on our analysis, we demonstrate that multimodal models across the +board perform poorly. We contribute this long-tail test set as the "Let it +Wag!" benchmark to further research in this direction. Taken together, our +study reveals an exponential need for training data which implies that the key +to "zero-shot" generalization capabilities under large-scale training paradigms +remains to be found. + +
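+ The log-linear trend described above can be checked on one's own measurements with a simple fit. The sketch below assumes pre-computed concept frequencies and accuracies and is not the paper's measurement pipeline.
+
+ ```python
+ import numpy as np
+
+ def fit_log_linear(concept_freq, zero_shot_acc):
+     """Fit accuracy ~ a * log(frequency) + b and return the coefficients."""
+     x = np.log(np.asarray(concept_freq, dtype=float))
+     y = np.asarray(zero_shot_acc, dtype=float)
+     a, b = np.polyfit(x, y, deg=1)   # slope a, intercept b
+     return a, b
+ ```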
+
+ comment: Extended version of the short paper accepted at DPFM, ICLR'24 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 121 + +
+
+
+ + ☆ Reconstructing Retinal Visual Images from 3T fMRI Data Enhanced by + Unsupervised Learning + + +
+ The reconstruction of human visual inputs from brain activity, particularly +through functional Magnetic Resonance Imaging (fMRI), holds promising avenues +for unraveling the mechanisms of the human visual system. Despite the +significant strides made by deep learning methods in improving the quality and +interpretability of visual reconstruction, there remains a substantial demand +for high-quality, long-duration, subject-specific 7-Tesla fMRI experiments. The +challenge arises in integrating diverse smaller 3-Tesla datasets or +accommodating new subjects with brief and low-quality fMRI scans. In response +to these constraints, we propose a novel framework that generates enhanced 3T +fMRI data through an unsupervised Generative Adversarial Network (GAN), +leveraging unpaired training across two distinct fMRI datasets in 7T and 3T, +respectively. This approach aims to overcome the limitations of the scarcity of +high-quality 7-Tesla data and the challenges associated with brief and +low-quality scans in 3-Tesla experiments. In this paper, we demonstrate the +reconstruction capabilities of the enhanced 3T fMRI data, highlighting its +proficiency in generating superior input visual images compared to +data-intensive methods trained and tested on a single subject. + +
+
+ comment: Accepted by ISBI 2024 +
+
+
+
+
+ + ☆ VMambaMorph: a Visual Mamba-based Framework with Cross-Scan Module for + Deformable 3D Image Registration + + +
+ Image registration, a critical process in medical imaging, involves aligning +different sets of medical imaging data into a single unified coordinate system. +Deep learning networks, such as the Convolutional Neural Network (CNN)-based +VoxelMorph, Vision Transformer (ViT)-based TransMorph, and State Space Model +(SSM)-based MambaMorph, have demonstrated effective performance in this domain. +The recent Visual State Space Model (VMamba), which incorporates a cross-scan +module with SSM, has exhibited promising improvements in modeling global-range +dependencies with efficient computational cost in computer vision tasks. This +paper hereby introduces an exploration of VMamba with image registration, named +VMambaMorph. This novel hybrid VMamba-CNN network is designed specifically for +3D image registration. Utilizing a U-shaped network architecture, VMambaMorph +computes the deformation field based on target and source volumes. The +VMamba-based block with 2D cross-scan module is redesigned for 3D volumetric +feature processing, and a fine-grained feature extraction module is proposed +for high-dimensional feature learning. We validate VMambaMorph using a public +benchmark brain MR-CT registration dataset, comparing its performance against +current state-of-the-art methods. The results indicate that VMambaMorph +achieves competitive registration quality. The code for VMambaMorph is +available on GitHub. + +
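+ Registration networks of this kind typically apply the predicted deformation field with a spatial-transformer style warp. The sketch below shows that standard warping step with displacements given in voxels; it is generic reference code, not code from the paper.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def warp_volume(moving, flow):
+     """Warp a moving volume (B, C, D, H, W) with a dense flow (B, 3, D, H, W)."""
+     b, _, d, h, w = moving.shape
+     zz, yy, xx = torch.meshgrid(
+         torch.arange(d), torch.arange(h), torch.arange(w), indexing="ij")
+     grid = torch.stack((zz, yy, xx), dim=0).float().to(moving.device)  # (3, D, H, W)
+     coords = grid.unsqueeze(0) + flow                                  # displaced voxel coords
+     # normalize to [-1, 1] and reorder to (x, y, z) as grid_sample expects
+     norm = torch.tensor([d - 1.0, h - 1.0, w - 1.0],
+                         device=moving.device).view(1, 3, 1, 1, 1)
+     coords = 2.0 * coords / norm - 1.0
+     sample_grid = coords.permute(0, 2, 3, 4, 1)[..., [2, 1, 0]]
+     return F.grid_sample(moving, sample_grid, align_corners=True)
+ ```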
+
+
+
+
+ + ☆ LHU-Net: A Light Hybrid U-Net for Cost-Efficient, High-Performance + Volumetric Medical Image Segmentation + + +
+ As a result of the rise of Transformer architectures in medical image +analysis, specifically in the domain of medical image segmentation, a multitude +of hybrid models have been created that merge the advantages of Convolutional +Neural Networks (CNNs) and Transformers. These hybrid models have achieved +notable success by significantly improving segmentation accuracy. Yet, this +progress often comes at the cost of increased model complexity, both in terms +of parameters and computational demand. Moreover, many of these models fail to +consider the crucial interplay between spatial and channel features, which +could further refine and improve segmentation outcomes. To address this, we +introduce LHU-Net, a Light Hybrid U-Net architecture optimized for volumetric +medical image segmentation. LHU-Net is meticulously designed to prioritize +spatial feature analysis in its initial layers before shifting focus to +channel-based features in its deeper layers, ensuring a comprehensive feature +extraction process. Rigorous evaluation across five benchmark datasets - +Synapse, LA, Pancreas, ACDC, and BRaTS 2018 - underscores LHU-Net's superior +performance, showcasing its dual capacity for efficiency and accuracy. Notably, +LHU-Net sets new performance benchmarks, such as attaining a Dice score of +92.66 on the ACDC dataset, while simultaneously reducing parameters by 85% and +quartering the computational load compared to existing state-of-the-art models. +Achieved without any reliance on pre-training, additional data, or model +ensemble, LHU-Net's effectiveness is further evidenced by its state-of-the-art +performance across all evaluated datasets, utilizing fewer than 11 million +parameters. This achievement highlights that balancing computational efficiency +with high accuracy in medical image segmentation is feasible. Our +implementation of LHU-Net is freely accessible to the research community on +GitHub. + +
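+ As a generic illustration of the channel-based attention the abstract refers to in the deeper layers, a squeeze-and-excitation style module for 3D feature maps might look as follows; the reduction ratio is an arbitrary choice, not a value from the paper.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class ChannelAttention(nn.Module):
+     """Squeeze-and-excitation style channel attention for (B, C, D, H, W) features."""
+
+     def __init__(self, channels: int, reduction: int = 8):
+         super().__init__()
+         self.pool = nn.AdaptiveAvgPool3d(1)
+         self.mlp = nn.Sequential(
+             nn.Linear(channels, channels // reduction),
+             nn.ReLU(inplace=True),
+             nn.Linear(channels // reduction, channels),
+             nn.Sigmoid(),
+         )
+
+     def forward(self, x):
+         w = self.mlp(self.pool(x).flatten(1))          # (B, C) channel weights
+         return x * w.view(x.size(0), -1, 1, 1, 1)      # reweight each channel
+ ```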
+
+
+
+
+ + ☆ HaVTR: Improving Video-Text Retrieval Through Augmentation Using Large + Foundation Models + + +
+ While recent progress in video-text retrieval has been driven by the +exploration of powerful model architectures and training strategies, the +representation learning ability of video-text retrieval models is still limited +due to low-quality and scarce training data annotations. To address this issue, +we present a novel video-text learning paradigm, HaVTR, which augments video +and text data to learn more generalized features. Specifically, we first adopt +a simple augmentation method, which generates self-similar data by randomly +duplicating or dropping subwords and frames. In addition, inspired by the +recent advancement in visual and language generative models, we propose a more +powerful augmentation method through textual paraphrasing and video stylization +using large language models (LLMs) and visual generative models (VGMs). +Further, to bring richer information into video and text, we propose a +hallucination-based augmentation method, where we use LLMs and VGMs to generate +and add new relevant information to the original data. Benefiting from the +enriched data, extensive experiments on several video-text retrieval benchmarks +demonstrate the superiority of HaVTR over existing methods. + +
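+ The simple augmentation of randomly duplicating or dropping subwords and frames can be pictured with a short sketch; the probabilities below are illustrative, not the paper's settings.
+
+ ```python
+ import random
+
+ def augment_tokens(tokens, p_drop=0.1, p_dup=0.1):
+     """Randomly drop or duplicate items in a token (or frame-index) sequence."""
+     out = []
+     for tok in tokens:
+         r = random.random()
+         if r < p_drop:
+             continue                 # drop this token/frame
+         out.append(tok)
+         if r > 1.0 - p_dup:
+             out.append(tok)          # duplicate it
+     return out or list(tokens)       # never return an empty sequence
+ ```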
+
+
+
+
+ + ☆ Spatial Cognition from Egocentric Video: Out of Sight, Not Out of Mind + + +
+ As humans move around, performing their daily tasks, they are able to recall +where they have positioned objects in their environment, even if these objects +are currently out of sight. In this paper, we aim to mimic this spatial +cognition ability. We thus formulate the task of Out of Sight, Not Out of Mind +- 3D tracking active objects using observations captured through an egocentric +camera. We introduce Lift, Match and Keep (LMK), a method which lifts partial +2D observations to 3D world coordinates, matches them over time using visual +appearance, 3D location and interactions to form object tracks, and keeps these +object tracks even when they go out-of-view of the camera - hence keeping in +mind what is out of sight. We test LMK on 100 long videos from EPIC-KITCHENS. +Our results demonstrate that spatial cognition is critical for correctly +locating objects over short and long time scales. E.g., for one long egocentric +video, we estimate the 3D location of 50 active objects. Of these, 60% can be +correctly positioned in 3D after 2 minutes of leaving the camera view. + +
+
+ comment: 21 pages including references and appendix. Project Webpage: + http://dimadamen.github.io/OSNOM/ +
+
+
+
+
+ + ☆ AirShot: Efficient Few-Shot Detection for Autonomous Exploration + + +
+ Few-shot object detection has drawn increasing attention in the field of robotic exploration, where robots are required to find unseen objects with a few online provided examples. Although recent efforts have been made to yield online processing capabilities, the slow inference speeds of low-powered robots fail to meet the demands of real-time detection, making them impractical for autonomous exploration. Existing methods still face performance and efficiency challenges, mainly due to unreliable features and exhaustive class loops. In this work, we propose a new paradigm, AirShot, and discover that, by fully exploiting the valuable correlation map, AirShot can result in a more robust and faster few-shot object detection system, which is more applicable to the robotics community. The core module, Top Prediction Filter (TPF), can operate on multi-scale correlation maps in both the training and inference stages. During training, TPF supervises the generation of a more representative correlation map, while during inference, it reduces looping iterations by selecting top-ranked classes, thus cutting down on computational costs with better performance. Surprisingly, this dual functionality exhibits general effectiveness and efficiency on various off-the-shelf models. Exhaustive experiments on the COCO2017, VOC2014, and SubT datasets demonstrate that TPF can significantly boost the efficacy and efficiency of most off-the-shelf models, achieving up to 36.4% precision improvements along with 56.3% faster inference speed. Code and data are at: https://github.com/ImNotPrepared/AirShot.
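+ One schematic reading of the class-filtering idea is to rank candidate classes by the peak of their correlation maps and keep only the top few before the detection loop. The sketch below follows that reading only; it is not the authors' module.
+
+ ```python
+ import torch
+
+ def top_prediction_filter(correlation_maps, k=5):
+     """Keep only the top-scoring candidate classes before the per-class loop.
+
+     correlation_maps: dict {class_name: (H, W) correlation map tensor}.
+     """
+     scores = {name: float(m.max()) for name, m in correlation_maps.items()}
+     keep = sorted(scores, key=scores.get, reverse=True)[:k]
+     return {name: correlation_maps[name] for name in keep}
+ ```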
+
+
+
+
+ + ☆ AUEditNet: Dual-Branch Facial Action Unit Intensity Manipulation with + Implicit Disentanglement + + +
+ Facial action unit (AU) intensity plays a pivotal role in quantifying +fine-grained expression behaviors, which is an effective condition for facial +expression manipulation. However, publicly available datasets containing +intensity annotations for multiple AUs remain severely limited, often featuring +a restricted number of subjects. This limitation places challenges to the AU +intensity manipulation in images due to disentanglement issues, leading +researchers to resort to other large datasets with pretrained AU intensity +estimators for pseudo labels. In addressing this constraint and fully +leveraging manual annotations of AU intensities for precise manipulation, we +introduce AUEditNet. Our proposed model achieves impressive intensity +manipulation across 12 AUs, trained effectively with only 18 subjects. +Utilizing a dual-branch architecture, our approach achieves comprehensive +disentanglement of facial attributes and identity without necessitating +additional loss functions or implementing with large batch sizes. This approach +offers a potential solution to achieve desired facial attribute editing despite +the dataset's limited subject count. Our experiments demonstrate AUEditNet's +superior accuracy in editing AU intensities, affirming its capability in +disentangling facial attributes and identity within a limited subject pool. +AUEditNet allows conditioning by either intensity values or target images, +eliminating the need for constructing AU combinations for specific facial +expression synthesis. Moreover, AU intensity estimation, as a downstream task, +validates the consistency between real and edited images, confirming the +effectiveness of our proposed AU intensity manipulation method. + +
+
+
+
+
+ + ☆ Automated Prediction of Breast Cancer Response to Neoadjuvant + Chemotherapy from DWI Data + + +
+ Effective surgical planning for breast cancer hinges on accurately predicting +pathological complete response (pCR) to neoadjuvant chemotherapy (NAC). +Diffusion-weighted MRI (DWI) and machine learning offer a non-invasive approach +for early pCR assessment. However, most machine-learning models require manual +tumor segmentation, a cumbersome and error-prone task. We propose a deep +learning model employing "Size-Adaptive Lesion Weighting" for automatic DWI +tumor segmentation to enhance pCR prediction accuracy. Despite +histopathological changes during NAC complicating DWI image segmentation, our +model demonstrates robust performance. Utilizing the BMMR2 challenge dataset, +it matches human experts in pCR prediction pre-NAC with an area under the curve +(AUC) of 0.76 vs. 0.796, and surpasses standard automated methods mid-NAC, with +an AUC of 0.729 vs. 0.654 and 0.576. Our approach represents a significant +advancement in automating breast cancer treatment planning, enabling more +reliable pCR predictions without manual segmentation. + +
+
+ comment: Accepted for presentation at the IEEE International Symposium on + Biomedical Imaging (ISBI) +
+
+
+
+
+ + ☆ Facial Affective Behavior Analysis with Instruction Tuning + + +
+ Facial affective behavior analysis (FABA) is crucial for understanding human +mental states from images. However, traditional approaches primarily deploy +models to discriminate among discrete emotion categories, and lack the fine +granularity and reasoning capability for complex facial behaviors. The advent +of Multi-modal Large Language Models (MLLMs) has been proven successful in +general visual understanding tasks. However, directly harnessing MLLMs for FABA +is challenging due to the scarcity of datasets and benchmarks, neglecting +facial prior knowledge, and low training efficiency. To address these +challenges, we introduce (i) an instruction-following dataset for two FABA +tasks, e.g., emotion and action unit recognition, (ii) a benchmark FABA-Bench +with a new metric considering both recognition and generation ability, and +(iii) a new MLLM "EmoLA" as a strong baseline to the community. Our initiative +on the dataset and benchmarks reveal the nature and rationale of facial +affective behaviors, i.e., fine-grained facial movement, interpretability, and +reasoning. Moreover, to build an effective and efficient FABA MLLM, we +introduce a facial prior expert module with face structure knowledge and a +low-rank adaptation module into pre-trained MLLM. We conduct extensive +experiments on FABA-Bench and four commonly-used FABA datasets. The results +demonstrate that the proposed facial prior expert can boost the performance and +EmoLA achieves the best results on our FABA-Bench. On commonly-used FABA +datasets, EmoLA is competitive rivaling task-specific state-of-the-art models. + +
+
+ comment: V1.0 +
+
+
+
+
+ + ☆ PlateSegFL: A Privacy-Preserving License Plate Detection Using Federated + Segmentation Learning + + +
+ Automatic License Plate Recognition (ALPR) is an integral component of an intelligent transport system with extensive applications in secure transportation, vehicle-to-vehicle communication, stolen vehicle detection, traffic violations, and traffic flow management. Existing license plate detection systems focus on one-shot learners or pre-trained models that operate with a geometric bounding box, limiting the model's performance. Furthermore, continuous video data streams uploaded to a central server result in network and complexity issues. To combat this, PlateSegFL was introduced, which implements U-Net-based segmentation along with Federated Learning (FL). U-Net is well-suited for multi-class image segmentation tasks because it can analyze a large number of classes and generate a pixel-level segmentation map for each class. Federated Learning is used to reduce the quantity of data required while safeguarding the user's privacy. Different computing platforms, such as mobile phones, are able to collaborate on the development of a shared prediction model that makes efficient use of time, incorporates more diverse data, delivers predictions in real time, and requires no physical effort from the user, achieving an F1 score of around 95%.
+
+
+
+
+ + ☆ FGAIF: Aligning Large Vision-Language Models with Fine-grained AI + Feedback + + +
+ Large Vision-Language Models (LVLMs) have demonstrated proficiency in tackling a variety of visual-language tasks. However, current LVLMs suffer from misalignment between the text and image modalities, which causes three kinds of hallucination problems, i.e., object existence, object attribute, and object relationship. To tackle this issue, existing methods mainly utilize Reinforcement Learning (RL) to align modalities in LVLMs. However, they still suffer from three main limitations: (1) general feedback cannot indicate the hallucination type contained in the response; (2) sparse rewards only give a sequence-level reward for the whole response; and (3) annotation is time-consuming and labor-intensive. To handle these limitations, we propose an innovative method to align modalities in LVLMs through Fine-Grained Artificial Intelligence Feedback (FGAIF), which mainly consists of three steps: AI-based Feedback Collection, Fine-grained Reward Model Training, and Reinforcement Learning with Fine-grained Reward. Specifically, we first utilize AI tools to predict the type of hallucination for each segment in the response and obtain a collection of fine-grained feedback. Then, based on the collected reward data, three specialized reward models are trained to produce dense rewards. Finally, a novel fine-grained feedback module is integrated into the Proximal Policy Optimization (PPO) algorithm. Extensive experiments are conducted on hallucination and general benchmarks, demonstrating the superior performance of our proposed method. Notably, compared with previous models trained with RL-based aligning methods, our proposed method is effective even with fewer parameters.
+
+
+
+
+ + ☆ LOGO: A Long-Form Video Dataset for Group Action Quality Assessment CVPR 2023 + + +
+ Action quality assessment (AQA) has become an emerging topic since it can be +extensively applied in numerous scenarios. However, most existing methods and +datasets focus on single-person short-sequence scenes, hindering the +application of AQA in more complex situations. To address this issue, we +construct a new multi-person long-form video dataset for action quality +assessment named LOGO. Distinguished in scenario complexity, our dataset +contains 200 videos from 26 artistic swimming events with 8 athletes in each +sample along with an average duration of 204.2 seconds. As for richness in +annotations, LOGO includes formation labels to depict group information of +multiple athletes and detailed annotations on action procedures. Furthermore, +we propose a simple yet effective method to model relations among athletes and +reason about the potential temporal logic in long-form videos. Specifically, we +design a group-aware attention module, which can be easily plugged into +existing AQA methods, to enrich the clip-wise representations based on +contextual group information. To benchmark LOGO, we systematically conduct +investigations on the performance of several popular methods in AQA and action +segmentation. The results reveal the challenges our dataset brings. Extensive +experiments also show that our approach achieves state-of-the-art on the LOGO +dataset. The dataset and code will be released at +\url{https://github.com/shiyi-zh0408/LOGO }. + +
+
+ comment: Accepted by CVPR 2023 +
+
+
+
+
+ + ☆ PathFinder: Attention-Driven Dynamic Non-Line-of-Sight Tracking with a + Mobile Robot + + +
+ The study of non-line-of-sight (NLOS) imaging is growing due to its many +potential applications, including rescue operations and pedestrian detection by +self-driving cars. However, implementing NLOS imaging on a moving camera +remains an open area of research. Existing NLOS imaging methods rely on +time-resolved detectors and laser configurations that require precise optical +alignment, making it difficult to deploy them in dynamic environments. This +work proposes a data-driven approach to NLOS imaging, PathFinder, that can be +used with a standard RGB camera mounted on a small, power-constrained mobile +robot, such as an aerial drone. Our experimental pipeline is designed to +accurately estimate the 2D trajectory of a person who moves in a +Manhattan-world environment while remaining hidden from the camera's +field-of-view. We introduce a novel approach to process a sequence of dynamic +successive frames in a line-of-sight (LOS) video using an attention-based +neural network that performs inference in real-time. The method also includes a +preprocessing selection metric that analyzes images from a moving camera which +contain multiple vertical planar surfaces, such as walls and building facades, +and extracts planes that return maximum NLOS information. We validate the +approach on in-the-wild scenes using a drone for video capture, thus +demonstrating low-cost NLOS imaging in dynamic capture environments. + +
+
+ comment: First two authors have equal contribution +
+
+
+
+
+ + ☆ Scalable and Efficient Hierarchical Visual Topological Mapping + + +
+ Hierarchical topological representations can significantly reduce search +times within mapping and localization algorithms. Although recent research has +shown the potential for such approaches, limited consideration has been given +to the suitability and comparative performance of different global feature +representations within this context. In this work, we evaluate state-of-the-art +hand-crafted and learned global descriptors using a hierarchical topological +mapping technique on benchmark datasets and present results of a comprehensive +evaluation of the impact of the global descriptor used. Although learned +descriptors have been incorporated into place recognition methods to improve +retrieval accuracy and enhance overall recall, the problem of scalability and +efficiency when applied to longer trajectories has not been adequately +addressed in a majority of research studies. Based on our empirical analysis of +multiple runs, we identify that continuity and distinctiveness are crucial +characteristics for an optimal global descriptor that enable efficient and +scalable hierarchical mapping, and present a methodology for quantifying and +contrasting these characteristics across different global descriptors. Our +study demonstrates that the use of global descriptors based on an unsupervised +learned Variational Autoencoder (VAE) excels in these characteristics and +achieves significantly lower runtime. It runs on a consumer grade desktop, up +to 2.3x faster than the second best global descriptor, NetVLAD, and up to 9.5x +faster than the hand-crafted descriptor, PHOG, on the longest track evaluated +(St Lucia, 17.6 km), without sacrificing overall recall performance. + +
+
+ comment: Published in the 21st International Conference on Advanced Robotics + (ICAR 2023) +
+
+
+
+
+ + ☆ DinoBloom: A Foundation Model for Generalizable Cell Embeddings in + Hematology + + +
+ In hematology, computational models offer significant potential to improve +diagnostic accuracy, streamline workflows, and reduce the tedious work of +analyzing single cells in peripheral blood or bone marrow smears. However, +clinical adoption of computational models has been hampered by the lack of +generalization due to large batch effects, small dataset sizes, and poor +performance in transfer learning from natural images. To address these +challenges, we introduce DinoBloom, the first foundation model for single cell +images in hematology, utilizing a tailored DINOv2 pipeline. Our model is built +upon an extensive collection of 13 diverse, publicly available datasets of +peripheral blood and bone marrow smears, the most substantial open-source +cohort in hematology so far, comprising over 380,000 white blood cell images. +To assess its generalization capability, we evaluate it on an external dataset +with a challenging domain shift. We show that our model outperforms existing +medical and non-medical vision models in (i) linear probing and k-nearest +neighbor evaluations for cell-type classification on blood and bone marrow +smears and (ii) weakly supervised multiple instance learning for acute myeloid +leukemia subtyping by a large margin. A family of four DinoBloom models (small, +base, large, and giant) can be adapted for a wide range of downstream +applications, be a strong baseline for classification problems, and facilitate +the assessment of batch effects in new datasets. All models are available at +github.com/marrlab/DinoBloom. + +
+
+
+
+
+ + ☆ Hyperbolic Learning with Synthetic Captions for Open-World Detection CVPR 2024 + + +
+ Open-world detection poses significant challenges, as it requires the +detection of any object using either object class labels or free-form texts. +Existing related works often use large-scale manual annotated caption datasets +for training, which are extremely expensive to collect. Instead, we propose to +transfer knowledge from vision-language models (VLMs) to enrich the +open-vocabulary descriptions automatically. Specifically, we bootstrap dense +synthetic captions using pre-trained VLMs to provide rich descriptions on +different regions in images, and incorporate these captions to train a novel +detector that generalizes to novel concepts. To mitigate the noise caused by +hallucination in synthetic captions, we also propose a novel hyperbolic +vision-language learning approach to impose a hierarchy between visual and +caption embeddings. We call our detector ``HyperLearner''. We conduct extensive +experiments on a wide variety of open-world detection benchmarks (COCO, LVIS, +Object Detection in the Wild, RefCOCO) and our results show that our model +consistently outperforms existing state-of-the-art methods, such as GLIP, +GLIPv2 and Grounding DINO, when using the same backbone. + +
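+ Hyperbolic vision-language learning usually builds on a hyperbolic distance such as the Poincaré-ball metric; a minimal implementation of that distance is sketched below for orientation, without claiming it matches the paper's exact formulation of the visual-caption hierarchy.
+
+ ```python
+ import torch
+
+ def poincare_distance(u, v, eps=1e-5):
+     """Distance between points u, v (..., D) inside the unit Poincaré ball."""
+     sq = torch.sum((u - v) ** 2, dim=-1)
+     nu = torch.clamp(1 - torch.sum(u ** 2, dim=-1), min=eps)
+     nv = torch.clamp(1 - torch.sum(v ** 2, dim=-1), min=eps)
+     return torch.acosh(1 + 2 * sq / (nu * nv))
+ ```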
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ MagicTime: Time-lapse Video Generation Models as Metamorphic Simulators + + +
+ Recent advances in Text-to-Video generation (T2V) have achieved remarkable +success in synthesizing high-quality general videos from textual descriptions. +A largely overlooked problem in T2V is that existing models have not adequately +encoded physical knowledge of the real world, thus generated videos tend to +have limited motion and poor variations. In this paper, we propose +\textbf{MagicTime}, a metamorphic time-lapse video generation model, which +learns real-world physics knowledge from time-lapse videos and implements +metamorphic generation. First, we design a MagicAdapter scheme to decouple +spatial and temporal training, encode more physical knowledge from metamorphic +videos, and transform pre-trained T2V models to generate metamorphic videos. +Second, we introduce a Dynamic Frames Extraction strategy to adapt to +metamorphic time-lapse videos, which have a wider variation range and cover +dramatic object metamorphic processes, thus embodying more physical knowledge +than general videos. Finally, we introduce a Magic Text-Encoder to improve the +understanding of metamorphic video prompts. Furthermore, we create a time-lapse +video-text dataset called \textbf{ChronoMagic}, specifically curated to unlock +the metamorphic video generation ability. Extensive experiments demonstrate the +superiority and effectiveness of MagicTime for generating high-quality and +dynamic metamorphic videos, suggesting time-lapse video generation is a +promising path toward building metamorphic simulators of the physical world. + +
+
+
+
+
+ + ☆ Camera-Based Remote Physiology Sensing for Hundreds of Subjects Across + Skin Tones + + +
+ Remote photoplethysmography (rPPG) emerges as a promising method for non-invasive, convenient measurement of vital signs, utilizing the widespread presence of cameras. Despite advancements, existing datasets fall short in terms of size and diversity, limiting comprehensive evaluation under diverse conditions. This paper presents an in-depth analysis of the VitalVideo dataset, the largest real-world rPPG dataset to date, encompassing 893 subjects and 6 Fitzpatrick skin tones. Our experimentation with six unsupervised methods and three supervised models demonstrates that datasets comprising a few hundred subjects (i.e., 300 for UBFC-rPPG, 500 for PURE, and 700 for MMPD-Simple) are sufficient for effective rPPG model training. Our findings highlight the importance of diversity and consistency in skin tones for precise performance evaluation across different datasets.
+
+ comment: 11 pages, 5 figures, CHI24 Workshop PhysioCHI +
+
+
+
+
+ + ☆ Dual-Scale Transformer for Large-Scale Single-Pixel Imaging CVPR 2024 + + +
+ Single-pixel imaging (SPI) is a promising computational imaging technique which produces an image by solving an ill-posed reconstruction problem from a few measurements captured by a single-pixel detector. Deep learning has achieved impressive success on SPI reconstruction. However, the poor reconstruction performance and impractical imaging models of previous work limit its real-world applications. In this paper, we propose a deep unfolding network with a hybrid-attention Transformer on the Kronecker SPI model, dubbed HATNet, to improve the imaging quality of real SPI cameras. Specifically, we unfold the computation graph of the iterative shrinkage-thresholding algorithm (ISTA) into two alternating modules: efficient tensor gradient descent and hybrid-attention multiscale denoising. By virtue of Kronecker SPI, the gradient descent module can avoid the high computational overhead rooted in previous gradient descent modules based on vectorized SPI. The denoising module is an encoder-decoder architecture powered by dual-scale spatial attention for high- and low-frequency aggregation and channel attention for global information recalibration. Moreover, we build a SPI prototype to verify the effectiveness of the proposed method. Extensive experiments on synthetic and real data demonstrate that our method achieves state-of-the-art performance. The source code and pre-trained models are available at https://github.com/Gang-Qu/HATNet-SPI.
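+ For reference, one classical ISTA update (the iteration that deep unfolding networks unroll into learned modules) is sketched below; in the unfolded network the soft-threshold would be replaced by a learned denoiser, and the Kronecker-structured operators are not shown.
+
+ ```python
+ import torch
+
+ def ista_step(x, y, A, step, lam):
+     """One iterative shrinkage-thresholding update for y ~ A x.
+
+     A: (M, N) sensing matrix, x: (N,) current estimate, y: (M,) measurements.
+     """
+     grad = A.t() @ (A @ x - y)          # gradient of 0.5 * ||A x - y||^2
+     z = x - step * grad                 # gradient descent step
+     return torch.sign(z) * torch.clamp(z.abs() - step * lam, min=0.0)  # soft threshold
+ ```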
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Weakly Supervised Deep Hyperspherical Quantization for Image Retrieval AAAI 2021 + + +
+ Deep quantization methods have shown high efficiency on large-scale image retrieval. However, current models heavily rely on ground-truth information, hindering the application of quantization in label-hungry scenarios. A more realistic demand is to learn from inexhaustible uploaded images that are associated with informal tags provided by amateur users. Though such sketchy tags do not obviously reveal the labels, they actually contain useful semantic information for supervising deep quantization. To this end, we propose Weakly-Supervised Deep Hyperspherical Quantization (WSDHQ), which is the first work to learn deep quantization from weakly tagged images. Specifically, 1) we use word embeddings to represent the tags and enhance their semantic information based on a tag correlation graph; 2) to better preserve semantic information in quantization codes and reduce quantization error, we jointly learn semantics-preserving embeddings and a supervised quantizer on the hypersphere by employing a well-designed fusion layer and tailor-made loss functions. Extensive experiments show that WSDHQ achieves state-of-the-art performance on weakly-supervised compact coding. Code is available at https://github.com/gimpong/AAAI21-WSDHQ.
+
+ comment: In proceedings of AAAI 2021. Code and data are available +
+
+
+
+
+ + ☆ Fantastic Animals and Where to Find Them: Segment Any Marine Animal with + Dual SAM CVPR2024 + + +
+ As an important pillar of underwater intelligence, Marine Animal Segmentation +(MAS) involves segmenting animals within marine environments. Previous methods +don't excel in extracting long-range contextual features and overlook the +connectivity between discrete pixels. Recently, Segment Anything Model (SAM) +offers a universal framework for general segmentation tasks. Unfortunately, +trained with natural images, SAM does not obtain the prior knowledge from +marine images. In addition, the single-position prompt of SAM is very +insufficient for prior guidance. To address these issues, we propose a novel +feature learning framework, named Dual-SAM for high-performance MAS. To this +end, we first introduce a dual structure with SAM's paradigm to enhance feature +learning of marine images. Then, we propose a Multi-level Coupled Prompt (MCP) +strategy to instruct comprehensive underwater prior information, and enhance +the multi-level features of SAM's encoder with adapters. Subsequently, we +design a Dilated Fusion Attention Module (DFAM) to progressively integrate +multi-level features from SAM's encoder. Finally, instead of directly +predicting the masks of marine animals, we propose a Criss-Cross Connectivity +Prediction (C$^3$P) paradigm to capture the inter-connectivity between discrete +pixels. With dual decoders, it generates pseudo-labels and achieves mutual +supervision for complementary feature representations, resulting in +considerable improvements over previous techniques. Extensive experiments +verify that our proposed method achieves state-of-the-art performances on five +widely-used MAS datasets. The code is available at +https://github.com/Drchip61/Dual_SAM. + +
+
+ comment: Accepted by CVPR2024 as Poster(Highlight) +
+
+
+
+
+ + ☆ Efficient Surgical Tool Recognition via HMM-Stabilized Deep Learning + + +
+ Recognizing various surgical tools, actions and phases from surgery videos is
+an important problem in computer vision with exciting clinical applications.
+Existing deep-learning-based methods for this problem either process each
+surgical video as a series of independent images without considering their
+dependence, or rely on complicated deep learning models to account for the
+dependence between video frames. In this study, exploratory data analysis
+revealed that surgical videos have a relatively simple semantic structure, in
+which the presence of surgical phases and tools can be well modeled by a
+compact hidden Markov model (HMM). Based on this observation, we propose an
+HMM-stabilized deep learning method for tool presence detection. A wide range
+of experiments confirm that the proposed approaches achieve better performance
+with lower training and running costs, and support more flexible ways to
+construct and utilize training data in scenarios where not all surgery videos
+of interest are extensively labelled. These results suggest that popular deep
+learning approaches with over-complicated model structures may suffer from
+inefficient utilization of data, and that wisely integrating ingredients of
+deep learning and statistical learning may lead to more powerful algorithms
+that enjoy competitive performance, transparent interpretation and convenient
+model training simultaneously.
+
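+ A small forward-backward sketch showing how per-frame network posteriors can
+be smoothed by a compact HMM. The two-state setup and transition matrix are
+placeholders; the paper couples the HMM with the deep model rather than using
+it only as post-processing.
+
+import numpy as np
+
+def forward_backward(frame_probs, trans, prior):
+    # frame_probs: (T, S) per-frame posteriors used as emission scores,
+    # trans: (S, S) transition matrix, prior: (S,) initial distribution.
+    T, S = frame_probs.shape
+    alpha = np.zeros((T, S))
+    beta = np.zeros((T, S))
+    alpha[0] = prior * frame_probs[0]
+    alpha[0] /= alpha[0].sum()
+    for t in range(1, T):
+        alpha[t] = frame_probs[t] * (alpha[t - 1] @ trans)
+        alpha[t] /= alpha[t].sum()           # normalize for numerical stability
+    beta[-1] = 1.0
+    for t in range(T - 2, -1, -1):
+        beta[t] = trans @ (frame_probs[t + 1] * beta[t + 1])
+        beta[t] /= beta[t].sum()
+    gamma = alpha * beta
+    return gamma / gamma.sum(axis=1, keepdims=True)   # smoothed state posteriors
+
+# Two-state example (tool absent / present) with sticky transitions.
+trans = np.array([[0.95, 0.05], [0.05, 0.95]])
+probs = np.array([[0.9, 0.1], [0.2, 0.8], [0.85, 0.15], [0.1, 0.9]])
+smoothed = forward_backward(probs, trans, prior=np.array([0.5, 0.5]))
+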
+
+
+
+
+ + ☆ Dynamic Distinction Learning: Adaptive Pseudo Anomalies for Video + Anomaly Detection CVPR2024 + + +
+ We introduce Dynamic Distinction Learning (DDL) for Video Anomaly Detection, +a novel video anomaly detection methodology that combines pseudo-anomalies, +dynamic anomaly weighting, and a distinction loss function to improve detection +accuracy. By training on pseudo-anomalies, our approach adapts to the +variability of normal and anomalous behaviors without fixed anomaly thresholds. +Our model showcases superior performance on the Ped2, Avenue and ShanghaiTech +datasets, where individual models are tailored for each scene. These +achievements highlight DDL's effectiveness in advancing anomaly detection, +offering a scalable and adaptable solution for video surveillance challenges. + +
+
+ comment: To be published in the CVPR2024 Workshop +
+
+
+
+
+ + ☆ Primary liver cancer classification from routine tumour biopsy using + weakly supervised deep learning + + +
+ The diagnosis of primary liver cancers (PLCs) can be challenging, especially +on biopsies and for combined hepatocellular-cholangiocarcinoma (cHCC-CCA). We +automatically classified PLCs on routine-stained biopsies using a weakly +supervised learning method. Weak tumour/non-tumour annotations served as labels +for training a Resnet18 neural network, and the network's last convolutional +layer was used to extract new tumour tile features. Without knowledge of the +precise labels of the malignancies, we then applied an unsupervised clustering +algorithm. Our model identified specific features of hepatocellular carcinoma +(HCC) and intrahepatic cholangiocarcinoma (iCCA). Despite no specific features +of cHCC-CCA being recognized, the identification of HCC and iCCA tiles within a +slide could facilitate the diagnosis of primary liver cancers, particularly +cHCC-CCA. + Method and results: 166 PLC biopsies were divided into training, internal and +external validation sets: 90, 29 and 47 samples. Two liver pathologists +reviewed each whole-slide hematein eosin saffron (HES)-stained image (WSI). +After annotating the tumour/non-tumour areas, 256x256 pixel tiles were +extracted from the WSIs and used to train a ResNet18. The network was used to +extract new tile features. An unsupervised clustering algorithm was then +applied to the new tile features. In a two-cluster model, Clusters 0 and 1 +contained mainly HCC and iCCA histological features. The diagnostic agreement +between the pathological diagnosis and the model predictions in the internal +and external validation sets was 100% (11/11) and 96% (25/26) for HCC and 78% +(7/9) and 87% (13/15) for iCCA, respectively. For cHCC-CCA, we observed a +highly variable proportion of tiles from each cluster (Cluster 0: 5-97%; +Cluster 1: 2-94%). + +
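+ A hedged sketch of the tile pipeline described above: features are taken from
+a truncated ResNet18 and clustered without lesion-type labels. The weights,
+transforms, and tile_paths list are placeholders; in the study the network is
+first trained on weak tumour/non-tumour annotations before feature extraction.
+
+import torch
+import torchvision.models as models
+import torchvision.transforms as T
+from sklearn.cluster import KMeans
+from PIL import Image
+
+resnet = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
+feature_extractor = torch.nn.Sequential(*list(resnet.children())[:-1])  # drop fc
+feature_extractor.eval()
+
+preprocess = T.Compose([T.Resize((256, 256)), T.ToTensor()])
+
+@torch.no_grad()
+def tile_features(paths):
+    feats = []
+    for p in paths:
+        x = preprocess(Image.open(p).convert("RGB")).unsqueeze(0)
+        feats.append(feature_extractor(x).flatten(1))   # (1, 512) per tile
+    return torch.cat(feats).numpy()
+
+# tile_paths is a hypothetical list of extracted 256x256 tumour tiles.
+# features = tile_features(tile_paths)
+# clusters = KMeans(n_clusters=2, n_init=10).fit_predict(features)
+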
+
+ comment: https://www.sciencedirect.com/science/article/pii/S2589555924000090 +
+
+
+
+
+ + ☆ FPL+: Filtered Pseudo Label-based Unsupervised Cross-Modality Adaptation + for 3D Medical Image Segmentation + + +
+ Adapting a medical image segmentation model to a new domain is important for +improving its cross-domain transferability, and due to the expensive annotation +process, Unsupervised Domain Adaptation (UDA) is appealing where only unlabeled +images are needed for the adaptation. Existing UDA methods are mainly based on +image or feature alignment with adversarial training for regularization, and +they are limited by insufficient supervision in the target domain. In this +paper, we propose an enhanced Filtered Pseudo Label (FPL+)-based UDA method for +3D medical image segmentation. It first uses cross-domain data augmentation to +translate labeled images in the source domain to a dual-domain training set +consisting of a pseudo source-domain set and a pseudo target-domain set. To +leverage the dual-domain augmented images to train a pseudo label generator, +domain-specific batch normalization layers are used to deal with the domain +shift while learning the domain-invariant structure features, generating +high-quality pseudo labels for target-domain images. We then combine labeled +source-domain images and target-domain images with pseudo labels to train a +final segmentor, where image-level weighting based on uncertainty estimation +and pixel-level weighting based on dual-domain consensus are proposed to +mitigate the adverse effect of noisy pseudo labels. Experiments on three public +multi-modal datasets for Vestibular Schwannoma, brain tumor and whole heart +segmentation show that our method surpassed ten state-of-the-art UDA methods, +and it even achieved better results than fully supervised learning in the +target domain in some cases. + +
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ☆ PairAug: What Can Augmented Image-Text Pairs Do for Radiology? CVPR2024 + + +
+ Current vision-language pre-training (VLP) methodologies predominantly depend +on paired image-text datasets, a resource that is challenging to acquire in +radiology due to privacy considerations and labelling complexities. Data +augmentation provides a practical solution to overcome the issue of data +scarcity, however, most augmentation methods exhibit a limited focus, +prioritising either image or text augmentation exclusively. Acknowledging this +limitation, our objective is to devise a framework capable of concurrently +augmenting medical image and text data. We design a Pairwise Augmentation +(PairAug) approach that contains an Inter-patient Augmentation (InterAug) +branch and an Intra-patient Augmentation (IntraAug) branch. Specifically, the +InterAug branch of our approach generates radiology images using synthesised +yet plausible reports derived from a Large Language Model (LLM). The generated +pairs can be considered a collection of new patient cases since they are +artificially created and may not exist in the original dataset. In contrast, +the IntraAug branch uses newly generated reports to manipulate images. This +process allows us to create new paired data for each individual with diverse +medical conditions. Our extensive experiments on various downstream tasks +covering medical image classification zero-shot and fine-tuning analysis +demonstrate that our PairAug, concurrently expanding both image and text data, +substantially outperforms image-/text-only expansion baselines and advanced +medical VLP baselines. Our code is released at +\url{https://github.com/YtongXie/PairAug}. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ Gaussian Shading: Provable Performance-Lossless Image Watermarking for + Diffusion Models CVPR 2024 + + +
+ Ethical concerns surrounding copyright protection and inappropriate content +generation pose challenges for the practical implementation of diffusion +models. One effective solution involves watermarking the generated images. +However, existing methods often compromise the model performance or require +additional training, which is undesirable for operators and users. To address +this issue, we propose Gaussian Shading, a diffusion model watermarking +technique that is both performance-lossless and training-free, while serving +the dual purpose of copyright protection and tracing of offending content. Our +watermark embedding is free of model parameter modifications and thus is +plug-and-play. We map the watermark to latent representations following a +standard Gaussian distribution, which is indistinguishable from latent +representations obtained from the non-watermarked diffusion model. Therefore we +can achieve watermark embedding with lossless performance, for which we also +provide theoretical proof. Furthermore, since the watermark is intricately +linked with image semantics, it exhibits resilience to lossy processing and +erasure attempts. The watermark can be extracted by Denoising Diffusion +Implicit Models (DDIM) inversion and inverse sampling. We evaluate Gaussian +Shading on multiple versions of Stable Diffusion, and the results demonstrate +that Gaussian Shading not only is performance-lossless but also outperforms +existing methods in terms of robustness. + +
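+ A rough sketch of the distribution-preserving idea behind the latent mapping:
+each watermark bit selects one half of the standard Gaussian and a latent is
+drawn from that half by inverse-CDF sampling, so with (pseudo-)random bits the
+latents remain N(0, 1)-distributed. This omits Gaussian Shading's watermark
+encryption and the DDIM-inversion extraction; function names are assumptions.
+
+import numpy as np
+from scipy.stats import norm
+
+def bits_to_latents(bits, rng):
+    bits = np.asarray(bits)
+    # Uniform sample inside the half of the CDF interval chosen by the bit.
+    u = bits * 0.5 + rng.uniform(1e-6, 0.5, size=bits.shape)
+    return norm.ppf(u)                                   # inverse CDF -> Gaussian latent
+
+def latents_to_bits(latents):
+    return (np.asarray(latents) > 0).astype(int)         # the sign recovers the bit
+
+rng = np.random.default_rng(0)
+bits = rng.integers(0, 2, size=16)
+z = bits_to_latents(bits, rng)
+assert (latents_to_bits(z) == bits).all()
+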
+
+ comment: 17 pages, 11 figures, accepted by CVPR 2024 +
+
+
+
+
+ + ☆ High-Discriminative Attribute Feature Learning for Generalized Zero-Shot + Learning + + +
+ Zero-shot learning (ZSL) aims to recognize new classes without prior exposure
+to their samples, relying on semantic knowledge from observed classes. However,
+current attention-based models may overlook the transferability of visual
+features and the distinctiveness of attribute localization when learning
+regional features in images. Additionally, they often overlook shared
+attributes among different objects. Highly discriminative attribute features
+are crucial for identifying and distinguishing unseen classes. To address these
+issues, we propose an innovative approach called High-Discriminative Attribute
+Feature Learning for Generalized Zero-Shot Learning (HDAFL). HDAFL optimizes
+visual features by learning attribute features to obtain discriminative visual
+embeddings. Specifically, HDAFL utilizes multiple convolutional kernels to
+automatically learn discriminative regions highly correlated with attributes in
+images, eliminating irrelevant interference in image features. Furthermore, we
+introduce a Transformer-based attribute discrimination encoder to enhance the
+discriminative capability among attributes. Simultaneously, the method employs
+a contrastive loss to alleviate dataset biases and enhance the transferability
+of visual features, facilitating better semantic transfer between seen and
+unseen classes. Experimental results demonstrate the effectiveness of HDAFL
+across three widely used datasets.
+
+
+
+
+
+ + ☆ AnimateZoo: Zero-shot Video Generation of Cross-Species Animation via + Subject Alignment + + +
+ Recent video editing advancements rely on accurate pose sequences to animate
+subjects. However, these efforts are not suitable for cross-species animation
+due to pose misalignment between species (for example, the pose of a cat
+differs greatly from that of a pig due to differences in body structure). In
+this paper, we present AnimateZoo, a zero-shot diffusion-based video generator
+to address this challenging cross-species animation issue, aiming to accurately
+produce animal animations while preserving the background. The key technique
+used in our AnimateZoo is subject alignment, which includes two steps. First,
+we improve appearance feature extraction by integrating a Laplacian detail
+booster and a prompt-tuning identity extractor. These components are
+specifically designed to capture essential appearance information, including
+identity and fine details. Second, we align shape features and address
+conflicts from differing subjects by introducing a scale-information remover.
+This ensures accurate cross-species animation. Moreover, we introduce two
+high-quality animal video datasets featuring a wide variety of species. Trained
+on these extensive datasets, our model is capable of generating videos
+characterized by accurate movements, consistent appearance, and high-fidelity
+frames, without the need for the pre-inference fine-tuning that prior arts
+required. Extensive experiments showcase the outstanding performance of our
+method in cross-species action following tasks, demonstrating exceptional shape
+adaptation capability. The project page is available at
+https://justinxu0.github.io/AnimateZoo/.
+
+
+ comment: Technical report, 15 pages +
+
+
+
+
+ + ☆ Bootstrapping Chest CT Image Understanding by Distilling Knowledge from + X-ray Expert Models CVPR 2024 + + +
+ Radiologists highly desire fully automated versatile AI for medical imaging +interpretation. However, the lack of extensively annotated large-scale +multi-disease datasets has hindered the achievement of this goal. In this +paper, we explore the feasibility of leveraging language as a naturally +high-quality supervision for chest CT imaging. In light of the limited +availability of image-report pairs, we bootstrap the understanding of 3D chest +CT images by distilling chest-related diagnostic knowledge from an extensively +pre-trained 2D X-ray expert model. Specifically, we propose a language-guided +retrieval method to match each 3D CT image with its semantically closest 2D +X-ray image, and perform pair-wise and semantic relation knowledge +distillation. Subsequently, we use contrastive learning to align images and +reports within the same patient while distinguishing them from the other +patients. However, the challenge arises when patients have similar semantic +diagnoses, such as healthy patients, potentially confusing if treated as +negatives. We introduce a robust contrastive learning that identifies and +corrects these false negatives. We train our model with over 12,000 pairs of +chest CT images and radiology reports. Extensive experiments across multiple +scenarios, including zero-shot learning, report generation, and fine-tuning +processes, demonstrate the model's feasibility in interpreting chest CT images. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Anomaly Detection in Electrocardiograms: Advancing Clinical Diagnosis + Through Self-Supervised Learning + + +
+ The electrocardiogram (ECG) is an essential tool for diagnosing heart +disease, with computer-aided systems improving diagnostic accuracy and reducing +healthcare costs. Despite advancements, existing systems often miss rare +cardiac anomalies that could be precursors to serious, life-threatening issues +or alterations in the cardiac macro/microstructure. We address this gap by +focusing on self-supervised anomaly detection (AD), training exclusively on +normal ECGs to recognize deviations indicating anomalies. We introduce a novel +self-supervised learning framework for ECG AD, utilizing a vast dataset of +normal ECGs to autonomously detect and localize cardiac anomalies. It proposes +a novel masking and restoration technique alongside a multi-scale +cross-attention module, enhancing the model's ability to integrate global and +local signal features. The framework emphasizes accurate localization of +anomalies within ECG signals, ensuring the method's clinical relevance and +reliability. To reduce the impact of individual variability, the approach +further incorporates crucial patient-specific information from ECG reports, +such as age and gender, thus enabling accurate identification of a broad +spectrum of cardiac anomalies, including rare ones. Utilizing an extensive +dataset of 478,803 ECG graphic reports from real-world clinical practice, our +method has demonstrated exceptional effectiveness in AD across all tested +conditions, regardless of their frequency of occurrence, significantly +outperforming existing models. It achieved superior performance metrics, +including an AUROC of 91.2%, an F1 score of 83.7%, a sensitivity rate of 84.2%, +a specificity of 83.0%, and a precision of 75.6% with a fixed recall rate of +90%. It has also demonstrated robust localization capabilities, with an AUROC +of 76.5% and a Dice coefficient of 65.3% for anomaly localization. + +
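+ A generic masked-restoration scoring sketch for 1D signals: a region is
+masked, restored, and scored by the restoration error inside the mask. The tiny
+autoencoder is only a stand-in for the paper's multi-scale cross-attention
+network trained on normal ECGs, and all shapes are illustrative.
+
+import torch
+import torch.nn as nn
+
+class TinyAE(nn.Module):
+    def __init__(self, length=256):
+        super().__init__()
+        self.enc = nn.Sequential(nn.Linear(length, 64), nn.ReLU(), nn.Linear(64, 16))
+        self.dec = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, length))
+
+    def forward(self, x):
+        return self.dec(self.enc(x))
+
+def anomaly_score(model, signal, mask_ratio=0.25):
+    # Zero out a random contiguous chunk, restore it, and score by the
+    # restoration error inside the masked region (higher = more anomalous).
+    x = signal.clone()
+    L = x.shape[-1]
+    m = int(L * mask_ratio)
+    start = torch.randint(0, L - m, (1,)).item()
+    x[..., start:start + m] = 0.0
+    with torch.no_grad():
+        recon = model(x)
+    err = (recon[..., start:start + m] - signal[..., start:start + m]) ** 2
+    return err.mean(dim=-1)        # per-sample score; the masked region also localizes it
+
+model = TinyAE()
+scores = anomaly_score(model, torch.randn(4, 256))
+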
+
+
+
+
+ + ☆ UniMD: Towards Unifying Moment Retrieval and Temporal Action Detection + + +
+ Temporal Action Detection (TAD) focuses on detecting pre-defined actions,
+while Moment Retrieval (MR) aims to identify the events described by open-ended
+natural language within untrimmed videos. Although they focus on different
+events, we observe that they have a significant connection. For instance, most
+descriptions in MR involve multiple actions from TAD. In this paper, we aim to
+investigate the potential synergy between TAD and MR. Firstly, we propose a
+unified architecture, termed Unified Moment Detection (UniMD), for both TAD and
+MR. It transforms the inputs of the two tasks, namely actions for TAD or events
+for MR, into a common embedding space, and utilizes two novel query-dependent
+decoders to generate a uniform output of classification score and temporal
+segments. Secondly, we explore the efficacy of two task fusion learning
+approaches, pre-training and co-training, in order to enhance the mutual
+benefits between TAD and MR. Extensive experiments demonstrate that the
+proposed task fusion learning scheme enables the two tasks to help each other
+and outperform the separately trained counterparts. Impressively, UniMD
+achieves state-of-the-art results on three paired datasets: Ego4D,
+Charades-STA, and ActivityNet. Our code will be released at
+https://github.com/yingsen1/UniMD.
+
+
+ comment: Tech report +
+
+
+
+
+ + ☆ GvT: A Graph-based Vision Transformer with Talking-Heads Utilizing + Sparsity, Trained from Scratch on Small Datasets + + +
+ Vision Transformers (ViTs) have achieved impressive results in large-scale +image classification. However, when training from scratch on small datasets, +there is still a significant performance gap between ViTs and Convolutional +Neural Networks (CNNs), which is attributed to the lack of inductive bias. To +address this issue, we propose a Graph-based Vision Transformer (GvT) that +utilizes graph convolutional projection and graph-pooling. In each block, +queries and keys are calculated through graph convolutional projection based on +the spatial adjacency matrix, while dot-product attention is used in another +graph convolution to generate values. When using more attention heads, the +queries and keys become lower-dimensional, making their dot product an +uninformative matching function. To overcome this low-rank bottleneck in +attention heads, we employ talking-heads technology based on bilinear pooled +features and sparse selection of attention tensors. This allows interaction +among filtered attention scores and enables each attention mechanism to depend +on all queries and keys. Additionally, we apply graph-pooling between two +intermediate blocks to reduce the number of tokens and aggregate semantic +information more effectively. Our experimental results show that GvT produces +comparable or superior outcomes to deep convolutional networks and surpasses +vision transformers without pre-training on large datasets. The code for our +proposed model is publicly available on the website. + +
+
+
+
+
+ + ☆ Efficient Learnable Collaborative Attention for Single Image + Super-Resolution + + +
+ Non-Local Attention (NLA) is a powerful technique for capturing long-range +feature correlations in deep single image super-resolution (SR). However, NLA +suffers from high computational complexity and memory consumption, as it +requires aggregating all non-local feature information for each query response +and recalculating the similarity weight distribution for different abstraction +levels of features. To address these challenges, we propose a novel Learnable +Collaborative Attention (LCoA) that introduces inductive bias into non-local +modeling. Our LCoA consists of two components: Learnable Sparse Pattern (LSP) +and Collaborative Attention (CoA). LSP uses the k-means clustering algorithm to +dynamically adjust the sparse attention pattern of deep features, which reduces +the number of non-local modeling rounds compared with existing sparse +solutions. CoA leverages the sparse attention pattern and weights learned by +LSP, and co-optimizes the similarity matrix across different abstraction +levels, which avoids redundant similarity matrix calculations. The experimental +results show that our LCoA can reduce the non-local modeling time by about 83% +in the inference stage. In addition, we integrate our LCoA into a deep +Learnable Collaborative Attention Network (LCoAN), which achieves competitive +performance in terms of inference time, memory consumption, and reconstruction +quality compared with other state-of-the-art SR methods. + +
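+ A sketch of one way a k-means-induced sparse attention pattern can work:
+tokens are clustered by their key features and attention is computed only
+within each cluster. This is a generic illustration rather than the LCoA/LSP
+implementation; shapes and the cluster count are assumptions.
+
+import torch
+from sklearn.cluster import KMeans
+
+def clustered_attention(q, k, v, n_clusters=4):
+    # q, k, v: (N, D) token features for one attention head.
+    labels = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(k.detach().numpy())
+    out = torch.zeros_like(v)
+    for c in range(n_clusters):
+        idx = torch.tensor((labels == c).nonzero()[0])
+        attn = torch.softmax(q[idx] @ k[idx].T / k.shape[-1] ** 0.5, dim=-1)
+        out[idx] = attn @ v[idx]            # attend only inside the cluster
+    return out
+
+out = clustered_attention(torch.randn(64, 32), torch.randn(64, 32), torch.randn(64, 32))
+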
+
+
+
+
+ + ☆ Correcting Diffusion-Based Perceptual Image Compression with Privileged + End-to-End Decoder + + +
+ The images produced by diffusion models can attain excellent perceptual
+quality. However, it is challenging for diffusion models to guarantee low
+distortion, so the integration of diffusion models and image compression models
+still needs more comprehensive exploration. This paper presents a
+diffusion-based image compression method that employs a privileged end-to-end
+decoder model as correction, which achieves better perceptual quality while
+keeping distortion bounded to an extent. We build a diffusion model and design
+a novel paradigm that combines the diffusion model and an end-to-end decoder,
+and the latter is responsible for transmitting the privileged information
+extracted at the encoder side. Specifically, we theoretically analyze the
+reconstruction process of the diffusion models at the encoder side with the
+original images being visible. Based on the analysis, we introduce an
+end-to-end convolutional decoder to provide a better approximation of the score
+function $\nabla_{\mathbf{x}_t}\log p(\mathbf{x}_t)$ at the encoder side and
+effectively transmit the combination. Experiments demonstrate the superiority
+of our method in both distortion and perception compared with previous
+perceptual compression methods.
+
+
+
+
+
+ + ☆ CodecNeRF: Toward Fast Encoding and Decoding, Compact, and High-quality + Novel-view Synthesis + + +
+ Neural Radiance Fields (NeRF) have achieved huge success in effectively +capturing and representing 3D objects and scenes. However, several factors have +impeded its further proliferation as next-generation 3D media. To establish a +ubiquitous presence in everyday media formats, such as images and videos, it is +imperative to devise a solution that effectively fulfills three key objectives: +fast encoding and decoding time, compact model sizes, and high-quality +renderings. Despite significant advancements, a comprehensive algorithm that +adequately addresses all objectives has yet to be fully realized. In this work, +we present CodecNeRF, a neural codec for NeRF representations, consisting of a +novel encoder and decoder architecture that can generate a NeRF representation +in a single forward pass. Furthermore, inspired by the recent +parameter-efficient finetuning approaches, we develop a novel finetuning method +to efficiently adapt the generated NeRF representations to a new test instance, +leading to high-quality image renderings and compact code sizes. The proposed +CodecNeRF, a newly suggested encoding-decoding-finetuning pipeline for NeRF, +achieved unprecedented compression performance of more than 150x and 20x +reduction in encoding time while maintaining (or improving) the image quality +on widely used 3D object datasets, such as ShapeNet and Objaverse. + +
+
+ comment: 34 pages, 22 figures, Project page: + https://gynjn.github.io/Codec-NeRF/ +
+
+
+
+
+ + ☆ MonoTAKD: Teaching Assistant Knowledge Distillation for Monocular 3D + Object Detection + + +
+ Monocular 3D object detection (Mono3D) is an indispensable research topic in +autonomous driving, thanks to the cost-effective monocular camera sensors and +its wide range of applications. Since the image perspective has depth +ambiguity, the challenges of Mono3D lie in understanding 3D scene geometry and +reconstructing 3D object information from a single image. Previous methods +attempted to transfer 3D information directly from the LiDAR-based teacher to +the camera-based student. However, a considerable gap in feature representation +makes direct cross-modal distillation inefficient, resulting in a significant +performance deterioration between the LiDAR-based teacher and the camera-based +student. To address this issue, we propose the Teaching Assistant Knowledge +Distillation (MonoTAKD) to break down the learning objective by integrating +intra-modal distillation with cross-modal residual distillation. In particular, +we employ a strong camera-based teaching assistant model to distill powerful +visual knowledge effectively through intra-modal distillation. Subsequently, we +introduce the cross-modal residual distillation to transfer the 3D spatial +cues. By acquiring both visual knowledge and 3D spatial cues, the predictions +of our approach are rigorously evaluated on the KITTI 3D object detection +benchmark and achieve state-of-the-art performance in Mono3D. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Dual-Camera Smooth Zoom on Mobile Phones + + +
+ When zooming between the dual cameras on a mobile phone, noticeable jumps in
+geometric content and image color occur in the preview, inevitably affecting
+the user's zoom experience. In this work, we introduce a new task, i.e.,
+dual-camera smooth zoom (DCSZ), to achieve a smooth zoom preview. The frame
+interpolation (FI) technique is a potential solution but struggles with
+ground-truth collection. To address the issue, we suggest a data factory
+solution where continuous virtual cameras are assembled to generate DCSZ data
+by rendering reconstructed 3D models of the scene. In particular, we propose a
+novel dual-camera smooth zoom Gaussian Splatting (ZoomGS), where a
+camera-specific encoding is introduced to construct a specific 3D model for
+each virtual camera. With the proposed data factory, we construct a synthetic
+dataset for DCSZ, and we utilize it to fine-tune FI models. In addition, we
+collect real-world dual-zoom images without ground-truth for evaluation.
+Extensive experiments are conducted with multiple FI methods. The results show
+that the fine-tuned FI models achieve a significant performance improvement
+over the original ones on the DCSZ task. The datasets, codes, and pre-trained
+models will be publicly available.
+
+
+ comment: 24 +
+
+
+
+
+ + ☆ DL-EWF: Deep Learning Empowering Women's Fashion with + Grounded-Segment-Anything Segmentation for Body Shape Classification + + +
+ The global fashion industry plays a pivotal role in the global economy, and
+addressing fundamental issues within the industry is crucial for developing
+innovative solutions. One of the most pressing challenges in the fashion
+industry is the mismatch between individuals' body shapes and the garments they
+purchase. This issue is particularly prevalent among individuals with non-ideal
+body shapes, exacerbating the challenges they face. Considering
+inter-individual variability in body shapes is essential for designing and
+producing garments that are widely accepted by consumers. Traditional methods
+for determining human body shape are limited due to their low accuracy, high
+costs, and time-consuming nature. New approaches, utilizing digital imaging and
+deep neural networks (DNN), have been introduced to identify human body shape.
+In this study, the Style4BodyShape dataset is used for classifying body shapes
+into five categories: Rectangle, Triangle, Inverted Triangle, Hourglass, and
+Apple. In this paper, the body shape segmentation of a person is first
+extracted from the image, disregarding the surroundings and background. Then,
+various pre-trained models, such as ResNet18, ResNet34, ResNet50, VGG16, VGG19,
+and Inception v3, are used to classify the segmentation results. Among these
+pre-trained models, the Inception v3 model demonstrates superior performance in
+terms of the F1-score and accuracy compared to the other models.
+
+
+
+
+
+ + ☆ A Unified Diffusion Framework for Scene-aware Human Motion Estimation + from Sparse Signals + + +
+ Estimating full-body human motion via sparse tracking signals from
+head-mounted displays and hand controllers in 3D scenes is crucial to
+applications in AR/VR. One of the biggest challenges to this task is the
+one-to-many mapping from sparse observations to dense full-body motions, which
+entails inherent ambiguities. To help resolve this ambiguity, we
+introduce a new framework to combine rich contextual information provided by
+scenes to benefit full-body motion tracking from sparse observations. To
+estimate plausible human motions given sparse tracking signals and 3D scenes,
+we develop $\text{S}^2$Fusion, a unified framework fusing \underline{S}cene and
+sparse \underline{S}ignals with a conditional dif\underline{Fusion} model.
+$\text{S}^2$Fusion first extracts the spatial-temporal relations residing in
+the sparse signals via a periodic autoencoder, and then produces time-aligned
+feature embeddings as additional inputs. Subsequently, by drawing initial noisy
+motion from a pre-trained prior, $\text{S}^2$Fusion utilizes conditional
+diffusion to fuse scene geometry and sparse tracking signals to generate
+full-body scene-aware motions. The sampling procedure of $\text{S}^2$Fusion is
+further guided by a specially designed scene-penetration loss and
+phase-matching loss, which effectively regularize the motion of the lower body
+even in the absence of any tracking signals, making the generated motion much
+more plausible and coherent. Extensive experimental results have demonstrated
+that our $\text{S}^2$Fusion outperforms the state-of-the-art in terms of
+estimation quality and smoothness.
+
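+ A minimal sketch of loss-guided diffusion sampling, the generic mechanism
+behind guidance terms such as a scene-penetration loss: each step nudges the
+sample down the gradient of an auxiliary penalty. The denoiser, penalty, and
+step size below are placeholders, not the $\text{S}^2$Fusion implementation.
+
+import torch
+
+def guidance_step(x_t, denoise_fn, penalty_fn, step_size=0.1):
+    # One guided update: denoise, then push the sample down the penalty's
+    # gradient (e.g. a cost for joints penetrating the scene) before continuing.
+    x_t = x_t.detach().requires_grad_(True)
+    x0_hat = denoise_fn(x_t)                   # model's clean-motion estimate
+    loss = penalty_fn(x0_hat)
+    grad, = torch.autograd.grad(loss, x_t)
+    return (x_t - step_size * grad).detach()
+
+# Toy usage with placeholder functions: penalize joints whose height is below 0.
+x = torch.randn(1, 22, 3)                      # hypothetical joint positions
+x = guidance_step(x, denoise_fn=lambda z: z,
+                  penalty_fn=lambda m: (m[..., 2].clamp(max=0.0) ** 2).sum())
+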
+
+
+
+
+ + ☆ A Clinical-oriented Multi-level Contrastive Learning Method for Disease + Diagnosis in Low-quality Medical Images + + +
+ Representation learning offers a conduit to elucidate distinctive features +within the latent space and interpret the deep models. However, the randomness +of lesion distribution and the complexity of low-quality factors in medical +images pose great challenges for models to extract key lesion features. Disease +diagnosis methods guided by contrastive learning (CL) have shown significant +advantages in lesion feature representation. Nevertheless, the effectiveness of +CL is highly dependent on the quality of the positive and negative sample +pairs. In this work, we propose a clinical-oriented multi-level CL framework +that aims to enhance the model's capacity to extract lesion features and +discriminate between lesion and low-quality factors, thereby enabling more +accurate disease diagnosis from low-quality medical images. Specifically, we +first construct multi-level positive and negative pairs to enhance the model's +comprehensive recognition capability of lesion features by integrating +information from different levels and qualities of medical images. Moreover, to +improve the quality of the learned lesion embeddings, we introduce a dynamic +hard sample mining method based on self-paced learning. The proposed CL +framework is validated on two public medical image datasets, EyeQ and Chest +X-ray, demonstrating superior performance compared to other state-of-the-art +disease diagnostic methods. + +
+
+
+
+
+ + ☆ LRNet: Change detection of high-resolution remote sensing imagery via + strategy of localization-then-refinement + + +
+ Change detection, as a research hotspot in the field of remote sensing, has
+witnessed continuous development and progress. However, the discrimination of
+boundary details remains a significant bottleneck due to the complexity of
+surrounding elements between change areas and backgrounds. Boundary
+misalignment occurs when discriminating large change areas, while boundary
+adhesion occurs for small change targets. To address the above issues, a
+novel network based on the localization-then-refinement strategy is proposed in
+this paper, namely LRNet. LRNet consists of two stages: localization and
+refinement. In the localization stage, a three-branch encoder simultaneously
+extracts original image features and their differential features for
+interactive localization of the position of each change area. To minimize
+information loss during feature extraction, learnable optimal pooling (LOP) is
+proposed to replace the widely used max-pooling. Additionally, this process is
+trainable and contributes to the overall optimization of the network. To
+effectively interact features from different branches and accurately locate
+change areas of various sizes, change alignment attention (C2A) and a
+hierarchical change alignment module (HCA) are proposed. In the refinement
+stage, the localization results from the localization stage are corrected by
+constraining the change areas and change edges through the edge-area alignment
+module (E2A). Subsequently, the decoder, combined with the difference features
+strengthened by C2A in the localization phase, refines change areas of
+different sizes, ultimately achieving accurate boundary discrimination of
+change areas. The proposed LRNet outperforms 13 other state-of-the-art methods
+in terms of comprehensive evaluation metrics and provides the most precise
+boundary discrimination results on the LEVIR-CD and WHU-CD datasets.
+
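+ A hedged stand-in for a learnable pooling layer of the kind that replaces
+max-pooling: a softmax-weighted 2x2 pool whose learnable temperature
+interpolates between average pooling (small temperature) and max pooling (large
+temperature). The abstract does not specify the exact LOP formulation, so this
+is only one plausible realization.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class SoftPool2d(nn.Module):
+    def __init__(self, kernel_size=2):
+        super().__init__()
+        self.k = kernel_size
+        self.temperature = nn.Parameter(torch.tensor(1.0))   # learned end-to-end
+
+    def forward(self, x):
+        # Unfold into non-overlapping windows: (B, C, k*k, num_windows).
+        patches = F.unfold(x, self.k, stride=self.k)
+        B, CK, L = patches.shape
+        C = x.shape[1]
+        patches = patches.view(B, C, self.k * self.k, L)
+        weights = torch.softmax(self.temperature * patches, dim=2)
+        pooled = (weights * patches).sum(dim=2)               # (B, C, L)
+        H, W = x.shape[2] // self.k, x.shape[3] // self.k
+        return pooled.view(B, C, H, W)
+
+pool = SoftPool2d()
+y = pool(torch.randn(1, 8, 32, 32))    # -> (1, 8, 16, 16)
+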
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ☆ Mixture of Low-rank Experts for Transferable AI-Generated Image + Detection + + +
+ Generative models have shown a giant leap in synthesizing photo-realistic +images with minimal expertise, sparking concerns about the authenticity of +online information. This study aims to develop a universal AI-generated image +detector capable of identifying images from diverse sources. Existing methods +struggle to generalize across unseen generative models when provided with +limited sample sources. Inspired by the zero-shot transferability of +pre-trained vision-language models, we seek to harness the nontrivial +visual-world knowledge and descriptive proficiency of CLIP-ViT to generalize +over unknown domains. This paper presents a novel parameter-efficient +fine-tuning approach, mixture of low-rank experts, to fully exploit CLIP-ViT's +potential while preserving knowledge and expanding capacity for transferable +detection. We adapt only the MLP layers of deeper ViT blocks via an integration +of shared and separate LoRAs within an MoE-based structure. Extensive +experiments on public benchmarks show that our method achieves superiority over +state-of-the-art approaches in cross-generator generalization and robustness to +perturbations. Remarkably, our best-performing ViT-L/14 variant requires +training only 0.08% of its parameters to surpass the leading baseline by +3.64% +mAP and +12.72% avg.Acc across unseen diffusion and autoregressive models. This +even outperforms the baseline with just 0.28% of the training data. Our code +and pre-trained models will be available at +https://github.com/zhliuworks/CLIPMoLE. + +
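+ A minimal mixture-of-LoRA-experts adapter around a frozen linear layer,
+illustrating how several low-rank updates can be mixed by a router inside an
+MLP block. The expert count, rank, and routing below are assumptions, not the
+paper's configuration of shared and separate LoRAs.
+
+import torch
+import torch.nn as nn
+
+class MoLoRALinear(nn.Module):
+    def __init__(self, base: nn.Linear, n_experts=4, rank=8):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad_(False)                     # frozen backbone weight
+        d_in, d_out = base.in_features, base.out_features
+        self.A = nn.Parameter(torch.randn(n_experts, d_in, rank) * 0.01)
+        self.B = nn.Parameter(torch.zeros(n_experts, rank, d_out))
+        self.router = nn.Linear(d_in, n_experts)
+
+    def forward(self, x):
+        gates = torch.softmax(self.router(x), dim=-1)   # (..., n_experts)
+        low_rank = torch.einsum("...d,edr,ero->...eo", x, self.A, self.B)
+        delta = (gates.unsqueeze(-1) * low_rank).sum(dim=-2)
+        return self.base(x) + delta                     # frozen output + mixed LoRA update
+
+layer = MoLoRALinear(nn.Linear(768, 3072))
+out = layer(torch.randn(2, 16, 768))   # drop-in replacement for an MLP linear
+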
+
+
+
+
+ + ☆ GauU-Scene V2: Expanse Lidar Image Dataset Shows Unreliable Geometric + Reconstruction Using Gaussian Splatting and NeRF + + +
+ We introduce a novel large-scale scene reconstruction benchmark that utilizes
+newly developed 3D representation approaches, Gaussian Splatting and Neural
+Radiance Fields, on our expansive GauU-Scene V2 dataset. GauU-Scene V2
+encompasses over 6.5 square kilometers and features a comprehensive RGB dataset
+coupled with LiDAR ground truth. This dataset offers a unique blend of urban
+and academic environments for advanced spatial analysis. We also provide
+detailed supplementary information on data collection protocols. Furthermore,
+we present an easy-to-follow pipeline to align the COLMAP sparse point cloud
+with the detailed LiDAR dataset. Our evaluation of U-Scene, which includes a
+detailed analysis across various novel viewpoints using image-based metrics
+such as SSIM, LPIPS, and PSNR, shows contradictory results when applying
+geometric-based metrics such as Chamfer distance. This leads to doubts about
+the reliability of current image-based metrics and geometric extraction methods
+for Gaussian Splatting. We also make the dataset available on the following
+anonymous project page
+
+
+ comment: 8 pages (no references), 6 figures, 4 tables +
+
+
+
+
+ + ☆ CycleINR: Cycle Implicit Neural Representation for Arbitrary-Scale + Volumetric Super-Resolution of Medical Data CVPR + + +
+ In the realm of medical 3D data, such as CT and MRI images, prevalent +anisotropic resolution is characterized by high intra-slice but diminished +inter-slice resolution. The lowered resolution between adjacent slices poses +challenges, hindering optimal viewing experiences and impeding the development +of robust downstream analysis algorithms. Various volumetric super-resolution +algorithms aim to surmount these challenges, enhancing inter-slice resolution +and overall 3D medical imaging quality. However, existing approaches confront +inherent challenges: 1) often tailored to specific upsampling factors, lacking +flexibility for diverse clinical scenarios; 2) newly generated slices +frequently suffer from over-smoothing, degrading fine details, and leading to +inter-slice inconsistency. In response, this study presents CycleINR, a novel +enhanced Implicit Neural Representation model for 3D medical data volumetric +super-resolution. Leveraging the continuity of the learned implicit function, +the CycleINR model can achieve results with arbitrary up-sampling rates, +eliminating the need for separate training. Additionally, we enhance the grid +sampling in CycleINR with a local attention mechanism and mitigate +over-smoothing by integrating cycle-consistent loss. We introduce a new metric, +Slice-wise Noise Level Inconsistency (SNLI), to quantitatively assess +inter-slice noise level inconsistency. The effectiveness of our approach is +demonstrated through image quality evaluations on an in-house dataset and a +downstream task analysis on the Medical Segmentation Decathlon liver tumor +dataset. + +
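+ A bare-bones coordinate-based implicit network for volumetric data: it maps
+continuous (x, y, z) coordinates to intensities, so any inter-slice sampling
+rate can be queried after fitting. It omits CycleINR's local attention and
+cycle-consistency loss, and the fitting loop below is a toy illustration with
+placeholder targets.
+
+import torch
+import torch.nn as nn
+
+class VolumeINR(nn.Module):
+    def __init__(self, hidden=256, layers=4):
+        super().__init__()
+        blocks, d = [], 3
+        for _ in range(layers):
+            blocks += [nn.Linear(d, hidden), nn.ReLU()]
+            d = hidden
+        blocks.append(nn.Linear(hidden, 1))
+        self.net = nn.Sequential(*blocks)
+
+    def forward(self, coords):             # coords in [-1, 1]^3, shape (N, 3)
+        return self.net(coords)
+
+# Fit to voxel centres of the low-resolution volume, then query a denser z-grid
+# for arbitrary-scale inter-slice upsampling.
+model = VolumeINR()
+coords = torch.rand(1024, 3) * 2 - 1
+targets = torch.rand(1024, 1)              # stand-in for observed intensities
+opt = torch.optim.Adam(model.parameters(), lr=1e-4)
+loss = nn.functional.mse_loss(model(coords), targets)
+loss.backward()
+opt.step()
+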
+
+ comment: CVPR accepted paper +
+
+
+
+
+ + ☆ HiLo: Detailed and Robust 3D Clothed Human Reconstruction with High-and + Low-Frequency Information of Parametric Models CVPR 2024 + + +
+ Reconstructing 3D clothed human involves creating a detailed geometry of +individuals in clothing, with applications ranging from virtual try-on, movies, +to games. To enable practical and widespread applications, recent advances +propose to generate a clothed human from an RGB image. However, they struggle +to reconstruct detailed and robust avatars simultaneously. We empirically find +that the high-frequency (HF) and low-frequency (LF) information from a +parametric model has the potential to enhance geometry details and improve +robustness to noise, respectively. Based on this, we propose HiLo, namely +clothed human reconstruction with high- and low-frequency information, which +contains two components. 1) To recover detailed geometry using HF information, +we propose a progressive HF Signed Distance Function to enhance the detailed 3D +geometry of a clothed human. We analyze that our progressive learning manner +alleviates large gradients that hinder model convergence. 2) To achieve robust +reconstruction against inaccurate estimation of the parametric model by using +LF information, we propose a spatial interaction implicit function. This +function effectively exploits the complementary spatial information from a +low-resolution voxel grid of the parametric model. Experimental results +demonstrate that HiLo outperforms the state-of-the-art methods by 10.43% and +9.54% in terms of Chamfer distance on the Thuman2.0 and CAPE datasets, +respectively. Additionally, HiLo demonstrates robustness to noise from the +parametric model, challenging poses, and various clothing styles. + +
+
+ comment: CVPR 2024 Accepted Paper +
+
+
+
+
+ + ☆ NeRF2Points: Large-Scale Point Cloud Generation From Street Views' + Radiance Field Optimization + + +
+ Neural Radiance Fields (NeRF) have emerged as a paradigm-shifting methodology +for the photorealistic rendering of objects and environments, enabling the +synthesis of novel viewpoints with remarkable fidelity. This is accomplished +through the strategic utilization of object-centric camera poses characterized +by significant inter-frame overlap. This paper explores a compelling, +alternative utility of NeRF: the derivation of point clouds from aggregated +urban landscape imagery. The transmutation of street-view data into point +clouds is fraught with complexities, attributable to a nexus of interdependent +variables. First, high-quality point cloud generation hinges on precise camera +poses, yet many datasets suffer from inaccuracies in pose metadata. Also, the +standard approach of NeRF is ill-suited for the distinct characteristics of +street-view data from autonomous vehicles in vast, open settings. Autonomous +vehicle cameras often record with limited overlap, leading to blurring, +artifacts, and compromised pavement representation in NeRF-based point clouds. +In this paper, we present NeRF2Points, a tailored NeRF variant for urban point +cloud synthesis, notable for its high-quality output from RGB inputs alone. Our +paper is supported by a bespoke, high-resolution 20-kilometer urban street +dataset, designed for point cloud generation and evaluation. NeRF2Points +adeptly navigates the inherent challenges of NeRF-based point cloud synthesis +through the implementation of the following strategic innovations: (1) +Integration of Weighted Iterative Geometric Optimization (WIGO) and Structure +from Motion (SfM) for enhanced camera pose accuracy, elevating street-view data +precision. (2) Layered Perception and Integrated Modeling (LPiM) is designed +for distinct radiance field modeling in urban environments, resulting in +coherent point cloud representations. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ Data Stream Sampling with Fuzzy Task Boundaries and Noisy Labels + + +
+ In the realm of continual learning, the presence of noisy labels within data
+streams represents a notable obstacle to model reliability and fairness. We
+focus on the data stream scenario outlined in pertinent literature,
+characterized by fuzzy task boundaries and noisy labels. To address this
+challenge, we introduce a novel and intuitive sampling method called Noisy Test
+Debiasing (NTD) to mitigate noisy labels in evolving data streams and establish
+a fair and robust continual learning algorithm. NTD is straightforward to
+implement, making it feasible across various scenarios. We benchmark on four
+datasets, including two synthetic-noise datasets (CIFAR10 and CIFAR100) and two
+real-world noise datasets (mini-WebVision and Food-101N). The results validate
+the efficacy of NTD for online continual learning in scenarios with noisy
+labels in data streams. Compared to the previous leading approach, NTD achieves
+a training speedup of more than two times while maintaining or surpassing
+accuracy levels. Moreover, NTD utilizes less than one-fifth of the GPU memory
+resources of previous leading methods.
+
+
+
+
+
+ + ☆ On the Learnability of Out-of-distribution Detection NeurIPS 2022 + + +
+ Supervised learning aims to train a classifier under the assumption that +training and test data are from the same distribution. To ease the above +assumption, researchers have studied a more realistic setting: +out-of-distribution (OOD) detection, where test data may come from classes that +are unknown during training (i.e., OOD data). Due to the unavailability and +diversity of OOD data, good generalization ability is crucial for effective OOD +detection algorithms, and corresponding learning theory is still an open +problem. To study the generalization of OOD detection, this paper investigates +the probably approximately correct (PAC) learning theory of OOD detection that +fits the commonly used evaluation metrics in the literature. First, we find a +necessary condition for the learnability of OOD detection. Then, using this +condition, we prove several impossibility theorems for the learnability of OOD +detection under some scenarios. Although the impossibility theorems are +frustrating, we find that some conditions of these impossibility theorems may +not hold in some practical scenarios. Based on this observation, we next give +several necessary and sufficient conditions to characterize the learnability of +OOD detection in some practical scenarios. Lastly, we offer theoretical support +for representative OOD detection works based on our OOD theory. + +
+
+ comment: Accepted by JMLR on 7 April 2024. This is a journal extension
+ of the previous NeurIPS 2022 Outstanding Paper "Is Out-of-distribution
+ Detection Learnable?" [arXiv:2210.14707]
+
+
+
+
+
+ + ☆ ByteEdit: Boost, Comply and Accelerate Generative Image Editing + + +
+ Recent advancements in diffusion-based generative image editing have sparked
+a profound revolution, reshaping the landscape of image outpainting and
+inpainting tasks. Despite these strides, the field grapples with inherent
+challenges, including: i) inferior quality; ii) poor consistency; iii)
+insufficient instruction adherence; iv) suboptimal generation efficiency. To
+address these obstacles, we present ByteEdit, an innovative feedback learning
+framework meticulously designed to Boost, Comply, and Accelerate Generative
+Image Editing tasks. ByteEdit seamlessly integrates image reward models
+dedicated to enhancing aesthetics and image-text alignment, while also
+introducing a dense, pixel-level reward model tailored to foster coherence in
+the output. Furthermore, we propose a pioneering adversarial and progressive
+feedback learning strategy to expedite the model's inference speed. Through
+extensive large-scale user evaluations, we demonstrate that ByteEdit surpasses
+leading generative image editing products, including Adobe, Canva, and MeiTu,
+in both generation quality and consistency. ByteEdit-Outpainting exhibits a
+remarkable enhancement of 388% and 135% in quality and consistency,
+respectively, when compared to the baseline model. Experiments also verified
+that our accelerated models maintain excellent performance in terms of quality
+and consistency.
+
+
+
+
+
+ + ☆ Msmsfnet: a multi-stream and multi-scale fusion net for edge detection + + +
+ Edge detection is a long-standing problem in computer vision. Recent deep
+learning based algorithms achieve state-of-the-art performance on publicly
+available datasets. Despite the efficiency of these algorithms, their
+performance, however, relies heavily on the pretrained weights of the backbone
+network on the ImageNet dataset. This heavily limits the design space of deep
+learning based edge detectors. Whenever we want to devise a new model, we have
+to train this new model on the ImageNet dataset first, and then fine-tune the
+model using the edge detection datasets. The comparison would be unfair
+otherwise. However, it is usually not feasible for many researchers to train a
+model on the ImageNet dataset due to limited computation resources. In this
+work, we study the performance that can be achieved by state-of-the-art deep
+learning based edge detectors on publicly available datasets when they are
+trained from scratch, and devise a new network architecture, the multi-stream
+and multi-scale fusion net (msmsfnet), for edge detection. We show in our
+experiments that, by training all models from scratch to ensure the fairness of
+comparison, our model outperforms state-of-the-art deep learning based edge
+detectors on three publicly available datasets.
+
+
+
+
+
+ + ☆ Task-Aware Encoder Control for Deep Video Compression CVPR 2024 + + +
+ Prior research on deep video compression (DVC) for machine tasks typically +necessitates training a unique codec for each specific task, mandating a +dedicated decoder per task. In contrast, traditional video codecs employ a +flexible encoder controller, enabling the adaptation of a single codec to +different tasks through mechanisms like mode prediction. Drawing inspiration +from this, we introduce an innovative encoder controller for deep video +compression for machines. This controller features a mode prediction and a +Group of Pictures (GoP) selection module. Our approach centralizes control at +the encoding stage, allowing for adaptable encoder adjustments across different +tasks, such as detection and tracking, while maintaining compatibility with a +standard pre-trained DVC decoder. Empirical evidence demonstrates that our +method is applicable across multiple tasks with various existing pre-trained +DVCs. Moreover, extensive experiments demonstrate that our method outperforms +previous DVC by about 25% bitrate for different tasks, with only one +pre-trained decoder. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ ShoeModel: Learning to Wear on the User-specified Shoes via Diffusion + Model + + +
+ With the development of large-scale diffusion models, Artificial
+Intelligence Generated Content (AIGC) techniques have become popular recently.
+However, how to truly make them serve our daily lives remains an open question.
+To this end, in this paper, we focus on employing AIGC techniques in one field
+of e-commerce marketing, i.e., generating hyper-realistic advertising images
+that display user-specified shoes worn by humans. Specifically, we propose a
+shoe-wearing system, called ShoeModel, to generate plausible images of human
+legs interacting with the given shoes. It consists of three modules: (1) a shoe
+wearable-area detection module (WD), (2) a leg-pose synthesis module (LpS) and
+the final (3) shoe-wearing image generation module (SW). The three modules are
+performed in sequential stages. Compared to baselines, our ShoeModel is shown
+to generalize better to different types of shoes and is able to keep the
+ID-consistency of the given shoes, as well as automatically produce reasonable
+interactions with humans. Extensive experiments show the effectiveness of our
+proposed shoe-wearing system. Figure 1 shows the input and output examples of
+our ShoeModel.
+
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Strictly-ID-Preserved and Controllable Accessory Advertising Image + Generation + + +
+ Customized generative text-to-image models have the ability to produce images +that closely resemble a given subject. However, in the context of generating +advertising images for e-commerce scenarios, it is crucial that the generated +subject's identity aligns perfectly with the product being advertised. In order +to address the need for strictly-ID preserved advertising image generation, we +have developed a Control-Net based customized image generation pipeline and +have taken earring model advertising as an example. Our approach facilitates a +seamless interaction between the earrings and the model's face, while ensuring +that the identity of the earrings remains intact. Furthermore, to achieve a +diverse and controllable display, we have proposed a multi-branch +cross-attention architecture, which allows for control over the scale, pose, +and appearance of the model, going beyond the limitations of text prompts. Our +method manages to achieve fine-grained control of the generated model's face, +resulting in controllable and captivating advertising effects. + +
+
+ comment: 22 pages +
+
+
+
+
+ + ☆ 3D Building Reconstruction from Monocular Remote Sensing Images with + Multi-level Supervisions CVPR 2024 + + +
+ 3D building reconstruction from monocular remote sensing images is an +important and challenging research problem that has received increasing +attention in recent years, owing to its low cost of data acquisition and +availability for large-scale applications. However, existing methods rely on +expensive 3D-annotated samples for fully-supervised training, restricting their +application to large-scale cross-city scenarios. In this work, we propose +MLS-BRN, a multi-level supervised building reconstruction network that can +flexibly utilize training samples with different annotation levels to achieve +better reconstruction results in an end-to-end manner. To alleviate the demand +on full 3D supervision, we design two new modules, Pseudo Building Bbox +Calculator and Roof-Offset guided Footprint Extractor, as well as new tasks and +training strategies for different types of samples. Experimental results on +several public and new datasets demonstrate that our proposed MLS-BRN achieves +competitive performance using much fewer 3D-annotated samples, and +significantly improves the footprint extraction and 3D reconstruction +performance compared with current state-of-the-art. The code and datasets of +this work will be released at https://github.com/opendatalab/MLS-BRN.git. + +
+
+ comment: accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Joint Reconstruction of 3D Human and Object via Contact-Based Refinement + Transformer CVPR 2024 + + +
+ Human-object contact serves as a strong cue to understand how humans +physically interact with objects. Nevertheless, it is not widely explored to +utilize human-object contact information for the joint reconstruction of 3D +human and object from a single image. In this work, we present a novel joint 3D +human-object reconstruction method (CONTHO) that effectively exploits contact +information between humans and objects. There are two core designs in our +system: 1) 3D-guided contact estimation and 2) contact-based 3D human and +object refinement. First, for accurate human-object contact estimation, CONTHO +initially reconstructs 3D humans and objects and utilizes them as explicit 3D +guidance for contact estimation. Second, to refine the initial reconstructions +of 3D human and object, we propose a novel contact-based refinement Transformer +that effectively aggregates human features and object features based on the +estimated human-object contact. The proposed contact-based refinement prevents +the learning of erroneous correlation between human and object, which enables +accurate 3D reconstruction. As a result, our CONTHO achieves state-of-the-art +performance in both human-object contact estimation and joint reconstruction of +3D human and object. The code is publicly available at +https://github.com/dqj5182/CONTHO_RELEASE. + +
+
+ comment: Published at CVPR 2024, 19 pages including the supplementary material +
+
+
+
+
+ + ☆ DWE+: Dual-Way Matching Enhanced Framework for Multimodal Entity Linking + + +
+ Multimodal entity linking (MEL) aims to utilize multimodal information
+(usually textual and visual information) to link ambiguous mentions to
+unambiguous entities in a knowledge base. Current methods face three main
+issues: (1) treating the entire image as input may introduce redundant
+information; (2) entity-related information, such as attributes in images, is
+insufficiently utilized; (3) there is semantic inconsistency between the entity
+in the knowledge base and its representation. To this end, we propose DWE+ for
+multimodal entity linking. DWE+ can capture finer semantics and dynamically
+maintain semantic consistency with entities. This is achieved by three aspects:
+(a) we introduce a method for extracting fine-grained image features by
+partitioning the image into multiple local objects. Then, hierarchical
+contrastive learning is used to further align semantics between coarse-grained
+information (text and image) and fine-grained information (mention and visual
+objects). (b) We explore ways to extract visual attributes from images to
+enhance the fused features, such as facial features and identity. (c) We
+leverage Wikipedia and ChatGPT to capture the entity representation, achieving
+semantic enrichment from both static and dynamic perspectives, which better
+reflects the real-world entity semantics. Experiments on the Wikimel,
+Richpedia, and Wikidiverse datasets demonstrate the effectiveness of DWE+ in
+improving MEL performance. Specifically, we optimize these datasets and achieve
+state-of-the-art performance on the enhanced datasets. The code and enhanced
+datasets are released at https://github.com/season1blue/DWET.
+
+
+ comment: under review on TOIS +
+
+
+
+
+ + ☆ MemFlow: Optical Flow Estimation and Prediction with Memory CVPR 2024 + + +
+ Optical flow is a classical task that is important to the vision
+community. Classical optical flow estimation uses two frames as input,
+whilst some recent methods consider multiple frames to explicitly model
+long-range information. The former limits the ability to fully leverage
+temporal coherence along the video sequence, while the latter incurs heavy
+computational overhead, typically making real-time flow estimation
+infeasible. Some multi-frame-based approaches even necessitate unseen
+future frames for current estimation, compromising real-time applicability
+in safety-critical scenarios. To this end, we present MemFlow, a real-time
+method for optical flow estimation and prediction with memory. Our method
+enables memory read-out and update modules for aggregating historical
+motion information in real time. Furthermore, we integrate
+resolution-adaptive re-scaling to accommodate diverse video resolutions.
+Besides, our approach seamlessly extends to the future prediction of
+optical flow based on past observations. Leveraging effective historical
+motion aggregation, our method outperforms VideoFlow with fewer parameters
+and faster inference speed on the Sintel and KITTI-15 datasets in terms of
+generalization performance. At the time of submission, MemFlow also leads
+in performance on the 1080p Spring dataset. Codes and models will be
+available at: https://dqiaole.github.io/MemFlow/.
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ D2SL: Decouple Defogging and Semantic Learning for Foggy Domain-Adaptive + Segmentation + + +
+ We investigated domain adaptive semantic segmentation in foggy weather +scenarios, which aims to enhance the utilization of unlabeled foggy data and +improve the model's adaptability to foggy conditions. Current methods rely on +clear images as references, jointly learning defogging and segmentation for +foggy images. Despite making some progress, there are still two main drawbacks: +(1) the coupling of segmentation and defogging feature representations, +resulting in a decrease in semantic representation capability, and (2) the +failure to leverage real fog priors in unlabeled foggy data, leading to +insufficient model generalization ability. To address these issues, we propose +a novel training framework, Decouple Defogging and Semantic learning, called +D2SL, aiming to alleviate the adverse impact of defogging tasks on the final +segmentation task. In this framework, we introduce a domain-consistent transfer +strategy to establish a connection between defogging and segmentation tasks. +Furthermore, we design a real fog transfer strategy to improve defogging +effects by fully leveraging the fog priors from real foggy images. Our approach +enhances the semantic representations required for segmentation during the +defogging learning process and maximizes the representation capability of fog +invariance by effectively utilizing real fog data. Comprehensive experiments +validate the effectiveness of the proposed method. + +
+
+
+
+
+ + ☆ Light the Night: A Multi-Condition Diffusion Framework for Unpaired + Low-Light Enhancement in Autonomous Driving CVPR 2024 + + +
+ Vision-centric perception systems for autonomous driving have gained +considerable attention recently due to their cost-effectiveness and +scalability, especially compared to LiDAR-based systems. However, these systems +often struggle in low-light conditions, potentially compromising their +performance and safety. To address this, our paper introduces LightDiff, a +domain-tailored framework designed to enhance the low-light image quality for +autonomous driving applications. Specifically, we employ a multi-condition +controlled diffusion model. LightDiff works without any human-collected paired +data, leveraging a dynamic data degradation process instead. It incorporates a +novel multi-condition adapter that adaptively controls the input weights from +different modalities, including depth maps, RGB images, and text captions, to +effectively illuminate dark scenes while maintaining context consistency. +Furthermore, to align the enhanced images with the detection model's knowledge, +LightDiff employs perception-specific scores as rewards to guide the diffusion +training process through reinforcement learning. Extensive experiments on the +nuScenes datasets demonstrate that LightDiff can significantly improve the +performance of several state-of-the-art 3D detectors in night-time conditions +while achieving high visual quality scores, highlighting its potential to +safeguard autonomous driving. + +
+
+ comment: This paper is accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Coordinated Sparse Recovery of Label Noise + + +
+ Label noise is a common issue in real-world datasets that inevitably
+impacts the generalization of models. This study focuses on robust
+classification tasks where the label noise is instance-dependent.
+Estimating the transition matrix accurately in this task is challenging,
+and methods based on sample selection often exhibit confirmation bias to
+varying degrees. Sparse over-parameterized training (SOP) has been shown to
+be theoretically effective in estimating and recovering label noise,
+offering a novel solution for noisy-label learning. However, this study
+empirically observes and verifies a technical flaw of SOP: the lack of
+coordination between model predictions and noise recovery leads to
+increased generalization error. To address this, we propose a method called
+Coordinated Sparse Recovery (CSR). CSR introduces a collaboration matrix
+and confidence weights to coordinate model predictions and noise recovery,
+reducing error leakage. Based on CSR, this study designs a joint sample
+selection strategy and constructs a comprehensive and powerful learning
+framework called CSR+. CSR+ significantly reduces confirmation bias,
+especially for datasets with more classes and a high proportion of
+instance-specific noise. Experimental results on simulated and real-world
+noisy datasets demonstrate that both CSR and CSR+ achieve outstanding
+performance compared to comparable methods.
+
+
+ comment: Pre-print prior to submission to journal +
+
+
+
+
+ + ☆ Few-Shot Object Detection: Research Advances and Challenges + + +
+ Object detection, a subfield of computer vision that aims to accurately
+identify and locate specific objects in images or videos, has achieved
+remarkable progress. Such methods rely on large-scale labeled training
+samples for each object category to ensure accurate detection, but
+obtaining extensive annotated data is a labor-intensive and expensive
+process in many real-world scenarios. To tackle this challenge, researchers
+have explored few-shot object detection (FSOD), which combines few-shot
+learning and object detection techniques to rapidly adapt to novel objects
+with limited annotated samples. This paper presents a comprehensive survey
+of the significant advancements in the field of FSOD in recent years and
+summarizes the existing challenges and solutions. Specifically, we first
+introduce the background and definition of FSOD to emphasize its potential
+value in advancing the field of computer vision. We then propose a novel
+FSOD taxonomy and survey the notable FSOD algorithms under this taxonomy to
+provide a comprehensive overview that facilitates a deeper understanding of
+the FSOD problem and the development of innovative solutions. Finally, we
+discuss the advantages and limitations of these algorithms to summarize the
+challenges, potential research directions, and development trends of object
+detection in data-scarce scenarios.
+
+
+
+
+
+ + ☆ Rethinking Diffusion Model for Multi-Contrast MRI Super-Resolution CVPR2024 + + +
+ Recently, diffusion models (DM) have been applied in magnetic resonance +imaging (MRI) super-resolution (SR) reconstruction, exhibiting impressive +performance, especially with regard to detailed reconstruction. However, the +current DM-based SR reconstruction methods still face the following issues: (1) +They require a large number of iterations to reconstruct the final image, which +is inefficient and consumes a significant amount of computational resources. +(2) The results reconstructed by these methods are often misaligned with the +real high-resolution images, leading to remarkable distortion in the +reconstructed MR images. To address the aforementioned issues, we propose an +efficient diffusion model for multi-contrast MRI SR, named as DiffMSR. +Specifically, we apply DM in a highly compact low-dimensional latent space to +generate prior knowledge with high-frequency detail information. The highly +compact latent space ensures that DM requires only a few simple iterations to +produce accurate prior knowledge. In addition, we design the Prior-Guide Large +Window Transformer (PLWformer) as the decoder for DM, which can extend the +receptive field while fully utilizing the prior knowledge generated by DM to +ensure that the reconstructed MR image remains undistorted. Extensive +experiments on public and clinical datasets demonstrate that our DiffMSR +outperforms state-of-the-art methods. + +
+
+ comment: 14 pages, 12 figures, Accepted by CVPR2024 +
+
+
+
+
+ + ☆ GenEARL: A Training-Free Generative Framework for Multimodal Event + Argument Role Labeling + + +
+ Multimodal event argument role labeling (EARL), a task that assigns a role
+to each event participant (object) in an image, is a complex challenge. It
+requires reasoning over the entire image, the depicted event, and the
+interactions between the various objects participating in the event.
+Existing models rely heavily on high-quality event-annotated training data
+to understand event semantics and structures, and they fail to generalize
+to new event types and domains. In this paper, we propose GenEARL, a
+training-free generative framework that harnesses the power of modern
+generative models to understand event task descriptions given image
+contexts and perform the EARL task. Specifically, GenEARL comprises two
+stages of generative prompting with a frozen vision-language model (VLM)
+and a frozen large language model (LLM). First, the generative VLM learns
+the semantics of the event argument roles and generates event-centric
+object descriptions based on the image. Subsequently, the LLM is prompted
+with the generated object descriptions and a predefined template for EARL
+(i.e., assigning an event argument role to an object). We show that GenEARL
+outperforms the contrastive pretraining (CLIP) baseline by 9.4% and 14.2%
+accuracy for zero-shot EARL on the M2E2 and SwiG datasets, respectively. In
+addition, we outperform CLIP-Event by 22% precision on the M2E2 dataset.
+The framework also allows flexible adaptation and generalization to unseen
+domains.
+
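To make the two-stage prompting flow above concrete, here is a minimal sketch of how a frozen VLM and a frozen LLM could be chained for EARL. The `vlm_generate` and `llm_generate` callables and the prompt wording are hypothetical placeholders for illustration, not the authors' actual templates.

```python
# Hypothetical sketch of two-stage, training-free generative prompting for
# EARL. `vlm_generate` and `llm_generate` stand in for any frozen
# vision-language / language model APIs; prompts are illustrative only.

def label_event_arguments(image, event_type, objects, role_inventory,
                          vlm_generate, llm_generate):
    """Assign an event argument role to each detected object in the image."""
    role_assignments = {}
    for idx, obj in enumerate(objects):  # e.g. region crops or boxes
        # Stage 1: the frozen VLM produces an event-centric object description.
        vlm_prompt = (f"The image depicts a '{event_type}' event. "
                      f"Describe the highlighted object and how it participates.")
        description = vlm_generate(image, region=obj, prompt=vlm_prompt)

        # Stage 2: the frozen LLM maps the description to an argument role.
        llm_prompt = (f"Event type: {event_type}\n"
                      f"Object description: {description}\n"
                      f"Candidate roles: {', '.join(role_inventory)}\n"
                      f"Answer with the single most plausible role.")
        role_assignments[idx] = llm_generate(llm_prompt).strip()
    return role_assignments
```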
+
+ comment: 20 pages, 15 Figures, 13 figures +
+
+
+
+
+ + ☆ X-VARS: Introducing Explainability in Football Refereeing with + Multi-Modal Large Language Model + + +
+ The rapid advancement of artificial intelligence has led to significant +improvements in automated decision-making. However, the increased performance +of models often comes at the cost of explainability and transparency of their +decision-making processes. In this paper, we investigate the capabilities of +large language models to explain decisions, using football refereeing as a +testing ground, given its decision complexity and subjectivity. We introduce +the Explainable Video Assistant Referee System, X-VARS, a multi-modal large +language model designed for understanding football videos from the point of +view of a referee. X-VARS can perform a multitude of tasks, including video +description, question answering, action recognition, and conducting meaningful +conversations based on video content and in accordance with the Laws of the +Game for football referees. We validate X-VARS on our novel dataset, +SoccerNet-XFoul, which consists of more than 22k video-question-answer triplets +annotated by over 70 experienced football referees. Our experiments and human +study illustrate the impressive capabilities of X-VARS in interpreting complex +football clips. Furthermore, we highlight the potential of X-VARS to reach +human performance and support football referees in the future. + +
+
+
+
+
+ + ☆ DWE+: Dual-Way Matching Enhanced Framework for Multimodal Entity Linking + + +
+ Multimodal entity linking (MEL) aims to utilize multimodal information
+(usually textual and visual information) to link ambiguous mentions to
+unambiguous entities in a knowledge base. Current methods face three main
+issues: (1) treating the entire image as input may introduce redundant
+information; (2) entity-related information, such as attributes in images,
+is insufficiently utilized; (3) there is semantic inconsistency between an
+entity in the knowledge base and its representation. To this end, we
+propose DWE+ for multimodal entity linking. DWE+ captures finer semantics
+and dynamically maintains semantic consistency with entities. This is
+achieved in three ways: (a) we introduce a method for extracting
+fine-grained image features by partitioning the image into multiple local
+objects; hierarchical contrastive learning is then used to further align
+semantics between coarse-grained information (text and image) and
+fine-grained information (mention and visual objects); (b) we explore ways
+to extract visual attributes from images, such as facial features and
+identity, to enhance the fused features; (c) we leverage Wikipedia and
+ChatGPT to capture the entity representation, achieving semantic enrichment
+from both static and dynamic perspectives, which better reflects real-world
+entity semantics. Experiments on the Wikimel, Richpedia, and Wikidiverse
+datasets demonstrate the effectiveness of DWE+ in improving MEL
+performance. Specifically, we optimize these datasets and achieve
+state-of-the-art performance on the enhanced datasets. The code and
+enhanced datasets are released at
+https://github.com/season1blue/DWET
+
+
+ comment: under review on TOIS. arXiv admin note: substantial text overlap with + arXiv:2312.11816 +
+
+
+
+
+ + ♻ ☆ PIGEON: Predicting Image Geolocations + + +
+ Planet-scale image geolocalization remains a challenging problem due to the +diversity of images originating from anywhere in the world. Although approaches +based on vision transformers have made significant progress in geolocalization +accuracy, success in prior literature is constrained to narrow distributions of +images of landmarks, and performance has not generalized to unseen places. We +present a new geolocalization system that combines semantic geocell creation, +multi-task contrastive pretraining, and a novel loss function. Additionally, +our work is the first to perform retrieval over location clusters for guess +refinements. We train two models for evaluations on street-level data and +general-purpose image geolocalization; the first model, PIGEON, is trained on +data from the game of Geoguessr and is capable of placing over 40% of its +guesses within 25 kilometers of the target location globally. We also develop a +bot and deploy PIGEON in a blind experiment against humans, ranking in the top +0.01% of players. We further challenge one of the world's foremost professional +Geoguessr players to a series of six matches with millions of viewers, winning +all six games. Our second model, PIGEOTTO, differs in that it is trained on a +dataset of images from Flickr and Wikipedia, achieving state-of-the-art results +on a wide range of image geolocalization benchmarks, outperforming the previous +SOTA by up to 7.7 percentage points on the city accuracy level and up to 38.8 +percentage points on the country level. Our findings suggest that PIGEOTTO is +the first image geolocalization model that effectively generalizes to unseen +places and that our approach can pave the way for highly accurate, planet-scale +image geolocalization systems. Our code is available on GitHub. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ MMSFormer: Multimodal Transformer for Material and Semantic Segmentation + + +
+ Leveraging information across diverse modalities is known to enhance +performance on multimodal segmentation tasks. However, effectively fusing +information from different modalities remains challenging due to the unique +characteristics of each modality. In this paper, we propose a novel fusion +strategy that can effectively fuse information from different modality +combinations. We also propose a new model named Multi-Modal Segmentation +TransFormer (MMSFormer) that incorporates the proposed fusion strategy to +perform multimodal material and semantic segmentation tasks. MMSFormer +outperforms current state-of-the-art models on three different datasets. As we +begin with only one input modality, performance improves progressively as +additional modalities are incorporated, showcasing the effectiveness of the +fusion block in combining useful information from diverse input modalities. +Ablation studies show that different modules in the fusion block are crucial +for overall model performance. Furthermore, our ablation studies also highlight +the capacity of different input modalities to improve performance in the +identification of different types of materials. The code and pretrained models +will be made available at https://github.com/csiplab/MMSFormer. + +
+
+ comment: Accepted by IEEE Open Journal of Signal Processing. 15 pages, 3 + figures, 9 tables +
+
+
+
+
+ + ♻ ☆ AG-ReID.v2: Bridging Aerial and Ground Views for Person + Re-identification + + +
+ Aerial-ground person re-identification (Re-ID) presents unique challenges in +computer vision, stemming from the distinct differences in viewpoints, poses, +and resolutions between high-altitude aerial and ground-based cameras. Existing +research predominantly focuses on ground-to-ground matching, with aerial +matching less explored due to a dearth of comprehensive datasets. To address +this, we introduce AG-ReID.v2, a dataset specifically designed for person Re-ID +in mixed aerial and ground scenarios. This dataset comprises 100,502 images of +1,615 unique individuals, each annotated with matching IDs and 15 soft +attribute labels. Data were collected from diverse perspectives using a UAV, +stationary CCTV, and smart glasses-integrated camera, providing a rich variety +of intra-identity variations. Additionally, we have developed an explainable +attention network tailored for this dataset. This network features a +three-stream architecture that efficiently processes pairwise image distances, +emphasizes key top-down features, and adapts to variations in appearance due to +altitude differences. Comparative evaluations demonstrate the superiority of +our approach over existing baselines. We plan to release the dataset and +algorithm source code publicly, aiming to advance research in this specialized +field of computer vision. For access, please visit +https://github.com/huynguyen792/AG-ReID.v2. + +
+
+ comment: 13 pages, Accepted by TIFS 2023 +
+
+
+
+
+ + ♻ ☆ Relightful Harmonization: Lighting-aware Portrait Background Replacement CVPR 2024 + + +
+ Portrait harmonization aims to composite a subject into a new background, +adjusting its lighting and color to ensure harmony with the background scene. +Existing harmonization techniques often only focus on adjusting the global +color and brightness of the foreground and ignore crucial illumination cues +from the background such as apparent lighting direction, leading to unrealistic +compositions. We introduce Relightful Harmonization, a lighting-aware diffusion +model designed to seamlessly harmonize sophisticated lighting effect for the +foreground portrait using any background image. Our approach unfolds in three +stages. First, we introduce a lighting representation module that allows our +diffusion model to encode lighting information from target image background. +Second, we introduce an alignment network that aligns lighting features learned +from image background with lighting features learned from panorama environment +maps, which is a complete representation for scene illumination. Last, to +further boost the photorealism of the proposed method, we introduce a novel +data simulation pipeline that generates synthetic training pairs from a diverse +range of natural images, which are used to refine the model. Our method +outperforms existing benchmarks in visual fidelity and lighting coherence, +showing superior generalization in real-world testing scenarios, highlighting +its versatility and practicality. + +
+
+ comment: CVPR 2024 camera ready +
+
+
+
+
+ + ♻ ☆ Zero-TPrune: Zero-Shot Token Pruning through Leveraging of the Attention + Graph in Pre-Trained Transformers CVPR + + +
+ Deployment of Transformer models on edge devices is becoming increasingly
+challenging due to the rapidly growing inference cost, which scales
+quadratically with the number of tokens in the input sequence. Token
+pruning is an emerging solution to address this challenge due to its ease
+of deployment on various Transformer backbones. However, most token pruning
+methods require computationally expensive fine-tuning, which is undesirable
+in many edge deployment cases. In this work, we propose Zero-TPrune, the
+first zero-shot method that considers both the importance and similarity of
+tokens in performing token pruning. It leverages the attention graph of
+pre-trained Transformer models to produce an importance distribution for
+tokens via our proposed Weighted Page Rank (WPR) algorithm. This
+distribution further guides token partitioning for efficient
+similarity-based pruning. Due to the elimination of the fine-tuning
+overhead, Zero-TPrune can prune large models at negligible computational
+cost, switch between different pruning configurations at no computational
+cost, and perform hyperparameter tuning efficiently. We evaluate the
+performance of Zero-TPrune on vision tasks by applying it to various vision
+Transformer backbones and testing them on ImageNet. Without any
+fine-tuning, Zero-TPrune reduces the FLOPs cost of DeiT-S by 34.7% and
+improves its throughput by 45.3% with only 0.4% accuracy loss. Compared
+with state-of-the-art pruning methods that require fine-tuning, Zero-TPrune
+not only eliminates the need for fine-tuning after pruning but also does so
+with only 0.1% accuracy loss. Compared with state-of-the-art
+fine-tuning-free pruning methods, Zero-TPrune reduces accuracy loss by up
+to 49% with similar FLOPs budgets. Project webpage:
+https://jha-lab.github.io/zerotprune.
+
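As a rough illustration of scoring tokens from an attention graph, the sketch below runs a simplified PageRank-style power iteration over a single attention matrix and keeps the top-scoring tokens. It is an assumption-laden simplification for intuition, not the paper's exact Weighted Page Rank algorithm or its similarity-based partitioning step.

```python
import numpy as np

def attention_importance(attn, num_iters=10):
    """Rank tokens with a PageRank-style score over one attention matrix.

    attn: (N, N) array where attn[i, j] is the attention paid by query i to
    key j, rows already softmax-normalised (single head, for simplicity).
    """
    n = attn.shape[0]
    scores = np.full(n, 1.0 / n)
    for _ in range(num_iters):
        # A token is important if already-important tokens attend to it.
        scores = scores @ attn
        scores /= scores.sum()
    return scores

def prune_tokens(tokens, attn, keep_ratio=0.7):
    """Keep the highest-scoring tokens; tokens is an (N, D) array."""
    scores = attention_importance(attn)
    keep = np.sort(np.argsort(scores)[::-1][: int(len(scores) * keep_ratio)])
    return tokens[keep], keep
```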
+
+ comment: IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) + 2024 +
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Turbulence Mitigation: A Translational Perspective CVPR 2024 + + +
+ Recovering images distorted by atmospheric turbulence is a challenging +inverse problem due to the stochastic nature of turbulence. Although numerous +turbulence mitigation (TM) algorithms have been proposed, their efficiency and +generalization to real-world dynamic scenarios remain severely limited. +Building upon the intuitions of classical TM algorithms, we present the Deep +Atmospheric TUrbulence Mitigation network (DATUM). DATUM aims to overcome major +challenges when transitioning from classical to deep learning approaches. By +carefully integrating the merits of classical multi-frame TM methods into a +deep network structure, we demonstrate that DATUM can efficiently perform +long-range temporal aggregation using a recurrent fashion, while deformable +attention and temporal-channel attention seamlessly facilitate pixel +registration and lucky imaging. With additional supervision, tilt and blur +degradation can be jointly mitigated. These inductive biases empower DATUM to +significantly outperform existing methods while delivering a tenfold increase +in processing speed. A large-scale training dataset, ATSyn, is presented as a +co-invention to enable generalization in real turbulence. Our code and datasets +are available at https://xg416.github.io/DATUM. + +
+
+ comment: Accepted by CVPR 2024, project page https://xg416.github.io/DATUM/ +
+
+
+
+
+ + ♻ ☆ Get a Grip: Reconstructing Hand-Object Stable Grasps in Egocentric + Videos + + +
+ We propose the task of Hand-Object Stable Grasp Reconstruction (HO-SGR), the +reconstruction of frames during which the hand is stably holding the object. We +first develop the stable grasp definition based on the intuition that the +in-contact area between the hand and object should remain stable. By analysing +the 3D ARCTIC dataset, we identify stable grasp durations and showcase that +objects in stable grasps move within a single degree of freedom (1-DoF). We +thereby propose a method to jointly optimise all frames within a stable grasp, +minimising object motions to a latent 1-DoF. Finally, we extend the knowledge +to in-the-wild videos by labelling 2.4K clips of stable grasps. Our proposed +EPIC-Grasps dataset includes 390 object instances of 9 categories, featuring +stable grasps from videos of daily interactions in 141 environments. Without 3D +ground truth, we use stable contact areas and 2D projection masks to assess the +HO-SGR task in the wild. We evaluate relevant methods and our approach +preserves significantly higher stable contact area, on both EPIC-Grasps and +stable grasp sub-sequences from the ARCTIC dataset. + +
+
+ comment: webpage: https://zhifanzhu.github.io/getagrip +
+
+
+
+
+ + ♻ ☆ DragDiffusion: Harnessing Diffusion Models for Interactive Point-based + Image Editing + + +
+ Accurate and controllable image editing is a challenging task that has +attracted significant attention recently. Notably, DragGAN is an interactive +point-based image editing framework that achieves impressive editing results +with pixel-level precision. However, due to its reliance on generative +adversarial networks (GANs), its generality is limited by the capacity of +pretrained GAN models. In this work, we extend this editing framework to +diffusion models and propose a novel approach DragDiffusion. By harnessing +large-scale pretrained diffusion models, we greatly enhance the applicability +of interactive point-based editing on both real and diffusion-generated images. +Our approach involves optimizing the diffusion latents to achieve precise +spatial control. The supervision signal of this optimization process is from +the diffusion model's UNet features, which are known to contain rich semantic +and geometric information. Moreover, we introduce two additional techniques, +namely LoRA fine-tuning and latent-MasaCtrl, to further preserve the identity +of the original image. Lastly, we present a challenging benchmark dataset +called DragBench -- the first benchmark to evaluate the performance of +interactive point-based image editing methods. Experiments across a wide range +of challenging cases (e.g., images with multiple objects, diverse object +categories, various styles, etc.) demonstrate the versatility and generality of +DragDiffusion. Code: https://github.com/Yujun-Shi/DragDiffusion. + +
+
+ comment: Code is released at https://github.com/Yujun-Shi/DragDiffusion +
+
+
+
+
+ + ♻ ☆ Demystifying CLIP Data + + +
+ Contrastive Language-Image Pre-training (CLIP) is an approach that has +advanced research and applications in computer vision, fueling modern +recognition systems and generative models. We believe that the main ingredient +to the success of CLIP is its data and not the model architecture or +pre-training objective. However, CLIP only provides very limited information +about its data and how it has been collected, leading to works that aim to +reproduce CLIP's data by filtering with its model parameters. In this work, we +intend to reveal CLIP's data curation approach and in our pursuit of making it +open to the community introduce Metadata-Curated Language-Image Pre-training +(MetaCLIP). MetaCLIP takes a raw data pool and metadata (derived from CLIP's +concepts) and yields a balanced subset over the metadata distribution. Our +experimental study rigorously isolates the model and training settings, +concentrating solely on data. MetaCLIP applied to CommonCrawl with 400M +image-text data pairs outperforms CLIP's data on multiple standard benchmarks. +In zero-shot ImageNet classification, MetaCLIP achieves 70.8% accuracy, +surpassing CLIP's 68.3% on ViT-B models. Scaling to 1B data, while maintaining +the same training budget, attains 72.4%. Our observations hold across various +model sizes, exemplified by ViT-H achieving 80.5%, without any +bells-and-whistles. Curation code and training data distribution on metadata is +made available at https://github.com/facebookresearch/MetaCLIP. + +
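A toy sketch of metadata-balanced curation in this spirit: match captions against metadata entries and cap how many pairs any single entry may contribute, so head concepts are flattened while tail concepts are kept. The substring matching rule and the cap value are illustrative assumptions, not the released curation code.

```python
from collections import defaultdict
import random

def curate_balanced_subset(pairs, metadata_entries, per_entry_cap=20_000):
    """Toy metadata-balancing sketch.

    pairs: list of (image_id, caption) tuples from a raw web pool.
    metadata_entries: list of concept strings; matching and cap are
    illustrative only.
    """
    buckets = defaultdict(list)
    for image_id, caption in pairs:
        text = caption.lower()
        for entry in metadata_entries:
            if entry in text:                 # naive substring match
                buckets[entry].append((image_id, caption))

    curated = []
    for entry, matched in buckets.items():
        random.shuffle(matched)
        curated.extend(matched[:per_entry_cap])  # flatten the head
    return curated
```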
+
+ comment: 17 pages. arXiv admin note: text overlap with arXiv:2103.00020 by + other authors +
+
+
+
+
+ + ♻ ☆ Mimicking the Oracle: An Initial Phase Decorrelation Approach for Class + Incremental Learning CVPR 2022 + + +
+ Class Incremental Learning (CIL) aims at learning a multi-class classifier
+in a phase-by-phase manner, in which only data of a subset of the classes
+are provided at each phase. Previous works mainly focus on mitigating
+forgetting in phases after the initial one. However, we find that improving
+CIL at its initial phase is also a promising direction. Specifically, we
+experimentally show that directly encouraging CIL Learner at the initial
+phase to output similar representations as the model jointly trained on all
+classes can greatly boost the CIL performance. Motivated by this, we study
+the difference between a naïvely-trained initial-phase model and the oracle
+model. Specifically, since one major difference between these two models is
+the number of training classes, we investigate how such difference affects
+the model representations. We find that, with fewer training classes, the
+data representations of each class lie in a long and narrow region; with
+more training classes, the representations of each class scatter more
+uniformly. Inspired by this observation, we propose Class-wise
+Decorrelation (CwD) that effectively regularizes representations of each
+class to scatter more uniformly, thus mimicking the model jointly trained
+with all classes (i.e., the oracle model). Our CwD is simple to implement
+and easy to plug into existing methods. Extensive experiments on various
+benchmark datasets show that CwD consistently and significantly improves
+the performance of existing state-of-the-art methods by around 1% to 3%.
+Code will be released.
+
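One plausible way to realize such a class-wise decorrelation penalty is to standardize the features of each class and penalize the off-diagonal entries of their correlation matrix, as sketched below; the paper's exact formulation may differ.

```python
import torch

def classwise_decorrelation_loss(features, labels, eps=1e-5):
    """Encourage per-class features to scatter more uniformly by penalising
    off-diagonal correlations. A plausible realisation of a class-wise
    decorrelation regulariser, not necessarily the paper's exact loss.

    features: (B, D) tensor, labels: (B,) tensor of class ids.
    """
    loss, num_classes = features.new_zeros(()), 0
    for c in labels.unique():
        f = features[labels == c]
        if f.shape[0] < 2:
            continue
        f = (f - f.mean(dim=0)) / (f.std(dim=0) + eps)     # standardise dims
        corr = (f.T @ f) / f.shape[0]                      # (D, D) correlation
        off_diag = corr - torch.diag(torch.diagonal(corr))
        loss = loss + (off_diag ** 2).sum() / corr.shape[0] ** 2
        num_classes += 1
    return loss / max(num_classes, 1)
```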
+
+ comment: CVPR 2022 Camera-Ready Version +
+
+
+
+
+ + ♻ ☆ Hidden in Plain Sight: Undetectable Adversarial Bias Attacks on + Vulnerable Patient Populations + + +
+ The proliferation of artificial intelligence (AI) in radiology has shed light +on the risk of deep learning (DL) models exacerbating clinical biases towards +vulnerable patient populations. While prior literature has focused on +quantifying biases exhibited by trained DL models, demographically targeted +adversarial bias attacks on DL models and its implication in the clinical +environment remains an underexplored field of research in medical imaging. In +this work, we demonstrate that demographically targeted label poisoning attacks +can introduce undetectable underdiagnosis bias in DL models. Our results across +multiple performance metrics and demographic groups like sex, age, and their +intersectional subgroups show that adversarial bias attacks demonstrate +high-selectivity for bias in the targeted group by degrading group model +performance without impacting overall model performance. Furthermore, our +results indicate that adversarial bias attacks result in biased DL models that +propagate prediction bias even when evaluated with external datasets. + +
+
+ comment: 29 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ NiteDR: Nighttime Image De-Raining with Cross-View Sensor Cooperative + Learning for Dynamic Driving Scenes + + +
+ In real-world environments, outdoor imaging systems are often affected by
+disturbances such as rain degradation. Especially in nighttime driving
+scenes, insufficient and uneven lighting shrouds the scenes in darkness,
+resulting in degradation of both image quality and visibility. Particularly
+in the field of autonomous driving, the visual perception ability of RGB
+sensors experiences a sharp decline in such harsh scenarios. Additionally,
+driving assistance systems suffer from reduced capabilities in capturing
+and discerning the surrounding environment, posing a threat to driving
+safety. Single-view information captured by single-modal sensors cannot
+comprehensively depict the entire scene. To address these challenges, we
+developed an image de-raining framework tailored for rainy nighttime
+driving scenes. It aims to remove rain artifacts, enrich scene
+representation, and restore useful information. Specifically, we introduce
+cooperative learning between visible and infrared images captured by
+different sensors. By cross-view fusion of these multi-source data, the
+scene within the images gains richer texture details and enhanced contrast.
+We constructed an information cleaning module called CleanNet as the first
+stage of our framework. Moreover, we designed an information fusion module
+called FusionNet as the second stage to fuse the clean visible images with
+infrared images. Using this stage-by-stage learning strategy, we obtain
+de-rained fusion images with higher quality and better visual perception.
+Extensive experiments demonstrate the effectiveness of our proposed
+Cross-View Cooperative Learning (CVCL) in adverse driving scenarios in
+low-light rainy environments. The proposed approach addresses the gap in
+the utilization of existing rain removal algorithms in specific low-light
+conditions.
+
+
+
+
+
+ + ♻ ☆ HiPose: Hierarchical Binary Surface Encoding and Correspondence Pruning + for RGB-D 6DoF Object Pose Estimation CVPR 2024 + + +
+ In this work, we present a novel dense-correspondence method for 6DoF object +pose estimation from a single RGB-D image. While many existing data-driven +methods achieve impressive performance, they tend to be time-consuming due to +their reliance on rendering-based refinement approaches. To circumvent this +limitation, we present HiPose, which establishes 3D-3D correspondences in a +coarse-to-fine manner with a hierarchical binary surface encoding. Unlike +previous dense-correspondence methods, we estimate the correspondence surface +by employing point-to-surface matching and iteratively constricting the surface +until it becomes a correspondence point while gradually removing outliers. +Extensive experiments on public benchmarks LM-O, YCB-V, and T-Less demonstrate +that our method surpasses all refinement-free methods and is even on par with +expensive refinement-based approaches. Crucially, our approach is +computationally efficient and enables real-time critical applications with high +accuracy requirements. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ CHAIN: Enhancing Generalization in Data-Efficient GANs via lipsCHitz + continuity constrAIned Normalization CVPR2024 + + +
+ Generative Adversarial Networks (GANs) significantly advanced image
+generation but their performance heavily depends on abundant training data.
+In scenarios with limited data, GANs often struggle with discriminator
+overfitting and unstable training. Batch Normalization (BN), despite being
+known for enhancing generalization and training stability, has rarely been
+used in the discriminator of Data-Efficient GANs. Our work addresses this
+gap by identifying a critical flaw in BN: the tendency for gradient
+explosion during the centering and scaling steps. To tackle this issue, we
+present CHAIN (lipsCHitz continuity constrAIned Normalization), which
+replaces the conventional centering step with zero-mean regularization and
+integrates a Lipschitz continuity constraint in the scaling step. CHAIN
+further enhances GAN training by adaptively interpolating the normalized
+and unnormalized features, effectively avoiding discriminator overfitting.
+Our theoretical analyses firmly establish CHAIN's effectiveness in reducing
+gradients in latent features and weights, improving stability and
+generalization in GAN training. Empirical evidence supports our theory.
+CHAIN achieves state-of-the-art results in data-limited scenarios on
+CIFAR-10/100, ImageNet, five low-shot and seven high-resolution few-shot
+image datasets. Code: https://github.com/MaxwellYaoNi/CHAIN
+
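A rough sketch of the two normalization ideas described above, assuming a simple (B, C) feature layout: the centering step is replaced by a zero-mean penalty on the batch mean, and the per-channel scale is clamped as a crude Lipschitz bound. This is an assumption-laden sketch, not the official implementation (see the linked repository).

```python
import torch
import torch.nn as nn

class ZeroMeanLipschitzNorm(nn.Module):
    """Sketch of a BN-like layer with (1) no explicit centering, replaced by
    a zero-mean penalty on the batch mean, and (2) a clamped per-channel
    scale as a crude Lipschitz bound. Not the official CHAIN code.
    """
    def __init__(self, num_features, max_scale=1.0, eps=1e-5):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(num_features))
        self.max_scale = max_scale
        self.eps = eps
        self.mean_penalty = torch.zeros(())   # refreshed each forward pass

    def forward(self, x):                     # x: (B, C) features
        batch_mean = x.mean(dim=0)
        batch_std = x.std(dim=0) + self.eps
        # (1) zero-mean regularisation instead of subtracting the mean.
        self.mean_penalty = (batch_mean ** 2).mean()
        # (2) clamp the effective per-channel scale.
        scale = torch.clamp(self.gamma / batch_std, max=self.max_scale)
        return x * scale
```

In training, `mean_penalty` would be added to the discriminator loss with some weight; that weighting is an assumption of this sketch.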
+
+ comment: Accepted by CVPR2024. 26 pages full version. Code: + https://github.com/MaxwellYaoNi/CHAIN +
+
+
+
+
+ + ♻ ☆ EVCap: Retrieval-Augmented Image Captioning with External Visual-Name + Memory for Open-World Comprehension CVPR 2024 + + +
+ Image captioning based on large language models (LLMs) has the capability
+of describing objects not explicitly observed in training data; yet novel
+objects occur frequently, necessitating up-to-date object knowledge for
+open-world comprehension. Instead of relying on large amounts of data
+and/or scaling up network parameters, we introduce a highly effective
+retrieval-augmented image captioning method that prompts LLMs with object
+names retrieved from an External Visual-name memory (EVCap). We build an
+ever-changing object knowledge memory using objects' visuals and names,
+enabling us to (i) update the memory at minimal cost and (ii) effortlessly
+augment LLMs with retrieved object names by utilizing a lightweight and
+fast-to-train model. Our model, which was trained only on the COCO dataset,
+can adapt to out-of-domain data without requiring additional fine-tuning or
+re-training. Our experiments on benchmarks and synthetic
+commonsense-violating data show that EVCap, with only 3.97M trainable
+parameters, exhibits superior performance compared to other methods based
+on frozen pre-trained LLMs. Its performance is also competitive with
+specialist SOTAs that require extensive training.
+
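A minimal sketch of the external visual-name memory lookup described above: retrieve the object names whose stored visual features are most similar to the query image feature and splice them into the captioning prompt. The feature layout, prompt wording, and top-k value are assumptions for illustration, not EVCap's actual pipeline.

```python
import numpy as np

def retrieve_object_names(image_feature, memory_features, memory_names, top_k=3):
    """Return names whose stored visual features are closest (cosine) to the
    query feature. Minimal sketch of an external visual-name memory lookup.

    image_feature: (D,) array; memory_features: (M, D); memory_names: list.
    """
    q = image_feature / (np.linalg.norm(image_feature) + 1e-8)
    m = memory_features / (np.linalg.norm(memory_features, axis=1, keepdims=True) + 1e-8)
    top = np.argsort(m @ q)[::-1][:top_k]
    return [memory_names[i] for i in top]

def build_caption_prompt(retrieved_names):
    # The retrieved names are fed to a frozen LLM (wording illustrative).
    return "Objects possibly present: " + ", ".join(retrieved_names) + ". Describe the image."
```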
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Cooperation Does Matter: Exploring Multi-Order Bilateral Relations for + Audio-Visual Segmentation CVPR 2024 + + +
+ Recently, an audio-visual segmentation (AVS) task has been introduced,
+aiming to group pixels with sounding objects within a given video. This
+task necessitates a first-ever audio-driven pixel-level understanding of
+the scene, posing significant challenges. In this paper, we propose an
+innovative audio-visual transformer framework, termed COMBO, an acronym for
+COoperation of Multi-order Bilateral relatiOns. For the first time, our
+framework explores three types of bilateral entanglements within AVS: pixel
+entanglement, modality entanglement, and temporal entanglement. Regarding
+pixel entanglement, we employ a Siam-Encoder Module (SEM) that leverages
+prior knowledge to generate more precise visual features from the
+foundational model. For modality entanglement, we design a Bilateral-Fusion
+Module (BFM), enabling COMBO to align corresponding visual and auditory
+signals bi-directionally. As for temporal entanglement, we introduce an
+innovative adaptive inter-frame consistency loss according to the inherent
+rules of temporal coherence. Comprehensive experiments and ablation studies
+on AVSBench-object (84.7 mIoU on S4, 59.2 mIoU on MS3) and
+AVSBench-semantic (42.1 mIoU on AVSS) datasets demonstrate that COMBO
+surpasses previous state-of-the-art methods. Code and more results will be
+publicly available at https://yannqi.github.io/AVS-COMBO/.
+
+
+ comment: CVPR 2024 Highlight. 13 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Video Anomaly Detection via Spatio-Temporal Pseudo-Anomaly Generation : + A Unified Approach CVPR + + +
+ Video Anomaly Detection (VAD) is an open-set recognition task, which is +usually formulated as a one-class classification (OCC) problem, where training +data is comprised of videos with normal instances while test data contains both +normal and anomalous instances. Recent works have investigated the creation of +pseudo-anomalies (PAs) using only the normal data and making strong assumptions +about real-world anomalies with regards to abnormality of objects and speed of +motion to inject prior information about anomalies in an autoencoder (AE) based +reconstruction model during training. This work proposes a novel method for +generating generic spatio-temporal PAs by inpainting a masked out region of an +image using a pre-trained Latent Diffusion Model and further perturbing the +optical flow using mixup to emulate spatio-temporal distortions in the data. In +addition, we present a simple unified framework to detect real-world anomalies +under the OCC setting by learning three types of anomaly indicators, namely +reconstruction quality, temporal irregularity and semantic inconsistency. +Extensive experiments on four VAD benchmark datasets namely Ped2, Avenue, +ShanghaiTech and UBnormal demonstrate that our method performs on par with +other existing state-of-the-art PAs generation and reconstruction based methods +under the OCC setting. Our analysis also examines the transferability and +generalisation of PAs across these datasets, offering valuable insights by +identifying real-world anomalies through PAs. + +
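The optical-flow side of the pseudo-anomaly generation described above can be illustrated with a small mixup perturbation between mismatched flow fields, as below; the Beta parameter and pairing strategy are illustrative assumptions, and the appearance stream (diffusion-based inpainting of a masked region) is not shown.

```python
import numpy as np

def mixup_flow_pseudo_anomaly(flow, other_flow, alpha=0.5, rng=None):
    """Perturb an optical-flow field by mixing it with a mismatched flow
    field, emulating temporal irregularity for pseudo-anomaly generation.

    flow, other_flow: (H, W, 2) arrays; alpha parameterises the Beta
    distribution used to sample the mixing coefficient (illustrative).
    """
    rng = rng or np.random.default_rng()
    lam = rng.beta(alpha, alpha)
    return lam * flow + (1.0 - lam) * other_flow
```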
+
+ comment: Accepted in CVPRW 2024 - VAND Workshop +
+
+
+
+
+ + ♻ ☆ Reconstruction and Simulation of Elastic Objects with Spring-Mass 3D + Gaussians + + +
+ Reconstructing and simulating elastic objects from visual observations is +crucial for applications in computer vision and robotics. Existing methods, +such as 3D Gaussians, model 3D appearance and geometry, but lack the ability to +estimate physical properties for objects and simulate them. The core challenge +lies in integrating an expressive yet efficient physical dynamics model. We +propose Spring-Gaus, a 3D physical object representation for reconstructing and +simulating elastic objects from videos of the object from multiple viewpoints. +In particular, we develop and integrate a 3D Spring-Mass model into 3D Gaussian +kernels, enabling the reconstruction of the visual appearance, shape, and +physical dynamics of the object. Our approach enables future prediction and +simulation under various initial states and environmental properties. We +evaluate Spring-Gaus on both synthetic and real-world datasets, demonstrating +accurate reconstruction and simulation of elastic objects. Project page: +https://zlicheng.com/spring_gaus. + +
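For readers unfamiliar with the underlying dynamics, the sketch below shows one explicit-Euler step of a generic 3D spring-mass system of the kind the paper attaches to 3D Gaussian kernels; it shows only the generic physics, not the paper's coupling to Gaussian parameters or its learned stiffness values.

```python
import numpy as np

def spring_mass_step(pos, vel, springs, rest_len, k, mass, dt, damping=0.99):
    """One explicit-Euler step of a generic spring-mass system.

    pos, vel: (N, 3) arrays; springs: (M, 2) integer index pairs;
    rest_len: (M,) rest lengths; k: stiffness; mass: per-point mass (scalar).
    """
    forces = np.zeros_like(pos)
    i, j = springs[:, 0], springs[:, 1]
    d = pos[j] - pos[i]                                    # (M, 3) spring vectors
    length = np.linalg.norm(d, axis=1, keepdims=True) + 1e-8
    f = k * (length - rest_len[:, None]) * (d / length)    # Hooke's law
    np.add.at(forces, i, f)
    np.add.at(forces, j, -f)
    vel = damping * (vel + dt * forces / mass)
    pos = pos + dt * vel
    return pos, vel
```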
+
+
+
+
+ + ♻ ☆ A Survey on Transformer Compression + + +
+ Transformer plays a vital role in the realms of natural language
+processing (NLP) and computer vision (CV), especially for constructing
+large language models (LLM) and large vision models (LVM). Model
+compression methods reduce the memory and computational cost of
+Transformer, which is a necessary step to implement large language/vision
+models on practical devices. Given the unique architecture of Transformer,
+featuring alternating attention and feedforward neural network (FFN)
+modules, specific compression techniques are usually required. The
+efficiency of these compression methods is also paramount, as retraining
+large models on the entire training dataset is usually impractical. This
+survey provides a comprehensive review of recent compression methods, with
+a specific focus on their application to Transformer-based models. The
+compression methods are primarily categorized into pruning, quantization,
+knowledge distillation, and efficient architecture design (Mamba, RetNet,
+RWKV, etc.). In each category, we discuss compression methods for both
+language and vision tasks, highlighting common underlying principles.
+Finally, we delve into the relation between various compression methods,
+and discuss further directions in this domain.
+
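As a concrete taste of one compression family covered by the survey, here is a minimal symmetric post-training int8 weight quantizer; it is a generic textbook example, not tied to any specific method discussed in the survey.

```python
import numpy as np

def quantize_int8_symmetric(weights):
    """Minimal symmetric post-training quantization of a weight tensor to int8."""
    scale = np.abs(weights).max() / 127.0 + 1e-12
    q = np.clip(np.round(weights / scale), -127, 127).astype(np.int8)
    return q, scale

def dequantize(q, scale):
    return q.astype(np.float32) * scale

# Example: quantization error on a random weight matrix.
w = np.random.randn(256, 256).astype(np.float32)
q, s = quantize_int8_symmetric(w)
print("mean abs error:", np.abs(dequantize(q, s) - w).mean())
```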
+
+ comment: Model Compression, Transformer, Large Language Model, Large Vision + Model, LLM +
+
+
+
+
+ + ♻ ☆ Linear Anchored Gaussian Mixture Model for Location and Width + Computation of Objects in Thick Line Shape + + +
+ Accurate detection of the centerlines of linear objects is a challenging
+topic in many sensitive real-world applications such as X-ray imaging,
+remote sensing, and lane-marking detection in road traffic. Model-based
+approaches using Hough and Radon transforms are often used but are not
+recommended for thick line detection, whereas approaches based on image
+derivatives need further step-by-step processing, making their efficiency
+dependent on the outcome of each step. In this paper, we aim to detect
+linear structures in images by considering the 3D representation of the
+image gray levels as a finite mixture model of a statistical distribution.
+The latter, which we name the linear anchored Gaussian distribution, is
+parametrized by a scale value ${\sigma}$ describing the linear structure
+thickness and a line equation, parametrized, in turn, by a radius ${\rho}$
+and an orientation angle ${\theta}$, describing the linear structure
+centerline location. The Expectation-Maximization (EM) algorithm is used
+for the mixture model parameter estimation, where a new paradigm, using
+background subtraction for the likelihood function computation, is
+proposed. For the EM algorithm, two ${\theta}$ parameter initialization
+schemes are used: the first is based on a random choice of the first
+component of the ${\theta}$ vector, whereas the second is based on the
+image Hessian with a simultaneous computation of the number of mixture
+model components. Experiments on real-world images and synthetic images
+corrupted by blur and additive noise show the good performance of the
+proposed methods, where the algorithm using background subtraction and
+Hessian-based ${\theta}$ initialization provides outstanding accuracy of
+linear structure detection despite irregular image backgrounds and the
+presence of blur and noise.
+
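One natural reading of the distribution described above is a Gaussian ridge over the pixel-to-line distance under the standard (rho, theta) line parameterization; the sketch below evaluates that form on a grid. The exact normalization used in the paper may differ, so treat this as an assumption-based illustration.

```python
import numpy as np

def linear_anchored_gaussian(xx, yy, rho, theta, sigma):
    """Unnormalised Gaussian ridge anchored on the line
    x*cos(theta) + y*sin(theta) = rho, with thickness controlled by sigma.
    Assumes the standard (rho, theta) line parameterisation implied by the
    abstract.
    """
    dist = xx * np.cos(theta) + yy * np.sin(theta) - rho
    return np.exp(-0.5 * (dist / sigma) ** 2)

# Example: a synthetic 128x128 image containing one thick line.
ys, xs = np.mgrid[0:128, 0:128]
image = linear_anchored_gaussian(xs, ys, rho=60.0, theta=np.deg2rad(30), sigma=3.0)
```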
+
+ comment: 13 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ UPNet: Uncertainty-based Picking Deep Learning Network for Robust First + Break Picking + + +
+ In seismic exploration, first break (FB) picking is a crucial aspect in
+the determination of subsurface velocity models, significantly influencing
+the placement of wells. Many deep neural network (DNN)-based automatic
+picking methods have been proposed to accelerate this processing. Notably,
+segmentation-based DNN methods provide a segmentation map and then estimate
+the FB from the map using a picking threshold. However, the uncertainty of
+the results picked by DNNs still needs to be analyzed. Thus, automatic
+picking methods applied to field datasets cannot ensure robustness,
+especially in the case of a low signal-to-noise ratio (SNR). In this paper,
+we introduce uncertainty quantification into the FB picking task and
+propose a novel uncertainty-based picking deep learning network called
+UPNet. UPNet not only estimates the uncertainty of the network output but
+can also filter out picks with low confidence. Extensive experiments show
+that UPNet exhibits higher accuracy and robustness than deterministic
+DNN-based models, achieving state-of-the-art (SOTA) performance in field
+surveys. In addition, we verify that the estimated uncertainty is
+meaningful and can provide a reference for human decision-making.
+
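Uncertainty-based filtering of picks can be illustrated generically: run several stochastic forward passes (e.g. MC dropout), and reject traces whose picks disagree. The sketch below is such a generic filter under that assumption, not UPNet's specific uncertainty estimator.

```python
import numpy as np

def filter_picks_by_uncertainty(pick_samples, max_std=2.0):
    """Keep only traces whose repeated stochastic picks agree.

    pick_samples: (T, n_traces) array of first-break picks (in samples) from
    T stochastic forward passes. Traces whose pick standard deviation
    exceeds `max_std` are flagged as low-confidence and rejected.
    """
    mean_pick = pick_samples.mean(axis=0)
    std_pick = pick_samples.std(axis=0)
    keep = std_pick <= max_std
    return mean_pick, std_pick, keep
```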
+
+
+
+
+ + ♻ ☆ UniEdit: A Unified Tuning-Free Framework for Video Motion and Appearance + Editing + + +
+ Recent advances in text-guided video editing have showcased promising results +in appearance editing (e.g., stylization). However, video motion editing in the +temporal dimension (e.g., from eating to waving), which distinguishes video +editing from image editing, is underexplored. In this work, we present UniEdit, +a tuning-free framework that supports both video motion and appearance editing +by harnessing the power of a pre-trained text-to-video generator within an +inversion-then-generation framework. To realize motion editing while preserving +source video content, based on the insights that temporal and spatial +self-attention layers encode inter-frame and intra-frame dependency +respectively, we introduce auxiliary motion-reference and reconstruction +branches to produce text-guided motion and source features respectively. The +obtained features are then injected into the main editing path via temporal and +spatial self-attention layers. Extensive experiments demonstrate that UniEdit +covers video motion editing and various appearance editing scenarios, and +surpasses the state-of-the-art methods. Our code will be publicly available. + +
+
+ comment: Project page: https://jianhongbai.github.io/UniEdit/ +
+
+
+
+
+ + ♻ ☆ SiCL: Silhouette-Driven Contrastive Learning for Unsupervised Person + Re-Identification with Clothes Change + + +
+ In this paper, we address a highly challenging yet critical task:
+unsupervised long-term person re-identification with clothes change.
+Existing unsupervised person re-id methods are mainly designed for
+short-term scenarios and usually rely on RGB cues, and thus fail to
+perceive feature patterns that are independent of the clothes. To crack
+this bottleneck, we propose a silhouette-driven contrastive learning (SiCL)
+method, which is designed to learn cross-clothes invariance by integrating
+both RGB cues and silhouette information within a contrastive learning
+framework. To our knowledge, this is the first tailor-made framework for
+unsupervised long-term clothes-change re-id, with superior performance on
+six benchmark datasets. We conduct extensive experiments to evaluate our
+proposed SiCL against state-of-the-art unsupervised person re-id methods
+across all the representative datasets. Experimental results demonstrate
+that our proposed SiCL significantly outperforms other unsupervised re-id
+methods.
+
+
+
+
+
+ + ♻ ☆ DetToolChain: A New Prompting Paradigm to Unleash Detection Ability of + MLLM + + +
+ We present DetToolChain, a novel prompting paradigm, to unleash the zero-shot +object detection ability of multimodal large language models (MLLMs), such as +GPT-4V and Gemini. Our approach consists of a detection prompting toolkit +inspired by high-precision detection priors and a new Chain-of-Thought to +implement these prompts. Specifically, the prompts in the toolkit are designed +to guide the MLLM to focus on regional information (e.g., zooming in), read +coordinates according to measure standards (e.g., overlaying rulers and +compasses), and infer from the contextual information (e.g., overlaying scene +graphs). Building upon these tools, the new detection chain-of-thought can +automatically decompose the task into simple subtasks, diagnose the +predictions, and plan for progressive box refinements. The effectiveness of our +framework is demonstrated across a spectrum of detection tasks, especially hard +cases. Compared to existing state-of-the-art methods, GPT-4V with our +DetToolChain improves state-of-the-art object detectors by +21.5% AP50 on MS +COCO Novel class set for open-vocabulary detection, +24.23% Acc on RefCOCO val +set for zero-shot referring expression comprehension, +14.5% AP on D-cube +describe object detection FULL setting. + +
+
+
+
+
+ + ♻ ☆ Self-Supervised Learning for Medical Image Data with Anatomy-Oriented + Imaging Planes + + +
+ Self-supervised learning has emerged as a powerful tool for pretraining deep +networks on unlabeled data, prior to transfer learning of target tasks with +limited annotation. The relevance between the pretraining pretext and target +tasks is crucial to the success of transfer learning. Various pretext tasks +have been proposed to utilize properties of medical image data (e.g., three +dimensionality), which are more relevant to medical image analysis than generic +ones for natural images. However, previous work rarely paid attention to data +with anatomy-oriented imaging planes, e.g., standard cardiac magnetic resonance +imaging views. As these imaging planes are defined according to the anatomy of +the imaged organ, pretext tasks effectively exploiting this information can +pretrain the networks to gain knowledge on the organ of interest. In this work, +we propose two complementary pretext tasks for this group of medical image data +based on the spatial relationship of the imaging planes. The first is to learn +the relative orientation between the imaging planes and implemented as +regressing their intersecting lines. The second exploits parallel imaging +planes to regress their relative slice locations within a stack. Both pretext +tasks are conceptually straightforward and easy to implement, and can be +combined in multitask learning for better representation learning. Thorough +experiments on two anatomical structures (heart and knee) and representative +target tasks (semantic segmentation and classification) demonstrate that the +proposed pretext tasks are effective in pretraining deep networks for +remarkably boosted performance on the target tasks, and superior to other +recent approaches. + +
+
+ comment: Medical Image Analysis +
+
+
+
+
+ + ♻ ☆ From Two-Stream to One-Stream: Efficient RGB-T Tracking via Mutual + Prompt Learning and Knowledge Distillation + + +
+ Due to the complementary nature of visible light and thermal infrared
+modalities, object tracking based on the fusion of visible light images and
+thermal images (referred to as RGB-T tracking) has received increasing
+attention from researchers in recent years. How to achieve a more
+comprehensive fusion of information from the two modalities at a lower cost
+has been an issue that researchers have been exploring. Inspired by visual
+prompt learning, we designed a novel two-stream RGB-T tracking architecture
+based on cross-modal mutual prompt learning, and used this model as a
+teacher to guide a one-stream student model for rapid learning through
+knowledge distillation techniques. Extensive experiments have shown that,
+compared to similar RGB-T trackers, our designed teacher model achieved the
+highest precision rate, while the student model, with a precision rate
+comparable to that of the teacher model, realized an inference speed more
+than three times faster than the teacher model. (Code will be made
+available if accepted.)
+
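The teacher-student setup described above builds on standard knowledge distillation; for reference, a temperature-scaled soft-target distillation loss in the style of Hinton et al. is sketched below. The paper's actual distillation targets (e.g. intermediate features or prompts) may differ.

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, targets, T=4.0, alpha=0.7):
    """Standard temperature-scaled knowledge distillation objective:
    a soft teacher-student KL term plus a hard-label cross-entropy term.
    Shown as generic background for teacher-student training.
    """
    soft = F.kl_div(
        F.log_softmax(student_logits / T, dim=-1),
        F.softmax(teacher_logits / T, dim=-1),
        reduction="batchmean",
    ) * (T * T)
    hard = F.cross_entropy(student_logits, targets)
    return alpha * soft + (1.0 - alpha) * hard
```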
+
+
+
+
+ + ♻ ☆ GS-SLAM: Dense Visual SLAM with 3D Gaussian Splatting CVPR 2024 + + +
+ In this paper, we introduce \textbf{GS-SLAM} that first utilizes 3D Gaussian +representation in the Simultaneous Localization and Mapping (SLAM) system. It +facilitates a better balance between efficiency and accuracy. Compared to +recent SLAM methods employing neural implicit representations, our method +utilizes a real-time differentiable splatting rendering pipeline that offers +significant speedup to map optimization and RGB-D rendering. Specifically, we +propose an adaptive expansion strategy that adds new or deletes noisy 3D +Gaussians in order to efficiently reconstruct new observed scene geometry and +improve the mapping of previously observed areas. This strategy is essential to +extend 3D Gaussian representation to reconstruct the whole scene rather than +synthesize a static object in existing methods. Moreover, in the pose tracking +process, an effective coarse-to-fine technique is designed to select reliable +3D Gaussian representations to optimize camera pose, resulting in runtime +reduction and robust estimation. Our method achieves competitive performance +compared with existing state-of-the-art real-time methods on the Replica, +TUM-RGBD datasets. Project page: https://gs-slam.github.io/. + +
+
+ comment: Accepted to CVPR 2024(highlight). Project Page: + https://gs-slam.github.io/ +
+
+
+
+
+ + ♻ ☆ PV-SSD: A Multi-Modal Point Cloud Feature Fusion Method for Projection + Features and Variable Receptive Field Voxel Features + + +
+ LiDAR-based 3D object detection and classification is crucial for
+autonomous driving. However, real-time inference from extremely sparse 3D
+data is a formidable challenge. To address this problem, a typical class of
+approaches casts the point cloud into a regular data representation (voxels
+or projection maps) and then performs feature extraction with convolutional
+neural networks. However, such methods often result in a certain degree of
+information loss due to down-sampling or over-compression of feature
+information. This paper proposes a multi-modal point cloud feature fusion
+method for projection features and variable receptive field voxel features
+(PV-SSD) based on projection and variable voxelization to solve the
+information loss problem. We design a two-branch feature extraction
+structure with a 2D convolutional neural network to extract the point
+cloud's projection features in bird's-eye view, focusing on the correlation
+between local features. A voxel feature extraction branch is used to
+extract local fine-grained features. Meanwhile, we propose a voxel feature
+extraction method with variable receptive fields to reduce the information
+loss of the voxel branch due to downsampling. It avoids missing critical
+point information by selecting more useful feature points based on feature
+point weights for the detection task. In addition, we propose a multi-modal
+feature fusion module for point clouds. To validate the effectiveness of
+our method, we tested it on the KITTI dataset and the ONCE dataset.
+
+
+
+
+
+ + ♻ ☆ LAKE-RED: Camouflaged Images Generation by Latent Background Knowledge + Retrieval-Augmented Diffusion CVPR 2024 + + +
+ Camouflaged vision perception is an important vision task with numerous +practical applications. Due to the expensive collection and labeling costs, +this community faces a major bottleneck: its datasets cover only a small +number of object species. However, the +existing camouflaged generation methods require specifying the background +manually, thus failing to extend the camouflaged sample diversity in a low-cost +manner. In this paper, we propose a Latent Background Knowledge +Retrieval-Augmented Diffusion (LAKE-RED) for camouflaged image generation. To +our knowledge, our contributions mainly include: (1) For the first time, we +propose a camouflaged generation paradigm that does not need to receive any +background inputs. (2) Our LAKE-RED is the first knowledge retrieval-augmented +method with interpretability for camouflaged generation, in which knowledge +retrieval and reasoning enhancement are explicitly separated to alleviate the +task-specific challenges. Moreover, our method is +not restricted to specific foreground targets or backgrounds, offering the +potential to extend camouflaged vision perception to more diverse domains. +(3) Experimental results demonstrate that our method outperforms the existing +approaches, generating more realistic camouflage images. + 
+
+ comment: Accepted by CVPR 2024, Fig.3 revised +
+
+
+
+
+ + ♻ ☆ Extending CLIP's Image-Text Alignment to Referring Image Segmentation NAACL 2024 + + +
+ Referring Image Segmentation (RIS) is a cross-modal task that aims to segment +an instance described by a natural language expression. Recent methods leverage +large-scale pretrained unimodal models as backbones along with fusion +techniques for joint reasoning across modalities. However, the inherent +cross-modal nature of RIS raises questions about the effectiveness of unimodal +backbones. We propose RISCLIP, a novel framework that effectively leverages the +cross-modal nature of CLIP for RIS. Observing CLIP's inherent alignment between +image and text features, we capitalize on this starting point and introduce +simple but strong modules that enhance unimodal feature extraction and leverage +rich alignment knowledge in CLIP's image-text shared-embedding space. RISCLIP +exhibits outstanding results on all three major RIS benchmarks and also +outperforms previous CLIP-based methods, demonstrating the efficacy of our +strategy in extending CLIP's image-text alignment to RIS. + +
+
+ comment: NAACL 2024 +
+
+
+
+
+ + ♻ ☆ Human Mesh Recovery from Arbitrary Multi-view Images + + +
+ Human mesh recovery from arbitrary multi-view images involves two +characteristics: the arbitrary camera poses and arbitrary number of camera +views. Because of the variability, designing a unified framework to tackle this +task is challenging. The challenges can be summarized as the dilemma of being +able to simultaneously estimate arbitrary camera poses and recover human mesh +from arbitrary multi-view images while maintaining flexibility. To solve this +dilemma, we propose a divide and conquer framework for Unified Human Mesh +Recovery (U-HMR) from arbitrary multi-view images. In particular, U-HMR +consists of a decoupled structure with three main components: camera and body +decoupling (CBD), camera pose estimation (CPE), and arbitrary view fusion +(AVF). As camera poses and human body mesh are independent of each other, CBD +splits the estimation of them into two sub-tasks for two individual +sub-networks (i.e., CPE and AVF) to handle respectively, thus the two sub-tasks +are disentangled. In CPE, since each camera pose is unrelated to the others, we +adopt a shared MLP to process all views in a parallel way. In AVF, in order to +fuse multi-view information and make the fusion operation independent of the +number of views, we introduce a transformer decoder with a SMPL parameter +query token to extract cross-view features for mesh recovery. To demonstrate +the efficacy and flexibility of the proposed framework and effect of each +component, we conduct extensive experiments on three public datasets: +Human3.6M, MPI-INF-3DHP, and TotalCapture. + 
+
+
+
+
+ + ♻ ☆ GP-NeRF: Generalized Perception NeRF for Context-Aware 3D Scene + Understanding CVPR 2024 + + +
+ Applying NeRF to downstream perception tasks for scene understanding and +representation is becoming increasingly popular. Most existing methods treat +semantic prediction as an additional rendering task, \textit{i.e.}, the "label +rendering" task, to build semantic NeRFs. However, by rendering +semantic/instance labels per pixel without considering the contextual +information of the rendered image, these methods usually suffer from unclear +boundary segmentation and abnormal segmentation of pixels within an object. To +solve this problem, we propose Generalized Perception NeRF (GP-NeRF), a novel +pipeline that makes the widely used segmentation model and NeRF work compatibly +under a unified framework, for facilitating context-aware 3D scene perception. +To accomplish this goal, we introduce transformers to aggregate radiance as +well as semantic embedding fields jointly for novel views and facilitate the +joint volumetric rendering of both fields. In addition, we propose two +self-distillation mechanisms, i.e., the Semantic Distill Loss and the +Depth-Guided Semantic Distill Loss, to enhance the discrimination and quality +of the semantic field and the maintenance of geometric consistency. In +evaluation, we conduct experimental comparisons under two perception tasks +(\textit{i.e.} semantic and instance segmentation) using both synthetic and +real-world datasets. Notably, our method outperforms SOTA approaches by 6.94\%, +11.76\%, and 8.47\% on generalized semantic segmentation, finetuning semantic +segmentation, and instance segmentation, respectively. + +
+
+ comment: CVPR 2024 (Highlight). Project Page: + https://lifuguan.github.io/gpnerf-pages/ +
+
+
+
+
+ + ♻ ☆ RaFE: Generative Radiance Fields Restoration + + +
+ NeRF (Neural Radiance Fields) has demonstrated tremendous potential in novel +view synthesis and 3D reconstruction, but its performance is sensitive to input +image quality, which struggles to achieve high-fidelity rendering when provided +with low-quality sparse input viewpoints. Previous methods for NeRF restoration +are tailored to a specific degradation type and lack generality. To overcome +this limitation, we propose a generic radiance fields +restoration pipeline, named RaFE, which applies to various types of +degradations, such as low resolution, blurriness, noise, compression artifacts, +or their combinations. Our approach leverages the success of off-the-shelf 2D +restoration methods to recover the multi-view images individually. Instead of +reconstructing a blurred NeRF by averaging inconsistencies, we introduce a +novel approach using Generative Adversarial Networks (GANs) for NeRF generation +to better accommodate the geometric and appearance inconsistencies present in +the multi-view images. Specifically, we adopt a two-level tri-plane +architecture, where the coarse level remains fixed to represent the low-quality +NeRF, and a fine-level residual tri-plane to be added to the coarse level is +modeled as a distribution with a GAN to capture potential variations in +restoration. We validate RaFE on both synthetic and real cases for various +restoration tasks, demonstrating superior performance in both quantitative and +qualitative evaluations, surpassing other 3D restoration methods specific to a +single task. Please see our project website +https://zkaiwu.github.io/RaFE-Project/. + 
+
+ comment: Project Page: https://zkaiwu.github.io/RaFE +
+
+
+
+
+ + ♻ ☆ Reduction of Class Activation Uncertainty with Background Information + + +
+ Multitask learning is a popular approach to training high-performing neural +networks with improved generalization. In this paper, we propose adding a +background class to achieve improved generalization at a lower computational +cost than multitask learning, helping researchers and organizations with limited +computation power. We also present a methodology for selecting background +images and discuss potential future improvements. We apply our approach to +several datasets and achieve improved generalization with much lower +computation. Through the class activation mappings (CAMs) of the trained +models, we observed that models trained with the +proposed methodology tend to look at a bigger picture. Applying the vision transformer with the +proposed background class, we achieve state-of-the-art (SOTA) performance on +the STL-10, Caltech-101, and CINIC-10 datasets. Example scripts are available in +the 'CAM' folder of the following GitHub Repository: github.com/dipuk0506/UQ + 
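The core idea of training with an explicit background class can be sketched as follows; the wrapper dataset, the label index, and the way the two datasets are mixed are illustrative assumptions rather than the paper's actual training code.

```python
import torch
from torch.utils.data import ConcatDataset, DataLoader

NUM_FOREGROUND_CLASSES = 10
BACKGROUND_LABEL = NUM_FOREGROUND_CLASSES  # extra (K+1)-th class

class BackgroundWrapper(torch.utils.data.Dataset):
    """Assigns every image from a pool of background photos the extra label."""
    def __init__(self, background_images):
        self.images = background_images

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        return self.images[idx], BACKGROUND_LABEL

# Hypothetical usage: train a (K+1)-way classifier on the union of the target
# dataset and the background pool, e.g.
#   head   = torch.nn.Linear(feature_dim, NUM_FOREGROUND_CLASSES + 1)
#   loader = DataLoader(ConcatDataset([target_dataset,
#                                      BackgroundWrapper(bg_images)]),
#                       batch_size=64, shuffle=True)
```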
+
+
+
+
+ + ♻ ☆ Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data CVPR 2024 + + +
+ This work presents Depth Anything, a highly practical solution for robust +monocular depth estimation. Without pursuing novel technical modules, we aim to +build a simple yet powerful foundation model dealing with any images under any +circumstances. To this end, we scale up the dataset by designing a data engine +to collect and automatically annotate large-scale unlabeled data (~62M), which +significantly enlarges the data coverage and thus is able to reduce the +generalization error. We investigate two simple yet effective strategies that +make data scaling-up promising. First, a more challenging optimization target +is created by leveraging data augmentation tools. It compels the model to +actively seek extra visual knowledge and acquire robust representations. +Second, an auxiliary supervision is developed to enforce the model to inherit +rich semantic priors from pre-trained encoders. We evaluate its zero-shot +capabilities extensively, including six public datasets and randomly captured +photos. It demonstrates impressive generalization ability. Further, through +fine-tuning it with metric depth information from NYUv2 and KITTI, new SOTAs +are set. Our better depth model also results in a better depth-conditioned +ControlNet. Our models are released at +https://github.com/LiheYoung/Depth-Anything. + +
+
+ comment: Accepted by CVPR 2024. Project page: https://depth-anything.github.io +
+
+
+
+
+ + ♻ ☆ StepNet: Spatial-temporal Part-aware Network for Isolated Sign Language + Recognition + + +
+ The goal of sign language recognition (SLR) is to help those who are hard of +hearing or deaf overcome the communication barrier. Most existing approaches +can typically be divided into two lines, i.e., skeleton-based and RGB-based +methods, but both lines have limitations. +Skeleton-based methods do not consider facial expressions, while RGB-based +approaches usually ignore the fine-grained hand structure. To overcome both +limitations, we propose a new framework called Spatial-temporal Part-aware +network~(StepNet), based on RGB parts. As its name suggests, it is made up of +two modules: Part-level Spatial Modeling and Part-level Temporal Modeling. +Part-level Spatial Modeling, in particular, automatically captures the +appearance-based properties, such as hands and faces, in the feature space +without the use of any keypoint-level annotations. On the other hand, +Part-level Temporal Modeling implicitly mines the long- and short-term context to +capture the relevant attributes over time. Extensive experiments demonstrate +that our StepNet, thanks to its spatial-temporal modules, achieves competitive +Top-1 Per-instance accuracy on three commonly-used SLR benchmarks, i.e., 56.89% +on WLASL, 77.2% on NMFs-CSL, and 77.1% on BOBSL. Additionally, the proposed +method is compatible with optical flow input and can produce superior +performance if fused. For those who are hard of hearing, we hope that our work +can act as a preliminary step. + 
+
+
+
+
+ + ♻ ☆ DeepAAT: Deep Automated Aerial Triangulation for Fast UAV-based Mapping + + +
+ Automated Aerial Triangulation (AAT), aiming to restore image pose and +reconstruct sparse points simultaneously, plays a pivotal role in earth +observation. With its rich research heritage spanning several decades in +photogrammetry, AAT has evolved into a fundamental process widely applied in +large-scale Unmanned Aerial Vehicle (UAV) based mapping. Despite its +advancements, classic AAT methods still face challenges like low efficiency and +limited robustness. This paper introduces DeepAAT, a deep learning network +designed specifically for AAT of UAV imagery. DeepAAT considers both spatial +and spectral characteristics of imagery, enhancing its capability to resolve +erroneous matching pairs and accurately predict image poses. DeepAAT marks a +significant leap in AAT's efficiency, ensuring thorough scene coverage and +precision. Its processing speed outpaces incremental AAT methods by hundreds of +times and global AAT methods by tens of times while maintaining a comparable +level of reconstruction accuracy. Additionally, DeepAAT's scene clustering and +merging strategy facilitate rapid localization and pose determination for +large-scale UAV images, even under constrained computing resources. The +experimental results demonstrate DeepAAT's substantial improvements over +conventional AAT methods, highlighting its potential in the efficiency and +accuracy of UAV-based 3D reconstruction tasks. To benefit the photogrammetry +society, the code of DeepAAT will be released at: +https://github.com/WHU-USI3DV/DeepAAT. + +
+
+
+
+
+ + ♻ ☆ UniPAD: A Universal Pre-training Paradigm for Autonomous Driving CVPR2024 + + +
+ In the context of autonomous driving, the significance of effective feature +learning is widely acknowledged. While conventional 3D self-supervised +pre-training methods have shown widespread success, most methods follow the +ideas originally designed for 2D images. In this paper, we present UniPAD, a +novel self-supervised learning paradigm applying 3D volumetric differentiable +rendering. UniPAD implicitly encodes 3D space, facilitating the reconstruction +of continuous 3D shape structures and the intricate appearance characteristics +of their 2D projections. The flexibility of our method enables seamless +integration into both 2D and 3D frameworks, enabling a more holistic +comprehension of the scenes. We manifest the feasibility and effectiveness of +UniPAD by conducting extensive experiments on various downstream 3D tasks. Our +method significantly improves lidar-, camera-, and lidar-camera-based baseline +by 9.1, 7.7, and 6.9 NDS, respectively. Notably, our pre-training pipeline +achieves 73.2 NDS for 3D object detection and 79.4 mIoU for 3D semantic +segmentation on the nuScenes validation set, achieving state-of-the-art results +in comparison with previous methods. The code will be available at +https://github.com/Nightmare-n/UniPAD. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ CityGaussian: Real-time High-quality Large-Scale Scene Rendering with + Gaussians + + +
+ The advancement of real-time 3D scene reconstruction and novel view synthesis +has been significantly propelled by 3D Gaussian Splatting (3DGS). However, +effectively training large-scale 3DGS and rendering it in real-time across +various scales remains challenging. This paper introduces CityGaussian +(CityGS), which employs a novel divide-and-conquer training approach and +Level-of-Detail (LoD) strategy for efficient large-scale 3DGS training and +rendering. Specifically, the global scene prior and adaptive training data +selection enable efficient training and seamless fusion. Based on fused +Gaussian primitives, we generate different detail levels through compression, +and realize fast rendering across various scales through the proposed +block-wise detail levels selection and aggregation strategy. Extensive +experimental results on large-scale scenes demonstrate that our approach +attains state-of-the-art rendering quality, enabling consistent real-time +rendering of large-scale scenes across vastly different scales. Our project page +is available at https://dekuliutesla.github.io/citygs/. + 
+
+ comment: Project Page: https://dekuliutesla.github.io/citygs/ +
+
+
+
+
+ + ♻ ☆ Towards AI-Architecture Liberty: A Comprehensive Survey on Designing and + Collaborating Virtual Architecture by Deep Learning in the Metaverse + + +
+ 3D shape generation techniques leveraging deep learning have garnered +significant interest from both the computer vision and architectural design +communities, promising to enrich the content of the future metaverse. However, +research on virtual architectural design remains limited, particularly +regarding human-AI collaboration and deep learning-assisted design. We first +illuminate the principles, generation techniques, and current literature of +virtual architecture, focusing on challenges such as datasets, multimodality, +design intuition, and generative frameworks. In our survey, we reviewed 187 +related articles (80.7\% of articles published between 2018 and 2022) covering +architectural research, virtual environments, and technical approaches. This +survey investigates the latest approaches to 3D object generation with deep +generative models (DGMs) and summarizes four characteristics of deep-learning +generation approaches for virtual architecture. According to our analysis of +the survey, we expound on four research agendas, including agency, +communication, user consideration, and integrating tools, and highlight three +important enablers of ubiquitous interaction with immersive systems in deep +learning-assisted architectural generation. Our work contributes to fostering +understanding between designers and deep learning techniques, broadening access +to human-AI collaboration. We advocate for interdisciplinary efforts to address +this timely research topic, facilitating content designing and generation in +the metaverse. + +
+
+ comment: 37 pages, 9 figures, and 5 tables +
+
+
+
+
+ + ♻ ☆ ARS-DETR: Aspect Ratio-Sensitive Detection Transformer for Aerial + Oriented Object Detection + + +
+ Existing oriented object detection methods commonly use metric AP$_{50}$ to +measure the performance of the model. We argue that AP$_{50}$ is inherently +unsuitable for oriented object detection due to its large tolerance in angle +deviation. Therefore, we advocate using high-precision metric, e.g. AP$_{75}$, +to measure the performance of models. In this paper, we propose an Aspect Ratio +Sensitive Oriented Object Detector with Transformer, termed ARS-DETR, which +exhibits a competitive performance in high-precision oriented object detection. +Specifically, a new angle classification method, called Aspect Ratio aware +Circle Smooth Label (AR-CSL), is proposed to smooth the angle label in a more +reasonable way and discard the hyperparameter introduced by previous work +(e.g. CSL). Then, a rotated deformable attention module is designed to rotate +the sampling points with the corresponding angles and eliminate the +misalignment between region features and sampling points. Moreover, a dynamic +weight coefficient according to the aspect ratio is adopted to calculate the +angle loss. Comprehensive experiments on several challenging datasets show that +our method achieves competitive performance on the high-precision oriented +object detection task. + 
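A hedged sketch of the aspect-ratio-dependent weighting of the angle loss mentioned above: near-square boxes are insensitive to angle errors and get a small weight, elongated boxes a larger one. The specific weighting function and the smooth-L1 base loss are illustrative choices, not the formula used in ARS-DETR.

```python
import torch
import torch.nn.functional as F

def aspect_ratio_weight(w, h, eps=1e-6):
    """Maps aspect ratio to a weight in [0, 1): 0 for squares, approaching 1
    for very elongated boxes. An illustrative mapping, not the paper's."""
    ratio = torch.maximum(w, h) / (torch.minimum(w, h) + eps)
    return 1.0 - 1.0 / ratio

def weighted_angle_loss(pred_angle, gt_angle, w, h):
    # Per-box angle regression loss scaled by how much the angle matters.
    base = F.smooth_l1_loss(pred_angle, gt_angle, reduction="none")
    return (aspect_ratio_weight(w, h) * base).mean()
```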
+
+ comment: 15 pages, 13 figures, 13 tables, the source code is available at + https://github.com/httle/ARS-DETR +
+
+
+
+
+ + ♻ ☆ Bi-LORA: A Vision-Language Approach for Synthetic Image Detection + + +
+ Advancements in deep image synthesis techniques, such as generative +adversarial networks (GANs) and diffusion models (DMs), have ushered in an era +of generating highly realistic images. While this technological progress has +captured significant interest, it has also raised concerns about the potential +difficulty in distinguishing real images from their synthetic counterparts. +This paper takes inspiration from the potent convergence capabilities between +vision and language, coupled with the zero-shot nature of vision-language +models (VLMs). We introduce an innovative method called Bi-LORA that leverages +VLMs, combined with low-rank adaptation (LORA) tuning techniques, to enhance +the precision of synthetic image detection for unseen model-generated images. +The pivotal conceptual shift in our methodology revolves around reframing +binary classification as an image captioning task, leveraging the distinctive +capabilities of cutting-edge VLM, notably bootstrapping language image +pre-training (BLIP2). Rigorous and comprehensive experiments are conducted to +validate the effectiveness of our proposed approach, particularly in detecting +unseen diffusion-generated images from unknown diffusion-based generative +models during training, showcasing robustness to noise, and demonstrating +generalization capabilities to GANs. The obtained results showcase an +impressive average accuracy of 93.41% in synthetic image detection on unseen +generation models. The code and models associated with this research can be +publicly accessed at https://github.com/Mamadou-Keita/VLM-DETECT. + +
+
+
+
+
+ + ♻ ☆ Sketch3D: Style-Consistent Guidance for Sketch-to-3D Generation + + +
+ Recently, image-to-3D approaches have achieved significant results with a +natural image as input. However, it is not always possible to access these +enriched color input samples in practical applications, where only sketches are +available. Existing sketch-to-3D researches suffer from limitations in broad +applications due to the challenges of lacking color information and multi-view +content. To overcome them, this paper proposes a novel generation paradigm +Sketch3D to generate realistic 3D assets with shape aligned with the input +sketch and color matching the textual description. Concretely, Sketch3D first +instantiates the given sketch in the reference image through the +shape-preserving generation process. Second, the reference image is leveraged +to deduce a coarse 3D Gaussian prior, and multi-view style-consistent guidance +images are generated based on the renderings of the 3D Gaussians. Finally, +three strategies are designed to optimize 3D Gaussians, i.e., structural +optimization via a distribution transfer mechanism, color optimization with a +straightforward MSE loss and sketch similarity optimization with a CLIP-based +geometric similarity loss. Extensive visual comparisons and quantitative +analysis illustrate the advantage of our Sketch3D in generating realistic 3D +assets while preserving consistency with the input. + +
+
+
+
+
+ + ♻ ☆ Training Like a Medical Resident: Context-Prior Learning Toward + Universal Medical Image Segmentation CVPR 2024 + + +
+ A major focus of clinical imaging workflow is disease diagnosis and +management, leading to medical imaging datasets strongly tied to specific +clinical objectives. This scenario has led to the prevailing practice of +developing task-specific segmentation models, without gaining insights from +widespread imaging cohorts. Inspired by the training program of medical +radiology residents, we propose a shift towards universal medical image +segmentation, a paradigm aiming to build medical image understanding foundation +models by leveraging the diversity and commonality across clinical targets, +body regions, and imaging modalities. Towards this goal, we develop Hermes, a +novel context-prior learning approach to address the challenges of data +heterogeneity and annotation differences in medical image segmentation. In a +large collection of eleven diverse datasets (2,438 3D images) across five +modalities (CT, PET, T1, T2 and cine MRI) and multiple body regions, we +demonstrate the merit of the universal paradigm over the traditional paradigm +on addressing multiple tasks within a single model. By exploiting the synergy +across tasks, Hermes achieves state-of-the-art performance on all testing +datasets and shows superior model scalability. Results on two additional +datasets reveal Hermes' strong performance for transfer learning, incremental +learning, and generalization to downstream tasks. Hermes' learned priors +demonstrate an appealing ability to reflect the intricate relations among tasks +and modalities, which aligns with the established anatomical and imaging +principles in radiology. The code is available: +https://github.com/yhygao/universal-medical-image-segmentation. + 
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Tailored Visions: Enhancing Text-to-Image Generation with Personalized + Prompt Rewriting CVPR 2024 + + +
+ Despite significant progress in the field, it is still challenging to create +personalized visual representations that align closely with the desires and +preferences of individual users. This process requires users to articulate +their ideas in words that are both comprehensible to the models and accurately +capture their vision, posing difficulties for many users. In this paper, we +tackle this challenge by leveraging historical user interactions with the +system to enhance user prompts. We propose a novel approach that involves +rewriting user prompts based on a newly collected large-scale text-to-image +dataset with over 300k prompts from 3115 users. Our rewriting model enhances +the expressiveness and alignment of user prompts with their intended visual +outputs. Experimental results demonstrate the superiority of our methods over +baseline approaches, as evidenced in our new offline evaluation method and +online tests. Our code and dataset are available at +https://github.com/zzjchen/Tailored-Visions. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ GraphAD: Interaction Scene Graph for End-to-end Autonomous Driving + + +
+ Modeling complicated interactions among the ego-vehicle, road agents, and map +elements has been a crucial part for safety-critical autonomous driving. +Previous works on end-to-end autonomous driving rely on the attention mechanism +for handling heterogeneous interactions, which fails to capture the geometric +priors and is also computationally intensive. In this paper, we propose the +Interaction Scene Graph (ISG) as a unified method to model the interactions +among the ego-vehicle, road agents, and map elements. With the representation +of the ISG, the driving agents aggregate essential information from the most +influential elements, including the road agents with potential collisions and +the map elements to follow. Since a mass of unnecessary interactions are +omitted, the more efficient scene-graph-based framework is able to focus on +indispensable connections and leads to better performance. We evaluate the +proposed method for end-to-end autonomous driving on the nuScenes dataset. +Compared with strong baselines, our method achieves significantly better +performance in full-stack driving tasks, including perception, prediction, and planning. Code +will be released at https://github.com/zhangyp15/GraphAD. + 
+
+ comment: project page: https://github.com/zhangyp15/GraphAD +
+
+
+
+
+ + ♻ ☆ Towards a Simultaneous and Granular Identity-Expression Control in + Personalized Face Generation + + +
+ In human-centric content generation, the pre-trained text-to-image models +struggle to produce user-wanted portrait images, which retain the identity of +individuals while exhibiting diverse expressions. This paper introduces our +efforts towards personalized face generation. To this end, we propose a novel +multi-modal face generation framework, capable of simultaneous +identity-expression control and more fine-grained expression synthesis. Our +expression control is so sophisticated that it can be specialized by the +fine-grained emotional vocabulary. We devise a novel diffusion model that can +undertake the task of simultaneous face swapping and reenactment. Due to the +entanglement of identity and expression, it's nontrivial to separately and +precisely control them in one framework, and thus this has not been explored yet. To +overcome this, we propose several innovative designs in the conditional +diffusion model, including balanced identity and expression encoders, improved +midpoint sampling, and explicit background conditioning. Extensive +experiments have demonstrated the controllability and scalability of the +proposed framework, in comparison with state-of-the-art text-to-image, face +swapping, and face reenactment methods. + 
+
+
+
+
+ + ♻ ☆ FineDiffusion: Scaling up Diffusion Models for Fine-grained Image + Generation with 10,000 Classes + + +
+ The class-conditional image generation based on diffusion models is renowned +for generating high-quality and diverse images. However, most prior efforts +focus on generating images for general categories, e.g., 1000 classes in +ImageNet-1k. A more challenging task, large-scale fine-grained image +generation, remains largely unexplored. In this work, we present a +parameter-efficient strategy, called FineDiffusion, to fine-tune large +pre-trained diffusion models scaling to large-scale fine-grained image +generation with 10,000 categories. FineDiffusion significantly accelerates +training and reduces storage overhead by fine-tuning only the tiered class +embedder, bias terms, and normalization layers' parameters. To further improve +the image generation quality of fine-grained categories, we propose a novel +sampling method for fine-grained image generation, which utilizes +superclass-conditioned guidance, specifically tailored for fine-grained +categories, to replace the conventional classifier-free guidance sampling. +Compared to full fine-tuning, FineDiffusion achieves a remarkable 1.56x +training speed-up and requires storing merely 1.77% of the total model +parameters, while achieving state-of-the-art FID of 9.776 on image generation +of 10,000 classes. Extensive qualitative and quantitative experiments +demonstrate the superiority of our method compared to other parameter-efficient +fine-tuning methods. The code and more generated results are available at our +project website: https://finediffusion.github.io/. + 
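The parameter-efficient recipe of fine-tuning only the class embedder, bias terms, and normalization layers can be sketched as below. The keyword used to locate the class-embedding parameters is an assumption about the model's parameter names, not FineDiffusion's actual code.

```python
import torch.nn as nn

NORM_TYPES = (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm2d)

def mark_trainable(model: nn.Module, class_embed_keyword: str = "class_embed"):
    """Freeze the backbone; unfreeze only biases, normalization layers, and the
    class-embedding parameters. `class_embed_keyword` is a hypothetical name
    pattern used to locate the embedder in the parameter names."""
    for p in model.parameters():
        p.requires_grad = False
    for module in model.modules():
        if isinstance(module, NORM_TYPES):
            for p in module.parameters():
                p.requires_grad = True
    for name, p in model.named_parameters():
        if name.endswith(".bias") or class_embed_keyword in name:
            p.requires_grad = True
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"trainable: {trainable} / {total} ({100 * trainable / total:.2f}%)")
```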
+
+
+
+
+ + ♻ ☆ Feature Re-Embedding: Towards Foundation Model-Level Performance in + Computational Pathology CVPR2024 + + +
+ Multiple instance learning (MIL) is the most widely used framework in +computational pathology, encompassing sub-typing, diagnosis, prognosis, and +more. However, the existing MIL paradigm typically requires an offline instance +feature extractor, such as a pre-trained ResNet or a foundation model. This +approach lacks the capability for feature fine-tuning within the specific +downstream tasks, limiting its adaptability and performance. To address this +issue, we propose a Re-embedded Regional Transformer (R$^2$T) for re-embedding +the instance features online, which captures fine-grained local features and +establishes connections across different regions. Unlike existing works that +focus on pre-training a powerful feature extractor or designing a sophisticated +instance aggregator, R$^2$T is tailored to re-embed instance features online. +It serves as a portable module that can seamlessly integrate into mainstream +MIL models. Extensive experimental results on common computational pathology +tasks validate that: 1) feature re-embedding improves the performance of MIL +models based on ResNet-50 features to the level of foundation model features, +and further enhances the performance of foundation model features; 2) the +R$^2$T can introduce more significant performance improvements to various MIL +models; 3) R$^2$T-MIL, as an R$^2$T-enhanced AB-MIL, outperforms other latest +methods by a large margin. The code is available at: +https://github.com/DearCaat/RRT-MIL. + 
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ GenAD: Generative End-to-End Autonomous Driving + + +
+ Directly producing planning results from raw sensors has been a long-desired +solution for autonomous driving and has attracted increasing attention +recently. Most existing end-to-end autonomous driving methods factorize this +problem into perception, motion prediction, and planning. However, we argue +that the conventional progressive pipeline still cannot comprehensively model +the entire traffic evolution process, e.g., the future interaction between the +ego car and other traffic participants and the structural trajectory prior. In +this paper, we explore a new paradigm for end-to-end autonomous driving, where +the key is to predict how the ego car and the surroundings evolve given past +scenes. We propose GenAD, a generative framework that casts autonomous driving +into a generative modeling problem. We propose an instance-centric scene +tokenizer that first transforms the surrounding scenes into map-aware instance +tokens. We then employ a variational autoencoder to learn the future trajectory +distribution in a structural latent space for trajectory prior modeling. We +further adopt a temporal model to capture the agent and ego movements in the +latent space to generate more effective future trajectories. GenAD finally +simultaneously performs motion prediction and planning by sampling +distributions in the learned structural latent space conditioned on the +instance tokens and using the learned temporal model to generate futures. +Extensive experiments on the widely used nuScenes benchmark show that the +proposed GenAD achieves state-of-the-art performance on vision-centric +end-to-end autonomous driving with high efficiency. Code: +https://github.com/wzzheng/GenAD. + +
+
+ comment: Code is available at: https://github.com/wzzheng/GenAD +
+
+
+
+
+ + ♻ ☆ CCEdit: Creative and Controllable Video Editing via Diffusion Models + + +
+ In this paper, we present CCEdit, a versatile generative video editing +framework based on diffusion models. Our approach employs a novel trident +network structure that separates structure and appearance control, ensuring +precise and creative editing capabilities. Utilizing the foundational +ControlNet architecture, we maintain the structural integrity of the video +during editing. The incorporation of an additional appearance branch enables +users to exert fine-grained control over the edited key frame. These two side +branches seamlessly integrate into the main branch, which is constructed upon +existing text-to-image (T2I) generation models, through learnable temporal +layers. The versatility of our framework is demonstrated through a diverse +range of choices in both structure representations and personalized T2I models, +as well as the option to provide the edited key frame. To facilitate +comprehensive evaluation, we introduce the BalanceCC benchmark dataset, +comprising 100 videos and 4 target prompts for each video. Our extensive user +studies compare CCEdit with eight state-of-the-art video editing methods. The +outcomes demonstrate CCEdit's substantial superiority over all other methods. + +
+
+
+
+
+ + ♻ ☆ PrivImage: Differentially Private Synthetic Image Generation using + Diffusion Models with Semantic-Aware Pretraining USENIX Security 2024 + + +
+ Differential Privacy (DP) image data synthesis leverages the DP +technique to generate synthetic data that replaces sensitive data, allowing +organizations to share and utilize synthetic images without privacy concerns. +Previous methods incorporate the advanced techniques of generative models and +pre-training on a public dataset to produce exceptional DP image data, but +suffer from problems of unstable training and massive computational resource +demands. This paper proposes a novel DP image synthesis method, termed +PRIVIMAGE, which meticulously selects pre-training data, promoting the +efficient creation of DP datasets with high fidelity and utility. PRIVIMAGE +first establishes a semantic query function using a public dataset. Then, this +function assists in querying the semantic distribution of the sensitive +dataset, facilitating the selection of data from the public dataset with +analogous semantics for pre-training. Finally, we pre-train an image generative +model using the selected data and then fine-tune this model on the sensitive +dataset using Differentially Private Stochastic Gradient Descent (DP-SGD). +PRIVIMAGE allows us to train a lightly parameterized generative model, reducing +the noise in the gradient during DP-SGD training and enhancing training +stability. Extensive experiments demonstrate that PRIVIMAGE uses only 1% of the +public dataset for pre-training and 7.6% of the parameters in the generative +model compared to the state-of-the-art method, while achieving superior +synthetic performance and conserving more computational resources. On average, +PRIVIMAGE achieves 30.1% lower FID and 12.6% higher Classification Accuracy +than the state-of-the-art method. The replication package and datasets can be +accessed online. + 
+
+ comment: Accepted at USENIX Security 2024 +
+
+
+
+
+ + ♻ ☆ Knowledge NeRF: Few-shot Novel View Synthesis for Dynamic Articulated + Objects + + +
+ We present Knowledge NeRF to synthesize novel views for dynamic scenes. +Reconstructing dynamic 3D scenes from few sparse views and rendering them from +arbitrary perspectives is a challenging problem with applications in various +domains. Previous dynamic NeRF methods learn the deformation of articulated +objects from monocular videos. However, qualities of their reconstructed scenes +are limited. To clearly reconstruct dynamic scenes, we propose a new framework +by considering two frames at a time. We pretrain a NeRF model for an articulated +object. When the articulated object moves, Knowledge NeRF learns to generate novel +views at the new state by incorporating past knowledge in the pretrained NeRF +model with minimal observations in the present state. We propose a projection +module to adapt NeRF for dynamic scenes, learning the correspondence between +the pretrained knowledge base and the current state. Experimental results demonstrate +the effectiveness of our method in reconstructing dynamic 3D scenes with 5 +input images in one state. Knowledge NeRF is a new pipeline and promising +solution for novel view synthesis in dynamic articulated objects. The data and +implementation are publicly available at +https://github.com/RussRobin/Knowledge_NeRF. + 
+
+
+
+
+ + ♻ ☆ Optimizing Illuminant Estimation in Dual-Exposure HDR Imaging + + +
+ High dynamic range (HDR) imaging involves capturing a series of frames of the +same scene, each with different exposure settings, to broaden the dynamic range +of light. This can be achieved through burst capturing or using staggered HDR +sensors that capture long and short exposures simultaneously in the camera +image signal processor (ISP). Within camera ISP pipeline, illuminant estimation +is a crucial step aiming to estimate the color of the global illuminant in the +scene. This estimation is used in camera ISP white-balance module to remove +undesirable color cast in the final image. Despite the multiple frames captured +in the HDR pipeline, conventional illuminant estimation methods often rely only +on a single frame of the scene. In this paper, we explore leveraging +information from frames captured with different exposure times. Specifically, +we introduce a simple feature extracted from dual-exposure images to guide +illuminant estimators, referred to as the dual-exposure feature (DEF). To +validate the efficiency of DEF, we employed two illuminant estimators using the +proposed DEF: 1) a multilayer perceptron network (MLP), referred to as +exposure-based MLP (EMLP), and 2) a modified version of the convolutional color +constancy (CCC) to integrate our DEF, that we call ECCC. Both EMLP and ECCC +achieve promising results, in some cases surpassing prior methods that require +hundreds of thousands or millions of parameters, with only a few hundred +parameters for EMLP and a few thousand parameters for ECCC. + +
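A rough sketch of the dual-exposure idea: compute a small feature from the long/short exposure pair and feed it to a tiny MLP that predicts the illuminant color. The exact feature definition and network size here are illustrative assumptions, not the paper's DEF or EMLP.

```python
import torch
import torch.nn as nn

def dual_exposure_feature(long_img, short_img, eps=1e-6):
    """long_img, short_img: (3, H, W) linear RGB frames of the same scene.
    Returns a 9-dim summary of both exposures and their per-channel ratio;
    an illustrative stand-in for the paper's dual-exposure feature."""
    ratio = long_img / (short_img + eps)
    feats = [x.flatten(1).mean(dim=1) for x in (long_img, short_img, ratio)]
    return torch.cat(feats)

class ExposureMLP(nn.Module):
    """Tiny illuminant estimator with only a few hundred parameters."""
    def __init__(self, in_dim=9, hidden=16):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, 3))

    def forward(self, def_feat):
        rgb = self.net(def_feat)
        return rgb / (rgb.norm() + 1e-8)  # unit-norm illuminant estimate
```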
+
+
+
+
+ + ♻ ☆ UWFormer: Underwater Image Enhancement via a Semi-Supervised Multi-Scale + Transformer IJCNN 2024 + + +
+ Underwater images often exhibit poor quality, distorted color balance and low +contrast due to the complex and intricate interplay of light, water, and +objects. Despite the significant contributions of previous underwater +enhancement techniques, there exist several problems that demand further +improvement: (i) The current deep learning methods rely on Convolutional Neural +Networks (CNNs) that lack multi-scale enhancement and have a limited global +perception field. (ii) The scarcity of paired real-world underwater +datasets poses a significant challenge, and the utilization of synthetic image +pairs could lead to overfitting. To address the aforementioned problems, this +paper introduces a Multi-scale Transformer-based Network called UWFormer for +enhancing images at multiple frequencies via semi-supervised learning, in which +we propose a Nonlinear Frequency-aware Attention mechanism and a Multi-Scale +Fusion Feed-forward Network for low-frequency enhancement. Besides, we +introduce a special underwater semi-supervised training strategy, where we +propose a Subaqueous Perceptual Loss function to generate reliable pseudo +labels. Experiments using full-reference and non-reference underwater +benchmarks demonstrate that our method outperforms state-of-the-art methods in +terms of both quantitative metrics and visual quality. + 
+
+ comment: Accepted by IJCNN 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 81 + +
+
+
+ + ☆ Collaborative Feedback Discriminative Propagation for Video + Super-Resolution + + +
+ The key success of existing video super-resolution (VSR) methods stems mainly +from exploring spatial and temporal information, which is usually achieved by a +recurrent propagation module with an alignment module. However, inaccurate +alignment usually leads to aligned features with significant artifacts, which +will be accumulated during propagation and thus affect video restoration. +Moreover, propagation modules only propagate the same timestep features forward +or backward, which may fail in cases of complex motion or occlusion, limiting +their performance for high-quality frame restoration. To address these issues, +we propose a collaborative feedback discriminative (CFD) method to correct +inaccurate aligned features and model long-range spatial and temporal +information for better video reconstruction. In detail, we develop a +discriminative alignment correction (DAC) method to adaptively explore +information and reduce the influences of the artifacts caused by inaccurate +alignment. Then, we propose a collaborative feedback propagation (CFP) module +that employs feedback and gating mechanisms to better explore spatial and +temporal information of different timestep features from forward and backward +propagation simultaneously. Finally, we embed the proposed DAC and CFP into +commonly used VSR networks to verify the effectiveness of our method. +Quantitative and qualitative experiments on several benchmarks demonstrate that +our method can improve the performance of existing VSR models while maintaining +a lower model complexity. The source code and pre-trained models will be +available at \url{https://github.com/House-Leo/CFDVSR}. + 
+
+ comment: Project website: https://github.com/House-Leo/CFDVSR +
+
+
+
+
+ + ☆ ProtoAL: Interpretable Deep Active Learning with prototypes for medical + imaging + + +
+ The adoption of Deep Learning algorithms in the medical imaging field is a +prominent area of research, with high potential for advancing AI-based +Computer-aided diagnosis (AI-CAD) solutions. However, current solutions face +challenges due to a lack of interpretability features and high data demands, +prompting recent efforts to address these issues. In this study, we propose the +ProtoAL method, where we integrate an interpretable DL model into the Deep +Active Learning (DAL) framework. This approach aims to address both challenges +by focusing on the medical imaging context and utilizing an inherently +interpretable model based on prototypes. We evaluated ProtoAL on the Messidor +dataset, achieving an area under the precision-recall curve of 0.79 while +utilizing only 76.54\% of the available labeled data. These capabilities can +enhance the practical usability of a DL model in the medical field, providing +a means of trust calibration for domain experts and a suitable solution for +learning in the data-scarce settings often found in this domain. + 
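The deep active learning loop that ProtoAL builds on can be sketched generically as below; the `model`, `oracle`, and pool interfaces, as well as the entropy-based acquisition function, are assumptions for illustration and not the paper's prototype-based components.

```python
import numpy as np

def entropy(probs, eps=1e-12):
    """Predictive entropy per sample for a (N, C) array of class probabilities."""
    return -(probs * np.log(probs + eps)).sum(axis=1)

def active_learning_loop(model, labeled, unlabeled, oracle, rounds=10, budget=50):
    """Generic pool-based loop: retrain, score the unlabeled pool, query the
    most uncertain samples, and move them to the labeled set. `model`, `oracle`
    and the pool containers are assumed interfaces, not ProtoAL's API."""
    for _ in range(rounds):
        model.fit(labeled)                        # retrain on current labels
        probs = model.predict_proba(unlabeled)    # (N, C) class probabilities
        query_idx = np.argsort(-entropy(probs))[:budget]
        labeled.extend(oracle.annotate(unlabeled, query_idx))
        unlabeled.remove(query_idx)
    return model
```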
+
+
+
+
+ + ☆ Towards Generalized Entropic Sparsification for Convolutional Neural + Networks + + +
+ Convolutional neural networks (CNNs) are reported to be overparametrized. The +search for optimal (minimal) and sufficient architecture is an NP-hard problem +as the hyperparameter space for possible network configurations is vast. Here, +we introduce a layer-by-layer data-driven pruning method based on a +computationally scalable entropic relaxation of +the pruning problem. The sparse subnetwork is found from the pre-trained (full) +CNN using the network entropy minimization as a sparsity constraint. This +allows deploying a numerically scalable algorithm with a sublinear scaling +cost. The method is validated on several benchmarks (architectures): (i) MNIST +(LeNet) with sparsity 55%-84% and loss in accuracy 0.1%-0.5%, and (ii) CIFAR-10 +(VGG-16, ResNet18) with sparsity 73-89% and loss in accuracy 0.1%-0.5%. + 
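One way to picture layer-by-layer, data-driven pruning with an entropy criterion is the channel-level sketch below; the histogram-based entropy score and the fixed keep ratio are illustrative stand-ins, not the paper's entropic relaxation.

```python
import torch

@torch.no_grad()
def channel_entropy(activations, bins=32):
    """activations: (N, C, H, W) responses of one conv layer on calibration
    data. Returns an entropy score per channel (higher = more informative)."""
    c = activations.shape[1]
    scores = torch.zeros(c)
    for ch in range(c):
        hist = torch.histc(activations[:, ch], bins=bins)
        p = hist / hist.sum()
        p = p[p > 0]
        scores[ch] = -(p * p.log()).sum()
    return scores

@torch.no_grad()
def prune_mask(activations, keep_ratio=0.3):
    """Keep the top `keep_ratio` channels by entropy and zero the rest; in a
    full pipeline the mask would be folded into the layer's weights."""
    scores = channel_entropy(activations)
    k = max(1, int(keep_ratio * scores.numel()))
    mask = torch.zeros_like(scores)
    mask[torch.topk(scores, k).indices] = 1.0
    return mask
```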
+
+
+
+
+ + ☆ On Exploring PDE Modeling for Point Cloud Video Representation Learning + + +
+ Point cloud video representation learning is challenging due to complex +structures and unordered spatial arrangement. Traditional methods struggle with +frame-to-frame correlations and point-wise correspondence tracking. Recently, +partial differential equations (PDE) have provided a new perspective in +uniformly solving spatial-temporal data information within certain constraints. +While tracking tangible point correspondence remains challenging, we propose to +formalize point cloud video representation learning as a PDE-solving problem. +Inspired by fluid analysis, where PDEs are used to solve the deformation of +spatial shape over time, we employ PDE to solve the variations of spatial +points affected by temporal information. By modeling spatial-temporal +correlations, we aim to regularize spatial variations with temporal features, +thereby enhancing representation learning in point cloud videos. We introduce +Motion PointNet composed of a PointNet-like encoder and a PDE-solving module. +Initially, we construct a lightweight yet effective encoder to model an initial +state of the spatial variations. Subsequently, we develop our PDE-solving +module in a parameterized latent space, tailored to address the spatio-temporal +correlations inherent in point cloud video. The process of solving PDE is +guided and refined by a contrastive learning structure, which is pivotal in +reshaping the feature distribution, thereby optimizing the feature +representation within point cloud video data. Remarkably, our Motion PointNet +achieves an impressive accuracy of 97.52% on the MSRAction-3D dataset, +surpassing the current state-of-the-art in all aspects while consuming minimal +resources (only 0.72M parameters and 0.82G FLOPs). + +
+
+
+
+
+ + ☆ Interpretable Multimodal Learning for Cardiovascular Hemodynamics + Assessment + + +
+ Pulmonary Arterial Wedge Pressure (PAWP) is an essential cardiovascular +hemodynamics marker to detect heart failure. In clinical practice, Right Heart +Catheterization is considered a gold standard for assessing cardiac +hemodynamics while non-invasive methods are often needed to screen high-risk +patients from a large population. In this paper, we propose a multimodal +learning pipeline to predict PAWP marker. We utilize complementary information +from Cardiac Magnetic Resonance Imaging (CMR) scans (short-axis and +four-chamber) and Electronic Health Records (EHRs). We extract spatio-temporal +features from CMR scans using tensor-based learning. We propose a graph +attention network to select important EHR features for prediction, where we +model subjects as graph nodes and feature relationships as graph edges using +the attention mechanism. We design four feature fusion strategies: early, +intermediate, late, and hybrid fusion. With a linear classifier and linear +fusion strategies, our pipeline is interpretable. We validate our pipeline on a +large dataset of $2,641$ subjects from our ASPIRE registry. The comparative +study against state-of-the-art methods confirms the superiority of our +pipeline. The decision curve analysis further validates that our pipeline can +be applied to screen a large population. The code is available at +https://github.com/prasunc/hemodynamics. + +
+
+
+
+
+ + ☆ OmniColor: A Global Camera Pose Optimization Approach of LiDAR-360Camera + Fusion for Colorizing Point Clouds + + +
+ A Colored point cloud, as a simple and efficient 3D representation, has many +advantages in various fields, including robotic navigation and scene +reconstruction. This representation is now commonly used in 3D reconstruction +tasks relying on cameras and LiDARs. However, fusing data from these two types +of sensors is poorly performed in many existing frameworks, leading to +unsatisfactory mapping results, mainly due to inaccurate camera poses. This +paper presents OmniColor, a novel and efficient algorithm to colorize point +clouds using an independent 360-degree camera. Given a LiDAR-based point cloud +and a sequence of panorama images with initial coarse camera poses, our +objective is to jointly optimize the poses of all frames for mapping images +onto geometric reconstructions. Our pipeline works in an off-the-shelf manner +that does not require any feature extraction or matching process. Instead, we +find optimal poses by directly maximizing the photometric consistency of LiDAR +maps. In experiments, we show that our method can overcome the severe visual +distortion of omnidirectional images and greatly benefit from the wide field of +view (FOV) of 360-degree cameras to reconstruct various scenarios with accuracy +and stability. The code will be released at +https://github.com/liubonan123/OmniColor/. + +
+
+ comment: 2024 IEEE International Conference on Robotics and Automation +
+
+
+
+
+ + ☆ Z-Splat: Z-Axis Gaussian Splatting for Camera-Sonar Fusion + + +
+ Differentiable 3D-Gaussian splatting (GS) is emerging as a prominent +technique in computer vision and graphics for reconstructing 3D scenes. GS +represents a scene as a set of 3D Gaussians with varying opacities and employs +a computationally efficient splatting operation along with analytical +derivatives to compute the 3D Gaussian parameters given scene images captured +from various viewpoints. Unfortunately, capturing surround view ($360^{\circ}$ +viewpoint) images is impossible or impractical in many real-world imaging +scenarios, including underwater imaging, rooms inside a building, and +autonomous navigation. In these restricted baseline imaging scenarios, the GS +algorithm suffers from a well-known 'missing cone' problem, which results in +poor reconstruction along the depth axis. In this manuscript, we demonstrate +that using transient data (from sonars) allows us to address the missing cone +problem by sampling high-frequency data along the depth axis. We extend the +Gaussian splatting algorithms for two commonly used sonars and propose fusion +algorithms that simultaneously utilize RGB camera data and sonar data. Through +simulations, emulations, and hardware experiments across various imaging +scenarios, we show that the proposed fusion algorithms lead to significantly +better novel view synthesis (5 dB improvement in PSNR) and 3D geometry +reconstruction (60% lower Chamfer distance). + +
+
+
+
+
+ + ☆ Predictive Modeling for Breast Cancer Classification in the Context of + Bangladeshi Patients: A Supervised Machine Learning Approach with Explainable + AI + + +
+ Breast cancer has rapidly increased in prevalence in recent years, making it +one of the leading causes of mortality worldwide. Among all cancers, it is by +far the most common. Diagnosing this illness manually requires significant time +and expertise. Since detecting breast cancer is a time-consuming process, +preventing its further spread can be aided by creating machine-based forecasts. +Machine learning and Explainable AI are crucial in classification as they not +only provide accurate predictions but also offer insights into how the model +arrives at its decisions, aiding in the understanding and trustworthiness of +the classification results. In this study, we evaluate and compare the +classification accuracy, precision, recall, and F-1 scores of five different +machine learning methods using a primary dataset (500 patients from Dhaka +Medical College Hospital). Five different supervised machine learning +techniques, including decision tree, random forest, logistic regression, naive +bayes, and XGBoost, have been used to achieve optimal results on our dataset. +Additionally, this study applied SHAP analysis to the XGBoost model to +interpret the model's predictions and understand the impact of each feature on +the model's output. We compared the accuracy with which several algorithms +classified the data, as well as contrasted with other literature in this field. +After final evaluation, this study found that XGBoost achieved the best model +accuracy, which is 97%. + +
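The XGBoost-plus-SHAP workflow described above follows a standard pattern, sketched below on a placeholder CSV; the file name and the `diagnosis` column are hypothetical, and the actual study uses a private dataset of 500 patients.

```python
import pandas as pd
import shap
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Placeholder tabular dataset; "breast_cancer.csv" and "diagnosis" are
# hypothetical names, not the study's Dhaka Medical College data.
df = pd.read_csv("breast_cancer.csv")
X, y = df.drop(columns=["diagnosis"]), df["diagnosis"]
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a gradient-boosted tree classifier and report held-out accuracy.
model = xgb.XGBClassifier(n_estimators=300, max_depth=4, learning_rate=0.1)
model.fit(X_tr, y_tr)
print("accuracy:", model.score(X_te, y_te))

# SHAP values explain how each feature pushes an individual prediction.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_te)
shap.summary_plot(shap_values, X_te)
```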
+
+ comment: Accepted for the Scientific Reports (Nature) journal. 32 pages, 12 + figures +
+
+
+
+
+ + ☆ Salient Sparse Visual Odometry With Pose-Only Supervision + + +
+ Visual Odometry (VO) is vital for the navigation of autonomous systems, +providing accurate position and orientation estimates at reasonable costs. +While traditional VO methods excel in some conditions, they struggle with +challenges like variable lighting and motion blur. Deep learning-based VO, +though more adaptable, can face generalization problems in new environments. +Addressing these drawbacks, this paper presents a novel hybrid visual odometry +(VO) framework that leverages pose-only supervision, offering a balanced +solution between robustness and the need for extensive labeling. We propose two +cost-effective and innovative designs: a self-supervised homographic +pre-training for enhancing optical flow learning from pose-only labels and a +random patch-based salient point detection strategy for more accurate optical +flow patch extraction. These designs eliminate the need for dense optical flow +labels for training and significantly improve the generalization capability of +the system in diverse and challenging environments. Our pose-only supervised +method achieves competitive performance on standard datasets and greater +robustness and generalization ability in extreme and unseen scenarios, even +compared to dense optical flow-supervised state-of-the-art methods. + +
+
+ comment: Accepted by IEEE Robotics and Automation Letters +
+
+
+
+
+ + ☆ Neural-ABC: Neural Parametric Models for Articulated Body with Clothes + + +
+ In this paper, we introduce Neural-ABC, a novel parametric model based on +neural implicit functions that can represent clothed human bodies with +disentangled latent spaces for identity, clothing, shape, and pose. Traditional +mesh-based representations struggle to represent articulated bodies with +clothes due to the diversity of human body shapes and clothing styles, as well +as the complexity of poses. Our proposed model provides a unified framework for +parametric modeling, which can represent the identity, clothing, shape and pose +of the clothed human body. Our proposed approach utilizes the power of neural +implicit functions as the underlying representation and integrates +well-designed structures to meet the necessary requirements. Specifically, we +represent the underlying body as a signed distance function and clothing as an +unsigned distance function, and they can be uniformly represented as unsigned +distance fields. Different types of clothing do not require predefined +topological structures or classifications, and can follow changes in the +underlying body to fit the body. Additionally, we construct poses using a +controllable articulated structure. The model is trained on both open and newly +constructed datasets, and our decoupling strategy is carefully designed to +ensure optimal performance. Our model excels at disentangling clothing and +identity in different shape and poses while preserving the style of the +clothing. We demonstrate that Neural-ABC fits new observations of different +types of clothing. Compared to other state-of-the-art parametric models, +Neural-ABC demonstrates powerful advantages in the reconstruction of clothed +human bodies, as evidenced by fitting raw scans, depth maps and images. We show +that the attributes of the fitted results can be further edited by adjusting +their identities, clothing, shape and pose codes. + +
+
+ comment: Accepted by IEEE Transactions on Visualization and Computer Graphics. + Project page: https://ustc3dv.github.io/NeuralABC/ +
+
+
+
+
+ + ☆ Adaptive Intra-Class Variation Contrastive Learning for Unsupervised + Person Re-Identification + + +
+ The memory dictionary-based contrastive learning method has achieved
+remarkable results in the field of unsupervised person Re-ID. However, the
+method of updating memory based on all samples does not fully utilize the
+hardest sample to improve the generalization ability of the model, and the
+method based on hardest sample mining will inevitably introduce false-positive
+samples that are incorrectly clustered in the early stages of the model.
+Clustering-based methods usually discard a significant number of outliers,
+leading to the loss of valuable information. To address these issues, we
+propose an adaptive intra-class variation contrastive learning algorithm for
+unsupervised Re-ID, called AdaInCV. The algorithm quantitatively evaluates the
+learning ability of the model for each class by considering the intra-class
+variations after clustering, which helps in selecting appropriate samples
+during the training process of the model. More specifically, two new strategies
+are proposed: Adaptive Sample Mining (AdaSaM) and Adaptive Outlier Filter
+(AdaOF). The first gradually creates more reliable clusters to dynamically
+refine the memory, while the second can identify and filter out valuable
+outliers as negative samples.
+
+
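+ The following is a generic sketch of the momentum-updated cluster memory that
+underlies memory-dictionary contrastive Re-ID; it does not reproduce the
+paper's AdaSaM/AdaOF selection rules, and all names are illustrative.
+
+import torch
+import torch.nn.functional as F
+
+class ClusterMemory:
+    """Cluster-centroid memory with an InfoNCE-style contrastive loss."""
+    def __init__(self, num_clusters, dim, momentum=0.2, temp=0.05):
+        self.feats = F.normalize(torch.randn(num_clusters, dim), dim=1)
+        self.momentum, self.temp = momentum, temp
+
+    def loss(self, feats, pseudo_labels):
+        # Contrast each feature against all cluster centroids.
+        logits = feats @ self.feats.t() / self.temp
+        return F.cross_entropy(logits, pseudo_labels)
+
+    @torch.no_grad()
+    def update(self, feats, pseudo_labels):
+        # Momentum update of each centroid with the features assigned to it.
+        for f, y in zip(feats, pseudo_labels):
+            c = self.momentum * self.feats[y] + (1 - self.momentum) * f
+            self.feats[y] = F.normalize(c, dim=0)
+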
+
+
+
+
+ + ☆ Focused Active Learning for Histopathological Image Classification + + +
+ Active Learning (AL) has the potential to solve a major problem of digital +pathology: the efficient acquisition of labeled data for machine learning +algorithms. However, existing AL methods often struggle in realistic settings +with artifacts, ambiguities, and class imbalances, as commonly seen in the +medical field. The lack of precise uncertainty estimations leads to the +acquisition of images with a low informative value. To address these +challenges, we propose Focused Active Learning (FocAL), which combines a +Bayesian Neural Network with Out-of-Distribution detection to estimate +different uncertainties for the acquisition function. Specifically, the +weighted epistemic uncertainty accounts for the class imbalance, aleatoric +uncertainty for ambiguous images, and an OoD score for artifacts. We perform +extensive experiments to validate our method on MNIST and the real-world Panda +dataset for the classification of prostate cancer. The results confirm that +other AL methods are 'distracted' by ambiguities and artifacts which harm the +performance. FocAL effectively focuses on the most informative images, avoiding +ambiguities and artifacts during acquisition. For both experiments, FocAL +outperforms existing AL approaches, reaching a Cohen's kappa of 0.764 with only +0.69% of the labeled Panda data. + +
+
+
+
+
+ + ☆ Music Recommendation Based on Facial Emotion Recognition + + +
+ Introduction: Music provides an incredible avenue for individuals to express
+their thoughts and emotions, while also serving as a delightful mode of
+entertainment for enthusiasts and music lovers. Objectives: This paper presents
+a comprehensive approach to enhancing the user experience through the
+integration of emotion recognition, music recommendation, and explainable AI
+using Grad-CAM. Methods: The proposed methodology utilizes a ResNet50 model
+trained on the Facial Expression Recognition (FER) dataset, consisting of real
+images of individuals expressing various emotions. Results: The system achieves
+an accuracy of 82% in emotion classification. By leveraging Grad-CAM, the model
+provides explanations for its predictions, allowing users to understand the
+reasoning behind the system's recommendations. The model is trained on both the
+FER dataset and a real user dataset, which include labelled facial expressions
+and real images of individuals expressing various emotions. The training
+process involves pre-processing the input images, extracting features through
+convolutional layers, reasoning with dense layers, and generating emotion
+predictions through the output layer. Conclusion: The proposed methodology,
+leveraging the ResNet50 model with ROI-based analysis and explainable AI
+techniques, offers a robust and interpretable solution for facial emotion
+detection.
+
+
+
+
+
+
+ + ☆ HawkDrive: A Transformer-driven Visual Perception System for Autonomous + Driving in Night Scene + + +
+ Many established vision perception systems for autonomous driving scenarios +ignore the influence of light conditions, one of the key elements for driving +safety. To address this problem, we present HawkDrive, a novel perception +system with hardware and software solutions. Hardware that utilizes stereo +vision perception, which has been demonstrated to be a more reliable way of +estimating depth information than monocular vision, is partnered with the edge +computing device Nvidia Jetson Xavier AGX. Our software for low light +enhancement, depth estimation, and semantic segmentation tasks, is a +transformer-based neural network. Our software stack, which enables fast +inference and noise reduction, is packaged into system modules in Robot +Operating System 2 (ROS2). Our experimental results have shown that the +proposed end-to-end system is effective in improving the depth estimation and +semantic segmentation performance. Our dataset and codes will be released at +https://github.com/ZionGo6/HawkDrive. + +
+
+ comment: Accepted by IEEE IV 2024 +
+
+
+
+
+ + ☆ InitNO: Boosting Text-to-Image Diffusion Models via Initial Noise + Optimization CVPR 2024 + + +
+ Recent strides in the development of diffusion models, exemplified by +advancements such as Stable Diffusion, have underscored their remarkable +prowess in generating visually compelling images. However, the imperative of +achieving a seamless alignment between the generated image and the provided +prompt persists as a formidable challenge. This paper traces the root of these +difficulties to invalid initial noise, and proposes a solution in the form of +Initial Noise Optimization (InitNO), a paradigm that refines this noise. +Considering text prompts, not all random noises are effective in synthesizing +semantically-faithful images. We design the cross-attention response score and +the self-attention conflict score to evaluate the initial noise, bifurcating +the initial latent space into valid and invalid sectors. A strategically +crafted noise optimization pipeline is developed to guide the initial noise +towards valid regions. Our method, validated through rigorous experimentation, +shows a commendable proficiency in generating images in strict accordance with +text prompts. Our code is available at https://github.com/xiefan-guo/initno. + +
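+ The following is a generic sketch of optimizing the initial latent noise
+against a scoring function before sampling; the paper's cross-attention
+response and self-attention conflict scores are replaced here by a hypothetical
+placeholder `noise_score`, so this is illustrative rather than the authors'
+pipeline.
+
+import torch
+
+def noise_score(latent):
+    # Placeholder score: a real implementation would evaluate the prompt's
+    # cross-attention response and self-attention conflict for this latent.
+    return latent.pow(2).mean()
+
+def optimize_initial_noise(shape=(1, 4, 64, 64), steps=50, lr=1e-2):
+    latent = torch.randn(shape, requires_grad=True)
+    opt = torch.optim.Adam([latent], lr=lr)
+    for _ in range(steps):
+        opt.zero_grad()
+        loss = noise_score(latent)   # lower is assumed to mean "more valid"
+        loss.backward()
+        opt.step()
+    return latent.detach()           # refined noise handed to the sampler
+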
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Structured Gradient-based Interpretations via Norm-Regularized + Adversarial Training CVPR 2024 + + +
+ Gradient-based saliency maps have been widely used to explain the decisions +of deep neural network classifiers. However, standard gradient-based +interpretation maps, including the simple gradient and integrated gradient +algorithms, often lack desired structures such as sparsity and connectedness in +their application to real-world computer vision models. A frequently used +approach to inducing sparsity structures into gradient-based saliency maps is +to alter the simple gradient scheme using sparsification or norm-based +regularization. A drawback with such post-processing methods is their +frequently-observed significant loss in fidelity to the original simple +gradient map. In this work, we propose to apply adversarial training as an +in-processing scheme to train neural networks with structured simple gradient +maps. We show a duality relation between the regularized norms of the +adversarial perturbations and gradient-based maps, based on which we design +adversarial training loss functions promoting sparsity and group-sparsity +properties in simple gradient maps. We present several numerical results to +show the influence of our proposed norm-based adversarial training methods on +the standard gradient-based maps of standard neural network architectures on +benchmark image datasets. + +
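+ As a concrete reference point for the in-processing scheme discussed above,
+the sketch below shows a standard norm-bounded (l-infinity, FGSM-style)
+adversarial training step of the kind the paper relates by duality to
+structured simple-gradient maps; the paper's exact regularized losses are not
+reproduced here.
+
+import torch
+import torch.nn.functional as F
+
+def adversarial_train_step(model, x, y, optimizer, eps=4 / 255):
+    # Compute the gradient of the loss with respect to the input.
+    x = x.clone().requires_grad_(True)
+    loss = F.cross_entropy(model(x), y)
+    grad = torch.autograd.grad(loss, x)[0]
+    # One-step l-infinity-bounded perturbation (FGSM).
+    x_adv = (x + eps * grad.sign()).detach()
+    # Train the model on the perturbed input.
+    optimizer.zero_grad()
+    F.cross_entropy(model(x_adv), y).backward()
+    optimizer.step()
+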
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Constrained 6-DoF Grasp Generation on Complex Shapes for Improved + Dual-Arm Manipulation + + +
+ Efficiently generating grasp poses tailored to specific regions of an object +is vital for various robotic manipulation tasks, especially in a dual-arm +setup. This scenario presents a significant challenge due to the complex +geometries involved, requiring a deep understanding of the local geometry to +generate grasps efficiently on the specified constrained regions. Existing +methods only explore settings involving table-top/small objects and require +augmented datasets to train, limiting their performance on complex objects. We +propose CGDF: Constrained Grasp Diffusion Fields, a diffusion-based grasp +generative model that generalizes to objects with arbitrary geometries, as well +as generates dense grasps on the target regions. CGDF uses a part-guided +diffusion approach that enables it to get high sample efficiency in constrained +grasping without explicitly training on massive constraint-augmented datasets. +We provide qualitative and quantitative comparisons using analytical metrics +and in simulation, in both unconstrained and constrained settings to show that +our method can generalize to generate stable grasps on complex objects, +especially useful for dual-arm manipulation settings, while existing methods +struggle to do so. + +
+
+ comment: Project Page: https://constrained-grasp-diffusion.github.io/ +
+
+
+
+
+ + ☆ A Deep Look Into -- Automated Lung X-Ray Abnormality Detection System + + +
+ Introduction: The Automated Lung X-Ray Abnormality Detection System is an
+application that distinguishes normal X-ray images from infected ones and
+highlights the regions considered for prediction. With the recent pandemic,
+there is a need for non-conventional methods that detect diseases faster, a
+purpose that X-rays can serve. Objectives: In the current situation, any
+infectious viral disease is a potential pandemic, so there is a need for a
+cheap and early detection system. Methods: This research helps ease the work of
+experts in performing further analysis. The accuracy of three preexisting
+models, DenseNet, MobileNet and VGG16, was high, but the models over-fitted,
+primarily due to the black-and-white images. Results: This led to building a
+new method, V-BreathNet, which achieved more than 96% accuracy. Conclusion:
+Thus, it can be stated that not all state-of-the-art CNN models can be used on
+B/W images.
+
+
+
+
+
+
+ + ☆ DifFUSER: Diffusion Model for Robust Multi-Sensor Fusion in 3D Object + Detection and BEV Segmentation + + +
+ Diffusion models have recently gained prominence as powerful deep generative +models, demonstrating unmatched performance across various domains. However, +their potential in multi-sensor fusion remains largely unexplored. In this +work, we introduce DifFUSER, a novel approach that leverages diffusion models +for multi-modal fusion in 3D object detection and BEV map segmentation. +Benefiting from the inherent denoising property of diffusion, DifFUSER is able +to refine or even synthesize sensor features in case of sensor malfunction, +thereby improving the quality of the fused output. In terms of architecture, +our DifFUSER blocks are chained together in a hierarchical BiFPN fashion, +termed cMini-BiFPN, offering an alternative architecture for latent diffusion. +We further introduce a Gated Self-conditioned Modulated (GSM) latent diffusion +module together with a Progressive Sensor Dropout Training (PSDT) paradigm, +designed to add stronger conditioning to the diffusion process and robustness +to sensor failures. Our extensive evaluations on the Nuscenes dataset reveal +that DifFUSER not only achieves state-of-the-art performance with a 69.1% mIOU +in BEV map segmentation tasks but also competes effectively with leading +transformer-based fusion techniques in 3D object detection. + +
+
+ comment: 23 pages +
+
+
+
+
+ + ☆ Self-Training Large Language Models for Improved Visual Program + Synthesis With Visual Reinforcement CVPR 2024 + + +
+ Visual program synthesis is a promising approach to exploit the reasoning +abilities of large language models for compositional computer vision tasks. +Previous work has used few-shot prompting with frozen LLMs to synthesize visual +programs. Training an LLM to write better visual programs is an attractive +prospect, but it is unclear how to accomplish this. No dataset of visual +programs for training exists, and acquisition of a visual program dataset +cannot be easily crowdsourced due to the need for expert annotators. To get +around the lack of direct supervision, we explore improving the program +synthesis abilities of an LLM using feedback from interactive experience. We +propose a method where we exploit existing annotations for a vision-language +task to improvise a coarse reward signal for that task, treat the LLM as a +policy, and apply reinforced self-training to improve the visual program +synthesis ability of the LLM for that task. We describe a series of experiments +on object detection, compositional visual question answering, and image-text +retrieval, and show that in each case, the self-trained LLM outperforms or +performs on par with few-shot frozen LLMs that are an order of magnitude +larger. Website: https://zaidkhan.me/ViReP + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Bridging the Gap Between End-to-End and Two-Step Text Spotting CVPR2024 + + +
+ Modularity plays a crucial role in the development and maintenance of complex +systems. While end-to-end text spotting efficiently mitigates the issues of +error accumulation and sub-optimal performance seen in traditional two-step +methodologies, the two-step methods continue to be favored in many competitions +and practical settings due to their superior modularity. In this paper, we +introduce Bridging Text Spotting, a novel approach that resolves the error +accumulation and suboptimal performance issues in two-step methods while +retaining modularity. To achieve this, we adopt a well-trained detector and +recognizer that are developed and trained independently and then lock their +parameters to preserve their already acquired capabilities. Subsequently, we +introduce a Bridge that connects the locked detector and recognizer through a +zero-initialized neural network. This zero-initialized neural network, +initialized with weights set to zeros, ensures seamless integration of the +large receptive field features in detection into the locked recognizer. +Furthermore, since the fixed detector and recognizer cannot naturally acquire +end-to-end optimization features, we adopt the Adapter to facilitate their +efficient learning of these features. We demonstrate the effectiveness of the +proposed method through extensive experiments: Connecting the latest detector +and recognizer through Bridging Text Spotting, we achieved an accuracy of 83.3% +on Total-Text, 69.8% on CTW1500, and 89.5% on ICDAR 2015. The code is available +at https://github.com/mxin262/Bridging-Text-Spotting. + +
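+ The zero-initialized connection described above follows a pattern that can be
+sketched as below: a 1x1 convolution initialized to zero adds detector features
+into the locked recognizer, so at initialization the recognizer's behavior is
+unchanged. This is an illustrative sketch, not the paper's exact Bridge and
+Adapter modules.
+
+import torch
+import torch.nn as nn
+
+class ZeroInitBridge(nn.Module):
+    def __init__(self, channels):
+        super().__init__()
+        self.proj = nn.Conv2d(channels, channels, kernel_size=1)
+        nn.init.zeros_(self.proj.weight)  # zero init: no contribution at start
+        nn.init.zeros_(self.proj.bias)
+
+    def forward(self, recognizer_feat, detector_feat):
+        # At initialization the detector term is zero, so the locked recognizer
+        # sees exactly the input it was trained on; the bridge learns gradually.
+        return recognizer_feat + self.proj(detector_feat)
+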
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Do We Really Need a Complex Agent System? Distill Embodied Agent into a + Single Model + + +
+ With the power of large language models (LLMs), open-ended embodied agents
+can flexibly understand human instructions, generate interpretable guidance
+strategies, and output executable actions. Nowadays, Multi-modal Language
+Models (MLMs) integrate multi-modal signals into LLMs, bringing richer
+perception to embodied agents and allowing them to perceive world-understanding
+tasks more delicately. However, existing works: 1) rely on independently
+operating agents, each containing multiple LLMs, from perception to action,
+resulting in gaps between complex tasks and execution; 2) train MLMs on static
+data, struggling with dynamics in open-ended scenarios; and 3) input prior
+knowledge directly as prompts, suppressing application flexibility. We propose
+STEVE-2, a hierarchical knowledge distillation framework for open-ended
+embodied tasks, characterized by 1) a hierarchical system for multi-granular
+task division, 2) a mirrored distillation method for parallel simulation data,
+and 3) an extra expert model for bringing additional knowledge into parallel
+simulation. After distillation, embodied agents can complete complex,
+open-ended tasks without additional expert guidance, utilizing the performance
+and knowledge of a versatile MLM. Extensive evaluations on navigation and
+creation tasks highlight the superior performance of STEVE-2 in open-ended
+tasks, with gains of $1.4\times$ to $7.3\times$ in performance.
+
+
+
+ comment: arXiv admin note: text overlap with arXiv:2403.08282 +
+
+
+
+
+ + ☆ Empowering Image Recovery_ A Multi-Attention Approach + + +
+ We propose Diverse Restormer (DART), a novel image restoration method that +effectively integrates information from various sources (long sequences, local +and global regions, feature dimensions, and positional dimensions) to address +restoration challenges. While Transformer models have demonstrated excellent +performance in image restoration due to their self-attention mechanism, they +face limitations in complex scenarios. Leveraging recent advancements in +Transformers and various attention mechanisms, our method utilizes customized +attention mechanisms to enhance overall performance. DART, our novel network +architecture, employs windowed attention to mimic the selective focusing +mechanism of human eyes. By dynamically adjusting receptive fields, it +optimally captures the fundamental features crucial for image resolution +reconstruction. Efficiency and performance balance are achieved through the +LongIR attention mechanism for long sequence image restoration. Integration of +attention mechanisms across feature and positional dimensions further enhances +the recovery of fine details. Evaluation across five restoration tasks +consistently positions DART at the forefront. Upon acceptance, we commit to +providing publicly accessible code and models to ensure reproducibility and +facilitate further research. + +
+
+ comment: 12 pages, 10 figures, 12 tables +
+
+
+
+
+ + ☆ Panoptic Perception: A Novel Task and Fine-grained Dataset for Universal + Remote Sensing Image Interpretation + + +
+ Current remote-sensing interpretation models often focus on a single task
+such as detection, segmentation, or captioning. However, such task-specific
+models cannot achieve a comprehensive, multi-level interpretation of images.
+The field also lacks support for multi-task joint interpretation datasets. In
+this paper, we propose Panoptic Perception, a novel task and a new fine-grained
+dataset (FineGrip) to achieve a more thorough and universal interpretation for
+RSIs. The new task, 1) integrates pixel-level, instance-level, and image-level
+information for universal image perception, 2) captures image information from
+coarse to fine granularity, achieving deeper scene understanding and
+description, and 3) enables various independent tasks to complement and enhance
+each other through multi-task learning. By emphasizing multi-task interactions
+and the consistency of perception results, this task enables the simultaneous
+processing of fine-grained foreground instance segmentation, background
+semantic segmentation, and global fine-grained image captioning. Concretely,
+the FineGrip dataset includes 2,649 remote sensing images, 12,054 fine-grained
+instance segmentation masks belonging to 20 foreground things categories, 7,599
+background semantic masks for 5 stuff classes and 13,245 captioning sentences.
+Furthermore, we propose a joint optimization-based panoptic perception model.
+Experimental results on FineGrip demonstrate the feasibility of the panoptic
+perception task and the beneficial effect of multi-task joint optimization on
+individual tasks. The dataset will be publicly available.
+
+
+
+ comment: Undergoing Review +
+
+
+
+
+ + ☆ PIE: Physics-inspired Low-light Enhancement + + +
+ In this paper, we propose a physics-inspired contrastive learning paradigm +for low-light enhancement, called PIE. PIE primarily addresses three issues: +(i) To resolve the problem of existing learning-based methods often training a +LLE model with strict pixel-correspondence image pairs, we eliminate the need +for pixel-correspondence paired training data and instead train with unpaired +images. (ii) To address the disregard for negative samples and the inadequacy +of their generation in existing methods, we incorporate physics-inspired +contrastive learning for LLE and design the Bag of Curves (BoC) method to +generate more reasonable negative samples that closely adhere to the underlying +physical imaging principle. (iii) To overcome the reliance on semantic ground +truths in existing methods, we propose an unsupervised regional segmentation +module, ensuring regional brightness consistency while eliminating the +dependency on semantic ground truths. Overall, the proposed PIE can effectively +learn from unpaired positive/negative samples and smoothly realize non-semantic +regional enhancement, which is clearly different from existing LLE efforts. +Besides the novel architecture of PIE, we explore the gain of PIE on downstream +tasks such as semantic segmentation and face detection. Training on readily +available open data and extensive experiments demonstrate that our method +surpasses the state-of-the-art LLE models over six independent cross-scenes +datasets. PIE runs fast with reasonable GFLOPs in test time, making it easy to +use on mobile devices. + +
+
+
+
+
+ + ☆ D$^3$: Scaling Up Deepfake Detection by Learning from Discrepancy + + +
+ The boom of Generative AI brings opportunities entangled with risks and +concerns. In this work, we seek a step toward a universal deepfake detection +system with better generalization and robustness, to accommodate the +responsible deployment of diverse image generative models. We do so by first +scaling up the existing detection task setup from the one-generator to +multiple-generators in training, during which we disclose two challenges +presented in prior methodological designs. Specifically, we reveal that the +current methods tailored for training on one specific generator either struggle +to learn comprehensive artifacts from multiple generators or tend to sacrifice +their ability to identify fake images from seen generators (i.e., In-Domain +performance) to exchange the generalization for unseen generators (i.e., +Out-Of-Domain performance). To tackle the above challenges, we propose our +Discrepancy Deepfake Detector (D$^3$) framework, whose core idea is to learn +the universal artifacts from multiple generators by introducing a parallel +network branch that takes a distorted image as extra discrepancy signal to +supplement its original counterpart. Extensive scaled-up experiments on the +merged UFD and GenImage datasets with six detection models demonstrate the +effectiveness of our framework, achieving a 5.3% accuracy improvement in the +OOD testing compared to the current SOTA methods while maintaining the ID +performance. + +
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ☆ SDFR: Synthetic Data for Face Recognition Competition + + +
+ Large-scale face recognition datasets are collected by crawling the Internet +and without individuals' consent, raising legal, ethical, and privacy concerns. +With the recent advances in generative models, recently several works proposed +generating synthetic face recognition datasets to mitigate concerns in +web-crawled face recognition datasets. This paper presents the summary of the +Synthetic Data for Face Recognition (SDFR) Competition held in conjunction with +the 18th IEEE International Conference on Automatic Face and Gesture +Recognition (FG 2024) and established to investigate the use of synthetic data +for training face recognition models. The SDFR competition was split into two +tasks, allowing participants to train face recognition systems using new +synthetic datasets and/or existing ones. In the first task, the face +recognition backbone was fixed and the dataset size was limited, while the +second task provided almost complete freedom on the model backbone, the +dataset, and the training pipeline. The submitted models were trained on +existing and also new synthetic datasets and used clever methods to improve +training with synthetic data. The submissions were evaluated and ranked on a +diverse set of seven benchmarking datasets. The paper gives an overview of the +submitted face recognition models and reports achieved performance compared to +baseline models trained on real and synthetic datasets. Furthermore, the +evaluation of submissions is extended to bias assessment across different +demography groups. Lastly, an outlook on the current state of the research in +training face recognition models using synthetic data is presented, and +existing problems as well as potential future directions are also discussed. + +
+
+ comment: The 18th IEEE International Conference on Automatic Face and Gesture + Recognition (FG 2024) +
+
+
+
+
+ + ☆ GLCM-Based Feature Combination for Extraction Model Optimization in + Object Detection Using Machine Learning + + +
+ In the era of modern technology, object detection using the Gray Level +Co-occurrence Matrix (GLCM) extraction method plays a crucial role in object +recognition processes. It finds applications in real-time scenarios such as +security surveillance and autonomous vehicle navigation, among others. +Computational efficiency becomes a critical factor in achieving real-time +object detection. Hence, there is a need for a detection model with low +complexity and satisfactory accuracy. This research aims to enhance +computational efficiency by selecting appropriate features within the GLCM +framework. Two classification models, namely K-Nearest Neighbours (K-NN) and +Support Vector Machine (SVM), were employed, with the results indicating that +K-Nearest Neighbours (K-NN) outperforms SVM in terms of computational +complexity. Specifically, K-NN, when utilizing a combination of Correlation, +Energy, and Homogeneity features, achieves a 100% accuracy rate with low +complexity. Moreover, when using a combination of Energy and Homogeneity +features, K-NN attains an almost perfect accuracy level of 99.9889%, while +maintaining low complexity. On the other hand, despite SVM achieving 100% +accuracy in certain feature combinations, its high or very high complexity can +pose challenges, particularly in real-time applications. Therefore, based on +the trade-off between accuracy and complexity, the K-NN model with a +combination of Correlation, Energy, and Homogeneity features emerges as a more +suitable choice for real-time applications that demand high accuracy and low +complexity. This research provides valuable insights for optimizing object +detection in various applications requiring both high accuracy and rapid +responsiveness. + +
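+ For reference, the GLCM feature extraction and K-NN classification pipeline
+described above can be sketched as follows; the image list and labels are
+placeholders, and 8-bit grayscale input is assumed.
+
+import numpy as np
+from skimage.feature import graycomatrix, graycoprops
+from sklearn.neighbors import KNeighborsClassifier
+
+def glcm_features(gray_img):
+    """Correlation, Energy, and Homogeneity from the GLCM of a uint8 image."""
+    glcm = graycomatrix(gray_img, distances=[1],
+                        angles=[0, np.pi / 4, np.pi / 2, 3 * np.pi / 4],
+                        levels=256, symmetric=True, normed=True)
+    props = ["correlation", "energy", "homogeneity"]
+    return np.hstack([graycoprops(glcm, p).ravel() for p in props])
+
+# images: list of uint8 grayscale arrays; labels: class ids (both placeholders)
+# X = np.stack([glcm_features(im) for im in images])
+# clf = KNeighborsClassifier(n_neighbors=3).fit(X, labels)
+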
+
+
+
+
+ + ☆ SportsHHI: A Dataset for Human-Human Interaction Detection in Sports + Videos CVPR 2024 + + +
+ Video-based visual relation detection tasks, such as video scene graph +generation, play important roles in fine-grained video understanding. However, +current video visual relation detection datasets have two main limitations that +hinder the progress of research in this area. First, they do not explore +complex human-human interactions in multi-person scenarios. Second, the +relation types of existing datasets have relatively low-level semantics and can +be often recognized by appearance or simple prior information, without the need +for detailed spatio-temporal context reasoning. Nevertheless, comprehending +high-level interactions between humans is crucial for understanding complex +multi-person videos, such as sports and surveillance videos. To address this +issue, we propose a new video visual relation detection task: video human-human +interaction detection, and build a dataset named SportsHHI for it. SportsHHI +contains 34 high-level interaction classes from basketball and volleyball +sports. 118,075 human bounding boxes and 50,649 interaction instances are +annotated on 11,398 keyframes. To benchmark this, we propose a two-stage +baseline method and conduct extensive experiments to reveal the key factors for +a successful human-human interaction detector. We hope that SportsHHI can +stimulate research on human interaction understanding in videos and promote the +development of spatio-temporal context modeling techniques in video visual +relation detection. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Enhancing Video Summarization with Context Awareness + + +
+ Video summarization is a crucial research area that aims to efficiently +browse and retrieve relevant information from the vast amount of video content +available today. With the exponential growth of multimedia data, the ability to +extract meaningful representations from videos has become essential. Video +summarization techniques automatically generate concise summaries by selecting +keyframes, shots, or segments that capture the video's essence. This process +improves the efficiency and accuracy of various applications, including video +surveillance, education, entertainment, and social media. Despite the +importance of video summarization, there is a lack of diverse and +representative datasets, hindering comprehensive evaluation and benchmarking of +algorithms. Existing evaluation metrics also fail to fully capture the +complexities of video summarization, limiting accurate algorithm assessment and +hindering the field's progress. To overcome data scarcity challenges and +improve evaluation, we propose an unsupervised approach that leverages video +data structure and information for generating informative summaries. By moving +away from fixed annotations, our framework can produce representative summaries +effectively. Moreover, we introduce an innovative evaluation pipeline tailored +specifically for video summarization. Human participants are involved in the +evaluation, comparing our generated summaries to ground truth summaries and +assessing their informativeness. This human-centric approach provides valuable +insights into the effectiveness of our proposed techniques. Experimental +results demonstrate that our training-free framework outperforms existing +unsupervised approaches and achieves competitive results compared to +state-of-the-art supervised methods. + +
+
+ comment: 115 pages, 1 supplementary paper, undergraduate thesis report at + US-VNUHCM +
+
+
+
+
+ + ☆ Diffusion Time-step Curriculum for One Image to 3D Generation + + +
+ Score distillation sampling (SDS) has been widely adopted to overcome the
+absence of unseen views when reconstructing 3D objects from a single image. It
+leverages pre-trained 2D diffusion models as a teacher to guide the
+reconstruction of student 3D models. Despite their remarkable success,
+SDS-based methods often encounter geometric artifacts and texture saturation.
+We find that the crux is the overlooked indiscriminate treatment of diffusion
+time-steps during optimization: it unreasonably treats the student-teacher
+knowledge distillation as equal at all time-steps and thus entangles
+coarse-grained and fine-grained modeling. Therefore, we propose the Diffusion
+Time-step Curriculum one-image-to-3D pipeline (DTC123), in which the teacher
+and student models collaborate with the time-step curriculum in a
+coarse-to-fine manner. Extensive experiments on the NeRF4, RealFusion15, GSO
+and Level50 benchmarks demonstrate that DTC123 can produce multi-view
+consistent, high-quality, and diverse 3D assets. Code and more generation demos
+will be released at https://github.com/yxymessi/DTC123.
+
+
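+ A coarse-to-fine time-step schedule of the kind described above can be
+sketched as follows; the bounds and the halving rule are hypothetical values
+for illustration, not the schedule used in the paper.
+
+import random
+
+def sample_timestep(iteration, max_iters, t_max=980, t_min=20):
+    """Sample a diffusion time-step whose range shrinks as training proceeds."""
+    progress = iteration / max_iters
+    hi = int(t_max - progress * (t_max - t_min))  # upper bound decays to t_min
+    lo = max(t_min, hi // 2)                      # keep a window of time-steps
+    return random.randint(lo, hi)
+
+# Early iterations sample large (coarse, high-noise) time-steps; later
+# iterations concentrate on small (fine, low-noise) ones.
+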
+
+
+
+
+ + ☆ Co-Occ: Coupling Explicit Feature Fusion with Volume Rendering + Regularization for Multi-Modal 3D Semantic Occupancy Prediction + + +
+ 3D semantic occupancy prediction is a pivotal task in the field of autonomous +driving. Recent approaches have made great advances in 3D semantic occupancy +predictions on a single modality. However, multi-modal semantic occupancy +prediction approaches have encountered difficulties in dealing with the +modality heterogeneity, modality misalignment, and insufficient modality +interactions that arise during the fusion of different modalities data, which +may result in the loss of important geometric and semantic information. This +letter presents a novel multi-modal, i.e., LiDAR-camera 3D semantic occupancy +prediction framework, dubbed Co-Occ, which couples explicit LiDAR-camera +feature fusion with implicit volume rendering regularization. The key insight +is that volume rendering in the feature space can proficiently bridge the gap +between 3D LiDAR sweeps and 2D images while serving as a physical +regularization to enhance LiDAR-camera fused volumetric representation. +Specifically, we first propose a Geometric- and Semantic-aware Fusion +(GSFusion) module to explicitly enhance LiDAR features by incorporating +neighboring camera features through a K-nearest neighbors (KNN) search. Then, +we employ volume rendering to project the fused feature back to the image +planes for reconstructing color and depth maps. These maps are then supervised +by input images from the camera and depth estimations derived from LiDAR, +respectively. Extensive experiments on the popular nuScenes and SemanticKITTI +benchmarks verify the effectiveness of our Co-Occ for 3D semantic occupancy +prediction. The project page is available at +https://rorisis.github.io/Co-Occ_project-page/. + +
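+ The KNN-based enhancement described above (augmenting each LiDAR feature with
+its nearest camera features) can be sketched generically as follows; this is
+not the full GSFusion module, and tensor shapes are illustrative.
+
+import torch
+
+def knn_fuse(lidar_xyz, lidar_feat, cam_xyz, cam_feat, k=3):
+    """lidar_xyz: (N, 3), lidar_feat: (N, C), cam_xyz: (M, 3), cam_feat: (M, C)."""
+    dist = torch.cdist(lidar_xyz, cam_xyz)              # (N, M) pairwise distances
+    idx = dist.topk(k, dim=1, largest=False).indices    # (N, k) nearest camera pts
+    neighbors = cam_feat[idx].mean(dim=1)                # average the K features
+    return torch.cat([lidar_feat, neighbors], dim=-1)    # fused (N, 2C) features
+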
+
+
+
+
+ + ☆ Learning Instance-Aware Correspondences for Robust Multi-Instance Point + Cloud Registration in Cluttered Scenes + + +
+ Multi-instance point cloud registration estimates the poses of multiple
+instances of a model point cloud in a scene point cloud. Extracting accurate
+point correspondences is at the center of the problem. Existing approaches
+usually treat the scene point cloud as a whole, overlooking the separation of
+instances. Therefore, point features could be easily polluted by other points
+from the background or different instances, leading to inaccurate
+correspondences oblivious to separate instances, especially in cluttered
+scenes. In this work, we propose MIRETR, Multi-Instance REgistration
+TRansformer, a coarse-to-fine approach to the extraction of instance-aware
+correspondences. At the coarse level, it jointly learns instance-aware
+superpoint features and predicts per-instance masks. With instance masks, the
+influence from points outside the instance of interest is minimized, such that
+highly reliable superpoint correspondences can be extracted. The superpoint
+correspondences are then extended to instance candidates at the fine level
+according to the instance masks. At last, an efficient candidate selection and
+refinement algorithm is devised to obtain the final registrations. Extensive
+experiments on three public benchmarks demonstrate the efficacy of our
+approach. In particular, MIRETR outperforms the state of the art by 16.6
+points in F1 score on the challenging ROBI benchmark. Code and models are
+available at https://github.com/zhiyuanYU134/MIRETR.
+
+
+
+
+
+
+ + ☆ Rethinking Self-training for Semi-supervised Landmark Detection: A + Selection-free Approach + + +
+ Self-training is a simple yet effective method for semi-supervised learning, +during which pseudo-label selection plays an important role for handling +confirmation bias. Despite its popularity, applying self-training to landmark +detection faces three problems: 1) The selected confident pseudo-labels often +contain data bias, which may hurt model performance; 2) It is not easy to +decide a proper threshold for sample selection as the localization task can be +sensitive to noisy pseudo-labels; 3) coordinate regression does not output +confidence, making selection-based self-training infeasible. To address the +above issues, we propose Self-Training for Landmark Detection (STLD), a method +that does not require explicit pseudo-label selection. Instead, STLD constructs +a task curriculum to deal with confirmation bias, which progressively +transitions from more confident to less confident tasks over the rounds of +self-training. Pseudo pretraining and shrink regression are two essential +components for such a curriculum, where the former is the first task of the +curriculum for providing a better model initialization and the latter is +further added in the later rounds to directly leverage the pseudo-labels in a +coarse-to-fine manner. Experiments on three facial and one medical landmark +detection benchmark show that STLD outperforms the existing methods +consistently in both semi- and omni-supervised settings. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ NPB-REC: A Non-parametric Bayesian Deep-learning Approach for + Undersampled MRI Reconstruction with Uncertainty Estimation + + +
+ The ability to reconstruct high-quality images from undersampled MRI data is +vital in improving MRI temporal resolution and reducing acquisition times. Deep +learning methods have been proposed for this task, but the lack of verified +methods to quantify the uncertainty in the reconstructed images hampered +clinical applicability. We introduce "NPB-REC", a non-parametric fully Bayesian +framework, for MRI reconstruction from undersampled data with uncertainty +estimation. We use Stochastic Gradient Langevin Dynamics during training to +characterize the posterior distribution of the network parameters. This enables +us to both improve the quality of the reconstructed images and quantify the +uncertainty in the reconstructed images. We demonstrate the efficacy of our +approach on a multi-coil MRI dataset from the fastMRI challenge and compare it +to the baseline End-to-End Variational Network (E2E-VarNet). Our approach +outperforms the baseline in terms of reconstruction accuracy by means of PSNR +and SSIM ($34.55$, $0.908$ vs. $33.08$, $0.897$, $p<0.01$, acceleration rate +$R=8$) and provides uncertainty measures that correlate better with the +reconstruction error (Pearson correlation, $R=0.94$ vs. $R=0.91$). +Additionally, our approach exhibits better generalization capabilities against +anatomical distribution shifts (PSNR and SSIM of $32.38$, $0.849$ vs. $31.63$, +$0.836$, $p<0.01$, training on brain data, inference on knee data, acceleration +rate $R=8$). NPB-REC has the potential to facilitate the safe utilization of +deep learning-based methods for MRI reconstruction from undersampled data. Code +and trained models are available at \url{https://github.com/samahkh/NPB-REC}. + +
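+ The Stochastic Gradient Langevin Dynamics step at the heart of the training
+scheme above can be sketched as follows: an ordinary gradient step plus
+Gaussian noise, so that saved parameter iterates approximate posterior samples
+that can be averaged for reconstruction and spread for uncertainty. This is a
+generic sketch, not the NPB-REC implementation.
+
+import torch
+
+@torch.no_grad()
+def sgld_step(params, lr):
+    """One SGLD update: theta <- theta - lr * grad + N(0, 2 * lr)."""
+    for p in params:
+        if p.grad is None:
+            continue
+        noise = torch.randn_like(p) * (2.0 * lr) ** 0.5
+        p.add_(-lr * p.grad + noise)
+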
+
+ comment: Published in Artificial Intelligence in Medicine, DOI: + https://doi.org/10.1016/j.artmed.2024.102798 This is an extension + representing a more comprehensive work extending preliminary work presented + at arXiv:2208.03966 +
+
+
+
+
+ + ☆ A self-attention model for robust rigid slice-to-volume registration of + functional MRI + + +
+ Functional Magnetic Resonance Imaging (fMRI) is vital in neuroscience,
+enabling investigations into brain disorders, treatment monitoring, and brain
+function mapping. However, head motion during fMRI scans, occurring between
+shots of slice acquisition, can result in distortion, biased analyses, and
+increased costs due to the need for scan repetitions. Therefore, retrospective
+slice-level motion correction through slice-to-volume registration (SVR) is
+crucial. Previous studies have utilized deep learning (DL) based models to
+address the SVR task; however, they overlooked the uncertainty stemming from
+the input stack of slices and did not assign weighting or scoring to each
+slice. In this work, we introduce an end-to-end SVR model for aligning 2D fMRI
+slices with a 3D reference volume, incorporating a self-attention mechanism to
+enhance robustness against input data variations and uncertainties. It utilizes
+independent slice and volume encoders and a self-attention module to assign
+pixel-wise scores for each slice. We conducted evaluation experiments on 200
+images involving synthetic rigid motion generated from 27 subjects belonging to
+the test set, from the publicly available Healthy Brain Network (HBN) dataset.
+Our experimental results demonstrate that our model achieves competitive
+performance in terms of alignment accuracy compared to state-of-the-art deep
+learning-based methods (Euclidean distance of $0.93$ [mm] vs. $1.86$ [mm]).
+Furthermore, our approach exhibits significantly faster registration speed
+compared to conventional iterative methods ($0.096$ sec. vs. $1.17$ sec.). Our
+end-to-end SVR model facilitates real-time head motion tracking during fMRI
+acquisition, ensuring reliability and robustness against uncertainties in
+inputs. The source code, including training and evaluation scripts, will be
+made available soon.
+
+
+
+ comment: Currently under review +
+
+
+
+
+ + ☆ BeyondScene: Higher-Resolution Human-Centric Scene Generation With + Pretrained Diffusion + + +
+ Generating higher-resolution human-centric scenes with details and controls +remains a challenge for existing text-to-image diffusion models. This challenge +stems from limited training image size, text encoder capacity (limited tokens), +and the inherent difficulty of generating complex scenes involving multiple +humans. While current methods attempted to address training size limit only, +they often yielded human-centric scenes with severe artifacts. We propose +BeyondScene, a novel framework that overcomes prior limitations, generating +exquisite higher-resolution (over 8K) human-centric scenes with exceptional +text-image correspondence and naturalness using existing pretrained diffusion +models. BeyondScene employs a staged and hierarchical approach to initially +generate a detailed base image focusing on crucial elements in instance +creation for multiple humans and detailed descriptions beyond token limit of +diffusion model, and then to seamlessly convert the base image to a +higher-resolution output, exceeding training image size and incorporating +details aware of text and instances via our novel instance-aware hierarchical +enlargement process that consists of our proposed high-frequency injected +forward diffusion and adaptive joint diffusion. BeyondScene surpasses existing +methods in terms of correspondence with detailed text descriptions and +naturalness, paving the way for advanced applications in higher-resolution +human-centric scene creation beyond the capacity of pretrained diffusion models +without costly retraining. Project page: +https://janeyeon.github.io/beyond-scene. + +
+
+ comment: Project page: https://janeyeon.github.io/beyond-scene +
+
+
+
+
+ + ☆ Frequency Decomposition-Driven Unsupervised Domain Adaptation for Remote + Sensing Image Semantic Segmentation + + +
+ Cross-domain semantic segmentation of remote sensing (RS) imagery based on +unsupervised domain adaptation (UDA) techniques has significantly advanced +deep-learning applications in the geosciences. Recently, with its ingenious and +versatile architecture, the Transformer model has been successfully applied in +RS-UDA tasks. However, existing UDA methods mainly focus on domain alignment in +the high-level feature space. It is still challenging to retain cross-domain +local spatial details and global contextual semantics simultaneously, which is +crucial for the RS image semantic segmentation task. To address these problems, +we propose novel high/low-frequency decomposition (HLFD) techniques to guide +representation alignment in cross-domain semantic segmentation. Specifically, +HLFD attempts to decompose the feature maps into high- and low-frequency +components before performing the domain alignment in the corresponding +subspaces. Secondly, to further facilitate the alignment of decomposed +features, we propose a fully global-local generative adversarial network, +namely GLGAN, to learn domain-invariant detailed and semantic features across +domains by leveraging global-local transformer blocks (GLTBs). By integrating +HLFD techniques and the GLGAN, a novel UDA framework called FD-GLGAN is +developed to improve the cross-domain transferability and generalization +capability of semantic segmentation models. Extensive experiments on two +fine-resolution benchmark datasets, namely ISPRS Potsdam and ISPRS Vaihingen, +highlight the effectiveness and superiority of the proposed approach as +compared to the state-of-the-art UDA methods. The source code for this work +will be accessible at https://github.com/sstary/SSRS. + +
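+ The high/low-frequency decomposition that drives the alignment above can be
+sketched in its simplest form as a low-pass blur plus a residual; the kernel
+choice here is an illustrative stand-in for the decomposition used in the
+paper.
+
+import torch
+import torch.nn.functional as F
+
+def decompose(feat, kernel_size=5):
+    """Split a (N, C, H, W) feature map into low- and high-frequency parts."""
+    pad = kernel_size // 2
+    low = F.avg_pool2d(feat, kernel_size, stride=1, padding=pad)  # low-pass blur
+    high = feat - low                                             # residual detail
+    return low, high
+
+# Domain alignment can then be applied separately to `low` (global semantics)
+# and `high` (local spatial detail), as motivated in the abstract.
+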
+
+ comment: 28 pages, 13 figures +
+
+
+
+
+ + ☆ VTR: An Optimized Vision Transformer for SAR ATR Acceleration on FPGA SP + + +
+ Synthetic Aperture Radar (SAR) Automatic Target Recognition (ATR) is a key +technique used in military applications like remote-sensing image recognition. +Vision Transformers (ViTs) are the current state-of-the-art in various computer +vision applications, outperforming their CNN counterparts. However, using ViTs +for SAR ATR applications is challenging due to (1) standard ViTs require +extensive training data to generalize well due to their low locality; the +standard SAR datasets, however, have a limited number of labeled training data +which reduces the learning capability of ViTs; (2) ViTs have a high parameter +count and are computation intensive which makes their deployment on +resource-constrained SAR platforms difficult. In this work, we develop a +lightweight ViT model that can be trained directly on small datasets without +any pre-training by utilizing the Shifted Patch Tokenization (SPT) and Locality +Self-Attention (LSA) modules. We directly train this model on SAR datasets +which have limited training samples to evaluate its effectiveness for SAR ATR +applications. We evaluate our proposed model, that we call VTR (ViT for SAR +ATR), on three widely used SAR datasets: MSTAR, SynthWakeSAR, and GBSAR. +Further, we propose a novel FPGA accelerator for VTR, in order to enable +deployment for real-time SAR ATR applications. + +
+
+ comment: SPIE DCS 2024 +
+
+
+
+
+ + ☆ DATENeRF: Depth-Aware Text-based Editing of NeRFs + + +
+ Recent advancements in diffusion models have shown remarkable proficiency in +editing 2D images based on text prompts. However, extending these techniques to +edit scenes in Neural Radiance Fields (NeRF) is complex, as editing individual +2D frames can result in inconsistencies across multiple views. Our crucial +insight is that a NeRF scene's geometry can serve as a bridge to integrate +these 2D edits. Utilizing this geometry, we employ a depth-conditioned +ControlNet to enhance the coherence of each 2D image modification. Moreover, we +introduce an inpainting approach that leverages the depth information of NeRF +scenes to distribute 2D edits across different images, ensuring robustness +against errors and resampling challenges. Our results reveal that this +methodology achieves more consistent, lifelike, and detailed edits than +existing leading methods for text-driven NeRF scene editing. + +
+
+ comment: 14 pages, Conference paper, 3D Scene Editing, Neural Rendering, + Diffusion Models +
+
+
+
+
+ + ☆ MedIAnomaly: A comparative study of anomaly detection in medical images + + +
+ Anomaly detection (AD) aims at detecting abnormal samples that deviate from +the expected normal patterns. Generally, it can be trained on merely normal +data without the requirement for abnormal samples, and thereby plays an +important role in the recognition of rare diseases and health screening in the +medical domain. Despite numerous related studies, we observe a lack of a fair +and comprehensive evaluation, which causes some ambiguous conclusions and +hinders the development of this field. This paper focuses on building a +benchmark with unified implementation and comparison to address this problem. +In particular, seven medical datasets with five image modalities, including +chest X-rays, brain MRIs, retinal fundus images, dermatoscopic images, and +histopathology whole slide images are organized for extensive evaluation. +Twenty-seven typical AD methods, including reconstruction and self-supervised +learning-based methods, are involved in comparison of image-level anomaly +classification and pixel-level anomaly segmentation. Furthermore, we for the +first time formally explore the effect of key components in existing methods, +clearly revealing unresolved challenges and potential future directions. The +datasets and code are available at +\url{https://github.com/caiyu6666/MedIAnomaly}. + +
+
+ comment: Under submission +
+
+
+
+
+ + ☆ Latent-based Diffusion Model for Long-tailed Recognition CVPR2024 + + +
+ Long-tailed imbalance distribution is a common issue in practical computer +vision applications. Previous works proposed methods to address this problem, +which can be categorized into several classes: re-sampling, re-weighting, +transfer learning, and feature augmentation. In recent years, diffusion models +have shown an impressive generation ability in many sub-problems of deep +computer vision. However, its powerful generation has not been explored in +long-tailed problems. We propose a new approach, the Latent-based Diffusion +Model for Long-tailed Recognition (LDMLR), as a feature augmentation method to +tackle the issue. First, we encode the imbalanced dataset into features using +the baseline model. Then, we train a Denoising Diffusion Implicit Model (DDIM) +using these encoded features to generate pseudo-features. Finally, we train the +classifier using the encoded and pseudo-features from the previous two steps. +The model's accuracy shows an improvement on the CIFAR-LT and ImageNet-LT +datasets by using the proposed method. + +
+
+ comment: 8 pages, 3 figures, accepted by L3DIVU-CVPR2024 +
+
+
+
+
+ + ☆ Cluster-based Video Summarization with Temporal Context Awareness + + +
+ In this paper, we present TAC-SUM, a novel and efficient training-free +approach for video summarization that addresses the limitations of existing +cluster-based models by incorporating temporal context. Our method partitions +the input video into temporally consecutive segments with clustering +information, enabling the injection of temporal awareness into the clustering +process, setting it apart from prior cluster-based summarization methods. The +resulting temporal-aware clusters are then utilized to compute the final +summary, using simple rules for keyframe selection and frame importance +scoring. Experimental results on the SumMe dataset demonstrate the +effectiveness of our proposed approach, outperforming existing unsupervised +methods and achieving comparable performance to state-of-the-art supervised +summarization techniques. Our source code is available for reference at +\url{https://github.com/hcmus-thesis-gulu/TAC-SUM}. + +
+
+ comment: 14 pages, 6 figures, accepted in PSIVT 2023 +
+
+
+
+
+ + ☆ Automated Lane Change Behavior Prediction and Environmental Perception + Based on SLAM Technology + + +
+ Beyond the environmental perception sensors in an automatic driving system,
+such as cameras and radars, which perceive the vehicle's external environment,
+there is another sensor quietly at work in the system: the positioning module.
+This paper explores the application of SLAM (Simultaneous Localization and
+Mapping) technology in the context of automatic lane change behavior prediction
+and environment perception for autonomous vehicles. It discusses the
+limitations of traditional positioning methods, introduces SLAM technology, and
+compares LIDAR SLAM with visual SLAM. Real-world examples from companies like
+Tesla, Waymo, and Mobileye showcase the integration of AI-driven technologies,
+sensor fusion, and SLAM in autonomous driving systems. The paper then delves
+into the specifics of SLAM algorithms, sensor technologies, and the importance
+of automatic lane changes in driving safety and efficiency. It highlights
+Tesla's recent update to its Autopilot system, which incorporates automatic
+lane change functionality using SLAM technology. The paper concludes by
+emphasizing the crucial role of SLAM in enabling accurate environment
+perception, positioning, and decision-making for autonomous vehicles,
+ultimately enhancing safety and driving experience.
+
+
+
+
+
+
+ + ☆ FastHDRNet: A new efficient method for SDR-to-HDR Translation + + +
+ Modern displays can render video content with a high dynamic range (HDR) and
+a wide color gamut (WCG). However, the majority of available content is still
+in standard dynamic range (SDR), so an effective conversion methodology is
+needed. Existing deep neural network (DNN) based SDR-to-HDR conversion methods
+outperform conventional methods, but they are either too large to deploy or
+generate severe artifacts. We propose a neural network for SDRTV-to-HDRTV
+conversion, termed "FastHDRNet". The network includes two parts, Adaptive
+Universal Color Transformation and Local Enhancement, and is designed as a
+lightweight architecture that exploits global statistics and local information
+with very high efficiency. Our experiments show that the proposed method
+achieves state-of-the-art performance in both quantitative comparisons and
+visual quality, with a lightweight structure and enhanced inference speed.
+
+
+
+ comment: 16 pages, 4 figures +
+
+
+
+
+ + ☆ Diffusion-RWKV: Scaling RWKV-Like Architectures for Diffusion Models + + +
+ Transformers have catalyzed advancements in computer vision and natural
+language processing (NLP). However, their substantial computational complexity
+limits their application to long-context tasks such as high-resolution image
+generation. This paper introduces a series of architectures adapted from the
+RWKV model used in NLP, with the requisite modifications for diffusion models
+applied to image generation tasks, referred to as Diffusion-RWKV. Similar to
+diffusion Transformers, our model is designed to efficiently handle patchified
+inputs in a sequence with extra conditions, while also scaling up effectively,
+accommodating both large-scale parameters and extensive datasets. Its
+distinctive advantage manifests in its reduced spatial aggregation complexity,
+rendering it exceptionally adept at processing high-resolution images and
+eliminating the need for windowing or group cached operations. Experimental
+results on both conditional and unconditional image generation tasks
+demonstrate that Diffusion-RWKV achieves performance on par with or surpassing
+existing CNN- or Transformer-based diffusion models in FID and IS metrics while
+significantly reducing total computation FLOP usage.
+
+
+
+
+
+
+ + ☆ DELTA: Decoupling Long-Tailed Online Continual Learning CVPR + + +
+ A significant challenge in achieving ubiquitous Artificial Intelligence is +the limited ability of models to rapidly learn new information in real-world +scenarios where data follows long-tailed distributions, all while avoiding +forgetting previously acquired knowledge. In this work, we study the +under-explored problem of Long-Tailed Online Continual Learning (LTOCL), which +aims to learn new tasks from sequentially arriving class-imbalanced data +streams. Each data is observed only once for training without knowing the task +data distribution. We present DELTA, a decoupled learning approach designed to +enhance learning representations and address the substantial imbalance in +LTOCL. We enhance the learning process by adapting supervised contrastive +learning to attract similar samples and repel dissimilar (out-of-class) +samples. Further, by balancing gradients during training using an equalization +loss, DELTA significantly enhances learning outcomes and successfully mitigates +catastrophic forgetting. Through extensive evaluation, we demonstrate that +DELTA improves the capacity for incremental learning, surpassing existing OCL +methods. Our results suggest considerable promise for applying OCL in +real-world applications. + +
+
+ comment: CVPR Workshop acceptance archival track +
+
+
+
+
+ + ☆ RoNet: Rotation-oriented Continuous Image Translation + + +
+ The generation of smooth and continuous images between domains has recently
+drawn much attention in image-to-image (I2I) translation. A linear relationship
+is the basic assumption in most existing approaches, whether applied to
+features, models, or labels. However, the linear assumption becomes hard to
+satisfy as the element dimension increases, and it suffers from the limitation
+of requiring both ends of the line. In this paper, we propose a novel
+rotation-oriented solution and model the continuous generation with an in-plane
+rotation over the style representation of an image, achieving a network named
+RoNet. A rotation module is implanted in the generation network to
+automatically learn the proper plane while disentangling the content and the
+style of an image. To encourage realistic texture, we also design a patch-based
+semantic style loss that learns the different styles of similar objects in
+different domains. We conduct experiments on forest scenes (where the complex
+texture makes the generation very challenging), faces, streetscapes and the
+iphone2dslr task. The results validate the superiority of our method in terms
+of visual quality and continuity.
+
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Mixed-Query Transformer: A Unified Image Segmentation Architecture + + +
+ Existing unified image segmentation models either employ a unified +architecture across multiple tasks but use separate weights tailored to each +dataset, or apply a single set of weights to multiple datasets but are limited +to a single task. In this paper, we introduce the Mixed-Query Transformer +(MQ-Former), a unified architecture for multi-task and multi-dataset image +segmentation using a single set of weights. To enable this, we propose a mixed +query strategy, which can effectively and dynamically accommodate different +types of objects without heuristic designs. In addition, the unified +architecture allows us to use data augmentation with synthetic masks and +captions to further improve model generalization. Experiments demonstrate that +MQ-Former can not only effectively handle multiple segmentation datasets and +tasks compared to specialized state-of-the-art models with competitive +performance, but also generalize better to open-set segmentation tasks, +evidenced by over 7 points higher performance than the prior art on the +open-vocabulary SeginW benchmark. + +
+
+
+
+
+ + ☆ Aligning Diffusion Models by Optimizing Human Utility + + +
+ We present Diffusion-KTO, a novel approach for aligning text-to-image +diffusion models by formulating the alignment objective as the maximization of +expected human utility. Since this objective applies to each generation +independently, Diffusion-KTO does not require collecting costly pairwise +preference data nor training a complex reward model. Instead, our objective +requires simple per-image binary feedback signals, e.g. likes or dislikes, +which are abundantly available. After fine-tuning using Diffusion-KTO, +text-to-image diffusion models exhibit superior performance compared to +existing techniques, including supervised fine-tuning and Diffusion-DPO, both +in terms of human judgment and automatic evaluation metrics such as PickScore +and ImageReward. Overall, Diffusion-KTO unlocks the potential of leveraging +readily available per-image binary signals and broadens the applicability of +aligning text-to-image diffusion models with human preferences. + +
+
+ comment: 27 pages, 11 figures +
+
+
+
+
+ + ☆ Automated Polyp Segmentation in Colonoscopy Images + + +
+ Detecting polyps during medical diagnosis is important for preventing colon
+cancer. This research discusses using a dilated convolution module along with a
+criss-cross attention-based network to segment polyps from endoscopic images of
+the colon. The criss-cross attention module plays a vital role in gathering
+context information from all pixels in an image efficiently. To extract maximum
+information from the dataset, data augmentation techniques are employed:
+rotations, flips, scaling, and contrast adjustments, along with varying
+learning rates, were used to build a better model. Global average pooling was
+applied over a ResNet50 backbone to retain the important details of the
+encoder. In our experiments, the proposed architecture's performance was
+compared with existing models such as U-Net, DeepLabV3, and PraNet. The
+architecture outperformed these models on the subset of the dataset with
+irregular polyp shapes, showing that the combination of the dilated convolution
+module, RCCA, and global average pooling is effective for irregular shapes. Our
+architecture demonstrates an average improvement of 3.75% across all metrics
+compared to existing models.
+
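+ A minimal sketch of the encoder-plus-global-average-pooling step described
+above, assuming a standard torchvision ResNet50 rather than the paper's exact
+backbone configuration:
+```python
+import torch
+import torchvision
+
+# Take ResNet50 encoder features and apply global average pooling to
+# summarize encoder context (illustrative; not the paper's full architecture).
+backbone = torchvision.models.resnet50(weights=None)
+encoder = torch.nn.Sequential(*list(backbone.children())[:-2])  # drop avgpool + fc
+
+x = torch.randn(1, 3, 352, 352)       # a colonoscopy-sized RGB input
+feat = encoder(x)                     # (1, 2048, 11, 11) feature map
+context = feat.mean(dim=(2, 3))       # global average pooling -> (1, 2048)
+print(feat.shape, context.shape)
+```
+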
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ JRDB-Social: A Multifaceted Robotic Dataset for Understanding of Context + and Dynamics of Human Interactions Within Social Groups CVPR 2024 + + +
+ Understanding human social behaviour is crucial in computer vision and +robotics. Micro-level observations like individual actions fall short, +necessitating a comprehensive approach that considers individual behaviour, +intra-group dynamics, and social group levels for a thorough understanding. To +address dataset limitations, this paper introduces JRDB-Social, an extension of +JRDB. Designed to fill gaps in human understanding across diverse indoor and +outdoor social contexts, JRDB-Social provides annotations at three levels: +individual attributes, intra-group interactions, and social group context. This +dataset aims to enhance our grasp of human social dynamics for robotic +applications. Utilizing the recent cutting-edge multi-modal large language +models, we evaluated our benchmark to explore their capacity to decipher social +human behaviour. + +
+
+ comment: Accepted by CVPR 2024. Project page: + https://jrdb.erc.monash.edu/dataset/social +
+
+
+
+
+ + ☆ Beyond the Known: Adversarial Autoencoders in Novelty Detection + + +
+ In novelty detection, the goal is to decide if a new data point should be +categorized as an inlier or an outlier, given a training dataset that primarily +captures the inlier distribution. Recent approaches typically use deep encoder +and decoder network frameworks to derive a reconstruction error, and employ +this error either to determine a novelty score, or as the basis for a one-class +classifier. In this research, we use a similar framework but with a lightweight +deep network, and we adopt a probabilistic score with reconstruction error. Our +methodology calculates the probability of whether the sample comes from the +inlier distribution or not. This work makes two key contributions. The first is +that we compute the novelty probability by linearizing the manifold that holds +the structure of the inlier distribution. This allows us to interpret how the +probability is distributed and can be determined in relation to the local +coordinates of the manifold tangent space. The second contribution is that we +improve the training protocol for the network. Our results indicate that our +approach is effective at learning the target class, and it outperforms recent +state-of-the-art methods on several benchmark datasets. + +
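+ The reconstruction-error scoring idea can be sketched as follows (a toy
+autoencoder, not the paper's lightweight network; the probabilistic,
+manifold-based score is omitted):
+```python
+import torch
+import torch.nn as nn
+
+class TinyAutoencoder(nn.Module):
+    """Small encoder/decoder used only to illustrate reconstruction-based scoring."""
+    def __init__(self, dim=784, latent=32):
+        super().__init__()
+        self.enc = nn.Sequential(nn.Linear(dim, 128), nn.ReLU(), nn.Linear(128, latent))
+        self.dec = nn.Sequential(nn.Linear(latent, 128), nn.ReLU(), nn.Linear(128, dim))
+
+    def forward(self, x):
+        return self.dec(self.enc(x))
+
+def novelty_score(model, x):
+    # Higher reconstruction error -> more likely an outlier.
+    with torch.no_grad():
+        recon = model(x)
+    return ((x - recon) ** 2).mean(dim=1)
+
+model = TinyAutoencoder()
+print(novelty_score(model, torch.randn(8, 784)))
+```
+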
+
+ comment: Accepted at VISAPP 2024
+
+
+
+
+
+ + ☆ Study of the effect of Sharpness on Blind Video Quality Assessment + + +
+ Introduction: Video Quality Assessment (VQA) is an important area of study in
+the modern era, where video is a crucial component of communication with
+applications in every field. Rapid developments in mobile technology have
+enabled anyone to create videos, resulting in a wide range of video quality
+scenarios. Objectives: Although VQA has long relied on classical metrics such
+as SSIM and PSNR, the advent of machine learning has brought in new VQA
+techniques built upon Convolutional Neural Networks (CNNs) or Deep Neural
+Networks (DNNs). Methods: Over the past years, research studies such as BVQA,
+which performed video quality assessment of natural videos using DNNs, have
+demonstrated the capabilities of machine learning algorithms. BVQA using DNNs
+explored human visual system effects such as content dependency and
+time-related factors, commonly known as temporal effects. Results: This study
+explores the effect of sharpness on models such as BVQA. Sharpness measures the
+clarity and detail of the video image and is typically assessed by analyzing
+the edges and contrast of the image. Conclusion: This study uses existing video
+quality databases such as CVD2014. A comparative study of evaluation metrics
+such as SRCC and PLCC during training and testing is presented, along with the
+conclusions.
+
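+ SRCC and PLCC, the two evaluation measures mentioned in the conclusion, are
+standard rank and linear correlations between predicted and subjective quality
+scores; the MOS numbers below are made up purely for illustration:
+```python
+import numpy as np
+from scipy.stats import spearmanr, pearsonr
+
+# Correlate predicted quality scores with subjective MOS values.
+predicted = np.array([62.1, 55.3, 71.8, 48.0, 66.5])
+mos       = np.array([60.0, 52.5, 75.2, 45.1, 64.9])
+
+srcc, _ = spearmanr(predicted, mos)   # rank correlation (monotonicity)
+plcc, _ = pearsonr(predicted, mos)    # linear correlation (accuracy)
+print(f"SRCC={srcc:.3f}, PLCC={plcc:.3f}")
+```
+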
+
+
+
+
+ + ☆ Music Recommendation Based on Facial Emotion Recognition + + +
+ Introduction: Music provides an incredible avenue for individuals to express
+their thoughts and emotions, while also serving as a delightful mode of
+entertainment for enthusiasts and music lovers. Objectives: This paper presents
+a comprehensive approach to enhancing the user experience through the
+integration of emotion recognition, music recommendation, and explainable AI
+using Grad-CAM. Methods: The proposed methodology utilizes a ResNet50 model
+trained on the Facial Expression Recognition (FER) dataset, consisting of real
+images of individuals expressing various emotions. Results: The system achieves
+an accuracy of 82% in emotion classification. By leveraging Grad-CAM, the model
+provides explanations for its predictions, allowing users to understand the
+reasoning behind the system's recommendations. The model is trained on both FER
+and real user datasets, which include labelled facial expressions and real
+images of individuals expressing various emotions. The training process
+involves pre-processing the input images, extracting features through
+convolutional layers, reasoning with dense layers, and generating emotion
+predictions through the output layer. Conclusion: The proposed methodology,
+leveraging the ResNet50 model with ROI-based analysis and explainable AI
+techniques, offers a robust and interpretable solution for facial emotion
+detection.
+
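+ A minimal hook-based Grad-CAM sketch over a ResNet50 (illustrative only; the
+paper's exact model, dataset, and ROI analysis are not reproduced here):
+```python
+import torch
+import torchvision
+
+model = torchvision.models.resnet50(weights=None).eval()
+feats, grads = {}, {}
+layer = model.layer4
+layer.register_forward_hook(lambda m, i, o: feats.update(a=o))
+layer.register_full_backward_hook(lambda m, gi, go: grads.update(a=go[0]))
+
+x = torch.randn(1, 3, 224, 224)              # stand-in for a face crop
+logits = model(x)
+logits[0, logits[0].argmax()].backward()     # backprop the top class score
+
+weights = grads['a'].mean(dim=(2, 3), keepdim=True)             # channel importance
+cam = torch.relu((weights * feats['a']).sum(dim=1)).squeeze()   # (7, 7) heatmap
+cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)
+print(cam.shape)   # upsample over the input image to visualize the explanation
+```
+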
+
+
+
+
+ + ☆ Deep Learning-Based Brain Image Segmentation for Automated Tumour + Detection + + +
+ Introduction: The present study concerns the development and evaluation of an
+automated brain tumour segmentation technique based on deep learning using the
+3D U-Net model. Objectives: The objective is to leverage state-of-the-art
+convolutional neural networks (CNNs) on a large dataset of brain MRI scans for
+segmentation. Methods: The proposed methodology applies pre-processing
+techniques for enhanced performance and generalizability. Results: Extensive
+validation on an independent dataset confirms the model's robustness and
+potential for integration into clinical workflows. The study emphasizes the
+importance of data pre-processing and explores various hyperparameters to
+optimize the model's performance. The 3D U-Net achieved IoUs of 0.8181 and 0.66
+on the training and validation datasets, respectively. Conclusion: Ultimately,
+this comprehensive framework showcases the efficacy of deep learning in
+automating brain tumour detection, offering valuable support in clinical
+practice.
+
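+ The reported IoU values are the standard intersection-over-union; a small
+sketch for binary 3D masks (random arrays stand in for real predictions and
+ground truth):
+```python
+import numpy as np
+
+def iou(pred, target, eps=1e-7):
+    """Intersection-over-Union for binary segmentation masks."""
+    pred, target = pred.astype(bool), target.astype(bool)
+    inter = np.logical_and(pred, target).sum()
+    union = np.logical_or(pred, target).sum()
+    return (inter + eps) / (union + eps)
+
+pred = np.random.rand(64, 64, 64) > 0.5     # hypothetical 3D prediction
+target = np.random.rand(64, 64, 64) > 0.5   # hypothetical ground-truth mask
+print(iou(pred, target))
+```
+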
+
+
+
+
+ + ☆ PIE: Physics-inspired Low-light Enhancement + + +
+ In this paper, we propose a physics-inspired contrastive learning paradigm +for low-light enhancement, called PIE. PIE primarily addresses three issues: +(i) To resolve the problem of existing learning-based methods often training a +LLE model with strict pixel-correspondence image pairs, we eliminate the need +for pixel-correspondence paired training data and instead train with unpaired +images. (ii) To address the disregard for negative samples and the inadequacy +of their generation in existing methods, we incorporate physics-inspired +contrastive learning for LLE and design the Bag of Curves (BoC) method to +generate more reasonable negative samples that closely adhere to the underlying +physical imaging principle. (iii) To overcome the reliance on semantic ground +truths in existing methods, we propose an unsupervised regional segmentation +module, ensuring regional brightness consistency while eliminating the +dependency on semantic ground truths. Overall, the proposed PIE can effectively +learn from unpaired positive/negative samples and smoothly realize non-semantic +regional enhancement, which is clearly different from existing LLE efforts. +Besides the novel architecture of PIE, we explore the gain of PIE on downstream +tasks such as semantic segmentation and face detection. Training on readily +available open data and extensive experiments demonstrate that our method +surpasses the state-of-the-art LLE models over six independent cross-scenes +datasets. PIE runs fast with reasonable GFLOPs in test time, making it easy to +use on mobile devices. + +
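+ As a loose illustration of curve-based negative-sample generation (the
+paper's Bag of Curves is more principled; the gamma values here are arbitrary
+assumptions):
+```python
+import numpy as np
+
+def gamma_curve_negatives(img, gammas=(0.4, 0.6, 1.8, 2.5)):
+    """Hypothetical sketch: re-expose an image with a small bag of gamma tone
+    curves to obtain physically plausible negative samples."""
+    img = np.clip(img, 0.0, 1.0)
+    return [img ** g for g in gammas]
+
+low_light = np.random.rand(256, 256, 3) * 0.2   # a dim synthetic image in [0, 1]
+negatives = gamma_curve_negatives(low_light)
+print(len(negatives), negatives[0].shape)
+```
+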
+
+ comment: arXiv admin note: text overlap with arXiv:2112.06451 +
+
+
+
+
+ + ♻ ☆ Which One? Leveraging Context Between Objects and Multiple Views for + Language Grounding + + +
+ When connecting objects and their language referents in an embodied 3D +environment, it is important to note that: (1) an object can be better +characterized by leveraging comparative information between itself and other +objects, and (2) an object's appearance can vary with camera position. As such, +we present the Multi-view Approach to Grounding in Context (MAGiC), which +selects an object referent based on language that distinguishes between two +similar objects. By pragmatically reasoning over both objects and across +multiple views of those objects, MAGiC improves over the state-of-the-art model +on the SNARE object reference task with a relative error reduction of 12.9\% +(representing an absolute improvement of 2.7\%). Ablation studies show that +reasoning jointly over object referent candidates and multiple views of each +object both contribute to improved accuracy. Code: +https://github.com/rcorona/magic_snare/ + +
+
+
+
+
+ + ♻ ☆ Tensor-based Multimodal Learning for Prediction of Pulmonary Arterial + Wedge Pressure from Cardiac MRI + + +
+ Heart failure is a serious and life-threatening condition that can lead to +elevated pressure in the left ventricle. Pulmonary Arterial Wedge Pressure +(PAWP) is an important surrogate marker indicating high pressure in the left +ventricle. PAWP is determined by Right Heart Catheterization (RHC) but it is an +invasive procedure. A non-invasive method is useful in quickly identifying +high-risk patients from a large population. In this work, we develop a tensor +learning-based pipeline for identifying PAWP from multimodal cardiac Magnetic +Resonance Imaging (MRI). This pipeline extracts spatial and temporal features +from high-dimensional scans. For quality control, we incorporate an epistemic +uncertainty-based binning strategy to identify poor-quality training samples. +To improve the performance, we learn complementary information by integrating +features from multimodal data: cardiac MRI with short-axis and four-chamber +views, and Electronic Health Records. The experimental analysis on a large +cohort of $1346$ subjects who underwent the RHC procedure for PAWP estimation +indicates that the proposed pipeline has a diagnostic value and can produce +promising performance with significant improvement over the baseline in +clinical practice (i.e., $\Delta$AUC $=0.10$, $\Delta$Accuracy $=0.06$, and +$\Delta$MCC $=0.39$). The decision curve analysis further confirms the clinical +utility of our method. + +
+
+
+
+
+ + ♻ ☆ Adaptively Placed Multi-Grid Scene Representation Networks for + Large-Scale Data Visualization IEEE VIS 2023 + + +
+ Scene representation networks (SRNs) have been recently proposed for
+compression and visualization of scientific data. However, state-of-the-art
+SRNs do not adapt the allocation of available network parameters to the complex
+features found in scientific data, leading to a loss in reconstruction quality.
+We address this shortcoming with an adaptively placed multi-grid SRN (APMGSRN)
+and propose a domain decomposition training and inference technique for
+accelerated parallel training on multi-GPU systems. We also release an
+open-source neural volume rendering application that allows plug-and-play
+rendering with any PyTorch-based SRN. Our proposed APMGSRN architecture uses
+multiple spatially adaptive feature grids that learn where to be placed within
+the domain to dynamically allocate more neural network resources where error is
+high in the volume, improving state-of-the-art reconstruction accuracy of SRNs
+for scientific data without requiring expensive octree refining, pruning, and
+traversal like previous adaptive models. In our domain decomposition approach
+for representing large-scale data, we train a set of APMGSRNs in parallel on
+separate bricks of the volume to reduce training time while avoiding the
+overhead of an out-of-core solution for volumes too large to fit in GPU
+memory. After training, the lightweight SRNs are used for realtime neural
+volume rendering in our open-source renderer, where arbitrary view angles and
+transfer functions can be explored. A copy of this paper, all code, all models
+used in our experiments, and all supplemental materials and videos are
+available at https://github.com/skywolf829/APMGSRN.
+
+
+ comment: Accepted to IEEE VIS 2023. + https://www.computer.org/csdl/journal/tg/2024/01/10297599/1RyYguiNBLO +
+
+
+
+
+ + ♻ ☆ Joint2Human: High-quality 3D Human Generation via Compact Spherical + Embedding of 3D Joints + + +
+ 3D human generation is increasingly significant in various applications. +However, the direct use of 2D generative methods in 3D generation often results +in losing local details, while methods that reconstruct geometry from generated +images struggle with global view consistency. In this work, we introduce +Joint2Human, a novel method that leverages 2D diffusion models to generate +detailed 3D human geometry directly, ensuring both global structure and local +details. To achieve this, we employ the Fourier occupancy field (FOF) +representation, enabling the direct generation of 3D shapes as preliminary +results with 2D generative models. With the proposed high-frequency enhancer +and the multi-view recarving strategy, our method can seamlessly integrate the +details from different views into a uniform global shape. To better utilize the +3D human prior and enhance control over the generated geometry, we introduce a +compact spherical embedding of 3D joints. This allows for an effective guidance +of pose during the generation process. Additionally, our method can generate 3D +humans guided by textual inputs. Our experimental results demonstrate the +capability of our method to ensure global structure, local details, high +resolution, and low computational cost simultaneously. More results and the +code can be found on our project page at +http://cic.tju.edu.cn/faculty/likun/projects/Joint2Human. + +
+
+
+
+
+ + ♻ ☆ Detection Is Tracking: Point Cloud Multi-Sweep Deep Learning Models + Revisited + + +
+ Conventional tracking paradigm takes in instantaneous measurements such as +range and bearing, and produces object tracks across time. In applications such +as autonomous driving, lidar measurements in the form of point clouds are +usually passed through a "virtual sensor" realized by a deep learning model, to +produce "measurements" such as bounding boxes, which are in turn ingested by a +tracking module to produce object tracks. Very often multiple lidar sweeps are +accumulated in a buffer to merge and become the input to the virtual sensor. We +argue in this paper that such an input already contains temporal information, +and therefore the virtual sensor output should also contain temporal +information, not just instantaneous values for the time corresponding to the +end of the buffer. In particular, we present the deep learning model called +MULti-Sweep PAired Detector (MULSPAD) that produces, for each detected object, +a pair of bounding boxes at both the end time and the beginning time of the +input buffer. This is achieved with fairly straightforward changes in commonly +used lidar detection models, and with only marginal extra processing, but the +resulting symmetry is satisfying. Such paired detections make it possible not +only to construct rudimentary trackers fairly easily, but also to construct +more sophisticated trackers that can exploit the extra information conveyed by +the pair and be robust to choices of motion models and object birth/death +models. We have conducted preliminary training and experimentation using Waymo +Open Dataset, which shows the efficacy of our proposed method. + +
+
+
+
+
+ + ♻ ☆ Learning Trimaps via Clicks for Image Matting + + +
+ Despite significant advancements in image matting, existing models heavily +depend on manually-drawn trimaps for accurate results in natural image +scenarios. However, the process of obtaining trimaps is time-consuming, lacking +user-friendliness and device compatibility. This reliance greatly limits the +practical application of all trimap-based matting methods. To address this +issue, we introduce Click2Trimap, an interactive model capable of predicting +high-quality trimaps and alpha mattes with minimal user click inputs. Through +analyzing real users' behavioral logic and characteristics of trimaps, we +successfully propose a powerful iterative three-class training strategy and a +dedicated simulation function, making Click2Trimap exhibit versatility across +various scenarios. Quantitative and qualitative assessments on synthetic and +real-world matting datasets demonstrate Click2Trimap's superior performance +compared to all existing trimap-free matting methods. Especially, in the user +study, Click2Trimap achieves high-quality trimap and matting predictions in +just an average of 5 seconds per image, demonstrating its substantial practical +value in real-world applications. + +
+
+
+
+
+ + ♻ ☆ Image Inpainting via Conditional Texture and Structure Dual Generation ICCV 2021 + + +
+ Deep generative approaches have recently made considerable progress in image +inpainting by introducing structure priors. Due to the lack of proper +interaction with image texture during structure reconstruction, however, +current solutions are incompetent in handling the cases with large corruptions, +and they generally suffer from distorted results. In this paper, we propose a +novel two-stream network for image inpainting, which models the +structure-constrained texture synthesis and texture-guided structure +reconstruction in a coupled manner so that they better leverage each other for +more plausible generation. Furthermore, to enhance the global consistency, a +Bi-directional Gated Feature Fusion (Bi-GFF) module is designed to exchange and +combine the structure and texture information and a Contextual Feature +Aggregation (CFA) module is developed to refine the generated contents by +region affinity learning and multi-scale feature aggregation. Qualitative and +quantitative experiments on the CelebA, Paris StreetView and Places2 datasets +demonstrate the superiority of the proposed method. Our code is available at +https://github.com/Xiefan-Guo/CTSDG. + +
+
+ comment: Accepted by ICCV 2021 +
+
+
+
+
+ + ♻ ☆ Geometry Transfer for Stylizing Radiance Fields CVPR 2024 + + +
+ Shape and geometric patterns are essential in defining stylistic identity. +However, current 3D style transfer methods predominantly focus on transferring +colors and textures, often overlooking geometric aspects. In this paper, we +introduce Geometry Transfer, a novel method that leverages geometric +deformation for 3D style transfer. This technique employs depth maps to extract +a style guide, subsequently applied to stylize the geometry of radiance fields. +Moreover, we propose new techniques that utilize geometric cues from the 3D +scene, thereby enhancing aesthetic expressiveness and more accurately +reflecting intended styles. Our extensive experiments show that Geometry +Transfer enables a broader and more expressive range of stylizations, thereby +significantly expanding the scope of 3D style transfer. + +
+
+ comment: CVPR 2024. Project page: https://hyblue.github.io/geo-srf/ +
+
+
+
+
+ + ♻ ☆ DiffSHEG: A Diffusion-Based Approach for Real-Time Speech-driven + Holistic 3D Expression and Gesture Generation CVPR 2024 + + +
+ We propose DiffSHEG, a Diffusion-based approach for Speech-driven Holistic 3D +Expression and Gesture generation with arbitrary length. While previous works +focused on co-speech gesture or expression generation individually, the joint +generation of synchronized expressions and gestures remains barely explored. To +address this, our diffusion-based co-speech motion generation transformer +enables uni-directional information flow from expression to gesture, +facilitating improved matching of joint expression-gesture distributions. +Furthermore, we introduce an outpainting-based sampling strategy for arbitrary +long sequence generation in diffusion models, offering flexibility and +computational efficiency. Our method provides a practical solution that +produces high-quality synchronized expression and gesture generation driven by +speech. Evaluated on two public datasets, our approach achieves +state-of-the-art performance both quantitatively and qualitatively. +Additionally, a user study confirms the superiority of DiffSHEG over prior +approaches. By enabling the real-time generation of expressive and synchronized +motions, DiffSHEG showcases its potential for various applications in the +development of digital humans and embodied agents. + +
+
+ comment: Accepted by CVPR 2024. Project page: + https://jeremycjm.github.io/proj/DiffSHEG +
+
+
+
+
+ + ♻ ☆ Optimizing Sparse Convolution on GPUs with CUDA for 3D Point Cloud + Processing in Embedded Systems + + +
+ In recent years, there has been a significant increase in the utilization of +deep learning methods, particularly convolutional neural networks (CNNs), which +have emerged as the dominant approach in various domains that involve +structured grid data, such as picture analysis and processing. Nevertheless, +the exponential growth in the utilization of LiDAR and 3D sensors across many +domains has resulted in an increased need for the analysis of 3D point clouds. +The utilization of 3D point clouds is crucial in various applications, +including object recognition and segmentation, as they offer a spatial +depiction of things within a three-dimensional environment. In contrast to +photos, point clouds exhibit sparsity and lack a regular grid, hence posing +distinct processing and computational issues. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ YOLOv8-AM: YOLOv8 with Attention Mechanisms for Pediatric Wrist Fracture + Detection + + +
+ Wrist trauma and even fractures occur frequently in daily life, particularly
+among children who account for a significant proportion of fracture cases.
+Before performing surgery, surgeons often request patients to undergo X-ray
+imaging first and prepare for it based on the analysis of the radiologist. With
+the development of neural networks, You Only Look Once (YOLO) series models
+have been widely used in fracture detection as computer-assisted diagnosis
+(CAD). In 2023, Ultralytics presented the latest version of the YOLO models,
+which has been employed for detecting fractures across various parts of the
+body. Attention mechanisms are among the most widely used methods for improving
+model performance. This research work proposes YOLOv8-AM, which incorporates
+the attention mechanism into the original YOLOv8 architecture. Specifically, we
+respectively employ four attention modules, Convolutional Block Attention
+Module (CBAM), Global Attention Mechanism (GAM), Efficient Channel Attention
+(ECA), and Shuffle Attention (SA), to design the improved models and train them
+on the GRAZPEDWRI-DX dataset. Experimental results demonstrate that the mean
+Average Precision at IoU 50 (mAP 50) of the YOLOv8-AM model based on ResBlock +
+CBAM (ResCBAM) increased from 63.6% to 65.8%, achieving state-of-the-art (SOTA)
+performance. In contrast, the YOLOv8-AM model incorporating GAM obtains an mAP
+50 of 64.2%, which is not a satisfactory enhancement. Therefore, we combine
+ResBlock and GAM, introducing ResGAM to design another new YOLOv8-AM model,
+whose mAP 50 value is increased to 65.0%. The implementation code for this
+study is available on GitHub at
+https://github.com/RuiyangJu/Fracture_Detection_Improved_YOLOv8.
+
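+ Efficient Channel Attention (ECA), one of the four modules listed, follows a
+standard formulation; a sketch in PyTorch (not the YOLOv8-AM integration
+itself):
+```python
+import math
+import torch
+import torch.nn as nn
+
+class ECA(nn.Module):
+    """Efficient Channel Attention: channel re-weighting via a cheap 1D conv."""
+    def __init__(self, channels, gamma=2, b=1):
+        super().__init__()
+        t = int(abs((math.log2(channels) + b) / gamma))
+        k = t if t % 2 else t + 1                       # odd kernel size
+        self.conv = nn.Conv1d(1, 1, kernel_size=k, padding=k // 2, bias=False)
+
+    def forward(self, x):                               # x: (N, C, H, W)
+        w = x.mean(dim=(2, 3))                          # global average pool -> (N, C)
+        w = self.conv(w.unsqueeze(1)).squeeze(1)        # 1D conv across channels
+        return x * torch.sigmoid(w)[:, :, None, None]   # re-weight feature map
+
+print(ECA(256)(torch.randn(2, 256, 40, 40)).shape)
+```
+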
+
+
+
+
+ + ♻ ☆ Template Free Reconstruction of Human-object Interaction with Procedural + Interaction Generation CVPR'24 + + +
+ Reconstructing human-object interaction in 3D from a single RGB image is a
+challenging task, and existing data-driven methods do not generalize beyond the
+objects present in the carefully curated 3D interaction datasets. Capturing
+large-scale real data to learn strong interaction and 3D shape priors is very
+expensive due to the combinatorial nature of human-object interactions. In this
+paper, we propose ProciGen (Procedural interaction Generation), a method to
+procedurally generate datasets with both plausible interaction and diverse
+object variation. We generate 1M+ human-object interaction pairs in 3D and
+leverage this large-scale data to train our HDM (Hierarchical Diffusion Model),
+a novel method to reconstruct interacting human and unseen objects, without any
+templates. Our HDM is an image-conditioned diffusion model that learns both
+realistic interaction and highly accurate human and object shapes. Experiments
+show that our HDM trained with ProciGen significantly outperforms prior methods
+that require template meshes, and that our dataset allows training methods with
+strong generalization ability to unseen object instances. Our code and data are
+released.
+
+
+ comment: CVPR'24 camera ready version. 25 pages, 20 figures. Project page: + https://virtualhumans.mpi-inf.mpg.de/procigen-hdm +
+
+
+
+
+ + ♻ ☆ Grounding and Enhancing Grid-based Models for Neural Fields CVPR24 + + +
+ Many contemporary studies utilize grid-based models for neural field +representation, but a systematic analysis of grid-based models is still +missing, hindering the improvement of those models. Therefore, this paper +introduces a theoretical framework for grid-based models. This framework points +out that these models' approximation and generalization behaviors are +determined by grid tangent kernels (GTK), which are intrinsic properties of +grid-based models. The proposed framework facilitates a consistent and +systematic analysis of diverse grid-based models. Furthermore, the introduced +framework motivates the development of a novel grid-based model named the +Multiplicative Fourier Adaptive Grid (MulFAGrid). The numerical analysis +demonstrates that MulFAGrid exhibits a lower generalization bound than its +predecessors, indicating its robust generalization performance. Empirical +studies reveal that MulFAGrid achieves state-of-the-art performance in various +tasks, including 2D image fitting, 3D signed distance field (SDF) +reconstruction, and novel view synthesis, demonstrating superior representation +ability. The project website is available at +https://sites.google.com/view/cvpr24-2034-submission/home. + +
+
+ comment: Accepted in CVPR24 as an oral presentation. Pre-rebuttal scores: 555. + Post-rebuttal scores: 555 +
+
+
+
+
+ + ♻ ☆ Diff-Plugin: Revitalizing Details for Diffusion-based Low-level Tasks CVPR2024 + + +
+ Diffusion models trained on large-scale datasets have achieved remarkable +progress in image synthesis. However, due to the randomness in the diffusion +process, they often struggle with handling diverse low-level tasks that require +details preservation. To overcome this limitation, we present a new Diff-Plugin +framework to enable a single pre-trained diffusion model to generate +high-fidelity results across a variety of low-level tasks. Specifically, we +first propose a lightweight Task-Plugin module with a dual branch design to +provide task-specific priors, guiding the diffusion process in preserving image +content. We then propose a Plugin-Selector that can automatically select +different Task-Plugins based on the text instruction, allowing users to edit +images by indicating multiple low-level tasks with natural language. We conduct +extensive experiments on 8 low-level vision tasks. The results demonstrate the +superiority of Diff-Plugin over existing methods, particularly in real-world +scenarios. Our ablations further validate that Diff-Plugin is stable, +schedulable, and supports robust training across different dataset sizes. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ Comparing the Decision-Making Mechanisms by Transformers and CNNs via + Explanation Methods CVPR24 + + +
+ In order to gain insights about the decision-making of different visual
+recognition backbones, we propose two methodologies, sub-explanation counting
+and cross-testing, that systematically apply deep explanation algorithms on a
+dataset-wide basis and compare the statistics generated from the amount and
+nature of the explanations. These methodologies reveal differences among
+networks in terms of two properties called compositionality and disjunctivism.
+Transformers and ConvNeXt are found to be more compositional, in the sense that
+they jointly consider multiple parts of the image in building their decisions,
+whereas traditional CNNs and distilled transformers are less compositional and
+more disjunctive, which means that they use multiple diverse but smaller sets
+of parts to achieve a confident prediction. Through further experiments, we
+found the choice of normalization to be especially important for the
+compositionality of a model, in that batch normalization leads to less
+compositionality while group and layer normalization lead to more. Finally, we
+also analyze the features shared by different backbones and plot a landscape of
+different models based on their feature-use similarity.
+
+
+ comment: 25 pages with 37 figures, to be published in CVPR24 +
+
+
+
+
+ + ♻ ☆ From Pixels to Graphs: Open-Vocabulary Scene Graph Generation with + Vision-Language Models CVPR 2024 + + +
+ Scene graph generation (SGG) aims to parse a visual scene into an +intermediate graph representation for downstream reasoning tasks. Despite +recent advancements, existing methods struggle to generate scene graphs with +novel visual relation concepts. To address this challenge, we introduce a new +open-vocabulary SGG framework based on sequence generation. Our framework +leverages vision-language pre-trained models (VLM) by incorporating an +image-to-graph generation paradigm. Specifically, we generate scene graph +sequences via image-to-text generation with VLM and then construct scene graphs +from these sequences. By doing so, we harness the strong capabilities of VLM +for open-vocabulary SGG and seamlessly integrate explicit relational modeling +for enhancing the VL tasks. Experimental results demonstrate that our design +not only achieves superior performance with an open vocabulary but also +enhances downstream vision-language task performance through explicit relation +modeling knowledge. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ A Universal Knowledge Embedded Contrastive Learning Framework for + Hyperspectral Image Classification + + +
+ Hyperspectral image (HSI) classification techniques have been intensively
+studied and a variety of models have been developed. However, these HSI
+classification models are confined to pocket models and unrealistic ways of
+dataset partitioning. The former limits the generalization performance of the
+model, while the latter leads to inflated evaluation metrics and, consequently,
+plummeting model performance in the real world. Therefore, we propose a
+universal knowledge embedded contrastive learning framework (KnowCL) for
+supervised, unsupervised, and semisupervised HSI classification, which largely
+closes the gap of HSI classification models between pocket models and standard
+vision backbones. We present a new HSI processing pipeline in conjunction with
+a range of data transformation and augmentation techniques that provide diverse
+data representations and realistic data partitioning. The proposed framework
+based on this pipeline is compatible with all kinds of backbones and can fully
+exploit labeled and unlabeled samples with expected training time. Furthermore,
+we design a new loss function, which can adaptively fuse the supervised loss
+and unsupervised loss, enhancing the learning performance. This proposed new
+classification paradigm shows great potential for HSI classification
+technology. The code can be accessed at https://github.com/quanweiliu/KnowCL.
+
+
+
+
+
+ + ♻ ☆ HAPNet: Toward Superior RGB-Thermal Scene Parsing via Hybrid, + Asymmetric, and Progressive Heterogeneous Feature Fusion + + +
+ Data-fusion networks have shown significant promise for RGB-thermal scene +parsing. However, the majority of existing studies have relied on symmetric +duplex encoders for heterogeneous feature extraction and fusion, paying +inadequate attention to the inherent differences between RGB and thermal +modalities. Recent progress in vision foundation models (VFMs) trained through +self-supervision on vast amounts of unlabeled data has proven their ability to +extract informative, general-purpose features. However, this potential has yet +to be fully leveraged in the domain. In this study, we take one step toward +this new research area by exploring a feasible strategy to fully exploit VFM +features for RGB-thermal scene parsing. Specifically, we delve deeper into the +unique characteristics of RGB and thermal modalities, thereby designing a +hybrid, asymmetric encoder that incorporates both a VFM and a convolutional +neural network. This design allows for more effective extraction of +complementary heterogeneous features, which are subsequently fused in a +dual-path, progressive manner. Moreover, we introduce an auxiliary task to +further enrich the local semantics of the fused features, thereby improving the +overall performance of RGB-thermal scene parsing. Our proposed HAPNet, equipped +with all these components, demonstrates superior performance compared to all +other state-of-the-art RGB-thermal scene parsing networks, achieving top ranks +across three widely used public RGB-thermal scene parsing datasets. We believe +this new paradigm has opened up new opportunities for future developments in +data-fusion scene parsing approaches. + +
+
+ comment: 12 pages, 4 figures
+
+
+
+
+
+ + ♻ ☆ SANeRF-HQ: Segment Anything for NeRF in High Quality CVPR 2024 + + +
+ Recently, the Segment Anything Model (SAM) has showcased remarkable +capabilities of zero-shot segmentation, while NeRF (Neural Radiance Fields) has +gained popularity as a method for various 3D problems beyond novel view +synthesis. Though there exist initial attempts to incorporate these two methods +into 3D segmentation, they face the challenge of accurately and consistently +segmenting objects in complex scenarios. In this paper, we introduce the +Segment Anything for NeRF in High Quality (SANeRF-HQ) to achieve high-quality +3D segmentation of any target object in a given scene. SANeRF-HQ utilizes SAM +for open-world object segmentation guided by user-supplied prompts, while +leveraging NeRF to aggregate information from different viewpoints. To overcome +the aforementioned challenges, we employ density field and RGB similarity to +enhance the accuracy of segmentation boundary during the aggregation. +Emphasizing on segmentation accuracy, we evaluate our method on multiple NeRF +datasets where high-quality ground-truths are available or manually annotated. +SANeRF-HQ shows a significant quality improvement over state-of-the-art methods +in NeRF object segmentation, provides higher flexibility for object +localization, and enables more consistent object segmentation across multiple +views. Results and code are available at the project site: +https://lyclyc52.github.io/SANeRF-HQ/. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ L2SR: Learning to Sample and Reconstruct for Accelerated MRI via + Reinforcement Learning + + +
+ Magnetic Resonance Imaging (MRI) is a widely used medical imaging technique, +but its long acquisition time can be a limiting factor in clinical settings. To +address this issue, researchers have been exploring ways to reduce the +acquisition time while maintaining the reconstruction quality. Previous works +have focused on finding either sparse samplers with a fixed reconstructor or +finding reconstructors with a fixed sampler. However, these approaches do not +fully utilize the potential of joint learning of samplers and reconstructors. +In this paper, we propose an alternating training framework for jointly +learning a good pair of samplers and reconstructors via deep reinforcement +learning (RL). In particular, we consider the process of MRI sampling as a +sampling trajectory controlled by a sampler, and introduce a novel +sparse-reward Partially Observed Markov Decision Process (POMDP) to formulate +the MRI sampling trajectory. Compared to the dense-reward POMDP used in +existing works, the proposed sparse-reward POMDP is more computationally +efficient and has a provable advantage. Moreover, the proposed framework, +called L2SR (Learning to Sample and Reconstruct), overcomes the training +mismatch problem that arises in previous methods that use dense-reward POMDP. +By alternately updating samplers and reconstructors, L2SR learns a pair of +samplers and reconstructors that achieve state-of-the-art reconstruction +performances on the fastMRI dataset. Codes are available at +\url{https://github.com/yangpuPKU/L2SR-Learning-to-Sample-and-Reconstruct}. + +
+
+
+
+
+ + ♻ ☆ Image Super-resolution Reconstruction Network based on Enhanced Swin + Transformer via Alternating Aggregation of Local-Global Features + + +
+ The Swin Transformer image super-resolution reconstruction network only +relies on the long-range relationship of window attention and shifted window +attention to explore features. This mechanism has two limitations. On the one +hand, it only focuses on global features while ignoring local features. On the +other hand, it is only concerned with spatial feature interactions while +ignoring channel features and channel interactions, thus limiting its +non-linear mapping ability. To address the above limitations, this paper +proposes enhanced Swin Transformer modules via alternating aggregation of +local-global features. In the local feature aggregation stage, we introduce a +shift convolution to realize the interaction between local spatial information +and channel information. Then, a block sparse global perception module is +introduced in the global feature aggregation stage. In this module, we +reorganize the spatial information first, then send the recombination +information into a dense layer to implement the global perception. After that, +a multi-scale self-attention module and a low-parameter residual channel +attention module are introduced to realize information aggregation at different +scales. Finally, the proposed network is validated on five publicly available +datasets. The experimental results show that the proposed network outperforms +the other state-of-the-art super-resolution networks. + +
+
+
+
+
+ + ♻ ☆ You Only Train Once: A Unified Framework for Both Full-Reference and + No-Reference Image Quality Assessment + + +
+ Although recent efforts in image quality assessment (IQA) have achieved +promising performance, there still exists a considerable gap compared to the +human visual system (HVS). One significant disparity lies in humans' seamless +transition between full reference (FR) and no reference (NR) tasks, whereas +existing models are constrained to either FR or NR tasks. This disparity +implies the necessity of designing two distinct systems, thereby greatly +diminishing the model's versatility. Therefore, our focus lies in unifying FR +and NR IQA under a single framework. Specifically, we first employ an encoder +to extract multi-level features from input images. Then a Hierarchical +Attention (HA) module is proposed as a universal adapter for both FR and NR +inputs to model the spatial distortion at each encoder stage. Furthermore, +considering that different distortions contaminate encoder stages and damage +image semantic meaning differently, a Semantic Distortion Aware (SDA) module is +proposed to examine feature correlations between shallow and deep layers of the +encoder. By adopting HA and SDA, the proposed network can effectively perform +both FR and NR IQA. When our proposed model is independently trained on NR or +FR IQA tasks, it outperforms existing models and achieves state-of-the-art +performance. Moreover, when trained jointly on NR and FR IQA tasks, it further +enhances the performance of NR IQA while achieving on-par performance in the +state-of-the-art FR IQA. You only train once to perform both IQA tasks. Code +will be released at: https://github.com/BarCodeReader/YOTO. + +
+
+
+
+
+ + ♻ ☆ Filtering Pixel Latent Variables for Unmixing Noisy and Undersampled + Volumetric Images + + +
+ The development of robust signal unmixing algorithms is essential for +leveraging multimodal datasets acquired through a wide array of scientific +imaging technologies, including hyperspectral or time-resolved acquisitions. In +experimental physics, enhancing the spatio-temporal resolution or expanding the +number of detection channels often leads to diminished sampling rate and +signal-to-noise ratio, significantly affecting the efficacy of signal unmixing +algorithms. We propose applying band-pass filters to the latent space of a +multi-dimensional convolutional neural network to disentangle overlapping +signal components, enabling the isolation and quantification of their +individual contributions. Using multi-dimensional convolution kernels to +process all dimensions simultaneously enhances the network's ability to extract +information from adjacent pixels, time- or spectral-bins. This approach enables +more effective separation of components in cases where individual pixels do not +provide clear, well-resolved information. We showcase the method's practical +use in experimental physics through two test cases that highlight the +versatility of our approach: fluorescence lifetime microscopy and mode +decomposition in optical fibers. The latent unmixing method extracts valuable +information from complex signals that cannot be resolved by standard methods. +Application of latent unmixing to real FLIM experiments will increase the +number of distinguishable fluorescent markers. It will also open new +possibilities in optics and photonics for multichannel separations at increased +sampling rate. + +
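+ A simplified FFT-based band-pass over a latent sequence illustrates the
+filtering idea (the paper filters inside a multi-dimensional CNN's latent
+space; the shapes and cutoffs here are assumptions):
+```python
+import numpy as np
+
+def bandpass_latent(latent, low, high):
+    """Illustrative band-pass over the last (time/spectral) axis via FFT masking."""
+    spec = np.fft.rfft(latent, axis=-1)
+    freqs = np.fft.rfftfreq(latent.shape[-1])
+    keep = (freqs >= low) & (freqs <= high)      # zero out-of-band frequencies
+    return np.fft.irfft(spec * keep, n=latent.shape[-1], axis=-1)
+
+latent = np.random.randn(16, 256)                # 16 latent channels x 256 bins
+filtered = bandpass_latent(latent, low=0.05, high=0.2)
+print(filtered.shape)
+```
+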
+
+ comment: 16 pages, 8 figures (main paper) + 18 pages, 9 figures (supplementary + material) +
+
+
+
+
+ + ♻ ☆ Open3DIS: Open-Vocabulary 3D Instance Segmentation with 2D Mask Guidance CVPR 2024 + + +
+ We introduce Open3DIS, a novel solution designed to tackle the problem of +Open-Vocabulary Instance Segmentation within 3D scenes. Objects within 3D +environments exhibit diverse shapes, scales, and colors, making precise +instance-level identification a challenging task. Recent advancements in +Open-Vocabulary scene understanding have made significant strides in this area +by employing class-agnostic 3D instance proposal networks for object +localization and learning queryable features for each 3D mask. While these +methods produce high-quality instance proposals, they struggle with identifying +small-scale and geometrically ambiguous objects. The key idea of our method is +a new module that aggregates 2D instance masks across frames and maps them to +geometrically coherent point cloud regions as high-quality object proposals +addressing the above limitations. These are then combined with 3D +class-agnostic instance proposals to include a wide range of objects in the +real world. To validate our approach, we conducted experiments on three +prominent datasets, including ScanNet200, S3DIS, and Replica, demonstrating +significant performance gains in segmenting objects with diverse categories +over the state-of-the-art approaches. + +
+
+ comment: CVPR 2024. Project page: https://open3dis.github.io/ +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 116 + +
+
+
+ + ☆ Sigma: Siamese Mamba Network for Multi-Modal Semantic Segmentation + + +
+ Multi-modal semantic segmentation significantly enhances AI agents' +perception and scene understanding, especially under adverse conditions like +low-light or overexposed environments. Leveraging additional modalities +(X-modality) like thermal and depth alongside traditional RGB provides +complementary information, enabling more robust and reliable segmentation. In +this work, we introduce Sigma, a Siamese Mamba network for multi-modal semantic +segmentation, utilizing the Selective Structured State Space Model, Mamba. +Unlike conventional methods that rely on CNNs, with their limited local +receptive fields, or Vision Transformers (ViTs), which offer global receptive +fields at the cost of quadratic complexity, our model achieves global receptive +fields coverage with linear complexity. By employing a Siamese encoder and +innovating a Mamba fusion mechanism, we effectively select essential +information from different modalities. A decoder is then developed to enhance +the channel-wise modeling ability of the model. Our method, Sigma, is +rigorously evaluated on both RGB-Thermal and RGB-Depth segmentation tasks, +demonstrating its superiority and marking the first successful application of +State Space Models (SSMs) in multi-modal perception tasks. Code is available at +https://github.com/zifuwan/Sigma. + +
+
+
+
+
+ + ☆ Watermark-based Detection and Attribution of AI-Generated Content + + +
+ Several companies--such as Google, Microsoft, and OpenAI--have deployed +techniques to watermark AI-generated content to enable proactive detection. +However, existing literature mainly focuses on user-agnostic detection. +Attribution aims to further trace back the user of a generative-AI service who +generated a given content detected as AI-generated. Despite its growing +importance, attribution is largely unexplored. In this work, we aim to bridge +this gap by providing the first systematic study on watermark-based, user-aware +detection and attribution of AI-generated content. Specifically, we +theoretically study the detection and attribution performance via rigorous +probabilistic analysis. Moreover, we develop an efficient algorithm to select +watermarks for the users to enhance attribution performance. Both our +theoretical and empirical results show that watermark-based detection and +attribution inherit the accuracy and (non-)robustness properties of the +watermarking method. + +
+
+
+
+
+ + ☆ Who Evaluates the Evaluations? Objectively Scoring Text-to-Image Prompt + Coherence Metrics with T2IScoreScore (TS2) + + +
+ With advances in the quality of text-to-image (T2I) models has come interest
+in benchmarking their prompt faithfulness, i.e., the semantic coherence of
+generated images to the prompts they were conditioned on. A variety of T2I
+faithfulness metrics have been proposed, leveraging advances in cross-modal
+embeddings and vision-language models (VLMs). However, these metrics are not
+rigorously compared and benchmarked, and are instead presented against a few
+weak baselines by correlation to human Likert scores over a set of
+easy-to-discriminate images.
+ We introduce T2IScoreScore (TS2), a curated set of semantic error graphs
+containing a prompt and a set of increasingly erroneous images. These allow us
+to rigorously judge whether a given prompt faithfulness metric can correctly
+order images with respect to their objective error count and significantly
+discriminate between different error nodes, using meta-metric scores derived
+from established statistical tests. Surprisingly, we find that the
+state-of-the-art VLM-based metrics (e.g., TIFA, DSG, LLMScore, VIEScore) we
+tested fail to significantly outperform simple feature-based metrics like
+CLIPScore, particularly on a hard subset of naturally-occurring T2I model
+errors. TS2 will enable the development of better T2I prompt faithfulness
+metrics through more rigorous comparison of their conformity to expected
+orderings and separations under objective criteria.
+
+
+ comment: 15 pages main, 9 pages appendices, 16 figures, 3 tables +
+
+
+
+
+ + ☆ Evaluating Adversarial Robustness: A Comparison Of FGSM, Carlini-Wagner + Attacks, And The Role of Distillation as Defense Mechanism + + +
+ This technical report delves into an in-depth exploration of adversarial +attacks specifically targeted at Deep Neural Networks (DNNs) utilized for image +classification. The study also investigates defense mechanisms aimed at +bolstering the robustness of machine learning models. The research focuses on +comprehending the ramifications of two prominent attack methodologies: the Fast +Gradient Sign Method (FGSM) and the Carlini-Wagner (CW) approach. These attacks +are examined concerning three pre-trained image classifiers: Resnext50_32x4d, +DenseNet-201, and VGG-19, utilizing the Tiny-ImageNet dataset. Furthermore, the +study proposes the robustness of defensive distillation as a defense mechanism +to counter FGSM and CW attacks. This defense mechanism is evaluated using the +CIFAR-10 dataset, where CNN models, specifically resnet101 and Resnext50_32x4d, +serve as the teacher and student models, respectively. The proposed defensive +distillation model exhibits effectiveness in thwarting attacks such as FGSM. +However, it is noted to remain susceptible to more sophisticated techniques +like the CW attack. The document presents a meticulous validation of the +proposed scheme. It provides detailed and comprehensive results, elucidating +the efficacy and limitations of the defense mechanisms employed. Through +rigorous experimentation and analysis, the study offers insights into the +dynamics of adversarial attacks on DNNs, as well as the effectiveness of +defensive strategies in mitigating their impact. + +
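+ The FGSM attack studied in the report has a simple one-step form; a generic
+PyTorch sketch (the model, data, and epsilon below are placeholders, not the
+report's actual setup):
+```python
+import torch
+import torch.nn.functional as F
+
+def fgsm_attack(model, images, labels, epsilon=8 / 255):
+    """Fast Gradient Sign Method: perturb inputs along the sign of the loss gradient."""
+    images = images.clone().detach().requires_grad_(True)
+    loss = F.cross_entropy(model(images), labels)
+    loss.backward()
+    adv = images + epsilon * images.grad.sign()   # one-step perturbation
+    return adv.clamp(0, 1).detach()               # keep pixels in the valid range
+
+# Toy usage with a stand-in classifier.
+model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
+x, y = torch.rand(4, 3, 32, 32), torch.randint(0, 10, (4,))
+print(fgsm_attack(model, x, y).shape)
+```
+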
+
+ comment: This report pertains to the Capstone Project done by Group 1 of the
+ Fall batch of 2023 students at Praxis Tech School, Kolkata, India. The
+ report consists of 35 pages and includes 15 figures and 10 tables. This is
+ the preprint, which will be submitted to an IEEE international conference
+ for review
+
+
+
+
+
+ + ☆ DiffOp-net: A Differential Operator-based Fully Convolutional Network + for Unsupervised Deformable Image Registration + + +
+ Existing unsupervised deformable image registration methods usually rely on +metrics applied to the gradients of predicted displacement or velocity fields +as a regularization term to ensure transformation smoothness, which potentially +limits registration accuracy. In this study, we propose a novel approach to +enhance unsupervised deformable image registration by introducing a new +differential operator into the registration framework. This operator, acting on +the velocity field and mapping it to a dual space, ensures the smoothness of +the velocity field during optimization, facilitating accurate deformable +registration. In addition, to tackle the challenge of capturing large +deformations inside image pairs, we introduce a Cross-Coordinate Attention +module (CCA) and embed it into a proposed Fully Convolutional Networks +(FCNs)-based multi-resolution registration architecture. Evaluation experiments +are conducted on two magnetic resonance imaging (MRI) datasets. Compared to +various state-of-the-art registration approaches, including a traditional +algorithm and three representative unsupervised learning-based methods, our +method achieves superior accuracies, maintaining desirable diffeomorphic +properties, and exhibiting promising registration speed. + +
+
+
+
+
+ + ☆ Identity Decoupling for Multi-Subject Personalization of Text-to-Image + Models + + +
+ Text-to-image diffusion models have shown remarkable success in generating a +personalized subject based on a few reference images. However, current methods +struggle with handling multiple subjects simultaneously, often resulting in +mixed identities with combined attributes from different subjects. In this +work, we present MuDI, a novel framework that enables multi-subject +personalization by effectively decoupling identities from multiple subjects. +Our main idea is to utilize segmented subjects generated by the Segment +Anything Model for both training and inference, as a form of data augmentation +for training and initialization for the generation process. Our experiments +demonstrate that MuDI can produce high-quality personalized images without +identity mixing, even for highly similar subjects as shown in Figure 1. In +human evaluation, MuDI shows twice as many successes for personalizing multiple +subjects without identity mixing over existing baselines and is preferred over +70% compared to the strongest baseline. More results are available at +https://mudi-t2i.github.io/. + +
+
+ comment: Preprint. Project page: https://mudi-t2i.github.io/ +
+
+
+
+
+ + ☆ Physical Property Understanding from Language-Embedded Feature Fields CVPR 2024 + + +
+ Can computers perceive the physical properties of objects solely through +vision? Research in cognitive science and vision science has shown that humans +excel at identifying materials and estimating their physical properties based +purely on visual appearance. In this paper, we present a novel approach for +dense prediction of the physical properties of objects using a collection of +images. Inspired by how humans reason about physics through vision, we leverage +large language models to propose candidate materials for each object. We then +construct a language-embedded point cloud and estimate the physical properties +of each 3D point using a zero-shot kernel regression approach. Our method is +accurate, annotation-free, and applicable to any object in the open world. +Experiments demonstrate the effectiveness of the proposed approach in various +physical property reasoning tasks, such as estimating the mass of common +objects, as well as other properties like friction and hardness. + +
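+ A toy NumPy sketch of the zero-shot kernel-regression idea described above: each 3D point's property is a similarity-weighted average over values associated with LLM-proposed candidate materials. The feature dimensions, temperature, and density values are made-up placeholders, not the paper's settings.
+
+ import numpy as np
+
+ def kernel_regress(point_feats, material_feats, material_vals, tau=0.1):
+     """Weight each candidate material by feature similarity and average
+     its property value (e.g., density) for every 3D point."""
+     # Cosine similarity between points and candidate materials.
+     p = point_feats / np.linalg.norm(point_feats, axis=1, keepdims=True)
+     m = material_feats / np.linalg.norm(material_feats, axis=1, keepdims=True)
+     sim = p @ m.T                        # (num_points, num_materials)
+     w = np.exp(sim / tau)
+     w /= w.sum(axis=1, keepdims=True)    # softmax kernel weights
+     return w @ material_vals             # per-point property estimate
+
+ points = np.random.randn(1000, 512)      # language-embedded point features
+ materials = np.random.randn(5, 512)      # embeddings of LLM-proposed materials
+ densities = np.array([0.9, 2.7, 7.8, 0.5, 1.2])   # illustrative values
+ per_point_density = kernel_regress(points, materials, densities)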
+
+ comment: CVPR 2024. Project page (with code): + https://ajzhai.github.io/NeRF2Physics/ +
+
+
+
+
+ + ☆ Image-Text Co-Decomposition for Text-Supervised Semantic Segmentation CVPR 2024 + + +
+ This paper addresses text-supervised semantic segmentation, aiming to learn a +model capable of segmenting arbitrary visual concepts within images by using +only image-text pairs without dense annotations. Existing methods have +demonstrated that contrastive learning on image-text pairs effectively aligns +visual segments with the meanings of texts. We notice that there is a +discrepancy between text alignment and semantic segmentation: A text often +consists of multiple semantic concepts, whereas semantic segmentation strives +to create semantically homogeneous segments. To address this issue, we propose +a novel framework, Image-Text Co-Decomposition (CoDe), where the paired image +and text are jointly decomposed into a set of image regions and a set of word +segments, respectively, and contrastive learning is developed to enforce +region-word alignment. To work with a vision-language model, we present a +prompt learning mechanism that derives an extra representation to highlight an +image segment or a word segment of interest, with which more effective features +can be extracted from that segment. Comprehensive experimental results +demonstrate that our method performs favorably against existing text-supervised +semantic segmentation methods on six benchmark datasets. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Robust Gaussian Splatting + + +
+ In this paper, we address common error sources for 3D Gaussian Splatting +(3DGS) including blur, imperfect camera poses, and color inconsistencies, with +the goal of improving its robustness for practical applications like +reconstructions from handheld phone captures. Our main contribution involves +modeling motion blur as a Gaussian distribution over camera poses, allowing us +to address both camera pose refinement and motion blur correction in a unified +way. Additionally, we propose mechanisms for defocus blur compensation and for +addressing color inconsistencies caused by ambient light, shadows, or +camera-related factors such as varying white-balance settings. Our proposed +solutions integrate seamlessly with the 3DGS formulation while +maintaining its benefits in terms of training efficiency and rendering speed. +We experimentally validate our contributions on relevant benchmark datasets +including Scannet++ and Deblur-NeRF, obtaining state-of-the-art results and +thus consistent improvements over relevant baselines. + +
+
+
+
+
+ + ☆ Deep-learning Segmentation of Small Volumes in CT images for + Radiotherapy Treatment Planning + + +
+ Our understanding of organs at risk is progressing to include physically small +tissues such as coronary arteries, and the radiosensitivities of many small +organs and tissues are high. Therefore, the accurate segmentation of small +volumes in external radiotherapy is crucial to protect them from +over-irradiation. Moreover, with the development of particle therapy and +on-board imaging, treatment is becoming more accurate and precise. The purpose +of this work is to optimize organ segmentation algorithms for small organs. We +used 50 three-dimensional (3-D) computed tomography (CT) head and neck images +from the StructSeg2019 challenge to develop a general-purpose V-Net model to +segment 20 organs in the head and neck region. We applied specific strategies +to improve the segmentation accuracy of the small volumes in this anatomical +region, i.e., the lens of the eye. Then, we used 17 additional head images from +OSF healthcare to validate the robustness of the V-Net model optimized for +small-volume segmentation. With the study of the StructSeg2019 images, we found +that the optimization of the image normalization range and classification +threshold yielded a segmentation improvement of the lens of the eye of +approximately 50%, compared to the use of the V-Net not optimized for small +volumes. We used the optimized model to segment 17 images acquired using +heterogeneous protocols. We obtained comparable Dice coefficient values for the +clinical and StructSeg2019 images (0.61 plus/minus 0.07 and 0.58 plus/minus +0.10 for the left and right lens of the eye, respectively). + +
+
+
+
+
+ + ☆ SCAResNet: A ResNet Variant Optimized for Tiny Object Detection in + Transmission and Distribution Towers + + +
+ Traditional deep learning-based object detection networks often resize images +during the data preprocessing stage to achieve a uniform size and scale in the +feature map. Resizing is done to facilitate model propagation and fully +connected classification. However, resizing inevitably leads to object +deformation and loss of valuable information in the images. This drawback +becomes particularly pronounced for tiny objects like distribution towers with +linear shapes and few pixels. To address this issue, we propose abandoning the +resizing operation. Instead, we introduce Positional-Encoding Multi-head +Criss-Cross Attention. This allows the model to capture contextual information +and learn from multiple representation subspaces, effectively enriching the +semantics of distribution towers. Additionally, we enhance Spatial Pyramid +Pooling by reshaping three pooled feature maps into a new unified one while +also reducing the computational burden. This approach allows images of +different sizes and scales to generate feature maps with uniform dimensions and +can be employed in feature map propagation. Our SCAResNet incorporates these +aforementioned improvements into the backbone network ResNet. We evaluated our +SCAResNet using the Electric Transmission and Distribution Infrastructure +Imagery dataset from Duke University. Without any additional tricks, we +employed various object detection models with Gaussian Receptive Field based +Label Assignment as the baseline. When incorporating the SCAResNet into the +baseline model, we achieved a 2.1% improvement in mAPs. This demonstrates the +advantages of our SCAResNet in detecting transmission and distribution towers +and its value in tiny object detection. The source code is available at +https://github.com/LisavilaLee/SCAResNet_mmdet. + +
+
+
+
+
+ + ☆ Noisy Label Processing for Classification: A Survey + + +
+ In recent years, deep neural networks (DNNs) have achieved remarkable +success in computer vision tasks, and the success of DNNs often depends +greatly on the richness of data. However, acquiring data and +high-quality ground truth requires considerable manpower and money. In the long, +tedious process of data annotation, annotators are prone to making mistakes, +resulting in incorrect labels of images, i.e., noisy labels. The emergence of +noisy labels is inevitable. Moreover, since research shows that DNNs can easily +fit noisy labels, the existence of noisy labels will cause significant damage +to the model training process. Therefore, it is crucial to combat noisy labels +for computer vision tasks, especially for classification tasks. In this survey, +we first comprehensively review the evolution of different deep learning +approaches for combating noisy labels in the image classification task. In +addition, we also review different noise patterns that have been proposed to +design robust algorithms. Furthermore, we explore the inner pattern of +real-world label noise and propose an algorithm to generate a synthetic label +noise pattern guided by real-world data. We test the algorithm on the +well-known real-world dataset CIFAR-10N to form a new real-world data-guided +synthetic benchmark and evaluate some typical noise-robust methods on the +benchmark. + +
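+ One simple way to realize a data-guided synthetic noise pattern is sketched below, under the assumption that real-world noise is summarized as a class-transition matrix (e.g., estimated from CIFAR-10N); the exact algorithm proposed in the survey may differ.
+
+ import numpy as np
+
+ def inject_label_noise(labels, transition, rng=None):
+     """Resample each label from a class-conditional transition matrix,
+     e.g. one estimated from a real noisy dataset such as CIFAR-10N."""
+     rng = rng or np.random.default_rng(0)
+     num_classes = transition.shape[0]
+     return np.array([rng.choice(num_classes, p=transition[y]) for y in labels])
+
+ # Toy 3-class transition matrix: rows are clean classes, columns noisy ones.
+ T = np.array([[0.9, 0.05, 0.05],
+               [0.1, 0.85, 0.05],
+               [0.05, 0.05, 0.9]])
+ clean = np.random.default_rng(1).integers(0, 3, size=10)
+ noisy = inject_label_noise(clean, T)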
+
+
+
+
+ + ☆ MarsSeg: Mars Surface Semantic Segmentation with Multi-level Extractor + and Connector + + +
+ The segmentation and interpretation of the Martian surface play a pivotal +role in Mars exploration, providing essential data for the trajectory planning +and obstacle avoidance of rovers. However, the complex topography, similar +surface features, and the lack of extensive annotated data pose significant +challenges to the high-precision semantic segmentation of the Martian surface. +To address these challenges, we propose a novel encoder-decoder based Mars +segmentation network, termed MarsSeg. Specifically, we employ an +encoder-decoder structure with a minimized number of down-sampling layers to +preserve local details. To facilitate a high-level semantic understanding +across the shallow multi-level feature maps, we introduce a feature enhancement +connection layer situated between the encoder and decoder. This layer +incorporates Mini Atrous Spatial Pyramid Pooling (Mini-ASPP), Polarized +Self-Attention (PSA), and Strip Pyramid Pooling Module (SPPM). The Mini-ASPP +and PSA are specifically designed for shallow feature enhancement, thereby +enabling the expression of local details and small objects. Conversely, the +SPPM is employed for deep feature enhancement, facilitating the extraction of +high-level semantic category-related information. Experimental results derived +from the Mars-Seg and AI4Mars datasets substantiate that the proposed MarsSeg +outperforms other state-of-the-art methods in segmentation performance, +validating the efficacy of each proposed component. + +
+
+
+
+
+ + ☆ Improving Detection in Aerial Images by Capturing Inter-Object + Relationships + + +
+ In many image domains, the spatial distribution of objects in a scene +exhibits meaningful patterns governed by their semantic relationships. In most +modern detection pipelines, however, the detection proposals are processed +independently, overlooking the underlying relationships between objects. In +this work, we introduce a transformer-based approach to capture these +inter-object relationships to refine classification and regression outcomes for +detected objects. Building on two-stage detectors, we tokenize the region of +interest (RoI) proposals to be processed by a transformer encoder. Specific +spatial and geometric relations are incorporated into the attention weights and +adaptively modulated and regularized. Experimental results demonstrate that the +proposed method achieves consistent performance improvement on three benchmarks +including DOTA-v1.0, DOTA-v1.5, and HRSC 2016, especially ranking first on both +DOTA-v1.5 and HRSC 2016. Specifically, our new method has an increase of 1.59 +mAP on DOTA-v1.0, 4.88 mAP on DOTA-v1.5, and 2.1 mAP on HRSC 2016, +respectively, compared to the baselines. + +
+
+
+
+
+ + ☆ 3D Facial Expressions through Analysis-by-Neural-Synthesis + + +
+ While existing methods for 3D face reconstruction from in-the-wild images +excel at recovering the overall face shape, they commonly miss subtle, extreme, +asymmetric, or rarely observed expressions. We improve upon these methods with +SMIRK (Spatial Modeling for Image-based Reconstruction of Kinesics), which +faithfully reconstructs expressive 3D faces from images. We identify two key +limitations in existing methods: shortcomings in their self-supervised training +formulation, and a lack of expression diversity in the training images. For +training, most methods employ differentiable rendering to compare a predicted +face mesh with the input image, along with a plethora of additional loss +functions. This differentiable rendering loss not only has to provide +supervision to optimize for 3D face geometry, camera, albedo, and lighting, +which is an ill-posed optimization problem, but the domain gap between +rendering and input image further hinders the learning process. Instead, SMIRK +replaces the differentiable rendering with a neural rendering module that, +given the rendered predicted mesh geometry, and sparsely sampled pixels of the +input image, generates a face image. As the neural rendering gets color +information from sampled image pixels, supervising with neural rendering-based +reconstruction loss can focus solely on the geometry. Further, it enables us to +generate images of the input identity with varying expressions while training. +These are then utilized as input to the reconstruction model and used as +supervision with ground truth geometry. This effectively augments the training +data and enhances the generalization for diverse expressions. Our qualitative, +quantitative and particularly our perceptual evaluations demonstrate that SMIRK +achieves the new state-of-the art performance on accurate expression +reconstruction. Project webpage: https://georgeretsi.github.io/smirk/. + +
+
+
+
+
+ + ☆ Dynamic Prompt Optimizing for Text-to-Image Generation CVPR 2024 + + +
+ Text-to-image generative models, specifically those based on diffusion models +like Imagen and Stable Diffusion, have made substantial advancements. Recently, +there has been a surge of interest in the delicate refinement of text prompts. +Users assign weights or alter the injection time steps of certain words in the +text prompts to improve the quality of generated images. However, the success +of fine-control prompts depends on the accuracy of the text prompts and the +careful selection of weights and time steps, which requires significant manual +intervention. To address this, we introduce the \textbf{P}rompt +\textbf{A}uto-\textbf{E}diting (PAE) method. Besides refining the original +prompts for image generation, we further employ an online reinforcement +learning strategy to explore the weights and injection time steps of each word, +leading to the dynamic fine-control prompts. The reward function during +training encourages the model to consider aesthetic score, semantic +consistency, and user preferences. Experimental results demonstrate that our +proposed method effectively improves the original prompts, generating visually +more appealing images while maintaining semantic alignment. Code is available +at https://github.com/Mowenyii/PAE. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Label Propagation for Zero-shot Classification with Vision-Language + Models CVPR 2024 + + +
+ Vision-Language Models (VLMs) have demonstrated impressive performance on +zero-shot classification, i.e. classification when provided merely with a list +of class names. In this paper, we tackle the case of zero-shot classification +in the presence of unlabeled data. We leverage the graph structure of the +unlabeled data and introduce ZLaP, a method based on label propagation (LP) +that utilizes geodesic distances for classification. We tailor LP to graphs +containing both text and image features and further propose an efficient method +for performing inductive inference based on a dual solution and a +sparsification step. We perform extensive experiments to evaluate the +effectiveness of our method on 14 common datasets and show that ZLaP +outperforms the latest related works. Code: +https://github.com/vladan-stojnic/ZLaP + +
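+ For intuition, a minimal NumPy sketch of plain label propagation over an affinity graph is shown below; ZLaP's geodesic-distance formulation, dual solution, and sparsification step are not reproduced here.
+
+ import numpy as np
+
+ def label_propagation(W, Y0, alpha=0.99, iters=50):
+     """Iterate F <- alpha * S @ F + (1 - alpha) * Y0, where S is the
+     symmetrically normalized affinity matrix of the graph."""
+     d = W.sum(axis=1)
+     d_inv_sqrt = 1.0 / np.sqrt(np.maximum(d, 1e-12))
+     S = W * d_inv_sqrt[:, None] * d_inv_sqrt[None, :]
+     F = Y0.copy()
+     for _ in range(iters):
+         F = alpha * (S @ F) + (1 - alpha) * Y0
+     return F.argmax(axis=1)
+
+ # Toy graph: 6 nodes, the first two carry (pseudo-)labels from text prototypes.
+ W = np.random.rand(6, 6); W = (W + W.T) / 2; np.fill_diagonal(W, 0)
+ Y0 = np.zeros((6, 2)); Y0[0, 0] = 1; Y0[1, 1] = 1
+ print(label_propagation(W, Y0))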
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Score identity Distillation: Exponentially Fast Distillation of + Pretrained Diffusion Models for One-Step Generation + + +
+ We introduce Score identity Distillation (SiD), an innovative data-free +method that distills the generative capabilities of pretrained diffusion models +into a single-step generator. SiD not only facilitates an exponentially fast +reduction in Fr\'echet inception distance (FID) during distillation but also +approaches or even exceeds the FID performance of the original teacher +diffusion models. By reformulating forward diffusion processes as semi-implicit +distributions, we leverage three score-related identities to create an +innovative loss mechanism. This mechanism achieves rapid FID reduction by +training the generator using its own synthesized images, eliminating the need +for real data or reverse-diffusion-based generation, all accomplished within +significantly shortened generation time. Upon evaluation across four benchmark +datasets, the SiD algorithm demonstrates high iteration efficiency during +distillation and surpasses competing distillation approaches, whether they are +one-step or few-step, data-free, or dependent on training data, in terms of +generation quality. This achievement not only redefines the benchmarks for +efficiency and effectiveness in diffusion distillation but also in the broader +field of diffusion-based generation. Our PyTorch implementation will be +publicly accessible on GitHub. + +
+
+
+
+
+ + ☆ No Time to Train: Empowering Non-Parametric Networks for Few-shot 3D + Scene Segmentation CVPR + + +
+ To reduce the reliance on large-scale datasets, recent works in 3D +segmentation resort to few-shot learning. Current 3D few-shot segmentation +methods first pre-train models on 'seen' classes, and then evaluate their +generalization performance on 'unseen' classes. However, the prior pre-training +stage not only introduces excessive time overhead but also incurs a significant +domain gap on 'unseen' classes. To tackle these issues, we propose a +Non-parametric Network for few-shot 3D Segmentation, Seg-NN, and its Parametric +variant, Seg-PN. Without training, Seg-NN extracts dense representations by +hand-crafted filters and achieves comparable performance to existing parametric +models. Due to the elimination of pre-training, Seg-NN can alleviate the domain +gap issue and save a substantial amount of time. Based on Seg-NN, Seg-PN only +requires training a lightweight QUEry-Support Transferring (QUEST) module, +which enhances the interaction between the support set and query set. +Experiments suggest that Seg-PN outperforms previous state-of-the-art method by ++4.19% and +7.71% mIoU on S3DIS and ScanNet datasets respectively, while +reducing training time by -90%, indicating its effectiveness and efficiency. + +
+
+ comment: CVPR Highlight. Code is available at + https://github.com/yangyangyang127/Seg-NN. arXiv admin note: text overlap + with arXiv:2308.12961 +
+
+
+
+
+ + ☆ Dynamic Risk Assessment Methodology with an LDM-based System for Parking + Scenarios + + +
+ This paper describes the methodology for building a dynamic risk assessment +for ADAS (Advanced Driving Assistance Systems) algorithms in parking scenarios, +fusing exterior and interior perception for a better understanding of the scene +and a more comprehensive risk estimation. This includes the definition of a +dynamic risk methodology that depends on the situation from inside and outside +the vehicle, the creation of a multi-sensor dataset of risk assessment for ADAS +benchmarking purposes, and a Local Dynamic Map (LDM) that fuses data from the +exterior and interior of the car to build an LDM-based Dynamic Risk Assessment +System (DRAS). + +
+
+
+
+
+ + ☆ InstructHumans: Editing Animated 3D Human Textures with Instructions + + +
+ We present InstructHumans, a novel framework for instruction-driven 3D human +texture editing. Existing text-based editing methods use Score Distillation +Sampling (SDS) to distill guidance from generative models. This work shows that +naively using such scores is harmful to editing as they destroy consistency +with the source avatar. Instead, we propose an alternate SDS for Editing +(SDS-E) that selectively incorporates subterms of SDS across diffusion +timesteps. We further enhance SDS-E with spatial smoothness regularization and +gradient-based viewpoint sampling to achieve high-quality edits with sharp and +high-fidelity detailing. InstructHumans significantly outperforms existing 3D +editing methods, producing edits that remain consistent with the initial avatar +while staying faithful to the textual instructions. Project page: +https://jyzhu.top/instruct-humans. + +
+
+ comment: Project Page: https://jyzhu.top/instruct-humans +
+
+
+
+
+ + ☆ MM-Gaussian: 3D Gaussian-based Multi-modal Fusion for Localization and + Reconstruction in Unbounded Scenes + + +
+ Localization and mapping are critical tasks for various applications such as +autonomous vehicles and robotics. The challenges posed by outdoor environments +present particular complexities due to their unbounded characteristics. In this +work, we present MM-Gaussian, a LiDAR-camera multi-modal fusion system for +localization and mapping in unbounded scenes. Our approach is inspired by the +recently developed 3D Gaussians, which demonstrate remarkable capabilities in +achieving high rendering quality and fast rendering speed. Specifically, our +system fully utilizes the geometric structure information provided by +solid-state LiDAR to address the problem of inaccurate depth encountered when +relying solely on visual solutions in unbounded, outdoor scenarios. +Additionally, we utilize 3D Gaussian point clouds, with the assistance of +pixel-level gradient descent, to fully exploit the color information in photos, +thereby achieving realistic rendering effects. To further bolster the +robustness of our system, we designed a relocalization module, which assists in +returning to the correct trajectory in the event of a localization failure. +Experiments conducted in multiple scenarios demonstrate the effectiveness of +our method. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ Framework to generate perfusion map from CT and CTA images in patients + with acute ischemic stroke: A longitudinal and cross-sectional study MICCAI 2023 + + +
+ Stroke is a leading cause of disability and death. Effective treatment +decisions require early and informative vascular imaging. 4D perfusion imaging +is ideal but rarely available within the first hour after stroke, whereas plain +CT and CTA usually are. Hence, we propose a framework to extract a predicted +perfusion map (PPM) derived from CT and CTA images. In all eighteen patients, +we found significantly high spatial similarity (with average Spearman's +correlation = 0.7893) between our predicted perfusion map (PPM) and the T-max +map derived from 4D-CTP. Voxelwise correlations between the PPM and National +Institutes of Health Stroke Scale (NIHSS) subscores for L/R hand motor, gaze, +and language on a large cohort of 2,110 subjects reliably mapped symptoms to +expected infarct locations. Therefore our PPM could serve as an alternative for +4D perfusion imaging, if the latter is unavailable, to investigate blood +perfusion in the first hours after hospital admission. + +
+
+ comment: Accepted and presented in SWITCH2023: Stroke Workshop on Imaging and + Treatment CHallenges (MICCAI 2023, Vancouver Canada) +
+
+
+
+
+ + ☆ Neural-Symbolic VideoQA: Learning Compositional Spatio-Temporal + Reasoning for Real-world Video Question Answering + + +
+ Compositional spatio-temporal reasoning poses a significant challenge in the +field of video question answering (VideoQA). Existing approaches struggle to +establish effective symbolic reasoning structures, which are crucial for +answering compositional spatio-temporal questions. To address this challenge, +we propose a neural-symbolic framework called Neural-Symbolic VideoQA +(NS-VideoQA), specifically designed for real-world VideoQA tasks. The +uniqueness and superiority of NS-VideoQA are two-fold: 1) It proposes a Scene +Parser Network (SPN) to transform static-dynamic video scenes into Symbolic +Representation (SR), structuralizing persons, objects, relations, and action +chronologies. 2) A Symbolic Reasoning Machine (SRM) is designed for top-down +question decompositions and bottom-up compositional reasonings. Specifically, a +polymorphic program executor is constructed for internally consistent reasoning +from SR to the final answer. As a result, Our NS-VideoQA not only improves the +compositional spatio-temporal reasoning in real-world VideoQA task, but also +enables step-by-step error analysis by tracing the intermediate results. +Experimental evaluations on the AGQA Decomp benchmark demonstrate the +effectiveness of the proposed NS-VideoQA framework. Empirical studies further +confirm that NS-VideoQA exhibits internal consistency in answering +compositional questions and significantly improves the capability of +spatio-temporal and logical inference for VideoQA tasks. + +
+
+
+
+
+ + ☆ Finsler-Laplace-Beltrami Operators with Application to Shape Analysis + + +
+ The Laplace-Beltrami operator (LBO) emerges from studying manifolds equipped +with a Riemannian metric. It is often called the Swiss army knife of geometry +processing as it allows to capture intrinsic shape information and gives rise +to heat diffusion, geodesic distances, and a multitude of shape descriptors. It +also plays a central role in geometric deep learning. In this work, we explore +Finsler manifolds as a generalization of Riemannian manifolds. We revisit the +Finsler heat equation and derive a Finsler heat kernel and a +Finsler-Laplace-Beltrami Operator (FLBO): a novel theoretically justified +anisotropic Laplace-Beltrami operator (ALBO). In experimental evaluations we +demonstrate that the proposed FLBO is a valuable alternative to the traditional +Riemannian-based LBO and ALBOs for spatial filtering and shape correspondence +estimation. We hope that the proposed Finsler heat kernel and the FLBO will +inspire further exploration of Finsler geometry in the computer vision +community. + +
+
+
+
+
+ + ☆ Physics-Inspired Synthesized Underwater Image Dataset + + +
+ This paper introduces the physics-inspired synthesized underwater image +dataset (PHISWID), a dataset tailored for enhancing underwater image processing +through physics-inspired image synthesis. Deep learning approaches to +underwater image enhancement typically demand extensive datasets, yet acquiring +paired clean and degraded underwater ones poses significant challenges. While +several underwater image datasets have been proposed using physics-based +synthesis, a publicly accessible collection has been lacking. Additionally, +most underwater image synthesis approaches do not intend to reproduce +atmospheric scenes, resulting in incomplete enhancement. PHISWID addresses this +gap by offering a set of paired ground-truth (atmospheric) and synthetically +degraded underwater images, showcasing not only color degradation but also the +often-neglected effects of marine snow, a composite of organic matter and sand +particles that considerably impairs underwater image clarity. The dataset +applies these degradations to atmospheric RGB-D images, enhancing the dataset's +realism and applicability. PHISWID is particularly valuable for training deep +neural networks in a supervised learning setting and for objectively assessing +image quality in benchmark analyses. Our results reveal that even a basic U-Net +architecture, when trained with PHISWID, substantially outperforms existing +methods in underwater image enhancement. We intend to release PHISWID publicly, +contributing a significant resource to the advancement of underwater imaging +technology. + +
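+ Physics-based underwater synthesis of this kind typically builds on a wavelength-dependent attenuation-plus-backscatter image formation model; a toy NumPy version is sketched below with made-up coefficients, and it omits the marine-snow component that PHISWID additionally models.
+
+ import numpy as np
+
+ def degrade_underwater(rgb, depth, beta=(0.8, 0.35, 0.2), backlight=(0.1, 0.35, 0.45)):
+     """Apply the model I_c = J_c * exp(-beta_c * d) + B_c * (1 - exp(-beta_c * d))
+     per color channel c, given an RGB image and a depth map."""
+     beta = np.asarray(beta).reshape(1, 1, 3)
+     B = np.asarray(backlight).reshape(1, 1, 3)
+     t = np.exp(-beta * depth[..., None])        # per-channel transmission
+     return rgb * t + B * (1.0 - t)
+
+ rgb = np.random.rand(64, 64, 3)                 # atmospheric RGB image in [0, 1]
+ depth = np.random.rand(64, 64) * 10.0           # depth map in metres
+ underwater = degrade_underwater(rgb, depth)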
+
+
+
+
+ + ☆ Rolling the dice for better deep learning performance: A study of + randomness techniques in deep neural networks + + +
+ This paper investigates how various randomization techniques impact Deep +Neural Networks (DNNs). Randomization, like weight noise and dropout, aids in +reducing overfitting and enhancing generalization, but their interactions are +poorly understood. The study categorizes randomness techniques into four types +and proposes new methods: adding noise to the loss function and random masking +of gradient updates. Using Particle Swarm Optimizer (PSO) for hyperparameter +optimization, it explores optimal configurations across MNIST, FASHION-MNIST, +CIFAR10, and CIFAR100 datasets. Over 30,000 configurations are evaluated, +revealing data augmentation and weight initialization randomness as main +performance contributors. Correlation analysis shows different optimizers +prefer distinct randomization types. The complete implementation and dataset +are available on GitHub. + +
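+ The two newly proposed techniques are described only at a high level, so the PyTorch sketch below shows one plausible reading: a multiplicative perturbation of the loss and a Bernoulli mask applied to gradient updates, with all hyperparameters chosen arbitrarily.
+
+ import torch
+ import torch.nn as nn
+
+ model = nn.Linear(10, 2)
+ opt = torch.optim.SGD(model.parameters(), lr=0.1)
+ x, y = torch.randn(32, 10), torch.randint(0, 2, (32,))
+
+ loss = nn.functional.cross_entropy(model(x), y)
+ loss = loss * (1.0 + 0.01 * torch.randn(()))   # randomly perturb the loss (and hence its gradients)
+ loss.backward()
+ with torch.no_grad():
+     for p in model.parameters():
+         # Randomly drop a fraction of the gradient entries before the update.
+         p.grad *= torch.bernoulli(torch.full_like(p.grad, 0.9))
+ opt.step()
+ opt.zero_grad()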
+
+
+
+
+ + ☆ Towards Efficient and Accurate CT Segmentation via Edge-Preserving + Probabilistic Downsampling + + +
+ Downsampling images and labels, often necessitated by limited resources or to +expedite network training, leads to the loss of small objects and thin +boundaries. This undermines the segmentation network's capacity to interpret +images accurately and predict detailed labels, resulting in diminished +performance compared to processing at original resolutions. This situation +exemplifies the trade-off between efficiency and accuracy, with higher +downsampling factors further impairing segmentation outcomes. Preserving +information during downsampling is especially critical for medical image +segmentation tasks. To tackle this challenge, we introduce a novel method named +Edge-preserving Probabilistic Downsampling (EPD). It utilizes class uncertainty +within a local window to produce soft labels, with the window size dictating +the downsampling factor. This enables a network to produce quality predictions +at low resolutions. Beyond preserving edge details more effectively than +conventional nearest-neighbor downsampling of labels, applying a similar +algorithm to the images surpasses bilinear interpolation, enhancing +overall performance. Our method significantly improved Intersection over Union +(IoU) by 2.85%, 8.65%, and 11.89% when downsampling data to 1/2, 1/4, and 1/8, +respectively, compared to conventional interpolation methods. + +
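+ The core idea of window-wise soft labels can be approximated in a few lines of PyTorch: one-hot encode the label map and average-pool it with the downsampling factor as the window size. This is only a simplified stand-in for EPD, which additionally handles edge preservation and an image-side variant.
+
+ import torch
+ import torch.nn.functional as F
+
+ def soft_label_downsample(labels, factor, num_classes):
+     """Turn a hard label map into per-window class frequencies, so each
+     factor-sized window yields one soft label instead of a single nearest pixel."""
+     onehot = F.one_hot(labels, num_classes).permute(0, 3, 1, 2).float()
+     return F.avg_pool2d(onehot, kernel_size=factor)   # (B, C, H/f, W/f)
+
+ labels = torch.randint(0, 4, (1, 64, 64))             # toy segmentation mask
+ soft = soft_label_downsample(labels, factor=4, num_classes=4)
+ print(soft.shape, soft.sum(dim=1).unique())           # class frequencies sum to 1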
+
+ comment: 5 pages (4 figures, 1 table); This work has been submitted to the + IEEE Signal Processing Letters. Copyright may be transferred without notice, + after which this version may no longer be accessible +
+
+
+
+
+ + ☆ RaSim: A Range-aware High-fidelity RGB-D Data Simulation Pipeline for + Real-world Applications ICRA'24 + + +
+ In robotic vision, a de-facto paradigm is to learn in simulated environments +and then transfer to real-world applications, which poses an essential +challenge in bridging the sim-to-real domain gap. While mainstream works tackle +this problem in the RGB domain, we focus on depth data synthesis and develop a +range-aware RGB-D data simulation pipeline (RaSim). In particular, +high-fidelity depth data is generated by imitating the imaging principle of +real-world sensors. A range-aware rendering strategy is further introduced to +enrich data diversity. Extensive experiments show that models trained with +RaSim can be directly applied to real-world scenarios without any finetuning +and excel at downstream RGB-D perception tasks. + +
+
+ comment: accepted by ICRA'24 +
+
+
+
+
+ + ☆ Deep Learning for Satellite Image Time Series Analysis: A Review + + +
+ Earth observation (EO) satellite missions have been providing detailed images +about the state of the Earth and its land cover for over 50 years. Long term +missions, such as NASA's Landsat, Terra, and Aqua satellites, and more +recently, the ESA's Sentinel missions, record images of the entire world every +few days. Although single images provide point-in-time data, repeated images of +the same area, or satellite image time series (SITS) provide information about +the changing state of vegetation and land use. These SITS are useful for +modeling dynamic processes and seasonal changes such as plant phenology. They +have potential benefits for many aspects of land and natural resource +management, including applications in agricultural, forest, water, and disaster +management, urban planning, and mining. However, the resulting satellite image +time series (SITS) are complex, incorporating information from the temporal, +spatial, and spectral dimensions. Therefore, deep learning methods are often +deployed as they can analyze these complex relationships. This review presents +a summary of the state-of-the-art methods of modelling environmental, +agricultural, and other Earth observation variables from SITS data using deep +learning methods. We aim to provide a resource for remote sensing experts +interested in using deep learning techniques to enhance Earth observation +models with temporal information. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Real-GDSR: Real-World Guided DSM Super-Resolution via Edge-Enhancing + Residual Network SP + + +
+ A low-resolution digital surface model (DSM) features distinctive attributes +impacted by noise, sensor limitations and data acquisition conditions, which +cannot be replicated using simple interpolation methods such as bicubic. As a +result, super-resolution models trained on synthetic data do not perform +effectively on real ones. Training a model on pairs of real low- and +high-resolution DSMs is also a challenge because of the lack of information. On the other +hand, the existence of other imaging modalities of the same scene can be used +to enrich the information needed for large-scale super-resolution. In this +work, we introduce a novel methodology to address the intricacies of real-world +DSM super-resolution, named REAL-GDSR, breaking down this ill-posed problem +into two steps. The first step involves the utilization of a residual local +refinement network. This strategic approach departs from conventional methods +that are trained to directly predict height values rather than the differences +(residuals) and that utilize large receptive fields in their networks. The second +step introduces a diffusion-based technique that enhances the results on a +global scale, with a primary focus on smoothing and edge preservation. Our +experiments underscore the effectiveness of the proposed method. We conduct a +comprehensive evaluation, comparing it to recent state-of-the-art techniques in +the domain of real-world DSM super-resolution (SR). Our approach consistently +outperforms these existing methods, as evidenced through qualitative and +quantitative assessments. + +
+
+ comment: Accepted for publication in the ISPRS Annals of Photogrammetry, + Remote Sensing, and Spatial Information Sciences +
+
+
+
+
+ + ☆ LightOctree: Lightweight 3D Spatially-Coherent Indoor Lighting + Estimation + + +
+ We present a lightweight solution for estimating spatially-coherent indoor +lighting from a single RGB image. Previous methods for estimating illumination +using volumetric representations have overlooked the sparse distribution of +light sources in space, necessitating substantial memory and computational +resources for achieving high-quality results. We introduce a unified, voxel +octree-based illumination estimation framework to produce 3D spatially-coherent +lighting. Additionally, a differentiable voxel octree cone tracing rendering +layer is proposed to eliminate regular volumetric representation throughout the +entire process and ensure the retention of features across different frequency +domains. This reduction significantly decreases spatial usage and required +floating-point operations without substantially compromising precision. +Experimental results demonstrate that our approach achieves high-quality +coherent estimation with minimal cost compared to previous methods. + +
+
+
+
+
+ + ☆ Learning Correlation Structures for Vision Transformers CVPR 2024 + + +
+ We introduce a new attention mechanism, dubbed structural self-attention +(StructSA), that leverages rich correlation patterns naturally emerging in +key-query interactions of attention. StructSA generates attention maps by +recognizing space-time structures of key-query correlations via convolution and +uses them to dynamically aggregate local contexts of value features. This +effectively leverages rich structural patterns in images and videos such as +scene layouts, object motion, and inter-object relations. Using StructSA as a +main building block, we develop the structural vision transformer (StructViT) +and evaluate its effectiveness on both image and video classification tasks, +achieving state-of-the-art results on ImageNet-1K, Kinetics-400, +Something-Something V1 & V2, Diving-48, and FineGym. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Concept Weaver: Enabling Multi-Concept Fusion in Text-to-Image Models CVPR 2024 + + +
+ While there has been significant progress in customizing text-to-image +generation models, generating images that combine multiple personalized +concepts remains challenging. In this work, we introduce Concept Weaver, a +method for composing customized text-to-image diffusion models at inference +time. Specifically, the method breaks the process into two steps: creating a +template image aligned with the semantics of input prompts, and then +personalizing the template using a concept fusion strategy. The fusion strategy +incorporates the appearance of the target concepts into the template image +while retaining its structural details. The results indicate that our method +can generate multiple custom concepts with higher identity fidelity compared to +alternative approaches. Furthermore, the method is shown to seamlessly handle +more than two concepts and closely follow the semantic meaning of the input +prompt without blending appearances across different subjects. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Deep Phase Coded Image Prior + + +
+ Phase-coded imaging is a computational imaging method designed to tackle +tasks such as passive depth estimation and extended depth of field (EDOF) using +depth cues inserted during image capture. Most of the current deep +learning-based methods for depth estimation or all-in-focus imaging require a +training dataset with high-quality depth maps and an optimal focus point at +infinity for all-in-focus images. Such datasets are difficult to create, +usually synthetic, and require external graphic programs. We propose a new +method named "Deep Phase Coded Image Prior" (DPCIP) for jointly recovering the +depth map and all-in-focus image from a coded-phase image using solely the +captured image and the optical information of the imaging system. Our approach +does not depend on any specific dataset and surpasses prior supervised +techniques utilizing the same imaging system. This improvement is achieved +through the utilization of a problem formulation based on implicit neural +representation (INR) and deep image prior (DIP). Due to our zero-shot method, +we overcome the barrier of acquiring accurate ground-truth data of depth maps +and all-in-focus images for each new phase-coded system introduced. This allows +focusing mainly on developing the imaging system, and not on ground-truth data +collection. + +
+
+
+
+
+ + ☆ VoltaVision: A Transfer Learning model for electronic component + classification ICLR 2024 + + +
+ In this paper, we analyze the effectiveness of transfer learning on +classifying electronic components. Transfer learning reuses pre-trained models +to save time and resources in building a robust classifier rather than learning +from scratch. Our work introduces a lightweight CNN, coined as VoltaVision, and +compares its performance against more complex models. We test the hypothesis +that transferring knowledge from a similar task to our target domain yields +better results than state-of-the-art models trained on general datasets. Our +dataset and code for this work are available at +https://github.com/AnasIshfaque/VoltaVision. + +
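+ The generic transfer-learning recipe being compared here looks roughly like the PyTorch snippet below (freeze a pretrained backbone and retrain only the head); the backbone, class count, and optimizer are illustrative and do not describe VoltaVision's own architecture.
+
+ import torch
+ import torch.nn as nn
+ from torchvision import models
+
+ model = models.resnet18(weights=None)          # load ImageNet weights in practice
+ for p in model.parameters():
+     p.requires_grad = False                    # freeze the pretrained backbone
+ num_component_classes = 4                      # e.g. resistor, capacitor, LED, IC
+ model.fc = nn.Linear(model.fc.in_features, num_component_classes)
+ trainable = [p for p in model.parameters() if p.requires_grad]
+ optimizer = torch.optim.Adam(trainable, lr=1e-3)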
+
+ comment: Tiny Paper at ICLR 2024 +
+
+
+
+
+ + ☆ Enhancing Breast Cancer Diagnosis in Mammography: Evaluation and + Integration of Convolutional Neural Networks and Explainable AI + + +
+ The study introduces an integrated framework combining Convolutional Neural +Networks (CNNs) and Explainable Artificial Intelligence (XAI) for the enhanced +diagnosis of breast cancer using the CBIS-DDSM dataset. Utilizing a fine-tuned +ResNet50 architecture, our investigation not only provides effective +differentiation of mammographic images into benign and malignant categories but +also addresses the opaque "black-box" nature of deep learning models by +employing XAI methodologies, namely Grad-CAM, LIME, and SHAP, to interpret CNN +decision-making processes for healthcare professionals. Our methodology +encompasses an elaborate data preprocessing pipeline and advanced data +augmentation techniques to counteract dataset limitations, and transfer +learning using pre-trained networks, such as VGG-16, DenseNet and ResNet was +employed. A focal point of our study is the evaluation of XAI's effectiveness +in interpreting model predictions, highlighted by utilising the Hausdorff +measure to assess the alignment between AI-generated explanations and expert +annotations quantitatively. This approach plays a critical role for XAI in +promoting trustworthiness and ethical fairness in AI-assisted diagnostics. The +findings from our research illustrate the effective collaboration between CNNs +and XAI in advancing diagnostic methods for breast cancer, thereby facilitating +a more seamless integration of advanced AI technologies within clinical +settings. By enhancing the interpretability of AI-driven decisions, this work +lays the groundwork for improved collaboration between AI systems and medical +practitioners, ultimately enriching patient care. Furthermore, the implications +of our research extend well beyond the current methodologies, advocating for +subsequent inquiries into the integration of multimodal data and the refinement +of AI explanations to satisfy the needs of clinical practice. + +
+
+
+
+
+ + ☆ LiDAR-Guided Cross-Attention Fusion for Hyperspectral Band Selection and + Image Classification + + +
+ The fusion of hyperspectral and LiDAR data has been an active research topic. +Existing fusion methods have ignored the high-dimensionality and redundancy +challenges in hyperspectral images, despite that band selection methods have +been intensively studied for hyperspectral image (HSI) processing. This paper +addresses this significant gap by introducing a cross-attention mechanism from +the transformer architecture for the selection of HSI bands guided by LiDAR +data. LiDAR provides high-resolution vertical structural information, which can +be useful in distinguishing different types of land cover that may have similar +spectral signatures but different structural profiles. In our approach, the +LiDAR data are used as the "query" to search and identify the "key" from the +HSI to choose the most pertinent bands for LiDAR. This method ensures that the +selected HSI bands drastically reduce redundancy and computational requirements +while working optimally with the LiDAR data. Extensive experiments have been +undertaken on three paired HSI and LiDAR data sets: Houston 2013, Trento and +MUUFL. The results highlight the superiority of the cross-attention mechanism, +underlining the enhanced classification accuracy of the identified HSI bands +when fused with the LiDAR features. The results also show that the use of fewer +bands combined with LiDAR surpasses the performance of state-of-the-art fusion +models. + +
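+ The query/key roles described above can be illustrated with a small PyTorch sketch in which averaged attention weights act as per-band relevance scores; the feature dimensions, band count, and top-k rule are placeholders rather than the paper's configuration.
+
+ import torch
+ import torch.nn.functional as F
+
+ def rank_bands(lidar_feat, band_feats):
+     """Scaled dot-product attention with LiDAR features as the query and
+     per-band HSI features as the keys; averaged weights score each band."""
+     d = band_feats.shape[-1]
+     scores = lidar_feat @ band_feats.transpose(-2, -1) / d ** 0.5
+     attn = F.softmax(scores, dim=-1)            # (num_pixels, num_bands)
+     return attn.mean(dim=0)                     # relevance of each band
+
+ lidar = torch.randn(1024, 64)                   # per-pixel LiDAR embeddings
+ bands = torch.randn(144, 64)                    # one embedding per HSI band
+ relevance = rank_bands(lidar, bands)
+ selected = relevance.topk(30).indices           # keep the 30 most relevant bands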
+
+ comment: 15 pages, 13 figures +
+
+
+
+
+ + ☆ Increasing Fairness in Classification of Out of Distribution Data for + Facial Recognition + + +
+ Standard classification theory assumes that the distributions of images in the +test and training sets are identical. Unfortunately, real-life scenarios +typically feature unseen data ("out-of-distribution data") which is different +from data in the training distribution ("in-distribution"). This issue is most +prevalent in social justice problems where data from under-represented groups +may appear in the test data without representing an equal proportion of the +training data. This may result in a model returning confidently wrong decisions +and predictions. We are interested in the following question: Can the +performance of a neural network improve on facial images of out-of-distribution +data when it is trained simultaneously on multiple datasets of in-distribution +data? We approach this problem by incorporating the Outlier Exposure model and +investigate how the model's performance changes when additional datasets of facial +images are incorporated. We observe that the accuracy and other metrics of the +model can be increased by applying Outlier Exposure, incorporating a trainable +weight parameter to increase the machine's emphasis on outlier images, and by +re-weighting the importance of different class labels. We also experimented +with whether sorting the images and determining outliers via image features +would have more of an effect on the metrics than sorting by average pixel +value. Our goal was to make models not only more accurate but also more fair by +covering a broader range of images. We also tested the datasets in +reverse order to see whether a more fair dataset with balanced features has an +effect on the model's accuracy. + +
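+ The standard Outlier Exposure objective referred to above combines the usual cross-entropy with a term that pushes outlier predictions towards the uniform distribution; a minimal PyTorch version is below, with a fixed weight standing in for the trainable weight the study describes.
+
+ import torch
+ import torch.nn.functional as F
+
+ def outlier_exposure_loss(logits_in, labels_in, logits_out, lam=0.5):
+     """Cross-entropy on in-distribution data plus a term driving outlier
+     predictions towards the uniform distribution."""
+     ce = F.cross_entropy(logits_in, labels_in)
+     uniform_ce = -F.log_softmax(logits_out, dim=1).mean()
+     return ce + lam * uniform_ce
+
+ logits_in = torch.randn(8, 10, requires_grad=True)
+ logits_out = torch.randn(8, 10, requires_grad=True)
+ labels_in = torch.randint(0, 10, (8,))
+ loss = outlier_exposure_loss(logits_in, labels_in, logits_out)
+ loss.backward()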
+
+ comment: 18 pages, 6 tables, 6 figures +
+
+
+
+
+ + ☆ Mitigating Heterogeneity in Federated Multimodal Learning with + Biomedical Vision-Language Pre-training + + +
+ Vision-language pre-training (VLP) has arisen as an efficient scheme for +multimodal representation learning, but it requires large-scale multimodal data +for pre-training, which is an obstacle especially for biomedical applications. +To overcome the data limitation, federated learning (FL) can be a promising +strategy to scale up the dataset for biomedical VLP while protecting data +privacy. However, client data are often heterogeneous in real-world scenarios, +and we observe that local training on heterogeneous client data would distort +the multimodal representation learning and lead to biased cross-modal +alignment. To address this challenge, we propose the Federated distributional +Robust Guidance-Based (FedRGB) learning framework for federated VLP with +robustness to data heterogeneity. Specifically, we utilize a guidance-based +local training scheme to reduce feature distortions, and employ a +distribution-based min-max optimization to learn unbiased cross-modal +alignment. The experiments on real-world datasets show our method successfully +promotes efficient federated multimodal learning for biomedical VLP with data +heterogeneity. + +
+
+
+
+
+ + ☆ Vision Transformers in Domain Adaptation and Generalization: A Study of + Robustness + + +
+ Deep learning models are often evaluated in scenarios where the data +distribution is different from those used in the training and validation +phases. The discrepancy presents a challenge for accurately predicting the +performance of models once deployed on the target distribution. Domain +adaptation and generalization are widely recognized as effective strategies for +addressing such shifts, thereby ensuring reliable performance. The recent +promising results in applying vision transformers in computer vision tasks, +coupled with advancements in self-attention mechanisms, have demonstrated their +significant potential for robustness and generalization in handling +distribution shifts. Motivated by the increased interest from the research +community, our paper investigates the deployment of vision transformers in +domain adaptation and domain generalization scenarios. For domain adaptation +methods, we categorize research into feature-level, instance-level, model-level +adaptations, and hybrid approaches, along with other categorizations with +respect to diverse strategies for enhancing domain adaptation. Similarly, for +domain generalization, we categorize research into multi-domain learning, +meta-learning, regularization techniques, and data augmentation strategies. We +further classify diverse strategies in research, underscoring the various +approaches researchers have taken to address distribution shifts by integrating +vision transformers. The inclusion of comprehensive tables summarizing these +categories is a distinct feature of our work, offering valuable insights for +researchers. These findings highlight the versatility of vision transformers in +managing distribution shifts, crucial for real-world applications, especially +in critical safety and decision-making scenarios. + +
+
+ comment: 28 pages, 5 figures, Preprint submitted to Elsevier +
+
+
+
+
+ + ☆ Robust Few-Shot Ensemble Learning with Focal Diversity-Based Pruning + + +
+ This paper presents FusionShot, a focal diversity optimized few-shot ensemble +learning approach for boosting the robustness and generalization performance of +pre-trained few-shot models. The paper makes three original contributions. +First, we explore the unique characteristics of few-shot learning to ensemble +multiple few-shot (FS) models by creating three alternative fusion channels. +Second, we introduce the concept of focal error diversity to learn the most +efficient ensemble teaming strategy, rather than assuming that an ensemble of a +larger number of base models will outperform those sub-ensembles of smaller +size. We develop a focal-diversity ensemble pruning method to effectively prune +out the candidate ensembles with low ensemble error diversity and recommend +top-$K$ FS ensembles with the highest focal error diversity. Finally, we +capture the complex non-linear patterns of ensemble few-shot predictions by +designing the learn-to-combine algorithm, which can learn the diverse weight +assignments for robust ensemble fusion over different member models. Extensive +experiments on representative few-shot benchmarks show that the top-K ensembles +recommended by FusionShot can outperform the representative SOTA few-shot +models on novel tasks (different distributions and unknown at training), and +can prevail over existing few-shot learners in both cross-domain settings and +adversarial settings. For reproducibility purposes, FusionShot trained models, +results, and code are made available at https://github.com/sftekin/fusionshot + +
+
+
+
+
+ + ☆ PhysPT: Physics-aware Pretrained Transformer for Estimating Human + Dynamics from Monocular Videos + + +
+ While current methods have shown promising progress on estimating 3D human +motion from monocular videos, their motion estimates are often physically +unrealistic because they mainly consider kinematics. In this paper, we +introduce Physics-aware Pretrained Transformer (PhysPT), which improves +kinematics-based motion estimates and infers motion forces. PhysPT exploits a +Transformer encoder-decoder backbone to effectively learn human dynamics in a +self-supervised manner. Moreover, it incorporates physics principles governing +human motion. Specifically, we build a physics-based body representation and +contact force model. We leverage them to impose novel physics-inspired training +losses (i.e., force loss, contact loss, and Euler-Lagrange loss), enabling +PhysPT to capture physical properties of the human body and the forces it +experiences. Experiments demonstrate that, once trained, PhysPT can be directly +applied to kinematics-based estimates to significantly enhance their physical +plausibility and generate favourable motion forces. Furthermore, we show that +these physically meaningful quantities translate into improved accuracy of an +important downstream task: human action recognition. + +
+
+
+
+
+ + ☆ PhysAvatar: Learning the Physics of Dressed 3D Avatars from Visual + Observations + + +
+ Modeling and rendering photorealistic avatars is of crucial importance in +many applications. Existing methods that build a 3D avatar from visual +observations, however, struggle to reconstruct clothed humans. We introduce +PhysAvatar, a novel framework that combines inverse rendering with inverse +physics to automatically estimate the shape and appearance of a human from +multi-view video data along with the physical parameters of the fabric of their +clothes. For this purpose, we adopt a mesh-aligned 4D Gaussian technique for +spatio-temporal mesh tracking as well as a physically based inverse renderer to +estimate the intrinsic material properties. PhysAvatar integrates a physics +simulator to estimate the physical parameters of the garments using +gradient-based optimization in a principled manner. These novel capabilities +enable PhysAvatar to create high-quality novel-view renderings of avatars +dressed in loose-fitting clothes under motions and lighting conditions not seen +in the training data. This marks a significant advancement towards modeling +photorealistic digital humans using physically based inverse rendering with +physics in the loop. Our project website is at: +https://qingqing-zhao.github.io/PhysAvatar + +
+
+ comment: Yang Zheng and Qingqing Zhao are project co-leads +
+
+
+
+
+ + ☆ Analyzing Participants' Engagement during Online Meetings Using + Unsupervised Remote Photoplethysmography with Behavioral Features + + +
+ Engagement measurement finds application in healthcare, education, +advertisement, and services. The use of physiological and behavioral features +is viable, but the impracticality of traditional physiological measurement +arises due to the need for contact sensors. We demonstrate the feasibility of +unsupervised remote photoplethysmography (rPPG) as an alternative for contact +sensors in deriving heart rate variability (HRV) features, then fusing these +with behavioral features to measure engagement in online group meetings. +Firstly, a unique Engagement Dataset of online interactions among social +workers is collected with granular engagement labels, offering insight into +virtual meeting dynamics. Secondly, a pre-trained rPPG model is customized to +reconstruct accurate rPPG signals from video meetings in an unsupervised +manner, enabling the calculation of HRV features. Thirdly, the feasibility of +estimating engagement from HRV features using short observation windows, with a +notable enhancement when using longer observation windows of two to four +minutes, is demonstrated. Fourthly, the effectiveness of behavioral cues is +evaluated and fused with physiological data, which further enhances engagement +estimation performance. An accuracy of 94% is achieved when only HRV features +are used, eliminating the need for contact sensors or ground truth signals. The +incorporation of behavioral cues raises the accuracy to 96%. Facial video +analysis offers precise engagement measurement, beneficial for future +applications. + +
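+ Once beat times are recovered from the rPPG signal, common time-domain HRV features can be computed as in the NumPy sketch below; the study's exact feature set is not spelled out here, so these are standard examples (mean heart rate, SDNN, RMSSD, pNN50).
+
+ import numpy as np
+
+ def hrv_features(peak_times_s):
+     """Basic time-domain HRV features from beat times (in seconds),
+     e.g. the peaks of an rPPG signal."""
+     ibi = np.diff(peak_times_s) * 1000.0        # inter-beat intervals in ms
+     diffs = np.diff(ibi)
+     return {
+         "mean_hr_bpm": 60000.0 / ibi.mean(),
+         "sdnn_ms": ibi.std(ddof=1),
+         "rmssd_ms": np.sqrt(np.mean(diffs ** 2)),
+         "pnn50": np.mean(np.abs(diffs) > 50.0),
+     }
+
+ beats = np.cumsum(np.random.normal(0.85, 0.05, size=120))   # ~70 bpm toy signal
+ print(hrv_features(beats))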
+
+
+
+
+ + ☆ LOSS-SLAM: Lightweight Open-Set Semantic Simultaneous Localization and + Mapping + + +
+ Enabling robots to understand the world in terms of objects is a critical +building block towards higher level autonomy. The success of foundation models +in vision has created the ability to segment and identify nearly all objects in +the world. However, utilizing such objects to localize the robot and build an +open-set semantic map of the world remains an open research question. In this +work, a system of identifying, localizing, and encoding objects is tightly +coupled with probabilistic graphical models for performing open-set semantic +simultaneous localization and mapping (SLAM). Results are presented +demonstrating that the proposed lightweight object encoding can be used to +perform more accurate object-based SLAM than existing open-set methods, +closed-set methods, and geometric methods while incurring a lower computational +overhead than existing open-set mapping methods. + +
+
+
+
+
+ + ☆ ClickDiffusion: Harnessing LLMs for Interactive Precise Image Editing + + +
+ Recently, researchers have proposed powerful systems for generating and +manipulating images using natural language instructions. However, it is +difficult to precisely specify many common classes of image transformations +with text alone. For example, a user may wish to change the location and breed +of a particular dog in an image with several similar dogs. This task is quite +difficult with natural language alone, and would require a user to write a +laboriously complex prompt that both disambiguates the target dog and describes +the destination. We propose ClickDiffusion, a system for precise image +manipulation and generation that combines natural language instructions with +visual feedback provided by the user through a direct manipulation interface. +We demonstrate that by serializing both an image and a multi-modal instruction +into a textual representation it is possible to leverage LLMs to perform +precise transformations of the layout and appearance of an image. Code +available at https://github.com/poloclub/ClickDiffusion. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2402.07925 +
+
+
+
+
+ + ☆ Idea-2-3D: Collaborative LMM Agents Enable 3D Model Generation from + Interleaved Multimodal Inputs + + +
+ In this paper, we pursue a novel 3D AIGC setting: generating 3D content from
+IDEAs. An IDEA is defined as a composition of multimodal inputs including
+text, images, and 3D models. To our knowledge, this challenging and appealing
+3D AIGC setting has not been studied before. We propose a novel framework
+called Idea-2-3D to achieve this goal, which consists of three agents based
+upon large multimodal models (LMMs) and several existing algorithmic tools
+for them to invoke. Specifically, the three LMM-based agents are prompted to
+handle prompt generation, model selection, and feedback reflection. They work
+in a cycle that involves both mutual collaboration and criticism. This cycle
+runs in a fully automatic manner, without any human intervention. The
+framework then outputs a text prompt used to generate 3D models that align
+well with the input IDEAs. We show impressive 3D AIGC results that go beyond
+what any previous method can achieve. For quantitative comparisons, we
+construct caption-based baselines using a broad set of state-of-the-art 3D
+AIGC models and demonstrate that Idea-2-3D outperforms them significantly. In
+94.2% of cases, Idea-2-3D meets users' requirements, marking a degree of
+match between IDEA and 3D models that is 2.3 times higher than the baselines.
+Moreover, in 93.5% of cases, users agreed that Idea-2-3D was better than the
+baselines. Code, data, and models will be made publicly available.
+
+</p>
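+ A minimal sketch of the collaborate-and-criticize cycle among the three
+LMM-based agents named above; the helper methods and the stopping rule are
+placeholders, not the released implementation:
+
+# Sketch of the three-agent loop (prompt generation, model selection,
+# feedback reflection); `lmm` and `generators` are assumed interfaces.
+def idea_to_3d(idea, lmm, generators, max_rounds=5):
+    feedback, result = None, None
+    for _ in range(max_rounds):
+        # Agent 1: turn the multimodal IDEA (plus prior feedback) into a text prompt.
+        prompt = lmm.generate_prompt(idea, feedback)
+        # Agent 2: choose which text-to-3D tool to invoke for this prompt.
+        tool = lmm.select_model(prompt, list(generators))
+        result = generators[tool](prompt)
+        # Agent 3: criticize the 3D result against the IDEA and emit feedback.
+        feedback, satisfied = lmm.reflect(idea, result)
+        if satisfied:
+            break
+    return result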
+
+ comment: Project Page: https://air-discover.github.io/Idea-2-3D/ Code: + https://github.com/yisuanwang/Idea23D +
+
+
+
+
+ + ☆ Pixel-wise RL on Diffusion Models: Reinforcement Learning from Rich + Feedback + + +
+ Latent diffusion models are the state-of-the-art for synthetic image
+generation. To align these models with human preferences, training the models
+using reinforcement learning on human feedback is crucial. Black et al. (2024)
+introduced denoising diffusion policy optimisation (DDPO), which accounts for
+the iterative denoising nature of the generation by modelling it as a Markov
+chain with a final reward. As the reward is a single value that determines the
+model's performance on the entire image, the model has to navigate a very
+sparse reward landscape and so requires a large sample count. In this work, we
+extend DDPO by presenting the Pixel-wise Policy Optimisation (PXPO) algorithm,
+which can take feedback for each pixel, providing a more nuanced reward to the
+model.
+
+</p>
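+ A hedged sketch of the core idea: instead of weighting the whole-image
+log-likelihood of a denoising step by a single scalar reward, each pixel's
+log-likelihood is weighted by its own feedback. The Gaussian step
+parameterization below is an assumption, not the authors' code:
+
+# Sketch of a pixel-wise policy-gradient loss for one denoising step.
+import math
+import torch
+
+def pxpo_step_loss(mean, log_var, x_prev, reward_map):
+    """mean, log_var: reverse-step Gaussian parameters, shape (B, C, H, W);
+    x_prev: the sample actually drawn at this step, shape (B, C, H, W);
+    reward_map: per-pixel feedback, shape (B, 1, H, W)."""
+    # Per-pixel log-probability of the taken action under the current policy.
+    log_prob = -0.5 * (log_var + (x_prev - mean) ** 2 / log_var.exp()
+                       + math.log(2.0 * math.pi))
+    # Weight every pixel by its own reward and maximize the weighted likelihood.
+    return -(reward_map * log_prob).mean()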
+
+ comment: 6 pages, 7 figures +
+
+
+
+
+ + ☆ Koala: Key frame-conditioned long video-LLM CVPR 2024 + + +
+ Long video question answering is a challenging task that involves recognizing +short-term activities and reasoning about their fine-grained relationships. +State-of-the-art video Large Language Models (vLLMs) hold promise as a viable +solution due to their demonstrated emergent capabilities on new tasks. However, +despite being trained on millions of short seconds-long videos, vLLMs are +unable to understand minutes-long videos and accurately answer questions about +them. To address this limitation, we propose a lightweight and self-supervised +approach, Key frame-conditioned long video-LLM (Koala), that introduces +learnable spatiotemporal queries to adapt pretrained vLLMs for generalizing to +longer videos. Our approach introduces two new tokenizers that condition on +visual tokens computed from sparse video key frames for understanding short and +long video moments. We train our proposed approach on HowTo100M and demonstrate +its effectiveness on zero-shot long video understanding benchmarks, where it +outperforms state-of-the-art large models by 3 - 6% in absolute accuracy across +all tasks. Surprisingly, we also empirically show that our approach not only +helps a pretrained vLLM to understand long videos but also improves its +accuracy on short-term action recognition. + +
+
+ comment: Accepted at CVPR 2024 as a poster highlight +
+
+
+
+
+ + ☆ SpatialTracker: Tracking Any 2D Pixels in 3D Space CVPR 2024 + + +
+ Recovering dense and long-range pixel motion in videos is a challenging +problem. Part of the difficulty arises from the 3D-to-2D projection process, +leading to occlusions and discontinuities in the 2D motion domain. While 2D +motion can be intricate, we posit that the underlying 3D motion can often be +simple and low-dimensional. In this work, we propose to estimate point +trajectories in 3D space to mitigate the issues caused by image projection. Our +method, named SpatialTracker, lifts 2D pixels to 3D using monocular depth +estimators, represents the 3D content of each frame efficiently using a +triplane representation, and performs iterative updates using a transformer to +estimate 3D trajectories. Tracking in 3D allows us to leverage +as-rigid-as-possible (ARAP) constraints while simultaneously learning a +rigidity embedding that clusters pixels into different rigid parts. Extensive +evaluation shows that our approach achieves state-of-the-art tracking +performance both qualitatively and quantitatively, particularly in challenging +scenarios such as out-of-plane rotation. + +
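+ A small sketch of the lifting step only, assuming pinhole intrinsics; the
+triplane encoding and the transformer-based trajectory updates from the paper
+are not shown:
+
+# Back-project every pixel to 3D using a monocular depth map and intrinsics K.
+import numpy as np
+
+def lift_to_3d(depth, K):
+    """depth: (H, W) depth map; K: 3x3 camera intrinsics."""
+    H, W = depth.shape
+    u, v = np.meshgrid(np.arange(W), np.arange(H))
+    pix = np.stack([u, v, np.ones_like(u)], axis=-1).reshape(-1, 3).T  # 3 x HW
+    rays = np.linalg.inv(K) @ pix                # back-projected viewing rays
+    points = rays * depth.reshape(1, -1)         # scale each ray by its depth
+    return points.T.reshape(H, W, 3)             # per-pixel 3D coordinates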
+
+ comment: Accepted to CVPR 2024 (selected as highlight paper). Project page: + https://henry123-boy.github.io/SpaTracker/ +
+
+
+
+
+ + ☆ Robust Depth Enhancement via Polarization Prompt Fusion Tuning CVPR 2024 + + +
+ Existing depth sensors are imperfect and may provide inaccurate depth values +in challenging scenarios, such as in the presence of transparent or reflective +objects. In this work, we present a general framework that leverages +polarization imaging to improve inaccurate depth measurements from various +depth sensors. Previous polarization-based depth enhancement methods focus on +utilizing pure physics-based formulas for a single sensor. In contrast, our +method first adopts a learning-based strategy where a neural network is trained +to estimate a dense and complete depth map from polarization data and a sensor +depth map from different sensors. To further improve the performance, we +propose a Polarization Prompt Fusion Tuning (PPFT) strategy to effectively +utilize RGB-based models pre-trained on large-scale datasets, as the size of +the polarization dataset is limited to train a strong model from scratch. We +conducted extensive experiments on a public dataset, and the results +demonstrate that the proposed method performs favorably compared to existing +depth enhancement baselines. Code and demos are available at +https://lastbasket.github.io/PPFT/. + +
+
+ comment: CVPR 2024. Project page: https://lastbasket.github.io/PPFT/. The + first two authors contribute equally +
+
+
+
+
+ + ☆ Visual Knowledge in the Big Model Era: Retrospect and Prospect + + +
+ Visual knowledge is a new form of knowledge representation that can
+encapsulate visual concepts and their relations in a succinct, comprehensive,
+and interpretable manner, with deep roots in cognitive psychology. As
+knowledge about the visual world has been identified as an indispensable
+component of human cognition and intelligence, visual knowledge is poised to
+play a pivotal role in establishing machine intelligence. With the recent
+advances of Artificial Intelligence (AI) techniques, large AI models (or
+foundation models) have emerged as a potent tool capable of extracting
+versatile patterns from broad data as implicit knowledge and abstracting them
+into a vast number of numeric parameters. To pave the way for creating visual
+knowledge empowered AI machines in this coming wave, we present a timely
+review that investigates the origins and development of visual knowledge in
+the pre-big model era, and accentuates the opportunities and unique role of
+visual knowledge in the big model era.
+
+</p>
+
+
+
+
+ + ☆ Implicit Assimilation of Sparse In Situ Data for Dense & Global Storm + Surge Forecasting CVPR + + +
+ Hurricanes and coastal floods are among the most disastrous natural hazards.
+Both are intimately related to storm surges, as their causes and effects,
+respectively. However, the short-term forecasting of storm surges has proven
+challenging, especially when targeting previously unseen locations or sites
+without tidal gauges. Furthermore, recent work has improved short- and
+medium-term weather forecasting, but the handling of raw, unassimilated data
+remains non-trivial. In this paper, we tackle both challenges and demonstrate
+that neural networks can implicitly assimilate sparse in situ tide gauge data
+with coarse ocean state reanalysis in order to forecast storm surges. We
+curate a global dataset to learn and validate the dense prediction of storm
+surges, building on preceding efforts. Unlike prior work limited to known
+gauges, our approach extends to ungauged sites, paving the way for global
+storm surge forecasting.
+
+</p>
+
+ comment: Accepted at CVPR EarthVision 2024 +
+
+
+
+
+ + ♻ ☆ DVIS-DAQ: Improving Video Segmentation via Dynamic Anchor Queries + + +
+ Modern video segmentation methods adopt object queries to perform inter-frame +association and demonstrate satisfactory performance in tracking continuously +appearing objects despite large-scale motion and transient occlusion. However, +they all underperform on newly emerging and disappearing objects that are +common in the real world because they attempt to model object emergence and +disappearance through feature transitions between background and foreground +queries that have significant feature gaps. We introduce Dynamic Anchor Queries +(DAQ) to shorten the transition gap between the anchor and target queries by +dynamically generating anchor queries based on the features of potential +candidates. Furthermore, we introduce a query-level object Emergence and +Disappearance Simulation (EDS) strategy, which unleashes DAQ's potential +without any additional cost. Finally, we combine our proposed DAQ and EDS with +DVIS to obtain DVIS-DAQ. Extensive experiments demonstrate that DVIS-DAQ +achieves a new state-of-the-art (SOTA) performance on five mainstream video +segmentation benchmarks. Code and models are available at +\url{https://github.com/SkyworkAI/DAQ-VS}. + +
+
+
+
+
+ + ♻ ☆ CenterGrasp: Object-Aware Implicit Representation Learning for + Simultaneous Shape Reconstruction and 6-DoF Grasp Estimation + + +
+ Reliable object grasping is a crucial capability for autonomous robots. +However, many existing grasping approaches focus on general clutter removal +without explicitly modeling objects and thus only relying on the visible local +geometry. We introduce CenterGrasp, a novel framework that combines object +awareness and holistic grasping. CenterGrasp learns a general object prior by +encoding shapes and valid grasps in a continuous latent space. It consists of +an RGB-D image encoder that leverages recent advances to detect objects and +infer their pose and latent code, and a decoder to predict shape and grasps for +each object in the scene. We perform extensive experiments on simulated as well +as real-world cluttered scenes and demonstrate strong scene reconstruction and +6-DoF grasp-pose estimation performance. Compared to the state of the art, +CenterGrasp achieves an improvement of 38.5 mm in shape reconstruction and 33 +percentage points on average in grasp success. We make the code and trained +models publicly available at http://centergrasp.cs.uni-freiburg.de. + +
+
+ comment: Accepted at RA-L. Video, code and models available at + http://centergrasp.cs.uni-freiburg.de +
+
+
+
+
+ + ♻ ☆ Modeling 3D Surface Manifolds with a Locally Conditioned Atlas + + +
+ Recently proposed 3D object reconstruction methods represent a mesh with an +atlas - a set of planar patches approximating the surface. However, their +application in a real-world scenario is limited since the surfaces of +reconstructed objects contain discontinuities, which degrades the quality of +the final mesh. This is mainly caused by independent processing of individual +patches, and in this work, we postulate to mitigate this limitation by +preserving local consistency around patch vertices. To that end, we introduce a +Locally Conditioned Atlas (LoCondA), a framework for representing a 3D object +hierarchically in a generative model. Firstly, the model maps a point cloud of +an object into a sphere. Secondly, by leveraging a spherical prior, we enforce +the mapping to be locally consistent on the sphere and on the target object. +This way, we can sample a mesh quad on that sphere and project it back onto the +object's manifold. With LoCondA, we can produce topologically diverse objects +while maintaining quads to be stitched together. We show that the proposed +approach provides structurally coherent reconstructions while producing meshes +of quality comparable to the competitors. + +
+
+
+
+
+ + ♻ ☆ Finding AI-Generated Faces in the Wild CVPR + + +
+ AI-based image generation has continued to rapidly improve, producing +increasingly more realistic images with fewer obvious visual flaws. +AI-generated images are being used to create fake online profiles which in turn +are being used for spam, fraud, and disinformation campaigns. As the general +problem of detecting any type of manipulated or synthesized content is +receiving increasing attention, here we focus on a more narrow task of +distinguishing a real face from an AI-generated face. This is particularly +applicable when tackling inauthentic online accounts with a fake user profile +photo. We show that by focusing on only faces, a more resilient and +general-purpose artifact can be detected that allows for the detection of +AI-generated faces from a variety of GAN- and diffusion-based synthesis +engines, and across image resolutions (as low as 128 x 128 pixels) and +qualities. + +
+
+ comment: to be published as: G.J.A. Porcile, J. Gindi, S. Mundra, J.R. Verbus, + and H. Farid, Finding AI-Generated Faces in the Wild, Workshop on Media + Forensics at CVPR, 2024 +
+
+
+
+
+ + ♻ ☆ WorDepth: Variational Language Prior for Monocular Depth Estimation + + +
+ Three-dimensional (3D) reconstruction from a single image is an ill-posed +problem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text +description(s) is similarly ill-posed, i.e. spatial arrangements of objects +described. We investigate the question of whether two inherently ambiguous +modalities can be used in conjunction to produce metric-scaled reconstructions. +To test this, we focus on monocular depth estimation, the problem of predicting +a dense depth map from a single image, but with an additional text caption +describing the scene. To this end, we begin by encoding the text caption as a +mean and standard deviation; using a variational framework, we learn the +distribution of the plausible metric reconstructions of 3D scenes corresponding +to the text captions as a prior. To "select" a specific reconstruction or depth +map, we encode the given image through a conditional sampler that samples from +the latent space of the variational text encoder, which is then decoded to the +output depth map. Our approach is trained alternatingly between the text and +image branches: in one optimization step, we predict the mean and standard +deviation from the text description and sample from a standard Gaussian, and in +the other, we sample using a (image) conditional sampler. Once trained, we +directly predict depth from the encoded text using the conditional sampler. We +demonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where +we show that language can consistently improve performance in both. + +
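+ A minimal sketch of the two alternating branches described above: the text
+branch draws a reparameterized sample from the caption-conditioned prior,
+while the image branch lets a conditional sampler pick a point in that latent
+space. The module interfaces are assumed placeholders, not the paper's
+architecture:
+
+# Sketch of the alternating text/image sampling of the latent depth code.
+import torch
+
+def sample_latent(text_feat, image_feat, text_encoder, cond_sampler, use_text_branch):
+    mu, log_sigma = text_encoder(text_feat)   # caption -> prior mean / std
+    if use_text_branch:
+        # Text branch: reparameterized sample from the learned prior.
+        z = mu + log_sigma.exp() * torch.randn_like(mu)
+    else:
+        # Image branch: "select" one plausible reconstruction for this image.
+        z = cond_sampler(image_feat, mu, log_sigma)
+    return z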
+
+
+
+
+ + ♻ ☆ SnAG: Scalable and Accurate Video Grounding CVPR 2024 + + +
+ Temporal grounding of text descriptions in videos is a central problem in +vision-language learning and video understanding. Existing methods often +prioritize accuracy over scalability -- they have been optimized for grounding +only a few text queries within short videos, and fail to scale up to long +videos with hundreds of queries. In this paper, we study the effect of +cross-modal fusion on the scalability of video grounding models. Our analysis +establishes late fusion as a more cost-effective fusion scheme for long-form +videos with many text queries. Moreover, it leads us to a novel, video-centric +sampling scheme for efficient training. Based on these findings, we present +SnAG, a simple baseline for scalable and accurate video grounding. Without +bells and whistles, SnAG is 43% more accurate and 1.5x faster than CONE, a +state of the art for long-form video grounding on the challenging MAD dataset, +while achieving highly competitive results on short videos. + +
+
+ comment: Accepted to CVPR 2024. Code available at + https://github.com/fmu2/snag_release +
+
+
+
+
+ + ♻ ☆ State Space Models for Event Cameras CVPR 2024 + + +
+ Today, state-of-the-art deep neural networks that process event-camera data +first convert a temporal window of events into dense, grid-like input +representations. As such, they exhibit poor generalizability when deployed at +higher inference frequencies (i.e., smaller temporal windows) than the ones +they were trained on. We address this challenge by introducing state-space +models (SSMs) with learnable timescale parameters to event-based vision. This +design adapts to varying frequencies without the need to retrain the network at +different frequencies. Additionally, we investigate two strategies to +counteract aliasing effects when deploying the model at higher frequencies. We +comprehensively evaluate our approach against existing methods based on RNN and +Transformer architectures across various benchmarks, including Gen1 and 1 Mpx +event camera datasets. Our results demonstrate that SSM-based models train 33% +faster and also exhibit minimal performance degradation when tested at higher +frequencies than the training input. Traditional RNN and Transformer models +exhibit performance drops of more than 20 mAP, with SSMs having a drop of 3.31 +mAP, highlighting the effectiveness of SSMs in event-based vision tasks. + +
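+ A sketch of why a learnable timescale helps: a diagonal state-space layer
+discretized with a per-state step size (zero-order hold) can be run at a
+different event rate by rescaling that step size, without retraining. This
+illustrates the mechanism only, not the paper's architecture:
+
+# Diagonal SSM recurrence with learnable log-timescales (zero-order hold).
+import torch
+
+def ssm_scan(x, A, B, C, log_dt):
+    """x: (T, D_in); A: (N,) negative diagonal poles; B: (N, D_in);
+    C: (D_out, N); log_dt: (N,) learnable log-timescales."""
+    dt = log_dt.exp()
+    A_d = (dt * A).exp()                           # discrete state transition
+    B_d = ((A_d - 1.0) / A).unsqueeze(-1) * B      # discretized input matrix
+    h = x.new_zeros(A.shape[0])                    # hidden state
+    ys = []
+    for t in range(x.shape[0]):
+        h = A_d * h + B_d @ x[t]                   # recurrent update
+        ys.append(C @ h)                           # readout
+    return torch.stack(ys)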
+
+ comment: 18 pages, 5 figures, 6 tables, CVPR 2024 Camera Ready paper +
+
+
+
+
+ + ♻ ☆ Opti-CAM: Optimizing saliency maps for interpretability + + +
+ Methods based on class activation maps (CAM) provide a simple mechanism to +interpret predictions of convolutional neural networks by using linear +combinations of feature maps as saliency maps. By contrast, masking-based +methods optimize a saliency map directly in the image space or learn it by +training another network on additional data. + In this work we introduce Opti-CAM, combining ideas from CAM-based and +masking-based approaches. Our saliency map is a linear combination of feature +maps, where weights are optimized per image such that the logit of the masked +image for a given class is maximized. We also fix a fundamental flaw in two of +the most common evaluation metrics of attribution methods. On several datasets, +Opti-CAM largely outperforms other CAM-based approaches according to the most +relevant classification metrics. We provide empirical evidence supporting that +localization and classifier interpretability are not necessarily aligned. + +
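+ A hedged sketch of the per-image optimization described above for a single
+image and class; the weight normalization, optimizer, and interpolation
+choices are assumptions rather than the paper's exact settings:
+
+# Optimize per-image channel weights so the masked image maximizes the logit.
+import torch
+import torch.nn.functional as F
+
+def opti_cam_like(model, feature_maps, image, target_class, steps=50, lr=0.1):
+    """feature_maps: (1, K, h, w) activations from a chosen layer (detached)."""
+    w = torch.zeros(1, feature_maps.shape[1], 1, 1, requires_grad=True)
+    opt = torch.optim.Adam([w], lr=lr)
+    for _ in range(steps):
+        sal = (torch.softmax(w, dim=1) * feature_maps).sum(1, keepdim=True)
+        sal = (sal - sal.min()) / (sal.max() - sal.min() + 1e-8)  # to [0, 1]
+        mask = F.interpolate(sal, size=image.shape[-2:], mode="bilinear",
+                             align_corners=False)
+        loss = -model(image * mask)[0, target_class]  # maximize the class logit
+        opt.zero_grad(); loss.backward(); opt.step()
+    return mask.detach()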
+
+ comment: This work is under consideration at "Computer Vision and Image + Understanding" +
+
+
+
+
+ + ♻ ☆ EAGLE: Eigen Aggregation Learning for Object-Centric Unsupervised + Semantic Segmentation + + +
+ Semantic segmentation has innately relied on extensive pixel-level annotated +data, leading to the emergence of unsupervised methodologies. Among them, +leveraging self-supervised Vision Transformers for unsupervised semantic +segmentation (USS) has been making steady progress with expressive deep +features. Yet, for semantically segmenting images with complex objects, a +predominant challenge remains: the lack of explicit object-level semantic +encoding in patch-level features. This technical limitation often leads to +inadequate segmentation of complex objects with diverse structures. To address +this gap, we present a novel approach, EAGLE, which emphasizes object-centric +representation learning for unsupervised semantic segmentation. Specifically, +we introduce EiCue, a spectral technique providing semantic and structural cues +through an eigenbasis derived from the semantic similarity matrix of deep image +features and color affinity from an image. Further, by incorporating our +object-centric contrastive loss with EiCue, we guide our model to learn +object-level representations with intra- and inter-image object-feature +consistency, thereby enhancing semantic accuracy. Extensive experiments on +COCO-Stuff, Cityscapes, and Potsdam-3 datasets demonstrate the state-of-the-art +USS results of EAGLE with accurate and consistent semantic segmentation across +complex scenes. + +
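+ A rough sketch of deriving spectral cues from a combined affinity built from
+deep-feature similarity and color affinity; the exact weighting,
+normalization, and Laplacian variant used by EAGLE are assumptions here:
+
+# Eigenbasis of a graph Laplacian over patches (semantic + color affinity).
+import numpy as np
+
+def spectral_cues(features, colors, k=8, alpha=0.5):
+    """features: (P, D) patch features; colors: (P, 3) mean patch colors."""
+    f = features / (np.linalg.norm(features, axis=1, keepdims=True) + 1e-8)
+    sem = np.clip(f @ f.T, 0.0, None)                       # semantic similarity
+    col = np.exp(-np.square(colors[:, None] - colors[None]).sum(-1) / 0.1)
+    W = alpha * sem + (1.0 - alpha) * col                   # combined affinity
+    L = np.diag(W.sum(1)) - W                               # graph Laplacian
+    _, eigvecs = np.linalg.eigh(L)
+    return eigvecs[:, 1:k + 1]                              # low-frequency eigenbasis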
+
+
+
+
+ + ♻ ☆ On Inherent Adversarial Robustness of Active Vision Systems + + +
+ Current Deep Neural Networks are vulnerable to adversarial examples, which +alter their predictions by adding carefully crafted noise. Since human eyes are +robust to such inputs, it is possible that the vulnerability stems from the +standard way of processing inputs in one shot by processing every pixel with +the same importance. In contrast, neuroscience suggests that the human vision +system can differentiate salient features by (1) switching between multiple +fixation points (saccades) and (2) processing the surrounding with a +non-uniform external resolution (foveation). In this work, we advocate that the +integration of such active vision mechanisms into current deep learning systems +can offer robustness benefits. Specifically, we empirically demonstrate the +inherent robustness of two active vision methods - GFNet and FALcon - under a +black box threat model. By learning and inferencing based on downsampled +glimpses obtained from multiple distinct fixation points within an input, we +show that these active methods achieve (2-3) times greater robustness compared +to a standard passive convolutional network under state-of-the-art adversarial +attacks. More importantly, we provide illustrative and interpretable +visualization analysis that demonstrates how performing inference from distinct +fixation points makes active vision methods less vulnerable to malicious +inputs. + +
+
+
+
+
+ + ♻ ☆ SWAG: Splatting in the Wild images with Appearance-conditioned Gaussians + + +
+ Implicit neural representation methods have shown impressive advancements in
+learning 3D scenes from unstructured in-the-wild photo collections but are
+still limited by the large computational cost of volumetric rendering. More
+recently, 3D Gaussian Splatting emerged as a much faster alternative with
+superior rendering quality and training efficiency, especially for small-scale
+and object-centric scenarios. Nevertheless, this technique suffers from poor
+performance on unstructured in-the-wild data. To tackle this, we extend 3D
+Gaussian Splatting to handle unstructured image collections. We achieve this
+by modeling appearance to capture photometric variations in the rendered
+images. Additionally, we introduce a new mechanism to train transient
+Gaussians to handle the presence of scene occluders in an unsupervised manner.
+Experiments on diverse photo collection scenes and multi-pass acquisitions of
+outdoor landmarks show the effectiveness of our method over prior works,
+achieving state-of-the-art results with improved efficiency.
+
+</p>
+
+
+
+
+ + ♻ ☆ Embedded Heterogeneous Attention Transformer for Cross-lingual Image + Captioning + + +
+ Cross-lingual image captioning is a challenging task that requires addressing
+both cross-lingual and cross-modal obstacles in multimedia analysis. The
+crucial issue in this task is to model the global and local matching between
+the image and different languages. Existing cross-modal embedding methods
+based on the transformer architecture overlook the local matching between
+image regions and monolingual words, especially when dealing with diverse
+languages. To overcome these limitations, we propose an Embedded Heterogeneous
+Attention Transformer (EHAT) to establish cross-domain relationships and local
+correspondences between images and different languages by using a
+heterogeneous network. EHAT comprises Masked Heterogeneous Cross-attention
+(MHCA), a Heterogeneous Attention Reasoning Network (HARN), and Heterogeneous
+Co-attention (HCA). HARN serves as the core network; it captures cross-domain
+relationships by leveraging visual bounding box representation features to
+connect word features from two languages and to learn heterogeneous maps. MHCA
+and HCA facilitate cross-domain integration in the encoder through specialized
+heterogeneous attention mechanisms, enabling a single model to generate
+captions in two languages. We evaluate our approach on the MSCOCO dataset to
+generate captions in English and Chinese, two languages that exhibit
+significant differences in their language families. The experimental results
+demonstrate the superior performance of our method compared to existing
+advanced monolingual methods. Our proposed EHAT framework effectively
+addresses the challenges of cross-lingual image captioning, paving the way for
+improved multilingual image analysis and understanding.
+
+</p>
+
+
+
+
+ + ♻ ☆ Self-Correcting Self-Consuming Loops for Generative Model Training + + +
+ As synthetic data becomes higher quality and proliferates on the internet, +machine learning models are increasingly trained on a mix of human- and +machine-generated data. Despite the successful stories of using synthetic data +for representation learning, using synthetic data for generative model training +creates "self-consuming loops" which may lead to training instability or even +collapse, unless certain conditions are met. Our paper aims to stabilize +self-consuming generative model training. Our theoretical results demonstrate +that by introducing an idealized correction function, which maps a data point +to be more likely under the true data distribution, self-consuming loops can be +made exponentially more stable. We then propose self-correction functions, +which rely on expert knowledge (e.g. the laws of physics programmed in a +simulator), and aim to approximate the idealized corrector automatically and at +scale. We empirically validate the effectiveness of self-correcting +self-consuming loops on the challenging human motion synthesis task, and +observe that it successfully avoids model collapse, even when the ratio of +synthetic data to real data is as high as 100%. + +
+
+ comment: This new version contains updated mathematical results (c.f. Remark + 4.4), as well as experiments for an additional generative modeling task. + Paper under submission; code is available at + https://nategillman.com/sc-sc.html +
+
+
+
+
+ + ♻ ☆ Chat-UniVi: Unified Visual Representation Empowers Large Language Models + with Image and Video Understanding CVPR 2024 + + +
+ Large language models have demonstrated impressive universal capabilities +across a wide range of open-ended tasks and have extended their utility to +encompass multimodal conversations. However, existing methods encounter +challenges in effectively handling both image and video understanding, +particularly with limited visual tokens. In this work, we introduce Chat-UniVi, +a Unified Vision-language model capable of comprehending and engaging in +conversations involving images and videos through a unified visual +representation. Specifically, we employ a set of dynamic visual tokens to +uniformly represent images and videos. This representation framework empowers +the model to efficiently utilize a limited number of visual tokens to +simultaneously capture the spatial details necessary for images and the +comprehensive temporal relationship required for videos. Moreover, we leverage +a multi-scale representation, enabling the model to perceive both high-level +semantic concepts and low-level visual details. Notably, Chat-UniVi is trained +on a mixed dataset containing both images and videos, allowing direct +application to tasks involving both mediums without requiring any +modifications. Extensive experimental results demonstrate that Chat-UniVi +consistently outperforms even existing methods exclusively designed for either +images or videos. Code is available at +https://github.com/PKU-YuanGroup/Chat-UniVi. + +
+
+ comment: Accepted by CVPR 2024 (Highlight) +
+
+
+
+
+ + ♻ ☆ PlatoNeRF: 3D Reconstruction in Plato's Cave via Single-View Two-Bounce + Lidar CVPR 2024 + + +
+ 3D reconstruction from a single-view is challenging because of the ambiguity +from monocular cues and lack of information about occluded regions. Neural +radiance fields (NeRF), while popular for view synthesis and 3D reconstruction, +are typically reliant on multi-view images. Existing methods for single-view 3D +reconstruction with NeRF rely on either data priors to hallucinate views of +occluded regions, which may not be physically accurate, or shadows observed by +RGB cameras, which are difficult to detect in ambient light and low albedo +backgrounds. We propose using time-of-flight data captured by a single-photon +avalanche diode to overcome these limitations. Our method models two-bounce +optical paths with NeRF, using lidar transient data for supervision. By +leveraging the advantages of both NeRF and two-bounce light measured by lidar, +we demonstrate that we can reconstruct visible and occluded geometry without +data priors or reliance on controlled ambient lighting or scene albedo. In +addition, we demonstrate improved generalization under practical constraints on +sensor spatial- and temporal-resolution. We believe our method is a promising +direction as single-photon lidars become ubiquitous on consumer devices, such +as phones, tablets, and headsets. + +
+
+ comment: CVPR 2024. Project Page: https://platonerf.github.io/ +
+
+
+
+
+ + ♻ ☆ Plug-and-Play image restoration with Stochastic deNOising REgularization + + +
+ Plug-and-Play (PnP) algorithms are a class of iterative algorithms that +address image inverse problems by combining a physical model and a deep neural +network for regularization. Even if they produce impressive image restoration +results, these algorithms rely on a non-standard use of a denoiser on images +that are less and less noisy along the iterations, which contrasts with recent +algorithms based on Diffusion Models (DM), where the denoiser is applied only +on re-noised images. We propose a new PnP framework, called Stochastic +deNOising REgularization (SNORE), which applies the denoiser only on images +with noise of the adequate level. It is based on an explicit stochastic +regularization, which leads to a stochastic gradient descent algorithm to solve +ill-posed inverse problems. A convergence analysis of this algorithm and its +annealing extension is provided. Experimentally, we prove that SNORE is +competitive with respect to state-of-the-art methods on deblurring and +inpainting tasks, both quantitatively and qualitatively. + +
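+ A hedged sketch of one stochastic step in the spirit described above, for a
+linear inverse problem with forward operator A: the denoiser is applied to a
+re-noised iterate rather than to the iterate itself. The step sizes and the
+exact form of the regularization gradient are assumptions, not the paper's
+derivation:
+
+# One SNORE-like stochastic gradient step for y = A(x) + noise.
+import torch
+
+def snore_like_step(x, y, A, At, denoiser, sigma, lam=1.0, tau=1e-3):
+    """A/At: forward operator and its adjoint; denoiser: Gaussian denoiser
+    set for noise level sigma."""
+    data_grad = At(A(x) - y)                        # gradient of the data-fit term
+    x_noisy = x + sigma * torch.randn_like(x)       # re-noise to the adequate level
+    reg_grad = (x_noisy - denoiser(x_noisy, sigma)) / sigma ** 2
+    return x - tau * (data_grad + lam * reg_grad)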
+
+
+
+
+ + ♻ ☆ EGTR: Extracting Graph from Transformer for Scene Graph Generation CVPR 2024 + + +
+ Scene Graph Generation (SGG) is a challenging task of detecting objects and +predicting relationships between objects. After DETR was developed, one-stage +SGG models based on a one-stage object detector have been actively studied. +However, complex modeling is used to predict the relationship between objects, +and the inherent relationship between object queries learned in the multi-head +self-attention of the object detector has been neglected. We propose a +lightweight one-stage SGG model that extracts the relation graph from the +various relationships learned in the multi-head self-attention layers of the +DETR decoder. By fully utilizing the self-attention by-products, the relation +graph can be extracted effectively with a shallow relation extraction head. +Considering the dependency of the relation extraction task on the object +detection task, we propose a novel relation smoothing technique that adjusts +the relation label adaptively according to the quality of the detected objects. +By the relation smoothing, the model is trained according to the continuous +curriculum that focuses on object detection task at the beginning of training +and performs multi-task learning as the object detection performance gradually +improves. Furthermore, we propose a connectivity prediction task that predicts +whether a relation exists between object pairs as an auxiliary task of the +relation extraction. We demonstrate the effectiveness and efficiency of our +method for the Visual Genome and Open Image V6 datasets. Our code is publicly +available at https://github.com/naver-ai/egtr. + +
+
+ comment: CVPR 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ Open-vocabulary object 6D pose estimation CVPR 2024 + + +
+ We introduce the new setting of open-vocabulary object 6D pose estimation, in +which a textual prompt is used to specify the object of interest. In contrast +to existing approaches, in our setting (i) the object of interest is specified +solely through the textual prompt, (ii) no object model (e.g., CAD or video +sequence) is required at inference, and (iii) the object is imaged from two +RGBD viewpoints of different scenes. To operate in this setting, we introduce a +novel approach that leverages a Vision-Language Model to segment the object of +interest from the scenes and to estimate its relative 6D pose. The key of our +approach is a carefully devised strategy to fuse object-level information +provided by the prompt with local image features, resulting in a feature space +that can generalize to novel concepts. We validate our approach on a new +benchmark based on two popular datasets, REAL275 and Toyota-Light, which +collectively encompass 34 object instances appearing in four thousand image +pairs. The results demonstrate that our approach outperforms both a +well-established hand-crafted method and a recent deep learning-based baseline +in estimating the relative 6D pose of objects in different scenes. Code and +dataset are available at https://jcorsetti.github.io/oryon. + +
+
+ comment: Camera ready version (CVPR 2024, poster highlight). 21 pages, 15 + figures, 6 tables +
+
+
+
+
+ + ♻ ☆ The Missing U for Efficient Diffusion Models + + +
+ Diffusion Probabilistic Models stand as a critical tool in generative +modelling, enabling the generation of complex data distributions. This family +of generative models yields record-breaking performance in tasks such as image +synthesis, video generation, and molecule design. Despite their capabilities, +their efficiency, especially in the reverse process, remains a challenge due to +slow convergence rates and high computational costs. In this paper, we +introduce an approach that leverages continuous dynamical systems to design a +novel denoising network for diffusion models that is more parameter-efficient, +exhibits faster convergence, and demonstrates increased noise robustness. +Experimenting with Denoising Diffusion Probabilistic Models (DDPMs), our +framework operates with approximately a quarter of the parameters, and $\sim$ +30\% of the Floating Point Operations (FLOPs) compared to standard U-Nets in +DDPMs. Furthermore, our model is notably faster in inference than the baseline +when measured in fair and equal conditions. We also provide a mathematical +intuition as to why our proposed reverse process is faster as well as a +mathematical discussion of the empirical tradeoffs in the denoising downstream +task. Finally, we argue that our method is compatible with existing performance +enhancement techniques, enabling further improvements in efficiency, quality, +and speed. + +
+
+ comment: 23 pages, 14 figures, Accepted at Transactions of Machine Learning + Research (04/2024) +
+
+
+
+
+ + ♻ ☆ DualRefine: Self-Supervised Depth and Pose Estimation Through Iterative + Epipolar Sampling and Refinement Toward Equilibrium CVPR 2023 + + +
+ Self-supervised multi-frame depth estimation achieves high accuracy by +computing matching costs of pixel correspondences between adjacent frames, +injecting geometric information into the network. These pixel-correspondence +candidates are computed based on the relative pose estimates between the +frames. Accurate pose predictions are essential for precise matching cost +computation as they influence the epipolar geometry. Furthermore, improved +depth estimates can, in turn, be used to align pose estimates. + Inspired by traditional structure-from-motion (SfM) principles, we propose +the DualRefine model, which tightly couples depth and pose estimation through a +feedback loop. Our novel update pipeline uses a deep equilibrium model +framework to iteratively refine depth estimates and a hidden state of feature +maps by computing local matching costs based on epipolar geometry. Importantly, +we used the refined depth estimates and feature maps to compute pose updates at +each step. This update in the pose estimates slowly alters the epipolar +geometry during the refinement process. Experimental results on the KITTI +dataset demonstrate competitive depth prediction and odometry prediction +performance surpassing published self-supervised baselines. + +
+
+ comment: CVPR 2023. Project page: + https://antabangun.github.io/projects/DualRefine/ Code: + https://github.com/antabangun/DualRefine +
+
+
+
+
+ + ♻ ☆ Neural Sign Actors: A diffusion model for 3D sign language production + from text CVPR 2024 + + +
+ Sign Languages (SL) serve as the primary mode of communication for the Deaf +and Hard of Hearing communities. Deep learning methods for SL recognition and +translation have achieved promising results. However, Sign Language Production +(SLP) poses a challenge as the generated motions must be realistic and have +precise semantic meaning. Most SLP methods rely on 2D data, which hinders their +realism. In this work, a diffusion-based SLP model is trained on a curated +large-scale dataset of 4D signing avatars and their corresponding text +transcripts. The proposed method can generate dynamic sequences of 3D avatars +from an unconstrained domain of discourse using a diffusion process formed on a +novel and anatomically informed graph neural network defined on the SMPL-X body +skeleton. Through quantitative and qualitative experiments, we show that the +proposed method considerably outperforms previous methods of SLP. This work +makes an important step towards realistic neural sign avatars, bridging the +communication gap between Deaf and hearing communities. + +
+
+ comment: Accepted at CVPR 2024, Project page: + https://baltatzisv.github.io/neural-sign-actors/ +
+
+
+
+
+ + ♻ ☆ Localization Is All You Evaluate: Data Leakage in Online Mapping + Datasets and How to Fix It + + +
+ The task of online mapping is to predict a local map using current sensor +observations, e.g. from lidar and camera, without relying on a pre-built map. +State-of-the-art methods are based on supervised learning and are trained +predominantly using two datasets: nuScenes and Argoverse 2. However, these +datasets revisit the same geographic locations across training, validation, and +test sets. Specifically, over $80$% of nuScenes and $40$% of Argoverse 2 +validation and test samples are less than $5$ m from a training sample. At test +time, the methods are thus evaluated more on how well they localize within a +memorized implicit map built from the training data than on extrapolating to +unseen locations. Naturally, this data leakage causes inflated performance +numbers and we propose geographically disjoint data splits to reveal the true +performance in unseen environments. Experimental results show that methods +perform considerably worse, some dropping more than $45$ mAP, when trained and +evaluated on proper data splits. Additionally, a reassessment of prior design +choices reveals diverging conclusions from those based on the original split. +Notably, the impact of lifting methods and the support from auxiliary tasks +(e.g., depth supervision) on performance appears less substantial or follows a +different trajectory than previously perceived. Splits can be found at +https://github.com/LiljaAdam/geographical-splits + +
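+ A small sketch of the leakage check described above: the fraction of
+validation samples whose ego position lies within a given radius of any
+training sample (positions assumed to be in metres in a shared map frame):
+
+# Fraction of validation samples closer than `radius_m` to a training sample.
+import numpy as np
+
+def leakage_fraction(train_xy, val_xy, radius_m=5.0):
+    """train_xy: (N, 2), val_xy: (M, 2) ego positions in metres."""
+    # Brute-force pairwise distances; use a KD-tree for very large splits.
+    d = np.linalg.norm(val_xy[:, None, :] - train_xy[None, :, :], axis=-1)
+    return float((d.min(axis=1) < radius_m).mean())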
+
+
+
+
+ + ♻ ☆ Contextual Encoder-Decoder Network for Visual Saliency Prediction + + +
+ Predicting salient regions in natural images requires the detection of +objects that are present in a scene. To develop robust representations for this +challenging task, high-level visual features at multiple spatial scales must be +extracted and augmented with contextual information. However, existing models +aimed at explaining human fixation maps do not incorporate such a mechanism +explicitly. Here we propose an approach based on a convolutional neural network +pre-trained on a large-scale image classification task. The architecture forms +an encoder-decoder structure and includes a module with multiple convolutional +layers at different dilation rates to capture multi-scale features in parallel. +Moreover, we combine the resulting representations with global scene +information for accurately predicting visual saliency. Our model achieves +competitive and consistent results across multiple evaluation metrics on two +public saliency benchmarks and we demonstrate the effectiveness of the +suggested approach on five datasets and selected examples. Compared to state of +the art approaches, the network is based on a lightweight image classification +backbone and hence presents a suitable choice for applications with limited +computational resources, such as (virtual) robotic systems, to estimate human +fixations across complex natural scenes. + +
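+ A minimal sketch of a parallel multi-dilation module of the kind described
+above (ASPP-style); the dilation rates and channel counts are assumptions,
+not the paper's configuration:
+
+# Parallel 3x3 convolutions at several dilation rates, fused by a 1x1 conv.
+import torch
+import torch.nn as nn
+
+class MultiDilationBlock(nn.Module):
+    def __init__(self, in_ch, out_ch, rates=(1, 4, 8, 12)):
+        super().__init__()
+        self.branches = nn.ModuleList(
+            nn.Conv2d(in_ch, out_ch, 3, padding=r, dilation=r) for r in rates
+        )
+        self.fuse = nn.Conv2d(out_ch * len(rates), out_ch, 1)
+
+    def forward(self, x):
+        feats = [torch.relu(b(x)) for b in self.branches]  # multi-scale context
+        return self.fuse(torch.cat(feats, dim=1))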
+
+ comment: Updated contact information +
+
+
+
+
+ + ♻ ☆ Single Domain Generalization for Crowd Counting CVPR2024 + + +
+ Due to its promising results, density map regression has been widely employed +for image-based crowd counting. The approach, however, often suffers from +severe performance degradation when tested on data from unseen scenarios, the +so-called "domain shift" problem. To address the problem, we investigate in +this work single domain generalization (SDG) for crowd counting. The existing +SDG approaches are mainly for image classification and segmentation, and can +hardly be extended to our case due to its regression nature and label ambiguity +(i.e., ambiguous pixel-level ground truths). We propose MPCount, a novel +effective SDG approach even for narrow source distribution. MPCount stores +diverse density values for density map regression and reconstructs +domain-invariant features by means of only one memory bank, a content error +mask and attention consistency loss. By partitioning the image into grids, it +employs patch-wise classification as an auxiliary task to mitigate label +ambiguity. Through extensive experiments on different datasets, MPCount is +shown to significantly improve counting accuracy compared to the state of the +art under diverse scenarios unobserved in the training data characterized by +narrow source distribution. Code is available at +https://github.com/Shimmer93/MPCount. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ One model to use them all: Training a segmentation model with + complementary datasets + + +
+ Understanding a surgical scene is crucial for computer-assisted surgery
+systems to provide any intelligent assistance functionality. One way of
+achieving this scene understanding is via scene segmentation, where every
+pixel of a frame is classified and therefore identifies the visible structures
+and tissues. Progress on fully segmenting surgical scenes has been made using
+machine learning. However, such models require large amounts of annotated
+training data, containing examples of all relevant object classes. Such fully
+annotated datasets are hard to create, as every pixel in a frame needs to be
+annotated by medical experts; they are therefore rarely available. In this
+work, we propose a method to combine multiple partially annotated datasets,
+which provide complementary annotations, into one model, enabling better scene
+segmentation and the use of multiple readily available datasets. Our method
+aims to combine available data with complementary labels by leveraging
+mutually exclusive properties to maximize information. Specifically, we
+propose to use positive annotations of other classes as negative samples and
+to exclude background pixels of binary annotations, as we cannot tell if they
+contain a class not annotated but predicted by the model. We evaluate our
+method by training a DeepLabV3 on the publicly available Dresden Surgical
+Anatomy Dataset, which provides multiple subsets of binary segmented
+anatomical structures. Our approach successfully combines 6 classes into one
+model, increasing the overall Dice Score by 4.4% compared to an ensemble of
+models trained on the classes individually. By including information on
+multiple classes, we were able to reduce confusion between stomach and colon
+by 24%. Our results demonstrate the feasibility of training a model on
+multiple datasets. This paves the way for future work that further alleviates
+the need for a single large, fully segmented dataset.
+
+</p>
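+ A sketch of the label-combination rule described above for one sample from a
+binary-annotated dataset: annotated positives keep their class, known
+positives of other classes act as negatives, and unannotated background is
+ignored. The index conventions are assumptions, not the authors' code:
+
+# Build a per-pixel target with an ignore label, then a masked cross-entropy.
+import torch
+import torch.nn.functional as F
+
+IGNORE = 255
+
+def build_target(binary_mask, class_id, other_positives=None):
+    """binary_mask: (H, W) bool annotation for `class_id`;
+    other_positives: optional dict {other_class_id: (H, W) bool mask}."""
+    target = torch.full(binary_mask.shape, IGNORE, dtype=torch.long)
+    target[binary_mask] = class_id
+    if other_positives:
+        for cid, m in other_positives.items():
+            target[m & ~binary_mask] = cid   # positives of other classes as negatives
+    return target
+
+def masked_ce(logits, target):
+    """logits: (1, C, H, W); pixels marked IGNORE contribute no gradient."""
+    return F.cross_entropy(logits, target.unsqueeze(0), ignore_index=IGNORE)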
+
+ comment: Accepted at IPCAI 2024; submitted to IJCARS (under revision) +
+
+
+
+
+ + ♻ ☆ Part-Attention Based Model Make Occluded Person Re-Identification + Stronger + + +
+ The goal of occluded person re-identification (ReID) is to retrieve specific
+pedestrians in occluded situations. However, occluded person ReID still
+suffers from background clutter and low-quality local feature representations,
+which limit model performance. In our research, we introduce a new framework
+called PAB-ReID, a novel ReID model that incorporates part-attention
+mechanisms to tackle the aforementioned issues effectively. Firstly, we
+introduce human parsing labels to guide the generation of more accurate human
+part attention maps. In addition, we propose a fine-grained feature focuser
+for generating fine-grained human local feature representations while
+suppressing background interference. Moreover, we design a part triplet loss
+to supervise the learning of human local features, which optimizes intra- and
+inter-class distances. We conducted extensive experiments on specialized
+occlusion and regular ReID datasets, showcasing that our approach outperforms
+the existing state-of-the-art methods.
+
+</p>
+
+ comment: Accepted By International Joint Conference on Neural Networks 2024 +
+
+
+
+
+ + ♻ ☆ OMH: Structured Sparsity via Optimally Matched Hierarchy for + Unsupervised Semantic Segmentation + + +
+ Unsupervised Semantic Segmentation (USS) involves segmenting images without +relying on predefined labels, aiming to alleviate the burden of extensive human +labeling. Existing methods utilize features generated by self-supervised models +and specific priors for clustering. However, their clustering objectives are +not involved in the optimization of the features during training. Additionally, +due to the lack of clear class definitions in USS, the resulting segments may +not align well with the clustering objective. In this paper, we introduce a +novel approach called Optimally Matched Hierarchy (OMH) to simultaneously +address the above issues. The core of our method lies in imposing structured +sparsity on the feature space, which allows the features to encode information +with different levels of granularity. The structure of this sparsity stems from +our hierarchy (OMH). To achieve this, we learn a soft but sparse hierarchy +among parallel clusters through Optimal Transport. Our OMH yields better +unsupervised segmentation performance compared to existing USS methods. Our +extensive experiments demonstrate the benefits of OMH when utilizing our +differentiable paradigm. We will make our code publicly available. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ SADA: Semantic adversarial unsupervised domain adaptation for Temporal + Action Localization + + +
+ Temporal Action Localization (TAL) is a complex task that poses relevant +challenges, particularly when attempting to generalize on new -- unseen -- +domains in real-world applications. These scenarios, despite realistic, are +often neglected in the literature, exposing these solutions to important +performance degradation. In this work, we tackle this issue by introducing, for +the first time, an approach for Unsupervised Domain Adaptation (UDA) in sparse +TAL, which we refer to as Semantic Adversarial unsupervised Domain Adaptation +(SADA). Our contributions are threefold: (1) we pioneer the development of a +domain adaptation model that operates on realistic sparse action detection +benchmarks; (2) we tackle the limitations of global-distribution alignment +techniques by introducing a novel adversarial loss that is sensitive to local +class distributions, ensuring finer-grained adaptation; and (3) we present a +novel set of benchmarks based on EpicKitchens100 and CharadesEgo, that evaluate +multiple domain shifts in a comprehensive manner. Our experiments indicate that +SADA improves the adaptation across domains when compared to fully supervised +state-of-the-art and alternative UDA methods, attaining a performance boost of +up to 6.14% mAP. + +
+
+
+
+
+ + ♻ ☆ SCILLA: SurfaCe Implicit Learning for Large Urban Area, a volumetric + hybrid solution + + +
+ Neural implicit surface representation methods have recently shown impressive +3D reconstruction results. However, existing solutions struggle to reconstruct +urban outdoor scenes due to their large, unbounded, and highly detailed nature. +Hence, to achieve accurate reconstructions, additional supervision data such as +LiDAR, strong geometric priors, and long training times are required. To tackle +such issues, we present SCILLA, a new hybrid implicit surface learning method +to reconstruct large driving scenes from 2D images. SCILLA's hybrid +architecture models two separate implicit fields: one for the volumetric +density and another for the signed distance to the surface. To accurately +represent urban outdoor scenarios, we introduce a novel volume-rendering +strategy that relies on self-supervised probabilistic density estimation to +sample points near the surface and transition progressively from volumetric to +surface representation. Our solution permits a proper and fast initialization +of the signed distance field without relying on any geometric prior on the +scene, compared to concurrent methods. By conducting extensive experiments on +four outdoor driving datasets, we show that SCILLA can learn an accurate and +detailed 3D surface scene representation in various urban scenarios while being +two times faster to train compared to previous state-of-the-art solutions. + +
+
+
+
+
+ + ♻ ☆ QuickQuakeBuildings: Post-earthquake SAR-Optical Dataset for Quick + Damaged-building Detection + + +
+ Quick and automated earthquake-damaged building detection from post-event +satellite imagery is crucial, yet it is challenging due to the scarcity of +training data required to develop robust algorithms. This letter presents the +first dataset dedicated to detecting earthquake-damaged buildings from +post-event very high resolution (VHR) Synthetic Aperture Radar (SAR) and +optical imagery. Utilizing open satellite imagery and annotations acquired +after the 2023 Turkey-Syria earthquakes, we deliver a dataset of coregistered +building footprints and satellite image patches of both SAR and optical data, +encompassing more than four thousand buildings. The task of damaged building +detection is formulated as a binary image classification problem, that can also +be treated as an anomaly detection problem due to extreme class imbalance. We +provide baseline methods and results to serve as references for comparison. +Researchers can utilize this dataset to expedite algorithm development, +facilitating the rapid detection of damaged buildings in response to future +events. The dataset and codes together with detailed explanations and +visualization are made publicly available at +\url{https://github.com/ya0-sun/PostEQ-SARopt-BuildingDamage}. + +
+
+
+
+
+ + ♻ ☆ SPOT: Self-Training with Patch-Order Permutation for Object-Centric + Learning with Autoregressive Transformers CVPR 2024 + + +
+ Unsupervised object-centric learning aims to decompose scenes into +interpretable object entities, termed slots. Slot-based auto-encoders stand out +as a prominent method for this task. Within them, crucial aspects include +guiding the encoder to generate object-specific slots and ensuring the decoder +utilizes them during reconstruction. This work introduces two novel techniques, +(i) an attention-based self-training approach, which distills superior +slot-based attention masks from the decoder to the encoder, enhancing object +segmentation, and (ii) an innovative patch-order permutation strategy for +autoregressive transformers that strengthens the role of slot vectors in +reconstruction. The effectiveness of these strategies is showcased +experimentally. The combined approach significantly surpasses prior slot-based +autoencoder methods in unsupervised object segmentation, especially with +complex real-world images. We provide the implementation code at +https://github.com/gkakogeorgiou/spot . + +
+
+ comment: CVPR 2024 (Highlight). Code: https://github.com/gkakogeorgiou/spot +
+
+
+
+
+ + ♻ ☆ Learning Enriched Features via Selective State Spaces Model for + Efficient Image Deblurring + + +
+ Image deblurring aims to restore a high-quality image from its corresponding
+blurred observation. The emergence of CNNs and Transformers has enabled
+significant progress. However, these methods often face a dilemma between
+eliminating long-range degradation perturbations and maintaining computational
+efficiency. While the selective state space model (SSM) shows promise in
+modeling long-range dependencies with linear complexity, it also encounters
+challenges such as local pixel forgetting and channel redundancy. To address
+this issue, we propose an efficient image deblurring network that leverages a
+selective state space model to aggregate enriched and accurate features.
+Specifically, we introduce an aggregate local and global information block
+(ALGBlock) designed to effectively capture and integrate both local invariant
+properties and non-local information. The ALGBlock comprises two primary
+modules: a module for capturing local and global features (CLGF), and a
+feature aggregation module (FA). The CLGF module is composed of two branches:
+the global branch captures long-range dependency features via a selective
+state space model, while the local branch employs simplified channel attention
+to model local connectivity, thereby reducing local pixel forgetting and
+channel redundancy. In addition, we design the FA module to accentuate the
+local part by recalibrating the weights during the aggregation of the two
+branches for restoration. Experimental results demonstrate that the proposed
+method outperforms state-of-the-art approaches on widely used benchmarks.
+
+</p>
+
+
+
+
+ + ♻ ☆ Sculpting Holistic 3D Representation in Contrastive Language-Image-3D + Pre-training CVPR 2024 + + +
+ Contrastive learning has emerged as a promising paradigm for 3D open-world +understanding, i.e., aligning point cloud representation to image and text +embedding space individually. In this paper, we introduce MixCon3D, a simple +yet effective method aiming to sculpt holistic 3D representation in contrastive +language-image-3D pre-training. In contrast to point cloud only, we develop the +3D object-level representation from complementary perspectives, e.g., +multi-view rendered images with the point cloud. Then, MixCon3D performs +language-3D contrastive learning, comprehensively depicting real-world 3D +objects and bolstering text alignment. Additionally, we pioneer the first +thorough investigation of various training recipes for the 3D contrastive +learning paradigm, building a solid baseline with improved performance. +Extensive experiments conducted on three representative benchmarks reveal that +our method significantly improves over the baseline, surpassing the previous +state-of-the-art performance on the challenging 1,156-category Objaverse-LVIS +dataset by 5.7%. The versatility of MixCon3D is showcased in applications such +as text-to-3D retrieval and point cloud captioning, further evidencing its +efficacy in diverse scenarios. The code is available at +https://github.com/UCSC-VLAA/MixCon3D. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ MO-YOLO: End-to-End Multiple-Object Tracking Method with YOLO and + Decoder + + +
+ In the field of multi-object tracking (MOT), recent Transformer-based
+end-to-end models like MOTR have demonstrated exceptional performance on
+datasets such as DanceTrack. However, the computational demands of these
+models present challenges in training and deployment. Drawing inspiration from
+successful models like GPT, we present MO-YOLO, an efficient and
+computationally frugal end-to-end MOT model. MO-YOLO integrates principles from
+You Only Look Once (YOLO) and RT-DETR, adopting a decoder-only approach. By
+leveraging the decoder from RT-DETR and architectural components from YOLOv8,
+MO-YOLO achieves high speed, shorter training times, and proficient MOT
+performance. On DanceTrack, MO-YOLO not only matches MOTR's performance but
+also surpasses it, achieving over twice the frames per second (MOTR 9.5 FPS,
+MO-YOLO 19.6 FPS). Furthermore, MO-YOLO demonstrates significantly reduced
+training times and lower hardware requirements compared to MOTR. This research
+introduces a promising paradigm for efficient end-to-end MOT, emphasizing
+enhanced performance and resource efficiency.
+
+
+
+
+
+ + ♻ ☆ GaussianCube: Structuring Gaussian Splatting using Optimal Transport for + 3D Generative Modeling + + +
+ 3D Gaussian Splatting (GS) has achieved considerable improvement over Neural
+Radiance Fields in terms of 3D fitting fidelity and rendering speed. However,
+this unstructured representation with scattered Gaussians poses a significant
+challenge for generative modeling. To address the problem, we introduce
+GaussianCube, a structured GS representation that is both powerful and
+efficient for generative modeling. We achieve this by first proposing a
+modified densification-constrained GS fitting algorithm which can yield
+high-quality fitting results using a fixed number of free Gaussians, and then
+re-arranging the Gaussians into a predefined voxel grid via Optimal Transport.
+The structured grid representation allows us to use a standard 3D U-Net as our
+backbone in diffusion generative modeling without elaborate designs. Extensive
+experiments conducted on ShapeNet and OmniObject3D show that our model achieves
+state-of-the-art generation results both qualitatively and quantitatively,
+underscoring the potential of GaussianCube as a powerful and versatile 3D
+representation.
+
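+
+ The core re-arrangement step can be illustrated with a small balanced-assignment
+example (a special case of discrete optimal transport). The sketch below is
+illustrative only; the grid resolution, domain bounds, and the use of the
+Hungarian solver are assumptions rather than the authors' implementation.
+
+```python
+# Sketch of the idea only: assign a fixed number of fitted Gaussians to the
+# centers of a predefined voxel grid by minimizing total squared transport cost.
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+def structure_gaussians(centers, grid_res=4, bound=1.0):
+    """centers: (N, 3) Gaussian means with N == grid_res**3."""
+    axis = np.linspace(-bound, bound, grid_res)
+    grid = np.stack(np.meshgrid(axis, axis, axis, indexing="ij"), -1).reshape(-1, 3)
+    cost = ((centers[:, None, :] - grid[None, :, :]) ** 2).sum(-1)   # transport cost
+    rows, cols = linear_sum_assignment(cost)                         # one Gaussian per voxel
+    order = np.empty_like(cols)
+    order[cols] = rows                                               # voxel -> Gaussian index
+    return centers[order].reshape(grid_res, grid_res, grid_res, 3)
+
+cube = structure_gaussians(np.random.uniform(-1, 1, size=(64, 3)))
+print(cube.shape)   # (4, 4, 4, 3), a grid a standard 3D U-Net can consume
+```
+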
+
+ comment: Fix typo in Eq.2; Project Page: https://gaussiancube.github.io/ +
+
+
+
+
+ + ♻ ☆ Mind the Exit Pupil Gap: Revisiting the Intrinsics of a Standard + Plenoptic Camera + + +
+ Among the common applications of plenoptic cameras are depth reconstruction +and post-shot refocusing. These require a calibration relating the camera-side +light field to that of the scene. Numerous methods with this goal have been +developed based on thin lens models for the plenoptic camera's main lens and +microlenses. Our work addresses the often-overlooked role of the main lens exit +pupil in these models and specifically in the decoding process of standard +plenoptic camera (SPC) images. We formally deduce the connection between the +refocusing distance and the resampling parameter for the decoded light field +and provide an analysis of the errors that arise when the exit pupil is not +considered. In addition, previous work is revisited with respect to the exit +pupil's role and all theoretical results are validated through a +ray-tracing-based simulation. With the public release of the evaluated SPC +designs alongside our simulation and experimental data we aim to contribute to +a more accurate and nuanced understanding of plenoptic camera optics. + +
+
+ comment: 29 pages, 16 figures, Accepted for publication in MDPI Sensors,
+ Special Issue 'Short-Range Optical 3D Scanning and 3D Data Processing'
+
+
+
+
+ + ♻ ☆ Theoretical and Empirical Analysis of a Fast Algorithm for Extracting + Polygons from Signed Distance Bounds + + +
+ Recently, there has been renewed interest in signed distance bound
+representations due to their unique properties for 3D shape modelling. This is
+especially the case for deep learning-based bounds. However, it is beneficial
+to work with polygons in most computer-graphics applications. Thus, in this
+paper we introduce and investigate an asymptotically fast method for
+transforming signed distance bounds into polygon meshes. This is achieved by
+combining the principles of sphere tracing (or ray marching) with traditional
+polygonization techniques, such as Marching Cubes. We provide theoretical and
+experimental evidence that this approach has $O(N^2\log N)$ computational
+complexity for a polygonization grid with $N^3$ cells. The algorithm is tested
+on both a set of primitive shapes and signed distance bounds generated from
+point clouds by machine learning (and represented as neural networks). Given
+its speed, implementation simplicity and portability, we argue that it could
+prove useful during the modelling stage as well as in shape compression for
+storage.
+ The code is available here: https://github.com/nenadmarkus/gridhopping
+
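+
+ As a rough illustration of how sphere tracing lets the polygonizer skip empty
+grid cells, the sketch below marches a ray through each column of the grid and
+advances by the signed distance bound. It assumes a unit-cube domain and an
+exact sphere SDF, and it only marks cells to polygonize rather than running the
+Marching Cubes step, so details differ from the released gridhopping code.
+
+```python
+# Illustrative sketch: hop along each (i, j) column of an N^3 grid using the
+# signed distance bound, so only cells near the surface are visited.
+import numpy as np
+
+def sdf_sphere(p, r=0.4):
+    return np.linalg.norm(p) - r
+
+def cells_near_surface(sdf, N=32, eps=None):
+    h = 1.0 / N
+    eps = eps if eps is not None else h          # mark cells within one cell of the surface
+    hits = set()
+    for i in range(N):
+        for j in range(N):
+            x, y = (i + 0.5) * h - 0.5, (j + 0.5) * h - 0.5
+            z = -0.5
+            while z < 0.5:
+                d = sdf(np.array([x, y, z]))
+                if abs(d) < eps:
+                    hits.add((i, j, int((z + 0.5) / h)))   # polygonize this cell later
+                    z += h                                  # step one cell past the surface
+                else:
+                    z += max(abs(d), h)                     # hop over empty space
+    return hits
+
+print(len(cells_near_surface(sdf_sphere)))   # roughly O(N^2) of the N^3 cells are marked
+```
+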
+
+
+
+
+ + ♻ ☆ InstantAvatar: Efficient 3D Head Reconstruction via Surface Rendering + + +
+ Recent advances in full-head reconstruction have been obtained by optimizing
+a neural field through differentiable surface or volume rendering to represent
+a single scene. While these techniques achieve unprecedented accuracy, they
+take several minutes, or even hours, due to the expensive optimization process
+required. In this work, we introduce InstantAvatar, a method that recovers
+full-head avatars from few images (down to just one) in a few seconds on
+commodity hardware. In order to speed up the reconstruction process, we propose
+a system that combines, for the first time, a voxel-grid neural field
+representation with a surface renderer. Notably, a naive combination of these
+two techniques leads to unstable optimizations that do not converge to valid
+solutions. In order to overcome this limitation, we present a novel statistical
+model that learns a prior distribution over 3D head signed distance functions
+using a voxel-grid based architecture. The use of this prior model, in
+combination with other design choices, results in a system that achieves 3D
+head reconstructions with accuracy comparable to the state-of-the-art with a
+100x speed-up.
+
+
+
+
+
+ + ♻ ☆ EVREAL: Towards a Comprehensive Benchmark and Analysis Suite for + Event-based Video Reconstruction CVPR + + +
+ Event cameras are a new type of vision sensor that incorporates asynchronous +and independent pixels, offering advantages over traditional frame-based +cameras such as high dynamic range and minimal motion blur. However, their +output is not easily understandable by humans, making the reconstruction of +intensity images from event streams a fundamental task in event-based vision. +While recent deep learning-based methods have shown promise in video +reconstruction from events, this problem is not completely solved yet. To +facilitate comparison between different approaches, standardized evaluation +protocols and diverse test datasets are essential. This paper proposes a +unified evaluation methodology and introduces an open-source framework called +EVREAL to comprehensively benchmark and analyze various event-based video +reconstruction methods from the literature. Using EVREAL, we give a detailed +analysis of the state-of-the-art methods for event-based video reconstruction, +and provide valuable insights into the performance of these methods under +varying settings, challenging scenarios, and downstream tasks. + +
+
+ comment: 19 pages, 9 figures. Has been accepted for publication at the IEEE + Conference on Computer Vision and Pattern Recognition Workshops (CVPRW), + Vancouver, 2023. The project page can be found at + https://ercanburak.github.io/evreal.html +
+
+
+
+
+ + ♻ ☆ Causal Mode Multiplexer: A Novel Framework for Unbiased Multispectral + Pedestrian Detection CVPR2024 + + +
+ RGBT multispectral pedestrian detection has emerged as a promising solution
+for safety-critical applications that require day/night operations. However,
+the modality bias problem remains unsolved, as multispectral pedestrian
+detectors learn the statistical bias in datasets. Specifically, datasets in
+multispectral pedestrian detection are mainly distributed between ROTO (day)
+and RXTO (night) data; the majority of the pedestrian labels statistically
+co-occur with their thermal features. As a result, multispectral pedestrian
+detectors show poor generalization ability on examples beyond this statistical
+correlation, such as ROTX data. To address this problem, we propose a novel
+Causal Mode Multiplexer (CMM) framework that effectively learns the causalities
+between multispectral inputs and predictions. Moreover, we construct a new
+dataset (ROTX-MP) to evaluate modality bias in multispectral pedestrian
+detection. ROTX-MP mainly includes ROTX examples not present in previous
+datasets. Extensive experiments demonstrate that our proposed CMM framework
+generalizes well on existing datasets (KAIST, CVC-14, FLIR) and the new
+ROTX-MP. We will release our new dataset to the public for future research.
+
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ Learning Prompt with Distribution-Based Feature Replay for Few-Shot + Class-Incremental Learning + + +
+ Few-shot Class-Incremental Learning (FSCIL) aims to continuously learn new
+classes based on very limited training data without forgetting the old ones
+encountered. Existing studies rely solely on pure visual networks, while in
+this paper we solve FSCIL by leveraging a vision-language model (e.g., CLIP)
+and propose a simple yet effective framework, named Learning Prompt with
+Distribution-based Feature Replay (LP-DiF). We observe that simply using CLIP
+for zero-shot evaluation can substantially outperform the most influential
+methods. Then, a prompt tuning technique is introduced to further improve its
+adaptation ability, allowing the model to continually capture specific
+knowledge from each session. To prevent the learnable prompt from forgetting
+old knowledge in the new session, we propose a pseudo-feature replay approach.
+Specifically, we preserve the old knowledge of each class by maintaining a
+feature-level Gaussian distribution with a diagonal covariance matrix, which is
+estimated from the image features of training images and synthesized features
+generated from a VAE. When progressing to a new session, pseudo-features are
+sampled from old-class distributions and combined with training images of the
+current session to optimize the prompt, thus enabling the model to learn new
+knowledge while retaining old knowledge. Experiments on three prevalent
+benchmarks, i.e., CIFAR100, mini-ImageNet, CUB-200, and two more challenging
+benchmarks, i.e., SUN-397 and CUB-200$^*$ proposed in this paper, showcase the
+superiority of LP-DiF, achieving new state-of-the-art (SOTA) results in FSCIL.
+Code is publicly available at https://github.com/1170300714/LP-DiF.
+
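+
+ The pseudo-feature replay component can be sketched in a few lines: keep a
+per-class diagonal Gaussian over image features and sample from it when
+optimizing the prompt in later sessions. The feature dimension and the
+interface below are assumptions for illustration, and the prompt-tuning loss
+itself is omitted.
+
+```python
+# Sketch of distribution-based feature replay, assuming CLIP-style image
+# features of dimension 512 (an assumption, not the authors' exact setup).
+import torch
+
+class ClassGaussianBank:
+    """Per-class diagonal Gaussian over image features, used to replay old classes."""
+    def __init__(self):
+        self.stats = {}                      # class_id -> (mean, std)
+
+    def update(self, class_id, feats):       # feats: (n_images, d)
+        self.stats[class_id] = (feats.mean(0), feats.std(0) + 1e-6)
+
+    def sample(self, class_id, n):
+        mean, std = self.stats[class_id]
+        return mean + std * torch.randn(n, mean.shape[0])   # pseudo-features
+
+bank = ClassGaussianBank()
+bank.update(class_id=0, feats=torch.randn(5, 512))           # old-session class
+replayed = bank.sample(class_id=0, n=16)                     # mixed with new-session data
+print(replayed.shape)                                         # torch.Size([16, 512])
+```
+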
+
+
+
+
+ + ♻ ☆ Your Student is Better Than Expected: Adaptive Teacher-Student + Collaboration for Text-Conditional Diffusion Models CVPR2024 + + +
+ Knowledge distillation methods have recently been shown to be a promising
+direction for speeding up the synthesis of large-scale diffusion models by
+requiring only a few inference steps. While several powerful distillation
+methods were recently proposed, the overall quality of student samples is
+typically lower compared to the teacher ones, which hinders their practical
+usage. In this work, we investigate the relative quality of samples produced by
+the teacher text-to-image diffusion model and its distilled student version. As
+our main empirical finding, we discover that a noticeable portion of student
+samples exhibit superior fidelity compared to the teacher ones, despite the
+"approximate" nature of the student. Based on this finding, we propose an
+adaptive collaboration between student and teacher diffusion models for
+effective text-to-image synthesis. Specifically, the distilled model produces
+the initial sample, and then an oracle decides whether it needs further
+improvements with a slow teacher model. Extensive experiments demonstrate that
+the designed pipeline surpasses state-of-the-art text-to-image alternatives for
+various inference budgets in terms of human preference. Furthermore, the
+proposed approach can be naturally used in popular applications such as
+text-guided image editing and controllable generation.
+
+
+ comment: CVPR2024 camera ready v2 +
+
+
+
+
+ + ♻ ☆ Predicting Traffic Flow with Federated Learning and Graph Neural with + Asynchronous Computations Network + + +
+ Real-time traffic flow prediction holds significant importance within the +domain of Intelligent Transportation Systems (ITS). The task of achieving a +balance between prediction precision and computational efficiency presents a +significant challenge. In this article, we present a novel deep-learning method +called Federated Learning and Asynchronous Graph Convolutional Network +(FLAGCN). Our framework incorporates the principles of asynchronous graph +convolutional networks with federated learning to enhance the accuracy and +efficiency of real-time traffic flow prediction. The FLAGCN model employs a +spatial-temporal graph convolution technique to asynchronously address +spatio-temporal dependencies within traffic data effectively. To efficiently +handle the computational requirements associated with this deep learning model, +this study used a graph federated learning technique known as GraphFL. This +approach is designed to facilitate the training process. The experimental +results obtained from conducting tests on two distinct traffic datasets +demonstrate that the utilization of FLAGCN leads to the optimization of both +training and inference durations while maintaining a high level of prediction +accuracy. FLAGCN outperforms existing models with significant improvements by +achieving up to approximately 6.85% reduction in RMSE, 20.45% reduction in +MAPE, compared to the best-performing existing models. + +
+
+ comment: I request to withdraw my paper from arXiv due to significant updates + and improvements identified post-submission. These enhancements will + substantially elevate the work's quality and impact. I plan to resubmit the + revised paper upon completion of these updates. Thank you for accommodating + this request +
+
+
+
+
+ + ♻ ☆ Generalizable Whole Slide Image Classification with Fine-Grained + Visual-Semantic Interaction CVPR 2024 + + +
+ Whole Slide Image (WSI) classification is often formulated as a Multiple +Instance Learning (MIL) problem. Recently, Vision-Language Models (VLMs) have +demonstrated remarkable performance in WSI classification. However, existing +methods leverage coarse-grained pathogenetic descriptions for visual +representation supervision, which are insufficient to capture the complex +visual appearance of pathogenetic images, hindering the generalizability of +models on diverse downstream tasks. Additionally, processing high-resolution +WSIs can be computationally expensive. In this paper, we propose a novel +"Fine-grained Visual-Semantic Interaction" (FiVE) framework for WSI +classification. It is designed to enhance the model's generalizability by +leveraging the interaction between localized visual patterns and fine-grained +pathological semantics. Specifically, with meticulously designed queries, we +start by utilizing a large language model to extract fine-grained pathological +descriptions from various non-standardized raw reports. The output descriptions +are then reconstructed into fine-grained labels used for training. By +introducing a Task-specific Fine-grained Semantics (TFS) module, we enable +prompts to capture crucial visual information in WSIs, which enhances +representation learning and augments generalization capabilities significantly. +Furthermore, given that pathological visual patterns are redundantly +distributed across tissue slices, we sample a subset of visual instances during +training. Our method demonstrates robust generalizability and strong +transferability, dominantly outperforming the counterparts on the TCGA Lung +Cancer dataset with at least 9.19% higher accuracy in few-shot experiments. The +code is available at: https://github.com/ls1rius/WSI_FiVE. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Dynamic Adapter Meets Prompt Tuning: Parameter-Efficient Transfer + Learning for Point Cloud Analysis CVPR 2024 + + +
+ Point cloud analysis has achieved outstanding performance by transferring +point cloud pre-trained models. However, existing methods for model adaptation +usually update all model parameters, i.e., full fine-tuning paradigm, which is +inefficient as it relies on high computational costs (e.g., training GPU +memory) and massive storage space. In this paper, we aim to study +parameter-efficient transfer learning for point cloud analysis with an ideal +trade-off between task performance and parameter efficiency. To achieve this +goal, we freeze the parameters of the default pre-trained models and then +propose the Dynamic Adapter, which generates a dynamic scale for each token, +considering the token significance to the downstream task. We further +seamlessly integrate Dynamic Adapter with Prompt Tuning (DAPT) by constructing +Internal Prompts, capturing the instance-specific features for interaction. +Extensive experiments conducted on five challenging datasets demonstrate that +the proposed DAPT achieves superior performance compared to the full +fine-tuning counterparts while significantly reducing the trainable parameters +and training GPU memory by 95% and 35%, respectively. Code is available at +https://github.com/LMD0311/DAPT. + +
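+
+ A minimal sketch of the dynamic-adapter idea, with the backbone frozen and only
+the adapter trained: a bottleneck adapter whose residual update is scaled by a
+per-token significance score predicted from the token itself. Hidden sizes and
+the sigmoid gating are assumptions; the authors' repository contains the actual
+DAPT design.
+
+```python
+# Illustrative sketch (not the released DAPT code): bottleneck adapter with a
+# token-wise dynamic scale; backbone tokens pass through unchanged otherwise.
+import torch
+import torch.nn as nn
+
+class DynamicAdapterSketch(nn.Module):
+    def __init__(self, dim=384, bottleneck=16):
+        super().__init__()
+        self.down = nn.Linear(dim, bottleneck)
+        self.up = nn.Linear(bottleneck, dim)
+        self.scale = nn.Linear(dim, 1)        # token significance -> dynamic scale
+
+    def forward(self, tokens):                # tokens: (batch, n_tokens, dim)
+        delta = self.up(torch.relu(self.down(tokens)))
+        s = torch.sigmoid(self.scale(tokens))             # (batch, n_tokens, 1)
+        return tokens + s * delta                          # only adapter params are trained
+
+tokens = torch.randn(2, 128, 384)
+print(DynamicAdapterSketch()(tokens).shape)               # torch.Size([2, 128, 384])
+```
+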
+
+ comment: Accepted to CVPR 2024. Code is available at + https://github.com/LMD0311/DAPT +
+
+
+
+
+ + ♻ ☆ CapsFusion: Rethinking Image-Text Data at Scale CVPR 2024 + + +
+ Large multimodal models demonstrate remarkable generalist ability to perform +diverse multimodal tasks in a zero-shot manner. Large-scale web-based +image-text pairs contribute fundamentally to this success, but suffer from +excessive noise. Recent studies use alternative captions synthesized by +captioning models and have achieved notable benchmark performance. However, our +experiments reveal significant Scalability Deficiency and World Knowledge Loss +issues in models trained with synthetic captions, which have been largely +obscured by their initial benchmark success. Upon closer examination, we +identify the root cause as the overly-simplified language structure and lack of +knowledge details in existing synthetic captions. To provide higher-quality and +more scalable multimodal pretraining data, we propose CapsFusion, an advanced +framework that leverages large language models to consolidate and refine +information from both web-based image-text pairs and synthetic captions. +Extensive experiments show that CapsFusion captions exhibit remarkable +all-round superiority over existing captions in terms of model performance +(e.g., 18.8 and 18.3 improvements in CIDEr score on COCO and NoCaps), sample +efficiency (requiring 11-16 times less computation than baselines), world +knowledge depth, and scalability. These effectiveness, efficiency and +scalability advantages position CapsFusion as a promising candidate for future +scaling of LMM training. + +
+
+ comment: CVPR 2024. Code & Dataset: https://github.com/baaivision/CapsFusion +
+
+
+
+
+ + ♻ ☆ Visual Program Distillation: Distilling Tools and Programmatic Reasoning + into Vision-Language Models CVPR 2024 + + +
+ Solving complex visual tasks such as "Who invented the musical instrument on +the right?" involves a composition of skills: understanding space, recognizing +instruments, and also retrieving prior knowledge. Recent work shows promise by +decomposing such tasks using a large language model (LLM) into an executable +program that invokes specialized vision models. However, generated programs are +error-prone: they omit necessary steps, include spurious ones, and are unable +to recover when the specialized models give incorrect outputs. Moreover, they +require loading multiple models, incurring high latency and computation costs. +We propose Visual Program Distillation (VPD), an instruction tuning framework +that produces a vision-language model (VLM) capable of solving complex visual +tasks with a single forward pass. VPD distills the reasoning ability of LLMs by +using them to sample multiple candidate programs, which are then executed and +verified to identify a correct one. It translates each correct program into a +language description of the reasoning steps, which are then distilled into a +VLM. Extensive experiments show that VPD improves the VLM's ability to count, +understand spatial relations, and reason compositionally. Our VPD-trained +PaLI-X outperforms all prior VLMs, achieving state-of-the-art performance +across complex vision tasks, including MMBench, OK-VQA, A-OKVQA, TallyQA, POPE, +and Hateful Memes. An evaluation with human annotators also confirms that VPD +improves model response factuality and consistency. Finally, experiments on +content moderation demonstrate that VPD is also helpful for adaptation to +real-world applications with limited data. + +
+
+ comment: CVPR 2024 Oral +
+
+
+
+
+ + ♻ ☆ Detecting Heart Disease from Multi-View Ultrasound Images via Supervised + Attention Multiple Instance Learning + + +
+ Aortic stenosis (AS) is a degenerative valve condition that causes +substantial morbidity and mortality. This condition is under-diagnosed and +under-treated. In clinical practice, AS is diagnosed with expert review of +transthoracic echocardiography, which produces dozens of ultrasound images of +the heart. Only some of these views show the aortic valve. To automate +screening for AS, deep networks must learn to mimic a human expert's ability to +identify views of the aortic valve then aggregate across these relevant images +to produce a study-level diagnosis. We find previous approaches to AS detection +yield insufficient accuracy due to relying on inflexible averages across +images. We further find that off-the-shelf attention-based multiple instance +learning (MIL) performs poorly. We contribute a new end-to-end MIL approach +with two key methodological innovations. First, a supervised attention +technique guides the learned attention mechanism to favor relevant views. +Second, a novel self-supervised pretraining strategy applies contrastive +learning on the representation of the whole study instead of individual images +as commonly done in prior literature. Experiments on an open-access dataset and +an external validation set show that our approach yields higher accuracy while +reducing model size. + +
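+
+ The two-part objective can be sketched as attention-based MIL pooling plus a
+supervised-attention term that pushes the attention distribution toward images
+labeled as relevant views. The feature dimension, the cross-entropy form of the
+supervision, and the loss weight below are assumptions for illustration.
+
+```python
+# Sketch of supervised attention MIL over one echocardiography study.
+import torch
+import torch.nn as nn
+
+class SupervisedAttentionMIL(nn.Module):
+    def __init__(self, dim=128):
+        super().__init__()
+        self.attn = nn.Linear(dim, 1)
+        self.cls = nn.Linear(dim, 1)
+
+    def forward(self, feats, view_relevance=None, lam=1.0):
+        """feats: (n_images, dim); view_relevance: (n_images,) with 1 = shows the valve."""
+        a = torch.softmax(self.attn(feats).squeeze(-1), dim=0)        # attention over the study
+        study_logit = self.cls((a.unsqueeze(0) @ feats).squeeze(0))   # study-level prediction
+        attn_loss = torch.tensor(0.0)
+        if view_relevance is not None:
+            target = view_relevance / view_relevance.sum().clamp(min=1e-6)
+            attn_loss = lam * -(target * a.clamp_min(1e-8).log()).sum()  # favor relevant views
+        return study_logit, attn_loss
+
+model = SupervisedAttentionMIL()
+logit, aux = model(torch.randn(12, 128), view_relevance=torch.tensor([1.0] * 3 + [0.0] * 9))
+print(logit.shape, float(aux) >= 0)
+```
+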
+
+ comment: Echocardiogram; multiple-instance learning; self-supervised learning; + semi-supervised learning; medical imaging +
+
+
+
+
+ + ♻ ☆ FashionEngine: Interactive Generation and Editing of 3D Clothed Humans + + +
+ We present FashionEngine, an interactive 3D human generation and editing +system that allows us to design 3D digital humans in a way that aligns with how +humans interact with the world, such as natural languages, visual perceptions, +and hand-drawing. FashionEngine automates the 3D human production with three +key components: 1) A pre-trained 3D human diffusion model that learns to model +3D humans in a semantic UV latent space from 2D image training data, which +provides strong priors for diverse generation and editing tasks. 2) +Multimodality-UV Space encoding the texture appearance, shape topology, and +textual semantics of human clothing in a canonical UV-aligned space, which +faithfully aligns the user multimodal inputs with the implicit UV latent space +for controllable 3D human editing. The multimodality-UV space is shared across +different user inputs, such as texts, images, and sketches, which enables +various joint multimodal editing tasks. 3) Multimodality-UV Aligned Sampler +learns to sample high-quality and diverse 3D humans from the diffusion prior +for multimodal user inputs. Extensive experiments validate FashionEngine's +state-of-the-art performance for conditional generation/editing tasks. In +addition, we present an interactive user interface for our FashionEngine that +enables both conditional and unconditional generation tasks, and editing tasks +including pose/view/shape control, text-, image-, and sketch-driven 3D human +editing and 3D virtual try-on, in a unified framework. Our project page is at: +https://taohuumd.github.io/projects/FashionEngine. + +
+
+ comment: Project Page: https://taohuumd.github.io/projects/FashionEngine +
+
+
+
+
+ + ♻ ☆ WaterVG: Waterway Visual Grounding based on Text-Guided Vision and + mmWave Radar + + +
+ The perception of waterways based on human intent is significant for
+autonomous navigation and operations of Unmanned Surface Vehicles (USVs) in
+water environments. Inspired by visual grounding, we introduce WaterVG, the
+first visual grounding dataset designed for USV-based waterway perception based
+on human prompts. WaterVG encompasses prompts describing multiple targets, with
+instance-level annotations including bounding boxes and masks. Notably, WaterVG
+includes 11,568 samples with 34,987 referred targets, whose prompts integrate
+both visual and radar characteristics. This text-guided two-sensor paradigm
+pairs fine-grained text prompts with the visual and radar features of the
+referred targets. Moreover, we propose a low-power visual grounding model,
+Potamoi, a multi-task model with a well-designed Phased Heterogeneous Modality
+Fusion (PHMF) mode, including Adaptive Radar Weighting (ARW) and Multi-Head
+Slim Cross Attention (MHSCA). Specifically, ARW extracts the radar features
+required to fuse with vision for prompt alignment. MHSCA is an efficient fusion
+module with a remarkably small parameter count and FLOP budget, elegantly
+fusing the scenario context captured by the two sensors with linguistic
+features, and it performs strongly on visual grounding tasks. Comprehensive
+experiments and evaluations have been conducted on WaterVG, where our Potamoi
+achieves state-of-the-art performance compared with its counterparts.
+
+
+ comment: 10 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ RadarDistill: Boosting Radar-based Object Detection Performance via + Knowledge Distillation from LiDAR Features CVPR + + +
+ The inherent noisy and sparse characteristics of radar data pose challenges +in finding effective representations for 3D object detection. In this paper, we +propose RadarDistill, a novel knowledge distillation (KD) method, which can +improve the representation of radar data by leveraging LiDAR data. RadarDistill +successfully transfers desirable characteristics of LiDAR features into radar +features using three key components: Cross-Modality Alignment (CMA), +Activation-based Feature Distillation (AFD), and Proposal-based Feature +Distillation (PFD). CMA enhances the density of radar features by employing +multiple layers of dilation operations, effectively addressing the challenge of +inefficient knowledge transfer from LiDAR to radar. AFD selectively transfers +knowledge based on regions of the LiDAR features, with a specific focus on +areas where activation intensity exceeds a predefined threshold. PFD similarly +guides the radar network to selectively mimic features from the LiDAR network +within the object proposals. Our comparative analyses conducted on the nuScenes +datasets demonstrate that RadarDistill achieves state-of-the-art (SOTA) +performance for radar-only object detection task, recording 20.5% in mAP and +43.7% in NDS. Also, RadarDistill significantly improves the performance of the +camera-radar fusion model. + +
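+
+ The AFD component can be illustrated as a feature-imitation loss restricted to
+regions where the teacher (LiDAR) activation exceeds a threshold. The
+channel-mean activation measure and the threshold value in the sketch below are
+assumptions, not the paper's exact choices.
+
+```python
+# Sketch of activation-based feature distillation between BEV feature maps.
+import torch
+import torch.nn.functional as F
+
+def afd_loss(radar_feat, lidar_feat, thresh=0.1):
+    """radar_feat, lidar_feat: (B, C, H, W) BEV feature maps."""
+    activation = lidar_feat.abs().mean(dim=1, keepdim=True)            # (B, 1, H, W)
+    mask = (activation > thresh).float()                               # high-activation regions
+    diff = F.mse_loss(radar_feat, lidar_feat, reduction="none").mean(dim=1, keepdim=True)
+    return (mask * diff).sum() / mask.sum().clamp(min=1.0)
+
+radar = torch.randn(2, 64, 128, 128, requires_grad=True)
+lidar = torch.randn(2, 64, 128, 128)
+print(afd_loss(radar, lidar).item())
+```
+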
+
+ comment: Accepted to IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR) 2024, 10 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ 94% on CIFAR-10 in 3.29 Seconds on a Single GPU + + +
+ CIFAR-10 is among the most widely used datasets in machine learning, +facilitating thousands of research projects per year. To accelerate research +and reduce the cost of experiments, we introduce training methods for CIFAR-10 +which reach 94% accuracy in 3.29 seconds, 95% in 10.4 seconds, and 96% in 46.3 +seconds, when run on a single NVIDIA A100 GPU. As one factor contributing to +these training speeds, we propose a derandomized variant of horizontal flipping +augmentation, which we show improves over the standard method in every case +where flipping is beneficial over no flipping at all. Our code is released at +https://github.com/KellerJordan/cifar10-airbench. + +
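+
+ A derandomized flip can be sketched as a fixed half/half split of the dataset
+whose flipped half alternates every epoch, so each image is seen in both
+orientations equally often instead of by independent coin flips; the exact
+scheduling in the released airbench code may differ.
+
+```python
+# Sketch of derandomized horizontal flipping (illustration, not the airbench code).
+import numpy as np
+
+def flip_mask(num_images, epoch):
+    """Deterministic per-image flip decision for a given epoch."""
+    parity = np.arange(num_images) % 2           # fixed half/half split of the dataset
+    return (parity + epoch) % 2 == 0             # alternate which half is flipped each epoch
+
+def augment(batch, indices, epoch):
+    """batch: (N, C, H, W) images; indices: their dataset indices."""
+    mask = flip_mask(indices.max() + 1, epoch)[indices]
+    batch = batch.copy()
+    batch[mask] = batch[mask][..., ::-1]          # horizontal flip along width
+    return batch
+
+imgs = np.random.rand(4, 3, 32, 32).astype(np.float32)
+print(augment(imgs, np.array([0, 1, 2, 3]), epoch=0).shape)
+```
+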
+
+
+
+
+ + ♻ ☆ Transient Neural Radiance Fields for Lidar View Synthesis and 3D + Reconstruction NeurIPS 2023 + + +
+ Neural radiance fields (NeRFs) have become a ubiquitous tool for modeling +scene appearance and geometry from multiview imagery. Recent work has also +begun to explore how to use additional supervision from lidar or depth sensor +measurements in the NeRF framework. However, previous lidar-supervised NeRFs +focus on rendering conventional camera imagery and use lidar-derived point +cloud data as auxiliary supervision; thus, they fail to incorporate the +underlying image formation model of the lidar. Here, we propose a novel method +for rendering transient NeRFs that take as input the raw, time-resolved photon +count histograms measured by a single-photon lidar system, and we seek to +render such histograms from novel views. Different from conventional NeRFs, the +approach relies on a time-resolved version of the volume rendering equation to +render the lidar measurements and capture transient light transport phenomena +at picosecond timescales. We evaluate our method on a first-of-its-kind dataset +of simulated and captured transient multiview scans from a prototype +single-photon lidar. Overall, our work brings NeRFs to a new dimension of +imaging at transient timescales, newly enabling rendering of transient imagery +from novel views. Additionally, we show that our approach recovers improved +geometry and conventional appearance compared to point cloud-based supervision +when training on few input viewpoints. Transient NeRFs may be especially useful +for applications which seek to simulate raw lidar measurements for downstream +tasks in autonomous driving, robotics, and remote sensing. + +
+
+ comment: NeurIPS 2023, Project Page: https://anaghmalik.com/TransientNeRF/ +
+
+
+
+
+ + ♻ ☆ RF-ULM: Ultrasound Localization Microscopy Learned from Radio-Frequency + Wavefronts + + +
+ In Ultrasound Localization Microscopy (ULM), achieving high-resolution images +relies on the precise localization of contrast agent particles across a series +of beamformed frames. However, our study uncovers an enormous potential: The +process of delay-and-sum beamforming leads to an irreversible reduction of +Radio-Frequency (RF) channel data, while its implications for localization +remain largely unexplored. The rich contextual information embedded within RF +wavefronts, including their hyperbolic shape and phase, offers great promise +for guiding Deep Neural Networks (DNNs) in challenging localization scenarios. +To fully exploit this data, we propose to directly localize scatterers in RF +channel data. Our approach involves a custom super-resolution DNN using learned +feature channel shuffling, non-maximum suppression, and a semi-global +convolutional block for reliable and accurate wavefront localization. +Additionally, we introduce a geometric point transformation that facilitates +seamless mapping to the B-mode coordinate space. To understand the impact of +beamforming on ULM, we validate the effectiveness of our method by conducting +an extensive comparison with State-Of-The-Art (SOTA) techniques. We present the +inaugural in vivo results from a wavefront-localizing DNN, highlighting its +real-world practicality. Our findings show that RF-ULM bridges the domain shift +between synthetic and real datasets, offering a considerable advantage in terms +of precision and complexity. To enable the broader research community to +benefit from our findings, our code and the associated SOTA methods are made +available at https://github.com/hahnec/rf-ulm. + +
+
+
+
+
+ + ♻ ☆ How Can Large Language Models Enable Better Socially Assistive + Human-Robot Interaction: A Brief Survey AAAI + + +
+ Socially assistive robots (SARs) have shown great success in providing
+personalized cognitive-affective support for user populations with special
+needs such as older adults, children with autism spectrum disorder (ASD), and
+individuals with mental health challenges. The large body of work on SAR
+demonstrates its potential to provide at-home support that complements
+clinic-based interventions delivered by mental health professionals, making
+these interventions more effective and accessible. However, there are still
+several major technical challenges that hinder SAR-mediated interactions and
+interventions from reaching human-level social intelligence and efficacy. With
+the recent advances in large language models (LLMs), there is an increased
+potential for novel applications within the field of SAR that can significantly
+expand the current capabilities of SARs. However, incorporating LLMs introduces
+new risks and ethical concerns that have not yet been encountered, and these
+must be carefully addressed to safely deploy these more advanced systems. In
+this work, we aim to conduct a brief survey on the use of LLMs in SAR
+technologies, and discuss the potentials and risks of applying LLMs to the
+following three major technical challenges of SAR: 1) natural language dialog;
+2) multimodal understanding; 3) LLMs as robot policies.
+

+
+ comment: 2 pages, accepted to the Proceedings of the AAAI Symposium Series, + 2024 +
+
+
+
+
+ + ♻ ☆ K-band: Self-supervised MRI Reconstruction via Stochastic Gradient + Descent over K-space Subsets + + +
+ Although deep learning (DL) methods are powerful for solving inverse +problems, their reliance on high-quality training data is a major hurdle. This +is significant in high-dimensional (dynamic/volumetric) magnetic resonance +imaging (MRI), where acquisition of high-resolution fully sampled k-space data +is impractical. We introduce a novel mathematical framework, dubbed k-band, +that enables training DL models using only partial, limited-resolution k-space +data. Specifically, we introduce training with stochastic gradient descent +(SGD) over k-space subsets. In each training iteration, rather than using the +fully sampled k-space for computing gradients, we use only a small k-space +portion. This concept is compatible with different sampling strategies; here we +demonstrate the method for k-space "bands", which have limited resolution in +one dimension and can hence be acquired rapidly. We prove analytically that our +method stochastically approximates the gradients computed in a fully-supervised +setup, when two simple conditions are met: (i) the limited-resolution axis is +chosen randomly-uniformly for every new scan, hence k-space is fully covered +across the entire training set, and (ii) the loss function is weighed with a +mask, derived here analytically, which facilitates accurate reconstruction of +high-resolution details. Numerical experiments with raw MRI data indicate that +k-band outperforms two other methods trained on limited-resolution data and +performs comparably to state-of-the-art (SoTA) methods trained on +high-resolution data. k-band hence obtains SoTA performance, with the advantage +of training using only limited-resolution data. This work hence introduces a +practical, easy-to-implement, self-supervised training framework, which +involves fast acquisition and self-supervised reconstruction and offers +theoretical guarantees. + +
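+
+ One k-band training step can be sketched as computing the reconstruction loss
+only on a band of k-space whose orientation is drawn uniformly at random per
+scan, with a per-frequency weighting applied to the loss. The sketch below is a
+2D toy with a stand-in network, and the uniform weighting W is a placeholder
+for the analytically derived mask.
+
+```python
+# Sketch of a single SGD step over a k-space band (2D toy, not the paper's code).
+import torch
+import torch.nn as nn
+
+def band_mask(size, width, horizontal):
+    m = torch.zeros(size, size)
+    lo = size // 2 - width // 2
+    if horizontal:
+        m[lo:lo + width, :] = 1.0
+    else:
+        m[:, lo:lo + width] = 1.0
+    return m
+
+net = nn.Conv2d(1, 1, 3, padding=1)                     # stand-in reconstruction network
+opt = torch.optim.SGD(net.parameters(), lr=1e-2)
+
+image = torch.randn(1, 1, 64, 64)                       # stand-in for a zero-filled recon
+kspace_full = torch.fft.fft2(image)                     # "acquired" data (simulated here)
+mask = band_mask(64, width=16, horizontal=bool(torch.rand(()) < 0.5))  # random band axis
+W = torch.ones_like(mask)                               # placeholder for the derived weighting
+
+pred_k = torch.fft.fft2(net(image))
+loss = (W * mask * (pred_k - kspace_full).abs() ** 2).mean()   # loss only inside the band
+loss.backward()
+opt.step()
+print(float(loss))
+```
+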
+
+
+
+
+ + ♻ ☆ Cross-Silo Federated Learning Across Divergent Domains with Iterative + Parameter Alignment + + +
+ Learning from the collective knowledge of data dispersed across private +sources can provide neural networks with enhanced generalization capabilities. +Federated learning, a method for collaboratively training a machine learning +model across remote clients, achieves this by combining client models via the +orchestration of a central server. However, current approaches face two +critical limitations: i) they struggle to converge when client domains are +sufficiently different, and ii) current aggregation techniques produce an +identical global model for each client. In this work, we address these issues +by reformulating the typical federated learning setup: rather than learning a +single global model, we learn N models each optimized for a common objective. +To achieve this, we apply a weighted distance minimization to model parameters +shared in a peer-to-peer topology. The resulting framework, Iterative Parameter +Alignment, applies naturally to the cross-silo setting, and has the following +properties: (i) a unique solution for each participant, with the option to +globally converge each model in the federation, and (ii) an optional +early-stopping mechanism to elicit fairness among peers in collaborative +learning settings. These characteristics jointly provide a flexible new +framework for iteratively learning from peer models trained on disparate +datasets. We find that the technique achieves competitive results on a variety +of data partitions compared to state-of-the-art approaches. Further, we show +that the method is robust to divergent domains (i.e. disjoint classes across +peers) where existing approaches struggle. + +
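+
+ The weighted distance minimization can be sketched as each peer adding a
+weighted squared-distance penalty to the other peers' current parameters on top
+of its local task loss. The peer weights, penalty coefficient, and toy task
+below are assumptions for illustration.
+
+```python
+# Sketch of one peer-to-peer alignment round over three silos.
+import torch
+
+def alignment_penalty(own_params, peer_params_list, weights):
+    """own_params: list of tensors; peer_params_list: list of same-shaped snapshots."""
+    penalty = 0.0
+    for w, peer in zip(weights, peer_params_list):
+        penalty = penalty + w * sum(((p - q.detach()) ** 2).sum()
+                                    for p, q in zip(own_params, peer))
+    return penalty
+
+models = [torch.nn.Linear(8, 1) for _ in range(3)]            # three silos, three models
+opts = [torch.optim.SGD(m.parameters(), lr=0.1) for m in models]
+for i, (m, opt) in enumerate(zip(models, opts)):
+    x, y = torch.randn(16, 8), torch.randn(16, 1)             # each silo's private data
+    peers = [list(models[j].parameters()) for j in range(3) if j != i]
+    loss = torch.nn.functional.mse_loss(m(x), y) \
+         + 0.1 * alignment_penalty(list(m.parameters()), peers, weights=[1.0, 1.0])
+    opt.zero_grad(); loss.backward(); opt.step()
+print("one alignment round done")
+```
+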
+
+ comment: Published at IEEE Big Data 2023 +
+
+
+
+
+ + ♻ ☆ FairRAG: Fair Human Generation via Fair Retrieval Augmentation CVPR 2024 + + +
+ Existing text-to-image generative models reflect or even amplify societal +biases ingrained in their training data. This is especially concerning for +human image generation where models are biased against certain demographic +groups. Existing attempts to rectify this issue are hindered by the inherent +limitations of the pre-trained models and fail to substantially improve +demographic diversity. In this work, we introduce Fair Retrieval Augmented +Generation (FairRAG), a novel framework that conditions pre-trained generative +models on reference images retrieved from an external image database to improve +fairness in human generation. FairRAG enables conditioning through a +lightweight linear module that projects reference images into the textual +space. To enhance fairness, FairRAG applies simple-yet-effective debiasing +strategies, providing images from diverse demographic groups during the +generative process. Extensive experiments demonstrate that FairRAG outperforms +existing methods in terms of demographic diversity, image-text alignment, and +image fidelity while incurring minimal computational overhead during inference. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ FairCLIP: Harnessing Fairness in Vision-Language Learning CVPR 2024 + + +
+ Fairness is a critical concern in deep learning, especially in healthcare, +where these models influence diagnoses and treatment decisions. Although +fairness has been investigated in the vision-only domain, the fairness of +medical vision-language (VL) models remains unexplored due to the scarcity of +medical VL datasets for studying fairness. To bridge this research gap, we +introduce the first fair vision-language medical dataset Harvard-FairVLMed that +provides detailed demographic attributes, ground-truth labels, and clinical +notes to facilitate an in-depth examination of fairness within VL foundation +models. Using Harvard-FairVLMed, we conduct a comprehensive fairness analysis +of two widely-used VL models (CLIP and BLIP2), pre-trained on both natural and +medical domains, across four different protected attributes. Our results +highlight significant biases in all VL models, with Asian, Male, Non-Hispanic, +and Spanish being the preferred subgroups across the protected attributes of +race, gender, ethnicity, and language, respectively. In order to alleviate +these biases, we propose FairCLIP, an optimal-transport-based approach that +achieves a favorable trade-off between performance and fairness by reducing the +Sinkhorn distance between the overall sample distribution and the distributions +corresponding to each demographic group. As the first VL dataset of its kind, +Harvard-FairVLMed holds the potential to catalyze advancements in the +development of machine learning models that are both ethically aware and +clinically effective. Our dataset and code are available at +https://ophai.hms.harvard.edu/datasets/harvard-fairvlmed10k. + +
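+
+ The fairness term rests on an entropic-regularized Sinkhorn distance between
+the overall batch of image-text similarity scores and the scores of a
+demographic subgroup. The sketch below shows only that ingredient; the batch
+construction, regularization strength, and iteration count are assumptions, and
+the usual CLIP loss is omitted.
+
+```python
+# Sketch of a Sinkhorn distance between two empirical score distributions.
+import torch
+
+def sinkhorn_distance(a, b, eps=0.05, iters=200):
+    """a, b: 1-D tensors of samples, treated as uniform empirical distributions."""
+    C = (a[:, None] - b[None, :]) ** 2
+    C = C / C.max().clamp_min(1e-12)                  # normalize cost for numerical stability
+    K = torch.exp(-C / eps)
+    u = torch.full((a.numel(),), 1.0 / a.numel())
+    v = torch.full((b.numel(),), 1.0 / b.numel())
+    r, c = u.clone(), v.clone()
+    for _ in range(iters):                            # Sinkhorn fixed-point iterations
+        r = u / (K @ c).clamp_min(1e-12)
+        c = v / (K.t() @ r).clamp_min(1e-12)
+    plan = r[:, None] * K * c[None, :]
+    return (plan * C).sum()
+
+all_scores = torch.randn(256)                         # image-text similarities, full batch
+group_scores = all_scores[:64] + 0.3                  # similarities for one subgroup
+print(float(sinkhorn_distance(all_scores, group_scores)))
+```
+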
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ The devil is in the fine-grained details: Evaluating open-vocabulary + object detectors for fine-grained understanding CVPR2024 + + +
+ Recent advancements in large vision-language models enabled visual object +detection in open-vocabulary scenarios, where object classes are defined in +free-text formats during inference. In this paper, we aim to probe the +state-of-the-art methods for open-vocabulary object detection to determine to +what extent they understand fine-grained properties of objects and their parts. +To this end, we introduce an evaluation protocol based on dynamic vocabulary +generation to test whether models detect, discern, and assign the correct +fine-grained description to objects in the presence of hard-negative classes. +We contribute with a benchmark suite of increasing difficulty and probing +different properties like color, pattern, and material. We further enhance our +investigation by evaluating several state-of-the-art open-vocabulary object +detectors using the proposed protocol and find that most existing solutions, +which shine in standard open-vocabulary benchmarks, struggle to accurately +capture and distinguish finer object details. We conclude the paper by +highlighting the limitations of current methodologies and exploring promising +research directions to overcome the discovered drawbacks. Data and code are +available at https://lorebianchi98.github.io/FG-OVD/. + +
+
+ comment: Accepted as Highlight at CVPR2024 +
+
+
+
+
+ + ♻ ☆ On Pretraining Data Diversity for Self-Supervised Learning + + +
+ We explore the impact of training with more diverse datasets, characterized +by the number of unique samples, on the performance of self-supervised learning +(SSL) under a fixed computational budget. Our findings consistently demonstrate +that increasing pretraining data diversity enhances SSL performance, albeit +only when the distribution distance to the downstream data is minimal. Notably, +even with an exceptionally large pretraining data diversity achieved through +methods like web crawling or diffusion-generated data, among other ways, the +distribution shift remains a challenge. Our experiments are comprehensive with +seven SSL methods using large-scale datasets such as ImageNet and YFCC100M +amounting to over 200 GPU days. Code and trained models will be available at +https://github.com/hammoudhasan/DiversitySSL . + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Object Detectors in the Open Environment: Challenges, Solutions, and + Outlook + + +
+ With the emergence of foundation models, deep learning-based object detectors +have shown practical usability in closed set scenarios. However, for real-world +tasks, object detectors often operate in open environments, where crucial +factors (e.g., data distribution, objective) that influence model learning are +often changing. The dynamic and intricate nature of the open environment poses +novel and formidable challenges to object detectors. Unfortunately, current +research on object detectors in open environments lacks a comprehensive +analysis of their distinctive characteristics, challenges, and corresponding +solutions, which hinders their secure deployment in critical real-world +scenarios. This paper aims to bridge this gap by conducting a comprehensive +review and analysis of object detectors in open environments. We initially +identified limitations of key structural components within the existing +detection pipeline and propose the open environment object detector challenge +framework that includes four quadrants (i.e., out-of-domain, out-of-category, +robust learning, and incremental learning) based on the dimensions of the data +/ target changes. For each quadrant of challenges in the proposed framework, we +present a detailed description and systematic analysis of the overarching goals +and core difficulties, systematically review the corresponding solutions, and +benchmark their performance over multiple widely adopted datasets. In addition, +we engage in a discussion of open problems and potential avenues for future +research. This paper aims to provide a fresh, comprehensive, and systematic +understanding of the challenges and solutions associated with open-environment +object detectors, thus catalyzing the development of more solid applications in +real-world scenarios. A project related to this survey can be found at +https://github.com/LiangSiyuan21/OEOD_Survey. + +
+
+ comment: 37 pages, 17 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 142 + +
+
+
+ + ☆ Know Your Neighbors: Improving Single-View Reconstruction via Spatial + Vision-Language Reasoning CVPR 2024 + + +
+ Recovering the 3D scene geometry from a single view is a fundamental yet +ill-posed problem in computer vision. While classical depth estimation methods +infer only a 2.5D scene representation limited to the image plane, recent +approaches based on radiance fields reconstruct a full 3D representation. +However, these methods still struggle with occluded regions since inferring +geometry without visual observation requires (i) semantic knowledge of the +surroundings, and (ii) reasoning about spatial context. We propose KYN, a novel +method for single-view scene reconstruction that reasons about semantic and +spatial context to predict each point's density. We introduce a vision-language +modulation module to enrich point features with fine-grained semantic +information. We aggregate point representations across the scene through a +language-guided spatial attention mechanism to yield per-point density +predictions aware of the 3D semantic context. We show that KYN improves 3D +shape recovery compared to predicting density for each 3D point in isolation. +We achieve state-of-the-art results in scene and object reconstruction on +KITTI-360, and show improved zero-shot generalization compared to prior work. +Project page: https://ruili3.github.io/kyn. + +
+
+ comment: CVPR 2024. Project page: https://ruili3.github.io/kyn +
+
+
+
+
+ + ☆ OW-VISCap: Open-World Video Instance Segmentation and Captioning SC + + +
+ Open-world video instance segmentation is an important video understanding +task. Yet most methods either operate in a closed-world setting, require an +additional user-input, or use classic region-based proposals to identify never +before seen objects. Further, these methods only assign a one-word label to +detected objects, and don't generate rich object-centric descriptions. They +also often suffer from highly overlapping predictions. To address these issues, +we propose Open-World Video Instance Segmentation and Captioning (OW-VISCap), +an approach to jointly segment, track, and caption previously seen or unseen +objects in a video. For this, we introduce open-world object queries to +discover never before seen objects without additional user-input. We generate +rich and descriptive object-centric captions for each detected object via a +masked attention augmented LLM input. We introduce an inter-query contrastive +loss to ensure that the object queries differ from one another. Our generalized +approach matches or surpasses state-of-the-art on three tasks: open-world video +instance segmentation on the BURST dataset, dense video object captioning on +the VidSTG dataset, and closed-world video instance segmentation on the OVIS +dataset. + +
+
+ comment: Project page: https://anwesachoudhuri.github.io/OpenWorldVISCap/ +
+
+
+
+
+ + ☆ MVD-Fusion: Single-view 3D via Depth-consistent Multi-view Generation + + +
+ We present MVD-Fusion: a method for single-view 3D inference via generative
+modeling of multi-view-consistent RGB-D images. While recent methods pursuing
+3D inference advocate learning novel-view generative models, these generations
+are not 3D-consistent and require a distillation process to generate a 3D
+output. We instead cast the task of 3D inference as directly generating
+mutually-consistent multiple views and build on the insight that additionally
+inferring depth can provide a mechanism for enforcing this consistency.
+Specifically, we train a denoising diffusion model to generate multi-view RGB-D
+images given a single RGB input image and leverage the (intermediate noisy)
+depth estimates to obtain reprojection-based conditioning to maintain
+multi-view consistency. We train our model using the large-scale synthetic
+dataset Objaverse as well as the real-world CO3D dataset comprising generic
+camera viewpoints. We demonstrate that our approach can yield more accurate
+synthesis compared to recent state-of-the-art methods, including
+distillation-based 3D inference and prior multi-view generation methods. We
+also evaluate the geometry induced by our multi-view depth prediction and find
+that it yields a more accurate representation than other direct 3D inference
+approaches.
+
+
+ comment: Project page: https://mvd-fusion.github.io/ +
+
+
+
+
+ + ☆ RaFE: Generative Radiance Fields Restoration + + +
+ NeRF (Neural Radiance Fields) has demonstrated tremendous potential in novel +view synthesis and 3D reconstruction, but its performance is sensitive to input +image quality, which struggles to achieve high-fidelity rendering when provided +with low-quality sparse input viewpoints. Previous methods for NeRF restoration +are tailored for specific degradation type, ignoring the generality of +restoration. To overcome this limitation, we propose a generic radiance fields +restoration pipeline, named RaFE, which applies to various types of +degradations, such as low resolution, blurriness, noise, compression artifacts, +or their combinations. Our approach leverages the success of off-the-shelf 2D +restoration methods to recover the multi-view images individually. Instead of +reconstructing a blurred NeRF by averaging inconsistencies, we introduce a +novel approach using Generative Adversarial Networks (GANs) for NeRF generation +to better accommodate the geometric and appearance inconsistencies present in +the multi-view images. Specifically, we adopt a two-level tri-plane +architecture, where the coarse level remains fixed to represent the low-quality +NeRF, and a fine-level residual tri-plane to be added to the coarse level is +modeled as a distribution with GAN to capture potential variations in +restoration. We validate RaFE on both synthetic and real cases for various +restoration tasks, demonstrating superior performance in both quantitative and +qualitative evaluations, surpassing other 3D restoration methods specific to +single task. Please see our project website +https://zkaiwu.github.io/RaFE-Project/. + +
+
+ comment: Project Page: https://zkaiwu.github.io/RaFE-Project/ +
+
+
+
+
+ + ☆ CoMat: Aligning Text-to-Image Diffusion Model with Image-to-Text Concept + Matching + + +
+ Diffusion models have demonstrated great success in the field of
+text-to-image generation. However, alleviating the misalignment between the
+text prompts and images is still challenging. The root cause of the
+misalignment has not been extensively investigated. We observe that the
+misalignment is caused by inadequate token attention activation. We further
+attribute this phenomenon to the diffusion model's insufficient condition
+utilization, which is caused by its training paradigm. To address the issue, we
+propose CoMat, an end-to-end diffusion model fine-tuning strategy with an
+image-to-text concept matching mechanism. We leverage an image captioning model
+to measure image-to-text alignment and guide the diffusion model to revisit
+ignored tokens. A novel attribute concentration module is also proposed to
+address the attribute binding problem. Without any image or human preference
+data, we use only 20K text prompts to fine-tune SDXL to obtain CoMat-SDXL.
+Extensive experiments show that CoMat-SDXL significantly outperforms the
+baseline model SDXL in two text-to-image alignment benchmarks and achieves
+state-of-the-art performance.
+
+
+ comment: Project Page: https://caraj7.github.io/comat +
+
+
+
+
+ + ☆ The More You See in 2D, the More You Perceive in 3D + + +
+ Humans can infer 3D structure from 2D images of an object based on past
+experience and improve their 3D understanding as they see more images. Inspired
+by this behavior, we introduce SAP3D, a system for 3D reconstruction and novel
+view synthesis from an arbitrary number of unposed images. Given a few unposed
+images of an object, we adapt a pre-trained view-conditioned diffusion model
+together with the camera poses of the images via test-time fine-tuning. The
+adapted diffusion model and the obtained camera poses are then utilized as
+instance-specific priors for 3D reconstruction and novel view synthesis. We
+show that as the number of input images increases, the performance of our
+approach improves, bridging the gap between optimization-based prior-less 3D
+reconstruction methods and single-image-to-3D diffusion-based methods. We
+demonstrate our system on real images as well as standard synthetic benchmarks.
+Our ablation studies confirm that this adaptation behavior is key to more
+accurate 3D understanding.
+
+
+ comment: Project page: https://sap3d.github.io/ +
+
+
+
+
+ + ☆ OpenNeRF: Open Set 3D Neural Scene Segmentation with Pixel-Wise Features + and Rendered Novel Views ICLR 2024 + + +
+ Large visual-language models (VLMs), like CLIP, enable open-set image +segmentation to segment arbitrary concepts from an image in a zero-shot manner. +This goes beyond the traditional closed-set assumption, i.e., where models can +only segment classes from a pre-defined training set. More recently, first +works on open-set segmentation in 3D scenes have appeared in the literature. +These methods are heavily influenced by closed-set 3D convolutional approaches +that process point clouds or polygon meshes. However, these 3D scene +representations do not align well with the image-based nature of the +visual-language models. Indeed, point cloud and 3D meshes typically have a +lower resolution than images and the reconstructed 3D scene geometry might not +project well to the underlying 2D image sequences used to compute pixel-aligned +CLIP features. To address these challenges, we propose OpenNeRF which naturally +operates on posed images and directly encodes the VLM features within the NeRF. +This is similar in spirit to LERF, however our work shows that using pixel-wise +VLM features (instead of global CLIP features) results in an overall less +complex architecture without the need for additional DINO regularization. Our +OpenNeRF further leverages NeRF's ability to render novel views and extract +open-set VLM features from areas that are not well observed in the initial +posed images. For 3D point cloud segmentation on the Replica dataset, OpenNeRF +outperforms recent open-vocabulary methods such as LERF and OpenScene by at +least +4.9 mIoU. + +
+
+ comment: ICLR 2024, Project page: https://opennerf.github.io +
+
+
+
+
+ + ☆ Decoupling Static and Hierarchical Motion Perception for Referring Video + Segmentation CVPR 2024 + + +
+ Referring video segmentation relies on natural language expressions to +identify and segment objects, often emphasizing motion clues. Previous works +treat a sentence as a whole and directly perform identification at the +video-level, mixing up static image-level cues with temporal motion cues. +However, image-level features cannot well comprehend motion cues in sentences, +and static cues are not crucial for temporal perception. In fact, static cues +can sometimes interfere with temporal perception by overshadowing motion cues. +In this work, we propose to decouple video-level referring expression +understanding into static and motion perception, with a specific emphasis on +enhancing temporal comprehension. Firstly, we introduce an +expression-decoupling module to make static cues and motion cues perform their +distinct role, alleviating the issue of sentence embeddings overlooking motion +cues. Secondly, we propose a hierarchical motion perception module to capture +temporal information effectively across varying timescales. Furthermore, we +employ contrastive learning to distinguish the motions of visually similar +objects. These contributions yield state-of-the-art performance across five +datasets, including a remarkable $\textbf{9.2%}$ $\mathcal{J\&F}$ improvement +on the challenging $\textbf{MeViS}$ dataset. Code is available at +https://github.com/heshuting555/DsHmp. + +
+
+ comment: CVPR 2024, code: https://github.com/heshuting555/DsHmp +
+
+
+
+
+ + ☆ DiffBody: Human Body Restoration by Imagining with Generative Diffusion + Prior + + +
+ Human body restoration plays a vital role in various applications related to +the human body. Despite recent advances in general image restoration using +generative models, their performance in human body restoration remains +mediocre, often resulting in foreground and background blending, over-smoothing +surface textures, missing accessories, and distorted limbs. Addressing these +challenges, we propose a novel approach by constructing a human body-aware +diffusion model that leverages domain-specific knowledge to enhance +performance. Specifically, we employ a pretrained body attention module to +guide the diffusion model's focus on the foreground, addressing issues caused +by blending between the subject and background. We also demonstrate the value +of revisiting the language modality of the diffusion model in restoration tasks +by seamlessly incorporating text prompts to improve the quality of surface +texture and additional clothing and accessories details. Additionally, we +introduce a diffusion sampler tailored for fine-grained human body parts, +utilizing local semantic information to rectify limb distortions. Lastly, we +collect a comprehensive dataset for benchmarking and advancing the field of +human body restoration. Extensive experimental validation showcases the +superiority of our approach, both quantitatively and qualitatively, over +existing methods. + +
+
+
+
+
+ + ☆ WorDepth: Variational Language Prior for Monocular Depth Estimation + + +
+ Three-dimensional (3D) reconstruction from a single image is an ill-posed +problem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text +description(s) is similarly ill-posed, i.e. spatial arrangements of objects +described. We investigate the question of whether two inherently ambiguous +modalities can be used in conjunction to produce metric-scaled reconstructions. +To test this, we focus on monocular depth estimation, the problem of predicting +a dense depth map from a single image, but with an additional text caption +describing the scene. To this end, we begin by encoding the text caption as a +mean and standard deviation; using a variational framework, we learn the +distribution of the plausible metric reconstructions of 3D scenes corresponding +to the text captions as a prior. To "select" a specific reconstruction or depth +map, we encode the given image through a conditional sampler that samples from +the latent space of the variational text encoder, which is then decoded to the +output depth map. Our approach is trained alternatingly between the text and +image branches: in one optimization step, we predict the mean and standard +deviation from the text description and sample from a standard Gaussian, and in +the other, we sample using a (image) conditional sampler. Once trained, we +directly predict depth from the encoded text using the conditional sampler. We +demonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where +we show that language can consistently improve performance in both. + +
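To make the alternating text/image training concrete, here is a minimal sketch of the variational text prior and the image-conditional sampler, assuming pre-computed text features `t` and image features `v`; the module names, plain linear layers, and dimensions are illustrative assumptions, not the authors' implementation.

```python
# Hypothetical sketch of the two branches; names and sizes are assumptions.
import torch
import torch.nn as nn

class TextPrior(nn.Module):
    """Encodes a caption embedding into the mean/std of a latent distribution."""
    def __init__(self, d_text=512, d_latent=128):
        super().__init__()
        self.mu = nn.Linear(d_text, d_latent)
        self.log_sigma = nn.Linear(d_text, d_latent)

    def forward(self, t):
        return self.mu(t), self.log_sigma(t).exp()

class ConditionalSampler(nn.Module):
    """Maps image features to a single point in the text prior's latent space."""
    def __init__(self, d_img=512, d_latent=128):
        super().__init__()
        self.net = nn.Linear(d_img, d_latent)

    def forward(self, v):
        return self.net(v)

# Training alternates between the two branches:
#   text step : z = mu + sigma * torch.randn_like(sigma)   # sample the prior
#   image step: z = sampler(v)                              # "select" one reconstruction
# In both steps z is decoded to a depth map and supervised with ground truth;
# at inference only the image branch is used.
```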
+
+
+
+
+ + ☆ PreAfford: Universal Affordance-Based Pre-Grasping for Diverse Objects + and Environments + + +
+ Robotic manipulation of ungraspable objects with two-finger grippers presents +significant challenges due to the paucity of graspable features, while +traditional pre-grasping techniques, which rely on repositioning objects and +leveraging external aids like table edges, lack the adaptability across object +categories and scenes. Addressing this, we introduce PreAfford, a novel +pre-grasping planning framework that utilizes a point-level affordance +representation and a relay training approach to enhance adaptability across a +broad range of environments and object types, including those previously +unseen. Demonstrated on the ShapeNet-v2 dataset, PreAfford significantly +improves grasping success rates by 69% and validates its practicality through +real-world experiments. This work offers a robust and adaptable solution for +manipulating ungraspable objects. + +
+
+ comment: Project Page: https://air-discover.github.io/PreAfford/ +
+
+
+
+
+ + ☆ Reference-Based 3D-Aware Image Editing with Triplane + + +
+ Generative Adversarial Networks (GANs) have emerged as powerful tools not +only for high-quality image generation but also for real image editing through +manipulation of their interpretable latent spaces. Recent advancements in GANs +include the development of 3D-aware models such as EG3D, characterized by +efficient triplane-based architectures enabling the reconstruction of 3D +geometry from single images. However, scant attention has been devoted to +providing an integrated framework for high-quality reference-based 3D-aware +image editing within this domain. This study addresses this gap by exploring +and demonstrating the effectiveness of EG3D's triplane space for achieving +advanced reference-based edits, presenting a unique perspective on 3D-aware +image editing through our novel pipeline. Our approach integrates the encoding +of triplane features, spatial disentanglement and automatic localization of +features in the triplane domain, and fusion learning for desired image editing. +Moreover, our framework demonstrates versatility across domains, extending its +effectiveness to animal face edits and partial stylization of cartoon +portraits. The method shows significant improvements over relevant 3D-aware +latent editing and 2D reference-based editing methods, both qualitatively and +quantitatively. Project page: https://three-bee.github.io/triplane_edit + +
+
+
+
+
+ + ☆ Robust Concept Erasure Using Task Vectors + + +
+ With the rapid growth of text-to-image models, a variety of techniques have +been suggested to prevent undesirable image generations. Yet, these methods +often only protect against specific user prompts and have been shown to allow +unsafe generations with other inputs. Here we focus on unconditionally erasing +a concept from a text-to-image model rather than conditioning the erasure on +the user's prompt. We first show that compared to input-dependent erasure +methods, concept erasure that uses Task Vectors (TV) is more robust to +unexpected user inputs, not seen during training. However, TV-based erasure can +also affect the core performance of the edited model, particularly when the +required edit strength is unknown. To this end, we propose a method called +Diverse Inversion, which we use to estimate the required strength of the TV +edit. Diverse Inversion finds within the model input space a large set of word +embeddings, each of which induces the generation of the target concept. We find +that encouraging diversity in the set makes our estimation more robust to +unexpected prompts. Finally, we show that Diverse Inversion enables us to apply +a TV edit only to a subset of the model weights, enhancing the erasure +capabilities while better maintaining the core functionality of the model. + +
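For readers unfamiliar with Task Vectors, the basic weight-space edit works roughly as below; `base_model`, `concept_model`, and `alpha` are hypothetical placeholders, and the paper's actual contribution (Diverse Inversion for choosing the edit strength and the weight subset) is not reproduced here.

```python
# Minimal sketch of a task-vector (TV) concept edit, under the assumption that
# concept_model is a copy of base_model fine-tuned to produce the target concept.
import torch

def apply_task_vector_edit(base_model, concept_model, alpha=1.0):
    """Move the base weights away from the concept direction: w <- w - alpha * (w_concept - w)."""
    edited = {}
    base_sd = base_model.state_dict()
    concept_sd = concept_model.state_dict()
    for name, w_base in base_sd.items():
        if not torch.is_floating_point(w_base):
            edited[name] = w_base                     # leave integer buffers untouched
            continue
        task_vector = concept_sd[name] - w_base       # direction that adds the concept
        edited[name] = w_base - alpha * task_vector   # unconditional erasure
    base_model.load_state_dict(edited)
    return base_model
```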
+
+
+
+
+ + ☆ LCM-Lookahead for Encoder-based Text-to-Image Personalization + + +
+ Recent advancements in diffusion models have introduced fast sampling methods +that can effectively produce high-quality images in just one or a few denoising +steps. Interestingly, when these are distilled from existing diffusion models, +they often maintain alignment with the original model, retaining similar +outputs for similar prompts and seeds. These properties present opportunities +to leverage fast sampling methods as a shortcut-mechanism, using them to create +a preview of denoised outputs through which we can backpropagate image-space +losses. In this work, we explore the potential of using such +shortcut-mechanisms to guide the personalization of text-to-image models to +specific facial identities. We focus on encoder-based personalization +approaches, and demonstrate that by tuning them with a lookahead identity loss, +we can achieve higher identity fidelity, without sacrificing layout diversity +or prompt alignment. We further explore the use of attention sharing mechanisms +and consistent data generation for the task of personalization, and find that +encoder training can benefit from both. + +
+
+ comment: Project page at https://lcm-lookahead.github.io/ +
+
+
+
+
+ + ☆ DeViDe: Faceted medical knowledge for improved medical vision-language + pre-training + + +
+ Vision-language pre-training for chest X-rays has made significant strides, +primarily by utilizing paired radiographs and radiology reports. However, +existing approaches often face challenges in encoding medical knowledge +effectively. While radiology reports provide insights into the current disease +manifestation, medical definitions (as used by contemporary methods) tend to be +overly abstract, creating a gap in knowledge. To address this, we propose +DeViDe, a novel transformer-based method that leverages radiographic +descriptions from the open web. These descriptions outline general visual +characteristics of diseases in radiographs, and when combined with abstract +definitions and radiology reports, provide a holistic snapshot of knowledge. +DeViDe incorporates three key features for knowledge-augmented vision language +alignment: First, a large-language model-based augmentation is employed to +homogenise medical knowledge from diverse sources. Second, this knowledge is +aligned with image information at various levels of granularity. Third, a novel +projection layer is proposed to handle the complexity of aligning each image +with multiple descriptions arising in a multi-label setting. In zero-shot +settings, DeViDe performs comparably to fully supervised models on external +datasets and achieves state-of-the-art results on three large-scale datasets. +Additionally, fine-tuning DeViDe on four downstream tasks and six segmentation +tasks showcases its superior performance across data from diverse +distributions. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2208.04060 by other authors +
+
+
+
+
+ + ☆ On the Efficiency of Convolutional Neural Networks + + +
+ Since the breakthrough performance of AlexNet in 2012, convolutional neural +networks (convnets) have grown into extremely powerful vision models. Deep +learning researchers have used convnets to produce accurate results that were +unachievable a decade ago. Yet computer scientists make computational +efficiency their primary objective. Accuracy with exorbitant cost is not +acceptable; an algorithm must also minimize its computational requirements. +Confronted with the daunting computation that convnets use, deep learning +researchers also became interested in efficiency. Researchers applied +tremendous effort to find the convnet architectures that have the greatest +efficiency. However, skepticism grew among researchers and engineers alike +about the relevance of arithmetic complexity. Contrary to the prevailing view +that latency and arithmetic complexity are irreconcilable, a simple formula +relates both through computational efficiency. This insight enabled us to +co-optimize the separate factors that determine latency. We observed that the +degenerate conv2d layers that produce the best accuracy-complexity trade-off +also have low operational intensity. Therefore, kernels that implement these +layers use significant memory resources. We solved this optimization problem +with block-fusion kernels that implement all layers of a residual block, +thereby creating temporal locality, avoiding communication, and reducing +workspace size. Our ConvFirst model with block-fusion kernels ran approximately +four times as fast as the ConvNeXt baseline with PyTorch Inductor, at equal +accuracy on the ImageNet-1K classification task. Our unified approach to +convnet efficiency envisions a new era of models and kernels that achieve +greater accuracy at lower cost. + +
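The claimed relation between latency, arithmetic complexity, and computational efficiency can be illustrated with a standard roofline-style estimate; the numbers below are made-up assumptions, not measurements from the paper.

```python
# Back-of-the-envelope latency model: latency = FLOPs / achieved throughput,
# where achieved throughput is capped by operational intensity (FLOPs per byte).
def roofline_latency(flops, bytes_moved, peak_flops, peak_bandwidth):
    operational_intensity = flops / bytes_moved
    achievable_flops_per_s = min(peak_flops, operational_intensity * peak_bandwidth)
    return flops / achievable_flops_per_s   # seconds

# Illustrative layer: low operational intensity makes it bandwidth-bound,
# so latency is dominated by memory traffic rather than arithmetic.
latency = roofline_latency(flops=2e9, bytes_moved=4e8,
                           peak_flops=100e12, peak_bandwidth=1e12)
print(f"estimated latency: {latency * 1e3:.2f} ms")
```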
+
+
+
+
+ + ☆ Per-Gaussian Embedding-Based Deformation for Deformable 3D Gaussian + Splatting + + +
+ As 3D Gaussian Splatting (3DGS) provides fast and high-quality novel view +synthesis, it is a natural extension to deform a canonical 3DGS to multiple +frames. However, previous works fail to accurately reconstruct dynamic scenes, +especially 1) static parts moving along nearby dynamic parts, and 2) some +dynamic areas are blurry. We attribute the failure to the wrong design of the +deformation field, which is built as a coordinate-based function. This approach +is problematic because 3DGS is a mixture of multiple fields centered at the +Gaussians, not just a single coordinate-based framework. To resolve this +problem, we define the deformation as a function of per-Gaussian embeddings and +temporal embeddings. Moreover, we decompose deformations as coarse and fine +deformations to model slow and fast movements, respectively. Also, we introduce +an efficient training strategy for faster convergence and higher quality. +Project page: https://jeongminb.github.io/e-d3dgs/ + +
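A rough sketch of what a deformation field conditioned on per-Gaussian and temporal embeddings (rather than on coordinates) might look like; the layer sizes, output parameterization, and embedding shapes are illustrative guesses rather than the authors' architecture.

```python
# Hypothetical per-Gaussian deformation head.
import torch
import torch.nn as nn

class PerGaussianDeformation(nn.Module):
    def __init__(self, n_gaussians, d_embed=32, d_time=16):
        super().__init__()
        self.gaussian_embed = nn.Embedding(n_gaussians, d_embed)  # one embedding per Gaussian
        self.mlp = nn.Sequential(
            nn.Linear(d_embed + d_time, 128), nn.ReLU(),
            nn.Linear(128, 3 + 4 + 3),   # deltas for position (3), rotation (4), scale (3)
        )

    def forward(self, gaussian_ids, time_embedding):
        # gaussian_ids: (N,) indices; time_embedding: (1, d_time) for the current frame.
        e = self.gaussian_embed(gaussian_ids)            # (N, d_embed)
        t = time_embedding.expand(e.shape[0], -1)        # broadcast time code to all Gaussians
        return self.mlp(torch.cat([e, t], dim=-1))       # (N, 10) per-Gaussian deformation
```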
+
+ comment: Preprint +
+
+
+
+
+ + ☆ InsectMamba: Insect Pest Classification with State Space Model + + +
+ The classification of insect pests is a critical task in agricultural +technology, vital for ensuring food security and environmental sustainability. +However, the complexity of pest identification, due to factors like high +camouflage and species diversity, poses significant obstacles. Existing methods +struggle with the fine-grained feature extraction needed to distinguish between +closely related pest species. Although recent advancements have utilized +modified network structures and combined deep learning approaches to improve +accuracy, challenges persist due to the similarity between pests and their +surroundings. To address this problem, we introduce InsectMamba, a novel +approach that integrates State Space Models (SSMs), Convolutional Neural +Networks (CNNs), Multi-Head Self-Attention mechanism (MSA), and Multilayer +Perceptrons (MLPs) within Mix-SSM blocks. This integration facilitates the +extraction of comprehensive visual features by leveraging the strengths of each +encoding strategy. A selective module is also proposed to adaptively aggregate +these features, enhancing the model's ability to discern pest characteristics. +InsectMamba was evaluated against strong competitors across five insect pest +classification datasets. The results demonstrate its superior performance and +verify the significance of each model component by an ablation study. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ SemGrasp: Semantic Grasp Generation via Language Aligned Discretization + + +
+ Generating natural human grasps necessitates consideration of not just object +geometry but also semantic information. Solely depending on object shape for +grasp generation confines the applications of prior methods in downstream +tasks. This paper presents a novel semantic-based grasp generation method, +termed SemGrasp, which generates a static human grasp pose by incorporating +semantic information into the grasp representation. We introduce a discrete +representation that aligns the grasp space with semantic space, enabling the +generation of grasp postures in accordance with language instructions. A +Multimodal Large Language Model (MLLM) is subsequently fine-tuned, integrating +object, grasp, and language within a unified semantic space. To facilitate the +training of SemGrasp, we have compiled a large-scale, grasp-text-aligned +dataset named CapGrasp, featuring about 260k detailed captions and 50k diverse +grasps. Experimental findings demonstrate that SemGrasp efficiently generates +natural human grasps in alignment with linguistic intentions. Our code, models, +and dataset are available publicly at: https://kailinli.github.io/SemGrasp. + +
+
+
+
+
+ + ☆ Towards more realistic human motion prediction with attention to motion + coordination + + +
+ Joint relation modeling is a crucial component in human motion prediction. +Most existing methods rely on skeletal-based graphs to build the joint +relations, where local interactive relations between joint pairs are well +learned. However, the motion coordination, a global joint relation reflecting +the simultaneous cooperation of all joints, is usually weakened because it is +learned from part to whole progressively and asynchronously. Thus, the final +predicted motions usually appear unrealistic. To tackle this issue, we learn a +medium, called coordination attractor (CA), from the spatiotemporal features of +motion to characterize the global motion features, which is subsequently used +to build new relative joint relations. Through the CA, all joints are related +simultaneously, and thus the motion coordination of all joints can be better +learned. Based on this, we further propose a novel joint relation modeling +module, Comprehensive Joint Relation Extractor (CJRE), to combine this motion +coordination with the local interactions between joint pairs in a unified +manner. Additionally, we also present a Multi-timescale Dynamics Extractor +(MTDE) to extract enriched dynamics from the raw position information for +effective prediction. Extensive experiments show that the proposed framework +outperforms state-of-the-art methods in both short- and long-term predictions +on H3.6M, CMU-Mocap, and 3DPW. + +
+
+ comment: Accepted by TCSVT +
+
+
+
+
+ + ☆ DreamScene: 3D Gaussian-based Text-to-3D Scene Generation via Formation + Pattern Sampling + + +
+ Text-to-3D scene generation holds immense potential for the gaming, film, and +architecture sectors. Despite significant progress, existing methods struggle +with maintaining high quality, consistency, and editing flexibility. In this +paper, we propose DreamScene, a 3D Gaussian-based novel text-to-3D scene +generation framework, to tackle the aforementioned three challenges mainly via +two strategies. First, DreamScene employs Formation Pattern Sampling (FPS), a +multi-timestep sampling strategy guided by the formation patterns of 3D +objects, to form fast, semantically rich, and high-quality representations. FPS +uses 3D Gaussian filtering for optimization stability, and leverages +reconstruction techniques to generate plausible textures. Second, DreamScene +employs a progressive three-stage camera sampling strategy, specifically +designed for both indoor and outdoor settings, to effectively ensure +object-environment integration and scene-wide 3D consistency. Last, DreamScene +enhances scene editing flexibility by integrating objects and environments, +enabling targeted adjustments. Extensive experiments validate DreamScene's +superiority over current state-of-the-art techniques, heralding its +wide-ranging potential for diverse applications. Code and demos will be +released at https://dreamscene-project.github.io . + +
+
+
+
+
+ + ☆ TinyVQA: Compact Multimodal Deep Neural Network for Visual Question + Answering on Resource-Constrained Devices + + +
+ Traditional machine learning models often require powerful hardware, making +them unsuitable for deployment on resource-limited devices. Tiny Machine +Learning (tinyML) has emerged as a promising approach for running machine +learning models on these devices, but integrating multiple data modalities into +tinyML models still remains a challenge due to increased complexity, latency, +and power consumption. This paper proposes TinyVQA, a novel multimodal deep +neural network for visual question answering tasks that can be deployed on +resource-constrained tinyML hardware. TinyVQA leverages a supervised +attention-based model to learn how to answer questions about images using both +vision and language modalities. Distilled knowledge from the supervised +attention-based VQA model trains the memory-aware compact TinyVQA model, and a low +bit-width quantization technique is employed to further compress the model for +deployment on tinyML devices. The TinyVQA model was evaluated on the FloodNet +dataset, which is used for post-disaster damage assessment. The compact model +achieved an accuracy of 79.5%, demonstrating the effectiveness of TinyVQA for +real-world applications. Additionally, the model was deployed on a Crazyflie +2.0 drone, equipped with an AI deck and GAP8 microprocessor. The TinyVQA model +achieved a low latency of 56 ms and consumed 693 mW of power while deployed on the +tiny drone, showcasing its suitability for resource-constrained embedded +systems. + +
+
+ comment: Accepted as a full paper by the tinyML Research Symposium 2024 +
+
+
+
+
+ + ☆ Terrain Point Cloud Inpainting via Signal Decomposition + + +
+ The rapid development of 3D acquisition technology has made it possible to +obtain point clouds of real-world terrains. However, due to limitations in +sensor acquisition technology or specific requirements, point clouds often +contain defects such as holes with missing data. Inpainting algorithms are +widely used to patch these holes. However, existing traditional inpainting +algorithms rely on precise hole boundaries, which limits their ability to +handle cases where the boundaries are not well-defined. On the other hand, +learning-based completion methods often prioritize reconstructing the entire +point cloud instead of solely focusing on hole filling. Based on the fact that +real-world terrain exhibits both global smoothness and rich local detail, we +propose a novel representation for terrain point clouds. This representation +can help to repair the holes without clear boundaries. Specifically, it +decomposes terrains into low-frequency and high-frequency components, which are +represented by B-spline surfaces and relative height maps respectively. In this +way, the terrain point cloud inpainting problem is transformed into a B-spline +surface fitting and 2D image inpainting problem. By solving the two problems, +the highly complex and irregular holes on the terrain point clouds can be +well-filled, which not only satisfies the global terrain undulation but also +exhibits rich geometric details. The experimental results also demonstrate the +effectiveness of our method. + +
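The low-/high-frequency split can be pictured with a small sketch on a gridded height map; the smoothing B-spline here is only a stand-in for the paper's B-spline surface fitting, and the array shapes and smoothing factor are assumptions.

```python
# Conceptual decomposition of a terrain height grid into a smooth B-spline surface
# (global undulation) and a residual relative height map (local detail).
import numpy as np
from scipy.interpolate import RectBivariateSpline

x = np.linspace(0.0, 1.0, 128)
y = np.linspace(0.0, 1.0, 128)
Z = np.random.rand(128, 128)                      # placeholder terrain heights

low_freq = RectBivariateSpline(x, y, Z, s=50.0)   # smoothing B-spline surface
Z_low = low_freq(x, y)                            # low-frequency component
Z_high = Z - Z_low                                # high-frequency relative height map

# Holes would then be handled per component: evaluate the fitted surface inside
# the hole for the low-frequency part, and run 2D image inpainting on Z_high.
```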
+
+
+
+
+ + ☆ PointInfinity: Resolution-Invariant Point Diffusion Models CVPR 2024 + + +
+ We present PointInfinity, an efficient family of point cloud diffusion +models. Our core idea is to use a transformer-based architecture with a +fixed-size, resolution-invariant latent representation. This enables efficient +training with low-resolution point clouds, while allowing high-resolution point +clouds to be generated during inference. More importantly, we show that scaling +the test-time resolution beyond the training resolution improves the fidelity +of generated point clouds and surfaces. We analyze this phenomenon and draw a +link to classifier-free guidance commonly used in diffusion models, +demonstrating that both allow trading off fidelity and variability during +inference. Experiments on CO3D show that PointInfinity can efficiently generate +high-resolution point clouds (up to 131k points, 31 times more than Point-E) +with state-of-the-art quality. + +
+
+ comment: Accepted to CVPR 2024, project website at + https://zixuanh.com/projects/pointinfinity +
+
+
+
+
+ + ☆ Segmentation-Guided Knee Radiograph Generation using Conditional + Diffusion Models + + +
+ Deep learning-based medical image processing algorithms require +representative data during development. In particular, surgical data might be +difficult to obtain, and high-quality public datasets are limited. To overcome +this limitation and augment datasets, a widely adopted solution is the +generation of synthetic images. In this work, we employ conditional diffusion +models to generate knee radiographs from contour and bone segmentations. +Remarkably, two distinct strategies are presented by incorporating the +segmentation as a condition into the sampling and training process, namely, +conditional sampling and conditional training. The results demonstrate that +both methods can generate realistic images while adhering to the conditioning +segmentation. The conditional training method outperforms the conditional +sampling method and the conventional U-Net. + +
+
+
+
+
+ + ☆ Is CLIP the main roadblock for fine-grained open-world perception? + + +
+ Modern applications increasingly demand flexible computer vision models that +adapt to novel concepts not encountered during training. This necessity is +pivotal in emerging domains like extended reality, robotics, and autonomous +driving, which require the ability to respond to open-world stimuli. A key +ingredient is the ability to identify objects based on free-form textual +queries defined at inference time - a task known as open-vocabulary object +detection. Multimodal backbones like CLIP are the main enabling technology for +current open-world perception solutions. Despite performing well on generic +queries, recent studies highlighted limitations on the fine-grained recognition +capabilities in open-vocabulary settings - i.e., for distinguishing subtle +object features like color, shape, and material. In this paper, we perform a +detailed examination of these open-vocabulary object recognition limitations to +find the root cause. We evaluate the performance of CLIP, the most commonly +used vision-language backbone, against a fine-grained object-matching +benchmark, revealing interesting analogies between the limitations of +open-vocabulary object detectors and their backbones. Experiments suggest that +the lack of fine-grained understanding is caused by the poor separability of +object characteristics in the CLIP latent space. Therefore, we try to +understand whether fine-grained knowledge is present in CLIP embeddings but not +exploited at inference time due, for example, to the unsuitability of the +cosine similarity matching function, which may discard important object +characteristics. Our preliminary experiments show that simple CLIP latent-space +re-projections help separate fine-grained concepts, paving the way towards the +development of backbones inherently able to process fine-grained details. The +code for reproducing these experiments is available at +https://github.com/lorebianchi98/FG-CLIP. + +
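As one concrete, purely illustrative example of a latent-space re-projection, the snippet below whitens CLIP embeddings before cosine matching; this is an assumed stand-in for the re-projection studied in the paper, not its actual transform.

```python
# Hypothetical whitening re-projection of CLIP embeddings prior to cosine matching.
import numpy as np
from sklearn.decomposition import PCA

def whiten_and_normalise(reference_embeddings, query_embeddings):
    """Fit a whitening PCA on reference embeddings, project queries, re-normalise."""
    pca = PCA(whiten=True).fit(reference_embeddings)
    z = pca.transform(query_embeddings)
    return z / np.linalg.norm(z, axis=1, keepdims=True)

# With image_emb and text_emb of shape (N, 512) extracted from CLIP beforehand:
#   img_w = whiten_and_normalise(image_emb, image_emb)
#   txt_w = whiten_and_normalise(image_emb, text_emb)
#   scores = img_w @ txt_w.T     # cosine similarities in the re-projected space
```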
+
+
+
+
+ + ☆ If It's Not Enough, Make It So: Reducing Authentic Data Demand in Face + Recognition through Synthetic Faces + + +
+ Recent advances in deep face recognition have spurred a growing demand for +large, diverse, and manually annotated face datasets. Acquiring authentic, +high-quality data for face recognition has proven to be a challenge, primarily +due to privacy concerns. Large face datasets are primarily sourced from +web-based images, lacking explicit user consent. In this paper, we examine +whether and how synthetic face data can be used to train effective face +recognition models with reduced reliance on authentic images, thereby +mitigating data collection concerns. First, we explored the performance gap +among recent state-of-the-art face recognition models, trained with synthetic +data only and authentic (scarce) data only. Then, we deepened our analysis by +training a state-of-the-art backbone with various combinations of synthetic and +authentic data, gaining insights into optimizing the limited use of the latter +for verification accuracy. Finally, we assessed the effectiveness of data +augmentation approaches on synthetic and authentic data, with the same goal in +mind. Our results highlighted the effectiveness of FR trained on combined +datasets, particularly when combined with appropriate augmentation techniques. + +
+
+ comment: Accepted as a full paper at FG 2024 main track +
+
+
+
+
+ + ☆ COMO: Compact Mapping and Odometry + + +
+ We present COMO, a real-time monocular mapping and odometry system that +encodes dense geometry via a compact set of 3D anchor points. Decoding anchor +point projections into dense geometry via per-keyframe depth covariance +functions guarantees that depth maps are joined together at visible anchor +points. The representation enables joint optimization of camera poses and dense +geometry, intrinsic 3D consistency, and efficient second-order inference. To +maintain a compact yet expressive map, we introduce a frontend that leverages +the covariance function for tracking and initializing potentially visually +indistinct 3D points across frames. Altogether, we introduce a real-time system +capable of estimating accurate poses and consistent geometry. + +
+
+
+
+
+ + ☆ HAPNet: Toward Superior RGB-Thermal Scene Parsing via Hybrid, + Asymmetric, and Progressive Heterogeneous Feature Fusion + + +
+ Data-fusion networks have shown significant promise for RGB-thermal scene +parsing. However, the majority of existing studies have relied on symmetric +duplex encoders for heterogeneous feature extraction and fusion, paying +inadequate attention to the inherent differences between RGB and thermal +modalities. Recent progress in vision foundation models (VFMs) trained through +self-supervision on vast amounts of unlabeled data has proven their ability to +extract informative, general-purpose features. However, this potential has yet +to be fully leveraged in the domain. In this study, we take one step toward +this new research area by exploring a feasible strategy to fully exploit VFM +features for RGB-thermal scene parsing. Specifically, we delve deeper into the +unique characteristics of RGB and thermal modalities, thereby designing a +hybrid, asymmetric encoder that incorporates both a VFM and a convolutional +neural network. This design allows for more effective extraction of +complementary heterogeneous features, which are subsequently fused in a +dual-path, progressive manner. Moreover, we introduce an auxiliary task to +further enrich the local semantics of the fused features, thereby improving the +overall performance of RGB-thermal scene parsing. Our proposed HAPNet, equipped +with all these components, demonstrates superior performance compared to all +other state-of-the-art RGB-thermal scene parsing networks, achieving top ranks +across three widely used public RGB-thermal scene parsing datasets. We believe +this new paradigm has opened up new opportunities for future developments in +data-fusion scene parsing approaches. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ SDPose: Tokenized Pose Estimation via Circulation-Guide + Self-Distillation CVPR 2024 + + +
+ Recently, transformer-based methods have achieved state-of-the-art prediction +quality on human pose estimation (HPE). Nonetheless, most of these +top-performing transformer-based models are too computation-consuming and +storage-demanding to deploy on edge computing platforms. Those +transformer-based models that require fewer resources are prone to +under-fitting due to their smaller scale and thus perform notably worse than +their larger counterparts. Given this conundrum, we introduce SDPose, a new +self-distillation method for improving the performance of small +transformer-based models. To mitigate the problem of under-fitting, we design a +transformer module named Multi-Cycled Transformer (MCT) based on multiple-cycled +forwards to more fully exploit the potential of small model parameters. +Further, in order to avoid the additional inference cost brought +by MCT, we introduce a self-distillation scheme, extracting the knowledge from +the MCT module to a naive forward model. Specifically, on the MSCOCO validation +dataset, SDPose-T obtains 69.7% mAP with 4.4M parameters and 1.8 GFLOPs. +Furthermore, SDPose-S-V2 obtains 73.5% mAP on the MSCOCO validation dataset +with 6.2M parameters and 4.7 GFLOPs, achieving a new state-of-the-art among +predominant tiny neural network methods. Our code is available at +https://github.com/MartyrPenink/SDPose. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ DQ-DETR: DETR with Dynamic Query for Tiny Object Detection + + +
+ Despite previous DETR-like methods having performed successfully in generic +object detection, tiny object detection is still a challenging task for them +since the positional information of object queries is not customized for +detecting tiny objects, whose scale is extraordinarily smaller than general +objects. Also, DETR-like methods using a fixed number of queries make them +unsuitable for aerial datasets, which only contain tiny objects, and the +numbers of instances are imbalanced between different images. Thus, we present +a simple yet effective model, named DQ-DETR, which consists of three different +components: categorical counting module, counting-guided feature enhancement, +and dynamic query selection to solve the above-mentioned problems. DQ-DETR uses +the prediction and density maps from the categorical counting module to +dynamically adjust the number of object queries and improve the positional +information of queries. Our model DQ-DETR outperforms previous CNN-based and +DETR-like methods, achieving state-of-the-art mAP 30.2% on the AI-TOD-V2 +dataset, which mostly consists of tiny objects. + +
+
+
+
+
+ + ☆ AdaGlimpse: Active Visual Exploration with Arbitrary Glimpse Position + and Scale + + +
+ Active Visual Exploration (AVE) is a task that involves dynamically selecting +observations (glimpses), which is critical to facilitate comprehension and +navigation within an environment. While modern AVE methods have demonstrated +impressive performance, they are constrained to fixed-scale glimpses from rigid +grids. In contrast, existing mobile platforms equipped with optical zoom +capabilities can capture glimpses of arbitrary positions and scales. To address +this gap between software and hardware capabilities, we introduce AdaGlimpse. +It uses Soft Actor-Critic, a reinforcement learning algorithm tailored for +exploration tasks, to select glimpses of arbitrary position and scale. This +approach enables our model to rapidly establish a general awareness of the +environment before zooming in for detailed analysis. Experimental results +demonstrate that AdaGlimpse surpasses previous methods across various visual +tasks while maintaining greater applicability in realistic AVE scenarios. + +
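The arbitrary-position/scale glimpse itself is easy to picture: an action is a continuous triple (centre x, centre y, scale) and the observation is a crop resized to a fixed resolution. The sketch below shows only this cropping step; the Soft Actor-Critic policy that chooses the action, and the exact action parameterization used by AdaGlimpse, are not reproduced.

```python
# Illustrative glimpse extraction for a continuous (cx, cy, scale) action in [0, 1]^3.
import torch
import torch.nn.functional as F

def extract_glimpse(image, action, out_size=32, min_frac=0.05, max_frac=0.5):
    """image: (B, C, H, W); returns a square crop resized to out_size x out_size."""
    _, _, H, W = image.shape
    cx, cy, scale = action
    half = min_frac + (max_frac - min_frac) * scale      # glimpse half-size as an image fraction
    x0, x1 = int((cx - half) * W), int((cx + half) * W)
    y0, y1 = int((cy - half) * H), int((cy + half) * H)
    x0, y0 = max(x0, 0), max(y0, 0)
    x1, y1 = min(x1, W), min(y1, H)
    crop = image[:, :, y0:y1, x0:x1]
    return F.interpolate(crop, size=(out_size, out_size),
                         mode="bilinear", align_corners=False)
```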
+
+
+
+
+ + ☆ Towards Automated Movie Trailer Generation CVPR 2024 + + +
+ Movie trailers are an essential tool for promoting films and attracting +audiences. However, the process of creating trailers can be time-consuming and +expensive. To streamline this process, we propose an automatic trailer +generation framework that generates plausible trailers from a full movie by +automating shot selection and composition. Our approach draws inspiration from +machine translation techniques and models the movies and trailers as sequences +of shots, thus formulating the trailer generation problem as a +sequence-to-sequence task. We introduce Trailer Generation Transformer (TGT), a +deep-learning framework utilizing an encoder-decoder architecture. TGT movie +encoder is tasked with contextualizing each movie shot representation via +self-attention, while the autoregressive trailer decoder predicts the feature +representation of the next trailer shot, accounting for the relevance of shots' +temporal order in trailers. Our TGT significantly outperforms previous methods +on a comprehensive suite of metrics. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Performance of computer vision algorithms for fine-grained + classification using crowdsourced insect images + + +
+ With fine-grained classification, we identify unique characteristics to +distinguish among classes of the same super-class. We are focusing on species +recognition in Insecta, as they are critical for biodiversity monitoring and at +the base of many ecosystems. With citizen science campaigns, billions of images +are collected in the wild. Once these are labelled, experts can use them to +create distribution maps. However, the labelling process is time-consuming, +which is where computer vision comes in. The field of computer vision offers a +wide range of algorithms, each with its strengths and weaknesses; how do we +identify the algorithm that is in line with our application? To answer this +question, we provide a full and detailed evaluation of nine algorithms among +deep convolutional networks (CNN), vision transformers (ViT), and +locality-based vision transformers (LBVT) on 4 different aspects: +classification performance, embedding quality, computational cost, and gradient +activity. We offer insights that were not previously available in this domain, showing to +what extent these algorithms solve the fine-grained tasks in Insecta. We found +that the ViT performs the best on inference speed and computational cost while +the LBVT outperforms the others on performance and embedding quality; the CNNs +provide a trade-off among the metrics. + +
+
+
+
+
+ + ☆ You Only Scan Once: A Dynamic Scene Reconstruction Pipeline for 6-DoF + Robotic Grasping of Novel Objects ICRA 2024 + + +
+ In the realm of robotic grasping, achieving accurate and reliable +interactions with the environment is a pivotal challenge. Traditional grasp +planning methods utilizing partial point clouds derived from depth +images often suffer from reduced scene understanding due to occlusion, +ultimately impeding their grasping accuracy. Furthermore, scene reconstruction +methods have primarily relied upon static techniques, which are susceptible to +environmental changes during the manipulation process, limiting their efficacy in +real-time grasping tasks. To address these limitations, this paper introduces a +novel two-stage pipeline for dynamic scene reconstruction. In the first stage, +our approach takes scene scanning as input to register each target object with +mesh reconstruction and novel object pose tracking. In the second stage, pose +tracking is still performed to provide object poses in real-time, enabling our +approach to transform the reconstructed object point clouds back into the +scene. Unlike conventional methodologies, which rely on static scene snapshots, +our method continuously captures the evolving scene geometry, resulting in a +comprehensive and up-to-date point cloud representation. By circumventing the +constraints posed by occlusion, our method enhances the overall grasp planning +process and empowers state-of-the-art 6-DoF robotic grasping algorithms to +exhibit markedly improved accuracy. + +
+
+ comment: ICRA 2024 +
+
+
+
+
+ + ☆ How Much Data are Enough? Investigating Dataset Requirements for + Patch-Based Brain MRI Segmentation Tasks + + +
+ Training deep neural networks reliably requires access to large-scale +datasets. However, obtaining such datasets can be challenging, especially in +the context of neuroimaging analysis tasks, where the cost associated with +image acquisition and annotation can be prohibitive. To mitigate both the time +and financial costs associated with model development, a clear understanding of +the amount of data required to train a satisfactory model is crucial. This +paper focuses on an early phase of deep learning research, prior to model +development, and proposes a strategic framework for estimating the amount of +annotated data required to train patch-based segmentation networks. This +framework includes the establishment of performance expectations using a novel +Minor Boundary Adjustment for Threshold (MinBAT) method, and standardizing +patch selection through the ROI-based Expanded Patch Selection (REPS) method. +Our experiments demonstrate that tasks involving regions of interest (ROIs) +with different sizes or shapes may yield variably acceptable Dice Similarity +Coefficient (DSC) scores. By setting an acceptable DSC as the target, the +required amount of training data can be estimated and even predicted as data +accumulates. This approach could assist researchers and engineers in estimating +the cost associated with data collection and annotation when defining a new +segmentation task based on deep neural networks, ultimately contributing to +their efficient translation to real-world applications. + +
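The "estimate and even predict as data accumulates" idea can be illustrated by fitting a saturating learning curve to pilot results; the pilot numbers, the power-law form, and the target DSC below are all assumptions for illustration and do not reproduce MinBAT or REPS.

```python
# Hypothetical learning-curve extrapolation of DSC versus number of annotated cases.
import numpy as np
from scipy.optimize import curve_fit

def learning_curve(n, a, b, c):
    return a - b * n ** (-c)          # DSC saturates toward a as data grows

n_cases = np.array([10, 20, 40, 80])            # assumed pilot dataset sizes
dsc = np.array([0.62, 0.70, 0.76, 0.80])        # assumed pilot DSC scores

(a, b, c), _ = curve_fit(learning_curve, n_cases, dsc, p0=[0.9, 1.0, 0.5], maxfev=10000)

target_dsc = 0.85                                # acceptable DSC chosen up front
required_n = (b / (a - target_dsc)) ** (1 / c) if a > target_dsc else float("inf")
print(f"estimated cases needed for DSC >= {target_dsc}: {required_n:.0f}")
```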
+
+
+
+
+ + ☆ SP$^2$OT: Semantic-Regularized Progressive Partial Optimal Transport for + Imbalanced Clustering + + +
+ Deep clustering, which learns representation and semantic clustering without +label information, poses a great challenge for deep learning-based approaches. +Despite significant progress in recent years, most existing methods focus on +uniformly distributed datasets, significantly limiting the practical +applicability of their methods. In this paper, we propose a more practical +problem setting named deep imbalanced clustering, where the underlying classes +exhibit an imbalanced distribution. To address this challenge, we introduce a +novel optimal transport-based pseudo-label learning framework. Our framework +formulates pseudo-label generation as a Semantic-regularized Progressive +Partial Optimal Transport (SP$^2$OT) problem, which progressively transports +each sample to imbalanced clusters under several prior distribution and +semantic relation constraints, thus generating high-quality and imbalance-aware +pseudo-labels. To solve SP$^2$OT, we develop a Majorization-Minimization-based +optimization algorithm. To be more precise, we employ the strategy of +majorization to reformulate the SP$^2$OT problem into a Progressive Partial +Optimal Transport problem, which can be transformed into an unbalanced optimal +transport problem with augmented constraints and can be solved efficiently by a +fast matrix scaling algorithm. Experiments on various datasets, including a +human-curated long-tailed CIFAR100, challenging ImageNet-R, and large-scale +subsets of fine-grained iNaturalist2018 datasets, demonstrate the superiority +of our method. + +
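To give a flavour of "pseudo-labels via fast matrix scaling", here is a plain Sinkhorn-style assignment with an imbalanced cluster prior; the semantic regularization and progressive partial transport that define SP$^2$OT are deliberately omitted, so treat this only as a simplified stand-in.

```python
# Simplified stand-in: transport N samples to K clusters with prescribed
# (possibly imbalanced) cluster marginals via iterative matrix scaling.
import torch

def sinkhorn_pseudo_labels(scores, cluster_prior, n_iters=50, eps=0.05):
    """scores: (N, K) sample-to-cluster similarities; cluster_prior: (K,), sums to 1."""
    N, K = scores.shape
    Q = torch.exp(scores / eps)
    Q = Q / Q.sum()
    for _ in range(n_iters):
        Q = Q * (cluster_prior / Q.sum(dim=0, keepdim=True))    # match cluster marginals
        Q = Q * ((1.0 / N) / Q.sum(dim=1, keepdim=True))        # match sample marginals
    return Q / Q.sum(dim=1, keepdim=True)                       # row-normalised soft labels
```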
+
+ comment: under review. arXiv admin note: substantial text overlap with + arXiv:2401.09266 +
+
+
+
+
+ + ☆ Part-Attention Based Model Make Occluded Person Re-Identification + Stronger + + +
+ The goal of occluded person re-identification (ReID) is to retrieve specific +pedestrians in occluded situations. However, occluded person ReID still suffers +from background clutter and low-quality local feature representations, which +limits model performance. In our research, we introduce a new framework called +PAB-ReID, which is a novel ReID model incorporating part-attention mechanisms +to tackle the aforementioned issues effectively. Firstly, we introduce the +human parsing label to guide the generation of more accurate human part +attention maps. In addition, we propose a fine-grained feature focuser for +generating fine-grained human local feature representations while suppressing +background interference. Moreover, we also design a part triplet loss to +supervise the learning of human local features, which optimizes +intra/inter-class distance. We conducted extensive experiments on specialized +occlusion and regular ReID datasets, showcasing that our approach outperforms +the existing state-of-the-art methods. + +
+
+ comment: Accepted By International Joint Conference on Neural Networks +
+
+
+
+
+ + ☆ ChangeMamba: Remote Sensing Change Detection with Spatio-Temporal State + Space Model + + +
+ Convolutional neural networks (CNN) and Transformers have made impressive +progress in the field of remote sensing change detection (CD). However, both +architectures have their inherent shortcomings. Recently, the Mamba +architecture, based on state space models, has shown remarkable performance +in a series of natural language processing tasks, which can effectively +compensate for the shortcomings of the above two architectures. In this paper, +we explore for the first time the potential of the Mamba architecture for +remote sensing change detection tasks. We tailor the corresponding frameworks, +called MambaBCD, MambaSCD, and MambaBDA, for binary change detection (BCD), +semantic change detection (SCD), and building damage assessment (BDA), +respectively. All three frameworks adopt the cutting-edge visual Mamba +architecture as the encoder, which allows full learning of global spatial +contextual information from the input images. For the change decoder, which is +available in all three architectures, we propose three spatio-temporal +relationship modeling mechanisms, which can be naturally combined with the +Mamba architecture and fully utilize its attribute to achieve spatio-temporal +interaction of multi-temporal features and obtain accurate change information. +On five benchmark datasets, our proposed frameworks outperform current CNN- and +Transformer-based approaches without using any complex strategies or tricks, +fully demonstrating the potential of the Mamba architecture. Specifically, we +obtained 83.11%, 88.39% and 94.19% F1 scores on the three BCD datasets SYSU, +LEVIR-CD+, and WHU-CD; on the SCD dataset SECOND, we obtained 24.04% SeK; and +on the xBD dataset, we obtained 81.41% overall F1 score. The source code will +be available in https://github.com/ChenHongruixuan/MambaCD + +
+
+
+
+
+ + ☆ Generalizable 3D Scene Reconstruction via Divide and Conquer from a + Single View + + +
+ Single-view 3D reconstruction is currently approached from two dominant +perspectives: reconstruction of scenes with limited diversity using 3D data +supervision or reconstruction of diverse singular objects using large image +priors. However, real-world scenarios are far more complex and exceed the +capabilities of these methods. We therefore propose a hybrid method following a +divide-and-conquer strategy. We first process the scene holistically, +extracting depth and semantic information, and then leverage a single-shot +object-level method for the detailed reconstruction of individual components. +By following a compositional processing approach, the overall framework +achieves full reconstruction of complex 3D scenes from a single image. We +purposely design our pipeline to be highly modular by carefully integrating +specific procedures for each processing step, without requiring an end-to-end +training of the whole system. This enables the pipeline to naturally improve as +future methods can replace the individual modules. We demonstrate the +reconstruction performance of our approach on both synthetic and real-world +scenes, comparing favorably against prior works. Project page: +https://andreeadogaru.github.io/Gen3DSR. + +
+
+
+
+
+ + ☆ NMF-Based Analysis of Mobile Eye-Tracking Data + + +
+ The depiction of scanpaths from mobile eye-tracking recordings by thumbnails +from the stimulus allows the application of visual computing to detect areas of +interest in an unsupervised way. We suggest using nonnegative matrix +factorization (NMF) to identify such areas in stimuli. For a user-defined +integer k, NMF produces an explainable decomposition into k components, each +consisting of a spatial representation associated with a temporal indicator. In +the context of multiple eye-tracking recordings, this leads to k spatial +representations, where the temporal indicator highlights the appearance within +recordings. The choice of k provides an opportunity to control the refinement +of the decomposition, i.e., the number of areas to detect. We combine our +NMF-based approach with visualization techniques to enable an exploratory +analysis of multiple recordings. Finally, we demonstrate the usefulness of our +approach with mobile eye-tracking data of an art gallery. + +
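A minimal sketch of the NMF step, assuming the recordings have already been rasterised into a nonnegative matrix of fixation densities; the placeholder data and grid size are assumptions.

```python
# Decompose a (pixels x time-bins) fixation-density matrix into k spatial
# components with associated temporal indicators.
import numpy as np
from sklearn.decomposition import NMF

grid = 64
X = np.random.rand(grid * grid, 500)         # placeholder nonnegative data matrix
k = 5                                        # user-defined number of components

model = NMF(n_components=k, init="nndsvda", max_iter=500)
W = model.fit_transform(X)                   # spatial representations, (pixels, k)
H = model.components_                        # temporal indicators,     (k, time bins)

areas_of_interest = W.T.reshape(k, grid, grid)   # one candidate area per component
```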
+
+
+
+
+ + ☆ Future Predictive Success-or-Failure Classification for Long-Horizon + Robotic Tasks IJCNN 2024 + + +
+ Automating long-horizon tasks with a robotic arm has been a central research +topic in robotics. Optimization-based action planning is an efficient approach +for creating an action plan to complete a given task. Construction of a +reliable planning method requires a design process of conditions, e.g., to +avoid collision between objects. The design process, however, has two critical +issues: 1) iterative trials--the design process is time-consuming due to the +trial-and-error process of modifying conditions, and 2) manual redesign--it is +difficult to cover all the necessary conditions manually. To tackle these +issues, this paper proposes a future-predictive +success-or-failure-classification method to obtain conditions automatically. +The key idea behind the proposed method is an end-to-end approach for +determining whether the action plan can complete a given task instead of +manually redesigning the conditions. The proposed method uses a long-horizon +future-prediction method to enable success-or-failure classification without +the execution of an action plan. This paper also proposes a regularization term +called transition consistency regularization to provide easy-to-predict feature +distribution. The regularization term improves future prediction and +classification performance. The effectiveness of our method is demonstrated +through classification and robotic-manipulation experiments. + +
+
+ comment: IJCNN 2024 +
+
+
+
+
+ + ☆ MiniGPT4-Video: Advancing Multimodal LLMs for Video Understanding with + Interleaved Visual-Textual Tokens + + +
+ This paper introduces MiniGPT4-Video, a multimodal Large Language Model (LLM) +designed specifically for video understanding. The model is capable of +processing both temporal visual and textual data, making it adept at +understanding the complexities of videos. Building upon the success of +MiniGPT-v2, which excelled in translating visual features into the LLM space +for single images and achieved impressive results on various image-text +benchmarks, this paper extends the model's capabilities to process a sequence +of frames, enabling it to comprehend videos. MiniGPT4-Video not only +considers visual content but also incorporates textual conversations, allowing +the model to effectively answer queries involving both visual and text +components. The proposed model outperforms existing state-of-the-art methods, +registering gains of 4.22%, 1.13%, 20.82%, and 13.1% on the MSVD, MSRVTT, TGIF, +and TVQA benchmarks respectively. Our models and code have been made publicly +available at https://vision-cair.github.io/MiniGPT4-video/ + +
+
+ comment: 6 pages,8 figures +
+
+
+
+
+ + ☆ AIGIQA-20K: A Large Database for AI-Generated Image Quality Assessment + + +
+ With the rapid advancements in AI-Generated Content (AIGC), AI-Generated +Images (AIGIs) have been widely applied in entertainment, education, and social +media. However, due to the significant variance in quality among different +AIGIs, there is an urgent need for models that consistently match human +subjective ratings. To address this issue, we organized a challenge towards +AIGC quality assessment at NTIRE 2024 that extensively considers 15 popular +generative models, utilizing dynamic hyper-parameters (including +classifier-free guidance, iteration epochs, and output image resolution), and +gathered subjective scores that comprehensively consider both perceptual quality and text-to-image +alignment, involving 21 subjects. This approach +culminates in the creation of the largest fine-grained AIGI subjective quality +database to date with 20,000 AIGIs and 420,000 subjective ratings, known as +AIGIQA-20K. Furthermore, we conduct benchmark experiments on this database to +assess the correspondence between 16 mainstream AIGI quality models and human +perception. We anticipate that this large-scale quality database will inspire +robust quality indicators for AIGIs and propel the evolution of AIGC for +vision. The database is released at +https://www.modelscope.cn/datasets/lcysyzxdxc/AIGCQA-30K-Image. + +
+
+
+
+
+ + ☆ Scaling Up Video Summarization Pretraining with Large Language Models CVPR 2024 + + +
+ Long-form video content constitutes a significant portion of internet +traffic, making automated video summarization an essential research problem. +However, existing video summarization datasets are notably limited in their +size, constraining the effectiveness of state-of-the-art methods for +generalization. Our work aims to overcome this limitation by capitalizing on +the abundance of long-form videos with dense speech-to-video alignment and the +remarkable capabilities of recent large language models (LLMs) in summarizing +long text. We introduce an automated and scalable pipeline for generating a +large-scale video summarization dataset using LLMs as Oracle summarizers. By +leveraging the generated dataset, we analyze the limitations of existing +approaches and propose a new video summarization model that effectively +addresses them. To facilitate further research in the field, our work also +presents a new benchmark dataset that contains 1200 long videos each with +high-quality summaries annotated by professionals. Extensive experiments +clearly indicate that our proposed approach sets a new state-of-the-art in +video summarization across several benchmarks. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Background Noise Reduction of Attention Map for Weakly Supervised + Semantic Segmentation + + +
+ In weakly-supervised semantic segmentation (WSSS) using only image-level +class labels, a problem with CNN-based Class Activation Maps (CAM) is that they +tend to activate the most discriminative local regions of objects. On the other +hand, methods based on Transformers learn global features but suffer from the +issue of background noise contamination. This paper focuses on addressing the +issue of background noise in attention weights within the existing WSSS method +based on Conformer, known as TransCAM. The proposed method successfully reduces +background noise, leading to improved accuracy of pseudo labels. Experimental +results demonstrate that our model achieves segmentation performance of 70.5% +on the PASCAL VOC 2012 validation data, 71.1% on the test data, and 45.9% on MS +COCO 2014 data, outperforming TransCAM in terms of segmentation performance. + +
+
+
+
+
+ + ☆ Two Tricks to Improve Unsupervised Segmentation Learning + + +
+ We present two practical improvement techniques for unsupervised segmentation +learning. These techniques address limitations in the resolution and accuracy +of predicted segmentation maps of recent state-of-the-art methods. Firstly, we +leverage image post-processing techniques such as guided filtering to refine +the output masks, improving accuracy while avoiding substantial computational +costs. Secondly, we introduce a multi-scale consistency criterion, based on a +teacher-student training scheme. This criterion matches segmentation masks +predicted from regions of the input image extracted at different resolutions to +each other. Experimental results on several benchmarks used in unsupervised +segmentation learning demonstrate the effectiveness of our proposed techniques. + +
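A minimal sketch of the second trick, the multi-scale consistency criterion, assuming `student` and `teacher` are segmentation networks that output per-pixel class probabilities; the crop coordinates, the MSE loss choice, and the upscaling factor are illustrative assumptions.

```python
# Hypothetical multi-scale consistency loss: a region predicted at higher
# resolution by the student should match the teacher's full-image prediction.
import torch
import torch.nn.functional as F

def multiscale_consistency(student, teacher, image, crop=(64, 64, 128, 128)):
    y0, x0, y1, x1 = crop
    full_pred = teacher(image)                               # (B, C, H, W) probabilities
    region_target = full_pred[:, :, y0:y1, x0:x1].detach()   # teacher target for the crop

    region = image[:, :, y0:y1, x0:x1]
    region_up = F.interpolate(region, scale_factor=2, mode="bilinear",
                              align_corners=False)           # re-predict at finer scale
    region_pred = student(region_up)
    region_pred = F.interpolate(region_pred, size=region_target.shape[-2:],
                                mode="bilinear", align_corners=False)
    return F.mse_loss(region_pred, region_target)
```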
+
+
+
+
+ + ☆ LongVLM: Efficient Long Video Understanding via Large Language Models + + +
+ Empowered by Large Language Models (LLMs), recent advancements in VideoLLMs +have driven progress in various video understanding tasks. These models encode +video representations through pooling or query aggregation over a vast number +of visual tokens, making computational and memory costs affordable. Despite +successfully providing an overall comprehension of video content, existing +VideoLLMs still face challenges in achieving detailed understanding in videos +due to overlooking local information in long-term videos. To tackle this +challenge, we introduce LongVLM, a straightforward yet powerful VideoLLM for +long video understanding, building upon the observation that long videos often +consist of sequential key events, complex actions, and camera movements. Our +approach proposes to decompose long videos into multiple short-term segments +and encode local features for each local segment via a hierarchical token +merging module. These features are concatenated in temporal order to maintain +the storyline across sequential short-term segments. Additionally, we propose +to integrate global semantics into each local feature to enhance context +understanding. In this way, we encode video representations that incorporate +both local and global information, enabling the LLM to generate comprehensive +responses for long-term videos. Experimental results on the VideoChatGPT +benchmark and zero-shot video question-answering datasets demonstrate the +superior capabilities of our model over the previous state-of-the-art methods. +Qualitative examples demonstrate that our model produces more precise responses +for long videos understanding. Code is available at +\url{https://github.com/ziplab/LongVLM}. + +
+
+
+
+
+ + ☆ VF-NeRF: Viewshed Fields for Rigid NeRF Registration + + +
+ 3D scene registration is a fundamental problem in computer vision that seeks +the best 6-DoF alignment between two scenes. This problem was extensively +investigated in the case of point clouds and meshes, but there has been +relatively limited work regarding Neural Radiance Fields (NeRF). In this paper, +we consider the problem of rigid registration between two NeRFs when the +position of the original cameras is not given. Our key novelty is the +introduction of Viewshed Fields (VF), an implicit function that determines, for +each 3D point, how likely it is to be viewed by the original cameras. We +demonstrate how VF can help in the various stages of NeRF registration, with an +extensive evaluation showing that VF-NeRF achieves SOTA results on various +datasets with different capturing approaches such as LLFF and Objaverse. + +
+
+
+
+
+ + ☆ Meta Invariance Defense Towards Generalizable Robustness to Unknown + Adversarial Attacks + + +
+ Despite providing high-performance solutions for computer vision tasks, deep
+ neural network (DNN) models have been shown to be extremely vulnerable to
+ adversarial attacks. Current defenses mainly focus on known attacks, while
+ adversarial robustness to unknown attacks is seriously overlooked. Moreover,
+ the commonly used adaptive-learning and fine-tuning techniques are unsuitable
+ for adversarial defense, since the problem is essentially zero-shot at
+ deployment time. To tackle this challenge, we propose an attack-agnostic
+ defense method named Meta Invariance Defense (MID). Specifically, various
+ combinations of adversarial attacks are randomly sampled from a manually
+ constructed Attacker Pool to constitute different defense tasks against unknown
+ attacks, in which a student encoder is supervised by multi-consistency
+ distillation to learn attack-invariant features via a meta principle. The
+ proposed MID has two merits: 1) full distillation at the pixel, feature, and
+ prediction levels between benign and adversarial samples facilitates the
+ discovery of attack-invariance; 2) the model simultaneously achieves robustness
+ to imperceptible adversarial perturbations in high-level image classification
+ and attack suppression in low-level robust image regeneration. Theoretical and
+ empirical studies on numerous benchmarks such as ImageNet verify the
+ generalizable robustness and superiority of MID under various attacks.
+
+
+ comment: Accepted by IEEE TPAMI in 2024 +
+
+
+
+
+ + ☆ DI-Retinex: Digital-Imaging Retinex Theory for Low-Light Image + Enhancement + + +
+ Many existing methods for low-light image enhancement (LLIE) based on Retinex
+ theory ignore important factors that affect the validity of this theory in
+ digital imaging, such as noise, quantization error, non-linearity, and dynamic
+ range overflow. In this paper, we propose a new expression called
+ Digital-Imaging Retinex theory (DI-Retinex) through theoretical and
+ experimental analysis of Retinex theory in digital imaging. Our new expression
+ includes an offset term in the enhancement model, which allows for pixel-wise
+ brightness contrast adjustment with a non-linear mapping function. In addition,
+ to solve the low-light enhancement problem in an unsupervised manner, we
+ propose an image-adaptive masked reverse degradation loss in Gamma space. We
+ also design a variance suppression loss for regulating the additional offset
+ term. Extensive experiments show that our proposed method outperforms all
+ existing unsupervised methods in terms of visual quality, model size, and
+ speed. Our algorithm can also assist downstream face detectors in low-light
+ conditions, as it yields the largest performance gain after enhancement
+ compared to other methods.
+
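+
+ An illustrative sketch of an enhancement model with a pixel-wise gain and an
+ additional offset term, in the spirit of the abstract above. The small CNN
+ predicting the two maps, the activations, and the value ranges are assumptions,
+ not the authors' architecture.
+
+     import torch
+     import torch.nn as nn
+     import torch.nn.functional as F
+
+     class GainOffsetEnhancer(nn.Module):
+         def __init__(self):
+             super().__init__()
+             self.net = nn.Sequential(
+                 nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
+                 nn.Conv2d(16, 6, 3, padding=1),   # 3 gain channels + 3 offset channels
+             )
+
+         def forward(self, x):                      # x in [0, 1]
+             gain, offset = self.net(x).chunk(2, dim=1)
+             gain = F.softplus(gain) + 1.0          # brightness-increasing gain map
+             offset = torch.tanh(offset)            # bounded pixel-wise offset term
+             return (gain * x + offset).clamp(0.0, 1.0)
+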
+
+
+
+
+ + ☆ Sparse Concept Bottleneck Models: Gumbel Tricks in Contrastive Learning + + +
+ We propose a novel architecture and method for explainable classification with
+ Concept Bottleneck Models (CBMs). While SOTA approaches to the image
+ classification task work as a black box, there is a growing demand for models
+ that provide interpretable results. Such models often learn to predict the
+ distribution over class labels using an additional description of the target
+ instances, called concepts. However, existing bottleneck methods have a number
+ of limitations: their accuracy is lower than that of a standard model, and CBMs
+ require an additional set of concepts to leverage. We provide a framework for
+ creating a Concept Bottleneck Model from a pre-trained multi-modal encoder and
+ new CLIP-like architectures. By introducing a new type of layer known as
+ Concept Bottleneck Layers, we outline three methods for training them: with an
+ $\ell_1$ loss, with a contrastive loss, and with a loss function based on the
+ Gumbel-Softmax distribution (Sparse-CBM), while the final FC layer is still
+ trained with cross-entropy. We show a significant increase in accuracy when
+ using sparse hidden layers in CLIP-based bottleneck models, which indicates
+ that a sparse representation of the concept activation vector is meaningful in
+ Concept Bottleneck Models. Moreover, with our Concept Matrix Search algorithm
+ we can improve CLIP predictions on complex datasets without any additional
+ training or fine-tuning. The code is available at:
+ https://github.com/Andron00e/SparseCBM.
+
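+
+ A hedged sketch of a concept bottleneck layer on top of frozen CLIP-style
+ features, with a Gumbel-Softmax applied to the concept activations and a final
+ FC layer trained with cross-entropy. The concept set, temperature, and head
+ sizes are illustrative assumptions, not the Sparse-CBM implementation.
+
+     import torch
+     import torch.nn as nn
+     import torch.nn.functional as F
+
+     class SparseConceptBottleneck(nn.Module):
+         def __init__(self, concept_embeds, num_classes, tau=1.0):
+             super().__init__()
+             # concept_embeds: (K, D) text embeddings of the K concepts, e.g. from CLIP
+             self.register_buffer("concepts", F.normalize(concept_embeds, dim=-1))
+             self.fc = nn.Linear(concept_embeds.shape[0], num_classes)
+             self.tau = tau
+
+         def forward(self, img_embeds):
+             # concept activation vector: cosine similarity to every concept
+             act = F.normalize(img_embeds, dim=-1) @ self.concepts.t()   # (B, K)
+             if self.training:
+                 act = F.gumbel_softmax(act, tau=self.tau, hard=False)   # sparse, differentiable
+             return self.fc(act), act   # class logits (cross-entropy) and concept activations
+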
+
+ comment: 23 pages, 1 algorithm, 36 figures +
+
+
+
+
+ + ☆ AdaBM: On-the-Fly Adaptive Bit Mapping for Image Super-Resolution CVPR 2024 + + +
+ Although the image super-resolution (SR) problem has reached unprecedented
+ restoration accuracy with deep neural networks, its versatile application
+ remains limited due to substantial computational costs. Since different input
+ images for SR pose different restoration difficulties, adapting the
+ computational cost to the input image, referred to as adaptive inference, has
+ emerged as a promising solution to compress SR networks. Specifically, adapting
+ the quantization bit-widths has successfully reduced the inference and memory
+ cost without sacrificing accuracy. However, despite the benefits of the
+ resulting adaptive network, existing works rely on time-intensive
+ quantization-aware training with full access to the original training pairs to
+ learn the appropriate bit allocation policies, which limits their ubiquitous
+ usage. To this end, we introduce the first on-the-fly adaptive quantization
+ framework that accelerates the processing time from hours to seconds. We
+ formulate the bit allocation problem with only two bit mapping modules: one to
+ map the input image to an image-wise bit adaptation factor and one to obtain
+ the layer-wise adaptation factors. These bit mappings are calibrated and
+ fine-tuned using only a small number of calibration images. We achieve
+ competitive performance with previous adaptive quantization methods, while the
+ processing time is accelerated by x2000. Codes are available at
+ https://github.com/Cheeun/AdaBM.
+
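+
+ A rough sketch of the two bit-mapping ideas described above: an image-wise
+ factor derived from a simple image statistic and per-layer factors, combined
+ into integer bit-widths. The complexity proxy, normalization, and bit range are
+ illustrative assumptions, not AdaBM's calibrated modules.
+
+     import torch
+
+     def assign_bits(image, num_layers, min_bits=4, max_bits=8):
+         # image-wise adaptation factor from a crude complexity proxy (pixel std)
+         complexity = image.float().std()
+         img_factor = (complexity / 0.5).clamp(0.0, 1.0)        # assumed normalization
+
+         # layer-wise adaptation factors (calibrated parameters in the real method)
+         layer_factors = torch.linspace(0.2, 1.0, num_layers)
+
+         bits = min_bits + (max_bits - min_bits) * img_factor * layer_factors
+         return bits.round().int()                              # per-layer bit-widths
+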
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Design and Development of a Framework For Stroke-Based Handwritten + Gujarati Font Generation + + +
+ Handwritten font generation is important for preserving cultural heritage and +creating personalized designs. It adds an authentic and expressive touch to +printed materials, making them visually appealing and establishing a stronger +connection with the audience. This paper aims to design a framework for +generating handwritten fonts in the Gujarati script, mimicking the variation of +human handwriting. The proposed font generation model consists of a learning +phase and a generation phase. In the learning phase, Gujarati scripts are +analyzed, and rules for designing each character are formulated. This ruleset +involves the concatenation of strokes in a stroke-based manner, ensuring visual +consistency in the resulting glyphs. The generation phase involves the user +providing a small subset of characters, and the system automatically generates +the remaining character glyphs based on extracted strokes and learned rules, +resulting in handwritten Gujarati fonts. The resulting character glyphs are +converted into an open-type font using the FontForge tool, making them +compatible with any Gujarati editor. Both subjective and objective evaluations +are conducted to assess the synthesized images and fonts. Subjective evaluation +through user studies provides feedback on quality and visual appeal, achieving +an overall accuracy of 84.84%. Notably, eleven characters demonstrated a +success ratio above 90%. Objective evaluation using an existing recognition +system achieves an overall accuracy of 84.28% in OCR evaluation. Notably, +fifteen characters had a success ratio of 80% or higher. + +
+
+ comment: 13 pages, 2 column, 12 figures +
+
+
+
+
+ + ☆ Multi Positive Contrastive Learning with Pose-Consistent Generated + Images + + +
+ Model pre-training has become essential in various recognition tasks.
+ Meanwhile, with the remarkable advancements in image generation models,
+ pre-training methods utilizing generated images have also emerged, given their
+ ability to produce unlimited training data. However, while existing methods
+ utilizing generated images excel in classification, they fall short in more
+ practical tasks such as human pose estimation. In this paper, we demonstrate
+ this experimentally and propose generating visually distinct images with
+ identical human poses. We then propose a novel multi-positive contrastive
+ learning method, which optimally utilizes the generated images to learn
+ structural features of the human body. We term the entire learning pipeline
+ GenPoCCL. Despite using less than 1% of the data required by the current
+ state-of-the-art method, GenPoCCL captures structural features of the human
+ body more effectively, surpassing existing methods in a variety of
+ human-centric perception tasks.
+
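+
+ A minimal multi-positive InfoNCE sketch: all generated views that share the
+ same pose are treated as positives for each other. The batch layout and
+ temperature are assumptions, not the GenPoCCL implementation.
+
+     import torch
+     import torch.nn.functional as F
+
+     def multi_positive_nce(features, pose_ids, temperature=0.1):
+         # features: (N, D) embeddings; pose_ids: (N,) samples with equal id share a pose
+         z = F.normalize(features, dim=-1)
+         logits = z @ z.t() / temperature                        # (N, N)
+         logits.fill_diagonal_(float("-inf"))                    # exclude self-similarity
+         pos_mask = (pose_ids.unsqueeze(0) == pose_ids.unsqueeze(1)).float()
+         pos_mask.fill_diagonal_(0.0)
+
+         log_prob = logits - torch.logsumexp(logits, dim=1, keepdim=True)
+         # average log-likelihood over all positives of each anchor
+         loss = -(pos_mask * log_prob).sum(1) / pos_mask.sum(1).clamp(min=1.0)
+         return loss.mean()
+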
+
+
+
+
+ + ☆ A dataset of primary nasopharyngeal carcinoma MRI with multi-modalities + segmentation + + +
+ Multi-modality magnetic resonance imaging data with various sequences +facilitate the early diagnosis, tumor segmentation, and disease staging in the +management of nasopharyngeal carcinoma (NPC). The lack of publicly available, +comprehensive datasets limits advancements in diagnosis, treatment planning, +and the development of machine learning algorithms for NPC. Addressing this +critical need, we introduce the first comprehensive NPC MRI dataset, +encompassing MR axial imaging of 277 primary NPC patients. This dataset +includes T1-weighted, T2-weighted, and contrast-enhanced T1-weighted sequences, +totaling 831 scans. In addition to the corresponding clinical data, manually +annotated and labeled segmentations by experienced radiologists offer +high-quality data resources from untreated primary NPC. + +
+
+
+
+
+ + ☆ Real-time Noise Source Estimation of a Camera System from an Image and + Metadata + + +
+ Autonomous machines must self-maintain proper functionality to ensure the
+ safety of humans and themselves. This pertains particularly to their cameras,
+ which are predominant sensors for perceiving the environment and supporting
+ actions. A fundamental camera problem addressed in this study is noise.
+ Solutions often focus on denoising images a posteriori, that is, fighting
+ symptoms rather than root causes. However, tackling root causes requires
+ identifying the noise sources, considering the limitations of mobile platforms.
+ This work investigates a real-time, memory-efficient and reliable noise source
+ estimator that combines data- and physically-based models. To this end, a DNN
+ that examines an image with camera metadata for major camera noise sources is
+ built and trained. In addition, it quantifies unexpected factors that impact
+ image noise or metadata. This study investigates seven different estimators on
+ six datasets that include synthetic noise, real-world noise from two camera
+ systems, and real field campaigns. Of these, only the model with the most
+ metadata is capable of accurately and robustly quantifying all individual noise
+ contributions. This method outperforms total image noise estimators and can be
+ deployed plug-and-play. It also serves as a basis for including more advanced
+ noise sources, or as part of an automatic countermeasure feedback loop to
+ approach fully reliable machines.
+
+
+ comment: 16 pages, 16 figures, 12 tables, Project page: + https://github.com/MaikWischow/Noise-Source-Estimation +
+
+
+
+
+ + ☆ Learning Transferable Negative Prompts for Out-of-Distribution Detection CVPR 2024 + + +
+ Existing prompt learning methods have shown certain capabilities in +Out-of-Distribution (OOD) detection, but the lack of OOD images in the target +dataset in their training can lead to mismatches between OOD images and +In-Distribution (ID) categories, resulting in a high false positive rate. To +address this issue, we introduce a novel OOD detection method, named +'NegPrompt', to learn a set of negative prompts, each representing a negative +connotation of a given class label, for delineating the boundaries between ID +and OOD images. It learns such negative prompts with ID data only, without any +reliance on external outlier data. Further, current methods assume the +availability of samples of all ID classes, rendering them ineffective in +open-vocabulary learning scenarios where the inference stage can contain novel +ID classes not present during training. In contrast, our learned negative +prompts are transferable to novel class labels. Experiments on various ImageNet +benchmarks show that NegPrompt surpasses state-of-the-art prompt-learning-based +OOD detection methods and maintains a consistent lead in hard OOD detection in +closed- and open-vocabulary classification scenarios. Code is available at +https://github.com/mala-lab/negprompt. + +
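+
+ An illustrative scoring rule for prompt-based OOD detection: an image is scored
+ as OOD by the probability mass it assigns to learned negative prompts relative
+ to the ID class prompts. The temperature and tensor layout are assumptions, not
+ NegPrompt's exact formulation.
+
+     import torch
+     import torch.nn.functional as F
+
+     def ood_score(img_embed, pos_prompt_embeds, neg_prompt_embeds, temperature=0.01):
+         img = F.normalize(img_embed, dim=-1)            # (B, D) image embeddings
+         pos = F.normalize(pos_prompt_embeds, dim=-1)    # (C, D) one prompt per ID class
+         neg = F.normalize(neg_prompt_embeds, dim=-1)    # (M, D) learned negative prompts
+
+         sims = torch.cat([img @ pos.t(), img @ neg.t()], dim=-1) / temperature
+         probs = sims.softmax(dim=-1)
+         # probability mass assigned to negative prompts acts as the OOD score
+         return probs[..., pos.shape[0]:].sum(dim=-1)
+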
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Would Deep Generative Models Amplify Bias in Future Models? CVPR 2024 + + +
+ We investigate the impact of deep generative models on potential social +biases in upcoming computer vision models. As the internet witnesses an +increasing influx of AI-generated images, concerns arise regarding inherent +biases that may accompany them, potentially leading to the dissemination of +harmful content. This paper explores whether a detrimental feedback loop, +resulting in bias amplification, would occur if generated images were used as +the training data for future models. We conduct simulations by progressively +substituting original images in COCO and CC3M datasets with images generated +through Stable Diffusion. The modified datasets are used to train OpenCLIP and +image captioning models, which we evaluate in terms of quality and bias. +Contrary to expectations, our findings indicate that introducing generated +images during training does not uniformly amplify bias. Instead, instances of +bias mitigation across specific tasks are observed. We further explore the +factors that may influence these phenomena, such as artifacts in image +generation (e.g., blurry faces) or pre-existing biases in the original +datasets. + +
+
+ comment: This paper has been accepted to CVPR 2024 +
+
+
+
+
+ + ☆ FACTUAL: A Novel Framework for Contrastive Learning Based Robust SAR + Image Classification + + +
+ Deep Learning (DL) Models for Synthetic Aperture Radar (SAR) Automatic Target +Recognition (ATR), while delivering improved performance, have been shown to be +quite vulnerable to adversarial attacks. Existing works improve robustness by +training models on adversarial samples. However, by focusing mostly on attacks +that manipulate images randomly, they neglect the real-world feasibility of +such attacks. In this paper, we propose FACTUAL, a novel Contrastive Learning +framework for Adversarial Training and robust SAR classification. FACTUAL +consists of two components: (1) Differing from existing works, a novel +perturbation scheme that incorporates realistic physical adversarial attacks +(such as OTSA) to build a supervised adversarial pre-training network. This +network utilizes class labels for clustering clean and perturbed images +together into a more informative feature space. (2) A linear classifier +cascaded after the encoder to use the computed representations to predict the +target labels. By pre-training and fine-tuning our model on both clean and +adversarial samples, we show that our model achieves high prediction accuracy +on both cases. Our model achieves 99.7% accuracy on clean samples, and 89.6% on +perturbed samples, both outperforming previous state-of-the-art methods. + +
+
+ comment: 2024 IEEE Radar Conference +
+
+
+
+
+ + ☆ iSeg: Interactive 3D Segmentation via Interactive Attention + + +
+ We present iSeg, a new interactive technique for segmenting 3D shapes. +Previous works have focused mainly on leveraging pre-trained 2D foundation +models for 3D segmentation based on text. However, text may be insufficient for +accurately describing fine-grained spatial segmentations. Moreover, achieving a +consistent 3D segmentation using a 2D model is challenging since occluded areas +of the same semantic region may not be visible together from any 2D view. Thus, +we design a segmentation method conditioned on fine user clicks, which operates +entirely in 3D. Our system accepts user clicks directly on the shape's surface, +indicating the inclusion or exclusion of regions from the desired shape +partition. To accommodate various click settings, we propose a novel +interactive attention module capable of processing different numbers and types +of clicks, enabling the training of a single unified interactive segmentation +model. We apply iSeg to a myriad of shapes from different domains, +demonstrating its versatility and faithfulness to the user's specifications. +Our project page is at https://threedle.github.io/iSeg/. + +
+
+ comment: Project page: https://threedle.github.io/iSeg/ +
+
+
+
+
+ + ☆ LeGrad: An Explainability Method for Vision Transformers via Feature + Formation Sensitivity + + +
+ Vision Transformers (ViTs), with their ability to model long-range dependencies
+ through self-attention mechanisms, have become a standard architecture in
+ computer vision. However, the interpretability of these models remains a
+ challenge. To address this, we propose LeGrad, an explainability method
+ specifically designed for ViTs. LeGrad computes the gradient with respect to
+ the attention maps of ViT layers, considering the gradient itself as the
+ explainability signal. We aggregate the signal over all layers, combining the
+ activations of the last as well as intermediate tokens to produce the merged
+ explainability map. This makes LeGrad a conceptually simple and
+ easy-to-implement tool for enhancing the transparency of ViTs. We evaluate
+ LeGrad in challenging segmentation, perturbation, and open-vocabulary settings,
+ showcasing its versatility compared to other state-of-the-art explainability
+ methods and demonstrating its superior spatial fidelity and robustness to
+ perturbations. A demo and the code are available at
+ https://github.com/WalBouss/LeGrad.
+
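+
+ A simplified sketch of the gradient-of-attention idea: take the gradient of a
+ class score with respect to each layer's attention map and aggregate the
+ positive part over heads and layers. The assumed model_forward signature (it
+ returns the logits and a list of attention maps kept in the autograd graph) is
+ an illustration; see the official repository for the actual implementation.
+
+     import torch
+
+     def gradient_attention_map(model_forward, image, class_idx):
+         logits, attns = model_forward(image)             # attns: list of (B, heads, N, N)
+         score = logits[:, class_idx].sum()
+         grads = torch.autograd.grad(score, attns)
+         # keep only positive gradients, average heads, then aggregate layers
+         maps = [g.clamp(min=0).mean(dim=1) for g in grads]       # each (B, N, N)
+         merged = torch.stack(maps, dim=0).mean(dim=0)
+         return merged[:, 0, 1:]                           # CLS-to-patch relevances
+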
+
+ comment: Code available at https://github.com/WalBouss/LeGrad +
+
+
+
+
+ + ☆ HDR Imaging for Dynamic Scenes with Events + + +
+ High dynamic range imaging (HDRI) for real-world dynamic scenes is +challenging because moving objects may lead to hybrid degradation of low +dynamic range and motion blur. Existing event-based approaches only focus on a +separate task, while cascading HDRI and motion deblurring would lead to +sub-optimal solutions, and unavailable ground-truth sharp HDR images aggravate +the predicament. To address these challenges, we propose an Event-based HDRI +framework within a Self-supervised learning paradigm, i.e., Self-EHDRI, which +generalizes HDRI performance in real-world dynamic scenarios. Specifically, a +self-supervised learning strategy is carried out by learning cross-domain +conversions from blurry LDR images to sharp LDR images, which enables sharp HDR +images to be accessible in the intermediate process even though ground-truth +sharp HDR images are missing. Then, we formulate the event-based HDRI and +motion deblurring model and conduct a unified network to recover the +intermediate sharp HDR results, where both the high dynamic range and high +temporal resolution of events are leveraged simultaneously for compensation. We +construct large-scale synthetic and real-world datasets to evaluate the +effectiveness of our method. Comprehensive experiments demonstrate that the +proposed Self-EHDRI outperforms state-of-the-art approaches by a large margin. +The codes, datasets, and results are available at +https://lxp-whu.github.io/Self-EHDRI. + +
+
+
+
+
+ + ☆ OmniGS: Omnidirectional Gaussian Splatting for Fast Radiance Field + Reconstruction using Omnidirectional Images IROS 2024 + + +
+ Photorealistic reconstruction relying on 3D Gaussian Splatting has shown +promising potential in robotics. However, the current 3D Gaussian Splatting +system only supports radiance field reconstruction using undistorted +perspective images. In this paper, we present OmniGS, a novel omnidirectional +Gaussian splatting system, to take advantage of omnidirectional images for fast +radiance field reconstruction. Specifically, we conduct a theoretical analysis +of spherical camera model derivatives in 3D Gaussian Splatting. According to +the derivatives, we then implement a new GPU-accelerated omnidirectional +rasterizer that directly splats 3D Gaussians onto the equirectangular screen +space for omnidirectional image rendering. As a result, we realize +differentiable optimization of the radiance field without the requirement of +cube-map rectification or tangent-plane approximation. Extensive experiments +conducted in egocentric and roaming scenarios demonstrate that our method +achieves state-of-the-art reconstruction quality and high rendering speed using +omnidirectional images. To benefit the research community, the code will be +made publicly available once the paper is published. + +
+
+ comment: IROS 2024 submission, 7 pages, 4 figures +
+
+
+
+
+ + ☆ Future-Proofing Class Incremental Learning + + +
+ Exemplar-Free Class Incremental Learning is a highly challenging setting where
+ replay memory is unavailable. Methods relying on frozen feature extractors have
+ drawn attention recently in this setting due to their impressive performance
+ and lower computational costs. However, those methods are highly dependent on
+ the data used to train the feature extractor and may struggle when an
+ insufficient number of classes is available during the first incremental step.
+ To overcome this limitation, we propose to use a pre-trained text-to-image
+ diffusion model to generate synthetic images of future classes and use them to
+ train the feature extractor. Experiments on the standard benchmarks CIFAR100
+ and ImageNet-Subset demonstrate that our proposed method can be used to improve
+ state-of-the-art methods for exemplar-free class incremental learning,
+ especially in the most difficult settings where the first incremental step
+ contains only a few classes. Moreover, we show that using synthetic samples of
+ future classes achieves higher performance than using real data from different
+ classes, paving the way for better and less costly pre-training methods for
+ incremental learning.
+
+
+
+
+
+ + ☆ CORP: A Multi-Modal Dataset for Campus-Oriented Roadside Perception + Tasks + + +
+ Numerous roadside perception datasets have been introduced to propel
+ advancements in autonomous driving and intelligent transportation systems
+ research and development. However, the majority of them concentrate on urban
+ arterial roads, inadvertently overlooking residential areas such as parks and
+ campuses that exhibit entirely distinct characteristics. In light of this gap,
+ we propose CORP, the first public benchmark dataset tailored for multi-modal
+ roadside perception tasks under campus scenarios. Collected on a university
+ campus, CORP consists of over 205k images plus 102k point clouds captured from
+ 18 cameras and 9 LiDAR sensors. These sensors with different configurations are
+ mounted on roadside utility poles to provide diverse viewpoints within the
+ campus region. The annotations of CORP encompass multi-dimensional information
+ beyond 2D and 3D bounding boxes, providing extra support for 3D seamless
+ tracking and instance segmentation with unique IDs and pixel masks for
+ identifying targets, to enhance the understanding of objects and their
+ behaviors distributed across the campus premises. Unlike other roadside
+ datasets about urban traffic, CORP extends the spectrum to highlight the
+ challenges for multi-modal perception in campuses and other residential areas.
+
+
+
+
+
+ + ☆ Adaptive Discrete Disparity Volume for Self-supervised Monocular Depth + Estimation + + +
+ In self-supervised monocular depth estimation tasks, discrete disparity +prediction has been proven to attain higher quality depth maps than common +continuous methods. However, current discretization strategies often divide +depth ranges of scenes into bins in a handcrafted and rigid manner, limiting +model performance. In this paper, we propose a learnable module, Adaptive +Discrete Disparity Volume (ADDV), which is capable of dynamically sensing depth +distributions in different RGB images and generating adaptive bins for them. +Without any extra supervision, this module can be integrated into existing CNN +architectures, allowing networks to produce representative values for bins and +a probability volume over them. Furthermore, we introduce novel training +strategies - uniformizing and sharpening - through a loss term and temperature +parameter, respectively, to provide regularizations under self-supervised +conditions, preventing model degradation or collapse. Empirical results +demonstrate that ADDV effectively processes global information, generating +appropriate bins for various scenes and producing higher quality depth maps +compared to handcrafted methods. + +
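+
+ A small sketch of recovering depth from a learned probability volume over
+ adaptive bins: the network predicts bin centers per image and a per-pixel
+ distribution over them, and depth is their expectation. The shapes are
+ assumptions, not the ADDV head design.
+
+     import torch
+
+     def depth_from_bins(bin_centers, prob_volume):
+         # bin_centers: (B, K) adaptive depth/disparity values per image
+         # prob_volume: (B, K, H, W) softmax probabilities over the K bins
+         return torch.einsum("bk,bkhw->bhw", bin_centers, prob_volume)
+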
+
+
+
+
+ + ☆ Classification of Nasopharyngeal Cases using DenseNet Deep Learning + Architecture + + +
+ Nasopharyngeal carcinoma (NPC) is one of the understudied yet deadliest cancers
+ in South East Asia. In Malaysia, the prevalence is identified mainly in
+ Sarawak, among the Bidayuh ethnic group. NPC is often diagnosed late because it
+ is asymptomatic at the early stage. There are several tissue representations
+ from the nasopharynx biopsy, such as nasopharyngeal inflammation (NPI),
+ lymphoid hyperplasia (LHP), nasopharyngeal carcinoma (NPC) and normal tissue.
+ This paper is our first initiative to identify the differences between NPC, NPI
+ and normal cases. Seven whole slide images (WSIs) with gigapixel resolutions
+ from seven different patients and two hospitals were experimented with using
+ two test setups, each consisting of a different set of images. The tissue
+ regions are patched into smaller blocks and classified using a DenseNet
+ architecture with 21 dense layers. Two tests are carried out: a proof of
+ concept (Test 1) and a real-test scenario (Test 2). The accuracy achieved for
+ the NPC class is 94.8% for Test 1 and 67.0% for Test 2.
+
+
+ comment: This article has been accepted in the Journal of Engineering Science + and Technology (JESTEC) and awaiting publication +
+
+
+
+
+ + ☆ AGL-NET: Aerial-Ground Cross-Modal Global Localization with Varying + Scales + + +
+ We present AGL-NET, a novel learning-based method for global localization using
+ LiDAR point clouds and satellite maps. AGL-NET tackles two critical challenges:
+ bridging the representation gap between the image and point-cloud modalities
+ for robust feature matching, and handling inherent scale discrepancies between
+ the global and local views. To address these challenges, AGL-NET leverages a
+ unified network architecture with a novel two-stage matching design. The first
+ stage extracts informative neural features directly from raw sensor data and
+ performs initial feature matching. The second stage refines this matching
+ process by extracting informative skeleton features and incorporating a novel
+ scale alignment step to rectify scale variations between LiDAR and map data.
+ Furthermore, a novel scale and skeleton loss function guides the network toward
+ learning scale-invariant feature representations, eliminating the need for
+ pre-processing satellite maps. This significantly improves real-world
+ applicability in scenarios with unknown map scales. To facilitate rigorous
+ performance evaluation, we introduce a meticulously designed dataset within the
+ CARLA simulator, specifically tailored for metric localization training and
+ assessment. The code and dataset will be made publicly available.
+
+
+
+
+
+ + ☆ BodyMAP -- Jointly Predicting Body Mesh and 3D Applied Pressure Map for + People in Bed CVPR 2024 + + +
+ Accurately predicting the 3D human posture and the pressure exerted on the +body for people resting in bed, visualized as a body mesh (3D pose & shape) +with a 3D pressure map, holds significant promise for healthcare applications, +particularly, in the prevention of pressure ulcers. Current methods focus on +singular facets of the problem -- predicting only 2D/3D poses, generating 2D +pressure images, predicting pressure only for certain body regions instead of +the full body, or forming indirect approximations to the 3D pressure map. In +contrast, we introduce BodyMAP, which jointly predicts the human body mesh and +3D applied pressure map across the entire human body. Our network leverages +multiple visual modalities, incorporating both a depth image of a person in bed +and its corresponding 2D pressure image acquired from a pressure-sensing +mattress. The 3D pressure map is represented as a pressure value at each mesh +vertex and thus allows for precise localization of high-pressure regions on the +body. Additionally, we present BodyMAP-WS, a new formulation of pressure +prediction in which we implicitly learn pressure in 3D by aligning sensed 2D +pressure images with a differentiable 2D projection of the predicted 3D +pressure maps. In evaluations with real-world human data, our method +outperforms the current state-of-the-art technique by 25% on both body mesh and +3D applied pressure map prediction tasks for people in bed. + +
+
+ comment: Accepted at CVPR 2024 Project Website: https://bodymap3d.github.io/ + Code: https://github.com/RCHI-Lab/BodyMAP +
+
+
+
+
+ + ☆ MonoCD: Monocular 3D Object Detection with Complementary Depths CVPR 2024 + + +
+ Monocular 3D object detection has attracted widespread attention due to its +potential to accurately obtain object 3D localization from a single image at a +low cost. Depth estimation is an essential but challenging subtask of monocular +3D object detection due to the ill-posedness of 2D to 3D mapping. Many methods +explore multiple local depth clues such as object heights and keypoints and +then formulate the object depth estimation as an ensemble of multiple depth +predictions to mitigate the insufficiency of single-depth information. However, +the errors of existing multiple depths tend to have the same sign, which +hinders them from neutralizing each other and limits the overall accuracy of +combined depth. To alleviate this problem, we propose to increase the +complementarity of depths with two novel designs. First, we add a new depth +prediction branch named complementary depth that utilizes global and efficient +depth clues from the entire image rather than the local clues to reduce the +correlation of depth predictions. Second, we propose to fully exploit the +geometric relations between multiple depth clues to achieve complementarity in +form. Benefiting from these designs, our method achieves higher +complementarity. Experiments on the KITTI benchmark demonstrate that our method +achieves state-of-the-art performance without introducing extra data. In +addition, complementary depth can also be a lightweight and plug-and-play +module to boost multiple existing monocular 3d object detectors. Code is +available at https://github.com/elvintanhust/MonoCD. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ UniAV: Unified Audio-Visual Perception for Multi-Task Video Localization + + +
+ Video localization tasks aim to temporally locate specific instances in +videos, including temporal action localization (TAL), sound event detection +(SED) and audio-visual event localization (AVEL). Existing methods +over-specialize on each task, overlooking the fact that these instances often +occur in the same video to form the complete video content. In this work, we +present UniAV, a Unified Audio-Visual perception network, to achieve joint +learning of TAL, SED and AVEL tasks for the first time. UniAV can leverage +diverse data available in task-specific datasets, allowing the model to learn +and share mutually beneficial knowledge across tasks and modalities. To tackle +the challenges posed by substantial variations in datasets +(size/domain/duration) and distinct task characteristics, we propose to +uniformly encode visual and audio modalities of all videos to derive generic +representations, while also designing task-specific experts to capture unique +knowledge for each task. Besides, we develop a unified language-aware +classifier by utilizing a pre-trained text encoder, enabling the model to +flexibly detect various types of instances and previously unseen ones by simply +changing prompts during inference. UniAV outperforms its single-task +counterparts by a large margin with fewer parameters, achieving on-par or +superior performances compared to state-of-the-art task-specific methods across +ActivityNet 1.3, DESED and UnAV-100 benchmarks. + +
+
+
+
+
+ + ☆ BioVL-QR: Egocentric Biochemical Video-and-Language Dataset Using Micro + QR Codes + + +
+ This paper introduces a biochemical vision-and-language dataset, which consists
+ of 24 egocentric experiment videos, corresponding protocols, and
+ video-and-language alignments. A key challenge in the wet-lab domain is that
+ detecting equipment, reagents, and containers is difficult because the lab
+ environment is cluttered with objects on the table and some objects are
+ visually indistinguishable. Therefore, previous studies assume that objects are
+ manually annotated and given for downstream tasks, but this is costly and
+ time-consuming. To address this issue, this study focuses on Micro QR Codes to
+ detect objects automatically. From our preliminary study, we found that
+ detecting objects using only Micro QR Codes is still difficult because the
+ researchers manipulate objects, frequently causing blur and occlusion. To
+ address this, we also propose a novel object labeling method that combines a
+ Micro QR Code detector with an off-the-shelf hand-object detector. As one
+ application of our dataset, we tackle the task of generating protocols from
+ experiment videos and find that our approach can generate accurate protocols.
+
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ HandDiff: 3D Hand Pose Estimation with Diffusion on Image-Point Cloud + + +
+ Extracting keypoint locations from input hand frames, known as 3D hand pose +estimation, is a critical task in various human-computer interaction +applications. Essentially, the 3D hand pose estimation can be regarded as a 3D +point subset generative problem conditioned on input frames. Thanks to the +recent significant progress on diffusion-based generative models, hand pose +estimation can also benefit from the diffusion model to estimate keypoint +locations with high quality. However, directly deploying the existing diffusion +models to solve hand pose estimation is non-trivial, since they cannot achieve +the complex permutation mapping and precise localization. Based on this +motivation, this paper proposes HandDiff, a diffusion-based hand pose +estimation model that iteratively denoises accurate hand pose conditioned on +hand-shaped image-point clouds. In order to recover keypoint permutation and +accurate location, we further introduce joint-wise condition and local detail +condition. Experimental results demonstrate that the proposed HandDiff +significantly outperforms the existing approaches on four challenging hand pose +benchmark datasets. Codes and pre-trained models are publicly available at +https://github.com/cwc1260/HandDiff. + +
+
+ comment: Accepted as a conference paper to the Conference on Computer Vision + and Pattern Recognition (2024) +
+
+
+
+
+ + ☆ DreamWalk: Style Space Exploration using Diffusion Guidance + + +
+ Text-conditioned diffusion models can generate impressive images, but fall
+ short when it comes to fine-grained control. Unlike direct-editing tools like
+ Photoshop, text-conditioned models require the artist to perform "prompt
+ engineering," constructing special text sentences to control the style or
+ amount of a particular subject present in the output image. Our goal is to
+ provide fine-grained control over the style and substance specified by the
+ prompt, for example to adjust the intensity of styles in different regions of
+ the image (Figure 1). Our approach is to decompose the text prompt into
+ conceptual elements, and apply a separate guidance term for each element in a
+ single diffusion process. We introduce guidance scale functions to control
+ \emph{when} in the diffusion process and \emph{where} in the image to
+ intervene. Since the method is based solely on adjusting diffusion guidance, it
+ does not require fine-tuning or manipulating the internal layers of the
+ diffusion model's neural network, and can be used in conjunction with LoRA- or
+ DreamBooth-trained models (Figure 2). Project page:
+ https://mshu1.github.io/dreamwalk.github.io/
+
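+
+ A sketch of composed, per-element guidance: each prompt element contributes its
+ own guidance term, weighted by a scale map that may depend on the timestep and
+ image region. The scale-map form is an illustrative assumption rather than the
+ paper's exact scale functions.
+
+     import torch
+
+     def composed_guidance(eps_uncond, eps_conds, scales):
+         # eps_uncond: (B, C, H, W) unconditional noise prediction
+         # eps_conds:  list of (B, C, H, W) predictions, one per prompt element
+         # scales:     list of (B, 1, H, W) guidance-scale maps (may also depend on t)
+         eps = eps_uncond.clone()
+         for eps_c, w in zip(eps_conds, scales):
+             eps = eps + w * (eps_c - eps_uncond)
+         return eps
+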
+
+
+
+
+ + ☆ Diverse and Tailored Image Generation for Zero-shot Multi-label + Classification + + +
+ Recently, zero-shot multi-label classification has garnered considerable +attention for its capacity to operate predictions on unseen labels without +human annotations. Nevertheless, prevailing approaches often use seen classes +as imperfect proxies for unseen ones, resulting in suboptimal performance. +Drawing inspiration from the success of text-to-image generation models in +producing realistic images, we propose an innovative solution: generating +synthetic data to construct a training set explicitly tailored for proxyless +training on unseen labels. Our approach introduces a novel image generation +framework that produces multi-label synthetic images of unseen classes for +classifier training. To enhance diversity in the generated images, we leverage +a pre-trained large language model to generate diverse prompts. Employing a +pre-trained multi-modal CLIP model as a discriminator, we assess whether the +generated images accurately represent the target classes. This enables +automatic filtering of inaccurately generated images, preserving classifier +accuracy. To refine text prompts for more precise and effective multi-label +object generation, we introduce a CLIP score-based discriminative loss to +fine-tune the text encoder in the diffusion model. Additionally, to enhance +visual features on the target task while maintaining the generalization of +original features and mitigating catastrophic forgetting resulting from +fine-tuning the entire visual encoder, we propose a feature fusion module +inspired by transformer attention mechanisms. This module aids in capturing +global dependencies between multiple objects more effectively. Extensive +experimental results validate the effectiveness of our approach, demonstrating +significant improvements over state-of-the-art methods. + +
+
+
+
+
+ + ☆ Discontinuity-preserving Normal Integration with Auxiliary Edges CVPR 2024 + + +
+ Many surface reconstruction methods incorporate normal integration, which is +a process to obtain a depth map from surface gradients. In this process, the +input may represent a surface with discontinuities, e.g., due to +self-occlusion. To reconstruct an accurate depth map from the input normal map, +hidden surface gradients occurring from the jumps must be handled. To model +these jumps correctly, we design a novel discretization scheme for the domain +of normal integration. Our key idea is to introduce auxiliary edges, which +bridge between piecewise-smooth patches in the domain so that the magnitude of +hidden jumps can be explicitly expressed. Using the auxiliary edges, we design +a novel algorithm to optimize the discontinuity and the depth map from the +input normal map. Our method optimizes discontinuities by using a combination +of iterative re-weighted least squares and iterative filtering of the jump +magnitudes on auxiliary edges to provide strong sparsity regularization. +Compared to previous discontinuity-preserving normal integration methods, which +model the magnitudes of jumps only implicitly, our method reconstructs subtle +discontinuities accurately thanks to our explicit representation of jumps +allowing for strong sparsity regularization. + +
+
+ comment: To appear at CVPR 2024. For supplementary video, see + https://youtu.be/MTTcW5kAOFE +
+
+
+
+
+ + ☆ GaSpCT: Gaussian Splatting for Novel CT Projection View Synthesis MICCAI 2024 + + +
+ We present GaSpCT, a novel view synthesis and 3D scene representation method
+ used to generate novel projection views for Computed Tomography (CT) scans. We
+ adapt the Gaussian Splatting framework to enable novel view synthesis in CT
+ based on limited sets of 2D image projections and without the need for
+ Structure from Motion (SfM) methodologies. Therefore, we reduce the total
+ scanning duration and the amount of radiation dose the patient receives during
+ the scan. We adapted the loss function to our use case by encouraging a
+ stronger background and foreground distinction using two sparsity-promoting
+ regularizers: a beta loss and a total variation (TV) loss. Finally, we
+ initialize the Gaussian locations across the 3D space using a uniform prior
+ over the region where the brain is expected to be positioned within the field
+ of view. We evaluate the performance of our model using brain CT scans from the
+ Parkinson's Progression Markers Initiative (PPMI) dataset and demonstrate that
+ the rendered novel views closely match the original projection views of the
+ simulated scan, with better performance than other implicit 3D scene
+ representation methodologies. Furthermore, we empirically observe reduced
+ training time compared to neural-network-based image synthesis for sparse-view
+ CT image reconstruction. Finally, the memory requirements of the Gaussian
+ Splatting representations are reduced by 17% compared to the equivalent voxel
+ grid image representations.
+
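+
+ A hedged sketch of the two sparsity-promoting regularizers mentioned above: a
+ total-variation term on the rendered projection and a Beta(0.5, 0.5)-style
+ term that pushes opacities toward 0 or 1. The weights, clamping, and exact form
+ are assumptions, not the GaSpCT loss.
+
+     import torch
+
+     def tv_loss(img):                                  # img: (B, 1, H, W)
+         dh = (img[..., 1:, :] - img[..., :-1, :]).abs().mean()
+         dw = (img[..., :, 1:] - img[..., :, :-1]).abs().mean()
+         return dh + dw
+
+     def beta_loss(opacity, eps=1e-4):                  # opacity in (0, 1)
+         # Beta(0.5, 0.5)-style negative log-prior; clamping avoids the singularities
+         o = opacity.clamp(eps, 1.0 - eps)
+         return 0.5 * (torch.log(o) + torch.log(1.0 - o)).mean()
+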
+
+ comment: Under Review Process for MICCAI 2024 +
+
+
+
+
+ + ☆ PARIS3D: Reasoning-based 3D Part Segmentation Using Large Multimodal + Model + + +
+ Recent advancements in 3D perception systems have significantly improved +their ability to perform visual recognition tasks such as segmentation. +However, these systems still heavily rely on explicit human instruction to +identify target objects or categories, lacking the capability to actively +reason and comprehend implicit user intentions. We introduce a novel +segmentation task known as reasoning part segmentation for 3D objects, aiming +to output a segmentation mask based on complex and implicit textual queries +about specific parts of a 3D object. To facilitate evaluation and benchmarking, +we present a large 3D dataset comprising over 60k instructions paired with +corresponding ground-truth part segmentation annotations specifically curated +for reasoning-based 3D part segmentation. We propose a model that is capable of +segmenting parts of 3D objects based on implicit textual queries and generating +natural language explanations corresponding to 3D object segmentation requests. +Experiments show that our method achieves competitive performance to models +that use explicit queries, with the additional abilities to identify part +concepts, reason about them, and complement them with world knowledge. Our +source code, dataset, and trained models are available at +https://github.com/AmrinKareem/PARIS3D. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ SleepVST: Sleep Staging from Near-Infrared Video Signals using + Pre-Trained Transformers CVPR 2024 + + +
+ Advances in camera-based physiological monitoring have enabled the robust, +non-contact measurement of respiration and the cardiac pulse, which are known +to be indicative of the sleep stage. This has led to research into camera-based +sleep monitoring as a promising alternative to "gold-standard" polysomnography, +which is cumbersome, expensive to administer, and hence unsuitable for +longer-term clinical studies. In this paper, we introduce SleepVST, a +transformer model which enables state-of-the-art performance in camera-based +sleep stage classification (sleep staging). After pre-training on contact +sensor data, SleepVST outperforms existing methods for cardio-respiratory sleep +staging on the SHHS and MESA datasets, achieving total Cohen's kappa scores of +0.75 and 0.77 respectively. We then show that SleepVST can be successfully +transferred to cardio-respiratory waveforms extracted from video, enabling +fully contact-free sleep staging. Using a video dataset of 50 nights, we +achieve a total accuracy of 78.8\% and a Cohen's $\kappa$ of 0.71 in four-class +video-based sleep staging, setting a new state-of-the-art in the domain. + +
+
+ comment: CVPR 2024 Highlight Paper +
+
+
+
+
+ + ☆ Effective Lymph Nodes Detection in CT Scans Using Location Debiased + Query Selection and Contrastive Query Representation in Transformer + + +
+ Lymph node (LN) assessment is a critical, indispensable yet very challenging
+ task in the routine clinical workflow of radiology and oncology. Accurate LN
+ analysis is essential for cancer diagnosis, staging, and treatment planning.
+ Finding scattered, low-contrast, clinically relevant LNs in 3D CT is difficult
+ even for experienced physicians under high inter-observer variation. Previous
+ automatic LN detection works typically yield limited recall and high false
+ positives (FPs) due to adjacent anatomies with similar image intensities,
+ shapes, or textures (vessels, muscles, esophagus, etc.). In this work, we
+ propose a new LN DEtection TRansformer, named LN-DETR, to achieve more accurate
+ performance. We enhance the 2D backbone with a multi-scale 2.5D feature fusion
+ to incorporate 3D context explicitly and, more importantly, make two main
+ contributions to improve the representation quality of LN queries. 1)
+ Considering that LN boundaries are often unclear, an IoU prediction head and a
+ location-debiased query selection are proposed to select LN queries of higher
+ localization accuracy as the decoder queries' initialization. 2) To reduce FPs,
+ query contrastive learning is employed to explicitly reinforce LN queries
+ towards their best-matched ground-truth queries over unmatched query
+ predictions. Trained and tested on 3D CT scans of 1067 patients (with 10,000+
+ labeled LNs) by combining seven LN datasets from different body parts (neck,
+ chest, and abdomen) and pathologies/cancers, our method significantly improves
+ the performance of previous leading methods by > 4-5% average recall at the
+ same FP rates in both internal and external testing. We further evaluate on the
+ universal lesion detection task using the NIH DeepLesion benchmark, and our
+ method achieves the top performance of 88.46% averaged recall across 0.5 to 4
+ FPs per image, compared with other leading reported results.
+
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Language-Guided Instance-Aware Domain-Adaptive Panoptic Segmentation + + +
+ The increasing relevance of panoptic segmentation is tied to the advancements +in autonomous driving and AR/VR applications. However, the deployment of such +models has been limited due to the expensive nature of dense data annotation, +giving rise to unsupervised domain adaptation (UDA). A key challenge in +panoptic UDA is reducing the domain gap between a labeled source and an +unlabeled target domain while harmonizing the subtasks of semantic and instance +segmentation to limit catastrophic interference. While considerable progress +has been achieved, existing approaches mainly focus on the adaptation of +semantic segmentation. In this work, we focus on incorporating instance-level +adaptation via a novel instance-aware cross-domain mixing strategy IMix. IMix +significantly enhances the panoptic quality by improving instance segmentation +performance. Specifically, we propose inserting high-confidence predicted +instances from the target domain onto source images, retaining the +exhaustiveness of the resulting pseudo-labels while reducing the injected +confirmation bias. Nevertheless, such an enhancement comes at the cost of +degraded semantic performance, attributed to catastrophic forgetting. To +mitigate this issue, we regularize our semantic branch by employing CLIP-based +domain alignment (CDA), exploiting the domain-robustness of natural language +prompts. Finally, we present an end-to-end model incorporating these two +mechanisms called LIDAPS, achieving state-of-the-art results on all popular +panoptic UDA benchmarks. + +
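+
+ A toy sketch of an instance-aware mixing step in the spirit of IMix:
+ high-confidence predicted instances from a target image are pasted onto a
+ source image and its labels. The confidence threshold, mask format, and label
+ handling are assumptions, not the LIDAPS implementation.
+
+     import torch
+
+     def imix(src_img, src_sem, tgt_img, tgt_masks, tgt_labels, tgt_scores, thr=0.9):
+         # src_img/tgt_img: (C, H, W); src_sem: (H, W); tgt_masks: list of (H, W) bool-ish
+         mixed_img, mixed_sem = src_img.clone(), src_sem.clone()
+         for mask, label, score in zip(tgt_masks, tgt_labels, tgt_scores):
+             if score < thr:
+                 continue                                 # keep only confident instances
+             m = mask.bool()
+             mixed_img[:, m] = tgt_img[:, m]              # paste the instance pixels
+             mixed_sem[m] = label                         # overwrite the pseudo-labels
+         return mixed_img, mixed_sem
+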
+
+
+
+
+ + ☆ Quantifying Uncertainty in Motion Prediction with Variational Bayesian + Mixture CVPR 2024 + + +
+ Safety and robustness are crucial factors in developing trustworthy autonomous
+ vehicles. One essential aspect of addressing these factors is to equip vehicles
+ with the capability to predict future trajectories for all moving objects in
+ the surroundings and to quantify prediction uncertainties. In this paper, we
+ propose the Sequential Neural Variational Agent (SeNeVA), a generative model
+ that describes the distribution of future trajectories for a single moving
+ object. Our approach can distinguish Out-of-Distribution data while quantifying
+ uncertainty and achieving competitive performance compared to state-of-the-art
+ methods on the Argoverse 2 and INTERACTION datasets. Specifically, a minimum
+ Final Displacement Error of 0.446 meters, a minimum Average Displacement Error
+ of 0.203 meters, and a Miss Rate of 5.35% are achieved on the INTERACTION test
+ set. Extensive qualitative and quantitative analysis is also provided to
+ evaluate the proposed model. Our open-source code is available at
+ https://github.com/PurdueDigitalTwin/seneva.
+
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Layerwise Early Stopping for Test Time Adaptation + + +
+ Test Time Adaptation (TTA) addresses the problem of distribution shift by
+ enabling pretrained models to learn new features on an unseen domain at test
+ time. However, it poses a significant challenge to maintain a balance between
+ learning new features and retaining useful pretrained features. In this paper,
+ we propose Layerwise EArly STopping (LEAST) for TTA to address this problem.
+ The key idea is to stop adapting individual layers during TTA if the features
+ being learned do not appear beneficial for the new domain. For that purpose, we
+ propose a novel gradient-based metric to measure the relevance of the currently
+ learned features to the new domain without the need for supervised labels. More
+ specifically, we use this metric to determine dynamically when to stop updating
+ each layer during TTA. This enables a more balanced adaptation, restricted to
+ layers benefiting from it, and only for a certain number of steps. Such an
+ approach also has the added effect of limiting the forgetting of pretrained
+ features useful for dealing with new domains. Through extensive experiments, we
+ demonstrate that Layerwise Early Stopping improves the performance of existing
+ TTA approaches across multiple datasets, domain shifts, model architectures,
+ and TTA losses.
+
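+
+ A minimal sketch of layer-wise early stopping during TTA: a per-layer relevance
+ score derived from gradients decides whether that layer keeps updating. The
+ particular score below (an EMA of the gradient norm) is only an illustrative
+ proxy, not the paper's metric; names and thresholds are assumptions.
+
+     import torch
+
+     def update_layer_masks(model, relevance_ema, active, momentum=0.9, thr=1e-3):
+         for name, param in model.named_parameters():
+             if param.grad is None or not active.get(name, True):
+                 continue
+             score = param.grad.norm().item()             # proxy relevance signal
+             relevance_ema[name] = momentum * relevance_ema.get(name, score) \
+                 + (1 - momentum) * score
+             if relevance_ema[name] < thr:
+                 active[name] = False                     # stop adapting this parameter
+                 param.requires_grad_(False)
+         return relevance_ema, active
+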
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ Flattening the Parent Bias: Hierarchical Semantic Segmentation in the + Poincaré Ball + + +
+ Hierarchy is a natural representation of semantic taxonomies, including the +ones routinely used in image segmentation. Indeed, recent work on semantic +segmentation reports improved accuracy from supervised training leveraging +hierarchical label structures. Encouraged by these results, we revisit the +fundamental assumptions behind that work. We postulate and then empirically +verify that the reasons for the observed improvement in segmentation accuracy +may be entirely unrelated to the use of the semantic hierarchy. To demonstrate +this, we design a range of cross-domain experiments with a representative +hierarchical approach. We find that on the new testing domains, a flat +(non-hierarchical) segmentation network, in which the parents are inferred from +the children, has superior segmentation accuracy to the hierarchical approach +across the board. Complementing these findings and inspired by the intrinsic +properties of hyperbolic spaces, we study a more principled approach to +hierarchical segmentation using the Poincar\'e ball model. The hyperbolic +representation largely outperforms the previous (Euclidean) hierarchical +approach as well and is on par with our flat Euclidean baseline in terms of +segmentation accuracy. However, it additionally exhibits surprisingly strong +calibration quality of the parent nodes in the semantic hierarchy, especially +on the more challenging domains. Our combined analysis suggests that the +established practice of hierarchical segmentation may be limited to in-domain +settings, whereas flat classifiers generalize substantially better, especially +if they are modeled in the hyperbolic space. + +
+
+
+
+
+ + ☆ Data Science for Geographic Information Systems + + +
+ The integration of data science into Geographic Information Systems (GIS) has +facilitated the evolution of these tools into complete spatial analysis +platforms. The adoption of machine learning and big data techniques has +equipped these platforms with the capacity to handle larger amounts of +increasingly complex data, transcending the limitations of more traditional +approaches. This work traces the historical and technical evolution of data +science and GIS as fields of study, highlighting the critical points of +convergence between domains, and underlining the many sectors that rely on this +integration. A GIS application is presented as a case study in the disaster +management sector where we utilize aerial data from Tr\'oia, Portugal, to +emphasize the process of insight extraction from raw data. We conclude by +outlining prospects for future research in integration of these fields in +general, and the developed application in particular. + +
+
+
+
+
+ + ☆ Test Time Training for Industrial Anomaly Segmentation CVPR + + +
+ Anomaly Detection and Segmentation (AD&S) is crucial for industrial quality +control. While existing methods excel in generating anomaly scores for each +pixel, practical applications require producing a binary segmentation to +identify anomalies. Due to the absence of labeled anomalies in many real +scenarios, standard practices binarize these maps based on some statistics +derived from a validation set containing only nominal samples, resulting in +poor segmentation performance. This paper addresses this problem by proposing a +test time training strategy to improve the segmentation performance. Indeed, at +test time, we can extract rich features directly from anomalous samples to +train a classifier that can discriminate defects effectively. Our general +approach can work downstream to any AD&S method that provides an anomaly score +map as output, even in multimodal settings. We demonstrate the effectiveness of +our approach over baselines through extensive experimentation and evaluation on +MVTec AD and MVTec 3D-AD. + +
+
+ comment: Accepted at VAND 2.0, CVPRW 2024 +
+
+
+
+
+ + ☆ SC4D: Sparse-Controlled Video-to-4D Generation and Motion Transfer + + +
+ Recent advances in 2D/3D generative models enable the generation of dynamic 3D
+ objects from a single-view video. Existing approaches utilize score
+ distillation sampling to form the dynamic scene as a dynamic NeRF or dense 3D
+ Gaussians. However, these methods struggle to strike a balance among reference
+ view alignment, spatio-temporal consistency, and motion fidelity under
+ single-view conditions due to the implicit nature of NeRF or the intricate
+ dense Gaussian motion prediction. To address these issues, this paper proposes
+ an efficient, sparse-controlled video-to-4D framework named SC4D that decouples
+ motion and appearance to achieve superior video-to-4D generation. Moreover, we
+ introduce Adaptive Gaussian (AG) initialization and a Gaussian Alignment (GA)
+ loss to mitigate the shape degeneration issue, ensuring the fidelity of the
+ learned motion and shape. Comprehensive experimental results demonstrate that
+ our method surpasses existing methods in both quality and efficiency. In
+ addition, facilitated by the disentangled modeling of motion and appearance of
+ SC4D, we devise a novel application that seamlessly transfers the learned
+ motion onto a diverse array of 4D entities according to textual descriptions.
+
+
+ comment: Project Page: https://sc4d.github.io/ +
+
+
+
+
+ + ☆ No "Zero-Shot" Without Exponential Data: Pretraining Concept Frequency + Determines Multimodal Model Performance ICLR'24 + + +
+ Web-crawled pretraining datasets underlie the impressive "zero-shot" +evaluation performance of multimodal models, such as CLIP for +classification/retrieval and Stable-Diffusion for image generation. However, it +is unclear how meaningful the notion of "zero-shot" generalization is for such +multimodal models, as it is not known to what extent their pretraining datasets +encompass the downstream concepts targeted for during "zero-shot" evaluation. +In this work, we ask: How is the performance of multimodal models on downstream +concepts influenced by the frequency of these concepts in their pretraining +datasets? We comprehensively investigate this question across 34 models and +five standard pretraining datasets (CC-3M, CC-12M, YFCC-15M, LAION-400M, +LAION-Aesthetics), generating over 300GB of data artifacts. We consistently +find that, far from exhibiting "zero-shot" generalization, multimodal models +require exponentially more data to achieve linear improvements in downstream +"zero-shot" performance, following a sample inefficient log-linear scaling +trend. This trend persists even when controlling for sample-level similarity +between pretraining and downstream datasets, and testing on purely synthetic +data distributions. Furthermore, upon benchmarking models on long-tailed data +sampled based on our analysis, we demonstrate that multimodal models across the +board perform poorly. We contribute this long-tail test set as the "Let it +Wag!" benchmark to further research in this direction. Taken together, our +study reveals an exponential need for training data which implies that the key +to "zero-shot" generalization capabilities under large-scale training paradigms +remains to be found. + +
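+
+ A small NumPy sketch of the reported log-linear relationship: fitting zero-shot
+ accuracy against the log of a concept's pretraining frequency. The numbers
+ below are placeholders, not the paper's measurements.
+
+     import numpy as np
+
+     freq = np.array([1e2, 1e3, 1e4, 1e5, 1e6])       # pretraining occurrences (hypothetical)
+     acc = np.array([0.12, 0.21, 0.30, 0.41, 0.50])   # downstream accuracy (hypothetical)
+
+     slope, intercept = np.polyfit(np.log10(freq), acc, deg=1)
+     # linear gains in accuracy require exponentially more data:
+     predicted = slope * np.log10(freq) + intercept
+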
+
+ comment: Extended version of the short paper accepted at DPFM, ICLR'24 +
+
+
+
+
+ + ☆ Explaining Explainability: Understanding Concept Activation Vectors + + +
+ Recent interpretability methods propose using concept-based explanations to +translate the internal representations of deep learning models into a language +that humans are familiar with: concepts. This requires understanding which +concepts are present in the representation space of a neural network. One +popular method for finding concepts is Concept Activation Vectors (CAVs), which +are learnt using a probe dataset of concept exemplars. In this work, we +investigate three properties of CAVs. CAVs may be: (1) inconsistent between +layers, (2) entangled with different concepts, and (3) spatially dependent. +Each property provides both challenges and opportunities in interpreting +models. We introduce tools designed to detect the presence of these properties, +provide insight into how they affect the derived explanations, and provide +recommendations to minimise their impact. Understanding these properties can be +used to our advantage. For example, we introduce spatially dependent CAVs to +test if a model is translation invariant with respect to a specific concept and +class. Our experiments are performed on ImageNet and a new synthetic dataset, +Elements. Elements is designed to capture a known ground truth relationship +between concepts and classes. We release this dataset to facilitate further +research in understanding and evaluating interpretability methods. + +
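+ For context, a CAV is typically obtained by fitting a linear probe that separates concept exemplars from random images in a layer's activation space; a hedged sketch (the probe type and hyperparameters are assumptions):
+ import numpy as np
+ from sklearn.linear_model import SGDClassifier
+
+ def compute_cav(concept_acts, random_acts):
+     """concept_acts, random_acts: (N, D) activations from one network layer."""
+     X = np.concatenate([concept_acts, random_acts])
+     y = np.concatenate([np.ones(len(concept_acts)), np.zeros(len(random_acts))])
+     probe = SGDClassifier(loss="hinge", alpha=1e-3).fit(X, y)
+     cav = probe.coef_.ravel()
+     return cav / np.linalg.norm(cav)   # unit-norm direction representing the concept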
+
+ comment: (54 pages, 39 figures) +
+
+
+
+
+ + ☆ Cross-Modality Gait Recognition: Bridging LiDAR and Camera Modalities + for Human Identification + + +
+ Current gait recognition research mainly focuses on identifying pedestrians +captured by the same type of sensor, neglecting the fact that individuals may +be captured by different sensors in order to adapt to various environments. A +more practical approach should involve cross-modality matching across different +sensors. Hence, this paper focuses on investigating the problem of +cross-modality gait recognition, with the objective of accurately identifying +pedestrians across diverse vision sensors. We present CrossGait inspired by the +feature alignment strategy, capable of cross retrieving diverse data +modalities. Specifically, we investigate the cross-modality recognition task by +initially extracting features within each modality and subsequently aligning +these features across modalities. To further enhance the cross-modality +performance, we propose a Prototypical Modality-shared Attention Module that +learns modality-shared features from two modality-specific features. +Additionally, we design a Cross-modality Feature Adapter that transforms the +learned modality-specific features into a unified feature space. Extensive +experiments conducted on the SUSTech1K dataset demonstrate the effectiveness of +CrossGait: (1) it exhibits promising cross-modality ability in retrieving +pedestrians across various modalities from different sensors in diverse scenes, +and (2) CrossGait not only learns modality-shared features for cross-modality +gait recognition but also maintains modality-specific features for +single-modality recognition. + +
+
+
+
+
+ + ☆ Mitigating analytical variability in fMRI results with style transfer + + +
+ We propose a novel approach to improve the reproducibility of neuroimaging results by converting statistic maps across different functional MRI pipelines. We make the assumption that pipelines can be considered as a style component of the data and propose to use generative models, in particular Diffusion Models (DMs), to convert data between pipelines. We design a new DM-based unsupervised multi-domain image-to-image transition framework and constrain the generation of 3D fMRI statistic maps using the latent space of an auxiliary classifier that distinguishes statistic maps from different pipelines. We extend traditional sampling techniques used in DMs to improve the transition performance. Our experiments demonstrate that our proposed methods are successful: pipelines can indeed be transferred, providing an important source of data augmentation for future medical studies.
+
+
+
+
+ + ♻ ☆ $CrowdDiff$: Multi-hypothesis Crowd Density Estimation using Diffusion + Models CVPR'24 + + +
+ Crowd counting is a fundamental problem in crowd analysis which is typically +accomplished by estimating a crowd density map and summing over the density +values. However, this approach suffers from background noise accumulation and +loss of density due to the use of broad Gaussian kernels to create the ground +truth density maps. This issue can be overcome by narrowing the Gaussian +kernel. However, existing approaches perform poorly when trained with ground +truth density maps with broad kernels. To deal with this limitation, we propose +using conditional diffusion models to predict density maps, as diffusion models +show high fidelity to training data during generation. With that, we present +$CrowdDiff$ that generates the crowd density map as a reverse diffusion +process. Furthermore, as the intermediate time steps of the diffusion process +are noisy, we incorporate a regression branch for direct crowd estimation only +during training to improve the feature learning. In addition, owing to the +stochastic nature of the diffusion model, we introduce producing multiple +density maps to improve the counting performance contrary to the existing crowd +counting pipelines. We conduct extensive experiments on publicly available +datasets to validate the effectiveness of our method. $CrowdDiff$ outperforms +existing state-of-the-art crowd counting methods on several public crowd +analysis benchmarks with significant improvements. + +
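+ The counting-by-density convention, plus a naive way to fuse several sampled density maps (the actual CrowdDiff fusion strategy may differ), can be sketched as:
+ import numpy as np
+
+ def crowd_count(density_maps):
+     """density_maps: (S, H, W) maps sampled from a stochastic generator.
+     The crowd count is the sum of a density map; averaging the S samples
+     is one simple multi-hypothesis fusion that reduces variance."""
+     fused = np.mean(density_maps, axis=0)
+     return float(fused.sum()), fused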
+
+ comment: Accepted at CVPR'24. The project is available at + https://dylran.github.io/crowddiff.github.io +
+
+
+
+
+ + ♻ ☆ Expressive Forecasting of 3D Whole-body Human Motions AAAI24 + + +
+ Human motion forecasting, with the goal of estimating future human behavior +over a period of time, is a fundamental task in many real-world applications. +However, existing works typically concentrate on predicting the major joints of +the human body without considering the delicate movements of the human hands. +In practical applications, hand gesture plays an important role in human +communication with the real world, and expresses the primary intention of human +beings. In this work, we are the first to formulate a whole-body human pose +forecasting task, which jointly predicts the future body and hand activities. +Correspondingly, we propose a novel Encoding-Alignment-Interaction (EAI) +framework that aims to predict both coarse (body joints) and fine-grained +(gestures) activities collaboratively, enabling expressive and +cross-facilitated forecasting of 3D whole-body human motions. Specifically, our +model involves two key constituents: cross-context alignment (XCA) and +cross-context interaction (XCI). Considering the heterogeneous information +within the whole-body, XCA aims to align the latent features of various human +components, while XCI focuses on effectively capturing the context interaction +among the human components. We conduct extensive experiments on a +newly-introduced large-scale benchmark and achieve state-of-the-art +performance. The code is public for research purposes at +https://github.com/Dingpx/EAI. + +
+
+ comment: Accepted by AAAI24 +
+
+
+
+
+ + ♻ ☆ Cameras as Rays: Pose Estimation via Ray Diffusion ICLR 2024 + + +
+ Estimating camera poses is a fundamental task for 3D reconstruction and +remains challenging given sparsely sampled views (<10). In contrast to existing +approaches that pursue top-down prediction of global parametrizations of camera +extrinsics, we propose a distributed representation of camera pose that treats +a camera as a bundle of rays. This representation allows for a tight coupling +with spatial image features improving pose precision. We observe that this +representation is naturally suited for set-level transformers and develop a +regression-based approach that maps image patches to corresponding rays. To +capture the inherent uncertainties in sparse-view pose inference, we adapt this +approach to learn a denoising diffusion model which allows us to sample +plausible modes while improving performance. Our proposed methods, both +regression- and diffusion-based, demonstrate state-of-the-art performance on +camera pose estimation on CO3D while generalizing to unseen object categories +and in-the-wild captures. + +
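+ A hedged sketch of the ray-bundle view of a camera (per-patch origins and directions); the patch sampling and any Plücker-style parametrization are assumptions, not the paper's exact formulation:
+ import numpy as np
+
+ def camera_to_rays(K, R, t, patch_centers):
+     """K: (3,3) intrinsics; R, t: world-to-camera rotation/translation;
+     patch_centers: (N, 2) pixel coordinates of image patch centers."""
+     origin = -R.T @ t                                    # camera center in world frame
+     pix_h = np.concatenate([patch_centers, np.ones((len(patch_centers), 1))], axis=1)
+     dirs_cam = (np.linalg.inv(K) @ pix_h.T).T            # back-projected pixel directions
+     dirs_world = dirs_cam @ R                            # each row becomes R^T @ dir
+     dirs_world /= np.linalg.norm(dirs_world, axis=1, keepdims=True)
+     return np.tile(origin, (len(patch_centers), 1)), dirs_world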
+
+ comment: In ICLR 2024 (oral). v2-3: updated references. Project webpage: + https://jasonyzhang.com/RayDiffusion +
+
+
+
+
+ + ♻ ☆ APISR: Anime Production Inspired Real-World Anime Super-Resolution + + +
+ While real-world anime super-resolution (SR) has gained increasing attention in the SR community, existing methods still adopt techniques from the photorealistic domain. In this paper, we analyze the anime production workflow and rethink how to use its characteristics for real-world anime SR. First, we argue that video networks and datasets are not necessary for anime SR due to the repeated use of hand-drawn frames. Instead, we propose an anime image collection pipeline that chooses the least compressed and most informative frames from the video sources. Based on this pipeline, we introduce the Anime Production-oriented Image (API) dataset. In addition, we identify two anime-specific challenges: distorted and faint hand-drawn lines, and unwanted color artifacts. We address the first issue by introducing a prediction-oriented compression module in the image degradation model and a pseudo-ground-truth preparation with enhanced hand-drawn lines. In addition, we introduce the balanced twin perceptual loss combining both anime and photorealistic high-level features to mitigate unwanted color artifacts and increase visual clarity. We evaluate our method through extensive experiments on the public benchmark, showing that our method outperforms state-of-the-art anime dataset-trained approaches.
+
+
+
+
+ + ♻ ☆ NEMTO: Neural Environment Matting for Novel View and Relighting + Synthesis of Transparent Objects ICCV 2023 + + +
+ We propose NEMTO, the first end-to-end neural rendering pipeline to model 3D +transparent objects with complex geometry and unknown indices of refraction. +Commonly used appearance modeling such as the Disney BSDF model cannot +accurately address this challenging problem due to the complex light paths +bending through refractions and the strong dependency of surface appearance on +illumination. With 2D images of the transparent object as input, our method is +capable of high-quality novel view and relighting synthesis. We leverage +implicit Signed Distance Functions (SDF) to model the object geometry and +propose a refraction-aware ray bending network to model the effects of light +refraction within the object. Our ray bending network is more tolerant to +geometric inaccuracies than traditional physically-based methods for rendering +transparent objects. We provide extensive evaluations on both synthetic and +real-world datasets to demonstrate our high-quality synthesis and the +applicability of our method. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ 3DGS-Avatar: Animatable Avatars via Deformable 3D Gaussian Splatting + + +
+ We introduce an approach that creates animatable human avatars from monocular +videos using 3D Gaussian Splatting (3DGS). Existing methods based on neural +radiance fields (NeRFs) achieve high-quality novel-view/novel-pose image +synthesis but often require days of training, and are extremely slow at +inference time. Recently, the community has explored fast grid structures for +efficient training of clothed avatars. Albeit being extremely fast at training, +these methods can barely achieve an interactive rendering frame rate with +around 15 FPS. In this paper, we use 3D Gaussian Splatting and learn a +non-rigid deformation network to reconstruct animatable clothed human avatars +that can be trained within 30 minutes and rendered at real-time frame rates +(50+ FPS). Given the explicit nature of our representation, we further +introduce as-isometric-as-possible regularizations on both the Gaussian mean +vectors and the covariance matrices, enhancing the generalization of our model +on highly articulated unseen poses. Experimental results show that our method +achieves comparable and even better performance compared to state-of-the-art +approaches on animatable avatar creation from a monocular input, while being +400x and 250x faster in training and inference, respectively. + +
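+ A hedged PyTorch sketch of an as-isometric-as-possible term on the Gaussian means (neighbor selection and weighting are assumptions; the covariance counterpart is omitted):
+ import torch
+
+ def isometry_loss(means_canonical, means_deformed, knn_idx):
+     """means_*: (N, 3) Gaussian centers before/after the non-rigid deformation;
+     knn_idx: (N, K) indices of each Gaussian's nearest neighbors."""
+     d_can = torch.linalg.norm(means_canonical[:, None, :] - means_canonical[knn_idx], dim=-1)
+     d_def = torch.linalg.norm(means_deformed[:, None, :] - means_deformed[knn_idx], dim=-1)
+     return (d_can - d_def).abs().mean()   # penalize changes in local pairwise distances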
+
+ comment: Project page: https://neuralbodies.github.io/3DGS-Avatar +
+
+
+
+
+ + ♻ ☆ ILPO-NET: Network for the invariant recognition of arbitrary volumetric + patterns in 3D + + +
+ Effective recognition of spatial patterns and learning their hierarchy is +crucial in modern spatial data analysis. Volumetric data applications seek +techniques ensuring invariance not only to shifts but also to pattern +rotations. While traditional methods can readily achieve translational +invariance, rotational invariance possesses multiple challenges and remains an +active area of research. Here, we present ILPO-Net (Invariant to Local Patterns +Orientation Network), a novel approach that handles arbitrarily shaped patterns +with the convolutional operation inherently invariant to local spatial pattern +orientations using the Wigner matrix expansions. Our architecture seamlessly +integrates the new convolution operator and, when benchmarked on diverse +volumetric datasets such as MedMNIST and CATH, demonstrates superior +performance over the baselines with significantly reduced parameter counts - up +to 1000 times fewer in the case of MedMNIST. Beyond these demonstrations, +ILPO-Net's rotational invariance paves the way for other applications across +multiple disciplines. Our code is publicly available at +https://gricad-gitlab.univ-grenoble-alpes.fr/GruLab/ILPONet. + +
+
+
+
+
+ + ♻ ☆ Bootstrapping SparseFormers from Vision Foundation Models CVPR 2024 + + +
+ The recently proposed SparseFormer architecture provides an alternative +approach to visual understanding by utilizing a significantly lower number of +visual tokens via adjusting RoIs, greatly reducing computational costs while +still achieving promising performance. However, training SparseFormers from +scratch is still expensive, and scaling up the number of parameters can be +challenging. In this paper, we propose to bootstrap SparseFormers from +ViT-based vision foundation models in a simple and efficient way. Since the +majority of SparseFormer blocks are the standard transformer ones, we can +inherit weights from large-scale pre-trained vision transformers and freeze +them as much as possible. Therefore, we only need to train the +SparseFormer-specific lightweight focusing transformer to adjust token RoIs and +fine-tune a few early pre-trained blocks to align the final token +representation. In such a way, we can bootstrap SparseFormer architectures from +various large-scale pre-trained models (e.g., IN-21K pre-trained AugRegs or +CLIPs) using a rather smaller amount of training samples (e.g., IN-1K) and +without labels or captions within just a few hours. As a result, the +bootstrapped unimodal SparseFormer (from AugReg-ViT-L/16-384) can reach 84.9% +accuracy on IN-1K with only 49 tokens, and the multimodal SparseFormer from +CLIPs also demonstrates notable zero-shot performance with highly reduced +computational cost without seeing any caption during the bootstrapping +procedure. In addition, CLIP-bootstrapped SparseFormers, which align the output +space with language without seeing a word, can serve as efficient vision +encoders in multimodal large language models. Code and models are available at +https://github.com/showlab/sparseformer + +
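+ The freezing recipe described above can be sketched in PyTorch as follows (the module name prefixes are hypothetical; the real SparseFormer code uses different names):
+ import torch
+
+ def freeze_for_bootstrapping(model, trainable_prefixes=("focusing", "blocks.0", "blocks.1")):
+     """Keep only the lightweight focusing transformer and a few early blocks trainable;
+     everything inherited from the pretrained ViT stays frozen."""
+     for name, param in model.named_parameters():
+         param.requires_grad = any(name.startswith(p) for p in trainable_prefixes)
+     return [p for p in model.parameters() if p.requires_grad]
+
+ # optimizer = torch.optim.AdamW(freeze_for_bootstrapping(model), lr=1e-4)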
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Unified Spatio-Temporal Tri-Perspective View Representation for 3D + Semantic Occupancy Prediction + + +
+ Holistic understanding and reasoning in 3D scenes play a vital role in the success of autonomous driving systems. 3D semantic occupancy prediction, which has evolved as a pretraining task for autonomous driving and robotic downstream tasks, captures finer 3D details than methods such as 3D detection. Existing approaches predominantly focus on spatial cues such as tri-perspective view embeddings (TPV), often overlooking temporal cues. This study introduces S2TPVFormer, a spatiotemporal transformer architecture for temporally coherent 3D semantic occupancy prediction. We enrich the prior process by including temporal cues using a novel temporal cross-view hybrid attention mechanism (TCVHA) and generate spatiotemporal TPV embeddings (i.e. S2TPV embeddings). Experimental evaluations on the nuScenes dataset demonstrate a substantial 4.1% improvement in mean Intersection over Union (mIoU) for 3D Semantic Occupancy compared to TPVFormer, confirming the effectiveness of the proposed S2TPVFormer in enhancing 3D scene perception.
+
+
+
+
+ + ♻ ☆ Learning Subject-Aware Cropping by Outpainting Professional Photos AAAI 24 + + +
+ How to frame (or crop) a photo often depends on the image subject and its +context; e.g., a human portrait. Recent works have defined the subject-aware +image cropping task as a nuanced and practical version of image cropping. We +propose a weakly-supervised approach (GenCrop) to learn what makes a +high-quality, subject-aware crop from professional stock images. Unlike +supervised prior work, GenCrop requires no new manual annotations beyond the +existing stock image collection. The key challenge in learning from this data, +however, is that the images are already cropped and we do not know what regions +were removed. Our insight is to combine a library of stock images with a +modern, pre-trained text-to-image diffusion model. The stock image collection +provides diversity and its images serve as pseudo-labels for a good crop, while +the text-image diffusion model is used to out-paint (i.e., outward inpainting) +realistic uncropped images. Using this procedure, we are able to automatically +generate a large dataset of cropped-uncropped training pairs to train a +cropping model. Despite being weakly-supervised, GenCrop is competitive with +state-of-the-art supervised methods and significantly better than comparable +weakly-supervised baselines on quantitative and qualitative evaluation metrics. + +
+
+ comment: AAAI 24. Extended version with supplemental materials +
+
+
+
+
+ + ♻ ☆ Non-negative Subspace Feature Representation for Few-shot Learning in + Medical Imaging + + +
+ Unlike typical visual scene recognition domains, in which massive datasets +are accessible to deep neural networks, medical image interpretations are often +obstructed by the paucity of data. In this paper, we investigate the +effectiveness of data-based few-shot learning in medical imaging by exploring +different data attribute representations in a low-dimensional space. We +introduce different types of non-negative matrix factorization (NMF) in +few-shot learning, addressing the data scarcity issue in medical image +classification. Extensive empirical studies are conducted in terms of +validating the effectiveness of NMF, especially its supervised variants (e.g., +discriminative NMF, and supervised and constrained NMF with sparseness), and +the comparison with principal component analysis (PCA), i.e., the collaborative +representation-based dimensionality reduction technique derived from +eigenvectors. With 14 different datasets covering 11 distinct illness +categories, thorough experimental results and comparison with related +techniques demonstrate that NMF is a competitive alternative to PCA for +few-shot learning in medical imaging, and the supervised NMF algorithms are +more discriminative in the subspace with greater effectiveness. Furthermore, we +show that the part-based representation of NMF, especially its supervised +variants, is dramatically impactful in detecting lesion areas in medical +imaging with limited samples. + +
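+ A minimal scikit-learn sketch of the plain (unsupervised) NMF variant used as a PCA alternative for few-shot classification; the supervised and sparseness-constrained variants studied in the paper require custom implementations:
+ from sklearn.decomposition import NMF
+ from sklearn.neighbors import KNeighborsClassifier
+
+ def few_shot_with_nmf(X_support, y_support, X_query, n_components=15):
+     """X_*: non-negative feature matrices (e.g. ReLU features from a backbone)."""
+     nmf = NMF(n_components=n_components, init="nndsvda", max_iter=500)
+     Z_support = nmf.fit_transform(X_support)          # part-based low-dimensional codes
+     Z_query = nmf.transform(X_query)
+     clf = KNeighborsClassifier(n_neighbors=1).fit(Z_support, y_support)
+     return clf.predict(Z_query)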
+
+
+
+
+ + ♻ ☆ Data Upcycling Knowledge Distillation for Image Super-Resolution + + +
+ Knowledge distillation (KD) compresses deep neural networks by transferring task-related knowledge from cumbersome pre-trained teacher models to compact student models. However, current KD methods for super-resolution (SR) networks overlook the nature of the SR task: the outputs of the teacher model are noisy approximations to the ground-truth distribution of high-quality images (GT), which obscures the teacher model's knowledge and results in limited KD effects. To utilize the teacher model beyond the GT upper bound, we present Data Upcycling Knowledge Distillation (DUKD), which transfers the teacher model's knowledge to the student model through upcycled in-domain data derived from the training data. In addition, we impose label-consistency regularization on KD for SR via paired invertible augmentations to improve the student model's performance and robustness. Comprehensive experiments demonstrate that the DUKD method significantly outperforms prior art on several SR tasks.
+
+
+
+
+ + ♻ ☆ MorpheuS: Neural Dynamic 360° Surface Reconstruction from Monocular + RGB-D Video CVPR2024 + + +
+ Neural rendering has demonstrated remarkable success in dynamic scene reconstruction. Thanks to the expressiveness of neural representations, prior works can accurately capture the motion and achieve high-fidelity reconstruction of the target object. Despite this, real-world video scenarios often feature large unobserved regions where neural representations struggle to achieve realistic completion. To tackle this challenge, we introduce MorpheuS, a framework for dynamic 360° surface reconstruction from a casually captured RGB-D video. Our approach models the target scene as a canonical field that encodes its geometry and appearance, in conjunction with a deformation field that warps points from the current frame to the canonical space. We leverage a view-dependent diffusion prior and distill knowledge from it to achieve realistic completion of unobserved regions. Experimental results on various real-world and synthetic datasets show that our method can achieve high-fidelity 360° surface reconstruction of a deformable object from a monocular RGB-D video.
+
+ comment: CVPR2024. Project page: + https://hengyiwang.github.io/projects/morpheus +
+
+
+
+
+ + ♻ ☆ Calibrating Bayesian UNet++ for Sub-Seasonal Forecasting ICLR 2024 + + +
+ Seasonal forecasting is a crucial task when it comes to detecting the extreme heat and cold events that occur due to climate change. Confidence in the predictions should be reliable, since even a small increase in yearly temperatures has a big impact on the world. Calibration of neural networks provides a way to ensure our confidence in the predictions. However, calibrating regression models is an under-researched topic, especially for forecasting models. We calibrate a UNet++ based architecture, which was shown to outperform physics-based models in predicting temperature anomalies. We show that, with a slight trade-off between prediction error and calibration error, it is possible to get more reliable and sharper forecasts. We believe that calibration should be an important part of safety-critical machine learning applications such as weather forecasting.
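+ One simple calibration check for a probabilistic regression forecaster is empirical interval coverage; a sketch assuming Gaussian predictive distributions (not the paper's exact metric):
+ import numpy as np
+
+ def interval_coverage(mu, sigma, y, z=1.96):
+     """Fraction of targets falling inside the predicted 95% interval; a
+     well-calibrated forecaster should be close to 0.95."""
+     return float(np.mean(np.abs(y - mu) <= z * sigma))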
+
+ comment: Accepted as a workshop paper at "ICLR 2024 Tackling Climate Change + with Machine Learning" +
+
+
+
+
+ + ♻ ☆ Roadside Monocular 3D Detection via 2D Detection Prompting + + +
+ The problem of roadside monocular 3D detection requires detecting objects of +interested classes in a 2D RGB frame and predicting their 3D information such +as locations in bird's-eye-view (BEV). It has broad applications in traffic +control, vehicle-vehicle communication, and vehicle-infrastructure cooperative +perception. To approach this problem, we present a novel and simple method by +prompting the 3D detector using 2D detections. Our method builds on a key +insight that, compared with 3D detectors, a 2D detector is much easier to train +and performs significantly better w.r.t detections on the 2D image plane. That +said, one can exploit 2D detections of a well-trained 2D detector as prompts to +a 3D detector, being trained in a way of inflating such 2D detections to 3D +towards 3D detection. To construct better prompts using the 2D detector, we +explore three techniques: (a) concatenating both 2D and 3D detectors' features, +(b) attentively fusing 2D and 3D detectors' features, and (c) encoding +predicted 2D boxes x, y, width, height, label and attentively fusing such with +the 3D detector's features. Surprisingly, the third performs the best. +Moreover, we present a yaw tuning tactic and a class-grouping strategy that +merges classes based on their functionality; these techniques improve 3D +detection performance further. Comprehensive ablation studies and extensive +experiments demonstrate that our method resoundingly outperforms prior works, +achieving the state-of-the-art on two large-scale roadside 3D detection +benchmarks. + +
+
+
+
+
+ + ♻ ☆ Scene-aware Human Motion Forecasting via Mutual Distance Prediction + + +
+ In this paper, we tackle the problem of scene-aware 3D human motion +forecasting. A key challenge of this task is to predict future human motions +that are consistent with the scene by modeling the human-scene interactions. +While recent works have demonstrated that explicit constraints on human-scene +interactions can prevent the occurrence of ghost motion, they only provide +constraints on partial human motion e.g., the global motion of the human or a +few joints contacting the scene, leaving the rest of the motion unconstrained. +To address this limitation, we propose to model the human-scene interaction +with the mutual distance between the human body and the scene. Such mutual +distances constrain both the local and global human motion, resulting in a +whole-body motion constrained prediction. In particular, mutual distance +constraints consist of two components, the signed distance of each vertex on +the human mesh to the scene surface and the distance of basis scene points to +the human mesh. We further introduce a global scene representation learned from +a signed distance function (SDF) volume to ensure coherence between the global +scene representation and the explicit constraint from the mutual distance. We +develop a pipeline with two sequential steps: predicting the future mutual +distances first, followed by forecasting future human motion. During training, +we explicitly encourage consistency between predicted poses and mutual +distances. Extensive evaluations on the existing synthetic and real datasets +demonstrate that our approach consistently outperforms the state-of-the-art +methods. + +
+
+
+
+
+ + ♻ ☆ ShapeFusion: A 3D diffusion model for localized shape editing + + +
+ In the realm of 3D computer vision, parametric models have emerged as a +ground-breaking methodology for the creation of realistic and expressive 3D +avatars. Traditionally, they rely on Principal Component Analysis (PCA), given +its ability to decompose data to an orthonormal space that maximally captures +shape variations. However, due to the orthogonality constraints and the global +nature of PCA's decomposition, these models struggle to perform localized and +disentangled editing of 3D shapes, which severely affects their use in +applications requiring fine control such as face sculpting. In this paper, we +leverage diffusion models to enable diverse and fully localized edits on 3D +meshes, while completely preserving the un-edited regions. We propose an +effective diffusion masking training strategy that, by design, facilitates +localized manipulation of any shape region, without being limited to predefined +regions or to sparse sets of predefined control vertices. Following our +framework, a user can explicitly set their manipulation region of choice and +define an arbitrary set of vertices as handles to edit a 3D mesh. Compared to +the current state-of-the-art our method leads to more interpretable shape +manipulations than methods relying on latent code state, greater localization +and generation diversity while offering faster inference than optimization +based approaches. Project page: https://rolpotamias.github.io/Shapefusion/ + +
+
+ comment: Project Page: https://rolpotamias.github.io/Shapefusion/ +
+
+
+
+
+ + ♻ ☆ Vestibular schwannoma growth prediction from longitudinal MRI by time + conditioned neural fields + + +
+ Vestibular schwannomas (VS) are benign tumors that are generally managed by active surveillance with MRI examination. To further assist clinical decision-making and avoid overtreatment, an accurate prediction of tumor growth based on longitudinal imaging is highly desirable. In this paper, we introduce DeepGrowth, a deep learning method that incorporates neural fields and recurrent neural networks for prospective tumor growth prediction. In the proposed method, each tumor is represented as a signed distance function (SDF) conditioned on a low-dimensional latent code. Unlike previous studies that perform tumor shape prediction directly in the image space, we predict the latent codes instead and then reconstruct future shapes from them. To deal with irregular time intervals, we introduce a time-conditioned recurrent module based on a ConvLSTM and a novel temporal encoding strategy, which enables the proposed model to output varying tumor shapes over time. The experiments on an in-house longitudinal VS dataset showed that the proposed model significantly improved the performance ($\ge 1.6\%$ Dice score and $\ge0.20$ mm 95% Hausdorff distance), in particular for the top 20% of tumors that grow or shrink the most ($\ge 4.6\%$ Dice score and $\ge 0.73$ mm 95% Hausdorff distance). Our code is available at https://github.com/cyjdswx/DeepGrowth
+
+
+
+
+ + ♻ ☆ Smooth Deep Saliency + + +
+ In this work, we investigate methods to reduce the noise in deep saliency +maps coming from convolutional downsampling, with the purpose of explaining how +a deep learning model detects tumors in scanned histological tissue samples. +Those methods make the investigated models more interpretable for +gradient-based saliency maps, computed in hidden layers. We test our approach +on different models trained for image classification on ImageNet1K, and models +trained for tumor detection on Camelyon16 and in-house real-world digital +pathology scans of stained tissue samples. Our results show that the +checkerboard noise in the gradient gets reduced, resulting in smoother and +therefore easier to interpret saliency maps. + +
+
+
+
+
+ + ♻ ☆ Self-Aligning Depth-regularized Radiance Fields for Asynchronous RGB-D + Sequences + + +
+ It has been shown that learning radiance fields with depth rendering and +depth supervision can effectively promote the quality and convergence of view +synthesis. However, this paradigm requires input RGB-D sequences to be +synchronized, hindering its usage in the UAV city modeling scenario. As there +exists asynchrony between RGB images and depth images due to high-speed flight, +we propose a novel time-pose function, which is an implicit network that maps +timestamps to $\rm SE(3)$ elements. To simplify the training process, we also +design a joint optimization scheme to jointly learn the large-scale +depth-regularized radiance fields and the time-pose function. Our algorithm +consists of three steps: (1) time-pose function fitting, (2) radiance field +bootstrapping, (3) joint pose error compensation and radiance field refinement. +In addition, we propose a large synthetic dataset with diverse controlled +mismatches and ground truth to evaluate this new problem setting +systematically. Through extensive experiments, we demonstrate that our method +outperforms baselines without regularization. We also show qualitatively +improved results on a real-world asynchronous RGB-D sequence captured by drone. +Codes, data, and models will be made publicly available. + +
+
+
+
+
+ + ♻ ☆ Beyond Image Super-Resolution for Image Recognition with Task-Driven + Perceptual Loss CVPR 2024 + + +
+ In real-world scenarios, image recognition tasks, such as semantic +segmentation and object detection, often pose greater challenges due to the +lack of information available within low-resolution (LR) content. Image +super-resolution (SR) is one of the promising solutions for addressing the +challenges. However, due to the ill-posed property of SR, it is challenging for +typical SR methods to restore task-relevant high-frequency contents, which may +dilute the advantage of utilizing the SR method. Therefore, in this paper, we +propose Super-Resolution for Image Recognition (SR4IR) that effectively guides +the generation of SR images beneficial to achieving satisfactory image +recognition performance when processing LR images. The critical component of +our SR4IR is the task-driven perceptual (TDP) loss that enables the SR network +to acquire task-specific knowledge from a network tailored for a specific task. +Moreover, we propose a cross-quality patch mix and an alternate training +framework that significantly enhances the efficacy of the TDP loss by +addressing potential problems when employing the TDP loss. Through extensive +experiments, we demonstrate that our SR4IR achieves outstanding task +performance by generating SR images useful for a specific image recognition +task, including semantic segmentation, object detection, and image +classification. The implementation code is available at +https://github.com/JaehaKim97/SR4IR. + +
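+ A hedged sketch of a task-driven perceptual loss in the spirit described above: compare SR and HR images in the feature space of a frozen task network (which features to use and how to weight them are assumptions):
+ import torch
+ import torch.nn.functional as F
+
+ def task_driven_perceptual_loss(sr_img, hr_img, task_backbone):
+     """task_backbone: frozen network for the downstream task, returning a feature map."""
+     with torch.no_grad():
+         feat_hr = task_backbone(hr_img)       # target features, no gradient
+     feat_sr = task_backbone(sr_img)           # gradients flow back into the SR network
+     return F.l1_loss(feat_sr, feat_hr)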
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ CoDA: Instructive Chain-of-Domain Adaptation with Severity-Aware Visual + Prompt Tuning + + +
+ Unsupervised Domain Adaptation (UDA) aims to adapt models from labeled source +domains to unlabeled target domains. When adapting to adverse scenes, existing +UDA methods fail to perform well due to the lack of instructions, leading their +models to overlook discrepancies within all adverse scenes. To tackle this, we +propose CoDA which instructs models to distinguish, focus, and learn from these +discrepancies at scene and image levels. Specifically, CoDA consists of a +Chain-of-Domain (CoD) strategy and a Severity-Aware Visual Prompt Tuning +(SAVPT) mechanism. CoD focuses on scene-level instructions to divide all +adverse scenes into easy and hard scenes, guiding models to adapt from source +to easy domains with easy scene images, and then to hard domains with hard +scene images, thereby laying a solid foundation for whole adaptations. Building +upon this foundation, we employ SAVPT to dive into more detailed image-level +instructions to boost performance. SAVPT features a novel metric Severity that +divides all adverse scene images into low-severity and high-severity images. +Then Severity directs visual prompts and adapters, instructing models to +concentrate on unified severity features instead of scene-specific features, +without adding complexity to the model architecture. CoDA achieves SOTA +performances on widely-used benchmarks under all adverse scenes. Notably, CoDA +outperforms the existing ones by 4.6%, and 10.3% mIoU on the Foggy Driving, and +Foggy Zurich benchmarks, respectively. Our code is available at +https://github.com/Cuzyoung/CoDA + +
+
+
+
+
+ + ♻ ☆ GEARS: Local Geometry-aware Hand-object Interaction Synthesis + + +
+ Generating realistic hand motion sequences in interaction with objects has +gained increasing attention with the growing interest in digital humans. Prior +work has illustrated the effectiveness of employing occupancy-based or +distance-based virtual sensors to extract hand-object interaction features. +Nonetheless, these methods show limited generalizability across object +categories, shapes and sizes. We hypothesize that this is due to two reasons: +1) the limited expressiveness of employed virtual sensors, and 2) scarcity of +available training data. To tackle this challenge, we introduce a novel +joint-centered sensor designed to reason about local object geometry near +potential interaction regions. The sensor queries for object surface points in +the neighbourhood of each hand joint. As an important step towards mitigating +the learning complexity, we transform the points from global frame to hand +template frame and use a shared module to process sensor features of each +individual joint. This is followed by a spatio-temporal transformer network +aimed at capturing correlation among the joints in different dimensions. +Moreover, we devise simple heuristic rules to augment the limited training +sequences with vast static hand grasping samples. This leads to a broader +spectrum of grasping types observed during training, in turn enhancing our +model's generalization capability. We evaluate on two public datasets, GRAB and +InterCap, where our method shows superiority over baselines both quantitatively +and perceptually. + +
+
+
+
+
+ + ♻ ☆ Bias Behind the Wheel: Fairness Analysis of Autonomous Driving Systems + + +
+ This paper analyzes fairness in automated pedestrian detection, a crucial but under-explored issue in autonomous driving systems. We evaluate eight state-of-the-art deep learning-based pedestrian detectors across demographic groups on large-scale real-world datasets. To enable thorough fairness testing, we provide extensive annotations for the datasets, resulting in 8,311 images with 16,070 gender labels, 20,115 age labels, and 3,513 skin tone labels. Our findings reveal significant fairness issues, particularly related to age. The undetected proportion for children is 20.14% higher than for adults. Furthermore, we explore how various driving scenarios affect the fairness of pedestrian detectors. We find that pedestrian detectors demonstrate significant gender biases during night time, potentially exacerbating the prevalent societal issue of female safety concerns during nighttime outings. Moreover, we observe that pedestrian detectors can demonstrate both enhanced fairness and superior performance under specific driving conditions, which challenges the fairness-performance trade-off theory widely acknowledged in the fairness literature. We publicly release the code, data, and results to support future research on fairness in autonomous driving.
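+ The per-group miss-rate comparison behind findings such as the child/adult gap can be computed as follows (a sketch; the record format is hypothetical):
+ from collections import defaultdict
+
+ def undetected_proportion(records):
+     """records: iterable of (group, was_detected) pairs, e.g. ("child", False)."""
+     totals, misses = defaultdict(int), defaultdict(int)
+     for group, detected in records:
+         totals[group] += 1
+         misses[group] += int(not detected)
+     return {g: misses[g] / totals[g] for g in totals}
+
+ # rates = undetected_proportion(records); gap = rates["child"] - rates["adult"]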
+
+
+
+
+ + ♻ ☆ Weighted structure tensor total variation for image denoising + + +
+ For image denoising problems, the structure tensor total variation +(STV)-based models show good performances when compared with other competing +regularization approaches. However, the STV regularizer does not couple the +local information of the image and may not maintain the image details. +Therefore, we employ the anisotropic weighted matrix introduced in the +anisotropic total variation (ATV) model to improve the STV model. By applying +the weighted matrix to the discrete gradient of the patch-based Jacobian +operator in STV, our proposed weighted STV (WSTV) model can effectively capture +local information from images and maintain their details during the denoising +process. The optimization problem in the model is solved by a fast first-order +gradient projection algorithm with a complexity result of $O(1 / i^2)$. For +images with different Gaussian noise levels, the experimental results +demonstrate that the WSTV model can effectively improve the quality of restored +images compared to other TV and STV-based models. + +
+
+
+
+
+ + ♻ ☆ A Novel Garment Transfer Method Supervised by Distilled Knowledge of + Virtual Try-on Model + + +
+ This paper proposes a novel garment transfer method supervised with knowledge +distillation from virtual try-on. Our method first reasons the transfer parsing +to provide shape prior to downstream tasks. We employ a multi-phase teaching +strategy to supervise the training of the transfer parsing reasoning model, +learning the response and feature knowledge from the try-on parsing reasoning +model. To correct the teaching error, it transfers the garment back to its +owner to absorb the hard knowledge in the self-study phase. Guided by the +transfer parsing, we adjust the position of the transferred garment via STN to +prevent distortion. Afterward, we estimate a progressive flow to precisely warp +the garment with shape and content correspondences. To ensure warping +rationality, we supervise the training of the garment warping model using +target shape and warping knowledge from virtual try-on. To better preserve body +features in the transfer result, we propose a well-designed training strategy +for the arm regrowth task to infer new exposure skin. Experiments demonstrate +that our method has state-of-the-art performance compared with other virtual +try-on and garment transfer methods in garment transfer, especially for +preserving garment texture and body features. + +
+
+
+
+
+ + ♻ ☆ ModaVerse: Efficiently Transforming Modalities with LLMs CVPR2024 + + +
+ Humans possess the capability to comprehend diverse modalities and seamlessly +transfer information between them. In this work, we introduce ModaVerse, a +Multi-modal Large Language Model (MLLM) capable of comprehending and +transforming content across various modalities including images, videos, and +audio. Predominant MLLM frameworks have largely relied on the alignment of +latent spaces of textual and non-textual features. This alignment process, +which synchronizes a language model trained on textual data with encoders and +decoders trained on multi-modal data, often necessitates extensive training of +several projection layers in multiple stages. Inspired by LLM-as-agent +methodologies, we propose a novel Input/Output (I/O) alignment mechanism that +operates directly at the level of natural language. It aligns the LLM's output +with the input of generative models, avoiding the complexities associated with +latent feature alignments, and simplifying the multiple training stages of +existing MLLMs into a single, efficient process. This conceptual advancement +leads to significant reductions in both data and computational costs. By +conducting experiments on several benchmarks, we demonstrate that our approach +attains comparable performance with the state of the art while achieving +considerable efficiencies in data usage and training duration. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ DeepIPC: Deeply Integrated Perception and Control for an Autonomous + Vehicle in Real Environments + + +
+ In this work, we introduce DeepIPC, a novel end-to-end model tailored for +autonomous driving, which seamlessly integrates perception and control tasks. +Unlike traditional models that handle these tasks separately, DeepIPC +innovatively combines a perception module, which processes RGBD images for +semantic segmentation and generates bird's eye view (BEV) mappings, with a +controller module that utilizes these insights along with GNSS and angular +speed measurements to accurately predict navigational waypoints. This +integration allows DeepIPC to efficiently translate complex environmental data +into actionable driving commands. Our comprehensive evaluation demonstrates +DeepIPC's superior performance in terms of drivability and multi-task +efficiency across diverse real-world scenarios, setting a new benchmark for +end-to-end autonomous driving systems with a leaner model architecture. The +experimental results underscore DeepIPC's potential to significantly enhance +autonomous vehicular navigation, promising a step forward in the development of +autonomous driving technologies. For further insights and replication, we will +make our code and datasets available at https://github.com/oskarnatan/DeepIPC. + +
+
+ comment: Accepted for Publication in IEEE Access +
+
+
+
+
+ + ♻ ☆ CAPE: CAM as a Probabilistic Ensemble for Enhanced DNN Interpretation + + +
+ Deep Neural Networks (DNNs) are widely used for visual classification tasks, +but their complex computation process and black-box nature hinder decision +transparency and interpretability. Class activation maps (CAMs) and recent +variants provide ways to visually explain the DNN decision-making process by +displaying 'attention' heatmaps of the DNNs. Nevertheless, the CAM explanation +only offers relative attention information, that is, on an attention heatmap, +we can interpret which image region is more or less important than the others. +However, these regions cannot be meaningfully compared across classes, and the +contribution of each region to the model's class prediction is not revealed. To +address these challenges that ultimately lead to better DNN Interpretation, in +this paper, we propose CAPE, a novel reformulation of CAM that provides a +unified and probabilistically meaningful assessment of the contributions of +image regions. We quantitatively and qualitatively compare CAPE with +state-of-the-art CAM methods on CUB and ImageNet benchmark datasets to +demonstrate enhanced interpretability. We also test on a cytology imaging +dataset depicting a challenging Chronic Myelomonocytic Leukemia (CMML) +diagnosis problem. Code is available at: https://github.com/AIML-MED/CAPE. + +
+
+
+
+
+ + ♻ ☆ Image Outlier Detection Without Training using RANSAC + + +
+ Image outlier detection (OD) is an essential tool to ensure the quality of +images used in computer vision tasks. Existing algorithms often involve +training a model to represent the inlier distribution, and outliers are +determined by some deviation measure. Although existing methods proved +effective when trained on strictly inlier samples, their performance remains +questionable when undesired outliers are included during training. As a result +of this limitation, it is necessary to carefully examine the data when +developing OD models for new domains. In this work, we present a novel image OD +algorithm called RANSAC-NN that eliminates the need of data examination and +model training altogether. Unlike existing approaches, RANSAC-NN can be +directly applied on datasets containing outliers by sampling and comparing +subsets of the data. Our algorithm maintains favorable performance compared to +existing methods on a range of benchmarks. Furthermore, we show that RANSAC-NN +can enhance the robustness of existing methods by incorporating our algorithm +as part of the data preparation process. + +
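+ A hedged sketch of a RANSAC-style, training-free outlier score in the spirit described above: repeatedly sample reference subsets and score each image by its nearest-neighbor distance to the sample (the published RANSAC-NN algorithm may differ in its details):
+ import numpy as np
+
+ def subset_nn_outlier_scores(features, n_rounds=20, subset_size=64, seed=0):
+     """features: (N, D) image embeddings; a higher score means more outlying."""
+     rng = np.random.default_rng(seed)
+     n = len(features)
+     scores = np.zeros(n)
+     for _ in range(n_rounds):
+         ref = rng.choice(n, size=min(subset_size, n), replace=False)
+         d = np.linalg.norm(features[:, None, :] - features[ref][None, :, :], axis=-1)
+         d[np.arange(n)[:, None] == ref[None, :]] = np.inf   # ignore self-matches
+         scores += d.min(axis=1)
+     return scores / n_rounds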
+
+
+
+
+ + ♻ ☆ DeepIPCv2: LiDAR-powered Robust Environmental Perception and + Navigational Control for Autonomous Vehicle + + +
+ We present DeepIPCv2, an autonomous driving model that perceives the +environment using a LiDAR sensor for more robust drivability, especially when +driving under poor illumination conditions where everything is not clearly +visible. DeepIPCv2 takes a set of LiDAR point clouds as the main perception +input. Since point clouds are not affected by illumination changes, they can +provide a clear observation of the surroundings no matter what the condition +is. This results in a better scene understanding and stable features provided +by the perception module to support the controller module in estimating +navigational control properly. To evaluate its performance, we conduct several +tests by deploying the model to predict a set of driving records and perform +real automated driving under three different conditions. We also conduct +ablation and comparative studies with some recent models to justify its +performance. Based on the experimental results, DeepIPCv2 shows a robust +performance by achieving the best drivability in all driving scenarios. +Furthermore, to support future research, we will upload the codes and data to +https://github.com/oskarnatan/DeepIPCv2. + +
+
+
+
+
+ + ♻ ☆ HumanNeRF-SE: A Simple yet Effective Approach to Animate HumanNeRF with + Diverse Poses + + +
+ We present HumanNeRF-SE, a simple yet effective method that synthesizes +diverse novel pose images with simple input. Previous HumanNeRF works require a +large number of optimizable parameters to fit the human images. Instead, we +reload these approaches by combining explicit and implicit human +representations to design both generalized rigid deformation and specific +non-rigid deformation. Our key insight is that explicit shape can reduce the +sampling points used to fit implicit representation, and frozen blending +weights from SMPL constructing a generalized rigid deformation can effectively +avoid overfitting and improve pose generalization performance. Our architecture +involving both explicit and implicit representation is simple yet effective. +Experiments demonstrate our model can synthesize images under arbitrary poses +with few-shot input and increase the speed of synthesizing images by 15 times +through a reduction in computational complexity without using any existing +acceleration modules. Compared to the state-of-the-art HumanNeRF studies, +HumanNeRF-SE achieves better performance with fewer learnable parameters and +less training time. + +
+
+ comment: 16pages, 17 figures, 10 tables +
+
+
+
+
+ + ♻ ☆ TE-TAD: Towards Full End-to-End Temporal Action Detection via + Time-Aligned Coordinate Expression + + +
+ In this paper, we show that the normalized coordinate expression is a key factor behind the reliance on hand-crafted components in query-based detectors for temporal action detection (TAD). Despite significant advancements towards an end-to-end framework in object detection, query-based detectors have been limited in achieving full end-to-end modeling in TAD. To address this issue, we propose TE-TAD, a full end-to-end temporal action detection transformer that integrates time-aligned coordinate expression. We reformulate coordinate expression utilizing actual timeline values, ensuring length-invariant representations from the extremely diverse video duration environment. Furthermore, our proposed adaptive query selection dynamically adjusts the number of queries based on video length, providing a suitable solution for varying video durations compared to a fixed query set. Our approach not only simplifies the TAD process by eliminating the need for hand-crafted components but also significantly improves the performance of query-based detectors. Our TE-TAD outperforms the previous query-based detectors and achieves competitive performance compared to state-of-the-art methods on popular benchmark datasets. Code is available at: https://github.com/Dotori-HJ/TE-TAD
+
+
+
+
+ + ♻ ☆ Improving the Reconstruction of Disentangled Representation Learners via + Multi-Stage Modeling + + +
+ Current autoencoder-based disentangled representation learning methods achieve disentanglement by penalizing the (aggregate) posterior to encourage statistical independence of the latent factors. This approach introduces a trade-off between disentangled representation learning and reconstruction quality since the model does not have enough capacity to learn correlated latent variables that capture detail information present in most image data. To overcome this trade-off, we present a novel multi-stage modeling approach where the disentangled factors are first learned using a penalty-based disentangled representation learning method; then, the low-quality reconstruction is improved with another deep generative model that is trained to model the missing correlated latent variables, adding detail information while maintaining conditioning on the previously learned disentangled factors. Taken together, our multi-stage modeling approach results in a single, coherent probabilistic model that is theoretically justified by the principle of D-separation and can be realized with a variety of model classes including likelihood-based models such as variational autoencoders, implicit models such as generative adversarial networks, and tractable models like normalizing flows or mixtures of Gaussians. We demonstrate that our multi-stage model has higher reconstruction quality than current state-of-the-art methods with equivalent disentanglement performance across multiple standard benchmarks. In addition, we apply the multi-stage model to generate synthetic tabular datasets, showcasing an enhanced performance over benchmark models across a variety of metrics. The interpretability analysis further indicates that the multi-stage model can effectively uncover distinct and meaningful features of variations from which the original distribution can be recovered.
+
+
+
+
+ + ♻ ☆ WM-MoE: Weather-aware Multi-scale Mixture-of-Experts for Blind Adverse + Weather Removal + + +
+ Adverse weather removal tasks like deraining, desnowing, and dehazing are usually treated as separate tasks. However, in practical autonomous driving scenarios, the type, intensity, and mixing degree of weather are unknown, so handling each task separately cannot deal with the complex practical scenarios. In this paper, we study the blind adverse weather removal problem. Mixture-of-Experts (MoE) is a popular model that adopts a learnable gate to route the input to different expert networks. The principle of MoE involves using adaptive networks to process different types of unknown inputs. Therefore, MoE has great potential for blind adverse weather removal. However, the original MoE module is inadequate for coupled multiple weather types and fails to utilize multi-scale features for better performance. To this end, we propose a method called Weather-aware Multi-scale MoE (WM-MoE) based on Transformer for blind weather removal. WM-MoE includes two key designs: WEather-Aware Router (WEAR) and Multi-Scale Experts (MSE). WEAR assigns experts for each image token based on decoupled content and weather features, which enhances the model's capability to process multiple adverse weathers. To obtain discriminative weather features from images, we propose Weather Guidance Fine-grained Contrastive Learning (WGF-CL), which utilizes weather cluster information to guide the assignment of positive and negative samples for each image token. Since processing different weather types requires different receptive fields, MSE leverages multi-scale features to enhance the spatial relationship modeling capability, facilitating the high-quality restoration of diverse weather types and intensities. Our method achieves state-of-the-art performance in blind adverse weather removal on two public datasets and our dataset. We also demonstrate the advantage of our method on downstream segmentation tasks.
+
+
+
+
+ + ♻ ☆ Temporally Consistent Unbalanced Optimal Transport for Unsupervised + Action Segmentation CVPR 2024 + + +
+ We propose a novel approach to the action segmentation task for long, +untrimmed videos, based on solving an optimal transport problem. By encoding a +temporal consistency prior into a Gromov-Wasserstein problem, we are able to +decode a temporally consistent segmentation from a noisy affinity/matching cost +matrix between video frames and action classes. Unlike previous approaches, our +method does not require knowing the action order for a video to attain temporal +consistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can +be efficiently solved on GPUs using a few iterations of projected mirror +descent. We demonstrate the effectiveness of our method in an unsupervised +learning setting, where our method is used to generate pseudo-labels for +self-training. We evaluate our segmentation approach and unsupervised learning +pipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly +datasets, yielding state-of-the-art results for the unsupervised video action +segmentation task. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ EGTR: Extracting Graph from Transformer for Scene Graph Generation CVPR 2024 + + +
+ Scene Graph Generation (SGG) is a challenging task of detecting objects and +predicting relationships between objects. After DETR was developed, one-stage +SGG models based on a one-stage object detector have been actively studied. +However, complex modeling is used to predict the relationship between objects, +and the inherent relationship between object queries learned in the multi-head +self-attention of the object detector has been neglected. We propose a +lightweight one-stage SGG model that extracts the relation graph from the +various relationships learned in the multi-head self-attention layers of the +DETR decoder. By fully utilizing the self-attention by-products, the relation +graph can be extracted effectively with a shallow relation extraction head. +Considering the dependency of the relation extraction task on the object +detection task, we propose a novel relation smoothing technique that adjusts +the relation label adaptively according to the quality of the detected objects. +By the relation smoothing, the model is trained according to the continuous +curriculum that focuses on object detection task at the beginning of training +and performs multi-task learning as the object detection performance gradually +improves. Furthermore, we propose a connectivity prediction task that predicts +whether a relation exists between object pairs as an auxiliary task of the +relation extraction. We demonstrate the effectiveness and efficiency of our +method for the Visual Genome and Open Image V6 datasets. Our code is publicly +available at https://github.com/naver-ai/egtr. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Robust deep learning for eye fundus images: Bridging real and synthetic + data for enhancing generalization + + +
+ Deep learning applications for assessing medical images are limited because +the datasets are often small and imbalanced. The use of synthetic data has been +proposed in the literature, but neither a robust comparison of the different +methods nor generalizability has been reported. Our approach integrates a +retinal image quality assessment model and StyleGAN2 architecture to enhance +Age-related Macular Degeneration (AMD) detection capabilities and improve +generalizability. This work compares ten different Generative Adversarial +Network (GAN) architectures to generate synthetic eye-fundus images with and +without AMD. We combined subsets of three public databases (iChallenge-AMD, +ODIR-2019, and RIADD) to form a single training and test set. We employed the +STARE dataset for external validation, ensuring a comprehensive assessment of +the proposed approach. The results show that StyleGAN2 reached the lowest +Frechet Inception Distance (166.17), and clinicians could not accurately +differentiate between real and synthetic images. ResNet-18 architecture +obtained the best performance with 85% accuracy and outperformed the two human +experts (80% and 75%) in detecting AMD fundus images. The accuracy rates were +82.8% for the test set and 81.3% for the STARE dataset, demonstrating the +model's generalizability. The proposed methodology for synthetic medical image +generation has been validated for robustness and accuracy, with free access to +its code for further research and development in this field. + +
+
+ comment: Accepted by the Biomedical Signal Processing and Control +
+
+
+
+
+ + ♻ ☆ Towards Fine-grained Large Object Segmentation 1st Place Solution to 3D + AI Challenge 2020 -- Instance Segmentation Track + + +
+ This technical report introduces our solutions of Team 'FineGrainedSeg' for +Instance Segmentation track in 3D AI Challenge 2020. In order to handle +extremely large objects in 3D-FUTURE, we adopt PointRend as our basic +framework, which outputs more fine-grained masks compared to HTC and SOLOv2. +Our final submission is an ensemble of 5 PointRend models, which achieves the +1st place on both validation and test leaderboards. The code is available at +https://github.com/zehuichen123/3DFuture_ins_seg. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ Deep Learning in Cardiology + + +
+ The medical field is creating large amounts of data that physicians are unable +to decipher and use efficiently. Moreover, rule-based expert systems are +inefficient in solving complicated medical tasks or for creating insights using +big data. Deep learning has emerged as a more accurate and effective technology +in a wide range of medical problems such as diagnosis, prediction and +intervention. Deep learning is a representation learning method that consists +of layers that transform the data non-linearly, thus revealing hierarchical +relationships and structures. In this review, we survey deep learning +application papers that use structured data, signal and imaging modalities from +cardiology. We discuss the advantages and limitations of applying deep learning +in cardiology that also apply in medicine in general, while proposing certain +directions as the most viable for clinical use. + +
+
+ comment: 27 pages, 2 figures, 10 tables +
+
+
+
+
+ + ♻ ☆ Synthesis of Annotated Colorectal Cancer Tissue Images from Gland Layout + + +
+ Generating realistic tissue images with annotations is a challenging task +that is important in many computational histopathology applications. +Synthetically generated images and annotations are valuable for training and +evaluating algorithms in this domain. To address this, we propose an +interactive framework generating pairs of realistic colorectal cancer histology +images with corresponding glandular masks from glandular structure layouts. The +framework accurately captures vital features like stroma, goblet cells, and +glandular lumen. Users can control gland appearance by adjusting parameters +such as the number of glands, their locations, and sizes. The generated images +exhibit good Frechet Inception Distance (FID) scores compared to the +state-of-the-art image-to-image translation model. Additionally, we demonstrate +the utility of our synthetic annotations for evaluating gland segmentation +algorithms. Furthermore, we present a methodology for constructing glandular +masks using advanced deep generative models, such as latent diffusion models. +These masks enable tissue image generation through a residual encoder-decoder +network. + +
+
+
+
+
+ + ♻ ☆ Spacetime Gaussian Feature Splatting for Real-Time Dynamic View + Synthesis CVPR 2024 + + +
+ Novel view synthesis of dynamic scenes has been an intriguing yet challenging +problem. Despite recent advancements, simultaneously achieving high-resolution +photorealistic results, real-time rendering, and compact storage remains a +formidable task. To address these challenges, we propose Spacetime Gaussian +Feature Splatting as a novel dynamic scene representation, composed of three +pivotal components. First, we formulate expressive Spacetime Gaussians by +enhancing 3D Gaussians with temporal opacity and parametric motion/rotation. +This enables Spacetime Gaussians to capture static, dynamic, as well as +transient content within a scene. Second, we introduce splatted feature +rendering, which replaces spherical harmonics with neural features. These +features facilitate the modeling of view- and time-dependent appearance while +maintaining small size. Third, we leverage the guidance of training error and +coarse depth to sample new Gaussians in areas that are challenging to converge +with existing pipelines. Experiments on several established real-world datasets +demonstrate that our method achieves state-of-the-art rendering quality and +speed, while retaining compact storage. At 8K resolution, our lite-version +model can render at 60 FPS on an Nvidia RTX 4090 GPU. Our code is available at +https://github.com/oppo-us-research/SpacetimeGaussians. + +
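A toy illustration of the "temporal opacity" ingredient mentioned above: a 1D Gaussian window over time modulates a primitive's base opacity so it can fade in and out of the scene. The values and the exact parameterization are assumptions, not the paper's formulation.

```python
# Toy illustration of temporal opacity: a Gaussian window over time scales a
# 3D Gaussian's base opacity so it can appear and disappear. Values are made up.
import numpy as np

def temporal_opacity(t, base_opacity, t_center, t_scale):
    return base_opacity * np.exp(-0.5 * ((t - t_center) / t_scale) ** 2)

t = np.linspace(0.0, 1.0, 11)
print(np.round(temporal_opacity(t, base_opacity=0.9, t_center=0.5, t_scale=0.1), 3))
```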
+
+ comment: Accepted to CVPR 2024. Project page: + https://oppo-us-research.github.io/SpacetimeGaussians-website/ +
+
+
+
+
+ + ♻ ☆ InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image + Generation + + +
+ Tuning-free diffusion-based models have demonstrated significant potential in +the realm of image personalization and customization. However, despite this +notable progress, current models continue to grapple with several complex +challenges in producing style-consistent image generation. Firstly, the concept +of style is inherently underdetermined, encompassing a multitude of elements +such as color, material, atmosphere, design, and structure, among others. +Secondly, inversion-based methods are prone to style degradation, often +resulting in the loss of fine-grained details. Lastly, adapter-based approaches +frequently require meticulous weight tuning for each reference image to achieve +a balance between style intensity and text controllability. In this paper, we +commence by examining several compelling yet frequently overlooked +observations. We then proceed to introduce InstantStyle, a framework designed +to address these issues through the implementation of two key strategies: 1) A +straightforward mechanism that decouples style and content from reference +images within the feature space, predicated on the assumption that features +within the same space can be either added to or subtracted from one another. 2) +The injection of reference image features exclusively into style-specific +blocks, thereby preventing style leaks and eschewing the need for cumbersome +weight tuning, which often characterizes more parameter-heavy designs. Our work +demonstrates superior visual stylization outcomes, striking an optimal balance +between the intensity of style and the controllability of textual elements. Our +codes will be available at https://github.com/InstantStyle/InstantStyle. + +
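Strategy 1) assumes features in a shared space can be added or subtracted to separate style from content. The sketch below shows one simple way to realize that assumption by removing the content direction from a style reference embedding; the embeddings are random stand-ins and this is not the InstantStyle implementation.

```python
# Hedged sketch of "subtract content from the style reference" in a shared
# feature space. Embeddings are random stand-ins for CLIP-like features.
import torch
import torch.nn.functional as F

dim = 768
style_image_emb = F.normalize(torch.randn(dim), dim=0)   # reference image feature
content_text_emb = F.normalize(torch.randn(dim), dim=0)  # embedding of the content prompt

# Remove the content component from the style reference feature.
content_part = (style_image_emb @ content_text_emb) * content_text_emb
style_only_emb = F.normalize(style_image_emb - content_part, dim=0)
print(float(style_only_emb @ content_text_emb))          # ~0: content direction removed
```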
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ DisCo: Disentangled Control for Realistic Human Dance Generation CVPR24 + + +
+ Generative AI has made significant strides in computer vision, particularly +in text-driven image/video synthesis (T2I/T2V). Despite the notable +advancements, human-centric content synthesis, such as realistic dance generation, remains challenging. Current methodologies, primarily tailored for human +motion transfer, encounter difficulties when confronted with real-world dance +scenarios (e.g., social media dance), which require generalizing across a wide +spectrum of poses and intricate human details. In this paper, we depart from +the traditional paradigm of human motion transfer and emphasize two additional +critical attributes for the synthesis of human dance content in social media +contexts: (i) Generalizability: the model should be able to generalize beyond +generic human viewpoints as well as unseen human subjects, backgrounds, and +poses; (ii) Compositionality: it should allow for the seamless composition of +seen/unseen subjects, backgrounds, and poses from different sources. To address +these challenges, we introduce DISCO, which includes a novel model architecture +with disentangled control to improve the compositionality of dance synthesis, +and an effective human attribute pre-training for better generalizability to +unseen humans. Extensive qualitative and quantitative results demonstrate that +DisCo can generate high-quality human dance images and videos with diverse +appearances and flexible motions. Code is available at +https://disco-dance.github.io/. + +
+
+ comment: Accepted by CVPR24 +
+
+
+
+
+ + ♻ ☆ pixelSplat: 3D Gaussian Splats from Image Pairs for Scalable + Generalizable 3D Reconstruction + + +
+ We introduce pixelSplat, a feed-forward model that learns to reconstruct 3D +radiance fields parameterized by 3D Gaussian primitives from pairs of images. +Our model features real-time and memory-efficient rendering for scalable +training as well as fast 3D reconstruction at inference time. To overcome local +minima inherent to sparse and locally supported representations, we predict a +dense probability distribution over 3D and sample Gaussian means from that +probability distribution. We make this sampling operation differentiable via a +reparameterization trick, allowing us to back-propagate gradients through the +Gaussian splatting representation. We benchmark our method on wide-baseline +novel view synthesis on the real-world RealEstate10k and ACID datasets, where +we outperform state-of-the-art light field transformers and accelerate +rendering by 2.5 orders of magnitude while reconstructing an interpretable and +editable 3D radiance field. + +
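The abstract's key trick is sampling Gaussian means from a predicted distribution while keeping the operation differentiable. The sketch below shows a generic version of that idea using a Gumbel-softmax reparameterization over depth bins along a ray; the bin count, temperature, and the choice of Gumbel-softmax are assumptions, not necessarily pixelSplat's exact scheme.

```python
# Generic sketch of differentiable sampling of a Gaussian mean along a camera
# ray: a categorical distribution over depth bins is sampled with the
# Gumbel-softmax reparameterization so gradients reach the predicted logits.
# Bin count, temperature, and shapes are illustrative assumptions.
import torch
import torch.nn.functional as F

num_bins = 64
depth_logits = torch.randn(num_bins, requires_grad=True)        # predicted per-ray logits
bin_depths = torch.linspace(0.5, 10.0, num_bins)                # candidate depths (meters)

soft_onehot = F.gumbel_softmax(depth_logits, tau=0.5, hard=True)  # differentiable sample
sampled_depth = (soft_onehot * bin_depths).sum()

ray_origin = torch.zeros(3)
ray_dir = torch.tensor([0.0, 0.0, 1.0])
gaussian_mean = ray_origin + sampled_depth * ray_dir             # 3D Gaussian mean

gaussian_mean.sum().backward()
print(sampled_depth.item(), depth_logits.grad is not None)       # gradient flows to logits
```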
+
+ comment: Project page: https://dcharatan.github.io/pixelsplat +
+
+
+
+
+ + ♻ ☆ ArtGPT-4: Towards Artistic-understanding Large Vision-Language Models + with Enhanced Adapter + + +
+ The success of large language models (LLMs) has inspired an emerging research +field of multimodal learning. However, a grand challenge of exploiting LLMs for +multimodal learning is the size of pre-trained LLMs, which typically have +billions of parameters. To tackle this challenge, models such as MiniGPT-4 and +LLaVA have been developed to fine-tune the pre-trained models using fewer +parameters. Despite their promising performance, these models remain limited in +their understanding of artistic imagery. To facilitate better +artistic understanding, in this paper we propose ArtGPT-4, a pioneering large +vision-language model tailored to address the limitations of existing models in +artistic comprehension. The key innovation of ArtGPT-4 lies in its design for +the sophisticated challenge of artistic image comprehension, setting it apart +from other models that overlook fine details for broader themes. Specifically, +it works by integrating specialized adapter layers into the LLM, enabling +the model to more efficiently and effectively parse and interpret complex +visual tokens, instead of fine-tuning the whole LLM as in existing methods. +ArtGPT-4 demonstrates outstanding efficiency: +utilizing a Tesla A100 device, its training can be completed in a mere 2 hours +with an image-text pair dataset comprising approximately 0.52M entries. +Additionally, ArtGPT-4 has also achieved state-of-the-art performance on the +ArtEmis and ArtEmis-v2.0 datasets as well as the benchmarks established in this +work, lagging behind professional artists' descriptions by a negligible 0.15 +points on a 6-point scale. The outstanding performance of ArtGPT-4 shows that +it can describe images with an artistic understanding and convey the emotions +they inspire, mirroring human interpretation. The code and the pre-trained +model are accessible at \url{https://github.com/DLYuanGod/ArtGPT-4}. + +
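For context on the adapter idea mentioned above, the snippet below is a standard bottleneck adapter block of the kind inserted into a frozen LLM so that only a small number of parameters are trained. Sizes are illustrative and this is not ArtGPT-4's specific adapter design.

```python
# Standard bottleneck adapter block of the kind inserted into a frozen LLM so
# only a small number of parameters are trained. Sizes are illustrative; this
# is not ArtGPT-4's exact adapter design.
import torch
import torch.nn as nn


class Adapter(nn.Module):
    def __init__(self, dim=4096, bottleneck=64):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.up = nn.Linear(bottleneck, dim)
        self.act = nn.GELU()

    def forward(self, hidden_states):
        # Residual connection keeps the frozen model's behavior as the default.
        return hidden_states + self.up(self.act(self.down(hidden_states)))


x = torch.randn(1, 32, 4096)                 # (batch, sequence, hidden)
print(Adapter()(x).shape)
```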
+
+
+
+
+ + ♻ ☆ TinyGPT-V: Efficient Multimodal Large Language Model via Small Backbones + + +
+ In recent years, multimodal large language models (MLLMs) such as GPT-4V have +demonstrated remarkable advancements, excelling in a variety of vision-language +tasks. Despite their prowess, the closed-source nature and computational +demands of such models limit their accessibility and applicability. This study +introduces TinyGPT-V, a novel open-source MLLM, designed for efficient training +and inference across various vision-language tasks, including image captioning +(IC) and visual question answering (VQA). Leveraging a compact yet powerful +architecture, TinyGPT-V integrates the Phi-2 language model with pre-trained +vision encoders, utilizing a unique mapping module for visual and linguistic +information fusion. With a training regimen optimized for small backbones and +employing a diverse dataset amalgam, TinyGPT-V requires significantly lower +computational resources (24GB of GPU memory for training and as little as 8GB for inference) +without compromising on performance. Our experiments demonstrate that +TinyGPT-V, with its 2.8-billion-parameter language model, achieves comparable +results in VQA and image inference tasks to its larger counterparts while being +uniquely suited for deployment on resource-constrained devices through +innovative quantization techniques. This work not only paves the way for more +accessible and efficient MLLMs but also underscores the potential of smaller, +optimized models in bridging the gap between high performance and computational +efficiency in real-world applications. Additionally, this paper introduces a +new approach to multimodal large language models using smaller backbones. Our +code and training weights are available at +\url{https://github.com/DLYuanGod/TinyGPT-V}. + +
+
+
+
+
+ + ♻ ☆ Few-shot point cloud reconstruction and denoising via learned Gaussian + splats renderings and fine-tuned diffusion features + + +
+ Existing deep learning methods for the reconstruction and denoising of point +clouds rely on small datasets of 3D shapes. We circumvent the problem by +leveraging deep learning methods trained on billions of images. We propose a +method to reconstruct point clouds from a few images and to denoise point clouds +from their rendering by exploiting prior knowledge distilled from image-based +deep learning models. To improve reconstruction in constrained settings, we +regularize the training of a differentiable renderer with hybrid surface and +appearance by introducing semantic consistency supervision. In addition, we +propose a pipeline to finetune Stable Diffusion to denoise renderings of noisy +point clouds and we demonstrate how these learned filters can be used to remove +point cloud noise without 3D supervision. We compare our method with DSS +and PointRadiance and achieve higher-quality 3D reconstruction on the +Sketchfab Testset and SCUT Dataset. + +
+
+
+
+
+ + ♻ ☆ 3D scene generation from scene graphs and self-attention + + +
+ Synthesizing realistic and diverse indoor 3D scene layouts in a controllable +fashion opens up applications in simulated navigation and virtual reality. As +concise and robust representations of a scene, scene graphs have proven to be +well-suited as the semantic control on the generated layout. We present a +variant of the conditional variational autoencoder (cVAE) model to synthesize +3D scenes from scene graphs and floor plans. We exploit the properties of +self-attention layers to capture high-level relationships between objects in a +scene, and use these as the building blocks of our model. Our model leverages +graph transformers to estimate the size, dimension and orientation of the +objects in a room while satisfying relationships in the given scene graph. Our +experiments show that self-attention layers lead to sparser (7.9x compared to +Graphto3D) and more diverse (16%) scenes. + +
+
+
+
+
+ + ♻ ☆ Neural Field Convolutions by Repeated Differentiation + + +
+ Neural fields are evolving towards a general-purpose continuous +representation for visual computing. Yet, despite their numerous appealing +properties, they are hardly amenable to signal processing. As a remedy, we +present a method to perform general continuous convolutions with general +continuous signals such as neural fields. Observing that piecewise polynomial +kernels reduce to a sparse set of Dirac deltas after repeated differentiation, +we leverage convolution identities and train a repeated integral field to +efficiently execute large-scale convolutions. We demonstrate our approach on a +variety of data modalities and spatially-varying kernels. + +
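The identity stated above can be checked in one dimension: a box kernel's derivative is two Dirac deltas, so convolving a signal with the box equals convolving the signal's prefix sum (its "repeated integral") with those deltas. The snippet below verifies this discretely; the array sizes are arbitrary.

```python
# Discrete 1D check of the repeated-differentiation identity: f * box equals
# (prefix sum of f) * (derivative of box), and the box's derivative is two
# deltas. Signal values are arbitrary.
import numpy as np

f = np.random.rand(16)
width = 4
box = np.ones(width)

direct = np.convolve(f, box)[: f.size]        # ordinary convolution with the box

# Derivative of the box kernel: +1 delta at its start, -1 delta just past its end.
delta_kernel = np.zeros(width + 1)
delta_kernel[0], delta_kernel[width] = 1.0, -1.0
via_deltas = np.convolve(np.cumsum(f), delta_kernel)[: f.size]

print(np.allclose(direct, via_deltas))        # True (tail omitted: the integral
                                              # would need padding past the end)
```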
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 153 + +
+
+
+ + ☆ Visual Autoregressive Modeling: Scalable Image Generation via Next-Scale + Prediction + + +
+ We present Visual AutoRegressive modeling (VAR), a new generation paradigm +that redefines autoregressive learning on images as coarse-to-fine +"next-scale prediction" or "next-resolution prediction", diverging from the +standard raster-scan "next-token prediction". This simple, intuitive +methodology allows autoregressive (AR) transformers to learn visual +distributions fast and generalize well: VAR, for the first time, makes AR +models surpass diffusion transformers in image generation. On the ImageNet 256x256 +benchmark, VAR significantly improves the AR baseline, improving the Frechet inception +distance (FID) from 18.65 to 1.80 and the inception score (IS) from 80.4 to 356.4, +with around 20x faster inference speed. It is also empirically verified that +VAR outperforms the Diffusion Transformer (DiT) in multiple dimensions +including image quality, inference speed, data efficiency, and scalability. +Scaling up VAR models exhibits clear power-law scaling laws similar to those +observed in LLMs, with linear correlation coefficients near -0.998 as solid +evidence. VAR further showcases zero-shot generalization ability in downstream +tasks including image in-painting, out-painting, and editing. These results +suggest VAR has initially emulated the two important properties of LLMs: +Scaling Laws and zero-shot task generalization. We have released all models and +codes to promote the exploration of AR/VAR models for visual generation and +unified learning. + +
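A schematic of the "next-scale prediction" loop described above: at each step the model predicts an entire token map at the next resolution, conditioned on the coarser maps. The predictor below is a random stand-in, not the released VAR model, and the scale schedule is an assumption.

```python
# Schematic of coarse-to-fine "next-scale prediction": at each step the model
# predicts the whole token map of the next resolution, conditioned on all
# coarser maps. The predictor here is a random stand-in, not VAR itself.
import torch
import torch.nn.functional as F

scales = [1, 2, 4, 8, 16]                      # token-map side lengths, coarse to fine
dim = 32

def predict_next_scale(coarse_maps, side):
    """Stand-in for the AR transformer: returns a (dim, side, side) token map."""
    context = coarse_maps[-1] if coarse_maps else torch.zeros(dim, 1, 1)
    upsampled = F.interpolate(context.unsqueeze(0), size=(side, side), mode="nearest")
    return upsampled.squeeze(0) + 0.1 * torch.randn(dim, side, side)

maps = []
for side in scales:                            # autoregressive over scales, not tokens
    maps.append(predict_next_scale(maps, side))

print([m.shape for m in maps])                 # finest map is (dim, 16, 16)
```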
+
+
+
+
+ + ☆ ALOHa: A New Measure for Hallucination in Captioning Models NAACL 2024 + + +
+ Despite recent advances in multimodal pre-training for visual description, +state-of-the-art models still produce captions containing errors, such as +hallucinating objects not present in a scene. The existing prominent metric for +object hallucination, CHAIR, is limited to a fixed set of MS COCO objects and +synonyms. In this work, we propose a modernized open-vocabulary metric, ALOHa, +which leverages large language models (LLMs) to measure object hallucinations. +Specifically, we use an LLM to extract groundable objects from a candidate +caption, measure their semantic similarity to reference objects from captions +and object detections, and use Hungarian matching to produce a final +hallucination score. We show that ALOHa correctly identifies 13.6% more +hallucinated objects than CHAIR on HAT, a new gold-standard subset of MS COCO +Captions annotated for hallucinations, and 30.8% more on nocaps, where objects +extend beyond MS COCO categories. Our code is available at +https://davidmchan.github.io/aloha/. + +
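The matching step described above can be sketched directly: candidate objects extracted from the caption are assigned to reference objects by maximizing total semantic similarity with the Hungarian algorithm, and weakly matched candidates are treated as likely hallucinations. The similarity values and threshold below are made up.

```python
# Sketch of the Hungarian-matching step: candidate caption objects are matched
# to reference objects by semantic similarity, and weakly matched candidates
# are flagged as likely hallucinations. Similarities below are made up.
import numpy as np
from scipy.optimize import linear_sum_assignment

candidates = ["dog", "frisbee", "surfboard"]
references = ["dog", "frisbee", "park"]
similarity = np.array([
    [0.95, 0.10, 0.05],     # "dog" vs references
    [0.08, 0.92, 0.02],     # "frisbee"
    [0.05, 0.03, 0.15],     # "surfboard" has no good match -> hallucination
])

rows, cols = linear_sum_assignment(-similarity)     # maximize total similarity
for r, c in zip(rows, cols):
    matched_sim = similarity[r, c]
    flag = "hallucinated?" if matched_sim < 0.5 else "grounded"
    print(f"{candidates[r]} -> {references[c]} ({matched_sim:.2f}, {flag})")
```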
+
+ comment: To appear at NAACL 2024 +
+
+
+
+
+ + ☆ LidarDM: Generative LiDAR Simulation in a Generated World + + +
+ We present LidarDM, a novel LiDAR generative model capable of producing +realistic, layout-aware, physically plausible, and temporally coherent LiDAR +videos. LidarDM stands out with two unprecedented capabilities in LiDAR +generative modeling: (i) LiDAR generation guided by driving scenarios, offering +significant potential for autonomous driving simulations, and (ii) 4D LiDAR +point cloud generation, enabling the creation of realistic and temporally +coherent sequences. At the heart of our model is a novel integrated 4D world +generation framework. Specifically, we employ latent diffusion models to +generate the 3D scene, combine it with dynamic actors to form the underlying 4D +world, and subsequently produce realistic sensory observations within this +virtual environment. Our experiments indicate that our approach outperforms +competing algorithms in realism, temporal coherency, and layout consistency. We +additionally show that LidarDM can be used as a generative world model +simulator for training and testing perception models. + +
+
+
+
+
+ + ☆ DeiT-LT Distillation Strikes Back for Vision Transformer Training on + Long-Tailed Datasets CVPR 2024 + + +
+ Vision Transformer (ViT) has emerged as a prominent architecture for various +computer vision tasks. In ViT, we divide the input image into patch tokens and +process them through a stack of self-attention blocks. However, unlike +Convolutional Neural Networks (CNNs), ViT's simple architecture has no +informative inductive bias (e.g., locality). Due to this, ViT requires a +large amount of data for pre-training. Various data-efficient approaches (DeiT) +have been proposed to train ViT on balanced datasets effectively. However, +limited literature discusses the use of ViT for datasets with long-tailed +imbalances. In this work, we introduce DeiT-LT to tackle the problem of +training ViTs from scratch on long-tailed datasets. In DeiT-LT, we introduce an +efficient and effective way of distillation from a CNN via a distillation DIST +token, using out-of-distribution images and re-weighting the distillation +loss to enhance focus on tail classes. This leads to the learning of local +CNN-like features in early ViT blocks, improving generalization for tail +classes. Further, to mitigate overfitting, we propose distilling from a flat +CNN teacher, which leads to learning low-rank generalizable features for DIST +tokens across all ViT blocks. With the proposed DeiT-LT scheme, the +distillation DIST token becomes an expert on the tail classes, and the +classifier CLS token becomes an expert on the head classes. The experts help to +effectively learn features corresponding to both the majority and minority +classes using a distinct set of tokens within the same ViT architecture. We +show the effectiveness of DeiT-LT for training ViT from scratch on datasets +ranging from small-scale CIFAR-10 LT to large-scale iNaturalist-2018. + +
+
+ comment: CVPR 2024. Project Page: https://rangwani-harsh.github.io/DeiT-LT +
+
+
+
+
+ + ☆ MatAtlas: Text-driven Consistent Geometry Texturing and Material + Assignment + + +
+ We present MatAtlas, a method for consistent text-guided 3D model texturing. +Following recent progress, we leverage a large-scale text-to-image generation +model (e.g., Stable Diffusion) as a prior to texture a 3D model. We carefully +design an RGB texturing pipeline that leverages a grid pattern diffusion, +driven by depth and edges. By proposing a multi-step texture refinement +process, we significantly improve the quality and 3D consistency of the +texturing output. To further address the problem of baked-in lighting, we move +beyond RGB colors and pursue assigning parametric materials to the assets. +Given the high-quality initial RGB texture, we propose a novel material +retrieval method capitalizing on Large Language Models (LLMs), enabling +editability and relightability. We evaluate our method on a wide variety of +geometries and show that our method significantly outperforms prior art. We +also analyze the role of each component through a detailed ablation study. + +
+
+
+
+
+ + ☆ Deep Image Composition Meets Image Forgery + + +
+ Image forgery is a topic that has been studied for many years. Before the +breakthrough of deep learning, forged images were detected using handcrafted +features that did not require training. These traditional methods failed to +perform satisfactorily even on datasets much worse in quality than real-life +image manipulations. Advances in deep learning have impacted image forgery +detection as much as they have impacted other areas of computer vision and have +improved the state of the art. Deep learning models require large amounts of +labeled data for training. In the case of image forgery, labeled data at the +pixel level is a very important factor for the models to learn. None of the +existing datasets have sufficient size, realism and pixel-level labeling at the +same time. This is due to the high cost of producing and labeling quality +images. It can take hours for an image editing expert to manipulate just one +image. To bridge this gap, we automate data generation using image composition +techniques that are closely related to image forgery. Unlike other automated data +generation frameworks, we use state-of-the-art image composition deep learning +models to generate spliced images close to the quality of real-life +manipulations. Finally, we test the generated dataset on the SOTA image +manipulation detection model and show that its prediction performance is lower +compared to existing datasets, i.e., we produce realistic images that are more +difficult to detect. The dataset will be available at +https://github.com/99eren99/DIS25k . + +
+
+
+
+
+ + ☆ Steganographic Passport: An Owner and User Verifiable Credential for + Deep Model IP Protection Without Retraining + + +
+ Ensuring the legal usage of deep models is crucial to promoting trustable, +accountable, and responsible artificial intelligence innovation. Current +passport-based methods that obfuscate model functionality for license-to-use +and ownership verifications suffer from capacity and quality constraints, as +they require retraining the owner model for new users. They are also vulnerable +to advanced Expanded Residual Block ambiguity attacks. We propose +Steganographic Passport, which uses an invertible steganographic network to +decouple license-to-use from ownership verification by hiding the user's +identity images into the owner-side passport and recovering them from their +respective user-side passports. An irreversible and collision-resistant hash +function is used to avoid exposing the owner-side passport from the derived +user-side passports and increase the uniqueness of the model signature. To +safeguard both the passport and model's weights against advanced ambiguity +attacks, an activation-level obfuscation is proposed for the verification +branch of the owner's model. By jointly training the verification and +deployment branches, their weights become tightly coupled. The proposed method +supports agile licensing of deep models by providing a strong ownership proof +and license accountability without requiring a separate model retraining for +the admission of every new user. Experiment results show that our +Steganographic Passport outperforms other passport-based deep model protection +methods in robustness against various known attacks. + +
+
+
+
+
+ + ☆ PoCo: Point Context Cluster for RGBD Indoor Place Recognition + + +
+ We present a novel end-to-end algorithm (PoCo) for the indoor RGB-D place +recognition task, aimed at identifying the most likely match for a given query +frame within a reference database. The task presents inherent challenges +attributed to the constrained field of view and limited range of perception +sensors. We propose a new network architecture, which generalizes the recent +Context of Clusters (CoCs) to extract global descriptors directly from the +noisy point clouds through end-to-end learning. Moreover, we develop the +architecture by integrating both color and geometric modalities into the point +features to enhance the global descriptor representation. We conducted +evaluations on public datasets ScanNet-PR and ARKit with 807 and 5047 +scenarios, respectively. PoCo achieves SOTA performance: on ScanNet-PR, we +achieve R@1 of 64.63%, a 5.7% improvement from the best-published result CGis +(61.12%); on Arkit, we achieve R@1 of 45.12%, a 13.3% improvement from the +best-published result CGis (39.82%). In addition, PoCo shows higher efficiency +than CGis in inference time (1.75X-faster), and we demonstrate the +effectiveness of PoCo in recognizing places within a real-world laboratory +environment. + +
+
+
+
+
+ + ☆ On the Scalability of Diffusion-based Text-to-Image Generation CVPR2024 + + +
+ Scaling up model and data size has been quite successful for the evolution of +LLMs. However, the scaling law for diffusion-based text-to-image (T2I) +models is not fully explored. It is also unclear how to efficiently scale the +model for better performance at reduced cost. The different training settings +and expensive training cost make a fair model comparison extremely difficult. +In this work, we empirically study the scaling properties of diffusion-based +T2I models by performing extensive and rigorous ablations on scaling both +denoising backbones and training set, including training scaled UNet and +Transformer variants ranging from 0.4B to 4B parameters on datasets of up to 600M +images. For model scaling, we find that the location and amount of cross-attention +distinguish the performance of existing UNet designs. Increasing the number of +transformer blocks is more parameter-efficient for improving text-image +alignment than increasing the number of channels. We then identify an efficient UNet +variant, which is 45% smaller and 28% faster than SDXL's UNet. On the data +scaling side, we show that the quality and diversity of the training set matter +more than dataset size alone. Increasing caption density and diversity +improves text-image alignment performance and the learning efficiency. Finally, +we provide scaling functions to predict the text-image alignment performance as +functions of the scale of model size, compute and dataset size. + +
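The scaling functions mentioned above are power laws, and fitting one reduces to a linear fit in log-log space, as the sketch below shows on synthetic numbers (the data points and the extrapolation target are illustrative, not the paper's measurements).

```python
# Fitting a power law y = a * x^b (e.g., an alignment metric vs. model size)
# reduces to a linear fit in log-log space. The data points are synthetic.
import numpy as np

model_params = np.array([0.4e9, 0.9e9, 2.0e9, 4.0e9])      # illustrative sizes
metric = np.array([0.62, 0.68, 0.73, 0.77])                 # illustrative scores

b, log_a = np.polyfit(np.log(model_params), np.log(metric), 1)
a = np.exp(log_a)
print(f"metric ~= {a:.3f} * params^{b:.3f}")
predicted = a * (8.0e9) ** b                                 # extrapolate to 8B params
print(f"predicted at 8B params: {predicted:.3f}")
```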
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ FlightScope: A Deep Comprehensive Assessment of Aircraft Detection + Algorithms in Satellite Imagery + + +
+ Object detection in remotely sensed satellite pictures is fundamental in many +fields such as biophysical and environmental monitoring. While deep learning +algorithms are constantly evolving, they have been mostly implemented and +tested on popular ground-based photos. This paper critically evaluates +and compares a suite of advanced object detection algorithms customized for the +task of identifying aircraft within satellite imagery. Using the large +HRPlanesV2 dataset, together with a rigorous validation with the GDIT dataset, +this research encompasses an array of methodologies including YOLO versions 5 +and 8, Faster RCNN, CenterNet, RetinaNet, RTMDet, and DETR, all trained from +scratch. This exhaustive training and validation study reveals YOLOv5 as the +preeminent model for the specific case of identifying airplanes from remote +sensing data, showcasing high precision and adaptability across diverse imaging +conditions. This research highlights the nuanced performance landscapes of these +algorithms, with YOLOv5 emerging as a robust solution for aerial object +detection, underlining its importance through superior mean average precision, +Recall, and Intersection over Union scores. The findings described here +underscore the fundamental role of algorithm selection aligned with the +specific demands of satellite imagery analysis and extend a comprehensive +framework to evaluate model efficacy. The benchmark toolkit and code, +available via https://github.com/toelt-llc/FlightScope_Bench, aim to foster further +exploration and innovation in the realm of remote sensing object detection, +paving the way for improved analytical methodologies in satellite imagery +applications. + +
+
+ comment: 15 figures, 4 tables, comprehensive survey, comparative study +
+
+
+
+
+ + ☆ Cross-Modal Conditioned Reconstruction for Language-guided Medical Image + Segmentation + + +
+ Recent developments underscore the potential of textual information in +enhancing learning models for a deeper understanding of medical visual +semantics. However, language-guided medical image segmentation still faces a +challenging issue. Previous works employ implicit and ambiguous architectures +to embed textual information. This leads to segmentation results that are +inconsistent with the semantics represented by the language, sometimes even +diverging significantly. To this end, we propose a novel cross-modal +conditioned Reconstruction for Language-guided Medical Image Segmentation +(RecLMIS) to explicitly capture cross-modal interactions, which assumes that +well-aligned medical visual features and medical notes can effectively +reconstruct each other. We introduce conditioned interaction to adaptively +predict patches and words of interest. Subsequently, they are utilized as +conditioning factors for mutual reconstruction to align with regions described +in the medical notes. Extensive experiments demonstrate the superiority of our +RecLMIS, surpassing LViT by 3.74% mIoU on the publicly available MosMedData+ +dataset and achieving an average increase of 1.89% mIoU for cross-domain tests +on our QATA-CoV19 dataset. Simultaneously, we achieve a relative reduction of +20.2% in parameter count and a 55.5% decrease in computational load. The code +will be available at https://github.com/ShashankHuang/RecLMIS. + +
+
+
+
+
+ + ☆ Enhancing Interpretability of Vertebrae Fracture Grading using + Human-interpretable Prototypes + + +
+ Vertebral fracture grading classifies the severity of vertebral fractures, +which is a challenging task in medical imaging and has recently attracted Deep +Learning (DL) models. Only a few works attempted to make such models +human-interpretable despite the need for transparency and trustworthiness in +critical use cases like DL-assisted medical diagnosis. Moreover, such models +either rely on post-hoc methods or additional annotations. In this work, we +propose a novel interpretable-by-design method, ProtoVerse, to find relevant +sub-parts of vertebral fractures (prototypes) that reliably explain the model's +decision in a human-understandable way. Specifically, we introduce a novel +diversity-promoting loss to mitigate prototype repetitions in small datasets +with intricate semantics. We have experimented with the VerSe'19 dataset and +outperformed the existing prototype-based method. Further, our model provides +superior interpretability against the post-hoc method. Importantly, expert +radiologists validated the visual interpretability of our results, showing +clinical applicability. + +
+
+
+
+
+ + ☆ GPU-Accelerated RSF Level Set Evolution for Large-Scale Microvascular + Segmentation + + +
+ Microvascular networks are challenging to model because these structures are +currently near the diffraction limit for most advanced three-dimensional +imaging modalities, including confocal and light sheet microscopy. This makes +semantic segmentation difficult, because individual components of these +networks fluctuate within the confines of individual pixels. Level set methods +are ideally suited to solve this problem by providing surface and topological +constraints on the resulting model, however these active contour techniques are +extremely time intensive and impractical for terabyte-scale images. We propose +a reformulation and implementation of the region-scalable fitting (RSF) level +set model that makes it amenable to three-dimensional evaluation using both +single-instruction multiple data (SIMD) and single-program multiple-data (SPMD) +parallel processing. This enables evaluation of the level set equation on +independent regions of the data set using graphics processing units (GPUs), +making large-scale segmentation of high-resolution networks practical and +inexpensive. + We tested this 3D parallel RSF approach on multiple data sets acquired using +state-of-the-art imaging techniques to acquire microvascular data, including +micro-CT, light sheet fluorescence microscopy (LSFM) and milling microscopy. To +assess the performance and accuracy of the RSF model, we conducted a +Monte-Carlo-based validation technique to compare results to other segmentation +methods. We also provide a rigorous profiling to show the gains in processing +speed leveraging parallel hardware. This study showcases the practical +application of the RSF model, emphasizing its utility in the challenging domain +of segmenting large-scale high-topology network structures with a particular +focus on building microvascular models. + +
+
+
+
+
+ + ☆ MULAN: A Multi Layer Annotated Dataset for Controllable Text-to-Image + Generation CVPR 2024 + + +
+ Text-to-image generation has achieved astonishing results, yet precise +spatial controllability and prompt fidelity remain highly challenging. This +limitation is typically addressed through cumbersome prompt engineering, scene +layout conditioning, or image editing techniques that often require hand-drawn +masks. Nonetheless, pre-existing works struggle to take advantage of the +natural instance-level compositionality of scenes due to the typically flat +nature of rasterized RGB output images. Towards addressing this challenge, we +introduce MuLAn: a novel dataset comprising over 44K MUlti-Layer ANnotations of +RGB images as multilayer, instance-wise RGBA decompositions, and over 100K +instance images. To build MuLAn, we developed a training-free pipeline that +decomposes a monocular RGB image into a stack of RGBA layers comprising +background and isolated instances. We achieve this through the use of +pretrained general-purpose models, and by developing three modules: image +decomposition for instance discovery and extraction, instance completion to +reconstruct occluded areas, and image re-assembly. We use our pipeline to +create MuLAn-COCO and MuLAn-LAION datasets, which contain a variety of image +decompositions in terms of style, composition and complexity. With MuLAn, we +provide the first photorealistic resource providing instance decomposition and +occlusion information for high quality images, opening up new avenues for +text-to-image generative AI research. With this, we aim to encourage the +development of novel generation and editing technology, in particular +layer-wise solutions. MuLAn data resources are available at +https://MuLAn-dataset.github.io/. + +
+
+ comment: CVPR 2024 - Project page: https://MuLAn-dataset.github.io/ +
+
+
+
+
+ + ☆ GenN2N: Generative NeRF2NeRF Translation CVPR 2024 + + +
+ We present GenN2N, a unified NeRF-to-NeRF translation framework for various +NeRF translation tasks such as text-driven NeRF editing, colorization, +super-resolution, inpainting, etc. Unlike previous methods designed for +individual translation tasks with task-specific schemes, GenN2N achieves all +these NeRF editing tasks by employing a plug-and-play image-to-image translator +to perform editing in the 2D domain and lifting 2D edits into the 3D NeRF +space. Since the 3D consistency of 2D edits may not be assured, we propose to +model the distribution of the underlying 3D edits through a generative model +that can cover all possible edited NeRFs. To model the distribution of 3D +edited NeRFs from 2D edited images, we carefully design a VAE-GAN that encodes +images while decoding NeRFs. The latent space is trained to align with a +Gaussian distribution and the NeRFs are supervised through an adversarial loss +on its renderings. To ensure the latent code does not depend on 2D viewpoints +but truly reflects the 3D edits, we also regularize the latent code through a +contrastive learning scheme. Extensive experiments on various editing tasks +show GenN2N, as a universal framework, performs as well or better than +task-specific specialists while possessing flexible generative power. More +results on our project page: https://xiangyueliu.github.io/GenN2N/ + +
+
+ comment: Accepted to CVPR 2024. Project page: + https://xiangyueliu.github.io/GenN2N/ +
+
+
+
+
+ + ☆ Domain Generalization through Meta-Learning: A Survey + + +
+ Deep neural networks (DNNs) have revolutionized artificial intelligence but +often underperform when faced with out-of-distribution (OOD) data, a common +scenario due to the inevitable domain shifts in real-world applications. This +limitation stems from the common assumption that training and testing data +share the same distribution, an assumption frequently violated in practice. +Despite their effectiveness with large amounts of data and computational power, +DNNs struggle with distributional shifts and limited labeled data, leading to +overfitting and poor generalization across various tasks and domains. +Meta-learning presents a promising approach by employing algorithms that +acquire transferable knowledge across various tasks for fast adaptation, +eliminating the need to learn each task from scratch. This survey paper delves +into the realm of meta-learning with a focus on its contribution to domain +generalization. We first clarify the concept of meta-learning for domain +generalization and introduce a novel taxonomy based on the feature extraction +strategy and the classifier learning methodology, offering a granular view of +methodologies. Through an exhaustive review of existing methods and underlying +theories, we map out the fundamentals of the field. Our survey provides +practical insights and an informed discussion on promising research directions, +paving the way for future innovation in meta-learning for domain +generalization. + +
+
+
+
+
+ + ☆ Unsupervised Occupancy Learning from Sparse Point Cloud CVPR 2024 + + +
+ Implicit Neural Representations have gained prominence as a powerful +framework for capturing complex data modalities, encompassing a wide range from +3D shapes to images and audio. Within the realm of 3D shape representation, +Neural Signed Distance Functions (SDF) have demonstrated remarkable potential +in faithfully encoding intricate shape geometry. However, learning SDFs from 3D +point clouds in the absence of ground truth supervision remains a very +challenging task. In this paper, we propose a method to infer occupancy fields +instead of SDFs as they are easier to learn from sparse inputs. We leverage a +margin-based uncertainty measure to differentially sample from the decision +boundary of the occupancy function and supervise the sampled boundary points +using the input point cloud. We further stabilize the optimization process at +the early stages of the training by biasing the occupancy function towards +minimal entropy fields while maximizing its entropy at the input point cloud. +Through extensive experiments and evaluations, we illustrate the efficacy of +our proposed method, highlighting its capacity to improve implicit shape +inference with respect to baselines and the state-of-the-art using synthetic +and real data. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ DIBS: Enhancing Dense Video Captioning with Unlabeled Videos via Pseudo + Boundary Enrichment and Online Refinement CVPR 2024 + + +
+ We present Dive Into the BoundarieS (DIBS), a novel pretraining framework for +dense video captioning (DVC), that elaborates on improving the quality of the +generated event captions and their associated pseudo event boundaries from +unlabeled videos. By leveraging the capabilities of diverse large language +models (LLMs), we generate rich DVC-oriented caption candidates and optimize +the corresponding pseudo boundaries under several meticulously designed +objectives, considering diversity, event-centricity, temporal ordering, and +coherence. Moreover, we further introduce a novel online boundary refinement +strategy that iteratively improves the quality of pseudo boundaries during +training. Comprehensive experiments have been conducted to examine the +effectiveness of the proposed technique components. By leveraging a substantial +amount of unlabeled video data, such as HowTo100M, we achieve a remarkable +advancement on standard DVC datasets like YouCook2 and ActivityNet. We +outperform the previous state-of-the-art Vid2Seq across a majority of metrics, +achieving this with just 0.4% of the unlabeled video data used for pre-training +by Vid2Seq. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Cross-Attention Makes Inference Cumbersome in Text-to-Image Diffusion + Models + + +
+ This study explores the role of cross-attention during inference in +text-conditional diffusion models. We find that cross-attention outputs +converge to a fixed point after a few inference steps. Accordingly, the time +point of convergence naturally divides the entire inference process into two +stages: an initial semantics-planning stage, during which the model relies on +cross-attention to plan text-oriented visual semantics, and a subsequent +fidelity-improving stage, during which the model tries to generate images from +previously planned semantics. Surprisingly, ignoring text conditions in the +fidelity-improving stage not only reduces computational complexity but also +maintains model performance. This yields a simple and training-free method +called TGATE for efficient generation, which caches the cross-attention output +once it converges and keeps it fixed during the remaining inference steps. Our +empirical study on the MS-COCO validation set confirms its effectiveness. The +source code of TGATE is available at https://github.com/HaozheLiu-ST/T-GATE. + +
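The caching mechanism is simple enough to sketch: run cross-attention only during the early semantics-planning steps, then reuse the cached output for the remaining steps. The modules and the gate step below are stand-ins, not the released TGATE code.

```python
# Schematic of the TGATE idea: compute cross-attention during the early
# semantics-planning steps, then cache its output and reuse it for the
# remaining fidelity-improving steps. All modules here are stand-ins.
import torch

def cross_attention(latent, text_emb):
    # Stand-in for a UNet cross-attention block.
    return latent + 0.01 * text_emb.mean()

def denoise_step(latent, context):
    # Stand-in for one diffusion denoising step using the (cached) context.
    return latent - 0.05 * context

num_steps, gate_step = 25, 10                 # gate_step: assumed convergence point
latent = torch.randn(4, 8, 8)
text_emb = torch.randn(77, 768)

cached = None
for t in range(num_steps):
    if t < gate_step:
        context = cross_attention(latent, text_emb)   # planning stage: recompute
        cached = context
    else:
        context = cached                               # fidelity stage: reuse cache
    latent = denoise_step(latent, context)

print(latent.shape)
```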
+
+
+
+
+ + ☆ LiDAR4D: Dynamic Neural Fields for Novel Space-time View LiDAR Synthesis CVPR 2024 + + +
+ Although neural radiance fields (NeRFs) have achieved triumphs in image novel +view synthesis (NVS), LiDAR NVS remains largely unexplored. Previous LiDAR NVS +methods employ a simple shift from image NVS methods while ignoring the dynamic +nature and the large-scale reconstruction problem of LiDAR point clouds. In +light of this, we propose LiDAR4D, a differentiable LiDAR-only framework for +novel space-time LiDAR view synthesis. In consideration of the sparsity and +large-scale characteristics, we design a 4D hybrid representation combined with +multi-planar and grid features to achieve effective reconstruction in a +coarse-to-fine manner. Furthermore, we introduce geometric constraints derived +from point clouds to improve temporal consistency. For the realistic synthesis +of LiDAR point clouds, we incorporate the global optimization of ray-drop +probability to preserve cross-region patterns. Extensive experiments on +KITTI-360 and NuScenes datasets demonstrate the superiority of our method in +accomplishing geometry-aware and time-consistent dynamic reconstruction. Codes +are available at https://github.com/ispc-lab/LiDAR4D. + +
+
+ comment: Accepted by CVPR 2024. Project Page: + https://dyfcalid.github.io/LiDAR4D +
+
+
+
+
+ + ☆ Adaptive Affinity-Based Generalization For MRI Imaging Segmentation + Across Resource-Limited Settings + + +
+ The joint utilization of diverse data sources for medical imaging +segmentation has emerged as a crucial area of research, aiming to address +challenges such as data heterogeneity, domain shift, and data quality +discrepancies. Integrating information from multiple data domains has shown +promise in improving model generalizability and adaptability. However, this +approach often demands substantial computational resources, hindering its +practicality. In response, knowledge distillation (KD) has garnered attention +as a solution. KD involves training light-weight models to emulate the behavior +of more resource-intensive models, thereby mitigating the computational burden +while maintaining performance. This paper addresses the pressing need to +develop a lightweight and generalizable model for medical imaging segmentation +that can effectively handle data integration challenges. Our proposed approach +introduces a novel relation-based knowledge framework by seamlessly combining +adaptive affinity-based and kernel-based distillation through a gram matrix +that can capture the style representation across features. This methodology +empowers the student model to accurately replicate the feature representations +of the teacher model, facilitating robust performance even in the face of +domain shift and data heterogeneity. To validate our innovative approach, we +conducted experiments on publicly available multi-source prostate MRI data. The +results demonstrate a significant enhancement in segmentation performance using +lightweight networks. Notably, our method achieves this improvement while +reducing both inference time and storage usage, rendering it a practical and +efficient solution for real-time medical imaging segmentation. + +
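As background for the Gram-matrix idea mentioned above, the snippet below shows a generic Gram-matrix feature-distillation loss that pushes a student's channel-wise feature correlations toward the teacher's. Shapes are assumed, and the paper's full affinity/kernel combination is not reproduced.

```python
# Generic Gram-matrix distillation loss: the student is pushed to match the
# channel-wise feature correlations ("style") of the teacher. Shapes are
# assumptions; this is not the paper's full affinity/kernel combination.
import torch
import torch.nn.functional as F

def gram_matrix(feat):                       # feat: (B, C, H, W)
    b, c, h, w = feat.shape
    flat = feat.reshape(b, c, h * w)
    return flat @ flat.transpose(1, 2) / (c * h * w)

teacher_feat = torch.randn(2, 64, 32, 32)
student_feat = torch.randn(2, 64, 32, 32, requires_grad=True)

loss = F.mse_loss(gram_matrix(student_feat), gram_matrix(teacher_feat))
loss.backward()
print(loss.item(), student_feat.grad.shape)
```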
+
+
+
+
+ + ☆ InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image + Generation + + +
+ Tuning-free diffusion-based models have demonstrated significant potential in +the realm of image personalization and customization. However, despite this +notable progress, current models continue to grapple with several complex +challenges in producing style-consistent image generation. Firstly, the concept +of style is inherently underdetermined, encompassing a multitude of elements +such as color, material, atmosphere, design, and structure, among others. +Secondly, inversion-based methods are prone to style degradation, often +resulting in the loss of fine-grained details. Lastly, adapter-based approaches +frequently require meticulous weight tuning for each reference image to achieve +a balance between style intensity and text controllability. In this paper, we +commence by examining several compelling yet frequently overlooked +observations. We then proceed to introduce InstantStyle, a framework designed +to address these issues through the implementation of two key strategies: 1) A +straightforward mechanism that decouples style and content from reference +images within the feature space, predicated on the assumption that features +within the same space can be either added to or subtracted from one another. 2) +The injection of reference image features exclusively into style-specific +blocks, thereby preventing style leaks and eschewing the need for cumbersome +weight tuning, which often characterizes more parameter-heavy designs. Our work +demonstrates superior visual stylization outcomes, striking an optimal balance +between the intensity of style and the controllability of textual elements. Our +codes will be available at https://github.com/InstantStyle/InstantStyle. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Event Camera Demosaicing via Swin Transformer and Pixel-focus Loss CVPR 2024 + + +
+ Recent research has highlighted improvements in high-quality imaging guided +by event cameras, with most of these efforts concentrating on the RGB domain. +However, these advancements frequently neglect the unique challenges introduced +by the inherent flaws in the sensor design of event cameras in the RAW domain. +Specifically, this sensor design results in the partial loss of pixel values, +posing new challenges for RAW domain processes like demosaicing. The challenge +intensifies as most research in the RAW domain is based on the premise that +each pixel contains a value, making the straightforward adaptation of these +methods to event camera demosaicing problematic. To this end, we present a +Swin-Transformer-based backbone and a pixel-focus loss function for demosaicing +with missing pixel values in RAW domain processing. Our core motivation is to +refine a general and widely applicable foundational model from the RGB domain +for RAW domain processing, thereby broadening the model's applicability within +the entire imaging process. Our method harnesses multi-scale processing and +space-to-depth techniques to ensure efficiency and reduce computing complexity. +We also propose the Pixel-focus Loss function for network fine-tuning to +improve network convergence based on our discovery of a long-tailed +distribution in the training loss. Our method has undergone validation on the MIPI +Demosaic Challenge dataset, with subsequent analytical experimentation +confirming its efficacy. All code and trained models are released here: +https://github.com/yunfanLu/ev-demosaic
+
+ comment: Accepted for the CVPR 2024 Workshop on Mobile Intelligent Photography + & Imaging +
+
+
+
+
+ + ☆ Harnessing the Power of Large Vision Language Models for Synthetic Image + Detection + + +
+ In recent years, the emergence of models capable of generating images from +text has attracted considerable interest, offering the possibility of creating +realistic images from text descriptions. Yet these advances have also raised +concerns about the potential misuse of these images, including the creation of +misleading content such as fake news and propaganda. This study investigates +the effectiveness of using advanced vision-language models (VLMs) for synthetic +image identification. Specifically, the focus is on tuning state-of-the-art +image captioning models for synthetic image detection. By harnessing the robust +understanding capabilities of large VLMs, the aim is to distinguish authentic +images from synthetic images produced by diffusion-based models. This study +contributes to the advancement of synthetic image detection by exploiting the +capabilities of visual language models such as BLIP-2 and ViTGPT2. By tailoring +image captioning models, we address the challenges associated with the +potential misuse of synthetic images in real-world applications. Results +described in this paper highlight the promising role of VLMs in the field of +synthetic image detection, outperforming conventional image-based detection +techniques. Code and models can be found at +https://github.com/Mamadou-Keita/VLM-DETECT. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2404.01959 +
+
+
+
+
+ + ☆ Model-agnostic Origin Attribution of Generated Images with Few-shot + Examples + + +
+ Recent progress in visual generative models enables the generation of +high-quality images. To prevent the misuse of generated images, it is important +to identify the origin model that generates them. In this work, we study the +origin attribution of generated images in a practical setting where only a few +images generated by a source model are available and the source model cannot be +accessed. The goal is to check if a given image is generated by the source +model. We first formulate this problem as a few-shot one-class classification +task. To solve the task, we propose OCC-CLIP, a CLIP-based framework for +few-shot one-class classification, enabling the identification of an image's +source model, even among multiple candidates. Extensive experiments +corresponding to various generative models verify the effectiveness of our +OCC-CLIP framework. Furthermore, an experiment based on the recently released +DALL-E 3 API verifies the real-world applicability of our solution. + +
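A generic sketch of the few-shot one-class formulation: build a prototype from a handful of embeddings of images generated by the source model and accept a query if its similarity exceeds a threshold. The embeddings are random stand-ins for CLIP features, and this is not the OCC-CLIP training procedure.

```python
# Generic few-shot one-class classification sketch: build a prototype from a
# few embeddings of images generated by the source model and accept a query if
# its cosine similarity exceeds a threshold. Embeddings are random stand-ins
# for CLIP features; this is not the OCC-CLIP training procedure.
import torch
import torch.nn.functional as F

few_shot_embeds = F.normalize(torch.randn(8, 512), dim=-1)   # images from source model
prototype = F.normalize(few_shot_embeds.mean(dim=0), dim=-1)

def from_source_model(query_embed, threshold=0.25):
    query_embed = F.normalize(query_embed, dim=-1)
    return float(query_embed @ prototype) >= threshold

print(from_source_model(torch.randn(512)))
```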
+
+
+
+
+ + ☆ Design2Cloth: 3D Cloth Generation from 2D Masks CVPR 2024 + + +
+ In recent years, there has been a significant shift in the field of digital +avatar research, towards modeling, animating and reconstructing clothed human +representations, as a key step towards creating realistic avatars. However, +current 3D cloth generation methods are garment specific or trained completely +on synthetic data, hence lacking fine details and realism. In this work, we +make a step towards automatic realistic garment design and propose +Design2Cloth, a high fidelity 3D generative model trained on a real world +dataset from more than 2000 subject scans. To provide vital contribution to the +fashion industry, we developed a user-friendly adversarial model capable of +generating diverse and detailed clothes simply by drawing a 2D cloth mask. +Under a series of both qualitative and quantitative experiments, we showcase +that Design2Cloth outperforms current state-of-the-art cloth generative models +by a large margin. In addition to the generative properties of our network, we +showcase that the proposed method can be used to achieve high quality +reconstructions from single in-the-wild images and 3D scans. Dataset, code and +pre-trained model will become publicly available. + +
+
+ comment: Accepted to CVPR 2024, Project page: + https://jiali-zheng.github.io/Design2Cloth/ +
+
+
+
+
+ + ☆ Independently Keypoint Learning for Small Object Semantic Correspondence + + +
+ Semantic correspondence remains a challenging task for establishing +correspondences between a pair of images with the same category or similar +scenes, due to large intra-class appearance variations. In this paper, we introduce a +novel problem called 'Small Object Semantic Correspondence (SOSC).' This +problem is challenging due to the close proximity of keypoints associated with +small objects, which results in the fusion of their respective features. Once +fused, the corresponding keypoints are difficult to identify and to recognize +correctly. To address this challenge, we propose +the Keypoint Bounding box-centered Cropping (KBC) method, which aims to +increase the spatial separation between keypoints of small objects, thereby +facilitating independent learning of these keypoints. The KBC method is +seamlessly integrated into our proposed inference pipeline and can be easily +incorporated into other methodologies, resulting in significant performance +enhancements. Additionally, we introduce a novel framework, named KBCNet, which +serves as our baseline model. KBCNet comprises a Cross-Scale Feature Alignment +(CSFA) module and an efficient 4D convolutional decoder. The CSFA module is +designed to align multi-scale features, enriching keypoint representations by +integrating fine-grained features and deep semantic features. Meanwhile, the 4D +convolutional decoder, based on efficient 4D convolution, ensures efficiency +and rapid convergence. To empirically validate the effectiveness of our +proposed methodology, extensive experiments are conducted on three widely used +benchmarks: PF-PASCAL, PF-WILLOW, and SPair-71k. Our KBC method demonstrates a +substantial performance improvement of 7.5\% on the SPair-71K dataset, +providing compelling evidence of its efficacy. + +
+
+
+
+
+ + ☆ RS-Mamba for Large Remote Sensing Image Dense Prediction + + +
+ The spatial resolution of remote sensing images is becoming increasingly +higher, posing challenges in handling large very-high-resolution (VHR) remote +sensing images for dense prediction tasks. Models based on convolutional neural +networks are limited in their ability to model global features of remote +sensing images due to local convolution operations. Transformer based models, +despite their global modeling capabilities, face computational challenges with +large VHR images due to their quadratic complexity. The common practice of +cropping large images into smaller patches leads to a significant loss of +contextual information. To address these issues, we propose the Remote Sensing +Mamba (RSM) for dense prediction tasks in VHR remote sensing. RSM is designed +to model global features of remote sensing images with linear complexity, +enabling it to process large VHR images effectively. It employs an +omnidirectional selective scan module to globally model the images in multiple +directions, capturing large spatial features from various directions. +Experiments on semantic segmentation and change detection tasks across various +objects demonstrate the effectiveness of RSM. With simple model architecture +and training approach, RSM achieves state-of-the-art performance on the dense +prediction tasks of VHR remote sensing. The code for this work will be +available at https://github.com/walking-shadow/Official_Remote_Sensing_Mamba. + +
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ☆ A Satellite Band Selection Framework for Amazon Forest Deforestation + Detection Task GECCO 2024 + + +
+ The conservation of tropical forests is a topic of significant social and +ecological relevance due to their crucial role in the global ecosystem. +Unfortunately, deforestation and degradation impact millions of hectares +annually, necessitating government or private initiatives for effective forest +monitoring. This study introduces a novel framework that employs the Univariate +Marginal Distribution Algorithm (UMDA) to select spectral bands from Landsat-8 +satellite, optimizing the representation of deforested areas. This selection +guides a semantic segmentation architecture, DeepLabv3+, enhancing its +performance. Experimental results revealed several band compositions that +achieved superior balanced accuracy compared to commonly adopted combinations +for deforestation detection, utilizing segment classification via a Support +Vector Machine (SVM). Moreover, the optimal band compositions identified by the +UMDA-based approach improved the performance of the DeepLabv3+ architecture, +surpassing state-of-the-art approaches compared in this study. The observation +that a few selected bands outperform the total contradicts the data-driven +paradigm prevalent in the deep learning field. Therefore, this suggests an +exception to the conventional wisdom that 'more is always better'. + +
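+ For readers unfamiliar with UMDA, the toy loop below sketches how univariate marginals over a binary band-selection mask are re-estimated from elite individuals each generation. The fitness function is a hypothetical placeholder; the study instead scores compositions with an SVM/DeepLabv3+ pipeline on balanced accuracy.
+
+ import numpy as np
+
+ N_BANDS = 11                         # Landsat-8 spectral bands
+ POP, ELITE, GENS = 40, 10, 30
+ rng = np.random.default_rng(0)
+
+ def fitness(mask):
+     # Placeholder score; replace with the balanced accuracy of a model
+     # trained on the selected band composition.
+     return rng.random() + 0.05 * (N_BANDS - mask.sum())
+
+ p = np.full(N_BANDS, 0.5)            # marginal selection probabilities
+ for _ in range(GENS):
+     pop = (rng.random((POP, N_BANDS)) < p).astype(int)
+     pop[pop.sum(axis=1) == 0, 0] = 1  # avoid empty band sets
+     scores = np.array([fitness(ind) for ind in pop])
+     elite = pop[np.argsort(scores)[-ELITE:]]
+     p = elite.mean(axis=0).clip(0.05, 0.95)  # re-estimate marginals from elites
+
+ print("selected band mask:", (p > 0.5).astype(int))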
+
+ comment: 9 pages, 4 figures, paper accepted for presentation at GECCO 2024 +
+
+
+
+
+ + ☆ Non-negative Subspace Feature Representation for Few-shot Learning in + Medical Imaging + + +
+ Unlike typical visual scene recognition domains, in which massive datasets +are accessible to deep neural networks, medical image interpretations are often +obstructed by the paucity of data. In this paper, we investigate the +effectiveness of data-based few-shot learning in medical imaging by exploring +different data attribute representations in a low-dimensional space. We +introduce different types of non-negative matrix factorization (NMF) in +few-shot learning, addressing the data scarcity issue in medical image +classification. Extensive empirical studies are conducted in terms of +validating the effectiveness of NMF, especially its supervised variants (e.g., +discriminative NMF, and supervised and constrained NMF with sparseness), and +the comparison with principal component analysis (PCA), i.e., the collaborative +representation-based dimensionality reduction technique derived from +eigenvectors. With 14 different datasets covering 11 distinct illness +categories, thorough experimental results and comparison with related +techniques demonstrate that NMF is a competitive alternative to PCA for +few-shot learning in medical imaging, and the supervised NMF algorithms are +more discriminative in the subspace with greater effectiveness. Furthermore, we +show that the part-based representation of NMF, especially its supervised +variants, is dramatically impactful in detecting lesion areas in medical +imaging with limited samples. + +
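+ A minimal sketch of the core idea (NMF as a low-dimensional subspace for few-shot classification) follows; the supervised and sparsity-constrained NMF variants discussed above are not reproduced, and the random features stand in for real, non-negative image descriptors.
+
+ import numpy as np
+ from sklearn.decomposition import NMF
+ from sklearn.neighbors import NearestCentroid
+
+ # Hypothetical 5-way 5-shot episode with non-negative features (e.g., pooled
+ # CNN activations), shape (n_samples, n_features).
+ rng = np.random.default_rng(0)
+ X_support = np.abs(rng.normal(size=(25, 512)))
+ y_support = np.repeat(np.arange(5), 5)
+ X_query = np.abs(rng.normal(size=(15, 512)))
+
+ nmf = NMF(n_components=16, init="nndsvda", max_iter=500, random_state=0)
+ W_support = nmf.fit_transform(X_support)   # low-dimensional non-negative codes
+ W_query = nmf.transform(X_query)
+
+ clf = NearestCentroid().fit(W_support, y_support)
+ print(clf.predict(W_query))                # few-shot predictions in the subspace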
+
+
+
+
+ + ☆ SG-BEV: Satellite-Guided BEV Fusion for Cross-View Semantic Segmentation CVPR 2024 + + +
+ This paper aims at achieving fine-grained building attribute segmentation in +a cross-view scenario, i.e., using satellite and street-view image pairs. The +main challenge lies in overcoming the significant perspective differences +between street views and satellite views. In this work, we introduce SG-BEV, a +novel approach for satellite-guided BEV fusion for cross-view semantic +segmentation. To overcome the limitations of existing cross-view projection +methods in capturing the complete building facade features, we innovatively +incorporate Bird's Eye View (BEV) method to establish a spatially explicit +mapping of street-view features. Moreover, we fully leverage the advantages of +multiple perspectives by introducing a novel satellite-guided reprojection +module, optimizing the uneven feature distribution issues associated with +traditional BEV methods. Our method demonstrates significant improvements on +four cross-view datasets collected from multiple cities, including New York, +San Francisco, and Boston. On average across these datasets, our method +achieves an increase in mIOU by 10.13% and 5.21% compared with the +state-of-the-art satellite-based and cross-view methods. The code and datasets +of this work will be released at https://github.com/yejy53/SG-BEV. + +
+
+ comment: accepted by CVPR 2024 +
+
+
+
+
+ + ☆ 3DStyleGLIP: Part-Tailored Text-Guided 3D Neural Stylization + + +
+ 3D stylization, which entails the application of specific styles to +three-dimensional objects, holds significant commercial potential as it enables +the creation of diverse 3D objects with distinct moods and styles, tailored to +specific demands of different scenes. With recent advancements in text-driven +methods and artificial intelligence, the stylization process is increasingly +intuitive and automated, thereby diminishing the reliance on manual labor and +expertise. However, existing methods have predominantly focused on holistic +stylization, thereby leaving the application of styles to individual components +of a 3D object unexplored. In response, we introduce 3DStyleGLIP, a novel +framework specifically designed for text-driven, part-tailored 3D stylization. +Given a 3D mesh and a text prompt, 3DStyleGLIP leverages the vision-language +embedding space of the Grounded Language-Image Pre-training (GLIP) model to +localize the individual parts of the 3D mesh and modify their colors and local +geometries to align them with the desired styles specified in the text prompt. +3DStyleGLIP is effectively trained for 3D stylization tasks through a +part-level style loss working in GLIP's embedding space, supplemented by two +complementary learning techniques. Extensive experimental validation confirms +that our method achieves significant part-wise stylization capabilities, +demonstrating promising potential in advancing the field of 3D stylization. + +
+
+
+
+
+ + ☆ Multi-Scale Spatial-Temporal Self-Attention Graph Convolutional Networks + for Skeleton-based Action Recognition + + +
+ Skeleton-based gesture recognition methods have achieved great success using +Graph Convolutional Networks (GCNs). In addition, context-dependent adaptive +topology, used as neighborhood vertex information, and attention mechanisms allow +a model to better represent actions. In this paper, we propose a self-attention +GCN hybrid model, Multi-Scale Spatial-Temporal self-attention (MSST)-GCN, to +effectively improve modeling ability and achieve state-of-the-art results on +several datasets. We utilize a spatial self-attention module with adaptive +topology to understand intra-frame interactions within a frame among different +body parts, and a temporal self-attention module to examine correlations between +frames of a node. These two modules are followed by a multi-scale convolution network +with dilations, which captures not only the long-range temporal dependencies of +joints but also the long-range spatial dependencies (i.e., long-distance +dependencies) of node temporal behaviors. They are combined into high-level +spatial-temporal representations, and the predicted action is output with a +softmax classifier. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ Diffexplainer: Towards Cross-modal Global Explanations with Diffusion + Models + + +
+ We present DiffExplainer, a novel framework that, leveraging language-vision +models, enables multimodal global explainability. DiffExplainer employs +diffusion models conditioned on optimized text prompts, synthesizing images +that maximize class outputs and hidden features of a classifier, thus providing +a visual tool for explaining decisions. Moreover, the analysis of generated +visual descriptions allows for automatic identification of biases and spurious +features, as opposed to traditional methods that often rely on manual +intervention. The cross-modal transferability of language-vision models also +enables the possibility to describe decisions in a more human-interpretable +way, i.e., through text. We conduct comprehensive experiments, which include an +extensive user study, demonstrating the effectiveness of DiffExplainer on 1) +the generation of high-quality images explaining model decisions, surpassing +existing activation maximization methods, and 2) the automated identification +of biases and spurious features. + +
+
+
+
+
+ + ☆ Neural Radiance Fields with Torch Units + + +
+ Neural Radiance Fields (NeRF) give rise to learning-based 3D reconstruction +methods widely used in industrial applications. Although prevalent methods +achieve considerable improvements in small-scale scenes, accomplishing +reconstruction in complex and large-scale scenes is still challenging. First, +the background in complex scenes shows a large variance among different views. +Second, the current inference pattern, i.e., a pixel only relies on an +individual camera ray, fails to capture contextual information. To solve these +problems, we propose to enlarge the ray perception field and build up +interactions among sample points. In this paper, we design a novel inference pattern +that encourages a single camera ray to possess more contextual information and +models the relationship among sample points on each camera ray. To hold +contextual information, a camera ray in our proposed method can render a patch +of pixels simultaneously. Moreover, we replace the MLP in neural radiance field +models with distance-aware convolutions to enhance the feature propagation +among sample points from the same camera ray. To summarize, like a torchlight, a +ray in our proposed method renders a patch of the image. Thus, we call +the proposed method Torch-NeRF. Extensive experiments on KITTI-360 and LLFF +show that Torch-NeRF exhibits excellent performance. + +
+
+
+
+
+ + ☆ Vestibular schwannoma growth prediction from longitudinal MRI by time + conditioned neural fields + + +
+ Vestibular schwannomas (VS) are benign tumors that are generally managed by +active surveillance with MRI examination. To further assist clinical +decision-making and avoid overtreatment, an accurate prediction of tumor growth +based on longitudinal imaging is highly desirable. In this paper, we introduce +DeepGrowth, a deep learning method that incorporates neural fields and +recurrent neural networks for prospective tumor growth prediction. In the +proposed method, each tumor is represented as a signed distance function (SDF) +conditioned on a low-dimensional latent code. Unlike previous studies that +perform tumor shape prediction directly in the image space, we predict the +latent codes instead and then reconstruct future shapes from them. To deal with +irregular time intervals, we introduce a time-conditioned recurrent module +based on a ConvLSTM and a novel temporal encoding strategy, which enables the +proposed model to output varying tumor shapes over time. The experiments on an +in-house longitudinal VS dataset showed that the proposed model significantly +improved the performance ($\ge 1.6\%$ Dice score and $\ge 0.20$ mm 95\% +Hausdorff distance), in particular for the top 20\% of tumors that grow or shrink the +most ($\ge 4.6\%$ Dice score and $\ge 0.73$ mm 95\% Hausdorff distance). Our +code is available at https://github.com/cyjdswx/DeepGrowth. + +
+
+
+
+
+ + ☆ Unsegment Anything by Simulating Deformation CVPR 2024 + + +
+ Foundation segmentation models, while powerful, pose a significant risk: they +enable users to effortlessly extract any objects from any digital content with +a single click, potentially leading to copyright infringement or malicious +misuse. To mitigate this risk, we introduce a new task "Anything Unsegmentable" +to grant any image "the right to be unsegmented". The ambitious pursuit of the +task is to achieve highly transferable adversarial attacks against all +prompt-based segmentation models, regardless of model parameterizations and +prompts. We highlight the non-transferable and heterogeneous nature of +prompt-specific adversarial noises. Our approach focuses on disrupting image +encoder features to achieve prompt-agnostic attacks. Intriguingly, targeted +feature attacks exhibit better transferability compared to untargeted ones, +suggesting the optimal update direction aligns with the image manifold. Based +on the observations, we design a novel attack named Unsegment Anything by +Simulating Deformation (UAD). Our attack optimizes a differentiable deformation +function to create a target deformed image, which alters structural information +while preserving achievable feature distance by adversarial example. Extensive +experiments verify the effectiveness of our approach, compromising a variety of +promptable segmentation models with different architectures and prompt +interfaces. We release the code at +https://github.com/jiahaolu97/anything-unsegmentable. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Active learning for efficient annotation in precision agriculture: a + use-case on crop-weed semantic segmentation + + +
+ Optimizing deep learning models requires large amounts of annotated images, a +process that is both time-intensive and costly, especially for semantic +segmentation models, in which every pixel must be annotated. A potential +strategy to mitigate annotation effort is active learning. Active learning +facilitates the identification and selection of the most informative images +from a large unlabelled pool. The underlying premise is that these selected +images improve the model's performance faster than randomly selected ones, +thereby reducing annotation effort. While active learning has demonstrated promising +results on benchmark datasets like Cityscapes, its performance in the +agricultural domain remains largely unexplored. This study addresses this +research gap by conducting a comparative study of three active learning-based +acquisition functions: Bayesian Active Learning by Disagreement (BALD), +stochastic-based BALD (PowerBALD), and Random. The acquisition functions were +tested on two agricultural datasets: Sugarbeet and Corn-Weed, both containing +three semantic classes: background, crop and weed. Our results indicated that +active learning, especially PowerBALD, yields a higher performance than Random +sampling on both datasets. However, due to the relatively large standard deviations, +the observed differences were minimal; this was partly caused by high image +redundancy and imbalanced classes. Specifically, more than 89\% of the pixels +belonged to the background class on both datasets. The absence of significant +results on both datasets indicates that further research is required for +applying active learning on agricultural datasets, especially if they contain a +high class imbalance and redundant images. Recommendations and insights are +provided in this paper to potentially resolve such issues. + +
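+ The BALD acquisition function compared above can be written as the mutual information between predictions and model parameters, estimated with Monte-Carlo dropout: the entropy of the mean prediction minus the mean per-sample entropy. The sketch below is a generic implementation of that score only (PowerBALD's stochastic sampling is not included).
+
+ import torch
+
+ def bald_scores(model, x, n_mc=10, eps=1e-8):
+     """x: batch of images; the model is assumed to contain dropout layers."""
+     model.train()                            # keep dropout active for MC sampling
+     with torch.no_grad():
+         probs = torch.stack([torch.softmax(model(x), dim=1) for _ in range(n_mc)])
+     mean_p = probs.mean(dim=0)                                      # (B, C, ...)
+     h_mean = -(mean_p * (mean_p + eps).log()).sum(dim=1)            # predictive entropy
+     mean_h = -(probs * (probs + eps).log()).sum(dim=2).mean(dim=0)  # expected entropy
+     scores = h_mean - mean_h                                        # mutual information
+     # average per-pixel scores into one value per image for segmentation outputs
+     return scores.flatten(1).mean(dim=1) if scores.dim() > 1 else scores
+
+ # usage: ranked = torch.argsort(bald_scores(seg_model, unlabeled_batch), descending=True)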
+
+
+
+
+ + ☆ Knowledge Distillation with Multi-granularity Mixture of Priors for + Image Super-Resolution + + +
+ Knowledge distillation (KD) is a promising yet challenging model compression +technique that transfers rich learning representations from a well-performing +but cumbersome teacher model to a compact student model. Previous methods for +image super-resolution (SR) mostly compare the feature maps directly or after +standardizing the dimensions with basic algebraic operations (e.g. average, +dot-product). However, the intrinsic semantic differences among feature maps +are overlooked, which are caused by the disparate expressive capacity between +the networks. This work presents MiPKD, a multi-granularity mixture of prior KD +framework, to facilitate efficient SR model through the feature mixture in a +unified latent space and stochastic network block mixture. Extensive +experiments demonstrate the effectiveness of the proposed MiPKD method. + +
+
+
+
+
+ + ☆ Representation Alignment Contrastive Regularization for Multi-Object + Tracking + + +
+ Achieving high performance in multi-object tracking algorithms heavily relies +on modeling spatio-temporal relationships during the data association stage. +Mainstream approaches encompass rule-based and deep learning-based methods for +spatio-temporal relationship modeling. While the former relies on physical +motion laws, offering wider applicability but yielding suboptimal results for +complex object movements, the latter, though achieving high performance, lacks +interpretability and involves complex module designs. This work aims to +simplify deep learning-based spatio-temporal relationship models and introduce +interpretability into features for data association. Specifically, a +lightweight single-layer transformer encoder is utilized to model +spatio-temporal relationships. To make features more interpretable, two +contrastive regularization losses based on representation alignment are +proposed, derived from spatio-temporal consistency rules. By applying weighted +summation to affinity matrices, the aligned features can seamlessly integrate +into the data association stage of the original tracking workflow. Experimental +results showcase that our model enhances the majority of existing tracking +networks' performance without excessive complexity, with minimal increase in +training overhead and nearly negligible computational and storage costs. + +
+
+
+
+
+ + ☆ Regional biases in image geolocation estimation: a case study with the + SenseCity Africa dataset + + +
+ Advances in Artificial Intelligence are challenged by the biases rooted in +the datasets used to train the models. In image geolocation estimation, models +are mostly trained using data from specific geographic regions, notably the +Western world, and as a result, they may struggle to comprehend the +complexities of underrepresented regions. To assess this issue, we apply a +state-of-the-art image geolocation estimation model (ISNs) to a crowd-sourced +dataset of geolocated images from the African continent (SCA100), and then +explore the regional and socioeconomic biases underlying the model's +predictions. Our findings show that the ISNs model tends to over-predict image +locations in high-income countries of the Western world, which is consistent +with the geographic distribution of its training data, i.e., the IM2GPS3k +dataset. Accordingly, when compared to the IM2GPS3k benchmark, the accuracy of +the ISNs model notably decreases at all scales. Additionally, we cluster images +of the SCA100 dataset based on how accurately they are predicted by the ISNs +model and show the model's difficulties in correctly predicting the locations +of images in low income regions, especially in Sub-Saharan Africa. Therefore, +our results suggest that using IM2GPS3k as a training set and benchmark for +image geolocation estimation and other computer vision models overlooks its +potential application in the African context. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ Semi-Supervised Unconstrained Head Pose Estimation in the Wild + + +
+ Existing head pose estimation datasets are either composed of numerous +samples by non-realistic synthesis or lab collection, or limited images by +labor-intensive annotating. This makes deep supervised learning based solutions +compromised due to the reliance on generous labeled data. To alleviate it, we +propose the first semi-supervised unconstrained head pose estimation (SemiUHPE) +method, which can leverage a large amount of unlabeled wild head images. +Specifically, we follow the recent semi-supervised rotation regression, and +focus on the diverse and complex head pose domain. Firstly, we claim that the +aspect-ratio invariant cropping of heads is superior to the previous +landmark-based affine alignment, which does not fit unlabeled natural heads or +practical applications where landmarks are often unavailable. Then, instead of +using an empirically fixed threshold to filter out pseudo labels, we propose +the dynamic entropy-based filtering by updating thresholds for adaptively +removing unlabeled outliers. Moreover, we revisit the design of weak-strong +augmentations, and further exploit its superiority by devising two novel +head-oriented strong augmentations named pose-irrelevant cut-occlusion and +pose-altering rotation consistency. Extensive experiments show that SemiUHPE +can surpass SOTAs with remarkable improvements on public benchmarks under both +front-range and full-range. Our code is released in +\url{https://github.com/hnuzhy/SemiUHPE}. + +
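+ The dynamic entropy-based filtering mentioned above can be pictured with a simple exponential-moving-average threshold over batch entropy quantiles; the exact update rule in SemiUHPE is not reproduced here, and the hyperparameters are illustrative.
+
+ import torch
+
+ class DynamicEntropyFilter:
+     def __init__(self, init_threshold=2.0, momentum=0.99, keep_ratio=0.7):
+         self.threshold = init_threshold
+         self.momentum = momentum
+         self.keep_ratio = keep_ratio
+
+     def __call__(self, probs):
+         """probs: (B, C) predicted distributions for unlabeled samples.
+         Returns a boolean mask of samples kept as pseudo-labels."""
+         entropy = -(probs * probs.clamp_min(1e-8).log()).sum(dim=1)
+         # batch threshold = entropy quantile that keeps roughly `keep_ratio` samples
+         batch_thr = torch.quantile(entropy, self.keep_ratio)
+         self.threshold = self.momentum * self.threshold + (1 - self.momentum) * batch_thr.item()
+         return entropy < self.threshold
+
+ # usage: mask = filt(torch.softmax(head(weak_aug_batch), dim=1))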
+
+ comment: 14 pages. Semi-Supervised Unconstrained Head Pose Estimation +
+
+
+
+
+ + ☆ Severity Controlled Text-to-Image Generative Model Bias Manipulation + + +
+ Text-to-image (T2I) generative models are gaining wide popularity, especially +in public domains. However, their intrinsic bias and potential malicious +manipulations remain under-explored. Charting the susceptibility of T2I models +to such manipulation, we first expose the new possibility of a dynamic and +computationally efficient exploitation of model bias by targeting the embedded +language models. By leveraging mathematical foundations of vector algebra, our +technique enables a scalable and convenient control over the severity of output +manipulation through model bias. As a by-product, this control also allows a +form of precise prompt engineering to generate images which are generally +implausible with regular text prompts. We also demonstrate a constructive +application of our manipulation for balancing the frequency of generated +classes - as in model debiasing. Our technique does not require training and is +also framed as a backdoor attack with severity control using semantically-null +text triggers in the prompts. With extensive analysis, we present interesting +qualitative and quantitative results to expose potential manipulation +possibilities for T2I models. + Key-words: Text-to-Image Models, Generative Models, Backdoor Attacks, Prompt +Engineering, Bias + +
+
+ comment: This research was supported by National Intelligence and Security + Discovery Research Grants (project# NS220100007), funded by the Department of + Defence Australia +
+
+
+
+
+ + ☆ Weakly-Supervised 3D Scene Graph Generation via Visual-Linguistic + Assisted Pseudo-labeling + + +
+ Learning to build 3D scene graphs is essential for real-world perception in a +structured and rich fashion. However, previous 3D scene graph generation +methods utilize a fully supervised learning manner and require a large amount +of entity-level annotation data of objects and relations, which is extremely +resource-consuming and tedious to obtain. To tackle this problem, we propose +3D-VLAP, a weakly-supervised 3D scene graph generation method via +Visual-Linguistic Assisted Pseudo-labeling. Specifically, our 3D-VLAP exploits +the superior ability of current large-scale visual-linguistic models to align +the semantics between texts and 2D images, as well as the naturally existing +correspondences between 2D images and 3D point clouds, and thus implicitly +constructs correspondences between texts and 3D point clouds. First, we +establish the positional correspondence from 3D point clouds to 2D images via +camera intrinsic and extrinsic parameters, thereby achieving alignment of 3D +point clouds and 2D images. Subsequently, a large-scale cross-modal +visual-linguistic model is employed to indirectly align 3D instances with the +textual category labels of objects by matching 2D images with object category +labels. The pseudo labels for objects and relations are then produced for +3D-VLAP model training by calculating the similarity between visual embeddings +and textual category embeddings of objects and relations encoded by the +visual-linguistic model, respectively. Ultimately, we design an edge +self-attention based graph neural network to generate scene graphs of 3D point +cloud scenes. Extensive experiments demonstrate that our 3D-VLAP achieves +comparable results with current advanced fully supervised methods, meanwhile +significantly alleviating the pressure of data annotation. + +
+
+ comment: 11 pages, 9 figures +
+
+
+
+
+ + ☆ Text-driven Affordance Learning from Egocentric Vision + + +
+ Visual affordance learning is a key component for robots to understand how to +interact with objects. Conventional approaches in this field rely on +pre-defined objects and actions, falling short of capturing diverse +interactions in real-world scenarios. The key idea of our approach is employing +textual instruction, targeting various affordances for a wide range of objects. +This approach covers both hand-object and tool-object interactions. We +introduce text-driven affordance learning, aiming to learn contact points and +manipulation trajectories from an egocentric view following textual +instruction. In our task, contact points are represented as heatmaps, and the +manipulation trajectory as sequences of coordinates that incorporate both +linear and rotational movements for various manipulations. However, when we +gather data for this task, manual annotations of these diverse interactions are +costly. To this end, we propose a pseudo dataset creation pipeline and build a +large pseudo-training dataset: TextAFF80K, consisting of over 80K instances of +the contact points, trajectories, images, and text tuples. We extend existing +referring expression comprehension models for our task, and experimental +results show that our approach robustly handles multiple affordances, serving +as a new standard for affordance learning in real-world scenarios. + +
+
+
+
+
+ + ☆ CPAISD: Core-penumbra acute ischemic stroke dataset + + +
+ We introduce the CPAISD: Core-Penumbra Acute Ischemic Stroke Dataset, aimed +at enhancing the early detection and segmentation of ischemic stroke using +Non-Contrast Computed Tomography (NCCT) scans. Addressing the challenges in +diagnosing acute ischemic stroke during its early stages due to often +non-revealing native CT findings, the dataset provides a collection of +segmented NCCT images. These include annotations of ischemic core and penumbra +regions, critical for developing machine learning models for rapid stroke +identification and assessment. By offering a carefully collected and annotated +dataset, we aim to facilitate the development of advanced diagnostic tools, +contributing to improved patient care and outcomes in stroke management. Our +dataset's uniqueness lies in its focus on the acute phase of ischemic stroke, +with non-informative native CT scans, and includes a baseline model to +demonstrate the dataset's application, encouraging further research and +innovation in the field of medical imaging and stroke diagnosis. + +
+
+
+
+
+ + ☆ HENet: Hybrid Encoding for End-to-end Multi-task 3D Perception from + Multi-view Cameras + + +
+ Three-dimensional perception from multi-view cameras is a crucial component +in autonomous driving systems, which involves multiple tasks like 3D object +detection and bird's-eye-view (BEV) semantic segmentation. To improve +perception precision, large image encoders, high-resolution images, and +long-term temporal inputs have been adopted in recent 3D perception models, +bringing remarkable performance gains. However, these techniques are often +incompatible in training and inference scenarios due to computational resource +constraints. Besides, modern autonomous driving systems prefer to adopt an +end-to-end framework for multi-task 3D perception, which can simplify the +overall system architecture and reduce the implementation complexity. However, +conflict between tasks often arises when optimizing multiple tasks jointly +within an end-to-end 3D perception model. To alleviate these issues, we present +an end-to-end framework named HENet for multi-task 3D perception in this paper. +Specifically, we propose a hybrid image encoding network, using a large image +encoder for short-term frames and a small image encoder for long-term temporal +frames. Then, we introduce a temporal feature integration module based on the +attention mechanism to fuse the features of different frames extracted by the +two aforementioned hybrid image encoders. Finally, according to the +characteristics of each perception task, we utilize BEV features of different +grid sizes, independent BEV encoders, and task decoders for different tasks. +Experimental results show that HENet achieves state-of-the-art end-to-end +multi-task 3D perception results on the nuScenes benchmark, including 3D object +detection and BEV semantic segmentation. The source code and models will be +released at https://github.com/VDIGPKU/HENet. + +
+
+
+
+
+ + ☆ Freditor: High-Fidelity and Transferable NeRF Editing by Frequency + Decomposition + + +
+ This paper enables high-fidelity, transferable NeRF editing by frequency +decomposition. Recent NeRF editing pipelines lift 2D stylization results to 3D +scenes but suffer from blurry results and fail to capture detailed +structures, owing to the inconsistency between 2D edits. Our critical +insight is that low-frequency components of images are more +multiview-consistent after editing compared with their high-frequency parts. +Moreover, the appearance style is mainly exhibited on the low-frequency +components, while the content details reside especially in the high-frequency parts. +This motivates us to perform editing on low-frequency components, which results +in high-fidelity edited scenes. In addition, the editing is performed in the +low-frequency feature space, enabling stable intensity control and novel scene +transfer. Comprehensive experiments conducted on photorealistic datasets +demonstrate the superior performance of high-fidelity and transferable NeRF +editing. The project page is at \url{https://aigc3d.github.io/freditor}. + +
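+ The low/high-frequency split that the insight above relies on can be sketched in image space with a simple Gaussian decomposition (the paper performs the editing in a low-frequency feature space; this 2D version is only illustrative).
+
+ import numpy as np
+ from scipy.ndimage import gaussian_filter
+
+ def decompose(image, sigma=4.0):
+     """image: float array (H, W, 3) in [0, 1]."""
+     low = gaussian_filter(image, sigma=(sigma, sigma, 0))  # per-channel blur
+     high = image - low                                     # detail residual
+     return low, high
+
+ def recompose(edited_low, high, intensity=1.0):
+     # `intensity` scales how strongly the edited low-frequency style is applied
+     return np.clip(intensity * edited_low + high, 0.0, 1.0)
+
+ rng = np.random.default_rng(0)
+ img = rng.random((64, 64, 3))                              # stand-in for a rendered view
+ low, high = decompose(img)
+ out = recompose(low * 0.8, high)                           # placeholder "edit": darkening
+ print(out.shape)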
+
+
+
+
+ + ☆ VIAssist: Adapting Multi-modal Large Language Models for Users with + Visual Impairments + + +
+ Individuals with visual impairments, encompassing both partial and total +difficulties in visual perception, are referred to as visually impaired (VI) +people. An estimated 2.2 billion individuals worldwide are affected by visual +impairments. Recent advancements in multi-modal large language models (MLLMs) +have showcased their extraordinary capabilities across various domains. It is +desirable to help VI individuals with MLLMs' great capabilities of visual +understanding and reasoning. However, it is challenging for VI people to use +MLLMs due to the difficulties in capturing the desirable images to fulfill +their daily requests. For example, the target object may not be fully or even +partially captured in the image. This paper explores how to leverage MLLMs +to provide visual question answering for VI individuals. The proposed system, +VIAssist, can identify undesired images and provide detailed actions. Finally, +VIAssist can provide reliable answers to users' queries based on the images. +Our results show that VIAssist provides +0.21 and +0.31 higher BERTScore and +ROUGE scores than the baseline, respectively. + +
+
+ comment: Accepted to IEEE International Workshop on Foundation Models for + Cyber-Physical Systems & Internet of Things (FMSys 2024) +
+
+
+
+
+ + ☆ A Unified Membership Inference Method for Visual Self-supervised Encoder + via Part-aware Capability + + +
+ Self-supervised learning shows promise in harnessing extensive unlabeled +data, but it also confronts significant privacy concerns, especially in vision. +In this paper, we aim to perform membership inference on visual self-supervised +models in a more realistic setting: the self-supervised training method and details +are unknown to the adversary, who usually faces a black-box +system in practice. In this setting, considering that a self-supervised model +could be trained with completely different self-supervised paradigms, e.g., +masked image modeling and contrastive learning, with complex training details, +we propose a unified membership inference method called PartCrop. It is +motivated by the shared part-aware capability among models and stronger part +response on the training data. Specifically, PartCrop crops parts of objects in +an image to query responses with the image in representation space. We conduct +extensive attacks on self-supervised models with different training protocols +and structures using three widely used image datasets. The results verify the +effectiveness and generalization of PartCrop. Moreover, to defend against +PartCrop, we evaluate two common approaches, i.e., early stopping and differential +privacy, and propose a tailored method called shrinking crop scale range. The +defense experiments indicate that all of them are effective. Our code is +available at https://github.com/JiePKU/PartCrop + +
+
+ comment: Membership Inference, Self-supervised learning +
+
+
+
+
+ + ☆ TSNet:A Two-stage Network for Image Dehazing with Multi-scale Fusion and + Adaptive Learning + + +
+ Image dehazing has been a popular topic of research for a long time. Previous +deep learning-based image dehazing methods have failed to achieve satisfactory +dehazing effects on both synthetic datasets and real-world datasets, exhibiting +poor generalization. Moreover, single-stage networks often result in many +regions with artifacts and color distortion in output images. To address these +issues, this paper proposes a two-stage image dehazing network called TSNet, +mainly consisting of the multi-scale fusion module (MSFM) and the adaptive +learning module (ALM). Specifically, MSFM and ALM enhance the generalization of +TSNet. The MSFM can obtain large receptive fields at multiple scales and +integrate features at different frequencies to reduce the differences between +inputs and learning objectives. The ALM can actively learn regions of +interest in images and restore texture details more effectively. Additionally, +TSNet is designed as a two-stage network, where the first-stage network +performs image dehazing, and the second-stage network is employed to improve +issues such as artifacts and color distortion present in the results of the +first-stage network. We also change the learning objective from ground truth +images to opposite fog maps, which improves the learning efficiency of TSNet. +Extensive experiments demonstrate that TSNet exhibits superior dehazing +performance on both synthetic and real-world datasets compared to previous +state-of-the-art methods. + +
+
+ comment: 12 pages, 10 figures, 7 tables +
+
+
+
+
+ + ☆ RS3Mamba: Visual State Space Model for Remote Sensing Images Semantic + Segmentation + + +
+ Semantic segmentation of remote sensing images is a fundamental task in +geoscience research. However, there are some significant shortcomings for the +widely used convolutional neural networks (CNNs) and Transformers. The former +is limited by its insufficient long-range modeling capabilities, while the +latter is hampered by its computational complexity. Recently, a novel visual +state space (VSS) model represented by Mamba has emerged, capable of modeling +long-range relationships with linear computability. In this work, we propose a +novel dual-branch network named remote sensing images semantic segmentation +Mamba (RS3Mamba) to incorporate this innovative technology into remote sensing +tasks. Specifically, RS3Mamba utilizes VSS blocks to construct an auxiliary +branch, providing additional global information to convolution-based main +branch. Moreover, considering the distinct characteristics of the two branches, +we introduce a collaborative completion module (CCM) to enhance and fuse +features from the dual-encoder. Experimental results on two widely used +datasets, ISPRS Vaihingen and LoveDA Urban, demonstrate the effectiveness and +potential of the proposed RS3Mamba. To the best of our knowledge, this is the +first vision Mamba specifically designed for remote sensing images semantic +segmentation. The source code will be made available at +https://github.com/sstary/SSRS. + +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ A Novel Approach to Breast Cancer Histopathological Image Classification + Using Cross-Colour Space Feature Fusion and Quantum-Classical Stack Ensemble + Method + + +
+ Breast cancer classification stands as a pivotal pillar in ensuring timely +diagnosis and effective treatment. This study with histopathological images +underscores the profound significance of harnessing the synergistic +capabilities of colour space ensembling and quantum-classical stacking to +elevate the precision of breast cancer classification. By delving into the +distinct colour spaces of RGB, HSV and CIE L*u*v, the authors initiated a +comprehensive investigation guided by advanced methodologies. Employing the +DenseNet121 architecture for feature extraction, the authors have capitalized on +the robustness of Random Forest, SVM, QSVC, and VQC classifiers. This research +encompasses a unique feature fusion technique within the colour space ensemble. +This approach not only deepens our comprehension of breast cancer +classification but also marks a milestone in personalized medical assessment. +The amalgamation of quantum and classical classifiers through stacking emerges +as a potent catalyst, effectively mitigating the inherent constraints of +individual classifiers, paving a robust path towards more dependable and +refined breast cancer identification. Through rigorous experimentation and +meticulous analysis, fusion of colour spaces such as RGB with HSV and RGB with CIE +L*u*v yields a classification accuracy nearing unity. This +underscores the transformative potential of our approach, where the fusion of +diverse colour spaces and the synergy of quantum and classical realms converge +to establish a new horizon in medical diagnostics. Thus, the implications of +this research extend across medical disciplines, offering promising avenues for +advancing diagnostic accuracy and treatment efficacy. + +
+
+
+
+
+ + ☆ RESSA: Repair Sparse Vision-Language Models via Sparse Cross-Modality + Adaptation + + +
+ Vision-Language Models (VLMs), integrating diverse information from multiple +modalities, have shown remarkable success across various tasks. However, +deploying VLMs, which comprise large-scale vision and language models, poses +challenges in resource-constrained scenarios. While pruning followed by +finetuning offers a potential solution to maintain performance with smaller +model sizes, its application to VLMs remains relatively unexplored, presenting +two main questions: how to distribute sparsity across different +modality-specific models, and how to repair the performance of pruned sparse +VLMs. To answer the first question, we conducted preliminary studies on VLM +pruning and found that pruning vision models and language models with the same +sparsity ratios contributes to nearly optimal performance. For the second +question, unlike finetuning unimodal sparse models, sparse VLMs involve +cross-modality interactions, requiring specialized techniques for post-pruning +performance repair. Moreover, while parameter-efficient LoRA finetuning has +been proposed to repair the performance of sparse models, a significant +challenge arises when merging weights, because dense LoRA +modules are incompatible with sparse models and destroy the sparsity of pruned models. To +tackle these challenges, we propose to Repair Sparse Vision-Language Models via +Sparse Cross-modality Adaptation (RESSA). RESSA utilizes cross-modality +finetuning to enhance task-specific performance and facilitate knowledge +distillation from original dense models. Additionally, we introduce SparseLoRA, +which applies sparsity directly to LoRA weights, enabling seamless integration +with sparse models. Our experimental results validate the effectiveness of +RESSA, showcasing significant enhancements, such as an 11.3\% improvement under +2:4 sparsity and a remarkable 47.6\% enhancement under unstructured 70\% +sparsity. + +
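+ One possible reading of "applying sparsity directly to LoRA weights" is to mask the merged LoRA update with the pruned weight's sparsity pattern so that merging does not destroy the zeros of the sparse model. The sketch below illustrates that reading only; it is not the RESSA/SparseLoRA implementation.
+
+ import torch
+
+ def merge_sparse_lora(w_sparse, lora_A, lora_B, scaling=1.0):
+     """w_sparse: pruned weight (out, in); lora_A: (r, in); lora_B: (out, r)."""
+     mask = (w_sparse != 0).to(w_sparse.dtype)   # preserved sparsity pattern
+     delta = scaling * (lora_B @ lora_A)         # dense low-rank update
+     return w_sparse + delta * mask              # pruned entries stay zero
+
+ out_dim, in_dim, rank = 8, 16, 4
+ w = torch.randn(out_dim, in_dim)
+ w[torch.rand_like(w) < 0.7] = 0.0               # 70% unstructured sparsity
+ A, B = torch.randn(rank, in_dim), torch.randn(out_dim, rank)
+ merged = merge_sparse_lora(w, A, B, scaling=0.5)
+ assert torch.all(merged[w == 0] == 0)           # sparsity preserved after merging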
+
+
+
+
+ + ☆ What Are We Measuring When We Evaluate Large Vision-Language Models? An + Analysis of Latent Factors and Biases + + +
+ Vision-language (VL) models, pretrained on colossal image-text datasets, have +attained broad VL competence that is difficult to evaluate. A common belief is +that a small number of VL skills underlie the variety of VL tests. In this +paper, we perform a large-scale transfer learning experiment aimed at +discovering latent VL skills from data. We reveal interesting characteristics +that have important implications for test suite design. First, generation tasks +suffer from a length bias, suggesting benchmarks should balance tasks with +varying output lengths. Second, we demonstrate that factor analysis +successfully identifies reasonable yet surprising VL skill factors, suggesting +benchmarks could leverage similar analyses for task selection. Finally, we +present a new dataset, OLIVE (https://github.com/jq-zh/olive-dataset), which +simulates user instructions in the wild and presents challenges dissimilar to +all datasets we tested. Our findings contribute to the design of balanced and +broad-coverage vision-language evaluation methods. + +
+
+
+
+
+ + ☆ TCLC-GS: Tightly Coupled LiDAR-Camera Gaussian Splatting for Surrounding + Autonomous Driving Scenes + + +
+ Most 3D Gaussian Splatting (3D-GS) based methods for urban scenes initialize +3D Gaussians directly with 3D LiDAR points, which not only underutilizes LiDAR +data capabilities but also overlooks the potential advantages of fusing LiDAR +with camera data. In this paper, we design a novel tightly coupled LiDAR-Camera +Gaussian Splatting (TCLC-GS) to fully leverage the combined strengths of both +LiDAR and camera sensors, enabling rapid, high-quality 3D reconstruction and +novel view RGB/depth synthesis. TCLC-GS designs a hybrid explicit (colorized 3D +mesh) and implicit (hierarchical octree feature) 3D representation derived from +LiDAR-camera data, to enrich the properties of 3D Gaussians for splatting. 3D +Gaussian's properties are not only initialized in alignment with the 3D mesh +which provides more completed 3D shape and color information, but are also +endowed with broader contextual information through retrieved octree implicit +features. During the Gaussian Splatting optimization process, the 3D mesh +offers dense depth information as supervision, which enhances the training +process by learning of a robust geometry. Comprehensive evaluations conducted +on the Waymo Open Dataset and nuScenes Dataset validate our method's +state-of-the-art (SOTA) performance. Utilizing a single NVIDIA RTX 3090 Ti, our +method demonstrates fast training and achieves real-time RGB and depth +rendering at 90 FPS in resolution of 1920x1280 (Waymo), and 120 FPS in +resolution of 1600x900 (nuScenes) in urban scenarios. + +
+
+
+
+
+ + ☆ TE-TAD: Towards Full End-to-End Temporal Action Detection via + Time-Aligned Coordinate Expression + + +
+ In this paper, we show that the normalized coordinate expression is a +key factor behind the reliance on hand-crafted components in query-based detectors for +temporal action detection (TAD). Despite significant advancements towards an +end-to-end framework in object detection, query-based detectors have been +limited in achieving full end-to-end modeling in TAD. To address this issue, we +propose TE-TAD, a full end-to-end temporal action detection transformer +that integrates time-aligned coordinate expression. We reformulate coordinate +expression utilizing actual timeline values, ensuring length-invariant +representations across extremely diverse video durations. +Furthermore, our proposed adaptive query selection dynamically adjusts the +number of queries based on video length, providing a suitable solution for +varying video durations compared to a fixed query set. Our approach not only +simplifies the TAD process by eliminating the need for hand-crafted components +but also significantly improves the performance of query-based detectors. Our +TE-TAD outperforms the previous query-based detectors and achieves competitive +performance compared to state-of-the-art methods on popular benchmark datasets. +Code is available at: https://github.com/Dotori-HJ/TE-TAD + +
+
+
+
+
+ + ☆ Enhancing Diffusion-based Point Cloud Generation with Smoothness + Constraint + + +
+ Diffusion models have been popular for point cloud generation tasks. Existing +works utilize the forward diffusion process to convert the original point +distribution into a noise distribution and then learn the reverse diffusion +process to recover the point distribution from the noise distribution. However, +the reverse diffusion process can produce samples with non-smooth points on the +surface because of the ignorance of the point cloud geometric properties. We +propose alleviating the problem by incorporating the local smoothness +constraint into the diffusion framework for point cloud generation. Experiments +demonstrate the proposed model can generate realistic shapes and smoother point +clouds, outperforming multiple state-of-the-art methods. + +
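+ A local smoothness constraint of the kind described above can be approximated with a k-nearest-neighbour Laplacian penalty that pulls each generated point toward the mean of its neighbours; how the paper injects the constraint into the diffusion objective is not reproduced here.
+
+ import torch
+
+ def knn_laplacian_loss(points, k=8):
+     """points: (B, N, 3) generated point clouds."""
+     dists = torch.cdist(points, points)                      # (B, N, N)
+     idx = dists.topk(k + 1, largest=False).indices[..., 1:]  # drop self-match
+     b = torch.arange(points.size(0), device=points.device)[:, None, None]
+     neighbours = points[b, idx]                              # (B, N, k, 3)
+     laplacian = points - neighbours.mean(dim=2)              # deviation from local mean
+     return laplacian.pow(2).sum(dim=-1).mean()
+
+ # usage inside training (weight is illustrative):
+ # loss = diffusion_loss + 0.1 * knn_laplacian_loss(predicted_x0)
+ print(knn_laplacian_loss(torch.randn(2, 1024, 3)).item())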
+
+
+
+
+ + ☆ Cohort-Individual Cooperative Learning for Multimodal Cancer Survival + Analysis + + +
+ Recently, we have witnessed impressive achievements in cancer survival +analysis by integrating multimodal data, e.g., pathology images and genomic +profiles. However, the heterogeneity and high dimensionality of these +modalities pose significant challenges for extracting discriminative +representations while maintaining good generalization. In this paper, we +propose a Cohort-individual Cooperative Learning (CCL) framework to advance +cancer survival analysis by combining knowledge decomposition and cohort +guidance. Specifically, first, we propose a Multimodal Knowledge Decomposition +(MKD) module to explicitly decompose multimodal knowledge into four distinct +components: redundancy, synergy, and the uniqueness of each of the two modalities. Such a +comprehensive decomposition helps the model perceive easily +overlooked yet important information, facilitating an effective multimodal +fusion. Second, we propose a Cohort Guidance Modeling (CGM) to mitigate the +risk of overfitting task-irrelevant information. It can promote a more +comprehensive and robust understanding of the underlying multimodal data, while +avoiding the pitfalls of overfitting and enhancing the generalization ability +of the model. By combining the knowledge decomposition and cohort guidance +methods, we develop a robust multimodal survival analysis model with enhanced +discrimination and generalization abilities. Extensive experimental results on +five cancer datasets demonstrate the effectiveness of our model in integrating +multimodal data for survival analysis. + +
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ☆ APC2Mesh: Bridging the gap from occluded building façades to full 3D + models + + +
+ The benefits of having digital twins of urban buildings are numerous. +However, a major difficulty encountered in their creation from airborne LiDAR +point clouds is the effective means of accurately reconstructing significant +occlusions amidst point density variations and noise. To bridge the +noise/sparsity/occlusion gap and generate high fidelity 3D building models, we +propose APC2Mesh which integrates point completion into a 3D reconstruction +pipeline, enabling the learning of dense geometrically accurate representation +of buildings. Specifically, we leveraged complete points generated from +occluded ones as input to a linearized skip attention-based deformation network +for 3D mesh reconstruction. In our experiments, conducted on 3 different +scenes, we demonstrate that: (1) APC2Mesh delivers comparatively superior +results, indicating its efficacy in handling the challenges of occluded +airborne building points of diverse styles and complexities. (2) The +combination of point completion with typical deep learning-based 3D point cloud +reconstruction methods offers a direct and effective solution for +reconstructing significantly occluded airborne building points. As such, this +neural integration holds promise for advancing the creation of digital twins +for urban buildings with greater accuracy and fidelity. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ CAPE: CAM as a Probabilistic Ensemble for Enhanced DNN Interpretation + + +
+ Deep Neural Networks (DNNs) are widely used for visual classification tasks, +but their complex computation process and black-box nature hinder decision +transparency and interpretability. Class activation maps (CAMs) and recent +variants provide ways to visually explain the DNN decision-making process by +displaying 'attention' heatmaps of the DNNs. Nevertheless, the CAM explanation +only offers relative attention information, that is, on an attention heatmap, +we can interpret which image region is more or less important than the others. +However, these regions cannot be meaningfully compared across classes, and the +contribution of each region to the model's class prediction is not revealed. To +address these challenges that ultimately lead to better DNN Interpretation, in +this paper, we propose CAPE, a novel reformulation of CAM that provides a +unified and probabilistically meaningful assessment of the contributions of +image regions. We quantitatively and qualitatively compare CAPE with +state-of-the-art CAM methods on CUB and ImageNet benchmark datasets to +demonstrate enhanced interpretability. We also test on a cytology imaging +dataset depicting a challenging Chronic Myelomonocytic Leukemia (CMML) +diagnosis problem. Code is available at: https://github.com/AIML-MED/CAPE. + +
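+ For reference, the baseline class activation map that CAPE reformulates is the class-weighted sum of the final convolutional feature maps. The sketch below computes that standard CAM with an untrained backbone; it does not implement CAPE's probabilistic ensemble.
+
+ import torch
+ import torch.nn.functional as F
+ from torchvision.models import resnet18
+
+ model = resnet18(weights=None).eval()          # untrained backbone, for shapes only
+ features = {}
+ model.layer4.register_forward_hook(lambda m, i, o: features.update(feat=o))
+
+ x = torch.randn(1, 3, 224, 224)                # placeholder input image tensor
+ with torch.no_grad():
+     logits = model(x)
+ cls = logits.argmax(dim=1).item()
+
+ fc_w = model.fc.weight[cls]                    # (512,) classifier weights for the class
+ feat = features["feat"][0]                     # (512, 7, 7) last conv feature maps
+ cam = torch.einsum("c,chw->hw", fc_w, feat)    # weighted sum over channels
+ cam = F.relu(cam)
+ cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)
+ cam = F.interpolate(cam[None, None], size=(224, 224), mode="bilinear")[0, 0]
+ print(cam.shape)                               # upsampled relative-attention heatmap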
+
+
+
+
+ + ☆ Enhancing Human-Computer Interaction in Chest X-ray Analysis using + Vision and Language Model with Eye Gaze Patterns + + +
+ Recent advancements in Computer Assisted Diagnosis have shown promising +performance in medical imaging tasks, particularly in chest X-ray analysis. +However, the interaction between these models and radiologists has been +primarily limited to input images. This work proposes a novel approach to +enhance human-computer interaction in chest X-ray analysis using +Vision-Language Models (VLMs) enhanced with radiologists' attention by +incorporating eye gaze data alongside textual prompts. Our approach leverages +heatmaps generated from eye gaze data, overlaying them onto medical images to +highlight areas of intense radiologist's focus during chest X-ray evaluation. +We evaluate this methodology in tasks such as visual question answering, chest +X-ray report automation, error detection, and differential diagnosis. Our +results demonstrate the inclusion of eye gaze information significantly +enhances the accuracy of chest X-ray analysis. Also, the impact of eye gaze on +fine-tuning was confirmed as it outperformed other medical VLMs in all tasks +except visual question answering. This work marks the potential of leveraging +both the VLM's capabilities and the radiologist's domain knowledge to improve +the capabilities of AI models in medical imaging, paving a novel way for +Computer Assisted Diagnosis with a human-centred AI. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Utilizing Computer Vision for Continuous Monitoring of Vaccine Side + Effects in Experimental Mice + + +
+ The demand for improved efficiency and accuracy in vaccine safety assessments +is increasing. Here, we explore the application of computer vision technologies +to automate the monitoring of experimental mice for potential side effects +after vaccine administration. Traditional observation methods are +labor-intensive and lack the capability for continuous monitoring. By deploying +a computer vision system, our research aims to improve the efficiency and +accuracy of vaccine safety assessments. The methodology involves training +machine learning models on annotated video data of mice behaviors pre- and +post-vaccination. Preliminary results indicate that computer vision can effectively +identify subtle changes that signal possible side effects. Therefore, our +approach has the potential to significantly enhance the monitoring process in +vaccine trials in animals, providing a practical solution to the limitations of +human observation. + +
+
+ comment: 1 figure +
+
+
+
+
+ + ☆ LVLM-Intrepret: An Interpretability Tool for Large Vision-Language + Models + + +
+ In the rapidly evolving landscape of artificial intelligence, multi-modal +large language models are emerging as a significant area of interest. These +models, which combine various forms of data input, are becoming increasingly +popular. However, understanding their internal mechanisms remains a complex +task. Numerous advancements have been made in the field of explainability tools +and mechanisms, yet there is still much to explore. In this work, we present a +novel interactive application aimed towards understanding the internal +mechanisms of large vision-language models. Our interface is designed to +enhance the interpretability of the image patches, which are instrumental in +generating an answer, and assess the efficacy of the language model in +grounding its output in the image. With our application, a user can +systematically investigate the model and uncover system limitations, paving the +way for enhancements in system capabilities. Finally, we present a case study +of how our application can aid in understanding failure mechanisms in a popular +large multi-modal model: LLaVA. + +
+
+
+
+
+ + ☆ Ego-Motion Aware Target Prediction Module for Robust Multi-Object + Tracking IROS2024 + + +
+ Multi-object tracking (MOT) is a prominent task in computer vision with applications in autonomous driving, responsible for the simultaneous tracking of multiple object trajectories. Detection-based multi-object tracking (DBT) algorithms detect objects using an independent object detector and predict the imminent location of each target. Conventional prediction methods in DBT utilize a Kalman Filter (KF) to extrapolate the target location in the upcoming frames by assuming a constant-velocity motion model. These methods are especially hindered in autonomous driving applications due to dramatic camera motion or unavailable detections. Such limitations lead to tracking failures manifested by numerous identity switches and disrupted trajectories. In this paper, we introduce a novel KF-based prediction module called the Ego-motion Aware Target Prediction (EMAP) module, focusing on the integration of camera motion and depth information with object motion models. Our proposed method decouples the impact of camera rotational and translational velocity from the object trajectories by reformulating the Kalman Filter. This reformulation enables us to reject the disturbances caused by camera motion and maximizes the reliability of the object motion model. We integrate our module with four state-of-the-art base MOT algorithms, namely OC-SORT, Deep OC-SORT, ByteTrack, and BoT-SORT. In particular, our evaluation on the KITTI MOT dataset demonstrates that EMAP reduces the number of identity switches (IDSW) of OC-SORT and Deep OC-SORT by 73% and 21%, respectively. At the same time, it improves other performance metrics such as HOTA by more than 5%. Our source code is available at https://github.com/noyzzz/EMAP.
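+ A minimal sketch of the general idea, assuming a simple 2D constant-velocity state and a known per-frame camera rotation and translation; this is not the authors' EMAP reformulation, and the covariance handling of the ego-motion step is deliberately simplified.

```python
import numpy as np

class EgoAwareKalman:
    """Constant-velocity Kalman filter whose predict step first removes the
    camera's own motion, so the motion model only has to explain object motion.
    State: [x, y, vx, vy] in the image/ground plane (illustrative choice)."""

    def __init__(self, x0, P0, q=1.0, r=1.0):
        self.x, self.P = np.asarray(x0, float), np.asarray(P0, float)
        self.F = np.array([[1, 0, 1, 0],
                           [0, 1, 0, 1],
                           [0, 0, 1, 0],
                           [0, 0, 0, 1]], float)   # constant-velocity transition
        self.H = np.eye(2, 4)                       # only position is observed
        self.Q, self.R = q * np.eye(4), r * np.eye(2)

    def predict(self, R_cam=np.eye(2), t_cam=np.zeros(2)):
        # Compensate ego-motion: rotate/translate position, rotate velocity.
        self.x[:2] = R_cam @ self.x[:2] + t_cam
        self.x[2:] = R_cam @ self.x[2:]
        self.x = self.F @ self.x
        self.P = self.F @ self.P @ self.F.T + self.Q
        return self.x

    def update(self, z):
        S = self.H @ self.P @ self.H.T + self.R
        K = self.P @ self.H.T @ np.linalg.inv(S)
        self.x = self.x + K @ (np.asarray(z, float) - self.H @ self.x)
        self.P = (np.eye(4) - K @ self.H) @ self.P
        return self.x
```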
+
+ comment: 7 pages, 4 figures, submitted to IROS2024 +
+
+
+
+
+ + ☆ Many-to-many Image Generation with Auto-regressive Diffusion Models + + +
+ Recent advancements in image generation have made significant progress, yet +existing models present limitations in perceiving and generating an arbitrary +number of interrelated images within a broad context. This limitation becomes +increasingly critical as the demand for multi-image scenarios, such as +multi-view images and visual narratives, grows with the expansion of multimedia +platforms. This paper introduces a domain-general framework for many-to-many +image generation, capable of producing interrelated image series from a given +set of images, offering a scalable solution that obviates the need for +task-specific solutions across different multi-image scenarios. To facilitate +this, we present MIS, a novel large-scale multi-image dataset, containing 12M +synthetic multi-image samples, each with 25 interconnected images. Utilizing +Stable Diffusion with varied latent noises, our method produces a set of +interconnected images from a single caption. Leveraging MIS, we learn M2M, an +autoregressive model for many-to-many generation, where each image is modeled +within a diffusion framework. Throughout training on the synthetic MIS, the +model excels in capturing style and content from preceding images - synthetic +or real - and generates novel images following the captured patterns. +Furthermore, through task-specific fine-tuning, our model demonstrates its +adaptability to various multi-image generation tasks, including Novel View +Synthesis and Visual Procedure Generation. + +
+
+
+
+
+ + ☆ SalFoM: Dynamic Saliency Prediction with Video Foundation Models + + +
+ Recent advancements in video saliency prediction (VSP) have shown promising performance compared to the human visual system, whose emulation is the primary goal of VSP. However, current state-of-the-art models employ spatio-temporal transformers trained on limited amounts of data, hindering their generalizability and adaptation to downstream tasks. The benefits of vision foundation models present a potential solution to improve the VSP process. However, adapting image foundation models to the video domain presents significant challenges in modeling scene dynamics and capturing temporal information. To address these challenges, and as the first initiative to design a VSP model based on video foundation models, we introduce SalFoM, a novel encoder-decoder video transformer architecture. Our model employs UnMasked Teacher (UMT) as a feature extractor and presents a heterogeneous decoder that features a locality-aware spatio-temporal transformer and integrates local and global spatio-temporal information from various perspectives to produce the final saliency map. Our qualitative and quantitative experiments on the challenging VSP benchmark datasets DHF1K, Hollywood-2 and UCF-Sports demonstrate the superiority of our proposed model over the state-of-the-art methods.
+
+ comment: 15 pages, 4 figures +
+
+
+
+
+ + ☆ Behind the Veil: Enhanced Indoor 3D Scene Reconstruction with Occluded + Surfaces Completion + + +
+ In this paper, we present a novel indoor 3D reconstruction method with occluded surface completion, given a sequence of depth readings. Prior state-of-the-art (SOTA) methods only focus on the reconstruction of the visible areas in a scene, neglecting areas that are invisible due to occlusion, e.g., the contact surfaces between furniture, the occluded wall, and the floor. Our method tackles the task of completing the occluded scene surfaces, resulting in a complete 3D scene mesh. The core idea of our method is to learn a 3D geometry prior from various complete scenes to infer the occluded geometry of an unseen scene from depth measurements alone. We design a coarse-fine hierarchical octree representation coupled with a dual-decoder architecture, i.e., a Geo-decoder and a 3D Inpainter, which jointly reconstruct the complete 3D scene geometry. The Geo-decoder, with detailed representation at fine levels, is optimized online for each scene to reconstruct visible surfaces. The 3D Inpainter, with abstract representation at coarse levels, is trained offline using various scenes to complete occluded surfaces. As a result, while the Geo-decoder is specialized for an individual scene, the 3D Inpainter can be generally applied across different scenes. We evaluate the proposed method on the 3D Completed Room Scene (3D-CRS) and iTHOR datasets, significantly outperforming the SOTA methods by gains of 16.8% and 24.2% in terms of the completeness of 3D reconstruction. The 3D-CRS dataset, including a complete 3D mesh of each scene, is provided on the project webpage.
+
+
+
+
+ + ☆ Self-supervised 6-DoF Robot Grasping by Demonstration via Augmented + Reality Teleoperation System + + +
+ Most existing 6-DoF robot grasping solutions depend on strong supervision on grasp pose to ensure satisfactory performance, which can be laborious and impractical when the robot works in a restricted area. To this end, we propose a self-supervised 6-DoF grasp pose detection framework via an Augmented Reality (AR) teleoperation system that can efficiently learn human demonstrations and provide 6-DoF grasp poses without grasp pose annotations. Specifically, the system collects human demonstrations from the AR environment and contrastively learns the grasping strategy from them. In real-world experiments, the proposed system achieves satisfactory grasping performance and learns to grasp unknown objects within three demonstrations.
+
+
+
+
+ + ☆ Linear Anchored Gaussian Mixture Model for Location and Width + Computation of Objects in Thick Line Shape + + +
+ Accurate detection of the centerlines of linear objects is a challenging topic in many sensitive real-world applications such as X-ray imaging, remote sensing and lane marking detection in road traffic. Model-based approaches using Hough and Radon transforms are often used but are not recommended for thick line detection, whereas approaches based on image derivatives need further step-by-step processing, making their efficiency dependent on each step's outcome. In this paper, we aim to detect linear structures in images by considering the 3D representation of the image gray levels as a finite mixture model of statistical distributions. The latter, which we name the linear anchored Gaussian distribution, is parametrized by a scale value \sigma describing the linear structure thickness and a line equation, parametrized in turn by a radius \rho and an orientation angle \theta, describing the linear structure centerline location. The Expectation-Maximization (EM) algorithm is used for the mixture model parameter estimation, where a new paradigm, using background subtraction for the likelihood function computation, is proposed. For the EM algorithm, two \theta parameter initialization schemes are used: the first is based on a random choice of the first component of the \theta vector, whereas the second is based on the image Hessian, with a simultaneous computation of the number of mixture model components. Experiments on real-world images and synthetic images corrupted by blur and additive noise show the good performance of the proposed methods, where the algorithm using background subtraction and Hessian-based \theta initialization provides outstanding accuracy of linear structure detection despite irregular image backgrounds and the presence of blur and noise.
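+ One plausible reading of the "linear anchored Gaussian" component, sketched below under the assumption that it is a Gaussian of the perpendicular distance from a pixel to the line x cos(theta) + y sin(theta) = rho; the exact parametrisation in the paper may differ, and the E-step comment only restates the standard mixture responsibility.

```python
import numpy as np

def linear_anchored_gaussian(xy, rho, theta, sigma):
    """Plausible density: a Gaussian of the signed distance from each point
    (x, y) to the line x*cos(theta) + y*sin(theta) = rho (illustrative form)."""
    x, y = xy[..., 0], xy[..., 1]
    d = x * np.cos(theta) + y * np.sin(theta) - rho      # point-to-line distance
    return np.exp(-0.5 * (d / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))

# Standard EM E-step responsibility for a K-component mixture with weights pi_k:
#   gamma_k(x) = pi_k * f_k(x) / sum_j pi_j * f_j(x)
```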
+
+ comment: 13 pages, 13 figures +
+
+
+
+
+ + ☆ AWOL: Analysis WithOut synthesis using Language + + +
+ Many classical parametric 3D shape models exist, but creating novel shapes +with such models requires expert knowledge of their parameters. For example, +imagine creating a specific type of tree using procedural graphics or a new +kind of animal from a statistical shape model. Our key idea is to leverage +language to control such existing models to produce novel shapes. This involves +learning a mapping between the latent space of a vision-language model and the +parameter space of the 3D model, which we do using a small set of shape and +text pairs. Our hypothesis is that mapping from language to parameters allows +us to generate parameters for objects that were never seen during training. If +the mapping between language and parameters is sufficiently smooth, then +interpolation or generalization in language should translate appropriately into +novel 3D shapes. We test our approach with two very different types of +parametric shape models (quadrupeds and arboreal trees). We use a learned +statistical shape model of quadrupeds and show that we can use text to generate +new animals not present during training. In particular, we demonstrate +state-of-the-art shape estimation of 3D dogs. This work also constitutes the +first language-driven method for generating 3D trees. Finally, embedding images +in the CLIP latent space enables us to generate animals and trees directly from +images. + +
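+ A hedged sketch of the core idea of mapping a frozen vision-language embedding to shape-model parameters; the paper's actual mapping network and training details are not specified here, so the small regressor, the dimensions, and the placeholder data below are purely illustrative assumptions.

```python
import torch
import torch.nn as nn

# Hypothetical sketch: map a frozen CLIP text/image embedding (e.g. 512-D)
# to the parameter vector of an existing parametric shape model.
class LatentToParams(nn.Module):
    def __init__(self, clip_dim=512, param_dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(clip_dim, 256), nn.GELU(),
            nn.Linear(256, param_dim),      # shape-model parameters
        )

    def forward(self, clip_embedding):
        return self.net(clip_embedding)

# Training on a small set of (embedding, parameters) pairs (random placeholders here):
model = LatentToParams()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
emb, params = torch.randn(32, 512), torch.randn(32, 64)
for _ in range(100):
    loss = nn.functional.mse_loss(model(emb), params)
    opt.zero_grad(); loss.backward(); opt.step()
```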
+
+
+
+
+ + ☆ BCAmirs at SemEval-2024 Task 4: Beyond Words: A Multimodal and + Multilingual Exploration of Persuasion in Memes SemEval-2024 + + +
+ Memes, combining text and images, frequently use metaphors to convey +persuasive messages, shaping public opinion. Motivated by this, our team +engaged in SemEval-2024 Task 4, a hierarchical multi-label classification task +designed to identify rhetorical and psychological persuasion techniques +embedded within memes. To tackle this problem, we introduced a caption +generation step to assess the modality gap and the impact of additional +semantic information from images, which improved our result. Our best model +utilizes GPT-4 generated captions alongside meme text to fine-tune RoBERTa as +the text encoder and CLIP as the image encoder. It outperforms the baseline by +a large margin in all 12 subtasks. In particular, it ranked in top-3 across all +languages in Subtask 2a, and top-4 in Subtask 2b, demonstrating quantitatively +strong performance. The improvement achieved by the introduced intermediate +step is likely attributable to the metaphorical essence of images that +challenges visual encoders. This highlights the potential for improving +abstract visual semantics encoding. + +
+
+ comment: 11 pages, 5 tables, 2 figures, Proceedings of the 18th International + Workshop on Semantic Evaluation (SemEval-2024) @ NAACL 2024 +
+
+
+
+
+ + ☆ DPFT: Dual Perspective Fusion Transformer for Camera-Radar-based Object + Detection + + +
+ The perception of autonomous vehicles has to be efficient, robust, and +cost-effective. However, cameras are not robust against severe weather +conditions, lidar sensors are expensive, and the performance of radar-based +perception is still inferior to the others. Camera-radar fusion methods have +been proposed to address this issue, but these are constrained by the typical +sparsity of radar point clouds and often designed for radars without elevation +information. We propose a novel camera-radar fusion approach called Dual +Perspective Fusion Transformer (DPFT), designed to overcome these limitations. +Our method leverages lower-level radar data (the radar cube) instead of the +processed point clouds to preserve as much information as possible and employs +projections in both the camera and ground planes to effectively use radars with +elevation information and simplify the fusion with camera data. As a result, +DPFT has demonstrated state-of-the-art performance on the K-Radar dataset while +showing remarkable robustness against adverse weather conditions and +maintaining a low inference time. The code is made available as open-source +software under https://github.com/TUMFTM/DPFT. + +
+
+
+
+
+ + ☆ Skeleton Recall Loss for Connectivity Conserving and Resource Efficient + Segmentation of Thin Tubular Structures + + +
+ Accurately segmenting thin tubular structures, such as vessels, nerves, roads +or concrete cracks, is a crucial task in computer vision. Standard deep +learning-based segmentation loss functions, such as Dice or Cross-Entropy, +focus on volumetric overlap, often at the expense of preserving structural +connectivity or topology. This can lead to segmentation errors that adversely +affect downstream tasks, including flow calculation, navigation, and structural +inspection. Although current topology-focused losses mark an improvement, they +introduce significant computational and memory overheads. This is particularly +relevant for 3D data, rendering these losses infeasible for larger volumes as +well as increasingly important multi-class segmentation problems. To mitigate +this, we propose a novel Skeleton Recall Loss, which effectively addresses +these challenges by circumventing intensive GPU-based calculations with +inexpensive CPU operations. It demonstrates overall superior performance to +current state-of-the-art approaches on five public datasets for +topology-preserving segmentation, while substantially reducing computational +overheads by more than 90%. In doing so, we introduce the first multi-class +capable loss function for thin structure segmentation, excelling in both +efficiency and efficacy for topology-preservation. + +
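+ A simplified, single-class sketch of the idea of supervising recall on a cheaply computed skeleton (the paper's multi-class loss and exact formulation may differ): skeletonization runs on the CPU with scikit-image, and only the soft recall of the prediction on skeleton pixels is penalised.

```python
import numpy as np
import torch
from skimage.morphology import skeletonize

def skeleton_recall_loss(pred_prob, gt_mask, eps=1e-6):
    """Hedged sketch: 1 - soft recall of the predicted probabilities on the
    (CPU-precomputed) skeleton of the ground-truth binary mask."""
    skel = skeletonize(gt_mask.astype(bool)).astype(np.float32)   # cheap CPU op
    skel = torch.from_numpy(skel).to(pred_prob.device)
    recall = (pred_prob * skel).sum() / (skel.sum() + eps)
    return 1.0 - recall
```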
+
+
+
+
+ + ☆ MeshBrush: Painting the Anatomical Mesh with Neural Stylization for + Endoscopy + + +
+ Style transfer is a promising approach to close the sim-to-real gap in +medical endoscopy. Rendering realistic endoscopic videos by traversing +pre-operative scans (such as MRI or CT) can generate realistic simulations as +well as ground truth camera poses and depth maps. Although image-to-image (I2I) +translation models such as CycleGAN perform well, they are unsuitable for +video-to-video synthesis due to the lack of temporal consistency, resulting in +artifacts between frames. We propose MeshBrush, a neural mesh stylization +method to synthesize temporally consistent videos with differentiable +rendering. MeshBrush uses the underlying geometry of patient imaging data while +leveraging existing I2I methods. With learned per-vertex textures, the stylized +mesh guarantees consistency while producing high-fidelity outputs. We +demonstrate that mesh stylization is a promising approach for creating +realistic simulations for downstream tasks such as training and preoperative +planning. Although our method is tested and designed for ureteroscopy, its +components are transferable to general endoscopic and laparoscopic procedures. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ ASAP: Interpretable Analysis and Summarization of AI-generated Image + Patterns at Scale + + +
+ Generative image models have emerged as a promising technology to produce realistic images. Despite potential benefits, concerns grow about their misuse, particularly in generating deceptive images that could raise significant ethical, legal, and societal issues. Consequently, there is growing demand to empower users to effectively discern and comprehend patterns of AI-generated images. To this end, we developed ASAP, an interactive visualization system that automatically extracts distinct patterns of AI-generated images and allows users to interactively explore them via various views. To uncover fake patterns, ASAP introduces a novel image encoder, adapted from CLIP, which transforms images into compact "distilled" representations, enriched with information for differentiating authentic and fake images. These representations generate gradients that propagate back to the attention maps of CLIP's transformer block. This process quantifies the relative importance of each pixel to image authenticity or fakeness, exposing key deceptive patterns. ASAP enables at-scale interactive analysis of these patterns through multiple, coordinated visualizations. This includes a representation overview with innovative cell glyphs to aid in the exploration and qualitative evaluation of fake patterns across a vast array of images, as well as a pattern view that displays authenticity-indicating patterns in images and quantifies their impact. ASAP supports the analysis of cutting-edge generative models with the latest architectures, including GAN-based models like proGAN and diffusion models like the latent diffusion model. We demonstrate ASAP's usefulness through two usage scenarios using multiple fake image detection benchmark datasets, revealing its ability to identify and understand hidden patterns in AI-generated images, especially in detecting fake human faces produced by diffusion-based techniques.
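+ The gradient-to-attention mechanism can be approximated, very loosely, by a generic gradient-times-activation saliency over patch tokens; the sketch below is not ASAP's implementation, and `encoder` and `classifier` are hypothetical modules standing in for the adapted CLIP encoder and an authenticity head.

```python
import torch

# Generic gradient-based token-importance sketch (illustrative, not ASAP's code).
# `encoder` returns patch-token features, `classifier` an authenticity logit.
def token_saliency(encoder, classifier, image):
    feats = encoder(image)            # (1, tokens, dim), part of the autograd graph
    feats.retain_grad()               # keep gradients on this non-leaf tensor
    score = classifier(feats)         # real-vs-fake score
    score.sum().backward()
    return (feats * feats.grad).sum(dim=-1)   # importance per image patch
```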
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ Scaling Laws for Galaxy Images + + +
+ We present the first systematic investigation of supervised scaling laws outside of an ImageNet-like context - on images of galaxies. We use 840k galaxy images and over 100M annotations by Galaxy Zoo volunteers, comparable in scale to ImageNet-1K. We find that adding annotated galaxy images provides a power-law improvement in performance across all architectures and all tasks, while adding trainable parameters is effective only for some (typically more subjectively challenging) tasks. We then compare the downstream performance of finetuned models pretrained on ImageNet-12k alone vs. additionally pretrained on our galaxy images. We achieve an average relative error rate reduction of 31% across 5 downstream tasks of scientific interest. Our finetuned models are more label-efficient and, unlike their ImageNet-12k-pretrained equivalents, often achieve linear transfer performance equal to that of end-to-end finetuning. We find relatively modest additional downstream benefits from scaling model size, implying that scaling alone is not sufficient to address our domain gap, and suggest that practitioners with qualitatively different images might benefit more from in-domain adaptation followed by targeted downstream labelling.
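+ For readers unfamiliar with fitting such scaling laws, the sketch below fits a power law err ~ a * N^b by linear regression in log-log space; the label counts and error values are placeholder numbers, not results from the paper.

```python
import numpy as np

# Toy illustration of fitting a supervised scaling law err ~ a * N**b:
n_labels = np.array([1e4, 3e4, 1e5, 3e5, 1e6])          # hypothetical label counts
error    = np.array([0.30, 0.24, 0.19, 0.155, 0.125])   # hypothetical validation error

b, log_a = np.polyfit(np.log(n_labels), np.log(error), 1)   # slope, intercept
print(f"err ~ {np.exp(log_a):.3f} * N^({b:.3f})")            # b < 0 => power-law decay
```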
+
+ comment: 10+6 pages, 12 figures. Appendix C2 based on arxiv:2206.11927. Code, + demos, documentation at https://github.com/mwalmsley/zoobot +
+
+
+
+
+ + ☆ Translation-based Video-to-Video Synthesis + + +
+ Translation-based Video Synthesis (TVS) has emerged as a vital research area in computer vision, aiming to facilitate the transformation of videos between distinct domains while preserving both temporal continuity and underlying content features. This technique has found wide-ranging applications, encompassing video super-resolution, colorization, segmentation, and more, by extending the capabilities of traditional image-to-image translation to the temporal domain. One of the principal challenges faced in TVS is the inherent risk of introducing flickering artifacts and inconsistencies between frames during the synthesis process. This is particularly challenging due to the necessity of ensuring smooth and coherent transitions between video frames. Efforts to tackle this challenge have led to diverse strategies and algorithms aimed at mitigating these unwanted effects. This comprehensive review extensively examines the latest progress in the realm of TVS. It thoroughly investigates emerging methodologies, shedding light on the fundamental concepts and mechanisms utilized for proficient video synthesis. This survey also illuminates their inherent strengths, limitations, appropriate applications, and potential avenues for future development.
+
+ comment: 25 pages, 9 figures +
+
+
+
+
+ + ☆ JDEC: JPEG Decoding via Enhanced Continuous Cosine Coefficients + + +
+ We propose a practical approach to JPEG image decoding, utilizing a local implicit neural representation with continuous cosine formulation. The JPEG algorithm significantly quantizes discrete cosine transform (DCT) spectra to achieve a high compression rate, inevitably resulting in quality degradation while encoding an image. We have designed a continuous cosine spectrum estimator that addresses this quality degradation by restoring the distorted spectrum. By leveraging local DCT formulations, our network is able to exploit dequantization and upsampling simultaneously. Our proposed model enables decoding compressed images directly across different quality factors using a single pre-trained model, without relying on a conventional JPEG decoder. As a result, our proposed network achieves state-of-the-art performance in flexible color image JPEG artifact removal tasks. Our source code is available at https://github.com/WooKyoungHan/JDEC.
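+ For context, the sketch below shows the classical per-block JPEG decode (dequantisation followed by the 2D inverse DCT plus level shift) that a continuous-spectrum estimator of this kind aims to improve on; it is standard JPEG background, not the proposed network.

```python
import numpy as np
from scipy.fftpack import idct

def decode_block(quantized_coeffs, quant_table):
    """Classical JPEG 8x8 block decode: dequantise the DCT coefficients,
    apply the 2D inverse DCT, then undo the -128 level shift."""
    coeffs = quantized_coeffs * quant_table                       # dequantisation
    block = idct(idct(coeffs, axis=0, norm='ortho'), axis=1, norm='ortho')
    return block + 128.0
```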
+
+
+
+
+ + ♻ ☆ FreeZe: Training-free zero-shot 6D pose estimation with geometric and + vision foundation models + + +
+ Estimating the 6D pose of objects unseen during training is highly desirable +yet challenging. Zero-shot object 6D pose estimation methods address this +challenge by leveraging additional task-specific supervision provided by +large-scale, photo-realistic synthetic datasets. However, their performance +heavily depends on the quality and diversity of rendered data and they require +extensive training. In this work, we show how to tackle the same task but +without training on specific data. We propose FreeZe, a novel solution that +harnesses the capabilities of pre-trained geometric and vision foundation +models. FreeZe leverages 3D geometric descriptors learned from unrelated 3D +point clouds and 2D visual features learned from web-scale 2D images to +generate discriminative 3D point-level descriptors. We then estimate the 6D +pose of unseen objects by 3D registration based on RANSAC. We also introduce a +novel algorithm to solve ambiguous cases due to geometrically symmetric objects +that is based on visual features. We comprehensively evaluate FreeZe across the +seven core datasets of the BOP Benchmark, which include over a hundred 3D +objects and 20,000 images captured in various scenarios. FreeZe consistently +outperforms all state-of-the-art approaches, including competitors extensively +trained on synthetic 6D pose estimation data. Code will be publicly available +at https://andreacaraffa.github.io/freeze. + +
+
+
+
+
+ + ♻ ☆ Total Selfie: Generating Full-Body Selfies + + +
+ We present a method to generate full-body selfies from photographs originally taken at arm's length. Because self-captured photos are typically taken close up, they have a limited field of view and exaggerated perspective that distorts facial shapes. We instead seek to generate the photo someone else would take of you from a few feet away. Our approach takes as input four selfies of your face and body, a background image, and generates a full-body selfie in a desired target pose. We introduce a novel diffusion-based approach to combine all of this information into high-quality, well-composed photos of you with the desired pose and background.
+
+ comment: Project page: + https://homes.cs.washington.edu/~boweiche/project_page/totalselfie/ +
+
+
+
+
+ + ♻ ☆ G3DR: Generative 3D Reconstruction in ImageNet CVPR 2024 + + +
+ We introduce a novel 3D generative method, Generative 3D Reconstruction (G3DR) in ImageNet, capable of generating diverse and high-quality 3D objects from single images, addressing the limitations of existing methods. At the heart of our framework is a novel depth regularization technique that enables the generation of scenes with high geometric fidelity. G3DR also leverages a pretrained language-vision model, such as CLIP, to enable reconstruction in novel views and improve the visual realism of generations. Additionally, G3DR designs a simple but effective sampling procedure to further improve the quality of generations. G3DR offers diverse and efficient 3D asset generation based on class or text conditioning. Despite its simplicity, G3DR is able to beat state-of-the-art methods, improving over them by up to 22% in perceptual metrics and 90% in geometry scores, while needing only half of the training time. Code is available at https://github.com/preddy5/G3DR
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Quantifying and Mitigating Unimodal Biases in Multimodal Large Language + Models: A Causal Perspective + + +
+ Recent advancements in Large Language Models (LLMs) have facilitated the +development of Multimodal LLMs (MLLMs). Despite their impressive capabilities, +MLLMs often suffer from an over-reliance on unimodal biases (e.g., language +bias and vision bias), leading to incorrect answers in complex multimodal +tasks. To investigate this issue, we propose a causal framework to interpret +the biases in Visual Question Answering (VQA) problems. Within our framework, +we devise a causal graph to elucidate the predictions of MLLMs on VQA problems, +and assess the causal effect of biases through an in-depth causal analysis. +Motivated by the causal graph, we introduce a novel MORE dataset, consisting of +12,000 VQA instances. This dataset is designed to challenge MLLMs' abilities, +necessitating multi-hop reasoning and the surmounting of unimodal biases. +Furthermore, we propose two strategies to mitigate unimodal biases and enhance +MLLMs' reasoning capabilities, including a Decompose-Verify-Answer (DeVA) +framework for limited-access MLLMs and the refinement of open-source MLLMs +through fine-tuning. Extensive quantitative and qualitative experiments offer +valuable insights for future research. Our project page is at +https://opencausalab.github.io/MORE. + +
+
+
+
+
+ + ♻ ☆ Learning Object State Changes in Videos: An Open-World Perspective CVPR 2024 + + +
+ Object State Changes (OSCs) are pivotal for video understanding. While humans +can effortlessly generalize OSC understanding from familiar to unknown objects, +current approaches are confined to a closed vocabulary. Addressing this gap, we +introduce a novel open-world formulation for the video OSC problem. The goal is +to temporally localize the three stages of an OSC -- the object's initial +state, its transitioning state, and its end state -- whether or not the object +has been observed during training. Towards this end, we develop VidOSC, a +holistic learning approach that: (1) leverages text and vision-language models +for supervisory signals to obviate manually labeling OSC training data, and (2) +abstracts fine-grained shared state representations from objects to enhance +generalization. Furthermore, we present HowToChange, the first open-world +benchmark for video OSC localization, which offers an order of magnitude +increase in the label space and annotation volume compared to the best existing +benchmark. Experimental results demonstrate the efficacy of our approach, in +both traditional closed-world and open-world scenarios. + +
+
+ comment: Accepted by CVPR 2024, Project website: + https://vision.cs.utexas.edu/projects/VidOSC/ +
+
+
+
+
+ + ♻ ☆ AddSR: Accelerating Diffusion-based Blind Super-Resolution with + Adversarial Diffusion Distillation + + +
+ Blind super-resolution methods based on stable diffusion showcase formidable +generative capabilities in reconstructing clear high-resolution images with +intricate details from low-resolution inputs. However, their practical +applicability is often hampered by poor efficiency, stemming from the +requirement of thousands or hundreds of sampling steps. Inspired by the +efficient text-to-image approach adversarial diffusion distillation (ADD), we +design AddSR to address this issue by incorporating the ideas of both +distillation and ControlNet. Specifically, we first propose a prediction-based +self-refinement strategy to provide high-frequency information in the student +model output with marginal additional time cost. Furthermore, we refine the +training process by employing HR images, rather than LR images, to regulate the +teacher model, providing a more robust constraint for distillation. Second, we +introduce a timestep-adapting loss to address the perception-distortion +imbalance problem introduced by ADD. Extensive experiments demonstrate our +AddSR generates better restoration results, while achieving faster speed than +previous SD-based state-of-the-art models (e.g., 7x faster than SeeSR). + +
+
+
+
+
+ + ♻ ☆ Your Student is Better Than Expected: Adaptive Teacher-Student + Collaboration for Text-Conditional Diffusion Models CVPR2024 + + +
+ Knowledge distillation methods have recently shown to be a promising +direction to speedup the synthesis of large-scale diffusion models by requiring +only a few inference steps. While several powerful distillation methods were +recently proposed, the overall quality of student samples is typically lower +compared to the teacher ones, which hinders their practical usage. In this +work, we investigate the relative quality of samples produced by the teacher +text-to-image diffusion model and its distilled student version. As our main +empirical finding, we discover that a noticeable portion of student samples +exhibit superior fidelity compared to the teacher ones, despite the +"approximate" nature of the student. Based on this finding, we propose an +adaptive collaboration between student and teacher diffusion models for +effective text-to-image synthesis. Specifically, the distilled model produces +the initial sample, and then an oracle decides whether it needs further +improvements with a slow teacher model. Extensive experiments demonstrate that +the designed pipeline surpasses state-of-the-art text-to-image alternatives for +various inference budgets in terms of human preference. Furthermore, the +proposed approach can be naturally used in popular applications such as +text-guided image editing and controllable generation. + +
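+ The routing idea reads as a simple conditional pipeline; the hedged sketch below uses hypothetical `student`, `teacher`, and `quality_score` callables (and an assumed `init=` refinement argument) to show the control flow only, not the paper's oracle.

```python
# Hedged sketch of adaptive student/teacher routing for text-to-image synthesis.
# All callables are hypothetical placeholders, not the paper's API.
def generate(prompt, student, teacher, quality_score, threshold=0.5):
    image = student(prompt)                 # few-step distilled model (fast)
    if quality_score(image, prompt) >= threshold:
        return image                        # good enough: skip the slow teacher
    return teacher(prompt, init=image)      # otherwise refine with the teacher
```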
+
+ comment: CVPR2024 camera ready +
+
+
+
+
+ + ♻ ☆ ElasticLaneNet: An Efficient Geometry-Flexible Approach for Lane + Detection + + +
+ The task of lane detection involves identifying the boundaries of driving areas in real-time. Recognizing lanes with variable and complex geometric structures remains a challenge. In this paper, we explore a novel and flexible way of implicitly representing lanes, named the Elastic Lane Map (ELM), and introduce an efficient physics-informed end-to-end lane detection framework, namely ElasticLaneNet (Elastic interaction energy-informed Lane detection Network). The approach considers predicted lanes as moving zero-contours on the flexibly shaped ELM that are attracted to the ground truth, guided by an elastic interaction energy loss function (EIE loss). Our framework integrates global information and low-level features well. The method performs well in complex lane scenarios, including those with large curvature, weak geometry features at intersections, complicated cross lanes, Y-shaped lanes, dense lanes, etc. We apply our approach on three datasets: SDLane, CULane, and TuSimple. The results demonstrate the exceptional performance of our method, with state-of-the-art results on the structurally diverse SDLane: an F1-score of 89.51, a recall of 87.50, and a precision of 91.61, with fast inference speed.
+
+
+
+
+ + ♻ ☆ Dynamic LiDAR Re-simulation using Compositional Neural Fields + + +
+ We introduce DyNFL, a novel neural field-based approach for high-fidelity +re-simulation of LiDAR scans in dynamic driving scenes. DyNFL processes LiDAR +measurements from dynamic environments, accompanied by bounding boxes of moving +objects, to construct an editable neural field. This field, comprising +separately reconstructed static background and dynamic objects, allows users to +modify viewpoints, adjust object positions, and seamlessly add or remove +objects in the re-simulated scene. A key innovation of our method is the neural +field composition technique, which effectively integrates reconstructed neural +assets from various scenes through a ray drop test, accounting for occlusions +and transparent surfaces. Our evaluation with both synthetic and real-world +environments demonstrates that DyNFL substantially improves dynamic scene LiDAR +simulation, offering a combination of physical fidelity and flexible editing +capabilities. + +
+
+ comment: Project page: https://shengyuh.github.io/dynfl +
+
+
+
+
+ + ♻ ☆ Three Heads Are Better Than One: Complementary Experts for Long-Tailed + Semi-supervised Learning AAAI2024 + + +
+ We address the challenging problem of Long-Tailed Semi-Supervised Learning +(LTSSL) where labeled data exhibit imbalanced class distribution and unlabeled +data follow an unknown distribution. Unlike in balanced SSL, the generated +pseudo-labels are skewed towards head classes, intensifying the training bias. +Such a phenomenon is even amplified as more unlabeled data will be mislabeled +as head classes when the class distribution of labeled and unlabeled datasets +are mismatched. To solve this problem, we propose a novel method named +ComPlementary Experts (CPE). Specifically, we train multiple experts to model +various class distributions, each of them yielding high-quality pseudo-labels +within one form of class distribution. Besides, we introduce Classwise Batch +Normalization for CPE to avoid performance degradation caused by feature +distribution mismatch between head and non-head classes. CPE achieves +state-of-the-art performances on CIFAR-10-LT, CIFAR-100-LT, and STL-10-LT +dataset benchmarks. For instance, on CIFAR-10-LT, CPE improves test accuracy by +over 2.22% compared to baselines. Code is available at +https://github.com/machengcheng2016/CPE-LTSSL. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ Elastic Interaction Energy-Informed Real-Time Traffic Scene Perception + + +
+ Urban segmentation and lane detection are two important tasks for traffic scene perception. Accuracy and fast inference speed of visual perception are crucial for autonomous driving safety. Fine and complex geometric objects are the most challenging but important recognition targets in traffic scenes, such as pedestrians, traffic signs and lanes. In this paper, a simple and efficient topology-aware energy loss function-based network training strategy named EIEGSeg is proposed. EIEGSeg is designed for multi-class segmentation on real-time traffic scene perception. To be specific, the convolutional neural network (CNN) extracts image features and produces multiple outputs, and the elastic interaction energy loss function (EIEL) drives the predictions toward the ground truth until they completely overlap. Our strategy performs well especially on fine-scale structures, i.e., small or irregularly shaped objects can be identified more accurately, and discontinuity issues on slender objects are alleviated. We quantitatively and qualitatively analyze our method on three traffic datasets, including the urban scene segmentation dataset Cityscapes and the lane detection datasets TuSimple and CULane. Our results demonstrate that EIEGSeg consistently improves performance, especially on real-time, lightweight networks that are better suited for autonomous driving.
+
+
+
+
+ + ♻ ☆ Strengthening Multimodal Large Language Model with Bootstrapped + Preference Optimization + + +
+ Multimodal Large Language Models (MLLMs) excel in generating responses based +on visual inputs. However, they often suffer from a bias towards generating +responses similar to their pretraining corpus, overshadowing the importance of +visual information. We treat this bias as a "preference" for pretraining +statistics, which hinders the model's grounding in visual input. To mitigate +this issue, we propose Bootstrapped Preference Optimization (BPO), which +conducts preference learning with datasets containing negative responses +bootstrapped from the model itself. Specifically, we propose the following two +strategies: 1) using distorted image inputs to the MLLM for eliciting responses +that contain signified pretraining bias; 2) leveraging text-based LLM to +explicitly inject erroneous but common elements into the original response. +Those undesirable responses are paired with original annotated responses from +the datasets to construct the preference dataset, which is subsequently +utilized to perform preference learning. Our approach effectively suppresses +pretrained LLM bias, enabling enhanced grounding in visual inputs. Extensive +experimentation demonstrates significant performance improvements across +multiple benchmarks, advancing the state-of-the-art in multimodal +conversational systems. + +
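+ A hedged sketch of how one such preference pair could be assembled for DPO-style training, assuming a hypothetical `mllm.generate` interface and a `distort` image transform; the paper's exact data construction may differ.

```python
# Build one preference pair: the annotated answer is "chosen", while a response
# elicited from a distorted image (revealing pretraining bias) is "rejected".
# All names and interfaces here are illustrative assumptions.
def build_preference_pair(mllm, image, question, gold_answer, distort):
    rejected = mllm.generate(distort(image), question)   # bias-revealing answer
    return {"prompt": (image, question),
            "chosen": gold_answer,
            "rejected": rejected}
```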
+
+
+
+
+ + ♻ ☆ Isometric Multi-Shape Matching + + +
+ Finding correspondences between shapes is a fundamental problem in computer +vision and graphics, which is relevant for many applications, including 3D +reconstruction, object tracking, and style transfer. The vast majority of +correspondence methods aim to find a solution between pairs of shapes, even if +multiple instances of the same class are available. While isometries are often +studied in shape correspondence problems, they have not been considered +explicitly in the multi-matching setting. This paper closes this gap by +proposing a novel optimisation formulation for isometric multi-shape matching. +We present a suitable optimisation algorithm for solving our formulation and +provide a convergence and complexity analysis. Our algorithm obtains +multi-matchings that are by construction provably cycle-consistent. We +demonstrate the superior performance of our method on various datasets and set +the new state-of-the-art in isometric multi-shape matching. + +
+
+
+
+
+ + ♻ ☆ Semi-supervised Active Learning for Video Action Detection AAAI + + +
+ In this work, we focus on label-efficient learning for video action detection. We develop a novel semi-supervised active learning approach which utilizes both labeled as well as unlabeled data along with informative sample selection for action detection. Video action detection requires spatio-temporal localization along with classification, which poses several challenges for both active learning informative sample selection as well as semi-supervised learning pseudo label generation. First, we propose NoiseAug, a simple augmentation strategy which effectively selects informative samples for video action detection. Next, we propose fft-attention, a novel technique based on high-pass filtering which enables effective utilization of pseudo labels for SSL in video action detection by emphasizing the relevant activity region within a video. We evaluate the proposed approach on three different benchmark datasets, UCF-101-24, JHMDB-21, and Youtube-VOS. First, we demonstrate its effectiveness on video action detection, where the proposed approach outperforms prior works in semi-supervised and weakly-supervised learning along with several baseline approaches on both UCF101-24 and JHMDB-21. Next, we also show its effectiveness on Youtube-VOS for video object segmentation, demonstrating its generalization capability for other dense prediction tasks in videos. The code and models are publicly available at: https://github.com/AKASH2907/semi-sup-active-learning.
+
+ comment: AAAI Conference on Artificial Intelligence, Main Technical Track + (AAAI), 2024, Code: https://github.com/AKASH2907/semi-sup-active-learning +
+
+
+
+
+ + ♻ ☆ Text-Driven Image Editing via Learnable Regions CVPR 2024 + + +
+ Language has emerged as a natural interface for image editing. In this paper, +we introduce a method for region-based image editing driven by textual prompts, +without the need for user-provided masks or sketches. Specifically, our +approach leverages an existing pre-trained text-to-image model and introduces a +bounding box generator to identify the editing regions that are aligned with +the textual prompts. We show that this simple approach enables flexible editing +that is compatible with current image generation models, and is able to handle +complex prompts featuring multiple objects, complex sentences, or lengthy +paragraphs. We conduct an extensive user study to compare our method against +state-of-the-art methods. The experiments demonstrate the competitive +performance of our method in manipulating images with high fidelity and realism +that correspond to the provided language descriptions. Our project webpage can +be found at: https://yuanze-lin.me/LearnableRegions_page. + +
+
+ comment: Accepted to CVPR 2024 Project webpage: + https://yuanze-lin.me/LearnableRegions_page +
+
+
+
+
+ + ♻ ☆ SIGMA: Scale-Invariant Global Sparse Shape Matching + + +
+ We propose a novel mixed-integer programming (MIP) formulation for generating +precise sparse correspondences for highly non-rigid shapes. To this end, we +introduce a projected Laplace-Beltrami operator (PLBO) which combines intrinsic +and extrinsic geometric information to measure the deformation quality induced +by predicted correspondences. We integrate the PLBO, together with an +orientation-aware regulariser, into a novel MIP formulation that can be solved +to global optimality for many practical problems. In contrast to previous +methods, our approach is provably invariant to rigid transformations and global +scaling, initialisation-free, has optimality guarantees, and scales to high +resolution meshes with (empirically observed) linear time. We show +state-of-the-art results for sparse non-rigid matching on several challenging +3D datasets, including data with inconsistent meshing, as well as applications +in mesh-to-point-cloud matching. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ Conquering the Communication Constraints to Enable Large Pre-Trained + Models in Federated Learning + + +
+ Federated learning (FL) has emerged as a promising paradigm for enabling the +collaborative training of models without centralized access to the raw data on +local devices. In the typical FL paradigm (e.g., FedAvg), model weights are +sent to and from the server each round to participating clients. Recently, the +use of small pre-trained models has been shown effective in federated learning +optimization and improving convergence. However, recent state-of-the-art +pre-trained models are getting more capable but also have more parameters. In +conventional FL, sharing the enormous model weights can quickly put a massive +communication burden on the system, especially if more capable models are +employed. Can we find a solution to enable those strong and readily-available +pre-trained models in FL to achieve excellent performance while simultaneously +reducing the communication burden? To this end, we investigate the use of +parameter-efficient fine-tuning in federated learning and thus introduce a new +framework: FedPEFT. Specifically, we systemically evaluate the performance of +FedPEFT across a variety of client stability, data distribution, and +differential privacy settings. By only locally tuning and globally sharing a +small portion of the model weights, significant reductions in the total +communication overhead can be achieved while maintaining competitive or even +better performance in a wide range of federated learning scenarios, providing +insight into a new paradigm for practical and effective federated systems. + +
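+ The communication saving rests on shipping only the small trainable subset of weights; the sketch below, a generic PyTorch illustration rather than the FedPEFT codebase, extracts and reloads just the parameters left trainable (e.g. adapters), with the frozen backbone never leaving the client.

```python
import torch

def trainable_state_dict(model: torch.nn.Module):
    """Collect only the parameters that are still trainable (e.g. adapters/biases)."""
    return {k: v.detach().cpu() for k, v in model.named_parameters()
            if v.requires_grad}

def load_trainable(model: torch.nn.Module, small_state):
    """Load the small shared subset; the frozen backbone weights stay local."""
    model.load_state_dict(small_state, strict=False)

# Per round (sketch): each client trains only the adapters, uploads
# trainable_state_dict(model); the server averages these small dicts (FedAvg)
# and broadcasts the result back via load_trainable.
```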
+
+
+
+
+ + ♻ ☆ Towards Seamless Adaptation of Pre-trained Models for Visual Place + Recognition ICLR2024 + + +
+ Recent studies show that vision models pre-trained in generic visual learning tasks with large-scale data can provide useful feature representations for a wide range of visual perception problems. However, few attempts have been made to exploit pre-trained foundation models in visual place recognition (VPR). Due to the inherent difference in training objectives and data between the tasks of model pre-training and VPR, how to bridge the gap and fully unleash the capability of pre-trained models for VPR is still a key issue to address. To this end, we propose a novel method to realize seamless adaptation of pre-trained models for VPR. Specifically, to obtain both global and local features that focus on salient landmarks for discriminating places, we design a hybrid adaptation method to achieve both global and local adaptation efficiently, in which only lightweight adapters are tuned without adjusting the pre-trained model. Besides, to guide effective adaptation, we propose a mutual nearest neighbor local feature loss, which ensures proper dense local features are produced for local matching and avoids time-consuming spatial verification in re-ranking. Experimental results show that our method outperforms the state-of-the-art methods with less training data and training time, and uses only about 3% of the retrieval runtime of the two-stage VPR methods with RANSAC-based spatial verification. It ranks 1st on the MSLS challenge leaderboard (at the time of submission). The code is released at https://github.com/Lu-Feng/SelaVPR.
+
+ comment: ICLR2024 +
+
+
+
+
+ + ♻ ☆ NEAT: Distilling 3D Wireframes from Neural Attraction Fields CVPR 2024 + + +
+ This paper studies the problem of structured 3D reconstruction using wireframes that consist of line segments and junctions, focusing on the computation of structured boundary geometries of scenes. Instead of leveraging matching-based solutions from 2D wireframes (or line segments) for 3D wireframe reconstruction as done in prior art, we present NEAT, a rendering-distilling formulation using neural fields to represent 3D line segments with 2D observations, and bipartite matching for perceiving and distilling a sparse set of 3D global junctions. The proposed NEAT enjoys the joint optimization of the neural fields and the global junctions from scratch, using view-dependent 2D observations without precomputed cross-view feature matching. Comprehensive experiments on the DTU and BlendedMVS datasets demonstrate NEAT's superiority over state-of-the-art alternatives for 3D wireframe reconstruction. Moreover, the 3D global junctions distilled by NEAT are a better initialization than SfM points for the recently emerged 3D Gaussian Splatting for high-fidelity novel view synthesis, using about 20 times fewer initial 3D points. Project page: https://xuenan.net/neat.
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ RadEdit: stress-testing biomedical vision models via diffusion image + editing + + +
+ Biomedical imaging datasets are often small and biased, meaning that +real-world performance of predictive models can be substantially lower than +expected from internal testing. This work proposes using generative image +editing to simulate dataset shifts and diagnose failure modes of biomedical +vision models; this can be used in advance of deployment to assess readiness, +potentially reducing cost and patient harm. Existing editing methods can +produce undesirable changes, with spurious correlations learned due to the +co-occurrence of disease and treatment interventions, limiting practical +applicability. To address this, we train a text-to-image diffusion model on +multiple chest X-ray datasets and introduce a new editing method RadEdit that +uses multiple masks, if present, to constrain changes and ensure consistency in +the edited images. We consider three types of dataset shifts: acquisition +shift, manifestation shift, and population shift, and demonstrate that our +approach can diagnose failures and quantify model robustness without additional +data collection, complementing more qualitative tools for explainable AI. + +
+
+
+
+
+ + ♻ ☆ DriftRec: Adapting diffusion models to blind JPEG restoration + + +
+ In this work, we utilize the high-fidelity generation abilities of diffusion +models to solve blind JPEG restoration at high compression levels. We propose +an elegant modification of the forward stochastic differential equation of +diffusion models to adapt them to this restoration task and name our method +DriftRec. Comparing DriftRec against an $L_2$ regression baseline with the same +network architecture and state-of-the-art techniques for JPEG restoration, we +show that our approach can escape the tendency of other methods to generate +blurry images, and recovers the distribution of clean images significantly more +faithfully. For this, only a dataset of clean/corrupted image pairs and no +knowledge about the corruption operation is required, enabling wider +applicability to other restoration tasks. In contrast to other conditional and +unconditional diffusion models, we utilize the idea that the distributions of +clean and corrupted images are much closer to each other than each is to the +usual Gaussian prior of the reverse process in diffusion models. Our approach +therefore requires only low levels of added noise and needs comparatively few +sampling steps even without further optimizations. We show that DriftRec +naturally generalizes to realistic and difficult scenarios such as unaligned +double JPEG compression and blind restoration of JPEGs found online, without +having encountered such examples during training. + +
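+ Assuming the forward process is a mean-reverting SDE whose drift pulls the clean image toward its corrupted counterpart (an interpretation of the description above, with placeholder parameters), an Euler-Maruyama simulation of that forward process looks like the sketch below.

```python
import numpy as np

# Illustrative Euler-Maruyama simulation of a mean-reverting forward SDE:
# the state drifts from the clean image x0 toward the corrupted image y
# instead of toward a Gaussian prior. Parameters are placeholders.
def forward_sde(x0, y, theta=1.5, sigma=0.1, steps=100, dt=0.01):
    x = x0.astype(np.float64).copy()
    for _ in range(steps):
        x += theta * (y - x) * dt + sigma * np.sqrt(dt) * np.random.randn(*x.shape)
    return x
```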
+
+ comment: (C) 2024 IEEE. Personal use of this material is permitted. Permission + from IEEE must be obtained for all other uses, in any current or future + media, including reprinting/republishing this material for advertising or + promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works +
+
+
+
+
+ + ♻ ☆ Repurposing Diffusion-Based Image Generators for Monocular Depth + Estimation CVPR 2024 + + +
+ Monocular depth estimation is a fundamental computer vision task. Recovering +3D depth from a single image is geometrically ill-posed and requires scene +understanding, so it is not surprising that the rise of deep learning has led +to a breakthrough. The impressive progress of monocular depth estimators has +mirrored the growth in model capacity, from relatively modest CNNs to large +Transformer architectures. Still, monocular depth estimators tend to struggle +when presented with images with unfamiliar content and layout, since their +knowledge of the visual world is restricted by the data seen during training, +and challenged by zero-shot generalization to new domains. This motivates us to +explore whether the extensive priors captured in recent generative diffusion +models can enable better, more generalizable depth estimation. We introduce +Marigold, a method for affine-invariant monocular depth estimation that is +derived from Stable Diffusion and retains its rich prior knowledge. The +estimator can be fine-tuned in a couple of days on a single GPU using only +synthetic training data. It delivers state-of-the-art performance across a wide +range of datasets, including over 20% performance gains in specific cases. +Project page: https://marigoldmonodepth.github.io. + +
+
+ comment: CVPR 2024 camera ready +
+
+
+
+
+ + ♻ ☆ Learnable Weight Initialization for Volumetric Medical Image + Segmentation + + +
+ Hybrid volumetric medical image segmentation models, combining the advantages +of local convolution and global attention, have recently received considerable +attention. While mainly focusing on architectural modifications, most existing +hybrid approaches still use conventional data-independent weight initialization +schemes which restrict their performance due to ignoring the inherent +volumetric nature of the medical data. To address this issue, we propose a +learnable weight initialization approach that utilizes the available medical +training data to effectively learn the contextual and structural cues via the +proposed self-supervised objectives. Our approach is easy to integrate into any +hybrid model and requires no external training data. Experiments on multi-organ +and lung cancer segmentation tasks demonstrate the effectiveness of our +approach, leading to state-of-the-art segmentation performance. Our proposed +data-dependent initialization approach performs favorably as compared to the +Swin-UNETR model pretrained using large-scale datasets on multi-organ +segmentation task. Our source code and models are available at: +https://github.com/ShahinaKK/LWI-VMS. + +
+
+ comment: Accepted at Elsevier AI in Medicine Journal +
+
+
+
+
+ + ♻ ☆ Evaluating GPT-4 with Vision on Detection of Radiological Findings on + Chest Radiographs + + +
+ The study examines the application of GPT-4V, a multi-modal large language model equipped with visual recognition, in detecting radiological findings from a set of 100 chest radiographs. The results suggest that GPT-4V is currently not ready for real-world diagnostic use in interpreting chest radiographs.
+
+
+
+
+ + ♻ ☆ ResNet with Integrated Convolutional Block Attention Module for Ship + Classification Using Transfer Learning on Optical Satellite Imagery + + +
+ This study proposes a novel transfer learning framework for effective ship +classification using high-resolution optical remote sensing satellite imagery. +The framework is based on the deep convolutional neural network model ResNet50 +and incorporates the Convolutional Block Attention Module (CBAM) to enhance +performance. CBAM enables the model to attend to salient features in the +images, allowing it to better discriminate between subtle differences between +ships and backgrounds. Furthermore, this study adopts a transfer learning +approach tailored for accurately classifying diverse types of ships by +fine-tuning a pre-trained model for the specific task. Experimental results +demonstrate the efficacy of the proposed framework in ship classification using +optical remote sensing imagery, achieving a high classification accuracy of 94% +across 5 classes, outperforming existing methods. This research holds potential +applications in maritime surveillance and management, illegal fishing +detection, and maritime traffic monitoring. + +
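+ For reference, a compact PyTorch sketch of a standard CBAM block (channel attention followed by spatial attention) of the kind described as being inserted into the ResNet50 backbone; the reduction ratio and kernel size are the usual defaults, not values confirmed by the paper.

```python
import torch
import torch.nn as nn

class CBAM(nn.Module):
    """Simplified CBAM: channel attention, then spatial attention."""
    def __init__(self, channels, reduction=16, kernel_size=7):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(channels, channels // reduction), nn.ReLU(),
            nn.Linear(channels // reduction, channels))
        self.conv = nn.Conv2d(2, 1, kernel_size, padding=kernel_size // 2)

    def forward(self, x):
        b, c, _, _ = x.shape
        # Channel attention from average- and max-pooled descriptors.
        avg = self.mlp(x.mean(dim=(2, 3)))
        mx = self.mlp(x.amax(dim=(2, 3)))
        x = x * torch.sigmoid(avg + mx).view(b, c, 1, 1)
        # Spatial attention from channel-wise average and max maps.
        s = torch.cat([x.mean(dim=1, keepdim=True),
                       x.amax(dim=1, keepdim=True)], dim=1)
        return x * torch.sigmoid(self.conv(s))
```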
+
+
+
+
+ + ♻ ☆ ReCoRe: Regularized Contrastive Representation Learning of World Model CVPR 2024 + + +
+ While recent model-free Reinforcement Learning (RL) methods have demonstrated human-level effectiveness in gaming environments, their success in everyday tasks like visual navigation has been limited, particularly under significant appearance variations. This limitation arises from (i) poor sample efficiency and (ii) over-fitting to training scenarios. To address these challenges, we present a world model that learns invariant features using (i) contrastive unsupervised learning and (ii) an intervention-invariant regularizer. Learning an explicit representation of the world dynamics, i.e., a world model, improves sample efficiency, while contrastive learning implicitly enforces learning of invariant features, which improves generalization. However, the naïve integration of a contrastive loss into world models is not sufficient, as world-model-based RL methods independently optimize representation learning and agent policy. To overcome this issue, we propose an intervention-invariant regularizer in the form of an auxiliary task such as depth prediction, image denoising, image segmentation, etc., that explicitly enforces invariance to style interventions. Our method outperforms current state-of-the-art model-based and model-free RL methods and significantly improves on out-of-distribution point navigation tasks evaluated on the iGibson benchmark. With only visual observations, we further demonstrate that our approach outperforms recent language-guided foundation models for point navigation, which is essential for deployment on robots with limited computation capabilities. Finally, we demonstrate that our proposed model excels at the sim-to-real transfer of its perception module on the Gibson benchmark.
+
+ comment: Accepted at CVPR 2024. arXiv admin note: text overlap with + arXiv:2209.14932 +
+
+
+
+
+ + ♻ ☆ AGFSync: Leveraging AI-Generated Feedback for Preference Optimization in + Text-to-Image Generation + + +
+ Text-to-Image (T2I) diffusion models have achieved remarkable success in
+image generation. Despite this progress, challenges remain in prompt-following
+ability and image quality, as well as in the lack of high-quality datasets,
+which are essential for refining these models. As acquiring labeled data is
+costly, we introduce AGFSync, a framework that enhances T2I diffusion models
+through Direct Preference Optimization (DPO) in a fully AI-driven approach.
+AGFSync utilizes Vision-Language Models (VLMs) to assess image quality across
+style, coherence, and aesthetics, generating feedback data within an AI-driven
+loop. By applying AGFSync to leading T2I models such as SD v1.4, v1.5, and
+SDXL, our extensive experiments on the TIFA dataset demonstrate notable
+improvements in VQA scores, aesthetic evaluations, and performance on the HPSv2
+benchmark, consistently outperforming the base models. AGFSync's method of
+refining T2I diffusion models paves the way for scalable alignment techniques.
+
+
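+ As a rough illustration of the AI-driven feedback loop described above, the sketch below (not the authors' code) builds a DPO preference pair from VLM quality scores; the scoring callable, the best-versus-worst pairing rule, and the beta value are assumptions.
+ <pre>
+ import torch.nn.functional as F
+
+ def build_preference_pair(prompt, candidates, vlm_score):
+     # Rank candidate images for one prompt by a VLM-derived quality score
+     # (e.g., a weighted mix of style, coherence, and aesthetics) and return
+     # the best and worst images as a (chosen, rejected) pair for DPO.
+     scored = sorted(candidates, key=lambda img: vlm_score(prompt, img), reverse=True)
+     return scored[0], scored[-1]
+
+ def dpo_loss(logp_chosen, logp_rejected, ref_logp_chosen, ref_logp_rejected, beta=0.1):
+     # Standard DPO objective on log-likelihoods under the trained model and
+     # a frozen reference model; minimizing it prefers the chosen sample.
+     margin = (logp_chosen - ref_logp_chosen) - (logp_rejected - ref_logp_rejected)
+     return -F.logsigmoid(beta * margin).mean()
+ </pre>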
+
+
+
+
+ + ♻ ☆ FreeMan: Towards Benchmarking 3D Human Pose Estimation under Real-World + Conditions CVPR2024 + + +
+ Estimating the 3D structure of the human body from natural scenes is a
+fundamental aspect of visual perception. 3D human pose estimation is a vital
+step in advancing fields like AIGC and human-robot interaction, serving as a
+crucial technique for understanding and interacting with human actions in
+real-world settings. However, the current datasets, often collected under
+single laboratory conditions using complex motion capture equipment and
+unvarying backgrounds, are insufficient. The absence of datasets collected
+under variable conditions is stalling the progress of this crucial task. To
+facilitate the development of 3D pose estimation, we present FreeMan, the first
+large-scale, multi-view dataset collected under real-world conditions. FreeMan
+was captured by synchronizing 8 smartphones across diverse scenarios. It
+comprises 11M frames from 8000 sequences, viewed from different perspectives.
+These sequences cover 40 subjects across 10 different scenarios, each with
+varying lighting conditions. We have also established a semi-automated pipeline
+with error detection to reduce the workload of manual checking and ensure
+precise annotation. We provide comprehensive evaluation baselines for a range
+of tasks, underlining the significant challenges posed by FreeMan. Further
+evaluations of standard indoor/outdoor human sensing datasets reveal that
+FreeMan offers robust representation transferability in real and complex
+scenes. Code and data are available at https://wangjiongw.github.io/freeman.
+
+
+
+ comment: CVPR2024 camera ready version. 19 pages, 16 figures. Project page: + https://wangjiongw.github.io/freeman/ ; API: + https://github.com/wangjiongw/FreeMan_API +
+
+
+
+
+ + ♻ ☆ eWand: A calibration framework for wide baseline frame-based and + event-based camera systems ICRA 2024 + + +
+ Accurate calibration is crucial for using multiple cameras to triangulate the +position of objects precisely. However, it is also a time-consuming process +that needs to be repeated for every displacement of the cameras. The standard +approach is to use a printed pattern with known geometry to estimate the +intrinsic and extrinsic parameters of the cameras. The same idea can be applied +to event-based cameras, though it requires extra work. By using frame +reconstruction from events, a printed pattern can be detected. A blinking +pattern can also be displayed on a screen. Then, the pattern can be directly +detected from the events. Such calibration methods can provide accurate +intrinsic calibration for both frame- and event-based cameras. However, using +2D patterns has several limitations for multi-camera extrinsic calibration, +with cameras possessing highly different points of view and a wide baseline. +The 2D pattern can only be detected from one direction and needs to be of +significant size to compensate for its distance to the camera. This makes the +extrinsic calibration time-consuming and cumbersome. To overcome these +limitations, we propose eWand, a new method that uses blinking LEDs inside +opaque spheres instead of a printed or displayed pattern. Our method provides a +faster, easier-to-use extrinsic calibration approach that maintains high +accuracy for both event- and frame-based cameras. + +
+
+ comment: Accepted for 2024 IEEE International Conference on Robotics and + Automation (ICRA 2024). Project web page: + https://cogsys-tuebingen.github.io/ewand/ +
+
+
+
+
+ + ♻ ☆ Hallucination Benchmark in Medical Visual Question Answering ICLR 2024 + + +
+ The recent success of large language and vision models (LLVMs) on visual
+question answering (VQA), particularly their applications in medicine
+(Med-VQA), has shown great potential for realizing effective visual assistants
+for healthcare. However, these models have not been extensively tested for the
+hallucination phenomenon in clinical settings. Here, we created a hallucination
+benchmark of medical images paired with question-answer sets and conducted a
+comprehensive evaluation of the state-of-the-art models. The study provides an
+in-depth analysis of current models' limitations and reveals the effectiveness
+of various prompting strategies.
+
+
+
+ comment: Accepted to ICLR 2024 Tiny Papers(Notable) +
+
+
+
+
+ + ♻ ☆ LLaFS: When Large Language Models Meet Few-Shot Segmentation CVPR2024 + + +
+ This paper proposes LLaFS, the first attempt to leverage large language +models (LLMs) in few-shot segmentation. In contrast to the conventional +few-shot segmentation methods that only rely on the limited and biased +information from the annotated support images, LLaFS leverages the vast prior +knowledge gained by LLM as an effective supplement and directly uses the LLM to +segment images in a few-shot manner. To enable the text-based LLM to handle +image-related tasks, we carefully design an input instruction that allows the +LLM to produce segmentation results represented as polygons, and propose a +region-attribute table to simulate the human visual mechanism and provide +multi-modal guidance. We also synthesize pseudo samples and use curriculum +learning for pretraining to augment data and achieve better optimization. LLaFS +achieves state-of-the-art results on multiple datasets, showing the potential +of using LLMs for few-shot computer vision tasks. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ DETRs Beat YOLOs on Real-time Object Detection + + +
+ The YOLO series has become the most popular framework for real-time object
+detection due to its reasonable trade-off between speed and accuracy. However,
+we observe that the speed and accuracy of YOLOs are negatively affected by the
+NMS. Recently, end-to-end Transformer-based detectors (DETRs) have provided an
+alternative that eliminates NMS. Nevertheless, the high computational cost
+limits their practicality and hinders them from fully exploiting the advantage
+of excluding NMS. In this paper, we propose the Real-Time DEtection TRansformer
+(RT-DETR), to the best of our knowledge the first real-time end-to-end object
+detector that addresses the above dilemma. We build RT-DETR in two steps,
+drawing on the advanced DETR: first we focus on maintaining accuracy while
+improving speed, followed by maintaining speed while improving accuracy.
+Specifically, we design an efficient hybrid encoder to expeditiously process
+multi-scale features by decoupling intra-scale interaction and cross-scale
+fusion to improve speed. Then, we propose the uncertainty-minimal query
+selection to provide high-quality initial queries to the decoder, thereby
+improving accuracy. In addition, RT-DETR supports flexible speed tuning by
+adjusting the number of decoder layers to adapt to various scenarios without
+retraining. Our RT-DETR-R50 / R101 achieves 53.1% / 54.3% AP on COCO and 108 /
+74 FPS on T4 GPU, outperforming previously advanced YOLOs in both speed and
+accuracy. We also develop scaled RT-DETRs that outperform the lighter YOLO
+detectors (S and M models). Furthermore, RT-DETR-R50 outperforms DINO-R50 by
+2.2% AP in accuracy and by about 21 times in FPS. After pre-training with
+Objects365, RT-DETR-R50 / R101 achieves 55.3% / 56.2% AP. The project page:
+https://zhao-yian.github.io/RTDETR.
+
+
+
+
+
+
+ + ♻ ☆ Creating Ensembles of Classifiers through UMDA for Aerial Scene + Classification GECCO2024 + + +
+ Aerial scene classification, which aims to semantically label remote sensing
+images in a set of predefined classes (e.g., agricultural, beach, and harbor),
+is a very challenging task in remote sensing due to high intra-class
+variability and the different scales and orientations of the objects present in
+the dataset images. In the remote sensing area, the use of CNN architectures is
+also an established solution for scene classification tasks. Generally, these
+CNNs are used to perform the traditional image classification task. However, a
+less common way to classify remote sensing images is to use deep metric
+learning (DML) approaches. In this sense, this work proposes to employ six DML
+approaches for aerial scene classification tasks, analysing their behavior with
+four different pre-trained CNNs as well as combining them through an
+evolutionary computation algorithm (UMDA). In the performed experiments, it is
+possible to observe that DML approaches can achieve the best classification
+results compared to traditional pre-trained CNNs on three well-known remote
+sensing aerial scene datasets. In addition, the UMDA algorithm proved to be a
+promising strategy for combining DML approaches when there is diversity among
+them, improving classification accuracy by at least 5.6% while using almost 50%
+of the available classifiers for the construction of the final ensemble of
+classifiers.
+
+
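+ A minimal sketch of how a UMDA-style search over binary inclusion masks could select classifiers for the final ensemble; the fitness callable (e.g., validation accuracy of a majority vote over the selected classifiers) and all hyperparameters are assumptions, not the paper's exact setup.
+ <pre>
+ import numpy as np
+
+ def umda_select(fitness, n_classifiers, pop_size=50, elite_frac=0.3, iters=30, seed=0):
+     # Univariate Marginal Distribution Algorithm over binary masks: sample a
+     # population from per-classifier inclusion probabilities, keep the elite,
+     # and re-estimate the marginals from the elite.
+     rng = np.random.default_rng(seed)
+     p = np.full(n_classifiers, 0.5)
+     best_mask, best_fit = None, -np.inf
+     for _ in range(iters):
+         pop = rng.binomial(1, p, size=(pop_size, n_classifiers))
+         fits = np.array([fitness(mask) for mask in pop])
+         elite = pop[np.argsort(fits)[-max(1, int(elite_frac * pop_size)):]]
+         p = elite.mean(axis=0).clip(0.05, 0.95)   # keep some exploration
+         if fits.max() > best_fit:
+             best_fit, best_mask = float(fits.max()), pop[int(fits.argmax())].copy()
+     return best_mask, best_fit
+ </pre>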
+
+ comment: 9 pages, 4 figures, accepted for presentation at the GECCO2024 +
+
+
+
+
+ + ♻ ☆ LYT-Net: Lightweight YUV Transformer-based Network for Low-Light Image + Enhancement ICIP + + +
+ In recent years, deep learning-based solutions have proven successful in the
+domain of image enhancement. This paper introduces LYT-Net, or Lightweight YUV
+Transformer-based Network, as a novel approach for low-light image enhancement.
+The proposed architecture, distinct from conventional Retinex-based models,
+leverages the YUV color space's natural separation of luminance (Y) and
+chrominance (U and V) to simplify the intricate task of disentangling light and
+color information in images. By utilizing the strengths of transformers, known
+for their capability to capture long-range dependencies, LYT-Net ensures a
+comprehensive contextual understanding of the image while maintaining reduced
+model complexity. By employing a novel hybrid loss function, our proposed
+method achieves state-of-the-art results on low-light image enhancement
+datasets, all while being considerably more compact than its counterparts. The
+source code and pre-trained models are available at
+https://github.com/albrateanu/LYT-Net
+
+
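+ The YUV decomposition at the heart of this design is a standard color-space conversion; a minimal sketch using BT.601 weights is shown below (how LYT-Net then processes the Y and U/V branches is not reproduced here).
+ <pre>
+ import torch
+
+ def rgb_to_yuv(rgb):
+     # rgb: (B, 3, H, W) tensor in [0, 1]; returns YUV with luminance in
+     # channel 0 and chrominance in channels 1-2 (BT.601 weights).
+     r, g, b = rgb[:, 0:1], rgb[:, 1:2], rgb[:, 2:3]
+     y = 0.299 * r + 0.587 * g + 0.114 * b
+     u = -0.14713 * r - 0.28886 * g + 0.436 * b
+     v = 0.615 * r - 0.51499 * g - 0.10001 * b
+     return torch.cat([y, u, v], dim=1)
+ </pre>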
+
+ comment: 10 pages, 6 figures, submitted to ICIP +
+
+
+
+
+ + ♻ ☆ Mind The Edge: Refining Depth Edges in Sparsely-Supervised Monocular + Depth Estimation CVPR24 + + +
+ Monocular Depth Estimation (MDE) is a fundamental problem in computer vision +with numerous applications. Recently, LIDAR-supervised methods have achieved +remarkable per-pixel depth accuracy in outdoor scenes. However, significant +errors are typically found in the proximity of depth discontinuities, i.e., +depth edges, which often hinder the performance of depth-dependent applications +that are sensitive to such inaccuracies, e.g., novel view synthesis and +augmented reality. Since direct supervision for the location of depth edges is +typically unavailable in sparse LIDAR-based scenes, encouraging the MDE model +to produce correct depth edges is not straightforward. To the best of our +knowledge this paper is the first attempt to address the depth edges issue for +LIDAR-supervised scenes. In this work we propose to learn to detect the +location of depth edges from densely-supervised synthetic data, and use it to +generate supervision for the depth edges in the MDE training. To quantitatively +evaluate our approach, and due to the lack of depth edges GT in LIDAR-based +scenes, we manually annotated subsets of the KITTI and the DDAD datasets with +depth edges ground truth. We demonstrate significant gains in the accuracy of +the depth edges with comparable per-pixel depth accuracy on several challenging +datasets. Code and datasets are available at +\url{https://github.com/liortalker/MindTheEdge}. + +
+
+ comment: Appears in CVPR 2024
+
+
+
+
+
+ + ♻ ☆ CV-Attention UNet: Attention-based UNet for 3D Cerebrovascular + Segmentation of Enhanced TOF-MRA Images + + +
+ Due to the lack of automated methods to diagnose cerebrovascular disease,
+time-of-flight magnetic resonance angiography (TOF-MRA) is assessed visually,
+making the process time-consuming. The commonly used encoder-decoder
+architectures for cerebrovascular segmentation utilize redundant features,
+eventually leading to the extraction of low-level features multiple times.
+Additionally, convolutional neural networks (CNNs) suffer from performance
+degradation when the batch size is small, and deeper networks experience the
+vanishing gradient problem. Methods: In this paper, we attempt to solve these
+limitations and propose the 3D cerebrovascular attention UNet method, named
+CV-AttentionUNet, for precise extraction of brain vessel images. We propose a
+sequence of preprocessing techniques followed by a deeply supervised UNet to
+improve the accuracy of segmentation of the brain vessels leading to a stroke.
+To combine the low- and high-level semantics, we applied the attention
+mechanism. This mechanism focuses on relevant associations and neglects
+irrelevant anatomical information. Furthermore, the inclusion of deep
+supervision incorporates different levels of features that prove to be
+beneficial for network convergence. Results: We demonstrate the efficiency of
+the proposed method by cross-validating with an unlabeled dataset, which was
+further labeled by us. We believe that the novelty of this algorithm lies in
+its ability to perform well on both labeled and unlabeled data with image
+processing-based enhancement. The results indicate that our method performed
+better than the existing state-of-the-art methods on the TubeTK dataset.
+Conclusion: The proposed method will help in the accurate segmentation of
+cerebrovascular structures leading to stroke.
+
+
+
+
+
+
+ + ♻ ☆ From Isolated Islands to Pangea: Unifying Semantic Space for Human + Action Understanding CVPR 2024 + + +
+ Action understanding has attracted long-term attention. It can be formulated
+as a mapping from the physical space to the semantic space. Typically,
+researchers have built datasets according to idiosyncratic choices to define
+classes and push the envelope of their respective benchmarks. Datasets are
+incompatible with each other like "Isolated Islands" due to semantic gaps and
+various class granularities, e.g., "do housework" in dataset A and "wash plate"
+in dataset B. We argue that we need a more principled semantic space to
+concentrate the community's efforts and use all datasets together to pursue
+generalizable action learning. To this end, we design a structured action
+semantic space given a verb taxonomy hierarchy and covering massive actions. By
+aligning the classes of previous datasets to our semantic space, we gather
+(image/video/skeleton/MoCap) datasets into a unified database in a unified
+label system, i.e., bridging "isolated islands" into a "Pangea". Accordingly,
+we propose a novel model mapping from the physical space to the semantic space
+to fully use Pangea. In extensive experiments, our new system shows significant
+superiority, especially in transfer learning. Our code and data will be made
+public at https://mvig-rhos.com/pangea.
+
+
+
+ comment: CVPR 2024, Project Webpage: https://mvig-rhos.com/pangea +
+
+
+
+
+ + ♻ ☆ RDumb: A simple approach that questions our progress in continual + test-time adaptation + + +
+ Test-Time Adaptation (TTA) allows updating pre-trained models to changing
+data distributions at deployment time. While early work tested these algorithms
+for individual fixed distribution shifts, recent work proposed and applied
+methods for continual adaptation over long timescales. To examine the reported
+progress in the field, we propose the Continually Changing Corruptions (CCC)
+benchmark to measure the asymptotic performance of TTA techniques. We find that
+eventually all but one of the state-of-the-art methods collapse and perform
+worse than a non-adapting model, including models specifically proposed to be
+robust to performance collapse. In addition, we introduce a simple baseline,
+"RDumb", that periodically resets the model to its pretrained state. RDumb
+performs better than or on par with the previously proposed state of the art
+on all considered benchmarks. Our results show that previous TTA approaches are
+neither effective at regularizing adaptation to avoid collapse nor able to
+outperform a simplistic resetting strategy.
+
+
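+ The resetting baseline is simple enough to sketch directly; the wrapper below is an illustrative reading of it, with the reset interval left as a hyperparameter rather than the paper's exact value.
+ <pre>
+ class PeriodicResetTTA:
+     # Wrap any test-time-adaptation method and reset it to its pretrained
+     # state every `reset_every` batches.
+     def __init__(self, build_tta_model, reset_every=1000):
+         self.build = build_tta_model      # callable returning a fresh TTA-wrapped model
+         self.reset_every = reset_every
+         self.model = self.build()
+         self.seen = 0
+
+     def __call__(self, batch):
+         if self.seen > 0 and self.seen % self.reset_every == 0:
+             self.model = self.build()     # periodic reset to the pretrained state
+         self.seen += 1
+         return self.model(batch)
+ </pre>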
+
+
+
+
+ + ♻ ☆ Task-conditioned adaptation of visual features in multi-task policy + learning + + +
+ Successfully addressing a wide variety of tasks is a core ability of +autonomous agents, requiring flexibly adapting the underlying decision-making +strategies and, as we argue in this work, also adapting the perception modules. +An analogical argument would be the human visual system, which uses top-down +signals to focus attention determined by the current task. Similarly, we adapt +pre-trained large vision models conditioned on specific downstream tasks in the +context of multi-task policy learning. We introduce task-conditioned adapters +that do not require finetuning any pre-trained weights, combined with a single +policy trained with behavior cloning and capable of addressing multiple tasks. +We condition the visual adapters on task embeddings, which can be selected at +inference if the task is known, or alternatively inferred from a set of example +demonstrations. To this end, we propose a new optimization-based estimator. We +evaluate the method on a wide variety of tasks from the CortexBench benchmark +and show that, compared to existing work, it can be addressed with a single +policy. In particular, we demonstrate that adapting visual features is a key +design choice and that the method generalizes to unseen tasks given a few +demonstrations. + +
+
+
+
+
+ + ♻ ☆ Robustness Assessment of a Runway Object Classifier for Safe Aircraft + Taxiing SC + + +
+ As deep neural networks (DNNs) are becoming the prominent solution for many +computational problems, the aviation industry seeks to explore their potential +in alleviating pilot workload and in improving operational safety. However, the +use of DNNs in this type of safety-critical applications requires a thorough +certification process. This need can be addressed through formal verification, +which provides rigorous assurances -- e.g.,~by proving the absence of certain +mispredictions. In this case-study paper, we demonstrate this process using an +image-classifier DNN currently under development at Airbus and intended for use +during the aircraft taxiing phase. We use formal methods to assess this DNN's +robustness to three common image perturbation types: noise, brightness and +contrast, and some of their combinations. This process entails multiple +invocations of the underlying verifier, which might be computationally +expensive; and we therefore propose a method that leverages the monotonicity of +these robustness properties, as well as the results of past verification +queries, in order to reduce the overall number of verification queries required +by nearly 60%. Our results provide an indication of the level of robustness +achieved by the DNN classifier under study, and indicate that it is +considerably more vulnerable to noise than to brightness or contrast +perturbations. + +
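+ The query-reduction idea rests on monotonicity: a robustness proof at a large perturbation bound covers every smaller bound, and a counterexample found at a small bound carries over to larger ones. The sketch below shows that caching logic for one image; the verifier interface is an assumption, and the paper's actual bookkeeping across perturbation types is not reproduced.
+ <pre>
+ def verify_with_cache(verifier, image, eps, cache):
+     # cache maps a perturbation bound to a previously proven result for this
+     # image: True (robust) or False (counterexample found).
+     proven_robust = [e for e, ok in cache.items() if ok]
+     proven_broken = [e for e, ok in cache.items() if not ok]
+     if proven_robust and max(proven_robust) >= eps:
+         return True                    # implied by a stronger robustness proof
+     if proven_broken and min(proven_broken) <= eps:
+         return False                   # implied by an existing counterexample
+     result = verifier(image, eps)      # fall back to an expensive verifier call
+     cache[eps] = result
+     return result
+ </pre>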
+
+ comment: This is a preprint version of the paper in the proceedings of the
+ 43rd Digital Avionics Systems Conference (DASC)
+
+
+
+
+
+ + ♻ ☆ Long-term Frame-Event Visual Tracking: Benchmark Dataset and Baseline + + +
+ Current event-/frame-event-based trackers are evaluated on short-term
+tracking datasets; however, real-world scenarios involve long-term tracking,
+and the performance of existing tracking algorithms in these scenarios remains
+unclear. In this paper, we first propose a new long-term and large-scale
+frame-event single object tracking dataset, termed FELT. It contains 742 videos
+and 1,594,474 RGB frames and event stream pairs and has become the largest
+frame-event tracking dataset to date. We re-train and evaluate 15 baseline
+trackers on our dataset for future work to compare against. More importantly,
+we find that the RGB frames and event streams are naturally incomplete due to
+the influence of challenging factors and spatially sparse event flow. In
+response to this, we propose a novel associative memory Transformer network as
+a unified backbone by introducing modern Hopfield layers into multi-head
+self-attention blocks to fuse both RGB and event data. Extensive experiments on
+RGB-Event (FELT), RGB-Thermal (RGBT234, LasHeR), and RGB-Depth (DepthTrack)
+datasets fully validate the effectiveness of our model. The dataset and source
+code can be found at \url{https://github.com/Event-AHU/FELT_SOT_Benchmark}.
+
+
+
+ comment: In Peer Review +
+
+
+
+
+ + ♻ ☆ Advancing Ante-Hoc Explainable Models through Generative Adversarial + Networks AAAI 2024 + + +
+ This paper presents a novel concept learning framework for enhancing model +interpretability and performance in visual classification tasks. Our approach +appends an unsupervised explanation generator to the primary classifier network +and makes use of adversarial training. During training, the explanation module +is optimized to extract visual concepts from the classifier's latent +representations, while the GAN-based module aims to discriminate images +generated from concepts, from true images. This joint training scheme enables +the model to implicitly align its internally learned concepts with +human-interpretable visual properties. Comprehensive experiments demonstrate +the robustness of our approach, while producing coherent concept activations. +We analyse the learned concepts, showing their semantic concordance with object +parts and visual attributes. We also study how perturbations in the adversarial +training protocol impact both classification and concept acquisition. In +summary, this work presents a significant step towards building inherently +interpretable deep vision models with task-aligned concept representations - a +key enabler for developing trustworthy AI for real-world perception tasks. + +
+
+ comment: Paper accepted in Human-Centric Representation Learning workshop at + AAAI 2024 (https://hcrl-workshop.github.io/2024/). Paper accepted and + presented at Deployable AI Workshop at AAAI-2024 + (https://sites.google.com/view/dai-2024/home) +
+
+
+
+
+ + ♻ ☆ RAVE: Residual Vector Embedding for CLIP-Guided Backlit Image + Enhancement + + +
+ In this paper we propose a novel modification of Contrastive Language-Image +Pre-Training (CLIP) guidance for the task of unsupervised backlit image +enhancement. Our work builds on the state-of-the-art CLIP-LIT approach, which +learns a prompt pair by constraining the text-image similarity between a prompt +(negative/positive sample) and a corresponding image (backlit image/well-lit +image) in the CLIP embedding space. Learned prompts then guide an image +enhancement network. Based on the CLIP-LIT framework, we propose two novel +methods for CLIP guidance. First, we show that instead of tuning prompts in the +space of text embeddings, it is possible to directly tune their embeddings in +the latent space without any loss in quality. This accelerates training and +potentially enables the use of additional encoders that do not have a text +encoder. Second, we propose a novel approach that does not require any prompt +tuning. Instead, based on CLIP embeddings of backlit and well-lit images from +training data, we compute the residual vector in the embedding space as a +simple difference between the mean embeddings of the well-lit and backlit +images. This vector then guides the enhancement network during training, +pushing a backlit image towards the space of well-lit images. This approach +further dramatically reduces training time, stabilizes training and produces +high quality enhanced images without artifacts, both in supervised and +unsupervised training regimes. Additionally, we show that residual vectors can +be interpreted, revealing biases in training data, and thereby enabling +potential bias correction. + +
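+ The residual-vector idea reduces to a difference of mean CLIP image embeddings; the sketch below illustrates it, with the cosine-alignment guidance loss being an assumed way to use the vector during training rather than the paper's exact formulation.
+ <pre>
+ import torch
+ import torch.nn.functional as F
+
+ def residual_vector(image_encoder, well_lit, backlit):
+     # Mean embedding of well-lit images minus that of backlit images,
+     # normalized to a unit direction in the embedding space.
+     with torch.no_grad():
+         e_well = F.normalize(image_encoder(well_lit), dim=-1).mean(dim=0)
+         e_back = F.normalize(image_encoder(backlit), dim=-1).mean(dim=0)
+     return F.normalize(e_well - e_back, dim=0)
+
+ def guidance_loss(image_encoder, enhanced, backlit, residual):
+     # Encourage the embedding shift produced by the enhancement network to
+     # point along the precomputed well-lit-minus-backlit direction.
+     shift = image_encoder(enhanced) - image_encoder(backlit)
+     return 1.0 - F.cosine_similarity(shift, residual.unsqueeze(0), dim=-1).mean()
+ </pre>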
+
+
+
+
+ + ♻ ☆ MeciFace: Mechanomyography and Inertial Fusion-based Glasses for Edge + Real-Time Recognition of Facial and Eating Activities + + +
+ The increasing prevalence of stress-related eating behaviors and their impact +on overall health highlights the importance of effective and ubiquitous +monitoring systems. In this paper, we present MeciFace, an innovative wearable +technology designed to monitor facial expressions and eating activities in +real-time on-the-edge (RTE). MeciFace aims to provide a low-power, +privacy-conscious, and highly accurate tool for promoting healthy eating +behaviors and stress management. We employ lightweight convolutional neural +networks as backbone models for facial expression and eating monitoring +scenarios. The MeciFace system ensures efficient data processing with a tiny +memory footprint, ranging from 11KB to 19 KB. During RTE evaluation, the system +achieves an F1-score of < 86% for facial expression recognition and 94% for +eating/drinking monitoring, for the RTE of unseen users (user-independent +case). + +
+
+ comment: Submitted to IEEE Transactions on Consumer Electronics +
+
+
+
+
+ + ♻ ☆ Language Guided Domain Generalized Medical Image Segmentation + + +
+ Single source domain generalization (SDG) holds promise for more reliable and +consistent image segmentation across real-world clinical settings particularly +in the medical domain, where data privacy and acquisition cost constraints +often limit the availability of diverse datasets. Depending solely on visual +features hampers the model's capacity to adapt effectively to various domains, +primarily because of the presence of spurious correlations and domain-specific +characteristics embedded within the image features. Incorporating text features +alongside visual features is a potential solution to enhance the model's +understanding of the data, as it goes beyond pixel-level information to provide +valuable context. Textual cues describing the anatomical structures, their +appearances, and variations across various imaging modalities can guide the +model in domain adaptation, ultimately contributing to more robust and +consistent segmentation. In this paper, we propose an approach that explicitly +leverages textual information by incorporating a contrastive learning mechanism +guided by the text encoder features to learn a more robust feature +representation. We assess the effectiveness of our text-guided contrastive +feature alignment technique in various scenarios, including cross-modality, +cross-sequence, and cross-site settings for different segmentation tasks. Our +approach achieves favorable performance against existing methods in literature. +Our code and model weights are available at +https://github.com/ShahinaKK/LG_SDG.git. + +
+
+ comment: Accepted at ISBI2024 +
+
+
+
+
+ + ♻ ☆ Scaling Up to Excellence: Practicing Model Scaling for Photo-Realistic + Image Restoration In the Wild CVPR 2024 + + +
+ We introduce SUPIR (Scaling-UP Image Restoration), a groundbreaking image +restoration method that harnesses generative prior and the power of model +scaling up. Leveraging multi-modal techniques and advanced generative prior, +SUPIR marks a significant advance in intelligent and realistic image +restoration. As a pivotal catalyst within SUPIR, model scaling dramatically +enhances its capabilities and demonstrates new potential for image restoration. +We collect a dataset comprising 20 million high-resolution, high-quality images +for model training, each enriched with descriptive text annotations. SUPIR +provides the capability to restore images guided by textual prompts, broadening +its application scope and potential. Moreover, we introduce negative-quality +prompts to further improve perceptual quality. We also develop a +restoration-guided sampling method to suppress the fidelity issue encountered +in generative-based restoration. Experiments demonstrate SUPIR's exceptional +restoration effects and its novel capacity to manipulate restoration through +textual prompts. + +
+
+ comment: This paper has been accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Analysis of Video Quality Datasets via Design of Minimalistic Video + Quality Models + + +
+ Blind video quality assessment (BVQA) plays an indispensable role in +monitoring and improving the end-users' viewing experience in various +real-world video-enabled media applications. As an experimental field, the +improvements of BVQA models have been measured primarily on a few human-rated +VQA datasets. Thus, it is crucial to gain a better understanding of existing +VQA datasets in order to properly evaluate the current progress in BVQA. +Towards this goal, we conduct a first-of-its-kind computational analysis of VQA +datasets via designing minimalistic BVQA models. By minimalistic, we restrict +our family of BVQA models to build only upon basic blocks: a video preprocessor +(for aggressive spatiotemporal downsampling), a spatial quality analyzer, an +optional temporal quality analyzer, and a quality regressor, all with the +simplest possible instantiations. By comparing the quality prediction +performance of different model variants on eight VQA datasets with realistic +distortions, we find that nearly all datasets suffer from the easy dataset +problem of varying severity, some of which even admit blind image quality +assessment (BIQA) solutions. We additionally justify our claims by contrasting +our model generalizability on these VQA datasets, and by ablating a dizzying +set of BVQA design choices related to the basic building blocks. Our results +cast doubt on the current progress in BVQA, and meanwhile shed light on good +practices of constructing next-generation VQA datasets and models. + +
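+ The minimalistic family described above can be summarized as a four-block pipeline; the skeleton below is only a schematic with placeholder modules (the paper's concrete instantiations, downsampling rates, and regressor are not reproduced).
+ <pre>
+ import torch.nn as nn
+
+ class MinimalBVQA(nn.Module):
+     # Skeleton: aggressive spatiotemporal downsampling, a spatial quality
+     # analyzer, an optional temporal analyzer, and a quality regressor.
+     def __init__(self, feat_dim=512, temporal=False):
+         super().__init__()
+         self.spatial = nn.Sequential(
+             nn.Conv2d(3, feat_dim, kernel_size=7, stride=4),
+             nn.AdaptiveAvgPool2d(1), nn.Flatten())
+         self.temporal = nn.GRU(feat_dim, feat_dim, batch_first=True) if temporal else None
+         self.regressor = nn.Linear(feat_dim, 1)
+
+     def forward(self, video):                 # video: (B, T, 3, H, W)
+         video = video[:, ::2, :, ::2, ::2]    # crude spatiotemporal downsampling
+         b, t = video.shape[:2]
+         feats = self.spatial(video.flatten(0, 1)).view(b, t, -1)
+         if self.temporal is not None:
+             feats, _ = self.temporal(feats)
+         return self.regressor(feats.mean(dim=1)).squeeze(-1)
+ </pre>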
+
+
+
+
+ + ♻ ☆ A Robust Ensemble Algorithm for Ischemic Stroke Lesion Segmentation: + Generalizability and Clinical Utility Beyond the ISLES Challenge + + +
+ Diffusion-weighted MRI (DWI) is essential for stroke diagnosis, treatment +decisions, and prognosis. However, image and disease variability hinder the +development of generalizable AI algorithms with clinical value. We address this +gap by presenting a novel ensemble algorithm derived from the 2022 Ischemic +Stroke Lesion Segmentation (ISLES) challenge. ISLES'22 provided 400 patient +scans with ischemic stroke from various medical centers, facilitating the +development of a wide range of cutting-edge segmentation algorithms by the +research community. Through collaboration with leading teams, we combined +top-performing algorithms into an ensemble model that overcomes the limitations +of individual solutions. Our ensemble model achieved superior ischemic lesion +detection and segmentation accuracy on our internal test set compared to +individual algorithms. This accuracy generalized well across diverse image and +disease variables. Furthermore, the model excelled in extracting clinical +biomarkers. Notably, in a Turing-like test, neuroradiologists consistently +preferred the algorithm's segmentations over manual expert efforts, +highlighting increased comprehensiveness and precision. Validation using a +real-world external dataset (N=1686) confirmed the model's generalizability. +The algorithm's outputs also demonstrated strong correlations with clinical +scores (admission NIHSS and 90-day mRS) on par with or exceeding expert-derived +results, underlining its clinical relevance. This study offers two key +findings. First, we present an ensemble algorithm +(https://github.com/Tabrisrei/ISLES22_Ensemble) that detects and segments +ischemic stroke lesions on DWI across diverse scenarios on par with expert +(neuro)radiologists. Second, we show the potential for biomedical challenge +outputs to extend beyond the challenge's initial objectives, demonstrating +their real-world clinical applicability. + +
+
+
+
+
+ + ♻ ☆ Object-level Copy-Move Forgery Image Detection based on Inconsistency + Mining WWW 2024 + + +
+ In copy-move tampering operations, perpetrators often employ techniques, such
+as blurring, to conceal tampering traces, posing significant challenges to the
+detection of object-level targets with intact structures. Focusing on these
+challenges, this paper proposes an Object-level Copy-Move Forgery Image
+Detection method based on Inconsistency Mining (IMNet). To obtain complete
+object-level targets, we customize prototypes for both the source and tampered
+regions and dynamically update them. Additionally, we extract inconsistent
+regions between coarse similar regions obtained through self-correlation
+calculations and regions composed of prototypes. The detected inconsistent
+regions are used as supplements to coarse similar regions to refine pixel-level
+detection. We conduct experiments on three public datasets, which validate the
+effectiveness and robustness of the proposed IMNet.
+
+
+
+ comment: 4 pages, 2 figures, Accepted to WWW 2024 +
+
+
+
+
+ + ♻ ☆ InfLoRA: Interference-Free Low-Rank Adaptation for Continual Learning CVPR 2024 + + +
+ Continual learning requires the model to learn multiple tasks sequentially. +In continual learning, the model should possess the ability to maintain its +performance on old tasks (stability) and the ability to adapt to new tasks +continuously (plasticity). Recently, parameter-efficient fine-tuning (PEFT), +which involves freezing a pre-trained model and injecting a small number of +learnable parameters to adapt to downstream tasks, has gained increasing +popularity in continual learning. Although existing continual learning methods +based on PEFT have demonstrated superior performance compared to those not +based on PEFT, most of them do not consider how to eliminate the interference +of the new task on the old tasks, which inhibits the model from making a good +trade-off between stability and plasticity. In this work, we propose a new PEFT +method, called interference-free low-rank adaptation (InfLoRA), for continual +learning. InfLoRA injects a small number of parameters to reparameterize the +pre-trained weights and shows that fine-tuning these injected parameters is +equivalent to fine-tuning the pre-trained weights within a subspace. +Furthermore, InfLoRA designs this subspace to eliminate the interference of the +new task on the old tasks, making a good trade-off between stability and +plasticity. Experimental results show that InfLoRA outperforms existing +state-of-the-art continual learning methods on multiple datasets. + +
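+ As a rough sketch of the parameter-injection mechanism only: the snippet below is a generic LoRA-style reparameterization of a frozen linear layer, not InfLoRA's interference-free subspace construction, which is the paper's actual contribution and is omitted here.
+ <pre>
+ import torch
+ import torch.nn as nn
+
+ class LowRankAdaptedLinear(nn.Module):
+     # Frozen pre-trained linear layer plus a trainable low-rank update B @ A,
+     # so fine-tuning only touches a small subspace of the weight matrix.
+     def __init__(self, base, rank=8):
+         super().__init__()
+         self.base = base
+         for p in self.base.parameters():
+             p.requires_grad = False               # keep pre-trained weights frozen
+         self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
+         self.B = nn.Parameter(torch.zeros(base.out_features, rank))
+
+     def forward(self, x):
+         return self.base(x) + x @ self.A.t() @ self.B.t()
+ </pre>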
+
+ comment: Accepted by the 2024 IEEE/CVF Conference on Computer Vision and + Pattern Recognition (CVPR 2024) +
+
+
+
+
+ + ♻ ☆ RRWNet: Recursive Refinement Network for Effective Retinal Artery/Vein + Segmentation and Classification + + +
+ The caliber and configuration of retinal blood vessels serve as important +biomarkers for various diseases and medical conditions. A thorough analysis of +the retinal vasculature requires the segmentation of the blood vessels and +their classification into arteries and veins, typically performed on color +fundus images obtained by retinography. However, manually performing these +tasks is labor-intensive and prone to human error. While several automated +methods have been proposed to address this task, the current state of art faces +challenges due to manifest classification errors affecting the topological +consistency of segmentation maps. In this work, we introduce RRWNet, a novel +end-to-end deep learning framework that addresses this limitation. The +framework consists of a fully convolutional neural network that recursively +refines semantic segmentation maps, correcting manifest classification errors +and thus improving topological consistency. In particular, RRWNet is composed +of two specialized subnetworks: a Base subnetwork that generates base +segmentation maps from the input images, and a Recursive Refinement subnetwork +that iteratively and recursively improves these maps. Evaluation on three +different public datasets demonstrates the state-of-the-art performance of the +proposed method, yielding more topologically consistent segmentation maps with +fewer manifest classification errors than existing approaches. In addition, the +Recursive Refinement module within RRWNet proves effective in post-processing +segmentation maps from other methods, further demonstrating its potential. The +model code, weights, and predictions will be publicly available at +https://github.com/j-morano/rrwnet. + +
+
+
+
+
+ + ♻ ☆ ARS-DETR: Aspect Ratio Sensitive Oriented Object Detection with + Transformer + + +
+ Existing oriented object detection methods commonly use the metric AP$_{50}$
+to measure the performance of the model. We argue that AP$_{50}$ is inherently
+unsuitable for oriented object detection due to its large tolerance in angle
+deviation. Therefore, we advocate using a high-precision metric, e.g.
+AP$_{75}$, to measure the performance of models. In this paper, we propose an
+Aspect Ratio Sensitive Oriented Object Detector with Transformer, termed
+ARS-DETR, which exhibits competitive performance in high-precision oriented
+object detection. Specifically, a new angle classification method, called
+Aspect Ratio aware Circle Smooth Label (AR-CSL), is proposed to smooth the
+angle label in a more reasonable way and discard the hyperparameter introduced
+by previous work (e.g. CSL). Then, a rotated deformable attention module is
+designed to rotate the sampling points with the corresponding angles and
+eliminate the misalignment between region features and sampling points.
+Moreover, a dynamic weight coefficient according to the aspect ratio is adopted
+to calculate the angle loss. Comprehensive experiments on several challenging
+datasets show that our method achieves competitive performance on the
+high-precision oriented object detection task.
+
+
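+ A hedged sketch of an aspect-ratio-aware circular smooth label: the circular distance and the idea that elongated boxes (high aspect ratio) get a narrower smoothing window follow the abstract, while the Gaussian window and the 1/AR scaling are illustrative choices rather than the paper's exact formulation.
+ <pre>
+ import numpy as np
+
+ def aspect_ratio_csl(angle_bin, num_bins=180, aspect_ratio=1.0, base_sigma=6.0):
+     # Circularly smoothed one-hot angle label centered at `angle_bin`; the
+     # window narrows as the aspect ratio grows, since angle errors hurt
+     # elongated boxes more.
+     sigma = max(base_sigma / max(aspect_ratio, 1.0), 0.5)
+     idx = np.arange(num_bins)
+     dist = np.minimum(np.abs(idx - angle_bin), num_bins - np.abs(idx - angle_bin))
+     label = np.exp(-0.5 * (dist / sigma) ** 2)
+     return label / label.max()
+ </pre>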
+
+ comment: 10 pages, 8 figures, 8 tables, the source code is available at + https://github.com/httle/ARS-DETR +
+
+
+
+
+ + ♻ ☆ Discriminative Sample-Guided and Parameter-Efficient Feature Space + Adaptation for Cross-Domain Few-Shot Learning + + +
+ In this paper, we look at cross-domain few-shot classification which presents +the challenging task of learning new classes in previously unseen domains with +few labelled examples. Existing methods, though somewhat effective, encounter +several limitations, which we alleviate through two significant improvements. +First, we introduce a lightweight parameter-efficient adaptation strategy to +address overfitting associated with fine-tuning a large number of parameters on +small datasets. This strategy employs a linear transformation of pre-trained +features, significantly reducing the trainable parameter count. Second, we +replace the traditional nearest centroid classifier with a discriminative +sample-aware loss function, enhancing the model's sensitivity to the inter- and +intra-class variances within the training set for improved clustering in +feature space. Empirical evaluations on the Meta-Dataset benchmark showcase +that our approach not only improves accuracy up to 7.7\% and 5.3\% on +previously seen and unseen datasets, respectively, but also achieves the above +performance while being at least $\sim3\times$ more parameter-efficient than +existing methods, establishing a new state-of-the-art in cross-domain few-shot +learning. Our code is available at https://github.com/rashindrie/DIPA. + +
+
+ comment: Code is available at this link: https://github.com/rashindrie/DIPA +
+
+
+
+
+ + ♻ ☆ MotionChain: Conversational Motion Controllers via Multimodal Prompts + + +
+ Recent advancements in language models have demonstrated their adeptness in +conducting multi-turn dialogues and retaining conversational context. However, +this proficiency remains largely unexplored in other multimodal generative +models, particularly in human motion models. By integrating multi-turn +conversations in controlling continuous virtual human movements, generative +human motion models can achieve an intuitive and step-by-step process of human +task execution for humanoid robotics, game agents, or other embodied systems. +In this work, we present MotionChain, a conversational human motion controller +to generate continuous and long-term human motion through multimodal prompts. +Specifically, MotionChain consists of multi-modal tokenizers that transform +various data types such as text, image, and motion, into discrete tokens, +coupled with a Vision-Motion-aware Language model. By leveraging large-scale +language, vision-language, and vision-motion data to assist motion-related +generation tasks, MotionChain thus comprehends each instruction in multi-turn +conversation and generates human motions followed by these prompts. Extensive +experiments validate the efficacy of MotionChain, demonstrating +state-of-the-art performance in conversational motion generation, as well as +more intuitive manners of controlling and interacting with virtual humans. + +
+
+ comment: 14 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ The Audio-Visual Conversational Graph: From an Egocentric-Exocentric + Perspective + + +
+ In recent years, the thriving development of research related to egocentric
+videos has provided a unique perspective for the study of conversational
+interactions, where both visual and audio signals play a crucial role. While
+most prior work focuses on behaviors that directly involve the camera wearer,
+we introduce the Ego-Exocentric Conversational Graph Prediction problem,
+marking the first attempt to infer exocentric conversational interactions from
+egocentric videos. We propose a unified multi-modal framework -- Audio-Visual
+Conversational Attention (AV-CONV), for the joint prediction of conversation
+behaviors -- speaking and listening -- for both the camera wearer as well as
+all other social partners present in the egocentric video. Specifically, we
+adopt the self-attention mechanism to model the representations across time,
+across subjects, and across modalities. To validate our method, we conduct
+experiments on a challenging egocentric video dataset that includes
+multi-speaker and multi-conversation scenarios. Our results demonstrate the
+superior performance of our method compared to a series of baselines. We also
+present detailed ablation studies to assess the contribution of each component
+in our model. Check our project page at https://vjwq.github.io/AV-CONV/.
+
+
+
+
+
+
+ + ♻ ☆ LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging CVPR 2024 + + +
+ Human pose and shape (HPS) estimation with lensless imaging is not only
+beneficial to privacy protection but also can be used in covert surveillance
+scenarios due to the small size and simple structure of this device. However,
+this task presents significant challenges due to the inherent ambiguity of the
+captured measurements and the lack of effective methods for directly estimating
+human pose and shape from lensless data. In this paper, we propose, to our
+knowledge, the first end-to-end framework to recover 3D human poses and shapes
+from lensless measurements. We specifically design a multi-scale lensless
+feature decoder to decode the lensless measurements through the optically
+encoded mask for efficient feature extraction. We also propose a double-head
+auxiliary supervision mechanism to improve the estimation accuracy of human
+limb ends. Besides, we establish a lensless imaging system and verify the
+effectiveness of our method on various datasets acquired by our lensless
+imaging system.
+
+
+
+ comment: Accepted to CVPR 2024. More results available at + https://cic.tju.edu.cn/faculty/likun/projects/LPSNet +
+
+
+
+
+ + ♻ ☆ Semantic Human Mesh Reconstruction with Textures CVPR 2024 + + +
+ The field of 3D detailed human mesh reconstruction has made significant +progress in recent years. However, current methods still face challenges when +used in industrial applications due to unstable results, low-quality meshes, +and a lack of UV unwrapping and skinning weights. In this paper, we present +SHERT, a novel pipeline that can reconstruct semantic human meshes with +textures and high-precision details. SHERT applies semantic- and normal-based +sampling between the detailed surface (e.g. mesh and SDF) and the corresponding +SMPL-X model to obtain a partially sampled semantic mesh and then generates the +complete semantic mesh by our specifically designed self-supervised completion +and refinement networks. Using the complete semantic mesh as a basis, we employ +a texture diffusion model to create human textures that are driven by both +images and texts. Our reconstructed meshes have stable UV unwrapping, +high-quality triangle meshes, and consistent semantic information. The given +SMPL-X model provides semantic information and shape priors, allowing SHERT to +perform well even with incorrect and incomplete inputs. The semantic +information also makes it easy to substitute and animate different body parts +such as the face, body, and hands. Quantitative and qualitative experiments +demonstrate that SHERT is capable of producing high-fidelity and robust +semantic meshes that outperform state-of-the-art methods. + +
+
+ comment: Accepted to CVPR 2024. Project page: + https://zhanxy.xyz/projects/shert/ +
+
+
+
+
+ + ♻ ☆ Optimizing Diffusion Noise Can Serve As Universal Motion Priors CVPR 2024 + + +
+ We propose Diffusion Noise Optimization (DNO), a new method that effectively +leverages existing motion diffusion models as motion priors for a wide range of +motion-related tasks. Instead of training a task-specific diffusion model for +each new task, DNO operates by optimizing the diffusion latent noise of an +existing pre-trained text-to-motion model. Given the corresponding latent noise +of a human motion, it propagates the gradient from the target criteria defined +on the motion space through the whole denoising process to update the diffusion +latent noise. As a result, DNO supports any use cases where criteria can be +defined as a function of motion. In particular, we show that, for motion +editing and control, DNO outperforms existing methods in both achieving the +objective and preserving the motion content. DNO accommodates a diverse range +of editing modes, including changing trajectory, pose, joint locations, or +avoiding newly added obstacles. In addition, DNO is effective in motion +denoising and completion, producing smooth and realistic motion from noisy and +partial inputs. DNO achieves these results at inference time without the need +for model retraining, offering great versatility for any defined reward or loss +function on the motion representation. + +
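+ The core loop, optimizing the latent noise of a frozen text-to-motion diffusion model against a criterion defined on the generated motion, can be sketched as below; the differentiable `denoise` callable, the optimizer choice, and the step count are assumptions.
+ <pre>
+ import torch
+
+ def optimize_diffusion_noise(denoise, criterion, noise_shape, steps=100, lr=0.05):
+     # Treat the initial latent noise as the optimization variable: run the
+     # (differentiable) denoising chain, score the resulting motion, and
+     # update the noise by gradient descent. The diffusion model stays frozen.
+     z = torch.randn(noise_shape, requires_grad=True)
+     opt = torch.optim.Adam([z], lr=lr)
+     for _ in range(steps):
+         motion = denoise(z)            # full reverse process, kept differentiable
+         loss = criterion(motion)       # e.g., distance of a joint to a target location
+         opt.zero_grad()
+         loss.backward()
+         opt.step()
+     with torch.no_grad():
+         return z.detach(), denoise(z)
+ </pre>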
+
+ comment: CVPR 2024. Project page: https://korrawe.github.io/dno-project/ +
+
+
+
+
+ + ♻ ☆ Analytic-Splatting: Anti-Aliased 3D Gaussian Splatting via Analytic + Integration + + +
+ 3D Gaussian Splatting (3DGS) has recently gained popularity by combining the
+advantages of both primitive-based and volumetric 3D representations, resulting
+in improved quality and efficiency for 3D scene rendering. However, 3DGS is not
+alias-free, and its rendering at varying resolutions could produce severe
+blurring or jaggies. This is because 3DGS treats each pixel as an isolated,
+single point rather than as an area, causing insensitivity to changes in the
+footprints of pixels. Consequently, this discrete sampling scheme inevitably
+results in aliasing, owing to the restricted sampling bandwidth. In this paper,
+we derive an analytical solution to address this issue. More specifically, we
+use a conditioned logistic function as the analytic approximation of the
+cumulative distribution function (CDF) in a one-dimensional Gaussian signal and
+calculate the Gaussian integral by subtracting the CDFs. We then introduce this
+approximation in the two-dimensional pixel shading, and present
+Analytic-Splatting, which analytically approximates the Gaussian integral
+within the 2D-pixel window area to better capture the intensity response of
+each pixel. Moreover, we use the approximated response of the pixel window
+integral area to participate in the transmittance calculation of volume
+rendering, making Analytic-Splatting sensitive to the changes in pixel
+footprint at different resolutions. Experiments on various datasets validate
+that our approach has better anti-aliasing capability, giving more details and
+better fidelity.
+
+
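+ The 1D building block, approximating the Gaussian CDF with a logistic function and integrating over a pixel window by subtracting CDFs, can be written down directly; the constant 1.702 is the common logistic approximation of the standard normal CDF, and the paper's conditioning of that function as well as the 2D shading step are not reproduced here.
+ <pre>
+ import torch
+
+ def normal_cdf_approx(x):
+     # Logistic approximation of the standard normal CDF: Phi(x) ~ sigmoid(1.702 * x).
+     return torch.sigmoid(1.702 * x)
+
+ def gaussian_window_integral(mu, sigma, lo, hi):
+     # Integral of N(mu, sigma^2) over [lo, hi] via CDF subtraction, i.e. the
+     # per-axis response of a pixel window instead of a single point sample.
+     return normal_cdf_approx((hi - mu) / sigma) - normal_cdf_approx((lo - mu) / sigma)
+ </pre>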
+
+ comment: 29 pages +
+
+
+
+
+ + ♻ ☆ MLLMReID: Multimodal Large Language Model-based Person Re-identification + + +
+ Multimodal large language models (MLLMs) have achieved satisfactory results
+in many tasks. However, their performance in the task of person
+re-identification (ReID) has not been explored to date. This paper investigates
+how to adapt them to the task of ReID. An intuitive idea is to fine-tune MLLMs
+with ReID image-text datasets, and then use their visual encoder as a backbone
+for ReID. However, there still exist two apparent issues: (1) When designing
+instructions for ReID, MLLMs may overfit specific instructions, and designing a
+variety of instructions will lead to higher costs. (2) Latent image feature
+vectors from LLMs are not involved in loss computation. Instructional learning
+that aligns image-text features results in indirect optimization and a learning
+objective that inadequately utilizes features, limiting effectiveness in person
+feature learning. To address these problems, this paper proposes MLLMReID:
+Multimodal Large Language Model-based ReID. Firstly, we propose Common
+Instruction, a simple approach that leverages the inherent ability of LLMs to
+continue writing, avoiding complex and diverse instruction design. Secondly, we
+propose DirectReID, which effectively employs the latent image feature vectors
+output by LLMs in ReID tasks. The experimental results demonstrate the
+superiority of our method. We will open-source the code on GitHub.
+
+
+
+
+
+
+ + ♻ ☆ 3D Open-Vocabulary Panoptic Segmentation with 2D-3D Vision-Language + Distillation + + +
+ 3D panoptic segmentation is a challenging perception task, especially in +autonomous driving. It aims to predict both semantic and instance annotations +for 3D points in a scene. Although prior 3D panoptic segmentation approaches +have achieved great performance on closed-set benchmarks, generalizing these +approaches to unseen things and unseen stuff categories remains an open +problem. For unseen object categories, 2D open-vocabulary segmentation has +achieved promising results that solely rely on frozen CLIP backbones and +ensembling multiple classification outputs. However, we find that simply +extending these 2D models to 3D does not guarantee good performance due to poor +per-mask classification quality, especially for novel stuff categories. In this +paper, we propose the first method to tackle 3D open-vocabulary panoptic +segmentation. Our model takes advantage of the fusion between learnable LiDAR +features and dense frozen vision CLIP features, using a single classification +head to make predictions for both base and novel classes. To further improve +the classification performance on novel classes and leverage the CLIP model, we +propose two novel loss functions: object-level distillation loss and +voxel-level distillation loss. Our experiments on the nuScenes and +SemanticKITTI datasets show that our method outperforms the strong baseline by +a large margin. + +
+
+
+
+
+ + ♻ ☆ On-Device Training Under 256KB Memory NeurIPS 2022 + + +
+ On-device training enables the model to adapt to new data collected from the +sensors by fine-tuning a pre-trained model. Users can benefit from customized +AI models without having to transfer the data to the cloud, protecting the +privacy. However, the training memory consumption is prohibitive for IoT +devices that have tiny memory resources. We propose an algorithm-system +co-design framework to make on-device training possible with only 256KB of +memory. On-device training faces two unique challenges: (1) the quantized +graphs of neural networks are hard to optimize due to low bit-precision and the +lack of normalization; (2) the limited hardware resource does not allow full +back-propagation. To cope with the optimization difficulty, we propose +Quantization-Aware Scaling to calibrate the gradient scales and stabilize 8-bit +quantized training. To reduce the memory footprint, we propose Sparse Update to +skip the gradient computation of less important layers and sub-tensors. The +algorithm innovation is implemented by a lightweight training system, Tiny +Training Engine, which prunes the backward computation graph to support sparse +updates and offload the runtime auto-differentiation to compile time. Our +framework is the first solution to enable tiny on-device training of +convolutional neural networks under 256KB SRAM and 1MB Flash without auxiliary +memory, using less than 1/1000 of the memory of PyTorch and TensorFlow while +matching the accuracy on tinyML application VWW. Our study enables IoT devices +not only to perform inference but also to continuously adapt to new data for +on-device lifelong learning. A video demo can be found here: +https://youtu.be/0pUFZYdoMY8. + +
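+ The sparse-update idea can be mimicked in a few lines by freezing everything except a chosen subset of tensors; which layers and sub-tensors to keep trainable is the part the framework derives offline, and here it is simply passed in as a list of name fragments (an assumption about the interface, not the actual Tiny Training Engine API).
+ <pre>
+ import torch.nn as nn
+
+ def apply_sparse_update(model, trainable_keys):
+     # Freeze all parameters, then re-enable gradients only where the parameter
+     # name matches one of `trainable_keys` (e.g., a few late layers and biases).
+     for name, param in model.named_parameters():
+         param.requires_grad = any(key in name for key in trainable_keys)
+     n_train = sum(p.numel() for p in model.parameters() if p.requires_grad)
+     n_total = sum(p.numel() for p in model.parameters())
+     print(f"updating {n_train}/{n_total} parameters ({100.0 * n_train / n_total:.2f}%)")
+     return model
+ </pre>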
+
+ comment: NeurIPS 2022 +
+
+
+
+
+ + ♻ ☆ Burst Super-Resolution with Diffusion Models for Improving Perceptual + Quality IJCNN 2024 + + +
+ While burst LR images are useful for improving the SR image quality compared +with a single LR image, prior SR networks accepting the burst LR images are +trained in a deterministic manner, which is known to produce a blurry SR image. +In addition, it is difficult to perfectly align the burst LR images, making the +SR image more blurry. Since such blurry images are perceptually degraded, we +aim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity +images can be reconstructed by diffusion models. However, prior SR methods +using the diffusion model are not properly optimized for the burst SR task. +Specifically, the reverse process starting from a random sample is not +optimized for image enhancement and restoration methods, including burst SR. In +our proposed method, on the other hand, burst LR features are used to +reconstruct the initial burst SR image that is fed into an intermediate step in +the diffusion model. This reverse process from the intermediate step 1) skips +diffusion steps for reconstructing the global structure of the image and 2) +focuses on steps for refining detailed textures. Our experimental results +demonstrate that our method can improve the scores of the perceptual quality +metrics. Code: https://github.com/placerkyo/BSRD + +
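+ The idea of entering the reverse process midway can be sketched as follows: noise the initial burst-SR estimate to an intermediate timestep and run only the remaining denoising steps. The noise-schedule access, the step index, and the `denoise_step` interface are assumptions, not the paper's implementation.
+ <pre>
+ import torch
+
+ def refine_from_intermediate(initial_sr, alphas_cumprod, denoise_step, start_t=250):
+     # Forward-noise the initial burst SR estimate to timestep start_t
+     # (x_t = sqrt(a_bar_t) * x_0 + sqrt(1 - a_bar_t) * eps), then run only the
+     # remaining reverse steps so diffusion refines textures rather than
+     # re-creating the global structure.
+     a_bar = alphas_cumprod[start_t]
+     x = a_bar.sqrt() * initial_sr + (1.0 - a_bar).sqrt() * torch.randn_like(initial_sr)
+     for t in range(start_t, -1, -1):
+         x = denoise_step(x, t)         # one reverse step of the pre-trained diffusion model
+     return x
+ </pre>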
+
+ comment: Accepted to IJCNN 2024 (International Joint Conference on Neural + Networks) +
+
+
+
+
+ + ♻ ☆ MI-NeRF: Learning a Single Face NeRF from Multiple Identities + + +
+ In this work, we introduce a method that learns a single dynamic neural +radiance field (NeRF) from monocular talking face videos of multiple +identities. NeRFs have shown remarkable results in modeling the 4D dynamics and +appearance of human faces. However, they require per-identity optimization. +Although recent approaches have proposed techniques to reduce the training and +rendering time, increasing the number of identities can be expensive. We +introduce MI-NeRF (multi-identity NeRF), a single unified network that models +complex non-rigid facial motion for multiple identities, using only monocular +videos of arbitrary length. The core premise in our method is to learn the +non-linear interactions between identity and non-identity specific information +with a multiplicative module. By training on multiple videos simultaneously, +MI-NeRF not only reduces the total training time compared to standard +single-identity NeRFs, but also demonstrates robustness in synthesizing novel +expressions for any input identity. We present results for both facial +expression transfer and talking face video synthesis. Our method can be further +personalized for a target identity given only a short video. + +
+
+ comment: Project page: https://aggelinacha.github.io/MI-NeRF/ +
+
+
+
+
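One plausible reading of the multiplicative module above is an element-wise (bilinear-style) interaction between an identity code and a per-frame expression code. The sketch below is an assumption about that design, not the released MI-NeRF code; the dimensions and the exact fusion are hypothetical.

```python
import torch
import torch.nn as nn

class MultiplicativeModule(nn.Module):
    """Hypothetical bilinear-style fusion of identity and expression codes."""
    def __init__(self, id_dim=32, expr_dim=32, out_dim=64):
        super().__init__()
        self.proj_id = nn.Linear(id_dim, out_dim)
        self.proj_expr = nn.Linear(expr_dim, out_dim)
        self.proj_joint = nn.Linear(out_dim, out_dim)

    def forward(self, z_id, z_expr):
        # The element-wise product captures non-linear identity/expression
        # interactions; the additive terms keep the purely identity- or
        # expression-driven parts of the code.
        joint = self.proj_id(z_id) * self.proj_expr(z_expr)
        return self.proj_joint(joint) + self.proj_id(z_id) + self.proj_expr(z_expr)

module = MultiplicativeModule()
code = module(torch.randn(4, 32), torch.randn(4, 32))  # shape (4, 64)
```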
+ + ♻ ☆ HAC: Hash-grid Assisted Context for 3D Gaussian Splatting Compression + + +
+ 3D Gaussian Splatting (3DGS) has emerged as a promising framework for novel
+view synthesis, boasting rapid rendering speed with high fidelity. However, the
+large number of Gaussians and their associated attributes necessitates
+effective compression techniques. Nevertheless, the sparse and unorganized
+nature of the point cloud of Gaussians (or anchors in our paper) presents
+challenges for compression. To address this, we make use of the relations
+between the unorganized anchors and the structured hash grid, leveraging their
+mutual information for context modeling, and propose a Hash-grid Assisted
+Context (HAC) framework for highly compact 3DGS representation. Our approach
+introduces a binary hash grid to establish continuous spatial consistencies,
+allowing us to unveil the inherent spatial relations of anchors through a
+carefully designed context model. To facilitate entropy coding, we utilize
+Gaussian distributions to accurately estimate the probability of each quantized
+attribute, where an adaptive quantization module is proposed to enable
+high-precision quantization of these attributes for improved fidelity
+restoration. Additionally, we incorporate an adaptive masking strategy to
+eliminate invalid Gaussians and anchors. Importantly, our work is the first to
+explore context-based compression for the 3DGS representation, resulting in a
+remarkable size reduction of over $75\times$ compared to vanilla 3DGS, while
+simultaneously improving fidelity, and achieving over $11\times$ size reduction
+over the SOTA 3DGS compression approach Scaffold-GS. Our code is available
+here: https://github.com/YihangChen-ee/HAC
+
+
+ comment: Project Page: https://yihangchen-ee.github.io/project_hac/ Code: + https://github.com/YihangChen-ee/HAC +
+
+
+
+

+ + ♻ ☆ Causal Intervention for Subject-Deconfounded Facial Action Unit + Recognition AAAI2022 + + +
+ Subject-invariant facial action unit (AU) recognition remains challenging for +the reason that the data distribution varies among subjects. In this paper, we +propose a causal inference framework for subject-invariant facial action unit +recognition. To illustrate the causal effect existing in AU recognition task, +we formulate the causalities among facial images, subjects, latent AU semantic +relations, and estimated AU occurrence probabilities via a structural causal +model. By constructing such a causal diagram, we clarify the causal effect +among variables and propose a plug-in causal intervention module, CIS, to +deconfound the confounder \emph{Subject} in the causal diagram. Extensive +experiments conducted on two commonly used AU benchmark datasets, BP4D and +DISFA, show the effectiveness of our CIS, and the model with CIS inserted, +CISNet, has achieved state-of-the-art performance. + +
+
+ comment: Accepted by AAAI2022 +
+
+
+
+
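Deconfounding a Subject variable in a structural causal model typically amounts to backdoor adjustment: condition the prediction on every value of the confounder and average with its prior. The NumPy toy below illustrates that adjustment for AU occurrence probabilities; it is not the paper's CIS module, and the subject prototypes, prior, and classifier are all hypothetical stand-ins.

```python
import numpy as np

rng = np.random.default_rng(0)
num_subjects, num_aus, feat_dim = 5, 12, 16

# Hypothetical per-subject confounder prototypes and their prior probabilities.
subject_prototypes = rng.normal(size=(num_subjects, feat_dim))
subject_prior = np.full(num_subjects, 1.0 / num_subjects)

W = rng.normal(size=(feat_dim, num_aus)) * 0.1   # stand-in AU classifier weights

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def au_probs_do(x):
    """P(AU | do(image)) ~= sum_s P(AU | image, subject=s) P(s):
    condition on every subject prototype and average with the prior."""
    logits = (x[None, :] + subject_prototypes) @ W          # (num_subjects, num_aus)
    return (sigmoid(logits) * subject_prior[:, None]).sum(axis=0)

x = rng.normal(size=feat_dim)    # an image feature
print(au_probs_do(x))            # deconfounded AU occurrence probabilities
```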
+ + ♻ ☆ Improved Zero-Shot Classification by Adapting VLMs with Text + Descriptions + + +
+ The zero-shot performance of existing vision-language models (VLMs) such as +CLIP is limited by the availability of large-scale, aligned image and text +datasets in specific domains. In this work, we leverage two complementary +sources of information -- descriptions of categories generated by large +language models (LLMs) and abundant, fine-grained image classification datasets +-- to improve the zero-shot classification performance of VLMs across +fine-grained domains. On the technical side, we develop methods to train VLMs +with this "bag-level" image-text supervision. We find that simply using these +attributes at test-time does not improve performance, but our training +strategy, for example, on the iNaturalist dataset, leads to an average +improvement of 4-5% in zero-shot classification accuracy for novel categories +of birds and flowers. Similar improvements are observed in domains where a +subset of the categories was used to fine-tune the model. By prompting LLMs in +various ways, we generate descriptions that capture visual appearance, habitat, +and geographic regions and pair them with existing attributes such as the +taxonomic structure of the categories. We systematically evaluate their ability +to improve zero-shot categorization in natural domains. Our findings suggest +that geographic priors can be just as effective and are complementary to visual +appearance. Our method also outperforms prior work on prompt-based tuning of +VLMs. We release the benchmark, consisting of 14 datasets at +https://github.com/cvl-umass/AdaptCLIPZS , which will contribute to future +research in zero-shot recognition. + +
+
+
+
+
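The general recipe of classifying with LLM-generated category descriptions can be sketched as a prompt ensemble: embed several descriptions per class with the VLM text encoder, average them into a classifier weight, and compare image embeddings against those weights. The sketch below uses placeholder encoders and made-up class descriptions; the paper's bag-level training objective is not reproduced here.

```python
import torch
import torch.nn.functional as F

def encode_text(strings):
    # Placeholder for a VLM text encoder (e.g., CLIP's); returns unit-norm embeddings.
    torch.manual_seed(abs(hash(tuple(strings))) % (2 ** 31))
    return F.normalize(torch.randn(len(strings), 512), dim=-1)

def encode_image(images):
    # Placeholder for the matching image encoder.
    return F.normalize(torch.randn(images.shape[0], 512), dim=-1)

# Hypothetical LLM-generated descriptions covering appearance, habitat, geography.
class_descriptions = {
    "Baltimore oriole": [
        "a photo of a Baltimore oriole, a bird with bright orange underparts",
        "a photo of a Baltimore oriole, which nests in open woodland in eastern North America",
    ],
    "California poppy": [
        "a photo of a California poppy, a flower with silky orange petals",
        "a photo of a California poppy, which grows in open grassy areas on the US west coast",
    ],
}

# One classifier weight per class: the averaged embedding of its descriptions.
weights = torch.stack([
    F.normalize(encode_text(d).mean(dim=0), dim=-1)
    for d in class_descriptions.values()
])

images = torch.randn(4, 3, 224, 224)
logits = encode_image(images) @ weights.T      # cosine similarities
pred = logits.argmax(dim=-1)                   # zero-shot class indices
```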
+ + ♻ ☆ Cooperative Students: Navigating Unsupervised Domain Adaptation in + Nighttime Object Detection + + +
+ Unsupervised Domain Adaptation (UDA) has shown significant advancements in +object detection under well-lit conditions; however, its performance degrades +notably in low-visibility scenarios, especially at night, posing challenges not +only for its adaptability in low signal-to-noise ratio (SNR) conditions but +also for the reliability and efficiency of automated vehicles. To address this +problem, we propose a \textbf{Co}operative \textbf{S}tudents (\textbf{CoS}) +framework that innovatively employs global-local transformations (GLT) and a +proxy-based target consistency (PTC) mechanism to capture the spatial +consistency in day- and night-time scenarios effectively, and thus bridge the +significant domain shift across contexts. Building upon this, we further devise +an adaptive IoU-informed thresholding (AIT) module to gradually avoid +overlooking potential true positives and enrich the latent information in the +target domain. Comprehensive experiments show that CoS essentially enhanced UDA +performance in low-visibility conditions and surpasses current state-of-the-art +techniques, achieving an increase in mAP of 3.0\%, 1.9\%, and 2.5\% on BDD100K, +SHIFT, and ACDC datasets, respectively. Code is available at +https://github.com/jichengyuan/Cooperitive_Students. + +
+
+ comment: Code is available at + https://github.com/jichengyuan/Cooperitive_Students +
+
+
+
+
+ + ♻ ☆ Mirasol3B: A Multimodal Autoregressive model for time-aligned and + contextual modalities CVPR 2024 + + +
+ One of the main challenges of multimodal learning is the need to combine +heterogeneous modalities (e.g., video, audio, text). For example, video and +audio are obtained at much higher rates than text and are roughly aligned in +time. They are often not synchronized with text, which comes as a global +context, e.g., a title, or a description. Furthermore, video and audio inputs +are of much larger volumes, and grow as the video length increases, which +naturally requires more compute dedicated to these modalities and makes +modeling of long-range dependencies harder. + We here decouple the multimodal modeling, dividing it into separate, focused +autoregressive models, processing the inputs according to the characteristics +of the modalities. We propose a multimodal model, called Mirasol3B, consisting +of an autoregressive component for the time-synchronized modalities (audio and +video), and an autoregressive component for the context modalities which are +not necessarily aligned in time but are still sequential. To address the +long-sequences of the video-audio inputs, we propose to further partition the +video and audio sequences in consecutive snippets and autoregressively process +their representations. To that end, we propose a Combiner mechanism, which +models the audio-video information jointly within a timeframe. The Combiner +learns to extract audio and video features from raw spatio-temporal signals, +and then learns to fuse these features producing compact but expressive +representations per snippet. + Our approach achieves the state-of-the-art on well established multimodal +benchmarks, outperforming much larger models. It effectively addresses the high +computational demand of media inputs by both learning compact representations, +controlling the sequence length of the audio-video feature representations, and +modeling their dependencies in time. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ ViTamin: Designing Scalable Vision Models in the Vision-Language Era CVPR 2024 + + +
+ Recent breakthroughs in vision-language models (VLMs) start a new page in the +vision community. The VLMs provide stronger and more generalizable feature +embeddings compared to those from ImageNet-pretrained models, thanks to the +training on the large-scale Internet image-text pairs. However, despite the +amazing achievement from the VLMs, vanilla Vision Transformers (ViTs) remain +the default choice for the image encoder. Although pure transformer proves its +effectiveness in the text encoding area, it remains questionable whether it is +also the case for image encoding, especially considering that various types of +networks are proposed on the ImageNet benchmark, which, unfortunately, are +rarely studied in VLMs. Due to small data/model scale, the original conclusions +of model design on ImageNet can be limited and biased. In this paper, we aim at +building an evaluation protocol of vision models in the vision-language era +under the contrastive language-image pretraining (CLIP) framework. We provide a +comprehensive way to benchmark different vision models, covering their +zero-shot performance and scalability in both model and training data sizes. To +this end, we introduce ViTamin, a new vision models tailored for VLMs. +ViTamin-L significantly outperforms ViT-L by 2.0% ImageNet zero-shot accuracy, +when using the same publicly available DataComp-1B dataset and the same +OpenCLIP training scheme. ViTamin-L presents promising results on 60 diverse +benchmarks, including classification, retrieval, open-vocabulary detection and +segmentation, and large multi-modal models. When further scaling up the model +size, our ViTamin-XL with only 436M parameters attains 82.9% ImageNet zero-shot +accuracy, surpassing 82.0% achieved by EVA-E that has ten times more parameters +(4.4B). + +
+
+ comment: CVPR 2024; https://github.com/Beckschen/ViTamin +
+
+
+
+
+ + ♻ ☆ Generating Images with 3D Annotations Using Diffusion Models ICLR 2024 + + +
+ Diffusion models have emerged as a powerful generative method, capable of +producing stunning photo-realistic images from natural language descriptions. +However, these models lack explicit control over the 3D structure in the +generated images. Consequently, this hinders our ability to obtain detailed 3D +annotations for the generated images or to craft instances with specific poses +and distances. In this paper, we propose 3D Diffusion Style Transfer (3D-DST), +which incorporates 3D geometry control into diffusion models. Our method +exploits ControlNet, which extends diffusion models by using visual prompts in +addition to text prompts. We generate images of the 3D objects taken from 3D +shape repositories (e.g., ShapeNet and Objaverse), render them from a variety +of poses and viewing directions, compute the edge maps of the rendered images, +and use these edge maps as visual prompts to generate realistic images. With +explicit 3D geometry control, we can easily change the 3D structures of the +objects in the generated images and obtain ground-truth 3D annotations +automatically. This allows us to improve a wide range of vision tasks, e.g., +classification and 3D pose estimation, in both in-distribution (ID) and +out-of-distribution (OOD) settings. We demonstrate the effectiveness of our +method through extensive experiments on ImageNet-100/200, ImageNet-R, +PASCAL3D+, ObjectNet3D, and OOD-CV. The results show that our method +significantly outperforms existing methods, e.g., 3.8 percentage points on +ImageNet-100 using DeiT-B. + +
+
+ comment: ICLR 2024 Spotlight. Code: https://ccvl.jhu.edu/3D-DST/ +
+
+
+
+
+ + ♻ ☆ Effective Adapter for Face Recognition in the Wild + + +
+ In this paper, we tackle the challenge of face recognition in the wild, where
+images often suffer from low quality and real-world distortions. Traditional
+heuristic approaches, either training models directly on these degraded images
+or on their enhanced counterparts produced by face restoration techniques, have
+proven ineffective, primarily due to the degradation of facial features and the
+discrepancy in image domains. To overcome these issues, we propose an effective
+adapter for augmenting existing face recognition models trained on high-quality
+facial datasets. The key to our adapter is to process both the unrefined and
+the enhanced images using two similar structures, one fixed and the other
+trainable. This design confers two benefits. First, the dual-input system
+minimizes the domain gap while providing varied perspectives for the face
+recognition model, where the enhanced image can be regarded as a complex
+non-linear transformation of the original one by the restoration model. Second,
+both structures can be initialized from the pre-trained models without
+discarding past knowledge. Extensive experiments in zero-shot settings show the
+effectiveness of our method, which surpasses baselines by about 3%, 4%, and 7%
+on three datasets. Our code will be publicly available.
+
+
+
+
+
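A minimal sketch of the dual-branch idea above: two similar structures initialized from the same pre-trained weights, one frozen for the unrefined image and one trainable for the restoration-enhanced image. The fusion by summation and the tiny stand-in block are assumptions, not the authors' architecture.

```python
import copy
import torch
import torch.nn as nn

def build_adapter_pair(pretrained_block: nn.Module):
    """Two similar structures initialized from the same pre-trained weights:
    a frozen branch for the raw low-quality image and a trainable branch for
    the restoration-enhanced image."""
    frozen = copy.deepcopy(pretrained_block)
    trainable = copy.deepcopy(pretrained_block)
    for p in frozen.parameters():
        p.requires_grad = False
    return frozen, trainable

# A small stand-in for one block of a face recognition backbone.
block = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU())
frozen, trainable = build_adapter_pair(block)

lq_face = torch.randn(2, 3, 112, 112)        # degraded input
enhanced_face = torch.randn(2, 3, 112, 112)  # output of a face restoration model
fused = frozen(lq_face) + trainable(enhanced_face)   # simple sum as the fusion
```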
+ + ♻ ☆ Hybrid Video Diffusion Models with 2D Triplane and 3D Wavelet + Representation + + +
+ Generating high-quality videos that synthesize desired realistic content is a
+challenging task due to the intricate high dimensionality and complexity of
+videos. Several recent diffusion-based methods have shown comparable
+performance by compressing videos to a lower-dimensional latent space, using a
+traditional video autoencoder architecture. However, such methods, which employ
+standard frame-wise 2D and 3D convolutions, fail to fully exploit the
+spatio-temporal nature of videos. To address this issue, we propose a novel
+hybrid video diffusion model, called HVDM, which can capture spatio-temporal
+dependencies more effectively. HVDM is trained with a hybrid video autoencoder
+which extracts a disentangled representation of the video, including: (i)
+global context information captured by a 2D projected latent, (ii) local volume
+information captured by 3D convolutions with wavelet decomposition, and (iii)
+frequency information for improving the video reconstruction. Based on this
+disentangled representation, our hybrid autoencoder provides a more
+comprehensive video latent, enriching the generated videos with fine structures
+and details. Experiments on video generation benchmarks (UCF101, SkyTimelapse,
+and TaiChi) demonstrate that the proposed approach achieves state-of-the-art
+video generation quality and supports a wide range of video applications
+(e.g., long video generation, image-to-video, and video dynamics control).
+
+
+ comment: Project page is available at https://hxngiee.github.io/HVDM/ +
+
+
+
+
+ + ♻ ☆ MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning + + +
+ Tiny deep learning on microcontroller units (MCUs) is challenging due to the +limited memory size. We find that the memory bottleneck is due to the +imbalanced memory distribution in convolutional neural network (CNN) designs: +the first several blocks have an order of magnitude larger memory usage than +the rest of the network. To alleviate this issue, we propose a generic +patch-by-patch inference scheduling, which operates only on a small spatial +region of the feature map and significantly cuts down the peak memory. However, +naive implementation brings overlapping patches and computation overhead. We +further propose network redistribution to shift the receptive field and FLOPs +to the later stage and reduce the computation overhead. Manually redistributing +the receptive field is difficult. We automate the process with neural +architecture search to jointly optimize the neural architecture and inference +scheduling, leading to MCUNetV2. Patch-based inference effectively reduces the +peak memory usage of existing networks by 4-8x. Co-designed with neural +networks, MCUNetV2 sets a record ImageNet accuracy on MCU (71.8%), and achieves +>90% accuracy on the visual wake words dataset under only 32kB SRAM. MCUNetV2 +also unblocks object detection on tiny devices, achieving 16.9% higher mAP on +Pascal VOC compared to the state-of-the-art result. Our study largely addressed +the memory bottleneck in tinyML and paved the way for various vision +applications beyond image classification. + +
+
+
+
+
+
+
+
+
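Patch-by-patch inference as described above can be sketched by running only the memory-heavy early stage on overlapping spatial crops and stitching the results before the later stage sees the full feature map. The toy network, patch count, and halo width below are assumptions; MCUNetV2 additionally redistributes receptive fields via architecture search, which is not shown.

```python
import torch
import torch.nn as nn

stage1 = nn.Sequential(                      # memory-heavy early blocks (stride 2)
    nn.Conv2d(3, 16, 3, stride=2, padding=1), nn.ReLU(),
)
stage2 = nn.Sequential(                      # later blocks, run on the full map
    nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
    nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(32, 10),
)

def patch_based_stage1(x, num_patches=2, halo=2):
    """Run stage1 on spatial patches (with a small overlapping halo) so that
    only one patch's activations are alive at a time, cutting peak memory."""
    _, _, H, W = x.shape
    ph, pw = H // num_patches, W // num_patches
    rows = []
    for i in range(num_patches):
        cols = []
        for j in range(num_patches):
            t, l = i * ph, j * pw
            crop = x[:, :, max(t - halo, 0): min(t + ph + halo, H),
                           max(l - halo, 0): min(l + pw + halo, W)]
            out = stage1(crop)
            # Drop the halo region (the stride-2 conv halves the halo width).
            ht = (t - max(t - halo, 0)) // 2
            hl = (l - max(l - halo, 0)) // 2
            cols.append(out[:, :, ht: ht + ph // 2, hl: hl + pw // 2])
        rows.append(torch.cat(cols, dim=3))
    return torch.cat(rows, dim=2)

x = torch.randn(1, 3, 64, 64)
features = patch_based_stage1(x)            # same shape as stage1(x)
logits = stage2(features)
```

For this toy stride-2 stage, a halo of two input pixels is enough for the stitched map to match the full-image computation exactly; deeper first stages need wider halos, which is the overlap overhead the paper's redistribution step reduces.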
+ +
+
+
+ + Computer Vision and Pattern Recognition 235 + +
+
+
+ + ☆ Segment Any 3D Object with Language + + +
+ In this paper, we investigate Open-Vocabulary 3D Instance Segmentation +(OV-3DIS) with free-form language instructions. Earlier works that rely on only +annotated base categories for training suffer from limited generalization to +unseen novel categories. Recent works mitigate poor generalizability to novel +categories by generating class-agnostic masks or projecting generalized masks +from 2D to 3D, but disregard semantic or geometry information, leading to +sub-optimal performance. Instead, generating generalizable but semantic-related +masks directly from 3D point clouds would result in superior outcomes. In this +paper, we introduce Segment any 3D Object with LanguagE (SOLE), which is a +semantic and geometric-aware visual-language learning framework with strong +generalizability by generating semantic-related masks directly from 3D point +clouds. Specifically, we propose a multimodal fusion network to incorporate +multimodal semantics in both backbone and decoder. In addition, to align the 3D +segmentation model with various language instructions and enhance the mask +quality, we introduce three types of multimodal associations as supervision. +Our SOLE outperforms previous methods by a large margin on ScanNetv2, +ScanNet200, and Replica benchmarks, and the results are even close to the +fully-supervised counterpart despite the absence of class annotations in the +training. Furthermore, extensive qualitative results demonstrate the +versatility of our SOLE to language instructions. + +
+
+ comment: Project Page: https://cvrp-sole.github.io +
+
+
+
+
+ + ☆ Alpha Invariance: On Inverse Scaling Between Distance and Volume Density + in Neural Radiance Fields CVPR 2024 + + +
+ Scale-ambiguity in 3D scene dimensions leads to magnitude-ambiguity of +volumetric densities in neural radiance fields, i.e., the densities double when +scene size is halved, and vice versa. We call this property alpha invariance. +For NeRFs to better maintain alpha invariance, we recommend 1) parameterizing +both distance and volume densities in log space, and 2) a +discretization-agnostic initialization strategy to guarantee high ray +transmittance. We revisit a few popular radiance field models and find that +these systems use various heuristics to deal with issues arising from scene +scaling. We test their behaviors and show our recipe to be more robust. + +
+
+ comment: CVPR 2024. project page https://pals.ttic.edu/p/alpha-invariance +
+
+
+
+
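The invariance in question is easy to state concretely: the per-segment opacity alpha = 1 - exp(-sigma * delta) is unchanged when distances are divided by a scale factor and densities are multiplied by it, and a log-space parameterization makes the two shifts cancel exactly. The snippet below only demonstrates that property; the paper's full recipe (including the initialization strategy) is not reproduced.

```python
import torch

def alpha_from_logs(log_sigma, log_delta):
    """Opacity of a ray segment: alpha = 1 - exp(-sigma * delta),
    computed from log-space density and log-space segment length."""
    return 1.0 - torch.exp(-torch.exp(log_sigma + log_delta))

log_sigma = torch.tensor(1.3)      # log volume density
log_delta = torch.tensor(-2.0)     # log segment length

# Halving the scene scale divides distances by 2 and, for alpha invariance,
# multiplies densities by 2: in log space the two shifts cancel exactly.
s = torch.log(torch.tensor(2.0))
alpha_original = alpha_from_logs(log_sigma, log_delta)
alpha_rescaled = alpha_from_logs(log_sigma + s, log_delta - s)
assert torch.allclose(alpha_original, alpha_rescaled)
```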
+ + ☆ Dynamic Pre-training: Towards Efficient and Scalable All-in-One Image + Restoration + + +
+ All-in-one image restoration tackles different types of degradations with a +unified model instead of having task-specific, non-generic models for each +degradation. The requirement to tackle multiple degradations using the same +model can lead to high-complexity designs with fixed configuration that lack +the adaptability to more efficient alternatives. We propose DyNet, a dynamic +family of networks designed in an encoder-decoder style for all-in-one image +restoration tasks. Our DyNet can seamlessly switch between its bulkier and +lightweight variants, thereby offering flexibility for efficient model +deployment with a single round of training. This seamless switching is enabled +by our weights-sharing mechanism, forming the core of our architecture and +facilitating the reuse of initialized module weights. Further, to establish +robust weights initialization, we introduce a dynamic pre-training strategy +that trains variants of the proposed DyNet concurrently, thereby achieving a +50% reduction in GPU hours. To tackle the unavailability of large-scale dataset +required in pre-training, we curate a high-quality, high-resolution image +dataset named Million-IRD having 2M image samples. We validate our DyNet for +image denoising, deraining, and dehazing in all-in-one setting, achieving +state-of-the-art results with 31.34% reduction in GFlops and a 56.75% reduction +in parameters compared to baseline models. The source codes and trained models +are available at https://github.com/akshaydudhane16/DyNet. + +
+
+
+
+
+ + ☆ GeneAvatar: Generic Expression-Aware Volumetric Head Avatar Editing from + a Single Image CVPR 2024 + + +
+ Recently, we have witnessed the explosive growth of various volumetric
+representations in modeling animatable head avatars. However, due to the
+diversity of frameworks, there is no practical method to support high-level
+applications like 3D head avatar editing across different representations. In
+this paper, we propose a generic avatar editing approach that can be
+universally applied to various 3DMM-driven volumetric head avatars. To achieve
+this goal, we design a novel expression-aware modification generative model,
+which lifts 2D editing from a single image to a consistent 3D modification
+field. To ensure the effectiveness of the generative modification process, we
+develop several techniques, including an expression-dependent modification
+distillation scheme to draw knowledge from the large-scale head avatar model
+and 2D facial texture editing tools, implicit latent space guidance to enhance
+model convergence, and a segmentation-based loss reweight strategy for
+fine-grained texture inversion. Extensive experiments demonstrate that our
+method delivers high-quality and consistent results across multiple
+expressions and viewpoints. Project page: https://zju3dv.github.io/geneavatar/
+
+
+ comment: Accepted to CVPR 2024. Project page: + https://zju3dv.github.io/geneavatar/ +
+
+
+
+
+ + ☆ Diffusion$^2$: Dynamic 3D Content Generation via Score Composition of + Orthogonal Diffusion Models + + +
+ Recent advancements in 3D generation are predominantly propelled by +improvements in 3D-aware image diffusion models which are pretrained on +Internet-scale image data and fine-tuned on massive 3D data, offering the +capability of producing highly consistent multi-view images. However, due to +the scarcity of synchronized multi-view video data, it is impractical to adapt +this paradigm to 4D generation directly. Despite that, the available video and +3D data are adequate for training video and multi-view diffusion models that +can provide satisfactory dynamic and geometric priors respectively. In this +paper, we present Diffusion$^2$, a novel framework for dynamic 3D content +creation that leverages the knowledge about geometric consistency and temporal +smoothness from these models to directly sample dense multi-view and +multi-frame images which can be employed to optimize continuous 4D +representation. Specifically, we design a simple yet effective denoising +strategy via score composition of video and multi-view diffusion models based +on the probability structure of the images to be generated. Owing to the high +parallelism of the image generation and the efficiency of the modern 4D +reconstruction pipeline, our framework can generate 4D content within few +minutes. Furthermore, our method circumvents the reliance on 4D data, thereby +having the potential to benefit from the scalability of the foundation video +and multi-view diffusion models. Extensive experiments demonstrate the efficacy +of our proposed framework and its capability to flexibly adapt to various types +of prompts. + +
+
+ comment: Technical Report +
+
+
+
+
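The score-composition idea above can be sketched as a weighted combination of the noise predictions from a video diffusion model (applied along the frame axis) and a multi-view diffusion model (applied along the view axis) on a view-by-frame image grid. Both predictors below are placeholders and the equal weighting is an assumption, not the paper's exact formulation.

```python
import torch

def video_eps(x, t):
    # Placeholder: noise prediction from a video diffusion model, which would be
    # applied along the frame axis (temporal smoothness prior).
    return torch.zeros_like(x)

def multiview_eps(x, t):
    # Placeholder: noise prediction from a multi-view diffusion model, which
    # would be applied along the view axis (geometric consistency prior).
    return torch.zeros_like(x)

def composed_eps(x, t, w_video=0.5, w_view=0.5):
    """Score composition: the denoising direction for the (view x frame) image
    grid is a weighted combination of both models' predictions."""
    return w_video * video_eps(x, t) + w_view * multiview_eps(x, t)

# x holds an image grid of shape (views, frames, C, H, W).
x = torch.randn(4, 8, 3, 64, 64)
eps = composed_eps(x, t=500)   # would be plugged into a standard reverse-diffusion step
```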
+ + ☆ Iterated Learning Improves Compositionality in Large Vision-Language + Models CVPR 2024 + + +
+ A fundamental characteristic common to both human vision and natural language
+is their compositional nature. Yet, despite the performance gains contributed
+by large vision and language pretraining, recent investigations find that most,
+if not all, of our state-of-the-art vision-language models struggle at
+compositionality. They are unable to distinguish between images of "a girl in
+white facing a man in black" and "a girl in black facing a man in white".
+Moreover, prior work suggests that compositionality doesn't arise with scale:
+larger model sizes or training data don't help. This paper develops a new
+iterated training algorithm that incentivizes compositionality. We draw on
+decades of cognitive science research that identifies cultural transmission,
+the need to teach a new generation, as a necessary inductive prior that
+incentivizes humans to develop compositional languages. Specifically, we
+reframe vision-language contrastive learning as the Lewis Signaling Game
+between a vision agent and a language agent, and operationalize cultural
+transmission by iteratively resetting one of the agents' weights during
+training. After every iteration, this training paradigm induces representations
+that become "easier to learn", a property of compositional languages: e.g., our
+model trained on CC3M and CC12M improves standard CLIP by 4.7% and 4.0%,
+respectively, on the SugarCrepe benchmark.
+
+
+ comment: CVPR 2024 +
+
+
+
+
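The operational core described above, periodically re-initializing one agent's weights during contrastive training, can be sketched with a toy CLIP-style loop. The encoders, data, and reset schedule below are hypothetical; the point is only to show where the "new generation" reset enters the training loop.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

vision_agent = nn.Linear(512, 128)      # stand-ins for the image / text encoders
language_agent = nn.Linear(512, 128)

def reinit(module):
    for m in module.modules():
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, std=0.02)
            nn.init.zeros_(m.bias)

def clip_loss(img_emb, txt_emb, temperature=0.07):
    img_emb, txt_emb = F.normalize(img_emb, dim=-1), F.normalize(txt_emb, dim=-1)
    logits = img_emb @ txt_emb.T / temperature
    labels = torch.arange(img_emb.shape[0])
    return (F.cross_entropy(logits, labels) + F.cross_entropy(logits.T, labels)) / 2

opt = torch.optim.Adam(
    list(vision_agent.parameters()) + list(language_agent.parameters()), lr=1e-4)

reset_every = 100                        # hypothetical "generation" length
for step in range(300):
    img_feat, txt_feat = torch.randn(32, 512), torch.randn(32, 512)   # dummy batch
    loss = clip_loss(vision_agent(img_feat), language_agent(txt_feat))
    opt.zero_grad()
    loss.backward()
    opt.step()
    if (step + 1) % reset_every == 0:
        # "Cultural transmission": a fresh listener generation must relearn the
        # code from the surviving agent, pressuring representations to stay learnable.
        reinit(language_agent)
```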
+ + ☆ ResNet with Integrated Convolutional Block Attention Module for Ship + Classification Using Transfer Learning on Optical Satellite Imagery + + +
+ This study proposes a novel transfer learning framework for effective ship +classification using high-resolution optical remote sensing satellite imagery. +The framework is based on the deep convolutional neural network model ResNet50 +and incorporates the Convolutional Block Attention Module (CBAM) to enhance +performance. CBAM enables the model to attend to salient features in the +images, allowing it to better discriminate between subtle differences between +ships and backgrounds. Furthermore, this study adopts a transfer learning +approach tailored for accurately classifying diverse types of ships by +fine-tuning a pre-trained model for the specific task. Experimental results +demonstrate the efficacy of the proposed framework in ship classification using +optical remote sensing imagery, achieving a high classification accuracy of 94% +across 5 classes, outperforming existing methods. This research holds potential +applications in maritime surveillance and management, illegal fishing +detection, and maritime traffic monitoring. + +
+
+
+
+
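For reference, the Convolutional Block Attention Module applies channel attention followed by spatial attention to a feature map. The sketch below follows the standard CBAM formulation (reduction ratio 16, 7x7 spatial kernel); how it is inserted into ResNet50 and fine-tuned for ship classification is specific to the paper and not shown.

```python
import torch
import torch.nn as nn

class CBAM(nn.Module):
    """Channel attention followed by spatial attention (standard CBAM design)."""
    def __init__(self, channels, reduction=16, spatial_kernel=7):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(channels, channels // reduction), nn.ReLU(),
            nn.Linear(channels // reduction, channels),
        )
        self.spatial = nn.Conv2d(2, 1, spatial_kernel, padding=spatial_kernel // 2)

    def forward(self, x):
        b, c, _, _ = x.shape
        # Channel attention from average- and max-pooled channel descriptors.
        avg = self.mlp(x.mean(dim=(2, 3)))
        mx = self.mlp(x.amax(dim=(2, 3)))
        x = x * torch.sigmoid(avg + mx).view(b, c, 1, 1)
        # Spatial attention from channel-wise average and max maps.
        smap = torch.cat([x.mean(dim=1, keepdim=True), x.amax(dim=1, keepdim=True)], dim=1)
        return x * torch.sigmoid(self.spatial(smap))

feat = torch.randn(2, 256, 14, 14)       # e.g. the output of a ResNet stage
out = CBAM(256)(feat)                     # same shape, attention-refined
```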
+ + ☆ ViTamin: Designing Scalable Vision Models in the Vision-Language Era CVPR 2024 + + +
+ Recent breakthroughs in vision-language models (VLMs) start a new page in the +vision community. The VLMs provide stronger and more generalizable feature +embeddings compared to those from ImageNet-pretrained models, thanks to the +training on the large-scale Internet image-text pairs. However, despite the +amazing achievement from the VLMs, vanilla Vision Transformers (ViTs) remain +the default choice for the image encoder. Although pure transformer proves its +effectiveness in the text encoding area, it remains questionable whether it is +also the case for image encoding, especially considering that various types of +networks are proposed on the ImageNet benchmark, which, unfortunately, are +rarely studied in VLMs. Due to small data/model scale, the original conclusions +of model design on ImageNet can be limited and biased. In this paper, we aim at +building an evaluation protocol of vision models in the vision-language era +under the contrastive language-image pretraining (CLIP) framework. We provide a +comprehensive way to benchmark different vision models, covering their +zero-shot performance and scalability in both model and training data sizes. To +this end, we introduce ViTamin, a new vision models tailored for VLMs. +ViTamin-L significantly outperforms ViT-L by 2.0% ImageNet zero-shot accuracy, +when using the same publicly available DataComp-1B dataset and the same +OpenCLIP training scheme. ViTamin-L presents promising results on 60 diverse +benchmarks, including classification, retrieval, open-vocabulary detection and +segmentation, and large multi-modal models. When further scaling up the model +size, our ViTamin-XL with only 436M parameters attains 82.9% ImageNet zero-shot +accuracy, surpassing 82.0% achieved by EVA-E that has ten times more parameters +(4.4B). + +
+
+ comment: CVPR 2024; https://github.com/Beckschen/ViTamin +
+
+
+
+
+ + ☆ 3D Congealing: 3D-Aware Image Alignment in the Wild + + +
+ We propose 3D Congealing, a novel problem of 3D-aware alignment for 2D images +capturing semantically similar objects. Given a collection of unlabeled +Internet images, our goal is to associate the shared semantic parts from the +inputs and aggregate the knowledge from 2D images to a shared 3D canonical +space. We introduce a general framework that tackles the task without assuming +shape templates, poses, or any camera parameters. At its core is a canonical 3D +representation that encapsulates geometric and semantic information. The +framework optimizes for the canonical representation together with the pose for +each input image, and a per-image coordinate map that warps 2D pixel +coordinates to the 3D canonical frame to account for the shape matching. The +optimization procedure fuses prior knowledge from a pre-trained image +generative model and semantic information from input images. The former +provides strong knowledge guidance for this under-constraint task, while the +latter provides the necessary information to mitigate the training data bias +from the pre-trained model. Our framework can be used for various tasks such as +correspondence matching, pose estimation, and image editing, achieving strong +results on real-world image datasets under challenging illumination conditions +and on in-the-wild online image collections. + +
+
+ comment: Project page: + https://ai.stanford.edu/~yzzhang/projects/3d-congealing/ +
+
+
+
+
+ + ☆ Pre-trained Vision and Language Transformers Are Few-Shot Incremental + Learners CVPR 2024 + + +
+ Few-Shot Class Incremental Learning (FSCIL) is a task that requires a model +to learn new classes incrementally without forgetting when only a few samples +for each class are given. FSCIL encounters two significant challenges: +catastrophic forgetting and overfitting, and these challenges have driven prior +studies to primarily rely on shallow models, such as ResNet-18. Even though +their limited capacity can mitigate both forgetting and overfitting issues, it +leads to inadequate knowledge transfer during few-shot incremental sessions. In +this paper, we argue that large models such as vision and language transformers +pre-trained on large datasets can be excellent few-shot incremental learners. +To this end, we propose a novel FSCIL framework called PriViLege, Pre-trained +Vision and Language transformers with prompting functions and knowledge +distillation. Our framework effectively addresses the challenges of +catastrophic forgetting and overfitting in large models through new pre-trained +knowledge tuning (PKT) and two losses: entropy-based divergence loss and +semantic knowledge distillation loss. Experimental results show that the +proposed PriViLege significantly outperforms the existing state-of-the-art +methods with a large margin, e.g., +9.38% in CUB200, +20.58% in CIFAR-100, and ++13.36% in miniImageNet. Our implementation code is available at +https://github.com/KHU-AGI/PriViLege. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ ImageNot: A contrast with ImageNet preserves model rankings + + +
+ We introduce ImageNot, a dataset designed to match the scale of ImageNet +while differing drastically in other aspects. We show that key model +architectures developed for ImageNet over the years rank identically when +trained and evaluated on ImageNot to how they rank on ImageNet. This is true +when training models from scratch or fine-tuning them. Moreover, the relative +improvements of each model over earlier models strongly correlate in both +datasets. We further give evidence that ImageNot has a similar utility as +ImageNet for transfer learning purposes. Our work demonstrates a surprising +degree of external validity in the relative performance of image classification +models. This stands in contrast with absolute accuracy numbers that typically +drop sharply even under small changes to a dataset. + +
+
+
+
+
+ + ☆ Neural Ordinary Differential Equation based Sequential Image + Registration for Dynamic Characterization CVPR 2022 + + +
+ Deformable image registration (DIR) is crucial in medical image analysis, +enabling the exploration of biological dynamics such as organ motions and +longitudinal changes in imaging. Leveraging Neural Ordinary Differential +Equations (ODE) for registration, this extension work discusses how this +framework can aid in the characterization of sequential biological processes. +Utilizing the Neural ODE's ability to model state derivatives with neural +networks, our Neural Ordinary Differential Equation Optimization-based (NODEO) +framework considers voxels as particles within a dynamic system, defining +deformation fields through the integration of neural differential equations. +This method learns dynamics directly from data, bypassing the need for physical +priors, making it exceptionally suitable for medical scenarios where such +priors are unavailable or inapplicable. Consequently, the framework can discern +underlying dynamics and use sequence data to regularize the transformation +trajectory. We evaluated our framework on two clinical datasets: one for +cardiac motion tracking and another for longitudinal brain MRI analysis. +Demonstrating its efficacy in both 2D and 3D imaging scenarios, our framework +offers flexibility and model agnosticism, capable of managing image sequences +and facilitating label propagation throughout these sequences. This study +provides a comprehensive understanding of how the Neural ODE-based framework +uniquely benefits the image registration challenge. + +
+
+ comment: Journal extension of NODEO: A Neural Ordinary Differential Equation + Based Optimization Framework for Deformable Image Registration, CVPR 2022 +
+
+
+
+
+ + ☆ CameraCtrl: Enabling Camera Control for Text-to-Video Generation + + +
+ Controllability plays a crucial role in video generation since it allows +users to create desired content. However, existing models largely overlooked +the precise control of camera pose that serves as a cinematic language to +express deeper narrative nuances. To alleviate this issue, we introduce +CameraCtrl, enabling accurate camera pose control for text-to-video(T2V) +models. After precisely parameterizing the camera trajectory, a plug-and-play +camera module is then trained on a T2V model, leaving others untouched. +Additionally, a comprehensive study on the effect of various datasets is also +conducted, suggesting that videos with diverse camera distribution and similar +appearances indeed enhance controllability and generalization. Experimental +results demonstrate the effectiveness of CameraCtrl in achieving precise and +domain-adaptive camera control, marking a step forward in the pursuit of +dynamic and customized video storytelling from textual and camera pose inputs. +Our project website is at: https://hehao13.github.io/projects-CameraCtrl/. + +
+
+ comment: Project page: https://hehao13.github.io/projects-CameraCtrl/ Code: + https://github.com/hehao13/CameraCtrl +
+
+
+
+
+ + ☆ BRAVEn: Improving Self-Supervised Pre-training for Visual and Auditory + Speech Recognition ICASSP 2024 + + +
+ Self-supervision has recently shown great promise for learning visual and +auditory speech representations from unlabelled data. In this work, we propose +BRAVEn, an extension to the recent RAVEn method, which learns speech +representations entirely from raw audio-visual data. Our modifications to RAVEn +enable BRAVEn to achieve state-of-the-art results among self-supervised methods +in various settings. Moreover, we observe favourable scaling behaviour by +increasing the amount of unlabelled data well beyond other self-supervised +works. In particular, we achieve 20.0% / 1.7% word error rate for VSR / ASR on +the LRS3 test set, with only 30 hours of labelled data and no external ASR +models. Our results suggest that readily available unlabelled audio-visual data +can largely replace costly transcribed data. + +
+
+ comment: ICASSP 2024. Code: https://github.com/ahaliassos/raven +
+
+
+
+
+ + ☆ Adaptive Feature Fusion Neural Network for Glaucoma Segmentation on + Unseen Fundus Images + + +
+ Fundus image segmentation on unseen domains is challenging, especially for
+over-parameterized deep models trained on small medical datasets. To address
+this challenge, we propose a method named Adaptive Feature-fusion Neural
+Network (AFNN) for glaucoma segmentation on unseen domains, which mainly
+consists of three modules: domain adaptor, feature-fusion network, and
+self-supervised multi-task learning. Specifically, the domain adaptor helps the
+pre-trained model adapt quickly from other image domains to the medical fundus
+image domain. The feature-fusion network and self-supervised multi-task
+learning for the encoder and decoder are introduced to improve domain
+generalization ability. In addition, we design a weighted dice loss to improve
+model performance on complex optic-cup segmentation tasks. Our proposed method
+achieves competitive performance compared with existing fundus segmentation
+methods on four public glaucoma datasets.
+
+
+ comment: 17 pages, 11 figures +
+
+
+
+
+ + ☆ WcDT: World-centric Diffusion Transformer for Traffic Scene Generation + + +
+ In this paper, we introduce a novel approach for autonomous driving +trajectory generation by harnessing the complementary strengths of diffusion +probabilistic models (a.k.a., diffusion models) and transformers. Our proposed +framework, termed the "World-Centric Diffusion Transformer" (WcDT), optimizes +the entire trajectory generation process, from feature extraction to model +inference. To enhance the scene diversity and stochasticity, the historical +trajectory data is first preprocessed and encoded into latent space using +Denoising Diffusion Probabilistic Models (DDPM) enhanced with Diffusion with +Transformer (DiT) blocks. Then, the latent features, historical trajectories, +HD map features, and historical traffic signal information are fused with +various transformer-based encoders. The encoded traffic scenes are then decoded +by a trajectory decoder to generate multimodal future trajectories. +Comprehensive experimental results show that the proposed approach exhibits +superior performance in generating both realistic and diverse trajectories, +showing its potential for integration into automatic driving simulation +systems. + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ EGTR: Extracting Graph from Transformer for Scene Graph Generation CVPR 2024 + + +
+ Scene Graph Generation (SGG) is a challenging task of detecting objects and +predicting relationships between objects. After DETR was developed, one-stage +SGG models based on a one-stage object detector have been actively studied. +However, complex modeling is used to predict the relationship between objects, +and the inherent relationship between object queries learned in the multi-head +self-attention of the object detector has been neglected. We propose a +lightweight one-stage SGG model that extracts the relation graph from the +various relationships learned in the multi-head self-attention layers of the +DETR decoder. By fully utilizing the self-attention by-products, the relation +graph can be extracted effectively with a shallow relation extraction head. +Considering the dependency of the relation extraction task on the object +detection task, we propose a novel relation smoothing technique that adjusts +the relation label adaptively according to the quality of the detected objects. +By the relation smoothing, the model is trained according to the continuous +curriculum that focuses on object detection task at the beginning of training +and performs multi-task learning as the object detection performance gradually +improves. Furthermore, we propose a connectivity prediction task that predicts +whether a relation exists between object pairs as an auxiliary task of the +relation extraction. We demonstrate the effectiveness and efficiency of our +method for the Visual Genome and Open Image V6 datasets. Our code is publicly +available at https://github.com/naver-ai/egtr . + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Red-Teaming Segment Anything Model CVPR 2024 + + +
+ Foundation models have emerged as pivotal tools, tackling many complex tasks +through pre-training on vast datasets and subsequent fine-tuning for specific +applications. The Segment Anything Model is one of the first and most +well-known foundation models for computer vision segmentation tasks. This work +presents a multi-faceted red-teaming analysis that tests the Segment Anything +Model against challenging tasks: (1) We analyze the impact of style transfer on +segmentation masks, demonstrating that applying adverse weather conditions and +raindrops to dashboard images of city roads significantly distorts generated +masks. (2) We focus on assessing whether the model can be used for attacks on +privacy, such as recognizing celebrities' faces, and show that the model +possesses some undesired knowledge in this task. (3) Finally, we check how +robust the model is to adversarial attacks on segmentation masks under text +prompts. We not only show the effectiveness of popular white-box attacks and +resistance to black-box attacks but also introduce a novel approach - Focused +Iterative Gradient Attack (FIGA) that combines white-box approaches to +construct an efficient attack resulting in a smaller number of modified pixels. +All of our testing methods and analyses indicate a need for enhanced safety +measures in foundation models for image segmentation. + +
+
+ comment: CVPR 2024 - The 4th Workshop of Adversarial Machine Learning on + Computer Vision: Robustness of Foundation Models +
+
+
+
+
+ + ☆ Multi-Level Label Correction by Distilling Proximate Patterns for + Semi-supervised Semantic Segmentation + + +
+ Semi-supervised semantic segmentation relieves the reliance on large-scale +labeled data by leveraging unlabeled data. Recent semi-supervised semantic +segmentation approaches mainly resort to pseudo-labeling methods to exploit +unlabeled data. However, unreliable pseudo-labeling can undermine the +semi-supervision processes. In this paper, we propose an algorithm called +Multi-Level Label Correction (MLLC), which aims to use graph neural networks to +capture structural relationships in Semantic-Level Graphs (SLGs) and +Class-Level Graphs (CLGs) to rectify erroneous pseudo-labels. Specifically, +SLGs represent semantic affinities between pairs of pixel features, and CLGs +describe classification consistencies between pairs of pixel labels. With the +support of proximate pattern information from graphs, MLLC can rectify +incorrectly predicted pseudo-labels and can facilitate discriminative feature +representations. We design an end-to-end network to train and perform this +effective label corrections mechanism. Experiments demonstrate that MLLC can +significantly improve supervised baselines and outperforms state-of-the-art +approaches in different scenarios on Cityscapes and PASCAL VOC 2012 datasets. +Specifically, MLLC improves the supervised baseline by at least 5% and 2% with +DeepLabV2 and DeepLabV3+ respectively under different partition protocols. + +
+
+ comment: 12 pages, 8 figures. IEEE Transactions on Multimedia, 2024 +
+
+
+
+
+ + ☆ IISAN: Efficiently Adapting Multimodal Representation for Sequential + Recommendation with Decoupled PEFT SIGIR2024 + + +
+ Multimodal foundation models are transformative in sequential recommender +systems, leveraging powerful representation learning capabilities. While +Parameter-efficient Fine-tuning (PEFT) is commonly used to adapt foundation +models for recommendation tasks, most research prioritizes parameter +efficiency, often overlooking critical factors like GPU memory efficiency and +training speed. Addressing this gap, our paper introduces IISAN (Intra- and +Inter-modal Side Adapted Network for Multimodal Representation), a simple +plug-and-play architecture using a Decoupled PEFT structure and exploiting both +intra- and inter-modal adaptation. + IISAN matches the performance of full fine-tuning (FFT) and state-of-the-art +PEFT. More importantly, it significantly reduces GPU memory usage - from 47GB +to just 3GB for multimodal sequential recommendation tasks. Additionally, it +accelerates training time per epoch from 443s to 22s compared to FFT. This is +also a notable improvement over the Adapter and LoRA, which require 37-39 GB +GPU memory and 350-380 seconds per epoch for training. + Furthermore, we propose a new composite efficiency metric, TPME +(Training-time, Parameter, and GPU Memory Efficiency) to alleviate the +prevalent misconception that "parameter efficiency represents overall +efficiency". TPME provides more comprehensive insights into practical +efficiency comparisons between different methods. Besides, we give an +accessible efficiency analysis of all PEFT and FFT approaches, which +demonstrate the superiority of IISAN. We release our codes and other materials +at https://github.com/jjGenAILab/IISAN. + +
+
+ comment: Accepted by SIGIR2024 +
+
+
+
+
+ + ☆ Causality-based Transfer of Driving Scenarios to Unseen Intersections + + +
+ Scenario-based testing of automated driving functions has become a promising +method to reduce time and cost compared to real-world testing. In +scenario-based testing automated functions are evaluated in a set of +pre-defined scenarios. These scenarios provide information about vehicle +behaviors, environmental conditions, or road characteristics using parameters. +To create realistic scenarios, parameters and parameter dependencies have to be +fitted utilizing real-world data. However, due to the large variety of +intersections and movement constellations found in reality, data may not be +available for certain scenarios. This paper proposes a methodology to +systematically analyze relations between parameters of scenarios. Bayesian +networks are utilized to analyze causal dependencies in order to decrease the +amount of required data and to transfer causal patterns creating unseen +scenarios. Thereby, infrastructural influences on movement patterns are +investigated to generate realistic scenarios on unobserved intersections. For +evaluation, scenarios and underlying parameters are extracted from the inD +dataset. Movement patterns are estimated, transferred and checked against +recorded data from those initially unseen intersections. + +
+
+ comment: 6 pages, 8 figures, 1 table, Accepted to be published as part of the + 35th IEEE Intelligent Vehicles Symposium, June 2 - 5, 2024, Korea +
+
+
+
+
+ + ☆ SelfPose3d: Self-Supervised Multi-Person Multi-View 3d Pose Estimation CVPR 2024 + + +
+ We present a new self-supervised approach, SelfPose3d, for estimating 3d +poses of multiple persons from multiple camera views. Unlike current +state-of-the-art fully-supervised methods, our approach does not require any 2d +or 3d ground-truth poses and uses only the multi-view input images from a +calibrated camera setup and 2d pseudo poses generated from an off-the-shelf 2d +human pose estimator. We propose two self-supervised learning objectives: +self-supervised person localization in 3d space and self-supervised 3d pose +estimation. We achieve self-supervised 3d person localization by training the +model on synthetically generated 3d points, serving as 3d person root +positions, and on the projected root-heatmaps in all the views. We then model +the 3d poses of all the localized persons with a bottleneck representation, map +them onto all views obtaining 2d joints, and render them using 2d Gaussian +heatmaps in an end-to-end differentiable manner. Afterwards, we use the +corresponding 2d joints and heatmaps from the pseudo 2d poses for learning. To +alleviate the intrinsic inaccuracy of the pseudo labels, we propose an adaptive +supervision attention mechanism to guide the self-supervision. Our experiments +and analysis on three public benchmark datasets, including Panoptic, Shelf, and +Campus, show the effectiveness of our approach, which is comparable to +fully-supervised methods. Code is available at +\url{https://github.com/CAMMA-public/SelfPose3D} + +
+
+ comment: Accepted for CVPR 2024 +
+
+
+
+
+ + ☆ Specularity Factorization for Low-Light Enhancement CVPR 2024 + + +
+ We present a new additive image factorization technique that treats images to +be composed of multiple latent specular components which can be simply +estimated recursively by modulating the sparsity during decomposition. Our +model-driven {\em RSFNet} estimates these factors by unrolling the optimization +into network layers requiring only a few scalars to be learned. The resultant +factors are interpretable by design and can be fused for different image +enhancement tasks via a network or combined directly by the user in a +controllable fashion. Based on RSFNet, we detail a zero-reference Low Light +Enhancement (LLE) application trained without paired or unpaired supervision. +Our system improves the state-of-the-art performance on standard benchmarks and +achieves better generalization on multiple other datasets. We also integrate +our factors with other task specific fusion networks for applications like +deraining, deblurring and dehazing with negligible overhead thereby +highlighting the multi-domain and multi-task generalizability of our proposed +RSFNet. The code and data is released for reproducibility on the project +homepage. + +
+
+ comment: CVPR 2024, Pages: 8(main)+4(references)+17(supp) = 29 +
+
+
+
+
+ + ☆ A discussion about violin reduction: geometric analysis of contour lines + and channel of minima + + +
+ Some early violins have been reduced during their history to fit imposed +morphological standards, while more recent ones have been built directly to +these standards. We can observe differences between reduced and unreduced +instruments, particularly in their contour lines and channel of minima. In a +recent preliminary work, we computed and highlighted those two features for two +instruments using triangular 3D meshes acquired by photogrammetry, whose +fidelity has been assessed and validated with sub-millimetre accuracy. We +propose here an extension to a corpus of 38 violins, violas and cellos, and +introduce improved procedures, leading to a stronger discussion of the +geometric analysis. We first recall the material we are working with. We then +discuss how to derive the best reference plane for the violin alignment, which +is crucial for the computation of contour lines and channel of minima. Finally, +we show how to compute efficiently both characteristics and we illustrate our +results with a few examples. + +
+
+ comment: Paper accepted (before reviewing) for the Florence Heri-Tech 2024 + Conference +
+
+
+
+
+ + ☆ DELAN: Dual-Level Alignment for Vision-and-Language Navigation by + Cross-Modal Contrastive Learning LREC + + +
+ Vision-and-Language navigation (VLN) requires an agent to navigate in unseen +environment by following natural language instruction. For task completion, the +agent needs to align and integrate various navigation modalities, including +instruction, observation and navigation history. Existing works primarily +concentrate on cross-modal attention at the fusion stage to achieve this +objective. Nevertheless, modality features generated by disparate uni-encoders +reside in their own spaces, leading to a decline in the quality of cross-modal +fusion and decision. To address this problem, we propose a Dual-levEL AligNment +(DELAN) framework by cross-modal contrastive learning. This framework is +designed to align various navigation-related modalities before fusion, thereby +enhancing cross-modal interaction and action decision-making. Specifically, we +divide the pre-fusion alignment into dual levels: instruction-history level and +landmark-observation level according to their semantic correlations. We also +reconstruct a dual-level instruction for adaptation to the dual-level +alignment. As the training signals for pre-fusion alignment are extremely +limited, self-supervised contrastive learning strategies are employed to +enforce the matching between different modalities. Our approach seamlessly +integrates with the majority of existing models, resulting in improved +navigation performance on various VLN benchmarks, including R2R, R4R, RxR and +CVDN. + +
+
+ comment: Accepted by LREC-COLING 2024 +
+
+
+
+
+ + ☆ Cooperative Students: Navigating Unsupervised Domain Adaptation in + Nighttime Object Detection + + +
+ Unsupervised Domain Adaptation (UDA) has shown significant advancements in +object detection under well-lit conditions; however, its performance degrades +notably in low-visibility scenarios, especially at night, posing challenges not +only for its adaptability in low signal-to-noise ratio (SNR) conditions but +also for the reliability and efficiency of automated vehicles. To address this +problem, we propose a \textbf{Co}operative \textbf{S}tudents (\textbf{CoS}) +framework that innovatively employs global-local transformations (GLT) and a +proxy-based target consistency (PTC) mechanism to capture the spatial +consistency in day- and night-time scenarios effectively, and thus bridge the +significant domain shift across contexts. Building upon this, we further devise +an adaptive IoU-informed thresholding (AIT) module to gradually avoid +overlooking potential true positives and enrich the latent information in the +target domain. Comprehensive experiments show that CoS essentially enhanced UDA +performance in low-visibility conditions and surpasses current state-of-the-art +techniques, achieving an increase in mAP of 3.0\%, 1.9\%, and 2.5\% on BDD100K, +SHIFT, and ACDC datasets, respectively. Code is available at +https://github.com/jichengyuan/Cooperitive_Students. + +
+
+ comment: Code is available at + https://github.com/jichengyuan/Cooperitive_Students +
+
+
+
+
+ + ☆ Fashion Style Editing with Generative Human Prior + + +
+ Image editing has been a long-standing challenge in the research community +with its far-reaching impact on numerous applications. Recently, text-driven +methods started to deliver promising results in domains like human faces, but +their applications to more complex domains have been relatively limited. In +this work, we explore the task of fashion style editing, where we aim to +manipulate the fashion style of human imagery using text descriptions. +Specifically, we leverage a generative human prior and achieve fashion style +editing by navigating its learned latent space. We first verify that the +existing text-driven editing methods fall short for our problem due to their +overly simplified guidance signal, and propose two directions to reinforce the +guidance: textual augmentation and visual referencing. Combined with our +empirical findings on the latent space structure, our Fashion Style Editing +framework (FaSE) successfully projects abstract fashion concepts onto human +images and introduces exciting new applications to the field. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Joint-Task Regularization for Partially Labeled Multi-Task Learning CVPR 2024 + + +
+ Multi-task learning has become increasingly popular in the machine learning +field, but its practicality is hindered by the need for large, labeled +datasets. Most multi-task learning methods depend on fully labeled datasets +wherein each input example is accompanied by ground-truth labels for all target +tasks. Unfortunately, curating such datasets can be prohibitively expensive and +impractical, especially for dense prediction tasks which require per-pixel +labels for each image. With this in mind, we propose Joint-Task Regularization +(JTR), an intuitive technique which leverages cross-task relations to +simultaneously regularize all tasks in a single joint-task latent space to +improve learning when data is not fully labeled for all tasks. JTR stands out +from existing approaches in that it regularizes all tasks jointly rather than +separately in pairs -- therefore, it achieves linear complexity relative to the +number of tasks while previous methods scale quadratically. To demonstrate the +validity of our approach, we extensively benchmark our method across a wide +variety of partially labeled scenarios based on NYU-v2, Cityscapes, and +Taskonomy. + +
+
+ comment: Accepted paper to CVPR 2024 (main conference) +
+
+
+
+
+ + ☆ CAM-Based Methods Can See through Walls + + +
+ CAM-based methods are widely used post-hoc interpretability methods that
+produce a saliency map to explain the decision of an image classification
+model. The saliency map highlights the important areas of the image relevant to
+the prediction. In this paper, we show that most of these methods can
+incorrectly attribute an importance score to parts of the image that the model
+cannot see. We show that this phenomenon occurs both theoretically and
+experimentally. On the theory side, we analyze the behavior of GradCAM on a
+simple masked CNN model at initialization. Experimentally, we train a VGG-like
+model constrained not to use the lower part of the image and nevertheless
+observe positive scores in the unseen part of the image. This behavior is
+evaluated quantitatively on two new datasets. We believe that this is
+problematic, potentially leading to misinterpretation of the model's behavior.
+
+
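+ For reference, the Grad-CAM computation used in such analyses can be sketched
+as follows; the untrained ResNet backbone and target layer are illustrative
+stand-ins for the paper's VGG-like setup, and the final line simply inspects
+how much saliency lands in the lower half of the image.
+
+# Minimal Grad-CAM sketch (illustrative; not the paper's exact setup).
+import torch
+import torch.nn.functional as F
+from torchvision.models import resnet18
+
+model = resnet18(weights=None).eval()
+feats, grads = {}, {}
+
+def fwd_hook(m, i, o): feats["a"] = o
+def bwd_hook(m, gi, go): grads["a"] = go[0]
+
+layer = model.layer4                       # last conv block
+layer.register_forward_hook(fwd_hook)
+layer.register_full_backward_hook(bwd_hook)
+
+x = torch.randn(1, 3, 224, 224)
+score = model(x)[0].max()                  # logit of the predicted class
+score.backward()
+
+w = grads["a"].mean(dim=(2, 3), keepdim=True)                 # channel weights
+cam = F.relu((w * feats["a"]).sum(dim=1, keepdim=True))
+cam = F.interpolate(cam, size=x.shape[-2:], mode="bilinear", align_corners=False)
+cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)
+print("mean CAM in lower half:", cam[..., 112:, :].mean().item())
+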
+
+ comment: 25 pages, 9 figures +
+
+
+
+
+ + ☆ Bi-LORA: A Vision-Language Approach for Synthetic Image Detection + + +
+ Advancements in deep image synthesis techniques, such as generative +adversarial networks (GANs) and diffusion models (DMs), have ushered in an era +of generating highly realistic images. While this technological progress has +captured significant interest, it has also raised concerns about the potential +difficulty in distinguishing real images from their synthetic counterparts. +This paper takes inspiration from the potent convergence capabilities between +vision and language, coupled with the zero-shot nature of vision-language +models (VLMs). We introduce an innovative method called Bi-LORA that leverages +VLMs, combined with low-rank adaptation (LORA) tuning techniques, to enhance +the precision of synthetic image detection for unseen model-generated images. +The pivotal conceptual shift in our methodology revolves around reframing +binary classification as an image captioning task, leveraging the distinctive +capabilities of cutting-edge VLM, notably bootstrapping language image +pre-training (BLIP2). Rigorous and comprehensive experiments are conducted to +validate the effectiveness of our proposed approach, particularly in detecting +unseen diffusion-generated images from unknown diffusion-based generative +models during training, showcasing robustness to noise, and demonstrating +generalization capabilities to GANs. The obtained results showcase an +impressive average accuracy of 93.41% in synthetic image detection on unseen +generation models. The code and models associated with this research can be +publicly accessed at https://github.com/Mamadou-Keita/VLM-DETECT. + +
+
+
+
+
+ + ☆ Automatic Wood Pith Detector: Local Orientation Estimation and Robust + Accumulation ICPR 2024 + + +
+ A fully automated technique for wood pith detection (APD), relying on the +concentric shape of the structure of wood ring slices, is introduced. The +method estimates the ring's local orientations using the 2D structure tensor +and finds the pith position, optimizing a cost function designed for this +problem. We also present a variant (APD-PCL), using the parallel coordinates +space, that enhances the method's effectiveness when there are no clear tree +ring patterns. Furthermore, refining previous work by Kurdthongmee, a YoloV8 +net is trained for pith detection, producing a deep learning-based approach to +the same problem (APD-DL). All methods were tested on seven datasets, including +images captured under diverse conditions (controlled laboratory settings, +sawmill, and forest) and featuring various tree species (Pinus taeda, Douglas +fir, Abies alba, and Gleditsia triacanthos). All proposed approaches outperform +existing state-of-the-art methods and can be used in CPU-based real-time +applications. Additionally, we provide a novel dataset comprising images of +gymnosperm and angiosperm species. Dataset and source code are available at +http://github.com/hmarichal93/apd. + +
+
+ comment: 18 pages, presented to ICPR 2024 conference +
+
+
+
+
+ + ☆ Quantifying Noise of Dynamic Vision Sensor + + +
+ Dynamic vision sensors (DVS) are characterized by a large amount of
+background activity (BA) noise, which is mixed with the original (clean)
+sensor signal. The dynamic nature of the signal and the absence of ground truth
+in practical applications make it difficult to distinguish between noise and
+the clean sensor signal using standard image processing techniques. In this
+letter, a new technique derived from Detrended Fluctuation Analysis (DFA) is
+presented to characterise BA noise. The proposed technique can be used to
+address existing DVS issues, namely how to quantitatively characterise noise
+and signal without ground truth, and how to derive optimal denoising filter
+parameters. The solution of the latter problem is demonstrated on the popular
+real moving-car dataset.
+
+
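+ A generic order-1 DFA computation, applied here to a toy sequence of per-bin
+event counts, can be sketched as follows; the window scales and the
+event-count statistic are assumptions for illustration, not the letter's exact
+formulation.
+
+# Generic DFA sketch (illustrative); input is e.g. a sequence of per-bin DVS event counts.
+import numpy as np
+
+def dfa_exponent(x, scales=(16, 32, 64, 128, 256)):
+    y = np.cumsum(x - np.mean(x))                 # integrated (profile) series
+    fluct = []
+    for s in scales:
+        n_seg = len(y) // s
+        rms = []
+        for i in range(n_seg):
+            seg = y[i * s:(i + 1) * s]
+            t = np.arange(s)
+            coeffs = np.polyfit(t, seg, 1)        # local linear detrending
+            rms.append(np.sqrt(np.mean((seg - np.polyval(coeffs, t)) ** 2)))
+        fluct.append(np.mean(rms))
+    # Scaling exponent alpha = slope of log F(s) vs log s.
+    return np.polyfit(np.log(scales), np.log(fluct), 1)[0]
+
+events_per_bin = np.random.poisson(5.0, size=4096).astype(float)  # toy signal
+print("DFA exponent:", dfa_exponent(events_per_bin))
+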
+
+ comment: 5 pages, 4 figures, submitted to the IEEE Signal Processing Letters +
+
+
+
+
+ + ☆ Synthetic Data for Robust Stroke Segmentation + + +
+ Deep learning-based semantic segmentation in neuroimaging currently requires +high-resolution scans and extensive annotated datasets, posing significant +barriers to clinical applicability. We present a novel synthetic framework for +the task of lesion segmentation, extending the capabilities of the established +SynthSeg approach to accommodate large heterogeneous pathologies with +lesion-specific augmentation strategies. Our method trains deep learning +models, demonstrated here with the UNet architecture, using label maps derived +from healthy and stroke datasets, facilitating the segmentation of both healthy +tissue and pathological lesions without sequence-specific training data. +Evaluated against in-domain and out-of-domain (OOD) datasets, our framework +demonstrates robust performance, rivaling current methods within the training +domain and significantly outperforming them on OOD data. This contribution +holds promise for advancing medical imaging analysis in clinical settings, +especially for stroke pathology, by enabling reliable segmentation across +varied imaging sequences with reduced dependency on large annotated corpora. +Code and weights available at https://github.com/liamchalcroft/SynthStroke. + +
+
+
+
+
+ + ☆ Event-assisted Low-Light Video Object Segmentation CVPR 2024 + + +
+ In the realm of video object segmentation (VOS), the challenge of operating +under low-light conditions persists, resulting in notably degraded image +quality and compromised accuracy when comparing query and memory frames for +similarity computation. Event cameras, characterized by their high dynamic +range and ability to capture motion information of objects, offer promise in +enhancing object visibility and aiding VOS methods under such low-light +conditions. This paper introduces a pioneering framework tailored for low-light +VOS, leveraging event camera data to elevate segmentation accuracy. Our +approach hinges on two pivotal components: the Adaptive Cross-Modal Fusion +(ACMF) module, aimed at extracting pertinent features while fusing image and +event modalities to mitigate noise interference, and the Event-Guided Memory +Matching (EGMM) module, designed to rectify the issue of inaccurate matching +prevalent in low-light settings. Additionally, we present the creation of a +synthetic LLE-DAVIS dataset and the curation of a real-world LLE-VOS dataset, +encompassing frames and events. Experimental evaluations corroborate the +efficacy of our method across both datasets, affirming its effectiveness in +low-light scenarios. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Lookahead Exploration with Neural Radiance Representation for Continuous + Vision-Language Navigation CVPR 2024 + + +
+ Vision-and-language navigation (VLN) enables the agent to navigate to a +remote location following the natural language instruction in 3D environments. +At each navigation step, the agent selects from possible candidate locations +and then makes the move. For better navigation planning, the lookahead +exploration strategy aims to effectively evaluate the agent's next action by +accurately anticipating the future environment of candidate locations. To this +end, some existing works predict RGB images for future environments, while this +strategy suffers from image distortion and high computational cost. To address +these issues, we propose the pre-trained hierarchical neural radiance +representation model (HNR) to produce multi-level semantic features for future +environments, which are more robust and efficient than pixel-wise RGB +reconstruction. Furthermore, with the predicted future environmental +representations, our lookahead VLN model is able to construct the navigable +future path tree and select the optimal path via efficient parallel evaluation. +Extensive experiments on the VLN-CE datasets confirm the effectiveness of our +method. + +
+
+ comment: Accepted by CVPR 2024. The code is available at + https://github.com/MrZihan/HNR-VLN +
+
+
+
+
+ + ☆ LPSNet: End-to-End Human Pose and Shape Estimation with Lensless Imaging + + +
+ Human pose and shape (HPS) estimation with lensless imaging is not only
+beneficial to privacy protection but can also be used in covert surveillance
+scenarios due to the small size and simple structure of the device. However,
+this task presents significant challenges due to the inherent ambiguity of the
+captured measurements, and effective methods for directly estimating human pose
+and shape from lensless data are lacking. In this paper, we propose, to our
+knowledge, the first end-to-end framework to recover 3D human poses and shapes
+from lensless measurements. We specifically design a multi-scale lensless
+feature decoder to decode the lensless measurements through the optically
+encoded mask for efficient feature extraction. We also propose a double-head
+auxiliary supervision mechanism to improve the estimation accuracy of human
+limb ends. In addition, we establish a lensless imaging system and verify the
+effectiveness of our method on various datasets acquired by our lensless
+imaging system.
+
+
+
+
+
+
+ + ☆ PREGO: online mistake detection in PRocedural EGOcentric videos CVPR 2024 + + +
+ Promptly identifying procedural errors from egocentric videos in an online +setting is highly challenging and valuable for detecting mistakes as soon as +they happen. This capability has a wide range of applications across various +fields, such as manufacturing and healthcare. The nature of procedural mistakes +is open-set since novel types of failures might occur, which calls for +one-class classifiers trained on correctly executed procedures. However, no +technique can currently detect open-set procedural mistakes online. We propose +PREGO, the first online one-class classification model for mistake detection in +PRocedural EGOcentric videos. PREGO is based on an online action recognition +component to model the current action, and a symbolic reasoning module to +predict the next actions. Mistake detection is performed by comparing the +recognized current action with the expected future one. We evaluate PREGO on +two procedural egocentric video datasets, Assembly101 and Epic-tent, which we +adapt for online benchmarking of procedural mistake detection to establish +suitable benchmarks, thus defining the Assembly101-O and Epic-tent-O datasets, +respectively. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Towards Enhanced Analysis of Lung Cancer Lesions in EBUS-TBNA -- A + Semi-Supervised Video Object Detection Method + + +
+ This study aims to establish a computer-aided diagnostic system for lung
+lesions using bronchoscope endobronchial ultrasound (EBUS) to assist physicians
+in identifying lesion areas. During EBUS-transbronchial needle aspiration
+(EBUS-TBNA) procedures, physicians rely on grayscale ultrasound images to
+determine the location of lesions. However, these images often contain
+significant noise and can be influenced by surrounding tissues or blood
+vessels, making interpretation challenging. Previous research has lacked the
+application of object detection models to EBUS-TBNA, and there has been no
+well-defined solution for annotating the EBUS-TBNA dataset. In related studies
+on ultrasound images, although models have been successful in capturing target
+regions for their respective tasks, their training and predictions have been
+based on two-dimensional images, limiting their ability to leverage temporal
+features for improved predictions. This study introduces a three-dimensional
+image-based object detection model. It utilizes an attention mechanism to
+capture temporal correlations and implements a filtering mechanism to select
+relevant information from previous frames. Subsequently, a teacher-student
+model training approach is employed to further optimize the model by leveraging
+unlabeled data. To mitigate the impact of poor-quality pseudo-labels on the
+student model, we add a Gaussian Mixture Model (GMM) to ensure the quality of
+the pseudo-labels.
+
+
+
+
+
+
+ + ☆ Improving Bird's Eye View Semantic Segmentation by Task Decomposition CVPR 2024 + + +
+ Semantic segmentation in bird's eye view (BEV) plays a crucial role in
+autonomous driving. Previous methods usually follow an end-to-end pipeline,
+directly predicting the BEV segmentation map from monocular RGB inputs.
+However, a challenge arises because the RGB inputs and BEV targets come from
+distinct perspectives, making direct point-to-point prediction hard to
+optimize. In this paper, we decompose the original BEV segmentation task into
+two stages, namely BEV map reconstruction and RGB-BEV feature alignment. In the
+first stage, we train a BEV autoencoder to reconstruct the BEV segmentation
+maps given corrupted noisy latent representations, which urges the decoder to
+learn fundamental knowledge of typical BEV patterns. The second stage involves
+mapping RGB input images into the BEV latent space of the first stage, directly
+optimizing the correlations between the two views at the feature level. Our
+approach simplifies the complexity of combining perception and generation into
+distinct steps, equipping the model to handle intricate and challenging scenes
+effectively. Besides, we propose to transform the BEV segmentation map from the
+Cartesian to the polar coordinate system to establish the column-wise
+correspondence between RGB images and BEV maps. Moreover, our method requires
+neither multi-scale features nor camera intrinsic parameters for depth
+estimation and saves computational overhead. Extensive experiments on nuScenes
+and Argoverse show the effectiveness and efficiency of our method. Code is
+available at https://github.com/happytianhao/TaDe.
+
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Toward Efficient Visual Gyroscopes: Spherical Moments, Harmonics + Filtering, and Masking Techniques for Spherical Camera Applications IROS 2024 + + +
+ Unlike a traditional gyroscope, a visual gyroscope estimates camera rotation +through images. The integration of omnidirectional cameras, offering a larger +field of view compared to traditional RGB cameras, has proven to yield more +accurate and robust results. However, challenges arise in situations that lack +features, have substantial noise causing significant errors, and where certain +features in the images lack sufficient strength, leading to less precise +prediction results. + Here, we address these challenges by introducing a novel visual gyroscope, +which combines an analytical method with a neural network approach to provide a +more efficient and accurate rotation estimation from spherical images. The +presented method relies on three key contributions: an adapted analytical +approach to compute the spherical moments coefficients, introduction of masks +for better global feature representation, and the use of a multilayer +perceptron to adaptively choose the best combination of masks and filters. +Experimental results demonstrate superior performance of the proposed approach +in terms of accuracy. The paper emphasizes the advantages of integrating +machine learning to optimize analytical solutions, discusses limitations, and +suggests directions for future research. + +
+
+ comment: Submitted to 2024 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS 2024) +
+
+
+
+
+ + ☆ VLRM: Vision-Language Models act as Reward Models for Image Captioning + + +
+ In this work, we present an unsupervised method for enhancing an image
+captioning model (in our case, BLIP2) using reinforcement learning and
+vision-language models like CLIP and BLIP2-ITM as reward models. The RL-tuned
+model is able to generate longer and more comprehensive descriptions. Our model
+reaches an impressive 0.90 R@1 CLIP Recall score on the MS-COCO Karpathy test
+split.
+ Weights are available at
+https://huggingface.co/sashakunitsyn/vlrm-blip2-opt-2.7b.
+
+
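+ A CLIP-similarity reward for a generated caption can be sketched as below;
+the Hugging Face CLIP checkpoint is used purely for illustration, and the
+paper's reward models, weighting, and RL loop are not reproduced here.
+
+# Illustrative CLIP-based caption reward (not the authors' exact reward).
+import torch
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+def caption_reward(image: Image.Image, caption: str) -> float:
+    inputs = processor(text=[caption], images=image, return_tensors="pt", padding=True)
+    with torch.no_grad():
+        out = model(**inputs)
+    # Cosine similarity between image and text embeddings as a scalar reward.
+    img = out.image_embeds / out.image_embeds.norm(dim=-1, keepdim=True)
+    txt = out.text_embeds / out.text_embeds.norm(dim=-1, keepdim=True)
+    return float((img * txt).sum())
+
+print(caption_reward(Image.new("RGB", (224, 224)), "a person riding a bicycle"))
+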
+
+
+
+
+ + ☆ Minimize Quantization Output Error with Bias Compensation + + +
+ Quantization is a promising method that reduces the memory usage and
+computational intensity of Deep Neural Networks (DNNs), but it often leads to
+significant output errors that hinder model deployment. In this paper, we
+propose Bias Compensation (BC) to minimize the output error, thus realizing
+ultra-low-precision quantization without model fine-tuning. Instead of
+optimizing the non-convex quantization process as in most previous methods, the
+proposed BC bypasses this step and directly minimizes the quantization output
+error by identifying a bias vector for compensation. We establish that the
+minimization of output error through BC is a convex problem, and we provide an
+efficient strategy to obtain optimal solutions associated with minimal output
+error, without the need for training or fine-tuning. We conduct extensive
+experiments on Vision Transformer models and Large Language Models, and the
+results show that our method notably reduces the quantization output error,
+thereby permitting ultra-low-precision post-training quantization and enhancing
+the task performance of models. In particular, BC improves the accuracy of
+ViT-B with 4-bit PTQ4ViT by 36.89% on the ImageNet-1k task, and decreases the
+perplexity of OPT-350M with 3-bit GPTQ by 5.97 on WikiText2. The code is
+available at https://github.com/GongCheng1919/bias-compensation.
+
+
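+ A simplified sketch of the bias-compensation idea for a single linear layer
+follows; the symmetric fake quantizer, the random calibration data, and the
+plain mean-error bias are illustrative assumptions, and the paper's exact
+formulation may differ.
+
+# Simplified bias-compensation sketch: estimate a bias vector on calibration
+# data that minimizes the mean output error of a quantized linear layer.
+import torch
+
+def fake_quant(w, bits=4):
+    scale = w.abs().max() / (2 ** (bits - 1) - 1)
+    return torch.round(w / scale) * scale
+
+torch.manual_seed(0)
+W = torch.randn(256, 512)                  # full-precision weights
+Wq = fake_quant(W, bits=4)                 # low-bit weights
+X = torch.randn(1024, 512)                 # calibration activations
+
+err = X @ W.t() - X @ Wq.t()               # per-sample output error
+bias = err.mean(dim=0)                     # mean offset minimizes the squared output error
+
+print("output MSE before:", err.pow(2).mean().item(),
+      "after bias compensation:", (err - bias).pow(2).mean().item())
+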
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ ASTRA: An Action Spotting TRAnsformer for Soccer Videos + + +
+ In this paper, we introduce ASTRA, a Transformer-based model designed for the +task of Action Spotting in soccer matches. ASTRA addresses several challenges +inherent in the task and dataset, including the requirement for precise action +localization, the presence of a long-tail data distribution, non-visibility in +certain actions, and inherent label noise. To do so, ASTRA incorporates (a) a +Transformer encoder-decoder architecture to achieve the desired output temporal +resolution and to produce precise predictions, (b) a balanced mixup strategy to +handle the long-tail distribution of the data, (c) an uncertainty-aware +displacement head to capture the label variability, and (d) input audio signal +to enhance detection of non-visible actions. Results demonstrate the +effectiveness of ASTRA, achieving a tight Average-mAP of 66.82 on the test set. +Moreover, in the SoccerNet 2023 Action Spotting challenge, we secure the 3rd +position with an Average-mAP of 70.21 on the challenge set. + +
+
+
+
+
+ + ☆ RAVE: Residual Vector Embedding for CLIP-Guided Backlit Image + Enhancement + + +
+ In this paper we propose a novel modification of Contrastive Language-Image +Pre-Training (CLIP) guidance for the task of unsupervised backlit image +enhancement. Our work builds on the state-of-the-art CLIP-LIT approach, which +learns a prompt pair by constraining the text-image similarity between a prompt +(negative/positive sample) and a corresponding image (backlit image/well-lit +image) in the CLIP embedding space. Learned prompts then guide an image +enhancement network. Based on the CLIP-LIT framework, we propose two novel +methods for CLIP guidance. First, we show that instead of tuning prompts in the +space of text embeddings, it is possible to directly tune their embeddings in +the latent space without any loss in quality. This accelerates training and +potentially enables the use of additional encoders that do not have a text +encoder. Second, we propose a novel approach that does not require any prompt +tuning. Instead, based on CLIP embeddings of backlit and well-lit images from +training data, we compute the residual vector in the embedding space as a +simple difference between the mean embeddings of the well-lit and backlit +images. This vector then guides the enhancement network during training, +pushing a backlit image towards the space of well-lit images. This approach +further dramatically reduces training time, stabilizes training and produces +high quality enhanced images without artifacts, both in supervised and +unsupervised training regimes. Additionally, we show that residual vectors can +be interpreted, revealing biases in training data, and thereby enabling +potential bias correction. + +
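+ The residual-vector computation and one possible guidance loss can be
+sketched as follows; the frozen encoder, the cosine formulation, and the toy
+embeddings are assumptions for illustration rather than the authors' training
+code.
+
+# Illustrative residual-vector guidance: the difference between mean CLIP
+# embeddings of well-lit and backlit images defines a target direction.
+import torch
+import torch.nn.functional as F
+
+def residual_vector(well_lit_emb: torch.Tensor, backlit_emb: torch.Tensor) -> torch.Tensor:
+    # well_lit_emb, backlit_emb: (N, D) image embeddings from a frozen encoder.
+    v = well_lit_emb.mean(dim=0) - backlit_emb.mean(dim=0)
+    return F.normalize(v, dim=0)
+
+def guidance_loss(enhanced_emb, backlit_emb, v):
+    # Encourage the enhancement network to move embeddings along the residual direction.
+    delta = F.normalize(enhanced_emb - backlit_emb, dim=-1)
+    return (1.0 - (delta * v).sum(dim=-1)).mean()
+
+# Toy usage with random embeddings standing in for CLIP features.
+well, back = torch.randn(100, 512), torch.randn(100, 512)
+v = residual_vector(well, back)
+print(guidance_loss(back + 0.1 * v, back, v))
+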
+
+
+
+
+ + ☆ 3D Scene Generation from Scene Graphs and Self-Attention + + +
+ Synthesizing realistic and diverse indoor 3D scene layouts in a controllable
+fashion opens up applications in simulated navigation and virtual reality. As
+concise and robust representations of a scene, scene graphs have proven to be
+well-suited for semantic control over the generated layout. We present a
+variant of the conditional variational autoencoder (cVAE) model to synthesize
+3D scenes from scene graphs and floor plans. We exploit the properties of
+self-attention layers to capture high-level relationships between objects in a
+scene, and use these as the building blocks of our model. Our model leverages
+graph transformers to estimate the size, dimension and orientation of the
+objects in a room while satisfying relationships in the given scene graph. Our
+experiments show that self-attention layers lead to sparser (HOW MUCH) and more
+diverse scenes (HOW MUCH). As part of this work, we publish the first
+large-scale dataset for conditioned scene generation from scene graphs,
+containing over XXX rooms (with floor plans and scene graphs).
+
+
+
+
+
+
+ + ☆ Scene Adaptive Sparse Transformer for Event-based Object Detection + + +
+ While recent Transformer-based approaches have shown impressive performances +on event-based object detection tasks, their high computational costs still +diminish the low power consumption advantage of event cameras. Image-based +works attempt to reduce these costs by introducing sparse Transformers. +However, they display inadequate sparsity and adaptability when applied to +event-based object detection, since these approaches cannot balance the fine +granularity of token-level sparsification and the efficiency of window-based +Transformers, leading to reduced performance and efficiency. Furthermore, they +lack scene-specific sparsity optimization, resulting in information loss and a +lower recall rate. To overcome these limitations, we propose the Scene Adaptive +Sparse Transformer (SAST). SAST enables window-token co-sparsification, +significantly enhancing fault tolerance and reducing computational overhead. +Leveraging the innovative scoring and selection modules, along with the Masked +Sparse Window Self-Attention, SAST showcases remarkable scene-aware +adaptability: It focuses only on important objects and dynamically optimizes +sparsity level according to scene complexity, maintaining a remarkable balance +between performance and computational cost. The evaluation results show that +SAST outperforms all other dense and sparse networks in both performance and +efficiency on two large-scale event-based object detection datasets (1Mpx and +Gen1). Code: https://github.com/Peterande/SAST + +
+
+
+
+
+ + ☆ Real, fake and synthetic faces - does the coin have three sides? + + +
+ With the ever-growing power of generative artificial intelligence, deepfake
+and artificially generated (synthetic) media have continued to spread online,
+which creates various ethical and moral concerns regarding their usage. To
+tackle this, we thus present a novel exploration of the trends and patterns
+observed in real, deepfake and synthetic facial images. The proposed analysis
+is done in two parts: firstly, we incorporate eight deep learning models and
+analyze their performances in distinguishing between the three classes of
+images. Next, we look to further delve into the similarities and differences
+between these three sets of images by investigating their image properties both
+in the context of the entire image as well as in the context of specific
+regions within the image. An ANOVA test was also performed and provided further
+clarity on the patterns associated with the images of the three classes. From
+our findings, we observe that the investigated deep learning models found it
+easier to detect synthetic facial images, with the ViT Patch-16 model
+performing best on this task with a class-averaged sensitivity, specificity,
+precision, and accuracy of 97.37%, 98.69%, 97.48%, and 98.25%, respectively.
+This observation was supported by further analysis of various image properties.
+We saw noticeable differences across the three categories of images. This
+analysis can help us build better algorithms for facial image generation, and
+also shows that synthetic, deepfake and real face images are indeed three
+different classes.
+
+
+
+
+
+
+ + ☆ Co-Speech Gesture Video Generation via Motion-Decoupled Diffusion Model CVPR 2024 + + +
+ Co-speech gestures, if presented in the lively form of videos, can achieve
+superior visual effects in human-machine interaction. While previous works
+mostly generate structural human skeletons, resulting in the omission of
+appearance information, we focus on the direct generation of audio-driven
+co-speech gesture videos in this work. There are two main challenges: 1) A
+suitable motion feature is needed to describe complex human movements with
+crucial appearance information. 2) Gestures and speech exhibit inherent
+dependencies and should be temporally aligned, even for sequences of arbitrary
+length. To solve these problems, we present a novel motion-decoupled framework
+to generate co-speech gesture videos. Specifically, we first introduce a
+well-designed nonlinear TPS transformation to obtain latent motion features
+preserving essential appearance information. Then a transformer-based diffusion
+model is proposed to learn the temporal correlation between gestures and
+speech, and performs generation in the latent motion space, followed by an
+optimal motion selection module to produce long-term coherent and consistent
+gesture videos. For better visual perception, we further design a refinement
+network focusing on missing details of certain areas. Extensive experimental
+results show that our proposed framework significantly outperforms existing
+approaches in both motion and video-related evaluations. Our code, demos, and
+more resources are available at https://github.com/thuhcsi/S2G-MDDiffusion.
+
+
+
+ comment: 22 pages, 8 figures, CVPR 2024 +
+
+
+
+
+ + ☆ Pairwise Similarity Distribution Clustering for Noisy Label Learning + + +
+ Noisy label learning aims to train deep neural networks using a large amount +of samples with noisy labels, whose main challenge comes from how to deal with +the inaccurate supervision caused by wrong labels. Existing works either take +the label correction or sample selection paradigm to involve more samples with +accurate labels into the training process. In this paper, we propose a simple +yet effective sample selection algorithm, termed as Pairwise Similarity +Distribution Clustering~(PSDC), to divide the training samples into one clean +set and another noisy set, which can power any of the off-the-shelf +semi-supervised learning regimes to further train networks for different +downstream tasks. Specifically, we take the pairwise similarity between sample +pairs to represent the sample structure, and the Gaussian Mixture Model~(GMM) +to model the similarity distribution between sample pairs belonging to the same +noisy cluster, therefore each sample can be confidently divided into the clean +set or noisy set. Even under severe label noise rate, the resulting data +partition mechanism has been proved to be more robust in judging the label +confidence in both theory and practice. Experimental results on various +benchmark datasets, such as CIFAR-10, CIFAR-100 and Clothing1M, demonstrate +significant improvements over state-of-the-art methods. + +
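+ A minimal sketch of a two-component GMM split over a pairwise-similarity
+statistic is given below; the per-sample statistic (mean similarity to
+same-labelled samples) and the 0.5 threshold are illustrative assumptions, not
+necessarily the paper's exact choices.
+
+# Illustrative clean/noisy sample split with a 2-component Gaussian Mixture Model
+# fitted on a per-sample pairwise-similarity statistic.
+import numpy as np
+from sklearn.mixture import GaussianMixture
+
+def split_clean_noisy(features: np.ndarray, labels: np.ndarray, threshold: float = 0.5):
+    f = features / np.linalg.norm(features, axis=1, keepdims=True)
+    sim = f @ f.T                                        # pairwise cosine similarity
+    stat = np.array([sim[i, labels == labels[i]].mean() for i in range(len(labels))])
+    gmm = GaussianMixture(n_components=2, random_state=0).fit(stat.reshape(-1, 1))
+    clean_comp = int(np.argmax(gmm.means_.ravel()))      # higher-similarity component = clean
+    p_clean = gmm.predict_proba(stat.reshape(-1, 1))[:, clean_comp]
+    return p_clean > threshold                           # boolean mask of "clean" samples
+
+feats = np.random.randn(200, 64)
+labels = np.random.randint(0, 5, size=200)
+print(split_clean_noisy(feats, labels).sum(), "samples kept as clean")
+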
+
+
+
+
+ + ☆ Sketch3D: Style-Consistent Guidance for Sketch-to-3D Generation + + +
+ Recently, image-to-3D approaches have achieved significant results with a +natural image as input. However, it is not always possible to access these +enriched color input samples in practical applications, where only sketches are +available. Existing sketch-to-3D researches suffer from limitations in broad +applications due to the challenges of lacking color information and multi-view +content. To overcome them, this paper proposes a novel generation paradigm +Sketch3D to generate realistic 3D assets with shape aligned with the input +sketch and color matching the textual description. Concretely, Sketch3D first +instantiates the given sketch in the reference image through the +shape-preserving generation process. Second, the reference image is leveraged +to deduce a coarse 3D Gaussian prior, and multi-view style-consistent guidance +images are generated based on the renderings of the 3D Gaussians. Finally, +three strategies are designed to optimize 3D Gaussians, i.e., structural +optimization via a distribution transfer mechanism, color optimization with a +straightforward MSE loss and sketch similarity optimization with a CLIP-based +geometric similarity loss. Extensive visual comparisons and quantitative +analysis illustrate the advantage of our Sketch3D in generating realistic 3D +assets while preserving consistency with the input. + +
+
+
+
+
+ + ☆ Semi-Supervised Domain Adaptation for Wildfire Detection + + +
+ Recently, both the frequency and intensity of wildfires have increased
+worldwide, primarily due to climate change. In this paper, we propose a novel
+protocol for wildfire detection, leveraging semi-supervised Domain Adaptation
+for object detection, accompanied by a corresponding dataset designed for use
+by both academics and industries. Our dataset encompasses 30 times more diverse
+labeled scenes than the current largest benchmark wildfire dataset, HPWREN, and
+introduces a new labeling policy for wildfire detection. Inspired by CoordConv,
+we propose a robust baseline, Location-Aware Object Detection for
+Semi-Supervised Domain Adaptation (LADA), utilizing a teacher-student based
+framework capable of extracting translational variance features characteristic
+of wildfires. Using only 1% of the labeled target domain data, our framework
+significantly outperforms our source-only baseline by a notable margin of 3.8%
+in mean Average Precision on the HPWREN wildfire dataset. Our dataset is
+available at https://github.com/BloomBerry/LADA.
+
+
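+ For reference, a generic CoordConv-style layer (the building block that
+inspired the baseline) can be sketched as follows; this is a standard
+formulation, not the LADA implementation.
+
+# Generic CoordConv layer: appends normalized x/y coordinate channels before a
+# standard convolution so the detector can exploit location information.
+import torch
+import torch.nn as nn
+
+class CoordConv2d(nn.Module):
+    def __init__(self, in_ch, out_ch, **kwargs):
+        super().__init__()
+        self.conv = nn.Conv2d(in_ch + 2, out_ch, **kwargs)
+
+    def forward(self, x):
+        b, _, h, w = x.shape
+        ys = torch.linspace(-1, 1, h, device=x.device).view(1, 1, h, 1).expand(b, 1, h, w)
+        xs = torch.linspace(-1, 1, w, device=x.device).view(1, 1, 1, w).expand(b, 1, h, w)
+        return self.conv(torch.cat([x, ys, xs], dim=1))
+
+layer = CoordConv2d(3, 16, kernel_size=3, padding=1)
+print(layer(torch.randn(2, 3, 64, 64)).shape)   # torch.Size([2, 16, 64, 64])
+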
+
+ comment: 16 pages, 5 figures, 22 tables +
+
+
+
+
+ + ☆ Sparse Semi-DETR: Sparse Learnable Queries for Semi-Supervised Object + Detection CVPR2024 + + +
+ In this paper, we address the limitations of the DETR-based semi-supervised +object detection (SSOD) framework, particularly focusing on the challenges +posed by the quality of object queries. In DETR-based SSOD, the one-to-one +assignment strategy provides inaccurate pseudo-labels, while the one-to-many +assignments strategy leads to overlapping predictions. These issues compromise +training efficiency and degrade model performance, especially in detecting +small or occluded objects. We introduce Sparse Semi-DETR, a novel +transformer-based, end-to-end semi-supervised object detection solution to +overcome these challenges. Sparse Semi-DETR incorporates a Query Refinement +Module to enhance the quality of object queries, significantly improving +detection capabilities for small and partially obscured objects. Additionally, +we integrate a Reliable Pseudo-Label Filtering Module that selectively filters +high-quality pseudo-labels, thereby enhancing detection accuracy and +consistency. On the MS-COCO and Pascal VOC object detection benchmarks, Sparse +Semi-DETR achieves a significant improvement over current state-of-the-art +methods that highlight Sparse Semi-DETR's effectiveness in semi-supervised +object detection, particularly in challenging scenarios involving small or +partially obscured objects. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ Rethinking Annotator Simulation: Realistic Evaluation of Whole-Body PET + Lesion Interactive Segmentation Methods + + +
+ Interactive segmentation plays a crucial role in accelerating the annotation, +particularly in domains requiring specialized expertise such as nuclear +medicine. For example, annotating lesions in whole-body Positron Emission +Tomography (PET) images can require over an hour per volume. While previous +works evaluate interactive segmentation models through either real user studies +or simulated annotators, both approaches present challenges. Real user studies +are expensive and often limited in scale, while simulated annotators, also +known as robot users, tend to overestimate model performance due to their +idealized nature. To address these limitations, we introduce four evaluation +metrics that quantify the user shift between real and simulated annotators. In +an initial user study involving four annotators, we assess existing robot users +using our proposed metrics and find that robot users significantly deviate in +performance and annotation behavior compared to real annotators. Based on these +findings, we propose a more realistic robot user that reduces the user shift by +incorporating human factors such as click variation and inter-annotator +disagreement. We validate our robot user in a second user study, involving four +other annotators, and show it consistently reduces the simulated-to-real user +shift compared to traditional robot users. By employing our robot user, we can +conduct more large-scale and cost-efficient evaluations of interactive +segmentation models, while preserving the fidelity of real user studies. Our +implementation is based on MONAI Label and will be made publicly available. + +
+
+ comment: 10 pages, 5 figures, 1 table +
+
+
+
+
+ + ☆ Surface Reconstruction from Gaussian Splatting via Novel Stereo Views + + +
+ The Gaussian splatting for radiance field rendering method has recently +emerged as an efficient approach for accurate scene representation. It +optimizes the location, size, color, and shape of a cloud of 3D Gaussian +elements to visually match, after projection, or splatting, a set of given +images taken from various viewing directions. And yet, despite the proximity of +Gaussian elements to the shape boundaries, direct surface reconstruction of +objects in the scene is a challenge. + We propose a novel approach for surface reconstruction from Gaussian +splatting models. Rather than relying on the Gaussian elements' locations as a +prior for surface reconstruction, we leverage the superior novel-view synthesis +capabilities of 3DGS. To that end, we use the Gaussian splatting model to +render pairs of stereo-calibrated novel views from which we extract depth +profiles using a stereo matching method. We then combine the extracted RGB-D +images into a geometrically consistent surface. The resulting reconstruction is +more accurate and shows finer details when compared to other methods for +surface reconstruction from Gaussian splatting models, while requiring +significantly less compute time compared to other surface reconstruction +methods. + We performed extensive testing of the proposed method on in-the-wild scenes, +taken by a smartphone, showcasing its superior reconstruction abilities. +Additionally, we tested the proposed method on the Tanks and Temples benchmark, +and it has surpassed the current leading method for surface reconstruction from +Gaussian splatting models. Project page: https://gs2mesh.github.io/. + +
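+ The depth-from-rendered-stereo step can be sketched with a standard
+semi-global matcher; the OpenCV SGBM parameters and the pinhole depth
+conversion below are illustrative assumptions, not the paper's exact pipeline.
+
+# Illustrative stereo step: given a rectified stereo pair rendered from a
+# Gaussian-splatting model, recover disparity and convert it to depth.
+import numpy as np
+import cv2
+
+def depth_from_stereo(left_gray: np.ndarray, right_gray: np.ndarray,
+                      focal_px: float, baseline_m: float) -> np.ndarray:
+    sgbm = cv2.StereoSGBM_create(minDisparity=0, numDisparities=128, blockSize=5)
+    disp = sgbm.compute(left_gray, right_gray).astype(np.float32) / 16.0  # SGBM is fixed-point
+    disp[disp <= 0] = np.nan                     # invalid matches
+    return focal_px * baseline_m / disp          # depth = f * B / disparity
+
+left = np.random.randint(0, 255, (480, 640), np.uint8)    # stand-ins for rendered views
+right = np.random.randint(0, 255, (480, 640), np.uint8)
+depth = depth_from_stereo(left, right, focal_px=500.0, baseline_m=0.1)
+print(depth.shape)
+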
+
+ comment: Project Page: https://gs2mesh.github.io/ +
+
+
+
+
+ + ☆ EventSleep: Sleep Activity Recognition with Event Cameras + + +
+ Event cameras are a promising technology for activity recognition in dark
+environments due to their unique properties. However, real event camera
+datasets under low-lighting conditions are still scarce, which also limits the
+number of approaches to solve these kinds of problems, hindering the potential
+of this technology in many applications. We present EventSleep, a new dataset
+and methodology to address this gap and study the suitability of event cameras
+for a very relevant medical application: sleep monitoring for sleep disorders
+analysis. The dataset contains synchronized event and infrared recordings
+emulating common movements that happen during sleep, resulting in a new
+challenging and unique dataset for activity recognition in dark environments.
+Our novel pipeline is able to achieve high accuracy under these challenging
+conditions and incorporates a Bayesian approach (Laplace ensembles) to increase
+the robustness of the predictions, which is fundamental for medical
+applications. Our work is the first application of Bayesian neural networks to
+event cameras, the first use of Laplace ensembles in a realistic problem, and
+also demonstrates for the first time the potential of event cameras in a new
+application domain: enhancing current sleep evaluation procedures. Our activity
+recognition results highlight the potential of event cameras under dark
+conditions, their capacity and robustness for sleep activity recognition, and
+open problems such as the adaptation of event data pre-processing techniques to
+dark environments.
+
+
+
+
+
+
+ + ☆ Super-Resolution Analysis for Landfill Waste Classification + + +
+ Illegal landfills are a critical issue due to their environmental, economic, +and public health impacts. This study leverages aerial imagery for +environmental crime monitoring. While advances in artificial intelligence and +computer vision hold promise, the challenge lies in training models with +high-resolution literature datasets and adapting them to open-access +low-resolution images. Considering the substantial quality differences and +limited annotation, this research explores the adaptability of models across +these domains. Motivated by the necessity for a comprehensive evaluation of +waste detection algorithms, it advocates cross-domain classification and +super-resolution enhancement to analyze the impact of different image +resolutions on waste classification as an evaluation to combat the +proliferation of illegal landfills. We observed performance improvements by +enhancing image quality but noted an influence on model sensitivity, +necessitating careful threshold fine-tuning. + +
+
+ comment: This article has been accepted by the Symposium on Intelligent Data + Analysis (IDA 2024) +
+
+
+
+
+ + ☆ CSST Strong Lensing Preparation: a Framework for Detecting Strong Lenses + in the Multi-color Imaging Survey by the China Survey Space Telescope (CSST) + + +
+ Strong gravitational lensing is a powerful tool for investigating dark matter
+and dark energy properties. With the advent of large-scale sky surveys, we can
+discover strong lensing systems on an unprecedented scale, which requires
+efficient tools to extract them from billions of astronomical objects. The
+existing mainstream lens-finding tools are based on machine learning algorithms
+and applied to cut-out-centered galaxies. However, according to the design and
+survey strategy of optical surveys by CSST, preparing cutouts with multiple
+bands requires considerable effort. To overcome these challenges, we have
+developed a framework based on a hierarchical visual Transformer with a sliding
+window technique to search for strong lensing systems within entire images.
+Moreover, given that multi-color images of strong lensing systems can provide
+insights into their physical characteristics, our framework is specifically
+crafted to identify strong lensing systems in images with any number of
+channels. As evaluated using CSST mock data based on a Semi-Analytic Model
+named CosmoDC2, our framework achieves precision and recall rates of 0.98 and
+0.90, respectively. To evaluate the effectiveness of our method in real
+observations, we have applied it to a subset of images from the DESI Legacy
+Imaging Surveys and media images from Euclid Early Release Observations. 61 new
+strong lensing system candidates are discovered by our method. However, we also
+identified false positives arising primarily from the simplified galaxy
+morphology assumptions within the simulation. This underscores the practical
+limitations of our approach while simultaneously highlighting potential avenues
+for future improvements.
+
+
+
+ comment: The paper is accepted by the AJ. The complete code could be + downloaded with DOI of: 10.12149/101393. Comments are welcome +
+
+
+
+
+ + ☆ A noisy elephant in the room: Is your out-of-distribution detector + robust to label noise? CVPR 2024 + + +
+ The ability to detect unfamiliar or unexpected images is essential for safe +deployment of computer vision systems. In the context of classification, the +task of detecting images outside of a model's training domain is known as +out-of-distribution (OOD) detection. While there has been a growing research +interest in developing post-hoc OOD detection methods, there has been +comparably little discussion around how these methods perform when the +underlying classifier is not trained on a clean, carefully curated dataset. In +this work, we take a closer look at 20 state-of-the-art OOD detection methods +in the (more realistic) scenario where the labels used to train the underlying +classifier are unreliable (e.g. crowd-sourced or web-scraped labels). Extensive +experiments across different datasets, noise types & levels, architectures and +checkpointing strategies provide insights into the effect of class label noise +on OOD detection, and show that poor separation between incorrectly classified +ID samples vs. OOD samples is an overlooked yet important limitation of +existing methods. Code: https://github.com/glhr/ood-labelnoise + +
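+ For context, the simplest detector in this post-hoc family, maximum softmax
+probability (MSP), and its AUROC evaluation can be sketched as follows; the
+random logits below stand in for a trained (possibly noisily supervised)
+classifier.
+
+# Maximum-softmax-probability (MSP) OOD scoring and AUROC evaluation sketch.
+import torch
+import torch.nn.functional as F
+from sklearn.metrics import roc_auc_score
+
+def msp_score(logits: torch.Tensor) -> torch.Tensor:
+    # Higher score = more "in-distribution" under the MSP baseline.
+    return F.softmax(logits, dim=-1).max(dim=-1).values
+
+id_logits = torch.randn(500, 10) * 3.0    # stand-ins for classifier logits on ID data
+ood_logits = torch.randn(500, 10)         # ...and on OOD data
+scores = torch.cat([msp_score(id_logits), msp_score(ood_logits)]).numpy()
+is_id = [1] * 500 + [0] * 500
+print("AUROC:", roc_auc_score(is_id, scores))
+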
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Guidelines for Cerebrovascular Segmentation: Managing Imperfect + Annotations in the context of Semi-Supervised Learning + + +
+ Segmentation in medical imaging is an essential and often preliminary task in +the image processing chain, driving numerous efforts towards the design of +robust segmentation algorithms. Supervised learning methods achieve excellent +performances when fed with a sufficient amount of labeled data. However, such +labels are typically highly time-consuming, error-prone and expensive to +produce. Alternatively, semi-supervised learning approaches leverage both +labeled and unlabeled data, and are very useful when only a small fraction of +the dataset is labeled. They are particularly useful for cerebrovascular +segmentation, given that labeling a single volume requires several hours for an +expert. In addition to the challenge posed by insufficient annotations, there +are concerns regarding annotation consistency. The task of annotating the +cerebrovascular tree is inherently ambiguous. Due to the discrete nature of +images, the borders and extremities of vessels are often unclear. Consequently, +annotations heavily rely on the expert subjectivity and on the underlying +clinical objective. These discrepancies significantly increase the complexity +of the segmentation task for the model and consequently impair the results. +Consequently, it becomes imperative to provide clinicians with precise +guidelines to improve the annotation process and construct more uniform +datasets. In this article, we investigate the data dependency of deep learning +methods within the context of imperfect data and semi-supervised learning, for +cerebrovascular segmentation. Specifically, this study compares various +state-of-the-art semi-supervised methods based on unsupervised regularization +and evaluates their performance in diverse quantity and quality data scenarios. +Based on these experiments, we provide guidelines for the annotation and +training of cerebrovascular segmentation models. + +
+
+
+
+
+ + ☆ GEARS: Local Geometry-aware Hand-object Interaction Synthesis + + +
+ Generating realistic hand motion sequences in interaction with objects has +gained increasing attention with the growing interest in digital humans. Prior +work has illustrated the effectiveness of employing occupancy-based or +distance-based virtual sensors to extract hand-object interaction features. +Nonetheless, these methods show limited generalizability across object +categories, shapes and sizes. We hypothesize that this is due to two reasons: +1) the limited expressiveness of employed virtual sensors, and 2) scarcity of +available training data. To tackle this challenge, we introduce a novel +joint-centered sensor designed to reason about local object geometry near +potential interaction regions. The sensor queries for object surface points in +the neighbourhood of each hand joint. As an important step towards mitigating +the learning complexity, we transform the points from global frame to hand +template frame and use a shared module to process sensor features of each +individual joint. This is followed by a spatio-temporal transformer network +aimed at capturing correlation among the joints in different dimensions. +Moreover, we devise simple heuristic rules to augment the limited training +sequences with vast static hand grasping samples. This leads to a broader +spectrum of grasping types observed during training, in turn enhancing our +model's generalization capability. We evaluate on two public datasets, GRAB and +InterCap, where our method shows superiority over baselines both quantitatively +and perceptually. + +
+
+
+
+
+ + ☆ T-VSL: Text-Guided Visual Sound Source Localization in Mixtures CVPR-2024 + + +
+ Visual sound source localization poses a significant challenge in identifying +the semantic region of each sounding source within a video. Existing +self-supervised and weakly supervised source localization methods struggle to +accurately distinguish the semantic regions of each sounding object, +particularly in multi-source mixtures. These methods often rely on audio-visual +correspondence as guidance, which can lead to substantial performance drops in +complex multi-source localization scenarios. The lack of access to individual +source sounds in multi-source mixtures during training exacerbates the +difficulty of learning effective audio-visual correspondence for localization. +To address this limitation, in this paper, we propose incorporating the text +modality as an intermediate feature guide using tri-modal joint embedding +models (e.g., AudioCLIP) to disentangle the semantic audio-visual source +correspondence in multi-source mixtures. Our framework, dubbed T-VSL, begins by +predicting the class of sounding entities in mixtures. Subsequently, the +textual representation of each sounding source is employed as guidance to +disentangle fine-grained audio-visual source correspondence from multi-source +mixtures, leveraging the tri-modal AudioCLIP embedding. This approach enables +our framework to handle a flexible number of sources and exhibits promising +zero-shot transferability to unseen classes during test time. Extensive +experiments conducted on the MUSIC, VGGSound, and VGGSound-Instruments datasets +demonstrate significant performance improvements over state-of-the-art methods. + +
+
+ comment: Tech report. Accepted in CVPR-2024 +
+
+
+
+
+ + ☆ Exploring Latent Pathways: Enhancing the Interpretability of Autonomous + Driving with a Variational Autoencoder IROS 2024 + + +
+ Autonomous driving presents a complex challenge, which is usually addressed +with artificial intelligence models that are end-to-end or modular in nature. +Within the landscape of modular approaches, a bio-inspired neural circuit +policy model has emerged as an innovative control module, offering a compact +and inherently interpretable system to infer a steering wheel command from +abstract visual features. Here, we take a leap forward by integrating a +variational autoencoder with the neural circuit policy controller, forming a +solution that directly generates steering commands from input camera images. By +substituting the traditional convolutional neural network approach to feature +extraction with a variational autoencoder, we enhance the system's +interpretability, enabling a more transparent and understandable +decision-making process. + In addition to the architectural shift toward a variational autoencoder, this +study introduces the automatic latent perturbation tool, a novel contribution +designed to probe and elucidate the latent features within the variational +autoencoder. The automatic latent perturbation tool automates the +interpretability process, offering granular insights into how specific latent +variables influence the overall model's behavior. Through a series of numerical +experiments, we demonstrate the interpretative power of the variational +autoencoder-neural circuit policy model and the utility of the automatic latent +perturbation tool in making the inner workings of autonomous driving systems +more transparent. + +
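+ A generic latent-perturbation probe in the spirit described above can be
+sketched as follows; the linear policy head and the fixed perturbation size are
+toy stand-ins, not the authors' automatic latent perturbation tool.
+
+# Generic latent-perturbation probe: vary one latent dimension at a time and
+# measure the effect on the downstream steering command.
+import torch
+
+def latent_sensitivity(decoder_and_policy, z: torch.Tensor, delta: float = 1.0) -> torch.Tensor:
+    # decoder_and_policy: callable mapping a latent vector (1, D) to a steering command (1,).
+    base = decoder_and_policy(z)
+    effects = []
+    for i in range(z.shape[1]):
+        z_pert = z.clone()
+        z_pert[0, i] += delta
+        effects.append((decoder_and_policy(z_pert) - base).abs().item())
+    return torch.tensor(effects)                 # per-dimension influence on steering
+
+# Toy stand-in for the VAE decoder + neural-circuit-policy head.
+policy = torch.nn.Linear(32, 1)
+z = torch.zeros(1, 32)
+scores = latent_sensitivity(lambda latent: policy(latent).squeeze(-1), z)
+print("most influential latent dims:", scores.topk(3).indices.tolist())
+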
+
+ comment: Submitted to 2024 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS 2024) +
+
+
+
+
+ + ☆ Global Mapping of Exposure and Physical Vulnerability Dynamics in Least + Developed Countries using Remote Sensing and Machine Learning ICLR + + +
+ As the world marked the midterm of the Sendai Framework for Disaster Risk +Reduction 2015-2030, many countries are still struggling to monitor their +climate and disaster risk because of the expensive large-scale survey of the +distribution of exposure and physical vulnerability and, hence, are not on +track in reducing risks amidst the intensifying effects of climate change. We +present an ongoing effort in mapping this vital information using machine +learning and time-series remote sensing from publicly available Sentinel-1 SAR +GRD and Sentinel-2 Harmonized MSI. We introduce the development of +"OpenSendaiBench" consisting of 47 countries wherein most are least developed +(LDCs), trained ResNet-50 deep learning models, and demonstrated the region of +Dhaka, Bangladesh by mapping the distribution of its informal constructions. As +a pioneering effort in auditing global disaster risk over time, this paper aims +to advance the area of large-scale risk quantification in informing our +collective long-term efforts in reducing climate and disaster risk. + +
+
+ comment: This is the camera-ready paper for the accepted poster at the 2nd + Machine Learning for Remote Sensing Workshop, 12th International Conference + on Learning Representations (ICLR) in Vienna, Austria, on the 11th of May + 2024. Access the poster here: https://zenodo.org/doi/10.5281/zenodo.10903886 + Watch the video version of our poster here: https://youtu.be/N6ithJeCF4M +
+
+
+
+
+ + ☆ Unleash the Potential of CLIP for Video Highlight Detection + + +
+ Multimodal and large language models (LLMs) have revolutionized the
+utilization of open-world knowledge, unlocking novel potentials across various
+tasks and applications. Among these domains, the video domain has notably
+benefited from their capabilities. In this paper, we present Highlight-CLIP
+(HL-CLIP), a method designed to excel in the video highlight detection task by
+leveraging the pre-trained knowledge embedded in multimodal models. By simply
+fine-tuning the multimodal encoder in combination with our innovative saliency
+pooling technique, we have achieved, to the best of our knowledge,
+state-of-the-art performance on the highlight detection task on the
+QVHighlight benchmark.
+
+
+
+
+
+
+ + ☆ Atom-Level Optical Chemical Structure Recognition with Limited + Supervision + + +
+ Identifying the chemical structure from a graphical representation, or image, +of a molecule is a challenging pattern recognition task that would greatly +benefit drug development. Yet, existing methods for chemical structure +recognition do not typically generalize well, and show diminished effectiveness +when confronted with domains where data is sparse, or costly to generate, such +as hand-drawn molecule images. To address this limitation, we propose a new +chemical structure recognition tool that delivers state-of-the-art performance +and can adapt to new domains with a limited number of data samples and +supervision. Unlike previous approaches, our method provides atom-level +localization, and can therefore segment the image into the different atoms and +bonds. Our model is the first model to perform OCSR with atom-level entity +detection with only SMILES supervision. Through rigorous and extensive +benchmarking, we demonstrate the preeminence of our chemical structure +recognition approach in terms of data efficiency, accuracy, and atom-level +entity prediction. + +
+
+ comment: Accepted in IEEE/CVF Conference on Computer Vision and Pattern + Recognition 2024 +
+
+
+
+
+ + ☆ Generalizing 6-DoF Grasp Detection via Domain Prior Knowledge CVPR 2024 + + +
+ We focus on the generalization ability of the 6-DoF grasp detection method in +this paper. While learning-based grasp detection methods can predict grasp +poses for unseen objects using the grasp distribution learned from the training +set, they often exhibit a significant performance drop when encountering +objects with diverse shapes and structures. To enhance the grasp detection +methods' generalization ability, we incorporate domain prior knowledge of +robotic grasping, enabling better adaptation to objects with significant shape +and structure differences. More specifically, we employ the physical constraint +regularization during the training phase to guide the model towards predicting +grasps that comply with the physical rule on grasping. For the unstable grasp +poses predicted on novel objects, we design a contact-score joint optimization +using the projection contact map to refine these poses in cluttered scenarios. +Extensive experiments conducted on the GraspNet-1billion benchmark demonstrate +a substantial performance gain on the novel object set and the real-world +grasping experiments also demonstrate the effectiveness of our generalizing +6-DoF grasp detection method. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Disentangled Pre-training for Human-Object Interaction Detection CVPR2024 + + +
+ Detecting human-object interaction (HOI) has long been limited by the amount +of supervised data available. Recent approaches address this issue by +pre-training according to pseudo-labels, which align object regions with HOI +triplets parsed from image captions. However, pseudo-labeling is tricky and +noisy, making HOI pre-training a complex process. Therefore, we propose an +efficient disentangled pre-training method for HOI detection (DP-HOI) to +address this problem. First, DP-HOI utilizes object detection and action +recognition datasets to pre-train the detection and interaction decoder layers, +respectively. Then, we arrange these decoder layers so that the pre-training +architecture is consistent with the downstream HOI detection task. This +facilitates efficient knowledge transfer. Specifically, the detection decoder +identifies reliable human instances in each action recognition dataset image, +generates one corresponding query, and feeds it into the interaction decoder +for verb classification. Next, we combine the human instance verb predictions +in the same image and impose image-level supervision. The DP-HOI structure can +be easily adapted to the HOI detection task, enabling effective model parameter +initialization. Therefore, it significantly enhances the performance of +existing HOI detection models on a broad range of rare categories. The code and +pre-trained weight are available at https://github.com/xingaoli/DP-HOI. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Contextual Embedding Learning to Enhance 2D Networks for Volumetric + Image Segmentation + + +
+ The segmentation of organs in volumetric medical images plays an important
+role in computer-aided diagnosis and treatment/surgery planning. Conventional
+2D convolutional neural networks (CNNs) can hardly exploit the spatial
+correlation of volumetric data. Current 3D CNNs can extract more powerful
+volumetric representations, but they usually suffer from excessive memory and
+computation costs. In this study, we aim to enhance 2D networks with contextual
+information for better volumetric image segmentation. Accordingly, we propose a
+contextual embedding learning approach that helps 2D CNNs capture spatial
+information properly. Our approach leverages the learned embedding and
+slice-wise neighboring matching as a soft cue to guide the network. In this
+way, the contextual information can be transferred slice by slice, thus
+boosting the volumetric representation of the network. Experiments on the
+challenging prostate MRI dataset (PROMISE12) and the abdominal CT dataset
+(CHAOS) show that our contextual embedding learning can effectively leverage
+the inter-slice context and improve segmentation performance. The proposed
+approach is a plug-and-play and memory-efficient solution for enhancing 2D
+networks for volumetric segmentation. The code will be publicly available.
+
+
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ AddSR: Accelerating Diffusion-based Blind Super-Resolution with + Adversarial Diffusion Distillation + + +
+ Blind super-resolution methods based on stable diffusion showcase formidable +generative capabilities in reconstructing clear high-resolution images with +intricate details from low-resolution inputs. However, their practical +applicability is often hampered by poor efficiency, stemming from the +requirement of thousands or hundreds of sampling steps. Inspired by the +efficient text-to-image approach adversarial diffusion distillation (ADD), we +design AddSR to address this issue by incorporating the ideas of both +distillation and ControlNet. Specifically, we first propose a prediction-based +self-refinement strategy to provide high-frequency information in the student +model output with marginal additional time cost. Furthermore, we refine the +training process by employing HR images, rather than LR images, to regulate the +teacher model, providing a more robust constraint for distillation. Second, we +introduce a timestep-adapting loss to address the perception-distortion +imbalance problem introduced by ADD. Extensive experiments demonstrate our +AddSR generates better restoration results, while achieving faster speed than +previous SD-based state-of-the-art models (e.g., 7x faster than SeeSR). + +
+
+
+
+
+ + ☆ Conjugate-Gradient-like Based Adaptive Moment Estimation Optimization + Algorithm for Deep Learning + + +
+ Training deep neural networks is a challenging task. In order to speed up
+training and enhance the performance of deep neural networks, we modify the
+vanilla conjugate gradient into a conjugate-gradient-like direction and
+incorporate it into generic Adam, and thus propose a new optimization algorithm
+named CG-like-Adam for deep learning. Specifically, both the first-order and
+the second-order moment estimates of generic Adam are computed from the
+conjugate-gradient-like direction. The convergence analysis covers the cases
+where the exponential moving average coefficient of the first-order moment
+estimation is constant and the first-order moment estimation is unbiased.
+Numerical experiments show the superiority of the proposed algorithm on the
+CIFAR-10/100 datasets.
+
+
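+
+ As a rough illustration of the idea (not the authors' implementation), the
+sketch below runs an Adam-style update whose moment estimates are built on a
+conjugate-gradient-like direction d_t = -g_t + beta_t * d_{t-1}; the
+Fletcher-Reeves-style coefficient and all hyper-parameters are assumptions made
+purely for illustration.
+<pre>
+import numpy as np
+
+def cg_like_adam_step(p, g, state, lr=1e-3, b1=0.9, b2=0.999, eps=1e-8):
+    """One hypothetical CG-like-Adam step on 1-D parameter/gradient arrays."""
+    d_prev, g_prev, m, v, t = state
+    # Fletcher-Reeves-style coefficient; the paper's exact rule may differ.
+    beta = 0.0 if g_prev is None else float(g @ g) / (float(g_prev @ g_prev) + eps)
+    d = -g + beta * (d_prev if d_prev is not None else np.zeros_like(g))
+    t += 1
+    m = b1 * m + (1 - b1) * d            # first moment of the CG-like direction
+    v = b2 * v + (1 - b2) * (d * d)      # second moment of the CG-like direction
+    m_hat = m / (1 - b1 ** t)            # bias correction, as in vanilla Adam
+    v_hat = v / (1 - b2 ** t)
+    p = p + lr * m_hat / (np.sqrt(v_hat) + eps)   # d already points downhill
+    return p, (d, g.copy(), m, v, t)
+
+# state is initialized as (None, None, 0.0, 0.0, 0) before the first step.
+</pre>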
+
+ comment: 32 pages, 13 figures +
+
+
+
+
+ + ☆ Upsample Guidance: Scale Up Diffusion Models without Training + + +
+ Diffusion models have demonstrated superior performance across various +generative tasks including images, videos, and audio. However, they encounter +difficulties in directly generating high-resolution samples. Previously +proposed solutions to this issue involve modifying the architecture, further +training, or partitioning the sampling process into multiple stages. These +methods have the limitation of not being able to directly utilize pre-trained +models as-is, requiring additional work. In this paper, we introduce upsample +guidance, a technique that adapts pretrained diffusion model (e.g., $512^2$) to +generate higher-resolution images (e.g., $1536^2$) by adding only a single term +in the sampling process. Remarkably, this technique does not necessitate any +additional training or relying on external models. We demonstrate that upsample +guidance can be applied to various models, such as pixel-space, latent space, +and video diffusion models. We also observed that the proper selection of +guidance scale can improve image quality, fidelity, and prompt alignment. + +
+
+ comment: 15 pages, 15 Figures +
+
+
+
+
+ + ☆ Samba: Semantic Segmentation of Remotely Sensed Images with State Space + Model + + +
+ High-resolution remotely sensed images pose a challenge for commonly used
+semantic segmentation methods such as the Convolutional Neural Network (CNN)
+and the Vision Transformer (ViT). CNN-based methods struggle to handle such
+high-resolution images due to their limited receptive field, while ViT faces
+challenges in handling long sequences. Inspired by Mamba, which adopts a State
+Space Model (SSM) to efficiently capture global semantic information, we
+propose a semantic segmentation framework for high-resolution remotely sensed
+images, named Samba. Samba utilizes an encoder-decoder architecture, with Samba
+blocks serving as the encoder for efficient multi-level semantic information
+extraction, and UperNet functioning as the decoder. We evaluate Samba on the
+LoveDA dataset, comparing its performance against top-performing CNN and ViT
+methods. The results reveal that Samba achieves unparalleled performance on
+LoveDA. This demonstrates that the proposed Samba is an effective application
+of the SSM to semantic segmentation of remotely sensed images, setting a new
+performance benchmark for Mamba-based techniques in this application. The
+source code and baseline implementations are available at
+https://github.com/zhuqinfeng1999/Samba.
+
+
+
+
+
+
+ + ☆ Boosting Visual Recognition for Autonomous Driving in Real-world + Degradations with Deep Channel Prior + + +
+ The environmental perception of autonomous vehicles in normal conditions has
+achieved considerable success in the past decade. However, various unfavourable
+conditions such as fog, low light, and motion blur will degrade image quality
+and pose tremendous threats to the safety of autonomous driving. That is, when
+applied to degraded images, state-of-the-art visual models often suffer
+performance declines due to feature content loss and artifact interference
+caused by the disruption of the statistical and structural properties of the
+captured images. To address this problem, this work proposes a novel Deep
+Channel Prior (DCP) for degraded visual recognition. Specifically, we observe
+that, in the deep representation space of pre-trained models, the channel
+correlations of degraded features with the same degradation type exhibit a
+consistent distribution even if the images have different content and
+semantics, which can facilitate learning the mapping relationship between
+degraded and clear representations in a high-sparsity feature space. Based on
+this, a novel plug-and-play Unsupervised Feature Enhancement Module (UFEM) is
+proposed to achieve unsupervised feature correction, where a multi-adversarial
+mechanism is introduced in the first stage of UFEM to achieve latent content
+restoration and artifact removal in the high-sparsity feature space. Then, the
+generated features are transferred to the second stage for global correlation
+modulation under the guidance of DCP to obtain high-quality and
+recognition-friendly features. Evaluations on three tasks and eight benchmark
+datasets demonstrate that our proposed method can comprehensively improve the
+performance of pre-trained models in real degradation conditions. The source
+code is available at https://github.com/liyuhang166/Deep_Channel_Prior
+
+
+
+
+
+
+ + ☆ MotionChain: Conversational Motion Controllers via Multimodal Prompts + + +
+ Recent advancements in language models have demonstrated their adeptness in
+conducting multi-turn dialogues and retaining conversational context. However,
+this proficiency remains largely unexplored in other multimodal generative
+models, particularly in human motion models. By integrating multi-turn
+conversations in controlling continuous virtual human movements, generative
+human motion models can achieve an intuitive and step-by-step process of human
+task execution for humanoid robotics, game agents, or other embodied systems.
+In this work, we present MotionChain, a conversational human motion controller
+that generates continuous and long-term human motion through multimodal
+prompts. Specifically, MotionChain consists of multi-modal tokenizers that
+transform various data types, such as text, image, and motion, into discrete
+tokens, coupled with a Vision-Motion-aware Language model. By leveraging
+large-scale language, vision-language, and vision-motion data to assist
+motion-related generation tasks, MotionChain comprehends each instruction in a
+multi-turn conversation and generates human motions that follow these prompts.
+Extensive experiments validate the efficacy of MotionChain, demonstrating
+state-of-the-art performance in conversational motion generation, as well as
+more intuitive ways of controlling and interacting with virtual humans.
+
+
+
+ comment: 14 pages, 4 figures +
+
+
+
+
+ + ☆ Task Integration Distillation for Object Detectors + + +
+ Knowledge distillation is a widely adopted technique for model lightening. +However, the performance of most knowledge distillation methods in the domain +of object detection is not satisfactory. Typically, knowledge distillation +approaches consider only the classification task among the two sub-tasks of an +object detector, largely overlooking the regression task. This oversight leads +to a partial understanding of the object detector's comprehensive task, +resulting in skewed estimations and potentially adverse effects. Therefore, we +propose a knowledge distillation method that addresses both the classification +and regression tasks, incorporating a task significance strategy. By evaluating +the importance of features based on the output of the detector's two sub-tasks, +our approach ensures a balanced consideration of both classification and +regression tasks in object detection. Drawing inspiration from real-world +teaching processes and the definition of learning condition, we introduce a +method that focuses on both key and weak areas. By assessing the value of +features for knowledge distillation based on their importance differences, we +accurately capture the current model's learning situation. This method +effectively prevents the issue of biased predictions about the model's learning +reality caused by an incomplete utilization of the detector's outputs. + +
+
+
+
+
+ + ☆ Beyond Image Super-Resolution for Image Recognition with Task-Driven + Perceptual Loss CVPR 2024 + + +
+ In real-world scenarios, image recognition tasks, such as semantic +segmentation and object detection, often pose greater challenges due to the +lack of information available within low-resolution (LR) content. Image +super-resolution (SR) is one of the promising solutions for addressing the +challenges. However, due to the ill-posed property of SR, it is challenging for +typical SR methods to restore task-relevant high-frequency contents, which may +dilute the advantage of utilizing the SR method. Therefore, in this paper, we +propose Super-Resolution for Image Recognition (SR4IR) that effectively guides +the generation of SR images beneficial to achieving satisfactory image +recognition performance when processing LR images. The critical component of +our SR4IR is the task-driven perceptual (TDP) loss that enables the SR network +to acquire task-specific knowledge from a network tailored for a specific task. +Moreover, we propose a cross-quality patch mix and an alternate training +framework that significantly enhances the efficacy of the TDP loss by +addressing potential problems when employing the TDP loss. Through extensive +experiments, we demonstrate that our SR4IR achieves outstanding task +performance by generating SR images useful for a specific image recognition +task, including semantic segmentation, object detection, and image +classification. The implementation code is available at +https://github.com/JaehaKim97/SR4IR. + +
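+
+ A hedged sketch of the task-driven perceptual idea: measure the distance
+between the SR output and the HR target inside a frozen task network's
+intermediate features, so gradients push the SR network toward restoring
+task-relevant content. The hook-based layer selection and the L1 distance are
+assumptions, not necessarily the paper's exact TDP loss.
+<pre>
+import torch
+import torch.nn.functional as F
+
+def task_driven_perceptual_loss(sr_img, hr_img, task_net, feat_layer):
+    """Feature-matching loss inside a task network whose parameters are
+    assumed to be frozen elsewhere (illustrative only)."""
+    feats = {}
+    handle = feat_layer.register_forward_hook(lambda m, i, o: feats.update(f=o))
+    with torch.no_grad():                  # target features from the HR image
+        task_net(hr_img)
+        f_hr = feats["f"]
+    task_net(sr_img)                       # gradients flow back to the SR network
+    f_sr = feats["f"]
+    handle.remove()
+    return F.l1_loss(f_sr, f_hr)
+</pre>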
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ RefQSR: Reference-based Quantization for Image Super-Resolution Networks + + +
+ Single image super-resolution (SISR) aims to reconstruct a high-resolution +image from its low-resolution observation. Recent deep learning-based SISR +models show high performance at the expense of increased computational costs, +limiting their use in resource-constrained environments. As a promising +solution for computationally efficient network design, network quantization has +been extensively studied. However, existing quantization methods developed for +SISR have yet to effectively exploit image self-similarity, which is a new +direction for exploration in this study. We introduce a novel method called +reference-based quantization for image super-resolution (RefQSR) that applies +high-bit quantization to several representative patches and uses them as +references for low-bit quantization of the rest of the patches in an image. To +this end, we design dedicated patch clustering and reference-based quantization +modules and integrate them into existing SISR network quantization methods. The +experimental results demonstrate the effectiveness of RefQSR on various SISR +networks and quantization methods. + +
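+
+ The abstract describes clustering patches and using one high-bit patch per
+cluster as a reference for low-bit quantization of the rest; the toy sketch
+below illustrates that idea on raw patches with residual coding against the
+reference. RefQSR itself operates inside the SISR network's quantization
+modules, so the uniform quantizer, KMeans clustering, and residual coding here
+are assumptions made only for illustration.
+<pre>
+import numpy as np
+from sklearn.cluster import KMeans
+
+def quantize(x, bits):
+    # simple uniform quantization of an array to the given bit-width
+    lo, hi = x.min(), x.max()
+    levels = 2 ** bits - 1
+    q = np.round((x - lo) / (hi - lo + 1e-12) * levels)
+    return q / levels * (hi - lo) + lo
+
+def reference_based_quantization(patches, n_refs=8, hi_bits=8, lo_bits=2):
+    """patches: (N, D) flattened patches. Reference patches are kept at hi_bits;
+    the remaining patches store only a lo_bits residual plus a reference id."""
+    km = KMeans(n_clusters=n_refs, n_init=10).fit(patches)
+    refs = {c: quantize(patches[km.labels_ == c][0], hi_bits) for c in range(n_refs)}
+    out = []
+    for p, c in zip(patches, km.labels_):
+        residual = quantize(p - refs[c], lo_bits)   # low-bit residual w.r.t. reference
+        out.append(refs[c] + residual)
+    return np.stack(out), km.labels_
+</pre>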
+
+ comment: Accepted by IEEE Transactions on Image Processing (TIP) +
+
+
+
+
+ + ☆ JRDB-PanoTrack: An Open-world Panoptic Segmentation and Tracking Robotic + Dataset in Crowded Human Environments CVPR 2024 + + +
+ Autonomous robot systems have attracted increasing research attention in +recent years, where environment understanding is a crucial step for robot +navigation, human-robot interaction, and decision. Real-world robot systems +usually collect visual data from multiple sensors and are required to recognize +numerous objects and their movements in complex human-crowded settings. +Traditional benchmarks, with their reliance on single sensors and limited +object classes and scenarios, fail to provide the comprehensive environmental +understanding robots need for accurate navigation, interaction, and +decision-making. As an extension of JRDB dataset, we unveil JRDB-PanoTrack, a +novel open-world panoptic segmentation and tracking benchmark, towards more +comprehensive environmental perception. JRDB-PanoTrack includes (1) various +data involving indoor and outdoor crowded scenes, as well as comprehensive 2D +and 3D synchronized data modalities; (2) high-quality 2D spatial panoptic +segmentation and temporal tracking annotations, with additional 3D label +projections for further spatial understanding; (3) diverse object classes for +closed- and open-world recognition benchmarks, with OSPA-based metrics for +evaluation. Extensive evaluation of leading methods shows significant +challenges posed by our dataset. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ PRISM-TopoMap: Online Topological Mapping with Place Recognition and + Scan Matching IROS 2024 + + +
+ Mapping is one of the crucial tasks enabling autonomous navigation of a +mobile robot. Conventional mapping methods output dense geometric map +representation, e.g. an occupancy grid, which is not trivial to keep consistent +for the prolonged runs covering large environments. Meanwhile, capturing the +topological structure of the workspace enables fast path planning, is less +prone to odometry error accumulation and does not consume much memory. +Following this idea, this paper introduces PRISM-TopoMap -- a topological +mapping method that maintains a graph of locally aligned locations not relying +on global metric coordinates. The proposed method involves learnable multimodal +place recognition paired with the scan matching pipeline for localization and +loop closure in the graph of locations. The latter is updated online and the +robot is localized in a proper node at each time step. We conduct a broad +experimental evaluation of the suggested approach in a range of photo-realistic +environments and on a real robot (wheeled differential driven Husky robot), and +compare it to state of the art. The results of the empirical evaluation confirm +that PRISM-Topomap consistently outperforms competitors across several measures +of mapping and navigation efficiency and performs well on a real robot. The +code of PRISM-Topomap is open-sourced and available at +https://github.com/kirillMouraviev/prism-topomap. + +
+
+ comment: This is a pre-print of the paper submitted to an IROS 2024 conference +
+
+
+
+
+ + ☆ A Universal Knowledge Embedded Contrastive Learning Framework for + Hyperspectral Image Classification + + +
+ Hyperspectral image (HSI) classification techniques have been intensively
+studied and a variety of models have been developed. However, these HSI
+classification models are confined to pocket-sized models and unrealistic ways
+of partitioning datasets. The former limits the generalization performance of
+the model, and the latter leads to inflated model evaluation metrics, which
+results in plummeting model performance in the real world. Therefore, we
+propose a universal knowledge embedded contrastive learning framework (KnowCL)
+for supervised, unsupervised, and semisupervised HSI classification, which
+largely closes the gap between pocket-sized HSI classification models and
+standard vision backbones. We present a new HSI processing pipeline in
+conjunction with a range of data transformation and augmentation techniques
+that provide diverse data representations and realistic data partitioning. The
+proposed framework based on this pipeline is compatible with all kinds of
+backbones and can fully exploit labeled and unlabeled samples within the
+expected training time. Furthermore, we design a new loss function that can
+adaptively fuse the supervised loss and the unsupervised loss, enhancing the
+learning performance. The proposed classification paradigm shows great
+potential for HSI classification technology. The code can be accessed at
+https://github.com/quanweiliu/KnowCL.
+
+
+
+
+
+
+ + ☆ Release of Pre-Trained Models for the Japanese Language LREC + + +
+ AI democratization aims to create a world in which the average person can +utilize AI techniques. To achieve this goal, numerous research institutes have +attempted to make their results accessible to the public. In particular, large +pre-trained models trained on large-scale data have shown unprecedented +potential, and their release has had a significant impact. However, most of the +released models specialize in the English language, and thus, AI +democratization in non-English-speaking communities is lagging significantly. +To reduce this gap in AI access, we released Generative Pre-trained Transformer +(GPT), Contrastive Language and Image Pre-training (CLIP), Stable Diffusion, +and Hidden-unit Bidirectional Encoder Representations from Transformers +(HuBERT) pre-trained in Japanese. By providing these models, users can freely +interface with AI that aligns with Japanese cultural values and ensures the +identity of Japanese culture, thus enhancing the democratization of AI. +Additionally, experiments showed that pre-trained models specialized for +Japanese can efficiently achieve high performance in Japanese tasks. + +
+
+ comment: 9 pages, 1 figure, 5 tables, accepted for LREC-COLING 2024. Models + are publicly available at https://huggingface.co/rinna +
+
+
+
+
+ + ☆ Supporting Mitosis Detection AI Training with Inter-Observer Eye-Gaze + Consistencies + + +
+ The expansion of artificial intelligence (AI) in pathology tasks has
+intensified the demand for doctors' annotations in AI development. However,
+collecting high-quality annotations from doctors is costly and time-consuming,
+creating a bottleneck in AI progress. This study investigates eye-tracking as a
+cost-effective technology to collect doctors' behavioral data for AI training,
+with a focus on the pathology task of mitosis detection. One major challenge in
+using eye-gaze data is the low signal-to-noise ratio, which hinders the
+extraction of meaningful information. We tackled this by leveraging the
+properties of inter-observer eye-gaze consistencies and creating eye-gaze
+labels from consistent eye fixations shared by a group of observers. Our study
+involved 14 non-medical participants, from whom we collected eye-gaze data and
+generated eye-gaze labels based on varying group sizes. We assessed the
+efficacy of such eye-gaze labels by training Convolutional Neural Networks
+(CNNs) and comparing their performance to those trained with ground truth
+annotations and a heuristic-based baseline. Results indicated that CNNs trained
+with our eye-gaze labels closely followed the performance of ground-truth-based
+CNNs and significantly outperformed the baseline. Although primarily focused
+on mitosis, we envision that insights from this study can be generalized to
+other medical imaging tasks.
+
+
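+
+ A small sketch of how inter-observer consistency can be turned into labels:
+discretize the slide into cells and keep the cells fixated by at least k
+observers. The cell size and voting threshold are illustrative choices, not the
+paper's exact procedure.
+<pre>
+import numpy as np
+
+def consensus_gaze_labels(fixations_per_observer, img_shape, cell=64, min_observers=3):
+    """fixations_per_observer: list of (M_i, 2) arrays of (x, y) fixation points.
+    Returns a boolean grid marking cells fixated by >= min_observers observers."""
+    h_cells, w_cells = img_shape[0] // cell, img_shape[1] // cell
+    votes = np.zeros((h_cells, w_cells), dtype=int)
+    for fix in fixations_per_observer:
+        hit = np.zeros_like(votes, dtype=bool)
+        for x, y in fix:
+            cx, cy = int(x) // cell, int(y) // cell
+            if 0 <= cy < h_cells and 0 <= cx < w_cells:
+                hit[cy, cx] = True          # each observer votes at most once per cell
+        votes += hit
+    return votes >= min_observers
+</pre>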
+
+ comment: Accepted by IEEE International Conference on Healthcare Informatics + 2024 +
+
+
+
+
+ + ☆ FashionEngine: Interactive Generation and Editing of 3D Clothed Humans + + +
+ We present FashionEngine, an interactive 3D human generation and editing +system that allows us to design 3D digital humans in a way that aligns with how +humans interact with the world, such as natural languages, visual perceptions, +and hand-drawing. FashionEngine automates the 3D human production with three +key components: 1) A pre-trained 3D human diffusion model that learns to model +3D humans in a semantic UV latent space from 2D image training data, which +provides strong priors for diverse generation and editing tasks. 2) +Multimodality-UV Space encoding the texture appearance, shape topology, and +textual semantics of human clothing in a canonical UV-aligned space, which +faithfully aligns the user multimodal inputs with the implicit UV latent space +for controllable 3D human editing. The multimodality-UV space is shared across +different user inputs, such as texts, images, and sketches, which enables +various joint multimodal editing tasks. 3) Multimodality-UV Aligned Sampler +learns to sample high-quality and diverse 3D humans from the diffusion prior +for multimodal user inputs. Extensive experiments validate FashionEngine's +state-of-the-art performance for conditional generation/editing tasks. In +addition, we present an interactive user interface for our FashionEngine that +enables both conditional and unconditional generation tasks, and editing tasks +including pose/view/shape control, text-, image-, and sketch-driven 3D human +editing and 3D virtual try-on, in a unified framework. Our project page is at: +https://taohuumd.github.io/projects/FashionEngine. + +
+
+ comment: Project Page: https://taohuumd.github.io/projects/FashionEngine +
+
+
+
+
+ + ☆ AI WALKUP: A Computer-Vision Approach to Quantifying MDS-UPDRS in + Parkinson's Disease + + +
+ Parkinson's Disease (PD) is the second most common neurodegenerative
+disorder. PD is usually assessed with the Movement Disorder Society - Unified
+Parkinson's Disease Rating Scale (MDS-UPDRS), which rates the severity of
+various types of motor symptoms and disease progression. However, manual
+assessment suffers from high subjectivity, a lack of consistency, and the high
+cost and low efficiency of manual communication. We propose a
+computer-vision-based solution that captures human pose images with a camera,
+reconstructs the pose and performs motion analysis using algorithms, and
+extracts movement features through feature engineering. The proposed approach
+can be deployed on different smartphones, and the video recording and
+artificial intelligence analysis can be done quickly and easily through our
+app.
+
+
+
+ comment: Technical report for AI WALKUP, an APP winning 3rd Prize of 2022 HUST + GS AI Innovation and Design Competition +
+
+
+
+
+ + ☆ EDTalk: Efficient Disentanglement for Emotional Talking Head Synthesis + + +
+ Achieving disentangled control over multiple facial motions and accommodating
+diverse input modalities greatly enhances the applicability and entertainment
+value of talking head generation. This necessitates a deep exploration of the
+decoupling space for facial features, ensuring that they (a) operate
+independently without mutual interference and (b) can be preserved and shared
+across different modal inputs, two aspects often neglected in existing methods.
+To address this gap, this paper proposes a novel Efficient Disentanglement
+framework for Talking head generation (EDTalk). Our framework enables
+individual manipulation of mouth shape, head pose, and emotional expression,
+conditioned on video or audio inputs. Specifically, we employ three lightweight
+modules to decompose the facial dynamics into three distinct latent spaces
+representing the mouth, pose, and expression, respectively. Each space is
+characterized by a set of learnable bases whose linear combinations define
+specific motions. To ensure independence and accelerate training, we enforce
+orthogonality among the bases and devise an efficient training strategy to
+allocate motion responsibilities to each space without relying on external
+knowledge. The learned bases are then stored in corresponding banks, enabling
+shared visual priors with audio input. Furthermore, considering the properties
+of each space, we propose an Audio-to-Motion module for audio-driven talking
+head synthesis. Experiments are conducted to demonstrate the effectiveness of
+EDTalk. We recommend watching the project website:
+https://tanshuai0219.github.io/EDTalk/
+
+
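+
+ The learnable-bases ingredient can be written down compactly; the penalty
+below (Gram matrix pushed toward the identity) is a common way to enforce
+orthogonality among bases and is assumed here for illustration, alongside the
+linear combination that composes a motion code from the bases.
+<pre>
+import torch
+
+def orthogonality_loss(bases):
+    """bases: (K, D) learnable basis vectors for one latent space
+    (mouth / pose / expression). Penalize B B^T deviating from the identity."""
+    gram = bases @ bases.t()
+    eye = torch.eye(bases.size(0), device=bases.device)
+    return ((gram - eye) ** 2).mean()
+
+def compose_motion(weights, bases):
+    """weights: (B, K) predicted coefficients; bases: (K, D).
+    A motion code is a linear combination of the bases."""
+    return weights @ bases
+</pre>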
+
+ comment: 22 pages, 15 figures +
+
+
+
+
+ + ☆ ContrastCAD: Contrastive Learning-based Representation Learning for + Computer-Aided Design Models + + +
+ The success of Transformer-based models has encouraged many researchers to
+learn CAD models using sequence-based approaches. However, learning CAD models
+is still a challenge, because they can be represented as complex shapes with
+long construction sequences. Furthermore, the same CAD model can be expressed
+using different CAD construction sequences. We propose a novel contrastive
+learning-based approach, named ContrastCAD, that effectively captures the
+semantic information within the construction sequences of a CAD model.
+ContrastCAD generates augmented views using dropout techniques without altering
+the shape of the CAD model. We also propose a new CAD data augmentation method,
+called the Random Replace and Extrude (RRE) method, to enhance the learning
+performance of the model when training on an imbalanced CAD dataset.
+Experimental results show that the proposed RRE augmentation method
+significantly enhances the learning performance of Transformer-based
+autoencoders, even for complex CAD models with very long construction
+sequences. The proposed ContrastCAD model is shown to be robust to permutation
+changes of construction sequences and performs better representation learning
+by generating representation spaces in which similar CAD models are more
+closely clustered. Our codes are available at
+https://github.com/cm8908/ContrastCAD.
+
+
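+
+ A minimal sketch of the dropout-as-augmentation idea (similar in spirit to
+SimCSE): encode the same batch of CAD sequences twice with dropout active and
+treat the two stochastic embeddings of each model as a positive pair in an
+InfoNCE loss. The encoder and temperature are placeholders; ContrastCAD's full
+recipe, including the RRE augmentation, is not reproduced here.
+<pre>
+import torch
+import torch.nn.functional as F
+
+def contrastive_dropout_loss(encoder, cad_seq_batch, temperature=0.07):
+    """Two forward passes with dropout enabled give two views of each sequence.
+    InfoNCE pulls the two views of the same CAD model together and pushes the
+    views of different models apart."""
+    encoder.train()                                    # keep dropout active
+    z1 = F.normalize(encoder(cad_seq_batch), dim=-1)   # (B, D)
+    z2 = F.normalize(encoder(cad_seq_batch), dim=-1)   # (B, D), new dropout mask
+    logits = z1 @ z2.t() / temperature                 # (B, B) cosine similarities
+    targets = torch.arange(z1.size(0), device=z1.device)
+    return F.cross_entropy(logits, targets)
+</pre>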
+
+
+
+
+ + ☆ A Closer Look at Spatial-Slice Features Learning for COVID-19 Detection + + +
+ Conventional Computed Tomography (CT) imaging recognition faces two
+significant challenges: (1) There is often considerable variability in the
+resolution and size of each CT scan, necessitating strict requirements for the
+input size and adaptability of models. (2) A CT scan contains a large number of
+out-of-distribution (OOD) slices. The crucial features may only be present in
+specific spatial regions and slices of the entire CT scan. How can we
+effectively figure out where these are located? To deal with this, we introduce
+an enhanced Spatial-Slice Feature Learning (SSFL++) framework specifically
+designed for CT scans. It aims to filter out OOD data within the whole CT scan,
+enabling us to select the crucial spatial slices for analysis and reducing
+overall redundancy by 70%. Meanwhile, we propose a Kernel-Density-based slice
+Sampling (KDS) method to improve stability during the training and inference
+stages, thereby speeding up convergence and boosting performance. As a result,
+the experiments demonstrate the promising performance of our approach using a
+simple EfficientNet-2D (E2D) model, even with only 1% of the training data. The
+efficacy of our approach has been validated on the COVID-19-CT-DB datasets
+provided by the DEF-AI-MIA workshop, in conjunction with CVPR 2024. Our source
+code will be made available.
+
+
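+
+ One hedged reading of Kernel-Density-based slice Sampling: fit a 1-D KDE over
+slice positions weighted by a per-slice importance score (for example, the
+score left after OOD filtering) and draw slice indices in proportion to the
+smoothed density, which keeps the sampled subset stable across epochs. The
+scoring, bandwidth, and sample count below are assumptions, not the paper's
+exact procedure.
+<pre>
+import numpy as np
+from scipy.stats import gaussian_kde
+
+def kds_sample_slices(slice_scores, n_samples=16, bandwidth=0.1, rng=None):
+    """slice_scores: (S,) non-negative per-slice importance scores.
+    Returns sorted indices of n_samples slices drawn from a KDE-smoothed
+    distribution over slice positions."""
+    rng = np.random.default_rng() if rng is None else rng
+    pos = np.linspace(0.0, 1.0, len(slice_scores))
+    kde = gaussian_kde(pos, weights=slice_scores, bw_method=bandwidth)
+    density = kde(pos)
+    probs = density / density.sum()
+    idx = rng.choice(len(slice_scores), size=n_samples, replace=False, p=probs)
+    return np.sort(idx)
+</pre>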
+
+ comment: Submitted to DEF-AI-MIA workshop. arXiv admin note: text overlap with + arXiv:2403.11230 +
+
+
+
+
+ + ☆ Learning to Control Camera Exposure via Reinforcement Learning CVPR 2024 + + +
+ Adjusting camera exposure in arbitrary lighting conditions is the first step
+to ensure the functionality of computer vision applications. Poorly adjusted
+camera exposure often leads to critical failure and performance degradation.
+Traditional camera exposure control methods require multiple convergence steps
+and time-consuming processes, making them unsuitable for dynamic lighting
+conditions. In this paper, we propose a new camera exposure control framework
+that rapidly controls camera exposure while performing real-time processing by
+exploiting deep reinforcement learning. The proposed framework consists of four
+contributions: 1) a simplified training ground to simulate the real world's
+diverse and dynamic lighting changes, 2) flickering and image attribute-aware
+reward design, along with a lightweight state design for real-time processing,
+3) a static-to-dynamic lighting curriculum to gradually improve the agent's
+exposure-adjusting capability, and 4) domain randomization techniques to
+alleviate the limitation of the training ground and achieve seamless
+generalization in the wild. As a result, our proposed method rapidly reaches a
+desired exposure level within five steps with real-time processing (1 ms).
+Also, the acquired images are well-exposed and show superiority in various
+computer vision tasks, such as feature extraction and object detection.
+
+
+
+ comment: Accepted at CVPR 2024, *First two authors contributed equally to this + work. Project page link: https://sites.google.com/view/drl-ae +
+
+
+
+
+ + ☆ Learning Equi-angular Representations for Online Continual Learning CVPR 2024 + + +
+ Online continual learning suffers from an underfitted solution due to
+insufficient training for prompt model updates (e.g., single-epoch training).
+To address the challenge, we propose an efficient online continual learning
+method using the neural collapse phenomenon. In particular, we induce neural
+collapse to form a simplex equiangular tight frame (ETF) structure in the
+representation space so that a model continuously trained with a single epoch
+can better fit the streamed data; to this end, we propose preparatory data
+training and residual correction in the representation space. With an extensive
+set of empirical validations using CIFAR-10/100, TinyImageNet, ImageNet-200,
+and ImageNet-1K, we show that our proposed method outperforms state-of-the-art
+methods by a noticeable margin in various online continual learning scenarios
+such as disjoint and Gaussian scheduled continuous (i.e., boundary-free) data
+setups.
+
+
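+
+ A simplex equiangular tight frame has a closed-form construction, sketched
+below as a fixed classifier head with unit-norm prototypes and pairwise cosine
+-1/(C-1); this is the standard ETF recipe used in neural-collapse-inspired
+methods rather than the authors' exact code.
+<pre>
+import numpy as np
+
+def simplex_etf(num_classes, feat_dim, seed=0):
+    """Return a (num_classes, feat_dim) matrix of fixed class prototypes
+    forming a simplex ETF: unit norms, pairwise cosine -1/(num_classes - 1)."""
+    assert feat_dim >= num_classes
+    rng = np.random.default_rng(seed)
+    # random orthonormal columns U: (feat_dim, C)
+    u, _ = np.linalg.qr(rng.standard_normal((feat_dim, num_classes)))
+    c = num_classes
+    m = np.sqrt(c / (c - 1)) * (np.eye(c) - np.ones((c, c)) / c)   # (C, C)
+    return (u @ m).T                                               # (C, feat_dim)
+
+W = simplex_etf(10, 512)
+cos = W @ W.T   # diagonal ~ 1, off-diagonal ~ -1/9 for 10 classes
+</pre>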
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ LR-FPN: Enhancing Remote Sensing Object Detection with Location Refined + Feature Pyramid Network + + +
+ Remote sensing target detection aims to identify and locate critical targets +within remote sensing images, finding extensive applications in agriculture and +urban planning. Feature pyramid networks (FPNs) are commonly used to extract +multi-scale features. However, existing FPNs often overlook extracting +low-level positional information and fine-grained context interaction. To +address this, we propose a novel location refined feature pyramid network +(LR-FPN) to enhance the extraction of shallow positional information and +facilitate fine-grained context interaction. The LR-FPN consists of two primary +modules: the shallow position information extraction module (SPIEM) and the +contextual interaction module (CIM). Specifically, SPIEM first maximizes the +retention of solid location information of the target by simultaneously +extracting positional and saliency information from the low-level feature map. +Subsequently, CIM injects this robust location information into different +layers of the original FPN through spatial and channel interaction, explicitly +enhancing the object area. Moreover, in spatial interaction, we introduce a +simple local and non-local interaction strategy to learn and retain the +saliency information of the object. Lastly, the LR-FPN can be readily +integrated into common object detection frameworks to improve performance +significantly. Extensive experiments on two large-scale remote sensing datasets +(i.e., DOTAV1.0 and HRSC2016) demonstrate that the proposed LR-FPN is superior +to state-of-the-art object detection approaches. Our code and models will be +publicly available. + +
+
+
+
+
+ + ☆ Spin-UP: Spin Light for Natural Light Uncalibrated Photometric Stereo CVPR2024 + + +
+ Natural Light Uncalibrated Photometric Stereo (NaUPS) relieves the strict +environment and light assumptions in classical Uncalibrated Photometric Stereo +(UPS) methods. However, due to the intrinsic ill-posedness and high-dimensional +ambiguities, addressing NaUPS is still an open question. Existing works impose +strong assumptions on the environment lights and objects' material, restricting +the effectiveness in more general scenarios. Alternatively, some methods +leverage supervised learning with intricate models while lacking +interpretability, resulting in a biased estimation. In this work, we proposed +Spin Light Uncalibrated Photometric Stereo (Spin-UP), an unsupervised method to +tackle NaUPS in various environment lights and objects. The proposed method +uses a novel setup that captures the object's images on a rotatable platform, +which mitigates NaUPS's ill-posedness by reducing unknowns and provides +reliable priors to alleviate NaUPS's ambiguities. Leveraging neural inverse +rendering and the proposed training strategies, Spin-UP recovers surface +normals, environment light, and isotropic reflectance under complex natural +light with low computational cost. Experiments have shown that Spin-UP +outperforms other supervised / unsupervised NaUPS methods and achieves +state-of-the-art performance on synthetic and real-world datasets. Codes and +data are available at https://github.com/LMozart/CVPR2024-SpinUP. + +
+
+ comment: Paper accepted by CVPR2024 +
+
+
+
+
+ + ☆ WaveDH: Wavelet Sub-bands Guided ConvNet for Efficient Image Dehazing + + +
+ The surge in interest regarding image dehazing has led to notable +advancements in deep learning-based single image dehazing approaches, +exhibiting impressive performance in recent studies. Despite these strides, +many existing methods fall short in meeting the efficiency demands of practical +applications. In this paper, we introduce WaveDH, a novel and compact ConvNet +designed to address this efficiency gap in image dehazing. Our WaveDH leverages +wavelet sub-bands for guided up-and-downsampling and frequency-aware feature +refinement. The key idea lies in utilizing wavelet decomposition to extract +low-and-high frequency components from feature levels, allowing for faster +processing while upholding high-quality reconstruction. The downsampling block +employs a novel squeeze-and-attention scheme to optimize the feature +downsampling process in a structurally compact manner through wavelet domain +learning, preserving discriminative features while discarding noise components. +In our upsampling block, we introduce a dual-upsample and fusion mechanism to +enhance high-frequency component awareness, aiding in the reconstruction of +high-frequency details. Departing from conventional dehazing methods that treat +low-and-high frequency components equally, our feature refinement block +strategically processes features with a frequency-aware approach. By employing +a coarse-to-fine methodology, it not only refines the details at frequency +levels but also significantly optimizes computational costs. The refinement is +performed in a maximum 8x downsampled feature space, striking a favorable +efficiency-vs-accuracy trade-off. Extensive experiments demonstrate that our +method, WaveDH, outperforms many state-of-the-art methods on several image +dehazing benchmarks with significantly reduced computational costs. Our code is +available at https://github.com/AwesomeHwang/WaveDH. + +
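+
+ The wavelet-guided up- and downsampling ingredient can be illustrated with a
+single-level Haar transform that splits a feature map into LL/LH/HL/HH
+sub-bands at half resolution and reconstructs it exactly on the way back up;
+this generic sketch stands in for, but does not reproduce, WaveDH's
+squeeze-and-attention and dual-upsample blocks.
+<pre>
+import torch
+
+def haar_dwt(x):
+    """x: (B, C, H, W) with even H, W. Returns (LL, LH, HL, HH), each half size.
+    LL carries the low-frequency content; the other bands keep edge details."""
+    a = x[..., 0::2, 0::2]
+    b = x[..., 0::2, 1::2]
+    c = x[..., 1::2, 0::2]
+    d = x[..., 1::2, 1::2]
+    ll = (a + b + c + d) / 2
+    lh = (a + b - c - d) / 2
+    hl = (a - b + c - d) / 2
+    hh = (a - b - c + d) / 2
+    return ll, lh, hl, hh
+
+def haar_idwt(ll, lh, hl, hh):
+    """Exact inverse of haar_dwt, usable on the upsampling path."""
+    a = (ll + lh + hl + hh) / 2
+    b = (ll + lh - hl - hh) / 2
+    c = (ll - lh + hl - hh) / 2
+    d = (ll - lh - hl + hh) / 2
+    x = torch.zeros(ll.shape[0], ll.shape[1], ll.shape[2] * 2, ll.shape[3] * 2,
+                    device=ll.device, dtype=ll.dtype)
+    x[..., 0::2, 0::2] = a
+    x[..., 0::2, 1::2] = b
+    x[..., 1::2, 0::2] = c
+    x[..., 1::2, 1::2] = d
+    return x
+</pre>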
+
+ comment: Submitted to TMM +
+
+
+
+
+ + ☆ Language Model Guided Interpretable Video Action Reasoning CVPR 2024 + + +
+ While neural networks have excelled in video action recognition tasks, their +black-box nature often obscures the understanding of their decision-making +processes. Recent approaches used inherently interpretable models to analyze +video actions in a manner akin to human reasoning. These models, however, +usually fall short in performance compared to their black-box counterparts. In +this work, we present a new framework named Language-guided Interpretable +Action Recognition framework (LaIAR). LaIAR leverages knowledge from language +models to enhance both the recognition capabilities and the interpretability of +video models. In essence, we redefine the problem of understanding video model +decisions as a task of aligning video and language models. Using the logical +reasoning captured by the language model, we steer the training of the video +model. This integrated approach not only improves the video model's +adaptability to different domains but also boosts its overall performance. +Extensive experiments on two complex video action datasets, Charades & CAD-120, +validates the improved performance and interpretability of our LaIAR framework. +The code of LaIAR is available at https://github.com/NingWang2049/LaIAR. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ TSCM: A Teacher-Student Model for Vision Place Recognition Using + Cross-Metric Knowledge Distillation + + +
+ Visual place recognition (VPR) plays a pivotal role in autonomous exploration +and navigation of mobile robots within complex outdoor environments. While +cost-effective and easily deployed, camera sensors are sensitive to lighting +and weather changes, and even slight image alterations can greatly affect VPR +efficiency and precision. Existing methods overcome this by exploiting powerful +yet large networks, leading to significant consumption of computational +resources. In this paper, we propose a high-performance teacher and lightweight +student distillation framework called TSCM. It exploits our devised +cross-metric knowledge distillation to narrow the performance gap between the +teacher and student models, maintaining superior performance while enabling +minimal computational load during deployment. We conduct comprehensive +evaluations on large-scale datasets, namely Pittsburgh30k and Pittsburgh250k. +Experimental results demonstrate the superiority of our method over baseline +models in terms of recognition accuracy and model parameter efficiency. +Moreover, our ablation studies show that the proposed knowledge distillation +technique surpasses other counterparts. The code of our method has been +released at https://github.com/nubot-nudt/TSCM. + +
+
+
+
+
+ + ☆ Learning Temporal Cues by Predicting Objects Move for Multi-camera 3D + Object Detection + + +
+ In autonomous driving and robotics, there is a growing interest in utilizing
+short-term historical data to enhance multi-camera 3D object detection,
+leveraging the continuous and correlated nature of input video streams. Recent
+work has focused on spatially aligning BEV-based features over timesteps.
+However, this is often limited, as the gain does not scale well with long-term
+past observations. To address this, we advocate for supervising a model to
+predict objects' poses given past observations, thus explicitly guiding it to
+learn objects' temporal cues. To this end, we propose a model called DAP
+(Detection After Prediction), consisting of a two-branch network: (i) a branch
+responsible for forecasting the current objects' poses given past observations
+and (ii) another branch that detects objects based on the current and past
+observations. The features predicting the current objects from branch (i) are
+fused into branch (ii) to transfer predictive knowledge. We conduct extensive
+experiments with the large-scale nuScenes dataset, and we observe that
+utilizing such predictive information significantly improves the overall
+detection performance. Our model can be used in a plug-and-play manner, showing
+consistent performance gains.
+
+
+
+
+
+
+ + ☆ Diffusion Deepfake + + +
+ Recent progress in generative AI, primarily through diffusion models,
+presents significant challenges for real-world deepfake detection. The
+increased realism in image details, diverse content, and widespread
+accessibility to the general public complicate the identification of these
+sophisticated deepfakes. Acknowledging the urgency of addressing the
+vulnerability of current deepfake detectors to this evolving threat, our paper
+introduces two extensive deepfake datasets generated by state-of-the-art
+diffusion models, as other datasets are less diverse and lower in quality. Our
+extensive experiments also show that our datasets are more challenging than
+other face deepfake datasets. Our strategic dataset creation not only
+challenges deepfake detectors but also sets a new benchmark for further
+evaluation. Our comprehensive evaluation reveals the struggle of existing
+detection methods, often optimized for specific image domains and
+manipulations, to effectively adapt to the intricate nature of diffusion
+deepfakes, limiting their practical utility. To address this critical issue, we
+investigate the impact of enhancing training data diversity on representative
+detection methods. This involves expanding the diversity of both manipulation
+techniques and image domains. Our findings underscore that increasing training
+data diversity results in improved generalizability. Moreover, we propose a
+novel momentum difficulty boosting strategy to tackle the additional challenge
+posed by training data heterogeneity. This strategy dynamically assigns
+appropriate sample weights based on learning difficulty, enhancing the model's
+adaptability to both easy and challenging samples. Extensive experiments on
+both existing and newly proposed benchmarks demonstrate that our model
+optimization approach surpasses prior alternatives significantly.
+
+
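+
+ The momentum difficulty boosting strategy is described only at a high level;
+a hedged sketch is to keep an exponential moving average of each sample's loss
+and convert it into normalized training weights so that harder samples are
+up-weighted gradually. The weighting and clamping rules below are assumptions,
+not the paper's exact formulation.
+<pre>
+import torch
+
+class MomentumDifficulty:
+    """Tracks a momentum-smoothed per-sample difficulty (its loss) and turns
+    it into per-sample training weights."""
+    def __init__(self, num_samples, momentum=0.9):
+        self.diff = torch.zeros(num_samples)
+        self.momentum = momentum
+
+    def update(self, indices, per_sample_loss):
+        d = per_sample_loss.detach().cpu()
+        self.diff[indices] = self.momentum * self.diff[indices] + (1 - self.momentum) * d
+
+    def weights(self, indices):
+        d = self.diff[indices]
+        w = d / (d.mean() + 1e-8)         # harder samples get weight > 1
+        return w.clamp(0.5, 2.0)          # keep weights in a reasonable range
+
+# usage inside a training step (loss computed with reduction='none'):
+#   tracker.update(batch_idx, loss_vec)
+#   weighted_loss = (tracker.weights(batch_idx).to(loss_vec.device) * loss_vec).mean()
+</pre>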
+
+ comment: 28 pages including Supplementary material +
+
+
+
+
+ + ☆ Leveraging Digital Perceptual Technologies for Remote Perception and + Analysis of Human Biomechanical Processes: A Contactless Approach for + Workload and Joint Force Assessment + + +
+ This study presents an innovative computer vision framework designed to +analyze human movements in industrial settings, aiming to enhance biomechanical +analysis by integrating seamlessly with existing software. Through a +combination of advanced imaging and modeling techniques, the framework allows +for comprehensive scrutiny of human motion, providing valuable insights into +kinematic patterns and kinetic data. Utilizing Convolutional Neural Networks +(CNNs), Direct Linear Transform (DLT), and Long Short-Term Memory (LSTM) +networks, the methodology accurately detects key body points, reconstructs 3D +landmarks, and generates detailed 3D body meshes. Extensive evaluations across +various movements validate the framework's effectiveness, demonstrating +comparable results to traditional marker-based models with minor differences in +joint angle estimations and precise estimations of weight and height. +Statistical analyses consistently support the framework's reliability, with +joint angle estimations showing less than a 5-degree difference for hip +flexion, elbow flexion, and knee angle methods. Additionally, weight estimation +exhibits an average error of less than 6 % for weight and less than 2 % for +height when compared to ground-truth values from 10 subjects. The integration +of the Biomech-57 landmark skeleton template further enhances the robustness +and reinforces the framework's credibility. This framework shows significant +promise for meticulous biomechanical analysis in industrial contexts, +eliminating the need for cumbersome markers and extending its utility to +diverse research domains, including the study of specific exoskeleton devices' +impact on facilitating the prompt return of injured workers to their tasks. + +
+
+
+
+
+ + ☆ Leveraging YOLO-World and GPT-4V LMMs for Zero-Shot Person Detection and + Action Recognition in Drone Imagery + + +
+ In this article, we explore the potential of zero-shot Large Multimodal +Models (LMMs) in the domain of drone perception. We focus on person detection +and action recognition tasks and evaluate two prominent LMMs, namely YOLO-World +and GPT-4V(ision) using a publicly available dataset captured from aerial +views. Traditional deep learning approaches rely heavily on large and +high-quality training datasets. However, in certain robotic settings, acquiring +such datasets can be resource-intensive or impractical within a reasonable +timeframe. The flexibility of prompt-based Large Multimodal Models (LMMs) and +their exceptional generalization capabilities have the potential to +revolutionize robotics applications in these scenarios. Our findings suggest +that YOLO-World demonstrates good detection performance. GPT-4V struggles with +accurately classifying action classes but delivers promising results in +filtering out unwanted region proposals and in providing a general description +of the scenery. This research represents an initial step in leveraging LMMs for +drone perception and establishes a foundation for future investigations in this +area. + +
+
+ comment: 4 pages +
+
+
+
+
+ + ☆ A Linear Time and Space Local Point Cloud Geometry Encoder via + Vectorized Kernel Mixture (VecKM) + + +
+ We propose VecKM, a novel local point cloud geometry encoder that is +descriptive, efficient and robust to noise. VecKM leverages a unique approach +by vectorizing a kernel mixture to represent the local point clouds. Such +representation is descriptive and robust to noise, which is supported by two +theorems that confirm its ability to reconstruct and preserve the similarity of +the local shape. Moreover, VecKM is the first successful attempt to reduce the +computation and memory costs from $O(n^2+nKd)$ to $O(nd)$ by sacrificing a +marginal constant factor, where $n$ is the size of the point cloud and $K$ is +neighborhood size. The efficiency is primarily due to VecKM's unique +factorizable property that eliminates the need of explicitly grouping points +into neighborhoods. In the normal estimation task, VecKM demonstrates not only +100x faster inference speed but also strongest descriptiveness and robustness +compared with existing popular encoders. In classification and segmentation +tasks, integrating VecKM as a preprocessing module achieves consistently better +performance than the PointNet, PointNet++, and point transformer baselines, and +runs consistently faster by up to 10x. + +
+
+
+
+
+ + ☆ Two-Phase Multi-Dose-Level PET Image Reconstruction with Dose Level + Awareness + + +
+ To obtain high-quality positron emission tomography (PET) while minimizing +radiation exposure, a range of methods have been designed to reconstruct +standard-dose PET (SPET) from corresponding low-dose PET (LPET) images. +However, most current methods merely learn the mapping between +single-dose-level LPET and SPET images, but omit the dose disparity of LPET +images in clinical scenarios. In this paper, to reconstruct high-quality SPET +images from multi-dose-level LPET images, we design a novel two-phase +multi-dose-level PET reconstruction algorithm with dose level awareness, +containing a pre-training phase and a SPET prediction phase. Specifically, the +pre-training phase is devised to explore both fine-grained discriminative +features and effective semantic representation. The SPET prediction phase +adopts a coarse prediction network utilizing pre-learned dose level prior to +generate preliminary result, and a refinement network to precisely preserve the +details. Experiments on MICCAI 2022 Ultra-low Dose PET Imaging Challenge +Dataset have demonstrated the superiority of our method. + +
+
+ comment: Accepted by ISBI2024 +
+
+
+
+
+ + ☆ mChartQA: A universal benchmark for multimodal Chart Question Answer + based on Vision-Language Alignment and Reasoning + + +
+ In the fields of computer vision and natural language processing, multimodal +chart question-answering, especially involving color, structure, and textless +charts, poses significant challenges. Traditional methods, which typically +involve either direct multimodal processing or a table-to-text conversion +followed by language model analysis, have limitations in effectively handling +these complex scenarios. This paper introduces a novel multimodal chart +question-answering model, specifically designed to address these intricate +tasks. Our model integrates visual and linguistic processing, overcoming the +constraints of existing methods. We adopt a dual-phase training approach: the +initial phase focuses on aligning image and text representations, while the +subsequent phase concentrates on optimizing the model's interpretative and +analytical abilities in chart-related queries. This approach has demonstrated +superior performance on multiple public datasets, particularly in handling +color, structure, and textless chart questions, indicating its effectiveness in +complex multimodal tasks. + +
+
+
+
+
+ + ☆ Bidirectional Multi-Scale Implicit Neural Representations for Image + Deraining + + +
+ How to effectively explore multi-scale representations of rain streaks is +important for image deraining. In contrast to existing Transformer-based +methods that depend mostly on single-scale rain appearance, we develop an +end-to-end multi-scale Transformer that leverages the potentially useful +features in various scales to facilitate high-quality image reconstruction. To +better explore the common degradation representations from spatially-varying +rain streaks, we incorporate intra-scale implicit neural representations based +on pixel coordinates with the degraded inputs in a closed-loop design, enabling +the learned features to facilitate rain removal and improve the robustness of +the model in complex scenarios. To ensure richer collaborative representation +from different scales, we embed a simple yet effective inter-scale +bidirectional feedback operation into our multi-scale Transformer by performing +coarse-to-fine and fine-to-coarse information communication. Extensive +experiments demonstrate that our approach, named as NeRD-Rain, performs +favorably against the state-of-the-art ones on both synthetic and real-world +benchmark datasets. The source code and trained models are available at +https://github.com/cschenxiang/NeRD-Rain. + +
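+
+ The intra-scale implicit neural representation conditions on pixel
+coordinates; a generic coordinate MLP with Fourier features (a common choice,
+assumed here) shows the ingredient. In a NeRD-Rain-like design its output would
+be fused with the degraded input features inside the multi-scale Transformer,
+which is not reproduced in this sketch.
+<pre>
+import torch
+import torch.nn as nn
+
+class CoordMLP(nn.Module):
+    """Implicit representation over normalized pixel coordinates in [-1, 1]^2."""
+    def __init__(self, n_freqs=6, hidden=64, out_dim=3):
+        super().__init__()
+        self.register_buffer("freqs", 2.0 ** torch.arange(n_freqs) * torch.pi)
+        in_dim = 2 * 2 * n_freqs                       # sin/cos for x and y
+        self.net = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU(),
+                                 nn.Linear(hidden, hidden), nn.ReLU(),
+                                 nn.Linear(hidden, out_dim))
+
+    def forward(self, coords):                         # coords: (N, 2)
+        x = coords[..., None] * self.freqs             # (N, 2, F)
+        enc = torch.cat([torch.sin(x), torch.cos(x)], dim=-1).flatten(-2)
+        return self.net(enc)                           # (N, out_dim)
+</pre>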
+
+ comment: Project website: https://github.com/cschenxiang/NeRD-Rain +
+
+
+
+
+ + ☆ Efficient 3D Implicit Head Avatar with Mesh-anchored Hash Table + Blendshapes CVPR2024 + + +
+ 3D head avatars built with neural implicit volumetric representations have +achieved unprecedented levels of photorealism. However, the computational cost +of these methods remains a significant barrier to their widespread adoption, +particularly in real-time applications such as virtual reality and +teleconferencing. While attempts have been made to develop fast neural +rendering approaches for static scenes, these methods cannot be simply employed +to support realistic facial expressions, such as in the case of a dynamic +facial performance. To address these challenges, we propose a novel fast 3D +neural implicit head avatar model that achieves real-time rendering while +maintaining fine-grained controllability and high rendering quality. Our key +idea lies in the introduction of local hash table blendshapes, which are +learned and attached to the vertices of an underlying face parametric model. +These per-vertex hash-tables are linearly merged with weights predicted via a +CNN, resulting in expression dependent embeddings. Our novel representation +enables efficient density and color predictions using a lightweight MLP, which +is further accelerated by a hierarchical nearest neighbor search method. +Extensive experiments show that our approach runs in real-time while achieving +comparable rendering quality to state-of-the-arts and decent results on +challenging expressions. + +
+
+ comment: In CVPR2024. Project page: + https://augmentedperception.github.io/monoavatar-plus +
+
+
+
+
+ + ☆ Semantic Augmentation in Images using Language + + +
+ Deep Learning models are incredibly data-hungry and require very large +labeled datasets for supervised learning. As a consequence, these models often +suffer from overfitting, limiting their ability to generalize to real-world +examples. Recent advancements in diffusion models have enabled the generation +of photorealistic images based on textual inputs. Leveraging the substantial +datasets used to train these diffusion models, we propose a technique to +utilize generated images to augment existing datasets. This paper explores +various strategies for effective data augmentation to improve the out-of-domain +generalization capabilities of deep learning models. + +
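+
+ A minimal sketch of generating augmentation images from class-name prompts
+with an off-the-shelf text-to-image pipeline; the diffusers library, model id,
+prompts, and sample counts are placeholders rather than the paper's setup, and
+a GPU is assumed.
+<pre>
+# pip install diffusers transformers accelerate torch
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
+).to("cuda")
+
+class_names = ["golden retriever", "tabby cat"]        # placeholder label set
+prompts = [f"a photo of a {c} in an unusual environment" for c in class_names]
+
+for label, prompt in zip(class_names, prompts):
+    for i in range(4):                                 # a few samples per class
+        image = pipe(prompt, num_inference_steps=30).images[0]
+        image.save(f"aug_{label.replace(' ', '_')}_{i}.png")
+</pre>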
+
+
+
+
+ + ☆ COVID-19 Detection Based on Blood Test Parameters using Various + Artificial Intelligence Methods + + +
+ In 2019, the world faced a new challenge: the COVID-19 disease caused by the
+novel coronavirus SARS-CoV-2. The virus rapidly spread across the globe,
+leading to a high rate of mortality, which prompted health organizations to
+take measures to control its transmission. Early disease detection is crucial
+in the treatment process, and computer-based automatic detection systems have
+been developed to aid in this effort. These systems often rely on artificial
+intelligence (AI) approaches such as machine learning, neural networks, fuzzy
+systems, and deep learning to classify diseases. This study aimed to
+differentiate COVID-19 patients from others using self-categorizing classifiers
+and various AI methods. This study used two datasets: blood test samples and
+radiography images. The best results for the blood test samples, obtained from
+San Raphael Hospital and comprising two classes of individuals, those with
+COVID-19 and those with non-COVID diseases, were achieved through the use of an
+ensemble method (a combination of a neural network and two machine learning
+methods). The results showed that this approach to COVID-19 diagnosis is
+cost-effective and provides results in a shorter amount of time than other
+methods. The proposed model achieved an accuracy of 94.09% on the dataset used.
+Second, the radiographic images were divided into four classes: normal, viral
+pneumonia, ground glass opacity, and COVID-19 infection. These were used for
+segmentation and classification. The lung lobes were extracted from the images
+and then categorized into specific classes. We achieved an accuracy of 91.1% on
+the image dataset. Overall, this study highlights the potential of AI in
+detecting and managing COVID-19 and underscores the importance of continued
+research and development in this field.
+
+
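+
+ A minimal scikit-learn sketch of an ensemble combining a neural network with
+two classical classifiers via soft voting on tabular blood-test features; the
+specific base models and hyper-parameters are placeholders, not the study's
+exact configuration.
+<pre>
+from sklearn.ensemble import RandomForestClassifier, VotingClassifier
+from sklearn.model_selection import cross_val_score
+from sklearn.neural_network import MLPClassifier
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.svm import SVC
+
+# X: blood-test features, y: COVID-19 vs non-COVID labels (loaded elsewhere)
+ensemble = VotingClassifier(
+    estimators=[
+        ("mlp", make_pipeline(StandardScaler(), MLPClassifier(max_iter=1000))),
+        ("rf", RandomForestClassifier(n_estimators=300)),
+        ("svm", make_pipeline(StandardScaler(), SVC(probability=True))),
+    ],
+    voting="soft",          # average predicted class probabilities
+)
+# scores = cross_val_score(ensemble, X, y, cv=5, scoring="accuracy")
+</pre>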
+
+
+
+
+ + ☆ GaitSTR: Gait Recognition with Sequential Two-stream Refinement + + +
+ Gait recognition aims to identify a person based on their walking sequences, +serving as a useful biometric modality as it can be observed from long +distances without requiring cooperation from the subject. In representing a +person's walking sequence, silhouettes and skeletons are the two primary +modalities used. Silhouette sequences lack detailed part information when +overlapping occurs between different body segments and are affected by carried +objects and clothing. Skeletons, comprising joints and bones connecting the +joints, provide more accurate part information for different segments; however, +they are sensitive to occlusions and low-quality images, causing +inconsistencies in frame-wise results within a sequence. In this paper, we +explore the use of a two-stream representation of skeletons for gait +recognition, alongside silhouettes. By fusing the combined data of silhouettes +and skeletons, we refine the two-stream skeletons, joints, and bones through +self-correction in graph convolution, along with cross-modal correction with +temporal consistency from silhouettes. We demonstrate that with refined +skeletons, the performance of the gait recognition model can achieve further +improvement on public gait recognition datasets compared with state-of-the-art +methods without extra annotations. + +
+
+
+
+
+ + ☆ Effective Malware Detection for Embedded Computing Systems with Limited + Exposure + + +
+ One of the pivotal security threats for embedded computing systems is +malicious software, a.k.a. malware. Owing to its efficiency and efficacy, Machine +Learning (ML) has been widely adopted for malware detection in recent times. +Despite being efficient, the existing techniques require a tremendous number of +benign and malware samples for training and modeling an efficient malware +detector. Furthermore, such constraints limit the detection of emerging malware +samples due to the lack of sufficient malware samples required for efficient +training. To address such concerns, we introduce a code-aware data generation +technique that generates multiple mutated samples of the malware seen only in +limited quantities by the devices. Loss minimization ensures that the generated samples closely +mimic the limitedly seen malware and suppresses impractical samples. The +generated malware is further incorporated into the training set to formulate +the model that can efficiently detect the emerging malware despite having +limited exposure. The experimental results demonstrate that the proposed +technique achieves an accuracy of 90% in detecting limitedly seen malware, +which is approximately 3x more than the accuracy attained by state-of-the-art +techniques. + +
+
+
+
+
+ + ☆ One Noise to Rule Them All: Multi-View Adversarial Attacks with + Universal Perturbation + + +
+ This paper presents a novel universal perturbation method for generating +robust multi-view adversarial examples in 3D object recognition. Unlike +conventional attacks limited to single views, our approach operates on multiple +2D images, offering a practical and scalable solution for evaluating and enhancing model +robustness. This generalizable method bridges the gap between +2D perturbations and 3D-like attack capabilities, making it suitable for +real-world applications. + Existing adversarial attacks may become ineffective when images undergo +transformations like changes in lighting, camera position, or natural +deformations. We address this challenge by crafting a single universal noise +perturbation applicable to various object views. Experiments on diverse +rendered 3D objects demonstrate the effectiveness of our approach. The method +successfully identifies a single adversarial noise for +each given set of 3D object renders from multiple poses and viewpoints. +Compared to single-view attacks, our universal attacks lower classification +confidence across multiple viewing angles, especially at low noise levels. A +sample implementation is made available at +https://github.com/memoatwit/UniversalPerturbation. + +
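+ A minimal sketch of optimizing one noise tensor that is shared across all renders of an object, assuming a pretrained PyTorch classifier and a PGD-style update; the epsilon budget, step size, and iteration count are illustrative assumptions rather than the paper's settings.
+
+ import torch
+ import torch.nn.functional as F
+
+ def universal_perturbation(model, views, true_class, eps=8/255, alpha=1/255, steps=100):
+     # views: (V, 3, H, W) renders of one object from V viewpoints, values in [0, 1]
+     delta = torch.zeros_like(views[:1], requires_grad=True)   # single noise shared by all views
+     targets = torch.full((views.size(0),), true_class, dtype=torch.long, device=views.device)
+     for _ in range(steps):
+         loss = F.cross_entropy(model((views + delta).clamp(0, 1)), targets)
+         loss.backward()
+         with torch.no_grad():
+             delta += alpha * delta.grad.sign()                # ascend the loss over all views jointly
+             delta.clamp_(-eps, eps)                           # keep the perturbation within budget
+         delta.grad.zero_()
+     return delta.detach()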
+
+ comment: 6 pages, 4 figures, presented at ICAIA, Springer to publish under + Algorithms for Intelligent Systems +
+
+
+
+
+ + ☆ LP++: A Surprisingly Strong Linear Probe for Few-Shot CLIP + + +
+ In a recent, strongly emergent literature on few-shot CLIP adaptation, Linear +Probe (LP) has been often reported as a weak baseline. This has motivated +intensive research building convoluted prompt learning or feature adaptation +strategies. In this work, we propose and examine from convex-optimization +perspectives a generalization of the standard LP baseline, in which the linear +classifier weights are learnable functions of the text embedding, with +class-wise multipliers blending image and text knowledge. As our objective +function depends on two types of variables, i.e., the class visual prototypes +and the learnable blending parameters, we propose a computationally efficient +block coordinate Majorize-Minimize (MM) descent algorithm. In our full-batch MM +optimizer, which we coin LP++, step sizes are implicit, unlike standard +gradient descent practices where learning rates are intensively searched over +validation sets. By examining the mathematical properties of our loss (e.g., +Lipschitz gradient continuity), we build majorizing functions yielding +data-driven learning rates and derive approximations of the loss's minima, +which provide data-informed initialization of the variables. Our image-language +objective function, along with these non-trivial optimization insights and +ingredients, yields, surprisingly, highly competitive few-shot CLIP +performances. Furthermore, LP++ operates in a black-box setting, relaxes intensive +validation searches for the optimization hyper-parameters, and runs +orders of magnitude faster than state-of-the-art few-shot CLIP adaptation +methods. Our code is available at: +\url{https://github.com/FereshteShakeri/FewShot-CLIP-Strong-Baseline.git}. + +
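+ A sketch of the "implicit step size" idea in isolation: for a softmax cross-entropy objective over frozen features, the gradient's Lipschitz constant can be bounded from the data, and a majorize-minimize update then reduces to gradient descent with step 1/L, with no learning-rate search. The specific curvature bound used below is a standard textbook bound taken as an assumption, not the paper's exact majorizer, and the blending with text embeddings is omitted.
+
+ import torch
+ import torch.nn.functional as F
+
+ def mm_step(W, Z, y):
+     # W: (C, d) class prototypes, Z: (N, d) frozen image features, y: (N,) labels
+     W = W.detach().requires_grad_(True)
+     loss = F.cross_entropy(Z @ W.T, y)
+     (grad,) = torch.autograd.grad(loss, W)
+     # Assumed curvature bound for softmax regression: L <= 0.5 * ||Z||_2^2 / N
+     lipschitz = 0.5 * torch.linalg.matrix_norm(Z, ord=2) ** 2 / Z.size(0)
+     return (W - grad / lipschitz).detach()   # implicit step size 1/L, no validation search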
+
+
+
+
+ + ☆ Smooth Deep Saliency + + +
+ In this work, we investigate methods to reduce the noise in deep saliency +maps coming from convolutional downsampling, with the purpose of explaining how +a deep learning model detects tumors in scanned histological tissue samples. +Those methods make the investigated models more interpretable for +gradient-based saliency maps, computed in hidden layers. We test our approach +on different models trained for image classification on ImageNet1K, and models +trained for tumor detection on Camelyon16 and in-house real-world digital +pathology scans of stained tissue samples. Our results show that the +checkerboard noise in the gradient gets reduced, resulting in smoother and +therefore easier to interpret saliency maps. + +
+
+
+
+
+ + ☆ OFMPNet: Deep End-to-End Model for Occupancy and Flow Prediction in + Urban Environment + + +
+ The task of motion prediction is pivotal for autonomous driving systems, +providing crucial data to choose a vehicle behavior strategy within its +surroundings. Existing motion prediction techniques primarily focus on +predicting the future trajectory of each agent in the scene individually, +utilizing its past trajectory data. In this paper, we introduce an end-to-end +neural network methodology designed to predict the future behaviors of all +dynamic objects in the environment. This approach leverages the occupancy map +and the scene's motion flow. We investigate various alternatives for +constructing a deep encoder-decoder model called OFMPNet. This model uses a +sequence of bird's-eye-view road images, occupancy grid, and prior motion flow +as input data. The encoder of the model can incorporate transformer, +attention-based, or convolutional units. The decoder considers the use of both +convolutional modules and recurrent blocks. Additionally, we propose a novel +time-weighted motion flow loss, whose application has shown a substantial +decrease in end-point error. Our approach has achieved state-of-the-art results +on the Waymo Occupancy and Flow Prediction benchmark, with a Soft IoU of 52.1% +and an AUC of 76.75% on Flow-Grounded Occupancy. + +
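+ A minimal sketch of a time-weighted flow loss of the kind mentioned above: per-timestep endpoint error, re-weighted across the prediction horizon. The abstract only states that timesteps are weighted, so the exponential-decay schedule below is an assumption for illustration.
+
+ import torch
+
+ def time_weighted_flow_loss(pred_flow, gt_flow, decay=0.9):
+     # pred_flow, gt_flow: (B, T, H, W, 2) predicted / ground-truth future motion flow
+     T = pred_flow.shape[1]
+     weights = decay ** torch.arange(T, dtype=pred_flow.dtype, device=pred_flow.device)
+     epe = torch.linalg.vector_norm(pred_flow - gt_flow, dim=-1)   # (B, T, H, W) endpoint error
+     per_timestep = epe.mean(dim=(0, 2, 3))                        # (T,) average error per horizon
+     return (weights * per_timestep).sum() / weights.sum()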
+
+ comment: Accepted in Neurocomputing journal - 2024 +
+
+
+
+
+ + ☆ SnAG: Scalable and Accurate Video Grounding CVPR 2024 + + +
+ Temporal grounding of text descriptions in videos is a central problem in +vision-language learning and video understanding. Existing methods often +prioritize accuracy over scalability -- they have been optimized for grounding +only a few text queries within short videos, and fail to scale up to long +videos with hundreds of queries. In this paper, we study the effect of +cross-modal fusion on the scalability of video grounding models. Our analysis +establishes late fusion as a more cost-effective fusion scheme for long-form +videos with many text queries. Moreover, it leads us to a novel, video-centric +sampling scheme for efficient training. Based on these findings, we present +SnAG, a simple baseline for scalable and accurate video grounding. Without +bells and whistles, SnAG is 43% more accurate and 1.5x faster than CONE, a +state of the art for long-form video grounding on the challenging MAD dataset, +while achieving highly competitive results on short videos. + +
+
+ comment: Accepted to CVPR 2024. Code available at + https://github.com/fmu2/snag_release +
+
+
+
+
+ + ☆ Towards Robust 3D Pose Transfer with Adversarial Learning CVPR 2024 + + +
+ 3D pose transfer that aims to transfer the desired pose to a target mesh is +one of the most challenging 3D generation tasks. Previous attempts rely on +well-defined parametric human models or skeletal joints as driving pose +sources. However, to obtain those clean pose sources, cumbersome but necessary +pre-processing pipelines are inevitable, hindering implementation in +real-time applications. This work is driven by the intuition that the +robustness of the model can be enhanced by introducing adversarial samples into +the training, yielding a model that is more robust to noisy inputs and can +even be further extended to directly handle real-world data like raw +point clouds/scans without intermediate processing. Furthermore, we propose a +novel 3D pose Masked Autoencoder (3D-PoseMAE), a customized MAE that +effectively learns 3D extrinsic representations (i.e., pose). 3D-PoseMAE +facilitates learning from the aspect of extrinsic attributes by simultaneously +generating adversarial samples that perturb the model and learning from +arbitrary raw noisy poses via a multi-scale masking strategy. Both qualitative +and quantitative studies show that the transferred meshes given by our network +result in much better quality. Besides, we demonstrate the strong +generalizability of our method on various poses, different domains, and even +raw scans. Experimental results also show meaningful insights that the +intermediate adversarial samples generated in the training can successfully +attack the existing pose transfer models. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Linear Combination of Saved Checkpoints Makes Consistency and Diffusion + Models Better + + +
+ Diffusion Models (DM) and Consistency Models (CM) are two types of popular +generative models with good generation quality on various tasks. When training +DM and CM, intermediate weight checkpoints are not fully utilized and only the +last converged checkpoint is used. In this work, we find that high-quality +model weights often lie in a basin which cannot be reached by SGD but can be +obtained by proper checkpoint averaging. Based on these observations, we +propose LCSC, a simple but effective and efficient method to enhance the +performance of DM and CM, by combining checkpoints along the training +trajectory with coefficients deduced from evolutionary search. We demonstrate +the value of LCSC through two use cases: $\textbf{(a) Reducing training cost.}$ +With LCSC, we only need to train DM/CM with fewer iterations and/or +smaller batch sizes to obtain comparable sample quality with the fully trained +model. For example, LCSC achieves considerable training speedups for CM +(23$\times$ on CIFAR-10 and 15$\times$ on ImageNet-64). $\textbf{(b) Enhancing +pre-trained models.}$ Assuming full training is already done, LCSC can further +improve the generation quality or speed of the final converged models. For +example, LCSC achieves better performance using a single function evaluation +(NFE) than the base model with 2 NFEs on consistency distillation, and decreases +the NFE of DM from 15 to 9 while maintaining the generation quality on +CIFAR-10. Our code is available at +https://github.com/imagination-research/LCSC. + +
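+ A minimal sketch of the checkpoint-combination step in PyTorch, assuming the checkpoints are plain state dicts; the evolutionary search that produces the coefficients is omitted, and the file names and coefficient values are hypothetical placeholders.
+
+ import torch
+
+ def combine_checkpoints(paths, coeffs):
+     # Linearly combine saved weights with search-derived coefficients.
+     states = [torch.load(p, map_location="cpu") for p in paths]
+     merged = {}
+     for key, ref in states[-1].items():
+         if torch.is_floating_point(ref):
+             merged[key] = sum(c * s[key] for c, s in zip(coeffs, states))
+         else:
+             merged[key] = ref   # e.g. integer buffers: keep the latest checkpoint's value
+     return merged              # load with model.load_state_dict(merged)
+
+ merged = combine_checkpoints(["ckpt_080k.pt", "ckpt_090k.pt", "ckpt_100k.pt"],
+                              coeffs=[0.2, 0.3, 0.5])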
+
+
+
+
+ + ☆ Visual Concept Connectome (VCC): Open World Concept Discovery and their + Interlayer Connections in Deep Models CVPR 2024 + + +
+ Understanding what deep network models capture in their learned +representations is a fundamental challenge in computer vision. We present a new +methodology for understanding such vision models, the Visual Concept Connectome +(VCC), which discovers human interpretable concepts and their interlayer +connections in a fully unsupervised manner. Our approach simultaneously reveals +fine-grained concepts at each layer and connection weightings across all layers, and +is amenable to global analysis of network structure (e.g., branching pattern +of hierarchical concept assemblies). Previous work yielded ways to extract +interpretable concepts from single layers and examine their impact on +classification, but did not afford multilayer concept analysis across an entire +network architecture. Quantitative and qualitative empirical results show the +effectiveness of VCCs in the domain of image classification. Also, we leverage +VCCs for the application of failure mode debugging to reveal where mistakes +arise in deep networks. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ OOSTraj: Out-of-Sight Trajectory Prediction With Vision-Positioning + Denoising CVPR + + +
+ Trajectory prediction is fundamental in computer vision and autonomous +driving, particularly for understanding pedestrian behavior and enabling +proactive decision-making. Existing approaches in this field often assume +precise and complete observational data, neglecting the challenges associated +with out-of-view objects and the noise inherent in sensor data due to limited +camera range, physical obstructions, and the absence of ground truth for +denoised sensor data. Such oversights are critical safety concerns, as they can +result in missing essential, non-visible objects. To bridge this gap, we +present a novel method for out-of-sight trajectory prediction that leverages a +vision-positioning technique. Our approach denoises noisy sensor observations +in an unsupervised manner and precisely maps sensor-based trajectories of +out-of-sight objects into visual trajectories. This method has demonstrated +state-of-the-art performance in out-of-sight noisy sensor trajectory denoising +and prediction on the Vi-Fi and JRDB datasets. By enhancing trajectory +prediction accuracy and addressing the challenges of out-of-sight objects, our +work significantly contributes to improving the safety and reliability of +autonomous driving in complex environments. Our work represents the first +initiative towards Out-Of-Sight Trajectory prediction (OOSTraj), setting a new +benchmark for future research. The code is available at +\url{https://github.com/Hai-chao-Zhang/OOSTraj}. + +
+
+ comment: In Proceedings of IEEE/CVF Conference on Computer Vision and Pattern + Recognition 2024 (CVPR) +
+
+
+
+
+ + ☆ CHOSEN: Contrastive Hypothesis Selection for Multi-View Depth Refinement + + +
+ We propose CHOSEN, a simple yet flexible, robust and effective multi-view +depth refinement framework. It can be employed in any existing multi-view +stereo pipeline, with straightforward generalization capability for different +multi-view capture systems such as camera relative positioning and lenses. +Given an initial depth estimation, CHOSEN iteratively re-samples and selects +the best hypotheses, and automatically adapts to different metric or intrinsic +scales determined by the capture system. The key to our approach is the +application of contrastive learning in an appropriate solution space and a +carefully designed hypothesis feature, based on which positive and negative +hypotheses can be effectively distinguished. Integrated in a simple baseline +multi-view stereo pipeline, CHOSEN delivers impressive quality in terms of +depth and normal accuracy compared to many current deep learning based +multi-view stereo pipelines. + +
+
+
+
+
+ + ☆ Insights from the Use of Previously Unseen Neural Architecture Search + Datasets + + +
+ The boundless possibility of neural networks which can be used to solve a +problem -- each with different performance -- leads to a situation where a Deep +Learning expert is required to identify the best neural network. This goes +against the hope of removing the need for experts. Neural Architecture Search +(NAS) offers a solution to this by automatically identifying the best +architecture. However, to date, NAS work has focused on a small set of datasets +which we argue are not representative of real-world problems. We introduce +eight new datasets created for a series of NAS Challenges: AddNIST, Language, +MultNIST, CIFARTile, Gutenberg, Isabella, GeoClassing, and Chesseract. These +datasets and challenges are developed to direct attention to issues in NAS +development and to encourage authors to consider how their models will perform +on datasets unknown to them at development time. We present experimentation +using standard Deep Learning methods as well as the best results from challenge +participants. + +
+
+
+
+
+ + ☆ NeRFCodec: Neural Feature Compression Meets Neural Radiance Fields for + Memory-Efficient Scene Representation CVPR2024 + + +
+ The emergence of Neural Radiance Fields (NeRF) has greatly impacted 3D scene +modeling and novel-view synthesis. As a kind of visual media for 3D scene +representation, compression with high rate-distortion performance is an eternal +target. Motivated by advances in neural compression and neural field +representation, we propose NeRFCodec, an end-to-end NeRF compression framework +that integrates non-linear transform, quantization, and entropy coding for +memory-efficient scene representation. Since training a non-linear transform +directly on a large scale of NeRF feature planes is impractical, we discover +that pre-trained neural 2D image codec can be utilized for compressing the +features when adding content-specific parameters. Specifically, we reuse neural +2D image codec but modify its encoder and decoder heads, while keeping the +other parts of the pre-trained decoder frozen. This allows us to train the full +pipeline via supervision of rendering loss and entropy loss, yielding the +rate-distortion balance by updating the content-specific parameters. At test +time, the bitstreams containing latent code, feature decoder head, and other +side information are transmitted for communication. Experimental results +demonstrate our method outperforms existing NeRF compression methods, enabling +high-quality novel view synthesis with a memory budget of 0.5 MB. + +
+
+ comment: Accepted at CVPR2024. The source code will be released +
+
+
+
+
+ + ☆ Real, fake and synthetic faces -- does the coin have three sides? + + +
+ With the ever-growing power of generative artificial intelligence, deepfake +and artificially generated (synthetic) media have continued to spread online, +which creates various ethical and moral concerns regarding their usage. To +tackle this, we present a novel exploration of the trends and patterns +observed in real, deepfake and synthetic facial images. The proposed analysis +is done in two parts: firstly, we incorporate eight deep learning models and +analyze their performance in distinguishing between the three classes of +images. Next, we look to further delve into the similarities and differences +between these three sets of images by investigating their image properties both +in the context of the entire image as well as in the context of specific +regions within the image. An ANOVA test was also performed and provided further +clarity on the patterns associated with the images of the three +classes. From our findings, we observe that the investigated deep learning +models found it easier to detect synthetic facial images, with the ViT Patch-16 +model performing best on this task with a class-averaged sensitivity, +specificity, precision, and accuracy of 97.37%, 98.69%, 97.48%, and 98.25%, +respectively. This observation was supported by further analysis of various +image properties. We saw noticeable differences across the three categories of +images. This analysis can help us build better algorithms for facial image +generation, and also shows that synthetic, deepfake and real face images are +indeed three different classes. + +
+
+
+
+
+ + ♻ ☆ Diffuse, Attend, and Segment: Unsupervised Zero-Shot Segmentation using + Stable Diffusion CVPR2024 + + +
+ Producing quality segmentation masks for images is a fundamental problem in +computer vision. Recent research has explored large-scale supervised training +to enable zero-shot segmentation on virtually any image style and unsupervised +training to enable segmentation without dense annotations. However, +constructing a model capable of segmenting anything in a zero-shot manner +without any annotations is still challenging. In this paper, we propose to +utilize the self-attention layers in stable diffusion models to achieve this +goal because the pre-trained stable diffusion model has learned inherent +concepts of objects within its attention layers. Specifically, we introduce a +simple yet effective iterative merging process based on measuring KL divergence +among attention maps to merge them into valid segmentation masks. The proposed +method does not require any training or language dependency to extract quality +segmentation for any images. On COCO-Stuff-27, our method surpasses the prior +unsupervised zero-shot SOTA method by an absolute 26% in pixel accuracy and 17% +in mean IoU. The project page is at +\url{https://sites.google.com/view/diffseg/home}. + +
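+ A small sketch of the attention-merging idea in isolation: treat each spatial self-attention map as a distribution over locations, measure pairwise symmetric KL divergence, and merge maps whose divergence is small. The greedy merge rule and the threshold value are assumptions for illustration; the paper's iterative procedure operating inside Stable Diffusion is more involved.
+
+ import torch
+
+ def symmetric_kl(p, q, eps=1e-8):
+     p = p.flatten() + eps
+     q = q.flatten() + eps
+     p, q = p / p.sum(), q / q.sum()
+     return 0.5 * ((p * (p / q).log()).sum() + (q * (q / p).log()).sum())
+
+ def merge_attention_maps(maps, tau=1.0):
+     # maps: list of (H, W) attention tensors; greedily merge mutually similar ones
+     merged = []
+     for m in maps:
+         for i, group in enumerate(merged):
+             if symmetric_kl(m, group) < tau:
+                 merged[i] = 0.5 * (group + m)   # running average as the merged proposal
+                 break
+         else:
+             merged.append(m.clone())
+     return merged                               # each entry acts as one segmentation proposal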
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ Rephrase, Augment, Reason: Visual Grounding of Questions for + Vision-Language Models ICLR 2024 + + +
+ An increasing number of vision-language tasks can be handled with little to +no training, i.e., in a zero and few-shot manner, by marrying large language +models (LLMs) to vision encoders, resulting in large vision-language models +(LVLMs). While this has huge upsides, such as not requiring training data or +custom architectures, how an input is presented to an LVLM can have a major +impact on zero-shot model performance. In particular, inputs phrased in an +underspecified way can result in incorrect answers due to factors like missing +visual information, complex implicit reasoning, or linguistic ambiguity. +Therefore, adding visually-grounded information to the input as a preemptive +clarification should improve model performance by reducing underspecification, +e.g., by localizing objects and disambiguating references. Similarly, in the +VQA setting, changing the way questions are framed can make them easier for +models to answer. To this end, we present Rephrase, Augment and Reason +(RepARe), a gradient-free framework that extracts salient details about the +image using the underlying LVLM as a captioner and reasoner, in order to +propose modifications to the original question. We then use the LVLM's +confidence over a generated answer as an unsupervised scoring function to +select the rephrased question most likely to improve zero-shot performance. +Focusing on three visual question answering tasks, we show that RepARe yields +absolute zero-shot accuracy gains of 3.85% on VQAv2, and of 6.41 and 7.94 +percentage points on A-OKVQA and VizWiz, respectively. Additionally, we +find that using gold answers for oracle question candidate selection achieves a +substantial gain in VQA accuracy of up to 14.41%. Through extensive analysis, +we demonstrate that outputs from RepARe increase syntactic complexity, and +effectively utilize vision-language interaction and the frozen LLM. + +
+
+ comment: ICLR 2024 camera-ready (23 pages), Code: + https://github.com/archiki/RepARe +
+
+
+
+
+ + ♻ ☆ pixelSplat: 3D Gaussian Splats from Image Pairs for Scalable + Generalizable 3D Reconstruction + + +
+ We introduce pixelSplat, a feed-forward model that learns to reconstruct 3D +radiance fields parameterized by 3D Gaussian primitives from pairs of images. +Our model features real-time and memory-efficient rendering for scalable +training as well as fast 3D reconstruction at inference time. To overcome local +minima inherent to sparse and locally supported representations, we predict a +dense probability distribution over 3D and sample Gaussian means from that +probability distribution. We make this sampling operation differentiable via a +reparameterization trick, allowing us to back-propagate gradients through the +Gaussian splatting representation. We benchmark our method on wide-baseline +novel view synthesis on the real-world RealEstate10k and ACID datasets, where +we outperform state-of-the-art light field transformers and accelerate +rendering by 2.5 orders of magnitude while reconstructing an interpretable and +editable 3D radiance field. + +
+
+ comment: Project page: https://dcharatan.github.io/pixelsplat +
+
+
+
+
+ + ♻ ☆ MedMamba: Vision Mamba for Medical Image Classification + + +
+ Medical image classification is a very fundamental and crucial task in the +field of computer vision. In recent years, CNN-based and Transformer-based models +have been widely used to classify various medical images. Unfortunately, the +limitation of CNNs in long-range modeling capabilities prevents them from +effectively extracting features in medical images, while Transformers are +hampered by their quadratic computational complexity. Recent research has shown +that the state space model (SSM) represented by Mamba can efficiently model +long-range interactions while maintaining linear computational complexity. +Inspired by this, we propose Vision Mamba for medical image classification +(MedMamba). More specifically, we introduce a novel Conv-SSM module. Conv-SSM +combines the local feature extraction ability of convolutional layers with the +ability of SSM to capture long-range dependency, thereby modeling medical +images with different modalities. To demonstrate the potential of MedMamba, we +conducted extensive experiments using 14 publicly available medical datasets +with different imaging techniques and two private datasets built by ourselves. +Extensive experimental results demonstrate that the proposed MedMamba performs +well in detecting lesions in various medical images. To the best of our +knowledge, this is the first Vision Mamba tailored for medical image +classification. The purpose of this work is to establish a new baseline for +medical image classification tasks and provide valuable insights for the future +development of more efficient and effective SSM-based artificial intelligence +algorithms and application systems in the medical domain. Source code is +available at https://github.com/YubiaoYue/MedMamba. + +
+
+
+
+
+ + ♻ ☆ GDA: Generalized Diffusion for Robust Test-time Adaptation + + +
+ Machine learning models struggle with generalization when encountering +out-of-distribution (OOD) samples with unexpected distribution shifts. For +vision tasks, recent studies have shown that test-time adaptation employing +diffusion models can achieve state-of-the-art accuracy improvements on OOD +samples by generating new samples that align with the model's domain without +the need to modify the model's weights. Unfortunately, those studies have +primarily focused on pixel-level corruptions, thereby lacking the +generalization to adapt to a broader range of OOD types. We introduce +Generalized Diffusion Adaptation (GDA), a novel diffusion-based test-time +adaptation method robust against diverse OOD types. Specifically, GDA +iteratively guides the diffusion by applying a marginal entropy loss derived +from the model, in conjunction with style and content preservation losses +during the reverse sampling process. In other words, GDA considers the model's +output behavior with the semantic information of the samples as a whole, which +can reduce ambiguity in downstream tasks during the generation process. +Evaluation across various popular model architectures and OOD benchmarks shows +that GDA consistently outperforms prior work on diffusion-driven adaptation. +Notably, it achieves the highest classification accuracy improvements, ranging +from 4.4\% to 5.02\% on ImageNet-C and 2.5\% to 7.4\% on Rendition, Sketch, and +Stylized benchmarks. This performance highlights GDA's generalization to a +broader range of OOD benchmarks. + +
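+ A minimal sketch of the marginal-entropy guidance term referenced above: average the classifier's predictive distribution over several augmented views of the partially denoised sample and penalize the entropy of that marginal. How this gradient is injected into the reverse diffusion sampling loop, and the style/content preservation terms, are omitted here and left as assumptions about the full method.
+
+ import torch
+ import torch.nn.functional as F
+
+ def marginal_entropy_loss(model, views):
+     # views: (K, 3, H, W) augmented copies of the current (partially denoised) sample
+     probs = F.softmax(model(views), dim=-1)        # (K, C) per-view predictive distributions
+     marginal = probs.mean(dim=0)                   # average distribution across views
+     return -(marginal * marginal.clamp_min(1e-12).log()).sum()   # entropy of the marginal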
+
+
+
+
+ + ♻ ☆ Learning CNN on ViT: A Hybrid Model to Explicitly Class-specific + Boundaries for Domain Adaptation + + +
+ Most domain adaptation (DA) methods are based on either convolutional +neural networks (CNNs) or vision transformers (ViTs). They align the +distribution differences between domains as encoders without considering their +unique characteristics. For instance, ViT excels in accuracy due to its +superior ability to capture global representations, while CNN has an advantage +in capturing local representations. This fact has led us to design a hybrid +method to fully take advantage of both ViT and CNN, called Explicitly +Class-specific Boundaries (ECB). ECB learns CNN on ViT to combine their +distinct strengths. In particular, we leverage ViT's properties to explicitly +find class-specific decision boundaries by maximizing the discrepancy between +the outputs of the two classifiers to detect target samples far from the source +support. In contrast, the CNN encoder clusters target features based on the +previously defined class-specific boundaries by minimizing the discrepancy +between the probabilities of the two classifiers. Finally, ViT and CNN mutually +exchange knowledge to improve the quality of pseudo labels and reduce the +knowledge discrepancies of these models. Compared to conventional DA methods, +our ECB achieves superior performance, which verifies its effectiveness in this +hybrid model. The project website can be found at +https://dotrannhattuong.github.io/ECB/website/. + +
+
+
+
+
+ + ♻ ☆ MIPS at SemEval-2024 Task 3: Multimodal Emotion-Cause Pair Extraction in + Conversations with Multimodal Language Models SemEval '24 + + +
+ This paper presents our winning submission to Subtask 2 of SemEval 2024 Task +3 on multimodal emotion cause analysis in conversations. We propose a novel +Multimodal Emotion Recognition and Multimodal Emotion Cause Extraction +(MER-MCE) framework that integrates text, audio, and visual modalities using +specialized emotion encoders. Our approach sets itself apart from +top-performing teams by leveraging modality-specific features for enhanced +emotion understanding and causality inference. Experimental evaluation +demonstrates the advantages of our multimodal approach, with our submission +achieving a competitive weighted F1 score of 0.3435, ranking third with a +margin of only 0.0339 behind the 1st team and 0.0025 behind the 2nd team. +Project: https://github.com/MIPS-COLT/MER-MCE.git + +
+
+ comment: Ranked 3rd in SemEval '24 Task 3 with F1 of 0.3435, close to 1st & + 2nd by 0.0339 & 0.0025 +
+
+
+
+
+ + ♻ ☆ Immature Green Apple Detection and Sizing in Commercial Orchards using + YOLOv8 and Shape Fitting Techniques + + +
+ Detecting and estimating size of apples during the early stages of growth is +crucial for predicting yield, pest management, and making informed decisions +related to crop-load management, harvest and post-harvest logistics, and +marketing. Traditional fruit size measurement methods are laborious and +time-consuming. This study employs the state-of-the-art YOLOv8 object detection +and instance segmentation algorithm in conjunction with geometric shape fitting +techniques on 3D point cloud data to accurately determine the size of immature +green apples (or fruitlet) in a commercial orchard environment. The methodology +utilized two RGB-D sensors: Intel RealSense D435i and Microsoft Azure Kinect +DK. Notably, the YOLOv8 instance segmentation models exhibited proficiency in +immature green apple detection, with the YOLOv8m-seg model achieving the +highest AP@0.5 and AP@0.75 scores of 0.94 and 0.91, respectively. Using the +ellipsoid fitting technique on images from the Azure Kinect, we achieved an +RMSE of 2.35 mm, MAE of 1.66 mm, MAPE of 6.15 mm, and an R-squared value of 0.9 +in estimating the size of apple fruitlets. Challenges such as partial occlusion +caused some error in accurately delineating and sizing green apples using the +YOLOv8-based segmentation technique, particularly in fruit clusters. In a +comparison with 102 outdoor samples, the size estimation technique performed +better on the images acquired with the Microsoft Azure Kinect than on those +acquired with the Intel RealSense D435i. This superiority is evident from the metrics: the RMSE +values (2.35 mm for Azure Kinect vs. 9.65 mm for RealSense D435i), MAE values +(1.66 mm for Azure Kinect vs. 7.8 mm for RealSense D435i), and the R-squared +values (0.9 for Azure Kinect vs. 0.77 for RealSense D435i). + +
+
+
+
+
+ + ♻ ☆ Semantically-Prompted Language Models Improve Visual Descriptions NAACL 2024 + + +
+ Language-vision models like CLIP have made significant strides in vision +tasks, such as zero-shot image classification (ZSIC). However, generating +specific and expressive visual descriptions remains challenging; descriptions +produced by current methods are often ambiguous and lacking in granularity. To +tackle these issues, we propose V-GLOSS: Visual Glosses, a novel method built +upon two key ideas. The first is Semantic Prompting, which conditions a +language model on structured semantic knowledge. The second is a new +contrastive algorithm that elicits fine-grained distinctions between similar +concepts. With both ideas, we demonstrate that V-GLOSS improves visual +descriptions and achieves strong results in the zero-shot setting on general +and fine-grained image-classification datasets, including ImageNet, STL-10, +FGVC Aircraft, and Flowers 102. Moreover, these descriptive capabilities +contribute to enhancing image-generation performance. Finally, we introduce a +quality-tested silver dataset with descriptions generated with V-GLOSS for all +ImageNet classes. + +
+
+ comment: To appear at NAACL 2024 +
+
+
+
+
+ + ♻ ☆ FISTNet: FusIon of STyle-path generative Networks for Facial Style + Transfer + + +
+ With the surge in emerging technologies such as Metaverse, spatial computing, +and generative AI, the application of facial style transfer has gained a lot of +interest from researchers and startup enthusiasts alike. StyleGAN +methods have paved the way for transfer-learning strategies that could reduce +the dependency on the huge volume of data that is available for the training +process. However, StyleGAN methods tend to overfit, which +results in the introduction of artifacts in the facial images. Studies such as +DualStyleGAN proposed the use of multipath networks, but they require the +networks to be trained for a specific style rather than generating a fusion of +facial styles at once. In this paper, we propose a FusIon of STyles (FIST) +network for facial images that leverages pre-trained multipath style transfer +networks to eliminate the problem associated with the lack of a huge data volume in +the training phase along with the fusion of multiple styles at the output. We +leverage pre-trained StyleGAN networks with an external style pass that uses a +residual modulation block instead of a transform coding block. The method also +preserves facial structure, identity, and details via the gated mapping unit +introduced in this study. The aforementioned components enable us to train the +network with a very limited amount of data while generating high-quality stylized +images. Our training process adopts a curriculum learning strategy to perform +efficient, flexible style and model fusion in the generative space. We perform +extensive experiments to show the superiority of FISTNet in comparison to +existing state-of-the-art methods. + +
+
+ comment: 21 pages, 6 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ MonoBox: Tightness-free Box-supervised Polyp Segmentation using + Monotonicity Constraint + + +
+ We propose MonoBox, an innovative box-supervised segmentation method +constrained by monotonicity to liberate its training from the user-unfriendly +box-tightness assumption. In contrast to conventional box-supervised +segmentation, where the box edges must precisely touch the target boundaries, +MonoBox leverages imprecisely-annotated boxes to achieve robust pixel-wise +segmentation. The 'linchpin' is that, within the noisy zones around box edges, +MonoBox discards the traditional misguiding multiple-instance learning loss, +and instead optimizes a carefully-designed objective, termed monotonicity +constraint. Along directions transitioning from the foreground to background, +this new constraint steers responses to adhere to a trend of monotonically +decreasing values. Consequently, the originally unreliable learning within the +noisy zones is transformed into a correct and effective monotonicity +optimization. Moreover, an adaptive label correction is introduced, enabling +MonoBox to enhance the tightness of box annotations using predicted masks from +the previous epoch and dynamically shrink the noisy zones as training +progresses. We verify MonoBox in the box-supervised segmentation task of +polyps, where satisfying box-tightness is challenging due to the vague +boundaries between the polyp and normal tissues. Experiments on both public +synthetic and in-house real noisy datasets demonstrate that MonoBox exceeds +other noise-tolerant state-of-the-art methods, improving Dice by at least 5.5% and 3.3%, +respectively. Codes are at https://github.com/Huster-Hq/MonoBox. + +
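+ A minimal sketch of a monotonicity-style penalty, assuming predicted foreground scores have already been sampled along rays that run from inside the box outward through the noisy zone; the ray construction, weighting, and adaptive label correction of the full method are omitted and the hinge-on-differences form below is an assumption for illustration.
+
+ import torch
+
+ def monotonicity_loss(scores_along_rays):
+     # scores_along_rays: (N, L) foreground probabilities ordered foreground -> background
+     diffs = scores_along_rays[:, 1:] - scores_along_rays[:, :-1]
+     return torch.relu(diffs).mean()   # penalize any increase toward the background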
+
+
+
+
+ + ♻ ☆ Joint Multimodal Transformer for Emotion Recognition in the Wild + + +
+ Systems for multimodal emotion recognition (MMER) can typically outperform +unimodal systems by leveraging the inter- and intra-modal relationships +between, e.g., visual, textual, physiological, and auditory modalities. In this +paper, an MMER method is proposed that relies on a joint multimodal transformer +for fusion with key-based cross-attention. This framework aims to exploit the +diverse and complementary nature of different modalities to improve predictive +accuracy. Separate backbones capture intra-modal spatiotemporal dependencies +within each modality over video sequences. Subsequently, a joint multimodal +transformer fusion architecture integrates the individual modality embeddings, +allowing the model to capture inter-modal and intra-modal relationships +effectively. Extensive experiments on two challenging expression recognition +tasks: (1) dimensional emotion recognition on the Affwild2 dataset (with face +and voice), and (2) pain estimation on the Biovid dataset (with face and +biosensors), indicate that the proposed method can work effectively with +different modalities. Empirical results show that MMER systems with our +proposed fusion method allow us to outperform relevant baseline and +state-of-the-art methods. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Enhancing wind field resolution in complex terrain through a + knowledge-driven machine learning approach + + +
+ Atmospheric flows are governed by a broad variety of spatio-temporal scales, +thus making real-time numerical modeling of such turbulent flows in complex +terrain at high resolution computationally intractable. In this study, we +demonstrate a neural network approach motivated by Enhanced Super-Resolution +Generative Adversarial Networks to upscale low-resolution wind fields to +generate high-resolution wind fields in an actual wind farm in Bessaker, +Norway. The neural network-based model is shown to successfully reconstruct +fully resolved 3D velocity fields from a coarser scale while respecting the +local terrain, and it easily outperforms trilinear interpolation. We also +demonstrate that by using an appropriate cost function based on domain knowledge, +we can reduce the need for adversarial training. + +
+
+
+
+
+ + ♻ ☆ A Simple Recipe for Language-guided Domain Generalized Segmentation CVPR 2024 + + +
+ Generalization to new domains not seen during training is one of the +long-standing challenges in deploying neural networks in real-world +applications. Existing generalization techniques either necessitate external +images for augmentation, and/or aim at learning invariant representations by +imposing various alignment constraints. Large-scale pretraining has recently +shown promising generalization capabilities, along with the potential of +binding different modalities. For instance, the advent of vision-language +models like CLIP has opened the doorway for vision models to exploit the +textual modality. In this paper, we introduce a simple framework for +generalizing semantic segmentation networks by employing language as the source +of randomization. Our recipe comprises three key ingredients: (i) the +preservation of the intrinsic CLIP robustness through minimal fine-tuning, (ii) +language-driven local style augmentation, and (iii) randomization by locally +mixing the source and augmented styles during training. Extensive experiments +report state-of-the-art results on various generalization benchmarks. Code is +accessible at https://github.com/astra-vision/FAMix . + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Open-Vocabulary Federated Learning with Multimodal Prototyping NAACL 2024 + + +
+ Existing federated learning (FL) studies usually assume the training label +space and test label space are identical. However, in real-world applications, +this assumption is too ideal to be true. A new user could come up with queries +that involve data from unseen classes, and such open-vocabulary queries would +directly defeat such FL systems. Therefore, in this work, we explicitly focus +on the under-explored open-vocabulary challenge in FL. That is, for a new user, +the global server shall understand her/his query that involves arbitrary +unknown classes. To address this problem, we leverage the pre-trained +vision-language models (VLMs). In particular, we present a novel adaptation +framework tailored for VLMs in the context of FL, named as Federated Multimodal +Prototyping (Fed-MP). Fed-MP adaptively aggregates the local model weights +based on light-weight client residuals, and makes predictions based on a novel +multimodal prototyping mechanism. Fed-MP exploits the knowledge learned from +the seen classes, and robustifies the adapted VLM to unseen categories. Our +empirical evaluation on various datasets validates the effectiveness of Fed-MP. + +
+
+ comment: Accepted at NAACL 2024 +
+
+
+
+
+ + ♻ ☆ Cross-modality debiasing: using language to mitigate sub-population + shifts in imaging + + +
+ Sub-population shift is a specific type of domain shift that highlights +changes in data distribution within specific sub-groups or populations between +training and testing. Sub-population shift accounts for a significant source of +algorithmic bias and calls for distributional robustness. Recent studies found +inherent distributional robustness in multi-modality foundation models, such as +the vision-language model CLIP, yet this robustness is vulnerable through +parameter fine-tuning. In this paper, we propose leveraging the connection of +robustness among different modalities and reshaping the distributional +robustness of one modality with another. Specifically, in the context of the +distributional robustness of CLIP, we propose to leverage natural language +inputs to debias the image feature representations, to improve worst-case +performance on sub-populations. Our extensive empirical studies show that image +representations debiased by natural language can achieve significant +performance improvement and reduction of performance instability under +sub-population shifts. + +
+
+
+
+
+ + ♻ ☆ VA3: Virtually Assured Amplification Attack on Probabilistic Copyright + Protection for Text-to-Image Generative Models CVPR 2024 + + +
+ The booming use of text-to-image generative models has raised concerns about +their high risk of producing copyright-infringing content. While probabilistic +copyright protection methods provide a probabilistic guarantee against such +infringement, in this paper, we introduce Virtually Assured Amplification +Attack (VA3), a novel online attack framework that exposes the vulnerabilities +of these protection mechanisms. The proposed framework significantly amplifies +the probability of generating infringing content through sustained interactions +with generative models, and provides a non-trivial lower bound on the success probability +of each engagement. Our theoretical and experimental results demonstrate the +effectiveness of our approach under various scenarios. These findings highlight +the potential risk of implementing probabilistic copyright protection in +practical applications of text-to-image generative models. Code is available at +https://github.com/South7X/VA3. + +
+
+ comment: 18 pages, 9 figures. Accept to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Dual-Activated Lightweight Attention ResNet50 for Automatic + Histopathology Breast Cancer Image Classification + + +
+ Automatic breast cancer classification in histopathology images is crucial +for precise diagnosis and treatment planning. Recently, classification +approaches based on the ResNet architecture have gained popularity for +significantly improving accuracy by using skip connections to mitigate +vanishing gradient problems, thereby integrating low-level and high-level +feature information. Nevertheless, the conventional ResNet architecture faces +challenges such as data imbalance and limited interpretability, necessitating +cross-domain knowledge and collaboration among medical experts. This study +effectively addresses these challenges by introducing a novel method for breast +cancer classification, the Dual-Activated Lightweight Attention ResNet50 +(DALAResNet50) model. It integrates a pre-trained ResNet50 model with a +lightweight attention mechanism, embedding an attention module in the fourth +layer of ResNet50 and incorporating two fully connected layers with LeakyReLU +and ReLU activation functions to enhance feature learning capabilities. The +DALAResNet50 method was tested on breast cancer histopathology images from the +BreakHis Database across magnification factors of 40X, 100X, 200X, and 400X, +achieving accuracies of 98.5%, 98.7%, 97.9%, and 94.3%, respectively. It was +also compared with established deep learning models such as SEResNet50, +DenseNet121, VGG16, VGG16Inception, ViT, Swin-Transformer, Dinov2_Vitb14, and +ResNet50. The reported results show that DALAResNet50 outperforms +the compared approaches regarding accuracy, F1 score, IBA, and GMean, +demonstrating significant robustness and broad applicability when dealing with +different magnifications and imbalanced breast cancer datasets. + +
+
+ comment: 13 pages, 7 figures,7 tables +
+
+
+
+
+ + ♻ ☆ PatchCURE: Improving Certifiable Robustness, Model Utility, and + Computation Efficiency of Adversarial Patch Defenses USENIX Security 2024 + + +
+ State-of-the-art defenses against adversarial patch attacks can now achieve +strong certifiable robustness with a marginal drop in model utility. However, +this impressive performance typically comes at the cost of 10-100x more +inference-time computation compared to undefended models -- the research +community has witnessed an intense three-way trade-off between certifiable +robustness, model utility, and computation efficiency. In this paper, we +propose a defense framework named PatchCURE to approach this trade-off problem. +PatchCURE provides sufficient "knobs" for tuning defense performance and allows +us to build a family of defenses: the most robust PatchCURE instance can match +the performance of any existing state-of-the-art defense (without efficiency +considerations); the most efficient PatchCURE instance has similar inference +efficiency as undefended models. Notably, PatchCURE achieves state-of-the-art +robustness and utility performance across all different efficiency levels, +e.g., 16-23% absolute clean accuracy and certified robust accuracy advantages +over prior defenses when requiring computation efficiency to be close to +undefended models. The family of PatchCURE defenses enables us to flexibly +choose appropriate defenses to satisfy given computation and/or utility +constraints in practice. + +
+
+ comment: USENIX Security 2024. (extended) technical report +
+
+
+
+
+ + ♻ ☆ Deep Multi-Threshold Spiking-UNet for Image Processing + + +
+ U-Net, known for its simple yet efficient architecture, is widely utilized +for image processing tasks and is particularly suitable for deployment on +neuromorphic chips. This paper introduces the novel concept of Spiking-UNet for +image processing, which combines the power of Spiking Neural Networks (SNNs) +with the U-Net architecture. To achieve an efficient Spiking-UNet, we face two +primary challenges: ensuring high-fidelity information propagation through the +network via spikes and formulating an effective training strategy. To address +the issue of information loss, we introduce multi-threshold spiking neurons, +which improve the efficiency of information transmission within the +Spiking-UNet. For the training strategy, we adopt a conversion and fine-tuning +pipeline that leverages pre-trained U-Net models. During the conversion process, +significant variability in data distribution across different parts is observed +when utilizing skip connections. Therefore, we propose a connection-wise +normalization method to prevent inaccurate firing rates. Furthermore, we adopt +a flow-based training method to fine-tune the converted models, reducing time +steps while preserving performance. Experimental results show that, on image +segmentation and denoising, our Spiking-UNet achieves comparable performance to +its non-spiking counterpart, surpassing existing SNN methods. Compared with the +converted Spiking-UNet without fine-tuning, our Spiking-UNet reduces inference +time by approximately 90\%. This research broadens the application scope of +SNNs in image processing and is expected to inspire further exploration in the +field of neuromorphic engineering. The code for our Spiking-UNet implementation +is available at https://github.com/SNNresearch/Spiking-UNet. + +
+
+ comment: Accepted in NeuroComputing +
+
+
+
+
+ + ♻ ☆ Diverse Representation Embedding for Lifelong Person Re-Identification + + +
+ Lifelong Person Re-Identification (LReID) aims to continuously learn from +successive data streams, matching individuals across multiple cameras. The key +challenge for LReID is how to effectively preserve old knowledge while +incrementally learning new information, which is caused by task-level domain +gaps and limited old task datasets. Existing methods based on CNN backbone are +insufficient to explore the representation of each instance from different +perspectives, limiting model performance on limited old task datasets and new +task datasets. Unlike these methods, we propose a Diverse Representations +Embedding (DRE) framework that first explores a pure transformer for LReID. The +proposed DRE preserves old knowledge while adapting to new information based on +instance-level and task-level layout. Concretely, an Adaptive Constraint Module +(ACM) is proposed to implement integration and push away operations between +multiple overlapping representations generated by transformer-based backbone, +obtaining rich and discriminative representations for each instance to improve +adaptive ability of LReID. Based on the processed diverse representations, we +propose Knowledge Update (KU) and Knowledge Preservation (KP) strategies at the +task-level layout by introducing the adjustment model and the learner model. KU +strategy enhances the adaptive learning ability of learner models for new +information under the adjustment model prior, and KP strategy preserves old +knowledge operated by representation-level alignment and logit-level +supervision in limited old task datasets while guaranteeing the adaptive +learning information capacity of the LReID model. Compared to state-of-the-art +methods, our method achieves significantly improved performance in holistic, +large-scale, and occluded datasets. + +
+
+ comment: 11 pages,7 Tables,3 Figures +
+
+
+
+
+ + ♻ ☆ SVGDreamer: Text Guided SVG Generation with Diffusion Model CVPR 2024 + + +
+ Recently, text-guided scalable vector graphics (SVGs) synthesis has shown +promise in domains such as iconography and sketch. However, existing +text-to-SVG generation methods lack editability and struggle with visual +quality and result diversity. To address these limitations, we propose a novel +text-guided vector graphics synthesis method called SVGDreamer. SVGDreamer +incorporates a semantic-driven image vectorization (SIVE) process that enables +the decomposition of synthesis into foreground objects and background, thereby +enhancing editability. Specifically, the SIVE process introduces +attention-based primitive control and an attention-mask loss function for +effective control and manipulation of individual elements. Additionally, we +propose a Vectorized Particle-based Score Distillation (VPSD) approach to +address issues of shape over-smoothing, color over-saturation, limited +diversity, and slow convergence of the existing text-to-SVG generation methods +by modeling SVGs as distributions of control points and colors. Furthermore, +VPSD leverages a reward model to re-weight vector particles, which improves +aesthetic appeal and accelerates convergence. Extensive experiments are +conducted to validate the effectiveness of SVGDreamer, demonstrating its +superiority over baseline methods in terms of editability, visual quality, and +diversity. Project page: +\href{https://ximinng.github.io/SVGDreamer-project/}{https://ximinng.github.io/SVGDreamer-project/} + +
+
+ comment: Accepted by CVPR 2024. project link: + https://ximinng.github.io/SVGDreamer-project/ +
+
+
+
+
+ + ♻ ☆ Neural Implicit Representations for Physical Parameter Inference from a + Single Video WACV + + +
+ Neural networks have recently been used to analyze diverse physical systems +and to identify the underlying dynamics. While existing methods achieve +impressive results, they are limited by their strong demand for training data +and their weak generalization abilities to out-of-distribution data. To +overcome these limitations, in this work we propose to combine neural implicit +representations for appearance modeling with neural ordinary differential +equations (ODEs) for modelling physical phenomena to obtain a dynamic scene +representation that can be identified directly from visual observations. Our +proposed model combines several unique advantages: (i) Contrary to existing +approaches that require large training datasets, we are able to identify +physical parameters from only a single video. (ii) The use of neural implicit +representations enables the processing of high-resolution videos and the +synthesis of photo-realistic images. (iii) The embedded neural ODE has a known +parametric form that allows for the identification of interpretable physical +parameters, and (iv) long-term prediction in state space. (v) Furthermore, the +photo-realistic rendering of novel scenes with modified physical parameters +becomes possible. + +
+
+ comment: Published in IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2023 +
+
+
+
+
+ + ♻ ☆ FusionINN: Invertible Image Fusion for Brain Tumor Monitoring + + +
+ Image fusion typically employs non-invertible neural networks to merge +multiple source images into a single fused image. However, for clinical +experts, solely relying on fused images may be insufficient for making +diagnostic decisions, as the fusion mechanism blends features from source +images, thereby making it difficult to interpret the underlying tumor +pathology. We introduce FusionINN, a novel invertible image fusion framework, +capable of efficiently generating fused images and also decomposing them back +to the source images by solving the inverse of the fusion process. FusionINN +guarantees lossless one-to-one pixel mapping by integrating a normally +distributed latent image alongside the fused image to facilitate the generative +modeling of the decomposition process. To the best of our knowledge, we are the +first to investigate the decomposability of fused images, which is particularly +crucial for life-sensitive applications such as medical image fusion compared +to other tasks like multi-focus or multi-exposure image fusion. Our extensive +experimentation validates FusionINN over existing discriminative and generative +fusion methods, both subjectively and objectively. Moreover, compared to a +recent denoising diffusion-based fusion model, our approach offers faster and +qualitatively better fusion results. We also exhibit the clinical utility of +our results in aiding disease prognosis. + +
+
+ comment: Source code available at https://github.com/nish03/FusionINN +
+
+
+
+
+ + ♻ ☆ DRCT: Saving Image Super-resolution away from Information Bottleneck + + +
+ In recent years, Vision Transformer-based applications to low-level vision +tasks have achieved widespread success. Unlike CNN-based models, Transformers +are more adept at capturing long-range dependencies, enabling the +reconstruction of images utilizing information from non-local areas. In the +domain of super-resolution, Swin-transformer-based approaches have become +mainstream due to their capacity to capture global spatial information and +their shifting-window attention mechanism that facilitates the interchange of +information between different windows. Many researchers have enhanced image +quality and network efficiency by expanding the receptive field or designing +complex networks, yielding commendable results. However, we observed that +spatial information tends to diminish during the forward propagation process +due to increased depth, leading to a loss of spatial information and, +consequently, limiting the model's potential. To address this, we propose the +Dense-residual-connected Transformer (DRCT), aimed at mitigating the loss of +spatial information through dense-residual connections between layers, thereby +unleashing the model's potential and enhancing performance. Experiment results +indicate that our approach is not only straightforward but also achieves +remarkable efficiency, surpassing state-of-the-art methods and performing +commendably at NTIRE2024. + +
+
+ comment: Submitted to NTIRE 2024 +
+
+
+
+
+ + ♻ ☆ Direct Preference Optimization of Video Large Multimodal Models from + Language Model Reward + + +
+ Preference modeling techniques, such as direct preference optimization (DPO), +have proven effective in enhancing the generalization abilities of large language +models (LLMs). However, in tasks involving video instruction-following, providing +informative feedback, especially for detecting hallucinations in generated +responses, remains a significant challenge. Previous studies have explored +using large multimodal models (LMMs) as reward models to guide preference +modeling, but their ability to accurately assess the factuality of generated +responses compared to corresponding videos has not been conclusively +established. This paper introduces a novel framework that utilizes detailed +video captions as a proxy of video content, enabling language models to +incorporate this information as supporting evidence for scoring video Question +Answering (QA) predictions. Our approach demonstrates robust alignment with +the OpenAI GPT-4V model's reward mechanism, which directly takes video frames as +input. Furthermore, we show that applying this tailored reward through DPO +significantly improves the performance of video LMMs on video QA tasks. + +
+
+
+
+
+ + ♻ ☆ CMRNext: Camera to LiDAR Matching in the Wild for Localization and + Extrinsic Calibration + + +
+ LiDARs are widely used for mapping and localization in dynamic environments. +However, their high cost limits their widespread adoption. On the other hand, +monocular localization in LiDAR maps using inexpensive cameras is a +cost-effective alternative for large-scale deployment. Nevertheless, most +existing approaches struggle to generalize to new sensor setups and +environments, requiring retraining or fine-tuning. In this paper, we present +CMRNext, a novel approach for camera-LIDAR matching that is independent of +sensor-specific parameters, generalizable, and can be used in the wild for +monocular localization in LiDAR maps and camera-LiDAR extrinsic calibration. +CMRNext exploits recent advances in deep neural networks for matching +cross-modal data and standard geometric techniques for robust pose estimation. +We reformulate the point-pixel matching problem as an optical flow estimation +problem and solve the Perspective-n-Point problem based on the resulting +correspondences to find the relative pose between the camera and the LiDAR +point cloud. We extensively evaluate CMRNext on six different robotic +platforms, including three publicly available datasets and three in-house +robots. Our experimental evaluations demonstrate that CMRNext outperforms +existing approaches on both tasks and effectively generalizes to previously +unseen environments and sensor setups in a zero-shot manner. We make the code +and pre-trained models publicly available at http://cmrnext.cs.uni-freiburg.de . + +
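As a rough illustration of the geometric back end described above (point-pixel correspondences followed by Perspective-n-Point), the following OpenCV sketch recovers a camera pose from matches; the function name, thresholds, and solver choice are illustrative assumptions, not CMRNext's actual API.

import numpy as np
import cv2

def pose_from_matches(pts_3d, pts_2d, K):
    """pts_3d: (N, 3) LiDAR points, pts_2d: (N, 2) matched pixels, K: 3x3 camera intrinsics."""
    ok, rvec, tvec, inliers = cv2.solvePnPRansac(
        pts_3d.astype(np.float64), pts_2d.astype(np.float64), K, None,
        reprojectionError=2.0, flags=cv2.SOLVEPNP_EPNP)
    if not ok:
        raise RuntimeError("PnP did not converge")
    R, _ = cv2.Rodrigues(rvec)   # rotation from the LiDAR/map frame to the camera frame
    return R, tvec.reshape(3), inliers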
+
+
+
+
+ + ♻ ☆ HOI-M3: Capture Multiple Humans and Objects Interaction within Contextual + Environment CVPR 2024 + + +
+ Humans naturally interact with both other people and multiple surrounding +objects, engaging in various social activities. However, recent advances in +modeling human-object interactions mostly focus on perceiving isolated +individuals and objects, due to fundamental data scarcity. In this paper, we +introduce HOI-M3, a novel large-scale dataset for modeling the interactions of +Multiple huMans and Multiple objects. Notably, it provides accurate 3D tracking +for both humans and objects from dense RGB and object-mounted IMU inputs, +covering 199 sequences and 181M frames of diverse humans and objects under rich +activities. With the unique HOI-M3 dataset, we introduce two novel data-driven +tasks with strong companion baselines: monocular capture and unstructured +generation of multiple human-object interactions. Extensive experiments +demonstrate that our dataset is challenging and worthy of further research +about multiple human-object interactions and behavior analysis. Our HOI-M3 +dataset, corresponding codes, and pre-trained models will be disseminated to +the community for future research. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Few-shot point cloud reconstruction and denoising via learned Gaussian + splats renderings and fine-tuned diffusion features + + +
+ Existing deep learning methods for the reconstruction and denoising of point +clouds rely on small datasets of 3D shapes. We circumvent the problem by +leveraging deep learning methods trained on billions of images. We propose a +method to reconstruct point clouds from few images and to denoise point clouds +from their rendering by exploiting prior knowledge distilled from image-based +deep learning models. To improve reconstruction in constrained settings, we +regularize the training of a differentiable renderer with hybrid surface and +appearance by introducing semantic consistency supervision. In addition, we +propose a pipeline to finetune Stable Diffusion to denoise renderings of noisy +point clouds and we demonstrate how these learned filters can be used to remove +point cloud noise without 3D supervision. We compare our method with DSS +and PointRadiance and achieve higher-quality 3D reconstruction on the +Sketchfab Testset and SCUT Dataset. + +
+
+
+
+
+ + ♻ ☆ Recursive Joint Cross-Modal Attention for Multimodal Fusion in + Dimensional Emotion Recognition + + +
+ Though multimodal emotion recognition has achieved significant progress over +recent years, the potential of rich synergic relationships across the +modalities is not fully exploited. In this paper, we introduce Recursive Joint +Cross-Modal Attention (RJCMA) to effectively capture both intra- and inter-modal +relationships across audio, visual and text modalities for dimensional emotion +recognition. In particular, we compute the attention weights based on +cross-correlation between the joint audio-visual-text feature representations +and the feature representations of individual modalities to simultaneously +capture intra- and inter-modal relationships across the modalities. The +attended features of the individual modalities are again fed as input to the +fusion model in a recursive mechanism to obtain more refined feature +representations. We have also explored Temporal Convolutional Networks (TCNs) +to improve the temporal modeling of the feature representations of individual +modalities. Extensive experiments are conducted to evaluate the performance of +the proposed fusion model on the challenging Affwild2 dataset. By effectively +capturing the synergic intra- and inter-modal relationships across audio, +visual and text modalities, the proposed fusion model achieves a Concordance +Correlation Coefficient (CCC) of 0.585 (0.542) and 0.659 (0.619) for valence +and arousal respectively on the validation set (test set). This shows a +significant improvement over the baseline of 0.24 (0.211) and 0.20 (0.191) for +valence and arousal respectively on the validation set (test set) of the +valence-arousal challenge of the 6th Affective Behavior Analysis in-the-Wild (ABAW) +competition. + +
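A minimal sketch of one recursion step of joint cross-modal attention as described in this abstract; the projection of the concatenated joint representation, the scaled-dot-product form, and the feature dimensions are assumptions for illustration, not the RJCMA implementation.

import torch
import torch.nn as nn

class JointCrossModalAttention(nn.Module):
    def __init__(self, dim=128):
        super().__init__()
        self.joint_proj = nn.Linear(3 * dim, dim)   # project the concatenated A-V-T features

    def forward(self, audio, visual, text):         # each: (B, T, dim)
        joint = self.joint_proj(torch.cat([audio, visual, text], dim=-1))
        outs = []
        for feat in (audio, visual, text):
            corr = joint @ feat.transpose(1, 2)                          # (B, T, T) cross-correlation
            attn = torch.softmax(corr / feat.shape[-1] ** 0.5, dim=-1)
            outs.append(attn @ feat + feat)                              # attended features + residual
        return outs

# Recursive refinement: feed the attended features back into the same module, e.g.
# a, v, t = block(a, v, t); a, v, t = block(a, v, t)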
+
+
+
+
+ + ♻ ☆ Learning to Generate Conditional Tri-plane for 3D-aware Expression + Controllable Portrait Animation + + +
+ In this paper, we present Export3D, a one-shot 3D-aware portrait animation +method that is able to control the facial expression and camera view of a given +portrait image. To achieve this, we introduce a tri-plane generator that +directly generates a tri-plane of 3D prior by transferring the expression +parameter of 3DMM into the source image. The tri-plane is then decoded into +images of different views through differentiable volume rendering. Existing +portrait animation methods heavily rely on image warping to transfer the +expression in the motion space, which makes disentangling appearance and +expression challenging. In contrast, we propose a contrastive pre-training framework +for appearance-free expression parameters, eliminating undesirable appearance +swap when transferring a cross-identity expression. Extensive experiments show +that our pre-training framework can learn the appearance-free expression +representation hidden in 3DMM, and our model can generate 3D-aware, +expression-controllable portrait images without appearance swap in a cross-identity +manner. + +
+
+ comment: Project page: https://export3d.github.io +
+
+
+
+
+ + ♻ ☆ Corrupting Convolution-based Unlearnable Datasets with Pixel-based Image + Transformations + + +
+ Unlearnable datasets (UDs) lead to a drastic drop in the generalization performance +of models trained on them by introducing elaborate and imperceptible +perturbations into clean training sets. Many existing defenses, e.g., JPEG +compression and adversarial training, effectively counter UDs based on +norm-constrained additive noise. However, a new type of convolution-based +UD has recently been proposed that renders existing defenses ineffective, presenting +a greater challenge to defenders. To address this, we express the +convolution-based unlearnable sample as the result of multiplying a matrix by a +clean sample in a simplified scenario, and formalize the intra-class matrix +inconsistency as $\Theta_{imi}$ and the inter-class matrix consistency as +$\Theta_{imc}$ to investigate the working mechanism of the convolution-based +UDs. We conjecture that increasing both of these metrics will mitigate the +unlearnability effect. Through validation experiments that commendably support +our hypothesis, we further design a random matrix to boost both $\Theta_{imi}$ +and $\Theta_{imc}$, achieving a notable degree of defense effect. Hence, by +building upon and extending these facts, we first propose a new image +COrruption that employs a random multiplicative transformation via an +INterpolation operation to successfully defend against convolution-based UDs. +Our approach leverages global pixel random interpolations, effectively +suppressing the impact of multiplicative noise in convolution-based UDs. +Additionally, we have also designed two new forms of convolution-based UDs, and +find that our defense is the most effective against them. + +
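The flavor of a pixel-interpolation-style corruption can be sketched as a random resampling of the image grid; this is a generic stand-in under assumed jitter magnitudes, not the paper's exact interpolation operation.

import torch
import torch.nn.functional as F

def random_interpolation_corruption(img, jitter=0.03):
    """img: (B, C, H, W) in [0, 1]. Resample each image on a randomly perturbed grid."""
    B, C, H, W = img.shape
    ys = torch.linspace(-1, 1, H).view(1, H, 1).expand(B, H, W)
    xs = torch.linspace(-1, 1, W).view(1, 1, W).expand(B, H, W)
    grid = torch.stack([xs, ys], dim=-1)                     # (B, H, W, 2) identity grid
    grid = grid + jitter * (2 * torch.rand(B, H, W, 2) - 1)  # random per-pixel displacement
    return F.grid_sample(img, grid, mode='bilinear', align_corners=True)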
+
+
+
+
+ + ♻ ☆ VidEdit: Zero-Shot and Spatially Aware Text-Driven Video Editing + + +
+ Recently, diffusion-based generative models have achieved remarkable success +for image generation and editing. However, existing diffusion-based video +editing approaches lack the ability to offer precise control over generated +content that maintains temporal consistency in long-term videos. On the other +hand, atlas-based methods provide strong temporal consistency, but editing a +video with them is costly and they lack spatial control. In this work, we introduce VidEdit, a +novel method for zero-shot text-based video editing that guarantees robust +temporal and spatial consistency. In particular, we combine an atlas-based +video representation with a pre-trained text-to-image diffusion model to +provide a training-free and efficient video editing method, which by design +fulfills temporal smoothness. To grant precise user control over generated +content, we utilize conditional information extracted from off-the-shelf +panoptic segmenters and edge detectors which guides the diffusion sampling +process. This method ensures a fine spatial control on targeted regions while +strictly preserving the structure of the original video. Our quantitative and +qualitative experiments show that VidEdit outperforms state-of-the-art methods +on DAVIS dataset, regarding semantic faithfulness, image preservation, and +temporal consistency metrics. With this framework, processing a single video +only takes approximately one minute, and it can generate multiple compatible +edits based on a unique text prompt. Project web-page at +https://videdit.github.io + +
+
+ comment: TMLR 2024. Project web-page at https://videdit.github.io +
+
+
+
+
+ + ♻ ☆ BRAIxDet: Learning to Detect Malignant Breast Lesion with Incomplete + Annotations + + +
+ Methods to detect malignant lesions from screening mammograms are usually +trained with fully annotated datasets, where images are labelled with the +localisation and classification of cancerous lesions. However, real-world +screening mammogram datasets commonly have a subset that is fully annotated and +another subset that is weakly annotated with just the global classification +(i.e., without lesion localisation). Given the large size of such datasets, +researchers usually face a dilemma with the weakly annotated subset: to not use +it or to fully annotate it. The first option will reduce detection accuracy +because it does not use the whole dataset, and the second option is too +expensive given that the annotation needs to be done by expert radiologists. In +this paper, we propose a middle-ground solution for the dilemma, which is to +formulate the training as a weakly- and semi-supervised learning problem that +we refer to as malignant breast lesion detection with incomplete annotations. +To address this problem, our new method comprises two stages, namely: 1) +pre-training a multi-view mammogram classifier with weak supervision from the +whole dataset, and 2) extending the trained classifier to become a multi-view +detector that is trained with semi-supervised student-teacher learning, where +the training set contains fully and weakly-annotated mammograms. We provide +extensive detection results on two real-world screening mammogram datasets +containing incomplete annotations, and show that our proposed approach achieves +state-of-the-art results in the detection of malignant breast lesions with +incomplete annotations. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ LLM meets Vision-Language Models for Zero-Shot One-Class Classification + + +
+ We consider the problem of zero-shot one-class visual classification. In this +setting, only the label of the target class is available, and the goal is to +discriminate between positive and negative query samples without requiring any +validation example from the target task. We propose a two-step solution that +first queries large language models for visually confusing objects and then +relies on vision-language pre-trained models (e.g., CLIP) to perform +classification. By adapting large-scale vision benchmarks, we demonstrate the +ability of the proposed method to outperform adapted off-the-shelf alternatives +in this setting. Namely, we propose a realistic benchmark where negative query +samples are drawn from the same original dataset as positive ones, including a +granularity-controlled version of iNaturalist, where negative samples are at a +fixed distance in the taxonomy tree from the positive ones. Our work shows that +it is possible to discriminate between a single category and other semantically +related ones using only its label. + +
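The two-step idea can be sketched with OpenAI's CLIP package; the confuser list, prompt template, and file name below are illustrative assumptions (in the described pipeline the confusers would come from an LLM query), not the authors' benchmark setup.

import torch
import clip
from PIL import Image

model, preprocess = clip.load("ViT-B/32", device="cpu")

# Target label first, followed by LLM-suggested visually confusing classes (hypothetical list).
labels = ["golden retriever", "labrador retriever", "irish setter", "cocker spaniel"]
text = clip.tokenize([f"a photo of a {l}" for l in labels])
image = preprocess(Image.open("query.jpg")).unsqueeze(0)

with torch.no_grad():
    logits_per_image, _ = model(image, text)
    probs = logits_per_image.softmax(dim=-1)

# Accept the query as the target class only if the target label beats its confusers.
is_positive = probs.argmax(dim=-1).item() == 0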
+
+
+
+
+ + ♻ ☆ Decoupled Diffusion Models: Simultaneous Image to Zero and Zero to Noise + + +
+ We propose decoupled diffusion models (DDMs) for high-quality (un)conditioned +image generation in less than 10 function evaluations. In a nutshell, DDMs +decouple the forward image-to-noise mapping into \textit{image-to-zero} mapping +and \textit{zero-to-noise} mapping. Under this framework, we mathematically +derive 1) the training objectives and 2) the reverse-time sampling formula, +based on an analytic transition probability which models the image-to-zero +transition. The former enables DDMs to learn noise and image components +simultaneously which simplifies learning. Importantly, because the +\textit{zero-to-image} sampling function is analytic, DDMs can avoid +ordinary-differential-equation-based accelerators and instead naturally perform +sampling with an arbitrary step size. Under the few function evaluation setups, +DDMs experimentally yield very competitive performance compared with the state +of the art in 1) unconditioned image generation, \textit{e.g.}, CIFAR-10 and +CelebA-HQ-256 and 2) image-conditioned downstream tasks such as +super-resolution, saliency detection, edge detection, and image inpainting. + +
+
+
+
+
+ + ♻ ☆ GenHowTo: Learning to Generate Actions and State Transformations from + Instructional Videos CVPR 2024 + + +
+ We address the task of generating temporally consistent and physically +plausible images of actions and object state transformations. Given an input +image and a text prompt describing the targeted transformation, our generated +images preserve the environment and transform objects in the initial image. Our +contributions are threefold. First, we leverage a large body of instructional +videos and automatically mine a dataset of triplets of consecutive frames +corresponding to initial object states, actions, and resulting object +transformations. Second, equipped with this data, we develop and train a +conditioned diffusion model dubbed GenHowTo. Third, we evaluate GenHowTo on a +variety of objects and actions and show superior performance compared to +existing methods. In particular, we introduce a quantitative evaluation where +GenHowTo achieves 88% and 74% on seen and unseen interaction categories, +respectively, outperforming prior work by a large margin. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ PEGASUS: Personalized Generative 3D Avatars with Composable Attributes CVPR 2024 + + +
+ We present PEGASUS, a method for constructing a personalized generative 3D +face avatar from monocular video sources. Our generative 3D avatar enables +disentangled controls to selectively alter the facial attributes (e.g., hair or +nose) while preserving the identity. Our approach consists of two stages: +synthetic database generation and constructing a personalized generative +avatar. We generate a synthetic video collection of the target identity with +varying facial attributes, where the videos are synthesized by borrowing the +attributes from monocular videos of diverse identities. Then, we build a +person-specific generative 3D avatar that can modify its attributes +continuously while preserving its identity. Through extensive experiments, we +demonstrate that our method of generating a synthetic database and creating a +3D generative avatar is the most effective in preserving identity while +achieving high realism. Subsequently, we introduce a zero-shot approach to +achieve the same goal of generative modeling more efficiently by leveraging a +previously constructed personalized generative model. + +
+
+ comment: Accepted at CVPR 2024, Project Page: + https://snuvclab.github.io/pegasus/ +
+
+
+
+
+ + ♻ ☆ Distilling Semantic Priors from SAM to Efficient Image Restoration + Models + + +
+ In image restoration (IR), leveraging semantic priors from segmentation +models has been a common approach to improve performance. The recent segment +anything model (SAM) has emerged as a powerful tool for extracting advanced +semantic priors to enhance IR tasks. However, the computational cost of SAM is +prohibitive for IR, compared to existing smaller IR models. The incorporation +of SAM for extracting semantic priors considerably hampers the model inference +efficiency. To address this issue, we propose a general framework to distill +SAM's semantic knowledge to boost existing IR models without interfering with +their inference process. Specifically, our proposed framework consists of the +semantic priors fusion (SPF) scheme and the semantic priors distillation (SPD) +scheme. SPF fuses two kinds of information, the restored image predicted +by the original IR model and the semantic mask predicted by SAM, to obtain the +refined restored image. SPD leverages a self-distillation manner to distill the fused +semantic priors to boost the performance of original IR models. Additionally, +we design a semantic-guided relation (SGR) module for SPD, which ensures +semantic feature representation space consistency to fully distill the priors. +We demonstrate the effectiveness of our framework across multiple IR models and +tasks, including deraining, deblurring, and denoising. + +
+
+
+
+
+ + ♻ ☆ Rethinking Saliency-Guided Weakly-Supervised Semantic Segmentation + + +
+ This paper presents a fresh perspective on the role of saliency maps in +weakly-supervised semantic segmentation (WSSS) and offers new insights and +research directions based on our empirical findings. We conduct comprehensive +experiments and observe that the quality of the saliency map is a critical +factor in saliency-guided WSSS approaches. Nonetheless, we find that the +saliency maps used in previous works are often arbitrarily chosen, despite +their significant impact on WSSS. Additionally, we observe that the choice of +the threshold, which has received less attention before, is non-trivial in +WSSS. To facilitate more meaningful and rigorous research for saliency-guided +WSSS, we introduce \texttt{WSSS-BED}, a standardized framework for conducting +research under unified conditions. \texttt{WSSS-BED} provides various saliency +maps and activation maps for seven WSSS methods, as well as saliency maps from +unsupervised salient object detection models. + +
+
+ comment: Preprint, 17 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ PKU-DyMVHumans: A Multi-View Video Benchmark for High-Fidelity Dynamic + Human Modeling CVPR2024 + + +
+ High-quality human reconstruction and photo-realistic rendering of a dynamic +scene is a long-standing problem in computer vision and graphics. Despite +considerable efforts invested in developing various capture systems and +reconstruction algorithms, recent advancements still struggle with loose or +oversized clothing and overly complex poses. In part, this is due to the +challenges of acquiring high-quality human datasets. To facilitate the +development of these fields, in this paper, we present PKU-DyMVHumans, a +versatile human-centric dataset for high-fidelity reconstruction and rendering +of dynamic human scenarios from dense multi-view videos. It comprises 8.2 +million frames captured by more than 56 synchronized cameras across diverse +scenarios. These sequences comprise 32 human subjects across 45 different +scenarios, each with a high-detailed appearance and realistic human motion. +Inspired by recent advancements in neural radiance field (NeRF)-based scene +representations, we carefully set up an off-the-shelf framework that makes it +easy to benchmark state-of-the-art NeRF-based implementations on the +PKU-DyMVHumans dataset. This paves the way for various applications like +fine-grained foreground/background decomposition, high-quality human +reconstruction and photo-realistic novel view synthesis of a dynamic scene. +Extensive studies are performed on the benchmark, demonstrating new +observations and challenges that emerge from using such high-fidelity dynamic +data. + +
+
+ comment: CVPR2024(accepted). Project page: https://pku-dymvhumans.github.io +
+
+
+
+
+ + ♻ ☆ Real-time 3D-aware Portrait Editing from a Single Image + + +
+ This work presents 3DPE, a practical method that can efficiently edit a face +image following given prompts, like reference images or text descriptions, in a +3D-aware manner. To this end, a lightweight module is distilled from a 3D +portrait generator and a text-to-image model, which provide prior knowledge of +face geometry and superior editing capability, respectively. Such a design +brings two compelling advantages over existing approaches. First, our system +achieves real-time editing with a feedforward network (i.e., ~0.04s per image), +over 100x faster than the second competitor. Second, thanks to the powerful +priors, our module could focus on the learning of editing-related variations, +such that it manages to handle various types of editing simultaneously in the +training phase and further supports fast adaptation to user-specified +customized types of editing during inference (e.g., with ~5min fine-tuning per +style). The code, the model, and the interface will be made publicly available +to facilitate future research. + +
+
+
+
+
+ + ♻ ☆ High-throughput Visual Nano-drone to Nano-drone Relative Localization + using Onboard Fully Convolutional Networks ICRA 2024 + + +
+ Relative drone-to-drone localization is a fundamental building block for any +swarm operations. We address this task in the context of miniaturized +nano-drones, i.e., 10cm in diameter, which show an ever-growing interest due to +novel use cases enabled by their reduced form factor. The price for their +versatility comes with limited onboard resources, i.e., sensors, processing +units, and memory, which limits the complexity of the onboard algorithms. A +traditional solution to overcome these limitations is represented by +lightweight deep learning models directly deployed aboard nano-drones. This +work tackles the challenging relative pose estimation between nano-drones using +only a gray-scale low-resolution camera and an ultra-low-power System-on-Chip +(SoC) hosted onboard. We present a vertically integrated system based on a +novel vision-based fully convolutional neural network (FCNN), which runs at +39Hz within 101mW onboard a Crazyflie nano-drone extended with the GWT GAP8 +SoC. We compare our FCNN against three State-of-the-Art (SoA) systems. +Considering the best-performing SoA approach, our model results in an R-squared +improvement from 32 to 47% on the horizontal image coordinate and from 18 to +55% on the vertical image coordinate, on a real-world dataset of 30k images. +Finally, our in-field tests show a reduction of the average tracking error of +37% compared to a previous SoA work and an endurance performance up to the +entire battery lifetime of 4 minutes. + +
+
+ comment: ICRA 2024, IEEE Conference +
+
+
+
+
+ + ♻ ☆ SegICL: A Universal In-context Learning Framework for Enhanced + Segmentation in Medical Imaging + + +
+ Adapting medical image segmentation models to new tasks in a training-free +manner through in-context learning is an exciting advancement. Universal +segmentation models aim to generalize across the diverse modality of medical +images, yet their effectiveness often diminishes when applied to +out-of-distribution (OOD) data modalities and tasks, requiring intricate +fine-tuning of the model for optimal performance. To address this challenge, we +introduce SegICL, a novel approach leveraging In-Context Learning (ICL) for +image segmentation. Unlike existing methods, SegICL has the capability to +employ text-guided segmentation and conduct in-context learning with a small +set of image-mask pairs, eliminating the need for training the model from +scratch or fine-tuning for OOD tasks (including OOD modality and dataset). +Extensive experimental validation of SegICL demonstrates a positive correlation +between the number of prompt samples and segmentation performance on OOD +modalities and tasks. This indicates that SegICL effectively addresses new +segmentation tasks based on contextual information. Additionally, SegICL also +exhibits comparable segmentation performance to mainstream models on OOD and +in-distribution tasks. Our code will be released soon. + +
+
+
+
+
+ + ♻ ☆ The Solution for the CVPR 2023 1st foundation model challenge-Track2 + + +
+ In this paper, we propose a solution for cross-modal transportation +retrieval. Due to the cross-domain problem of traffic images, we divide the +problem into two sub-tasks of pedestrian retrieval and vehicle retrieval +through a simple strategy. In pedestrian retrieval tasks, we use IRRA as the +base model and specifically design an attribute classification task to mine the +knowledge implied by attribute labels. More importantly, we use the strategy of +Inclusion Relation Matching to make image-text pairs with an inclusion +relation have similar representations in the feature space. For the vehicle +retrieval task, we use BLIP as the base model. Since aligning the color +attributes of vehicles is challenging, we introduce attribute-based object +detection techniques to add color patch blocks to vehicle images for color data +augmentation. This serves as strong prior information, helping the model +perform image-text alignment. At the same time, we incorporate labeled +attributes into the image-text alignment loss to learn fine-grained alignment +and prevent similar images and texts from being incorrectly separated. Our +approach ranked first in the final B-board test with a score of 70.9. + +
+
+
+
+
+ + ♻ ☆ Vision-Language Models in Remote Sensing: Current Progress and Future + Trends + + +
+ The remarkable achievements of ChatGPT and GPT-4 have sparked a wave of +interest and research in the field of large language models for Artificial +General Intelligence (AGI). These models provide intelligent solutions close to +human thinking, enabling us to use general artificial intelligence to solve +problems in various applications. However, in remote sensing (RS), the +scientific literature on the implementation of AGI remains relatively scant. +Existing AI-related research in remote sensing primarily focuses on visual +understanding tasks while neglecting the semantic understanding of the objects +and their relationships. This is where vision-language models excel, as they +enable reasoning about images and their associated textual descriptions, +allowing for a deeper understanding of the underlying semantics. +Vision-language models can go beyond visual recognition of RS images, model +semantic relationships, and generate natural language descriptions of the +image. This makes them better suited for tasks requiring visual and textual +understanding, such as image captioning, and visual question answering. This +paper provides a comprehensive review of the research on vision-language models +in remote sensing, summarizing the latest progress, highlighting challenges, +and identifying potential research opportunities. + +
+
+ comment: Accepted by IEEE Geoscience and Remote Sensing Magazine +
+
+
+
+
+ + ♻ ☆ Steerers: A framework for rotation equivariant keypoint descriptors CVPR 2024 + + +
+ Image keypoint descriptions that are discriminative and matchable over large +changes in viewpoint are vital for 3D reconstruction. However, descriptions +output by learned descriptors are typically not robust to camera rotation. +While they can be made more robust by, e.g., data augmentation, this degrades +performance on upright images. Another approach is test-time augmentation, +which incurs a significant increase in runtime. Instead, we learn a linear +transform in description space that encodes rotations of the input image. We +call this linear transform a steerer since it allows us to transform the +descriptions as if the image was rotated. From representation theory, we know +all possible steerers for the rotation group. Steerers can be optimized (A) +given a fixed descriptor, (B) jointly with a descriptor or (C) we can optimize +a descriptor given a fixed steerer. We perform experiments in these three +settings and obtain state-of-the-art results on the rotation invariant image +matching benchmarks AIMS and Roto-360. We publish code and model weights at +https://github.com/georg-bn/rotation-steerers. + +
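The core idea of a learned linear "steerer" can be sketched for setting (A), a fixed descriptor; the training loop below is a simplification that ignores keypoint-position bookkeeping and assumes per-keypoint descriptions are already aligned between the upright and rotated images, so it illustrates the concept rather than the authors' implementation.

import torch
import torch.nn.functional as F

def fit_steerer(descriptor, images, dim=128, steps=500, lr=1e-3):
    """Fit W so that descriptor(rot90(img)) is approximated by descriptor(img) @ W^T."""
    W = torch.nn.Parameter(torch.eye(dim))
    opt = torch.optim.Adam([W], lr=lr)
    for _ in range(steps):
        with torch.no_grad():                                          # descriptor stays frozen
            d_up = descriptor(images)                                  # (B, N, dim) descriptions
            d_rot = descriptor(torch.rot90(images, 1, dims=(-2, -1)))  # descriptions of rotated images
        loss = F.mse_loss(d_up @ W.T, d_rot)
        opt.zero_grad()
        loss.backward()
        opt.step()
    return W.detach()   # apply W to steer descriptions as if the image had been rotated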
+
+ comment: CVPR 2024 Camera ready +
+
+
+
+
+ + ♻ ☆ All in an Aggregated Image for In-Image Learning + + +
+ This paper introduces a new in-context learning (ICL) mechanism called +In-Image Learning (I$^2$L) that combines demonstration examples, visual cues, +and chain-of-thought reasoning into an aggregated image to enhance the +capabilities of Large Multimodal Models (e.g., GPT-4V) in multimodal reasoning +tasks. Unlike previous approaches that rely on converting images to text or +incorporating visual input into language models, I$^2$L consolidates all +information into an aggregated image and leverages image processing, +understanding, and reasoning abilities. This has several advantages: it reduces +inaccurate textual descriptions of complex images, provides flexibility in +positioning demonstration examples, and avoids multiple input images and +lengthy prompts. We also introduce I$^2$L-Hybrid, a method that combines the +strengths of I$^2$L with other ICL methods. Specifically, it uses an automatic +strategy to select the most suitable method (I$^2$L or another certain ICL +method) for a specific task instance. We conduct extensive experiments to +assess the effectiveness of I$^2$L and I$^2$L-Hybrid on MathVista, which covers +a variety of complex multimodal reasoning tasks. Additionally, we investigate +the influence of image resolution, the number of demonstration examples in a +single image, and the positions of these demonstrations in the aggregated image +on the effectiveness of I$^2$L. Our code is publicly available at +https://github.com/AGI-Edgerunners/IIL. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ MM-Interleaved: Interleaved Image-Text Generative Modeling via + Multi-modal Feature Synchronizer + + +
+ Developing generative models for interleaved image-text data has both +research and practical value. It requires models to understand the interleaved +sequences and subsequently generate images and text. However, existing attempts +are limited by the issue that the fixed number of visual tokens cannot +efficiently capture image details, which is particularly problematic in the +multi-image scenarios. To address this, this paper presents MM-Interleaved, an +end-to-end generative model for interleaved image-text data. It introduces a +multi-scale and multi-image feature synchronizer module, allowing direct access +to fine-grained image features in the previous context during the generation +process. MM-Interleaved is end-to-end pre-trained on both paired and +interleaved image-text corpora. It is further enhanced through a supervised +fine-tuning phase, wherein the model improves its ability to follow complex +multi-modal instructions. Experiments demonstrate the versatility of +MM-Interleaved in recognizing visual details following multi-modal instructions +and generating consistent images following both textual and visual conditions. +Code and models are available at +\url{https://github.com/OpenGVLab/MM-Interleaved}. + +
+
+ comment: 20 pages, 9 figures, 17 tables +
+
+
+
+
+ + ♻ ☆ EpiDiff: Enhancing Multi-View Synthesis via Localized + Epipolar-Constrained Diffusion + + +
+ Generating multiview images from a single view facilitates the rapid +generation of a 3D mesh conditioned on a single image. Recent methods that +introduce 3D global representation into diffusion models have shown the +potential to generate consistent multiviews, but they have reduced generation +speed and face challenges in maintaining generalizability and quality. To +address this issue, we propose EpiDiff, a localized interactive multiview +diffusion model. At the core of the proposed approach is to insert a +lightweight epipolar attention block into the frozen diffusion model, +leveraging epipolar constraints to enable cross-view interaction among feature +maps of neighboring views. The newly initialized 3D modeling module preserves +the original feature distribution of the diffusion model, exhibiting +compatibility with a variety of base diffusion models. Experiments show that +EpiDiff generates 16 multiview images in just 12 seconds, and it surpasses +previous methods in quality evaluation metrics, including PSNR, SSIM and LPIPS. +Additionally, EpiDiff can generate a more diverse distribution of views, +improving the reconstruction quality from generated multiviews. Please see our +project page at https://huanngzh.github.io/EpiDiff/. + +
+
+ comment: Project page: https://huanngzh.github.io/EpiDiff/ +
+
+
+
+
+ + ♻ ☆ ADDP: Learning General Representations for Image Recognition and + Generation with Alternating Denoising Diffusion Process ICLR2024 + + +
+ Image recognition and generation have long been developed independently of +each other. With the recent trend towards general-purpose representation +learning, the development of general representations for both recognition and +generation tasks is also promoted. However, preliminary attempts mainly focus +on generation performance, but are still inferior on recognition tasks. These +methods are modeled in the vector-quantized (VQ) space, whereas leading +recognition methods use pixels as inputs. Our key insights are twofold: (1) +pixels as inputs are crucial for recognition tasks; (2) VQ tokens as +reconstruction targets are beneficial for generation tasks. These observations +motivate us to propose an Alternating Denoising Diffusion Process (ADDP) that +integrates these two spaces within a single representation learning framework. +In each denoising step, our method first decodes pixels from previous VQ +tokens, then generates new VQ tokens from the decoded pixels. The diffusion +process gradually masks out a portion of VQ tokens to construct the training +samples. The learned representations can be used to generate diverse +high-fidelity images and also demonstrate excellent transfer performance on +recognition tasks. Extensive experiments show that our method achieves +competitive performance on unconditional generation, ImageNet classification, +COCO detection, and ADE20k segmentation. Importantly, our method represents the +first successful development of general representations applicable to both +generation and dense recognition tasks. Code is released at +\url{https://github.com/ChangyaoTian/ADDP}. + +
+
+ comment: Accepted by ICLR2024 +
+
+
+
+
+ + ♻ ☆ SDGE: Stereo Guided Depth Estimation for 360$^\circ$ Camera Sets + + +
+ Depth estimation is a critical technology in autonomous driving, and +multi-camera systems are often used to achieve a 360$^\circ$ perception. These +360$^\circ$ camera sets often have limited or low-quality overlap regions, +making multi-view stereo methods infeasible for the entire image. +Alternatively, monocular methods may not produce consistent cross-view +predictions. To address these issues, we propose the Stereo Guided Depth +Estimation (SGDE) method, which enhances depth estimation of the full image by +explicitly utilizing multi-view stereo results on the overlap. We suggest +building virtual pinhole cameras to resolve the distortion problem of fisheye +cameras and unify the processing for the two types of 360$^\circ$ cameras. For +handling the varying noise on camera poses caused by unstable movement, the +approach employs a self-calibration method to obtain highly accurate relative +poses of the adjacent cameras with minor overlap. These enable the use of +robust stereo methods to obtain high-quality depth prior in the overlap region. +This prior serves not only as an additional input but also as pseudo-labels +that enhance the accuracy of depth estimation methods and improve cross-view +prediction consistency. The effectiveness of SGDE is evaluated on one fisheye +camera dataset, Synthetic Urban, and two pinhole camera datasets, DDAD and +nuScenes. Our experiments demonstrate that SGDE is effective for both +supervised and self-supervised depth estimation, and highlight the potential of +our method for advancing downstream autonomous driving technologies, such as 3D +object detection and occupancy prediction. + +
+
+
+
+
+ + ♻ ☆ HIPTrack: Visual Tracking with Historical Prompts CVPR2024 + + +
+ Trackers that follow the Siamese paradigm utilize similarity matching between +template and search region features for tracking. Many methods have been +explored to enhance tracking performance by incorporating tracking history to +better handle scenarios involving target appearance variations such as +deformation and occlusion. However, the utilization of historical information +in existing methods is insufficient and not comprehensive, and it typically +requires repetitive training and introduces a large amount of computation. In +this paper, we show that by providing a tracker that follows the Siamese paradigm +with precise and updated historical information, a significant performance +improvement can be achieved with completely unchanged parameters. Based on +this, we propose a historical prompt network that uses refined historical +foreground masks and historical visual features of the target to provide +comprehensive and precise prompts for the tracker. We build a novel tracker +called HIPTrack based on the historical prompt network, which achieves +considerable performance improvements without the need to retrain the entire +model. We conduct experiments on seven datasets and experimental results +demonstrate that our method surpasses the current state-of-the-art trackers on +LaSOT, LaSOText, GOT-10k and NfS. Furthermore, the historical prompt network +can seamlessly integrate as a plug-and-play module into existing trackers, +providing performance enhancements. The source code is available at +https://github.com/WenRuiCai/HIPTrack. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ 3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple + 3D Representations + + +
+ Imitation learning provides an efficient way to teach robots dexterous +skills; however, learning complex skills robustly and generalizably usually +requires large amounts of human demonstrations. To tackle this challenging +problem, we present 3D Diffusion Policy (DP3), a novel visual imitation +learning approach that incorporates the power of 3D visual representations into +diffusion policies, a class of conditional action generative models. The core +design of DP3 is the utilization of a compact 3D visual representation, +extracted from sparse point clouds with an efficient point encoder. In our +experiments involving 72 simulation tasks, DP3 successfully handles most tasks +with just 10 demonstrations and surpasses baselines with a 24.2% relative +improvement. In 4 real robot tasks, DP3 demonstrates precise control with a +high success rate of 85%, given only 40 demonstrations of each task, and shows +excellent generalization abilities in diverse aspects, including space, +viewpoint, appearance, and instance. Interestingly, in real robot experiments, +DP3 rarely violates safety requirements, in contrast to baseline methods which +frequently do, necessitating human intervention. Our extensive evaluation +highlights the critical importance of 3D representations in real-world robot +learning. Videos, code, and data are available on +https://3d-diffusion-policy.github.io . + +
+
+ comment: Videos, code, and data: https://3d-diffusion-policy.github.io +
+
+
+
+
+ + ♻ ☆ Saliency strikes back: How filtering out high frequencies improves + white-box explanations + + +
+ Attribution methods correspond to a class of explainability methods (XAI) +that aim to assess how individual inputs contribute to a model's +decision-making process. We have identified a significant limitation in one +type of attribution methods, known as "white-box" methods. Although highly +efficient, these methods rely on a gradient signal that is often contaminated +by high-frequency noise. To overcome this limitation, we introduce a new +approach called "FORGrad". This simple method effectively filters out noise +artifacts by using optimal cut-off frequencies tailored to the unique +characteristics of each model architecture. Our findings show that FORGrad +consistently enhances the performance of already existing white-box methods, +enabling them to compete effectively with more accurate yet computationally +demanding "black-box" methods. We anticipate that our research will foster +broader adoption of simpler and more efficient white-box methods for +explainability, offering a better balance between faithfulness and +computational efficiency. + +
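The underlying operation, filtering high frequencies out of a gradient-based saliency map, can be sketched as a generic Fourier low-pass; the cut-off here is a placeholder, whereas the described method selects it per architecture.

import torch

def lowpass_saliency(saliency, cutoff=0.1):
    """saliency: (H, W) gradient-based map; cutoff: fraction of the frequency range to keep."""
    H, W = saliency.shape
    fy = torch.fft.fftfreq(H).reshape(-1, 1)
    fx = torch.fft.fftfreq(W).reshape(1, -1)
    mask = (fx ** 2 + fy ** 2).sqrt() <= cutoff      # keep only low spatial frequencies
    spec = torch.fft.fft2(saliency) * mask
    return torch.fft.ifft2(spec).real                # denoised saliency map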
+
+
+
+
+ + ♻ ☆ FRDiff : Feature Reuse for Universal Training-free Acceleration of + Diffusion Models + + +
+ The substantial computational costs of diffusion models, especially due to +the repeated denoising steps necessary for high-quality image generation, +present a major obstacle to their widespread adoption. While several studies +have attempted to address this issue by reducing the number of score function +evaluations (NFE) using advanced ODE solvers without fine-tuning, the decreased +number of denoising iterations misses the opportunity to update fine details, +resulting in noticeable quality degradation. In our work, we introduce an +advanced acceleration technique that leverages the temporal redundancy inherent +in diffusion models. Reusing feature maps with high temporal similarity opens +up a new opportunity to save computation resources without compromising output +quality. To realize the practical benefits of this intuition, we conduct an +extensive analysis and propose a novel method, FRDiff. FRDiff is designed to +harness the advantages of both reduced NFE and feature reuse, achieving a +Pareto frontier that balances fidelity and latency trade-offs in various +generative tasks. + +
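The feature-reuse intuition can be sketched as a small cache wrapped around an expensive block of the denoiser: recompute only when the input has drifted enough from the previous timestep. The similarity measure, threshold, and hook point are illustrative assumptions, not the FRDiff schedule.

import torch
import torch.nn.functional as F

class FeatureReuseCache:
    def __init__(self, sim_threshold=0.95):
        self.prev = None
        self.sim_threshold = sim_threshold

    def maybe_reuse(self, compute_fn, x):
        """Return cached output when the current input is highly similar to the previous one."""
        if self.prev is not None:
            sim = F.cosine_similarity(x.flatten(1), self.prev["input"].flatten(1)).mean()
            if sim > self.sim_threshold:
                return self.prev["output"]            # reuse, skipping the expensive block
        out = compute_fn(x)
        self.prev = {"input": x.detach(), "output": out.detach()}
        return out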
+
+ comment: Work in progress. Project page : + https://jungwon-lee.github.io/Project_FRDiff/ +
+
+
+
+
+ + ♻ ☆ Morphable Diffusion: 3D-Consistent Diffusion for Single-image Avatar + Creation CVPR 2024 + + +
+ Recent advances in generative diffusion models have enabled the previously +unfeasible capability of generating 3D assets from a single input image or a +text prompt. In this work, we aim to enhance the quality and functionality of +these models for the task of creating controllable, photorealistic human +avatars. We achieve this by integrating a 3D morphable model into the +state-of-the-art multi-view-consistent diffusion approach. We demonstrate that +accurate conditioning of a generative pipeline on the articulated 3D model +enhances the baseline model performance on the task of novel view synthesis +from a single image. More importantly, this integration facilitates a seamless +and accurate incorporation of facial expression and body pose control into the +generation process. To the best of our knowledge, our proposed framework is the +first diffusion model to enable the creation of fully 3D-consistent, +animatable, and photorealistic human avatars from a single image of an unseen +subject; extensive quantitative and qualitative evaluations demonstrate the +advantages of our approach over existing state-of-the-art avatar creation +models on both novel view and novel expression synthesis tasks. The code for +our project is publicly available. + +
+
+ comment: [CVPR 2024] Project page: + https://xiyichen.github.io/morphablediffusion/ +
+
+
+
+
+ + ♻ ☆ PointMamba: A Simple State Space Model for Point Cloud Analysis + + +
+ Transformers have become one of the foundational architectures in point cloud +analysis tasks due to their excellent global modeling ability. However, the +attention mechanism has quadratic complexity and is difficult to extend to +long-sequence modeling under limited computational resources. Recently, +state space models (SSM), a new family of deep sequence models, have presented +great potential for sequence modeling in NLP tasks. In this paper, taking +inspiration from the success of SSM in NLP, we propose PointMamba, a framework +with global modeling and linear complexity. Specifically, by taking embedded +point patches as input, we propose a reordering strategy to enhance SSM's +global modeling ability by providing a more logical geometric scanning order. +The reordered point tokens are then sent to a series of Mamba blocks to +causally capture the point cloud structure. Experimental results show our +proposed PointMamba outperforms the transformer-based counterparts on different +point cloud analysis datasets, while significantly saving about 44.3% of the +parameters and 25% of the FLOPs, demonstrating its potential for constructing +foundational 3D vision models. We hope our PointMamba can provide a new +perspective for point cloud analysis. The code is available at +https://github.com/LMD0311/PointMamba. + +
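The serialization idea, ordering point-patch tokens by a geometric scan before a causal sequence model, can be sketched as follows; a GRU stands in for the Mamba/SSM blocks, the lexicographic sort key assumes centers normalized to [0, 1), and the actual scan order in the paper may differ.

import torch
import torch.nn as nn

def geometric_scan_order(centers):
    """centers: (B, N, 3) patch centers in [0, 1). Crude lexicographic order: x, then y, then z."""
    key = centers[..., 0] * 1e6 + centers[..., 1] * 1e3 + centers[..., 2]
    return key.argsort(dim=1)

class TinyPointSequenceModel(nn.Module):
    def __init__(self, dim=256, num_classes=40):
        super().__init__()
        self.seq = nn.GRU(dim, dim, batch_first=True)   # stand-in for the Mamba/SSM blocks
        self.head = nn.Linear(dim, num_classes)

    def forward(self, tokens, centers):                 # tokens: (B, N, dim) patch embeddings
        order = geometric_scan_order(centers)
        tokens = torch.gather(tokens, 1, order.unsqueeze(-1).expand_as(tokens))
        out, _ = self.seq(tokens)                       # causal scan over the reordered tokens
        return self.head(out.mean(dim=1))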
+
+ comment: Work in progress. The code is available at + https://github.com/LMD0311/PointMamba +
+
+
+
+
+ + ♻ ☆ Advancements in Point Cloud Data Augmentation for Deep Learning: A + Survey + + +
+ Deep learning (DL) has become one of the mainstream and effective methods for +point cloud analysis tasks such as detection, segmentation and classification. +To reduce overfitting during training DL models and improve model performance +especially when the amount and/or diversity of training data are limited, +augmentation is often crucial. Although various point cloud data augmentation +methods have been widely used in different point cloud processing tasks, there +are currently no published systematic surveys or reviews of these methods. +Therefore, this article surveys these methods, categorizing them into a +taxonomy framework that comprises basic and specialized point cloud data +augmentation methods. Through a comprehensive evaluation of these augmentation +methods, this article identifies their potentials and limitations, serving as a +useful reference for choosing appropriate augmentation methods. In addition, +potential directions for future research are recommended. This survey +contributes to providing a holistic overview of the current state of point +cloud data augmentation, promoting its wider application and development. + +
+
+
+
+
+ + ♻ ☆ CHAIN: Enhancing Generalization in Data-Efficient GANs via lipsCHitz + continuity constrAIned Normalization CVPR2024 + + +
+ Generative Adversarial Networks (GANs) significantly advanced image +generation but their performance heavily depends on abundant training data. In +scenarios with limited data, GANs often struggle with discriminator overfitting +and unstable training. Batch Normalization (BN), despite being known for +enhancing generalization and training stability, has rarely been used in the +discriminator of Data-Efficient GANs. Our work addresses this gap by +identifying a critical flaw in BN: the tendency for gradient explosion during +the centering and scaling steps. To tackle this issue, we present CHAIN +(lipsCHitz continuity constrAIned Normalization), which replaces the +conventional centering step with zero-mean regularization and integrates a +Lipschitz continuity constraint in the scaling step. CHAIN further enhances GAN +training by adaptively interpolating the normalized and unnormalized features, +effectively avoiding discriminator overfitting. Our theoretical analyses firmly +establish CHAIN's effectiveness in reducing gradients in latent features and +weights, improving stability and generalization in GAN training. Empirical +evidence supports our theory. CHAIN achieves state-of-the-art results in +data-limited scenarios on CIFAR-10/100, ImageNet, five low-shot and seven +high-resolution few-shot image datasets. + +
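A loose sketch of the normalization idea as stated in the abstract, skipping explicit centering in favor of a zero-mean penalty, bounding the scaling step, and interpolating normalized and unnormalized features; the clamp and penalty form are the editor's assumptions, not the CHAIN formulation.

import torch
import torch.nn as nn

class ChainLikeNorm(nn.Module):
    def __init__(self, channels, eps=1e-5, mix=0.5, reg_weight=1e-2):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(channels))
        self.eps, self.mix, self.reg_weight = eps, mix, reg_weight
        self.mean_penalty = torch.tensor(0.0)     # add this to the loss in the training loop

    def forward(self, x):                          # x: (B, C, H, W)
        mean = x.mean(dim=(0, 2, 3))
        self.mean_penalty = self.reg_weight * (mean ** 2).mean()   # zero-mean regularizer, no centering
        var = x.var(dim=(0, 2, 3), unbiased=False)
        scale = (self.gamma / torch.sqrt(var + self.eps)).clamp(max=1.0)  # crude Lipschitz-style bound
        x_norm = x * scale.view(1, -1, 1, 1)
        return self.mix * x_norm + (1 - self.mix) * x   # interpolate normalized / unnormalized features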
+
+ comment: Accepted by CVPR2024, 26 pages full version +
+
+
+
+
+ + ♻ ☆ TIP-Editor: An Accurate 3D Editor Following Both Text-Prompts And + Image-Prompts + + +
+ Text-driven 3D scene editing has gained significant attention owing to its +convenience and user-friendliness. However, existing methods still lack +accurate control of the specified appearance and location of the editing result +due to the inherent limitations of the text description. To this end, we +propose a 3D scene editing framework, TIP-Editor, that accepts both text and +image prompts and a 3D bounding box to specify the editing region. With the +image prompt, users can conveniently specify the detailed appearance/style of +the target content in complement to the text description, enabling accurate +control of the appearance. Specifically, TIP-Editor employs a stepwise 2D +personalization strategy to better learn the representation of the existing +scene and the reference image, in which a localization loss is proposed to +encourage correct object placement as specified by the bounding box. +Additionally, TIP-Editor utilizes explicit and flexible 3D Gaussian splatting as +the 3D representation to facilitate local editing while keeping the background +unchanged. Extensive experiments have demonstrated that TIP-Editor conducts +accurate editing following the text and image prompts in the specified bounding +box region, consistently outperforming the baselines in editing quality and +alignment to the prompts, both qualitatively and quantitatively. + +
+
+ comment: Accepted by SIGGRAPH 2024 & ACM Transactions on Graphics +
+
+
+
+
+ + ♻ ☆ SSM Meets Video Diffusion Models: Efficient Video Generation with + Structured State Spaces ICLR 2024 + + +
+ Given the remarkable achievements in image generation through diffusion +models, the research community has shown increasing interest in extending these +models to video generation. Recent diffusion models for video generation have +predominantly utilized attention layers to extract temporal features. However, +attention layers are limited by their memory consumption, which increases +quadratically with the length of the sequence. This limitation presents +significant challenges when attempting to generate longer video sequences using +diffusion models. To overcome this challenge, we propose leveraging state-space +models (SSMs). SSMs have recently gained attention as viable alternatives due +to their linear memory consumption relative to sequence length. In the +experiments, we first evaluate our SSM-based model with UCF101, a standard +benchmark of video generation. In addition, to investigate the potential of +SSMs for longer video generation, we perform an experiment using the MineRL +Navigate dataset, varying the number of frames to 64, 200, and 400. In these +settings, our SSM-based model can considerably save memory consumption for +longer sequences, while maintaining competitive FVD scores to the +attention-based models. Our codes are available at +https://github.com/shim0114/SSM-Meets-Video-Diffusion-Models. + +
+
+ comment: Accepted as a workshop paper at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Endo-4DGS: Endoscopic Monocular Scene Reconstruction with 4D Gaussian + Splatting + + +
+ In the realm of robot-assisted minimally invasive surgery, dynamic scene +reconstruction can significantly enhance downstream tasks and improve surgical +outcomes. Neural Radiance Fields (NeRF)-based methods have recently risen to +prominence for their exceptional ability to reconstruct scenes but are hampered +by slow inference speed, prolonged training, and inconsistent depth estimation. +Some previous work utilizes ground-truth depth for optimization, but such depth is +hard to acquire in the surgical domain. To overcome these obstacles, we present +Endo-4DGS, a real-time endoscopic dynamic reconstruction approach that utilizes +3D Gaussian Splatting (GS) for 3D representation. Specifically, we propose +lightweight MLPs to capture temporal dynamics with Gaussian deformation fields. +To obtain a satisfactory Gaussian initialization, we exploit a powerful depth +estimation foundation model, Depth-Anything, to generate pseudo-depth maps as a +geometry prior. We additionally propose confidence-guided learning to tackle +the ill-posed problems in monocular depth estimation and enhance the +depth-guided reconstruction with surface normal constraints and depth +regularization. Our approach has been validated on two surgical datasets, where +it can effectively render in real-time, compute efficiently, and reconstruct +with remarkable accuracy. + +
+
+
+
+
+ + ♻ ☆ KTPFormer: Kinematics and Trajectory Prior Knowledge-Enhanced + Transformer for 3D Human Pose Estimation CVPR 2024 + + +
+ This paper presents a novel Kinematics and Trajectory Prior +Knowledge-Enhanced Transformer (KTPFormer), which overcomes the weakness in +existing transformer-based methods for 3D human pose estimation, namely that the +derivation of the Q, K, and V vectors in their self-attention mechanisms is based +entirely on simple linear mapping. We propose two prior attention modules, namely +Kinematics Prior Attention (KPA) and Trajectory Prior Attention (TPA) to take +advantage of the known anatomical structure of the human body and motion +trajectory information, to facilitate effective learning of global dependencies +and features in the multi-head self-attention. KPA models kinematic +relationships in the human body by constructing a topology of kinematics, while +TPA builds a trajectory topology to learn the information of joint motion +trajectory across frames. Yielding Q, K, V vectors with prior knowledge, the +two modules enable KTPFormer to model both spatial and temporal correlations +simultaneously. Extensive experiments on three benchmarks (Human3.6M, +MPI-INF-3DHP and HumanEva) show that KTPFormer achieves superior performance in +comparison to state-of-the-art methods. More importantly, our KPA and TPA +modules have lightweight plug-and-play designs and can be integrated into +various transformer-based networks (i.e., diffusion-based) to improve the +performance with only a very small increase in the computational overhead. The +code is available at: https://github.com/JihuaPeng/KTPFormer.  + +
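The general idea of injecting a kinematic topology prior into attention can be sketched as a graph-biased attention layer, where the skeleton's adjacency matrix shapes how the Q/K/V inputs are built; this is a generic illustration under assumed shapes, not the KPA/TPA modules themselves.

import torch
import torch.nn as nn

class KinematicsBiasedAttention(nn.Module):
    def __init__(self, num_joints, dim, adjacency):
        super().__init__()
        self.qkv = nn.Linear(dim, 3 * dim)
        # Row-normalized adjacency with self-loops acts as a fixed structural prior.
        A = adjacency + torch.eye(num_joints)
        self.register_buffer("A_norm", A / A.sum(dim=-1, keepdim=True))

    def forward(self, x):                      # x: (B, J, dim) per-joint features
        x = self.A_norm @ x                    # mix features along the kinematic chain before Q/K/V
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        attn = torch.softmax(q @ k.transpose(1, 2) / q.shape[-1] ** 0.5, dim=-1)
        return attn @ v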
+
+ comment: Accepted by CVPR 2024. GitHub code:
+ https://github.com/JihuaPeng/KTPFormer
+
+
+
+
+ + ♻ ☆ eTraM: Event-based Traffic Monitoring Dataset + + +
+ Event cameras, with their high temporal and dynamic range and minimal memory +usage, have found applications in various fields. However, their potential in +static traffic monitoring remains largely unexplored. To facilitate this +exploration, we present eTraM - a first-of-its-kind, fully event-based traffic +monitoring dataset. eTraM offers 10 hr of data from different traffic scenarios +in various lighting and weather conditions, providing a comprehensive overview +of real-world situations. Providing 2M bounding box annotations, it covers +eight distinct classes of traffic participants, ranging from vehicles to +pedestrians and micro-mobility. eTraM's utility has been assessed using +state-of-the-art methods for traffic participant detection, including RVT, RED, +and YOLOv8. We quantitatively evaluate the ability of event-based models to +generalize on nighttime and unseen scenes. Our findings substantiate the +compelling potential of leveraging event cameras for traffic monitoring, +opening new avenues for research and application. eTraM is available at +https://eventbasedvision.github.io/eTraM + +
+
+
+
+
+ + ♻ ☆ MaskINT: Video Editing via Interpolative Non-autoregressive Masked + Transformers CVPR 2024 + + +
+ Recent advances in generative AI have significantly enhanced image and video
+editing, particularly in the context of text prompt control. State-of-the-art
+approaches predominantly rely on diffusion models to accomplish these tasks.
+However, the computational demands of diffusion-based methods are substantial,
+often necessitating large-scale paired datasets for training, which challenges
+deployment in real applications. To address these issues, this paper breaks
+down the text-based video editing task into two stages. First, we leverage a
+pre-trained text-to-image diffusion model to simultaneously edit a few
+keyframes in a zero-shot manner. Second, we introduce an efficient model called
+MaskINT, which is built on non-autoregressive masked generative transformers
+and specializes in frame interpolation between the edited keyframes, using
+structural guidance from intermediate frames. Experimental results suggest that
+MaskINT achieves performance comparable to diffusion-based methodologies while
+significantly improving inference time. This research offers a practical
+solution for text-based video editing and showcases the potential of
+non-autoregressive masked generative transformers in this domain.
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Kiki or Bouba? Sound Symbolism in Vision-and-Language Models NeurIPS 2023 + + +
+ Although the mapping between sound and meaning in human language is assumed +to be largely arbitrary, research in cognitive science has shown that there are +non-trivial correlations between particular sounds and meanings across +languages and demographic groups, a phenomenon known as sound symbolism. Among +the many dimensions of meaning, sound symbolism is particularly salient and +well-demonstrated with regards to cross-modal associations between language and +the visual domain. In this work, we address the question of whether sound +symbolism is reflected in vision-and-language models such as CLIP and Stable +Diffusion. Using zero-shot knowledge probing to investigate the inherent +knowledge of these models, we find strong evidence that they do show this +pattern, paralleling the well-known kiki-bouba effect in psycholinguistics. Our +work provides a novel method for demonstrating sound symbolism and +understanding its nature using computational tools. Our code will be made +publicly available. + +
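+
+ A schematic, hypothetical probe of the kiki-bouba pattern: score pseudoword
+prompts against a spiky and a round shape by cosine similarity of their
+embeddings. The embed_image and embed_text callables stand in for any
+CLIP-style encoders and are assumptions, not the paper's released code.
+
+import numpy as np
+
+def cosine(a, b):
+    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+def kiki_bouba_probe(embed_image, embed_text, spiky_img, round_img):
+    prompts = {"kiki": "an object called 'kiki'", "bouba": "an object called 'bouba'"}
+    for word, prompt in prompts.items():
+        t = embed_text(prompt)
+        s_spiky = cosine(t, embed_image(spiky_img))
+        s_round = cosine(t, embed_image(round_img))
+        # The kiki-bouba effect predicts kiki ~ spiky and bouba ~ round.
+        print(f"{word}: spiky={s_spiky:.3f} round={s_round:.3f}")
+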
+
+ comment: Accepted to NeurIPS 2023 (spotlight). Project webpage: + https://kiki-bouba.github.io/ +
+
+
+
+
+ + ♻ ☆ Rotated Multi-Scale Interaction Network for Referring Remote Sensing + Image Segmentation CVPR 2024 + + +
+ Referring Remote Sensing Image Segmentation (RRSIS) is a new challenge that +combines computer vision and natural language processing, delineating specific +regions in aerial images as described by textual queries. Traditional Referring +Image Segmentation (RIS) approaches have been impeded by the complex spatial +scales and orientations found in aerial imagery, leading to suboptimal +segmentation results. To address these challenges, we introduce the Rotated +Multi-Scale Interaction Network (RMSIN), an innovative approach designed for +the unique demands of RRSIS. RMSIN incorporates an Intra-scale Interaction +Module (IIM) to effectively address the fine-grained detail required at +multiple scales and a Cross-scale Interaction Module (CIM) for integrating +these details coherently across the network. Furthermore, RMSIN employs an +Adaptive Rotated Convolution (ARC) to account for the diverse orientations of +objects, a novel contribution that significantly enhances segmentation +accuracy. To assess the efficacy of RMSIN, we have curated an expansive dataset +comprising 17,402 image-caption-mask triplets, which is unparalleled in terms +of scale and variety. This dataset not only presents the model with a wide +range of spatial and rotational scenarios but also establishes a stringent +benchmark for the RRSIS task, ensuring a rigorous evaluation of performance. +Our experimental evaluations demonstrate the exceptional performance of RMSIN, +surpassing existing state-of-the-art models by a significant margin. All +datasets and code are made available at https://github.com/Lsan2401/RMSIN. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Bridging the Projection Gap: Overcoming Projection Bias Through + Parameterized Distance Learning + + +
+ Generalized zero-shot learning (GZSL) aims to recognize samples from both +seen and unseen classes using only seen class samples for training. However, +GZSL methods are prone to bias towards seen classes during inference due to the +projection function being learned from seen classes. Most methods focus on +learning an accurate projection, but bias in the projection is inevitable. We +address this projection bias by proposing to learn a parameterized Mahalanobis +distance metric for robust inference. Our key insight is that the distance +computation during inference is critical, even with a biased projection. We +make two main contributions - (1) We extend the VAEGAN (Variational Autoencoder +\& Generative Adversarial Networks) architecture with two branches to +separately output the projection of samples from seen and unseen classes, +enabling more robust distance learning. (2) We introduce a novel loss function +to optimize the Mahalanobis distance representation and reduce projection bias. +Extensive experiments on four datasets show that our approach outperforms +state-of-the-art GZSL techniques with improvements of up to 3.5 \% on the +harmonic mean metric. + +
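+
+ A minimal NumPy sketch of inference with a learned Mahalanobis metric, the
+quantity the abstract optimizes: classify a projected sample by its distance
+d(x, c) = sqrt((x - c)^T M (x - c)) to class prototypes, with M parameterized
+as L^T L to stay positive semi-definite. The shapes and the toy prototypes are
+illustrative assumptions, not the paper's trained components.
+
+import numpy as np
+
+def mahalanobis(x, c, L):
+    diff = L @ (x - c)               # M = L^T L  =>  d^2 = ||L (x - c)||^2
+    return np.sqrt(diff @ diff)
+
+def classify(x, prototypes, L):
+    dists = {label: mahalanobis(x, c, L) for label, c in prototypes.items()}
+    return min(dists, key=dists.get)
+
+d = 8
+L_param = np.eye(d) + 0.1 * np.random.randn(d, d)   # stands in for the learned metric
+protos = {"seen_class": np.zeros(d), "unseen_class": np.ones(d)}
+print(classify(0.8 * np.ones(d), protos, L_param))  # expected: "unseen_class"
+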
+
+ comment: 18 pages, 9 figures +
+
+
+
+
+
+ ♻ ☆ PosterLlama: Bridging Design Ability of Language Model to Content-Aware
+ Layout Generation
+
+
+
+ Visual layout plays a critical role in graphic design fields such as
+advertising, posters, and web UI design. The recent trend towards content-aware
+layout generation through generative models has shown promise, yet it often
+overlooks the semantic intricacies of layout design by treating it as a simple
+numerical optimization. To bridge this gap, we introduce PosterLlama, a network
+designed for generating visually and textually coherent layouts by reformatting
+layout elements into HTML code and leveraging the rich design knowledge
+embedded within language models. Furthermore, we enhance the robustness of our
+model with a unique depth-based poster augmentation strategy. This ensures that
+our generated layouts remain not only semantically rich but also visually
+appealing, even with limited data. Our extensive evaluations across several
+benchmarks demonstrate that PosterLlama outperforms existing methods in
+producing authentic and content-aware layouts. It supports an unparalleled
+range of conditions, including unconditional layout generation,
+element-conditional layout generation, and layout completion, serving as a
+highly versatile user manipulation tool.
+
+
+
+
+
+ + ♻ ☆ Classification for everyone : Building geography agnostic models for + fairer recognition + + +
+ In this paper, we analyze different methods to mitigate the inherent
+geographical biases present in state-of-the-art image classification models. We
+first quantitatively present this bias in two datasets, the Dollar Street
+Dataset and ImageNet, using images with location information. We then present
+different methods which can be employed to reduce this bias. Finally, we
+analyze the effectiveness of the different techniques at making these models
+more robust to the geographical location of the images.
+
+
+ comment: typos corrected, references added +
+
+
+
+
+ + ♻ ☆ Text-to-3D using Gaussian Splatting CVPR 2024 + + +
+ Automatic text-to-3D generation that combines Score Distillation Sampling +(SDS) with the optimization of volume rendering has achieved remarkable +progress in synthesizing realistic 3D objects. Yet most existing text-to-3D +methods by SDS and volume rendering suffer from inaccurate geometry, e.g., the +Janus issue, since it is hard to explicitly integrate 3D priors into implicit +3D representations. Besides, it is usually time-consuming for them to generate +elaborate 3D models with rich colors. In response, this paper proposes GSGEN, a +novel method that adopts Gaussian Splatting, a recent state-of-the-art +representation, to text-to-3D generation. GSGEN aims at generating high-quality +3D objects and addressing existing shortcomings by exploiting the explicit +nature of Gaussian Splatting that enables the incorporation of 3D prior. +Specifically, our method adopts a progressive optimization strategy, which +includes a geometry optimization stage and an appearance refinement stage. In +geometry optimization, a coarse representation is established under 3D point +cloud diffusion prior along with the ordinary 2D SDS optimization, ensuring a +sensible and 3D-consistent rough shape. Subsequently, the obtained Gaussians +undergo an iterative appearance refinement to enrich texture details. In this +stage, we increase the number of Gaussians by compactness-based densification +to enhance continuity and improve fidelity. With these designs, our approach +can generate 3D assets with delicate details and accurate geometry. Extensive +evaluations demonstrate the effectiveness of our method, especially for +capturing high-frequency components. Our code is available at +https://github.com/gsgen3d/gsgen + +
+
+ comment: To appear at CVPR 2024. Project page: https://gsgen3d.github.io. + Code: https://github.com/gsgen3d/gsgen +
+
+
+
+
+ + ♻ ☆ DPA-Net: Structured 3D Abstraction from Sparse Views via Differentiable + Primitive Assembly + + +
+ We present a differentiable rendering framework to learn structured 3D +abstractions in the form of primitive assemblies from sparse RGB images +capturing a 3D object. By leveraging differentiable volume rendering, our +method does not require 3D supervision. Architecturally, our network follows +the general pipeline of an image-conditioned neural radiance field (NeRF) +exemplified by pixelNeRF for color prediction. As our core contribution, we +introduce differential primitive assembly (DPA) into NeRF to output a 3D +occupancy field in place of density prediction, where the predicted occupancies +serve as opacity values for volume rendering. Our network, coined DPA-Net, +produces a union of convexes, each as an intersection of convex quadric +primitives, to approximate the target 3D object, subject to an abstraction loss +and a masking loss, both defined in the image space upon volume rendering. With +test-time adaptation and additional sampling and loss designs aimed at +improving the accuracy and compactness of the obtained assemblies, our method +demonstrates superior performance over state-of-the-art alternatives for 3D +primitive abstraction from sparse views. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ SurMo: Surface-based 4D Motion Modeling for Dynamic Human Rendering CVPR 2024 + + +
+ Dynamic human rendering from video sequences has achieved remarkable progress +by formulating the rendering as a mapping from static poses to human images. +However, existing methods focus on the human appearance reconstruction of every +single frame while the temporal motion relations are not fully explored. In +this paper, we propose a new 4D motion modeling paradigm, SurMo, that jointly +models the temporal dynamics and human appearances in a unified framework with +three key designs: 1) Surface-based motion encoding that models 4D human +motions with an efficient compact surface-based triplane. It encodes both +spatial and temporal motion relations on the dense surface manifold of a +statistical body template, which inherits body topology priors for +generalizable novel view synthesis with sparse training observations. 2) +Physical motion decoding that is designed to encourage physical motion learning +by decoding the motion triplane features at timestep t to predict both spatial +derivatives and temporal derivatives at the next timestep t+1 in the training +stage. 3) 4D appearance decoding that renders the motion triplanes into images +by an efficient volumetric surface-conditioned renderer that focuses on the +rendering of body surfaces with motion learning conditioning. Extensive +experiments validate the state-of-the-art performance of our new paradigm and +illustrate the expressiveness of surface-based motion triplanes for rendering +high-fidelity view-consistent humans with fast motions and even +motion-dependent shadows. Our project page is at: +https://taohuumd.github.io/projects/SurMo/ + +
+
+ comment: Accepted to CVPR 2024. Project Page: + https://taohuumd.github.io/projects/SurMo/ +
+
+
+
+
+ + ♻ ☆ StructLDM: Structured Latent Diffusion for 3D Human Generation + + +
+ Recent 3D human generative models have achieved remarkable progress by +learning 3D-aware GANs from 2D images. However, existing 3D human generative +methods model humans in a compact 1D latent space, ignoring the articulated +structure and semantics of human body topology. In this paper, we explore more +expressive and higher-dimensional latent space for 3D human modeling and +propose StructLDM, a diffusion-based unconditional 3D human generative model, +which is learned from 2D images. StructLDM solves the challenges imposed due to +the high-dimensional growth of latent space with three key designs: 1) A +semantic structured latent space defined on the dense surface manifold of a +statistical human body template. 2) A structured 3D-aware auto-decoder that +factorizes the global latent space into several semantic body parts +parameterized by a set of conditional structured local NeRFs anchored to the +body template, which embeds the properties learned from the 2D training data +and can be decoded to render view-consistent humans under different poses and +clothing styles. 3) A structured latent diffusion model for generative human +appearance sampling. Extensive experiments validate StructLDM's +state-of-the-art generation performance and illustrate the expressiveness of +the structured latent space over the well-adopted 1D latent space. Notably, +StructLDM enables different levels of controllable 3D human generation and +editing, including pose/view/shape control, and high-level tasks including +compositional generations, part-aware clothing editing, 3D virtual try-on, etc. +Our project page is at: https://taohuumd.github.io/projects/StructLDM/. + +
+
+ comment: Project page: https://taohuumd.github.io/projects/StructLDM/ +
+
+
+
+
+ + ♻ ☆ A Comprehensive Review of Knowledge Distillation in Computer Vision + + +
+ Deep learning techniques have been demonstrated to surpass preceding +cutting-edge machine learning techniques in recent years, with computer vision +being one of the most prominent examples. However, deep learning models suffer +from significant drawbacks when deployed in resource-constrained environments +due to their large model size and high complexity. Knowledge Distillation is +one of the prominent solutions to overcome this challenge. This review paper +examines the current state of research on knowledge distillation, a technique +for compressing complex models into smaller and simpler ones. The paper +provides an overview of the major principles and techniques associated with +knowledge distillation and reviews the applications of knowledge distillation +in the domain of computer vision. The review focuses on the benefits of +knowledge distillation, as well as the problems that must be overcome to +improve its effectiveness. + +
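+
+ For context, a standard knowledge-distillation objective of the kind this
+review covers: a temperature-softened KL term between teacher and student
+logits combined with the usual cross-entropy. This is a generic sketch, not
+tied to any single surveyed method; the temperature and weighting are assumed
+values.
+
+import torch
+import torch.nn.functional as F
+
+def distillation_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.5):
+    soft = F.kl_div(
+        F.log_softmax(student_logits / T, dim=-1),
+        F.softmax(teacher_logits / T, dim=-1),
+        reduction="batchmean",
+    ) * (T * T)                                   # classic temperature scaling
+    hard = F.cross_entropy(student_logits, labels)
+    return alpha * soft + (1.0 - alpha) * hard
+
+s, t = torch.randn(16, 10), torch.randn(16, 10)
+y = torch.randint(0, 10, (16,))
+print(distillation_loss(s, t, y).item())
+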
+
+ comment: 37 pages, 10 figures
+
+
+
+
+ + ♻ ☆ Volcano: Mitigating Multimodal Hallucination through Self-Feedback + Guided Revision + + +
+ Large multimodal models suffer from multimodal hallucination, where they
+provide incorrect responses misaligned with the given visual information.
+Recent works have conjectured that one reason behind multimodal hallucination
+is the vision encoder failing to ground on the image properly. To mitigate this
+issue, we propose a novel approach that leverages self-feedback as visual cues.
+Building on this approach, we introduce Volcano, a multimodal self-feedback
+guided revision model. Volcano generates natural language feedback on its
+initial response based on the provided visual information and utilizes this
+feedback to self-revise its initial response. Volcano effectively reduces
+multimodal hallucination and achieves state-of-the-art on MMHal-Bench, POPE,
+and GAVIE. It also improves on general multimodal abilities and outperforms
+previous models on MM-Vet and MMBench. Through qualitative analysis, we show
+that Volcano's feedback is more properly grounded in the image than its initial
+response. This indicates that Volcano can provide itself with richer visual
+information through feedback generation, allowing it to self-correct
+hallucinations. We publicly release our model, data, and code at
+https://github.com/kaistAI/Volcano.
+
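+
+ The feedback-then-revise loop can be sketched as follows; the model callable
+and its prompts are placeholders (assumptions), not Volcano's actual interface.
+
+def self_revise(model, image, question, max_iters=3):
+    answer = model(image, question)
+    for _ in range(max_iters):
+        feedback = model(image, f"Question: {question}\nAnswer: {answer}\n"
+                                "Give feedback on whether the answer matches the image.")
+        revised = model(image, f"Question: {question}\nAnswer: {answer}\n"
+                               f"Feedback: {feedback}\nRevise the answer.")
+        if revised == answer:        # no further change, stop early
+            break
+        answer = revised
+    return answer
+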
+
+
+
+
+ + ♻ ☆ 3D Reconstruction of Interacting Multi-Person in Clothing from a Single + Image WACV 2024 + + +
+ This paper introduces a novel pipeline to reconstruct the geometry of
+interacting multi-person in clothing on a globally coherent scene space from a
+single image. The main challenge arises from occlusion: a part of a human body
+is not visible from a single view due to occlusion by others or the self, which
+introduces missing geometry and physical implausibility (e.g., penetration). We
+overcome this challenge by utilizing two human priors for complete 3D geometry
+and surface contacts. For the geometry prior, an encoder learns to regress the
+image of a person with missing body parts to latent vectors; a decoder decodes
+these vectors to produce 3D features of the associated geometry; and an
+implicit network combines these features with a surface normal map to
+reconstruct complete and detailed 3D humans. For the contact prior, we develop
+an image-space contact detector that outputs a probability distribution of
+surface contacts between people in 3D. We use these priors to globally refine
+the body poses, enabling penetration-free and accurate reconstruction of
+interacting multi-person in clothing in the scene space. The results
+demonstrate that our method is complete, globally coherent, and physically
+plausible compared to existing methods.
+
+
+ comment: Accepted to WACV 2024 +
+
+
+
+
+ + ♻ ☆ TAMM: TriAdapter Multi-Modal Learning for 3D Shape Understanding CVPR 2024 + + +
+ The limited scale of current 3D shape datasets hinders the advancements in 3D +shape understanding, and motivates multi-modal learning approaches which +transfer learned knowledge from data-abundant 2D image and language modalities +to 3D shapes. However, even though the image and language representations have +been aligned by cross-modal models like CLIP, we find that the image modality +fails to contribute as much as the language in existing multi-modal 3D +representation learning methods. This is attributed to the domain shift in the +2D images and the distinct focus of each modality. To more effectively leverage +both modalities in the pre-training, we introduce TriAdapter Multi-Modal +Learning (TAMM) -- a novel two-stage learning approach based on three +synergistic adapters. First, our CLIP Image Adapter mitigates the domain gap +between 3D-rendered images and natural images, by adapting the visual +representations of CLIP for synthetic image-text pairs. Subsequently, our Dual +Adapters decouple the 3D shape representation space into two complementary +sub-spaces: one focusing on visual attributes and the other for semantic +understanding, which ensure a more comprehensive and effective multi-modal +pre-training. Extensive experiments demonstrate that TAMM consistently enhances +3D representations for a wide range of 3D encoder architectures, pre-training +datasets, and downstream tasks. Notably, we boost the zero-shot classification +accuracy on Objaverse-LVIS from 46.8\% to 50.7\%, and improve the 5-way 10-shot +linear probing classification accuracy on ModelNet40 from 96.1\% to 99.0\%. +Project page: https://alanzhangcs.github.io/tamm-page. + +
+
+ comment: This paper is accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Survey of Bias In Text-to-Image Generation: Definition, Evaluation, and + Mitigation + + +
+ The recent advancement of large and powerful models with Text-to-Image (T2I) +generation abilities -- such as OpenAI's DALLE-3 and Google's Gemini -- enables +users to generate high-quality images from textual prompts. However, it has +become increasingly evident that even simple prompts could cause T2I models to +exhibit conspicuous social bias in generated images. Such bias might lead to +both allocational and representational harms in society, further marginalizing +minority groups. Noting this problem, a large body of recent works has been +dedicated to investigating different dimensions of bias in T2I systems. +However, an extensive review of these studies is lacking, hindering a +systematic understanding of current progress and research gaps. We present the +first extensive survey on bias in T2I generative models. In this survey, we +review prior studies on dimensions of bias: Gender, Skintone, and Geo-Culture. +Specifically, we discuss how these works define, evaluate, and mitigate +different aspects of bias. We found that: (1) while gender and skintone biases +are widely studied, geo-cultural bias remains under-explored; (2) most works on +gender and skintone bias investigated occupational association, while other +aspects are less frequently studied; (3) almost all gender bias works overlook +non-binary identities in their studies; (4) evaluation datasets and metrics are +scattered, with no unified framework for measuring biases; and (5) current +mitigation methods fail to resolve biases comprehensively. Based on current +limitations, we point out future research directions that contribute to +human-centric definitions, evaluations, and mitigation of biases. We hope to +highlight the importance of studying biases in T2I systems, as well as +encourage future efforts to holistically understand and tackle biases, building +fair and trustworthy T2I technologies for everyone. + +
+
+
+
+
+ + ♻ ☆ One-Shot Structure-Aware Stylized Image Synthesis CVPR 2024 + + +
+ While GAN-based models have been successful in image stylization tasks, they
+often struggle with structure preservation while stylizing a wide range of
+input images. Recently, diffusion models have been adopted for image
+stylization but still lack the capability to maintain the original quality of
+input images. Building on this, we propose OSASIS: a novel one-shot stylization
+method that is robust in structure preservation. We show that OSASIS is able to
+effectively disentangle the semantics from the structure of an image, allowing
+it to control the level of content and style applied to a given input. We apply
+OSASIS to various experimental settings, including stylization with
+out-of-domain reference images and stylization with text-driven manipulation.
+Results show that OSASIS outperforms other stylization methods, especially for
+input images that were rarely encountered during training, providing a
+promising solution to stylization via diffusion models.
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ LoSh: Long-Short Text Joint Prediction Network for Referring Video + Object Segmentation CVPR2024 + + +
+ Referring video object segmentation (RVOS) aims to segment the target
+instance referred to by a given text expression in a video clip. The text
+expression normally contains a sophisticated description of the instance's
+appearance, action, and relation with others. It is therefore rather difficult
+for an RVOS model to capture all these attributes correspondingly in the video;
+in fact, the model often favours the action- and relation-related visual
+attributes of the instance. This can end up with partial or even incorrect mask
+prediction of the target instance. We tackle this problem by taking a
+subject-centric short text expression from the original long text expression.
+The short one retains only the appearance-related information of the target
+instance so that we can use it to focus the model's attention on the instance's
+appearance. We let the model make joint predictions using both long and short
+text expressions, and we insert a long-short cross-attention module to let the
+joint features interact, together with a long-short prediction intersection
+loss to regulate the joint predictions. Besides the improvement on the
+linguistic part, we also introduce a forward-backward visual consistency loss,
+which utilizes optical flows to warp visual features between the annotated
+frames and their temporal neighbors for consistency. We build our method on top
+of two state-of-the-art pipelines. Extensive experiments on A2D-Sentences,
+Refer-YouTube-VOS, JHMDB-Sentences and Refer-DAVIS17 show impressive
+improvements of our method. Code is available at
+https://github.com/LinfengYuan1997/Losh.
+
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ Flexible filtrations for multiparameter persistent homology detect + digital images + + +
+ Two important problems in the field of Topological Data Analysis are defining
+practical multifiltrations on objects and showing the ability of TDA to detect
+geometry. Motivated by these problems, we construct three multifiltrations
+named multi-GENEO, multi-DGENEO and mix-GENEO, and prove the stability of both
+the interleaving distance and the multiparameter persistence landscape of
+multi-GENEO with respect to the pseudometric of the subspace of bounded
+functions. We also give upper-bound estimates for multi-DGENEO and mix-GENEO.
+Finally, we provide experimental results on the MNIST dataset to demonstrate
+that our bifiltrations have the ability to detect geometric and topological
+differences of digital images.
+
+
+
+
+
+ + ♻ ☆ NEDS-SLAM: A Novel Neural Explicit Dense Semantic SLAM Framework using + 3D Gaussian Splatting + + +
+ We propose NEDS-SLAM, an Explicit Dense semantic SLAM system based on 3D +Gaussian representation, that enables robust 3D semantic mapping, accurate +camera tracking, and high-quality rendering in real-time. In the system, we +propose a Spatially Consistent Feature Fusion model to reduce the effect of +erroneous estimates from pre-trained segmentation head on semantic +reconstruction, achieving robust 3D semantic Gaussian mapping. Additionally, we +employ a lightweight encoder-decoder to compress the high-dimensional semantic +features into a compact 3D Gaussian representation, mitigating the burden of +excessive memory consumption. Furthermore, we leverage the advantage of 3D +Gaussian splatting, which enables efficient and differentiable novel view +rendering, and propose a Virtual Camera View Pruning method to eliminate +outlier GS points, thereby effectively enhancing the quality of scene +representations. Our NEDS-SLAM method demonstrates competitive performance over +existing dense semantic SLAM methods in terms of mapping and tracking accuracy +on Replica and ScanNet datasets, while also showing excellent capabilities in +3D dense semantic mapping. + +
+
+
+
+
+ + ♻ ☆ FairRAG: Fair Human Generation via Fair Retrieval Augmentation CVPR 2024 + + +
+ Existing text-to-image generative models reflect or even amplify societal +biases ingrained in their training data. This is especially concerning for +human image generation where models are biased against certain demographic +groups. Existing attempts to rectify this issue are hindered by the inherent +limitations of the pre-trained models and fail to substantially improve +demographic diversity. In this work, we introduce Fair Retrieval Augmented +Generation (FairRAG), a novel framework that conditions pre-trained generative +models on reference images retrieved from an external image database to improve +fairness in human generation. FairRAG enables conditioning through a +lightweight linear module that projects reference images into the textual +space. To enhance fairness, FairRAG applies simple-yet-effective debiasing +strategies, providing images from diverse demographic groups during the +generative process. Extensive experiments demonstrate that FairRAG outperforms +existing methods in terms of demographic diversity, image-text alignment, and +image fidelity while incurring minimal computational overhead during inference. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Variational Dynamic for Self-Supervised Exploration in Deep + Reinforcement Learning + + +
+ Efficient exploration remains a challenging problem in reinforcement
+learning, especially for tasks where extrinsic rewards from environments are
+sparse or even totally disregarded. Significant advances based on intrinsic
+motivation show promising results in simple environments but often get stuck in
+environments with multimodal and stochastic dynamics. In this work, we propose
+a variational dynamic model based on conditional variational inference to model
+the multimodality and stochasticity. We consider the environmental state-action
+transition as a conditional generative process by generating the next-state
+prediction under the condition of the current state, action, and latent
+variable, which provides a better understanding of the dynamics and leads to
+better exploration performance. We derive an upper bound of the negative
+log-likelihood of the environmental transition and use this upper bound as the
+intrinsic reward for exploration, which allows the agent to learn skills by
+self-supervised exploration without observing extrinsic rewards. We evaluate
+the proposed method on several image-based simulation tasks and a real robotic
+manipulation task. Our method outperforms several state-of-the-art environment
+model-based exploration approaches.
+
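+
+ A hedged sketch of the core recipe: a conditional VAE predicts the next
+state, and its negative evidence lower bound, an upper bound on the
+transition's negative log-likelihood, is used as the intrinsic reward. The
+network sizes, Gaussian assumptions and MSE reconstruction term are
+illustrative choices, not the paper's exact model.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class CVAEDynamics(nn.Module):
+    def __init__(self, s_dim, a_dim, z_dim=8, h=64):
+        super().__init__()
+        self.enc = nn.Linear(s_dim * 2 + a_dim, 2 * z_dim)        # q(z | s, a, s')
+        self.dec = nn.Sequential(nn.Linear(s_dim + a_dim + z_dim, h), nn.ReLU(),
+                                 nn.Linear(h, s_dim))             # p(s' | s, a, z)
+
+    def intrinsic_reward(self, s, a, s_next):
+        mu, logvar = self.enc(torch.cat([s, a, s_next], -1)).chunk(2, -1)
+        z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()
+        recon = self.dec(torch.cat([s, a, z], -1))
+        nll = F.mse_loss(recon, s_next, reduction="none").sum(-1)
+        kl = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()).sum(-1)
+        return (nll + kl).detach()    # upper bound on -log p(s'|s,a) as exploration bonus
+
+model = CVAEDynamics(s_dim=4, a_dim=2)
+print(model.intrinsic_reward(torch.randn(5, 4), torch.randn(5, 2), torch.randn(5, 4)))
+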
+
+ comment: IEEE Transactions on Neural Networks and Learning Systems (TNNLS) + 2021 +
+
+
+
+
+ + ♻ ☆ Text2HOI: Text-guided 3D Motion Generation for Hand-Object Interaction CVPR 2024 + + +
+ This paper introduces the first text-guided work for generating the sequence +of hand-object interaction in 3D. The main challenge arises from the lack of +labeled data where existing ground-truth datasets are nowhere near +generalizable in interaction type and object category, which inhibits the +modeling of diverse 3D hand-object interaction with the correct physical +implication (e.g., contacts and semantics) from text prompts. To address this +challenge, we propose to decompose the interaction generation task into two +subtasks: hand-object contact generation; and hand-object motion generation. +For contact generation, a VAE-based network takes as input a text and an object +mesh, and generates the probability of contacts between the surfaces of hands +and the object during the interaction. The network learns a variety of local +geometry structure of diverse objects that is independent of the objects' +category, and thus, it is applicable to general objects. For motion generation, +a Transformer-based diffusion model utilizes this 3D contact map as a strong +prior for generating physically plausible hand-object motion as a function of +text prompts by learning from the augmented labeled dataset; where we annotate +text labels from many existing 3D hand and object motion data. Finally, we +further introduce a hand refiner module that minimizes the distance between the +object surface and hand joints to improve the temporal stability of the +object-hand contacts and to suppress the penetration artifacts. In the +experiments, we demonstrate that our method can generate more realistic and +diverse interactions compared to other baseline methods. We also show that our +method is applicable to unseen objects. We will release our model and newly +labeled data as a strong foundation for future research. Codes and data are +available in: https://github.com/JunukCha/Text2HOI. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ MMCert: Provable Defense against Adversarial Attacks to Multi-modal + Models CVPR'24 + + +
+ Different from a unimodal model whose input is from a single modality, the +input (called multi-modal input) of a multi-modal model is from multiple +modalities such as image, 3D points, audio, text, etc. Similar to unimodal +models, many existing studies show that a multi-modal model is also vulnerable +to adversarial perturbation, where an attacker could add small perturbation to +all modalities of a multi-modal input such that the multi-modal model makes +incorrect predictions for it. Existing certified defenses are mostly designed +for unimodal models, which achieve sub-optimal certified robustness guarantees +when extended to multi-modal models as shown in our experimental results. In +our work, we propose MMCert, the first certified defense against adversarial +attacks to a multi-modal model. We derive a lower bound on the performance of +our MMCert under arbitrary adversarial attacks with bounded perturbations to +both modalities (e.g., in the context of auto-driving, we bound the number of +changed pixels in both RGB image and depth image). We evaluate our MMCert using +two benchmark datasets: one for the multi-modal road segmentation task and the +other for the multi-modal emotion recognition task. Moreover, we compare our +MMCert with a state-of-the-art certified defense extended from unimodal models. +Our experimental results show that our MMCert outperforms the baseline. + +
+
+ comment: To appear in CVPR'24 +
+
+
+
+
+ + ♻ ☆ Image Captioning in news report scenario + + +
+ Image captioning strives to generate pertinent captions for specified images,
+situating itself at the crossroads of Computer Vision (CV) and Natural Language
+Processing (NLP). This endeavor is of paramount importance with far-reaching
+applications in recommendation systems, news outlets, social media, and beyond.
+Particularly within the realm of news reporting, captions are expected to
+encompass detailed information, such as the identities of celebrities captured
+in the images. However, much of the existing body of work primarily centers
+around understanding scenes and actions. In this paper, we explore image
+captioning specifically tailored for celebrity photographs, illustrating its
+broad potential for enhancing news industry practices. This exploration aims to
+augment automated news content generation, thereby facilitating a more nuanced
+dissemination of information. Our work points to a broader horizon, enriching
+news narratives through a more intuitive image captioning framework.
+
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ InfLoRA: Interference-Free Low-Rank Adaptation for Continual Learning CVPR 2024 + + +
+ Continual learning requires the model to learn multiple tasks sequentially. +In continual learning, the model should possess the ability to maintain its +performance on old tasks (stability) and the ability to adapt to new tasks +continuously (plasticity). Recently, parameter-efficient fine-tuning (PEFT), +which involves freezing a pre-trained model and injecting a small number of +learnable parameters to adapt to downstream tasks, has gained increasing +popularity in continual learning. Although existing continual learning methods +based on PEFT have demonstrated superior performance compared to those not +based on PEFT, most of them do not consider how to eliminate the interference +of the new task on the old tasks, which inhibits the model from making a good +trade-off between stability and plasticity. In this work, we propose a new PEFT +method, called interference-free low-rank adaptation (InfLoRA), for continual +learning. InfLoRA injects a small number of parameters to reparameterize the +pre-trained weights and shows that fine-tuning these injected parameters is +equivalent to fine-tuning the pre-trained weights within a subspace. +Furthermore, InfLoRA designs this subspace to eliminate the interference of the +new task on the old tasks, making a good trade-off between stability and +plasticity. Experimental results show that InfLoRA outperforms existing +state-of-the-art continual learning methods on multiple datasets. + +
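+
+ A generic low-rank adaptation sketch to make the reparameterization concrete:
+the frozen weight receives a trainable rank-r delta, so fine-tuning the
+injected parameters moves the effective weight only within an r-dimensional
+subspace. InfLoRA's specific construction of that subspace (chosen so that new
+tasks do not interfere with old ones) is not reproduced here.
+
+import torch
+import torch.nn as nn
+
+class LowRankLinear(nn.Module):
+    def __init__(self, linear: nn.Linear, r=4):
+        super().__init__()
+        self.base = linear
+        for p in self.base.parameters():
+            p.requires_grad = False                      # freeze pre-trained weights
+        out_f, in_f = linear.weight.shape
+        self.A = nn.Parameter(torch.zeros(out_f, r))
+        self.B = nn.Parameter(torch.randn(r, in_f) * 0.01)
+
+    def forward(self, x):
+        return self.base(x) + x @ (self.A @ self.B).T    # W x + (A B) x
+
+layer = LowRankLinear(nn.Linear(128, 64), r=4)
+print(layer(torch.randn(2, 128)).shape)                  # torch.Size([2, 64])
+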
+
+ comment: Accepted by the 2024 IEEE/CVF Conference on Computer Vision and + Pattern Recognition (CVPR 2024) +
+
+
+
+
+ + ♻ ☆ Interpretable Dimensionality Reduction by Feature Preserving Manifold + Approximation and Projection + + +
+ Nonlinear dimensionality reduction lacks interpretability due to the absence +of source features in low-dimensional embedding space. We propose an +interpretable method featMAP to preserve source features by tangent space +embedding. The core of our proposal is to utilize local singular value +decomposition (SVD) to approximate the tangent space which is embedded to +low-dimensional space by maintaining the alignment. Based on the embedding +tangent space, featMAP enables the interpretability by locally demonstrating +the source features and feature importance. Furthermore, featMAP embeds the +data points by anisotropic projection to preserve the local similarity and +original density. We apply featMAP to interpreting digit classification, object +detection and MNIST adversarial examples. FeatMAP uses source features to +explicitly distinguish the digits and objects and to explain the +misclassification of adversarial examples. We also compare featMAP with other +state-of-the-art methods on local and global metrics. + +
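+
+ The core tangent-space step can be illustrated in a few lines of NumPy:
+approximate the tangent space at a point by the top singular vectors of its
+centered neighborhood. The neighborhood size and dimensions are illustrative
+assumptions, not featMAP's actual settings.
+
+import numpy as np
+
+def local_tangent_space(X, i, k=10, d=2):
+    """Return an orthonormal basis (d row vectors) of the tangent space at X[i]."""
+    dists = np.linalg.norm(X - X[i], axis=1)
+    nbrs = X[np.argsort(dists)[1:k + 1]]        # k nearest neighbors, excluding X[i]
+    centered = nbrs - nbrs.mean(axis=0)
+    _, _, Vt = np.linalg.svd(centered, full_matrices=False)
+    return Vt[:d]                               # rows span the local tangent plane
+
+X = np.random.randn(200, 5)
+print(local_tangent_space(X, i=0).shape)        # (2, 5)
+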
+
+
+
+
+ + ♻ ☆ MCAD: Multi-teacher Cross-modal Alignment Distillation for efficient + image-text retrieval NAACL 2024 + + +
+ Due to the success of large-scale visual-language pretraining (VLP) models
+and the widespread use of image-text retrieval in industry areas, it is now
+critically necessary to reduce model size and streamline mobile-device
+deployment. Single- and dual-stream model structures are commonly used in
+image-text retrieval with the goal of closing the semantic gap between textual
+and visual modalities. While single-stream models use deep feature fusion to
+achieve more accurate cross-modal alignment, dual-stream models are better at
+offline indexing and fast inference. We propose a Multi-teacher Cross-modality
+Alignment Distillation (MCAD) technique to integrate the advantages of single-
+and dual-stream models. By incorporating the fused single-stream features into
+the image and text features of the dual-stream model, we formulate new modified
+teacher similarity distributions and features. Then, we conduct both
+distribution and feature distillation to boost the capability of the student
+dual-stream model, achieving high retrieval performance without increasing
+inference complexity. Extensive experiments demonstrate the remarkable
+performance and high efficiency of MCAD on image-text retrieval tasks.
+Furthermore, we implement a lightweight CLIP model on Snapdragon/Dimensity
+chips with only ~100M running memory and ~8.0 ms search latency, achieving
+mobile-device deployment of VLP models.
+
+
+ comment: Accepted by NAACL 2024 Findings +
+
+
+
+
+ + ♻ ☆ Efficient End-to-End Visual Document Understanding with Rationale + Distillation NAACL 2024 + + +
+ Understanding visually situated language requires interpreting complex +layouts of textual and visual elements. Pre-processing tools, such as optical +character recognition (OCR), can map document image inputs to textual tokens, +then large language models (LLMs) can reason over text. However, such methods +have high computational and engineering complexity. Can small pretrained +image-to-text models accurately understand visual documents through similar +recognition and reasoning steps instead? We propose Rationale Distillation +(RD), which incorporates the outputs of OCR tools, LLMs, and larger multimodal +models as intermediate "rationales", and trains a small student model to +predict both rationales and answers. On three visual document understanding +benchmarks representing infographics, scanned documents, and figures, our +Pix2Struct (282M parameters) student model finetuned with RD outperforms the +base model by 4-5% absolute accuracy with only 1% higher computational cost. + +
+
+ comment: Accepted by NAACL 2024 +
+
+
+
+
+ + ♻ ☆ OSCaR: Object State Captioning and State Change Representation NAACL 2024 + + +
+ The capability of intelligent models to extrapolate and comprehend changes in +object states is a crucial yet demanding aspect of AI research, particularly +through the lens of human interaction in real-world settings. This task +involves describing complex visual environments, identifying active objects, +and interpreting their changes as conveyed through language. Traditional +methods, which isolate object captioning and state change detection, offer a +limited view of dynamic environments. Moreover, relying on a small set of +symbolic words to represent changes has restricted the expressiveness of the +language. To address these challenges, in this paper, we introduce the Object +State Captioning and State Change Representation (OSCaR) dataset and benchmark. +OSCaR consists of 14,084 annotated video segments with nearly 1,000 unique +objects from various egocentric video collections. It sets a new testbed for +evaluating multimodal large language models (MLLMs). Our experiments +demonstrate that while MLLMs show some skill, they lack a full understanding of +object state changes. The benchmark includes a fine-tuned model that, despite +initial capabilities, requires significant improvements in accuracy and +generalization ability for effective understanding of these changes. Our code +and dataset are available at https://github.com/nguyennm1024/OSCaR. + +
+
+ comment: NAACL 2024 +
+
+
+
+
+ + ♻ ☆ SeiT++: Masked Token Modeling Improves Storage-efficient Training + + +
+ Recent advancements in Deep Neural Network (DNN) models have significantly +improved performance across computer vision tasks. However, achieving highly +generalizable and high-performing vision models requires expansive datasets, +resulting in significant storage requirements. This storage challenge is a +critical bottleneck for scaling up models. A recent breakthrough by SeiT +proposed the use of Vector-Quantized (VQ) feature vectors (i.e., tokens) as +network inputs for vision classification. This approach achieved 90% of the +performance of a model trained on full-pixel images with only 1% of the +storage. While SeiT needs labeled data, its potential in scenarios beyond fully +supervised learning remains largely untapped. In this paper, we extend SeiT by +integrating Masked Token Modeling (MTM) for self-supervised pre-training. +Recognizing that self-supervised approaches often demand more data due to the +lack of labels, we introduce TokenAdapt and ColorAdapt. These methods +facilitate comprehensive token-friendly data augmentation, effectively +addressing the increased data requirements of self-supervised learning. We +evaluate our approach across various scenarios, including storage-efficient +ImageNet-1k classification, fine-grained classification, ADE-20k semantic +segmentation, and robustness benchmarks. Experimental results demonstrate +consistent performance improvement in diverse experiments, validating the +effectiveness of our method. Code is available at +https://github.com/naver-ai/tokenadapt. + +
+
+ comment: First two authors contributed equally +
+
+
+
+
+ + ♻ ☆ Is Synthetic Image Useful for Transfer Learning? An Investigation into + Data Generation, Volume, and Utilization ICLR24 + + +
+ Synthetic image data generation represents a promising avenue for training
+deep learning models, particularly in the realm of transfer learning, where
+obtaining real images within a specific domain can be prohibitively expensive
+due to privacy and intellectual property considerations. This work delves into
+the generation and utilization of synthetic images derived from text-to-image
+generative models in facilitating transfer learning paradigms. Despite the high
+visual fidelity of the generated images, we observe that their naive
+incorporation into existing real-image datasets does not consistently enhance
+model performance due to the inherent distribution gap between synthetic and
+real images. To address this issue, we introduce a novel two-stage framework
+called bridged transfer, which initially employs synthetic images for
+fine-tuning a pre-trained model to improve its transferability and subsequently
+uses real data for rapid adaptation. Alongside, we propose a dataset style
+inversion strategy to improve the stylistic alignment between synthetic and
+real images. Our proposed methods are evaluated across 10 different datasets
+and 5 distinct models, demonstrating consistent improvements, with up to a 30%
+accuracy increase on classification tasks. Intriguingly, we note that the
+enhancements have not yet saturated, indicating that the benefits may further
+increase with an expanded volume of synthetic data.
+
+
+ comment: ICLR24 Score 6865 https://openreview.net/forum?id=CjPt1AC6w0 +
+
+
+
+
+ + ♻ ☆ Gemini: A Family of Highly Capable Multimodal Models + + +
+ This report introduces a new family of multimodal models, Gemini, that +exhibit remarkable capabilities across image, audio, video, and text +understanding. The Gemini family consists of Ultra, Pro, and Nano sizes, +suitable for applications ranging from complex reasoning tasks to on-device +memory-constrained use-cases. Evaluation on a broad range of benchmarks shows +that our most-capable Gemini Ultra model advances the state of the art in 30 of +32 of these benchmarks - notably being the first model to achieve human-expert +performance on the well-studied exam benchmark MMLU, and improving the state of +the art in every one of the 20 multimodal benchmarks we examined. We believe +that the new capabilities of the Gemini family in cross-modal reasoning and +language understanding will enable a wide variety of use cases. We discuss our +approach toward post-training and deploying Gemini models responsibly to users +through services including Gemini, Gemini Advanced, Google AI Studio, and Cloud +Vertex AI. + +
+
+
+
+
+ + ♻ ☆ Large Language Models are Good Prompt Learners for Low-Shot Image + Classification CVPR 2024 + + +
+ Low-shot image classification, where training images are limited or +inaccessible, has benefited from recent progress on pre-trained vision-language +(VL) models with strong generalizability, e.g. CLIP. Prompt learning methods +built with VL models generate text features from the class names that only have +confined class-specific information. Large Language Models (LLMs), with their +vast encyclopedic knowledge, emerge as the complement. Thus, in this paper, we +discuss the integration of LLMs to enhance pre-trained VL models, specifically +on low-shot classification. However, the domain gap between language and vision +blocks the direct application of LLMs. Thus, we propose LLaMP, Large Language +Models as Prompt learners, that produces adaptive prompts for the CLIP text +encoder, establishing it as the connecting bridge. Experiments show that, +compared with other state-of-the-art prompt learning methods, LLaMP yields +better performance on both zero-shot generalization and few-shot image +classification, over a spectrum of 11 datasets. Code will be made available at: +https://github.com/zhaohengz/LLaMP. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ EarthNets: Empowering AI in Earth Observation + + +
+ Earth observation (EO), aiming at monitoring the state of planet Earth using +remote sensing data, is critical for improving our daily lives and living +environment. With a growing number of satellites in orbit, an increasing number +of datasets with diverse sensors and research domains are being published to +facilitate the research of the remote sensing community. This paper presents a +comprehensive review of more than 500 publicly published datasets, including +research domains like agriculture, land use and land cover, disaster +monitoring, scene understanding, vision-language models, foundation models, +climate change, and weather forecasting. We systematically analyze these EO +datasets from four aspects: volume, resolution distributions, research domains, +and the correlation between datasets. Based on the dataset attributes, we +propose to measure, rank, and select datasets to build a new benchmark for +model evaluation. Furthermore, a new platform for EO, termed EarthNets, is +released to achieve a fair and consistent evaluation of deep learning methods +on remote sensing data. EarthNets supports standard dataset libraries and +cutting-edge deep learning models to bridge the gap between the remote sensing +and machine learning communities. Based on this platform, extensive +deep-learning methods are evaluated on the new benchmark. The insightful +results are beneficial to future research. The platform and dataset collections +are publicly available at https://earthnets.github.io. + +
+
+ comment: 30 pages +
+
+
+
+
+ + ♻ ☆ Visual Anagrams: Generating Multi-View Optical Illusions with Diffusion + Models CVPR 2024 + + +
+ We address the problem of synthesizing multi-view optical illusions: images +that change appearance upon a transformation, such as a flip or rotation. We +propose a simple, zero-shot method for obtaining these illusions from +off-the-shelf text-to-image diffusion models. During the reverse diffusion +process, we estimate the noise from different views of a noisy image, and then +combine these noise estimates together and denoise the image. A theoretical +analysis suggests that this method works precisely for views that can be +written as orthogonal transformations, of which permutations are a subset. This +leads to the idea of a visual anagram--an image that changes appearance under +some rearrangement of pixels. This includes rotations and flips, but also more +exotic pixel permutations such as a jigsaw rearrangement. Our approach also +naturally extends to illusions with more than two views. We provide both +qualitative and quantitative results demonstrating the effectiveness and +flexibility of our method. Please see our project webpage for additional +visualizations and results: https://dangeng.github.io/visual_anagrams/ + +
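+
+ A hypothetical sketch of the multi-view denoising step described above:
+estimate the noise in each transformed view, map the estimates back with the
+inverse transformation, and average them before denoising. The denoiser
+callable and the choice of views are placeholders, not the released
+implementation.
+
+import torch
+
+def anagram_noise_estimate(x_t, t, denoiser, views, inverse_views, prompts):
+    estimates = []
+    for view, inv, prompt in zip(views, inverse_views, prompts):
+        eps = denoiser(view(x_t), t, prompt)    # predict noise in the transformed view
+        estimates.append(inv(eps))              # map the estimate back to the base view
+    return torch.stack(estimates).mean(0)       # combined estimate used to denoise x_t
+
+# Example pair of orthogonal views: identity and a vertical flip (its own inverse).
+views = [lambda x: x, lambda x: torch.flip(x, dims=[-2])]
+inverse_views = views
+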
+
+ comment: CVPR 2024 camera ready +
+
+
+
+
+ + ♻ ☆ SignAvatars: A Large-scale 3D Sign Language Holistic Motion Dataset and + Benchmark + + +
+ We present SignAvatars, the first large-scale, multi-prompt 3D sign language +(SL) motion dataset designed to bridge the communication gap for Deaf and +hard-of-hearing individuals. While there has been an exponentially growing +number of research regarding digital communication, the majority of existing +communication technologies primarily cater to spoken or written languages, +instead of SL, the essential communication method for Deaf and hard-of-hearing +communities. Existing SL datasets, dictionaries, and sign language production +(SLP) methods are typically limited to 2D as annotating 3D models and avatars +for SL is usually an entirely manual and labor-intensive process conducted by +SL experts, often resulting in unnatural avatars. In response to these +challenges, we compile and curate the SignAvatars dataset, which comprises +70,000 videos from 153 signers, totaling 8.34 million frames, covering both +isolated signs and continuous, co-articulated signs, with multiple prompts +including HamNoSys, spoken language, and words. To yield 3D holistic +annotations, including meshes and biomechanically-valid poses of body, hands, +and face, as well as 2D and 3D keypoints, we introduce an automated annotation +pipeline operating on our large corpus of SL videos. SignAvatars facilitates +various tasks such as 3D sign language recognition (SLR) and the novel 3D SL +production (SLP) from diverse inputs like text scripts, individual words, and +HamNoSys notation. Hence, to evaluate the potential of SignAvatars, we further +propose a unified benchmark of 3D SL holistic motion production. We believe +that this work is a significant step forward towards bringing the digital world +to the Deaf and hard-of-hearing communities as well as people interacting with +them. + +
+
+ comment: 14 pages; Project page available at https://signavatars.github.io/ +
+
+
+
+
+ + ♻ ☆ CFIR: Fast and Effective Long-Text To Image Retrieval for Large Corpora + + +
+ Text-to-image retrieval aims to find the relevant images based on a text +query, which is important in various use-cases, such as digital libraries, +e-commerce, and multimedia databases. Although Multimodal Large Language Models +(MLLMs) demonstrate state-of-the-art performance, they exhibit limitations in +handling large-scale, diverse, and ambiguous real-world needs of retrieval, due +to the computation cost and the injective embeddings they produce. This paper +presents a two-stage Coarse-to-Fine Index-shared Retrieval (CFIR) framework, +designed for fast and effective large-scale long-text to image retrieval. The +first stage, Entity-based Ranking (ER), adapts to long-text query ambiguity by +employing a multiple-queries-to-multiple-targets paradigm, facilitating +candidate filtering for the next stage. The second stage, Summary-based +Re-ranking (SR), refines these rankings using summarized queries. We also +propose a specialized Decoupling-BEiT-3 encoder, optimized for handling +ambiguous user needs and both stages, which also enhances computational +efficiency through vector-based similarity inference. Evaluation on the AToMiC +dataset reveals that CFIR surpasses existing MLLMs by up to 11.06% in +Recall@1000, while reducing training and retrieval times by 68.75% and 99.79%, +respectively. We will release our code to facilitate future research at +https://github.com/longkukuhi/CFIR. + +
+
+
+
+
+ + ♻ ☆ MetaCloak: Preventing Unauthorized Subject-driven Text-to-image + Diffusion-based Synthesis via Meta-learning CVPR 2024 + + +
+ Text-to-image diffusion models allow seamless generation of personalized +images from scant reference photos. Yet, these tools, in the wrong hands, can +fabricate misleading or harmful content, endangering individuals. To address +this problem, existing poisoning-based approaches perturb user images in an +imperceptible way to render them "unlearnable" from malicious uses. We identify +two limitations of these defending approaches: i) sub-optimal due to the +hand-crafted heuristics for solving the intractable bilevel optimization and +ii) lack of robustness against simple data transformations like Gaussian +filtering. To solve these challenges, we propose MetaCloak, which solves the +bi-level poisoning problem with a meta-learning framework with an additional +transformation sampling process to craft transferable and robust perturbation. +Specifically, we employ a pool of surrogate diffusion models to craft +transferable and model-agnostic perturbation. Furthermore, by incorporating an +additional transformation process, we design a simple denoising-error +maximization loss that is sufficient for causing transformation-robust semantic +distortion and degradation in a personalized generation. Extensive experiments +on the VGGFace2 and CelebA-HQ datasets show that MetaCloak outperforms existing +approaches. Notably, MetaCloak can successfully fool online training services +like Replicate, in a black-box manner, demonstrating the effectiveness of +MetaCloak in real-world scenarios. Our code is available at +https://github.com/liuyixin-louis/MetaCloak. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ A Novel Benchmark for Few-Shot Semantic Segmentation in the Era of + Foundation Models + + +
+ In recent years, the rapid evolution of computer vision has seen the
+emergence of various foundation models, each tailored to specific data types
+and tasks. In this study, we explore the adaptation of these models for
+few-shot semantic segmentation. Specifically, we conduct a comprehensive
+comparative analysis of four prominent foundation models (DINO V2, Segment
+Anything, CLIP, and Masked AutoEncoders), as well as a straightforward ResNet50
+pre-trained on the COCO dataset. We also include five adaptation methods,
+ranging from linear probing to fine-tuning. Our findings show that DINO V2
+outperforms other models by a large margin, across various datasets and
+adaptation methods. On the other hand, the choice of adaptation method makes
+little difference to the obtained results, suggesting that simple linear
+probing can compete with more advanced, computationally intensive alternatives.
+
+
+
+
+
+ + ♻ ☆ Readout Guidance: Learning Control from Diffusion Features CVPR 2024 + + +
+ We present Readout Guidance, a method for controlling text-to-image diffusion +models with learned signals. Readout Guidance uses readout heads, lightweight +networks trained to extract signals from the features of a pre-trained, frozen +diffusion model at every timestep. These readouts can encode single-image +properties, such as pose, depth, and edges; or higher-order properties that +relate multiple images, such as correspondence and appearance similarity. +Furthermore, by comparing the readout estimates to a user-defined target, and +back-propagating the gradient through the readout head, these estimates can be +used to guide the sampling process. Compared to prior methods for conditional +generation, Readout Guidance requires significantly fewer added parameters and +training samples, and offers a convenient and simple recipe for reproducing +different forms of conditional control under a single framework, with a single +architecture and sampling procedure. We showcase these benefits in the +applications of drag-based manipulation, identity-consistent generation, and +spatially aligned control. Project page: https://readout-guidance.github.io. + +
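+
+ The guidance step itself is just a gradient of a readout-versus-target loss
+with respect to the current sample. The toy module below stands in for the
+frozen diffusion model and a trained readout head (both hypothetical here); it
+only illustrates how the comparison to a user-defined target turns into an
+update of the sample.
+
+```python
+import torch
+import torch.nn as nn
+
+class ToyDenoiser(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.features = nn.Conv2d(4, 32, 3, padding=1)     # frozen feature extractor
+        self.out = nn.Conv2d(32, 4, 3, padding=1)
+    def forward(self, x):
+        feats = torch.relu(self.features(x))
+        return self.out(feats), feats
+
+denoiser = ToyDenoiser().eval()
+readout_head = nn.Conv2d(32, 1, 1)                         # lightweight readout (e.g. a depth proxy)
+target = torch.zeros(1, 1, 16, 16)                         # user-defined readout target
+
+latent = torch.randn(1, 4, 16, 16, requires_grad=True)
+guidance_scale = 1.0
+for _ in range(10):
+    eps_pred, feats = denoiser(latent)
+    readout = readout_head(feats)                          # estimate the property from features
+    loss = ((readout - target) ** 2).mean()                # compare readout to the target
+    grad, = torch.autograd.grad(loss, latent)
+    with torch.no_grad():
+        # Guided nudge; a real sampler would combine this with the denoising update.
+        latent -= guidance_scale * grad
+print("final readout loss:", float(loss))
+```
+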
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Omni-SMoLA: Boosting Generalist Multimodal Models with Soft Mixture of + Low-rank Experts + + +
+ Large multi-modal models (LMMs) exhibit remarkable performance across +numerous tasks. However, generalist LMMs often suffer from performance +degradation when tuned over a large collection of tasks. Recent research +suggests that Mixture of Experts (MoE) architectures are useful for instruction +tuning, but for LMMs of parameter size around O(50-100B), the prohibitive cost +of replicating and storing the expert models severely limits the number of +experts we can use. We propose Omni-SMoLA, an architecture that uses the Soft +MoE approach to (softly) mix many multimodal low rank experts, and avoids +introducing a significant number of new parameters compared to conventional MoE +models. The core intuition here is that the large model provides a foundational +backbone, while different lightweight experts residually learn specialized +knowledge, either per-modality or multimodally. Extensive experiments +demonstrate that the SMoLA approach helps improve the generalist performance +across a broad range of generative vision-and-language tasks, achieving new +SoTA generalist performance that often matches or outperforms single +specialized LMM baselines, as well as new SoTA specialist performance. + +
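+
+ The core idea, a frozen backbone projection plus a softly weighted set of
+low-rank residual experts, can be written compactly. The module below is a
+hypothetical single-layer sketch (names, rank, and routing are illustrative,
+not the Omni-SMoLA implementation).
+
+```python
+import torch
+import torch.nn as nn
+
+class SoftLoRAMixture(nn.Module):
+    """Frozen base projection plus a soft mixture of low-rank residual experts."""
+    def __init__(self, d_in, d_out, num_experts=4, rank=8):
+        super().__init__()
+        self.base = nn.Linear(d_in, d_out)
+        self.base.requires_grad_(False)                 # backbone weights stay frozen
+        self.A = nn.Parameter(torch.randn(num_experts, d_in, rank) * 0.02)
+        self.B = nn.Parameter(torch.zeros(num_experts, rank, d_out))
+        self.router = nn.Linear(d_in, num_experts)      # soft, per-token expert weights
+
+    def forward(self, x):                               # x: (batch, tokens, d_in)
+        weights = self.router(x).softmax(dim=-1)        # (B, T, num_experts)
+        low_rank = torch.einsum('bti,eir,ero->bteo', x, self.A, self.B)
+        residual = (weights.unsqueeze(-1) * low_rank).sum(dim=2)
+        return self.base(x) + residual                  # experts add a residual correction
+
+layer = SoftLoRAMixture(64, 64)
+print(layer(torch.randn(2, 10, 64)).shape)              # torch.Size([2, 10, 64])
+```
+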
+
+
+
+
+ + ♻ ☆ Computational limits to the legibility of the imaged human brain + + +
+ Our knowledge of the organisation of the human brain at the population-level +is yet to translate into power to predict functional differences at the +individual-level, limiting clinical applications, and casting doubt on the +generalisability of inferred mechanisms. It remains unknown whether the +difficulty arises from the absence of individuating biological patterns within +the brain, or from limited power to access them with the models and compute at +our disposal. Here we comprehensively investigate the resolvability of such +patterns with data and compute at unprecedented scale. Across 23 810 unique +participants from UK Biobank, we systematically evaluate the predictability of +25 individual biological characteristics, from all available combinations of +structural and functional neuroimaging data. Over 4526 GPU hours of +computation, we train, optimize, and evaluate out-of-sample 700 individual +predictive models, including fully-connected feed-forward neural networks of +demographic, psychological, serological, chronic disease, and functional +connectivity characteristics, and both uni- and multi-modal 3D convolutional +neural network models of macro- and micro-structural brain imaging. We find a +marked discrepancy between the high predictability of sex (balanced accuracy +99.7%), age (mean absolute error 2.048 years, R2 0.859), and weight (mean +absolute error 2.609Kg, R2 0.625), for which we set new state-of-the-art +performance, and the surprisingly low predictability of other characteristics. +Neither structural nor functional imaging predicted psychology better than the +coincidence of chronic disease (p<0.05). Serology predicted chronic disease +(p<0.05) and was best predicted by it (p<0.001), followed by structural +neuroimaging (p<0.05). Our findings suggest either more informative imaging or +more powerful models are needed to decipher individual level characteristics +from the human brain. + +
+
+ comment: 38 pages, 6 figures, 1 table, 2 supplementary figures, 1 + supplementary table +
+
+
+
+
+ + ♻ ☆ Diffusion 3D Features (Diff3F): Decorating Untextured Shapes with + Distilled Semantic Features CVPR'24 + + +
+ We present Diff3F as a simple, robust, and class-agnostic feature descriptor +that can be computed for untextured input shapes (meshes or point clouds). Our +method distills diffusion features from image foundational models onto input +shapes. Specifically, we use the input shapes to produce depth and normal maps +as guidance for conditional image synthesis. In the process, we produce +(diffusion) features in 2D that we subsequently lift and aggregate on the +original surface. Our key observation is that even if the conditional image +generations obtained from multi-view rendering of the input shapes are +inconsistent, the associated image features are robust and, hence, can be +directly aggregated across views. This produces semantic features on the input +shapes, without requiring additional data or training. We perform extensive +experiments on multiple benchmarks (SHREC'19, SHREC'20, FAUST, and TOSCA) and +demonstrate that our features, being semantic instead of geometric, produce +reliable correspondence across both isometric and non-isometrically related +shape families. Code is available via the project page at +https://diff3f.github.io/ + +
+
+ comment: Accepted at CVPR'24 +
+
+
+
+
+ + ♻ ☆ UniBEV: Multi-modal 3D Object Detection with Uniform BEV Encoders for + Robustness against Missing Sensor Modalities + + +
+ Multi-sensor object detection is an active research topic in automated +driving, but the robustness of such detection models against missing sensor +input (modality missing), e.g., due to a sudden sensor failure, is a critical +problem which remains under-studied. In this work, we propose UniBEV, an +end-to-end multi-modal 3D object detection framework designed for robustness +against missing modalities: UniBEV can operate on LiDAR plus camera input, but +also on LiDAR-only or camera-only input without retraining. To facilitate its +detector head to handle different input combinations, UniBEV aims to create +well-aligned Bird's Eye View (BEV) feature maps from each available modality. +Unlike prior BEV-based multi-modal detection methods, all sensor modalities +follow a uniform approach to resample features from the native sensor +coordinate systems to the BEV features. We furthermore investigate the +robustness of various fusion strategies w.r.t. missing modalities: the commonly +used feature concatenation, but also channel-wise averaging, and a +generalization to weighted averaging termed Channel Normalized Weights. To +validate its effectiveness, we compare UniBEV to state-of-the-art BEVFusion and +MetaBEV on nuScenes over all sensor input combinations. In this setting, UniBEV +achieves $52.5 \%$ mAP on average over all input combinations, significantly +improving over the baselines ($43.5 \%$ mAP on average for BEVFusion, $48.7 \%$ +mAP on average for MetaBEV). An ablation study shows the robustness benefits of +fusing by weighted averaging over regular concatenation, and of sharing queries +between the BEV encoders of each modality. Our code will be released upon paper +acceptance. + +
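+
+ As an illustration of the Channel Normalized Weights idea, the module below
+fuses per-modality BEV maps with learned per-channel weights that are
+re-normalized over whichever modalities are actually present. It is a
+hypothetical sketch, not the to-be-released UniBEV code.
+
+```python
+import torch
+import torch.nn as nn
+
+class ChannelNormalizedWeights(nn.Module):
+    def __init__(self, num_modalities=2, channels=256):
+        super().__init__()
+        self.logits = nn.Parameter(torch.zeros(num_modalities, channels))
+
+    def forward(self, bev_maps, available):
+        # bev_maps: one (B, C, H, W) map per modality; available: which sensors delivered data.
+        idx = [i for i, ok in enumerate(available) if ok]
+        w = self.logits[idx].softmax(dim=0)                # normalize weights over present modalities
+        stacked = torch.stack([bev_maps[i] for i in idx])  # (M, B, C, H, W)
+        return (w[:, None, :, None, None] * stacked).sum(dim=0)
+
+fusion = ChannelNormalizedWeights()
+lidar_bev = torch.randn(1, 256, 128, 128)
+cam_bev = torch.randn(1, 256, 128, 128)
+print(fusion([lidar_bev, cam_bev], available=[True, True]).shape)   # both sensors present
+print(fusion([lidar_bev, cam_bev], available=[True, False]).shape)  # camera dropped, no retraining
+```
+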
+
+ comment: Accepted by IEEE Intelligent Vehicles Symposium (IV 2024) +
+
+
+
+
+ + ♻ ☆ Understanding Video Transformers via Universal Concept Discovery CVPR 2024 + + +
+ This paper studies the problem of concept-based interpretability of
+transformer representations for videos. Concretely, we seek to explain the
+decision-making process of video transformers based on high-level,
+spatiotemporal concepts that are automatically discovered. Prior research on
+concept-based interpretability has concentrated solely on image-level tasks.
+Comparatively, video models deal with the added temporal dimension, increasing
+complexity and posing challenges in identifying dynamic concepts over time. In
+this work, we systematically address these challenges by introducing the first
+Video Transformer Concept Discovery (VTCD) algorithm. To this end, we propose
+an efficient approach for unsupervised identification of units of video
+transformer representations (concepts) and for ranking their importance to the
+output of a model. The resulting concepts are highly interpretable, revealing
+spatio-temporal reasoning mechanisms and object-centric representations in
+unstructured video models. Performing this analysis jointly over a diverse set
+of supervised and self-supervised representations, we discover that some of
+these mechanisms are universal in video transformers. Finally, we show that
+VTCD can be used for fine-grained action recognition and video object
+segmentation.
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Limitations of Data-Driven Spectral Reconstruction -- Optics-Aware + Analysis and Mitigation + + +
+ Hyperspectral imaging empowers machine vision systems with the distinct
+capability of identifying materials through recording their spectral
+signatures. Recent efforts in data-driven spectral reconstruction aim at
+extracting spectral information from RGB images captured by cost-effective RGB
+cameras, instead of dedicated hardware.
+ In this paper, we systematically analyze the performance of such methods,
+evaluating both the practical limitations with respect to current datasets and
+overfitting, as well as fundamental limitations with respect to the nature of
+the information encoded in the RGB images, and the dependency of this
+information on the optical system of the camera.
+ We find that the current models are not robust under slight variations, e.g.,
+in noise level or compression of the RGB file. Without modeling
+underrepresented spectral content, existing datasets and the models trained on
+them are limited in their ability to cope with challenging metameric colors. To
+mitigate this issue, we propose to exploit the combination of metameric data
+augmentation and optical lens aberrations to improve the encoding of the
+metameric information into the RGB image, which paves the way toward
+higher-performing spectral imaging and reconstruction approaches.
+
+
+ comment: 12 pages, 7 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ Language-Informed Visual Concept Learning ICLR 2024 + + +
+ Our understanding of the visual world is centered around various concept +axes, characterizing different aspects of visual entities. While different +concept axes can be easily specified by language, e.g. color, the exact visual +nuances along each axis often exceed the limitations of linguistic +articulations, e.g. a particular style of painting. In this work, our goal is +to learn a language-informed visual concept representation, by simply +distilling large pre-trained vision-language models. Specifically, we train a +set of concept encoders to encode the information pertinent to a set of +language-informed concept axes, with an objective of reproducing the input +image through a pre-trained Text-to-Image (T2I) model. To encourage better +disentanglement of different concept encoders, we anchor the concept embeddings +to a set of text embeddings obtained from a pre-trained Visual Question +Answering (VQA) model. At inference time, the model extracts concept embeddings +along various axes from new test images, which can be remixed to generate +images with novel compositions of visual concepts. With a lightweight test-time +finetuning procedure, it can also generalize to novel concepts unseen at +training. + +
+
+ comment: ICLR 2024. The first two authors contributed equally and are + alphabetically ordered. Project page: + https://ai.stanford.edu/~yzzhang/projects/concept-axes/ +
+
+
+
+
+ + ♻ ☆ TAO-Amodal: A Benchmark for Tracking Any Object Amodally + + +
+ Amodal perception, the ability to comprehend complete object structures from +partial visibility, is a fundamental skill, even for infants. Its significance +extends to applications like autonomous driving, where a clear understanding of +heavily occluded objects is essential. However, modern detection and tracking +algorithms often overlook this critical capability, perhaps due to the +prevalence of \textit{modal} annotations in most benchmarks. To address the +scarcity of amodal benchmarks, we introduce TAO-Amodal, featuring 833 diverse +categories in thousands of video sequences. Our dataset includes +\textit{amodal} and modal bounding boxes for visible and partially or fully +occluded objects, including those that are partially out of the camera frame. +We investigate the current lay of the land in both amodal tracking and +detection by benchmarking state-of-the-art modal trackers and amodal +segmentation methods. We find that existing methods, even when adapted for +amodal tracking, struggle to detect and track objects under heavy occlusion. To +mitigate this, we explore simple finetuning schemes that can increase the +amodal tracking and detection metrics of occluded objects by 2.1\% and 3.3\%. + +
+
+ comment: Project Page: https://tao-amodal.github.io +
+
+
+
+
+ + ♻ ☆ Exploiting Diffusion Prior for Generalizable Dense Prediction CVPR 2024 + + +
+ Contents generated by recent advanced Text-to-Image (T2I) diffusion models +are sometimes too imaginative for existing off-the-shelf dense predictors to +estimate due to the immitigable domain gap. We introduce DMP, a pipeline +utilizing pre-trained T2I models as a prior for dense prediction tasks. To +address the misalignment between deterministic prediction tasks and stochastic +T2I models, we reformulate the diffusion process through a sequence of +interpolations, establishing a deterministic mapping between input RGB images +and output prediction distributions. To preserve generalizability, we use +low-rank adaptation to fine-tune pre-trained models. Extensive experiments +across five tasks, including 3D property estimation, semantic segmentation, and +intrinsic image decomposition, showcase the efficacy of the proposed method. +Despite limited-domain training data, the approach yields faithful estimations +for arbitrary images, surpassing existing state-of-the-art algorithms. + +
+
+ comment: To appear in CVPR 2024. Project page: https://shinying.github.io/dmp +
+
+
+
+
+ + ♻ ☆ High-performance real-world optical computing trained by in situ + model-free optimization + + +
+ Optical computing systems provide high-speed and low-energy data processing +but face deficiencies in computationally demanding training and +simulation-to-reality gaps. We propose a gradient-based model-free optimization +(G-MFO) method based on a Monte Carlo gradient estimation algorithm for +computationally efficient in situ training of optical computing systems. This +approach treats an optical computing system as a black box and back-propagates +the loss directly to the optical computing weights' probability distributions, +circumventing the need for a computationally heavy and biased system +simulation. Our experiments on diffractive optical computing systems show that +G-MFO outperforms hybrid training on the MNIST and FMNIST datasets. +Furthermore, we demonstrate image-free and high-speed classification of cells +from their marker-free phase maps. Our method's model-free and high-performance +nature, combined with its low demand for computational resources, paves the way +for accelerating the transition of optical computing from laboratory +demonstrations to practical, real-world applications. + +
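+
+ The backbone of such model-free training is a score-function (REINFORCE-style)
+Monte Carlo gradient estimator over a distribution of weights. The sketch below
+uses a toy quadratic in place of the physical optical system and Gaussian
+weight distributions; it illustrates the estimator, not the authors' exact
+algorithm.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+target = rng.normal(size=32)
+
+def black_box_loss(weights):
+    # Placeholder for a physical measurement of the optical system's output error.
+    return float(np.sum((weights - target) ** 2))
+
+mu, log_sigma = np.zeros(32), np.full(32, -1.0)      # parameters of the weight distribution
+lr, num_samples = 0.05, 64
+
+for step in range(300):
+    sigma = np.exp(log_sigma)
+    eps = rng.normal(size=(num_samples, 32))
+    samples = mu + sigma * eps                        # candidate weight configurations
+    losses = np.array([black_box_loss(w) for w in samples])
+    baseline = losses.mean()                          # simple variance reduction
+    # Gaussian score function: d log p / d mu = eps / sigma, d log p / d log_sigma = eps^2 - 1.
+    grad_mu = ((losses - baseline)[:, None] * eps / sigma).mean(axis=0)
+    grad_ls = ((losses - baseline)[:, None] * (eps ** 2 - 1)).mean(axis=0)
+    mu -= lr * grad_mu
+    log_sigma -= lr * grad_ls
+
+print("loss at the learned mean:", black_box_loss(mu))
+```
+
+ No gradient of the system itself is ever needed: only forward evaluations of
+the black box enter the update, which is what makes in situ training possible.
+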
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 207 + +
+
+
+ + ☆ On Train-Test Class Overlap and Detection for Image Retrieval CVPR2024 + + +
+ How important is it for training and evaluation sets to not have class
+overlap in image retrieval? We revisit Google Landmarks v2 clean, the most
+popular training set, by identifying and removing class overlap with Revisited
+Oxford and Paris [34], the most popular evaluation set. By comparing the
+original and the new RGLDv2-clean on a benchmark of reproduced state-of-the-art
+methods, our findings are striking. Not only is there a dramatic drop in
+performance, but it is inconsistent across methods, changing the ranking. What
+does it take to focus on objects of interest and ignore background clutter when
+indexing? Do we need to train an object detector and the representation
+separately? Do we need location supervision? We introduce Single-stage
+Detect-to-Retrieve (CiDeR), an end-to-end, single-stage pipeline to detect
+objects of interest and extract a global image representation. We outperform
+the previous state of the art on both existing training sets and the new
+RGLDv2-clean. Our dataset is available at
+https://github.com/dealicious-inc/RGLDv2-clean.
+
+
+ comment: CVPR2024 Accepted +
+
+
+
+
+ + ☆ Temporally Consistent Unbalanced Optimal Transport for Unsupervised + Action Segmentation CVPR 2024 + + +
+ We propose a novel approach to the action segmentation task for long, +untrimmed videos, based on solving an optimal transport problem. By encoding a +temporal consistency prior into a Gromov-Wasserstein problem, we are able to +decode a temporally consistent segmentation from a noisy affinity/matching cost +matrix between video frames and action classes. Unlike previous approaches, our +method does not require knowing the action order for a video to attain temporal +consistency. Furthermore, our resulting (fused) Gromov-Wasserstein problem can +be efficiently solved on GPUs using a few iterations of projected mirror +descent. We demonstrate the effectiveness of our method in an unsupervised +learning setting, where our method is used to generate pseudo-labels for +self-training. We evaluate our segmentation approach and unsupervised learning +pipeline on the Breakfast, 50-Salads, YouTube Instructions and Desktop Assembly +datasets, yielding state-of-the-art results for the unsupervised video action +segmentation task. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Can Biases in ImageNet Models Explain Generalization? CVPR2024 + + +
+ The robust generalization of models to rare, in-distribution (ID) samples +drawn from the long tail of the training distribution and to +out-of-training-distribution (OOD) samples is one of the major challenges of +current deep learning methods. For image classification, this manifests in the +existence of adversarial attacks, the performance drops on distorted images, +and a lack of generalization to concepts such as sketches. The current +understanding of generalization in neural networks is very limited, but some +biases that differentiate models from human vision have been identified and +might be causing these limitations. Consequently, several attempts with varying +success have been made to reduce these biases during training to improve +generalization. We take a step back and sanity-check these attempts. Fixing the +architecture to the well-established ResNet-50, we perform a large-scale study +on 48 ImageNet models obtained via different training methods to understand how +and if these biases - including shape bias, spectral biases, and critical bands +- interact with generalization. Our extensive study results reveal that +contrary to previous findings, these biases are insufficient to accurately +predict the generalization of a model holistically. We provide access to all +checkpoints and evaluation code at +https://github.com/paulgavrikov/biases_vs_generalization + +
+
+ comment: Accepted at CVPR2024 +
+
+
+
+
+ + ☆ MosquitoFusion: A Multiclass Dataset for Real-Time Detection of + Mosquitoes, Swarms, and Breeding Sites Using Deep Learning + + +
+ In this paper, we present an integrated approach to real-time mosquito +detection using our multiclass dataset (MosquitoFusion) containing 1204 diverse +images and leverage cutting-edge technologies, specifically computer vision, to +automate the identification of Mosquitoes, Swarms, and Breeding Sites. The +pre-trained YOLOv8 model, trained on this dataset, achieved a mean Average +Precision (mAP@50) of 57.1%, with precision at 73.4% and recall at 50.5%. The +integration of Geographic Information Systems (GIS) further enriches the depth +of our analysis, providing valuable insights into spatial patterns. The dataset +and code are available at https://github.com/faiyazabdullah/MosquitoFusion. + +
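+
+ A setup along these lines can be sketched with the Ultralytics API in a few
+lines; the dataset YAML path, class names, and hyperparameters below are
+illustrative placeholders, not values taken from the paper.
+
+```python
+from ultralytics import YOLO
+
+model = YOLO("yolov8n.pt")                      # start from a pre-trained checkpoint
+model.train(data="mosquitofusion.yaml",         # hypothetical YAML listing the three classes
+            epochs=100, imgsz=640)              # (mosquito, swarm, breeding site)
+metrics = model.val()                           # reports precision, recall, mAP@50
+print(metrics.box.map50)
+model.predict("field_photo.jpg", conf=0.25)     # run detection on a new image
+```
+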
+
+
+
+
+ + ☆ Modality Translation for Object Detection Adaptation Without Forgetting + Prior Knowledge + + +
+ A common practice in deep learning consists of training large neural networks
+on massive datasets to perform accurately for different domains and tasks.
+While this methodology may work well in numerous application areas, it does not
+readily carry over across modalities, due to the larger distribution shift in
+data captured using different sensors. This paper focuses on the problem of
+adapting a large object detection model to one or multiple modalities while
+being efficient. To do so, we propose ModTr as an alternative to the common
+approach of fine-tuning large models. ModTr consists of adapting the input with
+a small transformation network trained to minimize the detection loss directly.
+The original model can therefore work on the translated inputs without any
+further change or fine-tuning to its parameters. Experimental results on
+translating from IR to RGB images on two well-known datasets show that this
+simple ModTr approach provides detectors that can perform comparably to or
+better than standard fine-tuning, without forgetting the original knowledge.
+This opens the door to a more flexible and efficient service-based detection
+pipeline in which, instead of using a different detector for each modality, a
+unique and unaltered server is constantly running, where multiple modalities
+with the corresponding translations can query it. Code:
+https://github.com/heitorrapela/ModTr.
+
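+
+ The sketch below illustrates the idea with torchvision's Faster R-CNN as the
+frozen detector and random tensors as stand-in IR data; it is a toy rendition
+of the input-translation recipe, not the released ModTr code.
+
+```python
+import torch
+import torch.nn as nn
+import torchvision
+
+detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
+detector.train()                                    # training mode so the detector returns losses
+for p in detector.parameters():
+    p.requires_grad_(False)                         # original detector stays unaltered
+
+translator = nn.Sequential(                         # small IR -> pseudo-RGB transformation network
+    nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(),
+    nn.Conv2d(16, 3, 3, padding=1), nn.Sigmoid())
+optimizer = torch.optim.Adam(translator.parameters(), lr=1e-4)
+
+ir_image = torch.rand(1, 1, 256, 256)               # stand-in IR frame
+targets = [{"boxes": torch.tensor([[30., 40., 120., 160.]]),
+            "labels": torch.tensor([1])}]
+
+for _ in range(2):                                   # a couple of illustrative steps
+    loss_dict = detector(list(translator(ir_image)), targets)
+    loss = sum(loss_dict.values())                   # detection loss trains the translator only
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+print({k: round(float(v), 3) for k, v in loss_dict.items()})
+```
+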
+
+
+
+
+ + ☆ SUGAR: Pre-training 3D Visual Representations for Robotics CVPR 2024 + + +
+ Learning generalizable visual representations from Internet data has yielded +promising results for robotics. Yet, prevailing approaches focus on +pre-training 2D representations, being sub-optimal to deal with occlusions and +accurately localize objects in complex 3D scenes. Meanwhile, 3D representation +learning has been limited to single-object understanding. To address these +limitations, we introduce a novel 3D pre-training framework for robotics named +SUGAR that captures semantic, geometric and affordance properties of objects +through 3D point clouds. We underscore the importance of cluttered scenes in 3D +representation learning, and automatically construct a multi-object dataset +benefiting from cost-free supervision in simulation. SUGAR employs a versatile +transformer-based model to jointly address five pre-training tasks, namely +cross-modal knowledge distillation for semantic learning, masked point modeling +to understand geometry structures, grasping pose synthesis for object +affordance, 3D instance segmentation and referring expression grounding to +analyze cluttered scenes. We evaluate our learned representation on three +robotic-related tasks, namely, zero-shot 3D object recognition, referring +expression grounding, and language-driven robotic manipulation. Experimental +results show that SUGAR's 3D representation outperforms state-of-the-art 2D and +3D representations. + +
+
+ comment: Accepted to CVPR 2024. Project webpage: + https://cshizhe.github.io/projects/robot_sugar.html +
+
+
+
+
+ + ☆ QuAD: Query-based Interpretable Neural Motion Planning for Autonomous + Driving + + +
+ A self-driving vehicle must understand its environment to determine the +appropriate action. Traditional autonomy systems rely on object detection to +find the agents in the scene. However, object detection assumes a discrete set +of objects and loses information about uncertainty, so any errors compound when +predicting the future behavior of those agents. Alternatively, dense occupancy +grid maps have been utilized to understand free-space. However, predicting a +grid for the entire scene is wasteful since only certain spatio-temporal +regions are reachable and relevant to the self-driving vehicle. We present a +unified, interpretable, and efficient autonomy framework that moves away from +cascading modules that first perceive, then predict, and finally plan. Instead, +we shift the paradigm to have the planner query occupancy at relevant +spatio-temporal points, restricting the computation to those regions of +interest. Exploiting this representation, we evaluate candidate trajectories +around key factors such as collision avoidance, comfort, and progress for +safety and interpretability. Our approach achieves better highway driving +quality than the state-of-the-art in high-fidelity closed-loop simulations. + +
+
+
+
+
+ + ☆ TraveLER: A Multi-LMM Agent Framework for Video Question-Answering + + +
+ Recently, Large Multimodal Models (LMMs) have made significant progress in +video question-answering using a frame-wise approach by leveraging large-scale, +image-based pretraining in a zero-shot manner. While image-based methods for +videos have shown impressive performance, a current limitation is that they +often overlook how key timestamps are selected and cannot adjust when incorrect +timestamps are identified. Moreover, they are unable to extract details +relevant to the question, instead providing general descriptions of the frame. +To overcome this, we design a multi-LMM agent framework that travels along the +video, iteratively collecting relevant information from keyframes through +interactive question-asking until there is sufficient information to answer the +question. Specifically, we propose TraveLER, a model that can create a plan to +"Traverse" through the video, ask questions about individual frames to "Locate" +and store key information, and then "Evaluate" if there is enough information +to answer the question. Finally, if there is not enough information, our method +is able to "Replan" based on its collected knowledge. Through extensive +experiments, we find that the proposed TraveLER approach improves performance +on several video question-answering benchmarks, such as NExT-QA, STAR, and +Perception Test, without the need to fine-tune on specific datasets. + +
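+
+ The control flow of such an agent is easy to sketch with a stubbed model
+call. `ask_lmm` below is a placeholder for whichever multimodal model the
+framework queries; the loop only illustrates the Traverse / Locate / Evaluate /
+Replan structure described above.
+
+```python
+def ask_lmm(prompt, frame=None):
+    # Placeholder: a real system would query a large multimodal model here.
+    return "stub answer"
+
+def answer_video_question(frames, question, max_rounds=3):
+    memory = []                                            # key information collected so far
+    plan = ask_lmm(f"Plan how to traverse the video to answer: {question}")
+    for _ in range(max_rounds):
+        for frame in frames:                               # Traverse the planned keyframes
+            memory.append(ask_lmm(
+                f"Following the plan '{plan}', locate details relevant to: {question}", frame))
+        verdict = ask_lmm(f"Evaluate: is {memory} enough to answer: {question}?")
+        if verdict != "not enough":                        # Evaluate the collected evidence
+            return ask_lmm(f"Answer '{question}' using: {memory}")
+        plan = ask_lmm(f"Replan given what is known so far: {memory}")   # Replan
+    return "unanswerable with the collected evidence"
+
+print(answer_video_question(["frame_000", "frame_032", "frame_064"],
+                            "Why did the person open the fridge?"))
+```
+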
+
+
+
+
+ + ☆ Data-Efficient Unsupervised Interpolation Without Any Intermediate Frame + for 4D Medical Images CVPR 2024 + + +
+ 4D medical images, which represent 3D images with temporal information, are +crucial in clinical practice for capturing dynamic changes and monitoring +long-term disease progression. However, acquiring 4D medical images poses +challenges due to factors such as radiation exposure and imaging duration, +necessitating a balance between achieving high temporal resolution and +minimizing adverse effects. Given these circumstances, not only is data +acquisition challenging, but increasing the frame rate for each dataset also +proves difficult. To address this challenge, this paper proposes a simple yet +effective Unsupervised Volumetric Interpolation framework, UVI-Net. This +framework facilitates temporal interpolation without the need for any +intermediate frames, distinguishing it from the majority of other existing +unsupervised methods. Experiments on benchmark datasets demonstrate significant +improvements across diverse evaluation metrics compared to unsupervised and +supervised baselines. Remarkably, our approach achieves this superior +performance even when trained with a dataset as small as one, highlighting its +exceptional robustness and efficiency in scenarios with sparse supervision. +This positions UVI-Net as a compelling alternative for 4D medical imaging, +particularly in settings where data availability is limited. The source code is +available at https://github.com/jungeun122333/UVI-Net. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Finding Regions of Interest in Whole Slide Images Using Multiple + Instance Learning + + +
+ Whole Slide Images (WSI), obtained by high-resolution digital scanning of
+microscope slides at multiple scales, are the cornerstone of modern Digital
+Pathology. However, they represent a particular challenge to
+AI-based/AI-mediated analysis because pathology labeling is typically done at
+the slide level, instead of the tile level. Not only are medical diagnoses
+recorded at the specimen level; the detection of oncogene mutations is also
+experimentally obtained, and recorded by initiatives like The Cancer Genome
+Atlas (TCGA), at the slide level. This configures a dual challenge: a)
+accurately predicting the overall cancer phenotype and b) finding out what
+cellular morphologies are associated with it at the tile level. To address
+these challenges, a weakly supervised Multiple Instance Learning (MIL) approach
+was explored for two prevalent cancer types, Invasive Breast Carcinoma
+(TCGA-BRCA) and Lung Squamous Cell Carcinoma (TCGA-LUSC). This approach was
+explored for tumor detection at low magnification levels and TP53 mutations at
+various levels. Our results show that a novel additive implementation of MIL
+matched the performance of the reference implementation (AUC 0.96), and was
+only slightly outperformed by Attention MIL (AUC 0.97). More interestingly from
+the perspective of the molecular pathologist, these different AI architectures
+identify distinct sensitivities to morphological features (through the
+detection of Regions of Interest, RoI) at different amplification levels.
+Tellingly, TP53 mutation was most sensitive to features at the higher
+amplification levels where cellular morphology is resolved.
+
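+
+ For readers unfamiliar with the setup, a generic attention-based MIL pooling
+head looks like the module below: tile embeddings are weighted by attention,
+the weighted sum gives a slide-level representation for the weak label, and the
+same weights double as a map of candidate Regions of Interest. This is a
+standard formulation shown for illustration, not the paper's exact additive or
+attention implementations.
+
+```python
+import torch
+import torch.nn as nn
+
+class AttentionMIL(nn.Module):
+    def __init__(self, dim=512, hidden=128):
+        super().__init__()
+        self.attention = nn.Sequential(nn.Linear(dim, hidden), nn.Tanh(),
+                                       nn.Linear(hidden, 1))
+        self.classifier = nn.Linear(dim, 1)
+
+    def forward(self, tiles):                       # tiles: (num_tiles, dim) for one slide
+        weights = self.attention(tiles).softmax(dim=0)
+        slide_embedding = (weights * tiles).sum(dim=0)
+        return self.classifier(slide_embedding), weights.squeeze(-1)
+
+model = AttentionMIL()
+tile_features = torch.randn(1000, 512)              # e.g. tile embeddings from a frozen encoder
+logit, tile_weights = model(tile_features)          # slide-level prediction from the weak label
+print(logit.shape, tile_weights.topk(5).indices)    # top-attended tiles ~ candidate RoIs
+```
+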
+
+
+
+
+ + ☆ Neural Implicit Representation for Building Digital Twins of Unknown + Articulated Objects CVPR 2024 + + +
+ We address the problem of building digital twins of unknown articulated +objects from two RGBD scans of the object at different articulation states. We +decompose the problem into two stages, each addressing distinct aspects. Our +method first reconstructs object-level shape at each state, then recovers the +underlying articulation model including part segmentation and joint +articulations that associate the two states. By explicitly modeling point-level +correspondences and exploiting cues from images, 3D reconstructions, and +kinematics, our method yields more accurate and stable results compared to +prior work. It also handles more than one movable part and does not rely on any +object shape or structure priors. Project page: +https://github.com/NVlabs/DigitalTwinArt + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Generation and Detection of Sign Language Deepfakes - A Linguistic and + Visual Analysis + + +
+ A question in the realm of deepfakes is slowly emerging pertaining to whether +we can go beyond facial deepfakes and whether it would be beneficial to +society. Therefore, this research presents a positive application of deepfake +technology in upper body generation, while performing sign-language for the +Deaf and Hard of Hearing (DHoH) community. The resulting videos are later +vetted with a sign language expert. This is particularly helpful, given the +intricate nature of sign language, a scarcity of sign language experts, and +potential benefits for health and education. The objectives of this work +encompass constructing a reliable deepfake dataset, evaluating its technical +and visual credibility through computer vision and natural language processing +models, and assessing the plausibility of the generated content. With over 1200 +videos, featuring both previously seen and unseen individuals for the +generation model, using the help of a sign language expert, we establish a +deepfake dataset in sign language that can further be utilized to detect fake +videos that may target certain people of determination. + +
+
+ comment: 13 pages, 13 figures, Computer Vision and Image Understanding Journal +
+
+
+
+
+ + ☆ The Radar Ghost Dataset -- An Evaluation of Ghost Objects in Automotive + Radar Data + + +
+ Radar sensors have a long tradition in advanced driver assistance systems
+(ADAS) and also play a major role in current concepts for autonomous vehicles.
+Their importance stems from their high robustness against meteorological
+effects, such as rain, snow, or fog, and the radar's ability to measure
+relative radial velocity differences via the Doppler effect. The cause of
+these advantages, namely the large wavelength, is also one of the drawbacks of
+radar sensors. Compared to camera or lidar sensors, many more surfaces in a
+typical traffic scenario appear flat relative to the radar's emitted signal.
+This results in multi-path reflections or so-called ghost detections in the
+radar signal. Ghost objects are a major source of potential false positive
+detections in a vehicle's perception pipeline. Therefore, it is important to be
+able to segregate multi-path reflections from direct ones. In this article, we
+present a dataset with detailed manual annotations for different kinds of ghost
+detections. Moreover, two different approaches for identifying these kinds of
+objects are evaluated. We hope that our dataset encourages more researchers to
+engage in the fields of multi-path object suppression or exploitation.
+
+
+
+
+
+ + ☆ DPMesh: Exploiting Diffusion Prior for Occluded Human Mesh Recovery CVPR + + +
+ The recovery of occluded human meshes presents challenges for current methods +due to the difficulty in extracting effective image features under severe +occlusion. In this paper, we introduce DPMesh, an innovative framework for +occluded human mesh recovery that capitalizes on the profound diffusion prior +about object structure and spatial relationships embedded in a pre-trained +text-to-image diffusion model. Unlike previous methods reliant on conventional +backbones for vanilla feature extraction, DPMesh seamlessly integrates the +pre-trained denoising U-Net with potent knowledge as its image backbone and +performs a single-step inference to provide occlusion-aware information. To +enhance the perception capability for occluded poses, DPMesh incorporates +well-designed guidance via condition injection, which produces effective +controls from 2D observations for the denoising U-Net. Furthermore, we explore +a dedicated noisy key-point reasoning approach to mitigate disturbances arising +from occlusion and crowded scenarios. This strategy fully unleashes the +perceptual capability of the diffusion prior, thereby enhancing accuracy. +Extensive experiments affirm the efficacy of our framework, as we outperform +state-of-the-art methods on both occlusion-specific and standard datasets. The +persuasive results underscore its ability to achieve precise and robust 3D +human mesh recovery, particularly in challenging scenarios involving occlusion +and crowded scenes. + +
+
+ comment: Accepted by IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR) 2024 +
+
+
+
+
+ + ☆ On the Faithfulness of Vision Transformer Explanations CVPR 2024 + + +
+ To interpret Vision Transformers, post-hoc explanations assign salience +scores to input pixels, providing human-understandable heatmaps. However, +whether these interpretations reflect true rationales behind the model's output +is still underexplored. To address this gap, we study the faithfulness +criterion of explanations: the assigned salience scores should represent the +influence of the corresponding input pixels on the model's predictions. To +evaluate faithfulness, we introduce Salience-guided Faithfulness Coefficient +(SaCo), a novel evaluation metric leveraging essential information of salience +distribution. Specifically, we conduct pair-wise comparisons among distinct +pixel groups and then aggregate the differences in their salience scores, +resulting in a coefficient that indicates the explanation's degree of +faithfulness. Our explorations reveal that current metrics struggle to +differentiate between advanced explanation methods and Random Attribution, +thereby failing to capture the faithfulness property. In contrast, our proposed +SaCo offers a reliable faithfulness measurement, establishing a robust metric +for interpretations. Furthermore, our SaCo demonstrates that the use of +gradient and multi-layer aggregation can markedly enhance the faithfulness of +attention-based explanation, shedding light on potential paths for advancing +Vision Transformer explainability. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ OVFoodSeg: Elevating Open-Vocabulary Food Image Segmentation via + Image-Informed Textual Representation CVPR 2024 + + +
+ In the realm of food computing, segmenting ingredients from images poses
+substantial challenges due to the large intra-class variance among the same
+ingredients, the emergence of new ingredients, and the high annotation costs
+associated with large food segmentation datasets. Existing approaches primarily
+utilize a closed-vocabulary setting with static text embeddings. These methods
+often fall short in effectively handling the ingredients, particularly new and
+diverse ones. In response to these limitations, we introduce OVFoodSeg, a
+framework that adopts an open-vocabulary setting and enhances text embeddings
+with visual context. By integrating vision-language models (VLMs), our approach
+enriches text embeddings with image-specific information through two innovative
+modules, namely an image-to-text learner (FoodLearner) and an Image-Informed
+Text Encoder. The training process of OVFoodSeg is divided into two stages: the
+pre-training of FoodLearner and the subsequent learning phase for segmentation.
+The pre-training phase equips FoodLearner with the capability to align visual
+information with corresponding textual representations that are specifically
+related to food, while the second phase adapts both the FoodLearner and the
+Image-Informed Text Encoder for the segmentation task. By addressing the
+deficiencies of previous models, OVFoodSeg demonstrates a significant
+improvement, achieving a 4.9\% increase in mean Intersection over Union (mIoU)
+on the FoodSeg103 dataset, setting a new milestone for food image segmentation.
+
+
+ comment: CVPR 2024; 12 pages +
+
+
+
+
+ + ☆ ContactHandover: Contact-Guided Robot-to-Human Object Handover + + +
+ Robot-to-human object handover is an important step in many human robot +collaboration tasks. A successful handover requires the robot to maintain a +stable grasp on the object while making sure the human receives the object in a +natural and easy-to-use manner. We propose ContactHandover, a robot to human +handover system that consists of two phases: a contact-guided grasping phase +and an object delivery phase. During the grasping phase, ContactHandover +predicts both 6-DoF robot grasp poses and a 3D affordance map of human contact +points on the object. The robot grasp poses are reranked by penalizing those +that block human contact points, and the robot executes the highest ranking +grasp. During the delivery phase, the robot end effector pose is computed by +maximizing human contact points close to the human while minimizing the human +arm joint torques and displacements. We evaluate our system on 27 diverse +household objects and show that our system achieves better visibility and +reachability of human contacts to the receiver compared to several baselines. +More results can be found on +https://clairezixiwang.github.io/ContactHandover.github.io + +
+
+ comment: Project website: + https://clairezixiwang.github.io/ContactHandover.github.io/ +
+
+
+
+
+ + ☆ Object-conditioned Bag of Instances for Few-Shot Personalized Instance + Recognition ICASSP 2024 + + +
+ Nowadays, users demand increased personalization of vision systems to
+localize and identify personal instances of objects (e.g., my dog rather than
+dog) from a few-shot dataset only. Despite outstanding results of deep networks
+on classical label-abundant benchmarks (e.g., those of the latest YOLOv8 model
+for standard object detection), they struggle to maintain within-class
+variability to represent different instances rather than object categories
+only. We construct an Object-conditioned Bag of Instances (OBoI) based on
+multi-order statistics of extracted features, where generic object detection
+models are extended to search and identify personal instances from the OBoI's
+metric space, without the need for backpropagation. By relying on multi-order
+statistics, OBoI achieves consistently superior accuracy in distinguishing
+different instances. We achieve 77.1% personal object recognition accuracy with
+18 personal instances, showing about 12% relative gain over the state of the
+art.
+
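+
+ A minimal sketch of the idea, assuming patch-level features are already
+extracted by a detector backbone: build a descriptor from several orders of
+feature statistics per enrolled instance, then match new detections by nearest
+neighbour in that space, with no backpropagation involved. The statistics and
+instance names below are illustrative, not the paper's exact configuration.
+
+```python
+import torch
+
+def multi_order_descriptor(features):                # features: (num_patches, dim)
+    mu = features.mean(dim=0)
+    var = features.var(dim=0, unbiased=False)
+    third = ((features - mu) ** 3).mean(dim=0)
+    return torch.cat([mu, var, third])               # concatenated multi-order statistics
+
+torch.manual_seed(0)
+# Enrollment: a few shots per personal instance ("my mug", "my dog", "my bike").
+bag = {name: multi_order_descriptor(torch.randn(64, 128) + i)
+       for i, name in enumerate(["my_mug", "my_dog", "my_bike"])}
+
+# Inference: describe a detected object and look it up in the bag's metric space.
+query = multi_order_descriptor(torch.randn(64, 128) + 1)   # should resemble "my_dog"
+names = list(bag)
+distances = torch.stack([torch.norm(query - bag[n]) for n in names])
+print("predicted instance:", names[distances.argmin()])
+```
+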
+
+ comment: ICASSP 2024. Copyright 2024 IEEE. Personal use of this material is + permitted. Permission from IEEE must be obtained for all other uses, in any + current or future media, including reprinting/republishing this material for + advertising or promotional purposes, creating new collective works, for + resale or redistribution to servers or lists, or reuse of any copyrighted + component of this work in other +
+
+
+
+
+ + ☆ NeRF-MAE : Masked AutoEncoders for Self Supervised 3D representation + Learning for Neural Radiance Fields + + +
+ Neural fields excel in computer vision and robotics due to their ability to +understand the 3D visual world such as inferring semantics, geometry, and +dynamics. Given the capabilities of neural fields in densely representing a 3D +scene from 2D images, we ask the question: Can we scale their self-supervised +pretraining, specifically using masked autoencoders, to generate effective 3D +representations from posed RGB images. Owing to the astounding success of +extending transformers to novel data modalities, we employ standard 3D Vision +Transformers to suit the unique formulation of NeRFs. We leverage NeRF's +volumetric grid as a dense input to the transformer, contrasting it with other +3D representations such as pointclouds where the information density can be +uneven, and the representation is irregular. Due to the difficulty of applying +masked autoencoders to an implicit representation, such as NeRF, we opt for +extracting an explicit representation that canonicalizes scenes across domains +by employing the camera trajectory for sampling. Our goal is made possible by +masking random patches from NeRF's radiance and density grid and employing a +standard 3D Swin Transformer to reconstruct the masked patches. In doing so, +the model can learn the semantic and spatial structure of complete scenes. We +pretrain this representation at scale on our proposed curated posed-RGB data, +totaling over 1.6 million images. Once pretrained, the encoder is used for +effective 3D transfer learning. Our novel self-supervised pretraining for +NeRFs, NeRF-MAE, scales remarkably well and improves performance on various +challenging 3D tasks. Utilizing unlabeled posed 2D data for pretraining, +NeRF-MAE significantly outperforms self-supervised 3D pretraining and NeRF +scene understanding baselines on Front3D and ScanNet datasets with an absolute +performance improvement of over 20% AP50 and 8% AP25 for 3D object detection. + +
+
+ comment: 29 pages, 13 figures. Project Page: https://nerf-mae.github.io/ +
+
+
+
+
+ + ☆ Noise2Image: Noise-Enabled Static Scene Recovery for Event Cameras + + +
+ Event cameras capture changes of intensity over time as a stream of 'events' +and generally cannot measure intensity itself; hence, they are only used for +imaging dynamic scenes. However, fluctuations due to random photon arrival +inevitably trigger noise events, even for static scenes. While previous efforts +have been focused on filtering out these undesirable noise events to improve +signal quality, we find that, in the photon-noise regime, these noise events +are correlated with the static scene intensity. We analyze the noise event +generation and model its relationship to illuminance. Based on this +understanding, we propose a method, called Noise2Image, to leverage the +illuminance-dependent noise characteristics to recover the static parts of a +scene, which are otherwise invisible to event cameras. We experimentally +collect a dataset of noise events on static scenes to train and validate +Noise2Image. Our results show that Noise2Image can robustly recover intensity +images solely from noise events, providing a novel approach for capturing +static scenes in event cameras, without additional hardware. + +
+
+
+
+
+ + ☆ CausalChaos! Dataset for Comprehensive Causal Action Question Answering + Over Longer Causal Chains Grounded in Dynamic Visual Scenes + + +
+ Causal video question answering (QA) has garnered increasing interest, yet
+existing datasets often lack depth in causal reasoning analysis. To address
+this gap, we capitalize on the unique properties of cartoons and construct
+CausalChaos!, a novel, challenging causal Why-QA dataset built upon the iconic
+"Tom and Jerry" cartoon series. With thoughtful questions and multi-level
+answers, our dataset contains much longer causal chains embedded in dynamic
+interactions and visuals, while the principles of animation allow animators to
+create well-defined, unambiguous causal relationships. These factors allow
+models to solve more challenging, yet well-defined causal relationships. We
+also introduce hard negative mining, including a CausalConfusion version. While
+models perform well, there is much room for improvement, especially on
+open-ended answers. We identify more advanced/explicit causal relationship
+modeling and joint modeling of vision and language as the immediate areas for
+future efforts to focus upon. Along with the other complementary datasets, our
+new challenging dataset will pave the way for these developments in the field.
+We will release our dataset, code, and models to help future efforts in this
+domain.
+
+
+
+
+
+ + ☆ Bigger is not Always Better: Scaling Properties of Latent Diffusion + Models + + +
+ We study the scaling properties of latent diffusion models (LDMs) with an
+emphasis on their sampling efficiency. While improved network architectures and
+inference algorithms have been shown to effectively boost the sampling
+efficiency of diffusion models, the role of model size -- a critical
+determinant of sampling efficiency -- has not been thoroughly examined. Through
+empirical analysis of established text-to-image diffusion models, we conduct an
+in-depth investigation into how model size influences sampling efficiency
+across varying sampling steps. Our findings unveil a surprising trend: when
+operating under a given inference budget, smaller models frequently outperform
+their larger equivalents in generating high-quality results. Moreover, we
+extend our study to demonstrate the generalizability of these findings by
+applying various diffusion samplers, exploring diverse downstream tasks,
+evaluating post-distilled models, as well as comparing performance relative to
+training compute. These findings open up new pathways for the development of
+LDM scaling strategies which can be employed to enhance generative capabilities
+within limited inference budgets.
+
+
+
+
+
+ + ☆ Streaming Dense Video Captioning CVPR 2024 + + +
+ An ideal model for dense video captioning -- predicting captions localized +temporally in a video -- should be able to handle long input videos, predict +rich, detailed textual descriptions, and be able to produce outputs before +processing the entire video. Current state-of-the-art models, however, process +a fixed number of downsampled frames, and make a single full prediction after +seeing the whole video. We propose a streaming dense video captioning model +that consists of two novel components: First, we propose a new memory module, +based on clustering incoming tokens, which can handle arbitrarily long videos +as the memory is of a fixed size. Second, we develop a streaming decoding +algorithm that enables our model to make predictions before the entire video +has been processed. Our model achieves this streaming ability, and +significantly improves the state-of-the-art on three dense video captioning +benchmarks: ActivityNet, YouCook2 and ViTT. Our code is released at +https://github.com/google-research/scenic. + +
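+
+ The fixed-size memory is the piece that makes streaming possible. The class
+below keeps a constant number of cluster centers updated with running means as
+frame tokens arrive; it is an illustrative online-clustering sketch, not the
+module released with the paper.
+
+```python
+import torch
+
+class ClusterMemory:
+    def __init__(self, num_slots=64, dim=256):
+        self.centers = torch.randn(num_slots, dim)
+        self.counts = torch.ones(num_slots)
+
+    def update(self, tokens):                        # tokens: (num_new, dim) from one frame
+        assign = torch.cdist(tokens, self.centers).argmin(dim=1)
+        for slot in assign.unique():
+            members = tokens[assign == slot]
+            n, m = self.counts[slot], members.shape[0]
+            # Running mean: memory stays the same size no matter how long the video is.
+            self.centers[slot] = (n * self.centers[slot] + members.sum(dim=0)) / (n + m)
+            self.counts[slot] = n + m
+        return self.centers                          # fixed-size context for the caption decoder
+
+memory = ClusterMemory()
+for _ in range(1000):                                # arbitrarily long stream of frames
+    frame_tokens = torch.randn(196, 256)             # e.g. ViT patch tokens of one frame
+    context = memory.update(frame_tokens)
+print(context.shape)                                 # torch.Size([64, 256]) regardless of length
+```
+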
+
+ comment: CVPR 2024. Code is available at + https://github.com/google-research/scenic/tree/main/scenic/projects/streaming_dvc +
+
+
+
+
+ + ☆ MagicMirror: Fast and High-Quality Avatar Generation with a Constrained + Search Space + + +
+ We introduce a novel framework for 3D human avatar generation and +personalization, leveraging text prompts to enhance user engagement and +customization. Central to our approach are key innovations aimed at overcoming +the challenges in photo-realistic avatar synthesis. Firstly, we utilize a +conditional Neural Radiance Fields (NeRF) model, trained on a large-scale +unannotated multi-view dataset, to create a versatile initial solution space +that accelerates and diversifies avatar generation. Secondly, we develop a +geometric prior, leveraging the capabilities of Text-to-Image Diffusion Models, +to ensure superior view invariance and enable direct optimization of avatar +geometry. These foundational ideas are complemented by our optimization +pipeline built on Variational Score Distillation (VSD), which mitigates texture +loss and over-saturation issues. As supported by our extensive experiments, +these strategies collectively enable the creation of custom avatars with +unparalleled visual quality and better adherence to input text prompts. You can +find more results and videos in our website: +https://syntec-research.github.io/MagicMirror + +
+
+
+
+
+ + ☆ CosmicMan: A Text-to-Image Foundation Model for Humans CVPR 2024 + + +
+ We present CosmicMan, a text-to-image foundation model specialized for +generating high-fidelity human images. Unlike current general-purpose +foundation models that are stuck in the dilemma of inferior quality and +text-image misalignment for humans, CosmicMan enables generating +photo-realistic human images with meticulous appearance, reasonable structure, +and precise text-image alignment with detailed dense descriptions. At the heart +of CosmicMan's success are the new reflections and perspectives on data and +models: (1) We found that data quality and a scalable data production flow are +essential for the final results from trained models. Hence, we propose a new +data production paradigm, Annotate Anyone, which serves as a perpetual data +flywheel to produce high-quality data with accurate yet cost-effective +annotations over time. Based on this, we constructed a large-scale dataset, +CosmicMan-HQ 1.0, with 6 Million high-quality real-world human images in a mean +resolution of 1488x1255, and attached with precise text annotations deriving +from 115 Million attributes in diverse granularities. (2) We argue that a +text-to-image foundation model specialized for humans must be pragmatic -- easy +to integrate into down-streaming tasks while effective in producing +high-quality human images. Hence, we propose to model the relationship between +dense text descriptions and image pixels in a decomposed manner, and present +Decomposed-Attention-Refocusing (Daring) training framework. It seamlessly +decomposes the cross-attention features in existing text-to-image diffusion +model, and enforces attention refocusing without adding extra modules. Through +Daring, we show that explicitly discretizing continuous text space into several +basic groups that align with human body structure is the key to tackling the +misalignment problem in a breeze. + +
+
+ comment: Accepted by CVPR 2024. The supplementary material is included. + Project Page: https://cosmicman-cvpr2024.github.io +
+
+
+
+
+ + ☆ Measuring Style Similarity in Diffusion Models + + +
+ Generative models are now widely used by graphic designers and artists. Prior +works have shown that these models remember and often replicate content from +their training data during generation. Hence as their proliferation increases, +it has become important to perform a database search to determine whether the +properties of the image are attributable to specific training data, every time +before a generated image is used for professional purposes. Existing tools for +this purpose focus on retrieving images of similar semantic content. Meanwhile, +many artists are concerned with style replication in text-to-image models. We +present a framework for understanding and extracting style descriptors from +images. Our framework comprises a new dataset curated using the insight that +style is a subjective property of an image that captures complex yet meaningful +interactions of factors including but not limited to colors, textures, shapes, +etc. We also propose a method to extract style descriptors that can be used to +attribute style of a generated image to the images used in the training dataset +of a text-to-image model. We showcase promising results in various style +retrieval tasks. We also quantitatively and qualitatively analyze style +attribution and matching in the Stable Diffusion model. Code and artifacts are +available at https://github.com/learn2phoenix/CSD. + +
+
+
+
+
+ + ☆ Evaluating Text-to-Visual Generation with Image-to-Text Generation + + +
+ Despite significant progress in generative AI, comprehensive evaluation +remains challenging because of the lack of effective metrics and standardized +benchmarks. For instance, the widely-used CLIPScore measures the alignment +between a (generated) image and text prompt, but it fails to produce reliable +scores for complex prompts involving compositions of objects, attributes, and +relations. One reason is that text encoders of CLIP can notoriously act as a +"bag of words", conflating prompts such as "the horse is eating the grass" with +"the grass is eating the horse". To address this, we introduce the VQAScore, +which uses a visual-question-answering (VQA) model to produce an alignment +score by computing the probability of a "Yes" answer to a simple "Does this +figure show '{text}'?" question. Though simpler than prior art, VQAScore +computed with off-the-shelf models produces state-of-the-art results across +many (8) image-text alignment benchmarks. We also compute VQAScore with an +in-house model that follows best practices in the literature. For example, we +use a bidirectional image-question encoder that allows image embeddings to +depend on the question being asked (and vice versa). Our in-house model, +CLIP-FlanT5, outperforms even the strongest baselines that make use of the +proprietary GPT-4V. Interestingly, although we train with only images, VQAScore +can also align text with video and 3D models. VQAScore allows researchers to +benchmark text-to-visual generation using complex texts that capture the +compositional structure of real-world prompts. We introduce GenAI-Bench, a more +challenging benchmark with 1,600 compositional text prompts that require +parsing scenes, objects, attributes, relationships, and high-order reasoning +like comparison and logic. GenAI-Bench also offers over 15,000 human ratings +for leading image and video generation models such as Stable Diffusion, DALL-E +3, and Gen2. + +
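+
+ The scoring recipe itself is compact: ask a yes/no question built from the
+prompt and read off the probability of "Yes". The snippet below uses a stub in
+place of a real VQA model (any model exposing answer log-probabilities would
+do); the image paths are placeholders.
+
+```python
+import torch
+
+def answer_logprobs(image, question):
+    # Stub standing in for a real VQA model's distribution over {"Yes", "No"}.
+    torch.manual_seed(hash((image, question)) % (2 ** 31))
+    return torch.log_softmax(torch.randn(2), dim=0)       # [log P("Yes"), log P("No")]
+
+def vqascore(image, text):
+    question = f"Does this figure show '{text}'?"
+    return answer_logprobs(image, question)[0].exp().item()   # alignment score = P("Yes")
+
+prompt = "the horse is eating the grass"
+for image in ["generation_a.png", "generation_b.png"]:
+    print(image, round(vqascore(image, prompt), 3))
+```
+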
+
+ comment: We open-source our data, model, and code at: + https://github.com/linzhiqiu/t2v_metrics ; Project page: + https://linzhiqiu.github.io/papers/vqascore +
+
+
+
+
+ + ☆ Large Motion Model for Unified Multi-Modal Motion Generation + + +
+ Human motion generation, a cornerstone technique in animation and video +production, has widespread applications in various tasks like text-to-motion +and music-to-dance. Previous works focus on developing specialist models +tailored for each task without scalability. In this work, we present Large +Motion Model (LMM), a motion-centric, multi-modal framework that unifies +mainstream motion generation tasks into a generalist model. A unified motion +model is appealing since it can leverage a wide range of motion data to achieve +broad generalization beyond a single task. However, it is also challenging due +to the heterogeneous nature of substantially different motion data and tasks. +LMM tackles these challenges from three principled aspects: 1) Data: We +consolidate datasets with different modalities, formats and tasks into a +comprehensive yet unified motion generation dataset, MotionVerse, comprising 10 +tasks, 16 datasets, a total of 320k sequences, and 100 million frames. 2) +Architecture: We design an articulated attention mechanism ArtAttention that +incorporates body part-aware modeling into Diffusion Transformer backbone. 3) +Pre-Training: We propose a novel pre-training strategy for LMM, which employs +variable frame rates and masking forms, to better exploit knowledge from +diverse training data. Extensive experiments demonstrate that our generalist +LMM achieves competitive performance across various standard motion generation +tasks over state-of-the-art specialist models. Notably, LMM exhibits strong +generalization capabilities and emerging properties across many unseen tasks. +Additionally, our ablation studies reveal valuable insights about training and +scaling up large motion models for future research. + +
+
+ comment: Homepage: https://mingyuan-zhang.github.io/projects/LMM.html +
+
+
+
+
+ + ☆ LoSA: Long-Short-range Adapter for Scaling End-to-End Temporal Action + Localization + + +
+ Temporal Action Localization (TAL) involves localizing and classifying action +snippets in an untrimmed video. The emergence of large video foundation models +has led RGB-only video backbones to outperform previous methods needing both +RGB and optical flow modalities. Leveraging these large models is often limited +to training only the TAL head due to the prohibitively large GPU memory +required to adapt the video backbone for TAL. To overcome this limitation, we +introduce LoSA, the first memory-and-parameter-efficient backbone adapter +designed specifically for TAL to handle untrimmed videos. LoSA specializes for +TAL by introducing Long-Short-range Adapters that adapt the intermediate layers +of the video backbone over different temporal ranges. These adapters run +parallel to the video backbone to significantly reduce memory footprint. LoSA +also includes Long-Short-range Fusion that strategically combines the output of +these adapters from the video backbone layers to enhance the video features +provided to the TAL head. Experiments show that LoSA significantly outperforms +all existing methods on standard TAL benchmarks, THUMOS-14 and +ActivityNet-v1.3, by scaling end-to-end backbone adaptation to +billion-parameter-plus models like VideoMAEv2~(ViT-g) and leveraging them +beyond head-only transfer learning. + +
+
+
+
+
+ + ☆ BiPer: Binary Neural Networks using a Periodic Function + + +
+ Quantized neural networks employ reduced precision representations for both +weights and activations. This quantization process significantly reduces the +memory requirements and computational complexity of the network. Binary Neural +Networks (BNNs) are the extreme quantization case, representing values with +just one bit. Since the sign function is typically used to map real values to +binary values, smooth approximations are introduced to mimic the gradients +during error backpropagation. Thus, the mismatch between the forward and +backward models corrupts the direction of the gradient, causing training +inconsistency problems and performance degradation. In contrast to current BNN +approaches, we propose to employ a binary periodic (BiPer) function during +binarization. Specifically, we use a square wave for the forward pass to obtain +the binary values and employ the trigonometric sine function with the same +period of the square wave as a differentiable surrogate during the backward +pass. We demonstrate that this approach can control the quantization error by +using the frequency of the periodic function and improves network performance. +Extensive experiments validate the effectiveness of BiPer in benchmark datasets +and network architectures, with improvements of up to 1% and 0.69% with respect +to state-of-the-art methods in the classification task over CIFAR-10 and +ImageNet, respectively. Our code is publicly available at +https://github.com/edmav4/BiPer. + +
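+
+ A minimal PyTorch sketch of the forward/backward scheme described above (an illustration, not the authors' released code): the forward pass binarizes with a square wave, while the backward pass uses the gradient of a sine of the same period as the smooth surrogate; the frequency value used below is a placeholder hyperparameter.
+
+ import torch
+
+ class BiPerBinarize(torch.autograd.Function):
+     """Forward: square wave sign(sin(omega * x)) -> {-1, +1}.
+     Backward: gradient of the smooth surrogate sin(omega * x)."""
+
+     @staticmethod
+     def forward(ctx, x, omega: float):
+         ctx.save_for_backward(x)
+         ctx.omega = omega
+         return torch.sign(torch.sin(omega * x))
+
+     @staticmethod
+     def backward(ctx, grad_out):
+         (x,) = ctx.saved_tensors
+         omega = ctx.omega
+         # d/dx sin(omega * x) = omega * cos(omega * x)
+         return grad_out * omega * torch.cos(omega * x), None
+
+ w = torch.randn(8, requires_grad=True)
+ b = BiPerBinarize.apply(w, 3.0)   # binary weights in {-1, +1}
+ b.sum().backward()                # gradients flow through the sine surrogate
+ print(b, w.grad)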
+
+
+
+
+ + ☆ Language Guided Domain Generalized Medical Image Segmentation + + +
+ Single source domain generalization (SDG) holds promise for more reliable and +consistent image segmentation across real-world clinical settings particularly +in the medical domain, where data privacy and acquisition cost constraints +often limit the availability of diverse datasets. Depending solely on visual +features hampers the model's capacity to adapt effectively to various domains, +primarily because of the presence of spurious correlations and domain-specific +characteristics embedded within the image features. Incorporating text features +alongside visual features is a potential solution to enhance the model's +understanding of the data, as it goes beyond pixel-level information to provide +valuable context. Textual cues describing the anatomical structures, their +appearances, and variations across various imaging modalities can guide the +model in domain adaptation, ultimately contributing to more robust and +consistent segmentation. In this paper, we propose an approach that explicitly +leverages textual information by incorporating a contrastive learning mechanism +guided by the text encoder features to learn a more robust feature +representation. We assess the effectiveness of our text-guided contrastive +feature alignment technique in various scenarios, including cross-modality, +cross-sequence, and cross-site settings for different segmentation tasks. Our +approach achieves favorable performance against existing methods in literature. +Our code and model weights are available at +https://github.com/ShahinaKK/LG_SDG.git. + +
+
+ comment: Accepted at ISBI2024 +
+
+
+
+
+ + ☆ What is Point Supervision Worth in Video Instance Segmentation? + + +
+ Video instance segmentation (VIS) is a challenging vision task that aims to +detect, segment, and track objects in videos. Conventional VIS methods rely on +densely-annotated object masks which are expensive. We reduce the human +annotations to only one point for each object in a video frame during training, +and obtain high-quality mask predictions close to fully supervised models. Our +proposed training method consists of a class-agnostic proposal generation +module to provide rich negative samples and a spatio-temporal point-based +matcher to match the object queries with the provided point annotations. +Comprehensive experiments on three VIS benchmarks demonstrate competitive +performance of the proposed framework, nearly matching fully supervised +methods. + +
+
+
+
+
+ + ☆ Bridging Remote Sensors with Multisensor Geospatial Foundation Models CVPR + + +
+ In the realm of geospatial analysis, the diversity of remote sensors, +encompassing both optical and microwave technologies, offers a wealth of +distinct observational capabilities. Recognizing this, we present msGFM, a +multisensor geospatial foundation model that effectively unifies data from four +key sensor modalities. This integration spans an expansive dataset of two +million multisensor images. msGFM is uniquely adept at handling both paired and +unpaired sensor data. For data originating from identical geolocations, our +model employs an innovative cross-sensor pretraining approach in masked image +modeling, enabling the synthesis of joint representations from diverse sensors. +msGFM, incorporating four remote sensors, upholds strong performance, forming a +comprehensive model adaptable to various sensor types. msGFM has demonstrated +enhanced proficiency in a range of both single-sensor and multisensor +downstream tasks. These include scene classification, segmentation, cloud +removal, and pan-sharpening. A key discovery of our research is that +representations derived from natural images are not always compatible with the +distinct characteristics of geospatial remote sensors, underscoring the +limitations of existing representations in this field. Our work can serve as a +guide for developing multisensor geospatial pretraining models, paving the way +for more advanced geospatial capabilities. + +
+
+ comment: Accepted to CVPR +
+
+
+
+
+ + ☆ FireANTs: Adaptive Riemannian Optimization for Multi-Scale Diffeomorphic + Registration + + +
+ Diffeomorphic Image Registration is a critical part of the analysis in +various imaging modalities and downstream tasks like image translation, +segmentation, and atlas building. Registration algorithms based on optimization +have stood the test of time in terms of accuracy, reliability, and robustness +across a wide spectrum of modalities and acquisition settings. However, these +algorithms converge slowly, are prohibitively expensive to run, and their usage +requires a steep learning curve, limiting their scalability to larger clinical +and scientific studies. In this paper, we develop multi-scale Adaptive +Riemannian Optimization algorithms for diffeomorphic image registration. We +demonstrate compelling improvements on image registration across a spectrum of +modalities and anatomies by measuring structural and landmark overlap of the +registered image volumes. Our proposed framework leads to a consistent +improvement in performance, and from 300x up to 2000x speedup over existing +algorithms. Our modular library design makes it easy to use and allows +customization via user-defined cost functions. + +
+
+
+
+
+ + ☆ Scalable Scene Modeling from Perspective Imaging: Physics-based + Appearance and Geometry Inference + + +
+ 3D scene modeling techniques serve as the bedrock of geospatial +engineering and computer science, driving many applications ranging from +automated driving, terrain mapping, and navigation to virtual, augmented, +mixed, and extended reality (for the gaming and movie industries, etc.). This +dissertation presents a set of contributions that advance 3D scene modeling to +its state of the art, in the aspects of both appearance and geometry modeling. +In contrast to the prevailing deep learning methods, as a core contribution, +this thesis aims to develop algorithms that follow first principles, where +sophisticated physics-based models are introduced alongside simpler learning +and inference tasks. The outcomes of these algorithms yield processes that can +consume much larger volumes of data for highly accurate reconstruction of 3D +scenes at scale without losing methodological generality, which is not +possible with contemporary complex-model-based deep learning methods. +Specifically, the dissertation introduces three novel methodologies that +address the challenges of inferring appearance and geometry through +physics-based modeling. + Overall, the research encapsulated in this dissertation marks a series of +methodological triumphs in the processing of complex datasets. By navigating +the confluence of deep learning, computational geometry, and photogrammetry, +this work lays down a robust framework for future exploration and practical +application in the rapidly evolving field of 3D scene reconstruction. The +outcomes of these studies are evidenced through rigorous experiments and +comparisons with existing state-of-the-art methods, demonstrating the efficacy +and scalability of the proposed approaches. + +
+
+ comment: Ph.D. Dissertation, Geospatial Data Analytics Lab, The Ohio State + University, 2024. arXiv admin note: text overlap with arXiv:2108.08378 +
+
+
+
+
+ + ☆ An image speaks a thousand words, but can everyone listen? On + translating images for cultural relevance + + +
+ Given the rise of multimedia content, human translators increasingly focus on +culturally adapting not only words but also other modalities such as images to +convey the same meaning. While several applications stand to benefit from this, +machine translation systems remain confined to dealing with language in speech +and text. In this work, we take a first step towards translating images to make +them culturally relevant. First, we build three pipelines comprising +state-of-the-art generative models to do the task. Next, we build a two-part +evaluation dataset: i) concept: comprising 600 images that are cross-culturally +coherent, focusing on a single concept per image, and ii) application: +comprising 100 images curated from real-world applications. We conduct a +multi-faceted human evaluation of translated images to assess for cultural +relevance and meaning preservation. We find that as of today, image-editing +models fail at this task, but can be improved by leveraging LLMs and retrievers +in the loop. Best pipelines can only translate 5% of images for some countries +in the easier concept dataset and no translation is successful for some +countries in the application dataset, highlighting the challenging nature of +the task. Our code and data is released here: +https://github.com/simran-khanuja/image-transcreation. + +
+
+
+
+
+ + ☆ A Unified and Interpretable Emotion Representation and Expression + Generation CVPR 2024 + + +
+ Canonical emotions, such as happy, sad, and fearful, are easy to understand +and annotate. However, emotions are often compound, e.g., happily surprised, and +can be mapped to the action units (AUs) used for expressing emotions, and +trivially to the canonical ones. Intuitively, emotions are continuous as +represented by the arousal-valence (AV) model. An interpretable unification of +these four modalities - namely, Canonical, Compound, AUs, and AV - is highly +desirable, for a better representation and understanding of emotions. However, +such a unification remains unknown in the current literature. In this work, +we propose an interpretable and unified emotion model, referred to as C2A2. We +also develop a method that leverages labels of the non-unified models to +annotate the novel unified one. Finally, we modify the text-conditional +diffusion models to understand continuous numbers, which are then used to +generate continuous expressions using our unified emotion model. Through +quantitative and qualitative experiments, we show that our generated images are +rich and capture subtle expressions. Our work allows a fine-grained generation +of expressions in conjunction with other textual inputs and offers a new label +space for emotions at the same time. + +
+
+ comment: 10 pages, 9 figures, 3 tables Accepted at CVPR 2024. Project page: + https://emotion-diffusion.github.io +
+
+
+
+
+ + ☆ AURORA: Navigating UI Tarpits via Automated Neural Screen Understanding + + +
+ Nearly a decade of research in software engineering has focused on automating +mobile app testing to help engineers in overcoming the unique challenges +associated with the software platform. Much of this work has come in the form +of Automated Input Generation tools (AIG tools) that dynamically explore app +screens. However, such tools have repeatedly been demonstrated to achieve +lower-than-expected code coverage - particularly on sophisticated proprietary +apps. Prior work has illustrated that a primary cause of these coverage +deficiencies is related to so-called tarpits, or complex screens that are +difficult to navigate. + In this paper, we take a critical step toward enabling AIG tools to +effectively navigate tarpits during app exploration through a new form of +automated semantic screen understanding. We introduce AURORA, a technique that +learns from the visual and textual patterns that exist in mobile app UIs to +automatically detect common screen designs and navigate them accordingly. The +key idea of AURORA is that there are a finite number of mobile app screen +designs, albeit with subtle variations, such that the general patterns of +different categories of UI designs can be learned. As such, AURORA employs a +multi-modal, neural screen classifier that is able to recognize the most common +types of UI screen designs. After recognizing a given screen, it then applies a +set of flexible and generalizable heuristics to properly navigate the screen. +We evaluated AURORA both on a set of 12 apps with known tarpits from prior +work, and on a new set of five of the most popular apps from the Google Play +store. Our results indicate that AURORA is able to effectively navigate tarpit +screens, outperforming prior approaches that avoid tarpits by 19.6% in terms of +method coverage. The improvements can be attributed to AURORA's UI design +classification and heuristic navigation techniques. + +
+
+ comment: Published at 17th IEEE International Conference on Software Testing, + Verification and Validation (ICST) 2024, 12 pages +
+
+
+
+
+ + ☆ Feature Splatting: Language-Driven Physics-Based Scene Synthesis and + Editing + + +
+ Scene representations using 3D Gaussian primitives have produced excellent +results in modeling the appearance of static and dynamic 3D scenes. Many +graphics applications, however, demand the ability to manipulate both the +appearance and the physical properties of objects. We introduce Feature +Splatting, an approach that unifies physics-based dynamic scene synthesis with +rich semantics from vision language foundation models that are grounded by +natural language. Our first contribution is a way to distill high-quality, +object-centric vision-language features into 3D Gaussians, that enables +semi-automatic scene decomposition using text queries. Our second contribution +is a way to synthesize physics-based dynamics from an otherwise static scene +using a particle-based simulator, in which material properties are assigned +automatically via text queries. We ablate key techniques used in this pipeline, +to illustrate the challenge and opportunities in using feature-carrying 3D +Gaussians as a unified format for appearance, geometry, material properties and +semantics grounded on natural language. Project website: +https://feature-splatting.github.io/ + +
+
+ comment: Project website: https://feature-splatting.github.io/ +
+
+
+
+
+ + ☆ Entity-Centric Reinforcement Learning for Object Manipulation from + Pixels ICLR 2024 + + +
+ Manipulating objects is a hallmark of human intelligence, and an important +task in domains such as robotics. In principle, Reinforcement Learning (RL) +offers a general approach to learn object manipulation. In practice, however, +domains with more than a few objects are difficult for RL agents due to the +curse of dimensionality, especially when learning from raw image observations. +In this work we propose a structured approach for visual RL that is suitable +for representing multiple objects and their interaction, and use it to learn +goal-conditioned manipulation of several objects. Key to our method is the +ability to handle goals with dependencies between the objects (e.g., moving +objects in a certain order). We further relate our architecture to the +generalization capability of the trained agent, based on a theoretical result +for compositional generalization, and demonstrate agents that learn with 3 +objects but generalize to similar tasks with over 10 objects. Videos and code +are available on the project website: +https://sites.google.com/view/entity-centric-rl + +
+
+ comment: ICLR 2024 Spotlight. Videos and code are available on the project + website: https://sites.google.com/view/entity-centric-rl +
+
+
+
+
+ + ☆ Vision-language models for decoding provider attention during neonatal + resuscitation + + +
+ Neonatal resuscitations demand an exceptional level of attentiveness from +providers, who must process multiple streams of information simultaneously. +Gaze strongly influences decision making; thus, understanding where a provider +is looking during neonatal resuscitations could inform provider training, +enhance real-time decision support, and improve the design of delivery rooms +and neonatal intensive care units (NICUs). Current approaches to quantifying +neonatal providers' gaze rely on manual coding or simulations, which limit +scalability and utility. Here, we introduce an automated, real-time, deep +learning approach capable of decoding provider gaze into semantic classes +directly from first-person point-of-view videos recorded during live +resuscitations. Combining state-of-the-art, real-time segmentation with +vision-language models (CLIP), our low-shot pipeline attains 91\% +classification accuracy in identifying gaze targets without training. Upon +fine-tuning, the performance of our gaze-guided vision transformer exceeds 98\% +accuracy in gaze classification, approaching human-level precision. This +system, capable of real-time inference, enables objective quantification of +provider attention dynamics during live neonatal resuscitation. Our approach +offers a scalable solution that seamlessly integrates with existing +infrastructure for data-scarce gaze analysis, thereby offering new +opportunities for understanding and refining clinical decision making. + +
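+
+ A minimal sketch of the zero-shot CLIP step in such a pipeline, assuming the Hugging Face transformers CLIP interface; the label set and the gaze-centered crop are placeholders, and the paper's full system additionally uses real-time segmentation and a fine-tuned gaze-guided vision transformer.
+
+ import torch
+ from PIL import Image
+ from transformers import CLIPModel, CLIPProcessor
+
+ # Candidate semantic gaze targets (hypothetical label set).
+ LABELS = ["the infant", "the monitor", "a colleague", "medical equipment"]
+
+ model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ def classify_gaze_crop(crop: Image.Image) -> str:
+     """Zero-shot classify the image region around the gaze point."""
+     prompts = [f"a photo of {c}" for c in LABELS]
+     inputs = processor(text=prompts, images=crop, return_tensors="pt", padding=True)
+     with torch.no_grad():
+         logits = model(**inputs).logits_per_image   # (1, num_labels)
+     return LABELS[logits.softmax(dim=-1).argmax().item()]
+
+ # crop = Image.open("gaze_crop.png")   # region around the gaze point
+ # print(classify_gaze_crop(crop))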
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ Video Interpolation with Diffusion Models CVPR 2024 + + +
+ We present VIDIM, a generative model for video interpolation, which creates +short videos given a start and end frame. In order to achieve high fidelity and +generate motions unseen in the input data, VIDIM uses cascaded diffusion models +to first generate the target video at low resolution, and then generate the +high-resolution video conditioned on the low-resolution generated video. We +compare VIDIM to previous state-of-the-art methods on video interpolation, and +demonstrate how such works fail in most settings where the underlying motion is +complex, nonlinear, or ambiguous while VIDIM can easily handle such cases. We +additionally demonstrate how classifier-free guidance on the start and end +frame and conditioning the super-resolution model on the original +high-resolution frames without additional parameters unlocks high-fidelity +results. VIDIM is fast to sample from as it jointly denoises all the frames to +be generated, requires less than a billion parameters per diffusion model to +produce compelling results, and still enjoys scalability and improved quality +at larger parameter counts. + +
+
+ comment: CVPR 2024, Project page at https://vidim-interpolation.github.io/ +
+
+
+
+
+ + ☆ Getting it Right: Improving Spatial Consistency in Text-to-Image Models + + +
+ One of the key shortcomings in current text-to-image (T2I) models is their +inability to consistently generate images which faithfully follow the spatial +relationships specified in the text prompt. In this paper, we offer a +comprehensive investigation of this limitation, while also developing datasets +and methods that achieve state-of-the-art performance. First, we find that +current vision-language datasets do not represent spatial relationships well +enough; to alleviate this bottleneck, we create SPRIGHT, the first +spatially-focused, large scale dataset, by re-captioning 6 million images from +4 widely used vision datasets. Through a 3-fold evaluation and analysis +pipeline, we find that SPRIGHT largely improves upon existing datasets in +capturing spatial relationships. To demonstrate its efficacy, we leverage only +~0.25% of SPRIGHT and achieve a 22% improvement in generating spatially +accurate images while also improving the FID and CMMD scores. Secondly, we find +that training on images containing a large number of objects results in +substantial improvements in spatial consistency. Notably, we attain +state-of-the-art on T2I-CompBench with a spatial score of 0.2133, by +fine-tuning on <500 images. Finally, through a set of controlled experiments +and ablations, we document multiple findings that we believe will enhance the +understanding of factors that affect spatial consistency in text-to-image +models. We publicly release our dataset and model to foster further research in +this area. + +
+
+ comment: project webpage : https://spright-t2i.github.io/ +
+
+
+
+
+ + ☆ Adaptive Query Prompting for Multi-Domain Landmark Detection + + +
+ Medical landmark detection is crucial in various medical imaging modalities +and procedures. Although deep learning-based methods have achieved promising +performance, they are mostly designed for specific anatomical regions or tasks. +In this work, we propose a universal model for multi-domain landmark detection +by leveraging the transformer architecture and developing a prompting component +named Adaptive Query Prompting (AQP). Instead of embedding additional +modules in the backbone network, we design a separate module to generate +prompts that can be effectively extended to any other transformer network. In +our proposed AQP, prompts are learnable parameters maintained in a memory space +called the prompt pool. The central idea is to keep the backbone frozen and then +optimize prompts to instruct the model inference process. Furthermore, we +employ a lightweight decoder to decode landmarks from the extracted features, +namely Light-MLD. Thanks to the lightweight nature of the decoder and AQP, we +can handle multiple datasets by sharing the backbone encoder and then only +perform partial parameter tuning without incurring much additional cost. It has +the potential to be extended to more landmark detection tasks. We conduct +experiments on three widely used X-ray datasets for different medical landmark +detection tasks. Our proposed Light-MLD coupled with AQP achieves SOTA +performance on many metrics even without the use of elaborate structural +designs or complex frameworks. + +
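+
+ A toy sketch of the prompt-pool idea (frozen backbone, learnable prompts prepended to the token sequence); the pool size, prompt length, and mean-pooled query used below are illustrative choices, not the paper's exact design.
+
+ import torch
+ import torch.nn as nn
+
+ class PromptPool(nn.Module):
+     """Learnable prompts kept in a memory pool; a query selects the top-k
+     prompts, which are prepended to the (frozen) backbone's token sequence."""
+
+     def __init__(self, pool_size=20, prompt_len=4, dim=256, top_k=3):
+         super().__init__()
+         self.keys = nn.Parameter(torch.randn(pool_size, dim))
+         self.prompts = nn.Parameter(torch.randn(pool_size, prompt_len, dim))
+         self.top_k = top_k
+
+     def forward(self, tokens):                    # tokens: (B, N, D)
+         query = tokens.mean(dim=1)                # simple image-level query (B, D)
+         sim = torch.einsum("bd,pd->bp", query, self.keys)
+         idx = sim.topk(self.top_k, dim=-1).indices          # (B, k)
+         chosen = self.prompts[idx]                           # (B, k, L, D)
+         chosen = chosen.flatten(1, 2)                        # (B, k*L, D)
+         return torch.cat([chosen, tokens], dim=1)            # prompts prepended
+
+ backbone = nn.TransformerEncoderLayer(d_model=256, nhead=8, batch_first=True)
+ for p in backbone.parameters():
+     p.requires_grad_(False)                       # keep the backbone frozen
+
+ pool = PromptPool()
+ tokens = torch.randn(2, 196, 256)                 # e.g. patch tokens from the encoder
+ out = backbone(pool(tokens))                      # only the prompt pool is trainable
+ print(out.shape)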
+
+
+
+
+ + ☆ iMD4GC: Incomplete Multimodal Data Integration to Advance Precise + Treatment Response Prediction and Survival Analysis for Gastric Cancer + + +
+ Gastric cancer (GC) is a prevalent malignancy worldwide, ranking as the fifth +most common cancer with over 1 million new cases and 700 thousand deaths in +2020. Locally advanced gastric cancer (LAGC) accounts for approximately +two-thirds of GC diagnoses, and neoadjuvant chemotherapy (NACT) has emerged as +the standard treatment for LAGC. However, the effectiveness of NACT varies +significantly among patients, with a considerable subset displaying treatment +resistance. Ineffective NACT not only leads to adverse effects but also misses +the optimal therapeutic window, resulting in lower survival rate. However, +existing multimodal learning methods assume the availability of all modalities +for each patient, which does not align with the reality of clinical practice. +The limited availability of modalities for each patient would cause information +loss, adversely affecting predictive accuracy. In this study, we propose an +incomplete multimodal data integration framework for GC (iMD4GC) to address the +challenges posed by incomplete multimodal data, enabling precise response +prediction and survival analysis. Specifically, iMD4GC incorporates unimodal +attention layers for each modality to capture intra-modal information. +Subsequently, the cross-modal interaction layers explore potential inter-modal +interactions and capture complementary information across modalities, thereby +enabling information compensation for missing modalities. To evaluate iMD4GC, +we collected three multimodal datasets for GC study: GastricRes (698 cases) for +response prediction, GastricSur (801 cases) for survival analysis, and +TCGA-STAD (400 cases) for survival analysis. The scale of our datasets is +significantly larger than previous studies. The iMD4GC achieved impressive +performance with an 80.2% AUC on GastricRes, 71.4% C-index on GastricSur, and +66.1% C-index on TCGA-STAD, significantly surpassing other compared methods. + +
+
+ comment: 27 pages, 9 figures, 3 tables (under review) +
+
+
+
+
+ + ☆ BEM: Balanced and Entropy-based Mix for Long-Tailed Semi-Supervised + Learning CVPR 2024 + + +
+ Data mixing methods play a crucial role in semi-supervised learning (SSL), +but their application is unexplored in long-tailed semi-supervised learning +(LTSSL). The primary reason is that the in-batch mixing manner fails to address +class imbalance. Furthermore, existing LTSSL methods mainly focus on +re-balancing data quantity but ignore class-wise uncertainty, which is also +vital for class balance. For instance, some classes with sufficient samples +might still exhibit high uncertainty due to indistinguishable features. To this +end, this paper introduces the Balanced and Entropy-based Mix (BEM), a +pioneering mixing approach to re-balance the class distribution of both data +quantity and uncertainty. Specifically, we first propose a class balanced mix +bank to store data of each class for mixing. This bank samples data based on +the estimated quantity distribution, thus re-balancing data quantity. Then, we +present an entropy-based learning approach to re-balance class-wise +uncertainty, including entropy-based sampling strategy, entropy-based selection +module, and entropy-based class balanced loss. Our BEM first leverages data +mixing for improving LTSSL, and it can also serve as a complement to the +existing re-balancing methods. Experimental results show that BEM significantly +enhances various LTSSL frameworks and achieves state-of-the-art performances +across multiple benchmarks. + +
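+
+ A toy sketch of a class-balanced mix bank in the spirit of the description above; it covers only quantity re-balancing and plain MixUp, whereas BEM additionally uses entropy-based sampling, selection, and losses.
+
+ import random
+ import numpy as np
+
+ class BalancedMixBank:
+     """Store a few samples per class and draw mixing partners with probability
+     inversely proportional to the estimated class frequency, so rare classes
+     are mixed in more often. (Illustrative only.)"""
+
+     def __init__(self, num_classes, max_per_class=64):
+         self.banks = [[] for _ in range(num_classes)]
+         self.counts = np.zeros(num_classes)
+         self.max_per_class = max_per_class
+
+     def add(self, x, y):
+         self.counts[y] += 1
+         bank = self.banks[y]
+         if len(bank) < self.max_per_class:
+             bank.append(x)
+         else:
+             bank[random.randrange(self.max_per_class)] = x   # reservoir-style overwrite
+
+     def sample_partner(self):
+         inv = 1.0 / np.maximum(self.counts, 1.0)
+         inv[[len(b) == 0 for b in self.banks]] = 0.0          # skip empty banks
+         probs = inv / inv.sum()
+         c = np.random.choice(len(self.banks), p=probs)
+         return random.choice(self.banks[c]), c
+
+ def mixup(x, x_bank, alpha=0.75):
+     lam = np.random.beta(alpha, alpha)
+     return lam * x + (1 - lam) * x_bank, lam
+
+ # Toy usage with scalar "images" drawn from a long-tailed stream.
+ bank = BalancedMixBank(num_classes=3)
+ for y, n in enumerate([100, 10, 1]):
+     for _ in range(n):
+         bank.add(np.random.randn(), y)
+ x_mix, lam = mixup(np.random.randn(), bank.sample_partner()[0])
+ print(x_mix, lam)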
+
+ comment: This paper is accepted to CVPR 2024. The supplementary material is + included +
+
+
+
+
+ + ☆ SpikeMba: Multi-Modal Spiking Saliency Mamba for Temporal Video + Grounding + + +
+ Temporal video grounding (TVG) is a critical task in video content +understanding. Despite significant advancements, existing methods are often +limited in capturing the fine-grained relationships between multimodal inputs +and incur high computational costs when processing long video sequences. To +address these limitations, we introduce SpikeMba, a novel multi-modal spiking +saliency Mamba for temporal video grounding. In our work, we integrate Spiking +Neural Networks (SNNs) and state space models (SSMs) to capture the fine-grained +relationships of multimodal features effectively. Specifically, we introduce +relevant slots to enhance the model's memory capabilities, enabling a +deeper contextual understanding of video sequences. The contextual moment +reasoner leverages these slots to maintain a balance between contextual +information preservation and semantic relevance exploration. Simultaneously, +the spiking saliency detector capitalizes on the unique properties of SNNs to +accurately locate salient proposals. Our experiments demonstrate the +effectiveness of SpikeMba, which consistently outperforms state-of-the-art +methods across mainstream benchmarks. + +
+
+
+
+
+ + ☆ Mirror-3DGS: Incorporating Mirror Reflections into 3D Gaussian Splatting + + +
+ 3D Gaussian Splatting (3DGS) has marked a significant breakthrough in the +realm of 3D scene reconstruction and novel view synthesis. However, 3DGS, much +like its predecessor Neural Radiance Fields (NeRF), struggles to accurately +model physical reflections, particularly in mirrors that are ubiquitous in +real-world scenes. This oversight mistakenly perceives reflections as separate +entities that physically exist, resulting in inaccurate reconstructions and +inconsistent reflective properties across varied viewpoints. To address this +pivotal challenge, we introduce Mirror-3DGS, an innovative rendering framework +devised to master the intricacies of mirror geometries and reflections, paving +the way for the generation of realistically depicted mirror reflections. By +ingeniously incorporating mirror attributes into the 3DGS and leveraging the +principle of plane mirror imaging, Mirror-3DGS crafts a mirrored viewpoint to +observe from behind the mirror, enriching the realism of scene renderings. +Extensive assessments, spanning both synthetic and real-world scenes, showcase +our method's ability to render novel views with enhanced fidelity in real-time, +surpassing the state-of-the-art Mirror-NeRF specifically within the challenging +mirror regions. Our code will be made publicly available for reproducible +research. + +
+
+ comment: 22 pages, 7 figures +
+
+
+
+
+ + ☆ Diagnosis of Skin Cancer Using VGG16 and VGG19 Based Transfer Learning + Models + + +
+ Today, skin cancer is considered one of the most dangerous and most common +cancers in the world, demanding special attention. Skin cancer can develop in +different types, including melanoma, actinic keratosis, basal cell +carcinoma, squamous cell carcinoma, and Merkel cell carcinoma. Among them, +melanoma is the most unpredictable. Melanoma can be diagnosed at early +stages, increasing the possibility of successful treatment. Automatic +classification of skin lesions is a challenging task due to the diverse forms +and grades of the disease, demanding the implementation of novel methods. Deep +convolutional neural networks (CNNs) have shown excellent potential for data +and image classification. In this article, we investigate the skin lesion +classification problem using CNN techniques. Remarkably, we show that high +classification accuracy for lesion detection can be obtained by properly +designing and applying a transfer learning framework to pre-trained neural +networks, without any requirement for data enlargement procedures: VGG16 and +VGG19 architectures pre-trained on a generic dataset are merged with a modified +AlexNet network and then fine-tuned on a subject-specific dataset containing +dermatology images. The convolutional neural network was trained using 2541 +images and, in particular, dropout was used to prevent the network from +overfitting. Finally, the validity of the model was checked by applying the +K-fold cross-validation method. The proposed model increased classification +accuracy by 3% (from 94.2% to 98.18%) in comparison with other methods. + +
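+
+ A minimal transfer-learning sketch in the spirit of the approach above, using torchvision's pre-trained VGG16 (torchvision >= 0.13); the class count, classifier head, and learning rate are placeholders, and the paper's full model additionally merges VGG19 and a modified AlexNet.
+
+ import torch
+ import torch.nn as nn
+ from torchvision import models
+
+ NUM_CLASSES = 2   # e.g. melanoma vs. benign (placeholder)
+
+ model = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)
+ for p in model.features.parameters():
+     p.requires_grad_(False)                        # freeze the convolutional backbone
+
+ # Replace the classifier head; dropout helps prevent overfitting on a small dataset.
+ model.classifier = nn.Sequential(
+     nn.Linear(512 * 7 * 7, 4096), nn.ReLU(inplace=True), nn.Dropout(0.5),
+     nn.Linear(4096, 1024), nn.ReLU(inplace=True), nn.Dropout(0.5),
+     nn.Linear(1024, NUM_CLASSES),
+ )
+
+ optimizer = torch.optim.Adam(
+     [p for p in model.parameters() if p.requires_grad], lr=1e-4)
+ criterion = nn.CrossEntropyLoss()
+
+ # One illustrative training step on dummy data (stand-in for dermatology images).
+ images, labels = torch.randn(4, 3, 224, 224), torch.randint(0, NUM_CLASSES, (4,))
+ loss = criterion(model(images), labels)
+ loss.backward()
+ optimizer.step()
+ print(float(loss))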
+
+ comment: 15 pages, journal +
+
+
+
+
+ + ☆ SyncMask: Synchronized Attentional Masking for Fashion-centric + Vision-Language Pretraining CVPR2024 + + +
+ Vision-language models (VLMs) have made significant strides in cross-modal +understanding through large-scale paired datasets. However, in the fashion +domain, datasets often exhibit a disparity between the information conveyed in +image and text. This issue stems from datasets containing multiple images of a +single fashion item all paired with one text, leading to cases where some +textual details are not visible in individual images. This mismatch, +particularly when non-co-occurring elements are masked, undermines the training +of conventional VLM objectives like Masked Language Modeling and Masked Image +Modeling, thereby hindering the model's ability to accurately align +fine-grained visual and textual features. Addressing this problem, we propose +Synchronized attentional Masking (SyncMask), which generates masks that +pinpoint the image patches and word tokens where the information co-occurs in +both image and text. This synchronization is accomplished by harnessing +cross-attentional features obtained from a momentum model, ensuring a precise +alignment between the two modalities. Additionally, we enhance grouped batch +sampling with semi-hard negatives, effectively mitigating false negative issues +in Image-Text Matching and Image-Text Contrastive learning objectives within +fashion datasets. Our experiments demonstrate the effectiveness of the proposed +approach, outperforming existing methods in three downstream tasks. + +
+
+ comment: CVPR2024 Accepted +
+
+
+
+
+ + ☆ Uncovering the Text Embedding in Text-to-Image Diffusion Models + + +
+ The correspondence between input text and the generated image exhibits +opacity, wherein minor textual modifications can induce substantial deviations +in the generated image. Meanwhile, the text embedding, as the pivotal +intermediary between text and images, remains relatively underexplored. In this +paper, we address this research gap by delving into the text embedding space, +unleashing its capacity for controllable image editing and explicable semantic +direction attributes within a learning-free framework. Specifically, we +identify two critical insights regarding the importance of per-word embeddings +and their contextual correlations within the text embedding, providing +instructive principles for learning-free image editing. Additionally, we find +that the text embedding inherently possesses diverse semantic potentials, and +further reveal this property through the lens of singular value decomposition +(SVD). These uncovered properties offer practical utility for image editing and +semantic discovery. More importantly, we expect that the in-depth analyses and +findings on the text embedding can enhance the understanding of text-to-image +diffusion models. + +
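+
+ A minimal sketch of the SVD analysis mentioned above: stack the per-word embeddings of a prompt, decompose them, and nudge the embedding along one singular direction. The random embeddings and the chosen direction below are placeholders for a real text encoder's output and for the semantic directions the paper uncovers.
+
+ import torch
+
+ def semantic_directions(text_emb: torch.Tensor):
+     """text_emb: (num_tokens, dim) per-word embeddings of one prompt.
+     Returns singular values and right-singular vectors (candidate
+     semantic directions in the embedding space)."""
+     # full_matrices=False keeps Vh at (num_tokens, dim); directions are the rows of Vh
+     U, S, Vh = torch.linalg.svd(text_emb, full_matrices=False)
+     return S, Vh
+
+ def edit_along_direction(text_emb, direction, strength=1.0):
+     """Shift every token embedding along one semantic direction."""
+     return text_emb + strength * direction.unsqueeze(0)
+
+ # Toy usage with random embeddings standing in for a real text encoder output.
+ emb = torch.randn(12, 768)             # 12 tokens, 768-dim embeddings
+ S, Vh = semantic_directions(emb)
+ edited = edit_along_direction(emb, Vh[0], strength=0.5)
+ print(S[:3], edited.shape)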
+
+
+
+
+ + ☆ Detect2Interact: Localizing Object Key Field in Visual Question + Answering (VQA) with LLMs + + +
+ Localization plays a crucial role in enhancing the practicality and precision +of VQA systems. By enabling fine-grained identification and interaction with +specific parts of an object, it significantly improves the system's ability to +provide contextually relevant and spatially accurate responses, crucial for +applications in dynamic environments like robotics and augmented reality. +However, traditional systems face challenges in accurately mapping objects +within images to generate nuanced and spatially aware responses. In this work, +we introduce "Detect2Interact", which addresses these challenges by introducing +an advanced approach for fine-grained object visual key field detection. First, +we use the segment anything model (SAM) to generate detailed spatial maps of +objects in images. Next, we use Vision Studio to extract semantic object +descriptions. Third, we employ GPT-4's common sense knowledge, bridging the gap +between an object's semantics and its spatial map. As a result, Detect2Interact +achieves consistent qualitative results on object key field detection across +extensive test cases and outperforms the existing VQA system with object +detection by providing a more reasonable and finer visual representation. + +
+
+ comment: Accepted to IEEE Intelligent Systems +
+
+
+
+
+ + ☆ Condition-Aware Neural Network for Controlled Image Generation CVPR 2024 + + +
+ We present Condition-Aware Neural Network (CAN), a new method for adding +control to image generative models. In parallel to prior conditional control +methods, CAN controls the image generation process by dynamically manipulating +the weight of the neural network. This is achieved by introducing a +condition-aware weight generation module that generates conditional weight for +convolution/linear layers based on the input condition. We test CAN on +class-conditional image generation on ImageNet and text-to-image generation on +COCO. CAN consistently delivers significant improvements for diffusion +transformer models, including DiT and UViT. In particular, CAN combined with +EfficientViT (CaT) achieves 2.78 FID on ImageNet 512x512, surpassing DiT-XL/2 +while requiring 52x fewer MACs per sampling step. + +
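+
+ A toy sketch of condition-aware weight generation: a small module maps the condition embedding to the kernel of a convolution, which is then applied per sample via a grouped convolution. The shapes and the single generated layer are illustrative choices, not the paper's architecture.
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class ConditionAwareConv(nn.Module):
+     """Generates the conv kernel from the condition embedding, so the layer's
+     weights (not just its inputs) depend on the class/text condition."""
+
+     def __init__(self, cond_dim, in_ch, out_ch, k=3):
+         super().__init__()
+         self.in_ch, self.out_ch, self.k = in_ch, out_ch, k
+         self.weight_gen = nn.Linear(cond_dim, out_ch * in_ch * k * k)
+
+     def forward(self, x, cond):                   # x: (B, C, H, W), cond: (B, D)
+         B = x.shape[0]
+         w = self.weight_gen(cond).view(B * self.out_ch, self.in_ch, self.k, self.k)
+         # Grouped conv applies a different generated kernel to each sample.
+         x = x.reshape(1, B * self.in_ch, *x.shape[2:])
+         y = F.conv2d(x, w, padding=self.k // 2, groups=B)
+         return y.reshape(B, self.out_ch, *x.shape[2:])
+
+ layer = ConditionAwareConv(cond_dim=128, in_ch=16, out_ch=32)
+ feat = torch.randn(4, 16, 8, 8)
+ cond = torch.randn(4, 128)                        # e.g. class/timestep embedding
+ print(layer(feat, cond).shape)                    # -> (4, 32, 8, 8)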
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Structured Initialization for Attention in Vision Transformers + + +
+ The training of vision transformer (ViT) networks on small-scale datasets +poses a significant challenge. By contrast, convolutional neural networks +(CNNs) have an architectural inductive bias enabling them to perform well on +such problems. In this paper, we argue that the architectural bias inherent to +CNNs can be reinterpreted as an initialization bias within ViT. This insight is +significant as it empowers ViTs to perform equally well on small-scale problems +while maintaining their flexibility for large-scale applications. Our +inspiration for this ``structured'' initialization stems from our empirical +observation that random impulse filters can achieve comparable performance to +learned filters within CNNs. Our approach achieves state-of-the-art performance +for data-efficient ViT learning across numerous benchmarks including CIFAR-10, +CIFAR-100, and SVHN. + +
+
+ comment: 20 pages, 5 figures, 8 tables +
+
+
+
+
+ + ☆ CityGaussian: Real-time High-quality Large-Scale Scene Rendering with + Gaussians + + +
+ The advancement of real-time 3D scene reconstruction and novel view synthesis +has been significantly propelled by 3D Gaussian Splatting (3DGS). However, +effectively training large-scale 3DGS and rendering it in real-time across +various scales remains challenging. This paper introduces CityGaussian +(CityGS), which employs a novel divide-and-conquer training approach and +Level-of-Detail (LoD) strategy for efficient large-scale 3DGS training and +rendering. Specifically, the global scene prior and adaptive training data +selection enable efficient training and seamless fusion. Based on fused +Gaussian primitives, we generate different detail levels through compression, +and realize fast rendering across various scales through the proposed +block-wise detail level selection and aggregation strategy. Extensive +experimental results on large-scale scenes demonstrate that our approach +attains state-of-the-art rendering quality, enabling consistent real-time +rendering of large-scale scenes across vastly different scales. Our project +page is available at https://dekuliutesla.github.io/citygs/. + +
+
+ comment: Project Page: https://dekuliutesla.github.io/citygs/ +
+
+
+
+
+ + ☆ Medical Visual Prompting (MVP): A Unified Framework for Versatile and + High-Quality Medical Image Segmentation + + +
+ Accurate segmentation of lesion regions is crucial for clinical diagnosis and +treatment across various diseases. While deep convolutional networks have +achieved satisfactory results in medical image segmentation, they face +challenges such as loss of lesion shape information due to continuous +convolution and downsampling, as well as the high cost of manually labeling +lesions with varying shapes and sizes. To address these issues, we propose a +novel medical visual prompting (MVP) framework that leverages pre-training and +prompting concepts from natural language processing (NLP). The framework +utilizes three key components: Super-Pixel Guided Prompting (SPGP) for +superpixelating the input image, Image Embedding Guided Prompting (IEGP) for +freezing patch embedding and merging with superpixels to provide visual +prompts, and Adaptive Attention Mechanism Guided Prompting (AAGP) for +pinpointing prompt content and efficiently adapting all layers. By integrating +SPGP, IEGP, and AAGP, the MVP enables the segmentation network to better learn +shape prompting information and facilitates mutual learning across different +tasks. Extensive experiments conducted on five datasets demonstrate superior +performance of this method in various challenging medical image tasks, while +simplifying single-task medical segmentation models. This novel framework +offers improved performance with fewer parameters and holds significant +potential for accurate segmentation of lesion regions in various medical tasks, +making it clinically valuable. + +
+
+
+
+
+ + ☆ CLIPtone: Unsupervised Learning for Text-based Image Tone Adjustment + + +
+ Recent image tone adjustment (or enhancement) approaches have predominantly +adopted supervised learning for learning human-centric perceptual assessment. +However, these approaches are constrained by intrinsic challenges of supervised +learning. Primarily, the requirement for expertly-curated or retouched images +escalates the data acquisition expenses. Moreover, their coverage of target +style is confined to stylistic variants inferred from the training data. To +surmount the above challenges, we propose CLIPtone, an unsupervised +learning-based approach for text-based image tone adjustment that extends an +existing image enhancement method to accommodate natural language descriptions. +Specifically, we design a hyper-network to adaptively modulate the pretrained +parameters of the backbone model based on the text description. To assess +whether the adjusted image aligns with the text description without a ground +truth image, we utilize CLIP, which is trained on a vast set of language-image +pairs and thus encompasses knowledge of human perception. The major advantages +of our approach are threefold: (i) minimal data collection expenses, (ii) +support for a range of adjustments, and (iii) the ability to handle novel text +descriptions unseen in training. Our approach's efficacy is demonstrated +through comprehensive experiments, including a user study. + +
+
+
+
+
+ + ☆ CMT: Cross Modulation Transformer with Hybrid Loss for Pansharpening + + +
+ Pansharpening aims to enhance remote sensing image (RSI) quality by merging +high-resolution panchromatic (PAN) with multispectral (MS) images. However, +prior techniques struggled to optimally fuse PAN and MS images for enhanced +spatial and spectral information, due to a lack of a systematic framework +capable of effectively coordinating their individual strengths. In response, we +present the Cross Modulation Transformer (CMT), a pioneering method that +modifies the attention mechanism. This approach utilizes a robust modulation +technique from signal processing, integrating it into the attention mechanism's +calculations. It dynamically tunes the weights of the carrier's value (V) +matrix according to the modulator's features, thus resolving historical +challenges and achieving a seamless integration of spatial and spectral +attributes. Furthermore, considering that RSI exhibits large-scale features and +edge details along with local textures, we crafted a hybrid loss function that +combines Fourier and wavelet transforms to effectively capture these +characteristics, thereby enhancing both spatial and spectral accuracy in +pansharpening. Extensive experiments demonstrate our framework's superior +performance over existing state-of-the-art methods. The code will be publicly +available to encourage further research. + +
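+
+ A rough sketch of a hybrid frequency-domain loss of the kind described above, combining an L1 term on Fourier magnitudes with an L1 term on one-level Haar wavelet sub-bands; the weights and the exact transforms are placeholders rather than the paper's formulation.
+
+ import torch
+ import torch.nn.functional as F
+
+ def fourier_loss(pred, target):
+     """L1 distance between 2-D Fourier magnitudes (captures global structure)."""
+     return F.l1_loss(torch.fft.fft2(pred).abs(), torch.fft.fft2(target).abs())
+
+ def haar_wavelet_loss(pred, target):
+     """L1 distance between one-level Haar sub-bands (captures edges/textures)."""
+     def haar(x):                                  # x: (B, C, H, W), H and W even
+         a = x[..., 0::2, 0::2]; b = x[..., 0::2, 1::2]
+         c = x[..., 1::2, 0::2]; d = x[..., 1::2, 1::2]
+         ll, lh = (a + b + c + d) / 2, (a - b + c - d) / 2
+         hl, hh = (a + b - c - d) / 2, (a - b - c + d) / 2
+         return torch.cat([ll, lh, hl, hh], dim=1)
+     return F.l1_loss(haar(pred), haar(target))
+
+ def hybrid_loss(pred, target, w_spatial=1.0, w_fft=0.1, w_wav=0.1):
+     return (w_spatial * F.l1_loss(pred, target)
+             + w_fft * fourier_loss(pred, target)
+             + w_wav * haar_wavelet_loss(pred, target))
+
+ # Toy usage: fused multispectral bands vs. a reference.
+ pred, target = torch.rand(2, 4, 64, 64), torch.rand(2, 4, 64, 64)
+ print(float(hybrid_loss(pred, target)))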
+
+
+
+
+ + ☆ Motion Blur Decomposition with Cross-shutter Guidance CVPR 2024 + + +
+ Motion blur is a frequently observed image artifact, especially under +insufficient illumination where exposure time has to be prolonged so as to +collect more photons for a bright enough image. Rather than simply removing +such blurring effects, recent researches have aimed at decomposing a blurry +image into multiple sharp images with spatial and temporal coherence. Since +motion blur decomposition itself is highly ambiguous, priors from neighbouring +frames or human annotation are usually needed for motion disambiguation. In +this paper, inspired by the complementary exposure characteristics of a global +shutter (GS) camera and a rolling shutter (RS) camera, we propose to utilize +the ordered scanline-wise delay in a rolling shutter image to robustify motion +decomposition of a single blurry image. To evaluate this novel dual imaging +setting, we construct a triaxial system to collect realistic data, as well as a +deep network architecture that explicitly addresses temporal and contextual +information through reciprocal branches for cross-shutter motion blur +decomposition. Experiment results have verified the effectiveness of our +proposed algorithm, as well as the validity of our dual imaging setting. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Diffusion based Zero-shot Medical Image-to-Image Translation for Cross + Modality Segmentation + + +
+ Cross-modality image segmentation aims to segment the target modalities using +a method designed in the source modality. Deep generative models can translate +the target modality images into the source modality, thus enabling +cross-modality segmentation. However, a vast body of existing cross-modality +image translation methods relies on supervised learning. In this work, we aim +to address the challenge of zero-shot learning-based image translation tasks +(an extreme scenario in which the target modality is unseen during the training +phase). To leverage generative learning for zero-shot cross-modality image +segmentation, we propose a novel unsupervised image translation method. The +framework learns to translate the unseen source image to the target modality +for image segmentation by leveraging the inherent statistical consistency +between different modalities for diffusion guidance. Our framework captures +identical cross-modality features in the statistical domain, offering diffusion +guidance without relying on direct mappings between the source and target +domains. This advantage allows our method to adapt to changing source domains +without the need for retraining, making it highly practical when sufficient +labeled source domain data is not available. The proposed framework is +validated in zero-shot cross-modality image segmentation tasks through +empirical comparisons with influential generative models, including +adversarial-based and diffusion-based models. + +
+
+ comment: Neurips 2023 Diffusion Workshop +
+
+
+
+
+ + ☆ UFID: A Unified Framework for Input-level Backdoor Detection on + Diffusion Models + + +
+ Diffusion Models are vulnerable to backdoor attacks, where malicious +attackers inject backdoors by poisoning some parts of the training samples +during the training stage. This poses a serious threat to the downstream users, +who query the diffusion models through the API or directly download them from +the internet. To mitigate the threat of backdoor attacks, there have been a +plethora of investigations on backdoor detections. However, none of them +designed a specialized backdoor detection method for diffusion models, +rendering the area much under-explored. Moreover, these prior methods mainly +focus on the traditional neural networks in the classification task, which +cannot be adapted to the backdoor detections on the generative task easily. +Additionally, most of the prior methods require white-box access to model +weights and architectures, or the probability logits as additional information, +which are not always practical. In this paper, we propose a Unified Framework +for Input-level backdoor Detection (UFID) on the diffusion models, which is +motivated by observations in the diffusion models and further validated with a +theoretical causality analysis. Extensive experiments across different datasets +on both conditional and unconditional diffusion models show that our method +achieves a superb performance on detection effectiveness and run-time +efficiency. The code is available at +https://github.com/GuanZihan/official_UFID. + +
+
+ comment: 20 pages,18 figures +
+
+
+
+
+ + ☆ HairFastGAN: Realistic and Robust Hair Transfer with a Fast + Encoder-Based Approach + + +
+ Our paper addresses the complex task of transferring a hairstyle from a +reference image to an input photo for virtual hair try-on. This task is +challenging due to the need to adapt to various photo poses, the sensitivity of +hairstyles, and the lack of objective metrics. The current state of the art +hairstyle transfer methods use an optimization process for different parts of +the approach, making them inexcusably slow. At the same time, faster +encoder-based models are of very low quality because they either operate in +StyleGAN's W+ space or use other low-dimensional image generators. +Additionally, both approaches have a problem with hairstyle transfer when the +source pose is very different from the target pose, because they either don't +consider the pose at all or deal with it inefficiently. In our paper, we +present the HairFast model, which uniquely solves these problems and achieves +high resolution, near real-time performance, and superior reconstruction +compared to optimization problem-based methods. Our solution includes a new +architecture operating in the FS latent space of StyleGAN, an enhanced +inpainting approach, and improved encoders for better alignment, color +transfer, and a new encoder for post-processing. The effectiveness of our +approach is demonstrated on realism metrics after random hairstyle transfer and +reconstruction when the original hairstyle is transferred. In the most +difficult scenario of transferring both shape and color of a hairstyle from +different images, our method performs in less than a second on the Nvidia V100. +Our code is available at https://github.com/AIRI-Institute/HairFastGAN. + +
+
+
+
+
+ + ☆ Texture-Preserving Diffusion Models for High-Fidelity Virtual Try-On CVPR 2024 + + +
+ Image-based virtual try-on is an increasingly important task for online +shopping. It aims to synthesize images of a specific person wearing a specified +garment. Diffusion model-based approaches have recently become popular, as they +are excellent at image synthesis tasks. However, these approaches usually +employ additional image encoders and rely on the cross-attention mechanism for +texture transfer from the garment to the person image, which affects the +try-on's efficiency and fidelity. To address these issues, we propose a +Texture-Preserving Diffusion (TPD) model for virtual try-on, which enhances the +fidelity of the results and introduces no additional image encoders. +Accordingly, we make contributions from two aspects. First, we propose to +concatenate the masked person and reference garment images along the spatial +dimension and utilize the resulting image as the input for the diffusion +model's denoising UNet. This enables the original self-attention layers +contained in the diffusion model to achieve efficient and accurate texture +transfer. Second, we propose a novel diffusion-based method that predicts a +precise inpainting mask based on the person and reference garment images, +further enhancing the reliability of the try-on results. In addition, we +integrate mask prediction and image synthesis into a single compact model. The +experimental results show that our approach can be applied to various try-on +tasks, e.g., garment-to-person and person-to-person try-ons, and significantly +outperforms state-of-the-art methods on the popular VITON and VITON-HD +databases. + +
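+
+ A minimal sketch of the first contribution described above: the masked person image and the reference garment are concatenated along the spatial (width) dimension so the denoising UNet's own self-attention can transfer texture. In practice this happens on latents inside the diffusion model; the shapes below are placeholders.
+
+ import torch
+
+ def build_tryon_input(person: torch.Tensor,
+                       mask: torch.Tensor,
+                       garment: torch.Tensor) -> torch.Tensor:
+     """Concatenate the masked person image and the reference garment along the
+     spatial (width) dimension, so the denoising UNet's own self-attention can
+     move texture between the two halves (no extra garment encoder needed).
+
+     person, garment: (B, 3, H, W); mask: (B, 1, H, W) with 1 = region to inpaint.
+     Returns a (B, 3, H, 2W) tensor to be noised and fed to the UNet.
+     """
+     masked_person = person * (1.0 - mask)          # blank out the try-on region
+     return torch.cat([masked_person, garment], dim=-1)
+
+ person = torch.rand(1, 3, 256, 192)
+ garment = torch.rand(1, 3, 256, 192)
+ mask = (torch.rand(1, 1, 256, 192) > 0.5).float()
+ x = build_tryon_input(person, mask, garment)
+ print(x.shape)                                     # torch.Size([1, 3, 256, 384])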
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ PhysReaction: Physically Plausible Real-Time Humanoid Reaction Synthesis + via Forward Dynamics Guided 4D Imitation + + +
+ Humanoid Reaction Synthesis is pivotal for creating highly interactive and +empathetic robots that can seamlessly integrate into human environments, +enhancing the way we live, work, and communicate. However, it is difficult to +learn the diverse interaction patterns of multiple humans and generate +physically plausible reactions. The kinematics-based approaches face +challenges, including issues like floating feet, sliding, penetration, and +other problems that defy physical plausibility. The existing physics-based +method often relies on kinematics-based methods to generate reference states, +which struggle with the challenges posed by kinematic noise during action +execution. Constrained by their reliance on diffusion models, these methods are +unable to achieve real-time inference. In this work, we propose a Forward +Dynamics Guided 4D Imitation method to generate physically plausible human-like +reactions. The learned policy is capable of generating physically plausible and +human-like reactions in real-time, significantly improving the speed(x33) and +quality of reactions compared with the existing method. Our experiments on the +InterHuman and Chi3D datasets, along with ablation studies, demonstrate the +effectiveness of our approach. + +
+
+
+
+
+ + ☆ Stale Diffusion: Hyper-realistic 5D Movie Generation Using Old-school + Methods + + +
+ Two years ago, Stable Diffusion achieved super-human performance at +generating images with super-human numbers of fingers. Following the steady +decline of its technical novelty, we propose Stale Diffusion, a method that +solidifies and ossifies Stable Diffusion in a maximum-entropy state. Stable +Diffusion works analogously to a barn (the Stable) from which an infinite set +of horses have escaped (the Diffusion). As the horses have long left the barn, +our proposal may be seen as antiquated and irrelevant. Nevertheless, we +vigorously defend our claim of novelty by identifying as early adopters of the +Slow Science Movement, which will produce extremely important pearls of wisdom +in the future. Our speed of contributions can also be seen as a quasi-static +implementation of the recent call to pause AI experiments, which we +wholeheartedly support. As a result of a careful archaeological expedition to +18-months-old Git commit histories, we found that naturally-accumulating errors +have produced a novel entropy-maximising Stale Diffusion method, that can +produce sleep-inducing hyper-realistic 5D video that is as good as one's +imagination. + +
+
+ comment: SIGBOVIK 2024 +
+
+
+
+
+ + ☆ Prompt Learning for Oriented Power Transmission Tower Detection in + High-Resolution SAR Images + + +
+ Detecting transmission towers from synthetic aperture radar (SAR) images
+remains a challenging task due to the towers' comparatively small size and the
+side-looking imaging geometry, with background clutter interference frequently
+hindering tower identification. A large number of interfering signals is
+superimposed on the return signal from the tower. We found that localizing, or
+prompting, the positions of power transmission towers helps address this
+obstacle. Based on this observation, this paper introduces prompt learning into
+the oriented object detector (P2Det) for multimodal information learning. P2Det
+combines sparse prompt coding with cross-attention between the multimodal data.
+Specifically, a sparse prompt encoder (SPE) is proposed to represent point
+locations, converting prompts into sparse embeddings. The image embeddings are
+generated through the Transformer layers. Then a two-way fusion module (TWFM)
+is proposed to calculate the cross-attention of the two different embeddings.
+The interaction of image-level and prompt-level features is utilized to address
+the clutter interference. A shape-adaptive refinement module (SARM) is proposed
+to reduce the effect of aspect ratio. Extensive experiments demonstrate the
+effectiveness of the proposed model on high-resolution SAR images. P2Det
+provides novel insight into multimodal object detection due to its competitive
+performance.
+
+
+
+ comment: 22 pages, 12 figures
+
+
+
+
+
+ + ☆ T-Mamba: Frequency-Enhanced Gated Long-Range Dependency for Tooth 3D + CBCT Segmentation + + +
+ Efficient tooth segmentation in three-dimensional (3D) imaging, critical for
+orthodontic diagnosis, remains challenging due to noise, low contrast, and
+artifacts in CBCT images. Both Convolutional Neural Networks (CNNs) and
+transformers have emerged as popular architectures for image segmentation.
+However, their efficacy in handling long-range dependencies is limited due to
+inherent locality or computational complexity. To address this issue, we
+propose T-Mamba, integrating shared positional encoding and frequency-based
+features into vision mamba, to address limitations in spatial position
+preservation and feature enhancement in the frequency domain. In addition, we
+design a gate selection unit to adaptively integrate two spatial-domain
+features and one frequency-domain feature. T-Mamba is the first work to
+introduce frequency-based features into vision mamba. Extensive experiments
+demonstrate that T-Mamba achieves new SOTA results on the public Tooth CBCT
+dataset and outperforms previous SOTA methods by a large margin, i.e.,
+IoU +3.63%, SO +2.43%, DSC +2.30%, HD -4.39 mm, and ASSD -0.37 mm. The code
+and models are publicly available at https://github.com/isbrycee/T-Mamba.
+
+
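+ As a rough illustration of the gate selection unit mentioned above, the sketch
+ below adaptively mixes two spatial-domain features with one frequency-domain
+ feature via a learned softmax gate. The layer sizes and the gating form are
+ assumptions, not the released T-Mamba code.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class GateSelectionUnit(nn.Module):
+     """Adaptively fuse two spatial features and one frequency feature."""
+     def __init__(self, dim):
+         super().__init__()
+         # predict one gate weight per branch from the concatenated features
+         self.gate = nn.Sequential(nn.Linear(3 * dim, 3), nn.Softmax(dim=-1))
+
+     def forward(self, spatial_a, spatial_b, freq):
+         # inputs: (B, N, dim) token features
+         stacked = torch.stack([spatial_a, spatial_b, freq], dim=-2)               # (B, N, 3, dim)
+         weights = self.gate(torch.cat([spatial_a, spatial_b, freq], dim=-1))      # (B, N, 3)
+         return (weights.unsqueeze(-1) * stacked).sum(dim=-2)                      # (B, N, dim)
+
+ fuse = GateSelectionUnit(dim=96)
+ a, b, f = (torch.randn(2, 196, 96) for _ in range(3))
+ print(fuse(a, b, f).shape)  # torch.Size([2, 196, 96])
+ ```
+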
+
+
+
+
+ + ☆ Roadside Monocular 3D Detection via 2D Detection Prompting + + +
+ The problem of roadside monocular 3D detection requires detecting objects of
+the classes of interest in a 2D RGB frame and predicting their 3D information
+such as locations in bird's-eye-view (BEV). It has broad applications in
+traffic control, vehicle-vehicle communication, and vehicle-infrastructure
+cooperative perception. To approach this problem, we present a novel and simple
+method by prompting the 3D detector using 2D detections. Our method builds on a
+key insight that, compared with 3D detectors, a 2D detector is much easier to
+train and performs significantly better w.r.t. detections on the 2D image
+plane. Therefore, one can exploit 2D detections of a well-trained 2D detector
+as prompts to a 3D detector, which is trained to inflate such 2D detections to
+3D for 3D detection. To construct better prompts using the 2D detector, we
+explore three techniques: (a) concatenating both 2D and 3D detectors' features,
+(b) attentively fusing 2D and 3D detectors' features, and (c) encoding the
+predicted 2D boxes' x, y, width, height, and label and attentively fusing them
+with the 3D detector's features. Surprisingly, the third performs the best.
+Moreover, we present a yaw tuning tactic and a class-grouping strategy that
+merges classes based on their functionality; these techniques improve 3D
+detection performance further. Comprehensive ablation studies and extensive
+experiments demonstrate that our method resoundingly outperforms prior works,
+achieving state-of-the-art results on two large-scale roadside 3D detection
+benchmarks.
+
+
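+ Variant (c) above, encoding each predicted 2D box's x, y, width, height, and
+ label and attentively fusing the result with the 3D detector's features, might
+ look like the following sketch. The embedding sizes and the use of standard
+ multi-head cross-attention are assumptions rather than the paper's exact
+ design.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class BoxPromptFusion(nn.Module):
+     """Encode 2D boxes (x, y, w, h, label) and fuse them into 3D-detector queries."""
+     def __init__(self, dim, num_classes):
+         super().__init__()
+         self.box_mlp = nn.Sequential(nn.Linear(4, dim), nn.ReLU(), nn.Linear(dim, dim))
+         self.label_emb = nn.Embedding(num_classes, dim)
+         self.attn = nn.MultiheadAttention(dim, num_heads=4, batch_first=True)
+
+     def forward(self, det3d_queries, boxes2d, labels2d):
+         # det3d_queries: (B, Q, dim); boxes2d: (B, K, 4); labels2d: (B, K) long
+         prompts = self.box_mlp(boxes2d) + self.label_emb(labels2d)     # (B, K, dim)
+         fused, _ = self.attn(det3d_queries, prompts, prompts)          # cross-attention
+         return det3d_queries + fused                                   # residual fusion
+
+ m = BoxPromptFusion(dim=256, num_classes=10)
+ q = torch.randn(2, 100, 256)
+ boxes = torch.rand(2, 20, 4)
+ labels = torch.randint(0, 10, (2, 20))
+ print(m(q, boxes, labels).shape)  # torch.Size([2, 100, 256])
+ ```
+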
+
+
+
+
+ + ☆ HAHA: Highly Articulated Gaussian Human Avatars with Textured Mesh Prior + + +
+ We present HAHA, a novel approach for animatable human avatar generation
+from monocular input videos. The proposed method relies on learning the
+trade-off between the use of Gaussian splatting and a textured mesh for
+efficient and high-fidelity rendering. We demonstrate its efficiency in
+animating and rendering full-body human avatars controlled via the SMPL-X
+parametric model. Our model learns to apply Gaussian splatting only in areas of
+the SMPL-X mesh where it is necessary, like hair and out-of-mesh clothing. This
+results in a minimal number of Gaussians being used to represent the full
+avatar, and reduced rendering artifacts. This also allows us to handle the
+animation of small body parts such as fingers that are traditionally
+disregarded. We demonstrate the effectiveness of our approach on two open
+datasets: SnapshotPeople and X-Humans. Our method achieves on-par
+reconstruction quality with the state-of-the-art on SnapshotPeople, while using
+less than a third of the Gaussians. HAHA outperforms the previous
+state-of-the-art on novel poses from X-Humans both quantitatively and
+qualitatively.
+
+
+
+
+
+
+ + ☆ Action Detection via an Image Diffusion Process CVPR 2024 + + +
+ Action detection aims to localize the starting and ending points of action +instances in untrimmed videos, and predict the classes of those instances. In +this paper, we make the observation that the outputs of the action detection +task can be formulated as images. Thus, from a novel perspective, we tackle +action detection via a three-image generation process to generate starting +point, ending point and action-class predictions as images via our proposed +Action Detection Image Diffusion (ADI-Diff) framework. Furthermore, since our +images differ from natural images and exhibit special properties, we further +explore a Discrete Action-Detection Diffusion Process and a Row-Column +Transformer design to better handle their processing. Our ADI-Diff framework +achieves state-of-the-art results on two widely-used datasets. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Drag Your Noise: Interactive Point-based Editing via Diffusion Semantic + Propagation CVPR 2024 + + +
+ Point-based interactive editing serves as an essential tool to complement the +controllability of existing generative models. A concurrent work, +DragDiffusion, updates the diffusion latent map in response to user inputs, +causing global latent map alterations. This results in imprecise preservation +of the original content and unsuccessful editing due to gradient vanishing. In +contrast, we present DragNoise, offering robust and accelerated editing without +retracing the latent map. The core rationale of DragNoise lies in utilizing the +predicted noise output of each U-Net as a semantic editor. This approach is +grounded in two critical observations: firstly, the bottleneck features of +U-Net inherently possess semantically rich features ideal for interactive +editing; secondly, high-level semantics, established early in the denoising +process, show minimal variation in subsequent stages. Leveraging these +insights, DragNoise edits diffusion semantics in a single denoising step and +efficiently propagates these changes, ensuring stability and efficiency in +diffusion editing. Comparative experiments reveal that DragNoise achieves +superior control and semantic retention, reducing the optimization time by over +50% compared to DragDiffusion. Our codes are available at +https://github.com/haofengl/DragNoise. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Higher education assessment practice in the era of generative AI tools + + +
+ The higher education (HE) sector benefits every nation's economy and society +at large. However, their contributions are challenged by advanced technologies +like generative artificial intelligence (GenAI) tools. In this paper, we +provide a comprehensive assessment of GenAI tools towards assessment and +pedagogic practice and, subsequently, discuss the potential impacts. This study +experimented using three assessment instruments from data science, data +analytics, and construction management disciplines. Our findings are two-fold: +first, the findings revealed that GenAI tools exhibit subject knowledge, +problem-solving, analytical, critical thinking, and presentation skills and +thus can limit learning when used unethically. Secondly, the design of the +assessment of certain disciplines revealed the limitations of the GenAI tools. +Based on our findings, we made recommendations on how AI tools can be utilised +for teaching and learning in HE. + +
+
+ comment: 11 pages, 7 tables. Published in the Journal of Applied Learning &
+ Teaching
+
+
+
+
+
+ + ☆ AIGCOIQA2024: Perceptual Quality Assessment of AI Generated + Omnidirectional Images + + +
+ In recent years, the rapid advancement of Artificial Intelligence Generated +Content (AIGC) has attracted widespread attention. Among the AIGC, AI generated +omnidirectional images hold significant potential for Virtual Reality (VR) and +Augmented Reality (AR) applications, hence omnidirectional AIGC techniques have +also been widely studied. AI-generated omnidirectional images exhibit unique +distortions compared to natural omnidirectional images, however, there is no +dedicated Image Quality Assessment (IQA) criteria for assessing them. This +study addresses this gap by establishing a large-scale AI generated +omnidirectional image IQA database named AIGCOIQA2024 and constructing a +comprehensive benchmark. We first generate 300 omnidirectional images based on +5 AIGC models utilizing 25 text prompts. A subjective IQA experiment is +conducted subsequently to assess human visual preferences from three +perspectives including quality, comfortability, and correspondence. Finally, we +conduct a benchmark experiment to evaluate the performance of state-of-the-art +IQA models on our database. The database will be released to facilitate future +research. + +
+
+
+
+
+ + ☆ Harnessing Large Language Models for Training-free Video Anomaly + Detection CVPR 2024 + + +
+ Video anomaly detection (VAD) aims to temporally locate abnormal events in a +video. Existing works mostly rely on training deep models to learn the +distribution of normality with either video-level supervision, one-class +supervision, or in an unsupervised setting. Training-based methods are prone to +be domain-specific, thus being costly for practical deployment as any domain +change will involve data collection and model training. In this paper, we +radically depart from previous efforts and propose LAnguage-based VAD (LAVAD), +a method tackling VAD in a novel, training-free paradigm, exploiting the +capabilities of pre-trained large language models (LLMs) and existing +vision-language models (VLMs). We leverage VLM-based captioning models to +generate textual descriptions for each frame of any test video. With the +textual scene description, we then devise a prompting mechanism to unlock the +capability of LLMs in terms of temporal aggregation and anomaly score +estimation, turning LLMs into an effective video anomaly detector. We further +leverage modality-aligned VLMs and propose effective techniques based on +cross-modal similarity for cleaning noisy captions and refining the LLM-based +anomaly scores. We evaluate LAVAD on two large datasets featuring real-world +surveillance scenarios (UCF-Crime and XD-Violence), showing that it outperforms +both unsupervised and one-class methods without requiring any training or data +collection. + +
+
+ comment: CVPR 2024. Project website at https://lucazanella.github.io/lavad/ +
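+ A minimal, training-free sketch of the pipeline described above: caption every
+ frame with a VLM, then prompt an LLM to turn a temporal window of captions
+ into an anomaly score. `caption_frame` and `query_llm` are hypothetical
+ stand-ins for whichever captioning model and LLM are actually used, and the
+ prompt wording is an assumption, not the paper's prompt.
+
+ ```python
+ def caption_frame(frame) -> str:
+     raise NotImplementedError  # hypothetical hook for a VLM captioner
+
+ def query_llm(prompt: str) -> str:
+     raise NotImplementedError  # hypothetical hook for an instruction-tuned LLM
+
+ def anomaly_scores(frames, window=8):
+     """Return one anomaly score in [0, 1] per frame, with no training at all."""
+     captions = [caption_frame(f) for f in frames]
+     scores = []
+     for i in range(len(frames)):
+         context = " ".join(captions[max(0, i - window): i + 1])
+         prompt = (
+             "The following sentences describe consecutive video frames:\n"
+             f"{context}\n"
+             "On a scale from 0 (normal) to 1 (anomalous), how anomalous is the "
+             "last frame? Answer with a single number."
+         )
+         try:
+             scores.append(float(query_llm(prompt).strip()))
+         except ValueError:
+             scores.append(0.0)  # fall back if the LLM answer is not a number
+     return scores
+ ```
+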
+
+
+
+
+ + ☆ Teeth-SEG: An Efficient Instance Segmentation Framework for Orthodontic + Treatment based on Anthropic Prior Knowledge CVPR 2024 + + +
+ Teeth localization, segmentation, and labeling in 2D images have great
+potential in modern dentistry to enhance dental diagnostics, treatment
+planning, and population-based studies on oral health. However, general
+instance segmentation frameworks are inadequate due to 1) the subtle
+differences between the shapes of some teeth (e.g., maxillary first premolar
+and second premolar), 2) the variation in tooth position and shape across
+subjects, and 3) the presence of abnormalities in the dentition (e.g., caries
+and edentulism). To address these problems, we propose a ViT-based framework
+named TeethSEG, which consists of stacked Multi-Scale Aggregation (MSA) blocks
+and an Anthropic Prior Knowledge (APK) layer. Specifically, to compose the two
+modules, we design 1) a unique permutation-based upscaler to ensure high
+efficiency while establishing clear segmentation boundaries with 2) multi-head
+self/cross-gating layers to emphasize particular semantics while maintaining
+the divergence between token embeddings. In addition, we collect 3) the first
+open-sourced intraoral image dataset IO150K, which comprises over 150k
+intraoral photos, and all photos are annotated by orthodontists using a
+human-machine hybrid algorithm. Experiments on IO150K demonstrate that our
+TeethSEG outperforms the state-of-the-art segmentation models on dental image
+segmentation.
+
+
+
+ comment: This paper has been accepted by CVPR 2024 +
+
+
+
+
+ + ☆ AMOR: Ambiguous Authorship Order + + +
+ As we all know, writing scientific papers together with our beloved
+colleagues is a truly remarkable experience (partially): endless discussions
+about the same useless paragraph over and over again, followed by long days and
+long nights -- both at the same time. What a wonderful ride it is! What a
+beautiful life we have. But wait, there's one tiny little problem that utterly
+shatters the peace, turning even renowned scientists into bloodthirsty
+monsters: author order. The reason is that, contrary to widespread opinion,
+it's not the font size that matters, but the way things are ordered. Of course,
+this is a fairly well-known fact among scientists all across the planet (and
+beyond) and explains clearly why we regularly have to read about yet another
+escalated paper submission in local police reports.
+ In this paper, we take an important step backwards to tackle this issue by
+solving the so-called author ordering problem (AOP) once and for all.
+Specifically, we propose AMOR, a system that replaces silly constructs like
+co-first or co-middle authorship with a simple yet easy probabilistic approach
+based on random shuffling of the author list at viewing time. In addition to
+AOP, we also solve the ambiguous author ordering citation problem (AAOCP) on
+the fly. Stop author violence, be human.
+
+
+
+ comment: SIGBOVIK '24 submission +
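+ In the spirit of the proposal, the random shuffling of the author list at
+ viewing time takes only a few lines; this is an illustrative sketch, not the
+ authors' reference implementation.
+
+ ```python
+ import random
+
+ def render_author_list(authors, seed=None):
+     """Return a freshly shuffled author order every time the paper is viewed."""
+     rng = random.Random(seed)   # pass a seed only if you want reproducible injustice
+     order = list(authors)
+     rng.shuffle(order)
+     return ", ".join(order)
+
+ print(render_author_list(["Alice", "Bob", "Carol"]))  # different order on (almost) every call
+ ```
+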
+
+
+
+
+ + ☆ SGCNeRF: Few-Shot Neural Rendering via Sparse Geometric Consistency + Guidance + + +
+ Neural Radiance Field (NeRF) technology has made significant strides in +creating novel viewpoints. However, its effectiveness is hampered when working +with sparsely available views, often leading to performance dips due to +overfitting. FreeNeRF attempts to overcome this limitation by integrating +implicit geometry regularization, which incrementally improves both geometry +and textures. Nonetheless, an initial low positional encoding bandwidth results +in the exclusion of high-frequency elements. The quest for a holistic approach +that simultaneously addresses overfitting and the preservation of +high-frequency details remains ongoing. This study introduces a novel feature +matching based sparse geometry regularization module. This module excels in +pinpointing high-frequency keypoints, thereby safeguarding the integrity of +fine details. Through progressive refinement of geometry and textures across +NeRF iterations, we unveil an effective few-shot neural rendering architecture, +designated as SGCNeRF, for enhanced novel view synthesis. Our experiments +demonstrate that SGCNeRF not only achieves superior geometry-consistent +outcomes but also surpasses FreeNeRF, with improvements of 0.7 dB and 0.6 dB in +PSNR on the LLFF and DTU datasets, respectively. + +
+
+
+
+
+ + ☆ 360+x: A Panoptic Multi-modal Scene Understanding Dataset + + +
+ Human perception of the world is shaped by a multitude of viewpoints and +modalities. While many existing datasets focus on scene understanding from a +certain perspective (e.g. egocentric or third-person views), our dataset offers +a panoptic perspective (i.e. multiple viewpoints with multiple data +modalities). Specifically, we encapsulate third-person panoramic and front +views, as well as egocentric monocular/binocular views with rich modalities +including video, multi-channel audio, directional binaural delay, location data +and textual scene descriptions within each scene captured, presenting +comprehensive observation of the world. Figure 1 offers a glimpse of all 28 +scene categories of our 360+x dataset. To the best of our knowledge, this is +the first database that covers multiple viewpoints with multiple data +modalities to mimic how daily information is accessed in the real world. +Through our benchmark analysis, we presented 5 different scene understanding +tasks on the proposed 360+x dataset to evaluate the impact and benefit of each +data modality and perspective in panoptic scene understanding. We hope this +unique dataset could broaden the scope of comprehensive scene understanding and +encourage the community to approach these problems from more diverse +perspectives. + +
+
+ comment: To access the public dataset, please visit + https://x360dataset.github.io +
+
+
+
+
+ + ☆ FlexiDreamer: Single Image-to-3D Generation with FlexiCubes + + +
+ 3D content generation from text prompts or single images has made remarkable +progress in quality and speed recently. One of its dominant paradigms involves +generating consistent multi-view images followed by a sparse-view +reconstruction. However, due to the challenge of directly deforming the mesh +representation to approach the target topology, most methodologies learn an +implicit representation (such as NeRF) during the sparse-view reconstruction +and acquire the target mesh by a post-processing extraction. Although the +implicit representation can effectively model rich 3D information, its training +typically entails a long convergence time. In addition, the post-extraction +operation from the implicit field also leads to undesirable visual artifacts. +In this paper, we propose FlexiDreamer, a novel single image-to-3d generation +framework that reconstructs the target mesh in an end-to-end manner. By +leveraging a flexible gradient-based extraction known as FlexiCubes, our method +circumvents the defects brought by the post-processing and facilitates a direct +acquisition of the target mesh. Furthermore, we incorporate a multi-resolution +hash grid encoding scheme that progressively activates the encoding levels into +the implicit field in FlexiCubes to help capture geometric details for per-step +optimization. Notably, FlexiDreamer recovers a dense 3D structure from a +single-view image in approximately 1 minute on a single NVIDIA A100 GPU, +outperforming previous methodologies by a large margin. + +
+
+ comment: project page:https://flexidreamer.github.io +
+
+
+
+
+ + ☆ Make Continual Learning Stronger via C-Flat + + +
+ The ability of a model to generalize while incrementally acquiring
+dynamically updated knowledge from sequentially arriving tasks is crucial for
+tackling the sensitivity-stability dilemma in Continual Learning (CL).
+Minimizing the sharpness of the weight loss landscape, i.e., seeking flat
+minima lying in neighborhoods with uniformly low loss or smooth gradients, has
+proven to be a strong training regime that improves model generalization
+compared with loss-minimization-based optimizers such as SGD. Yet only a few
+works have discussed this training regime for CL, showing that a dedicatedly
+designed zeroth-order sharpness optimizer can improve CL performance. In this
+work, we propose a Continual Flatness (C-Flat) method featuring a flatter loss
+landscape tailored for CL. C-Flat can be called with only one line of code and
+is plug-and-play with any CL method. A general framework of C-Flat applied to
+all CL categories and a thorough comparison with loss-minimum optimizers and
+flat-minima-based CL approaches are presented in this paper, showing that our
+method can boost CL performance in almost all cases. Code will be publicly
+available upon publication.
+
+
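+ The abstract does not spell out the C-Flat update itself, so the sketch below
+ shows a generic sharpness-aware (flat-minima-seeking) step of the kind such
+ methods build on: perturb the weights toward the locally worst case, then
+ descend using the gradient at the perturbed point. The SAM-style formulation
+ and the rho value are assumptions, not the C-Flat algorithm.
+
+ ```python
+ import torch
+
+ def sharpness_aware_step(model, loss_fn, batch, optimizer, rho=0.05):
+     """One SAM-style flat-minima update (a stand-in for a flatness-oriented optimizer)."""
+     inputs, targets = batch
+     optimizer.zero_grad()
+
+     # 1) gradient at the current weights
+     loss_fn(model(inputs), targets).backward()
+
+     # 2) climb to the locally worst-case weights within an L2 ball of radius rho
+     grads = [p.grad for p in model.parameters() if p.grad is not None]
+     grad_norm = torch.norm(torch.stack([g.norm() for g in grads])) + 1e-12
+     eps = []
+     with torch.no_grad():
+         for p in model.parameters():
+             if p.grad is None:
+                 eps.append(None)
+                 continue
+             e = rho * p.grad / grad_norm
+             p.add_(e)
+             eps.append(e)
+
+     # 3) gradient at the perturbed weights, then undo the perturbation and step
+     optimizer.zero_grad()
+     loss_fn(model(inputs), targets).backward()
+     with torch.no_grad():
+         for p, e in zip(model.parameters(), eps):
+             if e is not None:
+                 p.sub_(e)
+     optimizer.step()
+     optimizer.zero_grad()
+ ```
+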
+
+
+
+
+ + ☆ CAMO: Correlation-Aware Mask Optimization with Modulated Reinforcement + Learning + + +
+ Optical proximity correction (OPC) is a vital step to ensure printability in +modern VLSI manufacturing. Various OPC approaches based on machine learning +have been proposed to pursue performance and efficiency, which are typically +data-driven and hardly involve any particular considerations of the OPC +problem, leading to potential performance or efficiency bottlenecks. In this +paper, we propose CAMO, a reinforcement learning-based OPC system that +specifically integrates important principles of the OPC problem. CAMO +explicitly involves the spatial correlation among the movements of neighboring +segments and an OPC-inspired modulation for movement action selection. +Experiments are conducted on both via layer patterns and metal layer patterns. +The results demonstrate that CAMO outperforms state-of-the-art OPC engines from +both academia and industry. + +
+
+ comment: Accepted by DAC 2024 +
+
+
+
+
+ + ☆ PDF: A Probability-Driven Framework for Open World 3D Point Cloud + Semantic Segmentation + + +
+ Existing point cloud semantic segmentation networks cannot identify unknown +classes and update their knowledge, due to a closed-set and static perspective +of the real world, which would induce the intelligent agent to make bad +decisions. To address this problem, we propose a Probability-Driven Framework +(PDF) for open world semantic segmentation that includes (i) a lightweight +U-decoder branch to identify unknown classes by estimating the uncertainties, +(ii) a flexible pseudo-labeling scheme to supply geometry features along with +probability distribution features of unknown classes by generating pseudo +labels, and (iii) an incremental knowledge distillation strategy to incorporate +novel classes into the existing knowledge base gradually. Our framework enables +the model to behave like human beings, which could recognize unknown objects +and incrementally learn them with the corresponding knowledge. Experimental +results on the S3DIS and ScanNetv2 datasets demonstrate that the proposed PDF +outperforms other methods by a large margin in both important tasks of open +world semantic segmentation. + +
+
+
+
+
+ + ☆ Improving Visual Recognition with Hyperbolical Visual Hierarchy Mapping CVPR 2024 + + +
+ Visual scenes are naturally organized in a hierarchy, where a coarse semantic +is recursively comprised of several fine details. Exploring such a visual +hierarchy is crucial to recognize the complex relations of visual elements, +leading to a comprehensive scene understanding. In this paper, we propose a +Visual Hierarchy Mapper (Hi-Mapper), a novel approach for enhancing the +structured understanding of the pre-trained Deep Neural Networks (DNNs). +Hi-Mapper investigates the hierarchical organization of the visual scene by 1) +pre-defining a hierarchy tree through the encapsulation of probability +densities; and 2) learning the hierarchical relations in hyperbolic space with +a novel hierarchical contrastive loss. The pre-defined hierarchy tree +recursively interacts with the visual features of the pre-trained DNNs through +hierarchy decomposition and encoding procedures, thereby effectively +identifying the visual hierarchy and enhancing the recognition of an entire +scene. Extensive experiments demonstrate that Hi-Mapper significantly enhances +the representation capability of DNNs, leading to an improved performance on +various tasks, including image classification and dense prediction tasks. + +
+
+ comment: This paper is accepted to CVPR 2024. The supplementary material is
+ included. The code is available at
+ https://github.com/kwonjunn01/Hi-Mapper
+
+
+
+
+
+ + ☆ VideoDistill: Language-aware Vision Distillation for Video Question + Answering CVPR2024 + + +
+ Significant advancements in video question answering (VideoQA) have been made +thanks to thriving large image-language pretraining frameworks. Although these +image-language models can efficiently represent both video and language +branches, they typically employ a goal-free vision perception process and do +not interact vision with language well during the answer generation, thus +omitting crucial visual cues. In this paper, we are inspired by the human +recognition and learning pattern and propose VideoDistill, a framework with +language-aware (i.e., goal-driven) behavior in both vision perception and +answer generation process. VideoDistill generates answers only from +question-related visual embeddings and follows a thinking-observing-answering +approach that closely resembles human behavior, distinguishing it from previous +research. Specifically, we develop a language-aware gating mechanism to replace +the standard cross-attention, avoiding language's direct fusion into visual +representations. We incorporate this mechanism into two key components of the +entire framework. The first component is a differentiable sparse sampling +module, which selects frames containing the necessary dynamics and semantics +relevant to the questions. The second component is a vision refinement module +that merges existing spatial-temporal attention layers to ensure the extraction +of multi-grained visual semantics associated with the questions. We conduct +experimental evaluations on various challenging video question-answering +benchmarks, and VideoDistill achieves state-of-the-art performance in both +general and long-form VideoQA datasets. In Addition, we verify that +VideoDistill can effectively alleviate the utilization of language shortcut +solutions in the EgoTaskQA dataset. + +
+
+ comment: This paper is accepted by CVPR2024 +
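+ A language-aware gate of the kind described above, which modulates visual
+ tokens with a question embedding instead of fusing language into them via
+ cross-attention, could be sketched as follows; the sigmoid gate and layer
+ sizes are assumptions.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class LanguageAwareGate(nn.Module):
+     """Gate visual tokens by a question embedding instead of cross-attending."""
+     def __init__(self, vis_dim, txt_dim):
+         super().__init__()
+         self.to_gate = nn.Sequential(nn.Linear(vis_dim + txt_dim, vis_dim), nn.Sigmoid())
+
+     def forward(self, visual_tokens, question_emb):
+         # visual_tokens: (B, N, vis_dim); question_emb: (B, txt_dim)
+         q = question_emb.unsqueeze(1).expand(-1, visual_tokens.size(1), -1)
+         gate = self.to_gate(torch.cat([visual_tokens, q], dim=-1))  # (B, N, vis_dim) in [0, 1]
+         # language only scales visual features; it is never added into them
+         return visual_tokens * gate
+
+ gate = LanguageAwareGate(vis_dim=768, txt_dim=512)
+ v, t = torch.randn(2, 49, 768), torch.randn(2, 512)
+ print(gate(v, t).shape)  # torch.Size([2, 49, 768])
+ ```
+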
+
+
+
+
+ + ☆ S2RC-GCN: A Spatial-Spectral Reliable Contrastive Graph Convolutional + Network for Complex Land Cover Classification Using Hyperspectral Images IJCNN 2024 + + +
+ Spatial correlations between different ground objects are an important +feature of mining land cover research. Graph Convolutional Networks (GCNs) can +effectively capture such spatial feature representations and have demonstrated +promising results in performing hyperspectral imagery (HSI) classification +tasks of complex land. However, the existing GCN-based HSI classification +methods are prone to interference from redundant information when extracting +complex features. To classify complex scenes more effectively, this study +proposes a novel spatial-spectral reliable contrastive graph convolutional +classification framework named S2RC-GCN. Specifically, we fused the spectral +and spatial features extracted by the 1D- and 2D-encoder, and the 2D-encoder +includes an attention model to automatically extract important information. We +then leveraged the fused high-level features to construct graphs and fed the +resulting graphs into the GCNs to determine more effective graph +representations. Furthermore, a novel reliable contrastive graph convolution +was proposed for reliable contrastive learning to learn and fuse robust +features. Finally, to test the performance of the model on complex object +classification, we used imagery taken by Gaofen-5 in the Jiang Xia area to +construct complex land cover datasets. The test results show that compared with +other models, our model achieved the best results and effectively improved the +classification performance of complex remote sensing imagery. + +
+
+ comment: Accepted to IJCNN 2024 (International Joint Conference on Neural + Networks) +
+
+
+
+
+ + ☆ Equivariant Local Reference Frames for Unsupervised Non-rigid Point + Cloud Shape Correspondence + + +
+ Unsupervised non-rigid point cloud shape correspondence underpins a multitude +of 3D vision tasks, yet itself is non-trivial given the exponential complexity +stemming from inter-point degree-of-freedom, i.e., pose transformations. Based +on the assumption of local rigidity, one solution for reducing complexity is to +decompose the overall shape into independent local regions using Local +Reference Frames (LRFs) that are invariant to SE(3) transformations. However, +the focus solely on local structure neglects global geometric contexts, +resulting in less distinctive LRFs that lack crucial semantic information +necessary for effective matching. Furthermore, such complexity introduces +out-of-distribution geometric contexts during inference, thus complicating +generalization. To this end, we introduce 1) EquiShape, a novel structure +tailored to learn pair-wise LRFs with global structural cues for both spatial +and semantic consistency, and 2) LRF-Refine, an optimization strategy generally +applicable to LRF-based methods, aimed at addressing the generalization +challenges. Specifically, for EquiShape, we employ cross-talk within separate +equivariant graph neural networks (Cross-GVP) to build long-range dependencies +to compensate for the lack of semantic information in local structure modeling, +deducing pair-wise independent SE(3)-equivariant LRF vectors for each point. +For LRF-Refine, the optimization adjusts LRFs within specific contexts and +knowledge, enhancing the geometric and semantic generalizability of point +features. Our overall framework surpasses the state-of-the-art methods by a +large margin on three benchmarks. Code and models will be publicly available. + +
+
+
+
+
+ + ☆ Harnessing The Power of Attention For Patch-Based Biomedical Image + Classification + + +
+ Biomedical image analysis can be facilitated by an innovative architecture
+rooted in self-attention mechanisms. The traditional convolutional neural
+network (CNN), characterized by fixed-sized windows, struggles to capture
+intricate spatial and temporal relations at the pixel level. The immutability
+of CNN filter weights after training further limits its adaptability to input
+variations. Recognizing these limitations, we propose a new paradigm of
+attention-based models instead of convolutions. As an alternative to
+traditional CNNs, these models demonstrate robust modelling capabilities and
+the ability to grasp comprehensive long-range contextual information
+efficiently. Providing a solution to critical challenges faced by
+attention-based vision models, such as inductive bias, weight sharing,
+receptive field limitations, and data handling at high resolution, our work
+combines non-overlapping (vanilla patching) with novel overlapped Shifted
+Patching Techniques (S.P.T.s) to induce local context that enhances model
+generalization. Moreover, we examine the novel Lancoz5 interpolation technique,
+which adapts variable image sizes to higher resolutions. Experimental evidence
+validates our model's generalization effectiveness, comparing favourably with
+existing approaches. Attention-based methods are particularly effective with
+ample data, especially when advanced data augmentation methodologies are
+integrated to strengthen their robustness.
+
+
+
+
+
+
+ + ☆ Exploring the Efficacy of Group-Normalization in Deep Learning Models + for Alzheimer's Disease Classification + + +
+ Batch Normalization is an important approach to advancing deep learning
+since it allows multiple networks to train simultaneously. A problem arises
+when normalizing along the batch dimension: Batch Normalization's error
+increases significantly as the batch size shrinks, because the batch statistics
+estimates become inaccurate. As a result, computer vision tasks like detection,
+segmentation, and video, which require tiny batches due to memory constraints,
+are not well suited to Batch Normalization for larger model training and
+feature transfer. Here, we explore Group Normalization as a simple alternative
+to Batch Normalization. Group Normalization is a channel normalization method
+in which the channels are divided into groups, and the corresponding mean and
+variance are calculated for each group. Group Normalization computations are
+accurate across a wide range of batch sizes and are independent of batch size.
+When trained on the large ImageNet database with ResNet-50, GN achieves a very
+low error rate of 10.6% compared to Batch Normalization when a small batch size
+of only 2 is used. For usual batch sizes, the performance of GN is comparable
+to that of Batch Normalization, while it outperforms other normalization
+techniques. We implement Group Normalization as a direct alternative to Batch
+Normalization to combat the serious challenges Batch Normalization faces in
+deep learning models, with comparable or improved classification accuracy.
+Additionally, Group Normalization can be naturally transferred from the
+pre-training to the fine-tuning phase.
+
+
+
+ comment: 19 pages, 3 figures +
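+ Group Normalization as described above (split the channels into groups and
+ normalize each group with its own per-sample mean and variance, independent of
+ the batch size) is easy to write from scratch; this sketch mirrors the
+ standard formulation rather than any code from the paper.
+
+ ```python
+ import torch
+
+ def group_norm(x, num_groups, eps=1e-5):
+     """Normalize (B, C, H, W) activations per sample and per channel group."""
+     b, c, h, w = x.shape
+     assert c % num_groups == 0
+     x = x.view(b, num_groups, c // num_groups, h, w)
+     mean = x.mean(dim=(2, 3, 4), keepdim=True)       # statistics never cross the batch dim
+     var = x.var(dim=(2, 3, 4), keepdim=True, unbiased=False)
+     x = (x - mean) / torch.sqrt(var + eps)
+     return x.view(b, c, h, w)
+
+ x = torch.randn(2, 32, 8, 8)
+ out = group_norm(x, num_groups=8)
+ # matches torch's built-in layer (without affine parameters)
+ ref = torch.nn.functional.group_norm(x, num_groups=8)
+ print(torch.allclose(out, ref, atol=1e-5))  # True
+ ```
+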
+
+
+
+
+ + ☆ How Can Large Language Models Enable Better Socially Assistive + Human-Robot Interaction: A Brief Survey AAAI + + +
+ Socially assistive robots (SARs) have shown great success in providing
+personalized cognitive-affective support for user populations with special
+needs such as older adults, children with autism spectrum disorder (ASD), and
+individuals with mental health challenges. The large body of work on SAR
+demonstrates its potential to provide at-home support that complements
+clinic-based interventions delivered by mental health professionals, making
+these interventions more effective and accessible. However, there are still
+several major technical challenges that hinder SAR-mediated interactions and
+interventions from reaching human-level social intelligence and efficacy. With
+the recent advances in large language models (LLMs), there is an increased
+potential for novel applications within the field of SAR that can significantly
+expand the current capabilities of SARs. However, incorporating LLMs introduces
+new risks and ethical concerns that have not yet been encountered, and must be
+carefully addressed to safely deploy these more advanced systems. In this work,
+we aim to conduct a brief survey on the use of LLMs in SAR technologies, and
+discuss the potential and risks of applying LLMs to the following three major
+technical challenges of SAR: 1) natural language dialog; 2) multimodal
+understanding; 3) LLMs as robot policies.
+
+
+
+ comment: 2 pages, to be submitted to 2024 AAAI Spring Symposium +
+
+
+
+
+ + ☆ GOV-NeSF: Generalizable Open-Vocabulary Neural Semantic Fields + + +
+ Recent advancements in vision-language foundation models have significantly +enhanced open-vocabulary 3D scene understanding. However, the generalizability +of existing methods is constrained due to their framework designs and their +reliance on 3D data. We address this limitation by introducing Generalizable +Open-Vocabulary Neural Semantic Fields (GOV-NeSF), a novel approach offering a +generalizable implicit representation of 3D scenes with open-vocabulary +semantics. We aggregate the geometry-aware features using a cost volume, and +propose a Multi-view Joint Fusion module to aggregate multi-view features +through a cross-view attention mechanism, which effectively predicts +view-specific blending weights for both colors and open-vocabulary features. +Remarkably, our GOV-NeSF exhibits state-of-the-art performance in both 2D and +3D open-vocabulary semantic segmentation, eliminating the need for ground truth +semantic labels or depth priors, and effectively generalize across scenes and +datasets without fine-tuning. + +
+
+
+
+
+ + ☆ VortexViz: Finding Vortex Boundaries by Learning from Particle + Trajectories + + +
+ Vortices are studied in various scientific disciplines, offering insights +into fluid flow behavior. Visualizing the boundary of vortices is crucial for +understanding flow phenomena and detecting flow irregularities. This paper +addresses the challenge of accurately extracting vortex boundaries using deep +learning techniques. While existing methods primarily train on velocity +components, we propose a novel approach incorporating particle trajectories +(streamlines or pathlines) into the learning process. By leveraging the +regional/local characteristics of the flow field captured by streamlines or +pathlines, our methodology aims to enhance the accuracy of vortex boundary +extraction. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Instance-Aware Group Quantization for Vision Transformers CVPR 2024 + + +
+ Post-training quantization (PTQ) is an efficient model compression technique
+that quantizes a pretrained full-precision model using only a small calibration
+set of unlabeled samples without retraining. PTQ methods for convolutional
+neural networks (CNNs) provide quantization results comparable to their
+full-precision counterparts. Directly applying them to vision transformers
+(ViTs), however, incurs severe performance degradation, mainly due to the
+differences in architectures between CNNs and ViTs. In particular, the
+distribution of activations for each channel varies drastically according to
+input instances, making PTQ methods for CNNs inappropriate for ViTs. To address
+this, we introduce instance-aware group quantization for ViTs (IGQ-ViT). To
+this end, we propose to split the channels of activation maps into multiple
+groups dynamically for each input instance, such that activations within each
+group share similar statistical properties. We also extend our scheme to
+quantize softmax attentions across tokens. In addition, the number of groups
+for each layer is adjusted to minimize the discrepancies between predictions
+from quantized and full-precision models, under a bit-operation (BOP)
+constraint. We show extensive experimental results on image classification,
+object detection, and instance segmentation, with various transformer
+architectures, demonstrating the effectiveness of our approach.
+
+
+
+ comment: CVPR 2024 +
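+ The per-instance grouping idea, dynamically assigning the activation channels
+ of each input to groups with similar statistics and quantizing each group with
+ its own scale, might be sketched as below; the range-based grouping and
+ uniform affine quantizer are assumptions for illustration.
+
+ ```python
+ import torch
+
+ def instance_aware_group_quant(x, num_groups=8, num_bits=8):
+     """Quantize (N, C) activations of one instance with per-group scales.
+
+     Channels are grouped by their dynamic range for this particular input,
+     so each group shares a similar distribution and a better-fitting scale.
+     """
+     n, c = x.shape
+     qmax = 2 ** num_bits - 1
+
+     # group channels of *this instance* by their absolute range
+     order = x.abs().amax(dim=0).argsort()                 # (C,) channel indices
+     groups = order.chunk(num_groups)
+
+     out = torch.empty_like(x)
+     for idx in groups:
+         g = x[:, idx]
+         scale = (g.max() - g.min()).clamp(min=1e-8) / qmax
+         zero = g.min()
+         q = ((g - zero) / scale).round().clamp(0, qmax)   # uniform affine quantization
+         out[:, idx] = q * scale + zero                    # dequantize for simulation
+     return out
+
+ tokens = torch.randn(197, 384)          # one ViT instance: tokens x channels
+ print((instance_aware_group_quant(tokens) - tokens).abs().max())  # small quantization error
+ ```
+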
+
+
+
+
+ + ☆ LLMs are Good Sign Language Translators CVPR 2024 + + +
+ Sign Language Translation (SLT) is a challenging task that aims to translate +sign videos into spoken language. Inspired by the strong translation +capabilities of large language models (LLMs) that are trained on extensive +multilingual text corpora, we aim to harness off-the-shelf LLMs to handle SLT. +In this paper, we regularize the sign videos to embody linguistic +characteristics of spoken language, and propose a novel SignLLM framework to +transform sign videos into a language-like representation for improved +readability by off-the-shelf LLMs. SignLLM comprises two key modules: (1) The +Vector-Quantized Visual Sign module converts sign videos into a sequence of +discrete character-level sign tokens, and (2) the Codebook Reconstruction and +Alignment module converts these character-level tokens into word-level sign +representations using an optimal transport formulation. A sign-text alignment +loss further bridges the gap between sign and text tokens, enhancing semantic +compatibility. We achieve state-of-the-art gloss-free results on two +widely-used SLT benchmarks. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ BadPart: Unified Black-box Adversarial Patch Attacks against Pixel-wise + Regression Tasks + + +
+ Pixel-wise regression tasks (e.g., monocular depth estimation (MDE) and
+optical flow estimation (OFE)) have been widely involved in our daily life in
+applications like autonomous driving, augmented reality and video composition.
+Although certain applications are security-critical or bear societal
+significance, the adversarial robustness of such models is not sufficiently
+studied, especially in the black-box scenario. In this work, we introduce the
+first unified black-box adversarial patch attack framework against pixel-wise
+regression tasks, aiming to identify the vulnerabilities of these models under
+query-based black-box attacks. We propose a novel square-based adversarial
+patch optimization framework and employ probabilistic square sampling and
+score-based gradient estimation techniques to generate the patch effectively
+and efficiently, overcoming the scalability problem of previous black-box patch
+attacks. Our attack prototype, named BadPart, is evaluated on both MDE and OFE
+tasks, utilizing a total of 7 models. BadPart surpasses 3 baseline methods in
+terms of both attack performance and efficiency. We also apply BadPart on the
+Google online service for portrait depth estimation, causing 43.5% relative
+distance error with 50K queries. State-of-the-art (SOTA) countermeasures cannot
+defend against our attack effectively.
+
+
+
+
+
+
+ + ☆ MM3DGS SLAM: Multi-modal 3D Gaussian Splatting for SLAM Using Vision, + Depth, and Inertial Measurements + + +
+ Simultaneous localization and mapping is essential for position tracking and +scene understanding. 3D Gaussian-based map representations enable +photorealistic reconstruction and real-time rendering of scenes using multiple +posed cameras. We show for the first time that using 3D Gaussians for map +representation with unposed camera images and inertial measurements can enable +accurate SLAM. Our method, MM3DGS, addresses the limitations of prior neural +radiance field-based representations by enabling faster rendering, scale +awareness, and improved trajectory tracking. Our framework enables +keyframe-based mapping and tracking utilizing loss functions that incorporate +relative pose transformations from pre-integrated inertial measurements, depth +estimates, and measures of photometric rendering quality. We also release a +multi-modal dataset, UT-MM, collected from a mobile robot equipped with a +camera and an inertial measurement unit. Experimental evaluation on several +scenes from the dataset shows that MM3DGS achieves 3x improvement in tracking +and 5% improvement in photometric rendering quality compared to the current +3DGS SLAM state-of-the-art, while allowing real-time rendering of a +high-resolution dense 3D map. Project Webpage: +https://vita-group.github.io/MM3DGS-SLAM + +
+
+ comment: Project Webpage: https://vita-group.github.io/MM3DGS-SLAM +
+
+
+
+
+ + ☆ Towards Memorization-Free Diffusion Models CVPR2024 + + +
+ Pretrained diffusion models and their outputs are widely accessible due to
+their exceptional capacity for synthesizing high-quality images and their
+open-source nature. The users, however, may face litigation risks owing to the
+models' tendency to memorize and regurgitate training data during inference. To
+address this, we introduce Anti-Memorization Guidance (AMG), a novel framework
+employing three targeted guidance strategies for the main causes of
+memorization: image and caption duplication, and highly specific user prompts.
+Consequently, AMG ensures memorization-free outputs while maintaining high
+image quality and text alignment, leveraging the synergy of its guidance
+methods, each indispensable in its own right. AMG also features an innovative
+automatic detection system for potential memorization during each step of the
+inference process, which allows selective application of the guidance
+strategies and minimally interferes with the original sampling process to
+preserve output utility. We applied AMG to pretrained Denoising Diffusion
+Probabilistic Models (DDPM) and Stable Diffusion across various generation
+tasks. The results demonstrate that AMG is the first approach to successfully
+eradicate all instances of memorization with no or marginal impact on image
+quality and text alignment, as evidenced by FID and CLIP scores.
+
+
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ Towards Label-Efficient Human Matting: A Simple Baseline for Weakly + Semi-Supervised Trimap-Free Human Matting + + +
+ This paper presents a new practical training method for human matting, which +demands delicate pixel-level human region identification and significantly +laborious annotations. To reduce the annotation cost, most existing matting +approaches often rely on image synthesis to augment the dataset. However, the +unnaturalness of synthesized training images brings in a new domain +generalization challenge for natural images. To address this challenge, we +introduce a new learning paradigm, weakly semi-supervised human matting +(WSSHM), which leverages a small amount of expensive matte labels and a large +amount of budget-friendly segmentation labels, to save the annotation cost and +resolve the domain generalization problem. To achieve the goal of WSSHM, we +propose a simple and effective training method, named Matte Label Blending +(MLB), that selectively guides only the beneficial knowledge of the +segmentation and matte data to the matting model. Extensive experiments with +our detailed analysis demonstrate our method can substantially improve the +robustness of the matting model using a few matte data and numerous +segmentation data. Our training method is also easily applicable to real-time +models, achieving competitive accuracy with breakneck inference speed (328 FPS +on NVIDIA V100 GPU). The implementation code is available at +\url{https://github.com/clovaai/WSSHM}. + +
+
+ comment: Preprint, 15 pages, 13 figures +
+
+
+
+
+ + ☆ Gyro-based Neural Single Image Deblurring + + +
+ In this paper, we present GyroDeblurNet, a novel single image deblurring +method that utilizes a gyro sensor to effectively resolve the ill-posedness of +image deblurring. The gyro sensor provides valuable information about camera +motion during exposure time that can significantly improve deblurring quality. +However, effectively exploiting real-world gyro data is challenging due to +significant errors from various sources including sensor noise, the disparity +between the positions of a camera module and a gyro sensor, the absence of +translational motion information, and moving objects whose motions cannot be +captured by a gyro sensor. To handle gyro error, GyroDeblurNet is equipped with +two novel neural network blocks: a gyro refinement block and a gyro deblurring +block. The gyro refinement block refines the error-ridden gyro data using the +blur information from the input image. On the other hand, the gyro deblurring +block removes blur from the input image using the refined gyro data and further +compensates for gyro error by leveraging the blur information from the input +image. For training a neural network with erroneous gyro data, we propose a +training strategy based on the curriculum learning. We also introduce a novel +gyro data embedding scheme to represent real-world intricate camera shakes. +Finally, we present a synthetic dataset and a real dataset for the training and +evaluation of gyro-based single image deblurring. Our experiments demonstrate +that our approach achieves state-of-the-art deblurring quality by effectively +utilizing erroneous gyro data. + +
+
+ comment: 14 pages, 11 figures +
+
+
+
+
+ + ☆ Scalable 3D Registration via Truncated Entry-wise Absolute Residuals CVPR 2024 + + +
+ Given an input set of 3D point pairs, the goal of outlier-robust 3D
+registration is to compute some rotation and translation that align as many
+point pairs as possible. This is an important problem in computer vision, for
+which many highly accurate approaches have been recently proposed. Despite
+their impressive performance, these approaches lack scalability, often
+overflowing the 16 GB of memory of a standard laptop to handle roughly 30,000
+point pairs. In this paper, we propose a 3D registration approach that can
+process more than ten million (10^7) point pairs with over 99% random outliers.
+Moreover, our method is efficient, entails low memory costs, and maintains high
+accuracy at the same time. We call our method TEAR, as it involves minimizing
+an outlier-robust loss that computes Truncated Entry-wise Absolute Residuals.
+To minimize this loss, we decompose the original 6-dimensional problem into two
+subproblems of dimensions 3 and 2, respectively, solved in succession to global
+optimality via a customized branch-and-bound method. While branch-and-bound is
+often slow and unscalable, this does not apply to TEAR as we propose novel
+bounding functions that are tight and computationally efficient. Experiments on
+various datasets are conducted to validate the scalability and efficiency of
+our method.
+
+
+
+ comment: 24 pages, 12 figures. Accepted to CVPR 2024 +
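+ The objective named above, a sum of truncated entry-wise absolute residuals,
+ is simple to state in code. The sketch below only evaluates the loss for a
+ candidate rotation and translation; the paper's actual solver is a dedicated
+ branch-and-bound method, and the truncation threshold here is an assumption.
+
+ ```python
+ import torch
+
+ def tear_loss(R, t, src, dst, c=0.1):
+     """Truncated Entry-wise Absolute Residuals for 3D registration.
+
+     R: (3, 3) rotation, t: (3,) translation, src/dst: (N, 3) point pairs.
+     Each residual entry |(R @ src_i + t - dst_i)_k| is truncated at c, so an
+     outlier pair contributes at most c per coordinate.
+     """
+     residuals = (src @ R.T + t - dst).abs()            # (N, 3) entry-wise absolute residuals
+     return torch.minimum(residuals, torch.tensor(c)).sum()
+
+ src = torch.randn(1000, 3)
+ R = torch.eye(3)
+ t = torch.tensor([0.1, 0.0, -0.2])
+ dst = src @ R.T + t
+ print(tear_loss(R, t, src, dst))  # ~0 for a perfect alignment
+ ```
+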
+
+
+
+
+ + ☆ LLaMA-Excitor: General Instruction Tuning via Indirect Feature + Interaction CVPR 2024 + + +
+ Existing methods to fine-tune LLMs, like Adapter, Prefix-tuning, and LoRA,
+which introduce extra modules or additional input sequences to inject new
+skills or knowledge, may compromise the innate abilities of LLMs. In this
+paper, we propose LLaMA-Excitor, a lightweight method that stimulates the LLMs'
+potential to better follow instructions by gradually paying more attention to
+worthwhile information. Specifically, LLaMA-Excitor does not directly change
+the intermediate hidden state during the self-attention calculation of the
+transformer structure. We designed the Excitor block as a bypass module for the
+similarity score computation in LLMs' self-attention to reconstruct keys and
+change the importance of values by learnable prompts. LLaMA-Excitor ensures a
+self-adaptive allocation of additional attention to input instructions, thus
+effectively preserving LLMs' pre-trained knowledge when fine-tuning LLMs on
+low-quality instruction-following datasets. Furthermore, we unify the modeling
+of multi-modal tuning and language-only tuning, extending LLaMA-Excitor to a
+powerful visual instruction follower without the need for complex multi-modal
+alignment. Our proposed approach is evaluated in language-only and multi-modal
+tuning experimental scenarios. Notably, LLaMA-Excitor is the only method that
+maintains basic capabilities while achieving a significant improvement (+6%) on
+the MMLU benchmark. In visual instruction tuning, we achieve a new
+state-of-the-art image captioning performance of 157.5 CIDEr on MSCOCO, and a
+comparable performance (88.39%) on ScienceQA to cutting-edge models with more
+parameters and extensive vision-language pretraining.
+
+
+
+ comment: This paper is accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Learning by Correction: Efficient Tuning Task for Zero-Shot Generative + Vision-Language Reasoning CVPR2024 + + +
+ Generative vision-language models (VLMs) have shown impressive performance in +zero-shot vision-language tasks like image captioning and visual question +answering. However, improving their zero-shot reasoning typically requires +second-stage instruction tuning, which relies heavily on human-labeled or large +language model-generated annotation, incurring high labeling costs. To tackle +this challenge, we introduce Image-Conditioned Caption Correction (ICCC), a +novel pre-training task designed to enhance VLMs' zero-shot performance without +the need for labeled task-aware data. The ICCC task compels VLMs to rectify +mismatches between visual and language concepts, thereby enhancing instruction +following and text generation conditioned on visual inputs. Leveraging language +structure and a lightweight dependency parser, we construct data samples of +ICCC task from image-text datasets with low labeling and computation costs. +Experimental results on BLIP-2 and InstructBLIP demonstrate significant +improvements in zero-shot image-text generation-based VL tasks through ICCC +instruction tuning. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ AETTA: Label-Free Accuracy Estimation for Test-Time Adaptation CVPR 2024 + + +
+ Test-time adaptation (TTA) has emerged as a viable solution to adapt +pre-trained models to domain shifts using unlabeled test data. However, TTA +faces challenges of adaptation failures due to its reliance on blind adaptation +to unknown test samples in dynamic scenarios. Traditional methods for +out-of-distribution performance estimation are limited by unrealistic +assumptions in the TTA context, such as requiring labeled data or re-training +models. To address this issue, we propose AETTA, a label-free accuracy +estimation algorithm for TTA. We propose the prediction disagreement as the +accuracy estimate, calculated by comparing the target model prediction with +dropout inferences. We then improve the prediction disagreement to extend the +applicability of AETTA under adaptation failures. Our extensive evaluation with +four baselines and six TTA methods demonstrates that AETTA shows an average of +19.8%p more accurate estimation compared with the baselines. We further +demonstrate the effectiveness of accuracy estimation with a model recovery case +study, showcasing the practicality of our model recovery based on accuracy +estimation. The source code is available at https://github.com/taeckyung/AETTA. + +
+
+ comment: Accepted to CVPR 2024 +
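+ The accuracy estimate described above, prediction disagreement between the
+ adapted model and its dropout inferences, can be sketched as follows; the
+ number of dropout passes and the simple averaging are assumptions.
+
+ ```python
+ import torch
+
+ @torch.no_grad()
+ def prediction_disagreement(model, x, n_dropout=4):
+     """Fraction of samples where dropout predictions disagree with the
+     deterministic prediction; 1 minus this serves as a crude accuracy estimate."""
+     model.eval()
+     base = model(x).argmax(dim=1)                 # deterministic predictions
+
+     # enable only the dropout layers for the stochastic passes
+     for m in model.modules():
+         if isinstance(m, torch.nn.Dropout):
+             m.train()
+
+     disagree = torch.zeros(x.size(0), device=x.device)
+     for _ in range(n_dropout):
+         disagree += (model(x).argmax(dim=1) != base).float()
+     model.eval()
+     return (disagree / n_dropout).mean().item()   # in [0, 1]
+
+ # toy usage with a dropout MLP
+ net = torch.nn.Sequential(torch.nn.Linear(16, 64), torch.nn.ReLU(),
+                           torch.nn.Dropout(0.5), torch.nn.Linear(64, 10))
+ print(1.0 - prediction_disagreement(net, torch.randn(32, 16)))  # crude accuracy estimate
+ ```
+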
+
+
+
+
+ + ☆ From Pixels to Graphs: Open-Vocabulary Scene Graph Generation with + Vision-Language Models CVPR 2024 + + +
+ Scene graph generation (SGG) aims to parse a visual scene into an +intermediate graph representation for downstream reasoning tasks. Despite +recent advancements, existing methods struggle to generate scene graphs with +novel visual relation concepts. To address this challenge, we introduce a new +open-vocabulary SGG framework based on sequence generation. Our framework +leverages vision-language pre-trained models (VLM) by incorporating an +image-to-graph generation paradigm. Specifically, we generate scene graph +sequences via image-to-text generation with VLM and then construct scene graphs +from these sequences. By doing so, we harness the strong capabilities of VLM +for open-vocabulary SGG and seamlessly integrate explicit relational modeling +for enhancing the VL tasks. Experimental results demonstrate that our design +not only achieves superior performance with an open vocabulary but also +enhances downstream vision-language task performance through explicit relation +modeling knowledge. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Slightly Shift New Classes to Remember Old Classes for Video + Class-Incremental Learning + + +
+ Recent video class-incremental learning methods usually excessively pursue
+the accuracy of the newly seen classes and rely on memory sets to mitigate
+catastrophic forgetting of the old classes. However, limited storage only
+allows storing a few representative videos. We therefore propose SNRO, which
+slightly shifts the features of new classes to remember old classes.
+Specifically, SNRO contains Examples Sparse (ES) and Early Break (EB). ES
+decimates videos at a lower sample rate to build memory sets and later uses
+interpolation to align those sparse frames. In this way, SNRO stores more
+examples under the same memory consumption and forces the model to focus on
+low-semantic features which are harder to forget. EB terminates the training at
+a small epoch, preventing the model from overstretching into the high-semantic
+space of the current task. Experiments on the UCF101, HMDB51, and UESTC-MMEA-CL
+datasets show that SNRO performs better than other approaches under the same
+memory consumption.
+
+
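+ The Examples Sparse step, storing frame features at a lower sample rate and
+ interpolating them back to the original temporal length when replayed, might
+ be implemented along these lines; the linear interpolation and the stride
+ value are assumptions.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def decimate(frames, stride=4):
+     """Keep every `stride`-th frame feature to build a compact memory set."""
+     return frames[:, ::stride]                      # (B, T // stride, D)
+
+ def align(sparse, target_len):
+     """Linearly interpolate stored sparse frames back to the original length."""
+     x = sparse.transpose(1, 2)                      # (B, D, T_sparse) for 1D interpolation
+     x = F.interpolate(x, size=target_len, mode="linear", align_corners=True)
+     return x.transpose(1, 2)                        # (B, target_len, D)
+
+ clip = torch.randn(2, 16, 512)                      # B x T x D frame features
+ memory = decimate(clip)                             # stores 4x fewer frames
+ print(align(memory, target_len=16).shape)           # torch.Size([2, 16, 512])
+ ```
+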
+
+
+
+
+ + ☆ Marrying NeRF with Feature Matching for One-step Pose Estimation ICRA + + +
+ Given the image collection of an object, we aim at building a real-time
+image-based pose estimation method, which requires neither its CAD model nor
+hours of object-specific training. Recent NeRF-based methods provide a
+promising solution by directly optimizing the pose from pixel loss between
+rendered and target images. However, during inference, they require a long
+convergence time, and suffer from local minima, making them impractical for
+real-time robot applications. We aim at solving this problem by marrying image
+matching with NeRF. With 2D matches and depth rendered by NeRF, we directly
+solve the pose in one step by building 2D-3D correspondences between the target
+and initial views, thus allowing for real-time prediction. Moreover, to improve
+the accuracy of 2D-3D correspondences, we propose a 3D consistent point mining
+strategy, which effectively discards unfaithful points reconstructed by NeRF.
+In addition, current NeRF-based methods that naively optimize pixel loss fail
+on occluded images. Thus, we further propose a 2D-match-based sampling strategy
+to exclude the occluded area. Experimental results on representative datasets
+prove that our method outperforms state-of-the-art methods, and improves
+inference efficiency by 90x, achieving real-time prediction at 6 FPS.
+
+
+
+ comment: ICRA, 2024. Video https://www.youtube.com/watch?v=70fgUobOFWo +
+
+
+
+
+ + ☆ Model-Agnostic Human Preference Inversion in Diffusion Models + + +
+ Efficient text-to-image generation remains a challenging task due to the high +computational costs associated with the multi-step sampling in diffusion +models. Although distillation of pre-trained diffusion models has been +successful in reducing sampling steps, low-step image generation often falls +short in terms of quality. In this study, we propose a novel sampling design to +achieve high-quality one-step image generation aligning with human preferences, +particularly focusing on exploring the impact of the prior noise distribution. +Our approach, Prompt Adaptive Human Preference Inversion (PAHI), optimizes the +noise distributions for each prompt based on human preferences without the need +for fine-tuning diffusion models. Our experiments showcase that the tailored +noise distributions significantly improve image quality with only a marginal +increase in computational cost. Our findings underscore the importance of noise +optimization and pave the way for efficient and high-quality text-to-image +synthesis. + +
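The core of PAHI is that only the starting noise is adapted per prompt, while the distilled diffusion model stays frozen. A generic sketch of optimizing an initial latent against a differentiable preference scorer; `one_step_generator` and `preference_score` are placeholders, not the paper's actual components, and the toy lambdas exist only so the sketch runs end-to-end.

```python
import torch

def tune_initial_noise(one_step_generator, preference_score, prompt_emb,
                       shape=(1, 4, 64, 64), steps=50, lr=0.05):
    """Gradient ascent on the prior noise so the one-step sample scores higher
    under a frozen, differentiable human-preference model."""
    noise = torch.randn(shape, requires_grad=True)
    opt = torch.optim.Adam([noise], lr=lr)
    for _ in range(steps):
        image = one_step_generator(noise, prompt_emb)   # frozen distilled diffusion model
        loss = -preference_score(image, prompt_emb)     # maximize preference
        opt.zero_grad()
        loss.backward()
        opt.step()
    return noise.detach()

# Toy stand-ins; replace with a real one-step generator and preference model.
gen = lambda z, p: torch.tanh(z)
score = lambda img, p: img.mean()
tuned = tune_initial_noise(gen, score, prompt_emb=None, steps=10)
print(tuned.shape)
```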
+
+
+
+
+ + ☆ TryOn-Adapter: Efficient Fine-Grained Clothing Identity Adaptation for + High-Fidelity Virtual Try-On + + +
+ Virtual try-on focuses on adjusting the given clothes to fit a specific +person seamlessly while avoiding any distortion of the patterns and textures of +the garment. However, the clothing identity uncontrollability and training +inefficiency of existing diffusion-based methods, which struggle to maintain +the identity even with full parameter training, are significant limitations +that hinder the widespread applications. In this work, we propose an effective +and efficient framework, termed TryOn-Adapter. Specifically, we first decouple +clothing identity into fine-grained factors: style for color and category +information, texture for high-frequency details, and structure for smooth +spatial adaptive transformation. Our approach utilizes a pre-trained +exemplar-based diffusion model as the fundamental network, whose parameters are +frozen except for the attention layers. We then customize three lightweight +modules (Style Preserving, Texture Highlighting, and Structure Adapting) +incorporated with fine-tuning techniques to enable precise and efficient +identity control. Meanwhile, we introduce the training-free T-RePaint strategy +to further enhance clothing identity preservation while maintaining the +realistic try-on effect during the inference. Our experiments demonstrate that +our approach achieves state-of-the-art performance on two widely-used +benchmarks. Additionally, compared with recent full-tuning diffusion-based +methods, we only use about half of their tunable parameters during training. +The code will be made publicly available at +https://github.com/jiazheng-xing/TryOn-Adapter. + +
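The training recipe keeps the exemplar-based diffusion backbone frozen except for its attention layers, with lightweight modules trained on top. A generic sketch of that parameter-freezing pattern in PyTorch; the name filter ("attn") and the toy model are assumptions about how attention modules are typically named, not the paper's code.

```python
import torch.nn as nn

def freeze_except_attention(model: nn.Module, attn_keyword: str = "attn"):
    """Freeze every parameter except those belonging to attention layers,
    mimicking a 'frozen backbone + tunable attention' recipe."""
    trainable, frozen = 0, 0
    for name, param in model.named_parameters():
        if attn_keyword in name:
            param.requires_grad = True
            trainable += param.numel()
        else:
            param.requires_grad = False
            frozen += param.numel()
    print(f"trainable params: {trainable}, frozen params: {frozen}")

# Toy model with an attention block to show the filter in action.
toy = nn.ModuleDict({
    "backbone": nn.Sequential(nn.Conv2d(3, 16, 3), nn.Conv2d(16, 16, 3)),
    "attn": nn.MultiheadAttention(embed_dim=16, num_heads=4),
})
freeze_except_attention(toy)
```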
+
+
+
+
+ + ☆ MGMap: Mask-Guided Learning for Online Vectorized HD Map Construction CVPR 2024 + + +
+ Currently, high-definition (HD) map construction leans towards a lightweight +online generation tendency, which aims to preserve timely and reliable road +scene information. However, map elements contain strong shape priors. Subtle +and sparse annotations make current detection-based frameworks ambiguous in +locating relevant feature scopes and cause the loss of detailed structures in +prediction. To alleviate these problems, we propose MGMap, a mask-guided +approach that effectively highlights the informative regions and achieves +precise map element localization by introducing the learned masks. +Specifically, MGMap employs learned masks based on the enhanced multi-scale BEV +features from two perspectives. At the instance level, we propose the +Mask-activated instance (MAI) decoder, which incorporates global instance and +structural information into instance queries by the activation of instance +masks. At the point level, a novel position-guided mask patch refinement +(PG-MPR) module is designed to refine point locations from a finer-grained +perspective, enabling the extraction of point-specific patch information. +Compared to the baselines, our proposed MGMap achieves a notable improvement of +around 10 mAP for different input modalities. Extensive experiments also +demonstrate that our approach showcases strong robustness and generalization +capabilities. Our code can be found at https://github.com/xiaolul2/MGMap. + +
+
+ comment: 18 pages, 11 figures, accepted by CVPR 2024 +
+
+
+
+
+ + ☆ DiSR-NeRF: Diffusion-Guided View-Consistent Super-Resolution NeRF + + +
+ We present DiSR-NeRF, a diffusion-guided framework for view-consistent +super-resolution (SR) NeRF. Unlike prior works, we circumvent the requirement +for high-resolution (HR) reference images by leveraging existing powerful 2D +super-resolution models. Nonetheless, independent SR 2D images are often +inconsistent across different views. We thus propose Iterative 3D +Synchronization (I3DS) to mitigate the inconsistency problem via the inherent +multi-view consistency property of NeRF. Specifically, our I3DS alternates +between upscaling low-resolution (LR) rendered images with diffusion models, +and updating the underlying 3D representation with standard NeRF training. We +further introduce Renoised Score Distillation (RSD), a novel score-distillation +objective for 2D image resolution. Our RSD combines features from ancestral +sampling and Score Distillation Sampling (SDS) to generate sharp images that +are also LR-consistent. Qualitative and quantitative results on both synthetic +and real-world datasets demonstrate that our DiSR-NeRF can achieve better +results on NeRF super-resolution compared with existing works. Code and video +results available at the project website. + +
+
+
+
+
+ + ☆ Lipsum-FT: Robust Fine-Tuning of Zero-Shot Models Using Random Text + Guidance ICLR 2024 + + +
+ Large-scale contrastive vision-language pre-trained models provide the +zero-shot model achieving competitive performance across a range of image +classification tasks without requiring training on downstream data. Recent +works have confirmed that while additional fine-tuning of the zero-shot model +on the reference data results in enhanced downstream performance, it +compromises the model's robustness against distribution shifts. Our +investigation begins by examining the conditions required to achieve the goals +of robust fine-tuning, employing descriptions based on feature distortion +theory and joint energy-based models. Subsequently, we propose a novel robust +fine-tuning algorithm, Lipsum-FT, that effectively utilizes the language +modeling aspect of the vision-language pre-trained models. Extensive +experiments conducted on distribution shift scenarios in DomainNet and ImageNet +confirm the superiority of our proposed Lipsum-FT approach over existing robust +fine-tuning methods. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ Meta Episodic learning with Dynamic Task Sampling for CLIP-based Point + Cloud Classification + + +
+ Point cloud classification refers to the process of assigning semantic labels +or categories to individual points within a point cloud data structure. Recent +works have explored the extension of pre-trained CLIP to 3D recognition. In +this direction, CLIP-based point cloud models like PointCLIP and CLIP2Point +have become state-of-the-art methods in the few-shot setup. Although these +methods show promising performance for classes such as airplanes, desks, and +guitars, the performance for classes such as cup, flower pot, sink, and +nightstand is still far from satisfactory. This is due to the fact that the +adapter of CLIP-based models is trained using randomly sampled N-way K-shot +data in the standard supervised learning setup. In this paper, we propose a +novel meta-episodic learning framework for CLIP-based point cloud +classification, addressing the challenges of limited training examples and +sampling unknown classes. Additionally, we introduce dynamic task sampling +within the episode based on performance memory. This sampling strategy +effectively addresses the challenge of sampling unknown classes, ensuring that +the model learns from a diverse range of classes and promotes the exploration +of underrepresented categories. By dynamically updating the performance memory, +we adaptively prioritize the sampling of classes based on their performance, +enhancing the model's ability to handle challenging and real-world scenarios. +Experiments show an average performance gain of 3-6\% on the ModelNet40 and +ScanObjectNN datasets in the few-shot setup. + +
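Dynamic task sampling keeps a per-class performance memory and draws the N-way classes for the next episode with probability related to how poorly each class is currently handled, so weak classes get revisited more often. A minimal sketch; the inverse-accuracy weighting and EMA update rule below are illustrative assumptions, not the paper's exact formulas.

```python
import numpy as np

rng = np.random.default_rng(0)

def sample_episode_classes(perf_memory: dict, n_way: int) -> list:
    """Pick N classes for the next episode, favoring low-accuracy classes."""
    classes = list(perf_memory.keys())
    acc = np.array([perf_memory[c] for c in classes])
    weights = 1.0 - acc + 1e-3          # lower accuracy -> higher sampling weight
    probs = weights / weights.sum()
    return list(rng.choice(classes, size=n_way, replace=False, p=probs))

def update_memory(perf_memory: dict, cls: str, episode_acc: float, momentum=0.9):
    """Exponential moving average keeps the memory smooth across episodes."""
    perf_memory[cls] = momentum * perf_memory[cls] + (1 - momentum) * episode_acc

memory = {"airplane": 0.95, "desk": 0.90, "cup": 0.40, "sink": 0.35, "nightstand": 0.50}
print(sample_episode_classes(memory, n_way=3))   # likely includes cup/sink/nightstand
update_memory(memory, "cup", episode_acc=0.55)
```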
+
+
+
+
+ + ☆ TSOM: Small Object Motion Detection Neural Network Inspired by Avian + Visual Circuit + + +
+ Detecting small moving objects in complex backgrounds from an overhead +perspective is a highly challenging task for machine vision systems. As an +inspiration from nature, the avian visual system is capable of processing +motion information in various complex aerial scenes, and its Retina-OT-Rt +visual circuit is highly sensitive to capturing the motion information of small +objects from high altitudes. However, research on small object motion detection +algorithms based on the avian visual system remains limited. In this paper, we +conduct mathematical modeling based on extensive studies of the biological +mechanisms of the Retina-OT-Rt visual circuit. Based on this, we propose a +novel tectum small object motion detection neural network (TSOM). The neural +network includes the retina, SGC dendritic, SGC Soma, and Rt layers, each layer +corresponding to neurons in the visual pathway. The Retina layer is responsible +for accurately projecting input content, the SGC dendritic layer perceives and +encodes spatial-temporal information, the SGC Soma layer computes complex +motion information and extracts small objects, and the Rt layer integrates and +decodes motion information from multiple directions to determine the position +of small objects. Extensive evaluations on pigeon neurophysiological data and +image sequences show that the TSOM is biologically interpretable and effective +in extracting reliable small object motion features from complex high-altitude +backgrounds. + +
+
+
+
+
+ + ☆ Ensemble Learning for Vietnamese Scene Text Spotting in Urban + Environments + + +
+ This paper presents a simple yet efficient ensemble learning framework for +Vietnamese scene text spotting. Leveraging the power of ensemble learning, +which combines multiple models to yield more accurate predictions, our approach +aims to significantly enhance the performance of scene text spotting in +challenging urban settings. Through experimental evaluations on the VinText +dataset, our proposed method achieves a significant improvement over existing +methods, with an accuracy gain of 5%. These results unequivocally demonstrate +the efficacy of ensemble learning in the context of Vietnamese scene text +spotting in urban environments, highlighting its potential for real-world +applications, such as text detection and recognition in urban signage, +advertisements, and various text-rich urban scenes. + +
+
+ comment: RIVF 2023 +
+
+
+
+
+ + ☆ Prompt Learning via Meta-Regularization CVPR 2024 + + +
+ Pre-trained vision-language models have shown impressive success on various +computer vision tasks with their zero-shot generalizability. Recently, prompt +learning approaches have been explored to efficiently and effectively adapt the +vision-language models to a variety of downstream tasks. However, most existing +prompt learning methods suffer from task overfitting since the general +knowledge of the pre-trained vision language models is forgotten while the +prompts are finetuned on a small data set from a specific target task. To +address this issue, we propose a Prompt Meta-Regularization (ProMetaR) to +improve the generalizability of prompt learning for vision-language models. +Specifically, ProMetaR meta-learns both the regularizer and the soft prompts to +harness the task-specific knowledge from the downstream tasks and task-agnostic +general knowledge from the vision-language models. Further, ProMetaR augments +the task to generate multiple virtual tasks to alleviate the meta-overfitting. +In addition, we provide the analysis to comprehend how ProMetaR improves the +generalizability of prompt tuning in the perspective of the gradient alignment. +Our extensive experiments demonstrate that our ProMetaR improves the +generalizability of conventional prompt learning methods under +base-to-base/base-to-new and domain generalization settings. The code of +ProMetaR is available at https://github.com/mlvlab/ProMetaR. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Generating Content for HDR Deghosting from Frequency View CVPR2024 + + +
+ Recovering ghost-free High Dynamic Range (HDR) images from multiple Low +Dynamic Range (LDR) images becomes challenging when the LDR images exhibit +saturation and significant motion. Recent Diffusion Models (DMs) have been +introduced in HDR imaging field, demonstrating promising performance, +particularly in achieving visually perceptible results compared to previous +DNN-based methods. However, DMs require extensive iterations with large models +to estimate entire images, resulting in inefficiency that hinders their +practical application. To address this challenge, we propose the Low-Frequency +aware Diffusion (LF-Diff) model for ghost-free HDR imaging. The key idea of +LF-Diff is implementing the DMs in a highly compacted latent space and +integrating it into a regression-based model to enhance the details of +reconstructed images. Specifically, as low-frequency information is closely +related to human visual perception we propose to utilize DMs to create compact +low-frequency priors for the reconstruction process. In addition, to take full +advantage of the above low-frequency priors, the Dynamic HDR Reconstruction +Network (DHRNet) is carried out in a regression-based manner to obtain final +HDR images. Extensive experiments conducted on synthetic and real-world +benchmark datasets demonstrate that our LF-Diff performs favorably against +several state-of-the-art methods and is 10$\times$ faster than previous +DM-based methods. + +
+
+ comment: This paper is accepted by CVPR2024 +
+
+
+
+
+ + ☆ Collaborative Learning of Anomalies with Privacy (CLAP) for Unsupervised + Video Anomaly Detection: A New Baseline CVPR + + +
+ Unsupervised (US) video anomaly detection (VAD) in surveillance applications +is gaining more popularity recently due to its practical real-world +applications. As surveillance videos are privacy sensitive and the availability +of large-scale video data may enable better US-VAD systems, collaborative +learning can be highly rewarding in this setting. However, due to the extremely +challenging nature of the US-VAD task, where learning is carried out without +any annotations, privacy-preserving collaborative learning of US-VAD systems +has not been studied yet. In this paper, we propose a new baseline for anomaly +detection capable of localizing anomalous events in complex surveillance videos +in a fully unsupervised fashion without any labels on a privacy-preserving +participant-based distributed training configuration. Additionally, we propose +three new evaluation protocols to benchmark anomaly detection approaches on +various scenarios of collaborations and data availability. Based on these +protocols, we modify existing VAD datasets to extensively evaluate our approach +as well as existing US SOTA methods on two large-scale datasets including +UCF-Crime and XD-Violence. All proposed evaluation protocols, dataset splits, +and codes are available here: https://github.com/AnasEmad11/CLAP + +
+
+ comment: Accepted in IEEE/CVF Computer Vision and Pattern Recognition + Conference (CVPR), 2024 +
+
+
+
+
+ + ☆ Transfer Learning with Point Transformers + + +
+ Point Transformers are near state-of-the-art models for classification, +segmentation, and detection tasks on Point Cloud data. They utilize a self +attention based mechanism to model large range spatial dependencies between +multiple point sets. In this project we explore two things: the classification +performance of these attention-based networks on the ModelNet10 dataset, and +the use of the trained model to classify the 3D MNIST dataset after finetuning. +We also train the model from scratch on the 3D MNIST dataset to compare the +performance of the finetuned and from-scratch models. We observe that, since +the two datasets differ substantially in their distributions, transfer-learned +models do not outperform the from-scratch models in this case, although we do +expect transfer-learned models to converge faster since they already capture +low-level features such as edges and corners from the ModelNet10 dataset. + +
+
+
+
+
+ + ☆ An N-Point Linear Solver for Line and Motion Estimation with Event + Cameras + + +
+ Event cameras respond primarily to edges--formed by strong gradients--and are +thus particularly well-suited for line-based motion estimation. Recent work has +shown that events generated by a single line each satisfy a polynomial +constraint which describes a manifold in the space-time volume. Multiple such +constraints can be solved simultaneously to recover the partial linear velocity +and line parameters. In this work, we show that, with a suitable line +parametrization, this system of constraints is actually linear in the unknowns, +which allows us to design a novel linear solver. Unlike existing solvers, our +linear solver (i) is fast and numerically stable since it does not rely on +expensive root finding, (ii) can solve both minimal and overdetermined systems +with more than 5 events, and (iii) admits the characterization of all +degenerate cases and multiple solutions. The found line parameters are +singularity-free and have a fixed scale, which eliminates the need for +auxiliary constraints typically encountered in previous work. To recover the +full linear camera velocity we fuse observations from multiple lines with a +novel velocity averaging scheme that relies on a geometrically-motivated +residual, and thus solves the problem more efficiently than previous schemes +which minimize an algebraic residual. Extensive experiments in synthetic and +real-world settings demonstrate that our method surpasses the previous work in +numerical stability, and operates over 600 times faster. + +
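The key observation is that, with a suitable parametrization, each event contributes a linear constraint, so both minimal and overdetermined cases reduce to one standard linear solve. A generic sketch of stacking homogeneous constraint rows from N events and solving them via SVD; `event_constraint_row` is a placeholder, since the paper's actual rows are derived from its specific line/velocity parametrization.

```python
import numpy as np

def solve_homogeneous(A: np.ndarray) -> np.ndarray:
    """Least-squares solution of A x = 0 (up to scale): the right singular
    vector of A associated with its smallest singular value."""
    _, _, vt = np.linalg.svd(A)
    return vt[-1]

def event_constraint_row(event: np.ndarray) -> np.ndarray:
    """Placeholder: maps one event (x, y, t) to one linear constraint row.
    The paper derives the real rows from its line parametrization."""
    x, y, t = event
    return np.array([x, y, t, x * t, y * t, 1.0])

events = np.random.rand(20, 3)                       # N >= 5 events
A = np.stack([event_constraint_row(e) for e in events])
params = solve_homogeneous(A)                        # minimal or overdetermined, same code path
print(params / np.linalg.norm(params))
```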
+
+
+
+
+ + ☆ 3MOS: Multi-sources, Multi-resolutions, and Multi-scenes dataset for + Optical-SAR image matching + + +
+ Optical-SAR image matching is a fundamental task for image fusion and visual +navigation. However, existing large-scale open SAR datasets for method +development are collected from a single platform, resulting in limited +satellite types and spatial resolutions. Since images captured by different +sensors vary significantly in both geometric and radiometric appearance, +existing methods may fail to match corresponding regions containing the same +content. Besides, most existing datasets have not been categorized based on the +characteristics of different scenes. To encourage the design of more general +multi-modal image matching methods, we introduce a large-scale Multi-sources, +Multi-resolutions, and Multi-scenes dataset for Optical-SAR image matching +(3MOS). It consists of 155K optical-SAR image pairs, including SAR data from +six commercial satellites, with resolutions ranging from 1.25m to 12.5m. The +data has been classified into eight scenes: urban, rural, plains, hills, +mountains, water, desert, and frozen earth. Extensive experiments show that +none of the state-of-the-art methods achieves consistently superior performance +across different sources, resolutions, and scenes. In addition, the +distribution of the data has a substantial impact on the matching capability of +deep learning models, which poses a domain adaptation challenge in optical-SAR +image matching. Our data and code will be available at: +https://github.com/3M-OS/3MOS. + +
+
+ comment: 20 pages, 17 figures +
+
+
+
+
+ + ☆ Automated HER2 Scoring in Breast Cancer Images Using Deep Learning and + Pyramid Sampling + + +
+ Human epidermal growth factor receptor 2 (HER2) is a critical protein in +cancer cell growth that signifies the aggressiveness of breast cancer (BC) and +helps predict its prognosis. Accurate assessment of immunohistochemically (IHC) +stained tissue slides for HER2 expression levels is essential for both +treatment guidance and understanding of cancer mechanisms. Nevertheless, the +traditional workflow of manual examination by board-certified pathologists +encounters challenges, including inter- and intra-observer inconsistency and +extended turnaround times. Here, we introduce a deep learning-based approach +utilizing pyramid sampling for the automated classification of HER2 status in +IHC-stained BC tissue images. Our approach analyzes morphological features at +various spatial scales, efficiently managing the computational load and +facilitating a detailed examination of cellular and larger-scale tissue-level +details. This method addresses the tissue heterogeneity of HER2 expression by +providing a comprehensive view, leading to a blind testing classification +accuracy of 84.70%, on a dataset of 523 core images from tissue microarrays. +Our automated system, proving reliable as an adjunct pathology tool, has the +potential to enhance diagnostic precision and evaluation speed, and might +significantly impact cancer treatment planning. + +
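Pyramid sampling examines morphology at several spatial scales by cropping nested fields of view around the same location and resizing them to a common network input size. A sketch of that tiling step with the per-scale logits fused by averaging; the patch sizes, input resolution, and averaging rule are illustrative choices rather than the paper's exact configuration.

```python
import torch
import torch.nn.functional as F

def pyramid_crops(image: torch.Tensor, sizes=(256, 512, 1024), out_size=224):
    """Extract center crops at multiple scales from (C, H, W) and resize them
    to a common input size, spanning cellular to tissue-level context."""
    c, h, w = image.shape
    crops = []
    for s in sizes:
        s = min(s, h, w)
        top, left = (h - s) // 2, (w - s) // 2
        crop = image[:, top:top + s, left:left + s].unsqueeze(0)
        crops.append(F.interpolate(crop, size=(out_size, out_size),
                                   mode="bilinear", align_corners=False))
    return torch.cat(crops)            # (num_scales, C, out_size, out_size)

def classify_with_pyramid(model, image):
    """Average the per-scale logits for a single HER2 score prediction."""
    with torch.no_grad():
        logits = model(pyramid_crops(image))
    return logits.mean(dim=0)

toy_model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 224 * 224, 4))
print(classify_with_pyramid(toy_model, torch.randn(3, 1024, 1024)))
```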
+
+ comment: 21 Pages, 7 Figures +
+
+
+
+
+ + ☆ Towards Robust Event-guided Low-Light Image Enhancement: A Large-Scale + Real-World Event-Image Dataset and Novel Approach CVPR 2024 + + +
+ Event cameras have recently received much attention for low-light image +enhancement (LIE) thanks to their distinct advantages, such as high dynamic +range. However, current research is prohibitively restricted by the lack of +large-scale, real-world, and spatial-temporally aligned event-image datasets. +To this end, we propose a real-world (indoor and outdoor) dataset comprising +over 30K pairs of images and events under both low and normal illumination +conditions. To achieve this, we utilize a robotic arm that traces a consistent +non-linear trajectory to curate the dataset with spatial alignment precision +under 0.03mm. We then introduce a matching alignment strategy, rendering 90% of +our dataset with errors less than 0.01s. Based on the dataset, we propose a +novel event-guided LIE approach, called EvLight, towards robust performance in +real-world low-light scenes. Specifically, we first design the multi-scale +holistic fusion branch to extract holistic structural and textural information +from both events and images. To ensure robustness against variations in the +regional illumination and noise, we then introduce a Signal-to-Noise-Ratio +(SNR)-guided regional feature selection to selectively fuse features of images +from regions with high SNR and enhance those with low SNR by extracting +regional structure information from events. Extensive experiments on our +dataset and the synthetic SDSD dataset demonstrate that our EvLight +significantly surpasses frame-based methods. Code and datasets are available at +https://vlislab22.github.io/eg-lowlight/. + +
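The SNR-guided selection leans on image features where the signal-to-noise ratio is high and on event features where it is low. A simplified sketch in which an SNR proxy map gates a per-pixel blend of the two feature maps; the local mean/std SNR estimate and the hard threshold are illustrative simplifications of the paper's module.

```python
import torch
import torch.nn.functional as F

def snr_map(image: torch.Tensor, ksize: int = 5) -> torch.Tensor:
    """Rough per-pixel SNR proxy: local mean / local std of image intensity."""
    gray = image.mean(dim=1, keepdim=True)                    # (B, 1, H, W)
    mean = F.avg_pool2d(gray, ksize, stride=1, padding=ksize // 2)
    var = F.avg_pool2d(gray ** 2, ksize, stride=1, padding=ksize // 2) - mean ** 2
    return mean / (var.clamp_min(1e-6).sqrt() + 1e-6)

def snr_guided_fusion(img_feat, evt_feat, image, threshold=2.0):
    """Blend image and event features per pixel according to the SNR proxy."""
    snr = snr_map(image)
    snr = F.interpolate(snr, size=img_feat.shape[-2:], mode="bilinear", align_corners=False)
    gate = (snr > threshold).float()                          # 1 where the image is trustworthy
    return gate * img_feat + (1.0 - gate) * evt_feat

img = torch.rand(1, 3, 128, 128)
img_feat, evt_feat = torch.randn(1, 32, 32, 32), torch.randn(1, 32, 32, 32)
print(snr_guided_fusion(img_feat, evt_feat, img).shape)       # torch.Size([1, 32, 32, 32])
```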
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Generation and Detection of Sign Language Deepfakes -- A Linguistic and + Visual Analysis + + +
+ A question in the realm of deepfakes is slowly emerging pertaining to whether +we can go beyond facial deepfakes and whether it would be beneficial to +society. Therefore, this research presents a positive application of deepfake +technology in upper body generation, while performing sign-language for the +Deaf and Hard of Hearing (DHoH) community. The resulting videos are later +vetted with a sign language expert. This is particularly helpful, given the +intricate nature of sign language, a scarcity of sign language experts, and +potential benefits for health and education. The objectives of this work +encompass constructing a reliable deepfake dataset, evaluating its technical +and visual credibility through computer vision and natural language processing +models, and assessing the plausibility of the generated content. With over 1200 +videos, featuring both previously seen and unseen individuals for the +generation model, using the help of a sign language expert, we establish a +deepfake dataset in sign language that can further be utilized to detect fake +videos that may target certain people of determination. + +
+
+ comment: 13 pages, 13 figures, Computer Vision and Image Understanding Journal +
+
+
+
+
+ + ☆ DRIVE: Dual Gradient-Based Rapid Iterative Pruning + + +
+ Modern deep neural networks (DNNs) consist of millions of parameters, +necessitating high-performance computing during training and inference. Pruning +is one solution that significantly reduces the space and time complexities of +DNNs. Traditional pruning methods that are applied post-training focus on +streamlining inference, but there are recent efforts to leverage sparsity early +on by pruning before training. Pruning methods, such as iterative +magnitude-based pruning (IMP) achieve up to a 90% parameter reduction while +retaining accuracy comparable to the original model. However, this leads to +impractical runtime as it relies on multiple train-prune-reset cycles to +identify and eliminate redundant parameters. In contrast, training agnostic +early pruning methods, such as SNIP and SynFlow offer fast pruning but fall +short of the accuracy achieved by IMP at high sparsities. To bridge this gap, +we present Dual Gradient-Based Rapid Iterative Pruning (DRIVE), which leverages +dense training for initial epochs to counteract the randomness inherent at the +initialization. Subsequently, it employs a unique dual gradient-based metric +for parameter ranking. It has been experimentally demonstrated for VGG and +ResNet architectures on CIFAR-10/100 and Tiny ImageNet, and ResNet on ImageNet +that DRIVE consistently has superior performance over other training-agnostic +early pruning methods in accuracy. Notably, DRIVE is 43$\times$ to 869$\times$ +faster than IMP for pruning. + +
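After the dense warm-up epochs, parameters are ranked by a gradient-based metric and the lowest-ranked fraction is pruned in one shot. The sketch below scores each weight by |w x dL/dw| over one batch and masks the bottom fraction; DRIVE's actual dual-gradient metric is more elaborate, so this is only structurally illustrative.

```python
import torch
import torch.nn as nn

def gradient_saliency_prune(model: nn.Module, loss_fn, data, target, sparsity=0.9):
    """One-shot pruning after warm-up: rank weights by |w * dL/dw| and zero the
    lowest-scoring fraction via per-tensor masks."""
    model.zero_grad()
    loss_fn(model(data), target).backward()
    scores = torch.cat([(p * p.grad).abs().flatten()
                        for p in model.parameters() if p.grad is not None])
    threshold = torch.quantile(scores, sparsity)
    masks = {}
    for name, p in model.named_parameters():
        if p.grad is None:
            continue
        masks[name] = ((p * p.grad).abs() > threshold).float()
        p.data.mul_(masks[name])                     # apply the mask
    return masks

net = nn.Sequential(nn.Linear(20, 64), nn.ReLU(), nn.Linear(64, 2))
masks = gradient_saliency_prune(net, nn.CrossEntropyLoss(),
                                torch.randn(32, 20), torch.randint(0, 2, (32,)))
kept = sum(m.sum().item() for m in masks.values())
total = sum(m.numel() for m in masks.values())
print(f"kept {kept / total:.1%} of weights")
```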
+
+
+
+
+ + ♻ ☆ Rotate to Scan: UNet-like Mamba with Triplet SSM Module for Medical + Image Segmentation + + +
+ Image segmentation holds a vital position in the realms of diagnosis and +treatment within the medical domain. Traditional convolutional neural networks +(CNNs) and Transformer models have made significant advancements in this realm, +but they still encounter challenges because of limited receptive field or high +computing complexity. Recently, State Space Models (SSMs), particularly Mamba +and its variants, have demonstrated notable performance in the field of vision. +However, their feature extraction methods may not be sufficiently effective and +retain some redundant structures, leaving room for parameter reduction. +Motivated by previous spatial and channel attention methods, we propose Triplet +Mamba-UNet. The method leverages residual VSS Blocks to extract intensive +contextual features, while Triplet SSM is employed to fuse features across +spatial and channel dimensions. We conducted experiments on ISIC17, ISIC18, +CVC-300, CVC-ClinicDB, Kvasir-SEG, CVC-ColonDB, and Kvasir-Instrument datasets, +demonstrating the superior segmentation performance of our proposed TM-UNet. +Additionally, compared to the previous VM-UNet, our model achieves a one-third +reduction in parameters. + +
+
+
+
+
+ + ♻ ☆ Modality-Agnostic Structural Image Representation Learning for + Deformable Multi-Modality Medical Image Registration CVPR2024 + + +
+ Establishing dense anatomical correspondence across distinct imaging +modalities is a foundational yet challenging procedure for numerous medical +image analysis studies and image-guided radiotherapy. Existing multi-modality +image registration algorithms rely on statistical-based similarity measures or +local structural image representations. However, the former is sensitive to +locally varying noise, while the latter is not discriminative enough to cope +with complex anatomical structures in multimodal scans, causing ambiguity in +determining the anatomical correspondence across scans with different +modalities. In this paper, we propose a modality-agnostic structural +representation learning method, which leverages Deep Neighbourhood +Self-similarity (DNS) and anatomy-aware contrastive learning to learn +discriminative and contrast-invariance deep structural image representations +(DSIR) without the need for anatomical delineations or pre-aligned training +images. We evaluate our method on multiphase CT, abdomen MR-CT, and brain MR +T1w-T2w registration. Comprehensive results demonstrate that our method is +superior to the conventional local structural representation and +statistical-based similarity measures in terms of discriminability and +accuracy. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Structure Matters: Tackling the Semantic Discrepancy in Diffusion Models + for Image Inpainting CVPR 2024 + + +
+ Denoising diffusion probabilistic models for image inpainting aim to add +noise to the image texture during the forward process and recover the masked +regions from the unmasked texture via the reverse denoising process. Despite +the meaningful semantics they generate, existing arts suffer from a semantic +discrepancy between masked and unmasked regions, since the semantically dense +unmasked texture fails to be completely degraded while the masked regions turn +to pure noise during diffusion, leading to a large discrepancy between them. In +this paper, we aim to answer how unmasked semantics guide the texture denoising +process, together with how to tackle the semantic discrepancy, to facilitate +consistent and meaningful semantics generation. To this end, we propose a novel +structure-guided diffusion model named StrDiffusion, which reformulates the +conventional texture denoising process under structure guidance to derive a +simplified denoising objective for image inpainting, while revealing: 1) the +semantically sparse structure is beneficial to tackle the semantic discrepancy +in the early stage, while the dense texture generates reasonable semantics in +the late stage; 2) the semantics from unmasked regions essentially offer +time-dependent structure guidance for the texture denoising process, benefiting +from the time-dependent sparsity of the structure semantics. For the denoising +process, a structure-guided neural network is trained to estimate the +simplified denoising objective by exploiting the consistency of the denoised +structure between masked and unmasked regions. Besides, we devise an adaptive +resampling strategy as a formal criterion for whether the structure is +competent to guide the texture denoising process, while regulating their +semantic correlations. Extensive experiments validate the merits of +StrDiffusion over the state-of-the-arts. Our code is available at +https://github.com/htyjers/StrDiffusion. + +
+
+ comment: 15 pages, 10 figures, to appear CVPR 2024 +
+
+
+
+
+ + ♻ ☆ An Extensible Framework for Open Heterogeneous Collaborative Perception ICLR 2024 + + +
+ Collaborative perception aims to mitigate the limitations of single-agent +perception, such as occlusions, by facilitating data exchange among multiple +agents. However, most current works consider a homogeneous scenario where all +agents use identical sensors and perception models. In reality, heterogeneous +agent types may continually emerge and inevitably face a domain gap when +collaborating with existing agents. In this paper, we introduce a new open +heterogeneous problem: how to accommodate continually emerging new +heterogeneous agent types into collaborative perception, while ensuring high +perception performance and low integration cost? To address this problem, we +propose HEterogeneous ALliance (HEAL), a novel extensible collaborative +perception framework. HEAL first establishes a unified feature space with +initial agents via a novel multi-scale foreground-aware Pyramid Fusion network. +When heterogeneous new agents emerge with previously unseen modalities or +models, we align them to the established unified space with an innovative +backward alignment. This step only involves individual training on the new +agent type, thus presenting extremely low training costs and high +extensibility. To enrich agents' data heterogeneity, we bring OPV2V-H, a new +large-scale dataset with more diverse sensor types. Extensive experiments on +OPV2V-H and DAIR-V2X datasets show that HEAL surpasses SOTA methods in +performance while reducing the training parameters by 91.5% when integrating 3 +new agent types. We further implement a comprehensive codebase at: +https://github.com/yifanlu0227/HEAL + +
+
+ comment: Accepted by ICLR 2024. The code and data are open-sourced at + https://github.com/yifanlu0227/HEAL +
+
+
+
+
+ + ♻ ☆ WaterVG: Waterway Visual Grounding based on Text-Guided Vision and + mmWave Radar + + +
+ The perception of waterways based on human intent is significant for +autonomous navigation and operations of Unmanned Surface Vehicles (USVs) in +water environments. Inspired by visual grounding, we introduce WaterVG, the +first visual grounding dataset designed for USV-based waterway perception based +on human prompts. WaterVG encompasses prompts describing multiple targets, with +annotations at the instance level including bounding boxes and masks. Notably, +WaterVG includes 11,568 samples with 34,987 referred targets, whose prompts +integrate both visual and radar characteristics. This text-guided two-sensor +paradigm pairs text prompts at a finer granularity with the visual and radar +features of the referred targets. Moreover, we propose a low-power visual +grounding model, Potamoi, which is a multi-task model with a well-designed +Phased Heterogeneous Modality Fusion (PHMF) mode, including Adaptive Radar +Weighting (ARW) and Multi-Head Slim Cross Attention (MHSCA). Specifically, ARW +extracts the required radar features to fuse with vision for prompt alignment. +MHSCA is an efficient fusion module with a remarkably small parameter count and +FLOPs, elegantly fusing scenario context captured by the two sensors with +linguistic features, which performs impressively on visual grounding tasks. +Comprehensive experiments and evaluations have been conducted on WaterVG, where +our Potamoi achieves state-of-the-art performance compared with its +counterparts. + +
+
+ comment: 10 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ PACE: A Large-Scale Dataset with Pose Annotations in Cluttered + Environments + + +
+ Pose estimation is a crucial task in computer vision and robotics, enabling +the tracking and manipulation of objects in images or videos. While several +datasets exist for pose estimation, there is a lack of large-scale datasets +specifically focusing on cluttered scenes with occlusions. We introduce PACE +(Pose Annotations in Cluttered Environments), a large-scale benchmark designed +to advance the development and evaluation of pose estimation methods in +cluttered scenarios. PACE consists of 54,945 frames with 257,673 annotations +across 300 videos, covering 576 objects from 44 categories and featuring a mix +of rigid and articulated items in cluttered scenes. To annotate the real-world +data efficiently, we developed an innovative annotation system utilizing a +calibrated 3-camera setup. We test state-of-the-art algorithms in PACE along +two tracks: pose estimation and object pose tracking, revealing the +benchmark's challenges and research opportunities. Our code and data are +available at https://github.com/qq456cvb/PACE. + +
+
+
+
+
+ + ♻ ☆ Draw-and-Understand: Leveraging Visual Prompts to Enable MLLMs to + Comprehend What You Want + + +
+ The interaction between humans and artificial intelligence (AI) is a crucial +factor that reflects the effectiveness of multimodal large language models +(MLLMs). However, current MLLMs primarily focus on image-level comprehension +and limit interaction to textual instructions, thereby constraining their +flexibility in usage and depth of response. In this paper, we introduce the +Draw-and-Understand project: a new model, a multi-domain dataset, and a +challenging benchmark for visual prompting. Specifically, we propose SPHINX-V, +a new end-to-end trained Multimodal Large Language Model (MLLM) that connects a +vision encoder, a visual prompt encoder and an LLM for various visual prompts +(points, bounding boxes, and free-form shape) and language understanding. To +advance visual prompting research for MLLMs, we introduce MDVP-Data and +MDVP-Bench. MDVP-Data features a multi-domain dataset containing 1.6M unique +image-visual prompt-text instruction-following samples, including natural +images, document images, OCR images, mobile screenshots, web screenshots, and +multi-panel images. Furthermore, we present MDVP-Bench, a comprehensive and +challenging benchmark to assess a model's capability in understanding visual +prompting instructions. Our experiments demonstrate SPHINX-V's impressive +multimodal interaction capabilities through visual prompting, revealing +significant improvements in detailed pixel-level description and +question-answering abilities. + +
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ BAMM: Bidirectional Autoregressive Motion Model + + +
+ Generating human motion from text has been dominated by denoising motion +models, either through diffusion or a generative masking process. However, +these models face great limitations in usability by requiring prior knowledge +of the motion length. Conversely, autoregressive motion models address this +limitation by adaptively predicting motion endpoints, at the cost of degraded +generation quality and editing capabilities. To address these challenges, we +propose the Bidirectional Autoregressive Motion Model (BAMM), a novel +text-to-motion generation framework. BAMM consists of two key components: (1) a +motion tokenizer that transforms 3D human motion into discrete tokens in latent +space, and (2) a masked self-attention transformer that autoregressively +predicts randomly masked tokens via a hybrid attention masking strategy. By +unifying generative masked modeling and autoregressive modeling, BAMM captures +rich and bidirectional dependencies among motion tokens, while learning the +probabilistic mapping from textual inputs to motion outputs with +dynamically-adjusted motion sequence length. This feature enables BAMM to +simultaneously achieve high-quality motion generation with enhanced usability +and built-in motion editability. Extensive experiments on the HumanML3D and +KIT-ML datasets demonstrate that BAMM surpasses current state-of-the-art +methods in both qualitative and quantitative measures. Our project page is +available at https://exitudio.github.io/BAMM-page + +
+
+
+
+
+ + ♻ ☆ Video-Based Human Pose Regression via Decoupled Space-Time Aggregation + + +
+ By leveraging temporal dependency in video sequences, multi-frame human pose +estimation algorithms have demonstrated remarkable results in complicated +situations, such as occlusion, motion blur, and video defocus. These algorithms +are predominantly based on heatmaps, resulting in high computation and storage +requirements per frame, which limits their flexibility and real-time +application in video scenarios, particularly on edge devices. In this paper, we +develop an efficient and effective video-based human pose regression method, +which bypasses intermediate representations such as heatmaps and instead +directly maps the input to the output joint coordinates. Despite the inherent +spatial correlation among adjacent joints of the human pose, the temporal +trajectory of each individual joint exhibits relative independence. In light of +this, we propose a novel Decoupled Space-Time Aggregation network (DSTA) to +separately capture the spatial contexts between adjacent joints and the +temporal cues of each individual joint, thereby avoiding the conflation of +spatiotemporal dimensions. Concretely, DSTA learns a dedicated feature token +for each joint to facilitate the modeling of their spatiotemporal dependencies. +With the proposed joint-wise local-awareness attention mechanism, our method is +capable of efficiently and flexibly utilizing the spatial dependency of +adjacent joints and the temporal dependency of each joint itself. Extensive +experiments demonstrate the superiority of our method. Compared to previous +regression-based single-frame human pose estimation methods, DSTA significantly +enhances performance, achieving an 8.9 mAP improvement on PoseTrack2017. +Furthermore, our approach either surpasses or is on par with the +state-of-the-art heatmap-based multi-frame human pose estimation methods. +Project page: https://github.com/zgspose/DSTA. + +
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Compositional Chain-of-Thought Prompting for Large Multimodal Models + + +
+ The combination of strong visual backbones and Large Language Model (LLM) +reasoning has led to Large Multimodal Models (LMMs) becoming the current +standard for a wide range of vision and language (VL) tasks. However, recent +research has shown that even the most advanced LMMs still struggle to capture +aspects of compositional visual reasoning, such as attributes and relationships +between objects. One solution is to utilize scene graphs (SGs)--a formalization +of objects and their relations and attributes that has been extensively used as +a bridge between the visual and textual domains. Yet, scene graph data requires +scene graph annotations, which are expensive to collect and thus not easily +scalable. Moreover, finetuning an LMM based on SG data can lead to catastrophic +forgetting of the pretraining objective. To overcome this, inspired by +chain-of-thought methods, we propose Compositional Chain-of-Thought (CCoT), a +novel zero-shot Chain-of-Thought prompting method that utilizes SG +representations in order to extract compositional knowledge from an LMM. +Specifically, we first generate an SG using the LMM, and then use that SG in +the prompt to produce a response. Through extensive experiments, we find that +the proposed CCoT approach not only improves LMM performance on several vision +and language VL compositional benchmarks but also improves the performance of +several popular LMMs on general multimodal benchmarks, without the need for +fine-tuning or annotated ground-truth SGs. Code: +https://github.com/chancharikmitra/CCoT + +
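CCoT is a two-step, zero-shot prompting scheme: first ask the LMM to produce a scene graph for the image, then feed that graph back as context when asking the actual question. A minimal sketch of the prompt flow; `query_lmm` is a placeholder for whatever image-plus-text interface the chosen LMM exposes, and the scene-graph prompt wording is only an approximation of the paper's.

```python
SG_PROMPT = (
    "For the provided image, generate a scene graph in JSON that includes: "
    "1) objects, 2) object attributes, 3) relationships between objects."
)

def compositional_cot(query_lmm, image, question: str) -> str:
    """Two-step zero-shot prompting: scene-graph generation, then answering
    conditioned on that scene graph (no fine-tuning, no ground-truth graphs)."""
    scene_graph = query_lmm(image, SG_PROMPT)                 # step 1: SG from the LMM itself
    answer_prompt = (
        f"Scene graph:\n{scene_graph}\n\n"
        f"Use the image and the scene graph above to answer: {question}"
    )
    return query_lmm(image, answer_prompt)                    # step 2: final response

# Placeholder LMM so the sketch runs; swap in a real multimodal chat API.
fake_lmm = lambda image, prompt: "mocked response for: " + prompt[:40]
print(compositional_cot(fake_lmm, image=None, question="What is left of the dog?"))
```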
+
+
+
+
+ + ♻ ☆ Efficient 3D Instance Mapping and Localization with Neural Fields + + +
+ We tackle the problem of learning an implicit scene representation for 3D +instance segmentation from a sequence of posed RGB images. Towards this, we +introduce 3DIML, a novel framework that efficiently learns a label field that +may be rendered from novel viewpoints to produce view-consistent instance +segmentation masks. 3DIML significantly improves upon training and inference +runtimes of existing implicit scene representation based methods. Opposed to +prior art that optimizes a neural field in a self-supervised manner, requiring +complicated training procedures and loss function design, 3DIML leverages a +two-phase process. The first phase, InstanceMap, takes as input 2D segmentation +masks of the image sequence generated by a frontend instance segmentation +model, and associates corresponding masks across images to 3D labels. These +almost view-consistent pseudolabel masks are then used in the second phase, +InstanceLift, to supervise the training of a neural label field, which +interpolates regions missed by InstanceMap and resolves ambiguities. +Additionally, we introduce InstanceLoc, which enables near realtime +localization of instance masks given a trained label field and an off-the-shelf +image segmentation model by fusing outputs from both. We evaluate 3DIML on +sequences from the Replica and ScanNet datasets and demonstrate 3DIML's +effectiveness under mild assumptions for the image sequences. We achieve a +large practical speedup over existing implicit scene representation methods +with comparable quality, showcasing its potential to facilitate faster and more +effective 3D scene understanding. + +
+
+
+
+
+ + ♻ ☆ Change-Agent: Towards Interactive Comprehensive Remote Sensing Change + Interpretation and Analysis + + +
+ Monitoring changes in the Earth's surface is crucial for understanding +natural processes and human impacts, necessitating precise and comprehensive +interpretation methodologies. Remote sensing satellite imagery offers a unique +perspective for monitoring these changes, leading to the emergence of remote +sensing image change interpretation (RSICI) as a significant research focus. +Current RSICI technology encompasses change detection and change captioning, +each with its limitations in providing comprehensive interpretation. To address +this, we propose an interactive Change-Agent, which can follow user +instructions to achieve comprehensive change interpretation and insightful +analysis according to user instructions, such as change detection and change +captioning, change object counting, change cause analysis, etc. The +Change-Agent integrates a multi-level change interpretation (MCI) model as the +eyes and a large language model (LLM) as the brain. The MCI model contains two +branches of pixel-level change detection and semantic-level change captioning, +in which multiple BI-temporal Iterative Interaction (BI3) layers utilize Local +Perception Enhancement (LPE) and the Global Difference Fusion Attention (GDFA) +modules to enhance the model's discriminative feature representation +capabilities. To support the training of the MCI model, we build the LEVIR-MCI +dataset with a large number of change masks and captions of changes. Extensive +experiments demonstrate the effectiveness of the proposed MCI model and +highlight the promising potential of our Change-Agent in facilitating +comprehensive and intelligent interpretation of surface changes. To facilitate +future research, we will make our dataset and codebase of the MCI model and +Change-Agent publicly available at +https://github.com/Chen-Yang-Liu/Change-Agent + +
+
+
+
+
+ + ♻ ☆ Frequency-Adaptive Dilated Convolution for Semantic Segmentation + + +
+ Dilated convolution, which expands the receptive field by inserting gaps +between its consecutive elements, is widely employed in computer vision. In +this study, we propose three strategies to improve individual phases of dilated +convolution from the view of spectrum analysis. Departing from the conventional +practice of fixing a global dilation rate as a hyperparameter, we introduce +Frequency-Adaptive Dilated Convolution (FADC), which dynamically adjusts +dilation rates spatially based on local frequency components. Subsequently, we +design two plug-in modules to directly enhance effective bandwidth and +receptive field size. The Adaptive Kernel (AdaKern) module decomposes +convolution weights into low-frequency and high-frequency components, +dynamically adjusting the ratio between these components on a per-channel +basis. By increasing the high-frequency part of convolution weights, AdaKern +captures more high-frequency components, thereby improving effective bandwidth. +The Frequency Selection (FreqSelect) module optimally balances high- and +low-frequency components in feature representations through spatially variant +reweighting. It suppresses high frequencies in the background to encourage FADC +to learn a larger dilation, thereby increasing the receptive field for an +expanded scope. Extensive experiments on segmentation and object detection +consistently validate the efficacy of our approach. The code is publicly +available at https://github.com/Linwei-Chen/FADC. + +
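The AdaKern idea decomposes each convolution kernel into a low-frequency part (its spatial mean) and a high-frequency residual, then reweights the two parts. A simplified sketch of that decomposition with a fixed gain; the dynamic, per-channel, input-conditioned ratio described in the abstract is replaced by a constant here for clarity.

```python
import torch
import torch.nn.functional as F

def adaptive_kernel_conv(x, weight, bias=None, high_freq_gain=1.5, padding=1):
    """Split conv weights into a low-frequency (spatial mean) and high-frequency
    (residual) part, boost the high-frequency part, then convolve."""
    low = weight.mean(dim=(2, 3), keepdim=True)          # per-filter DC component
    high = weight - low                                   # zero-mean residual
    adjusted = low + high_freq_gain * high                # reweight the two parts
    return F.conv2d(x, adjusted, bias=bias, padding=padding)

x = torch.randn(1, 8, 32, 32)
w = torch.randn(16, 8, 3, 3)
print(adaptive_kernel_conv(x, w).shape)   # torch.Size([1, 16, 32, 32])
```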
+
+
+
+
+ + ♻ ☆ Multi-criteria Token Fusion with One-step-ahead Attention for Efficient + Vision Transformers CVPR + + +
+ Vision Transformer (ViT) has emerged as a prominent backbone for computer +vision. For more efficient ViTs, recent works lessen the quadratic cost of the +self-attention layer by pruning or fusing the redundant tokens. However, these +works faced the speed-accuracy trade-off caused by the loss of information. +Here, we argue that token fusion needs to consider diverse relations between +tokens to minimize information loss. In this paper, we propose a Multi-criteria +Token Fusion (MCTF), that gradually fuses the tokens based on multi-criteria +(e.g., similarity, informativeness, and size of fused tokens). Further, we +utilize the one-step-ahead attention, which is the improved approach to capture +the informativeness of the tokens. By training the model equipped with MCTF +using a token reduction consistency, we achieve the best speed-accuracy +trade-off in the image classification (ImageNet1K). Experimental results prove +that MCTF consistently surpasses the previous reduction methods with and +without training. Specifically, DeiT-T and DeiT-S with MCTF reduce FLOPs by +about 44% while improving the performance (+0.5%, and +0.3%) over the base +model, respectively. We also demonstrate the applicability of MCTF in various +Vision Transformers (e.g., T2T-ViT, LV-ViT), achieving at least 31% speedup +without performance degradation. Code is available at +https://github.com/mlvlab/MCTF. + +
+
+ comment: Conference on Computer Vision and Pattern Recognition (CVPR), 2024 +
+
+
+
+
+ + ♻ ☆ High-Resolution Image Translation Model Based on Grayscale Redefinition + + +
+ Image-to-image translation is a technique that focuses on transferring images +from one domain to another while maintaining the essential content +representations. In recent years, image-to-image translation has gained +significant attention and achieved remarkable advancements due to its diverse +applications in computer vision and image processing tasks. In this work, we +propose an innovative method for image translation between different domains. +For high-resolution image translation tasks, we use a grayscale adjustment +method to achieve pixel-level translation. For other tasks, we utilize the +Pix2PixHD model with a coarse-to-fine generator, multi-scale discriminator, and +improved loss to enhance the image translation performance. On the other hand, +to tackle the issue of sparse training data, we adopt model weight +initialization from another task to optimize the performance of the current +task. + +
+
+
+
+
+ + ♻ ☆ LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal + Models + + +
+ Large Multimodal Models (LMMs) have shown significant reasoning capabilities +by connecting a visual encoder and a large language model. LMMs typically use a +fixed amount of visual tokens, such as the penultimate layer features in the +CLIP visual encoder, as the prefix content. Recent LMMs incorporate more +complex visual inputs, such as high-resolution images and videos, which +increase the number of visual tokens significantly. However, due to the design +of the Transformer architecture, computational costs associated with these +models tend to increase quadratically with the number of input tokens. To +tackle this problem, we explore a token reduction mechanism and find, similar +to prior work, that many visual tokens are spatially redundant. Based on this, +we propose PruMerge, a novel adaptive visual token reduction approach, which +largely reduces the number of visual tokens while maintaining comparable model +performance. We first select the unpruned visual tokens based on their +similarity to class tokens and spatial tokens. We then cluster the pruned +tokens based on key similarity and merge the clustered tokens with the unpruned +tokens to supplement their information. Empirically, when applied to LLaVA-1.5, +our approach can compress the visual tokens by 18 times on average, and achieve +comparable performance across diverse visual question-answering and reasoning +tasks. Code and checkpoints are at https://llava-prumerge.github.io/. + +
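PruMerge keeps the most informative visual tokens and folds the pruned ones back in by merging them with their most similar kept token. A simplified sketch that selects tokens by similarity to the class token and merges the rest by nearest-neighbor averaging; the paper's outlier-based selection and key-similarity clustering are replaced by a plain top-k and cosine/Euclidean assignment here.

```python
import torch

def prune_and_merge(tokens: torch.Tensor, cls_token: torch.Tensor, keep: int):
    """tokens: (N, D) visual tokens, cls_token: (D,). Keep the `keep` tokens most
    similar to the class token; merge each pruned token into its nearest kept token."""
    sim_to_cls = torch.nn.functional.cosine_similarity(tokens, cls_token.unsqueeze(0), dim=-1)
    keep_idx = sim_to_cls.topk(keep).indices
    mask = torch.zeros(tokens.shape[0], dtype=torch.bool)
    mask[keep_idx] = True
    kept, pruned = tokens[mask].clone(), tokens[~mask]

    if pruned.numel() > 0:
        # Assign every pruned token to its closest kept token and average it in.
        assign = torch.cdist(pruned, kept).argmin(dim=1)
        for k in range(kept.shape[0]):
            members = pruned[assign == k]
            if members.numel() > 0:
                kept[k] = torch.cat([kept[k:k + 1], members]).mean(dim=0)
    return kept

visual_tokens = torch.randn(576, 1024)          # e.g. a 24x24 grid of ViT patch tokens
cls = torch.randn(1024)
reduced = prune_and_merge(visual_tokens, cls, keep=32)   # roughly 18x fewer tokens
print(reduced.shape)                            # torch.Size([32, 1024])
```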
+
+ comment: Project page: https://llava-prumerge.github.io/ +
+
+
+
+
+ + ♻ ☆ Concept-based Analysis of Neural Networks via Vision-Language Models + + +
+ The analysis of vision-based deep neural networks (DNNs) is highly desirable +but it is very challenging due to the difficulty of expressing formal +specifications for vision tasks and the lack of efficient verification +procedures. In this paper, we propose to leverage emerging multimodal, +vision-language, foundation models (VLMs) as a lens through which we can reason +about vision models. VLMs have been trained on a large body of images +accompanied by their textual description, and are thus implicitly aware of +high-level, human-understandable concepts describing the images. We describe a +logical specification language $\texttt{Con}_{\texttt{spec}}$ designed to +facilitate writing specifications in terms of these concepts. To define and +formally check $\texttt{Con}_{\texttt{spec}}$ specifications, we build a map +between the internal representations of a given vision model and a VLM, leading +to an efficient verification procedure of natural-language properties for +vision models. We demonstrate our techniques on a ResNet-based classifier +trained on the RIVAL-10 dataset using CLIP as the multimodal model. + +
+
+
+
+
+ + ♻ ☆ 3D Open-Vocabulary Panoptic Segmentation with 2D-3D Vision-Language + Distillation + + +
+ 3D panoptic segmentation is a challenging perception task, especially in +autonomous driving. It aims to predict both semantic and instance annotations +for 3D points in a scene. Although prior 3D panoptic segmentation approaches +have achieved great performance on closed-set benchmarks, generalizing these +approaches to unseen things and unseen stuff categories remains an open +problem. For unseen object categories, 2D open-vocabulary segmentation has +achieved promising results that solely rely on frozen CLIP backbones and +ensembling multiple classification outputs. However, we find that simply +extending these 2D models to 3D does not guarantee good performance due to poor +per-mask classification quality, especially for novel stuff categories. In this +paper, we propose the first method to tackle 3D open-vocabulary panoptic +segmentation. Our model takes advantage of the fusion between learnable LiDAR +features and dense frozen vision CLIP features, using a single classification +head to make predictions for both base and novel classes. To further improve +the classification performance on novel classes and leverage the CLIP model, we +propose two novel loss functions: object-level distillation loss and +voxel-level distillation loss. Our experiments on the nuScenes and +SemanticKITTI datasets show that our method outperforms the strong baseline by +a large margin. + +
+
+
+
+
+ + ♻ ☆ FeatUp: A Model-Agnostic Framework for Features at Any Resolution ICLR + + +
+ Deep features are a cornerstone of computer vision research, capturing image +semantics and enabling the community to solve downstream tasks even in the +zero- or few-shot regime. However, these features often lack the spatial +resolution to directly perform dense prediction tasks like segmentation and +depth prediction because models aggressively pool information over large areas. +In this work, we introduce FeatUp, a task- and model-agnostic framework to +restore lost spatial information in deep features. We introduce two variants of +FeatUp: one that guides features with high-resolution signal in a single +forward pass, and one that fits an implicit model to a single image to +reconstruct features at any resolution. Both approaches use a multi-view +consistency loss with deep analogies to NeRFs. Our features retain their +original semantics and can be swapped into existing applications to yield +resolution and performance gains even without re-training. We show that FeatUp +significantly outperforms other feature upsampling and image super-resolution +approaches in class activation map generation, transfer learning for +segmentation and depth prediction, and end-to-end training for semantic +segmentation. + +
+
+ comment: Accepted to the International Conference on Learning Representations + (ICLR) 2024 +
+
+
+
+
+ + ♻ ☆ Towards long-tailed, multi-label disease classification from chest + X-ray: Overview of the CXR-LT challenge + + +
+ Many real-world image recognition problems, such as diagnostic medical +imaging exams, are "long-tailed": there are a few common findings followed by +many more relatively rare conditions. In chest radiography, diagnosis is both a +long-tailed and multi-label problem, as patients often present with multiple +findings simultaneously. While researchers have begun to study the problem of +long-tailed learning in medical image recognition, few have studied the +interaction of label imbalance and label co-occurrence posed by long-tailed, +multi-label disease classification. To engage with the research community on +this emerging topic, we conducted an open challenge, CXR-LT, on long-tailed, +multi-label thorax disease classification from chest X-rays (CXRs). We publicly +release a large-scale benchmark dataset of over 350,000 CXRs, each labeled with +at least one of 26 clinical findings following a long-tailed distribution. We +synthesize common themes of top-performing solutions, providing practical +recommendations for long-tailed, multi-label medical image classification. +Finally, we use these insights to propose a path forward involving +vision-language foundation models for few- and zero-shot disease +classification. + +
+
+ comment: Update after major revision +
+
+
+
+
+ + ♻ ☆ Modeling Multimodal Social Interactions: New Challenges and Baselines + with Densely Aligned Representations CVPR 2024 + + +
+ Understanding social interactions involving both verbal and non-verbal cues +is essential for effectively interpreting social situations. However, most +prior works on multimodal social cues focus predominantly on single-person +behaviors or rely on holistic visual representations that are not aligned to +utterances in multi-party environments. Consequently, they are limited in +modeling the intricate dynamics of multi-party interactions. In this paper, we +introduce three new challenging tasks to model the fine-grained dynamics +between multiple people: speaking target identification, pronoun coreference +resolution, and mentioned player prediction. We contribute extensive data +annotations to curate these new challenges in social deduction game settings. +Furthermore, we propose a novel multimodal baseline that leverages densely +aligned language-visual representations by synchronizing visual features with +their corresponding utterances. This facilitates concurrently capturing verbal +and non-verbal cues pertinent to social reasoning. Experiments demonstrate the +effectiveness of the proposed approach with densely aligned multimodal +representations in modeling fine-grained social interactions. Project website: +https://sangmin-git.github.io/projects/MMSI. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Diffusion Hyperfeatures: Searching Through Time and Space for Semantic + Correspondence NeurIPS 2023 + + +
+ Diffusion models have been shown to be capable of generating high-quality +images, suggesting that they could contain meaningful internal representations. +Unfortunately, the feature maps that encode a diffusion model's internal +information are spread not only over layers of the network, but also over +diffusion timesteps, making it challenging to extract useful descriptors. We +propose Diffusion Hyperfeatures, a framework for consolidating multi-scale and +multi-timestep feature maps into per-pixel feature descriptors that can be used +for downstream tasks. These descriptors can be extracted for both synthetic and +real images using the generation and inversion processes. We evaluate the +utility of our Diffusion Hyperfeatures on the task of semantic keypoint +correspondence: our method achieves superior performance on the SPair-71k real +image benchmark. We also demonstrate that our method is flexible and +transferable: our feature aggregation network trained on the inversion features +of real image pairs can be used on the generation features of synthetic image +pairs with unseen objects and compositions. Our code is available at +https://diffusion-hyperfeatures.github.io. + +
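The aggregation step can be pictured as a learned weighted sum over feature maps drawn from different layers and diffusion timesteps, resized to a common resolution. A minimal sketch that assumes all maps have already been projected to the same channel width (the paper's aggregation network is richer than this):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class HyperfeatureAggregator(nn.Module):
    """Collapse M = (layers x timesteps) feature maps into one per-pixel
    descriptor via softmax-weighted averaging (simplified illustration)."""
    def __init__(self, num_maps, in_dim, out_dim, out_size=64):
        super().__init__()
        self.mix_logits = nn.Parameter(torch.zeros(num_maps))
        self.proj = nn.Conv2d(in_dim, out_dim, kernel_size=1)
        self.out_size = (out_size, out_size)

    def forward(self, feature_maps):
        # feature_maps: list of M tensors, each (B, in_dim, h_i, w_i)
        resized = [F.interpolate(f, size=self.out_size, mode="bilinear",
                                 align_corners=False) for f in feature_maps]
        stack = torch.stack(resized, dim=1)                  # (B, M, C, H, W)
        weights = torch.softmax(self.mix_logits, dim=0).view(1, -1, 1, 1, 1)
        return self.proj((weights * stack).sum(dim=1))       # (B, out_dim, H, W)
```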
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ FasterViT: Fast Vision Transformers with Hierarchical Attention ICLR'24 + + +
+ We design a new family of hybrid CNN-ViT neural networks, named FasterViT, +with a focus on high image throughput for computer vision (CV) applications. +FasterViT combines the benefits of fast local representation learning in CNNs +and global modeling properties in ViT. Our newly introduced Hierarchical +Attention (HAT) approach decomposes global self-attention with quadratic +complexity into a multi-level attention with reduced computational costs. We +benefit from efficient window-based self-attention. Each window has access to +dedicated carrier tokens that participate in local and global representation +learning. At a high level, global self-attention enables efficient +cross-window communication at lower costs. FasterViT achieves a SOTA +Pareto-front in terms of accuracy and image throughput. We have extensively +validated its effectiveness on various CV tasks including classification, +object detection and segmentation. We also show that HAT can be used as a +plug-and-play module for existing networks and enhance them. We further +demonstrate significantly faster and more accurate performance than competitive +counterparts for images with high resolution. Code is available at +https://github.com/NVlabs/FasterViT. + +

+
+ comment: ICLR'24 Accepted Paper +
+
+
+
+
+ + ♻ ☆ DiffiT: Diffusion Vision Transformers for Image Generation + + +
+ Diffusion models with their powerful expressivity and high sample quality +have achieved State-Of-The-Art (SOTA) performance in the generative domain. The +pioneering Vision Transformer (ViT) has also demonstrated strong modeling +capabilities and scalability, especially for recognition tasks. In this paper, +we study the effectiveness of ViTs in diffusion-based generative learning and +propose a new model denoted as Diffusion Vision Transformers (DiffiT). +Specifically, we propose a methodology for fine-grained control of the denoising +process and introduce the Time-dependent Multihead Self Attention (TMSA) +mechanism. DiffiT is surprisingly effective in generating high-fidelity images +with significantly better parameter efficiency. We also propose latent and +image space DiffiT models and show SOTA performance on a variety of +class-conditional and unconditional synthesis tasks at different resolutions. +The Latent DiffiT model achieves a new SOTA FID score of 1.73 on the ImageNet-256 +dataset while having 19.85% and 16.88% fewer parameters than other +Transformer-based diffusion models such as MDT and DiT, respectively. Code: +https://github.com/NVlabs/DiffiT + +

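One way to make self-attention time-dependent, in the spirit of TMSA, is to let the query/key/value projections depend on both the spatial tokens and the timestep embedding; the sketch below is an approximation of that idea, not the paper's exact parameterization:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TimeDependentSelfAttention(nn.Module):
    """Self-attention whose q/k/v are a sum of a spatial-token projection and a
    timestep-embedding projection (simplified TMSA-style block)."""
    def __init__(self, dim, num_heads=8):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.qkv_spatial = nn.Linear(dim, 3 * dim, bias=False)
        self.qkv_time = nn.Linear(dim, 3 * dim, bias=False)
        self.out = nn.Linear(dim, dim)

    def forward(self, x, t_emb):
        # x: (B, N, dim) spatial tokens; t_emb: (B, dim) timestep embedding
        B, N, D = x.shape
        qkv = self.qkv_spatial(x) + self.qkv_time(t_emb).unsqueeze(1)  # (B, N, 3D)
        q, k, v = qkv.chunk(3, dim=-1)
        heads = lambda t: t.reshape(B, N, self.num_heads, D // self.num_heads).transpose(1, 2)
        y = F.scaled_dot_product_attention(heads(q), heads(k), heads(v))
        return self.out(y.transpose(1, 2).reshape(B, N, D))
```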
+
+ comment: Revised Tech report +
+
+
+
+
+ + ♻ ☆ ECoDepth: Effective Conditioning of Diffusion Models for Monocular Depth + Estimation CVPR + + +
+ In the absence of parallax cues, a learning-based single image depth +estimation (SIDE) model relies heavily on shading and contextual cues in the +image. While this simplicity is attractive, it is necessary to train such +models on large and varied datasets, which are difficult to capture. It has +been shown that using embeddings from pre-trained foundational models, such as +CLIP, improves zero-shot transfer in several applications. Taking inspiration +from this, in our paper we explore the use of global image priors generated +from a pre-trained ViT model to provide more detailed contextual information. +We argue that the embedding vector from a ViT model, pre-trained on a large +dataset, captures more relevant information for SIDE than the usual route of +generating pseudo image captions, followed by CLIP based text embeddings. Based +on this idea, we propose a new SIDE model using a diffusion backbone which is +conditioned on ViT embeddings. Our proposed design establishes a new +state-of-the-art (SOTA) for SIDE on the NYUv2 dataset, achieving an Abs Rel error of +0.059 (14% improvement) compared to 0.069 by the current SOTA (VPD). On the +KITTI dataset, it achieves a Sq Rel error of 0.139 (2% improvement) compared to +0.142 by the current SOTA (GEDepth). For zero-shot transfer with a model +trained on NYUv2, we report mean relative improvement of (20%, 23%, 81%, 25%) +over NeWCRFs on (Sun-RGBD, iBims1, DIODE, HyperSim) datasets, compared to (16%, +18%, 45%, 9%) by ZoeDepth. The project page is available at +https://ecodepth-iitd.github.io + +

+
+ comment: IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) + 2024 +
+
+
+
+
+ + ♻ ☆ NeRT: Implicit Neural Representations for General Unsupervised + Turbulence Mitigation + + +
+ The atmospheric and water turbulence mitigation problems have emerged as +challenging inverse problems in computer vision and optics communities over the +years. However, current methods either rely heavily on the quality of the +training dataset or fail to generalize over various scenarios, such as static +scenes, dynamic scenes, and text reconstructions. We propose a general implicit +neural representation for unsupervised atmospheric and water turbulence +mitigation (NeRT). NeRT leverages the implicit neural representations and the +physically correct tilt-then-blur turbulence model to reconstruct the clean, +undistorted image, given only dozens of distorted input images. Moreover, we +show that NeRT outperforms the state-of-the-art through various qualitative and +quantitative evaluations of atmospheric and water turbulence datasets. +Furthermore, we demonstrate the ability of NeRT to eliminate uncontrolled +turbulence from real-world environments. Lastly, we incorporate NeRT into +continuously captured video sequences and demonstrate $48 \times$ speedup. + +
+
+
+
+
+ + ♻ ☆ WALT3D: Generating Realistic Training Data from Time-Lapse Imagery for + Reconstructing Dynamic Objects under Occlusion CVPR 2024 + + +
+ Current methods for 2D and 3D object understanding struggle with severe +occlusions in busy urban environments, partly due to the lack of large-scale +labeled ground-truth annotations for learning occlusion. In this work, we +introduce a novel framework for automatically generating a large, realistic +dataset of dynamic objects under occlusions using freely available time-lapse +imagery. By leveraging off-the-shelf 2D (bounding box, segmentation, keypoint) +and 3D (pose, shape) predictions as pseudo-groundtruth, unoccluded 3D objects +are identified automatically and composited into the background in a clip-art +style, ensuring realistic appearances and physically accurate occlusion +configurations. The resulting clip-art image with pseudo-groundtruth enables +efficient training of object reconstruction methods that are robust to +occlusions. Our method demonstrates significant improvements in both 2D and 3D +reconstruction, particularly in scenarios with heavily occluded objects like +vehicles and people in urban scenes. + +
+
+ comment: To appear in CVPR 2024. Homepage: https://www.cs.cmu.edu/~walt3d +
+
+
+
+
+ + ♻ ☆ CityDreamer: Compositional Generative Model of Unbounded 3D Cities CVPR 2024 + + +
+ 3D city generation is a desirable yet challenging task, since humans are more +sensitive to structural distortions in urban environments. Additionally, +generating 3D cities is more complex than 3D natural scenes since buildings, as +objects of the same class, exhibit a wider range of appearances compared to the +relatively consistent appearance of objects like trees in natural scenes. To +address these challenges, we propose \textbf{CityDreamer}, a compositional +generative model designed specifically for unbounded 3D cities. Our key insight +is that 3D city generation should be a composition of different types of neural +fields: 1) various building instances, and 2) background stuff, such as roads +and green lands. Specifically, we adopt the bird's eye view scene +representation and employ a volumetric renderer for both instance-oriented and +stuff-oriented neural fields. The generative hash grid and periodic positional +embedding are tailored as scene parameterization to suit the distinct +characteristics of building instances and background stuff. Furthermore, we +contribute a suite of CityGen Datasets, including OSM and GoogleEarth, which +comprises a vast amount of real-world city imagery to enhance the realism of +the generated 3D cities both in their layouts and appearances. CityDreamer +achieves state-of-the-art performance not only in generating realistic 3D +cities but also in localized editing within the generated cities. + +

+
+ comment: CVPR 2024. Project page: https://haozhexie.com/project/city-dreamer +
+
+
+
+
+ + ♻ ☆ ZigMa: A DiT-style Zigzag Mamba Diffusion Model + + +
+ The diffusion model has long been plagued by scalability and quadratic +complexity issues, especially within transformer-based structures. In this +study, we aim to leverage the long sequence modeling capability of a +State-Space Model called Mamba to extend its applicability to visual data +generation. Firstly, we identify a critical oversight in most current +Mamba-based vision methods, namely the lack of consideration for spatial +continuity in the scan scheme of Mamba. Secondly, building upon this insight, +we introduce a simple, plug-and-play, zero-parameter method named Zigzag Mamba, +which outperforms Mamba-based baselines and demonstrates improved speed and +memory utilization compared to transformer-based baselines. Lastly, we +integrate Zigzag Mamba with the Stochastic Interpolant framework to investigate +the scalability of the model on large-resolution visual datasets, such as +FacesHQ $1024\times 1024$, UCF101, MultiModal-CelebA-HQ, and MS COCO +$256\times 256$. Code will be released at https://taohu.me/zigma/ + +

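The spatial-continuity point can be made concrete with a boustrophedon ("zigzag") ordering of the 2D patch grid, so consecutive positions in the 1D scan are always spatial neighbours; the sketch below shows one such ordering (Zigzag Mamba alternates several scan schemes across layers, so this is only the basic ingredient):

```python
import numpy as np

def zigzag_order(height, width):
    """Flat indices that visit a (height x width) token grid row by row,
    reversing direction on every other row, so adjacent scan positions stay
    adjacent in image space (a plain raster scan jumps at each row boundary)."""
    idx = np.arange(height * width).reshape(height, width)
    idx[1::2] = idx[1::2, ::-1].copy()   # flip every second row (copy avoids aliasing)
    return idx.reshape(-1)

order = zigzag_order(4, 4)
print(order)   # [ 0  1  2  3  7  6  5  4  8  9 10 11 15 14 13 12]
# tokens[:, order] would then feed the state-space model in this order.
```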
+
+ comment: Project Page: https://taohu.me/zigma/ +
+
+
+
+
+ + ♻ ☆ A Survey on Multimodal Large Language Models + + +
+ Recently, Multimodal Large Language Model (MLLM) represented by GPT-4V has +been a new rising research hotspot, which uses powerful Large Language Models +(LLMs) as a brain to perform multimodal tasks. The surprising emergent +capabilities of MLLM, such as writing stories based on images and OCR-free math +reasoning, are rare in traditional multimodal methods, suggesting a potential +path to artificial general intelligence. To this end, both academia and +industry have endeavored to develop MLLMs that can compete with or even outperform +GPT-4V, pushing the limit of research at a surprising speed. In this +paper, we aim to trace and summarize the recent progress of MLLMs. First of +all, we present the basic formulation of MLLM and delineate its related +concepts, including architecture, training strategy and data, as well as +evaluation. Then, we introduce research topics about how MLLMs can be extended +to support more granularity, modalities, languages, and scenarios. We continue +with multimodal hallucination and extended techniques, including Multimodal ICL +(M-ICL), Multimodal CoT (M-CoT), and LLM-Aided Visual Reasoning (LAVR). To +conclude the paper, we discuss existing challenges and point out promising +research directions. In light of the fact that the era of MLLM has only just +begun, we will keep updating this survey and hope it can inspire more research. +An associated GitHub link collecting the latest papers is available at +https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models. + +

+
+ comment: Project + page:https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models +
+
+
+
+
+ + ♻ ☆ DST-Det: Simple Dynamic Self-Training for Open-Vocabulary Object + Detection + + +
+ Open-vocabulary object detection (OVOD) aims to detect the objects beyond the +set of classes observed during training. This work introduces a straightforward +and efficient strategy that utilizes pre-trained vision-language models (VLM), +like CLIP, to identify potential novel classes through zero-shot +classification. Previous methods use a class-agnostic region proposal network +to detect object proposals and consider the proposals that do not match the +ground truth as background. Unlike these methods, our method selects a +subset of the proposals that would otherwise be considered background and instead +treats them as novel classes during training. We refer to this approach +as the self-training strategy, which enhances recall and accuracy for novel +classes without requiring extra annotations, datasets, and re-training. +Compared to previous pseudo-labeling methods, our approach does not require re-training +or offline labeling, which is more efficient and effective in +one-shot training. Empirical evaluations on three datasets, including LVIS, +V3Det, and COCO, demonstrate significant improvements over the baseline +performance without incurring additional parameters or computational costs +during inference. In addition, we also apply our method to various baselines. +In particular, compared with the previous method, F-VLM, our method achieves a +1.7% improvement on the LVIS dataset. Combined with the recent method CLIPSelf, +our method also achieves 46.7 novel class AP on COCO without introducing extra +data for pretraining. We also achieve over 6.5% improvement over the F-VLM +baseline in the recent challenging V3Det dataset. We release our code and +models at https://github.com/xushilin1/dst-det. + +

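The self-training step can be read as zero-shot CLIP classification of proposals that would otherwise be labeled background, with the confident ones promoted to pseudo novel-class targets. A minimal sketch with an assumed confidence threshold and pre-extracted proposal features:

```python
import torch
import torch.nn.functional as F

@torch.no_grad()
def mine_pseudo_novel_labels(proposal_feats, novel_text_embeds, score_thresh=0.8):
    """proposal_feats: (P, D) CLIP image features of proposals unmatched to any
    ground-truth box; novel_text_embeds: (C, D) CLIP text embeddings of novel
    class names. Returns indices of promoted proposals and their pseudo labels."""
    img = F.normalize(proposal_feats, dim=-1)
    txt = F.normalize(novel_text_embeds, dim=-1)
    probs = (100.0 * img @ txt.t()).softmax(dim=-1)   # zero-shot class posteriors
    scores, labels = probs.max(dim=-1)
    keep = scores > score_thresh                      # keep only confident proposals
    return keep.nonzero(as_tuple=True)[0], labels[keep]
```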
+
+
+
+
+ + ♻ ☆ Efficient Benchmarking of Language Models NAACL + + +
+ The increasing versatility of language models (LMs) has given rise to a new +class of benchmarks that comprehensively assess a broad range of capabilities. +Such benchmarks are associated with massive computational costs, extending to +thousands of GPU hours per model. However, the efficiency of these +evaluation efforts has received little discussion in the literature. In this +work, we present the problem of Efficient Benchmarking, namely, intelligently +reducing the computation costs of LM evaluation without compromising +reliability. Using the HELM benchmark as a test case, we investigate how +different benchmark design choices affect the computation-reliability +trade-off. We propose to evaluate the reliability of such decisions by using a +new measure -- Decision Impact on Reliability, DIoR for short. We find, for +example, that a benchmark leader may change by merely removing a low-ranked +model from the benchmark, and observe that a correct benchmark ranking can be +obtained by considering only a fraction of the evaluation examples. Based on +our findings, we outline a set of concrete recommendations for efficient +benchmark design and utilization practices. To take a step further, we use our +findings to propose an evaluation algorithm that, when applied to the HELM +benchmark, leads to dramatic cost savings with minimal loss of benchmark +reliability, often reducing computation by 100x or more. + +

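The abstract does not spell out how DIoR is computed; the sketch below only illustrates the kind of measurement involved, comparing model rankings before and after a benchmark-design change with a rank correlation:

```python
from scipy.stats import kendalltau

def ranking_agreement(full_scores, reduced_scores):
    """Kendall's tau between model rankings under the full benchmark and under
    a reduced design (e.g., a subsample of evaluation examples).
    Both arguments: dict model_name -> mean score."""
    models = sorted(full_scores)
    tau, _ = kendalltau([full_scores[m] for m in models],
                        [reduced_scores[m] for m in models])
    return tau   # 1.0 means the design change leaves the ranking intact

# Toy usage with hypothetical models and scores.
full = {"model_a": 0.71, "model_b": 0.69, "model_c": 0.55}
reduced = {"model_a": 0.70, "model_b": 0.66, "model_c": 0.58}
print(ranking_agreement(full, reduced))
```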
+
+ comment: Accepted to NAACL main track +
+
+
+
+
+ + ♻ ☆ Text-image Alignment for Diffusion-based Perception + + +
+ Diffusion models are generative models with impressive text-to-image +synthesis capabilities and have spurred a new wave of creative methods for +classical machine learning tasks. However, the best way to harness the +perceptual knowledge of these generative models for visual tasks is still an +open question. Specifically, it is unclear how to use the prompting interface +when applying diffusion backbones to vision tasks. We find that automatically +generated captions can improve text-image alignment and significantly enhance a +model's cross-attention maps, leading to better perceptual performance. Our +approach improves upon the current state-of-the-art (SOTA) in diffusion-based +semantic segmentation on ADE20K and the current overall SOTA for depth +estimation on NYUv2. Furthermore, our method generalizes to the cross-domain +setting. We use model personalization and caption modifications to align our +model to the target domain and find improvements over unaligned baselines. Our +cross-domain object detection model, trained on Pascal VOC, achieves SOTA +results on Watercolor2K. Our cross-domain segmentation method, trained on +Cityscapes, achieves SOTA results on Dark Zurich-val and Nighttime Driving. +Project page: https://www.vision.caltech.edu/tadp/. Code: +https://github.com/damaggu/TADP. + +
+
+ comment: Project page: https://www.vision.caltech.edu/tadp/, Code page: + github.com/damaggu/TADP +
+
+
+
+
+ + ♻ ☆ Shape-Guided Diffusion with Inside-Outside Attention WACV 2024 + + +
+ We introduce precise object silhouette as a new form of user control in +text-to-image diffusion models, which we dub Shape-Guided Diffusion. Our +training-free method uses an Inside-Outside Attention mechanism during the +inversion and generation process to apply a shape constraint to the cross- and +self-attention maps. Our mechanism designates which spatial region is the +object (inside) vs. background (outside) then associates edits to the correct +region. We demonstrate the efficacy of our method on the shape-guided editing +task, where the model must replace an object according to a text prompt and +object mask. We curate a new ShapePrompts benchmark derived from MS-COCO and +achieve SOTA results in shape faithfulness without a degradation in text +alignment or image realism according to both automatic metrics and annotator +ratings. Our data and code will be made available at +https://shape-guided-diffusion.github.io. + +
+
+ comment: WACV 2024 +
+
+
+
+
+ + ♻ ☆ HAL3D: Hierarchical Active Learning for Fine-Grained 3D Part Labeling ICCV 2023 + + +
+ We present the first active learning tool for fine-grained 3D part labeling, +a problem which challenges even the most advanced deep learning (DL) methods +due to the significant structural variations among the small and intricate +parts. For the same reason, the necessary data annotation effort is tremendous, +motivating approaches to minimize human involvement. Our labeling tool +iteratively verifies or modifies part labels predicted by a deep neural +network, with human feedback continually improving the network prediction. To +effectively reduce human efforts, we develop two novel features in our tool, +hierarchical and symmetry-aware active labeling. Our human-in-the-loop +approach, coined HAL3D, achieves 100% accuracy (barring human errors) on any +test set with pre-defined hierarchical part labels, with 80% time-saving over +manual effort. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ SymTC: A Symbiotic Transformer-CNN Net for Instance Segmentation of + Lumbar Spine MRI + + +
+ Intervertebral disc disease, a prevalent ailment, frequently leads to +intermittent or persistent low back pain, and diagnosing and assessing this +disease rely on accurate measurement of vertebral bone and intervertebral disc +geometries from lumbar MR images. Deep neural network (DNN) models may assist +clinicians with more efficient image segmentation of individual instances +(disks and vertebrae) of the lumbar spine in an automated way, which is termed +instance image segmentation. In this work, we proposed SymTC, an innovative +lumbar spine MR image segmentation model that combines the strengths of +Transformer and Convolutional Neural Network (CNN). Specifically, we designed a +parallel dual-path architecture to merge CNN layers and Transformer layers, and +we integrated a novel position embedding into the self-attention module of +Transformer, enhancing the utilization of positional information for more +accurate segmentation. To further improve model performance, we introduced a +new data augmentation technique to create a synthetic yet realistic MR image +dataset, named SSMSpine, which is made publicly available. We evaluated our +SymTC and 15 other existing image segmentation models on our private +in-house dataset and the public SSMSpine dataset, using two metrics, Dice +Similarity Coefficient and 95% Hausdorff Distance. The results show that our +SymTC has the best performance for segmenting vertebral bones and +intervertebral discs in lumbar spine MR images. The SymTC code and SSMSpine +dataset are available at https://github.com/jiasongchen/SymTC. + +

+
+
+
+
+ + ♻ ☆ Multimodal Representation Learning by Alternating Unimodal Adaptation CVPR 2024 + + +
+ Multimodal learning, which integrates data from diverse sensory modes, plays +a pivotal role in artificial intelligence. However, existing multimodal +learning methods often struggle with challenges where some modalities appear +more dominant than others during multimodal learning, resulting in suboptimal +performance. To address this challenge, we propose MLA (Multimodal Learning +with Alternating Unimodal Adaptation). MLA reframes the conventional joint +multimodal learning process by transforming it into an alternating unimodal +learning process, thereby minimizing interference between modalities. +Simultaneously, it captures cross-modal interactions through a shared head, +which undergoes continuous optimization across different modalities. This +optimization process is controlled by a gradient modification mechanism to +prevent the shared head from losing previously acquired information. During the +inference phase, MLA utilizes a test-time uncertainty-based model fusion +mechanism to integrate multimodal information. Extensive experiments are +conducted on five diverse datasets, encompassing scenarios with complete +modalities and scenarios with missing modalities. These experiments demonstrate +the superiority of MLA over competing prior approaches. Our code is available +at +https://github.com/Cecile-hi/Multimodal-Learning-with-Alternating-Unimodal-Adaptation. + +
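The alternating scheme can be sketched as a training loop that updates one unimodal encoder at a time while every modality passes through a single shared head; the gradient-modification and test-time uncertainty fusion steps are left out, so this is only a skeleton of the idea:

```python
import torch.nn.functional as F

def train_epoch_alternating(encoders, shared_head, optimizers, loaders):
    """encoders, optimizers, loaders: dicts keyed by modality name;
    shared_head: module shared by all modalities. Each optimizer is assumed to
    cover its encoder's parameters plus the shared head's parameters."""
    for modality, loader in loaders.items():
        encoder, opt = encoders[modality], optimizers[modality]
        for inputs, labels in loader:
            logits = shared_head(encoder(inputs))   # unimodal pass, shared head
            loss = F.cross_entropy(logits, labels)
            opt.zero_grad()
            loss.backward()    # MLA additionally rectifies shared-head gradients
            opt.step()         # to keep earlier modalities from being forgotten
```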
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Open3DSG: Open-Vocabulary 3D Scene Graphs from Point Clouds with + Queryable Objects and Open-Set Relationships CVPR 2024 + + +
+ Current approaches for 3D scene graph prediction rely on labeled datasets to +train models for a fixed set of known object classes and relationship +categories. We present Open3DSG, an alternative approach to learn 3D scene +graph prediction in an open world without requiring labeled scene graph data. +We co-embed the features from a 3D scene graph prediction backbone with the +feature space of powerful open world 2D vision language foundation models. This +enables us to predict 3D scene graphs from 3D point clouds in a zero-shot +manner by querying object classes from an open vocabulary and predicting the +inter-object relationships from a grounded LLM with scene graph features and +queried object classes as context. Open3DSG is the first 3D point cloud method +to predict not only explicit open-vocabulary object classes, but also open-set +relationships that are not limited to a predefined label set, making it +possible to express rare as well as specific objects and relationships in the +predicted 3D scene graph. Our experiments show that Open3DSG is effective at +predicting arbitrary object classes as well as their complex inter-object +relationships describing spatial, supportive, semantic and comparative +relationships. + +
+
+ comment: CVPR 2024. Project page: https://kochsebastian.com/open3dsg +
+
+
+
+
+ + ♻ ☆ Introducing an ensemble method for the early detection of Alzheimer's + disease through the analysis of PET scan images + + +
+ Alzheimer's disease is a progressive neurodegenerative disorder that +primarily affects cognitive functions such as memory, thinking, and behavior. +In this disease, mild cognitive impairment (MCI) is a critical phase that is +important to diagnose early, since some patients with progressive MCI +will develop the disease. This study delves into the challenging task of +classifying Alzheimer's disease into four distinct groups: control normal (CN), +progressive mild cognitive impairment (pMCI), stable mild cognitive impairment +(sMCI), and Alzheimer's disease (AD). This classification is based on a +thorough examination of PET scan images obtained from the ADNI dataset, which +provides a detailed view of the disease's progression. Several +deep-learning and traditional machine-learning models have been used to detect +Alzheimer's disease. In this paper, three deep-learning models, namely VGG16, +AlexNet, and a custom convolutional neural network (CNN), have been used for +classification with 8-fold cross-validation. Finally, an ensemble +technique is used to improve the overall result of these models. The results +show that using deep-learning models to distinguish between MCI +patients gives an overall average accuracy of 93.13% and an AUC of 94.4%. + +

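The ensembling step amounts to combining the class posteriors of the individual networks; below is a soft-voting sketch with placeholder model handles and uniform weights, not the paper's exact configuration:

```python
import torch

@torch.no_grad()
def soft_vote(models, images, weights=None):
    """Average the softmax outputs of several classifiers (e.g., VGG16, AlexNet,
    and a custom CNN) over the four classes CN / pMCI / sMCI / AD."""
    weights = weights or [1.0 / len(models)] * len(models)
    probs = sum(w * m(images).softmax(dim=-1) for w, m in zip(weights, models))
    return probs.argmax(dim=-1)   # predicted class index per PET scan
```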
+
+
+
+
+ + ♻ ☆ Supplementing Missing Visions via Dialog for Scene Graph Generations ICASSP 2024 + + +
+ Most current AI systems rely on the premise that the input visual data are +sufficient to achieve competitive performance in various computer vision tasks. +However, the classic task setup rarely considers the challenging, yet common +practical situations where the complete visual data may be inaccessible due to +various reasons (e.g., restricted view range and occlusions). To this end, we +investigate a computer vision task setting with incomplete visual input data. +Specifically, we exploit the Scene Graph Generation (SGG) task with various +levels of visual data missingness as input. While insufficient visual input +intuitively leads to performance drop, we propose to supplement the missing +visions via the natural language dialog interactions to better accomplish the +task objective. We design a model-agnostic Supplementary Interactive Dialog +(SI-Dial) framework that can be jointly learned with most existing models, +endowing the current AI systems with the ability of question-answer +interactions in natural language. We demonstrate the feasibility of such a task +setting with missing visual input and the effectiveness of our proposed dialog +module as the supplementary information source through extensive experiments +and analysis, by achieving promising performance improvement over multiple +baselines. + +
+
+ comment: ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Segment Anything in Medical Images + + +
+ Medical image segmentation is a critical component in clinical practice, +facilitating accurate diagnosis, treatment planning, and disease monitoring. +However, existing methods, often tailored to specific modalities or disease +types, lack generalizability across the diverse spectrum of medical image +segmentation tasks. Here we present MedSAM, a foundation model designed for +bridging this gap by enabling universal medical image segmentation. The model +is developed on a large-scale medical image dataset with 1,570,263 image-mask +pairs, covering 10 imaging modalities and over 30 cancer types. We conduct a +comprehensive evaluation on 86 internal validation tasks and 60 external +validation tasks, demonstrating better accuracy and robustness than +modality-wise specialist models. By delivering accurate and efficient +segmentation across a wide spectrum of tasks, MedSAM holds significant +potential to expedite the evolution of diagnostic tools and the personalization +of treatment plans. + +
+
+
+
+
+ + ♻ ☆ The Multi-modality Cell Segmentation Challenge: Towards Universal + Solutions NeurIPS22 + + +
+ Cell segmentation is a critical step for quantitative single-cell analysis in +microscopy images. Existing cell segmentation methods are often tailored to +specific modalities or require manual interventions to specify hyper-parameters +in different experimental settings. Here, we present a multi-modality cell +segmentation benchmark, comprising over 1500 labeled images derived from more +than 50 diverse biological experiments. The top participants developed a +Transformer-based deep-learning algorithm that not only exceeds existing +methods but can also be applied to diverse microscopy images across imaging +platforms and tissue types without manual parameter adjustments. This benchmark +and the improved algorithm offer promising avenues for more accurate and +versatile cell analysis in microscopy imaging. + +
+
+ comment: NeurIPS22 Cell Segmentation Challenge: + https://neurips22-cellseg.grand-challenge.org/ . Nature Methods (2024) +
+
+
+
+
+ + ♻ ☆ Copilot4D: Learning Unsupervised World Models for Autonomous Driving via + Discrete Diffusion ICLR 2024 + + +
+ Learning world models can teach an agent how the world works in an +unsupervised manner. Even though it can be viewed as a special case of sequence +modeling, progress for scaling world models on robotic applications such as +autonomous driving has been somewhat less rapid than scaling language models +with Generative Pre-trained Transformers (GPT). We identify two reasons as +major bottlenecks: dealing with complex and unstructured observation space, and +having a scalable generative model. Consequently, we propose Copilot4D, a novel +world modeling approach that first tokenizes sensor observations with VQVAE, +then predicts the future via discrete diffusion. To efficiently decode and +denoise tokens in parallel, we recast Masked Generative Image Transformer as +discrete diffusion and enhance it with a few simple changes, resulting in +notable improvement. When applied to learning world models on point cloud +observations, Copilot4D reduces prior SOTA Chamfer distance by more than 65% +for 1s prediction, and more than 50% for 3s prediction, across NuScenes, KITTI +Odometry, and Argoverse2 datasets. Our results demonstrate that discrete +diffusion on tokenized agent experience can unlock the power of GPT-like +unsupervised learning for robotics. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Sat2Scene: 3D Urban Scene Generation from Satellite Images with + Diffusion + + +
+ Directly generating scenes from satellite imagery offers exciting +possibilities for integration into applications like games and map services. +However, challenges arise from significant view changes and scene scale. +Previous efforts mainly focused on image or video generation, lacking +exploration into the adaptability of scene generation for arbitrary views. +Existing 3D generation works either operate at the object level or struggle to +utilize the geometry obtained from satellite imagery. To overcome +these limitations, we propose a novel architecture for direct 3D scene +generation by introducing diffusion models into 3D sparse representations and +combining them with neural rendering techniques. Specifically, our approach +first generates texture colors at the point level for a given geometry using a 3D +diffusion model, and then transforms them into a scene representation in +a feed-forward manner. The representation can be utilized to render arbitrary +views that excel in both single-frame quality and inter-frame +consistency. Experiments on two city-scale datasets show that our model +demonstrates proficiency in generating photo-realistic street-view image +sequences and cross-view urban scenes from satellite imagery. + +

+
+
+
+
+ + ♻ ☆ Task-Oriented Communication for Edge Video Analytics + + +
+ With the development of artificial intelligence (AI) techniques and the +increasing popularity of camera-equipped devices, many edge video analytics +applications are emerging, calling for the deployment of computation-intensive +AI models at the network edge. Edge inference is a promising solution to move +the computation-intensive workloads from low-end devices to a powerful edge +server for video analytics, but the device-server communications will remain a +bottleneck due to the limited bandwidth. This paper proposes a task-oriented +communication framework for edge video analytics, where multiple devices +collect the visual sensory data and transmit the informative features to an +edge server for processing. To enable low-latency inference, this framework +removes video redundancy in spatial and temporal domains and transmits minimal +information that is essential for the downstream task, rather than +reconstructing the videos at the edge server. Specifically, it extracts compact +task-relevant features based on the deterministic information bottleneck (IB) +principle, which characterizes a tradeoff between the informativeness of the +features and the communication cost. As the features of consecutive frames are +temporally correlated, we propose a temporal entropy model (TEM) to reduce the +bitrate by taking the previous features as side information in feature +encoding. To further improve the inference performance, we build a +spatial-temporal fusion module at the server to integrate features of the +current and previous frames for joint inference. Extensive experiments on video +analytics tasks evidence that the proposed framework effectively encodes +task-relevant information of video data and achieves a better rate-performance +tradeoff than existing methods. + +
+
+ comment: This paper was accepted to IEEE Transactions on Wireless + Communications (TWC) +
+
+
+
+
+ + ♻ ☆ Grounded Question-Answering in Long Egocentric Videos CVPR 2024 + + +
+ Existing approaches to video understanding, mainly designed for short videos +from a third-person perspective, are limited in their applicability in certain +fields, such as robotics. In this paper, we delve into open-ended +question-answering (QA) in long, egocentric videos, which allows individuals or +robots to inquire about their own past visual experiences. This task presents +unique challenges, including the complexity of temporally grounding queries +within extensive video content, the high resource demands for precise data +annotation, and the inherent difficulty of evaluating open-ended answers due to +their ambiguous nature. Our proposed approach tackles these challenges by (i) +integrating query grounding and answering within a unified model to reduce +error propagation; (ii) employing large language models for efficient and +scalable data synthesis; and (iii) introducing a close-ended QA task for +evaluation, to manage answer ambiguity. Extensive experiments demonstrate the +effectiveness of our method, which also achieves state-of-the-art performance +on the QaEgo4D and Ego4D-NLQ benchmarks. Code, data, and models are available +at https://github.com/Becomebright/GroundVQA. + +
+
+ comment: Accepted to CVPR 2024. Project website at https://dszdsz.cn/GroundVQA +
+
+
+
+
+ + ♻ ☆ CricaVPR: Cross-image Correlation-aware Representation Learning for + Visual Place Recognition CVPR2024 + + +
+ Over the past decade, most methods in visual place recognition (VPR) have +used neural networks to produce feature representations. These networks +typically produce a global representation of a place image using only this +image itself and neglect the cross-image variations (e.g. viewpoint and +illumination), which limits their robustness in challenging scenes. In this +paper, we propose a robust global representation method with cross-image +correlation awareness for VPR, named CricaVPR. Our method uses the attention +mechanism to correlate multiple images within a batch. These images can be +taken in the same place with different conditions or viewpoints, or even +captured from different places. Therefore, our method can utilize the +cross-image variations as a cue to guide the representation learning, which +ensures more robust features are produced. To further facilitate the +robustness, we propose a multi-scale convolution-enhanced adaptation method to +adapt pre-trained visual foundation models to the VPR task, which introduces +the multi-scale local information to further enhance the cross-image +correlation-aware representation. Experimental results show that our method +outperforms state-of-the-art methods by a large margin with significantly less +training time. The code is released at https://github.com/Lu-Feng/CricaVPR. + +
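The cross-image correlation can be sketched as self-attention applied across the batch dimension of per-image descriptors, so each representation is refined using the other images in the batch; the real model applies this to regional features together with a multi-scale adapter, so the code below is only a simplified illustration:

```python
import torch
import torch.nn as nn

class CrossImageEncoder(nn.Module):
    """Treat the images of a batch as one token sequence and let them attend to
    each other, producing batch-contextualized descriptors."""
    def __init__(self, dim, num_heads=8, num_layers=2):
        super().__init__()
        layer = nn.TransformerEncoderLayer(d_model=dim, nhead=num_heads,
                                           batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=num_layers)

    def forward(self, descriptors):
        # descriptors: (B, dim), one global feature per image in the batch
        tokens = descriptors.unsqueeze(0)        # (1, B, dim): batch as sequence
        return self.encoder(tokens).squeeze(0)   # (B, dim) correlation-aware output
```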
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ MobileCLIP: Fast Image-Text Models through Multi-Modal Reinforced + Training CVPR 2024 + + +
+ Contrastive pretraining of image-text foundation models, such as CLIP, +demonstrated excellent zero-shot performance and improved robustness on a wide +range of downstream tasks. However, these models utilize large +transformer-based encoders with significant memory and latency overhead which +pose challenges for deployment on mobile devices. In this work, we introduce +MobileCLIP -- a new family of efficient image-text models optimized for runtime +performance along with a novel and efficient training approach, namely +multi-modal reinforced training. The proposed training approach leverages +knowledge transfer from an image captioning model and an ensemble of strong +CLIP encoders to improve the accuracy of efficient models. Our approach avoids +train-time compute overhead by storing the additional knowledge in a reinforced +dataset. MobileCLIP sets a new state-of-the-art latency-accuracy tradeoff for +zero-shot classification and retrieval tasks on several datasets. Our +MobileCLIP-S2 variant is 2.3$\times$ faster and more accurate than the +previous best CLIP model based on ViT-B/16. We further demonstrate the +effectiveness of our multi-modal reinforced training by training a CLIP model +based on a ViT-B/16 image backbone and achieving +2.9% average performance +improvement on 38 evaluation benchmarks compared to the previous best. +Moreover, we show that the proposed approach achieves 10$\times$-1000$\times$ +improved learning efficiency when compared with non-reinforced CLIP training. +Code and models are available at https://github.com/apple/ml-mobileclip. + +

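The reinforced-training idea amounts to storing teacher outputs offline (for example, embeddings from an ensemble of strong CLIP encoders) and adding a distillation term to the student's contrastive loss, so no teacher forward passes are needed at train time. A sketch of one such loss combination; the weighting and exact targets are assumptions rather than the paper's recipe:

```python
import torch
import torch.nn.functional as F

def reinforced_clip_loss(img_emb, txt_emb, teacher_img_emb, teacher_txt_emb,
                         temperature=0.07, lam=1.0):
    """Standard CLIP contrastive loss plus KL distillation toward the similarity
    matrix computed from stored (pre-computed) teacher embeddings."""
    img = F.normalize(img_emb, dim=-1)
    txt = F.normalize(txt_emb, dim=-1)
    logits = img @ txt.t() / temperature
    targets = torch.arange(img.shape[0], device=img.device)
    contrastive = 0.5 * (F.cross_entropy(logits, targets) +
                         F.cross_entropy(logits.t(), targets))

    with torch.no_grad():
        t_img = F.normalize(teacher_img_emb, dim=-1)
        t_txt = F.normalize(teacher_txt_emb, dim=-1)
        teacher_logits = t_img @ t_txt.t() / temperature
    distill = F.kl_div(F.log_softmax(logits, dim=-1),
                       F.softmax(teacher_logits, dim=-1), reduction="batchmean")
    return contrastive + lam * distill
```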
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ TextFormer: A Query-based End-to-End Text Spotter with Mixed Supervision + + +
+ End-to-end text spotting is a vital computer vision task that aims to +integrate scene text detection and recognition into a unified framework. +Typical methods heavily rely on Region-of-Interest (RoI) operations to extract +local features and complex post-processing steps to produce final predictions. +To address these limitations, we propose TextFormer, a query-based end-to-end +text spotter with Transformer architecture. Specifically, using query embedding +per text instance, TextFormer builds upon an image encoder and a text decoder +to learn a joint semantic understanding for multi-task modeling. It allows for +mutual training and optimization of classification, segmentation, and +recognition branches, resulting in deeper feature sharing without sacrificing +flexibility or simplicity. Additionally, we design an Adaptive Global +aGgregation (AGG) module to transfer global features into sequential features +for reading arbitrarily-shaped texts, which overcomes the sub-optimization +problem of RoI operations. Furthermore, potential corpus information is +utilized from weak annotations to full labels through mixed supervision, +further improving text detection and end-to-end text spotting results. +Extensive experiments on various bilingual (i.e., English and Chinese) +benchmarks demonstrate the superiority of our method. Especially on TDA-ReCTS +dataset, TextFormer surpasses the state-of-the-art method in terms of 1-NED by +13.2%. + +
+
+ comment: Machine Intelligence Research, MIR 2024 +
+
+
+
+
+ + ♻ ☆ NViST: In the Wild New View Synthesis from a Single Image with + Transformers CVPR 2024 + + +
+ We propose NViST, a transformer-based model for efficient and generalizable +novel-view synthesis from a single image for real-world scenes. In contrast to +many methods that are trained on synthetic data, object-centred scenarios, or +in a category-specific manner, NViST is trained on MVImgNet, a large-scale +dataset of casually-captured real-world videos of hundreds of object categories +with diverse backgrounds. NViST transforms image inputs directly into a +radiance field, conditioned on camera parameters via adaptive layer +normalisation. In practice, NViST exploits fine-tuned masked autoencoder (MAE) +features and translates them to 3D output tokens via cross-attention, while +addressing occlusions with self-attention. To move away from object-centred +datasets and enable full scene synthesis, NViST adopts a 6-DOF camera pose +model and only requires relative pose, dropping the need for canonicalization +of the training data, which removes a substantial barrier to it being used on +casually captured datasets. We show results on unseen objects and categories +from MVImgNet and even generalization to casual phone captures. We conduct +qualitative and quantitative evaluations on MVImgNet and ShapeNet to show that +our model represents a step forward towards enabling true in-the-wild +generalizable novel-view synthesis from a single image. Project webpage: +https://wbjang.github.io/nvist_webpage. + +
+
+ comment: CVPR 2024, Project page: https://wbjang.github.io/nvist_webpage +
+
+
+
+
+ + ♻ ☆ Contrastive Denoising Score for Text-guided Latent Diffusion Image + Editing CVPR 2024 + + +
+ With the remarkable advent of text-to-image diffusion models, image editing +methods have become more diverse and continue to evolve. A promising recent +approach in this realm is Delta Denoising Score (DDS) - an image editing +technique based on the Score Distillation Sampling (SDS) framework that leverages +the rich generative prior of text-to-image diffusion models. However, relying +solely on the difference between scoring functions is insufficient for +preserving specific structural elements from the original image, a crucial +aspect of image editing. To address this, here we present an embarrassingly +simple yet very powerful modification of DDS, called Contrastive Denoising +Score (CDS), for latent diffusion models (LDM). Inspired by the similarities +and differences between DDS and the contrastive learning for unpaired +image-to-image translation (CUT), we introduce a straightforward approach using +CUT loss within the DDS framework. Rather than employing auxiliary networks as +in the original CUT approach, we leverage the intermediate features of LDM, +specifically those from the self-attention layers, which possess rich spatial +information. Our approach enables zero-shot image-to-image translation and +neural radiance field (NeRF) editing, achieving structural correspondence +between the input and output while maintaining content controllability. +Qualitative results and comparisons demonstrate the effectiveness of our +proposed method. Project page: https://hyelinnam.github.io/CDS/ + +

+
+ comment: CVPR 2024 (poster); Project page: https://hyelinnam.github.io/CDS/ +
+
+
+
+
+ + ♻ ☆ QUAR-VLA: Vision-Language-Action Model for Quadruped Robots + + +
+ An important manifestation of robot intelligence is the ability to interact +naturally and make decisions autonomously. Traditional approaches to robot +control often compartmentalize perception, planning, and decision-making, +simplifying system design but limiting the synergy between different +information streams. This compartmentalization poses challenges in achieving +seamless autonomous reasoning, decision-making, and action execution. To +address these limitations, a novel paradigm, named Vision-Language-Action tasks +for QUAdruped Robots (QUAR-VLA), has been introduced in this paper. This +approach tightly integrates visual information and instructions to generate +executable actions, effectively merging perception, planning, and +decision-making. The central idea is to elevate the overall intelligence of the +robot. Within this framework, a notable challenge lies in aligning fine-grained +instructions with visual perception information. This emphasizes the complexity +involved in ensuring that the robot accurately interprets and acts upon +detailed instructions in harmony with its visual observations. Consequently, we +propose QUAdruped Robotic Transformer (QUART), a family of VLA models that take +visual information and instructions from diverse modalities as input +and generate executable actions for real-world robots, and we present the QUAdruped +Robot Dataset (QUARD), a large-scale multi-task dataset including navigation, +complex terrain locomotion, and whole-body manipulation tasks for training +QUART models. Our extensive evaluation (4000 evaluation trials) shows that our +approach leads to performant robotic policies and enables QUART to obtain a +range of emergent capabilities. + +

+
+
+
+
+ + ♻ ☆ OpenStereo: A Comprehensive Benchmark for Stereo Matching and Strong + Baseline + + +
+ Stereo matching aims to estimate the disparity between matching pixels in a +stereo image pair, which is of great importance to robotics, autonomous +driving, and other computer vision tasks. Despite the development of numerous +impressive methods in recent years, replicating their results and determining +the most suitable architecture for practical application remains challenging. +Addressing this gap, our paper introduces a comprehensive benchmark focusing on +practical applicability rather than solely on performance enhancement. +Specifically, we develop a flexible and efficient stereo matching codebase, +called OpenStereo. OpenStereo includes training and inference code for more +than 10 network models, making it, to our knowledge, the most complete stereo +matching toolbox available. Based on OpenStereo, we conducted experiments and +have achieved or surpassed the performance metrics reported in the original +papers. Additionally, we carry out an exhaustive analysis and deconstruction of +recent developments in stereo matching through comprehensive ablative +experiments. These investigations inspired the creation of StereoBase, a strong +baseline model. Our StereoBase ranks 1st among published methods on SceneFlow, +KITTI 2015, and KITTI 2012 (Reflective) and achieves the best performance across +all metrics. In addition, StereoBase has strong cross-dataset +generalization. Code is available at +\url{https://github.com/XiandaGuo/OpenStereo}. + +

+
+ comment: Code is available at: https://github.com/XiandaGuo/OpenStereo +
+
+
+
+
+ + ♻ ☆ Multiscale and Multilayer Contrastive Learning for Domain Generalization + + +
+ During the past decade, deep neural networks have led to fast-paced progress +and significant achievements in computer vision problems, for both academia and +industry. Yet despite their success, state-of-the-art image classification +approaches fail to generalize well in previously unseen visual contexts, as +required by many real-world applications. In this paper, we focus on this +domain generalization (DG) problem and argue that the generalization ability of +deep convolutional neural networks can be improved by taking advantage of +multi-layer and multi-scaled representations of the network. We introduce a +framework that aims at improving domain generalization of image classifiers by +combining both low-level and high-level features at multiple scales, enabling +the network to implicitly disentangle representations in its latent space and +learn domain-invariant attributes of the depicted objects. Additionally, to +further facilitate robust representation learning, we propose a novel objective +function, inspired by contrastive learning, which aims at constraining the +extracted representations to remain invariant under distribution shifts. We +demonstrate the effectiveness of our method by evaluating on the domain +generalization datasets of PACS, VLCS, Office-Home and NICO. Through extensive +experimentation, we show that our model is able to surpass the performance of +previous DG methods and consistently produce competitive and state-of-the-art +results in all datasets. + +

+
+ comment: Manuscript accepted in: IEEE Transactions on Artificial Intelligence + (March 2024) +
+
+
+
+
+ + ♻ ☆ Solving Diffusion ODEs with Optimal Boundary Conditions for Better Image + Super-Resolution ICLR 2024 + + +
+ Diffusion models, as a kind of powerful generative model, have given +impressive results on image super-resolution (SR) tasks. However, due to the +randomness introduced in the reverse process of diffusion models, the +performance of diffusion-based SR models fluctuates from one sampling run to +another, especially for samplers with few resampled steps. This inherent +randomness of diffusion models results in ineffectiveness and instability, +making it challenging for users to guarantee the quality of SR results. +However, our work takes this randomness as an opportunity: fully analyzing and +leveraging it leads to the construction of an effective plug-and-play sampling +method that has the potential to benefit a series of diffusion-based SR +methods. In more detail, we propose to steadily sample high-quality SR images +from pre-trained diffusion-based SR models by solving diffusion ordinary +differential equations (diffusion ODEs) with optimal boundary conditions (BCs) +and analyze the relationship between the choices of BCs and their +corresponding SR results. Our analysis shows the route to obtain an +approximately optimal BC via an efficient exploration in the whole space. The +quality of SR results sampled by the proposed method with fewer steps +outperforms the quality of results sampled by current methods with randomness +from the same pre-trained diffusion-based SR model, which means that our +sampling method "boosts" current diffusion-based SR models without any +additional training. + +

+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Bilateral Propagation Network for Depth Completion CVPR 2024 + + +
+ Depth completion aims to derive a dense depth map from sparse depth +measurements with a synchronized color image. Current state-of-the-art (SOTA) +methods are predominantly propagation-based, which work as an iterative +refinement on the initial estimated dense depth. However, the initial depth +estimations mostly result from direct applications of convolutional layers on +the sparse depth map. In this paper, we present a Bilateral Propagation Network +(BP-Net), that propagates depth at the earliest stage to avoid directly +convolving on sparse data. Specifically, our approach propagates the target +depth from nearby depth measurements via a non-linear model, whose coefficients +are generated through a multi-layer perceptron conditioned on both +\emph{radiometric difference} and \emph{spatial distance}. By integrating +bilateral propagation with multi-modal fusion and depth refinement in a +multi-scale framework, our BP-Net demonstrates outstanding performance on both +indoor and outdoor scenes. It achieves SOTA on the NYUv2 dataset and ranks 1st +on the KITTI depth completion benchmark at the time of submission. Experimental +results not only show the effectiveness of bilateral propagation but also +emphasize the significance of early-stage propagation in contrast to the +refinement stage. Our code and trained models will be available on the project +page. + +
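The bilateral propagation step can be read as: for each target pixel, combine the depths of nearby sparse measurements using coefficients produced by a small MLP that looks at radiometric difference and spatial distance. A simplified sketch that assumes the k nearest sparse neighbours have already been gathered (the multi-scale fusion and refinement stages are omitted):

```python
import torch
import torch.nn as nn

class BilateralPropagation(nn.Module):
    """Predict depth at each target pixel as an MLP-weighted combination of its
    k nearest sparse measurements (a simplified illustration of the idea)."""
    def __init__(self, hidden=32):
        super().__init__()
        # per-neighbour input: |color difference| (3) + spatial offset (2)
        self.mlp = nn.Sequential(nn.Linear(5, hidden), nn.ReLU(),
                                 nn.Linear(hidden, 1))

    def forward(self, neighbor_depth, color_diff, offset):
        # neighbor_depth: (B, P, K); color_diff: (B, P, K, 3); offset: (B, P, K, 2)
        feats = torch.cat([color_diff.abs(), offset], dim=-1)   # (B, P, K, 5)
        weights = torch.softmax(self.mlp(feats).squeeze(-1), dim=-1)
        return (weights * neighbor_depth).sum(dim=-1)           # (B, P) depth
```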
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Emotional Speech-driven 3D Body Animation via Disentangled Latent + Diffusion CVPR + + +
+ Existing methods for synthesizing 3D human gestures from speech have shown +promising results, but they do not explicitly model the impact of emotions on +the generated gestures. Instead, these methods directly output animations from +speech without control over the expressed emotion. To address this limitation, +we present AMUSE, an emotional speech-driven body animation model based on +latent diffusion. Our observation is that content (i.e., gestures related to +speech rhythm and word utterances), emotion, and personal style are separable. +To account for this, AMUSE maps the driving audio to three disentangled latent +vectors: one for content, one for emotion, and one for personal style. A latent +diffusion model, trained to generate gesture motion sequences, is then +conditioned on these latent vectors. Once trained, AMUSE synthesizes 3D human +gestures directly from speech with control over the expressed emotions and +style by combining the content from the driving speech with the emotion and +style of another speech sequence. Randomly sampling the noise of the diffusion +model further generates variations of the gesture with the same emotional +expressivity. Qualitative, quantitative, and perceptual evaluations demonstrate +that AMUSE outputs realistic gesture sequences. Compared to the state of the +art, the generated gestures are better synchronized with the speech content, +and better represent the emotion expressed by the input speech. Our code is +available at amuse.is.tue.mpg.de. + +
+
+ comment: Conference on Computer Vision and Pattern Recognition (CVPR) 2024. + Webpage: https://amuse.is.tue.mpg.de/ +
+
+
+
+
+ + ♻ ☆ Dense Supervision Propagation for Weakly Supervised Semantic + Segmentation on 3D Point Clouds + + +
+ Semantic segmentation on 3D point clouds is an important task for 3D scene +understanding. While dense labeling on 3D data is expensive and time-consuming, +only a few works explore weakly supervised semantic point cloud segmentation +to relieve the labeling cost by learning from simpler and cheaper +labels. Meanwhile, there are still huge performance gaps between existing +weakly supervised methods and state-of-the-art fully supervised methods. In +this paper, we train a semantic point cloud segmentation network with only a +small portion of points being labeled. We argue that we can better utilize the +limited supervision information as we densely propagate the supervision signal +from the labeled points to other points within and across the input samples. +Specifically, we propose a cross-sample feature reallocating module to transfer +similar features and therefore re-route the gradients across two samples with +common classes and an intra-sample feature redistribution module to propagate +supervision signals on unlabeled points across and within point cloud samples. +We conduct extensive experiments on public datasets S3DIS and ScanNet. Our +weakly supervised method with only 10% and 1% of labels can produce results +comparable with the fully supervised counterpart. + +

+
+
+
+
+ + ♻ ☆ Towards Learning a Generalist Model for Embodied Navigation CVPR 2024 + + +
+ Building a generalist agent that can interact with the world is an intriguing
+goal for AI systems, spurring research on embodied navigation, where an agent
+is required to navigate according to instructions or respond to queries.
+Despite the major progress attained, previous works primarily focus on
+task-specific agents and lack generalizability to unseen scenarios. Recently,
+LLMs have presented remarkable capabilities across various fields, and provided
+a promising opportunity for embodied navigation. Drawing on this, we propose
+the first generalist model for embodied navigation, NaviLLM. It adapts LLMs to
+embodied navigation by introducing schema-based instruction. The schema-based
+instruction flexibly casts various tasks into generation problems, thereby
+unifying a wide range of tasks. This approach allows us to integrate diverse
+data sources from various datasets into the training, equipping NaviLLM with a
+wide range of capabilities required by embodied navigation. We conduct
+extensive experiments to evaluate the performance and generalizability of our
+model. The experimental results demonstrate that our unified model achieves
+state-of-the-art performance on CVDN, SOON, and ScanQA. Specifically, it
+surpasses the previous state-of-the-art method by a significant margin of 29%
+in goal progress on CVDN. Moreover, our model also demonstrates strong
+generalizability and presents impressive results on unseen tasks, e.g.,
+embodied question answering and 3D captioning.
+
+
+
+ comment: Accepted by CVPR 2024 (14 pages, 3 figures) +
+
+
+
+
+ + ♻ ☆ HiCMAE: Hierarchical Contrastive Masked Autoencoder for Self-Supervised + Audio-Visual Emotion Recognition + + +
+ Audio-Visual Emotion Recognition (AVER) has garnered increasing attention in
+recent years for its critical role in creating emotion-aware intelligent
+machines. Previous efforts in this area are dominated by the supervised
+learning paradigm. Despite significant progress, supervised learning is meeting
+its bottleneck due to the longstanding data scarcity issue in AVER. Motivated
+by recent advances in self-supervised learning, we propose Hierarchical
+Contrastive Masked Autoencoder (HiCMAE), a novel self-supervised framework that
+leverages large-scale self-supervised pre-training on vast unlabeled
+audio-visual data to promote the advancement of AVER. Following prior art in
+self-supervised audio-visual representation learning, HiCMAE adopts two primary
+forms of self-supervision for pre-training, namely masked data modeling and
+contrastive learning. Unlike these methods, which focus exclusively on
+top-layer representations while neglecting explicit guidance for intermediate
+layers, HiCMAE develops a three-pronged strategy to foster hierarchical
+audio-visual feature learning and improve the overall quality of learned
+representations. To verify the effectiveness of HiCMAE, we conduct extensive
+experiments on 9 datasets covering both categorical and dimensional AVER tasks.
+Experimental results show that our method significantly outperforms
+state-of-the-art supervised and self-supervised audio-visual methods, which
+indicates that HiCMAE is a powerful audio-visual emotion representation
+learner. Codes and models will be publicly available at
+https://github.com/sunlicai/HiCMAE.
+
+
+
+ comment: Accepted by Information Fusion. The code is available at + https://github.com/sunlicai/HiCMAE +
+
+
+
+
+ + ♻ ☆ Invariant Representation via Decoupling Style and Spurious Features from + Images + + +
+ This paper considers the out-of-distribution (OOD) generalization problem
+under the setting that both style distribution shift and spurious features
+exist and domain labels are missing. This setting frequently arises in
+real-world applications and is often overlooked because previous approaches
+mainly handle only one of these two factors. The critical challenge is
+decoupling style and spurious features in the absence of domain labels. To
+address this challenge, we first propose a structural causal model (SCM) for
+the image generation process, which captures both style distribution shift and
+spurious features. The proposed SCM enables us to design a new framework called
+IRSS, which can gradually separate style distribution and spurious features
+from images by introducing adversarial neural networks and multi-environment
+optimization, thus achieving OOD generalization. Moreover, it does not require
+additional supervision (e.g., domain labels) other than the images and their
+corresponding labels. Experiments on benchmark datasets demonstrate that IRSS
+outperforms traditional OOD methods and solves the problem of invariant risk
+minimization (IRM) degradation, enabling the extraction of invariant features
+under distribution shift.
+
+
+
+ comment: 10 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ SemGauss-SLAM: Dense Semantic Gaussian Splatting SLAM + + +
+ We propose SemGauss-SLAM, the first semantic SLAM system utilizing 3D +Gaussian representation, that enables accurate 3D semantic mapping, robust +camera tracking, and high-quality rendering in real-time. In this system, we +incorporate semantic feature embedding into 3D Gaussian representation, which +effectively encodes semantic information within the spatial layout of the +environment for precise semantic scene representation. Furthermore, we propose +feature-level loss for updating 3D Gaussian representation, enabling +higher-level guidance for 3D Gaussian optimization. In addition, to reduce +cumulative drift and improve reconstruction accuracy, we introduce +semantic-informed bundle adjustment leveraging semantic associations for joint +optimization of 3D Gaussian representation and camera poses, leading to more +robust tracking and consistent mapping. Our SemGauss-SLAM method demonstrates +superior performance over existing dense semantic SLAM methods in terms of +mapping and tracking accuracy on Replica and ScanNet datasets, while also +showing excellent capabilities in novel-view semantic synthesis and 3D semantic +mapping. + +
+
+
+
+
+ + ♻ ☆ iMixer: hierarchical Hopfield network implies an invertible, implicit + and iterative MLP-Mixer + + +
+ In the last few years, the success of Transformers in computer vision has +stimulated the discovery of many alternative models that compete with +Transformers, such as the MLP-Mixer. Despite their weak inductive bias, these +models have achieved performance comparable to well-studied convolutional +neural networks. Recent studies on modern Hopfield networks suggest the +correspondence between certain energy-based associative memory models and +Transformers or MLP-Mixer, and shed some light on the theoretical background of +the Transformer-type architectures design. In this paper, we generalize the +correspondence to the recently introduced hierarchical Hopfield network, and +find iMixer, a novel generalization of MLP-Mixer model. Unlike ordinary +feedforward neural networks, iMixer involves MLP layers that propagate forward +from the output side to the input side. We characterize the module as an +example of invertible, implicit, and iterative mixing module. We evaluate the +model performance with various datasets on image classification tasks, and find +that iMixer, despite its unique architecture, exhibits stable learning +capabilities and achieves performance comparable to or better than the baseline +vanilla MLP-Mixer. The results imply that the correspondence between the +Hopfield networks and the Mixer models serves as a principle for understanding +a broader class of Transformer-like architecture designs. + +
+
+ comment: 19 pages. v2: minor improvements +
+
+
+
+
+ + ♻ ☆ StreamMultiDiffusion: Real-Time Interactive Generation with Region-Based + Semantic Control + + +
+ The enormous success of diffusion models in text-to-image synthesis has made +them promising candidates for the next generation of end-user applications for +image generation and editing. Previous works have focused on improving the +usability of diffusion models by reducing the inference time or increasing user +interactivity by allowing new, fine-grained controls such as region-based text +prompts. However, we empirically find that integrating both branches of works +is nontrivial, limiting the potential of diffusion models. To solve this +incompatibility, we present StreamMultiDiffusion, the first real-time +region-based text-to-image generation framework. By stabilizing fast inference +techniques and restructuring the model into a newly proposed multi-prompt +stream batch architecture, we achieve $\times 10$ faster panorama generation +than existing solutions, and the generation speed of 1.57 FPS in region-based +text-to-image synthesis on a single RTX 2080 Ti GPU. Our solution opens up a +new paradigm for interactive image generation named semantic palette, where +high-quality images are generated in real-time from given multiple hand-drawn +regions, encoding prescribed semantic meanings (e.g., eagle, girl). Our code +and demo application are available at +https://github.com/ironjr/StreamMultiDiffusion. + +
+
+ comment: 29 pages, 16 figures. v2: typos corrected, references added. Project + page: https://jaerinlee.com/research/StreamMultiDiffusion +
+
+
+
+
+ + ♻ ☆ LogoStyleFool: Vitiating Video Recognition Systems via Logo Style + Transfer AAAI 2024 + + +
+ Video recognition systems are vulnerable to adversarial examples. Recent +studies show that style transfer-based and patch-based unrestricted +perturbations can effectively improve attack efficiency. These attacks, +however, face two main challenges: 1) Adding large stylized perturbations to +all pixels reduces the naturalness of the video and such perturbations can be +easily detected. 2) Patch-based video attacks are not extensible to targeted +attacks due to the limited search space of reinforcement learning that has been +widely used in video attacks recently. In this paper, we focus on the video +black-box setting and propose a novel attack framework named LogoStyleFool by +adding a stylized logo to the clean video. We separate the attack into three +stages: style reference selection, reinforcement-learning-based logo style +transfer, and perturbation optimization. We solve the first challenge by +scaling down the perturbation range to a regional logo, while the second +challenge is addressed by complementing an optimization stage after +reinforcement learning. Experimental results substantiate the overall +superiority of LogoStyleFool over three state-of-the-art patch-based attacks in +terms of attack performance and semantic preservation. Meanwhile, LogoStyleFool +still maintains its performance against two existing patch-based defense +methods. We believe that our research is beneficial in increasing the attention +of the security community to such subregional style transfer attacks. + +
+
+ comment: 14 pages, 3 figures. Accepted to AAAI 2024 +
+
+
+
+
+ + ♻ ☆ StyleFool: Fooling Video Classification Systems via Style Transfer + + +
+ Video classification systems are vulnerable to adversarial attacks, which can +create severe security problems in video verification. Current black-box +attacks need a large number of queries to succeed, resulting in high +computational overhead in the process of attack. On the other hand, attacks +with restricted perturbations are ineffective against defenses such as +denoising or adversarial training. In this paper, we focus on unrestricted +perturbations and propose StyleFool, a black-box video adversarial attack via +style transfer to fool the video classification system. StyleFool first +utilizes color theme proximity to select the best style image, which helps +avoid unnatural details in the stylized videos. Meanwhile, the target class +confidence is additionally considered in targeted attacks to influence the +output distribution of the classifier by moving the stylized video closer to or +even across the decision boundary. A gradient-free method is then employed to +further optimize the adversarial perturbations. We carry out extensive +experiments to evaluate StyleFool on two standard datasets, UCF-101 and +HMDB-51. The experimental results demonstrate that StyleFool outperforms the +state-of-the-art adversarial attacks in terms of both the number of queries and +the robustness against existing defenses. Moreover, 50% of the stylized videos +in untargeted attacks do not need any query since they can already fool the +video classification model. Furthermore, we evaluate the indistinguishability +through a user study to show that the adversarial samples of StyleFool look +imperceptible to human eyes, despite unrestricted perturbations. + +
+
+ comment: 18 pages, 9 figures. Accepted to S&P 2023 +
+
+
+
+
+ + ♻ ☆ Unsigned Orthogonal Distance Fields: An Accurate Neural Implicit + Representation for Diverse 3D Shapes CVPR 2024 + + +
+ Neural implicit representation of geometric shapes has witnessed considerable
+advancements in recent years. However, common distance field based implicit
+representations, specifically signed distance field (SDF) for watertight shapes
+or unsigned distance field (UDF) for arbitrary shapes, routinely suffer from
+degradation of reconstruction accuracy when converting to explicit surface
+points and meshes. In this paper, we introduce a novel neural implicit
+representation based on unsigned orthogonal distance fields (UODFs). In UODFs,
+the minimal unsigned distance from any spatial point to the shape surface is
+defined solely in one orthogonal direction, contrasting with the
+multi-directional determination made by SDF and UDF. Consequently, every point
+in the 3D UODFs can directly access its closest surface points along three
+orthogonal directions. This distinctive feature enables the accurate
+reconstruction of surface points without interpolation errors. We verify the
+effectiveness of UODFs through a range of reconstruction examples, extending
+from simple watertight or non-watertight shapes to complex shapes that include
+hollows, internal or assembling structures.
+
+
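+ A toy sketch of the property described above: with unsigned orthogonal
+distances, a query point can step directly to a surface point along each axis,
+with no interpolation. The sign handling is an assumption for illustration; the
+paper's exact formulation may differ.
+
+import numpy as np
+
+def surface_points_from_uodf(p, dists, signs):
+    # p: (3,) query point; dists: (3,) unsigned distances along x/y/z; signs: (3,) +1/-1
+    axes = np.eye(3)
+    return np.stack([p + signs[i] * dists[i] * axes[i] for i in range(3)])
+
+pts = surface_points_from_uodf(np.zeros(3), np.array([0.2, 0.5, 0.1]), np.array([1, -1, 1]))
+print(pts)  # one candidate surface point per orthogonal direction
+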
+
+ comment: accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Asymmetric Masked Distillation for Pre-Training Small Foundation Models CVPR 2024 + + +
+ Self-supervised foundation models have shown great potential in computer
+vision thanks to the pre-training paradigm of masked autoencoding. Scale is a
+primary factor influencing the performance of these foundation models. However,
+these large foundation models often result in high computational cost. This
+paper focuses on pre-training relatively small vision transformer models that
+could be efficiently adapted to downstream tasks. Specifically, taking
+inspiration from knowledge distillation in model compression, we propose a new
+asymmetric masked distillation (AMD) framework for pre-training relatively
+small models with autoencoding. The core of AMD is to devise an asymmetric
+masking strategy, where the teacher model is enabled to see more context
+information with a lower masking ratio, while the student model is still
+equipped with a high masking ratio. We design customized multi-layer feature
+alignment between the teacher encoder and student encoder to regularize the
+pre-training of student MAE. To demonstrate the effectiveness and versatility
+of AMD, we apply it to both ImageMAE and VideoMAE for pre-training relatively
+small ViT models. AMD achieves 84.6% classification accuracy on IN1K using the
+ViT-B model, and 73.3% classification accuracy using the ViT-B model on the
+Something-Something V2 dataset, a 3.7% improvement over the original ViT-B
+model from VideoMAE. We also transfer AMD pre-trained models to downstream
+tasks and obtain consistent performance improvement over the original masked
+autoencoding. The code and models are available at
+https://github.com/MCG-NJU/AMD.
+
+
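+ A minimal sketch of the asymmetric masking idea described above, not the
+paper's implementation: the student's visible tokens are a subset of the
+teacher's, so the teacher sees more context, and the student's features are
+aligned to the teacher's on the tokens the student keeps. Masking ratios and
+shapes are illustrative.
+
+import torch
+import torch.nn as nn
+
+def asymmetric_masks(num_tokens, r_teacher=0.5, r_student=0.9):
+    perm = torch.randperm(num_tokens)
+    teacher_visible = perm[: int(num_tokens * (1 - r_teacher))]
+    student_visible = perm[: int(num_tokens * (1 - r_student))]  # subset of the teacher's
+    return teacher_visible, student_visible
+
+def feature_alignment_loss(teacher_feats, student_feats, student_visible):
+    # align student features with teacher features on the tokens the student sees
+    return nn.functional.mse_loss(student_feats, teacher_feats[:, student_visible])
+
+t_vis, s_vis = asymmetric_masks(196)
+teacher_feats = torch.randn(1, 196, 768)         # stand-in for teacher encoder output
+student_feats = torch.randn(1, len(s_vis), 768)  # stand-in for student encoder output
+print(feature_alignment_loss(teacher_feats, student_feats, s_vis))
+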
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Learning the 3D Fauna of the Web + + +
+ Learning 3D models of all animals on the Earth requires massively scaling up +existing solutions. With this ultimate goal in mind, we develop 3D-Fauna, an +approach that learns a pan-category deformable 3D animal model for more than +100 animal species jointly. One crucial bottleneck of modeling animals is the +limited availability of training data, which we overcome by simply learning +from 2D Internet images. We show that prior category-specific attempts fail to +generalize to rare species with limited training images. We address this +challenge by introducing the Semantic Bank of Skinned Models (SBSM), which +automatically discovers a small set of base animal shapes by combining +geometric inductive priors with semantic knowledge implicitly captured by an +off-the-shelf self-supervised feature extractor. To train such a model, we also +contribute a new large-scale dataset of diverse animal species. At inference +time, given a single image of any quadruped animal, our model reconstructs an +articulated 3D mesh in a feed-forward fashion within seconds. + +
+
+ comment: The first two authors contributed equally to this work. The last + three authors contributed equally. Project page: + https://kyleleey.github.io/3DFauna/ +
+
+
+
+
+ + ♻ ☆ Exploring Phonetic Context-Aware Lip-Sync For Talking Face Generation ICASSP 2024 + + +
+ Talking face generation is the challenging task of synthesizing a natural and +realistic face that requires accurate synchronization with a given audio. Due +to co-articulation, where an isolated phone is influenced by the preceding or +following phones, the articulation of a phone varies upon the phonetic context. +Therefore, modeling lip motion with the phonetic context can generate more +spatio-temporally aligned lip movement. In this respect, we investigate the +phonetic context in generating lip motion for talking face generation. We +propose Context-Aware Lip-Sync framework (CALS), which explicitly leverages +phonetic context to generate lip movement of the target face. CALS is comprised +of an Audio-to-Lip module and a Lip-to-Face module. The former is pretrained +based on masked learning to map each phone to a contextualized lip motion unit. +The contextualized lip motion unit then guides the latter in synthesizing a +target identity with context-aware lip motion. From extensive experiments, we +verify that simply exploiting the phonetic context in the proposed CALS +framework effectively enhances spatio-temporal alignment. We also demonstrate +the extent to which the phonetic context assists in lip synchronization and +find the effective window size for lip generation to be approximately 1.2 +seconds. + +
+
+ comment: Accepted at ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ An Embarrassingly Simple Defense Against Backdoor Attacks On SSL + + +
+ Self-Supervised Learning (SSL) has emerged as a powerful paradigm to tackle
+data landscapes in the absence of human supervision. The ability to learn
+meaningful tasks without the use of labeled data makes SSL a popular method to
+manage large chunks of data in the absence of labels. However, recent work
+indicates SSL to be vulnerable to backdoor attacks, wherein models can be
+controlled, possibly maliciously, to suit an adversary's motives. Li et al.
+(2022) introduce a novel frequency-based backdoor attack: CTRL. They show that
+CTRL can be used to efficiently and stealthily gain control over a victim's
+model trained using SSL. In this work, we devise two defense strategies against
+frequency-based attacks in SSL: one applicable before model training and the
+second to be applied during model inference. Our first contribution utilizes
+the invariance property of the downstream task to defend against backdoor
+attacks in a generalizable fashion. We observe the ASR (Attack Success Rate) to
+reduce by over 60% across experiments. Our inference-time defense relies on the
+evasiveness of the attack and uses the luminance channel to defend against
+attacks. Using object classification as the downstream task for SSL, we
+demonstrate successful defense strategies that do not require re-training of
+the model. Code is available at https://github.com/Aryan-Satpathy/Backdoor.
+
+
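+ A rough sketch of the inference-time idea mentioned above, under the
+assumption that the frequency-domain trigger lives mostly in the chroma
+components: keep only the luminance (here via standard BT.601 weights) before
+feeding the frozen encoder. This is an illustration, not the authors' exact
+transform.
+
+import numpy as np
+
+def luminance_only(images):
+    # images: (B, H, W, 3) in [0, 1]; returns a 3-channel luminance-only copy
+    weights = np.array([0.299, 0.587, 0.114], dtype=np.float32)
+    y = (images * weights[None, None, None, :]).sum(axis=-1, keepdims=True)
+    return np.repeat(y, 3, axis=-1)
+
+batch = np.random.rand(4, 32, 32, 3).astype(np.float32)
+print(luminance_only(batch).shape)  # (4, 32, 32, 3)
+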
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Towards Universal Fake Image Detectors that Generalize Across Generative + Models + + +
+ With generative models proliferating at a rapid rate, there is a growing need +for general purpose fake image detectors. In this work, we first show that the +existing paradigm, which consists of training a deep network for real-vs-fake +classification, fails to detect fake images from newer breeds of generative +models when trained to detect GAN fake images. Upon analysis, we find that the +resulting classifier is asymmetrically tuned to detect patterns that make an +image fake. The real class becomes a sink class holding anything that is not +fake, including generated images from models not accessible during training. +Building upon this discovery, we propose to perform real-vs-fake classification +without learning; i.e., using a feature space not explicitly trained to +distinguish real from fake images. We use nearest neighbor and linear probing +as instantiations of this idea. When given access to the feature space of a +large pretrained vision-language model, the very simple baseline of nearest +neighbor classification has surprisingly good generalization ability in +detecting fake images from a wide variety of generative models; e.g., it +improves upon the SoTA by +15.07 mAP and +25.90% acc when tested on unseen +diffusion and autoregressive models. + +
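+ A minimal sketch of the nearest-neighbor baseline described above: decide
+real vs. fake by which reference bank contains the closest feature in a frozen,
+pretrained feature space. The bank construction and the feature extractor are
+assumed given; names and sizes are illustrative.
+
+import numpy as np
+
+def nn_fake_detector(real_bank, fake_bank, query_feats):
+    # each bank: (M, D) features of known real/fake images; query_feats: (Q, D)
+    def min_dist(bank, q):
+        return np.linalg.norm(bank[None, :, :] - q[:, None, :], axis=-1).min(axis=1)
+    d_real = min_dist(real_bank, query_feats)
+    d_fake = min_dist(fake_bank, query_feats)
+    return (d_fake < d_real).astype(int)  # 1 = predicted fake
+
+rng = np.random.default_rng(0)
+preds = nn_fake_detector(rng.normal(size=(50, 512)), rng.normal(size=(50, 512)),
+                         rng.normal(size=(8, 512)))
+print(preds)
+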
+
+
+
+
+ + ♻ ☆ Each Test Image Deserves A Specific Prompt: Continual Test-Time + Adaptation for 2D Medical Image Segmentation + + +
+ Distribution shift widely exists in medical images acquired from different +medical centres and poses a significant obstacle to deploying the pre-trained +semantic segmentation model in real-world applications. Test-time adaptation +has proven its effectiveness in tackling the cross-domain distribution shift +during inference. However, most existing methods achieve adaptation by updating +the pre-trained models, rendering them susceptible to error accumulation and +catastrophic forgetting when encountering a series of distribution shifts +(i.e., under the continual test-time adaptation setup). To overcome these +challenges caused by updating the models, in this paper, we freeze the +pre-trained model and propose the Visual Prompt-based Test-Time Adaptation +(VPTTA) method to train a specific prompt for each test image to align the +statistics in the batch normalization layers. Specifically, we present the +low-frequency prompt, which is lightweight with only a few parameters and can +be effectively trained in a single iteration. To enhance prompt initialization, +we equip VPTTA with a memory bank to benefit the current prompt from previous +ones. Additionally, we design a warm-up mechanism, which mixes source and +target statistics to construct warm-up statistics, thereby facilitating the +training process. Extensive experiments demonstrate the superiority of our +VPTTA over other state-of-the-art methods on two medical image segmentation +benchmark tasks. The code and weights of pre-trained source models are +available at https://github.com/Chen-Ziyang/VPTTA. + +
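+ A rough sketch of the two ingredients described above, with the prompt size,
+frequency handling, and alignment target all being assumptions rather than the
+paper's exact design: a few-parameter prompt modulates only the low-frequency
+amplitude of the test image, and the prompt would be trained to reduce the
+mismatch between the resulting batch-norm statistics and stored source
+statistics.
+
+import torch
+
+def apply_low_frequency_prompt(image, prompt):
+    freq = torch.fft.fftshift(torch.fft.fft2(image), dim=(-2, -1))
+    amp, phase = freq.abs(), freq.angle()
+    h, w = image.shape[-2:]
+    ch, cw, r = h // 2, w // 2, prompt.shape[-1] // 2
+    amp[..., ch - r:ch + r, cw - r:cw + r] *= prompt   # modulate low frequencies only
+    freq = torch.polar(amp, phase)
+    return torch.fft.ifft2(torch.fft.ifftshift(freq, dim=(-2, -1))).real
+
+def bn_alignment_loss(feats, source_mean, source_var):
+    mean, var = feats.mean(dim=(0, 2, 3)), feats.var(dim=(0, 2, 3))
+    return ((mean - source_mean) ** 2 + (var - source_var) ** 2).sum()
+
+img = torch.rand(1, 3, 64, 64)
+prompt = torch.ones(1, 3, 8, 8)     # in practice this would be the learnable parameter
+prompted = apply_low_frequency_prompt(img, prompt)
+feats = torch.randn(1, 16, 32, 32)  # stand-in for features of the frozen model
+print(prompted.shape, bn_alignment_loss(feats, torch.zeros(16), torch.ones(16)))
+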
+
+
+
+
+ + ♻ ☆ VSCode: General Visual Salient and Camouflaged Object Detection with 2D + Prompt Learning + + +
+ Salient object detection (SOD) and camouflaged object detection (COD) are +related yet distinct binary mapping tasks. These tasks involve multiple +modalities, sharing commonalities and unique cues. Existing research often +employs intricate task-specific specialist models, potentially leading to +redundancy and suboptimal results. We introduce VSCode, a generalist model with +novel 2D prompt learning, to jointly address four SOD tasks and three COD +tasks. We utilize VST as the foundation model and introduce 2D prompts within +the encoder-decoder architecture to learn domain and task-specific knowledge on +two separate dimensions. A prompt discrimination loss helps disentangle +peculiarities to benefit model optimization. VSCode outperforms +state-of-the-art methods across six tasks on 26 datasets and exhibits zero-shot +generalization to unseen tasks by combining 2D prompts, such as RGB-D COD. +Source code has been available at https://github.com/Sssssuperior/VSCode. + +
+
+
+
+
+ + ♻ ☆ Continual Segmentation with Disentangled Objectness Learning and Class + Recognition CVPR 2024 + + +
+ Most continual segmentation methods tackle the problem as a per-pixel +classification task. However, such a paradigm is very challenging, and we find +query-based segmenters with built-in objectness have inherent advantages +compared with per-pixel ones, as objectness has strong transfer ability and +forgetting resistance. Based on these findings, we propose CoMasTRe by +disentangling continual segmentation into two stages: forgetting-resistant +continual objectness learning and well-researched continual classification. +CoMasTRe uses a two-stage segmenter learning class-agnostic mask proposals at +the first stage and leaving recognition to the second stage. During continual +learning, a simple but effective distillation is adopted to strengthen +objectness. To further mitigate the forgetting of old classes, we design a +multi-label class distillation strategy suited for segmentation. We assess the +effectiveness of CoMasTRe on PASCAL VOC and ADE20K. Extensive experiments show +that our method outperforms per-pixel and query-based methods on both datasets. +Code will be available at https://github.com/jordangong/CoMasTRe. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ TransPose: 6D Object Pose Estimation with Geometry-Aware Transformer + + +
+ Estimating the 6D object pose is an essential task in many applications. Due
+to the lack of depth information, existing RGB-based methods are sensitive to
+occlusion and illumination changes. How to extract and utilize the geometry
+features in depth information is crucial to achieve accurate predictions. To
+this end, we propose TransPose, a novel 6D pose framework that exploits a
+Transformer Encoder with a geometry-aware module to develop better learning of
+point cloud feature representations. Specifically, we first uniformly sample
+the point cloud and extract local geometry features with the designed local
+feature extractor based on a graph convolutional network. To improve robustness
+to occlusion, we adopt a Transformer to perform the exchange of global
+information, making each local feature contain global information. Finally, we
+introduce a geometry-aware module into the Transformer Encoder, which forms an
+effective constraint for point cloud feature learning and makes the global
+information exchange more tightly coupled with point cloud tasks. Extensive
+experiments indicate the effectiveness of TransPose; our pose estimation
+pipeline achieves competitive results on three benchmark datasets.
+
+
+
+ comment: Accepted by Neurocomputing
+
+
+
+
+
+ + ♻ ☆ City-on-Web: Real-time Neural Rendering of Large-scale Scenes on the Web + + +
+ Existing neural radiance field-based methods can achieve real-time rendering +of small scenes on the web platform. However, extending these methods to +large-scale scenes still poses significant challenges due to limited resources +in computation, memory, and bandwidth. In this paper, we propose City-on-Web, +the first method for real-time rendering of large-scale scenes on the web. We +propose a block-based volume rendering method to guarantee 3D consistency and +correct occlusion between blocks, and introduce a Level-of-Detail strategy +combined with dynamic loading/unloading of resources to significantly reduce +memory demands. Our system achieves real-time rendering of large-scale scenes +at approximately 32FPS with RTX 3060 GPU on the web and maintains rendering +quality comparable to the current state-of-the-art novel view synthesis +methods. + +
+
+ comment: Project page: https://ustc3dv.github.io/City-on-Web/ +
+
+
+
+
+ + ♻ ☆ FaceChain-ImagineID: Freely Crafting High-Fidelity Diverse Talking Faces + from Disentangled Audio + + +
+ In this paper, we abstract the process of people hearing speech, extracting +meaningful cues, and creating various dynamically audio-consistent talking +faces, termed Listening and Imagining, into the task of high-fidelity diverse +talking faces generation from a single audio. Specifically, it involves two +critical challenges: one is to effectively decouple identity, content, and +emotion from entangled audio, and the other is to maintain intra-video +diversity and inter-video consistency. To tackle the issues, we first dig out +the intricate relationships among facial factors and simplify the decoupling +process, tailoring a Progressive Audio Disentanglement for accurate facial +geometry and semantics learning, where each stage incorporates a customized +training module responsible for a specific factor. Secondly, to achieve +visually diverse and audio-synchronized animation solely from input audio +within a single model, we introduce the Controllable Coherent Frame generation, +which involves the flexible integration of three trainable adapters with frozen +Latent Diffusion Models (LDMs) to focus on maintaining facial geometry and +semantics, as well as texture and temporal coherence between frames. In this +way, we inherit high-quality diverse generation from LDMs while significantly +improving their controllability at a low training cost. Extensive experiments +demonstrate the flexibility and effectiveness of our method in handling this +paradigm. The codes will be released at +https://github.com/modelscope/facechain. + +
+
+
+
+
+ + ♻ ☆ Honeybee: Locality-enhanced Projector for Multimodal LLM CVPR 2024 + + +
+ In Multimodal Large Language Models (MLLMs), a visual projector plays a +crucial role in bridging pre-trained vision encoders with LLMs, enabling +profound visual understanding while harnessing the LLMs' robust capabilities. +Despite the importance of the visual projector, it has been relatively less +explored. In this study, we first identify two essential projector properties: +(i) flexibility in managing the number of visual tokens, crucial for MLLMs' +overall efficiency, and (ii) preservation of local context from visual +features, vital for spatial understanding. Based on these findings, we propose +a novel projector design that is both flexible and locality-enhanced, +effectively satisfying the two desirable properties. Additionally, we present +comprehensive strategies to effectively utilize multiple and multifaceted +instruction datasets. Through extensive experiments, we examine the impact of +individual design choices. Finally, our proposed MLLM, Honeybee, remarkably +outperforms previous state-of-the-art methods across various benchmarks, +including MME, MMBench, SEED-Bench, and LLaVA-Bench, achieving significantly +higher efficiency. Code and models are available at +https://github.com/kakaobrain/honeybee. + +
+
+ comment: CVPR 2024 camera-ready +
+
+
+
+
+ + ♻ ☆ VDC: Versatile Data Cleanser based on Visual-Linguistic Inconsistency by + Multimodal Large Language Models ICLR 2024 + + +
+ The role of data in building AI systems has recently been emphasized by the
+emerging concept of data-centric AI. Unfortunately, in the real world, datasets
+may contain dirty samples, such as poisoned samples from backdoor attacks,
+noisy labels in crowdsourcing, and even hybrids of them. The presence of such
+dirty samples makes DNNs vulnerable and unreliable. Hence, it is critical to
+detect dirty samples to improve the quality and reliability of datasets.
+Existing detectors only focus on detecting poisoned samples or noisy labels,
+and are often prone to weak generalization when dealing with dirty samples from
+other domains. In this paper, we find that a commonality of various dirty
+samples is visual-linguistic inconsistency between images and associated
+labels. To capture the semantic inconsistency between modalities, we propose
+the versatile data cleanser (VDC), leveraging the surpassing capabilities of
+multimodal large language models (MLLM) in cross-modal alignment and reasoning.
+It consists of three consecutive modules: the visual question generation module
+to generate insightful questions about the image; the visual question answering
+module to acquire the semantics of the visual content by answering the
+questions with the MLLM; followed by the visual answer evaluation module to
+evaluate the inconsistency. Extensive experiments demonstrate its superior
+performance and generalization to various categories and types of dirty
+samples. The code is available at \url{https://github.com/zihao-ai/vdc}.
+
+
+
+ comment: Accepted to ICLR 2024 +
+
+
+
+
+ + ♻ ☆ UniHuman: A Unified Model for Editing Human Images in the Wild CVPR 2024 + + +
+ Human image editing includes tasks like changing a person's pose, their +clothing, or editing the image according to a text prompt. However, prior work +often tackles these tasks separately, overlooking the benefit of mutual +reinforcement from learning them jointly. In this paper, we propose UniHuman, a +unified model that addresses multiple facets of human image editing in +real-world settings. To enhance the model's generation quality and +generalization capacity, we leverage guidance from human visual encoders and +introduce a lightweight pose-warping module that can exploit different pose +representations, accommodating unseen textures and patterns. Furthermore, to +bridge the disparity between existing human editing benchmarks with real-world +data, we curated 400K high-quality human image-text pairs for training and +collected 2K human images for out-of-domain testing, both encompassing diverse +clothing styles, backgrounds, and age groups. Experiments on both in-domain and +out-of-domain test sets demonstrate that UniHuman outperforms task-specific +models by a significant margin. In user studies, UniHuman is preferred by the +users in an average of 77% of cases. Our project is available at +https://github.com/NannanLi999/UniHuman. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ AntGPT: Can Large Language Models Help Long-term Action Anticipation + from Videos? ICLR 2024 + + +
+ Can we better anticipate an actor's future actions (e.g. mix eggs) by knowing +what commonly happens after his/her current action (e.g. crack eggs)? What if +we also know the longer-term goal of the actor (e.g. making egg fried rice)? +The long-term action anticipation (LTA) task aims to predict an actor's future +behavior from video observations in the form of verb and noun sequences, and it +is crucial for human-machine interaction. We propose to formulate the LTA task +from two perspectives: a bottom-up approach that predicts the next actions +autoregressively by modeling temporal dynamics; and a top-down approach that +infers the goal of the actor and plans the needed procedure to accomplish the +goal. We hypothesize that large language models (LLMs), which have been +pretrained on procedure text data (e.g. recipes, how-tos), have the potential +to help LTA from both perspectives. It can help provide the prior knowledge on +the possible next actions, and infer the goal given the observed part of a +procedure, respectively. To leverage the LLMs, we propose a two-stage +framework, AntGPT. It first recognizes the actions already performed in the +observed videos and then asks an LLM to predict the future actions via +conditioned generation, or to infer the goal and plan the whole procedure by +chain-of-thought prompting. Empirical results on the Ego4D LTA v1 and v2 +benchmarks, EPIC-Kitchens-55, as well as EGTEA GAZE+ demonstrate the +effectiveness of our proposed approach. AntGPT achieves state-of-the-art +performance on all above benchmarks, and can successfully infer the goal and +thus perform goal-conditioned "counterfactual" prediction via qualitative +analysis. Code and model will be released at +https://brown-palm.github.io/AntGPT + +
+
+ comment: ICLR 2024 Camera Ready +
+
+
+
+
+ + ♻ ☆ Structure Matters: Tackling the Semantic Discrepancy in Diffusion Models + for Image Inpainting CVPR 2024 + + +
+ Denoising diffusion probabilistic models for image inpainting aim to add
+noise to the image texture during the forward process and recover the masked
+regions from the unmasked ones via the reverse denoising process. Despite
+generating meaningful semantics, existing methods suffer from a semantic
+discrepancy between masked and unmasked regions, since the semantically dense
+unmasked texture fails to be completely degraded while the masked regions turn
+to pure noise in the diffusion process, leading to a large discrepancy between
+them. In this paper, we aim to answer how the unmasked semantics guide the
+texture denoising process, together with how to tackle the semantic
+discrepancy, to facilitate consistent and meaningful semantics generation. To
+this end, we propose a novel structure-guided diffusion model named
+StrDiffusion, to reformulate the conventional texture denoising process under
+structure guidance to derive a simplified denoising objective for image
+inpainting, while revealing: 1) the semantically sparse structure is beneficial
+to tackle the semantic discrepancy in the early stage, while the dense texture
+generates reasonable semantics in the late stage; 2) the semantics from the
+unmasked regions essentially offer time-dependent structure guidance for the
+texture denoising process, benefiting from the time-dependent sparsity of the
+structure semantics. For the denoising process, a structure-guided neural
+network is trained to estimate the simplified denoising objective by exploiting
+the consistency of the denoised structure between masked and unmasked regions.
+Besides, we devise an adaptive resampling strategy as a formal criterion for
+whether the structure is competent to guide the texture denoising process,
+while regulating their semantic correlations. Extensive experiments validate
+the merits of StrDiffusion over the state-of-the-art methods. Our code is
+available at https://github.com/htyjers/StrDiffusion.
+
+
+
+ comment: 15 pages, 10 figures, to appear CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Posterior Distillation Sampling + + +
+ We introduce Posterior Distillation Sampling (PDS), a novel optimization +method for parametric image editing based on diffusion models. Existing +optimization-based methods, which leverage the powerful 2D prior of diffusion +models to handle various parametric images, have mainly focused on generation. +Unlike generation, editing requires a balance between conforming to the target +attribute and preserving the identity of the source content. Recent 2D image +editing methods have achieved this balance by leveraging the stochastic latent +encoded in the generative process of diffusion models. To extend the editing +capabilities of diffusion models shown in pixel space to parameter space, we +reformulate the 2D image editing method into an optimization form named PDS. +PDS matches the stochastic latents of the source and the target, enabling the +sampling of targets in diverse parameter spaces that align with a desired +attribute while maintaining the source's identity. We demonstrate that this +optimization resembles running a generative process with the target attribute, +but aligning this process with the trajectory of the source's generative +process. Extensive editing results in Neural Radiance Fields and Scalable +Vector Graphics representations demonstrate that PDS is capable of sampling +targets to fulfill the aforementioned balance across various parameter spaces. + +
+
+ comment: Project page: https://posterior-distillation-sampling.github.io/ +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 98 + +
+
+
+ + ☆ Towards Realistic Scene Generation with LiDAR Diffusion Models CVPR 2024 + + +
+ Diffusion models (DMs) excel in photo-realistic image synthesis, but their +adaptation to LiDAR scene generation poses a substantial hurdle. This is +primarily because DMs operating in the point space struggle to preserve the +curve-like patterns and 3D geometry of LiDAR scenes, which consumes much of +their representation power. In this paper, we propose LiDAR Diffusion Models +(LiDMs) to generate LiDAR-realistic scenes from a latent space tailored to +capture the realism of LiDAR scenes by incorporating geometric priors into the +learning pipeline. Our method targets three major desiderata: pattern realism, +geometry realism, and object realism. Specifically, we introduce curve-wise +compression to simulate real-world LiDAR patterns, point-wise coordinate +supervision to learn scene geometry, and patch-wise encoding for a full 3D +object context. With these three core designs, our method achieves competitive +performance on unconditional LiDAR generation in 64-beam scenario and state of +the art on conditional LiDAR generation, while maintaining high efficiency +compared to point-based DMs (up to 107$\times$ faster). Furthermore, by +compressing LiDAR scenes into a latent space, we enable the controllability of +DMs with various conditions such as semantic maps, camera views, and text +prompts. Our code and pretrained weights are available at +https://github.com/hancyran/LiDAR-Diffusion. + +
+
+ comment: CVPR 2024. Code available at + https://github.com/hancyran/LiDAR-Diffusion +
+
+
+
+
+ + ☆ GAMA-IR: Global Additive Multidimensional Averaging for Fast Image + Restoration + + +
+ Deep learning-based methods have shown remarkable success for various image +restoration tasks such as denoising and deblurring. The current +state-of-the-art networks are relatively deep and utilize (variants of) self +attention mechanisms. Those networks are significantly slower than shallow +convolutional networks, which however perform worse. In this paper, we +introduce an image restoration network that is both fast and yields excellent +image quality. The network is designed to minimize the latency and memory +consumption when executed on a standard GPU, while maintaining state-of-the-art +performance. The network is a simple shallow network with an efficient block +that implements global additive multidimensional averaging operations. This +block can capture global information and enable a large receptive field even +when used in shallow networks with minimal computational overhead. Through +extensive experiments and evaluations on diverse tasks, we demonstrate that our +network achieves comparable or even superior results to existing +state-of-the-art image restoration networks with less latency. For instance, we +exceed the state-of-the-art result on real-world SIDD denoising by 0.11dB, +while being 2 to 10 times faster. + +
+
+
+
+
+ + ☆ $R^2$-Tuning: Efficient Image-to-Video Transfer Learning for Video + Temporal Grounding + + +
+ Video temporal grounding (VTG) is a fine-grained video understanding problem +that aims to ground relevant clips in untrimmed videos given natural language +queries. Most existing VTG models are built upon frame-wise final-layer CLIP +features, aided by additional temporal backbones (e.g., SlowFast) with +sophisticated temporal reasoning mechanisms. In this work, we claim that CLIP +itself already shows great potential for fine-grained spatial-temporal +modeling, as each layer offers distinct yet useful information under different +granularity levels. Motivated by this, we propose Reversed Recurrent Tuning +($R^2$-Tuning), a parameter- and memory-efficient transfer learning framework +for video temporal grounding. Our method learns a lightweight $R^2$ Block +containing only 1.5% of the total parameters to perform progressive +spatial-temporal modeling. Starting from the last layer of CLIP, $R^2$ Block +recurrently aggregates spatial features from earlier layers, then refines +temporal correlation conditioning on the given query, resulting in a +coarse-to-fine scheme. $R^2$-Tuning achieves state-of-the-art performance +across three VTG tasks (i.e., moment retrieval, highlight detection, and video +summarization) on six public benchmarks (i.e., QVHighlights, Charades-STA, +Ego4D-NLQ, TACoS, YouTube Highlights, and TVSum) even without the additional +backbone, demonstrating the significance and effectiveness of the proposed +scheme. Our code is available at https://github.com/yeliudev/R2-Tuning. + +
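+ A minimal sketch of the reversed recurrent aggregation described above, with
+the adapter size and fusion rule as assumptions; the real R^2 Block also
+conditions the temporal refinement on the text query, which is omitted here.
+
+import torch
+import torch.nn as nn
+
+class R2BlockSketch(nn.Module):
+    def __init__(self, dim=512, hidden=64):
+        super().__init__()
+        # one small shared adapter folds earlier CLIP layers into the running state
+        self.adapter = nn.Sequential(nn.Linear(2 * dim, hidden), nn.ReLU(),
+                                     nn.Linear(hidden, dim))
+
+    def forward(self, layer_feats):
+        # layer_feats: list of (B, T, D) frame features, ordered shallow -> deep
+        state = layer_feats[-1]
+        for feats in reversed(layer_feats[:-1]):   # walk from deep to shallow
+            state = state + self.adapter(torch.cat([state, feats], dim=-1))
+        return state
+
+block = R2BlockSketch()
+feats = [torch.randn(2, 16, 512) for _ in range(4)]   # 4 CLIP layers, 16 frames
+print(block(feats).shape)  # torch.Size([2, 16, 512])
+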
+
+
+
+
+ + ☆ Disentangling Hippocampal Shape Variations: A Study of Neurological + Disorders Using Graph Variational Autoencoder with Contrastive Learning + + +
+ This paper presents a comprehensive study focused on disentangling +hippocampal shape variations from diffusion tensor imaging (DTI) datasets +within the context of neurological disorders. Leveraging a Graph Variational +Autoencoder (VAE) enhanced with Supervised Contrastive Learning, our approach +aims to improve interpretability by disentangling two distinct latent variables +corresponding to age and the presence of diseases. In our ablation study, we +investigate a range of VAE architectures and contrastive loss functions, +showcasing the enhanced disentanglement capabilities of our approach. This +evaluation uses synthetic 3D torus mesh data and real 3D hippocampal mesh +datasets derived from the DTI hippocampal dataset. Our supervised +disentanglement model outperforms several state-of-the-art (SOTA) methods like +attribute and guided VAEs in terms of disentanglement scores. Our model +distinguishes between age groups and disease status in patients with Multiple +Sclerosis (MS) using the hippocampus data. Our Graph VAE with Supervised +Contrastive Learning shows the volume changes of the hippocampus of MS +populations at different ages, and the result is consistent with the current +neuroimaging literature. This research provides valuable insights into the +relationship between neurological disorder and hippocampal shape changes in +different age groups of MS populations using a Graph VAE with Supervised +Contrastive loss. + +
+
+ comment: Length: 23 pages and submitted to the journal: MELBA (Machine + Learning for Biomedical Imaging) +
+
+
+
+
+ + ☆ Privacy-preserving Optics for Enhancing Protection in Face + De-identification CVPR 2024 + + +
+ The modern surge in camera usage alongside widespread computer vision +technology applications poses significant privacy and security concerns. +Current artificial intelligence (AI) technologies aid in recognizing relevant +events and assisting in daily tasks in homes, offices, hospitals, etc. The need +to access or process personal information for these purposes raises privacy +concerns. While software-level solutions like face de-identification provide a +good privacy/utility trade-off, they present vulnerabilities to sniffing +attacks. In this paper, we propose a hardware-level face de-identification +method to solve this vulnerability. Specifically, our approach first learns an +optical encoder along with a regression model to obtain a face heatmap while +hiding the face identity from the source image. We also propose an +anonymization framework that generates a new face using the privacy-preserving +image, face heatmap, and a reference face image from a public dataset as input. +We validate our approach with extensive simulations and hardware experiments. + +
+
+ comment: Accepted to CVPR 2024. Project Website and Code coming soon +
+
+
+
+
+ + ☆ Intensity-based 3D motion correction for cardiac MR images + + +
+ Cardiac magnetic resonance (CMR) image acquisition requires subjects to hold
+their breath while 2D cine images are acquired. This process assumes that the
+heart remains in the same position across all slices. However, differences in
+breathhold positions or patient motion introduce 3D slice misalignments. In
+this work, we propose an algorithm that simultaneously aligns all short-axis
+(SA) and long-axis (LA) slices by maximizing the pair-wise intensity agreement
+between their intersections. Unlike previous works, our approach is formulated
+as a subject-specific optimization problem and requires no prior knowledge of
+the underlying anatomy. We quantitatively demonstrate that the proposed method
+is robust against a large range of rotations and translations by synthetically
+misaligning 10 motion-free datasets and aligning them back using the proposed
+method.
+
+
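+ A toy sketch of the optimization described above, assuming the intensity
+profiles sampled along the SA/LA intersection lines are already available: the
+per-slice rigid parameters (here a single translation) are updated by gradient
+descent on the pair-wise intensity disagreement.
+
+import torch
+
+def intersection_agreement_loss(profile_sa, profile_la):
+    # mean squared intensity disagreement along a slice intersection
+    return ((profile_sa - profile_la) ** 2).mean()
+
+shift = torch.zeros(1, requires_grad=True)   # toy alignment parameter
+base = torch.linspace(0, 1, 50)
+for _ in range(100):
+    loss = intersection_agreement_loss(base + shift, base + 0.3)
+    loss.backward()
+    with torch.no_grad():
+        shift -= 0.5 * shift.grad
+        shift.grad.zero_()
+print(float(shift))  # converges toward the true offset 0.3
+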
+
+
+
+
+ + ☆ Adapting to Length Shift: FlexiLength Network for Trajectory Prediction CVPR 2024 + + +
+ Trajectory prediction plays an important role in various applications,
+including autonomous driving, robotics, and scene understanding. Existing
+approaches mainly focus on developing compact neural networks to increase
+prediction precision on public datasets, typically employing a standardized
+input duration. However, a notable issue arises when these models are evaluated
+with varying observation lengths, leading to a significant performance drop, a
+phenomenon we term the Observation Length Shift. To address this issue, we
+introduce a general and effective framework, the FlexiLength Network (FLN), to
+enhance the robustness of existing trajectory prediction techniques against
+varying observation periods. Specifically, FLN integrates trajectory data with
+diverse observation lengths, incorporates FlexiLength Calibration (FLC) to
+acquire temporal invariant representations, and employs FlexiLength Adaptation
+(FLA) to further refine these representations for more accurate future
+trajectory predictions. Comprehensive experiments on multiple datasets, i.e.,
+ETH/UCY, nuScenes, and Argoverse 1, demonstrate the effectiveness and
+flexibility of our proposed FLN framework.
+
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Rethinking Interactive Image Segmentation with Low Latency, High + Quality, and Diverse Prompts CVPR 2024 + + +
+ The goal of interactive image segmentation is to delineate specific regions +within an image via visual or language prompts. Low-latency and high-quality +interactive segmentation with diverse prompts remain challenging for existing +specialist and generalist models. Specialist models, with their limited prompts +and task-specific designs, experience high latency because the image must be +recomputed every time the prompt is updated, due to the joint encoding of image +and visual prompts. Generalist models, exemplified by the Segment Anything +Model (SAM), have recently excelled in prompt diversity and efficiency, lifting +image segmentation to the foundation model era. However, for high-quality +segmentations, SAM still lags behind state-of-the-art specialist models despite +SAM being trained with x100 more segmentation masks. In this work, we delve +deep into the architectural differences between the two types of models. We +observe that dense representation and fusion of visual prompts are the key +design choices contributing to the high segmentation quality of specialist +models. In light of this, we reintroduce this dense design into the generalist +models, to facilitate the development of generalist models with high +segmentation quality. To densely represent diverse visual prompts, we propose +to use a dense map to capture five types: clicks, boxes, polygons, scribbles, +and masks. Thus, we propose SegNext, a next-generation interactive segmentation +approach offering low latency, high quality, and diverse prompt support. Our +method outperforms current state-of-the-art methods on HQSeg-44K and DAVIS, +both quantitatively and qualitatively. + +
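+ A small sketch of the dense prompt representation described above: visual
+prompts are rasterized into a multi-channel map with the same spatial size as
+the image. The channel layout is an assumption; polygons and scribbles would be
+rasterized into further channels in the same way.
+
+import numpy as np
+
+def rasterize_prompts(h, w, clicks=(), boxes=(), mask=None):
+    dense = np.zeros((3, h, w), dtype=np.float32)  # 0: clicks, 1: boxes, 2: mask
+    for (y, x) in clicks:
+        dense[0, y, x] = 1.0
+    for (y0, x0, y1, x1) in boxes:
+        dense[1, y0:y1, x0:x1] = 1.0
+    if mask is not None:
+        dense[2] = mask
+    return dense
+
+prompt_map = rasterize_prompts(256, 256, clicks=[(40, 60)], boxes=[(10, 10, 120, 200)])
+print(prompt_map.shape, prompt_map.sum())  # (3, 256, 256) and the painted area
+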
+
+ comment: CVPR 2024 https://github.com/uncbiag/SegNext +
+
+
+
+
+ + ☆ MugenNet: A Novel Combined Convolution Neural Network and Transformer + Network with its Application for Colonic Polyp Image Segmentation + + +
+ Biomedical image segmentation is a very important part of disease diagnosis.
+The term "colonic polyps" refers to polypoid lesions that occur on the surface
+of the colonic mucosa within the intestinal lumen. In clinical practice, early
+detection of polyps is conducted through colonoscopy examinations and
+biomedical image processing. Therefore, accurate polyp image segmentation is of
+great significance in colonoscopy examinations. The Convolutional Neural
+Network (CNN) is a common automatic segmentation method, but its main
+disadvantage is the long training time. The Transformer utilizes a
+self-attention mechanism, which essentially assigns different importance
+weights to each piece of information, thus achieving high computational
+efficiency during segmentation. However, a potential drawback is the risk of
+information loss. In the study reported in this paper, based on the well-known
+hybridization principle, we propose a method to combine CNN and Transformer to
+retain the strengths of both, and we apply this method to build a system called
+MugenNet for colonic polyp image segmentation. We conducted a comprehensive
+experiment to compare MugenNet with other CNN models on five publicly available
+datasets. An ablation experiment on MugenNet was conducted as well. The
+experimental results show that MugenNet achieves significantly higher
+processing speed and accuracy compared with CNN alone. The broader implication
+of our work is a method to optimally combine two complementary methods of
+machine learning.
+
+
+
+
+
+
+ + ☆ Absolute-Unified Multi-Class Anomaly Detection via Class-Agnostic + Distribution Alignment + + +
+ Conventional unsupervised anomaly detection (UAD) methods build separate +models for each object category. Recent studies have proposed to train a +unified model for multiple classes, namely model-unified UAD. However, such +methods still implement the unified model separately on each class during +inference with respective anomaly decision thresholds, which hinders their +application when the image categories are entirely unavailable. In this work, +we present a simple yet powerful method to address multi-class anomaly +detection without any class information, namely \textit{absolute-unified} UAD. +We target the crux of prior works in this challenging setting: different +objects have mismatched anomaly score distributions. We propose Class-Agnostic +Distribution Alignment (CADA) to align the mismatched score distribution of +each implicit class without knowing class information, which enables unified +anomaly detection for all classes and samples. The essence of CADA is to +predict each class's score distribution of normal samples given any image, +normal or anomalous, of this class. As a general component, CADA can activate +the potential of nearly all UAD methods under absolute-unified setting. Our +approach is extensively evaluated under the proposed setting on two popular UAD +benchmark datasets, MVTec AD and VisA, where we exceed previous +state-of-the-art by a large margin. + +
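+ A minimal sketch of the alignment idea described above, not the paper's
+implementation: a small head predicts the normal-score statistics for the
+implicit class of each image, and raw anomaly scores are standardized with them
+so a single threshold works across classes. The head design and shapes are
+illustrative.
+
+import torch
+import torch.nn as nn
+
+class CADASketch(nn.Module):
+    def __init__(self, feat_dim=256):
+        super().__init__()
+        self.head = nn.Linear(feat_dim, 2)   # predicts (mu, log_sigma) per image
+
+    def forward(self, image_feat, raw_score_map):
+        mu, log_sigma = self.head(image_feat).unbind(dim=-1)
+        sigma = log_sigma.exp()
+        return (raw_score_map - mu[:, None, None]) / (sigma[:, None, None] + 1e-6)
+
+cada = CADASketch()
+aligned = cada(torch.randn(4, 256), torch.rand(4, 32, 32))
+print(aligned.shape)  # torch.Size([4, 32, 32]); comparable across implicit classes
+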
+
+
+
+
+ + ☆ End-to-End Autonomous Driving through V2X Cooperation + + +
+ Cooperatively utilizing both ego-vehicle and infrastructure sensor data via +V2X communication has emerged as a promising approach for advanced autonomous +driving. However, current research mainly focuses on improving individual +modules, rather than taking end-to-end learning to optimize final planning +performance, resulting in underutilized data potential. In this paper, we +introduce UniV2X, a pioneering cooperative autonomous driving framework that +seamlessly integrates all key driving modules across diverse views into a +unified network. We propose a sparse-dense hybrid data transmission and fusion +mechanism for effective vehicle-infrastructure cooperation, offering three +advantages: 1) Effective for simultaneously enhancing agent perception, online +mapping, and occupancy prediction, ultimately improving planning performance. +2) Transmission-friendly for practical and limited communication conditions. 3) +Reliable data fusion with interpretability of this hybrid data. We implement +UniV2X, as well as reproducing several benchmark methods, on the challenging +DAIR-V2X, the real-world cooperative driving dataset. Experimental results +demonstrate the effectiveness of UniV2X in significantly enhancing planning +performance, as well as all intermediate output performance. Code is at +https://github.com/AIR-THU/UniV2X. + +
+
+
+
+
+ + ☆ Neural Radiance Field-based Visual Rendering: A Comprehensive Review + + +
+ In recent years, Neural Radiance Fields (NeRF) has made remarkable progress
+in the field of computer vision and graphics, providing strong technical
+support for solving key tasks such as 3D scene understanding, novel view
+synthesis, human body reconstruction, and robotics; academic attention to this
+line of research continues to grow. As a revolutionary neural implicit field
+representation, NeRF has triggered a sustained research boom in the academic
+community. The purpose of this review is therefore to provide an in-depth
+analysis of the research literature on NeRF within the past two years, offering
+a comprehensive academic perspective for budding researchers. In this paper,
+the core architecture of NeRF is first elaborated in detail, followed by a
+discussion of various improvement strategies for NeRF, and case studies of NeRF
+in diverse application scenarios, demonstrating its practical utility in
+different domains. In terms of datasets and evaluation metrics, this paper
+details the key resources needed for NeRF model training. Finally, this paper
+provides a prospective discussion on the future development trends and
+potential challenges of NeRF, aiming to provide research inspiration for
+researchers in the field and to promote the further development of related
+technologies.
+
+
+
+ comment: 35 pages, 22 figures, 14 tables, 18 formulas +
+
+
+
+
+ + ☆ Unknown Prompt, the only Lacuna: Unveiling CLIP's Potential for Open + Domain Generalization CVPR 2024 + + +
+ We delve into Open Domain Generalization (ODG), marked by domain and category
+shifts between the labeled source domains used for training and the unlabeled
+target domains encountered at test time. Existing solutions to ODG face
+limitations due to the constrained generalization of traditional CNN backbones
+and errors in detecting target open samples in the absence of prior knowledge.
+Addressing these pitfalls, we introduce ODG-CLIP, harnessing the semantic
+prowess of the vision-language model CLIP. Our framework brings forth three
+primary innovations: Firstly, distinct from prevailing paradigms, we
+conceptualize ODG as a multi-class classification challenge encompassing both
+known and novel categories. Central to our approach is modeling a unique prompt
+tailored for detecting unknown class samples, and to train this, we employ a
+readily accessible stable diffusion model, elegantly generating proxy images
+for the open class. Secondly, aiming for domain-tailored classification
+(prompt) weights while ensuring a balance of precision and simplicity, we
+devise a novel visual style-centric prompt learning mechanism. Finally, we
+infuse images with class-discriminative knowledge derived from the prompt space
+to augment the fidelity of CLIP's visual embeddings. We introduce a novel
+objective to safeguard the continuity of this infused semantic information
+across domains, especially for the shared classes. Through rigorous testing on
+diverse datasets, covering closed- and open-set DG contexts, ODG-CLIP
+demonstrates clear supremacy, consistently outpacing peers with performance
+boosts of 8%-16%. Code will be available at
+https://github.com/mainaksingha01/ODG-CLIP.
+
+
+
+ comment: Accepted in CVPR 2024 +
+
+
+
+
+ + ☆ Training-Free Semantic Segmentation via LLM-Supervision + + +
+ Recent advancements in open vocabulary models, like CLIP, have notably +advanced zero-shot classification and segmentation by utilizing natural +language for class-specific embeddings. However, most research has focused on +improving model accuracy through prompt engineering, prompt learning, or +fine-tuning with limited labeled data, thereby overlooking the importance of +refining the class descriptors. This paper introduces a new approach to +text-supervised semantic segmentation using supervision by a large language +model (LLM) that does not require extra training. Our method starts from an +LLM, like GPT-3, to generate a detailed set of subclasses for more accurate +class representation. We then employ an advanced text-supervised semantic +segmentation model to apply the generated subclasses as target labels, +resulting in diverse segmentation results tailored to each subclass's unique +characteristics. Additionally, we propose an assembly that merges the +segmentation maps from the various subclass descriptors to ensure a more +comprehensive representation of the different aspects in the test images. +Through comprehensive experiments on three standard benchmarks, our method +outperforms traditional text-supervised semantic segmentation methods by a +marked margin. + +
+
+ comment: 22 pages,10 figures, conference +
+
+
+
+
+ + ☆ DMSSN: Distilled Mixed Spectral-Spatial Network for Hyperspectral + Salient Object Detection + + +
+ Hyperspectral salient object detection (HSOD) has exhibited remarkable +promise across various applications, particularly in intricate scenarios where +conventional RGB-based approaches fall short. Despite the considerable progress +in HSOD method advancements, two critical challenges require immediate +attention. Firstly, existing hyperspectral data dimension reduction techniques +incur a loss of spectral information, which adversely affects detection +accuracy. Secondly, previous methods insufficiently harness the inherent +distinctive attributes of hyperspectral images (HSIs) during the feature +extraction process. To address these challenges, we propose a novel approach +termed the Distilled Mixed Spectral-Spatial Network (DMSSN), comprising a +Distilled Spectral Encoding process and a Mixed Spectral-Spatial Transformer +(MSST) feature extraction network. The encoding process utilizes knowledge +distillation to construct a lightweight autoencoder for dimension reduction, +striking a balance between robust encoding capabilities and low computational +costs. The MSST extracts spectral-spatial features through multiple attention +head groups, collaboratively enhancing its resistance to intricate scenarios. +Moreover, we have created a large-scale HSOD dataset, HSOD-BIT, to tackle the +issue of data scarcity in this field and meet the fundamental data requirements +of deep network training. Extensive experiments demonstrate that our proposed +DMSSN achieves state-of-the-art performance on multiple datasets. We will soon +make the code and dataset publicly available on +https://github.com/anonymous0519/HSOD-BIT. + +
+
+
+
+
+ + ☆ Learning to Rank Patches for Unbiased Image Redundancy Reduction + + +
+ Images suffer from heavy spatial redundancy because pixels in neighboring
+regions are spatially correlated. Existing approaches strive to overcome this
+limitation by reducing less meaningful image regions. However, current leading
+methods rely on supervisory signals. They may compel models to preserve content
+that aligns with labeled categories and discard content belonging to unlabeled
+categories. This categorical inductive bias makes these methods less effective
+in real-world scenarios. To address this issue, we propose a self-supervised
+framework for image redundancy reduction called Learning to Rank Patches
+(LTRP). We observe that the image reconstruction of masked image modeling
+models is sensitive to the removal of visible patches when the masking ratio is
+high (e.g., 90\%). Building upon this observation, we implement LTRP via two
+steps: inferring the semantic density score of each patch by quantifying the
+variation between reconstructions with and without this patch, and learning to
+rank the patches with the pseudo score. The entire process is self-supervised,
+thus avoiding the dilemma of categorical inductive bias. We design extensive
+experiments on different datasets and tasks. The results demonstrate that LTRP
+outperforms both supervised and other self-supervised methods due to its fair
+assessment of image content.
+
+
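+
+ The two-step ranking signal can be sketched as follows; the `reconstruct`
+interface and the squared-error density measure are assumptions for
+illustration, not the authors' exact implementation:
+
+import torch
+
+def semantic_density_scores(reconstruct, image, visible_idx):
+    """Hypothetical sketch of LTRP's scoring step: the pseudo score of a visible
+    patch is how much the masked-image-modeling reconstruction changes when that
+    single patch is dropped from the visible set (larger change = denser semantics).
+    `reconstruct(image, visible_idx)` is assumed to return reconstructed features."""
+    with torch.no_grad():
+        full_recon = reconstruct(image, visible_idx)
+        scores = []
+        for i in range(len(visible_idx)):
+            reduced = visible_idx[:i] + visible_idx[i + 1:]
+            recon_wo_i = reconstruct(image, reduced)
+            scores.append((full_recon - recon_wo_i).pow(2).mean())
+    return torch.stack(scores)  # pseudo scores used to train the ranking model
+
+# Toy usage with a stand-in reconstruction function (not a real MAE decoder).
+reconstruct = lambda img, vis: img * (len(vis) / 196.0)
+scores = semantic_density_scores(reconstruct, torch.randn(196, 768), list(range(0, 196, 2)))
+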
+
+
+
+
+ + ☆ Weak-to-Strong 3D Object Detection with X-Ray Distillation + + +
+ This paper addresses the critical challenges of sparsity and occlusion in +LiDAR-based 3D object detection. Current methods often rely on supplementary +modules or specific architectural designs, potentially limiting their +applicability to new and evolving architectures. To our knowledge, we are the +first to propose a versatile technique that seamlessly integrates into any +existing framework for 3D Object Detection, marking the first instance of +Weak-to-Strong generalization in 3D computer vision. We introduce a novel +framework, X-Ray Distillation with Object-Complete Frames, suitable for both +supervised and semi-supervised settings, that leverages the temporal aspect of +point cloud sequences. This method extracts crucial information from both +previous and subsequent LiDAR frames, creating Object-Complete frames that +represent objects from multiple viewpoints, thus addressing occlusion and +sparsity. Given the limitation of not being able to generate Object-Complete +frames during online inference, we utilize Knowledge Distillation within a +Teacher-Student framework. This technique encourages the strong Student model +to emulate the behavior of the weaker Teacher, which processes simple and +informative Object-Complete frames, effectively offering a comprehensive view +of objects as if seen through X-ray vision. Our proposed methods surpass +state-of-the-art in semi-supervised learning by 1-1.5 mAP and enhance the +performance of five established supervised models by 1-2 mAP on standard +autonomous driving datasets, even with default hyperparameters. Code for +Object-Complete frames is available here: +https://github.com/sakharok13/X-Ray-Teacher-Patching-Tools. + +
+
+ comment: Computer Vision and Pattern Recognition 2024 +
+
+
+
+
+ + ☆ OmniSDF: Scene Reconstruction using Omnidirectional Signed Distance + Functions and Adaptive Binoctrees + + +
+ We present a method to reconstruct indoor and outdoor static scene geometry +and appearance from an omnidirectional video moving in a small circular sweep. +This setting is challenging because of the small baseline and large depth +ranges, making it difficult to find ray crossings. To better constrain the +optimization, we estimate geometry as a signed distance field within a +spherical binoctree data structure and use a complementary efficient tree +traversal strategy based on a breadth-first search for sampling. Unlike regular +grids or trees, the shape of this structure well-matches the camera setting, +creating a better memory-quality trade-off. From an initial depth estimate, the +binoctree is adaptively subdivided throughout the optimization; previous +methods use a fixed depth that leaves the scene undersampled. In comparison +with three neural optimization methods and two non-neural methods, ours shows +decreased geometry error on average, especially in a detailed scene, while +significantly reducing the required number of voxels to represent such details. + +
+
+
+
+
+ + ☆ OmniLocalRF: Omnidirectional Local Radiance Fields from Dynamic Videos + + +
+ Omnidirectional cameras are extensively used in various applications to +provide a wide field of vision. However, they face a challenge in synthesizing +novel views due to the inevitable presence of dynamic objects, including the +photographer, in their wide field of view. In this paper, we introduce a new +approach called Omnidirectional Local Radiance Fields (OmniLocalRF) that can +render static-only scene views, removing and inpainting dynamic objects +simultaneously. Our approach combines the principles of local radiance fields +with the bidirectional optimization of omnidirectional rays. Our input is an +omnidirectional video, and we evaluate the mutual observations of the entire +angle between the previous and current frames. To reduce ghosting artifacts of +dynamic objects and inpaint occlusions, we devise a multi-resolution motion +mask prediction module. Unlike existing methods that primarily separate dynamic +components through the temporal domain, our method uses multi-resolution neural +feature planes for precise segmentation, which is more suitable for long +360-degree videos. Our experiments validate that OmniLocalRF outperforms +existing methods in both qualitative and quantitative metrics, especially in +scenarios with complex real-world scenes. In particular, our approach +eliminates the need for manual interaction, such as drawing motion masks by +hand and additional pose estimation, making it a highly effective and efficient +solution. + +
+
+
+
+
+ + ☆ Knowledge NeRF: Few-shot Novel View Synthesis for Dynamic Articulated + Objects + + +
+ We present Knowledge NeRF to synthesize novel views for dynamic scenes.
+Reconstructing dynamic 3D scenes from few sparse views and rendering them from
+arbitrary perspectives is a challenging problem with applications in various
+domains. Previous dynamic NeRF methods learn the deformation of articulated
+objects from monocular videos, but the quality of their reconstructed scenes is
+limited. To clearly reconstruct dynamic scenes, we propose a new framework that
+considers two frames at a time. We pretrain a NeRF model for an articulated
+object. When the articulated object moves, Knowledge NeRF learns to generate
+novel views at the new state by incorporating past knowledge from the
+pretrained NeRF model with minimal observations in the present state. We
+propose a projection module to adapt NeRF to dynamic scenes, learning the
+correspondence between the pretrained knowledge base and current states.
+Experimental results demonstrate the effectiveness of our method in
+reconstructing dynamic 3D scenes with 5 input images in one state. Knowledge
+NeRF is a new pipeline and a promising solution for novel view synthesis of
+dynamic articulated objects. The data and implementation are publicly available
+at https://github.com/RussRobin/Knowledge_NeRF.
+
+
+
+
+
+
+ + ☆ A General and Efficient Training for Transformer via Token Expansion CVPR 2024 + + +
+ The remarkable performance of Vision Transformers (ViTs) typically requires an
+extremely large training cost. Existing methods have attempted to accelerate
+the training of ViTs, yet they typically disregard method universality or
+suffer accuracy drops. Meanwhile, they break the training consistency of the
+original transformers, including the consistency of hyper-parameters,
+architecture, and strategy, which prevents them from being widely applied to
+different Transformer networks. In this paper, we propose a novel token growth
+scheme, Token Expansion (termed ToE), to achieve consistent training
+acceleration for ViTs. We introduce an "initialization-expansion-merging"
+pipeline to maintain the integrity of the intermediate feature distribution of
+the original transformers, preventing the loss of crucial learnable information
+during training. ToE can not only be seamlessly integrated into the training
+and fine-tuning process of transformers (e.g., DeiT and LV-ViT), but is also
+effective for efficient training frameworks (e.g., EfficientTrain), without
+altering the original training hyper-parameters or architecture and without
+introducing additional training strategies. Extensive experiments demonstrate
+that ToE makes ViT training about 1.3x faster in a lossless manner, or even
+yields performance gains over the full-token training baselines. Code is
+available at https://github.com/Osilly/TokenExpansion .
+
+
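+
+ A minimal sketch of a token-growth schedule in the spirit of ToE is given
+below; the linear schedule and uniform token selection are simplifying
+assumptions, not the paper's actual initialization-expansion-merging rule:
+
+import torch
+
+def expansion_keep_ratio(epoch: int, total_epochs: int,
+                         start: float = 0.25, end: float = 1.0) -> float:
+    """Hypothetical linear token-growth schedule: train on a token subset early
+    and expand toward the full sequence as training progresses."""
+    t = min(epoch / max(total_epochs - 1, 1), 1.0)
+    return start + t * (end - start)
+
+def select_tokens(tokens: torch.Tensor, keep_ratio: float) -> torch.Tensor:
+    """Keep a uniformly spaced subset of patch tokens so the intermediate
+    feature distribution stays representative of the full sequence."""
+    n = tokens.shape[1]
+    keep = max(int(n * keep_ratio), 1)
+    idx = torch.linspace(0, n - 1, keep).long()
+    return tokens[:, idx]
+
+tokens = torch.randn(4, 196, 768)  # ViT patch tokens (B, N, D)
+subset = select_tokens(tokens, expansion_keep_ratio(epoch=10, total_epochs=100))
+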
+
+ comment: Accepted to CVPR 2024. Code is available at + https://github.com/Osilly/TokenExpansion +
+
+
+
+
+ + ☆ Statistical Analysis by Semiparametric Additive Regression and LSTM-FCN + Based Hierarchical Classification for Computer Vision Quantification of + Parkinsonian Bradykinesia + + +
+ Bradykinesia, characterized by involuntary slowing or decrement of movement, +is a fundamental symptom of Parkinson's Disease (PD) and is vital for its +clinical diagnosis. Despite various methodologies explored to quantify +bradykinesia, computer vision-based approaches have shown promising results. +However, these methods often fall short in adequately addressing key +bradykinesia characteristics in repetitive limb movements: "occasional arrest" +and "decrement in amplitude." + This research advances vision-based quantification of bradykinesia by +introducing nuanced numerical analysis to capture decrement in amplitudes and +employing a simple deep learning technique, LSTM-FCN, for precise +classification of occasional arrests. Our approach structures the +classification process hierarchically, tailoring it to the unique dynamics of +bradykinesia in PD. + Statistical analysis of the extracted features, including those representing +arrest and fatigue, has demonstrated their statistical significance in most +cases. This finding underscores the importance of considering "occasional +arrest" and "decrement in amplitude" in bradykinesia quantification of limb +movement. Our enhanced diagnostic tool has been rigorously tested on an +extensive dataset comprising 1396 motion videos from 310 PD patients, achieving +an accuracy of 80.3%. The results confirm the robustness and reliability of our +method. + +
+
+
+
+
+ + ☆ Weakly-Supervised Cross-Domain Segmentation of Electron Microscopy with + Sparse Point Annotation + + +
+ Accurate segmentation of organelle instances from electron microscopy (EM)
+images plays an essential role in many neuroscience studies. However, practical
+scenarios usually suffer from high annotation costs, label scarcity, and large
+domain diversity. While unsupervised domain adaptation (UDA), which assumes no
+annotation effort on the target data, is promising for alleviating these
+challenges, its performance on complicated segmentation tasks is still far from
+practical usage. To address these issues, we investigate a highly
+annotation-efficient weak supervision, which assumes only sparse center points
+on a small subset of object instances in the target training images. To achieve
+accurate segmentation with partial point annotations, we introduce instance
+counting and center detection as auxiliary tasks and design a multitask
+learning framework to leverage correlations among counting, detection, and
+segmentation, which are all tasks with partial or no supervision. Building upon
+the different domain invariances of the three tasks, we enforce counting
+estimation with a novel soft consistency loss as a global prior for center
+detection, which further guides the per-pixel segmentation. To further
+compensate for annotation sparsity, we develop a cross-position cut-and-paste
+strategy for label augmentation and an entropy-based pseudo-label selection.
+The experimental results highlight that, by simply using extremely weak
+annotation, e.g., 15\% sparse points, for model training, the proposed model
+significantly outperforms UDA methods and produces performance comparable to
+the supervised counterpart. The high robustness of our model demonstrated in
+the validations and the low requirement of expert knowledge for sparse point
+annotation further increase the potential application value of our model.
+
+
+
+
+
+
+ + ☆ DeeDSR: Towards Real-World Image Super-Resolution via Degradation-Aware + Stable Diffusion + + +
+ Diffusion models, known for their powerful generative capabilities, play a +crucial role in addressing real-world super-resolution challenges. However, +these models often focus on improving local textures while neglecting the +impacts of global degradation, which can significantly reduce semantic fidelity +and lead to inaccurate reconstructions and suboptimal super-resolution +performance. To address this issue, we introduce a novel two-stage, +degradation-aware framework that enhances the diffusion model's ability to +recognize content and degradation in low-resolution images. In the first stage, +we employ unsupervised contrastive learning to obtain representations of image +degradations. In the second stage, we integrate a degradation-aware module into +a simplified ControlNet, enabling flexible adaptation to various degradations +based on the learned representations. Furthermore, we decompose the +degradation-aware features into global semantics and local details branches, +which are then injected into the diffusion denoising module to modulate the +target generation. Our method effectively recovers semantically precise and +photorealistic details, particularly under significant degradation conditions, +demonstrating state-of-the-art performance across various benchmarks. Codes +will be released at https://github.com/bichunyang419/DeeDSR. + +
+
+
+
+
+ + ☆ Dual DETRs for Multi-Label Temporal Action Detection CVPR 2024 + + +
+ Temporal Action Detection (TAD) aims to identify the action boundaries and +the corresponding category within untrimmed videos. Inspired by the success of +DETR in object detection, several methods have adapted the query-based +framework to the TAD task. However, these approaches primarily followed DETR to +predict actions at the instance level (i.e., identify each action by its center +point), leading to sub-optimal boundary localization. To address this issue, we +propose a new Dual-level query-based TAD framework, namely DualDETR, to detect +actions from both instance-level and boundary-level. Decoding at different +levels requires semantics of different granularity, therefore we introduce a +two-branch decoding structure. This structure builds distinctive decoding +processes for different levels, facilitating explicit capture of temporal cues +and semantics at each level. On top of the two-branch design, we present a +joint query initialization strategy to align queries from both levels. +Specifically, we leverage encoder proposals to match queries from each level in +a one-to-one manner. Then, the matched queries are initialized using position +and content prior from the matched action proposal. The aligned dual-level +queries can refine the matched proposal with complementary cues during +subsequent decoding. We evaluate DualDETR on three challenging multi-label TAD +benchmarks. The experimental results demonstrate the superior performance of +DualDETR to the existing state-of-the-art methods, achieving a substantial +improvement under det-mAP and delivering impressive results under seg-mAP. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Deep Instruction Tuning for Segment Anything Model + + +
+ The Segment Anything Model (SAM) has recently exhibited powerful and versatile
+capabilities on (un)conditional image segmentation tasks. Although SAM can
+support various segmentation prompts, we note that, compared to point- and
+box-guided segmentation, it performs much worse on text-instructed tasks. We
+argue that deep text instruction tuning is key to mitigating this shortcoming,
+which is caused by the shallow fusion scheme in its default light-weight mask
+decoder. In this paper, two \emph{deep instruction tuning} (DIT) methods are
+proposed, one end-to-end and the other layer-wise. With these tuning methods,
+we can regard the image encoder of SAM as a stand-alone vision-language learner
+instead of building another deep fusion branch. Extensive experiments on three
+highly competitive benchmark datasets for referring image segmentation show
+that a simple end-to-end DIT improves SAM by a large margin, and layer-wise DIT
+further boosts the performance to the state of the art. Our code is anonymously
+released at: https://github.com/wysnzzzz/DIT.
+
+
+
+
+
+
+ + ☆ SpiralMLP: A Lightweight Vision MLP Architecture + + +
+ We present SpiralMLP, a novel architecture that introduces a Spiral FC layer
+as a replacement for the conventional token-mixing approach. Differing from
+several existing MLP-based models that primarily emphasize axes, our Spiral FC
+layer is designed as a deformable convolution layer with spiral-like offsets.
+We further adapt Spiral FC into two variants: Self-Spiral FC and Cross-Spiral
+FC, which enable both local and global feature integration seamlessly,
+eliminating the need for additional processing steps. To thoroughly investigate
+the effectiveness of the spiral-like offsets and validate our design, we
+conduct ablation studies and explore optimal configurations. In empirical tests
+on ImageNet-1k, COCO, and ADE20K, SpiralMLP reaches state-of-the-art
+performance on par with Transformers, CNNs, and other MLPs. SpiralMLP
+maintains linear computational complexity O(HW) and is compatible with varying
+input image resolutions. Our study reveals that targeting the full receptive
+field is not essential for achieving high performance; instead, adopting a
+refined approach offers better results.
+
+
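+
+ The spiral-like offsets can be illustrated with an Archimedean-spiral
+parameterization; this form is an assumption for intuition only, not the exact
+Spiral FC design:
+
+import math
+import torch
+
+def spiral_offsets(num_points: int, max_radius: float, turns: float = 2.0) -> torch.Tensor:
+    """Hypothetical sketch of spiral-like sampling offsets: points move outward
+    along an Archimedean spiral, mixing nearby and farther tokens in one layer."""
+    t = torch.linspace(0, 1, num_points)
+    theta = 2 * math.pi * turns * t
+    r = max_radius * t
+    return torch.stack([r * torch.cos(theta), r * torch.sin(theta)], dim=-1)  # (N, 2) dy/dx pairs
+
+offsets = spiral_offsets(num_points=8, max_radius=3.0)  # would feed a deformable conv layer
+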
+
+
+
+
+ + ☆ Attire-Based Anomaly Detection in Restricted Areas Using YOLOv8 for + Enhanced CCTV Security + + +
+ This research introduces an innovative security enhancement approach, +employing advanced image analysis and soft computing. The focus is on an +intelligent surveillance system that detects unauthorized individuals in +restricted areas by analyzing attire. Traditional security measures face +challenges in monitoring unauthorized access. Leveraging YOLOv8, an advanced +object detection algorithm, our system identifies authorized personnel based on +their attire in CCTV footage. The methodology involves training the YOLOv8 +model on a comprehensive dataset of uniform patterns, ensuring precise +recognition in specific regions. Soft computing techniques enhance adaptability +to dynamic environments and varying lighting conditions. This research +contributes to image analysis and soft computing, providing a sophisticated +security solution. Emphasizing uniform-based anomaly detection, it establishes +a foundation for robust security systems in restricted areas. The outcomes +highlight the potential of YOLOv8-based surveillance in ensuring safety in +sensitive locations. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ IPT-V2: Efficient Image Processing Transformer using Hierarchical + Attentions + + +
+ Recent advances have demonstrated the powerful capability of transformer
+architectures in image restoration. However, our analysis indicates that
+existing transformer-based methods cannot establish both exact global and local
+dependencies simultaneously, which are critical for restoring the details and
+missing content of degraded images. To this end, we present an efficient image
+processing transformer architecture with hierarchical attentions, called
+IPT-V2, adopting a focal context self-attention (FCSA) and a global grid
+self-attention (GGSA) to obtain adequate token interactions in local and global
+receptive fields. Specifically, FCSA applies the shifted window mechanism to
+channel self-attention, which helps capture the local context and mutual
+interaction across channels. GGSA constructs long-range dependencies in the
+cross-window grid, aggregating global information in the spatial dimension.
+Moreover, we introduce a structural re-parameterization technique to the
+feed-forward network to further improve the model capability. Extensive
+experiments demonstrate that our proposed IPT-V2 achieves state-of-the-art
+results on various image processing tasks, covering denoising, deblurring, and
+deraining, and obtains a much better trade-off between performance and
+computational complexity than previous methods. Besides, we extend our method
+to image generation as a latent diffusion backbone, where it significantly
+outperforms DiTs.
+
+
+
+
+
+
+ + ☆ Domain Generalizable Person Search Using Unreal Dataset AAAI2024 + + +
+ Collecting and labeling real datasets to train person search networks not
+only requires a lot of time and effort, but also raises privacy issues. Weakly
+supervised and unsupervised domain adaptation methods have been proposed to
+alleviate the labeling burden for target datasets; however, their
+generalization capability is limited. We introduce a novel person search method
+based on the domain generalization framework that uses an automatically labeled
+unreal dataset only for training yet is applicable to arbitrary unseen real
+datasets. To alleviate the domain gaps when transferring knowledge from the
+unreal source dataset to real target datasets, we estimate the fidelity of
+person instances, which is then used to train the end-to-end network
+adaptively. Moreover, we devise a domain-invariant feature learning scheme to
+encourage the network to suppress domain-related features. Experimental results
+demonstrate that the proposed method provides performance competitive with
+existing person search methods even though it is applicable to arbitrary unseen
+datasets without any prior knowledge or re-training burden.
+
+
+
+ comment: AAAI2024 accepted +
+
+
+
+
+ + ☆ A Multi-Branched Radial Basis Network Approach to Predicting Complex + Chaotic Behaviours + + +
+ In this study, we propose a multi branched network approach to predict the +dynamics of a physics attractor characterized by intricate and chaotic +behavior. We introduce a unique neural network architecture comprised of Radial +Basis Function (RBF) layers combined with an attention mechanism designed to +effectively capture nonlinear inter-dependencies inherent in the attractor's +temporal evolution. Our results demonstrate successful prediction of the +attractor's trajectory across 100 predictions made using a real-world dataset +of 36,700 time-series observations encompassing approximately 28 minutes of +activity. To further illustrate the performance of our proposed technique, we +provide comprehensive visualizations depicting the attractor's original and +predicted behaviors alongside quantitative measures comparing observed versus +estimated outcomes. Overall, this work showcases the potential of advanced +machine learning algorithms in elucidating hidden structures in complex +physical systems while offering practical applications in various domains +requiring accurate short-term forecasting capabilities. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ Object-level Copy-Move Forgery Image Detection based on Inconsistency + Mining + + +
+ In copy-move tampering operations, perpetrators often employ techniques such
+as blurring to conceal tampering traces, posing significant challenges to the
+detection of object-level targets with intact structures. Focusing on these
+challenges, this paper proposes an Object-level Copy-Move Forgery Image
+Detection network based on Inconsistency Mining (IMNet). To obtain complete
+object-level targets, we customize prototypes for both the source and tampered
+regions and dynamically update them. Additionally, we extract inconsistent
+regions between the coarse similar regions obtained through self-correlation
+calculations and the regions composed of prototypes. The detected inconsistent
+regions are used as supplements to the coarse similar regions to refine
+pixel-level detection. We conduct experiments on three public datasets, which
+validate the effectiveness and robustness of the proposed IMNet.
+
+
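+
+ The self-correlation step used to obtain coarse similar regions can be
+sketched as below; the cosine-similarity form and top-k aggregation are
+illustrative assumptions rather than IMNet's exact computation:
+
+import torch
+import torch.nn.functional as F
+
+def self_correlation(feat: torch.Tensor, topk: int = 5) -> torch.Tensor:
+    """Hypothetical sketch of the self-correlation step: every spatial position
+    is compared with every other one; positions with strong non-local matches
+    form the coarse candidate copy-move regions."""
+    b, c, h, w = feat.shape
+    x = F.normalize(feat.flatten(2), dim=1)            # (B, C, HW), unit-norm channels
+    corr = torch.bmm(x.transpose(1, 2), x)             # (B, HW, HW) cosine similarity map
+    corr.diagonal(dim1=1, dim2=2).fill_(-1.0)          # ignore trivial self-matches
+    score = corr.topk(topk, dim=-1).values.mean(dim=-1)
+    return score.view(b, 1, h, w)                      # coarse similarity heat map
+
+heat = self_correlation(torch.randn(2, 64, 32, 32))
+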
+
+ comment: 4 pages, 2 figures +
+
+
+
+
+ + ☆ Weak Distribution Detectors Lead to Stronger Generalizability of + Vision-Language Prompt Tuning AAAI2024 + + +
+ We propose a generalized method for boosting the generalization ability of
+pre-trained vision-language models (VLMs) while fine-tuning on downstream
+few-shot tasks. The idea is realized by exploiting out-of-distribution (OOD)
+detection to predict whether a sample belongs to the base distribution or a
+novel distribution, and then using the score generated by a dedicated
+competition-based scoring function to fuse the zero-shot and few-shot
+classifiers. The fused classifier is dynamic: it biases towards the zero-shot
+classifier if a sample is more likely drawn from the distribution the model was
+pre-trained on, leading to improved base-to-novel generalization. Our method
+operates only at test time, so it can boost existing methods without
+time-consuming re-training. Extensive experiments show that even weak
+distribution detectors can still improve VLMs' generalization ability.
+Specifically, with the help of OOD detectors, the harmonic mean accuracies of
+CoOp and ProGrad increase by 2.6 and 1.5 percentage points, respectively, over
+11 recognition datasets in the base-to-novel setting.
+
+
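+
+ The test-time fusion of the zero-shot and few-shot classifiers can be sketched
+as a score-weighted mixture; the dedicated competition-based scoring function
+is not reproduced here, so `base_score` is a placeholder for any distribution
+detector's output:
+
+import torch
+
+def fuse_logits(zs_logits: torch.Tensor, fs_logits: torch.Tensor,
+                base_score: torch.Tensor) -> torch.Tensor:
+    """Hypothetical sketch of the fusion idea: `base_score` in [0, 1] is a
+    detector's belief that a sample comes from the base (fine-tuned)
+    distribution; high belief leans on the few-shot classifier, low belief
+    leans on the zero-shot classifier."""
+    w = base_score.unsqueeze(-1)
+    return w * fs_logits + (1.0 - w) * zs_logits
+
+zs, fs = torch.randn(8, 100), torch.randn(8, 100)
+base_score = torch.rand(8)          # e.g. derived from an OOD scoring function
+fused = fuse_logits(zs, fs, base_score)
+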
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ☆ Parameter and Data-Efficient Spectral StyleDCGAN ICLR + + +
+ We present a simple, highly parameter- and data-efficient adversarial network
+for unconditional face generation. Our method, Spectral Style-DCGAN (SSD),
+uses only 6.574 million parameters and 4739 dog faces from the Animal Faces HQ
+(AFHQ) dataset as training samples while preserving fidelity at low resolutions
+of up to 64x64. Code is available at
+https://github.com/Aryan-Garg/StyleDCGAN.
+
+
+
+ comment: Notable ICLR Tiny Paper 2024 +
+
+
+
+
+ + ☆ LAESI: Leaf Area Estimation with Synthetic Imagery + + +
+ We introduce LAESI, a Synthetic Leaf Dataset of 100,000 synthetic leaf images +on millimeter paper, each with semantic masks and surface area labels. This +dataset provides a resource for leaf morphology analysis primarily aimed at +beech and oak leaves. We evaluate the applicability of the dataset by training +machine learning models for leaf surface area prediction and semantic +segmentation, using real images for validation. Our validation shows that these +models can be trained to predict leaf surface area with a relative error not +greater than an average human annotator. LAESI also provides an efficient +framework based on 3D procedural models and generative AI for the large-scale, +controllable generation of data with potential further applications in +agriculture and biology. We evaluate the inclusion of generative AI in our +procedural data generation pipeline and show how data filtering based on +annotation consistency results in datasets which allow training the highest +performing vision models. + +
+
+ comment: 10 pages, 12 figures, 1 table +
+
+
+
+
+ + ☆ Memory-based Cross-modal Semantic Alignment Network for Radiology Report + Generation + + +
+ Generating radiology reports automatically reduces the workload of
+radiologists and aids the diagnosis of specific diseases. Many existing methods
+treat this task as a modality transfer process. However, since the key
+information related to disease accounts for a small proportion of both the
+image and the report, it is hard for the model to learn the latent relation
+between the radiology image and its report, and it thus fails to generate
+fluent and accurate radiology reports. To tackle this problem, we propose a
+memory-based cross-modal semantic alignment model (MCSAM) following an
+encoder-decoder paradigm. MCSAM includes a well-initialized long-term clinical
+memory bank to learn disease-related representations as well as prior knowledge
+for different modalities, retrieving this memory to perform feature
+consolidation. To ensure the semantic consistency of the retrieved cross-modal
+prior knowledge, a cross-modal semantic alignment module (SAM) is proposed. SAM
+is also able to generate semantic visual feature embeddings which can be added
+to the decoder to benefit report generation. More importantly, to memorize the
+state and additional information while generating reports with the decoder, we
+use learnable memory tokens, which can be seen as prompts. Extensive
+experiments demonstrate the promising performance of our proposed method, which
+achieves state-of-the-art results on the MIMIC-CXR dataset.
+
+
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ M3D: Advancing 3D Medical Image Analysis with Multi-Modal Large Language + Models + + +
+ Medical image analysis is essential to clinical diagnosis and treatment, +which is increasingly supported by multi-modal large language models (MLLMs). +However, previous research has primarily focused on 2D medical images, leaving +3D images under-explored, despite their richer spatial information. This paper +aims to advance 3D medical image analysis with MLLMs. To this end, we present a +large-scale 3D multi-modal medical dataset, M3D-Data, comprising 120K +image-text pairs and 662K instruction-response pairs specifically tailored for +various 3D medical tasks, such as image-text retrieval, report generation, +visual question answering, positioning, and segmentation. Additionally, we +propose M3D-LaMed, a versatile multi-modal large language model for 3D medical +image analysis. Furthermore, we introduce a new 3D multi-modal medical +benchmark, M3D-Bench, which facilitates automatic evaluation across eight +tasks. Through comprehensive evaluation, our method proves to be a robust model +for 3D medical image analysis, outperforming existing solutions. All code, +data, and models are publicly available at: https://github.com/BAAI-DCAI/M3D. + +
+
+ comment: MLLM, 3D medical image analysis +
+
+
+
+
+ + ☆ Automated Bi-Fold Weighted Ensemble Algorithms and its Application to + Brain Tumor Detection and Classification + + +
+ The uncontrolled and unstructured growth of brain cells, known as a brain
+tumor, has one of the highest mortality rates among all types of cancer. Due to
+limited diagnostic and treatment capabilities, brain tumors pose significant
+challenges, especially in third-world countries. Early diagnosis plays a vital
+role in effectively managing brain tumors and reducing mortality rates.
+However, the availability of diagnostic methods is hindered by various
+limitations, including high costs and lengthy result acquisition times,
+impeding early detection of the disease. In this study, we present two
+cutting-edge bi-fold weighted voting ensemble models that aim to boost the
+effectiveness of weighted ensemble methods. The two proposed methods combine
+the classification outcomes from multiple classifiers and determine the optimal
+result by selecting the outcome with the highest probability in the first
+approach, and the highest weighted prediction in the second. These approaches
+significantly improve the overall performance of weighted ensemble techniques.
+In the first proposed method, we improve the soft voting technique (SVT) by
+introducing a novel unsupervised weight calculating schema (UWCS) to enhance
+its weight assigning capability, yielding the extended soft voting technique
+(ESVT). Secondly, we propose a novel weighted method (NWM) that uses the
+proposed UWCS. Both of our approaches incorporate three distinct models: a
+custom-built CNN, VGG-16, and InceptionResNetV2, each trained on publicly
+available datasets. The effectiveness of our proposed systems is evaluated
+through blind testing, where exceptional results are achieved. We then provide
+a comparative analysis of our proposed methods against SVT to show their
+superiority and effectiveness.
+
+
+
+
+
+
+ + ☆ Exploiting Inter-sample and Inter-feature Relations in Dataset + Distillation CVPR 2024 + + +
+ Dataset distillation has emerged as a promising approach in deep learning,
+enabling efficient training with small synthetic datasets derived from larger
+real ones. In particular, distribution-matching-based distillation methods have
+attracted attention thanks to their effectiveness and low computational cost.
+However, these methods face two primary limitations: the dispersed feature
+distribution within the same class in synthetic datasets, reducing class
+discrimination, and an exclusive focus on mean feature consistency, lacking
+precision and comprehensiveness. To address these challenges, we introduce two
+novel constraints: a class centralization constraint and a covariance matching
+constraint. The class centralization constraint aims to enhance class
+discrimination by more closely clustering samples within classes. The
+covariance matching constraint seeks to achieve more accurate feature
+distribution matching between real and synthetic datasets through local feature
+covariance matrices, which is particularly beneficial when sample sizes are
+much smaller than the number of features. Experiments demonstrate notable
+improvements with these constraints, yielding performance boosts of up to 6.6%
+on CIFAR10, 2.9% on SVHN, 2.5% on CIFAR100, and 2.5% on TinyImageNet, compared
+to the state-of-the-art relevant methods. In addition, our method maintains
+robust performance in cross-architecture settings, with a maximum performance
+drop of 1.7% across four architectures. Code is available at
+https://github.com/VincenDen/IID.
+
+
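+
+ A minimal sketch of the two constraints follows, under assumed feature shapes
+and with a plain (rather than local) covariance for brevity; it is not the
+paper's exact formulation:
+
+import torch
+
+def class_centralization_loss(feats: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
+    """Hypothetical sketch: pull synthetic features toward their class mean so
+    each class stays compact in feature space."""
+    loss = feats.new_zeros(())
+    for c in labels.unique():
+        f = feats[labels == c]
+        loss = loss + (f - f.mean(dim=0, keepdim=True)).pow(2).sum(dim=1).mean()
+    return loss / labels.unique().numel()
+
+def covariance_matching_loss(real: torch.Tensor, syn: torch.Tensor) -> torch.Tensor:
+    """Hypothetical sketch: match the feature covariance of real and synthetic
+    samples, complementing the usual mean-feature matching."""
+    def cov(x):
+        x = x - x.mean(dim=0, keepdim=True)
+        return x.t() @ x / max(x.shape[0] - 1, 1)
+    return (cov(real) - cov(syn)).pow(2).mean()
+
+real, syn = torch.randn(256, 64), torch.randn(32, 64, requires_grad=True)
+labels_syn = torch.randint(0, 10, (32,))
+loss = class_centralization_loss(syn, labels_syn) + covariance_matching_loss(real, syn)
+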
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ GAN with Skip Patch Discriminator for Biological Electron Microscopy + Image Generation + + +
+ Generating realistic electron microscopy (EM) images has been a challenging
+problem due to their complex global and local structures. Isola et al. proposed
+pix2pix, a conditional Generative Adversarial Network (GAN), for general
+image-to-image translation, but it fails to generate realistic EM images. We
+propose a new architecture for the discriminator in the GAN that provides
+access to multiple patch sizes using skip patches, generating realistic EM
+images.
+
+
+
+ comment: 4 pages, International Conference on Computational and Mathematical + Biomedical Engineering +
+
+
+
+
+ + ☆ Comparison of Methods in Human Skin Decomposition + + +
+ Decomposition of skin pigment plays an important role in medical fields. +Human skin can be decomposed into two primitive components, hemoglobin and +melanin. It is our goal to apply these results for diagnosis of skin cancer. In +this paper, various methods for skin pigment decomposition are reviewed +comparatively and the performance of each method is evaluated both +theoretically and experimentally. In addition, isometric feature mapping +(Isomap) is introduced in order to improve the dimensionality reduction +performance in context of skin decomposition. + +
+
+ comment: 4 pages, 7 figures +
+
+
+
+
+ + ☆ Pneumonia App: a mobile application for efficient pediatric pneumonia + diagnosis using explainable convolutional neural networks (CNN) + + +
+ Mycoplasma pneumoniae pneumonia (MPP) poses significant diagnostic challenges +in pediatric healthcare, especially in regions like China where it's prevalent. +We introduce PneumoniaAPP, a mobile application leveraging deep learning +techniques for rapid MPP detection. Our approach capitalizes on convolutional +neural networks (CNNs) trained on a comprehensive dataset comprising 3345 chest +X-ray (CXR) images, which includes 833 CXR images revealing MPP and +additionally augmented with samples from a public dataset. The CNN model +achieved an accuracy of 88.20% and an AUROC of 0.9218 across all classes, with +a specific accuracy of 97.64% for the mycoplasma class, as demonstrated on the +testing dataset. Furthermore, we integrated explainability techniques into +PneumoniaAPP to aid respiratory physicians in lung opacity localization. Our +contribution extends beyond existing research by targeting pediatric MPP, +emphasizing the age group of 0-12 years, and prioritizing deployment on mobile +devices. This work signifies a significant advancement in pediatric pneumonia +diagnosis, offering a reliable and accessible tool to alleviate diagnostic +burdens in healthcare settings. + +
+
+ comment: 27 Pages,7 figures +
+
+
+
+
+ + ☆ Denoising Distillation Makes Event-Frame Transformers as Accurate Gaze + Trackers + + +
+ This paper tackles the problem of passive gaze estimation using both event and
+frame data. Given the inherently different physiological structures across
+subjects, it is intractable to accurately estimate gaze purely from a single
+given state. We therefore reformulate gaze estimation as the quantification of
+state transitions from the current state to several previously registered
+anchor states. Technically, we propose a two-stage learning-based gaze
+estimation framework that divides the whole gaze estimation process into a
+coarse-to-fine procedure of anchor state selection followed by final gaze
+localization. Moreover, to improve generalization ability, we align a group of
+local experts with a student network, where a novel denoising distillation
+algorithm uses denoising diffusion to iteratively remove the inherent noise of
+event data. Extensive experiments demonstrate the effectiveness of the proposed
+method, which surpasses state-of-the-art methods by a large margin of 15$\%$.
+The code will be publicly available at
+https://github.com/jdjdli/Denoise_distill_EF_gazetracker.
+
+
+
+
+
+
+ + ☆ On the Estimation of Image-matching Uncertainty in Visual Place + Recognition CVPR + + +
+ In Visual Place Recognition (VPR), the pose of a query image is estimated by
+comparing the image to a map of reference images with known reference poses. As
+is typical for image retrieval problems, a feature extractor maps the query and
+reference images to a feature space, where a nearest neighbor search is then
+performed. However, until recently little attention had been given to
+quantifying the confidence that a retrieved reference image is a correct match.
+Highly certain but incorrect retrieval can lead to catastrophic failure of
+VPR-based localization pipelines. This work compares for the first time the
+main approaches for estimating image-matching uncertainty, including the
+traditional retrieval-based uncertainty estimation, more recent data-driven
+aleatoric uncertainty estimation, and the compute-intensive geometric
+verification. We further formulate a simple baseline method, ``SUE'', which
+unlike the other methods considers the freely available poses of the reference
+images in the map. Our experiments reveal that a simple L2 distance between the
+query and reference descriptors is already a better estimate of image-matching
+uncertainty than current data-driven approaches. SUE outperforms the other
+efficient uncertainty estimation methods, and its uncertainty estimates
+complement the computationally expensive geometric verification approach.
+Future work on uncertainty estimation in VPR should consider the baselines
+discussed here.
+
+
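+
+ The L2-distance baseline highlighted above, together with a hypothetical
+pose-aware variant in the spirit of SUE (not its exact formulation), can be
+sketched as:
+
+import numpy as np
+
+def l2_uncertainty(query_desc: np.ndarray, ref_descs: np.ndarray) -> float:
+    """Baseline from the abstract: the L2 distance between the query descriptor
+    and its nearest reference descriptor already estimates matching uncertainty."""
+    d = np.linalg.norm(ref_descs - query_desc, axis=1)
+    return float(d.min())
+
+def pose_spread_uncertainty(query_desc: np.ndarray, ref_descs: np.ndarray,
+                            ref_poses: np.ndarray, k: int = 5) -> float:
+    """Hypothetical pose-aware sketch: if the top-k visually similar references
+    are spread far apart in the map, the retrieved pose is likely unreliable."""
+    d = np.linalg.norm(ref_descs - query_desc, axis=1)
+    topk = np.argsort(d)[:k]
+    spread = np.linalg.norm(ref_poses[topk] - ref_poses[topk].mean(axis=0), axis=1)
+    return float(spread.mean())
+
+query = np.random.randn(256)
+refs, poses = np.random.randn(1000, 256), np.random.randn(1000, 2)
+u1, u2 = l2_uncertainty(query, refs), pose_spread_uncertainty(query, refs, poses)
+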
+
+ comment: To appear in the proceedings of the IEEE/CVF Conference on Computer + Vision and Pattern Recognition (CVPR) 2024 +
+
+
+
+
+ + ☆ Deep Extrinsic Manifold Representation for Vision Tasks + + +
+ Non-Euclidean data is frequently encountered across different fields, yet
+there is limited literature that addresses the fundamental challenge of
+training neural networks whose outputs are manifold representations. We
+introduce a technique named Deep Extrinsic Manifold Representation (DEMR) for
+visual tasks in this context. DEMR incorporates extrinsic manifold embedding
+into deep neural networks, which helps generate manifold representations. The
+DEMR approach does not directly optimize the complex geodesic loss. Instead, it
+optimizes the computation graph within the embedded Euclidean space, allowing
+adaptability to various architectural requirements. We provide empirical
+evidence supporting the proposed concept on two types of manifolds, $SE(3)$ and
+its associated quotient manifolds, together with theoretical assurances
+regarding feasibility, asymptotic properties, and generalization capability.
+The experimental results show that DEMR effectively adapts to point cloud
+alignment, producing outputs in $SE(3)$, as well as to illumination subspace
+learning with outputs on the Grassmann manifold.
+
+
+
+
+
+
+ + ☆ Embodied Active Defense: Leveraging Recurrent Feedback to Counter + Adversarial Patches + + +
+ The vulnerability of deep neural networks to adversarial patches has motivated
+numerous defense strategies for boosting model robustness. However, the
+prevailing defenses depend on a single observation or pre-established adversary
+information to counter adversarial patches, often failing against unseen or
+adaptive adversarial attacks and exhibiting unsatisfactory performance in
+dynamic 3D environments. Inspired by active human perception and recurrent
+feedback mechanisms, we develop Embodied Active Defense (EAD), a proactive
+defensive strategy that actively contextualizes environmental information to
+address misaligned adversarial patches in 3D real-world settings. To achieve
+this, EAD develops two central recurrent sub-modules, i.e., a perception module
+and a policy module, to implement two critical functions of active vision.
+These modules recurrently process a series of beliefs and observations,
+facilitating progressive refinement of their comprehension of the target object
+and enabling the development of strategic actions to counter adversarial
+patches in 3D environments. To optimize learning efficiency, we incorporate a
+differentiable approximation of environmental dynamics and deploy patches that
+are agnostic to the adversary's strategies. Extensive experiments demonstrate
+that EAD substantially enhances robustness against a variety of patches within
+just a few steps through its action policy in safety-critical tasks (e.g., face
+recognition and object detection), without compromising standard accuracy.
+Furthermore, owing to its attack-agnostic characteristic, EAD generalizes well
+to unseen attacks, reducing the average attack success rate by 95 percent
+across a range of unseen adversarial attacks.
+
+
+
+ comment: 27pages +
+
+
+
+
+ + ☆ LLMs are Good Action Recognizers CVPR 2024 + + +
+ Skeleton-based action recognition has attracted lots of research attention. +Recently, to build an accurate skeleton-based action recognizer, a variety of +works have been proposed. Among them, some works use large model architectures +as backbones of their recognizers to boost the skeleton data representation +capability, while some other works pre-train their recognizers on external data +to enrich the knowledge. In this work, we observe that large language models +which have been extensively used in various natural language processing tasks +generally hold both large model architectures and rich implicit knowledge. +Motivated by this, we propose a novel LLM-AR framework, in which we investigate +treating the Large Language Model as an Action Recognizer. In our framework, we +propose a linguistic projection process to project each input action signal +(i.e., each skeleton sequence) into its ``sentence format'' (i.e., an ``action +sentence''). Moreover, we also incorporate our framework with several designs +to further facilitate this linguistic projection process. Extensive experiments +demonstrate the efficacy of our proposed framework. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ TexVocab: Texture Vocabulary-conditioned Human Avatars + + +
+ To adequately utilize the available image evidence in multi-view video-based +avatar modeling, we propose TexVocab, a novel avatar representation that +constructs a texture vocabulary and associates body poses with texture maps for +animation. Given multi-view RGB videos, our method initially back-projects all +the available images in the training videos to the posed SMPL surface, +producing texture maps in the SMPL UV domain. Then we construct pairs of human +poses and texture maps to establish a texture vocabulary for encoding dynamic +human appearances under various poses. Unlike the commonly used joint-wise +manner, we further design a body-part-wise encoding strategy to learn the +structural effects of the kinematic chain. Given a driving pose, we query the +pose feature hierarchically by decomposing the pose vector into several body +parts and interpolating the texture features for synthesizing fine-grained +human dynamics. Overall, our method is able to create animatable human avatars +with detailed and dynamic appearances from RGB videos, and the experiments show +that our method outperforms state-of-the-art approaches. The project page can +be found at https://texvocab.github.io/. + +
+
+
+
+
+ + ☆ Transformer based Pluralistic Image Completion with Reduced Information + Loss + + +
+ Transformer based methods have achieved great success in image inpainting +recently. However, we find that these solutions regard each pixel as a token, +thus suffering from an information loss issue from two aspects: 1) They +downsample the input image into much lower resolutions for efficiency +consideration. 2) They quantize $256^3$ RGB values to a small number (such as +512) of quantized color values. The indices of quantized pixels are used as +tokens for the inputs and prediction targets of the transformer. To mitigate +these issues, we propose a new transformer based framework called "PUT". +Specifically, to avoid input downsampling while maintaining computation +efficiency, we design a patch-based auto-encoder P-VQVAE. The encoder converts +the masked image into non-overlapped patch tokens and the decoder recovers the +masked regions from the inpainted tokens while keeping the unmasked regions +unchanged. To eliminate the information loss caused by input quantization, an +Un-quantized Transformer is applied. It directly takes features from the +P-VQVAE encoder as input without any quantization and only regards the +quantized tokens as prediction targets. Furthermore, to make the inpainting +process more controllable, we introduce semantic and structural conditions as +extra guidance. Extensive experiments show that our method greatly outperforms +existing transformer based methods on image fidelity and achieves much higher +diversity and better fidelity than state-of-the-art pluralistic inpainting +methods on complex large-scale datasets (e.g., ImageNet). Codes are available +at https://github.com/liuqk3/PUT. + +
+
+ comment: Accepted by TPAMI (2024) +
+
+
+
+
+ + ☆ Denoising Low-dose Images Using Deep Learning of Time Series Images + + +
+ Digital imaging devices have been widely applied in many fields, including
+scientific imaging, recognition of individuals, and remote sensing. As these
+imaging technologies are applied to autonomous driving and measurement, the
+image noise generated when observation cannot be performed with a sufficient
+dose has become a major problem. Machine learning denoising is expected to
+solve this problem, but it introduces issues of its own. Here we report the
+artifacts generated by machine learning denoising in ultra-low-dose
+observation, using an in-situ observation video from an electron microscope as
+an example. To solve this problem, we propose decomposing a time-series image
+sequence into 2D images spanning one spatial axis and time, and performing
+machine learning denoising on these slices. Our method opens new avenues for
+accurate and stable reconstruction of continuous high-resolution images from
+low-dose imaging in science, industry, and life.
+
+
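+
+ A minimal sketch of the proposed decomposition, with the slicing axis chosen
+arbitrarily for illustration (the paper's exact axis convention is not
+specified here):
+
+import numpy as np
+
+def to_xt_slices(video: np.ndarray) -> np.ndarray:
+    """Sketch of the decomposition idea: turn a (T, H, W) low-dose image
+    sequence into H two-dimensional x-t slices, so a 2D denoiser sees the
+    temporal evolution of each scan line."""
+    return video.transpose(1, 0, 2)          # (H, T, W): one x-t image per row y
+
+def from_xt_slices(slices: np.ndarray) -> np.ndarray:
+    """Inverse reshape back to the original (T, H, W) video after denoising."""
+    return slices.transpose(1, 0, 2)
+
+video = np.random.poisson(2.0, size=(300, 128, 128)).astype(np.float32)
+denoised = from_xt_slices(to_xt_slices(video))   # plug a 2D denoiser in between
+assert denoised.shape == video.shape
+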
+
+
+
+
+ + ☆ DailyMAE: Towards Pretraining Masked Autoencoders in One Day + + +
+ Recently, masked image modeling (MIM), an important self-supervised learning
+(SSL) method, has drawn attention for its effectiveness in learning data
+representations from unlabeled data. Numerous studies underscore the advantages
+of MIM, highlighting how models pretrained on extensive datasets can enhance
+the performance of downstream tasks. However, the high computational demands of
+pretraining pose significant challenges, particularly within academic
+environments, thereby impeding progress in SSL research. In this study, we
+propose efficient training recipes for MIM-based SSL that focus on mitigating
+data-loading bottlenecks and employ progressive training techniques and other
+tricks to closely maintain pretraining performance. Our library enables the
+training of a MAE-Base/16 model on the ImageNet-1K dataset for 800 epochs
+within just 18 hours, using a single machine equipped with 8 A100 GPUs. By
+achieving speed gains of up to 5.8 times, this work not only demonstrates the
+feasibility of conducting high-efficiency SSL training but also paves the way
+for broader accessibility and promotes advancement in SSL research,
+particularly for prototyping and initial testing of SSL ideas. The code is
+available at https://github.com/erow/FastSSL.
+
+
+
+
+
+
+ + ☆ NYC-Indoor-VPR: A Long-Term Indoor Visual Place Recognition Dataset with + Semi-Automatic Annotation ICRA 2024 + + +
+ Visual Place Recognition (VPR) in indoor environments benefits humans and
+robots through better localization and navigation. It is challenging due to
+appearance changes at various frequencies and the difficulty of obtaining
+ground-truth metric trajectories for training and evaluation. This paper
+introduces the NYC-Indoor-VPR dataset, a unique and rich collection of over
+36,000 images compiled from 13 distinct crowded scenes in New York City, taken
+under varying lighting conditions with appearance changes. Each scene has
+multiple revisits across a year. To establish the ground truth for VPR, we
+propose a semi-automatic annotation approach that computes the positional
+information of each image. Our method takes pairs of videos as input and yields
+matched pairs of images along with their estimated relative locations. The
+accuracy of this matching is refined by human annotators, who use our
+annotation software to correlate the selected keyframes. Finally, we present a
+benchmark evaluation of several state-of-the-art VPR algorithms using our
+annotated dataset, revealing its difficulty and thus its value for VPR
+research.
+
+
+
+ comment: 7 pages, 7 figures, published in 2024 IEEE International Conference + on Robotics and Automation (ICRA 2024) +
+
+
+
+
+ + ♻ ☆ Language-only Efficient Training of Zero-shot Composed Image Retrieval CVPR 2024 + + +
+ The composed image retrieval (CIR) task takes a composed query of image and
+text, aiming to search for images relevant to both conditions. Conventional CIR
+approaches need a training dataset composed of triplets of query image, query
+text, and target image, which is very expensive to collect. Several recent
+works have explored the zero-shot (ZS) CIR paradigm to tackle the issue without
+using pre-collected triplets. However, the existing ZS-CIR methods show limited
+backbone scalability and generalizability due to the lack of diversity of the
+input texts during training. We propose a novel CIR framework that uses only
+language for its training. Our LinCIR (Language-only training for CIR) can be
+trained only with text datasets by a novel self-supervision named self-masking
+projection (SMP). We project the text latent embedding to the token embedding
+space and construct a new text by replacing the keyword tokens of the original
+text. Then, we let the new and original texts have the same latent embedding
+vector. With this simple strategy, LinCIR is surprisingly efficient and highly
+effective; LinCIR with a CLIP ViT-G backbone is trained in 48 minutes and shows
+the best ZS-CIR performance on four different CIR benchmarks, CIRCO, GeneCIS,
+FashionIQ, and CIRR, even outperforming a supervised method on FashionIQ. Code
+is available at https://github.com/navervision/lincir
+
+
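+
+ A minimal sketch of the self-masking projection objective follows, with
+stand-in modules where the real method uses a CLIP text encoder; the interface
+and the cosine loss are assumptions for illustration:
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+def smp_loss(text_encoder, projector: nn.Module,
+             token_embs: torch.Tensor, keyword_mask: torch.Tensor) -> torch.Tensor:
+    """Hypothetical sketch of self-masking projection: the sentence latent is
+    projected into the token-embedding space and written over the keyword token
+    positions; the edited sentence should still encode to the same latent as
+    the original one."""
+    z = text_encoder(token_embs)                       # (B, D) original latent
+    proj = projector(z).unsqueeze(1)                   # (B, 1, D_tok)
+    mask = keyword_mask.unsqueeze(-1).float()
+    edited = token_embs * (1 - mask) + proj * mask     # replace keyword tokens
+    z_edited = text_encoder(edited)
+    return 1.0 - F.cosine_similarity(z, z_edited, dim=-1).mean()
+
+# Toy usage with stand-in modules (LinCIR itself uses a CLIP text encoder).
+enc = lambda e: e.mean(dim=1)                          # stand-in "text encoder"
+projector = nn.Linear(512, 512)
+embs, kw = torch.randn(4, 16, 512), torch.zeros(4, 16, dtype=torch.bool)
+kw[:, 3] = True                                        # pretend token 3 is the keyword
+loss = smp_loss(enc, projector, embs, kw)
+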
+
+ comment: CVPR 2024 camera-ready; First two authors contributed equally; 17 + pages, 3.1MB +
+
+
+
+
+ + ♻ ☆ Handling The Non-Smooth Challenge in Tensor SVD: A Multi-Objective + Tensor Recovery Framework + + +
+ Recently, numerous tensor singular value decomposition (t-SVD)-based tensor recovery methods have shown promise in processing visual data, such as color images and videos. However, these methods often suffer from severe performance degradation when confronted with tensor data exhibiting non-smooth changes, which are commonly observed in real-world scenarios but ignored by traditional t-SVD-based methods. In this work, we introduce a novel tensor recovery model with a learnable tensor nuclear norm to address this challenge. We develop a new optimization algorithm named the Alternating Proximal Multiplier Method (APMM) to iteratively solve the proposed tensor completion model. Theoretical analysis demonstrates the convergence of the proposed APMM to the Karush-Kuhn-Tucker (KKT) point of the optimization problem. In addition, we propose a multi-objective tensor recovery framework based on APMM to efficiently explore the correlations of tensor data across its various dimensions, providing a new perspective on extending the t-SVD-based method to higher-order tensor cases. Numerical experiments demonstrate the effectiveness of the proposed method in tensor completion.
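For context, t-SVD-based methods typically measure low-rankness with a tensor nuclear norm computed in the Fourier domain along the third mode. The sketch below shows that generic computation; it is not the learnable nuclear norm or the APMM solver proposed above.

```python
import numpy as np

def tensor_nuclear_norm(x):
    """Tensor nuclear norm of a (n1, n2, n3) tensor under the t-SVD framework."""
    xf = np.fft.fft(x, axis=2)                        # FFT along the third (tube) mode
    total = 0.0
    for k in range(x.shape[2]):                       # SVD of each frontal slice
        s = np.linalg.svd(xf[:, :, k], compute_uv=False)
        total += s.sum()
    return total / x.shape[2]                         # common 1/n3 normalization

clip = np.random.rand(32, 32, 10)                     # e.g. a small grayscale video clip
print(tensor_nuclear_norm(clip))
```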
+
+
+
+
+ + ♻ ☆ ElasticDiffusion: Training-free Arbitrary Size Image Generation through + Global-Local Content Separation CVPR 2024 + + +
+ Diffusion models have revolutionized image generation in recent years, yet +they are still limited to a few sizes and aspect ratios. We propose +ElasticDiffusion, a novel training-free decoding method that enables pretrained +text-to-image diffusion models to generate images with various sizes. +ElasticDiffusion attempts to decouple the generation trajectory of a pretrained +model into local and global signals. The local signal controls low-level pixel +information and can be estimated on local patches, while the global signal is +used to maintain overall structural consistency and is estimated with a +reference image. We test our method on CelebA-HQ (faces) and LAION-COCO +(objects/indoor/outdoor scenes). Our experiments and qualitative results show +superior image coherence quality across aspect ratios compared to +MultiDiffusion and the standard decoding strategy of Stable Diffusion. Project +page: https://elasticdiffusion.github.io/ + +
+
+ comment: Accepted at CVPR 2024. Project Page: + https://elasticdiffusion.github.io/ +
+
+
+
+
+ + ♻ ☆ CLRmatchNet: Enhancing Curved Lane Detection with Deep Matching Process + + +
+ Lane detection plays a crucial role in autonomous driving by providing vital data to ensure safe navigation. Modern algorithms rely on anchor-based detectors followed by a label-assignment process that categorizes training detections as positive or negative instances based on learned geometric attributes. Accurate label assignment has a great impact on model performance, yet it usually relies on a predefined classical cost function evaluating GT-prediction alignment. However, classical label assignment methods face limitations due to their reliance on predefined cost functions derived from low-dimensional models, potentially impacting their optimality. Our research introduces MatchNet, a deep learning submodule-based approach aimed at improving the label assignment process. Integrated into a state-of-the-art lane detection network such as the Cross Layer Refinement Network for Lane Detection (CLRNet), MatchNet replaces the conventional label assignment process with a submodule network. The integrated model, CLRmatchNet, surpasses CLRNet, showing substantial improvements in scenarios involving curved lanes across all backbones: +2.8% for ResNet34, +2.3% for ResNet101, and +2.96% for DLA34. In addition, it maintains or even improves results in the remaining scenarios. Our method boosts the confidence level in lane detection, allowing an increase in the confidence threshold. Our code is available at: https://github.com/sapirkontente/CLRmatchNet.git
+
+
+
+
+ + ♻ ☆ DiverseNet: Decision Diversified Semi-supervised Semantic Segmentation + Networks for Remote Sensing Imagery + + +
+ Semi-supervised learning aims to help reduce the cost of the manual labelling process by leveraging valuable features extracted from a substantial pool of unlabeled data alongside a limited set of labelled data during the training phase. Since pixel-level manual labelling of large-scale remote sensing imagery is expensive, semi-supervised learning becomes an appropriate solution. However, most of the existing consistency learning frameworks based on network perturbation are very bulky. There is still a lack of lightweight and efficient perturbation methods to promote the diversity of features and the precision of pseudo labels during training. In order to fill this gap, we propose DiverseNet, which explores multi-head and multi-model semi-supervised learning algorithms by simultaneously enhancing precision and diversity during training. The two proposed methods in the DiverseNet family, namely DiverseHead and DiverseModel, both achieve better semantic segmentation performance on four widely used remote sensing imagery datasets than state-of-the-art semi-supervised learning methods. Meanwhile, the proposed DiverseHead architecture is simple and relatively lightweight in terms of parameter space compared to the state-of-the-art methods whilst reaching high performance on all the tested datasets.
+
+
+
+
+ + ♻ ☆ Faster ISNet for Background Bias Mitigation on Deep Neural Networks + + +
+ Bias or spurious correlations in image backgrounds can impact neural +networks, causing shortcut learning (Clever Hans Effect) and hampering +generalization to real-world data. ISNet, a recently introduced architecture, +proposed the optimization of Layer-Wise Relevance Propagation (LRP, an +explanation technique) heatmaps, to mitigate the influence of backgrounds on +deep classifiers. However, ISNet's training time scales linearly with the +number of classes in an application. Here, we propose reformulated +architectures whose training time becomes independent from this number. +Additionally, we introduce a concise and model-agnostic LRP implementation. We +challenge the proposed architectures using synthetic background bias, and +COVID-19 detection in chest X-rays, an application that commonly presents +background bias. The networks hindered background attention and shortcut +learning, surpassing multiple state-of-the-art models on out-of-distribution +test datasets. Representing a potentially massive training speed improvement +over ISNet, the proposed architectures introduce LRP optimization into a gamut +of applications that the original model cannot feasibly handle. + +
+
+
+
+
+ + ♻ ☆ Open3DIS: Open-Vocabulary 3D Instance Segmentation with 2D Mask Guidance CVPR 2024 + + +
+ We introduce Open3DIS, a novel solution designed to tackle the problem of +Open-Vocabulary Instance Segmentation within 3D scenes. Objects within 3D +environments exhibit diverse shapes, scales, and colors, making precise +instance-level identification a challenging task. Recent advancements in +Open-Vocabulary scene understanding have made significant strides in this area +by employing class-agnostic 3D instance proposal networks for object +localization and learning queryable features for each 3D mask. While these +methods produce high-quality instance proposals, they struggle with identifying +small-scale and geometrically ambiguous objects. The key idea of our method is +a new module that aggregates 2D instance masks across frames and maps them to +geometrically coherent point cloud regions as high-quality object proposals +addressing the above limitations. These are then combined with 3D +class-agnostic instance proposals to include a wide range of objects in the +real world. To validate our approach, we conducted experiments on three +prominent datasets, including ScanNet200, S3DIS, and Replica, demonstrating +significant performance gains in segmenting objects with diverse categories +over the state-of-the-art approaches. + +
+
+ comment: CVPR 2024. Project page: https://open3dis.github.io/ +
+
+
+
+
+ + ♻ ☆ Object Recognition as Next Token Prediction CVPR 2024 + + +
+ We present an approach to pose object recognition as next token prediction. +The idea is to apply a language decoder that auto-regressively predicts the +text tokens from image embeddings to form labels. To ground this prediction +process in auto-regression, we customize a non-causal attention mask for the +decoder, incorporating two key features: modeling tokens from different labels +to be independent, and treating image tokens as a prefix. This masking +mechanism inspires an efficient method - one-shot sampling - to simultaneously +sample tokens of multiple labels in parallel and rank generated labels by their +probabilities during inference. To further enhance the efficiency, we propose a +simple strategy to construct a compact decoder by simply discarding the +intermediate blocks of a pretrained language model. This approach yields a +decoder that matches the full model's performance while being notably more +efficient. The code is available at https://github.com/kaiyuyue/nxtp + +
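A hedged sketch of the kind of attention mask described above: image tokens form a fully visible prefix, and each label's tokens attend only to that prefix and, causally, to earlier tokens of the same label, so labels stay mutually independent. Token counts here are made up, and the exact mask in the paper may differ.

```python
import torch

def build_label_mask(num_image_tokens, label_lengths):
    """Boolean attention mask: True means attention is permitted."""
    total = num_image_tokens + sum(label_lengths)
    allow = torch.zeros(total, total, dtype=torch.bool)

    # Image prefix: bidirectional attention among image tokens.
    allow[:num_image_tokens, :num_image_tokens] = True

    start = num_image_tokens
    for length in label_lengths:
        end = start + length
        allow[start:end, :num_image_tokens] = True                # every label sees the prefix
        allow[start:end, start:end] = torch.tril(                 # causal within one label
            torch.ones(length, length)).bool()
        start = end                                               # labels never see each other
    return allow

mask = build_label_mask(num_image_tokens=4, label_lengths=[3, 2])
print(mask.int())
```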
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ DriveVLM: The Convergence of Autonomous Driving and Large + Vision-Language Models + + +
+ A primary hurdle of autonomous driving in urban environments is understanding +complex and long-tail scenarios, such as challenging road conditions and +delicate human behaviors. We introduce DriveVLM, an autonomous driving system +leveraging Vision-Language Models (VLMs) for enhanced scene understanding and +planning capabilities. DriveVLM integrates a unique combination of +chain-of-thought (CoT) modules for scene description, scene analysis, and +hierarchical planning. Furthermore, recognizing the limitations of VLMs in +spatial reasoning and heavy computational requirements, we propose +DriveVLM-Dual, a hybrid system that synergizes the strengths of DriveVLM with +the traditional autonomous driving pipeline. DriveVLM-Dual achieves robust +spatial understanding and real-time inference speed. Extensive experiments on +both the nuScenes dataset and our SUP-AD dataset demonstrate the effectiveness +of DriveVLM and the enhanced performance of DriveVLM-Dual, surpassing existing +methods in complex and unpredictable driving conditions. + +
+
+ comment: Project Page: https://tsinghua-mars-lab.github.io/DriveVLM/ +
+
+
+
+
+ + ♻ ☆ A New Benchmark and Model for Challenging Image Manipulation Detection AAAI-24 + + +
+ The ability to detect manipulation in multimedia data is vital in digital forensics. Existing Image Manipulation Detection (IMD) methods are mainly based on detecting anomalous features arising from image editing or double compression artifacts. All existing IMD techniques encounter challenges when it comes to detecting small tampered regions in a large image. Moreover, compression-based IMD approaches face difficulties in cases of double compression with identical quality factors. To investigate the State-of-The-Art (SoTA) IMD methods under those challenging conditions, we introduce a new Challenging Image Manipulation Detection (CIMD) benchmark dataset, which consists of two subsets for evaluating editing-based and compression-based IMD methods, respectively. The dataset images were manually captured and tampered with, and are provided with high-quality annotations. In addition, we propose a new two-branch network model based on HRNet that can better detect both the image-editing and compression artifacts under those challenging conditions. Extensive experiments on the CIMD benchmark show that our model significantly outperforms SoTA IMD methods on CIMD.
+
+ comment: 9 pages, 6 figures, 3 tables. AAAI-24
+
+
+
+
+ + ♻ ☆ Language-driven Object Fusion into Neural Radiance Fields with + Pose-Conditioned Dataset Updates CVPR 2024 + + +
+ Neural radiance field is an emerging rendering method that generates +high-quality multi-view consistent images from a neural scene representation +and volume rendering. Although neural radiance field-based techniques are +robust for scene reconstruction, their ability to add or remove objects remains +limited. This paper proposes a new language-driven approach for object +manipulation with neural radiance fields through dataset updates. Specifically, +to insert a new foreground object represented by a set of multi-view images +into a background radiance field, we use a text-to-image diffusion model to +learn and generate combined images that fuse the object of interest into the +given background across views. These combined images are then used for refining +the background radiance field so that we can render view-consistent images +containing both the object and the background. To ensure view consistency, we +propose a dataset updates strategy that prioritizes radiance field training +with camera views close to the already-trained views prior to propagating the +training to remaining views. We show that under the same dataset updates +strategy, we can easily adapt our method for object insertion using data from +text-to-3D models as well as object removal. Experimental results show that our +method generates photorealistic images of the edited scenes, and outperforms +state-of-the-art methods in 3D reconstruction and neural radiance field +blending. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Modular Blind Video Quality Assessment CVPR 2024 + + +
+ Blind video quality assessment (BVQA) plays a pivotal role in evaluating and +improving the viewing experience of end-users across a wide range of +video-based platforms and services. Contemporary deep learning-based models +primarily analyze video content in its aggressively subsampled format, while +being blind to the impact of the actual spatial resolution and frame rate on +video quality. In this paper, we propose a modular BVQA model and a method of +training it to improve its modularity. Our model comprises a base quality +predictor, a spatial rectifier, and a temporal rectifier, responding to the +visual content and distortion, spatial resolution, and frame rate changes on +video quality, respectively. During training, spatial and temporal rectifiers +are dropped out with some probabilities to render the base quality predictor a +standalone BVQA model, which should work better with the rectifiers. Extensive +experiments on both professionally-generated content and user-generated content +video databases show that our quality model achieves superior or comparable +performance to current methods. Additionally, the modularity of our model +offers an opportunity to analyze existing video quality databases in terms of +their spatial and temporal complexity. + +
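The modular design can be illustrated with a loose sketch: a base quality predictor plus spatial and temporal rectifiers, each dropped at random during training so that the base predictor also works on its own. The tiny networks and the additive combination rule below are assumptions for illustration, not the paper's architecture.

```python
import torch
import torch.nn as nn

class ModularBVQA(nn.Module):
    def __init__(self, feat_dim=128, p_drop=0.5):
        super().__init__()
        self.base = nn.Linear(feat_dim, 1)       # base quality from subsampled content
        self.spatial = nn.Linear(2, 1)           # correction from (height, width)
        self.temporal = nn.Linear(1, 1)          # correction from frame rate
        self.p_drop = p_drop

    def forward(self, content_feat, resolution, fps):
        q = self.base(content_feat)
        # Randomly drop each rectifier during training so the base predictor
        # remains a usable standalone quality model.
        if not self.training or torch.rand(()) > self.p_drop:
            q = q + self.spatial(resolution)
        if not self.training or torch.rand(()) > self.p_drop:
            q = q + self.temporal(fps)
        return q.squeeze(-1)

model = ModularBVQA().train()
scores = model(torch.randn(4, 128),
               torch.tensor([[1080.0, 1920.0]] * 4),
               torch.full((4, 1), 30.0))
print(scores.shape)                              # torch.Size([4])
```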
+
+ comment: Accepted by CVPR 2024; Camera-ready version +
+
+
+
+
+ + ♻ ☆ Multi-Channel Orthogonal Transform-Based Perceptron Layers for Efficient + ResNets + + +
+ In this paper, we propose a set of transform-based neural network layers as +an alternative to the $3\times3$ Conv2D layers in Convolutional Neural Networks +(CNNs). The proposed layers can be implemented based on orthogonal transforms +such as the Discrete Cosine Transform (DCT), Hadamard transform (HT), and +biorthogonal Block Wavelet Transform (BWT). Furthermore, by taking advantage of +the convolution theorems, convolutional filtering operations are performed in +the transform domain using element-wise multiplications. Trainable +soft-thresholding layers, that remove noise in the transform domain, bring +nonlinearity to the transform domain layers. Compared to the Conv2D layer, +which is spatial-agnostic and channel-specific, the proposed layers are +location-specific and channel-specific. Moreover, these proposed layers reduce +the number of parameters and multiplications significantly while improving the +accuracy results of regular ResNets on the ImageNet-1K classification task. +Furthermore, they can be inserted with a batch normalization layer before the +global average pooling layer in the conventional ResNets as an additional layer +to improve classification accuracy. + +
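A hedged sketch of a transform-domain layer in this spirit: an orthonormal Hadamard transform, element-wise trainable scaling (filtering via the convolution theorem), trainable soft-thresholding for nonlinearity, and the inverse transform. Applying the transform along the channel axis is a simplifying assumption for the example, not the paper's exact layer layout.

```python
import torch
import torch.nn as nn

def hadamard(n):
    """Orthonormal Hadamard matrix via the Sylvester construction; n must be a power of two."""
    h = torch.ones(1, 1)
    while h.shape[0] < n:
        h = torch.cat([torch.cat([h, h], dim=1), torch.cat([h, -h], dim=1)], dim=0)
    return h / n ** 0.5                                   # orthonormal and symmetric

class TransformDomainLayer(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.register_buffer("H", hadamard(channels))
        self.scale = nn.Parameter(torch.ones(channels))   # element-wise transform-domain filter
        self.thresh = nn.Parameter(torch.zeros(channels)) # trainable soft-threshold levels

    def forward(self, x):                                 # x: [B, C, H, W]
        z = torch.einsum("ij,bjhw->bihw", self.H, x)      # forward transform
        z = z * self.scale.view(1, -1, 1, 1)              # filtering as element-wise multiplication
        t = self.thresh.abs().view(1, -1, 1, 1)
        z = torch.sign(z) * torch.clamp(z.abs() - t, min=0.0)  # soft-thresholding nonlinearity
        return torch.einsum("ij,bjhw->bihw", self.H, z)   # inverse transform (H is its own inverse)

layer = TransformDomainLayer(64)
print(layer(torch.randn(2, 64, 16, 16)).shape)            # torch.Size([2, 64, 16, 16])
```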
+
+ comment: This work is accepted to IEEE Transactions on Neural Networks and + Learning Systems. The initial title is "Orthogonal Transform Domain + Approaches for the Convolutional Layer". We changed it to "Multi-Channel + Orthogonal Transform-Based Perceptron Layers for Efficient ResNets" based on + reviewer's comment. arXiv admin note: text overlap with arXiv:2211.08577 +
+
+
+
+
+ + ♻ ☆ C-TPT: Calibrated Test-Time Prompt Tuning for Vision-Language Models via + Text Feature Dispersion ICLR 2024 + + +
+ In deep learning, test-time adaptation has gained attention as a method for +model fine-tuning without the need for labeled data. A prime exemplification is +the recently proposed test-time prompt tuning for large-scale vision-language +models such as CLIP. Unfortunately, these prompts have been mainly developed to +improve accuracy, overlooking the importance of calibration, which is a crucial +aspect for quantifying prediction uncertainty. However, traditional calibration +methods rely on substantial amounts of labeled data, making them impractical +for test-time scenarios. To this end, this paper explores calibration during +test-time prompt tuning by leveraging the inherent properties of CLIP. Through +a series of observations, we find that the prompt choice significantly affects +the calibration in CLIP, where the prompts leading to higher text feature +dispersion result in better-calibrated predictions. Introducing the Average +Text Feature Dispersion (ATFD), we establish its relationship with calibration +error and present a novel method, Calibrated Test-time Prompt Tuning (C-TPT), +for optimizing prompts during test-time with enhanced calibration. Through +extensive experiments on different CLIP architectures and datasets, we show +that C-TPT can effectively improve the calibration of test-time prompt tuning +without needing labeled data. The code is publicly accessible at +https://github.com/hee-suk-yoon/C-TPT. + +
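The dispersion statistic is easy to picture: embed each class name with a candidate prompt and measure how spread out the resulting text features are around their centroid. The sketch below captures that intuition; the exact ATFD definition in the paper may differ in its details.

```python
import torch
import torch.nn.functional as F

def text_feature_dispersion(text_features):
    """Mean distance of per-class text features from their centroid.

    text_features: [num_classes, dim], e.g. L2-normalized CLIP text embeddings
    obtained from one prompt applied to every class name.
    """
    centroid = text_features.mean(dim=0, keepdim=True)
    return (text_features - centroid).norm(dim=-1).mean()

# Toy comparison of two prompts (random stand-ins for CLIP text encodings).
prompt_a = F.normalize(torch.randn(100, 512), dim=-1)              # well spread out
prompt_b = F.normalize(torch.randn(100, 512) * 0.1 + 1.0, dim=-1)  # tightly clustered
print(text_feature_dispersion(prompt_a), text_feature_dispersion(prompt_b))
```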
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Break-for-Make: Modular Low-Rank Adaptations for Composable + Content-Style Customization + + +
+ Personalized generation paradigms empower designers to customize visual +intellectual properties with the help of textual descriptions by tuning or +adapting pre-trained text-to-image models on a few images. Recent works explore +approaches for concurrently customizing both content and detailed visual style +appearance. However, these existing approaches often generate images where the +content and style are entangled. In this study, we reconsider the customization +of content and style concepts from the perspective of parameter space +construction. Unlike existing methods that utilize a shared parameter space for +content and style, we propose a learning framework that separates the parameter +space to facilitate individual learning of content and style, thereby enabling +disentangled content and style. To achieve this goal, we introduce "partly +learnable projection" (PLP) matrices to separate the original adapters into +divided sub-parameter spaces. We propose "break-for-make" customization +learning pipeline based on PLP, which is simple yet effective. We break the +original adapters into "up projection" and "down projection", train content and +style PLPs individually with the guidance of corresponding textual prompts in +the separate adapters, and maintain generalization by employing a +multi-correspondence projection learning strategy. Based on the adapters broken +apart for separate training content and style, we then make the entity +parameter space by reconstructing the content and style PLPs matrices, followed +by fine-tuning the combined adapter to generate the target object with the +desired appearance. Experiments on various styles, including textures, +materials, and artistic style, show that our method outperforms +state-of-the-art single/multiple concept learning pipelines in terms of +content-style-prompt alignment. + +
+
+
+
+
+ + ♻ ☆ Sketch Input Method Editor: A Comprehensive Dataset and Methodology for + Systematic Input Recognition + + +
+ With the recent surge in the use of touchscreen devices, free-hand sketching +has emerged as a promising modality for human-computer interaction. While +previous research has focused on tasks such as recognition, retrieval, and +generation of familiar everyday objects, this study aims to create a Sketch +Input Method Editor (SketchIME) specifically designed for a professional C4I +system. Within this system, sketches are utilized as low-fidelity prototypes +for recommending standardized symbols in the creation of comprehensive +situation maps. This paper also presents a systematic dataset comprising 374 +specialized sketch types, and proposes a simultaneous recognition and +segmentation architecture with multilevel supervision between recognition and +segmentation to improve performance and enhance interpretability. By +incorporating few-shot domain adaptation and class-incremental learning, the +network's ability to adapt to new users and extend to new task-specific classes +is significantly enhanced. Results from experiments conducted on both the +proposed dataset and the SPG dataset illustrate the superior performance of the +proposed architecture. Our dataset and code are publicly available at +https://github.com/GuangmingZhu/SketchIME. + +
+
+ comment: The paper has been accepted by ACM Multimedia 2023 +
+
+
+
+
+ + ♻ ☆ High-Fidelity Lake Extraction via Two-Stage Prompt Enhancement: + Establishing a Novel Baseline and Benchmark ICME 2024 + + +
+ Lake extraction from remote sensing imagery is a complex challenge due to the +varied lake shapes and data noise. Current methods rely on multispectral image +datasets, making it challenging to learn lake features accurately from pixel +arrangements. This, in turn, affects model learning and the creation of +accurate segmentation masks. This paper introduces a prompt-based dataset +construction approach that provides approximate lake locations using point, +box, and mask prompts. We also propose a two-stage prompt enhancement +framework, LEPrompter, with prompt-based and prompt-free stages during +training. The prompt-based stage employs a prompt encoder to extract prior +information, integrating prompt tokens and image embedding through self- and +cross-attention in the prompt decoder. Prompts are deactivated to ensure +independence during inference, enabling automated lake extraction without +introducing additional parameters and GFlops. Extensive experiments showcase +performance improvements of our proposed approach compared to the previous +state-of-the-art method. The source code is available at +https://github.com/BastianChen/LEPrompter. + +
+
+ comment: Accepted by ICME 2024 +
+
+
+
+
+ + ♻ ☆ Deep Neural Networks Fused with Textures for Image Classification + + +
+ Fine-grained image classification (FGIC) is a challenging task in computer vision due to small visual differences among sub-categories but large intra-class variations. Deep learning methods have achieved remarkable success in solving FGIC. In this paper, we propose a fusion approach to address FGIC by combining global texture with local patch-based information. The first pipeline extracts deep features from various fixed-size non-overlapping patches and encodes features by sequential modelling using the long short-term memory (LSTM). Another path computes image-level textures at multiple scales using the local binary patterns (LBP). The advantages of both streams are integrated to represent an efficient feature vector for image classification. The method is tested on eight datasets covering human faces, skin lesions, food dishes, marine life, etc., using four standard backbone CNNs. Our method attains better classification accuracy than existing methods by notable margins.
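The texture stream can be illustrated with a small sketch that turns multi-scale local binary patterns into histogram features, which could then be concatenated with CNN patch features. The radii and bin counts below are arbitrary choices, not the paper's settings.

```python
import numpy as np
from skimage.feature import local_binary_pattern

def lbp_histogram(gray, radius):
    """Uniform-LBP histogram at one scale; gray is a 2D integer image."""
    n_points = 8 * radius
    codes = local_binary_pattern(gray, n_points, radius, method="uniform")
    hist, _ = np.histogram(codes, bins=n_points + 2,
                           range=(0, n_points + 2), density=True)
    return hist

gray = (np.random.rand(224, 224) * 255).astype(np.uint8)   # stand-in grayscale image
texture_feature = np.concatenate([lbp_histogram(gray, r) for r in (1, 2, 3)])
print(texture_feature.shape)                                # (54,) = 10 + 18 + 26 bins
```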
+
+ comment: 14 pages, 6 figures, 4 tables, conference +
+
+
+
+
+ + ♻ ☆ Object-level Geometric Structure Preserving for Natural Image Stitching + + +
+ The topic of stitching images with globally natural structures holds paramount significance. Current methodologies exhibit the ability to preserve local geometric structures, yet fall short in maintaining relationships between these geometric structures. In this paper, we endeavor to safeguard the overall, OBJect-level structures within images based on a Global Similarity Prior, while concurrently mitigating distortion and ghosting artifacts with OBJ-GSP. Our approach leverages the Segment Anything Model to extract geometric structures with semantic information, enhancing the algorithm's ability to preserve objects in a manner that aligns more intuitively with human perception. We seek to identify spatial constraints that govern the relationships between various geometric boundaries. Recognizing that multiple geometric boundaries collectively define complete objects, we employ triangular meshes to safeguard not only individual geometric structures but also the overall shapes of objects within the images. Empirical evaluations across multiple image stitching datasets demonstrate that our method establishes a new state-of-the-art benchmark in image stitching. Our implementation and dataset are publicly available at https://github.com/RussRobin/OBJ-GSP .
+
+
+
+
+ + ♻ ☆ Self-Adaptive Sampling for Efficient Video Question-Answering on + Image--Text Models NAACL 2024 + + +
+ Video question-answering is a fundamental task in the field of video understanding. Although current vision--language models (VLMs) equipped with Video Transformers have enabled temporal modeling and yielded superior results, they come at the cost of huge computational power and are thus too expensive to deploy in real-time application scenarios. An economical workaround samples only a small portion of frames to represent the main content of a video and tunes an image--text model on these sampled frames. Recent video understanding models usually randomly sample a set of frames or clips, regardless of the internal correlations between their visual contents or their relevance to the problem. We argue that such aimless sampling may omit the key frames from which the correct answer can be deduced, and the situation gets worse as the sampling sparsity increases, which always happens as video lengths increase. To mitigate this issue, we propose two frame sampling strategies, namely the most domain frames (MDF) and most implied frames (MIF), to maximally preserve those frames that are most likely vital to the given questions. MDF passively minimizes the risk of key frame omission in a bootstrap manner, while MIF actively searches for key frames customized for each video--question pair with the assistance of auxiliary models. The experimental results on three public datasets with three advanced VLMs (CLIP, GIT and All-in-one) demonstrate that our proposed strategies can boost the performance of image--text pretrained models. The source code pertaining to the method proposed in this paper is publicly available at https://github.com/declare-lab/sas-vqa.
+
+ comment: 13 pages, 7 figures, accepted to Findings of NAACL 2024 +
+
+
+
+
+ + ♻ ☆ HiPose: Hierarchical Binary Surface Encoding and Correspondence Pruning + for RGB-D 6DoF Object Pose Estimation CVPR 2024 + + +
+ In this work, we present a novel dense-correspondence method for 6DoF object +pose estimation from a single RGB-D image. While many existing data-driven +methods achieve impressive performance, they tend to be time-consuming due to +their reliance on rendering-based refinement approaches. To circumvent this +limitation, we present HiPose, which establishes 3D-3D correspondences in a +coarse-to-fine manner with a hierarchical binary surface encoding. Unlike +previous dense-correspondence methods, we estimate the correspondence surface +by employing point-to-surface matching and iteratively constricting the surface +until it becomes a correspondence point while gradually removing outliers. +Extensive experiments on public benchmarks LM-O, YCB-V, and T-Less demonstrate +that our method surpasses all refinement-free methods and is even on par with +expensive refinement-based approaches. Crucially, our approach is +computationally efficient and enables real-time critical applications with high +accuracy requirements. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ CECT: Controllable Ensemble CNN and Transformer for COVID-19 Image + Classification + + +
+ The COVID-19 pandemic has resulted in hundreds of millions of cases and numerous deaths worldwide. Here, we develop CECT, a novel classification network built from a controllable ensemble of a convolutional neural network and a transformer, to provide a timely and accurate COVID-19 diagnosis. CECT is composed of a parallel convolutional encoder block, an aggregate transposed-convolutional decoder block, and a windowed attention classification block. Each block captures features at different scales, from 28 $\times$ 28 to 224 $\times$ 224, from the input, composing enriched and comprehensive information. Different from existing methods, our CECT can capture features at both multi-local and global scales without any sophisticated module design. Moreover, the contribution of local features at different scales can be controlled with the proposed ensemble coefficients. We evaluate CECT on two public COVID-19 datasets and it reaches the highest accuracy of 98.1% in the intra-dataset evaluation, outperforming existing state-of-the-art methods. Moreover, the developed CECT achieves an accuracy of 90.9% on the unseen dataset in the inter-dataset evaluation, showing extraordinary generalization ability. With its remarkable feature capture and generalization abilities, we believe CECT can be extended to other medical scenarios as a powerful diagnosis tool. Code is available at https://github.com/NUS-Tim/CECT.
+
+ comment: Computers in Biology and Medicine Accepted +
+
+
+
+
+ + ♻ ☆ CAT-Seg: Cost Aggregation for Open-Vocabulary Semantic Segmentation CVPR 2024 + + +
+ Open-vocabulary semantic segmentation presents the challenge of labeling each +pixel within an image based on a wide range of text descriptions. In this work, +we introduce a novel cost-based approach to adapt vision-language foundation +models, notably CLIP, for the intricate task of semantic segmentation. Through +aggregating the cosine similarity score, i.e., the cost volume between image +and text embeddings, our method potently adapts CLIP for segmenting seen and +unseen classes by fine-tuning its encoders, addressing the challenges faced by +existing methods in handling unseen classes. Building upon this, we explore +methods to effectively aggregate the cost volume considering its multi-modal +nature of being established between image and text embeddings. Furthermore, we +examine various methods for efficiently fine-tuning CLIP. + +
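The cost volume itself is simple to sketch: cosine similarity between every pixel embedding and every class-text embedding. The feature shapes below are placeholders; in practice both sides would come from CLIP's encoders.

```python
import torch
import torch.nn.functional as F

image_feats = F.normalize(torch.randn(2, 512, 24, 24), dim=1)   # [B, D, H, W] pixel embeddings
text_feats = F.normalize(torch.randn(171, 512), dim=-1)          # [num_classes, D] class embeddings

# Cosine similarity between every pixel and every class description.
cost_volume = torch.einsum("bdhw,cd->bchw", image_feats, text_feats)
print(cost_volume.shape)                                         # torch.Size([2, 171, 24, 24])
```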
+
+ comment: Accepted to CVPR 2024. Project page: + https://ku-cvlab.github.io/CAT-Seg/ +
+
+
+
+
+ + ♻ ☆ Can Language Models Laugh at YouTube Short-form Videos? EMNLP 2023 + + +
+ As short-form funny videos on social networks are gaining popularity, it +becomes demanding for AI models to understand them for better communication +with humans. Unfortunately, previous video humor datasets target specific +domains, such as speeches or sitcoms, and mostly focus on verbal cues. We +curate a user-generated dataset of 10K multimodal funny videos from YouTube, +called ExFunTube. Using a video filtering pipeline with GPT-3.5, we verify both +verbal and visual elements contributing to humor. After filtering, we annotate +each video with timestamps and text explanations for funny moments. Our +ExFunTube is unique over existing datasets in that our videos cover a wide +range of domains with various types of humor that necessitate a multimodal +understanding of the content. Also, we develop a zero-shot video-to-text +prompting to maximize video humor understanding of large language models +(LLMs). With three different evaluation methods using automatic scores, +rationale quality experiments, and human evaluations, we show that our +prompting significantly improves LLMs' ability for humor explanation. + +
+
+ comment: EMNLP 2023; references added +
+
+
+
+
+ + ♻ ☆ SPIDeRS: Structured Polarization for Invisible Depth and Reflectance + Sensing CVPR 2024 + + +
+ Can we capture shape and reflectance in stealth? Such capability would be +valuable for many application domains in vision, xR, robotics, and HCI. We +introduce structured polarization for invisible depth and reflectance sensing +(SPIDeRS), the first depth and reflectance sensing method using patterns of +polarized light. The key idea is to modulate the angle of linear polarization +(AoLP) of projected light at each pixel. The use of polarization makes it +invisible and lets us recover not only depth but also directly surface normals +and even reflectance. We implement SPIDeRS with a liquid crystal spatial light +modulator (SLM) and a polarimetric camera. We derive a novel method for +robustly extracting the projected structured polarization pattern from the +polarimetric object appearance. We evaluate the effectiveness of SPIDeRS by +applying it to a number of real-world objects. The results show that our method +successfully reconstructs object shapes of various materials and is robust to +diffuse reflection and ambient light. We also demonstrate relighting using +recovered surface normals and reflectance. We believe SPIDeRS opens a new +avenue of polarization use in visual sensing. + +
+
+ comment: to be published in CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Neural Parametric Gaussians for Monocular Non-Rigid Object + Reconstruction CVPR 2024 + + +
+ Reconstructing dynamic objects from monocular videos is a severely +underconstrained and challenging problem, and recent work has approached it in +various directions. However, owing to the ill-posed nature of this problem, +there has been no solution that can provide consistent, high-quality novel +views from camera positions that are significantly different from the training +views. In this work, we introduce Neural Parametric Gaussians (NPGs) to take on +this challenge by imposing a two-stage approach: first, we fit a low-rank +neural deformation model, which then is used as regularization for non-rigid +reconstruction in the second stage. The first stage learns the object's +deformations such that it preserves consistency in novel views. The second +stage obtains high reconstruction quality by optimizing 3D Gaussians that are +driven by the coarse model. To this end, we introduce a local 3D Gaussian +representation, where temporally shared Gaussians are anchored in and deformed +by local oriented volumes. The resulting combined model can be rendered as +radiance fields, resulting in high-quality photo-realistic reconstructions of +the non-rigidly deforming objects. We demonstrate that NPGs achieve superior +results compared to previous works, especially in challenging scenarios with +few multi-view cues. + +
+
+ comment: Accepted at CVPR 2024 | Project Website: + https://geometric-rl.mpi-inf.mpg.de/npg +
+
+
+
+
+ + ♻ ☆ Deep Convolutional Framelet Denoising for Panoramic by Mixed Wavelet + Integration + + +
+ Enhancing quality and removing noise during preprocessing is one of the most critical steps in image processing. X-ray images are created by photons colliding with atoms and the variation in scattered noise absorption. This noise leads to a deterioration in the image's medical quality and, at times, results in repeated acquisitions, thereby increasing the patient's effective dose. One of the most critical challenges in this area has consistently been lowering the image noise. Techniques such as BM3D, low-pass filters, and autoencoders have been applied to this problem. Owing to their structural design and high rate of repetition, neural networks employing diverse architectures have, over the past decade, achieved noise reduction with satisfactory outcomes, surpassing traditional BM3D and low-pass filters. The combination of the Hankel matrix with neural networks represents one of these configurations. The Hankel matrix aims to identify a local circle by separating individual values into local and non-local components, utilizing a non-local matrix. A non-local matrix can be created using the wavelet or DCT. This paper suggests integrating the wavelet transform with the Daubechies (D4) wavelet, due to its higher energy concentration, and employs the U-Net neural network architecture, which incorporates the wavelet exclusively at each stage. The outcomes were evaluated using the PSNR and SSIM criteria and were verified using various wavelets. The effectiveness of a one-wavelet network increased from 0.5% to 1.2%, according to studies conducted on other datasets.
+
+
+
+
+ + ♻ ☆ OCTDL: Optical Coherence Tomography Dataset for Image-Based Deep + Learning Methods + + +
+ Optical coherence tomography (OCT) is a non-invasive imaging technique with +extensive clinical applications in ophthalmology. OCT enables the visualization +of the retinal layers, playing a vital role in the early detection and +monitoring of retinal diseases. OCT uses the principle of light wave +interference to create detailed images of the retinal microstructures, making +it a valuable tool for diagnosing ocular conditions. This work presents an +open-access OCT dataset (OCTDL) comprising over 2000 OCT images labeled +according to disease group and retinal pathology. The dataset consists of OCT +records of patients with Age-related Macular Degeneration (AMD), Diabetic +Macular Edema (DME), Epiretinal Membrane (ERM), Retinal Artery Occlusion (RAO), +Retinal Vein Occlusion (RVO), and Vitreomacular Interface Disease (VID). The +images were acquired with an Optovue Avanti RTVue XR using raster scanning +protocols with dynamic scan length and image resolution. Each retinal b-scan +was acquired by centering on the fovea and interpreted and cataloged by an +experienced retinal specialist. In this work, we applied Deep Learning +classification techniques to this new open-access dataset. + +
+
+
+
+
+ + ♻ ☆ Adaptive Surface Normal Constraint for Geometric Estimation from + Monocular Images + + +
+ We introduce a novel approach to learn geometries such as depth and surface +normal from images while incorporating geometric context. The difficulty of +reliably capturing geometric context in existing methods impedes their ability +to accurately enforce the consistency between the different geometric +properties, thereby leading to a bottleneck of geometric estimation quality. We +therefore propose the Adaptive Surface Normal (ASN) constraint, a simple yet +efficient method. Our approach extracts geometric context that encodes the +geometric variations present in the input image and correlates depth estimation +with geometric constraints. By dynamically determining reliable local geometry +from randomly sampled candidates, we establish a surface normal constraint, +where the validity of these candidates is evaluated using the geometric +context. Furthermore, our normal estimation leverages the geometric context to +prioritize regions that exhibit significant geometric variations, which makes +the predicted normals accurately capture intricate and detailed geometric +information. Through the integration of geometric context, our method unifies +depth and surface normal estimations within a cohesive framework, which enables +the generation of high-quality 3D geometry from images. We validate the +superiority of our approach over state-of-the-art methods through extensive +evaluations and comparisons on diverse indoor and outdoor datasets, showcasing +its efficiency and robustness. + +
+
+ comment: Accepted by TPAMI. arXiv admin note: substantial text overlap with + arXiv:2103.15483 +
+
+
+
+
+ + ♻ ☆ HAVE-FUN: Human Avatar Reconstruction from Few-Shot Unconstrained Images + + +
+ As for human avatar reconstruction, contemporary techniques commonly +necessitate the acquisition of costly data and struggle to achieve satisfactory +results from a small number of casual images. In this paper, we investigate +this task from a few-shot unconstrained photo album. The reconstruction of +human avatars from such data sources is challenging because of limited data +amount and dynamic articulated poses. For handling dynamic data, we integrate a +skinning mechanism with deep marching tetrahedra (DMTet) to form a drivable +tetrahedral representation, which drives arbitrary mesh topologies generated by +the DMTet for the adaptation of unconstrained images. To effectively mine +instructive information from few-shot data, we devise a two-phase optimization +method with few-shot reference and few-shot guidance. The former focuses on +aligning avatar identity with reference images, while the latter aims to +generate plausible appearances for unseen regions. Overall, our framework, +called HaveFun, can undertake avatar reconstruction, rendering, and animation. +Extensive experiments on our developed benchmarks demonstrate that HaveFun +exhibits substantially superior performance in reconstructing the human body +and hand. Project website: https://seanchenxy.github.io/HaveFunWeb/. + +
+
+
+
+
+ + ♻ ☆ DiffBIR: Towards Blind Image Restoration with Generative Diffusion Prior + + +
+ We present DiffBIR, a general restoration pipeline that could handle +different blind image restoration tasks in a unified framework. DiffBIR +decouples blind image restoration problem into two stages: 1) degradation +removal: removing image-independent content; 2) information regeneration: +generating the lost image content. Each stage is developed independently but +they work seamlessly in a cascaded manner. In the first stage, we use +restoration modules to remove degradations and obtain high-fidelity restored +results. For the second stage, we propose IRControlNet that leverages the +generative ability of latent diffusion models to generate realistic details. +Specifically, IRControlNet is trained based on specially produced condition +images without distracting noisy content for stable generation performance. +Moreover, we design a region-adaptive restoration guidance that can modify the +denoising process during inference without model re-training, allowing users to +balance realness and fidelity through a tunable guidance scale. Extensive +experiments have demonstrated DiffBIR's superiority over state-of-the-art +approaches for blind image super-resolution, blind face restoration and blind +image denoising tasks on both synthetic and real-world datasets. The code is +available at https://github.com/XPixelGroup/DiffBIR. + +
+
+
+
+
+ + ♻ ☆ Prompt Tuning with Soft Context Sharing for Vision-Language Models + + +
+ Vision-language models have recently shown great potential on many tasks in computer vision. Meanwhile, prior work demonstrates that prompt tuning designed for vision-language models can achieve superior performance on few-shot image recognition compared to linear probing, a strong baseline. In practice, many few-shot tasks are inherently correlated, particularly within specialized domains. However, such information has been overlooked previously. Inspired by the fact that modeling task relationships via multi-task learning can usually boost performance, we propose a novel method, SoftCPT (Soft Context Sharing for Prompt Tuning), to tune pre-trained vision-language models on multiple target few-shot tasks jointly. Specifically, we design a task-shared meta network to generate prompt context for each task using the task name together with a learnable task context as input. The parameters of this meta network as well as the task context are tuned on the joint training set of all tasks. As such, the prompt context of all tasks will be shared in a soft manner. Extensive experiments across four multi-task few-shot datasets covering 44 tasks and 1593 categories demonstrate that SoftCPT significantly outperforms single-task prompt tuning methods, highlighting the effectiveness of multi-task learning for vision-language prompt tuning. Code is available at https://github.com/kding1225/softcpt.
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ Animatable Gaussians: Learning Pose-dependent Gaussian Maps for + High-fidelity Human Avatar Modeling CVPR 2024 + + +
+ Modeling animatable human avatars from RGB videos is a long-standing and +challenging problem. Recent works usually adopt MLP-based neural radiance +fields (NeRF) to represent 3D humans, but it remains difficult for pure MLPs to +regress pose-dependent garment details. To this end, we introduce Animatable +Gaussians, a new avatar representation that leverages powerful 2D CNNs and 3D +Gaussian splatting to create high-fidelity avatars. To associate 3D Gaussians +with the animatable avatar, we learn a parametric template from the input +videos, and then parameterize the template on two front \& back canonical +Gaussian maps where each pixel represents a 3D Gaussian. The learned template +is adaptive to the wearing garments for modeling looser clothes like dresses. +Such template-guided 2D parameterization enables us to employ a powerful +StyleGAN-based CNN to learn the pose-dependent Gaussian maps for modeling +detailed dynamic appearances. Furthermore, we introduce a pose projection +strategy for better generalization given novel poses. Overall, our method can +create lifelike avatars with dynamic, realistic and generalized appearances. +Experiments show that our method outperforms other state-of-the-art approaches. +Code: https://github.com/lizhe00/AnimatableGaussians + +
+
+ comment: Accepted by CVPR 2024, Projectpage: + https://animatable-gaussians.github.io/, Code: + https://github.com/lizhe00/AnimatableGaussians +
+
+
+
+
+ + ♻ ☆ G-PECNet: Towards a Generalizable Pedestrian Trajectory Prediction + System ICLR + + +
+ Navigating dynamic physical environments without obstructing or damaging +human assets is of quintessential importance for social robots. In this work, +we solve autonomous drone navigation's sub-problem of predicting out-of-domain +human and agent trajectories using a deep generative model. Our method: +General-PECNet or G-PECNet observes an improvement of 9.5\% on the Final +Displacement Error (FDE) on 2020's benchmark: PECNet through a combination of +architectural improvements inspired by periodic activation functions and +synthetic trajectory (data) augmentations using Hidden Markov Models (HMMs) and +Reinforcement Learning (RL). Additionally, we propose a simple +geometry-inspired metric for trajectory non-linearity and outlier detection, +helpful for the task. Code available at +https://github.com/Aryan-Garg/PECNet-Pedestrian-Trajectory-Prediction.git + +
+
+ comment: Notable ICLR Tiny Paper 2024 +
+
+
+
+
+ + ♻ ☆ LLMs as Bridges: Reformulating Grounded Multimodal Named Entity + Recognition + + +
+ Grounded Multimodal Named Entity Recognition (GMNER) is a nascent multimodal +task that aims to identify named entities, entity types and their corresponding +visual regions. GMNER task exhibits two challenging properties: 1) The weak +correlation between image-text pairs in social media results in a significant +portion of named entities being ungroundable. 2) There exists a distinction +between coarse-grained referring expressions commonly used in similar tasks +(e.g., phrase localization, referring expression comprehension) and +fine-grained named entities. In this paper, we propose RiVEG, a unified +framework that reformulates GMNER into a joint MNER-VE-VG task by leveraging +large language models (LLMs) as a connecting bridge. This reformulation brings +two benefits: 1) It maintains the optimal MNER performance and eliminates the +need for employing object detection methods to pre-extract regional features, +thereby naturally addressing two major limitations of existing GMNER methods. +2) The introduction of entity expansion expression and Visual Entailment (VE) +Module unifies Visual Grounding (VG) and Entity Grounding (EG). It enables +RiVEG to effortlessly inherit the Visual Entailment and Visual Grounding +capabilities of any current or prospective multimodal pretraining models. +Extensive experiments demonstrate that RiVEG outperforms state-of-the-art +methods on the existing GMNER dataset and achieves absolute leads of 10.65%, +6.21%, and 8.83% in all three subtasks. + +
+
+
+
+
+ + ♻ ☆ Decomposing Disease Descriptions for Enhanced Pathology Detection: A + Multi-Aspect Vision-Language Pre-training Framework CVPR2024 + + +
+ Medical vision language pre-training (VLP) has emerged as a frontier of +research, enabling zero-shot pathological recognition by comparing the query +image with the textual descriptions for each disease. Due to the complex +semantics of biomedical texts, current methods struggle to align medical images +with key pathological findings in unstructured reports. This leads to the +misalignment with the target disease's textual representation. In this paper, +we introduce a novel VLP framework designed to dissect disease descriptions +into their fundamental aspects, leveraging prior knowledge about the visual +manifestations of pathologies. This is achieved by consulting a large language +model and medical experts. Integrating a Transformer module, our approach +aligns an input image with the diverse elements of a disease, generating +aspect-centric image representations. By consolidating the matches from each +aspect, we improve the compatibility between an image and its associated +disease. Additionally, capitalizing on the aspect-oriented representations, we +present a dual-head Transformer tailored to process known and unknown diseases, +optimizing the comprehensive detection efficacy. Conducting experiments on +seven downstream datasets, ours improves the accuracy of recent methods by up +to 8.56% and 17.26% for seen and unseen categories, respectively. Our code is +released at https://github.com/HieuPhan33/MAVL. + +
+
+ comment: Accepted at CVPR2024. Pre-print before final camera-ready version +
+
+
+
+
+ + ♻ ☆ SchurVINS: Schur Complement-Based Lightweight Visual Inertial Navigation + System + + +
+ Accuracy and computational efficiency are the most important metrics for a Visual Inertial Navigation System (VINS). Existing VINS algorithms offer either high accuracy or low computational complexity, but struggle to provide high-precision localization on resource-constrained devices. To this end, we propose a novel filter-based VINS framework named SchurVINS, which guarantees both high accuracy, by building a complete residual model, and low computational complexity, via the Schur complement. Technically, we first formulate the full residual model, in which the Gradient, Hessian and observation covariance are explicitly modeled. Then the Schur complement is employed to decompose the full model into an ego-motion residual model and a landmark residual model. Finally, an Extended Kalman Filter (EKF) update is implemented on these two models with high efficiency. Experiments on the EuRoC and TUM-VI datasets show that our method notably outperforms state-of-the-art (SOTA) methods in both accuracy and computational complexity. The experimental code of SchurVINS is available at https://github.com/bytedance/SchurVINS.
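For readers unfamiliar with the Schur complement trick that such filters build on, the generic sketch below marginalizes landmark states out of a Gauss-Newton Hessian so that only the small ego-motion block needs to be solved; the block sizes are arbitrary and this is not the SchurVINS code.

```python
import numpy as np

rng = np.random.default_rng(0)
n_pose, n_lm = 6, 60                                  # ego-motion vs. landmark dimensions
J = rng.standard_normal((200, n_pose + n_lm))
H = J.T @ J + 1e-3 * np.eye(n_pose + n_lm)            # full Gauss-Newton Hessian
b = rng.standard_normal(n_pose + n_lm)

Hpp, Hpl = H[:n_pose, :n_pose], H[:n_pose, n_pose:]
Hlp, Hll = H[n_pose:, :n_pose], H[n_pose:, n_pose:]
bp, bl = b[:n_pose], b[n_pose:]

Hll_inv = np.linalg.inv(Hll)                          # cheap in practice: often block-diagonal per landmark
H_reduced = Hpp - Hpl @ Hll_inv @ Hlp                 # Schur complement of the landmark block
b_reduced = bp - Hpl @ Hll_inv @ bl
dx_pose = np.linalg.solve(H_reduced, b_reduced)

dx_full = np.linalg.solve(H, b)                       # sanity check against the full solve
print(np.allclose(dx_pose, dx_full[:n_pose]))         # True
```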
+
+
+
+
+ + ♻ ☆ Segment Anything Model for Road Network Graph Extraction + + +
+ We propose SAM-Road, an adaptation of the Segment Anything Model (SAM) for +extracting large-scale, vectorized road network graphs from satellite imagery. +To predict graph geometry, we formulate it as a dense semantic segmentation +task, leveraging the inherent strengths of SAM. The image encoder of SAM is +fine-tuned to produce probability masks for roads and intersections, from which +the graph vertices are extracted via simple non-maximum suppression. To predict +graph topology, we designed a lightweight transformer-based graph neural +network, which leverages the SAM image embeddings to estimate the edge +existence probabilities between vertices. Our approach directly predicts the +graph vertices and edges for large regions without expensive and complex +post-processing heuristics, and is capable of building complete road network +graphs spanning multiple square kilometers in a matter of seconds. With its +simple, straightforward, and minimalist design, SAM-Road achieves comparable +accuracy with the state-of-the-art method RNGDet++, while being 40 times faster +on the City-scale dataset. We thus demonstrate the power of a foundational +vision model when applied to a graph learning task. The code is available at +https://github.com/htcr/sam_road. + +
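Vertex extraction from a probability mask can be sketched with simple non-maximum suppression: keep pixels that are local maxima above a threshold. The window size and threshold below are arbitrary, and this is not the SAM-Road implementation.

```python
import torch
import torch.nn.functional as F

def extract_vertices(prob, thresh=0.5, window=7):
    """prob: [H, W] probability mask; returns (x, y) pixel coordinates of peaks."""
    pooled = F.max_pool2d(prob[None, None], window, stride=1, padding=window // 2)[0, 0]
    keep = (prob == pooled) & (prob > thresh)          # local maxima above the threshold
    ys, xs = torch.nonzero(keep, as_tuple=True)
    return torch.stack([xs, ys], dim=1)

mask = torch.rand(256, 256)                            # stand-in intersection probability map
print(extract_vertices(mask, thresh=0.95).shape)
```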
+
+
+
+
+ + ♻ ☆ RCooper: A Real-world Large-scale Dataset for Roadside Cooperative + Perception CVPR2024 + + +
+ The value of roadside perception, which could extend the boundaries of autonomous driving and traffic management, has gradually become more prominent and acknowledged in recent years. However, existing roadside perception approaches only focus on the single-infrastructure sensor system, which cannot realize a comprehensive understanding of a traffic area because of the limited sensing range and blind spots. Toward high-quality roadside perception, we need Roadside Cooperative Perception (RCooper) to achieve practical area-coverage roadside perception for restricted traffic areas. RCooper has its own domain-specific challenges, but further exploration is hindered by the lack of datasets. We hence release the first real-world, large-scale RCooper dataset to foster research on practical roadside cooperative perception, including detection and tracking. The manually annotated dataset comprises 50k images and 30k point clouds, including two representative traffic scenes (i.e., intersection and corridor). The constructed benchmarks prove the effectiveness of roadside cooperative perception and demonstrate the direction of further research. Codes and dataset can be accessed at: https://github.com/AIR-THU/DAIR-RCooper.
+
+ comment: Accepted by CVPR2024. 10 pages with 6 figures +
+
+
+
+
+ + ♻ ☆ 3D Reconstruction of Interacting Multi-Person in Clothing from a Single + Image WACV 2024 + + +
+ This paper introduces a novel pipeline to reconstruct the geometry of +interacting multi-person in clothing on a globally coherent scene space from a +single image. The main challenge arises from the occlusion: a part of a human +body is not visible from a single view due to the occlusion by others or the +self, which introduces missing geometry and physical implausibility (e.g., +penetration). We overcome this challenge by utilizing two human priors for +complete 3D geometry and surface contacts. For the geometry prior, an encoder +learns to regress the image of a person with missing body parts to the latent +vectors; a decoder decodes these vectors to produce 3D features of the +associated geometry; and an implicit network combines these features with a +surface normal map to reconstruct a complete and detailed 3D humans. For the +contact prior, we develop an image-space contact detector that outputs a +probability distribution of surface contacts between people in 3D. We use these +priors to globally refine the body poses, enabling the penetration-free and +accurate reconstruction of interacting multi-person in clothing on the scene +space. The results demonstrate that our method is complete, globally coherent, +and physically plausible compared to existing methods. + +
+
+ comment: Accepted to WACV 2024 +
+
+
+
+
+ + ♻ ☆ LangSplat: 3D Language Gaussian Splatting CVPR 2024 + + +
+ Humans live in a 3D world and commonly use natural language to interact with +a 3D scene. Modeling a 3D language field to support open-ended language queries +in 3D has gained increasing attention recently. This paper introduces +LangSplat, which constructs a 3D language field that enables precise and +efficient open-vocabulary querying within 3D spaces. Unlike existing methods +that ground CLIP language embeddings in a NeRF model, LangSplat advances the +field by utilizing a collection of 3D Gaussians, each encoding language +features distilled from CLIP, to represent the language field. By employing a +tile-based splatting technique for rendering language features, we circumvent +the costly rendering process inherent in NeRF. Instead of directly learning +CLIP embeddings, LangSplat first trains a scene-wise language autoencoder and +then learns language features on the scene-specific latent space, thereby +alleviating substantial memory demands imposed by explicit modeling. Existing +methods struggle with imprecise and vague 3D language fields, which fail to +discern clear boundaries between objects. We delve into this issue and propose +to learn hierarchical semantics using SAM, thereby eliminating the need for +extensively querying the language field across various scales and the +regularization of DINO features. Extensive experimental results show that +LangSplat significantly outperforms the previous state-of-the-art method LERF +by a large margin. Notably, LangSplat is extremely efficient, achieving a 199 +$\times$ speedup compared to LERF at the resolution of 1440 $\times$ 1080. We +strongly recommend readers to check out our video results at +https://langsplat.github.io/ + +
+
+ comment: CVPR 2024. Project Page: https://langsplat.github.io +
+
+
+
+
+ + ♻ ☆ Guided Slot Attention for Unsupervised Video Object Segmentation CVPR 2024 + + +
+ Unsupervised video object segmentation aims to segment the most prominent
+object in a video sequence. However, the existence of complex backgrounds and
+multiple foreground objects makes this task challenging. To address this issue,
+we propose a guided slot attention network to reinforce spatial structural
+information and obtain better foreground--background separation. The foreground
+and background slots, which are initialized with query guidance, are
+iteratively refined based on interactions with template information.
+Furthermore, to improve slot--template interaction and effectively fuse global
+and local features in the target and reference frames, K-nearest neighbors
+filtering and a feature aggregation transformer are introduced. The proposed
+model achieves state-of-the-art performance on two popular datasets.
+Additionally, we demonstrate the robustness of the proposed model in
+challenging scenes through various comparative experiments.
+
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ LEMON: Learning 3D Human-Object Interaction Relation from 2D Images CVPR2024 + + +
+ Learning 3D human-object interaction relation is pivotal to embodied AI and
+interaction modeling. Most existing methods approach the goal by learning to
+predict isolated interaction elements, e.g., human contact, object affordance,
+and human-object spatial relation, primarily from the perspective of either the
+human or the object. This underexploits certain correlations between the
+interaction counterparts (human and object), and struggles to address the
+uncertainty in interactions. Actually, objects' functionalities potentially
+affect humans' interaction intentions, which reveals what the interaction is.
+Meanwhile, the interacting humans and objects exhibit matching geometric
+structures, which presents how to interact. In light of this, we propose
+harnessing these inherent correlations between interaction counterparts to
+mitigate the uncertainty and jointly anticipate the above interaction elements
+in 3D space. To achieve this, we present LEMON (LEarning 3D huMan-Object
+iNteraction relation), a unified model that mines interaction intentions of the
+counterparts and employs curvatures to guide the extraction of geometric
+correlations, combining them to anticipate the interaction elements. Besides,
+the 3D Interaction Relation dataset (3DIR) is collected to serve as the test
+bed for training and evaluation. Extensive experiments demonstrate the
+superiority of LEMON over methods estimating each element in isolation.
+
+
+
+ comment: Accepted by CVPR 2024
+
+
+
+
+
+ + ♻ ☆ Improved Probabilistic Image-Text Representations ICLR 2024 + + +
+ The Image-Text Matching (ITM) task, a fundamental vision-language (VL) task,
+suffers from the inherent ambiguity arising from multiplicity and imperfect
+annotations. Deterministic functions are not sufficiently powerful to capture
+ambiguity, prompting the exploration of probabilistic embeddings to tackle the
+challenge. However, the existing probabilistic ITM approach encounters two key
+shortcomings: the burden of heavy computations due to the Monte Carlo
+approximation, and the loss saturation issue in the face of abundant false
+negatives. To overcome these issues, this paper presents improved Probabilistic
+Cross-Modal Embeddings (named PCME++) by introducing a new probabilistic
+distance with a closed-form solution. In addition, two optimization techniques
+are proposed to enhance PCME++ further: first, the incorporation of
+pseudo-positives to prevent the negative effect under massive false negatives;
+second, mixed sample data augmentation for probabilistic matching. Experimental
+results on MS-COCO Caption and two extended benchmarks, CxC and ECCV Caption,
+demonstrate the effectiveness of PCME++ compared to state-of-the-art ITM
+methods. The robustness of PCME++ is also evaluated under noisy image-text
+correspondences. In addition, the potential applicability of PCME++ in
+automatic prompt-filtering for zero-shot classification is shown. The code is
+available at https://github.com/naver-ai/pcmepp
+
+
+
+ comment: ICLR 2024 camera-ready; Code: https://github.com/naver-ai/pcmepp. + Project page: https://naver-ai.github.io/pcmepp/. 30 pages, 2.2 MB +
+
+
+
+
+ + ♻ ☆ Deep Semantic Segmentation of Natural and Medical Images: A Review + + +
+ The semantic image segmentation task consists of classifying each pixel of an
+image into an instance, where each instance corresponds to a class. This task
+is part of the broader concept of scene understanding, i.e., explaining the
+global context of an image. In the medical image analysis domain, image
+segmentation can be used for image-guided interventions, radiotherapy, or
+improved radiological diagnostics. In this review, we categorize the leading
+deep learning-based medical and non-medical image segmentation solutions into
+six main groups of deep architectural, data synthesis-based, loss
+function-based, sequenced models, weakly supervised, and multi-task methods and
+provide a comprehensive review of the contributions in each of these groups.
+Further, for each group, we analyze its variants, discuss the limitations of
+the current approaches, and present potential future research directions for
+semantic image segmentation.
+
+
+
+ comment: 45 pages, 16 figures. Accepted for publication in Springer Artificial + Intelligence Review +
+
+
+
+
+ + ♻ ☆ Resolution Limit of Single-Photon LiDAR + + +
+ Single-photon Light Detection and Ranging (LiDAR) systems are often equipped +with an array of detectors for improved spatial resolution and sensing speed. +However, given a fixed amount of flux produced by the laser transmitter across +the scene, the per-pixel Signal-to-Noise Ratio (SNR) will decrease when more +pixels are packed in a unit space. This presents a fundamental trade-off +between the spatial resolution of the sensor array and the SNR received at each +pixel. Theoretical characterization of this fundamental limit is explored. By +deriving the photon arrival statistics and introducing a series of new +approximation techniques, the Mean Squared Error (MSE) of the +maximum-likelihood estimator of the time delay is derived. The theoretical +predictions align well with simulations and real data. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 113 + +
+
+
+ + ☆ 94% on CIFAR-10 in 3.29 Seconds on a Single GPU + + +
+ CIFAR-10 is among the most widely used datasets in machine learning, +facilitating thousands of research projects per year. To accelerate research +and reduce the cost of experiments, we introduce training methods for CIFAR-10 +which reach 94% accuracy in 3.29 seconds, 95% in 10.4 seconds, and 96% in 46.3 +seconds, when run on a single NVIDIA A100 GPU. As one factor contributing to +these training speeds, we propose a derandomized variant of horizontal flipping +augmentation, which we show improves over the standard method in every case +where flipping is beneficial over no flipping at all. Our code is released at +https://github.com/KellerJordan/cifar10-airbench. + +
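+ A minimal sketch of the derandomized flipping idea (an illustration under assumptions, not the
+released airbench code): instead of flipping each image independently with probability 0.5, every
+sample follows a deterministic schedule so it appears in both orientations equally often, removing
+the variance of independent coin flips. The particular alternation rule below is a hypothetical choice.
+```python
+import numpy as np
+
+def derandomized_hflip(batch: np.ndarray, indices: np.ndarray, epoch: int) -> np.ndarray:
+    """Flip each image on alternating epochs instead of with a random coin flip.
+
+    batch:   (N, C, H, W) images
+    indices: (N,) dataset indices of the images in this batch
+    epoch:   current epoch number
+    """
+    flip_mask = (indices + epoch) % 2 == 0          # deterministic per (sample, epoch) pair
+    out = batch.copy()
+    out[flip_mask] = out[flip_mask][..., ::-1]      # horizontal flip along the width axis
+    return out
+
+# toy usage
+imgs = np.random.rand(8, 3, 32, 32).astype(np.float32)
+aug = derandomized_hflip(imgs, np.arange(8), epoch=3)
+print(aug.shape)
+```
+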
+
+
+
+
+ + ☆ Denoising Monte Carlo Renders With Diffusion Models + + +
+ Physically-based renderings contain Monte-Carlo noise, with variance that +increases as the number of rays per pixel decreases. This noise, while +zero-mean for good modern renderers, can have heavy tails (most notably, for +scenes containing specular or refractive objects). Learned methods for +restoring low fidelity renders are highly developed, because suppressing render +noise means one can save compute and use fast renders with few rays per pixel. +We demonstrate that a diffusion model can denoise low fidelity renders +successfully. Furthermore, our method can be conditioned on a variety of +natural render information, and this conditioning helps performance. +Quantitative experiments show that our method is competitive with SOTA across a +range of sampling rates, but current metrics slightly favor competitor methods. +Qualitative examination of the reconstructions suggests that the metrics +themselves may not be reliable. The image prior applied by a diffusion method +strongly favors reconstructions that are "like" real images -- so have straight +shadow boundaries, curved specularities, no "fireflies" and the like -- and +metrics do not account for this. We show numerous examples where methods +preferred by current metrics produce qualitatively weaker reconstructions than +ours. + +
+
+ comment: 14 pages, 12 figures +
+
+
+
+
+ + ☆ DiffHuman: Probabilistic Photorealistic 3D Reconstruction of Humans CVPR 2024 + + +
+ We present DiffHuman, a probabilistic method for photorealistic 3D human +reconstruction from a single RGB image. Despite the ill-posed nature of this +problem, most methods are deterministic and output a single solution, often +resulting in a lack of geometric detail and blurriness in unseen or uncertain +regions. In contrast, DiffHuman predicts a probability distribution over 3D +reconstructions conditioned on an input 2D image, which allows us to sample +multiple detailed 3D avatars that are consistent with the image. DiffHuman is +implemented as a conditional diffusion model that denoises pixel-aligned 2D +observations of an underlying 3D shape representation. During inference, we may +sample 3D avatars by iteratively denoising 2D renders of the predicted 3D +representation. Furthermore, we introduce a generator neural network that +approximates rendering with considerably reduced runtime (55x speed up), +resulting in a novel dual-branch diffusion framework. Our experiments show that +DiffHuman can produce diverse and detailed reconstructions for the parts of the +person that are unseen or uncertain in the input image, while remaining +competitive with the state-of-the-art when reconstructing visible surfaces. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Score-Based Diffusion Models for Photoacoustic Tomography Image + Reconstruction + + +
+ Photoacoustic tomography (PAT) is a rapidly-evolving medical imaging modality +that combines optical absorption contrast with ultrasound imaging depth. One +challenge in PAT is image reconstruction with inadequate acoustic signals due +to limited sensor coverage or due to the density of the transducer array. Such +cases call for solving an ill-posed inverse reconstruction problem. In this +work, we use score-based diffusion models to solve the inverse problem of +reconstructing an image from limited PAT measurements. The proposed approach +allows us to incorporate an expressive prior learned by a diffusion model on +simulated vessel structures while still being robust to varying transducer +sparsity conditions. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ SceneGraphLoc: Cross-Modal Coarse Visual Localization on 3D Scene Graphs + + +
+ We introduce a novel problem, i.e., the localization of an input image within +a multi-modal reference map represented by a database of 3D scene graphs. These +graphs comprise multiple modalities, including object-level point clouds, +images, attributes, and relationships between objects, offering a lightweight +and efficient alternative to conventional methods that rely on extensive image +databases. Given the available modalities, the proposed method SceneGraphLoc +learns a fixed-sized embedding for each node (i.e., representing an object +instance) in the scene graph, enabling effective matching with the objects +visible in the input query image. This strategy significantly outperforms other +cross-modal methods, even without incorporating images into the map embeddings. +When images are leveraged, SceneGraphLoc achieves performance close to that of +state-of-the-art techniques depending on large image databases, while requiring +three orders-of-magnitude less storage and operating orders-of-magnitude +faster. The code will be made public. + +
+
+
+
+
+ + ☆ Multiway Point Cloud Mosaicking with Diffusion and Global Optimization + + +
+ We introduce a novel framework for multiway point cloud mosaicking (named
+Wednesday), designed to co-align sets of partially overlapping point clouds --
+typically obtained from 3D scanners or moving RGB-D cameras -- into a unified
+coordinate system. At the core of our approach is ODIN, a learned pairwise
+registration algorithm that iteratively identifies overlaps and refines
+attention scores, employing a diffusion-based process for denoising pairwise
+correlation matrices to enhance matching accuracy. Further steps include
+constructing a pose graph from all point clouds, performing rotation averaging,
+and applying a novel robust algorithm that re-estimates translations optimally
+in terms of consensus maximization and translation optimization. Finally, the
+point cloud rotations and positions are optimized jointly by a diffusion-based
+approach. Tested on four diverse, large-scale datasets, our method achieves
+state-of-the-art pairwise and multiway registration results by a large margin
+on all benchmarks. Our code and models are available at
+https://github.com/jinsz/Multiway-Point-Cloud-Mosaicking-with-Diffusion-and-Global-Optimization.
+
+
+
+
+
+
+ + ☆ Extracting Manifold Information from Point Clouds + + +
+ A kernel based method is proposed for the construction of signature +(defining) functions of subsets of $\mathbb{R}^d$. The subsets can range from +full dimensional manifolds (open subsets) to point clouds (a finite number of +points) and include bounded smooth manifolds of any codimension. The +interpolation and analysis of point clouds are the main application. Two +extreme cases in terms of regularity are considered, where the data set is +interpolated by an analytic surface, at the one extreme, and by a H\"older +continuous surface, at the other. The signature function can be computed as a +linear combination of translated kernels, the coefficients of which are the +solution of a finite dimensional linear problem. Once it is obtained, it can be +used to estimate the dimension as well as the normal and the curvatures of the +interpolated surface. The method is global and does not require explicit +knowledge of local neighborhoods or any other structure present in the data +set. It admits a variational formulation with a natural ``regularized'' +counterpart, that proves to be useful in dealing with data sets corrupted by +numerical error or noise. The underlying analytical structure of the approach +is presented in general before it is applied to the case of point clouds. + +
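+ The core computation described here -- a signature function written as a linear combination of
+translated kernels whose coefficients solve a finite-dimensional linear problem -- can be sketched as
+follows. This is a toy illustration: the 2D circle point cloud, the Gaussian kernel, the off-surface
+targets, and the regularization constant are all assumptions, not the paper's construction.
+```python
+import numpy as np
+
+def rbf(x, y, eps=2.0):
+    """Gaussian kernel matrix K[i, j] = exp(-eps^2 * ||x_i - y_j||^2)."""
+    d2 = ((x[:, None, :] - y[None, :, :]) ** 2).sum(-1)
+    return np.exp(-eps ** 2 * d2)
+
+# toy point cloud sampled from the unit circle (the "data set")
+t = np.linspace(0, 2 * np.pi, 60, endpoint=False)
+pts = np.c_[np.cos(t), np.sin(t)]
+
+# off-surface points along the normals give the signature function nonzero targets
+normals = pts / np.linalg.norm(pts, axis=1, keepdims=True)
+centers = np.vstack([pts, pts + 0.1 * normals, pts - 0.1 * normals])
+targets = np.r_[np.zeros(len(pts)), 0.1 * np.ones(len(pts)), -0.1 * np.ones(len(pts))]
+
+# coefficients of the translated kernels: solve the (lightly regularized) linear system
+K = rbf(centers, centers)
+coeffs = np.linalg.solve(K + 1e-8 * np.eye(len(K)), targets)
+
+def signature(q):
+    """Evaluate the fitted signature function at query points q of shape (m, 2)."""
+    return rbf(q, centers) @ coeffs
+
+# the zero level set of the signature function approximates the circle
+print(signature(np.array([[1.0, 0.0], [0.9, 0.0], [1.1, 0.0]])))
+```
+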
+
+ comment: 27 pages, 16 figures, 5 tables +
+
+
+
+
+ + ☆ Do Vision-Language Models Understand Compound Nouns? NAACL 2024 + + +
+ Open-vocabulary vision-language models (VLMs) like CLIP, trained using
+contrastive loss, have emerged as a promising new paradigm for text-to-image
+retrieval. However, do VLMs understand compound nouns (CNs) (e.g., lab coat) as
+well as they understand nouns (e.g., lab)? We curate Compun, a novel benchmark
+with 400 unique and commonly used CNs, to evaluate the effectiveness of VLMs in
+interpreting CNs. The Compun benchmark challenges a VLM for text-to-image
+retrieval where, given a text prompt with a CN, the task is to select the
+correct image that shows the CN among a pair of distractor images that show the
+constituent nouns that make up the CN. Next, we perform an in-depth analysis to
+highlight CLIP's limited understanding of certain types of CNs. Finally, we
+present an alternative framework that moves beyond hand-written templates for
+text prompts widely used by CLIP-like models. We employ a Large Language Model
+to generate multiple diverse captions that include the CN as an object in the
+scene described by the caption. Our proposed method improves CN understanding
+of CLIP by 8.25% on Compun. Code and benchmark are available at:
+https://github.com/sonalkum/Compun
+
+
+
+ comment: Accepted to NAACL 2024 Main Conference +
+
+
+
+
+ + ☆ Continual Learning for Autonomous Robots: A Prototype-based Approach IROS + + +
+ Humans and animals learn throughout their lives from limited amounts of
+sensed data, both with and without supervision. Autonomous, intelligent robots
+of the future are often expected to do the same. The existing continual
+learning (CL) methods are usually not directly applicable to robotic settings:
+they typically require buffering and a balanced replay of training data. A
+few-shot online continual learning (FS-OCL) setting has been proposed to
+address more realistic scenarios where robots must learn from a non-repeated
+sparse data stream. To enable truly autonomous life-long learning, an
+additional challenge of detecting novelties and learning new items without
+supervision needs to be addressed. We address this challenge with our new
+prototype-based approach called Continually Learning Prototypes (CLP). In
+addition to being capable of FS-OCL learning, CLP also detects novel objects
+and learns them without supervision. To mitigate forgetting, CLP utilizes a
+novel metaplasticity mechanism that adapts the learning rate individually per
+prototype. CLP is rehearsal-free, hence does not require a memory buffer, and
+is compatible with neuromorphic hardware, characterized by ultra-low power
+consumption, real-time processing abilities, and on-chip learning. Indeed, we
+have open-sourced a simple version of CLP in the neuromorphic software
+framework Lava, targeting Intel's neuromorphic chip Loihi 2. We evaluate CLP
+on a robotic vision dataset, OpenLORIS. In a low-instance FS-OCL scenario, CLP
+shows state-of-the-art results. In the open world, CLP detects novelties with
+superior precision and recall and learns features of the detected novel classes
+without supervision, achieving a strong baseline of 99% base class and 65%/76%
+(5-shot/10-shot) novel class accuracy.
+
+
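+ The prototype mechanics can be sketched as below. This is an illustrative reading of the abstract,
+not the released Lava/Loihi implementation: the novelty threshold, the winner-update rule, and the
+count-based learning-rate decay (loosely mirroring the metaplasticity idea) are all assumptions.
+```python
+import numpy as np
+
+class PrototypeLearner:
+    """Minimal prototype-based online learner with novelty detection (illustrative only)."""
+
+    def __init__(self, novelty_threshold: float = 1.5):
+        self.protos, self.labels, self.counts = [], [], []
+        self.threshold = novelty_threshold
+
+    def _nearest(self, x):
+        dists = [np.linalg.norm(x - p) for p in self.protos]
+        i = int(np.argmin(dists))
+        return i, dists[i]
+
+    def observe(self, x, label=None):
+        """Create a prototype for novelties; otherwise update the winning prototype."""
+        if not self.protos or self._nearest(x)[1] > self.threshold:
+            self.protos.append(x.astype(float).copy())
+            self.labels.append(label)
+            self.counts.append(1)
+            return "novelty"
+        i, _ = self._nearest(x)
+        lr = 1.0 / (1 + self.counts[i])              # per-prototype learning rate that decays with use
+        self.protos[i] += lr * (x - self.protos[i])  # move the winner toward the sample
+        self.counts[i] += 1
+        if self.labels[i] is None and label is not None:
+            self.labels[i] = label                   # a label may arrive later for a former novelty
+        return self.labels[i]
+
+learner = PrototypeLearner()
+for x, y in [(np.zeros(4), "cup"), (np.full(4, 3.0), None), (np.full(4, 3.1), "plate")]:
+    print(learner.observe(x, y))
+```
+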
+
+ comment: Submitted to IEEE/RSJ International Conference on Intelligent Robots + and Systems (IROS) +
+
+
+
+
+ + ☆ Orchestrate Latent Expertise: Advancing Online Continual Learning with + Multi-Level Supervision and Reverse Self-Distillation CVPR 2024 + + +
+ To accommodate real-world dynamics, artificial intelligence systems need to
+cope with sequentially arriving content in an online manner. Beyond regular
+Continual Learning (CL) attempting to address catastrophic forgetting with
+offline training of each task, Online Continual Learning (OCL) is a more
+challenging yet realistic setting that performs CL in a one-pass data stream.
+Current OCL methods primarily rely on memory replay of old training samples.
+However, a notable gap from CL to OCL stems from the additional
+overfitting-underfitting dilemma associated with the use of rehearsal buffers:
+the inadequate learning of new training samples (underfitting) and the repeated
+learning of a few old training samples (overfitting). To this end, we introduce
+a novel approach, Multi-level Online Sequential Experts (MOSE), which
+cultivates the model as stacked sub-experts, integrating multi-level
+supervision and reverse self-distillation. Supervision signals across multiple
+stages facilitate appropriate convergence on the new task, while knowledge
+distillation, which gathers various strengths from the experts, mitigates the
+performance decline on old tasks. MOSE demonstrates remarkable efficacy in
+learning new samples and preserving past knowledge through multi-level experts,
+thereby significantly advancing OCL performance over state-of-the-art baselines
+(e.g., up to 7.3% on Split CIFAR-100 and 6.1% on Split Tiny-ImageNet).
+
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ SVGCraft: Beyond Single Object Text-to-SVG Synthesis with Comprehensive + Canvas Layout + + +
+ Generating VectorArt from text prompts is a challenging vision task, +requiring diverse yet realistic depictions of the seen as well as unseen +entities. However, existing research has been mostly limited to the generation +of single objects, rather than comprehensive scenes comprising multiple +elements. In response, this work introduces SVGCraft, a novel end-to-end +framework for the creation of vector graphics depicting entire scenes from +textual descriptions. Utilizing a pre-trained LLM for layout generation from +text prompts, this framework introduces a technique for producing masked +latents in specified bounding boxes for accurate object placement. It +introduces a fusion mechanism for integrating attention maps and employs a +diffusion U-Net for coherent composition, speeding up the drawing process. The +resulting SVG is optimized using a pre-trained encoder and LPIPS loss with +opacity modulation to maximize similarity. Additionally, this work explores the +potential of primitive shapes in facilitating canvas completion in constrained +environments. Through both qualitative and quantitative assessments, SVGCraft +is demonstrated to surpass prior works in abstraction, recognizability, and +detail, as evidenced by its performance metrics (CLIP-T: 0.4563, Cosine +Similarity: 0.6342, Confusion: 0.66, Aesthetic: 6.7832). The code will be +available at https://github.com/ayanban011/SVGCraft. + +
+
+
+
+
+ + ☆ 3DGSR: Implicit Surface Reconstruction with 3D Gaussian Splatting + + +
+ In this paper, we present an implicit surface reconstruction method with 3D +Gaussian Splatting (3DGS), namely 3DGSR, that allows for accurate 3D +reconstruction with intricate details while inheriting the high efficiency and +rendering quality of 3DGS. The key insight is incorporating an implicit signed +distance field (SDF) within 3D Gaussians to enable them to be aligned and +jointly optimized. First, we introduce a differentiable SDF-to-opacity +transformation function that converts SDF values into corresponding Gaussians' +opacities. This function connects the SDF and 3D Gaussians, allowing for +unified optimization and enforcing surface constraints on the 3D Gaussians. +During learning, optimizing the 3D Gaussians provides supervisory signals for +SDF learning, enabling the reconstruction of intricate details. However, this +only provides sparse supervisory signals to the SDF at locations occupied by +Gaussians, which is insufficient for learning a continuous SDF. Then, to +address this limitation, we incorporate volumetric rendering and align the +rendered geometric attributes (depth, normal) with those derived from 3D +Gaussians. This consistency regularization introduces supervisory signals to +locations not covered by discrete 3D Gaussians, effectively eliminating +redundant surfaces outside the Gaussian sampling range. Our extensive +experimental results demonstrate that our 3DGSR method enables high-quality 3D +surface reconstruction while preserving the efficiency and rendering quality of +3DGS. Besides, our method competes favorably with leading surface +reconstruction techniques while offering a more efficient learning process and +much better rendering qualities. The code will be available at +https://github.com/CVMI-Lab/3DGSR. + +
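+ The SDF-to-opacity coupling can be sketched as follows. The bell-shaped mapping below is a guess at
+a plausible form -- the abstract does not give 3DGSR's exact transformation -- but it shows the key
+property: Gaussians near the zero level set get high opacity and gradients flow back to the SDF.
+```python
+import torch
+
+def sdf_to_opacity(sdf: torch.Tensor, beta: float = 0.05) -> torch.Tensor:
+    """Differentiable map from signed distance to opacity in (0, 1].
+
+    Opacity peaks at 1 when sdf = 0 and decays smoothly as |sdf| grows;
+    beta controls the width. Illustrative choice, not necessarily 3DGSR's function.
+    """
+    s = torch.sigmoid(sdf / beta)
+    return 4.0 * s * (1.0 - s)
+
+sdf_values = torch.tensor([-0.2, -0.05, 0.0, 0.05, 0.2], requires_grad=True)
+opacity = sdf_to_opacity(sdf_values)
+opacity.sum().backward()                 # gradients reach the SDF, enabling joint optimization
+print(opacity.detach(), sdf_values.grad)
+```
+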
+
+
+
+
+ + ☆ Constrained Layout Generation with Factor Graphs CVPR 2024 + + +
+ This paper addresses the challenge of object-centric layout generation under +spatial constraints, seen in multiple domains including floorplan design +process. The design process typically involves specifying a set of spatial +constraints that include object attributes like size and inter-object relations +such as relative positioning. Existing works, which typically represent objects +as single nodes, lack the granularity to accurately model complex interactions +between objects. For instance, often only certain parts of an object, like a +room's right wall, interact with adjacent objects. To address this gap, we +introduce a factor graph based approach with four latent variable nodes for +each room, and a factor node for each constraint. The factor nodes represent +dependencies among the variables to which they are connected, effectively +capturing constraints that are potentially of a higher order. We then develop +message-passing on the bipartite graph, forming a factor graph neural network +that is trained to produce a floorplan that aligns with the desired +requirements. Our approach is simple and generates layouts faithful to the user +requirements, demonstrated by a large improvement in IOU scores over existing +methods. Additionally, our approach, being inferential and accurate, is +well-suited to the practical human-in-the-loop design process where +specifications evolve iteratively, offering a practical and powerful tool for +AI-guided design. + +
+
+ comment: To be published at IEEE/CVF CVPR 2024 +
+
+
+
+
+ + ☆ TTD: Text-Tag Self-Distillation Enhancing Image-Text Alignment in CLIP + to Alleviate Single Tag Bias + + +
+ We identify a critical bias in contemporary CLIP-based models, which we +denote as \textit{single tag bias}. This bias manifests as a disproportionate +focus on a singular tag (word) while neglecting other pertinent tags, stemming +from CLIP's text embeddings that prioritize one specific tag in image-text +relationships. When deconstructing text into individual tags, only one tag +tends to have high relevancy with CLIP's image embedding, leading to an +imbalanced tag relevancy. This results in an uneven alignment among multiple +tags present in the text. To tackle this challenge, we introduce a novel +two-step fine-tuning approach. First, our method leverages the similarity +between tags and their nearest pixels for scoring, enabling the extraction of +image-relevant tags from the text. Second, we present a self-distillation +strategy aimed at aligning the combined masks from extracted tags with the +text-derived mask. This approach mitigates the single tag bias, thereby +significantly improving the alignment of CLIP's model without necessitating +additional data or supervision. Our technique demonstrates model-agnostic +improvements in multi-tag classification and segmentation tasks, surpassing +competing methods that rely on external resources. Code is available at +https://github.com/shjo-april/TTD. + +
+
+
+
+
+ + ☆ DHR: Dual Features-Driven Hierarchical Rebalancing in Inter- and + Intra-Class Regions for Weakly-Supervised Semantic Segmentation + + +
+ Weakly-supervised semantic segmentation (WSS) ensures high-quality +segmentation with limited data and excels when employed as input seed masks for +large-scale vision models such as Segment Anything. However, WSS faces +challenges related to minor classes since those are overlooked in images with +adjacent multiple classes, a limitation originating from the overfitting of +traditional expansion methods like Random Walk. We first address this by +employing unsupervised and weakly-supervised feature maps instead of +conventional methodologies, allowing for hierarchical mask enhancement. This +method distinctly categorizes higher-level classes and subsequently separates +their associated lower-level classes, ensuring all classes are correctly +restored in the mask without losing minor ones. Our approach, validated through +extensive experimentation, significantly improves WSS across five benchmarks +(VOC: 79.8\%, COCO: 53.9\%, Context: 49.0\%, ADE: 32.9\%, Stuff: 37.4\%), +reducing the gap with fully supervised methods by over 84\% on the VOC +validation set. Code is available at https://github.com/shjo-april/DHR. + +
+
+
+
+
+ + ☆ The Devil is in the Edges: Monocular Depth Estimation with Edge-aware + Consistency Fusion + + +
+ This paper presents a novel monocular depth estimation method, named ECFNet,
+for estimating high-quality monocular depth with clear edges and a valid
+overall structure from a single RGB image. We conduct a thorough inquiry into
+the key factor that affects the edge depth estimation of MDE networks, and
+conclude that the edge information itself plays a critical role in predicting
+depth details. Driven by this analysis, we propose to explicitly employ the
+image edges as input for ECFNet and fuse the initial depths from different
+sources to produce the final depth. Specifically, ECFNet first uses a hybrid
+edge detection strategy to get the edge map and edge-highlighted image from the
+input image, and then leverages a pre-trained MDE network to infer the initial
+depths of the aforementioned three images. After that, ECFNet utilizes a
+layered fusion module (LFM) to fuse the initial depths, which are further
+updated by a depth consistency module (DCM) to form the final estimation.
+Extensive experimental results on public datasets and ablation studies indicate
+that our method achieves state-of-the-art performance. Project page:
+https://zrealli.github.io/edgedepth.
+
+
+
+ comment: 17 pages, 19 figures +
+
+
+
+
+ + ☆ Towards Variable and Coordinated Holistic Co-Speech Motion Generation CVPR 2024 + + +
+ This paper addresses the problem of generating lifelike holistic co-speech +motions for 3D avatars, focusing on two key aspects: variability and +coordination. Variability allows the avatar to exhibit a wide range of motions +even with similar speech content, while coordination ensures a harmonious +alignment among facial expressions, hand gestures, and body poses. We aim to +achieve both with ProbTalk, a unified probabilistic framework designed to +jointly model facial, hand, and body movements in speech. ProbTalk builds on +the variational autoencoder (VAE) architecture and incorporates three core +designs. First, we introduce product quantization (PQ) to the VAE, which +enriches the representation of complex holistic motion. Second, we devise a +novel non-autoregressive model that embeds 2D positional encoding into the +product-quantized representation, thereby preserving essential structure +information of the PQ codes. Last, we employ a secondary stage to refine the +preliminary prediction, further sharpening the high-frequency details. Coupling +these three designs enables ProbTalk to generate natural and diverse holistic +co-speech motions, outperforming several state-of-the-art methods in +qualitative and quantitative evaluations, particularly in terms of realism. Our +code and model will be released for research purposes at +https://feifeifeiliu.github.io/probtalk/. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Efficient Multi-branch Segmentation Network for Situation Awareness in + Autonomous Navigation + + +
+ Real-time and high-precision situational awareness technology is critical for +autonomous navigation of unmanned surface vehicles (USVs). In particular, +robust and fast obstacle semantic segmentation methods are essential. However, +distinguishing between the sea and the sky is challenging due to the +differences between port and maritime environments. In this study, we built a +dataset that captured perspectives from USVs and unmanned aerial vehicles in a +maritime port environment and analysed the data features. Statistical analysis +revealed a high correlation between the distribution of the sea and sky and row +positional information. Based on this finding, a three-branch semantic +segmentation network with a row position encoding module (RPEM) was proposed to +improve the prediction accuracy between the sea and the sky. The proposed RPEM +highlights the effect of row coordinates on feature extraction. Compared to the +baseline, the three-branch network with RPEM significantly improved the ability +to distinguish between the sea and the sky without significantly reducing the +computational speed. + +
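+ One simple way to read the row position encoding module is sketched below. This is an illustrative
+stand-in, not the paper's RPEM design: a normalized row-coordinate map is concatenated to the feature
+tensor so the network can exploit the strong correlation between row position and the sea/sky split.
+```python
+import torch
+import torch.nn as nn
+
+class RowPositionEncoding(nn.Module):
+    """Append a normalized row-coordinate channel to a feature map (illustrative RPEM stand-in)."""
+
+    def __init__(self, in_channels: int, out_channels: int):
+        super().__init__()
+        self.proj = nn.Conv2d(in_channels + 1, out_channels, kernel_size=3, padding=1)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        n, _, h, w = x.shape
+        rows = torch.linspace(0.0, 1.0, h, device=x.device)      # 0 at the top row, 1 at the bottom
+        row_map = rows.view(1, 1, h, 1).expand(n, 1, h, w)       # (N, 1, H, W) coordinate channel
+        return self.proj(torch.cat([x, row_map], dim=1))
+
+feat = torch.randn(2, 64, 48, 96)
+print(RowPositionEncoding(64, 64)(feat).shape)                   # torch.Size([2, 64, 48, 96])
+```
+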
+
+
+
+
+ + ☆ STBA: Towards Evaluating the Robustness of DNNs for Query-Limited + Black-box Scenario + + +
+ Many attack techniques have been proposed to explore the vulnerability of +DNNs and further help to improve their robustness. Despite the significant +progress made recently, existing black-box attack methods still suffer from +unsatisfactory performance due to the vast number of queries needed to optimize +desired perturbations. Besides, the other critical challenge is that +adversarial examples built in a noise-adding manner are abnormal and struggle +to successfully attack robust models, whose robustness is enhanced by +adversarial training against small perturbations. There is no doubt that these +two issues mentioned above will significantly increase the risk of exposure and +result in a failure to dig deeply into the vulnerability of DNNs. Hence, it is +necessary to evaluate DNNs' fragility sufficiently under query-limited settings +in a non-additional way. In this paper, we propose the Spatial Transform +Black-box Attack (STBA), a novel framework to craft formidable adversarial +examples in the query-limited scenario. Specifically, STBA introduces a flow +field to the high-frequency part of clean images to generate adversarial +examples and adopts the following two processes to enhance their naturalness +and significantly improve the query efficiency: a) we apply an estimated flow +field to the high-frequency part of clean images to generate adversarial +examples instead of introducing external noise to the benign image, and b) we +leverage an efficient gradient estimation method based on a batch of samples to +optimize such an ideal flow field under query-limited settings. Compared to +existing score-based black-box baselines, extensive experiments indicated that +STBA could effectively improve the imperceptibility of the adversarial examples +and remarkably boost the attack success rate under query-limited settings. + +
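+ The spatial-transform part of the attack can be sketched as below. This is illustrative only: a
+Gaussian blur stands in for the low/high-frequency split, and the query-efficient, sample-based
+gradient estimation that STBA uses to optimize the flow field is omitted.
+```python
+import torch
+import torch.nn.functional as F
+
+def split_frequencies(img, k=9, sigma=3.0):
+    """Gaussian low-pass per channel; the residual is the high-frequency component."""
+    coords = torch.arange(k, dtype=torch.float32) - k // 2
+    g = torch.exp(-(coords ** 2) / (2 * sigma ** 2))
+    g = g / g.sum()
+    kernel = (g[:, None] * g[None, :]).expand(img.shape[1], 1, k, k).contiguous()
+    low = F.conv2d(img, kernel, padding=k // 2, groups=img.shape[1])
+    return low, img - low
+
+def warp_high_freq(img, flow):
+    """Warp only the high-frequency part with a flow field, then recombine with the low-frequency part."""
+    low, high = split_frequencies(img)
+    n, _, h, w = img.shape
+    ys, xs = torch.meshgrid(torch.linspace(-1, 1, h), torch.linspace(-1, 1, w), indexing="ij")
+    base_grid = torch.stack([xs, ys], dim=-1).unsqueeze(0).expand(n, -1, -1, -1)
+    warped_high = F.grid_sample(high, base_grid + flow, align_corners=True)
+    return (low + warped_high).clamp(0, 1)
+
+img = torch.rand(1, 3, 64, 64)                 # clean image in [0, 1]
+flow = 0.01 * torch.randn(1, 64, 64, 2)        # candidate flow field (would be optimized query by query)
+print(warp_high_freq(img, flow).shape)
+```
+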
+
+
+
+
+ + ☆ Reusable Architecture Growth for Continual Stereo Matching CVPR 2022 + + +
+ The remarkable performance of recent stereo depth estimation models benefits +from the successful use of convolutional neural networks to regress dense +disparity. Akin to most tasks, this needs gathering training data that covers a +number of heterogeneous scenes at deployment time. However, training samples +are typically acquired continuously in practical applications, making the +capability to learn new scenes continually even more crucial. For this purpose, +we propose to perform continual stereo matching where a model is tasked to 1) +continually learn new scenes, 2) overcome forgetting previously learned scenes, +and 3) continuously predict disparities at inference. We achieve this goal by +introducing a Reusable Architecture Growth (RAG) framework. RAG leverages +task-specific neural unit search and architecture growth to learn new scenes +continually in both supervised and self-supervised manners. It can maintain +high reusability during growth by reusing previous units while obtaining good +performance. Additionally, we present a Scene Router module to adaptively +select the scene-specific architecture path at inference. Comprehensive +experiments on numerous datasets show that our framework performs impressively +in various weather, road, and city circumstances and surpasses the +state-of-the-art methods in more challenging cross-dataset settings. Further +experiments also demonstrate the adaptability of our method to unseen scenes, +which can facilitate end-to-end stereo architecture learning and practical +deployment. + +
+
+ comment: Extended version of CVPR 2022 paper "Continual Stereo Matching of + Continuous Driving Scenes with Growing Architecture" - Accepted to TPAMI in + 2024 +
+
+
+
+
+ + ☆ Spread Your Wings: A Radial Strip Transformer for Image Deblurring + + +
+ Exploring motion information is important for the motion deblurring task.
+Recently, window-based transformer approaches have achieved decent performance
+in image deblurring. Note that the motion causing blurry results is usually
+composed of translation and rotation movements, and the window-shift operation
+in the Cartesian coordinate system used by window-based transformer approaches
+only directly explores translation motion in orthogonal directions. Thus, these
+methods are limited in modeling the rotation component. To alleviate this
+problem, we introduce a polar coordinate-based transformer, which uses angles
+and distance to explore rotation and translation information together. In this
+paper, we propose a Radial Strip Transformer (RST), a transformer-based
+architecture that restores blurred images in a polar coordinate system instead
+of a Cartesian one. RST contains a dynamic radial embedding module (DRE) to
+extract shallow features by a radial deformable convolution. We design a polar
+mask layer to generate the offsets for the deformable convolution, which can
+reshape the convolution kernel along the radius to better capture rotation
+motion information. Furthermore, we propose a radial strip attention solver
+(RSAS) for deep feature extraction, where the relationship between windows is
+organized by azimuth and radius. This attention module contains radial strip
+windows to reweight image features in the polar coordinate system, which
+preserves more useful rotation and translation information for better
+recovering sharp images. Experimental results on six synthetic and real-world
+datasets show that our method performs favorably against other SOTA methods for
+the image deblurring task.
+
+
+
+
+
+
+ + ☆ Rethinking Attention-Based Multiple Instance Learning for Whole-Slide + Pathological Image Classification: An Instance Attribute Viewpoint + + +
+ Multiple instance learning (MIL) is a robust paradigm for whole-slide +pathological image (WSI) analysis, processing gigapixel-resolution images with +slide-level labels. As pioneering efforts, attention-based MIL (ABMIL) and its +variants are increasingly becoming popular due to the characteristics of +simultaneously handling clinical diagnosis and tumor localization. However, the +attention mechanism exhibits limitations in discriminating between instances, +which often misclassifies tissues and potentially impairs MIL performance. This +paper proposes an Attribute-Driven MIL (AttriMIL) framework to address these +issues. Concretely, we dissect the calculation process of ABMIL and present an +attribute scoring mechanism that measures the contribution of each instance to +bag prediction effectively, quantifying instance attributes. Based on attribute +quantification, we develop a spatial attribute constraint and an attribute +ranking constraint to model instance correlations within and across slides, +respectively. These constraints encourage the network to capture the spatial +correlation and semantic similarity of instances, improving the ability of +AttriMIL to distinguish tissue types and identify challenging instances. +Additionally, AttriMIL employs a histopathology adaptive backbone that +maximizes the pre-trained model's feature extraction capability for collecting +pathological features. Extensive experiments on three public benchmarks +demonstrate that our AttriMIL outperforms existing state-of-the-art frameworks +across multiple evaluation metrics. The implementation code is available at +https://github.com/MedCAI/AttriMIL. + +
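+ Gated-attention MIL with an instance-level attribute score can be sketched as below. This is an
+illustrative reading, not the AttriMIL code: the "attribute score" here is taken to be the attention
+weight times the instance's class evidence, and the spatial/ranking constraints are not reproduced.
+```python
+import torch
+import torch.nn as nn
+
+class ABMILWithAttributes(nn.Module):
+    """Gated-attention MIL head that also exposes per-instance attribute scores."""
+
+    def __init__(self, in_dim=512, hid=128, n_classes=2):
+        super().__init__()
+        self.att_v = nn.Sequential(nn.Linear(in_dim, hid), nn.Tanh())
+        self.att_u = nn.Sequential(nn.Linear(in_dim, hid), nn.Sigmoid())
+        self.att_w = nn.Linear(hid, 1)
+        self.inst_cls = nn.Linear(in_dim, n_classes)   # instance-level class evidence
+        self.bag_cls = nn.Linear(in_dim, n_classes)
+
+    def forward(self, instances):                       # (num_instances, in_dim)
+        a = self.att_w(self.att_v(instances) * self.att_u(instances))   # (N, 1) raw attention
+        a = torch.softmax(a, dim=0)
+        bag_feat = (a * instances).sum(dim=0, keepdim=True)             # attention-pooled bag feature
+        bag_logits = self.bag_cls(bag_feat)
+        attributes = a * self.inst_cls(instances)       # how much each instance pushes each class
+        return bag_logits, attributes
+
+model = ABMILWithAttributes()
+bag = torch.randn(50, 512)                              # one WSI bag of 50 patch features
+logits, attr = model(bag)
+print(logits.shape, attr.shape)                         # (1, 2) and (50, 2)
+```
+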
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ SGDFormer: One-stage Transformer-based Architecture for Cross-Spectral + Stereo Image Guided Denoising + + +
+ Cross-spectral image guided denoising has shown its great potential in
+recovering clean images with rich details, such as using the near-infrared
+image to guide the denoising process of the visible one. To obtain such image
+pairs, a feasible and economical way is to employ a stereo system, which is
+widely used on mobile devices. Current works attempt to generate an aligned
+guidance image to handle the disparity between the two images. However, due to
+occlusion, spectral differences and noise degradation, the aligned guidance
+image generally exhibits ghosting and artifacts, leading to an unsatisfactory
+denoised result. To address this issue, we propose a one-stage
+transformer-based architecture, named SGDFormer, for cross-spectral Stereo
+image Guided Denoising. The architecture integrates the correspondence modeling
+and feature fusion of stereo images into a unified network. Our transformer
+block contains a noise-robust cross-attention (NRCA) module and a spatially
+variant feature fusion (SVFF) module. The NRCA module captures the long-range
+correspondence of two images in a coarse-to-fine manner to alleviate the
+interference of noise. The SVFF module further enhances salient structures and
+suppresses harmful artifacts through dynamically selecting useful information.
+Thanks to the above design, our SGDFormer can restore artifact-free images with
+fine structures, and achieves state-of-the-art performance on various datasets.
+Additionally, our SGDFormer can be extended to handle other unaligned
+cross-modal guided restoration tasks such as guided depth super-resolution.
+
+
+
+
+
+
+ + ☆ MaGRITTe: Manipulative and Generative 3D Realization from Image, Topview + and Text + + +
+ The generation of 3D scenes from user-specified conditions offers a promising +avenue for alleviating the production burden in 3D applications. Previous +studies required significant effort to realize the desired scene, owing to +limited control conditions. We propose a method for controlling and generating +3D scenes under multimodal conditions using partial images, layout information +represented in the top view, and text prompts. Combining these conditions to +generate a 3D scene involves the following significant difficulties: (1) the +creation of large datasets, (2) reflection on the interaction of multimodal +conditions, and (3) domain dependence of the layout conditions. We decompose +the process of 3D scene generation into 2D image generation from the given +conditions and 3D scene generation from 2D images. 2D image generation is +achieved by fine-tuning a pretrained text-to-image model with a small +artificial dataset of partial images and layouts, and 3D scene generation is +achieved by layout-conditioned depth estimation and neural radiance fields +(NeRF), thereby avoiding the creation of large datasets. The use of a common +representation of spatial information using 360-degree images allows for the +consideration of multimodal condition interactions and reduces the domain +dependence of the layout control. The experimental results qualitatively and +quantitatively demonstrated that the proposed method can generate 3D scenes in +diverse domains, from indoor to outdoor, according to multimodal conditions. + +
+
+ comment: Project Page: https://hara012.github.io/MaGRITTe-project +
+
+
+
+
+
+ ☆ Learning Trimaps via Clicks for Image Matting
+
+
+ Despite significant advancements in image matting, existing models heavily +depend on manually-drawn trimaps for accurate results in natural image +scenarios. However, the process of obtaining trimaps is time-consuming, lacking +user-friendliness and device compatibility. This reliance greatly limits the +practical application of all trimap-based matting methods. To address this +issue, we introduce Click2Trimap, an interactive model capable of predicting +high-quality trimaps and alpha mattes with minimal user click inputs. Through +analyzing real users' behavioral logic and characteristics of trimaps, we +successfully propose a powerful iterative three-class training strategy and a +dedicated simulation function, making Click2Trimap exhibit versatility across +various scenarios. Quantitative and qualitative assessments on synthetic and +real-world matting datasets demonstrate Click2Trimap's superior performance +compared to all existing trimap-free matting methods. Especially, in the user +study, Click2Trimap achieves high-quality trimap and matting predictions in +just an average of 5 seconds per image, demonstrating its substantial practical +value in real-world applications. + +
+
+
+
+
+ + ☆ Memory-Scalable and Simplified Functional Map Learning + + +
+ Deep functional maps have emerged in recent years as a prominent
+learning-based framework for non-rigid shape matching problems. While early
+methods in this domain only focused on learning in the functional domain, the
+latest techniques have demonstrated that promoting consistency between
+functional and pointwise maps leads to significant improvements in accuracy.
+Unfortunately, existing approaches rely heavily on the computation of large
+dense matrices arising from soft pointwise maps, which compromises their
+efficiency and scalability. To address this limitation, we introduce a novel
+memory-scalable and efficient functional map learning pipeline. By leveraging
+the specific structure of functional maps, we offer the possibility to achieve
+identical results without ever storing the pointwise map in memory.
+Furthermore, based on the same approach, we present a differentiable map
+refinement layer adapted from an existing axiomatic refinement algorithm.
+Unlike many functional map learning methods, which use this algorithm only as a
+post-processing step, ours can be easily used at train time, enabling us to
+enforce consistency between the refined and initial versions of the map. Our
+resulting approach is simpler, more efficient and more numerically stable, by
+avoiding differentiation through a linear system, while achieving close to
+state-of-the-art results in challenging scenarios.
+
+
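+ The memory saving can be illustrated in the special case of a hard nearest-neighbor assignment
+(random toy bases stand in for spectral bases; the paper's handling of soft maps differs): forming
+the functional map C = pinv(Phi2) P Phi1 only needs a row gather of Phi1, never the dense map P.
+```python
+import numpy as np
+
+n1, n2, k = 2000, 2500, 30
+phi1 = np.random.randn(n1, k)             # basis on shape 1 (stand-in for a spectral basis)
+phi2 = np.random.randn(n2, k)             # basis on shape 2
+nn_idx = np.random.randint(0, n1, n2)     # for each vertex of shape 2, its match on shape 1
+
+# naive version: build the dense (n2 x n1) pointwise map P, then C = pinv(phi2) @ P @ phi1
+P = np.zeros((n2, n1))
+P[np.arange(n2), nn_idx] = 1.0
+C_dense = np.linalg.pinv(phi2) @ (P @ phi1)
+
+# memory-scalable version: P @ phi1 is just a row gather, so P is never materialized
+C_light = np.linalg.pinv(phi2) @ phi1[nn_idx]
+
+print(np.allclose(C_dense, C_light))      # True: identical result without storing the dense map
+```
+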
+
+
+
+
+ + ☆ YNetr: Dual-Encoder architecture on Plain Scan Liver Tumors (PSLT) + + +
+ Background: Liver tumors are abnormal growths in the liver that can be either
+benign or malignant, with liver cancer being a significant health concern
+worldwide. However, there is no dataset for plain scan segmentation of liver
+tumors, nor any related algorithms. To fill this gap, we propose Plain Scan
+Liver Tumors (PSLT) and YNetr. Methods: A collection of 40 liver tumor plain
+scan segmentation datasets was assembled and annotated. Concurrently, we
+utilized the Dice coefficient as the metric for assessing the segmentation
+outcomes produced by YNetr, which has the advantage of capturing different
+frequency information. Results: The YNetr model achieved a Dice coefficient of
+62.63% on the PSLT dataset, surpassing the other publicly available model by an
+accuracy margin of 1.22%. Comparative evaluations were conducted against a
+range of models including UNet 3+, XNet, UNetr, Swin UNetr, Trans-BTS, COTr,
+nnUNetv2 (2D), nnUNetv2 (3D fullres), MedNext (2D) and MedNext (3D fullres).
+Conclusions: We not only propose a dataset named PSLT (Plain Scan Liver
+Tumors), but also explore a structure called YNetr that utilizes the wavelet
+transform to extract different frequency information, achieving
+state-of-the-art performance on PSLT in our experiments.
+
+
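+ The wavelet-based frequency separation mentioned above can be illustrated with PyWavelets (a sketch
+of the idea only; YNetr's dual-encoder wiring is not shown): a 2D DWT splits a slice into a
+low-frequency approximation and three high-frequency detail bands that separate branches could consume.
+```python
+import numpy as np
+import pywt
+
+slice_2d = np.random.rand(256, 256).astype(np.float32)   # stand-in for a plain-scan CT slice
+
+# single-level 2D discrete wavelet transform
+cA, (cH, cV, cD) = pywt.dwt2(slice_2d, "haar")
+# cA: low-frequency approximation; cH/cV/cD: horizontal, vertical, diagonal detail bands
+low_freq_input = cA                                       # could feed one encoder branch
+high_freq_input = np.stack([cH, cV, cD], axis=0)          # and the other branch
+
+print(low_freq_input.shape, high_freq_input.shape)        # (128, 128) and (3, 128, 128)
+```
+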
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ CLIP-driven Outliers Synthesis for few-shot OOD detection + + +
+ Few-shot OOD detection focuses on recognizing out-of-distribution (OOD)
+images that belong to classes unseen during training, with the use of only a
+small number of labeled in-distribution (ID) images. Up to now, a mainstream
+strategy is based on large-scale vision-language models, such as CLIP. However,
+these methods overlook a crucial issue: the lack of reliable OOD supervision
+information, which can lead to biased boundaries between in-distribution (ID)
+and OOD. To tackle this problem, we propose CLIP-driven Outliers Synthesis
+(CLIP-OS). Firstly, CLIP-OS enhances patch-level features' perception by a
+newly proposed patch uniform convolution, and adaptively obtains the proportion
+of ID-relevant information by employing CLIP-surgery-discrepancy, thus
+achieving separation between ID-relevant and ID-irrelevant features. Next,
+CLIP-OS synthesizes reliable OOD data by mixing up ID-relevant features from
+different classes to provide OOD supervision information. Afterward, CLIP-OS
+leverages synthetic OOD samples by unknown-aware prompt learning to enhance the
+separability of ID and OOD. Extensive experiments across multiple benchmarks
+demonstrate that CLIP-OS achieves superior few-shot OOD detection capability.
+
+
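+ The outlier-synthesis step -- mixing ID-relevant features from different classes to obtain OOD-like
+supervision -- can be sketched as follows. The Beta mixing coefficients and random pairing below are
+assumptions for illustration, not CLIP-OS's exact recipe.
+```python
+import torch
+
+def synthesize_ood(features: torch.Tensor, labels: torch.Tensor, alpha: float = 0.5) -> torch.Tensor:
+    """Mix ID-relevant features drawn from *different* classes to create pseudo-OOD samples."""
+    perm = torch.randperm(features.size(0))
+    different = labels != labels[perm]                    # keep only cross-class pairs
+    n = int(different.sum())
+    lam = torch.distributions.Beta(alpha, alpha).sample((n,)).unsqueeze(1)
+    return lam * features[different] + (1.0 - lam) * features[perm][different]
+
+feats = torch.randn(16, 512)                              # stand-in for ID-relevant features
+lbls = torch.randint(0, 4, (16,))
+print(synthesize_ood(feats, lbls).shape)                  # mixed features between class manifolds
+```
+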
+
+ comment: 9 pages,5 figures +
+
+
+
+
+ + ☆ Instrument-tissue Interaction Detection Framework for Surgical Video + Understanding + + +
+ The instrument-tissue interaction detection task, which helps understand
+surgical activities, is vital for constructing computer-assisted surgery
+systems but faces many challenges. Firstly, most models represent
+instrument-tissue interaction in a coarse-grained way which only focuses on
+classification and lacks the ability to automatically detect instruments and
+tissues. Secondly, existing works do not fully consider intra- and inter-frame
+relations between instruments and tissues. In this paper, we propose to
+represent instrument-tissue interaction as a quintuple and present an
+Instrument-Tissue Interaction Detection Network (ITIDNet) to detect the
+quintuple for surgical video understanding. Specifically, we propose a Snippet
+Consecutive Feature (SCF) Layer to enhance features by modeling relationships
+of proposals in the current frame using global context information in the video
+snippet. We also propose a Spatial Corresponding Attention (SCA) Layer to
+incorporate features of proposals between adjacent frames through spatial
+encoding. To reason about relationships between instruments and tissues, a
+Temporal Graph (TG) Layer is proposed with intra-frame connections to exploit
+relationships between instruments and tissues in the same frame and inter-frame
+connections to model the temporal information for the same instance. For
+evaluation, we build a cataract surgery video (PhacoQ) dataset and a
+cholecystectomy surgery video (CholecQ) dataset. Experimental results
+demonstrate the promising performance of our model, which outperforms other
+state-of-the-art models on both datasets.
+
+
+
+
+
+
+ + ☆ Exploring Unseen Environments with Robots using Large Language and + Vision Models through a Procedurally Generated 3D Scene Representation + + +
+ Recent advancements in Generative Artificial Intelligence, particularly in +the realm of Large Language Models (LLMs) and Large Vision Language Models +(LVLMs), have enabled the prospect of leveraging cognitive planners within +robotic systems. This work focuses on solving the object goal navigation +problem by mimicking human cognition to attend, perceive and store task +specific information and generate plans with the same. We introduce a +comprehensive framework capable of exploring an unfamiliar environment in +search of an object by leveraging the capabilities of Large Language +Models(LLMs) and Large Vision Language Models (LVLMs) in understanding the +underlying semantics of our world. A challenging task in using LLMs to generate +high level sub-goals is to efficiently represent the environment around the +robot. We propose to use a 3D scene modular representation, with semantically +rich descriptions of the object, to provide the LLM with task relevant +information. But providing the LLM with a mass of contextual information (rich +3D scene semantic representation), can lead to redundant and inefficient plans. +We propose to use an LLM based pruner that leverages the capabilities of +in-context learning to prune out irrelevant goal specific information. + +
+
+
+
+
+ + ☆ Harmonizing Light and Darkness: A Symphony of Prior-guided Data + Synthesis and Adaptive Focus for Nighttime Flare Removal + + +
+ Intense light sources often produce flares in captured images at night, which +deteriorates the visual quality and negatively affects downstream applications. +In order to train an effective flare removal network, a reliable dataset is +essential. The mainstream flare removal datasets are semi-synthetic to reduce +human labour, but these datasets do not cover typical scenarios involving +multiple scattering flares. To tackle this issue, we synthesize a prior-guided +dataset named Flare7K*, which contains multi-flare images where the brightness +of flares adheres to the laws of illumination. Besides, flares tend to occupy +localized regions of the image but existing networks perform flare removal on +the entire image and sometimes modify clean areas incorrectly. Therefore, we +propose a plug-and-play Adaptive Focus Module (AFM) that can adaptively mask +the clean background areas and assist models in focusing on the regions +severely affected by flares. Extensive experiments demonstrate that our data +synthesis method can better simulate real-world scenes and several models +equipped with AFM achieve state-of-the-art performance on the real-world test +dataset. + +
+
+
+
+
+ + ☆ Bayesian Exploration of Pre-trained Models for Low-shot Image + Classification + + +
+ Low-shot image classification is a fundamental task in computer vision, and +the emergence of large-scale vision-language models such as CLIP has greatly +advanced the forefront of research in this field. However, most existing +CLIP-based methods lack the flexibility to effectively incorporate other +pre-trained models that encompass knowledge distinct from CLIP. To bridge the +gap, this work proposes a simple and effective probabilistic model ensemble +framework based on Gaussian processes, which have previously demonstrated +remarkable efficacy in processing small data. We achieve the integration of +prior knowledge by specifying the mean function with CLIP and the kernel +function with an ensemble of deep kernels built upon various pre-trained +models. By regressing the classification label directly, our framework enables +analytical inference, straightforward uncertainty quantification, and +principled hyper-parameter tuning. Through extensive experiments on standard +benchmarks, we demonstrate that our method consistently outperforms competitive +ensemble baselines regarding predictive performance. Additionally, we assess +the robustness of our method and the quality of the yielded uncertainty +estimates on out-of-distribution datasets. We also illustrate that our method, +despite relying on label regression, still enjoys superior model calibration +compared to most deterministic baselines. + +
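+ The probabilistic ensemble idea -- a GP whose prior mean comes from CLIP-style zero-shot scores and
+whose kernel averages similarities from several pre-trained backbones, regressing one-hot labels in
+closed form -- can be sketched as below. All inputs are random stand-ins, not the paper's pipeline.
+```python
+import numpy as np
+
+def gp_predict(K_train, K_cross, K_test_diag, y_train, mean_train, mean_test, noise=1e-2):
+    """Closed-form GP regression with a non-zero prior mean.
+
+    mean: m(x*) + K*^T (K + sI)^-1 (y - m(X));  var: k(x*, x*) - K*^T (K + sI)^-1 K*
+    """
+    A = K_train + noise * np.eye(len(K_train))
+    alpha = np.linalg.solve(A, y_train - mean_train)
+    mu = mean_test + K_cross.T @ alpha
+    v = np.linalg.solve(A, K_cross)
+    var = K_test_diag - np.sum(K_cross * v, axis=0)
+    return mu, var
+
+def rbf(a, b, gamma=0.05):
+    return np.exp(-gamma * ((a[:, None] - b[None]) ** 2).sum(-1))
+
+rng = np.random.default_rng(0)
+n_train, n_test, n_classes = 20, 5, 3
+
+# stand-ins: "CLIP scores" act as the GP prior mean, two "backbones" define the kernel ensemble
+clip_mean_train = rng.normal(size=(n_train, n_classes))
+clip_mean_test = rng.normal(size=(n_test, n_classes))
+feats = [rng.normal(size=(n_train + n_test, d)) for d in (64, 128)]
+
+K_full = sum(rbf(f, f) for f in feats) / len(feats)              # ensemble of deep kernels
+K_train, K_cross = K_full[:n_train, :n_train], K_full[:n_train, n_train:]
+K_test_diag = np.diag(K_full[n_train:, n_train:])
+
+y_train = np.eye(n_classes)[rng.integers(0, n_classes, n_train)]  # one-hot label regression
+mu, var = gp_predict(K_train, K_cross, K_test_diag, y_train, clip_mean_train, clip_mean_test)
+print(mu.argmax(1), var)     # predicted classes and per-point predictive variance
+```
+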
+
+
+
+
+ + ☆ ST-LLM: Large Language Models Are Effective Temporal Learners + + +
+ Large Language Models (LLMs) have showcased impressive capabilities in text +comprehension and generation, prompting research efforts towards video LLMs to +facilitate human-AI interaction at the video level. However, how to effectively +encode and understand videos in video-based dialogue systems remains to be +solved. In this paper, we investigate a straightforward yet unexplored +question: Can we feed all spatial-temporal tokens into the LLM, thus delegating +the task of video sequence modeling to the LLMs? Surprisingly, this simple +approach yields significant improvements in video understanding. Based upon +this, we propose ST-LLM, an effective video-LLM baseline with Spatial-Temporal +sequence modeling inside LLM. Furthermore, to address the overhead and +stability issues introduced by uncompressed video tokens within LLMs, we +develop a dynamic masking strategy with tailor-made training objectives. For +particularly long videos, we have also designed a global-local input module to +balance efficiency and effectiveness. Consequently, we harness LLM for +proficient spatial-temporal modeling, while upholding efficiency and stability. +Extensive experimental results attest to the effectiveness of our method. +Through a more concise model and training pipeline, ST-LLM establishes a new +state-of-the-art result on VideoChatGPT-Bench and MVBench. Codes have been +available at https://github.com/TencentARC/ST-LLM. + +
+
+
+
+
+ + ☆ Monocular Identity-Conditioned Facial Reflectance Reconstruction CVPR 2024 + + +
+ Recent 3D face reconstruction methods have made remarkable advancements, yet +there remain huge challenges in monocular high-quality facial reflectance +reconstruction. Existing methods rely on a large amount of light-stage captured +data to learn facial reflectance models. However, the lack of subject diversity +poses challenges in achieving good generalization and widespread applicability. +In this paper, we learn the reflectance prior in image space rather than UV +space and present a framework named ID2Reflectance. Our framework can directly +estimate the reflectance maps of a single image while using limited reflectance +data for training. Our key insight is that reflectance data shares facial +structures with RGB faces, which enables obtaining expressive facial prior from +inexpensive RGB data thus reducing the dependency on reflectance data. We first +learn a high-quality prior for facial reflectance. Specifically, we pretrain +multi-domain facial feature codebooks and design a codebook fusion method to +align the reflectance and RGB domains. Then, we propose an identity-conditioned +swapping module that injects facial identity from the target image into the +pre-trained autoencoder to modify the identity of the source reflectance image. +Finally, we stitch multi-view swapped reflectance images to obtain renderable +assets. Extensive experiments demonstrate that our method exhibits excellent +generalization capability and achieves state-of-the-art facial reflectance +reconstruction results for in-the-wild faces. Our project page is +https://xingyuren.github.io/id2reflectance/. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ LAKE-RED: Camouflaged Images Generation by Latent Background Knowledge + Retrieval-Augmented Diffusion CVPR 2024 + + +
+ Camouflaged vision perception is an important vision task with numerous
+ practical applications. Due to the expensive collection and labeling costs,
+ this community faces a major bottleneck: existing datasets are limited to a
+ small number of object species. However, the existing camouflaged generation
+ methods require specifying the background manually, thus failing to extend the
+ camouflaged sample diversity in a low-cost manner. In this paper, we propose a
+ Latent Background Knowledge Retrieval-Augmented Diffusion (LAKE-RED) for
+ camouflaged image generation. To our knowledge, our contributions mainly
+ include: (1) For the first time, we propose a camouflaged generation paradigm
+ that does not need to receive any background inputs. (2) Our LAKE-RED is the
+ first knowledge retrieval-augmented method with interpretability for
+ camouflaged generation, in which knowledge retrieval and reasoning enhancement
+ are explicitly separated to alleviate the task-specific challenges. Moreover,
+ our method is not restricted to specific foreground targets or backgrounds,
+ offering the potential to extend camouflaged vision perception to more diverse
+ domains. (3) Experimental results demonstrate that our method outperforms the
+ existing approaches, generating more realistic camouflage images.
+
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Seeing the Unseen: A Frequency Prompt Guided Transformer for Image + Restoration + + +
+ Exploring useful features from images as prompts to guide deep image
+ restoration models is an effective way to approach image restoration. In
+ contrast to mining spatial relations within images as prompts, which neglects
+ the characteristics of different frequencies and leaves subtle or undetectable
+ artifacts in the restored image, we develop a Frequency Prompting image
+ restoration method, dubbed FPro, which can effectively provide prompt
+ components from a frequency perspective to guide the restoration model in
+ addressing these differences. Specifically, we first decompose input features
+ into separate frequency parts via dynamically learned filters, where we
+ introduce a gating mechanism for suppressing the less informative elements
+ within the kernels. To propagate useful frequency information as a prompt, we
+ then propose a dual prompt block, consisting of a low-frequency prompt
+ modulator (LPM) and a high-frequency prompt modulator (HPM), to handle signals
+ from different bands respectively. Each modulator contains a generation process
+ to incorporate prompting components into the extracted frequency maps, and a
+ modulation part that modifies the prompt feature with the guidance of the
+ decoder features. Experimental results on commonly used benchmarks have
+ demonstrated the favorable performance of our pipeline against SOTA methods on
+ 5 image restoration tasks, including deraining, deraindrop, demoir\'eing,
+ deblurring, and dehazing. The source code and pre-trained models will be
+ available at https://github.com/joshyZhou/FPro.
+
+
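+
+ The frequency decomposition with a gate can be pictured with a small PyTorch
+ sketch (my own simplification, not the released FPro modules): a learned
+ depthwise filter plays the role of a low-pass, a sigmoid gate suppresses
+ uninformative responses, and the residual keeps the high band.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class FreqSplit(nn.Module):
+     def __init__(self, channels):
+         super().__init__()
+         # depthwise conv acts as a dynamically learned low-pass filter
+         self.lowpass = nn.Conv2d(channels, channels, 3, padding=1, groups=channels)
+         self.gate = nn.Sequential(nn.Conv2d(channels, channels, 1), nn.Sigmoid())
+
+     def forward(self, x):
+         low = self.lowpass(x) * self.gate(x)   # gated low-frequency component
+         high = x - low                         # residual keeps high-frequency detail
+         return low, high
+
+ low, high = FreqSplit(32)(torch.randn(1, 32, 64, 64))
+ print(low.shape, high.shape)                   # the two bands sent to the LPM / HPM
+ ```
+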
+
+ comment: 18 pages, 10 figures
+
+
+
+
+
+ + ☆ Long-Tailed Recognition on Binary Networks by Calibrating A Pre-trained + Model + + +
+ Deploying deep models in real-world scenarios entails a number of challenges,
+ including computational efficiency and real-world (e.g., long-tailed) data
+ distributions. We address the combined challenge of learning long-tailed
+ distributions using highly resource-efficient binary neural networks as
+ backbones. Specifically, we propose a calibrate-and-distill framework that uses
+ off-the-shelf pretrained full-precision models, trained on balanced datasets,
+ as teachers for distillation when learning binary networks on long-tailed
+ datasets. To better generalize to various datasets, we further propose a novel
+ adversarial balancing among the terms in the objective function and an
+ efficient multiresolution learning scheme. We conducted the largest empirical
+ study in the literature using 15 datasets, including newly derived long-tailed
+ datasets from existing balanced datasets, and show that our proposed method
+ outperforms prior art by large margins (>14.33% on average).
+
+
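+
+ The distillation step can be sketched generically (this is standard soft-label
+ distillation, not the paper's exact objective; the binary backbone, adversarial
+ balancing, and multiresolution scheme are omitted):
+
+ ```python
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ def distill_step(student, teacher, x, y, T=4.0, alpha=0.9):
+     with torch.no_grad():
+         t_logits = teacher(x)                       # frozen full-precision teacher
+     s_logits = student(x)
+     kd = F.kl_div(F.log_softmax(s_logits / T, dim=1),
+                   F.softmax(t_logits / T, dim=1),
+                   reduction="batchmean") * T * T    # soft-label distillation term
+     ce = F.cross_entropy(s_logits, y)               # hard labels from the long-tailed data
+     return alpha * kd + (1 - alpha) * ce
+
+ # Tiny stand-ins for the binary student and the full-precision teacher.
+ teacher, student = nn.Linear(64, 10), nn.Linear(64, 10)
+ x, y = torch.randn(8, 64), torch.randint(0, 10, (8,))
+ print(distill_step(student, teacher, x, y).item())
+ ```
+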
+
+
+
+
+ + ☆ Look-Around Before You Leap: High-Frequency Injected Transformer for + Image Restoration + + +
+ Transformer-based approaches have achieved superior performance in image +restoration, since they can model long-term dependencies well. However, the +limitation in capturing local information restricts their capacity to remove +degradations. While existing approaches attempt to mitigate this issue by +incorporating convolutional operations, the core component in Transformer, +i.e., self-attention, which serves as a low-pass filter, could unintentionally +dilute or even eliminate the acquired local patterns. In this paper, we propose +HIT, a simple yet effective High-frequency Injected Transformer for image +restoration. Specifically, we design a window-wise injection module (WIM), +which incorporates abundant high-frequency details into the feature map, to +provide reliable references for restoring high-quality images. We also develop +a bidirectional interaction module (BIM) to aggregate features at different +scales using a mutually reinforced paradigm, resulting in spatially and +contextually improved representations. In addition, we introduce a spatial +enhancement unit (SEU) to preserve essential spatial relationships that may be +lost due to the computations carried out across channel dimensions in the BIM. +Extensive experiments on 9 tasks (real noise, real rain streak, raindrop, +motion blur, moir\'e, shadow, snow, haze, and low-light condition) demonstrate +that HIT with linear computational complexity performs favorably against the +state-of-the-art methods. The source code and pre-trained models will be +available at https://github.com/joshyZhou/HIT. + +
+
+ comment: 19 pages, 7 figures +
+
+
+
+
+
+ ☆ HSIMamba: Hyperspectral Imaging Efficient Feature Learning with
+ Bidirectional State Space for Classification
+
+
+
+ Classifying hyperspectral images is a difficult task in remote sensing, due
+ to their complex high-dimensional data. To address this challenge, we propose
+ HSIMamba, a novel framework that uses bidirectional reversed convolutional
+ neural network pathways to extract spectral features more efficiently.
+ Additionally, it incorporates a specialized block for spatial analysis. Our
+ approach combines the operational efficiency of CNNs with the dynamic feature
+ extraction capability of attention mechanisms found in Transformers. However,
+ it avoids the associated high computational demands. HSIMamba is designed to
+ process data bidirectionally, significantly enhancing the extraction of
+ spectral features and integrating them with spatial information for
+ comprehensive analysis. This approach improves classification accuracy beyond
+ current benchmarks and addresses computational inefficiencies encountered with
+ advanced models like Transformers. HSIMamba was tested on three widely
+ recognized datasets (Houston 2013, Indian Pines, and Pavia University) and
+ demonstrated exceptional performance, surpassing existing state-of-the-art
+ models in HSI classification. This method highlights the methodological
+ innovation of HSIMamba and its practical implications, which are particularly
+ valuable in contexts where computational resources are limited. HSIMamba
+ redefines the standards of efficiency and accuracy in HSI classification,
+ thereby enhancing the capabilities of remote sensing applications.
+ Hyperspectral imaging has become a crucial tool for environmental surveillance,
+ agriculture, and other critical areas that require detailed analysis of the
+ Earth's surface. Please see our code in HSIMamba for more details.
+
+
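+
+ A loose sketch of the bidirectional spectral pathway (an assumption-laden toy,
+ not the HSIMamba implementation): the same 1D operator is run over the spectral
+ axis forwards and on the reversed spectrum, and the two passes are fused.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class BiSpectral(nn.Module):
+     def __init__(self, hidden=32):
+         super().__init__()
+         self.fwd = nn.Conv1d(1, hidden, kernel_size=3, padding=1)
+         self.bwd = nn.Conv1d(1, hidden, kernel_size=3, padding=1)
+
+     def forward(self, spectra):                  # spectra: (batch, bands)
+         x = spectra.unsqueeze(1)                 # (batch, 1, bands)
+         f = self.fwd(x)                          # forward pass over the spectrum
+         b = self.bwd(torch.flip(x, dims=[-1]))   # pass over the reversed spectrum
+         return f + torch.flip(b, dims=[-1])      # fuse both directions
+
+ print(BiSpectral()(torch.randn(4, 200)).shape)
+ ```
+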
+
+ comment: 11 pages, 2 figures, 8 tables +
+
+
+
+
+ + ☆ IPoD: Implicit Field Learning with Point Diffusion for Generalizable 3D + Object Reconstruction from Single RGB-D Images CVPR 2024 + + +
+ Generalizable 3D object reconstruction from single-view RGB-D images remains
+ a challenging task, particularly with real-world data. Current state-of-the-art
+ methods develop Transformer-based implicit field learning, necessitating an
+ intensive learning paradigm that requires dense query-supervision uniformly
+ sampled throughout the entire space. We propose a novel approach, IPoD, which
+ harmonizes implicit field learning with point diffusion. This approach treats
+ the query points for implicit field learning as a noisy point cloud for
+ iterative denoising, allowing for their dynamic adaptation to the target object
+ shape. Such adaptive query points harness diffusion learning's capability for
+ coarse shape recovery and also enhance the implicit representation's ability
+ to delineate finer details. Besides, an additional self-conditioning mechanism
+ is designed to use implicit predictions as the guidance of diffusion learning,
+ leading to a cooperative system. Experiments conducted on the CO3D-v2 dataset
+ affirm the superiority of IPoD, achieving a 7.8% improvement in F-score and
+ 28.6% in Chamfer distance over existing methods. The generalizability of IPoD
+ is also demonstrated on the MVImgNet dataset. Our project page is at
+ https://yushuang-wu.github.io/IPoD.
+
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Image-to-Image Matching via Foundation Models: A New Perspective for + Open-Vocabulary Semantic Segmentation CVPR2024 + + +
+ Open-vocabulary semantic segmentation (OVS) aims to segment images of +arbitrary categories specified by class labels or captions. However, most +previous best-performing methods, whether pixel grouping methods or region +recognition methods, suffer from false matches between image features and +category labels. We attribute this to the natural gap between the textual +features and visual features. In this work, we rethink how to mitigate false +matches from the perspective of image-to-image matching and propose a novel +relation-aware intra-modal matching (RIM) framework for OVS based on visual +foundation models. RIM achieves robust region classification by firstly +constructing diverse image-modal reference features and then matching them with +region features based on relation-aware ranking distribution. The proposed RIM +enjoys several merits. First, the intra-modal reference features are better +aligned, circumventing potential ambiguities that may arise in cross-modal +matching. Second, the ranking-based matching process harnesses the structure +information implicit in the inter-class relationships, making it more robust +than comparing individually. Extensive experiments on three benchmarks +demonstrate that RIM outperforms previous state-of-the-art methods by large +margins, obtaining a lead of more than 10% in mIoU on PASCAL VOC benchmark. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ Exploiting Self-Supervised Constraints in Image Super-Resolution ICME 2024 + + +
+ Recent advances in self-supervised learning, predominantly studied in +high-level visual tasks, have been explored in low-level image processing. This +paper introduces a novel self-supervised constraint for single image +super-resolution, termed SSC-SR. SSC-SR uniquely addresses the divergence in +image complexity by employing a dual asymmetric paradigm and a target model +updated via exponential moving average to enhance stability. The proposed +SSC-SR framework works as a plug-and-play paradigm and can be easily applied to +existing SR models. Empirical evaluations reveal that our SSC-SR framework +delivers substantial enhancements on a variety of benchmark datasets, achieving +an average increase of 0.1 dB over EDSR and 0.06 dB over SwinIR. In addition, +extensive ablation studies corroborate the effectiveness of each constituent in +our SSC-SR framework. Codes are available at https://github.com/Aitical/SSCSR. + +
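+
+ The EMA-updated target model mentioned above has a standard form; a minimal
+ sketch (assumed decay value; the dual asymmetric constraint itself is not
+ shown):
+
+ ```python
+ import copy
+ import torch
+
+ @torch.no_grad()
+ def ema_update(online, target, decay=0.999):
+     # target <- decay * target + (1 - decay) * online, parameter by parameter
+     for p_o, p_t in zip(online.parameters(), target.parameters()):
+         p_t.mul_(decay).add_(p_o, alpha=1 - decay)
+
+ online = torch.nn.Linear(8, 8)
+ target = copy.deepcopy(online)   # target starts as a copy and is never backpropagated
+ ema_update(online, target)
+ print(sum(p.sum() for p in target.parameters()).item())
+ ```
+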
+
+ comment: ICME 2024 +
+
+
+
+
+ + ☆ YOLOOC: YOLO-based Open-Class Incremental Object Detection with Novel + Class Discovery ACCV 2022 + + +
+ Owing to its practical relevance, open-world object detection (OWOD) has
+ attracted a lot of attention recently. The challenge is how a model can detect
+ novel classes and then incrementally learn them without forgetting previously
+ known classes. Previous approaches hinge on strongly-supervised or
+ weakly-supervised novel-class data for novel-class detection, which may not
+ apply to real applications. We construct a new benchmark in which novel classes
+ are only encountered at the inference stage, and we propose a new OWOD
+ detector, YOLOOC, based on the YOLO architecture yet designed for the
+ open-class setup. We introduce label smoothing to prevent the detector from
+ over-confidently mapping novel classes to known classes and to discover novel
+ classes. Extensive experiments conducted on our more realistic setup
+ demonstrate the effectiveness of our method for discovering novel classes in
+ our new benchmark.
+
+
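+
+ The label-smoothing loss referred to above is generic; a hedged sketch in
+ plain classification form (not the actual YOLO detection head):
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def smoothed_ce(logits, targets, eps=0.1):
+     # target distribution: (1 - eps) on the true class, eps spread uniformly
+     logp = F.log_softmax(logits, dim=-1)
+     nll = -logp.gather(1, targets[:, None]).squeeze(1)
+     return ((1 - eps) * nll - eps * logp.mean(dim=-1)).mean()
+
+ logits = torch.randn(4, 20)                    # 20 known classes (illustrative)
+ targets = torch.randint(0, 20, (4,))
+ print(smoothed_ce(logits, targets).item())     # keeps known-class confidence bounded
+ ```
+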
+
+ comment: Initially submitted to ACCV 2022 +
+
+
+
+
+ + ☆ Learned Scanpaths Aid Blind Panoramic Video Quality Assessment + + +
+ Panoramic videos have the advantage of providing an immersive and interactive +viewing experience. Nevertheless, their spherical nature gives rise to various +and uncertain user viewing behaviors, which poses significant challenges for +panoramic video quality assessment (PVQA). In this work, we propose an +end-to-end optimized, blind PVQA method with explicit modeling of user viewing +patterns through visual scanpaths. Our method consists of two modules: a +scanpath generator and a quality assessor. The scanpath generator is initially +trained to predict future scanpaths by minimizing their expected code length +and then jointly optimized with the quality assessor for quality prediction. +Our blind PVQA method enables direct quality assessment of panoramic images by +treating them as videos composed of identical frames. Experiments on three +public panoramic image and video quality datasets, encompassing both synthetic +and authentic distortions, validate the superiority of our blind PVQA model +over existing methods. + +
+
+
+
+
+ + ☆ Grid Diffusion Models for Text-to-Video Generation CVPR 2024 + + +
+ Recent advances in diffusion models have significantly improved
+ text-to-image generation. However, generating videos from text is a more
+ challenging task than generating images from text, due to the much larger
+ dataset and higher computational cost required. Most existing video generation
+ methods use either a 3D U-Net architecture that considers the temporal
+ dimension or autoregressive generation. These methods require large datasets
+ and incur much higher computational costs than text-to-image generation. To
+ tackle these challenges, we propose a simple but effective grid diffusion
+ method for text-to-video generation that requires neither a temporal dimension
+ in the architecture nor a large text-video paired dataset. We can generate a
+ high-quality video using a fixed amount of GPU memory regardless of the number
+ of frames by representing the video as a grid image. Additionally, since our
+ method reduces the dimensions of the video to the dimensions of the image,
+ various image-based methods can be applied to videos, such as text-guided video
+ manipulation from image manipulation. Our proposed method outperforms the
+ existing methods in both quantitative and qualitative evaluations,
+ demonstrating the suitability of our model for real-world video generation.
+
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Attention-based Shape-Deformation Networks for Artifact-Free Geometry + Reconstruction of Lumbar Spine from MR Images + + +
+ Lumbar disc degeneration, a progressive structural wear and tear of the
+ lumbar intervertebral disc, is regarded as playing an essential role in low
+ back pain, a significant global health concern. Automated lumbar spine geometry
+ reconstruction from MR images will enable fast measurement of medical
+ parameters to evaluate the lumbar status, in order to determine a suitable
+ treatment. Existing image segmentation-based techniques often generate
+ erroneous segments or unstructured point clouds, unsuitable for medical
+ parameter measurement. In this work, we present TransDeformer: a novel
+ attention-based deep learning approach that reconstructs the contours of the
+ lumbar spine with high spatial accuracy and mesh correspondence across
+ patients, and we also present a variant of TransDeformer for error estimation.
+ Specifically, we devise new attention modules with a new attention formula,
+ which integrates image features and tokenized contour features to predict the
+ displacements of the points on a shape template without the need for image
+ segmentation. The deformed template reveals the lumbar spine geometry in the
+ input image. We develop a multi-stage training strategy to enhance model
+ robustness with respect to template initialization. Experiment results show
+ that our TransDeformer generates artifact-free geometry outputs, and its
+ variant predicts the error of a reconstructed geometry. Our code is available
+ at https://github.com/linchenq/TransDeformer-Mesh.
+
+
+
+
+
+
+ + ☆ Latent Watermark: Inject and Detect Watermarks in Latent Diffusion Space + + +
+ Watermarking is a tool for actively identifying and attributing the images +generated by latent diffusion models. Existing methods face the dilemma of +watermark robustness and image quality. The reason for this dilemma is that +watermark detection is performed in pixel space, implying an intrinsic link +between image quality and watermark robustness. In this paper, we highlight +that an effective solution to the problem is to both inject and detect +watermarks in latent space, and propose Latent Watermark (LW) with a +progressive training strategy. Experiments show that compared to the recently +proposed methods such as StegaStamp, StableSignature, RoSteALS and TreeRing, LW +not only surpasses them in terms of robustness but also offers superior image +quality. When we inject 64-bit messages, LW can achieve an identification +performance close to 100% and an attribution performance above 97% under 9 +single-attack scenarios and one all-attack scenario. Our code will be available +on GitHub. + +
+
+
+
+
+ + ☆ Design as Desired: Utilizing Visual Question Answering for Multimodal + Pre-training + + +
+ Multimodal pre-training, which learns medical visual representations from
+ paired medical reports, has demonstrated its potential in the medical domain.
+ However, many pre-training tasks require extra annotations from clinicians, and
+ most of them fail to explicitly guide the model to learn the desired features
+ of different pathologies. To the best of our knowledge, we are the first to
+ utilize Visual Question Answering (VQA) for multimodal pre-training to guide
+ the framework to focus on targeted pathological features. In this work, we
+ leverage descriptions in medical reports to design multi-granular
+ question-answer pairs associated with different diseases, which assist the
+ framework in pre-training without requiring extra annotations from experts. We
+ also propose a novel pre-training framework with a quasi-textual feature
+ transformer, a module designed to transform visual features into a
+ quasi-textual space closer to the textual domain via a contrastive learning
+ strategy. This narrows the vision-language gap and facilitates modality
+ alignment. Our framework is applied to four downstream tasks: report
+ generation, classification, segmentation, and detection across five datasets.
+ Extensive experiments demonstrate the superiority of our framework compared to
+ other state-of-the-art methods. Our code will be released upon acceptance.
+
+
+
+
+
+
+ + ♻ ☆ AttackNet: Enhancing Biometric Security via Tailored Convolutional + Neural Network Architectures for Liveness Detection + + +
+ Biometric security is the cornerstone of modern identity verification and
+ authentication systems, where the integrity and reliability of biometric
+ samples are of paramount importance. This paper introduces AttackNet, a bespoke
+ Convolutional Neural Network architecture, meticulously designed to combat
+ spoofing threats in biometric systems. Rooted in deep learning methodologies,
+ this model offers a layered defense mechanism, seamlessly transitioning from
+ low-level feature extraction to high-level pattern discernment. Three
+ distinctive architectural phases form the crux of the model, each underpinned
+ by judiciously chosen activation functions, normalization techniques, and
+ dropout layers to ensure robustness and resilience against adversarial attacks.
+ Benchmarking our model across diverse datasets affirms its prowess, showcasing
+ superior performance metrics in comparison to contemporary models. Furthermore,
+ a detailed comparative analysis accentuates the model's efficacy, drawing
+ parallels with prevailing state-of-the-art methodologies. Through iterative
+ refinement and an informed architectural strategy, AttackNet underscores the
+ potential of deep learning in safeguarding the future of biometric security.
+
+
+
+
+
+
+ + ♻ ☆ MAPSeg: Unified Unsupervised Domain Adaptation for Heterogeneous Medical + Image Segmentation Based on 3D Masked Autoencoding and Pseudo-Labeling CVPR 2024 + + +
+ Robust segmentation is critical for deriving quantitative measures from +large-scale, multi-center, and longitudinal medical scans. Manually annotating +medical scans, however, is expensive and labor-intensive and may not always be +available in every domain. Unsupervised domain adaptation (UDA) is a +well-studied technique that alleviates this label-scarcity problem by +leveraging available labels from another domain. In this study, we introduce +Masked Autoencoding and Pseudo-Labeling Segmentation (MAPSeg), a +$\textbf{unified}$ UDA framework with great versatility and superior +performance for heterogeneous and volumetric medical image segmentation. To the +best of our knowledge, this is the first study that systematically reviews and +develops a framework to tackle four different domain shifts in medical image +segmentation. More importantly, MAPSeg is the first framework that can be +applied to $\textbf{centralized}$, $\textbf{federated}$, and +$\textbf{test-time}$ UDA while maintaining comparable performance. We compare +MAPSeg with previous state-of-the-art methods on a private infant brain MRI +dataset and a public cardiac CT-MRI dataset, and MAPSeg outperforms others by a +large margin (10.5 Dice improvement on the private MRI dataset and 5.7 on the +public CT-MRI dataset). MAPSeg poses great practical value and can be applied +to real-world problems. GitHub: https://github.com/XuzheZ/MAPSeg/. + +
+
+ comment: CVPR 2024 camera-ready (8 pages, 3 figures) with the supplemental + materials (5 pages, 4 figures). Xuzhe Zhang and Yuhao Wu are co-first + authors. Andrew F. Laine and Yun Wang are co-senior supervising authors +
+
+
+
+
+ + ♻ ☆ LangNav: Language as a Perceptual Representation for Navigation + + +
+ We explore the use of language as a perceptual representation for +vision-and-language navigation (VLN), with a focus on low-data settings. Our +approach uses off-the-shelf vision systems for image captioning and object +detection to convert an agent's egocentric panoramic view at each time step +into natural language descriptions. We then finetune a pretrained language +model to select an action, based on the current view and the trajectory +history, that would best fulfill the navigation instructions. In contrast to +the standard setup which adapts a pretrained language model to work directly +with continuous visual features from pretrained vision models, our approach +instead uses (discrete) language as the perceptual representation. We explore +several use cases of our language-based navigation (LangNav) approach on the +R2R VLN benchmark: generating synthetic trajectories from a prompted language +model (GPT-4) with which to finetune a smaller language model; domain transfer +where we transfer a policy learned on one simulated environment (ALFRED) to +another (more realistic) environment (R2R); and combining both vision- and +language-based representations for VLN. Our approach is found to improve upon +baselines that rely on visual features in settings where only a few expert +trajectories (10-100) are available, demonstrating the potential of language as +a perceptual representation for navigation. + +
+
+
+
+
+ + ♻ ☆ U-Net v2: Rethinking the Skip Connections of U-Net for Medical Image + Segmentation + + +
+ In this paper, we introduce U-Net v2, a new robust and efficient U-Net
+ variant for medical image segmentation. It aims to augment the infusion of
+ semantic information into low-level features while simultaneously refining
+ high-level features with finer details. For an input image, we begin by
+ extracting multi-level features with a deep neural network encoder. Next, we
+ enhance the feature map of each level by infusing semantic information from
+ higher-level features and integrating finer details from lower-level features
+ through the Hadamard product. Our novel skip connections empower features of
+ all the levels with enriched semantic characteristics and intricate details.
+ The improved features are subsequently transmitted to the decoder for further
+ processing and segmentation. Our method can be seamlessly integrated into any
+ Encoder-Decoder network. We evaluate our method on several public medical image
+ segmentation datasets for skin lesion segmentation and polyp segmentation, and
+ the experimental results demonstrate that our new method improves segmentation
+ accuracy over state-of-the-art methods while preserving memory and
+ computational efficiency. Code is available at:
+ https://github.com/yaoppeng/U-Net_v2
+
+
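+
+ A much-simplified reading of the enriched skip connection (my own sketch, not
+ the released U-Net v2 code): each level's map is multiplied, Hadamard style,
+ with the other levels resized to its resolution.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def enrich(feats):
+     """feats: list of (B, C, H_i, W_i) maps, same C, highest resolution first."""
+     out = []
+     for i, f in enumerate(feats):
+         fused = f
+         for j, g in enumerate(feats):
+             if j == i:
+                 continue
+             g = F.interpolate(g, size=f.shape[-2:], mode="bilinear", align_corners=False)
+             fused = fused * g          # Hadamard product injects the other level's cues
+         out.append(fused)
+     return out
+
+ feats = [torch.randn(1, 16, 64 >> k, 64 >> k) for k in range(3)]
+ print([f.shape for f in enrich(feats)])
+ ```
+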
+
+
+
+
+ + ♻ ☆ Recursive Joint Cross-Modal Attention for Multimodal Fusion in + Dimensional Emotion Recognition + + +
+ Though multimodal emotion recognition has achieved significant progress over +recent years, the potential of rich synergic relationships across the +modalities is not fully exploited. In this paper, we introduce Recursive Joint +Cross-Modal Attention (RJCMA) to effectively capture both intra-and inter-modal +relationships across audio, visual and text modalities for dimensional emotion +recognition. In particular, we compute the attention weights based on +cross-correlation between the joint audio-visual-text feature representations +and the feature representations of individual modalities to simultaneously +capture intra- and inter-modal relationships across the modalities. The +attended features of the individual modalities are again fed as input to the +fusion model in a recursive mechanism to obtain more refined feature +representations. We have also explored Temporal Convolutional Networks (TCNs) +to improve the temporal modeling of the feature representations of individual +modalities. Extensive experiments are conducted to evaluate the performance of +the proposed fusion model on the challenging Affwild2 dataset. By effectively +capturing the synergic intra- and inter-modal relationships across audio, +visual and text modalities, the proposed fusion model achieves a Concordance +Correlation Coefficient (CCC) of 0.585 (0.542) and 0.659 (0.619) for valence +and arousal respectively on the validation set (test set). This shows a +significant improvement over the baseline of 0.24 (0.211) and 0.20 (0.191) for +valence and arousal respectively on the validation set (test set) of the +valence-arousal challenge of 6th Affective Behavior Analysis in-the-Wild (ABAW) +competition. + +
+
+
+
+
+ + ♻ ☆ Unifying Top-down and Bottom-up Scanpath Prediction Using Transformers CVPR 2024 + + +
+ Most models of visual attention aim at predicting either top-down or +bottom-up control, as studied using different visual search and free-viewing +tasks. In this paper we propose the Human Attention Transformer (HAT), a single +model that predicts both forms of attention control. HAT uses a novel +transformer-based architecture and a simplified foveated retina that +collectively create a spatio-temporal awareness akin to the dynamic visual +working memory of humans. HAT not only establishes a new state-of-the-art in +predicting the scanpath of fixations made during target-present and +target-absent visual search and ``taskless'' free viewing, but also makes human +gaze behavior interpretable. Unlike previous methods that rely on a coarse grid +of fixation cells and experience information loss due to fixation +discretization, HAT features a sequential dense prediction architecture and +outputs a dense heatmap for each fixation, thus avoiding discretizing +fixations. HAT sets a new standard in computational attention, which emphasizes +effectiveness, generality, and interpretability. HAT's demonstrated scope and +applicability will likely inspire the development of new attention models that +can better predict human behavior in various attention-demanding scenarios. +Code is available at https://github.com/cvlab-stonybrook/HAT. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Mamba-UNet: UNet-Like Pure Visual Mamba for Medical Image Segmentation + + +
+ In recent advancements in medical image analysis, Convolutional Neural +Networks (CNN) and Vision Transformers (ViT) have set significant benchmarks. +While the former excels in capturing local features through its convolution +operations, the latter achieves remarkable global context understanding by +leveraging self-attention mechanisms. However, both architectures exhibit +limitations in efficiently modeling long-range dependencies within medical +images, which is a critical aspect for precise segmentation. Inspired by the +Mamba architecture, known for its proficiency in handling long sequences and +global contextual information with enhanced computational efficiency as a State +Space Model (SSM), we propose Mamba-UNet, a novel architecture that synergizes +the U-Net in medical image segmentation with Mamba's capability. Mamba-UNet +adopts a pure Visual Mamba (VMamba)-based encoder-decoder structure, infused +with skip connections to preserve spatial information across different scales +of the network. This design facilitates a comprehensive feature learning +process, capturing intricate details and broader semantic contexts within +medical images. We introduce a novel integration mechanism within the VMamba +blocks to ensure seamless connectivity and information flow between the encoder +and decoder paths, enhancing the segmentation performance. We conducted +experiments on publicly available ACDC MRI Cardiac segmentation dataset, and +Synapse CT Abdomen segmentation dataset. The results show that Mamba-UNet +outperforms several types of UNet in medical image segmentation under the same +hyper-parameter setting. The source code and baseline implementations are +available. + +
+
+
+
+
+ + ♻ ☆ Towards minimizing efforts for Morphing Attacks -- Deep embeddings for + morphing pair selection and improved Morphing Attack Detection + + +
+ Face Morphing Attacks pose a threat to the security of identity documents,
+ especially with respect to a subsequent access control process, because they
+ enable both individuals involved to exploit the same document. In this study,
+ face embeddings serve two purposes: pre-selecting images for large-scale
+ Morphing Attack generation and detecting potential Morphing Attacks. We build
+ upon previous embedding studies in both use cases using the MagFace model. For
+ the first objective, we employ a pre-selection algorithm that pairs
+ individuals based on face embedding similarity. We quantify the attack
+ potential of differently morphed face images to compare the usability of
+ pre-selection in automatically generating numerous successful Morphing Attacks.
+ Regarding the second objective, we compare embeddings from two state-of-the-art
+ face recognition systems in terms of their ability to detect Morphing Attacks.
+ Our findings demonstrate that ArcFace and MagFace provide valuable face
+ embeddings for image pre-selection. Both open-source and COTS face recognition
+ systems are susceptible to generated attacks, particularly when pre-selection
+ is based on embeddings rather than random pairing which was only constrained by
+ soft biometrics. More accurate face recognition systems exhibit greater
+ vulnerability to attacks, with COTS systems being the most susceptible.
+ Additionally, MagFace embeddings serve as a robust alternative for detecting
+ morphed face images compared to the previously used ArcFace embeddings. The
+ results endorse the advantages of face embeddings in more effective image
+ pre-selection for face morphing and accurate detection of morphed face images.
+ This is supported by extensive analysis of various designed attacks. The
+ MagFace model proves to be a powerful alternative to the commonly used ArcFace
+ model for both objectives, pre-selection and attack detection.
+
+
+
+
+
+
+ + ♻ ☆ Total-Decom: Decomposed 3D Scene Reconstruction with Minimal Interaction CVPR 2024 + + +
+ Scene reconstruction from multi-view images is a fundamental problem in +computer vision and graphics. Recent neural implicit surface reconstruction +methods have achieved high-quality results; however, editing and manipulating +the 3D geometry of reconstructed scenes remains challenging due to the absence +of naturally decomposed object entities and complex object/background +compositions. In this paper, we present Total-Decom, a novel method for +decomposed 3D reconstruction with minimal human interaction. Our approach +seamlessly integrates the Segment Anything Model (SAM) with hybrid +implicit-explicit neural surface representations and a mesh-based +region-growing technique for accurate 3D object decomposition. Total-Decom +requires minimal human annotations while providing users with real-time control +over the granularity and quality of decomposition. We extensively evaluate our +method on benchmark datasets and demonstrate its potential for downstream +applications, such as animation and scene editing. The code is available at +https://github.com/CVMI-Lab/Total-Decom.git. + +
+
+ comment: 8 pages, 7 figures, accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ RS-DPO: A Hybrid Rejection Sampling and Direct Preference Optimization + Method for Alignment of Large Language Models + + +
+ Reinforcement learning from human feedback (RLHF) has been extensively
+ employed to align large language models with user intent. However, proximal
+ policy optimization (PPO) based RLHF is occasionally unstable, requiring
+ significant hyperparameter finetuning, and is computationally expensive when
+ maximizing the estimated reward during alignment. Recently, direct preference
+ optimization (DPO) has been proposed to address those challenges. However, DPO
+ relies on contrastive responses generated from a human annotator and an
+ alternative LLM, instead of the policy model, limiting the effectiveness of
+ RLHF. In this paper, we address both challenges by systematically combining
+ rejection sampling (RS) and DPO. Our proposed method, RS-DPO, initiates with
+ the development of a supervised fine-tuned policy model (SFT). A varied set of
+ k responses per prompt is sampled directly from the SFT model. RS-DPO
+ identifies pairs of contrastive samples based on their reward distribution.
+ Finally, we apply DPO with the contrastive samples to align the model to human
+ preference. Our experiments indicate that our proposed method effectively
+ fine-tunes LLMs in limited-resource environments, leading to improved alignment
+ with user intent. Furthermore, it outperforms existing methods, including RS,
+ PPO, and DPO.
+
+
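+
+ The pair-selection step lends itself to a small sketch (a simplified
+ assumption: keep chosen/rejected pairs whose reward gap exceeds a threshold;
+ the SFT sampling and the DPO update itself are not shown):
+
+ ```python
+ from itertools import combinations
+
+ def select_pairs(responses, rewards, gap=1.0):
+     """responses: k strings sampled from the SFT model; rewards: matching scores."""
+     pairs = []
+     for i, j in combinations(range(len(responses)), 2):
+         if abs(rewards[i] - rewards[j]) >= gap:
+             hi, lo = (i, j) if rewards[i] > rewards[j] else (j, i)
+             pairs.append({"chosen": responses[hi], "rejected": responses[lo]})
+     return pairs
+
+ print(select_pairs(["resp_a", "resp_b", "resp_c"], [0.2, 1.9, 0.5]))
+ ```
+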
+
+ comment: 16 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ An Examination of the Compositionality of Large Generative + Vision-Language Models + + +
+ With the success of Large Language Models (LLMs), many Generative +Vision-Language Models (GVLMs) have been constructed via multimodal instruction +tuning. However, the performance of GVLMs in multimodal compositional reasoning +remains under-explored. In this paper, we examine both the evaluation metrics +(VisualGPTScore, etc.) and current benchmarks for evaluating the +compositionality of GVLMs. We identify the syntactical bias in current +benchmarks, which is exploited by the linguistic capability of GVLMs. The bias +renders VisualGPTScore an insufficient metric for assessing GVLMs. To combat +this, we first introduce a SyntaxBias Score, leveraging LLMs to quantify such +bias for mitigation. A challenging new task is subsequently added to evaluate +the robustness of GVLMs against inherent inclination toward syntactical +correctness. Using the bias-mitigated datasets and the new task, we propose a +novel benchmark, namely SyntActically DE-biased benchmark (SADE). Our study +provides an unbiased benchmark for the compositionality of GVLMs, facilitating +future research in this direction (Code and dataset are available at +https://github.com/TeleeMa/SADE). + +
+
+
+
+
+ + ♻ ☆ Scale Alone Does not Improve Mechanistic Interpretability in Vision + Models NeurIPS 2023 + + +
+ In light of the recent widespread adoption of AI systems, understanding the +internal information processing of neural networks has become increasingly +critical. Most recently, machine vision has seen remarkable progress by scaling +neural networks to unprecedented levels in dataset and model size. We here ask +whether this extraordinary increase in scale also positively impacts the field +of mechanistic interpretability. In other words, has our understanding of the +inner workings of scaled neural networks improved as well? We use a +psychophysical paradigm to quantify one form of mechanistic interpretability +for a diverse suite of nine models and find no scaling effect for +interpretability - neither for model nor dataset size. Specifically, none of +the investigated state-of-the-art models are easier to interpret than the +GoogLeNet model from almost a decade ago. Latest-generation vision models +appear even less interpretable than older architectures, hinting at a +regression rather than improvement, with modern models sacrificing +interpretability for accuracy. These results highlight the need for models +explicitly designed to be mechanistically interpretable and the need for more +helpful interpretability methods to increase our understanding of networks at +an atomic level. We release a dataset containing more than 130'000 human +responses from our psychophysical evaluation of 767 units across nine models. +This dataset facilitates research on automated instead of human-based +interpretability evaluations, which can ultimately be leveraged to directly +optimize the mechanistic interpretability of models. + +
+
+ comment: Spotlight at NeurIPS 2023. The first two authors contributed equally. + Code available at https://brendel-group.github.io/imi/ +
+
+
+
+
+ + ♻ ☆ ReGround: Improving Textual and Spatial Grounding at No Cost + + +
+ When an image generation process is guided by both a text prompt and spatial +cues, such as a set of bounding boxes, do these elements work in harmony, or +does one dominate the other? Our analysis of a pretrained image diffusion model +that integrates gated self-attention into the U-Net reveals that spatial +grounding often outweighs textual grounding due to the sequential flow from +gated self-attention to cross-attention. We demonstrate that such bias can be +significantly mitigated without sacrificing accuracy in either grounding by +simply rewiring the network architecture, changing from sequential to parallel +for gated self-attention and cross-attention. This surprisingly simple yet +effective solution does not require any fine-tuning of the network but +significantly reduces the trade-off between the two groundings. Our experiments +demonstrate significant improvements from the original GLIGEN to the rewired +version in the trade-off between textual grounding and spatial grounding. + +
+
+ comment: Project page: https://re-ground.github.io/ +
+
+
+
+
+ + ♻ ☆ Rapid post-disaster infrastructure damage characterisation enabled by + remote sensing and deep learning technologies -- a tiered approach + + +
+ Critical infrastructure is systematically targeted during wars and extensive
+ natural disasters because critical infrastructure is vital for enabling
+ connectivity and transportation of people and goods, and hence, underpins
+ national and international economic growth. Mass destruction of transport
+ assets, in conjunction with minimal or no accessibility in the wake of natural
+ and anthropogenic disasters, prevents us from delivering rapid recovery and
+ adaptation. A solution to this challenge is to use technology that enables
+ stand-off observations. Nevertheless, no methods exist for the integrated
+ characterisation of damage at multiple scales, i.e. regional, asset, and
+ structural scales, while there is no systematic correlation between
+ infrastructure damage assessments across these scales. We propose a methodology
+ based on an integrated multi-scale tiered approach to fill this capability gap.
+ In doing so, we demonstrate how damage characterisation can be enabled by
+ fit-for-purpose digital technologies. Next, the methodology is applied to and
+ validated on a case study in Ukraine that includes 17 bridges, all damaged by
+ targeted human interventions. From macro to micro, we deploy technology to
+ integrate assessments at scale, drawing on Sentinel-1 SAR images, crowdsourced
+ information, high-resolution images, and deep learning to characterise
+ infrastructure damage. For the first time, the interferometric coherence
+ difference and semantic segmentation of images were deployed to improve the
+ reliability of damage characterisations at different scales, i.e. regional,
+ infrastructure asset and component, with the aim of enhancing the damage
+ characterisation accuracy. This integrated approach accelerates
+ decision-making, and therefore, facilitates more efficient restoration and
+ adaptation efforts, ultimately building resilience into our infrastructure.
+
+
+
+ comment: Main text (33 pages,15 figures); Supplementary materials (19 pages) +
+
+
+
+
+ + ♻ ☆ Auto MC-Reward: Automated Dense Reward Design with Large Language Models + for Minecraft CVPR2024 + + +
+ Many reinforcement learning environments (e.g., Minecraft) provide only
+ sparse rewards that indicate task completion or failure with binary values. The
+ challenge of exploration efficiency in such environments makes it difficult for
+ reinforcement-learning-based agents to learn complex tasks. To address this,
+ this paper introduces an advanced learning system, named Auto MC-Reward, that
+ leverages Large Language Models (LLMs) to automatically design dense reward
+ functions, thereby enhancing the learning efficiency. Auto MC-Reward consists
+ of three important components: Reward Designer, Reward Critic, and Trajectory
+ Analyzer. Given the environment information and task descriptions, the Reward
+ Designer first designs the reward function by coding an executable Python
+ function with predefined observation inputs. Then, our Reward Critic is
+ responsible for verifying the code, checking whether the code is
+ self-consistent and free of syntax and semantic errors. Further, the Trajectory
+ Analyzer summarizes possible failure causes and provides refinement suggestions
+ according to collected trajectories. In the next round, the Reward Designer
+ further refines and iterates the dense reward function based on feedback.
+ Experiments demonstrate a significant improvement in the success rate and
+ learning efficiency of our agents in complex tasks in Minecraft, such as
+ obtaining diamond while efficiently avoiding lava, and efficiently exploring
+ for trees and animals that are sparse in the plains biome.
+
+
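+
+ For flavour, an executable dense reward of the kind the Reward Designer is
+ said to emit might look like the following (the observation keys are
+ hypothetical, not the actual Auto MC-Reward interface):
+
+ ```python
+ def reward(obs: dict) -> float:
+     r = 0.0
+     if obs.get("found_diamond", False):
+         r += 10.0                               # keep the sparse task reward
+     dist = obs.get("distance_to_lava", float("inf"))
+     if dist < 3.0:
+         r -= 3.0 - dist                         # dense shaping: penalise approaching lava
+     r += 0.1 * obs.get("depth_progress", 0.0)   # dense shaping: encourage digging deeper
+     return r
+
+ print(reward({"distance_to_lava": 1.5, "depth_progress": 4.0}))
+ ```
+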
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Video-Based Autism Detection with Deep Learning + + +
+ Individuals with Autism Spectrum Disorder (ASD) often experience challenges +in health, communication, and sensory processing; therefore, early diagnosis is +necessary for proper treatment and care. In this work, we consider the problem +of detecting or classifying ASD children to aid medical professionals in early +diagnosis. We develop a deep learning model that analyzes video clips of +children reacting to sensory stimuli, with the intent of capturing key +differences in reactions and behavior between ASD and non-ASD participants. +Unlike many recent studies in ASD classification with MRI data, which require +expensive specialized equipment, our method utilizes a powerful but relatively +affordable GPU, a standard computer setup, and a video camera for inference. +Results show that our model effectively generalizes and understands key +differences in the distinct movements of the children. It is noteworthy that +our model exhibits successful classification performance despite the limited +amount of data for a deep learning problem and limited temporal information +available for learning, even with the motion artifacts. + +
+
+ comment: Poster Abstract. Accepted into 2024 IEEE Green Technologies + Conference +
+
+
+
+
+ + ♻ ☆ SiTH: Single-view Textured Human Reconstruction with Image-Conditioned + Diffusion CVPR 2024 + + +
+ A long-standing goal of 3D human reconstruction is to create lifelike and +fully detailed 3D humans from single-view images. The main challenge lies in +inferring unknown body shapes, appearances, and clothing details in areas not +visible in the images. To address this, we propose SiTH, a novel pipeline that +uniquely integrates an image-conditioned diffusion model into a 3D mesh +reconstruction workflow. At the core of our method lies the decomposition of +the challenging single-view reconstruction problem into generative +hallucination and reconstruction subproblems. For the former, we employ a +powerful generative diffusion model to hallucinate unseen back-view appearance +based on the input images. For the latter, we leverage skinned body meshes as +guidance to recover full-body texture meshes from the input and back-view +images. SiTH requires as few as 500 3D human scans for training while +maintaining its generality and robustness to diverse images. Extensive +evaluations on two 3D human benchmarks, including our newly created one, +highlighted our method's superior accuracy and perceptual quality in 3D +textured human reconstruction. Our code and evaluation benchmark are available +at https://ait.ethz.ch/sith + +
+
+ comment: 23 pages, 23 figures, CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Gaussian Head Avatar: Ultra High-fidelity Head Avatar via Dynamic + Gaussians + + +
+ Creating high-fidelity 3D head avatars has always been a research hotspot, +but there remains a great challenge under lightweight sparse view setups. In +this paper, we propose Gaussian Head Avatar represented by controllable 3D +Gaussians for high-fidelity head avatar modeling. We optimize the neutral 3D +Gaussians and a fully learned MLP-based deformation field to capture complex +expressions. The two parts benefit each other, thereby our method can model +fine-grained dynamic details while ensuring expression accuracy. Furthermore, +we devise a well-designed geometry-guided initialization strategy based on +implicit SDF and Deep Marching Tetrahedra for the stability and convergence of +the training procedure. Experiments show our approach outperforms other +state-of-the-art sparse-view methods, achieving ultra high-fidelity rendering +quality at 2K resolution even under exaggerated expressions. + +
+
+ comment: Projectpage: https://yuelangx.github.io/gaussianheadavatar, Code: + https://github.com/YuelangX/Gaussian-Head-Avatar +
+
+
+
+
+ + ♻ ☆ LatentEditor: Text Driven Local Editing of 3D Scenes + + +
+ While neural fields have made significant strides in view synthesis and scene +reconstruction, editing them poses a formidable challenge due to their implicit +encoding of geometry and texture information from multi-view inputs. In this +paper, we introduce \textsc{LatentEditor}, an innovative framework designed to +empower users with the ability to perform precise and locally controlled +editing of neural fields using text prompts. Leveraging denoising diffusion +models, we successfully embed real-world scenes into the latent space, +resulting in a faster and more adaptable NeRF backbone for editing compared to +traditional methods. To enhance editing precision, we introduce a delta score +to calculate the 2D mask in the latent space that serves as a guide for local +modifications while preserving irrelevant regions. Our novel pixel-level +scoring approach harnesses the power of InstructPix2Pix (IP2P) to discern the +disparity between IP2P conditional and unconditional noise predictions in the +latent space. The edited latents conditioned on the 2D masks are then +iteratively updated in the training set to achieve 3D local editing. Our +approach achieves faster editing speeds and superior output quality compared to +existing 3D editing models, bridging the gap between textual instructions and +high-quality 3D scene editing in latent space. We show the superiority of our +approach on four benchmark 3D datasets, LLFF, IN2N, NeRFStudio and NeRF-Art. + +
+
+ comment: Project Page: https://latenteditor.github.io/ +
+
+
+
+
+ + ♻ ☆ Robust Active Speaker Detection in Noisy Environments + + +
+ This paper addresses the issue of active speaker detection (ASD) in noisy +environments and formulates a robust active speaker detection (rASD) problem. +Existing ASD approaches leverage both audio and visual modalities, but +non-speech sounds in the surrounding environment can negatively impact +performance. To overcome this, we propose a novel framework that utilizes +audio-visual speech separation as guidance to learn noise-free audio features. +These features are then utilized in an ASD model, and both tasks are jointly +optimized in an end-to-end framework. Our proposed framework mitigates residual +noise and audio quality reduction issues that can occur in a naive cascaded +two-stage framework that directly uses separated speech for ASD, and enables +the two tasks to be optimized simultaneously. To further enhance the robustness +of the audio features and handle inherent speech noises, we propose a dynamic +weighted loss approach to train the speech separator. We also collected a +real-world noise audio dataset to facilitate investigations. Experiments +demonstrate that non-speech audio noises significantly impact ASD models, and +our proposed approach improves ASD performance in noisy environments. The +framework is general and can be applied to different ASD approaches to improve +their robustness. Our code, models, and data will be released. + +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ WaveMix: A Resource-efficient Neural Network for Image Analysis + + +
+ We propose a novel neural architecture for computer vision -- WaveMix -- that +is resource-efficient and yet generalizable and scalable. While using fewer +trainable parameters, GPU RAM, and computations, WaveMix networks achieve +comparable or better accuracy than the state-of-the-art convolutional neural +networks, vision transformers, and token mixers for several tasks. This +efficiency can translate to savings in time, cost, and energy. To achieve these +gains we used multi-level two-dimensional discrete wavelet transform (2D-DWT) +in WaveMix blocks, which has the following advantages: (1) It reorganizes +spatial information based on three strong image priors -- scale-invariance, +shift-invariance, and sparseness of edges -- (2) in a lossless manner without +adding parameters, (3) while also reducing the spatial sizes of feature maps, +which reduces the memory and time required for forward and backward passes, and +(4) expanding the receptive field faster than convolutions do. The whole +architecture is a stack of self-similar and resolution-preserving WaveMix +blocks, which allows architectural flexibility for various tasks and levels of +resource availability. WaveMix establishes new benchmarks for segmentation on +Cityscapes; and for classification on Galaxy 10 DECals, Places-365, five EMNIST +datasets, and iNAT-mini and performs competitively on other benchmarks. Our +code and trained models are publicly available. + +
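+
+ The parameter-free, lossless halving of spatial size comes from the DWT
+ itself; below is a single-level 2D Haar transform written from its standard
+ definition (illustration only; WaveMix uses multi-level 2D-DWT inside its
+ token-mixing blocks):
+
+ ```python
+ import torch
+
+ def haar_dwt2(x):
+     """x: (B, C, H, W) with even H, W -> LL, LH, HL, HH, each (B, C, H/2, W/2)."""
+     a = x[..., 0::2, 0::2]
+     b = x[..., 0::2, 1::2]
+     c = x[..., 1::2, 0::2]
+     d = x[..., 1::2, 1::2]
+     ll = (a + b + c + d) / 2      # low-pass subband
+     lh = (a + b - c - d) / 2      # detail subbands
+     hl = (a - b + c - d) / 2
+     hh = (a - b - c + d) / 2
+     return ll, lh, hl, hh
+
+ print([t.shape for t in haar_dwt2(torch.randn(1, 3, 8, 8))])
+ ```
+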
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Bidirectional Consistency Models + + +
+ Diffusion models (DMs) are capable of generating remarkably high-quality +samples by iteratively denoising a random vector, a process that corresponds to +moving along the probability flow ordinary differential equation (PF ODE). +Interestingly, DMs can also invert an input image to noise by moving backward +along the PF ODE, a key operation for downstream tasks such as interpolation +and image editing. However, the iterative nature of this process restricts its +speed, hindering its broader application. Recently, Consistency Models (CMs) +have emerged to address this challenge by approximating the integral of the PF +ODE, largely reducing the number of iterations. Yet, the absence of an explicit +ODE solver complicates the inversion process. To resolve this, we introduce the +Bidirectional Consistency Model (BCM), which learns a single neural network +that enables both forward and backward traversal along the PF ODE, efficiently +unifying generation and inversion tasks within one framework. Notably, our +proposed method enables one-step generation and inversion while also allowing +the use of additional steps to enhance generation quality or reduce +reconstruction error. Furthermore, by leveraging our model's bidirectional +consistency, we introduce a sampling strategy that can enhance FID while +preserving the generated image content. We further showcase our model's +capabilities in several downstream tasks, such as interpolation and inpainting, +and present demonstrations of potential applications, including blind +restoration of compressed images and defending black-box adversarial attacks. + +
+
+ comment: 40 pages, 25 figures +
+
+
+
+
+ + ♻ ☆ IllusionVQA: A Challenging Optical Illusion Dataset for Vision Language + Models + + +
+ The advent of Vision Language Models (VLM) has allowed researchers to +investigate the visual understanding of a neural network using natural +language. Beyond object classification and detection, VLMs are capable of +visual comprehension and common-sense reasoning. This naturally led to the +question: How do VLMs respond when the image itself is inherently unreasonable? +To this end, we present IllusionVQA: a diverse dataset of challenging optical +illusions and hard-to-interpret scenes to test the capability of VLMs in two +distinct multiple-choice VQA tasks - comprehension and soft localization. +GPT4V, the best-performing VLM, achieves 62.99% accuracy (4-shot) on the +comprehension task and 49.7% on the localization task (4-shot and +Chain-of-Thought). Human evaluation reveals that humans achieve 91.03% and 100% +accuracy in comprehension and localization. We discover that In-Context +Learning (ICL) and Chain-of-Thought reasoning substantially degrade the +performance of GeminiPro on the localization task. Tangentially, we discover a +potential weakness in the ICL capabilities of VLMs: they fail to locate optical +illusions even when the correct answer is in the context window as a few-shot +example. + +
+
+
+
+
+ + ♻ ☆ Synthesize, Diagnose, and Optimize: Towards Fine-Grained Vision-Language + Understanding CVPR 2024 + + +
+ Vision language models (VLM) have demonstrated remarkable performance across +various downstream tasks. However, understanding fine-grained visual-linguistic +concepts, such as attributes and inter-object relationships, remains a +significant challenge. While several benchmarks aim to evaluate VLMs in finer +granularity, their primary focus remains on the linguistic aspect, neglecting +the visual dimension. Here, we highlight the importance of evaluating VLMs from +both a textual and visual perspective. We introduce a progressive pipeline to +synthesize images that vary in a specific attribute while ensuring consistency +in all other aspects. Utilizing this data engine, we carefully design a +benchmark, SPEC, to diagnose the comprehension of object size, position, +existence, and count. Subsequently, we conduct a thorough evaluation of four +leading VLMs on SPEC. Surprisingly, their performance is close to random guess, +revealing significant limitations. With this in mind, we propose a simple yet +effective approach to optimize VLMs in fine-grained understanding, achieving +significant improvements on SPEC without compromising the zero-shot +performance. Results on two additional fine-grained benchmarks also show +consistent improvements, further validating the transferability of our +approach. Code and data are available at https://github.com/wjpoom/SPEC. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ NICP: Neural ICP for 3D Human Registration at Scale + + +
+ Aligning a template to 3D human point clouds is a long-standing problem +crucial for tasks like animation, reconstruction, and enabling supervised +learning pipelines. Recent data-driven methods leverage predicted surface +correspondences; however, they are not robust to varied poses, identities, or +noise. In contrast, industrial solutions often rely on expensive manual +annotations or multi-view capturing systems. Recently, neural fields have shown +promising results. Still, their purely data-driven and extrinsic nature does +not incorporate any guidance toward the target surface, often resulting in a +trivial misalignment of the template registration. Currently, no method can be +considered the standard for 3D Human registration, limiting the scalability of +downstream applications. In this work, we propose NSR, a pipeline that, for the +first time, generalizes and scales across thousands of shapes and more than ten +different data sources. Our essential contribution is NICP, an ICP-style +self-supervised task tailored to neural fields. NICP takes a few seconds, is +self-supervised, and works out of the box on pre-trained neural fields. We +combine it with a localized Neural Field trained on a large MoCap dataset. NSR +achieves the state of the art over public benchmarks, and the release of its +code and checkpoints will provide the community with a powerful tool useful for +many downstream tasks like dataset alignments, cleaning, or asset animation. + +
+
+
+
+
+ + ♻ ☆ As-Plausible-As-Possible: Plausibility-Aware Mesh Deformation Using 2D + Diffusion Priors + + +
+ We present As-Plausible-as-Possible (APAP), a mesh deformation technique that +leverages 2D diffusion priors to preserve the plausibility of a mesh under +user-controlled deformation. Our framework uses per-face Jacobians to represent +mesh deformations, where mesh vertex coordinates are computed via a +differentiable Poisson solve. The deformed mesh is rendered, and the resulting +2D image is used in the Score Distillation Sampling (SDS) process, which +enables extracting meaningful plausibility priors from a pretrained 2D +diffusion model. To better preserve the identity of the edited mesh, we +fine-tune our 2D diffusion model with LoRA. Gradients extracted by SDS and a +user-prescribed handle displacement are then backpropagated to the per-face +Jacobians, and we use iterative gradient descent to compute the final +deformation that balances between the user edit and the output plausibility. We +evaluate our method with 2D and 3D meshes and demonstrate qualitative and +quantitative improvements when using plausibility priors over +geometry-preservation or distortion-minimization priors used by previous +techniques. Our project page is at: https://as-plausible-as-possible.github.io/ +
+
+ comment: Project page: https://as-plausible-as-possible.github.io/ +
+
+
+
+
+ + ♻ ☆ Fun with Flags: Robust Principal Directions via Flag Manifolds + + +
+ Principal component analysis (PCA), along with its extensions to manifolds +and outlier-contaminated data, has been indispensable in computer vision and +machine learning. In this work, we present a unifying formalism for PCA and its +variants, and introduce a framework based on the flags of linear subspaces, i.e., +a hierarchy of nested linear subspaces of increasing dimension, which not only +allows for a common implementation but also yields novel variants, not explored +previously. We begin by generalizing traditional PCA methods that either +maximize variance or minimize reconstruction error. We expand these +interpretations to develop a wide array of new dimensionality reduction +algorithms by accounting for outliers and the data manifold. To devise a common +computational approach, we recast robust and dual forms of PCA as optimization +problems on flag manifolds. We then integrate tangent space approximations of +principal geodesic analysis (tangent-PCA) into this flag-based framework, +creating novel robust and dual geodesic PCA variations. The remarkable +flexibility offered by the 'flagification' introduced here enables even more +algorithmic variants identified by specific flag types. Last but not least, we +propose an effective convergent solver for these flag-formulations employing +the Stiefel manifold. Our empirical results on both real-world and synthetic +scenarios demonstrate the superiority of our novel algorithms, especially in +terms of robustness to outliers on manifolds. +
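For readers who want to ground the variance-maximization view of PCA that the abstract generalizes, here is a minimal textbook NumPy sketch; it covers plain PCA only and does not reproduce the flag-manifold solver.

```python
# Classical PCA via eigendecomposition of the sample covariance
# (the variance-maximization formulation); synthetic data for illustration.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5)) @ rng.normal(size=(5, 5))   # 200 toy samples
Xc = X - X.mean(axis=0)                                    # center the data

cov = Xc.T @ Xc / (len(Xc) - 1)
eigvals, eigvecs = np.linalg.eigh(cov)
order = np.argsort(eigvals)[::-1]                          # sort by explained variance
W = eigvecs[:, order[:2]]                                  # top-2 principal directions

# The nested subspaces span(W[:, :1]) inside span(W[:, :2]) form a flag,
# the object the paper optimizes over directly on the flag manifold.
Z = Xc @ W                                                 # 2-D projection
print(Z.shape, eigvals[order[:2]])
```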
+
+
+
+
+ + ♻ ☆ Open Vocabulary Semantic Scene Sketch Understanding + + +
+ We study the underexplored but fundamental vision problem of machine +understanding of abstract freehand scene sketches. We introduce a sketch +encoder that results in a semantically-aware feature space, which we evaluate by +testing its performance on a semantic sketch segmentation task. To train our +model, we rely only on the availability of bitmap sketches with their brief +captions and do not require any pixel-level annotations. To obtain +generalization to a large set of sketches and categories, we build on a vision +transformer encoder pretrained with the CLIP model. We freeze the text encoder +and perform visual-prompt tuning of the visual encoder branch while introducing +a set of critical modifications. Firstly, we augment the classical key-query +(k-q) self-attention blocks with value-value (v-v) self-attention blocks. +Central to our model is a two-level hierarchical network design that enables +efficient semantic disentanglement: the first level ensures holistic scene +sketch encoding, and the second level focuses on individual categories. In the +second level of the hierarchy, we then introduce cross-attention between the +textual and visual branches. Our method outperforms zero-shot CLIP by 37 points +in segmentation pixel accuracy, reaching an accuracy of $85.5\%$ +on the FS-COCO sketch dataset. Finally, we conduct a user study that allows us +to identify further improvements needed over our method to reconcile machine +and human understanding of scene sketches. +
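A minimal sketch of what a value-value (v-v) self-attention block can look like; single-head attention and the exact projections are assumptions, and the block used in the paper may differ in detail.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class VVSelfAttention(nn.Module):
    """Attention weights computed from value-value similarity rather than
    the usual query-key similarity (single-head, for brevity)."""
    def __init__(self, dim):
        super().__init__()
        self.v_proj = nn.Linear(dim, dim)
        self.out = nn.Linear(dim, dim)
        self.scale = dim ** -0.5

    def forward(self, x):                 # x: (batch, tokens, dim)
        v = self.v_proj(x)
        attn = F.softmax(v @ v.transpose(-2, -1) * self.scale, dim=-1)
        return self.out(attn @ v)

x = torch.randn(2, 16, 64)
print(VVSelfAttention(64)(x).shape)       # torch.Size([2, 16, 64])
```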
+
+
+
+
+ + ♻ ☆ Correlation-guided Query-Dependency Calibration in Video Representation + Learning for Temporal Grounding + + +
+ Video temporal grounding aims to identify specific moments or highlights in a +video that correspond to a textual description. Typical approaches in temporal +grounding treat all video clips equally during the encoding process regardless +of their semantic relevance to the text query. Therefore, we propose the +Correlation-Guided DEtection TRansformer (CG-DETR), which provides clues +for query-associated video clips within the cross-modal attention. First, we +design an adaptive cross-attention with dummy tokens. Dummy tokens conditioned +on the text query take portions of the attention weights, preventing irrelevant +video clips from being represented by the text query. Yet, not all words +equally inherit the text query's correlation to video clips. Thus, we further +guide the cross-attention map by inferring the fine-grained correlation between +video clips and words. We enable this by learning a joint embedding space for +high-level concepts, i.e., moment and sentence level, and inferring the +clip-word correlation. Lastly, we exploit the moment-specific characteristics +and combine them with the context of each video to form a moment-adaptive +saliency detector. By exploiting the degrees of text engagement in each video +clip, it precisely measures the highlightness of each clip. CG-DETR achieves +state-of-the-art results on various benchmarks for temporal grounding. +
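A minimal sketch of cross-attention with learnable dummy tokens appended to the text side, so that query-irrelevant clips can spend attention on the dummies instead of on words; single-head attention, the number of dummy tokens, and the shapes are illustrative assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class CrossAttentionWithDummies(nn.Module):
    """Video clips attend over word tokens plus learnable dummy tokens."""
    def __init__(self, dim, num_dummies=4):
        super().__init__()
        self.dummies = nn.Parameter(torch.randn(num_dummies, dim) * 0.02)
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)
        self.scale = dim ** -0.5

    def forward(self, clip_feats, text_feats):
        # clip_feats: (B, T, D) video clips; text_feats: (B, L, D) words
        B = text_feats.size(0)
        kv = torch.cat([text_feats, self.dummies.expand(B, -1, -1)], dim=1)
        attn = F.softmax(self.q(clip_feats) @ self.k(kv).transpose(-2, -1)
                         * self.scale, dim=-1)
        return attn @ self.v(kv)

out = CrossAttentionWithDummies(64)(torch.randn(2, 10, 64), torch.randn(2, 7, 64))
print(out.shape)                          # torch.Size([2, 10, 64])
```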
+
+ comment: 34 pages, 16 figures, 13 tables, Code is available at + https://github.com/wjun0830/CGDETR +
+
+
+
+
+ + ♻ ☆ Geometrically-driven Aggregation for Zero-shot 3D Point Cloud + Understanding + + +
+ Zero-shot 3D point cloud understanding can be achieved via 2D Vision-Language +Models (VLMs). Existing strategies directly map Vision-Language Models from 2D +pixels of rendered or captured views to 3D points, overlooking the inherent and +expressible point cloud geometric structure. Geometrically similar or close +regions can be exploited for bolstering point cloud understanding as they are +likely to share semantic information. To this end, we introduce the first +training-free aggregation technique that leverages the point cloud's 3D +geometric structure to improve the quality of the transferred Vision-Language +Models. Our approach operates iteratively, performing local-to-global +aggregation based on geometric and semantic point-level reasoning. We benchmark +our approach on three downstream tasks, including classification, part +segmentation, and semantic segmentation, with a variety of datasets +representing both synthetic/real-world, and indoor/outdoor scenarios. Our +approach achieves new state-of-the-art results in all benchmarks. We will +release the source code publicly. + +
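A minimal, training-free sketch of the local aggregation idea: average each point's transferred 2D-VLM feature over its geometric neighbors and repeat. The k-nearest-neighbor rule and the number of iterations are assumptions; the actual method also uses semantic point-level reasoning within its local-to-global schedule.

```python
import torch

def local_geometric_aggregation(points, feats, k=8, iters=2):
    """points: (N, 3) xyz; feats: (N, D) per-point features transferred
    from a 2D VLM. Averages features over the k nearest 3D neighbors."""
    dists = torch.cdist(points, points)                  # (N, N) pairwise distances
    knn = dists.topk(k + 1, largest=False).indices       # each point includes itself
    for _ in range(iters):                               # repeating the step spreads
        feats = feats[knn].mean(dim=1)                   # information more globally
    return feats

pts, f = torch.randn(1024, 3), torch.randn(1024, 512)
print(local_geometric_aggregation(pts, f).shape)         # torch.Size([1024, 512])
```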
+
+ comment: Zero-shot, point cloud, 2D Vision-Language Models, geometric + structure, training-free +
+
+
+
+
+ + ♻ ☆ SAGE: Bridging Semantic and Actionable Parts for GEneralizable + Manipulation of Articulated Objects + + +
+ To interact with daily-life articulated objects of diverse structures and +functionalities, understanding the object parts plays a central role in both +user instruction comprehension and task execution. However, the possible +discordance between the semantic meaning and physics functionalities of the +parts poses a challenge for designing a general system. To address this +problem, we propose SAGE, a novel framework that bridges semantic and +actionable parts of articulated objects to achieve generalizable manipulation +under natural language instructions. More concretely, given an articulated +object, we first observe all the semantic parts on it, conditioned on which an +instruction interpreter proposes possible action programs that concretize the +natural language instruction. Then, a part-grounding module maps the semantic +parts into so-called Generalizable Actionable Parts (GAParts), which inherently +carry information about part motion. End-effector trajectories are predicted on +the GAParts, which, together with the action program, form an executable +policy. Additionally, an interactive feedback module is incorporated to respond +to failures, which closes the loop and increases the robustness of the overall +framework. Key to the success of our framework is the joint proposal and +knowledge fusion between a large vision-language model (VLM) and a small +domain-specific model for both context comprehension and part perception, with +the former providing general intuitions and the latter serving as expert facts. +Both simulation and real-robot experiments show our effectiveness in handling a +large variety of articulated objects with diverse language-instructed goals. + +
+
+
+
+
+ + ♻ ☆ vid-TLDR: Training Free Token merging for Light-weight Video Transformer CVPR + + +
+ Video Transformers have become the prevalent solution for various video +downstream tasks with superior expressive power and flexibility. However, these +video transformers suffer from heavy computational costs induced by the massive +number of tokens across the entire video frames, which has been the major +barrier to training the model. Further, the patches irrelevant to the main +contents, e.g., backgrounds, degrade the generalization performance of models. +To tackle these issues, we propose training free token merging for lightweight +video Transformer (vid-TLDR) that aims to enhance the efficiency of video +Transformers by merging the background tokens without additional training. For +vid-TLDR, we introduce a novel approach to capture the salient regions in +videos only with the attention map. Further, we introduce the saliency-aware +token merging strategy by dropping the background tokens and sharpening the +object scores. Our experiments show that vid-TLDR significantly mitigates the +computational complexity of video Transformers while achieving competitive +performance compared to the base model without vid-TLDR. Code is available at +https://github.com/mlvlab/vid-TLDR. + +
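A minimal sketch of attention-based token reduction in the spirit described above. Note the simplification: this version simply prunes low-saliency tokens, whereas vid-TLDR merges background tokens and sharpens object scores; the saliency rule here is an assumption.

```python
import torch

def saliency_token_pruning(tokens, attn, keep_ratio=0.5):
    """tokens: (B, N, D); attn: (B, heads, N, N) attention map of a block.
    Scores each token by the attention it receives and keeps the top fraction."""
    saliency = attn.mean(dim=1).sum(dim=1)               # (B, N) received attention
    k = max(1, int(tokens.size(1) * keep_ratio))
    idx = saliency.topk(k, dim=1).indices                # indices of salient tokens
    return torch.gather(tokens, 1, idx.unsqueeze(-1).expand(-1, -1, tokens.size(-1)))

toks = torch.randn(2, 196, 768)
attn = torch.softmax(torch.randn(2, 12, 196, 196), dim=-1)
print(saliency_token_pruning(toks, attn).shape)          # torch.Size([2, 98, 768])
```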
+
+ comment: Conference on Computer Vision and Pattern Recognition (CVPR), 2024 +
+
+
+
+
+ + ♻ ☆ VRP-SAM: SAM with Visual Reference Prompt CVPR 2024 + + +
+ In this paper, we propose a novel Visual Reference Prompt (VRP) encoder that +empowers the Segment Anything Model (SAM) to utilize annotated reference images +as prompts for segmentation, creating the VRP-SAM model. In essence, VRP-SAM +can utilize annotated reference images to comprehend specific objects and +perform segmentation of those objects in the target image. Note that the +VRP encoder can support a variety of annotation formats for reference images, +including \textbf{point}, \textbf{box}, \textbf{scribble}, and \textbf{mask}. +VRP-SAM achieves a breakthrough within the SAM framework by extending its +versatility and applicability while preserving SAM's inherent strengths, thus +enhancing user-friendliness. To enhance the generalization ability of VRP-SAM, +the VRP encoder adopts a meta-learning strategy. To validate the effectiveness +of VRP-SAM, we conducted extensive empirical studies on the Pascal and COCO +datasets. Remarkably, VRP-SAM achieved state-of-the-art performance in visual +reference segmentation with minimal learnable parameters. Furthermore, VRP-SAM +demonstrates strong generalization capabilities, allowing it to perform +segmentation of unseen objects and enabling cross-domain segmentation. The +source code and models will be available at +\url{https://github.com/syp2ysy/VRP-SAM} +
+
+ comment: Accepted by CVPR 2024; The camera-ready version +
+
+
+
+
+ + ♻ ☆ Video Self-Stitching Graph Network for Temporal Action Localization + + +
+ Temporal action localization (TAL) in videos is a challenging task, +especially due to the large variation in action temporal scales. Short actions +usually occupy a major proportion in the datasets, but tend to have the lowest +performance. In this paper, we confront the challenge of short actions and +propose a multi-level cross-scale solution dubbed as video self-stitching graph +network (VSGN). We have two key components in VSGN: video self-stitching (VSS) +and cross-scale graph pyramid network (xGPN). In VSS, we focus on a short +period of a video and magnify it along the temporal dimension to obtain a +larger scale. We stitch the original clip and its magnified counterpart in one +input sequence to take advantage of the complementary properties of both +scales. The xGPN component further exploits the cross-scale correlations by a +pyramid of cross-scale graph networks, each containing a hybrid module to +aggregate features from across scales as well as within the same scale. Our +VSGN not only enhances the feature representations, but also generates more +positive anchors for short actions and more short training samples. Experiments +demonstrate that VSGN obviously improves the localization performance of short +actions as well as achieving the state-of-the-art overall performance on +THUMOS-14 and ActivityNet-v1.3. + +
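A minimal sketch of the video self-stitching idea: temporally magnify a short window of the clip-level feature sequence and stitch it to the original sequence. Linear interpolation and the feature layout are illustrative assumptions.

```python
import torch
import torch.nn.functional as F

def self_stitch(features, start, end, target_len):
    """features: (C, T) clip-level feature sequence. Magnifies the window
    [start, end) along time and appends it to the original sequence."""
    window = features[:, start:end].unsqueeze(0)                      # (1, C, t)
    upscaled = F.interpolate(window, size=target_len, mode="linear",
                             align_corners=False).squeeze(0)          # (C, target_len)
    return torch.cat([features, upscaled], dim=1)                     # stitched input

feats = torch.randn(256, 100)                  # 100 temporal snippets
print(self_stitch(feats, 20, 30, 50).shape)    # torch.Size([256, 150])
```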
+
+
+
+
+ + ♻ ☆ Optimizing Sparse Convolution on GPUs with CUDA for 3D Point Cloud + Processing in Embedded Systems + + +
+ In recent years, there has been a significant increase in the utilization of +deep learning methods, particularly convolutional neural networks (CNNs), which +have emerged as the dominant approach in various domains that involve +structured grid data, such as image analysis and processing. Nevertheless, +the exponential growth in the utilization of LiDAR and 3D sensors across many +domains has resulted in an increased need for the analysis of 3D point clouds. +The utilization of 3D point clouds is crucial in various applications, +including object recognition and segmentation, as they offer a spatial +depiction of objects within a three-dimensional environment. In contrast to +images, point clouds are sparse and lack a regular grid, hence posing +distinct processing and computational challenges. +
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ Aligning Logits Generatively for Principled Black-Box Knowledge + Distillation CVPR 2024 + + +
+ Black-Box Knowledge Distillation (B2KD) is a formulated problem for +cloud-to-edge model compression with invisible data and models hosted on the +server. B2KD faces challenges such as limited Internet exchange and edge-cloud +disparity of data distributions. In this paper, we formalize a two-step +workflow consisting of deprivatization and distillation, and theoretically +provide a new optimization direction from logits to cell boundary different +from direct logits alignment. With its guidance, we propose a new method +Mapping-Emulation KD (MEKD) that distills a black-box cumbersome model into a +lightweight one. Our method does not differentiate between treating soft or +hard responses, and consists of: 1) deprivatization: emulating the inverse +mapping of the teacher function with a generator, and 2) distillation: aligning +low-dimensional logits of the teacher and student models by reducing the +distance of high-dimensional image points. For different teacher-student pairs, +our method yields inspiring distillation performance on various benchmarks, and +outperforms the previous state-of-the-art approaches. + +
+
+ comment: To appear at CVPR 2024; significantly rewritten with extra + experiments since the preliminary report +
+
+
+
+
+ + ♻ ☆ MMA-Diffusion: MultiModal Attack on Diffusion Models CVPR 2024 + + +
+ In recent years, Text-to-Image (T2I) models have seen remarkable +advancements, gaining widespread adoption. However, this progress has +inadvertently opened avenues for potential misuse, particularly in generating +inappropriate or Not-Safe-For-Work (NSFW) content. Our work introduces +MMA-Diffusion, a framework that presents a significant and realistic threat to +the security of T2I models by effectively circumventing current defensive +measures in both open-source models and commercial online services. Unlike +previous approaches, MMA-Diffusion leverages both textual and visual modalities +to bypass safeguards like prompt filters and post-hoc safety checkers, thus +exposing and highlighting the vulnerabilities in existing defense mechanisms. + +
+
+ comment: CVPR 2024. Our codes and benchmarks are available at + https://github.com/cure-lab/MMA-Diffusion +
+
+
+
+
+ + ♻ ☆ Dr$^2$Net: Dynamic Reversible Dual-Residual Networks for + Memory-Efficient Finetuning + + +
+ Large pretrained models are increasingly crucial in modern computer vision +tasks. These models are typically used in downstream tasks by end-to-end +finetuning, which is highly memory-intensive for tasks with high-resolution +data, e.g., video understanding, small object detection, and point cloud +analysis. In this paper, we propose Dynamic Reversible Dual-Residual Networks, +or Dr$^2$Net, a novel family of network architectures that acts as a surrogate +network to finetune a pretrained model with substantially reduced memory +consumption. Dr$^2$Net contains two types of residual connections, one +maintaining the residual structure in the pretrained models, and the other +making the network reversible. Due to its reversibility, intermediate +activations, which can be reconstructed from output, are cleared from memory +during training. We use two coefficients on either type of residual connections +respectively, and introduce a dynamic training strategy that seamlessly +transitions the pretrained model to a reversible network with much higher +numerical precision. We evaluate Dr$^2$Net on various pretrained models and +various tasks, and show that it can reach comparable performance to +conventional finetuning but with significantly less memory usage. + +
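A minimal sketch of a coupling-style reversible block with two residual coefficients, illustrating why intermediate activations can be recomputed from the outputs instead of being stored. This toy block is an assumption for illustration; Dr$^2$Net wraps pretrained modules and transitions the coefficients dynamically during training, which this sketch does not do.

```python
import torch
import torch.nn as nn

class ReversibleDualResidualBlock(nn.Module):
    """y1 = alpha*x1 + F(x2), y2 = beta*x2 + G(y1): inputs are exactly
    recoverable from outputs, so activations need not be cached."""
    def __init__(self, dim, alpha=1.0, beta=0.1):
        super().__init__()
        self.F = nn.Sequential(nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim))
        self.G = nn.Sequential(nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim))
        self.alpha, self.beta = alpha, beta

    def forward(self, x1, x2):
        y1 = self.alpha * x1 + self.F(x2)
        y2 = self.beta * x2 + self.G(y1)
        return y1, y2

    @torch.no_grad()
    def inverse(self, y1, y2):
        x2 = (y2 - self.G(y1)) / self.beta
        x1 = (y1 - self.F(x2)) / self.alpha
        return x1, x2

blk = ReversibleDualResidualBlock(64)
x1, x2 = torch.randn(2, 64), torch.randn(2, 64)
y1, y2 = blk(x1, x2)
r1, r2 = blk.inverse(y1, y2)
print(torch.allclose(x1, r1, atol=1e-4), torch.allclose(x2, r2, atol=1e-4))
```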
+
+
+
+
+ + ♻ ☆ I'M HOI: Inertia-aware Monocular Capture of 3D Human-Object Interactions CVPR 2024 + + +
+ We are living in a world surrounded by diverse and "smart" devices with rich +modalities of sensing ability. Conveniently capturing the interactions between +humans and these objects, however, remains a distant goal. In this paper, we present +I'm-HOI, a monocular scheme to faithfully capture the 3D motions of both the +human and the object in a novel setting: using a minimal setup of an RGB camera and an +object-mounted Inertial Measurement Unit (IMU). It combines general motion +inference and category-aware refinement. For the former, we introduce a +holistic human-object tracking method to fuse the IMU signals and the RGB +stream and progressively recover the human motions and subsequently the +companion object motions. For the latter, we tailor a category-aware motion +diffusion model, which is conditioned on both the raw IMU observations and the +results from the previous stage under an over-parameterized representation. It +significantly refines the initial results and generates vivid body, hand, and +object motions. Moreover, we contribute a large dataset with ground truth human +and object motions, dense RGB inputs, and rich object-mounted IMU measurements. +Extensive experiments demonstrate the effectiveness of I'm-HOI under a hybrid +capture setting. Our dataset and code will be released to the community. +
+
+ comment: Accepted to CVPR 2024. Project page: + https://afterjourney00.github.io/IM-HOI.github.io/ +
+
+
+
+
+ + ♻ ☆ Initialization Matters for Adversarial Transfer Learning CVPR 2024 + + +
+ With the prevalence of the Pretraining-Finetuning paradigm in transfer +learning, the robustness of downstream tasks has become a critical concern. In +this work, we delve into adversarial robustness in transfer learning and reveal +the critical role of initialization, including both the pretrained model and +the linear head. First, we discover the necessity of an adversarially robust +pretrained model. Specifically, we reveal that with a standard pretrained +model, Parameter-Efficient Finetuning (PEFT) methods either fail to be +adversarially robust or continue to exhibit significantly degraded adversarial +robustness on downstream tasks, even with adversarial training during +finetuning. Leveraging a robust pretrained model, surprisingly, we observe that +a simple linear probing can outperform full finetuning and other PEFT methods +with random initialization on certain datasets. We further identify that linear +probing excels in preserving robustness from the robust pretraining. Based on +this, we propose Robust Linear Initialization (RoLI) for adversarial +finetuning, which initializes the linear head with the weights obtained by +adversarial linear probing to maximally inherit the robustness from +pretraining. Across five different image classification datasets, we +demonstrate the effectiveness of RoLI and achieve new state-of-the-art results. +Our code is available at \url{https://github.com/DongXzz/RoLI}. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Latent Code Augmentation Based on Stable Diffusion for Data-free + Substitute Attacks + + +
+ Since the training data of the target model is not available in the black-box +substitute attack, most recent schemes utilize GANs to generate data for +training the substitute model. However, these GANs-based schemes suffer from +low training efficiency as the generator needs to be retrained for each target +model during the substitute training process, as well as low generation +quality. To overcome these limitations, we consider utilizing the diffusion +model to generate data, and propose a novel data-free substitute attack scheme +based on the Stable Diffusion (SD) to improve the efficiency and accuracy of +substitute training. Despite the data generated by the SD exhibiting high +quality, it presents a different distribution of domains and a large variation +of positive and negative samples for the target model. For this problem, we +propose Latent Code Augmentation (LCA) to facilitate SD in generating data that +aligns with the data distribution of the target model. Specifically, we augment +the latent codes of the inferred member data with LCA and use them as guidance +for SD. With the guidance of LCA, the data generated by the SD not only meets +the discriminative criteria of the target model but also exhibits high +diversity. By utilizing this data, it is possible to train the substitute model +that closely resembles the target model more efficiently. Extensive experiments +demonstrate that our LCA achieves higher attack success rates and requires +fewer query budgets compared to GANs-based schemes for different target models. +Our codes are available at \url{https://github.com/LzhMeng/LCA}. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Can I Trust Your Answer? Visually Grounded Video Question Answering CVPR'24 + + +
+ We study visually grounded VideoQA in response to the emerging trends of +utilizing pretraining techniques for video-language understanding. +Specifically, by forcing vision-language models (VLMs) to answer questions and +simultaneously provide visual evidence, we seek to ascertain the extent to +which the predictions of such techniques are genuinely anchored in relevant +video content, versus spurious correlations from language or irrelevant visual +context. Towards this, we construct NExT-GQA -- an extension of NExT-QA with +10.5$K$ temporal grounding (or location) labels tied to the original QA pairs. +With NExT-GQA, we scrutinize a series of state-of-the-art VLMs. Through +post-hoc attention analysis, we find that these models are extremely weak in +substantiating the answers despite their strong QA performance. This exposes +the limitation of current VLMs in making reliable predictions. As a remedy, we +further explore and propose a grounded-QA method via Gaussian mask optimization +and cross-modal learning. Experiments with different backbones demonstrate that +this grounding mechanism improves both grounding and QA. With these efforts, we +aim to push towards trustworthy VLMs in VQA systems. Our dataset and code are +available at https://github.com/doc-doc/NExT-GQA. + +
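A minimal sketch of Gaussian mask optimization over frames, i.e., a differentiable soft selection of the video segment that should ground the answer; the parameterization (a single learnable center and width over normalized time) is an assumption. In the real setup, gradients from the QA objective would flow back into the mask parameters.

```python
import torch
import torch.nn as nn

class GaussianTemporalMask(nn.Module):
    """Learnable (center, width) Gaussian over normalized frame positions."""
    def __init__(self):
        super().__init__()
        self.center = nn.Parameter(torch.tensor(0.5))    # normalized time in [0, 1]
        self.width = nn.Parameter(torch.tensor(0.2))

    def forward(self, num_frames):
        t = torch.linspace(0, 1, num_frames)
        return torch.exp(-0.5 * ((t - self.center) / self.width.clamp_min(1e-3)) ** 2)

mask = GaussianTemporalMask()(32)                        # (32,) soft frame weights
frame_feats = torch.randn(32, 512)
grounded = (mask.unsqueeze(-1) * frame_feats).sum(0) / mask.sum()
print(grounded.shape)                                    # torch.Size([512])
```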
+
+ comment: Accepted to CVPR'24. (Compared with preprint version, we mainly + improve the presentation, discuss more related works, and extend experiments + in Appendix.) +
+
+
+
+
+ + ♻ ☆ Tracing Hyperparameter Dependencies for Model Parsing via Learnable + Graph Pooling Network + + +
+ Model Parsing defines the research task of predicting hyperparameters of the +generative model (GM), given a generated image as input. Since a diverse set of +hyperparameters is jointly employed by the generative model, and dependencies +often exist among them, it is crucial to learn these hyperparameter +dependencies for improved model parsing performance. To explore such +important dependencies, we propose a novel model parsing method called +Learnable Graph Pooling Network (LGPN). Specifically, we transform model +parsing into a graph node classification task, using graph nodes and edges to +represent hyperparameters and their dependencies, respectively. Furthermore, +LGPN incorporates a learnable pooling-unpooling mechanism tailored to model +parsing, which adaptively learns hyperparameter dependencies of GMs used to +generate the input image. We also extend our proposed method to CNN-generated +image detection and coordinate attacks detection. Empirically, we achieve +state-of-the-art results in model parsing and its extended applications, +showing the effectiveness of our method. Our source code is available. +
+
+ comment: 24 pages, 15 figures, 17 tables +
+
+
+
+
+ + ♻ ☆ Finding needles in a haystack: A Black-Box Approach to Invisible + Watermark Detection + + +
+ In this paper, we propose WaterMark Detection (WMD), the first invisible +watermark detection method under a black-box and annotation-free setting. WMD +is capable of detecting arbitrary watermarks within a given reference dataset +using a clean non-watermarked dataset as a reference, without relying on +specific decoding methods or prior knowledge of the watermarking techniques. We +develop WMD using foundations of offset learning, where a clean non-watermarked +dataset enables us to isolate the influence of only watermarked samples in the +reference dataset. Our comprehensive evaluations demonstrate the effectiveness +of WMD, significantly outperforming naive detection methods, which only yield +AUC scores around 0.5. In contrast, WMD consistently achieves impressive +detection AUC scores, surpassing 0.9 in most single-watermark datasets and +exceeding 0.7 in more challenging multi-watermark scenarios across diverse +datasets and watermarking methods. As invisible watermarks become increasingly +prevalent, while specific decoding techniques remain undisclosed, our approach +provides a versatile solution and establishes a path toward increasing +accountability, transparency, and trust in our digital visual content. + +
+
+
+
+
+ + ♻ ☆ NTO3D: Neural Target Object 3D Reconstruction with Segment Anything CVPR24 + + +
+ Neural 3D reconstruction from multi-view images has recently attracted +increasing attention from the community. Existing methods normally learn a +neural field for the whole scene, while it is still under-explored how to +reconstruct a target object indicated by users. Considering the Segment +Anything Model (SAM) has shown effectiveness in segmenting any 2D images, in +this paper, we propose NTO3D, a novel high-quality Neural Target Object 3D +(NTO3D) reconstruction method, which leverages the benefits of both neural +field and SAM. We first propose a novel strategy to lift the multi-view 2D +segmentation masks of SAM into a unified 3D occupancy field. The 3D occupancy +field is then projected into 2D space and generates the new prompts for SAM. +This process is iterative until convergence to separate the target object from +the scene. After this, we then lift the 2D features of the SAM encoder into a +3D feature field in order to improve the reconstruction quality of the target +object. NTO3D lifts the 2D masks and features of SAM into the 3D neural field +for high-quality neural target object 3D reconstruction. We conduct detailed +experiments on several benchmark datasets to demonstrate the advantages of our +method. The code will be available at: https://github.com/ucwxb/NTO3D. + +
+
+ comment: accepted by CVPR24 +
+
+
+
+
+ + ♻ ☆ Consistency Trajectory Models: Learning Probability Flow ODE Trajectory + of Diffusion + + +
+ Consistency Models (CM) (Song et al., 2023) accelerate score-based diffusion +model sampling at the cost of sample quality but lack a natural way to +trade-off quality for speed. To address this limitation, we propose Consistency +Trajectory Model (CTM), a generalization encompassing CM and score-based models +as special cases. CTM trains a single neural network that can -- in a single +forward pass -- output scores (i.e., gradients of log-density) and enables +unrestricted traversal between any initial and final time along the Probability +Flow Ordinary Differential Equation (ODE) in a diffusion process. CTM enables +the efficient combination of adversarial training and denoising score matching +loss to enhance performance and achieves new state-of-the-art FIDs for +single-step diffusion model sampling on CIFAR-10 (FID 1.73) and ImageNet at +64x64 resolution (FID 1.92). CTM also enables a new family of sampling schemes, +both deterministic and stochastic, involving long jumps along the ODE solution +trajectories. It consistently improves sample quality as computational budgets +increase, avoiding the degradation seen in CM. Furthermore, unlike CM, CTM's +access to the score function can streamline the adoption of established +controllable/conditional generation methods from the diffusion community. This +access also enables the computation of likelihood. The code is available at +https://github.com/sony/ctm. + +
+
+ comment: International Conference on Learning Representations +
+
+
+
+
+ + ♻ ☆ Egocentric Scene-aware Human Trajectory Prediction + + +
+ Wearable collaborative robots stand to assist human wearers who need fall +prevention assistance or wear exoskeletons. Such a robot needs to be able to +predict the ego motion of the wearer based on egocentric vision and the +surrounding scene. In this work, we leveraged body-mounted cameras and sensors +to anticipate the trajectory of human wearers through complex surroundings. To +facilitate research in ego-motion prediction, we have collected a comprehensive +walking scene navigation dataset centered on the user's perspective. We present +a method to predict human motion conditioning on the surrounding static scene. +Our method leverages a diffusion model to produce a distribution of potential +future trajectories, taking into account the user's observation of the +environment. We introduce a compact representation to encode the user's visual +memory of the surroundings, as well as an efficient sample-generating technique +to speed up real-time inference of a diffusion model. We ablate our model and +compare it to baselines, and results show that our model outperforms existing +methods on key metrics of collision avoidance and trajectory mode coverage. + +
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Passive Snapshot Coded Aperture Dual-Pixel RGB-D Imaging + + +
+ Passive, compact, single-shot 3D sensing is useful in many application areas +such as microscopy, medical imaging, surgical navigation, and autonomous +driving where form factor, time, and power constraints can exist. Obtaining +RGB-D scene information over a short imaging distance, in an ultra-compact form +factor, and in a passive, snapshot manner is challenging. Dual-pixel (DP) +sensors are a potential solution to achieve the same. DP sensors collect light +rays from two different halves of the lens in two interleaved pixel arrays, +thus capturing two slightly different views of the scene, like a stereo camera +system. However, imaging with a DP sensor implies that the defocus blur size is +directly proportional to the disparity seen between the views. This creates a +trade-off between disparity estimation vs. deblurring accuracy. To improve this +trade-off effect, we propose CADS (Coded Aperture Dual-Pixel Sensing), in which +we use a coded aperture in the imaging lens along with a DP sensor. In our +approach, we jointly learn an optimal coded pattern and the reconstruction +algorithm in an end-to-end optimization setting. Our resulting CADS imaging +system demonstrates improvement of >1.5dB PSNR in all-in-focus (AIF) estimates +and 5-6% in depth estimation quality over naive DP sensing for a wide range of +aperture settings. Furthermore, we build the proposed CADS prototypes for DSLR +photography settings and in an endoscope and a dermoscope form factor. Our +novel coded dual-pixel sensing approach demonstrates accurate RGB-D +reconstruction results in simulations and real-world experiments in a passive, +snapshot, and compact manner. + +
+
+
+
+
+ + ♻ ☆ SKDF: A Simple Knowledge Distillation Framework for Distilling + Open-Vocabulary Knowledge to Open-world Object Detector + + +
+ In this paper, we attempt to specialize the VLM model for OWOD tasks by +distilling its open-world knowledge into a language-agnostic detector. +Surprisingly, we observe that the combination of a simple \textbf{knowledge +distillation} approach and the automatic pseudo-labeling mechanism in OWOD can +achieve better performance for unknown object detection, even with a small +amount of data. Unfortunately, knowledge distillation for unknown objects +severely affects the learning of detectors with conventional structures for +known objects, leading to catastrophic forgetting. To alleviate these problems, +we propose the \textbf{down-weight loss function} for knowledge distillation +from vision-language to single vision modality. Meanwhile, we propose the +\textbf{cascade decouple decoding structure} that decouples the learning of +localization and recognition to reduce the impact of category interactions of +known and unknown objects on the localization learning process. Ablation +experiments demonstrate that both of them are effective in mitigating the +impact of open-world knowledge distillation on the learning of known objects. +Additionally, to alleviate the current lack of comprehensive benchmarks for +evaluating the ability of the open-world detector to detect unknown objects in +the open world, we propose two benchmarks, which we name +"\textbf{StandardSet}$\heartsuit$" and "\textbf{IntensiveSet}$\spadesuit$" +respectively, based on the complexity of their testing scenarios. Comprehensive +experiments performed on OWOD, MS-COCO, and our proposed benchmarks demonstrate +the effectiveness of our methods. The code and proposed dataset are available +at \url{https://github.com/xiaomabufei/SKDF}. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2303.11623 +
+
+
+
+
+ + ♻ ☆ Free3D: Consistent Novel View Synthesis without 3D Representation + + +
+ We introduce Free3D, a simple accurate method for monocular open-set novel +view synthesis (NVS). Similar to Zero-1-to-3, we start from a pre-trained 2D +image generator for generalization, and fine-tune it for NVS. Compared to other +works that took a similar approach, we obtain significant improvements without +resorting to an explicit 3D representation, which is slow and memory-consuming, +and without training an additional network for 3D reconstruction. Our key +contribution is to improve the way the target camera pose is encoded in the +network, which we do by introducing a new ray conditioning normalization (RCN) +layer. The latter injects pose information in the underlying 2D image generator +by telling each pixel its viewing direction. We further improve multi-view +consistency by using light-weight multi-view attention layers and by sharing +generation noise between the different views. We train Free3D on the Objaverse +dataset and demonstrate excellent generalization to new categories in new +datasets, including OmniObject3D and GSO. The project page is available at +https://chuanxiaz.com/free3d/. + +
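As a rough approximation of the idea of telling each pixel its viewing direction, here is a FiLM-style modulation layer conditioned on per-pixel ray maps; this is an assumption-laden sketch, not the paper's actual ray conditioning normalization (RCN) layer.

```python
import torch
import torch.nn as nn

class RayConditioningSketch(nn.Module):
    """Per-pixel scale/shift predicted from a ray map modulates 2D features."""
    def __init__(self, feat_dim, ray_dim=6):             # e.g. ray origin + direction
        super().__init__()
        self.to_scale_shift = nn.Conv2d(ray_dim, 2 * feat_dim, kernel_size=1)

    def forward(self, feats, rays):
        # feats: (B, C, H, W) generator features; rays: (B, ray_dim, H, W)
        scale, shift = self.to_scale_shift(rays).chunk(2, dim=1)
        return feats * (1 + scale) + shift

f = torch.randn(2, 128, 32, 32)
r = torch.randn(2, 6, 32, 32)
print(RayConditioningSketch(128)(f, r).shape)            # torch.Size([2, 128, 32, 32])
```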
+
+ comment: webpage: https://chuanxiaz.com/free3d/, code: + https://github.com/lyndonzheng/Free3D +
+
+
+
+
+ + ♻ ☆ Dual-View Visual Contextualization for Web Navigation CVPR 2024 + + +
+ Automatic web navigation aims to build a web agent that can follow language +instructions to execute complex and diverse tasks on real-world websites. +Existing work primarily takes HTML documents as input, which define the +contents and action spaces (i.e., actionable elements and operations) of +webpages. Nevertheless, HTML documents may not provide a clear task-related +context for each element, making it hard to select the right (sequence of) +actions. In this paper, we propose to contextualize HTML elements through their +"dual views" in webpage screenshots: each HTML element has its corresponding +bounding box and visual content in the screenshot. We build upon the insight -- +web developers tend to arrange task-related elements nearby on webpages to +enhance user experiences -- and propose to contextualize each element with its +neighbor elements, using both textual and visual features. The resulting +representations of HTML elements are more informative for the agent to take +action. We validate our method on the recently released Mind2Web dataset, which +features diverse navigation domains and tasks on real-world websites. Our +method consistently outperforms the baseline in all the scenarios, including +cross-task, cross-website, and cross-domain ones. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ MoEController: Instruction-based Arbitrary Image Manipulation with + Mixture-of-Expert Controllers + + +
+ Diffusion-model-based text-guided image generation has recently made +astounding progress, producing fascinating results in open-domain image +manipulation tasks. Few models, however, currently have complete zero-shot +capabilities for both global and local image editing due to the complexity and +diversity of image manipulation tasks. In this work, we propose a method with +mixture-of-expert (MOE) controllers to align the text-guided capacity of +diffusion models with different kinds of human instructions, enabling our model +to handle various open-domain image manipulation tasks with natural language +instructions. First, we use large language models (ChatGPT) and conditional +image synthesis models (ControlNet) to generate a large-scale global image +transfer dataset in addition to an instruction-based local image editing +dataset. Then, using an MOE technique and task-specific adaptation training on +a large-scale dataset, our conditional diffusion model can edit images globally +and locally. Extensive experiments demonstrate that our approach performs +surprisingly well on various image manipulation tasks when dealing with +open-domain images and arbitrary human instructions. Please refer to our +project page: [https://oppo-mente-lab.github.io/moe_controller/] +
+
+ comment: 6 pages,6 figures +
+
+
+
+
+ + ♻ ☆ Boosting Flow-based Generative Super-Resolution Models via Learned Prior CVPR2024 + + +
+ Flow-based super-resolution (SR) models have demonstrated astonishing +capabilities in generating high-quality images. However, these methods +encounter several challenges during image generation, such as grid artifacts, +exploding inverses, and suboptimal results due to a fixed sampling temperature. +To overcome these issues, this work introduces a conditional learned prior to +the inference phase of a flow-based SR model. This prior is a latent code +predicted by our proposed latent module conditioned on the low-resolution +image, which is then transformed by the flow model into an SR image. Our +framework is designed to seamlessly integrate with any contemporary flow-based +SR model without modifying its architecture or pre-trained weights. We evaluate +the effectiveness of our proposed framework through extensive experiments and +ablation analyses. The proposed framework successfully addresses all the +inherent issues in flow-based SR models and enhances their performance in +various SR scenarios. Our code is available at: +https://github.com/liyuantsao/FlowSR-LP + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ GaussianAvatar: Towards Realistic Human Avatar Modeling from a Single + Video via Animatable 3D Gaussians + + +
+ We present GaussianAvatar, an efficient approach to creating realistic human +avatars with dynamic 3D appearances from a single video. We start by +introducing animatable 3D Gaussians to explicitly represent humans in various +poses and clothing styles. Such an explicit and animatable representation can +fuse 3D appearances more efficiently and consistently from 2D observations. Our +representation is further augmented with dynamic properties to support +pose-dependent appearance modeling, where a dynamic appearance network along +with an optimizable feature tensor is designed to learn the +motion-to-appearance mapping. Moreover, by leveraging the differentiable motion +condition, our method enables a joint optimization of motions and appearances +during avatar modeling, which helps to tackle the long-standing issue of +inaccurate motion estimation in monocular settings. The efficacy of +GaussianAvatar is validated on both the public dataset and our collected +dataset, demonstrating its superior performances in terms of appearance quality +and rendering efficiency. + +
+
+ comment: Project Page: https://huliangxiao.github.io/GaussianAvatar +
+
+
+
+
+ + ♻ ☆ EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via + Expressive Masked Audio Gesture Modeling CVPR + + +
+ We propose EMAGE, a framework to generate full-body human gestures from audio +and masked gestures, encompassing facial, local body, hands, and global +movements. To achieve this, we first introduce BEAT2 (BEAT-SMPLX-FLAME), a new +mesh-level holistic co-speech dataset. BEAT2 combines a MoShed SMPL-X body with +FLAME head parameters and further refines the modeling of head, neck, and +finger movements, offering a community-standardized, high-quality 3D motion +captured dataset. EMAGE leverages masked body gesture priors during training to +boost inference performance. It involves a Masked Audio Gesture Transformer, +facilitating joint training on audio-to-gesture generation and masked gesture +reconstruction to effectively encode audio and body gesture hints. Encoded body +hints from masked gestures are then separately employed to generate facial and +body movements. Moreover, EMAGE adaptively merges speech features from the +audio's rhythm and content and utilizes four compositional VQ-VAEs to enhance +the results' fidelity and diversity. Experiments demonstrate that EMAGE +generates holistic gestures with state-of-the-art performance and is flexible +in accepting predefined spatial-temporal gesture inputs, generating complete, +audio-synchronized results. Our code and dataset are available +https://pantomatrix.github.io/EMAGE/ + +
+
+ comment: Fix typos; Conflict of Interest Disclosure; CVPR Camera Ready; + Project Page: https://pantomatrix.github.io/EMAGE/ +
+
+
+
+
+ + ♻ ☆ A Review of Predictive and Contrastive Self-supervised Learning for + Medical Images + + +
+ Over the last decade, supervised deep learning on manually annotated big data +has been progressing significantly on computer vision tasks. But the +application of deep learning in medical image analysis was limited by the +scarcity of high-quality annotated medical imaging data. An emerging solution +is self-supervised learning (SSL), among which contrastive SSL is the most +successful approach to rivalling or outperforming supervised learning. This +review investigates several state-of-the-art contrastive SSL algorithms +originally on natural images as well as their adaptations for medical images, +and concludes by discussing recent advances, current limitations, and future +directions in applying contrastive SSL in the medical domain. + +
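As a reference point for the contrastive objective underlying most of the methods the review covers, here is a minimal InfoNCE-style loss; it is one-directional and omits the usual in-batch symmetrization and projection heads for brevity.

```python
import torch
import torch.nn.functional as F

def info_nce(z1, z2, temperature=0.1):
    """z1, z2: (B, D) embeddings of two augmented views of the same images;
    positives sit on the diagonal of the similarity matrix."""
    z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
    logits = z1 @ z2.t() / temperature            # (B, B) cosine similarities
    targets = torch.arange(z1.size(0))
    return F.cross_entropy(logits, targets)

z1, z2 = torch.randn(32, 128), torch.randn(32, 128)
print(info_nce(z1, z2).item())
```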
+
+ comment: Article links: + https://link.springer.com/article/10.1007/s11633-022-1406-4 +
+
+
+
+
+ + ♻ ☆ MMVP: A Multimodal MoCap Dataset with Vision and Pressure Sensors CVPR2024 + + +
+ Foot contact is an important cue for human motion capture, understanding, and +generation. Existing datasets tend to annotate dense foot contact using visual +matching with thresholding or incorporating pressure signals. However, these +approaches either suffer from low accuracy or are only designed for small-range +and slow motion. There is still a lack of a vision-pressure multimodal dataset +with large-range and fast human motion, as well as accurate and dense +foot-contact annotation. To fill this gap, we propose a Multimodal MoCap +Dataset with Vision and Pressure sensors, named MMVP. MMVP provides accurate +and dense plantar pressure signals synchronized with RGBD observations, which +is especially useful for plausible shape estimation, robust pose fitting +without foot drifting, and accurate global translation tracking. To validate +the dataset, we propose an RGBD-P SMPL fitting method and also a +monocular-video-based baseline framework, VP-MoCap, for human motion capture. +Experiments demonstrate that our RGBD-P SMPL fitting results significantly +outperform pure visual motion capture. Moreover, VP-MoCap outperforms SOTA +methods in foot-contact and global translation estimation accuracy. We believe +the configuration of the dataset and the baseline frameworks will stimulate +research in this direction and also provide a good reference for MoCap +applications in various domains. +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ Brain Decodes Deep Nets + + +
+ We developed a tool for visualizing and analyzing large pre-trained vision +models by mapping them onto the brain, thus exposing their hidden internals. Our +innovation arises from a surprising usage of brain encoding: predicting brain +fMRI measurements in response to images. We report two findings. First, +explicit mapping between the brain and deep-network features across dimensions +of space, layers, scales, and channels is crucial. This mapping method, +FactorTopy, is plug-and-play for any deep network; with it, one can paint a +picture of the network onto the brain (literally!). Second, our visualization +shows how different training methods matter: they lead to remarkable +differences in hierarchical organization and scaling behavior, growing with +more data or network capacity. It also provides insight into fine-tuning: how +pre-trained models change when adapting to small datasets. We found that brain-like, +hierarchically organized networks suffer less from catastrophic forgetting after +fine-tuning. +
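A minimal sketch of the underlying brain-encoding setup: fit a voxel-wise linear (ridge) model from network features to fMRI responses, then score held-out predictions. The data here are synthetic placeholders, and the paper's FactorTopy factorization over space, layers, scales, and channels is not reproduced.

```python
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.default_rng(0)
features = rng.normal(size=(500, 768))     # 500 images x 768-d network features
voxels = rng.normal(size=(500, 1000))      # 500 images x 1000 fMRI voxel responses

encoder = Ridge(alpha=10.0).fit(features[:400], voxels[:400])
pred = encoder.predict(features[400:])

# Per-voxel encoding accuracy (Pearson r) on held-out images (first 5 voxels).
r = [np.corrcoef(pred[:, v], voxels[400:, v])[0, 1] for v in range(5)]
print(np.round(r, 3))
```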
+
+ comment: Website: see https://huzeyann.github.io/brain-decodes-deep-nets . + Code: see https://github.com/huzeyann/BrainDecodesDeepNets +
+
+
+
+
+ + ♻ ☆ OccNeRF: Advancing 3D Occupancy Prediction in LiDAR-Free Environments + + +
+ As a fundamental task of vision-based perception, 3D occupancy prediction +reconstructs 3D structures of surrounding environments. It provides detailed +information for autonomous driving planning and navigation. However, most +existing methods heavily rely on the LiDAR point clouds to generate occupancy +ground truth, which is not available in the vision-based system. In this paper, +we propose an OccNeRF method for training occupancy networks without 3D +supervision. Different from previous works which consider a bounded scene, we +parameterize the reconstructed occupancy fields and reorganize the sampling +strategy to align with the cameras' infinite perceptive range. The neural +rendering is adopted to convert occupancy fields to multi-camera depth maps, +supervised by multi-frame photometric consistency. Moreover, for semantic +occupancy prediction, we design several strategies to polish the prompts and +filter the outputs of a pretrained open-vocabulary 2D segmentation model. +Extensive experiments for both self-supervised depth estimation and 3D +occupancy prediction tasks on nuScenes and SemanticKITTI datasets demonstrate +the effectiveness of our method. + +
+
+ comment: Code: https://github.com/LinShan-Bin/OccNeRF +
+
+
+
+
+ + ♻ ☆ Cross-domain Fiber Cluster Shape Analysis for Language Performance + Cognitive Score Prediction + + +
+ Shape plays an important role in computer graphics, offering informative +features to convey an object's morphology and functionality. Shape analysis in +brain imaging can help interpret structural and functionality correlations of +the human brain. In this work, we investigate the shape of the brain's 3D white +matter connections and its potential predictive relationship to human cognitive +function. We reconstruct brain connections as sequences of 3D points using +diffusion magnetic resonance imaging (dMRI) tractography. To describe each +connection, we extract 12 shape descriptors in addition to traditional dMRI +connectivity and tissue microstructure features. We introduce a novel +framework, Shape--fused Fiber Cluster Transformer (SFFormer), that leverages a +multi-head cross-attention feature fusion module to predict subject-specific +language performance based on dMRI tractography. We assess the performance of +the method on a large dataset including 1065 healthy young adults. The results +demonstrate that both the transformer-based SFFormer model and its inter/intra +feature fusion with shape, microstructure, and connectivity are informative, +and together, they improve the prediction of subject-specific language +performance scores. Overall, our results indicate that the shape of the brain's +connections is predictive of human language function. + +
+
+ comment: 2 figures, 11 pages +
+
+
+
+
+ + ♻ ☆ Video-GroundingDINO: Towards Open-Vocabulary Spatio-Temporal Video + Grounding + + +
+ Video grounding aims to localize a spatio-temporal section in a video +corresponding to an input text query. This paper addresses a critical +limitation in current video grounding methodologies by introducing an +Open-Vocabulary Spatio-Temporal Video Grounding task. Unlike prevalent +closed-set approaches that struggle with open-vocabulary scenarios due to +limited training data and predefined vocabularies, our model leverages +pre-trained representations from foundational spatial grounding models. This +empowers it to effectively bridge the semantic gap between natural language and +diverse visual content, achieving strong performance in closed-set and +open-vocabulary settings. Our contributions include a novel spatio-temporal +video grounding model, surpassing state-of-the-art results in closed-set +evaluations on multiple datasets and demonstrating superior performance in +open-vocabulary scenarios. Notably, the proposed model outperforms +state-of-the-art methods in closed-set settings on VidSTG (Declarative and +Interrogative) and HC-STVG (V1 and V2) datasets. Furthermore, in +open-vocabulary evaluations on HC-STVG V1 and YouCook-Interactions, our model +surpasses the recent best-performing models by $4.88$ m_vIoU and $1.83\%$ +accuracy, demonstrating its efficacy in handling diverse linguistic and visual +concepts for improved video understanding. Our codes will be publicly released. + +
+
+
+
+
+ + ♻ ☆ Collaborating Foundation Models for Domain Generalized Semantic + Segmentation CVPR 2024 + + +
+ Domain Generalized Semantic Segmentation (DGSS) deals with training a model +on a labeled source domain with the aim of generalizing to unseen domains +during inference. Existing DGSS methods typically effectuate robust features by +means of Domain Randomization (DR). Such an approach is often limited as it can +only account for style diversification and not content. In this work, we take +an orthogonal approach to DGSS and propose to use an assembly of CoLlaborative +FOUndation models for Domain Generalized Semantic Segmentation (CLOUDS). In +detail, CLOUDS is a framework that integrates FMs of various kinds: (i) CLIP +backbone for its robust feature representation, (ii) generative models to +diversify the content, thereby covering various modes of the possible target +distribution, and (iii) Segment Anything Model (SAM) for iteratively refining +the predictions of the segmentation model. Extensive experiments show that our +CLOUDS excels in adapting from synthetic to real DGSS benchmarks and under +varying weather conditions, notably outperforming prior methods by 5.6% and +6.7% on averaged miou, respectively. The code is available at : +https://github.com/yasserben/CLOUDS + +
+
+ comment: https://github.com/yasserben/CLOUDS ; Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Traffic Scene Parsing through the TSP6K Dataset CVPR 2024 + + +
+ Traffic scene perception in computer vision is a critically important task to +achieve intelligent cities. To date, most existing datasets focus on autonomous +driving scenes. We observe that the models trained on those driving datasets +often yield unsatisfactory results on traffic monitoring scenes. However, +little effort has been put into improving the traffic monitoring scene +understanding, mainly due to the lack of specific datasets. To fill this gap, +we introduce a specialized traffic monitoring dataset, termed TSP6K, containing +images from the traffic monitoring scenario, with high-quality pixel-level and +instance-level annotations. The TSP6K dataset captures more crowded traffic +scenes with several times more traffic participants than the existing driving +scenes. We perform a detailed analysis of the dataset and comprehensively +evaluate previous popular scene parsing methods, instance segmentation methods +and unsupervised domain adaption methods. Furthermore, considering the vast +difference in instance sizes, we propose a detail refining decoder for scene +parsing, which recovers the details of different semantic regions in traffic +scenes owing to the proposed TSP6K dataset. Experiments show its effectiveness +in parsing the traffic monitoring scenes. Code and dataset are available at +https://github.com/PengtaoJiang/TSP6K. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SD-NAE: Generating Natural Adversarial Examples with Stable Diffusion ICLR 2024 + + +
+ Natural Adversarial Examples (NAEs), images arising naturally from the +environment and capable of deceiving classifiers, are instrumental in robustly +evaluating and identifying vulnerabilities in trained models. In this work, +unlike prior works that passively collect NAEs from real images, we propose to +actively synthesize NAEs using the state-of-the-art Stable Diffusion. +Specifically, our method formulates a controlled optimization process, where we +perturb the token embedding that corresponds to a specified class to generate +NAEs. This generation process is guided by the gradient of loss from the target +classifier, ensuring that the created image closely mimics the ground-truth +class yet fools the classifier. Named SD-NAE (Stable Diffusion for Natural +Adversarial Examples), our innovative method is effective in producing valid +and useful NAEs, which is demonstrated through a meticulously designed +experiment. Code is available at https://github.com/linyueqian/SD-NAE. + +
+
+ comment: Accepted by ICLR 2024 TinyPapers +
+
+
+
+
+ + ♻ ☆ GlORIE-SLAM: Globally Optimized RGB-only Implicit Encoding Point Cloud + SLAM + + +
+ Recent advancements in RGB-only dense Simultaneous Localization and Mapping +(SLAM) have predominantly utilized grid-based neural implicit encodings and/or +struggle to efficiently realize global map and pose consistency. To this end, +we propose an efficient RGB-only dense SLAM system using a flexible neural +point cloud scene representation that adapts to keyframe poses and depth +updates, without needing costly backpropagation. Another critical challenge of +RGB-only SLAM is the lack of geometric priors. To alleviate this issue, with +the aid of a monocular depth estimator, we introduce a novel DSPO layer for +bundle adjustment which optimizes the pose and depth of keyframes along with +the scale of the monocular depth. Finally, our system benefits from loop +closure and online global bundle adjustment and performs either better or +competitive to existing dense neural RGB SLAM methods in tracking, mapping and +rendering accuracy on the Replica, TUM-RGBD and ScanNet datasets. The source +code will be made available. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 185 + +
+
+
+ + ☆ Unsolvable Problem Detection: Evaluating Trustworthiness of Vision + Language Models + + +
+ This paper introduces a novel and significant challenge for Vision Language Models (VLMs), termed Unsolvable Problem Detection (UPD). UPD examines the VLM's ability to withhold answers when faced with unsolvable problems in the context of Visual Question Answering (VQA) tasks. UPD encompasses three distinct settings: Absent Answer Detection (AAD), Incompatible Answer Set Detection (IASD), and Incompatible Visual Question Detection (IVQD). To deeply investigate the UPD problem, we conduct extensive experiments, which indicate that most VLMs, including GPT-4V and LLaVA-Next-34B, struggle with our benchmarks to varying extents, highlighting significant room for improvement. To address UPD, we explore both training-free and training-based solutions, offering new insights into their effectiveness and limitations. We hope our insights, together with future efforts within the proposed UPD settings, will enhance the broader understanding and development of more practical and reliable VLMs.
+
+ comment: Code: https://github.com/AtsuMiyai/UPD +
+
+
+
+
+ + ☆ Are We on the Right Way for Evaluating Large Vision-Language Models? + + +
+ Large vision-language models (LVLMs) have recently achieved rapid progress, sparking numerous studies to evaluate their multi-modal capabilities. However, we dig into current evaluation works and identify two primary issues: 1) Visual content is unnecessary for many samples. The answers can be directly inferred from the questions and options, or from the world knowledge embedded in LLMs. This phenomenon is prevalent across current benchmarks. For instance, GeminiPro achieves 42.9% on the MMMU benchmark without any visual input, and outperforms the random-choice baseline by over 20% on average across six benchmarks. 2) Unintentional data leakage exists in LLM and LVLM training. LLMs and LVLMs can still answer some visual-necessary questions without visual content, indicating memorization of these samples within the large-scale training data. For example, Sphinx-X-MoE gets 43.6% on MMMU without accessing images, surpassing its LLM backbone by 17.9%. Both problems lead to misjudgments of actual multi-modal gains and potentially misguide the study of LVLMs. To this end, we present MMStar, an elite vision-indispensable multi-modal benchmark comprising 1,500 samples meticulously selected by humans. MMStar benchmarks 6 core capabilities and 18 detailed axes, aiming to evaluate LVLMs' multi-modal capacities with carefully balanced and purified samples. These samples are first roughly selected from current benchmarks with an automated pipeline; human review is then involved to ensure each curated sample exhibits visual dependency, minimal data leakage, and requires advanced multi-modal capabilities. Moreover, two metrics are developed to measure data leakage and actual performance gain in multi-modal training. We evaluate 16 leading LVLMs on MMStar to assess their multi-modal capabilities, and on 7 benchmarks with the proposed metrics to investigate their data leakage and actual multi-modal gain.
+
+ comment: Project page: https://mmstar-benchmark.github.io/ +
+
+
+
+
+ + ☆ MTLoRA: A Low-Rank Adaptation Approach for Efficient Multi-Task Learning CVPR + + +
+ Adapting models pre-trained on large-scale datasets to a variety of +downstream tasks is a common strategy in deep learning. Consequently, +parameter-efficient fine-tuning methods have emerged as a promising way to +adapt pre-trained models to different tasks while training only a minimal +number of parameters. While most of these methods are designed for single-task +adaptation, parameter-efficient training in Multi-Task Learning (MTL) +architectures is still unexplored. In this paper, we introduce MTLoRA, a novel +framework for parameter-efficient training of MTL models. MTLoRA employs +Task-Agnostic and Task-Specific Low-Rank Adaptation modules, which effectively +disentangle the parameter space in MTL fine-tuning, thereby enabling the model +to adeptly handle both task specialization and interaction within MTL contexts. +We applied MTLoRA to hierarchical-transformer-based MTL architectures, adapting +them to multiple downstream dense prediction tasks. Our extensive experiments +on the PASCAL dataset show that MTLoRA achieves higher accuracy on downstream +tasks compared to fully fine-tuning the MTL model while reducing the number of +trainable parameters by 3.6x. Furthermore, MTLoRA establishes a Pareto-optimal +trade-off between the number of trainable parameters and the accuracy of the +downstream tasks, outperforming current state-of-the-art parameter-efficient +training methods in both accuracy and efficiency. Our code is publicly +available. + +
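As a rough illustration of the kind of mechanism described above (not the MTLoRA code), the following PyTorch sketch wraps a frozen linear layer with one shared (task-agnostic) and several per-task (task-specific) low-rank adapters; the rank, scaling, and module names are assumptions.

```python
import torch
import torch.nn as nn

class LowRankAdapter(nn.Module):
    """y = (x @ A @ B) * scale, with A: (in, r) and B: (r, out)."""
    def __init__(self, in_dim, out_dim, rank=4, scale=1.0):
        super().__init__()
        self.A = nn.Parameter(torch.randn(in_dim, rank) * 0.01)
        self.B = nn.Parameter(torch.zeros(rank, out_dim))  # zero init: adapter starts as identity
        self.scale = scale

    def forward(self, x):
        return (x @ self.A @ self.B) * self.scale

class MultiTaskLoRALinear(nn.Module):
    """Frozen base linear layer plus a shared adapter and per-task adapters."""
    def __init__(self, base: nn.Linear, tasks, rank=4):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False  # only the adapters are trained
        self.shared = LowRankAdapter(base.in_features, base.out_features, rank)
        self.task_specific = nn.ModuleDict({
            t: LowRankAdapter(base.in_features, base.out_features, rank) for t in tasks
        })

    def forward(self, x, task: str):
        return self.base(x) + self.shared(x) + self.task_specific[task](x)

# Toy usage: two dense-prediction tasks sharing one frozen base projection.
layer = MultiTaskLoRALinear(nn.Linear(256, 256), tasks=["seg", "depth"])
out = layer(torch.randn(8, 256), task="seg")
```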
+
+ comment: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR), 2024 +
+
+
+
+
+ + ☆ SeaBird: Segmentation in Bird's View with Dice Loss Improves Monocular + 3D Detection of Large Objects CVPR 2024 + + +
+ Monocular 3D detectors achieve remarkable performance on cars and smaller objects. However, their performance drops on larger objects, leading to fatal accidents. Some attribute the failures to training data scarcity or to the large receptive field requirements of large objects. In this paper, we highlight this understudied problem of generalization to large objects. We find that modern frontal detectors struggle to generalize to large objects even on nearly balanced datasets. We argue that the cause of failure is the sensitivity of depth regression losses to the noise of larger objects. To bridge this gap, we comprehensively investigate regression and dice losses, examining their robustness under varying error levels and object sizes. We mathematically prove, for a simplified case, that the dice loss leads to superior noise-robustness and model convergence for large objects compared to regression losses. Leveraging our theoretical insights, we propose SeaBird (Segmentation in Bird's View) as the first step towards generalizing to large objects. SeaBird effectively integrates BEV segmentation on foreground objects for 3D detection, with the segmentation head trained with the dice loss. SeaBird achieves SoTA results on the KITTI-360 leaderboard and improves existing detectors on the nuScenes leaderboard, particularly for large objects. Code and models at https://github.com/abhi1kumar/SeaBird
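For readers unfamiliar with the loss family discussed above, here is a minimal, generic soft dice loss over BEV segmentation maps in PyTorch; the tensor shapes are illustrative assumptions and this is not the SeaBird implementation.

```python
import torch

def soft_dice_loss(logits: torch.Tensor, target: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """Generic soft dice loss.

    logits: (B, C, H, W) raw scores over a BEV grid; target: (B, C, H, W) binary masks.
    """
    probs = torch.sigmoid(logits)
    dims = (2, 3)  # sum over the spatial BEV grid
    intersection = (probs * target).sum(dims)
    cardinality = probs.sum(dims) + target.sum(dims)
    dice = (2.0 * intersection + eps) / (cardinality + eps)
    return 1.0 - dice.mean()

# Toy usage: 3 foreground classes on a 64x64 BEV grid.
logits = torch.randn(2, 3, 64, 64)
target = (torch.rand(2, 3, 64, 64) > 0.5).float()
print(soft_dice_loss(logits, target))
```

Because the dice score normalizes by the predicted and target areas, a fixed amount of boundary noise costs proportionally less for large objects than a per-pixel or per-box regression penalty, which is the intuition behind the robustness claim above.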
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Convolutional Prompting meets Language Models for Continual Learning CVPR 2024 + + +
+ Continual Learning (CL) enables machine learning models to learn from continuously shifting new training data in the absence of data from old tasks. Recently, pretrained vision transformers combined with prompt tuning have shown promise for overcoming catastrophic forgetting in CL. These approaches rely on a pool of learnable prompts, which can be inefficient in sharing knowledge across tasks, leading to inferior performance. In addition, the lack of fine-grained, layer-specific prompts prevents them from fully exploiting the strength of prompting for CL. We address these limitations by proposing ConvPrompt, a novel convolutional prompt creation mechanism that maintains layer-wise shared embeddings, enabling both layer-specific learning and better concept transfer across tasks. The intelligent use of convolution enables us to maintain a low parameter overhead without compromising performance. We further leverage Large Language Models to generate fine-grained text descriptions of each category, which are used to estimate task similarity and dynamically decide the number of prompts to be learned. Extensive experiments demonstrate the superiority of ConvPrompt, which improves SOTA by ~3% with significantly less parameter overhead. We also perform strong ablations over various modules to disentangle the importance of different components.
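A hypothetical sketch of convolutional prompt creation in the spirit described above: layer-wise prompts are generated by per-layer 1D convolutions over a shared embedding. All names and shapes are assumptions, not the ConvPrompt code.

```python
import torch
import torch.nn as nn

class ConvPromptGenerator(nn.Module):
    """Generate one prompt per transformer layer from a shared embedding via 1D convs."""
    def __init__(self, num_layers=12, prompt_len=8, embed_dim=768, kernel_size=3):
        super().__init__()
        self.shared = nn.Parameter(torch.randn(prompt_len, embed_dim) * 0.02)
        self.convs = nn.ModuleList([
            nn.Conv1d(embed_dim, embed_dim, kernel_size, padding=kernel_size // 2)
            for _ in range(num_layers)
        ])

    def forward(self):
        # shared: (L_p, D) -> (1, D, L_p), convolve along the prompt-length axis per layer
        x = self.shared.t().unsqueeze(0)
        return [conv(x).squeeze(0).t() for conv in self.convs]  # list of (L_p, D) prompts

gen = ConvPromptGenerator()
layer_prompts = gen()  # each entry would be prepended to that layer's token sequence
```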
+
+ comment: CVPR 2024 Camera Ready +
+
+
+
+
+ + ☆ Learn "No" to Say "Yes" Better: Improving Vision-Language Models via + Negations + + +
+ Existing vision-language models (VLMs) treat text descriptions as a unit, confusing individual concepts in a prompt and impairing visual semantic matching and reasoning. An important aspect of reasoning in logic and language is negation. This paper highlights the limitations of popular VLMs such as CLIP in understanding the implications of negations, i.e., the effect of the word "not" in a given prompt. To enable evaluation of VLMs on fluent prompts with negations, we present CC-Neg, a dataset containing 228,246 images, true captions and their corresponding negated captions. Using CC-Neg along with modifications to the contrastive loss of CLIP, our proposed CoN-CLIP framework has an improved understanding of negations. This training paradigm improves CoN-CLIP's ability to encode semantics reliably, resulting in a 3.85% average gain in top-1 accuracy for zero-shot image classification across 8 datasets. Further, CoN-CLIP outperforms CLIP on challenging compositionality benchmarks such as SugarCREPE by 4.4%, showcasing emergent compositional understanding of objects, relations, and attributes in text. Overall, our work addresses a crucial limitation of VLMs by introducing a dataset and framework that strengthens semantic associations between images and text, demonstrating improved large-scale foundation models with significantly reduced computational cost, promoting efficiency and accessibility.
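The abstract does not spell out the modified contrastive objective, so the following is only an assumed sketch of how negated captions could enter a CLIP-style InfoNCE loss as extra hard negatives; embeddings, temperature, and shapes are placeholders.

```python
import torch
import torch.nn.functional as F

def contrastive_loss_with_negations(img, txt, neg_txt, temperature=0.07):
    """img, txt, neg_txt: (B, D) L2-normalized embeddings.

    Standard image-to-text InfoNCE where, in addition to the other captions in the
    batch, each image treats the embedding of its own negated caption as a hard negative.
    """
    logits_pos = img @ txt.t() / temperature                            # (B, B), positives on the diagonal
    logits_neg = (img * neg_txt).sum(-1, keepdim=True) / temperature    # (B, 1), negated caption
    logits = torch.cat([logits_pos, logits_neg], dim=1)                 # (B, B+1)
    labels = torch.arange(img.size(0), device=img.device)
    return F.cross_entropy(logits, labels)

# Toy usage with random embeddings.
B, D = 16, 512
img = F.normalize(torch.randn(B, D), dim=-1)
txt = F.normalize(torch.randn(B, D), dim=-1)
neg = F.normalize(torch.randn(B, D), dim=-1)
print(contrastive_loss_with_negations(img, txt, neg))
```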
+
+ comment: 14 pages + 6 figures in main manuscript (excluding references) +
+
+
+
+
+ + ☆ InstantSplat: Unbounded Sparse-view Pose-free Gaussian Splatting in 40 + Seconds + + +
+ While novel view synthesis (NVS) has made substantial progress in 3D computer vision, it typically requires an initial estimation of camera intrinsics and extrinsics from dense viewpoints. This pre-processing is usually conducted via a Structure-from-Motion (SfM) pipeline, a procedure that can be slow and unreliable, particularly in sparse-view scenarios with insufficient matched features for accurate reconstruction. In this work, we integrate the strengths of point-based representations (e.g., 3D Gaussian Splatting, 3D-GS) with end-to-end dense stereo models (DUSt3R) to tackle the complex yet unresolved issues in NVS under unconstrained settings, which encompass pose-free and sparse-view challenges. Our framework, InstantSplat, unifies dense stereo priors with 3D-GS to build 3D Gaussians of large-scale scenes from sparse-view & pose-free images in less than 1 minute. Specifically, InstantSplat comprises a Coarse Geometric Initialization (CGI) module that swiftly establishes a preliminary scene structure and camera parameters across all training views, utilizing globally-aligned 3D point maps derived from a pre-trained dense stereo pipeline. This is followed by the Fast 3D-Gaussian Optimization (F-3DGO) module, which jointly optimizes the 3D Gaussian attributes and the initialized poses with pose regularization. Experiments conducted on the large-scale outdoor Tanks & Temples datasets demonstrate that InstantSplat significantly improves SSIM (by 32%) while concurrently reducing Absolute Trajectory Error (ATE) by 80%. These results establish InstantSplat as a viable solution for scenarios involving pose-free and sparse-view conditions. Project page: instantsplat.github.io.
+
+
+
+
+ + ☆ Benchmarking Counterfactual Image Generation + + +
+ Counterfactual image generation is pivotal for understanding the causal relations of variables, with applications in interpretability and the generation of unbiased synthetic data. However, evaluating image generation is a long-standing challenge in itself. The need to evaluate counterfactual generation compounds this challenge, precisely because counterfactuals, by definition, are hypothetical scenarios without observable ground truths. In this paper, we present a novel comprehensive framework aimed at benchmarking counterfactual image generation methods. We incorporate metrics that focus on evaluating diverse aspects of counterfactuals, such as composition, effectiveness, minimality of interventions, and image realism. We assess the performance of three distinct conditional image generation model types, based on the Structural Causal Model paradigm. Our work is accompanied by a user-friendly Python package which allows users to further evaluate and benchmark existing and future counterfactual image generation methods. Our framework is extendable to additional SCM and other causal methods, generative models, and datasets.
+
+
+
+
+ + ☆ Snap-it, Tap-it, Splat-it: Tactile-Informed 3D Gaussian Splatting for + Reconstructing Challenging Surfaces + + +
+ Touch and vision go hand in hand, mutually enhancing our ability to +understand the world. From a research perspective, the problem of mixing touch +and vision is underexplored and presents interesting challenges. To this end, +we propose Tactile-Informed 3DGS, a novel approach that incorporates touch data +(local depth maps) with multi-view vision data to achieve surface +reconstruction and novel view synthesis. Our method optimises 3D Gaussian +primitives to accurately model the object's geometry at points of contact. By +creating a framework that decreases the transmittance at touch locations, we +achieve a refined surface reconstruction, ensuring a uniformly smooth depth +map. Touch is particularly useful when considering non-Lambertian objects (e.g. +shiny or reflective surfaces) since contemporary methods tend to fail to +reconstruct with fidelity specular highlights. By combining vision and tactile +sensing, we achieve more accurate geometry reconstructions with fewer images +than prior methods. We conduct evaluation on objects with glossy and reflective +surfaces and demonstrate the effectiveness of our approach, offering +significant improvements in reconstruction quality. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ CATSNet: a context-aware network for Height Estimation in a Forested + Area based on Pol-TomoSAR data + + +
+ Tropical forests are a key component of the global carbon cycle. With plans +for upcoming space-borne missions like BIOMASS to monitor forestry, several +airborne missions, including TropiSAR and AfriSAR campaigns, have been +successfully launched and experimented. Typical Synthetic Aperture Radar +Tomography (TomoSAR) methods involve complex models with low accuracy and high +computation costs. In recent years, deep learning methods have also gained +attention in the TomoSAR framework, showing interesting performance. Recently, +a solution based on a fully connected Tomographic Neural Network (TSNN) has +demonstrated its effectiveness in accurately estimating forest and ground +heights by exploiting the pixel-wise elements of the covariance matrix derived +from TomoSAR data. This work instead goes beyond the pixel-wise approach to +define a context-aware deep learning-based solution named CATSNet. A +convolutional neural network is considered to leverage patch-based information +and extract features from a neighborhood rather than focus on a single pixel. +The training is conducted by considering TomoSAR data as the input and Light +Detection and Ranging (LiDAR) values as the ground truth. The experimental +results show striking advantages in both performance and generalization ability +by leveraging context information within Multiple Baselines (MB) TomoSAR data +across different polarimetric modalities, surpassing existing techniques. + +
+
+ comment: Submitted to IEEE TGRS, under review +
+
+
+
+
+ + ☆ Draw-and-Understand: Leveraging Visual Prompts to Enable MLLMs to + Comprehend What You Want + + +
+ The interaction between humans and artificial intelligence (AI) is a crucial +factor that reflects the effectiveness of multimodal large language models +(MLLMs). However, current MLLMs primarily focus on image-level comprehension +and limit interaction to textual instructions, thereby constraining their +flexibility in usage and depth of response. In this paper, we introduce the +Draw-and-Understand project: a new model, a multi-domain dataset, and a +challenging benchmark for visual prompting. Specifically, we propose SPHINX-V, +a new end-to-end trained Multimodal Large Language Model (MLLM) that connects a +vision encoder, a visual prompt encoder and an LLM for various visual prompts +(points, bounding boxes, and free-form shape) and language understanding. To +advance visual prompting research for MLLMs, we introduce MDVP-Data and +MDVP-Bench. MDVP-Data features a multi-domain dataset containing 1.6M unique +image-visual prompt-text instruction-following samples, including natural +images, document images, OCR images, mobile screenshots, web screenshots, and +multi-panel images. Furthermore, we present MDVP-Bench, a comprehensive and +challenging benchmark to assess a model's capability in understanding visual +prompting instructions. Our experiments demonstrate SPHINX-V's impressive +multimodal interaction capabilities through visual prompting, revealing +significant improvements in detailed pixel-level description and +question-answering abilities. + +
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ☆ Prototype-based Interpretable Breast Cancer Prediction Models: Analysis + and Challenges + + +
+ Deep learning models have achieved high performance in medical applications; however, their adoption in clinical practice is hindered by their black-box nature. Self-explainable models, like prototype-based models, can be especially beneficial as they are interpretable by design. However, if the learnt prototypes are of low quality, then prototype-based models are effectively black boxes. Having high-quality prototypes is a prerequisite for a truly interpretable model. In this work, we propose a prototype evaluation framework for coherence (PEF-C) for quantitatively evaluating the quality of the prototypes based on domain knowledge. We show the use of PEF-C in the context of breast cancer prediction using mammography. Existing works on prototype-based models for breast cancer prediction using mammography have focused on improving the classification performance of prototype-based models compared to black-box models and have evaluated prototype quality through anecdotal evidence. We are the first to go beyond anecdotal evidence and evaluate the quality of the mammography prototypes systematically using our PEF-C. Specifically, we apply three state-of-the-art prototype-based models, ProtoPNet, BRAIxProtoPNet++ and PIP-Net, to mammography images for breast cancer prediction and evaluate these models w.r.t. i) classification performance, and ii) quality of the prototypes, on three public datasets. Our results show that prototype-based models are competitive with black-box models in terms of classification performance, and achieve a higher score in detecting ROIs. However, the quality of the prototypes is not yet sufficient and can be improved in terms of relevance, purity and learning a variety of prototypes. We call on the XAI community to systematically evaluate the quality of prototypes to check their true usability in high-stakes decisions and to improve such models further.
+
+ comment: 21 pages, 5 figures, 3 tables +
+
+
+
+
+ + ☆ Benchmarking the Robustness of Temporal Action Detection Models Against + Temporal Corruptions CVPR2024 + + +
+ Temporal action detection (TAD) aims to locate action positions and recognize +action categories in long-term untrimmed videos. Although many methods have +achieved promising results, their robustness has not been thoroughly studied. +In practice, we observe that temporal information in videos can be occasionally +corrupted, such as missing or blurred frames. Interestingly, existing methods +often incur a significant performance drop even if only one frame is affected. +To formally evaluate the robustness, we establish two temporal corruption +robustness benchmarks, namely THUMOS14-C and ActivityNet-v1.3-C. In this paper, +we extensively analyze the robustness of seven leading TAD methods and obtain +some interesting findings: 1) Existing methods are particularly vulnerable to +temporal corruptions, and end-to-end methods are often more susceptible than +those with a pre-trained feature extractor; 2) Vulnerability mainly comes from +localization error rather than classification error; 3) When corruptions occur +in the middle of an action instance, TAD models tend to yield the largest +performance drop. Besides building a benchmark, we further develop a simple but +effective robust training method to defend against temporal corruptions, +through the FrameDrop augmentation and Temporal-Robust Consistency loss. +Remarkably, our approach not only improves robustness but also yields promising +improvements on clean data. We believe that this study will serve as a +benchmark for future research in robust video analysis. Source code and models +are available at https://github.com/Alvin-Zeng/temporal-robustness-benchmark. + +
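A minimal sketch of a frame-dropping corruption on a per-frame feature sequence, in the spirit of the FrameDrop augmentation mentioned above; the paper's exact augmentation and Temporal-Robust Consistency loss may differ.

```python
import torch

def frame_drop(features: torch.Tensor, drop_prob: float = 0.1) -> torch.Tensor:
    """features: (T, D) per-frame features.

    Randomly corrupt frames by replacing them with the previous frame,
    simulating missing or blurred frames during training.
    """
    T = features.size(0)
    out = features.clone()
    drop_mask = torch.rand(T) < drop_prob
    drop_mask[0] = False  # keep the first frame intact
    for t in range(1, T):
        if drop_mask[t]:
            out[t] = out[t - 1]
    return out

feats = torch.randn(128, 256)
aug = frame_drop(feats, drop_prob=0.15)
# A consistency term could then penalize the difference between predictions
# on the clean and corrupted sequences, e.g. an L2 or KL divergence.
```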
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ MedCLIP-SAM: Bridging Text and Image Towards Universal Medical Image + Segmentation + + +
+ Medical image segmentation of anatomical structures and pathology is crucial +in modern clinical diagnosis, disease study, and treatment planning. To date, +great progress has been made in deep learning-based segmentation techniques, +but most methods still lack data efficiency, generalizability, and +interactability. Consequently, the development of new, precise segmentation +methods that demand fewer labeled datasets is of utmost importance in medical +image analysis. Recently, the emergence of foundation models, such as CLIP and +Segment-Anything-Model (SAM), with comprehensive cross-domain representation +opened the door for interactive and universal image segmentation. However, +exploration of these models for data-efficient medical image segmentation is +still limited, but is highly necessary. In this paper, we propose a novel +framework, called MedCLIP-SAM that combines CLIP and SAM models to generate +segmentation of clinical scans using text prompts in both zero-shot and weakly +supervised settings. To achieve this, we employed a new Decoupled Hard Negative +Noise Contrastive Estimation (DHN-NCE) loss to fine-tune the BiomedCLIP model +and the recent gScoreCAM to generate prompts to obtain segmentation masks from +SAM in a zero-shot setting. Additionally, we explored the use of zero-shot +segmentation labels in a weakly supervised paradigm to improve the segmentation +quality further. By extensively testing three diverse segmentation tasks and +medical image modalities (breast tumor ultrasound, brain tumor MRI, and lung +X-ray), our proposed framework has demonstrated excellent accuracy. + +
+
+ comment: 10 pages, 2 figures +
+
+
+
+
+ + ☆ Latent Embedding Clustering for Occlusion Robust Head Pose Estimation + + +
+ Head pose estimation has become a crucial area of research in computer vision +given its usefulness in a wide range of applications, including robotics, +surveillance, or driver attention monitoring. One of the most difficult +challenges in this field is managing head occlusions that frequently take place +in real-world scenarios. In this paper, we propose a novel and efficient +framework that is robust in real world head occlusion scenarios. In particular, +we propose an unsupervised latent embedding clustering with regression and +classification components for each pose angle. The model optimizes latent +feature representations for occluded and non-occluded images through a +clustering term while improving fine-grained angle predictions. Experimental +evaluation on in-the-wild head pose benchmark datasets reveal competitive +performance in comparison to state-of-the-art methodologies with the advantage +of having a significant data reduction. We observe a substantial improvement in +occluded head pose estimation. Also, an ablation study is conducted to +ascertain the impact of the clustering term within our proposed framework. + +
+
+ comment: Accepted at 18th IEEE International Conference on Automatic Face and + Gesture Recognition (FG'24) +
+
+
+
+
+ + ☆ Relation Rectification in Diffusion Model + + +
+ Despite their exceptional generative abilities, large text-to-image diffusion +models, much like skilled but careless artists, often struggle with accurately +depicting visual relationships between objects. This issue, as we uncover +through careful analysis, arises from a misaligned text encoder that struggles +to interpret specific relationships and differentiate the logical order of +associated objects. To resolve this, we introduce a novel task termed Relation +Rectification, aiming to refine the model to accurately represent a given +relationship it initially fails to generate. To address this, we propose an +innovative solution utilizing a Heterogeneous Graph Convolutional Network +(HGCN). It models the directional relationships between relation terms and +corresponding objects within the input prompts. Specifically, we optimize the +HGCN on a pair of prompts with identical relational words but reversed object +orders, supplemented by a few reference images. The lightweight HGCN adjusts +the text embeddings generated by the text encoder, ensuring the accurate +reflection of the textual relation in the embedding space. Crucially, our +method retains the parameters of the text encoder and diffusion model, +preserving the model's robust performance on unrelated descriptions. We +validated our approach on a newly curated dataset of diverse relational data, +demonstrating both quantitative and qualitative enhancements in generating +images with precise visual relations. Project page: +https://wuyinwei-hah.github.io/rrnet.github.io/. + +
+
+
+
+
+ + ☆ Long-Tailed Anomaly Detection with Learnable Class Names CVPR 2024 + + +
+ Anomaly detection (AD) aims to identify defective images and localize their +defects (if any). Ideally, AD models should be able to detect defects over many +image classes; without relying on hard-coded class names that can be +uninformative or inconsistent across datasets; learn without anomaly +supervision; and be robust to the long-tailed distributions of real-world +applications. To address these challenges, we formulate the problem of +long-tailed AD by introducing several datasets with different levels of class +imbalance and metrics for performance evaluation. We then propose a novel +method, LTAD, to detect defects from multiple and long-tailed classes, without +relying on dataset class names. LTAD combines AD by reconstruction and semantic +AD modules. AD by reconstruction is implemented with a transformer-based +reconstruction module. Semantic AD is implemented with a binary classifier, +which relies on learned pseudo class names and a pretrained foundation model. +These modules are learned over two phases. Phase 1 learns the pseudo-class +names and a variational autoencoder (VAE) for feature synthesis that augments +the training data to combat long-tails. Phase 2 then learns the parameters of +the reconstruction and classification modules of LTAD. Extensive experiments +using the proposed long-tailed datasets show that LTAD substantially +outperforms the state-of-the-art methods for most forms of dataset imbalance. +The long-tailed dataset split is available at +https://zenodo.org/records/10854201 . + +
+
+ comment: This paper is accepted to CVPR 2024. The supplementary material is + included. The long-tailed dataset split is available at + https://zenodo.org/records/10854201 +
+
+
+
+
+ + ☆ U-VAP: User-specified Visual Appearance Personalization via Decoupled + Self Augmentation + + +
+ Concept personalization methods enable large text-to-image models to learn specific subjects (e.g., objects/poses/3D models) and synthesize renditions in new contexts. Given that the image references are highly biased towards visual attributes, state-of-the-art personalization models tend to overfit the whole subject and cannot disentangle visual characteristics in pixel space. In this study, we propose a more challenging setting, namely fine-grained visual appearance personalization. Different from existing methods, we allow users to provide a sentence describing the desired attributes. A novel decoupled self-augmentation strategy is proposed to generate target-related and non-target samples to learn user-specified visual attributes. These augmented data allow for refining the model's understanding of the target attribute while mitigating the impact of unrelated attributes. At the inference stage, adjustments are conducted in the semantic space through the learned target and non-target embeddings to further enhance the disentanglement of target attributes. Extensive experiments on various kinds of visual attributes with SOTA personalization methods show the ability of the proposed method to mimic target visual appearance in novel contexts, thus improving the controllability and flexibility of personalization.
+
+ comment: 14 pages, 13 figures, 2 tables +
+
+
+
+
+ + ☆ MTMMC: A Large-Scale Real-World Multi-Modal Camera Tracking Benchmark CVPR 2024 + + +
+ Multi-target multi-camera tracking is a crucial task that involves +identifying and tracking individuals over time using video streams from +multiple cameras. This task has practical applications in various fields, such +as visual surveillance, crowd behavior analysis, and anomaly detection. +However, due to the difficulty and cost of collecting and labeling data, +existing datasets for this task are either synthetically generated or +artificially constructed within a controlled camera network setting, which +limits their ability to model real-world dynamics and generalize to diverse +camera configurations. To address this issue, we present MTMMC, a real-world, +large-scale dataset that includes long video sequences captured by 16 +multi-modal cameras in two different environments - campus and factory - across +various time, weather, and season conditions. This dataset provides a +challenging test-bed for studying multi-camera tracking under diverse +real-world complexities and includes an additional input modality of spatially +aligned and temporally synchronized RGB and thermal cameras, which enhances the +accuracy of multi-camera tracking. MTMMC is a super-set of existing datasets, +benefiting independent fields such as person detection, re-identification, and +multiple object tracking. We provide baselines and new learning setups on this +dataset and set the reference scores for future studies. The datasets, models, +and test server will be made publicly available. + +
+
+ comment: Accepted on CVPR 2024 +
+
+
+
+
+ + ☆ H2RSVLM: Towards Helpful and Honest Remote Sensing Large Vision Language + Model + + +
+ Generic large Vision-Language Models (VLMs) are rapidly developing, but still perform poorly in the Remote Sensing (RS) domain, due to the unique and specialized nature of RS imagery and the comparatively limited spatial perception of current VLMs. Existing Remote Sensing specific Vision Language Models (RSVLMs) still have considerable potential for improvement, primarily owing to the lack of large-scale, high-quality RS vision-language datasets. We constructed HqDC-1.4M, a large-scale dataset of high-quality and detailed captions for RS images, containing 1.4 million image-caption pairs, which not only enhance the RSVLM's understanding of RS images but also significantly improve the model's spatial perception abilities, such as localization and counting, thereby increasing the helpfulness of the RSVLM. Moreover, to address the inevitable "hallucination" problem in RSVLMs, we developed RSSA, the first dataset aimed at enhancing the self-awareness capability of RSVLMs. By incorporating a variety of unanswerable questions into typical RS visual question-answering tasks, RSSA effectively improves the truthfulness and reduces the hallucinations of the model's outputs, thereby enhancing the honesty of the RSVLM. Based on these datasets, we propose H2RSVLM, the Helpful and Honest Remote Sensing Vision Language Model. H2RSVLM has achieved outstanding performance on multiple RS public datasets and is capable of recognizing and refusing to answer unanswerable questions, effectively mitigating incorrect generations. We will release the code, data and model weights at https://github.com/opendatalab/H2RSVLM .
+
+ comment: Equal contribution: Chao Pang, Jiang Wu; Corresponding author: + Gui-Song Xia, Conghui He +
+
+
+
+
+ + ☆ Enhancing Lithological Mapping with Spatially Constrained Bayesian + Network (SCB-Net): An Approach for Field Data-Constrained Predictions with + Uncertainty Evaluation + + +
+ Geological maps are an extremely valuable source of information for the Earth +sciences. They provide insights into mineral exploration, vulnerability to +natural hazards, and many other applications. These maps are created using +numerical or conceptual models that use geological observations to extrapolate +data. Geostatistical techniques have traditionally been used to generate +reliable predictions that take into account the spatial patterns inherent in +the data. However, as the number of auxiliary variables increases, these +methods become more labor-intensive. Additionally, traditional machine learning +methods often struggle with spatially correlated data and extracting valuable +non-linear information from geoscientific datasets. To address these +limitations, a new architecture called the Spatially Constrained Bayesian +Network (SCB-Net) has been developed. The SCB-Net aims to effectively exploit +the information from auxiliary variables while producing spatially constrained +predictions. It is made up of two parts, the first part focuses on learning +underlying patterns in the auxiliary variables while the second part integrates +ground-truth data and the learned embeddings from the first part. Moreover, to +assess model uncertainty, a technique called Monte Carlo dropout is used as a +Bayesian approximation. The SCB-Net has been applied to two selected areas in +northern Quebec, Canada, and has demonstrated its potential in generating +field-data-constrained lithological maps while allowing assessment of +prediction uncertainty for decision-making. This study highlights the promising +advancements of deep neural networks in geostatistics, particularly in handling +complex spatial feature learning tasks, leading to improved spatial information +techniques. + +
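Monte Carlo dropout, mentioned above as the Bayesian approximation for uncertainty, can be sketched generically as follows; the model and data here are placeholders, not the SCB-Net architecture.

```python
import torch
import torch.nn as nn

def mc_dropout_predict(model: nn.Module, x: torch.Tensor, n_samples: int = 30):
    """Keep dropout active at test time and average several stochastic forward passes."""
    model.eval()
    for m in model.modules():          # re-enable only the dropout layers
        if isinstance(m, nn.Dropout):
            m.train()
    with torch.no_grad():
        preds = torch.stack([model(x).softmax(-1) for _ in range(n_samples)])
    return preds.mean(0), preds.std(0)  # mean prediction and a per-class uncertainty estimate

# Toy usage: a small classifier over placeholder auxiliary-variable features.
model = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Dropout(0.5), nn.Linear(64, 5))
mean, std = mc_dropout_predict(model, torch.randn(4, 16))
```

High per-class standard deviation flags locations where the lithological prediction should be treated cautiously, which is the decision-making use case the abstract describes.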
+
+ comment: 17 pages, 3559 words, 14 figures +
+
+
+
+
+ + ☆ Motion Inversion for Video Customization + + +
+ In this research, we present a novel approach to motion customization in +video generation, addressing the widespread gap in the thorough exploration of +motion representation within video generative models. Recognizing the unique +challenges posed by video's spatiotemporal nature, our method introduces Motion +Embeddings, a set of explicit, temporally coherent one-dimensional embeddings +derived from a given video. These embeddings are designed to integrate +seamlessly with the temporal transformer modules of video diffusion models, +modulating self-attention computations across frames without compromising +spatial integrity. Our approach offers a compact and efficient solution to +motion representation and enables complex manipulations of motion +characteristics through vector arithmetic in the embedding space. Furthermore, +we identify the Temporal Discrepancy in video generative models, which refers +to variations in how different motion modules process temporal relationships +between frames. We leverage this understanding to optimize the integration of +our motion embeddings. Our contributions include the introduction of a tailored +motion embedding for customization tasks, insights into the temporal processing +differences in video models, and a demonstration of the practical advantages +and effectiveness of our method through extensive experiments. + +
+
+ comment: Project Page: https://wileewang.github.io/MotionInversion/
+
+
+
+
+ + ☆ Sketch-to-Architecture: Generative AI-aided Architectural Design + + +
+ Recently, the development of large-scale models has paved the way for various +interdisciplinary research, including architecture. By using generative AI, we +present a novel workflow that utilizes AI models to generate conceptual +floorplans and 3D models from simple sketches, enabling rapid ideation and +controlled generation of architectural renderings based on textual +descriptions. Our work demonstrates the potential of generative AI in the +architectural design process, pointing towards a new direction of +computer-aided architectural design. Our project website is available at: +https://zrealli.github.io/sketch2arc + +
+
+ comment: Pacific Graphics 2023, accepted as Poster +
+
+
+
+
+ + ☆ HARMamba: Efficient Wearable Sensor Human Activity Recognition Based on + Bidirectional Selective SSM + + +
+ Wearable sensor human activity recognition (HAR) is a crucial area of +research in activity sensing. While transformer-based temporal deep learning +models have been extensively studied and implemented, their large number of +parameters present significant challenges in terms of system computing load and +memory usage, rendering them unsuitable for real-time mobile activity +recognition applications. Recently, an efficient hardware-aware state space +model (SSM) called Mamba has emerged as a promising alternative. Mamba +demonstrates strong potential in long sequence modeling, boasts a simpler +network architecture, and offers an efficient hardware-aware design. Leveraging +SSM for activity recognition represents an appealing avenue for exploration. In +this study, we introduce HARMamba, which employs a more lightweight selective +SSM as the foundational model architecture for activity recognition. The goal +is to address the computational resource constraints encountered in real-time +activity recognition scenarios. Our approach involves processing sensor data +flow by independently learning each channel and segmenting the data into +"patches". The marked sensor sequence's position embedding serves as the input +token for the bidirectional state space model, ultimately leading to activity +categorization through the classification head. Compared to established +activity recognition frameworks like Transformer-based models, HARMamba +achieves superior performance while also reducing computational and memory +overhead. Furthermore, our proposed method has been extensively tested on four +public activity datasets: PAMAP2, WISDM, UNIMIB, and UCI, demonstrating +impressive performance in activity recognition tasks. + +
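A small sketch of the data-preparation step described above: per-channel sensor streams are split into patches, linearly embedded, and given position embeddings before entering the sequence model. The SSM block itself is omitted, and all dimensions are assumptions rather than the HARMamba configuration.

```python
import torch
import torch.nn as nn

class SensorPatchEmbedding(nn.Module):
    def __init__(self, patch_len=16, embed_dim=64, max_patches=256):
        super().__init__()
        self.patch_len = patch_len
        self.proj = nn.Linear(patch_len, embed_dim)
        self.pos = nn.Parameter(torch.zeros(1, max_patches, embed_dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (B, C, T) raw sensor stream; each channel is patched independently.
        B, C, T = x.shape
        n = T // self.patch_len
        patches = x[:, :, : n * self.patch_len].reshape(B, C * n, self.patch_len)
        tokens = self.proj(patches) + self.pos[:, : C * n]
        return tokens  # (B, C*n, embed_dim), fed to the (bidirectional) sequence model

emb = SensorPatchEmbedding()
tokens = emb(torch.randn(8, 6, 128))  # 6 sensor channels, 128 timesteps -> 48 tokens
```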
+
+
+
+
+ + ☆ MCNet: A crowd density estimation network based on integrating multiscale attention module + +
+ Since existing metro video surveillance systems have not been able to effectively solve the crowd density estimation problem, a Metro Crowd density estimation Network (called MCNet) is proposed to automatically classify the crowd density level of passengers. First, an Integrating Multi-scale Attention (IMA) module is proposed to enhance the ability of plain classifiers to extract semantic crowd texture features, accommodating the characteristics of crowd texture features. The innovation of the IMA module is to fuse dilated convolution, multi-scale feature extraction and an attention mechanism to obtain multi-scale crowd feature activations from a larger receptive field at lower computational cost, and to strengthen the crowd activation state of convolutional features in the top layers. Second, a novel lightweight crowd texture feature extraction network is proposed, which can directly process video frames and automatically extract texture features for crowd density estimation; its faster image processing speed and fewer network parameters make it flexible to deploy on embedded platforms with limited hardware resources. Finally, this paper integrates the IMA module and the lightweight crowd texture feature extraction network to construct MCNet, and validates it on an image classification dataset (Cifar10) and four crowd density datasets (PETS2009, Mall, QUT and SH_METRO) to determine whether MCNet can be a suitable solution for crowd density estimation in metro video surveillance, where there are image processing challenges such as high density, high occlusion, perspective distortion and limited hardware resources.
+
+
+
+
+ + ☆ Unsupervised Tumor-Aware Distillation for Multi-Modal Brain Image + Translation IJCNN 2024 + + +
+ Multi-modal brain images from MRI scans are widely used in clinical diagnosis +to provide complementary information from different modalities. However, +obtaining fully paired multi-modal images in practice is challenging due to +various factors, such as time, cost, and artifacts, resulting in +modality-missing brain images. To address this problem, unsupervised +multi-modal brain image translation has been extensively studied. Existing +methods suffer from the problem of brain tumor deformation during translation, +as they fail to focus on the tumor areas when translating the whole images. In +this paper, we propose an unsupervised tumor-aware distillation teacher-student +network called UTAD-Net, which is capable of perceiving and translating tumor +areas precisely. Specifically, our model consists of two parts: a teacher +network and a student network. The teacher network learns an end-to-end mapping +from source to target modality using unpaired images and corresponding tumor +masks first. Then, the translation knowledge is distilled into the student +network, enabling it to generate more realistic tumor areas and whole images +without masks. Experiments show that our model achieves competitive performance +on both quantitative and qualitative evaluations of image quality compared with +state-of-the-art methods. Furthermore, we demonstrate the effectiveness of the +generated images on downstream segmentation tasks. Our code is available at +https://github.com/scut-HC/UTAD-Net. + +
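A generic teacher-to-student distillation loss of the kind the abstract alludes to; the actual UTAD-Net objectives are not specified here, so the terms, weights, and tensor shapes below are assumptions.

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_out, teacher_out, student_feat, teacher_feat, alpha=0.5):
    """Match the teacher's translated image and an intermediate feature map.

    The teacher (trained with tumor masks) is frozen; gradients flow only to the student.
    """
    pixel_term = F.l1_loss(student_out, teacher_out.detach())
    feat_term = F.mse_loss(student_feat, teacher_feat.detach())
    return pixel_term + alpha * feat_term

# Toy usage with placeholder translated slices and feature maps.
s_out, t_out = torch.randn(2, 1, 64, 64), torch.randn(2, 1, 64, 64)
s_feat, t_feat = torch.randn(2, 128, 16, 16), torch.randn(2, 128, 16, 16)
print(distillation_loss(s_out, t_out, s_feat, t_feat))
```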
+
+ comment: 8 pages, 5 figures. It has been provisionally accepted for IJCNN 2024 +
+
+
+
+
+ + ☆ HGS-Mapping: Online Dense Mapping Using Hybrid Gaussian Representation + in Urban Scenes + + +
+ Online dense mapping of urban scenes forms a fundamental cornerstone for scene understanding and navigation of autonomous vehicles. Recent advancements in mapping methods are mainly based on NeRF, whose rendering speed is too slow to meet online requirements. 3D Gaussian Splatting (3DGS), with its rendering speed hundreds of times faster than NeRF, holds greater potential for online dense mapping. However, integrating 3DGS into a street-view dense mapping framework still faces two challenges, including incomplete reconstruction due to the absence of geometric information beyond the LiDAR coverage area and extensive computation for reconstruction in large urban scenes. To this end, we propose HGS-Mapping, an online dense mapping framework for unbounded large-scale scenes. To attain complete reconstruction, our framework introduces Hybrid Gaussian Representation, which models different parts of the entire scene using Gaussians with distinct properties. Furthermore, we employ a hybrid Gaussian initialization mechanism and an adaptive update method to achieve high-fidelity and rapid reconstruction. To the best of our knowledge, we are the first to integrate Gaussian representation into online dense mapping of urban scenes. Our approach achieves SOTA reconstruction accuracy while employing only 66% of the number of Gaussians, leading to 20% faster reconstruction speed.
+
+
+
+
+ + ☆ Talk3D: High-Fidelity Talking Portrait Synthesis via Personalized 3D + Generative Prior + + +
+ Recent methods for audio-driven talking head synthesis often optimize neural +radiance fields (NeRF) on a monocular talking portrait video, leveraging its +capability to render high-fidelity and 3D-consistent novel-view frames. +However, they often struggle to reconstruct complete face geometry due to the +absence of comprehensive 3D information in the input monocular videos. In this +paper, we introduce a novel audio-driven talking head synthesis framework, +called Talk3D, that can faithfully reconstruct its plausible facial geometries +by effectively adopting the pre-trained 3D-aware generative prior. Given the +personalized 3D generative model, we present a novel audio-guided attention +U-Net architecture that predicts the dynamic face variations in the NeRF space +driven by audio. Furthermore, our model is further modulated by audio-unrelated +conditioning tokens which effectively disentangle variations unrelated to audio +features. Compared to existing methods, our method excels in generating +realistic facial geometries even under extreme head poses. We also conduct +extensive experiments showing our approach surpasses state-of-the-art +benchmarks in terms of both quantitative and qualitative evaluations. + +
+
+ comment: Project page: https://ku-cvlab.github.io/Talk3D/ +
+
+
+
+
+ + ☆ StegoGAN: Leveraging Steganography for Non-Bijective Image-to-Image + Translation + + +
+ Most image-to-image translation models postulate that a unique correspondence +exists between the semantic classes of the source and target domains. However, +this assumption does not always hold in real-world scenarios due to divergent +distributions, different class sets, and asymmetrical information +representation. As conventional GANs attempt to generate images that match the +distribution of the target domain, they may hallucinate spurious instances of +classes absent from the source domain, thereby diminishing the usefulness and +reliability of translated images. CycleGAN-based methods are also known to hide +the mismatched information in the generated images to bypass cycle consistency +objectives, a process known as steganography. In response to the challenge of +non-bijective image translation, we introduce StegoGAN, a novel model that +leverages steganography to prevent spurious features in generated images. Our +approach enhances the semantic consistency of the translated images without +requiring additional postprocessing or supervision. Our experimental +evaluations demonstrate that StegoGAN outperforms existing GAN-based models +across various non-bijective image-to-image translation tasks, both +qualitatively and quantitatively. Our code and pretrained models are accessible +at https://github.com/sian-wusidi/StegoGAN. + +
+
+
+
+
+ + ☆ ECLIPSE: Efficient Continual Learning in Panoptic Segmentation with + Visual Prompt Tuning CVPR 2024 + + +
+ Panoptic segmentation, combining semantic and instance segmentation, stands +as a cutting-edge computer vision task. Despite recent progress with deep +learning models, the dynamic nature of real-world applications necessitates +continual learning, where models adapt to new classes (plasticity) over time +without forgetting old ones (catastrophic forgetting). Current continual +segmentation methods often rely on distillation strategies like knowledge +distillation and pseudo-labeling, which are effective but result in increased +training complexity and computational overhead. In this paper, we introduce a +novel and efficient method for continual panoptic segmentation based on Visual +Prompt Tuning, dubbed ECLIPSE. Our approach involves freezing the base model +parameters and fine-tuning only a small set of prompt embeddings, addressing +both catastrophic forgetting and plasticity and significantly reducing the +trainable parameters. To mitigate inherent challenges such as error propagation +and semantic drift in continual segmentation, we propose logit manipulation to +effectively leverage common knowledge across the classes. Experiments on ADE20K +continual panoptic segmentation benchmark demonstrate the superiority of +ECLIPSE, notably its robustness against catastrophic forgetting and its +reasonable plasticity, achieving a new state-of-the-art. The code is available +at https://github.com/clovaai/ECLIPSE. + +
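A generic sketch of visual prompt tuning as described above: the backbone is frozen and only a small set of prompt embeddings prepended to the token sequence is trained. This illustrates the general mechanism, not the ECLIPSE code or its logit manipulation.

```python
import torch
import torch.nn as nn

class PromptedEncoder(nn.Module):
    def __init__(self, backbone: nn.Module, embed_dim: int, num_prompts: int = 10):
        super().__init__()
        self.backbone = backbone
        for p in self.backbone.parameters():
            p.requires_grad = False                       # the base model stays frozen
        self.prompts = nn.Parameter(torch.zeros(1, num_prompts, embed_dim))
        nn.init.normal_(self.prompts, std=0.02)

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        # tokens: (B, N, D) patch tokens; prepend the learnable prompts
        B = tokens.size(0)
        x = torch.cat([self.prompts.expand(B, -1, -1), tokens], dim=1)
        return self.backbone(x)

# Toy usage with a small transformer encoder standing in for the frozen backbone.
encoder_layer = nn.TransformerEncoderLayer(d_model=256, nhead=8, batch_first=True)
backbone = nn.TransformerEncoder(encoder_layer, num_layers=2)
model = PromptedEncoder(backbone, embed_dim=256, num_prompts=10)
out = model(torch.randn(2, 196, 256))
```

For continual learning, a new set of prompts can be allocated per incremental step while earlier prompts and the backbone remain untouched, which is why this style of tuning limits catastrophic forgetting.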
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Segmentation, Classification and Interpretation of Breast Cancer Medical + Images using Human-in-the-Loop Machine Learning + + +
+ This paper explores the application of Human-in-the-Loop (HITL) strategies in training machine learning models in the medical domain. In this case, a doctor-in-the-loop approach is proposed to leverage human expertise in dealing with large and complex data. Specifically, the paper deals with the integration of genomic data and Whole Slide Imaging (WSI) analysis of breast cancer. Three different tasks were developed: segmentation of histopathological images, classification of these images regarding the genomic subtype of the cancer and, finally, interpretation of the machine learning results. The involvement of a pathologist helped us to develop a better segmentation model and to enhance the explanatory capabilities of the models, but the classification results were suboptimal, highlighting the limitations of this approach: despite involving human experts, complex domains can still pose challenges, and a HITL approach may not always be effective.
+
+
+
+
+ + ☆ Aggregating Local and Global Features via Selective State Spaces Model + for Efficient Image Deblurring + + +
+ Image deblurring is the process of restoring a high-quality image from the corresponding blurred image. Significant progress in this field has been made possible by the emergence of various effective deep learning models, including CNNs and Transformers. However, these methods often face a dilemma between eliminating long-range blur degradation perturbations and maintaining computational efficiency, which hinders their practical application. To address this issue, we propose an efficient image deblurring network that leverages a selective structured state spaces model to aggregate enriched and accurate features. Specifically, we design an aggregate local and global block (ALGBlock) to capture and fuse both local invariant properties and non-local information. The ALGBlock consists of two blocks: (1) The local block models local connectivity using simplified channel attention. (2) The global block captures long-range dependency features with linear complexity through selective structured state spaces. Nevertheless, since image details are local features of images, we accentuate the local branch for restoration by recalibrating its weight when aggregating the two branches for recovery. Experimental results demonstrate that the proposed method outperforms state-of-the-art approaches on widely used benchmarks, highlighting its superior performance.
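A minimal sketch of the simplified channel attention used by the local block, as commonly implemented in restoration networks (global average pooling, a 1x1 projection, and channel-wise reweighting); the exact ALGBlock design may differ.

```python
import torch
import torch.nn as nn

class SimpleChannelAttention(nn.Module):
    """Globally pooled, per-channel reweighting of a feature map."""
    def __init__(self, channels: int):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.proj = nn.Conv2d(channels, channels, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        w = self.proj(self.pool(x))   # (B, C, 1, 1) per-channel weights
        return x * w

block = SimpleChannelAttention(64)
y = block(torch.randn(1, 64, 128, 128))
```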
+
+
+
+
+ + ☆ FreeSeg-Diff: Training-Free Open-Vocabulary Segmentation with Diffusion + Models + + +
+ Foundation models have exhibited unprecedented capabilities in tackling many domains and tasks. Models such as CLIP are currently widely used to bridge cross-modal representations, and text-to-image diffusion models are arguably the leading models in terms of realistic image generation. Image generative models are trained on massive datasets that provide them with powerful internal spatial representations. In this work, we explore the potential benefits of such representations, beyond image generation, in particular for dense visual prediction tasks. We focus on the task of image segmentation, which is traditionally solved by training models on closed-vocabulary datasets with pixel-level annotations. To avoid the annotation cost or training large diffusion models, we constrain our setup to be zero-shot and training-free. In a nutshell, our pipeline leverages different and relatively small-sized, open-source foundation models for zero-shot open-vocabulary segmentation. The pipeline is as follows: the image is passed to both a captioner model (i.e., BLIP) and a diffusion model (i.e., Stable Diffusion) to generate a text description and visual representation, respectively. The features are clustered and binarized to obtain class-agnostic masks for each object. These masks are then mapped to a textual class, using the CLIP model to support open-vocabulary labeling. Finally, we add a refinement step that allows us to obtain a more precise segmentation mask. Our approach (dubbed FreeSeg-Diff), which does not rely on any training, outperforms many training-based approaches on both Pascal VOC and COCO datasets. In addition, we show very competitive results compared to recent weakly-supervised segmentation approaches. We provide comprehensive experiments showing the superiority of diffusion model features compared to other pretrained models. Project page: https://bcorrad.github.io/freesegdiff/
+
+
+
+
+ + ☆ RealKIE: Five Novel Datasets for Enterprise Key Information Extraction + + +
+ We introduce RealKIE, a benchmark of five challenging datasets aimed at +advancing key information extraction methods, with an emphasis on enterprise +applications. The datasets include a diverse range of documents including SEC +S1 Filings, US Non-disclosure Agreements, UK Charity Reports, FCC Invoices, and +Resource Contracts. Each presents unique challenges: poor text serialization, +sparse annotations in long documents, and complex tabular layouts. These +datasets provide a realistic testing ground for key information extraction +tasks like investment analysis and legal data processing. + In addition to presenting these datasets, we offer an in-depth description of +the annotation process, document processing techniques, and baseline modeling +approaches. This contribution facilitates the development of NLP models capable +of handling practical challenges and supports further research into information +extraction technologies applicable to industry-specific problems. + The annotated data and OCR outputs are available to download at +https://indicodatasolutions.github.io/RealKIE/ code to reproduce the baselines +will be available shortly. + +
+
+
+
+
+ + ☆ Modeling Weather Uncertainty for Multi-weather Co-Presence Estimation + + +
+ Images from outdoor scenes may be taken under various weather conditions. It is well studied that weather impacts the performance of computer vision algorithms and needs to be handled properly. However, existing algorithms model weather conditions as a discrete status and estimate them using multi-label classification. The fact is that, physically, and specifically in meteorology, weather is modeled as a continuous and transitional status. Instead of directly implementing hard classification as existing multi-weather classification methods do, we consider the physical formulation of multi-weather conditions and model the impact of physics-related parameters on learning from the image appearance. In this paper, we start with a solid revisit of the physics definition of weather and how it can be described as a continuous machine learning and computer vision task. Namely, we propose to model the weather uncertainty, where the level of probability and the co-existence of multiple weather conditions are both considered. A Gaussian mixture model is used to encapsulate the weather uncertainty, and an uncertainty-aware multi-weather learning scheme is proposed based on prior-posterior learning. A novel multi-weather co-presence estimation transformer (MeFormer) is proposed. In addition, a new multi-weather co-presence estimation (MePe) dataset, along with 14 fine-grained weather categories and 16,078 samples, is proposed to benchmark both the conventional multi-label weather classification task and the multi-weather co-presence estimation task. Large-scale experiments show that the proposed method achieves state-of-the-art performance and substantial generalization capabilities on both the conventional multi-label weather classification task and the proposed multi-weather co-presence estimation task. Besides, modeling weather uncertainty also benefits adverse-weather semantic segmentation.
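As a rough illustration of encapsulating weather uncertainty with a Gaussian mixture, the scikit-learn sketch below fits a GMM over placeholder per-image weather descriptors and reads the per-component posteriors as soft co-presence estimates; the feature extraction step and the number of components are assumptions, not the MeFormer pipeline.

```python
import numpy as np
from sklearn.mixture import GaussianMixture

# Placeholder: per-image weather descriptors (e.g. pooled network features).
features = np.random.randn(500, 8)

gmm = GaussianMixture(n_components=4, covariance_type="full", random_state=0)
gmm.fit(features)

# Soft responsibilities: for each image, a distribution over the 4 weather modes,
# read as a probabilistic co-presence estimate rather than a hard label.
responsibilities = gmm.predict_proba(features)   # shape (500, 4)
print(responsibilities[0])
```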
+
+ comment: Work in progress +
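+ To make the uncertainty idea above concrete, the minimal sketch below
+represents an image's weather state as a posterior over co-present conditions
+under a Gaussian mixture, instead of a hard multi-label decision. The 2-D
+"weather descriptors" are synthetic and the two components are hypothetical
+(e.g., rain and fog); MeFormer itself is not reproduced here.
+
+import numpy as np
+from sklearn.mixture import GaussianMixture
+
+rng = np.random.default_rng(0)
+# Fake per-image descriptors drawn around two weather modes (e.g., rain, fog).
+descriptors = np.vstack([rng.normal([0.8, 0.1], 0.1, (200, 2)),
+                         rng.normal([0.2, 0.7], 0.1, (200, 2))])
+
+gmm = GaussianMixture(n_components=2, covariance_type="full", random_state=0)
+gmm.fit(descriptors)
+
+# Soft co-presence estimate for a new image: posterior over weather components.
+query = np.array([[0.5, 0.4]])
+print(gmm.predict_proba(query))  # both conditions receive non-zero probability
+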
+
+
+
+
+ + ☆ Selective Attention-based Modulation for Continual Learning + + +
+ We present SAM, a biologically-plausible selective attention-driven
+modulation approach to enhance classification models in a continual learning
+setting. Inspired by neurophysiological evidence that the primary visual cortex
+does not contribute to object manifold untangling for categorization and that
+primordial attention biases are still embedded in the modern brain, we propose
+to employ auxiliary saliency prediction features as a modulation signal to
+drive and stabilize the learning of a sequence of non-i.i.d. classification
+tasks. Experimental results confirm that SAM effectively enhances the
+performance (in some cases by up to about twenty percentage points) of
+state-of-the-art continual learning methods, both in class-incremental and
+task-incremental settings. Moreover, we show that attention-based modulation
+successfully encourages the learning of features that are more robust to the
+presence of spurious features and to adversarial attacks than baseline methods.
+Code is available at: https://github.com/perceivelab/SAM.
+
+
+
+
+
+ + ☆ Mixed-precision Supernet Training from Vision Foundation Models using + Low Rank Adapter + + +
+ Compression of large and performant vision foundation models (VFMs) into
+arbitrary bit-wise operations (BitOPs) allows their deployment on various
+hardware. We propose to fine-tune a VFM to a mixed-precision quantized
+supernet. The supernet-based neural architecture search (NAS) can be adopted
+for this purpose, which trains a supernet, from which subnets within arbitrary
+hardware budgets can then be extracted. However, existing methods face
+difficulties in optimizing the mixed-precision search space and incur large
+memory costs during training. To tackle these challenges, first, we study the
+effective search space design for fine-tuning a VFM by comparing different
+operators (such as resolution, feature size, width, depth, and bit-widths) in
+terms of performance and BitOPs reduction. Second, we propose memory-efficient
+supernet training using a low-rank adapter (LoRA) and a progressive training
+strategy. The proposed method is evaluated for the recently proposed VFM,
+Segment Anything Model, fine-tuned on segmentation tasks. The searched model
+yields about a 95% reduction in BitOPs without incurring performance
+degradation.
+
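+ The memory-saving ingredient referenced above, a low-rank adapter wrapped
+around a frozen linear layer, can be sketched as follows. The rank, scaling,
+and placement inside the supernet are assumptions for illustration, not the
+paper's exact configuration.
+
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    def __init__(self, base: nn.Linear, rank: int = 8, alpha: float = 16.0):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():      # pretrained weights stay frozen
+            p.requires_grad = False
+        self.lora_a = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
+        self.lora_b = nn.Parameter(torch.zeros(base.out_features, rank))
+        self.scale = alpha / rank
+
+    def forward(self, x):
+        # Frozen path plus the trainable low-rank update.
+        return self.base(x) + (x @ self.lora_a.T) @ self.lora_b.T * self.scale
+
+layer = LoRALinear(nn.Linear(256, 256))
+out = layer(torch.randn(4, 256))
+trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
+print(out.shape, trainable)   # only the small A/B matrices are trainable
+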
+
+
+
+
+ + ☆ SGD: Street View Synthesis with Gaussian Splatting and Diffusion Prior + + +
+ Novel View Synthesis (NVS) for street scenes plays a critical role in
+autonomous driving simulation. The current mainstream technique to achieve it
+is neural rendering, such as Neural Radiance Fields (NeRF) and 3D Gaussian
+Splatting (3DGS). Although thrilling progress has been made, when handling
+street scenes, current methods struggle to maintain rendering quality at
+viewpoints that deviate significantly from the training viewpoints. This issue
+stems from the sparse training views captured by a fixed camera on a moving
+vehicle. To tackle this problem, we propose a novel approach that enhances the
+capacity of 3DGS by leveraging a prior from a Diffusion Model along with
+complementary multi-modal data. Specifically, we first fine-tune a Diffusion
+Model by adding images from adjacent frames as conditions, while exploiting
+depth data from LiDAR point clouds to supply additional spatial information.
+Then we apply the Diffusion Model to regularize the 3DGS at unseen views during
+training. Experimental results validate the effectiveness of our method
+compared with current state-of-the-art models, and demonstrate its advantage in
+rendering images from broader views.
+
+
+
+
+
+ + ☆ Negative Label Guided OOD Detection with Pretrained Vision-Language + Models ICLR 2024 + + +
+ Out-of-distribution (OOD) detection aims at identifying samples from unknown
+classes, playing a crucial role in making models trustworthy against errors on
+unexpected inputs. Extensive research has been dedicated to exploring OOD
+detection in the vision modality. Vision-language models (VLMs) can leverage
+both textual and visual information for various multi-modal applications,
+whereas few OOD detection methods take into account information from the text
+modality. In this paper, we propose a novel post hoc OOD detection method,
+called NegLabel, which takes a vast number of negative labels from extensive
+corpus databases. We design a novel scheme for the OOD score that collaborates
+with negative labels. Theoretical analysis helps to understand the mechanism of
+negative labels. Extensive experiments demonstrate that our method NegLabel
+achieves state-of-the-art performance on various OOD detection benchmarks and
+generalizes well on multiple VLM architectures. Furthermore, our method
+NegLabel exhibits remarkable robustness against diverse domain shifts. The
+codes are available at https://github.com/tmlr-group/NegLabel.
+
+
+ comment: ICLR 2024 Spotlight +
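+ One way to picture a negative-label score, sketched below under loose
+assumptions: compare an image embedding against in-distribution class prompts
+and a large pool of negative-label prompts, then use the share of softmax mass
+falling on the in-distribution side as the score. The embeddings are random
+stand-ins for CLIP features, and this is not the authors' exact scoring
+function.
+
+import torch
+import torch.nn.functional as F
+
+def neg_label_score(img_emb, id_text_emb, neg_text_emb, temperature=0.01):
+    img = F.normalize(img_emb, dim=-1)
+    sims_id = img @ F.normalize(id_text_emb, dim=-1).T / temperature
+    sims_neg = img @ F.normalize(neg_text_emb, dim=-1).T / temperature
+    probs = torch.cat([sims_id, sims_neg], dim=-1).softmax(dim=-1)
+    return probs[..., : sims_id.shape[-1]].sum(dim=-1)   # high -> likely in-distribution
+
+img_emb = torch.randn(5, 512)          # 5 test images (stand-in features)
+id_text_emb = torch.randn(10, 512)     # 10 in-distribution class prompts
+neg_text_emb = torch.randn(1000, 512)  # large negative-label pool
+print(neg_label_score(img_emb, id_text_emb, neg_text_emb))
+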
+
+
+
+
+ + ☆ Revolutionizing Disease Diagnosis with simultaneous functional PET/MR + and Deeply Integrated Brain Metabolic, Hemodynamic, and Perfusion Networks + + +
+ Simultaneous functional PET/MR (sf-PET/MR) is a cutting-edge multimodal
+neuroimaging technique. It provides an unprecedented opportunity for
+concurrently monitoring and integrating multifaceted brain networks built by
+spatiotemporally covaried metabolic activity, neural activity, and cerebral
+blood flow (perfusion). Despite its high scientific and clinical value, the
+limited hardware accessibility of PET/MR hinders its applications, let alone
+modern AI-based PET/MR fusion models. Our objective is to develop a clinically
+feasible AI-based disease diagnosis model trained on comprehensive sf-PET/MR
+data that, during inference, allows single-modality input (e.g., PET only)
+while retaining multimodal-level accuracy. To this end, we propose MX-ARM, a
+multimodal MiXture-of-experts Alignment and Reconstruction Model. It is
+modality detachable and exchangeable, allocating different multi-layer
+perceptrons dynamically ("mixture of experts") through learnable weights to
+learn respective representations from different modalities. This design does
+not sacrifice model performance in uni-modal situations. To fully exploit the
+inherently complex and nonlinear relations among modalities while producing
+fine-grained representations for uni-modal inference, we subsequently add a
+modal alignment module to line up a dominant modality (e.g., PET) with the
+representations of auxiliary modalities (MR). We further adopt multimodal
+reconstruction to promote the quality of learned features. Experiments on
+precious multimodal sf-PET/MR data for Mild Cognitive Impairment diagnosis
+showcase the efficacy of our model toward clinically feasible precision
+medicine.
+
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Embracing Unknown Step by Step: Towards Reliable Sparse Training in Real + World + + +
+ Sparse training has emerged as a promising method for resource-efficient deep +neural networks (DNNs) in real-world applications. However, the reliability of +sparse models remains a crucial concern, particularly in detecting unknown +out-of-distribution (OOD) data. This study addresses the knowledge gap by +investigating the reliability of sparse training from an OOD perspective and +reveals that sparse training exacerbates OOD unreliability. The lack of unknown +information and the sparse constraints hinder the effective exploration of +weight space and accurate differentiation between known and unknown knowledge. +To tackle these challenges, we propose a new unknown-aware sparse training +method, which incorporates a loss modification, auto-tuning strategy, and a +voting scheme to guide weight space exploration and mitigate confusion between +known and unknown information without incurring significant additional costs or +requiring access to additional OOD data. Theoretical insights demonstrate how +our method reduces model confidence when faced with OOD samples. Empirical +experiments across multiple datasets, model architectures, and sparsity levels +validate the effectiveness of our method, with improvements of up to +\textbf{8.4\%} in AUROC while maintaining comparable or higher accuracy and +calibration. This research enhances the understanding and readiness of sparse +DNNs for deployment in resource-limited applications. Our code is available on: +\url{https://github.com/StevenBoys/MOON}. + +
+
+
+
+
+ + ☆ UltraLight VM-UNet: Parallel Vision Mamba Significantly Reduces + Parameters for Skin Lesion Segmentation + + +
+ Traditionally, most approaches improve segmentation performance by adding
+more complex modules. This is not suitable for the medical field, especially
+for mobile medical devices, where computationally heavy models are impractical
+in real clinical environments due to computational resource constraints.
+Recently, state-space models (SSMs), represented by Mamba, have become a strong
+competitor to traditional CNNs and Transformers. In this paper, we deeply
+explore the key elements of parameter influence in Mamba and propose the
+UltraLight Vision Mamba UNet (UltraLight VM-UNet) based on these findings.
+Specifically, we propose a method for processing features in parallel Vision
+Mamba, named the PVM Layer, which achieves excellent performance with the
+lowest computational load while keeping the overall number of processing
+channels constant. We conducted comparisons and ablation experiments with
+several state-of-the-art lightweight models on three public skin lesion
+datasets and demonstrated that the UltraLight VM-UNet remains strongly
+competitive with only 0.049M parameters and 0.060 GFLOPs. In addition, this
+study deeply explores the key elements of parameter influence in Mamba, laying
+a theoretical foundation for Mamba to possibly become a new mainstream
+lightweight module in the future. The code is available from
+https://github.com/wurenkai/UltraLight-VM-UNet.
+
+
+
+
+
+ + ☆ NeSLAM: Neural Implicit Mapping and Self-Supervised Feature Tracking + With Depth Completion and Denoising + + +
+ In recent years, there have been significant advancements in 3D +reconstruction and dense RGB-D SLAM systems. One notable development is the +application of Neural Radiance Fields (NeRF) in these systems, which utilizes +implicit neural representation to encode 3D scenes. This extension of NeRF to +SLAM has shown promising results. However, the depth images obtained from +consumer-grade RGB-D sensors are often sparse and noisy, which poses +significant challenges for 3D reconstruction and affects the accuracy of the +representation of the scene geometry. Moreover, the original hierarchical +feature grid with occupancy value is inaccurate for scene geometry +representation. Furthermore, the existing methods select random pixels for +camera tracking, which leads to inaccurate localization and is not robust in +real-world indoor environments. To this end, we present NeSLAM, an advanced +framework that achieves accurate and dense depth estimation, robust camera +tracking, and realistic synthesis of novel views. First, a depth completion and +denoising network is designed to provide dense geometry prior and guide the +neural implicit representation optimization. Second, the occupancy scene +representation is replaced with Signed Distance Field (SDF) hierarchical scene +representation for high-quality reconstruction and view synthesis. Furthermore, +we also propose a NeRF-based self-supervised feature tracking algorithm for +robust real-time tracking. Experiments on various indoor datasets demonstrate +the effectiveness and accuracy of the system in reconstruction, tracking +quality, and novel view synthesis. + +
+
+
+
+
+ + ☆ HO-Gaussian: Hybrid Optimization of 3D Gaussian Splatting for Urban + Scenes + + +
+ The rapid growth of 3D Gaussian Splatting (3DGS) has revolutionized neural
+rendering, enabling real-time production of high-quality renderings. However,
+previous 3DGS-based methods have limitations in urban scenes due to reliance on
+initial Structure-from-Motion (SfM) points and difficulties in rendering
+distant, sky, and low-texture areas. To overcome these challenges, we propose a
+hybrid optimization method named HO-Gaussian, which combines a grid-based
+volume with the 3DGS pipeline. HO-Gaussian eliminates the dependency on SfM
+point initialization, allowing for rendering of urban scenes, and incorporates
+point densification to enhance rendering quality in problematic regions during
+training. Furthermore, we introduce Gaussian Direction Encoding as an
+alternative to spherical harmonics in the rendering pipeline, which enables
+view-dependent color representation. To account for multi-camera systems, we
+introduce neural warping to enhance object consistency across different
+cameras. Experimental results on widely used autonomous driving datasets
+demonstrate that HO-Gaussian achieves photo-realistic rendering in real-time on
+multi-camera urban datasets.
+
+
+
+
+
+ + ☆ A Unified Framework for Human-centric Point Cloud Video Understanding CVPR 2024 + + +
+ Human-centric Point Cloud Video Understanding (PVU) is an emerging field
+focused on extracting and interpreting human-related features from sequences of
+human point clouds, further advancing downstream human-centric tasks and
+applications. Previous works usually focus on tackling one specific task and
+rely on large amounts of labeled data, which leads to poor generalization
+capability. Considering that humans have specific characteristics, including
+the structural semantics of the human body and the dynamics of human motion, we
+propose a unified framework to make full use of this prior knowledge and
+explore the inherent features in the data itself for generalized human-centric
+point cloud video understanding. Extensive experiments demonstrate that our
+method achieves state-of-the-art performance on various human-related tasks,
+including action recognition and 3D pose estimation. All datasets and code will
+be released soon.
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ FSMR: A Feature Swapping Multi-modal Reasoning Approach with Joint + Textual and Visual Clues + + +
+ Multi-modal reasoning plays a vital role in bridging the gap between textual +and visual information, enabling a deeper understanding of the context. This +paper presents the Feature Swapping Multi-modal Reasoning (FSMR) model, +designed to enhance multi-modal reasoning through feature swapping. FSMR +leverages a pre-trained visual-language model as an encoder, accommodating both +text and image inputs for effective feature representation from both +modalities. It introduces a unique feature swapping module, enabling the +exchange of features between identified objects in images and corresponding +vocabulary words in text, thereby enhancing the model's comprehension of the +interplay between images and text. To further bolster its multi-modal alignment +capabilities, FSMR incorporates a multi-modal cross-attention mechanism, +facilitating the joint modeling of textual and visual information. During +training, we employ image-text matching and cross-entropy losses to ensure +semantic consistency between visual and language elements. Extensive +experiments on the PMR dataset demonstrate FSMR's superiority over +state-of-the-art baseline models across various performance metrics. + +
+
+
+
+
+ + ☆ Psychometry: An Omnifit Model for Image Reconstruction from Human Brain + Activity CVPR 2024 + + +
+ Reconstructing the viewed images from human brain activity bridges human and +computer vision through the Brain-Computer Interface. The inherent variability +in brain function between individuals leads existing literature to focus on +acquiring separate models for each individual using their respective brain +signal data, ignoring commonalities between these data. In this article, we +devise Psychometry, an omnifit model for reconstructing images from functional +Magnetic Resonance Imaging (fMRI) obtained from different subjects. Psychometry +incorporates an omni mixture-of-experts (Omni MoE) module where all the experts +work together to capture the inter-subject commonalities, while each expert +associated with subject-specific parameters copes with the individual +differences. Moreover, Psychometry is equipped with a retrieval-enhanced +inference strategy, termed Ecphory, which aims to enhance the learned fMRI +representation via retrieving from prestored subject-specific memories. These +designs collectively render Psychometry omnifit and efficient, enabling it to +capture both inter-subject commonality and individual specificity across +subjects. As a result, the enhanced fMRI representations serve as conditional +signals to guide a generation model to reconstruct high-quality and realistic +images, establishing Psychometry as state-of-the-art in terms of both +high-level and low-level metrics. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ SCINeRF: Neural Radiance Fields from a Snapshot Compressive Image + + +
+ In this paper, we explore the potential of Snapshot Compressive Imaging (SCI) +technique for recovering the underlying 3D scene representation from a single +temporal compressed image. SCI is a cost-effective method that enables the +recording of high-dimensional data, such as hyperspectral or temporal +information, into a single image using low-cost 2D imaging sensors. To achieve +this, a series of specially designed 2D masks are usually employed, which not +only reduces storage requirements but also offers potential privacy protection. +Inspired by this, to take one step further, our approach builds upon the +powerful 3D scene representation capabilities of neural radiance fields (NeRF). +Specifically, we formulate the physical imaging process of SCI as part of the +training of NeRF, allowing us to exploit its impressive performance in +capturing complex scene structures. To assess the effectiveness of our method, +we conduct extensive evaluations using both synthetic data and real data +captured by our SCI system. Extensive experimental results demonstrate that our +proposed approach surpasses the state-of-the-art methods in terms of image +reconstruction and novel view image synthesis. Moreover, our method also +exhibits the ability to restore high frame-rate multi-view consistent images by +leveraging SCI and the rendering capabilities of NeRF. The code is available at +https://github.com/WU-CVGL/SCINeRF. + +
+
+
+
+
+ + ☆ DerainNeRF: 3D Scene Estimation with Adhesive Waterdrop Removal + + +
+ When capturing images through glass during rainy or snowy weather
+conditions, the resulting images often contain waterdrops adhered to the glass
+surface, and these waterdrops significantly degrade the image quality and the
+performance of many computer vision algorithms. To tackle these limitations, we
+propose a method to reconstruct the clear 3D scene implicitly from multi-view
+images degraded by waterdrops. Our method exploits an attention network to
+predict the location of waterdrops and then trains a Neural Radiance Field
+(NeRF) to recover the 3D scene implicitly. By leveraging the strong scene
+representation capabilities of NeRF, our method can render high-quality
+novel-view images with waterdrops removed. Extensive experimental results on
+both synthetic and real datasets show that our method is able to generate clear
+3D scenes and outperforms existing state-of-the-art (SOTA) adhesive waterdrop
+removal methods.
+
+
+
+
+
+ + ☆ Colorful Cutout: Enhancing Image Data Augmentation with Curriculum + Learning ICLR 2024 + + +
+ Data augmentation is one of the regularization strategies for the training of +deep learning models, which enhances generalizability and prevents overfitting, +leading to performance improvement. Although researchers have proposed various +data augmentation techniques, they often lack consideration for the difficulty +of augmented data. Recently, another line of research suggests incorporating +the concept of curriculum learning with data augmentation in the field of +natural language processing. In this study, we adopt curriculum data +augmentation for image data augmentation and propose colorful cutout, which +gradually increases the noise and difficulty introduced in the augmented image. +Our experimental results highlight the possibility of curriculum data +augmentation for image data. We publicly released our source code to improve +the reproducibility of our study. + +
+
+ comment: ICLR 2024 Tiny Papers +
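+ A small sketch of the curriculum idea above: the number and size of randomly
+coloured patches grow with the training epoch, so augmented images become
+harder over time. The exact schedule below is an assumption for illustration,
+not the released implementation.
+
+import numpy as np
+
+def colorful_cutout(img, epoch, max_epochs, rng=None):
+    """img: HxWx3 uint8 array; difficulty scales with epoch / max_epochs."""
+    rng = rng or np.random.default_rng()
+    out = img.copy()
+    h, w = img.shape[:2]
+    level = (epoch + 1) / max_epochs                  # 0 < level <= 1
+    n_patches = 1 + int(3 * level)                    # more patches later
+    size = max(2, int(0.1 * h + 0.2 * h * level))     # larger patches later
+    for _ in range(n_patches):
+        y, x = rng.integers(0, h - size), rng.integers(0, w - size)
+        out[y:y + size, x:x + size] = rng.integers(0, 256, size=3)  # random colour fill
+    return out
+
+img = np.zeros((64, 64, 3), dtype=np.uint8)
+easy, hard = colorful_cutout(img, 0, 10), colorful_cutout(img, 9, 10)
+print((easy != 0).mean(), (hard != 0).mean())  # the hard version masks more pixels
+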
+
+
+
+
+ + ☆ Grounding and Enhancing Grid-based Models for Neural Fields CVPR24 + + +
+ Many contemporary studies utilize grid-based models for neural field +representation, but a systematic analysis of grid-based models is still +missing, hindering the improvement of those models. Therefore, this paper +introduces a theoretical framework for grid-based models. This framework points +out that these models' approximation and generalization behaviors are +determined by grid tangent kernels (GTK), which are intrinsic properties of +grid-based models. The proposed framework facilitates a consistent and +systematic analysis of diverse grid-based models. Furthermore, the introduced +framework motivates the development of a novel grid-based model named the +Multiplicative Fourier Adaptive Grid (MulFAGrid). The numerical analysis +demonstrates that MulFAGrid exhibits a lower generalization bound than its +predecessors, indicating its robust generalization performance. Empirical +studies reveal that MulFAGrid achieves state-of-the-art performance in various +tasks, including 2D image fitting, 3D signed distance field (SDF) +reconstruction, and novel view synthesis, demonstrating superior representation +ability. The project website is available at +https://sites.google.com/view/cvpr24-2034-submission/home. + +
+
+ comment: Accepted in CVPR24 +
+
+
+
+
+ + ☆ Stable Surface Regularization for Fast Few-Shot NeRF 3DV 2024 + + +
+ This paper proposes an algorithm for synthesizing novel views under a
+few-shot setup. The main concept is to develop a stable surface regularization
+technique called Annealing Signed Distance Function (ASDF), which anneals the
+surface in a coarse-to-fine manner to accelerate convergence. We observe that
+the Eikonal loss - which is a widely known geometric regularization - requires
+a dense training signal to shape the different level sets of the SDF, leading
+to low-fidelity results under few-shot training. In contrast, the proposed
+surface regularization successfully reconstructs scenes and produces
+high-fidelity geometry with stable training. Our method is further accelerated
+by utilizing a grid representation and monocular geometric priors. Finally, the
+proposed approach is up to 45 times faster than existing few-shot novel view
+synthesis methods, and it produces comparable results on the ScanNet and
+NeRF-Real datasets.
+
+
+ comment: 3DV 2024 +
+
+
+
+
+ + ☆ A multi-stage semi-supervised learning for ankle fracture classification + on CT images + + +
+ Because of the complicated mechanism of ankle injury, it is very difficult
+to diagnose ankle fractures in the clinic. In order to simplify the process of
+fracture diagnosis, an automatic diagnosis model for ankle fractures is
+proposed. Firstly, a tibia-fibula segmentation network is proposed for the
+joint tibiofibular region of the ankle joint, and the corresponding
+segmentation dataset is established on the basis of fracture data. Secondly, an
+image registration method is used to register the bone segmentation mask with
+the normal bone mask. Finally, a semi-supervised classifier is constructed to
+make full use of a large amount of unlabeled data to classify ankle fractures.
+Experiments show that the proposed method can accurately segment fractures with
+fracture lines and performs better than the general method. At the same time,
+this method is superior to the plain classification network on several metrics.
+
+
+
+
+
+ + ☆ A Parallel Attention Network for Cattle Face Recognition ICME 2024 + + +
+ Cattle face recognition holds paramount significance in domains such as +animal husbandry and behavioral research. Despite significant progress in +confined environments, applying these accomplishments in wild settings remains +challenging. Thus, we create the first large-scale cattle face recognition +dataset, ICRWE, for wild environments. It encompasses 483 cattle and 9,816 +high-resolution image samples. Each sample undergoes annotation for face +features, light conditions, and face orientation. Furthermore, we introduce a +novel parallel attention network, PANet. Comprising several cascaded +Transformer modules, each module incorporates two parallel Position Attention +Modules (PAM) and Feature Mapping Modules (FMM). PAM focuses on local and +global features at each image position through parallel channel attention, and +FMM captures intricate feature patterns through non-linear mappings. +Experimental results indicate that PANet achieves a recognition accuracy of +88.03% on the ICRWE dataset, establishing itself as the current +state-of-the-art approach. The source code is available in the supplementary +materials. + +
+
+ comment: Accepted by ICME 2024 +
+
+
+
+
+ + ☆ Semantically-Shifted Incremental Adapter-Tuning is A Continual + ViTransformer CVPR 2024 + + +
+ Class-incremental learning (CIL) aims to enable models to continuously learn +new classes while overcoming catastrophic forgetting. The introduction of +pre-trained models has brought new tuning paradigms to CIL. In this paper, we +revisit different parameter-efficient tuning (PET) methods within the context +of continual learning. We observe that adapter tuning demonstrates superiority +over prompt-based methods, even without parameter expansion in each learning +session. Motivated by this, we propose incrementally tuning the shared adapter +without imposing parameter update constraints, enhancing the learning capacity +of the backbone. Additionally, we employ feature sampling from stored +prototypes to retrain a unified classifier, further improving its performance. +We estimate the semantic shift of old prototypes without access to past samples +and update stored prototypes session by session. Our proposed method eliminates +model expansion and avoids retaining any image samples. It surpasses previous +pre-trained model-based CIL methods and demonstrates remarkable continual +learning capabilities. Experimental results on five CIL benchmarks validate the +effectiveness of our approach, achieving state-of-the-art (SOTA) performance. + +
+
+ comment: To appear at CVPR 2024 +
+
+
+
+
+ + ☆ eTraM: Event-based Traffic Monitoring Dataset + + +
+ Event cameras, with their high temporal resolution, high dynamic range, and
+minimal memory usage, have found applications in various fields. However, their
+potential in static traffic monitoring remains largely unexplored. To
+facilitate this exploration, we present eTraM - a first-of-its-kind, fully
+event-based traffic monitoring dataset. eTraM offers 10 hours of data from
+different traffic scenarios in various lighting and weather conditions,
+providing a comprehensive overview of real-world situations. With 2M bounding
+box annotations, it covers eight distinct classes of traffic participants,
+ranging from vehicles to pedestrians and micro-mobility. eTraM's utility has
+been assessed using state-of-the-art methods for traffic participant detection,
+including RVT, RED, and YOLOv8. We quantitatively evaluate the ability of
+event-based models to generalize to nighttime and unseen scenes. Our findings
+substantiate the compelling potential of leveraging event cameras for traffic
+monitoring, opening new avenues for research and application. eTraM is
+available at https://eventbasedvision.github.io/eTraM
+
+
+
+
+
+ + ☆ Context-Aware Integration of Language and Visual References for Natural + Language Tracking CVPR2024 + + +
+ Tracking by natural language specification (TNL) aims to consistently
+localize a target in a video sequence given a linguistic description in the
+initial frame. Existing methodologies perform language-based and template-based
+matching for target reasoning separately and merge the matching results from
+the two sources, which suffers from tracking drift when the language and visual
+templates misalign with the dynamic target state, and from ambiguity in the
+later merging stage. To tackle these issues, we propose a joint multi-modal
+tracking framework with 1) a prompt modulation module to leverage the
+complementarity between temporal visual templates and language expressions,
+enabling precise and context-aware appearance and linguistic cues, and 2) a
+unified target decoding module to integrate the multi-modal reference cues and
+execute the integrated queries on the search image to directly predict the
+target location in an end-to-end manner. This design ensures spatio-temporal
+consistency by leveraging historical visual information and introduces an
+integrated solution, generating predictions in a single step. Extensive
+experiments conducted on TNL2K, OTB-Lang, LaSOT, and RefCOCOg validate the
+efficacy of our proposed approach. The results demonstrate competitive
+performance against state-of-the-art methods for both tracking and grounding.
+
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Separate, Dynamic and Differentiable (SMART) Pruner for Block/Output + Channel Pruning on Computer Vision Tasks + + +
+ Deep Neural Network (DNN) pruning has emerged as a key strategy to reduce +model size, improve inference latency, and lower power consumption on DNN +accelerators. Among various pruning techniques, block and output channel +pruning have shown significant potential in accelerating hardware performance. +However, their accuracy often requires further improvement. In response to this +challenge, we introduce a separate, dynamic and differentiable (SMART) pruner. +This pruner stands out by utilizing a separate, learnable probability mask for +weight importance ranking, employing a differentiable Top k operator to achieve +target sparsity, and leveraging a dynamic temperature parameter trick to escape +from non-sparse local minima. In our experiments, the SMART pruner consistently +demonstrated its superiority over existing pruning methods across a wide range +of tasks and models on block and output channel pruning. Additionally, we +extend our testing to Transformer-based models in N:M pruning scenarios, where +SMART pruner also yields state-of-the-art results, demonstrating its +adaptability and robustness across various neural network architectures, and +pruning types. + +
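+ The core mechanics described above can be pictured with a hedged sketch: a
+separate learnable score per block is pushed through a temperature-controlled
+soft top-k, so roughly the k highest-scoring blocks survive while gradients
+still reach the scores. The exact differentiable Top-k operator and temperature
+schedule used in the paper may differ.
+
+import torch
+
+def soft_topk_mask(scores, k, temperature):
+    """Differentiable approximation of a binary keep-mask with roughly k ones."""
+    threshold = torch.topk(scores, k).values[-1].detach()   # k-th largest score
+    return torch.sigmoid((scores - threshold) / temperature)
+
+scores = torch.randn(16, requires_grad=True)    # learnable importance per block
+for temp in (1.0, 0.1, 0.01):                   # anneal temperature over training
+    mask = soft_topk_mask(scores, k=4, temperature=temp)
+    mask.sum().backward()                       # gradients flow back to the scores
+    kept = int(mask.detach().round().sum())
+    print(f"T={temp}: ~{kept} blocks kept, grad norm={float(scores.grad.norm()):.3f}")
+    scores.grad.zero_()
+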
+
+
+
+
+ + ☆ Rewrite the Stars CVPR 2024 + + +
+ Recent studies have drawn attention to the untapped potential of the "star +operation" (element-wise multiplication) in network design. While intuitive +explanations abound, the foundational rationale behind its application remains +largely unexplored. Our study attempts to reveal the star operation's ability +to map inputs into high-dimensional, non-linear feature spaces -- akin to +kernel tricks -- without widening the network. We further introduce StarNet, a +simple yet powerful prototype, demonstrating impressive performance and low +latency under compact network structure and efficient budget. Like stars in the +sky, the star operation appears unremarkable but holds a vast universe of +potential. Our work encourages further exploration across tasks, with codes +available at https://github.com/ma-xu/Rewrite-the-Stars. + +
+
+ comment: Accepted by CVPR 2024. Codes are made publicly available at
+ https://github.com/ma-xu/Rewrite-the-Stars
+
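+ A minimal sketch of the star operation discussed above: two linear branches
+fused by element-wise multiplication, which implicitly mixes features much
+like a kernel map without widening the network. This is a toy block, not the
+full StarNet architecture.
+
+import torch
+import torch.nn as nn
+
+class StarBlock(nn.Module):
+    def __init__(self, dim: int, hidden: int):
+        super().__init__()
+        self.f1 = nn.Linear(dim, hidden)
+        self.f2 = nn.Linear(dim, hidden)
+        self.out = nn.Linear(hidden, dim)
+
+    def forward(self, x):
+        return self.out(self.f1(x) * self.f2(x))   # the "star": element-wise product
+
+block = StarBlock(dim=64, hidden=128)
+print(block(torch.randn(8, 64)).shape)   # torch.Size([8, 64])
+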
+
+
+
+
+ + ☆ Multi-task Magnetic Resonance Imaging Reconstruction using Meta-learning + + +
+ Using single-task deep learning methods to reconstruct Magnetic Resonance +Imaging (MRI) data acquired with different imaging sequences is inherently +challenging. The trained deep learning model typically lacks generalizability, +and the dissimilarity among image datasets with different types of contrast +leads to suboptimal learning performance. This paper proposes a meta-learning +approach to efficiently learn image features from multiple MR image datasets. +Our algorithm can perform multi-task learning to simultaneously reconstruct MR +images acquired using different imaging sequences with different image +contrasts. The experiment results demonstrate the ability of our new +meta-learning reconstruction method to successfully reconstruct +highly-undersampled k-space data from multiple MRI datasets simultaneously, +outperforming other compelling reconstruction methods previously developed for +single-task learning. + +
+
+
+
+
+ + ☆ FairRAG: Fair Human Generation via Fair Retrieval Augmentation + + +
+ Existing text-to-image generative models reflect or even amplify societal +biases ingrained in their training data. This is especially concerning for +human image generation where models are biased against certain demographic +groups. Existing attempts to rectify this issue are hindered by the inherent +limitations of the pre-trained models and fail to substantially improve +demographic diversity. In this work, we introduce Fair Retrieval Augmented +Generation (FairRAG), a novel framework that conditions pre-trained generative +models on reference images retrieved from an external image database to improve +fairness in human generation. FairRAG enables conditioning through a +lightweight linear module that projects reference images into the textual +space. To enhance fairness, FairRAG applies simple-yet-effective debiasing +strategies, providing images from diverse demographic groups during the +generative process. Extensive experiments demonstrate that FairRAG outperforms +existing methods in terms of demographic diversity, image-text alignment, and +image fidelity while incurring minimal computational overhead during inference. + +
+
+
+
+
+ + ☆ Efficient Modulation for Vision Networks ICLR 2024 + + +
+ In this work, we present efficient modulation, a novel design for efficient
+vision networks. We revisit the modulation mechanism, which operates on the
+input through convolutional context modeling and feature projection layers, and
+fuses features via element-wise multiplication and an MLP block. We demonstrate
+that the modulation mechanism is particularly well suited for efficient
+networks and further tailor the modulation design by proposing the efficient
+modulation (EfficientMod) block, which is considered the essential building
+block for our networks. Benefiting from the prominent representational ability
+of the modulation mechanism and the proposed efficient design, our network can
+accomplish better trade-offs between accuracy and efficiency and set new
+state-of-the-art performance in the zoo of efficient networks. When integrating
+EfficientMod with the vanilla self-attention block, we obtain a hybrid
+architecture which further improves the performance without loss of efficiency.
+We carry out comprehensive experiments to verify EfficientMod's performance.
+With fewer parameters, our EfficientMod-s achieves 0.6 higher top-1 accuracy
+than EfficientFormerV2-s2 while being 25% faster on GPU, and 2.9 higher than
+MobileViTv2-1.0 at the same GPU latency. Additionally, our method presents a
+notable improvement in downstream tasks, outperforming EfficientFormerV2-s by
+3.6 mIoU on the ADE20K benchmark. Code and checkpoints are available at
+https://github.com/ma-xu/EfficientMod.
+
+
+ comment: Accepted by ICLR 2024. Codes are made publicly available at
+ https://github.com/ma-xu/EfficientMod
+
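+ To illustrate the modulation mechanism described above, the hedged sketch
+below lets a depthwise-convolutional context branch modulate a projected
+feature map via element-wise multiplication, followed by an MLP. Kernel sizes
+and the expansion ratio are placeholders, not the released EfficientMod
+configuration.
+
+import torch
+import torch.nn as nn
+
+class ModulationBlock(nn.Module):
+    def __init__(self, dim: int, expansion: int = 2):
+        super().__init__()
+        self.context = nn.Sequential(                     # local context modeling
+            nn.Conv2d(dim, dim, 3, padding=1, groups=dim),
+            nn.GELU(),
+        )
+        self.project = nn.Conv2d(dim, dim, 1)             # feature projection
+        self.fuse = nn.Conv2d(dim, dim, 1)
+        self.mlp = nn.Sequential(
+            nn.Conv2d(dim, dim * expansion, 1), nn.GELU(),
+            nn.Conv2d(dim * expansion, dim, 1),
+        )
+
+    def forward(self, x):
+        x = x + self.fuse(self.context(x) * self.project(x))   # modulation step
+        return x + self.mlp(x)
+
+print(ModulationBlock(32)(torch.randn(2, 32, 56, 56)).shape)
+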
+
+
+
+
+ + ☆ FairCLIP: Harnessing Fairness in Vision-Language Learning CVPR 2024 + + +
+ Fairness is a critical concern in deep learning, especially in healthcare, +where these models influence diagnoses and treatment decisions. Although +fairness has been investigated in the vision-only domain, the fairness of +medical vision-language (VL) models remains unexplored due to the scarcity of +medical VL datasets for studying fairness. To bridge this research gap, we +introduce the first fair vision-language medical dataset FairVLMed that +provides detailed demographic attributes, ground-truth labels, and clinical +notes to facilitate an in-depth examination of fairness within VL foundation +models. Using FairVLMed, we conduct a comprehensive fairness analysis of two +widely-used VL models (CLIP and BLIP2), pre-trained on both natural and medical +domains, across four different protected attributes. Our results highlight +significant biases in all VL models, with Asian, Male, Non-Hispanic, and +Spanish being the preferred subgroups across the protected attributes of race, +gender, ethnicity, and language, respectively. In order to alleviate these +biases, we propose FairCLIP, an optimal-transport-based approach that achieves +a favorable trade-off between performance and fairness by reducing the Sinkhorn +distance between the overall sample distribution and the distributions +corresponding to each demographic group. As the first VL dataset of its kind, +FairVLMed holds the potential to catalyze advancements in the development of +machine learning models that are both ethically aware and clinically effective. +Our dataset and code are available at +https://ophai.hms.harvard.edu/datasets/fairvlmed10k. + +
+
+ comment: CVPR 2024 +
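+ To make the debiasing objective above concrete, here is a small,
+self-contained Sinkhorn-distance sketch: it approximates the entropic
+optimal-transport cost between the features of one demographic group and the
+overall sample. Feature vectors are random stand-ins, and FairCLIP's exact
+loss weighting is not reproduced.
+
+import torch
+
+def sinkhorn_distance(x, y, eps=0.1, n_iters=200):
+    """Entropic OT cost between point clouds x (n, d) and y (m, d), uniform weights."""
+    cost = torch.cdist(x, y, p=2) ** 2
+    cost = cost / cost.max()                     # normalise for numerical stability
+    a = torch.full((x.shape[0],), 1.0 / x.shape[0])
+    b = torch.full((y.shape[0],), 1.0 / y.shape[0])
+    K = torch.exp(-cost / eps)
+    u = torch.ones_like(a)
+    for _ in range(n_iters):                     # Sinkhorn iterations
+        v = b / (K.T @ u)
+        u = a / (K @ v)
+    transport = u[:, None] * K * v[None, :]      # approximate transport plan
+    return (transport * cost).sum()
+
+group = torch.randn(128, 64)     # features of one demographic group (stand-in)
+overall = torch.randn(512, 64)   # features of the overall sample (stand-in)
+print(float(sinkhorn_distance(group, overall)))
+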
+
+
+
+
+ + ☆ Binarized Low-light Raw Video Enhancement CVPR 2024 + + +
+ Recently, deep neural networks have achieved excellent performance on +low-light raw video enhancement. However, they often come with high +computational complexity and large memory costs, which hinder their +applications on resource-limited devices. In this paper, we explore the +feasibility of applying the extremely compact binary neural network (BNN) to +low-light raw video enhancement. Nevertheless, there are two main issues with +binarizing video enhancement models. One is how to fuse the temporal +information to improve low-light denoising without complex modules. The other +is how to narrow the performance gap between binary convolutions with the full +precision ones. To address the first issue, we introduce a spatial-temporal +shift operation, which is easy-to-binarize and effective. The temporal shift +efficiently aggregates the features of neighbor frames and the spatial shift +handles the misalignment caused by the large motion in videos. For the second +issue, we present a distribution-aware binary convolution, which captures the +distribution characteristics of real-valued input and incorporates them into +plain binary convolutions to alleviate the degradation in performance. +Extensive quantitative and qualitative experiments have shown our +high-efficiency binarized low-light raw video enhancement method can attain a +promising performance. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ CP HDR: A feature point detection and description library for LDR and + HDR images + + +
+ In computer vision, characteristics refer to image regions with unique +properties, such as corners, edges, textures, or areas with high contrast. +These regions can be represented through feature points (FPs). FP detection and +description are fundamental steps to many computer vision tasks. Most FP +detection and description methods use low dynamic range (LDR) images, +sufficient for most applications involving digital images. However, LDR images +may have saturated pixels in scenes with extreme light conditions, which +degrade FP detection. On the other hand, high dynamic range (HDR) images +usually present a greater dynamic range but FP detection algorithms do not take +advantage of all the information in such images. In this study, we present a +systematic review of image detection and description algorithms that use HDR +images as input. We developed a library called CP_HDR that implements the +Harris corner detector, SIFT detector and descriptor, and two modifications of +those algorithms specialized in HDR images, called SIFT for HDR (SfHDR) and +Harris for HDR (HfHDR). Previous studies investigated the use of HDR images in +FP detection, but we did not find studies investigating the use of HDR images +in FP description. Using uniformity, repeatability rate, mean average +precision, and matching rate metrics, we compared the performance of the CP_HDR +algorithms using LDR and HDR images. We observed an increase in the uniformity +of the distribution of FPs among the high-light, mid-light, and low-light areas +of the images. The results show that using HDR images as input to detection +algorithms improves performance and that SfHDR and HfHDR enhance FP +description. + +
+
+
+
+
+ + ☆ SceneTracker: Long-term Scene Flow Estimation Network + + +
+ Considering the complementarity of scene flow estimation in the spatial +domain's focusing capability and 3D object tracking in the temporal domain's +coherence, this study aims to address a comprehensive new task that can +simultaneously capture fine-grained and long-term 3D motion in an online +manner: long-term scene flow estimation (LSFE). We introduce SceneTracker, a +novel learning-based LSFE network that adopts an iterative approach to +approximate the optimal trajectory. Besides, it dynamically indexes and +constructs appearance and depth correlation features simultaneously and employs +the Transformer to explore and utilize long-range connections within and +between trajectories. With detailed experiments, SceneTracker shows superior +capabilities in handling 3D spatial occlusion and depth noise interference, +highly tailored to the LSFE task's needs. The code for SceneTracker is +available at https://github.com/wwsource/SceneTracker. + +
+
+
+
+
+ + ☆ MI-NeRF: Learning a Single Face NeRF from Multiple Identities + + +
+ In this work, we introduce a method that learns a single dynamic neural +radiance field (NeRF) from monocular talking face videos of multiple +identities. NeRFs have shown remarkable results in modeling the 4D dynamics and +appearance of human faces. However, they require per-identity optimization. +Although recent approaches have proposed techniques to reduce the training and +rendering time, increasing the number of identities can be expensive. We +introduce MI-NeRF (multi-identity NeRF), a single unified network that models +complex non-rigid facial motion for multiple identities, using only monocular +videos of arbitrary length. The core premise in our method is to learn the +non-linear interactions between identity and non-identity specific information +with a multiplicative module. By training on multiple videos simultaneously, +MI-NeRF not only reduces the total training time compared to standard +single-identity NeRFs, but also demonstrates robustness in synthesizing novel +expressions for any input identity. We present results for both facial +expression transfer and talking face video synthesis. Our method can be further +personalized for a target identity given only a short video. + +
+
+ comment: Project page: https://aggelinacha.github.io/MI-NeRF/ +
+
+
+
+
+ + ☆ Diff-Reg v1: Diffusion Matching Model for Registration Problem + + +
+ Establishing reliable correspondences is essential for registration tasks +such as 3D and 2D3D registration. Existing methods commonly leverage geometric +or semantic point features to generate potential correspondences. However, +these features may face challenges such as large deformation, scale +inconsistency, and ambiguous matching problems (e.g., symmetry). Additionally, +many previous methods, which rely on single-pass prediction, may struggle with +local minima in complex scenarios. To mitigate these challenges, we introduce a +diffusion matching model for robust correspondence construction. Our approach +treats correspondence estimation as a denoising diffusion process within the +doubly stochastic matrix space, which gradually denoises (refines) a doubly +stochastic matching matrix to the ground-truth one for high-quality +correspondence estimation. It involves a forward diffusion process that +gradually introduces Gaussian noise into the ground truth matching matrix and a +reverse denoising process that iteratively refines the noisy matching matrix. +In particular, the feature extraction from the backbone occurs only once during +the inference phase. Our lightweight denoising module utilizes the same feature +at each reverse sampling step. Evaluation of our method on both 3D and 2D3D +registration tasks confirms its effectiveness. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2401.00436 +
+
+
+
+
+ + ☆ Using Images as Covariates: Measuring Curb Appeal with Deep Learning + + +
+ This paper details an innovative methodology to integrate image data into
+traditional econometric models. Motivated by forecasting sales prices for
+residential real estate, we harness the power of deep learning to add
+"information" contained in images as covariates. Specifically, images of homes
+were categorized and encoded using an ensemble of image classifiers (ResNet-50,
+VGG16, MobileNet, and Inception V3). Unique features presented within each
+image were further encoded through panoptic segmentation. Forecasts from a
+neural network trained on the encoded data result in improved out-of-sample
+predictive power. We also combine these image-based forecasts with standard
+hedonic real estate property and location characteristics, resulting in a
+unified dataset. We show that image-based forecasts increase the accuracy of
+hedonic forecasts when encoded features are regarded as additional covariates.
+We also attempt to "explain" which covariates the image-based forecasts are
+most highly correlated with. The study exemplifies the benefits of
+interdisciplinary methodologies, merging machine learning and econometrics to
+harness untapped data sources for more accurate forecasting.
+
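+ A toy sketch of the "images as covariates" idea, under stated assumptions:
+each image is encoded into a fixed-length feature vector (here a random
+stand-in for the classifier ensemble), concatenated with hedonic covariates,
+and fed to a regression for price. All data below are synthetic.
+
+import numpy as np
+from sklearn.linear_model import Ridge
+from sklearn.model_selection import train_test_split
+
+rng = np.random.default_rng(0)
+n = 500
+hedonic = rng.normal(size=(n, 5))        # e.g. sqft, beds, baths, age, lot size
+image_feats = rng.normal(size=(n, 32))   # stand-in for encoded curb-appeal features
+price = (hedonic @ rng.normal(size=5)
+         + 0.3 * (image_feats @ rng.normal(size=32))
+         + rng.normal(size=n))
+
+X = np.hstack([hedonic, image_feats])    # images enter as extra covariates
+X_tr, X_te, y_tr, y_te = train_test_split(X, price, random_state=0)
+full = Ridge(alpha=1.0).fit(X_tr, y_tr)
+hedonic_only = Ridge(alpha=1.0).fit(X_tr[:, :5], y_tr)
+print("R^2 with image covariates:", round(full.score(X_te, y_te), 3))
+print("R^2 hedonic only:        ", round(hedonic_only.score(X_te[:, :5], y_te), 3))
+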
+
+
+
+
+ + ☆ Automated Identification and Segmentation of Hi Sources in CRAFTS Using + Deep Learning Method + + +
+ We introduce a machine learning-based method for extracting HI sources from
+3D spectral data, and construct a dedicated dataset of HI sources from CRAFTS.
+Our custom dataset provides comprehensive resources for HI source detection.
+Utilizing the 3D-Unet segmentation architecture, our method reliably identifies
+and segments HI sources, achieving notable performance metrics with recall
+rates reaching 91.6% and accuracy levels at 95.7%. These outcomes substantiate
+the value of our custom dataset and the efficacy of our proposed network in
+identifying HI sources. Our code is publicly available at
+https://github.com/fishszh/HISF.
+
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ☆ Classification of Diabetic Retinopathy using Pre-Trained Deep Learning + Models + + +
+ Diabetic Retinopathy (DR) stands as the leading cause of blindness globally, +particularly affecting individuals between the ages of 20 and 70. This paper +presents a Computer-Aided Diagnosis (CAD) system designed for the automatic +classification of retinal images into five distinct classes: Normal, Mild, +Moderate, Severe, and Proliferative Diabetic Retinopathy (PDR). The proposed +system leverages Convolutional Neural Networks (CNNs) employing pre-trained +deep learning models. Through the application of fine-tuning techniques, our +model is trained on fundus images of diabetic retinopathy with resolutions of +350x350x3 and 224x224x3. Experimental results obtained on the Kaggle platform, +utilizing resources comprising 4 CPUs, 17 GB RAM, and 1 GB Disk, demonstrate +the efficacy of our approach. The achieved Area Under the Curve (AUC) values +for CNN, MobileNet, VGG-16, InceptionV3, and InceptionResNetV2 models are 0.50, +0.70, 0.53, 0.63, and 0.69, respectively. + +
+
+ comment: 3 pages, 1 figure, 1 table +
+
+
+
+
+ + ☆ Fully Geometric Panoramic Localization CVPR 2024 + + +
+ We introduce a lightweight and accurate localization method that only +utilizes the geometry of 2D-3D lines. Given a pre-captured 3D map, our approach +localizes a panorama image, taking advantage of the holistic 360 view. The +system mitigates potential privacy breaches or domain discrepancies by avoiding +trained or hand-crafted visual descriptors. However, as lines alone can be +ambiguous, we express distinctive yet compact spatial contexts from +relationships between lines, namely the dominant directions of parallel lines +and the intersection between non-parallel lines. The resulting representations +are efficient in processing time and memory compared to conventional visual +descriptor-based methods. Given the groups of dominant line directions and +their intersections, we accelerate the search process to test thousands of pose +candidates in less than a millisecond without sacrificing accuracy. We +empirically show that the proposed 2D-3D matching can localize panoramas for +challenging scenes with similar structures, dramatic domain shifts or +illumination changes. Our fully geometric approach does not involve extensive +parameter tuning or neural network training, making it a practical algorithm +that can be readily deployed in the real world. Project page including the code +is available through this link: https://82magnolia.github.io/fgpl/. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Heterogeneous Network Based Contrastive Learning Method for PolSAR Land + Cover Classification + + +
+ Polarimetric synthetic aperture radar (PolSAR) image interpretation is widely
+used in various fields. Recently, deep learning has made significant progress
+in PolSAR image classification. Supervised learning (SL) requires a large
+amount of high-quality labeled PolSAR data to achieve good performance;
+however, manually labeled data is insufficient. This causes SL to fall into
+overfitting and degrades its generalization performance. Furthermore, the
+scattering confusion problem is also a significant challenge attracting
+increasing attention. To solve these problems, this article proposes a
+Heterogeneous Network based Contrastive Learning method (HCLNet). It aims to
+learn high-level representations from unlabeled PolSAR data for few-shot
+classification based on multi-features and superpixels. Beyond conventional CL,
+HCLNet introduces a heterogeneous architecture for the first time to better
+utilize heterogeneous PolSAR features. It also develops two easy-to-use plugins
+to narrow the domain gap between optical and PolSAR data: a feature filter,
+which enhances the complementarity of multi-features, and superpixel-based
+instance discrimination, which increases the diversity of negative samples.
+Experiments demonstrate the superiority of HCLNet on three widely used PolSAR
+benchmark datasets compared with state-of-the-art methods. Ablation studies
+also verify the importance of each component. Besides, this work has
+implications for how to efficiently utilize the multi-features of PolSAR data
+to learn better high-level representations in CL and how to construct networks
+better suited to PolSAR data.
+
+
+
+
+
+ + ☆ Disentangling Racial Phenotypes: Fine-Grained Control of Race-related + Facial Phenotype Characteristics + + +
+ Achieving an effective fine-grained appearance variation over 2D facial +images, whilst preserving facial identity, is a challenging task due to the +high complexity and entanglement of common 2D facial feature encoding spaces. +Despite these challenges, such fine-grained control, by way of disentanglement +is a crucial enabler for data-driven racial bias mitigation strategies across +multiple automated facial analysis tasks, as it allows to analyse, characterise +and synthesise human facial diversity. In this paper, we propose a novel GAN +framework to enable fine-grained control over individual race-related phenotype +attributes of the facial images. Our framework factors the latent (feature) +space into elements that correspond to race-related facial phenotype +representations, thereby separating phenotype aspects (e.g. skin, hair colour, +nose, eye, mouth shapes), which are notoriously difficult to annotate robustly +in real-world facial data. Concurrently, we also introduce a high quality +augmented, diverse 2D face image dataset drawn from CelebA-HQ for GAN training. +Unlike prior work, our framework only relies upon 2D imagery and related +parameters to achieve state-of-the-art individual control over race-related +phenotype attributes with improved photo-realistic output. + +
+
+
+
+
+ + ☆ Nonlinearity Enhanced Adaptive Activation Function + + +
+ A simply implemented activation function with an even cubic nonlinearity is
+introduced that increases the accuracy of neural networks without substantial
+additional computational resources. This is partially enabled through an
+apparent tradeoff between convergence and accuracy. The activation function
+generalizes the standard ReLU function by introducing additional degrees of
+freedom through optimizable parameters that enable the degree of nonlinearity
+to be adjusted. The associated accuracy enhancement is quantified in the
+context of the MNIST digit data set through a comparison with standard
+techniques.
+
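+ The abstract does not give the exact functional form, so the module below is
+only a hedged guess at the general idea: a ReLU augmented with small learnable
+even and cubic terms, so that the degree of nonlinearity is optimised together
+with the network weights.
+
+import torch
+import torch.nn as nn
+
+class AdaptiveCubicActivation(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.a = nn.Parameter(torch.zeros(1))   # even (quadratic) coefficient
+        self.b = nn.Parameter(torch.zeros(1))   # cubic coefficient
+
+    def forward(self, x):
+        # Reduces to a plain ReLU when a == b == 0 (the initial state).
+        return torch.relu(x) + self.a * x.pow(2) + self.b * x.pow(3)
+
+act = AdaptiveCubicActivation()
+x = torch.linspace(-2.0, 2.0, 5)
+print(act(x))   # identical to relu(x) at initialization
+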
+
+
+
+
+ + ☆ PLoc: A New Evaluation Criterion Based on Physical Location for + Autonomous Driving Datasets + + +
+ Autonomous driving has garnered significant attention as a key research area +within artificial intelligence. In the context of autonomous driving scenarios, +the varying physical locations of objects correspond to different levels of +danger. However, conventional evaluation criteria for automatic driving object +detection often overlook the crucial aspect of an object's physical location, +leading to evaluation results that may not accurately reflect the genuine +threat posed by the object to the autonomous driving vehicle. To enhance the +safety of autonomous driving, this paper introduces a novel evaluation +criterion based on physical location information, termed PLoc. This criterion +transcends the limitations of traditional criteria by acknowledging that the +physical location of pedestrians in autonomous driving scenarios can provide +valuable safety-related information. Furthermore, this paper presents a newly +re-annotated dataset (ApolloScape-R) derived from ApolloScape. ApolloScape-R +involves the relabeling of pedestrians based on the significance of their +physical location. The dataset is utilized to assess the performance of various +object detection models under the proposed PLoc criterion. Experimental results +demonstrate that the average accuracy of all object detection models in +identifying a person situated in the travel lane of an autonomous vehicle is +lower than that for a person on a sidewalk. The dataset is publicly available +at https://github.com/lnyrlyed/ApolloScape-R.git + +
+
+
+
+
+ + ☆ MambaMixer: Efficient Selective State Space Models with Dual Token and + Channel Selection + + +
+ Recent advances in deep learning have mainly relied on Transformers due to +their data dependency and ability to learn at scale. The attention module in +these architectures, however, exhibits quadratic time and space in input size, +limiting their scalability for long-sequence modeling. Despite recent attempts +to design efficient and effective architecture backbone for multi-dimensional +data, such as images and multivariate time series, existing models are either +data independent, or fail to allow inter- and intra-dimension communication. +Recently, State Space Models (SSMs), and more specifically Selective State +Space Models, with efficient hardware-aware implementation, have shown +promising potential for long sequence modeling. Motivated by the success of +SSMs, we present MambaMixer, a new architecture with data-dependent weights +that uses a dual selection mechanism across tokens and channels, called +Selective Token and Channel Mixer. MambaMixer connects selective mixers using a +weighted averaging mechanism, allowing layers to have direct access to early +features. As a proof of concept, we design Vision MambaMixer (ViM2) and Time +Series MambaMixer (TSM2) architectures based on the MambaMixer block and +explore their performance in various vision and time series forecasting tasks. +Our results underline the importance of selective mixing across both tokens and +channels. In ImageNet classification, object detection, and semantic +segmentation tasks, ViM2 achieves competitive performance with well-established +vision models and outperforms SSM-based vision models. In time series +forecasting, TSM2 achieves outstanding performance compared to state-of-the-art +methods while demonstrating significantly improved computational cost. These +results show that while Transformers, cross-channel attention, and MLPs are +sufficient for good performance in time series forecasting, neither is +necessary. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Optimal Blackjack Strategy Recommender: A Comprehensive Study on + Computer Vision Integration for Enhanced Gameplay + + +
+ This research project investigates the application of several computer vision +techniques for playing card detection and recognition in the context of the +popular casino game, blackjack. The primary objective is to develop a robust +system that is capable of detecting and accurately classifying playing cards in +real-time, and displaying the optimal move recommendation based on the given +image of the current game. The proposed methodology involves using K-Means for +image segmentation, card reprojection and feature extraction, training of the +KNN classifier using a labeled dataset, and integration of the detection system +into a Blackjack Basic Strategy recommendation algorithm. Further, the study +aims to observe the effectiveness of this approach in detecting various card +designs under different lighting conditions and occlusions. Overall, the +project examines the potential benefits of incorporating computer vision +techniques, with a specific focus on card detection, into commonly played games +aiming to enhance player decision-making and optimize strategic outcomes. The +results obtained from our experimental evaluations with models developed under +considerable time constraints, highlight the potential for practical +implementation in real-world casino environments and across other similarly +structured games. + +
+
+ comment: 24 pages, 13 figures +
+
+
+
+
+ + ☆ On Inherent Adversarial Robustness of Active Vision Systems + + +
+ Current Deep Neural Networks are vulnerable to adversarial examples, which +alter their predictions by adding carefully crafted noise. Since human eyes are +robust to such inputs, it is possible that the vulnerability stems from the +standard way of processing inputs in one shot by processing every pixel with +the same importance. In contrast, neuroscience suggests that the human vision +system can differentiate salient features by (1) switching between multiple +fixation points (saccades) and (2) processing the surrounding with a +non-uniform external resolution (foveation). In this work, we advocate that the +integration of such active vision mechanisms into current deep learning systems +can offer robustness benefits. Specifically, we empirically demonstrate the +inherent robustness of two active vision methods - GFNet and FALcon - under a +black box threat model. By learning and inferencing based on downsampled +glimpses obtained from multiple distinct fixation points within an input, we +show that these active methods achieve (2-3) times greater robustness compared +to a standard passive convolutional network under state-of-the-art adversarial +attacks. More importantly, we provide illustrative and interpretable +visualization analysis that demonstrates how performing inference from distinct +fixation points makes active vision methods less vulnerable to malicious +inputs. + +
+
+
+
+
+ + ☆ Multi-Region Transfer Learning for Segmentation of Crop Field Boundaries + in Satellite Images with Limited Labels AAAI + + +
+ The goal of field boundary delineation is to predict the polygonal boundaries +and interiors of individual crop fields in overhead remotely sensed images +(e.g., from satellites or drones). Automatic delineation of field boundaries is +a necessary task for many real-world use cases in agriculture, such as +estimating cultivated area in a region or predicting end-of-season yield in a +field. Field boundary delineation can be framed as an instance segmentation +problem, but presents unique research challenges compared to traditional +computer vision datasets used for instance segmentation. The practical +applicability of previous work is also limited by the assumption that a +sufficiently-large labeled dataset is available where field boundary +delineation models will be applied, which is not the reality for most regions +(especially under-resourced regions such as Sub-Saharan Africa). We present an +approach for segmentation of crop field boundaries in satellite images in +regions lacking labeled data that uses multi-region transfer learning to adapt +model weights for the target region. We show that our approach outperforms +existing methods and that multi-region transfer learning substantially boosts +performance for multiple model architectures. Our implementation and datasets +are publicly available to enable use of the approach by end-users and serve as +a benchmark for future work. + +
+
+ comment: Accepted for 2023 AAAI Workshop on AI to Accelerate Science and + Engineering +
+
+
+
+
+ + ☆ Universal Bovine Identification via Depth Data and Deep Metric Learning + + +
+ This paper proposes and evaluates, for the first time, a top-down (dorsal +view), depth-only deep learning system for accurately identifying individual +cattle and provides associated code, datasets, and training weights for +immediate reproducibility. An increase in herd size skews the cow-to-human +ratio at the farm and makes the manual monitoring of individuals more +challenging. Therefore, real-time cattle identification is essential for the +farms and a crucial step towards precision livestock farming. Underpinned by +our previous work, this paper introduces a deep-metric learning method for +cattle identification using depth data from an off-the-shelf 3D camera. The +method relies on CNN and MLP backbones that learn well-generalised embedding +spaces from the body shape to differentiate individuals -- requiring neither +species-specific coat patterns nor close-up muzzle prints for operation. The +network embeddings are clustered using a simple algorithm such as $k$-NN for +highly accurate identification, thus eliminating the need to retrain the +network for enrolling new individuals. We evaluate two backbone architectures, +ResNet, as previously used to identify Holstein Friesians using RGB images, and +PointNet, which is specialised to operate on 3D point clouds. We also present +CowDepth2023, a new dataset containing 21,490 synchronised colour-depth image +pairs of 99 cows, to evaluate the backbones. Both ResNet and PointNet +architectures, which consume depth maps and point clouds, respectively, led to +high accuracy that is on par with the coat pattern-based backbone. + +
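+ The enrolment-without-retraining property described above comes from identifying directly in the learned embedding space. A minimal sketch, assuming embeddings have already been produced by the trained network; the cosine-similarity k-NN vote below is one standard choice, not necessarily the paper's exact procedure.
+
+ import numpy as np
+
+ def enroll_and_identify(gallery_embeddings, gallery_ids, query_embeddings, k=5):
+     """Nearest-neighbour identification in the embedding space. New individuals are
+     enrolled simply by appending their embeddings to the gallery; the embedding
+     network itself is never retrained."""
+     g = gallery_embeddings / np.linalg.norm(gallery_embeddings, axis=1, keepdims=True)
+     q = query_embeddings / np.linalg.norm(query_embeddings, axis=1, keepdims=True)
+     sims = q @ g.T                                    # cosine similarity (num_queries, gallery_size)
+     topk = np.argsort(-sims, axis=1)[:, :k]           # indices of the k nearest gallery items
+     predictions = []
+     for row in topk:
+         labels, counts = np.unique(np.asarray(gallery_ids)[row], return_counts=True)
+         predictions.append(labels[np.argmax(counts)]) # majority vote among the k neighbours
+     return predictions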
+
+ comment: LaTeX, 38 pages, 14 figures, 3 tables +
+
+
+
+
+ + ☆ Multi-Level Neural Scene Graphs for Dynamic Urban Environments CVPR 2024 + + +
+ We estimate the radiance field of large-scale dynamic areas from multiple +vehicle captures under varying environmental conditions. Previous works in this +domain are either restricted to static environments, do not scale to more than +a single short video, or struggle to separately represent dynamic object +instances. To this end, we present a novel, decomposable radiance field +approach for dynamic urban environments. We propose a multi-level neural scene +graph representation that scales to thousands of images from dozens of +sequences with hundreds of fast-moving objects. To enable efficient training +and rendering of our representation, we develop a fast composite ray sampling +and rendering scheme. To test our approach in urban driving scenarios, we +introduce a new, novel view synthesis benchmark. We show that our approach +outperforms prior art by a significant margin on both established and our +proposed benchmark while being faster in training and rendering. + +
+
+ comment: CVPR 2024. Project page is available at + https://tobiasfshr.github.io/pub/ml-nsg/ +
+
+
+
+
+ + ☆ Uncovering Bias in Large Vision-Language Models with Counterfactuals + + +
+ With the advent of Large Language Models (LLMs) possessing increasingly +impressive capabilities, a number of Large Vision-Language Models (LVLMs) have +been proposed to augment LLMs with visual inputs. Such models condition +generated text on both an input image and a text prompt, enabling a variety of +use cases such as visual question answering and multimodal chat. While prior +studies have examined the social biases contained in text generated by LLMs, +this topic has been relatively unexplored in LVLMs. Examining social biases in +LVLMs is particularly challenging due to the confounding contributions of bias +induced by information contained across the text and visual modalities. To +address this challenging problem, we conduct a large-scale study of text +generated by different LVLMs under counterfactual changes to input images. +Specifically, we present LVLMs with identical open-ended text prompts while +conditioning on images from different counterfactual sets, where each set +contains images which are largely identical in their depiction of a common +subject (e.g., a doctor), but vary only in terms of intersectional social +attributes (e.g., race and gender). We comprehensively evaluate the text +produced by different LVLMs under this counterfactual generation setting and +find that social attributes such as race, gender, and physical characteristics +depicted in input images can significantly influence toxicity and the +generation of competency-associated words. + +
+
+
+
+
+ + ☆ CT respiratory motion synthesis using joint supervised and adversarial + learning + + +
+ Objective: Four-dimensional computed tomography (4DCT) imaging consists in +reconstructing a CT acquisition into multiple phases to track internal organ +and tumor motion. It is commonly used in radiotherapy treatment planning to +establish planning target volumes. However, 4DCT increases protocol complexity, +may not align with patient breathing during treatment, and lead to higher +radiation delivery. Approach: In this study, we propose a deep synthesis method +to generate pseudo respiratory CT phases from static images for motion-aware +treatment planning. The model produces patient-specific deformation vector +fields (DVFs) by conditioning synthesis on external patient surface-based +estimation, mimicking respiratory monitoring devices. A key methodological +contribution is to encourage DVF realism through supervised DVF training while +using an adversarial term jointly not only on the warped image but also on the +magnitude of the DVF itself. This way, we avoid excessive smoothness typically +obtained through deep unsupervised learning, and encourage correlations with +the respiratory amplitude. Main results: Performance is evaluated using real +4DCT acquisitions with smaller tumor volumes than previously reported. Results +demonstrate for the first time that the generated pseudo-respiratory CT phases +can capture organ and tumor motion with similar accuracy to repeated 4DCT scans +of the same patient. Mean inter-scans tumor center-of-mass distances and Dice +similarity coefficients were $1.97$mm and $0.63$, respectively, for real 4DCT +phases and $2.35$mm and $0.71$ for synthetic phases, and compares favorably to +a state-of-the-art technique (RMSim). + +
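+ A hedged sketch of the kind of joint objective the abstract describes: a supervised term on the DVF plus adversarial terms on both the warped image and the DVF magnitude. The discriminators, loss weights, and tensor layout (batch, 3, D, H, W) are assumptions for illustration, not the paper's exact formulation.
+
+ import torch
+ import torch.nn.functional as F
+
+ def joint_dvf_loss(dvf_pred, dvf_ref, warped_img, disc_img, disc_dvf_mag,
+                    w_sup=1.0, w_adv_img=0.1, w_adv_dvf=0.1):
+     """Supervised DVF term + non-saturating adversarial terms on the warped image
+     and on the per-voxel displacement magnitude (discriminators return logits)."""
+     sup = F.l1_loss(dvf_pred, dvf_ref)                 # keep the DVF close to a reference deformation
+     dvf_mag = dvf_pred.norm(dim=1, keepdim=True)       # per-voxel displacement magnitude
+     logits_img = disc_img(warped_img)
+     logits_dvf = disc_dvf_mag(dvf_mag)
+     adv_img = F.binary_cross_entropy_with_logits(logits_img, torch.ones_like(logits_img))
+     adv_dvf = F.binary_cross_entropy_with_logits(logits_dvf, torch.ones_like(logits_dvf))
+     return w_sup * sup + w_adv_img * adv_img + w_adv_dvf * adv_dvf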
+
+ comment: to appear in Phys. Med. Biol +
+
+
+
+
+ + ☆ VSRD: Instance-Aware Volumetric Silhouette Rendering for Weakly + Supervised 3D Object Detection CVPR 2024 + + +
+ Monocular 3D object detection poses a significant challenge in 3D scene +understanding due to its inherently ill-posed nature in monocular depth +estimation. Existing methods heavily rely on supervised learning using abundant +3D labels, typically obtained through expensive and labor-intensive annotation +on LiDAR point clouds. To tackle this problem, we propose a novel weakly +supervised 3D object detection framework named VSRD (Volumetric Silhouette +Rendering for Detection) to train 3D object detectors without any 3D +supervision but only weak 2D supervision. VSRD consists of multi-view 3D +auto-labeling and subsequent training of monocular 3D object detectors using +the pseudo labels generated in the auto-labeling stage. In the auto-labeling +stage, we represent the surface of each instance as a signed distance field +(SDF) and render its silhouette as an instance mask through our proposed +instance-aware volumetric silhouette rendering. To directly optimize the 3D +bounding boxes through rendering, we decompose the SDF of each instance into +the SDF of a cuboid and the residual distance field (RDF) that represents the +residual from the cuboid. This mechanism enables us to optimize the 3D bounding +boxes in an end-to-end manner by comparing the rendered instance masks with the +ground truth instance masks. The optimized 3D bounding boxes serve as effective +training data for 3D object detection. We conduct extensive experiments on the +KITTI-360 dataset, demonstrating that our method outperforms the existing +weakly supervised 3D object detection methods. The code is available at +https://github.com/skmhrk1209/VSRD. + +
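+ The cuboid-plus-residual decomposition is what keeps the 3D box directly optimisable through rendering. Below is a small sketch of that decomposition only: a standard analytic box SDF plus a learned residual field. Points are assumed to be expressed in the instance's canonical, box-aligned frame, and residual_field is a stand-in for the paper's RDF network.
+
+ import torch
+
+ def cuboid_sdf(points, half_extents):
+     """Signed distance from points (N, 3) to an axis-aligned box centred at the origin
+     with the given half extents (3,). Positive outside, negative inside."""
+     q = points.abs() - half_extents
+     outside = q.clamp(min=0.0).norm(dim=-1)
+     inside = q.max(dim=-1).values.clamp(max=0.0)
+     return outside + inside
+
+ def instance_sdf(points, half_extents, residual_field):
+     """Instance SDF = cuboid SDF + learned residual distance field (RDF)."""
+     return cuboid_sdf(points, half_extents) + residual_field(points)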
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Fast OMP for Exact Recovery and Sparse Approximation + + +
+ Orthogonal Matching Pursuit (OMP) has been a powerful method in sparse signal +recovery and approximation. However, OMP suffers from a computational issue when the +signal has a large number of non-zeros. This paper advances OMP on two fronts: it +offers a fast algorithm for the orthogonal projection of the input signal at +each iteration, and a new selection criterion for making the greedy choice, +which reduces the number of iterations it takes to recover the signal. The +proposed modifications to OMP directly reduce the computational complexity. +Experimental results show a significant improvement over the classical OMP in +computation time. The paper also provides a sufficient condition for exact +recovery under the new greedy choice criterion. For general signals that may +not have sparse representations, the paper provides a bound for the +approximation error. The approximation error is of the same order as OMP's but is +obtained within fewer iterations and less time. + +
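+ For reference, a compact implementation of classical OMP (the baseline the paper accelerates), showing the two steps the paper modifies: the greedy selection and the orthogonal projection via least squares. The fast projection and new selection criterion themselves are not reproduced here.
+
+ import numpy as np
+
+ def omp(A, y, sparsity, tol=1e-8):
+     """Classical Orthogonal Matching Pursuit: greedily pick the column most
+     correlated with the residual, then re-solve least squares on the support."""
+     residual, support = y.copy(), []
+     x = np.zeros(A.shape[1])
+     for _ in range(sparsity):
+         correlations = A.T @ residual
+         support.append(int(np.argmax(np.abs(correlations))))        # greedy selection step
+         coeffs, *_ = np.linalg.lstsq(A[:, support], y, rcond=None)  # orthogonal projection step
+         residual = y - A[:, support] @ coeffs
+         if np.linalg.norm(residual) < tol:
+             break
+     x[support] = coeffs
+     return x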
+
+
+
+
+ + ☆ An Interpretable Cross-Attentive Multi-modal MRI Fusion Framework for + Schizophrenia Diagnosis + + +
+ Both functional and structural magnetic resonance imaging (fMRI and sMRI) are +widely used for the diagnosis of mental disorder. However, combining +complementary information from these two modalities is challenging due to their +heterogeneity. Many existing methods fall short of capturing the interaction +between these modalities, frequently defaulting to a simple combination of +latent features. In this paper, we propose a novel Cross-Attentive Multi-modal +Fusion framework (CAMF), which aims to capture both intra-modal and inter-modal +relationships between fMRI and sMRI, enhancing multi-modal data representation. +Specifically, our CAMF framework employs self-attention modules to identify +interactions within each modality while cross-attention modules identify +interactions between modalities. Subsequently, our approach optimizes the +integration of latent features from both modalities. This approach +significantly improves classification accuracy, as demonstrated by our +evaluations on two extensive multi-modal brain imaging datasets, where CAMF +consistently outperforms existing methods. Furthermore, the gradient-guided +Score-CAM is applied to interpret critical functional networks and brain +regions involved in schizophrenia. The bio-markers identified by CAMF align +with established research, potentially offering new insights into the diagnosis +and pathological endophenotypes of schizophrenia. + +
+
+
+
+
+ + ☆ FetalDiffusion: Pose-Controllable 3D Fetal MRI Synthesis with + Conditional Diffusion Model MICCAI 2024 + + +
+ The quality of fetal MRI is significantly affected by unpredictable and +substantial fetal motion, leading to the introduction of artifacts even when +fast acquisition sequences are employed. The development of 3D real-time fetal +pose estimation approaches on volumetric EPI fetal MRI opens up a promising +avenue for fetal motion monitoring and prediction. Challenges arise in fetal +pose estimation due to the limited number of real scanned fetal MR training images, +hindering model generalization when the acquired fetal MRI lacks adequate pose. + In this study, we introduce FetalDiffusion, a novel approach utilizing a +conditional diffusion model to generate 3D synthetic fetal MRI with +controllable pose. Additionally, an auxiliary pose-level loss is adopted to +enhance model performance. Our work demonstrates the success of this proposed +model by producing high-quality synthetic fetal MRI images with accurate and +recognizable fetal poses, comparing favorably with in-vivo real fetal MRI. +Furthermore, we show that the integration of synthetic fetal MR images enhances +the fetal pose estimation model's performance, particularly when the amount of +available real scanned data is limited, resulting in a 15.4% increase in PCK and +a 50.2% reduction in mean error. All experiments are done on a single 32GB V100 +GPU. Our method holds promise for improving real-time tracking models, thereby +addressing fetal motion issues more effectively. + +
+
+ comment: 8 pages, 3 figures, 2 tables, submitted to MICCAI 2024, code + available if accepted +
+
+
+
+
+ + ☆ FISBe: A real-world benchmark dataset for instance segmentation of + long-range thin filamentous structures CVPR2024 + + +
+ Instance segmentation of neurons in volumetric light microscopy images of +nervous systems enables groundbreaking research in neuroscience by facilitating +joint functional and morphological analyses of neural circuits at cellular +resolution. Yet said multi-neuron light microscopy data exhibits extremely +challenging properties for the task of instance segmentation: Individual +neurons have long-ranging, thin filamentous and widely branching morphologies, +multiple neurons are tightly inter-weaved, and partial volume effects, uneven +illumination and noise inherent to light microscopy severely impede local +disentangling as well as long-range tracing of individual neurons. These +properties reflect a current key challenge in machine learning research, namely +to effectively capture long-range dependencies in the data. While respective +methodological research is buzzing, to date methods are typically benchmarked +on synthetic datasets. To address this gap, we release the FlyLight Instance +Segmentation Benchmark (FISBe) dataset, the first publicly available +multi-neuron light microscopy dataset with pixel-wise annotations. In addition, +we define a set of instance segmentation metrics for benchmarking that we +designed to be meaningful with regard to downstream analyses. Lastly, we +provide three baselines to kick off a competition that we envision to both +advance the field of machine learning regarding methodology for capturing +long-range data dependencies, and facilitate scientific discovery in basic +neuroscience. + +
+
+ comment: CVPR2024, Project page: https://kainmueller-lab.github.io/fisbe +
+
+
+
+
+ + ☆ AgileFormer: Spatially Agile Transformer UNet for Medical Image + Segmentation + + +
+ In the past decades, deep neural networks, particularly convolutional neural +networks, have achieved state-of-the-art performance in a variety of medical +image segmentation tasks. Recently, the introduction of the vision transformer +(ViT) has significantly altered the landscape of deep segmentation models. +There has been a growing focus on ViTs, driven by their excellent performance +and scalability. However, we argue that the current design of the vision +transformer-based UNet (ViT-UNet) segmentation models may not effectively +handle the heterogeneous appearance (e.g., varying shapes and sizes) of objects +of interest in medical image segmentation tasks. To tackle this challenge, we +present a structured approach to introduce spatially dynamic components to the +ViT-UNet. This adaptation enables the model to effectively capture features of +target objects with diverse appearances. This is achieved by three main +components: \textbf{(i)} deformable patch embedding; \textbf{(ii)} spatially +dynamic multi-head attention; \textbf{(iii)} deformable positional encoding. +These components were integrated into a novel architecture, termed AgileFormer. +AgileFormer is a spatially agile ViT-UNet designed for medical image +segmentation. Experiments in three segmentation tasks using publicly available +datasets demonstrated the effectiveness of the proposed method. The code is +available at +\href{https://github.com/sotiraslab/AgileFormer}{https://github.com/sotiraslab/AgileFormer}. + +
+
+
+
+
+ + ☆ Deepfake Sentry: Harnessing Ensemble Intelligence for Resilient + Detection and Generalisation + + +
+ Recent advancements in Generative Adversarial Networks (GANs) have enabled +photorealistic image generation with high quality. However, the malicious use +of such generated media has raised concerns regarding visual misinformation. +Although deepfake detection research has demonstrated high accuracy, it is +vulnerable to advances in generation techniques and adversarial iterations on +detection countermeasures. To address this, we propose a proactive and +sustainable deepfake training augmentation solution that introduces artificial +fingerprints into models. We achieve this by employing an ensemble learning +approach that incorporates a pool of autoencoders that mimic the effect of the +artefacts introduced by the deepfake generator models. Experiments on three +datasets reveal that our proposed ensemble autoencoder-based data augmentation +learning approach offers improvements in terms of generalisation, resistance +against basic data perturbations such as noise, blurring, sharpness +enhancement, and affine transforms, resilience to commonly used lossy +compression algorithms such as JPEG, and enhanced resistance against +adversarial attacks. + +
+
+ comment: 16 pages, 1 figure, U.P.B. Sci. Bull., Series C, Vol. 85, Iss. 4, + 2023 +
+
+
+
+
+ + ☆ Robust Ensemble Person Re-Identification via Orthogonal Fusion with + Occlusion Handling + + +
+ Occlusion remains one of the major challenges in person reidentification +(ReID) as a result of the diversity of poses and the variation of appearances. +Developing novel architectures to improve the robustness of occlusion-aware +person Re-ID requires new insights, especially on low-resolution edge cameras. +We propose a deep ensemble model that harnesses both CNN and Transformer +architectures to generate robust feature representations. To achieve robust +Re-ID without the need to manually label occluded regions, we propose to take +an ensemble learning-based approach derived from the analogy between +arbitrarily shaped occluded regions and robust feature representation. Using +the orthogonality principle, our developed deep CNN model makes use of masked +autoencoder (MAE) and global-local feature fusion for robust person +identification. Furthermore, we present a part occlusion-aware transformer +capable of learning feature space that is robust to occluded regions. +Experimental results are reported on several Re-ID datasets to show the +effectiveness of our developed ensemble model named orthogonal fusion with +occlusion handling (OFOH). Compared to competing methods, the proposed OFOH +approach has achieved competent rank-1 and mAP performance. + +
+
+
+
+
+ + ☆ PikeLPN: Mitigating Overlooked Inefficiencies of Low-Precision Neural + Networks CVPR 2024 + + +
+ Low-precision quantization is recognized for its efficacy in neural network +optimization. Our analysis reveals that non-quantized elementwise operations +which are prevalent in layers such as parameterized activation functions, batch +normalization, and quantization scaling dominate the inference cost of +low-precision models. These non-quantized elementwise operations are commonly +overlooked in SOTA efficiency metrics such as Arithmetic Computation Effort +(ACE). In this paper, we propose ACEv2 - an extended version of ACE which +offers a better alignment with the inference cost of quantized models and their +energy consumption on ML hardware. Moreover, we introduce PikeLPN, a model that +addresses these efficiency issues by applying quantization to both elementwise +operations and multiply-accumulate operations. In particular, we present a +novel quantization technique for batch normalization layers named QuantNorm +which allows for quantizing the batch normalization parameters without +compromising the model performance. Additionally, we propose applying Double +Quantization where the quantization scaling parameters are quantized. +Furthermore, we recognize and resolve the issue of distribution mismatch in +Separable Convolution layers by introducing Distribution-Heterogeneous +Quantization which enables quantizing them to low-precision. PikeLPN achieves +Pareto-optimality in efficiency-accuracy trade-off with up to 3X efficiency +improvement compared to SOTA low-precision models. + +
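+ A rough illustration of the 'Double Quantization' idea mentioned above: the per-channel scaling parameters used for elementwise rescaling are themselves quantized. The uniform symmetric quantizer and bit-width below are generic choices for intuition only, not PikeLPN's exact scheme.
+
+ import torch
+
+ def quantize(x, num_bits=8):
+     """Uniform symmetric quantization; returns the dequantized tensor and its scale."""
+     qmax = 2 ** (num_bits - 1) - 1
+     scale = x.abs().max().clamp(min=1e-8) / qmax
+     return torch.round(x / scale).clamp(-qmax, qmax) * scale, scale
+
+ def double_quantize_scales(per_channel_scales, num_bits=8):
+     """Quantize the scaling parameters themselves, so that elementwise rescaling
+     also runs in low precision instead of full precision."""
+     q_scales, _ = quantize(per_channel_scales, num_bits)
+     return q_scales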
+
+ comment: Accepted in CVPR 2024. 10 Figures, 9 Tables +
+
+
+
+
+ + ☆ Sparse Views, Near Light: A Practical Paradigm for Uncalibrated + Point-light Photometric Stereo CVPR 2024 + + +
+ Neural approaches have shown a significant progress on camera-based +reconstruction. But they require either a fairly dense sampling of the viewing +sphere, or pre-training on an existing dataset, thereby limiting their +generalizability. In contrast, photometric stereo (PS) approaches have shown +great potential for achieving high-quality reconstruction under sparse +viewpoints. Yet, they are impractical because they typically require tedious +laboratory conditions, are restricted to dark rooms, and often multi-staged, +making them subject to accumulated errors. To address these shortcomings, we +propose an end-to-end uncalibrated multi-view PS framework for reconstructing +high-resolution shapes acquired from sparse viewpoints in a real-world +environment. We relax the dark room assumption, and allow a combination of +static ambient lighting and dynamic near LED lighting, thereby enabling easy +data capture outside the lab. Experimental validation confirms that it +outperforms existing baseline approaches in the regime of sparse viewpoints by +a large margin. This allows to bring high-accuracy 3D reconstruction from the +dark room to the real world, while maintaining a reasonable data capture +complexity. + +
+
+ comment: Accepted in CVPR 2024 +
+
+
+
+
+ + ☆ DVIS-DAQ: Improving Video Segmentation via Dynamic Anchor Queries + + +
+ Modern video segmentation methods adopt object queries to perform inter-frame +association and demonstrate satisfactory performance in tracking continuously +appearing objects despite large-scale motion and transient occlusion. + However, they all underperform on newly emerging and disappearing objects +that are common in the real world because they attempt to model object +emergence and disappearance through feature transitions between background and +foreground queries that have significant feature gaps. We introduce Dynamic +Anchor Queries (DAQ) to shorten the transition gap between the anchor and +target queries by dynamically generating anchor queries based on the features +of potential candidates. + Furthermore, we introduce a query-level object Emergence and Disappearance +Simulation (EDS) strategy, which unleashes DAQ's potential without any +additional cost. + Finally, we combine our proposed DAQ and EDS with DVIS~\cite{zhang2023dvis} +to obtain DVIS-DAQ. + Extensive experiments demonstrate that DVIS-DAQ achieves a new +state-of-the-art (SOTA) performance on five mainstream video segmentation +benchmarks. Code and models are available at +\url{https://github.com/SkyworkAI/DAQ-VS}. + +
+
+
+
+
+ + ☆ Holo-VQVAE: VQ-VAE for phase-only holograms + + +
+ Holography stands at the forefront of visual technology innovation, offering +immersive, three-dimensional visualizations through the manipulation of light +wave amplitude and phase. Contemporary research in hologram generation has +predominantly focused on image-to-hologram conversion, producing holograms from +existing images. These approaches, while effective, inherently limit the scope +of innovation and creativity in hologram generation. In response to this +limitation, we present Holo-VQVAE, a novel generative framework tailored for +phase-only holograms (POHs). Holo-VQVAE leverages the architecture of Vector +Quantized Variational AutoEncoders, enabling it to learn the complex +distributions of POHs. Furthermore, it integrates the Angular Spectrum Method +into the training process, facilitating learning in the image domain. This +framework allows for the generation of unseen, diverse holographic content +directly from its intricately learned latent space without requiring +pre-existing images. This pioneering work paves the way for groundbreaking +applications and methodologies in holographic content creation, opening a new +era in the exploration of holographic content. + +
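+ The Angular Spectrum Method mentioned above is what lets a phase-only hologram be evaluated against image-domain targets during training. A standalone numpy sketch of ASM propagation follows; the units, pixel pitch, and evanescent-wave cutoff handling are generic assumptions rather than the paper's configuration.
+
+ import numpy as np
+
+ def angular_spectrum_propagate(field, wavelength, pitch, distance):
+     """Propagate a complex field by `distance` using the Angular Spectrum Method.
+     `pitch` is the pixel pitch in the same length units as `wavelength`."""
+     ny, nx = field.shape
+     fx = np.fft.fftfreq(nx, d=pitch)                 # spatial frequencies (cycles / unit length)
+     fy = np.fft.fftfreq(ny, d=pitch)
+     FX, FY = np.meshgrid(fx, fy)
+     arg = 1.0 - (wavelength * FX) ** 2 - (wavelength * FY) ** 2
+     kz = 2 * np.pi / wavelength * np.sqrt(np.maximum(arg, 0.0))
+     transfer = np.exp(1j * kz * distance) * (arg > 0)  # evanescent components dropped
+     return np.fft.ifft2(np.fft.fft2(field) * transfer)
+
+ # A phase-only hologram corresponds to the field np.exp(1j * phase); its reconstruction
+ # intensity is np.abs(angular_spectrum_propagate(np.exp(1j * phase), 633e-9, 8e-6, 0.1)) ** 2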
+
+
+
+
+ + ♻ ☆ LightGaussian: Unbounded 3D Gaussian Compression with 15x Reduction and + 200+ FPS + + +
+ Recent advancements in real-time neural rendering using point-based +techniques have paved the way for the widespread adoption of 3D +representations. However, foundational approaches like 3D Gaussian Splatting +come with a substantial storage overhead caused by growing the SfM points to +millions, often demanding gigabyte-level disk space for a single unbounded +scene, posing significant scalability challenges and hindering the splatting +efficiency. + To address this challenge, we introduce LightGaussian, a novel method +designed to transform 3D Gaussians into a more efficient and compact format. +Drawing inspiration from the concept of Network Pruning, LightGaussian +identifies Gaussians that are insignificant in contributing to the scene +reconstruction and adopts a pruning and recovery process, effectively reducing +redundancy in Gaussian counts while preserving visual effects. Additionally, +LightGaussian employs distillation and pseudo-view augmentation to distill +spherical harmonics to a lower degree, allowing knowledge transfer to more +compact representations while maintaining reflectance. Furthermore, we propose +a hybrid scheme, VecTree Quantization, to quantize all attributes, resulting in +lower bitwidth representations with minimal accuracy losses. + In summary, LightGaussian achieves an averaged compression rate over 15x +while boosting the FPS from 139 to 215, enabling an efficient representation of +complex scenes on Mip-NeRF 360, Tank and Temple datasets. + Project website: https://lightgaussian.github.io/ + +
+
+ comment: 16pages, 8figures +
+
+
+
+
+ + ♻ ☆ Gromov-Wasserstein-like Distances in the Gaussian Mixture Models Space + + +
+ The Gromov-Wasserstein (GW) distance is frequently used in machine learning +to compare distributions across distinct metric spaces. Despite its utility, it +remains computationally intensive, especially for large-scale problems. +Recently, a novel Wasserstein distance specifically tailored for Gaussian +mixture models and known as MW (mixture Wasserstein) has been introduced by +several authors. In scenarios where data exhibit clustering, this approach +simplifies to a small-scale discrete optimal transport problem, whose +complexity depends solely on the number of Gaussian components in the GMMs. +This paper aims to extend MW by introducing new Gromov-type distances. These +distances are designed to be isometry-invariant in Euclidean spaces and are +applicable for comparing GMMs across spaces of different dimensions. Our first +contribution is the Mixture Gromov Wasserstein distance (MGW), which can be +viewed as a Gromovized version of MW. This new distance has a straightforward +discrete formulation, making it highly efficient for estimating distances +between GMMs in practical applications. To facilitate the derivation of a +transport plan between GMMs, we present a second distance, the Embedded +Wasserstein distance (EW). This distance turns out to be closely related to +several recent alternatives to Gromov-Wasserstein. We show that EW can be +adapted to derive a distance as well as optimal transportation plans between +GMMs. We demonstrate the efficiency of these newly proposed distances on medium +to large-scale problems, including shape matching and hyperspectral image color +transfer. + +
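+ For context, the MW construction referenced above reduces to a small discrete optimal transport problem whose ground cost is the closed-form 2-Wasserstein distance between Gaussian components. Written from memory as a recap (the paper's MGW and EW variants are not reproduced here):
+
+ \[
+ W_2^2\big(\mathcal N(m_0,\Sigma_0),\,\mathcal N(m_1,\Sigma_1)\big)
+   = \|m_0-m_1\|^2
+   + \operatorname{tr}\!\Big(\Sigma_0+\Sigma_1-2\big(\Sigma_1^{1/2}\Sigma_0\Sigma_1^{1/2}\big)^{1/2}\Big),
+ \qquad
+ \mathrm{MW}_2^2(\mu_0,\mu_1)
+   = \min_{w\in\Pi(\pi_0,\pi_1)}\;\sum_{k,l} w_{k,l}\, W_2^2\big(\mu_0^k,\mu_1^l\big),
+ \]
+
+ where $\mu_i=\sum_k \pi_i^k\,\mathcal N(m_i^k,\Sigma_i^k)$ are the two GMMs and $\Pi(\pi_0,\pi_1)$ is the set of couplings of their component weights.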
+
+ comment: preprint +
+
+
+
+
+ + ♻ ☆ Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation ICLR 2024 + + +
+ While Large Language Models (LLMs) are the dominant models for generative +tasks in language, they do not perform as well as diffusion models on image and +video generation. To effectively use LLMs for visual generation, one crucial +component is the visual tokenizer that maps pixel-space inputs to discrete +tokens appropriate for LLM learning. In this paper, we introduce MAGVIT-v2, a +video tokenizer designed to generate concise and expressive tokens for both +videos and images using a common token vocabulary. Equipped with this new +tokenizer, we show that LLMs outperform diffusion models on standard image and +video generation benchmarks including ImageNet and Kinetics. In addition, we +demonstrate that our tokenizer surpasses the previously top-performing video +tokenizer on two more tasks: (1) video compression comparable to the +next-generation video codec (VVC) according to human evaluations, and (2) +learning effective representations for action recognition tasks. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ SERNet-Former: Semantic Segmentation by Efficient Residual Network with + Attention-Boosting Gates and Attention-Fusion Networks + + +
+ Improving the efficiency of state-of-the-art methods in semantic segmentation +requires overcoming the increasing computational cost as well as issues such as +fusing semantic information from global and local contexts. Based on the recent +success and problems that convolutional neural networks (CNNs) encounter in +semantic segmentation, this research proposes an encoder-decoder architecture +with a unique efficient residual network, Efficient-ResNet. Attention-boosting +gates (AbGs) and attention-boosting modules (AbMs) are deployed by aiming to +fuse the equivariant and feature-based semantic information with the equivalent +sizes of the output of global context of the efficient residual network in the +encoder. Respectively, the decoder network is developed with the additional +attention-fusion networks (AfNs) inspired by AbM. AfNs are designed to improve +the efficiency in the one-to-one conversion of the semantic information by +deploying additional convolution layers in the decoder part. Our network is +tested on the challenging CamVid and Cityscapes datasets, and the proposed +methods reveal significant improvements on the residual networks. To the best +of our knowledge, the developed network, SERNet-Former, achieves +state-of-the-art results (84.62 % mean IoU) on CamVid dataset and challenging +results (87.35 % mean IoU) on Cityscapes validation dataset. + +
+
+
+
+
+ + ♻ ☆ Learning to Count without Annotations CVPR'24 + + +
+ While recent supervised methods for reference-based object counting continue +to improve the performance on benchmark datasets, they have to rely on small +datasets due to the cost associated with manually annotating dozens of objects +in images. We propose UnCounTR, a model that can learn this task without +requiring any manual annotations. To this end, we construct "Self-Collages", +images with various pasted objects as training samples, that provide a rich +learning signal covering arbitrary object types and counts. Our method builds +on existing unsupervised representations and segmentation techniques to +successfully demonstrate for the first time the ability of reference-based +counting without manual supervision. Our experiments show that our method not +only outperforms simple baselines and generic models such as FasterRCNN and +DETR, but also matches the performance of supervised counting models in some +domains. + +
+
+ comment: Accepted at CVPR'24. Code available at + https://github.com/lukasknobel/SelfCollages +
+
+
+
+
+ + ♻ ☆ LipSim: A Provably Robust Perceptual Similarity Metric + + +
+ Recent years have seen growing interest in developing and applying perceptual +similarity metrics. Research has shown the superiority of perceptual metrics +over pixel-wise metrics in aligning with human perception and serving as a +proxy for the human visual system. On the other hand, as perceptual metrics +rely on neural networks, there is a growing concern regarding their resilience, +given the established vulnerability of neural networks to adversarial attacks. +It is indeed logical to infer that perceptual metrics may inherit both the +strengths and shortcomings of neural networks. In this work, we demonstrate the +vulnerability of state-of-the-art perceptual similarity metrics based on an +ensemble of ViT-based feature extractors to adversarial attacks. We then +propose a framework to train a robust perceptual similarity metric called +LipSim (Lipschitz Similarity Metric) with provable guarantees. By leveraging +1-Lipschitz neural networks as the backbone, LipSim provides guarded areas +around each data point and certificates for all perturbations within an +$\ell_2$ ball. Finally, a comprehensive set of experiments shows the +performance of LipSim in terms of natural and certified scores and on the image +retrieval application. The code is available at +https://github.com/SaraGhazanfari/LipSim. + +
+
+
+
+
+ + ♻ ☆ RNb-NeuS: Reflectance and Normal-based Multi-View 3D Reconstruction CVPR 2024 + + +
+ This paper introduces a versatile paradigm for integrating multi-view +reflectance (optional) and normal maps acquired through photometric stereo. Our +approach employs a pixel-wise joint re-parameterization of reflectance and +normal, considering them as a vector of radiances rendered under simulated, +varying illumination. This re-parameterization enables the seamless integration +of reflectance and normal maps as input data in neural volume rendering-based +3D reconstruction while preserving a single optimization objective. In +contrast, recent multi-view photometric stereo (MVPS) methods depend on +multiple, potentially conflicting objectives. Despite its apparent simplicity, +our proposed approach outperforms state-of-the-art approaches in MVPS +benchmarks across F-score, Chamfer distance, and mean angular error metrics. +Notably, it significantly improves the detailed 3D reconstruction of areas with +high curvature or low visibility. + +
+
+ comment: 14 pages, 13 figures, 7 tables. Accepted to CVPR 2024. The project + page can be accessed via + https://robinbruneau.github.io/publications/rnb_neus.html. The source code is + available at https://github.com/bbrument/RNb-NeuS +
+
+
+
+
+ + ♻ ☆ A Strong Baseline for Point Cloud Registration via Direct Superpoints + Matching + + +
+ Deep neural networks endow the downsampled superpoints with highly +discriminative feature representations. Previous dominant point cloud +registration approaches match these feature representations as the first step, +e.g., using the Sinkhorn algorithm. A RANSAC-like method is then usually +adopted as a post-processing refinement to filter the outliers. Another dominant +method is to directly predict the superpoint matchings using learned MLP +layers. Both of them have drawbacks: RANSAC-based methods are computationally +intensive and prediction-based methods suffer from outputting non-existent +points in the point cloud. In this paper, we propose a straightforward and +effective baseline to find correspondences of superpoints in a global matching +manner. We employ the normalized matching scores as weights for each +correspondence, allowing us to reject the outliers and further weigh the remaining +inliers when fitting the transformation matrix without relying on the +cumbersome RANSAC. Moreover, the entire model can be trained in an end-to-end +fashion, leading to better accuracy. Our simple yet effective baseline shows +comparable or even better results than state-of-the-art methods on three +datasets including ModelNet, 3DMatch, and KITTI. We do not advocate our +approach to be \emph{the} solution for point cloud registration but use the +results to emphasize the role of the matching strategy for point cloud +registration. The code and models are available at +https://github.com/neu-vi/Superpoints_Registration. + +
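+ The score-weighted fitting step described above is essentially a weighted Procrustes/Kabsch solve. A minimal numpy sketch under that interpretation; the correspondence extraction and the paper's exact weighting are not reproduced.
+
+ import numpy as np
+
+ def weighted_rigid_fit(src, dst, weights):
+     """Closed-form weighted least-squares rigid transform: matching scores act as
+     per-correspondence weights, so unreliable matches are down-weighted instead of
+     being filtered with RANSAC. src, dst: (N, 3); weights: (N,)."""
+     w = weights / weights.sum()
+     src_c = src - (w[:, None] * src).sum(0)           # remove weighted centroids
+     dst_c = dst - (w[:, None] * dst).sum(0)
+     H = (w[:, None] * src_c).T @ dst_c                # weighted cross-covariance (3, 3)
+     U, _, Vt = np.linalg.svd(H)
+     D = np.diag([1.0, 1.0, np.sign(np.linalg.det(Vt.T @ U.T))])  # guard against reflections
+     R = Vt.T @ D @ U.T
+     t = (w[:, None] * dst).sum(0) - R @ (w[:, None] * src).sum(0)
+     return R, t                                        # R @ src + t approximates dst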
+
+
+
+
+ + ♻ ☆ VicTR: Video-conditioned Text Representations for Activity Recognition CVPR 2024 + + +
+ Vision-Language models (VLMs) have excelled in the image-domain -- especially +in zero-shot settings -- thanks to the availability of vast pretraining data +(i.e., paired image-text samples). However for videos, such paired data is not +as abundant. Therefore, video-VLMs are usually designed by adapting pretrained +image-VLMs to the video-domain, instead of training from scratch. All such +recipes rely on augmenting visual embeddings with temporal information (i.e., +image $\rightarrow$ video), often keeping text embeddings unchanged or even +being discarded. In this paper, we argue the contrary, that better video-VLMs +can be designed by focusing more on augmenting text, rather than visual +information. More specifically, we introduce Video-conditioned Text +Representations (VicTR): a form of text embeddings optimized w.r.t. visual +embeddings, creating a more-flexible contrastive latent space. Our model can +further make use of freely-available semantic information, in the form of +visually-grounded auxiliary text (e.g. object or scene information). We +evaluate our model on few-shot, zero-shot (HMDB-51, UCF-101), short-form +(Kinetics-400) and long-form (Charades) activity recognition benchmarks, +showing strong performance among video-VLMs. + +
+
+ comment: To appear at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Towards Low-Energy Adaptive Personalization for Resource-Constrained + Devices + + +
+ The personalization of machine learning (ML) models to address data drift is +a significant challenge in the context of Internet of Things (IoT) +applications. Presently, most approaches focus on fine-tuning either the full +base model or its last few layers to adapt to new data, while often neglecting +energy costs. However, various types of data drift exist, and fine-tuning the +full base model or the last few layers may not result in optimal performance in +certain scenarios. We propose Target Block Fine-Tuning (TBFT), a low-energy +adaptive personalization framework designed for resource-constrained devices. +We categorize data drift and personalization into three types: input-level, +feature-level, and output-level. For each type, we fine-tune different blocks +of the model to achieve optimal performance with reduced energy costs. +Specifically, input-, feature-, and output-level correspond to fine-tuning the +front, middle, and rear blocks of the model. We evaluate TBFT on a ResNet +model, three datasets, three different training sizes, and a Raspberry Pi. +Compared with the $Block Avg$, where each block is fine-tuned individually and +their performance improvements are averaged, TBFT exhibits an improvement in +model accuracy by an average of 15.30% whilst saving 41.57% energy consumption +on average compared with full fine-tuning. + +
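+ A small sketch of the block-selective fine-tuning idea: freeze the whole model, then unfreeze only the block matching the drift type. The block names below assume a torchvision-style ResNet, and the front/middle/rear grouping follows the abstract; the exact split used by TBFT may differ.
+
+ import torch.nn as nn
+
+ # Hypothetical block-to-drift mapping for a torchvision-style ResNet backbone.
+ BLOCKS = {"front": ["conv1", "bn1", "layer1"],   # input-level drift
+           "middle": ["layer2", "layer3"],        # feature-level drift
+           "rear": ["layer4", "fc"]}              # output-level drift
+
+ def select_target_block(model: nn.Module, drift_type: str):
+     """Freeze all parameters, unfreeze only the target block, and return the
+     trainable parameters to hand to the optimizer."""
+     for p in model.parameters():
+         p.requires_grad = False
+     for name, module in model.named_children():
+         if name in BLOCKS[drift_type]:
+             for p in module.parameters():
+                 p.requires_grad = True
+     return [p for p in model.parameters() if p.requires_grad]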
+
+ comment: Accepted to The 4th Workshop on Machine Learning and Systems + (EuroMLSys '24) +
+
+
+
+
+ + ♻ ☆ GlitchBench: Can large multimodal models detect video game glitches? CVPR 2024 + + +
+ Large multimodal models (LMMs) have evolved from large language models (LLMs) +to integrate multiple input modalities, such as visual inputs. This integration +augments the capacity of LLMs for tasks requiring visual comprehension and +reasoning. However, the extent and limitations of their enhanced abilities are +not fully understood, especially when it comes to real-world tasks. To address +this gap, we introduce GlitchBench, a novel benchmark derived from video game +quality assurance tasks, to test and evaluate the reasoning capabilities of +LMMs. Our benchmark is curated from a variety of unusual and glitched scenarios +from video games and aims to challenge both the visual and linguistic reasoning +powers of LMMs in detecting and interpreting out-of-the-ordinary events. We +evaluate multiple state-of-the-art LMMs, and we show that GlitchBench presents +a new challenge for these models. Code and data are available at: +https://glitchbench.github.io/ + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Rapid Motor Adaptation for Robotic Manipulator Arms CVPR 2024 + + +
+ Developing generalizable manipulation skills is a core challenge in embodied +AI. This includes generalization across diverse task configurations, +encompassing variations in object shape, density, friction coefficient, and +external disturbances such as forces applied to the robot. Rapid Motor +Adaptation (RMA) offers a promising solution to this challenge. It posits that +essential hidden variables influencing an agent's task performance, such as +object mass and shape, can be effectively inferred from the agent's action and +proprioceptive history. Drawing inspiration from RMA in locomotion and in-hand +rotation, we use depth perception to develop agents tailored for rapid motor +adaptation in a variety of manipulation tasks. We evaluated our agents on four +challenging tasks from the Maniskill2 benchmark, namely pick-and-place +operations with hundreds of objects from the YCB and EGAD datasets, peg +insertion with precise position and orientation, and operating a variety of +faucets and handles, with customized environment variations. Empirical results +demonstrate that our agents surpass state-of-the-art methods like automatic +domain randomization and vision-based policies, obtaining better generalization +performance and sample efficiency. + +
+
+ comment: Accepted at CVPR 2024. 12 pages +
+
+
+
+
+ + ♻ ☆ FlashAvatar: High-fidelity Head Avatar with Efficient Gaussian Embedding + + +
+ We propose FlashAvatar, a novel and lightweight 3D animatable avatar +representation that could reconstruct a digital avatar from a short monocular +video sequence in minutes and render high-fidelity photo-realistic images at +300FPS on a consumer-grade GPU. To achieve this, we maintain a uniform 3D +Gaussian field embedded in the surface of a parametric face model and learn +extra spatial offset to model non-surface regions and subtle facial details. +While full use of geometric priors can capture high-frequency facial details +and preserve exaggerated expressions, proper initialization can help reduce the +number of Gaussians, thus enabling super-fast rendering speed. Extensive +experimental results demonstrate that FlashAvatar outperforms existing works +regarding visual quality and personalized details and is almost an order of +magnitude faster in rendering speed. Project page: +https://ustc3dv.github.io/FlashAvatar/ + +
+
+ comment: Project page: https://ustc3dv.github.io/FlashAvatar/ +
+
+
+
+
+ + ♻ ☆ Joint chest X-ray diagnosis and clinical visual attention prediction + with multi-stage cooperative learning: enhancing interpretability + + +
+ As deep learning has become the state-of-the-art for computer-assisted +diagnosis, interpretability of the automatic decisions is crucial for clinical +deployment. While various methods were proposed in this domain, visual +attention maps of clinicians during radiological screening offer a unique asset +to provide important insights and can potentially enhance the quality of +computer-assisted diagnosis. With this paper, we introduce a novel +deep-learning framework for joint disease diagnosis and prediction of +corresponding visual saliency maps for chest X-ray scans. Specifically, we +designed a novel dual-encoder multi-task UNet, which leverages both a +DenseNet201 backbone and a Residual and Squeeze-and-Excitation block-based +encoder to extract diverse features for saliency map prediction, and a +multi-scale feature-fusion classifier to perform disease classification. To +tackle the issue of asynchronous training schedules of individual tasks in +multi-task learning, we proposed a multi-stage cooperative learning strategy, +with contrastive learning for feature encoder pretraining to boost performance. +Experiments show that our proposed method outperformed existing techniques for +chest X-ray diagnosis and the quality of visual saliency map prediction. + +
+
+
+
+
+ + ♻ ☆ EAGLE: Eigen Aggregation Learning for Object-Centric Unsupervised + Semantic Segmentation + + +
+ Semantic segmentation has innately relied on extensive pixel-level annotated +data, leading to the emergence of unsupervised methodologies. Among them, +leveraging self-supervised Vision Transformers for unsupervised semantic +segmentation (USS) has been making steady progress with expressive deep +features. Yet, for semantically segmenting images with complex objects, a +predominant challenge remains: the lack of explicit object-level semantic +encoding in patch-level features. This technical limitation often leads to +inadequate segmentation of complex objects with diverse structures. To address +this gap, we present a novel approach, EAGLE, which emphasizes object-centric +representation learning for unsupervised semantic segmentation. Specifically, +we introduce EiCue, a spectral technique providing semantic and structural cues +through an eigenbasis derived from the semantic similarity matrix of deep image +features and color affinity from an image. Further, by incorporating our +object-centric contrastive loss with EiCue, we guide our model to learn +object-level representations with intra- and inter-image object-feature +consistency, thereby enhancing semantic accuracy. Extensive experiments on +COCO-Stuff, Cityscapes, and Potsdam-3 datasets demonstrate the state-of-the-art +USS results of EAGLE with accurate and consistent semantic segmentation across +complex scenes. + +
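+ A rough sketch in the spirit of the spectral cue described above: fuse deep-feature similarity with colour affinity and take the leading eigenvectors of the normalised graph Laplacian as structural cues. The fusion weight, colour kernel, and number of eigenvectors are illustrative assumptions, not EAGLE's exact EiCue construction.
+
+ import numpy as np
+
+ def spectral_cues(features, colors, k=8, alpha=0.5, sigma=0.1):
+     """features: (N, D) deep patch features; colors: (N, 3) patch colours.
+     Returns the k eigenvectors of the normalised Laplacian with smallest eigenvalues."""
+     f = features / np.linalg.norm(features, axis=1, keepdims=True)
+     sem = np.clip(f @ f.T, 0.0, None)                                   # semantic similarity (N, N)
+     col = np.exp(-np.square(colors[:, None] - colors[None]).sum(-1) / (2 * sigma ** 2))
+     W = alpha * sem + (1 - alpha) * col                                 # fused affinity graph
+     d = W.sum(1)
+     L = np.eye(len(W)) - W / np.sqrt(d[:, None] * d[None])              # normalised Laplacian
+     eigvals, eigvecs = np.linalg.eigh(L)                                # ascending eigenvalues
+     return eigvecs[:, :k]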
+
+
+
+
+ + ♻ ☆ Descriptor and Word Soups: Overcoming the Parameter Efficiency Accuracy + Tradeoff for Out-of-Distribution Few-shot Learning + + +
+ Over the past year, a large body of multimodal research has emerged around +zero-shot evaluation using GPT descriptors. These studies boost the zero-shot +accuracy of pretrained VL models with an ensemble of label-specific text +generated by GPT. A recent study, WaffleCLIP, demonstrated that similar +zero-shot accuracy can be achieved with an ensemble of random descriptors. +However, both zero-shot methods are un-trainable and consequently sub-optimal +when some few-shot out-of-distribution (OOD) training data is available. +Inspired by these prior works, we present two more flexible methods called +descriptor and word soups, which do not require an LLM at test time and can +leverage training data to increase OOD target accuracy. Descriptor soup +greedily selects a small set of textual descriptors using generic few-shot +training data, then calculates robust class embeddings using the selected +descriptors. Word soup greedily assembles a chain of words in a similar manner. +Compared to existing few-shot soft prompt tuning methods, word soup requires +fewer parameters by construction and less GPU memory, since it does not require +backpropagation. Both soups outperform current published few-shot methods, even +when combined with SoTA zero-shot methods, on cross-dataset and domain +generalization benchmarks. Compared with SoTA prompt and descriptor ensembling +methods, such as ProDA and WaffleCLIP, word soup achieves higher OOD accuracy +with fewer ensemble members. Please checkout our code: +github.com/Chris210634/word_soups + +
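+ The greedy forward selection behind descriptor soup can be summarised in a few lines. The sketch below assumes a score_fn that evaluates a candidate descriptor set on the generic few-shot training data and returns its accuracy; the stopping rule and maximum soup size are illustrative.
+
+ def greedy_descriptor_soup(candidates, score_fn, max_size=16):
+     """Greedy forward selection: repeatedly add the descriptor that most improves
+     the ensemble's few-shot accuracy, stopping when no candidate helps."""
+     soup, best = [], float("-inf")
+     while len(soup) < max_size:
+         gains = [(score_fn(soup + [c]), c) for c in candidates if c not in soup]
+         if not gains:
+             break
+         score, pick = max(gains, key=lambda t: t[0])
+         if score <= best:          # no remaining descriptor improves the soup
+             break
+         soup.append(pick)
+         best = score
+     return soup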
+
+
+
+
+ + ♻ ☆ LifelongMemory: Leveraging LLMs for Answering Queries in Long-form + Egocentric Videos + + +
+ In this paper we introduce LifelongMemory, a new framework for accessing +long-form egocentric videographic memory through natural language question +answering and retrieval. LifelongMemory generates concise video activity +descriptions of the camera wearer and leverages the zero-shot capabilities of +pretrained large language models to perform reasoning over long-form video +context. Furthermore, Lifelong Memory uses a confidence and explanation module +to produce confident, high-quality, and interpretable answers. Our approach +achieves state-of-the-art performance on the EgoSchema benchmark for question +answering and is highly competitive on the natural language query (NLQ) +challenge of Ego4D. Code is available at +https://github.com/Agentic-Learning-AI-Lab/lifelong-memory. + +
+
+
+
+
+ + ♻ ☆ DialogCC: An Automated Pipeline for Creating High-Quality Multi-Modal + Dialogue Dataset NAACL 2024 + + +
+ As sharing images in an instant message is a crucial factor, there has been +active research on learning an image-text multi-modal dialogue models. However, +training a well-generalized multi-modal dialogue model remains challenging due +to the low quality and limited diversity of images per dialogue in existing +multi-modal dialogue datasets. In this paper, we propose an automated pipeline +to construct a multi-modal dialogue dataset, ensuring both dialogue quality and +image diversity without requiring minimum human effort. In our pipeline, to +guarantee the coherence between images and dialogue, we prompt GPT-4 to infer +potential image-sharing moments - specifically, the utterance, speaker, +rationale, and image description. Furthermore, we leverage CLIP similarity to +maintain consistency between aligned multiple images to the utterance. Through +this pipeline, we introduce DialogCC, a high-quality and diverse multi-modal +dialogue dataset that surpasses existing datasets in terms of quality and +diversity in human evaluation. Our comprehensive experiments highlight that +when multi-modal dialogue models are trained using our dataset, their +generalization performance on unseen dialogue datasets is significantly +enhanced. We make our source code and dataset publicly available. + +
+
+ comment: NAACL 2024 +
+
+
+
+
+ + ♻ ☆ 3DInAction: Understanding Human Actions in 3D Point Clouds + + +
+ We propose a novel method for 3D point cloud action recognition. +Understanding human actions in RGB videos has been widely studied in recent +years, however, its 3D point cloud counterpart remains under-explored. This is +mostly due to the inherent limitation of the point cloud data modality -- lack +of structure, permutation invariance, and varying number of points -- which +makes it difficult to learn a spatio-temporal representation. To address this +limitation, we propose the 3DinAction pipeline that first estimates patches +moving in time (t-patches) as a key building block, alongside a hierarchical +architecture that learns an informative spatio-temporal representation. We show +that our method achieves improved performance on existing datasets, including +DFAUST and IKEA ASM. Code is publicly available at +https://github.com/sitzikbs/3dincaction. + +
+
+
+
+
+ + ♻ ☆ DragVideo: Interactive Drag-style Video Editing + + +
+ Video generation models have shown their superior ability to generate +photo-realistic video. However, how to accurately control (or edit) the video +remains a formidable challenge. The main issues are: 1) how to perform direct +and accurate user control in editing; 2) how to execute edits such as changing +shape, expression, and layout without unsightly distortion and artifacts to the +edited content; and 3) how to maintain spatio-temporal consistency of video +after editing. To address the above issues, we propose DragVideo, a general +drag-style video editing framework. Inspired by DragGAN, DragVideo addresses +issues 1) and 2) by proposing the drag-style video latent optimization method, +which gives the desired control by updating the noisy video latent according to drag +instructions through a video-level drag objective function. We amend issue 3) by +integrating the video diffusion model with sample-specific LoRA and Mutual +Self-Attention in DragVideo to ensure the edited result is spatio-temporally +consistent. We also present a series of testing examples for drag-style video +editing and conduct extensive experiments across a wide array of challenging +editing tasks, such as motion and skeleton editing, underscoring that DragVideo +can edit video in an intuitive manner that is faithful to the user's intention, with +nearly unnoticeable distortion and artifacts, while maintaining spatio-temporal +consistency. Whereas traditional prompt-based video editing fails at the first two and +directly applying image drag editing fails at the last, +DragVideo's versatility and generality stand out. Github link: +https://github.com/RickySkywalker/DragVideo-Official. + +
+
+
+
+
+ + ♻ ☆ Self-learning Canonical Space for Multi-view 3D Human Pose Estimation + + +
+ Multi-view 3D human pose estimation is naturally superior to its single-view
+counterpart, benefiting from the more comprehensive information provided by
+images of multiple views. This information includes camera poses, 2D/3D human
+poses, and 3D geometry. However, accurate annotations of this information are
+hard to obtain, making it challenging to predict accurate 3D human pose from
+multi-view images. To deal with this issue, we propose a fully self-supervised
+framework, named cascaded multi-view aggregating network (CMANet), to construct
+a canonical parameter space to holistically integrate and exploit multi-view
+information. In our framework, the multi-view information is grouped into two
+categories: 1) intra-view information and 2) inter-view information.
+Accordingly, CMANet consists of two components: an intra-view module (IRV) and
+an inter-view module (IEV). IRV extracts the initial camera pose and 3D human
+pose of each view; IEV fuses complementary pose information and cross-view 3D
+geometry into a final 3D human pose. To facilitate the aggregation of the
+intra- and inter-view information, we define a canonical parameter space,
+described by the per-view camera pose and the human pose and shape parameters
+($\theta$ and $\beta$) of the SMPL model, and propose a two-stage learning
+procedure. In the first stage, IRV learns to estimate the camera pose and the
+view-dependent 3D human pose, supervised by the confident outputs of an
+off-the-shelf 2D keypoint detector. In the second stage, IRV is frozen and IEV
+further refines the camera pose and optimizes the 3D human pose by implicitly
+encoding the cross-view complement and the 3D geometry constraint, achieved by
+jointly fitting predicted multi-view 2D keypoints. Comprehensive experiments
+demonstrate the effectiveness of the proposed framework, modules, and learning
+strategy, and CMANet is superior to state-of-the-art methods in extensive
+quantitative and qualitative analysis.
+
+
+
+
+
+
+ + ♻ ☆ NOPE: Novel Object Pose Estimation from a Single Image CVPR 2024 + + +
+ The practicality of 3D object pose estimation remains limited for many +applications due to the need for prior knowledge of a 3D model and a training +period for new objects. To address this limitation, we propose an approach that +takes a single image of a new object as input and predicts the relative pose of +this object in new images without prior knowledge of the object's 3D model and +without requiring training time for new objects and categories. We achieve this +by training a model to directly predict discriminative embeddings for +viewpoints surrounding the object. This prediction is done using a simple U-Net +architecture with attention and conditioned on the desired pose, which yields +extremely fast inference. We compare our approach to state-of-the-art methods +and show it outperforms them both in terms of accuracy and robustness. Our +source code is publicly available at https://github.com/nv-nguyen/nope + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Gradient Reweighting: Towards Imbalanced Class-Incremental Learning CVPR 2024 + + +
+ Class-Incremental Learning (CIL) trains a model to continually recognize new
+classes from non-stationary data while retaining learned knowledge. A major
+challenge of CIL arises when it is applied to real-world data characterized by
+non-uniform distributions, which introduces a dual imbalance problem involving
+(i) disparities between stored exemplars of old tasks and new class data
+(inter-phase imbalance), and (ii) severe class imbalances within each
+individual task (intra-phase imbalance). We show that this dual imbalance issue
+causes skewed gradient updates with biased weights in FC layers, thus inducing
+over/under-fitting and catastrophic forgetting in CIL. Our method addresses it
+by reweighting the gradients towards balanced optimization and unbiased
+classifier learning. Additionally, we observe imbalanced forgetting where,
+paradoxically, the instance-rich classes suffer higher performance degradation
+during CIL due to a larger amount of training data becoming unavailable in
+subsequent learning phases. To tackle this, we further introduce a
+distribution-aware knowledge distillation loss to mitigate forgetting by
+aligning output logits proportionally with the distribution of lost training
+data. We validate our method on CIFAR-100, ImageNetSubset, and Food101 across
+various evaluation protocols and demonstrate consistent improvements compared
+to existing works, showing great potential to apply CIL in real-world scenarios
+with enhanced robustness and effectiveness.
+
+
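+ As a rough, hypothetical sketch of the gradient-reweighting idea (not the
+paper's exact formulation), the PyTorch snippet below rescales the per-class
+rows of the final classifier's gradient by inverse effective class frequency,
+so that head classes no longer dominate the update; the weighting scheme and
+class counts here are illustrative assumptions.
+
+```python
+import torch
+import torch.nn as nn
+
+def make_class_weights(counts, beta=0.999):
+    """Effective-number style weights: rare classes get larger factors."""
+    counts = torch.as_tensor(counts, dtype=torch.float)
+    effective = (1.0 - beta ** counts) / (1.0 - beta)
+    w = 1.0 / effective
+    return w / w.mean()
+
+class ReweightedClassifier(nn.Linear):
+    """Final FC layer whose per-class gradient rows are rescaled."""
+    def __init__(self, in_features, num_classes, class_weights):
+        super().__init__(in_features, num_classes)
+        self.register_buffer("gw", class_weights.view(-1, 1))
+        self.weight.register_hook(lambda g: g * self.gw)        # rows = classes
+        self.bias.register_hook(lambda g: g * self.gw.view(-1))
+
+# toy usage: 3 classes with heavily imbalanced sample counts
+fc = ReweightedClassifier(128, 3, make_class_weights([500, 50, 5]))
+logits = fc(torch.randn(16, 128))
+loss = nn.functional.cross_entropy(logits, torch.randint(0, 3, (16,)))
+loss.backward()      # gradients reaching fc.weight / fc.bias are rebalanced
+```
+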
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SHINOBI: Shape and Illumination using Neural Object Decomposition via + BRDF Optimization In-the-wild CVPR 2024 + + +
+ We present SHINOBI, an end-to-end framework for the reconstruction of shape, +material, and illumination from object images captured with varying lighting, +pose, and background. Inverse rendering of an object based on unconstrained +image collections is a long-standing challenge in computer vision and graphics +and requires a joint optimization over shape, radiance, and pose. We show that +an implicit shape representation based on a multi-resolution hash encoding +enables faster and robust shape reconstruction with joint camera alignment +optimization that outperforms prior work. Further, to enable the editing of +illumination and object reflectance (i.e. material) we jointly optimize BRDF +and illumination together with the object's shape. Our method is class-agnostic +and works on in-the-wild image collections of objects to produce relightable 3D +assets for several use cases such as AR/VR, movies, games, etc. Project page: +https://shinobi.aengelhardt.com Video: +https://www.youtube.com/watch?v=iFENQ6AcYd8&feature=youtu.be + +
+
+ comment: Accepted by IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR 2024). Updated supplementary material and acknowledgements +
+
+
+
+
+ + ♻ ☆ Task2Box: Box Embeddings for Modeling Asymmetric Task Relationships + + +
+ Modeling and visualizing relationships between tasks or datasets is an
+important step towards solving various meta-tasks such as dataset discovery,
+multi-tasking, and transfer learning. However, many relationships, such as
+containment and transferability, are naturally asymmetric, and current
+approaches for representation and visualization (e.g., t-SNE) do not readily
+support this. We propose Task2Box, an approach to represent tasks using box
+embeddings -- axis-aligned hyperrectangles in low dimensional spaces -- that
+can capture asymmetric relationships between them through volumetric overlaps.
+We show that Task2Box accurately predicts unseen hierarchical relationships
+between nodes in ImageNet and iNaturalist datasets, as well as transferability
+between tasks in the Taskonomy benchmark. We also show that box embeddings
+estimated from task representations (e.g., CLIP, Task2Vec, or attribute based)
+can be used to predict relationships between unseen tasks more accurately than
+classifiers trained on the same representations, as well as handcrafted
+asymmetric distances (e.g., KL divergence). This suggests that low-dimensional
+box embeddings can effectively capture these task relationships and have the
+added advantage of being interpretable. We use the approach to visualize
+relationships among publicly available image classification datasets on the
+popular dataset hosting platform Hugging Face.
+
+
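+ The volumetric-overlap idea behind box embeddings is easy to illustrate. The
+sketch below (not the paper's learned parameterization) scores how much one
+axis-aligned box is contained in another: containment of a "dogs" box inside an
+"animals" box scores near 1 while the reverse scores low, which is exactly the
+asymmetry that symmetric distances cannot express.
+
+```python
+import torch
+
+def box_volume(lo, hi):
+    return torch.clamp(hi - lo, min=0).prod(dim=-1)
+
+def containment_score(lo_a, hi_a, lo_b, hi_b):
+    """Asymmetric score for 'A is inside B': vol(A intersect B) / vol(A)."""
+    inter_lo = torch.maximum(lo_a, lo_b)
+    inter_hi = torch.minimum(hi_a, hi_b)
+    return box_volume(inter_lo, inter_hi) / box_volume(lo_a, hi_a).clamp(min=1e-8)
+
+# toy 2-D boxes: a "dogs" box sits inside an "animals" box
+dogs_lo, dogs_hi = torch.tensor([0.2, 0.2]), torch.tensor([0.4, 0.4])
+animals_lo, animals_hi = torch.tensor([0.0, 0.0]), torch.tensor([1.0, 1.0])
+print(containment_score(dogs_lo, dogs_hi, animals_lo, animals_hi))   # ~1.0
+print(containment_score(animals_lo, animals_hi, dogs_lo, dogs_hi))   # ~0.04
+```
+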
+
+
+
+
+ + ♻ ☆ LoCoNet: Long-Short Context Network for Active Speaker Detection CVPR 2024 + + +
+ Active Speaker Detection (ASD) aims to identify who is speaking in each frame
+of a video. ASD reasons over audio and visual information from two contexts:
+long-term intra-speaker context and short-term inter-speaker context. Long-term
+intra-speaker context models the temporal dependencies of the same speaker,
+while short-term inter-speaker context models the interactions of speakers in
+the same scene. These two contexts are complementary to each other and can help
+infer the active speaker. Motivated by these observations, we propose LoCoNet,
+a simple yet effective Long-Short Context Network that models the long-term
+intra-speaker context and the short-term inter-speaker context. We use
+self-attention to model long-term intra-speaker context due to its
+effectiveness in modeling long-range dependencies, and convolutional blocks
+that capture local patterns to model short-term inter-speaker context.
+Extensive experiments show that LoCoNet achieves state-of-the-art performance
+on multiple datasets, achieving an mAP of 95.2% (+1.1%) on AVA-ActiveSpeaker,
+68.1% (+22%) on the Columbia dataset, 97.2% (+2.8%) on the Talkies dataset, and
+59.7% (+8.0%) on the Ego4D dataset. Moreover, in challenging cases where
+multiple speakers are present, or the face of the active speaker is much
+smaller than the other faces in the same scene, LoCoNet outperforms previous
+state-of-the-art methods by 3.4% on the AVA-ActiveSpeaker dataset. The code
+will be released at https://github.com/SJTUwxz/LoCoNet_ASD.
+
+
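+ The attention-plus-convolution pairing can be sketched as a toy PyTorch
+block. Note that this is only an illustration of the long/short split and not
+LoCoNet's actual architecture, which additionally mixes information across
+candidate speakers for the short-term inter-speaker context.
+
+```python
+import torch
+import torch.nn as nn
+
+class LongShortContext(nn.Module):
+    """Toy block: temporal self-attention for long-term context plus a 1-D
+    convolution for short-term local context (illustration only)."""
+    def __init__(self, dim=128, heads=4, kernel=3):
+        super().__init__()
+        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+        self.conv = nn.Conv1d(dim, dim, kernel, padding=kernel // 2)
+
+    def forward(self, x):                       # x: (batch, time, dim)
+        long_ctx, _ = self.attn(x, x, x)
+        short_ctx = self.conv(x.transpose(1, 2)).transpose(1, 2)
+        return x + long_ctx + short_ctx
+
+features = torch.randn(2, 200, 128)             # per-frame audio-visual features
+out = LongShortContext()(features)              # (2, 200, 128)
+```
+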
+
+ comment: accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Tiny Machine Learning: Progress and Futures + + +
+ Tiny Machine Learning (TinyML) is a new frontier of machine learning. By +squeezing deep learning models into billions of IoT devices and +microcontrollers (MCUs), we expand the scope of AI applications and enable +ubiquitous intelligence. However, TinyML is challenging due to hardware +constraints: the tiny memory resource makes it difficult to hold deep learning +models designed for cloud and mobile platforms. There is also limited compiler +and inference engine support for bare-metal devices. Therefore, we need to +co-design the algorithm and system stack to enable TinyML. In this review, we +will first discuss the definition, challenges, and applications of TinyML. We +then survey the recent progress in TinyML and deep learning on MCUs. Next, we +will introduce MCUNet, showing how we can achieve ImageNet-scale AI +applications on IoT devices with system-algorithm co-design. We will further +extend the solution from inference to training and introduce tiny on-device +training techniques. Finally, we present future directions in this area. +Today's large model might be tomorrow's tiny model. The scope of TinyML should +evolve and adapt over time. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2206.15472 +
+
+
+
+
+ + ♻ ☆ FocusMAE: Gallbladder Cancer Detection from Ultrasound Videos with + Focused Masked Autoencoders CVPR 2024 + + +
+ In recent years, automated Gallbladder Cancer (GBC) detection has gained the +attention of researchers. Current state-of-the-art (SOTA) methodologies relying +on ultrasound sonography (US) images exhibit limited generalization, +emphasizing the need for transformative approaches. We observe that individual +US frames may lack sufficient information to capture disease manifestation. +This study advocates for a paradigm shift towards video-based GBC detection, +leveraging the inherent advantages of spatiotemporal representations. Employing +the Masked Autoencoder (MAE) for representation learning, we address +shortcomings in conventional image-based methods. We propose a novel design +called FocusMAE to systematically bias the selection of masking tokens from +high-information regions, fostering a more refined representation of +malignancy. Additionally, we contribute the most extensive US video dataset for +GBC detection. We also note that, this is the first study on US video-based GBC +detection. We validate the proposed methods on the curated dataset, and report +a new state-of-the-art (SOTA) accuracy of 96.4% for the GBC detection problem, +against an accuracy of 84% by current Image-based SOTA - GBCNet, and RadFormer, +and 94.7% by Video-based SOTA - AdaMAE. We further demonstrate the generality +of the proposed FocusMAE on a public CT-based Covid detection dataset, +reporting an improvement in accuracy by 3.3% over current baselines. The source +code and pretrained models are available at: +https://gbc-iitd.github.io/focusmae + +
+
+ comment: To Appear at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Explaining latent representations of generative models with large + multimodal models ICLR 2024 + + +
+ Learning interpretable representations of data generative latent factors is
+an important topic for the development of artificial intelligence. With the
+rise of large multimodal models, it is now possible to align images with text
+to generate answers. In this work, we propose a framework to comprehensively
+explain each latent variable in generative models using a large multimodal
+model. We further measure the uncertainty of our generated explanations,
+quantitatively evaluate the performance of explanation generation among
+multiple large multimodal models, and qualitatively visualize the variations of
+each latent variable to learn the disentanglement effects of different
+generative models on explanations. Finally, we discuss the explanatory
+capabilities and limitations of state-of-the-art large multimodal models.
+
+
+
+ comment: ICLR 2024 Workshop Paper on Reliable and Responsible Foundation + Models +
+
+
+
+
+ + ♻ ☆ Versatile Medical Image Segmentation Learned from Multi-Source Datasets + via Model Self-Disambiguation + + +
+ A versatile medical image segmentation model applicable to images acquired +with diverse equipment and protocols can facilitate model deployment and +maintenance. However, building such a model typically demands a large, diverse, +and fully annotated dataset, which is challenging to obtain due to the +labor-intensive nature of data curation. To address this challenge, we propose +a cost-effective alternative that harnesses multi-source data with only partial +or sparse segmentation labels for training, substantially reducing the cost of +developing a versatile model. We devise strategies for model +self-disambiguation, prior knowledge incorporation, and imbalance mitigation to +tackle challenges associated with inconsistently labeled multi-source data, +including label ambiguity and modality, dataset, and class imbalances. +Experimental results on a multi-modal dataset compiled from eight different +sources for abdominal structure segmentation have demonstrated the +effectiveness and superior performance of our method compared to +state-of-the-art alternative approaches. We anticipate that its cost-saving +features, which optimize the utilization of existing annotated data and reduce +annotation efforts for new data, will have a significant impact in the field. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Euclidean and Affine Curve Reconstruction + + +
+ We consider practical aspects of reconstructing planar curves with prescribed +Euclidean or affine curvatures. These curvatures are invariant under the +special Euclidean group and the equi-affine groups, respectively, and play an +important role in computer vision and shape analysis. We discuss and implement +algorithms for such reconstruction, and give estimates on how close +reconstructed curves are relative to the closeness of their curvatures in +appropriate metrics. Several illustrative examples are provided. + +
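+ For the Euclidean case, a planar curve can be recovered from its curvature by
+integrating the planar Frenet equations. The snippet below is a minimal
+forward-Euler sketch (the paper discusses more careful numerical schemes and
+the affine case), checked on a constant-curvature example that should close
+into a unit circle.
+
+```python
+import numpy as np
+
+def reconstruct_from_curvature(kappa, ds, theta0=0.0, x0=0.0, y0=0.0):
+    """Forward-Euler integration of the planar Frenet equations:
+    theta' = kappa, x' = cos(theta), y' = sin(theta)."""
+    theta = theta0 + np.cumsum(kappa) * ds      # tangent angle along the curve
+    x = x0 + np.cumsum(np.cos(theta)) * ds
+    y = y0 + np.cumsum(np.sin(theta)) * ds
+    return x, y
+
+# constant curvature 1 over arc length 2*pi should close into a unit circle
+s = np.linspace(0.0, 2.0 * np.pi, 2000)
+x, y = reconstruct_from_curvature(np.ones_like(s), s[1] - s[0])
+print(np.hypot(x[-1] - x[0], y[-1] - y[0]))     # small closure gap
+```
+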
+
+ comment: This paper is a result of an REU project conducted at the North + Carolina State University in the Summer and Fall 2020. This version has + several minor corrections +
+
+
+
+
+ + ♻ ☆ AVID: Any-Length Video Inpainting with Diffusion Model + + +
+ Recent advances in diffusion models have successfully enabled text-guided +image inpainting. While it seems straightforward to extend such editing +capability into the video domain, there have been fewer works regarding +text-guided video inpainting. Given a video, a masked region at its initial +frame, and an editing prompt, it requires a model to do infilling at each frame +following the editing guidance while keeping the out-of-mask region intact. +There are three main challenges in text-guided video inpainting: ($i$) temporal +consistency of the edited video, ($ii$) supporting different inpainting types +at different structural fidelity levels, and ($iii$) dealing with variable +video length. To address these challenges, we introduce Any-Length Video +Inpainting with Diffusion Model, dubbed as AVID. At its core, our model is +equipped with effective motion modules and adjustable structure guidance, for +fixed-length video inpainting. Building on top of that, we propose a novel +Temporal MultiDiffusion sampling pipeline with a middle-frame attention +guidance mechanism, facilitating the generation of videos with any desired +duration. Our comprehensive experiments show our model can robustly deal with +various inpainting types at different video duration ranges, with high quality. +More visualization results are made publicly available at +https://zhang-zx.github.io/AVID/ . + +
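+ The any-length generation idea can be sketched as overlap-averaged windowed
+denoising: a fixed-length denoiser is applied to overlapping temporal windows
+and the overlapping predictions are averaged at each step. The snippet below
+shows only this overlap-averaging skeleton under simplifying assumptions
+(divisible stride, no middle-frame attention guidance), so it is an
+illustration rather than the paper's Temporal MultiDiffusion.
+
+```python
+import torch
+
+def multidiffusion_step(latents, denoise_window, window=16, stride=8):
+    """One denoising step for an arbitrarily long latent video: run a
+    fixed-length denoiser on overlapping temporal windows and average
+    the predictions where windows overlap."""
+    T = latents.shape[0]                        # latents: (T, C, H, W)
+    assert T >= window and (T - window) % stride == 0
+    out = torch.zeros_like(latents)
+    count = torch.zeros(T, 1, 1, 1)
+    for start in range(0, T - window + 1, stride):
+        idx = slice(start, start + window)
+        out[idx] += denoise_window(latents[idx])    # model sees `window` frames
+        count[idx] += 1
+    return out / count
+
+# toy usage with an identity "denoiser" on a 40-frame latent video
+video = torch.randn(40, 4, 32, 32)
+same = multidiffusion_step(video, lambda x: x)      # returns the input unchanged
+```
+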
+
+ comment: Project website: https://zhang-zx.github.io/AVID/ +
+
+
+
+
+ + ♻ ☆ Semi-Mamba-UNet: Pixel-Level Contrastive and Pixel-Level + Cross-Supervised Visual Mamba-based UNet for Semi-Supervised Medical Image + Segmentation + + +
+ Medical image segmentation is essential in diagnostics, treatment planning,
+and healthcare, with deep learning offering promising advancements. Notably,
+Convolutional Neural Networks (CNNs) excel at capturing local image features,
+whereas Vision Transformers (ViTs) adeptly model long-range dependencies
+through multi-head self-attention mechanisms. Despite their strengths, both
+CNNs and ViTs face challenges in efficiently processing long-range dependencies
+within medical images, often requiring substantial computational resources.
+This issue, combined with the high cost and limited availability of expert
+annotations, poses significant obstacles to achieving precise segmentation. To
+address these challenges, this paper introduces Semi-Mamba-UNet, which
+integrates a visual Mamba-based UNet architecture with a conventional UNet into
+a semi-supervised learning (SSL) framework. This innovative SSL approach
+leverages dual networks to jointly generate pseudo labels and cross-supervise
+each other, drawing inspiration from consistency regularization techniques.
+Furthermore, we introduce a self-supervised pixel-level contrastive learning
+strategy, employing a projector pair to further enhance feature learning
+capabilities. Our comprehensive evaluation on a publicly available MRI cardiac
+segmentation dataset, comparing against various SSL frameworks with different
+UNet-based segmentation networks, highlights the superior performance of
+Semi-Mamba-UNet. The source code has been made publicly accessible.
+
+
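+ The cross-supervision part of such a framework can be summarized in a few
+lines of PyTorch: on unlabeled images, each network is trained with the other
+network's hard pseudo labels. This is a minimal sketch under the stated
+assumptions; it omits the supervised loss, the pixel-level contrastive term,
+and the Mamba/UNet backbones themselves.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def cross_supervised_loss(logits_a, logits_b):
+    """Cross supervision on unlabeled images: each network is trained with the
+    other network's hard pseudo labels (detached, so no gradient flows back
+    through the label-producing branch)."""
+    pseudo_a = logits_a.argmax(dim=1).detach()
+    pseudo_b = logits_b.argmax(dim=1).detach()
+    return F.cross_entropy(logits_a, pseudo_b) + F.cross_entropy(logits_b, pseudo_a)
+
+# toy usage: two segmentation heads over a 4-class problem
+la = torch.randn(2, 4, 64, 64, requires_grad=True)   # e.g. Mamba-based UNet output
+lb = torch.randn(2, 4, 64, 64, requires_grad=True)   # e.g. conventional UNet output
+loss = cross_supervised_loss(la, lb)
+loss.backward()
+```
+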
+
+
+
+
+ + ♻ ☆ DiffAvatar: Simulation-Ready Garment Optimization with Differentiable + Simulation CVPR 2024 + + +
+ The realism of digital avatars is crucial in enabling telepresence
+applications with self-expression and customization. While physical simulations
+can produce realistic motions for clothed humans, they require high-quality
+garment assets with associated physical parameters for cloth simulations.
+However, manually creating these assets and calibrating their parameters is
+labor-intensive and requires specialized expertise. Current methods focus on
+reconstructing geometry, but don't generate complete assets for physics-based
+applications. To address this gap, we propose DiffAvatar, a novel approach that
+performs body and garment co-optimization using differentiable simulation. By
+integrating physical simulation into the optimization loop and accounting for
+the complex nonlinear behavior of cloth and its intricate interaction with the
+body, our framework recovers body and garment geometry and extracts important
+material parameters in a physically plausible way. Our experiments demonstrate
+that our approach generates realistic clothing and body shape suitable for
+downstream applications. We provide additional insights and results on our
+webpage: https://people.csail.mit.edu/liyifei/publication/diffavatar/
+
+
+
+ comment: CVPR 2024; Project page: + https://people.csail.mit.edu/liyifei/publication/diffavatar/ +
+
+
+
+
+ + ♻ ☆ Evaluating Text-to-Image Synthesis: Survey and Taxonomy of Image Quality + Metrics + + +
+ Recent advances in text-to-image synthesis enabled through a combination of +language and vision foundation models have led to a proliferation of the tools +available and an increased attention to the field. When conducting +text-to-image synthesis, a central goal is to ensure that the content between +text and image is aligned. As such, there exist numerous evaluation metrics +that aim to mimic human judgement. However, it is often unclear which metric to +use for evaluating text-to-image synthesis systems as their evaluation is +highly nuanced. In this work, we provide a comprehensive overview of existing +text-to-image evaluation metrics. Based on our findings, we propose a new +taxonomy for categorizing these metrics. Our taxonomy is grounded in the +assumption that there are two main quality criteria, namely compositionality +and generality, which ideally map to human preferences. Ultimately, we derive +guidelines for practitioners conducting text-to-image evaluation, discuss open +challenges of evaluation mechanisms, and surface limitations of current +metrics. + +
+
+ comment: preprint, 21 pages, 2 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Augmented Reality Warnings in Roadway Work Zones: Evaluating the Effect + of Modality on Worker Reaction Times + + +
+ Given the aging highway infrastructure requiring extensive rebuilding and
+enhancements, and the consequent rise in the number of work zones, there is an
+urgent need to develop advanced safety systems to protect workers. While
+Augmented Reality (AR) holds significant potential for delivering warnings to
+workers, its integration into roadway work zones remains relatively unexplored.
+The primary objective of this study is to improve safety measures within
+roadway work zones by conducting an extensive analysis of how different
+combinations of multimodal AR warnings influence the reaction times of workers.
+This paper addresses this gap through a series of experiments that aim to
+replicate the distinctive conditions of roadway work zones, both in real-world
+and virtual reality environments. Our approach comprises three key components:
+an advanced AR system prototype, a VR simulation of AR functionality within the
+work zone environment, and the Wizard of Oz technique to synchronize user
+experiences across experiments. To assess reaction times, we leverage both the
+simple reaction time (SRT) technique and an innovative vision-based metric that
+utilizes real-time pose estimation. By conducting five experiments in
+controlled outdoor work zones and indoor VR settings, our study provides
+valuable information on how various multimodal AR warnings impact workers'
+reaction times. Our findings also reveal the disparities in reaction times
+between VR simulations and real-world scenarios, thereby gauging VR's
+capability to mirror the dynamics of roadway work zones. Furthermore, our
+results substantiate the potential and reliability of vision-based reaction
+time measurements. These insights resonate well with those derived using the
+SRT technique, underscoring the viability of this approach for tangible
+real-world uses.
+
+
+
+
+
+
+ + ♻ ☆ Incorporating Geo-Diverse Knowledge into Prompting for Increased + Geographical Robustness in Object Recognition CVPR + + +
+ Existing object recognition models have been shown to lack robustness in +diverse geographical scenarios due to domain shifts in design and context. +Class representations need to be adapted to more accurately reflect an object +concept under these shifts. In the absence of training data from target +geographies, we hypothesize that geographically diverse descriptive knowledge +of categories can enhance robustness. For this purpose, we explore the +feasibility of probing a large language model for geography-based object +knowledge, and we examine the effects of integrating knowledge into zero-shot +and learnable soft prompting with CLIP. Within this exploration, we propose +geography knowledge regularization to ensure that soft prompts trained on a +source set of geographies generalize to an unseen target set. Accuracy gains +over prompting baselines on DollarStreet while training only on Europe data are +up to +2.8/1.2/1.6 on target data from Africa/Asia/Americas, and +4.6 overall +on the hardest classes. Competitive performance is shown vs. few-shot target +training, and analysis is provided to direct future study of geographical +robustness. + +
+
+ comment: To appear in IEEE/CVF Computer Vision and Pattern Recognition + Conference (CVPR), 2024 +
+
+
+
+
+ + ♻ ☆ Towards 3D Vision with Low-Cost Single-Photon Cameras + + +
+ We present a method for reconstructing 3D shape of arbitrary Lambertian +objects based on measurements by miniature, energy-efficient, low-cost +single-photon cameras. These cameras, operating as time resolved image sensors, +illuminate the scene with a very fast pulse of diffuse light and record the +shape of that pulse as it returns back from the scene at a high temporal +resolution. We propose to model this image formation process, account for its +non-idealities, and adapt neural rendering to reconstruct 3D geometry from a +set of spatially distributed sensors with known poses. We show that our +approach can successfully recover complex 3D shapes from simulated data. We +further demonstrate 3D object reconstruction from real-world captures, +utilizing measurements from a commodity proximity sensor. Our work draws a +connection between image-based modeling and active range scanning and is a step +towards 3D vision with single-photon cameras. + +
+
+
+
+
+ + ♻ ☆ SteinDreamer: Variance Reduction for Text-to-3D Score Distillation via + Stein Identity + + +
+ Score distillation has emerged as one of the most prevalent approaches for
+text-to-3D asset synthesis. Essentially, score distillation updates 3D
+parameters by lifting and back-propagating scores averaged over different
+views. In this paper, we reveal that the gradient estimation in score
+distillation inherently suffers from high variance. Through the lens of
+variance reduction, the effectiveness of SDS and VSD can be interpreted as
+applications of various control variates to the Monte Carlo estimator of the
+distilled score. Motivated by this rethinking and based on Stein's identity, we
+propose a more general solution to reduce variance for score distillation,
+termed Stein Score Distillation (SSD). SSD incorporates control variates
+constructed via Stein's identity, allowing for arbitrary baseline functions.
+This enables us to include flexible guidance priors and network architectures
+to explicitly optimize for variance reduction. In our experiments, the overall
+pipeline, dubbed SteinDreamer, is implemented by instantiating the control
+variate with a monocular depth estimator. The results suggest that SSD can
+effectively reduce the distillation variance and consistently improve visual
+quality for both object- and scene-level generation. Moreover, we demonstrate
+that SteinDreamer achieves faster convergence than existing methods due to more
+stable gradient updates.
+
+
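+ The control-variate principle the paper builds on is standard Monte Carlo
+variance reduction: subtract a correlated quantity with known mean, so the
+expectation is unchanged but the variance drops. The toy NumPy example below
+illustrates only this principle; in SSD the control variate is constructed
+from Stein's identity with a baseline function (e.g., a monocular depth
+estimator), which this sketch does not attempt to reproduce.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+x = rng.normal(size=200_000)
+
+f = np.exp(x)          # estimate E[exp(X)], X ~ N(0, 1); true value is exp(0.5)
+g = x                  # control variate with known mean E[g] = 0
+
+C = np.cov(f, g)
+c = C[0, 1] / C[1, 1]                      # (near-)optimal coefficient
+plain = f.mean()
+controlled = (f - c * (g - 0.0)).mean()    # same expectation, lower variance
+
+print(plain, controlled, np.exp(0.5))
+```
+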
+
+ comment: Project page: https://vita-group.github.io/SteinDreamer/ +
+
+
+
+
+ + ♻ ☆ Systematic comparison of semi-supervised and self-supervised learning + for medical image classification CVPR 2024 + + +
+ In typical medical image classification problems, labeled data is scarce
+while unlabeled data is more available. Semi-supervised learning and
+self-supervised learning are two different research directions that can improve
+accuracy by learning from extra unlabeled data. Recent methods from both
+directions have reported significant gains on traditional benchmarks. Yet past
+benchmarks do not focus on medical tasks and rarely compare self- and
+semi-supervised methods on an equal footing. Furthermore, past benchmarks often
+handle hyperparameter tuning suboptimally. First, they may not tune
+hyperparameters at all, leading to underfitting. Second, when tuning does
+occur, it often unrealistically uses a labeled validation set that is much
+larger than the training set. Therefore, currently published rankings might not
+always reflect practical utility. This study contributes a systematic
+evaluation of self- and semi-supervised methods with a unified experimental
+protocol intended to guide a practitioner with scarce overall labeled data and
+a limited compute budget. We answer two key questions: Can hyperparameter
+tuning be effective with realistic-sized validation sets? If so, when all
+methods are tuned well, which self- or semi-supervised methods achieve the best
+accuracy? Our study compares 13 representative semi- and self-supervised
+methods to strong labeled-set-only baselines on 4 medical datasets. From 20000+
+GPU hours of computation, we provide valuable best practices for
+resource-constrained practitioners: hyperparameter tuning is effective, and the
+semi-supervised method known as MixMatch delivers the most reliable gains
+across 4 datasets.
+
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SplatFace: Gaussian Splat Face Reconstruction Leveraging an Optimizable + Surface + + +
+ We present SplatFace, a novel Gaussian splatting framework designed for 3D +human face reconstruction without reliance on accurate pre-determined geometry. +Our method is designed to simultaneously deliver both high-quality novel view +rendering and accurate 3D mesh reconstructions. We incorporate a generic 3D +Morphable Model (3DMM) to provide a surface geometric structure, making it +possible to reconstruct faces with a limited set of input images. We introduce +a joint optimization strategy that refines both the Gaussians and the morphable +surface through a synergistic non-rigid alignment process. A novel distance +metric, splat-to-surface, is proposed to improve alignment by considering both +the Gaussian position and covariance. The surface information is also utilized +to incorporate a world-space densification process, resulting in superior +reconstruction quality. Our experimental analysis demonstrates that the +proposed method is competitive with both other Gaussian splatting techniques in +novel view synthesis and other 3D reconstruction methods in producing 3D face +meshes with high geometric precision. + +
+
+
+
+
+ + ♻ ☆ Taming Mode Collapse in Score Distillation for Text-to-3D Generation + + +
+ Despite the remarkable performance of score distillation in text-to-3D +generation, such techniques notoriously suffer from view inconsistency issues, +also known as "Janus" artifact, where the generated objects fake each view with +multiple front faces. Although empirically effective methods have approached +this problem via score debiasing or prompt engineering, a more rigorous +perspective to explain and tackle this problem remains elusive. In this paper, +we reveal that the existing score distillation-based text-to-3D generation +frameworks degenerate to maximal likelihood seeking on each view independently +and thus suffer from the mode collapse problem, manifesting as the Janus +artifact in practice. To tame mode collapse, we improve score distillation by +re-establishing the entropy term in the corresponding variational objective, +which is applied to the distribution of rendered images. Maximizing the entropy +encourages diversity among different views in generated 3D assets, thereby +mitigating the Janus problem. Based on this new objective, we derive a new +update rule for 3D score distillation, dubbed Entropic Score Distillation +(ESD). We theoretically reveal that ESD can be simplified and implemented by +just adopting the classifier-free guidance trick upon variational score +distillation. Although embarrassingly straightforward, our extensive +experiments successfully demonstrate that ESD can be an effective treatment for +Janus artifacts in score distillation. + +
+
+ comment: Project page: https://vita-group.github.io/3D-Mode-Collapse/ +
+
+
+
+
+ + ♻ ☆ UAV-Borne Mapping Algorithms for Low-Altitude and High-Speed Drone + Applications + + +
+ This article presents an analysis of current state-of-the-art sensors and how +these sensors work with several mapping algorithms for UAV (Unmanned Aerial +Vehicle) applications, focusing on low-altitude and high-speed scenarios. A new +experimental construct is created using highly realistic environments made +possible by integrating the AirSim simulator with Google 3D maps models using +the Cesium Tiles plugin. Experiments are conducted in this high-realism +simulated environment to evaluate the performance of three distinct mapping +algorithms: (1) Direct Sparse Odometry (DSO), (2) Stereo DSO (SDSO), and (3) +DSO Lite (DSOL). Experimental results evaluate algorithms based on their +measured geometric accuracy and computational speed. The results provide +valuable insights into the strengths and limitations of each algorithm. +Findings quantify compromises in UAV algorithm selection, allowing researchers +to find the mapping solution best suited to their application, which often +requires a compromise between computational performance and the density and +accuracy of geometric map estimates. Results indicate that for UAVs with +restrictive computing resources, DSOL is the best option. For systems with +payload capacity and modest compute resources, SDSO is the best option. If only +one camera is available, DSO is the option to choose for applications that +require dense mapping results. + +
+
+
+
+
+ + ♻ ☆ Point Cloud Mamba: Point Cloud Learning via State Space Model + + +
+ In this work, for the first time, we demonstrate that Mamba-based point cloud +methods can outperform point-based methods. Mamba exhibits strong global +modeling capabilities and linear computational complexity, making it highly +attractive for point cloud analysis. To enable more effective processing of 3-D +point cloud data by Mamba, we propose a novel Consistent Traverse Serialization +to convert point clouds into 1-D point sequences while ensuring that +neighboring points in the sequence are also spatially adjacent. Consistent +Traverse Serialization yields six variants by permuting the order of x, y, and +z coordinates, and the synergistic use of these variants aids Mamba in +comprehensively observing point cloud data. Furthermore, to assist Mamba in +handling point sequences with different orders more effectively, we introduce +point prompts to inform Mamba of the sequence's arrangement rules. Finally, we +propose positional encoding based on spatial coordinate mapping to inject +positional information into point cloud sequences better. Based on these +improvements, we construct a point cloud network named Point Cloud Mamba, which +combines local and global modeling. Point Cloud Mamba surpasses the SOTA +point-based method PointNeXt and achieves new SOTA performance on the +ScanObjectNN, ModelNet40, and ShapeNetPart datasets. + +
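+ The serialization idea can be roughed out as follows: quantize points to a
+coarse grid and sort them lexicographically under a chosen permutation of the
+axes, yielding six 1-D orderings whose neighbours tend to be spatial
+neighbours. This is only a simplified illustration of the
+coordinate-permutation idea behind Consistent Traverse Serialization; the voxel
+size and sorting details here are assumptions, not the paper's implementation.
+
+```python
+import numpy as np
+from itertools import permutations
+
+def serialize(points, order=(0, 1, 2), voxel=0.05):
+    """Order points by lexicographically sorting their (coarse) voxel
+    coordinates in a chosen axis order, so that points adjacent in the
+    resulting 1-D sequence tend to be spatially adjacent too."""
+    grid = np.floor(points / voxel).astype(np.int64)
+    keys = grid[:, list(order)][:, ::-1].T   # np.lexsort: last row is primary key
+    return points[np.lexsort(keys)]
+
+pts = np.random.rand(1024, 3)
+variants = [serialize(pts, order=p) for p in permutations(range(3))]  # six orderings
+```
+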
+
+
+
+
+ + ♻ ☆ Rethinking Multi-view Representation Learning via Distilled + Disentangling CVPR 2024 + + +
+ Multi-view representation learning aims to derive robust representations that +are both view-consistent and view-specific from diverse data sources. This +paper presents an in-depth analysis of existing approaches in this domain, +highlighting a commonly overlooked aspect: the redundancy between +view-consistent and view-specific representations. To this end, we propose an +innovative framework for multi-view representation learning, which incorporates +a technique we term 'distilled disentangling'. Our method introduces the +concept of masked cross-view prediction, enabling the extraction of compact, +high-quality view-consistent representations from various sources without +incurring extra computational overhead. Additionally, we develop a distilled +disentangling module that efficiently filters out consistency-related +information from multi-view representations, resulting in purer view-specific +representations. This approach significantly reduces redundancy between +view-consistent and view-specific representations, enhancing the overall +efficiency of the learning process. Our empirical evaluations reveal that +higher mask ratios substantially improve the quality of view-consistent +representations. Moreover, we find that reducing the dimensionality of +view-consistent representations relative to that of view-specific +representations further refines the quality of the combined representations. +Our code is accessible at: https://github.com/Guanzhou-Ke/MRDD. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ You Only Sample Once: Taming One-Step Text-To-Image Synthesis by + Self-Cooperative Diffusion GANs + + +
+ We introduce YOSO, a novel generative model designed for rapid, scalable, and +high-fidelity one-step image synthesis. This is achieved by integrating the +diffusion process with GANs. Specifically, we smooth the distribution by the +denoising generator itself, performing self-cooperative learning. We show that +our method can serve as a one-step generation model training from scratch with +competitive performance. Moreover, we show that our method can be extended to +finetune pre-trained text-to-image diffusion for high-quality one-step +text-to-image synthesis even with LoRA fine-tuning. In particular, we provide +the first diffusion transformer that can generate images in one step trained on +512 resolution, with the capability of adapting to 1024 resolution without +explicit training. Our code is provided at https://github.com/Luo-Yihong/YOSO. + +
+
+ comment: Early version +
+
+
+
+
+ + ♻ ☆ V2X-DGW: Domain Generalization for Multi-agent Perception under Adverse + Weather Conditions + + +
+ Current LiDAR-based Vehicle-to-Everything (V2X) multi-agent perception
+systems have shown significant success in 3D object detection. While these
+models perform well in the clean weather they were trained on, they struggle in
+unseen adverse weather conditions because of the real-world domain gap. In this
+paper, we propose a domain generalization approach, named V2X-DGW, for
+LiDAR-based 3D object detection in multi-agent perception systems under adverse
+weather conditions. Our goal is to ensure favorable multi-agent performance not
+only in clean weather but also in unseen adverse weather conditions, while
+learning only from clean weather data. To advance research in this area, we
+have simulated the impact of three prevalent adverse weather conditions on two
+widely-used multi-agent datasets, resulting in the creation of two novel
+benchmark datasets: OPV2V-w and V2XSet-w.
+ To this end, we first introduce the Adaptive Weather Augmentation (AWA) to
+mimic the unseen adverse weather conditions, and then propose two alignments
+for generalizable representation learning: Trust-region Weather-invariant
+Alignment (TWA) and Agent-aware Contrastive Alignment (ACA). Extensive
+experimental results demonstrate that our V2X-DGW achieves improvements under
+the unseen adverse weather conditions.
+
+
+
+
+
+
+ + ♻ ☆ Cross-modal tumor segmentation using generative blending augmentation + and self training + + +
+ Objectives: Data scarcity and domain shifts lead to biased training sets that
+do not accurately represent deployment conditions. A related practical problem
+is cross-modal image segmentation, where the objective is to segment unlabelled
+images using previously labelled datasets from other imaging modalities.
+Methods: We propose a cross-modal segmentation method based on conventional
+image synthesis boosted by a new data augmentation technique called Generative
+Blending Augmentation (GBA). GBA leverages a SinGAN model to learn
+representative generative features from a single training image to
+realistically diversify tumor appearances. This way, we compensate for image
+synthesis errors, subsequently improving the generalization power of a
+downstream segmentation model. The proposed augmentation is further combined
+with an iterative self-training procedure that leverages pseudo labels at each
+pass. Results: The proposed solution ranked first for vestibular schwannoma
+(VS) segmentation during the validation and test phases of the MICCAI CrossMoDA
+2022 challenge, with the best mean Dice similarity and average symmetric
+surface distance measures. Conclusion and significance: Local contrast
+alteration of tumor appearances and iterative self-training with pseudo labels
+are likely to lead to performance improvements in a variety of segmentation
+contexts.
+
+
+
+
+
+
+ + ♻ ☆ VGTS: Visually Guided Text Spotting for Novel Categories in Historical + Manuscripts + + +
+ In the field of historical manuscript research, scholars frequently encounter +novel symbols in ancient texts, investing considerable effort in their +identification and documentation. Although existing object detection methods +achieve impressive performance on known categories, they struggle to recognize +novel symbols without retraining. To address this limitation, we propose a +Visually Guided Text Spotting (VGTS) approach that accurately spots novel +characters using just one annotated support sample. The core of VGTS is a +spatial alignment module consisting of a Dual Spatial Attention (DSA) block and +a Geometric Matching (GM) block. The DSA block aims to identify, focus on, and +learn discriminative spatial regions in the support and query images, mimicking +the human visual spotting process. It first refines the support image by +analyzing inter-channel relationships to identify critical areas, and then +refines the query image by focusing on informative key points. The GM block, on +the other hand, establishes the spatial correspondence between the two images, +enabling accurate localization of the target character in the query image. To +tackle the example imbalance problem in low-resource spotting tasks, we develop +a novel torus loss function that enhances the discriminative power of the +embedding space for distance metric learning. To further validate our approach, +we introduce a new dataset featuring ancient Dongba hieroglyphics (DBH) +associated with the Naxi minority of China. Extensive experiments on the DBH +dataset and other public datasets, including EGY, VML-HD, TKH, and NC, show +that VGTS consistently surpasses state-of-the-art methods. The proposed +framework exhibits great potential for application in historical manuscript +text spotting, enabling scholars to efficiently identify and document novel +symbols with minimal annotation effort. + +
+
+
+
+
+ + ♻ ☆ CustomListener: Text-guided Responsive Interaction for User-friendly + Listening Head Generation CVPR 2024 + + +
+ Listening head generation aims to synthesize a non-verbal responsive listener
+head by modeling the correlation between the speaker and the listener in
+dynamic conversation. Applications of listener agent generation in virtual
+interaction have prompted many works on diverse and fine-grained motion
+generation. However, these works can only manipulate motions through simple
+emotional labels and cannot freely control the listener's motions. Since
+listener agents should have human-like attributes (e.g., identity, personality)
+that users can freely customize, this limitation reduces their realism. In this
+paper, we propose a user-friendly framework called CustomListener to realize
+free-form, text-prior-guided listener generation. To achieve speaker-listener
+coordination, we design a Static to Dynamic Portrait module (SDP), which
+interacts with speaker information to transform static text into a dynamic
+portrait token with completion rhythm and amplitude information. To achieve
+coherence between segments, we design a Past Guided Generation Module (PGG) to
+maintain the consistency of customized listener attributes through the motion
+prior, and utilize a diffusion-based structure conditioned on the portrait
+token and the motion prior to realize controllable generation. To train and
+evaluate our model, we have constructed two text-annotated listening head
+datasets based on ViCo and RealTalk, which provide text-video paired labels.
+Extensive experiments have verified the effectiveness of our model.
+
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Video Super-Resolution Transformer with Masked Inter&Intra-Frame + Attention CVPR 2024 + + +
+ Recently, Vision Transformer has achieved great success in recovering missing +details in low-resolution sequences, i.e., the video super-resolution (VSR) +task. Despite its superiority in VSR accuracy, the heavy computational burden +as well as the large memory footprint hinder the deployment of +Transformer-based VSR models on constrained devices. In this paper, we address +the above issue by proposing a novel feature-level masked processing framework: +VSR with Masked Intra and inter frame Attention (MIA-VSR). The core of MIA-VSR +is leveraging feature-level temporal continuity between adjacent frames to +reduce redundant computations and make more rational use of previously enhanced +SR features. Concretely, we propose an intra-frame and inter-frame attention +block which takes the respective roles of past features and input features into +consideration and only exploits previously enhanced features to provide +supplementary information. In addition, an adaptive block-wise mask prediction +module is developed to skip unimportant computations according to feature +similarity between adjacent frames. We conduct detailed ablation studies to +validate our contributions and compare the proposed method with recent +state-of-the-art VSR approaches. The experimental results demonstrate that +MIA-VSR improves the memory and computation efficiency over state-of-the-art +methods, without trading off PSNR accuracy. The code is available at +https://github.com/LabShuHangGU/MIA-VSR. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Gradient strikes back: How filtering out high frequencies improves + explanations + + +
+ Attribution methods correspond to a class of explainability methods (XAI) +that aim to assess how individual inputs contribute to a model's +decision-making process. We have identified a significant limitation in one +type of attribution methods, known as "white-box" methods. Although highly +efficient, these methods rely on a gradient signal that is often contaminated +by high-frequency noise. To overcome this limitation, we introduce a new +approach called "FORGrad". This simple method effectively filters out noise +artifacts by using optimal cut-off frequencies tailored to the unique +characteristics of each model architecture. Our findings show that FORGrad +consistently enhances the performance of already existing white-box methods, +enabling them to compete effectively with more accurate yet computationally +demanding "black-box" methods. We anticipate that our research will foster +broader adoption of simpler and more efficient white-box methods for +explainability, offering a better balance between faithfulness and +computational efficiency. + +
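+ The core operation, filtering high spatial frequencies out of a gradient
+attribution map, can be written in a few lines of NumPy. The cutoff below is a
+placeholder; the method's contribution is choosing an optimal cutoff per model
+architecture, which this sketch does not do.
+
+```python
+import numpy as np
+
+def lowpass_gradient_map(grad_map, cutoff=0.2):
+    """Zero out spatial frequencies above a cutoff (given as a fraction of the
+    Nyquist frequency; the value here is a hypothetical placeholder)."""
+    fy = np.fft.fftfreq(grad_map.shape[0])[:, None]
+    fx = np.fft.fftfreq(grad_map.shape[1])[None, :]
+    keep = (np.abs(fy) <= 0.5 * cutoff) & (np.abs(fx) <= 0.5 * cutoff)
+    return np.real(np.fft.ifft2(np.fft.fft2(grad_map) * keep))
+
+noisy_saliency = np.random.randn(224, 224)     # stand-in for a raw gradient map
+smoothed = lowpass_gradient_map(noisy_saliency)
+```
+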
+
+
+
+
+ + ♻ ☆ MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation + + +
+ Since their emergence, Convolutional Neural Networks (CNNs) have made +significant strides in medical image analysis. However, the local nature of the +convolution operator may pose a limitation for capturing global and long-range +interactions in CNNs. Recently, Transformers have gained popularity in the +computer vision community and also in medical image segmentation due to their +ability to process global features effectively. The scalability issues of the +self-attention mechanism and lack of the CNN-like inductive bias may have +limited their adoption. Therefore, hybrid Vision transformers +(CNN-Transformer), exploiting the advantages of both Convolution and +Self-attention Mechanisms, have gained importance. In this work, we present +MaxViT-UNet, a new Encoder-Decoder based UNet type hybrid vision transformer +(CNN-Transformer) for medical image segmentation. The proposed Hybrid Decoder +is designed to harness the power of both the convolution and self-attention +mechanisms at each decoding stage with a nominal memory and computational +burden. The inclusion of multi-axis self-attention, within each decoder stage, +significantly enhances the discriminating capacity between the object and +background regions, thereby helping in improving the segmentation efficiency. +In the Hybrid Decoder, a new block is also proposed. The fusion process +commences by integrating the upsampled lower-level decoder features, obtained +through transpose convolution, with the skip-connection features derived from +the hybrid encoder. Subsequently, the fused features undergo refinement through +the utilization of a multi-axis attention mechanism. The proposed decoder block +is repeated multiple times to segment the nuclei regions progressively. +Experimental results on MoNuSeg18 and MoNuSAC20 datasets demonstrate the +effectiveness of the proposed technique. + +
+
+ comment: 19 pages, 6 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Audio-Visual Compound Expression Recognition Method based on Late + Modality Fusion and Rule-based Decision + + +
+ This paper presents the results of the SUN team for the Compound Expressions
+Recognition Challenge of the 6th ABAW Competition. We propose a novel
+audio-visual method for compound expression recognition. Our method relies on
+emotion recognition models that fuse modalities at the emotion probability
+level, while decisions regarding the prediction of compound expressions are
+based on predefined rules. Notably, our method does not use any training data
+specific to the target task; the problem is thus treated as a zero-shot
+classification task. The method is evaluated in multi-corpus training and
+cross-corpus validation setups. The proposed method achieves an F1-score of
+22.01% on the C-EXPR-DB test subset. Our findings from the challenge
+demonstrate that the proposed method can potentially form a basis for
+developing intelligent tools for annotating audio-visual data in the context of
+basic and compound human emotions.
+
+
+
+ comment: 7 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ MultiCorrupt: A Multi-Modal Robustness Dataset and Benchmark of + LiDAR-Camera Fusion for 3D Object Detection + + +
+ Multi-modal 3D object detection models for automated driving have +demonstrated exceptional performance on computer vision benchmarks like +nuScenes. However, their reliance on densely sampled LiDAR point clouds and +meticulously calibrated sensor arrays poses challenges for real-world +applications. Issues such as sensor misalignment, miscalibration, and disparate +sampling frequencies lead to spatial and temporal misalignment in data from +LiDAR and cameras. Additionally, the integrity of LiDAR and camera data is +often compromised by adverse environmental conditions such as inclement +weather, leading to occlusions and noise interference. To address this +challenge, we introduce MultiCorrupt, a comprehensive benchmark designed to +evaluate the robustness of multi-modal 3D object detectors against ten distinct +types of corruptions. We evaluate five state-of-the-art multi-modal detectors +on MultiCorrupt and analyze their performance in terms of their resistance +ability. Our results show that existing methods exhibit varying degrees of +robustness depending on the type of corruption and their fusion strategy. We +provide insights into which multi-modal design choices make such models robust +against certain perturbations. The dataset generation code and benchmark are +open-sourced at https://github.com/ika-rwth-aachen/MultiCorrupt. + +
+
+ comment: Code: https://github.com/ika-rwth-aachen/MultiCorrupt +
+
+
+
+
+ + ♻ ☆ Single-Model and Any-Modality for Video Object Tracking CVPR2024 + + +
+ In the realm of video object tracking, auxiliary modalities such as depth, +thermal, or event data have emerged as valuable assets to complement the RGB +trackers. In practice, most existing RGB trackers learn a single set of +parameters to use them across datasets and applications. However, a similar +single-model unification for multi-modality tracking presents several +challenges. These challenges stem from the inherent heterogeneity of inputs -- +each with modality-specific representations, the scarcity of multi-modal +datasets, and the absence of all the modalities at all times. In this work, we +introduce Un-Track, a Unified Tracker of a single set of parameters for any +modality. To handle any modality, our method learns their common latent space +through low-rank factorization and reconstruction techniques. More importantly, +we use only the RGB-X pairs to learn the common latent space. This unique +shared representation seamlessly binds all modalities together, enabling +effective unification and accommodating any missing modality, all within a +single transformer-based architecture. Our Un-Track achieves +8.1 absolute +F-score gain, on the DepthTrack dataset, by introducing only +2.14 (over 21.50) +GFLOPs with +6.6M (over 93M) parameters, through a simple yet efficient +prompting strategy. Extensive comparisons on five benchmark datasets with +different modalities show that Un-Track surpasses both SOTA unified trackers +and modality-specific counterparts, validating our effectiveness and +practicality. The source code is publicly available at +https://github.com/Zongwei97/UnTrack. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ DXAI: Explaining Classification by Image Decomposition + + +
+ We propose a new way to explain and to visualize neural network
+classification through a decomposition-based explainable AI (DXAI). Instead of
+providing an explanation heatmap, our method yields a decomposition of the
+image into class-agnostic and class-distinct parts, with respect to the data
+and the chosen classifier. Following a fundamental signal processing paradigm
+of analysis and synthesis, the original image is the sum of the decomposed
+parts. We thus obtain a radically different way of explaining classification.
+The class-agnostic part is ideally composed of all image features that do not
+possess class information, while the class-distinct part is its complement.
+This new visualization can be more helpful and informative in certain
+scenarios, especially when the attributes are dense, global, and additive in
+nature, for instance, when colors or textures are essential for class
+distinction. Code is available at https://github.com/dxai2024/dxai.
+
+
+
+
+
+
+ + ♻ ☆ SPOT: Self-Training with Patch-Order Permutation for Object-Centric + Learning with Autoregressive Transformers CVPR 2024 + + +
+ Unsupervised object-centric learning aims to decompose scenes into +interpretable object entities, termed slots. Slot-based auto-encoders stand out +as a prominent method for this task. Within them, crucial aspects include +guiding the encoder to generate object-specific slots and ensuring the decoder +utilizes them during reconstruction. This work introduces two novel techniques, +(i) an attention-based self-training approach, which distills superior +slot-based attention masks from the decoder to the encoder, enhancing object +segmentation, and (ii) an innovative patch-order permutation strategy for +autoregressive transformers that strengthens the role of slot vectors in +reconstruction. The effectiveness of these strategies is showcased +experimentally. The combined approach significantly surpasses prior slot-based +autoencoder methods in unsupervised object segmentation, especially with +complex real-world images. We provide the implementation code at +https://github.com/gkakogeorgiou/spot . + +
+
+ comment: CVPR 2024. Code: https://github.com/gkakogeorgiou/spot +
+
+
+
+
+ + ♻ ☆ GD^2-NeRF: Generative Detail Compensation via GAN and Diffusion for + One-shot Generalizable Neural Radiance Fields + + +
+ In this paper, we focus on the One-shot Novel View Synthesis (O-NVS) task,
+which targets synthesizing photo-realistic novel views given only one reference
+image per scene. Previous One-shot Generalizable Neural Radiance Fields
+(OG-NeRF) methods solve this task in an inference-time finetuning-free manner,
+yet suffer from blurry results due to the encoder-only architecture that highly
+relies on the limited reference image. On the other hand, recent
+diffusion-based image-to-3D methods show vivid plausible results via distilling
+pre-trained 2D diffusion models into a 3D representation, yet require tedious
+per-scene optimization. Targeting these issues, we propose GD$^2$-NeRF, a
+Generative Detail compensation framework via GAN and Diffusion that is both
+inference-time finetuning-free and yields vivid plausible details. In detail,
+following a coarse-to-fine strategy, GD$^2$-NeRF is mainly composed of a
+One-stage Parallel Pipeline (OPP) and a 3D-consistent Detail Enhancer
+(Diff3DE). At the coarse stage, OPP first efficiently inserts the GAN model
+into the existing OG-NeRF pipeline to primarily relieve the blurriness with
+in-distribution priors captured from the training dataset, achieving a good
+balance between sharpness (LPIPS, FID) and fidelity (PSNR, SSIM). Then, at the
+fine stage, Diff3DE further leverages pre-trained image diffusion models to
+complement rich out-of-distribution details while maintaining decent 3D
+consistency. Extensive experiments on both synthetic and real-world datasets
+show that GD$^2$-NeRF noticeably improves the details without any per-scene
+finetuning.
+
+
+
+ comment: Submitted to Journal +
+
+
+
+
+ + ♻ ☆ Deep Equilibrium Diffusion Restoration with Parallel Sampling CVPR'2024 + + +
+ Diffusion model-based image restoration (IR) aims to use diffusion models to +recover high-quality (HQ) images from degraded images, achieving promising +performance. Due to the inherent property of diffusion models, most existing +methods need long serial sampling chains to restore HQ images step-by-step, +resulting in expensive sampling time and high computation costs. Moreover, such +long sampling chains hinder understanding the relationship between inputs and +restoration results since it is hard to compute the gradients in the whole +chains. In this work, we aim to rethink the diffusion model-based IR models +through a different perspective, i.e., a deep equilibrium (DEQ) fixed point +system, called DeqIR. Specifically, we derive an analytical solution by +modeling the entire sampling chain in these IR models as a joint multivariate +fixed point system. Based on the analytical solution, we can conduct parallel +sampling and restore HQ images without training. Furthermore, we compute fast +gradients via DEQ inversion and found that initialization optimization can +boost image quality and control the generation direction. Extensive experiments +on benchmarks demonstrate the effectiveness of our method on typical IR tasks +and real-world settings. + +
+
+ comment: CVPR'2024 +
+
+
+
+
+ + ♻ ☆ Integrating Language-Derived Appearance Elements with Visual Cues in + Pedestrian Detection + + +
+ Large language models (LLMs) have shown their capabilities in understanding +contextual and semantic information regarding knowledge of instance +appearances. In this paper, we introduce a novel approach to utilize the +strengths of LLMs in understanding contextual appearance variations and to +leverage this knowledge into a vision model (here, pedestrian detection). While +pedestrian detection is considered one of the crucial tasks directly related to +our safety (e.g., intelligent driving systems), it is challenging because of +varying appearances and poses in diverse scenes. Therefore, we propose to +formulate language-derived appearance elements and incorporate them with visual +cues in pedestrian detection. To this end, we establish a description corpus +that includes numerous narratives describing various appearances of pedestrians +and other instances. By feeding them through an LLM, we extract appearance +knowledge sets that contain the representations of appearance variations. +Subsequently, we perform a task-prompting process to obtain appearance elements +which are guided representative appearance knowledge relevant to a downstream +pedestrian detection task. The obtained knowledge elements are adaptable to +various detection frameworks, so that we can provide plentiful appearance +information by integrating the language-derived appearance elements with visual +cues within a detector. Through comprehensive experiments with various +pedestrian detectors, we verify the adaptability and effectiveness of our +method showing noticeable performance gains and achieving state-of-the-art +detection performance on two public pedestrian detection benchmarks (i.e., +CrowdHuman and WiderPedestrian). + +
+
+ comment: 11 pages, 5 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Feature 3DGS: Supercharging 3D Gaussian Splatting to Enable Distilled + Feature Fields + + +
+ 3D scene representations have gained immense popularity in recent years. +Methods that use Neural Radiance fields are versatile for traditional tasks +such as novel view synthesis. In recent times, some work has emerged that aims +to extend the functionality of NeRF beyond view synthesis, for semantically +aware tasks such as editing and segmentation using 3D feature field +distillation from 2D foundation models. However, these methods have two major +limitations: (a) they are limited by the rendering speed of NeRF pipelines, and +(b) implicitly represented feature fields suffer from continuity artifacts +reducing feature quality. Recently, 3D Gaussian Splatting has shown +state-of-the-art performance on real-time radiance field rendering. In this +work, we go one step further: in addition to radiance field rendering, we +enable 3D Gaussian splatting on arbitrary-dimension semantic features via 2D +foundation model distillation. This translation is not straightforward: naively +incorporating feature fields in the 3DGS framework encounters significant +challenges, notably the disparities in spatial resolution and channel +consistency between RGB images and feature maps. We propose architectural and +training changes to efficiently avert this problem. Our proposed method is +general, and our experiments showcase novel view semantic segmentation, +language-guided editing and segment anything through learning feature fields +from state-of-the-art 2D foundation models such as SAM and CLIP-LSeg. Across +experiments, our distillation method is able to provide comparable or better +results, while being significantly faster to both train and render. +Additionally, to the best of our knowledge, we are the first method to enable +point and bounding-box prompting for radiance field manipulation, by leveraging +the SAM model. Project website at: https://feature-3dgs.github.io/ + +
+
+
+
+
+ + ♻ ☆ Distribution-Aware Continual Test-Time Adaptation for Semantic + Segmentation + + +
+ Since autonomous driving systems usually face dynamic and ever-changing +environments, continual test-time adaptation (CTTA) has been proposed as a +strategy for transferring deployed models to continually changing target +domains. However, the pursuit of long-term adaptation often introduces +catastrophic forgetting and error accumulation problems, which impede the +practical implementation of CTTA in the real world. Recently, existing CTTA +methods mainly focus on utilizing a majority of parameters to fit target domain +knowledge through self-training. Unfortunately, these approaches often amplify +the challenge of error accumulation due to noisy pseudo-labels, and pose +practical limitations stemming from the heavy computational costs associated +with entire model updates. In this paper, we propose a distribution-aware +tuning (DAT) method to make the semantic segmentation CTTA efficient and +practical in real-world applications. DAT adaptively selects and updates two +small groups of trainable parameters based on data distribution during the +continual adaptation process, including domain-specific parameters (DSP) and +task-relevant parameters (TRP). Specifically, DSP exhibits sensitivity to +outputs with substantial distribution shifts, effectively mitigating the +problem of error accumulation. In contrast, TRP are allocated to positions that +are responsive to outputs with minor distribution shifts, which are fine-tuned +to avoid the catastrophic forgetting problem. In addition, since CTTA is a +temporal task, we introduce the Parameter Accumulation Update (PAU) strategy to +collect the updated DSP and TRP in target domain sequences. We conduct +extensive experiments on two widely-used semantic segmentation CTTA benchmarks, +achieving promising performance compared to previous state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ Strong Transferable Adversarial Attacks via Ensembled Asymptotically + Normal Distribution Learning + + +
+ Strong adversarial examples are crucial for evaluating and enhancing the +robustness of deep neural networks. However, the performance of popular attacks +is usually sensitive, for instance, to minor image transformations, stemming +from limited information -- typically only one input example, a handful of +white-box source models, and undefined defense strategies. Hence, the crafted +adversarial examples are prone to overfit the source model, which hampers their +transferability to unknown architectures. In this paper, we propose an approach +named Multiple Asymptotically Normal Distribution Attacks (MultiANDA) which +explicitly characterize adversarial perturbations from a learned distribution. +Specifically, we approximate the posterior distribution over the perturbations +by taking advantage of the asymptotic normality property of stochastic gradient +ascent (SGA), then employ the deep ensemble strategy as an effective proxy for +Bayesian marginalization in this process, aiming to estimate a mixture of +Gaussians that facilitates a more thorough exploration of the potential +optimization space. The approximated posterior essentially describes the +stationary distribution of SGA iterations, which captures the geometric +information around the local optimum. Thus, MultiANDA allows drawing an +unlimited number of adversarial perturbations for each input and reliably +maintains the transferability. Our proposed method outperforms ten +state-of-the-art black-box attacks on deep learning models with or without +defenses through extensive experiments on seven normally trained and seven +defense models. + +
+
+
+
+
+ + ♻ ☆ DreamGaussian: Generative Gaussian Splatting for Efficient 3D Content + Creation + + +
+ Recent advances in 3D content creation mostly leverage optimization-based 3D +generation via score distillation sampling (SDS). Though promising results have +been exhibited, these methods often suffer from slow per-sample optimization, +limiting their practical usage. In this paper, we propose DreamGaussian, a +novel 3D content generation framework that achieves both efficiency and quality +simultaneously. Our key insight is to design a generative 3D Gaussian Splatting +model with companioned mesh extraction and texture refinement in UV space. In +contrast to the occupancy pruning used in Neural Radiance Fields, we +demonstrate that the progressive densification of 3D Gaussians converges +significantly faster for 3D generative tasks. To further enhance the texture +quality and facilitate downstream applications, we introduce an efficient +algorithm to convert 3D Gaussians into textured meshes and apply a fine-tuning +stage to refine the details. Extensive experiments demonstrate the superior +efficiency and competitive generation quality of our proposed approach. +Notably, DreamGaussian produces high-quality textured meshes in just 2 minutes +from a single-view image, achieving approximately 10 times acceleration +compared to existing methods. + +
+
+ comment: Camera-ready version. Project page: https://dreamgaussian.github.io/ +
+
+
+
+
+ + ♻ ☆ SEGIC: Unleashing the Emergent Correspondence for In-Context + Segmentation + + +
+ In-context segmentation aims at segmenting novel images using a few labeled +example images, termed as "in-context examples", exploring content similarities +between examples and the target. The resulting models can be generalized +seamlessly to novel segmentation tasks, significantly reducing the labeling and +training costs compared with conventional pipelines. However, in-context +segmentation is more challenging than classic ones requiring the model to learn +segmentation rules conditioned on a few samples. Unlike previous work with +ad-hoc or non-end-to-end designs, we propose SEGIC, an end-to-end +segment-in-context framework built upon a single vision foundation model (VFM). +In particular, SEGIC leverages the emergent correspondence within VFM to +capture dense relationships between target images and in-context samples. As +such, information from in-context samples is then extracted into three types of +instructions, i.e. geometric, visual, and meta instructions, serving as +explicit conditions for the final mask prediction. SEGIC is a straightforward +yet effective approach that yields state-of-the-art performance on one-shot +segmentation benchmarks. Notably, SEGIC can be easily generalized to diverse +tasks, including video object segmentation and open-vocabulary segmentation. +Code will be available at https://github.com/MengLcool/SEGIC. + +
+
+
+
+
+ + ♻ ☆ PLGSLAM: Progressive Neural Scene Representation with Local to Global + Bundle Adjustment CVPR 2024 + + +
+ Neural implicit scene representations have recently shown encouraging results +in dense visual SLAM. However, existing methods produce low-quality scene +reconstruction and low-accuracy localization performance when scaling up to +large indoor scenes and long sequences. These limitations are mainly due to +their single, global radiance field with finite capacity, which does not adapt +to large scenarios. Their end-to-end pose networks are also not robust enough +with the growth of cumulative errors in large scenes. To this end, we introduce +PLGSLAM, a neural visual SLAM system capable of high-fidelity surface +reconstruction and robust camera tracking in real-time. To handle large-scale +indoor scenes, PLGSLAM proposes a progressive scene representation method which +dynamically allocates new local scene representation trained with frames within +a local sliding window. This allows us to scale up to larger indoor scenes and +improves robustness (even under pose drifts). In local scene representation, +PLGSLAM utilizes tri-planes for local high-frequency features with multi-layer +perceptron (MLP) networks for the low-frequency feature, achieving smoothness +and scene completion in unobserved areas. Moreover, we propose local-to-global +bundle adjustment method with a global keyframe database to address the +increased pose drifts on long sequences. Experimental results demonstrate that +PLGSLAM achieves state-of-the-art scene reconstruction results and tracking +performance across various datasets and scenarios (both in small and +large-scale indoor environments). + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ mPLUG-Owl: Modularization Empowers Large Language Models with + Multimodality + + +
+ Large language models (LLMs) have demonstrated impressive zero-shot abilities +on a variety of open-ended tasks, while recent research has also explored the +use of LLMs for multi-modal generation. In this study, we introduce mPLUG-Owl, +a novel training paradigm that equips LLMs with multi-modal abilities through +modularized learning of foundation LLM, a visual knowledge module, and a visual +abstractor module. This approach can support multiple modalities and facilitate +diverse unimodal and multimodal abilities through modality collaboration. The +training paradigm of mPLUG-Owl involves a two-stage method for aligning image +and text, which learns visual knowledge with the assistance of LLM while +maintaining and even improving the generation abilities of LLM. In the first +stage, the visual knowledge module and abstractor module are trained with a +frozen LLM module to align the image and text. In the second stage, +language-only and multi-modal supervised datasets are used to jointly fine-tune +a low-rank adaption (LoRA) module on LLM and the abstractor module by freezing +the visual knowledge module. We carefully build a visually-related instruction +evaluation set OwlEval. Experimental results show that our model outperforms +existing multi-modal models, demonstrating mPLUG-Owl's impressive instruction +and visual understanding ability, multi-turn conversation ability, and +knowledge reasoning ability. Besides, we observe some unexpected and exciting +abilities such as multi-image correlation and scene text understanding, which +makes it possible to leverage it for harder real scenarios, such as vision-only +document comprehension. Our code, pre-trained model, instruction-tuned models, +and evaluation set are available at https://github.com/X-PLUG/mPLUG-Owl. The +online demo is available at https://www.modelscope.cn/studios/damo/mPLUG-Owl. + +
+
+&#13; comment: Work in progress +&#13;
+
+
+
+
+ + ♻ ☆ SNE-RoadSegV2: Advancing Heterogeneous Feature Fusion and Fallibility + Awareness for Freespace Detection + + +
+ Feature-fusion networks with duplex encoders have proven to be an effective +technique to solve the freespace detection problem. However, despite the +compelling results achieved by previous research efforts, the exploration of +adequate and discriminative heterogeneous feature fusion, as well as the +development of fallibility-aware loss functions remains relatively scarce. This +paper makes several significant contributions to address these limitations: (1) +It presents a novel heterogeneous feature fusion block, comprising a holistic +attention module, a heterogeneous feature contrast descriptor, and an +affinity-weighted feature recalibrator, enabling a more in-depth exploitation +of the inherent characteristics of the extracted features, (2) it incorporates +both inter-scale and intra-scale skip connections into the decoder architecture +while eliminating redundant ones, leading to both improved accuracy and +computational efficiency, and (3) it introduces two fallibility-aware loss +functions that separately focus on semantic-transition and depth-inconsistent +regions, collectively contributing to greater supervision during model +training. Our proposed heterogeneous feature fusion network (SNE-RoadSegV2), +which incorporates all these innovative components, demonstrates superior +performance in comparison to all other freespace detection algorithms across +multiple public datasets. Notably, it ranks the 1st on the official KITTI Road +benchmark. + +
+
+
+
+
+ + ♻ ☆ Gamba: Marry Gaussian Splatting with Mamba for single view 3D + reconstruction + + +
+ We tackle the challenge of efficiently reconstructing a 3D asset from a +single image with growing demands for automated 3D content creation pipelines. +Previous methods primarily rely on Score Distillation Sampling (SDS) and Neural +Radiance Fields (NeRF). Despite their significant success, these approaches +encounter practical limitations due to lengthy optimization and considerable +memory usage. In this report, we introduce Gamba, an end-to-end amortized 3D +reconstruction model from single-view images, emphasizing two main insights: +(1) 3D representation: leveraging a large number of 3D Gaussians for an +efficient 3D Gaussian splatting process; (2) Backbone design: introducing a +Mamba-based sequential network that facilitates context-dependent reasoning and +linear scalability with the sequence (token) length, accommodating a +substantial number of Gaussians. Gamba incorporates significant advancements in +data preprocessing, regularization design, and training methodologies. We +assessed Gamba against existing optimization-based and feed-forward 3D +generation approaches using the real-world scanned OmniObject3D dataset. Here, +Gamba demonstrates competitive generation capabilities, both qualitatively and +quantitatively, while achieving remarkable speed, approximately 0.6 second on a +single NVIDIA A100 GPU. + +
+
+
+
+
+ + ♻ ☆ Dr.Hair: Reconstructing Scalp-Connected Hair Strands without + Pre-training via Differentiable Rendering of Line Segments CVPR 2024 + + +
+ In the film and gaming industries, achieving a realistic hair appearance +typically involves the use of strands originating from the scalp. However, +reconstructing these strands from observed surface images of hair presents +significant challenges. The difficulty in acquiring Ground Truth (GT) data has +led state-of-the-art learning-based methods to rely on pre-training with +manually prepared synthetic CG data. This process is not only labor-intensive +and costly but also introduces complications due to the domain gap when +compared to real-world data. In this study, we propose an optimization-based +approach that eliminates the need for pre-training. Our method represents hair +strands as line segments growing from the scalp and optimizes them using a +novel differentiable rendering algorithm. To robustly optimize a substantial +number of slender explicit geometries, we introduce 3D orientation estimation +utilizing global optimization, strand initialization based on Laplace's +equation, and reparameterization that leverages geometric connectivity and +spatial proximity. Unlike existing optimization-based methods, our method is +capable of reconstructing internal hair flow in an absolute direction. Our +method exhibits robust and accurate inverse rendering, surpassing the quality +of existing methods and significantly improving processing speed. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ CLOVA: A Closed-Loop Visual Assistant with Tool Usage and Update CVPR 2024 + + +
+ Utilizing large language models (LLMs) to compose off-the-shelf visual tools +represents a promising avenue of research for developing robust visual +assistants capable of addressing diverse visual tasks. However, these methods +often overlook the potential for continual learning, typically by freezing the +utilized tools, thus limiting their adaptation to environments requiring new +knowledge. To tackle this challenge, we propose CLOVA, a Closed-Loop Visual +Assistant, which operates within a framework encompassing inference, +reflection, and learning phases. During the inference phase, LLMs generate +programs and execute corresponding tools to complete assigned tasks. In the +reflection phase, a multimodal global-local reflection scheme analyzes human +feedback to determine which tools require updating. Lastly, the learning phase +employs three flexible approaches to automatically gather training data and +introduces a novel prompt tuning scheme to update the tools, allowing CLOVA to +efficiently acquire new knowledge. Experimental findings demonstrate that CLOVA +surpasses existing tool-usage methods by 5% in visual question answering and +multiple-image reasoning, by 10% in knowledge tagging, and by 20% in image +editing. These results underscore the significance of the continual learning +capability in general visual assistants. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Cell Variational Information Bottleneck Network + + +
+ In this work, we propose Cell Variational Information Bottleneck Network +(cellVIB), a convolutional neural network using information bottleneck +mechanism, which can be combined with the latest feedforward network +architecture in an end-to-end training method. Our Cell Variational Information +Bottleneck Network is constructed by stacking VIB cells, which generate feature +maps with uncertainty. As layers going deeper, the regularization effect will +gradually increase, instead of directly adding excessive regular constraints to +the output layer of the model as in Deep VIB. Under each VIB cell, the +feedforward process learns an independent mean term and an standard deviation +term, and predicts the Gaussian distribution based on them. The feedback +process is based on reparameterization trick for effective training. This work +performs an extensive analysis on MNIST dataset to verify the effectiveness of +each VIB cells, and provides an insightful analysis on how the VIB cells affect +mutual information. Experiments conducted on CIFAR-10 also prove that our +cellVIB is robust against noisy labels during training and against corrupted +images during testing. Then, we validate our method on PACS dataset, whose +results show that the VIB cells can significantly improve the generalization +performance of the basic model. Finally, in a more complex representation +learning task, face recognition, our network structure has also achieved very +competitive results. + +
+
+ comment: Found errors in the article, therefore postponing publication for now +
+
+
+
+
+ + ♻ ☆ Improving Generalization via Meta-Learning on Hard Samples CVPR 2024 + + +
+ Learned reweighting (LRW) approaches to supervised learning use an +optimization criterion to assign weights for training instances, in order to +maximize performance on a representative validation dataset. We pose and +formalize the problem of optimized selection of the validation set used in LRW +training, to improve classifier generalization. In particular, we show that +using hard-to-classify instances in the validation set has both a theoretical +connection to, and strong empirical evidence of generalization. We provide an +efficient algorithm for training this meta-optimized model, as well as a simple +train-twice heuristic for careful comparative study. We demonstrate that LRW +with easy validation data performs consistently worse than LRW with hard +validation data, establishing the validity of our meta-optimization problem. +Our proposed algorithm outperforms a wide range of baselines on a range of +datasets and domain shift challenges (Imagenet-1K, CIFAR-100, Clothing-1M, +CAMELYON, WILDS, etc.), with ~1% gains using VIT-B on Imagenet. We also show +that using naturally hard examples for validation (Imagenet-R / Imagenet-A) in +LRW training for Imagenet improves performance on both clean and naturally hard +test instances by 1-2%. Secondary analyses show that using hard validation data +in an LRW framework improves margins on test data, hinting at the mechanism +underlying our empirical gains. We believe this work opens up new research +directions for the meta-optimization of meta-learning in a supervised learning +context. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ DyBluRF: Dynamic Deblurring Neural Radiance Fields for Blurry Monocular + Video + + +
+ Neural Radiance Fields (NeRF), initially developed for static scenes, have +inspired many video novel view synthesis techniques. However, the challenge for +video view synthesis arises from motion blur, a consequence of object or camera +movement during exposure, which hinders the precise synthesis of sharp +spatio-temporal views. In response, we propose a novel dynamic deblurring NeRF +framework for blurry monocular video, called DyBluRF, consisting of a Base Ray +Initialization (BRI) stage and a Motion Decomposition-based Deblurring (MDD) +stage. Our DyBluRF is the first that handles the novel view synthesis for +blurry monocular video with a novel two-stage framework. In the BRI stage, we +coarsely reconstruct dynamic 3D scenes and jointly initialize the base ray, +which is further used to predict latent sharp rays, using the inaccurate camera +pose information from the given blurry frames. In the MDD stage, we introduce a +novel Incremental Latent Sharp-rays Prediction (ILSP) approach for the blurry +monocular video frames by decomposing the latent sharp rays into global camera +motion and local object motion components. We further propose two loss +functions for effective geometry regularization and decomposition of static and +dynamic scene components without any mask supervision. Experiments show that +DyBluRF outperforms qualitatively and quantitatively the SOTA methods. + +
+
+ comment: The first two authors contributed equally to this work (equal + contribution). The last two authors advised equally to this work. Please + visit our project page at https://kaist-viclab.github.io/dyblurf-site/ +
+
+
+
+
+ + ♻ ☆ Elysium: Exploring Object-level Perception in Videos via MLLM + + +
+ Multi-modal Large Language Models (MLLMs) have demonstrated their ability to +perceive objects in still images, but their application in video-related tasks, +such as object tracking, remains understudied. This lack of exploration is +primarily due to two key challenges. Firstly, extensive pretraining on +large-scale video datasets is required to equip MLLMs with the capability to +perceive objects across multiple frames and understand inter-frame +relationships. Secondly, processing a large number of frames within the context +window of Large Language Models (LLMs) can impose a significant computational +burden. To address the first challenge, we introduce ElysiumTrack-1M, a +large-scale video dataset supported for three tasks: Single Object Tracking +(SOT), Referring Single Object Tracking (RSOT), and Video Referring Expression +Generation (Video-REG). ElysiumTrack-1M contains 1.27 million annotated video +frames with corresponding object boxes and descriptions. Leveraging this +dataset, we conduct training of MLLMs and propose a token-compression model +T-Selector to tackle the second challenge. Our proposed approach, Elysium: +Exploring Object-level Perception in Videos via MLLM, is an end-to-end +trainable MLLM that attempts to conduct object-level tasks in videos without +requiring any additional plug-in or expert models. All codes and datasets are +available at https://github.com/Hon-Wong/Elysium. + +
+
+
+
+
+ + ♻ ☆ TransNeXt: Robust Foveal Visual Perception for Vision Transformers CVPR 2024 + + +
+ Due to the depth degradation effect in residual connections, many efficient +Vision Transformers models that rely on stacking layers for information +exchange often fail to form sufficient information mixing, leading to unnatural +visual perception. To address this issue, in this paper, we propose Aggregated +Attention, a biomimetic design-based token mixer that simulates biological +foveal vision and continuous eye movement while enabling each token on the +feature map to have a global perception. Furthermore, we incorporate learnable +tokens that interact with conventional queries and keys, which further +diversifies the generation of affinity matrices beyond merely relying on the +similarity between queries and keys. Our approach does not rely on stacking for +information exchange, thus effectively avoiding depth degradation and achieving +natural visual perception. Additionally, we propose Convolutional GLU, a +channel mixer that bridges the gap between GLU and SE mechanism, which empowers +each token to have channel attention based on its nearest neighbor image +features, enhancing local modeling capability and model robustness. We combine +aggregated attention and convolutional GLU to create a new visual backbone +called TransNeXt. Extensive experiments demonstrate that our TransNeXt achieves +state-of-the-art performance across multiple model sizes. At a resolution of +$224^2$, TransNeXt-Tiny attains an ImageNet accuracy of 84.0%, surpassing +ConvNeXt-B with 69% fewer parameters. Our TransNeXt-Base achieves an ImageNet +accuracy of 86.2% and an ImageNet-A accuracy of 61.6% at a resolution of +$384^2$, a COCO object detection mAP of 57.1, and an ADE20K semantic +segmentation mIoU of 54.7. + +
+
+ comment: CVPR 2024 Camera-ready Version. Project Page: + https://github.com/DaiShiResearch/TransNeXt +
+
+
+
+
+ + ♻ ☆ GAvatar: Animatable 3D Gaussian Avatars with Implicit Mesh Learning CVPR 2024 + + +
+ Gaussian splatting has emerged as a powerful 3D representation that harnesses +the advantages of both explicit (mesh) and implicit (NeRF) 3D representations. +In this paper, we seek to leverage Gaussian splatting to generate realistic +animatable avatars from textual descriptions, addressing the limitations (e.g., +flexibility and efficiency) imposed by mesh or NeRF-based representations. +However, a naive application of Gaussian splatting cannot generate high-quality +animatable avatars and suffers from learning instability; it also cannot +capture fine avatar geometries and often leads to degenerate body parts. To +tackle these problems, we first propose a primitive-based 3D Gaussian +representation where Gaussians are defined inside pose-driven primitives to +facilitate animation. Second, to stabilize and amortize the learning of +millions of Gaussians, we propose to use neural implicit fields to predict the +Gaussian attributes (e.g., colors). Finally, to capture fine avatar geometries +and extract detailed meshes, we propose a novel SDF-based implicit mesh +learning approach for 3D Gaussians that regularizes the underlying geometries +and extracts highly detailed textured meshes. Our proposed method, GAvatar, +enables the large-scale generation of diverse animatable avatars using only +text prompts. GAvatar significantly surpasses existing methods in terms of both +appearance and geometry quality, and achieves extremely fast rendering (100 +fps) at 1K resolution. + +
+
+ comment: CVPR 2024. Project website: https://nvlabs.github.io/GAvatar +
+
+
+
+
+ + ♻ ☆ P-MapNet: Far-seeing Map Generator Enhanced by both SDMap and HDMap + Priors + + +
+ Autonomous vehicles are gradually entering city roads today, with the help of +high-definition maps (HDMaps). However, the reliance on HDMaps prevents +autonomous vehicles from stepping into regions without this expensive digital +infrastructure. This fact drives many researchers to study online HDMap +generation algorithms, but the performance of these algorithms at far regions +is still unsatisfying. We present P-MapNet, in which the letter P highlights +the fact that we focus on incorporating map priors to improve model +performance. Specifically, we exploit priors in both SDMap and HDMap. On one +hand, we extract weakly aligned SDMap from OpenStreetMap, and encode it as an +additional conditioning branch. Despite the misalignment challenge, our +attention-based architecture adaptively attends to relevant SDMap skeletons and +significantly improves performance. On the other hand, we exploit a masked +autoencoder to capture the prior distribution of HDMap, which can serve as a +refinement module to mitigate occlusions and artifacts. We benchmark on the +nuScenes and Argoverse2 datasets. Through comprehensive experiments, we show +that: (1) our SDMap prior can improve online map generation performance, using +both rasterized (by up to $+18.73$ $\rm mIoU$) and vectorized (by up to $+8.50$ +$\rm mAP$) output representations. (2) our HDMap prior can improve map +perceptual metrics by up to $6.34\%$. (3) P-MapNet can be switched into +different inference modes that covers different regions of the +accuracy-efficiency trade-off landscape. (4) P-MapNet is a far-seeing solution +that brings larger improvements on longer ranges. Codes and models are publicly +available at https://jike5.github.io/P-MapNet. + +
+
+ comment: Code: https://jike5.github.io/P-MapNet +
+
+
+
+
+ + ♻ ☆ CPPF++: Uncertainty-Aware Sim2Real Object Pose Estimation by Vote + Aggregation + + +
+ Object pose estimation constitutes a critical area within the domain of 3D +vision. While contemporary state-of-the-art methods that leverage real-world +pose annotations have demonstrated commendable performance, the procurement of +such real training data incurs substantial costs. This paper focuses on a +specific setting wherein only 3D CAD models are utilized as a priori knowledge, +devoid of any background or clutter information. We introduce a novel method, +CPPF++, designed for sim-to-real pose estimation. This method builds upon the +foundational point-pair voting scheme of CPPF, reformulating it through a +probabilistic view. To address the challenge posed by vote collision, we +propose a novel approach that involves modeling the voting uncertainty by +estimating the probabilistic distribution of each point pair within the +canonical space. Furthermore, we augment the contextual information provided by +each voting unit through the introduction of N-point tuples. To enhance the +robustness and accuracy of the model, we incorporate several innovative +modules, including noisy pair filtering, online alignment optimization, and a +tuple feature ensemble. Alongside these methodological advancements, we +introduce a new category-level pose estimation dataset, named DiversePose 300. +Empirical evidence demonstrates that our method significantly surpasses +previous sim-to-real approaches and achieves comparable or superior performance +on novel datasets. Our code is available on https://github.com/qq456cvb/CPPF2. + +
+
+
+
+
+ + ♻ ☆ Embodied Multi-Modal Agent trained by an LLM from a Parallel TextWorld CVPR 2024 + + +
+ While large language models (LLMs) excel in a simulated world of texts, they +struggle to interact with the more realistic world without perceptions of other +modalities such as visual or audio signals. Although vision-language models +(VLMs) integrate LLM modules (1) aligned with static image features, and (2) +may possess prior knowledge of world dynamics (as demonstrated in the text +world), they have not been trained in an embodied visual world and thus cannot +align with its dynamics. On the other hand, training an embodied agent in a +noisy visual world without expert guidance is often challenging and +inefficient. In this paper, we train a VLM agent living in a visual world using +an LLM agent excelling in a parallel text world. Specifically, we distill LLM's +reflection outcomes (improved actions by analyzing mistakes) in a text world's +tasks to finetune the VLM on the same tasks of the visual world, resulting in +an Embodied Multi-Modal Agent (EMMA) quickly adapting to the visual world +dynamics. Such cross-modality imitation learning between the two parallel +worlds is achieved by a novel DAgger-DPO algorithm, enabling EMMA to generalize +to a broad scope of new tasks without any further guidance from the LLM expert. +Extensive evaluations on the ALFWorld benchmark's diverse tasks highlight +EMMA's superior performance to SOTA VLM-based agents, e.g., 20%-70% improvement +in the success rate. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Interpreting CLIP's Image Representation via Text-Based Decomposition + + +
+ We investigate the CLIP image encoder by analyzing how individual model +components affect the final representation. We decompose the image +representation as a sum across individual image patches, model layers, and +attention heads, and use CLIP's text representation to interpret the summands. +Interpreting the attention heads, we characterize each head's role by +automatically finding text representations that span its output space, which +reveals property-specific roles for many heads (e.g. location or shape). Next, +interpreting the image patches, we uncover an emergent spatial localization +within CLIP. Finally, we use this understanding to remove spurious features +from CLIP and to create a strong zero-shot image segmenter. Our results +indicate that a scalable understanding of transformer models is attainable and +can be used to repair and improve models. + +
+
+ comment: Project page and code: + https://yossigandelsman.github.io/clip_decomposition/ +
+
+
+
+
+ + ♻ ☆ MMCert: Provable Defense against Adversarial Attacks to Multi-modal + Models CVPR'24 + + +
+ Different from a unimodal model whose input is from a single modality, the +input (called multi-modal input) of a multi-modal model is from multiple +modalities such as image, 3D points, audio, text, etc. Similar to unimodal +models, many existing studies show that a multi-modal model is also vulnerable +to adversarial perturbation, where an attacker could add small perturbation to +all modalities of a multi-modal input such that the multi-modal model makes +incorrect predictions for it. Existing certified defenses are mostly designed +for unimodal models, which achieve sub-optimal certified robustness guarantees +when extended to multi-modal models as shown in our experimental results. In +our work, we propose MMCert, the first certified defense against adversarial +attacks to a multi-modal model. We derive a lower bound on the performance of +our MMCert under arbitrary adversarial attacks with bounded perturbations to +both modalities (e.g., in the context of auto-driving, we bound the number of +changed pixels in both RGB image and depth image). We evaluate our MMCert using +two benchmark datasets: one for the multi-modal road segmentation task and the +other for the multi-modal emotion recognition task. Moreover, we compare our +MMCert with a state-of-the-art certified defense extended from unimodal models. +Our experimental results show that our MMCert outperforms the baseline. + +
+
+ comment: To appear in CVPR'24 +
+
+
+
+
+ + ♻ ☆ Emergent Open-Vocabulary Semantic Segmentation from Off-the-shelf + Vision-Language Models CVPR 2024 + + +
+ From image-text pairs, large-scale vision-language models (VLMs) learn to +implicitly associate image regions with words, which prove effective for tasks +like visual question answering. However, leveraging the learned association for +open-vocabulary semantic segmentation remains a challenge. In this paper, we +propose a simple, yet extremely effective, training-free technique, +Plug-and-Play Open-Vocabulary Semantic Segmentation (PnP-OVSS) for this task. +PnP-OVSS leverages a VLM with direct text-to-image cross-attention and an +image-text matching loss. To balance between over-segmentation and +under-segmentation, we introduce Salience Dropout; by iteratively dropping +patches that the model is most attentive to, we are able to better resolve the +entire extent of the segmentation mask. \shortname{} does not require any +neural network training and performs hyperparameter tuning without the need for +any segmentation annotations, even for a validation set. PnP-OVSS demonstrates +substantial improvements over comparable baselines (+29.4% mIoU on Pascal VOC, ++13.2% mIoU on Pascal Context, +14.0% mIoU on MS COCO, and +11.4% mIoU on +ADE-20K.) and even outperforms most baselines that conduct additional network +training on top of pretrained VLMs. Our codebase is at +https://github.com/letitiabanana/PnP-OVSS. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Hybrid Video Diffusion Models with 2D Triplane and 3D Wavelet + Representation + + +
+ Generating high-quality videos that synthesize desired realistic content is a +challenging task due to their intricate high-dimensionality and complexity of +videos. Several recent diffusion-based methods have shown comparable +performance by compressing videos to a lower-dimensional latent space, using +traditional video autoencoder architecture. However, such method that employ +standard frame-wise 2D and 3D convolution fail to fully exploit the +spatio-temporal nature of videos. To address this issue, we propose a novel +hybrid video diffusion model, called HVDM, which can capture spatio-temporal +dependencies more effectively. The HVDM is trained by a hybrid video +autoencoder which extracts a disentangled representation of the video +including: (i) a global context information captured by a 2D projected latent +(ii) a local volume information captured by 3D convolutions with wavelet +decomposition (iii) a frequency information for improving the video +reconstruction. Based on this disentangled representation, our hybrid +autoencoder provide a more comprehensive video latent enriching the generated +videos with fine structures and details. Experiments on video generation +benchamarks (UCF101, SkyTimelapse, and TaiChi) demonstrate that the proposed +approach achieves state-of-the-art video generation quality, showing a wide +range of video applications (e.g., long video generation, image-to-video, and +video dynamics control). + +
+
+ comment: 17 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Predicting Gradient is Better: Exploring Self-Supervised Learning for + SAR ATR with a Joint-Embedding Predictive Architecture + + +
+ The growing Synthetic Aperture Radar (SAR) data has the potential to build a +foundation model through Self-Supervised Learning (SSL) methods, which can +achieve various SAR Automatic Target Recognition (ATR) tasks with pre-training +in large-scale unlabeled data and fine-tuning in small labeled samples. SSL +aims to construct supervision signals directly from the data, which minimizes +the need for expensive expert annotation and maximizes the use of the expanding +data pool for a foundational model. This study investigates an effective SSL +method for SAR ATR, which can pave the way for a foundation model in SAR ATR. +The primary obstacles faced in SSL for SAR ATR are the small targets in remote +sensing and speckle noise in SAR images, corresponding to the SSL approach and +signals. To overcome these challenges, we present a novel Joint-Embedding +Predictive Architecture for SAR ATR (SAR-JEPA), which leverages local masked +patches to predict the multi-scale SAR gradient representations of unseen +context. The key aspect of SAR-JEPA is integrating SAR domain features to +ensure high-quality self-supervised signals as target features. Besides, we +employ local masks and multi-scale features to accommodate the various small +targets in remote sensing. By fine-tuning and evaluating our framework on three +target recognition datasets (vehicle, ship, and aircraft) with four other +datasets as pre-training, we demonstrate its outperformance over other SSL +methods and its effectiveness with increasing SAR data. This study showcases +the potential of SSL for SAR target recognition across diverse targets, scenes, +and sensors. + +
+
+ comment: Our codes at https://github.com/waterdisappear/SAR-JEPA +
+
+
+
+
+ + ♻ ☆ GTA: A Geometry-Aware Attention Mechanism for Multi-View Transformers ICLR 2024 + + +
+ As transformers are equivariant to the permutation of input tokens, encoding +the positional information of tokens is necessary for many tasks. However, +since existing positional encoding schemes have been initially designed for NLP +tasks, their suitability for vision tasks, which typically exhibit different +structural properties in their data, is questionable. We argue that existing +positional encoding schemes are suboptimal for 3D vision tasks, as they do not +respect their underlying 3D geometric structure. Based on this hypothesis, we +propose a geometry-aware attention mechanism that encodes the geometric +structure of tokens as relative transformation determined by the geometric +relationship between queries and key-value pairs. By evaluating on multiple +novel view synthesis (NVS) datasets in the sparse wide-baseline multi-view +setting, we show that our attention, called Geometric Transform Attention +(GTA), improves learning efficiency and performance of state-of-the-art +transformer-based NVS models without any additional learned parameters and only +minor computational overhead. + +
+
+ comment: Published as a conference paper at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ MedPromptX: Grounded Multimodal Prompting for Chest X-ray Diagnosis + + +
+ Chest X-ray images are commonly used for predicting acute and chronic +cardiopulmonary conditions, but efforts to integrate them with structured +clinical data face challenges due to incomplete electronic health records +(EHR). This paper introduces MedPromptX, the first model to integrate +multimodal large language models (MLLMs), few-shot prompting (FP) and visual +grounding (VG) to combine imagery with EHR data for chest X-ray diagnosis. A +pre-trained MLLM is utilized to complement the missing EHR information, +providing a comprehensive understanding of patients' medical history. +Additionally, FP reduces the necessity for extensive training of MLLMs while +effectively tackling the issue of hallucination. Nevertheless, the process of +determining the optimal number of few-shot examples and selecting high-quality +candidates can be burdensome, yet it profoundly influences model performance. +Hence, we propose a new technique that dynamically refines few-shot data for +real-time adjustment to new patient scenarios. Moreover, VG aids in focusing +the model's attention on relevant regions of interest in X-ray images, +enhancing the identification of abnormalities. We release MedPromptX-VQA, a new +in-context visual question answering dataset encompassing interleaved image and +EHR data derived from MIMIC-IV and MIMIC-CXR databases. Results demonstrate the +SOTA performance of MedPromptX, achieving an 11% improvement in F1-score +compared to the baselines. Code and data are available at +https://github.com/BioMedIA-MBZUAI/MedPromptX + +
+
+
+
+
+ + ♻ ☆ Multi-Label Classification of Thoracic Diseases using Dense + Convolutional Network on Chest Radiographs + + +
+ Traditional methods of identifying pathologies in X-ray images rely heavily +on skilled human interpretation and are often time-consuming. The advent of +deep learning techniques has enabled the development of automated disease +diagnosis systems. Still, the performance of such systems is opaque to +end-users and limited to detecting a single pathology. In this paper, we +propose a multi-label disease prediction model that allows the detection of +more than one pathology at a given test time. We use a dense convolutional +neural network (DenseNet) for disease diagnosis. Our proposed model achieved +the highest AUC score of 0.896 for the condition Cardiomegaly with an accuracy +of 0.826, while the lowest AUC score was obtained for Nodule, at 0.655 with an +accuracy of 0.66. To build trust in decision-making, we generated heatmaps on +X-rays to visualize the regions where the model paid attention to make certain +predictions. Our proposed automated disease prediction model obtained highly +confident high-performance metrics in multi-label disease prediction tasks. + +
+
+ comment: 13 pages +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 214 + +
+
+
+ + ☆ GaussianCube: Structuring Gaussian Splatting using Optimal Transport for + 3D Generative Modeling + + +
+ 3D Gaussian Splatting (GS) have achieved considerable improvement over Neural +Radiance Fields in terms of 3D fitting fidelity and rendering speed. However, +this unstructured representation with scattered Gaussians poses a significant +challenge for generative modeling. To address the problem, we introduce +GaussianCube, a structured GS representation that is both powerful and +efficient for generative modeling. We achieve this by first proposing a +modified densification-constrained GS fitting algorithm which can yield +high-quality fitting results using a fixed number of free Gaussians, and then +re-arranging the Gaussians into a predefined voxel grid via Optimal Transport. +The structured grid representation allows us to use standard 3D U-Net as our +backbone in diffusion generative modeling without elaborate designs. Extensive +experiments conducted on ShapeNet and OmniObject3D show that our model achieves +state-of-the-art generation results both qualitatively and quantitatively, +underscoring the potential of GaussianCube as a powerful and versatile 3D +representation. + +
+
+ comment: Project Page: https://gaussiancube.github.io/ +
+
+
+
+
+ + ☆ RSMamba: Remote Sensing Image Classification with State Space Model + + +
+ Remote sensing image classification forms the foundation of various +understanding tasks, serving a crucial function in remote sensing image +interpretation. The recent advancements of Convolutional Neural Networks (CNNs) +and Transformers have markedly enhanced classification accuracy. Nonetheless, +remote sensing scene classification remains a significant challenge, especially +given the complexity and diversity of remote sensing scenarios and the +variability of spatiotemporal resolutions. The capacity for whole-image +understanding can provide more precise semantic cues for scene discrimination. +In this paper, we introduce RSMamba, a novel architecture for remote sensing +image classification. RSMamba is based on the State Space Model (SSM) and +incorporates an efficient, hardware-aware design known as the Mamba. It +integrates the advantages of both a global receptive field and linear modeling +complexity. To overcome the limitation of the vanilla Mamba, which can only +model causal sequences and is not adaptable to two-dimensional image data, we +propose a dynamic multi-path activation mechanism to augment Mamba's capacity +to model non-causal data. Notably, RSMamba maintains the inherent modeling +mechanism of the vanilla Mamba, yet exhibits superior performance across +multiple remote sensing image classification datasets. This indicates that +RSMamba holds significant potential to function as the backbone of future +visual foundation models. The code will be available at +\url{https://github.com/KyanChen/RSMamba}. + +
+
+
+
+
+ + ☆ Detecting Image Attribution for Text-to-Image Diffusion Models in RGB + and Beyond + + +
+ Modern text-to-image (T2I) diffusion models can generate images with +remarkable realism and creativity. These advancements have sparked research in +fake image detection and attribution, yet prior studies have not fully explored +the practical and scientific dimensions of this task. In addition to +attributing images to 12 state-of-the-art T2I generators, we provide extensive +analyses on what inference stage hyperparameters and image modifications are +discernible. Our experiments reveal that initialization seeds are highly +detectable, along with other subtle variations in the image generation process +to some extent. We further investigate what visual traces are leveraged in +image attribution by perturbing high-frequency details and employing mid-level +representations of image style and structure. Notably, altering high-frequency +information causes only slight reductions in accuracy, and training an +attributor on style representations outperforms training on RGB images. Our +analyses underscore that fake images are detectable and attributable at various +levels of visual granularity than previously explored. + +
+
+ comment: Code available at https://github.com/k8xu/ImageAttribution +
+
+
+
+
+ + ☆ InterDreamer: Zero-Shot Text to 3D Dynamic Human-Object Interaction + + +
+ Text-conditioned human motion generation has experienced significant +advancements with diffusion models trained on extensive motion capture data and +corresponding textual annotations. However, extending such success to 3D +dynamic human-object interaction (HOI) generation faces notable challenges, +primarily due to the lack of large-scale interaction data and comprehensive +descriptions that align with these interactions. This paper takes the +initiative and showcases the potential of generating human-object interactions +without direct training on text-interaction pair data. Our key insight in +achieving this is that interaction semantics and dynamics can be decoupled. +Being unable to learn interaction semantics through supervised training, we +instead leverage pre-trained large models, synergizing knowledge from a large +language model and a text-to-motion model. While such knowledge offers +high-level control over interaction semantics, it cannot grasp the intricacies +of low-level interaction dynamics. To overcome this issue, we further introduce +a world model designed to comprehend simple physics, modeling how human actions +influence object motion. By integrating these components, our novel framework, +InterDreamer, is able to generate text-aligned 3D HOI sequences in a zero-shot +manner. We apply InterDreamer to the BEHAVE and CHAIRS datasets, and our +comprehensive experimental analysis demonstrates its capability to generate +realistic and coherent interaction sequences that seamlessly align with the +text directives. + +
+
+ comment: Project Page: https://sirui-xu.github.io/InterDreamer/ +
+
+
+
+
+ + ☆ MagicLens: Self-Supervised Image Retrieval with Open-Ended Instructions + + +
+ Image retrieval, i.e., finding desired images given a reference image, +inherently encompasses rich, multi-faceted search intents that are difficult to +capture solely using image-based measures. Recent work leverages text +instructions to allow users to more freely express their search intents. +However, existing work primarily focuses on image pairs that are visually +similar and/or can be characterized by a small set of pre-defined relations. +The core thesis of this paper is that text instructions can enable retrieving +images with richer relations beyond visual similarity. To show this, we +introduce MagicLens, a series of self-supervised image retrieval models that +support open-ended instructions. MagicLens is built on a key novel insight: +image pairs that naturally occur on the same web pages contain a wide range of +implicit relations (e.g., inside view of), and we can bring those implicit +relations explicit by synthesizing instructions via large multimodal models +(LMMs) and large language models (LLMs). Trained on 36.7M (query image, +instruction, target image) triplets with rich semantic relations mined from the +web, MagicLens achieves comparable or better results on eight benchmarks of +various image retrieval tasks than prior state-of-the-art (SOTA) methods. +Remarkably, it outperforms previous SOTA but with a 50X smaller model size on +multiple benchmarks. Additional human analyses on a 1.4M-image unseen corpus +further demonstrate the diversity of search intents supported by MagicLens. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ GraspXL: Generating Grasping Motions for Diverse Objects at Scale + + +
+ Human hands possess the dexterity to interact with diverse objects such as +grasping specific parts of the objects and/or approaching them from desired +directions. More importantly, humans can grasp objects of any shape without +object-specific skills. Recent works synthesize grasping motions following +single objectives such as a desired approach heading direction or a grasping +area. Moreover, they usually rely on expensive 3D hand-object data during +training and inference, which limits their capability to synthesize grasping +motions for unseen objects at scale. In this paper, we unify the generation of +hand-object grasping motions across multiple motion objectives, diverse object +shapes and dexterous hand morphologies in a policy learning framework GraspXL. +The objectives are composed of the graspable area, heading direction during +approach, wrist rotation, and hand position. Without requiring any 3D +hand-object interaction data, our policy trained with 58 objects can robustly +synthesize diverse grasping motions for more than 500k unseen objects with a +success rate of 82.2%. At the same time, the policy adheres to objectives, +which enables the generation of diverse grasps per object. Moreover, we show +that our framework can be deployed to different dexterous hands and work with +reconstructed or generated objects. We quantitatively and qualitatively +evaluate our method to show the efficacy of our approach. Our model and code +will be available. + +
+
+ comment: Project Page: https://eth-ait.github.io/graspxl/ +
+
+
+
+
+ + ☆ Change-Agent: Towards Interactive Comprehensive Change Interpretation + and Analysis from Change Detection and Change Captioning + + +
+ Monitoring changes in the Earth's surface is crucial for understanding
+natural processes and human impacts, necessitating precise and comprehensive
+interpretation methodologies. Remote sensing satellite imagery offers a unique
+perspective for monitoring these changes, leading to the emergence of remote
+sensing image change interpretation (RSICI) as a significant research focus.
+Current RSICI technology encompasses change detection and change captioning,
+each with its limitations in providing comprehensive interpretation. To address
+this, we propose an interactive Change-Agent, which integrates a multi-level
+change interpretation (MCI) model as its eyes and a large language model (LLM)
+as its brain. Our Change-Agent can follow user instructions to achieve
+comprehensive change interpretation and insightful analysis, such as change
+detection and change captioning, change object counting, and change cause
+analysis. Our proposed MCI model contains two branches of pixel-level change
+detection and semantic-level change captioning, in which multiple BI-temporal
+Iterative Interaction (BI3) layers utilize Local Perception Enhancement (LPE)
+and Global Difference Fusion Attention (GDFA) modules to enhance the model's
+discriminative feature representation capabilities. To train the MCI model, we
+build the LEVIR-MCI dataset with change masks and captions of bi-temporal
+images. Extensive experiments demonstrate the effectiveness of the proposed
+change interpretation model and highlight the promising potential of our
+Change-Agent in facilitating comprehensive and intelligent interpretation of
+surface changes. We will make our dataset and the codebase of the change
+interpretation model and Change-Agent publicly available to facilitate future
+research at https://github.com/Chen-Yang-Liu/Change-Agent + +
+
+
+
+
+ + ☆ GANTASTIC: GAN-based Transfer of Interpretable Directions for + Disentangled Image Editing in Text-to-Image Diffusion Models + + +
+ The rapid advancement in image generation models has predominantly been
+driven by diffusion models, which have demonstrated unparalleled success in
+generating high-fidelity, diverse images from textual prompts. Despite their
+success, diffusion models encounter substantial challenges in the domain of
+image editing, particularly in executing disentangled edits, i.e., changes that
+target specific attributes of an image while leaving irrelevant parts
+untouched. In contrast, Generative Adversarial Networks (GANs) have been
+recognized for their success in disentangled edits through their interpretable
+latent spaces. We introduce GANTASTIC, a novel framework that takes existing
+directions from pre-trained GAN models, each representative of a specific,
+controllable attribute, and transfers these directions into diffusion-based
+models. This novel approach not only maintains the generative quality and
+diversity that diffusion models are known for but also significantly enhances
+their capability to perform precise, targeted image edits, thereby leveraging
+the best of both worlds. + +
+
+ comment: Project page: https://gantastic.github.io +
+
+
+
+
+ + ☆ Siamese Vision Transformers are Scalable Audio-visual Learners + + +
+ Traditional audio-visual methods rely on independent audio and visual +backbones, which is costly and not scalable. In this work, we investigate using +an audio-visual siamese network (AVSiam) for efficient and scalable +audio-visual pretraining. Our framework uses a single shared vision transformer +backbone to process audio and visual inputs, improving its parameter +efficiency, reducing the GPU memory footprint, and allowing us to scale our +method to larger datasets and model sizes. We pretrain our model using a +contrastive audio-visual matching objective with a multi-ratio random masking +scheme, which enables our model to process larger audio-visual instance +batches, helpful for contrastive learning. Unlike prior audio-visual methods, +our method can robustly handle audio, visual, and audio-visual inputs with a +single shared ViT backbone. Furthermore, despite using the shared backbone for +both modalities, AVSiam achieves competitive or even better results than prior +methods on AudioSet and VGGSound for audio-visual classification and retrieval. +Our code is available at https://github.com/GenjiB/AVSiam + +
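A minimal sketch of the symmetric contrastive audio-visual matching objective described above, in which matched audio and visual clips encoded by the shared backbone are pulled together and mismatched pairs pushed apart. The temperature value and the assumption that matched pairs share a batch index are illustrative choices, not the released AVSiam code.

```python
import torch
import torch.nn.functional as F

def symmetric_av_contrastive_loss(audio_emb, visual_emb, temperature=0.07):
    """audio_emb, visual_emb: (B, D) embeddings from the shared ViT backbone,
    where row i of each tensor comes from the same video clip."""
    a = F.normalize(audio_emb, dim=-1)
    v = F.normalize(visual_emb, dim=-1)
    logits = a @ v.t() / temperature               # (B, B) similarity matrix
    targets = torch.arange(a.size(0), device=a.device)
    # Matched pairs sit on the diagonal; penalize both retrieval directions.
    loss_a2v = F.cross_entropy(logits, targets)
    loss_v2a = F.cross_entropy(logits.t(), targets)
    return 0.5 * (loss_a2v + loss_v2a)
```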
+
+
+
+
+ + ☆ GauStudio: A Modular Framework for 3D Gaussian Splatting and Beyond + + +
+ We present GauStudio, a novel modular framework for modeling 3D Gaussian +Splatting (3DGS) to provide standardized, plug-and-play components for users to +easily customize and implement a 3DGS pipeline. Supported by our framework, we +propose a hybrid Gaussian representation with foreground and skyball background +models. Experiments demonstrate this representation reduces artifacts in +unbounded outdoor scenes and improves novel view synthesis. Finally, we propose +Gaussian Splatting Surface Reconstruction (GauS), a novel render-then-fuse +approach for high-fidelity mesh reconstruction from 3DGS inputs without +fine-tuning. Overall, our GauStudio framework, hybrid representation, and GauS +approach enhance 3DGS modeling and rendering capabilities, enabling +higher-quality novel view synthesis and surface reconstruction. + +
+
+ comment: Code: https://github.com/GAP-LAB-CUHK-SZ/gaustudio +
+
+
+
+
+ + ☆ RH20T-P: A Primitive-Level Robotic Dataset Towards Composable + Generalization Agents + + +
+ The ultimate goal of robotic learning is to acquire a comprehensive and
+generalizable robotic system capable of performing both seen skills within the
+training distribution and unseen skills in novel environments. Recent progress
+in utilizing language models as high-level planners has demonstrated that the
+complexity of tasks can be reduced through decomposing them into
+primitive-level plans, making it possible to generalize on novel robotic tasks
+in a composable manner. Despite the promising future, the community is not yet
+adequately prepared for composable generalization agents, particularly due to
+the lack of primitive-level real-world robotic datasets. In this paper, we
+propose a primitive-level robotic dataset, namely RH20T-P, which contains about
+33000 video clips covering 44 diverse and complicated robotic tasks. Each clip
+is manually annotated according to a set of meticulously designed primitive
+skills, facilitating the future development of composable generalization
+agents. To validate the effectiveness of RH20T-P, we also construct a potential
+and scalable agent based on RH20T-P, called RA-P. Equipped with two planners
+specialized in task decomposition and motion planning, RA-P can adapt to novel
+physical skills through composable generalization. Our website and videos can
+be found at https://sites.google.com/view/rh20t-primitive/main. Dataset and
+code will be made available soon. + +
+
+ comment: 24 pages, 12 figures, 6 tables +
+
+
+
+
+ + ☆ Collaborative Interactive Evolution of Art in the Latent Space of Deep + Generative Models + + +
+ Generative Adversarial Networks (GANs) have shown great success in generating
+high-quality images and are thus used as one of the main approaches to generate
+art images. However, the image generation process usually involves sampling
+from the latent space of the learned art representations, allowing little
+control over the output. In this work, we first employ GANs that are trained to
+produce creative images using an architecture known as Creative Adversarial
+Networks (CANs); we then employ an evolutionary approach to navigate within the
+latent space of the models to discover images. We use automatic aesthetic and
+collaborative interactive human evaluation metrics to assess the generated
+images. In the human interactive evaluation case, we propose a collaborative
+evaluation based on the assessments of several participants. Furthermore, we
+also experiment with an intelligent mutation operator that aims to improve the
+quality of the images through local search based on an aesthetic measure. We
+evaluate the effectiveness of this approach by comparing the results produced
+by the automatic and collaborative interactive evolution. The results show that
+the proposed approach can generate highly attractive art images when the
+evolution is guided by collaborative human feedback. + +
+
+ comment: Preprint. The Version of Record of this contribution is to be + published in the proceedings of the 13th International Conference on + Artificial Intelligence in Music, Sound, Art and Design (EvoMUSART) 2024 +
+
+
+
+
+ + ☆ SA-GS: Scale-Adaptive Gaussian Splatting for Training-Free Anti-Aliasing + + +
+ In this paper, we present a Scale-adaptive method for Anti-aliasing Gaussian
+Splatting (SA-GS). While the state-of-the-art method Mip-Splatting requires
+modifying the training procedure of Gaussian splatting, our method functions at
+test time and is training-free. Specifically, SA-GS can be applied to any
+pretrained Gaussian splatting field as a plugin to significantly improve the
+field's anti-aliasing performance. The core technique is to apply 2D
+scale-adaptive filters to each Gaussian during test time. As pointed out by
+Mip-Splatting, observing Gaussians at different frequencies leads to mismatches
+between the Gaussian scales during training and testing. Mip-Splatting resolves
+this issue using 3D smoothing and 2D Mip filters, which are unfortunately not
+aware of the testing frequency. In this work, we show that a 2D scale-adaptive
+filter that is informed of the testing frequency can effectively match the
+Gaussian scale, thus making the Gaussian primitive distribution remain
+consistent across different testing frequencies. When scale inconsistency is
+eliminated, sampling rates smaller than the scene frequency result in
+conventional jaggedness, and we propose to integrate the projected 2D Gaussian
+within each pixel during testing. This integration is actually a limiting case
+of super-sampling, which significantly improves anti-aliasing performance over
+vanilla Gaussian Splatting. Through extensive experiments using various
+settings and both bounded and unbounded scenes, we show SA-GS performs
+comparably with or better than Mip-Splatting. Note that super-sampling and
+integration are only effective when our scale-adaptive filtering is activated.
+Our codes, data and models are available at https://github.com/zsy1987/SA-GS. + +
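The 2D scale-adaptive filtering above can be pictured as dilating each projected 2D Gaussian by a term tied to the current pixel footprint and rescaling its opacity so the splat's total energy is preserved. The sketch below follows that common dilation-plus-compensation form; the exact dependence on the testing frequency and the parameter names are assumptions for illustration, not the SA-GS implementation.

```python
import torch

def scale_adaptive_filter(cov2d, opacity, pixel_footprint):
    """cov2d: (N, 2, 2) projected 2D covariances, opacity: (N,),
    pixel_footprint: pixel size at the current test-time sampling rate."""
    s2 = pixel_footprint ** 2
    eye = torch.eye(2, device=cov2d.device).expand_as(cov2d)
    cov_filtered = cov2d + s2 * eye                     # dilate by the pixel low-pass
    det_ratio = torch.det(cov2d).clamp(min=1e-12) / torch.det(cov_filtered).clamp(min=1e-12)
    return cov_filtered, opacity * det_ratio.sqrt()     # keep total splat energy fixed
```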
+
+ comment: Project page: https://kevinsong729.github.io/project-pages/SA-GS/ + Code: https://github.com/zsy1987/SA-GS +
+
+
+
+
+ + ☆ ILPO-NET: Network for the invariant recognition of arbitrary volumetric + patterns in 3D + + +
+ Effective recognition of spatial patterns and learning their hierarchy is
+crucial in modern spatial data analysis. Volumetric data applications seek
+techniques ensuring invariance not only to shifts but also to pattern
+rotations. While traditional methods can readily achieve translational
+invariance, rotational invariance poses multiple challenges and remains an
+active area of research. Here, we present ILPO-Net (Invariant to Local Patterns
+Orientation Network), a novel approach that handles arbitrarily shaped patterns
+with a convolutional operation that is inherently invariant to local spatial
+pattern orientations using Wigner matrix expansions. Our architecture
+seamlessly integrates the new convolution operator and, when benchmarked on
+diverse volumetric datasets such as MedMNIST and CATH, demonstrates superior
+performance over the baselines with significantly reduced parameter counts, up
+to 1000 times fewer in the case of MedMNIST. Beyond these demonstrations,
+ILPO-Net's rotational invariance paves the way for other applications across
+multiple disciplines. Our code is publicly available at
+https://gricad-gitlab.univ-grenoble-alpes.fr/GruLab/ILPONet. + +
+
+
+
+
+ + ☆ Nearest Neighbor Classication for Classical Image Upsampling + + +
+ Given a set of ordered pixel data in the form of an image, our goal is to
+perform upsampling on the data such that: (1) the resulting resolution is
+improved by some factor; (2) the final result passes the human test, having
+added new, believable, and realistic information and detail to the image; and
+(3) the time complexity of upscaling stays relatively close to that of lossy
+upscaling implementations. + +
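For reference, the classical nearest-neighbor interpolation that the upscaling comparison alludes to fits in a few lines; the paper's classification-based variant goes beyond this simple copy of the closest source pixel. Integer scale factors and an (H, W, C) array input are assumptions of this sketch.

```python
import numpy as np

def nearest_neighbor_upsample(img: np.ndarray, factor: int) -> np.ndarray:
    """Upsample an (H, W, C) image by an integer factor with nearest neighbor."""
    h, w = img.shape[:2]
    rows = np.arange(h * factor) // factor     # source row for each output row
    cols = np.arange(w * factor) // factor     # source column for each output column
    return img[rows[:, None], cols[None, :]]
```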
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ SAID-NeRF: Segmentation-AIDed NeRF for Depth Completion of Transparent + Objects + + +
+ Acquiring accurate depth information of transparent objects using +off-the-shelf RGB-D cameras is a well-known challenge in Computer Vision and +Robotics. Depth estimation/completion methods are typically employed and +trained on datasets with quality depth labels acquired from either simulation, +additional sensors or specialized data collection setups and known 3d models. +However, acquiring reliable depth information for datasets at scale is not +straightforward, limiting training scalability and generalization. Neural +Radiance Fields (NeRFs) are learning-free approaches and have demonstrated wide +success in novel view synthesis and shape recovery. However, heuristics and +controlled environments (lights, backgrounds, etc) are often required to +accurately capture specular surfaces. In this paper, we propose using Visual +Foundation Models (VFMs) for segmentation in a zero-shot, label-free way to +guide the NeRF reconstruction process for these objects via the simultaneous +reconstruction of semantic fields and extensions to increase robustness. Our +proposed method Segmentation-AIDed NeRF (SAID-NeRF) shows significant +performance on depth completion datasets for transparent objects and robotic +grasping. + +
+
+ comment: 8 pages. An accompanying video is available at + https://www.youtube.com/watch?v=S4NCoUq4bmE +
+
+
+
+
+ + ☆ Semantic Map-based Generation of Navigation Instructions LREC + + +
+ We are interested in the generation of navigation instructions, either in +their own right or as training material for robotic navigation task. In this +paper, we propose a new approach to navigation instruction generation by +framing the problem as an image captioning task using semantic maps as visual +input. Conventional approaches employ a sequence of panorama images to generate +navigation instructions. Semantic maps abstract away from visual details and +fuse the information in multiple panorama images into a single top-down +representation, thereby reducing computational complexity to process the input. +We present a benchmark dataset for instruction generation using semantic maps, +propose an initial model and ask human subjects to manually assess the quality +of generated instructions. Our initial investigations show promise in using +semantic maps for instruction generation instead of a sequence of panorama +images, but there is vast scope for improvement. We release the code for data +preparation and model training at https://github.com/chengzu-li/VLGen. + +
+
+ comment: 5 pages, 2 figures, 3 tables (13 pages, 3 figures, 5 tables including + references and appendices), accepted at LREC-COLING 2024 +
+
+
+
+
+ + ☆ Enhance Image Classification via Inter-Class Image Mixup with Diffusion + Model + + +
+ Text-to-image (T2I) generative models have recently emerged as a powerful +tool, enabling the creation of photo-realistic images and giving rise to a +multitude of applications. However, the effective integration of T2I models +into fundamental image classification tasks remains an open question. A +prevalent strategy to bolster image classification performance is through +augmenting the training set with synthetic images generated by T2I models. In +this study, we scrutinize the shortcomings of both current generative and +conventional data augmentation techniques. Our analysis reveals that these +methods struggle to produce images that are both faithful (in terms of +foreground objects) and diverse (in terms of background contexts) for +domain-specific concepts. To tackle this challenge, we introduce an innovative +inter-class data augmentation method known as Diff-Mix +(https://github.com/Zhicaiwww/Diff-Mix), which enriches the dataset by +performing image translations between classes. Our empirical results +demonstrate that Diff-Mix achieves a better balance between faithfulness and +diversity, leading to a marked improvement in performance across diverse image +classification scenarios, including few-shot, conventional, and long-tail +classifications for domain-specific datasets. + +
+
+
+
+
+ + ☆ LocCa: Visual Pretraining with Location-aware Captioners + + +
+ Image captioning has been shown as an effective pretraining method similar to +contrastive pretraining. However, the incorporation of location-aware +information into visual pretraining remains an area with limited research. In +this paper, we propose a simple visual pretraining method with location-aware +captioners (LocCa). LocCa uses a simple image captioner task interface, to +teach a model to read out rich information, i.e. bounding box coordinates, and +captions, conditioned on the image pixel input. Thanks to the multitask +capabilities of an encoder-decoder architecture, we show that an image +captioner can easily handle multiple tasks during pretraining. Our experiments +demonstrate that LocCa outperforms standard captioners significantly on +localization downstream tasks while maintaining comparable performance on +holistic tasks. + +
+
+
+
+
+ + ☆ Situation Awareness for Driver-Centric Driving Style Adaptation + + +
+ There is evidence that the driving style of an autonomous vehicle is
+important for increasing the acceptance and trust of the passengers. The
+driving situation has been found to have a significant influence on human
+driving behavior. However, current driving style models only partially
+incorporate driving environment information, limiting the alignment between an
+agent and the given situation. Therefore, we propose a situation-aware driving
+style model based on different visual feature encoders pretrained on fleet
+data, as well as driving behavior predictors, which are adapted to the driving
+style of a specific driver. Our experiments show that the proposed method
+outperforms static driving styles significantly and forms plausible situation
+clusters. Furthermore, we found that feature encoders pretrained on our dataset
+lead to more precise driving behavior modeling. In contrast, feature encoders
+pretrained in a supervised or unsupervised manner on different data sources
+lead to more specific situation clusters, which can be utilized to constrain
+and control the driving style adaptation for specific situations. Moreover, in
+a real-world setting, where driving style adaptation happens iteratively, we
+found that MLP-based behavior predictors achieve good performance initially but
+suffer from catastrophic forgetting. In contrast, behavior predictors based on
+situation-dependent statistics can learn iteratively from continuous data
+streams by design. Overall, our experiments show that important information for
+driving behavior prediction is contained within the visual feature encoder. The
+dataset is publicly available at
+huggingface.co/datasets/jHaselberger/SADC-Situation-Awareness-for-Driver-Centric-Driving-Style-Adaptation. + +
+
+ comment: 14 pages, 6 figures. This work has been submitted to the IEEE for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ☆ Frame by Familiar Frame: Understanding Replication in Video Diffusion + Models + + +
+ Building on the momentum of image generation diffusion models, there is an +increasing interest in video-based diffusion models. However, video generation +poses greater challenges due to its higher-dimensional nature, the scarcity of +training data, and the complex spatiotemporal relationships involved. Image +generation models, due to their extensive data requirements, have already +strained computational resources to their limits. There have been instances of +these models reproducing elements from the training samples, leading to +concerns and even legal disputes over sample replication. Video diffusion +models, which operate with even more constrained datasets and are tasked with +generating both spatial and temporal content, may be more prone to replicating +samples from their training sets. Compounding the issue, these models are often +evaluated using metrics that inadvertently reward replication. In our paper, we +present a systematic investigation into the phenomenon of sample replication in +video diffusion models. We scrutinize various recent diffusion models for video +synthesis, assessing their tendency to replicate spatial and temporal content +in both unconditional and conditional generation scenarios. Our study +identifies strategies that are less likely to lead to replication. Furthermore, +we propose new evaluation strategies that take replication into account, +offering a more accurate measure of a model's ability to generate the original +content. + +
+
+
+
+
+ + ☆ TOD3Cap: Towards 3D Dense Captioning in Outdoor Scenes + + +
+ 3D dense captioning stands as a cornerstone in achieving a comprehensive +understanding of 3D scenes through natural language. It has recently witnessed +remarkable achievements, particularly in indoor settings. However, the +exploration of 3D dense captioning in outdoor scenes is hindered by two major +challenges: 1) the \textbf{domain gap} between indoor and outdoor scenes, such +as dynamics and sparse visual inputs, makes it difficult to directly adapt +existing indoor methods; 2) the \textbf{lack of data} with comprehensive +box-caption pair annotations specifically tailored for outdoor scenes. To this +end, we introduce the new task of outdoor 3D dense captioning. As input, we +assume a LiDAR point cloud and a set of RGB images captured by the panoramic +camera rig. The expected output is a set of object boxes with captions. To +tackle this task, we propose the TOD3Cap network, which leverages the BEV +representation to generate object box proposals and integrates Relation +Q-Former with LLaMA-Adapter to generate rich captions for these objects. We +also introduce the TOD3Cap dataset, the largest one to our knowledge for 3D +dense captioning in outdoor scenes, which contains 2.3M descriptions of 64.3K +outdoor objects from 850 scenes. Notably, our TOD3Cap network can effectively +localize and caption 3D objects in outdoor scenes, which outperforms baseline +methods by a significant margin (+9.6 CiDEr@0.5IoU). Code, data, and models are +publicly available at https://github.com/jxbbb/TOD3Cap. + +
+
+ comment: Code, data, and models are publicly available at + https://github.com/jxbbb/TOD3Cap +
+
+
+
+
+ + ☆ DenseNets Reloaded: Paradigm Shift Beyond ResNets and ViTs + + +
+ This paper revives Densely Connected Convolutional Networks (DenseNets) and
+reveals their underrated effectiveness compared with predominant ResNet-style
+architectures. We believe DenseNets' potential was overlooked due to untouched
+training methods and traditional design elements that did not fully reveal
+their capabilities. Our pilot study shows dense connections through
+concatenation are strong, demonstrating that DenseNets can be revitalized to
+compete with modern architectures. We methodically refine suboptimal
+components, including architectural adjustments, block redesign, and improved
+training recipes, towards widening DenseNets and boosting memory efficiency
+while keeping concatenation shortcuts. Our models, employing simple
+architectural elements, ultimately surpass Swin Transformer, ConvNeXt, and
+DeiT-III, key architectures in the residual learning lineage. Furthermore, our
+models exhibit near state-of-the-art performance on ImageNet-1K, competing with
+the most recent models, as well as on downstream tasks, namely ADE20k semantic
+segmentation and COCO object detection/instance segmentation. Finally, we
+provide empirical analyses that uncover the merits of concatenation over
+additive shortcuts, steering a renewed preference towards DenseNet-style
+designs. Our code is available at https://github.com/naver-ai/rdnet. + +
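The concatenation-versus-addition distinction at the heart of this paper is easy to see in code. The toy blocks below contrast a residual (additive) shortcut, which keeps the channel width fixed, with a dense (concatenation) shortcut, which grows it; channel counts, kernel sizes, and the missing normalization layers are simplifications rather than the proposed architecture.

```python
import torch
import torch.nn as nn

class AdditiveBlock(nn.Module):
    """Residual-style block: output width equals input width."""
    def __init__(self, channels):
        super().__init__()
        self.conv = nn.Conv2d(channels, channels, 3, padding=1)

    def forward(self, x):
        return x + torch.relu(self.conv(x))

class DenseBlock(nn.Module):
    """DenseNet-style block: new features are concatenated onto the input."""
    def __init__(self, channels, growth_rate):
        super().__init__()
        self.conv = nn.Conv2d(channels, growth_rate, 3, padding=1)

    def forward(self, x):
        new_features = torch.relu(self.conv(x))
        return torch.cat([x, new_features], dim=1)   # width grows by growth_rate
```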
+
+ comment: Code at https://github.com/naver-ai/rdnet +
+
+
+
+
+ + ☆ TOGS: Gaussian Splatting with Temporal Opacity Offset for Real-Time 4D + DSA Rendering + + +
+ Four-dimensional Digital Subtraction Angiography (4D DSA) is a medical +imaging technique that provides a series of 2D images captured at different +stages and angles during the process of contrast agent filling blood vessels. +It plays a significant role in the diagnosis of cerebrovascular diseases. +Improving the rendering quality and speed under sparse sampling is important +for observing the status and location of lesions. The current methods exhibit +inadequate rendering quality in sparse views and suffer from slow rendering +speed. To overcome these limitations, we propose TOGS, a Gaussian splatting +method with opacity offset over time, which can effectively improve the +rendering quality and speed of 4D DSA. We introduce an opacity offset table for +each Gaussian to model the temporal variations in the radiance of the contrast +agent. By interpolating the opacity offset table, the opacity variation of the +Gaussian at different time points can be determined. This enables us to render +the 2D DSA image at that specific moment. Additionally, we introduced a Smooth +loss term in the loss function to mitigate overfitting issues that may arise in +the model when dealing with sparse view scenarios. During the training phase, +we randomly prune Gaussians, thereby reducing the storage overhead of the +model. The experimental results demonstrate that compared to previous methods, +this model achieves state-of-the-art reconstruction quality under the same +number of training views. Additionally, it enables real-time rendering while +maintaining low storage overhead. The code will be publicly available. + +
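The per-Gaussian opacity offset table can be read as a small time series that is interpolated at render time and added to the base opacity. The sketch below illustrates that idea; the table length, the uniform time grid, and the additive combination are assumptions made for illustration rather than the TOGS implementation.

```python
import torch

def opacity_at_time(base_opacity, offset_table, t):
    """base_opacity: (N,) per-Gaussian opacities, offset_table: (N, K) offsets
    on a uniform time grid, t: scalar in [0, 1]. Returns opacities at time t."""
    n, k = offset_table.shape
    pos = t * (k - 1)                     # continuous index into the table
    lo = int(pos)
    hi = min(lo + 1, k - 1)
    w = pos - lo                          # linear interpolation weight
    offset = (1.0 - w) * offset_table[:, lo] + w * offset_table[:, hi]
    return torch.clamp(base_opacity + offset, 0.0, 1.0)
```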
+
+
+
+
+ + ☆ Img2Loc: Revisiting Image Geolocalization using Multi-modality + Foundation Models and Image-based Retrieval-Augmented Generation + + +
+ Geolocating precise locations from images presents a challenging problem in
+computer vision and information retrieval. Traditional methods typically employ
+either classification, which divides the Earth's surface into grid cells and
+classifies images accordingly, or retrieval, which identifies locations by
+matching images against a database of image-location pairs. However,
+classification-based approaches are limited by the cell size and cannot yield
+precise predictions, while retrieval-based systems usually suffer from poor
+search quality and inadequate coverage of the global landscape at varied scales
+and aggregation levels. To overcome these drawbacks, we present Img2Loc, a
+novel system that redefines image geolocalization as a text generation task.
+This is achieved using cutting-edge large multi-modality models like GPT4V or
+LLaVA with retrieval-augmented generation. Img2Loc first employs CLIP-based
+representations to generate an image-based coordinate query database. It then
+uniquely combines the query results with the image itself, forming elaborate
+prompts customized for LMMs. When tested on benchmark datasets such as Im2GPS3k
+and YFCC4k, Img2Loc not only surpasses the performance of previous
+state-of-the-art models but does so without any model training. + +
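The retrieval-augmented prompting step can be sketched as follows: embed the query image (e.g., with CLIP), look up the coordinates of the most similar reference images, and fold them into a prompt for the LMM. The database layout, prompt wording, and function names below are illustrative assumptions, not the Img2Loc code.

```python
import numpy as np

def build_geolocation_prompt(query_emb, db_embs, db_coords, k=5):
    """query_emb: (D,) and db_embs: (M, D) are L2-normalized image embeddings;
    db_coords: (M, 2) latitude/longitude pairs. Returns a text prompt."""
    sims = db_embs @ query_emb                     # cosine similarity to the database
    top = np.argsort(-sims)[:k]                    # indices of the nearest references
    neighbors = ", ".join(f"({lat:.4f}, {lon:.4f})" for lat, lon in db_coords[top])
    return (
        "Predict the GPS coordinates of the attached image. "
        f"Visually similar reference images were taken near: {neighbors}. "
        "Answer with a single (latitude, longitude) pair."
    )
```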
+
+
+
+
+ + ☆ OV-Uni3DETR: Towards Unified Open-Vocabulary 3D Object Detection via + Cycle-Modality Propagation + + +
+ In the current state of 3D object detection research, the severe scarcity of +annotated 3D data, substantial disparities across different data modalities, +and the absence of a unified architecture, have impeded the progress towards +the goal of universality. In this paper, we propose \textbf{OV-Uni3DETR}, a +unified open-vocabulary 3D detector via cycle-modality propagation. Compared +with existing 3D detectors, OV-Uni3DETR offers distinct advantages: 1) +Open-vocabulary 3D detection: During training, it leverages various accessible +data, especially extensive 2D detection images, to boost training diversity. +During inference, it can detect both seen and unseen classes. 2) Modality +unifying: It seamlessly accommodates input data from any given modality, +effectively addressing scenarios involving disparate modalities or missing +sensor information, thereby supporting test-time modality switching. 3) Scene +unifying: It provides a unified multi-modal model architecture for diverse +scenes collected by distinct sensors. Specifically, we propose the +cycle-modality propagation, aimed at propagating knowledge bridging 2D and 3D +modalities, to support the aforementioned functionalities. 2D semantic +knowledge from large-vocabulary learning guides novel class discovery in the 3D +domain, and 3D geometric knowledge provides localization supervision for 2D +detection images. OV-Uni3DETR achieves the state-of-the-art performance on +various scenarios, surpassing existing methods by more than 6\% on average. Its +performance using only RGB images is on par with or even surpasses that of +previous point cloud based methods. Code and pre-trained models will be +released later. + +
+
+
+
+
+ + ☆ The Bad Batches: Enhancing Self-Supervised Learning in Image + Classification Through Representative Batch Curation + + +
+ The pursuit of learning robust representations without human supervision is a
+longstanding challenge. Recent advancements in self-supervised contrastive
+learning approaches have demonstrated high performance across various
+representation learning challenges. However, current methods depend on the
+random transformation of training examples, resulting in some cases of
+unrepresentative positive pairs that can have a large impact on learning. This
+limitation not only impedes the convergence of the learning process but also
+harms the robustness of the learnt representation, and it requires larger batch
+sizes to improve robustness to such bad batches. This paper attempts to
+alleviate the influence of false positive and false negative pairs by employing
+pairwise similarity calculations through the Fr\'echet ResNet Distance (FRD),
+thereby obtaining robust representations from unlabelled data. The
+effectiveness of the proposed method is substantiated by empirical results,
+where a linear classifier trained on self-supervised contrastive
+representations achieved an impressive 87.74\% top-1 accuracy on STL10 and
+99.31\% on the Flower102 dataset. These results emphasize the potential of the
+proposed approach in pushing the boundaries of the state-of-the-art in
+self-supervised contrastive learning, particularly for image classification
+tasks. + +
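The Fréchet ResNet Distance follows the usual Fréchet (2-Wasserstein) distance between Gaussians fitted to two sets of features. A generic sketch is below; treating each view's ResNet features as samples of a Gaussian, and which features are compared per pair, are assumptions for illustration rather than the paper's exact recipe.

```python
import numpy as np
from scipy import linalg

def frechet_distance(feats_a, feats_b):
    """feats_a, feats_b: (N, D) feature matrices. Returns the Frechet distance
    between the Gaussians fitted to the two feature sets."""
    mu_a, mu_b = feats_a.mean(axis=0), feats_b.mean(axis=0)
    cov_a = np.cov(feats_a, rowvar=False)
    cov_b = np.cov(feats_b, rowvar=False)
    covmean, _ = linalg.sqrtm(cov_a @ cov_b, disp=False)
    if np.iscomplexobj(covmean):
        covmean = covmean.real                     # drop tiny imaginary parts
    diff = mu_a - mu_b
    return float(diff @ diff + np.trace(cov_a + cov_b - 2.0 * covmean))
```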
+
+ comment: 8 Pages, 4 figures, IEEE WCCI 2024 Conference +
+
+
+
+
+ + ☆ Cross-Attention is Not Always Needed: Dynamic Cross-Attention for + Audio-Visual Dimensional Emotion Recognition ICME2024 + + +
+ In video-based emotion recognition, audio and visual modalities are often +expected to have a complementary relationship, which is widely explored using +cross-attention. However, they may also exhibit weak complementary +relationships, resulting in poor representations of audio-visual features, thus +degrading the performance of the system. To address this issue, we propose +Dynamic Cross-Attention (DCA) that can dynamically select cross-attended or +unattended features on the fly based on their strong or weak complementary +relationship with each other, respectively. Specifically, a simple yet +efficient gating layer is designed to evaluate the contribution of the +cross-attention mechanism and choose cross-attended features only when they +exhibit a strong complementary relationship, otherwise unattended features. We +evaluate the performance of the proposed approach on the challenging RECOLA and +Aff-Wild2 datasets. We also compare the proposed approach with other variants +of cross-attention and show that the proposed model consistently improves the +performance on both datasets. + +
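The gating idea can be sketched as a small module that scores how useful the cross-attended features are and mixes them with the unattended ones accordingly. The single linear layer with a sigmoid below is an illustrative simplification; the paper's exact gate design (and any discrete selection mechanism) may differ.

```python
import torch
import torch.nn as nn

class DynamicCrossAttentionGate(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.gate = nn.Linear(2 * dim, 1)   # scores the benefit of cross-attention

    def forward(self, unattended, cross_attended):
        """Both inputs: (B, T, D). Returns a gated mixture of the two streams."""
        score = torch.sigmoid(self.gate(torch.cat([unattended, cross_attended], dim=-1)))
        return score * cross_attended + (1.0 - score) * unattended
```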
+
+ comment: Accepted at IEEE ICME2024 +
+
+
+
+
+ + ☆ GlORIE-SLAM: Globally Optimized RGB-only Implicit Encoding Point Cloud + SLAM + + +
+ Recent advancements in RGB-only dense Simultaneous Localization and Mapping +(SLAM) have predominantly utilized grid-based neural implicit encodings and/or +struggle to efficiently realize global map and pose consistency. To this end, +we propose an efficient RGB-only dense SLAM system using a flexible neural +point cloud scene representation that adapts to keyframe poses and depth +updates, without needing costly backpropagation. Another critical challenge of +RGB-only SLAM is the lack of geometric priors. To alleviate this issue, with +the aid of a monocular depth estimator, we introduce a novel DSPO layer for +bundle adjustment which optimizes the pose and depth of keyframes along with +the scale of the monocular depth. Finally, our system benefits from loop +closure and online global bundle adjustment and performs either better or +competitive to existing dense neural RGB SLAM methods in tracking, mapping and +rendering accuracy on the Replica, TUM-RGBD and ScanNet datasets. The source +code will be made available. + +
+
+
+
+
+ + ☆ De-confounded Data-free Knowledge Distillation for Handling Distribution + Shifts CVPR24 + + +
+ Data-Free Knowledge Distillation (DFKD) is a promising task to train
+high-performance small models to enhance actual deployment without relying on
+the original training data. Existing methods commonly avoid relying on private
+data by utilizing synthetic or sampled data. However, a long-overlooked issue
+is the severe distribution shift between their substitute data and the original
+data, which manifests as huge differences in image quality and class
+proportions. The harmful shift is essentially the confounder that significantly
+causes performance bottlenecks. To tackle the issue, this paper proposes a
+novel perspective with causal inference to disentangle the student models from
+the impact of such shifts. By designing a customized causal graph, we first
+reveal the causalities among the variables in the DFKD task. Subsequently, we
+propose a Knowledge Distillation Causal Intervention (KDCI) framework based on
+the backdoor adjustment to de-confound the confounder. KDCI can be flexibly
+combined with most existing state-of-the-art baselines. Experiments in
+combination with six representative DFKD methods demonstrate the effectiveness
+of our KDCI, which clearly helps existing methods under almost all settings,
+\textit{e.g.}, improving the baseline by up to 15.54\% accuracy on the
+CIFAR-100 dataset. + +
+
+ comment: Accepted by CVPR24 +
+
+
+
+
+ + ☆ Locate, Assign, Refine: Taming Customized Image Inpainting with + Text-Subject Guidance + + +
+ Prior studies have made significant progress in image inpainting guided by +either text or subject image. However, the research on editing with their +combined guidance is still in the early stages. To tackle this challenge, we +present LAR-Gen, a novel approach for image inpainting that enables seamless +inpainting of masked scene images, incorporating both the textual prompts and +specified subjects. Our approach adopts a coarse-to-fine manner to ensure +subject identity preservation and local semantic coherence. The process +involves (i) Locate: concatenating the noise with masked scene image to achieve +precise regional editing, (ii) Assign: employing decoupled cross-attention +mechanism to accommodate multi-modal guidance, and (iii) Refine: using a novel +RefineNet to supplement subject details. Additionally, to address the issue of +scarce training data, we introduce a novel data construction pipeline. This +pipeline extracts substantial pairs of data consisting of local text prompts +and corresponding visual instances from a vast image dataset, leveraging +publicly available large models. Extensive experiments and varied application +scenarios demonstrate the superiority of LAR-Gen in terms of both identity +preservation and text semantic consistency. Project page can be found at +\url{https://ali-vilab.github.io/largen-page/}. + +
+
+ comment: 22 pages, 14 figures +
+
+
+
+
+ + ☆ Instance-Adaptive and Geometric-Aware Keypoint Learning for + Category-Level 6D Object Pose Estimation CVPR2024 + + +
+ Category-level 6D object pose estimation aims to estimate the rotation,
+translation and size of unseen instances within specific categories. In this
+area, dense correspondence-based methods have achieved leading performance.
+However, they do not explicitly consider the local and global geometric
+information of different instances, resulting in poor generalization ability to
+unseen instances with significant shape variations. To deal with this problem,
+we propose a novel Instance-Adaptive and Geometric-Aware Keypoint Learning
+method for category-level 6D object pose estimation (AG-Pose), which includes
+two key designs: (1) The first design is an Instance-Adaptive Keypoint
+Detection module, which can adaptively detect a set of sparse keypoints for
+various instances to represent their geometric structures. (2) The second
+design is a Geometric-Aware Feature Aggregation module, which can efficiently
+integrate the local and global geometric information into keypoint features.
+These two modules can work together to establish robust keypoint-level
+correspondences for unseen instances, thus enhancing the generalization ability
+of the model. Experimental results on CAMERA25 and REAL275 datasets show that
+the proposed AG-Pose outperforms state-of-the-art methods by a large margin
+without category-specific shape priors. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ Model Stock: All we need is just a few fine-tuned models + + +
+ This paper introduces an efficient fine-tuning method for large pre-trained
+models, offering strong in-distribution (ID) and out-of-distribution (OOD)
+performance. Breaking away from traditional practices that need a multitude of
+fine-tuned models for averaging, our approach employs significantly fewer
+models to achieve final weights yet yields superior accuracy. Drawing from key
+insights into the weight space of fine-tuned models, we uncover a strong link
+between performance and proximity to the center of the weight space. Based on
+this, we introduce a method that approximates a center-close weight using only
+two fine-tuned models, applicable during or after training. Our innovative
+layer-wise weight averaging technique surpasses state-of-the-art model
+averaging methods such as Model Soup while utilizing only two fine-tuned
+models. This strategy can be aptly coined Model Stock, highlighting its
+reliance on selecting a minimal number of models to draw a more
+optimized-averaged model. We demonstrate the efficacy of Model Stock with
+fine-tuned models based upon pre-trained CLIP architectures, achieving
+remarkable performance on both ID and OOD tasks on the standard benchmarks, all
+while barely bringing extra computational demands. Our code and pre-trained
+models are available at https://github.com/naver-ai/model-stock. + +
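A hedged sketch of a layer-wise "two fine-tuned models plus the pretrained anchor" merge in the spirit described above is given below. The angle-based interpolation ratio is one plausible way to pull the pairwise average toward the weight-space center; treat the exact formula and the state-dict interface as assumptions, not the paper's definition.

```python
import torch

def merge_layerwise(pretrained, ft_a, ft_b):
    """pretrained, ft_a, ft_b: state dicts with identical keys. Returns merged weights."""
    merged = {}
    for key in pretrained:
        w0, w1, w2 = pretrained[key].float(), ft_a[key].float(), ft_b[key].float()
        d1, d2 = (w1 - w0).flatten(), (w2 - w0).flatten()
        cos = torch.dot(d1, d2) / (d1.norm() * d2.norm() + 1e-12)
        t = (2.0 * cos / (1.0 + cos + 1e-12)).clamp(0.0, 1.0)   # per-layer ratio (assumed form)
        merged[key] = t * 0.5 * (w1 + w2) + (1.0 - t) * w0      # pull toward the anchor
    return merged
```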
+
+ comment: Code at https://github.com/naver-ai/model-stock +
+
+
+
+
+ + ☆ XScale-NVS: Cross-Scale Novel View Synthesis with Hash Featurized + Manifold CVPR 2024 + + +
+ We propose XScale-NVS for high-fidelity cross-scale novel view synthesis of +real-world large-scale scenes. Existing representations based on explicit +surface suffer from discretization resolution or UV distortion, while implicit +volumetric representations lack scalability for large scenes due to the +dispersed weight distribution and surface ambiguity. In light of the above +challenges, we introduce hash featurized manifold, a novel hash-based +featurization coupled with a deferred neural rendering framework. This approach +fully unlocks the expressivity of the representation by explicitly +concentrating the hash entries on the 2D manifold, thus effectively +representing highly detailed contents independent of the discretization +resolution. We also introduce a novel dataset, namely GigaNVS, to benchmark +cross-scale, high-resolution novel view synthesis of realworld large-scale +scenes. Our method significantly outperforms competing baselines on various +real-world scenes, yielding an average LPIPS that is 40% lower than prior +state-of-the-art on the challenging GigaNVS benchmark. Please see our project +page at: xscalenvs.github.io. + +
+
+ comment: Accepted to CVPR 2024. Project page: xscalenvs.github.io/ +
+
+
+
+
+ + ☆ CDIMC-net: Cognitive Deep Incomplete Multi-view Clustering Network IJCAI 2020 + + +
+ In recent years, incomplete multi-view clustering, which studies the
+challenging multi-view clustering problem on missing views, has received
+growing research interest. Although a series of methods have been proposed to
+address this issue, the following problems still exist: 1) Almost all of the
+existing methods are based on shallow models, which makes it difficult to
+obtain discriminative common representations. 2) These methods are generally
+sensitive to noise or outliers since the negative samples are treated the same
+as the important samples. In this paper, we propose a novel incomplete
+multi-view clustering network, called Cognitive Deep Incomplete Multi-view
+Clustering Network (CDIMC-net), to address these issues. Specifically, it
+captures the high-level features and local structure of each view by
+incorporating the view-specific deep encoders and a graph embedding strategy
+into a framework. Moreover, based on human cognition, i.e., learning from easy
+to hard, it introduces a self-paced strategy to select the most confident
+samples for model training, which can reduce the negative influence of
+outliers. Experimental results on several incomplete datasets show that
+CDIMC-net outperforms the state-of-the-art incomplete multi-view clustering
+methods. + +
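The self-paced "easy to hard" strategy mentioned above amounts to training only on the samples whose current loss falls below a threshold that is gradually raised as training proceeds. A minimal sketch, with the threshold schedule as an assumed illustration:

```python
import torch

def self_paced_weights(per_sample_losses, threshold):
    """per_sample_losses: (N,) current losses. Returns binary weights that keep
    only the most confident (lowest-loss) samples for this training round."""
    return (per_sample_losses < threshold).float()

# Example schedule: admit harder samples as training progresses.
# threshold = base_threshold * (growth_factor ** epoch)
```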
+
+ comment: Accepted by IJCAI 2020 +
+
+
+
+
+ + ☆ Debiasing Cardiac Imaging with Controlled Latent Diffusion Models + + +
+ The progress in deep learning solutions for disease diagnosis and prognosis +based on cardiac magnetic resonance imaging is hindered by highly imbalanced +and biased training data. To address this issue, we propose a method to +alleviate imbalances inherent in datasets through the generation of synthetic +data based on sensitive attributes such as sex, age, body mass index, and +health condition. We adopt ControlNet based on a denoising diffusion +probabilistic model to condition on text assembled from patient metadata and +cardiac geometry derived from segmentation masks using a large-cohort study, +specifically, the UK Biobank. We assess our method by evaluating the realism of +the generated images using established quantitative metrics. Furthermore, we +conduct a downstream classification task aimed at debiasing a classifier by +rectifying imbalances within underrepresented groups through synthetically +generated samples. Our experiments demonstrate the effectiveness of the +proposed approach in mitigating dataset imbalances, such as the scarcity of +younger patients or individuals with normal BMI level suffering from heart +failure. This work represents a major step towards the adoption of synthetic +data for the development of fair and generalizable models for medical +classification tasks. Notably, we conduct all our experiments using a single, +consumer-level GPU to highlight the feasibility of our approach within +resource-constrained environments. Our code is available at +https://github.com/faildeny/debiasing-cardiac-mri. + +
+
+
+
+
+ + ☆ RELI11D: A Comprehensive Multimodal Human Motion Dataset and Method CVPR2024 + + +
+ Comprehensive capturing of human motions requires both accurate captures of
+complex poses and precise localization of the human within scenes. Most of the
+HPE datasets and methods primarily rely on RGB, LiDAR, or IMU data. However,
+solely using these modalities or a combination of them may not be adequate for
+HPE, particularly for complex and fast movements. For holistic human motion
+understanding, we present RELI11D, a high-quality multimodal human motion
+dataset involving LiDAR, an IMU system, an RGB camera, and an event camera. It
+records the motions of 10 actors performing 5 sports in 7 scenes, including
+3.32 hours of synchronized LiDAR point clouds, IMU measurement data, RGB
+videos, and event streams. Through extensive experiments, we demonstrate that
+RELI11D presents considerable challenges and opportunities as it contains many
+rapid and complex motions that require precise localization. To address the
+challenge of integrating different modalities, we propose LEIR, a multimodal
+baseline that effectively utilizes LiDAR point clouds, event streams, and RGB
+through our cross-attention fusion strategy. We show that LEIR exhibits
+promising results for rapid motions and daily motions and that utilizing the
+characteristics of multiple modalities can indeed improve HPE performance. Both
+the dataset and source code will be released publicly to the research
+community, fostering collaboration and enabling further exploration in this
+field. + +
+
+ comment: CVPR2024, Project website: http://www.lidarhumanmotion.net/reli11d/ +
+
+
+
+
+ + ☆ Surface-based parcellation and vertex-wise analysis of ultra + high-resolution ex vivo 7 tesla MRI in neurodegenerative diseases MICCAI 2024 + + +
+ Magnetic resonance imaging (MRI) is the standard modality to understand human
+brain structure and function in vivo (antemortem). Decades of research in human
+neuroimaging have led to the widespread development of methods and tools that
+provide automated volume-based segmentations and surface-based parcellations,
+which help localize brain functions to specialized anatomical regions.
+Recently, ex vivo (postmortem) imaging of the brain has opened up avenues to
+study brain structure at sub-millimeter ultra high resolution, revealing
+details not possible to observe with in vivo MRI. Unfortunately, there has been
+limited methodological development in ex vivo MRI, primarily due to the lack of
+datasets and the limited number of centers with such imaging resources.
+Therefore, in this work, we present a one-of-its-kind dataset of 82 ex vivo T2w
+whole-brain-hemisphere MRI scans at 0.3 mm isotropic resolution spanning
+Alzheimer's disease and related dementias. We adapted and developed a fast and
+easy-to-use automated surface-based pipeline to parcellate, for the first time,
+ultra high-resolution ex vivo brain tissue at the native subject space
+resolution using the Desikan-Killiany-Tourville (DKT) brain atlas. This allows
+us to perform vertex-wise analysis in the template space and thereby link
+morphometry measures with pathology measurements derived from histology. We
+will open-source our dataset, docker container, and Jupyter notebooks, with a
+ready-to-use, out-of-the-box set of tools and command line options, on the
+project webpage to advance ex vivo MRI clinical brain imaging research. + +
+
+ comment: Under review at MICCAI 2024 +
+
+
+
+
+ + ☆ CoherentGS: Sparse Novel View Synthesis with Coherent 3D Gaussians + + +
+ The field of 3D reconstruction from images has rapidly evolved in the past
+few years, first with the introduction of Neural Radiance Fields (NeRF) and
+more recently with 3D Gaussian Splatting (3DGS). The latter provides a
+significant edge over NeRF in terms of training and inference speed, as well as
+reconstruction quality. Although 3DGS works well for dense input images, the
+unstructured, point-cloud-like representation quickly overfits to the more
+challenging setup of extremely sparse input images (e.g., 3 images), creating a
+representation that appears as a jumble of needles from novel views. To address
+this issue, we propose regularized optimization and depth-based initialization.
+Our key idea is to introduce a structured Gaussian representation that can be
+controlled in 2D image space. We then constrain the Gaussians, in particular
+their positions, and prevent them from moving independently during
+optimization. Specifically, we introduce single- and multi-view constraints
+through an implicit convolutional decoder and a total variation loss,
+respectively. With the coherency introduced to the Gaussians, we further
+constrain the optimization through a flow-based loss function. To support our
+regularized optimization, we propose an approach to initialize the Gaussians
+using monocular depth estimates at each input view. We demonstrate significant
+improvements compared to the state-of-the-art sparse-view NeRF-based approaches
+on a variety of scenes. + +
+
+ comment: Project page: https://people.engr.tamu.edu/nimak/Papers/CoherentGS +
+
+
+
+
+ + ☆ Segmentation tool for images of cracks + + +
+ Safety-critical infrastructures, such as bridges, are periodically inspected +to check for existing damage, such as fatigue cracks and corrosion, and to +guarantee the safe use of the infrastructure. Visual inspection is the most +frequent type of general inspection, despite the fact that its detection +capability is rather limited, especially for fatigue cracks. Machine learning +algorithms can be used for augmenting the capability of classical visual +inspection of bridge structures, however, the implementation of such an +algorithm requires a massive annotated training dataset, which is +time-consuming to produce. This paper proposes a semi-automatic crack +segmentation tool that eases the manual segmentation of cracks on images needed +to create a training dataset for a machine learning algorithm. Also, it can be +used to measure the geometry of the crack. This tool makes use of an image +processing algorithm, which was initially developed for the analysis of +vascular systems on retinal images. The algorithm relies on a multi-orientation +wavelet transform, which is applied to the image to construct the so-called +"orientation scores", i.e. a modified version of the image. Afterwards, the +filtered orientation scores are used to formulate an optimal path problem that +identifies the crack. The globally optimal path between manually selected crack +endpoints is computed, using a state-of-the-art geometric tracking method. The +pixel-wise segmentation is done afterwards using the obtained crack path. The +proposed method outperforms fully automatic methods and shows potential to be +an adequate alternative to the manual data annotation. + +
+
+
+
+
+ + ☆ Jointly Training and Pruning CNNs via Learnable Agent Guidance and + Alignment CVPR + 2024 + + +
+ Structural model pruning is a prominent approach used for reducing the +computational cost of Convolutional Neural Networks (CNNs) before their +deployment on resource-constrained devices. Yet, the majority of proposed ideas +require a pretrained model before pruning, which is costly to secure. In this +paper, we propose a novel structural pruning approach to jointly learn the +weights and structurally prune architectures of CNN models. The core element of +our method is a Reinforcement Learning (RL) agent whose actions determine the +pruning ratios of the CNN model's layers, and the resulting model's accuracy +serves as its reward. We conduct the joint training and pruning by iteratively +training the model's weights and the agent's policy, and we regularize the +model's weights to align with the selected structure by the agent. The evolving +model's weights result in a dynamic reward function for the agent, which +prevents using prominent episodic RL methods with stationary environment +assumption for our purpose. We address this challenge by designing a mechanism +to model the complex changing dynamics of the reward function and provide a +representation of it to the RL agent. To do so, we take a learnable embedding +for each training epoch and employ a recurrent model to calculate a +representation of the changing environment. We train the recurrent model and +embeddings using a decoder model to reconstruct observed rewards. Such a design +empowers our agent to effectively leverage episodic observations along with the +environment representations to learn a proper policy to determine performant +sub-networks of the CNN model. Our extensive experiments on CIFAR-10 and +ImageNet using ResNets and MobileNets demonstrate the effectiveness of our +method. + +
+
+ comment: IEEE/CVF Conference on Computer Vision and Pattern Recognition, CVPR + 2024 +
+
+
+
+
+ + ☆ SG-PGM: Partial Graph Matching Network with Semantic Geometric Fusion + for 3D Scene Graph Alignment and Its Downstream Tasks + + +
+ Scene graphs have been recently introduced into 3D spatial understanding as a +comprehensive representation of the scene. The alignment between 3D scene +graphs is the first step of many downstream tasks such as scene graph aided +point cloud registration, mosaicking, overlap checking, and robot navigation. +In this work, we treat 3D scene graph alignment as a partial graph-matching +problem and propose to solve it with a graph neural network. We reuse the +geometric features learned by a point cloud registration method and associate +the clustered point-level geometric features with the node-level semantic +feature via our designed feature fusion module. Partial matching is enabled by +using a learnable method to select the top-k similar node pairs. Subsequent +downstream tasks such as point cloud registration are achieved by running a +pre-trained registration network within the matched regions. We further propose +a point-matching rescoring method, that uses the node-wise alignment of the 3D +scene graph to reweight the matching candidates from a pre-trained point cloud +registration method. It reduces the false point correspondences estimated +especially in low-overlapping cases. Experiments show that our method improves +the alignment accuracy by 10~20% in low-overlap and random transformation +scenarios and outperforms the existing work in multiple downstream tasks. + +
+
+ comment: 16 pages, 10 figures +
+
+
+
+
+ + ☆ Benchmarking Implicit Neural Representation and Geometric Rendering in + Real-Time RGB-D SLAM CVPR 2024 + + +
+ Implicit neural representation (INR), in combination with geometric +rendering, has recently been employed in real-time dense RGB-D SLAM. Despite +active research endeavors being made, there lacks a unified protocol for fair +evaluation, impeding the evolution of this area. In this work, we establish, to +our knowledge, the first open-source benchmark framework to evaluate the +performance of a wide spectrum of commonly used INRs and rendering functions +for mapping and localization. The goal of our benchmark is to 1) gain an +intuition of how different INRs and rendering functions impact mapping and +localization and 2) establish a unified evaluation protocol w.r.t. the design +choices that may impact the mapping and localization. With the framework, we +conduct a large suite of experiments, offering various insights in choosing the +INRs and geometric rendering functions: for example, the dense feature grid +outperforms other INRs (e.g. tri-plane and hash grid), even when geometric and +color features are jointly encoded for memory efficiency. To extend the +findings into the practical scenario, a hybrid encoding strategy is proposed to +bring the best of the accuracy and completion from the grid-based and +decomposition-based INRs. We further propose explicit hybrid encoding for +high-fidelity dense grid mapping to comply with the RGB-D SLAM system that puts +the premise on robustness and computation efficiency. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Beyond Talking -- Generating Holistic 3D Human Dyadic Motion for + Communication + + +
+ In this paper, we introduce an innovative task focused on human +communication, aiming to generate 3D holistic human motions for both speakers +and listeners. Central to our approach is the incorporation of factorization to +decouple audio features and the combination of textual semantic information, +thereby facilitating the creation of more realistic and coordinated movements. +We separately train VQ-VAEs with respect to the holistic motions of both +speaker and listener. We consider the real-time mutual influence between the +speaker and the listener and propose a novel chain-like transformer-based +auto-regressive model specifically designed to characterize real-world +communication scenarios effectively which can generate the motions of both the +speaker and the listener simultaneously. These designs ensure that the results +we generate are both coordinated and diverse. Our approach demonstrates +state-of-the-art performance on two benchmark datasets. Furthermore, we +introduce the HoCo holistic communication dataset, which is a valuable resource +for future research. Our HoCo dataset and code will be released for research +purposes upon acceptance. + +
+
+
+
+
+ + ☆ Break-for-Make: Modular Low-Rank Adaptations for Composable + Content-Style Customization + + +
+ Personalized generation paradigms empower designers to customize visual +intellectual properties with the help of textual descriptions by tuning or +adapting pre-trained text-to-image models on a few images. Recent works explore +approaches for concurrently customizing both content and detailed visual style +appearance. However, these existing approaches often generate images where the +content and style are entangled. In this study, we reconsider the customization +of content and style concepts from the perspective of parameter space +construction. Unlike existing methods that utilize a shared parameter space for +content and style, we propose a learning framework that separates the parameter +space to facilitate individual learning of content and style, thereby enabling +disentangled content and style. To achieve this goal, we introduce "partly +learnable projection" (PLP) matrices to separate the original adapters into +divided sub-parameter spaces. We propose "break-for-make" customization +learning pipeline based on PLP, which is simple yet effective. We break the +original adapters into "up projection" and "down projection", train content and +style PLPs individually with the guidance of corresponding textual prompts in +the separate adapters, and maintain generalization by employing a +multi-correspondence projection learning strategy. Based on the adapters broken +apart for separate training content and style, we then make the entity +parameter space by reconstructing the content and style PLPs matrices, followed +by fine-tuning the combined adapter to generate the target object with the +desired appearance. Experiments on various styles, including textures, +materials, and artistic style, show that our method outperforms +state-of-the-art single/multiple concept learning pipelines in terms of +content-style-prompt alignment. + +
+
+
+
+
+ + ☆ Transparent and Clinically Interpretable AI for Lung Cancer Detection in + Chest X-Rays + + +
+ The rapidly advancing field of Explainable Artificial Intelligence (XAI) aims +to tackle the issue of trust regarding the use of complex black-box deep +learning models in real-world applications. Existing post-hoc XAI techniques +have recently been shown to have poor performance on medical data, producing +unreliable explanations which are infeasible for clinical use. To address this, +we propose an ante-hoc approach based on concept bottleneck models which +introduces for the first time clinical concepts into the classification +pipeline, allowing the user valuable insight into the decision-making process. +On a large public dataset of chest X-rays and associated medical reports, we +focus on the binary classification task of lung cancer detection. Our approach +yields improved classification performance in lung cancer detection when +compared to baseline deep learning models (F1 > 0.9), while also generating +clinically relevant and more reliable explanations than existing techniques. We +evaluate our approach against post-hoc image XAI techniques LIME and SHAP, as +well as CXR-LLaVA, a recent textual XAI tool which operates in the context of +question answering on chest X-rays. + +
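For readers unfamiliar with the ante-hoc design mentioned above, the snippet below sketches a generic concept bottleneck classifier: a backbone predicts clinical concept scores, and the diagnosis is a linear function of those concepts only. The backbone, concept count, and loss weighting are placeholders rather than the paper's architecture.

```python
import torch
import torchvision

class ConceptBottleneckCXR(torch.nn.Module):
    """Generic concept bottleneck sketch (not the paper's exact model)."""
    def __init__(self, n_concepts=8):
        super().__init__()
        backbone = torchvision.models.resnet18(weights=None)   # expects 3-channel input
        backbone.fc = torch.nn.Linear(backbone.fc.in_features, n_concepts)
        self.concept_net = backbone                             # predicts concept logits
        self.classifier = torch.nn.Linear(n_concepts, 1)        # lung cancer vs. not

    def forward(self, x):
        concepts = torch.sigmoid(self.concept_net(x))           # interpretable bottleneck
        return self.classifier(concepts), concepts

# Training would typically supervise both outputs, e.g.
# loss = bce(logits, labels) + lambda_c * bce(concepts, concept_annotations)
```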
+
+ comment: 12 pages, 10 figures +
+
+
+
+
+ + ☆ SubjectDrive: Scaling Generative Data in Autonomous Driving via Subject + Control + + +
+ Autonomous driving progress relies on large-scale annotated datasets. In this +work, we explore the potential of generative models to produce vast quantities +of freely-labeled data for autonomous driving applications and present +SubjectDrive, the first model proven to scale generative data production in a +way that could continuously improve autonomous driving applications. We +investigate the impact of scaling up the quantity of generative data on the +performance of downstream perception models and find that enhancing data +diversity plays a crucial role in effectively scaling generative data +production. Therefore, we have developed a novel model equipped with a subject +control mechanism, which allows the generative model to leverage diverse +external data sources for producing varied and useful data. Extensive +evaluations confirm SubjectDrive's efficacy in generating scalable autonomous +driving training data, marking a significant step toward revolutionizing data +production methods in this field. + +
+
+ comment: Project page: https://subjectdrive.github.io/ +
+
+
+
+
+ + ☆ BAMM: Bidirectional Autoregressive Motion Model + + +
+ Generating human motion from text has been dominated by denoising motion +models, either through diffusion or a generative masking process. However, +these models are greatly limited in usability by requiring prior knowledge of +the motion length. Conversely, autoregressive motion models address this +limitation by adaptively predicting motion endpoints, at the cost of degraded +generation quality and editing capabilities. To address these challenges, we +propose Bidirectional Autoregressive Motion Model (BAMM), a novel +text-to-motion generation framework. BAMM consists of two key components: (1) a +motion tokenizer that transforms 3D human motion into discrete tokens in latent +space, and (2) a masked self-attention transformer that autoregressively +predicts randomly masked tokens via a hybrid attention masking strategy. By +unifying generative masked modeling and autoregressive modeling, BAMM captures +rich and bidirectional dependencies among motion tokens, while learning the +probabilistic mapping from textual inputs to motion outputs with +dynamically-adjusted motion sequence length. This enables BAMM to +simultaneously achieve high-quality motion generation with enhanced usability +and built-in motion editability. Extensive experiments on HumanML3D and KIT-ML +datasets demonstrate that BAMM surpasses current state-of-the-art methods in +both qualitative and quantitative measures. +
+
+
+
+
+ + ☆ Burst Super-Resolution with Diffusion Models for Improving Perceptual + Quality IJCNN 2024 + + +
+ While burst LR images are useful for improving the SR image quality compared +with a single LR image, prior SR networks accepting the burst LR images are +trained in a deterministic manner, which is known to produce a blurry SR image. +In addition, it is difficult to perfectly align the burst LR images, making the +SR image more blurry. Since such blurry images are perceptually degraded, we +aim to reconstruct the sharp high-fidelity boundaries. Such high-fidelity +images can be reconstructed by diffusion models. However, prior SR methods +using the diffusion model are not properly optimized for the burst SR task. +Specifically, the reverse process starting from a random sample is not +optimized for image enhancement and restoration methods, including burst SR. In +our proposed method, on the other hand, burst LR features are used to +reconstruct the initial burst SR image that is fed into an intermediate step in +the diffusion model. This reverse process from the intermediate step 1) skips +diffusion steps for reconstructing the global structure of the image and 2) +focuses on steps for refining detailed textures. Our experimental results +demonstrate that our method can improve the scores of the perceptual quality +metrics. Code: https://github.com/placerkyo/BSRD + +
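The key idea of starting the reverse process from an intermediate step rather than from pure noise can be sketched as follows (SDEdit-style). The `eps_model` interface and the DDPM update used here are assumptions for illustration, not the paper's exact procedure.

```python
import torch

def burst_sr_from_intermediate(x0_init, eps_model, betas, t_start):
    """Noise an initial burst-SR estimate x0_init to step t_start, then denoise.

    betas: 1D tensor of the diffusion noise schedule.
    eps_model(x, t): assumed noise-prediction network interface.
    """
    alphas = 1.0 - betas
    abar = torch.cumprod(alphas, dim=0)
    # jump directly to step t_start instead of starting from pure noise
    x = abar[t_start].sqrt() * x0_init + (1.0 - abar[t_start]).sqrt() * torch.randn_like(x0_init)
    for t in range(t_start, -1, -1):
        eps = eps_model(x, torch.tensor([t]))                         # predicted noise
        mean = (x - betas[t] / (1.0 - abar[t]).sqrt() * eps) / alphas[t].sqrt()
        x = mean + betas[t].sqrt() * torch.randn_like(x) if t > 0 else mean
    return x
```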
+
+ comment: Accepted to IJCNN 2024 (International Joint Conference on Neural + Networks) +
+
+
+
+
+ + ☆ A Robust Ensemble Algorithm for Ischemic Stroke Lesion Segmentation: + Generalizability and Clinical Utility Beyond the ISLES Challenge + + +
+ Diffusion-weighted MRI (DWI) is essential for stroke diagnosis, treatment +decisions, and prognosis. However, image and disease variability hinder the +development of generalizable AI algorithms with clinical value. We address this +gap by presenting a novel ensemble algorithm derived from the 2022 Ischemic +Stroke Lesion Segmentation (ISLES) challenge. ISLES'22 provided 400 patient +scans with ischemic stroke from various medical centers, facilitating the +development of a wide range of cutting-edge segmentation algorithms by the +research community. Through collaboration with leading teams, we combined +top-performing algorithms into an ensemble model that overcomes the limitations +of individual solutions. Our ensemble model achieved superior ischemic lesion +detection and segmentation accuracy on our internal test set compared to +individual algorithms. This accuracy generalized well across diverse image and +disease variables. Furthermore, the model excelled in extracting clinical +biomarkers. Notably, in a Turing-like test, neuroradiologists consistently +preferred the algorithm's segmentations over manual expert efforts, +highlighting increased comprehensiveness and precision. Validation using a +real-world external dataset (N=1686) confirmed the model's generalizability. +The algorithm's outputs also demonstrated strong correlations with clinical +scores (admission NIHSS and 90-day mRS) on par with or exceeding expert-derived +results, underlining its clinical relevance. This study offers two key +findings. First, we present an ensemble algorithm +(https://github.com/Tabrisrei/ISLES22_Ensemble) that detects and segments +ischemic stroke lesions on DWI across diverse scenarios on par with expert +(neuro)radiologists. Second, we show the potential for biomedical challenge +outputs to extend beyond the challenge's initial objectives, demonstrating +their real-world clinical applicability. + +
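As a rough illustration of the ensembling step described above (the actual challenge ensemble is more sophisticated), a probability-averaging fusion of per-model lesion maps might look like this:

```python
import numpy as np

def ensemble_lesion_masks(prob_maps, threshold=0.5):
    """Average per-model lesion probability maps and threshold the result.

    prob_maps: list of arrays with identical shape (e.g. (D, H, W)), values in [0, 1].
    This only illustrates the basic idea of combining member predictions.
    """
    fused = np.mean(np.stack(prob_maps, axis=0), axis=0)
    return (fused >= threshold).astype(np.uint8)
```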
+
+
+
+
+ + ☆ OAKINK2: A Dataset of Bimanual Hands-Object Manipulation in Complex Task + Completion CVPR 2024 + + +
+ We present OAKINK2, a dataset of bimanual object manipulation tasks for +complex daily activities. In pursuit of organizing the complex tasks into a +structured representation, OAKINK2 introduces three levels of abstraction to +organize the manipulation tasks: Affordance, Primitive Task, and Complex Task. +OAKINK2 adopts an object-centric perspective for decoding the complex tasks, +treating them as a sequence of object affordance fulfillments. The first +level, Affordance, outlines the functionalities that objects in the scene can +afford; the second level, Primitive Task, describes the minimal interaction +units through which humans interact with an object to achieve its affordance; +and the third level, Complex Task, illustrates how Primitive Tasks are composed +and interdependent. The OAKINK2 dataset provides multi-view image streams and +precise pose annotations for the human body, hands, and various interacting +objects. This extensive collection supports applications such as interaction +reconstruction and motion synthesis. Based on the 3-level abstraction of +OAKINK2, we explore a task-oriented framework for Complex Task Completion +(CTC). CTC aims to generate a sequence of bimanual manipulations to achieve +task objectives. Within the CTC framework, we employ Large Language Models +(LLMs) to decompose the complex task objectives into sequences of Primitive +Tasks and have developed a Motion Fulfillment Model that generates bimanual +hand motion for each Primitive Task. The OAKINK2 dataset and models are +available at https://oakink.net/v2. +
+
+ comment: To appear in CVPR 2024. 26 pages +
+
+
+
+
+ + ☆ Brain-Shift: Unsupervised Pseudo-Healthy Brain Synthesis for Novel + Biomarker Extraction in Chronic Subdural Hematoma + + +
+ Chronic subdural hematoma (cSDH) is a common neurological condition +characterized by the accumulation of blood between the brain and the dura +mater. This accumulation of blood can exert pressure on the brain, potentially +leading to fatal outcomes. Treatment options for cSDH are limited to invasive +surgery or non-invasive management. Traditionally, the midline shift, +hand-measured by experts from an ideal sagittal plane, and the hematoma volume +have been the primary metrics for quantifying and analyzing cSDH. However, +these approaches do not quantify the local 3D brain deformation caused by cSDH. +We propose a novel method using anatomy-aware unsupervised diffeomorphic +pseudo-healthy synthesis to generate brain deformation fields. The deformation +fields derived from this process are utilized to extract biomarkers that +quantify the shift in the brain due to cSDH. We use CT scans of 121 patients +for training and validation of our method and find that our metrics allow the +identification of patients who require surgery. Our results indicate that +automatically obtained brain deformation fields might contain prognostic value +for personalized cSDH treatment. Our implementation is available on: +github.com/Barisimre/brain-morphing + +
+
+
+
+
+ + ☆ A Simple and Effective Point-based Network for Event Camera 6-DOFs Pose + Relocalization CVPR 2024 + + +
+ Event cameras exhibit remarkable attributes such as high dynamic range, +asynchronicity, and low latency, making them highly suitable for vision tasks +that involve high-speed motion in challenging lighting conditions. These +cameras implicitly capture movement and depth information in events, making +them appealing sensors for Camera Pose Relocalization (CPR) tasks. +Nevertheless, existing CPR networks based on events neglect the pivotal +fine-grained temporal information in events, resulting in unsatisfactory +performance. Moreover, the energy-efficient features are further compromised by +the use of excessively complex models, hindering efficient deployment on edge +devices. In this paper, we introduce PEPNet, a simple and effective point-based +network designed to regress six degrees of freedom (6-DOFs) event camera poses. +We rethink the relationship between the event camera and CPR tasks, leveraging +the raw Point Cloud directly as network input to harness the high-temporal +resolution and inherent sparsity of events. PEPNet is adept at abstracting the +spatial and implicit temporal features through hierarchical structure and +explicit temporal features by Attentive Bi-directional Long Short-Term Memory +(A-Bi-LSTM). By employing a carefully crafted lightweight design, PEPNet +delivers state-of-the-art (SOTA) performance on both indoor and outdoor +datasets with meager computational resources. Specifically, PEPNet attains a +significant 38% and 33% performance improvement on the random split IJRR and +M3ED datasets, respectively. Moreover, the lightweight design version +PEPNet$_{tiny}$ accomplishes results comparable to the SOTA while employing a +mere 0.5% of the parameters. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Towards Temporally Consistent Referring Video Object Segmentation + + +
+ Referring Video Object Segmentation (R-VOS) methods face challenges in +maintaining consistent object segmentation due to temporal context variability +and the presence of other visually similar objects. We propose an end-to-end +R-VOS paradigm that explicitly models temporal instance consistency alongside +the referring segmentation. Specifically, we introduce a novel hybrid memory +that facilitates inter-frame collaboration for robust spatio-temporal matching +and propagation. Features of frames with automatically generated high-quality +reference masks are propagated to segment the remaining frames based on +multi-granularity association to achieve temporally consistent R-VOS. +Furthermore, we propose a new Mask Consistency Score (MCS) metric to evaluate +the temporal consistency of video segmentation. Extensive experiments +demonstrate that our approach enhances temporal consistency by a significant +margin, leading to top-ranked performance on popular R-VOS benchmarks, i.e., +Ref-YouTube-VOS (67.1%) and Ref-DAVIS17 (65.6%). + +
+
+
+
+
+ + ☆ PointCloud-Text Matching: Benchmark Datasets and a Baseline + + +
+ In this paper, we present and study a new instance-level retrieval task: +PointCloud-Text Matching~(PTM), which aims to find the exact cross-modal +instance that matches a given point-cloud query or text query. PTM could be +applied to various scenarios, such as indoor/urban-canyon localization and +scene retrieval. However, no suitable and targeted dataset exists for PTM in +practice. Therefore, we construct three new PTM benchmark datasets, namely +3D2T-SR, 3D2T-NR, and 3D2T-QA. We observe that the data is challenging and +contains noisy correspondences, due to the sparsity, noise, or disorder of +point clouds and the ambiguity, vagueness, or incompleteness of texts, which +makes existing cross-modal matching methods ineffective for PTM. To tackle +these challenges, we propose a PTM baseline, named Robust PointCloud-Text +Matching method (RoMa). RoMa consists of two modules: a Dual Attention +Perception module (DAP) and a Robust Negative Contrastive Learning module +(RNCL). Specifically, DAP leverages token-level and feature-level attention to +adaptively focus on useful local and global features and aggregate them into +common representations, thereby reducing the adverse impact of noise and +ambiguity. To handle noisy correspondence, RNCL divides negative pairs, which +are much less error-prone than positive pairs, into clean and noisy subsets, +and assigns them forward and reverse optimization directions respectively, thus +enhancing robustness against noisy correspondence. We conduct extensive +experiments on our benchmarks and demonstrate the superiority of our RoMa. +
+
+
+
+
+ + ☆ NIGHT -- Non-Line-of-Sight Imaging from Indirect Time of Flight Data ECCV 24 + + +
+ The acquisition of objects outside the Line-of-Sight of cameras is a very +intriguing but also extremely challenging research topic. Recent works showed +the feasibility of this idea by exploiting transient imaging data produced by +custom direct Time of Flight sensors. In this paper, for the first time, we +tackle this problem using only data from an off-the-shelf indirect Time of +Flight sensor without any further hardware requirements. We introduce a Deep +Learning model able to re-frame the surfaces where light bounces happen as a +virtual mirror. This modeling makes the task easier to handle and also +facilitates the construction of annotated training data. From the obtained data +it is possible to retrieve the depth information of the hidden scene. We also +provide a first-in-its-kind synthetic dataset for the task and demonstrate the +feasibility of the proposed idea on it. +
+
+ comment: Submitted to ECCV 24, 17 pages, 6 figures, 2 tables +
+
+
+
+
+ + ☆ Infrared Small Target Detection with Scale and Location Sensitivity CVPR 2024 + + +
+ Recently, infrared small target detection (IRSTD) has been dominated by +deep-learning-based methods. However, these methods mainly focus on the design +of complex model structures to extract discriminative features, leaving the +loss functions for IRSTD under-explored. For example, the widely used +Intersection over Union (IoU) and Dice losses lack sensitivity to the scales +and locations of targets, limiting the detection performance of detectors. In +this paper, we focus on boosting detection performance with a more effective +loss but a simpler model structure. Specifically, we first propose a novel +Scale and Location Sensitive (SLS) loss to handle the limitations of existing +losses: 1) for scale sensitivity, we compute a weight for the IoU loss based on +target scales to help the detector distinguish targets with different scales; +2) for location sensitivity, we introduce a penalty term based on the center +points of targets to help the detector localize targets more precisely. Then, +we add a simple Multi-Scale Head to the plain U-Net, yielding MSHNet. By +applying SLS loss to each scale of the predictions, our MSHNet outperforms +existing state-of-the-art methods by a large margin. In addition, the detection +performance of existing detectors can be further improved when trained with our +SLS loss, demonstrating the effectiveness and generalization of our SLS loss. +The code is available at https://github.com/ying-fu/MSHNet.
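A hedged sketch of a scale- and location-sensitive loss in the spirit described above follows: an IoU term whose weight grows for smaller targets, plus a penalty on the distance between predicted and ground-truth centers. The exact weighting and penalty in the paper may differ.

```python
import torch

def soft_centroid(mask, eps=1e-6):
    """Normalized (y, x) centre of mass of a soft mask: (B, 1, H, W) -> (B, 2)."""
    b, _, h, w = mask.shape
    ys = torch.linspace(0, 1, h, device=mask.device).view(1, 1, h, 1)
    xs = torch.linspace(0, 1, w, device=mask.device).view(1, 1, 1, w)
    mass = mask.sum(dim=(1, 2, 3)) + eps
    cy = (mask * ys).sum(dim=(1, 2, 3)) / mass
    cx = (mask * xs).sum(dim=(1, 2, 3)) / mass
    return torch.stack([cy, cx], dim=1)

def sls_like_loss(pred, target, eps=1e-6):
    """Scale-weighted IoU loss plus a centre-distance penalty (illustrative only)."""
    inter = (pred * target).sum(dim=(1, 2, 3))
    union = (pred + target - pred * target).sum(dim=(1, 2, 3))
    iou = inter / (union + eps)
    scale_w = 1.0 - target.sum(dim=(1, 2, 3)) / target[0].numel()   # smaller targets -> larger weight
    loc_pen = (soft_centroid(pred) - soft_centroid(target)).norm(dim=1)
    return (scale_w * (1.0 - iou) + loc_pen).mean()
```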
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ IVLMap: Instance-Aware Visual Language Grounding for Consumer Robot + Navigation + + +
+ Vision-and-Language Navigation (VLN) is a challenging task that requires a +robot to navigate in photo-realistic environments with natural language prompts +from humans. Recent studies aim to handle this task by constructing a semantic +spatial map representation of the environment, and then leveraging the strong +reasoning ability of large language models to generate code for guiding the +robot navigation. However, these methods face limitations in instance-level +and attribute-level navigation tasks as they cannot distinguish different +instances of the same object. To address this challenge, we propose a new +method, namely, Instance-aware Visual Language Map (IVLMap), to empower the +robot with instance-level and attribute-level semantic mapping, which is +autonomously constructed by fusing the RGBD video data collected by the robot +agent with specially designed natural language map indexing in the bird's-eye +view. The indexing is instance-level and attribute-level. In particular, when +integrated with a large language model, IVLMap demonstrates the capability to +i) transform natural language into navigation targets with instance and +attribute information, enabling precise localization, and ii) accomplish +zero-shot end-to-end navigation tasks based on natural language commands. +Extensive navigation experiments are conducted. Simulation results illustrate +that our method can achieve an average improvement of 14.4\% in navigation +accuracy. Code and demo are released at https://ivlmap.github.io/. +
+
+
+
+
+ + ☆ Test-Time Domain Generalization for Face Anti-Spoofing CVPR + + +
+ Face Anti-Spoofing (FAS) is pivotal in safeguarding facial recognition +systems against presentation attacks. While domain generalization (DG) methods +have been developed to enhance FAS performance, they predominantly focus on +learning domain-invariant features during training, which may not guarantee +generalizability to unseen data that differs largely from the source +distributions. Our insight is that testing data can serve as a valuable +resource to enhance the generalizability beyond mere evaluation for DG FAS. In +this paper, we introduce a novel Test-Time Domain Generalization (TTDG) +framework for FAS, which leverages the testing data to boost the model's +generalizability. Our method, consisting of Test-Time Style Projection (TTSP) +and Diverse Style Shifts Simulation (DSSS), effectively projects the unseen +data to the seen domain space. In particular, we first introduce the innovative +TTSP to project the styles of the arbitrarily unseen samples of the testing +distribution to the known source space of the training distributions. We then +design the efficient DSSS to synthesize diverse style shifts via learnable +style bases with two specifically designed losses in a hyperspherical feature +space. Our method eliminates the need for model updates at the test time and +can be seamlessly integrated into not only the CNN but also ViT backbones. +Comprehensive experiments on widely used cross-domain FAS benchmarks +demonstrate our method's state-of-the-art performance and effectiveness. + +
+
+ comment: Accepted to IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR), 2024 +
+
+
+
+
+ + ☆ MedBN: Robust Test-Time Adaptation against Malicious Test Samples CVPR 2024 + + +
+ Test-time adaptation (TTA) has emerged as a promising solution to address +performance decay due to unforeseen distribution shifts between training and +test data. While recent TTA methods excel in adapting to test data variations, +such adaptability exposes a model to vulnerability against malicious examples, +an aspect that has received limited attention. Previous studies have uncovered +security vulnerabilities within TTA even when a small proportion of the test +batch is maliciously manipulated. In response to the emerging threat, we +propose median batch normalization (MedBN), leveraging the robustness of the +median for statistics estimation within the batch normalization layer during +test-time inference. Our method is algorithm-agnostic, thus allowing seamless +integration with existing TTA frameworks. Our experimental results on benchmark +datasets, including CIFAR10-C, CIFAR100-C and ImageNet-C, consistently +demonstrate that MedBN outperforms existing approaches in maintaining robust +performance across different attack scenarios, encompassing both instant and +cumulative attacks. Through extensive experiments, we show that our approach +sustains the performance even in the absence of attacks, achieving a practical +balance between robustness and performance. + +
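The core change can be pictured with the sketch below: a normalization layer that replaces the batch mean with the batch median (using the median absolute deviation as a robust scale here). The paper's exact statistics and their integration with running estimates may differ.

```python
import torch

class MedianBN2d(torch.nn.Module):
    """Robust test-time normalization sketch using per-channel medians."""
    def __init__(self, num_features, eps=1e-5):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(num_features))
        self.bias = torch.nn.Parameter(torch.zeros(num_features))
        self.eps = eps

    def forward(self, x):                                    # x: (B, C, H, W)
        flat = x.permute(1, 0, 2, 3).flatten(1)              # (C, B*H*W)
        med = flat.median(dim=1).values                      # robust location
        mad = (flat - med[:, None]).abs().median(dim=1).values  # robust scale (assumed)
        med = med.view(1, -1, 1, 1)
        mad = mad.view(1, -1, 1, 1)
        x_hat = (x - med) / (mad + self.eps)
        return x_hat * self.weight.view(1, -1, 1, 1) + self.bias.view(1, -1, 1, 1)
```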
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Plug-and-Play Grounding of Reasoning in Multimodal Large Language Models + + +
+ The surge of Multimodal Large Language Models (MLLMs), given their prominent +emergent capabilities in instruction following and reasoning, has greatly +advanced the field of visual reasoning. However, constrained by their +non-lossless image tokenization, most MLLMs fall short of comprehensively +capturing details of text and objects, especially in high-resolution images. To +address this, we propose P2G, a novel framework for plug-and-play grounding of +reasoning in MLLMs. Specifically, P2G exploits the tool-usage potential of +MLLMs to employ expert agents to achieve on-the-fly grounding to critical +visual and textual objects in the image, thus achieving deliberate reasoning +via multimodal prompting. We further create P2GB, a benchmark aimed at +assessing MLLMs' ability to understand inter-object relationships and text in +challenging high-resolution images. Comprehensive experiments on visual +reasoning tasks demonstrate the superiority of P2G. Notably, P2G achieves +performance comparable to GPT-4V on P2GB with a 7B backbone. Our work +highlights the potential of plug-and-play grounding of reasoning and opens up a +promising alternative beyond model scaling. +
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ☆ Mesh2NeRF: Direct Mesh Supervision for Neural Radiance Field + Representation and Generation + + +
+ We present Mesh2NeRF, an approach to derive ground-truth radiance fields from +textured meshes for 3D generation tasks. Many 3D generative approaches +represent 3D scenes as radiance fields for training. Their ground-truth +radiance fields are usually fitted from multi-view renderings from a +large-scale synthetic 3D dataset, which often results in artifacts due to +occlusions or under-fitting issues. In Mesh2NeRF, we propose an analytic +solution to directly obtain ground-truth radiance fields from 3D meshes, +characterizing the density field with an occupancy function featuring a defined +surface thickness, and determining view-dependent color through a reflection +function considering both the mesh and environment lighting. Mesh2NeRF extracts +accurate radiance fields which provides direct supervision for training +generative NeRFs and single scene representation. We validate the effectiveness +of Mesh2NeRF across various tasks, achieving a noteworthy 3.12dB improvement in +PSNR for view synthesis in single scene representation on the ABO dataset, a +0.69 PSNR enhancement in the single-view conditional generation of ShapeNet +Cars, and notably improved mesh extraction from NeRF in the unconditional +generation of Objaverse Mugs. + +
+
+ comment: Project page: https://terencecyj.github.io/projects/Mesh2NeRF/ Video: + https://youtu.be/oufv1N3f7iY +
+
+
+
+
+ + ☆ Hypergraph-based Multi-View Action Recognition using Event Cameras + + +
+ Action recognition from video data forms a cornerstone with wide-ranging +applications. Single-view action recognition faces limitations due to its +reliance on a single viewpoint. In contrast, multi-view approaches capture +complementary information from various viewpoints for improved accuracy. +Recently, event cameras have emerged as innovative bio-inspired sensors, +leading to advancements in event-based action recognition. However, existing +works predominantly focus on single-view scenarios, leaving a gap in multi-view +event data exploitation, particularly in challenges like information deficit +and semantic misalignment. To bridge this gap, we introduce HyperMV, a +multi-view event-based action recognition framework. HyperMV converts discrete +event data into frame-like representations and extracts view-related features +using a shared convolutional network. By treating segments as vertices and +constructing hyperedges using rule-based and KNN-based strategies, a multi-view +hypergraph neural network that captures relationships across viewpoint and +temporal features is established. The vertex attention hypergraph propagation +is also introduced for enhanced feature fusion. To prompt research in this +area, we present the largest multi-view event-based action dataset +$\text{THU}^{\text{MV-EACT}}\text{-50}$, comprising 50 actions from 6 +viewpoints, which surpasses existing datasets by over tenfold. Experimental +results show that HyperMV significantly outperforms baselines in both +cross-subject and cross-view scenarios, and also exceeds the state-of-the-arts +in frame-based multi-view action recognition. + +
+
+ comment: Accepted by IEEE Transactions on Pattern Analysis and Machine + Intelligence (TPAMI 2024) +
+
+
+
+
+ + ☆ Total-Decom: Decomposed 3D Scene Reconstruction with Minimal Interaction CVPR 2024 + + +
+ Scene reconstruction from multi-view images is a fundamental problem in +computer vision and graphics. Recent neural implicit surface reconstruction +methods have achieved high-quality results; however, editing and manipulating +the 3D geometry of reconstructed scenes remains challenging due to the absence +of naturally decomposed object entities and complex object/background +compositions. In this paper, we present Total-Decom, a novel method for +decomposed 3D reconstruction with minimal human interaction. Our approach +seamlessly integrates the Segment Anything Model (SAM) with hybrid +implicit-explicit neural surface representations and a mesh-based +region-growing technique for accurate 3D object decomposition. Total-Decom +requires minimal human annotations while providing users with real-time control +over the granularity and quality of decomposition. We extensively evaluate our +method on benchmark datasets and demonstrate its potential for downstream +applications, such as animation and scene editing. The code is available at +\href{https://github.com/CVMI-Lab/Total-Decom.git}{https://github.com/CVMI-Lab/Total-Decom.git}. + +
+
+ comment: 8 pages, 7 figures, accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Sparse Generation: Making Pseudo Labels Sparse for weakly supervision + with points + + +
+ In recent years, research on point weakly supervised object detection (PWSOD) +methods in computer vision has attracted increasing attention. However, +existing pseudo-label generation methods perform poorly when only a small +amount of supervised annotation data is available and in dense object detection +tasks. We consider the generation of weakly supervised pseudo labels as the +result of the model's sparse output, and propose a method called Sparse +Generation to make pseudo labels sparse. It constructs dense tensors through +the relationship between the data and the detector model, optimizes three of +its parameters, and obtains a sparse tensor via coordinated calculation, +thereby indirectly obtaining higher-quality pseudo labels and alleviating the +model's density problem when only a small amount of supervised annotation data +can be used. On two broadly used open-source datasets (RSOD, SIMD) and a +self-built dataset (Bullet-Hole), the experimental results show that the +proposed method has a significant advantage in terms of overall performance +metrics compared to state-of-the-art methods. +
+
+
+
+
+ + ☆ FlowDepth: Decoupling Optical Flow for Self-Supervised Monocular Depth + Estimation + + +
+ Self-supervised multi-frame methods have currently achieved promising results +in depth estimation. However, these methods often suffer from mismatch problems +due to the moving objects, which break the static assumption. Additionally, +unfairness can occur when calculating photometric errors in high-freq or +low-texture regions of the images. To address these issues, existing approaches +use additional semantic priori black-box networks to separate moving objects +and improve the model only at the loss level. Therefore, we propose FlowDepth, +where a Dynamic Motion Flow Module (DMFM) decouples the optical flow by a +mechanism-based approach and warps the dynamic regions thus solving the +mismatch problem. For the unfairness of photometric errors caused by high-freq +and low-texture regions, we use Depth-Cue-Aware Blur (DCABlur) and Cost-Volume +sparsity loss respectively at the input and the loss level to solve the +problem. Experimental results on the KITTI and Cityscapes datasets show that +our method outperforms the state-of-the-art methods. + +
+
+
+
+
+ + ☆ CAT: Exploiting Inter-Class Dynamics for Domain Adaptive Object + Detection CVPR 2024 + + +
+ Domain adaptive object detection aims to adapt detection models to domains +where annotated data is unavailable. Existing methods have been proposed to +address the domain gap using the semi-supervised student-teacher framework. +However, a fundamental issue arises from the class imbalance in the labelled +training set, which can result in inaccurate pseudo-labels. The relationship +between classes, especially where one class is a majority and the other +minority, has a large impact on class bias. We propose Class-Aware Teacher +(CAT) to address the class bias issue in the domain adaptation setting. In our +work, we approximate the class relationships with our Inter-Class Relation +module (ICRm) and exploit it to reduce the bias within the model. In this way, +we are able to apply augmentations to highly related classes, both inter- and +intra-domain, to boost the performance of minority classes while having minimal +impact on majority classes. We further reduce the bias by implementing a +class-relation weight to our classification loss. Experiments conducted on +various datasets and ablation studies show that our method is able to address +the class bias in the domain adaptation setting. On the Cityscapes to Foggy +Cityscapes dataset, we attained a 52.5 mAP, a substantial improvement over the +51.2 mAP achieved by the state-of-the-art method. + +
+
+ comment: Accepted into CVPR 2024 +
+
+
+
+
+ + ☆ Neural Fields for 3D Tracking of Anatomy and Surgical Instruments in + Monocular Laparoscopic Video Clips + + +
+ Laparoscopic video tracking primarily focuses on two target types: surgical +instruments and anatomy. The former could be used for skill assessment, while +the latter is necessary for the projection of virtual overlays. Where +instrument and anatomy tracking have often been considered two separate +problems, in this paper, we propose a method for joint tracking of all +structures simultaneously. Based on a single 2D monocular video clip, we train +a neural field to represent a continuous spatiotemporal scene, used to create +3D tracks of all surfaces visible in at least one frame. Due to the small size +of instruments, they generally cover a small part of the image only, resulting +in decreased tracking accuracy. Therefore, we propose enhanced class weighting +to improve the instrument tracks. We evaluate tracking on video clips from +laparoscopic cholecystectomies, where we find mean tracking accuracies of 92.4% +for anatomical structures and 87.4% for instruments. Additionally, we assess +the quality of depth maps obtained from the method's scene reconstructions. We +show that these pseudo-depths have comparable quality to a state-of-the-art +pre-trained depth estimator. On laparoscopic videos in the SCARED dataset, the +method predicts depth with an MAE of 2.9 mm and a relative error of 9.2%. These +results show the feasibility of using neural fields for monocular 3D +reconstruction of laparoscopic scenes. + +
+
+
+
+
+ + ☆ Imperceptible Protection against Style Imitation from Diffusion Models + + +
+ Recent progress in diffusion models has profoundly enhanced the fidelity of +image generation. However, this has raised concerns about copyright +infringements. While prior methods have introduced adversarial perturbations to +prevent style imitation, most are accompanied by the degradation of artworks' +visual quality. Recognizing the importance of maintaining this, we develop a +visually improved protection method that preserves its protection capability. +To this end, we create a perceptual map to identify areas most sensitive to +human eyes. We then adjust the protection intensity guided by an instance-aware +refinement. We also integrate a perceptual constraints bank to further improve +the imperceptibility. Results show that our method substantially elevates the +quality of the protected image without compromising on protection efficacy. + +
+
+
+
+
+ + ☆ Sine Activated Low-Rank Matrices for Parameter Efficient Learning + + +
+ Low-rank decomposition has emerged as a vital tool for enhancing parameter +efficiency in neural network architectures, gaining traction across diverse +applications in machine learning. These techniques significantly lower the +number of parameters, striking a balance between compactness and performance. +However, a common challenge has been the compromise between parameter +efficiency and the accuracy of the model, where reduced parameters often lead +to diminished accuracy compared to their full-rank counterparts. In this work, +we propose a novel theoretical framework that integrates a sinusoidal function +within the low-rank decomposition process. This approach not only preserves the +benefits of the parameter efficiency characteristic of low-rank methods but +also increases the decomposition's rank, thereby enhancing model accuracy. Our +method proves to be an adaptable enhancement for existing low-rank models, as +evidenced by its successful application in Vision Transformers (ViT), Large +Language Models (LLMs), Neural Radiance Fields (NeRF), and 3D shape modeling. +This demonstrates the wide-ranging potential and efficiency of our proposed +technique. + +
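One plausible instantiation of the idea of inserting a sinusoidal function into a low-rank decomposition is sketched below: the stored parameters stay low-rank while the realized weight matrix, sin(omega * A @ B), is not. The exact formulation, frequency scaling, and initialization in the paper may differ.

```python
import math
import torch

class SineLowRankLinear(torch.nn.Module):
    """Low-rank linear layer with a sine applied to the factor product (illustrative)."""
    def __init__(self, in_features, out_features, rank=4, omega=30.0):
        super().__init__()
        self.A = torch.nn.Parameter(torch.randn(out_features, rank) / math.sqrt(rank))
        self.B = torch.nn.Parameter(torch.randn(rank, in_features) / math.sqrt(in_features))
        self.bias = torch.nn.Parameter(torch.zeros(out_features))
        self.omega = omega

    def forward(self, x):
        weight = torch.sin(self.omega * (self.A @ self.B))   # raises the effective rank
        return torch.nn.functional.linear(x, weight, self.bias)
```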
+
+ comment: The first two authors contributed equally +
+
+
+
+
+ + ☆ RTracker: Recoverable Tracking via PN Tree Structured Memory CVPR 2024 + + +
+ Existing tracking methods mainly focus on learning better target +representation or developing more robust prediction models to improve tracking +performance. While tracking performance has significantly improved, the target +loss issue occurs frequently due to tracking failures, complete occlusion, or +out-of-view situations. However, considerably less attention is paid to the +self-recovery issue of tracking methods, which is crucial for practical +applications. To this end, we propose a recoverable tracking framework, +RTracker, that uses a tree-structured memory to dynamically associate a tracker +and a detector to enable self-recovery ability. Specifically, we propose a +Positive-Negative Tree-structured memory to chronologically store and maintain +positive and negative target samples. Upon the PN tree memory, we develop +corresponding walking rules for determining the state of the target and define +a set of control flows to unite the tracker and the detector in different +tracking scenarios. Our core idea is to use the support samples of positive and +negative target categories to establish a relative distance-based criterion for +a reliable assessment of target loss. The favorable performance in comparison +against the state-of-the-art methods on numerous challenging benchmarks +demonstrates the effectiveness of the proposed algorithm. + +
+
+ comment: accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Taming Lookup Tables for Efficient Image Retouching + + +
+ The widespread use of high-definition screens in edge devices, such as +end-user cameras, smartphones, and televisions, is spurring a significant +demand for image enhancement. Existing enhancement models often optimize for +high performance while falling short of reducing hardware inference time and +power consumption, especially on edge devices with constrained computing and +storage resources. To this end, we propose the Image Color Enhancement Lookup +Table (ICELUT), which adopts LUTs for extremely efficient edge inference, +without any convolutional neural network (CNN). During training, we leverage +pointwise (1x1) convolution to extract color information, alongside a split +fully connected layer to incorporate global information. Both components are +then seamlessly converted into LUTs for hardware-agnostic deployment. ICELUT +achieves near-state-of-the-art performance and remarkably low power +consumption. We observe that the pointwise network structure exhibits robust +scalability, maintaining the performance even with a heavily downsampled 32x32 +input image. These properties enable ICELUT, the first-ever purely LUT-based +image enhancer, to reach an unprecedented speed of 0.4ms on GPU and 7ms on CPU, +at least one order of magnitude faster than any CNN solution. Code is available +at https://github.com/Stephen0808/ICELUT.
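The conversion of a per-pixel network into a lookup table can be sketched as below: enumerate a quantized RGB grid, run the network once per grid color, and store the outputs, so inference becomes (interpolated) table lookups. ICELUT's real table layout, input downsampling, and split-FC handling are more involved; this only illustrates the baking step.

```python
import torch

def bake_pointwise_lut(pointwise_net, levels=33):
    """Bake a per-pixel colour network into a 3D LUT (illustrative sketch).

    pointwise_net: any module mapping (N, 3) RGB in [0, 1] to (N, out_dim).
    Returns a (levels, levels, levels, out_dim) table indexed by quantized R, G, B.
    """
    r, g, b = torch.meshgrid(
        torch.arange(levels), torch.arange(levels), torch.arange(levels), indexing="ij")
    rgb = torch.stack([r, g, b], dim=-1).float().view(-1, 3) / (levels - 1)
    with torch.no_grad():
        out = pointwise_net(rgb)                      # (levels**3, out_dim)
    return out.view(levels, levels, levels, -1)
```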
+
+
+
+
+ + ☆ DreamSalon: A Staged Diffusion Framework for Preserving Identity-Context + in Editable Face Generation + + +
+ While large-scale pre-trained text-to-image models can synthesize diverse and +high-quality human-centered images, novel challenges arise with a nuanced task +of "identity fine editing": precisely modifying specific features of a subject +while maintaining its inherent identity and context. Existing personalization +methods either require time-consuming optimization or learning additional +encoders, adept in "identity re-contextualization". However, they often +struggle with detailed and sensitive tasks like human face editing. To address +these challenges, we introduce DreamSalon, a noise-guided, staged-editing +framework, uniquely focusing on detailed image manipulations and +identity-context preservation. By discerning editing and boosting stages via +the frequency and gradient of predicted noises, DreamSalon first performs +detailed manipulations on specific features in the editing stage, guided by +high-frequency information, and then employs stochastic denoising in the +boosting stage to improve image quality. For more precise editing, DreamSalon +semantically mixes source and target textual prompts, guided by differences in +their embedding covariances, to direct the model's focus on specific +manipulation areas. Our experiments demonstrate DreamSalon's ability to +efficiently and faithfully edit fine details on human faces, outperforming +existing methods both qualitatively and quantitatively. + +
+
+
+
+
+ + ☆ AZ-NAS: Assembling Zero-Cost Proxies for Network Architecture Search CVPR 2024 + + +
+ Training-free network architecture search (NAS) aims to discover +high-performing networks with zero-cost proxies, capturing network +characteristics related to the final performance. However, network rankings +estimated by previous training-free NAS methods have shown weak correlations +with the performance. To address this issue, we propose AZ-NAS, a novel +approach that leverages the ensemble of various zero-cost proxies to enhance +the correlation between a predicted ranking of networks and the ground truth +substantially in terms of the performance. To achieve this, we introduce four +novel zero-cost proxies that are complementary to each other, analyzing +distinct traits of architectures in the views of expressivity, progressivity, +trainability, and complexity. The proxy scores can be obtained simultaneously +within a single forward and backward pass, making an overall NAS process highly +efficient. In order to integrate the rankings predicted by our proxies +effectively, we introduce a non-linear ranking aggregation method that +highlights the networks highly-ranked consistently across all the proxies. +Experimental results conclusively demonstrate the efficacy and efficiency of +AZ-NAS, outperforming state-of-the-art methods on standard benchmarks, all +while maintaining a reasonable runtime cost. + +
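The spirit of the non-linear ranking aggregation described above, rewarding only networks that rank highly under every proxy, can be sketched as follows. AZ-NAS's exact aggregation formula may differ; this is an assumed log-of-normalized-rank variant.

```python
import numpy as np

def aggregate_proxy_rankings(proxy_scores):
    """Combine per-proxy scores into a single ranking (illustrative sketch).

    proxy_scores: dict mapping proxy name -> 1D array of scores (higher = better).
    Returns an aggregate score per network; argmax gives the selected architecture.
    """
    n = len(next(iter(proxy_scores.values())))
    total = np.zeros(n)
    for scores in proxy_scores.values():
        ranks = np.argsort(np.argsort(scores)) + 1    # 1 = worst, n = best
        total += np.log(ranks / n)                    # heavily penalizes any low rank
    return total
```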
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Efficient and Effective Weakly-Supervised Action Segmentation via + Action-Transition-Aware Boundary Alignment CVPR 2024 + + +
+ Weakly-supervised action segmentation is a task of learning to partition a +long video into several action segments, where training videos are only +accompanied by transcripts (ordered list of actions). Most of existing methods +need to infer pseudo segmentation for training by serial alignment between all +frames and the transcript, which is time-consuming and hard to be parallelized +while training. In this work, we aim to escape from this inefficient alignment +with massive but redundant frames, and instead to directly localize a few +action transitions for pseudo segmentation generation, where a transition +refers to the change from an action segment to its next adjacent one in the +transcript. As the true transitions are submerged in noisy boundaries due to +intra-segment visual variation, we propose a novel Action-Transition-Aware +Boundary Alignment (ATBA) framework to efficiently and effectively filter out +noisy boundaries and detect transitions. In addition, to boost the semantic +learning in the case that noise is inevitably present in the pseudo +segmentation, we also introduce video-level losses to utilize the trusted +video-level supervision. Extensive experiments show the effectiveness of our +approach on both performance and training speed. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Towards Multimodal Video Paragraph Captioning Models Robust to Missing + Modality + + +
+ Video paragraph captioning (VPC) involves generating detailed narratives for +long videos, utilizing supportive modalities such as speech and event +boundaries. However, the existing models are constrained by the assumption of +constant availability of a single auxiliary modality, which is impractical +given the diversity and unpredictable nature of real-world scenarios. To this +end, we propose a Missing-Resistant framework MR-VPC that effectively harnesses +all available auxiliary inputs and maintains resilience even in the absence of +certain modalities. Under this framework, we propose the Multimodal VPC (MVPC) +architecture integrating video, speech, and event boundary inputs in a unified +manner to process various auxiliary inputs. Moreover, to fortify the model +against incomplete data, we introduce DropAM, a data augmentation strategy that +randomly omits auxiliary inputs, paired with DistillAM, a regularization target +that distills knowledge from teacher models trained on modality-complete data, +enabling efficient learning in modality-deficient environments. Through +exhaustive experimentation on YouCook2 and ActivityNet Captions, MR-VPC has +proven to deliver superior performance on modality-complete and +modality-missing test data. This work highlights the significance of developing +resilient VPC models and paves the way for more adaptive, robust multimodal +video understanding. + +
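A DropAM-style augmentation can be pictured with the short sketch below: auxiliary inputs are randomly zeroed during training so the captioner learns to cope with missing modalities. The key names and drop rate are placeholders, not the paper's configuration.

```python
import random
import torch

def drop_auxiliary_modalities(batch, aux_keys=("speech", "event_boundaries"), p=0.3):
    """Randomly blank out auxiliary modality tensors in a training batch (sketch)."""
    for key in aux_keys:
        if key in batch and random.random() < p:
            batch[key] = torch.zeros_like(batch[key])
    return batch
```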
+
+ comment: Code available at https://github.com/lancopku/MR-VPC +
+
+
+
+
+ + ☆ GeoAuxNet: Towards Universal 3D Representation Learning for Multi-sensor + Point Clouds CVPR 2024 + + +
+ Point clouds captured by different sensors such as RGB-D cameras and LiDAR +possess non-negligible domain gaps. Most existing methods design different +network architectures and train separately on point clouds from various +sensors. Typically, point-based methods achieve outstanding performance on +evenly distributed dense point clouds from RGB-D cameras, while voxel-based +methods are more efficient for large-range sparse LiDAR point clouds. In this +paper, we propose geometry-to-voxel auxiliary learning to enable voxel +representations to access point-level geometric information, which supports +better generalisation of the voxel-based backbone with additional +interpretations of multi-sensor point clouds. Specifically, we construct +hierarchical geometry pools generated by a voxel-guided dynamic point network, +which efficiently provide auxiliary fine-grained geometric information adapted +to different stages of voxel features. We conduct experiments on joint +multi-sensor datasets to demonstrate the effectiveness of GeoAuxNet. Enjoying +elaborate geometric information, our method outperforms other models +collectively trained on multi-sensor datasets, and achieves competitive results +against state-of-the-art expert models on each single dataset. +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Learning Multiple Representations with Inconsistency-Guided Detail + Regularization for Mask-Guided Matting + + +
+ Mask-guided matting networks have achieved significant improvements and have +shown great potential in practical applications in recent years. However, +simply learning matting representation from synthetic and +lack-of-real-world-diversity matting data, these approaches tend to overfit +low-level details in wrong regions, lack generalization to objects with complex +structures and real-world scenes such as shadows, as well as suffer from +interference of background lines or textures. To address these challenges, in +this paper, we propose a novel auxiliary learning framework for mask-guided +matting models, incorporating three auxiliary tasks: semantic segmentation, +edge detection, and background line detection besides matting, to learn +different and effective representations from different types of data and +annotations. Our framework and model introduce the following key aspects: (1) +to learn real-world adaptive semantic representation for objects with diverse +and complex structures under real-world scenes, we introduce extra semantic +segmentation and edge detection tasks on more diverse real-world data with +segmentation annotations; (2) to avoid overfitting on low-level details, we +propose a module to utilize the inconsistency between learned segmentation and +matting representations to regularize detail refinement; (3) we propose a novel +background line detection task into our auxiliary learning framework, to +suppress interference of background lines or textures. In addition, we propose +a high-quality matting benchmark, Plant-Mat, to evaluate matting methods on +complex structures. Extensively quantitative and qualitative results show that +our approach outperforms state-of-the-art mask-guided methods. + +
+
+
+
+
+ + ☆ From Activation to Initialization: Scaling Insights for Optimizing + Neural Fields CVPR 2024 + + +
+ In the realm of computer vision, Neural Fields have gained prominence as a +contemporary tool harnessing neural networks for signal representation. Despite +the remarkable progress in adapting these networks to solve a variety of +problems, the field still lacks a comprehensive theoretical framework. This +article aims to address this gap by delving into the intricate interplay +between initialization and activation, providing a foundational basis for the +robust optimization of Neural Fields. Our theoretical insights reveal a +deep-seated connection among network initialization, architectural choices, and +the optimization process, emphasizing the need for a holistic approach when +designing cutting-edge Neural Fields. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Single-Shared Network with Prior-Inspired Loss for Parameter-Efficient + Multi-Modal Imaging Skin Lesion Classification + + +
+ In this study, we introduce a multi-modal approach that efficiently +integrates multi-scale clinical and dermoscopy features within a single +network, thereby substantially reducing model parameters. The proposed method +includes three novel fusion schemes. + Firstly, unlike current methods that usually employ two individual models for +the clinical and dermoscopy modalities, we verify that multimodal features can +be learned by sharing the parameters of the encoder while keeping individual +modal-specific classifiers. + Secondly, a shared cross-attention module can replace the individual ones to +efficiently interact between the two modalities at multiple layers. + Thirdly, different from current methods that equally optimize the dermoscopy +and clinical branches, and inspired by prior knowledge that dermoscopy images +play a more significant role than clinical images, we propose a novel biased +loss. This loss guides the single-shared network to prioritize dermoscopy +information over clinical information, implicitly learning a better joint +feature representation for the modal-specific task. + Extensive experiments on the well-recognized Seven-Point Checklist (SPC) +dataset and a collected dataset demonstrate the effectiveness of our method on +both CNN and Transformer structures. Furthermore, our method exhibits +superiority in both accuracy and parameter efficiency compared to currently +advanced methods. +
+
+ comment: This paper has been submitted to a journal for review +
+
+
+
+
+ + ☆ Text Data-Centric Image Captioning with Interactive Prompts + + +
+ Supervised image captioning approaches have made great progress, but it is +challenging to collect high-quality human-annotated image-text data. Recently, +large-scale vision and language models (e.g., CLIP) and large-scale generative +language models (e.g., GPT-2) have shown strong performances in various tasks, +which also provide some new solutions for image captioning with web paired +data, unpaired data or even text-only data. Among them, the mainstream solution +is to project image embeddings into the text embedding space with the +assistance of consistent representations between image-text pairs from the CLIP +model. However, the current methods still face several challenges in adapting +to the diversity of data configurations in a unified solution, accurately +estimating image-text embedding bias, and correcting unsatisfactory prediction +results in the inference stage. This paper proposes a new Text data-centric +approach with Interactive Prompts for image Captioning, named TIPCap. 1) We +consider four different settings which gradually reduce the dependence on +paired data. 2) We construct a mapping module driven by multivariate Gaussian +distribution to mitigate the modality gap, which is applicable to the above +four different settings. 3) We propose a prompt interaction module that can +incorporate optional prompt information before generating captions. Extensive +experiments show that our TIPCap outperforms other weakly or unsupervised image +captioning methods and achieves a new state-of-the-art performance on two +widely used datasets, i.e., MS-COCO and Flickr30K. + +
+
+
+
+
+ + ☆ Rethinking Information Loss in Medical Image Segmentation with + Various-sized Targets + + +
+ Medical image segmentation presents the challenge of segmenting targets of +various sizes, demanding that the model effectively capture both local and +global information. Despite recent efforts using CNNs and ViTs to predict +annotations of different scales, these approaches often struggle to effectively +balance the detection of targets across varying sizes. Simply utilizing local +information from CNNs and global relationships from ViTs without considering +potential significant divergence in latent feature distributions may result in +substantial information loss. To address this issue, in this paper we introduce +a novel Stagger Network (SNet) and argue that a well-designed fusion structure +can mitigate the divergence in latent feature distributions between CNNs and +ViTs, thereby reducing information loss. Specifically, to emphasize both global +dependencies and local focus, we design a Parallel Module to bridge the +semantic gap. Meanwhile, we propose the Stagger Module, which fuses the +selected features that are more semantically similar. An Information Recovery +Module is further adopted to recover complementary information back to the +network. As a key contribution, we theoretically analyze that the proposed +parallel and stagger strategies lead to less information loss, thus certifying +the SNet's rationale. Experimental results clearly show that the proposed SNet +outperforms recent SOTA methods on the Synapse dataset, where targets are of +various sizes. Besides, it also demonstrates superiority on the ACDC and the +MoNuSeg datasets, where targets have more consistent dimensions. +
+
+
+
+
+ + ☆ Algorithmic Ways of Seeing: Using Object Detection to Facilitate Art + Exploration + + +
+ This Research through Design paper explores how object detection may be +applied to a large digital art museum collection to facilitate new ways of +encountering and experiencing art. We present the design and evaluation of an +interactive application called SMKExplore, which allows users to explore a +museum's digital collection of paintings by browsing through objects detected +in the images, as a novel form of open-ended exploration. We provide three +contributions. First, we show how an object detection pipeline can be +integrated into a design process for visual exploration. Second, we present the +design and development of an app that enables exploration of an art museum's +collection. Third, we offer reflections on future possibilities for museums and +HCI researchers to incorporate object detection techniques into the +digitalization of museums. + +
+
+
+
+
+ + ☆ RecDiffusion: Rectangling for Image Stitching with Diffusion Models + + +
+ Image stitching from different captures often results in non-rectangular
+boundaries, which are often considered unappealing. To obtain rectangular
+boundaries, current solutions involve cropping, which discards image content;
+inpainting, which can introduce unrelated content; or warping, which can
+distort non-linear features and introduce artifacts. To overcome these issues,
+we introduce a novel diffusion-based learning framework, \textbf{RecDiffusion},
+for image stitching rectangling. This framework combines Motion Diffusion
+Models (MDM) to generate motion fields, effectively transitioning from the
+stitched image's irregular borders to a geometrically corrected intermediary,
+followed by Content Diffusion Models (CDM) for image detail refinement.
+Notably, our sampling process utilizes a weighted map to identify regions
+needing correction during each iteration of CDM. Our RecDiffusion ensures
+geometric accuracy and overall visual appeal, surpassing all previous methods
+in both quantitative and qualitative measures when evaluated on public
+benchmarks. Code is released at https://github.com/lhaippp/RecDiffusion.
+
+
+
+
+
+ + ☆ D'OH: Decoder-Only random Hypernetworks for Implicit Neural + Representations + + +
+ Deep implicit functions have been found to be an effective tool for +efficiently encoding all manner of natural signals. Their attractiveness stems +from their ability to compactly represent signals with little to no off-line +training data. Instead, they leverage the implicit bias of deep networks to +decouple hidden redundancies within the signal. In this paper, we explore the +hypothesis that additional compression can be achieved by leveraging the +redundancies that exist between layers. We propose to use a novel run-time +decoder-only hypernetwork - that uses no offline training data - to better +model this cross-layer parameter redundancy. Previous applications of +hyper-networks with deep implicit functions have applied feed-forward +encoder/decoder frameworks that rely on large offline datasets that do not +generalize beyond the signals they were trained on. We instead present a +strategy for the initialization of run-time deep implicit functions for +single-instance signals through a Decoder-Only randomly projected Hypernetwork +(D'OH). By directly changing the dimension of a latent code to approximate a +target implicit neural architecture, we provide a natural way to vary the +memory footprint of neural representations without the costly need for neural +architecture search on a space of alternative low-rate structures. + +
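The decoder-only hypernetwork idea above can be pictured as a single trainable latent code pushed through fixed random projections that emit the weights of a small coordinate MLP. The PyTorch sketch below only illustrates that scheme under assumed layer sizes and scaling; it is not the paper's D'OH implementation.

```python
import torch
import torch.nn as nn

class RandomProjectionHypernet(nn.Module):
    """Decoder-only hypernetwork sketch: one trainable latent code is mapped by
    FIXED random matrices to the weights of a small coordinate MLP, so only the
    latent code is optimized at fit time. Layer sizes and scaling are assumptions."""

    def __init__(self, latent_dim=256, hidden=64, in_dim=2, out_dim=3):
        super().__init__()
        self.z = nn.Parameter(torch.randn(latent_dim) * 0.01)      # only trainable tensor
        self.shapes = [(hidden, in_dim), (hidden, hidden), (out_dim, hidden)]
        for i, (rows, cols) in enumerate(self.shapes):
            proj = torch.randn(rows * cols, latent_dim) / latent_dim ** 0.5
            self.register_buffer(f"proj_{i}", proj)                # fixed random projection

    def forward(self, coords):                                     # coords: (B, in_dim)
        h = coords
        for i, shape in enumerate(self.shapes):
            W = (getattr(self, f"proj_{i}") @ self.z).view(shape)  # decode layer weights
            h = h @ W.t()
            if i < len(self.shapes) - 1:
                h = torch.sin(h)                                   # SIREN-style activation
        return h

model = RandomProjectionHypernet()
rgb = model(torch.rand(16, 2))       # e.g. predict RGB at 16 random pixel coordinates
```

Because the projections are fixed, changing only the latent dimension directly trades off memory footprint against fidelity, which matches the abstract's motivation.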
+
+ comment: 29 pages, 17 figures +
+
+
+
+
+ + ☆ Within the Dynamic Context: Inertia-aware 3D Human Modeling with Pose + Sequence + + +
+ Neural rendering techniques have significantly advanced 3D human body +modeling. However, previous approaches often overlook dynamics induced by +factors such as motion inertia, leading to challenges in scenarios like abrupt +stops after rotation, where the pose remains static while the appearance +changes. This limitation arises from reliance on a single pose as conditional +input, resulting in ambiguity in mapping one pose to multiple appearances. In +this study, we elucidate that variations in human appearance depend not only on +the current frame's pose condition but also on past pose states. Therefore, we +introduce Dyco, a novel method utilizing the delta pose sequence representation +for non-rigid deformations and canonical space to effectively model temporal +appearance variations. To prevent a decrease in the model's generalization +ability to novel poses, we further propose low-dimensional global context to +reduce unnecessary inter-body part dependencies and a quantization operation to +mitigate overfitting of the delta pose sequence by the model. To validate the +effectiveness of our approach, we collected a novel dataset named I3D-Human, +with a focus on capturing temporal changes in clothing appearance under +approximate poses. Through extensive experiments on both I3D-Human and existing +datasets, our approach demonstrates superior qualitative and quantitative +performance. In addition, our inertia-aware 3D human method can unprecedentedly +simulate appearance changes caused by inertia at different velocities. + +
+
+
+
+
+ + ☆ Uncertainty-Aware Deep Video Compression with Ensembles + + +
+ Deep learning-based video compression is a challenging task, and many +previous state-of-the-art learning-based video codecs use optical flows to +exploit the temporal correlation between successive frames and then compress +the residual error. Although these two-stage models are end-to-end optimized, +the epistemic uncertainty in the motion estimation and the aleatoric +uncertainty from the quantization operation lead to errors in the intermediate +representations and introduce artifacts in the reconstructed frames. This +inherent flaw limits the potential for higher bit rate savings. To address this +issue, we propose an uncertainty-aware video compression model that can +effectively capture the predictive uncertainty with deep ensembles. +Additionally, we introduce an ensemble-aware loss to encourage the diversity +among ensemble members and investigate the benefits of incorporating +adversarial training in the video compression task. Experimental results on +1080p sequences show that our model can effectively save bits by more than 20% +compared to DVC Pro. + +
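One simple way to realize the ensemble-aware objective sketched above is to average the per-member reconstruction errors while subtracting a term that grows with disagreement between members. The snippet below is a toy version of such a loss; the variance-based diversity term and its weighting are assumptions, not the paper's exact formulation.

```python
import torch

def ensemble_loss(preds, target, diversity_weight=0.1):
    """preds: list of K tensors (B, C, H, W) from ensemble members; target: (B, C, H, W).
    Combines per-member reconstruction error with a term that rewards disagreement
    between members (one simple way to encourage ensemble diversity)."""
    stacked = torch.stack(preds, dim=0)                       # (K, B, C, H, W)
    recon = ((stacked - target.unsqueeze(0)) ** 2).mean()     # average member MSE
    diversity = stacked.var(dim=0, unbiased=False).mean()     # spread across members
    return recon - diversity_weight * diversity

preds = [torch.rand(2, 3, 32, 32, requires_grad=True) for _ in range(4)]
loss = ensemble_loss(preds, torch.rand(2, 3, 32, 32))
loss.backward()
```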
+
+ comment: Published on IEEE Transactions on Multimedia +
+
+
+
+
+ + ☆ Towards Understanding Dual BN In Hybrid Adversarial Training + + +
+ There is a growing concern about applying batch normalization (BN) in
+adversarial training (AT), especially when the model is trained on both
+adversarial samples and clean samples (termed Hybrid-AT). With the assumption
+that adversarial and clean samples are from two different domains, a common
+practice in prior works is to adopt Dual BN, where BN_adv and BN_clean are
+used for the adversarial and clean branches, respectively. A popular belief
+for motivating Dual BN is that estimating the normalization statistics of this
+mixture distribution is challenging and thus disentangling them for
+normalization achieves stronger robustness. In contrast to this belief, we
+reveal that disentangling statistics plays a lesser role than disentangling
+affine parameters in model training. This finding aligns with prior work
+(Rebuffi et al., 2023), and we build upon their research for further
+investigations. We demonstrate that the domain gap between adversarial and
+clean samples is not very large, which is counter-intuitive considering the
+significant influence of adversarial perturbation on model accuracy. We
+further propose a two-task hypothesis which serves as the empirical foundation
+and a unified framework for Hybrid-AT improvement. We also investigate Dual BN
+at test time and reveal that affine parameters characterize the robustness
+during inference. Overall, our work sheds new light on understanding the
+mechanism of Dual BN in Hybrid-AT and its underlying justification.
+
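For readers unfamiliar with Dual BN, the core mechanism is just two BatchNorm branches selected by the sample type. The minimal PyTorch sketch below shows that routing; which statistics or affine parameters should be shared is exactly the design question the paper investigates, so treat this as an illustration only.

```python
import torch
import torch.nn as nn

class DualBatchNorm2d(nn.Module):
    """Two BN branches over one feature map: clean samples use bn_clean,
    adversarial samples use bn_adv. Minimal sketch of the Dual BN routing."""

    def __init__(self, num_features):
        super().__init__()
        self.bn_clean = nn.BatchNorm2d(num_features)
        self.bn_adv = nn.BatchNorm2d(num_features)

    def forward(self, x, is_adv: bool):
        return self.bn_adv(x) if is_adv else self.bn_clean(x)

bn = DualBatchNorm2d(64)
feat = torch.randn(8, 64, 16, 16)
out_clean = bn(feat, is_adv=False)   # normalized with clean-branch statistics/affines
out_adv = bn(feat, is_adv=True)      # normalized with adversarial-branch statistics/affines
```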
+
+ comment: Accepted at TMLR +
+
+
+
+
+ + ☆ MoDiTalker: Motion-Disentangled Diffusion Model for High-Fidelity + Talking Head Generation + + +
+ Conventional GAN-based models for talking head generation often suffer from +limited quality and unstable training. Recent approaches based on diffusion +models aimed to address these limitations and improve fidelity. However, they +still face challenges, including extensive sampling times and difficulties in +maintaining temporal consistency due to the high stochasticity of diffusion +models. To overcome these challenges, we propose a novel motion-disentangled +diffusion model for high-quality talking head generation, dubbed MoDiTalker. We +introduce the two modules: audio-to-motion (AToM), designed to generate a +synchronized lip motion from audio, and motion-to-video (MToV), designed to +produce high-quality head video following the generated motion. AToM excels in +capturing subtle lip movements by leveraging an audio attention mechanism. In +addition, MToV enhances temporal consistency by leveraging an efficient +tri-plane representation. Our experiments conducted on standard benchmarks +demonstrate that our model achieves superior performance compared to existing +models. We also provide comprehensive ablation studies and user study results. + +
+
+
+
+
+ + ☆ QNCD: Quantization Noise Correction for Diffusion Models + + +
+ Diffusion models have revolutionized image synthesis, setting new benchmarks
+in quality and creativity. However, their widespread adoption is hindered by
+the intensive computation required during the iterative denoising process.
+Post-training quantization (PTQ) presents a solution to accelerate sampling,
+albeit at the expense of sample quality, especially in low-bit settings.
+Addressing this, our study introduces a unified Quantization Noise Correction
+Scheme (QNCD), aimed at minimizing quantization noise throughout the sampling
+process. We identify two primary quantization challenges: intra and inter
+quantization noise. Intra quantization noise, mainly exacerbated by embeddings
+in the resblock module, extends activation quantization ranges, increasing
+disturbances in each single denoising step. Meanwhile, inter quantization
+noise stems from cumulative quantization deviations across the entire
+denoising process, altering data distributions step-by-step. QNCD combats
+these through embedding-derived feature smoothing for eliminating intra
+quantization noise and an effective runtime noise estimation module for
+dynamically filtering inter quantization noise. Extensive experiments
+demonstrate that our method outperforms previous quantization methods for
+diffusion models, achieving lossless results in W4A8 and W8A8 quantization
+settings on ImageNet (LDM-4). Code is available at:
+https://github.com/huanpengchu/QNCD
+
+
+
+
+
+ + ☆ CLAP4CLIP: Continual Learning with Probabilistic Finetuning for + Vision-Language Models + + +
+ Continual learning (CL) aims to help deep neural networks to learn new +knowledge while retaining what has been learned. Recently, pre-trained +vision-language models such as CLIP, with powerful generalization ability, have +been gaining traction as practical CL candidates. However, the domain mismatch +between the pre-training and the downstream CL tasks calls for finetuning of +the CLIP on the latter. The deterministic nature of the existing finetuning +methods makes them overlook the many possible interactions across the +modalities and deems them unsafe for high-risk CL tasks requiring reliable +uncertainty estimation. To address these, our work proposes Continual LeArning +with Probabilistic finetuning (CLAP). CLAP develops probabilistic modeling over +task-specific modules with visual-guided text features, providing more reliable +fine-tuning in CL. It further alleviates forgetting by exploiting the rich +pre-trained knowledge of CLIP for weight initialization and distribution +regularization of task-specific modules. Cooperating with the diverse range of +existing prompting methods, CLAP can surpass the predominant deterministic +finetuning approaches for CL with CLIP. Lastly, we study the superior +uncertainty estimation abilities of CLAP for novel data detection and exemplar +selection within CL setups. Our code is available at +\url{https://github.com/srvCodes/clap4clip}. + +
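Probabilistic finetuning of a frozen CLIP backbone can be illustrated with a small variational adapter that outputs a Gaussian per feature vector, samples it with the reparameterization trick, and adds a KL penalty toward a standard normal prior. The sketch below is a generic construction of this kind, not the exact CLAP module; the dimensions, prior, and KL weight are assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ProbabilisticAdapter(nn.Module):
    """Variational adapter over frozen CLIP features: predicts a per-sample Gaussian,
    samples with the reparameterization trick, and is regularized toward N(0, I)."""

    def __init__(self, dim=512):
        super().__init__()
        self.mu = nn.Linear(dim, dim)
        self.logvar = nn.Linear(dim, dim)

    def forward(self, feats):                     # feats: (B, dim) frozen CLIP features
        mu, logvar = self.mu(feats), self.logvar(feats)
        z = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)   # reparameterization
        kl = 0.5 * (mu.pow(2) + logvar.exp() - logvar - 1).sum(dim=1).mean()
        return z, kl

adapter = ProbabilisticAdapter()
feats = torch.randn(4, 512)
z, kl = adapter(feats)
logits = z @ torch.randn(512, 10)                 # e.g. class scores via text prototypes
loss = F.cross_entropy(logits, torch.randint(0, 10, (4,))) + 1e-3 * kl
```

Sampling-based predictions from such an adapter also give a natural uncertainty signal, which is what the abstract exploits for novel-data detection and exemplar selection.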
+
+ comment: Work under review +
+
+
+
+
+ + ☆ OmniParser: A Unified Framework for Text Spotting, Key Information + Extraction and Table Recognition CVPR 2024 + + +
+ Recently, visually-situated text parsing (VsTP) has experienced notable +advancements, driven by the increasing demand for automated document +understanding and the emergence of Generative Large Language Models (LLMs) +capable of processing document-based questions. Various methods have been +proposed to address the challenging problem of VsTP. However, due to the +diversified targets and heterogeneous schemas, previous works usually design +task-specific architectures and objectives for individual tasks, which +inadvertently leads to modal isolation and complex workflow. In this paper, we +propose a unified paradigm for parsing visually-situated text across diverse +scenarios. Specifically, we devise a universal model, called OmniParser, which +can simultaneously handle three typical visually-situated text parsing tasks: +text spotting, key information extraction, and table recognition. In +OmniParser, all tasks share the unified encoder-decoder architecture, the +unified objective: point-conditioned text generation, and the unified input & +output representation: prompt & structured sequences. Extensive experiments +demonstrate that the proposed OmniParser achieves state-of-the-art (SOTA) or +highly competitive performances on 7 datasets for the three visually-situated +text parsing tasks, despite its unified, concise design. The code is available +at https://github.com/AlibabaResearch/AdvancedLiterateMachinery. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ PoCo: A Self-Supervised Approach via Polar Transformation Based + Progressive Contrastive Learning for Ophthalmic Disease Diagnosis + + +
+ Automatic ophthalmic disease diagnosis on fundus images is important in
+clinical practice. However, due to complex fundus textures and limited
+annotated data, developing an effective automatic method for this problem
+remains challenging. In this paper, we present a self-supervised method via
+polar transformation based progressive contrastive learning, called PoCo, for
+ophthalmic disease diagnosis. Specifically, we inject the polar transformation
+into contrastive learning to 1) make contrastive pre-training faster and more
+stable and 2) naturally capture task-free and rotation-related textures, which
+provides insights into disease recognition on fundus images. As a benefit,
+simple translation-invariant convolution on the transformed images can
+equivalently replace the complex rotation-invariant and sector convolution on
+the raw images. After that, we develop a progressive contrastive learning
+method to efficiently utilize large numbers of unannotated images and a novel
+progressive hard negative sampling scheme to gradually reduce the number of
+negative samples for efficient training and performance enhancement. Extensive
+experiments on three public ophthalmic disease datasets show that our PoCo
+achieves state-of-the-art performance with good generalization ability,
+validating that our method can reduce annotation effort and provide reliable
+diagnosis. Codes are available at \url{https://github.com/wjh892521292/PoCo}.
+
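The polar transformation at the heart of PoCo resamples a fundus image onto a (radius, angle) grid, so rotations about the image center become vertical shifts that ordinary translation-equivariant convolutions handle naturally. Below is a plain numpy/scipy sketch of such a warp with an assumed output resolution; it illustrates the transformation only, not the paper's pre-training pipeline.

```python
import numpy as np
from scipy.ndimage import map_coordinates

def polar_transform(img, out_r=224, out_theta=224):
    """Resample a square image onto a (radius, angle) grid centered on the image."""
    h, w = img.shape[:2]
    cy, cx = (h - 1) / 2.0, (w - 1) / 2.0
    max_r = min(cy, cx)
    radii = np.linspace(0, max_r, out_r)
    thetas = np.linspace(0, 2 * np.pi, out_theta, endpoint=False)
    rr, tt = np.meshgrid(radii, thetas, indexing="ij")          # (out_r, out_theta)
    ys, xs = cy + rr * np.sin(tt), cx + rr * np.cos(tt)         # Cartesian sample points
    if img.ndim == 2:
        return map_coordinates(img, [ys, xs], order=1)
    return np.stack([map_coordinates(img[..., c], [ys, xs], order=1)
                     for c in range(img.shape[-1])], axis=-1)

polar = polar_transform(np.random.rand(256, 256, 3))            # (224, 224, 3) output
```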
+
+
+
+
+ + ☆ Patch Spatio-Temporal Relation Prediction for Video Anomaly Detection + + +
+ Video Anomaly Detection (VAD), aiming to identify abnormalities within a +specific context and timeframe, is crucial for intelligent Video Surveillance +Systems. While recent deep learning-based VAD models have shown promising +results by generating high-resolution frames, they often lack competence in +preserving detailed spatial and temporal coherence in video frames. To tackle +this issue, we propose a self-supervised learning approach for VAD through an +inter-patch relationship prediction task. Specifically, we introduce a +two-branch vision transformer network designed to capture deep visual features +of video frames, addressing spatial and temporal dimensions responsible for +modeling appearance and motion patterns, respectively. The inter-patch +relationship in each dimension is decoupled into inter-patch similarity and the +order information of each patch. To mitigate memory consumption, we convert the +order information prediction task into a multi-label learning problem, and the +inter-patch similarity prediction task into a distance matrix regression +problem. Comprehensive experiments demonstrate the effectiveness of our method, +surpassing pixel-generation-based methods by a significant margin across three +public benchmarks. Additionally, our approach outperforms other self-supervised +learning-based methods. + +
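The two pretext objectives described above can be written down compactly: patch-order prediction as a multi-label (binary cross-entropy) problem and inter-patch similarity as regression of a pairwise distance matrix. The sketch below shows one way to combine them; the tensor shapes and equal weighting are illustrative assumptions.

```python
import torch
import torch.nn.functional as F

def patch_relation_losses(order_logits, order_labels, sim_pred, sim_target):
    """Two self-supervised targets sketched from the abstract:
    - patch order cast as multi-label classification (BCE over position labels)
    - inter-patch similarity cast as distance-matrix regression (MSE)."""
    order_loss = F.binary_cross_entropy_with_logits(order_logits, order_labels)
    sim_loss = F.mse_loss(sim_pred, sim_target)
    return order_loss + sim_loss

B, P = 4, 16                                                # batch size, patches per frame
order_logits = torch.randn(B, P, P, requires_grad=True)     # logits over position labels
order_labels = F.one_hot(torch.randint(0, P, (B, P)), P).float()
sim_pred = torch.rand(B, P, P, requires_grad=True)          # predicted pairwise distances
sim_target = torch.rand(B, P, P)
loss = patch_relation_losses(order_logits, order_labels, sim_pred, sim_target)
loss.backward()
```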
+
+
+
+
+ + ☆ Synthetic Medical Imaging Generation with Generative Adversarial + Networks For Plain Radiographs + + +
+ In medical imaging, access to data is commonly limited due to patient privacy +restrictions and the issue that it can be difficult to acquire enough data in +the case of rare diseases.[1] The purpose of this investigation was to develop +a reusable open-source synthetic image generation pipeline, the GAN Image +Synthesis Tool (GIST), that is easy to use as well as easy to deploy. The +pipeline helps to improve and standardize AI algorithms in the digital health +space by generating high quality synthetic image data that is not linked to +specific patients. Its image generation capabilities include the ability to +generate imaging of pathologies or injuries with low incidence rates. This +improvement of digital health AI algorithms could improve diagnostic accuracy, +aid in patient care, decrease medicolegal claims, and ultimately decrease the +overall cost of healthcare. The pipeline builds on existing Generative +Adversarial Networks (GANs) algorithms, and preprocessing and evaluation steps +were included for completeness. For this work, we focused on ensuring the +pipeline supports radiography, with a focus on synthetic knee and elbow x-ray +images. In designing the pipeline, we evaluated the performance of current GAN +architectures, studying the performance on available x-ray data. We show that +the pipeline is capable of generating high quality and clinically relevant +images based on a lay person's evaluation and the Fr\'echet Inception Distance +(FID) metric. + +
+
+
+
+
+ + ☆ CRKD: Enhanced Camera-Radar Object Detection with Cross-modality + Knowledge Distillation CVPR 2024 + + +
+ In the field of 3D object detection for autonomous driving, LiDAR-Camera (LC) +fusion is the top-performing sensor configuration. Still, LiDAR is relatively +high cost, which hinders adoption of this technology for consumer automobiles. +Alternatively, camera and radar are commonly deployed on vehicles already on +the road today, but performance of Camera-Radar (CR) fusion falls behind LC +fusion. In this work, we propose Camera-Radar Knowledge Distillation (CRKD) to +bridge the performance gap between LC and CR detectors with a novel +cross-modality KD framework. We use the Bird's-Eye-View (BEV) representation as +the shared feature space to enable effective knowledge distillation. To +accommodate the unique cross-modality KD path, we propose four distillation +losses to help the student learn crucial features from the teacher model. We +present extensive evaluations on the nuScenes dataset to demonstrate the +effectiveness of the proposed CRKD framework. The project page for CRKD is +https://song-jingyu.github.io/CRKD. + +
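Cross-modality distillation in a shared BEV space boils down to making the student's BEV features mimic the teacher's. The snippet below is a generic feature-mimicking loss, optionally masked to foreground cells; it stands in for the idea only and is not one of the four CRKD losses.

```python
import torch
import torch.nn.functional as F

def bev_feature_distillation(student_bev, teacher_bev, fg_mask=None):
    """Distill teacher (LiDAR-camera) BEV features into the student (camera-radar)
    branch with an MSE mimicking loss, optionally restricted to foreground cells."""
    diff = F.mse_loss(student_bev, teacher_bev, reduction="none")   # (B, C, H, W)
    if fg_mask is not None:                                         # (B, 1, H, W) in [0, 1]
        diff = diff * fg_mask
        return diff.sum() / fg_mask.sum().clamp(min=1.0) / student_bev.shape[1]
    return diff.mean()

student = torch.randn(2, 128, 100, 100, requires_grad=True)
teacher = torch.randn(2, 128, 100, 100)
mask = (torch.rand(2, 1, 100, 100) > 0.8).float()                   # toy foreground mask
loss = bev_feature_distillation(student, teacher, mask)
loss.backward()
```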
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Automated Black-box Prompt Engineering for Personalized Text-to-Image + Generation + + +
+ Prompt engineering is effective for controlling the output of text-to-image +(T2I) generative models, but it is also laborious due to the need for manually +crafted prompts. This challenge has spurred the development of algorithms for +automated prompt generation. However, these methods often struggle with +transferability across T2I models, require white-box access to the underlying +model, and produce non-intuitive prompts. In this work, we introduce PRISM, an +algorithm that automatically identifies human-interpretable and transferable +prompts that can effectively generate desired concepts given only black-box +access to T2I models. Inspired by large language model (LLM) jailbreaking, +PRISM leverages the in-context learning ability of LLMs to iteratively refine +the candidate prompts distribution for given reference images. Our experiments +demonstrate the versatility and effectiveness of PRISM in generating accurate +prompts for objects, styles and images across multiple T2I models, including +Stable Diffusion, DALL-E, and Midjourney. + +
+
+
+
+
+ + ☆ AAPMT: AGI Assessment Through Prompt and Metric Transformer + + +
+ The emergence of text-to-image models marks a significant milestone in the
+evolution of AI-generated images (AGIs), expanding their use in diverse
+domains like design, entertainment, and more. Despite these breakthroughs, the
+quality of AGIs often remains suboptimal, highlighting the need for effective
+evaluation methods. These methods are crucial for assessing the quality of
+images relative to their textual descriptions, and they must accurately mirror
+human perception. Substantial progress has been achieved in this domain, with
+innovative techniques such as BLIP and DBCNN contributing significantly.
+However, recent studies, including AGIQA-3K, reveal a notable discrepancy
+between current methods and state-of-the-art (SOTA) standards. This gap
+emphasizes the necessity for a more sophisticated and precise evaluation
+metric. In response, our objective is to develop a model that rates AGIs on
+metrics such as perceptual quality, authenticity, and the correspondence
+between text and image, in a way that aligns more closely with human
+perception. In our paper, we introduce a range of effective methods, including
+prompt designs and the Metric Transformer. The Metric Transformer is a novel
+structure inspired by the complex interrelationships among various AGI quality
+metrics. The code is available at
+https://github.com/huskydoge/CS3324-Digital-Image-Processing/tree/main/Assignment1
+
+
+
+
+
+ + ☆ GraphAD: Interaction Scene Graph for End-to-end Autonomous Driving + + +
+ Modeling complicated interactions among the ego-vehicle, road agents, and map
+elements has been a crucial part of safety-critical autonomous driving.
+Previous works on end-to-end autonomous driving rely on the attention
+mechanism for handling heterogeneous interactions, which fails to capture
+geometric priors and is also computationally intensive. In this paper, we
+propose the Interaction Scene Graph (ISG) as a unified method to model the
+interactions among the ego-vehicle, road agents, and map elements. With the
+ISG representation, the driving agents aggregate essential information from
+the most influential elements, including the road agents with potential
+collisions and the map elements to follow. Since a large number of unnecessary
+interactions are omitted, the more efficient scene-graph-based framework is
+able to focus on indispensable connections, leading to better performance. We
+evaluate the proposed method for end-to-end autonomous driving on the nuScenes
+dataset. Compared with strong baselines, our method significantly outperforms
+them on full-stack driving tasks, including perception, prediction, and
+planning. Code will be released at https://github.com/zhangyp15/GraphAD.
+
+
+ comment: project page: https://github.com/zhangyp15/GraphAD +
+
+
+
+
+ + ☆ MMCert: Provable Defense against Adversarial Attacks to Multi-modal + Models + + +
+ Different from a unimodal model whose input is from a single modality, the +input (called multi-modal input) of a multi-modal model is from multiple +modalities such as image, 3D points, audio, text, etc. Similar to unimodal +models, many existing studies show that a multi-modal model is also vulnerable +to adversarial perturbation, where an attacker could add small perturbation to +all modalities of a multi-modal input such that the multi-modal model makes +incorrect predictions for it. Existing certified defenses are mostly designed +for unimodal models, which achieve sub-optimal certified robustness guarantees +when extended to multi-modal models as shown in our experimental results. In +our work, we propose MMCert, the first certified defense against adversarial +attacks to a multi-modal model. We derive a lower bound on the performance of +our MMCert under arbitrary adversarial attacks with bounded perturbations to +both modalities (e.g., in the context of auto-driving, we bound the number of +changed pixels in both RGB image and depth image). We evaluate our MMCert using +two benchmark datasets: one for the multi-modal road segmentation task and the +other for the multi-modal emotion recognition task. Moreover, we compare our +MMCert with a state-of-the-art certified defense extended from unimodal models. +Our experimental results show that our MMCert outperforms the baseline. + +
+
+
+
+
+ + ☆ A Real-Time Framework for Domain-Adaptive Underwater Object Detection + with Image Enhancement ICRA24 + + +
+ In recent years, significant progress has been made in the field of +underwater image enhancement (UIE). However, its practical utility for +high-level vision tasks, such as underwater object detection (UOD) in +Autonomous Underwater Vehicles (AUVs), remains relatively unexplored. It may be +attributed to several factors: (1) Existing methods typically employ UIE as a +pre-processing step, which inevitably introduces considerable computational +overhead and latency. (2) The process of enhancing images prior to training +object detectors may not necessarily yield performance improvements. (3) The +complex underwater environments can induce significant domain shifts across +different scenarios, seriously deteriorating the UOD performance. To address +these challenges, we introduce EnYOLO, an integrated real-time framework +designed for simultaneous UIE and UOD with domain-adaptation capability. +Specifically, both the UIE and UOD task heads share the same network backbone +and utilize a lightweight design. Furthermore, to ensure balanced training for +both tasks, we present a multi-stage training strategy aimed at consistently +enhancing their performance. Additionally, we propose a novel domain-adaptation +strategy to align feature embeddings originating from diverse underwater +environments. Comprehensive experiments demonstrate that our framework not only +achieves state-of-the-art (SOTA) performance in both UIE and UOD tasks, but +also shows superior adaptability when applied to different underwater +scenarios. Our efficiency analysis further highlights the substantial potential +of our framework for onboard deployment. + +
+
+ comment: accepted by ICRA24 +
+
+
+
+
+ + ☆ MVEB: Self-Supervised Learning with Multi-View Entropy Bottleneck + + +
+ Self-supervised learning aims to learn representation that can be effectively +generalized to downstream tasks. Many self-supervised approaches regard two +views of an image as both the input and the self-supervised signals, assuming +that either view contains the same task-relevant information and the shared +information is (approximately) sufficient for predicting downstream tasks. +Recent studies show that discarding superfluous information not shared between +the views can improve generalization. Hence, the ideal representation is +sufficient for downstream tasks and contains minimal superfluous information, +termed minimal sufficient representation. One can learn this representation by +maximizing the mutual information between the representation and the supervised +view while eliminating superfluous information. Nevertheless, the computation +of mutual information is notoriously intractable. In this work, we propose an +objective termed multi-view entropy bottleneck (MVEB) to learn minimal +sufficient representation effectively. MVEB simplifies the minimal sufficient +learning to maximizing both the agreement between the embeddings of two views +and the differential entropy of the embedding distribution. Our experiments +confirm that MVEB significantly improves performance. For example, it achieves +top-1 accuracy of 76.9\% on ImageNet with a vanilla ResNet-50 backbone on +linear evaluation. To the best of our knowledge, this is the new +state-of-the-art result with ResNet-50. + +
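The MVEB objective is described as maximizing both the agreement between the two view embeddings and the differential entropy of the embedding distribution. As a loose illustration, the sketch below uses cosine agreement plus a Gaussian log-determinant proxy for entropy; the proxy and the weighting are assumptions and do not reproduce the paper's estimator.

```python
import torch
import torch.nn.functional as F

def agreement_entropy_loss(z1, z2, entropy_weight=0.1):
    """Pull the two view embeddings together and push the batch embedding
    distribution toward high differential entropy (Gaussian log-det proxy)."""
    z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
    agreement = (z1 * z2).sum(dim=1).mean()                  # cosine similarity of views
    z = torch.cat([z1, z2], dim=0)
    cov = torch.cov(z.t()) + 1e-4 * torch.eye(z.shape[1])    # (D, D) batch covariance
    entropy_proxy = torch.logdet(cov)                        # grows with embedding spread
    return -(agreement + entropy_weight * entropy_proxy)

loss = agreement_entropy_loss(torch.randn(256, 128, requires_grad=True),
                              torch.randn(256, 128, requires_grad=True))
loss.backward()
```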
+
+ comment: Accepted by TPAMI +
+
+
+
+
+ + ☆ Tiny Machine Learning: Progress and Futures + + +
+ Tiny Machine Learning (TinyML) is a new frontier of machine learning. By +squeezing deep learning models into billions of IoT devices and +microcontrollers (MCUs), we expand the scope of AI applications and enable +ubiquitous intelligence. However, TinyML is challenging due to hardware +constraints: the tiny memory resource makes it difficult to hold deep learning +models designed for cloud and mobile platforms. There is also limited compiler +and inference engine support for bare-metal devices. Therefore, we need to +co-design the algorithm and system stack to enable TinyML. In this review, we +will first discuss the definition, challenges, and applications of TinyML. We +then survey the recent progress in TinyML and deep learning on MCUs. Next, we +will introduce MCUNet, showing how we can achieve ImageNet-scale AI +applications on IoT devices with system-algorithm co-design. We will further +extend the solution from inference to training and introduce tiny on-device +training techniques. Finally, we present future directions in this area. +Today's large model might be tomorrow's tiny model. The scope of TinyML should +evolve and adapt over time. + +
+
+ comment: IEEE Circuits and Systems Magazine (2023). arXiv admin note: text + overlap with arXiv:2206.15472 +
+
+
+
+
+ + ☆ Low-Rank Rescaled Vision Transformer Fine-Tuning: A Residual Design + Approach + + +
+ Parameter-efficient fine-tuning for pre-trained Vision Transformers aims to +adeptly tailor a model to downstream tasks by learning a minimal set of new +adaptation parameters while preserving the frozen majority of pre-trained +parameters. Striking a balance between retaining the generalizable +representation capacity of the pre-trained model and acquiring task-specific +features poses a key challenge. Currently, there is a lack of focus on guiding +this delicate trade-off. In this study, we approach the problem from the +perspective of Singular Value Decomposition (SVD) of pre-trained parameter +matrices, providing insights into the tuning dynamics of existing methods. +Building upon this understanding, we propose a Residual-based Low-Rank +Rescaling (RLRR) fine-tuning strategy. This strategy not only enhances +flexibility in parameter tuning but also ensures that new parameters do not +deviate excessively from the pre-trained model through a residual design. +Extensive experiments demonstrate that our method achieves competitive +performance across various downstream image classification tasks, all while +maintaining comparable new parameters. We believe this work takes a step +forward in offering a unified perspective for interpreting existing methods and +serves as motivation for the development of new approaches that move closer to +effectively considering the crucial trade-off mentioned above. Our code is +available at +\href{https://github.com/zstarN70/RLRR.git}{https://github.com/zstarN70/RLRR.git}. + +
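A residual design for parameter-efficient tuning can be illustrated by freezing the pre-trained weight and adding a trainable low-rank rescaling that starts at zero, so the layer initially behaves exactly like the pre-trained model. The sketch below follows that spirit with a rank-1 multiplicative update; the exact RLRR parameterization may differ.

```python
import torch
import torch.nn as nn

class ResidualLowRankLinear(nn.Module):
    """Frozen pre-trained linear layer plus a trainable low-rank rescaling applied
    residually: W_eff = W + (u v^T) * W, with u initialized to zero so training
    starts from the unmodified pre-trained weights."""

    def __init__(self, linear: nn.Linear, v_init=1e-3):
        super().__init__()
        self.weight = nn.Parameter(linear.weight.detach(), requires_grad=False)  # frozen
        self.bias = nn.Parameter(linear.bias.detach(), requires_grad=False)      # frozen
        out_f, in_f = self.weight.shape
        self.u = nn.Parameter(torch.zeros(out_f, 1))          # rank-1 left factor
        self.v = nn.Parameter(torch.full((1, in_f), v_init))  # rank-1 right factor

    def forward(self, x):
        w_eff = self.weight + (self.u @ self.v) * self.weight  # residual low-rank rescale
        return x @ w_eff.t() + self.bias

layer = ResidualLowRankLinear(nn.Linear(768, 768))
out = layer(torch.randn(4, 768))
```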
+
+
+
+
+ + ☆ Generative Quanta Color Imaging CVPR + + +
+ The astonishing development of single-photon cameras has created an
+unprecedented opportunity for scientific and industrial imaging. However, the
+high data throughput generated by these 1-bit sensors creates a significant
+bottleneck for low-power applications. In this paper, we explore the
+possibility of generating a color image from a single binary frame of a
+single-photon camera. We find this problem to be particularly difficult for
+standard colorization approaches due to the substantial degree of exposure
+variation. The core innovation of our paper is an exposure synthesis model
+framed under a neural ordinary differential equation (Neural ODE) that allows
+us to generate a continuum of exposures from a single observation. This
+innovation ensures consistent exposure in the binary images that colorizers
+operate on, resulting in notably enhanced colorization. We demonstrate
+applications of the method in single-image and burst colorization and show
+superior generative performance over baselines. Project website can be found
+at https://vishal-s-p.github.io/projects/2023/generative_quanta_color.html.
+
+
+ comment: Accepted at IEEE Conference on Computer Vision and Pattern + Recognition (CVPR), 2024 +
+
+
+
+
+ + ☆ Single-Shared Network with Prior-Inspired Loss for Parameter-Efficient + Multi-Modal Imaging Skin Lesion Classification + + +
+ In this study, we introduce a multi-modal approach that efficiently
+integrates multi-scale clinical and dermoscopy features within a single
+network, thereby substantially reducing model parameters. The proposed method
+includes three novel fusion schemes. First, unlike current methods that
+usually employ two individual models for the clinical and dermoscopy
+modalities, we verify that multi-modal features can be learned by sharing the
+encoder parameters while keeping individual modality-specific classifiers.
+Second, a shared cross-attention module can replace the individual ones to
+efficiently interact between the two modalities at multiple layers. Third,
+unlike current methods that optimize the dermoscopy and clinical branches
+equally, and inspired by the prior knowledge that dermoscopy images play a
+more significant role than clinical images, we propose a novel biased loss.
+This loss guides the single shared network to prioritize dermoscopy
+information over clinical information, implicitly learning a better joint
+feature representation for the modality-specific tasks. Extensive experiments
+on the well-recognized Seven-Point Checklist (SPC) dataset and a collected
+dataset demonstrate the effectiveness of our method on both CNN and
+Transformer structures. Furthermore, our method exhibits superiority in both
+accuracy and model parameters compared to currently advanced methods.
+
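The biased loss described above simply weights the dermoscopy branch more heavily than the clinical branch when both share one encoder. A minimal sketch follows; the 0.7/0.3 weights and five-class setup are placeholders, not values from the paper.

```python
import torch
import torch.nn.functional as F

def biased_multimodal_loss(derm_logits, clin_logits, labels, w_derm=0.7, w_clin=0.3):
    """Weighted sum of the two modality-specific classification losses so the shared
    encoder prioritizes dermoscopy over clinical information."""
    return (w_derm * F.cross_entropy(derm_logits, labels)
            + w_clin * F.cross_entropy(clin_logits, labels))

labels = torch.randint(0, 5, (8,))
loss = biased_multimodal_loss(torch.randn(8, 5, requires_grad=True),
                              torch.randn(8, 5, requires_grad=True), labels)
loss.backward()
```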
+
+ comment: This paper has been submitted to a journal for review
+
+
+
+
+
+ + ☆ Towards Long Term SLAM on Thermal Imagery IROS 2024 + + +
+ Visual SLAM with thermal imagery, and other low contrast visually degraded +environments such as underwater, or in areas dominated by snow and ice, remain +a difficult problem for many state of the art (SOTA) algorithms. In addition to +challenging front-end data association, thermal imagery presents an additional +difficulty for long term relocalization and map reuse. The relative +temperatures of objects in thermal imagery change dramatically from day to +night. Feature descriptors typically used for relocalization in SLAM are unable +to maintain consistency over these diurnal changes. We show that learned +feature descriptors can be used within existing Bag of Word based localization +schemes to dramatically improve place recognition across large temporal gaps in +thermal imagery. In order to demonstrate the effectiveness of our trained +vocabulary, we have developed a baseline SLAM system, integrating learned +features and matching into a classical SLAM algorithm. Our system demonstrates +good local tracking on challenging thermal imagery, and relocalization that +overcomes dramatic day to night thermal appearance changes. Our code and +datasets are available here: +https://github.com/neufieldrobotics/IRSLAM_Baseline + +
+
+ comment: 8 pages, 7 figures, Submitted to IROS 2024 +
+
+
+
+
+ + ☆ Enhancing Efficiency in Vision Transformer Networks: Design Techniques + and Insights + + +
+ Intrigued by the inherent ability of the human visual system to identify +salient regions in complex scenes, attention mechanisms have been seamlessly +integrated into various Computer Vision (CV) tasks. Building upon this +paradigm, Vision Transformer (ViT) networks exploit attention mechanisms for +improved efficiency. This review navigates the landscape of redesigned +attention mechanisms within ViTs, aiming to enhance their performance. This +paper provides a comprehensive exploration of techniques and insights for +designing attention mechanisms, systematically reviewing recent literature in +the field of CV. This survey begins with an introduction to the theoretical +foundations and fundamental concepts underlying attention mechanisms. We then +present a systematic taxonomy of various attention mechanisms within ViTs, +employing redesigned approaches. A multi-perspective categorization is proposed +based on their application, objectives, and the type of attention applied. The +analysis includes an exploration of the novelty, strengths, weaknesses, and an +in-depth evaluation of the different proposed strategies. This culminates in +the development of taxonomies that highlight key properties and contributions. +Finally, we gather the reviewed studies along with their available open-source +implementations at our +\href{https://github.com/mindflow-institue/Awesome-Attention-Mechanism-in-Medical-Imaging}{GitHub}\footnote{\url{https://github.com/xmindflow/Awesome-Attention-Mechanism-in-Medical-Imaging}}. +We aim to regularly update it with the most recent relevant papers. + +
+
+ comment: Submitted to Computational Visual Media Journal +
+
+
+
+
+ + ☆ Vision-Language Synthetic Data Enhances Echocardiography Downstream + Tasks MICCAI 2024 + + +
+ High-quality, large-scale data is essential for robust deep learning models +in medical applications, particularly ultrasound image analysis. Diffusion +models facilitate high-fidelity medical image generation, reducing the costs +associated with acquiring and annotating new images. This paper utilizes recent +vision-language models to produce diverse and realistic synthetic +echocardiography image data, preserving key features of the original images +guided by textual and semantic label maps. Specifically, we investigate three +potential avenues: unconditional generation, generation guided by text, and a +hybrid approach incorporating both textual and semantic supervision. We show +that the rich contextual information present in the synthesized data +potentially enhances the accuracy and interpretability of downstream tasks, +such as echocardiography segmentation and classification with improved metrics +and faster convergence. Our implementation with checkpoints, prompts, and the +created synthetic dataset will be publicly available at +\href{https://github.com/Pooria90/DiffEcho}{GitHub}. + +
+
+ comment: Submitted as a conference paper to MICCAI 2024 +
+
+
+
+
+ + ☆ Is Synthetic Image Useful for Transfer Learning? An Investigation into + Data Generation, Volume, and Utilization ICLR24 + + +
+ Synthetic image data generation represents a promising avenue for training
+deep learning models, particularly in the realm of transfer learning, where
+obtaining real images within a specific domain can be prohibitively expensive
+due to privacy and intellectual property considerations. This work delves into
+the generation and utilization of synthetic images derived from text-to-image
+generative models in facilitating transfer learning paradigms. Despite the
+high visual fidelity of the generated images, we observe that their naive
+incorporation into existing real-image datasets does not consistently enhance
+model performance due to the inherent distribution gap between synthetic and
+real images. To address this issue, we introduce a novel two-stage framework
+called bridged transfer, which initially employs synthetic images for
+fine-tuning a pre-trained model to improve its transferability and
+subsequently uses real data for rapid adaptation. Alongside, we propose a
+dataset style inversion strategy to improve the stylistic alignment between
+synthetic and real images. Our proposed methods are evaluated across 10
+different datasets and 5 distinct models, demonstrating consistent
+improvements, with up to a 30% accuracy increase on classification tasks.
+Intriguingly, we note that the enhancements have not yet saturated, indicating
+that the benefits may further increase with an expanded volume of synthetic
+data.
+
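The bridged-transfer recipe is a two-stage fine-tuning schedule: synthetic data first, real data second. The sketch below captures only that scheduling idea with placeholder loaders and hyperparameters; it omits dataset style inversion and any other paper-specific details.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def bridged_transfer(model, synthetic_loader, real_loader,
                     epochs_syn=5, epochs_real=5, lr=1e-4):
    """Stage 1: fine-tune on synthetic images to improve transferability.
    Stage 2: adapt on the (smaller) real dataset."""
    opt = torch.optim.AdamW(model.parameters(), lr=lr)
    for loader, epochs in [(synthetic_loader, epochs_syn), (real_loader, epochs_real)]:
        for _ in range(epochs):
            for images, labels in loader:
                loss = F.cross_entropy(model(images), labels)
                opt.zero_grad()
                loss.backward()
                opt.step()
    return model

# Toy usage with lists of tensors standing in for DataLoaders.
toy = [(torch.randn(8, 32), torch.randint(0, 10, (8,))) for _ in range(3)]
model = nn.Linear(32, 10)
bridged_transfer(model, toy, toy, epochs_syn=1, epochs_real=1)
```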
+
+ comment: ICLR24 Score 6865 + https://openreview.net/forum?id=CjPt1AC6w0&referrer=%5Bthe%20profile%20of%20Chen%20Chen%5D(%2Fprofile%3Fid%3D~Chen_Chen20) +
+
+
+
+
+ + ☆ DeNetDM: Debiasing by Network Depth Modulation + + +
+ When neural networks are trained on biased datasets, they tend to +inadvertently learn spurious correlations, leading to challenges in achieving +strong generalization and robustness. Current approaches to address such biases +typically involve utilizing bias annotations, reweighting based on pseudo-bias +labels, or enhancing diversity within bias-conflicting data points through +augmentation techniques. We introduce DeNetDM, a novel debiasing method based +on the observation that shallow neural networks prioritize learning core +attributes, while deeper ones emphasize biases when tasked with acquiring +distinct information. Using a training paradigm derived from Product of +Experts, we create both biased and debiased branches with deep and shallow +architectures and then distill knowledge to produce the target debiased model. +Extensive experiments and analyses demonstrate that our approach outperforms +current debiasing techniques, achieving a notable improvement of around 5% in +three datasets, encompassing both synthetic and real-world data. Remarkably, +DeNetDM accomplishes this without requiring annotations pertaining to bias +labels or bias types, while still delivering performance on par with supervised +counterparts. Furthermore, our approach effectively harnesses the diversity of +bias-conflicting points within the data, surpassing previous methods and +obviating the need for explicit augmentation-based methods to enhance the +diversity of such bias-conflicting points. The source code will be available +upon acceptance. + +
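A Product of Experts over a shallow and a deep branch reduces, at the logit level, to summing the two branches' logits before the softmax and cross-entropy. The snippet below shows just that combination rule as an illustration of the training signal; the branch architectures and the subsequent distillation step are not shown.

```python
import torch
import torch.nn.functional as F

def product_of_experts_logits(shallow_logits, deep_logits):
    """Multiplying two softmax distributions and renormalizing is equivalent to
    adding their logits before the final softmax, so the PoE prediction is a sum."""
    return shallow_logits + deep_logits

shallow = torch.randn(8, 10, requires_grad=True)   # shallow, bias-resistant branch
deep = torch.randn(8, 10, requires_grad=True)      # deep, bias-prone branch
labels = torch.randint(0, 10, (8,))
loss = F.cross_entropy(product_of_experts_logits(shallow, deep), labels)
loss.backward()   # gradients flow to both experts, letting each branch specialize
```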
+
+ comment: 23 pages including supplementary +
+
+
+
+
+ + ☆ Multi-Frame, Lightweight & Efficient Vision-Language Models for Question + Answering in Autonomous Driving + + +
+ Vision-Language Models (VLMs) and Multi-Modal Language Models (MMLMs) have
+become prominent in autonomous driving research, as these models can provide
+interpretable textual reasoning and responses for end-to-end autonomous
+driving safety tasks using traffic scene images and other data modalities.
+However, current approaches to these systems use expensive large language
+model (LLM) backbones and image encoders, making such systems unsuitable for
+real-time autonomous driving systems where tight memory constraints exist and
+fast inference time is necessary. To address these issues, we develop
+EM-VLM4AD, an efficient, lightweight, multi-frame vision-language model which
+performs Visual Question Answering for autonomous driving. In comparison to
+previous approaches, EM-VLM4AD requires at least 10 times less memory and
+fewer floating point operations, while also achieving higher BLEU-4, METEOR,
+CIDEr, and ROUGE scores than the existing baseline on the DriveLM dataset.
+EM-VLM4AD also exhibits the ability to extract relevant information from
+traffic views related to prompts and can answer questions for various
+autonomous driving subtasks. We release our code to train and evaluate our
+model at https://github.com/akshaygopalkr/EM-VLM4AD.
+
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ Concept-based Analysis of Neural Networks via Vision-Language Models + + +
+ Formal analysis of vision-based deep neural networks (DNNs) is highly +desirable but it is very challenging due to the difficulty of expressing formal +specifications for vision tasks and the lack of efficient verification +procedures. In this paper, we propose to leverage emerging multimodal, +vision-language, foundation models (VLMs) as a lens through which we can reason +about vision models. VLMs have been trained on a large body of images +accompanied by their textual description, and are thus implicitly aware of +high-level, human-understandable concepts describing the images. We describe a +logical specification language $\texttt{Con}_{\texttt{spec}}$ designed to +facilitate writing specifications in terms of these concepts. To define and +formally check $\texttt{Con}_{\texttt{spec}}$ specifications, we leverage a +VLM, which provides a means to encode and efficiently check natural-language +properties of vision models. We demonstrate our techniques on a ResNet-based +classifier trained on the RIVAL-10 dataset leveraging CLIP as the multimodal +model. + +
+
+
+
+
+ + ☆ X-MIC: Cross-Modal Instance Conditioning for Egocentric Action + Generalization CVPR 2024 + + +
+ Lately, there has been growing interest in adapting vision-language models +(VLMs) to image and third-person video classification due to their success in +zero-shot recognition. However, the adaptation of these models to egocentric +videos has been largely unexplored. To address this gap, we propose a simple +yet effective cross-modal adaptation framework, which we call X-MIC. Using a +video adapter, our pipeline learns to align frozen text embeddings to each +egocentric video directly in the shared embedding space. Our novel adapter +architecture retains and improves generalization of the pre-trained VLMs by +disentangling learnable temporal modeling and frozen visual encoder. This +results in an enhanced alignment of text embeddings to each egocentric video, +leading to a significant improvement in cross-dataset generalization. We +evaluate our approach on the Epic-Kitchens, Ego4D, and EGTEA datasets for +fine-grained cross-dataset action generalization, demonstrating the +effectiveness of our method. Code is available at +https://github.com/annusha/xmic + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ JIST: Joint Image and Sequence Training for Sequential Visual Place + Recognition + + +
+ Visual Place Recognition aims at recognizing previously visited places by +relying on visual clues, and it is used in robotics applications for SLAM and +localization. Since typically a mobile robot has access to a continuous stream +of frames, this task is naturally cast as a sequence-to-sequence localization +problem. Nevertheless, obtaining sequences of labelled data is much more +expensive than collecting isolated images, which can be done in an automated +way with little supervision. As a mitigation to this problem, we propose a +novel Joint Image and Sequence Training protocol (JIST) that leverages large +uncurated sets of images through a multi-task learning framework. With JIST we +also introduce SeqGeM, an aggregation layer that revisits the popular GeM +pooling to produce a single robust and compact embedding from a sequence of +single-frame embeddings. We show that our model is able to outperform previous +state of the art while being faster, using 8 times smaller descriptors, having +a lighter architecture and allowing to process sequences of various lengths. +Code is available at https://github.com/ga1i13o/JIST + +
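Generalized-mean (GeM) pooling, which SeqGeM revisits for sequences, aggregates T frame descriptors as (mean_t x_t^p)^(1/p) with a learnable exponent p, interpolating between average pooling (p = 1) and max pooling (large p). The sketch below implements that standard formula over a sequence axis; read it as an illustration of the aggregation idea rather than the exact SeqGeM layer.

```python
import torch
import torch.nn as nn

class SeqGeMPooling(nn.Module):
    """GeM pooling over a sequence of single-frame descriptors:
    out = (mean_t x_t^p)^(1/p) with a learnable exponent p."""

    def __init__(self, p=3.0, eps=1e-6):
        super().__init__()
        self.p = nn.Parameter(torch.tensor(p))
        self.eps = eps

    def forward(self, seq):                       # seq: (B, T, D) frame embeddings
        return seq.clamp(min=self.eps).pow(self.p).mean(dim=1).pow(1.0 / self.p)

pool = SeqGeMPooling()
frames = torch.rand(2, 10, 512)                   # two 10-frame sequences of 512-D descriptors
seq_descriptor = pool(frames)                     # (2, 512) single compact embedding each
```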
+
+
+
+
+ + ☆ Zero-shot Prompt-based Video Encoder for Surgical Gesture Recognition + + +
+ Purpose: Surgical video is an important data stream for gesture recognition.
+Robust visual encoders for these data streams are therefore similarly
+important. Methods: Leveraging the Bridge-Prompt framework, we fine-tune a
+pre-trained vision-text model (CLIP) for gesture recognition in surgical
+videos. This allows the use of extensive data from outside the video domain,
+such as text, as well as label meta-data and weakly supervised contrastive
+losses. Results: Our experiments show that the prompt-based video encoder
+outperforms standard encoders in surgical gesture recognition tasks. Notably,
+it displays strong performance in zero-shot scenarios, where gestures/tasks
+that were not provided during the encoder training phase are included in the
+prediction phase. Additionally, we measure the benefit of including text
+descriptions in the feature extractor training scheme. Conclusion:
+Bridge-Prompt and similar pre-trained and fine-tuned video encoder models
+provide strong visual representations for surgical robotics, especially in
+gesture recognition tasks. Given the diverse range of surgical tasks
+(gestures), the ability of these models to transfer zero-shot without the need
+for any task (gesture) specific retraining makes them invaluable.
+
+
+ comment: 17 pages,4 figures, 7 tables, IPCAI 2024 +
+
+
+
+
+ + ☆ ENet-21: An Optimized light CNN Structure for Lane Detection + + +
+ Lane detection for autonomous vehicles is an important concept, yet it
+remains a challenging problem for driver assistance systems in modern
+vehicles. The emergence of deep learning has led to significant progress in
+self-driving cars. Conventional deep learning-based methods handle the lane
+detection problem as a binary segmentation task and determine whether a pixel
+belongs to a line. These methods rely on the assumption of a fixed number of
+lanes, which does not always hold. This study aims to develop an optimal
+structure for the lane detection problem, offering a promising solution for
+driver assistance features in modern vehicles by utilizing a machine learning
+method consisting of binary segmentation and Affinity Fields that can manage
+varying numbers of lanes and lane-change scenarios. In this approach, a
+Convolutional Neural Network (CNN) is selected as the feature extractor, and
+the final output is obtained through clustering of the semantic segmentation
+and Affinity Field outputs. Our method uses a less complex CNN architecture
+than existing approaches.
+
+
+ comment: The paper is under review by Soft Computing journal +
+
+
+
+
+ + ☆ Mitigating Motion Blur in Neural Radiance Fields with Events and Frames CVPR + + +
+ Neural Radiance Fields (NeRFs) have shown great potential in novel view +synthesis. However, they struggle to render sharp images when the data used for +training is affected by motion blur. On the other hand, event cameras excel in +dynamic scenes as they measure brightness changes with microsecond resolution +and are thus only marginally affected by blur. Recent methods attempt to +enhance NeRF reconstructions under camera motion by fusing frames and events. +However, they face challenges in recovering accurate color content or constrain +the NeRF to a set of predefined camera poses, harming reconstruction quality in +challenging conditions. This paper proposes a novel formulation addressing +these issues by leveraging both model- and learning-based modules. We +explicitly model the blur formation process, exploiting the event double +integral as an additional model-based prior. Additionally, we model the +event-pixel response using an end-to-end learnable response function, allowing +our method to adapt to non-idealities in the real event-camera sensor. We show, +on synthetic and real data, that the proposed approach outperforms existing +deblur NeRFs that use only frames as well as those that combine frames and +events by +6.13dB and +2.48dB, respectively. + +
+
+ comment: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), + 2024 +
+
+
+
+
+ + ☆ CLoRA: A Contrastive Approach to Compose Multiple LoRA Models + + +
+ Low-Rank Adaptations (LoRAs) have emerged as a powerful and popular
+technique in the field of image generation, offering a highly effective way
+to adapt and refine pre-trained deep learning models for specific tasks
+without the need for comprehensive retraining. By employing pre-trained LoRA
+models, such as those representing a specific cat and a particular dog, the
+objective is to generate an image that faithfully embodies both animals as
+defined by the LoRAs. However, the task of seamlessly blending multiple
+concept LoRAs to capture a variety of concepts in one image proves to be a
+significant challenge. Common approaches often fall short, primarily because
+the attention mechanisms within different LoRA models overlap, leading to
+scenarios where one concept may be completely ignored (e.g., omitting the dog)
+or where concepts are incorrectly combined (e.g., producing an image of two
+cats instead of one cat and one dog). To overcome these issues, we introduce
+CLoRA, which updates the attention maps of multiple LoRA models and leverages
+them to create semantic masks that facilitate the fusion of latent
+representations. Our method enables the creation of composite images that
+truly reflect the characteristics of each LoRA, successfully merging multiple
+concepts or styles. Our comprehensive evaluations, both qualitative and
+quantitative, demonstrate that our approach outperforms existing
+methodologies, marking a significant advancement in the field of image
+generation with LoRAs. Furthermore, we share our source code, benchmark
+dataset, and trained LoRA models to promote further research on this topic.
+
+
+
+
+
+ + ☆ ShapeFusion: A 3D diffusion model for localized shape editing + + +
+ In the realm of 3D computer vision, parametric models have emerged as a +ground-breaking methodology for the creation of realistic and expressive 3D +avatars. Traditionally, they rely on Principal Component Analysis (PCA), given +its ability to decompose data to an orthonormal space that maximally captures +shape variations. However, due to the orthogonality constraints and the global +nature of PCA's decomposition, these models struggle to perform localized and +disentangled editing of 3D shapes, which severely affects their use in +applications requiring fine control such as face sculpting. In this paper, we +leverage diffusion models to enable diverse and fully localized edits on 3D +meshes, while completely preserving the un-edited regions. We propose an +effective diffusion masking training strategy that, by design, facilitates +localized manipulation of any shape region, without being limited to predefined +regions or to sparse sets of predefined control vertices. Following our +framework, a user can explicitly set their manipulation region of choice and +define an arbitrary set of vertices as handles to edit a 3D mesh. Compared to +the current state-of-the-art our method leads to more interpretable shape +manipulations than methods relying on latent code state, greater localization +and generation diversity while offering faster inference than optimization +based approaches. Project page: https://rolpotamias.github.io/Shapefusion/ + +
+
+ comment: Project Page: https://rolpotamias.github.io/Shapefusion/ +
+
+
+
+
+ + ☆ Using Deep Learning to Increase Eye-Tracking Robustness, Accuracy, and + Precision in Virtual Reality + + +
+ Algorithms for the estimation of gaze direction from mobile and video-based +eye trackers typically involve tracking a feature of the eye that moves through +the eye camera image in a way that covaries with the shifting gaze direction, +such as the center or boundaries of the pupil. Tracking these features using +traditional computer vision techniques can be difficult due to partial +occlusion and environmental reflections. Although recent efforts to use machine +learning (ML) for pupil tracking have demonstrated superior results when +evaluated using standard measures of segmentation performance, little is known +of how these networks may affect the quality of the final gaze estimate. This +work provides an objective assessment of the impact of several contemporary +ML-based methods for eye feature tracking when the subsequent gaze estimate is +produced using either feature-based or model-based methods. Metrics include the +accuracy and precision of the gaze estimate, as well as drop-out rate. + +
+
+ comment: 16 pages, 10 figures, accepted to ETRA 2024 Full Papers +
+
+
+
+
+ + ☆ MIST: Mitigating Intersectional Bias with Disentangled Cross-Attention + Editing in Text-to-Image Diffusion Models + + +
+ Diffusion-based text-to-image models have rapidly gained popularity for their +ability to generate detailed and realistic images from textual descriptions. +However, these models often reflect the biases present in their training data, +especially impacting marginalized groups. While prior efforts to debias +language models have focused on addressing specific biases, such as racial or +gender biases, efforts to tackle intersectional bias have been limited. +Intersectional bias refers to the unique form of bias experienced by +individuals at the intersection of multiple social identities. Addressing +intersectional bias is crucial because it amplifies the negative effects of +discrimination based on race, gender, and other identities. In this paper, we +introduce a method that addresses intersectional bias in diffusion-based +text-to-image models by modifying cross-attention maps in a disentangled +manner. Our approach utilizes a pre-trained Stable Diffusion model, eliminates +the need for an additional set of reference images, and preserves the original +quality for unaltered concepts. Comprehensive experiments demonstrate that our +method surpasses existing approaches in mitigating both single and +intersectional biases across various attributes. We make our source code and +debiased models for various attributes available to encourage fairness in +generative models and to support further research. + +
+
+
+
+
+ + ♻ ☆ ACT-Diffusion: Efficient Adversarial Consistency Training for One-step + Diffusion Models CVPR 2024 + + +
+ Though diffusion models excel in image generation, their step-by-step denoising leads to slow generation speeds. Consistency training addresses this issue with single-step sampling but often produces lower-quality generations and requires high training costs. In this paper, we show that optimizing the consistency training loss minimizes the Wasserstein distance between target and generated distributions. As the timestep increases, the upper bound accumulates previous consistency training losses. Therefore, larger batch sizes are needed to reduce both current and accumulated losses. We propose Adversarial Consistency Training (ACT), which directly minimizes the Jensen-Shannon (JS) divergence between distributions at each timestep using a discriminator. Theoretically, ACT enhances generation quality and convergence. By incorporating a discriminator into the consistency training framework, our method achieves improved FID scores on the CIFAR10, ImageNet 64$\times$64, and LSUN Cat 256$\times$256 datasets, retains zero-shot image inpainting capabilities, and uses less than $1/6$ of the original batch size and fewer than $1/2$ of the model parameters and training steps compared to the baseline method, leading to a substantial reduction in resource consumption. Our code is available at https://github.com/kong13661/ACT + +
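As a rough illustration of the training objective described above, the following sketch combines a consistency loss between adjacent noise levels with a discriminator term; the toy networks, noise schedule, and loss weighting are assumptions rather than the released ACT configuration.

```python
# Toy sketch of "consistency loss + adversarial loss" (not the ACT code).
import torch
import torch.nn as nn
import torch.nn.functional as F

model = nn.Sequential(nn.Linear(17, 64), nn.ReLU(), nn.Linear(64, 16))  # input: x_t (16) + t (1)
disc = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 1))

x0 = torch.randn(8, 16)                        # "clean" data batch
t = torch.rand(8, 1)
noise = torch.randn_like(x0)
x_t, x_s = x0 + t * noise, x0 + (t * 0.9) * noise   # two adjacent noise levels

pred_t = model(torch.cat([x_t, t], dim=1))
with torch.no_grad():                          # stop-grad / EMA target in practice
    pred_s = model(torch.cat([x_s, t * 0.9], dim=1))

consistency = F.mse_loss(pred_t, pred_s)
adv = F.binary_cross_entropy_with_logits(disc(pred_t), torch.ones(8, 1))
loss = consistency + 0.1 * adv                 # weighting is an assumption
loss.backward()                                # discriminator's own update omitted
```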
+
+ comment: To appear in CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Direct Superpoints Matching for Robust Point Cloud Registration + + +
+ Deep neural networks endow the downsampled superpoints with highly discriminative feature representations. Previous dominant point cloud registration approaches match these feature representations as the first step, e.g., using the Sinkhorn algorithm. A RANSAC-like method is then usually adopted as a post-processing refinement to filter the outliers. Another dominant approach is to directly predict the superpoint matchings using learned MLP layers. Both have drawbacks: RANSAC-based methods are computationally intensive, and prediction-based methods suffer from outputting points that do not exist in the point cloud. In this paper, we propose a straightforward and effective baseline to find correspondences of superpoints in a global matching manner. We employ the normalized matching scores as weights for each correspondence, allowing us to reject the outliers and further weigh the remaining inliers when fitting the transformation matrix, without relying on the cumbersome RANSAC. Moreover, the entire model can be trained in an end-to-end fashion, leading to better accuracy. Our simple yet effective baseline shows comparable or even better results than state-of-the-art methods on three datasets including ModelNet, 3DMatch, and KITTI. We do not advocate our approach to be \emph{the} solution for point cloud registration but use the results to emphasize the role of the matching strategy for point cloud registration. The code and models are available at https://github.com/neu-vi/Superpoints_Registration. + +
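The weighted fitting step described above can be illustrated with a weighted Kabsch/Procrustes solve, shown below as a minimal sketch (not the authors' code); the uniform scores stand in for the normalized matching scores a network would predict.

```python
# Weighted rigid fit from matched superpoints and per-match scores.
import torch

def weighted_rigid_fit(src, tgt, w):
    """src, tgt: (N, 3) matched points; w: (N,) non-negative match scores."""
    w = w / w.sum()
    src_c = (w[:, None] * src).sum(0)
    tgt_c = (w[:, None] * tgt).sum(0)
    H = (src - src_c).T @ (w[:, None] * (tgt - tgt_c))   # weighted covariance
    U, _, Vt = torch.linalg.svd(H)
    d = torch.sign(torch.linalg.det(Vt.T @ U.T)).item()  # avoid reflections
    R = Vt.T @ torch.diag(torch.tensor([1.0, 1.0, d])) @ U.T
    t = tgt_c - R @ src_c
    return R, t

src = torch.randn(100, 3)
Q, _ = torch.linalg.qr(torch.randn(3, 3))
R_gt = Q * torch.sign(torch.linalg.det(Q))               # proper rotation (3x3, odd dim)
tgt = src @ R_gt.T + torch.tensor([0.1, -0.2, 0.3])
scores = torch.ones(100)                                 # stand-in for match scores
R, t = weighted_rigid_fit(src, tgt, scores)
print(torch.allclose(src @ R.T + t, tgt, atol=1e-3))     # True
```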
+
+
+
+
+ + ♻ ☆ Learnable Earth Parser: Discovering 3D Prototypes in Aerial Scans + + +
+ We propose an unsupervised method for parsing large 3D scans of real-world +scenes with easily-interpretable shapes. This work aims to provide a practical +tool for analyzing 3D scenes in the context of aerial surveying and mapping, +without the need for user annotations. Our approach is based on a probabilistic +reconstruction model that decomposes an input 3D point cloud into a small set +of learned prototypical 3D shapes. The resulting reconstruction is visually +interpretable and can be used to perform unsupervised instance and low-shot +semantic segmentation of complex scenes. We demonstrate the usefulness of our +model on a novel dataset of seven large aerial LiDAR scans from diverse +real-world scenarios. Our approach outperforms state-of-the-art unsupervised +methods in terms of decomposition accuracy while remaining visually +interpretable. Our code and dataset are available at +https://romainloiseau.fr/learnable-earth-parser/ + +
+
+
+
+
+ + ♻ ☆ Quantum machine learning for image classification + + +
+ Image classification, a pivotal task in multiple industries, faces +computational challenges due to the burgeoning volume of visual data. This +research addresses these challenges by introducing two quantum machine learning +models that leverage the principles of quantum mechanics for effective +computations. Our first model, a hybrid quantum neural network with parallel +quantum circuits, enables the execution of computations even in the noisy +intermediate-scale quantum era, where circuits with a large number of qubits +are currently infeasible. This model demonstrated a record-breaking +classification accuracy of 99.21% on the full MNIST dataset, surpassing the +performance of known quantum-classical models, while having eight times fewer +parameters than its classical counterpart. Also, the results of testing this +hybrid model on a Medical MNIST (classification accuracy over 99%), and on +CIFAR-10 (classification accuracy over 82%), can serve as evidence of the +generalizability of the model and highlights the efficiency of quantum layers +in distinguishing common features of input data. Our second model introduces a +hybrid quantum neural network with a Quanvolutional layer, reducing image +resolution via a convolution process. The model matches the performance of its +classical counterpart, having four times fewer trainable parameters, and +outperforms a classical model with equal weight parameters. These models +represent advancements in quantum machine learning research and illuminate the +path towards more accurate image classification systems. + +
+
+ comment: 13 pages, 10 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Boosting Latent Diffusion with Flow Matching + + +
+ Recently, there has been tremendous progress in visual synthesis and the +underlying generative models. Here, diffusion models (DMs) stand out +particularly, but lately, flow matching (FM) has also garnered considerable +interest. While DMs excel in providing diverse images, they suffer from long +training and slow generation. With latent diffusion, these issues are only +partially alleviated. Conversely, FM offers faster training and inference but +exhibits less diversity in synthesis. We demonstrate that introducing FM +between the Diffusion model and the convolutional decoder offers +high-resolution image synthesis with reduced computational cost and model size. +Diffusion can then efficiently provide the necessary generation diversity. FM +compensates for the lower resolution, mapping the small latent space to a +high-dimensional one. Subsequently, the convolutional decoder of the LDM maps +these latents to high-resolution images. By combining the diversity of DMs, the +efficiency of FMs, and the effectiveness of convolutional decoders, we achieve +state-of-the-art high-resolution image synthesis at $1024^2$ with minimal +computational cost. Importantly, our approach is orthogonal to recent +approximation and speed-up strategies for the underlying DMs, making it easily +integrable into various DM frameworks. + +
+
+
+
+
+ + ♻ ☆ Transcending Forgery Specificity with Latent Space Augmentation for + Generalizable Deepfake Detection + + +
+ Deepfake detection faces a critical generalization hurdle, with performance deteriorating when there is a mismatch between the distributions of training and testing data. A widely accepted explanation is the tendency of these detectors to overfit to forgery-specific artifacts, rather than learning features that are widely applicable across various forgeries. To address this issue, we propose a simple yet effective detector called LSDA (Latent Space Data Augmentation), which is based on a heuristic idea: representations trained with a wider variety of forgeries should learn a more generalizable decision boundary, thereby mitigating the overfitting to method-specific features. Following this idea, we propose to enlarge the forgery space by constructing and simulating variations within and across forgery features in the latent space. This approach encompasses the acquisition of enriched, domain-specific features and the facilitation of smoother transitions between different forgery types, effectively bridging domain gaps. Our approach culminates in refining a binary classifier that leverages the distilled knowledge from the enhanced features, striving for a generalizable deepfake detector. Comprehensive experiments show that our proposed method is surprisingly effective and surpasses state-of-the-art detectors across several widely used benchmarks. + +
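One simple way to picture the latent-space augmentation idea is to interpolate features between samples of different forgery types, as in the sketch below; the encoder dimensionality and Beta-mixing rule are illustrative assumptions, not the paper's exact augmentation.

```python
# Latent mixing across forgery types to enlarge the "forgery space".
import torch

def latent_forgery_mix(z_a, z_b, alpha=0.5):
    """z_a, z_b: (B, D) latent features from two forgery types."""
    lam = torch.distributions.Beta(alpha, alpha).sample((z_a.size(0), 1))
    return lam * z_a + (1 - lam) * z_b        # mixed samples stay labeled "fake"

z_deepfakes = torch.randn(8, 256)             # stand-in encoder features, type A
z_faceswap = torch.randn(8, 256)              # stand-in encoder features, type B
z_aug = latent_forgery_mix(z_deepfakes, z_faceswap)
print(z_aug.shape)                            # (8, 256) -> extra fakes for the binary head
```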
+
+
+
+
+ + ♻ ☆ Zero-shot Referring Expression Comprehension via Structural Similarity + Between Images and Captions CVPR 2024 + + +
+ Zero-shot referring expression comprehension aims at localizing bounding boxes in an image corresponding to provided textual prompts, which requires: (i) a fine-grained disentanglement of the complex visual scene and textual context, and (ii) a capacity to understand relationships among disentangled entities. Unfortunately, existing large vision-language alignment (VLA) models, e.g., CLIP, struggle with both aspects and so cannot be directly used for this task. To mitigate this gap, we leverage large foundation models to disentangle both images and texts into triplets in the format of (subject, predicate, object). After that, grounding is accomplished by calculating the structural similarity matrix between visual and textual triplets with a VLA model, which is subsequently propagated to an instance-level similarity matrix. Furthermore, to equip VLA models with the ability of relationship understanding, we design a triplet-matching objective to fine-tune the VLA models on a curated dataset containing abundant entity relationships. Experiments demonstrate a visual grounding performance increase of up to 19.5% over the SOTA zero-shot model on RefCOCO/+/g. On the more challenging Who's Waldo dataset, our zero-shot approach achieves comparable accuracy to the fully supervised model. Code is available at https://github.com/Show-han/Zeroshot_REC. + +
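The structural-similarity step can be pictured as an element-wise comparison of (subject, predicate, object) embeddings, as in the sketch below; the stand-in random embeddings and the simple averaging rule are assumptions, not the released pipeline.

```python
# Triplet-level structural similarity between visual and textual triplets.
import torch
import torch.nn.functional as F

def triplet_similarity(vis, txt):
    """vis, txt: dicts with 'subject', 'predicate', 'object' -> (N, D) / (M, D) embeddings."""
    sims = []
    for k in ("subject", "predicate", "object"):
        sims.append(F.normalize(vis[k], dim=-1) @ F.normalize(txt[k], dim=-1).T)
    return torch.stack(sims).mean(0)           # (N, M) structural similarity matrix

vis = {k: torch.randn(5, 512) for k in ("subject", "predicate", "object")}   # 5 visual triplets
txt = {k: torch.randn(2, 512) for k in ("subject", "predicate", "object")}   # 2 textual triplets
S = triplet_similarity(vis, txt)
best_region = S.argmax(dim=0)                  # visual triplet best matching each text triplet
print(S.shape, best_region)
```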
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Classifying Objects in 3D Point Clouds Using Recurrent Neural Network: A + GRU LSTM Hybrid Approach + + +
+ Accurate classification of objects in 3D point clouds is a significant problem in several applications, such as autonomous navigation and augmented/virtual reality scenarios, and has become a research hot spot. In this paper, we present a deep learning strategy for 3D object classification in augmented reality. The proposed approach combines GRU and LSTM networks. LSTM networks learn long-range dependencies well, but their larger number of gates makes them slower to train; GRU networks, on the other hand, perform slightly worse than LSTMs but train much faster thanks to their fewer gates. The proposed approach therefore combines the training speed of the GRU with the accuracy of the LSTM. It achieved an accuracy of 0.99 on the 4,499,0641-point dataset, which includes eight classes plus an unlabeled category (man-made terrain, natural terrain, high vegetation, low vegetation, buildings, hardscape, scanning artifacts, cars), whereas traditional machine learning approaches achieved a maximum accuracy of 0.9489 in the best case. Keywords: Point Cloud Classification, Virtual Reality, Hybrid Model, GRU-LSTM, GRU, LSTM + +
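A minimal PyTorch sketch of a GRU+LSTM hybrid classifier over point sequences is given below; the layer sizes, pooling, and input layout are assumptions, since the paper's exact architecture is not reproduced here.

```python
# Hybrid GRU -> LSTM classifier over a sequence of 3D points.
import torch
import torch.nn as nn

class GRULSTMClassifier(nn.Module):
    def __init__(self, in_dim=3, hidden=64, num_classes=8):
        super().__init__()
        self.gru = nn.GRU(in_dim, hidden, batch_first=True)    # fast, fewer gates
        self.lstm = nn.LSTM(hidden, hidden, batch_first=True)  # longer dependencies
        self.head = nn.Linear(hidden, num_classes)

    def forward(self, x):                      # x: (batch, n_points, 3)
        h, _ = self.gru(x)
        h, _ = self.lstm(h)
        return self.head(h.mean(dim=1))        # mean-pool over the point sequence

model = GRULSTMClassifier()
logits = model(torch.randn(4, 1024, 3))
print(logits.shape)                            # (4, 8)
```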
+
+
+
+
+ + ♻ ☆ Quantifying and Mitigating Unimodal Biases in Multimodal Large Language + Models: A Causal Perspective + + +
+ Recent advancements in Large Language Models (LLMs) have facilitated the +development of Multimodal LLMs (MLLMs). Despite their impressive capabilities, +MLLMs often suffer from an over-reliance on unimodal biases (e.g., language +bias and vision bias), leading to incorrect answers in complex multimodal +tasks. To investigate this issue, we propose a causal framework to interpret +the biases in Visual Question Answering (VQA) problems. Within our framework, +we devise a causal graph to elucidate the predictions of MLLMs on VQA problems, +and assess the causal effect of biases through an in-depth causal analysis. +Motivated by the causal graph, we introduce a novel MORE dataset, consisting of +12,000 VQA instances. This dataset is designed to challenge MLLMs' abilities, +necessitating multi-hop reasoning and the surmounting of unimodal biases. +Furthermore, we propose two strategies to mitigate unimodal biases and enhance +MLLMs' reasoning capabilities, including a Decompose-Verify-Answer (DeVA) +framework for limited-access MLLMs and the refinement of open-source MLLMs +through fine-tuning. Extensive quantitative and qualitative experiments offer +valuable insights for future research. Our project page is at +https://opencausalab.github.io/MORE. + +
+
+
+
+
+ + ♻ ☆ Learned representation-guided diffusion models for large-image + generation + + +
+ To synthesize high-fidelity samples, diffusion models typically require +auxiliary data to guide the generation process. However, it is impractical to +procure the painstaking patch-level annotation effort required in specialized +domains like histopathology and satellite imagery; it is often performed by +domain experts and involves hundreds of millions of patches. Modern-day +self-supervised learning (SSL) representations encode rich semantic and visual +information. In this paper, we posit that such representations are expressive +enough to act as proxies to fine-grained human labels. We introduce a novel +approach that trains diffusion models conditioned on embeddings from SSL. Our +diffusion models successfully project these features back to high-quality +histopathology and remote sensing images. In addition, we construct larger +images by assembling spatially consistent patches inferred from SSL embeddings, +preserving long-range dependencies. Augmenting real data by generating +variations of real images improves downstream classifier accuracy for +patch-level and larger, image-scale classification tasks. Our models are +effective even on datasets not encountered during training, demonstrating their +robustness and generalizability. Generating images from learned embeddings is +agnostic to the source of the embeddings. The SSL embeddings used to generate a +large image can either be extracted from a reference image, or sampled from an +auxiliary model conditioned on any related modality (e.g. class labels, text, +genomic data). As proof of concept, we introduce the text-to-large image +synthesis paradigm where we successfully synthesize large pathology and +satellite images out of text descriptions. + +
+
+
+
+
+ + ♻ ☆ Human Gaussian Splatting: Real-time Rendering of Animatable Avatars CVPR 2024 + + +
+ This work addresses the problem of real-time rendering of photorealistic human body avatars learned from multi-view videos. While the classical approaches to model and render virtual humans generally use a textured mesh, recent research has developed neural body representations that achieve impressive visual quality. However, these models are difficult to render in real-time and their quality degrades when the character is animated with body poses different from the training observations. We propose an animatable human model based on 3D Gaussian Splatting, which has recently emerged as a very efficient alternative to neural radiance fields. The body is represented by a set of Gaussian primitives in a canonical space which is deformed with a coarse-to-fine approach that combines forward skinning and local non-rigid refinement. We describe how to learn our Human Gaussian Splatting (HuGS) model in an end-to-end fashion from multi-view observations, and evaluate it against the state-of-the-art approaches for novel pose synthesis of clothed bodies. Our method achieves a 1.5 dB PSNR improvement over the state-of-the-art on the THuman4 dataset while being able to render in real-time (80 fps at 512x512 resolution). + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Predicting Species Occurrence Patterns from Partial Observations ICLR 2024 + + +
+ To address the interlinked biodiversity and climate crises, we need an +understanding of where species occur and how these patterns are changing. +However, observational data on most species remains very limited, and the +amount of data available varies greatly between taxonomic groups. We introduce +the problem of predicting species occurrence patterns given (a) satellite +imagery, and (b) known information on the occurrence of other species. To +evaluate algorithms on this task, we introduce SatButterfly, a dataset of +satellite images, environmental data and observational data for butterflies, +which is designed to pair with the existing SatBird dataset of bird +observational data. To address this task, we propose a general model, R-Tran, +for predicting species occurrence patterns that enables the use of partial +observational data wherever found. We find that R-Tran outperforms other +methods in predicting species encounter rates with partial information both +within a taxon (birds) and across taxa (birds and butterflies). Our approach +opens new perspectives to leveraging insights from species with abundant data +to other species with scarce data, by modelling the ecosystems in which they +co-occur. + +
+
+ comment: Tackling Climate Change with Machine Learning workshop at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Towards Generalizable Tumor Synthesis CVPR 2024 + + +
+ Tumor synthesis enables the creation of artificial tumors in medical images, facilitating the training of AI models for tumor detection and segmentation. However, success in tumor synthesis hinges on creating visually realistic tumors that are generalizable across multiple organs and, furthermore, on the resulting AI models being capable of detecting real tumors in images sourced from different domains (e.g., hospitals). This paper makes a progressive stride toward generalizable tumor synthesis by leveraging a critical observation: early-stage tumors (< 2cm) tend to have similar imaging characteristics in computed tomography (CT), whether they originate in the liver, pancreas, or kidneys. We have ascertained that generative AI models, e.g., Diffusion Models, can create realistic tumors generalized to a range of organs even when trained on a limited number of tumor examples from only one organ. Moreover, we have shown that AI models trained on these synthetic tumors can be generalized to detect and segment real tumors from CT volumes, encompassing a broad spectrum of patient demographics, imaging protocols, and healthcare facilities. + +
+
+ comment: The IEEE / CVF Computer Vision and Pattern Recognition Conference + (CVPR 2024) +
+
+
+
+
+ + ♻ ☆ Parameter Efficient Fine-tuning via Cross Block Orchestration for + Segment Anything Model CVPR2024 + + +
+ Parameter-efficient fine-tuning (PEFT) is an effective methodology to unleash the potential of large foundation models in novel scenarios with limited training data. In the computer vision community, PEFT has shown effectiveness in image classification, but little research has studied its ability for image segmentation. Fine-tuning segmentation models usually requires a heavier adjustment of parameters to align the proper projection directions in the parameter space for new scenarios. This raises a challenge to existing PEFT algorithms, as they often inject a limited number of individual parameters into each block, which prevents substantial adjustment of the projection direction of the parameter space due to the limitation of the Hidden Markov Chain along blocks. In this paper, we equip PEFT with a cross-block orchestration mechanism to enable the adaptation of the Segment Anything Model (SAM) to various downstream scenarios. We introduce a novel inter-block communication module, which integrates a learnable relation matrix to facilitate communication among different coefficient sets of each PEFT block's parameter space. Moreover, we propose an intra-block enhancement module, which introduces a linear projection head whose weights are generated from a hyper-complex layer, further enhancing the impact of the adjustment of projection directions on the entire parameter space. Extensive experiments on diverse benchmarks demonstrate that our proposed approach consistently improves the segmentation performance significantly on novel scenarios with only around 1K additional parameters. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ MANUS: Markerless Grasp Capture using Articulated 3D Gaussians CVPR + + +
+ Understanding how we grasp objects with our hands has important applications in areas like robotics and mixed reality. However, this challenging problem requires accurate modeling of the contact between hands and objects. To capture grasps, existing methods use skeletons, meshes, or parametric models that do not represent hand shape accurately, resulting in inaccurate contacts. We present MANUS, a method for Markerless Hand-Object Grasp Capture using Articulated 3D Gaussians. We build a novel articulated 3D Gaussian representation that extends 3D Gaussian splatting for high-fidelity representation of articulating hands. Since our representation uses Gaussian primitives, it enables us to efficiently and accurately estimate contacts between the hand and the object. For the most accurate results, our method requires tens of camera views that current datasets do not provide. We therefore build MANUS-Grasps, a new dataset that contains hand-object grasps viewed from 50+ cameras across 30+ scenes and 3 subjects, comprising over 7M frames. In addition to extensive qualitative results, we also show that our method outperforms others on a quantitative contact evaluation method that uses paint transfer from the object to the hand. + +
+
+ comment: IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR) + 2024 +
+
+
+
+
+ + ♻ ☆ SkillDiffuser: Interpretable Hierarchical Planning via Skill + Abstractions in Diffusion-Based Task Execution CVPR 2024 + + +
+ Diffusion models have demonstrated strong potential for robotic trajectory +planning. However, generating coherent trajectories from high-level +instructions remains challenging, especially for long-range composition tasks +requiring multiple sequential skills. We propose SkillDiffuser, an end-to-end +hierarchical planning framework integrating interpretable skill learning with +conditional diffusion planning to address this problem. At the higher level, +the skill abstraction module learns discrete, human-understandable skill +representations from visual observations and language instructions. These +learned skill embeddings are then used to condition the diffusion model to +generate customized latent trajectories aligned with the skills. This allows +generating diverse state trajectories that adhere to the learnable skills. By +integrating skill learning with conditional trajectory generation, +SkillDiffuser produces coherent behavior following abstract instructions across +diverse tasks. Experiments on multi-task robotic manipulation benchmarks like +Meta-World and LOReL demonstrate state-of-the-art performance and +human-interpretable skill representations from SkillDiffuser. More +visualization results and information could be found on our website. + +
+
+ comment: Accepted by CVPR 2024. Camera ready version. Project page: + https://skilldiffuser.github.io/ +
+
+
+
+
+ + ♻ ☆ Synthesize Step-by-Step: Tools, Templates and LLMs as Data Generators + for Reasoning-Based Chart VQA CVPR 2024 + + +
+ Understanding data visualizations like charts and plots requires reasoning about both visual elements and numerics. Although strong on extractive questions, current chart visual question answering (chart VQA) models struggle with complex reasoning questions. In this work, we address the lack of reasoning ability through data augmentation. We leverage Large Language Models (LLMs), which have been shown to have strong reasoning ability, as an automatic data annotator that generates question-answer annotations for chart images. The key innovation in our method lies in the Synthesize Step-by-Step strategy: our LLM-based data generator learns to decompose the complex question into step-by-step sub-questions (rationales), which are then used to derive the final answer using external tools, i.e. Python. This step-wise generation procedure is trained on synthetic data generated using a template-based QA generation pipeline. Experimental results highlight the significance of the proposed step-by-step generation. By training with the LLM-augmented data (LAMENDA), we significantly enhance the chart VQA models, achieving state-of-the-art accuracy on the ChartQA and PlotQA datasets. In particular, our approach improves the accuracy of the previous state-of-the-art approach from 38% to 54% on the human-written questions in the ChartQA dataset, which require strong reasoning. We hope our work underscores the potential of synthetic data and encourages further exploration of data augmentation using LLMs for reasoning-heavy tasks. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Exploring Accurate 3D Phenotyping in Greenhouse through Neural Radiance + Fields + + +
+ Accurate collection of plant phenotyping data is critical to optimising sustainable farming practices in precision agriculture. Traditional phenotyping in controlled laboratory environments, while valuable, falls short in understanding plant growth under real-world conditions. Emerging sensor and digital technologies offer a promising approach for direct phenotyping of plants in farm environments. This study investigates a learning-based phenotyping method using Neural Radiance Fields (NeRF) to achieve accurate in-situ phenotyping of pepper plants in greenhouse environments. To quantitatively evaluate the performance of this method, traditional point cloud registration on 3D scanning data is implemented for comparison. Experimental results show that NeRF achieves competitive accuracy compared to the 3D scanning methods; the mean distance error between the scanner-based method and the NeRF-based method is 0.865mm. This study shows that the learning-based NeRF method achieves similar accuracy to 3D scanning-based methods but with improved scalability and robustness. + +
+
+
+
+
+ + ♻ ☆ Efficient Deep Learning-based Estimation of the Vital Signs on + Smartphones + + +
+ With the increasing use of smartphones in our daily lives, these devices have become capable of performing many complex tasks. Given the need for continuous monitoring of vital signs, especially for the elderly or those with certain types of diseases, the development of algorithms that can estimate vital signs using smartphones has attracted researchers worldwide. In particular, researchers have been exploring ways to estimate vital signs, such as heart rate, oxygen saturation levels, and respiratory rate, using algorithms that can be run on smartphones. However, many of these algorithms require multiple pre-processing steps that might introduce implementation overheads or require the design of several hand-crafted stages to obtain an optimal result. To address this issue, this research proposes a novel end-to-end solution to mobile-based vital sign estimation using deep learning that eliminates the need for pre-processing. By using a fully convolutional architecture, the proposed model has far fewer parameters and less computational complexity compared to architectures that use fully-connected layers as the prediction heads, which also reduces the risk of overfitting. Additionally, a public dataset for vital sign estimation, which includes 62 videos collected from 35 men and 27 women, is provided. Overall, the proposed end-to-end approach promises significantly improved efficiency and performance for on-device health monitoring on readily available consumer electronics. + +
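To make the parameter-count argument concrete, the sketch below contrasts a small fully convolutional regression head with a fully-connected one on a toy video clip; the channel sizes and input shape are illustrative assumptions, not the paper's network.

```python
# Fully convolutional head vs. fully-connected head for clip-level regression.
import torch
import torch.nn as nn

frames = torch.randn(1, 3, 32, 32, 32)         # (batch, RGB, time, H, W) toy clip

conv_head = nn.Sequential(
    nn.Conv3d(3, 16, 3, padding=1), nn.ReLU(),
    nn.Conv3d(16, 1, 1),
    nn.AdaptiveAvgPool3d(1), nn.Flatten(),      # -> one scalar per clip, e.g. heart rate
)
fc_head = nn.Sequential(
    nn.Flatten(), nn.Linear(3 * 32 * 32 * 32, 64), nn.ReLU(), nn.Linear(64, 1),
)

def n_params(m): return sum(p.numel() for p in m.parameters())
print(conv_head(frames).shape)                  # torch.Size([1, 1])
print(n_params(conv_head), n_params(fc_head))   # the FC head dominates the count
```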
+
+ comment: 10 pages, 8 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ A Comprehensive Study of Knowledge Editing for Large Language Models + + +
+ Large Language Models (LLMs) have shown extraordinary capabilities in +understanding and generating text that closely mirrors human communication. +However, a primary limitation lies in the significant computational demands +during training, arising from their extensive parameterization. This challenge +is further intensified by the dynamic nature of the world, necessitating +frequent updates to LLMs to correct outdated information or integrate new +knowledge, thereby ensuring their continued relevance. Note that many +applications demand continual model adjustments post-training to address +deficiencies or undesirable behaviors. There is an increasing interest in +efficient, lightweight methods for on-the-fly model modifications. To this end, +recent years have seen a burgeoning in the techniques of knowledge editing for +LLMs, which aim to efficiently modify LLMs' behaviors within specific domains +while preserving overall performance across various inputs. In this paper, we +first define the knowledge editing problem and then provide a comprehensive +review of cutting-edge approaches. Drawing inspiration from educational and +cognitive research theories, we propose a unified categorization criterion that +classifies knowledge editing methods into three groups: resorting to external +knowledge, merging knowledge into the model, and editing intrinsic knowledge. +Furthermore, we introduce a new benchmark, KnowEdit, for a comprehensive +empirical evaluation of representative knowledge editing approaches. +Additionally, we provide an in-depth analysis of knowledge location, which can +give a deeper understanding of the knowledge structures inherent within LLMs. +Finally, we discuss several potential applications of knowledge editing, +outlining its broad and impactful implications. + +
+
+ comment: Ongoing work; 52 pages, 282 citations; benchmark is available at + https://huggingface.co/datasets/zjunlp/KnowEdit code is available at + https://github.com/zjunlp/EasyEdit paper list is available at + https://github.com/zjunlp/KnowledgeEditingPapers +
+
+
+
+
+ + ♻ ☆ VisionKG: Unleashing the Power of Visual Datasets via Knowledge Graph ESWC 2024 + + +
+ The availability of vast amounts of visual data with heterogeneous features is a key factor for developing, testing, and benchmarking new computer vision (CV) algorithms and architectures. Most visual datasets are created and curated for specific tasks or with a limited image data distribution for very specific situations, and there is no unified approach to manage and access them across diverse sources, tasks, and taxonomies. This not only creates unnecessary overheads when building robust visual recognition systems, but also introduces biases into learning systems and limits the capabilities of data-centric AI. To address these problems, we propose the Vision Knowledge Graph (VisionKG), a novel resource that interlinks, organizes and manages visual datasets via knowledge graphs and Semantic Web technologies. It can serve as a unified framework facilitating simple access and querying of state-of-the-art visual datasets, regardless of their heterogeneous formats and taxonomies. One of the key differences between our approach and existing methods is that ours is knowledge-based rather than metadata-based. It enhances the enrichment of the semantics at both image and instance levels and offers various data retrieval and exploratory services via SPARQL. VisionKG currently contains 519 million RDF triples that describe approximately 40 million entities, and is accessible at https://vision.semkg.org and through APIs. With the integration of 30 datasets and four popular CV tasks, we demonstrate its usefulness across various scenarios when working with CV pipelines. + +
+
+ comment: Accepted at ESWC 2024 +
+
+
+
+
+ + ♻ ☆ GaussianAvatars: Photorealistic Head Avatars with Rigged 3D Gaussians + + +
+ We introduce GaussianAvatars, a new method to create photorealistic head +avatars that are fully controllable in terms of expression, pose, and +viewpoint. The core idea is a dynamic 3D representation based on 3D Gaussian +splats that are rigged to a parametric morphable face model. This combination +facilitates photorealistic rendering while allowing for precise animation +control via the underlying parametric model, e.g., through expression transfer +from a driving sequence or by manually changing the morphable model parameters. +We parameterize each splat by a local coordinate frame of a triangle and +optimize for explicit displacement offset to obtain a more accurate geometric +representation. During avatar reconstruction, we jointly optimize for the +morphable model parameters and Gaussian splat parameters in an end-to-end +fashion. We demonstrate the animation capabilities of our photorealistic avatar +in several challenging scenarios. For instance, we show reenactments from a +driving video, where our method outperforms existing works by a significant +margin. + +
+
+ comment: Project page: https://shenhanqian.github.io/gaussian-avatars +
+
+
+
+
+ + ♻ ☆ DiffusionPoser: Real-time Human Motion Reconstruction From Arbitrary + Sparse Sensors Using Autoregressive Diffusion CVPR2024 + + +
+ Motion capture from a limited number of body-worn sensors, such as inertial measurement units (IMUs) and pressure insoles, has important applications in health, human performance, and entertainment. Recent work has focused on accurately reconstructing whole-body motion from a specific sensor configuration using six IMUs. While a common goal across applications is to use the minimal number of sensors to achieve the required accuracy, the optimal arrangement of the sensors might differ from application to application. We propose a single diffusion model, DiffusionPoser, which reconstructs human motion in real-time from an arbitrary combination of sensors, including IMUs placed at specified locations and pressure insoles. Unlike existing methods, our model grants users the flexibility to determine the number and arrangement of sensors tailored to the specific activity of interest, without the need for retraining. A novel autoregressive inferencing scheme ensures real-time motion reconstruction that closely aligns with measured sensor signals. The generative nature of DiffusionPoser ensures realistic behavior, even for degrees-of-freedom not directly measured. Qualitative results can be found on our website: https://diffusionposer.github.io/. + +
+
+ comment: accepted at CVPR2024 +
+
+
+
+
+ + ♻ ☆ Learning to reconstruct the bubble distribution with conductivity maps + using Invertible Neural Networks and Error Diffusion + + +
+ Electrolysis is crucial for eco-friendly hydrogen production, but gas bubbles +generated during the process hinder reactions, reduce cell efficiency, and +increase energy consumption. Additionally, these gas bubbles cause changes in +the conductivity inside the cell, resulting in corresponding variations in the +induced magnetic field around the cell. Therefore, measuring these gas +bubble-induced magnetic field fluctuations using external magnetic sensors and +solving the inverse problem of Biot-Savart Law allows for estimating the +conductivity in the cell and, thus, bubble size and location. However, +determining high-resolution conductivity maps from only a few induced magnetic +field measurements is an ill-posed inverse problem. To overcome this, we +exploit Invertible Neural Networks (INNs) to reconstruct the conductivity +field. Our qualitative results and quantitative evaluation using random error +diffusion show that INN achieves far superior performance compared to Tikhonov +regularization. + +
+
+ comment: Accepted for Oral presentation at WCIPT11 (11th World Congress on + Industrial Process Tomography) +
+
+
+
+
+ + ♻ ☆ Detoxifying Large Language Models via Knowledge Editing + + +
+ This paper investigates using knowledge editing techniques to detoxify Large +Language Models (LLMs). We construct a benchmark, SafeEdit, which covers nine +unsafe categories with various powerful attack prompts and equips comprehensive +metrics for systematic evaluation. We conduct experiments with several +knowledge editing approaches, indicating that knowledge editing has the +potential to efficiently detoxify LLMs with limited impact on general +performance. Then, we propose a simple yet effective baseline, dubbed +Detoxifying with Intraoperative Neural Monitoring (DINM), to diminish the +toxicity of LLMs within a few tuning steps via only one instance. We further +provide an in-depth analysis of the internal mechanism for various detoxify +approaches, demonstrating that previous methods like SFT and DPO may merely +suppress the activations of toxic parameters, while DINM mitigates the toxicity +of the toxic parameters to a certain extent, making permanent adjustments. We +hope that these insights could shed light on future work of developing +detoxifying approaches and the underlying knowledge mechanisms of LLMs. Code +and benchmark are available at https://github.com/zjunlp/EasyEdit. + +
+
+ comment: Ongoing work. Project website: + https://zjunlp.github.io/project/SafeEdit Due to the specificity of the + knowledge editing setting, we revise Tables 1 and 3 to present a fair + comparison of experimental results. More experimental results will be updated + soon +
+
+
+
+
+ + ♻ ☆ Fake or JPEG? Revealing Common Biases in Generated Image Detection + Datasets + + +
+ The widespread adoption of generative image models has highlighted the urgent need to detect artificial content, which is a crucial step in combating widespread manipulation and misinformation. Consequently, numerous detectors and associated datasets have emerged. However, many of these datasets inadvertently introduce undesirable biases, thereby impacting the effectiveness and evaluation of detectors. In this paper, we emphasize that many datasets for AI-generated image detection contain biases related to JPEG compression and image size. Using the GenImage dataset, we demonstrate that detectors indeed learn from these undesired factors. Furthermore, we show that removing the named biases substantially increases robustness to JPEG compression and significantly alters the cross-generator performance of evaluated detectors. Specifically, it leads to an increase of more than 11 percentage points in cross-generator performance for ResNet50 and Swin-T detectors on the GenImage dataset, achieving state-of-the-art results. We provide the dataset and source code of this paper on the anonymous website: https://www.unbiased-genimage.org + +
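A straightforward way to remove a compression/size shortcut of the kind described above is to re-encode every training image with identical JPEG settings; the snippet below sketches that idea under assumed parameters and is not the authors' released pipeline.

```python
# Re-encode real and generated images with a uniform size and JPEG quality so
# compression artifacts cannot act as a class shortcut.
from io import BytesIO
from PIL import Image

def normalize_image(path, size=256, quality=90):
    img = Image.open(path).convert("RGB").resize((size, size))
    buf = BytesIO()
    img.save(buf, format="JPEG", quality=quality)   # identical compression for all classes
    buf.seek(0)
    return Image.open(buf)

# normalize_image("real/0001.png") and normalize_image("fake/0001.png") now share
# the same compression footprint before being fed to the detector.
```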
+
+
+
+
+ + ♻ ☆ Segment Every Out-of-Distribution Object + + +
+ Semantic segmentation models, while effective for in-distribution categories, face challenges in real-world deployment due to encountering out-of-distribution (OoD) objects. Detecting these OoD objects is crucial for safety-critical applications. Existing methods rely on anomaly scores, but choosing a suitable threshold for generating masks presents difficulties and can lead to fragmentation and inaccuracy. This paper introduces a method to convert an anomaly Score To a segmentation Mask, called S2M, a simple and effective framework for OoD detection in semantic segmentation. Rather than assigning anomaly scores to pixels, S2M directly segments the entire OoD object. By transforming anomaly scores into prompts for a promptable segmentation model, S2M eliminates the need for threshold selection. Extensive experiments demonstrate that S2M outperforms the state-of-the-art by approximately 20% in IoU and 40% in mean F1 score, on average, across various benchmarks including the Fishyscapes, Segment-Me-If-You-Can, and RoadAnomaly datasets. + +
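The score-to-prompt idea can be sketched as turning the anomaly map's strongest peak into a box prompt for a promptable segmenter such as SAM; the fixed window size below is an illustrative assumption, not the S2M implementation.

```python
# Convert an anomaly-score map into a box prompt around its strongest peak.
import numpy as np

def score_to_box_prompt(score: np.ndarray, window: int = 64):
    """score: (H, W) per-pixel anomaly scores -> (x0, y0, x1, y1) box prompt."""
    y, x = np.unravel_index(np.argmax(score), score.shape)
    h, w = score.shape
    x0, y0 = max(0, x - window // 2), max(0, y - window // 2)
    x1, y1 = min(w, x + window // 2), min(h, y + window // 2)
    return np.array([x0, y0, x1, y1])

score_map = np.random.rand(512, 1024)          # stand-in anomaly map
print(score_to_box_prompt(score_map))          # feed as the box prompt of a SAM-style predictor
```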
+
+ comment: 20 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Towards Low-Energy Adaptive Personalization for Resource-Constrained + Devices + + +
+ The personalization of machine learning (ML) models to address data drift is a significant challenge in the context of Internet of Things (IoT) applications. Presently, most approaches focus on fine-tuning either the full base model or its last few layers to adapt to new data, while often neglecting energy costs. However, various types of data drift exist, and fine-tuning the full base model or the last few layers may not result in optimal performance in certain scenarios. We propose Target Block Fine-Tuning (TBFT), a low-energy adaptive personalization framework designed for resource-constrained devices. We categorize data drift and personalization into three types: input-level, feature-level, and output-level. For each type, we fine-tune a different block of the model to achieve optimal performance with reduced energy costs. Specifically, the input, feature, and output levels correspond to fine-tuning the front, middle, and rear blocks of the model. We evaluate TBFT on a ResNet model, three datasets, three different training set sizes, and a Raspberry Pi. Compared with Block Avg, where each block is fine-tuned individually and the performance improvements are averaged, TBFT improves model accuracy by an average of 15.30%, whilst saving 41.57% energy consumption on average compared with full fine-tuning. + +
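A minimal sketch of block-targeted fine-tuning on a torchvision ResNet is shown below; the mapping of drift type to front/middle/rear blocks follows the description above, while the exact layer grouping is an assumption.

```python
# Freeze everything, then unfreeze only the block targeted by the drift type.
import torch.nn as nn
from torchvision.models import resnet18

def prepare_tbft(model: nn.Module, drift: str) -> nn.Module:
    blocks = {"input": ["conv1", "bn1", "layer1"],     # front of the network
              "feature": ["layer2", "layer3"],          # middle
              "output": ["layer4", "fc"]}               # rear
    for p in model.parameters():
        p.requires_grad = False                         # freeze everything ...
    for name in blocks[drift]:
        for p in getattr(model, name).parameters():
            p.requires_grad = True                      # ... except the target block
    return model

model = prepare_tbft(resnet18(weights=None), drift="feature")
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"trainable params: {trainable:,}")
```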
+
+ comment: Accepted to The 4th Workshop on Machine Learning and Systems (EuroMLSys '24) +
+
+
+
+
+ + ♻ ☆ ViTAR: Vision Transformer with Any Resolution + + +
+ This paper tackles a significant challenge faced by Vision Transformers (ViTs): their constrained scalability across different image resolutions. Typically, ViTs experience a performance decline when processing resolutions different from those seen during training. Our work introduces two key innovations to address this issue. Firstly, we propose a novel module for dynamic resolution adjustment, designed with a single Transformer block, specifically to achieve highly efficient incremental token integration. Secondly, we introduce fuzzy positional encoding in the Vision Transformer to provide consistent positional awareness across multiple resolutions, thereby preventing overfitting to any single training resolution. Our resulting model, ViTAR (Vision Transformer with Any Resolution), demonstrates impressive adaptability, achieving 83.3\% top-1 accuracy at a 1120x1120 resolution and 80.4\% accuracy at a 4032x4032 resolution, all while reducing computational costs. ViTAR also shows strong performance in downstream tasks such as instance and semantic segmentation and can easily be combined with self-supervised learning techniques like Masked AutoEncoder. Our work provides a cost-effective solution for enhancing the resolution scalability of ViTs, paving the way for more versatile and efficient high-resolution image processing. + +
+
+
+
+
+ + ♻ ☆ Self-Discovering Interpretable Diffusion Latent Directions for + Responsible Text-to-Image Generation CVPR 2024 + + +
+ Diffusion-based models have gained significant popularity for text-to-image +generation due to their exceptional image-generation capabilities. A risk with +these models is the potential generation of inappropriate content, such as +biased or harmful images. However, the underlying reasons for generating such +undesired content from the perspective of the diffusion model's internal +representation remain unclear. Previous work interprets vectors in an +interpretable latent space of diffusion models as semantic concepts. However, +existing approaches cannot discover directions for arbitrary concepts, such as +those related to inappropriate concepts. In this work, we propose a novel +self-supervised approach to find interpretable latent directions for a given +concept. With the discovered vectors, we further propose a simple approach to +mitigate inappropriate generation. Extensive experiments have been conducted to +verify the effectiveness of our mitigation approach, namely, for fair +generation, safe generation, and responsible text-enhancing generation. Project +page: \url{https://interpretdiffusion.github.io}. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Sparse 3D Reconstruction via Object-Centric Ray Sampling + + +
+ We propose a novel method for 3D object reconstruction from a sparse set of views captured from a 360-degree calibrated camera rig. We represent the object surface through a hybrid model that uses both an MLP-based neural representation and a triangle mesh. A key contribution in our work is a novel object-centric sampling scheme of the neural representation, where rays are shared among all views. This efficiently concentrates and reduces the number of samples used to update the neural model at each iteration. This sampling scheme relies on the mesh representation to also ensure that samples are well-distributed along its normals. The rendering is then performed efficiently by a differentiable renderer. We demonstrate that this sampling scheme results in a more effective training of the neural representation, does not require the additional supervision of segmentation masks, yields state-of-the-art 3D reconstructions, and works with sparse views on the Google Scanned Objects, Tanks and Temples, and MVMC Car datasets. Code available at: https://github.com/llukmancerkezi/ROSTER + +
+
+
+
+
+ + ♻ ☆ OpenGraph: Open-Vocabulary Hierarchical 3D Graph Representation in + Large-Scale Outdoor Environments + + +
+ Environment representations endowed with sophisticated semantics are pivotal +for facilitating seamless interaction between robots and humans, enabling them +to effectively carry out various tasks. Open-vocabulary maps, powered by +Visual-Language models (VLMs), possess inherent advantages, including zero-shot +learning and support for open-set classes. However, existing open-vocabulary +maps are primarily designed for small-scale environments, such as desktops or +rooms, and are typically geared towards limited-area tasks involving robotic +indoor navigation or in-place manipulation. They face challenges in direct +generalization to outdoor environments characterized by numerous objects and +complex tasks, owing to limitations in both understanding level and map +structure. In this work, we propose OpenGraph, the first open-vocabulary +hierarchical graph representation designed for large-scale outdoor +environments. OpenGraph initially extracts instances and their captions from +visual images, enhancing textual reasoning by encoding them. Subsequently, it +achieves 3D incremental object-centric mapping with feature embedding by +projecting images onto LiDAR point clouds. Finally, the environment is +segmented based on lane graph connectivity to construct a hierarchical graph. +Validation results from public dataset SemanticKITTI demonstrate that OpenGraph +achieves the highest segmentation and query accuracy. The source code of +OpenGraph is publicly available at https://github.com/BIT-DYN/OpenGraph. + +
+
+
+
+
+ + ♻ ☆ Manifold Constraint Regularization for Remote Sensing Image Generation + + +
+ Generative Adversarial Networks (GANs) have shown notable accomplishments in the remote sensing domain. However, this paper reveals that their performance on remote sensing images falls short when compared to their impressive results with natural images. This study identifies a previously overlooked issue: GANs exhibit a heightened susceptibility to overfitting on remote sensing images. To address this challenge, this paper analyzes the characteristics of remote sensing images and proposes manifold constraint regularization, a novel approach that tackles the overfitting of GANs on remote sensing images for the first time. Our method includes a new measure for evaluating the structure of the data manifold. Leveraging this measure, we propose the manifold constraint regularization term, which not only alleviates the overfitting problem, but also promotes alignment between the generated and real data manifolds, leading to enhanced quality in the generated images. The effectiveness and versatility of this method have been corroborated through extensive validation on various remote sensing datasets and GAN models. The proposed method not only enhances the quality of the generated images, reflected in a 3.13\% improvement in the Frechet Inception Distance (FID) score, but also boosts the performance of GANs on downstream tasks, evidenced by a 3.76\% increase in classification accuracy. + +
+
+
+
+
+ + ♻ ☆ WinSyn: A High Resolution Testbed for Synthetic Data + + +
+ We present WinSyn, a unique dataset and testbed for creating high-quality +synthetic data with procedural modeling techniques. The dataset contains +high-resolution photographs of windows, selected from locations around the +world, with 89,318 individual window crops showcasing diverse geometric and +material characteristics. We evaluate a procedural model by training semantic +segmentation networks on both synthetic and real images and then comparing +their performances on a shared test set of real images. Specifically, we +measure the difference in mean Intersection over Union (mIoU) and determine the +effective number of real images to match synthetic data's training performance. +We design a baseline procedural model as a benchmark and provide 21,290 +synthetically generated images. By tuning the procedural model, key factors are +identified which significantly influence the model's fidelity in replicating +real-world scenarios. Importantly, we highlight the challenge of procedural +modeling using current techniques, especially in their ability to replicate the +spatial semantics of real-world scenarios. This insight is critical because of +the potential of procedural models to bridge to hidden scene aspects such as +depth, reflectivity, material properties, and lighting conditions. + +
+
+ comment: cvpr version +
+
+
+
+
+ + ♻ ☆ Frequency-Adaptive Dilated Convolution for Semantic Segmentation + + +
+ Dilated convolution, which expands the receptive field by inserting gaps +between its consecutive elements, is widely employed in computer vision. In +this study, we propose three strategies to improve individual phases of dilated +convolution from the view of spectrum analysis. Departing from the conventional +practice of fixing a global dilation rate as a hyperparameter, we introduce +Frequency-Adaptive Dilated Convolution (FADC), which dynamically adjusts +dilation rates spatially based on local frequency components. Subsequently, we +design two plug-in modules to directly enhance effective bandwidth and +receptive field size. The Adaptive Kernel (AdaKern) module decomposes +convolution weights into low-frequency and high-frequency components, +dynamically adjusting the ratio between these components on a per-channel +basis. By increasing the high-frequency part of convolution weights, AdaKern +captures more high-frequency components, thereby improving effective bandwidth. +The Frequency Selection (FreqSelect) module optimally balances high- and +low-frequency components in feature representations through spatially variant +reweighting. It suppresses high frequencies in the background to encourage FADC +to learn a larger dilation, thereby increasing the receptive field for an +expanded scope. Extensive experiments on segmentation and object detection +consistently validate the efficacy of our approach. The code is publicly +available at \url{https://github.com/Linwei-Chen/FADC}. + +
+
+
+
+
+ + ♻ ☆ MRFP: Learning Generalizable Semantic Segmentation from Sim-2-Real with + Multi-Resolution Feature Perturbation CVPR 2024 + + +
+ Deep neural networks have shown exemplary performance on semantic scene understanding tasks on source domains, but due to the absence of style diversity during training, enhancing performance on unseen target domains using only single source domain data remains a challenging task. Generation of simulated data is a feasible alternative to retrieving large style-diverse real-world datasets, as the latter is a cumbersome and budget-intensive process. However, the large domain-specific inconsistencies between simulated and real-world data pose a significant generalization challenge in semantic segmentation. In this work, to alleviate this problem, we propose a novel Multi-Resolution Feature Perturbation (MRFP) technique to randomize domain-specific fine-grained features and perturb the style of coarse features. Our experimental results on various urban-scene segmentation datasets clearly indicate that, along with the perturbation of style information, perturbation of fine-feature components is paramount to learn domain-invariant robust feature maps for semantic segmentation models. MRFP is a simple and computationally efficient, transferable module with no additional learnable parameters or objective functions that helps state-of-the-art deep neural networks learn robust domain-invariant features for simulation-to-real semantic segmentation. + +
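The style-perturbation half of the idea can be sketched as jittering per-channel feature statistics, AdaIN-style, on coarse feature maps; the noise magnitude and the choice of which features to perturb below are assumptions, not the paper's exact MRFP module.

```python
# Randomize the "style" of coarse features by jittering channel-wise statistics.
import torch

def perturb_style(feat, noise_std=0.1):
    """feat: (B, C, H, W). Jitter per-channel mean/std to randomize style."""
    mu = feat.mean(dim=(2, 3), keepdim=True)
    sigma = feat.std(dim=(2, 3), keepdim=True) + 1e-6
    normed = (feat - mu) / sigma
    mu_new = mu * (1 + noise_std * torch.randn_like(mu))
    sigma_new = sigma * (1 + noise_std * torch.randn_like(sigma))
    return normed * sigma_new + mu_new

coarse = torch.randn(2, 256, 32, 32)           # low-resolution backbone features
print(perturb_style(coarse).shape)             # (2, 256, 32, 32)
```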
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Continual Learning: Applications and the Road Forward + + +
+ Continual learning is a subfield of machine learning, which aims to allow +machine learning models to continuously learn on new data, by accumulating +knowledge without forgetting what was learned in the past. In this work, we +take a step back, and ask: "Why should one care about continual learning in the +first place?". We set the stage by examining recent continual learning papers +published at four major machine learning conferences, and show that +memory-constrained settings dominate the field. Then, we discuss five open +problems in machine learning, and even though they might seem unrelated to +continual learning at first sight, we show that continual learning will +inevitably be part of their solution. These problems are model editing, +personalization and specialization, on-device learning, faster (re-)training +and reinforcement learning. Finally, by comparing the desiderata from these +unsolved problems and the current assumptions in continual learning, we +highlight and discuss four future directions for continual learning research. +We hope that this work offers an interesting perspective on the future of +continual learning, while displaying its potential value and the paths we have +to pursue in order to make it successful. This work is the result of the many +discussions the authors had at the Dagstuhl seminar on Deep Continual Learning, +in March 2023. + +
+
+
+
+
+ + ♻ ☆ TimeChat: A Time-sensitive Multimodal Large Language Model for Long + Video Understanding CVPR 2024 + + +
+ This work proposes TimeChat, a time-sensitive multimodal large language model +specifically designed for long video understanding. Our model incorporates two +key architectural contributions: (1) a timestamp-aware frame encoder that binds +visual content with the timestamp of each frame, and (2) a sliding video +Q-Former that produces a video token sequence of varying lengths to accommodate +videos of various durations. Additionally, we construct an instruction-tuning +dataset, encompassing 6 tasks and a total of 125K instances, to further enhance +TimeChat's instruction-following performance. Experiment results across various +video understanding tasks, such as dense captioning, temporal grounding, and +highlight detection, demonstrate TimeChat's strong zero-shot temporal +localization and reasoning capabilities. For example, it achieves +9.2 F1 score +and +2.8 CIDEr on YouCook2, +5.8 HIT@1 on QVHighlights, and +27.5 R@1 (IoU=0.5) +on Charades-STA, compared to state-of-the-art video large language models, +holding the potential to serve as a versatile video assistant for long-form +video comprehension tasks and satisfy realistic user requirements. + +
+
+ comment: CVPR 2024 camera-ready version, code is available at + https://github.com/RenShuhuai-Andy/TimeChat +
+
+
+
+
+ + ♻ ☆ RFAConv: Innovating Spatial Attention and Standard Convolutional + Operation + + +
+ Spatial attention has been widely used to improve the performance of +convolutional neural networks. However, it has certain limitations. In this +paper, we propose a new perspective on the effectiveness of spatial attention, +which is that the spatial attention mechanism essentially solves the problem of +convolutional kernel parameter sharing. However, the information contained in +the attention map generated by spatial attention is not sufficient for +large-size convolutional kernels. Therefore, we propose a novel attention +mechanism called Receptive-Field Attention (RFA). Existing spatial attention, +such as Convolutional Block Attention Module (CBAM) and Coordinated Attention +(CA) focus only on spatial features, which does not fully address the problem +of convolutional kernel parameter sharing. In contrast, RFA not only focuses on +the receptive-field spatial feature but also provides effective attention +weights for large-size convolutional kernels. The Receptive-Field Attention +convolutional operation (RFAConv), developed by RFA, represents a new approach +to replace the standard convolution operation. It offers nearly negligible +increment of computational cost and parameters, while significantly improving +network performance. We conducted a series of experiments on ImageNet-1k, COCO, +and VOC datasets to demonstrate the superiority of our approach. Of particular +importance, we believe that it is time to shift focus from spatial features to +receptive-field spatial features for current spatial attention mechanisms. In +this way, we can further improve network performance and achieve even better +results. The code and pre-trained models for the relevant tasks can be found at +https://github.com/Liuchen1997/RFAConv. + +
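+ A rough schematic of receptive-field attention: each spatial location predicts a weight for every element of its k x k neighbourhood, so different positions no longer share identical kernel responses. This is an illustrative reading of the abstract, not the released RFAConv code; the 1x1 projection and the softmax placement are assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class RFASketch(nn.Module):
    """Weight each element of the unfolded k*k receptive field per location,
    then project the weighted neighbourhood back to the channel dimension."""
    def __init__(self, ch, k=3):
        super().__init__()
        self.k = k
        self.attn = nn.Conv2d(ch, ch * k * k, kernel_size=1)  # k*k weights per channel
        self.proj = nn.Conv2d(ch * k * k, ch, kernel_size=1)

    def forward(self, x):
        b, c, h, w = x.shape
        k = self.k
        patches = F.unfold(x, k, padding=k // 2).view(b, c, k * k, h, w)
        weights = torch.softmax(self.attn(x).view(b, c, k * k, h, w), dim=2)
        weighted = (patches * weights).reshape(b, c * k * k, h, w)
        return self.proj(weighted)
```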
+
+
 comment: 12 pages, 11 figures
+
+
+
+
+ + ♻ ☆ DoseDiff: Distance-aware Diffusion Model for Dose Prediction in + Radiotherapy + + +
+ Treatment planning, which is a critical component of the radiotherapy +workflow, is typically carried out by a medical physicist in a time-consuming +trial-and-error manner. Previous studies have proposed knowledge-based or +deep-learning-based methods for predicting dose distribution maps to assist +medical physicists in improving the efficiency of treatment planning. However, +these dose prediction methods usually fail to effectively utilize distance +information between surrounding tissues and targets or organs-at-risk (OARs). +Moreover, they are poor at maintaining the distribution characteristics of ray +paths in the predicted dose distribution maps, resulting in a loss of valuable +information. In this paper, we propose a distance-aware diffusion model +(DoseDiff) for precise prediction of dose distribution. We define dose +prediction as a sequence of denoising steps, wherein the predicted dose +distribution map is generated with the conditions of the computed tomography +(CT) image and signed distance maps (SDMs). The SDMs are obtained by distance +transformation from the masks of targets or OARs, which provide the distance +from each pixel in the image to the outline of the targets or OARs. We further +propose a multi-encoder and multi-scale fusion network (MMFNet) that +incorporates multi-scale and transformer-based fusion modules to enhance +information fusion between the CT image and SDMs at the feature level. We +evaluate our model on two in-house datasets and a public dataset, respectively. +The results demonstrate that our DoseDiff method outperforms state-of-the-art +dose prediction methods in terms of both quantitative performance and visual +quality. + +
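+ The signed distance maps used as conditioning can be computed directly from binary masks with a Euclidean distance transform. A minimal sketch, assuming a NumPy mask and a positive-outside/negative-inside sign convention (the paper may use the opposite sign).

```python
import numpy as np
from scipy.ndimage import distance_transform_edt

def signed_distance_map(mask):
    """Signed distance from every pixel to the outline of a binary mask:
    positive outside the structure, negative inside."""
    mask = mask.astype(bool)
    outside = distance_transform_edt(~mask)   # distance from background pixels to the structure
    inside = distance_transform_edt(mask)     # distance from structure pixels to the background
    return outside - inside
```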
+
+
+
+
+ + ♻ ☆ H2ASeg: Hierarchical Adaptive Interaction and Weighting Network for + Tumor Segmentation in PET/CT Images + + +
+ Positron emission tomography (PET) combined with computed tomography (CT) +imaging is routinely used in cancer diagnosis and prognosis by providing +complementary information. Automatically segmenting tumors in PET/CT images can +significantly improve examination efficiency. Traditional multi-modal +segmentation solutions mainly rely on concatenation operations for modality +fusion, which fail to effectively model the non-linear dependencies between PET +and CT modalities. Recent studies have investigated various approaches to +optimize the fusion of modality-specific features for enhancing joint +representations. However, modality-specific encoders used in these methods +operate independently, inadequately leveraging the synergistic relationships +inherent in PET and CT modalities, for example, the complementarity between +semantics and structure. To address these issues, we propose a Hierarchical +Adaptive Interaction and Weighting Network termed H2ASeg to explore the +intrinsic cross-modal correlations and transfer potential complementary +information. Specifically, we design a Modality-Cooperative Spatial Attention +(MCSA) module that performs intra- and inter-modal interactions globally and +locally. Additionally, a Target-Aware Modality Weighting (TAMW) module is +developed to highlight tumor-related features within multi-modal features, +thereby refining tumor segmentation. By embedding these modules across +different layers, H2ASeg can hierarchically model cross-modal correlations, +enabling a nuanced understanding of both semantic and structural tumor +features. Extensive experiments demonstrate the superiority of H2ASeg, +outperforming state-of-the-art methods on AutoPet-II and Hecktor2022 +benchmarks. The code is released at https://github.com/JinPLu/H2ASeg. + +
+
+
 comment: 10 pages, 4 figures
+
+
+
+
+ + ♻ ☆ EgoThink: Evaluating First-Person Perspective Thinking Capability of + Vision-Language Models + + +
+ Vision-language models (VLMs) have recently shown promising results in +traditional downstream tasks. Evaluation studies have emerged to assess their +abilities, with the majority focusing on the third-person perspective, and only +a few addressing specific tasks from the first-person perspective. However, the +capability of VLMs to "think" from a first-person perspective, a crucial +attribute for advancing autonomous agents and robotics, remains largely +unexplored. To bridge this research gap, we introduce EgoThink, a novel visual +question-answering benchmark that encompasses six core capabilities with twelve +detailed dimensions. The benchmark is constructed using selected clips from +egocentric videos, with manually annotated question-answer pairs containing +first-person information. To comprehensively assess VLMs, we evaluate eighteen +popular VLMs on EgoThink. Moreover, given the open-ended format of the answers, +we use GPT-4 as the automatic judge to compute single-answer grading. +Experimental results indicate that although GPT-4V leads in numerous +dimensions, all evaluated VLMs still possess considerable potential for +improvement in first-person perspective tasks. Meanwhile, enlarging the number +of trainable parameters has the most significant impact on model performance on +EgoThink. In conclusion, EgoThink serves as a valuable addition to existing +evaluation benchmarks for VLMs, providing an indispensable resource for future +research in the realm of embodied artificial intelligence and robotics. + +
+
+
+
+
+ + ♻ ☆ Data-free Defense of Black Box Models Against Adversarial Attacks CVPR + + +
+ Several companies often safeguard their trained deep models (i.e., details of architecture, learnt weights, training details, etc.) from third-party users by exposing them only as black boxes through APIs. Moreover, they may not even provide access to the training data due to proprietary reasons or sensitivity concerns. In this work, we propose a novel defense mechanism for black box models against adversarial attacks in a data-free setup. We construct synthetic data via a generative model and train a surrogate network using model stealing techniques. To minimize adversarial contamination on perturbed samples, we propose a 'wavelet noise remover' (WNR) that performs discrete wavelet decomposition on input images and carefully selects only a few important coefficients determined by our 'wavelet coefficient selection module' (WCSM). To recover the high-frequency content of the image after noise removal via WNR, we further train a 'regenerator' network with the objective of retrieving the coefficients such that the reconstructed image yields predictions similar to the original ones on the surrogate model. At test time, WNR combined with the trained regenerator network is prepended to the black box network, resulting in a high boost in adversarial accuracy. Our method improves the adversarial accuracy on CIFAR-10 by 38.98% and 32.01% on state-of-the-art Auto Attack compared to the baseline, even when the attacker uses a surrogate architecture (Alexnet-half and Alexnet) similar to the black box architecture (Alexnet) with the same model stealing strategy as the defender. The code is available at https://github.com/vcl-iisc/data-free-black-box-defense
+
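+ The wavelet noise remover can be pictured as keeping only a small fraction of the largest-magnitude wavelet coefficients and reconstructing the image. A minimal sketch with PyWavelets; the magnitude-based keep rule and `keep_ratio` are simplifying assumptions in place of the paper's WCSM, and the regenerator network is omitted.

```python
import numpy as np
import pywt

def wavelet_noise_remover(img, wavelet="haar", level=2, keep_ratio=0.1):
    """Zero all but the largest-magnitude wavelet coefficients of a 2D image
    and reconstruct it (a crude stand-in for WNR + WCSM)."""
    coeffs = pywt.wavedec2(img, wavelet, level=level)
    arr, slices = pywt.coeffs_to_array(coeffs)
    thresh = np.quantile(np.abs(arr), 1.0 - keep_ratio)
    arr = np.where(np.abs(arr) >= thresh, arr, 0.0)
    kept = pywt.array_to_coeffs(arr, slices, output_format="wavedec2")
    return pywt.waverec2(kept, wavelet)
```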
+
+ comment: CVPR Workshop (Under Review) +
+
+
+
+
+ + ♻ ☆ Intrinsic Image Decomposition Using Point Cloud Representation + + +
+ The purpose of intrinsic decomposition is to separate an image into its +albedo (reflective properties) and shading components (illumination +properties). This is challenging because it's an ill-posed problem. +Conventional approaches primarily concentrate on 2D imagery and fail to fully +exploit the capabilities of 3D data representation. 3D point clouds offer a +more comprehensive format for representing scenes, as they combine geometric +and color information effectively. To this end, in this paper, we introduce +Point Intrinsic Net (PoInt-Net), which leverages 3D point cloud data to +concurrently estimate albedo and shading maps. The merits of PoInt-Net include +the following aspects. First, the model is efficient, achieving consistent +performance across point clouds of any size with training only required on +small-scale point clouds. Second, it exhibits remarkable robustness; even when +trained exclusively on datasets comprising individual objects, PoInt-Net +demonstrates strong generalization to unseen objects and scenes. Third, it +delivers superior accuracy over conventional 2D approaches, demonstrating +enhanced performance across various metrics on different datasets. (Code +Released) + +
+
+ comment: Code: https://github.com/xyxingx/PoInt-Net +
+
+
+
+
+ + ♻ ☆ UADA3D: Unsupervised Adversarial Domain Adaptation for 3D Object + Detection with Sparse LiDAR and Large Domain Gaps + + +
+ In this study, we address a gap in existing unsupervised domain adaptation +approaches on LiDAR-based 3D object detection, which have predominantly +concentrated on adapting between established, high-density autonomous driving +datasets. We focus on sparser point clouds, capturing scenarios from different +perspectives: not just from vehicles on the road but also from mobile robots on +sidewalks, which encounter significantly different environmental conditions and +sensor configurations. We introduce Unsupervised Adversarial Domain Adaptation +for 3D Object Detection (UADA3D). UADA3D does not depend on pre-trained source +models or teacher-student architectures. Instead, it uses an adversarial +approach to directly learn domain-invariant features. We demonstrate its +efficacy in various adaptation scenarios, showing significant improvements in +both self-driving car and mobile robot domains. Our code is open-source and +will be available soon. + +
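+ Adversarial learning of domain-invariant features is commonly built on a gradient reversal layer feeding a domain classifier. The snippet below shows only that generic building block; whether UADA3D uses gradient reversal or a separate discriminator loss is not stated in the abstract, so treat it as an assumption.

```python
import torch

class GradReverse(torch.autograd.Function):
    """Identity in the forward pass; flips (and scales) gradients in the backward
    pass so the feature extractor learns to fool a domain classifier."""
    @staticmethod
    def forward(ctx, x, lam=1.0):
        ctx.lam = lam
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        return -ctx.lam * grad_output, None

# usage sketch: domain_logits = domain_classifier(GradReverse.apply(features, 1.0))
```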
+
+
+
+
+ + ♻ ☆ AnimatableDreamer: Text-Guided Non-rigid 3D Model Generation and + Reconstruction with Canonical Score Distillation + + +
+ Advances in 3D generation have facilitated sequential 3D model generation (a.k.a. 4D generation), yet its application to animatable objects with large motion remains scarce. Our work proposes AnimatableDreamer, a text-to-4D generation framework capable of generating diverse categories of non-rigid objects on skeletons extracted from a monocular video. At its core, AnimatableDreamer is equipped with our novel optimization design dubbed Canonical Score Distillation (CSD), which lifts 2D diffusion to temporally consistent 4D generation. CSD, designed from a score gradient perspective, generates a canonical model with warp-robustness across different articulations. Notably, it also enhances the authenticity of bones and skinning by integrating inductive priors from a diffusion model. Furthermore, with multi-view distillation, CSD infers invisible regions, thereby improving the fidelity of monocular non-rigid reconstruction. Extensive experiments demonstrate the capability of our method in generating highly flexible text-guided 3D models from monocular video, while also showing improved reconstruction performance over existing non-rigid reconstruction methods.
+
+
+ comment: Project page: https://animatabledreamer.github.io/ +
+
+
+
+
+ + ♻ ☆ Text2Loc: 3D Point Cloud Localization from Natural Language CVPR 2024 + + +
+ We tackle the problem of 3D point cloud localization based on a few natural +linguistic descriptions and introduce a novel neural network, Text2Loc, that +fully interprets the semantic relationship between points and text. Text2Loc +follows a coarse-to-fine localization pipeline: text-submap global place +recognition, followed by fine localization. In global place recognition, +relational dynamics among each textual hint are captured in a hierarchical +transformer with max-pooling (HTM), whereas a balance between positive and +negative pairs is maintained using text-submap contrastive learning. Moreover, +we propose a novel matching-free fine localization method to further refine the +location predictions, which completely removes the need for complicated +text-instance matching and is lighter, faster, and more accurate than previous +methods. Extensive experiments show that Text2Loc improves the localization +accuracy by up to $2\times$ over the state-of-the-art on the KITTI360Pose +dataset. Our project page is publicly available at +\url{https://yan-xia.github.io/projects/text2loc/}. + +
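+ Text-submap contrastive learning in the coarse stage can be written as a standard symmetric InfoNCE objective over paired embeddings. This is a generic sketch, not Text2Loc's exact loss; the temperature and batch construction are assumptions.

```python
import torch
import torch.nn.functional as F

def text_submap_contrastive(text_emb, submap_emb, tau=0.07):
    """Symmetric InfoNCE: matched (text, submap) pairs sit on the diagonal of the
    similarity matrix and are pulled together; all other pairs are pushed apart."""
    t = F.normalize(text_emb, dim=-1)
    s = F.normalize(submap_emb, dim=-1)
    logits = t @ s.t() / tau
    labels = torch.arange(t.size(0), device=t.device)
    return 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.t(), labels))
```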
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ RT-SRTS: Angle-Agnostic Real-Time Simultaneous 3D Reconstruction and + Tumor Segmentation from Single X-Ray Projection + + +
+ Radiotherapy is one of the primary treatment methods for tumors, but the organ movement caused by respiration limits its accuracy. Recently, 3D imaging from a single X-ray projection has received extensive attention as a promising approach to address this issue. However, current methods can only reconstruct 3D images without directly locating the tumor, and they are only validated for fixed-angle imaging, which fails to fully meet the requirements of motion control in radiotherapy. In this study, a novel imaging method, RT-SRTS, is proposed that integrates 3D imaging and tumor segmentation into one network based on multi-task learning (MTL) and achieves real-time simultaneous 3D reconstruction and tumor segmentation from a single X-ray projection at any angle. Furthermore, the attention enhanced calibrator (AEC) and uncertain-region elaboration (URE) modules are proposed to aid feature extraction and improve segmentation accuracy. The proposed method was evaluated on fifteen patient cases and compared with three state-of-the-art methods. It not only delivers superior 3D reconstruction but also demonstrates commendable tumor segmentation results. Simultaneous reconstruction and segmentation can be completed in approximately 70 ms, significantly faster than the time threshold required for real-time tumor tracking. The efficacy of both AEC and URE has also been validated in ablation studies. The code of this work is available at https://github.com/ZywooSimple/RT-SRTS.
+
+
+
+
+
+ + ♻ ☆ NaviNeRF: NeRF-based 3D Representation Disentanglement by Latent + Semantic Navigation + + +
+ 3D representation disentanglement aims to identify, decompose, and manipulate the underlying explanatory factors of 3D data, which helps AI fundamentally understand our 3D world. This task is currently under-explored and poses great challenges: (i) 3D representations are complex and in general contain much more information than 2D images; (ii) many 3D representations are not well suited for gradient-based optimization, let alone disentanglement. To address these challenges, we use NeRF as a differentiable 3D representation, and introduce a self-supervised navigation scheme to identify interpretable semantic directions in the latent space. To the best of our knowledge, this novel method, dubbed NaviNeRF, is the first work to achieve fine-grained 3D disentanglement without any priors or supervision. Specifically, NaviNeRF is built upon the generative NeRF pipeline, and equipped with an Outer Navigation Branch and an Inner Refinement Branch. They are complementary: the outer navigation identifies global-view semantic directions, while the inner refinement is dedicated to fine-grained attributes. A synergistic loss is further devised to coordinate the two branches. Extensive experiments demonstrate that NaviNeRF has superior fine-grained 3D disentanglement ability compared to previous 3D-aware models. Its performance is also comparable to editing-oriented models relying on semantic or geometry priors.
+
+
+
+
+
+ + ♻ ☆ MCAD: Multi-teacher Cross-modal Alignment Distillation for efficient + image-text retrieval + + +
+ Due to the success of large-scale visual-language pretraining (VLP) models and the widespread use of image-text retrieval in industry, it is now critically necessary to reduce the model size and streamline mobile-device deployment. Single- and dual-stream model structures are commonly used in image-text retrieval with the goal of closing the semantic gap between textual and visual modalities. While single-stream models use deep feature fusion to achieve more accurate cross-modal alignment, dual-stream models are better at offline indexing and fast inference. We propose a Multi-teacher Cross-modality Alignment Distillation (MCAD) technique to integrate the advantages of single- and dual-stream models. By incorporating the fused single-stream features into the image and text features of the dual-stream model, we formulate new modified teacher similarity distributions and features. Then, we conduct both distribution and feature distillation to boost the capability of the student dual-stream model, achieving high retrieval performance without increasing inference complexity. Extensive experiments demonstrate the remarkable performance and high efficiency of MCAD on image-text retrieval tasks. Furthermore, we implement a lightweight CLIP model on Snapdragon/Dimensity chips with only $\sim$100M running memory and $\sim$8.0ms search latency, achieving the mobile-device application of VLP models.
+
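+ The distribution-distillation part of MCAD can be pictured as matching the student's image-text similarity distribution to the fused teacher's with a KL term. A schematic sketch under that reading; the temperature, the teacher-fusion step, and the accompanying feature-distillation loss are omitted or assumed.

```python
import torch
import torch.nn.functional as F

def similarity_distillation(student_img, student_txt, teacher_sim, tau=0.05):
    """KL divergence between the dual-stream student's image-to-text similarity
    rows and a precomputed fused teacher similarity matrix."""
    sim = F.normalize(student_img, dim=-1) @ F.normalize(student_txt, dim=-1).t()
    p_teacher = F.softmax(teacher_sim / tau, dim=-1)
    log_p_student = F.log_softmax(sim / tau, dim=-1)
    return F.kl_div(log_p_student, p_teacher, reduction="batchmean")
```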
+
+
+
+
+ + ♻ ☆ Detect-Order-Construct: A Tree Construction based Approach for + Hierarchical Document Structure Analysis + + +
+ Document structure analysis (aka document layout analysis) is crucial for +understanding the physical layout and logical structure of documents, with +applications in information retrieval, document summarization, knowledge +extraction, etc. In this paper, we concentrate on Hierarchical Document +Structure Analysis (HDSA) to explore hierarchical relationships within +structured documents created using authoring software employing hierarchical +schemas, such as LaTeX, Microsoft Word, and HTML. To comprehensively analyze +hierarchical document structures, we propose a tree construction based approach +that addresses multiple subtasks concurrently, including page object detection +(Detect), reading order prediction of identified objects (Order), and the +construction of intended hierarchical structure (Construct). We present an +effective end-to-end solution based on this framework to demonstrate its +performance. To assess our approach, we develop a comprehensive benchmark +called Comp-HRDoc, which evaluates the above subtasks simultaneously. Our +end-to-end system achieves state-of-the-art performance on two large-scale +document layout analysis datasets (PubLayNet and DocLayNet), a high-quality +hierarchical document structure reconstruction dataset (HRDoc), and our +Comp-HRDoc benchmark. The Comp-HRDoc benchmark will be released to facilitate +further research in this field. + +
+
+ comment: Submitted to Pattern Recognition +
+
+
+
+
+ + ♻ ☆ Multi-modal In-Context Learning Makes an Ego-evolving Scene Text + Recognizer CVPR2024 + + +
+ Scene text recognition (STR) in the wild frequently encounters challenges +when coping with domain variations, font diversity, shape deformations, etc. A +straightforward solution is performing model fine-tuning tailored to a specific +scenario, but it is computationally intensive and requires multiple model +copies for various scenarios. Recent studies indicate that large language +models (LLMs) can learn from a few demonstration examples in a training-free +manner, termed "In-Context Learning" (ICL). Nevertheless, applying LLMs as a +text recognizer is unacceptably resource-consuming. Moreover, our pilot +experiments on LLMs show that ICL fails in STR, mainly attributed to the +insufficient incorporation of contextual information from diverse samples in +the training stage. To this end, we introduce E$^2$STR, a STR model trained +with context-rich scene text sequences, where the sequences are generated via +our proposed in-context training strategy. E$^2$STR demonstrates that a +regular-sized model is sufficient to achieve effective ICL capabilities in STR. +Extensive experiments show that E$^2$STR exhibits remarkable training-free +adaptation in various scenarios and outperforms even the fine-tuned +state-of-the-art approaches on public benchmarks. The code is released at +https://github.com/bytedance/E2STR . + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ Scalable Diffusion Models with State Space Backbone + + +
+ This paper presents a new exploration into a category of diffusion models +built upon state space architecture. We endeavor to train diffusion models for +image data, wherein the traditional U-Net backbone is supplanted by a state +space backbone, functioning on raw patches or latent space. Given its notable +efficacy in accommodating long-range dependencies, Diffusion State Space Models +(DiS) are distinguished by treating all inputs including time, condition, and +noisy image patches as tokens. Our assessment of DiS encompasses both +unconditional and class-conditional image generation scenarios, revealing that +DiS exhibits comparable, if not superior, performance to CNN-based or +Transformer-based U-Net architectures of commensurate size. Furthermore, we +analyze the scalability of DiS, gauged by the forward pass complexity +quantified in Gflops. DiS models with higher Gflops, achieved through +augmentation of depth/width or augmentation of input tokens, consistently +demonstrate lower FID. In addition to demonstrating commendable scalability +characteristics, DiS-H/2 models in latent space achieve performance levels akin +to prior diffusion models on class-conditional ImageNet benchmarks at the +resolution of 256$\times$256 and 512$\times$512, while significantly reducing +the computational burden. The code and models are available at: +https://github.com/feizc/DiS. + +
+
+
+
+
+ + ♻ ☆ OST: Refining Text Knowledge with Optimal Spatio-Temporal Descriptor for + General Video Recognition + + +
+ Due to the resource-intensive nature of training vision-language models on expansive video data, a majority of studies have centered on adapting pre-trained image-language models to the video domain. Dominant pipelines propose to tackle the visual discrepancies with additional temporal learners while overlooking the substantial discrepancy between web-scaled descriptive narratives and concise action category names, leading to a less distinct semantic space and potential performance limitations. In this work, we prioritize the refinement of text knowledge to facilitate generalizable video recognition. To address the limitations of the less distinct semantic space of category names, we prompt a large language model (LLM) to augment action class names into Spatio-Temporal Descriptors, thus bridging the textual discrepancy and serving as a knowledge base for general recognition. Moreover, to assign the best descriptors to different video instances, we propose the Optimal Descriptor Solver, which formulates video recognition as solving the optimal matching flow across frame-level representations and descriptors. Comprehensive evaluations in zero-shot, few-shot, and fully supervised video recognition highlight the effectiveness of our approach. Our best model achieves a state-of-the-art zero-shot accuracy of 75.1% on Kinetics-600.
+
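+ Casting descriptor assignment as an optimal matching flow suggests an entropic optimal-transport solver; a plain Sinkhorn iteration with uniform marginals is sketched below as a plausible stand-in. Whether the Optimal Descriptor Solver uses exactly this formulation is an assumption.

```python
import torch

def sinkhorn_plan(cost, eps=0.05, n_iters=50):
    """Entropic OT between frame features and descriptors: `cost` is an (n, m)
    dissimilarity matrix; returns a soft (n, m) matching/transport plan."""
    n, m = cost.shape
    K = torch.exp(-cost / eps)
    r = torch.full((n,), 1.0 / n, device=cost.device)   # uniform row marginal
    c = torch.full((m,), 1.0 / m, device=cost.device)   # uniform column marginal
    u, v = torch.ones_like(r), torch.ones_like(c)
    for _ in range(n_iters):
        u = r / (K @ v)
        v = c / (K.t() @ u)
    return u[:, None] * K * v[None, :]
```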
+
+ comment: Technical report. Project Page: https://tomchen-ctj.github.io/OST/ +
+
+
+
+
+ + ♻ ☆ FedSOL: Stabilized Orthogonal Learning with Proximal Restrictions in + Federated Learning CVPR 2024 + + +
+ Federated Learning (FL) aggregates locally trained models from individual +clients to construct a global model. While FL enables learning a model with +data privacy, it often suffers from significant performance degradation when +clients have heterogeneous data distributions. This data heterogeneity causes +the model to forget the global knowledge acquired from previously sampled +clients after being trained on local datasets. Although the introduction of +proximal objectives in local updates helps to preserve global knowledge, it can +also hinder local learning by interfering with local objectives. To address +this problem, we propose a novel method, Federated Stabilized Orthogonal +Learning (FedSOL), which adopts an orthogonal learning strategy to balance the +two conflicting objectives. FedSOL is designed to identify gradients of local +objectives that are inherently orthogonal to directions affecting the proximal +objective. Specifically, FedSOL targets parameter regions where learning on the +local objective is minimally influenced by proximal weight perturbations. Our +experiments demonstrate that FedSOL consistently achieves state-of-the-art +performance across various scenarios. + +
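+ The core orthogonality idea can be illustrated by removing, from the local-objective gradient, its component along the proximal-objective gradient. This is a simplified sketch of the principle only; FedSOL itself works through proximal weight perturbations rather than this direct projection.

```python
import torch

def orthogonal_component(local_grad, proximal_grad, eps=1e-12):
    """Project out the part of the local gradient that would disturb the proximal
    (global-knowledge-preserving) objective, keeping only the orthogonal part."""
    g = local_grad.flatten()
    p = proximal_grad.flatten()
    proj = (g @ p) / (p @ p + eps) * p
    return (g - proj).view_as(local_grad)
```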
+
+ comment: The IEEE/CVF Conference on Computer Vision and Pattern Recognition + 2024 (CVPR 2024) +
+
+
+
+
+ + ♻ ☆ UFineBench: Towards Text-based Person Retrieval with Ultra-fine + Granularity + + +
+ Existing text-based person retrieval datasets often have relatively coarse-grained text annotations. This hinders the model from comprehending the fine-grained semantics of query texts in real scenarios. To address this problem, we contribute a new benchmark named \textbf{UFineBench} for text-based person retrieval with ultra-fine granularity.
+ Firstly, we construct a new \textbf{dataset} named UFine6926. We collect a large number of person images and manually annotate each image with two detailed textual descriptions, averaging 80.8 words each. The average word count is three to four times that of the previous datasets. In addition to standard in-domain evaluation, we also propose a special \textbf{evaluation paradigm} more representative of real scenarios. It contains a new evaluation set with cross domains, cross textual granularity and cross textual styles, named UFine3C, and a new evaluation metric for accurately measuring retrieval ability, named mean Similarity Distribution (mSD). Moreover, we propose CFAM, a more efficient \textbf{algorithm} especially designed for text-based person retrieval with ultra fine-grained texts. It achieves fine-granularity mining by adopting a shared cross-modal granularity decoder and a hard negative matching mechanism.
+ With standard in-domain evaluation, CFAM establishes competitive performance across various datasets, especially on our ultra fine-grained UFine6926. Furthermore, by evaluating on UFine3C, we demonstrate that training on our UFine6926 significantly improves generalization to real scenarios compared with other coarse-grained datasets. The dataset and code will be made publicly available at \url{https://github.com/Zplusdragon/UFineBench}.
+
+
+
+
+
+ + ♻ ☆ Subjective-Aligned Dataset and Metric for Text-to-Video Quality + Assessment + + +
+ With the rapid development of generative models, Artificial Intelligence-Generated Content (AIGC) has exponentially increased in daily life. Among such content, Text-to-Video (T2V) generation has received widespread attention. Though many T2V models have been released for generating videos of high perceptual quality, there is still a lack of methods to evaluate the quality of these videos quantitatively. To solve this issue, we establish the largest-scale Text-to-Video Quality Assessment DataBase (T2VQA-DB) to date. The dataset is composed of 10,000 videos generated by 9 different T2V models. We also conduct a subjective study to obtain each video's corresponding mean opinion score. Based on T2VQA-DB, we propose a novel transformer-based model for subjective-aligned Text-to-Video Quality Assessment (T2VQA). The model extracts features from text-video alignment and video fidelity perspectives, then leverages the ability of a large language model to give the prediction score. Experimental results show that T2VQA outperforms existing T2V metrics and SOTA video quality assessment models. Quantitative analysis indicates that T2VQA is capable of giving subjective-aligned predictions, validating its effectiveness. The dataset and code will be released at https://github.com/QMME/T2VQA.
+
+
+
+
+
+ + ♻ ☆ ECoDepth: Effective Conditioning of Diffusion Models for Monocular Depth + Estimation CVPR + + +
+ In the absence of parallax cues, a learning-based single image depth estimation (SIDE) model relies heavily on shading and contextual cues in the image. While this simplicity is attractive, it is necessary to train such models on large and varied datasets, which are difficult to capture. It has been shown that using embeddings from pre-trained foundational models, such as CLIP, improves zero-shot transfer in several applications. Taking inspiration from this, in our paper we explore the use of global image priors generated from a pre-trained ViT model to provide more detailed contextual information. We argue that the embedding vector from a ViT model, pre-trained on a large dataset, captures more relevant information for SIDE than the usual route of generating pseudo image captions followed by CLIP-based text embeddings. Based on this idea, we propose a new SIDE model using a diffusion backbone which is conditioned on ViT embeddings. Our proposed design establishes a new state-of-the-art (SOTA) for SIDE on the NYUv2 dataset, achieving an Abs Rel error of 0.059 (14% improvement) compared to 0.069 by the current SOTA (VPD). On the KITTI dataset, it achieves a Sq Rel error of 0.139 (2% improvement) compared to 0.142 by the current SOTA (GEDepth). For zero-shot transfer with a model trained on NYUv2, we report mean relative improvements of (20%, 23%, 81%, 25%) over NeWCRFs on the (Sun-RGBD, iBims1, DIODE, HyperSim) datasets, compared to (16%, 18%, 45%, 9%) by ZoeDepth. The code is available at https://ecodepth-iitd.github.io
+
+
+ comment: IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) + 2024 +
+
+
+
+
+ + ♻ ☆ Can 3D Vision-Language Models Truly Understand Natural Language? + + +
+ Rapid advancements in 3D vision-language (3D-VL) tasks have opened up new +avenues for human interaction with embodied agents or robots using natural +language. Despite this progress, we find a notable limitation: existing 3D-VL +models exhibit sensitivity to the styles of language input, struggling to +understand sentences with the same semantic meaning but written in different +variants. This observation raises a critical question: Can 3D vision-language +models truly understand natural language? To test the language +understandability of 3D-VL models, we first propose a language robustness task +for systematically assessing 3D-VL models across various tasks, benchmarking +their performance when presented with different language style variants. +Importantly, these variants are commonly encountered in applications requiring +direct interaction with humans, such as embodied robotics, given the diversity +and unpredictability of human language. We propose a 3D Language Robustness +Dataset, designed based on the characteristics of human language, to facilitate +the systematic study of robustness. Our comprehensive evaluation uncovers a +significant drop in the performance of all existing models across various 3D-VL +tasks. Even the state-of-the-art 3D-LLM fails to understand some variants of +the same sentences. Further in-depth analysis suggests that the existing models +have a fragile and biased fusion module, which stems from the low diversity of +the existing dataset. Finally, we propose a training-free module driven by LLM, +which improves language robustness. Datasets and code will be available at +github. + +
+
+ comment: https://github.com/VincentDENGP/3D-LR +
+
+
+
+
+ + ♻ ☆ Finding needles in a haystack: A Black-Box Approach to Invisible + Watermark Detection + + +
+ In this paper, we propose WaterMark Detection (WMD), the first invisible +watermark detection method under a black-box and annotation-free setting. WMD +is capable of detecting arbitrary watermarks within a given reference dataset +using a clean non-watermarked dataset as a reference, without relying on +specific decoding methods or prior knowledge of the watermarking techniques. We +develop WMD using foundations of offset learning, where a clean non-watermarked +dataset enables us to isolate the influence of only watermarked samples in the +reference dataset. Our comprehensive evaluations demonstrate the effectiveness +of WMD, significantly outperforming naive detection methods, which only yield +AUC scores around 0.5. In contrast, WMD consistently achieves impressive +detection AUC scores, surpassing 0.9 in most single-watermark datasets and +exceeding 0.7 in more challenging multi-watermark scenarios across diverse +datasets and watermarking methods. As invisible watermarks become increasingly +prevalent, while specific decoding techniques remain undisclosed, our approach +provides a versatile solution and establishes a path toward increasing +accountability, transparency, and trust in our digital visual content. + +
+
+
+
+
+ + ♻ ☆ Noisy-Correspondence Learning for Text-to-Image Person Re-identification + + +
+ Text-to-image person re-identification (TIReID) is a compelling topic in the cross-modal community, which aims to retrieve the target person based on a textual query. Although numerous TIReID methods have been proposed and have achieved promising performance, they implicitly assume the training image-text pairs are correctly aligned, which is not always the case in real-world scenarios. In practice, image-text pairs are inevitably under-correlated or even falsely correlated, a.k.a. noisy correspondence (NC), due to the low quality of the images and annotation errors. To address this problem, we propose a novel Robust Dual Embedding method (RDE) that can learn robust visual-semantic associations even with NC. Specifically, RDE consists of two main components: 1) a Confident Consensus Division (CCD) module that leverages the dual-grained decisions of dual embedding modules to obtain a consensus set of clean training data, which enables the model to learn correct and reliable visual-semantic associations; 2) a Triplet Alignment Loss (TAL) that relaxes the conventional triplet ranking loss with the hardest negative samples to a log-exponential upper bound over all negatives, thus preventing model collapse under NC while still focusing on hard negative samples for promising performance. We conduct extensive experiments on three public benchmarks, namely CUHK-PEDES, ICFG-PEDES, and RSTPReID, to evaluate the performance and robustness of our RDE. Our method achieves state-of-the-art results both with and without synthetic noisy correspondences on all three datasets. Code is available at https://github.com/QinYang79/RDE.
+
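+ The Triplet Alignment Loss can be sketched by replacing the hardest-negative max of a triplet ranking loss with a log-sum-exp (soft-max) bound over all negatives, which is never smaller than the true maximum. The margin and temperature below are placeholder values, not the paper's settings.

```python
import torch
import torch.nn.functional as F

def triplet_alignment_loss(sim, margin=0.2, tau=0.02):
    """`sim` is an (n, n) image-text similarity matrix with matched pairs on the
    diagonal; the log-sum-exp term upper-bounds the hardest-negative similarity."""
    n = sim.size(0)
    eye = torch.eye(n, dtype=torch.bool, device=sim.device)
    pos = sim.diag().unsqueeze(1)
    neg = sim.masked_fill(eye, float("-inf"))
    soft_hardest = tau * torch.logsumexp(neg / tau, dim=1, keepdim=True)
    return F.relu(margin + soft_hardest - pos).mean()
```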
+
+
+
+
+ + ♻ ☆ SpecNeRF: Gaussian Directional Encoding for Specular Reflections CVPR2024 + + +
+ Neural radiance fields have achieved remarkable performance in modeling the +appearance of 3D scenes. However, existing approaches still struggle with the +view-dependent appearance of glossy surfaces, especially under complex lighting +of indoor environments. Unlike existing methods, which typically assume distant +lighting like an environment map, we propose a learnable Gaussian directional +encoding to better model the view-dependent effects under near-field lighting +conditions. Importantly, our new directional encoding captures the +spatially-varying nature of near-field lighting and emulates the behavior of +prefiltered environment maps. As a result, it enables the efficient evaluation +of preconvolved specular color at any 3D location with varying roughness +coefficients. We further introduce a data-driven geometry prior that helps +alleviate the shape radiance ambiguity in reflection modeling. We show that our +Gaussian directional encoding and geometry prior significantly improve the +modeling of challenging specular reflections in neural radiance fields, which +helps decompose appearance into more physically meaningful components. + +
+
+ comment: Accepted to CVPR2024, Project page: + https://limacv.github.io/SpecNeRF_web/ +
+
+
+
+
+ + ♻ ☆ Gaze-guided Hand-Object Interaction Synthesis: Benchmark and Method + + +
+ Gaze plays a crucial role in revealing human attention and intention, +shedding light on the cognitive processes behind human actions. The integration +of gaze guidance with the dynamics of hand-object interactions boosts the +accuracy of human motion prediction. However, the lack of datasets that capture +the intricate relationship and consistency among gaze, hand, and object +movements remains a substantial hurdle. In this paper, we introduce the first +Gaze-guided Hand-Object Interaction dataset, GazeHOI, and present a novel task +for synthesizing gaze-guided hand-object interactions. Our dataset, GazeHOI, +features simultaneous 3D modeling of gaze, hand, and object interactions, +comprising 479 sequences with an average duration of 19.1 seconds, 812 +sub-sequences, and 33 objects of various sizes. We propose a hierarchical +framework centered on a gaze-guided hand-object interaction diffusion model, +named GHO-Diffusion. In the pre-diffusion phase, we separate gaze conditions +into spatial-temporal features and goal pose conditions at different levels of +information granularity. During the diffusion phase, two gaze-conditioned +diffusion models are stacked to simplify the complex synthesis of hand-object +motions. Here, the object motion diffusion model generates sequences of object +motions based on gaze conditions, while the hand motion diffusion model +produces hand motions based on the generated object motion. To improve +fine-grained goal pose alignment, we introduce a Spherical Gaussian constraint +to guide the denoising step. In the subsequent post-diffusion phase, we +optimize the generated hand motions using contact consistency. Our extensive +experiments highlight the uniqueness of our dataset and the effectiveness of +our approach. + +
+
+
+
+
+ + ♻ ☆ HQ-VAE: Hierarchical Discrete Representation Learning with Variational + Bayes + + +
+ Vector quantization (VQ) is a technique to deterministically learn features +with discrete codebook representations. It is commonly performed with a +variational autoencoding model, VQ-VAE, which can be further extended to +hierarchical structures for making high-fidelity reconstructions. However, such +hierarchical extensions of VQ-VAE often suffer from the codebook/layer collapse +issue, where the codebook is not efficiently used to express the data, and +hence degrades reconstruction accuracy. To mitigate this problem, we propose a +novel unified framework to stochastically learn hierarchical discrete +representation on the basis of the variational Bayes framework, called +hierarchically quantized variational autoencoder (HQ-VAE). HQ-VAE naturally +generalizes the hierarchical variants of VQ-VAE, such as VQ-VAE-2 and +residual-quantized VAE (RQ-VAE), and provides them with a Bayesian training +scheme. Our comprehensive experiments on image datasets show that HQ-VAE +enhances codebook usage and improves reconstruction performance. We also +validated HQ-VAE in terms of its applicability to a different modality with an +audio dataset. + +
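+ For context, the deterministic VQ layer that HQ-VAE generalizes looks like the sketch below: nearest-codebook assignment with straight-through gradients. HQ-VAE replaces this hard assignment with a stochastic, variational-Bayes formulation, which is not shown here.

```python
import torch
import torch.nn as nn

class VectorQuantizer(nn.Module):
    """Plain VQ-VAE-style quantizer: map each latent vector to its nearest code
    and pass gradients straight through the non-differentiable assignment."""
    def __init__(self, num_codes=512, dim=64):
        super().__init__()
        self.codebook = nn.Embedding(num_codes, dim)

    def forward(self, z):                      # z: (batch, dim)
        dists = torch.cdist(z, self.codebook.weight)
        idx = dists.argmin(dim=-1)
        q = self.codebook(idx)
        return z + (q - z).detach(), idx       # straight-through estimator
```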
+
+ comment: 34 pages with 17 figures, accepted for TMLR +
+
+
+
+
+ + ♻ ☆ Enhancing Object Coherence in Layout-to-Image Synthesis + + +
+ Layout-to-image synthesis is an emerging technique in conditional image +generation. It aims to generate complex scenes, where users require fine +control over the layout of the objects in a scene. However, it remains +challenging to control the object coherence, including semantic coherence +(e.g., the cat looks at the flowers or not) and physical coherence (e.g., the +hand and the racket should not be misaligned). In this paper, we propose a +novel diffusion model with effective global semantic fusion (GSF) and +self-similarity feature enhancement modules to guide the object coherence for +this task. For semantic coherence, we argue that the image caption contains +rich information for defining the semantic relationship within the objects in +the images. Instead of simply employing cross-attention between captions and +generated images, which addresses the highly relevant layout restriction and +semantic coherence separately and thus leads to unsatisfying results shown in +our experiments, we develop GSF to fuse the supervision from the layout +restriction and semantic coherence requirement and exploit it to guide the +image synthesis process. Moreover, to improve the physical coherence, we +develop a Self-similarity Coherence Attention (SCA) module to explicitly +integrate local contextual physical coherence into each pixel's generation +process. Specifically, we adopt a self-similarity map to encode the coherence +restrictions and employ it to extract coherent features from text embedding. +Through visualization of our self-similarity map, we explore the essence of +SCA, revealing that its effectiveness is not only in capturing reliable +physical coherence patterns but also in enhancing complex texture generation. +Extensive experiments demonstrate the superiority of our proposed method in +both image generation quality and controllability. + +
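+ The self-similarity map at the heart of SCA can be pictured as the pairwise cosine similarity between all spatial positions of a feature map. A minimal sketch; how the map is then used to extract coherent features from the text embedding is specific to the paper and omitted.

```python
import torch
import torch.nn.functional as F

def self_similarity_map(feat):
    """Cosine self-similarity between every pair of spatial locations of a
    (b, c, h, w) feature map; returns a (b, h*w, h*w) map."""
    f = F.normalize(feat.flatten(2), dim=1)      # (b, c, h*w), unit norm per location
    return torch.einsum("bci,bcj->bij", f, f)
```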
+
+ comment: GitHub: https://github.com/CodeGoat24/EOCNet +
+
+
+
+
+ + ♻ ☆ GS-IR: 3D Gaussian Splatting for Inverse Rendering + + +
+ We propose GS-IR, a novel inverse rendering approach based on 3D Gaussian Splatting (GS) that leverages forward-mapping volume rendering to achieve photorealistic novel view synthesis and relighting results. Unlike previous works that use implicit neural representations and volume rendering (e.g., NeRF), which suffer from low expressive power and high computational complexity, we extend GS, a top-performing representation for novel view synthesis, to estimate scene geometry, surface material, and environment illumination from multi-view images captured under unknown lighting conditions. There are two main problems when introducing GS to inverse rendering: 1) GS does not natively support producing plausible normals; 2) forward mapping (e.g., rasterization and splatting) cannot trace occlusion the way backward mapping (e.g., ray tracing) can. To address these challenges, GS-IR proposes an efficient optimization scheme that incorporates a depth-derivation-based regularization for normal estimation and a baking-based occlusion to model indirect lighting. The flexible and expressive GS representation allows us to achieve fast and compact geometry reconstruction, photorealistic novel view synthesis, and effective physically-based rendering. We demonstrate the superiority of our method over baseline methods through qualitative and quantitative evaluations on various challenging scenes.
+
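+ The depth-derivation-based normal regularization presupposes pseudo-normals computed from a rendered depth map; an image-space finite-difference version is sketched below. The exact derivation in GS-IR (camera-space unprojection, scale handling) likely differs, so this is only an approximation.

```python
import torch
import torch.nn.functional as F

def normals_from_depth(depth):
    """Approximate surface normals from a (b, h, w) depth map via central
    differences in image space; returns a unit-norm (b, 3, h, w) tensor."""
    dzdx = F.pad(depth[:, :, 2:] - depth[:, :, :-2], (1, 1))          # d(depth)/dx
    dzdy = F.pad(depth[:, 2:, :] - depth[:, :-2, :], (0, 0, 1, 1))    # d(depth)/dy
    n = torch.stack((-dzdx, -dzdy, torch.ones_like(depth)), dim=1)
    return F.normalize(n, dim=1)
```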
+
+
+
+
+ + ♻ ☆ ProTeCt: Prompt Tuning for Taxonomic Open Set Classification CVPR 2024 + + +
+ Visual-language foundation models, like CLIP, learn generalized +representations that enable zero-shot open-set classification. Few-shot +adaptation methods, based on prompt tuning, have been shown to further improve +performance on downstream datasets. However, these methods do not fare well in +the taxonomic open set (TOS) setting, where the classifier is asked to make +predictions from label sets across different levels of semantic granularity. +Frequently, they infer incorrect labels at coarser taxonomic class levels, even +when the inference at the leaf level (original class labels) is correct. To +address this problem, we propose a prompt tuning technique that calibrates the +hierarchical consistency of model predictions. A set of metrics of hierarchical +consistency, the Hierarchical Consistent Accuracy (HCA) and the Mean Treecut +Accuracy (MTA), are first proposed to evaluate TOS model performance. A new +Prompt Tuning for Hierarchical Consistency (ProTeCt) technique is then proposed +to calibrate classification across label set granularities. Results show that +ProTeCt can be combined with existing prompt tuning methods to significantly +improve TOS classification without degrading the leaf level classification +performance. + +
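+ The Hierarchical Consistent Accuracy metric, as described, counts a sample as correct only when the prediction is right at every level of the taxonomy. A minimal sketch under the assumption that predictions and labels are given per level as parallel lists; the Mean Treecut Accuracy is not shown.

```python
def hierarchical_consistent_accuracy(preds_per_level, labels_per_level):
    """preds_per_level / labels_per_level: lists (one entry per taxonomy level)
    of equal-length sequences of class ids for the same samples."""
    n = len(labels_per_level[0])
    consistent = sum(
        all(p[i] == y[i] for p, y in zip(preds_per_level, labels_per_level))
        for i in range(n)
    )
    return consistent / n
```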
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SpikingResformer: Bridging ResNet and Vision Transformer in Spiking + Neural Networks CVPR + + +
+ The remarkable success of Vision Transformers in Artificial Neural Networks +(ANNs) has led to a growing interest in incorporating the self-attention +mechanism and transformer-based architecture into Spiking Neural Networks +(SNNs). While existing methods propose spiking self-attention mechanisms that +are compatible with SNNs, they lack reasonable scaling methods, and the overall +architectures proposed by these methods suffer from a bottleneck in effectively +extracting local features. To address these challenges, we propose a novel +spiking self-attention mechanism named Dual Spike Self-Attention (DSSA) with a +reasonable scaling method. Based on DSSA, we propose a novel spiking Vision +Transformer architecture called SpikingResformer, which combines the +ResNet-based multi-stage architecture with our proposed DSSA to improve both +performance and energy efficiency while reducing parameters. Experimental +results show that SpikingResformer achieves higher accuracy with fewer +parameters and lower energy consumption than other spiking Vision Transformer +counterparts. Notably, our SpikingResformer-L achieves 79.40% top-1 accuracy on +ImageNet with 4 time-steps, which is the state-of-the-art result in the SNN +field. + +
+
+ comment: To be published in the 2024 IEEE/CVF Conference on Computer Vision + and Pattern Recognition (CVPR) +
+
+
+
+
+ + ♻ ☆ MemoNav: Working Memory Model for Visual Navigation CVPR 2024 + + +
+ Image-goal navigation is a challenging task that requires an agent to +navigate to a goal indicated by an image in unfamiliar environments. Existing +methods utilizing diverse scene memories suffer from inefficient exploration +since they use all historical observations for decision-making without +considering the goal-relevant fraction. To address this limitation, we present +MemoNav, a novel memory model for image-goal navigation, which utilizes a +working memory-inspired pipeline to improve navigation performance. +Specifically, we employ three types of navigation memory. The node features on +a map are stored in the short-term memory (STM), as these features are +dynamically updated. A forgetting module then retains the informative STM +fraction to increase efficiency. We also introduce long-term memory (LTM) to +learn global scene representations by progressively aggregating STM features. +Subsequently, a graph attention module encodes the retained STM and the LTM to +generate working memory (WM) which contains the scene features essential for +efficient navigation. The synergy among these three memory types boosts +navigation performance by enabling the agent to learn and leverage +goal-relevant scene features within a topological map. Our evaluation on +multi-goal tasks demonstrates that MemoNav significantly outperforms previous +methods across all difficulty levels in both Gibson and Matterport3D scenes. +Qualitative results further illustrate that MemoNav plans more efficient +routes. + +
+
+ comment: Accepted to CVPR 2024. Code: https://github.com/ZJULiHongxin/MemoNav +
+
+
+
+
+ + ♻ ☆ FlexEdit: Flexible and Controllable Diffusion-based Object-centric Image + Editing + + +
+ Our work addresses limitations seen in previous approaches for object-centric +editing problems, such as unrealistic results due to shape discrepancies and +limited control in object replacement or insertion. To this end, we introduce +FlexEdit, a flexible and controllable editing framework for objects where we +iteratively adjust latents at each denoising step using our FlexEdit block. +Initially, we optimize latents at test time to align with specified object +constraints. Then, our framework employs an adaptive mask, automatically +extracted during denoising, to protect the background while seamlessly blending +new content into the target image. We demonstrate the versatility of FlexEdit +in various object editing tasks and curate an evaluation test suite with +samples from both real and synthetic images, along with novel evaluation +metrics designed for object-centric editing. We conduct extensive experiments +on different editing scenarios, demonstrating the superiority of our editing +framework over recent advanced text-guided image editing methods. Our project +page is published at https://flex-edit.github.io/. + +
+
+ comment: Our project page: https://flex-edit.github.io/ +
+
+
+
+
+ + ♻ ☆ Feature Unlearning for Pre-trained GANs and VAEs + + +
+ We tackle the problem of feature unlearning from pre-trained image generative models: GANs and VAEs. Unlike a common unlearning task where the unlearning target is a subset of the training set, we aim to unlearn a specific feature, such as hairstyle from facial images, from the pre-trained generative models. As the target feature is only present in a local region of an image, unlearning the entire image from the pre-trained model may result in losing other details in the remaining region of the image. To specify which features to unlearn, we collect randomly generated images that contain the target features. We then identify a latent representation corresponding to the target feature and use this representation to fine-tune the pre-trained model. Through experiments on the MNIST, CelebA, and FFHQ datasets, we show that target features are successfully removed while keeping the fidelity of the original models. Further experiments with an adversarial attack show that the unlearned model is more robust in the presence of malicious parties.
+
+
+
+
+
+ + ♻ ☆ MMM: Generative Masked Motion Model CVPR + + +
+ Recent advances in text-to-motion generation using diffusion and +autoregressive models have shown promising results. However, these models often +suffer from a trade-off between real-time performance, high fidelity, and +motion editability. To address this gap, we introduce MMM, a novel yet simple +motion generation paradigm based on Masked Motion Model. MMM consists of two +key components: (1) a motion tokenizer that transforms 3D human motion into a +sequence of discrete tokens in latent space, and (2) a conditional masked +motion transformer that learns to predict randomly masked motion tokens, +conditioned on the pre-computed text tokens. By attending to motion and text +tokens in all directions, MMM explicitly captures inherent dependency among +motion tokens and semantic mapping between motion and text tokens. During +inference, this allows parallel and iterative decoding of multiple motion +tokens that are highly consistent with fine-grained text descriptions, +therefore simultaneously achieving high-fidelity and high-speed motion +generation. In addition, MMM has innate motion editability. By simply placing +mask tokens in the place that needs editing, MMM automatically fills the gaps +while guaranteeing smooth transitions between editing and non-editing parts. +Extensive experiments on the HumanML3D and KIT-ML datasets demonstrate that MMM +surpasses current leading methods in generating high-quality motion (evidenced +by superior FID scores of 0.08 and 0.429), while offering advanced editing +features such as body-part modification, motion in-betweening, and the +synthesis of long motion sequences. In addition, MMM is two orders of magnitude +faster on a single mid-range GPU than editable motion diffusion models. Our +project page is available at \url{https://exitudio.github.io/MMM-page}. + +
+
+ comment: accepted to CVPR +
+
+
+
+
+ + ♻ ☆ SDSTrack: Self-Distillation Symmetric Adapter Learning for Multi-Modal + Visual Object Tracking CVPR2024 + + +
+ Multimodal Visual Object Tracking (VOT) has recently gained significant +attention due to its robustness. Early research focused on fully fine-tuning +RGB-based trackers, which was inefficient and lacked generalized representation +due to the scarcity of multimodal data. Therefore, recent studies have utilized +prompt tuning to transfer pre-trained RGB-based trackers to multimodal data. +However, the modality gap limits pre-trained knowledge recall, and the +dominance of the RGB modality persists, preventing the full utilization of +information from other modalities. To address these issues, we propose a novel +symmetric multimodal tracking framework called SDSTrack. We introduce +lightweight adaptation for efficient fine-tuning, which directly transfers the +feature extraction ability from RGB to other domains with a small number of +trainable parameters and integrates multimodal features in a balanced, +symmetric manner. Furthermore, we design a complementary masked patch +distillation strategy to enhance the robustness of trackers in complex +environments, such as extreme weather, poor imaging, and sensor failure. +Extensive experiments demonstrate that SDSTrack outperforms state-of-the-art +methods in various multimodal tracking scenarios, including RGB+Depth, +RGB+Thermal, and RGB+Event tracking, and exhibits impressive results in extreme +conditions. Our source code is available at https://github.com/hoqolo/SDSTrack. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ InterGen: Diffusion-based Multi-human Motion Generation under Complex + Interactions + + +
+ We have recently seen tremendous progress in diffusion advances for +generating realistic human motions. Yet, they largely disregard the multi-human +interactions. In this paper, we present InterGen, an effective diffusion-based +approach that incorporates human-to-human interactions into the motion +diffusion process, which enables layman users to customize high-quality +two-person interaction motions, with only text guidance. We first contribute a +multimodal dataset, named InterHuman. It consists of about 107M frames for +diverse two-person interactions, with accurate skeletal motions and 23,337 +natural language descriptions. For the algorithm side, we carefully tailor the +motion diffusion model to our two-person interaction setting. To handle the +symmetry of human identities during interactions, we propose two cooperative +transformer-based denoisers that explicitly share weights, with a mutual +attention mechanism to further connect the two denoising processes. Then, we +propose a novel representation for motion input in our interaction diffusion +model, which explicitly formulates the global relations between the two +performers in the world frame. We further introduce two novel regularization +terms to encode spatial relations, equipped with a corresponding damping scheme +during the training of our interaction diffusion model. Extensive experiments +validate the effectiveness and generalizability of InterGen. Notably, it can +generate more diverse and compelling two-person motions than previous methods +and enables various downstream applications for human interactions. + +
+
+ comment: accepted by IJCV 2024 +
+
+
+
+
+ + ♻ ☆ Gaining the Sparse Rewards by Exploring Lottery Tickets in Spiking + Neural Network + + +
+ Deploying energy-efficient deep learning algorithms on computational-limited +devices, such as robots, is still a pressing issue for real-world applications. +Spiking Neural Networks (SNNs), a novel brain-inspired algorithm, offer a +promising solution due to their low-latency and low-energy properties over +traditional Artificial Neural Networks (ANNs). Despite their advantages, the +dense structure of deep SNNs can still result in extra energy consumption. The +Lottery Ticket Hypothesis (LTH) posits that within dense neural networks, there +exist winning Lottery Tickets (LTs), namely sub-networks, that can be obtained +without compromising performance. Inspired by this, this paper delves into the +spiking-based LTs (SLTs), examining their unique properties and potential for +extreme efficiency. Then, two significant sparse \textbf{\textit{Rewards}} are +gained through comprehensive explorations and meticulous experiments on SLTs +across various dense structures. Moreover, a sparse algorithm tailored for +spiking transformer structure, which incorporates convolution operations into +the Patch Embedding Projection (ConvPEP) module, has been proposed to achieve +Multi-level Sparsity (MultiSp). MultiSp refers to (1) Patch number sparsity; +(2) ConvPEP weights sparsity and binarization; and (3) ConvPEP activation layer +binarization. Extensive experiments demonstrate that our method achieves +extreme sparsity with only a slight performance decrease, paving the way for +deploying energy-efficient neural networks in robotics and beyond. + +
+
+ comment: This paper is under submission +
+
+
+
+
+ + ♻ ☆ FluoroSAM: A Language-aligned Foundation Model for X-ray Image + Segmentation + + +
+ Automated X-ray image segmentation would accelerate research and development +in diagnostic and interventional precision medicine. Prior efforts have +contributed task-specific models capable of solving specific image analysis +problems, but the utility of these models is restricted to their particular +task domain, and expanding to broader use requires additional data, labels, and +retraining efforts. Recently, foundation models (FMs) -- machine learning +models trained on large amounts of highly variable data thus enabling broad +applicability -- have emerged as promising tools for automated image analysis. +Existing FMs for medical image analysis focus on scenarios and modalities where +objects are clearly defined by visually apparent boundaries, such as surgical +tool segmentation in endoscopy. X-ray imaging, by contrast, does not generally +offer such clearly delineated boundaries or structure priors. During X-ray +image formation, complex 3D structures are projected in transmission onto the +imaging plane, resulting in overlapping features of varying opacity and shape. +To pave the way toward an FM for comprehensive and automated analysis of +arbitrary medical X-ray images, we develop FluoroSAM, a language-aligned +variant of the Segment-Anything Model, trained from scratch on 1.6M synthetic +X-ray images. FluoroSAM is trained on data including masks for 128 organ types +and 464 non-anatomical objects, such as tools and implants. In real X-ray +images of cadaveric specimens, FluoroSAM is able to segment bony anatomical +structures based on text-only prompting with 0.51 and 0.79 DICE with +point-based refinement, outperforming competing SAM variants for all +structures. FluoroSAM is also capable of zero-shot generalization to segmenting +classes beyond the training set thanks to its language alignment, which we +demonstrate for full lung segmentation on real chest X-rays. + +
+
+
+
+
+ + ♻ ☆ FMA-Net: Flow-Guided Dynamic Filtering and Iterative Feature Refinement + with Multi-Attention for Joint Video Super-Resolution and Deblurring CVPR2024 + + +
+ We present a joint learning scheme of video super-resolution and deblurring,
+called VSRDB, to restore clean high-resolution (HR) videos from blurry
+low-resolution (LR) ones. This joint restoration problem has drawn much less
+attention compared to single restoration problems. In this paper, we propose a
+novel flow-guided dynamic filtering (FGDF) and iterative feature refinement
+with multi-attention (FRMA), which constitute our VSRDB framework, denoted as
+FMA-Net. Specifically, our proposed FGDF enables precise estimation of both
+spatio-temporally-variant degradation and restoration kernels that are aware of
+motion trajectories through sophisticated motion representation learning.
+Compared to conventional dynamic filtering, the FGDF enables FMA-Net to
+effectively handle large motions in VSRDB. Additionally, the stacked FRMA
+blocks trained with our novel temporal anchor (TA) loss, which temporally
+anchors and sharpens features, refine features in a coarse-to-fine manner
+through iterative updates. Extensive experiments demonstrate the superiority of
+the proposed FMA-Net over state-of-the-art methods in terms of both
+quantitative and qualitative quality. Codes and pre-trained models are
+available at: https://kaist-viclab.github.io/fmanet-site
+
+
+ comment: CVPR2024 (camera-ready version). The last two authors are + co-corresponding authors. Please visit our project page at + https://kaist-viclab.github.io/fmanet-site +
+
+
+
+
+ + ♻ ☆ HallE-Control: Controlling Object Hallucination in Large Multimodal + Models + + +
+ Current Large Multimodal Models (LMMs) achieve remarkable progress, yet there +remains significant uncertainty regarding their ability to accurately apprehend +visual details, that is, in performing detailed captioning. To address this, we +introduce $\textit{CCEval}$, a GPT-4 assisted evaluation method for detailed +captioning. Interestingly, while LMMs demonstrate minimal object existence +hallucination in existing VQA benchmarks, our proposed evaluation reveals +continued susceptibility to such hallucinations. In this paper, we make the +first attempt to investigate such hallucination from different aspects, +including image resolution, the language decoder size, and instruction data +amount, quality, granularity. Our findings underscore the unwarranted inference +when the language description includes details at a finer object granularity +than what the vision module can ground or verify, thus inducing hallucination. +To control such hallucinations, we further attribute the reliability of +captioning to contextual knowledge (involving only contextually grounded +objects) and parametric knowledge (containing inferred objects by the model). +Thus, we introduce $\textit{HallE-Control}$, a controllable LMM in terms of +$\textbf{Hall}$ucination in object $\textbf{E}$xistence. HallE-Control can +condition the captioning to shift between (i) exclusively depicting contextual +knowledge for grounded objects and (ii) blending it with parametric knowledge +to imagine inferred objects. Our method reduces hallucination by 44% compared +to LLaVA$_{7B}$ and maintains the object coverage. + +
+
+ comment: Our code is publicly available at + https://github.com/bronyayang/HallE_Control +
+
+
+
+
+ + ♻ ☆ Data-Efficient Multimodal Fusion on a Single GPU CVPR 2024 + + +
+ The goal of multimodal alignment is to learn a single latent space that is
+shared between multimodal inputs. The most powerful models in this space have
+been trained using massive datasets of paired inputs and large-scale
+computational resources, making them prohibitively expensive to train in many
+practical scenarios. We surmise that existing unimodal encoders pre-trained on
+large amounts of unimodal data should provide an effective bootstrap to create
+multimodal models from unimodal ones at much lower costs. We therefore propose
+FuseMix, a multimodal augmentation scheme that operates on the latent spaces of
+arbitrary pre-trained unimodal encoders. Using FuseMix for multimodal
+alignment, we achieve competitive performance -- and in certain cases
+outperform state-of-the-art methods -- in both image-text and audio-text
+retrieval, with orders of magnitude less compute and data: for example, we
+outperform CLIP on the Flickr30K text-to-image retrieval task with $\sim \!
+600\times$ fewer GPU days and $\sim \! 80\times$ fewer image-text pairs.
+Additionally, we show how our method can be applied to convert pre-trained
+text-to-image generative models into audio-to-image ones. Code is available at:
+https://github.com/layer6ai-labs/fusemix.
+
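+ A minimal sketch of latent-space mixup on pre-computed latents from frozen
+unimodal encoders, followed by a standard contrastive (InfoNCE) objective over
+the mixed batch; the Beta parameter, head sizes, and temperature are
+assumptions rather than the paper's settings.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def fusemix(z_img, z_txt, alpha=1.0):
    """Mix pairs of pre-computed latents with a SHARED lambda and permutation for both modalities."""
    lam = torch.distributions.Beta(alpha, alpha).sample()
    perm = torch.randperm(z_img.size(0))
    return lam * z_img + (1 - lam) * z_img[perm], lam * z_txt + (1 - lam) * z_txt[perm]

img_head = nn.Linear(512, 256)                       # tiny trainable heads on frozen latents
txt_head = nn.Linear(384, 256)

img_latents = torch.randn(64, 512)                   # cached outputs of a frozen image encoder
txt_latents = torch.randn(64, 384)                   # cached outputs of a frozen text encoder

mix_img, mix_txt = fusemix(img_latents, txt_latents)
zi = F.normalize(img_head(mix_img), dim=-1)
zt = F.normalize(txt_head(mix_txt), dim=-1)
logits = zi @ zt.t() / 0.07                          # InfoNCE over the mixed batch
targets = torch.arange(zi.size(0))
loss = 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))
loss.backward()                                      # only the two small heads receive gradients
print(float(loss))
```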
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ QN-Mixer: A Quasi-Newton MLP-Mixer Model for Sparse-View CT + Reconstruction CVPR 2024 + + +
+ Inverse problems span across diverse fields. In medical contexts, computed +tomography (CT) plays a crucial role in reconstructing a patient's internal +structure, presenting challenges due to artifacts caused by inherently +ill-posed inverse problems. Previous research advanced image quality via +post-processing and deep unrolling algorithms but faces challenges, such as +extended convergence times with ultra-sparse data. Despite enhancements, +resulting images often show significant artifacts, limiting their effectiveness +for real-world diagnostic applications. We aim to explore deep second-order +unrolling algorithms for solving imaging inverse problems, emphasizing their +faster convergence and lower time complexity compared to common first-order +methods like gradient descent. In this paper, we introduce QN-Mixer, an +algorithm based on the quasi-Newton approach. We use learned parameters through +the BFGS algorithm and introduce Incept-Mixer, an efficient neural architecture +that serves as a non-local regularization term, capturing long-range +dependencies within images. To address the computational demands typically +associated with quasi-Newton algorithms that require full Hessian matrix +computations, we present a memory-efficient alternative. Our approach +intelligently downsamples gradient information, significantly reducing +computational requirements while maintaining performance. The approach is +validated through experiments on the sparse-view CT problem, involving various +datasets and scanning protocols, and is compared with post-processing and deep +unrolling state-of-the-art approaches. Our method outperforms existing +approaches and achieves state-of-the-art performance in terms of SSIM and PSNR, +all while reducing the number of unrolling iterations required. + +
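+ For context, memory-limited quasi-Newton updates are typically built on the
+L-BFGS two-loop recursion, sketched below on a toy least-squares problem. This
+is generic numerical code meant to illustrate why second-order directions need
+fewer iterations than plain gradient descent; it does not reproduce QN-Mixer's
+learned BFGS parameters or its gradient-downsampling scheme.

```python
import numpy as np

def two_loop_direction(grad, s_hist, y_hist):
    """L-BFGS two-loop recursion: approximate -H^{-1} grad from curvature pairs (s, y)."""
    q, alphas = grad.copy(), []
    for s, y in reversed(list(zip(s_hist, y_hist))):
        a = (s @ q) / (y @ s)
        q -= a * y
        alphas.append(a)
    if s_hist:                                         # initial scaling gamma = s'y / y'y
        s, y = s_hist[-1], y_hist[-1]
        q *= (s @ y) / (y @ y)
    for (s, y), a in zip(zip(s_hist, y_hist), reversed(alphas)):
        b = (y @ q) / (y @ s)
        q += (a - b) * s
    return -q

# toy least-squares problem: minimize 0.5 * ||A x - b||^2
rng = np.random.default_rng(0)
A = rng.normal(size=(40, 20)) / np.sqrt(40)
b = rng.normal(size=40)
grad = lambda x: A.T @ (A @ x - b)

x, s_hist, y_hist, memory = np.zeros(20), [], [], 8    # keep only 8 curvature pairs
for _ in range(50):
    g = grad(x)
    d = two_loop_direction(g, s_hist, y_hist)
    x_new = x + 0.5 * d                                # fixed damped step, no line search
    s_hist.append(x_new - x)
    y_hist.append(grad(x_new) - g)
    s_hist, y_hist = s_hist[-memory:], y_hist[-memory:]
    x = x_new
print("final gradient norm:", np.linalg.norm(grad(x)))
```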
+
+ comment: Accepted at CVPR 2024. Project page: + https://towzeur.github.io/QN-Mixer/ +
+
+
+
+
+ + ♻ ☆ Learning from One Continuous Video Stream CVPR + + +
+ We introduce a framework for online learning from a single continuous video
+stream -- the way people and animals learn, without mini-batches, data
+augmentation or shuffling. This poses great challenges given the high
+correlation between consecutive video frames, and there is very little prior
+work on the topic. Our framework allows us to do a first deep dive into it and
+includes a collection of streams and tasks composed from two existing video
+datasets, plus methodology for performance evaluation that considers both
+adaptation and generalization. We employ pixel-to-pixel modelling as a
+practical and flexible way to switch between pre-training and single-stream
+evaluation as well as between arbitrary tasks, without ever requiring changes
+to models and always using the same pixel loss. Equipped with this framework,
+we obtained large single-stream learning gains from pre-training with a novel
+family of future prediction tasks, found that momentum hurts, and that the pace
+of weight updates matters. The combination of these insights leads to matching
+the performance of IID learning with batch size 1, when using the same
+architecture and without costly replay buffers.
+
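+ A bare-bones sketch of the training regime described above: frames arrive one
+at a time, there is no shuffling or momentum, the objective is a pixel-to-pixel
+future-prediction loss, and a single knob controls the pace of weight updates.
+The model, the synthetic stream, and all hyperparameters are placeholders.

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
model = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
                      nn.Conv2d(16, 3, 3, padding=1))               # pixel-to-pixel predictor
opt = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.0)    # no momentum (it hurts here)

UPDATE_EVERY = 4                                     # pace of weight updates (accumulate 4 frames)
stream = (torch.rand(1, 3, 64, 64) for _ in range(100))  # stand-in for one continuous video stream

prev = None
for t, frame in enumerate(stream):
    if prev is not None:
        pred = model(prev)                           # future prediction: next frame from previous
        loss = (pred - frame).abs().mean() / UPDATE_EVERY
        loss.backward()                              # gradients accumulate across frames
        if (t + 1) % UPDATE_EVERY == 0:
            opt.step()
            opt.zero_grad()
    prev = frame
print("finished one pass over the stream")
```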
+
+ comment: CVPR camera ready version +
+
+
+
+
+ + ♻ ☆ Residual-based Language Models are Free Boosters for Biomedical Imaging + + +
+ In this study, we uncover the unexpected efficacy of residual-based large +language models (LLMs) as part of encoders for biomedical imaging tasks, a +domain traditionally devoid of language or textual data. The approach diverges +from established methodologies by utilizing a frozen transformer block, +extracted from pre-trained LLMs, as an innovative encoder layer for the direct +processing of visual tokens. This strategy represents a significant departure +from the standard multi-modal vision-language frameworks, which typically hinge +on language-driven prompts and inputs. We found that these LLMs could boost +performance across a spectrum of biomedical imaging applications, including +both 2D and 3D visual classification tasks, serving as plug-and-play boosters. +More interestingly, as a byproduct, we found that the proposed framework +achieved superior performance, setting new state-of-the-art results on +extensive, standardized datasets in MedMNIST-2D and 3D. Through this work, we +aim to open new avenues for employing LLMs in biomedical imaging and enriching +the understanding of their potential in this specialized domain. + +
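+ A sketch of the plug-in idea: a frozen transformer block inserted between a
+vision backbone and the classification head, operating directly on visual
+tokens. Here a randomly initialized nn.TransformerEncoderLayer stands in for a
+block extracted from a pre-trained LLM; in practice one would copy the LLM
+block's weights into it. Dimensions and the pooling choice are assumptions.

```python
import torch
import torch.nn as nn

class FrozenBlockBooster(nn.Module):
    def __init__(self, vis_dim=192, llm_dim=768, num_classes=9):
        super().__init__()
        self.proj_in = nn.Linear(vis_dim, llm_dim)                 # trainable adapter into...
        self.frozen_block = nn.TransformerEncoderLayer(            # ...a frozen transformer block
            d_model=llm_dim, nhead=12, dim_feedforward=4 * llm_dim, batch_first=True)
        for p in self.frozen_block.parameters():
            p.requires_grad = False                                # keep the pre-trained weights frozen
        self.head = nn.Linear(llm_dim, num_classes)

    def forward(self, visual_tokens):
        x = self.proj_in(visual_tokens)
        x = self.frozen_block(x)                 # frozen block refines tokens via its own residual paths
        return self.head(x.mean(dim=1))          # pooled classification

tokens = torch.randn(4, 196, 192)                # tokens from any vision backbone (2D or 3D task)
model = FrozenBlockBooster()
print(model(tokens).shape)                       # torch.Size([4, 9])
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("trainable params:", trainable)
```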
+
+
+
+
+ + ♻ ☆ Advances in Kidney Biopsy Lesion Assessment through Dense Instance + Segmentation + + +
+ Renal biopsies are the gold standard for diagnosis of kidney diseases. Lesion
+scores made by renal pathologists are semi-quantitative and exhibit high
+inter-observer variability. Automating lesion classification within segmented
+anatomical structures can provide decision support in quantification analysis
+and reduce the inter-observer variability. Nevertheless, classifying lesions in
+regions-of-interest (ROIs) is clinically challenging due to (a) a large amount
+of densely packed anatomical objects (up to 1000), (b) class imbalance across
+different compartments (at least 3), (c) significant variation in object scales
+(i.e. sizes and shapes), and (d) the presence of multi-label lesions per
+anatomical structure. Existing models lack the capacity to address these
+complexities efficiently and generically. This paper presents \textbf{a
+generalized technical solution} for large-scale, multi-source datasets with
+diverse lesions. Our approach utilizes two sub-networks: dense instance
+segmentation and lesion classification. We introduce \textbf{DiffRegFormer}, an
+end-to-end dense instance segmentation model designed for multi-class,
+multi-scale objects within ROIs. Combining diffusion models, transformers, and
+RCNNs, DiffRegFormer efficiently recognizes over 500 objects across three
+anatomical classes (glomeruli, tubuli, arteries) within ROIs on a single NVIDIA
+GeForce RTX 3090 GPU. On a dataset of 303 ROIs (from 148 Jones' silver-stained
+renal WSIs), it outperforms state-of-the-art models, achieving an AP of 52.1\%
+(detection) and 46.8\% (segmentation). Our lesion classification sub-network
+achieves 89.2\% precision and 64.6\% recall on 21,889 object patches (from the
+303 ROIs). Importantly, the model demonstrates direct domain transfer to
+PAS-stained WSIs without fine-tuning.
+
+
+ comment: 16 pages, 15 figures, 6 tables, Journal +
+
+
+
+
+ + ♻ ☆ DecentNeRFs: Decentralized Neural Radiance Fields from Crowdsourced + Images + + +
+ Neural radiance fields (NeRFs) show potential for transforming images +captured worldwide into immersive 3D visual experiences. However, most of this +captured visual data remains siloed in our camera rolls as these images contain +personal details. Even if made public, the problem of learning 3D +representations of billions of scenes captured daily in a centralized manner is +computationally intractable. Our approach, DecentNeRF, is the first attempt at +decentralized, crowd-sourced NeRFs that require $\sim 10^4\times$ less server +computing for a scene than a centralized approach. Instead of sending the raw +data, our approach requires users to send a 3D representation, distributing the +high computation cost of training centralized NeRFs between the users. It +learns photorealistic scene representations by decomposing users' 3D views into +personal and global NeRFs and a novel optimally weighted aggregation of only +the latter. We validate the advantage of our approach to learn NeRFs with +photorealism and minimal server computation cost on structured synthetic and +real-world photo tourism datasets. We further analyze how secure aggregation of +global NeRFs in DecentNeRF minimizes the undesired reconstruction of personal +content by the server. + +
+
+
+
+
+ + ♻ ☆ Hybrid quantum image classification and federated learning for hepatic + steatosis diagnosis + + +
+ In the realm of liver transplantation, accurately determining hepatic +steatosis levels is crucial. Recognizing the essential need for improved +diagnostic precision, particularly for optimizing diagnosis time by swiftly +handling easy-to-solve cases and allowing the expert time to focus on more +complex cases, this study aims to develop cutting-edge algorithms that enhance +the classification of liver biopsy images. Additionally, the challenge of +maintaining data privacy arises when creating automated algorithmic solutions, +as sharing patient data between hospitals is restricted, further complicating +the development and validation process. This research tackles diagnostic +accuracy by leveraging novel techniques from the rapidly evolving field of +quantum machine learning, known for their superior generalization abilities. +Concurrently, it addresses privacy concerns through the implementation of +privacy-conscious collaborative machine learning with federated learning. We +introduce a hybrid quantum neural network model that leverages real-world +clinical data to assess non-alcoholic liver steatosis accurately. This model +achieves an image classification accuracy of 97%, surpassing traditional +methods by 1.8%. Moreover, by employing a federated learning approach that +allows data from different clients to be shared while ensuring privacy, we +maintain an accuracy rate exceeding 90%. This initiative marks a significant +step towards a scalable, collaborative, efficient, and dependable computational +framework that aids clinical pathologists in their daily diagnostic tasks. + +
+
+ comment: 13 pages, 3 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Wasserstein Distortion: Unifying Fidelity and Realism + + +
+ We introduce a distortion measure for images, Wasserstein distortion, that +simultaneously generalizes pixel-level fidelity on the one hand and realism or +perceptual quality on the other. We show how Wasserstein distortion reduces to +a pure fidelity constraint or a pure realism constraint under different +parameter choices and discuss its metric properties. Pairs of images that are +close under Wasserstein distortion illustrate its utility. In particular, we +generate random textures that have high fidelity to a reference texture in one +location of the image and smoothly transition to an independent realization of +the texture as one moves away from this point. Wasserstein distortion attempts +to generalize and unify prior work on texture generation, image realism and +distortion, and models of the early human visual system, in the form of an +optimizable metric in the mathematical sense. + +
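+ A toy 1D illustration of the fidelity-realism interpolation (not the paper's
+exact definition): compare local windows of two signals with the closed-form
+1D Wasserstein-2 distance between their empirical distributions. With window
+half-width 0 this reduces to plain per-pixel squared error; with wide windows
+only local statistics need to match, so two independent realizations of the
+same texture score as close.

```python
import numpy as np

def local_w2(x, y, half_width):
    """Average per-location 1D Wasserstein-2 distance between local windows of two signals."""
    n, total = len(x), 0.0
    for i in range(n):
        lo, hi = max(0, i - half_width), min(n, i + half_width + 1)
        # sorting gives the optimal coupling for 1D empirical distributions with equal weights
        total += np.mean((np.sort(x[lo:hi]) - np.sort(y[lo:hi])) ** 2)
    return total / n

rng = np.random.default_rng(0)
texture_a = rng.normal(size=512)
texture_b = rng.normal(size=512)          # independent realization of the same "texture"

for hw in [0, 4, 32]:
    print(hw, round(local_w2(texture_a, texture_b, hw), 4))
# hw=0 is per-pixel squared error (pure fidelity); larger windows only compare
# local distributions (realism), so the distortion between the two realizations shrinks.
```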
+
+
+
+
+ + ♻ ☆ GOTCHA: Real-Time Video Deepfake Detection via Challenge-Response + + +
+ With the rise of AI-enabled Real-Time Deepfakes (RTDFs), the integrity of +online video interactions has become a growing concern. RTDFs have now made it +feasible to replace an imposter's face with their victim in live video +interactions. Such advancement in deepfakes also coaxes detection to rise to +the same standard. However, existing deepfake detection techniques are +asynchronous and hence ill-suited for RTDFs. To bridge this gap, we propose a +challenge-response approach that establishes authenticity in live settings. We +focus on talking-head style video interaction and present a taxonomy of +challenges that specifically target inherent limitations of RTDF generation +pipelines. We evaluate representative examples from the taxonomy by collecting +a unique dataset comprising eight challenges, which consistently and visibly +degrades the quality of state-of-the-art deepfake generators. These results are +corroborated both by humans and a new automated scoring function, leading to +88.6% and 80.1% AUC, respectively. The findings underscore the promising +potential of challenge-response systems for explainable and scalable real-time +deepfake detection in practical scenarios. We provide access to data and code +at https://github.com/mittalgovind/GOTCHA-Deepfakes + +
+
+ comment: 20 pages, 19 figures, Code and data released +
+
+
+
+
+ + ♻ ☆ Federated attention consistent learning models for prostate cancer + diagnosis and Gleason grading + + +
+ Artificial intelligence (AI) holds significant promise in transforming +medical imaging, enhancing diagnostics, and refining treatment strategies. +However, the reliance on extensive multicenter datasets for training AI models +poses challenges due to privacy concerns. Federated learning provides a +solution by facilitating collaborative model training across multiple centers +without sharing raw data. This study introduces a federated +attention-consistent learning (FACL) framework to address challenges associated +with large-scale pathological images and data heterogeneity. FACL enhances +model generalization by maximizing attention consistency between local clients +and the server model. To ensure privacy and validate robustness, we +incorporated differential privacy by introducing noise during parameter +transfer. We assessed the effectiveness of FACL in cancer diagnosis and Gleason +grading tasks using 19,461 whole-slide images of prostate cancer from multiple +centers. In the diagnosis task, FACL achieved an area under the curve (AUC) of +0.9718, outperforming seven centers with an average AUC of 0.9499 when +categories are relatively balanced. For the Gleason grading task, FACL attained +a Kappa score of 0.8463, surpassing the average Kappa score of 0.7379 from six +centers. In conclusion, FACL offers a robust, accurate, and cost-effective AI +training model for prostate cancer pathology while maintaining effective data +safeguards. + +
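+ A toy sketch of two ingredients named above, under assumed shapes and scales:
+an attention-consistency term that pulls a client's attention maps toward the
+server model's, and Gaussian noise added to parameters before they leave the
+client. It is illustrative only, not the FACL training pipeline.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def attention_consistency(local_attn, server_attn):
    """Encourage the client's attention map to match the (detached) server attention map."""
    return F.mse_loss(local_attn, server_attn.detach())

def privatize(state_dict, noise_std=1e-3):
    """Add Gaussian noise to parameters before transfer (a simple differential-privacy-style step)."""
    return {k: v + noise_std * torch.randn_like(v) for k, v in state_dict.items()}

# toy attention maps over a 14x14 grid of patches from a whole-slide image tile
local_attn = torch.rand(2, 1, 14, 14).softmax(dim=-1)
server_attn = torch.rand(2, 1, 14, 14).softmax(dim=-1)
task_loss = torch.tensor(0.7)                        # placeholder diagnosis/grading loss
loss = task_loss + 0.1 * attention_consistency(local_attn, server_attn)
print(float(loss))

client = nn.Linear(16, 2)
noisy_update = privatize(client.state_dict())        # what actually leaves the hospital
print(list(noisy_update.keys()))
```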
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ Toward a Surgeon-in-the-Loop Ophthalmic Robotic Apprentice using + Reinforcement and Imitation Learning + + +
+ Robotic-assisted surgical systems have demonstrated significant potential in +enhancing surgical precision and minimizing human errors. However, existing +systems lack the ability to accommodate the unique preferences and requirements +of individual surgeons. Additionally, they primarily focus on general surgeries +(e.g., laparoscopy) and are not suitable for highly precise microsurgeries, +such as ophthalmic procedures. Thus, we propose a simulation-based image-guided +approach for surgeon-centered autonomous agents that can adapt to the +individual surgeon's skill level and preferred surgical techniques during +ophthalmic cataract surgery. Our approach utilizes a simulated environment to +train reinforcement and imitation learning agents guided by image data to +perform all tasks of the incision phase of cataract surgery. By integrating the +surgeon's actions and preferences into the training process with the +surgeon-in-the-loop, our approach enables the robot to implicitly learn and +adapt to the individual surgeon's unique approach through demonstrations. This +results in a more intuitive and personalized surgical experience for the +surgeon. Simultaneously, it ensures consistent performance for the autonomous +robotic apprentice. We define and evaluate the effectiveness of our approach +using our proposed metrics; and highlight the trade-off between a generic agent +and a surgeon-centered adapted agent. Moreover, our approach has the potential +to extend to other ophthalmic surgical procedures, opening the door to a new +generation of surgeon-in-the-loop autonomous surgical robots. We provide an +open-source simulation framework for future development and reproducibility. + +
+
+
+
+
+ + ♻ ☆ TUNeS: A Temporal U-Net with Self-Attention for Video-based Surgical + Phase Recognition + + +
+ To enable context-aware computer assistance in the operating room of the +future, cognitive systems need to understand automatically which surgical phase +is being performed by the medical team. The primary source of information for +surgical phase recognition is typically video, which presents two challenges: +extracting meaningful features from the video stream and effectively modeling +temporal information in the sequence of visual features. For temporal modeling, +attention mechanisms have gained popularity due to their ability to capture +long-range dependencies. In this paper, we explore design choices for attention +in existing temporal models for surgical phase recognition and propose a novel +approach that uses attention more effectively and does not require hand-crafted +constraints: TUNeS, an efficient and simple temporal model that incorporates +self-attention at the core of a convolutional U-Net structure. In addition, we +propose to train the feature extractor, a standard CNN, together with an LSTM +on preferably long video segments, i.e., with long temporal context. In our +experiments, almost all temporal models performed better on top of feature +extractors that were trained with longer temporal context. On these +contextualized features, TUNeS achieves state-of-the-art results on the +Cholec80 and AutoLaparo datasets. + +
+
+ comment: Major revision: comparison to Temporal U-Transformer +
+
+
+
+
+ + ♻ ☆ STREAM: Spatio-TempoRal Evaluation and Analysis Metric for Video + Generative Models ICLR 2024 + + +
+ Image generative models have made significant progress in generating +realistic and diverse images, supported by comprehensive guidance from various +evaluation metrics. However, current video generative models struggle to +generate even short video clips, with limited tools that provide insights for +improvements. Current video evaluation metrics are simple adaptations of image +metrics by switching the embeddings with video embedding networks, which may +underestimate the unique characteristics of video. Our analysis reveals that +the widely used Frechet Video Distance (FVD) has a stronger emphasis on the +spatial aspect than the temporal naturalness of video and is inherently +constrained by the input size of the embedding networks used, limiting it to 16 +frames. Additionally, it demonstrates considerable instability and diverges +from human evaluations. To address the limitations, we propose STREAM, a new +video evaluation metric uniquely designed to independently evaluate spatial and +temporal aspects. This feature allows comprehensive analysis and evaluation of +video generative models from various perspectives, unconstrained by video +length. We provide analytical and experimental evidence demonstrating that +STREAM provides an effective evaluation tool for both visual and temporal +quality of videos, offering insights into area of improvement for video +generative models. To the best of our knowledge, STREAM is the first evaluation +metric that can separately assess the temporal and spatial aspects of videos. +Our code is available at https://github.com/pro2nit/STREAM. + +
+
+ comment: Our work is accepted to ICLR 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 222 + +
+
+
+ + ☆ Real Acoustic Fields: An Audio-Visual Room Acoustics Dataset and + Benchmark CVPR 2024 + + +
+ We present a new dataset called Real Acoustic Fields (RAF) that captures real +acoustic room data from multiple modalities. The dataset includes high-quality +and densely captured room impulse response data paired with multi-view images, +and precise 6DoF pose tracking data for sound emitters and listeners in the +rooms. We used this dataset to evaluate existing methods for novel-view +acoustic synthesis and impulse response generation which previously relied on +synthetic data. In our evaluation, we thoroughly assessed existing audio and +audio-visual models against multiple criteria and proposed settings to enhance +their performance on real-world data. We also conducted experiments to +investigate the impact of incorporating visual data (i.e., images and depth) +into neural acoustic field models. Additionally, we demonstrated the +effectiveness of a simple sim2real approach, where a model is pre-trained with +simulated data and fine-tuned with sparse real-world data, resulting in +significant improvements in the few-shot learning approach. RAF is the first +dataset to provide densely captured room acoustic data, making it an ideal +resource for researchers working on audio and audio-visual neural acoustic +field modeling techniques. Demos and datasets are available on our project +page: https://facebookresearch.github.io/real-acoustic-fields/ + +
+
+ comment: Accepted to CVPR 2024. Project site: + https://facebookresearch.github.io/real-acoustic-fields/ +
+
+
+
+
+ + ☆ MetaCap: Meta-learning Priors from Multi-View Imagery for Sparse-view + Human Performance Capture and Rendering + + +
+ Faithful human performance capture and free-view rendering from sparse RGB +observations is a long-standing problem in Vision and Graphics. The main +challenges are the lack of observations and the inherent ambiguities of the +setting, e.g. occlusions and depth ambiguity. As a result, radiance fields, +which have shown great promise in capturing high-frequency appearance and +geometry details in dense setups, perform poorly when na\"ively supervising +them on sparse camera views, as the field simply overfits to the sparse-view +inputs. To address this, we propose MetaCap, a method for efficient and +high-quality geometry recovery and novel view synthesis given very sparse or +even a single view of the human. Our key idea is to meta-learn the radiance +field weights solely from potentially sparse multi-view videos, which can serve +as a prior when fine-tuning them on sparse imagery depicting the human. This +prior provides a good network weight initialization, thereby effectively +addressing ambiguities in sparse-view capture. Due to the articulated structure +of the human body and motion-induced surface deformations, learning such a +prior is non-trivial. Therefore, we propose to meta-learn the field weights in +a pose-canonicalized space, which reduces the spatial feature range and makes +feature learning more effective. Consequently, one can fine-tune our field +parameters to quickly generalize to unseen poses, novel illumination conditions +as well as novel and sparse (even monocular) camera views. For evaluating our +method under different scenarios, we collect a new dataset, WildDynaCap, which +contains subjects captured in, both, a dense camera dome and in-the-wild sparse +camera rigs, and demonstrate superior results compared to recent +state-of-the-art methods on both public and WildDynaCap dataset. + +
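+ A Reptile-style sketch of the meta-learning idea: fine-tune a copy of the
+field weights on dense supervision, then move the meta-weights toward the
+adapted ones, so that at test time a few steps on sparse views adapt quickly.
+The tiny MLP stands in for the radiance field, the ray batches are random
+placeholders, and the paper's actual meta-learning algorithm may differ.

```python
import copy
import torch
import torch.nn as nn

def radiance_field():
    return nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 4))  # (x,y,z) -> (rgb, sigma)

def inner_finetune(model, batches, lr=1e-2, steps=5):
    opt = torch.optim.SGD(model.parameters(), lr=lr)
    for _ in range(steps):
        pts, target = batches()                      # sampled points/rays with supervision
        loss = (model(pts) - target).pow(2).mean()
        opt.zero_grad(); loss.backward(); opt.step()
    return model

def fake_ray_batches():                              # placeholder for dense multi-view supervision
    return torch.randn(256, 3), torch.randn(256, 4)

meta = radiance_field()
meta_lr = 0.1
for meta_step in range(20):                          # outer loop over training subjects/poses
    task = copy.deepcopy(meta)
    inner_finetune(task, fake_ray_batches)
    with torch.no_grad():                            # Reptile update: move meta-weights toward
        for p_meta, p_task in zip(meta.parameters(), task.parameters()):
            p_meta += meta_lr * (p_task - p_meta)    # the fine-tuned weights

# at test time, `meta` acts as the prior: a few gradient steps on sparse views adapt it
sparse_view_model = inner_finetune(copy.deepcopy(meta), fake_ray_batches, steps=3)
```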
+
+ comment: Project page: https://vcai.mpi-inf.mpg.de/projects/MetaCap/ +
+
+
+
+
+ + ☆ Benchmarking Object Detectors with COCO: A New Path Forward + + +
+ The Common Objects in Context (COCO) dataset has been instrumental in +benchmarking object detectors over the past decade. Like every dataset, COCO +contains subtle errors and imperfections stemming from its annotation +procedure. With the advent of high-performing models, we ask whether these +errors of COCO are hindering its utility in reliably benchmarking further +progress. In search for an answer, we inspect thousands of masks from COCO +(2017 version) and uncover different types of errors such as imprecise mask +boundaries, non-exhaustively annotated instances, and mislabeled masks. Due to +the prevalence of COCO, we choose to correct these errors to maintain +continuity with prior research. We develop COCO-ReM (Refined Masks), a cleaner +set of annotations with visibly better mask quality than COCO-2017. We evaluate +fifty object detectors and find that models that predict visually sharper masks +score higher on COCO-ReM, affirming that they were being incorrectly penalized +due to errors in COCO-2017. Moreover, our models trained using COCO-ReM +converge faster and score higher than their larger variants trained using +COCO-2017, highlighting the importance of data quality in improving object +detectors. With these findings, we advocate using COCO-ReM for future object +detection research. Our dataset is available at https://cocorem.xyz + +
+
+ comment: Technical report. Dataset website: https://cocorem.xyz and code: + https://github.com/kdexd/coco-rem +
+
+
+
+
+ + ☆ ObjectDrop: Bootstrapping Counterfactuals for Photorealistic Object + Removal and Insertion + + +
+ Diffusion models have revolutionized image editing but often generate images
+that violate physical laws, particularly the effects of objects on the scene,
+e.g., occlusions, shadows, and reflections. By analyzing the limitations of
+self-supervised approaches, we propose a practical solution centered on a
+"counterfactual" dataset. Our method involves capturing a scene before and
+after removing a single object, while minimizing other changes. By fine-tuning
+a diffusion model on this dataset, we are able to not only remove objects but
+also their effects on the scene. However, we find that applying this approach
+for photorealistic object insertion requires an impractically large dataset. To
+tackle this challenge, we propose bootstrap supervision; leveraging our object
+removal model trained on a small counterfactual dataset, we synthetically
+expand this dataset considerably. Our approach significantly outperforms prior
+methods in photorealistic object removal and insertion, particularly at
+modeling the effects of objects on the scene.
+
+
+
+
+
+ + ☆ Garment3DGen: 3D Garment Stylization and Texture Generation + + +
+ We introduce Garment3DGen, a new method to synthesize 3D garment assets from
+a base mesh given a single input image as guidance. Our proposed approach
+allows users to generate 3D textured clothes based on both real and synthetic
+images, such as those generated by text prompts. The generated assets can be
+directly draped and simulated on human bodies. First, we leverage the recent
+progress of image-to-3D diffusion methods to generate 3D garment geometries.
+However, since these geometries cannot be utilized directly for downstream
+tasks, we propose to use them as pseudo ground-truth and set up a mesh
+deformation optimization procedure that deforms a base template mesh to match
+the generated 3D target. Second, we introduce carefully designed losses that
+allow the input base mesh to freely deform towards the desired target, yet
+preserve mesh quality and topology such that they can be simulated. Finally, a
+texture estimation module generates high-fidelity texture maps that are
+globally and locally consistent and faithfully capture the input guidance,
+allowing us to render the generated 3D assets. With Garment3DGen, users can
+generate the textured 3D garment of their choice without the need for artist
+intervention. One can provide a textual prompt describing the desired garment
+to generate a simulation-ready 3D asset. We present a plethora of quantitative
+and qualitative comparisons on various assets, both real and generated, and
+provide use-cases of how one can generate simulation-ready 3D garments.
+
+
+ comment: Project Page: https://nsarafianos.github.io/garment3dgen +
+
+
+
+
+ + ☆ Mini-Gemini: Mining the Potential of Multi-modality Vision Language + Models + + +
+ In this work, we introduce Mini-Gemini, a simple and effective framework +enhancing multi-modality Vision Language Models (VLMs). Despite the +advancements in VLMs facilitating basic visual dialog and reasoning, a +performance gap persists compared to advanced models like GPT-4 and Gemini. We +try to narrow the gap by mining the potential of VLMs for better performance +and any-to-any workflow from three aspects, i.e., high-resolution visual +tokens, high-quality data, and VLM-guided generation. To enhance visual tokens, +we propose to utilize an additional visual encoder for high-resolution +refinement without increasing the visual token count. We further construct a +high-quality dataset that promotes precise image comprehension and +reasoning-based generation, expanding the operational scope of current VLMs. In +general, Mini-Gemini further mines the potential of VLMs and empowers current +frameworks with image understanding, reasoning, and generation simultaneously. +Mini-Gemini supports a series of dense and MoE Large Language Models (LLMs) +from 2B to 34B. It is demonstrated to achieve leading performance in several +zero-shot benchmarks and even surpasses the developed private models. Code and +models are available at https://github.com/dvlab-research/MiniGemini. + +
+
+ comment: Code and models are available at + https://github.com/dvlab-research/MiniGemini +
+
+
+
+
+ + ☆ Duolando: Follower GPT with Off-Policy Reinforcement Learning for Dance + Accompaniment ICLR 2024 + + +
+ We introduce a novel task within the field of 3D dance generation, termed +dance accompaniment, which necessitates the generation of responsive movements +from a dance partner, the "follower", synchronized with the lead dancer's +movements and the underlying musical rhythm. Unlike existing solo or group +dance generation tasks, a duet dance scenario entails a heightened degree of +interaction between the two participants, requiring delicate coordination in +both pose and position. To support this task, we first build a large-scale and +diverse duet interactive dance dataset, DD100, by recording about 117 minutes +of professional dancers' performances. To address the challenges inherent in +this task, we propose a GPT-based model, Duolando, which autoregressively +predicts the subsequent tokenized motion conditioned on the coordinated +information of the music, the leader's and the follower's movements. To further +enhance the GPT's capabilities of generating stable results on unseen +conditions (music and leader motions), we devise an off-policy reinforcement +learning strategy that allows the model to explore viable trajectories from +out-of-distribution samplings, guided by human-defined rewards. Based on the +collected dataset and proposed method, we establish a benchmark with several +carefully designed metrics. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ ECoDepth: Effective Conditioning of Diffusion Models for Monocular Depth + Estimation CVPR + + +
+ In the absence of parallax cues, a learning-based single image depth
+estimation (SIDE) model relies heavily on shading and contextual cues in the
+image. While this simplicity is attractive, it is necessary to train such
+models on large and varied datasets, which are difficult to capture. It has
+been shown that using embeddings from pre-trained foundational models, such as
+CLIP, improves zero-shot transfer in several applications. Taking inspiration
+from this, in our paper we explore the use of global image priors generated
+from a pre-trained ViT model to provide more detailed contextual information.
+We argue that the embedding vector from a ViT model, pre-trained on a large
+dataset, captures greater relevant information for SIDE than the usual route of
+generating pseudo image captions, followed by CLIP based text embeddings. Based
+on this idea, we propose a new SIDE model using a diffusion backbone which is
+conditioned on ViT embeddings. Our proposed design establishes a new
+state-of-the-art (SOTA) for SIDE on the NYUv2 dataset, achieving an Abs Rel
+error of 0.059 (14% improvement) compared to 0.069 by the current SOTA (VPD).
+On the KITTI dataset, it achieves a Sq Rel error of 0.139 (2% improvement)
+compared to 0.142 by the current SOTA (GEDepth). For zero-shot transfer with a
+model trained on NYUv2, we report a mean relative improvement of (20%, 23%,
+81%, 25%) over NeWCRFs on (Sun-RGBD, iBims1, DIODE, HyperSim) datasets,
+compared to (16%, 18%, 45%, 9%) by ZoeDepth. The code is available at
+https://github.com/Aradhye2002/EcoDepth.
+
+
+ comment: Accepted at IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR) 2024 +
+
+
+
+
+ + ☆ Gamba: Marry Gaussian Splatting with Mamba for single view 3D + reconstruction + + +
+ We tackle the challenge of efficiently reconstructing a 3D asset from a +single image with growing demands for automated 3D content creation pipelines. +Previous methods primarily rely on Score Distillation Sampling (SDS) and Neural +Radiance Fields (NeRF). Despite their significant success, these approaches +encounter practical limitations due to lengthy optimization and considerable +memory usage. In this report, we introduce Gamba, an end-to-end amortized 3D +reconstruction model from single-view images, emphasizing two main insights: +(1) 3D representation: leveraging a large number of 3D Gaussians for an +efficient 3D Gaussian splatting process; (2) Backbone design: introducing a +Mamba-based sequential network that facilitates context-dependent reasoning and +linear scalability with the sequence (token) length, accommodating a +substantial number of Gaussians. Gamba incorporates significant advancements in +data preprocessing, regularization design, and training methodologies. We +assessed Gamba against existing optimization-based and feed-forward 3D +generation approaches using the real-world scanned OmniObject3D dataset. Here, +Gamba demonstrates competitive generation capabilities, both qualitatively and +quantitatively, while achieving remarkable speed, approximately 0.6 second on a +single NVIDIA A100 GPU. + +
+
+
+
+
+ + ☆ Object Pose Estimation via the Aggregation of Diffusion Features CVPR2024 + + +
+ Estimating the pose of objects from images is a crucial task of 3D scene +understanding, and recent approaches have shown promising results on very large +benchmarks. However, these methods experience a significant performance drop +when dealing with unseen objects. We believe that it results from the limited +generalizability of image features. To address this problem, we have an +in-depth analysis on the features of diffusion models, e.g. Stable Diffusion, +which hold substantial potential for modeling unseen objects. Based on this +analysis, we then innovatively introduce these diffusion features for object +pose estimation. To achieve this, we propose three distinct architectures that +can effectively capture and aggregate diffusion features of different +granularity, greatly improving the generalizability of object pose estimation. +Our approach outperforms the state-of-the-art methods by a considerable margin +on three popular benchmark datasets, LM, O-LM, and T-LESS. In particular, our +method achieves higher accuracy than the previous best arts on unseen objects: +98.2% vs. 93.5% on Unseen LM, 85.9% vs. 76.3% on Unseen O-LM, showing the +strong generalizability of our method. Our code is released at +https://github.com/Tianfu18/diff-feats-pose. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ SplatFace: Gaussian Splat Face Reconstruction Leveraging an Optimizable + Surface + + +
+ We present SplatFace, a novel Gaussian splatting framework designed for 3D +human face reconstruction without reliance on accurate pre-determined geometry. +Our method is designed to simultaneously deliver both high-quality novel view +rendering and accurate 3D mesh reconstructions. We incorporate a generic 3D +Morphable Model (3DMM) to provide a surface geometric structure, making it +possible to reconstruct faces with a limited set of input images. We introduce +a joint optimization strategy that refines both the Gaussians and the morphable +surface through a synergistic non-rigid alignment process. A novel distance +metric, splat-to-surface, is proposed to improve alignment by considering both +the Gaussian position and covariance. The surface information is also utilized +to incorporate a world-space densification process, resulting in superior +reconstruction quality. Our experimental analysis demonstrates that the +proposed method is competitive with both other Gaussian splatting techniques in +novel view synthesis and other 3D reconstruction methods in producing 3D face +meshes with high geometric precision. + +
+
+
+
+
+ + ☆ ImageNet-D: Benchmarking Neural Network Robustness on Diffusion + Synthetic Object CVPR 2024 + + +
+ We establish rigorous benchmarks for visual perception robustness. Synthetic +images such as ImageNet-C, ImageNet-9, and Stylized ImageNet provide specific +type of evaluation over synthetic corruptions, backgrounds, and textures, yet +those robustness benchmarks are restricted in specified variations and have low +synthetic quality. In this work, we introduce generative model as a data source +for synthesizing hard images that benchmark deep models' robustness. Leveraging +diffusion models, we are able to generate images with more diversified +backgrounds, textures, and materials than any prior work, where we term this +benchmark as ImageNet-D. Experimental results show that ImageNet-D results in a +significant accuracy drop to a range of vision models, from the standard ResNet +visual classifier to the latest foundation models like CLIP and MiniGPT-4, +significantly reducing their accuracy by up to 60\%. Our work suggests that +diffusion models can be an effective source to test vision models. The code and +dataset are available at https://github.com/chenshuang-zhang/imagenet_d. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ ModaLink: Unifying Modalities for Efficient Image-to-PointCloud Place + Recognition + + +
+ Place recognition is an important task for robots and autonomous cars to +localize themselves and close loops in pre-built maps. While single-modal +sensor-based methods have shown satisfactory performance, cross-modal place +recognition that retrieving images from a point-cloud database remains a +challenging problem. Current cross-modal methods transform images into 3D +points using depth estimation for modality conversion, which are usually +computationally intensive and need expensive labeled data for depth +supervision. In this work, we introduce a fast and lightweight framework to +encode images and point clouds into place-distinctive descriptors. We propose +an effective Field of View (FoV) transformation module to convert point clouds +into an analogous modality as images. This module eliminates the necessity for +depth estimation and helps subsequent modules achieve real-time performance. We +further design a non-negative factorization-based encoder to extract mutually +consistent semantic features between point clouds and images. This encoder +yields more distinctive global descriptors for retrieval. Experimental results +on the KITTI dataset show that our proposed methods achieve state-of-the-art +performance while running in real time. Additional evaluation on the HAOMO +dataset covering a 17 km trajectory further shows the practical generalization +capabilities. We have released the implementation of our methods as open source +at: https://github.com/haomo-ai/ModaLink.git. + +
+
+ comment: 8 pages, 11 figures, conference +
+
+
+
+
+ + ☆ Detection of subclinical atherosclerosis by image-based deep learning on + chest x-ray + + +
+ Aims. To develop a deep-learning based system for recognition of subclinical
+atherosclerosis on a plain frontal chest x-ray. Methods and Results. A
+deep-learning algorithm to predict coronary artery calcium (CAC) score (the
+AI-CAC model) was developed on 460 chest x-rays (80% training cohort, 20%
+internal validation cohort) of primary prevention patients (58.4% male, median
+age 63 [51-74] years) with available paired chest x-ray and chest computed
+tomography (CT) indicated for any clinical reason and performed within 3
+months. The CAC score calculated on chest CT was used as ground truth. The
+model was validated on a temporally independent cohort of 90 patients from the
+same institution (external validation). The diagnostic accuracy of the AI-CAC
+model assessed by the area under the curve (AUC) was the primary outcome.
+Overall, median AI-CAC score was 35 (0-388) and 28.9% of patients had no
+AI-CAC. The AUC of the AI-CAC model to identify a CAC>0 was 0.90 in the
+internal validation cohort and 0.77 in the external validation cohort.
+Sensitivity was consistently above 92% in both cohorts. In the overall cohort
+(n=540), among patients with AI-CAC=0, a single ASCVD event occurred, after 4.3
+years. Patients with AI-CAC>0 had significantly higher Kaplan-Meier estimates
+for ASCVD events (13.5% vs. 3.4%, log-rank=0.013). Conclusion. The AI-CAC model
+seems to accurately detect subclinical atherosclerosis on chest x-ray with
+elevated sensitivity, and to predict ASCVD events with elevated negative
+predictive value. Adoption of the AI-CAC model to refine CV risk stratification
+or as an opportunistic screening tool requires prospective evaluation.
+
+
+ comment: Submitted to European Heart Journal - Cardiovascular Imaging Added + also the additional material 44 pages (30 main paper, 14 additional + material), 14 figures (5 main manuscript, 9 additional material) +
+
+
+
+
+ + ☆ A vascular synthetic model for improved aneurysm segmentation and + detection via Deep Neural Networks + + +
+ We hereby present a full synthetic model, able to mimic the various
+constituents of the cerebral vascular tree: the cerebral arteries, the
+bifurcations and the intracranial aneurysms. By building this model, our goal
+was to provide a substantial dataset of brain arteries which could be used by a
+3D Convolutional Neural Network (CNN) to either segment or detect/recognize
+various vascular diseases (such as artery dissection/thrombosis) or even some
+portions of the cerebral vasculature, such as the bifurcations or aneurysms. In
+this study, we will particularly focus on Intra-Cranial Aneurysm (ICA)
+detection and segmentation. The cerebral aneurysms most often occur on a
+particular structure of the vascular tree named the Circle of Willis. Various
+studies have been conducted to detect and monitor the ICAs, and those based on
+Deep Learning (DL) achieve the best performance. Specifically, in this work,
+we propose a full synthetic 3D model able to mimic the brain vasculature as
+acquired by Magnetic Resonance Angiography (MRA), and more particularly the
+Time Of Flight (TOF) principle. Among the various MRI modalities, the MRA-TOF
+provides a relatively good rendering of the blood vessels and is non-invasive
+(no contrast agent injection). Our model has been designed to simultaneously
+mimic the arteries' geometry, the ICA shape and the background noise. The
+geometry of the vascular tree is modeled thanks to an interpolation with 3D
+spline functions, and the statistical properties of the background MRI noise
+are collected from MRA acquisitions and reproduced within the model. In this
+work, we thoroughly describe the synthetic vasculature model, we build up a
+neural network designed for ICA segmentation and detection, and finally, we
+carry out an in-depth evaluation of the performance gap gained thanks to the
+synthetic model data augmentation.
+
+
+
+
+
+ + ☆ Enhancing Manufacturing Quality Prediction Models through the + Integration of Explainability Methods + + +
+ This research presents a method that utilizes explainability techniques to +amplify the performance of machine learning (ML) models in forecasting the +quality of milling processes, as demonstrated in this paper through a +manufacturing use case. The methodology entails the initial training of ML +models, followed by a fine-tuning phase where irrelevant features identified +through explainability methods are eliminated. This procedural refinement +results in performance enhancements, paving the way for potential reductions in +manufacturing costs and a better understanding of the trained ML models. This +study highlights the usefulness of explainability techniques in both explaining +and optimizing predictive models in the manufacturing realm. + +
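+ A small end-to-end sketch of the described loop on synthetic data: train a
+model, rank features with an explainability method (permutation importance as
+a stand-in), drop features below an assumed relevance threshold, and retrain
+on the reduced set. Data, model choice, and threshold are all assumptions.

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

# synthetic stand-in for milling-process sensor features (only 5 of 20 are informative)
X, y = make_classification(n_samples=1000, n_features=20, n_informative=5,
                           n_redundant=2, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

model = RandomForestClassifier(random_state=0).fit(X_tr, y_tr)
print("baseline accuracy:", model.score(X_te, y_te))

# explainability step: rank features, then drop those that barely move the score
imp = permutation_importance(model, X_te, y_te, n_repeats=10, random_state=0)
keep = imp.importances_mean > 0.005                  # assumed relevance threshold
print("kept features:", int(keep.sum()), "of", X.shape[1])

refined = RandomForestClassifier(random_state=0).fit(X_tr[:, keep], y_tr)
print("refined accuracy:", refined.score(X_te[:, keep], y_te))
```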
+
+
+
+
+ + ☆ Towards Image Ambient Lighting Normalization + + +
+ Lighting normalization is a crucial but underexplored restoration task with +broad applications. However, existing works often simplify this task within the +context of shadow removal, limiting the light sources to one and +oversimplifying the scene, thus excluding complex self-shadows and restricting +surface classes to smooth ones. Although promising, such simplifications hinder +generalizability to more realistic settings encountered in daily use. In this +paper, we propose a new challenging task termed Ambient Lighting Normalization +(ALN), which enables the study of interactions between shadows, unifying image +restoration and shadow removal in a broader context. To address the lack of +appropriate datasets for ALN, we introduce the large-scale high-resolution +dataset Ambient6K, comprising samples obtained from multiple light sources and +including self-shadows resulting from complex geometries, which is the first of +its kind. For benchmarking, we select various mainstream methods and rigorously +evaluate them on Ambient6K. Additionally, we propose IFBlend, a novel strong +baseline that maximizes Image-Frequency joint entropy to selectively restore +local areas under different lighting conditions, without relying on shadow +localization priors. Experiments show that IFBlend achieves SOTA scores on +Ambient6K and exhibits competitive performance on conventional shadow removal +benchmarks compared to shadow-specific models with mask priors. The dataset, +benchmark, and code are available at https://github.com/fvasluianu97/IFBlend. + +
+
+
+
+
+ + ☆ Semi-Supervised Learning for Deep Causal Generative Models + + +
+ Developing models that can answer questions of the form "How would $x$ change +if $y$ had been $z$?" is fundamental for advancing medical image analysis. +Training causal generative models that address such counterfactual questions, +though, currently requires that all relevant variables have been observed and +that corresponding labels are available in training data. However, clinical +data may not have complete records for all patients and state of the art causal +generative models are unable to take full advantage of this. We thus develop, +for the first time, a semi-supervised deep causal generative model that +exploits the causal relationships between variables to maximise the use of all +available data. We explore this in the setting where each sample is either +fully labelled or fully unlabelled, as well as the more clinically realistic +case of having different labels missing for each sample. We leverage techniques +from causal inference to infer missing values and subsequently generate +realistic counterfactuals, even for samples with incomplete labels. + +
+
+
+
+
+ + ☆ Mitigating Hallucinations in Large Vision-Language Models with + Instruction Contrastive Decoding + + +
+ Large Vision-Language Models (LVLMs) are increasingly adept at generating +contextually detailed and coherent responses from visual inputs. However, their +application in multimodal decision-making and open-ended generation is hindered +by a notable rate of hallucinations, where generated text inaccurately +represents the visual contents. To address this issue, this paper introduces +the Instruction Contrastive Decoding (ICD) method, a novel approach designed to +reduce hallucinations during LVLM inference. Our method is inspired by our +observation that what we call disturbance instructions significantly exacerbate +hallucinations in multimodal fusion modules. ICD contrasts distributions from +standard and instruction disturbance, thereby increasing alignment uncertainty +and effectively subtracting hallucinated concepts from the original +distribution. Through comprehensive experiments on discriminative benchmarks +(POPE and MME) and a generative benchmark (LLaVa-Bench), we demonstrate that +ICD significantly mitigates both object-level and attribute-level +hallucinations. Moreover, our method not only addresses hallucinations but also +significantly enhances the general perception and recognition capabilities of +LVLMs. + +
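+ Contrastive decoding is commonly implemented by down-weighting what a
+contrast distribution supports at each generation step; the sketch below
+applies that token-level operation to logits obtained under a standard versus
+a disturbance instruction, with an adaptive plausibility cutoff. The weighting
+scheme and the random logits are illustrative assumptions, not the paper's
+exact formulation.

```python
import torch

def icd_step(logits_standard, logits_disturbed, alpha=1.0, plausibility=0.1):
    """One decoding step: amplify tokens the standard instruction supports over the disturbed one."""
    lp_std = logits_standard.log_softmax(-1)
    lp_dis = logits_disturbed.log_softmax(-1)
    # alpha controls how strongly mass favored under the disturbance is subtracted
    scores = (1 + alpha) * lp_std - alpha * lp_dis
    # adaptive plausibility: only consider tokens the standard model itself finds likely
    cutoff = lp_std.max(-1, keepdim=True).values + torch.log(torch.tensor(plausibility))
    scores = scores.masked_fill(lp_std < cutoff, float("-inf"))
    return scores.argmax(-1)

vocab = 32000
logits_standard = torch.randn(1, vocab)     # LVLM logits under the original instruction
logits_disturbed = torch.randn(1, vocab)    # LVLM logits under a disturbance instruction
print(icd_step(logits_standard, logits_disturbed))
```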
+
+
+
+
+ + ☆ Bringing Textual Prompt to AI-Generated Image Quality Assessment ICME2024 + + +
+ AI-Generated Images (AGIs) have inherent multimodal nature. Unlike +traditional image quality assessment (IQA) on natural scenarios, AGIs quality +assessment (AGIQA) takes the correspondence of image and its textual prompt +into consideration. This is coupled in the ground truth score, which confuses +the unimodal IQA methods. To solve this problem, we introduce IP-IQA (AGIs +Quality Assessment via Image and Prompt), a multimodal framework for AGIQA via +corresponding image and prompt incorporation. Specifically, we propose a novel +incremental pretraining task named Image2Prompt for better understanding of +AGIs and their corresponding textual prompts. An effective and efficient +image-prompt fusion module, along with a novel special [QA] token, are also +applied. Both are plug-and-play and beneficial for the cooperation of image and +its corresponding prompt. Experiments demonstrate that our IP-IQA achieves the +state-of-the-art on AGIQA-1k and AGIQA-3k datasets. Code will be available. + +
+
+ comment: 6 pages, 3 figures, accepted by ICME2024 +
+
+
+
+
+ + ☆ SAT-NGP : Unleashing Neural Graphics Primitives for Fast Relightable + Transient-Free 3D reconstruction from Satellite Imagery + + +
+ Current stereo-vision pipelines produce high-accuracy 3D reconstructions when
+using multiple pairs or triplets of satellite images. However, these pipelines
+are sensitive to the changes between images that can occur as a result of
+multi-date acquisitions. Such variations are mainly due to variable shadows,
+reflections and transient objects (cars, vegetation). To take such changes into
+account, Neural Radiance Fields (NeRF) have recently been applied to multi-date
+satellite imagery. However, neural methods are very compute-intensive, taking
+dozens of hours to learn, compared with minutes for standard stereo-vision
+pipelines. Following the ideas of Instant Neural Graphics Primitives, we
+propose to use an efficient sampling strategy and multi-resolution hash
+encoding to accelerate the learning. Our model, Satellite Neural Graphics
+Primitives (SAT-NGP), decreases the learning time to 15 minutes while
+maintaining the quality of the 3D reconstruction.
+
+
+ comment: 5 pages, 3 figures, 1 table; Accepted to International Geoscience and + Remote Sensing Symposium (IGARSS) 2024; Code available at + https://github.com/Ellimac0/SAT-NGP +
+
+
+
+
+ + ☆ Dense Vision Transformer Compression with Few Samples CVPR 2024 + + +
+ Few-shot model compression aims to compress a large model into a more compact +one with only a tiny training set (even without labels). Block-level pruning +has recently emerged as a leading technique in achieving high accuracy and low +latency in few-shot CNN compression. But, few-shot compression for Vision +Transformers (ViT) remains largely unexplored, which presents a new challenge. +In particular, the issue of sparse compression exists in traditional CNN +few-shot methods, which can only produce very few compressed models of +different model sizes. This paper proposes a novel framework for few-shot ViT +compression named DC-ViT. Instead of dropping the entire block, DC-ViT +selectively eliminates the attention module while retaining and reusing +portions of the MLP module. DC-ViT enables dense compression, which outputs +numerous compressed models that densely populate the range of model complexity. +DC-ViT outperforms state-of-the-art few-shot compression methods by a +significant margin of 10 percentage points, along with lower latency in the +compression of ViT and its variants. + +
+
+ comment: Accepted to CVPR 2024. Note: Jianxin Wu is a contributing author for + the arXiv version of this paper but is not listed as an author in the CVPR + version due to his role as Program Chair +
+
+
+
+
+ + ☆ Annolid: Annotate, Segment, and Track Anything You Need + + +
+ Annolid is a deep learning-based software package designed for the +segmentation, labeling, and tracking of research targets within video files, +focusing primarily on animal behavior analysis. Based on state-of-the-art +instance segmentation methods, Annolid now harnesses the Cutie video object +segmentation model to achieve resilient, markerless tracking of multiple +animals from single annotated frames, even in environments in which they may be +partially or entirely concealed by environmental features or by one another. +Our integration of Segment Anything and Grounding-DINO strategies additionally +enables the automatic masking and segmentation of recognizable animals and +objects by text command, removing the need for manual annotation. Annolid's +comprehensive approach to object segmentation flexibly accommodates a broad +spectrum of behavior analysis applications, enabling the classification of +diverse behavioral states such as freezing, digging, pup huddling, and social +interactions in addition to the tracking of animals and their body parts. + +
+
+
+
+
+ + ☆ Deep Learning for Robust and Explainable Models in Computer Vision + + +
+ Recent breakthroughs in machine and deep learning (ML and DL) research have
+provided excellent tools for leveraging enormous amounts of data and optimizing
+huge models with millions of parameters to obtain accurate networks for image
+processing. These developments open up tremendous opportunities for using
+artificial intelligence (AI) in automation and in the human-assisted AI
+industry. However, as more and more models are deployed and used in practice,
+many challenges have emerged. This thesis presents various approaches that
+address robustness and explainability challenges for using ML and DL in
+practice.
+ Robustness and reliability are critical components of any model before
+certification and deployment in practice. Deep convolutional neural networks
+(CNNs) exhibit vulnerability to transformations of their inputs, such as
+rotation and scaling, or intentional manipulations as described in the
+adversarial attack literature. In addition, building trust in AI-based models
+requires a better understanding of current models and developing methods that
+are more explainable and interpretable a priori.
+ This thesis presents developments in computer vision models' robustness and
+explainability. Furthermore, it offers an example of using vision models'
+feature response visualization (model interpretations) to improve robustness,
+even though interpretability and robustness appear unrelated in prior research.
+Besides methodological developments for robust and explainable vision models, a
+key message of this thesis is that model interpretation techniques can serve as
+a tool for understanding vision models and improving their design and
+robustness. In addition to the theoretical developments, this thesis
+demonstrates several applications of ML and DL in different contexts, such as
+medical imaging and affective computing.
+
+
+ comment: 150 pages, 37 figures, 12 tables +
+
+
+
+
+ + ☆ InstructBrush: Learning Attention-based Instruction Optimization for + Image Editing + + +
+ In recent years, instruction-based image editing methods have garnered
+significant attention. However, despite encompassing a wide range of editing
+priors, these methods struggle with editing tasks that are difficult to
+describe accurately through language. We propose InstructBrush, an inversion
+method for instruction-based image editing methods that bridges this gap. It
+extracts editing effects from exemplar image pairs as editing instructions,
+which are then applied for image editing. Two key techniques are introduced
+into InstructBrush, Attention-based Instruction Optimization and
+Transformation-oriented Instruction Initialization, to address the limitations
+of previous methods in terms of inversion effects and instruction
+generalization. To explore the ability of instruction inversion methods to
+guide image editing in open scenarios, we establish a Transformation-Oriented
+Paired Benchmark (TOP-Bench), which contains a rich set of scenes and editing
+types. The creation of this benchmark paves the way for further exploration of
+instruction inversion. Quantitatively and qualitatively, our approach achieves
+superior performance in editing and is more semantically consistent with the
+target editing effects.
+
+
+ comment: Project Page: https://royzhao926.github.io/InstructBrush/ +
+
+
+
+
+ + ☆ Addressing Data Annotation Challenges in Multiple Sensors: A Solution + for Scania Collected Datasets + + +
+ Data annotation in autonomous vehicles is a critical step in the development +of Deep Neural Network (DNN) based models or the performance evaluation of the +perception system. This often takes the form of adding 3D bounding boxes on +time-sequential and registered series of point-sets captured from active +sensors like Light Detection and Ranging (LiDAR) and Radio Detection and +Ranging (RADAR). When annotating multiple active sensors, there is a need to +motion compensate and translate the points to a consistent coordinate frame and +timestamp respectively. However, highly dynamic objects pose a unique +challenge, as they can appear at different timestamps in each sensor's data. +Without knowing the speed of the objects, their position appears to be +different in different sensor outputs. Thus, even after motion compensation, +highly dynamic objects are not matched from multiple sensors in the same frame, +and human annotators struggle to add unique bounding boxes that capture all +objects. This article focuses on addressing this challenge, primarily within +the context of Scania collected datasets. The proposed solution takes a track +of an annotated object as input and uses the Moving Horizon Estimation (MHE) to +robustly estimate its speed. The estimated speed profile is utilized to correct +the position of the annotated box and add boxes to object clusters missed by +the original annotation. + +
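+
+ The abstract does not spell out the Moving Horizon Estimation formulation, so
+the following is only a toy stand-in: a constant-velocity least-squares fit
+over a short horizon of timestamped object centres, which is the kind of speed
+estimate that could then be used to shift boxes to a common timestamp.
+
+import numpy as np
+
+def estimate_speed(timestamps, positions):
+    # Fit p(t) = p0 + v * t in the least-squares sense over the horizon.
+    t = np.asarray(timestamps, dtype=float)
+    t -= t[0]
+    A = np.stack([np.ones_like(t), t], axis=1)          # design matrix [1, t]
+    coeffs, *_ = np.linalg.lstsq(A, np.asarray(positions, dtype=float), rcond=None)
+    return coeffs[1]                                     # row 1 holds the velocity (vx, vy)
+
+# Horizon of 5 annotated frames; the object moves at roughly 10 m/s along x.
+ts = [0.0, 0.1, 0.2, 0.3, 0.4]
+ps = [[0.0, 0.0], [1.0, 0.0], [2.1, 0.1], [3.0, 0.0], [4.0, 0.1]]
+print(estimate_speed(ts, ps))  # approx [10, 0]
+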
+
+ comment: Accepted to European Control Conference 2024 +
+
+
+
+
+ + ☆ Transformers-based architectures for stroke segmentation: A review + + +
+ Stroke remains a significant global health concern, necessitating precise and
+efficient diagnostic tools for timely intervention and improved patient
+outcomes. The emergence of deep learning methodologies has transformed the
+landscape of medical image analysis. Recently, Transformers, initially designed
+for natural language processing, have exhibited remarkable capabilities in
+various computer vision applications, including medical image analysis. This
+comprehensive review aims to provide an in-depth exploration of the
+cutting-edge Transformer-based architectures applied in the context of stroke
+segmentation. It commences with an exploration of stroke pathology, imaging
+modalities, and the challenges associated with accurate diagnosis and
+segmentation. Subsequently, the review delves into the fundamental ideas of
+Transformers, offering detailed insights into their architectural intricacies
+and the underlying mechanisms that empower them to effectively capture complex
+spatial information within medical images. The existing literature is
+systematically categorized and analyzed, discussing various approaches that
+leverage Transformers for stroke segmentation. A critical assessment is
+provided, highlighting the strengths and limitations of these methods,
+including considerations of performance and computational efficiency.
+Additionally, this review explores potential avenues for future research and
+development.
+
+
+
+
+
+ + ☆ FlexEdit: Flexible and Controllable Diffusion-based Object-centric Image + Editing + + +
+ Our work addresses limitations seen in previous approaches for object-centric +editing problems, such as unrealistic results due to shape discrepancies and +limited control in object replacement or insertion. To this end, we introduce +FlexEdit, a flexible and controllable editing framework for objects where we +iteratively adjust latents at each denoising step using our FlexEdit block. +Initially, we optimize latents at test time to align with specified object +constraints. Then, our framework employs an adaptive mask, automatically +extracted during denoising, to protect the background while seamlessly blending +new content into the target image. We demonstrate the versatility of FlexEdit +in various object editing tasks and curate an evaluation test suite with +samples from both real and synthetic images, along with novel evaluation +metrics designed for object-centric editing. We conduct extensive experiments +on different editing scenarios, demonstrating the superiority of our editing +framework over recent advanced text-guided image editing methods. Our project +page is published at https://flex-edit.github.io/. + +
+
+ comment: Our project page: https://flex-edit.github.io/ +
+
+
+
+
+ + ☆ RAP: Retrieval-Augmented Planner for Adaptive Procedure Planning in + Instructional Videos + + +
+ Procedure Planning in instructional videos entails generating a sequence of
+action steps based on visual observations of the initial and target states.
+Despite the rapid progress in this task, there remain several critical
+challenges to be solved: (1) Adaptive procedures: Prior works hold an
+unrealistic assumption that the number of action steps is known and fixed,
+leading to non-generalizable models in real-world scenarios where the sequence
+length varies. (2) Temporal relation: Understanding the step temporal relation
+knowledge is essential in producing reasonable and executable plans. (3)
+Annotation cost: Annotating instructional videos with step-level labels (i.e.,
+timestamp) or sequence-level labels (i.e., action category) is demanding and
+labor-intensive, limiting generalizability to large-scale datasets. In this
+work, we propose a new and practical setting, called adaptive procedure
+planning in instructional videos, where the procedure length is not fixed or
+pre-determined. To address these challenges, we introduce the
+Retrieval-Augmented Planner (RAP) model. Specifically, for adaptive procedures,
+RAP adaptively determines the conclusion of actions using an auto-regressive
+model architecture. For temporal relation, RAP establishes an external memory
+module to explicitly retrieve the most relevant state-action pairs from the
+training videos and revises the generated procedures. To tackle the high
+annotation cost, RAP adopts a weakly supervised learning scheme to expand the
+training dataset to other task-relevant, unannotated videos by generating
+pseudo labels for action steps. Experiments on CrossTask and COIN benchmarks
+show the superiority of RAP over traditional fixed-length models, establishing
+it as a strong baseline solution for adaptive procedure planning.
+
+
+ comment: 23 pages, 6 figures, 12 tables +
+
+
+
+
+ + ☆ Homogeneous Tokenizer Matters: Homogeneous Visual Tokenizer for Remote + Sensing Image Understanding + + +
+ The tokenizer, as one of the fundamental components of large models, has long
+been overlooked or even misunderstood in visual tasks. One key factor in the
+great comprehension power of large language models is that natural language
+tokenizers utilize meaningful words or subwords as the basic elements of
+language. In contrast, mainstream visual tokenizers, represented by patch-based
+methods such as Patch Embed, rely on meaningless rectangular patches as basic
+elements of vision, which cannot serve as effectively as words or subwords in
+language. Starting from the essence of the tokenizer, we define semantically
+independent regions (SIRs) for vision and design a simple HOmogeneous visual
+tOKenizer: HOOK. HOOK mainly consists of two modules: the Object Perception
+Module (OPM) and the Object Vectorization Module (OVM). To achieve homogeneity,
+the OPM splits the image into 4*4 pixel seeds and then utilizes the attention
+mechanism to perceive SIRs. The OVM employs cross-attention to merge seeds
+within the same SIR. To achieve adaptability, the OVM defines a variable number
+of learnable vectors as cross-attention queries, allowing for the adjustment of
+token quantity. We conducted experiments on the NWPU-RESISC45 and WHU-RS19
+classification datasets and the GID5 segmentation dataset, covering sparse and
+dense tasks. The results demonstrate that the visual tokens obtained by HOOK
+correspond to individual objects, which confirms homogeneity. HOOK outperforms
+Patch Embed by 6\% and 10\% on the two tasks and achieves state-of-the-art
+performance compared to the baselines used for comparison. Compared to Patch
+Embed, which requires more than one hundred tokens for one image, HOOK requires
+only 6 and 8 tokens for sparse and dense tasks, respectively, resulting in
+efficiency improvements of 1.5 to 2.8 times. The code is available at
+https://github.com/GeoX-Lab/Hook.
+
+
+ comment: 20 pages, 8 figures, 6 tables +
+
+
+
+
+ + ☆ Users prefer Jpegli over same-sized libjpeg-turbo or MozJPEG + + +
+ We performed pairwise comparisons by human raters of JPEG images from +MozJPEG, libjpeg-turbo and our new Jpegli encoder. When compressing images at a +quality similar to libjpeg-turbo quality 95, the Jpegli images were 54% likely +to be preferred over both libjpeg-turbo and MozJPEG images, but used only 2.8 +bits per pixel compared to libjpeg-turbo and MozJPEG that used 3.8 and 3.5 bits +per pixel respectively. The raw ratings and source images are publicly +available for further analysis and study. + +
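+
+ For a quick sense of scale, the reported bits-per-pixel figures translate
+into the following relative file-size savings (a simple ratio computed from the
+numbers above, not an additional measurement from the study):
+
+jpegli, libjpeg_turbo, mozjpeg = 2.8, 3.8, 3.5  # bits per pixel, as reported
+print(f"vs libjpeg-turbo: {1 - jpegli / libjpeg_turbo:.0%} smaller")  # ~26% smaller
+print(f"vs MozJPEG:       {1 - jpegli / mozjpeg:.0%} smaller")        # ~20% smaller
+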
+
+
+
+
+ + ☆ The Impact of Uniform Inputs on Activation Sparsity and Energy-Latency + Attacks in Computer Vision SP 2024 + + +
+ Resource efficiency plays an important role in machine learning nowadays.
+Energy consumption and decision latency are two critical aspects of a
+sustainable and practical application. Unfortunately, energy consumption and
+decision latency are not robust against adversaries. Researchers have recently
+demonstrated that attackers can compute and submit so-called sponge examples at
+inference time to increase the energy consumption and decision latency of
+neural networks. In computer vision, the proposed strategy crafts inputs with
+less activation sparsity, which could otherwise be used to accelerate the
+computation. In this paper, we analyze the mechanism by which these
+energy-latency attacks reduce activation sparsity. In particular, we find that
+input uniformity is a key enabler. A uniform image, that is, an image with
+mostly flat, uniformly colored surfaces, triggers more activations due to a
+specific interplay of convolution, batch normalization, and ReLU activation.
+Based on these insights, we propose two new simple, yet effective strategies
+for crafting sponge examples: sampling images from a probability distribution
+and identifying dense, yet inconspicuous inputs in natural datasets. We
+empirically examine our findings in a comprehensive evaluation with multiple
+image classification models and show that our attack achieves the same sparsity
+effect as prior sponge-example methods, but at a fraction of the computational
+effort. We also show that our sponge examples transfer between different neural
+networks. Finally, we discuss how our findings can be used for good, namely to
+improve efficiency by increasing sparsity.
+
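+
+ A small experiment in the spirit of the observation above can be run with an
+off-the-shelf classifier: compare the fraction of zero ReLU outputs for a flat,
+uniformly coloured image against a noise image. This is only a sketch of the
+measurement, not the paper's attack; the model choice and inputs are arbitrary.
+
+import torch
+from torchvision.models import resnet18
+
+def relu_sparsity(model, x):
+    # Fraction of ReLU outputs that are exactly zero for input x.
+    zeros, total = 0, 0
+    def hook(_module, _inp, out):
+        nonlocal zeros, total
+        zeros += (out == 0).sum().item()
+        total += out.numel()
+    handles = [m.register_forward_hook(hook)
+               for m in model.modules() if isinstance(m, torch.nn.ReLU)]
+    with torch.no_grad():
+        model(x)
+    for h in handles:
+        h.remove()
+    return zeros / total
+
+model = resnet18(weights="IMAGENET1K_V1").eval()
+uniform = torch.full((1, 3, 224, 224), 0.5)  # flat, uniformly coloured input
+noise = torch.rand(1, 3, 224, 224)
+print("uniform:", relu_sparsity(model, uniform))  # expected lower sparsity, per the paper's finding
+print("noise:  ", relu_sparsity(model, noise))
+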
+
+ comment: Accepted at the DLSP 2024 +
+
+
+
+
+ + ☆ HandBooster: Boosting 3D Hand-Mesh Reconstruction by Conditional + Synthesis and Sampling of Hand-Object Interactions + + +
+ Reconstructing 3D hand mesh robustly from a single image is very challenging, +due to the lack of diversity in existing real-world datasets. While data +synthesis helps relieve the issue, the syn-to-real gap still hinders its usage. +In this work, we present HandBooster, a new approach to uplift the data +diversity and boost the 3D hand-mesh reconstruction performance by training a +conditional generative space on hand-object interactions and purposely sampling +the space to synthesize effective data samples. First, we construct versatile +content-aware conditions to guide a diffusion model to produce realistic images +with diverse hand appearances, poses, views, and backgrounds; favorably, +accurate 3D annotations are obtained for free. Then, we design a novel +condition creator based on our similarity-aware distribution sampling +strategies to deliberately find novel and realistic interaction poses that are +distinctive from the training set. Equipped with our method, several baselines +can be significantly improved beyond the SOTA on the HO3D and DexYCB +benchmarks. Our code will be released on +https://github.com/hxwork/HandBooster_Pytorch. + +
+
+
+
+
+ + ☆ Artifact Reduction in 3D and 4D Cone-beam Computed Tomography Images + with Deep Learning -- A Review + + +
+ Deep learning based approaches have been used to improve image quality in +cone-beam computed tomography (CBCT), a medical imaging technique often used in +applications such as image-guided radiation therapy, implant dentistry or +orthopaedics. In particular, while deep learning methods have been applied to +reduce various types of CBCT image artifacts arising from motion, metal +objects, or low-dose acquisition, a comprehensive review summarizing the +successes and shortcomings of these approaches, with a primary focus on the +type of artifacts rather than the architecture of neural networks, is lacking +in the literature. In this review, the data generation and simulation +pipelines, and artifact reduction techniques are specifically investigated for +each type of artifact. We provide an overview of deep learning techniques that +have successfully been shown to reduce artifacts in 3D, as well as in +time-resolved (4D) CBCT through the use of projection- and/or volume-domain +optimizations, or by introducing neural networks directly within the CBCT +reconstruction algorithms. Research gaps are identified to suggest avenues for +future exploration. One of the key findings of this work is an observed trend +towards the use of generative models including GANs and score-based or +diffusion models, accompanied with the need for more diverse and open training +datasets and simulations. + +
+
+ comment: 16 pages, 4 figures, 1 Table, published in IEEE Access Journal +
+
+
+
+
+ + ☆ CosalPure: Learning Concept from Group Images for Robust Co-Saliency + Detection + + +
+ Co-salient object detection (CoSOD) aims to identify the common and salient +(usually in the foreground) regions across a given group of images. Although +achieving significant progress, state-of-the-art CoSODs could be easily +affected by some adversarial perturbations, leading to substantial accuracy +reduction. The adversarial perturbations can mislead CoSODs but do not change +the high-level semantic information (e.g., concept) of the co-salient objects. +In this paper, we propose a novel robustness enhancement framework by first +learning the concept of the co-salient objects based on the input group images +and then leveraging this concept to purify adversarial perturbations, which are +subsequently fed to CoSODs for robustness enhancement. Specifically, we propose +CosalPure containing two modules, i.e., group-image concept learning and +concept-guided diffusion purification. For the first module, we adopt a +pre-trained text-to-image diffusion model to learn the concept of co-salient +objects within group images where the learned concept is robust to adversarial +examples. For the second module, we map the adversarial image to the latent +space and then perform diffusion generation by embedding the learned concept +into the noise prediction function as an extra condition. Our method can +effectively alleviate the influence of the SOTA adversarial attack containing +different adversarial patterns, including exposure and noise. The extensive +results demonstrate that our method could enhance the robustness of CoSODs +significantly. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Attention Calibration for Disentangled Text-to-Image Personalization CVPR 2024 + + +
+ Recent thrilling progress in large-scale text-to-image (T2I) models has +unlocked unprecedented synthesis quality of AI-generated content (AIGC) +including image generation, 3D and video composition. Further, personalized +techniques enable appealing customized production of a novel concept given only +several images as reference. However, an intriguing problem persists: Is it +possible to capture multiple, novel concepts from one single reference image? +In this paper, we identify that existing approaches fail to preserve visual +consistency with the reference image and eliminate cross-influence from +concepts. To alleviate this, we propose an attention calibration mechanism to +improve the concept-level understanding of the T2I model. Specifically, we +first introduce new learnable modifiers bound with classes to capture +attributes of multiple concepts. Then, the classes are separated and +strengthened following the activation of the cross-attention operation, +ensuring comprehensive and self-contained concepts. Additionally, we suppress +the attention activation of different classes to mitigate mutual influence +among concepts. Together, our proposed method, dubbed DisenDiff, can learn +disentangled multiple concepts from one single image and produce novel +customized images with learned concepts. We demonstrate that our method +outperforms the current state of the art in both qualitative and quantitative +evaluations. More importantly, our proposed techniques are compatible with LoRA +and inpainting pipelines, enabling more interactive experiences. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ OrCo: Towards Better Generalization via Orthogonality and Contrast for + Few-Shot Class-Incremental Learning + + +
+ Few-Shot Class-Incremental Learning (FSCIL) introduces a paradigm in which +the problem space expands with limited data. FSCIL methods inherently face the +challenge of catastrophic forgetting as data arrives incrementally, making +models susceptible to overwriting previously acquired knowledge. Moreover, +given the scarcity of labeled samples available at any given time, models may +be prone to overfitting and find it challenging to strike a balance between +extensive pretraining and the limited incremental data. To address these +challenges, we propose the OrCo framework built on two core principles: +features' orthogonality in the representation space, and contrastive learning. +In particular, we improve the generalization of the embedding space by +employing a combination of supervised and self-supervised contrastive losses +during the pretraining phase. Additionally, we introduce OrCo loss to address +challenges arising from data limitations during incremental sessions. Through +feature space perturbations and orthogonality between classes, the OrCo loss +maximizes margins and reserves space for the following incremental data. This, +in turn, ensures the accommodation of incoming classes in the feature space +without compromising previously acquired knowledge. Our experimental results +showcase state-of-the-art performance across three benchmark datasets, +including mini-ImageNet, CIFAR100, and CUB datasets. Code is available at +https://github.com/noorahmedds/OrCo + +
+
+
+
+
+ + ☆ A Semi-supervised Nighttime Dehazing Baseline with Spatial-Frequency + Aware and Realistic Brightness Constraint CVPR2024 + + +
+ Existing research based on deep learning has extensively explored the problem
+of daytime image dehazing. However, few studies have considered the
+characteristics of nighttime hazy scenes. There are two distinctions between
+nighttime and daytime haze. First, there may be multiple active colored light
+sources with lower illumination intensity in nighttime scenes, which may cause
+haze, glow and noise with localized, coupled and frequency-inconsistent
+characteristics. Second, due to the domain discrepancy between simulated and
+real-world data, unrealistic brightness may occur when applying a dehazing
+model trained on simulated data to real-world data. To address these two
+issues, we propose a semi-supervised model for real-world nighttime dehazing.
+First, spatial attention and frequency spectrum filtering are implemented as a
+spatial-frequency domain information interaction module to handle the first
+issue. Second, a pseudo-label-based retraining strategy and a local
+window-based brightness loss are designed for the semi-supervised training
+process to suppress haze and glow while achieving realistic brightness.
+Experiments on public benchmarks validate the effectiveness of the proposed
+method and its superiority over state-of-the-art methods. The source code and
+supplementary materials are available at
+https://github.com/Xiaofeng-life/SFSNiD.
+
+
+ comment: This paper is accepted by CVPR2024 +
+
+
+
+
+ + ☆ Efficient Heatmap-Guided 6-Dof Grasp Detection in Cluttered Scenes + + +
+ Fast and robust object grasping in clutter is a crucial component of +robotics. Most current works resort to the whole observed point cloud for 6-Dof +grasp generation, ignoring the guidance information excavated from global +semantics, thus limiting high-quality grasp generation and real-time +performance. In this work, we show that the widely used heatmaps are +underestimated in the efficiency of 6-Dof grasp generation. Therefore, we +propose an effective local grasp generator combined with grasp heatmaps as +guidance, which infers in a global-to-local semantic-to-point way. +Specifically, Gaussian encoding and the grid-based strategy are applied to +predict grasp heatmaps as guidance to aggregate local points into graspable +regions and provide global semantic information. Further, a novel non-uniform +anchor sampling mechanism is designed to improve grasp accuracy and diversity. +Benefiting from the high-efficiency encoding in the image space and focusing on +points in local graspable regions, our framework can perform high-quality grasp +detection in real-time and achieve state-of-the-art results. In addition, real +robot experiments demonstrate the effectiveness of our method with a success +rate of 94% and a clutter completion rate of 100%. Our code is available at +https://github.com/THU-VCLab/HGGD. + +
+
+ comment: Extensive results on GraspNet-1B dataset +
+
+
+
+
+ + ☆ Language Plays a Pivotal Role in the Object-Attribute Compositional + Generalization of CLIP + + +
+ Vision-language models, such as CLIP, have shown promising
+Out-of-Distribution (OoD) generalization under various types of distribution
+shifts. Recent studies have attempted to investigate the leading cause of this
+capability. In this work, we follow the same path, but focus on a specific type
+of OoD data - images with novel compositions of attribute-object pairs - and
+study whether such models can successfully classify those images into
+composition classes. We carefully designed an authentic image test dataset
+called ImageNet-AO, consisting of attributes for objects that are unlikely to
+be encountered in the CLIP training sets. We found that CLIPs trained with
+large datasets such as OpenAI CLIP, LAION-400M, and LAION-2B show
+orders-of-magnitude improvement in effective compositional OoD generalization
+compared to both supervised models and CLIPs trained with smaller datasets,
+such as CC-12M and YFCC-15M. Our results provide evidence that the scale and
+diversity of training data and language supervision play a key role in
+unlocking the compositional generalization abilities of vision-language models.
+
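+
+ The evaluation protocol described above boils down to zero-shot
+classification over attribute-object composition prompts. The sketch below
+shows that step with the open_clip library; the class list and image path are
+placeholders, not ImageNet-AO itself.
+
+import torch
+import open_clip
+from PIL import Image
+
+model, _, preprocess = open_clip.create_model_and_transforms(
+    "ViT-B-32", pretrained="laion2b_s34b_b79k")
+tokenizer = open_clip.get_tokenizer("ViT-B-32")
+
+# Hypothetical attribute-object composition classes.
+classes = ["a photo of a wooden spoon", "a photo of a metal spoon",
+           "a photo of a wooden chair", "a photo of a metal chair"]
+
+image = preprocess(Image.open("example.jpg")).unsqueeze(0)  # placeholder image path
+text = tokenizer(classes)
+with torch.no_grad():
+    img_feat = model.encode_image(image)
+    txt_feat = model.encode_text(text)
+    img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
+    txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
+    probs = (100.0 * img_feat @ txt_feat.T).softmax(dim=-1)
+print(dict(zip(classes, probs[0].tolist())))
+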
+
+ comment: Oral accepted at OODCV 2023(http://www.ood-cv.org) +
+
+
+
+
+ + ☆ CT-3DFlow : Leveraging 3D Normalizing Flows for Unsupervised Detection + of Pathological Pulmonary CT scans + + +
+ Unsupervised pathology detection can be implemented by training a model on
+healthy data only and measuring the deviation from the training set upon
+inference, for example with CNN-based feature extraction and one-class
+classifiers, or reconstruction-score-based methods such as AEs, GANs and
+Diffusion models. Normalizing Flows (NF) have the ability to directly learn the
+probability distribution of training examples through an invertible
+architecture. We leverage this property in a novel 3D NF-based model named
+CT-3DFlow, specifically tailored for patient-level pulmonary pathology
+detection in chest CT data. Our model is trained unsupervised on healthy 3D
+pulmonary CT patches, and detects deviations from its log-likelihood
+distribution as anomalies. We aggregate patch-level likelihood values from a
+patient's CT scan to provide a patient-level 'normal'/'abnormal' prediction.
+Out-of-distribution detection performance is evaluated using expert annotations
+on a separate chest CT test dataset, where our model outperforms other
+state-of-the-art methods.
+
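+
+ The abstract does not state how patch likelihoods are aggregated, so the
+sketch below simply uses the mean negative log-likelihood as the patient-level
+score with a validation-calibrated threshold; a low quantile or the minimum
+would be equally plausible choices.
+
+import numpy as np
+
+def patient_level_score(patch_log_likelihoods):
+    # Lower likelihood under the healthy-only flow -> higher anomaly score.
+    return -np.mean(patch_log_likelihoods)
+
+rng = np.random.default_rng(0)
+healthy_scan = rng.normal(-1200.0, 30.0, size=400)                # per-patch log p(x)
+abnormal_scan = np.concatenate([rng.normal(-1200.0, 30.0, 380),
+                                rng.normal(-2000.0, 100.0, 20)])  # a few unlikely patches
+threshold = 1220.0  # would be calibrated on validation data
+for name, scan in [("healthy", healthy_scan), ("abnormal", abnormal_scan)]:
+    score = patient_level_score(scan)
+    print(name, round(score, 1), "abnormal" if score > threshold else "normal")
+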
+
+
+
+
+ + ☆ ParCo: Part-Coordinating Text-to-Motion Synthesis + + +
+ We study a challenging task: text-to-motion synthesis, aiming to generate
+motions that align with textual descriptions and exhibit coordinated movements.
+Currently, part-based methods introduce part partition into the motion
+synthesis process to achieve finer-grained generation. However, these methods
+encounter challenges such as the lack of coordination between different part
+motions and difficulties for networks to understand part concepts. Moreover,
+introducing finer-grained part concepts poses computational complexity
+challenges. In this paper, we propose Part-Coordinating Text-to-Motion
+Synthesis (ParCo), endowed with enhanced capabilities for understanding part
+motions and communication among different part motion generators, ensuring
+coordinated and fine-grained motion synthesis. Specifically, we discretize
+whole-body motion into multiple part motions to establish the prior concept of
+different parts. Afterward, we employ multiple lightweight generators designed
+to synthesize different part motions and coordinate them through our part
+coordination module. Our approach demonstrates superior performance on common
+benchmarks with economic computations, including HumanML3D and KIT-ML,
+providing substantial evidence of its effectiveness. Code is available at
+https://github.com/qrzou/ParCo.
+
+
+
+
+
+ + ☆ HEMIT: H&E to Multiplex-immunohistochemistry Image Translation with + Dual-Branch Pix2pix Generator + + +
+ Computational analysis of multiplexed immunofluorescence histology data is +emerging as an important method for understanding the tumour micro-environment +in cancer. This work presents HEMIT, a dataset designed for translating +Hematoxylin and Eosin (H&E) sections to multiplex-immunohistochemistry (mIHC) +images, featuring DAPI, CD3, and panCK markers. Distinctively, HEMIT's mIHC +images are multi-component and cellular-level aligned with H&E, enriching +supervised stain translation tasks. To our knowledge, HEMIT is the first +publicly available cellular-level aligned dataset that enables H&E to +multi-target mIHC image translation. This dataset provides the computer vision +community with a valuable resource to develop novel computational methods which +have the potential to gain new insights from H&E slide archives. + We also propose a new dual-branch generator architecture, using residual +Convolutional Neural Networks (CNNs) and Swin Transformers which achieves +better translation outcomes than other popular algorithms. When evaluated on +HEMIT, it outperforms pix2pixHD, pix2pix, U-Net, and ResNet, achieving the +highest overall score on key metrics including the Structural Similarity Index +Measure (SSIM), Pearson correlation score (R), and Peak signal-to-noise Ratio +(PSNR). Additionally, downstream analysis has been used to further validate the +quality of the generated mIHC images. These results set a new benchmark in the +field of stain translation tasks. + +
+
+
+
+
+ + ☆ Direct mineral content prediction from drill core images via transfer + learning + + +
+ Deep subsurface exploration is important for mining, oil and gas industries, +as well as in the assessment of geological units for the disposal of chemical +or nuclear waste, or the viability of geothermal energy systems. Typically, +detailed examinations of subsurface formations or units are performed on +cuttings or core materials extracted during drilling campaigns, as well as on +geophysical borehole data, which provide detailed information about the +petrophysical properties of the rocks. Depending on the volume of rock samples +and the analytical program, the laboratory analysis and diagnostics can be very +time-consuming. This study investigates the potential of utilizing machine +learning, specifically convolutional neural networks (CNN), to assess the +lithology and mineral content solely from analysis of drill core images, aiming +to support and expedite the subsurface geological exploration. The paper +outlines a comprehensive methodology, encompassing data preprocessing, machine +learning methods, and transfer learning techniques. The outcome reveals a +remarkable 96.7% accuracy in the classification of drill core segments into +distinct formation classes. Furthermore, a CNN model was trained for the +evaluation of mineral content using a learning data set from multidimensional +log analysis data (silicate, total clay, carbonate). When benchmarked against +laboratory XRD measurements on samples from the cores, both the advanced +multidimensional log analysis model and the neural network approach developed +here provide equally good performance. This work demonstrates that deep +learning and particularly transfer learning can support extracting +petrophysical properties, including mineral content and formation +classification, from drill core images, thus offering a road map for enhancing +model performance and data set quality in image-based analysis of drill cores. + +
+
+
+
+
+ + ☆ VersaT2I: Improving Text-to-Image Models with Versatile Reward + + +
+ Recent text-to-image (T2I) models have benefited from large-scale and +high-quality data, demonstrating impressive performance. However, these T2I +models still struggle to produce images that are aesthetically pleasing, +geometrically accurate, faithful to text, and of good low-level quality. We +present VersaT2I, a versatile training framework that can boost the performance +with multiple rewards of any T2I model. We decompose the quality of the image +into several aspects such as aesthetics, text-image alignment, geometry, +low-level quality, etc. Then, for every quality aspect, we select high-quality +images in this aspect generated by the model as the training set to finetune +the T2I model using the Low-Rank Adaptation (LoRA). Furthermore, we introduce a +gating function to combine multiple quality aspects, which can avoid conflicts +between different quality aspects. Our method is easy to extend and does not +require any manual annotation, reinforcement learning, or model architecture +changes. Extensive experiments demonstrate that VersaT2I outperforms the +baseline methods across various quality criteria. + +
+
+
+
+
+ + ☆ I2CKD : Intra- and Inter-Class Knowledge Distillation for Semantic + Segmentation + + +
+ This paper proposes a new knowledge distillation method tailored for image +semantic segmentation, termed Intra- and Inter-Class Knowledge Distillation +(I2CKD). The focus of this method is on capturing and transferring knowledge +between the intermediate layers of teacher (cumbersome model) and student +(compact model). For knowledge extraction, we exploit class prototypes derived +from feature maps. To facilitate knowledge transfer, we employ a triplet loss +in order to minimize intra-class variances and maximize inter-class variances +between teacher and student prototypes. Consequently, I2CKD enables the student +to better mimic the feature representation of the teacher for each class, +thereby enhancing the segmentation performance of the compact network. +Extensive experiments on three segmentation datasets, i.e., Cityscapes, Pascal +VOC and CamVid, using various teacher-student network pairs demonstrate the +effectiveness of the proposed method. + +
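+
+ A minimal sketch of the two ingredients named above - class prototypes from
+intermediate feature maps and a triplet loss between teacher and student
+prototypes - is given below. The anchor/positive/negative assignment and the
+masked-average-pooling prototype are assumptions; in practice the student
+features would also be projected to match the teacher's channel width.
+
+import torch
+import torch.nn.functional as F
+
+def class_prototypes(features, labels, num_classes):
+    # Masked average pooling: one prototype per class over the batch.
+    # features: (B, C, H, W); labels: (B, H, W) class indices at the same resolution.
+    protos = []
+    for c in range(num_classes):
+        mask = (labels == c).unsqueeze(1).float()
+        denom = mask.sum().clamp(min=1.0)
+        protos.append((features * mask).sum(dim=(0, 2, 3)) / denom)
+    return torch.stack(protos)                            # (num_classes, C)
+
+def prototype_triplet_loss(student_feat, teacher_feat, labels, num_classes, margin=1.0):
+    # Pull each student prototype towards the matching teacher prototype and
+    # push it away from a (rolled) mismatched teacher class.
+    s = class_prototypes(student_feat, labels, num_classes)
+    t = class_prototypes(teacher_feat, labels, num_classes)
+    return F.triplet_margin_loss(s, t, t.roll(1, dims=0), margin=margin)
+
+stu = torch.randn(2, 64, 32, 32, requires_grad=True)   # toy student features
+tea = torch.randn(2, 64, 32, 32)                        # toy teacher features
+lab = torch.randint(0, 5, (2, 32, 32))
+print(prototype_triplet_loss(stu, tea, lab, num_classes=5))
+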
+
+
+
+
+ + ☆ Modeling uncertainty for Gaussian Splatting + + +
+ We present Stochastic Gaussian Splatting (SGS): the first framework for
+uncertainty estimation using Gaussian Splatting (GS). GS recently advanced the
+novel-view synthesis field by achieving impressive reconstruction quality at a
+fraction of the computational cost of Neural Radiance Fields (NeRF). However,
+contrary to the latter, it still lacks the ability to provide information about
+the confidence associated with its outputs. To address this limitation, in this
+paper, we introduce a Variational Inference-based approach that seamlessly
+integrates uncertainty prediction into the common rendering pipeline of GS.
+Additionally, we introduce the Area Under Sparsification Error (AUSE) as a new
+term in the loss function, enabling optimization of uncertainty estimation
+alongside image reconstruction. Experimental results on the LLFF dataset
+demonstrate that our method outperforms existing approaches in terms of both
+image rendering quality and uncertainty estimation accuracy. Overall, our
+framework equips practitioners with valuable insights into the reliability of
+synthesized views, facilitating safer decision-making in real-world
+applications.
+
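+
+ For reference, the Area Under the Sparsification Error curve mentioned above
+can be computed as follows; this is only the metric on held-out predictions
+(using it directly as a training-loss term, as the abstract describes, would
+need a differentiable formulation).
+
+import numpy as np
+
+def ause(errors, uncertainties, steps=20):
+    # Remove pixels by decreasing predicted uncertainty (prediction curve) or by
+    # decreasing true error (oracle curve); AUSE is the area between the curves.
+    errors = np.asarray(errors, dtype=float).ravel()
+    unc = np.asarray(uncertainties, dtype=float).ravel()
+    by_unc, by_err = np.argsort(-unc), np.argsort(-errors)
+    base = errors.mean()                      # normalise so both curves start at 1
+    fractions = np.linspace(0.0, 0.95, steps)
+    gaps = []
+    for f in fractions:
+        keep = int(round(len(errors) * (1.0 - f)))
+        gaps.append(errors[by_unc[-keep:]].mean() / base
+                    - errors[by_err[-keep:]].mean() / base)
+    return np.trapz(gaps, fractions)
+
+rng = np.random.default_rng(0)
+err = np.abs(rng.normal(size=10_000))
+print("correlated uncertainty:", round(ause(err, err + 0.1 * rng.normal(size=10_000)), 4))
+print("random uncertainty:    ", round(ause(err, rng.random(10_000)), 4))
+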
+
+
+
+
+ + ☆ DiffusionFace: Towards a Comprehensive Dataset for Diffusion-Based Face + Forgery Analysis + + +
+ The rapid progress in deep learning has given rise to hyper-realistic facial
+forgery methods, leading to concerns related to misinformation and security
+risks. Existing face forgery datasets have limitations in generating
+high-quality facial images and addressing the challenges posed by evolving
+generative techniques. To combat this, we present DiffusionFace, the first
+diffusion-based face forgery dataset, covering various forgery categories,
+including unconditional and text-guided facial image generation, Img2Img,
+Inpaint, and diffusion-based facial exchange algorithms. Our DiffusionFace
+dataset stands out with its extensive collection of 11 diffusion models and the
+high quality of the generated images, providing essential metadata and a
+real-world internet-sourced forgery facial image dataset for evaluation.
+Additionally, we provide an in-depth analysis of the data and introduce
+practical evaluation protocols to rigorously assess discriminative models'
+effectiveness in detecting counterfeit facial images, aiming to enhance
+security in facial image authentication processes. The dataset is available for
+download at \url{https://github.com/Rapisurazurite/DiffFace}.
+
+
+
+
+
+ + ☆ Density-guided Translator Boosts Synthetic-to-Real Unsupervised Domain + Adaptive Segmentation of 3D Point Clouds CVPR2024 + + +
+ 3D synthetic-to-real unsupervised domain adaptive segmentation is crucial to +annotating new domains. Self-training is a competitive approach for this task, +but its performance is limited by different sensor sampling patterns (i.e., +variations in point density) and incomplete training strategies. In this work, +we propose a density-guided translator (DGT), which translates point density +between domains, and integrates it into a two-stage self-training pipeline +named DGT-ST. First, in contrast to existing works that simultaneously conduct +data generation and feature/output alignment within unstable adversarial +training, we employ the non-learnable DGT to bridge the domain gap at the input +level. Second, to provide a well-initialized model for self-training, we +propose a category-level adversarial network in stage one that utilizes the +prototype to prevent negative transfer. Finally, by leveraging the designs +above, a domain-mixed self-training method with source-aware consistency loss +is proposed in stage two to narrow the domain gap further. Experiments on two +synthetic-to-real segmentation tasks (SynLiDAR $\rightarrow$ semanticKITTI and +SynLiDAR $\rightarrow$ semanticPOSS) demonstrate that DGT-ST outperforms +state-of-the-art methods, achieving 9.4$\%$ and 4.3$\%$ mIoU improvements, +respectively. Code is available at \url{https://github.com/yuan-zm/DGT-ST}. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ Deep Learning Segmentation and Classification of Red Blood Cells Using a + Large Multi-Scanner Dataset + + +
+ Digital pathology has recently been revolutionized by advancements in +artificial intelligence, deep learning, and high-performance computing. With +its advanced tools, digital pathology can help improve and speed up the +diagnostic process, reduce human errors, and streamline the reporting step. In +this paper, we report a new large red blood cell (RBC) image dataset and +propose a two-stage deep learning framework for RBC image segmentation and +classification. The dataset is a highly diverse dataset of more than 100K RBCs +containing eight different classes. The dataset, which is considerably larger +than any publicly available hematopathology dataset, was labeled independently +by two hematopathologists who also manually created masks for RBC cell +segmentation. Subsequently, in the proposed framework, first, a U-Net model was +trained to achieve automatic RBC image segmentation. Second, an EfficientNetB0 +model was trained to classify RBC images into one of the eight classes using a +transfer learning approach with a 5X2 cross-validation scheme. An IoU of 98.03% +and an average classification accuracy of 96.5% were attained on the test set. +Moreover, we have performed experimental comparisons against several prominent +CNN models. These comparisons show the superiority of the proposed model with a +good balance between performance and computational cost. + +
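+
+ The classification stage of such a two-stage pipeline can be sketched with a
+torchvision EfficientNet-B0 whose head is replaced for the eight RBC classes;
+data loading, the U-Net segmentation stage, and the 5x2 cross-validation are
+omitted, and the hyperparameters are placeholders rather than the paper's.
+
+import torch
+import torch.nn as nn
+from torchvision.models import efficientnet_b0
+
+NUM_CLASSES = 8  # eight RBC classes, as in the dataset described above
+
+def build_rbc_classifier(freeze_backbone=True):
+    # ImageNet-pretrained backbone with a new 8-way classification head.
+    model = efficientnet_b0(weights="IMAGENET1K_V1")
+    if freeze_backbone:
+        for p in model.features.parameters():
+            p.requires_grad = False
+    model.classifier[1] = nn.Linear(model.classifier[1].in_features, NUM_CLASSES)
+    return model
+
+model = build_rbc_classifier()
+optimizer = torch.optim.Adam((p for p in model.parameters() if p.requires_grad), lr=1e-3)
+criterion = nn.CrossEntropyLoss()
+
+# One toy training step on random tensors standing in for cropped RBC images.
+images, labels = torch.randn(4, 3, 224, 224), torch.randint(0, NUM_CLASSES, (4,))
+loss = criterion(model(images), labels)
+loss.backward()
+optimizer.step()
+print(float(loss))
+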
+
+ comment: 15 pages, 12 figures, 8 tables +
+
+
+
+
+ + ☆ DiffStyler: Diffusion-based Localized Image Style Transfer + + +
+ Image style transfer aims to imbue digital imagery with the distinctive
+attributes of style targets, such as colors, brushstrokes, and shapes, whilst
+concurrently preserving the semantic integrity of the content. Despite the
+advancements in arbitrary style transfer methods, a prevalent challenge remains
+the delicate equilibrium between content semantics and style attributes. Recent
+developments in large-scale text-to-image diffusion models have heralded
+unprecedented synthesis capabilities, albeit at the expense of relying on
+extensive and often imprecise textual descriptions to delineate artistic
+styles. Addressing these limitations, this paper introduces DiffStyler, a novel
+approach that facilitates efficient and precise arbitrary image style transfer.
+At the core of DiffStyler lies the utilization of a text-to-image Stable
+Diffusion model-based LoRA to encapsulate the essence of style targets. This
+approach, coupled with strategic cross-LoRA feature and attention injection,
+guides the style transfer process. The foundation of our methodology is rooted
+in the observation that LoRA maintains the spatial feature consistency of UNet,
+a discovery that further inspired the development of a mask-wise style transfer
+technique. This technique employs masks extracted through a pre-trained FastSAM
+model, utilizing mask prompts to facilitate feature fusion during the denoising
+process, thereby enabling localized style transfer that preserves the original
+image's unaffected regions. Moreover, our approach accommodates multiple style
+targets through the use of corresponding masks. Through extensive
+experimentation, we demonstrate that DiffStyler surpasses previous methods in
+achieving a more harmonious balance between content preservation and style
+integration.
+
+
+
+
+
+ + ☆ Scaling Vision-and-Language Navigation With Offline RL + + +
+ The study of vision-and-language navigation (VLN) has typically relied on +expert trajectories, which may not always be available in real-world situations +due to the significant effort required to collect them. On the other hand, +existing approaches to training VLN agents that go beyond available expert data +involve data augmentations or online exploration which can be tedious and +risky. In contrast, it is easy to access large repositories of suboptimal +offline trajectories. Inspired by research in offline reinforcement learning +(ORL), we introduce a new problem setup of VLN-ORL which studies VLN using +suboptimal demonstration data. We introduce a simple and effective +reward-conditioned approach that can account for dataset suboptimality for +training VLN agents, as well as benchmarks to evaluate progress and promote +research in this area. We empirically study various noise models for +characterizing dataset suboptimality among other unique challenges in VLN-ORL +and instantiate it for the VLN$\circlearrowright$BERT and MTVM architectures in +the R2R and RxR environments. Our experiments demonstrate that the proposed +reward-conditioned approach leads to significant performance improvements, even +in complex and intricate environments. + +
+
+ comment: Published in Transactions on Machine Learning Research (04/2024) +
+
+
+
+
+ + ☆ SingularTrajectory: Universal Trajectory Predictor Using Diffusion Model CVPR 2024 + + +
+ There are five types of trajectory prediction tasks: deterministic,
+stochastic, domain adaptation, momentary observation, and few-shot. These
+associated tasks are defined by various factors, such as the length of input
+paths, data split and pre-processing methods. Interestingly, even though they
+commonly take sequential coordinates of observations as input and infer future
+paths in the same coordinates as output, designing specialized architectures
+for each task is still necessary. When such a specialized model is applied to
+the other tasks, generality issues can lead to sub-optimal performance. In this
+paper, we propose SingularTrajectory, a diffusion-based universal trajectory
+prediction framework to reduce the performance gap across the five tasks. The
+core of SingularTrajectory is to unify a variety of human dynamics
+representations on the associated tasks. To do this, we first build a Singular
+space to project all types of motion patterns from each task into one embedding
+space. We next propose an adaptive anchor working in the Singular space. Unlike
+traditional fixed-anchor methods that sometimes yield unacceptable paths, our
+adaptive anchor can correct anchors that are placed in a wrong location, based
+on a traversability map. Finally, we adopt a diffusion-based predictor to
+further enhance the prototype paths using a cascaded denoising process. Our
+unified framework ensures generality across various benchmark settings, such as
+input modality and trajectory length. Extensive experiments on five public
+benchmarks demonstrate that SingularTrajectory substantially outperforms
+existing models, highlighting its effectiveness in estimating general dynamics
+of human movements. Code is publicly available at
+https://github.com/inhwanbae/SingularTrajectory.
+
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Can Language Beat Numerical Regression? Language-Based Multimodal + Trajectory Prediction CVPR 2024 + + +
+ Language models have demonstrated impressive ability in context understanding
+and generative performance. Inspired by the recent success of language
+foundation models, in this paper, we propose LMTraj (Language-based Multimodal
+Trajectory predictor), which recasts the trajectory prediction task into a sort
+of question-answering problem. Departing from traditional numerical regression
+models, which treat the trajectory coordinate sequence as continuous signals,
+we consider them as discrete signals like text prompts. Specifically, we first
+transform an input space for the trajectory coordinate into the natural
+language space. Here, the entire time-series trajectories of pedestrians are
+converted into a text prompt, and scene images are described as text
+information through image captioning. The transformed numerical and image data
+are then wrapped into the question-answering template for use in a language
+model. Next, to guide the language model in understanding and reasoning about
+high-level knowledge, such as scene context and social relationships between
+pedestrians, we introduce an auxiliary multi-task question-and-answering
+scheme. We then train a numerical tokenizer with the prompt data. We encourage
+the tokenizer to separate the integer and decimal parts well, and leverage it
+to capture correlations between consecutive numbers in the language model.
+Lastly, we train the language model using the numerical tokenizer and all of
+the question-answer prompts. Here, we propose a beam-search-based most-likely
+prediction and a temperature-based multimodal prediction to implement both
+deterministic and stochastic inferences. Applying our LMTraj, we show that the
+language-based model can be a powerful pedestrian trajectory predictor, and
+outperforms existing numerical-based predictors. Code is publicly available at
+https://github.com/inhwanbae/LMTrajectory.
+
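+
+ The coordinate-to-text step described above can be pictured with a small
+helper like the one below; the prompt template, rounding, and horizon are
+invented for illustration and are not the paper's actual template or tokenizer.
+
+def trajectory_to_prompt(observed, horizon=12):
+    # observed: list of (x, y) pedestrian positions in metres, oldest first.
+    coords = ", ".join(f"({x:.2f}, {y:.2f})" for x, y in observed)
+    return (f"A pedestrian was observed at the positions {coords}. "
+            f"Where will the pedestrian be for the next {horizon} time steps?")
+
+obs = [(1.00, 2.00), (1.25, 2.10), (1.52, 2.18), (1.80, 2.30)]
+print(trajectory_to_prompt(obs))
+# The generated answer string would then be parsed back into (x, y) pairs.
+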
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ $\mathrm{F^2Depth}$: Self-supervised Indoor Monocular Depth Estimation + via Optical Flow Consistency and Feature Map Synthesis + + +
+ Self-supervised monocular depth estimation methods have been increasingly
+given much attention due to the benefit of not requiring large, labelled
+datasets. Such self-supervised methods require high-quality salient features
+and consequently suffer a severe performance drop in indoor scenes, where the
+low-textured regions that dominate the scenes are almost indiscriminative. To
+address the issue, we propose a self-supervised indoor monocular depth
+estimation framework called $\mathrm{F^2Depth}$. A self-supervised optical flow
+estimation network is introduced to supervise depth learning. To improve
+optical flow estimation performance in low-textured areas, only some patches of
+points with more discriminative features are adopted for finetuning based on
+our well-designed patch-based photometric loss. The finetuned optical flow
+estimation network generates high-accuracy optical flow as a supervisory signal
+for depth estimation. Correspondingly, an optical flow consistency loss is
+designed. Multi-scale feature maps produced by the finetuned optical flow
+estimation network are warped to compute a feature map synthesis loss as
+another supervisory signal for depth learning. Experimental results on the NYU
+Depth V2 dataset demonstrate the effectiveness of the framework and our
+proposed losses. To evaluate the generalization ability of our
+$\mathrm{F^2Depth}$, we collect a Campus Indoor depth dataset composed of
+approximately 1500 points selected from 99 images in 18 scenes. Zero-shot
+generalization experiments on the 7-Scenes dataset and Campus Indoor achieve
+$\delta_1$ accuracy of 75.8% and 76.0% respectively. The accuracy results show
+that our model can generalize well to monocular images captured in unknown
+indoor scenes.
+
+
+
+
+
+ + ☆ Backpropagation-free Network for 3D Test-time Adaptation CVPR 2024 + + +
+ Real-world systems often encounter new data over time, which leads to +experiencing target domain shifts. Existing Test-Time Adaptation (TTA) methods +tend to apply computationally heavy and memory-intensive backpropagation-based +approaches to handle this. Here, we propose a novel method that uses a +backpropagation-free approach for TTA for the specific case of 3D data. Our +model uses a two-stream architecture to maintain knowledge about the source +domain as well as complementary target-domain-specific information. The +backpropagation-free property of our model helps address the well-known +forgetting problem and mitigates the error accumulation issue. The proposed +method also eliminates the need for the usually noisy process of +pseudo-labeling and reliance on costly self-supervised training. Moreover, our +method leverages subspace learning, effectively reducing the distribution +variance between the two domains. Furthermore, the source-domain-specific and +the target-domain-specific streams are aligned using a novel entropy-based +adaptive fusion strategy. Extensive experiments on popular benchmarks +demonstrate the effectiveness of our method. The code will be available at +https://github.com/abie-e/BFTT3D. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ U-Sketch: An Efficient Approach for Sketch to Image Diffusion Models + + +
+ Diffusion models have demonstrated remarkable performance in text-to-image +synthesis, producing realistic and high resolution images that faithfully +adhere to the corresponding text-prompts. Despite their great success, they +still fall behind in sketch-to-image synthesis tasks, where in addition to +text-prompts, the spatial layout of the generated images has to closely follow +the outlines of certain reference sketches. Employing an MLP latent edge +predictor to guide the spatial layout of the synthesized image by predicting +edge maps at each denoising step has been recently proposed. Despite yielding +promising results, the pixel-wise operation of the MLP does not take into +account the spatial layout as a whole, and demands numerous denoising +iterations to produce satisfactory images, leading to time inefficiency. To +this end, we introduce U-Sketch, a framework featuring a U-Net type latent edge +predictor, which is capable of efficiently capturing both local and global +features, as well as spatial correlations between pixels. Moreover, we propose +the addition of a sketch simplification network that offers the user the choice +of preprocessing and simplifying input sketches for enhanced outputs. The +experimental results, corroborated by user feedback, demonstrate that our +proposed U-Net latent edge predictor leads to more realistic results, that are +better aligned with the spatial outlines of the reference sketches, while +drastically reducing the number of required denoising steps and, consequently, +the overall execution time. + +
+
+
+
+
+ + ☆ ECNet: Effective Controllable Text-to-Image Diffusion Models + + +
+ The conditional text-to-image diffusion models have garnered significant +attention in recent years. However, the precision of these models is often +compromised mainly for two reasons, ambiguous condition input and inadequate +condition guidance over single denoising loss. To address the challenges, we +introduce two innovative solutions. Firstly, we propose a Spatial Guidance +Injector (SGI) which enhances conditional detail by encoding text inputs with +precise annotation information. This method directly tackles the issue of +ambiguous control inputs by providing clear, annotated guidance to the model. +Secondly, to overcome the issue of limited conditional supervision, we +introduce Diffusion Consistency Loss (DCL), which applies supervision on the +denoised latent code at any given time step. This encourages consistency +between the latent code at each time step and the input signal, thereby +enhancing the robustness and accuracy of the output. The combination of SGI and +DCL results in our Effective Controllable Network (ECNet), which offers a more +accurate controllable end-to-end text-to-image generation framework with a more +precise conditioning input and stronger controllable supervision. We validate +our approach through extensive experiments on generation under various +conditions, such as human body skeletons, facial landmarks, and sketches of +general objects. The results consistently demonstrate that our method +significantly enhances the controllability and robustness of the generated +images, outperforming existing state-of-the-art controllable text-to-image +models. + +
+
+
+
+
+ + ☆ A Channel-ensemble Approach: Unbiased and Low-variance Pseudo-labels is + Critical for Semi-supervised Classification + + +
+ Semi-supervised learning (SSL) is a practical challenge in computer vision.
+Pseudo-label (PL) methods, e.g., FixMatch and FreeMatch, obtain state-of-the-art
+(SOTA) performance in SSL. These approaches employ a threshold-to-pseudo-label
+(T2L) process to generate PLs by truncating the confidence scores of unlabeled
+data predicted by the self-training method. However, self-trained models
+typically yield biased and high-variance predictions, especially when only a
+little labeled data is supplied. To address this issue, we propose a
+lightweight channel-based ensemble method that effectively consolidates
+multiple inferior PLs into one that is theoretically guaranteed to be unbiased
+and low-variance. Importantly, our approach can be readily extended to any SSL
+framework, such as FixMatch or FreeMatch. Experimental results demonstrate that
+our method significantly outperforms state-of-the-art techniques on CIFAR10/100
+in terms of effectiveness and efficiency.
+
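+ A minimal sketch of consolidating several noisy pseudo-label predictions into
+a single lower-variance one by averaging probabilities, with a FixMatch-style
+confidence cut-off. The paper builds its ensemble from channels of one network;
+the separate `logit_list` members and the threshold value here are illustrative
+assumptions.
+```python
+import torch
+
+def ensemble_pseudo_labels(logit_list, threshold=0.95):
+    """Average several (B, C) logits for the same unlabeled batch and return
+    (pseudo_labels, mask), where `mask` keeps only confident samples."""
+    probs = torch.stack([l.softmax(dim=-1) for l in logit_list]).mean(dim=0)
+    conf, pseudo = probs.max(dim=-1)
+    mask = conf.ge(threshold)  # confidence cut-off, as in FixMatch
+    return pseudo, mask
+```
+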
+
+
+
+
+ + ☆ An Image Grid Can Be Worth a Video: Zero-shot Video Question Answering + Using a VLM + + +
+ Stimulated by the sophisticated reasoning capabilities of recent Large +Language Models (LLMs), a variety of strategies for bridging video modality +have been devised. A prominent strategy involves Video Language Models +(VideoLMs), which train a learnable interface with video data to connect +advanced vision encoders with LLMs. Recently, an alternative strategy has +surfaced, employing readily available foundation models, such as VideoLMs and +LLMs, across multiple stages for modality bridging. In this study, we introduce +a simple yet novel strategy where only a single Vision Language Model (VLM) is +utilized. Our starting point is the plain insight that a video comprises a +series of images, or frames, interwoven with temporal information. The essence +of video comprehension lies in adeptly managing the temporal aspects along with +the spatial details of each frame. Initially, we transform a video into a +single composite image by arranging multiple frames in a grid layout. The +resulting single image is termed as an image grid. This format, while +maintaining the appearance of a solitary image, effectively retains temporal +information within the grid structure. Therefore, the image grid approach +enables direct application of a single high-performance VLM without +necessitating any video-data training. Our extensive experimental analysis +across ten zero-shot video question answering benchmarks, including five +open-ended and five multiple-choice benchmarks, reveals that the proposed Image +Grid Vision Language Model (IG-VLM) surpasses the existing methods in nine out +of ten benchmarks. + +
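+ A minimal sketch of the image-grid idea: uniformly sampled video frames are
+pasted into one composite image that a single off-the-shelf VLM can consume.
+The grid shape, cell size, and the assumption that frames were already sampled
+are illustrative choices, not the paper's exact settings.
+```python
+from PIL import Image
+
+def frames_to_image_grid(frames, rows=2, cols=3, cell=336):
+    """Arrange sampled video frames (list of PIL.Image) into one image grid.
+
+    Each frame is resized to cell x cell and placed row by row; the result can
+    be passed to a VLM together with the question, with no video training."""
+    grid = Image.new("RGB", (cols * cell, rows * cell))
+    for i, frame in enumerate(frames[: rows * cols]):
+        r, c = divmod(i, cols)
+        grid.paste(frame.resize((cell, cell)), (c * cell, r * cell))
+    return grid
+```
+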
+
+ comment: Our code is available at https://github.com/imagegridworth/IG-VLM +
+
+
+
+
+ + ☆ Colour and Brush Stroke Pattern Recognition in Abstract Art using + Modified Deep Convolutional Generative Adversarial Networks + + +
+ Abstract art is an immensely popular and widely discussed form of art that
+often has the ability to depict the emotions of an artist. Many researchers
+have attempted to study abstract art through edge detection, brush stroke and
+emotion recognition algorithms using machine and deep learning. This paper
+describes the study of a wide distribution of abstract paintings using
+Generative Adversarial Networks (GANs). GANs can learn and reproduce a
+distribution, enabling researchers and scientists to effectively explore and
+study the generated image space. However, the challenge lies in developing an
+efficient GAN architecture that overcomes common training pitfalls. This paper
+addresses this challenge by introducing a modified DCGAN (mDCGAN) specifically
+designed for high-quality artwork generation. The approach involves a thorough
+exploration of the modifications made, delving into the intricate workings of
+DCGANs, optimisation techniques, and regularisation methods aimed at improving
+stability and realism in art generation, enabling effective study of the
+generated patterns. The proposed mDCGAN incorporates meticulous adjustments in
+layer configurations and architectural choices, offering tailored solutions to
+the unique demands of art generation while effectively combating issues like
+mode collapse and gradient vanishing. Further, this paper explores the
+generated latent space by performing random walks to understand the vector
+relationships between brush strokes and colours in the abstract art space, and
+provides a statistical analysis of unstable outputs after a certain period of
+GAN training to assess their significance. These findings validate the
+effectiveness of the proposed approach, emphasising its potential to
+revolutionise the field of digital art generation and the digital art
+ecosystem.
+
+
+ comment: 28 pages, 5 tables, 7 figures +
+
+
+
+
+ + ☆ FTBC: Forward Temporal Bias Correction for Optimizing ANN-SNN Conversion + + +
+ Spiking Neural Networks (SNNs) offer a promising avenue for energy-efficient
+computing compared with Artificial Neural Networks (ANNs), closely mirroring
+biological neural processes. However, this potential comes with inherent
+challenges in directly training SNNs through spatio-temporal backpropagation --
+stemming from the temporal dynamics of spiking neurons and their discrete
+signal processing -- which necessitates alternative ways of training, most
+notably through ANN-SNN conversion. In this work, we introduce a lightweight
+Forward Temporal Bias Correction (FTBC) technique aimed at enhancing conversion
+accuracy without the computational overhead. We ground our method in
+theoretical findings that, through proper temporal bias calibration, the
+expected error of ANN-SNN conversion can be reduced to zero after each time
+step. We further propose a heuristic algorithm for finding the temporal bias
+only in the forward pass, thus eliminating the computational burden of
+backpropagation, and we evaluate our method on CIFAR-10/100 and ImageNet
+datasets, achieving a notable increase in accuracy on all datasets. Codes are
+released at a GitHub repository.
+
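+ A rough, forward-only sketch of the temporal-bias-calibration idea: using a
+calibration batch, a per-timestep bias is chosen so that the running average of
+the corrected SNN layer output matches the ANN activation in expectation. The
+tensor shapes, the batch-mean estimator, and the name `calibrate_temporal_bias`
+are assumptions for illustration, not the paper's heuristic.
+```python
+import torch
+
+@torch.no_grad()
+def calibrate_temporal_bias(ann_act, snn_act_per_step):
+    """ann_act: (B, C) ANN layer activations on calibration data.
+    snn_act_per_step: (T, B, C) per-timestep outputs of the converted layer.
+    Returns a (T, C) additive correction to the accumulated SNN output so the
+    running average tracks the ANN activation; no backpropagation involved."""
+    T = snn_act_per_step.shape[0]
+    biases = []
+    running = torch.zeros_like(ann_act)
+    for t in range(T):
+        running = running + snn_act_per_step[t]
+        residual = ann_act - running / (t + 1)       # remaining error
+        bias_t = (t + 1) * residual.mean(dim=0)      # batch-mean correction
+        running = running + bias_t                   # apply to accumulator
+        biases.append(bias_t)
+    return torch.stack(biases)
+```
+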
+
+
+
+
+ + ☆ Generative Multi-modal Models are Good Class-Incremental Learners CVPR 2024 + + +
+ In class-incremental learning (CIL) scenarios, the phenomenon of catastrophic +forgetting caused by the classifier's bias towards the current task has long +posed a significant challenge. It is mainly caused by the characteristic of +discriminative models. With the growing popularity of the generative +multi-modal models, we would explore replacing discriminative models with +generative ones for CIL. However, transitioning from discriminative to +generative models requires addressing two key challenges. The primary challenge +lies in transferring the generated textual information into the classification +of distinct categories. Additionally, it requires formulating the task of CIL +within a generative framework. To this end, we propose a novel generative +multi-modal model (GMM) framework for class-incremental learning. Our approach +directly generates labels for images using an adapted generative model. After +obtaining the detailed text, we use a text encoder to extract text features and +employ feature matching to determine the most similar label as the +classification prediction. In the conventional CIL settings, we achieve +significantly better results in long-sequence task scenarios. Under the +Few-shot CIL setting, we have improved by at least 14\% accuracy over all the +current state-of-the-art methods with significantly less forgetting. Our code +is available at \url{https://github.com/DoubleClass/GMM}. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ BAM: Box Abstraction Monitors for Real-time OoD Detection in Object + Detection + + +
+ Out-of-distribution (OoD) detection techniques for deep neural networks +(DNNs) become crucial thanks to their filtering of abnormal inputs, especially +when DNNs are used in safety-critical applications and interact with an open +and dynamic environment. Nevertheless, integrating OoD detection into +state-of-the-art (SOTA) object detection DNNs poses significant challenges, +partly due to the complexity introduced by the SOTA OoD construction methods, +which require the modification of DNN architecture and the introduction of +complex loss functions. This paper proposes a simple, yet surprisingly +effective, method that requires neither retraining nor architectural change in +object detection DNN, called Box Abstraction-based Monitors (BAM). The novelty +of BAM stems from using a finite union of convex box abstractions to capture +the learned features of objects for in-distribution (ID) data, and an important +observation that features from OoD data are more likely to fall outside of +these boxes. The union of convex regions within the feature space allows the +formation of non-convex and interpretable decision boundaries, overcoming the +limitations of VOS-like detectors without sacrificing real-time performance. +Experiments integrating BAM into Faster R-CNN-based object detection DNNs +demonstrate a considerably improved performance against SOTA OoD detection +techniques. + +
+
+
+
+
+ + ☆ Ship in Sight: Diffusion Models for Ship-Image Super Resolution IJCNN + + +
+ In recent years, remarkable advancements have been achieved in the field of
+image generation, primarily driven by the escalating demand for high-quality
+outcomes across various image generation subtasks, such as inpainting,
+denoising, and super resolution. A major effort is devoted to exploring the
+application of super-resolution techniques to enhance the quality of
+low-resolution images. In this context, our method explores in depth the
+problem of ship image super resolution, which is crucial for coastal and port
+surveillance. We investigate the opportunity given by the growing interest in
+text-to-image diffusion models, taking advantage of the prior knowledge that
+such foundation models have already learned. In particular, we present a
+diffusion-model-based architecture that leverages text conditioning during
+training while being class-aware, to best preserve the crucial details of the
+ships during the generation of the super-resolved image. Given the specificity
+of this task and the scarce availability of off-the-shelf data, we also
+introduce a large labeled ship dataset scraped from online ship images, mostly
+from the ShipSpotting\footnote{\url{www.shipspotting.com}} website. Our method
+achieves more robust results than other deep learning models previously
+employed for super resolution, as proven by the multiple experiments performed.
+Moreover, we investigate how this model can benefit downstream tasks, such as
+classification and object detection, thus emphasizing practical implementation
+in a real-world scenario. Experimental results show the flexibility,
+reliability, and impressive performance of the proposed framework over
+state-of-the-art methods for different tasks. The code is available at:
+https://github.com/LuigiSigillo/ShipinSight .
+
+
+ comment: Accepted at 2024 International Joint Conference on Neural Networks + (IJCNN) +
+
+
+
+
+ + ☆ ViTAR: Vision Transformer with Any Resolution + + +
+ This paper tackles a significant challenge faced by Vision Transformers
+(ViTs): their constrained scalability across different image resolutions.
+Typically, ViTs experience a performance decline when processing resolutions
+different from those seen during training. Our work introduces two key
+innovations to address this issue. Firstly, we propose a novel module for
+dynamic resolution adjustment, designed with a single Transformer block,
+specifically to achieve highly efficient incremental token integration.
+Secondly, we introduce fuzzy positional encoding in the Vision Transformer to
+provide consistent positional awareness across multiple resolutions, thereby
+preventing overfitting to any single training resolution. Our resulting model,
+ViTAR (Vision Transformer with Any Resolution), demonstrates impressive
+adaptability, achieving 83.3\% top-1 accuracy at a 1120x1120 resolution and
+80.4\% accuracy at a 4032x4032 resolution, all while reducing computational
+costs. ViTAR also shows strong performance in downstream tasks such as instance
+and semantic segmentation and can easily be combined with self-supervised
+learning techniques like Masked AutoEncoder. Our work provides a cost-effective
+solution for enhancing the resolution scalability of ViTs, paving the way for
+more versatile and efficient high-resolution image processing.
+
+
+
+
+
+ + ☆ Learning CNN on ViT: A Hybrid Model to Explicitly Class-specific + Boundaries for Domain Adaptation + + +
+ Most domain adaptation (DA) methods are based on either convolutional neural
+networks (CNNs) or vision transformers (ViTs). They align the distribution
+differences between domains as encoders without considering their unique
+characteristics. For instance, ViT excels in accuracy due to its superior
+ability to capture global representations, while CNN has an advantage in
+capturing local representations. This fact has led us to design a hybrid method
+to fully take advantage of both ViT and CNN, called Explicitly Class-specific
+Boundaries (ECB). ECB learns CNN on ViT to combine their distinct strengths. In
+particular, we leverage ViT's properties to explicitly find class-specific
+decision boundaries by maximizing the discrepancy between the outputs of the
+two classifiers to detect target samples far from the source support. In
+contrast, the CNN encoder clusters target features based on the previously
+defined class-specific boundaries by minimizing the discrepancy between the
+probabilities of the two classifiers. Finally, ViT and CNN mutually exchange
+knowledge to improve the quality of pseudo labels and reduce the knowledge
+discrepancies of these models. Compared to conventional DA methods, our ECB
+achieves superior performance, which verifies its effectiveness in this hybrid
+model. The project website can be found at
+https://dotrannhattuong.github.io/ECB/website/.
+
+
+
+
+
+ + ☆ MonoHair: High-Fidelity Hair Modeling from a Monocular Video CVPR 2024 + + +
+ Undoubtedly, high-fidelity 3D hair is crucial for achieving realism, artistic +expression, and immersion in computer graphics. While existing 3D hair modeling +methods have achieved impressive performance, the challenge of achieving +high-quality hair reconstruction persists: they either require strict capture +conditions, making practical applications difficult, or heavily rely on learned +prior data, obscuring fine-grained details in images. To address these +challenges, we propose MonoHair,a generic framework to achieve high-fidelity +hair reconstruction from a monocular video, without specific requirements for +environments. Our approach bifurcates the hair modeling process into two main +stages: precise exterior reconstruction and interior structure inference. The +exterior is meticulously crafted using our Patch-based Multi-View Optimization +(PMVO). This method strategically collects and integrates hair information from +multiple views, independent of prior data, to produce a high-fidelity exterior +3D line map. This map not only captures intricate details but also facilitates +the inference of the hair's inner structure. For the interior, we employ a +data-driven, multi-view 3D hair reconstruction method. This method utilizes 2D +structural renderings derived from the reconstructed exterior, mirroring the +synthetic 2D inputs used during training. This alignment effectively bridges +the domain gap between our training data and real-world data, thereby enhancing +the accuracy and reliability of our interior structure inference. Lastly, we +generate a strand model and resolve the directional ambiguity by our hair +growth algorithm. Our experiments demonstrate that our method exhibits +robustness across diverse hairstyles and achieves state-of-the-art performance. +For more results, please refer to our project page +https://keyuwu-cs.github.io/MonoHair/. + +
+
+ comment: Accepted by IEEE CVPR 2024 +
+
+
+
+
+ + ☆ Generating Diverse Agricultural Data for Vision-Based Farming + Applications + + +
+ We present a specialized procedural model for generating synthetic +agricultural scenes, focusing on soybean crops, along with various weeds. This +model is capable of simulating distinct growth stages of these plants, diverse +soil conditions, and randomized field arrangements under varying lighting +conditions. The integration of real-world textures and environmental factors +into the procedural generation process enhances the photorealism and +applicability of the synthetic data. Our dataset includes 12,000 images with +semantic labels, offering a comprehensive resource for computer vision tasks in +precision agriculture, such as semantic segmentation for autonomous weed +control. We validate our model's effectiveness by comparing the synthetic data +against real agricultural images, demonstrating its potential to significantly +augment training data for machine learning models in agriculture. This approach +not only provides a cost-effective solution for generating high-quality, +diverse data but also addresses specific needs in agricultural vision tasks +that are not fully covered by general-purpose models. + +
+
+ comment: 10 pages, 8 figures, 3 tables +
+
+
+
+
+ + ☆ A Quantum Fuzzy-based Approach for Real-Time Detection of Solar Coronal + Holes + + +
+ The detection and analysis of solar coronal holes (CHs) is an important field
+of study in the domain of solar physics. It is mainly required for the proper
+prediction of geomagnetic storms, which directly or indirectly affect various
+space- and ground-based systems. To date, solar scientists have largely
+depended on manual hand-drawn approaches for the detection of CHs. With the
+advancement of image processing technologies, some automated image segmentation
+methods have been used for the detection of CHs. In spite of this, fast and
+accurate detection of CHs is still a major issue. In this work, a novel quantum
+computing-based fast fuzzy c-mean technique has been developed for fast
+detection of the CH regions. The task is carried out in two stages: in the
+first stage the solar image is segmented using a quantum computing-based fast
+fuzzy c-mean (QCFFCM) algorithm, and in the later stage the CHs are extracted
+from the segmented image using image morphological operations. In this work,
+quantum computing is used to optimize the cost function of the fast fuzzy
+c-mean (FFCM) algorithm, where the quantum approximate optimization algorithm
+(QAOA) is used to optimize the quadratic part of the cost function. The
+proposed method has been tested on 193 \AA{} SDO/AIA full-disk solar image
+datasets and compared with existing techniques. The outcome shows performance
+comparable to the existing methods within a much shorter time.
+
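+ For reference, one classical fuzzy c-means update step on image intensities
+is sketched below; the quantum part of the paper (QAOA optimising the fast-FCM
+cost) is not reproduced here, and the variable names are illustrative.
+```python
+import numpy as np
+
+def fuzzy_cmeans_step(pixels, centers, m=2.0, eps=1e-9):
+    """One membership + centre update of classical fuzzy c-means.
+
+    pixels: (N,) flattened intensities, centers: (c,) current cluster centres.
+    Returns the (N, c) membership matrix and the updated centres."""
+    d = np.abs(pixels[:, None] - centers[None, :]) + eps   # (N, c) distances
+    u = 1.0 / (d ** (2.0 / (m - 1.0)))
+    u /= u.sum(axis=1, keepdims=True)                      # memberships
+    um = u ** m
+    new_centers = (um * pixels[:, None]).sum(axis=0) / um.sum(axis=0)
+    return u, new_centers
+```
+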
+
+ comment: 14 pages, 5 figures, 3 tables +
+
+
+
+
+ + ☆ Quantifying and Mitigating Unimodal Biases in Multimodal Large Language + Models: A Causal Perspective + + +
+ Recent advancements in Large Language Models (LLMs) have facilitated the +development of Multimodal LLMs (MLLMs). Despite their impressive capabilities, +MLLMs often suffer from an over-reliance on unimodal biases (e.g., language +bias and vision bias), leading to incorrect answers in complex multimodal +tasks. To investigate this issue, we propose a causal framework to interpret +the biases in Visual Question Answering (VQA) problems. Within our framework, +we devise a causal graph to elucidate the predictions of MLLMs on VQA problems, +and assess the causal effect of biases through an in-depth causal analysis. +Motivated by the causal graph, we introduce a novel MORE dataset, consisting of +12,000 VQA instances. This dataset is designed to challenge MLLMs' abilities, +necessitating multi-hop reasoning and the surmounting of unimodal biases. +Furthermore, we propose two strategies to mitigate unimodal biases and enhance +MLLMs' reasoning capabilities, including a Decompose-Verify-Answer (DeVA) +framework for limited-access MLLMs and the refinement of open-source MLLMs +through fine-tuning. Extensive quantitative and qualitative experiments offer +valuable insights for future research. + +
+
+
+
+
+ + ☆ Learning Inclusion Matching for Animation Paint Bucket Colorization CVPR 2024 + + +
+ Colorizing line art is a pivotal task in the production of hand-drawn cel
+animation. This typically involves digital painters using a paint bucket tool
+to manually color each segment enclosed by lines, based on RGB values
+predetermined by a color designer. This frame-by-frame process is both arduous
+and time-intensive. Current automated methods mainly focus on segment matching.
+This technique migrates colors from a reference to the target frame by aligning
+features within line-enclosed segments across frames. However, issues like
+occlusion and wrinkles in animations often disrupt these direct
+correspondences, leading to mismatches. In this work, we introduce a new
+learning-based inclusion matching pipeline, which directs the network to
+comprehend the inclusion relationships between segments rather than relying
+solely on direct visual correspondences. Our method features a two-stage
+pipeline that integrates a coarse color warping module with an inclusion
+matching module, enabling more nuanced and accurate colorization. To facilitate
+the training of our network, we also develop a unique dataset, referred to as
+PaintBucket-Character. This dataset includes rendered line arts alongside their
+colorized counterparts, featuring various 3D characters. Extensive experiments
+demonstrate the effectiveness and superiority of our method over existing
+techniques.
+
+
+ comment: accepted to CVPR 2024. Project Page: + https://ykdai.github.io/projects/InclusionMatching +
+
+
+
+
+ + ☆ H2ASeg: Hierarchical Adaptive Interaction and Weighting Network for + Tumor Segmentation in PET/CT Images + + +
+ Positron emission tomography (PET) combined with computed tomography (CT) +imaging is routinely used in cancer diagnosis and prognosis by providing +complementary information. Automatically segmenting tumors in PET/CT images can +significantly improve examination efficiency. Traditional multi-modal +segmentation solutions mainly rely on concatenation operations for modality +fusion, which fail to effectively model the non-linear dependencies between PET +and CT modalities. Recent studies have investigated various approaches to +optimize the fusion of modality-specific features for enhancing joint +representations. However, modality-specific encoders used in these methods +operate independently, inadequately leveraging the synergistic relationships +inherent in PET and CT modalities, for example, the complementarity between +semantics and structure. To address these issues, we propose a Hierarchical +Adaptive Interaction and Weighting Network termed H2ASeg to explore the +intrinsic cross-modal correlations and transfer potential complementary +information. Specifically, we design a Modality-Cooperative Spatial Attention +(MCSA) module that performs intra- and inter-modal interactions globally and +locally. Additionally, a Target-Aware Modality Weighting (TAMW) module is +developed to highlight tumor-related features within multi-modal features, +thereby refining tumor segmentation. By embedding these modules across +different layers, H2ASeg can hierarchically model cross-modal correlations, +enabling a nuanced understanding of both semantic and structural tumor +features. Extensive experiments demonstrate the superiority of H2ASeg, +outperforming state-of-the-art methods on AutoPet-II and Hecktor2022 +benchmarks. The code is released at https://github.com/G14nTDo4/H2ASeg. + +
+
+ comment: 10 pages,4 figures +
+
+
+
+
+ + ☆ DODA: Diffusion for Object-detection Domain Adaptation in Agriculture + + +
+ The diverse and high-quality content generated by recent generative models
+demonstrates the great potential of using synthetic data to train downstream
+models. However, in vision, and especially in object detection, this direction
+remains underexplored: synthetic images are merely used to balance the long
+tails of existing datasets, the accuracy of the generated labels is low, and
+the full potential of generative models has not been exploited. In this paper,
+we propose DODA, a data synthesizer that can generate high-quality object
+detection data for new domains in agriculture. Specifically, we improve the
+controllability of layout-to-image generation by encoding the layout as an
+image, thereby improving the quality of labels, and use a visual encoder to
+provide visual clues for the diffusion model, decoupling visual features from
+the diffusion model and empowering it to generate data in new domains. On the
+Global Wheat Head Detection (GWHD) Dataset, the largest dataset in agriculture,
+which contains diverse domains, using the data synthesized by DODA improves the
+performance of the object detector by 12.74-17.76 AP$_{50}$ in the domain that
+was significantly shifted from the training data.
+
+
+
+
+
+ + ☆ Tracking-Assisted Object Detection with Event Cameras + + +
+ Event-based object detection has recently garnered attention in the computer
+vision community due to the exceptional properties of event cameras, such as
+high dynamic range and no motion blur. However, feature asynchronism and
+sparsity cause objects to become invisible when they have no relative motion to
+the camera, posing a significant challenge in the task. Prior works have
+studied various memory mechanisms to preserve as many features as possible at
+the current time, guided by temporal clues. While these implicitly learned
+memories retain some short-term information, they still struggle to preserve
+long-term features effectively. In this paper, we consider those invisible
+objects as pseudo-occluded objects and aim to reveal their features. Firstly,
+we introduce a visibility attribute of objects and contribute an auto-labeling
+algorithm to append additional visibility labels to an existing event camera
+dataset. Secondly, we exploit tracking strategies for pseudo-occluded objects
+to maintain their permanence and retain their bounding boxes, even when
+features have not been available for a very long time. These strategies can be
+treated as an explicitly learned memory, guided by the tracking objective, that
+records the displacements of objects across frames. Lastly, we propose a
+spatio-temporal feature aggregation module to enrich the latent features and a
+consistency loss to increase the robustness of the overall pipeline. We conduct
+comprehensive experiments to verify our method's effectiveness, where still
+objects are retained but truly occluded objects are discarded. The results
+demonstrate that (1) the additional visibility labels can assist in supervised
+training, and (2) our method outperforms state-of-the-art approaches with a
+significant improvement of 7.9% absolute mAP.
+
+
+
+
+
+ + ☆ PIPNet3D: Interpretable Detection of Alzheimer in MRI Scans + + +
+ Information from neuroimaging examinations (CT, MRI) is increasingly used to +support diagnoses of dementia, e.g., Alzheimer's disease. While current +clinical practice is mainly based on visual inspection and feature engineering, +Deep Learning approaches can be used to automate the analysis and to discover +new image-biomarkers. Part-prototype neural networks (PP-NN) are an alternative +to standard blackbox models, and have shown promising results in general +computer vision. PP-NN's base their reasoning on prototypical image regions +that are learned fully unsupervised, and combined with a simple-to-understand +decision layer. We present PIPNet3D, a PP-NN for volumetric images. We apply +PIPNet3D to the clinical case study of Alzheimer's Disease diagnosis from +structural Magnetic Resonance Imaging (sMRI). We assess the quality of +prototypes under a systematic evaluation framework, propose new metrics to +evaluate brain prototypes and perform an evaluation with domain experts. Our +results show that PIPNet3D is an interpretable, compact model for Alzheimer's +diagnosis with its reasoning well aligned to medical domain knowledge. Notably, +PIPNet3D achieves the same accuracy as its blackbox counterpart; and removing +the remaining clinically irrelevant prototypes from its decision process does +not decrease predictive performance. + +
+
+
+
+
+ + ☆ Implementation of the Principal Component Analysis onto High-Performance + Computer Facilities for Hyperspectral Dimensionality Reduction: Results and + Comparisons + + +
+ Dimensionality reduction represents a critical preprocessing step in order to +increase the efficiency and the performance of many hyperspectral imaging +algorithms. However, dimensionality reduction algorithms, such as the Principal +Component Analysis (PCA), suffer from their computationally demanding nature, +becoming advisable for their implementation onto high-performance computer +architectures for applications under strict latency constraints. This work +presents the implementation of the PCA algorithm onto two different +high-performance devices, namely, an NVIDIA Graphics Processing Unit (GPU) and +a Kalray manycore, uncovering a highly valuable set of tips and tricks in order +to take full advantage of the inherent parallelism of these high-performance +computing platforms, and hence, reducing the time that is required to process a +given hyperspectral image. Moreover, the achieved results obtained with +different hyperspectral images have been compared with the ones that were +obtained with a field programmable gate array (FPGA)-based implementation of +the PCA algorithm that has been recently published, providing, for the first +time in the literature, a comprehensive analysis in order to highlight the pros +and cons of each option. + +
+
+ comment: 30 pages, 10 figures +
+
+
+
+
+ + ☆ Uncertainty-Aware SAR ATR: Defending Against Adversarial Attacks via + Bayesian Neural Networks + + +
+ Adversarial attacks have demonstrated the vulnerability of Machine Learning
+(ML) image classifiers in Synthetic Aperture Radar (SAR) Automatic Target
+Recognition (ATR) systems. An adversarial attack can deceive the classifier
+into making incorrect predictions by perturbing the input SAR images, for
+example, with a few scatterers attached to the on-ground objects. Therefore, it
+is critical to develop robust SAR ATR systems that can detect potential
+adversarial attacks by leveraging the inherent uncertainty in ML classifiers,
+thereby effectively alerting human decision-makers. In this paper, we propose a
+novel uncertainty-aware SAR ATR for detecting adversarial attacks.
+Specifically, we leverage the capability of Bayesian Neural Networks (BNNs) in
+performing image classification with quantified epistemic uncertainty to
+measure the confidence for each input SAR image. By evaluating the uncertainty,
+our method alerts when the input SAR image is likely to be adversarially
+generated. Simultaneously, we also generate visual explanations that reveal the
+specific regions in the SAR image where the adversarial scatterers are likely
+to be present, thus aiding human decision-making with hints of evidence of
+adversarial attacks. Experiments on the MSTAR dataset demonstrate that our
+approach can identify over 80% of adversarial SAR images with fewer than 20%
+false alarms, and our visual explanations can identify up to over 90% of
+scatterers in an adversarial SAR image.
+
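+ A minimal sketch of epistemic-uncertainty-based flagging, using Monte-Carlo
+dropout as a cheap stand-in for a full Bayesian neural network (the paper uses
+BNNs); the number of samples, the mutual-information criterion, and the
+threshold are illustrative assumptions.
+```python
+import torch
+
+def mc_dropout_uncertainty(model, x, n_samples=20):
+    """Return (mean_probs, mutual_information) for a batch x by sampling the
+    classifier with dropout kept active at inference time."""
+    model.train()  # keep dropout layers stochastic
+    with torch.no_grad():
+        probs = torch.stack([model(x).softmax(dim=-1) for _ in range(n_samples)])
+    mean = probs.mean(dim=0)
+    ent_of_mean = -(mean * mean.clamp_min(1e-12).log()).sum(dim=-1)
+    mean_ent = -(probs * probs.clamp_min(1e-12).log()).sum(dim=-1).mean(dim=0)
+    mutual_info = ent_of_mean - mean_ent  # epistemic part of the uncertainty
+    return mean, mutual_info
+
+# Inputs whose mutual_info exceeds a validation-tuned threshold would be
+# flagged as potentially adversarial SAR images.
+```
+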
+
+
+
+
+ + ☆ Selective Mixup Fine-Tuning for Optimizing Non-Decomposable Objectives ICLR 2024 + + +
+ The rise in internet usage has led to the generation of massive amounts of +data, resulting in the adoption of various supervised and semi-supervised +machine learning algorithms, which can effectively utilize the colossal amount +of data to train models. However, before deploying these models in the real +world, these must be strictly evaluated on performance measures like worst-case +recall and satisfy constraints such as fairness. We find that current +state-of-the-art empirical techniques offer sub-optimal performance on these +practical, non-decomposable performance objectives. On the other hand, the +theoretical techniques necessitate training a new model from scratch for each +performance objective. To bridge the gap, we propose SelMix, a selective +mixup-based inexpensive fine-tuning technique for pre-trained models, to +optimize for the desired objective. The core idea of our framework is to +determine a sampling distribution to perform a mixup of features between +samples from particular classes such that it optimizes the given objective. We +comprehensively evaluate our technique against the existing empirical and +theoretically principled methods on standard benchmark datasets for imbalanced +classification. We find that proposed SelMix fine-tuning significantly improves +the performance for various practical non-decomposable objectives across +benchmarks. + +
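+ A rough sketch of the selective-mixup step: class pairs are drawn from a
+sampling distribution and features of samples from those classes are mixed. How
+`pair_probs` is derived from the non-decomposable objective is the paper's
+contribution and is not reproduced here; the function name and fallback logic
+are illustrative.
+```python
+import torch
+
+def selective_mixup_batch(feats, labels, pair_probs, alpha=1.0):
+    """feats: (B, D) pre-logit features, labels: (B,), pair_probs: (C, C)
+    distribution over (class i, class j) pairs. Returns mixed features, the
+    sampled first-class labels, and the mixing coefficients."""
+    C = pair_probs.shape[0]
+    idx = torch.multinomial(pair_probs.flatten(), feats.size(0), replacement=True)
+    ci = torch.div(idx, C, rounding_mode="floor")
+    cj = idx % C
+    lam = torch.distributions.Beta(alpha, alpha).sample((feats.size(0), 1))
+    mixed = []
+    for k in range(feats.size(0)):
+        a = feats[labels == ci[k]]
+        b = feats[labels == cj[k]]
+        if len(a) == 0 or len(b) == 0:
+            mixed.append(feats[k])  # fall back if a sampled class is absent
+            continue
+        xa = a[torch.randint(len(a), (1,))].squeeze(0)
+        xb = b[torch.randint(len(b), (1,))].squeeze(0)
+        mixed.append(lam[k] * xa + (1 - lam[k]) * xb)
+    return torch.stack(mixed), ci, lam.squeeze(1)
+```
+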
+
+ comment: ICLR 2024 SpotLight +
+
+
+
+
+ + ☆ Multi-scale Unified Network for Image Classification + + +
+ Convolutional Neural Networks (CNNs) have advanced significantly in visual
+representation learning and recognition. However, they face notable challenges
+in performance and computational efficiency when dealing with real-world,
+multi-scale image inputs. Conventional methods rescale all input images to a
+fixed size, wherein a larger fixed size favors performance but rescaling small
+images to a larger size incurs digitization noise and increased computation
+cost. In this work, we carry out a comprehensive, layer-wise investigation of
+CNN models in response to scale variation, based on Centered Kernel Alignment
+(CKA) analysis. The observations reveal that lower layers are more sensitive to
+input image scale variations than high-level layers. Inspired by this insight,
+we propose the Multi-scale Unified Network (MUSN), consisting of multi-scale
+subnets, a unified network, and a scale-invariant constraint. Our method
+divides the shallow layers into multi-scale subnets to enable feature
+extraction from multi-scale inputs, and the low-level features are unified in
+deep layers for extracting high-level semantic features. A scale-invariant
+constraint is posed to maintain feature consistency across different scales.
+Extensive experiments on ImageNet and other scale-diverse datasets demonstrate
+that MUSN achieves significant improvements in both model performance and
+computational efficiency. In particular, MUSN yields an accuracy increase of up
+to 44.53% and diminishes FLOPs by 7.01-16.13% in multi-scale scenarios.
+
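+ The layer-wise analysis relies on linear Centered Kernel Alignment, which can
+be computed as below; comparing a layer's activations for the same images at
+two input scales gives the kind of scale-sensitivity profile described above.
+The input shapes are assumptions for illustration.
+```python
+import torch
+
+def linear_cka(X, Y):
+    """Linear CKA between activation matrices X: (n, d1) and Y: (n, d2)
+    computed on the same n inputs (e.g. the same images at two scales)."""
+    X = X - X.mean(dim=0, keepdim=True)
+    Y = Y - Y.mean(dim=0, keepdim=True)
+    hsic = (X.T @ Y).norm() ** 2        # ||X^T Y||_F^2
+    return hsic / ((X.T @ X).norm() * (Y.T @ Y).norm())
+```
+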
+
+
+
+
+ + ☆ Efficient Test-Time Adaptation of Vision-Language Models CVPR 2024 + + +
+ Test-time adaptation with pre-trained vision-language models has attracted +increasing attention for tackling distribution shifts during the test time. +Though prior studies have achieved very promising performance, they involve +intensive computation which is severely unaligned with test-time adaptation. We +design TDA, a training-free dynamic adapter that enables effective and +efficient test-time adaptation with vision-language models. TDA works with a +lightweight key-value cache that maintains a dynamic queue with few-shot pseudo +labels as values and the corresponding test-sample features as keys. Leveraging +the key-value cache, TDA allows adapting to test data gradually via progressive +pseudo label refinement which is super-efficient without incurring any +backpropagation. In addition, we introduce negative pseudo labeling that +alleviates the adverse impact of pseudo label noises by assigning pseudo labels +to certain negative classes when the model is uncertain about its pseudo label +predictions. Extensive experiments over two benchmarks demonstrate TDA's +superior effectiveness and efficiency as compared with the state-of-the-art. +The code has been released in \url{https://kdiaaa.github.io/tda/}. + +
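+ A minimal, training-free sketch in the spirit of the key-value cache
+described above: test features are stored as keys, pseudo-labels as values, and
+cache-based logits are blended with the zero-shot CLIP logits. The eviction
+policy, `beta`/`alpha` values, and class names are illustrative assumptions,
+not TDA's exact design.
+```python
+import torch
+import torch.nn.functional as F
+
+class DynamicCache:
+    """Key-value cache for backpropagation-free test-time adaptation."""
+
+    def __init__(self, num_classes, capacity_per_class=3, beta=5.0, alpha=2.0):
+        self.keys, self.values = [], []
+        self.num_classes = num_classes
+        self.capacity = capacity_per_class * num_classes
+        self.beta, self.alpha = beta, alpha
+
+    def update(self, feat, pseudo_label):
+        # feat: (D,) L2-normalised test feature; pseudo_label: scalar long tensor
+        if len(self.keys) >= self.capacity:
+            self.keys.pop(0); self.values.pop(0)   # simple FIFO eviction
+        self.keys.append(feat)
+        self.values.append(F.one_hot(pseudo_label, self.num_classes).float())
+
+    def logits(self, feat, clip_logits):
+        if not self.keys:
+            return clip_logits
+        K = torch.stack(self.keys)                 # (M, D)
+        V = torch.stack(self.values)               # (M, C)
+        affinity = feat @ K.T                      # cosine similarities
+        cache_logits = (self.beta * (affinity - 1)).exp() @ V
+        return clip_logits + self.alpha * cache_logits
+```
+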
+
+ comment: Accepted to CVPR 2024. The code has been released in + \url{https://kdiaaa.github.io/tda/} +
+
+
+
+
+ + ☆ Towards Non-Exemplar Semi-Supervised Class-Incremental Learning + + +
+ Deep neural networks perform remarkably well in closed-world scenarios.
+However, novel classes emerge continually in real applications, making it
+necessary to learn incrementally. Class-incremental learning (CIL) aims to
+gradually recognize new classes while maintaining the discriminability of old
+ones. Existing CIL methods have two limitations: a heavy reliance on preserving
+old data to mitigate forgetting, and the need for vast labeled data for
+knowledge adaptation. To overcome these issues, we propose a non-exemplar
+semi-supervised CIL framework with contrastive learning and a semi-supervised
+incremental prototype classifier (Semi-IPC). On the one hand, contrastive
+learning helps the model learn rich representations, easing the trade-off
+between learning representations of new classes and forgetting those of old
+classes. On the other hand, Semi-IPC learns a prototype for each class with
+unsupervised regularization, enabling the model to incrementally learn from
+partially labeled new data while maintaining the knowledge of old classes.
+Experiments on benchmark datasets demonstrate the strong performance of our
+method: without storing any old samples and using less than 1% of labels,
+Semi-IPC outperforms advanced exemplar-based methods. We hope our work offers
+new insights for future CIL research. The code will be made publicly available.
+
+
+
+
+
+ + ☆ SGDM: Static-Guided Dynamic Module Make Stronger Visual Models + + +
+ The spatial attention mechanism has been widely used to improve object
+detection performance. However, its operation is currently limited to static
+convolutions lacking content-adaptive features. This paper innovatively
+approaches the problem from the perspective of dynamic convolution. We propose
+Razor Dynamic Convolution (RDConv) to address the two flaws in dynamic weight
+convolution that make it hard to implement in spatial mechanisms: 1) it is
+computation-heavy; 2) when generating weights, spatial information is
+disregarded. Firstly, by using the Razor Operation to generate certain
+features, we vastly reduce the parameters of the entire dynamic convolution
+operation. Secondly, we add a spatial branch inside RDConv to generate
+convolutional kernel parameters with richer spatial information. Embedding
+dynamic convolution also brings the problem of sensitivity to high-frequency
+noise. We propose the Static-Guided Dynamic Module (SGDM) to address this
+limitation. In SGDM, we utilize a set of asymmetric static convolution kernel
+parameters to guide the construction of the dynamic convolution. We introduce
+the mechanism of shared weights in static convolution to solve the problem of
+dynamic convolution being sensitive to high-frequency noise. Extensive
+experiments illustrate that multiple different object detection backbones
+equipped with SGDM achieve a highly competitive boost in performance (e.g., +4%
+mAP with YOLOv5n on VOC and +1.7% mAP with YOLOv8n on COCO) with negligible
+parameter increase (i.e., +0.33M on YOLOv5n and +0.19M on YOLOv8n).
+
+
+ comment: 16 pages, 4 figures +
+
+
+
+
+ + ☆ AIR-HLoc: Adaptive Image Retrieval for Efficient Visual Localisation + + +
+ State-of-the-art (SOTA) hierarchical localisation pipelines (HLoc) rely on +image retrieval (IR) techniques to establish 2D-3D correspondences by selecting +the $k$ most similar images from a reference image database for a given query +image. Although higher values of $k$ enhance localisation robustness, the +computational cost for feature matching increases linearly with $k$. In this +paper, we observe that queries that are the most similar to images in the +database result in a higher proportion of feature matches and, thus, more +accurate positioning. Thus, a small number of images is sufficient for queries +very similar to images in the reference database. We then propose a novel +approach, AIR-HLoc, which divides query images into different localisation +difficulty levels based on their similarity to the reference image database. We +consider an image with high similarity to the reference image as an easy query +and an image with low similarity as a hard query. Easy queries show a limited +improvement in accuracy when increasing $k$. Conversely, higher values of $k$ +significantly improve accuracy for hard queries. Given the limited improvement +in accuracy when increasing $k$ for easy queries and the significant +improvement for hard queries, we adapt the value of $k$ to the query's +difficulty level. Therefore, AIR-HLoc optimizes processing time by adaptively +assigning different values of $k$ based on the similarity between the query and +reference images without losing accuracy. Our extensive experiments on the +Cambridge Landmarks, 7Scenes, and Aachen Day-Night-v1.1 datasets demonstrate +our algorithm's efficacy, reducing 30\%, 26\%, and 11\% in computational +overhead while maintaining SOTA accuracy compared to HLoc with fixed image +retrieval. + +
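+ The adaptive-$k$ idea reduces to a simple rule on retrieval similarity; a
+minimal sketch is shown below. The thresholds and candidate $k$ values are
+illustrative placeholders, not the tuned settings of AIR-HLoc.
+```python
+def adaptive_k(top1_similarity, thresholds=(0.8, 0.6), ks=(5, 10, 20)):
+    """Pick how many retrieved database images to match against, based on how
+    similar the query is to its best retrieval hit: easy queries (high
+    similarity) get a small k, hard queries a large k."""
+    easy, medium = thresholds
+    if top1_similarity >= easy:
+        return ks[0]
+    if top1_similarity >= medium:
+        return ks[1]
+    return ks[2]
+
+# k = adaptive_k(cosine_sim(query_descriptor, best_db_descriptor))
+# Feature matching then runs against only the top-k retrieved images.
+```
+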
+
+
+
+
+ + ☆ DVLO: Deep Visual-LiDAR Odometry with Local-to-Global Feature Fusion and + Bi-Directional Structure Alignment + + +
+ Visual and LiDAR data contain highly complementary information, derived from
+the fine-grained texture of images and the massive geometric information in
+point clouds. However, it remains challenging to explore effective
+visual-LiDAR fusion, mainly due to the intrinsic data structure inconsistency
+between the two modalities: images are regular and dense, but LiDAR points are
+unordered and sparse. To address the problem, we propose a local-to-global
+fusion network with bi-directional structure alignment. To obtain locally fused
+features, we project points onto the image plane as cluster centers and cluster
+image pixels around each center. Image pixels are pre-organized as pseudo
+points for image-to-point structure alignment. Then, we convert points to
+pseudo images by cylindrical projection (point-to-image structure alignment)
+and perform adaptive global feature fusion between point features and the
+locally fused features. Our method achieves state-of-the-art performance on the
+KITTI odometry and FlyingThings3D scene flow datasets compared to both
+single-modal and multi-modal methods. Codes will be released later.
+
+
+
+
+
+ + ☆ Unleashing the Potential of SAM for Medical Adaptation via Hierarchical + Decoding CVPR 2024 + + +
+ The Segment Anything Model (SAM) has garnered significant attention for its +versatile segmentation abilities and intuitive prompt-based interface. However, +its application in medical imaging presents challenges, requiring either +substantial training costs and extensive medical datasets for full model +fine-tuning or high-quality prompts for optimal performance. This paper +introduces H-SAM: a prompt-free adaptation of SAM tailored for efficient +fine-tuning of medical images via a two-stage hierarchical decoding procedure. +In the initial stage, H-SAM employs SAM's original decoder to generate a prior +probabilistic mask, guiding a more intricate decoding process in the second +stage. Specifically, we propose two key designs: 1) A class-balanced, +mask-guided self-attention mechanism addressing the unbalanced label +distribution, enhancing image embedding; 2) A learnable mask cross-attention +mechanism spatially modulating the interplay among different image regions +based on the prior mask. Moreover, the inclusion of a hierarchical pixel +decoder in H-SAM enhances its proficiency in capturing fine-grained and +localized details. This approach enables SAM to effectively integrate learned +medical priors, facilitating enhanced adaptation for medical image segmentation +with limited samples. Our H-SAM demonstrates a 4.78% improvement in average +Dice compared to existing prompt-free SAM variants for multi-organ segmentation +using only 10% of 2D slices. Notably, without using any unlabeled data, H-SAM +even outperforms state-of-the-art semi-supervised models relying on extensive +unlabeled training data across various medical datasets. Our code is available +at https://github.com/Cccccczh404/H-SAM. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Image Deraining via Self-supervised Reinforcement Learning + + +
+ The quality of images captured outdoors is often affected by the weather. One +factor that interferes with sight is rain, which can obstruct the view of +observers and computer vision applications that rely on those images. The work +aims to recover rain images by removing rain streaks via Self-supervised +Reinforcement Learning (RL) for image deraining (SRL-Derain). We locate rain +streak pixels from the input rain image via dictionary learning and use +pixel-wise RL agents to take multiple inpainting actions to remove rain +progressively. To our knowledge, this work is the first attempt where +self-supervised RL is applied to image deraining. Experimental results on +several benchmark image-deraining datasets show that the proposed SRL-Derain +performs favorably against state-of-the-art few-shot and self-supervised +deraining and denoising methods. + +
+
+
+
+
+ + ☆ Branch-Tuning: Balancing Stability and Plasticity for Continual + Self-Supervised Learning + + +
+ Self-supervised learning (SSL) has emerged as an effective paradigm for +deriving general representations from vast amounts of unlabeled data. However, +as real-world applications continually integrate new content, the high +computational and resource demands of SSL necessitate continual learning rather +than complete retraining. This poses a challenge in striking a balance between +stability and plasticity when adapting to new information. In this paper, we +employ Centered Kernel Alignment for quantitatively analyzing model stability +and plasticity, revealing the critical roles of batch normalization layers for +stability and convolutional layers for plasticity. Motivated by this, we +propose Branch-tuning, an efficient and straightforward method that achieves a +balance between stability and plasticity in continual SSL. Branch-tuning +consists of branch expansion and compression, and can be easily applied to +various SSL methods without the need of modifying the original methods, +retaining old data or models. We validate our method through incremental +experiments on various benchmark datasets, demonstrating its effectiveness and +practical value in real-world scenarios. We hope our work offers new insights +for future continual self-supervised learning research. The code will be made +publicly available. + +
+
+
+
+
+ + ☆ Toward Interactive Regional Understanding in Vision-Large Language + Models NAACL 2024 + + +
+ Recent Vision-Language Pre-training (VLP) models have demonstrated +significant advancements. Nevertheless, these models heavily rely on image-text +pairs that capture only coarse and global information of an image, leading to a +limitation in their regional understanding ability. In this work, we introduce +\textbf{RegionVLM}, equipped with explicit regional modeling capabilities, +allowing them to understand user-indicated image regions. To achieve this, we +design a simple yet innovative architecture, requiring no modifications to the +model architecture or objective function. Additionally, we leverage a dataset +that contains a novel source of information, namely Localized Narratives, which +has been overlooked in previous VLP research. Our experiments demonstrate that +our single generalist model not only achieves an interactive dialogue system +but also exhibits superior performance on various zero-shot region +understanding tasks, without compromising its ability for global image +understanding. + +
+
+ comment: NAACL 2024 Main Conference +
+
+
+
+
+ + ☆ Enhancing Generative Class Incremental Learning Performance with Model + Forgetting Approach + + +
+ This study presents a novel approach to Generative Class Incremental Learning
+(GCIL) by introducing a forgetting mechanism, aimed at dynamically managing
+class information for better adaptation to streaming data. GCIL is one of the
+hot topics in computer vision, and the continual learning of generative models
+in particular is considered a crucial task. The ability to forget is a crucial
+brain function that facilitates continual learning by selectively discarding
+less relevant information. However, in machine learning models, the concept of
+intentionally forgetting has not been extensively investigated. In this study
+we aim to bridge this gap by incorporating forgetting mechanisms into GCIL,
+thereby examining their impact on the models' ability to learn in continual
+learning. Through our experiments, we have found that integrating the
+forgetting mechanisms significantly enhances the models' performance in
+acquiring new knowledge, underscoring the positive role that strategic
+forgetting plays in the process of continual learning.
+
+
+
+
+
+ + ☆ Beyond Embeddings: The Promise of Visual Table in Multi-Modal Models + + +
+ Visual representation learning has been a cornerstone in computer vision, +evolving from supervised learning with human-annotated labels to aligning +image-text pairs from the Internet. Despite recent advancements in multi-modal +large language models (MLLMs), the visual representations they rely on, such as +CLIP embeddings, often lack access to external world knowledge critical for +real-world visual reasoning. In this work, we propose Visual Table, a novel +visual representation tailored for MLLMs. It provides hierarchical text +descriptions of holistic visual scenes, consisting of a scene description and +multiple object-centric descriptions that encompass categories, attributes, and +knowledge at instance level. We further develop a scalable generator for visual +table generation and train it on small-scale annotations from GPT4V. Extensive +evaluations demonstrate that, with generated visual tables as additional visual +representations, our model can consistently outperform the state-of-the-art +(SOTA) MLLMs across diverse benchmarks. When visual tables serve as standalone +visual representations, our model can closely match or even beat the SOTA MLLMs +that are built on CLIP visual embeddings. Our code is available at +https://github.com/LaVi-Lab/Visual-Table. + +
+
+ comment: Project page: https://github.com/LaVi-Lab/Visual-Table +
+
+
+
+
+ + ☆ NeuSDFusion: A Spatial-Aware Generative Model for 3D Shape Completion, + Reconstruction, and Generation + + +
+ 3D shape generation aims to produce innovative 3D content adhering to +specific conditions and constraints. Existing methods often decompose 3D shapes +into a sequence of localized components, treating each element in isolation +without considering spatial consistency. As a result, these approaches exhibit +limited versatility in 3D data representation and shape generation, hindering +their ability to generate highly diverse 3D shapes that comply with the +specified constraints. In this paper, we introduce a novel spatial-aware 3D +shape generation framework that leverages 2D plane representations for enhanced +3D shape modeling. To ensure spatial coherence and reduce memory usage, we +incorporate a hybrid shape representation technique that directly learns a +continuous signed distance field representation of the 3D shape using +orthogonal 2D planes. Additionally, we meticulously enforce spatial +correspondences across distinct planes using a transformer-based autoencoder +structure, promoting the preservation of spatial relationships in the generated +3D shapes. This yields an algorithm that consistently outperforms +state-of-the-art 3D shape generation methods on various tasks, including +unconditional shape generation, multi-modal shape completion, single-view +reconstruction, and text-to-shape synthesis. + +
+
+
+
+
+ + ☆ TAFormer: A Unified Target-Aware Transformer for Video and Motion Joint + Prediction in Aerial Scenes + + +
+ As drone technology advances, using unmanned aerial vehicles for aerial +surveys has become the dominant trend in modern low-altitude remote sensing. +The surge in aerial video data necessitates accurate prediction for future +scenarios and motion states of the interested target, particularly in +applications like traffic management and disaster response. Existing video +prediction methods focus solely on predicting future scenes (video frames), +suffering from the neglect of explicitly modeling target's motion states, which +is crucial for aerial video interpretation. To address this issue, we introduce +a novel task called Target-Aware Aerial Video Prediction, aiming to +simultaneously predict future scenes and motion states of the target. Further, +we design a model specifically for this task, named TAFormer, which provides a +unified modeling approach for both video and target motion states. +Specifically, we introduce Spatiotemporal Attention (STA), which decouples the +learning of video dynamics into spatial static attention and temporal dynamic +attention, effectively modeling the scene appearance and motion. Additionally, +we design an Information Sharing Mechanism (ISM), which elegantly unifies the +modeling of video and target motion by facilitating information interaction +through two sets of messenger tokens. Moreover, to alleviate the difficulty of +distinguishing targets in blurry predictions, we introduce Target-Sensitive +Gaussian Loss (TSGL), enhancing the model's sensitivity to both target's +position and content. Extensive experiments on UAV123VP and VisDroneVP (derived +from single-object tracking datasets) demonstrate the exceptional performance +of TAFormer in target-aware video prediction, showcasing its adaptability to +the additional requirements of aerial video interpretation for target +awareness. + +
+
+ comment: 17 pages, 9 figures +
+
+
+
+
+ + ☆ Benchmarking Image Transformers for Prostate Cancer Detection from + Ultrasound Data SP + + +
+ PURPOSE: Deep learning methods for classifying prostate cancer (PCa) in +ultrasound images typically employ convolutional networks (CNNs) to detect +cancer in small regions of interest (ROI) along a needle trace region. However, +this approach suffers from weak labelling, since the ground-truth +histopathology labels do not describe the properties of individual ROIs. +Recently, multi-scale approaches have sought to mitigate this issue by +combining the context awareness of transformers with a CNN feature extractor to +detect cancer from multiple ROIs using multiple-instance learning (MIL). In +this work, we present a detailed study of several image transformer +architectures for both ROI-scale and multi-scale classification, and a +comparison of the performance of CNNs and transformers for ultrasound-based +prostate cancer classification. We also design a novel multi-objective learning +strategy that combines both ROI and core predictions to further mitigate label +noise. METHODS: We evaluate 3 image transformers on ROI-scale cancer +classification, then use the strongest model to tune a multi-scale classifier +with MIL. We train our MIL models using our novel multi-objective learning +strategy and compare our results to existing baselines. RESULTS: We find that +for both ROI-scale and multi-scale PCa detection, image transformer backbones +lag behind their CNN counterparts. This deficit in performance is even more +noticeable for larger models. When using multi-objective learning, we can +improve performance of MIL, with a 77.9% AUROC, a sensitivity of 75.9%, and a +specificity of 66.3%. CONCLUSION: Convolutional networks are better suited for +modelling sparse datasets of prostate ultrasounds, producing more robust +features than transformers in PCa detection. Multi-scale methods remain the +best architecture for this task, with multi-objective learning presenting an +effective way to improve performance. + +
+
+ comment: early draft, 7 pages; Accepted to SPIE Medical Imaging 2024 +
+
+
+
+
+ + ☆ Fourier or Wavelet bases as counterpart self-attention in spikformer for + efficient visual classification + + +
+ The energy-efficient spikformer has been proposed by integrating the
+biologically plausible spiking neural network (SNN) with the artificial
+Transformer, whereby Spiking Self-Attention (SSA) is used to achieve both
+higher accuracy and lower computational cost. However, self-attention is not
+always necessary, especially in sparse spike-form computation. In this paper,
+we replace vanilla SSA (which uses dynamic bases calculated from Query and Key)
+with spike-form Fourier Transform, Wavelet Transform, and their combinations
+(which use fixed triangular or wavelet bases), based on the key hypothesis that
+both rely on a set of basis functions for information transformation. Hence,
+the Fourier-or-Wavelet-based spikformer (FWformer) is proposed and verified on
+visual classification tasks, including both static image and event-based video
+datasets. The FWformer can achieve comparable or even higher accuracies
+($0.4\%$-$1.5\%$), higher running speed ($9\%$-$51\%$ for training and
+$19\%$-$70\%$ for inference), reduced theoretical energy consumption
+($20\%$-$25\%$), and reduced GPU memory usage ($4\%$-$26\%$), compared to the
+standard spikformer. Our results indicate that the continued refinement of new
+Transformers, inspired either by biological discovery (spike form) or by
+information theory (Fourier or Wavelet Transform), is promising.
+
+
+
+ comment: 18 pages, 2 figures. arXiv admin note: substantial text overlap with + arXiv:2308.02557 +
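+ As a rough illustration of swapping self-attention for a fixed-basis
+transform, the sketch below mixes patch tokens with a 2D FFT in the style of
+FNet. The spiking neurons are omitted, and the block layout and sizes are
+assumptions rather than the FWformer architecture.
+
+ import torch
+ import torch.nn as nn
+
+ class FourierMixer(nn.Module):
+     """Token mixing with a fixed Fourier basis instead of learned attention."""
+     def forward(self, x):
+         # x: (batch, tokens, channels); keep only the real part, as in FNet
+         return torch.fft.fft(torch.fft.fft(x, dim=-1), dim=-2).real
+
+ class FourierEncoderBlock(nn.Module):
+     def __init__(self, dim, hidden_mult=4):
+         super().__init__()
+         self.mixer = FourierMixer()
+         self.norm1 = nn.LayerNorm(dim)
+         self.norm2 = nn.LayerNorm(dim)
+         self.mlp = nn.Sequential(
+             nn.Linear(dim, hidden_mult * dim),
+             nn.GELU(),
+             nn.Linear(hidden_mult * dim, dim),
+         )
+
+     def forward(self, x):
+         x = x + self.mixer(self.norm1(x))   # attention-free token mixing
+         x = x + self.mlp(self.norm2(x))     # position-wise feed-forward
+         return x
+
+ # toy usage: 8 images, 196 patch tokens, 192-dim embeddings
+ block = FourierEncoderBlock(dim=192)
+ tokens = torch.randn(8, 196, 192)
+ print(block(tokens).shape)  # torch.Size([8, 196, 192])
+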
+
+
+
+
+ + ☆ NeuroPictor: Refining fMRI-to-Image Reconstruction via Multi-individual + Pretraining and Multi-level Modulation + + +
+ Recent fMRI-to-image approaches have mainly focused on associating fMRI
+signals with specific conditions of pre-trained diffusion models. These
+approaches, while producing high-quality images, capture only a limited aspect
+of the complex information in fMRI signals and offer little detailed control
+over image creation. In contrast, this paper proposes to directly modulate the
+generation process of diffusion models using fMRI signals. Our approach,
+NeuroPictor, divides the fMRI-to-image process into three steps: i) fMRI
+calibrated-encoding, to tackle multi-individual pre-training for a shared
+latent space to minimize individual differences and enable the subsequent
+cross-subject training; ii) fMRI-to-image cross-subject pre-training,
+perceptually learning to guide the diffusion model with high- and low-level
+conditions across different individuals; iii) fMRI-to-image single-subject
+refining, similar to step ii but focused on adapting to a particular
+individual. NeuroPictor extracts high-level semantic features from fMRI signals
+that characterize the visual stimulus and incrementally fine-tunes the
+diffusion model with a low-level manipulation network to provide precise
+structural instructions. By training with over 60,000 fMRI-image pairs from
+various individuals, our model enjoys superior fMRI-to-image decoding capacity,
+particularly in the within-subject setting, as evidenced in benchmark datasets.
+Project page: https://jingyanghuo.github.io/neuropictor/.
+
+
+
+
+
+
+ + ☆ An Evolutionary Network Architecture Search Framework with Adaptive + Multimodal Fusion for Hand Gesture Recognition + + +
+ Hand gesture recognition (HGR) based on multimodal data has attracted
+considerable attention owing to its great potential in applications. Various
+manually designed multimodal deep networks have performed well in multimodal
+HGR (MHGR), but most existing algorithms require substantial expert experience
+and time-consuming manual trials. To address these issues, we propose an
+evolutionary network architecture search framework with adaptive multimodal
+fusion (AMF-ENAS). Specifically, we design an encoding space that
+simultaneously considers fusion positions and ratios of the multimodal data,
+allowing for the automatic construction of multimodal networks with different
+architectures through decoding. Additionally, we consider three input streams
+corresponding to intra-modal surface electromyography (sEMG), intra-modal
+accelerometer (ACC), and inter-modal sEMG-ACC. To automatically adapt to
+various datasets, the ENAS framework is designed to automatically search for an
+MHGR network with appropriate fusion positions and ratios. To the best of our
+knowledge, this is the first time that ENAS has been utilized in MHGR to tackle
+issues related to the fusion position and ratio of multimodal data.
+Experimental results demonstrate that AMF-ENAS achieves state-of-the-art
+performance on the Ninapro DB2, DB3, and DB7 datasets.
+
+
+
+
+
+
+ + ☆ Road Obstacle Detection based on Unknown Objectness Scores ICRA 2024 + + +
+ The detection of unknown traffic obstacles is vital to ensure safe autonomous
+driving. The standard object-detection methods cannot identify unknown objects
+that are not included under predefined categories. This is because
+object-detection methods are trained to assign a background label to pixels
+corresponding to the presence of unknown objects. To address this problem, the
+pixel-wise anomaly-detection approach has attracted increased research
+attention. Anomaly-detection techniques, such as uncertainty estimation and
+perceptual difference from reconstructed images, make it possible to identify
+pixels of unknown objects as out-of-distribution (OoD) samples. However, when
+applied to images with many unknowns and complex components, such as driving
+scenes, these methods often exhibit unstable performance. The purpose of this
+study is to achieve stable performance for detecting unknown objects by
+incorporating object-detection principles into pixel-wise anomaly-detection
+methods. To achieve this goal, we adopt a semantic-segmentation network with a
+sigmoid head that simultaneously provides pixel-wise anomaly scores and
+objectness scores. Our experimental results show that the objectness scores
+play an important role in improving the detection performance. Based on these
+results, we propose a novel anomaly score by integrating these two scores,
+which we term the unknown objectness score. Quantitative evaluations show that
+the proposed method outperforms state-of-the-art methods when applied to
+publicly available datasets.
+
+
+
+ comment: ICRA 2024 +
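+ One plausible way to combine the two cues above into an unknown objectness
+score is sketched below: a per-pixel anomaly score derived from the maximum
+sigmoid class score is multiplied by a separate objectness map. The product
+form and the score definitions are illustrative assumptions, not the paper's
+exact formulation.
+
+ import torch
+
+ def unknown_objectness_score(class_logits, objectness_logits):
+     """class_logits: (B, K, H, W) logits of a sigmoid (multi-label) seg head.
+     objectness_logits: (B, 1, H, W) logits of an objectness head.
+     Returns a (B, H, W) map that is high for object-like pixels that no
+     known class explains well (illustrative combination)."""
+     class_prob = torch.sigmoid(class_logits)
+     anomaly = 1.0 - class_prob.max(dim=1).values   # low max score -> anomalous
+     objectness = torch.sigmoid(objectness_logits).squeeze(1)
+     return anomaly * objectness
+
+ # toy usage
+ scores = unknown_objectness_score(torch.randn(2, 19, 64, 128),
+                                   torch.randn(2, 1, 64, 128))
+ print(scores.shape)  # torch.Size([2, 64, 128])
+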
+
+
+
+
+ + ☆ Few-shot Online Anomaly Detection and Segmentation + + +
+ Detecting anomaly patterns from images is a crucial artificial intelligence +technique in industrial applications. Recent research in this domain has +emphasized the necessity of a large volume of training data, overlooking the +practical scenario where, post-deployment of the model, unlabeled data +containing both normal and abnormal samples can be utilized to enhance the +model's performance. Consequently, this paper focuses on addressing the +challenging yet practical few-shot online anomaly detection and segmentation +(FOADS) task. Under the FOADS framework, models are trained on a few-shot +normal dataset, followed by inspection and improvement of their capabilities by +leveraging unlabeled streaming data containing both normal and abnormal samples +simultaneously. + To tackle this issue, we propose modeling the feature distribution of normal +images using a Neural Gas network, which offers the flexibility to adapt the +topology structure to identify outliers in the data flow. In order to achieve +improved performance with limited training samples, we employ multi-scale +feature embedding extracted from a CNN pre-trained on ImageNet to obtain a +robust representation. Furthermore, we introduce an algorithm that can +incrementally update parameters without the need to store previous samples. +Comprehensive experimental results demonstrate that our method can achieve +substantial performance under the FOADS setting, while ensuring that the time +complexity remains within an acceptable range on MVTec AD and BTAD datasets. + +
+
+
+
+
+ + ☆ Generative Medical Segmentation + + +
+ Rapid advancements in medical image segmentation performance have been +significantly driven by the development of Convolutional Neural Networks (CNNs) +and Vision Transformers (ViTs). However, these models introduce high +computational demands and often have limited ability to generalize across +diverse medical imaging datasets. In this manuscript, we introduce Generative +Medical Segmentation (GMS), a novel approach leveraging a generative model for +image segmentation. Concretely, GMS employs a robust pre-trained Variational +Autoencoder (VAE) to derive latent representations of both images and masks, +followed by a mapping model that learns the transition from image to mask in +the latent space. This process culminates in generating a precise segmentation +mask within the image space using the pre-trained VAE decoder. The design of +GMS leads to fewer learnable parameters in the model, resulting in a reduced +computational burden and enhanced generalization capability. Our extensive +experimental analysis across five public datasets in different medical imaging +domains demonstrates GMS outperforms existing discriminative segmentation +models and has remarkable domain generalization. Our experiments suggest GMS +could set a new benchmark for medical image segmentation, offering a scalable +and effective solution. GMS implementation and model weights are available at +https://github.com/King-HAW/GMS. + +
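+ The idea of training only a latent-to-latent mapping around a frozen VAE can
+be approximated as in the sketch below; the mapper design and the stand-in
+encoder/decoder are placeholder assumptions rather than the released GMS
+implementation.
+
+ import torch
+ import torch.nn as nn
+
+ class LatentMapper(nn.Module):
+     """Small trainable network mapping image latents to mask latents."""
+     def __init__(self, channels=4):
+         super().__init__()
+         self.net = nn.Sequential(
+             nn.Conv2d(channels, 64, 3, padding=1), nn.GELU(),
+             nn.Conv2d(64, 64, 3, padding=1), nn.GELU(),
+             nn.Conv2d(64, channels, 3, padding=1),
+         )
+
+     def forward(self, z):
+         return self.net(z)
+
+ def segment(image, encoder, decoder, mapper):
+     """encoder/decoder stand in for a frozen pre-trained VAE (assumption);
+     only `mapper` is trained. Gradients still flow through the frozen decoder."""
+     with torch.no_grad():
+         z_img = encoder(image)
+     mask_logits = decoder(mapper(z_img))
+     return torch.sigmoid(mask_logits)
+
+ # toy usage with stand-in frozen modules (placeholders, not a real VAE)
+ encoder = nn.Conv2d(3, 4, kernel_size=8, stride=8)
+ decoder = nn.ConvTranspose2d(4, 1, kernel_size=8, stride=8)
+ for p in list(encoder.parameters()) + list(decoder.parameters()):
+     p.requires_grad_(False)
+ mapper = LatentMapper(channels=4)
+ mask = segment(torch.rand(2, 3, 256, 256), encoder, decoder, mapper)
+ print(mask.shape)  # torch.Size([2, 1, 256, 256])
+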
+
+
+
+
+ + ☆ Looking Beyond What You See: An Empirical Analysis on Subgroup + Intersectional Fairness for Multi-label Chest X-ray Classification Using + Social Determinants of Racial Health Inequities ICCV + + +
+ There has been significant progress in implementing deep learning models in
+disease diagnosis using chest X-rays. Despite these advancements, inherent
+biases in these models can lead to disparities in prediction accuracy across
+protected groups. In this study, we propose a framework to achieve accurate
+diagnostic outcomes and ensure fairness across intersectional groups in
+high-dimensional chest X-ray multi-label classification. Transcending
+traditional protected attributes, we consider complex interactions within
+social determinants, enabling a more granular benchmark and evaluation of
+fairness. We present a simple and robust method that involves retraining the
+last classification layer of pre-trained models using a balanced dataset across
+groups. Additionally, we account for fairness constraints and integrate
+class-balanced fine-tuning for multi-label settings. The evaluation of our
+method on the MIMIC-CXR dataset demonstrates that our framework achieves an
+optimal tradeoff between accuracy and fairness compared to baseline methods.
+
+
+
+ comment: ICCV CVAMD 2023 +
+
+
+
+
+ + ☆ Middle Fusion and Multi-Stage, Multi-Form Prompts for Robust RGB-T + Tracking + + +
+ RGB-T tracking, a vital downstream task of object tracking, has made +remarkable progress in recent years. Yet, it remains hindered by two major +challenges: 1) the trade-off between performance and efficiency; 2) the +scarcity of training data. To address the latter challenge, some recent methods +employ prompts to fine-tune pre-trained RGB tracking models and leverage +upstream knowledge in a parameter-efficient manner. However, these methods +inadequately explore modality-independent patterns and disregard the dynamic +reliability of different modalities in open scenarios. We propose M3PT, a novel +RGB-T prompt tracking method that leverages middle fusion and multi-modal and +multi-stage visual prompts to overcome these challenges. We pioneer the use of +the middle fusion framework for RGB-T tracking, which achieves a balance +between performance and efficiency. Furthermore, we incorporate the pre-trained +RGB tracking model into the framework and utilize multiple flexible prompt +strategies to adapt the pre-trained model to the comprehensive exploration of +uni-modal patterns and the improved modeling of fusion-modal features, +harnessing the potential of prompt learning in RGB-T tracking. Our method +outperforms the state-of-the-art methods on four challenging benchmarks, while +attaining 46.1 fps inference speed. + +
+
+
+
+
+ + ☆ LayoutFlow: Flow Matching for Layout Generation + + +
+ Finding a suitable layout represents a crucial task for diverse applications +in graphic design. Motivated by simpler and smoother sampling trajectories, we +explore the use of Flow Matching as an alternative to current diffusion-based +layout generation models. Specifically, we propose LayoutFlow, an efficient +flow-based model capable of generating high-quality layouts. Instead of +progressively denoising the elements of a noisy layout, our method learns to +gradually move, or flow, the elements of an initial sample until it reaches its +final prediction. In addition, we employ a conditioning scheme that allows us +to handle various generation tasks with varying degrees of conditioning with a +single model. Empirically, LayoutFlow performs on par with state-of-the-art +models while being significantly faster. + +
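+ A minimal conditional flow-matching training step for layouts might look like
+the sketch below, which samples a point on the straight path between noise and
+data and regresses the constant velocity. The element encoding and the network
+are simplified assumptions, not the LayoutFlow model.
+
+ import torch
+ import torch.nn as nn
+
+ class VelocityNet(nn.Module):
+     """Predicts the velocity field v(x_t, t) for a set of layout elements."""
+     def __init__(self, elem_dim=5, hidden=128):
+         super().__init__()
+         self.net = nn.Sequential(
+             nn.Linear(elem_dim + 1, hidden), nn.SiLU(),
+             nn.Linear(hidden, hidden), nn.SiLU(),
+             nn.Linear(hidden, elem_dim),
+         )
+
+     def forward(self, x, t):
+         # x: (B, N, D) layout elements, t: (B,) in [0, 1]
+         t = t.view(-1, 1, 1).expand(-1, x.shape[1], 1)
+         return self.net(torch.cat([x, t], dim=-1))
+
+ def flow_matching_loss(model, layouts):
+     """Conditional flow matching with linear (rectified-flow style) paths."""
+     noise = torch.randn_like(layouts)                   # x_0 ~ N(0, I)
+     t = torch.rand(layouts.shape[0], device=layouts.device)
+     x_t = (1 - t.view(-1, 1, 1)) * noise + t.view(-1, 1, 1) * layouts
+     target_v = layouts - noise                          # d x_t / d t
+     return ((model(x_t, t) - target_v) ** 2).mean()
+
+ # toy usage: 16 layouts, 8 elements each, (x, y, w, h, class) per element
+ model = VelocityNet()
+ layouts = torch.rand(16, 8, 5)
+ loss = flow_matching_loss(model, layouts)
+ loss.backward()
+ print(loss.item())
+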
+
+
+
+
+ + ☆ Don't Look into the Dark: Latent Codes for Pluralistic Image Inpainting + + +
+ We present a method for large-mask pluralistic image inpainting based on the
+generative framework of discrete latent codes. Our method learns latent priors,
+discretized as tokens, by only performing computations at the visible locations
+of the image. This is realized by a restrictive partial encoder that predicts
+the token label for each visible block, a bidirectional transformer that infers
+the missing labels by only looking at these tokens, and a dedicated synthesis
+network that couples the tokens with the partial image priors to generate
+coherent and pluralistic complete images even under extreme mask settings.
+Experiments on public benchmarks validate our design choices as the proposed
+method outperforms strong baselines in both visual quality and diversity
+metrics.
+
+
+
+ comment: CVPR 2024
+
+
+
+
+
+ + ☆ Multi-Layer Dense Attention Decoder for Polyp Segmentation + + +
+ Detecting and segmenting polyps is crucial for expediting the diagnosis of +colon cancer. This is a challenging task due to the large variations of polyps +in color, texture, and lighting conditions, along with subtle differences +between the polyp and its surrounding area. Recently, vision Transformers have +shown robust abilities in modeling global context for polyp segmentation. +However, they face two major limitations: the inability to learn local +relations among multi-level layers and inadequate feature aggregation in the +decoder. To address these issues, we propose a novel decoder architecture aimed +at hierarchically aggregating locally enhanced multi-level dense features. +Specifically, we introduce a novel module named Dense Attention Gate (DAG), +which adaptively fuses all previous layers' features to establish local feature +relations among all layers. Furthermore, we propose a novel nested decoder +architecture that hierarchically aggregates decoder features, thereby enhancing +semantic features. We incorporate our novel dense decoder with the PVT backbone +network and conduct evaluations on five polyp segmentation datasets: Kvasir, +CVC-300, CVC-ColonDB, CVC-ClinicDB, and ETIS. Our experiments and comparisons +with nine competing segmentation models demonstrate that the proposed +architecture achieves state-of-the-art performance and outperforms the previous +models on four datasets. The source code is available at: +https://github.com/krushi1992/Dense-Decoder. + +
+
+
+
+
+ + ☆ Online Embedding Multi-Scale CLIP Features into 3D Maps + + +
+ This study introduces a novel approach to online embedding of multi-scale
+CLIP (Contrastive Language-Image Pre-Training) features into 3D maps. By
+harnessing CLIP, this methodology surpasses the constraints of conventional
+vocabulary-limited methods and enables the incorporation of semantic
+information into the resultant maps. While recent approaches have explored the
+embedding of multi-modal features in maps, they often impose significant
+computational costs, lacking practicality for exploring unfamiliar environments
+in real time. Our approach tackles these challenges by efficiently computing
+and embedding multi-scale CLIP features, thereby facilitating the exploration
+of unfamiliar environments through real-time map generation. Moreover,
+embedding the CLIP features into the resultant maps makes offline retrieval via
+linguistic queries feasible. In essence, our approach simultaneously achieves
+real-time object search and mapping of unfamiliar environments. Additionally,
+we propose a zero-shot object-goal navigation system based on our mapping
+approach, and we validate its efficacy through object-goal navigation, offline
+object retrieval, and multi-object-goal navigation in both simulated
+environments and real robot experiments. The findings demonstrate that our
+method not only exhibits swifter performance than state-of-the-art mapping
+methods but also surpasses them in terms of the success rate of object-goal
+navigation tasks.
+
+
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ LITA: Language Instructed Temporal-Localization Assistant + + +
+ There has been tremendous progress in multimodal Large Language Models +(LLMs). Recent works have extended these models to video input with promising +instruction following capabilities. However, an important missing piece is +temporal localization. These models cannot accurately answer the "When?" +questions. We identify three key aspects that limit their temporal localization +capabilities: (i) time representation, (ii) architecture, and (iii) data. We +address these shortcomings by proposing Language Instructed +Temporal-Localization Assistant (LITA) with the following features: (1) We +introduce time tokens that encode timestamps relative to the video length to +better represent time in videos. (2) We introduce SlowFast tokens in the +architecture to capture temporal information at fine temporal resolution. (3) +We emphasize temporal localization data for LITA. In addition to leveraging +existing video datasets with timestamps, we propose a new task, Reasoning +Temporal Localization (RTL), along with the dataset, ActivityNet-RTL, for +learning and evaluating this task. Reasoning temporal localization requires +both the reasoning and temporal localization of Video LLMs. LITA demonstrates +strong performance on this challenging task, nearly doubling the temporal mean +intersection-over-union (mIoU) of baselines. In addition, we show that our +emphasis on temporal localization also substantially improves video-based text +generation compared to existing Video LLMs, including a 36% relative +improvement of Temporal Understanding. Code is available at: +https://github.com/NVlabs/LITA + +
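+ Time tokens that encode timestamps relative to video length can be sketched
+as quantisation into a fixed vocabulary; the 100-token budget and the
+bin-centre decoding below are assumptions rather than LITA's exact scheme.
+
+ def timestamp_to_token(t_seconds, video_length, num_time_tokens=100):
+     """Map an absolute timestamp to a discrete relative time token id."""
+     frac = min(max(t_seconds / video_length, 0.0), 1.0)
+     return min(int(frac * num_time_tokens), num_time_tokens - 1)
+
+ def token_to_timestamp(token_id, video_length, num_time_tokens=100):
+     """Invert the mapping to the centre of the token's time bin."""
+     return (token_id + 0.5) / num_time_tokens * video_length
+
+ # toy usage: a 90-second video
+ tid = timestamp_to_token(45.0, 90.0)
+ print(tid, round(token_to_timestamp(tid, 90.0), 2))  # 50 45.45
+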
+
+
+
+
+ + ☆ Illicit object detection in X-ray images using Vision Transformers + + +
+ Illicit object detection is a critical task performed at various +high-security locations, including airports, train stations, subways, and +ports. The continuous and tedious work of examining thousands of X-ray images +per hour can be mentally taxing. Thus, Deep Neural Networks (DNNs) can be used +to automate the X-ray image analysis process, improve efficiency and alleviate +the security officers' inspection burden. The neural architectures typically +utilized in relevant literature are Convolutional Neural Networks (CNNs), with +Vision Transformers (ViTs) rarely employed. In order to address this gap, this +paper conducts a comprehensive evaluation of relevant ViT architectures on +illicit item detection in X-ray images. This study utilizes both Transformer +and hybrid backbones, such as SWIN and NextViT, and detectors, such as DINO and +RT-DETR. The results demonstrate the remarkable accuracy of the DINO +Transformer detector in the low-data regime, the impressive real-time +performance of YOLOv8, and the effectiveness of the hybrid NextViT backbone. + +
+
+
+
+
+ + ☆ Egocentric Scene-aware Human Trajectory Prediction + + +
+ Wearable collaborative robots stand to assist human wearers who need fall +prevention assistance or wear exoskeletons. Such a robot needs to be able to +predict the ego motion of the wearer based on egocentric vision and the +surrounding scene. In this work, we leveraged body-mounted cameras and sensors +to anticipate the trajectory of human wearers through complex surroundings. To +facilitate research in ego-motion prediction, we have collected a comprehensive +walking scene navigation dataset centered on the user's perspective. We present +a method to predict human motion conditioning on the surrounding static scene. +Our method leverages a diffusion model to produce a distribution of potential +future trajectories, taking into account the user's observation of the +environment. We introduce a compact representation to encode the user's visual +memory of the surroundings, as well as an efficient sample-generating technique +to speed up real-time inference of a diffusion model. We ablate our model and +compare it to baselines, and results show that our model outperforms existing +methods on key metrics of collision avoidance and trajectory mode coverage. + +
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ☆ WALT3D: Generating Realistic Training Data from Time-Lapse Imagery for + Reconstructing Dynamic Objects under Occlusion CVPR 2024 + + +
+ Current methods for 2D and 3D object understanding struggle with severe +occlusions in busy urban environments, partly due to the lack of large-scale +labeled ground-truth annotations for learning occlusion. In this work, we +introduce a novel framework for automatically generating a large, realistic +dataset of dynamic objects under occlusions using freely available time-lapse +imagery. By leveraging off-the-shelf 2D (bounding box, segmentation, keypoint) +and 3D (pose, shape) predictions as pseudo-groundtruth, unoccluded 3D objects +are identified automatically and composited into the background in a clip-art +style, ensuring realistic appearances and physically accurate occlusion +configurations. The resulting clip-art image with pseudo-groundtruth enables +efficient training of object reconstruction methods that are robust to +occlusions. Our method demonstrates significant improvements in both 2D and 3D +reconstruction, particularly in scenarios with heavily occluded objects like +vehicles and people in urban scenes. + +
+
+ comment: To appear in CVPR 2024 +
+
+
+
+
+ + ☆ Robust Active Speaker Detection in Noisy Environments + + +
+ This paper addresses the issue of active speaker detection (ASD) in noisy +environments and formulates a robust active speaker detection (rASD) problem. +Existing ASD approaches leverage both audio and visual modalities, but +non-speech sounds in the surrounding environment can negatively impact +performance. To overcome this, we propose a novel framework that utilizes +audio-visual speech separation as guidance to learn noise-free audio features. +These features are then utilized in an ASD model, and both tasks are jointly +optimized in an end-to-end framework. Our proposed framework mitigates residual +noise and audio quality reduction issues that can occur in a naive cascaded +two-stage framework that directly uses separated speech for ASD, and enables +the two tasks to be optimized simultaneously. To further enhance the robustness +of the audio features and handle inherent speech noises, we propose a dynamic +weighted loss approach to train the speech separator. We also collected a +real-world noise audio dataset to facilitate investigations. Experiments +demonstrate that non-speech audio noises significantly impact ASD models, and +our proposed approach improves ASD performance in noisy environments. The +framework is general and can be applied to different ASD approaches to improve +their robustness. Our code, models, and data will be released. + +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+
+ ☆ Cross-domain Fiber Cluster Shape Analysis for Language Performance
+ Cognitive Score Prediction
+
+
+ Shape plays an important role in computer graphics, offering informative
+features to convey an object's morphology and functionality. Shape analysis in
+brain imaging can help interpret structural and functional correlations of the
+human brain. In this work, we investigate the shape of the brain's 3D white
+matter connections and its potential predictive relationship to human cognitive
+function. We reconstruct brain connections as sequences of 3D points using
+diffusion magnetic resonance imaging (dMRI) tractography. To describe each
+connection, we extract 12 shape descriptors in addition to traditional dMRI
+connectivity and tissue microstructure features. We introduce a novel
+framework, Shape-fused Fiber Cluster Transformer (SFFormer), that leverages a
+multi-head cross-attention feature fusion module to predict subject-specific
+language performance based on dMRI tractography. We assess the performance of
+the method on a large dataset including 1065 healthy young adults. The results
+demonstrate that both the transformer-based SFFormer model and its inter/intra
+feature fusion with shape, microstructure, and connectivity are informative,
+and together, they improve the prediction of subject-specific language
+performance scores. Overall, our results indicate that the shape of the brain's
+connections is predictive of human language function.
+
+
+
+ comment: 2 figures, 11 pages +
+
+
+
+
+ + ☆ Envisioning MedCLIP: A Deep Dive into Explainability for Medical + Vision-Language Models + + +
+ Explaining Deep Learning models is becoming increasingly important in the +face of daily emerging multimodal models, particularly in safety-critical +domains like medical imaging. However, the lack of detailed investigations into +the performance of explainability methods on these models is widening the gap +between their development and safe deployment. In this work, we analyze the +performance of various explainable AI methods on a vision-language model, +MedCLIP, to demystify its inner workings. We also provide a simple methodology +to overcome the shortcomings of these methods. Our work offers a different new +perspective on the explainability of a recent well-known VLM in the medical +domain and our assessment method is generalizable to other current and possible +future VLMs. + +
+
+
+
+
+ + ☆ Robustness and Visual Explanation for Black Box Image, Video, and ECG + Signal Classification with Reinforcement Learning AAAI + + +
+ We present a generic Reinforcement Learning (RL) framework optimized for
+crafting adversarial attacks on different model types spanning ECG signal
+analysis (1D), image classification (2D), and video classification (3D). The
+framework focuses on identifying sensitive regions and inducing
+misclassifications with minimal distortions and various distortion types. The
+novel RL method outperforms state-of-the-art methods for all three
+applications, proving its efficiency. Our RL approach produces superior
+localization masks, enhancing interpretability for image classification and ECG
+analysis models. For applications such as ECG analysis, our platform highlights
+critical ECG segments for clinicians while ensuring resilience against
+prevalent distortions. This comprehensive tool aims to bolster both resilience
+(through adversarial training) and transparency across varied applications and
+data types.
+
+
+
+ comment: AAAI Proceedings reference: + https://ojs.aaai.org/index.php/AAAI/article/view/30579 +
+
+
+
+
+ + ☆ TextCraftor: Your Text Encoder Can be Image Quality Controller + + +
+ Diffusion-based text-to-image generative models, e.g., Stable Diffusion, have +revolutionized the field of content generation, enabling significant +advancements in areas like image editing and video synthesis. Despite their +formidable capabilities, these models are not without their limitations. It is +still challenging to synthesize an image that aligns well with the input text, +and multiple runs with carefully crafted prompts are required to achieve +satisfactory results. To mitigate these limitations, numerous studies have +endeavored to fine-tune the pre-trained diffusion models, i.e., UNet, utilizing +various technologies. Yet, amidst these efforts, a pivotal question of +text-to-image diffusion model training has remained largely unexplored: Is it +possible and feasible to fine-tune the text encoder to improve the performance +of text-to-image diffusion models? Our findings reveal that, instead of +replacing the CLIP text encoder used in Stable Diffusion with other large +language models, we can enhance it through our proposed fine-tuning approach, +TextCraftor, leading to substantial improvements in quantitative benchmarks and +human assessments. Interestingly, our technique also empowers controllable +image generation through the interpolation of different text encoders +fine-tuned with various rewards. We also demonstrate that TextCraftor is +orthogonal to UNet finetuning, and can be combined to further improve +generative quality. + +
+
+
+
+
+ + ☆ Lift3D: Zero-Shot Lifting of Any 2D Vision Model to 3D CVPR + + +
+ In recent years, there has been an explosion of 2D vision models for numerous +tasks such as semantic segmentation, style transfer or scene editing, enabled +by large-scale 2D image datasets. At the same time, there has been renewed +interest in 3D scene representations such as neural radiance fields from +multi-view images. However, the availability of 3D or multiview data is still +substantially limited compared to 2D image datasets, making extending 2D vision +models to 3D data highly desirable but also very challenging. Indeed, extending +a single 2D vision operator like scene editing to 3D typically requires a +highly creative method specialized to that task and often requires per-scene +optimization. In this paper, we ask the question of whether any 2D vision model +can be lifted to make 3D consistent predictions. We answer this question in the +affirmative; our new Lift3D method trains to predict unseen views on feature +spaces generated by a few visual models (i.e. DINO and CLIP), but then +generalizes to novel vision operators and tasks, such as style transfer, +super-resolution, open vocabulary segmentation and image colorization; for some +of these tasks, there is no comparable previous 3D method. In many cases, we +even outperform state-of-the-art methods specialized for the task in question. +Moreover, Lift3D is a zero-shot method, in the sense that it requires no +task-specific training, nor scene-specific optimization. + +
+
+ comment: Computer Vision and Pattern Recognition Conference (CVPR), 2024 +
+
+
+
+
+ + ☆ SMOF: Streaming Modern CNNs on FPGAs with Smart Off-Chip Eviction + + +
+ Convolutional Neural Networks (CNNs) have demonstrated their effectiveness in +numerous vision tasks. However, their high processing requirements necessitate +efficient hardware acceleration to meet the application's performance targets. +In the space of FPGAs, streaming-based dataflow architectures are often adopted +by users, as significant performance gains can be achieved through layer-wise +pipelining and reduced off-chip memory access by retaining data on-chip. +However, modern topologies, such as the UNet, YOLO, and X3D models, utilise +long skip connections, requiring significant on-chip storage and thus limiting +the performance achieved by such system architectures. The paper addresses the +above limitation by introducing weight and activation eviction mechanisms to +off-chip memory along the computational pipeline, taking into account the +available compute and memory resources. The proposed mechanism is incorporated +into an existing toolflow, expanding the design space by utilising off-chip +memory as a buffer. This enables the mapping of such modern CNNs to devices +with limited on-chip memory, under the streaming architecture design approach. +SMOF has demonstrated the capacity to deliver competitive and, in some cases, +state-of-the-art performance across a spectrum of computer vision tasks, +achieving up to 10.65 X throughput improvement compared to previous works. + +
+
+ comment: 12 pages, 8 figures, 5 tables +
+
+
+
+
+ + ☆ CPR: Retrieval Augmented Generation for Copyright Protection CVPR 2024 + + +
+ Retrieval Augmented Generation (RAG) is emerging as a flexible and robust
+technique to adapt models to private user data without training, to handle
+credit attribution, and to allow efficient machine unlearning at scale.
+However, RAG techniques for image generation may lead to parts of the retrieved
+samples being copied in the model's output. To reduce risks of leaking private
+information contained in the retrieved set, we introduce Copy-Protected
+generation with Retrieval (CPR), a new method for RAG with strong copyright
+protection guarantees in a mixed-private setting for diffusion models. CPR
+allows conditioning the output of diffusion models on a set of retrieved
+images, while also guaranteeing that uniquely identifiable information about
+those examples is not exposed in the generated outputs. In particular, it does
+so by sampling from a mixture of a public (safe) distribution and a private
+(user) distribution by merging their diffusion scores at inference. We prove
+that CPR satisfies Near Access Freeness (NAF), which bounds the amount of
+information an attacker may be able to extract from the generated images. We
+provide two algorithms for copyright protection, CPR-KL and CPR-Choose. Unlike
+previously proposed rejection-sampling-based NAF methods, our methods enable
+efficient copyright-protected sampling with a single run of backward diffusion.
+We show that our method can be applied to any pre-trained conditional diffusion
+model, such as Stable Diffusion or unCLIP. In particular, we empirically show
+that applying CPR on top of unCLIP improves quality and text-to-image alignment
+of the generated results (81.4 to 83.17 on the TIFA benchmark), while enabling
+credit attribution, copyright protection, and deterministic, constant-time
+unlearning.
+
+
+
+ comment: CVPR 2024 +
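+ Merging the diffusion scores of a public and a private (retrieval-conditioned)
+model at inference can be sketched as a weighted blend of their noise
+predictions inside a standard reverse step. The fixed mixing weight and the
+model interfaces below are illustrative assumptions, not the CPR-KL or
+CPR-Choose algorithms.
+
+ import torch
+
+ def mixed_eps(x_t, t, public_eps, private_eps, lam=0.5):
+     """Blend noise predictions from a public model and a retrieval-conditioned
+     private model. `public_eps` / `private_eps` are callables (x_t, t) -> eps."""
+     return lam * public_eps(x_t, t) + (1.0 - lam) * private_eps(x_t, t)
+
+ def ddim_like_update(x_t, eps, alpha_bar_t, alpha_bar_prev):
+     """One deterministic (DDIM-style, eta=0) reverse step using the blended eps."""
+     x0_pred = (x_t - (1 - alpha_bar_t).sqrt() * eps) / alpha_bar_t.sqrt()
+     return alpha_bar_prev.sqrt() * x0_pred + (1 - alpha_bar_prev).sqrt() * eps
+
+ # toy usage with stand-in score functions (placeholders, not real models)
+ public_eps = lambda x, t: 0.1 * x
+ private_eps = lambda x, t: -0.05 * x
+ x = torch.randn(1, 3, 32, 32)
+ eps = mixed_eps(x, 500, public_eps, private_eps, lam=0.6)
+ x = ddim_like_update(x, eps, torch.tensor(0.5), torch.tensor(0.7))
+ print(x.shape)  # torch.Size([1, 3, 32, 32])
+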
+
+
+
+
+ + ☆ PLOT-TAL -- Prompt Learning with Optimal Transport for Few-Shot Temporal + Action Localization + + +
+ This paper introduces a novel approach to temporal action localization (TAL) +in few-shot learning. Our work addresses the inherent limitations of +conventional single-prompt learning methods that often lead to overfitting due +to the inability to generalize across varying contexts in real-world videos. +Recognizing the diversity of camera views, backgrounds, and objects in videos, +we propose a multi-prompt learning framework enhanced with optimal transport. +This design allows the model to learn a set of diverse prompts for each action, +capturing general characteristics more effectively and distributing the +representation to mitigate the risk of overfitting. Furthermore, by employing +optimal transport theory, we efficiently align these prompts with action +features, optimizing for a comprehensive representation that adapts to the +multifaceted nature of video data. Our experiments demonstrate significant +improvements in action localization accuracy and robustness in few-shot +settings on the standard challenging datasets of THUMOS-14 and EpicKitchens100, +highlighting the efficacy of our multi-prompt optimal transport approach in +overcoming the challenges of conventional few-shot TAL methods. + +
+
+ comment: Under Review +
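+ Aligning a set of prompts with action features via optimal transport usually
+comes down to Sinkhorn iterations on a cost matrix. Below is a generic
+entropic-OT sketch with a cosine-distance cost; the shapes and hyperparameters
+are assumptions rather than the paper's settings.
+
+ import torch
+
+ def sinkhorn(cost, epsilon=0.1, iters=50):
+     """Entropic OT between uniform marginals. cost: (M, N), non-negative."""
+     m, n = cost.shape
+     mu = torch.full((m,), 1.0 / m)
+     nu = torch.full((n,), 1.0 / n)
+     K = torch.exp(-cost / epsilon)
+     u = torch.ones(m)
+     for _ in range(iters):
+         v = nu / (K.t() @ u)
+         u = mu / (K @ v)
+     return u.unsqueeze(1) * K * v.unsqueeze(0)   # transport plan (M, N)
+
+ def prompt_alignment_cost(prompts, features):
+     """Cosine-distance cost between M prompt embeddings and N feature vectors."""
+     p = torch.nn.functional.normalize(prompts, dim=-1)
+     f = torch.nn.functional.normalize(features, dim=-1)
+     return 1.0 - p @ f.t()
+
+ # toy usage: 4 prompts per action, 16 temporal feature vectors, dim 512
+ prompts = torch.randn(4, 512)
+ features = torch.randn(16, 512)
+ cost = prompt_alignment_cost(prompts, features)
+ plan = sinkhorn(cost)
+ ot_distance = (plan * cost).sum()
+ print(plan.shape, ot_distance.item())
+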
+
+
+
+
+ + ☆ UniDepth: Universal Monocular Metric Depth Estimation + + +
+ Accurate monocular metric depth estimation (MMDE) is crucial to solving +downstream tasks in 3D perception and modeling. However, the remarkable +accuracy of recent MMDE methods is confined to their training domains. These +methods fail to generalize to unseen domains even in the presence of moderate +domain gaps, which hinders their practical applicability. We propose a new +model, UniDepth, capable of reconstructing metric 3D scenes from solely single +images across domains. Departing from the existing MMDE methods, UniDepth +directly predicts metric 3D points from the input image at inference time +without any additional information, striving for a universal and flexible MMDE +solution. In particular, UniDepth implements a self-promptable camera module +predicting dense camera representation to condition depth features. Our model +exploits a pseudo-spherical output representation, which disentangles camera +and depth representations. In addition, we propose a geometric invariance loss +that promotes the invariance of camera-prompted depth features. Thorough +evaluations on ten datasets in a zero-shot regime consistently demonstrate the +superior performance of UniDepth, even when compared with methods directly +trained on the testing domains. Code and models are available at: +https://github.com/lpiccinelli-eth/unidepth + +
+
+
+
+
+ + ☆ A Geometric Explanation of the Likelihood OOD Detection Paradox + + +
+ Likelihood-based deep generative models (DGMs) commonly exhibit a puzzling +behaviour: when trained on a relatively complex dataset, they assign higher +likelihood values to out-of-distribution (OOD) data from simpler sources. +Adding to the mystery, OOD samples are never generated by these DGMs despite +having higher likelihoods. This two-pronged paradox has yet to be conclusively +explained, making likelihood-based OOD detection unreliable. Our primary +observation is that high-likelihood regions will not be generated if they +contain minimal probability mass. We demonstrate how this seeming contradiction +of large densities yet low probability mass can occur around data confined to +low-dimensional manifolds. We also show that this scenario can be identified +through local intrinsic dimension (LID) estimation, and propose a method for +OOD detection which pairs the likelihoods and LID estimates obtained from a +pre-trained DGM. Our method can be applied to normalizing flows and score-based +diffusion models, and obtains results which match or surpass state-of-the-art +OOD detection benchmarks using the same DGM backbones. Our code is available at +https://github.com/layer6ai-labs/dgm_ood_detection. + +
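+ The pairing of likelihoods with local intrinsic dimension can be illustrated
+with the classical Levina-Bickel kNN estimator of LID combined with a model
+log-likelihood. The linear combination used as the OOD score below is a
+placeholder, not the paper's detector.
+
+ import numpy as np
+ from sklearn.neighbors import NearestNeighbors
+
+ def lid_mle(x, reference, k=20):
+     """Levina-Bickel MLE of local intrinsic dimension for each row of x,
+     using k nearest neighbours drawn from `reference`."""
+     nn = NearestNeighbors(n_neighbors=k).fit(reference)
+     dists, _ = nn.kneighbors(x)                 # (N, k), sorted ascending
+     dists = np.maximum(dists, 1e-12)
+     ratios = np.log(dists[:, -1:] / dists[:, :-1])
+     return (k - 1) / ratios.sum(axis=1)
+
+ def ood_score(log_likelihood, lid, ll_weight=1.0, lid_weight=1.0):
+     """Higher score = more likely OOD. High likelihood paired with very low
+     LID is treated as suspicious (illustrative combination only)."""
+     return ll_weight * log_likelihood - lid_weight * lid
+
+ # toy usage with synthetic features and made-up log-likelihoods
+ rng = np.random.default_rng(0)
+ train_feats = rng.normal(size=(1000, 16))
+ test_feats = rng.normal(size=(5, 16))
+ fake_ll = rng.normal(size=5)
+ print(ood_score(fake_ll, lid_mle(test_feats, train_feats)))
+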
+
+
+
+
+ + ☆ Enhancing Multiple Object Tracking Accuracy via Quantum Annealing + + +
+ Multiple object tracking (MOT), a key task in image recognition, presents a +persistent challenge in balancing processing speed and tracking accuracy. This +study introduces a novel approach that leverages quantum annealing (QA) to +expedite computation speed, while enhancing tracking accuracy through the +ensembling of object tracking processes. A method to improve the matching +integration process is also proposed. By utilizing the sequential nature of +MOT, this study further augments the tracking method via reverse annealing +(RA). Experimental validation confirms the maintenance of high accuracy with an +annealing time of a mere 3 $\mu$s per tracking process. The proposed method +holds significant potential for real-time MOT applications, including traffic +flow measurement for urban traffic light control, collision prediction for +autonomous robots and vehicles, and management of products mass-produced in +factories. + +
+
+ comment: 19 pages, 15 figures
+
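+ The detection-to-track assignment at the heart of MOT can be written as a
+QUBO, the form consumed by a quantum annealer. The sketch below builds such a
+QUBO from an assignment cost matrix and solves it by brute force as a classical
+stand-in for the annealer; the penalty weight and cost values are assumptions.
+
+ import itertools
+ import numpy as np
+
+ def assignment_qubo(cost, penalty=10.0):
+     """QUBO for one-to-one matching of tracks to detections (assumes an equal
+     number of both). x[i, j] = 1 means track i is matched to detection j.
+     Energy = sum(cost * x) + penalty * sum_i (row_i - 1)^2
+                            + penalty * sum_j (col_j - 1)^2 (constants dropped)."""
+     m, n = cost.shape
+     idx = lambda i, j: i * n + j
+     Q = np.zeros((m * n, m * n))
+     for i in range(m):
+         for j in range(n):
+             # linear term: assignment cost plus -penalty from each constraint
+             Q[idx(i, j), idx(i, j)] += cost[i, j] - 2 * penalty
+             for jj in range(j + 1, n):   # pairs within the same row
+                 Q[idx(i, j), idx(i, jj)] += 2 * penalty
+             for ii in range(i + 1, m):   # pairs within the same column
+                 Q[idx(i, j), idx(ii, j)] += 2 * penalty
+     return Q
+
+ def brute_force_qubo(Q):
+     """Classical stand-in for the annealer (only viable for tiny problems)."""
+     best, best_x = np.inf, None
+     for bits in itertools.product([0, 1], repeat=Q.shape[0]):
+         x = np.array(bits)
+         e = x @ Q @ x
+         if e < best:
+             best, best_x = e, x
+     return best_x
+
+ # toy usage: 3 tracks, 3 detections, cost = 1 - IoU (made-up values)
+ cost = np.array([[0.1, 0.9, 0.8],
+                  [0.7, 0.2, 0.9],
+                  [0.8, 0.9, 0.3]])
+ x = brute_force_qubo(assignment_qubo(cost)).reshape(3, 3)
+ print(x)   # expected to pick the diagonal assignment
+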
+
+
+
+
+ + ☆ Self-Expansion of Pre-trained Models with Mixture of Adapters for + Continual Learning + + +
+ Continual learning aims to learn from a stream of continuously arriving data +with minimum forgetting of previously learned knowledge. While previous works +have explored the effectiveness of leveraging the generalizable knowledge from +pre-trained models in continual learning, existing parameter-efficient +fine-tuning approaches focus on the use of a predetermined or task-wise set of +adapters or prompts. However, these approaches still suffer from forgetting due +to task interference on jointly used parameters or restricted flexibility. The +reliance on a static model architecture may lead to the allocation of excessive +parameters that are not essential or, conversely, inadequate adaptation for +downstream tasks, given that the scale and distribution of incoming data are +unpredictable in continual learning. We propose Self-Expansion of pre-trained +models with Modularized Adaptation (SEMA), a novel fine-tuning approach which +automatically decides to reuse or add adapter modules on demand in continual +learning, depending on whether drastic distribution shift that could not be +handled by existing modules is detected at different representation levels. We +design each adapter module to consist of an adapter and a representation +descriptor, specifically, implemented as an autoencoder. The representation +descriptor functions as a distributional shift indicator during training and +triggers adapter expansion. For better usage of the adapters, an expandable +weighting router is learned jointly for mixture of adapter outputs. By +comparing with vision-transformer-based continual learning adaptation methods, +we demonstrate that the proposed framework outperforms the state-of-the-art +without memory rehearsal. + +
+
+
+
+
+ + ☆ AIC-UNet: Anatomy-informed Cascaded UNet for Robust Multi-Organ + Segmentation + + +
+ Imposing key anatomical features, such as the number of organs, their shapes,
+sizes, and relative positions, is crucial for building a robust multi-organ
+segmentation model. Current attempts to incorporate anatomical features include
+broadening the effective receptive field (ERF) with resource- and
+data-intensive modules such as self-attention or introducing organ-specific
+topology regularizers, which may not scale to multi-organ segmentation problems
+where inter-organ relations also play a huge role. We introduce a new approach
+to impose anatomical constraints on any existing encoder-decoder segmentation
+model by conditioning model predictions on a learnable anatomy prior. More
+specifically, given an abdominal scan, a part of the encoder spatially warps a
+learnable prior to align with the given input scan using thin plate spline
+(TPS) grid interpolation. The warped prior is then integrated during the
+decoding phase to guide the model for more anatomy-informed predictions. Code
+is available at https://anonymous.4open.science/r/AIC-UNet-7048.
+
+
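+ A simplified version of conditioning on a warped learnable prior is sketched
+below, using an affine warp predicted from encoder features instead of the
+thin plate spline interpolation used in the paper; the tensor shapes and the
+warp-prediction head are assumptions chosen for illustration.
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class WarpedAnatomyPrior(nn.Module):
+     """Learnable organ prior spatially warped to match the input scan
+     (affine warp here; the paper uses a TPS warp)."""
+     def __init__(self, num_organs=13, size=64, feat_dim=256):
+         super().__init__()
+         self.prior = nn.Parameter(torch.zeros(1, num_organs, size, size))
+         # Predict a 2x3 affine matrix from pooled encoder features,
+         # initialised to the identity transform.
+         self.theta_head = nn.Linear(feat_dim, 6)
+         nn.init.zeros_(self.theta_head.weight)
+         self.theta_head.bias.data = torch.tensor([1., 0., 0., 0., 1., 0.])
+
+     def forward(self, enc_feat):
+         # enc_feat: (B, feat_dim, h, w) deepest encoder feature map
+         b = enc_feat.shape[0]
+         pooled = enc_feat.mean(dim=(2, 3))
+         theta = self.theta_head(pooled).view(b, 2, 3)
+         grid = F.affine_grid(theta, [b, *self.prior.shape[1:]],
+                              align_corners=False)
+         warped = F.grid_sample(self.prior.expand(b, -1, -1, -1), grid,
+                                align_corners=False)
+         return warped    # to be concatenated with decoder features downstream
+
+ # toy usage
+ prior = WarpedAnatomyPrior()
+ feat = torch.randn(2, 256, 8, 8)
+ print(prior(feat).shape)  # torch.Size([2, 13, 64, 64])
+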
+
+
+
+
+ + ☆ Capability-aware Prompt Reformulation Learning for Text-to-Image + Generation SIGIR 2024 + + +
+ Text-to-image generation systems have emerged as revolutionary tools in the +realm of artistic creation, offering unprecedented ease in transforming textual +prompts into visual art. However, the efficacy of these systems is intricately +linked to the quality of user-provided prompts, which often poses a challenge +to users unfamiliar with prompt crafting. This paper addresses this challenge +by leveraging user reformulation data from interaction logs to develop an +automatic prompt reformulation model. Our in-depth analysis of these logs +reveals that user prompt reformulation is heavily dependent on the individual +user's capability, resulting in significant variance in the quality of +reformulation pairs. To effectively use this data for training, we introduce +the Capability-aware Prompt Reformulation (CAPR) framework. CAPR innovatively +integrates user capability into the reformulation process through two key +components: the Conditional Reformulation Model (CRM) and Configurable +Capability Features (CCF). CRM reformulates prompts according to a specified +user capability, as represented by CCF. The CCF, in turn, offers the +flexibility to tune and guide the CRM's behavior. This enables CAPR to +effectively learn diverse reformulation strategies across various user +capacities and to simulate high-capability user reformulation during inference. +Extensive experiments on standard text-to-image generation benchmarks showcase +CAPR's superior performance over existing baselines and its remarkable +robustness on unseen systems. Furthermore, comprehensive analyses validate the +effectiveness of different components. CAPR can facilitate user-friendly +interaction with text-to-image systems and make advanced artistic creation more +achievable for a broader range of users. + +
+
+ comment: Accepted at SIGIR 2024 +
+
+
+
+
+ + ♻ ☆ Shifting to Machine Supervision: Annotation-Efficient Semi and + Self-Supervised Learning for Automatic Medical Image Segmentation and + Classification + + +
+ Advancements in clinical treatment are increasingly constrained by the
+limitations of supervised learning techniques, which depend heavily on large
+volumes of annotated data. The annotation process is not only costly but also
+demands substantial time from clinical specialists. Addressing this issue, we
+introduce the S4MI (Self-Supervision and Semi-Supervision for Medical Imaging)
+pipeline, a novel approach that leverages advancements in self-supervised and
+semi-supervised learning. These techniques engage in auxiliary tasks that do
+not require labeling, thus simplifying the scaling of machine supervision
+compared to fully-supervised methods. Our study benchmarks these techniques on
+three distinct medical imaging datasets to evaluate their effectiveness in
+classification and segmentation tasks. Notably, we observed that
+self-supervised learning significantly surpassed the performance of supervised
+methods in classification on all evaluated datasets. Remarkably, the
+semi-supervised approach demonstrated superior outcomes in segmentation,
+outperforming fully-supervised methods while using 50% fewer labels across all
+datasets. In line with our commitment to contributing to the scientific
+community, we have made the S4MI code openly accessible, allowing for broader
+application and further development of these methods.
+
+
+
+ comment: Seventeen pages (incl. references), five figures, and one table. + (Under Review) +
+
+
+
+
+ + ♻ ☆ Boosting Object Detection with Zero-Shot Day-Night Domain Adaptation CVPR 2024 + + +
+ Detecting objects in low-light scenarios presents a persistent challenge, as +detectors trained on well-lit data exhibit significant performance degradation +on low-light data due to low visibility. Previous methods mitigate this issue +by exploring image enhancement or object detection techniques with real +low-light image datasets. However, the progress is impeded by the inherent +difficulties about collecting and annotating low-light images. To address this +challenge, we propose to boost low-light object detection with zero-shot +day-night domain adaptation, which aims to generalize a detector from well-lit +scenarios to low-light ones without requiring real low-light data. Revisiting +Retinex theory in the low-level vision, we first design a reflectance +representation learning module to learn Retinex-based illumination invariance +in images with a carefully designed illumination invariance reinforcement +strategy. Next, an interchange-redecomposition-coherence procedure is +introduced to improve over the vanilla Retinex image decomposition process by +performing two sequential image decompositions and introducing a +redecomposition cohering loss. Extensive experiments on ExDark, DARK FACE, and +CODaN datasets show strong low-light generalizability of our method. Our code +is available at https://github.com/ZPDu/DAI-Net. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Decoupled Data Consistency with Diffusion Purification for Image + Restoration + + +
+ Diffusion models have recently gained traction as a powerful class of deep +generative priors, excelling in a wide range of image restoration tasks due to +their exceptional ability to model data distributions. To solve image +restoration problems, many existing techniques achieve data consistency by +incorporating additional likelihood gradient steps into the reverse sampling +process of diffusion models. However, the additional gradient steps pose a +challenge for real-world practical applications as they incur a large +computational overhead, thereby increasing inference time. They also present +additional difficulties when using accelerated diffusion model samplers, as the +number of data consistency steps is limited by the number of reverse sampling +steps. In this work, we propose a novel diffusion-based image restoration +solver that addresses these issues by decoupling the reverse process from the +data consistency steps. Our method involves alternating between a +reconstruction phase to maintain data consistency and a refinement phase that +enforces the prior via diffusion purification. Our approach demonstrates +versatility, making it highly adaptable for efficient problem-solving in latent +space. Additionally, it reduces the necessity for numerous sampling steps +through the integration of consistency models. The efficacy of our approach is +validated through comprehensive experiments across various image restoration +tasks, including image denoising, deblurring, inpainting, and super-resolution. + +
+
+
+
+
+ + ♻ ☆ Interpretable machine learning for time-to-event prediction in medicine + and healthcare + + +
+ Time-to-event prediction, e.g. cancer survival analysis or hospital length of +stay, is a highly prominent machine learning task in medical and healthcare +applications. However, only a few interpretable machine learning methods comply +with its challenges. To facilitate a comprehensive explanatory analysis of +survival models, we formally introduce time-dependent feature effects and +global feature importance explanations. We show how post-hoc interpretation +methods allow for finding biases in AI systems predicting length of stay using +a novel multi-modal dataset created from 1235 X-ray images with textual +radiology reports annotated by human experts. Moreover, we evaluate cancer +survival models beyond predictive performance to include the importance of +multi-omics feature groups based on a large-scale benchmark comprising 11 +datasets from The Cancer Genome Atlas (TCGA). Model developers can use the +proposed methods to debug and improve machine learning algorithms, while +physicians can discover disease biomarkers and assess their significance. We +hope the contributed open data and code resources facilitate future work in the +emerging research direction of explainable survival analysis. + +
+
+ comment: An extended version of an AIME 2023 paper submitted to Artificial + Intelligence in Medicine +
+
+
+
+
+ + ♻ ☆ Simplified Diffusion Schrödinger Bridge + + +
+ This paper introduces a novel theoretical simplification of the Diffusion +Schr\"odinger Bridge (DSB) that facilitates its unification with Score-based +Generative Models (SGMs), addressing the limitations of DSB in complex data +generation and enabling faster convergence and enhanced performance. By +employing SGMs as an initial solution for DSB, our approach capitalizes on the +strengths of both frameworks, ensuring a more efficient training process and +improving the performance of SGM. We also propose a reparameterization +technique that, despite theoretical approximations, practically improves the +network's fitting capabilities. Our extensive experimental evaluations confirm +the effectiveness of the simplified DSB, demonstrating its significant +improvements. We believe the contributions of this work pave the way for +advanced generative modeling. The code is available at +https://github.com/checkcrab/SDSB. + +
+
+
+
+
+ + ♻ ☆ Self-supervised co-salient object detection via feature correspondence + at multiple scales + + +
+ Our paper introduces a novel two-stage self-supervised approach for detecting +co-occurring salient objects (CoSOD) in image groups without requiring +segmentation annotations. Unlike existing unsupervised methods that rely solely +on patch-level information (e.g. clustering patch descriptors) or on +computation heavy off-the-shelf components for CoSOD, our lightweight model +leverages feature correspondences at both patch and region levels, +significantly improving prediction performance. In the first stage, we train a +self-supervised network that detects co-salient regions by computing local +patch-level feature correspondences across images. We obtain the segmentation +predictions using confidence-based adaptive thresholding. In the next stage, we +refine these intermediate segmentations by eliminating the detected regions +(within each image) whose averaged feature representations are dissimilar to +the foreground feature representation averaged across all the cross-attention +maps (from the previous stage). Extensive experiments on three CoSOD benchmark +datasets show that our self-supervised model outperforms the corresponding +state-of-the-art models by a huge margin (e.g. on the CoCA dataset, our model +has a 13.7% F-measure gain over the SOTA unsupervised CoSOD model). Notably, +our self-supervised model also outperforms several recent fully supervised +CoSOD models on the three test datasets (e.g., on the CoCA dataset, our model +has a 4.6% F-measure gain over a recent supervised CoSOD model). + +
+
+
+
+
+ + ♻ ☆ LION: Implicit Vision Prompt Tuning AAAI2024 + + +
+ Despite recent competitive performance across a range of vision tasks, vision
+Transformers still suffer from heavy computational costs. Recently, vision
+prompt learning has provided an economical solution to this problem without
+fine-tuning the whole large-scale model. However, the efficiency of existing
+models is still far from satisfactory due to the insertion of extensive prompt
+blocks and tricky prompt designs. In this paper, we propose an efficient vision
+model named impLicit vIsion prOmpt tuNing (LION), which is motivated by deep
+implicit models with stable memory costs for various complex tasks. In
+particular, we merely insert two equilibrium implicit layers at the two ends of
+the pre-trained main backbone, with the parameters in the backbone frozen.
+Moreover, we prune the parameters in these two layers according to the lottery
+ticket hypothesis. The performance obtained by our LION is promising on a wide
+range of datasets. In particular, our LION reduces up to 11.5% of the training
+parameters while obtaining higher performance compared with the
+state-of-the-art baseline VPT, especially in challenging scenes. Furthermore,
+we find that our proposed LION has good generalization performance, making it
+an easy way to boost transfer learning in the future.
+
+
+
+ comment: Accepted by AAAI2024; 9 pages, 3 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Incorporating simulated spatial context information improves the + effectiveness of contrastive learning models + + +
+ Visual learning often occurs in a specific context, where an agent acquires +skills through exploration and tracking of its location in a consistent +environment. The historical spatial context of the agent provides a similarity +signal for self-supervised contrastive learning. We present a unique approach, +termed Environmental Spatial Similarity (ESS), that complements existing +contrastive learning methods. Using images from simulated, photorealistic +environments as an experimental setting, we demonstrate that ESS outperforms +traditional instance discrimination approaches. Moreover, sampling additional +data from the same environment substantially improves accuracy and provides new +augmentations. ESS allows remarkable proficiency in room classification and +spatial prediction tasks, especially in unfamiliar environments. This learning +paradigm has the potential to enable rapid visual learning in agents operating +in new environments with unique visual characteristics. Potentially +transformative applications span from robotics to space exploration. Our proof +of concept demonstrates improved efficiency over methods that rely on +extensive, disconnected datasets. + +
+
+
+
+
+ + ♻ ☆ Adaptive Negative Evidential Deep Learning for Open-set Semi-supervised + Learning AAAI2024 + + +
+ Semi-supervised learning (SSL) methods assume that labeled data, unlabeled +data and test data are from the same distribution. Open-set semi-supervised +learning (Open-set SSL) considers a more practical scenario, where unlabeled +data and test data contain new categories (outliers) not observed in labeled +data (inliers). Most previous works focused on outlier detection via binary +classifiers, which suffer from insufficient scalability and inability to +distinguish different types of uncertainty. In this paper, we propose a novel +framework, Adaptive Negative Evidential Deep Learning (ANEDL) to tackle these +limitations. Concretely, we first introduce evidential deep learning (EDL) as +an outlier detector to quantify different types of uncertainty, and design +different uncertainty metrics for self-training and inference. Furthermore, we +propose a novel adaptive negative optimization strategy, making EDL more +tailored to the unlabeled dataset containing both inliers and outliers. As +demonstrated empirically, our proposed method outperforms existing +state-of-the-art methods across four datasets. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ Vision Transformer-Based Deep Learning for Histologic Classification of + Endometrial Cancer + + +
+ Endometrial cancer, the fourth most common cancer in females in the United
+States, carries a lifetime risk of approximately 2.8% in women. Precise
+histologic evaluation and molecular classification of endometrial cancer are
+important for effective patient management and for determining the best
+treatment modalities. This study introduces EndoNet, which uses convolutional
+neural networks to extract histologic features and a vision transformer to
+aggregate these features and classify slides as high- or low-grade based on
+their visual characteristics. The model was trained on 929 digitized
+hematoxylin and eosin-stained whole-slide images of endometrial cancer from
+hysterectomy cases at Dartmouth-Health. It classifies these slides into
+low-grade (endometrioid grades 1 and 2) and high-grade (endometrioid carcinoma
+FIGO grade 3, uterine serous carcinoma, carcinosarcoma) categories. EndoNet was
+evaluated on an internal test set of 110 patients and an external test set of
+100 patients from the public TCGA database. The model achieved a weighted
+average F1-score of 0.91 (95% CI: 0.86-0.95) and an AUC of 0.95 (95% CI:
+0.89-0.99) on the internal test set, and an F1-score of 0.86 (95% CI:
+0.80-0.94) and an AUC of 0.86 (95% CI: 0.75-0.93) on the external test set.
+Pending further validation, EndoNet has the potential to support pathologists
+in grading gynecologic pathology tumors without the need for manual
+annotations.
+
+
+ comment: 4 Tables and 3 Figures +
+
+
+
+
+ + ♻ ☆ Automated Construction of Time-Space Diagrams for Traffic Analysis Using + Street-View Video Sequence SC + + +
+ Time-space diagrams are essential tools for analyzing traffic patterns and +optimizing transportation infrastructure and traffic management strategies. +Traditional data collection methods for these diagrams have limitations in +terms of temporal and spatial coverage. Recent advancements in camera +technology have overcome these limitations and provided extensive urban data. +In this study, we propose an innovative approach to constructing time-space +diagrams by utilizing street-view video sequences captured by cameras mounted +on moving vehicles. Using the state-of-the-art YOLOv5, StrongSORT, and +photogrammetry techniques for distance calculation, we can infer vehicle +trajectories from the video data and generate time-space diagrams. To evaluate +the effectiveness of our proposed method, we utilized datasets from the KITTI +computer vision benchmark suite. The evaluation results demonstrate that our +approach can generate trajectories from video data, although there are some +errors that can be mitigated by improving the performance of the detector, +tracker, and distance calculation components. In conclusion, the utilization of +street-view video sequences captured by cameras mounted on moving vehicles, +combined with state-of-the-art computer vision techniques, has immense +potential for constructing comprehensive time-space diagrams. These diagrams +offer valuable insights into traffic patterns and contribute to the design of +transportation infrastructure and traffic management strategies. + +
+
+ comment: The paper is published in 2023 IEEE 26th International Conference on + Intelligent Transportation Systems (ITSC) +
+
+
+
+
+ + ♻ ☆ SOAC: Spatio-Temporal Overlap-Aware Multi-Sensor Calibration using + Neural Radiance Fields CVPR 2024 + + +
+ In rapidly evolving domains such as autonomous driving, the use of multiple
+sensors with different modalities is crucial to ensure high operational
+precision and stability. To correctly exploit the information provided by each
+sensor in a single common frame, it is essential for these sensors to be
+accurately calibrated. In this paper, we leverage the ability of Neural
+Radiance Fields (NeRF) to represent different sensor modalities in a common
+volumetric representation to achieve robust and accurate spatio-temporal sensor
+calibration. By designing a partitioning approach based on the visible part of
+the scene for each sensor, we formulate the calibration problem using only the
+overlapping areas. This strategy results in a more robust and accurate
+calibration that is less prone to failure. We demonstrate that our approach
+works on outdoor urban scenes by validating it on multiple established driving
+datasets. Results show that our method achieves better accuracy and robustness
+compared to existing methods.
+
+
+ comment: Accepted at CVPR 2024. Project page: https://qherau.github.io/SOAC/ +
+
+
+
+
+ + ♻ ☆ Point, Segment and Count: A Generalized Framework for Object Counting CVPR 2024 + + +
+ Class-agnostic object counting aims to count all objects in an image with
+respect to example boxes or class names, \emph{a.k.a.} few-shot and zero-shot
+counting. In this paper, we propose a generalized framework for both few-shot
+and zero-shot object counting based on detection. Our framework combines the
+advantages of two foundation models without compromising their zero-shot
+capability: (\textbf{i}) SAM to segment all possible objects as mask proposals,
+and (\textbf{ii}) CLIP to classify proposals to obtain accurate object counts.
+However, this strategy runs into the obstacles of efficiency overhead and of
+small, crowded objects that cannot be localized and distinguished. To address
+these issues, our framework, termed PseCo, follows three steps: point, segment,
+and count. Specifically, we first propose a class-agnostic object localization
+that provides accurate but minimal point prompts for SAM, which consequently
+not only reduces computation costs but also avoids missing small objects.
+Furthermore, we propose a generalized object classification that leverages CLIP
+image/text embeddings as the classifier, following a hierarchical knowledge
+distillation to obtain discriminative classifications among hierarchical mask
+proposals. Extensive experimental results on FSC-147, COCO, and LVIS
+demonstrate that PseCo achieves state-of-the-art performance in both
+few-shot/zero-shot object counting/detection. Code:
+https://github.com/Hzzone/PseCo
+
+
+ comment: Accepted by CVPR 2024. Camera ready +
+
+
+
+
+ + ♻ ☆ Weakly-Supervised Emotion Transition Learning for Diverse 3D Co-speech + Gesture Generation CVPR 2024 + + +
+ Generating vivid and emotional 3D co-speech gestures is crucial for virtual
+avatar animation in human-machine interaction applications. While existing
+methods can generate gestures that follow a single emotion label, they overlook
+that long gesture sequence modeling with emotion transitions is more practical
+in real scenes. In addition, the lack of large-scale available datasets with
+emotional transition speech and corresponding 3D human gestures also limits
+progress on this task. To fill this gap, we first incorporate ChatGPT-4 and an
+audio inpainting approach to construct high-fidelity emotion transition human
+speech. Considering that obtaining realistic 3D pose annotations corresponding
+to the dynamically inpainted emotion transition audio is extremely difficult,
+we propose a novel weakly supervised training strategy to encourage authentic
+gesture transitions. Specifically, to enhance the coordination of transition
+gestures w.r.t. different emotional ones, we model the temporal association
+representation between two different emotional gesture sequences as style
+guidance and infuse it into the transition generation. We further devise an
+emotion mixture mechanism that provides weak supervision based on a learnable
+mixed emotion label for transition gestures. Lastly, we present a keyframe
+sampler to supply effective initial posture cues in long sequences, enabling us
+to generate diverse gestures. Extensive experiments demonstrate that our method
+outperforms state-of-the-art models constructed by adapting single
+emotion-conditioned counterparts on our newly defined emotion transition task
+and datasets. Our code and dataset will be released on the project page:
+https://xingqunqi-lab.github.io/Emo-Transition-Gesture/.
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Learning by Erasing: Conditional Entropy based Transferable + Out-Of-Distribution Detection + + +
+ Out-of-distribution (OOD) detection is essential to handle the distribution
+shifts between training and test scenarios. For a new in-distribution (ID)
+dataset, existing methods require retraining to capture the dataset-specific
+feature representation or data distribution. In this paper, we propose a deep
+generative model (DGM) based transferable OOD detection method, which does not
+require retraining on a new ID dataset. We design an image erasing strategy to
+equip each ID dataset with an exclusive conditional entropy distribution, which
+determines the discrepancy of the DGM's posterior uncertainty distribution on
+different ID datasets. Owing to the powerful representation capacity of
+convolutional neural networks, the proposed model trained on a complex dataset
+can capture the above discrepancy between ID datasets without retraining and
+thus achieve transferable OOD detection. We validate the proposed method on
+five datasets and verify that ours achieves comparable performance to
+state-of-the-art group-based OOD detection methods that need to be retrained
+before deployment on new ID datasets. Our code is available at
+https://github.com/oOHCIOo/CETOOD.
+
+
+ comment: update new experimental results +
+
+
+
+
+ + ♻ ☆ Dual Structure-Aware Image Filterings for Semi-supervised Medical Image + Segmentation + + +
+ Semi-supervised image segmentation has attracted great attention recently.
+The key is how to leverage unlabeled images in the training process. Most
+methods maintain consistent predictions of the unlabeled images under
+variations (e.g., adding noise/perturbations, or creating alternative versions)
+at the image and/or model level. However, most image-level variations ignore
+the prior structural information that medical images often carry, which has not
+been well explored. In this paper, we propose novel dual structure-aware image
+filterings (DSAIF) as the image-level variations for semi-supervised medical
+image segmentation. Motivated by connected filtering, which simplifies an image
+via filtering in a structure-aware, tree-based image representation, we resort
+to the dual contrast invariant Max-tree and Min-tree representations.
+Specifically, we propose a novel connected filtering that removes topologically
+equivalent nodes (i.e., connected components) having no siblings in the
+Max/Min-tree. This results in two filtered images preserving topologically
+critical structure. Applying the proposed DSAIF to mutually supervised networks
+decreases the consensus of their erroneous predictions on unlabeled images.
+This helps to alleviate the confirmation bias issue of overfitting to noisy
+pseudo labels of unlabeled images, and thus effectively improves the
+segmentation performance. Extensive experimental results on three benchmark
+datasets demonstrate that the proposed method significantly and consistently
+outperforms state-of-the-art methods. The source code will be publicly
+available.
+
+
+
+
+
+ + ♻ ☆ Decomposing Disease Descriptions for Enhanced Pathology Detection: A + Multi-Aspect Vision-Language Pre-training Framework CVPR2024 + + +
+ Medical vision language pre-training (VLP) has emerged as a frontier of +research, enabling zero-shot pathological recognition by comparing the query +image with the textual descriptions for each disease. Due to the complex +semantics of biomedical texts, current methods struggle to align medical images +with key pathological findings in unstructured reports. This leads to the +misalignment with the target disease's textual representation. In this paper, +we introduce a novel VLP framework designed to dissect disease descriptions +into their fundamental aspects, leveraging prior knowledge about the visual +manifestations of pathologies. This is achieved by consulting a large language +model and medical experts. Integrating a Transformer module, our approach +aligns an input image with the diverse elements of a disease, generating +aspect-centric image representations. By consolidating the matches from each +aspect, we improve the compatibility between an image and its associated +disease. Additionally, capitalizing on the aspect-oriented representations, we +present a dual-head Transformer tailored to process known and unknown diseases, +optimizing the comprehensive detection efficacy. Conducting experiments on +seven downstream datasets, ours improves the accuracy of recent methods by up +to 8.56% and 17.0% for seen and unseen categories, respectively. Our code is +released at https://github.com/HieuPhan33/MAVL. + +
+
+ comment: Accepted at CVPR2024. Pre-print before final camera-ready version +
+
+
+
+
+ + ♻ ☆ Shapley Values-Powered Framework for Fair Reward Split in Content + Produced by GenAI + + +
+ It is evident that, currently, generative models are surpassed in quality by +human professionals. However, with the advancements in Artificial Intelligence, +this gap will narrow, leading to scenarios where individuals who have dedicated +years of their lives to mastering a skill become obsolete due to their high +costs, which are inherently linked to the time they require to complete a task +-- a task that AI could accomplish in minutes or seconds. To avoid future +social upheavals, we must, even now, contemplate how to fairly assess the +contributions of such individuals in training generative models and how to +compensate them for the reduction or complete loss of their incomes. In this +work, we propose a method to structure collaboration between model developers +and data providers. To achieve this, we employ Shapley Values to quantify the +contribution of artist(s) in an image generated by the Stable Diffusion-v1.5 +model and to equitably allocate the reward among them. + +
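+ For readers unfamiliar with Shapley values, the sketch below computes them
+exactly for a handful of contributors with a toy coalition value function. The
+contributor names and the value function are illustrative; the paper's utility
+for Stable Diffusion outputs is far more involved.
+
+```python
+# Exact Shapley values over all coalitions (fine for a few players only).
+from itertools import combinations
+from math import factorial
+
+def shapley_values(players, v):
+    n = len(players)
+    phi = {p: 0.0 for p in players}
+    for p in players:
+        others = [q for q in players if q != p]
+        for k in range(len(others) + 1):
+            for S in combinations(others, k):
+                weight = factorial(k) * factorial(n - k - 1) / factorial(n)
+                phi[p] += weight * (v(set(S) | {p}) - v(set(S)))
+    return phi
+
+# Toy value function: additive per-artist utility with a mild overlap discount.
+base = {"artist_a": 0.5, "artist_b": 0.3, "artist_c": 0.2}
+v = lambda S: sum(base[p] for p in S) * (1.0 if len(S) < 3 else 0.9)
+print(shapley_values(list(base), v))
+```
+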
+
+ comment: 36 pages, 32 figures +
+
+
+
+
+ + ♻ ☆ E4S: Fine-grained Face Swapping via Editing With Regional GAN Inversion + + +
+ This paper proposes a novel approach to face swapping from the perspective of
+fine-grained facial editing, dubbed "editing for swapping" (E4S). Traditional
+face swapping methods rely on global feature extraction and fail to preserve
+the detailed source identity. In contrast, we propose a Regional GAN Inversion
+(RGI) method, which allows the explicit disentanglement of shape and texture.
+Specifically, our E4S performs face swapping in the latent space of a
+pretrained StyleGAN, where a multi-scale mask-guided encoder is applied to
+project the texture of each facial component into regional style codes and a
+mask-guided injection module manipulates feature maps with the style codes.
+Based on this disentanglement, face swapping can be simplified to style and
+mask swapping. Besides, due to the large gap in lighting conditions,
+transferring the source skin into the target image may lead to disharmonious
+lighting. We propose a re-coloring network to make the swapped face maintain
+the target lighting condition while preserving the source skin. Further, to
+deal with potential mismatch areas during mask exchange, we design a face
+inpainting module to refine the face shape. Extensive comparisons with
+state-of-the-art methods demonstrate that our E4S outperforms existing methods
+in preserving texture, shape, and lighting. Our implementation is available at
+https://github.com/e4s2024/E4S2024.
+
+
+ comment: Project Page: https://e4s2024.github.io/ ;. arXiv admin note: text + overlap with arXiv:2211.14068 +
+
+
+
+
+ + ♻ ☆ ViDA: Homeostatic Visual Domain Adapter for Continual Test Time + Adaptation ICLR2024 + + +
+ Since real-world machine systems are running in non-stationary environments, +Continual Test-Time Adaptation (CTTA) task is proposed to adapt the pre-trained +model to continually changing target domains. Recently, existing methods mainly +focus on model-based adaptation, which aims to leverage a self-training manner +to extract the target domain knowledge. However, pseudo labels can be noisy and +the updated model parameters are unreliable under dynamic data distributions, +leading to error accumulation and catastrophic forgetting in the continual +adaptation process. To tackle these challenges and maintain the model +plasticity, we design a Visual Domain Adapter (ViDA) for CTTA, explicitly +handling both domain-specific and domain-shared knowledge. Specifically, we +first comprehensively explore the different domain representations of the +adapters with trainable high-rank or low-rank embedding spaces. Then we inject +ViDAs into the pre-trained model, which leverages high-rank and low-rank +features to adapt the current domain distribution and maintain the continual +domain-shared knowledge, respectively. To exploit the low-rank and high-rank +ViDAs more effectively, we further propose a Homeostatic Knowledge Allotment +(HKA) strategy, which adaptively combines different knowledge from each ViDA. +Extensive experiments conducted on four widely used benchmarks demonstrate that +our proposed method achieves state-of-the-art performance in both +classification and segmentation CTTA tasks. Note that, our method can be +regarded as a novel transfer paradigm for large-scale models, delivering +promising results in adaptation to continually changing distributions. Project +page: https://sites.google.com/view/iclr2024-vida/home. + +
+
+ comment: Accepted by ICLR2024 +
+
+
+
+
+ + ♻ ☆ Visually Guided Generative Text-Layout Pre-training for Document + Intelligence NAACL 2024 + + +
+ Prior study shows that pre-training techniques can boost the performance of +visual document understanding (VDU), which typically requires models to gain +abilities to perceive and reason both document texts and layouts (e.g., +locations of texts and table-cells). To this end, we propose visually guided +generative text-layout pre-training, named ViTLP. Given a document image, the +model optimizes hierarchical language and layout modeling objectives to +generate the interleaved text and layout sequence. In addition, to address the +limitation of processing long documents by Transformers, we introduce a +straightforward yet effective multi-segment generative pre-training scheme, +facilitating ViTLP to process word-intensive documents of any length. ViTLP can +function as a native OCR model to localize and recognize texts of document +images. Besides, ViTLP can be effectively applied to various downstream VDU +tasks. Extensive experiments show that ViTLP achieves competitive performance +over existing baselines on benchmark VDU tasks, including information +extraction, document classification, and document question answering. + +
+
+ comment: Accepted to NAACL 2024 main conference. The first version of this + paper was submitted to OpenReview + (https://openreview.net/forum?id=ARtBIBAmNR) in June 2023 +
+
+
+
+
+ + ♻ ☆ Intraoperative 2D/3D Image Registration via Differentiable X-ray + Rendering CVPR 2024 + + +
+ Surgical decisions are informed by aligning rapid portable 2D intraoperative +images (e.g., X-rays) to a high-fidelity 3D preoperative reference scan (e.g., +CT). 2D/3D image registration often fails in practice: conventional +optimization methods are prohibitively slow and susceptible to local minima, +while neural networks trained on small datasets fail on new patients or require +impractical landmark supervision. We present DiffPose, a self-supervised +approach that leverages patient-specific simulation and differentiable +physics-based rendering to achieve accurate 2D/3D registration without relying +on manually labeled data. Preoperatively, a CNN is trained to regress the pose +of a randomly oriented synthetic X-ray rendered from the preoperative CT. The +CNN then initializes rapid intraoperative test-time optimization that uses the +differentiable X-ray renderer to refine the solution. Our work further proposes +several geometrically principled methods for sampling camera poses from +$\mathbf{SE}(3)$, for sparse differentiable rendering, and for driving +registration in the tangent space $\mathfrak{se}(3)$ with geodesic and +multiscale locality-sensitive losses. DiffPose achieves sub-millimeter accuracy +across surgical datasets at intraoperative speeds, improving upon existing +unsupervised methods by an order of magnitude and even outperforming supervised +baselines. Our code is available at https://github.com/eigenvivek/DiffPose. + +
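+ One geometrically principled way to draw random camera poses, as referenced
+above, is to sample a twist in the tangent space and map it through the
+exponential map. The sketch below does this for the rotation via SciPy and
+treats the translation directly for brevity (a simplification of the full
+SE(3) exponential); the scales and units are assumptions, not DiffPose's
+settings.
+
+```python
+# Sample a random rigid transform by perturbing identity in the tangent space.
+import numpy as np
+from scipy.spatial.transform import Rotation
+
+def sample_pose(rot_scale=0.1, trans_scale=10.0, rng=None):
+    rng = rng or np.random.default_rng()
+    omega = rng.normal(scale=rot_scale, size=3)   # rotation part of the twist (rad)
+    t = rng.normal(scale=trans_scale, size=3)     # translation offset
+    T = np.eye(4)
+    T[:3, :3] = Rotation.from_rotvec(omega).as_matrix()  # exp map for SO(3)
+    T[:3, 3] = t
+    return T
+
+print(np.round(sample_pose(rng=np.random.default_rng(0)), 3))
+```
+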
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Challenging Common Paradigms in Multi-Task Learning + + +
+ While multi-task learning (MTL) has gained significant attention in recent
+years, its underlying mechanisms remain poorly understood. Recent methods did
+not yield consistent performance improvements over single-task learning (STL)
+baselines, underscoring the importance of gaining more profound insights into
+challenges specific to MTL. In our study, we challenge paradigms in MTL in the
+context of STL: First, the impact of the choice of optimizer has only been
+mildly investigated in MTL. We show the pivotal role of common STL tools such
+as the Adam optimizer in MTL empirically in various experiments. To further
+investigate Adam's effectiveness, we theoretically derive a partial loss-scale
+invariance under mild assumptions. Second, the notion of gradient conflicts has
+often been phrased as a problem specific to MTL. We delve into the role of
+gradient conflicts in MTL and compare it to STL. For angular gradient alignment
+we find no evidence that this is a problem unique to MTL. We emphasize
+differences in gradient magnitude as the main distinguishing factor. Lastly, we
+compare the transferability of features learned through MTL and STL on common
+image corruptions, and find slight evidence that MTL can lead to superior
+transferability. Overall, we find surprising similarities between STL and MTL,
+suggesting that methods from both fields be considered in a broader context.
+
+
+ comment: - +
+
+
+
+
+ + ♻ ☆ Neural Fields for Interactive Visualization of Statistical Dependencies + in 3D Simulation Ensembles + + +
+ We present the first neural network that has learned to compactly represent +and can efficiently reconstruct the statistical dependencies between the values +of physical variables at different spatial locations in large 3D simulation +ensembles. Going beyond linear dependencies, we consider mutual information as +a measure of non-linear dependence. We demonstrate learning and reconstruction +with a large weather forecast ensemble comprising 1000 members, each storing +multiple physical variables at a 250 x 352 x 20 simulation grid. By +circumventing compute-intensive statistical estimators at runtime, we +demonstrate significantly reduced memory and computation requirements for +reconstructing the major dependence structures. This enables embedding the +estimator into a GPU-accelerated direct volume renderer and interactively +visualizing all mutual dependencies for a selected domain point. + +
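+ As a point of reference for the statistic being amortized above, the sketch
+below estimates mutual information between two scalar ensemble variables with
+a plain 2D histogram. The binning and toy variables are illustrative; the
+paper's estimators and neural-field reconstruction are more sophisticated.
+
+```python
+# Histogram estimate of mutual information I(X; Y) in nats.
+import numpy as np
+
+def mutual_information(x, y, bins=32):
+    joint, _, _ = np.histogram2d(x, y, bins=bins)
+    pxy = joint / joint.sum()
+    px = pxy.sum(axis=1, keepdims=True)
+    py = pxy.sum(axis=0, keepdims=True)
+    nz = pxy > 0
+    return float(np.sum(pxy[nz] * np.log(pxy[nz] / (px @ py)[nz])))
+
+rng = np.random.default_rng(0)
+temp = rng.normal(size=1000)
+pressure = 0.8 * temp + 0.2 * rng.normal(size=1000)   # correlated toy variables
+print(mutual_information(temp, pressure))             # clearly positive
+```
+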
+
+
+
+
+ + ♻ ☆ SAR-Net: Multi-scale Direction-aware SAR Network via Global Information + Fusion + + +
+ Deep learning has driven significant progress in object detection using
+Synthetic Aperture Radar (SAR) imagery. Existing methods, while achieving
+promising results, often struggle to effectively integrate local and global
+information, particularly direction-aware features. This paper proposes
+SAR-Net, a novel framework specifically designed for the global fusion of
+direction-aware information in SAR object detection. SAR-Net leverages two key
+innovations: the Unity Compensation Mechanism (UCM) and the Direction-aware
+Attention Module (DAM). UCM facilitates the establishment of complementary
+relationships among features across different scales, enabling efficient
+global information fusion. Within UCM, the Multi-scale Alignment Module (MAM)
+and the Multi-level Fusion Module (MFM) enhance feature integration by
+capturing both texture detail and semantic information, and the Multi-feature
+Embedding Module (MEM) feeds global features back into the primary branches,
+further improving information transmission. Additionally, DAM, through
+bidirectional attention polymerization, captures direction-aware information,
+effectively eliminating background interference. Extensive experiments
+demonstrate the effectiveness of SAR-Net, achieving state-of-the-art results on
+aircraft (SAR-AIRcraft-1.0) and ship datasets (SSDD, HRSID), confirming its
+generalization capability and robustness.
+
+
+
+
+
+ + ♻ ☆ Hourglass Tokenizer for Efficient Transformer-Based 3D Human Pose + Estimation CVPR 2024 + + +
+ Transformers have been successfully applied in the field of video-based 3D
+human pose estimation. However, the high computational costs of these video
+pose transformers (VPTs) make them impractical on resource-constrained devices.
+In this paper, we present a plug-and-play pruning-and-recovering framework,
+called Hourglass Tokenizer (HoT), for efficient transformer-based 3D human pose
+estimation from videos. Our HoT begins with pruning pose tokens of redundant
+frames and ends with recovering full-length tokens, resulting in a few pose
+tokens in the intermediate transformer blocks and thus improving the model
+efficiency. To effectively achieve this, we propose a token pruning cluster
+(TPC) that dynamically selects a few representative tokens with high semantic
+diversity while eliminating the redundancy of video frames. In addition, we
+develop a token recovering attention (TRA) to restore the detailed
+spatio-temporal information based on the selected tokens, thereby expanding the
+network output to the original full-length temporal resolution for fast
+inference. Extensive experiments on two benchmark datasets (i.e., Human3.6M and
+MPI-INF-3DHP) demonstrate that our method can achieve both high efficiency and
+estimation accuracy compared to the original VPT models. For instance, when
+applied to MotionBERT and MixSTE on Human3.6M, our HoT saves nearly 50% of
+FLOPs without sacrificing accuracy and nearly 40% of FLOPs with only a 0.2%
+accuracy drop, respectively. Code and models are available at
+https://github.com/NationalGAILab/HoT.
+
+
+ comment: Accepted by CVPR 2024, Open Sourced +
+
+
+
+
+ + ♻ ☆ Enhancing Object Coherence in Layout-to-Image Synthesis + + +
+ Layout-to-image synthesis is an emerging technique in conditional image +generation. It aims to generate complex scenes, where users require fine +control over the layout of the objects in a scene. However, it remains +challenging to control the object coherence, including semantic coherence +(e.g., the cat looks at the flowers or not) and physical coherence (e.g., the +hand and the racket should not be misaligned). In this paper, we propose a +novel diffusion model with effective global semantic fusion (GSF) and +self-similarity feature enhancement modules to guide the object coherence for +this task. For semantic coherence, we argue that the image caption contains +rich information for defining the semantic relationship within the objects in +the images. Instead of simply employing cross-attention between captions and +generated images, which addresses the highly relevant layout restriction and +semantic coherence separately and thus leads to unsatisfying results shown in +our experiments, we develop GSF to fuse the supervision from the layout +restriction and semantic coherence requirement and exploit it to guide the +image synthesis process. Moreover, to improve the physical coherence, we +develop a Self-similarity Coherence Attention (SCA) module to explicitly +integrate local contextual physical coherence into each pixel's generation +process. Specifically, we adopt a self-similarity map to encode the coherence +restrictions and employ it to extract coherent features from text embedding. +Through visualization of our self-similarity map, we explore the essence of +SCA, revealing that its effectiveness is not only in capturing reliable +physical coherence patterns but also in enhancing complex texture generation. +Extensive experiments demonstrate the superiority of our proposed method in +both image generation quality and controllability. + +
+
+
+
+
+ + ♻ ☆ BEVUDA: Multi-geometric Space Alignments for Domain Adaptive BEV 3D + Object Detection ICRA2024 + + +
+ Vision-centric bird's-eye-view (BEV) perception has shown promising potential
+in autonomous driving. Recent works mainly focus on improving efficiency or
+accuracy but neglect the challenges of changing environments, resulting in
+severe degradation of transfer performance. For BEV perception, we identify the
+significant domain gaps existing in typical real-world cross-domain scenarios
+and comprehensively solve the Domain Adaptation (DA) problem for multi-view 3D
+object detection. Since BEV perception approaches are complicated and contain
+several components, the accumulation of domain shift across multiple geometric
+spaces (i.e., 2D, 3D voxel, BEV) makes BEV DA even more challenging. In this
+paper, we propose a Multi-space Alignment Teacher-Student (MATS) framework to
+ease the domain shift accumulation, which consists of a Depth-Aware Teacher
+(DAT) and a Geometric-space Aligned Student (GAS) model. DAT tactfully combines
+target lidar and reliable depth predictions to construct depth-aware
+information, extracting target domain-specific knowledge in the voxel and BEV
+feature spaces. It then transfers the sufficient domain knowledge of multiple
+spaces to the student model. In order to jointly alleviate the domain shift,
+GAS projects multi-geometric space features to a shared geometric embedding
+space and decreases the data distribution distance between the two domains. To
+verify the effectiveness of our method, we conduct BEV 3D object detection
+experiments on three cross-domain scenarios and achieve state-of-the-art
+performance.
+
+
+ comment: Accepted by ICRA2024 +
+
+
+
+
+ + ♻ ☆ Back to 3D: Few-Shot 3D Keypoint Detection with Back-Projected 2D + Features CVPR 2024 + + +
+ With the immense growth of dataset sizes and computing resources in recent +years, so-called foundation models have become popular in NLP and vision tasks. +In this work, we propose to explore foundation models for the task of keypoint +detection on 3D shapes. A unique characteristic of keypoint detection is that +it requires semantic and geometric awareness while demanding high localization +accuracy. To address this problem, we propose, first, to back-project features +from large pre-trained 2D vision models onto 3D shapes and employ them for this +task. We show that we obtain robust 3D features that contain rich semantic +information and analyze multiple candidate features stemming from different 2D +foundation models. Second, we employ a keypoint candidate optimization module +which aims to match the average observed distribution of keypoints on the shape +and is guided by the back-projected features. The resulting approach achieves a +new state of the art for few-shot keypoint detection on the KeyPointNet +dataset, almost doubling the performance of the previous best methods. + +
+
+ comment: Accepted to CVPR 2024, Project page: + https://wimmerth.github.io/back-to-3d.html +
+
+
+
+
+ + ♻ ☆ Fast Dynamic 3D Object Generation from a Single-view Video + + +
+ Generating a dynamic 3D object from a single-view video is challenging due to
+the lack of 4D labeled data. Extending image-to-3D pipelines by transferring
+off-the-shelf image generation models through techniques such as score
+distillation sampling, existing methods tend to be slow and expensive to scale
+due to the need for back-propagating the information-limited supervision
+signals through a large pretrained model. To address this, we propose an
+efficient video-to-4D object generation framework called Efficient4D. It
+generates high-quality spacetime-consistent images under different camera
+views, and then uses them as labeled data to directly train a novel 4D Gaussian
+splatting model with explicit point cloud geometry, enabling real-time
+rendering under continuous camera trajectories. Extensive experiments on
+synthetic and real videos show that Efficient4D offers a remarkable 20-fold
+increase in speed when compared to prior art alternatives while preserving the
+quality of novel view synthesis. For example, Efficient4D takes only 6 minutes
+to model a dynamic object, vs 120 minutes for Consistent4D.
+
+
+ comment: Technical report +
+
+
+
+
+ + ♻ ☆ UniTraj: A Unified Framework for Scalable Vehicle Trajectory Prediction + + +
+ Vehicle trajectory prediction has increasingly relied on data-driven +solutions, but their ability to scale to different data domains and the impact +of larger dataset sizes on their generalization remain under-explored. While +these questions can be studied by employing multiple datasets, it is +challenging due to several discrepancies, e.g., in data formats, map +resolution, and semantic annotation types. To address these challenges, we +introduce UniTraj, a comprehensive framework that unifies various datasets, +models, and evaluation criteria, presenting new opportunities for the vehicle +trajectory prediction field. In particular, using UniTraj, we conduct extensive +experiments and find that model performance significantly drops when +transferred to other datasets. However, enlarging data size and diversity can +substantially improve performance, leading to a new state-of-the-art result for +the nuScenes dataset. We provide insights into dataset characteristics to +explain these findings. The code can be found here: +https://github.com/vita-epfl/UniTraj + +
+
+
+
+
+ + ♻ ☆ CLIP-DINOiser: Teaching CLIP a few DINO tricks for open-vocabulary + semantic segmentation + + +
+ The popular CLIP model displays impressive zero-shot capabilities thanks to +its seamless interaction with arbitrary text prompts. However, its lack of +spatial awareness makes it unsuitable for dense computer vision tasks, e.g., +semantic segmentation, without an additional fine-tuning step that often uses +annotations and can potentially suppress its original open-vocabulary +properties. Meanwhile, self-supervised representation methods have demonstrated +good localization properties without human-made annotations nor explicit +supervision. In this work, we take the best of both worlds and propose an +open-vocabulary semantic segmentation method, which does not require any +annotations. We propose to locally improve dense MaskCLIP features, which are +computed with a simple modification of CLIP's last pooling layer, by +integrating localization priors extracted from self-supervised features. By +doing so, we greatly improve the performance of MaskCLIP and produce smooth +outputs. Moreover, we show that the used self-supervised feature properties can +directly be learnt from CLIP features. Our method CLIP-DINOiser needs only a +single forward pass of CLIP and two light convolutional layers at inference, no +extra supervision nor extra memory and reaches state-of-the-art results on +challenging and fine-grained benchmarks such as COCO, Pascal Context, +Cityscapes and ADE20k. The code to reproduce our results is available at +https://github.com/wysoczanska/clip_dinoiser. + +
+
+
+
+
+ + ♻ ☆ Continual-MAE: Adaptive Distribution Masked Autoencoders for Continual + Test-Time Adaptation CVPR2024 + + +
+ Continual Test-Time Adaptation (CTTA) is proposed to migrate a source +pre-trained model to continually changing target distributions, addressing +real-world dynamism. Existing CTTA methods mainly rely on entropy minimization +or teacher-student pseudo-labeling schemes for knowledge extraction in +unlabeled target domains. However, dynamic data distributions cause +miscalibrated predictions and noisy pseudo-labels in existing self-supervised +learning methods, hindering the effective mitigation of error accumulation and +catastrophic forgetting problems during the continual adaptation process. To +tackle these issues, we propose a continual self-supervised method, Adaptive +Distribution Masked Autoencoders (ADMA), which enhances the extraction of +target domain knowledge while mitigating the accumulation of distribution +shifts. Specifically, we propose a Distribution-aware Masking (DaM) mechanism +to adaptively sample masked positions, followed by establishing consistency +constraints between the masked target samples and the original target samples. +Additionally, for masked tokens, we utilize an efficient decoder to reconstruct +a hand-crafted feature descriptor (e.g., Histograms of Oriented Gradients), +leveraging its invariant properties to boost task-relevant representations. +Through conducting extensive experiments on four widely recognized benchmarks, +our proposed method attains state-of-the-art performance in both classification +and segmentation CTTA tasks. Our project page: +https://sites.google.com/view/continual-mae/home. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ A2V: A Semi-Supervised Domain Adaptation Framework for Brain Vessel + Segmentation via Two-Phase Training Angiography-to-Venography Translation BMVC + + +
+ We present a semi-supervised domain adaptation framework for brain vessel
+segmentation from different image modalities. Existing state-of-the-art methods
+focus on a single modality, despite the wide range of available cerebrovascular
+imaging techniques. This can lead to significant distribution shifts that
+negatively impact the generalization across modalities. By relying on annotated
+angiographies and a limited number of annotated venographies, our framework
+accomplishes image-to-image translation and semantic segmentation, leveraging a
+disentangled and semantically rich latent space to represent heterogeneous data
+and perform image-level adaptation from source to target domains. Moreover, we
+reduce the typical complexity of cycle-based architectures and minimize the use
+of adversarial training, which allows us to build an efficient and intuitive
+model with stable training. We evaluate our method on magnetic resonance
+angiographies and venographies. While achieving state-of-the-art performance in
+the source domain, our method attains a Dice score in the target domain that is
+only 8.9% lower, highlighting its promising potential for robust
+cerebrovascular image segmentation across different modalities.
+
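+ The Dice score reported above is a simple overlap measure; a generic
+binary-mask version (not the authors' evaluation code) is sketched below.
+
+```python
+# Dice score for two binary segmentation masks.
+import numpy as np
+
+def dice_score(pred, target, eps=1e-7):
+    pred, target = pred.astype(bool), target.astype(bool)
+    intersection = np.logical_and(pred, target).sum()
+    return (2.0 * intersection + eps) / (pred.sum() + target.sum() + eps)
+
+a = np.zeros((64, 64), dtype=np.uint8); a[20:40, 20:40] = 1
+b = np.zeros((64, 64), dtype=np.uint8); b[25:45, 25:45] = 1
+print(round(float(dice_score(a, b)), 3))  # overlap of two shifted squares
+```
+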
+
+ comment: Accepted at the 34th British Machine Vision Conference (BMVC) +
+
+
+
+
+ + ♻ ☆ Debiasing Multimodal Large Language Models + + +
+ In the realms of computer vision and natural language processing, Large +Vision-Language Models (LVLMs) have become indispensable tools, proficient in +generating textual descriptions based on visual inputs. Despite their +advancements, our investigation reveals a noteworthy bias in the generated +content, where the output is primarily influenced by the underlying Large +Language Models (LLMs) prior rather than the input image. Our empirical +experiments underscore the persistence of this bias, as LVLMs often provide +confident answers even in the absence of relevant images or given incongruent +visual input. To rectify these biases and redirect the model's focus toward +vision information, we introduce two simple, training-free strategies. Firstly, +for tasks such as classification or multi-choice question-answering (QA), we +propose a ``calibration'' step through affine transformation to adjust the +output distribution. This ``Post-Hoc debias'' approach ensures uniform scores +for each answer when the image is absent, serving as an effective +regularization technique to alleviate the influence of LLM priors. For more +intricate open-ended generation tasks, we extend this method to ``Debias +sampling'', drawing inspirations from contrastive decoding methods. +Furthermore, our investigation sheds light on the instability of LVLMs across +various decoding configurations. Through systematic exploration of different +settings, we significantly enhance performance, surpassing reported results and +raising concerns about the fairness of existing evaluations. Comprehensive +experiments substantiate the effectiveness of our proposed strategies in +mitigating biases. These strategies not only prove beneficial in minimizing +hallucinations but also contribute to the generation of more helpful and +precise illustrations. + +
+
+ comment: 38 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ SIGNeRF: Scene Integrated Generation for Neural Radiance Fields + + +
+ Advances in image diffusion models have recently led to notable improvements +in the generation of high-quality images. In combination with Neural Radiance +Fields (NeRFs), they enabled new opportunities in 3D generation. However, most +generative 3D approaches are object-centric and applying them to editing +existing photorealistic scenes is not trivial. We propose SIGNeRF, a novel +approach for fast and controllable NeRF scene editing and scene-integrated +object generation. A new generative update strategy ensures 3D consistency +across the edited images, without requiring iterative optimization. We find +that depth-conditioned diffusion models inherently possess the capability to +generate 3D consistent views by requesting a grid of images instead of single +views. Based on these insights, we introduce a multi-view reference sheet of +modified images. Our method updates an image collection consistently based on +the reference sheet and refines the original NeRF with the newly generated +image set in one go. By exploiting the depth conditioning mechanism of the +image diffusion model, we gain fine control over the spatial location of the +edit and enforce shape guidance by a selected region or an external mesh. + +
+
+ comment: Project Page: https://signerf.jdihlmann.com +
+
+
+
+
+ + ♻ ☆ LocalStyleFool: Regional Video Style Transfer Attack Using Segment + Anything Model SP + + +
+ Previous work has shown that well-crafted adversarial perturbations can
+threaten the security of video recognition systems. Attackers can invade such
+models with a low query budget when the perturbations are semantic-invariant,
+such as StyleFool. Despite the query efficiency, the naturalness of the
+minutiae areas still requires amelioration, since StyleFool applies style
+transfer to all pixels in each frame. To close the gap, we propose
+LocalStyleFool, an improved black-box video adversarial attack that
+superimposes regional style-transfer-based perturbations on videos. Benefiting
+from the popularity and scalable usability of the Segment Anything Model (SAM),
+we first extract different regions according to semantic information and then
+track them through the video stream to maintain temporal consistency. Then, we
+add style-transfer-based perturbations to several regions selected based on the
+associative criterion of transfer-based gradient information and regional area.
+Perturbation fine adjustment follows to make the stylized videos adversarial.
+We demonstrate that LocalStyleFool can improve both intra-frame and inter-frame
+naturalness through a human-assessed survey, while maintaining a competitive
+fooling rate and query efficiency. Successful experiments on a high-resolution
+dataset also showcase that the scrupulous segmentation of SAM helps to improve
+the scalability of adversarial attacks under high-resolution data.
+
+
+ comment: Accepted to 2024 IEEE Security and Privacy Workshops (SPW) +
+
+
+
+
+
+ ♻ ☆ TULIP: Transformer for Upsampling of LiDAR Point Cloud CVPR 2024
+
+
+ LiDAR Upsampling is a challenging task for the perception systems of robots +and autonomous vehicles, due to the sparse and irregular structure of +large-scale scene contexts. Recent works propose to solve this problem by +converting LiDAR data from 3D Euclidean space into an image super-resolution +problem in 2D image space. Although their methods can generate high-resolution +range images with fine-grained details, the resulting 3D point clouds often +blur out details and predict invalid points. In this paper, we propose TULIP, a +new method to reconstruct high-resolution LiDAR point clouds from +low-resolution LiDAR input. We also follow a range image-based approach but +specifically modify the patch and window geometries of a Swin-Transformer-based +network to better fit the characteristics of range images. We conducted several +experiments on three public real-world and simulated datasets. TULIP +outperforms state-of-the-art methods in all relevant metrics and generates +robust and more realistic point clouds than prior works. + +
+
+ comment: The paper was accepted by CVPR 2024
+
+
+
+
+
+ + ♻ ☆ 3D Face Reconstruction Using A Spectral-Based Graph Convolution Encoder WWW 2024 + + +
+ Monocular 3D face reconstruction plays a crucial role in avatar generation, +with significant demand in web-related applications such as generating virtual +financial advisors in FinTech. Current reconstruction methods predominantly +rely on deep learning techniques and employ 2D self-supervision as a means to +guide model learning. However, these methods encounter challenges in capturing +the comprehensive 3D structural information of the face due to the utilization +of 2D images for model training purposes. To overcome this limitation and +enhance the reconstruction of 3D structural features, we propose an innovative +approach that integrates existing 2D features with 3D features to guide the +model learning process. Specifically, we introduce the 3D-ID Loss, which +leverages the high-dimensional structure features extracted from a +Spectral-Based Graph Convolution Encoder applied to the facial mesh. This +approach surpasses the sole reliance on the 3D information provided by the +facial mesh vertices coordinates. Our model is trained using 2D-3D data pairs +from a combination of datasets and achieves state-of-the-art performance on the +NoW benchmark. + +
+
+ comment: 4 pages, 3 figures. Accepted to WWW 2024 +
+
+
+
+
+ + ♻ ☆ AEROBLADE: Training-Free Detection of Latent Diffusion Images Using + Autoencoder Reconstruction Error CVPR 2024 + + +
+ With recent text-to-image models, anyone can generate deceptively realistic +images with arbitrary contents, fueling the growing threat of visual +disinformation. A key enabler for generating high-resolution images with low +computational cost has been the development of latent diffusion models (LDMs). +In contrast to conventional diffusion models, LDMs perform the denoising +process in the low-dimensional latent space of a pre-trained autoencoder (AE) +instead of the high-dimensional image space. Despite their relevance, the +forensic analysis of LDMs is still in its infancy. In this work we propose +AEROBLADE, a novel detection method which exploits an inherent component of +LDMs: the AE used to transform images between image and latent space. We find +that generated images can be more accurately reconstructed by the AE than real +images, allowing for a simple detection approach based on the reconstruction +error. Most importantly, our method is easy to implement and does not require +any training, yet nearly matches the performance of detectors that rely on +extensive training. We empirically demonstrate that AEROBLADE is effective +against state-of-the-art LDMs, including Stable Diffusion and Midjourney. +Beyond detection, our approach allows for the qualitative analysis of images, +which can be leveraged for identifying inpainted regions. We release our code +and data at https://github.com/jonasricker/aeroblade . + +
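+ The detection signal described above boils down to comparing an image with its
+autoencoder reconstruction. The sketch below uses a toy stand-in autoencoder
+and plain MSE; the real method works with an LDM's autoencoder and a perceptual
+distance, and the threshold logic here is purely illustrative.
+
+```python
+# Flag an image as likely generated when its AE reconstruction error is low.
+import numpy as np
+
+def reconstruction_error(image, autoencoder):
+    recon = autoencoder.decode(autoencoder.encode(image))
+    return float(np.mean((image - recon) ** 2))
+
+def flag_as_generated(image, autoencoder, threshold):
+    # Generated images tend to be reconstructed better than real photographs.
+    return reconstruction_error(image, autoencoder) < threshold
+
+class ToyAE:
+    """Stand-in autoencoder: average 2x2 blocks, then upsample back."""
+    def encode(self, image):
+        h, w, c = image.shape
+        return image.reshape(h // 2, 2, w // 2, 2, c).mean(axis=(1, 3))
+    def decode(self, latent):
+        return np.repeat(np.repeat(latent, 2, axis=0), 2, axis=1)
+
+rng = np.random.default_rng(0)
+smooth = np.tile(np.linspace(0, 1, 64)[:, None, None], (1, 64, 3))  # easy case
+noisy = rng.uniform(size=(64, 64, 3))                                # hard case
+print(reconstruction_error(smooth, ToyAE()) < reconstruction_error(noisy, ToyAE()))
+```
+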
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ A citizen science toolkit to collect human perceptions of urban + environments using open street view images + + +
+ Street View-level Imagery (SVI) is a valuable data source for studies (e.g.,
+environmental assessments, green space identification or land cover
+classification). While commercial SVI is available, such providers commonly
+restrict copying or reuse in ways necessary for research. Open SVI datasets are
+readily available from less restrictive sources, such as Mapillary, but due to
+the heterogeneity of the images, these require substantial preprocessing,
+filtering, and careful quality checks. We present an efficient method for the
+automated downloading, processing, cropping, and filtering of open SVI, to be
+used in a survey of human perceptions of the streets portrayed in these images.
+We demonstrate our open-source reusable SVI preparation and smartphone-friendly
+perception-survey software with Amsterdam (Netherlands) as the case study.
+Using a citizen science approach, we collected 22,637 ratings from 331 people
+about their perceptions of various criteria. We have published our software in
+a public repository for future reuse and reproducibility.
+
+
+
+
+
+ + ♻ ☆ Scalable Non-Cartesian Magnetic Resonance Imaging with R2D2 + + +
+ We propose a new approach for non-Cartesian magnetic resonance image +reconstruction. While unrolled architectures provide robustness via +data-consistency layers, embedding measurement operators in Deep Neural Network +(DNN) can become impractical at large scale. Alternative Plug-and-Play (PnP) +approaches, where the denoising DNNs are blind to the measurement setting, are +not affected by this limitation and have also proven effective, but their +highly iterative nature also affects scalability. To address this scalability +challenge, we leverage the "Residual-to-Residual DNN series for high-Dynamic +range imaging (R2D2)" approach recently introduced in astronomical imaging. +R2D2's reconstruction is formed as a series of residual images, iteratively +estimated as outputs of DNNs taking the previous iteration's image estimate and +associated data residual as inputs. The method can be interpreted as a learned +version of the Matching Pursuit algorithm. We demonstrate R2D2 in simulation, +considering radial k-space sampling acquisition sequences. Our preliminary +results suggest that R2D2 achieves: (i) suboptimal performance compared to its +unrolled incarnation R2D2-Net, which is however non-scalable due to the +necessary embedding of NUFFT-based data-consistency layers; (ii) superior +reconstruction quality to a scalable version of R2D2-Net embedding an FFT-based +approximation for data consistency; (iii) superior reconstruction quality to +PnP, while only requiring few iterations. + +
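+ The residual-series idea summarized above can be written as a short iteration:
+each network in the series maps the current image estimate and the
+back-projected data residual to an image increment. Below, the measurement
+operator is a toy matrix and the "networks" are simple gradient-style steps,
+purely to show the control flow; this is not the authors' implementation.
+
+```python
+# Schematic R2D2-style series on a toy linear inverse problem.
+import numpy as np
+
+def r2d2_series(y, A, networks):
+    x = A.T @ y                                   # crude initial back-projection
+    for net in networks:
+        data_residual = A.T @ (y - A @ x)         # back-projected data residual
+        x = x + net(x, data_residual)             # add the predicted residual image
+    return x
+
+rng = np.random.default_rng(0)
+A = rng.standard_normal((40, 20)) / np.sqrt(40)
+x_true = rng.standard_normal(20)
+y = A @ x_true
+networks = [lambda x, r: 0.5 * r] * 25            # stand-ins for learned DNNs
+x_hat = r2d2_series(y, A, networks)
+print(round(float(np.linalg.norm(x_hat - x_true) / np.linalg.norm(x_true)), 3))
+```
+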
+
+ comment: submitted to IEEE EUSIPCO 2024 +
+
+
+
+
+ + ♻ ☆ FoMo-Bench: a multi-modal, multi-scale and multi-task Forest Monitoring + Benchmark for remote sensing foundation models + + +
+ Forests are an essential part of Earth's ecosystems and natural systems, as +well as providing services on which humanity depends, yet they are rapidly +changing as a result of land use decisions and climate change. Understanding +and mitigating negative effects requires parsing data on forests at global +scale from a broad array of sensory modalities, and recently many such problems +have been approached using machine learning algorithms for remote sensing. To +date, forest-monitoring problems have largely been addressed in isolation. +Inspired by the rise of foundation models for computer vision and remote +sensing, we here present the first unified Forest Monitoring Benchmark +(FoMo-Bench). FoMo-Bench consists of 15 diverse datasets encompassing +satellite, aerial, and inventory data, covering a variety of geographical +regions, and including multispectral, red-green-blue, synthetic aperture radar +(SAR) and LiDAR data with various temporal, spatial and spectral resolutions. +FoMo-Bench includes multiple types of forest-monitoring tasks, spanning +classification, segmentation, and object detection. To further enhance the +diversity of tasks and geographies represented in FoMo-Bench, we introduce a +novel global dataset, TalloS, combining satellite imagery with ground-based +annotations for tree species classification, encompassing 1,000+ categories +across multiple hierarchical taxonomic levels (species, genus, family). +Finally, we propose FoMo-Net, a baseline foundation model with the capacity to +process any combination of commonly used spectral bands in remote sensing, +across diverse ground sampling distances and geographical locations worldwide. +This work aims to inspire research collaborations between machine learning and +forest biology researchers in exploring scalable multi-modal and multi-task +models for forest monitoring. All code and data will be made publicly +available. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ♻ ☆ Retrieval-Augmented Generation for AI-Generated Content: A Survey + + +
+ The development of Artificial Intelligence Generated Content (AIGC) has been
+facilitated by advancements in model algorithms, the increasing scale of
+foundation models, and the availability of ample high-quality datasets. While
+AIGC has achieved remarkable performance, it still faces several challenges,
+such as the difficulty of maintaining up-to-date and long-tail knowledge, the
+risk of data leakage, and the high costs associated with training and
+inference. Retrieval-Augmented Generation (RAG) has recently emerged as a
+paradigm to address such challenges. In particular, RAG introduces an
+information retrieval process, which enhances the generation process by
+retrieving relevant objects from available data stores, leading to higher
+accuracy and better robustness. In this paper, we comprehensively review
+existing efforts that integrate the RAG technique into AIGC scenarios. We first
+classify RAG foundations according to how the retriever augments the generator,
+distilling the fundamental abstractions of the augmentation methodologies for
+various retrievers and generators. This unified perspective encompasses all RAG
+scenarios, illuminating advancements and pivotal technologies that help with
+potential future progress. We also summarize additional enhancement methods for
+RAG, facilitating effective engineering and implementation of RAG systems.
+From another view, we then survey practical applications of RAG across
+different modalities and tasks, offering valuable references for researchers
+and practitioners. Furthermore, we introduce benchmarks for RAG, discuss the
+limitations of current RAG systems, and suggest potential directions for future
+research. Project Repo: https://github.com/hymie122/RAG-Survey.
+
+
+ comment: Citing 380 papers, 36 pages, 16 figures. Project: + https://github.com/hymie122/RAG-Survey +
+
+
+
+
+ + ♻ ☆ Learning Concept-Based Causal Transition and Symbolic Reasoning for + Visual Planning + + +
+ Visual planning simulates how humans make decisions to achieve desired goals +in the form of searching for visual causal transitions between an initial +visual state and a final visual goal state. It has become increasingly +important in egocentric vision with its advantages in guiding agents to perform +daily tasks in complex environments. In this paper, we propose an interpretable +and generalizable visual planning framework consisting of i) a novel +Substitution-based Concept Learner (SCL) that abstracts visual inputs into +disentangled concept representations, ii) symbol abstraction and reasoning that +performs task planning via the self-learned symbols, and iii) a Visual Causal +Transition model (ViCT) that grounds visual causal transitions to semantically +similar real-world actions. Given an initial state, we perform goal-conditioned +visual planning with a symbolic reasoning method fueled by the learned +representations and causal transitions to reach the goal state. To verify the +effectiveness of the proposed model, we collect a large-scale visual planning +dataset based on AI2-THOR, dubbed as CCTP. Extensive experiments on this +challenging dataset demonstrate the superior performance of our method in +visual task planning. Empirically, we show that our framework can generalize to +unseen task trajectories, unseen object categories, and real-world data. +Further details of this work are provided at +https://fqyqc.github.io/ConTranPlan/. + +
+
+
+
+
+ + ♻ ☆ Centered Masking for Language-Image Pre-Training + + +
+ We introduce Gaussian masking for Language-Image Pre-Training (GLIP), a novel,
+straightforward, and effective technique for masking image patches during
+pre-training of a vision-language model. GLIP builds on Fast Language-Image
+Pre-Training (FLIP), which randomly masks image patches while training a CLIP
+model. GLIP replaces random masking with centered masking, which uses a
+Gaussian distribution and is inspired by the importance of image patches at the
+center of the image. GLIP retains the same computational savings as FLIP, while
+improving performance across a range of downstream datasets and tasks, as
+demonstrated by our experimental results. We show that the benefits of GLIP are
+easy to obtain, requiring no delicate tuning of the Gaussian, and that the
+method is also applicable to datasets containing images without an obvious
+center focus.
+
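+ Under one plausible reading of the centered masking above, patches are kept
+with probability weighted by a Gaussian centred on the image, so corner patches
+are masked more often. Grid size, mask ratio, and sigma in the sketch below are
+illustrative and may differ from GLIP's exact weighting.
+
+```python
+# Sample a patch mask whose kept patches concentrate around the image centre.
+import numpy as np
+
+def centered_mask(grid=14, mask_ratio=0.5, sigma=0.35, rng=None):
+    rng = rng or np.random.default_rng()
+    ys, xs = np.meshgrid(np.linspace(-1, 1, grid), np.linspace(-1, 1, grid),
+                         indexing="ij")
+    keep_weight = np.exp(-(xs**2 + ys**2) / (2 * sigma**2))   # high at the centre
+    keep_prob = keep_weight.ravel() / keep_weight.sum()
+    n_keep = int(round(grid * grid * (1 - mask_ratio)))
+    kept = rng.choice(grid * grid, size=n_keep, replace=False, p=keep_prob)
+    mask = np.ones(grid * grid, dtype=bool)                   # True = masked patch
+    mask[kept] = False
+    return mask.reshape(grid, grid)
+
+print(centered_mask(rng=np.random.default_rng(0)).astype(int))
+```
+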
+
+
+
+
+ + ♻ ☆ Physical 3D Adversarial Attacks against Monocular Depth Estimation in + Autonomous Driving CVPR 2024 + + +
+ Deep learning-based monocular depth estimation (MDE), extensively applied in +autonomous driving, is known to be vulnerable to adversarial attacks. Previous +physical attacks against MDE models rely on 2D adversarial patches, so they +only affect a small, localized region in the MDE map but fail under various +viewpoints. To address these limitations, we propose 3D Depth Fool +(3D$^2$Fool), the first 3D texture-based adversarial attack against MDE models. +3D$^2$Fool is specifically optimized to generate 3D adversarial textures +agnostic to model types of vehicles and to have improved robustness in bad +weather conditions, such as rain and fog. Experimental results validate the +superior performance of our 3D$^2$Fool across various scenarios, including +vehicles, MDE models, weather conditions, and viewpoints. Real-world +experiments with printed 3D textures on physical vehicle models further +demonstrate that our 3D$^2$Fool can cause an MDE error of over 10 meters. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Weakly-Supervised Conditional Embedding for Referred Visual Search + + +
+ This paper introduces a new challenge for image similarity search in the context of fashion, addressing the inherent ambiguity in this domain stemming from complex images. We present Referred Visual Search (RVS), a task allowing users to define more precisely the desired similarity, following recent interest in the industry. We release a new large public dataset, LAION-RVS-Fashion, consisting of 272k fashion products with 842k images extracted from LAION, designed explicitly for this task. However, unlike traditional visual search methods in the industry, we demonstrate that superior performance can be achieved by bypassing explicit object detection and adopting weakly-supervised conditional contrastive learning on image tuples. Our method is lightweight and demonstrates robustness, reaching a Recall@1 superior to strong detection-based baselines against 2M distractors. Code, data and models are available at https://www.github.com/Simon-Lepage/CondViT-LRVSF.
+
+ comment: 28 pages, 13 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Multi-criteria Token Fusion with One-step-ahead Attention for Efficient + Vision Transformers CVPR + + +
+ Vision Transformer (ViT) has emerged as a prominent backbone for computer vision. For more efficient ViTs, recent works lessen the quadratic cost of the self-attention layer by pruning or fusing the redundant tokens. However, these works face a speed-accuracy trade-off caused by the loss of information. Here, we argue that token fusion needs to consider diverse relations between tokens to minimize information loss. In this paper, we propose Multi-criteria Token Fusion (MCTF), which gradually fuses tokens based on multiple criteria (e.g., similarity, informativeness, and size of the fused tokens). Further, we utilize one-step-ahead attention, an improved approach to capturing the informativeness of the tokens. By training the model equipped with MCTF using a token reduction consistency, we achieve the best speed-accuracy trade-off in image classification (ImageNet1K). Experimental results prove that MCTF consistently surpasses the previous reduction methods with and without training. Specifically, DeiT-T and DeiT-S with MCTF reduce FLOPs by about 44% while improving the performance (+0.5% and +0.3%) over the base model, respectively. We also demonstrate the applicability of MCTF in various Vision Transformers (e.g., T2T-ViT, LV-ViT), achieving at least 31% speedup without performance degradation. Code is available at https://github.com/mlvlab/MCTF.
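A toy illustration of multi-criteria token fusion, assuming the criteria listed in the abstract (similarity, informativeness, fused size): repeatedly merge the token pair with the best combined score using size-weighted averaging. This greedy loop is a simplification; the actual MCTF uses one-step-ahead attention and a matching-based fusion, and the weights alpha/beta here are invented for illustration.

import numpy as np

def fuse_tokens(tokens, sizes, info, r=2, alpha=1.0, beta=1.0):
    # tokens: (N, D), sizes: (N,) number of original tokens merged so far,
    # info: (N,) informativeness scores (e.g. attention-derived).
    def cos_sim(t):
        x = t / np.linalg.norm(t, axis=1, keepdims=True)
        s = x @ x.T
        np.fill_diagonal(s, -np.inf)
        return s
    sim = cos_sim(tokens)
    for _ in range(r):
        n = len(tokens)
        # Prefer pairs that are similar, uninformative, and small when fused.
        score = sim - alpha * (info[:, None] + info[None, :]) \
                    - beta * np.log(sizes[:, None] + sizes[None, :])
        i, j = np.unravel_index(np.argmax(score), score.shape)
        w = sizes[[i, j], None]
        merged = (tokens[[i, j]] * w).sum(0) / w.sum()   # size-weighted average
        keep = [k for k in range(n) if k not in (i, j)]
        tokens = np.vstack([tokens[keep], merged])
        sizes = np.append(sizes[keep], sizes[i] + sizes[j])
        info = np.append(info[keep], (info[i] + info[j]) / 2)
        sim = cos_sim(tokens)
    return tokens, sizes, info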
+
+ comment: Conference on Computer Vision and Pattern Recognition (CVPR), 2024 +
+
+
+
+
+ + ♻ ☆ Task-Adaptive Saliency Guidance for Exemplar-free Class Incremental + Learning CVPR 2024 + + +
+ Exemplar-free Class Incremental Learning (EFCIL) aims to sequentially learn +tasks with access only to data from the current one. EFCIL is of interest +because it mitigates concerns about privacy and long-term storage of data, +while at the same time alleviating the problem of catastrophic forgetting in +incremental learning. In this work, we introduce task-adaptive saliency for +EFCIL and propose a new framework, which we call Task-Adaptive Saliency +Supervision (TASS), for mitigating the negative effects of saliency drift +between different tasks. We first apply boundary-guided saliency to maintain +task adaptivity and \textit{plasticity} on model attention. Besides, we +introduce task-agnostic low-level signals as auxiliary supervision to increase +the \textit{stability} of model attention. Finally, we introduce a module for +injecting and recovering saliency noise to increase the robustness of saliency +preservation. Our experiments demonstrate that our method can better preserve +saliency maps across tasks and achieve state-of-the-art results on the +CIFAR-100, Tiny-ImageNet, and ImageNet-Subset EFCIL benchmarks. Code is +available at \url{https://github.com/scok30/tass}. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ The Effects of Mixed Sample Data Augmentation are Class Dependent + + +
+ Mixed Sample Data Augmentation (MSDA) techniques, such as Mixup, CutMix, and PuzzleMix, have been widely acknowledged for enhancing performance in a variety of tasks. A previous study reported the class dependency of traditional data augmentation (DA), where certain classes benefit disproportionately compared to others. This paper reveals a class-dependent effect of MSDA, where some classes experience improved performance while others experience degraded performance. This research addresses the issue of class dependency in MSDA and proposes an algorithm to mitigate it. The approach involves training on a mixture of MSDA and non-MSDA data, which not only mitigates the negative impact on the affected classes, but also improves overall accuracy. Furthermore, we provide in-depth analysis and discussion of why MSDA introduces class dependencies and which classes are most likely to exhibit them.
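A sketch of the mitigation described in the abstract, training on a mixture of MSDA and non-MSDA data: apply Mixup to only a fraction of each batch and leave the remaining samples clean. The fraction and the Beta parameter are illustrative assumptions.

import numpy as np

def mixed_msda_batch(x, y_onehot, msda_fraction=0.5, alpha=1.0, rng=None):
    # x: (B, ...) float inputs, y_onehot: (B, C) labels; returns a batch where
    # only `msda_fraction` of samples receive Mixup.
    rng = np.random.default_rng() if rng is None else rng
    b = x.shape[0]
    n_mix = int(round(msda_fraction * b))
    idx = rng.permutation(b)[:n_mix]     # samples that receive Mixup
    perm = rng.permutation(idx)          # mixing partners within that subset
    lam = rng.beta(alpha, alpha)
    x = x.astype(float).copy()
    y = y_onehot.astype(float).copy()
    x[idx] = lam * x[idx] + (1 - lam) * x[perm]
    y[idx] = lam * y[idx] + (1 - lam) * y[perm]
    return x, y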
+
+ comment: 21 pages, 18 figures, Overall Revision +
+
+
+
+
+ + ♻ ☆ Spectral Meets Spatial: Harmonising 3D Shape Matching and Interpolation CVPR2024 + + +
+ Although 3D shape matching and interpolation are highly interrelated, they +are often studied separately and applied sequentially to relate different 3D +shapes, thus resulting in sub-optimal performance. In this work we present a +unified framework to predict both point-wise correspondences and shape +interpolation between 3D shapes. To this end, we combine the deep functional +map framework with classical surface deformation models to map shapes in both +spectral and spatial domains. On the one hand, by incorporating spatial maps, +our method obtains more accurate and smooth point-wise correspondences compared +to previous functional map methods for shape matching. On the other hand, by +introducing spectral maps, our method gets rid of commonly used but +computationally expensive geodesic distance constraints that are only valid for +near-isometric shape deformations. Furthermore, we propose a novel test-time +adaptation scheme to capture both pose-dominant and shape-dominant +deformations. Using different challenging datasets, we demonstrate that our +method outperforms previous state-of-the-art methods for both shape matching +and interpolation, even compared to supervised approaches. + +
+
+ comment: accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ CEIMVEN: An Approach of Cutting Edge Implementation of Modified Versions + of EfficientNet (V1-V2) Architecture for Breast Cancer Detection and + Classification from Ultrasound Images + + +
+ Breast cancer is one of the most widespread and deadly cancers worldwide, affecting millions of women each year and remaining a leading cause of death among women. In recent research, medical image computing and processing, together with deep neural networks, has played a significant role in detecting and classifying breast cancers from ultrasound images and mammograms. In this research, we focus on rigorous implementations and iterative result analysis of different cutting-edge modified versions of EfficientNet architectures, namely EfficientNet-V1 (b0-b7) and EfficientNet-V2 (b0-b3), on ultrasound images, collectively named CEIMVEN. We adopt a transfer learning approach using pre-trained EfficientNet models, perform hyper-parameter tuning, add fully connected layers, discard outliers, and record the accuracy results of our custom modified EfficientNet architectures. Our training approach covers both identifying cancer-affected areas with region-of-interest (ROI) techniques and multi-class classification (benign, malignant and normal). The approximate testing accuracies obtained from the modified versions of EfficientNet-V1 (b0- 99.15%, b1- 98.58%, b2- 98.43%, b3- 98.01%, b4- 98.86%, b5- 97.72%, b6- 97.72%, b7- 98.72%) and EfficientNet-V2 (b0- 99.29%, b1- 99.01%, b2- 98.72%, b3- 99.43%) indicate the strong potential of deep learning for the successful detection and classification of breast cancers from ultrasound images at a very early stage. The code for this research is available here: https://github.com/ac005sheekar/CEIMVEN-Breast.
+
+
+
+
+ + ♻ ☆ ViT-CoMer: Vision Transformer with Convolutional Multi-scale Feature + Interaction for Dense Predictions CVPR2024 + + +
+ Although Vision Transformer (ViT) has achieved significant success in computer vision, it does not perform well in dense prediction tasks due to the lack of inner-patch information interaction and the limited diversity of feature scale. Most existing studies are devoted to designing vision-specific transformers to solve the above problems, which introduce additional pre-training costs. Therefore, we present a plain, pre-training-free, and feature-enhanced ViT backbone with Convolutional Multi-scale feature interaction, named ViT-CoMer, which facilitates bidirectional interaction between CNN and transformer. Compared to the state-of-the-art, ViT-CoMer has the following advantages: (1) We inject spatial pyramid multi-receptive field convolutional features into the ViT architecture, which effectively alleviates the problems of limited local information interaction and single-feature representation in ViT. (2) We propose a simple and efficient CNN-Transformer bidirectional fusion interaction module that performs multi-scale fusion across hierarchical features, which is beneficial for handling dense prediction tasks. (3) We evaluate the performance of ViT-CoMer across various dense prediction tasks, different frameworks, and multiple advanced pre-training strategies. Notably, our ViT-CoMer-L achieves 64.3% AP on COCO val2017 without extra training data, and 62.1% mIoU on ADE20K val, both of which are comparable to state-of-the-art methods. We hope ViT-CoMer can serve as a new backbone for dense prediction tasks to facilitate future research. The code will be released at https://github.com/Traffic-X/ViT-CoMer.
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ InterControl: Generate Human Motion Interactions by Controlling Every + Joint + + +
+ Text-conditioned human motion synthesis has made remarkable progress with the emergence of diffusion models in recent research. However, the majority of these motion diffusion models are primarily designed for a single character and overlook multi-human interactions. In our approach, we strive to explore this problem by synthesizing human motion with interactions for a group of characters of any size. The key aspect of our approach is the modeling of human-wise interactions as pairs of human joints that can be either in contact or separated by a desired distance. In contrast to existing methods that necessitate training motion generation models on multi-human motion datasets with a fixed number of characters, our approach inherently possesses the flexibility to model human interactions involving an arbitrary number of individuals, thereby transcending the limitations imposed by the training data. We introduce a novel controllable motion generation method, InterControl, to encourage the synthesized motions to maintain the desired distance between joint pairs. It consists of a motion controller and an inverse kinematics guidance module that realistically and accurately aligns the joints of synthesized characters to the desired location. Furthermore, we demonstrate that the distance between joint pairs for human-wise interactions can be generated using an off-the-shelf Large Language Model (LLM). Experimental results highlight the capability of our framework to generate interactions with multiple human characters and its potential to work with off-the-shelf physics-based character simulators.
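A sketch of the interaction specification described above: a penalty that encourages selected joint pairs of two characters to stay at a desired distance. This is only the distance objective, not the InterControl controller or IK guidance; the array shapes are assumptions.

import numpy as np

def joint_distance_loss(motion_a, motion_b, pairs, target_dist):
    # motion_a, motion_b: (T, J, 3) joint positions for two characters;
    # pairs: list of (joint_in_a, joint_in_b); target_dist: desired distances (meters).
    loss = 0.0
    for (ja, jb), d in zip(pairs, target_dist):
        dist = np.linalg.norm(motion_a[:, ja] - motion_b[:, jb], axis=-1)
        loss += np.mean((dist - d) ** 2)
    return loss / len(pairs)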
+
+ comment: Generate human interactions with only single-person data via joint + contact pairs, code https://github.com/zhenzhiwang/intercontrol +
+
+
+
+
+ + ♻ ☆ SSM Meets Video Diffusion Models: Efficient Video Generation with + Structured State Spaces ICLR 2024 + + +
+ Given the remarkable achievements in image generation through diffusion +models, the research community has shown increasing interest in extending these +models to video generation. Recent diffusion models for video generation have +predominantly utilized attention layers to extract temporal features. However, +attention layers are limited by their memory consumption, which increases +quadratically with the length of the sequence. This limitation presents +significant challenges when attempting to generate longer video sequences using +diffusion models. To overcome this challenge, we propose leveraging state-space +models (SSMs). SSMs have recently gained attention as viable alternatives due +to their linear memory consumption relative to sequence length. In the +experiments, we first evaluate our SSM-based model with UCF101, a standard +benchmark of video generation. In addition, to investigate the potential of +SSMs for longer video generation, we perform an experiment using the MineRL +Navigate dataset, varying the number of frames to 64, 200, and 400. In these +settings, our SSM-based model can considerably save memory consumption for +longer sequences, while maintaining competitive FVD scores to the +attention-based models. Our codes are available at +https://github.com/shim0114/SSM-Meets-Video-Diffusion-Models. + +
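A small sketch of why state-space models scale linearly with sequence length: a linear recurrence carries only a fixed-size hidden state from step to step, unlike attention whose activation memory grows quadratically with the number of frames. The discretized matrices are assumed to be given.

import numpy as np

def ssm_scan(x, A, B, C):
    # Discrete linear SSM: h_t = A h_{t-1} + B x_t, y_t = C h_t.
    # x: (T, d_in); A: (d_state, d_state); B: (d_state, d_in); C: (d_out, d_state).
    h = np.zeros(A.shape[0])
    ys = []
    for x_t in x:
        h = A @ h + B @ x_t      # state has constant size, independent of T
        ys.append(C @ h)
    return np.stack(ys)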
+
+ comment: Accepted as workshop paper at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Rotation-Invariant Transformer for Point Cloud Matching CVPR 2023 + + +
+ The intrinsic rotation invariance lies at the core of matching point clouds +with handcrafted descriptors. However, it is widely despised by recent deep +matchers that obtain the rotation invariance extrinsically via data +augmentation. As the finite number of augmented rotations can never span the +continuous SO(3) space, these methods usually show instability when facing +rotations that are rarely seen. To this end, we introduce RoITr, a +Rotation-Invariant Transformer to cope with the pose variations in the point +cloud matching task. We contribute both on the local and global levels. +Starting from the local level, we introduce an attention mechanism embedded +with Point Pair Feature (PPF)-based coordinates to describe the pose-invariant +geometry, upon which a novel attention-based encoder-decoder architecture is +constructed. We further propose a global transformer with rotation-invariant +cross-frame spatial awareness learned by the self-attention mechanism, which +significantly improves the feature distinctiveness and makes the model robust +with respect to the low overlap. Experiments are conducted on both the rigid +and non-rigid public benchmarks, where RoITr outperforms all the +state-of-the-art models by a considerable margin in the low-overlapping +scenarios. Especially when the rotations are enlarged on the challenging +3DLoMatch benchmark, RoITr surpasses the existing methods by at least 13 and 5 +percentage points in terms of Inlier Ratio and Registration Recall, +respectively. + +
+
+ comment: Accepted to CVPR 2023 +
+
+
+
+
+ + ♻ ☆ Extend Your Own Correspondences: Unsupervised Distant Point Cloud + Registration by Progressive Distance Extension CVPR + + +
+ Registration of point clouds collected from a pair of distant vehicles provides a comprehensive and accurate 3D view of the driving scenario, which is vital for driving-safety-related applications, yet the existing literature suffers from expensive pose label acquisition and an inability to generalize to new data distributions. In this paper, we propose EYOC, an unsupervised distant point cloud registration method that adapts to new point cloud distributions on the fly, requiring no global pose labels. The core idea of EYOC is to train a feature extractor in a progressive fashion, where in each round, the feature extractor, trained with near point cloud pairs, can label slightly farther point cloud pairs, enabling self-supervision on such far point cloud pairs. This process continues until the derived extractor can be used to register distant point clouds. Particularly, to enable high-fidelity correspondence label generation, we devise an effective spatial filtering scheme to select the most representative correspondences to register a point cloud pair, and then utilize the aligned point clouds to discover more correct correspondences. Experiments show that EYOC can achieve comparable performance with state-of-the-art supervised methods at a lower training cost. Moreover, it surpasses supervised methods in generalization performance on new data distributions.
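A schematic of the progressive distance-extension loop described in the abstract, under the assumption that pairs are bucketed by inter-vehicle distance. `train_round`, `register`, and `self_label` are hypothetical helpers, not functions from the EYOC codebase.

def train_eyoc_style(extractor, pairs_by_distance, train_round, self_label,
                     register, max_distance=50, step=10):
    # pairs_by_distance: dict mapping a distance bracket (meters) to point-cloud pairs.
    labeled = pairs_by_distance[0]              # near pairs, easiest to align
    for d in range(step, max_distance + step, step):
        extractor = train_round(extractor, labeled)
        # The current extractor pseudo-labels pairs in the next distance bracket.
        farther = pairs_by_distance.get(d, [])
        labeled = [self_label(extractor, register, pair) for pair in farther]
    return extractor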
+
+ comment: In Proceedings of the IEEE/CVF Conference on Computer Vision and + Pattern Recognition (CVPR), 2024 +
+
+
+
+
+ + ♻ ☆ Foundation Model Makes Clustering A Better Initialization For Cold-Start + Active Learning + + +
+ Active learning selects the most informative samples from the unlabelled dataset to annotate in the context of a limited annotation budget. While numerous methods have been proposed for subsequent sample selection based on an initialized model, scant attention has been paid to an indispensable phase of active learning: selecting samples for model cold-start initialization. Most of the previous studies resort to random sampling or naive clustering. However, random sampling is prone to fluctuation, and naive clustering suffers from slow convergence, particularly when dealing with high-dimensional data such as imaging data. In this work, we propose to integrate foundation models with clustering methods to select samples for cold-start active learning initialization. Foundation models refer to those trained on massive datasets by the self-supervised paradigm, capable of generating informative and compact embeddings for various downstream tasks. Leveraging these embeddings to replace raw features such as pixel values, clustering quickly converges and identifies better initial samples. For a comprehensive comparison, we also included a classic ImageNet-supervised model to acquire embeddings. Experiments on two clinical tasks of image classification and segmentation demonstrated that foundation model-based clustering efficiently pinpointed informative initial samples, leading to models with better performance than the baseline methods. We envisage that this study provides an effective paradigm for future cold-start active learning.
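A minimal sketch of the clustering-based cold-start selection, assuming embeddings from a frozen foundation model are already computed: cluster the embeddings and annotate the sample closest to each cluster center. The choice of k-means and "closest to center" is one plausible instantiation, not necessarily the paper's exact recipe.

import numpy as np
from sklearn.cluster import KMeans

def cold_start_selection(embeddings, budget, seed=0):
    # embeddings: (N, D) array from a frozen foundation model; budget: number of labels.
    km = KMeans(n_clusters=budget, n_init=10, random_state=seed).fit(embeddings)
    chosen = []
    for c in range(budget):
        members = np.where(km.labels_ == c)[0]
        d = np.linalg.norm(embeddings[members] - km.cluster_centers_[c], axis=1)
        chosen.append(members[np.argmin(d)])   # most central sample of each cluster
    return np.array(chosen)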
+
+
+
+
+ + ♻ ☆ DifFlow3D: Toward Robust Uncertainty-Aware Scene Flow Estimation with + Iterative Diffusion-Based Refinement CVPR 2024 + + +
+ Scene flow estimation, which aims to predict per-point 3D displacements of +dynamic scenes, is a fundamental task in the computer vision field. However, +previous works commonly suffer from unreliable correlation caused by locally +constrained searching ranges, and struggle with accumulated inaccuracy arising +from the coarse-to-fine structure. To alleviate these problems, we propose a +novel uncertainty-aware scene flow estimation network (DifFlow3D) with the +diffusion probabilistic model. Iterative diffusion-based refinement is designed +to enhance the correlation robustness and resilience to challenging cases, e.g. +dynamics, noisy inputs, repetitive patterns, etc. To restrain the generation +diversity, three key flow-related features are leveraged as conditions in our +diffusion model. Furthermore, we also develop an uncertainty estimation module +within diffusion to evaluate the reliability of estimated scene flow. Our +DifFlow3D achieves state-of-the-art performance, with 24.0% and 29.1% EPE3D +reduction respectively on FlyingThings3D and KITTI 2015 datasets. Notably, our +method achieves an unprecedented millimeter-level accuracy (0.0078m in EPE3D) +on the KITTI dataset. Additionally, our diffusion-based refinement paradigm can +be readily integrated as a plug-and-play module into existing scene flow +networks, significantly increasing their estimation accuracy. Codes are +released at https://github.com/IRMVLab/DifFlow3D. + +
+
+ comment: Camera-ready version of CVPR 2024. Codes are released at + https://github.com/IRMVLab/DifFlow3D +
+
+
+
+
+ + ♻ ☆ Task-wise Sampling Convolutions for Arbitrary-Oriented Object Detection + in Aerial Images + + +
+ Arbitrary-oriented object detection (AOOD) has been widely applied to locate +and classify objects with diverse orientations in remote sensing images. +However, the inconsistent features for the localization and classification +tasks in AOOD models may lead to ambiguity and low-quality object predictions, +which constrains the detection performance. In this article, an AOOD method +called task-wise sampling convolutions (TS-Conv) is proposed. TS-Conv +adaptively samples task-wise features from respective sensitive regions and +maps these features together in alignment to guide a dynamic label assignment +for better predictions. Specifically, sampling positions of the localization +convolution in TS-Conv are supervised by the oriented bounding box (OBB) +prediction associated with spatial coordinates, while sampling positions and +convolutional kernel of the classification convolution are designed to be +adaptively adjusted according to different orientations for improving the +orientation robustness of features. Furthermore, a dynamic +task-consistent-aware label assignment (DTLA) strategy is developed to select +optimal candidate positions and assign labels dynamically according to ranked +task-aware scores obtained from TS-Conv. Extensive experiments on several +public datasets covering multiple scenes, multimodal images, and multiple +categories of objects demonstrate the effectiveness, scalability, and superior +performance of the proposed TS-Conv. + +
+
+ comment: 15 pages, 13 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ FSC: Few-point Shape Completion CVPR 2024 + + +
+ While previous studies have demonstrated successful 3D object shape completion with a sufficient number of points, they often fail in scenarios where only a few points, e.g. tens of points, are observed. Surprisingly, via entropy analysis, we find that even a few points, e.g. 64 points, can retain substantial information to help recover the 3D shape of the object. To address the challenge of shape completion with very sparse point clouds, we propose the Few-point Shape Completion (FSC) model, which contains a novel dual-branch feature extractor for handling extremely sparse inputs: an extensive branch for maximal point utilization coupled with a saliency branch for dynamic importance assignment. This model is further bolstered by a two-stage revision network that refines both the extracted features and the decoder output, enhancing the detail and authenticity of the completed point cloud. Our experiments demonstrate the feasibility of recovering 3D shapes from a few points. The proposed FSC model outperforms previous methods on both few-point and many-point inputs, and shows good generalizability to different object categories.
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via + Expressive Masked Audio Gesture Modeling CVPR + + +
+ We propose EMAGE, a framework to generate full-body human gestures from audio +and masked gestures, encompassing facial, local body, hands, and global +movements. To achieve this, we first introduce BEAT2 (BEAT-SMPLX-FLAME), a new +mesh-level holistic co-speech dataset. BEAT2 combines MoShed SMPLX body with +FLAME head parameters and further refines the modeling of head, neck, and +finger movements, offering a community-standardized, high-quality 3D motion +captured dataset. EMAGE leverages masked body gesture priors during training to +boost inference performance. It involves a Masked Audio Gesture Transformer, +facilitating joint training on audio-to-gesture generation and masked gesture +reconstruction to effectively encode audio and body gesture hints. Encoded body +hints from masked gestures are then separately employed to generate facial and +body movements. Moreover, EMAGE adaptively merges speech features from the +audio's rhythm and content and utilizes four compositional VQ-VAEs to enhance +the results' fidelity and diversity. Experiments demonstrate that EMAGE +generates holistic gestures with state-of-the-art performance and is flexible +in accepting predefined spatial-temporal gesture inputs, generating complete, +audio-synchronized results. Our code and dataset are available at +https://pantomatrix.github.io/EMAGE/ + +
+
+ comment: Conflict of Interest Disclosure; CVPR Camera Ready; Project Page: + https://pantomatrix.github.io/EMAGE/ +
+
+
+
+
+ + ♻ ☆ PPAD: Iterative Interactions of Prediction and Planning for End-to-end + Autonomous Driving + + +
+ We present a new interaction mechanism of prediction and planning for +end-to-end autonomous driving, called PPAD (Iterative Interaction of Prediction +and Planning Autonomous Driving), which considers the timestep-wise interaction +to better integrate prediction and planning. An ego vehicle performs motion +planning at each timestep based on the trajectory prediction of surrounding +agents (e.g., vehicles and pedestrians) and its local road conditions. Unlike +existing end-to-end autonomous driving frameworks, PPAD models the interactions +among ego, agents, and the dynamic environment in an autoregressive manner by +interleaving the Prediction and Planning processes at every timestep, instead +of a single sequential process of prediction followed by planning. +Specifically, we design ego-to-agent, ego-to-map, and ego-to-BEV interaction +mechanisms with hierarchical dynamic key objects attention to better model the +interactions. The experiments on the nuScenes benchmark show that our approach +outperforms state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ SchurVINS: Schur Complement-Based Lightweight Visual Inertial Navigation + System + + +
+ Accuracy and computational efficiency are the most important metrics for a Visual Inertial Navigation System (VINS). Existing VINS algorithms achieve either high accuracy or low computational complexity, but struggle to provide high-precision localization on resource-constrained devices. To this end, we propose a novel filter-based VINS framework named SchurVINS, which guarantees both high accuracy, by building a complete residual model, and low computational complexity, via the Schur complement. Technically, we first formulate the full residual model in which the Gradient, Hessian and observation covariance are explicitly modeled. Then the Schur complement is employed to decompose the full model into an ego-motion residual model and a landmark residual model. Finally, Extended Kalman Filter (EKF) updates are implemented in these two models with high efficiency. Experiments on the EuRoC and TUM-VI datasets show that our method notably outperforms state-of-the-art (SOTA) methods in both accuracy and computational complexity. The experimental code of SchurVINS is available at https://github.com/bytedance/SchurVINS.
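A generic sketch of the Schur-complement decomposition that this kind of system builds on: split the normal-equation Hessian into pose (ego-motion) and landmark blocks, solve the small pose system via the Schur complement, then back-substitute for the landmarks. This is the standard linear-algebra step, not the authors' filter implementation.

import numpy as np

def schur_marginalize(H, b, n_pose):
    # Solve [[Hpp, Hpl], [Hlp, Hll]] [dx_pose; dx_landmark] = [bp; bl]
    # by eliminating the landmark block with the Schur complement.
    Hpp, Hpl = H[:n_pose, :n_pose], H[:n_pose, n_pose:]
    Hlp, Hll = H[n_pose:, :n_pose], H[n_pose:, n_pose:]
    bp, bl = b[:n_pose], b[n_pose:]
    Hll_inv = np.linalg.inv(Hll)            # block-diagonal in practice, cheap to invert
    S = Hpp - Hpl @ Hll_inv @ Hlp           # Schur complement of Hll
    g = bp - Hpl @ Hll_inv @ bl
    dx_pose = np.linalg.solve(S, g)
    dx_landmark = Hll_inv @ (bl - Hlp @ dx_pose)
    return dx_pose, dx_landmark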
+
+
+
+
+ + ♻ ☆ Generalized Logit Adjustment: Calibrating Fine-tuned Models by Removing + Label Bias in Foundation Models NeurIPS2023 + + +
+ Foundation models like CLIP allow zero-shot transfer on various tasks without additional training data. Yet, the zero-shot performance is less competitive than a fully supervised one. Thus, to enhance the performance, fine-tuning and ensembling are also commonly adopted to better fit the downstream tasks. However, we argue that such prior work has overlooked the inherent biases in foundation models. Due to the highly imbalanced Web-scale training set, these foundation models are inevitably skewed toward frequent semantics, and thus the subsequent fine-tuning or ensembling is still biased. In this study, we systematically examine the biases in foundation models and demonstrate the efficacy of our proposed Generalized Logit Adjustment (GLA) method. Note that bias estimation in foundation models is challenging, as most pre-training data cannot be explicitly accessed like in traditional long-tailed classification tasks. To this end, GLA has an optimization-based bias estimation approach for debiasing foundation models. As our work resolves a fundamental flaw in the pre-training, the proposed GLA demonstrates significant improvements across a diverse range of tasks: it achieves 1.5 pp accuracy gains on ImageNet, a large average improvement (1.4-4.6 pp) on 11 few-shot datasets, and 2.4 pp gains on long-tailed classification. Codes are in \url{https://github.com/BeierZhu/GLA}.
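For context, a sketch of the basic logit-adjustment step that GLA generalizes: subtract the log of an estimated class prior from the model's logits. The abstract notes that GLA estimates this prior by optimization because the pre-training data is inaccessible; here the prior is simply assumed to be given, and `tau` is an illustrative temperature.

import numpy as np

def logit_adjust(logits, class_prior, tau=1.0):
    # logits: (N, C); class_prior: (C,) estimated label distribution summing to 1.
    # Down-weights classes that the (biased) model favors merely because they are frequent.
    return logits - tau * np.log(np.asarray(class_prior) + 1e-12)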
+
+ comment: V2 proposed a more effective method for label distribution + estimation. V1 fixed a typo in abstract; Accepted by NeurIPS2023 +
+
+
+
+
+ + ♻ ☆ BridgeTower: Building Bridges Between Encoders in Vision-Language + Representation Learning AAAI 2023 + + +
+ Vision-Language (VL) models with the Two-Tower architecture have dominated +visual-language representation learning in recent years. Current VL models +either use lightweight uni-modal encoders and learn to extract, align and fuse +both modalities simultaneously in a deep cross-modal encoder, or feed the +last-layer uni-modal representations from the deep pre-trained uni-modal +encoders into the top cross-modal encoder. Both approaches potentially restrict +vision-language representation learning and limit model performance. In this +paper, we propose BridgeTower, which introduces multiple bridge layers that +build a connection between the top layers of uni-modal encoders and each layer +of the cross-modal encoder. This enables effective bottom-up cross-modal +alignment and fusion between visual and textual representations of different +semantic levels of pre-trained uni-modal encoders in the cross-modal encoder. +Pre-trained with only 4M images, BridgeTower achieves state-of-the-art +performance on various downstream vision-language tasks. In particular, on the +VQAv2 test-std set, BridgeTower achieves an accuracy of 78.73%, outperforming +the previous state-of-the-art model METER by 1.09% with the same pre-training +data and almost negligible additional parameters and computational costs. +Notably, when further scaling the model, BridgeTower achieves an accuracy of +81.15%, surpassing models that are pre-trained on orders-of-magnitude larger +datasets. Code and checkpoints are available at +https://github.com/microsoft/BridgeTower. + +
+
+ comment: Accepted by AAAI 2023, Oral +
+
+
+
+
+ + ♻ ☆ Scalable and Robust Transformer Decoders for Interpretable Image + Classification with Foundation Models + + +
+ Interpretable computer vision models can produce transparent predictions, +where the features of an image are compared with prototypes from a training +dataset and the similarity between them forms a basis for classification. +Nevertheless these methods are computationally expensive to train, introduce +additional complexity and may require domain knowledge to adapt +hyper-parameters to a new dataset. Inspired by developments in object +detection, segmentation and large-scale self-supervised foundation vision +models, we introduce Component Features (ComFe), a novel explainable-by-design +image classification approach using a transformer-decoder head and hierarchical +mixture-modelling. With only global image labels and no segmentation or part +annotations, ComFe can identify consistent image components, such as the head, +body, wings and tail of a bird, and the image background, and determine which +of these features are informative in making a prediction. We demonstrate that +ComFe obtains higher accuracy compared to previous interpretable models across +a range of fine-grained vision benchmarks, without the need to individually +tune hyper-parameters for each dataset. We also show that ComFe outperforms a +non-interpretable linear head across a range of datasets, including ImageNet, +and improves performance on generalisation and robustness benchmarks. + +
+
+
+
+
+ + ♻ ☆ Discovering and Mitigating Visual Biases through Keyword Explanation CVPR 2024 + + +
+ Addressing biases in computer vision models is crucial for real-world AI +deployments. However, mitigating visual biases is challenging due to their +unexplainable nature, often identified indirectly through visualization or +sample statistics, which necessitates additional human supervision for +interpretation. To tackle this issue, we propose the Bias-to-Text (B2T) +framework, which interprets visual biases as keywords. Specifically, we extract +common keywords from the captions of mispredicted images to identify potential +biases in the model. We then validate these keywords by measuring their +similarity to the mispredicted images using a vision-language scoring model. +The keyword explanation form of visual bias offers several advantages, such as +a clear group naming for bias discovery and a natural extension for debiasing +using these group names. Our experiments demonstrate that B2T can identify +known biases, such as gender bias in CelebA, background bias in Waterbirds, and +distribution shifts in ImageNet-R/C. Additionally, B2T uncovers novel biases in +larger datasets, such as Dollar Street and ImageNet. For example, we discovered +a contextual bias between "bee" and "flower" in ImageNet. We also highlight +various applications of B2T keywords, including debiased training, CLIP +prompting, and model comparison. + +
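A rough sketch of the keyword pipeline the abstract describes: collect captions of mispredicted images, count frequent words, and keep those that a vision-language scoring model associates more with the mispredicted set than with the correctly predicted set. `clip_score(keyword, images)` is a hypothetical helper returning an average image-text similarity; the stopword list and thresholds are illustrative.

from collections import Counter
import re

STOPWORDS = {"a", "an", "the", "of", "in", "on", "with", "and", "is", "are"}

def bias_keywords(wrong_captions, clip_score, wrong_images, right_images,
                  top_n=20, min_gap=0.0):
    # Count candidate keywords from captions of mispredicted images.
    words = Counter()
    for cap in wrong_captions:
        words.update(w for w in re.findall(r"[a-z]+", cap.lower())
                     if w not in STOPWORDS)
    keywords = []
    for word, _ in words.most_common(top_n):
        # Validate: keyword should score higher against the error set.
        gap = clip_score(word, wrong_images) - clip_score(word, right_images)
        if gap > min_gap:
            keywords.append((word, gap))
    return sorted(keywords, key=lambda t: -t[1])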
+
+ comment: CVPR 2024. First two authors contributed equally +
+
+
+
+
+ + ♻ ☆ Few-shot Learner Parameterization by Diffusion Time-steps CVPR 2024 + + +
+ Even when using large multi-modal foundation models, few-shot learning is +still challenging -- if there is no proper inductive bias, it is nearly +impossible to keep the nuanced class attributes while removing the visually +prominent attributes that spuriously correlate with class labels. To this end, +we find an inductive bias that the time-steps of a Diffusion Model (DM) can +isolate the nuanced class attributes, i.e., as the forward diffusion adds noise +to an image at each time-step, nuanced attributes are usually lost at an +earlier time-step than the spurious attributes that are visually prominent. +Building on this, we propose Time-step Few-shot (TiF) learner. We train +class-specific low-rank adapters for a text-conditioned DM to make up for the +lost attributes, such that images can be accurately reconstructed from their +noisy ones given a prompt. Hence, at a small time-step, the adapter and prompt +are essentially a parameterization of only the nuanced class attributes. For a +test image, we can use the parameterization to only extract the nuanced class +attributes for classification. TiF learner significantly outperforms OpenCLIP +and its adapters on a variety of fine-grained and customized few-shot learning +tasks. Codes are in https://github.com/yue-zhongqi/tif. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Unified Sequence-to-Sequence Learning for Single- and Multi-Modal Visual + Object Tracking CVPR2023 + + +
+ In this paper, we introduce a new sequence-to-sequence learning framework for +RGB-based and multi-modal object tracking. First, we present SeqTrack for +RGB-based tracking. It casts visual tracking as a sequence generation task, +forecasting object bounding boxes in an autoregressive manner. This differs +from previous trackers, which depend on the design of intricate head networks, +such as classification and regression heads. SeqTrack employs a basic +encoder-decoder transformer architecture. The encoder utilizes a bidirectional +transformer for feature extraction, while the decoder generates bounding box +sequences autoregressively using a causal transformer. The loss function is a +plain cross-entropy. Second, we introduce SeqTrackv2, a unified +sequence-to-sequence framework for multi-modal tracking tasks. Expanding upon +SeqTrack, SeqTrackv2 integrates a unified interface for auxiliary modalities +and a set of task-prompt tokens to specify the task. This enables it to manage +multi-modal tracking tasks using a unified model and parameter set. This +sequence learning paradigm not only simplifies the tracking framework, but also +showcases superior performance across 14 challenging benchmarks spanning five +single- and multi-modal tracking tasks. The code and models are available at +https://github.com/chenxin-dlut/SeqTrackv2. + +
+
+ comment: This is a new expanded version of our previous CVPR2023 paper + "SeqTrack: Sequence to Sequence Learning for Visual Object Tracking." + SeqTrackv2 extends SeqTrack to four multi-modal tracking tasks with a unified + model and parameter set +
+
+
+
+
+ + ♻ ☆ Generative 3D Part Assembly via Part-Whole-Hierarchy Message Passing + + +
+ Generative 3D part assembly involves understanding part relationships and predicting their 6-DoF poses for assembling a realistic 3D shape. Prior work often focuses on the geometry of individual parts, neglecting the part-whole hierarchies of objects. Leveraging two key observations: 1) super-part poses provide strong hints about part poses, and 2) predicting super-part poses is easier due to the smaller number of super-parts, we propose a part-whole-hierarchy message passing network for efficient 3D part assembly. We first introduce super-parts by grouping geometrically similar parts without any semantic labels. Then we employ a part-whole hierarchical encoder, wherein a super-part encoder predicts latent super-part poses based on input parts. Subsequently, we transform the point cloud using the latent poses, feeding it to the part encoder for aggregating super-part information and reasoning about part relationships to predict all part poses. In training, only ground-truth part poses are required. During inference, the predicted latent poses of super-parts enhance interpretability. Experimental results on the PartNet dataset show that our method achieves state-of-the-art performance in part and connectivity accuracy and enables an interpretable hierarchical part assembly. Code is available at https://github.com/pkudba/3DHPA.
+
+
+
+
+ + ♻ ☆ Distilling ODE Solvers of Diffusion Models into Smaller Steps + + +
+ Diffusion models have recently gained prominence as a novel category of generative models. Despite their success, these models face a notable drawback in terms of slow sampling speeds, requiring a high number of function evaluations (NFE) in the order of hundreds or thousands. In response, both learning-free and learning-based sampling strategies have been explored to expedite the sampling process. Learning-free sampling employs various ordinary differential equation (ODE) solvers based on the formulation of diffusion ODEs. However, it encounters challenges in faithfully tracking the true sampling trajectory, particularly for small NFE. Conversely, learning-based sampling methods, such as knowledge distillation, demand extensive additional training, limiting their practical applicability. To overcome these limitations, we introduce Distilled-ODE solvers (D-ODE solvers), a straightforward distillation approach grounded in ODE solver formulations. Our method seamlessly integrates the strengths of both learning-free and learning-based sampling. D-ODE solvers are constructed by introducing a single parameter adjustment to existing ODE solvers. Furthermore, we optimize D-ODE solvers with smaller steps using knowledge distillation from ODE solvers with larger steps across a batch of samples. Comprehensive experiments demonstrate the superior performance of D-ODE solvers compared to existing ODE solvers, including DDIM, PNDM, DPM-Solver, DEIS, and EDM, particularly in scenarios with fewer NFE. Notably, our method incurs negligible computational overhead compared to previous distillation techniques, facilitating straightforward and rapid integration with existing samplers. Qualitative analysis reveals that D-ODE solvers not only enhance image quality but also faithfully follow the target ODE trajectory.
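A simplified sketch of the "single parameter adjustment plus distillation" idea: for each step, fit one scalar by least squares so that the base solver's update plus a scaled correction matches the output of a teacher solver run with more steps. The choice of correction term (e.g., the difference between consecutive denoiser outputs) is an assumption, not the paper's exact formulation.

import numpy as np

def fit_step_scale(base_update, correction, teacher_update):
    # All arrays: (batch, ...) samples at one sampling step.
    # Returns the scalar lam minimizing ||base_update + lam * correction - teacher_update||^2.
    c = correction.reshape(len(correction), -1)
    r = (teacher_update - base_update).reshape(len(base_update), -1)
    lam = float((c * r).sum() / ((c * c).sum() + 1e-12))
    return lam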
+
+
+
+
+ + ♻ ☆ DiffPrompter: Differentiable Implicit Visual Prompts for + Semantic-Segmentation in Adverse Conditions + + +
+ Semantic segmentation in adverse weather scenarios is a critical task for +autonomous driving systems. While foundation models have shown promise, the +need for specialized adaptors becomes evident for handling more challenging +scenarios. We introduce DiffPrompter, a novel differentiable visual and latent +prompting mechanism aimed at expanding the learning capabilities of existing +adaptors in foundation models. Our proposed $\nabla$HFC image processing block +excels particularly in adverse weather conditions, where conventional methods +often fall short. Furthermore, we investigate the advantages of jointly +training visual and latent prompts, demonstrating that this combined approach +significantly enhances performance in out-of-distribution scenarios. Our +differentiable visual prompts leverage parallel and series architectures to +generate prompts, effectively improving object segmentation tasks in adverse +conditions. Through a comprehensive series of experiments and evaluations, we +provide empirical evidence to support the efficacy of our approach. Project +page at https://diffprompter.github.io. + +
+
+
+
+
+ + ♻ ☆ Language Models are Free Boosters for Biomedical Imaging Tasks + + +
+ In this study, we uncover the unexpected efficacy of residual-based large +language models (LLMs) as part of encoders for biomedical imaging tasks, a +domain traditionally devoid of language or textual data. The approach diverges +from established methodologies by utilizing a frozen transformer block, +extracted from pre-trained LLMs, as an innovative encoder layer for the direct +processing of visual tokens. This strategy represents a significant departure +from the standard multi-modal vision-language frameworks, which typically hinge +on language-driven prompts and inputs. We found that these LLMs could boost +performance across a spectrum of biomedical imaging applications, including +both 2D and 3D visual classification tasks, serving as plug-and-play boosters. +More interestingly, as a byproduct, we found that the proposed framework +achieved superior performance, setting new state-of-the-art results on +extensive, standardized datasets in MedMNIST-2D and 3D. Through this work, we +aim to open new avenues for employing LLMs in biomedical imaging and enriching +the understanding of their potential in this specialized domain. + +
+
+
+
+
+ + ♻ ☆ Contrastive Pre-Training with Multi-View Fusion for No-Reference Point + Cloud Quality Assessment + + +
+ No-reference point cloud quality assessment (NR-PCQA) aims to automatically +evaluate the perceptual quality of distorted point clouds without available +reference, which have achieved tremendous improvements due to the utilization +of deep neural networks. However, learning-based NR-PCQA methods suffer from +the scarcity of labeled data and usually perform suboptimally in terms of +generalization. To solve the problem, we propose a novel contrastive +pre-training framework tailored for PCQA (CoPA), which enables the pre-trained +model to learn quality-aware representations from unlabeled data. To obtain +anchors in the representation space, we project point clouds with different +distortions into images and randomly mix their local patches to form mixed +images with multiple distortions. Utilizing the generated anchors, we constrain +the pre-training process via a quality-aware contrastive loss following the +philosophy that perceptual quality is closely related to both content and +distortion. Furthermore, in the model fine-tuning stage, we propose a +semantic-guided multi-view fusion module to effectively integrate the features +of projected images from multiple perspectives. Extensive experiments show that +our method outperforms the state-of-the-art PCQA methods on popular benchmarks. +Further investigations demonstrate that CoPA can also benefit existing +learning-based PCQA models. + +
+
+
+
+
+ + ♻ ☆ LLMs in Political Science: Heralding a New Era of Visual Analysis + + +
+ Interest is increasing among political scientists in leveraging the extensive +information available in images. However, the challenge of interpreting these +images lies in the need for specialized knowledge in computer vision and access +to specialized hardware. As a result, image analysis has been limited to a +relatively small group within the political science community. This landscape +could potentially change thanks to the rise of large language models (LLMs). +This paper aims to raise awareness of the feasibility of using Gemini for image +content analysis. A retrospective analysis was conducted on a corpus of 688 +images. Content reports were elicited from Gemini for each image and then +manually evaluated by the authors. We find that Gemini is highly accurate in +performing object detection, which is arguably the most common and fundamental +task in image analysis for political scientists. Equally important, we show +that it is easy to implement as the entire command consists of a single prompt +in natural language; it is fast to run and should meet the time budget of most +researchers; and it is free to use and does not require any specialized +hardware. In addition, we illustrate how political scientists can leverage +Gemini for other image understanding tasks, including face identification, +sentiment analysis, and caption generation. Our findings suggest that Gemini +and other similar LLMs have the potential to drastically stimulate and +accelerate image research in political science and social sciences more +broadly. + +
+
+ comment: 7 pages, 3 tables +
+
+
+
+
+ + ♻ ☆ Trustworthy Self-Attention: Enabling the Network to Focus Only on the + Most Relevant References + + +
+ The prediction of optical flow for occluded points is still a difficult problem that has not yet been solved. Recent methods use self-attention to find relevant non-occluded points as references for estimating the optical flow of occluded points based on the assumption of self-similarity. However, they rely on visual features of a single image and weak constraints, which are not sufficient to prevent the trained network from focusing on erroneous and weakly relevant reference points. We make full use of online occlusion recognition information to construct occlusion-extended visual features and two strong constraints, allowing the network to learn to focus only on the most relevant references without requiring occlusion ground truth to participate in the training of the network. Our method adds very few network parameters to the original framework, making it very lightweight. Extensive experiments show that our model has the greatest cross-dataset generalization. Our method achieves much greater error reduction, 18.6%, 16.2%, and 20.1% for all points, non-occluded points, and occluded points respectively, than the state-of-the-art GMA-based method, MATCHFlow(GMA), on the Sintel Albedo pass. Furthermore, our model achieves state-of-the-art performance on the Sintel benchmarks, ranking #1 among all published methods on the Sintel clean pass. The code will be open-source.
+
+ comment: Correct Figure 1 +
+
+
+
+
+ + ♻ ☆ Dyadic Interaction Modeling for Social Behavior Generation + + +
+ Human-human communication is like a delicate dance where listeners and speakers concurrently interact to maintain conversational dynamics. Hence, an effective model for generating listener nonverbal behaviors requires understanding the dyadic context and interaction. In this paper, we present an effective framework for creating 3D facial motions in dyadic interactions. Existing work considers the listener as a reactive agent with reflexive behaviors to the speaker's voice and facial motions. The heart of our framework is Dyadic Interaction Modeling (DIM), a pre-training approach that jointly models speakers' and listeners' motions through masking and contrastive learning to learn representations that capture the dyadic context. To enable the generation of non-deterministic behaviors, we encode both listener and speaker motions into discrete latent representations through VQ-VAE. The pre-trained model is further fine-tuned for motion generation. Extensive experiments demonstrate the superiority of our framework in generating listener motions, establishing a new state-of-the-art according to quantitative measures capturing the diversity and realism of generated motions. Qualitative results demonstrate the superior capabilities of the proposed approach in generating diverse and realistic expressions, eye blinks and head gestures.
+
+
+
+
+ + ♻ ☆ Hybrid Video Diffusion Models with 2D Triplane and 3D Wavelet + Representation + + +
+ Generating high-quality videos that synthesize desired realistic content is a challenging task due to the intricate high-dimensionality and complexity of videos. Several recent diffusion-based methods have shown comparable performance by compressing videos into a lower-dimensional latent space, using a traditional video autoencoder architecture. However, such methods, which employ standard frame-wise 2D and 3D convolutions, fail to fully exploit the spatio-temporal nature of videos. To address this issue, we propose a novel hybrid video diffusion model, called HVDM, which can capture spatio-temporal dependencies more effectively. The HVDM is trained by a hybrid video autoencoder which extracts a disentangled representation of the video including: (i) global context information captured by a 2D projected latent, (ii) local volume information captured by 3D convolutions with wavelet decomposition, and (iii) frequency information for improving the video reconstruction. Based on this disentangled representation, our hybrid autoencoder provides a more comprehensive video latent, enriching the generated videos with fine structures and details. Experiments on video generation benchmarks (UCF101, SkyTimelapse, and TaiChi) demonstrate that the proposed approach achieves state-of-the-art video generation quality, supporting a wide range of video applications (e.g., long video generation, image-to-video, and video dynamics control).
+
+ comment: 17 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Deep Learning-Driven Approach for Handwritten Chinese Character + Classification + + +
+ Handwritten character recognition (HCR) is a challenging problem for machine learning researchers. Unlike printed text data, handwritten character datasets have more variation due to human-introduced bias. With numerous unique character classes present, some data, such as logographic scripts or Sino-Korean character sequences, bring new complications to the HCR problem. The classification task on such datasets requires the model to learn high-complexity details of images that share similar features. With recent advances in computational resource availability and further development of computer vision theory, some research teams have effectively addressed the arising challenges. Although known for achieving high accuracy while keeping the number of parameters small, many common approaches are still not generalizable and rely on dataset-specific solutions to achieve better results. Because of their complex structure, such solutions often fail to gain wider adoption. This paper proposes a highly scalable approach for detailed character image classification by introducing the model architecture, data preprocessing steps, and testing design instructions. We also perform experiments to compare the performance of our method with that of existing ones to show the improvements achieved.
+
+ comment: 30 pages, 9 figures, 2 tables, preprint v2 +
+
+
+
+
+ + ♻ ☆ X-Portrait: Expressive Portrait Animation with Hierarchical Motion + Attention + + +
+ We propose X-Portrait, an innovative conditional diffusion model tailored for generating expressive and temporally coherent portrait animation. Specifically, given a single portrait as the appearance reference, we aim to animate it with motion derived from a driving video, capturing both highly dynamic and subtle facial expressions along with wide-range head movements. At its core, we leverage the generative prior of a pre-trained diffusion model as the rendering backbone, while achieving fine-grained head pose and expression control with novel controlling signals within the framework of ControlNet. In contrast to conventional coarse explicit controls such as facial landmarks, our motion control module is learned to interpret the dynamics directly from the original driving RGB inputs. The motion accuracy is further enhanced with a patch-based local control module that effectively enhances the motion attention to small-scale nuances like eyeball positions. Notably, to mitigate identity leakage from the driving signals, we train our motion control modules with scaling-augmented cross-identity images, ensuring maximized disentanglement from the appearance reference modules. Experimental results demonstrate the universal effectiveness of X-Portrait across a diverse range of facial portraits and expressive driving sequences, and showcase its proficiency in generating captivating portrait animations with consistently maintained identity characteristics.
+
+
+
+
+ + ♻ ☆ Targeted collapse regularized autoencoder for anomaly detection: black + hole at the center + + +
+ Autoencoders have been extensively used in the development of recent anomaly +detection techniques. The premise of their application is based on the notion +that after training the autoencoder on normal training data, anomalous inputs +will exhibit a significant reconstruction error. Consequently, this enables a +clear differentiation between normal and anomalous samples. In practice, +however, it is observed that autoencoders can generalize beyond the normal +class and achieve a small reconstruction error on some of the anomalous +samples. To improve the performance, various techniques propose additional +components and more sophisticated training procedures. In this work, we propose +a remarkably straightforward alternative: instead of adding neural network +components, involved computations, and cumbersome training, we complement the +reconstruction loss with a computationally light term that regulates the norm +of representations in the latent space. The simplicity of our approach +minimizes the requirement for hyperparameter tuning and customization for new +applications which, paired with its permissive data modality constraint, +enhances the potential for successful adoption across a broad range of +applications. We test the method on various visual and tabular benchmarks and +demonstrate that the technique matches and frequently outperforms more complex +alternatives. We further demonstrate that implementing this idea in the context +of state-of-the-art methods can further improve their performance. We also +provide a theoretical analysis and numerical simulations that help demonstrate +the underlying process that unfolds during training and how it helps with +anomaly detection. This mitigates the black-box nature of autoencoder-based +anomaly detection algorithms and offers an avenue for further investigation of +advantages, fail cases, and potential new directions. + +
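A minimal sketch of the regularized objective described in the abstract: the usual reconstruction loss complemented by a light penalty on the norm of the latent representations. The encoder/decoder that produce `x_hat` and `z` are assumed to exist elsewhere, and the weight `lam` is illustrative.

import numpy as np

def targeted_collapse_loss(x, x_hat, z, lam=0.1):
    # x, x_hat: (B, D) inputs and reconstructions; z: (B, K) latent codes.
    recon = np.mean((x - x_hat) ** 2)
    latent_norm = np.mean(np.linalg.norm(z, axis=1))   # pulls normal samples toward a small-norm region
    return recon + lam * latent_norm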
+
+ comment: 18 pages, 4 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ PIE-NeRF: Physics-based Interactive Elastodynamics with NeRF + + +
+ We show that physics-based simulations can be seamlessly integrated with NeRF +to generate high-quality elastodynamics of real-world objects. Unlike existing +methods, we discretize nonlinear hyperelasticity in a meshless way, obviating +the necessity for intermediate auxiliary shape proxies like a tetrahedral mesh +or voxel grid. A quadratic generalized moving least square (Q-GMLS) is employed +to capture nonlinear dynamics and large deformation on the implicit model. Such +meshless integration enables versatile simulations of complex and codimensional +shapes. We adaptively place the least-square kernels according to the NeRF +density field to significantly reduce the complexity of the nonlinear +simulation. As a result, physically realistic animations can be conveniently +synthesized using our method for a wide range of hyperelastic materials at an +interactive rate. For more information, please visit our project page at +https://fytalon.github.io/pienerf/. + +
+
+
+
+
+ + ♻ ☆ Towards Fairness-Aware Adversarial Learning CVPR 2024 + + +
+ Although adversarial training (AT) has proven effective in enhancing the +model's robustness, the recently revealed issue of fairness in robustness has +not been well addressed, i.e. the robust accuracy varies significantly among +different categories. In this paper, instead of uniformly evaluating the +model's average class performance, we delve into the issue of robust fairness, +by considering the worst-case distribution across various classes. We propose a +novel learning paradigm, named Fairness-Aware Adversarial Learning (FAAL). As a +generalization of conventional AT, we re-define the problem of adversarial +training as a min-max-max framework, to ensure both robustness and fairness of +the trained model. Specifically, by taking advantage of distributional robust +optimization, our method aims to find the worst distribution among different +categories, and the solution is guaranteed to obtain the upper bound +performance with high probability. In particular, FAAL can fine-tune an unfair +robust model to be fair within only two epochs, without compromising the +overall clean and robust accuracies. Extensive experiments on various image +datasets validate the superior performance and efficiency of the proposed FAAL +compared to other state-of-the-art methods. + +
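+ As a rough, hypothetical illustration of seeking the worst distribution among
+classes (FAAL's actual distributionally robust inner problem is more involved;
+the temperature-softmax weighting below is an assumption, not the paper's
+solver):
+
+    import torch
+    import torch.nn.functional as F
+
+    def worst_case_class_loss(adv_logits, labels, num_classes, tau=1.0):
+        # per-sample losses, assumed to be computed on adversarial examples
+        per_sample = F.cross_entropy(adv_logits, labels, reduction="none")
+        class_loss = torch.zeros(num_classes, device=adv_logits.device)
+        for c in range(num_classes):
+            mask = labels == c
+            if mask.any():
+                class_loss[c] = per_sample[mask].mean()
+        # put more weight on the worst-performing classes
+        weights = torch.softmax(class_loss.detach() / tau, dim=0)
+        return (weights * class_loss).sum()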
+
+ comment: This work will appear in the CVPR 2024 conference proceedings +
+
+
+
+
+ + ♻ ☆ Multi-modal Misinformation Detection: Approaches, Challenges and + Opportunities + + +
+ As social media platforms are evolving from text-based forums into +multi-modal environments, the nature of misinformation in social media is also +transforming accordingly. Taking advantage of the fact that visual modalities +such as images and videos are more favorable and attractive to the users and +textual contents are sometimes skimmed carelessly, misinformation spreaders +have recently targeted contextual connections between the modalities e.g., text +and image. Hence many researchers have developed automatic techniques for +detecting possible cross-modal discordance in web-based content. We analyze, +categorize and identify existing approaches in addition to challenges and +shortcomings they face in order to unearth new research opportunities in the +field of multi-modal misinformation detection. + +
+
+
+
+
+ + ♻ ☆ HybridNeRF: Efficient Neural Rendering via Adaptive Volumetric Surfaces CVPR 2024 + + +
+ Neural radiance fields provide state-of-the-art view synthesis quality but +tend to be slow to render. One reason is that they make use of volume +rendering, thus requiring many samples (and model queries) per ray at render +time. Although this representation is flexible and easy to optimize, most +real-world objects can be modeled more efficiently with surfaces instead of +volumes, requiring far fewer samples per ray. This observation has spurred +considerable progress in surface representations such as signed distance +functions, but these may struggle to model semi-opaque and thin structures. We +propose a method, HybridNeRF, that leverages the strengths of both +representations by rendering most objects as surfaces while modeling the +(typically) small fraction of challenging regions volumetrically. We evaluate +HybridNeRF against the challenging Eyeful Tower dataset along with other +commonly used view synthesis datasets. When comparing to state-of-the-art +baselines, including recent rasterization-based approaches, we improve error +rates by 15-30% while achieving real-time framerates (at least 36 FPS) for +virtual-reality resolutions (2Kx2K). + +
+
+ comment: CVPR 2024 Project page: https://haithemturki.com/hybrid-nerf/ +
+
+
+
+
+ + ♻ ☆ L2B: Learning to Bootstrap Robust Models for Combating Label Noise CVPR 2024 + + +
+ Deep neural networks have shown great success in representation learning. +However, when learning with noisy labels (LNL), they can easily overfit and +fail to generalize to new data. This paper introduces a simple and effective +method, named Learning to Bootstrap (L2B), which enables models to bootstrap +themselves using their own predictions without being adversely affected by +erroneous pseudo-labels. It achieves this by dynamically adjusting the +importance weight between real observed and generated labels, as well as +between different samples through meta-learning. Unlike existing instance +reweighting methods, the key to our method lies in a new, versatile objective +that enables implicit relabeling concurrently, leading to significant +improvements without incurring additional costs. + L2B offers several benefits over the baseline methods. It yields more robust +models that are less susceptible to the impact of noisy labels by guiding the +bootstrapping procedure more effectively. It better exploits the valuable +information contained in corrupted instances by adapting the weights of both +instances and labels. Furthermore, L2B is compatible with existing LNL methods +and delivers competitive results spanning natural and medical imaging tasks +including classification and segmentation under both synthetic and real-world +noise. Extensive experiments demonstrate that our method effectively mitigates +the challenges of noisy labels, often necessitating few to no validation +samples, and is well generalized to other tasks such as image segmentation. +This not only positions it as a robust complement to existing LNL techniques +but also underscores its practical applicability. The code and models are +available at https://github.com/yuyinzhou/l2b. + +
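+ The bootstrapping objective can be pictured with the toy sketch below, where
+`alpha` and `beta` stand in for the per-sample importance weights that L2B
+obtains via meta-learning (names and exact form are illustrative; see the
+linked repository for the authors' implementation):
+
+    import torch
+    import torch.nn.functional as F
+
+    def bootstrap_loss(logits, observed_labels, alpha, beta):
+        pseudo_labels = logits.argmax(dim=1).detach()   # the model's own predictions
+        loss_obs = F.cross_entropy(logits, observed_labels, reduction="none")
+        loss_pseudo = F.cross_entropy(logits, pseudo_labels, reduction="none")
+        # alpha/beta: per-sample weights balancing observed vs. generated labels
+        return (alpha * loss_obs + beta * loss_pseudo).mean()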
+
+ comment: CVPR 2024; code is available at https://github.com/yuyinzhou/l2b +
+
+
+
+
+ + ♻ ☆ Visual Acuity Prediction on Real-Life Patient Data Using a Machine + Learning Based Multistage System + + +
+ In ophthalmology, intravitreal operative medication therapy (IVOM) is a
+widespread treatment for diseases related to age-related macular degeneration
+(AMD), diabetic macular edema (DME), and retinal vein occlusion (RVO). However,
+in real-world settings, patients often suffer from loss of vision on time
+scales of years despite therapy, whereas the prediction of the visual acuity
+(VA) and the earliest possible detection of deterioration under real-life
+conditions are challenging due to heterogeneous and incomplete data. In this
+contribution, we present a workflow for the development of a
+research-compatible data corpus fusing different IT systems of the department
+of ophthalmology of a German maximum care hospital. The extensive data corpus
+allows predictive statements of the expected progression of a patient and his
+or her VA in each of the three diseases. For AMD, we found a significant
+deterioration of visual acuity over time. Within our proposed multistage
+system, we subsequently classify the VA progression into the three groups of
+therapy "winners", "stabilizers", and "losers" (WSL classification scheme). Our
+OCT biomarker classification using an ensemble of deep neural networks results
+in a classification accuracy (F1-score) of over 98%, enabling us to complete
+incomplete OCT documentations while allowing us to exploit them for a more
+precise VA modelling process. Our VA prediction requires at least four VA
+examinations and optionally OCT biomarkers from the same time period to
+predict the VA progression within a forecasted time frame, whereas our
+prediction is currently restricted to IVOM / no therapy. We achieve a final
+prediction accuracy of 69% in macro-average F1-score, which is in the same
+range as the ophthalmologists, who reach 57.8% and 50 +- 10.7% F1-score.
+
+
+ comment: Preprint for journal Scientific Reports (Springer) +
+
+
+
+
+ + ♻ ☆ LUWA Dataset: Learning Lithic Use-Wear Analysis on Microscopic Images CVPR + + +
+ Lithic Use-Wear Analysis (LUWA) using microscopic images is an underexplored
+vision-for-science research area. It seeks to distinguish the worked material,
+which is critical for understanding archaeological artifacts, material
+interactions, tool functionalities, and dental records. However, this
+challenging task goes beyond the well-studied image classification problem for
+common objects. It is affected by many confounders owing to the complex wear
+mechanism and microscopic imaging, which makes it difficult even for human
+experts to identify the worked material successfully. In this paper, we
+investigate the following three questions on this unique vision task for the
+first time: (i) How well can state-of-the-art pre-trained models (like DINOv2)
+generalize to the rarely seen domain? (ii) How can few-shot learning be
+exploited for scarce microscopic images? (iii) How do the ambiguous
+magnification and sensing modality influence the classification accuracy? To
+study these, we collaborated with archaeologists and built the first
+open-source and the largest LUWA dataset containing 23,130 microscopic images
+with different magnifications and sensing modalities. Extensive experiments
+show that existing pre-trained models notably outperform human experts but
+still leave a large gap for improvements. Most importantly, the LUWA dataset
+provides an underexplored opportunity for vision and learning communities and
+complements existing image classification problems on common objects.
+
+
+ comment: CVPR +
+
+
+
+
+ + ♻ ☆ SuPerPM: A Large Deformation-Robust Surgical Perception Framework Based + on Deep Point Matching Learned from Physical Constrained Simulation Data + + +
+ Manipulation of tissue with surgical tools often results in large
+deformations that current tracking and reconstruction algorithms have not
+effectively addressed. A major source of tracking errors during large
+deformations stems from incorrect data association between observed sensor
+measurements and the previously tracked scene. To mitigate this issue, we
+present a surgical perception framework, SuPerPM, that leverages
+learning-based non-rigid point cloud matching for data association, thus
+accommodating larger deformations. The learning models typically require
+training data with ground truth point cloud correspondences, which is
+challenging or even impractical to collect in surgical environments. Thus, for
+tuning the learning model, we gather endoscopic data of soft tissue being
+manipulated by a surgical robot and then establish correspondences between
+point clouds at different time points to serve as ground truth. This was
+achieved by employing a position-based dynamics (PBD) simulation to ensure
+that the correspondences adhered to physical constraints. The proposed
+framework is demonstrated on several challenging surgical datasets that are
+characterized by large deformations, achieving superior performance over
+state-of-the-art surgical scene tracking algorithms.
+
+
+
+
+
+ + ♻ ☆ What's in a Prior? Learned Proximal Networks for Inverse Problems + + +
+ Proximal operators are ubiquitous in inverse problems, commonly appearing as +part of algorithmic strategies to regularize problems that are otherwise +ill-posed. Modern deep learning models have been brought to bear for these +tasks too, as in the framework of plug-and-play or deep unrolling, where they +loosely resemble proximal operators. Yet, something essential is lost in +employing these purely data-driven approaches: there is no guarantee that a +general deep network represents the proximal operator of any function, nor is +there any characterization of the function for which the network might provide +some approximate proximal. This not only makes guaranteeing convergence of +iterative schemes challenging but, more fundamentally, complicates the analysis +of what has been learned by these networks about their training data. Herein we +provide a framework to develop learned proximal networks (LPN), prove that they +provide exact proximal operators for a data-driven nonconvex regularizer, and +show how a new training strategy, dubbed proximal matching, provably promotes +the recovery of the log-prior of the true data distribution. Such LPN provide +general, unsupervised, expressive proximal operators that can be used for +general inverse problems with convergence guarantees. We illustrate our results +in a series of cases of increasing complexity, demonstrating that these models +not only result in state-of-the-art performance, but provide a window into the +resulting priors learned from data. + +
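+ For reference, the proximal operator that an LPN is meant to realize is the
+textbook object (standard definition, not notation specific to this paper):
+for a regularizer $f$,
+$\operatorname{prox}_{f}(v) = \arg\min_{x} \tfrac{1}{2}\,\lVert x - v \rVert_2^2 + f(x)$,
+so a network guaranteed to be a proximal operator implicitly pins down the
+regularizer $f$ it corresponds to.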
+
+
+
+
+ + ♻ ☆ COVID-19 detection from pulmonary CT scans using a novel EfficientNet + with attention mechanism + + +
+ Manual analysis and diagnosis of COVID-19 through the examination of Computed
+Tomography (CT) images of the lungs can be time-consuming and result in
+errors, especially given the high volume of patients and numerous images per
+patient. So, we address the need for automation of this task by developing a
+new deep learning model-based pipeline. Our motivation was sparked by the CVPR
+Workshop on "Domain Adaptation, Explainability and Fairness in AI for Medical
+Image Analysis", more specifically, the "COVID-19 Diagnosis Competition
+(DEF-AI-MIA COV19D)" under the same Workshop. This challenge provides an
+opportunity to assess our proposed pipeline for COVID-19 detection from CT
+scan images. The same pipeline incorporates the original EfficientNet, but
+with an added Attention Mechanism: EfficientNet-AM. Also, unlike the
+traditional/past pipelines, which relied on a pre-processing step, our
+pipeline takes the raw selected input images without any such step, except for
+an image-selection step to simply reduce the number of CT images required for
+training and/or testing. Moreover, our pipeline is computationally efficient,
+as, for example, it does not incorporate a decoder for segmenting the lungs.
+It also does not combine different backbones nor combine an RNN with a
+backbone, as other pipelines in the past did. Nevertheless, our pipeline still
+outperforms all approaches presented by other teams in last year's instance of
+the same challenge, at least based on the validation subset of the competition
+dataset.
+
+
+
+
+
+ + ♻ ☆ Multi-camera calibration with pattern rigs, including for + non-overlapping cameras: CALICO + + +
+ This paper describes CALICO, a method for multi-camera calibration suitable +for challenging contexts: stationary and mobile multi-camera systems, cameras +without overlapping fields of view, and non-synchronized cameras. Recent +approaches are roughly divided into infrastructure- and pattern-based. +Infrastructure-based approaches use the scene's features to calibrate, while +pattern-based approaches use calibration patterns. Infrastructure-based +approaches are not suitable for stationary camera systems, and pattern-based +approaches may constrain camera placement because shared fields of view or +extremely large patterns are required. + CALICO is a pattern-based approach, where the multi-calibration problem is +formulated using rigidity constraints between patterns and cameras. We use a +{\it pattern rig}: several patterns rigidly attached to each other or some +structure. We express the calibration problem as that of algebraic and +reprojection error minimization problems. Simulated and real experiments +demonstrate the method in a variety of settings. CALICO compared favorably to +Kalibr. Mean reconstruction accuracy error was $\le 0.71$ mm for real camera +rigs, and $\le 1.11$ for simulated camera rigs. Code and data releases are +available at \cite{tabb_amy_2019_3520866} and +\url{https://github.com/amy-tabb/calico}. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ From Correspondences to Pose: Non-minimal Certifiably Optimal Relative + Pose without Disambiguation CVPR 2024 + + +
+ Estimating the relative camera pose from $n \geq 5$ correspondences between +two calibrated views is a fundamental task in computer vision. This process +typically involves two stages: 1) estimating the essential matrix between the +views, and 2) disambiguating among the four candidate relative poses that +satisfy the epipolar geometry. In this paper, we demonstrate a novel approach +that, for the first time, bypasses the second stage. Specifically, we show that +it is possible to directly estimate the correct relative camera pose from +correspondences without needing a post-processing step to enforce the +cheirality constraint on the correspondences. Building on recent advances in +certifiable non-minimal optimization, we frame the relative pose estimation as +a Quadratically Constrained Quadratic Program (QCQP). By applying the +appropriate constraints, we ensure the estimation of a camera pose that +corresponds to a valid 3D geometry and that is globally optimal when certified. +We validate our method through exhaustive synthetic and real-world experiments, +confirming the efficacy, efficiency and accuracy of the proposed approach. Code +is available at https://github.com/javrtg/C2P. + +
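+ For context, stage 1) rests on the calibrated epipolar constraint
+$x'^{\top} E\, x = 0$ with $E = [t]_{\times} R$, where $x$ and $x'$ are
+normalized coordinates of a correspondence, $R$ is the relative rotation and
+$t$ the translation direction; the four candidate poses of stage 2) arise from
+the sign and decomposition ambiguities of $E$ (standard relations, not
+notation taken from the paper).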
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Reasoning over the Behaviour of Objects in Video-Clips for Adverb-Type + Recognition + + +
+ In this work, following the intuition that adverbs describing scene-sequences
+are best identified by reasoning over high-level concepts of object-behavior,
+we propose the design of a new framework that reasons over object-behaviours
+extracted from raw-video-clips to recognize the clip's corresponding
+adverb-types. Importantly, while previous works for general scene
+adverb-recognition assume knowledge of the clip's underlying action-types, our
+method is directly applicable in the more general problem setting where the
+action-type of a video-clip is unknown. Specifically, we propose a novel
+pipeline that extracts human-interpretable object-behaviour-facts from raw
+video clips and propose novel symbolic and transformer-based reasoning methods
+that operate over these extracted facts to identify adverb-types. Experiment
+results demonstrate that our proposed methods perform favourably against the
+previous state-of-the-art. Additionally, to support efforts in symbolic
+video-processing, we release two new datasets of object-behaviour-facts
+extracted from raw video clips - the MSR-VTT-ASP and ActivityNet-ASP datasets.
+
+
+
+
+
+ + ♻ ☆ SHViT: Single-Head Vision Transformer with Memory Efficient Macro Design CVPR 2024 + + +
+ Recently, efficient Vision Transformers have shown great performance with low
+latency on resource-constrained devices. Conventionally, they use 4x4 patch
+embeddings and a 4-stage structure at the macro level, while utilizing
+sophisticated attention with multi-head configuration at the micro level. This
+paper aims to address computational redundancy at all design levels in a
+memory-efficient manner. We discover that using a larger-stride patchify stem
+not only reduces memory access costs but also achieves competitive performance
+by leveraging token representations with reduced spatial redundancy from the
+early stages. Furthermore, our preliminary analyses suggest that attention
+layers in the early stages can be substituted with convolutions, and several
+attention heads in the latter stages are computationally redundant. To handle
+this, we introduce a single-head attention module that inherently prevents
+head redundancy and simultaneously boosts accuracy by combining global and
+local information in parallel. Building upon our solutions, we introduce
+SHViT, a Single-Head Vision Transformer that obtains the state-of-the-art
+speed-accuracy tradeoff. For example, on ImageNet-1k, our SHViT-S4 is 3.3x,
+8.1x, and 2.4x faster than MobileViTv2 x1.0 on GPU, CPU, and iPhone12 mobile
+device, respectively, while being 1.3% more accurate. For object detection and
+instance segmentation on MS COCO using Mask-RCNN head, our model achieves
+performance comparable to FastViT-SA12 while exhibiting 3.8x and 2.0x lower
+backbone latency on GPU and mobile device, respectively.
+
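+ A single-head attention block in isolation can be as small as the sketch
+below (a hypothetical, simplified layer for illustration only; SHViT's actual
+module pairs partial-channel attention with convolutional branches as
+described above):
+
+    import torch
+    import torch.nn as nn
+
+    class SingleHeadAttention(nn.Module):
+        def __init__(self, dim, qk_dim=16):
+            super().__init__()
+            self.qk_dim = qk_dim
+            self.scale = qk_dim ** -0.5
+            self.qkv = nn.Linear(dim, 2 * qk_dim + dim)  # one Q/K pair plus full-dim V
+            self.proj = nn.Linear(dim, dim)
+
+        def forward(self, x):                            # x: (B, N, C)
+            q, k, v = self.qkv(x).split([self.qk_dim, self.qk_dim, x.shape[-1]], dim=-1)
+            attn = (q @ k.transpose(-2, -1)) * self.scale  # (B, N, N), a single head
+            return self.proj(attn.softmax(dim=-1) @ v)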
+
+ comment: CVPR 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 227 + +
+
+
+ + ☆ Efficient Video Object Segmentation via Modulated Cross-Attention Memory + + +
+ Recently, transformer-based approaches have shown promising results for +semi-supervised video object segmentation. However, these approaches typically +struggle on long videos due to increased GPU memory demands, as they frequently +expand the memory bank every few frames. We propose a transformer-based +approach, named MAVOS, that introduces an optimized and dynamic long-term +modulated cross-attention (MCA) memory to model temporal smoothness without +requiring frequent memory expansion. The proposed MCA effectively encodes both +local and global features at various levels of granularity while efficiently +maintaining consistent speed regardless of the video length. Extensive +experiments on multiple benchmarks, LVOS, Long-Time Video, and DAVIS 2017, +demonstrate the effectiveness of our proposed contributions leading to +real-time inference and markedly reduced memory demands without any degradation +in segmentation accuracy on long videos. Compared to the best existing +transformer-based approach, our MAVOS increases the speed by 7.6x, while +significantly reducing the GPU memory by 87% with comparable segmentation +performance on short and long video datasets. Notably on the LVOS dataset, our +MAVOS achieves a J&F score of 63.3% while operating at 37 frames per second +(FPS) on a single V100 GPU. Our code and models will be publicly available at: +https://github.com/Amshaker/MAVOS. + +
+
+
+
+
+ + ☆ ConvoFusion: Multi-Modal Conversational Diffusion for Co-Speech Gesture + Synthesis CVPR 2024 + + +
+ Gestures play a key role in human communication. Recent methods for co-speech
+gesture generation, while managing to generate beat-aligned motions, struggle
+to generate gestures that are semantically aligned with the utterance.
+Compared to beat gestures that align naturally to the audio signal,
+semantically coherent gestures require modeling the complex interactions
+between the language and human motion, and can be controlled by focusing on
+certain words. Therefore, we present ConvoFusion, a diffusion-based approach
+for multi-modal gesture synthesis, which can not only generate gestures based
+on multi-modal speech inputs, but can also facilitate controllability in
+gesture synthesis. Our method proposes two guidance objectives that allow the
+users to modulate the impact of different conditioning modalities (e.g. audio
+vs text) as well as to choose certain words to be emphasized during gesturing.
+Our method is versatile in that it can be trained either for generating
+monologue gestures or conversational gestures. To further advance the research
+on multi-party interactive gestures, the DnD Group Gesture dataset is
+released, which contains 6 hours of gesture data showing 5 people interacting
+with one another. We compare our method with several recent works and
+demonstrate the effectiveness of our method on a variety of tasks. We urge the
+reader to watch our supplementary video at our website.
+
+
+ comment: CVPR 2024. Project Page: + https://vcai.mpi-inf.mpg.de/projects/ConvoFusion/ +
+
+
+
+
+ + ☆ OmniVid: A Generative Framework for Universal Video Understanding CVPR 2024 + + +
+ The core of video understanding tasks, such as recognition, captioning, and +tracking, is to automatically detect objects or actions in a video and analyze +their temporal evolution. Despite sharing a common goal, different tasks often +rely on distinct model architectures and annotation formats. In contrast, +natural language processing benefits from a unified output space, i.e., text +sequences, which simplifies the training of powerful foundational language +models, such as GPT-3, with extensive training corpora. Inspired by this, we +seek to unify the output space of video understanding tasks by using languages +as labels and additionally introducing time and box tokens. In this way, a +variety of video tasks could be formulated as video-grounded token generation. +This enables us to address various types of video tasks, including +classification (such as action recognition), captioning (covering clip +captioning, video question answering, and dense video captioning), and +localization tasks (such as visual object tracking) within a fully shared +encoder-decoder architecture, following a generative framework. Through +comprehensive experiments, we demonstrate such a simple and straightforward +idea is quite effective and can achieve state-of-the-art or competitive results +on seven video benchmarks, providing a novel perspective for more universal +video understanding. Code is available at https://github.com/wangjk666/OmniVid. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ AiOS: All-in-One-Stage Expressive Human Pose and Shape Estimation + + +
+ Expressive human pose and shape estimation (a.k.a. 3D whole-body mesh
+recovery) involves the human body, hand, and expression estimation. Most
+existing methods have tackled this task in a two-stage manner, first detecting
+the human body part with an off-the-shelf detection model and inferring the
+different human body parts individually. Despite the impressive results
+achieved, these methods suffer from 1) loss of valuable contextual information
+via cropping, 2) introducing distractions, and 3) lacking inter-association
+among different persons and body parts, inevitably causing performance
+degradation, especially for crowded scenes. To address these issues, we
+introduce a novel all-in-one-stage framework, AiOS, for multiple expressive
+human pose and shape recovery without an additional human detection step.
+Specifically, our method is built upon DETR, which treats the multi-person
+whole-body mesh recovery task as a progressive set prediction problem with
+various sequential detections. We devise the decoder tokens and extend them to
+our task. Specifically, we first employ a human token to probe a human
+location in the image and encode global features for each instance, which
+provides a coarse location for the later transformer block. Then, we introduce
+a joint-related token to probe the human joint in the image and encode a
+fine-grained local feature, which collaborates with the global feature to
+regress the whole-body mesh. This straightforward but effective model
+outperforms previous state-of-the-art methods by a 9% reduction in NMVE on
+AGORA, a 30% reduction in PVE on EHF, a 10% reduction in PVE on ARCTIC, and a
+3% reduction in PVE on EgoBody.
+
+
+ comment: Homepage: https://ttxskk.github.io/AiOS/ +
+
+
+
+
+ + ☆ SLEDGE: Synthesizing Simulation Environments for Driving Agents with + Generative Models + + +
+ SLEDGE is the first generative simulator for vehicle motion planning trained +on real-world driving logs. Its core component is a learned model that is able +to generate agent bounding boxes and lane graphs. The model's outputs serve as +an initial state for traffic simulation. The unique properties of the entities +to be generated for SLEDGE, such as their connectivity and variable count per +scene, render the naive application of most modern generative models to this +task non-trivial. Therefore, together with a systematic study of existing lane +graph representations, we introduce a novel raster-to-vector autoencoder +(RVAE). It encodes agents and the lane graph into distinct channels in a +rasterized latent map. This facilitates both lane-conditioned agent generation +and combined generation of lanes and agents with a Diffusion Transformer. Using +generated entities in SLEDGE enables greater control over the simulation, e.g. +upsampling turns or increasing traffic density. Further, SLEDGE can support +500m long routes, a capability not found in existing data-driven simulators +like nuPlan. It presents new challenges for planning algorithms, evidenced by +failure rates of over 40% for PDM, the winner of the 2023 nuPlan challenge, +when tested on hard routes and dense traffic generated by our model. Compared +to nuPlan, SLEDGE requires 500$\times$ less storage to set up (<4GB), making it +a more accessible option and helping with democratizing future research in this +field. + +
+
+
+
+
+ + ☆ Track Everything Everywhere Fast and Robustly + + +
+ We propose a novel test-time optimization approach for efficiently and +robustly tracking any pixel at any time in a video. The latest state-of-the-art +optimization-based tracking technique, OmniMotion, requires a prohibitively +long optimization time, rendering it impractical for downstream applications. +OmniMotion is sensitive to the choice of random seeds, leading to unstable +convergence. To improve efficiency and robustness, we introduce a novel +invertible deformation network, CaDeX++, which factorizes the function +representation into a local spatial-temporal feature grid and enhances the +expressivity of the coupling blocks with non-linear functions. While CaDeX++ +incorporates a stronger geometric bias within its architectural design, it also +takes advantage of the inductive bias provided by the vision foundation models. +Our system utilizes monocular depth estimation to represent scene geometry and +enhances the objective by incorporating DINOv2 long-term semantics to regulate +the optimization process. Our experiments demonstrate a substantial improvement +in training speed (more than \textbf{10 times} faster), robustness, and +accuracy in tracking over the SoTA optimization-based method OmniMotion. + +
+
+ comment: project page: https://timsong412.github.io/FastOmniTrack/ +
+
+
+
+
+ + ☆ Towards Explaining Hypercomplex Neural Networks + + +
+ Hypercomplex neural networks are gaining increasing interest in the deep
+learning community. The attention directed towards hypercomplex models
+originates from several aspects, spanning from purely theoretical and
+mathematical characteristics to the practical advantage of lightweight models
+over conventional networks, and their unique properties to capture both global
+and local relations. In particular, a branch of these architectures,
+parameterized hypercomplex neural networks (PHNNs), has also gained popularity
+due to their versatility across a multitude of application domains.
+Nonetheless, only a few attempts have been made to explain or interpret their
+intricacies. In this paper, we propose inherently interpretable PHNNs and
+quaternion-like networks, thus without the need for any post-hoc method. To
+achieve this, we define a type of cosine-similarity transform within the
+parameterized hypercomplex domain. This PHB-cos transform induces weight
+alignment with relevant input features and allows the model to be reduced to a
+single linear transform, rendering it directly interpretable. In this work, we
+start to draw insights into how this unique branch of neural models operates.
+We observe that hypercomplex networks exhibit a tendency to concentrate on the
+shape around the main object of interest, in addition to the shape of the
+object itself. We provide a thorough analysis, studying single neurons of
+different layers and comparing them against how real-valued networks learn.
+The code of the paper is available at https://github.com/ispamm/HxAI.
+
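+ The real-valued analogue of such a cosine-similarity transform (in the spirit
+of B-cos layers; the hypercomplex PHB-cos version in the paper generalizes
+this, so treat the snippet as an assumption-laden illustration) scales each
+linear response by the alignment between input and weight:
+
+    import torch
+    import torch.nn.functional as F
+
+    def bcos_linear(x, weight, B=2.0, eps=1e-8):
+        # x: (batch, in_features), weight: (out_features, in_features)
+        w_unit = F.normalize(weight, dim=1)
+        out = x @ w_unit.t()                              # |x| * cos(x, w)
+        cos = out / (x.norm(dim=1, keepdim=True) + eps)   # cos(x, w)
+        return out * cos.abs().pow(B - 1)                 # suppresses misaligned inputs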
+
+ comment: The paper has been accepted at IEEE WCCI 2024 +
+
+
+
+
+ + ☆ FastCAR: Fast Classification And Regression Multi-Task Learning via Task + Consolidation for Modelling a Continuous Property Variable of Object Classes + + +
+ FastCAR is a novel task consolidation approach in Multi-Task Learning (MTL)
+for a classification and a regression task, despite task heterogeneity with
+only subtle correlation. It addresses object classification and continuous
+property variable regression, a crucial use case in science and engineering.
+FastCAR involves a labeling transformation approach that can be used with a
+single-task regression network architecture. FastCAR outperforms traditional
+MTL model families, parametrized in the landscape of architecture and loss
+weighting schemes, when learning of both tasks is considered collectively
+(classification accuracy of 99.54%, regression mean absolute percentage error
+of 2.3%). The experiments performed used an Advanced Steel Property dataset
+contributed by us. The dataset comprises 4536 images of 224x224 pixels,
+annotated with object classes and hardness properties that take continuous
+values. With the labeling transformation and single-task regression network
+architecture, FastCAR achieves reduced latency and time efficiency.
+
+
+
+
+
+ + ☆ AID: Attention Interpolation of Text-to-Image Diffusion + + +
+ Conditional diffusion models can create unseen images in various settings, +aiding image interpolation. Interpolation in latent spaces is well-studied, but +interpolation with specific conditions like text or poses is less understood. +Simple approaches, such as linear interpolation in the space of conditions, +often result in images that lack consistency, smoothness, and fidelity. To that +end, we introduce a novel training-free technique named Attention Interpolation +via Diffusion (AID). Our key contributions include 1) proposing an inner/outer +interpolated attention layer; 2) fusing the interpolated attention with +self-attention to boost fidelity; and 3) applying beta distribution to +selection to increase smoothness. We also present a variant, Prompt-guided +Attention Interpolation via Diffusion (PAID), that considers interpolation as a +condition-dependent generative process. This method enables the creation of new +images with greater consistency, smoothness, and efficiency, and offers control +over the exact path of interpolation. Our approach demonstrates effectiveness +for conceptual and spatial interpolation. Code and demo are available at +https://github.com/QY-H00/attention-interpolation-diffusion. + +
+
+
+
+
+ + ☆ TC4D: Trajectory-Conditioned Text-to-4D Generation + + +
+ Recent techniques for text-to-4D generation synthesize dynamic 3D scenes
+using supervision from pre-trained text-to-video models. However, existing
+representations for motion, such as deformation models or time-dependent
+neural representations, are limited in the amount of motion they can generate:
+they cannot synthesize motion extending far beyond the bounding box used for
+volume rendering. The lack of a more flexible motion model contributes to the
+gap in realism between 4D generation methods and recent, near-photorealistic
+video generation models. Here, we propose TC4D: trajectory-conditioned
+text-to-4D generation, which factors motion into global and local components.
+We represent the global motion of a scene's bounding box using rigid
+transformation along a trajectory parameterized by a spline. We learn local
+deformations that conform to the global trajectory using supervision from a
+text-to-video model. Our approach enables the synthesis of scenes animated
+along arbitrary trajectories, compositional scene generation, and significant
+improvements to the realism and amount of generated motion, which we evaluate
+qualitatively and through a user study. Video results can be viewed on our
+website: https://sherwinbahmani.github.io/tc4d.
+
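+ The "trajectory parameterized by a spline" can be made concrete with a
+Catmull-Rom segment, as in the sketch below; the spline family and how rigid
+transforms are attached along it are assumptions here, not TC4D specifics. The
+global motion then translates the scene bounding box along points sampled from
+such a curve.
+
+    import torch
+
+    def catmull_rom(p0, p1, p2, p3, t):
+        # p0..p3: (3,) control points; t in [0, 1] spans the segment between p1 and p2
+        t2, t3 = t * t, t * t * t
+        return 0.5 * ((2 * p1)
+                      + (-p0 + p2) * t
+                      + (2 * p0 - 5 * p1 + 4 * p2 - p3) * t2
+                      + (-p0 + 3 * p1 - 3 * p2 + p3) * t3)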
+
+ comment: Project Page: https://sherwinbahmani.github.io/tc4d +
+
+
+
+
+ + ☆ CMP: Cooperative Motion Prediction with Multi-Agent Communication + + +
+ The confluence of the advancement of Autonomous Vehicles (AVs) and the +maturity of Vehicle-to-Everything (V2X) communication has enabled the +capability of cooperative connected and automated vehicles (CAVs). Building on +top of cooperative perception, this paper explores the feasibility and +effectiveness of cooperative motion prediction. Our method, CMP, takes LiDAR +signals as input to enhance tracking and prediction capabilities. Unlike +previous work that focuses separately on either cooperative perception or +motion prediction, our framework, to the best of our knowledge, is the first to +address the unified problem where CAVs share information in both perception and +prediction modules. Incorporated into our design is the unique capability to +tolerate realistic V2X bandwidth limitations and transmission delays, while +dealing with bulky perception representations. We also propose a prediction +aggregation module, which unifies the predictions obtained by different CAVs +and generates the final prediction. Through extensive experiments and ablation +studies, we demonstrate the effectiveness of our method in cooperative +perception, tracking, and motion prediction tasks. In particular, CMP reduces +the average prediction error by 17.2\% with fewer missing detections compared +with the no cooperation setting. Our work marks a significant step forward in +the cooperative capabilities of CAVs, showcasing enhanced performance in +complex scenarios. + +
+
+
+
+
+ + ☆ Leveraging Near-Field Lighting for Monocular Depth Estimation from + Endoscopy Videos + + +
+ Monocular depth estimation in endoscopy videos can enable assistive and +robotic surgery to obtain better coverage of the organ and detection of various +health issues. Despite promising progress on mainstream, natural image depth +estimation, techniques perform poorly on endoscopy images due to a lack of +strong geometric features and challenging illumination effects. In this paper, +we utilize the photometric cues, i.e., the light emitted from an endoscope and +reflected by the surface, to improve monocular depth estimation. We first +create two novel loss functions with supervised and self-supervised variants +that utilize a per-pixel shading representation. We then propose a novel depth +refinement network (PPSNet) that leverages the same per-pixel shading +representation. Finally, we introduce teacher-student transfer learning to +produce better depth maps from both synthetic data with supervision and +clinical data with self-supervision. We achieve state-of-the-art results on the +C3VD dataset while estimating high-quality depth maps from clinical data. Our +code, pre-trained models, and supplementary materials can be found on our +project page: https://ppsnet.github.io/ + +
+
+ comment: 26 pages, 7 tables, 7 figures +
+
+
+
+
+ + ☆ ELGC-Net: Efficient Local-Global Context Aggregation for Remote Sensing + Change Detection + + +
+ Deep learning has shown remarkable success in remote sensing change detection
+(CD), aiming to identify semantic change regions between co-registered
+satellite image pairs acquired at distinct time stamps. However, existing
+convolutional neural network and transformer-based frameworks often struggle
+to accurately segment semantic change regions. Moreover, transformer-based
+methods with standard self-attention suffer from quadratic computational
+complexity with respect to the image resolution, making them less practical
+for CD tasks with limited training data. To address these issues, we propose
+an efficient change detection framework, ELGC-Net, which leverages rich
+contextual information to precisely estimate change regions while reducing the
+model size. Our ELGC-Net comprises a Siamese encoder, fusion modules, and a
+decoder. The focus of our design is the introduction of an Efficient
+Local-Global Context Aggregator module within the encoder, capturing enhanced
+global context and local spatial information through a novel pooled-transpose
+(PT) attention and depthwise convolution, respectively. The PT attention
+employs pooling operations for robust feature extraction and minimizes
+computational cost with transposed attention. Extensive experiments on three
+challenging CD datasets demonstrate that ELGC-Net outperforms existing
+methods. Compared to the recent transformer-based CD approach (ChangeFormer),
+ELGC-Net achieves a 1.4% gain in intersection over union metric on the
+LEVIR-CD dataset, while significantly reducing trainable parameters. Our
+proposed ELGC-Net sets a new state-of-the-art performance in remote sensing
+change detection benchmarks. Finally, we also introduce ELGC-Net-LW, a lighter
+variant with significantly reduced computational complexity, suitable for
+resource-constrained settings, while achieving comparable performance. Project
+URL: https://github.com/techmn/elgcnet.
+
+
+ comment: accepted at IEEE TGRS +
+
+
+
+
+ + ☆ Scalable Non-Cartesian Magnetic Resonance Imaging with R2D2 + + +
+ We propose a new approach for non-Cartesian magnetic resonance image
+reconstruction. While unrolled architectures provide robustness via
+data-consistency layers, embedding measurement operators in the Deep Neural
+Network (DNN) can become impractical at large scale. Alternative Plug-and-Play
+(PnP) approaches, where the denoising DNNs are blind to the measurement
+setting, are not affected by this limitation and have also proven effective,
+but their highly iterative nature also affects scalability. To address this
+scalability challenge, we leverage the "Residual-to-Residual DNN series for
+high-Dynamic range imaging (R2D2)" approach recently introduced in
+astronomical imaging. R2D2's reconstruction is formed as a series of residual
+images, iteratively estimated as outputs of DNNs taking the previous
+iteration's image estimate and associated data residual as inputs. The method
+can be interpreted as a learned version of the Matching Pursuit algorithm. We
+demonstrate R2D2 in simulation, considering radial k-space sampling
+acquisition sequences. Our preliminary results suggest that R2D2 achieves: (i)
+suboptimal performance compared to its unrolled incarnation R2D2-Net, which is
+however non-scalable due to the necessary embedding of NUFFT-based
+data-consistency layers; (ii) superior reconstruction quality to a scalable
+version of R2D2-Net embedding an FFT-based approximation for data consistency;
+(iii) superior reconstruction quality to PnP, while only requiring few
+iterations.
+
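+ The series structure can be pictured with the pseudo-loop below (operator and
+network names are placeholders, not the authors' API; real MRI data would be
+complex-valued and use a NUFFT forward operator):
+
+    import torch
+
+    def r2d2_reconstruct(y, forward_op, adjoint_op, networks):
+        x = adjoint_op(y)                                # initial (back-projected) estimate
+        for net in networks:                             # series of residual DNNs
+            data_residual = adjoint_op(y - forward_op(x))
+            # each DNN maps (current estimate, data residual) to a residual image,
+            # assumed to return a tensor of the same shape as x
+            x = x + net(torch.stack([x, data_residual], dim=1))
+        return x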
+
+ comment: submitted to IEEE EUSIPCO 2024 +
+
+
+
+
+ + ☆ Serpent: Scalable and Efficient Image Restoration via Multi-scale + Structured State Space Models + + +
+ The landscape of computational building blocks of efficient image restoration +architectures is dominated by a combination of convolutional processing and +various attention mechanisms. However, convolutional filters are inherently +local and therefore struggle at modeling long-range dependencies in images. On +the other hand, attention excels at capturing global interactions between +arbitrary image regions, however at a quadratic cost in image dimension. In +this work, we propose Serpent, an architecture that leverages recent advances +in state space models (SSMs) in its core computational block. SSMs, originally +introduced for sequence modeling, can maintain a global receptive field with a +favorable linear scaling in input size. Our preliminary results demonstrate +that Serpent can achieve reconstruction quality on par with state-of-the-art +techniques, while requiring orders of magnitude less compute (up to $150$ fold +reduction in FLOPS) and a factor of up to $5\times$ less GPU memory while +maintaining a compact model size. + +
+
+ comment: 7 pages, 5 figures, preliminary workshop submission of a + comprehensive work to be released soon +
+
+
+
+
+ + ☆ Octree-GS: Towards Consistent Real-time Rendering with LOD-Structured 3D + Gaussians + + +
+ The recent 3D Gaussian splatting (3D-GS) has shown remarkable rendering +fidelity and efficiency compared to NeRF-based neural scene representations. +While demonstrating the potential for real-time rendering, 3D-GS encounters +rendering bottlenecks in large scenes with complex details due to an excessive +number of Gaussian primitives located within the viewing frustum. This +limitation is particularly noticeable in zoom-out views and can lead to +inconsistent rendering speeds in scenes with varying details. Moreover, it +often struggles to capture the corresponding level of details at different +scales with its heuristic density control operation. Inspired by the +Level-of-Detail (LOD) techniques, we introduce Octree-GS, featuring an +LOD-structured 3D Gaussian approach supporting level-of-detail decomposition +for scene representation that contributes to the final rendering results. Our +model dynamically selects the appropriate level from the set of +multi-resolution anchor points, ensuring consistent rendering performance with +adaptive LOD adjustments while maintaining high-fidelity rendering results. + +
+
+ comment: Project page: https://city-super.github.io/octree-gs/ +
+
+
+
+
+ + ☆ A Survey on 3D Egocentric Human Pose Estimation + + +
+ Egocentric human pose estimation aims to estimate human body poses and +develop body representations from a first-person camera perspective. It has +gained vast popularity in recent years because of its wide range of +applications in sectors like XR-technologies, human-computer interaction, and +fitness tracking. However, to the best of our knowledge, there is no systematic +literature review based on the proposed solutions regarding egocentric 3D human +pose estimation. To that end, the aim of this survey paper is to provide an +extensive overview of the current state of egocentric pose estimation research. +In this paper, we categorize and discuss the popular datasets and the different +pose estimation models, highlighting the strengths and weaknesses of different +methods by comparative analysis. This survey can be a valuable resource for +both researchers and practitioners in the field, offering insights into key +concepts and cutting-edge solutions in egocentric pose estimation, its +wide-ranging applications, as well as the open problems with future scope. + +
+
+
+
+
+ + ☆ 2D Gaussian Splatting for Geometrically Accurate Radiance Fields + + +
+ 3D Gaussian Splatting (3DGS) has recently revolutionized radiance field +reconstruction, achieving high quality novel view synthesis and fast rendering +speed without baking. However, 3DGS fails to accurately represent surfaces due +to the multi-view inconsistent nature of 3D Gaussians. We present 2D Gaussian +Splatting (2DGS), a novel approach to model and reconstruct geometrically +accurate radiance fields from multi-view images. Our key idea is to collapse +the 3D volume into a set of 2D oriented planar Gaussian disks. Unlike 3D +Gaussians, 2D Gaussians provide view-consistent geometry while modeling +surfaces intrinsically. To accurately recover thin surfaces and achieve stable +optimization, we introduce a perspective-accurate 2D splatting process +utilizing ray-splat intersection and rasterization. Additionally, we +incorporate depth distortion and normal consistency terms to further enhance +the quality of the reconstructions. We demonstrate that our differentiable +renderer allows for noise-free and detailed geometry reconstruction while +maintaining competitive appearance quality, fast training speed, and real-time +rendering. Our code will be made publicly available. + +
+
+ comment: 12 pages, 12 figures +
+
+
+
+
+ + ☆ Sen2Fire: A Challenging Benchmark Dataset for Wildfire Detection using + Sentinel Data + + +
+ Utilizing satellite imagery for wildfire detection presents substantial +potential for practical applications. To advance the development of machine +learning algorithms in this domain, our study introduces the \textit{Sen2Fire} +dataset--a challenging satellite remote sensing dataset tailored for wildfire +detection. This dataset is curated from Sentinel-2 multi-spectral data and +Sentinel-5P aerosol product, comprising a total of 2466 image patches. Each +patch has a size of 512$\times$512 pixels with 13 bands. Given the distinctive +sensitivities of various wavebands to wildfire responses, our research focuses +on optimizing wildfire detection by evaluating different wavebands and +employing a combination of spectral indices, such as normalized burn ratio +(NBR) and normalized difference vegetation index (NDVI). The results suggest +that, in contrast to using all bands for wildfire detection, selecting specific +band combinations yields superior performance. Additionally, our study +underscores the positive impact of integrating Sentinel-5 aerosol data for +wildfire detection. The code and dataset are available online +(https://zenodo.org/records/10881058). + +
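+ The two spectral indices mentioned above follow their usual definitions; the
+epsilon guard and the band pairing (NIR with Red for NDVI, NIR with SWIR for
+NBR) are standard, though the dataset's exact band layout should be checked:
+
+    def ndvi(nir, red, eps=1e-6):
+        # normalized difference vegetation index on NumPy arrays of matching shape
+        return (nir - red) / (nir + red + eps)
+
+    def nbr(nir, swir, eps=1e-6):
+        # normalized burn ratio; eps only avoids division by zero
+        return (nir - swir) / (nir + swir + eps)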
+
+
+
+
+ + ☆ Superior and Pragmatic Talking Face Generation with Teacher-Student + Framework + + +
+ Talking face generation technology creates talking videos from arbitrary +appearance and motion signal, with the "arbitrary" offering ease of use but +also introducing challenges in practical applications. Existing methods work +well with standard inputs but suffer serious performance degradation with +intricate real-world ones. Moreover, efficiency is also an important concern in +deployment. To comprehensively address these issues, we introduce SuperFace, a +teacher-student framework that balances quality, robustness, cost and +editability. We first propose a simple but effective teacher model capable of +handling inputs of varying qualities to generate high-quality results. Building +on this, we devise an efficient distillation strategy to acquire an +identity-specific student model that maintains quality with significantly +reduced computational load. Our experiments validate that SuperFace offers a +more comprehensive solution than existing methods for the four mentioned +objectives, especially in reducing FLOPs by 99\% with the student model. +SuperFace can be driven by both video and audio and allows for localized facial +attributes editing. + +
+
+
+
+
+ + ☆ Deepfake Generation and Detection: A Benchmark and Survey + + +
+ In addition to the advancements in deepfake generation, corresponding
+detection technologies need to continuously evolve to regulate the potential
+misuse of deepfakes, such as for privacy invasion and phishing attacks. This
+survey comprehensively reviews the latest developments in deepfake generation
+and detection, summarizing and analyzing the current state of the art in this
+rapidly evolving field. We first unify task definitions, comprehensively
+introduce datasets and metrics, and discuss the development of generation and
+detection technology frameworks. Then, we discuss the development of several
+related sub-fields and focus on researching four mainstream deepfake fields:
+popular face swap, face reenactment, talking face generation, and facial
+attribute editing, as well as forgery detection. Subsequently, we
+comprehensively benchmark representative methods on popular datasets for each
+field, fully evaluating the latest and influential works published in top
+conferences/journals. Finally, we analyze the challenges and future research
+directions of the discussed fields. We closely follow the latest developments
+in https://github.com/flyingby/Awesome-Deepfake-Generation-and-Detection.
+
+
+
+
+
+ + ☆ Low-Latency Neural Stereo Streaming CVPR2024 + + +
+ The rise of new video modalities like virtual reality or autonomous driving +has increased the demand for efficient multi-view video compression methods, +both in terms of rate-distortion (R-D) performance and in terms of delay and +runtime. While most recent stereo video compression approaches have shown +promising performance, they compress left and right views sequentially, leading +to poor parallelization and runtime performance. This work presents Low-Latency +neural codec for Stereo video Streaming (LLSS), a novel parallel stereo video +coding method designed for fast and efficient low-latency stereo video +streaming. Instead of using a sequential cross-view motion compensation like +existing methods, LLSS introduces a bidirectional feature shifting module to +directly exploit mutual information among views and encode them effectively +with a joint cross-view prior model for entropy coding. Thanks to this design, +LLSS processes left and right views in parallel, minimizing latency; all while +substantially improving R-D performance compared to both existing neural and +conventional codecs. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Boosting Diffusion Models with Moving Average Sampling in Frequency + Domain CVPR 2024 + + +
+ Diffusion models have recently brought a powerful revolution in image +generation. Despite showing impressive generative capabilities, most of these +models rely on the current sample to denoise the next one, possibly resulting +in denoising instability. In this paper, we reinterpret the iterative denoising +process as model optimization and leverage a moving average mechanism to +ensemble all the prior samples. Instead of simply applying moving average to +the denoised samples at different timesteps, we first map the denoised samples +to data space and then perform moving average to avoid distribution shift +across timesteps. In view that diffusion models evolve the recovery from +low-frequency components to high-frequency details, we further decompose the +samples into different frequency components and execute moving average +separately on each component. We name the complete approach "Moving Average +Sampling in Frequency domain (MASF)". MASF could be seamlessly integrated into +mainstream pre-trained diffusion models and sampling schedules. Extensive +experiments on both unconditional and conditional diffusion models demonstrate +that our MASF leads to superior performances compared to the baselines, with +almost negligible additional complexity cost. + +
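+ A simplified view of the frequency-domain moving average is sketched below
+(the cutoff, decay, and two-band split are illustrative hyperparameters, not
+the paper's schedule; `ema_low`/`ema_high` are complex tensors initialized to
+zero with the same shape as the spectrum):
+
+    import torch
+
+    def masf_update(ema_low, ema_high, x0_pred, decay=0.9, cutoff=8):
+        freq = torch.fft.fftshift(torch.fft.fft2(x0_pred), dim=(-2, -1))
+        h, w = freq.shape[-2:]
+        mask = torch.zeros_like(freq.real)
+        mask[..., h // 2 - cutoff:h // 2 + cutoff, w // 2 - cutoff:w // 2 + cutoff] = 1.0
+        low, high = freq * mask, freq * (1 - mask)       # split into frequency bands
+        ema_low = decay * ema_low + (1 - decay) * low    # moving average per band
+        ema_high = decay * ema_high + (1 - decay) * high
+        merged = torch.fft.ifft2(
+            torch.fft.ifftshift(ema_low + ema_high, dim=(-2, -1))).real
+        return ema_low, ema_high, merged                 # merged replaces the raw estimate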
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ To Supervise or Not to Supervise: Understanding and Addressing the Key + Challenges of 3D Transfer Learning + + +
+ Transfer learning has long been a key factor in the advancement of many +fields including 2D image analysis. Unfortunately, its applicability in 3D data +processing has been relatively limited. While several approaches for 3D +transfer learning have been proposed in recent literature, with contrastive +learning gaining particular prominence, most existing methods in this domain +have only been studied and evaluated in limited scenarios. Most importantly, +there is currently a lack of principled understanding of both when and why 3D +transfer learning methods are applicable. Remarkably, even the applicability of +standard supervised pre-training is poorly understood. In this work, we conduct +the first in-depth quantitative and qualitative investigation of supervised and +contrastive pre-training strategies and their utility in downstream 3D tasks. +We demonstrate that layer-wise analysis of learned features provides +significant insight into the downstream utility of trained networks. Informed +by this analysis, we propose a simple geometric regularization strategy, which +improves the transferability of supervised pre-training. Our work thus sheds +light onto both the specific challenges of 3D transfer learning, as well as +strategies to overcome them. + +
+
+
+
+
+ + ☆ Hierarchical Open-Vocabulary 3D Scene Graphs for Language-Grounded Robot + Navigation + + +
+ Recent open-vocabulary robot mapping methods enrich dense geometric maps with
+pre-trained visual-language features. While these maps allow for the
+prediction of point-wise saliency maps when queried for a certain language
+concept, large-scale environments and abstract queries beyond the object level
+still pose a considerable hurdle, ultimately limiting language-grounded
+robotic navigation. In this work, we present HOV-SG, a hierarchical
+open-vocabulary 3D scene graph mapping approach for language-grounded robot
+navigation. Leveraging open-vocabulary vision foundation models, we first
+obtain state-of-the-art open-vocabulary segment-level maps in 3D and
+subsequently construct a 3D scene graph hierarchy consisting of floor, room,
+and object concepts, each enriched with open-vocabulary features. Our approach
+is able to represent multi-story buildings and allows robotic traversal of
+those using a cross-floor Voronoi graph. HOV-SG is evaluated on three distinct
+datasets and surpasses previous baselines in open-vocabulary semantic accuracy
+on the object, room, and floor level while producing a 75% reduction in
+representation size compared to dense open-vocabulary maps. In order to prove
+the efficacy and generalization capabilities of HOV-SG, we showcase successful
+long-horizon language-conditioned robot navigation within real-world
+multi-story environments. We provide code and trial video data at
+http://hovsg.github.io/.
+
+
+ comment: Code and video are available at http://hovsg.github.io/ +
+
+
+
+
+ + ☆ ReMamber: Referring Image Segmentation with Mamba Twister + + +
+ Referring Image Segmentation (RIS) leveraging transformers has achieved great +success on the interpretation of complex visual-language tasks. However, the +quadratic computation cost makes it resource-consuming in capturing long-range +visual-language dependencies. Fortunately, Mamba addresses this with efficient +linear complexity in processing. However, directly applying Mamba to +multi-modal interactions presents challenges, primarily due to inadequate +channel interactions for the effective fusion of multi-modal data. In this +paper, we propose ReMamber, a novel RIS architecture that integrates the power +of Mamba with a multi-modal Mamba Twister block. The Mamba Twister explicitly +models image-text interaction, and fuses textual and visual features through +its unique channel and spatial twisting mechanism. We achieve the +state-of-the-art on three challenging benchmarks. Moreover, we conduct thorough +analyses of ReMamber and discuss other fusion designs using Mamba. These +provide valuable perspectives for future research. + +
+
+
+
+
+ + ☆ GTA-HDR: A Large-Scale Synthetic Dataset for HDR Image Reconstruction + + +
+ High Dynamic Range (HDR) content (i.e., images and videos) has a broad range +of applications. However, capturing HDR content from real-world scenes is +expensive and time-consuming. Therefore, the challenging task of +reconstructing visually accurate HDR images from their Low Dynamic Range (LDR) +counterparts is gaining attention in the vision research community. A major +challenge in this research problem is the lack of datasets that capture +diverse scene conditions (e.g., lighting, shadows, weather, locations, +landscapes, objects, humans, buildings) and various image features (e.g., +color, contrast, saturation, hue, luminance, brightness, radiance). To address +this gap, in this paper, we introduce GTA-HDR, a large-scale synthetic dataset +of photo-realistic HDR images sampled from the GTA-V video game. We perform a +thorough evaluation of the proposed dataset, which demonstrates significant +qualitative and quantitative improvements in state-of-the-art HDR image +reconstruction methods. Furthermore, we demonstrate the effectiveness of the +proposed dataset and its impact on additional computer vision tasks including +3D human pose estimation, human body part segmentation, and holistic scene +segmentation. The dataset, data collection pipeline, and evaluation code are +available at: https://github.com/HrishavBakulBarua/GTA-HDR. + +
+
+ comment: Submitted to IEEE +
+
+
+
+
+ + ☆ A foundation model utilizing chest CT volumes and radiology reports for + supervised-level zero-shot detection of abnormalities + + +
+ A major challenge in computational research in 3D medical imaging is the lack +of comprehensive datasets. Addressing this issue, our study introduces CT-RATE, +the first 3D medical imaging dataset that pairs images with textual reports. +CT-RATE consists of 25,692 non-contrast chest CT volumes, expanded to 50,188 +through various reconstructions, from 21,304 unique patients, along with +corresponding radiology text reports. Leveraging CT-RATE, we developed CT-CLIP, +a CT-focused contrastive language-image pre-training framework. As a versatile, +self-supervised model, CT-CLIP is designed for broad application and does not +require task-specific training. Remarkably, CT-CLIP outperforms +state-of-the-art, fully supervised methods in multi-abnormality detection +across all key metrics, thus eliminating the need for manual annotation. We +also demonstrate its utility in case retrieval, whether using imagery or +textual queries, thereby advancing knowledge dissemination. The open-source +release of CT-RATE and CT-CLIP marks a significant advancement in medical AI, +enhancing 3D imaging analysis and fostering innovation in healthcare. + +
+
+
+
+
+ + ☆ Assessment of Multimodal Large Language Models in Alignment with Human + Values + + +
+ Large Language Models (LLMs) aim to serve as versatile assistants aligned +with human values, as defined by the principles of being helpful, honest, and +harmless (hhh). However, in terms of Multimodal Large Language Models (MLLMs), +despite their commendable performance in perception and reasoning tasks, their +alignment with human values remains largely unexplored, given the complexity of +defining hhh dimensions in the visual world and the difficulty in collecting +relevant data that accurately mirrors real-world situations. To address this +gap, we introduce Ch3Ef, a Compreh3ensive Evaluation dataset and strategy for +assessing alignment with human expectations. Ch3Ef dataset contains 1002 +human-annotated data samples, covering 12 domains and 46 tasks based on the hhh +principle. We also present a unified evaluation strategy supporting assessment +across various scenarios and different perspectives. Based on the evaluation +results, we summarize over 10 key findings that deepen the understanding of +MLLM capabilities, limitations, and the dynamic relationships between +evaluation levels, guiding future advancements in the field. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2311.02692 +
+
+
+
+
+ + ☆ DiffH2O: Diffusion-Based Synthesis of Hand-Object Interactions from + Textual Descriptions + + +
+ Generating natural hand-object interactions in 3D is challenging as the +resulting hand and object motions are expected to be physically plausible and +semantically meaningful. Furthermore, generalization to unseen objects is +hindered by the limited scale of available hand-object interaction datasets. We +propose DiffH2O, a novel method to synthesize realistic, one or two-handed +object interactions from provided text prompts and geometry of the object. The +method introduces three techniques that enable effective learning from limited +data. First, we decompose the task into a grasping stage and a text-based +interaction stage and use separate diffusion models for each. In the grasping +stage, the model only generates hand motions, whereas in the interaction phase +both hand and object poses are synthesized. Second, we propose a compact +representation that tightly couples hand and object poses. Third, we propose +two different guidance schemes to allow more control of the generated motions: +grasp guidance and detailed textual guidance. Grasp guidance takes a single +target grasping pose and guides the diffusion model to reach this grasp at the +end of the grasping stage, which provides control over the grasping pose. Given +a grasping motion from this stage, multiple different actions can be prompted +in the interaction phase. For textual guidance, we contribute comprehensive +text descriptions to the GRAB dataset and show that they enable our method to +have more fine-grained control over hand-object interactions. Our quantitative +and qualitative evaluation demonstrates that the proposed method outperforms +baseline methods and leads to natural hand-object motions. Moreover, we +demonstrate the practicality of our framework by utilizing a hand pose estimate +from an off-the-shelf pose estimator for guidance, and then sampling multiple +different actions in the interaction stage. + +
+
+ comment: Project Page: https://diffh2o.github.io/ +
+
+
+
+
+ + ☆ Efficient Image Pre-Training with Siamese Cropped Masked Autoencoders + + +
+ Self-supervised pre-training of image encoders is omnipresent in the +literature, particularly following the introduction of Masked Autoencoders +(MAE). Current efforts attempt to learn object-centric representations from +motion in videos. In particular, SiamMAE recently introduced a Siamese network, +training a shared-weight encoder from two frames of a video with a high +asymmetric masking ratio (95%). In this work, we propose CropMAE, an +alternative approach to the Siamese pre-training introduced by SiamMAE. Our +method specifically differs by exclusively considering pairs of cropped images +sourced from the same image but cropped differently, deviating from the +conventional pairs of frames extracted from a video. CropMAE therefore +alleviates the need for video datasets, while maintaining competitive +performance and drastically reducing pre-training time. Furthermore, we +demonstrate that CropMAE learns similar object-centric representations without +explicit motion, showing that current self-supervised learning methods learn +objects not from motion, but rather thanks to the Siamese architecture. +Finally, CropMAE achieves the highest masking ratio to date (98.5%), enabling +the reconstruction of images using only two visible patches. Our code is +available at https://github.com/alexandre-eymael/CropMAE. + +
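+ A small sketch (assumptions only, not the released CropMAE code) of the data side of the method: two different crops of the same image form the input pair, and the target view is masked at an extreme ratio (~98.5%), leaving only a couple of visible patches.
+
+import numpy as np
+
+def random_crop(img, size):
+    h, w, _ = img.shape
+    top = np.random.randint(0, h - size + 1)
+    left = np.random.randint(0, w - size + 1)
+    return img[top:top + size, left:left + size]
+
+def cropmae_pair(img, crop=224, patch=16, mask_ratio=0.985):
+    view_a = random_crop(img, crop)              # fully visible "reference" view
+    view_b = random_crop(img, crop)              # heavily masked "target" view
+    n_patches = (crop // patch) ** 2
+    n_keep = max(2, int(n_patches * (1 - mask_ratio)))
+    keep_idx = np.random.choice(n_patches, n_keep, replace=False)
+    return view_a, view_b, keep_idx              # encoder only sees keep_idx of view_b
+
+img = np.random.rand(256, 256, 3)
+view_a, view_b, keep_idx = cropmae_pair(img)
+print(len(keep_idx), "visible patches out of", (224 // 16) ** 2)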
+
+ comment: 19 pages, 6 figures, 3 tables, 1 page of supplementary material +
+
+
+
+
+ + ☆ DN-Splatter: Depth and Normal Priors for Gaussian Splatting and Meshing + + +
+ 3D Gaussian splatting, a novel differentiable rendering technique, has +achieved state-of-the-art novel view synthesis results with high rendering +speeds and relatively low training times. However, its performance on scenes +commonly seen in indoor datasets is poor due to the lack of geometric +constraints during optimization. We extend 3D Gaussian splatting with depth and +normal cues to tackle challenging indoor datasets and showcase techniques for +efficient mesh extraction, an important downstream application. Specifically, +we regularize the optimization procedure with depth information, enforce local +smoothness of nearby Gaussians, and use the geometry of the 3D Gaussians +supervised by normal cues to achieve better alignment with the true scene +geometry. We improve depth estimation and novel view synthesis results over +baselines and show how this simple yet effective regularization technique can +be used to directly extract meshes from the Gaussian representation, yielding +more physically accurate reconstructions on indoor scenes. Our code will be +released at https://github.com/maturk/dn-splatter. + +
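+ A hedged sketch of the kind of regularized objective the abstract describes (term names and weights are illustrative assumptions, not the official DN-Splatter implementation): photometric loss plus a depth-prior term, a normal-alignment term, and a local depth-smoothness term.
+
+import torch
+import torch.nn.functional as F
+
+def dn_splatter_loss(rgb, rgb_gt, depth, depth_prior, normals, normal_prior,
+                     w_depth=0.2, w_normal=0.1, w_smooth=0.05):
+    l_rgb = F.l1_loss(rgb, rgb_gt)
+    l_depth = F.l1_loss(depth, depth_prior)                    # depth supervision
+    l_normal = (1 - F.cosine_similarity(normals, normal_prior, dim=1)).mean()
+    # encourage locally smooth rendered depth (finite differences along H and W)
+    l_smooth = (depth[..., 1:, :] - depth[..., :-1, :]).abs().mean() + \
+               (depth[..., :, 1:] - depth[..., :, :-1]).abs().mean()
+    return l_rgb + w_depth * l_depth + w_normal * l_normal + w_smooth * l_smooth
+
+# toy tensors: batch of 2 renders with 3-channel color, 1-channel depth, unit normals
+rgb = torch.rand(2, 3, 64, 64); depth = torch.rand(2, 1, 64, 64)
+normals = F.normalize(torch.rand(2, 3, 64, 64), dim=1)
+loss = dn_splatter_loss(rgb, torch.rand_like(rgb), depth, torch.rand_like(depth),
+                        normals, F.normalize(torch.rand_like(normals), dim=1))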
+
+
+
+
+ + ☆ Annotated Biomedical Video Generation using Denoising Diffusion + Probabilistic Models and Flow Fields + + +
+ The segmentation and tracking of living cells play a vital role within the +biomedical domain, particularly in cancer research, drug development, and +developmental biology. These are usually tedious and time-consuming tasks that +are traditionally done by biomedical experts. Recently, to automatize these +processes, deep learning based segmentation and tracking methods have been +proposed. These methods require large-scale datasets and their full potential +is constrained by the scarcity of annotated data in the biomedical imaging +domain. To address this limitation, we propose Biomedical Video Diffusion Model +(BVDM), capable of generating realistic-looking synthetic microscopy videos. +Trained only on a single real video, BVDM can generate videos of arbitrary +length with pixel-level annotations that can be used for training data-hungry +models. It is composed of a denoising diffusion probabilistic model (DDPM) +generating high-fidelity synthetic cell microscopy images and a flow prediction +model (FPM) predicting the non-rigid transformation between consecutive video +frames. During inference, initially, the DDPM imposes realistic cell textures +on synthetic cell masks which are generated based on real data statistics. The +flow prediction model predicts the flow field between consecutive masks and +applies that to the DDPM output from the previous time frame to create the next +one while keeping temporal consistency. BVDM outperforms state-of-the-art +synthetic live cell microscopy video generation models. Furthermore, we +demonstrate that a sufficiently large synthetic dataset enhances the +performance of cell segmentation and tracking models compared to using a +limited amount of available real data. + +
+
+
+
+
+ + ☆ Improving Text-to-Image Consistency via Automatic Prompt Optimization + + +
+ Impressive advances in text-to-image (T2I) generative models have yielded a +plethora of high performing models which are able to generate aesthetically +appealing, photorealistic images. Despite the progress, these models still +struggle to produce images that are consistent with the input prompt, +oftentimes failing to capture object quantities, relations and attributes +properly. Existing solutions to improve prompt-image consistency suffer from +the following challenges: (1) they oftentimes require model fine-tuning, (2) +they only focus on nearby prompt samples, and (3) they are affected by +unfavorable trade-offs among image quality, representation diversity, and +prompt-image consistency. In this paper, we address these challenges and +introduce a T2I optimization-by-prompting framework, OPT2I, which leverages a +large language model (LLM) to improve prompt-image consistency in T2I models. +Our framework starts from a user prompt and iteratively generates revised +prompts with the goal of maximizing a consistency score. Our extensive +validation on two datasets, MSCOCO and PartiPrompts, shows that OPT2I can boost +the initial consistency score by up to 24.9% in terms of DSG score while +preserving the FID and increasing the recall between generated and real data. +Our work paves the way toward building more reliable and robust T2I systems by +harnessing the power of LLMs. + +
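+ A schematic sketch of the optimization-by-prompting loop described above. The callables llm_revise, generate_image, and consistency_score are hypothetical placeholders (an LLM call, a T2I model, and a DSG/VQA-style scorer); only the loop structure reflects the abstract.
+
+def optimize_prompt(user_prompt, llm_revise, generate_image, consistency_score,
+                    n_iters=10, n_candidates=4):
+    best_prompt = user_prompt
+    best_score = consistency_score(generate_image(user_prompt), user_prompt)
+    history = [(user_prompt, best_score)]
+    for _ in range(n_iters):
+        # ask the LLM for revised prompts, conditioned on past prompts and their scores
+        candidates = llm_revise(user_prompt, history, n_candidates)
+        for prompt in candidates:
+            score = consistency_score(generate_image(prompt), user_prompt)
+            history.append((prompt, score))
+            if score > best_score:
+                best_prompt, best_score = prompt, score
+    return best_prompt, best_score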
+
+
+
+
+ + ☆ Towards 3D Vision with Low-Cost Single-Photon Cameras + + +
+ We present a method for reconstructing 3D shape of arbitrary Lambertian +objects based on measurements by miniature, energy-efficient, low-cost +single-photon cameras. These cameras, operating as time resolved image sensors, +illuminate the scene with a very fast pulse of diffuse light and record the +shape of that pulse as it returns back from the scene at a high temporal +resolution. We propose to model this image formation process, account for its +non-idealities, and adapt neural rendering to reconstruct 3D geometry from a +set of spatially distributed sensors with known poses. We show that our +approach can successfully recover complex 3D shapes from simulated data. We +further demonstrate 3D object reconstruction from real-world captures, +utilizing measurements from a commodity proximity sensor. Our work draws a +connection between image-based modeling and active range scanning and is a step +towards 3D vision with single-photon cameras. + +
+
+
+
+
+ + ☆ Evaluating the Efficacy of Prompt-Engineered Large Multimodal Models + Versus Fine-Tuned Vision Transformers in Image-Based Security Applications + + +
+ The success of Large Language Models (LLMs) has led to a parallel rise in the +development of Large Multimodal Models (LMMs), such as Gemini-pro, which have +begun to transform a variety of applications. These sophisticated multimodal +models are designed to interpret and analyze complex data, integrating both +textual and visual information on a scale previously unattainable, opening new +avenues for a range of applications. This paper investigates the applicability +and effectiveness of prompt-engineered Gemini-pro LMMs versus fine-tuned Vision +Transformer (ViT) models in addressing critical security challenges. We focus +on two distinct tasks: a visually evident task of detecting simple triggers, +such as small squares in images, indicative of potential backdoors, and a +non-visually evident task of malware classification through visual +representations. Our results highlight a significant divergence in performance, +with Gemini-pro falling short in accuracy and reliability when compared to +fine-tuned ViT models. The ViT models, on the other hand, demonstrate +exceptional accuracy, achieving near-perfect performance on both tasks. This +study not only showcases the strengths and limitations of prompt-engineered +LMMs in cybersecurity applications but also emphasizes the unmatched efficacy +of fine-tuned ViT models for precise and dependable tasks. + +
+
+
+
+
+ + ☆ GenesisTex: Adapting Image Denoising Diffusion to Texture Space + + +
+ We present GenesisTex, a novel method for synthesizing textures for 3D +geometries from text descriptions. GenesisTex adapts the pretrained image +diffusion model to texture space by texture space sampling. Specifically, we +maintain a latent texture map for each viewpoint, which is updated with +predicted noise on the rendering of the corresponding viewpoint. The sampled +latent texture maps are then decoded into a final texture map. During the +sampling process, we focus on both global and local consistency across multiple +viewpoints: global consistency is achieved through the integration of style +consistency mechanisms within the noise prediction network, and low-level +consistency is achieved by dynamically aligning latent textures. Finally, we +apply reference-based inpainting and img2img on denser views for texture +refinement. Our approach overcomes the limitations of slow optimization in +distillation-based methods and instability in inpainting-based methods. +Experiments on meshes from various sources demonstrate that our method +surpasses the baseline methods quantitatively and qualitatively. + +
+
+ comment: 12 pages, 10 figures +
+
+
+
+
+ + ☆ CT Synthesis with Conditional Diffusion Models for Abdominal Lymph Node + Segmentation + + +
+ Despite the significant success achieved by deep learning methods in medical +image segmentation, researchers still struggle in the computer-aided diagnosis +of abdominal lymph nodes due to the complex abdominal environment, small and +indistinguishable lesions, and limited annotated data. To address these +problems, we present a pipeline that integrates the conditional diffusion model +for lymph node generation and the nnU-Net model for lymph node segmentation to +improve the segmentation performance of abdominal lymph nodes through +synthesizing a diversity of realistic abdominal lymph node data. We propose +LN-DDPM, a conditional denoising diffusion probabilistic model (DDPM) for lymph +node (LN) generation. LN-DDPM utilizes lymph node masks and anatomical +structure masks as model conditions. These conditions work in two conditioning +mechanisms: global structure conditioning and local detail conditioning, to +distinguish between lymph nodes and their surroundings and better capture lymph +node characteristics. The obtained paired abdominal lymph node images and masks +are used for the downstream segmentation task. Experimental results on the +abdominal lymph node datasets demonstrate that LN-DDPM outperforms other +generative methods in the abdominal lymph node image synthesis and better +assists the downstream abdominal lymph node segmentation task. + +
+
+
+
+
+ + ☆ MUTE-SLAM: Real-Time Neural SLAM with Multiple Tri-Plane Hash + Representations + + +
+ We introduce MUTE-SLAM, a real-time neural RGB-D SLAM system employing +multiple tri-plane hash-encodings for efficient scene representation. MUTE-SLAM +effectively tracks camera positions and incrementally builds a scalable +multi-map representation for both small and large indoor environments. It +dynamically allocates sub-maps for newly observed local regions, enabling +constraint-free mapping without prior scene information. Unlike traditional +grid-based methods, we use three orthogonal axis-aligned planes for +hash-encoding scene properties, significantly reducing hash collisions and the +number of trainable parameters. This hybrid approach not only speeds up +convergence but also enhances the fidelity of surface reconstruction. +Furthermore, our optimization strategy concurrently optimizes all sub-maps +intersecting with the current camera frustum, ensuring global consistency. +Extensive testing on both real-world and synthetic datasets has shown that +MUTE-SLAM delivers state-of-the-art surface reconstruction quality and +competitive tracking performance across diverse indoor settings. The code will +be made public upon acceptance of the paper. + +
+
+
+
+
+ + ☆ Makeup Prior Models for 3D Facial Makeup Estimation and Applications CVPR2024 + + +
+ In this work, we introduce two types of makeup prior models to extend +existing 3D face prior models: PCA-based and StyleGAN2-based priors. The +PCA-based prior model is a linear model that is easy to construct and is +computationally efficient. However, it retains only low-frequency information. +Conversely, the StyleGAN2-based model can represent high-frequency information +with relatively higher computational cost than the PCA-based model. Although +there is a trade-off between the two models, both are applicable to 3D facial +makeup estimation and related applications. By leveraging makeup prior models +and designing a makeup consistency module, we effectively address the +challenges that previous methods faced in robustly estimating makeup, +particularly in the context of handling self-occluded faces. In experiments, we +demonstrate that our approach reduces computational costs by several orders of +magnitude, achieving speeds up to 180 times faster. In addition, by improving +the accuracy of the estimated makeup, we confirm that our methods are highly +advantageous for various 3D facial makeup applications such as 3D makeup face +reconstruction, user-friendly makeup editing, makeup transfer, and +interpolation. + +
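+ A minimal sketch of the PCA-based prior idea: fit a linear basis on flattened makeup textures, then represent and reconstruct a new texture from a small coefficient vector. Shapes and data here are illustrative placeholders, not from the paper.
+
+import numpy as np
+from sklearn.decomposition import PCA
+
+textures = np.random.rand(500, 64 * 64 * 3)       # stand-in for makeup texture maps
+prior = PCA(n_components=50).fit(textures)        # linear makeup prior
+
+new_texture = np.random.rand(1, 64 * 64 * 3)
+coeffs = prior.transform(new_texture)             # low-dimensional makeup code
+reconstruction = prior.inverse_transform(coeffs)  # smooth (low-frequency) estimate
+print(coeffs.shape, reconstruction.shape)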
+
+ comment: CVPR2024. Project: https://yangxingchao.github.io/makeup-priors-page +
+
+
+
+
+ + ☆ Noise2Noise Denoising of CRISM Hyperspectral Data ICLR 2024 + + +
+ Hyperspectral data acquired by the Compact Reconnaissance Imaging +Spectrometer for Mars (CRISM) have allowed for unparalleled mapping of the +surface mineralogy of Mars. Due to sensor degradation over time, a significant +portion of the recently acquired data is considered unusable. Here a new +data-driven model architecture, Noise2Noise4Mars (N2N4M), is introduced to +remove noise from CRISM images. Our model is self-supervised and does not +require zero-noise target data, making it well suited for use in Planetary +Science applications where high quality labelled data is scarce. We demonstrate +its strong performance on synthetic-noise data and CRISM images, and its impact +on downstream classification performance, outperforming benchmark methods on +most metrics. This allows for detailed analysis for critical sites of interest +on the Martian surface, including proposed lander sites. + +
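+ A generic Noise2Noise-style training step (a sketch under assumptions, not the N2N4M code): the network maps one noisy observation to another independently noisy observation of the same scene, so no clean target is ever required.
+
+import torch
+import torch.nn as nn
+
+model = nn.Sequential(nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(),
+                      nn.Conv2d(32, 1, 3, padding=1))
+opt = torch.optim.Adam(model.parameters(), lr=1e-3)
+
+clean = torch.rand(8, 1, 64, 64)                  # unknown to the model in practice
+noisy_a = clean + 0.1 * torch.randn_like(clean)   # two independent noisy realizations
+noisy_b = clean + 0.1 * torch.randn_like(clean)
+
+pred = model(noisy_a)
+loss = nn.functional.mse_loss(pred, noisy_b)      # noisy target, no clean data needed
+opt.zero_grad(); loss.backward(); opt.step()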
+
+ comment: 5 pages, 3 figures. Accepted as a conference paper at the ICLR 2024 + ML4RS Workshop +
+
+
+
+
+ + ☆ DataCook: Crafting Anti-Adversarial Examples for Healthcare Data + Copyright Protection + + +
+ In the realm of healthcare, the challenges of copyright protection and +unauthorized third-party misuse are increasingly significant. Traditional +methods for data copyright protection are applied prior to data distribution, +implying that models trained on these data become uncontrollable. This paper +introduces a novel approach, named DataCook, designed to safeguard the +copyright of healthcare data during the deployment phase. DataCook operates by +"cooking" the raw data before distribution, enabling the development of models +that perform normally on this processed data. However, during the deployment +phase, the original test data must also be "cooked" through DataCook to ensure +normal model performance. This process grants copyright holders control over +authorization during the deployment phase. The mechanism behind DataCook is the +crafting of anti-adversarial examples (AntiAdv), which are designed to enhance +model confidence, as opposed to standard adversarial examples (Adv) that aim to +confuse models. Similar to Adv, AntiAdv introduces imperceptible perturbations, +ensuring that the data processed by DataCook remains easily understandable. We +conducted extensive experiments on MedMNIST datasets, encompassing both 2D/3D +data and their high-resolution variants. The outcomes indicate that DataCook +effectively meets its objectives, preventing models trained on AntiAdv from +analyzing unauthorized data effectively, without compromising the validity and +accuracy of the data in legitimate scenarios. Code and data are available at +https://github.com/MedMNIST/DataCook. + +
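+ A hedged sketch of the anti-adversarial idea: instead of ascending the loss (as in FGSM), step against the loss gradient so the perturbed sample makes the authorized model more confident. This is illustrative only, not the DataCook pipeline itself.
+
+import torch
+import torch.nn.functional as F
+
+def craft_antiadv(model, x, y, epsilon=4 / 255):
+    x = x.clone().detach().requires_grad_(True)
+    loss = F.cross_entropy(model(x), y)
+    loss.backward()
+    # descend the loss: an imperceptible perturbation that boosts model confidence
+    return (x - epsilon * x.grad.sign()).clamp(0, 1).detach()
+
+# toy usage with a small classifier
+model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
+x, y = torch.rand(4, 3, 32, 32), torch.randint(0, 10, (4,))
+x_anti = craft_antiadv(model, x, y)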
+
+
+
+
+ + ☆ Multi-Task Dense Prediction via Mixture of Low-Rank Experts CVPR 2024 + + +
+ Previous multi-task dense prediction methods based on the Mixture of Experts +(MoE) have achieved strong performance, but they neglect the importance of +explicitly modeling the global relations among all tasks. In this paper, we +present a novel decoder-focused method for multi-task dense prediction, called +Mixture-of-Low-Rank-Experts (MLoRE). To model the global task relationships, +MLoRE adds a generic convolution path to the original MoE structure, where each +task feature can go through this path for explicit parameter sharing. +Furthermore, to control the parameters and computational cost brought by the +increase in the number of experts, we take inspiration from LoRA and propose to +leverage the low-rank format of a vanilla convolution in the expert network. +Since the low-rank experts have fewer parameters and can be dynamically +parameterized into the generic convolution, the parameters and computational +cost do not change much as the number of experts increases. Benefiting from this +design, we increase the number of experts and their receptive fields to enlarge +the representation capacity, facilitating the learning of multiple dense tasks in a +unified network. Extensive experiments on the PASCAL-Context and NYUD-v2 +benchmarks show that our MLoRE achieves superior performance compared to +previous state-of-the-art methods on all metrics. Our code is available at +https://github.com/YuqiYang213/MLoRE. + +
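+ A rough sketch (assumptions only, not the MLoRE release) of the core idea: a shared generic convolution path that every task feature passes through, plus several low-rank experts, each factored as a k x k convolution into a small rank followed by a 1 x 1 convolution back to full width, so adding experts stays cheap. The simple routing head here is a placeholder.
+
+import torch
+import torch.nn as nn
+
+class LowRankMoE(nn.Module):
+    def __init__(self, channels=64, rank=8, n_experts=4, kernel=3):
+        super().__init__()
+        self.generic = nn.Conv2d(channels, channels, kernel, padding=kernel // 2)
+        self.experts = nn.ModuleList(
+            nn.Sequential(nn.Conv2d(channels, rank, kernel, padding=kernel // 2),
+                          nn.Conv2d(rank, channels, 1))
+            for _ in range(n_experts))
+        self.router = nn.Sequential(nn.AdaptiveAvgPool2d(1), nn.Flatten(),
+                                    nn.Linear(channels, n_experts), nn.Softmax(dim=-1))
+
+    def forward(self, x):
+        gate = self.router(x)                              # (B, n_experts)
+        out = self.generic(x)                              # shared path for all tasks
+        for i, expert in enumerate(self.experts):
+            out = out + gate[:, i, None, None, None] * expert(x)
+        return out
+
+y = LowRankMoE()(torch.rand(2, 64, 32, 32))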
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Paired Diffusion: Generation of related, synthetic PET-CT-Segmentation + scans using Linked Denoising Diffusion Probabilistic Models + + +
+ The rapid advancement of Artificial Intelligence (AI) in biomedical imaging +and radiotherapy is hindered by the limited availability of large imaging data +repositories. With recent research and improvements in denoising diffusion +probabilistic models (DDPM), high-quality synthetic medical scans are now +possible. Despite this, there is currently no way of generating multiple +related images, such as a corresponding ground truth that can be used to train +models, so synthetic scans are often manually annotated before use. This +research introduces a novel architecture that is able to generate multiple, +related PET-CT-tumour mask pairs using paired networks and conditional +encoders. Our approach includes innovative, time step-controlled mechanisms and +a `noise-seeding' strategy to improve DDPM sampling consistency. While our +model requires a modified perceptual loss function to ensure accurate feature +alignment, we show the generation of clearly aligned synthetic images and an +improvement in segmentation accuracy when training with the generated images. + +
+
+ comment: to be published in IEEE International Symposium on Biomedical Imaging + 2024 +
+
+
+
+
+ + ☆ FastPerson: Enhancing Video Learning through Effective Video + Summarization that Preserves Linguistic and Visual Contexts + + +
+ Quickly understanding lengthy lecture videos is essential for learners with +limited time and interest in various topics to improve their learning +efficiency. To this end, video summarization has been actively researched to +enable users to view only important scenes from a video. However, these studies +focus on either the visual or audio information of a video and extract +important segments in the video. Therefore, there is a risk of missing +important information when both the teacher's speech and visual information on +the blackboard or slides are important, such as in a lecture video. To tackle +this issue, we propose FastPerson, a video summarization approach that +considers both the visual and auditory information in lecture videos. +FastPerson creates summary videos by utilizing audio transcriptions along with +on-screen images and text, minimizing the risk of overlooking crucial +information for learners. Further, it provides a feature that allows learners +to switch between the summary and original videos for each chapter of the +video, enabling them to adjust the pace of learning based on their interests +and level of understanding. We conducted an evaluation with 40 participants to +assess the effectiveness of our method and confirmed that it reduced viewing +time by 53\% at the same level of comprehension as that when using traditional +video playback methods. + +
+
+
+
+
+ + ☆ Deep Learning for Segmentation of Cracks in High-Resolution Images of + Steel Bridges + + +
+ Automating the current bridge visual inspection practices using drones and +image processing techniques is a prominent way to make these inspections more +effective, robust, and less expensive. In this paper, we investigate the +development of a novel deep-learning method for the detection of fatigue cracks +in high-resolution images of steel bridges. First, we present a novel and +challenging dataset comprising images of cracks in steel bridges. Second, +we integrate the ConvNext neural network with a previous state-of-the-art +encoder-decoder network for crack segmentation. We study and report the +effects of using background patches on network performance when +applied to high-resolution images of cracks in steel bridges. Finally, we +introduce a loss function that allows the use of more background patches for +the training process, which yields a significant reduction in false positive +rates. + +
+
+
+
+
+ + ☆ Invisible Gas Detection: An RGB-Thermal Cross Attention Network and A + New Benchmark + + +
+ The widespread use of various chemical gases in industrial processes +necessitates effective measures to prevent their leakage during transportation +and storage, given their high toxicity. Thermal infrared-based computer vision +detection techniques provide a straightforward approach to identify gas leakage +areas. However, the development of high-quality algorithms has been challenging +due to the low texture in thermal images and the lack of open-source datasets. +In this paper, we present the RGB-Thermal Cross Attention Network (RT-CAN), +which employs an RGB-assisted two-stream network architecture to integrate +texture information from RGB images and gas area information from thermal +images. Additionally, to facilitate research on invisible gas detection, we +introduce Gas-DB, an extensive open-source gas detection database including +about 1.3K well-annotated RGB-thermal images covering eight different collection +scenes. Experimental results demonstrate that our method successfully leverages +the advantages of both modalities, achieving state-of-the-art (SOTA) +performance among RGB-thermal methods, surpassing single-stream SOTA models in +terms of accuracy, Intersection over Union (IoU), and F2 metrics by 4.86%, 5.65%, +and 4.88%, respectively. The code and data will be made available soon. + +
+
+
+
+
+ + ☆ Groupwise Query Specialization and Quality-Aware Multi-Assignment for + Transformer-based Visual Relationship Detection CVPR 2024 + + +
+ Visual Relationship Detection (VRD) has seen significant advancements with +Transformer-based architectures recently. However, we identify two key +limitations in a conventional label assignment for training Transformer-based +VRD models, which is a process of mapping a ground-truth (GT) to a prediction. +Under the conventional assignment, an unspecialized query is trained since a +query is expected to detect every relation, which makes it difficult for a +query to specialize in specific relations. Furthermore, a query is also +insufficiently trained since a GT is assigned only to a single prediction, +therefore near-correct or even correct predictions are suppressed by being +assigned no relation as a GT. To address these issues, we propose Groupwise +Query Specialization and Quality-Aware Multi-Assignment (SpeaQ). Groupwise +Query Specialization trains a specialized query by dividing queries and +relations into disjoint groups and directing a query in a specific query group +solely toward relations in the corresponding relation group. Quality-Aware +Multi-Assignment further facilitates the training by assigning a GT to multiple +predictions that are significantly close to a GT in terms of a subject, an +object, and the relation in between. Experimental results and analyses show +that SpeaQ effectively trains specialized queries, which better utilize the +capacity of a model, resulting in consistent performance gains with zero +additional inference cost across multiple VRD models and benchmarks. Code is +available at https://github.com/mlvlab/SpeaQ. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Panonut360: A Head and Eye Tracking Dataset for Panoramic Video ACM MM + + +
+ With the rapid development and widespread application of VR/AR technology, +maximizing the quality of immersive panoramic video services that match users' +personal preferences and habits has become a long-standing challenge. +Understanding the saliency region where users focus, based on data collected +with HMDs, can promote multimedia encoding, transmission, and quality +assessment. At the same time, large-scale datasets are essential for +researchers and developers to explore short/long-term user behavior patterns +and train AI models related to panoramic videos. However, existing panoramic +video datasets often include low-frequency user head or eye movement data +through short-term videos only, lacking sufficient data for analyzing users' +Field of View (FoV) and generating video saliency regions. + Driven by these practical factors, in this paper, we present a head and eye +tracking dataset involving 50 users (25 males and 25 females) watching 15 +panoramic videos. The dataset provides details on the viewport and gaze +attention locations of users. In addition, we present some sample statistics +extracted from the dataset. For example, the deviation between head and eye +movements challenges the widely held assumption that gaze attention decreases +from the center of the FoV following a Gaussian distribution. Our analysis +reveals a consistent downward offset in gaze fixations relative to the FoV in +experimental settings involving multiple users and videos. This motivates the +dataset's name, Panonut: a saliency weighting shaped like a donut. Finally, we also +provide a script that generates saliency distributions based on given head or +eye coordinates and pre-generated saliency distribution map sets of each video +from the collected eye tracking data. + The dataset is available at: https://dianvrlab.github.io/Panonut360/. + +
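+ A small sketch (with assumed parameters, not the dataset's provided script) of generating a saliency map from a gaze point: a 2D Gaussian centered slightly below the viewport center, reflecting the downward offset of fixations reported above.
+
+import numpy as np
+
+def gaze_saliency(h, w, cx, cy, sigma=0.1, down_offset=0.08):
+    ys, xs = np.mgrid[0:h, 0:w]
+    cy_off = cy + down_offset * h                 # shift the peak below the FoV center
+    d2 = ((xs - cx) / w) ** 2 + ((ys - cy_off) / h) ** 2
+    sal = np.exp(-d2 / (2 * sigma ** 2))
+    return sal / sal.sum()                        # normalize to a distribution
+
+sal_map = gaze_saliency(1024, 2048, cx=1024, cy=512)
+print(sal_map.shape, sal_map.sum())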
+
+ comment: 7 pages,ACM MMSys'24 accepted +
+
+
+
+
+ + ☆ The Solution for the CVPR 2023 1st foundation model challenge-Track2 + + +
+ In this paper, we propose a solution for cross-modal transportation +retrieval. Due to the cross-domain problem of traffic images, we divide the +problem into two sub-tasks of pedestrian retrieval and vehicle retrieval +through a simple strategy. In pedestrian retrieval tasks, we use IRRA as the +base model and specifically design an attribute classification task to mine the +knowledge implied by attribute labels. More importantly, we use an Inclusion +Relation Matching strategy so that image-text pairs with an inclusion +relation have similar representations in the feature space. For the vehicle +retrieval task, we use BLIP as the base model. Since aligning the color +attributes of vehicles is challenging, we introduce attribute-based object +detection techniques to add color patch blocks to vehicle images for color data +augmentation. This serves as strong prior information, helping the model +perform the image-text alignment. At the same time, we incorporate labeled +attributes into the image-text alignment loss to learn fine-grained alignment +and prevent similar images and texts from being incorrectly separated. Our +approach ranked first in the final B-board test with a score of 70.9. + +
+
+
+
+
+ + ☆ Rotate to Scan: UNet-like Mamba with Triplet SSM Module for Medical + Image Segmentation + + +
+ Image segmentation holds a vital position in the realms of diagnosis and +treatment within the medical domain. Traditional convolutional neural networks +(CNNs) and Transformer models have made significant advancements in this realm, +but they still encounter challenges because of limited receptive fields or high +computational complexity. Recently, State Space Models (SSMs), particularly Mamba +and its variants, have demonstrated notable performance in the field of vision. +However, their feature extraction methods may not be sufficiently effective and +retain some redundant structures, leaving room for parameter reduction. +Motivated by previous spatial and channel attention methods, we propose Triplet +Mamba-UNet. The method leverages residual VSS Blocks to extract intensive +contextual features, while Triplet SSM is employed to fuse features across +spatial and channel dimensions. We conducted experiments on ISIC17, ISIC18, +CVC-300, CVC-ClinicDB, Kvasir-SEG, CVC-ColonDB, and Kvasir-Instrument datasets, +demonstrating the superior segmentation performance of our proposed TM-UNet. +Additionally, compared to the previous VM-UNet, our model achieves a one-third +reduction in parameters. + +
+
+
+
+
+ + ☆ PlainMamba: Improving Non-Hierarchical Mamba in Visual Recognition + + +
+ We present PlainMamba: a simple non-hierarchical state space model (SSM) +designed for general visual recognition. The recent Mamba model has shown how +SSMs can be highly competitive with other architectures on sequential data and +initial attempts have been made to apply it to images. In this paper, we +further adapt the selective scanning process of Mamba to the visual domain, +enhancing its ability to learn features from two-dimensional images by (i) a +continuous 2D scanning process that improves spatial continuity by ensuring +adjacency of tokens in the scanning sequence, and (ii) direction-aware updating +which enables the model to discern the spatial relations of tokens by encoding +directional information. Our architecture is designed to be easy to use and +easy to scale, formed by stacking identical PlainMamba blocks, resulting in a +model with constant width throughout all layers. The architecture is further +simplified by removing the need for special tokens. We evaluate PlainMamba on a +variety of visual recognition tasks including image classification, semantic +segmentation, object detection, and instance segmentation. Our method achieves +performance gains over previous non-hierarchical models and is competitive with +hierarchical alternatives. For tasks requiring high-resolution inputs, in +particular, PlainMamba requires much less computing while maintaining high +performance. Code and models are available at +https://github.com/ChenhongyiYang/PlainMamba + +
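+ A toy sketch of the continuous 2D scanning idea described above (illustrative only, not the PlainMamba code): visit image tokens in a snake order so that consecutive tokens in the sequence are always spatially adjacent, unlike plain row-major raster scanning, which jumps across rows.
+
+import numpy as np
+
+def continuous_scan_order(h, w):
+    order = []
+    for row in range(h):
+        cols = range(w) if row % 2 == 0 else range(w - 1, -1, -1)
+        order.extend(row * w + c for c in cols)
+    return np.array(order)
+
+tokens = np.arange(4 * 4).reshape(16, 1)          # 16 toy tokens on a 4x4 grid
+scanned = tokens[continuous_scan_order(4, 4)]     # sequence fed to the SSM block
+print(scanned.ravel())                            # [0 1 2 3 7 6 5 4 8 9 10 11 15 14 13 12]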
+
+
+
+
+ + ☆ AniPortrait: Audio-Driven Synthesis of Photorealistic Portrait Animation + + +
+ In this study, we propose AniPortrait, a novel framework for generating +high-quality animation driven by audio and a reference portrait image. Our +methodology is divided into two stages. Initially, we extract 3D intermediate +representations from audio and project them into a sequence of 2D facial +landmarks. Subsequently, we employ a robust diffusion model, coupled with a +motion module, to convert the landmark sequence into photorealistic and +temporally consistent portrait animation. Experimental results demonstrate the +superiority of AniPortrait in terms of facial naturalness, pose diversity, and +visual quality, thereby offering an enhanced perceptual experience. Moreover, +our methodology exhibits considerable potential in terms of flexibility and +controllability, which can be effectively applied in areas such as facial +motion editing or face reenactment. We release code and model weights at +https://github.com/scutzzj/AniPortrait + +
+
+
+
+
+ + ☆ Manifold-Guided Lyapunov Control with Diffusion Models + + +
+ This paper presents a novel approach to generating stabilizing controllers +for a large class of dynamical systems using diffusion models. The core +objective is to develop stabilizing control functions by identifying the +closest asymptotically stable vector field relative to a predetermined manifold +and adjusting the control function based on this finding. To achieve this, we +employ a diffusion model trained on pairs consisting of asymptotically stable +vector fields and their corresponding Lyapunov functions. Our numerical results +demonstrate that this pre-trained model can achieve stabilization over +previously unseen systems efficiently and rapidly, showcasing the potential of +our approach in fast zero-shot control and generalizability. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Not All Similarities Are Created Equal: Leveraging Data-Driven Biases to + Inform GenAI Copyright Disputes + + +
+ The advent of Generative Artificial Intelligence (GenAI) models, including +GitHub Copilot, OpenAI GPT, and Stable Diffusion, has revolutionized content +creation, enabling non-professionals to produce high-quality content across +various domains. This transformative technology has led to a surge of synthetic +content and sparked legal disputes over copyright infringement. To address +these challenges, this paper introduces a novel approach that leverages the +learning capacity of GenAI models for copyright legal analysis, demonstrated +with GPT2 and Stable Diffusion models. Copyright law distinguishes between +original expressions and generic ones (Scènes à faire), protecting the +former and permitting reproduction of the latter. However, this distinction has +historically been challenging to make consistently, leading to over-protection +of copyrighted works. GenAI offers an unprecedented opportunity to enhance this +legal analysis by revealing shared patterns in preexisting works. We propose a +data-driven approach to identify the genericity of works created by GenAI, +employing "data-driven bias" to assess the genericity of expressive +compositions. This approach aids in copyright scope determination by utilizing +the capabilities of GenAI to identify and prioritize expressive elements and +rank them according to their frequency in the model's dataset. The potential +implications of measuring expressive genericity for copyright law are profound. +Such scoring could assist courts in determining copyright scope during +litigation, inform the registration practices of Copyright Offices, allowing +registration of only highly original synthetic works, and help copyright owners +signal the value of their works and facilitate fairer licensing deals. More +generally, this approach offers valuable insights to policymakers grappling +with adapting copyright law to the challenges posed by the era of GenAI. + +
+
+ comment: Presented at ACM CSLAW 2024 +
+
+
+
+
+ + ☆ Hierarchical Light Transformer Ensembles for Multimodal Trajectory + Forecasting + + +
+ Accurate trajectory forecasting is crucial for the performance of various +systems, such as advanced driver-assistance systems and self-driving vehicles. +These forecasts make it possible to anticipate events leading to collisions and, +therefore, to mitigate them. Deep Neural Networks have excelled in motion +forecasting, but issues like overconfidence and uncertainty quantification +persist. Deep Ensembles address these concerns, yet applying them to multimodal +distributions remains challenging. In this paper, we propose a novel approach +named Hierarchical Light Transformer Ensembles (HLT-Ens), aimed at efficiently +training an ensemble of Transformer architectures using a novel hierarchical +loss function. HLT-Ens leverages grouped fully connected layers, inspired by +grouped convolution techniques, to capture multimodal distributions +effectively. Through extensive experimentation, we demonstrate that HLT-Ens +achieves state-of-the-art performance levels, offering a promising avenue for +improving trajectory forecasting techniques. + +
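+ A compact sketch of a grouped fully connected layer in the spirit described above (sizes and usage are illustrative assumptions, not the HLT-Ens implementation): one Conv1d with groups=n_members acts as n_members independent linear heads, so an ensemble of small heads runs in a single batched operation.
+
+import torch
+import torch.nn as nn
+
+class GroupedLinear(nn.Module):
+    def __init__(self, in_features, out_features, n_members):
+        super().__init__()
+        self.n = n_members
+        self.proj = nn.Conv1d(in_features * n_members, out_features * n_members,
+                              kernel_size=1, groups=n_members)
+
+    def forward(self, x):                         # x: (B, n_members, in_features)
+        b = x.shape[0]
+        out = self.proj(x.reshape(b, -1, 1))      # grouped conv = per-member linear
+        return out.reshape(b, self.n, -1)         # (B, n_members, out_features)
+
+heads = GroupedLinear(in_features=128, out_features=6, n_members=4)
+modes = heads(torch.rand(2, 4, 128))              # 4 ensemble members, 6 outputs each
+print(modes.shape)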
+
+
+
+
+ + ☆ Predicting Perceived Gloss: Do Weak Labels Suffice? + + +
+ Estimating perceptual attributes of materials directly from images is a +challenging task due to their complex, not fully-understood interactions with +external factors, such as geometry and lighting. Supervised deep learning +models have recently been shown to outperform traditional approaches, but rely +on large datasets of human-annotated images for accurate perception +predictions. Obtaining reliable annotations is a costly endeavor, aggravated by +the limited ability of these models to generalise to different aspects of +appearance. In this work, we show how a much smaller set of human annotations +("strong labels") can be effectively augmented with automatically derived "weak +labels" in the context of learning a low-dimensional image-computable gloss +metric. We evaluate three alternative weak labels for predicting human gloss +perception from limited annotated data. Incorporating weak labels enhances our +gloss prediction beyond the current state of the art. Moreover, it enables a +substantial reduction in human annotation costs without sacrificing accuracy, +whether working with rendered images or real photographs. + +
+
+ comment: Computer Graphics Forum (Eurographics 2024) +
+
+
+
+
+ + ☆ DiffFAE: Advancing High-fidelity One-shot Facial Appearance Editing with + Space-sensitive Customization and Semantic Preservation + + +
+ Facial Appearance Editing (FAE) aims to modify physical attributes, such as +pose, expression and lighting, of human facial images while preserving +attributes like identity and background, and is of great importance in +photography. In spite of the great progress in this area, current research +generally faces three challenges: low generation fidelity, poor attribute +preservation, and inefficient inference. To overcome the above challenges, this +paper presents DiffFAE, a one-stage and highly-efficient diffusion-based +framework tailored for high-fidelity FAE. For high-fidelity query attributes +transfer, we adopt Space-sensitive Physical Customization (SPC), which ensures +the fidelity and generalization ability by utilizing rendering texture derived +from 3D Morphable Model (3DMM). In order to preserve source attributes, we +introduce the Region-responsive Semantic Composition (RSC). This module is +guided to learn decoupled source-regarding features, thereby better preserving +the identity and alleviating artifacts from non-facial attributes such as hair, +clothes, and background. We further introduce a consistency regularization for +our pipeline to enhance editing controllability by leveraging prior knowledge +in the attention matrices of the diffusion model. Extensive experiments demonstrate +the superiority of DiffFAE over existing methods, achieving state-of-the-art +performance in facial appearance editing. + +
+
+
+
+
+ + ☆ Exploring Dynamic Transformer for Efficient Object Tracking + + +
+ The speed-precision trade-off is a critical problem for visual object +tracking which usually requires low latency and deployment on constrained +resources. Existing solutions for efficient tracking mainly focus on adopting +light-weight backbones or modules, which nevertheless come at the cost of a +sacrifice in precision. In this paper, inspired by dynamic network routing, we +propose DyTrack, a dynamic transformer framework for efficient tracking. +Real-world tracking scenarios exhibit diverse levels of complexity. We argue +that a simple network is sufficient for easy frames in video sequences, while +more computation could be assigned to difficult ones. DyTrack automatically +learns to configure proper reasoning routes for various inputs, gaining better +utilization of the available computational budget. Thus, it can achieve higher +performance with the same running speed. We formulate instance-specific +tracking as a sequential decision problem and attach terminating branches to +intermediate layers of the entire model. Especially, to fully utilize the +computations, we introduce the feature recycling mechanism to reuse the outputs +of predecessors. Furthermore, a target-aware self-distillation strategy is +designed to enhance the discriminating capabilities of early predictions by +effectively mimicking the representation pattern of the deep model. Extensive +experiments on multiple benchmarks demonstrate that DyTrack achieves promising +speed-precision trade-offs with only a single model. For instance, DyTrack +obtains 64.9% AUC on LaSOT with a speed of 256 fps. + +
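+ A simplified sketch of instance-adaptive early exiting in the spirit of the terminating branches described above (this illustrates the routing idea only, not DyTrack itself; the confidence threshold and toy heads are assumptions).
+
+import torch
+import torch.nn as nn
+
+class EarlyExitNet(nn.Module):
+    def __init__(self, dim=64, n_blocks=4, threshold=0.9):
+        super().__init__()
+        self.blocks = nn.ModuleList(nn.Sequential(nn.Linear(dim, dim), nn.ReLU())
+                                    for _ in range(n_blocks))
+        self.heads = nn.ModuleList(nn.Linear(dim, 2) for _ in range(n_blocks))
+        self.threshold = threshold
+
+    @torch.no_grad()
+    def forward(self, x):                          # x: (1, dim) features of one frame
+        for depth, (block, head) in enumerate(zip(self.blocks, self.heads)):
+            x = block(x)
+            probs = head(x).softmax(dim=-1)
+            if probs.max() >= self.threshold:      # easy frame: stop computation here
+                return probs, depth + 1
+        return probs, len(self.blocks)             # hard frame: used the full depth
+
+probs, depth_used = EarlyExitNet()(torch.rand(1, 64))
+print(depth_used, "blocks used")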
+
+
+
+
+ + ☆ High-Resolution Image Translation Model Based on Grayscale Redefinition + + +
+ Image-to-image translation is a technique that focuses on transferring images +from one domain to another while maintaining the essential content +representations. In recent years, image-to-image translation has gained +significant attention and achieved remarkable advancements due to its diverse +applications in computer vision and image processing tasks. In this work, we +propose an innovative method for image translation between different domains. +For high-resolution image translation tasks, we use a grayscale adjustment +method to achieve pixel-level translation. For other tasks, we utilize the +Pix2PixHD model with a coarse-to-fine generator, multi-scale discriminator, and +improved loss to enhance the image translation performance. On the other hand, +to tackle the issue of sparse training data, we adopt model weight +initialization from other task to optimize the performance of the current task. + +
+
+
+
+
+ + ☆ Learning with Unreliability: Fast Few-shot Voxel Radiance Fields with + Relative Geometric Consistency CVPR 2024 + + +
+ We propose a voxel-based optimization framework, ReVoRF, for few-shot +radiance fields that strategically address the unreliability in pseudo novel +view synthesis. Our method pivots on the insight that relative depth +relationships within neighboring regions are more reliable than the absolute +color values in disoccluded areas. Consequently, we devise a bilateral +geometric consistency loss that carefully navigates the trade-off between color +fidelity and geometric accuracy in the context of depth consistency for +uncertain regions. Moreover, we present a reliability-guided learning strategy +to discern and utilize the variable quality across synthesized views, +complemented by a reliability-aware voxel smoothing algorithm that smoothens +the transition between reliable and unreliable data patches. Our approach +allows for a more nuanced use of all available data, promoting enhanced +learning from regions previously considered unsuitable for high-quality +reconstruction. Extensive experiments across diverse datasets reveal that our +approach attains significant gains in efficiency and accuracy, delivering +rendering speeds of 3 FPS, 7 mins to train a $360^\circ$ scene, and a 5\% +improvement in PSNR over existing few-shot methods. Code is available at +https://github.com/HKCLynn/ReVoRF. + +
+
+ comment: CVPR 2024 final version +
+
+
+
+
+ + ☆ UADA3D: Unsupervised Adversarial Domain Adaptation for 3D Object + Detection with Sparse LiDAR and Large Domain Gaps + + +
+ In this study, we address a gap in existing unsupervised domain adaptation +approaches on LiDAR-based 3D object detection, which have predominantly +concentrated on adapting between established, high-density autonomous driving +datasets. We focus on sparser point clouds, capturing scenarios from different +perspectives: not just from vehicles on the road but also from mobile robots on +sidewalks, which encounter significantly different environmental conditions and +sensor configurations. We introduce Unsupervised Adversarial Domain Adaptation +for 3D Object Detection (UADA3D). UADA3D does not depend on pre-trained source +models or teacher-student architectures. Instead, it uses an adversarial +approach to directly learn domain-invariant features. We demonstrate its +efficacy in various adaptation scenarios, showing significant improvements in +both self-driving car and mobile robot domains. Our code is open-source and +will be available soon. + +
+
+
+
+
+ + ☆ AniArtAvatar: Animatable 3D Art Avatar from a Single Image + + +
+ We present a novel approach for generating animatable 3D-aware art avatars +from a single image, with controllable facial expressions, head poses, and +shoulder movements. Unlike previous reenactment methods, our approach utilizes +a view-conditioned 2D diffusion model to synthesize multi-view images from a +single art portrait with a neutral expression. With the generated colors and +normals, we synthesize a static avatar using an SDF-based neural surface. For +avatar animation, we extract control points, transfer the motion with these +points, and deform the implicit canonical space. Firstly, we render the front +image of the avatar, extract the 2D landmarks, and project them to the 3D space +using a trained SDF network. We extract 3D driving landmarks using 3DMM and +transfer the motion to the avatar landmarks. To animate the avatar pose, we +manually set the body height and bound the head and torso of an avatar with two +cages. The head and torso can be animated by transforming the two cages. Our +approach is a one-shot pipeline that can be applied to various styles. +Experiments demonstrate that our method can generate high-quality 3D art +avatars with desired control over different motions. + +
+
+
+
+
+ + ☆ Grad-CAMO: Learning Interpretable Single-Cell Morphological Profiles + from 3D Cell Painting Images + + +
+ Despite their black-box nature, deep learning models are extensively used in +image-based drug discovery to extract feature vectors from single cells in +microscopy images. To better understand how these networks perform +representation learning, we employ visual explainability techniques (e.g., +Grad-CAM). Our analyses reveal several mechanisms by which supervised models +cheat, exploiting biologically irrelevant pixels when extracting morphological +features from images, such as noise in the background. This raises doubts +regarding the fidelity of learned single-cell representations and their +relevance when investigating downstream biological questions. To address this +misalignment between researcher expectations and machine behavior, we introduce +Grad-CAMO, a novel single-cell interpretability score for supervised feature +extractors. Grad-CAMO measures the proportion of a model's attention that is +concentrated on the cell of interest versus the background. This metric can be +assessed per-cell or averaged across a validation set, offering a tool to audit +individual feature vectors or guide the improved design of deep learning +architectures. Importantly, Grad-CAMO seamlessly integrates into existing +workflows, requiring no dataset or model modifications, and is compatible with +both 2D and 3D Cell Painting data. Additional results are available at +https://github.com/eigenvivek/Grad-CAMO. + +
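+ A short sketch of the kind of score the abstract describes (illustrative, with an assumed heatmap source, not the released Grad-CAMO code): given a Grad-CAM heatmap and a binary mask of the cell of interest, measure the fraction of the model's attention that falls inside the cell rather than on the background.
+
+import numpy as np
+
+def attention_on_cell(heatmap, cell_mask, eps=1e-8):
+    heatmap = np.clip(heatmap, 0, None)            # keep only positive attributions
+    return float((heatmap * cell_mask).sum() / (heatmap.sum() + eps))
+
+heatmap = np.random.rand(96, 96)                   # stand-in for a Grad-CAM output
+cell_mask = np.zeros((96, 96)); cell_mask[32:64, 32:64] = 1
+score = attention_on_cell(heatmap, cell_mask)      # near 1.0 means attention stays on the cell
+print(round(score, 3))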
+
+
+
+
+ + ☆ MMVP: A Multimodal MoCap Dataset with Vision and Pressure Sensors CVPR2024 + + +
+ Foot contact is an important cue not only for human motion capture but also +for motion understanding and physically plausible motion generation. However, +most of the foot-contact annotations in existing datasets are estimated by +purely visual matching and distance thresholding, which results in low accuracy +and coarse granularity. Even though existing multimodal datasets +synergistically capture plantar pressure (foot contact) and visual signals, +they are specifically designed for small-range and slow motion such as Taiji +Quan and Yoga. Therefore, there is still a lack of a vision-pressure multimodal +dataset with large-range and fast human motion, as well as accurate and dense +foot-contact annotation. To fill this gap, we propose a Multimodal MoCap +Dataset with Vision and Pressure sensors, named MMVP. MMVP provides accurate +and dense plantar pressure signals synchronized with RGBD observations, which +is especially useful for plausible shape estimation, robust pose fitting +without foot drifting, and accurate global translation tracking. To validate +the dataset, we propose an RGBD-P SMPL fitting method and also a +monocular-video-based baseline framework, VP-MoCap, for human motion capture. +Experiments demonstrate that our RGBD-P SMPL Fitting results significantly +outperform pure visual motion capture. Moreover, VP-MoCap outperforms SOTA +methods in foot-contact and global translation estimation accuracy. We believe +the configuration of the dataset and the baseline frameworks will stimulate +research in this direction and also provide a good reference for MoCap +applications in various domains. Project page: +https://haolyuan.github.io/MMVP-Dataset/. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ☆ Fake or JPEG? Revealing Common Biases in Generated Image Detection + Datasets + + +
+ The widespread adoption of generative image models has highlighted the urgent
+need to detect artificial content, which is a crucial step in combating
+widespread manipulation and misinformation. Consequently, numerous detectors
+and associated datasets have emerged. However, many of these datasets
+inadvertently introduce undesirable biases, thereby impacting the effectiveness
+and evaluation of detectors. In this paper, we emphasize that many datasets for
+AI-generated image detection contain biases related to JPEG compression and
+image size. Using the GenImage dataset, we demonstrate that detectors indeed
+learn from these undesired factors. Furthermore, we show that removing the
+named biases substantially increases robustness to JPEG compression and
+significantly alters the cross-generator performance of evaluated detectors.
+Specifically, it leads to an increase of more than 11 percentage points in
+cross-generator performance for ResNet50 and Swin-T detectors on the GenImage
+dataset, achieving state-of-the-art results.
+ We provide the dataset and source code of this paper on the anonymous
+website: https://www.unbiased-genimage.org
+
+
+
+
+ + ☆ Dual Memory Networks: A Versatile Adaptation Approach for + Vision-Language Models CVPR2024 + + +
+ With the emergence of pre-trained vision-language models like CLIP, how to +adapt them to various downstream classification tasks has garnered significant +attention in recent research. The adaptation strategies can be typically +categorized into three paradigms: zero-shot adaptation, few-shot adaptation, +and the recently-proposed training-free few-shot adaptation. Most existing +approaches are tailored for a specific setting and can only cater to one or two +of these paradigms. In this paper, we introduce a versatile adaptation approach +that can effectively work under all three settings. Specifically, we propose +the dual memory networks that comprise dynamic and static memory components. +The static memory caches training data knowledge, enabling training-free +few-shot adaptation, while the dynamic memory preserves historical test +features online during the testing process, allowing for the exploration of +additional data insights beyond the training set. This novel capability +enhances model performance in the few-shot setting and enables model usability +in the absence of training data. The two memory networks employ the same +flexible memory interactive strategy, which can operate in a training-free mode +and can be further enhanced by incorporating learnable projection layers. Our +approach is tested across 11 datasets under the three task settings. +Remarkably, in the zero-shot scenario, it outperforms existing methods by over +3\% and even shows superior results against methods utilizing external training +data. Additionally, our method exhibits robust performance against natural +distribution shifts. Codes are available at \url{https://github.com/YBZh/DMN}. + +
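+ As a loose illustration of the static-memory idea above, a training-free
+cache classifier can score a test feature against cached training features;
+all names and the sharpening parameter below are assumptions of this sketch,
+not the DMN implementation:
+
+    import torch
+    import torch.nn.functional as F
+
+    def cache_classifier_logits(test_feat, memory_keys, memory_labels,
+                                num_classes, beta=5.0):
+        """Training-free logits from a static feature memory (illustrative only).
+
+        test_feat:     (d,) L2-normalized image feature of the test sample.
+        memory_keys:   (n, d) L2-normalized features of cached training images.
+        memory_labels: (n,) integer class labels of the cached images.
+        """
+        sims = memory_keys @ test_feat                  # cosine similarities, (n,)
+        weights = torch.exp(-beta * (1.0 - sims))       # closer cache entries count more
+        one_hot = F.one_hot(memory_labels, num_classes).float()
+        return weights @ one_hot                        # (num_classes,) cache-based logits
+
+ A dynamic memory in the same spirit would simply append test-time features to
+memory_keys and memory_labels as they arrive during testing.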
+
+ comment: CVPR2024; Codes are available at \url{https://github.com/YBZh/DMN} +
+
+
+
+
+ + ☆ DeepMIF: Deep Monotonic Implicit Fields for Large-Scale LiDAR 3D Mapping + + +
+ Recently, significant progress has been achieved in sensing real large-scale
+outdoor 3D environments, particularly by using modern acquisition equipment
+such as LiDAR sensors. Unfortunately, they are fundamentally limited in their
+ability to produce dense, complete 3D scenes. To address this issue, recent
+learning-based methods integrate neural implicit representations and
+optimizable feature grids to approximate surfaces of 3D scenes. However,
+naively fitting samples along raw LiDAR rays leads to noisy 3D mapping results
+due to the nature of sparse, conflicting LiDAR measurements. In this work, we
+instead depart from fitting LiDAR data exactly and let the network optimize a
+non-metric monotonic implicit field defined in 3D space. To fit our field, we
+design a learning system integrating a monotonicity loss that enables
+optimizing neural monotonic fields and leverages recent progress in large-scale
+3D mapping. Our algorithm achieves high-quality dense 3D mapping performance as
+captured by multiple quantitative and perceptual measures and visual results
+obtained for Mai City, Newer College, and KITTI benchmarks. The code of our
+approach will be made publicly available.
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ Practical Applications of Advanced Cloud Services and Generative AI + Systems in Medical Image Analysis + + +
+ The medical field is one of the most important application areas of
+artificial intelligence technology. With the explosive growth and
+diversification of medical data, as well as the continuous improvement of
+medical needs and challenges, artificial intelligence technology is playing an
+increasingly important role in the medical field. Artificial intelligence
+technologies represented by computer vision, natural language processing, and
+machine learning have penetrated widely into diverse scenarios such as
+medical imaging, health management, medical information, and drug research and
+development, and have become an important driving force for improving the
+level and quality of medical services. The article explores the transformative
+potential of generative AI in medical imaging, emphasizing its ability to
+generate synthetic data, enhance images, aid in anomaly detection, and
+facilitate image-to-image translation. Despite challenges like model
+complexity, the applications of generative models in healthcare, including
+Med-PaLM 2 technology, show promising results. By addressing limitations in
+dataset size and diversity, these models contribute to more accurate diagnoses
+and improved patient outcomes. However, ethical considerations and
+collaboration among stakeholders are essential for responsible implementation.
+Through experiments leveraging GANs to augment brain tumor MRI datasets, the
+study demonstrates how generative AI can enhance image quality and diversity,
+ultimately advancing medical diagnostics and patient care.
+
+
+
+
+ + ☆ A Gaze-grounded Visual Question Answering Dataset for Clarifying + Ambiguous Japanese Questions LREC + + +
+ Situated conversations that refer to visual information, as in visual
+question answering (VQA), often contain ambiguities caused by reliance on
+directive information. This problem is exacerbated because some languages,
+such as Japanese, often omit subject or object terms. Such ambiguities in
+questions are often clarified by the contexts in conversational situations,
+such as joint attention with a user or user gaze information. In this study,
+we propose the Gaze-grounded VQA dataset (GazeVQA), which clarifies ambiguous
+questions by focusing on a clarification process complemented by gaze
+information. We also propose a method that utilizes gaze target estimation
+results to improve the accuracy of GazeVQA tasks. Our experimental results
+showed that the proposed method improved the performance of a VQA system on
+GazeVQA in some cases and identified some typical problems of GazeVQA tasks
+that need to be improved.
+
+ comment: LREC-COLING 2024 +
+
+
+
+
+ + ☆ WordRobe: Text-Guided Generation of Textured 3D Garments + + +
+ In this paper, we tackle a new and challenging problem of text-driven +generation of 3D garments with high-quality textures. We propose "WordRobe", a +novel framework for the generation of unposed & textured 3D garment meshes from +user-friendly text prompts. We achieve this by first learning a latent +representation of 3D garments using a novel coarse-to-fine training strategy +and a loss for latent disentanglement, promoting better latent interpolation. +Subsequently, we align the garment latent space to the CLIP embedding space in +a weakly supervised manner, enabling text-driven 3D garment generation and +editing. For appearance modeling, we leverage the zero-shot generation +capability of ControlNet to synthesize view-consistent texture maps in a single +feed-forward inference step, thereby drastically decreasing the generation time +as compared to existing methods. We demonstrate superior performance over +current SOTAs for learning 3D garment latent space, garment interpolation, and +text-driven texture synthesis, supported by quantitative evaluation and +qualitative user study. The unposed 3D garment meshes generated using WordRobe +can be directly fed to standard cloth simulation & animation pipelines without +any post-processing. + +
+
+
+
+
+ + ☆ NeRF-HuGS: Improved Neural Radiance Fields in Non-static Scenes Using + Heuristics-Guided Segmentation CVPR2024 + + +
+ Neural Radiance Field (NeRF) has been widely recognized for its excellence in +novel view synthesis and 3D scene reconstruction. However, their effectiveness +is inherently tied to the assumption of static scenes, rendering them +susceptible to undesirable artifacts when confronted with transient distractors +such as moving objects or shadows. In this work, we propose a novel paradigm, +namely "Heuristics-Guided Segmentation" (HuGS), which significantly enhances +the separation of static scenes from transient distractors by harmoniously +combining the strengths of hand-crafted heuristics and state-of-the-art +segmentation models, thus significantly transcending the limitations of +previous solutions. Furthermore, we delve into the meticulous design of +heuristics, introducing a seamless fusion of Structure-from-Motion (SfM)-based +heuristics and color residual heuristics, catering to a diverse range of +texture profiles. Extensive experiments demonstrate the superiority and +robustness of our method in mitigating transient distractors for NeRFs trained +in non-static scenes. Project page: https://cnhaox.github.io/NeRF-HuGS/. + +
+
+ comment: To appear in CVPR2024 +
+
+
+
+
+ + ☆ Boosting Few-Shot Learning with Disentangled Self-Supervised Learning + and Meta-Learning for Medical Image Classification + + +
+ Background and objective: Employing deep learning models in critical domains
+such as medical imaging poses challenges associated with the limited
+availability of training data. We present a strategy for improving the
+performance and generalization capabilities of models trained in low-data
+regimes. Methods: The proposed method starts with a pre-training phase, where
+features learned in a self-supervised learning setting are disentangled to
+improve the robustness of the representations for downstream tasks. We then
+introduce a meta-fine-tuning step, leveraging related classes between
+meta-training and meta-testing phases but varying the granularity level. This
+approach aims to enhance the model's generalization capabilities by exposing
+it to more challenging classification tasks during meta-training and
+evaluating it during meta-testing on easier tasks that hold greater clinical
+relevance. We demonstrate the effectiveness of the proposed approach through a
+series of experiments exploring several backbones, as well as diverse
+pre-training and fine-tuning schemes, on two distinct medical tasks, i.e.,
+classification of prostate cancer aggressiveness from MRI data and
+classification of breast cancer malignancy from microscopic images. Results:
+Our results indicate that the proposed approach consistently yields superior
+performance w.r.t. ablation experiments, maintaining competitiveness even when
+a distribution shift between training and evaluation data occurs. Conclusion:
+Extensive experiments demonstrate the effectiveness and wide applicability of
+the proposed approach. We hope that this work will add another solution to the
+arsenal for addressing learning issues in data-scarce imaging domains.
+
+ comment: 20 pages, 4 figures, 4 tables. Submitted to Elsevier on 25 March 2024 +
+
+
+
+
+ + ☆ Equipping Sketch Patches with Context-Aware Positional Encoding for + Graphic Sketch Representation + + +
+ The drawing order of a sketch records how it is created stroke-by-stroke by a
+human being. For graphic sketch representation learning, recent studies have
+injected sketch drawing orders into graph edge construction by linking each
+patch to another in accordance with a temporal-based nearest neighboring
+strategy. However, such constructed graph edges may be unreliable, since a
+sketch could have variants of drawings. In this paper, we propose a
+variant-drawing-protected method by equipping sketch patches with context-aware
+positional encoding (PE) to make better use of drawing orders for learning
+graphic sketch representation. Instead of injecting sketch drawing orders into
+graph edges, we embed this sequential information into graph nodes only. More
+specifically, each patch embedding is equipped with a sinusoidal absolute PE
+to highlight its sequential position in the drawing order, and its neighboring
+patches, ranked by the values of self-attention scores between patch
+embeddings, are equipped with learnable relative PEs to restore the contextual
+positions within a neighborhood. During message aggregation via graph
+convolutional networks, a node receives both semantic contents from patch
+embeddings and contextual patterns from the PEs of its neighbors, arriving at
+drawing-order-enhanced sketch representations. Experimental results indicate
+that our method significantly improves sketch healing and controllable sketch
+synthesis.
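+ The sinusoidal absolute PE mentioned above is the standard transformer-style
+encoding applied to drawing-order positions; a generic sketch (not the
+authors' exact implementation, and assuming an even embedding dimension) is:
+
+    import torch
+
+    def sinusoidal_pe(num_positions, dim):
+        """Standard sinusoidal absolute PE, one row per drawing-order position (dim even)."""
+        positions = torch.arange(num_positions, dtype=torch.float32).unsqueeze(1)
+        freqs = torch.exp(torch.arange(0, dim, 2, dtype=torch.float32)
+                          * (-torch.log(torch.tensor(10000.0)) / dim))
+        pe = torch.zeros(num_positions, dim)
+        pe[:, 0::2] = torch.sin(positions * freqs)      # even channels
+        pe[:, 1::2] = torch.cos(positions * freqs)      # odd channels
+        return pe
+
+ Patch embeddings of shape (num_positions, dim) would then simply be augmented
+as patch_embeddings + sinusoidal_pe(num_positions, dim).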
+
+
+
+
+ + ☆ Boosting Adversarial Training via Fisher-Rao Norm-based Regularization CVPR2024 + + +
+ Adversarial training is extensively utilized to improve the adversarial
+robustness of deep neural networks. Yet, mitigating the degradation of standard
+generalization performance in adversarially trained models remains an open
+problem. This paper attempts to resolve this issue through the lens of model
+complexity. First, we leverage the Fisher-Rao norm, a geometrically invariant
+metric for model complexity, to establish the non-trivial bounds of the
+Cross-Entropy Loss-based Rademacher complexity for a ReLU-activated Multi-Layer
+Perceptron. Then we generalize a complexity-related variable, which is
+sensitive to the changes in model width and the trade-off factors in
+adversarial training. Moreover, intensive empirical evidence validates that
+this variable highly correlates with the generalization gap of Cross-Entropy
+loss between adversarially trained and standard-trained models, especially
+during the initial and final phases of the training process. Building upon
+this observation, we propose a novel regularization framework, called
+Logit-Oriented Adversarial Training (LOAT), which can mitigate the trade-off
+between robustness and accuracy while imposing only a negligible increase in
+computational overhead. Our extensive experiments demonstrate that the proposed
+regularization strategy can boost the performance of the prevalent adversarial
+training algorithms, including PGD-AT, TRADES, TRADES (LSE), MART, and DM-AT,
+across various network architectures. Our code will be available at
+https://github.com/TrustAI/LOAT.
+
+ comment: This paper has been accepted to CVPR2024 +
+
+
+
+
+ + ☆ Random-coupled Neural Network + + +
+ Improving the efficiency of current neural networks and modeling them after
+biological neural systems have become popular research directions in recent
+years. The pulse-coupled neural network (PCNN) is a widely applied model for
+imitating the computational characteristics of the human brain in computer
+vision and neural network fields. However, differences between the PCNN and
+biological neural systems remain: limited neural connections, high
+computational cost, and a lack of stochastic properties. In this study, the
+random-coupled neural network (RCNN) is proposed. It overcomes these
+difficulties in PCNN's neuromorphic computing via a random inactivation
+process. This process randomly closes some neural connections in the RCNN
+model, realized by the random inactivation weight matrix of the link input.
+This relieves the computational burden of PCNN, making it affordable to
+achieve vast neural connections. Furthermore, the image and video processing
+mechanisms of RCNN are investigated. It encodes constant stimuli as periodic
+spike trains and periodic stimuli as chaotic spike trains, matching biological
+neural information encoding characteristics. Finally, the RCNN is applied to
+image segmentation, fusion, and pulse shape discrimination subtasks. It is
+demonstrated to be robust, efficient, and highly noise-resistant, with
+outstanding performance in all applications mentioned above.
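+ One way to read the random inactivation described above is as a random binary
+mask applied to the coupling kernel before computing the linking input; the
+sketch below follows that reading and is an illustration only, not the paper's
+implementation (kernel, keep_prob, and function names are assumptions):
+
+    import numpy as np
+    from scipy.ndimage import convolve
+
+    def random_coupled_link_input(stimulus, kernel, keep_prob=0.7, rng=None):
+        """Linking input with a random subset of coupling connections switched off."""
+        rng = np.random.default_rng() if rng is None else rng
+        mask = rng.random(kernel.shape) < keep_prob   # random inactivation of connections
+        return convolve(stimulus, kernel * mask, mode="constant")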
+
+
+
+
+ + ☆ DS-AL: A Dual-Stream Analytic Learning for Exemplar-Free + Class-Incremental Learning AAAI 2024 + + +
+ Class-incremental learning (CIL) under an exemplar-free constraint has +presented a significant challenge. Existing methods adhering to this constraint +are prone to catastrophic forgetting, far more so than replay-based techniques +that retain access to past samples. In this paper, to solve the exemplar-free +CIL problem, we propose a Dual-Stream Analytic Learning (DS-AL) approach. The +DS-AL contains a main stream offering an analytical (i.e., closed-form) linear +solution, and a compensation stream improving the inherent under-fitting +limitation due to adopting linear mapping. The main stream redefines the CIL +problem into a Concatenated Recursive Least Squares (C-RLS) task, allowing an +equivalence between the CIL and its joint-learning counterpart. The +compensation stream is governed by a Dual-Activation Compensation (DAC) module. +This module re-activates the embedding with a different activation function +from the main stream one, and seeks fitting compensation by projecting the +embedding to the null space of the main stream's linear mapping. Empirical +results demonstrate that the DS-AL, despite being an exemplar-free technique, +delivers performance comparable with or better than that of replay-based +methods across various datasets, including CIFAR-100, ImageNet-100 and +ImageNet-Full. Additionally, the C-RLS' equivalent property allows the DS-AL to +execute CIL in a phase-invariant manner. This is evidenced by a +never-before-seen 500-phase CIL ImageNet task, which performs on a level +identical to a 5-phase one. Our codes are available at +https://github.com/ZHUANGHP/Analytic-continual-learning. + +
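+ Closed-form class-incremental learning of this kind is commonly built on
+recursive least squares over frozen features; the generic block update below
+illustrates the idea and is not the paper's C-RLS formulation (shapes and the
+ridge initialisation are assumptions of this sketch):
+
+    import torch
+
+    def rls_update(W, R, X, Y):
+        """One recursive ridge-regression update for a linear head (illustrative).
+
+        W: (d, c) current weights,  R: (d, d) current inverse regularised Gram matrix,
+        X: (n, d) frozen features of the new phase,  Y: (n, c) one-hot labels.
+        Repeated calls are equivalent to joint ridge regression over all phases.
+        """
+        K = torch.linalg.inv(torch.eye(X.shape[0]) + X @ R @ X.T)
+        R = R - R @ X.T @ K @ X @ R
+        W = W + R @ X.T @ (Y - X @ W)
+        return W, R
+
+ With feature dimension d, c classes, and ridge strength lam, one would start
+from R = torch.eye(d) / lam and W = torch.zeros(d, c).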
+
+ comment: Accepted in AAAI 2024 +
+
+
+
+
+ + ☆ SeNM-VAE: Semi-Supervised Noise Modeling with Hierarchical Variational + Autoencoder + + +
+ The data bottleneck has emerged as a fundamental challenge in learning based +image restoration methods. Researchers have attempted to generate synthesized +training data using paired or unpaired samples to address this challenge. This +study proposes SeNM-VAE, a semi-supervised noise modeling method that leverages +both paired and unpaired datasets to generate realistic degraded data. Our +approach is based on modeling the conditional distribution of degraded and +clean images with a specially designed graphical model. Under the variational +inference framework, we develop an objective function for handling both paired +and unpaired data. We employ our method to generate paired training samples for +real-world image denoising and super-resolution tasks. Our approach excels in +the quality of synthetic degraded images compared to other unpaired and paired +noise modeling methods. Furthermore, our approach demonstrates remarkable +performance in downstream image restoration tasks, even with limited paired +data. With more paired data, our method achieves the best performance on the +SIDD dataset. + +
+
+
+
+
+ + ☆ Sharing the Cost of Success: A Game for Evaluating and Learning + Collaborative Multi-Agent Instruction Giving and Following Policies LREC + + +
+ In collaborative goal-oriented settings, the participants are not only
+interested in achieving a successful outcome, but also implicitly negotiate
+the effort they put into the interaction (by adapting to each other). In this
+work, we propose a challenging interactive reference game that requires two
+players to coordinate on vision and language observations. The learning signal
+in this game is a score (given after playing) that takes into account the
+achieved goal and the players' assumed efforts during the interaction. We show
+that a standard Proximal Policy Optimization (PPO) setup achieves a high
+success rate when bootstrapped with heuristic partner behaviors that implement
+insights from the analysis of human-human interactions. We also find that a
+pairing of neural partners indeed reduces the measured joint effort when
+playing together repeatedly. However, we observe that in comparison to a
+reasonable heuristic pairing there is still room for improvement -- which
+invites further research in the direction of cost-sharing in collaborative
+interactions.
+
+ comment: 9 pages, Accepted at LREC-COLING 2024 +
+
+
+
+
+ + ☆ Dr.Hair: Reconstructing Scalp-Connected Hair Strands without + Pre-training via Differentiable Rendering of Line Segments CVPR 2024 + + +
+ In the film and gaming industries, achieving a realistic hair appearance +typically involves the use of strands originating from the scalp. However, +reconstructing these strands from observed surface images of hair presents +significant challenges. The difficulty in acquiring Ground Truth (GT) data has +led state-of-the-art learning-based methods to rely on pre-training with +manually prepared synthetic CG data. This process is not only labor-intensive +and costly but also introduces complications due to the domain gap when +compared to real-world data. In this study, we propose an optimization-based +approach that eliminates the need for pre-training. Our method represents hair +strands as line segments growing from the scalp and optimizes them using a +novel differentiable rendering algorithm. To robustly optimize a substantial +number of slender explicit geometries, we introduce 3D orientation estimation +utilizing global optimization, strand initialization based on Laplace's +equation, and reparameterization that leverages geometric connectivity and +spatial proximity. Unlike existing optimization-based methods, our method is +capable of reconstructing internal hair flow in an absolute direction. Our +method exhibits robust and accurate inverse rendering, surpassing the quality +of existing methods and significantly improving processing speed. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ DiffGaze: A Diffusion Model for Continuous Gaze Sequence Generation on + 360° Images + + +
+ We present DiffGaze, a novel method for generating realistic and diverse
+continuous human gaze sequences on 360° images based on a conditional
+score-based denoising diffusion model. Generating human gaze on 360° images is
+important for various human-computer interaction and computer graphics
+applications, e.g. for creating large-scale eye tracking datasets or for
+realistic animation of virtual humans. However, existing methods are limited
+to predicting discrete fixation sequences or aggregated saliency maps, thereby
+neglecting crucial parts of natural gaze behaviour. Our method uses features
+extracted from 360° images as condition and uses two transformers to model the
+temporal and spatial dependencies of continuous human gaze. We evaluate
+DiffGaze on two 360° image benchmarks for gaze sequence generation as well as
+scanpath prediction and saliency prediction. Our evaluations show that
+DiffGaze outperforms state-of-the-art methods on all tasks on both benchmarks.
+We also report a 21-participant user study showing that our method generates
+gaze sequences that are indistinguishable from real human sequences.
+
+
+
+
+ + ☆ LaRE^2: Latent Reconstruction Error Based Method for Diffusion-Generated + Image Detection CVPR 2024 + + +
+ The evolution of Diffusion Models has dramatically improved image generation +quality, making it increasingly difficult to differentiate between real and +generated images. This development, while impressive, also raises significant +privacy and security concerns. In response to this, we propose a novel Latent +REconstruction error guided feature REfinement method (LaRE^2) for detecting +the diffusion-generated images. We come up with the Latent Reconstruction Error +(LaRE), the first reconstruction-error based feature in the latent space for +generated image detection. LaRE surpasses existing methods in terms of feature +extraction efficiency while preserving crucial cues required to differentiate +between the real and the fake. To exploit LaRE, we propose an Error-Guided +feature REfinement module (EGRE), which can refine the image feature guided by +LaRE to enhance the discriminativeness of the feature. Our EGRE utilizes an +align-then-refine mechanism, which effectively refines the image feature for +generated-image detection from both spatial and channel perspectives. Extensive +experiments on the large-scale GenImage benchmark demonstrate the superiority +of our LaRE^2, which surpasses the best SoTA method by up to 11.9%/12.1% +average ACC/AP across 8 different image generators. LaRE also surpasses +existing methods in terms of feature extraction cost, delivering an impressive +speed enhancement of 8 times. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Building Bridges across Spatial and Temporal Resolutions: + Reference-Based Super-Resolution via Change Priors and Conditional Diffusion + Model CVPR2024 + + +
+ Reference-based super-resolution (RefSR) has the potential to build bridges +across spatial and temporal resolutions of remote sensing images. However, +existing RefSR methods are limited by the faithfulness of content +reconstruction and the effectiveness of texture transfer in large scaling +factors. Conditional diffusion models have opened up new opportunities for +generating realistic high-resolution images, but effectively utilizing +reference images within these models remains an area for further exploration. +Furthermore, content fidelity is difficult to guarantee in areas without +relevant reference information. To solve these issues, we propose a +change-aware diffusion model named Ref-Diff for RefSR, using the land cover +change priors to guide the denoising process explicitly. Specifically, we +inject the priors into the denoising model to improve the utilization of +reference information in unchanged areas and regulate the reconstruction of +semantically relevant content in changed areas. With this powerful guidance, we +decouple the semantics-guided denoising and reference texture-guided denoising +processes to improve the model performance. Extensive experiments demonstrate +the superior effectiveness and robustness of the proposed method compared with +state-of-the-art RefSR methods in both quantitative and qualitative +evaluations. The code and data are available at +https://github.com/dongrunmin/RefDiff. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Chain of Compression: A Systematic Approach to Combinationally Compress + Convolutional Neural Networks + + +
+ Convolutional neural networks (CNNs) have achieved significant popularity,
+but their computational and memory intensity poses challenges for
+resource-constrained computing systems, particularly with the prerequisite of
+real-time performance. To relieve this burden, model compression has become an
+important research focus. Many approaches like quantization, pruning, early
+exit, and knowledge distillation have demonstrated the effect of reducing
+redundancy in neural networks. Upon closer examination, it becomes apparent
+that each approach capitalizes on its unique features to compress the neural
+network, and they can also exhibit complementary behavior when combined. To
+explore the interactions and reap the benefits from the complementary
+features, we propose the Chain of Compression, which applies these common
+techniques in a combinational sequence to compress the neural network.
+Validated on image-based regression and classification networks across
+different data sets, our proposed Chain of Compression can reduce the
+computation cost by 100-1000 times with negligible accuracy loss compared with
+the baseline model.
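+ As a toy illustration of chaining common compression techniques (not the
+paper's pipeline), the sketch below prunes every Linear layer and then applies
+dynamic int8 quantization using standard PyTorch utilities; the pruning amount
+is an arbitrary assumption:
+
+    import torch
+    import torch.nn as nn
+    import torch.nn.utils.prune as prune
+
+    def prune_then_quantize(model, amount=0.5):
+        """Apply two compression steps in sequence: magnitude pruning, then quantization."""
+        for module in model.modules():
+            if isinstance(module, nn.Linear):
+                prune.l1_unstructured(module, name="weight", amount=amount)
+                prune.remove(module, "weight")   # make the pruning permanent
+        return torch.ao.quantization.quantize_dynamic(model, {nn.Linear},
+                                                      dtype=torch.qint8)
+
+    compressed = prune_then_quantize(
+        nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 10)))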
+
+ comment: 10 pages, 15 figures +
+
+
+
+
+ + ☆ Integrating Mamba Sequence Model and Hierarchical Upsampling Network for + Accurate Semantic Segmentation of Multiple Sclerosis Legion + + +
+ Integrating components from convolutional neural networks and state space
+models in medical image segmentation presents a compelling approach to enhance
+accuracy and efficiency. We introduce Mamba HUNet, a novel architecture
+tailored for robust and efficient segmentation tasks. Leveraging strengths
+from Mamba UNet and the lighter version of the Hierarchical Upsampling Network
+(HUNet), Mamba HUNet combines convolutional neural networks' local feature
+extraction power with state space models' long-range dependency modeling
+capabilities. We first converted HUNet into a lighter version, maintaining
+performance parity, and then integrated this lighter HUNet into Mamba HUNet,
+further enhancing its efficiency. The architecture partitions input grayscale
+images into patches, transforming them into 1D sequences for processing
+efficiency akin to Vision Transformers and Mamba models. Through Visual State
+Space blocks and patch merging layers, hierarchical features are extracted
+while preserving spatial information. Experimental results on publicly
+available Magnetic Resonance Imaging scans, notably in Multiple Sclerosis
+lesion segmentation, demonstrate Mamba HUNet's effectiveness across diverse
+segmentation tasks. The model's robustness and flexibility underscore its
+potential in handling complex anatomical structures. These findings establish
+Mamba HUNet as a promising solution in advancing medical image segmentation,
+with implications for improving clinical decision-making processes.
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Test-time Adaptation Meets Image Enhancement: Improving Accuracy via + Uncertainty-aware Logit Switching IJCNN2024 + + +
+ Deep neural networks have achieved remarkable success in a variety of
+computer vision applications. However, accuracy degrades when the data
+distribution shifts between training and testing. As a solution to this
+problem, Test-time Adaptation (TTA) has been well studied because of its
+practicality. Although TTA methods increase accuracy under distribution shift
+by updating the model at test time, using high-uncertainty predictions is
+known to degrade accuracy. Since the input image is the root of the
+distribution shift, we incorporate a new perspective on enhancing the input
+image into TTA methods to reduce prediction uncertainty. We hypothesize that
+enhancing the input image reduces prediction uncertainty and increases the
+accuracy of TTA methods. On the basis of our hypothesis, we propose a novel
+method: Test-time Enhancer and Classifier Adaptation (TECA). In TECA, the
+classification model is combined with an image enhancement model that
+transforms input images into recognition-friendly ones, and these models are
+updated by existing TTA methods. Furthermore, we found that the prediction
+from the enhanced image does not always have lower uncertainty than the
+prediction from the original image. Thus, we propose logit switching, which
+compares the uncertainty measures of these predictions and outputs the one
+with lower uncertainty. In our experiments, we evaluate TECA with various TTA
+methods and show that TECA reduces prediction uncertainty and increases the
+accuracy of TTA methods despite having no hyperparameters and little parameter
+overhead.
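+ A minimal sketch of the logit-switching rule described above, assuming
+softmax entropy as the uncertainty measure (the measure and names are
+assumptions of this illustration, not the authors' code):
+
+    import torch
+    import torch.nn.functional as F
+
+    def logit_switching(logits_original, logits_enhanced):
+        """Per-sample selection of the prediction with lower entropy (uncertainty)."""
+        def entropy(logits):
+            p = F.softmax(logits, dim=-1)
+            return -(p * torch.log(p.clamp_min(1e-12))).sum(dim=-1)   # (batch,)
+
+        use_enhanced = entropy(logits_enhanced) < entropy(logits_original)
+        return torch.where(use_enhanced.unsqueeze(-1), logits_enhanced, logits_original)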
+
+ comment: Accepted to IJCNN2024 +
+
+
+
+
+ + ☆ InterHandGen: Two-Hand Interaction Generation via Cascaded Reverse + Diffusion CVPR 2024 + + +
+ We present InterHandGen, a novel framework that learns the generative prior
+of two-hand interaction. Sampling from our model yields plausible and diverse
+two-hand shapes in close interaction with or without an object. Our prior can
+be incorporated into any optimization or learning methods to reduce ambiguity
+in an ill-posed setup. Our key observation is that directly modeling the joint
+distribution of multiple instances imposes high learning complexity due to its
+combinatorial nature. Thus, we propose to decompose the modeling of the joint
+distribution into the modeling of factored unconditional and conditional
+single-instance distributions. In particular, we introduce a diffusion model
+that learns the single-hand distribution both unconditionally and conditioned
+on the other hand via conditioning dropout. For sampling, we combine
+anti-penetration and classifier-free guidance to enable plausible generation.
+Furthermore, we establish a rigorous evaluation protocol for two-hand
+synthesis, where our method significantly outperforms baseline generative
+models in terms of plausibility and diversity. We also demonstrate that our
+diffusion prior can boost the performance of two-hand reconstruction from
+monocular in-the-wild images, achieving new state-of-the-art accuracy.
+
+ comment: Accepted to CVPR 2024, project page: + https://jyunlee.github.io/projects/interhandgen/ +
+
+
+
+
+ + ☆ Learning to Visually Localize Sound Sources from Mixtures without Prior + Source Knowledge CVPR 2024 + + +
+ The goal of the multi-sound source localization task is to localize sound +sources from the mixture individually. While recent multi-sound source +localization methods have shown improved performance, they face challenges due +to their reliance on prior information about the number of objects to be +separated. In this paper, to overcome this limitation, we present a novel +multi-sound source localization method that can perform localization without +prior knowledge of the number of sound sources. To achieve this goal, we +propose an iterative object identification (IOI) module, which can recognize +sound-making objects in an iterative manner. After finding the regions of +sound-making objects, we devise object similarity-aware clustering (OSC) loss +to guide the IOI module to effectively combine regions of the same object but +also distinguish between different objects and backgrounds. It enables our +method to perform accurate localization of sound-making objects without any +prior knowledge. Extensive experimental results on the MUSIC and VGGSound +benchmarks show the significant performance improvements of the proposed method +over the existing methods for both single and multi-source. Our code is +available at: https://github.com/VisualAIKHU/NoPrior_MultiSSL + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Neural Clustering based Visual Representation Learning CVPR 2024 + + +
+ We investigate a fundamental aspect of machine vision: the measurement of +features, by revisiting clustering, one of the most classic approaches in +machine learning and data analysis. Existing visual feature extractors, +including ConvNets, ViTs, and MLPs, represent an image as rectangular regions. +Though prevalent, such a grid-style paradigm is built upon engineering practice +and lacks explicit modeling of data distribution. In this work, we propose +feature extraction with clustering (FEC), a conceptually elegant yet +surprisingly ad-hoc interpretable neural clustering framework, which views +feature extraction as a process of selecting representatives from data and thus +automatically captures the underlying data distribution. Given an image, FEC +alternates between grouping pixels into individual clusters to abstract +representatives and updating the deep features of pixels with current +representatives. Such an iterative working mechanism is implemented in the form +of several neural layers and the final representatives can be used for +downstream tasks. The cluster assignments across layers, which can be viewed +and inspected by humans, make the forward process of FEC fully transparent and +empower it with promising ad-hoc interpretability. Extensive experiments on +various visual recognition models and tasks verify the effectiveness, +generality, and interpretability of FEC. We expect this work will provoke a +rethink of the current de facto grid-style paradigm. + +
+
+ comment: CVPR 2024. Code: https://github.com/guikunchen/FEC/ +
+
+
+
+
+ + ☆ SSF3D: Strict Semi-Supervised 3D Object Detection with Switching Filter + + +
+ SSF3D modifies the semi-supervised 3D object detection (SS3DOD) framework,
+which is designed specifically for point cloud data. Leveraging the
+characteristics of non-coincidence and weak correlation of target objects in
+point clouds, we adopt a strategy of retaining only the truth-determining
+pseudo labels and trimming the other fuzzy labels with points, instead of
+pursuing a balance between the quantity and quality of pseudo labels. Besides,
+we notice that changing the filter makes the model meet differently
+distributed targets, which is beneficial for breaking the training bottleneck.
+Two mechanisms are introduced to realize the above ideas: strict thresholding
+and filter switching. Experiments are conducted to analyze the effectiveness
+of the above approaches and their impact on the overall performance of the
+system. Evaluated on the KITTI dataset, SSF3D exhibits superior performance
+compared to the current state-of-the-art methods. The code will be released
+here.
+
+
+
+
+ + ☆ Decoupled Pseudo-labeling for Semi-Supervised Monocular 3D Object + Detection CVPR2024 + + +
+ We delve into pseudo-labeling for semi-supervised monocular 3D object +detection (SSM3OD) and discover two primary issues: a misalignment between the +prediction quality of 3D and 2D attributes and the tendency of depth +supervision derived from pseudo-labels to be noisy, leading to significant +optimization conflicts with other reliable forms of supervision. We introduce a +novel decoupled pseudo-labeling (DPL) approach for SSM3OD. Our approach +features a Decoupled Pseudo-label Generation (DPG) module, designed to +efficiently generate pseudo-labels by separately processing 2D and 3D +attributes. This module incorporates a unique homography-based method for +identifying dependable pseudo-labels in BEV space, specifically for 3D +attributes. Additionally, we present a DepthGradient Projection (DGP) module to +mitigate optimization conflicts caused by noisy depth supervision of +pseudo-labels, effectively decoupling the depth gradient and removing +conflicting gradients. This dual decoupling strategy-at both the pseudo-label +generation and gradient levels-significantly improves the utilization of +pseudo-labels in SSM3OD. Our comprehensive experiments on the KITTI benchmark +demonstrate the superiority of our method over existing approaches. + +
+
+ comment: To appear in CVPR2024 +
+
+
+
+
+ + ☆ Self-Rectifying Diffusion Sampling with Perturbed-Attention Guidance + + +
+ Recent studies have demonstrated that diffusion models are capable of +generating high-quality samples, but their quality heavily depends on sampling +guidance techniques, such as classifier guidance (CG) and classifier-free +guidance (CFG). These techniques are often not applicable in unconditional +generation or in various downstream tasks such as image restoration. In this +paper, we propose a novel sampling guidance, called Perturbed-Attention +Guidance (PAG), which improves diffusion sample quality across both +unconditional and conditional settings, achieving this without requiring +additional training or the integration of external modules. PAG is designed to +progressively enhance the structure of samples throughout the denoising +process. It involves generating intermediate samples with degraded structure by +substituting selected self-attention maps in diffusion U-Net with an identity +matrix, by considering the self-attention mechanisms' ability to capture +structural information, and guiding the denoising process away from these +degraded samples. In both ADM and Stable Diffusion, PAG surprisingly improves +sample quality in conditional and even unconditional scenarios. Moreover, PAG +significantly improves the baseline performance in various downstream tasks +where existing guidances such as CG or CFG cannot be fully utilized, including +ControlNet with empty prompts and image restoration such as inpainting and +deblurring. + +
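+ The guidance step itself extrapolates away from the structure-degraded
+prediction, in the same spirit as classifier-free guidance; a rough sketch of
+that combination is shown below, where the two noise predictions (from the
+unmodified U-Net and from the U-Net with selected self-attention maps replaced
+by an identity matrix) are assumed to be computed elsewhere, and the scale is
+an illustrative value:
+
+    import torch
+
+    def perturbed_attention_guidance(eps_normal, eps_perturbed, scale=3.0):
+        """Guide denoising away from the structure-degraded prediction (illustrative)."""
+        return eps_normal + scale * (eps_normal - eps_perturbed)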
+
+ comment: Project page is available at + https://ku-cvlab.github.io/Perturbed-Attention-Guidance +
+
+
+
+
+ + ☆ AIDE: An Automatic Data Engine for Object Detection in Autonomous + Driving CVPR-2024 + + +
+ Autonomous vehicle (AV) systems rely on robust perception models as a +cornerstone of safety assurance. However, objects encountered on the road +exhibit a long-tailed distribution, with rare or unseen categories posing +challenges to a deployed perception model. This necessitates an expensive +process of continuously curating and annotating data with significant human +effort. We propose to leverage recent advances in vision-language and large +language models to design an Automatic Data Engine (AIDE) that automatically +identifies issues, efficiently curates data, improves the model through +auto-labeling, and verifies the model through generation of diverse scenarios. +This process operates iteratively, allowing for continuous self-improvement of +the model. We further establish a benchmark for open-world detection on AV +datasets to comprehensively evaluate various learning paradigms, demonstrating +our method's superior performance at a reduced cost. + +
+
+ comment: Accepted by CVPR-2024 +
+
+
+
+
+ + ☆ CoDA: Instructive Chain-of-Domain Adaptation with Severity-Aware Visual + Prompt Tuning + + +
+ Unsupervised Domain Adaptation (UDA) aims to adapt models from labeled source +domains to unlabeled target domains. When adapting to adverse scenes, existing +UDA methods fail to perform well due to the lack of instructions, leading their +models to overlook discrepancies within all adverse scenes. To tackle this, we +propose CoDA which instructs models to distinguish, focus, and learn from these +discrepancies at scene and image levels. Specifically, CoDA consists of a +Chain-of-Domain (CoD) strategy and a Severity-Aware Visual Prompt Tuning +(SAVPT) mechanism. CoD focuses on scene-level instructions to divide all +adverse scenes into easy and hard scenes, guiding models to adapt from source +to easy domains with easy scene images, and then to hard domains with hard +scene images, thereby laying a solid foundation for whole adaptations. Building +upon this foundation, we employ SAVPT to dive into more detailed image-level +instructions to boost performance. SAVPT features a novel metric Severity that +divides all adverse scene images into low-severity and high-severity images. +Then Severity directs visual prompts and adapters, instructing models to +concentrate on unified severity features instead of scene-specific features, +without adding complexity to the model architecture. CoDA achieves SOTA +performances on widely-used benchmarks under all adverse scenes. Notably, CoDA +outperforms the existing ones by 4.6%, and 10.3% mIoU on the Foggy Driving, and +Foggy Zurich benchmarks, respectively. Our code is available at +https://github.com/Cuzyoung/CoDA + +
+
+
+
+
+ + ☆ Activity-Biometrics: Person Identification from Daily Activities CVPR 2024 + + +
+ In this work, we study a novel problem which focuses on person identification +while performing daily activities. Learning biometric features from RGB videos +is challenging due to spatio-temporal complexity and presence of appearance +biases such as clothing color and background. We propose ABNet, a novel +framework which leverages disentanglement of biometric and non-biometric +features to perform effective person identification from daily activities. +ABNet relies on a bias-less teacher to learn biometric features from RGB videos +and explicitly disentangle non-biometric features with the help of biometric +distortion. In addition, ABNet also exploits activity prior for biometrics +which is enabled by joint biometric and activity learning. We perform +comprehensive evaluation of the proposed approach across five different +datasets which are derived from existing activity recognition benchmarks. +Furthermore, we extensively compare ABNet with existing works in person +identification and demonstrate its effectiveness for activity-based biometrics +across all five datasets. The code and dataset can be accessed at: +\url{https://github.com/sacrcv/Activity-Biometrics/} + +
+
+ comment: CVPR 2024 Main conference +
+
+
+
+
+ + ☆ TRAM: Global Trajectory and Motion of 3D Humans from in-the-wild Videos + + +
+ We propose TRAM, a two-stage method to reconstruct a human's global +trajectory and motion from in-the-wild videos. TRAM robustifies SLAM to recover +the camera motion in the presence of dynamic humans and uses the scene +background to derive the motion scale. Using the recovered camera as a +metric-scale reference frame, we introduce a video transformer model (VIMO) to +regress the kinematic body motion of a human. By composing the two motions, we +achieve accurate recovery of 3D humans in the world space, reducing global +motion errors by 60% from prior work. https://yufu-wang.github.io/tram4d/ + +
+
+ comment: The project website: https://yufu-wang.github.io/tram4d/ +
+
+
+
+
+ + ☆ Language Models are Free Boosters for Biomedical Imaging Tasks + + +
+ In this study, we uncover the unexpected efficacy of residual-based large +language models (LLMs) as part of encoders for biomedical imaging tasks, a +domain traditionally devoid of language or textual data. The approach diverges +from established methodologies by utilizing a frozen transformer block, +extracted from pre-trained LLMs, as an innovative encoder layer for the direct +processing of visual tokens. This strategy represents a significant departure +from the standard multi-modal vision-language frameworks, which typically hinge +on language-driven prompts and inputs. We found that these LLMs could boost +performance across a spectrum of biomedical imaging applications, including +both 2D and 3D visual classification tasks, serving as plug-and-play boosters. +More interestingly, as a byproduct, we found that the proposed framework +achieved superior performance, setting new state-of-the-art results on +extensive, standardized datasets in MedMNIST-2D and 3D. Through this work, we +aim to open new avenues for employing LLMs in biomedical imaging and enriching +the understanding of their potential in this specialized domain. + +
+
+
+
+
+ + ☆ The Solution for the ICCV 2023 1st Scientific Figure Captioning + Challenge + + +
+ In this paper, we propose a solution for improving the quality of captions +generated for figures in papers. We adopt the approach of summarizing the +textual content in the paper to generate image captions. Throughout our study, +we encounter discrepancies in the OCR information provided in the official +dataset. To rectify this, we employ the PaddleOCR toolkit to extract OCR +information from all images. Moreover, we observe that certain textual content +in the official paper pertains to images that are not relevant for captioning, +thereby introducing noise during caption generation. To mitigate this issue, we +leverage LLaMA to extract image-specific information by querying the textual +content based on image mentions, effectively filtering out extraneous +information. Additionally, we recognize a discrepancy between the primary use +of maximum likelihood estimation during text generation and the evaluation +metrics such as ROUGE employed to assess the quality of generated captions. To +bridge this gap, we integrate the BRIO model framework, enabling a more +coherent alignment between the generation and evaluation processes. Our +approach ranked first in the final test with a score of 4.49. + +
+
+
+
+
+ + ☆ OVER-NAV: Elevating Iterative Vision-and-Language Navigation with + Open-Vocabulary Detection and StructurEd Representation CVPR 2024 + + +
+ Recent advances in Iterative Vision-and-Language Navigation (IVLN) introduce +a more meaningful and practical paradigm of VLN by maintaining the agent's +memory across tours of scenes. Although the long-term memory aligns better with +the persistent nature of the VLN task, it poses more challenges on how to +utilize the highly unstructured navigation memory with extremely sparse +supervision. Towards this end, we propose OVER-NAV, which aims to go over and +beyond the current arts of IVLN techniques. In particular, we propose to +incorporate LLMs and open-vocabulary detectors to distill key information and +establish correspondence between multi-modal signals. Such a mechanism +introduces reliable cross-modal supervision and enables on-the-fly +generalization to unseen scenes without the need of extra annotation and +re-training. To fully exploit the interpreted navigation data, we further +introduce a structured representation, coded Omnigraph, to effectively +integrate multi-modal information along the tour. Accompanied with a novel +omnigraph fusion mechanism, OVER-NAV is able to extract the most relevant +knowledge from omnigraph for a more accurate navigating action. In addition, +OVER-NAV seamlessly supports both discrete and continuous environments under a +unified framework. We demonstrate the superiority of OVER-NAV in extensive +experiments. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Labeling subtypes in a Parkinson's Cohort using Multifeatures in MRI - + Integrating Grey and White Matter Information + + +
+ Thresholding of networks has long posed a challenge in brain connectivity
+analysis. Weighted networks are typically binarized using threshold measures
+to facilitate network analysis. Previous studies on MRI-based brain networks
+have predominantly utilized density or sparsity-based thresholding techniques,
+optimized within specific ranges derived from network metrics such as path
+length, clustering coefficient, and small-world index. Thus, determination of
+a single threshold value for facilitating comparative analysis of networks
+remains elusive. To address this, our study introduces Mutual K-Nearest
+Neighbor (MKNN)-based thresholding for brain network analysis. Here, nearest
+neighbor selection is based on the highest correlation between features of
+brain regions. Construction of brain networks was accomplished by computing
+Pearson correlations between grey matter volume and white matter volume for
+each pair of brain regions. Structural MRI data from 180 Parkinson's patients
+and 70 controls from NIMHANS, India, were analyzed. Subtypes within
+Parkinson's disease were identified based on grey and white matter volume
+atrophy using source-based morphometric decomposition. The loading
+coefficients were correlated with clinical features to discern the clinical
+relationship with the deciphered subtypes. Our data-mining approach revealed
+three subtypes: Subtype A (N = 51, intermediate type), Subtype B (N = 57,
+mild-severe type with mild motor symptoms), and Subtype AB (N = 36,
+most-severe type with predominance in motor impairment). Subtype-specific
+weighted matrices were binarized using MKNN-based thresholding for brain
+network analysis. Permutation tests on network metrics of the resulting
+bipartite graphs demonstrated significant group differences in betweenness
+centrality and participation coefficient. The identified hubs were specific to
+each subtype, with some hubs conserved across different subtypes.
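+ Mutual k-nearest-neighbor binarization keeps an edge only when each region is
+among the other's k strongest correlates; a generic sketch of that rule
+(illustrative, with an arbitrary k, not the study's code) is:
+
+    import numpy as np
+
+    def mknn_binarize(corr, k=10):
+        """Binarize a weighted connectivity matrix by mutual k-nearest neighbors.
+
+        corr: (n, n) symmetric correlation matrix between brain regions.
+        Keeps edge (i, j) only if j is among i's k highest correlations and vice versa.
+        """
+        n = corr.shape[0]
+        weights = corr.astype(float).copy()
+        np.fill_diagonal(weights, -np.inf)               # ignore self-connections
+        top_k = np.argsort(-weights, axis=1)[:, :k]      # k strongest correlates per row
+        neighbour = np.zeros((n, n), dtype=bool)
+        neighbour[np.repeat(np.arange(n), k), top_k.ravel()] = True
+        return neighbour & neighbour.T                   # mutual neighbours only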
+
+ comment: 31 pages, 10 figures, 3 tables +
+
+
+
+
+ + ☆ Staircase Localization for Autonomous Exploration in Urban Environments + + +
+ A staircase localization method is proposed for robots to explore urban +environments autonomously. The proposed method employs a modular design in the +form of a cascade pipeline consisting of three modules of stair detection, line +segment detection, and stair localization modules. The stair detection module +utilizes an object detection algorithm based on deep learning to generate a +region of interest (ROI). From the ROI, line segment features are extracted +using a deep line segment detection algorithm. The extracted line segments are +used to localize a staircase in terms of position, orientation, and stair +direction. The stair detection and localization are performed only with a +single RGB-D camera. Each component of the proposed pipeline does not need to +be designed particularly for staircases, which makes it easy to maintain the +whole pipeline and replace each component with state-of-the-art deep learning +detection techniques. The results of real-world experiments show that the +proposed method can perform accurate stair detection and localization during +autonomous exploration for various structured and unstructured upstairs and +downstairs with shadows, dirt, and occlusions by artificial and natural +objects. + +
+
+ comment: 9 pages, 10 figures +
+
+
+
+
+ + ☆ Accuracy enhancement method for speech emotion recognition from + spectrogram using temporal frequency correlation and positional information + learning through knowledge transfer + + +
+ In this paper, we propose a method to improve the accuracy of speech emotion
+recognition (SER) by using a vision transformer (ViT) to attend to the
+correlation of frequency (y-axis) with time (x-axis) in the spectrogram and by
+transferring positional information between ViTs through knowledge transfer.
+The proposed method has the following original aspects: i) We use vertically
+segmented patches of the log-Mel spectrogram to analyze the correlation of
+frequencies over time. This type of patch allows us to correlate the most
+relevant frequencies for a particular emotion with the time they were uttered.
+ii) We propose the use of image coordinate encoding, an absolute positional
+encoding suitable for ViT. By normalizing the x, y coordinates of the image to
+-1 to 1 and concatenating them to the image, we can effectively provide valid
+absolute positional information for ViT. iii) Through feature map matching,
+the locality and location information of the teacher network are effectively
+transmitted to the student network. The teacher network is a ViT that contains
+the locality of a convolutional stem and absolute position information through
+image coordinate encoding, and the student network lacks positional encoding
+in the basic ViT structure. In the feature map matching stage, we train with
+the mean absolute error (L1 loss) to minimize the difference between the
+feature maps of the two networks. To validate the proposed method, three
+emotion datasets (SAVEE, EmoDB, and CREMA-D) consisting of speech were
+converted into log-Mel spectrograms for comparison experiments. The
+experimental results show that the proposed method significantly outperforms
+state-of-the-art methods in terms of weighted accuracy while requiring
+significantly fewer floating point operations (FLOPs). Overall, the proposed
+method offers a promising solution for SER by providing improved efficiency
+and performance.
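+ The coordinate encoding in ii) can be realized by normalizing pixel
+coordinates to [-1, 1] and concatenating them to the input as two extra
+channels; the sketch below illustrates that layout under the stated assumption
+and is not the authors' code:
+
+    import torch
+
+    def add_coordinate_channels(images):
+        """Concatenate x/y coordinate channels, normalized to [-1, 1], to a batch.
+
+        images: (batch, channels, height, width) spectrogram patches or any image tensor.
+        Returns a tensor of shape (batch, channels + 2, height, width).
+        """
+        b, _, h, w = images.shape
+        ys = torch.linspace(-1.0, 1.0, h, device=images.device)
+        xs = torch.linspace(-1.0, 1.0, w, device=images.device)
+        grid_y, grid_x = torch.meshgrid(ys, xs, indexing="ij")   # each (h, w)
+        coords = torch.stack((grid_x, grid_y), dim=0)            # (2, h, w)
+        coords = coords.unsqueeze(0).expand(b, -1, -1, -1)       # (batch, 2, h, w)
+        return torch.cat((images, coords), dim=1)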
+
+
+
+
+ + ☆ Physical 3D Adversarial Attacks against Monocular Depth Estimation in + Autonomous Driving CVPR 2024 + + +
+ Deep learning-based monocular depth estimation (MDE), extensively applied in +autonomous driving, is known to be vulnerable to adversarial attacks. Previous +physical attacks against MDE models rely on 2D adversarial patches, so they +only affect a small, localized region in the MDE map but fail under various +viewpoints. To address these limitations, we propose 3D Depth Fool +(3D$^2$Fool), the first 3D texture-based adversarial attack against MDE models. +3D$^2$Fool is specifically optimized to generate 3D adversarial textures +agnostic to model types of vehicles and to have improved robustness in bad +weather conditions, such as rain and fog. Experimental results validate the +superior performance of our 3D$^2$Fool across various scenarios, including +vehicles, MDE models, weather conditions, and viewpoints. Real-world +experiments with printed 3D textures on physical vehicle models further +demonstrate that our 3D$^2$Fool can cause an MDE error of over 10 meters. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Tracing and segmentation of molecular patterns in 3-dimensional + cryo-et/em density maps through algorithmic image processing and deep + learning-based techniques + + +
+ Understanding the structures of biological macromolecules is highly important
+as they are closely associated with cellular functionalities. Comprehending
+the precise organization of actin filaments is crucial because they form the
+dynamic cytoskeleton, which offers structural support to cells and connects
+the cell's interior with its surroundings. However, determining the precise
+organization of actin filaments is challenging due to the poor quality of
+cryo-electron tomography (cryo-ET) images, which suffer from low
+signal-to-noise ratios (SNR) and the presence of the missing wedge, as well as
+the diverse shape characteristics of actin filaments. To address these
+formidable challenges, the primary component of this dissertation focuses on
+developing sophisticated computational techniques for tracing actin filaments.
+In particular, three novel methodologies have been developed: i) BundleTrac,
+for tracing bundle-like actin filaments found in stereocilia, ii) Spaghetti
+Tracer, for tracing filaments that move individually with loosely cohesive
+movements, and iii) Struwwel Tracer, for tracing randomly oriented actin
+filaments in the actin network. The second component of the dissertation
+introduces a convolutional neural network (CNN)-based segmentation model to
+determine the location of protein secondary structures, such as helices and
+beta-sheets, in medium-resolution (5-10 Angstrom) 3-dimensional cryo-electron
+microscopy (cryo-EM) images. This methodology later evolved into a tool named
+DeepSSETracer. The final component of the dissertation presents a novel
+algorithm, the cylindrical fit measure, to estimate image structure match at
+helix regions in medium-resolution cryo-EM images. Overall, my dissertation
+has made significant contributions to addressing critical research challenges
+in structural biology by introducing various computational methods and tools.
+
+
+
+
+ + ☆ The Effects of Short Video-Sharing Services on Video Copy Detection + + +
+ Short video-sharing services that allow users to post 10-30 second videos
+ (e.g., YouTube Shorts and TikTok) have attracted a lot of attention in recent
+ years. However, conventional video copy detection (VCD) methods mainly focus on
+ general video-sharing services (e.g., YouTube and Bilibili), and the effects of
+ short video-sharing services on video copy detection are still unclear.
+ Considering that illegally copied videos in short video-sharing services have
+ service-distinctive characteristics, especially in their time lengths, the pros
+ and cons of VCD in these services need to be analyzed. In this paper, we
+ examine the effects of short video-sharing services on VCD by constructing a
+ dataset that has short video-sharing service characteristics. Our novel dataset
+ is automatically constructed from a publicly available dataset to contain
+ reference videos and fixed short-time-length query videos, and this automated
+ procedure ensures the reproducibility and data privacy preservation of this
+ paper. From the experimental results focusing on segment-level and video-level
+ situations, we observe three effects: "Segment-level VCD in short video-sharing
+ services is more difficult than in general video-sharing services",
+ "Video-level VCD in short video-sharing services is easier than in general
+ video-sharing services", and "The video alignment component mainly suppresses
+ the detection performance in short video-sharing services".
+
+
+
+
+
+ + ☆ Automated Report Generation for Lung Cytological Images Using a CNN + Vision Classifier and Multiple-Transformer Text Decoders: Preliminary Study + + +
+ Cytology plays a crucial role in lung cancer diagnosis. Pulmonary cytology
+ involves cell morphological characterization in the specimen and reporting the
+ corresponding findings, which are extremely burdensome tasks. In this study, we
+ propose a report-generation technique for lung cytology images. In total, 71
+ benign and 135 malignant pulmonary cytology specimens were collected. Patch
+ images were extracted from the captured specimen images, and the findings were
+ assigned to each image as a dataset for report generation. The proposed method
+ consists of a vision model and a text decoder. In the former, a convolutional
+ neural network (CNN) is used to classify a given image as benign or malignant,
+ and the features related to the image are extracted from the intermediate
+ layer. Independent text decoders for benign and malignant cells are prepared
+ for text generation, and the text decoder switches according to the CNN
+ classification results. The text decoder is configured using a Transformer that
+ uses the features obtained from the CNN for report generation. Based on the
+ evaluation results, the sensitivity and specificity were 100% and 96.4%,
+ respectively, for automated benign and malignant case classification, and the
+ saliency map indicated characteristic benign and malignant areas. The grammar
+ and style of the generated texts were confirmed to be correct and in better
+ agreement with the gold standard than existing LLM-based image-captioning
+ methods and a single-text-decoder ablation model. These results indicate that
+ the proposed method is useful for pulmonary cytology classification and
+ reporting.
+
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Leak and Learn: An Attacker's Cookbook to Train Using Leaked Data from + Federated Learning CVPR 2024 + + +
+ Federated learning is a decentralized learning paradigm introduced to +preserve privacy of client data. Despite this, prior work has shown that an +attacker at the server can still reconstruct the private training data using +only the client updates. These attacks are known as data reconstruction attacks +and fall into two major categories: gradient inversion (GI) and linear layer +leakage attacks (LLL). However, despite demonstrating the effectiveness of +these attacks in breaching privacy, prior work has not investigated the +usefulness of the reconstructed data for downstream tasks. In this work, we +explore data reconstruction attacks through the lens of training and improving +models with leaked data. We demonstrate the effectiveness of both GI and LLL +attacks in maliciously training models using the leaked data more accurately +than a benign federated learning strategy. Counter-intuitively, this bump in +training quality can occur despite limited reconstruction quality or a small +total number of leaked images. Finally, we show the limitations of these +attacks for downstream training, individually for GI attacks and for LLL +attacks. + +
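+ As a rough, generic illustration of the gradient inversion (GI) family of
+ attacks referenced above (in the spirit of deep-leakage-from-gradients style
+ reconstruction, not this paper's specific attack), a server can optimize a
+ dummy input and soft label so that the gradients they induce match the
+ gradients reported by a client. The toy model, optimizer, and iteration count
+ are assumptions.
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ torch.manual_seed(0)
+ model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))  # toy client model
+
+ # Gradients the honest client would send for one private example.
+ x_true = torch.rand(1, 1, 28, 28)
+ y_true = torch.tensor([3])
+ loss = F.cross_entropy(model(x_true), y_true)
+ true_grads = torch.autograd.grad(loss, model.parameters())
+
+ # Server-side reconstruction: optimize dummy data to reproduce those gradients.
+ x_dummy = torch.rand(1, 1, 28, 28, requires_grad=True)
+ y_dummy = torch.randn(1, 10, requires_grad=True)  # soft label logits
+ opt = torch.optim.Adam([x_dummy, y_dummy], lr=0.1)
+
+ for step in range(300):
+     opt.zero_grad()
+     dummy_loss = torch.sum(-F.softmax(y_dummy, dim=-1)
+                            * F.log_softmax(model(x_dummy), dim=-1))
+     dummy_grads = torch.autograd.grad(dummy_loss, model.parameters(),
+                                       create_graph=True)
+     grad_diff = sum(((dg - tg) ** 2).sum()
+                     for dg, tg in zip(dummy_grads, true_grads))
+     grad_diff.backward()
+     opt.step()
+
+ print("reconstruction MSE:", F.mse_loss(x_dummy.detach(), x_true).item())
+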
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Pseudo-MRI-Guided PET Image Reconstruction Method Based on a Diffusion + Probabilistic Model + + +
+ Anatomically guided PET reconstruction using MRI information has been shown to
+ have the potential to improve PET image quality. However, these improvements
+ are limited to PET scans with paired MRI information. In this work, we employed
+ a diffusion probabilistic model (DPM) to infer T1-weighted-MRI (deep-MRI)
+ images from FDG-PET brain images. We then use the DPM-generated T1w-MRI to
+ guide the PET reconstruction. The model was trained on brain FDG scans and
+ tested on datasets containing multiple levels of counts. Deep-MRI images
+ appeared somewhat more degraded than the acquired MRI images. Regarding PET
+ image quality, volume-of-interest analysis in different brain regions showed
+ that PET images reconstructed using either the acquired or the deep-MRI images
+ improved image quality compared to OSEM. The same conclusions were found when
+ analysing the decimated datasets. A subjective evaluation performed by two
+ physicians confirmed that OSEM scored consistently worse than the MRI-guided
+ PET images, and no significant differences were observed between the MRI-guided
+ PET images. This proof of concept shows that it is possible to infer DPM-based
+ MRI imagery to guide the PET reconstruction, enabling the possibility of
+ changing reconstruction parameters such as the strength of the prior in
+ anatomically guided PET reconstruction in the absence of MRI.
+
+
+
+
+
+ + ☆ Integrative Graph-Transformer Framework for Histopathology Whole Slide + Image Representation and Classification + + +
+ In digital pathology, the multiple instance learning (MIL) strategy is widely +used in the weakly supervised histopathology whole slide image (WSI) +classification task where giga-pixel WSIs are only labeled at the slide level. +However, existing attention-based MIL approaches often overlook contextual +information and intrinsic spatial relationships between neighboring tissue +tiles, while graph-based MIL frameworks have limited power to recognize the +long-range dependencies. In this paper, we introduce the integrative +graph-transformer framework that simultaneously captures the context-aware +relational features and global WSI representations through a novel Graph +Transformer Integration (GTI) block. Specifically, each GTI block consists of a +Graph Convolutional Network (GCN) layer modeling neighboring relations at the +local instance level and an efficient global attention model capturing +comprehensive global information from extensive feature embeddings. Extensive +experiments on three publicly available WSI datasets: TCGA-NSCLC, TCGA-RCC and +BRIGHT, demonstrate the superiority of our approach over current +state-of-the-art MIL methods, achieving an improvement of 1.0% to 2.6% in +accuracy and 0.7%-1.6% in AUROC. + +
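+ A hedged sketch of what a Graph Transformer Integration (GTI) style block could
+ look like: a GCN layer mixes each tile embedding with its spatial neighbors
+ through a normalized adjacency matrix, and a multi-head attention layer then
+ mixes information globally across all tiles. The block in the paper is more
+ elaborate; the dimensions, normalization, and residual placement here are
+ assumptions.
+
+ import torch
+ import torch.nn as nn
+
+ class GTIBlockSketch(nn.Module):
+     """GCN over neighboring tiles followed by global self-attention."""
+
+     def __init__(self, dim: int = 256, heads: int = 4):
+         super().__init__()
+         self.gcn_lin = nn.Linear(dim, dim)
+         self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+         self.norm1 = nn.LayerNorm(dim)
+         self.norm2 = nn.LayerNorm(dim)
+
+     def forward(self, feats: torch.Tensor, adj: torch.Tensor) -> torch.Tensor:
+         # feats: (N, dim) tile embeddings, adj: (N, N) binary tile adjacency.
+         a_hat = adj + torch.eye(adj.size(0), device=adj.device)         # self-loops
+         deg_inv_sqrt = a_hat.sum(-1).clamp(min=1).pow(-0.5)
+         a_norm = deg_inv_sqrt[:, None] * a_hat * deg_inv_sqrt[None, :]  # D^-1/2 A D^-1/2
+         local = self.norm1(feats + torch.relu(a_norm @ self.gcn_lin(feats)))
+         global_, _ = self.attn(local[None], local[None], local[None])   # all-to-all mixing
+         return self.norm2(local + global_[0])
+
+ # 500 tiles with 256-d embeddings and a random sparse neighborhood graph.
+ feats = torch.randn(500, 256)
+ adj = (torch.rand(500, 500) > 0.99).float()
+ adj = ((adj + adj.T) > 0).float()  # symmetrize
+ print(GTIBlockSketch()(feats, adj).shape)  # torch.Size([500, 256])
+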
+
+
+
+
+ + ☆ Recommendation of data-free class-incremental learning algorithms by + simulating future data + + +
+ Class-incremental learning deals with sequential data streams composed of
+ batches of classes. Various algorithms have been proposed to address the
+ challenging case where samples from past classes cannot be stored. However,
+ selecting an appropriate algorithm for a user-defined setting is an open
+ problem, as the relative performance of these algorithms depends on the
+ incremental settings. To solve this problem, we introduce an algorithm
+ recommendation method that simulates the future data stream. Given an initial
+ set of classes, it leverages generative models to simulate future classes from
+ the same visual domain. We evaluate recent algorithms on the simulated stream
+ and recommend the one which performs best in the user-defined incremental
+ setting. We illustrate the effectiveness of our method on three large datasets
+ using six algorithms and six incremental settings. Our method outperforms
+ competitive baselines, and its performance is close to that of an oracle
+ choosing the best algorithm in each setting. This work contributes to
+ facilitating the practical deployment of incremental learning.
+
+
+
+
+
+ + ☆ EgoLifter: Open-world 3D Segmentation for Egocentric Perception + + +
+ In this paper we present EgoLifter, a novel system that can automatically +segment scenes captured from egocentric sensors into a complete decomposition +of individual 3D objects. The system is specifically designed for egocentric +data where scenes contain hundreds of objects captured from natural +(non-scanning) motion. EgoLifter adopts 3D Gaussians as the underlying +representation of 3D scenes and objects and uses segmentation masks from the +Segment Anything Model (SAM) as weak supervision to learn flexible and +promptable definitions of object instances free of any specific object +taxonomy. To handle the challenge of dynamic objects in ego-centric videos, we +design a transient prediction module that learns to filter out dynamic objects +in the 3D reconstruction. The result is a fully automatic pipeline that is able +to reconstruct 3D object instances as collections of 3D Gaussians that +collectively compose the entire scene. We created a new benchmark on the Aria +Digital Twin dataset that quantitatively demonstrates its state-of-the-art +performance in open-world 3D segmentation from natural egocentric input. We run +EgoLifter on various egocentric activity datasets which shows the promise of +the method for 3D egocentric perception at scale. + +
+
+ comment: Preprint. Project page: https://egolifter.github.io/ +
+
+
+
+
+ + ☆ TDIP: Tunable Deep Image Processing, a Real Time Melt Pool Monitoring + Solution + + +
+ In the era of Industry 4.0, Additive Manufacturing (AM), particularly metal AM,
+ has emerged as a significant contributor due to its innovative and
+ cost-effective approach to fabricating highly intricate geometries. Despite its
+ potential, this industry still lacks real-time capable process monitoring
+ algorithms. Recent advancements in this field suggest that Melt Pool (MP)
+ signatures during the fabrication process contain crucial information about
+ process dynamics and quality. To obtain this information, various sensory
+ approaches, such as high-speed camera-based vision modules, are employed for
+ online fabrication monitoring. However, many conventional in-depth analyses
+ still cannot process all the recorded data simultaneously. Although
+ conventional Image Processing (ImP) solutions provide a targeted tunable
+ approach, they pose a trade-off between convergence certainty and convergence
+ speed. As a result, conventional methods are not suitable for a dynamically
+ changing application like MP monitoring. Therefore, this article proposes the
+ implementation of a Tunable Deep Image Processing (TDIP) method to address the
+ data-rich monitoring needs in real time. The proposed model is first trained to
+ replicate an ImP algorithm with tunable features and methodology. The TDIP
+ model is then further improved to account for MP geometries and fabrication
+ quality based on the vision input and process parameters. The TDIP model
+ achieved over 94% estimation accuracy and an R2 score of more than 96% for
+ quality, geometry, and MP signature estimation and isolation. The TDIP model
+ can process 500 images per second, whereas conventional methods take a few
+ minutes per image. This significant reduction in processing time enables the
+ integration of vision-based monitoring in real time for process and quality
+ estimation.
+
+
+
+
+
+ + ☆ QuakeSet: A Dataset and Low-Resource Models to Monitor Earthquakes + through Sentinel-1 SC + + +
+ Earthquake monitoring is necessary to promptly identify the affected areas, +the severity of the events, and, finally, to estimate damages and plan the +actions needed for the restoration process. The use of seismic stations to +monitor the strength and origin of earthquakes is limited when dealing with +remote areas (we cannot have global capillary coverage). Identification and +analysis of all affected areas is mandatory to support areas not monitored by +traditional stations. Using social media images in crisis management has proven +effective in various situations. However, they are still limited by the +possibility of using communication infrastructures in case of an earthquake and +by the presence of people in the area. Moreover, social media images and +messages cannot be used to estimate the actual severity of earthquakes and +their characteristics effectively. The employment of satellites to monitor +changes around the globe grants the possibility of exploiting instrumentation +that is not limited by the visible spectrum, the presence of land +infrastructures, and people in the affected areas. In this work, we propose a +new dataset composed of images taken from Sentinel-1 and a new series of tasks +to help monitor earthquakes from a new detailed view. Coupled with the data, we +provide a series of traditional machine learning and deep learning models as +baselines to assess the effectiveness of ML-based models in earthquake +analysis. + +
+
+ comment: Accepted at ISCRAM 2024 +
+
+
+
+
+ + ☆ Segment Any Medical Model Extended SP + + +
+ The Segment Anything Model (SAM) has drawn significant attention from +researchers who work on medical image segmentation because of its +generalizability. However, researchers have found that SAM may have limited +performance on medical images compared to state-of-the-art non-foundation +models. Regardless, the community sees potential in extending, fine-tuning, +modifying, and evaluating SAM for analysis of medical imaging. An increasing +number of works have been published focusing on the mentioned four directions, +where variants of SAM are proposed. To this end, a unified platform helps push +the boundary of the foundation model for medical images, facilitating the use, +modification, and validation of SAM and its variants in medical image +segmentation. In this work, we introduce SAMM Extended (SAMME), a platform that +integrates new SAM variant models, adopts faster communication protocols, +accommodates new interactive modes, and allows for fine-tuning of subcomponents +of the models. These features can expand the potential of foundation models +like SAM, and the results can be translated to applications such as +image-guided therapy, mixed reality interaction, robotic navigation, and data +augmentation. + +
+
+ comment: The content of the manuscript has been presented in SPIE Medical + Imaging 2024, and had been accepted to appear in the proceedings of the + conference +
+
+
+
+
+ + ☆ Mathematical Foundation and Corrections for Full Range Head Pose + Estimation + + +
+ Numerous works concerning head pose estimation (HPE) offer algorithms or
+ propose neural network-based approaches for extracting Euler angles either from
+ facial key points or directly from images of the head region. However, many
+ works fail to provide clear definitions of the coordinate systems and the Euler
+ or Tait-Bryan angle orders in use. It is a well-known fact that rotation
+ matrices depend on coordinate systems, and that yaw, roll, and pitch angles are
+ sensitive to their application order. Without precise definitions, it becomes
+ challenging to validate the correctness of the output head poses and the
+ drawing routines employed in prior works. In this paper, we thoroughly examine
+ the Euler angles defined in the 300W-LP dataset, head pose estimation methods
+ such as 3DDFA-v2, 6D-RepNet, and WHENet, and the validity of their drawing
+ routines for the Euler angles. When necessary, we infer their coordinate system
+ and the sequence of yaw, roll, and pitch from the provided code. This paper
+ presents (1) code and algorithms for inferring the coordinate system and the
+ Euler angle application order from provided source code, and for extracting
+ precise rotation matrices and Euler angles, (2) code and algorithms for
+ converting poses from one rotation system to another, (3) novel formulae for 2D
+ augmentations of the rotation matrices, and (4) derivations and code for the
+ correct drawing routines for rotation matrices and poses. This paper also
+ addresses the feasibility of defining rotations with the right-handed
+ coordinate system used in Wikipedia and SciPy, which makes Euler angle
+ extraction much easier for full-range head pose research.
+
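+ The order sensitivity emphasized above is easy to demonstrate with SciPy's
+ right-handed rotation utilities: the same yaw, pitch, and roll values produce
+ different rotation matrices under different application orders, so a head-pose
+ pipeline must state its convention explicitly. This is a generic illustration
+ rather than the paper's code; the angle values are arbitrary.
+
+ import numpy as np
+ from scipy.spatial.transform import Rotation as R
+
+ yaw, pitch, roll = 30.0, 10.0, -5.0  # degrees
+
+ # Intrinsic rotations applied in two different orders (both right-handed).
+ R_zyx = R.from_euler("ZYX", [yaw, pitch, roll], degrees=True).as_matrix()
+ R_xyz = R.from_euler("XYZ", [roll, pitch, yaw], degrees=True).as_matrix()
+
+ print(np.allclose(R_zyx, R_xyz))      # False: the application order matters
+ print(np.round(R_zyx - R_xyz, 3))     # nonzero difference
+
+ # Recovering angles only makes sense w.r.t. the order used to build the matrix.
+ print(R.from_matrix(R_zyx).as_euler("ZYX", degrees=True))  # ~[30, 10, -5]
+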
+
+
+
+
+ + ☆ Tutorial on Diffusion Models for Imaging and Vision + + +
+ The astonishing growth of generative tools in recent years has empowered many +exciting applications in text-to-image generation and text-to-video generation. +The underlying principle behind these generative tools is the concept of +diffusion, a particular sampling mechanism that has overcome some shortcomings +that were deemed difficult in the previous approaches. The goal of this +tutorial is to discuss the essential ideas underlying the diffusion models. The +target audience of this tutorial includes undergraduate and graduate students +who are interested in doing research on diffusion models or applying these +models to solve other problems. + +
+
+
+
+
+ + ☆ Efficient Multi-Band Temporal Video Filter for Reducing Human-Robot + Interaction + + +
+ Although mobile robots have on-board sensors to perform navigation, their +efficiency in completing paths can be enhanced by planning to avoid human +interaction. Infrastructure cameras can capture human activity continuously for +the purpose of compiling activity analytics to choose efficient times and +routes. We describe a cascade temporal filtering method to efficiently extract +short- and long-term activity in two time dimensions, isochronal and +chronological, for use in global path planning and local navigation +respectively. The temporal filter has application either independently, or, if +object recognition is also required, it can be used as a pre-filter to perform +activity-gating of the more computationally expensive neural network +processing. For a testbed 32-camera network, we show how this hybrid approach +can achieve over 8 times improvement in frames per second throughput and 6.5 +times reduction of system power use. We also show how the cost map of static +objects in the ROS robot software development framework is augmented with +dynamic regions determined from the temporal filter. + +
+
+ comment: 15 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ A Personalized Video-Based Hand Taxonomy: Application for Individuals + with Spinal Cord Injury + + +
+ Hand function is critical for our interactions and quality of life. Spinal cord
+ injuries (SCI) can impair hand function, reducing independence. A comprehensive
+ evaluation of function in home and community settings requires a hand grasp
+ taxonomy for individuals with impaired hand function. Developing such a
+ taxonomy is challenging due to unrepresented grasp types in standard
+ taxonomies, uneven data distribution across injury levels, and limited data.
+ This study aims to automatically identify the dominant distinct hand grasps in
+ egocentric video using semantic clustering. Egocentric video recordings
+ collected in the homes of 19 individuals with cervical SCI were used to cluster
+ grasping actions with semantic significance. A deep learning model integrating
+ posture and appearance data was employed to create a personalized hand
+ taxonomy. Quantitative analysis reveals a cluster purity of 67.6% +- 24.2% with
+ 18.0% +- 21.8% redundancy. Qualitative assessment revealed meaningful clusters
+ in the video content. This methodology provides a flexible and effective
+ strategy to analyze hand function in the wild. It offers researchers and
+ clinicians an efficient tool for evaluating hand function, aiding sensitive
+ assessments and tailored intervention plans.
+
+
+
+
+
+ + ☆ OCAI: Improving Optical Flow Estimation by Occlusion and Consistency + Aware Interpolation CVPR 2024 + + +
+ The scarcity of ground-truth labels poses one major challenge in developing +optical flow estimation models that are both generalizable and robust. While +current methods rely on data augmentation, they have yet to fully exploit the +rich information available in labeled video sequences. We propose OCAI, a +method that supports robust frame interpolation by generating intermediate +video frames alongside optical flows in between. Utilizing a forward warping +approach, OCAI employs occlusion awareness to resolve ambiguities in pixel +values and fills in missing values by leveraging the forward-backward +consistency of optical flows. Additionally, we introduce a teacher-student +style semi-supervised learning method on top of the interpolated frames. Using +a pair of unlabeled frames and the teacher model's predicted optical flow, we +generate interpolated frames and flows to train a student model. The teacher's +weights are maintained using Exponential Moving Averaging of the student. Our +evaluations demonstrate perceptually superior interpolation quality and +enhanced optical flow accuracy on established benchmarks such as Sintel and +KITTI. + +
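+ A small generic sketch of the teacher-weight maintenance mentioned above: the
+ teacher's parameters are kept as an exponential moving average (EMA) of the
+ student's parameters after each update. The decay value and toy model are
+ assumptions.
+
+ import copy
+ import torch
+ import torch.nn as nn
+
+ @torch.no_grad()
+ def ema_update(teacher: nn.Module, student: nn.Module, decay: float = 0.999) -> None:
+     """teacher <- decay * teacher + (1 - decay) * student, parameter-wise."""
+     for t_param, s_param in zip(teacher.parameters(), student.parameters()):
+         t_param.mul_(decay).add_(s_param, alpha=1.0 - decay)
+
+ student = nn.Linear(8, 2)
+ teacher = copy.deepcopy(student)   # start from identical weights
+
+ # ... after each student optimizer step:
+ ema_update(teacher, student)
+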
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ EgoPoseFormer: A Simple Baseline for Egocentric 3D Human Pose Estimation + + +
+ We present EgoPoseFormer, a simple yet effective transformer-based model for +stereo egocentric human pose estimation. The main challenge in egocentric pose +estimation is overcoming joint invisibility, which is caused by self-occlusion +or a limited field of view (FOV) of head-mounted cameras. Our approach +overcomes this challenge by incorporating a two-stage pose estimation paradigm: +in the first stage, our model leverages the global information to estimate each +joint's coarse location, then in the second stage, it employs a DETR style +transformer to refine the coarse locations by exploiting fine-grained stereo +visual features. In addition, we present a deformable stereo operation to +enable our transformer to effectively process multi-view features, which +enables it to accurately localize each joint in the 3D world. We evaluate our +method on the stereo UnrealEgo dataset and show it significantly outperforms +previous approaches while being computationally efficient: it improves MPJPE by +27.4mm (45% improvement) with only 7.9% model parameters and 13.1% FLOPs +compared to the state-of-the-art. Surprisingly, with proper training +techniques, we find that even our first-stage pose proposal network can achieve +superior performance compared to previous arts. We also show that our method +can be seamlessly extended to monocular settings, which achieves +state-of-the-art performance on the SceneEgo dataset, improving MPJPE by 25.5mm +(21% improvement) compared to the best existing method with only 60.7% model +parameters and 36.4% FLOPs. + +
+
+ comment: Tech Report +
+
+
+
+
+ + ☆ Every Shot Counts: Using Exemplars for Repetition Counting in Videos + + +
+ Video repetition counting infers the number of repetitions of recurring +actions or motion within a video. We propose an exemplar-based approach that +discovers visual correspondence of video exemplars across repetitions within +target videos. Our proposed Every Shot Counts (ESCounts) model is an +attention-based encoder-decoder that encodes videos of varying lengths +alongside exemplars from the same and different videos. In training, ESCounts +regresses locations of high correspondence to the exemplars within the video. +In tandem, our method learns a latent that encodes representations of general +repetitive motions, which we use for exemplar-free, zero-shot inference. +Extensive experiments over commonly used datasets (RepCount, Countix, and +UCFRep) showcase ESCounts obtaining state-of-the-art performance across all +three datasets. On RepCount, ESCounts increases the off-by-one from 0.39 to +0.56 and decreases the mean absolute error from 0.38 to 0.21. Detailed +ablations further demonstrate the effectiveness of our method. + +
+
+ comment: Project website: https://sinhasaptarshi.github.io/escounts +
+
+
+
+
+ + ☆ State of the art applications of deep learning within tracking and + detecting marine debris: A survey + + +
+ Deep learning techniques have been explored within the marine litter problem
+ for approximately 20 years, but the majority of the research has developed
+ rapidly in the last five years. We provide an in-depth, up-to-date summary and
+ analysis of 28 of the most recent and significant contributions of deep
+ learning in marine debris. Cross-referencing the research paper results shows
+ that the YOLO family significantly outperforms all other methods of object
+ detection, but many respected contributions to this field have categorically
+ agreed that a comprehensive database of underwater debris is not currently
+ available for machine learning. Using a small dataset curated and labelled by
+ us, we tested YOLOv5 on a binary classification task and found that the
+ accuracy was low and the rate of false positives was high, highlighting the
+ importance of a comprehensive database. We conclude this survey with over 40
+ future research recommendations and open challenges.
+
+
+ comment: Review paper, 60 pages including references, 1 figure, 3 tables, 1 + supplementary data +
+
+
+
+
+ + ☆ Spectral Convolutional Transformer: Harmonizing Real vs. Complex + Multi-View Spectral Operators for Vision Transformer + + +
+ Transformers used in vision have been investigated through diverse
+ architectures - ViT, PVT, and Swin. These have worked to improve the attention
+ mechanism and make it more efficient. Separately, the need to include local
+ information led to incorporating convolutions in transformers such as CPVT and
+ CvT. Global information is captured using a complex Fourier basis to achieve
+ global token mixing through various methods, such as AFNO, GFNet, and
+ Spectformer. We advocate combining three diverse views of the data - local,
+ global, and long-range dependence. We also investigate the simplest global
+ representation, using only the real-domain spectral representation obtained
+ through the Hartley transform. We use a convolutional operator in the initial
+ layers to capture local information. Through these two contributions, we are
+ able to optimize and obtain a spectral convolution transformer (SCT) that
+ provides improved performance over state-of-the-art methods while reducing the
+ number of parameters. Through extensive experiments, we show that SCT-C-small
+ gives state-of-the-art performance on the ImageNet dataset, reaching 84.5%
+ top-1 accuracy, while SCT-C-Large reaches 85.9% and SCT-C-Huge reaches 86.4%.
+ We evaluate SCT on transfer learning on datasets such as CIFAR-10, CIFAR-100,
+ Oxford Flowers, and Stanford Cars. We also evaluate SCT on downstream tasks,
+ i.e., instance segmentation on the MSCOCO dataset. The project page is
+ available at https://github.com/badripatro/sct.
+
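+ The real-domain spectral representation mentioned above can be computed from
+ the discrete Hartley transform, which for real input equals the real part of
+ the DFT minus its imaginary part. Below is a hedged sketch of a Hartley-based
+ global token-mixing step, not the SCT implementation; the shapes and the
+ learnable spectral filter are assumptions.
+
+ import torch
+ import torch.nn as nn
+
+ def hartley_transform(x: torch.Tensor, dim: int = 1) -> torch.Tensor:
+     """Discrete Hartley transform: DHT(x) = Re(FFT(x)) - Im(FFT(x))."""
+     X = torch.fft.fft(x, dim=dim)
+     return X.real - X.imag
+
+ class HartleyMixerSketch(nn.Module):
+     """Mix tokens globally in the (real-valued) Hartley domain."""
+
+     def __init__(self, num_tokens: int, dim: int):
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(num_tokens, dim))  # spectral filter
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         # x: (batch, tokens, dim)
+         h = hartley_transform(x, dim=1) * self.weight
+         # The DHT is an involution up to a factor N, so applying it again maps back.
+         return hartley_transform(h, dim=1) / x.shape[1]
+
+ tokens = torch.randn(2, 196, 384)
+ print(HartleyMixerSketch(196, 384)(tokens).shape)  # torch.Size([2, 196, 384])
+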
+
+
+
+
+ + ☆ Global Point Cloud Registration Network for Large Transformations + + +
+ Three-dimensional data registration is an established yet challenging problem
+ that is key in many different applications, such as mapping the environment for
+ autonomous vehicles and modeling objects and people for avatar creation, among
+ many others. Registration refers to the process of mapping multiple data into
+ the same coordinate system by means of matching correspondences and
+ transformation estimation. Novel proposals exploit the benefits of deep
+ learning architectures for this purpose, as they learn the best features for
+ the data, providing better matches and hence better results. However, the state
+ of the art is usually focused on cases of relatively small transformations,
+ although in certain applications and in real, practical environments, large
+ transformations are very common. In this paper, we present ReLaTo (Registration
+ for Large Transformations), an architecture that handles cases where large
+ transformations occur while maintaining good performance for local
+ transformations. This proposal uses a novel Softmax pooling layer to find
+ correspondences in a bilateral consensus manner between two point sets,
+ sampling the most confident matches. These matches are used to estimate a
+ coarse and global registration using weighted Singular Value Decomposition
+ (SVD). A target-guided denoising step is then applied to both the obtained
+ matches and the latent features, estimating the final fine registration while
+ considering the local geometry. All these steps are carried out in an
+ end-to-end manner, and the approach is shown to outperform 10 state-of-the-art
+ registration methods on two datasets commonly used for this task (ModelNet40
+ and KITTI), especially in the case of large transformations.
+
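+ The coarse global registration step described above, a weighted Singular Value
+ Decomposition over confident matches, corresponds to a weighted
+ Kabsch/Procrustes solve. A generic sketch under the assumption of given
+ correspondences and per-match confidence weights (not the ReLaTo code):
+
+ import numpy as np
+
+ def weighted_rigid_fit(src: np.ndarray, dst: np.ndarray, w: np.ndarray):
+     """Find R, t minimizing sum_i w_i * ||R @ src_i + t - dst_i||^2.
+
+     src, dst: (N, 3) corresponding points, w: (N,) non-negative weights.
+     """
+     w = w / w.sum()
+     src_c = src - (w[:, None] * src).sum(0)          # weighted centering
+     dst_c = dst - (w[:, None] * dst).sum(0)
+     H = src_c.T @ (w[:, None] * dst_c)               # 3x3 weighted covariance
+     U, _, Vt = np.linalg.svd(H)
+     d = np.sign(np.linalg.det(Vt.T @ U.T))           # guard against reflections
+     R = Vt.T @ np.diag([1.0, 1.0, d]) @ U.T
+     t = (w[:, None] * dst).sum(0) - R @ (w[:, None] * src).sum(0)
+     return R, t
+
+ # Synthetic check: recover a known large rotation and translation.
+ rng = np.random.default_rng(0)
+ src = rng.normal(size=(100, 3))
+ angle = np.deg2rad(120.0)
+ R_true = np.array([[np.cos(angle), -np.sin(angle), 0.0],
+                    [np.sin(angle),  np.cos(angle), 0.0],
+                    [0.0,            0.0,           1.0]])
+ dst = src @ R_true.T + np.array([2.0, -1.0, 0.5])
+ R_est, t_est = weighted_rigid_fit(src, dst, np.ones(100))
+ print(np.allclose(R_est, R_true, atol=1e-6), np.round(t_est, 3))
+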
+
+
+
+
+ + ☆ TGGLinesPlus: A robust topological graph-guided computer vision + algorithm for line detection from images + + +
+ Line detection is a classic and essential problem in image processing, +computer vision and machine intelligence. Line detection has many important +applications, including image vectorization (e.g., document recognition and art +design), indoor mapping, and important societal challenges (e.g., sea ice +fracture line extraction from satellite imagery). Many line detection +algorithms and methods have been developed, but robust and intuitive methods +are still lacking. In this paper, we proposed and implemented a topological +graph-guided algorithm, named TGGLinesPlus, for line detection. Our experiments +on images from a wide range of domains have demonstrated the flexibility of our +TGGLinesPlus algorithm. We also benchmarked our algorithm with five classic and +state-of-the-art line detection methods and the results demonstrate the +robustness of TGGLinesPlus. We hope our open-source implementation of +TGGLinesPlus will inspire and pave the way for many applications where spatial +science matters. + +
+
+ comment: Our TGGLinesPlus Python implementation is open source. 27 pages, 8 + figures and 4 tables +
+
+
+
+
+ + ☆ Move as You Say, Interact as You Can: Language-guided Human Motion + Generation with Scene Affordance CVPR 2024 + + +
+ Despite significant advancements in text-to-motion synthesis, generating
+ language-guided human motion within 3D environments poses substantial
+ challenges. These challenges stem primarily from (i) the absence of powerful
+ generative models capable of jointly modeling natural language, 3D scenes, and
+ human motion, and (ii) the generative models' intensive data requirements
+ contrasted with the scarcity of comprehensive, high-quality,
+ language-scene-motion datasets. To tackle these issues, we introduce a novel
+ two-stage framework that employs scene affordance as an intermediate
+ representation, effectively linking 3D scene grounding and conditional motion
+ generation. Our framework comprises an Affordance Diffusion Model (ADM) for
+ predicting explicit affordance maps and an Affordance-to-Motion Diffusion Model
+ (AMDM) for generating plausible human motions. By leveraging scene affordance
+ maps, our method overcomes the difficulty of generating human motion under
+ multimodal condition signals, especially when training with limited data
+ lacking extensive language-scene-motion pairs. Our extensive experiments
+ demonstrate that our approach consistently outperforms all baselines on
+ established benchmarks, including HumanML3D and HUMANISE. Additionally, we
+ validate our model's exceptional generalization capabilities on a specially
+ curated evaluation set featuring previously unseen descriptions and scenes.
+
+
+ comment: CVPR 2024; 16 pages +
+
+
+
+
+ + ☆ Bidirectional Consistency Models + + +
+ Diffusion models (DMs) are capable of generating remarkably high-quality +samples by iteratively denoising a random vector, a process that corresponds to +moving along the probability flow ordinary differential equation (PF ODE). +Interestingly, DMs can also invert an input image to noise by moving backward +along the PF ODE, a key operation for downstream tasks such as interpolation +and image editing. However, the iterative nature of this process restricts its +speed, hindering its broader application. Recently, Consistency Models (CMs) +have emerged to address this challenge by approximating the integral of the PF +ODE, thereby bypassing the need to iterate. Yet, the absence of an explicit ODE +solver complicates the inversion process. To resolve this, we introduce the +Bidirectional Consistency Model (BCM), which learns a single neural network +that enables both forward and backward traversal along the PF ODE, efficiently +unifying generation and inversion tasks within one framework. Notably, our +proposed method enables one-step generation and inversion while also allowing +the use of additional steps to enhance generation quality or reduce +reconstruction error. Furthermore, by leveraging our model's bidirectional +consistency, we introduce a sampling strategy that can enhance FID while +preserving the generated image content. We further showcase our model's +capabilities in several downstream tasks, such as interpolation and inpainting, +and present demonstrations of potential applications, including blind +restoration of compressed images and defending black-box adversarial attacks. + +
+
+ comment: 40 pages, 25 figures +
+
+
+
+
+ + ☆ SpectralWaste Dataset: Multimodal Data for Waste Sorting Automation + + +
+ The increase in non-biodegradable waste is a worldwide concern. Recycling +facilities play a crucial role, but their automation is hindered by the complex +characteristics of waste recycling lines like clutter or object deformation. In +addition, the lack of publicly available labeled data for these environments +makes developing robust perception systems challenging. Our work explores the +benefits of multimodal perception for object segmentation in real waste +management scenarios. First, we present SpectralWaste, the first dataset +collected from an operational plastic waste sorting facility that provides +synchronized hyperspectral and conventional RGB images. This dataset contains +labels for several categories of objects that commonly appear in sorting plants +and need to be detected and separated from the main trash flow for several +reasons, such as security in the management line or reuse. Additionally, we +propose a pipeline employing different object segmentation architectures and +evaluate the alternatives on our dataset, conducting an extensive analysis for +both multimodal and unimodal alternatives. Our evaluation pays special +attention to efficiency and suitability for real-time processing and +demonstrates how HSI can bring a boost to RGB-only perception in these +realistic industrial settings without much computational overhead. + +
+
+
+
+
+ + ☆ Predicting species occurrence patterns from partial observations ICLR 2024 + + +
+ To address the interlinked biodiversity and climate crises, we need an +understanding of where species occur and how these patterns are changing. +However, observational data on most species remains very limited, and the +amount of data available varies greatly between taxonomic groups. We introduce +the problem of predicting species occurrence patterns given (a) satellite +imagery, and (b) known information on the occurrence of other species. To +evaluate algorithms on this task, we introduce SatButterfly, a dataset of +satellite images, environmental data and observational data for butterflies, +which is designed to pair with the existing SatBird dataset of bird +observational data. To address this task, we propose a general model, R-Tran, +for predicting species occurrence patterns that enables the use of partial +observational data wherever found. We find that R-Tran outperforms other +methods in predicting species encounter rates with partial information both +within a taxon (birds) and across taxa (birds and butterflies). Our approach +opens new perspectives to leveraging insights from species with abundant data +to other species with scarce data, by modelling the ecosystems in which they +co-occur. + +
+
+ comment: Tackling Climate Change with Machine Learning workshop at ICLR 2024 +
+
+
+
+
+ + ☆ Text Is MASS: Modeling as Stochastic Embedding for Text-Video Retrieval CVPR 2024 + + +
+ The increasing prevalence of video clips has sparked growing interest in +text-video retrieval. Recent advances focus on establishing a joint embedding +space for text and video, relying on consistent embedding representations to +compute similarity. However, the text content in existing datasets is generally +short and concise, making it hard to fully describe the redundant semantics of +a video. Correspondingly, a single text embedding may be less expressive to +capture the video embedding and empower the retrieval. In this study, we +propose a new stochastic text modeling method T-MASS, i.e., text is modeled as +a stochastic embedding, to enrich text embedding with a flexible and resilient +semantic range, yielding a text mass. To be specific, we introduce a +similarity-aware radius module to adapt the scale of the text mass upon the +given text-video pairs. Plus, we design and develop a support text +regularization to further control the text mass during the training. The +inference pipeline is also tailored to fully exploit the text mass for accurate +retrieval. Empirical evidence suggests that T-MASS not only effectively +attracts relevant text-video pairs while distancing irrelevant ones, but also +enables the determination of precise text embeddings for relevant pairs. Our +experimental results show a substantial improvement of T-MASS over baseline (3% +to 6.3% by R@1). Also, T-MASS achieves state-of-the-art performance on five +benchmark datasets, including MSRVTT, LSMDC, DiDeMo, VATEX, and Charades. + +
+
+ comment: Accepted by CVPR 2024, code and model are available at + https://github.com/Jiamian-Wang/T-MASS-text-video-retrieval +
+
+
+
+
+ + ☆ GTA-HDR: A Large-Scale Synthetic Dataset for HDR Image Reconstruction + + +
+ High Dynamic Range (HDR) content (i.e., images and videos) has a broad range +of applications. However, capturing HDR content from real-world scenes is +expensive and time-consuming. Therefore, the challenging task of reconstructing +visually accurate HDR images from their Low Dynamic Range (LDR) counterparts is +gaining attention in the vision research community. A major challenge in this +research problem is the lack of datasets, which capture diverse scene +conditions (e.g., lighting, shadows, weather, locations, landscapes, objects, +humans, buildings) and various image features (e.g., color, contrast, +saturation, hue, luminance, brightness, radiance). To address this gap, in this +paper, we introduce GTA-HDR, a large-scale synthetic dataset of photo-realistic +HDR images sampled from the GTA-V video game. We perform thorough evaluation of +the proposed dataset, which demonstrates significant qualitative and +quantitative improvements of the state-of-the-art HDR image reconstruction +methods. Furthermore, we demonstrate the effectiveness of the proposed dataset +and its impact on additional computer vision tasks including 3D human pose +estimation, human body part segmentation, and holistic scene segmentation. The +dataset, data collection pipeline, and evaluation code are available at: +https://github.com/HrishavBakulBarua/GTA-HDR. + +
+
+ comment: Submitted to IEEE +
+
+
+
+
+ + ☆ Noise2Noise Denoising of CRISM Hyperspectral Data ICLR 2024 + + +
+ Hyperspectral data acquired by the Compact Reconnaissance Imaging +Spectrometer for Mars (CRISM) have allowed for unparalleled mapping of the +surface mineralogy of Mars. Due to sensor degradation over time, a significant +portion of the recently acquired data is considered unusable. Here a new +data-driven model architecture, Noise2Noise4Mars (N2N4M), is introduced to +remove noise from CRISM images. Our model is self-supervised and does not +require zero-noise target data, making it well suited for use in Planetary +Science applications where high quality labelled data is scarce. We demonstrate +its strong performance on synthetic-noise data and CRISM images, and its impact +on downstream classification performance, outperforming benchmark methods on +most metrics. This allows for detailed analysis for critical sites of interest +on the Martian surface, including proposed lander sites. + +
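+ A generic Noise2Noise-style training loop of the kind N2N4M builds on: the
+ network maps one noisy realization of a signal to a second, independent noisy
+ realization of the same signal, so no zero-noise targets are required. The tiny
+ 1D spectral CNN and the Gaussian noise model are illustrative assumptions, not
+ the paper's architecture or noise model.
+
+ import torch
+ import torch.nn as nn
+
+ # Toy "clean" spectra (batch of 256 spectra, 240 spectral channels).
+ torch.manual_seed(0)
+ clean = torch.sin(torch.linspace(0, 6.28, 240)).repeat(256, 1).unsqueeze(1)
+
+ model = nn.Sequential(                     # tiny 1D denoiser over the spectral axis
+     nn.Conv1d(1, 16, 5, padding=2), nn.ReLU(),
+     nn.Conv1d(16, 1, 5, padding=2),
+ )
+ opt = torch.optim.Adam(model.parameters(), lr=1e-3)
+
+ for step in range(200):
+     # Two independent noisy realizations of the same underlying spectra.
+     noisy_in = clean + 0.3 * torch.randn_like(clean)
+     noisy_target = clean + 0.3 * torch.randn_like(clean)
+     loss = nn.functional.mse_loss(model(noisy_in), noisy_target)  # no clean target used
+     opt.zero_grad()
+     loss.backward()
+     opt.step()
+
+ with torch.no_grad():
+     test = clean + 0.3 * torch.randn_like(clean)
+     print("noisy MSE:   ", nn.functional.mse_loss(test, clean).item())
+     print("denoised MSE:", nn.functional.mse_loss(model(test), clean).item())
+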
+
+ comment: 5 pages, 3 figures. Accepted as a conference paper at the ICLR 2024 + ML4RS Workshop +
+
+
+
+
+ + ☆ Semi-Supervised Image Captioning Considering Wasserstein Graph Matching + + +
+ Image captioning can automatically generate captions for given images, and the
+ key challenge is to learn a mapping function from visual features to natural
+ language features. Existing approaches are mostly supervised ones, i.e., each
+ image has a corresponding sentence in the training set. However, considering
+ that describing images always requires a huge amount of manpower, we usually
+ have a limited number of described images (i.e., image-text pairs) and a large
+ number of undescribed images in real-world applications. This gives rise to the
+ problem of "Semi-Supervised Image Captioning". To solve this problem, we
+ propose a novel Semi-Supervised Image Captioning method considering Wasserstein
+ Graph Matching (SSIC-WGM), which adopts the raw image inputs to supervise the
+ generated sentences. Different from traditional single-modal semi-supervised
+ methods, the difficulty of semi-supervised cross-modal learning lies in
+ constructing intermediately comparable information among heterogeneous
+ modalities. In this paper, SSIC-WGM adopts scene graphs as intermediate
+ information and constrains the generated sentences from two aspects: 1)
+ inter-modal consistency. SSIC-WGM constructs the scene graphs of the raw image
+ and the generated sentence respectively, then employs the Wasserstein distance
+ to better measure the similarity between region embeddings of different graphs.
+ 2) intra-modal consistency. SSIC-WGM applies data augmentation techniques to
+ the raw images, then constrains the consistency among augmented images and
+ generated sentences. Consequently, SSIC-WGM combines cross-modal pseudo
+ supervision and a structure-invariant measure to efficiently use the
+ undescribed images and learn a more reasonable mapping function.
+
+
+
+
+
+ + ☆ Deep Learning for Segmentation of Cracks in High-Resolution Images of + Steel Bridges + + +
+ Automating current bridge visual inspection practices using drones and image
+ processing techniques is a prominent way to make these inspections more
+ effective, robust, and less expensive. In this paper, we investigate the
+ development of a novel deep-learning method for the detection of fatigue cracks
+ in high-resolution images of steel bridges. First, we present a novel and
+ challenging dataset comprising images of cracks in steel bridges. Secondly, we
+ integrate the ConvNext neural network with a previous state-of-the-art
+ encoder-decoder network for crack segmentation. We study and report the effects
+ of using background patches on network performance when applied to
+ high-resolution images of cracks in steel bridges. Finally, we introduce a loss
+ function that allows the use of more background patches for the training
+ process, which yields a significant reduction in false positive rates.
+
+
+
+
+
+ + ☆ Solution for Point Tracking Task of ICCV 1st Perception Test Challenge + 2023 + + +
+ This report proposes an improved method for the Tracking Any Point (TAP) task,
+ which tracks any physical surface through a video. Several existing approaches
+ have explored TAP by considering temporal relationships to obtain smooth point
+ motion trajectories; however, they still suffer from the cumulative error
+ caused by temporal prediction. To address this issue, we propose a simple yet
+ effective approach called TAP with confident static points (TAPIR+), which
+ focuses on rectifying the tracking of static points in videos shot by a static
+ camera. Specifically, our approach contains two key components: (1)
+ Multi-granularity Camera Motion Detection, which identifies whether a video
+ sequence was shot by a static camera. (2) CMR-based point trajectory prediction
+ with a moving object segmentation approach to isolate static points from moving
+ objects. Our approach ranked first in the final test with a score of 0.46.
+
+
+
+
+
+ + ☆ Labeling subtypes in a Parkinson's Cohort using Multifeatures in MRI -- + Integrating Grey and White Matter Information + + +
+ Thresholding of networks has long posed a challenge in brain connectivity
+ analysis. Weighted networks are typically binarized using threshold measures to
+ facilitate network analysis. Previous studies on MRI-based brain networks have
+ predominantly utilized density or sparsity-based thresholding techniques,
+ optimized within specific ranges derived from network metrics such as path
+ length, clustering coefficient, and small-world index. Thus, the determination
+ of a single threshold value for facilitating comparative analysis of networks
+ remains elusive. To address this, our study introduces Mutual K-Nearest
+ Neighbor (MKNN)-based thresholding for brain network analysis. Here, nearest
+ neighbor selection is based on the highest correlation between features of
+ brain regions. Construction of brain networks was accomplished by computing
+ Pearson correlations between grey matter volume and white matter volume for
+ each pair of brain regions. Structural MRI data from 180 Parkinson's patients
+ and 70 controls from NIMHANS, India, were analyzed. Subtypes within Parkinson's
+ disease were identified based on grey and white matter volume atrophy using
+ source-based morphometric decomposition. The loading coefficients were
+ correlated with clinical features to discern the clinical relationship with the
+ deciphered subtypes. Our data-mining approach revealed: Subtype A (N = 51,
+ intermediate type), Subtype B (N = 57, mild-severe type with mild motor
+ symptoms), and Subtype AB (N = 36, most-severe type with predominance in motor
+ impairment). Subtype-specific weighted matrices were binarized using MKNN-based
+ thresholding for brain network analysis. Permutation tests on network metrics
+ of the resulting bipartite graphs demonstrated significant group differences in
+ betweenness centrality and participation coefficient. The identified hubs were
+ specific to each subtype, with some hubs conserved across different subtypes.
+
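+ A small sketch of Mutual K-Nearest Neighbor (MKNN) thresholding as described
+ above: starting from a region-by-region correlation matrix, an edge is kept
+ only when each region is among the other's k most correlated neighbors. The
+ value of k and the random input matrix are illustrative assumptions.
+
+ import numpy as np
+
+ def mknn_binarize(corr: np.ndarray, k: int = 10) -> np.ndarray:
+     """Binarize a weighted connectivity matrix with mutual k-nearest neighbors."""
+     n = corr.shape[0]
+     c = corr.copy()
+     np.fill_diagonal(c, -np.inf)                     # ignore self-connections
+     # knn[i, j] = True if j is among the k most correlated neighbors of i.
+     nbr_idx = np.argsort(-c, axis=1)[:, :k]
+     knn = np.zeros((n, n), dtype=bool)
+     rows = np.repeat(np.arange(n), k)
+     knn[rows, nbr_idx.ravel()] = True
+     return (knn & knn.T).astype(int)                 # keep only mutual neighbors
+
+ # Example with a random symmetric "correlation" matrix over 90 brain regions.
+ rng = np.random.default_rng(0)
+ m = rng.uniform(-1, 1, size=(90, 90))
+ corr = (m + m.T) / 2
+ adj = mknn_binarize(corr, k=10)
+ print(adj.shape, adj.sum() // 2, "mutual edges")     # symmetric binary adjacency
+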
+
+ comment: 31 pages, 10 figures, 3 tables +
+
+
+
+
+ + ☆ Predicting risk of cardiovascular disease using retinal OCT imaging + + +
+ We investigated the potential of optical coherence tomography (OCT) as an
+ additional imaging technique to predict future cardiovascular disease (CVD). We
+ utilised a self-supervised deep learning approach based on Variational
+ Autoencoders (VAE) to learn low-dimensional representations of high-dimensional
+ 3D OCT images and to capture distinct characteristics of different retinal
+ layers within the OCT image. A Random Forest (RF) classifier was subsequently
+ trained using the learned latent features and participant demographic and
+ clinical data to differentiate between patients at risk of CVD events (MI or
+ stroke) and non-CVD cases. Our predictive model, trained on multimodal data,
+ was assessed based on its ability to correctly identify individuals likely to
+ suffer from a CVD event (MI or stroke) within a 5-year interval after image
+ acquisition. Our self-supervised VAE feature selection and multimodal Random
+ Forest classifier differentiate between patients at risk of future CVD events
+ and the control group with an AUC of 0.75, outperforming the clinically
+ established QRISK3 score (AUC = 0.597). The choroidal layer visible in OCT
+ images was identified as an important predictor of future CVD events using a
+ novel approach to model explainability. Retinal OCT imaging provides a
+ cost-effective and non-invasive alternative for predicting the risk of
+ cardiovascular disease and is readily accessible in optometry practices and
+ hospitals.
+
+
+ comment: 18 pages for main manuscript, 7 figures, 2 pages for appendix and + preprint for a journal +
+
+
+
+
+ + ☆ Clinical Domain Knowledge-Derived Template Improves Post Hoc AI + Explanations in Pneumothorax Classification + + +
+ Background: Pneumothorax is an acute thoracic disease caused by abnormal air +collection between the lungs and chest wall. To address the opaqueness often +associated with deep learning (DL) models, explainable artificial intelligence +(XAI) methods have been introduced to outline regions related to pneumothorax +diagnoses made by DL models. However, these explanations sometimes diverge from +actual lesion areas, highlighting the need for further improvement. Method: We +propose a template-guided approach to incorporate the clinical knowledge of +pneumothorax into model explanations generated by XAI methods, thereby +enhancing the quality of these explanations. Utilizing one lesion delineation +created by radiologists, our approach first generates a template that +represents potential areas of pneumothorax occurrence. This template is then +superimposed on model explanations to filter out extraneous explanations that +fall outside the template's boundaries. To validate its efficacy, we carried +out a comparative analysis of three XAI methods with and without our template +guidance when explaining two DL models in two real-world datasets. Results: The +proposed approach consistently improved baseline XAI methods across twelve +benchmark scenarios built on three XAI methods, two DL models, and two +datasets. The average incremental percentages, calculated by the performance +improvements over the baseline performance, were 97.8% in Intersection over +Union (IoU) and 94.1% in Dice Similarity Coefficient (DSC) when comparing model +explanations and ground-truth lesion areas. Conclusions: In the context of +pneumothorax diagnoses, we proposed a template-guided approach for improving AI +explanations. We anticipate that our template guidance will forge a fresh +approach to elucidating AI models by integrating clinical domain expertise. + +
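+ A minimal sketch of the template-guided filtering idea: a saliency map produced
+ by any XAI method is masked element-wise by a binary template of plausible
+ pneumothorax regions before IoU or Dice is computed against the ground-truth
+ lesion. The array sizes, threshold, and toy masks are assumptions for
+ illustration.
+
+ import numpy as np
+
+ def template_filter(saliency: np.ndarray, template: np.ndarray) -> np.ndarray:
+     """Zero out explanation values that fall outside the anatomical template."""
+     return saliency * (template > 0)
+
+ def dice(pred: np.ndarray, gt: np.ndarray, eps: float = 1e-8) -> float:
+     inter = np.logical_and(pred, gt).sum()
+     return float(2 * inter / (pred.sum() + gt.sum() + eps))
+
+ # Toy example: random saliency, a band-shaped template, a small lesion mask.
+ rng = np.random.default_rng(0)
+ saliency = rng.random((256, 256))
+ template = np.zeros((256, 256), dtype=bool)
+ template[40:140, 60:220] = True
+ lesion = np.zeros((256, 256), dtype=bool)
+ lesion[80:110, 150:200] = True
+
+ raw_mask = saliency > 0.9                                # binarized raw explanation
+ guided_mask = template_filter(saliency, template) > 0.9  # template-guided explanation
+ print("raw Dice:   ", round(dice(raw_mask, lesion), 4))
+ print("guided Dice:", round(dice(guided_mask, lesion), 4))
+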
+
+
+
+
+ + ☆ SugarcaneNet2024: An Optimized Weighted Average Ensemble Approach of + LASSO Regularized Pre-trained Models for Sugarcane Disease Classification + + +
+ Sugarcane, a key crop for the world's sugar industry, is prone to several +diseases that have a substantial negative influence on both its yield and +quality. To effectively manage and implement preventative initiatives, diseases +must be detected promptly and accurately. In this study, we present a unique +model called sugarcaneNet2024 that outperforms previous methods for +automatically and quickly detecting sugarcane disease through leaf image +processing. Our proposed model consolidates an optimized weighted average +ensemble of seven customized and LASSO-regularized pre-trained models, +particularly InceptionV3, InceptionResNetV2, DenseNet201, DenseNet169, +Xception, and ResNet152V2. Initially, we added three more dense layers with +0.0001 LASSO regularization, three 30% dropout layers, and three batch +normalizations with renorm enabled at the bottom of these pre-trained models to +improve the performance. The accuracy of sugarcane leaf disease classification +was greatly increased by this addition. Following this, several comparative +studies between the average ensemble and individual models were carried out, +indicating that the ensemble technique performed better. The average ensemble +of all modified pre-trained models produced outstanding outcomes: 100%, 99%, +99%, and 99.45% for f1 score, precision, recall, and accuracy, respectively. +Performance was further enhanced by the implementation of an optimized weighted +average ensemble technique incorporated with grid search. This optimized +sugarcaneNet2024 model performed the best for detecting sugarcane diseases, +having achieved accuracy, precision, recall, and F1 score of 99.67%, 100%, +100%, and 100% , respectively. + +
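+ A hedged sketch of an optimized weighted-average ensemble selected by grid
+ search, in the spirit of the approach above: candidate weight vectors over the
+ base models' softmax outputs are enumerated on a coarse grid, and the
+ combination with the best validation accuracy is kept. The grid resolution and
+ the random stand-in predictions are assumptions.
+
+ import itertools
+ import numpy as np
+
+ def best_ensemble_weights(probs: np.ndarray, labels: np.ndarray, steps: int = 5):
+     """probs: (n_models, n_samples, n_classes) validation softmax outputs."""
+     n_models = probs.shape[0]
+     grid = np.linspace(0, 1, steps + 1)
+     best_w, best_acc = None, -1.0
+     for w in itertools.product(grid, repeat=n_models):
+         if sum(w) == 0:
+             continue
+         w = np.array(w) / sum(w)                          # normalize weights
+         fused = np.tensordot(w, probs, axes=1)            # (n_samples, n_classes)
+         acc = (fused.argmax(1) == labels).mean()
+         if acc > best_acc:
+             best_w, best_acc = w, acc
+     return best_w, best_acc
+
+ # Stand-in validation predictions for 3 base models, 200 samples, 5 classes.
+ rng = np.random.default_rng(0)
+ labels = rng.integers(0, 5, size=200)
+ probs = rng.dirichlet(np.ones(5), size=(3, 200))
+ probs[:, np.arange(200), labels] += rng.uniform(0, 1, size=(3, 200))  # informative models
+ probs /= probs.sum(-1, keepdims=True)
+ print(best_ensemble_weights(probs, labels))
+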
+
+ comment: 32 pages, 11 Figures, 13 Tables +
+
+
+
+
+ + ♻ ☆ DiVa-360: The Dynamic Visual Dataset for Immersive Neural Fields + + +
+ Advances in neural fields are enabling high-fidelity capture of the shape and +appearance of dynamic 3D scenes. However, their capabilities lag behind those +offered by conventional representations such as 2D videos because of +algorithmic challenges and the lack of large-scale multi-view real-world +datasets. We address the dataset limitation with DiVa-360, a real-world 360 +dynamic visual dataset that contains synchronized high-resolution and +long-duration multi-view video sequences of table-scale scenes captured using a +customized low-cost system with 53 cameras. It contains 21 object-centric +sequences categorized by different motion types, 25 intricate hand-object +interaction sequences, and 8 long-duration sequences for a total of 17.4 M +image frames. In addition, we provide foreground-background segmentation masks, +synchronized audio, and text descriptions. We benchmark the state-of-the-art +dynamic neural field methods on DiVa-360 and provide insights about existing +methods and future challenges on long-duration neural field capture. + +
+
+
+
+
+ + ♻ ☆ HoloVIC: Large-scale Dataset and Benchmark for Multi-Sensor Holographic + Intersection and Vehicle-Infrastructure Cooperative CVPR 2024 + + +
+ Vehicle-to-everything (V2X) has been a popular topic in the field of autonomous
+ driving in recent years, and vehicle-infrastructure cooperation (VIC) has
+ become one of its important research areas. The complexity of traffic
+ conditions, such as blind spots and occlusion, greatly limits the perception
+ capabilities of single-view roadside sensing systems. To further enhance the
+ accuracy of roadside perception and provide better information to the vehicle
+ side, in this paper we constructed holographic intersections with various
+ layouts to build a large-scale multi-sensor holographic vehicle-infrastructure
+ cooperation dataset, called HoloVIC. Our dataset includes 3 different types of
+ sensors (Camera, Lidar, Fisheye) and employs 4 sensor layouts based on the
+ different intersections. Each intersection is equipped with 6-18 sensors to
+ capture synchronous data, while autonomous vehicles pass through these
+ intersections to collect VIC data. HoloVIC contains in total 100k+ synchronous
+ frames from different sensors. Additionally, we annotated 3D bounding boxes
+ based on Camera, Fisheye, and Lidar. We also associate the IDs of the same
+ objects across different devices and consecutive frames in sequence. Based on
+ HoloVIC, we formulated four tasks to facilitate the development of related
+ research, and we provide benchmarks for these tasks.
+
+
+ comment: Accept to CVPR 2024, Benchmark Website: https://holovic.net +
+
+
+
+
+ + ♻ ☆ TRIPS: Trilinear Point Splatting for Real-Time Radiance Field Rendering + + +
+ Point-based radiance field rendering has demonstrated impressive results for
+ novel view synthesis, offering a compelling blend of rendering quality and
+ computational efficiency. However, even the latest approaches in this domain
+ are not without shortcomings. 3D Gaussian Splatting [Kerbl and Kopanas et al.
+ 2023] struggles when tasked with rendering highly detailed scenes, due to
+ blurring and cloudy artifacts. On the other hand, ADOP [Rückert et al. 2022]
+ can accommodate crisper images, but the neural reconstruction network decreases
+ performance, it grapples with temporal instability, and it is unable to
+ effectively address large gaps in the point cloud.
+
+ In this paper, we present TRIPS (Trilinear Point Splatting), an approach that
+ combines ideas from both Gaussian Splatting and ADOP. The fundamental concept
+ behind our novel technique involves rasterizing points into a screen-space
+ image pyramid, with the selection of the pyramid layer determined by the
+ projected point size. This approach allows rendering arbitrarily large points
+ using a single trilinear write. A lightweight neural network is then used to
+ reconstruct a hole-free image including detail beyond the splat resolution.
+ Importantly, our render pipeline is entirely differentiable, allowing for
+ automatic optimization of both point sizes and positions.
+
+ Our evaluation demonstrates that TRIPS surpasses existing state-of-the-art
+ methods in terms of rendering quality while maintaining a real-time frame rate
+ of 60 frames per second on readily available hardware. This performance extends
+ to challenging scenarios, such as scenes featuring intricate geometry,
+ expansive landscapes, and auto-exposed footage.
+
+ The project page is located at: https://lfranke.github.io/trips/
+
+
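+
+ As a rough illustration of the pyramid-layer selection and single trilinear
+write described above (hypothetical tensor shapes and a slow per-point loop
+for clarity; not the authors' differentiable CUDA renderer):
+
```python
import torch

def trilinear_splat(xy, radius, color, base_res=512, n_levels=8):
    """Illustrative sketch: splat points into a screen-space image pyramid.

    xy:     (N, 2) projected pixel coords (x, y) at the finest level (float)
    radius: (N,)   projected point radius in pixels (float)
    color:  (N, 3) point colors
    Returns a list of (3, H_l, W_l) pyramid images.
    """
    pyramid = [torch.zeros(3, base_res >> l, base_res >> l) for l in range(n_levels)]
    # Pick the level at which the point covers roughly one pixel.
    level = torch.log2(radius.clamp(min=1.0))
    l0 = level.floor().long().clamp(0, n_levels - 2)
    wl = (level - l0.float()).clamp(0, 1)           # blend weight toward the coarser level
    for dl, w_level in ((0, 1 - wl), (1, wl)):      # two adjacent pyramid levels
        for n in range(xy.shape[0]):
            l = int(l0[n]) + dl
            res = base_res >> l
            u, v = (xy[n] / (1 << l)).tolist()
            x0, y0 = int(u), int(v)
            for dx in (0, 1):                        # bilinear footprint within the level
                for dy in (0, 1):
                    x, y = x0 + dx, y0 + dy
                    if 0 <= x < res and 0 <= y < res:
                        w = w_level[n] * abs(1 - dx - (u - x0)) * abs(1 - dy - (v - y0))
                        pyramid[l][:, y, x] += w * color[n]
    return pyramid
```
+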
+
+
+
+ + ♻ ☆ Semi-Supervised Crowd Counting from Unlabeled Data + + +
+ Automatic crowd behavior analysis can effectively support daily
+transportation statistics and planning, which in turn supports smart city
+construction. As one of its most important components, crowd counting has
+drawn increasing attention. Recent works achieved promising performance but
+relied on the supervised paradigm with expensive crowd annotations. To
+alleviate the annotation cost in real-world transportation scenarios, in this
+work we proposed a semi-supervised learning framework, $S^{4}$Crowd, which can
+leverage both unlabeled and labeled data for robust crowd counting. In the
+unsupervised pathway, two self-supervised losses were proposed to simulate
+crowd variations such as scale and illumination, based on which pseudo labels
+carrying supervised information were generated and gradually refined. We also
+proposed a crowd-driven recurrent unit, the Gated-Crowd-Recurrent-Unit (GCRU),
+which can preserve discriminant crowd information by extracting second-order
+statistics, yielding pseudo labels with improved quality. A joint loss
+including both unsupervised and supervised information was proposed, and a
+dynamic weighting strategy was employed to balance the importance of the
+unsupervised loss and the supervised loss at different training stages. We
+conducted extensive experiments on four popular crowd counting datasets in
+semi-supervised settings. Experimental results supported the effectiveness of
+each proposed component in our $S^{4}$Crowd framework, and our method achieved
+competitive performance among semi-supervised learning approaches on these
+crowd counting datasets.
+
+
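+
+ The stage-dependent balance between the supervised and unsupervised terms can
+be sketched with a generic ramp-up schedule; the function below uses
+hypothetical names and a sigmoid-style ramp, not the paper's exact weighting:
+
```python
import torch

def joint_crowd_loss(pred_lab, gt_density, pred_unlab, pseudo_density,
                     epoch, total_epochs, max_unsup_weight=1.0):
    """Joint supervised + unsupervised density loss with a ramp-up weight (sketch).

    The unsupervised term compares predictions on unlabeled images against
    gradually refined pseudo labels; its weight grows over training so that
    early, noisy pseudo labels contribute less.
    """
    sup_loss = torch.nn.functional.mse_loss(pred_lab, gt_density)
    unsup_loss = torch.nn.functional.mse_loss(pred_unlab, pseudo_density.detach())
    ramp = torch.exp(torch.tensor(-5.0 * (1.0 - min(epoch / total_epochs, 1.0)) ** 2))
    return sup_loss + max_unsup_weight * ramp * unsup_loss
```
+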
+
+
+
+ + ♻ ☆ Efficient Pre-training for Localized Instruction Generation of Videos + + +
+ Procedural videos show step-by-step demonstrations of tasks like recipe
+preparation. Understanding such videos is challenging, involving the precise
+localization of steps and the generation of textual instructions. Manually
+annotating steps and writing instructions is costly, which limits the size of
+current datasets and hinders effective learning. Leveraging large but noisy
+video-transcript datasets for pre-training can boost performance, but demands
+significant computational resources. Furthermore, transcripts contain
+irrelevant content and exhibit style variation compared to instructions written
+by human annotators. To mitigate both issues, we propose a technique,
+Sieve-&-Swap, to automatically curate a smaller dataset: (i) Sieve filters
+irrelevant transcripts and (ii) Swap enhances the quality of the text
+instruction by automatically replacing the transcripts with human-written
+instructions from a text-only recipe dataset. The curated dataset, three orders
+of magnitude smaller than current web-scale datasets, enables efficient
+training of large-scale models with competitive performance. We complement our
+Sieve-&-Swap approach with a Procedure Transformer (ProcX) for end-to-end step
+localization and instruction generation for procedural videos. When this model
+is pre-trained on our curated dataset, it achieves state-of-the-art performance
+in zero-shot and finetuning settings on YouCook2 and Tasty, while using a
+fraction of the computational resources.
+
+
+ comment: This version has some missing experiments and elaborative technical + details +
+
+
+
+
+ + ♻ ☆ SimLVSeg: Simplifying Left Ventricular Segmentation in 2D+Time + Echocardiograms with Self- and Weakly-Supervised Learning + + +
+ Echocardiography has become an indispensable clinical imaging modality for
+general heart health assessment. From calculating biomarkers such as ejection
+fraction to the probability of a patient's heart failure, accurate segmentation
+of the heart structures allows doctors to assess the heart's condition and
+devise treatments with greater precision and accuracy. However, achieving
+accurate and reliable left ventricle segmentation is time-consuming and
+challenging for several reasons. Hence, clinicians often rely on
+segmenting the left ventricle (LV) in two specific echocardiogram frames to
+make a diagnosis. This limited coverage in manual LV segmentation poses a
+challenge for developing automatic LV segmentation with high temporal
+consistency, as the resulting dataset is typically annotated sparsely. In
+response to this challenge, this work introduces SimLVSeg, a novel paradigm
+that enables video-based networks for consistent LV segmentation from sparsely
+annotated echocardiogram videos. SimLVSeg consists of self-supervised
+pre-training with temporal masking, followed by weakly supervised learning
+tailored for LV segmentation from sparse annotations. We demonstrate how
+SimLVSeg outperforms the state-of-the-art solutions by achieving a 93.32%
+(95%CI 93.21-93.43%) dice score on the largest 2D+time echocardiography dataset
+(EchoNet-Dynamic) while being more efficient. SimLVSeg is compatible with two
+types of video segmentation networks: 2D super image and 3D segmentation. To
+show the effectiveness of our approach, we provide extensive ablation studies,
+including pre-training settings and various deep learning backbones. We further
+conduct an out-of-distribution test to showcase SimLVSeg's generalizability on
+unseen distribution (CAMUS dataset). The code is publicly available at
+https://github.com/fadamsyah/SimLVSeg.
+
+
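+
+ Weak supervision from sparsely annotated clips amounts to restricting the
+segmentation loss to the labeled frames; a minimal sketch with a generic Dice
+loss and hypothetical shapes, not the authors' exact objective:
+
```python
import torch

def sparse_frame_dice_loss(pred_masks, gt_masks, annotated, eps=1e-6):
    """Dice loss computed only on the annotated frames of an echo clip (sketch).

    pred_masks: (T, H, W) predicted LV probabilities for every frame
    gt_masks:   (T, H, W) ground-truth masks (valid only where annotated)
    annotated:  (T,) boolean mask marking the sparsely labeled frames
    """
    p = pred_masks[annotated]
    g = gt_masks[annotated].float()
    inter = (p * g).sum(dim=(1, 2))
    denom = p.sum(dim=(1, 2)) + g.sum(dim=(1, 2))
    dice = (2 * inter + eps) / (denom + eps)
    return 1.0 - dice.mean()
```
+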
+
+
+
+ + ♻ ☆ HIMap: HybrId Representation Learning for End-to-end Vectorized HD Map + Construction CVPR 2024 + + +
+ Vectorized High-Definition (HD) map construction requires predictions of the
+category and point coordinates of map elements (e.g. road boundary, lane
+divider, pedestrian crossing, etc.). State-of-the-art methods are mainly based
+on point-level representation learning for regressing accurate point
+coordinates. However, this pipeline has limitations in obtaining element-level
+information and handling element-level failures, e.g. erroneous element shape
+or entanglement between elements. To tackle the above issues, we propose a
+simple yet effective HybrId framework named HIMap to sufficiently learn both
+point-level and element-level information and let the two interact. Concretely,
+we introduce a hybrid representation called HIQuery to represent all map
+elements, and propose a point-element interactor to interactively extract and
+encode the hybrid information of elements, e.g. point position and element
+shape, into the HIQuery. Additionally, we present a point-element consistency
+constraint to enhance the consistency between the point-level and
+element-level information. Finally, the output point-element integrated HIQuery
+can be directly converted into map elements' class, point coordinates, and
+mask. We conduct extensive experiments and consistently outperform previous
+methods on both nuScenes and Argoverse2 datasets. Notably, our method achieves
+$77.8$ mAP on the nuScenes dataset, surpassing previous SOTAs by at least
+$8.3$ mAP.
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Exploiting Semantic Reconstruction to Mitigate Hallucinations in + Vision-Language Models + + +
+ Hallucinations in vision-language models pose a significant challenge to +their reliability, particularly in the generation of long captions. Current +methods fall short of accurately identifying and mitigating these +hallucinations. To address this issue, we introduce ESREAL, a novel +unsupervised learning framework designed to suppress the generation of +hallucinations through accurate localization and penalization of hallucinated +tokens. Initially, ESREAL creates a reconstructed image based on the generated +caption and aligns its corresponding regions with those of the original image. +This semantic reconstruction aids in identifying both the presence and type of +token-level hallucinations within the generated caption. Subsequently, ESREAL +computes token-level hallucination scores by assessing the semantic similarity +of aligned regions based on the type of hallucination. Finally, ESREAL employs +a proximal policy optimization algorithm, where it selectively penalizes +hallucinated tokens according to their token-level hallucination scores. Our +framework notably reduces hallucinations in LLaVA, InstructBLIP, and mPLUG-Owl2 +by 32.81%, 27.08%, and 7.46% on the CHAIR metric. This improvement is achieved +solely through signals derived from the image itself, without the need for any +image-text pairs. + +
+
+
+
+
+ + ♻ ☆ Pushing Auto-regressive Models for 3D Shape Generation at Capacity and + Scalability + + +
+ Auto-regressive models have achieved impressive results in 2D image +generation by modeling joint distributions in grid space. In this paper, we +extend auto-regressive models to 3D domains, and seek a stronger ability of 3D +shape generation by improving auto-regressive models at capacity and +scalability simultaneously. Firstly, we leverage an ensemble of publicly +available 3D datasets to facilitate the training of large-scale models. It +consists of a comprehensive collection of approximately 900,000 objects, with +multiple properties of meshes, points, voxels, rendered images, and text +captions. This diverse labeled dataset, termed Objaverse-Mix, empowers our +model to learn from a wide range of object variations. However, directly +applying 3D auto-regression encounters critical challenges of high +computational demands on volumetric grids and ambiguous auto-regressive order +along grid dimensions, resulting in inferior quality of 3D shapes. To this end, +we then present a novel framework Argus3D in terms of capacity. Concretely, our +approach introduces discrete representation learning based on a latent vector +instead of volumetric grids, which not only reduces computational costs but +also preserves essential geometric details by learning the joint distributions +in a more tractable order. The capacity of conditional generation can thus be +realized by simply concatenating various conditioning inputs to the latent +vector, such as point clouds, categories, images, and texts. In addition, +thanks to the simplicity of our model architecture, we naturally scale up our +approach to a larger model with an impressive 3.6 billion parameters, further +enhancing the quality of versatile 3D generation. Extensive experiments on four +generation tasks demonstrate that Argus3D can synthesize diverse and faithful +shapes across multiple categories, achieving remarkable performance. + +
+
+ comment: Project page: https://argus-3d.github.io/ . Datasets: + https://huggingface.co/datasets/BAAI/Objaverse-MIX. arXiv admin note: + substantial text overlap with arXiv:2303.14700 +
+
+
+
+
+ + ♻ ☆ ReMoS: 3D Motion-Conditioned Reaction Synthesis for Two-Person + Interactions + + +
+ Current approaches for 3D human motion synthesis generate high-quality +animations of digital humans performing a wide variety of actions and gestures. +However, a notable technological gap exists in addressing the complex dynamics +of multi-human interactions within this paradigm. In this work, we present +ReMoS, a denoising diffusion-based model that synthesizes full-body reactive +motion of a person in a two-person interaction scenario. Assuming the motion of +one person is given, we employ a combined spatio-temporal cross-attention +mechanism to synthesize the reactive body and hand motion of the second person, +thereby completing the interactions between the two. We demonstrate ReMoS +across challenging two-person scenarios such as pair-dancing, Ninjutsu, +kickboxing, and acrobatics, where one person's movements have complex and +diverse influences on the other. We also contribute the ReMoCap dataset for +two-person interactions containing full-body and finger motions. We evaluate +ReMoS through multiple quantitative metrics, qualitative visualizations, and a +user study, and also indicate usability in interactive motion editing +applications. + +
+
+ comment: 17 pages, 7 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ MedPromptX: Grounded Multimodal Prompting for Chest X-ray Diagnosis + + +
+ Chest X-ray images are commonly used for predicting acute and chronic
+cardiopulmonary conditions, but efforts to integrate them with structured
+clinical data face challenges due to incomplete electronic health records
+(EHR). This paper introduces MedPromptX, the first model to integrate
+multimodal large language models (MLLMs), few-shot prompting (FP) and visual
+grounding (VG) to combine imagery with EHR data for chest X-ray diagnosis. A
+pre-trained MLLM is utilized to complement the missing EHR information,
+providing a comprehensive understanding of patients' medical history.
+Additionally, FP reduces the necessity for extensive training of MLLMs while
+effectively tackling the issue of hallucination. Nevertheless, the process of
+determining the optimal number of few-shot examples and selecting high-quality
+candidates can be burdensome, yet it profoundly influences model performance.
+Hence, we propose a new technique that dynamically refines few-shot data for
+real-time adjustment to new patient scenarios. Moreover, VG aids in focusing
+the model's attention on relevant regions of interest in X-ray images,
+enhancing the identification of abnormalities. We release MedPromptX-VQA, a new
+in-context visual question answering dataset encompassing interleaved image and
+EHR data derived from MIMIC-IV and MIMIC-CXR databases. Results demonstrate the
+SOTA performance of MedPromptX, achieving an 11% improvement in F1-score
+compared to the baselines. Code and data are available at
+https://github.com/BioMedIA-MBZUAI/MedPromptX
+
+
+
+
+
+ + ♻ ☆ Text-Guided Variational Image Generation for Industrial Anomaly + Detection and Segmentation CVPR 2024 + + +
+ We propose a text-guided variational image generation method to address the +challenge of getting clean data for anomaly detection in industrial +manufacturing. Our method utilizes text information about the target object, +learned from extensive text library documents, to generate non-defective data +images resembling the input image. The proposed framework ensures that the +generated non-defective images align with anticipated distributions derived +from textual and image-based knowledge, ensuring stability and generality. +Experimental results demonstrate the effectiveness of our approach, surpassing +previous methods even with limited non-defective data. Our approach is +validated through generalization tests across four baseline models and three +distinct datasets. We present an additional analysis to enhance the +effectiveness of anomaly detection models by utilizing the generated images. + +
+
+ comment: 18 pages, Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Identity-aware Dual-constraint Network for Cloth-Changing Person + Re-identification + + +
+ Cloth-Changing Person Re-Identification (CC-ReID) aims to accurately identify +the target person in more realistic surveillance scenarios, where pedestrians +usually change their clothing. Despite great progress, limited cloth-changing +training samples in existing CC-ReID datasets still prevent the model from +adequately learning cloth-irrelevant features. In addition, due to the absence +of explicit supervision to keep the model constantly focused on +cloth-irrelevant areas, existing methods are still hampered by the disruption +of clothing variations. To solve the above issues, we propose an Identity-aware +Dual-constraint Network (IDNet) for the CC-ReID task. Specifically, to help the +model extract cloth-irrelevant clues, we propose a Clothes Diversity +Augmentation (CDA), which generates more realistic cloth-changing samples by +enriching the clothing color while preserving the texture. In addition, a +Multi-scale Constraint Block (MCB) is designed, which extracts fine-grained +identity-related features and effectively transfers cloth-irrelevant knowledge. +Moreover, a Counterfactual-guided Attention Module (CAM) is presented, which +learns cloth-irrelevant features from channel and space dimensions and utilizes +the counterfactual intervention for supervising the attention map to highlight +identity-related regions. Finally, a Semantic Alignment Constraint (SAC) is +designed to facilitate high-level semantic feature interaction. Comprehensive +experiments on four CC-ReID datasets indicate that our method outperforms prior +state-of-the-art approaches. + +
+
+
+
+
+ + ♻ ☆ Unveiling the Pitfalls of Knowledge Editing for Large Language Models ICLR 2024 + + +
+ As the cost associated with fine-tuning Large Language Models (LLMs)
+continues to rise, recent research efforts have pivoted towards developing
+methodologies to edit implicit knowledge embedded within LLMs. Yet, a dark
+cloud still lingers overhead: will knowledge editing trigger a butterfly
+effect? It remains unclear whether knowledge editing might introduce side
+effects that pose potential risks. This paper pioneers the investigation into
+the potential pitfalls associated with knowledge editing for LLMs. To achieve
+this, we introduce new benchmark datasets and propose innovative evaluation
+metrics. Our results underline two pivotal concerns: (1) Knowledge Conflict:
+Editing groups of facts that logically clash can magnify the inherent
+inconsistencies in LLMs, a facet neglected by previous methods. (2) Knowledge
+Distortion: Altering parameters with the aim of editing factual knowledge can
+irrevocably warp the innate knowledge structure of LLMs. Experimental results
+vividly demonstrate that knowledge editing might inadvertently cast a shadow
+of unintended consequences on LLMs, which warrants attention and effort in
+future work. Code and data are available at
+https://github.com/zjunlp/PitfallsKnowledgeEditing.
+
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Generative 3D Part Assembly via Part-Whole-Hierarchy Message Passing + + +
+ Generative 3D part assembly involves understanding part relationships and
+predicting their 6-DoF poses for assembling a realistic 3D shape. Prior work
+often focuses on the geometry of individual parts, neglecting part-whole
+hierarchies of objects. Leveraging two key observations: 1) super-part poses
+provide strong hints about part poses, and 2) predicting super-part poses is
+easier due to fewer super-parts, we propose a part-whole-hierarchy message
+passing network for efficient 3D part assembly. We first introduce super-parts
+by grouping geometrically similar parts without any semantic labels. Then we
+employ a part-whole hierarchical encoder, wherein a super-part encoder predicts
+latent super-part poses based on input parts. Subsequently, we transform the
+point cloud using the latent poses, feeding it to the part encoder for
+aggregating super-part information and reasoning about part relationships to
+predict all part poses. In training, only ground-truth part poses are required.
+During inference, the predicted latent poses of super-parts enhance
+interpretability. Experimental results on the PartNet dataset show that our
+method achieves state-of-the-art performance in part and connectivity accuracy
+and enables an interpretable hierarchical part assembly.
+
+
+
+
+
+ + ♻ ☆ InNeRF360: Text-Guided 3D-Consistent Object Inpainting on 360-degree + Neural Radiance Fields CVPR 2024 + + +
+ We propose InNeRF360, an automatic system that accurately removes +text-specified objects from 360-degree Neural Radiance Fields (NeRF). The +challenge is to effectively remove objects while inpainting perceptually +consistent content for the missing regions, which is particularly demanding for +existing NeRF models due to their implicit volumetric representation. Moreover, +unbounded scenes are more prone to floater artifacts in the inpainted region +than frontal-facing scenes, as the change of object appearance and background +across views is more sensitive to inaccurate segmentations and inconsistent +inpainting. With a trained NeRF and a text description, our method efficiently +removes specified objects and inpaints visually consistent content without +artifacts. We apply depth-space warping to enforce consistency across multiview +text-encoded segmentations, and then refine the inpainted NeRF model using +perceptual priors and 3D diffusion-based geometric priors to ensure visual +plausibility. Through extensive experiments in segmentation and inpainting on +360-degree and frontal-facing NeRFs, we show that our approach is effective and +enhances NeRF's editability. Project page: https://ivrl.github.io/InNeRF360. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Passive Non-Line-of-Sight Imaging with Light Transport Modulation + + +
+ Passive non-line-of-sight (NLOS) imaging has witnessed rapid development in +recent years, due to its ability to image objects that are out of sight. The +light transport condition plays an important role in this task since changing +the conditions will lead to different imaging models. Existing learning-based +NLOS methods usually train independent models for different light transport +conditions, which is computationally inefficient and impairs the practicality +of the models. In this work, we propose NLOS-LTM, a novel passive NLOS imaging +method that effectively handles multiple light transport conditions with a +single network. We achieve this by inferring a latent light transport +representation from the projection image and using this representation to +modulate the network that reconstructs the hidden image from the projection +image. We train a light transport encoder together with a vector quantizer to +obtain the light transport representation. To further regulate this +representation, we jointly learn both the reconstruction network and the +reprojection network during training. A set of light transport modulation +blocks is used to modulate the two jointly trained networks in a multi-scale +way. Extensive experiments on a large-scale passive NLOS dataset demonstrate +the superiority of the proposed method. The code is available at +https://github.com/JerryOctopus/NLOS-LTM. + +
+
+
+
+
+ + ♻ ☆ ViT-Lens: Towards Omni-modal Representations CVPR2024 + + +
+ Aiming to advance AI agents, large foundation models significantly improve +reasoning and instruction execution, yet the current focus on vision and +language neglects the potential of perceiving diverse modalities in open-world +environments. However, the success of data-driven vision and language models is +costly or even infeasible to be reproduced for rare modalities. In this paper, +we present ViT-Lens-2 that facilitates efficient omni-modal representation +learning by perceiving novel modalities with a pretrained ViT and aligning them +to a pre-defined space. Specifically, the modality-specific lens is tuned to +project any-modal signals to an intermediate embedding space, which are then +processed by a strong ViT with pre-trained visual knowledge. The encoded +representations are optimized toward aligning with the modal-independent space, +pre-defined by off-the-shelf foundation models. ViT-Lens-2 provides a unified +solution for representation learning of increasing modalities with two +appealing advantages: (i) Unlocking the great potential of pretrained ViTs to +novel modalities effectively with efficient data regime; (ii) Enabling emergent +downstream capabilities through modality alignment and shared ViT parameters. +We tailor ViT-Lens-2 to learn representations for 3D point cloud, depth, audio, +tactile and EEG, and set new state-of-the-art results across various +understanding tasks, such as zero-shot classification. By seamlessly +integrating ViT-Lens-2 into Multimodal Foundation Models, we enable +Any-modality to Text and Image Generation in a zero-shot manner. Code and +models are available at https://github.com/TencentARC/ViT-Lens. + +
+
+ comment: This work is a follow-up of arXiv:2308.10185. Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ Implicit Discriminative Knowledge Learning for Visible-Infrared Person + Re-Identification CVPR 2024 + + +
+ Visible-Infrared Person Re-identification (VI-ReID) is a challenging +cross-modal pedestrian retrieval task, due to significant intra-class +variations and cross-modal discrepancies among different cameras. Existing +works mainly focus on embedding images of different modalities into a unified +space to mine modality-shared features. They only seek distinctive information +within these shared features, while ignoring the identity-aware useful +information that is implicit in the modality-specific features. To address this +issue, we propose a novel Implicit Discriminative Knowledge Learning (IDKL) +network to uncover and leverage the implicit discriminative information +contained within the modality-specific. First, we extract modality-specific and +modality-shared features using a novel dual-stream network. Then, the +modality-specific features undergo purification to reduce their modality style +discrepancies while preserving identity-aware discriminative knowledge. +Subsequently, this kind of implicit knowledge is distilled into the +modality-shared feature to enhance its distinctiveness. Finally, an alignment +loss is proposed to minimize modality discrepancy on enhanced modality-shared +features. Extensive experiments on multiple public datasets demonstrate the +superiority of IDKL network over the state-of-the-art methods. Code is +available at https://github.com/1KK077/IDKL. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ In Search of a Data Transformation That Accelerates Neural Field + Training CVPR 2024 + + +
+ Neural field is an emerging paradigm in data representation that trains a
+neural network to approximate the given signal. A key obstacle that prevents
+its widespread adoption is the encoding speed: generating a neural field
+requires overfitting a neural network, which can take a significant number of
+SGD steps to reach the desired fidelity level. In this paper, we delve into the
+impacts of data transformations on the speed of neural field training,
+specifically focusing on how permuting pixel locations affects the convergence
+speed of SGD. Counterintuitively, we find that randomly permuting the pixel
+locations can considerably accelerate the training. To explain this phenomenon,
+we examine the neural field training through the lens of PSNR curves, loss
+landscapes, and error patterns. Our analyses suggest that the random pixel
+permutations remove the easy-to-fit patterns, which facilitate easy
+optimization in the early stage but hinder capturing fine details of the
+signal.
+
+
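+
+ The data transformation studied here is simple to reproduce: pair each pixel
+value with a randomly permuted coordinate before fitting the coordinate MLP. A
+minimal sketch, assuming an (H, W, C) image array:
+
```python
import numpy as np

def permuted_coordinate_pairs(image, seed=0):
    """Build (coordinate, value) training pairs with randomly permuted pixel order.

    A coordinate-MLP neural field is then fit to these pairs with SGD; the
    permutation only reshuffles which value is regressed at which coordinate.
    """
    h, w, _ = image.shape
    ys, xs = np.meshgrid(np.arange(h), np.arange(w), indexing="ij")
    coords = np.stack([ys.ravel() / (h - 1), xs.ravel() / (w - 1)], axis=1)  # (HW, 2) in [0, 1]
    values = image.reshape(-1, image.shape[-1])                              # (HW, C)
    rng = np.random.default_rng(seed)
    perm = rng.permutation(h * w)
    # Random permutation of pixel locations: values stay fixed, coordinates are shuffled.
    return coords[perm], values
```
+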
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ AV2AV: Direct Audio-Visual Speech to Audio-Visual Speech Translation + with Unified Audio-Visual Speech Representation CVPR 2024 + + +
+ This paper proposes a novel direct Audio-Visual Speech to Audio-Visual Speech +Translation (AV2AV) framework, where the input and output of the system are +multimodal (i.e., audio and visual speech). With the proposed AV2AV, two key +advantages can be brought: 1) We can perform real-like conversations with +individuals worldwide in a virtual meeting by utilizing our own primary +languages. In contrast to Speech-to-Speech Translation (A2A), which solely +translates between audio modalities, the proposed AV2AV directly translates +between audio-visual speech. This capability enhances the dialogue experience +by presenting synchronized lip movements along with the translated speech. 2) +We can improve the robustness of the spoken language translation system. By +employing the complementary information of audio-visual speech, the system can +effectively translate spoken language even in the presence of acoustic noise, +showcasing robust performance. To mitigate the problem of the absence of a +parallel AV2AV translation dataset, we propose to train our spoken language +translation system with the audio-only dataset of A2A. This is done by learning +unified audio-visual speech representations through self-supervised learning in +advance to train the translation system. Moreover, we propose an AV-Renderer +that can generate raw audio and video in parallel. It is designed with +zero-shot speaker modeling, thus the speaker in source audio-visual speech can +be maintained at the target translated audio-visual speech. The effectiveness +of AV2AV is evaluated with extensive experiments in a many-to-many language +translation setting. Demo page is available on +https://choijeongsoo.github.io/av2av. + +
+
+ comment: CVPR 2024. Code & Demo: https://choijeongsoo.github.io/av2av +
+
+
+
+
+ + ♻ ☆ SINC: Spatial Composition of 3D Human Motions for Simultaneous Action + Generation + + +
+ Our goal is to synthesize 3D human motions given textual inputs describing
+simultaneous actions, for example 'waving hand' while 'walking' at the same
+time. We refer to generating such simultaneous movements as performing 'spatial
+compositions'. In contrast to temporal compositions that seek to transition
+from one action to another, spatial compositing requires understanding which
+body parts are involved in which action, to be able to move them
+simultaneously. Motivated by the observation that the correspondence between
+actions and body parts is encoded in powerful language models, we extract this
+knowledge by prompting GPT-3 with text such as "what are the body parts
+involved in the action ?", while also providing the parts list and
+few-shot examples. Given this action-part mapping, we combine body parts from
+two motions together and establish the first automated method to spatially
+compose two actions. However, training data with compositional actions is
+always limited by the combinatorics. Hence, we further create synthetic data
+with this approach, and use it to train a new state-of-the-art text-to-motion
+generation model, called SINC ("SImultaneous actioN Compositions for 3D human
+motions"). In our experiments, we find that training with such GPT-guided
+synthetic data improves spatial composition generation over baselines. Our code
+is publicly available at https://sinc.is.tue.mpg.de/.
+
+
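+
+ Given an action-to-body-part mapping, two motions can be composed by
+overwriting the joints owned by one action; the sketch below uses hypothetical
+joint indices and action mappings, not the prompts or model from the paper:
+
```python
import numpy as np

# Hypothetical joint grouping and action-to-part mapping (in practice such a
# mapping would be elicited from a language model given the parts list).
BODY_PARTS = {
    "left arm": [16, 18, 20], "right arm": [17, 19, 21],
    "legs": [1, 2, 4, 5, 7, 8, 10, 11], "torso": [0, 3, 6, 9, 12, 13, 14, 15],
}
ACTION_PARTS = {"waving hand": ["right arm"], "walking": ["legs", "torso", "left arm"]}

def spatially_compose(motion_a, motion_b, action_a):
    """Combine two (T, J, 3) joint sequences by taking action A's body parts (sketch)."""
    out = motion_b.copy()                      # start from the 'base' action
    for part in ACTION_PARTS[action_a]:        # overwrite joints owned by action A
        out[:, BODY_PARTS[part], :] = motion_a[:, BODY_PARTS[part], :]
    return out

# e.g. compose 'waving hand' while 'walking' from two (T, 22, 3) joint sequences:
# composed = spatially_compose(wave_motion, walk_motion, "waving hand")
```
+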
+ comment: Teaser Fixed +
+
+
+
+
+ + ♻ ☆ Powerful Lossy Compression for Noisy Images ICME 2024 + + +
+ Image compression and denoising represent fundamental challenges in image
+processing with many real-world applications. To address practical demands,
+current solutions can be categorized into two main strategies: 1) sequential
+method; and 2) joint method. However, sequential methods have the disadvantage
+of error accumulation as there is information loss between multiple individual
+models. Recently, the academic community began to make some attempts to tackle
+this problem through end-to-end joint methods. Most of them ignore that
+different regions of noisy images have different characteristics. To solve
+these problems, in this paper, our proposed signal-to-noise ratio (SNR) aware
+joint solution exploits local and non-local features for image compression and
+denoising simultaneously. We design an end-to-end trainable network, which
+includes the main encoder branch, the guidance branch, and the signal-to-noise
+ratio (SNR) aware branch. We conducted extensive experiments on both synthetic
+and real-world datasets, demonstrating that our joint solution outperforms
+existing state-of-the-art methods.
+
+
+ comment: Accepted by ICME 2024 +
+
+
+
+
+ + ♻ ☆ ViT-Lens: Initiating Omni-Modal Exploration through 3D Insights + + +
+ Despite the success of CLIP-based training recipes for vision-language
+models, their scalability to more modalities (e.g., 3D, audio, etc.) is limited
+by the need for large-scale data, which is expensive or even infeasible to
+obtain for rare modalities. In this paper, we present ViT-Lens that facilitates
+efficient omni-modal representation learning by perceiving novel modalities
+with a pretrained ViT and aligning them to a pre-defined space. Specifically,
+the modality-specific lens is tuned to project multimodal signals to the shared
+embedding space, which are then processed by a strong ViT that carries
+pre-trained image knowledge. The encoded multimodal representations are
+optimized toward aligning with the modal-independent space, pre-defined by
+off-the-shelf foundation models. A well-trained lens with a ViT backbone has
+the potential to serve as one of these foundation models, supervising the
+learning of subsequent modalities. ViT-Lens provides a unified solution for
+representation learning of increasing modalities with two appealing benefits:
+(i) Exploiting the pretrained ViT across tasks and domains effectively with an
+efficient data regime; (ii) Emergent downstream capabilities of novel
+modalities are demonstrated due to the modality alignment space. We evaluate
+ViT-Lens in the context of 3D as an initial verification. In zero-shot 3D
+classification, ViT-Lens achieves substantial improvements over previous
+state-of-the-art, showing 52.0% accuracy on Objaverse-LVIS, 87.4% on
+ModelNet40, and 60.6% on ScanObjectNN. Furthermore, we enable zero-shot 3D
+question-answering by simply integrating the trained 3D lens into the
+InstructBLIP model without any adaptation. We will release the results of
+ViT-Lens on more modalities in the near future.
+
+
+ comment: 19 pages, 4 figures and 9 tables +
+
+
+
+
+ + ♻ ☆ TP2O: Creative Text Pair-to-Object Generation using Balance + Swap-Sampling + + +
+ Generating creative combinatorial objects from two seemingly unrelated object
+texts is a challenging task in text-to-image synthesis, often hindered by a
+focus on emulating existing data distributions. In this paper, we develop a
+straightforward yet highly effective method, called balance swap-sampling.
+First, we propose a swapping mechanism that generates a novel combinatorial
+object image set by randomly exchanging intrinsic elements of two text
+embeddings through a cutting-edge diffusion model. Second, we introduce a
+balance swapping region to efficiently sample a small subset from the newly
+generated image set by balancing CLIP distances between the new images and
+their original generations, increasing the likelihood of accepting the
+high-quality combinations. Last, we employ a segmentation method to compare
+CLIP distances among the segmented components, ultimately selecting the most
+promising object from the sampled subset. Extensive experiments demonstrate
+that our approach outperforms recent SOTA T2I methods. Surprisingly, our
+results even rival those of human artists, such as frog-broccoli.
+
+
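+
+ A minimal sketch of the two sampling steps described above, assuming
+precomputed CLIP features and treating the swap as an element-wise exchange
+between two text embeddings (illustrative only, not the authors' pipeline):
+
```python
import torch

def swap_embeddings(emb_a, emb_b, swap_ratio=0.5, generator=None):
    """Randomly exchange elements between two text embeddings (sketch)."""
    mask = torch.rand(emb_a.shape, generator=generator) < swap_ratio
    return torch.where(mask, emb_b, emb_a)

def balanced_subset(cand_feats, feat_a, feat_b, tolerance=0.1):
    """Keep candidates whose CLIP distances to the two source concepts are balanced.

    cand_feats:     (N, D) CLIP image features of generated candidates
    feat_a, feat_b: (D,) CLIP features of images generated from the original texts
    """
    d_a = 1 - torch.nn.functional.cosine_similarity(cand_feats, feat_a[None], dim=-1)
    d_b = 1 - torch.nn.functional.cosine_similarity(cand_feats, feat_b[None], dim=-1)
    return (d_a - d_b).abs() < tolerance   # boolean mask over candidates
```
+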
+ comment: Project page: https://tp2o.github.io/anon/ +
+
+
+
+
+ + ♻ ☆ Segment and Caption Anything CVPR 24 + + +
+ We propose a method to efficiently equip the Segment Anything Model (SAM)
+with the ability to generate regional captions. SAM presents strong
+generalizability for segmenting anything but falls short on semantic
+understanding. By introducing a lightweight query-based feature mixer, we align
+the region-specific features with the embedding space of language models for
+later caption generation. As the number of trainable parameters is small
+(typically on the order of tens of millions), it costs less computation, less
+memory usage, and less communication bandwidth, resulting in both fast and
+scalable training. To address the scarcity problem of regional caption data, we
+propose to first pre-train our model on object detection and segmentation
+tasks. We call this step weak supervision pretraining since the pre-training
+data only contains category names instead of full-sentence descriptions. The
+weak supervision pretraining allows us to leverage many publicly available
+object detection and segmentation datasets. We conduct extensive experiments to
+demonstrate the superiority of our method and validate each design choice. This
+work serves as a stepping stone towards scaling up regional captioning data and
+sheds light on exploring efficient ways to augment SAM with regional semantics.
+The project page, along with the associated code, can be accessed via
+https://xk-huang.github.io/segment-caption-anything/.
+
+
+ comment: The project page, along with the associated code, can be accessed via + https://xk-huang.github.io/segment-caption-anything/; Update author + information; Accepted by CVPR 24 +
+
+
+
+
+ + ♻ ☆ TagAlign: Improving Vision-Language Alignment with Multi-Tag + Classification + + +
+ The crux of learning vision-language models is to extract semantically
+aligned information from visual and linguistic data. Existing attempts usually
+face the problem of coarse alignment, e.g., the vision encoder struggles in
+localizing an attribute-specified object. In this work, we propose an
+embarrassingly simple approach to better align image and text features with no
+need of additional data formats other than image-text pairs. Concretely, given
+an image and its paired text, we manage to parse objects (e.g., cat) and
+attributes (e.g., black) from the description, which are highly likely to exist
+in the image. It is noteworthy that the parsing pipeline is fully automatic and
+thus enjoys good scalability. With these parsed semantics as supervision
+signals, we can complement the commonly used image-text contrastive loss with
+the multi-tag classification loss. Extensive experimental results on a broad
+suite of semantic segmentation datasets substantiate the average 5.2%
+improvement of our framework over existing alternatives. Furthermore, the
+visualization results indicate that attribute supervision makes vision-language
+models accurately localize attribute-specified objects. Project page can be
+found at https://qinying-liu.github.io/Tag-Align.
+
+
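+
+ The objective described above pairs the usual image-text contrastive loss
+with a multi-label (multi-tag) classification loss over parsed objects and
+attributes; a generic sketch with hypothetical tensor names:
+
```python
import torch
import torch.nn.functional as F

def tagalign_losses(img_emb, txt_emb, tag_logits, tag_targets, temperature=0.07):
    """Image-text contrastive loss plus multi-tag classification loss (sketch).

    img_emb, txt_emb: (B, D) L2-normalized embeddings of paired images and texts
    tag_logits:       (B, K) predicted scores over a vocabulary of parsed tags
    tag_targets:      (B, K) multi-hot labels for objects/attributes parsed from captions
    """
    logits = img_emb @ txt_emb.t() / temperature
    labels = torch.arange(img_emb.size(0), device=img_emb.device)
    contrastive = 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.t(), labels))
    multi_tag = F.binary_cross_entropy_with_logits(tag_logits, tag_targets.float())
    return contrastive, multi_tag
```
+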
+
+
+
+ + ♻ ☆ SGS-SLAM: Semantic Gaussian Splatting For Neural Dense SLAM + + +
+ We present SGS-SLAM, the first semantic visual SLAM system based on Gaussian +Splatting. It incorporates appearance, geometry, and semantic features through +multi-channel optimization, addressing the oversmoothing limitations of neural +implicit SLAM systems in high-quality rendering, scene understanding, and +object-level geometry. We introduce a unique semantic feature loss that +effectively compensates for the shortcomings of traditional depth and color +losses in object optimization. Through a semantic-guided keyframe selection +strategy, we prevent erroneous reconstructions caused by cumulative errors. +Extensive experiments demonstrate that SGS-SLAM delivers state-of-the-art +performance in camera pose estimation, map reconstruction, precise semantic +segmentation, and object-level geometric accuracy, while ensuring real-time +rendering capabilities. + +
+
+
+
+
+ + ♻ ☆ ArtAdapter: Text-to-Image Style Transfer using Multi-Level Style Encoder + and Explicit Adaptation + + +
+ This work introduces ArtAdapter, a transformative text-to-image (T2I) style +transfer framework that transcends traditional limitations of color, +brushstrokes, and object shape, capturing high-level style elements such as +composition and distinctive artistic expression. The integration of a +multi-level style encoder with our proposed explicit adaptation mechanism +enables ArtAdapter to achieve unprecedented fidelity in style transfer, +ensuring close alignment with textual descriptions. Additionally, the +incorporation of an Auxiliary Content Adapter (ACA) effectively separates +content from style, alleviating the borrowing of content from style references. +Moreover, our novel fast finetuning approach could further enhance zero-shot +style representation while mitigating the risk of overfitting. Comprehensive +evaluations confirm that ArtAdapter surpasses current state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ Clean-image Backdoor Attacks + + +
+ To gather a significant quantity of annotated training data for
+high-performance image classification models, numerous companies opt to enlist
+third-party providers to label their unlabeled data. This practice is widely
+regarded as secure, even in cases where some annotation errors occur, as the
+impact of these minor inaccuracies on the final performance of the models is
+negligible, and existing backdoor attacks require the attacker to be able to
+poison the training images. Nevertheless, in this paper, we propose clean-image
+backdoor attacks, which uncover that backdoors can still be injected via a
+fraction of incorrect labels without modifying the training images.
+Specifically, in our attacks, the attacker first seeks a trigger feature to
+divide the training images into two parts: those with the feature and those
+without it. Subsequently, the attacker falsifies the labels of the former part
+to a backdoor class. The backdoor will finally be implanted into the target
+model after it is trained on the poisoned data. During the inference phase, the
+attacker can activate the backdoor in two ways: slightly modifying the input
+image to obtain the trigger feature, or taking an image that naturally has the
+trigger feature as input. We conduct extensive experiments to demonstrate the
+effectiveness and practicality of our attacks. According to the experimental
+results, we conclude that our attacks seriously jeopardize the fairness and
+robustness of image classification models, and it is necessary to be vigilant
+about incorrect labels in outsourced labeling.
+
+
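+
+ The poisoning step itself is plain label flipping on images that already
+contain the chosen trigger feature; a schematic sketch with hypothetical
+inputs, included only to make the threat model concrete:
+
```python
import numpy as np

def poison_labels(labels, has_trigger_feature, backdoor_class, poison_rate=0.1, seed=0):
    """Flip a fraction of labels on images that naturally contain the trigger feature.

    labels:              (N,) original class labels
    has_trigger_feature: (N,) boolean mask over the training set
    Only labels are modified; the training images themselves are left untouched.
    """
    rng = np.random.default_rng(seed)
    poisoned = labels.copy()
    candidates = np.flatnonzero(has_trigger_feature)
    n_poison = int(poison_rate * len(labels))
    chosen = rng.choice(candidates, size=min(n_poison, len(candidates)), replace=False)
    poisoned[chosen] = backdoor_class
    return poisoned
```
+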
+
+
+
+ + ♻ ☆ Transferring Relative Monocular Depth to Surgical Vision with Temporal + Consistency + + +
+ Relative monocular depth, inferring depth up to shift and scale from a single +image, is an active research topic. Recent deep learning models, trained on +large and varied meta-datasets, now provide excellent performance in the domain +of natural images. However, few datasets exist which provide ground truth depth +for endoscopic images, making training such models from scratch unfeasible. +This work investigates the transfer of these models into the surgical domain, +and presents an effective and simple way to improve on standard supervision +through the use of temporal consistency self-supervision. We show temporal +consistency significantly improves supervised training alone when transferring +to the low-data regime of endoscopy, and outperforms the prevalent +self-supervision technique for this task. In addition we show our method +drastically outperforms the state-of-the-art method from within the domain of +endoscopy. We also release our code, model and ensembled meta-dataset, +Meta-MED, establishing a strong benchmark for future work. + +
+
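+
+ Temporal-consistency self-supervision for relative depth can be sketched as
+aligning the prediction of one frame to its neighbor up to scale and shift and
+penalizing the residual; the snippet below ignores inter-frame motion and
+warping for brevity and is not the authors' implementation:
+
```python
import torch

def align_shift_scale(src, ref):
    """Least-squares scale/shift aligning one relative depth map to another."""
    a = torch.stack([src.flatten(), torch.ones_like(src.flatten())], dim=1)  # (N, 2)
    sol = torch.linalg.lstsq(a, ref.flatten().unsqueeze(1)).solution         # (2, 1)
    return src * sol[0, 0] + sol[1, 0]

def temporal_consistency_loss(depth_t, depth_t1):
    """Penalize disagreement between aligned predictions on neighboring frames.

    In practice corresponding pixels would first be matched (e.g. via optical
    flow); this sketch assumes a near-static viewpoint between the two frames.
    """
    aligned = align_shift_scale(depth_t1, depth_t.detach())
    return torch.nn.functional.l1_loss(aligned, depth_t.detach())
```
+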
+
+
+
+ + ♻ ☆ Towards Source-free Domain Adaptive Semantic Segmentation via + Importance-aware and Prototype-contrast Learning + + +
+ Domain adaptive semantic segmentation enables robust pixel-wise understanding +in real-world driving scenes. Source-free domain adaptation, as a more +practical technique, addresses the concerns of data privacy and storage +limitations in typical unsupervised domain adaptation methods, making it +especially relevant in the context of intelligent vehicles. It utilizes a +well-trained source model and unlabeled target data to achieve adaptation in +the target domain. However, in the absence of source data and target labels, +current solutions cannot sufficiently reduce the impact of domain shift and +fully leverage the information from the target data. In this paper, we propose +an end-to-end source-free domain adaptation semantic segmentation method via +Importance-Aware and Prototype-Contrast (IAPC) learning. The proposed IAPC +framework effectively extracts domain-invariant knowledge from the well-trained +source model and learns domain-specific knowledge from the unlabeled target +domain. Specifically, considering the problem of domain shift in the prediction +of the target domain by the source model, we put forward an importance-aware +mechanism for the biased target prediction probability distribution to extract +domain-invariant knowledge from the source model. We further introduce a +prototype-contrast strategy, which includes a prototype-symmetric cross-entropy +loss and a prototype-enhanced cross-entropy loss, to learn target intra-domain +knowledge without relying on labels. A comprehensive variety of experiments on +two domain adaptive semantic segmentation benchmarks demonstrates that the +proposed end-to-end IAPC solution outperforms existing state-of-the-art +methods. The source code is publicly available at +https://github.com/yihong-97/Source-free-IAPC. + +
+
+ comment: Accepted to IEEE Transactions on Intelligent Vehicles (T-IV). The + source code is publicly available at + https://github.com/yihong-97/Source-free-IAPC +
+
+
+
+
+ + ♻ ☆ SD4Match: Learning to Prompt Stable Diffusion Model for Semantic + Matching CVPR 2024 + + +
+ In this paper, we address the challenge of matching semantically similar +keypoints across image pairs. Existing research indicates that the intermediate +output of the UNet within the Stable Diffusion (SD) can serve as robust image +feature maps for such a matching task. We demonstrate that by employing a basic +prompt tuning technique, the inherent potential of Stable Diffusion can be +harnessed, resulting in a significant enhancement in accuracy over previous +approaches. We further introduce a novel conditional prompting module that +conditions the prompt on the local details of the input image pairs, leading to +a further improvement in performance. We designate our approach as SD4Match, +short for Stable Diffusion for Semantic Matching. Comprehensive evaluations of +SD4Match on the PF-Pascal, PF-Willow, and SPair-71k datasets show that it sets +new benchmarks in accuracy across all these datasets. Particularly, SD4Match +outperforms the previous state-of-the-art by a margin of 12 percentage points +on the challenging SPair-71k dataset. + +
+
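+
+ Once per-image feature maps have been extracted (e.g. from an intermediate
+UNet layer with a tuned prompt), semantic matching reduces to nearest-neighbor
+search in feature space; an illustrative sketch with hypothetical shapes:
+
```python
import torch
import torch.nn.functional as F

def match_keypoints(feat_a, feat_b, kps_a, image_size):
    """Transfer keypoints from image A to image B via dense feature similarity.

    feat_a, feat_b: (C, h, w) feature maps of the two images
    kps_a:          (K, 2) keypoint pixel coords (x, y) in image A
    image_size:     side length of the square input images in pixels
    """
    c, h, w = feat_a.shape
    grid = (kps_a / (image_size - 1)) * 2 - 1                    # normalize to [-1, 1]
    grid = grid.view(1, 1, -1, 2)                                # grid_sample layout
    desc_a = F.grid_sample(feat_a[None], grid, align_corners=True)[0, :, 0].t()  # (K, C)
    desc_a = F.normalize(desc_a, dim=-1)
    desc_b = F.normalize(feat_b.view(c, -1).t(), dim=-1)         # (h*w, C)
    sim = desc_a @ desc_b.t()                                    # (K, h*w)
    idx = sim.argmax(dim=-1)
    xs, ys = idx % w, idx // w
    scale = torch.tensor([(image_size - 1) / (w - 1), (image_size - 1) / (h - 1)])
    return torch.stack([xs, ys], dim=-1).float() * scale         # (K, 2) coords in image B
```
+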
+ comment: Accepted to CVPR 2024. Project website: + https://sd4match.active.vision/ +
+
+
+
+
+ + ♻ ☆ ObjectCompose: Evaluating Resilience of Vision-Based Models on + Object-to-Background Compositional Changes + + +
+ Given the large-scale multi-modal training of recent vision-based models and +their generalization capabilities, understanding the extent of their robustness +is critical for their real-world deployment. In this work, we evaluate the +resilience of current vision-based models against diverse object-to-background +context variations. The majority of robustness evaluation methods have +introduced synthetic datasets to induce changes to object characteristics +(viewpoints, scale, color) or utilized image transformation techniques +(adversarial changes, common corruptions) on real images to simulate shifts in +distributions. Recent works have explored leveraging large language models and +diffusion models to generate changes in the background. However, these methods +either lack in offering control over the changes to be made or distort the +object semantics, making them unsuitable for the task. Our method, on the other +hand, can induce diverse object-to-background changes while preserving the +original semantics and appearance of the object. To achieve this goal, we +harness the generative capabilities of text-to-image, image-to-text, and +image-to-segment models to automatically generate a broad spectrum of +object-to-background changes. We induce both natural and adversarial background +changes by either modifying the textual prompts or optimizing the latents and +textual embedding of text-to-image models. We produce various versions of +standard vision datasets (ImageNet, COCO), incorporating either diverse and +realistic backgrounds into the images or introducing color, texture, and +adversarial changes in the background. We conduct extensive experiment to +analyze the robustness of vision-based models against object-to-background +context variations across diverse tasks. Code +https://github.com/Muhammad-Huzaifaa/ObjectCompose.git + +
+
+
+
+
+ + ♻ ☆ Motion Generation from Fine-grained Textual Descriptions + + +
+ The task of text2motion is to generate human motion sequences from given +textual descriptions, where the model explores diverse mappings from natural +language instructions to human body movements. While most existing works are +confined to coarse-grained motion descriptions, e.g., "A man squats.", +fine-grained descriptions specifying movements of relevant body parts are +barely explored. Models trained with coarse-grained texts may not be able to +learn mappings from fine-grained motion-related words to motion primitives, +resulting in the failure to generate motions from unseen descriptions. In this +paper, we build a large-scale language-motion dataset specializing in +fine-grained textual descriptions, FineHumanML3D, by feeding GPT-3.5-turbo with +step-by-step instructions with pseudo-code compulsory checks. Accordingly, we +design a new text2motion model, FineMotionDiffuse, making full use of +fine-grained textual information. Our quantitative evaluation shows that +FineMotionDiffuse trained on FineHumanML3D improves FID by a large margin of +0.38, compared with competitive baselines. According to the qualitative +evaluation and case study, our model outperforms MotionDiffuse in generating +spatially or chronologically composite motions, by learning the implicit +mappings from fine-grained descriptions to the corresponding basic motions. We +release our data at https://github.com/KunhangL/finemotiondiffuse. + +
+
+
+
+
+ + ♻ ☆ Towards Low-Energy Adaptive Personalization for Resource-Constrained + Devices + + +
+ The personalization of machine learning (ML) models to address data drift is
+a significant challenge in the context of Internet of Things (IoT)
+applications. Presently, most approaches focus on fine-tuning either the full
+base model or its last few layers to adapt to new data, while often neglecting
+energy costs. However, various types of data drift exist, and fine-tuning the
+full base model or the last few layers may not result in optimal performance in
+certain scenarios. We propose Target Block Fine-Tuning (TBFT), a low-energy
+adaptive personalization framework designed for resource-constrained devices.
+We categorize data drift and personalization into three types: input-level,
+feature-level, and output-level. For each type, we fine-tune different blocks
+of the model to achieve optimal performance with reduced energy costs.
+Specifically, input-, feature-, and output-level correspond to fine-tuning the
+front, middle, and rear blocks of the model. We evaluate TBFT on a ResNet
+model, three datasets, three different training sizes, and a Raspberry Pi.
+Compared with Block Avg, where each block is fine-tuned individually and
+their performance improvements are averaged, TBFT exhibits an improvement in
+model accuracy by an average of 15.30% whilst saving 41.57% energy consumption
+on average compared with full fine-tuning.
+
+
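+
+ The core mechanism is selective unfreezing of a single block chosen by the
+drift type; a minimal sketch on a torchvision ResNet, where the mapping from
+drift types to layer names is an assumption made for illustration:
+
```python
import torch
from torchvision.models import resnet18

def prepare_target_block_finetuning(drift_type: str):
    """Freeze a ResNet except the block matching the drift type (sketch).

    drift_type: 'input' -> front block, 'feature' -> middle block,
                'output' -> rear block (hypothetical layer mapping).
    """
    model = resnet18(weights=None)
    target = {"input": "layer1", "feature": "layer2", "output": "layer4"}[drift_type]
    for name, param in model.named_parameters():
        param.requires_grad = name.startswith(target)
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(trainable, lr=1e-3, momentum=0.9)
    return model, optimizer
```
+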
+
 comment: Accepted to the 4th Workshop on Machine Learning and Systems
 + (EuroMLSys '24)
+
+
+
+
+ + ♻ ☆ FPT: Fine-grained Prompt Tuning for Parameter and Memory Efficient Fine + Tuning in High-resolution Medical Image Classification + + +
+ Parameter-efficient fine-tuning (PEFT) is proposed as a cost-effective way to +transfer pre-trained models to downstream tasks, avoiding the high cost of +updating entire large-scale pre-trained models (LPMs). In this work, we present +Fine-grained Prompt Tuning (FPT), a novel PEFT method for medical image +classification. FPT significantly reduces memory consumption compared to other +PEFT methods, especially in high-resolution contexts. To achieve this, we first +freeze the weights of the LPM and construct a learnable lightweight side +network. The frozen LPM takes high-resolution images as input to extract +fine-grained features, while the side network is fed low-resolution images to +reduce memory usage. To allow the side network to access pre-trained knowledge, +we introduce fine-grained prompts that summarize information from the LPM +through a fusion module. Important tokens selection and preloading techniques +are employed to further reduce training cost and memory requirements. We +evaluate FPT on four medical datasets with varying sizes, modalities, and +complexities. Experimental results demonstrate that FPT achieves comparable +performance to fine-tuning the entire LPM while using only 1.8% of the +learnable parameters and 13% of the memory costs of an encoder ViT-B model with +a 512 x 512 input resolution. + +
+
+
+
+
+ + ♻ ☆ SegVol: Universal and Interactive Volumetric Medical Image Segmentation + + +
+ Precise image segmentation provides clinical study with instructive +information. Despite the remarkable progress achieved in medical image +segmentation, there is still an absence of 3D foundation segmentation model +that can segment a wide range of anatomical categories with easy user +interaction. In this paper, we propose a 3D foundation segmentation model, +named SegVol, supporting universal and interactive volumetric medical image +segmentation. By scaling up training data to 90K unlabeled Computed Tomography +(CT) volumes and 6K labeled CT volumes, this foundation model supports the +segmentation of over 200 anatomical categories using semantic and spatial +prompts. Extensive experiments on 10 internal validation tasks and 18 external +validation tasks verify that SegVol outperforms the state of the art by a large +margin. Through its capacity to provide precise volumetric segmentation across +various anatomical categories, SegVol has the potential to accelerate +advancements in medical imaging diagnosis and facilitate treatment +optimization. The model and code are publicly available at: +https://github.com/BAAI-DCAI/SegVol. + +
+
+
+
+
+ + ♻ ☆ DreamComposer: Controllable 3D Object Generation via Multi-View + Conditions + + +
+ Utilizing pre-trained 2D large-scale generative models, recent works are +capable of generating high-quality novel views from a single in-the-wild image. +However, due to the lack of information from multiple views, these works +encounter difficulties in generating controllable novel views. In this paper, +we present DreamComposer, a flexible and scalable framework that can enhance +existing view-aware diffusion models by injecting multi-view conditions. +Specifically, DreamComposer first uses a view-aware 3D lifting module to obtain +3D representations of an object from multiple views. Then, it renders the +latent features of the target view from 3D representations with the multi-view +feature fusion module. Finally the target view features extracted from +multi-view inputs are injected into a pre-trained diffusion model. Experiments +show that DreamComposer is compatible with state-of-the-art diffusion models +for zero-shot novel view synthesis, further enhancing them to generate +high-fidelity novel view images with multi-view conditions, ready for +controllable 3D object reconstruction and various other applications. + +
+
+ comment: Project Page: https://yhyang-myron.github.io/DreamComposer/ +
+
+
+
+
+ + ♻ ☆ Regularizing Self-supervised 3D Scene Flows with Surface Awareness and + Cyclic Consistency + + +
+ Learning without supervision how to predict 3D scene flows from point clouds
+is essential to many perception systems. We propose a novel learning framework
+for this task which improves the necessary regularization. Relying on the
+assumption that scene elements are mostly rigid, current smoothness losses are
+built on the definition of "rigid clusters" in the input point clouds. The
+definition of these clusters is challenging and has a significant impact on the
+quality of predicted flows. We introduce two new consistency losses that
+enlarge clusters while preventing them from spreading over distinct objects. In
+particular, we enforce temporal consistency with a forward-backward cyclic
+loss and spatial consistency by considering surface orientation similarity in
+addition to spatial proximity. The proposed losses are model-independent and
+can thus be used in a plug-and-play fashion to significantly improve the
+performance of existing models, as demonstrated on two most widely used
+architectures. We also showcase the effectiveness and generalization capability
+of our framework on four standard sensor-unique driving datasets, achieving
+state-of-the-art performance in 3D scene flow estimation. Our codes are
+available on https://github.com/ctu-vras/sac-flow.
+
+
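+
+ The temporal (forward-backward) cyclic term can be sketched as follows,
+assuming a generic flow_net(src, dst) callable; the surface-aware spatial term
+and the rigid-cluster construction are omitted:
+
```python
import torch

def cyclic_consistency_loss(flow_net, pc1, pc2):
    """Forward-backward cyclic consistency for self-supervised scene flow (sketch).

    flow_net(src, dst) -> per-point flow from src towards dst, shape (N, 3).
    """
    flow_fwd = flow_net(pc1, pc2)                 # pc1 -> pc2
    warped = pc1 + flow_fwd                       # warp pc1 forward in time
    flow_bwd = flow_net(warped, pc1)              # warped points back towards pc1
    return (flow_fwd + flow_bwd).norm(dim=-1).mean()
```
+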
+
+
+
+ + ♻ ☆ P2ANet: A Dataset and Benchmark for Dense Action Detection from Table + Tennis Match Broadcasting Videos + + +
+ While deep learning has been widely used for video analytics, such as video
+classification and action detection, dense action detection with fast-moving
+subjects from sports videos is still challenging. In this work, we release yet
+another sports video benchmark, P2ANet, for Ping Pong-Action detection, which
+consists of 2,721 video clips collected from the broadcasting videos of
+professional table tennis matches in World Table Tennis Championships and
+Olympiads. We work with a crew of table tennis professionals and referees on a
+specially designed annotation toolbox to obtain fine-grained action labels (in
+14 classes) for every ping-pong action that appeared in the dataset, and
+formulate two sets of action detection problems: action localization and
+action recognition. We evaluate a number of commonly-seen action recognition
+(e.g., TSM, TSN, Video SwinTransformer, and Slowfast) and action localization
+models (e.g., BSN, BSN++, BMN, TCANet), using P2ANet for both problems, under
+various settings. These models can only achieve 48% area under the AR-AN curve
+for localization and 82% top-one accuracy for recognition, since the ping-pong
+actions are dense with fast-moving subjects while the broadcasting videos have
+only 25 FPS. The results confirm that P2ANet is still a challenging task and
+can be used as a special benchmark for dense action detection from videos.
+
+
+
+
+
+ + ♻ ☆ Unsupervised Semantic Segmentation Through Depth-Guided Feature + Correlation and Sampling CVPR 2024 + + +
+ Traditionally, training neural networks to perform semantic segmentation +required expensive human-made annotations. But more recently, advances in the +field of unsupervised learning have made significant progress on this issue and +towards closing the gap to supervised algorithms. To achieve this, semantic +knowledge is distilled by learning to correlate randomly sampled features from +images across an entire dataset. In this work, we build upon these advances by +incorporating information about the structure of the scene into the training +process through the use of depth information. We achieve this by (1) learning +depth-feature correlation by spatially correlating the feature maps with the +depth maps to induce knowledge about the structure of the scene and (2) +implementing farthest-point sampling to more effectively select relevant +features by utilizing 3D sampling techniques on depth information of the scene. +Finally, we demonstrate the effectiveness of our technical contributions +through extensive experimentation and present significant improvements in +performance across multiple benchmark datasets. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Diffusion Reflectance Map: Single-Image Stochastic Inverse Rendering of + Illumination and Reflectance CVPR 2024 + + +
+ Reflectance bounds the frequency spectrum of illumination in the object +appearance. In this paper, we introduce the first stochastic inverse rendering +method, which recovers the attenuated frequency spectrum of an illumination +jointly with the reflectance of an object of known geometry from a single +image. Our key idea is to solve this blind inverse problem in the reflectance +map, an appearance representation invariant to the underlying geometry, by +learning to reverse the image formation with a novel diffusion model which we +refer to as the Diffusion Reflectance Map Network (DRMNet). Given an observed +reflectance map converted and completed from the single input image, DRMNet +generates a reflectance map corresponding to a perfect mirror sphere while +jointly estimating the reflectance. The forward process can be understood as +gradually filtering a natural illumination with lower and lower frequency +reflectance and additive Gaussian noise. DRMNet learns to invert this process +with two subnetworks, IllNet and RefNet, which work in concert towards this +joint estimation. The network is trained on an extensive synthetic dataset and +is demonstrated to generalize to real images, showing state-of-the-art accuracy +on established datasets. + +
+
+ comment: to be published in CVPR 2024 +
+
+
+
+
+ + ♻ ☆ ProMamba: Prompt-Mamba for polyp segmentation + + +
+ Detecting polyps through colonoscopy is an important task in medical image +segmentation, which provides significant assistance and reference value for +clinical surgery. However, accurate segmentation of polyps is a challenging +task due to two main reasons. Firstly, polyps exhibit various shapes and +colors. Secondly, the boundaries between polyps and their normal surroundings +are often unclear. Additionally, significant differences between different +datasets lead to limited generalization capabilities of existing methods. To +address these issues, we propose a segmentation model based on Prompt-Mamba, +which incorporates the latest Vision-Mamba and prompt technologies. Compared to +previous models trained on the same dataset, our model not only maintains high +segmentation accuracy on the validation part of the same dataset but also +demonstrates superior accuracy on unseen datasets, exhibiting excellent +generalization capabilities. Notably, we are the first to apply the +Vision-Mamba architecture to polyp segmentation and the first to utilize prompt +technology in a polyp segmentation model. Our model efficiently accomplishes +segmentation tasks, surpassing previous state-of-the-art methods by an average +of 5% across six datasets. Furthermore, we have developed multiple versions of +our model with scaled parameter counts, achieving better performance than +previous models even with fewer parameters. Our code and trained weights will +be released soon. + +
+
+ comment: 10 pages, 2 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ SocialCircle: Learning the Angle-based Social Interaction Representation + for Pedestrian Trajectory Prediction CVPR 2024 + + +
+ Analyzing and forecasting trajectories of agents like pedestrians and cars in +complex scenes has become more and more significant in many intelligent systems +and applications. The diversity and uncertainty in socially interactive +behaviors among a rich variety of agents make this task more challenging than +other deterministic computer vision tasks. Researchers have made considerable +efforts to quantify the effects of these interactions on future trajectories +through different mathematical models and network structures, but this problem +has not been well solved. Inspired by marine animals that localize the +positions of their companions underwater through echoes, we build a new +angle-based trainable social interaction representation, named SocialCircle, for +continuously reflecting the context of social interactions at different angular +orientations relative to the target agent. We validate the effect of the +proposed SocialCircle by training it along with several newly released +trajectory prediction models, and experiments show that the SocialCircle not +only quantitatively improves the prediction performance, but also qualitatively +helps better simulate social interactions when forecasting pedestrian +trajectories in a way that is consistent with human intuitions. + +
+
+ comment: CVPR 2024 accepted +
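+
+ The angle-based idea above can be illustrated with a small sketch. Note that
+ the sector count and the inverse-distance statistic below are illustrative
+ assumptions for exposition, not the authors' actual SocialCircle design:
+ neighbours are binned by their angle relative to the target agent, and one
+ value is aggregated per angular sector.
+
+     import numpy as np
+
+     def social_circle(target_xy, neighbor_xy, n_sectors=8):
+         """Toy angle-based social representation: aggregate inverse distances
+         of neighbours falling into each angular sector around the target."""
+         rel = neighbor_xy - target_xy                     # (N, 2) offsets
+         angles = np.arctan2(rel[:, 1], rel[:, 0])         # in [-pi, pi)
+         dists = np.linalg.norm(rel, axis=1) + 1e-6
+         sector = ((angles + np.pi) / (2 * np.pi) * n_sectors).astype(int) % n_sectors
+         feat = np.zeros(n_sectors)
+         for s, d in zip(sector, dists):
+             feat[s] += 1.0 / d                            # closer neighbours weigh more
+         return feat
+
+     target = np.array([0.0, 0.0])
+     neighbors = np.array([[1.0, 0.5], [-2.0, 1.0], [0.5, -3.0]])
+     print(social_circle(target, neighbors))               # one value per angular sector
+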
+
+
+
+
+ + ♻ ☆ Emotic Masked Autoencoder with Attention Fusion for Facial Expression + Recognition + + +
+ Facial Expression Recognition (FER) is a critical task within computer vision +with diverse applications across various domains. Addressing the challenge of +limited FER datasets, which hampers the generalization capability of expression +recognition models, is imperative for enhancing performance. Our paper presents +an innovative approach integrating the MAE-Face self-supervised learning (SSL) +method and Fusion Attention mechanism for expression classification, +particularly showcased in the 6th Affective Behavior Analysis in-the-wild +(ABAW) competition. Additionally, +we propose preprocessing techniques to emphasize essential facial features, +thereby enhancing model performance on both training and validation sets, +notably demonstrated on the Aff-wild2 dataset. + +
+
+ comment: 6 pages; added references for section 1; corrected typo for email + author +
+
+
+
+
+ + ♻ ☆ Learning User Embeddings from Human Gaze for Personalised Saliency + Prediction + + +
+ Reusable embeddings of user behaviour have shown significant performance +improvements for the personalised saliency prediction task. However, prior +works require explicit user characteristics and preferences as input, which are +often difficult to obtain. We present a novel method to extract user embeddings +from pairs of natural images and corresponding saliency maps generated from a +small amount of user-specific eye tracking data. At the core of our method is a +Siamese convolutional neural encoder that learns the user embeddings by +contrasting the image and personal saliency map pairs of different users. +Evaluations on two public saliency datasets show that the generated embeddings +have high discriminative power, are effective at refining universal saliency +maps to the individual users, and generalise well across users and images. +Finally, based on our model's ability to encode individual user +characteristics, our work points towards other applications that can benefit +from reusable embeddings of gaze behaviour. + +
+
+
+
+
+ + ♻ ☆ VRP-SAM: SAM with Visual Reference Prompt CVPR 2024 + + +
+ In this paper, we propose a novel Visual Reference Prompt (VRP) encoder that +empowers the Segment Anything Model (SAM) to utilize annotated reference images +as prompts for segmentation, creating the VRP-SAM model. In essence, VRP-SAM +can utilize annotated reference images to comprehend specific objects and +perform segmentation of specific objects in the target image. Note that the +VRP encoder can support a variety of annotation formats for reference images, +including \textbf{point}, \textbf{box}, \textbf{scribble}, and \textbf{mask}. +VRP-SAM achieves a breakthrough within the SAM framework by extending its +versatility and applicability while preserving SAM's inherent strengths, thus +enhancing user-friendliness. To enhance the generalization ability of VRP-SAM, +the VRP encoder adopts a meta-learning strategy. To validate the effectiveness +of VRP-SAM, we conducted extensive empirical studies on the Pascal and COCO +datasets. Remarkably, VRP-SAM achieved state-of-the-art performance in visual +reference segmentation with minimal learnable parameters. Furthermore, VRP-SAM +demonstrates strong generalization capabilities, allowing it to perform +segmentation of unseen objects and enabling cross-domain segmentation. The +source code and models will be available at +\url{https://github.com/syp2ysy/VRP-SAM} + +
+
+ comment: Accepted by CVPR 2024; The camera-ready version +
+
+
+
+
+ + ♻ ☆ SeFFeC: Semantic Facial Feature Control for Fine-grained Face Editing + + +
+ We propose Semantic Facial Feature Control (SeFFeC) - a novel method for +fine-grained face shape editing. Our method enables the manipulation of +human-understandable, semantic face features, such as nose length or mouth +width, which are defined by different groups of facial landmarks. In contrast +to existing methods, the use of facial landmarks enables precise measurement of +the facial features, which then enables training SeFFeC without any manually +annotated labels. SeFFeC consists of a transformer-based encoder network that +takes a latent vector of a pre-trained generative model and a facial feature +embedding as input, and learns to modify the latent vector to perform the +desired face edit operation. To ensure that the desired feature measurement is +changed towards the target value without altering uncorrelated features, we +introduced a novel semantic face feature loss. Qualitative and quantitative +results show that SeFFeC enables precise and fine-grained control of 23 facial +features, some of which could not previously be controlled by other methods, +without requiring manual annotations. Unlike existing methods, SeFFeC also +provides deterministic control over the exact values of the facial features and +more localised and disentangled face edits. + +
+
+
+
+
+ + ♻ ☆ Dual Prototype Attention for Unsupervised Video Object Segmentation CVPR 2024 + + +
+ Unsupervised video object segmentation (VOS) aims to detect and segment the +most salient object in videos. The primary techniques used in unsupervised VOS +are 1) the collaboration of appearance and motion information; and 2) temporal +fusion between different frames. This paper proposes two novel prototype-based +attention mechanisms, inter-modality attention (IMA) and inter-frame attention +(IFA), to incorporate these techniques via dense propagation across different +modalities and frames. IMA densely integrates context information from +different modalities based on a mutual refinement. IFA injects global context +of a video into the query frame, enabling a full utilization of useful properties +from multiple frames. Experimental results on public benchmark datasets +demonstrate that our proposed approach outperforms all existing methods by a +substantial margin. The two proposed components are also thoroughly validated +via an ablation study. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ From Pretext to Purpose: Batch-Adaptive Self-Supervised Learning + + +
+ In recent years, self-supervised contrastive learning has emerged as a +distinguished paradigm in the artificial intelligence landscape. It facilitates +unsupervised feature learning through contrastive delineations at the instance +level. However, crafting an effective self-supervised paradigm remains a +pivotal challenge within this field. This paper delves into two crucial factors +impacting self-supervised contrastive learning -- batch size and pretext tasks -- and, +from a data processing standpoint, proposes an adaptive technique of batch +fusion. The proposed method, via dimensionality reduction and reconstruction of +batch data, enables formerly isolated individual data to partake in intra-batch +communication through the Embedding Layer. Moreover, it adaptively amplifies +the self-supervised feature encoding capability as the training progresses. We +conducted a linear classification test of this method based on the classic +contrastive learning framework on ImageNet-1k. The empirical findings +illustrate that our approach achieves state-of-the-art performance under +equitable comparisons. Benefiting from its "plug-and-play" characteristics, we +further explored other contrastive learning methods. On ImageNet-100, +compared to the original performance, top-1 accuracy has seen a maximum increase of +1.25%. We suggest that the proposed method may contribute to the advancement of +data-driven self-supervised learning research, bringing a fresh perspective to +this community. + +
+
+ comment: 14 pages, 2 figures, the code of this paper will be released soon +
+
+
+
+
+ + ♻ ☆ LLaFS: When Large Language Models Meet Few-Shot Segmentation CVPR2024 + + +
+ This paper proposes LLaFS, the first attempt to leverage large language +models (LLMs) in few-shot segmentation. In contrast to the conventional +few-shot segmentation methods that only rely on the limited and biased +information from the annotated support images, LLaFS leverages the vast prior +knowledge gained by LLM as an effective supplement and directly uses the LLM to +segment images in a few-shot manner. To enable the text-based LLM to handle +image-related tasks, we carefully design an input instruction that allows the +LLM to produce segmentation results represented as polygons, and propose a +region-attribute table to simulate the human visual mechanism and provide +multi-modal guidance. We also synthesize pseudo samples and use curriculum +learning for pretraining to augment data and achieve better optimization. LLaFS +achieves state-of-the-art results on multiple datasets, showing the potential +of using LLMs for few-shot computer vision tasks. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ EcoSense: Energy-Efficient Intelligent Sensing for In-Shore Ship + Detection through Edge-Cloud Collaboration + + +
+ Detecting marine objects inshore presents challenges owing to algorithmic +intricacies and complexities in system deployment. We propose a +difficulty-aware edge-cloud collaborative sensing system that splits the task +into object localization and fine-grained classification. Objects are +classified either at the edge or within the cloud, based on their estimated +difficulty. The framework comprises a low-power device-tailored front-end model +for object localization, classification, and difficulty estimation, along with +a transformer-graph convolutional network-based back-end model for fine-grained +classification. Our system demonstrates superior performance (mAP@0.5 +4.3%) +on widely used marine object detection datasets, significantly reducing both +data transmission volume (by 95.43%) and energy consumption (by 72.7%) at the +system level. We validate the proposed system across various embedded system +platforms and in real-world scenarios involving drone deployment. + +
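+
+ The difficulty-aware routing described above reduces, at its core, to a simple
+ dispatch rule. The sketch below uses stand-in models and an arbitrary
+ threshold, purely to illustrate the edge-cloud split, not the actual EcoSense
+ components:
+
+     def classify(crop, edge_model, cloud_model, estimate_difficulty, threshold=0.5):
+         """Route each localized object to the edge classifier or, if estimated
+         to be hard, to the fine-grained classifier in the cloud."""
+         difficulty = estimate_difficulty(crop)   # scalar in [0, 1] from the front-end
+         if difficulty < threshold:
+             return edge_model(crop)              # cheap: stays on the device
+         return cloud_model(crop)                 # costly: transmit and classify in the cloud
+
+     # toy usage with stand-in callables
+     print(classify("crop", edge_model=lambda x: "boat",
+                    cloud_model=lambda x: "fishing vessel",
+                    estimate_difficulty=lambda x: 0.8))
+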
+
+
+
+
+ + ♻ ☆ Vision Transformers with Hierarchical Attention + + +
+ This paper tackles the high computational/space complexity associated with +Multi-Head Self-Attention (MHSA) in vanilla vision transformers. To this end, +we propose Hierarchical MHSA (H-MHSA), a novel approach that computes +self-attention in a hierarchical fashion. Specifically, we first divide the +input image into patches as commonly done, and each patch is viewed as a token. +Then, the proposed H-MHSA learns token relationships within local patches, +serving as local relationship modeling. Then, the small patches are merged into +larger ones, and H-MHSA models the global dependencies for the small number of +the merged tokens. At last, the local and global attentive features are +aggregated to obtain features with powerful representation capacity. Since we +only calculate attention for a limited number of tokens at each step, the +computational load is reduced dramatically. Hence, H-MHSA can efficiently model +global relationships among tokens without sacrificing fine-grained information. +With the H-MHSA module incorporated, we build a family of +Hierarchical-Attention-based Transformer Networks, namely HAT-Net. To +demonstrate the superiority of HAT-Net in scene understanding, we conduct +extensive experiments on fundamental vision tasks, including image +classification, semantic segmentation, object detection, and instance +segmentation. Therefore, HAT-Net provides a new perspective for vision +transformers. Code and pretrained models are available at +https://github.com/yun-liu/HAT-Net. + +
+
+ comment: Machine Intelligence Research (MIR), DOI: 10.1007/s11633-024-1393-8 +
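+
+ A stripped-down sketch of the hierarchical attention computation described
+ above (single head, no learned projections, toy group size; the real H-MHSA
+ module is more involved): attention is first computed inside small token
+ groups, the groups are then merged and attended globally, and both results
+ are aggregated.
+
+     import torch
+     import torch.nn.functional as F
+
+     def attention(x):
+         # plain single-head self-attention without learned projections
+         scores = x @ x.transpose(-2, -1) / x.shape[-1] ** 0.5
+         return F.softmax(scores, dim=-1) @ x
+
+     def hierarchical_attention(tokens, group=4):
+         n, c = tokens.shape                       # n must be divisible by `group`
+         # 1) local attention inside small groups of tokens
+         local = attention(tokens.view(n // group, group, c)).view(n, c)
+         # 2) merge each group into one token and attend globally over merged tokens
+         merged = local.view(n // group, group, c).mean(dim=1)
+         global_out = attention(merged)
+         # 3) aggregate local and global attentive features
+         return local + global_out.repeat_interleave(group, dim=0)
+
+     print(hierarchical_attention(torch.randn(16, 32)).shape)  # torch.Size([16, 32])
+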
+
+
+
+
+ + ♻ ☆ Domain-Aware Fine-Tuning: Enhancing Neural Network Adaptability + + +
+ Fine-tuning pre-trained neural network models has become a widely adopted +approach across various domains. However, it can lead to the distortion of +pre-trained feature extractors that already possess strong generalization +capabilities. Mitigating feature distortion during adaptation to new target +domains is crucial. Recent studies have shown promising results in handling +feature distortion by aligning the head layer on in-distribution datasets +before performing fine-tuning. Nonetheless, a significant limitation arises +from the treatment of batch normalization layers during fine-tuning, leading to +suboptimal performance. In this paper, we propose Domain-Aware Fine-Tuning +(DAFT), a novel approach that incorporates batch normalization conversion and +the integration of linear probing and fine-tuning. Our batch normalization +conversion method effectively mitigates feature distortion by reducing +modifications to the neural network during fine-tuning. Additionally, we +introduce the integration of linear probing and fine-tuning to optimize the +head layer with gradual adaptation of the feature extractor. By leveraging +batch normalization layers and integrating linear probing and fine-tuning, our +DAFT significantly mitigates feature distortion and achieves improved model +performance on both in-distribution and out-of-distribution datasets. Extensive +experiments demonstrate that our method outperforms other baseline methods, +demonstrating its effectiveness in not only improving performance but also +mitigating feature distortion. + +
+
+
+
+
+ + ♻ ☆ NeuS-PIR: Learning Relightable Neural Surface using Pre-Integrated + Rendering + + +
+ This paper presents a method, namely NeuS-PIR, for recovering relightable +neural surfaces using pre-integrated rendering from multi-view images or video. +Unlike methods based on NeRF and discrete meshes, our method utilizes implicit +neural surface representation to reconstruct high-quality geometry, which +facilitates the factorization of the radiance field into two components: a +spatially varying material field and an all-frequency lighting representation. +This factorization, jointly optimized using an adapted differentiable +pre-integrated rendering framework with material encoding regularization, in +turn addresses the ambiguity of geometry reconstruction and leads to better +disentanglement and refinement of each scene property. Additionally, we +introduced a method to distil indirect illumination fields from the learned +representations, further recovering the complex illumination effect like +inter-reflection. Consequently, our method enables advanced applications such +as relighting, which can be seamlessly integrated with modern graphics engines. +Qualitative and quantitative experiments have shown that NeuS-PIR outperforms +existing methods across various tasks on both synthetic and real datasets. +Source code is available at https://github.com/Sheldonmao/NeuSPIR + +
+
+
+
+
+ + ♻ ☆ Gaze-guided Hand-Object Interaction Synthesis: Benchmark and Method + + +
+ Gaze plays a crucial role in revealing human attention and intention, +shedding light on the cognitive processes behind human actions. The integration +of gaze guidance with the dynamics of hand-object interactions boosts the +accuracy of human motion prediction. However, the lack of datasets that capture +the intricate relationship and consistency among gaze, hand, and object +movements remains a substantial hurdle. In this paper, we introduce the first +Gaze-guided Hand-Object Interaction dataset, GazeHOI, and present a novel task +for synthesizing gaze-guided hand-object interactions. Our dataset, GazeHOI, +features simultaneous 3D modeling of gaze, hand, and object interactions, +comprising 479 sequences with an average duration of 19.1 seconds, 812 +sub-sequences, and 33 objects of various sizes. We propose a hierarchical +framework centered on a gaze-guided hand-object interaction diffusion model, +named GHO-Diffusion. In the pre-diffusion phase, we separate gaze conditions +into spatial-temporal features and goal pose conditions at different levels of +information granularity. During the diffusion phase, two gaze-conditioned +diffusion models are stacked to simplify the complex synthesis of hand-object +motions. Here, the object motion diffusion model generates sequences of object +motions based on gaze conditions, while the hand motion diffusion model +produces hand motions based on the generated object motion. To improve +fine-grained goal pose alignment, we introduce a Spherical Gaussian constraint +to guide the denoising step. In the subsequent post-diffusion phase, we +optimize the generated hand motions using contact consistency. Our extensive +experiments highlight the uniqueness of our dataset and the effectiveness of +our approach. + +
+
+
+
+
+ + ♻ ☆ Learning-based Axial Video Motion Magnification + + +
+ Video motion magnification amplifies invisible small motions to be +perceptible, which provides humans with a spatially dense and holistic +understanding of small motions in the scene of interest. This is based on the +premise that magnifying small motions enhances the legibility of motions. In +the real world, however, vibrating objects often possess convoluted systems +that have complex natural frequencies, modes, and directions. Existing motion +magnification often fails to improve legibility since the intricate motions +still retain complex characteristics even after being magnified, which may +distract us from analyzing them. In this work, we focus on improving legibility +by proposing a new concept, axial motion magnification, which magnifies +decomposed motions along the user-specified direction. Axial motion +magnification can be applied to various applications where motions of specific +axes are critical, by providing simplified and easily readable motion +information. To achieve this, we propose a novel Motion Separation Module that +enables disentangling and magnifying the motion representation along axes of +interest. Furthermore, we build a new synthetic training dataset for the axial +motion magnification task. Our proposed method improves the legibility of +resulting motions along certain axes by adding a new feature: user +controllability. Axial motion magnification is a more generalized concept; +thus, our method can be directly adapted to the generic motion magnification +and achieves favorable performance against competing methods. + +
+
+ comment: main paper: 12 pages, supplementary: 10 pages, 20 figures, 1 table +
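+
+ The core idea of magnifying motion only along a chosen axis fits in a few
+ lines. The sketch below operates on an explicit 2D displacement field for
+ clarity; the learning-based method works on latent motion representations
+ rather than raw displacements:
+
+     import numpy as np
+
+     def axial_magnify(displacement, axis, alpha=10.0):
+         """Amplify only the component of per-pixel displacements along `axis`."""
+         axis = np.asarray(axis, dtype=float)
+         axis = axis / np.linalg.norm(axis)
+         along = (displacement @ axis)[..., None] * axis   # projection onto the axis
+         across = displacement - along                     # orthogonal part, left untouched
+         return across + alpha * along
+
+     disp = np.array([[0.2, 0.1], [-0.05, 0.3]])           # per-pixel (dx, dy)
+     print(axial_magnify(disp, axis=[1.0, 0.0], alpha=10.0))
+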
+
+
+
+
+ + ♻ ☆ Decomposing Disease Descriptions for Enhanced Pathology Detection: A + Multi-Aspect Vision-Language Pre-training Framework CVPR2024 + + +
+ Medical vision language pre-training (VLP) has emerged as a frontier of +research, enabling zero-shot pathological recognition by comparing the query +image with the textual descriptions for each disease. Due to the complex +semantics of biomedical texts, current methods struggle to align medical images +with key pathological findings in unstructured reports. This leads to the +misalignment with the target disease's textual representation. In this paper, +we introduce a novel VLP framework designed to dissect disease descriptions +into their fundamental aspects, leveraging prior knowledge about the visual +manifestations of pathologies. This is achieved by consulting a large language +model and medical experts. Integrating a Transformer module, our approach +aligns an input image with the diverse elements of a disease, generating +aspect-centric image representations. By consolidating the matches from each +aspect, we improve the compatibility between an image and its associated +disease. Additionally, capitalizing on the aspect-oriented representations, we +present a dual-head Transformer tailored to process known and unknown diseases, +optimizing the comprehensive detection efficacy. Conducting experiments on +seven downstream datasets, ours improves the accuracy of recent methods by up +to 8.56% and 17.0% for seen and unseen categories, respectively. Our code is +released at https://github.com/HieuPhan33/MAVL. + +
+
+ comment: Accepted at CVPR2024. Pre-print before final camera-ready version +
+
+
+
+
+ + ♻ ☆ Lodge: A Coarse to Fine Diffusion Network for Long Dance Generation + Guided by the Characteristic Dance Primitives CVPR2024 + + +
+ We propose Lodge, a network capable of generating extremely long dance +sequences conditioned on given music. We design Lodge as a two-stage coarse to +fine diffusion architecture, and propose the characteristic dance primitives +that possess significant expressiveness as intermediate representations between +two diffusion models. The first stage is global diffusion, which focuses on +comprehending the coarse-level music-dance correlation and producing +characteristic dance primitives. In contrast, the second stage is local +diffusion, which generates detailed motion sequences in parallel under the +guidance of the dance primitives and choreographic rules. In addition, we +propose a Foot Refine Block to optimize the contact between the feet and the +ground, enhancing the physical realism of the motion. Our approach can +generate extremely long dance sequences in parallel, striking a +balance between global choreographic patterns and local motion quality and +expressiveness. Extensive experiments validate the efficacy of our method. + +
+
+ comment: Accepted by CVPR2024, Project page: + https://li-ronghui.github.io/lodge +
+
+
+
+
+ + ♻ ☆ Image Captioning in news report scenario + + +
+ Image captioning strives to generate pertinent captions for specified images, +situating itself at the crossroads of Computer Vision (CV) and Natural Language +Processing (NLP). This endeavor is of paramount importance with far-reaching +applications in recommendation systems, news outlets, social media, and beyond. +Particularly within the realm of news reporting, captions are expected to +encompass detailed information, such as the identities of celebrities captured +in the images. However, much of the existing body of work primarily centers +around understanding scenes and actions. In this paper, we explore the realm of +image captioning specifically tailored for celebrity photographs, illustrating +its broad potential for enhancing news industry practices. This exploration +aims to augment automated news content generation, thereby facilitating a more +nuanced dissemination of information. Our endeavor points toward a broader horizon, +enriching the narrative in news reporting through a more intuitive image +captioning framework. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ LoCo: Locally Constrained Training-Free Layout-to-Image Synthesis + + +
+ Recent text-to-image diffusion models have reached an unprecedented level in +generating high-quality images. However, their exclusive reliance on textual +prompts often falls short in precise control of image compositions. In this +paper, we propose LoCo, a training-free approach for layout-to-image Synthesis +that excels in producing high-quality images aligned with both textual prompts +and layout instructions. Specifically, we introduce a Localized Attention +Constraint (LAC), leveraging semantic affinity between pixels in self-attention +maps to create precise representations of desired objects and effectively +ensure the accurate placement of objects in designated regions. We further +propose a Padding Token Constraint (PTC) to leverage the semantic information +embedded in previously neglected padding tokens, improving the consistency +between object appearance and layout instructions. LoCo seamlessly integrates +into existing text-to-image and layout-to-image models, enhancing their +performance in spatial control and addressing semantic failures observed in +prior methods. Extensive experiments showcase the superiority of our approach, +surpassing existing state-of-the-art training-free layout-to-image methods both +qualitatively and quantitatively across multiple benchmarks. + +
+
+ comment: Demo: https://huggingface.co/spaces/Pusheen/LoCo; Project page: + https://momopusheen.github.io/LoCo/ +
+
+
+
+
+ + ♻ ☆ $\texttt{NePhi}$: Neural Deformation Fields for Approximately + Diffeomorphic Medical Image Registration + + +
+ This work proposes NePhi, a generalizable neural deformation model which +results in approximately diffeomorphic transformations. In contrast to the +predominant voxel-based transformation fields used in learning-based +registration approaches, NePhi represents deformations functionally, leading to +great flexibility within the design space of memory consumption during training +and inference, inference time, registration accuracy, as well as transformation +regularity. Specifically, NePhi 1) requires less memory compared to voxel-based +learning approaches, 2) improves inference speed by predicting latent codes, +compared to current existing neural deformation based registration approaches +that \emph{only} rely on optimization, 3) improves accuracy via instance +optimization, and 4) shows excellent deformation regularity which is highly +desirable for medical image registration. We demonstrate the performance of +NePhi on a 2D synthetic dataset as well as for real 3D lung registration. Our +results show that NePhi can match the accuracy of voxel-based representations +in a single-resolution registration setting. For multi-resolution registration, +our method matches the accuracy of current SOTA learning-based registration +approaches with instance optimization while reducing memory requirements by a +factor of five. + +
+
+
+
+
+ + ♻ ☆ A Novel Approach to Industrial Defect Generation through Blended Latent + Diffusion Model with Online Adaptation + + +
+ Effectively addressing the challenge of industrial Anomaly Detection (AD) +necessitates an ample supply of defective samples, a constraint often hindered +by their scarcity in industrial contexts. This paper introduces a novel +algorithm designed to augment defective samples, thereby enhancing AD +performance. The proposed method tailors the blended latent diffusion model for +defect sample generation, employing a diffusion model to generate defective +samples in the latent space. A feature editing process, controlled by a +``trimap" mask and text prompts, refines the generated samples. The image +generation inference process is structured into three stages: a free diffusion +stage, an editing diffusion stage, and an online decoder adaptation stage. This +sophisticated inference strategy yields high-quality synthetic defective +samples with diverse pattern variations, leading to significantly improved AD +accuracies based on the augmented training set. Specifically, on the widely +recognized MVTec AD dataset, the proposed method elevates the state-of-the-art +(SOTA) performance of AD with augmented data by 1.5%, 1.9%, and 3.1% for AD +metrics AP, IAP, and IAP90, respectively. The implementation code of this work +can be found at the GitHub repository +https://github.com/GrandpaXun242/AdaBLDM.git + +
+
+ comment: 13 pages,7 figures +
+
+
+
+
+ + ♻ ☆ X-Portrait: Expressive Portrait Animation with Hierarchical Motion + Attention + + +
+ We propose X-Portrait, an innovative conditional diffusion model tailored for +generating expressive and temporally coherent portrait animation. Specifically, +given a single portrait as appearance reference, we aim to animate it with +motion derived from a driving video, capturing both highly dynamic and subtle +facial expressions along with wide-range head movements. At its core, we +leverage the generative prior of a pre-trained diffusion model as the rendering +backbone, while achieving fine-grained head pose and expression control with +novel controlling signals within the framework of ControlNet. In contrast to +conventional coarse explicit controls such as facial landmarks, our motion +control module learns to interpret the dynamics directly from the original +driving RGB inputs. The motion accuracy is further enhanced with a patch-based +local control module that effectively enhances the motion attention to +small-scale nuances like eyeball positions. Notably, to mitigate the identity +leakage from the driving signals, we train our motion control modules with +scaling-augmented cross-identity images, ensuring maximized disentanglement +from the appearance reference modules. Experimental results demonstrate the +universal effectiveness of X-Portrait across a diverse range of facial +portraits and expressive driving sequences, and showcase its proficiency in +generating captivating portrait animations with consistently maintained +identity characteristics. + +
+
+
+
+
+ + ♻ ☆ VMRNN: Integrating Vision Mamba and LSTM for Efficient and Accurate + Spatiotemporal Forecasting + + +
+ Combining CNNs or ViTs with RNNs for spatiotemporal forecasting has yielded +unparalleled results in predicting temporal and spatial dynamics. However, +modeling extensive global information remains a formidable challenge; CNNs are +limited by their narrow receptive fields, and ViTs struggle with the intensive +computational demands of their attention mechanisms. The emergence of recent +Mamba-based architectures has been met with enthusiasm for their exceptional +long-sequence modeling capabilities, surpassing established vision models in +efficiency and accuracy, which motivates us to develop an innovative +architecture tailored for spatiotemporal forecasting. In this paper, we propose +the VMRNN cell, a new recurrent unit that integrates the strengths of Vision +Mamba blocks with LSTM. We construct a network centered on VMRNN cells to +tackle spatiotemporal prediction tasks effectively. Our extensive evaluations +show that our proposed approach secures competitive results on a variety of +tasks while maintaining a smaller model size. Our code is available at +https://github.com/yyyujintang/VMRNN-PyTorch. + +
+
+ comment: 11 pages, 7 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ DiffCast: A Unified Framework via Residual Diffusion for Precipitation + Nowcasting CVPR 2024 + + +
+ Precipitation nowcasting is an important spatio-temporal prediction task to +predict radar echo sequences based on current observations, which can +serve both meteorological science and smart city applications. Due to the +chaotic evolution nature of the precipitation systems, it is a very challenging +problem. Previous studies address the problem either from the perspectives of +deterministic modeling or probabilistic modeling. However, their predictions +suffer from blurriness, the fading away of high-value echoes, and inaccurate +positions. The root cause of these issues is that the chaotically evolving +precipitation systems are not appropriately modeled. Inspired by the nature of +the systems, we propose to decompose and model them from the perspective of +global deterministic motion and local stochastic variations with residual +mechanism. A unified and flexible framework that can equip any type of +spatio-temporal models is proposed based on residual diffusion, which +effectively tackles the shortcomings of previous methods. Extensive +experimental results on four publicly available radar datasets demonstrate the +effectiveness and superiority of the proposed framework, compared to +state-of-the-art techniques. Our code is publicly available at +https://github.com/DeminYu98/DiffCast. + +
+
+ comment: CVPR 2024; https://github.com/DeminYu98/DiffCast +
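+
+ In spirit, the residual formulation lets a deterministic backbone predict the
+ bulk of the motion while a stochastic model refines the remainder. The
+ schematic below uses stand-in components (a persistence baseline and Gaussian
+ residuals), not the actual DiffCast modules:
+
+     import numpy as np
+
+     def forecast(past_frames, deterministic_model, residual_sampler):
+         """Schematic residual forecast: global deterministic part + sampled residual."""
+         mu = deterministic_model(past_frames)          # coarse deterministic prediction
+         residual = residual_sampler(past_frames, mu)   # diffusion model would sample this
+         return mu + residual
+
+     past = np.zeros((4, 32, 32))                       # four past radar frames
+     pred = forecast(past,
+                     deterministic_model=lambda x: x[-1:],   # persistence baseline
+                     residual_sampler=lambda x, mu: 0.1 * np.random.randn(*mu.shape))
+     print(pred.shape)
+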
+
+
+
+
+ + ♻ ☆ Diffusion Models Generate Images Like Painters: an Analytical Theory of + Outline First, Details Later NeurIPS23 + + +
+ How do diffusion generative models convert pure noise into meaningful images? +In a variety of pretrained diffusion models (including conditional latent space +models like Stable Diffusion), we observe that the reverse diffusion process +that underlies image generation has the following properties: (i) individual +trajectories tend to be low-dimensional and resemble 2D `rotations'; (ii) +high-variance scene features like layout tend to emerge earlier, while +low-variance details tend to emerge later; and (iii) early perturbations tend +to have a greater impact on image content than later perturbations. To +understand these phenomena, we derive and study a closed-form solution to the +probability flow ODE for a Gaussian distribution, which shows that the reverse +diffusion state rotates towards a gradually-specified target on the image +manifold. It also shows that generation involves first committing to an +outline, and then to finer and finer details. We find that this solution +accurately describes the initial phase of image generation for pretrained +models, and can in principle be used to make image generation more efficient by +skipping reverse diffusion steps. Finally, we use our solution to characterize +the image manifold in Stable Diffusion. Our viewpoint reveals an unexpected +similarity between generation by GANs and diffusion and provides a conceptual +link between diffusion and image retrieval. + +
+
+ comment: 44 pages, 28 figures. A briefer version was presented at NeurIPS23 + Workshop on Diffusion Models [arXiv:2311.10892] +
+
+
+
+
+ + ♻ ☆ Confidence-Triggered Detection: Accelerating Real-time + Tracking-by-detection Systems + + +
+ Real-time object tracking necessitates a delicate balance between speed and +accuracy, a challenge exacerbated by the computational demands of deep learning +methods. In this paper, we propose Confidence-Triggered Detection (CTD), an +innovative approach that strategically bypasses object detection for frames +closely resembling intermediate states, leveraging tracker confidence scores. +CTD not only enhances tracking speed but also preserves accuracy, surpassing +existing tracking algorithms. Through extensive evaluation across various +tracker confidence thresholds, we identify an optimal trade-off between +tracking speed and accuracy, providing crucial insights for parameter +fine-tuning and enhancing CTD's practicality in real-world scenarios. Our +experiments across diverse detection models underscore the robustness and +versatility of the CTD framework, demonstrating its potential to enable +real-time tracking in resource-constrained environments. + +
+
+ comment: 9 pages, 5 figures, 1 table +
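+
+ The control flow behind confidence-triggered detection is easy to sketch. The
+ detector/tracker interface and the threshold below are stand-ins for
+ illustration, not the paper's exact API:
+
+     def track_video(frames, detector, tracker, conf_threshold=0.6):
+         """Run the expensive detector only when the tracker's confidence drops."""
+         detections, results = None, []
+         for frame in frames:
+             if detections is None or tracker.confidence < conf_threshold:
+                 detections = detector(frame)    # expensive: full detection pass
+                 tracker.reset(detections)
+             detections = tracker.update(frame)  # cheap: propagate boxes between detections
+             results.append(detections)
+         return results
+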
+
+
+
+
+ + ♻ ☆ Troika: Multi-Path Cross-Modal Traction for Compositional Zero-Shot + Learning CVPR 2024 + + +
+ Recent compositional zero-shot learning (CZSL) methods adapt pre-trained +vision-language models (VLMs) by constructing trainable prompts only for +composed state-object pairs. Relying on learning the joint representation of +seen compositions, these methods ignore the explicit modeling of the state and +object, thus limiting the exploitation of pre-trained knowledge and +generalization to unseen compositions. With a particular focus on the +universality of the solution, in this work, we propose a novel paradigm for +CZSL models that establishes three identification branches (i.e., Multi-Path) +to jointly model the state, object, and composition. The presented Troika is +our implementation that aligns the branch-specific prompt representations with +decomposed visual features. To calibrate the bias between semantically similar +multi-modal representations, we further devise a Cross-Modal Traction module +into Troika that shifts the prompt representation towards the current visual +content. We conduct extensive experiments on three popular benchmarks, where +our method significantly outperforms existing methods in both closed-world and +open-world settings. The code will be available at +https://github.com/bighuang624/Troika. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Joint Learning Neuronal Skeleton and Brain Circuit Topology with + Permutation Invariant Encoders for Neuron Classification AAAI 2024 + + +
+ Determining the types of neurons within a nervous system plays a significant +role in the analysis of brain connectomics and the investigation of +neurological diseases. However, the efficiency of utilizing anatomical, +physiological, or molecular characteristics of neurons is relatively low and +costly. With the advancements in electron microscopy imaging and analysis +techniques for brain tissue, we are able to obtain whole-brain connectomes +consisting of high-resolution neuronal morphology and connectivity information. +However, few models are built based on such data for automated neuron +classification. In this paper, we propose NeuNet, a framework that combines +morphological information of neurons obtained from the skeleton and topological +information between neurons obtained from the neural circuit. Specifically, NeuNet +consists of three components, namely Skeleton Encoder, Connectome Encoder, and +Readout Layer. Skeleton Encoder integrates the local information of neurons in +a bottom-up manner, applying a one-dimensional convolution to the neural skeleton's +point data; Connectome Encoder uses a graph neural network to capture the +topological information of the neural circuit; finally, Readout Layer fuses these two +types of information and outputs classification results. We reprocess and +release two new datasets for the neuron classification task from volume electron +microscopy (VEM) images of human brain cortex and Drosophila brain. Experiments +on these two datasets demonstrated the effectiveness of our model with accuracy +of 0.9169 and 0.9363, respectively. Code and data are available at: +https://github.com/WHUminghui/NeuNet. + +
+
+ comment: Accepted by AAAI 2024 +
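+
+ A compressed sketch of the three-component design described above, with toy
+ dimensions and a single normalized-adjacency graph-convolution step standing
+ in for the Connectome Encoder (this is not the released NeuNet code):
+
+     import torch
+     import torch.nn as nn
+
+     class ToyNeuronClassifier(nn.Module):
+         def __init__(self, point_dim=3, hidden=32, n_classes=10):
+             super().__init__()
+             self.skeleton_enc = nn.Sequential(   # 1D conv over the skeleton point sequence
+                 nn.Conv1d(point_dim, hidden, kernel_size=5, padding=2), nn.ReLU(),
+                 nn.AdaptiveAvgPool1d(1))
+             self.graph_lin = nn.Linear(hidden, hidden)   # one graph-convolution step
+             self.readout = nn.Linear(2 * hidden, n_classes)
+
+         def forward(self, points, node_feats, adj):
+             # points: (B, 3, N) skeleton samples; node_feats: (B, hidden); adj: (B, B) circuit
+             skel = self.skeleton_enc(points).squeeze(-1)                 # (B, hidden)
+             deg = adj.sum(-1, keepdim=True).clamp(min=1)
+             topo = torch.relu(self.graph_lin((adj / deg) @ node_feats))  # neighbour aggregation
+             return self.readout(torch.cat([skel, topo], dim=-1))
+
+     model = ToyNeuronClassifier()
+     logits = model(torch.randn(8, 3, 200), torch.randn(8, 32), torch.eye(8))
+     print(logits.shape)  # torch.Size([8, 10])
+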
+
+
+
+
+ + ♻ ☆ Doubly Abductive Counterfactual Inference for Text-based Image Editing CVPR 2024 + + +
+ We study text-based image editing (TBIE) of a single image by counterfactual +inference because it is an elegant formulation to precisely address the +requirement: the edited image should retain the fidelity of the original one. +Through the lens of the formulation, we find that the crux of TBIE is that +existing techniques hardly achieve a good trade-off between editability and +fidelity, mainly due to the overfitting of the single-image fine-tuning. To +this end, we propose a Doubly Abductive Counterfactual inference framework +(DAC). We first parameterize an exogenous variable as a UNet LoRA, whose +abduction can encode all the image details. Second, we abduct another exogenous +variable parameterized by a text encoder LoRA, which recovers the lost +editability caused by the overfitted first abduction. Thanks to the second +abduction, which exclusively encodes the visual transition from post-edit to +pre-edit, its inversion -- subtracting the LoRA -- effectively reverts pre-edit +back to post-edit, thereby accomplishing the edit. Through extensive +experiments, our DAC achieves a good trade-off between editability and +fidelity. Thus, we can support a wide spectrum of user editing intents, +including addition, removal, manipulation, replacement, style transfer, and +facial change, which are extensively validated in both qualitative and +quantitative evaluations. Codes are in https://github.com/xuesong39/DAC. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ PKU-DyMVHumans: A Multi-View Video Benchmark for High-Fidelity Dynamic + Human Modeling + + +
+ High-quality human reconstruction and photo-realistic rendering of a dynamic +scene is a long-standing problem in computer vision and graphics. Despite +considerable efforts invested in developing various capture systems and +reconstruction algorithms, recent advancements still struggle with loose or +oversized clothing and overly complex poses. In part, this is due to the +challenges of acquiring high-quality human datasets. To facilitate the +development of these fields, in this paper, we present PKU-DyMVHumans, a +versatile human-centric dataset for high-fidelity reconstruction and rendering +of dynamic human scenarios from dense multi-view videos. It comprises 8.2 +million frames captured by more than 56 synchronized cameras across diverse +scenarios. These sequences comprise 32 human subjects across 45 different +scenarios, each with a highly detailed appearance and realistic human motion. +Inspired by recent advancements in neural radiance field (NeRF)-based scene +representations, we carefully set up an off-the-shelf framework that makes it easy to +run state-of-the-art NeRF-based implementations and benchmark them on the +PKU-DyMVHumans dataset. This paves the way for various applications like +fine-grained foreground/background decomposition, high-quality human +reconstruction, and photo-realistic novel view synthesis of a dynamic scene. +Extensive studies are performed on the benchmark, demonstrating new +observations and challenges that emerge from using such high-fidelity dynamic +data. The dataset is available at: https://pku-dymvhumans.github.io. + +
+
+
+
+
+ + ♻ ☆ Preserve Your Own Correlation: A Noise Prior for Video Diffusion Models ICCV 2023 + + +
+ Despite tremendous progress in generating high-quality images using diffusion +models, synthesizing a sequence of animated frames that are both photorealistic +and temporally coherent is still in its infancy. While off-the-shelf +billion-scale datasets for image generation are available, collecting similar +video data of the same scale is still challenging. Also, training a video +diffusion model is computationally much more expensive than its image +counterpart. In this work, we explore finetuning a pretrained image diffusion +model with video data as a practical solution for the video synthesis task. We +find that naively extending the image noise prior to a video noise prior in video +diffusion leads to sub-optimal performance. Our carefully designed video noise +prior leads to substantially better performance. Extensive experimental +validation shows that our model, Preserve Your Own Correlation (PYoCo), attains +SOTA zero-shot text-to-video results on the UCF-101 and MSR-VTT benchmarks. It +also achieves SOTA video generation quality on the small-scale UCF-101 +benchmark with a $10\times$ smaller model using significantly less computation +than the prior art. + +
+
+ comment: ICCV 2023. Project webpage: + https://research.nvidia.com/labs/dir/pyoco +
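+
+ A generic way to build a temporally correlated video noise prior (a mixed
+ shared-plus-independent construction; not necessarily PYoCo's exact
+ formulation) keeps each frame's noise marginally unit-variance Gaussian while
+ correlating it across frames:
+
+     import torch
+
+     def correlated_video_noise(n_frames, shape, alpha=1.0):
+         """Mixed noise: eps_i = (shared + alpha * independent_i) / sqrt(1 + alpha**2)."""
+         shared = torch.randn(1, *shape)              # one sample reused by every frame
+         independent = torch.randn(n_frames, *shape)  # fresh sample per frame
+         return (shared + alpha * independent) / (1 + alpha ** 2) ** 0.5
+
+     noise = correlated_video_noise(16, (4, 32, 32))
+     print(noise.shape, round(noise.var().item(), 3))  # variance stays close to 1
+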
+
+
+
+
+ + ♻ ☆ MEDDAP: Medical Dataset Enhancement via Diversified Augmentation + Pipeline MICCAI-2024 + + +
+ The effectiveness of Deep Neural Networks (DNNs) heavily relies on the +abundance and accuracy of available training data. However, collecting and +annotating data on a large scale is often both costly and time-intensive, +particularly in medical cases where practitioners are already occupied with +their duties. Moreover, ensuring that the model remains robust across various +scenarios of image capture is crucial in medical domains, especially when +dealing with ultrasound images that vary based on the settings of different +devices and the manual operation of the transducer. To address this challenge, +we introduce a novel pipeline called MEDDAP, which leverages Stable Diffusion +(SD) models to augment existing small datasets by automatically generating new +informative labeled samples. Pretrained checkpoints for SD are typically based +on natural images, and training them for medical images requires significant +GPU resources due to their heavy parameters. To overcome this challenge, we +introduce USLoRA (Ultrasound Low-Rank Adaptation), a novel fine-tuning method +tailored specifically for ultrasound applications. USLoRA allows for selective +fine-tuning of weights within SD, requiring fewer than 0.1\% of parameters +compared to fully fine-tuning only the UNet portion of SD. To enhance dataset +diversity, we incorporate different adjectives into the generation process +prompts, thereby desensitizing the classifiers to intensity changes across +different images. This approach is inspired by clinicians' decision-making +processes regarding breast tumors, where tumor shape often plays a more crucial +role than intensity. In conclusion, our pipeline not only outperforms +classifiers trained on the original dataset but also demonstrates superior +performance when encountering unseen datasets. The source code is available at +https://github.com/yasamin-med/MEDDAP. + +
+
+ comment: Submitted to MICCAI 2024 +
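+
+ The low-rank adaptation mechanism that USLoRA builds on can be sketched
+ generically: a frozen pretrained weight is augmented with a small trainable
+ low-rank update, so only a tiny fraction of parameters is trained. This is a
+ generic LoRA wrapper for illustration, not the authors' USLoRA code:
+
+     import torch
+     import torch.nn as nn
+
+     class LoRALinear(nn.Module):
+         """Frozen linear layer plus a trainable low-rank update: W x + scale * B A x."""
+         def __init__(self, base: nn.Linear, rank=4, alpha=8.0):
+             super().__init__()
+             self.base = base
+             for p in self.base.parameters():
+                 p.requires_grad = False          # the pretrained weight stays frozen
+             self.lora_a = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
+             self.lora_b = nn.Parameter(torch.zeros(base.out_features, rank))
+             self.scale = alpha / rank
+
+         def forward(self, x):
+             return self.base(x) + (x @ self.lora_a.T @ self.lora_b.T) * self.scale
+
+     layer = LoRALinear(nn.Linear(320, 320), rank=4)
+     trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
+     total = sum(p.numel() for p in layer.parameters())
+     print(f"trainable fraction: {trainable / total:.2%}")
+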
+
+
+
+
+ + ♻ ☆ HOOD: Real-Time Human Presence and Out-of-Distribution Detection Using + FMCW Radar + + +
+ Detecting human presence indoors with millimeter-wave frequency-modulated +continuous-wave (FMCW) radar faces challenges from both moving and stationary +clutter. This work proposes a robust and real-time capable human presence and +out-of-distribution (OOD) detection method using 60 GHz short-range FMCW radar. +HOOD solves the human presence and OOD detection problems simultaneously in a +single pipeline. Our solution relies on a reconstruction-based architecture and +works with radar macro and micro range-Doppler images (RDIs). HOOD aims to +accurately detect the presence of humans in the presence or absence of moving +and stationary disturbers. Since HOOD is also an OOD detector, it aims to +detect moving or stationary clutters as OOD in humans' absence and predicts the +current scene's output as "no presence." HOOD performs well in diverse +scenarios, demonstrating its effectiveness across different human activities +and situations. On our dataset collected with a 60 GHz short-range FMCW radar, +we achieve an average AUROC of 94.36%. Additionally, our extensive evaluations +and experiments demonstrate that HOOD outperforms state-of-the-art (SOTA) OOD +detection methods in terms of common OOD detection metrics. Importantly, HOOD +also perfectly fits on Raspberry Pi 3B+ with an ARM Cortex-A53 CPU, which +showcases its versatility across different hardware environments. Videos of our +human presence detection experiments are available at: +https://muskahya.github.io/HOOD + +
+
+ comment: 10 pages, 2 figures, project page: https://muskahya.github.io/HOOD +
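+
+ The decision logic of a reconstruction-based presence/OOD detector like the
+ one described can be sketched as follows; the autoencoder, classifier, and
+ threshold are stand-ins, not the HOOD models:
+
+     import numpy as np
+
+     def presence_decision(rdi, autoencoder, presence_classifier, ood_threshold=0.05):
+         """Treat poorly reconstructed (unfamiliar) inputs as OOD -> 'no presence'."""
+         recon = autoencoder(rdi)
+         error = np.mean((rdi - recon) ** 2)      # reconstruction error as OOD score
+         if error > ood_threshold:
+             return "no presence"                 # clutter the model has not learned to encode
+         return "presence" if presence_classifier(rdi) else "no presence"
+
+     rdi = np.random.rand(32, 32)                 # toy range-Doppler image
+     print(presence_decision(rdi, autoencoder=lambda x: x * 0.0,
+                             presence_classifier=lambda x: True))
+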
+
+
+
+
+ + ♻ ☆ HIVE: Harnessing Human Feedback for Instructional Visual Editing CVPR + + +
+ Incorporating human feedback has been shown to be crucial to align text +generated by large language models to human preferences. We hypothesize that +state-of-the-art instructional image editing models, where outputs are +generated based on an input image and an editing instruction, could similarly +benefit from human feedback, as their outputs may not adhere to the correct +instructions and preferences of users. In this paper, we present a novel +framework to harness human feedback for instructional visual editing (HIVE). +Specifically, we collect human feedback on the edited images and learn a reward +function to capture the underlying user preferences. We then introduce scalable +diffusion model fine-tuning methods that can incorporate human preferences +based on the estimated reward. Besides, to mitigate the bias brought by the +limitation of data, we contribute a new 1M training dataset, a 3.6K reward +dataset for rewards learning, and a 1K evaluation dataset to boost the +performance of instructional image editing. We conduct extensive empirical +experiments quantitatively and qualitatively, showing that HIVE is favored over +previous state-of-the-art instructional image editing approaches by a large +margin. + +
+
+ comment: In CVPR, 2024 +
+
+
+
+
+ + ♻ ☆ ERM++: An Improved Baseline for Domain Generalization + + +
+ Domain Generalization (DG) measures a classifier's ability to generalize to +new distributions of data it was not trained on. Recent work has shown that a +hyperparameter-tuned Empirical Risk Minimization (ERM) training procedure, that +is, simply minimizing the empirical risk on the source domains, can outperform +most existing DG methods. ERM has achieved such strong results while only +tuning hyper-parameters such as learning rate, weight decay, batch size, and +dropout. However, there are additional hyperparameters which further limit +overfitting and catastrophic forgetting. We therefore focus on tuning +previously untuned hyper-parameters, including training amount, initialization, +and additional regularizers. We call the resulting stronger baseline ERM++. +ERM++ improves the performance of DG by over 5% compared to prior ERM baselines +on a standard benchmark of 5 datasets with a ResNet-50 and over 15% with a +ViT-B/16, and outperforms all SOTA methods on DomainBed with both +architectures. We also explore the relationship between DG performance and +similarity to pre-training data, and find that similarity to pre-training data +distributions is an important driver of performance, but that ERM++ with +stronger initializations can deliver strong performance even on dissimilar +datasets. Code is released at https://github.com/piotr-teterwak/erm_plusplus. + +
+
+ comment: An improved baseline for Domain Generalization +
+
+
+
+
+ + ♻ ☆ Step-Calibrated Diffusion for Biomedical Optical Image Restoration + + +
+ High-quality, high-resolution medical imaging is essential for clinical care. +Raman-based biomedical optical imaging uses non-ionizing infrared radiation to +evaluate human tissues in real time and is used for early cancer detection, +brain tumor diagnosis, and intraoperative tissue analysis. Unfortunately, +optical imaging is vulnerable to image degradation due to laser scattering and +absorption, which can result in diagnostic errors and misguided treatment. +Restoration of optical images is a challenging computer vision task because the +sources of image degradation are multi-factorial, stochastic, and +tissue-dependent, preventing a straightforward method to obtain paired +low-quality/high-quality data. Here, we present Restorative Step-Calibrated +Diffusion (RSCD), an unpaired image restoration method that views the image +restoration problem as completing the finishing steps of a diffusion-based +image generation task. RSCD uses a step calibrator model to dynamically +determine the severity of image degradation and the number of steps required to +complete the reverse diffusion process for image restoration. RSCD outperforms +other widely used unpaired image restoration methods on both image quality and +perceptual evaluation metrics for restoring optical images. Medical imaging +experts consistently prefer images restored using RSCD in blinded comparison +experiments and report minimal to no hallucinations. Finally, we show that RSCD +improves performance on downstream clinical imaging tasks, including automated +brain tumor diagnosis and deep tissue imaging. Our code is available at +https://github.com/MLNeurosurg/restorative_step-calibrated_diffusion. + +
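+
+ Schematically, step-calibrated restoration runs only as many reverse-diffusion
+ steps as the estimated degradation calls for. The calibrator and denoiser
+ below are stand-in callables, not the actual RSCD networks:
+
+     def restore(image, step_calibrator, denoiser, max_steps=1000):
+         """Partial reverse diffusion: heavier degradation -> more denoising steps."""
+         n_steps = min(step_calibrator(image), max_steps)  # degradation -> step count
+         x = image
+         for t in range(n_steps, 0, -1):
+             x = denoiser(x, t)                            # one reverse-diffusion update
+         return x
+
+     # toy usage with trivial stand-ins
+     print(restore([0.5, 0.2], step_calibrator=lambda img: 3, denoiser=lambda x, t: x))
+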
+
+
+
+
+ + ♻ ☆ Visual Whole-Body Control for Legged Loco-Manipulation + + +
+ We study the problem of mobile manipulation using legged robots equipped with +an arm, namely legged loco-manipulation. The robot legs, while usually utilized +for mobility, offer an opportunity to amplify the manipulation capabilities by +conducting whole-body control. That is, the robot can control the legs and the +arm at the same time to extend its workspace. We propose a framework that can +conduct the whole-body control autonomously with visual observations. Our +approach, namely Visual Whole-Body Control (VBC), is composed of a low-level +policy using all degrees of freedom to track the end-effector manipulator +position and a high-level policy proposing the end-effector position based on +visual inputs. We train both levels of policies in simulation and perform +Sim2Real transfer for real robot deployment. We perform extensive experiments +and show significant improvements over baselines in picking up diverse objects +in different configurations (heights, locations, orientations) and +environments. Project page: https://wholebody-b1.github.io + +
+
+ comment: The first two authors contribute equally. Project page: + https://wholebody-b1.github.io +
+
+
+
+
+ + ♻ ☆ CLAMP: Contrastive LAnguage Model Prompt-tuning + + +
+ Large language models (LLMs) have emerged as powerful general-purpose +interfaces for many machine learning problems. Recent work has adapted LLMs to +generative visual tasks like image captioning, visual question answering, and +visual chat, using a relatively small amount of instruction-tuning data. In +this paper, we explore whether modern LLMs can also be adapted to classifying +an image into a set of categories. First, we evaluate multimodal LLMs that are +tuned for generative tasks on zero-shot image classification and find that +their performance is far below that of specialized models like CLIP. We then +propose an approach for light fine-tuning of LLMs using the same contrastive +image-caption matching objective as CLIP. Our results show that LLMs can, +indeed, achieve good image classification performance when adapted this way. +Our approach beats state-of-the-art mLLMs by 13% and slightly outperforms +contrastive learning with a custom text model, while also retaining the LLM's +generative abilities. LLM initialization appears to particularly help +classification in domains under-represented in the visual pre-training data. + +
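+ The fine-tuning objective described above is the standard symmetric image-text
+contrastive (InfoNCE) loss used by CLIP; a minimal sketch is shown below,
+assuming the vision encoder and the LLM text head already produce fixed-size
+embeddings (function and argument names are illustrative).
+
+import torch
+import torch.nn.functional as F
+
+def contrastive_image_caption_loss(image_emb, text_emb, temperature=0.07):
+    # image_emb, text_emb: (B, D) embeddings from the vision encoder and the
+    # lightly fine-tuned LLM text head; matching pairs share the same row index.
+    image_emb = F.normalize(image_emb, dim=-1)
+    text_emb = F.normalize(text_emb, dim=-1)
+    logits = image_emb @ text_emb.t() / temperature      # (B, B) similarity matrix
+    targets = torch.arange(image_emb.size(0), device=image_emb.device)
+    # Symmetric cross-entropy: image-to-text and text-to-image directions.
+    return 0.5 * (F.cross_entropy(logits, targets) +
+                  F.cross_entropy(logits.t(), targets))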
+
+
+
+
+ + ♻ ☆ Fast Point Cloud to Mesh Reconstruction for Deformable Object Tracking + + +
+ The world around us is full of soft objects we perceive and deform with
+dexterous hand movements. For a robotic hand to control soft objects, it has to
+acquire online state feedback of the deforming object. While RGB-D cameras can
+collect occluded point clouds at a rate of 30Hz, this does not represent a
+continuously trackable object surface. Hence, in this work, we developed a
+method that takes as input a template mesh, i.e., the mesh of an object in its
+non-deformed state, and a deformed point cloud of the same object, and then
+shapes the template mesh such that it matches the deformed point cloud. The
+reconstruction of meshes from point clouds has long been studied in the field
+of computer graphics under 3D and 4D reconstruction; however, both lack the
+speed and generalizability needed for robotics applications. Our model is
+designed using a point cloud auto-encoder and a Real-NVP architecture. Our
+trained model can perform mesh reconstruction and tracking at a rate of 58Hz on
+a template mesh of 3000 vertices and a deformed point cloud of 5000 points and
+is generalizable to the deformations of six different object categories which
+are assumed to be made of soft material in our experiments (scissors, hammer,
+foam brick, cleanser bottle, orange, and dice). The object meshes are taken
+from the YCB benchmark dataset. An instance of a downstream application can be
+the control algorithm for a robotic hand that requires online feedback from the
+state of the manipulated object, which would allow online grasp adaptation in a
+closed-loop manner. Furthermore, the tracking capacity of our method can help
+in the system identification of deforming objects in a marker-free approach. In
+future work, we will extend our trained model to generalize beyond six object
+categories and additionally to real-world deforming point clouds.
+
+
+
+ comment: 8 pages with appendix, 16 figures
+
+
+
+
+
+ + ♻ ☆ SplaTAM: Splat, Track & Map 3D Gaussians for Dense RGB-D SLAM CVPR 2024 + + +
+ Dense simultaneous localization and mapping (SLAM) is crucial for robotics +and augmented reality applications. However, current methods are often hampered +by the non-volumetric or implicit way they represent a scene. This work +introduces SplaTAM, an approach that, for the first time, leverages explicit +volumetric representations, i.e., 3D Gaussians, to enable high-fidelity +reconstruction from a single unposed RGB-D camera, surpassing the capabilities +of existing methods. SplaTAM employs a simple online tracking and mapping +system tailored to the underlying Gaussian representation. It utilizes a +silhouette mask to elegantly capture the presence of scene density. This +combination enables several benefits over prior representations, including fast +rendering and dense optimization, quickly determining if areas have been +previously mapped, and structured map expansion by adding more Gaussians. +Extensive experiments show that SplaTAM achieves up to 2x superior performance +in camera pose estimation, map construction, and novel-view synthesis over +existing methods, paving the way for more immersive high-fidelity SLAM +applications. + +
+
+ comment: CVPR 2024. Website: https://spla-tam.github.io/ +
+
+
+
+
+ + ♻ ☆ FoundationPose: Unified 6D Pose Estimation and Tracking of Novel Objects + + +
+ We present FoundationPose, a unified foundation model for 6D object pose
+estimation and tracking, supporting both model-based and model-free setups. Our
+approach can be instantly applied at test-time to a novel object without
+fine-tuning, as long as its CAD model is given, or a small number of reference
+images are captured. We bridge the gap between these two setups with a neural
+implicit representation that allows for effective novel view synthesis, keeping
+the downstream pose estimation modules invariant under the same unified
+framework. Strong generalizability is achieved via large-scale synthetic
+training, aided by a large language model (LLM), a novel transformer-based
+architecture, and a contrastive learning formulation. Extensive evaluation on
+multiple public datasets involving challenging scenarios and objects indicates
+our unified approach outperforms existing methods specialized for each task by
+a large margin. In addition, it even achieves comparable results to
+instance-level methods despite the reduced assumptions. Project page:
+https://nvlabs.github.io/FoundationPose/
+
+
+
+
+
+
+ + ♻ ☆ Living Scenes: Multi-object Relocalization and Reconstruction in + Changing 3D Environments CVPR 2024 + + +
+ Research into dynamic 3D scene understanding has primarily focused on
+short-term change tracking from dense observations, while little attention has
+been paid to long-term changes with sparse observations. We address this gap
+with MoRE, a novel approach for multi-object relocalization and reconstruction
+in evolving environments. We view these environments as "living scenes" and
+consider the problem of transforming scans taken at different points in time
+into a 3D reconstruction of the object instances, whose accuracy and
+completeness increase over time. At the core of our method lies an
+SE(3)-equivariant representation in a single encoder-decoder network, trained
+on synthetic data. This representation enables us to seamlessly tackle instance
+matching, registration, and reconstruction. We also introduce a joint
+optimization algorithm that facilitates the accumulation of point clouds
+originating from the same instance across multiple scans taken at different
+points in time. We validate our method on synthetic and real-world data and
+demonstrate state-of-the-art performance both end-to-end and on individual
+subtasks.
+
+
+
+ comment: CVPR 2024 camera-ready +
+
+
+
+
+ + ♻ ☆ Object Detectors in the Open Environment: Challenges, Solutions, and + Outlook + + +
+ With the emergence of foundation models, deep learning-based object detectors
+have shown practical usability in closed-set scenarios. However, for real-world
+tasks, object detectors often operate in open environments, where crucial
+factors (e.g., data distribution, objective) that influence model learning are
+often changing. The dynamic and intricate nature of the open environment poses
+novel and formidable challenges to object detectors. Unfortunately, current
+research on object detectors in open environments lacks a comprehensive
+analysis of their distinctive characteristics, challenges, and corresponding
+solutions, which hinders their secure deployment in critical real-world
+scenarios. This paper aims to bridge this gap by conducting a comprehensive
+review and analysis of object detectors in open environments. We first identify
+limitations of key structural components within the existing detection pipeline
+and then propose an open-environment object detector challenge framework that
+includes four quadrants (i.e., out-of-domain, out-of-category, robust learning,
+and incremental learning) based on the dimensions of the data / target changes.
+For each quadrant of challenges in the proposed framework, we present a
+detailed description and systematic analysis of the overarching goals and core
+difficulties, systematically review the corresponding solutions, and benchmark
+their performance over multiple widely adopted datasets. In addition, we engage
+in a discussion of open problems and potential avenues for future research.
+This paper aims to provide a fresh, comprehensive, and systematic understanding
+of the challenges and solutions associated with open-environment object
+detectors, thus catalyzing the development of more solid applications in
+real-world scenarios. A project related to this survey can be found at
+https://github.com/LiangSiyuan21/OEOD_Survey.
+
+
+
+ comment: 32 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ MP5: A Multi-modal Open-ended Embodied System in Minecraft via Active + Perception CVPR2024 + + +
+ It is a long-standing goal to design an embodied system that can solve
+long-horizon open-world tasks in human-like ways. However, existing approaches
+usually struggle with compound difficulties caused by the logic-aware
+decomposition and context-aware execution of these tasks. To this end, we
+introduce MP5, an open-ended multimodal embodied system built upon the
+challenging Minecraft simulator, which can decompose feasible sub-objectives,
+design sophisticated situation-aware plans, and perform embodied action
+control, with frequent communication with a goal-conditioned active perception
+scheme. Specifically, MP5 is developed on top of recent advances in Multimodal
+Large Language Models (MLLMs), and the system is organized into functional
+modules that can be scheduled and coordinated to ultimately solve pre-defined
+context- and process-dependent tasks. Extensive experiments prove that MP5 can
+achieve a 22% success rate on difficult process-dependent tasks and a 91%
+success rate on tasks that heavily depend on the context. Moreover, MP5
+exhibits a remarkable ability to address many open-ended tasks that are
+entirely novel.
+
+
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ Learning from Synthetic Human Group Activities + + +
+ The study of complex human interactions and group activities has become a
+focal point in human-centric computer vision. However, progress in related
+tasks is often hindered by the challenges of obtaining large-scale labeled
+datasets from real-world scenarios. To address the limitation, we introduce
+M3Act, a synthetic data generator for multi-view multi-group multi-person human
+atomic actions and group activities. Powered by the Unity Engine, M3Act features
+multiple semantic groups, highly diverse and photorealistic images, and a
+comprehensive set of annotations, which facilitates the learning of
+human-centered tasks across single-person, multi-person, and multi-group
+conditions. We demonstrate the advantages of M3Act across three core
+experiments. The results suggest our synthetic dataset can significantly
+improve the performance of several downstream methods and replace real-world
+datasets to reduce cost. Notably, M3Act improves the state-of-the-art MOTRv2 on
+the DanceTrack dataset, leading to a jump on the leaderboard from 10th to 2nd
+place. Moreover, M3Act opens up new research directions in controllable 3D
+group activity generation. We define multiple metrics and propose a competitive
+baseline for the novel task. Our code and data are available at our project
+page: http://cjerry1243.github.io/M3Act.
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 214 + +
+
+
+ + ☆ Exploiting Priors from 3D Diffusion Models for RGB-Based One-Shot View + Planning IROS + 2024 + + +
+ Object reconstruction is relevant for many autonomous robotic tasks that +require interaction with the environment. A key challenge in such scenarios is +planning view configurations to collect informative measurements for +reconstructing an initially unknown object. One-shot view planning enables +efficient data collection by predicting view configurations and planning the +globally shortest path connecting all views at once. However, geometric priors +about the object are required to conduct one-shot view planning. In this work, +we propose a novel one-shot view planning approach that utilizes the powerful +3D generation capabilities of diffusion models as priors. By incorporating such +geometric priors into our pipeline, we achieve effective one-shot view planning +starting with only a single RGB image of the object to be reconstructed. Our +planning experiments in simulation and real-world setups indicate that our +approach balances well between object reconstruction quality and movement cost. + +
+
+ comment: Sicong Pan and Liren Jin contributed equally. Submitted to IROS 2024
+
+
+
+
+
+ + ☆ CurbNet: Curb Detection Framework Based on LiDAR Point Cloud + Segmentation + + +
+ Curb detection is an important function in intelligent driving and can be
+used to determine drivable areas of the road. However, curbs are difficult to
+detect due to the complex road environment. This paper introduces CurbNet, a
+novel framework for curb detection, leveraging point cloud segmentation.
+Addressing the dearth of comprehensive curb datasets and the absence of 3D
+annotations, we have developed the 3D-Curb dataset, encompassing 7,100 frames,
+which represents the largest and most categorically diverse collection of curb
+point clouds currently available. Recognizing that curbs are primarily
+characterized by height variations, our approach harnesses spatially-rich 3D
+point clouds for training. To tackle the challenges presented by the uneven
+distribution of curb features on the xy-plane and their reliance on z-axis
+high-frequency features, we introduce the multi-scale and channel attention
+(MSCA) module, a bespoke solution designed to optimize detection performance.
+Moreover, we propose an adaptive weighted loss function group, specifically
+formulated to counteract the imbalance in the distribution of curb point clouds
+relative to other categories. Our extensive experimentation on two major
+datasets has yielded results that surpass existing benchmarks set by leading
+curb detection and point cloud segmentation models. By integrating
+multi-clustering and curve fitting techniques in our post-processing stage, we
+have substantially reduced noise in curb detection, thereby enhancing precision
+to 0.8744. Notably, CurbNet has achieved exceptional average metrics of over
+0.95 at a tolerance of just 0.15m, thereby establishing a new benchmark.
+Furthermore, corroborative real-world experiments and dataset analyses mutually
+validate each other, solidifying CurbNet's superior detection proficiency and
+its robust generalizability.
+
+
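+ The abstract names a multi-scale and channel attention (MSCA) module but does
+not spell out its design; the sketch below is one generic reading of the idea,
+combining multi-scale pooling with squeeze-and-excitation style channel
+reweighting, and should not be taken as CurbNet's exact module.
+
+import torch
+import torch.nn as nn
+
+class MultiScaleChannelAttention(nn.Module):
+    # Illustrative multi-scale channel attention over a (B, C, H, W) feature map:
+    # pool at several scales, build a joint descriptor, and reweight channels.
+    def __init__(self, channels, scales=(1, 2, 4), reduction=4):
+        super().__init__()
+        self.scales = scales
+        self.mlp = nn.Sequential(
+            nn.Linear(channels * len(scales), channels // reduction), nn.ReLU(),
+            nn.Linear(channels // reduction, channels), nn.Sigmoid())
+
+    def forward(self, x):
+        descriptors = []
+        for s in self.scales:
+            pooled = nn.functional.adaptive_avg_pool2d(x, s)   # (B, C, s, s)
+            descriptors.append(pooled.mean(dim=(2, 3)))        # (B, C) per scale
+        weights = self.mlp(torch.cat(descriptors, dim=1))      # (B, C) channel gates
+        return x * weights[:, :, None, None]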
+
+
+
+
+ + ☆ HPL-ESS: Hybrid Pseudo-Labeling for Unsupervised Event-based Semantic + Segmentation + + +
+ Event-based semantic segmentation has gained popularity due to its capability +to deal with scenarios under high-speed motion and extreme lighting conditions, +which cannot be addressed by conventional RGB cameras. Since it is hard to +annotate event data, previous approaches rely on event-to-image reconstruction +to obtain pseudo labels for training. However, this will inevitably introduce +noise, and learning from noisy pseudo labels, especially when generated from a +single source, may reinforce the errors. This drawback is also called +confirmation bias in pseudo-labeling. In this paper, we propose a novel hybrid +pseudo-labeling framework for unsupervised event-based semantic segmentation, +HPL-ESS, to alleviate the influence of noisy pseudo labels. In particular, we +first employ a plain unsupervised domain adaptation framework as our baseline, +which can generate a set of pseudo labels through self-training. Then, we +incorporate offline event-to-image reconstruction into the framework, and +obtain another set of pseudo labels by predicting segmentation maps on the +reconstructed images. A noisy label learning strategy is designed to mix the +two sets of pseudo labels and enhance the quality. Moreover, we propose a soft +prototypical alignment module to further improve the consistency of target +domain features. Extensive experiments show that our proposed method +outperforms existing state-of-the-art methods by a large margin on the +DSEC-Semantic dataset (+5.88% accuracy, +10.32% mIoU), which even surpasses +several supervised methods. + +
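+ The hybrid pseudo-labeling step can be pictured as a per-pixel fusion of the
+two label sources. The sketch below keeps pixels where the two branches agree
+or where one branch is confident, and ignores the rest; the threshold and the
+exact mixing rule are assumptions, not the paper's noisy-label strategy.
+
+import torch
+
+def mix_pseudo_labels(p_uda, p_recon, conf_threshold=0.9, ignore_index=255):
+    # p_uda, p_recon: (B, K, H, W) softmax maps from the self-training branch
+    # and the event-to-image reconstruction branch.
+    conf_a, lab_a = p_uda.max(dim=1)
+    conf_b, lab_b = p_recon.max(dim=1)
+    # Prefer the more confident source per pixel.
+    label = torch.where(conf_a >= conf_b, lab_a, lab_b)
+    conf = torch.maximum(conf_a, conf_b)
+    agree = lab_a == lab_b
+    keep = (conf > conf_threshold) | agree
+    # Pixels that are neither confident nor consistent are ignored in training.
+    return torch.where(keep, label, torch.full_like(label, ignore_index))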
+
+
+
+
+ + ☆ The Anatomy of Adversarial Attacks: Concept-based XAI Dissection + + +
+ Adversarial attacks (AAs) pose a significant threat to the reliability and +robustness of deep neural networks. While the impact of these attacks on model +predictions has been extensively studied, their effect on the learned +representations and concepts within these models remains largely unexplored. In +this work, we perform an in-depth analysis of the influence of AAs on the +concepts learned by convolutional neural networks (CNNs) using eXplainable +artificial intelligence (XAI) techniques. Through an extensive set of +experiments across various network architectures and targeted AA techniques, we +unveil several key findings. First, AAs induce substantial alterations in the +concept composition within the feature space, introducing new concepts or +modifying existing ones. Second, the adversarial perturbation itself can be +linearly decomposed into a set of latent vector components, with a subset of +these being responsible for the attack's success. Notably, we discover that +these components are target-specific, i.e., are similar for a given target +class throughout different AA techniques and starting classes. Our findings +provide valuable insights into the nature of AAs and their impact on learned +representations, paving the way for the development of more robust and +interpretable deep learning models, as well as effective defenses against +adversarial threats. + +
+
+
+
+
+ + ☆ Diff-Def: Diffusion-Generated Deformation Fields for Conditional Atlases + + +
+ Anatomical atlases are widely used for population analysis. Conditional +atlases target a particular sub-population defined via certain conditions (e.g. +demographics or pathologies) and allow for the investigation of fine-grained +anatomical differences - such as morphological changes correlated with age. +Existing approaches use either registration-based methods that are unable to +handle large anatomical variations or generative models, which can suffer from +training instabilities and hallucinations. To overcome these limitations, we +use latent diffusion models to generate deformation fields, which transform a +general population atlas into one representing a specific sub-population. By +generating a deformation field and registering the conditional atlas to a +neighbourhood of images, we ensure structural plausibility and avoid +hallucinations, which can occur during direct image synthesis. We compare our +method to several state-of-the-art atlas generation methods in experiments +using 5000 brain as well as whole-body MR images from UK Biobank. Our method +generates highly realistic atlases with smooth transformations and high +anatomical fidelity, outperforming the baselines. + +
+
+
+
+
+ + ☆ Creating a Digital Twin of Spinal Surgery: A Proof of Concept + + +
+ Surgery digitalization is the process of creating a virtual replica of +real-world surgery, also referred to as a surgical digital twin (SDT). It has +significant applications in various fields such as education and training, +surgical planning, and automation of surgical tasks. Given their detailed +representations of surgical procedures, SDTs are an ideal foundation for +machine learning methods, enabling automatic generation of training data. In +robotic surgery, SDTs can provide realistic virtual environments in which +robots may learn through trial and error. In this paper, we present a proof of +concept (PoC) for surgery digitalization that is applied to an ex-vivo spinal +surgery performed in realistic conditions. The proposed digitalization focuses +on the acquisition and modelling of the geometry and appearance of the entire +surgical scene. We employ five RGB-D cameras for dynamic 3D reconstruction of +the surgeon, a high-end camera for 3D reconstruction of the anatomy, an +infrared stereo camera for surgical instrument tracking, and a laser scanner +for 3D reconstruction of the operating room and data fusion. We justify the +proposed methodology, discuss the challenges faced and further extensions of +our prototype. While our PoC partially relies on manual data curation, its high +quality and great potential motivate the development of automated methods for +the creation of SDTs. The quality of our SDT can be assessed in a rendered +video available at https://youtu.be/LqVaWGgaTMY . + +
+
+
+
+
+ + ☆ DPStyler: Dynamic PromptStyler for Source-Free Domain Generalization + + +
+ Source-Free Domain Generalization (SFDG) aims to develop a model that works
+for unseen target domains without relying on any source domain. Recent work,
+PromptStyler, employs text prompts to simulate different distribution shifts in
+the joint vision-language space, allowing the model to generalize effectively
+to unseen domains without using any images. However, 1) PromptStyler's style
+generation strategy has limitations, as all style patterns are fixed after the
+first training phase. This leads to the training set in the second training
+phase being restricted to a limited set of styles. Additionally, 2) the frozen
+text encoder in PromptStyler results in the encoder's output varying with the
+style of the input text prompts, making it difficult for the model to learn
+domain-invariant features. In this paper, we introduce Dynamic PromptStyler
+(DPStyler), comprising Style Generation and Style Removal modules to address
+these issues. The Style Generation module refreshes all styles at every
+training epoch, while the Style Removal module eliminates variations in the
+encoder's output features caused by input styles. Moreover, since the Style
+Generation module, responsible for generating style word vectors using random
+sampling or style mixing, makes the model sensitive to input text prompts, we
+introduce a model ensemble method to mitigate this sensitivity. Extensive
+experiments demonstrate that our framework outperforms state-of-the-art methods
+on benchmark datasets.
+
+
+
+
+
+
+ + ☆ Assessing the Performance of Deep Learning for Automated Gleason Grading + in Prostate Cancer + + +
+ Prostate cancer is a dominant health concern calling for advanced diagnostic
+tools. Utilizing digital pathology and artificial intelligence, this study
+explores the potential of 11 deep neural network architectures for automated
+Gleason grading in prostate carcinoma, focusing on comparing traditional and
+recent architectures. A standardized image classification pipeline, based on
+the AUCMEDI framework, facilitated robust evaluation using an in-house dataset
+consisting of 34,264 annotated tissue tiles. The results indicated varying
+sensitivity across architectures, with ConvNeXt demonstrating the strongest
+performance. Notably, newer architectures achieved superior performance, albeit
+with challenges in differentiating closely related Gleason grades. The ConvNeXt
+model was capable of learning a balance between complexity and
+generalizability. Overall, this study lays the groundwork for enhanced Gleason
+grading systems, potentially improving diagnostic efficiency for prostate
+cancer.
+
+
+
+
+
+
+ + ☆ Synapse: Learning Preferential Concepts from Visual Demonstrations + + +
+ This paper addresses the problem of preference learning, which aims to learn +user-specific preferences (e.g., "good parking spot", "convenient drop-off +location") from visual input. Despite its similarity to learning factual +concepts (e.g., "red cube"), preference learning is a fundamentally harder +problem due to its subjective nature and the paucity of person-specific +training data. We address this problem using a new framework called Synapse, +which is a neuro-symbolic approach designed to efficiently learn preferential +concepts from limited demonstrations. Synapse represents preferences as +neuro-symbolic programs in a domain-specific language (DSL) that operates over +images, and leverages a novel combination of visual parsing, large language +models, and program synthesis to learn programs representing individual +preferences. We evaluate Synapse through extensive experimentation including a +user case study focusing on mobility-related concepts in mobile robotics and +autonomous driving. Our evaluation demonstrates that Synapse significantly +outperforms existing baselines as well as its own ablations. The code and other +details can be found on the project website https://amrl.cs.utexas.edu/synapse . + +
+
+ comment: 23 pages, 7 figures; Preprint +
+
+
+
+
+ + ☆ DeepGleason: a System for Automated Gleason Grading of Prostate Cancer + using Deep Neural Networks + + +
+ Advances in digital pathology and artificial intelligence (AI) offer
+promising opportunities for clinical decision support and enhancing diagnostic
+workflows. Previous studies already demonstrated AI's potential for automated
+Gleason grading, but lack state-of-the-art methodology and model reusability.
+To address this issue, we propose DeepGleason: an open-source deep neural
+network based image classification system for automated Gleason grading using
+whole-slide histopathology images from prostate tissue sections. Implemented
+with the standardized AUCMEDI framework, our tool employs a tile-wise
+classification approach utilizing fine-tuned image preprocessing techniques in
+combination with a ConvNeXt architecture which was compared to various
+state-of-the-art architectures. The neural network model was trained and
+validated on an in-house dataset of 34,264 annotated tiles from 369 prostate
+carcinoma slides. We demonstrated that DeepGleason is capable of highly
+accurate and reliable Gleason grading with a macro-averaged F1-score of 0.806,
+AUC of 0.991, and Accuracy of 0.974. The internal architecture comparison
+revealed that, on our dataset, the ConvNeXt model outperformed both established
+and other modern architectures such as transformers. Furthermore, we were able
+to outperform the current state-of-the-art in tile-wise fine-classification
+with a sensitivity and specificity of 0.94 and 0.98 for benign vs malignant
+detection as well as of 0.91 and 0.75 for Gleason 3 vs Gleason 4 & 5
+classification, respectively. Our tool contributes to the wider adoption of
+AI-based Gleason grading within the research community and paves the way for
+broader clinical application of deep learning models in digital pathology.
+DeepGleason is open-source and publicly available for research application in
+the following Git repository: https://github.com/frankkramer-lab/DeepGleason.
+
+
+
+
+
+
+ + ☆ FOOL: Addressing the Downlink Bottleneck in Satellite Computing with + Neural Feature Compression + + +
+ Nanosatellite constellations equipped with sensors capturing large geographic +regions provide unprecedented opportunities for Earth observation. As +constellation sizes increase, network contention poses a downlink bottleneck. +Orbital Edge Computing (OEC) leverages limited onboard compute resources to +reduce transfer costs by processing the raw captures at the source. However, +current solutions have limited practicability due to reliance on crude +filtering methods or over-prioritizing particular downstream tasks. + This work presents FOOL, an OEC-native and task-agnostic feature compression +method that preserves prediction performance. FOOL partitions high-resolution +satellite imagery to maximize throughput. Further, it embeds context and +leverages inter-tile dependencies to lower transfer costs with negligible +overhead. While FOOL is a feature compressor, it can recover images with +competitive scores on perceptual quality measures at lower bitrates. We +extensively evaluate transfer cost reduction by including the peculiarity of +intermittently available network connections in low earth orbit. Lastly, we +test the feasibility of our system for standardized nanosatellite form factors. +We demonstrate that FOOL permits downlinking over 100x the data volume without +relying on prior information on the downstream tasks. + +
+
+ comment: 18 pages, double column, 19 figures, 7 tables, Initial Submission to + IEEE Transactions on Mobile Computing +
+
+
+
+
+ + ☆ Domain Adaptive Detection of MAVs: A Benchmark and Noise Suppression + Network + + +
+ Visual detection of Micro Air Vehicles (MAVs) has attracted increasing +attention in recent years due to its important application in various tasks. +The existing methods for MAV detection assume that the training set and testing +set have the same distribution. As a result, when deployed in new domains, the +detectors would have a significant performance degradation due to domain +discrepancy. In this paper, we study the problem of cross-domain MAV detection. +The contributions of this paper are threefold. 1) We propose a +Multi-MAV-Multi-Domain (M3D) dataset consisting of both simulation and +realistic images. Compared to other existing datasets, the proposed one is more +comprehensive in the sense that it covers rich scenes, diverse MAV types, and +various viewing angles. A new benchmark for cross-domain MAV detection is +proposed based on the proposed dataset. 2) We propose a Noise Suppression +Network (NSN) based on the framework of pseudo-labeling and a large-to-small +training procedure. To reduce the challenging pseudo-label noises, two novel +modules are designed in this network. The first is a prior-based curriculum +learning module for allocating adaptive thresholds for pseudo labels with +different difficulties. The second is a masked copy-paste augmentation module +for pasting truly-labeled MAVs on unlabeled target images and thus decreasing +pseudo-label noises. 3) Extensive experimental results verify the superior +performance of the proposed method compared to the state-of-the-art ones. In +particular, it achieves mAP of 46.9%(+5.8%), 50.5%(+3.7%), and 61.5%(+11.3%) on +the tasks of simulation-to-real adaptation, cross-scene adaptation, and +cross-camera adaptation, respectively. + +
+
+ comment: 17 pages, 11 figures. Accepted by IEEE Transactions on Automation + Science and Engineering +
+
+
+
+
+ + ☆ Clustering Propagation for Universal Medical Image Segmentation CVPR2024 + + +
+ Prominent solutions for medical image segmentation are typically tailored for
+automatic or interactive setups, making it difficult to transfer progress
+achieved in one task to the other. This also necessitates separate models for
+each task, duplicating both training time and parameters. To address the above
+issues, we introduce S2VNet, a universal framework that leverages
+Slice-to-Volume propagation to unify automatic/interactive segmentation within
+a single model and one training session. Inspired by clustering-based
+segmentation techniques, S2VNet makes full use of the slice-wise structure of
+volumetric data by initializing cluster centers from the cluster results of the
+previous slice. This enables knowledge acquired from prior slices to assist in
+the segmentation of the current slice, further efficiently bridging the
+communication between remote slices using mere 2D networks. Moreover, such a
+framework readily accommodates interactive segmentation with no architectural
+change, simply by initializing centroids from user inputs. S2VNet distinguishes
+itself by swift inference speeds and reduced memory consumption compared to
+prevailing 3D solutions. It can also handle multi-class interactions with each
+of them serving to initialize different centroids. Experiments on three
+benchmarks demonstrate S2VNet surpasses task-specified solutions on both
+automatic/interactive setups.
+
+
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Self-Adaptive Reality-Guided Diffusion for Artifact-Free + Super-Resolution + + +
+ Artifact-free super-resolution (SR) aims to translate low-resolution images +into their high-resolution counterparts with a strict integrity of the original +content, eliminating any distortions or synthetic details. While traditional +diffusion-based SR techniques have demonstrated remarkable abilities to enhance +image detail, they are prone to artifact introduction during iterative +procedures. Such artifacts, ranging from trivial noise to unauthentic textures, +deviate from the true structure of the source image, thus challenging the +integrity of the super-resolution process. In this work, we propose +Self-Adaptive Reality-Guided Diffusion (SARGD), a training-free method that +delves into the latent space to effectively identify and mitigate the +propagation of artifacts. Our SARGD begins by using an artifact detector to +identify implausible pixels, creating a binary mask that highlights artifacts. +Following this, the Reality Guidance Refinement (RGR) process refines artifacts +by integrating this mask with realistic latent representations, improving +alignment with the original image. Nonetheless, initial realistic-latent +representations from lower-quality images result in over-smoothing in the final +output. To address this, we introduce a Self-Adaptive Guidance (SAG) mechanism. +It dynamically computes a reality score, enhancing the sharpness of the +realistic latent. These alternating mechanisms collectively achieve +artifact-free super-resolution. Extensive experiments demonstrate the +superiority of our method, delivering detailed artifact-free high-resolution +images while reducing sampling steps by 2X. We release our code at +https://github.com/ProAirVerse/Self-Adaptive-Guidance-Diffusion.git. + +
+
+
+
+
+ + ☆ Multi-Scale Texture Loss for CT denoising with GANs + + +
+ Generative Adversarial Networks (GANs) have proved to be a powerful framework
+for denoising applications in medical imaging. However, GAN-based denoising
+algorithms still suffer from limitations in capturing complex relationships
+within the images. In this regard, the loss function plays a crucial role in
+guiding the image generation process, encompassing how much a synthetic image
+differs from a real image. To grasp highly complex and non-linear textural
+relationships in the training process, this work presents a loss function that
+leverages the intrinsic multi-scale nature of the Gray-Level-Co-occurrence
+Matrix (GLCM). Although the recent advances in deep learning have demonstrated
+superior performance in classification and detection tasks, we hypothesize that
+the GLCM's information content can be valuable when integrated into GANs'
+training. To this end, we propose a differentiable implementation of the GLCM
+suited for gradient-based optimization. Our approach also introduces a
+self-attention layer that dynamically aggregates the multi-scale texture
+information extracted from the images. We validate our approach by carrying out
+extensive experiments in the context of low-dose CT denoising, a challenging
+application that aims to enhance the quality of noisy CT scans. We utilize
+three publicly available datasets, including one simulated and two real
+datasets. The results are promising as compared to other well-established loss
+functions, being also consistent across three different GAN architectures. The
+code is available at: https://github.com/FrancescoDiFeola/DenoTextureLoss
+
+
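+ A differentiable GLCM can be approximated by soft-quantizing gray levels and
+accumulating co-occurrences of neighboring pixels. The sketch below is a
+simplified stand-in for such a texture loss (horizontal offset only, fixed
+binning); it illustrates the idea rather than the paper's implementation.
+
+import torch
+import torch.nn.functional as F
+
+def soft_glcm(img, n_levels=8, sigma=0.05):
+    # img: (B, 1, H, W) intensities in [0, 1]. Soft-assign each pixel to gray
+    # levels, then accumulate co-occurrences of horizontally adjacent pixels.
+    centers = torch.linspace(0, 1, n_levels, device=img.device).view(1, -1, 1, 1)
+    assign = torch.softmax(-(img - centers) ** 2 / (2 * sigma ** 2), dim=1)
+    left, right = assign[..., :-1], assign[..., 1:]          # neighboring columns
+    glcm = torch.einsum('bihw,bjhw->bij', left, right)       # (B, L, L)
+    return glcm / glcm.sum(dim=(1, 2), keepdim=True)
+
+def texture_loss(fake, real, scales=(1, 2, 4)):
+    # Multi-scale texture discrepancy between denoised and reference CT slices.
+    loss = 0.0
+    for s in scales:
+        f = F.avg_pool2d(fake, s) if s > 1 else fake
+        r = F.avg_pool2d(real, s) if s > 1 else real
+        loss = loss + (soft_glcm(f) - soft_glcm(r)).abs().mean()
+    return loss / len(scales)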
+
+
+
+
+ + ☆ AI-Generated Video Detection via Spatio-Temporal Anomaly Learning + + +
+ The advancement of generative models has led to the emergence of highly
+realistic artificial intelligence (AI)-generated videos. Malicious users can
+easily create non-existent videos to spread false information. This letter
+proposes an effective AI-generated video detection (AIGVDet) scheme by
+capturing the forensic traces with a two-branch spatio-temporal convolutional
+neural network (CNN). Specifically, two ResNet sub-detectors are learned
+separately for identifying the anomalies in the spatial and optical-flow
+domains, respectively. Results of such sub-detectors are fused to further
+enhance the discrimination ability. A large-scale generated video dataset (GVD)
+is constructed as a benchmark for model training and evaluation. Extensive
+experimental results verify the high generalization and robustness of our
+AIGVDet scheme. Code and dataset will be available at
+https://github.com/multimediaFor/AIGVDet.
+
+
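+ The two-branch design can be summarized as two frame-level classifiers whose
+scores are fused into a video-level decision. The sketch below averages the
+per-frame probabilities of an RGB branch and an optical-flow branch; the
+backbone choice and the fusion rule are assumptions for illustration.
+
+import torch
+import torch.nn as nn
+import torchvision
+
+class TwoBranchVideoDetector(nn.Module):
+    # Schematic detector: one ResNet scores RGB frames, another scores optical-flow
+    # visualizations; probabilities are averaged into a real/generated decision.
+    def __init__(self):
+        super().__init__()
+        self.spatial = torchvision.models.resnet50(num_classes=1)
+        self.flow = torchvision.models.resnet50(num_classes=1)
+
+    def forward(self, frames, flows):
+        # frames: (T, 3, H, W) RGB frames; flows: (T, 3, H, W) flow renderings.
+        p_spatial = torch.sigmoid(self.spatial(frames)).mean()
+        p_flow = torch.sigmoid(self.flow(flows)).mean()
+        return 0.5 * (p_spatial + p_flow)   # fused probability of being AI-generated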
+
+
+
+
+ + ☆ V2X-PC: Vehicle-to-everything Collaborative Perception via Point Cluster + + +
+ The objective of the collaborative vehicle-to-everything perception task is
+to enhance the individual vehicle's perception capability through message
+communication among neighboring traffic agents. Previous methods focus on
+achieving optimal performance within bandwidth limitations and typically adopt
+BEV maps as the basic collaborative message units. However, we demonstrate that
+collaboration with dense representations is plagued by object feature
+destruction during message packing, inefficient message aggregation for
+long-range collaboration, and implicit structure representation communication.
+To tackle these issues, we introduce a brand new message unit, namely the point
+cluster, designed to represent the scene sparsely with a combination of
+low-level structure information and high-level semantic information. The point
+cluster inherently preserves object information while packing messages, with
+weak relevance to the collaboration range, and supports explicit structure
+modeling. Building upon this representation, we propose a novel framework
+V2X-PC for collaborative perception. This framework includes a Point Cluster
+Packing (PCP) module to preserve object features and manage bandwidth through
+the manipulation of cluster point numbers. As for effective message
+aggregation, we propose a Point Cluster Aggregation (PCA) module to match and
+merge point clusters associated with the same object. To further handle time
+latency and pose errors encountered in real-world scenarios, we propose
+parameter-free solutions that can adapt to different noise levels without
+finetuning. Experiments on two widely recognized collaborative perception
+benchmarks showcase the superior performance of our method compared to the
+previous state-of-the-art approaches relying on BEV maps.
+
+
+
+
+
+
+ + ☆ SDXS: Real-Time One-Step Latent Diffusion Models with Image Conditions + + +
+ Recent advancements in diffusion models have positioned them at the forefront
+of image generation. Despite their superior performance, diffusion models are
+not without drawbacks; they are characterized by complex architectures and
+substantial computational demands, resulting in significant latency due to
+their iterative sampling process. To mitigate these limitations, we introduce a
+dual approach involving model miniaturization and a reduction in sampling
+steps, aimed at significantly decreasing model latency. Our methodology
+leverages knowledge distillation to streamline the U-Net and image decoder
+architectures, and introduces an innovative one-step DM training technique that
+utilizes feature matching and score distillation. We present two models,
+SDXS-512 and SDXS-1024, achieving inference speeds of approximately 100 FPS
+(30x faster than SD v1.5) and 30 FPS (60x faster than SDXL) on a single GPU,
+respectively. Moreover, our training approach offers promising applications in
+image-conditioned control, facilitating efficient image-to-image translation.
+
+
+
+
+
+
+ + ☆ Calibrating Bayesian UNet++ for Sub-Seasonal Forecasting ICLR 2024 + + +
+ Seasonal forecasting is a crucial task when it comes to detecting the extreme
+heat and cold events that occur due to climate change. Confidence in the
+predictions should be reliable since a small increase in the temperatures in a
+year has a big impact on the world. Calibration of the neural networks provides
+a way to ensure our confidence in the predictions. However, calibrating
+regression models is an under-researched topic, especially for forecasting
+models. We calibrate a UNet++ based architecture, which was shown to outperform
+physics-based models in predicting temperature anomalies. We show that with a
+slight trade-off between prediction error and calibration error, it is possible
+to get more reliable and sharper forecasts. We believe that calibration should
+be an important part of safety-critical machine learning applications such as
+weather forecasting.
+
+
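+ For regression forecasts, calibration can be checked by comparing nominal and
+empirical coverage of predictive intervals. The helper below assumes Gaussian
+predictive means and standard deviations (e.g., from a Bayesian or ensembled
+UNet++) and is a generic diagnostic, not the paper's exact metric.
+
+import numpy as np
+from scipy.stats import norm
+
+def interval_calibration_error(mean, std, target, levels=np.linspace(0.1, 0.9, 9)):
+    # For each nominal level p, the fraction of targets inside the central
+    # p-interval of N(mean, std^2) should be close to p; return the mean gap.
+    gaps = []
+    for p in levels:
+        z = norm.ppf(0.5 + p / 2.0)                  # half-width in std units
+        inside = np.abs(target - mean) <= z * std
+        gaps.append(abs(inside.mean() - p))
+    return float(np.mean(gaps))
+
+# Example usage with flattened per-pixel predictions (illustrative arrays):
+# err = interval_calibration_error(pred_mean.ravel(), pred_std.ravel(), obs.ravel())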
+
+ comment: Accepted as a workshop paper at "ICLR 2024 Tackling Climate Change + with Machine Learning" +
+
+
+
+
+ + ☆ Enhancing Industrial Transfer Learning with Style Filter: Cost Reduction + and Defect-Focus + + +
+ Addressing the challenge of data scarcity in industrial domains, transfer
+learning emerges as a pivotal paradigm. This work introduces Style Filter, a
+tailored methodology for industrial contexts. By selectively filtering source
+domain data before knowledge transfer, Style Filter reduces the quantity of
+data while maintaining or even enhancing the performance of the transfer
+learning strategy. Offering label-free operation, minimal reliance on prior
+knowledge, independence from specific models, and re-utilization, Style Filter
+is evaluated on authentic industrial datasets, highlighting its effectiveness
+when employed before conventional transfer strategies in the deep learning
+domain. The results underscore the effectiveness of Style Filter in real-world
+industrial applications.
+
+
+
+ comment: 17 pages, 11 figures, 4 tables
+
+
+
+
+
+ + ☆ SatSynth: Augmenting Image-Mask Pairs through Diffusion Models for + Aerial Semantic Segmentation CVPR2024 + + +
+ In recent years, semantic segmentation has become a pivotal tool in +processing and interpreting satellite imagery. Yet, a prevalent limitation of +supervised learning techniques remains the need for extensive manual +annotations by experts. In this work, we explore the potential of generative +image diffusion to address the scarcity of annotated data in earth observation +tasks. The main idea is to learn the joint data manifold of images and labels, +leveraging recent advancements in denoising diffusion probabilistic models. To +the best of our knowledge, we are the first to generate both images and +corresponding masks for satellite segmentation. We find that the obtained pairs +not only display high quality in fine-scale features but also ensure a wide +sampling diversity. Both aspects are crucial for earth observation data, where +semantic classes can vary severely in scale and occurrence frequency. We employ +the novel data instances for downstream segmentation, as a form of data +augmentation. In our experiments, we provide comparisons to prior works based +on discriminative diffusion models or GANs. We demonstrate that integrating +generated samples yields significant quantitative improvements for satellite +semantic segmentation -- both compared to baselines and when training only on +the original data. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ EDUE: Expert Disagreement-Guided One-Pass Uncertainty Estimation for + Medical Image Segmentation + + +
+ Deploying deep learning (DL) models in medical applications relies on +predictive performance and other critical factors, such as conveying +trustworthy predictive uncertainty. Uncertainty estimation (UE) methods provide +potential solutions for evaluating prediction reliability and improving the +model confidence calibration. Despite increasing interest in UE, challenges +persist, such as the need for explicit methods to capture aleatoric uncertainty +and align uncertainty estimates with real-life disagreements among domain +experts. This paper proposes an Expert Disagreement-Guided Uncertainty +Estimation (EDUE) for medical image segmentation. By leveraging variability in +ground-truth annotations from multiple raters, we guide the model during +training and incorporate random sampling-based strategies to enhance +calibration confidence. Our method achieves 55% and 23% improvement in +correlation on average with expert disagreements at the image and pixel levels, +respectively, better calibration, and competitive segmentation performance +compared to the state-of-the-art deep ensembles, requiring only a single +forward pass. + +
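+ One way to read the training signal described above: a randomly sampled rater
+mask supervises the segmentation head, while pixel-wise inter-rater variance
+supervises an uncertainty head in the same forward pass. The loss below is an
+illustrative sketch; the head names, normalization, and weighting are
+assumptions rather than the paper's exact formulation.
+
+import torch
+import torch.nn.functional as F
+
+def disagreement_guided_loss(seg_logits, unc_logits, rater_masks, alpha=0.5):
+    # seg_logits, unc_logits: (B, 1, H, W); rater_masks: (B, R, H, W) binary masks.
+    idx = torch.randint(rater_masks.shape[1], (1,)).item()   # random rater per batch
+    seg_loss = F.binary_cross_entropy_with_logits(
+        seg_logits, rater_masks[:, idx:idx + 1].float())
+    # Inter-rater variance as a proxy target for aleatoric uncertainty.
+    disagreement = rater_masks.float().var(dim=1, keepdim=True)
+    disagreement = disagreement / disagreement.amax().clamp(min=1e-6)
+    unc_loss = F.mse_loss(torch.sigmoid(unc_logits), disagreement)
+    return seg_loss + alpha * unc_loss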
+
+
+
+
+ + ☆ In the Search for Optimal Multi-view Learning Models for Crop + Classification with Global Remote Sensing Data + + +
+ Crop classification is of critical importance due to its role in studying +crop pattern changes, resource management, and carbon sequestration. When +employing data-driven techniques for its prediction, utilizing various temporal +data sources is necessary. Deep learning models have proven to be effective for +this task by mapping time series data to high-level representation for +prediction. However, they face substantial challenges when dealing with +multiple input patterns. The literature offers limited guidance for Multi-View +Learning (MVL) scenarios, as it has primarily focused on exploring fusion +strategies with specific encoders and validating them in local regions. In +contrast, we investigate the impact of simultaneous selection of the fusion +strategy and the encoder architecture evaluated on a global-scale cropland and +crop-type classifications. We use a range of five fusion strategies (Input, +Feature, Decision, Ensemble, Hybrid) and five temporal encoder architectures +(LSTM, GRU, TempCNN, TAE, L-TAE) as possible MVL model configurations. The +validation is on the CropHarvest dataset that provides optical, radar, and +weather time series, and topographic information as input data. We found that +in scenarios with a limited number of labeled samples, a unique configuration +is insufficient for all the cases. Instead, a specialized combination, +including encoder and fusion strategy, should be meticulously sought. To +streamline this search process, we suggest initially identifying the optimal +encoder architecture tailored for a particular fusion strategy, and then +determining the most suitable fusion strategy for the classification task. We +provide a technical framework for researchers exploring crop classification or +related tasks through a MVL approach. + +
+
+ comment: submitted to journal +
+
+
+
+
+ + ☆ SegICL: A Universal In-context Learning Framework for Enhanced + Segmentation in Medical Imaging + + +
+ Medical image segmentation models adapting to new tasks in a training-free
+manner through in-context learning is an exciting advancement. Universal
+segmentation models aim to generalize across the diverse modalities of medical
+images, yet their effectiveness often diminishes when applied to
+out-of-distribution (OOD) data modalities and tasks, requiring intricate
+fine-tuning of the model for optimal performance. To address this challenge, we
+introduce SegICL, a novel approach leveraging In-Context Learning (ICL) for
+image segmentation. Unlike existing methods, SegICL has the capability to
+employ text-guided segmentation and conduct in-context learning with a small
+set of image-mask pairs, eliminating the need for training the model from
+scratch or fine-tuning for OOD tasks (including OOD modality and dataset).
+Extensive experimental validation of SegICL demonstrates a positive correlation
+between the number of prompt samples and segmentation performance on OOD
+modalities and tasks. This indicates that SegICL effectively addresses new
+segmentation tasks based on contextual information. Additionally, SegICL also
+exhibits comparable segmentation performance to mainstream models on OOD and
+in-distribution tasks. Our code will be released soon.
+
+
+
+
+
+
+ + ☆ Revealing Vulnerabilities of Neural Networks in Parameter Learning and + Defense Against Explanation-Aware Backdoors + + +
+ Explainable Artificial Intelligence (XAI) strategies play a crucial part in
+increasing the understanding and trustworthiness of neural networks.
+Nonetheless, these techniques could potentially generate misleading
+explanations. Blinding attacks can drastically alter a machine learning
+algorithm's prediction and explanation, providing misleading information by
+adding visually unnoticeable artifacts into the input, while maintaining the
+model's accuracy. This poses a serious challenge to the reliability of XAI
+methods. To address it, we leverage statistical analysis to highlight the
+changes in CNN weights following blinding attacks. We introduce a method
+specifically designed to limit the effectiveness of such attacks during the
+evaluation phase, avoiding the need for extra training. The method we suggest
+defends against most modern explanation-aware adversarial attacks, achieving a
+decrease of approximately 99% in the Attack Success Rate (ASR) and a reduction
+of approximately 91% in the Mean Square Error (MSE) between the original
+explanation and the defended (post-attack) explanation across three unique
+types of attacks.
+
+
+
+
+
+
+ + ☆ Elysium: Exploring Object-level Perception in Videos via MLLM + + +
+ Multi-modal Large Language Models (MLLMs) have demonstrated their ability to +perceive objects in still images, but their application in video-related tasks, +such as object tracking, remains understudied. This lack of exploration is +primarily due to two key challenges. Firstly, extensive pretraining on +large-scale video datasets is required to equip MLLMs with the capability to +perceive objects across multiple frames and understand inter-frame +relationships. Secondly, processing a large number of frames within the context +window of Large Language Models (LLMs) can impose a significant computational +burden. To address the first challenge, we introduce ElysiumTrack-1M, a +large-scale video dataset paired with novel tasks: Referring Single Object +Tracking (RSOT) and Video Referring Expression Generation (Video-REG). +ElysiumTrack-1M contains 1.27 million annotated video frames with corresponding +object boxes and descriptions. Leveraging this dataset, we conduct training of +MLLMs and propose a token-compression model T-Selector to tackle the second +challenge. Our proposed approach, Elysium: Exploring Object-level Perception in +Videos via MLLM, is an end-to-end trainable MLLM that makes the first attempt +to conduct object-level tasks in videos without requiring any additional +plug-in or expert models. + +
+
+
+
+
+ + ☆ QKFormer: Hierarchical Spiking Transformer using Q-K Attention + + +
+ Spiking Transformers, which integrate Spiking Neural Networks (SNNs) with
+Transformer architectures, have attracted significant attention due to their
+potential for energy efficiency and high performance. However, existing models
+in this domain still suffer from suboptimal performance. We introduce several
+innovations to improve the performance: i) We propose a novel spike-form Q-K
+attention mechanism, tailored for SNNs, which efficiently models the importance
+of token or channel dimensions through binary vectors with linear complexity.
+ii) We incorporate the hierarchical structure, which significantly benefits the
+performance of both the brain and artificial neural networks, into spiking
+transformers to obtain multi-scale spiking representation. iii) We design a
+versatile and powerful patch embedding module with a deformed shortcut
+specifically for spiking transformers. Together, we develop QKFormer, a
+hierarchical spiking transformer based on Q-K attention with direct training.
+QKFormer shows significantly superior performance over existing
+state-of-the-art SNN models on various mainstream datasets. Notably, with
+comparable size to Spikformer (66.34 M, 74.81%), QKFormer (64.96 M) achieves a
+groundbreaking top-1 accuracy of 85.65% on ImageNet-1k, substantially
+outperforming Spikformer by 10.84%. To the best of our knowledge, this is the
+first time that directly trained SNNs have exceeded 85% accuracy on
+ImageNet-1K. The code and models are publicly available at
+https://github.com/zhouchenlin2096/QKFormer
+
+
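+ The sketch below is one schematic interpretation of a linear-complexity
+spike-form Q-K token attention: a binary token-importance vector is derived
+from Q by summing spikes over channels and then gates the spiking K tensor. It
+illustrates the linear-complexity idea only and does not reproduce the QKFormer
+block (in practice a surrogate-gradient spiking neuron would replace the hard
+threshold used here).
+
+import torch
+import torch.nn as nn
+
+def spike(x):
+    # Hard threshold used purely for a forward-pass illustration.
+    return (x > 0).float()
+
+class QKTokenAttention(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.q_proj = nn.Linear(dim, dim, bias=False)
+        self.k_proj = nn.Linear(dim, dim, bias=False)
+
+    def forward(self, x):                 # x: (B, N, D) spiking activations
+        q = spike(self.q_proj(x))         # binary spike tensors
+        k = spike(self.k_proj(x))
+        token_score = q.sum(dim=-1, keepdim=True)                    # (B, N, 1), O(N*D)
+        token_mask = spike(token_score - token_score.mean(dim=1, keepdim=True))
+        return token_mask * k             # gated spiking features, still O(N*D)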
+
+ comment: 10 pages, code: https://github.com/zhouchenlin2096/QKFormer +
+
+
+
+
+ + ☆ DOrA: 3D Visual Grounding with Order-Aware Referring + + +
+ 3D visual grounding aims to identify the target object within a 3D point +cloud scene referred to by a natural language description. While previous works +attempt to exploit the verbo-visual relation with proposed cross-modal +transformers, unstructured natural utterances and scattered objects might lead +to undesirable performances. In this paper, we introduce DOrA, a novel 3D +visual grounding framework with Order-Aware referring. DOrA is designed to +leverage Large Language Models (LLMs) to parse language description, suggesting +a referential order of anchor objects. Such ordered anchor objects allow DOrA +to update visual features and locate the target object during the grounding +process. Experimental results on the NR3D and ScanRefer datasets demonstrate +our superiority in both low-resource and full-data scenarios. In particular, +DOrA surpasses current state-of-the-art frameworks by 9.3% and 7.8% grounding +accuracy under 1% data and 10% data settings, respectively. + +
+
+
+
+
+ + ☆ VMRNN: Integrating Vision Mamba and LSTM for Efficient and Accurate + Spatiotemporal Forecasting + + +
+ Combining CNNs or ViTs with RNNs for spatiotemporal forecasting has yielded
+unparalleled results in predicting temporal and spatial dynamics. However,
+modeling extensive global information remains a formidable challenge; CNNs are
+limited by their narrow receptive fields, and ViTs struggle with the intensive
+computational demands of their attention mechanisms. The emergence of recent
+Mamba-based architectures has been met with enthusiasm for their exceptional
+long-sequence modeling capabilities, surpassing established vision models in
+efficiency and accuracy, which motivates us to develop an innovative
+architecture tailored for spatiotemporal forecasting. In this paper, we propose
+the VMRNN cell, a new recurrent unit that integrates the strengths of Vision
+Mamba blocks with LSTM. We construct a network centered on VMRNN cells to
+tackle spatiotemporal prediction tasks effectively. Our extensive evaluations
+show that our proposed approach secures competitive results on a variety of
+tasks while maintaining a smaller model size. Our code is available at
+https://github.com/yyyujintang/VMRNN-PyTorch.
+
+
+
+ comment: 11 pages, 7 figures. arXiv admin note: text overlap with + arXiv:2308.09891 by other authors +
+
+
+
+
+ + ☆ An Intermediate Fusion ViT Enables Efficient Text-Image Alignment in + Diffusion Models + + +
+ Diffusion models have been widely used for conditional data cross-modal
+generation tasks such as text-to-image and text-to-video. However,
+state-of-the-art models still fail to align the generated visual concepts with
+high-level language semantics such as object count, spatial relationships, etc.
+We approach this problem from a multimodal data fusion perspective and
+investigate how different fusion strategies can affect vision-language
+alignment. We discover that compared to the widely used early fusion of
+conditioning text in a pretrained image feature space, a specially designed
+intermediate fusion can: (i) boost text-to-image alignment with improved
+generation quality and (ii) improve training and inference efficiency by
+reducing low-rank text-to-image attention calculations. We perform experiments
+using a text-to-image generation task on the MS-COCO dataset. We compare our
+intermediate fusion mechanism with the classic early fusion mechanism on two
+common conditioning methods on a U-shaped ViT backbone. Our intermediate fusion
+model achieves a higher CLIP Score and lower FID, with 20% reduced FLOPs, and
+50% increased training speed compared to a strong U-ViT baseline with an early
+fusion.
+
+
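+ The distinction between early and intermediate fusion can be illustrated with
+a single transformer block: instead of concatenating text tokens with image
+tokens at the input, text conditioning enters via cross-attention only at
+selected mid-network blocks. The block below is a generic illustration, not the
+paper's U-ViT variant; it assumes text tokens already projected to the image
+token width.
+
+import torch
+import torch.nn as nn
+
+class IntermediateFusionBlock(nn.Module):
+    def __init__(self, dim, heads=8):
+        super().__init__()
+        self.self_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+        self.cross_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+        self.norm1, self.norm2, self.norm3 = (nn.LayerNorm(dim), nn.LayerNorm(dim),
+                                              nn.LayerNorm(dim))
+        self.mlp = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(),
+                                 nn.Linear(4 * dim, dim))
+
+    def forward(self, img_tokens, text_tokens):
+        # Self-attention runs over image tokens only (no text tokens at the input).
+        x = img_tokens
+        h = self.norm1(x)
+        x = x + self.self_attn(h, h, h)[0]
+        # Text conditioning is fused here, mid-network, via cross-attention.
+        x = x + self.cross_attn(self.norm2(x), text_tokens, text_tokens)[0]
+        return x + self.mlp(self.norm3(x))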
+
+
+
+
+ + ☆ Open-Set Recognition in the Age of Vision-Language Models + + +
+ Are vision-language models (VLMs) open-set models because they are trained on +internet-scale datasets? We answer this question with a clear no - VLMs +introduce closed-set assumptions via their finite query set, making them +vulnerable to open-set conditions. We systematically evaluate VLMs for open-set +recognition and find they frequently misclassify objects not contained in their +query set, leading to alarmingly low precision when tuned for high recall and +vice versa. We show that naively increasing the size of the query set to +contain more and more classes does not mitigate this problem, but instead +diminishes both task performance and open-set performance. We establish a +revised definition of the open-set problem for the age of VLMs, define a new +benchmark and evaluation protocol to facilitate standardised evaluation and +research in this important area, and evaluate promising baseline approaches +based on predictive uncertainty and dedicated negative embeddings on a range of +VLM classifiers and object detectors. + +
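+ A toy numerical illustration of the closed-set effect described above (random
+ vectors stand in for real VLM embeddings and the threshold is an invented
+ operating point): picking the argmax over a finite query set always returns
+ some listed class, whereas a simple rejection rule can abstain.
+
+ import torch
+ import torch.nn.functional as F
+
+ torch.manual_seed(0)
+ query_names = ["cat", "dog", "car"]                  # the finite query set
+ text_emb = F.normalize(torch.randn(len(query_names), 512), dim=-1)
+ image_emb = F.normalize(torch.randn(512), dim=-1)    # image of some object not in the set
+
+ sims = image_emb @ text_emb.T                        # cosine similarities
+ closed_set_pred = query_names[int(sims.argmax())]    # always predicts *some* listed class
+
+ threshold = 0.25                                     # assumed operating point
+ open_set_pred = closed_set_pred if sims.max() > threshold else "unknown"
+ print(closed_set_pred, open_set_pred, [round(s, 3) for s in sims.tolist()])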
+
+ comment: 31 pages, under review +
+
+
+
+
+ + ☆ ModeTv2: GPU-accelerated Motion Decomposition Transformer for Pairwise + Optimization in Medical Image Registration + + +
+ Deformable image registration plays a crucial role in medical imaging, aiding +in disease diagnosis and image-guided interventions. Traditional iterative +methods are slow, while deep learning (DL) accelerates solutions but faces +usability and precision challenges. This study introduces a pyramid network +with the enhanced motion decomposition Transformer (ModeTv2) operator, +showcasing superior pairwise optimization (PO) akin to traditional methods. We +re-implement the ModeT operator with CUDA extensions to enhance its computational +efficiency. We further propose a RegHead module that refines deformation fields, +improves the realism of deformations, and reduces parameters. By adopting the PO, +the proposed network balances accuracy, efficiency, and generalizability. +Extensive experiments on two public brain MRI datasets and one abdominal CT +dataset demonstrate the network's suitability for PO, providing a DL model with +enhanced usability and interpretability. The code is publicly available. + +
+
+
+
+
+ + ☆ CMViM: Contrastive Masked Vim Autoencoder for 3D Multi-modal + Representation Learning for AD classification + + +
+ Alzheimer's disease (AD) is an incurable neurodegenerative condition leading +to cognitive and functional deterioration. Given the lack of a cure, prompt and +precise AD diagnosis is vital, a complex process dependent on multiple factors +and multi-modal data. While successful efforts have been made to integrate +multi-modal representation learning into medical datasets, scant attention has +been given to 3D medical images. In this paper, we propose Contrastive Masked +Vim Autoencoder (CMViM), the first efficient representation learning method +tailored for 3D multi-modal data. Our proposed framework is built on a masked +Vim autoencoder to learn a unified multi-modal representation and the +long-range dependencies contained in 3D medical images. We also introduce an +intra-modal contrastive learning module to enhance the capability of the +multi-modal Vim encoder for modeling the discriminative features in the same +modality, and an inter-modal contrastive learning module to alleviate +misaligned representation among modalities. Our framework consists of two main +steps: 1) incorporating the Vision Mamba (Vim) into the masked autoencoder to +reconstruct 3D masked multi-modal data efficiently; 2) aligning the multi-modal +representations with contrastive learning mechanisms from both intra-modal and +inter-modal aspects. Our framework is pre-trained on the ADNI2 dataset +and validated on the downstream task of AD classification. The proposed CMViM +yields a 2.7\% AUC performance improvement compared with other state-of-the-art +methods. + +
+
+ comment: 11 pages, 1 figure +
+
+
+
+
+ + ☆ Visually Guided Generative Text-Layout Pre-training for Document + Intelligence NAACL 2024 + + +
+ Prior studies show that pre-training techniques can boost the performance of +visual document understanding (VDU), which typically requires models to gain the +ability to perceive and reason over both document texts and layouts (e.g., +locations of texts and table-cells). To this end, we propose visually guided +generative text-layout pre-training, named ViTLP. Given a document image, the +model optimizes hierarchical language and layout modeling objectives to +generate the interleaved text and layout sequence. In addition, to address the +limitation of processing long documents by Transformers, we introduce a +straightforward yet effective multi-segment generative pre-training scheme, +enabling ViTLP to process word-intensive documents of any length. ViTLP can +function as a native OCR model to localize and recognize texts of document +images. Besides, ViTLP can be effectively applied to various downstream VDU +tasks. Extensive experiments show that ViTLP achieves competitive performance +over existing baselines on benchmark VDU tasks, including information +extraction, document classification, and document question answering. + +
+
+ comment: Accepted to NAACL 2024 main conference. The first version of this + paper was submitted to OpenReview + (https://openreview.net/forum?id=ARtBIBAmNR) in June 2023 +
+
+
+
+
+ + ☆ Let Real Images be as a Judger, Spotting Fake Images Synthesized with + Generative Models + + +
+ In the last few years, generative models have shown their powerful +capabilities in synthesizing realistic images in both quality and diversity +(i.e., facial images and natural subjects). Unfortunately, the artifact +patterns in fake images synthesized by different generative models are +inconsistent, leading to the failure of previous research that relied on +spotting subtle differences between real and fake. In our preliminary +experiments, we find that the artifacts in fake images always change with the +development of the generative model, while natural images exhibit stable +statistical properties. In this paper, we employ natural traces shared only by +real images as an additional predictive target in the detector. Specifically, +the natural traces are learned from wild real images, and we introduce +extended supervised contrastive learning to bring them closer to real images +and further away from fake ones. This motivates the detector to make decisions +based on the proximity of images to the natural traces. To conduct a +comprehensive experiment, we built a high-quality and diverse dataset that +covers 6 GAN and 6 diffusion models, to evaluate +the effectiveness in generalizing to unknown forgery techniques and the robustness in +surviving different transformations. Experimental results show that our +proposed method achieves 96.1% mAP, significantly outperforming the baselines. +Extensive experiments conducted on the widely recognized platform Midjourney +reveal that our proposed method achieves an accuracy exceeding 78.4%, +underscoring its practicality for real-world application deployment. The source +code and partial self-built dataset are available in supplementary material. + +
+
+
+
+
+ + ☆ Make-Your-Anchor: A Diffusion-based 2D Avatar Generation Framework CVPR2024 + + +
+ Despite the remarkable progress of talking-head-based avatar-creation +solutions, directly generating anchor-style videos with full-body motions +remains challenging. In this study, we propose Make-Your-Anchor, a novel system +necessitating only a one-minute video clip of an individual for training, +subsequently enabling the automatic generation of anchor-style videos with +precise torso and hand movements. Specifically, we finetune a proposed +structure-guided diffusion model on input video to render 3D mesh conditions +into human appearances. We adopt a two-stage training strategy for the +diffusion model, effectively binding movements with specific appearances. To +produce arbitrarily long videos, we extend the 2D U-Net in the frame-wise +diffusion model to a 3D style without additional training cost, and a simple +yet effective batch-overlapped temporal denoising module is proposed to bypass +the constraints on video length during inference. Finally, a novel +identity-specific face enhancement module is introduced to improve the visual +quality of facial regions in the output videos. Comparative experiments +demonstrate the effectiveness and superiority of the system in terms of visual +quality, temporal coherence, and identity preservation, outperforming SOTA +diffusion/non-diffusion methods. Project page: +\url{https://github.com/ICTMCG/Make-Your-Anchor}. + +
+
+ comment: accepted at CVPR2024 +
+
+
+
+
+ + ☆ Medical Image Registration and Its Application in Retinal Images: A + Review + + +
+ Medical image registration is vital for disease diagnosis and treatment with +its ability to merge diverse information from images, which may be captured at +different times, from different angles, or with different modalities. Although several surveys have reviewed +the development of medical image registration, these surveys have not +systematically summarized the methodologies of existing medical image registration +methods. To this end, we provide a comprehensive review of these methods from +traditional and deep learning-based directions, aiming to help audiences +understand the development of medical image registration quickly. In +particular, we review recent advances in retinal image registration at the end +of each section, a topic that has not attracted much attention. Additionally, we also +discuss the current challenges of retinal image registration and provide +insights and prospects for future research. + +
+
+
+
+
+ + ☆ Self-Supervised Learning for Medical Image Data with Anatomy-Oriented + Imaging Planes + + +
+ Self-supervised learning has emerged as a powerful tool for pretraining deep +networks on unlabeled data, prior to transfer learning of target tasks with +limited annotation. The relevance between the pretraining pretext and target +tasks is crucial to the success of transfer learning. Various pretext tasks +have been proposed to utilize properties of medical image data (e.g., +three-dimensionality), which are more relevant to medical image analysis than generic +ones for natural images. However, previous work rarely paid attention to data +with anatomy-oriented imaging planes, e.g., standard cardiac magnetic resonance +imaging views. As these imaging planes are defined according to the anatomy of +the imaged organ, pretext tasks effectively exploiting this information can +pretrain the networks to gain knowledge on the organ of interest. In this work, +we propose two complementary pretext tasks for this group of medical image data +based on the spatial relationship of the imaging planes. The first is to learn +the relative orientation between the imaging planes and is implemented as +regressing their intersecting lines. The second exploits parallel imaging +planes to regress their relative slice locations within a stack. Both pretext +tasks are conceptually straightforward and easy to implement, and can be +combined in multitask learning for better representation learning. Thorough +experiments on two anatomical structures (heart and knee) and representative +target tasks (semantic segmentation and classification) demonstrate that the +proposed pretext tasks are effective in pretraining deep networks for +remarkably boosted performance on the target tasks, and superior to other +recent approaches. + +
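+ For intuition, a small worked example of the geometric targets behind such
+ pretext tasks (illustrative only, not the paper's implementation): the
+ direction of the line where two imaging planes intersect, and signed slice
+ positions within a parallel stack, both of which are natural regression
+ targets.
+
+ import numpy as np
+
+ def plane_intersection_direction(n1, n2):
+     """Unit direction of the line where two non-parallel planes meet."""
+     d = np.cross(n1, n2)
+     return d / np.linalg.norm(d)
+
+ # Two view planes given by (made-up) unit normals.
+ n_view_a = np.array([0.0, 0.0, 1.0])
+ n_view_b = np.array([0.0, 1.0, 0.0])
+ print(plane_intersection_direction(n_view_a, n_view_b))   # [-1. 0. 0.] up to sign
+
+ # Relative slice location in a stack of parallel planes: signed distance of
+ # each slice origin along the shared normal.
+ origins = np.array([[0.0, 0.0, z] for z in (0.0, 2.5, 5.0, 7.5)])
+ normal = np.array([0.0, 0.0, 1.0])
+ print(origins @ normal)                                    # [0.  2.5 5.  7.5]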
+
+ comment: Medical Image Analysis +
+
+
+
+
+ + ☆ PathoTune: Adapting Visual Foundation Model to Pathological Specialists MICCAI 2024 + + +
+ As natural image understanding moves towards the pretrain-finetune era, +research in pathology imaging is concurrently evolving. Despite the predominant +focus on pretraining pathological foundation models, how to adapt foundation +models to downstream tasks is little explored. For downstream adaptation, we +propose the existence of two domain gaps, i.e., the Foundation-Task Gap and the +Task-Instance Gap. To mitigate these gaps, we introduce PathoTune, a framework +designed to efficiently adapt pathological or even visual foundation models to +pathology-specific tasks via multi-modal prompt tuning. The proposed framework +leverages Task-specific Visual Prompts and Task-specific Textual Prompts to +identify task-relevant features, along with Instance-specific Visual Prompts +for encoding single pathological image features. Results across multiple +datasets at both patch-level and WSI-level demonstrate its superior performance +over single-modality prompt tuning approaches. Significantly, PathoTune +facilitates the direct adaptation of natural visual foundation models to +pathological tasks, drastically outperforming pathological foundation models +with simple linear probing. The code will be available upon acceptance. + +
+
+ comment: Submitted to MICCAI 2024 +
+
+
+
+
+ + ☆ CT-Bound: Fast Boundary Estimation From Noisy Images Via Hybrid + Convolution and Transformer Neural Networks + + +
+ We present CT-Bound, a fast boundary estimation method for noisy images using +a hybrid Convolution and Transformer neural network. The proposed architecture +decomposes boundary estimation into two tasks: local detection and global +regularization of image boundaries. It first estimates a parametric +representation of boundary structures only using the input image within a small +receptive field and then refines the boundary structure in the parameter domain +without accessing the input image. Because of this, a part of the network can +be easily trained using simple synthetic images and still generalize to real +images, and the entire architecture is computationally efficient as the +boundary refinement is non-iterative and not in the image domain. Compared with +the previous most accurate methods, our experiments show that CT-Bound is +100 times faster, producing comparably accurate, high-quality boundary and +color maps. We also demonstrate that CT-Bound can produce boundary and color +maps on real captured images without extra fine-tuning and real-time boundary +map and color map videos at ten frames per second. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ REFRAME: Reflective Surface Real-Time Rendering for Mobile Devices + + +
+ This work tackles the challenging task of achieving real-time novel view +synthesis on various scenes, including highly reflective objects and unbounded +outdoor scenes. Existing real-time rendering methods, especially those based on +meshes, often have subpar performance in modeling surfaces with rich +view-dependent appearances. Our key idea lies in leveraging meshes for +rendering acceleration while incorporating a novel approach to parameterize +view-dependent information. We decompose the color into diffuse and specular, +and model the specular color in the reflected direction based on a neural +environment map. Our experiments demonstrate that our method achieves +comparable reconstruction quality for highly reflective surfaces compared to +state-of-the-art offline methods, while also efficiently enabling real-time +rendering on edge devices such as smartphones. + +
+
+ comment: Project Page:https://xdimlab.github.io/REFRAME/ +
+
+
+
+
+ + ☆ Camera-aware Label Refinement for Unsupervised Person Re-identification + + +
+ Unsupervised person re-identification aims to retrieve images of a specified +person without identity labels. Many recent unsupervised Re-ID approaches adopt +clustering-based methods to measure cross-camera feature similarity to roughly +divide images into clusters. They ignore the feature distribution discrepancy +induced by the camera domain gap, resulting in unavoidable performance +degradation. Camera information is usually available, and the feature +distribution within a single camera usually focuses more on the appearance of the +individual and has less intra-identity variance. Inspired by this observation, +we introduce a \textbf{C}amera-\textbf{A}ware \textbf{L}abel +\textbf{R}efinement~(CALR) framework that reduces camera discrepancy by +clustering intra-camera similarity. Specifically, we employ intra-camera +training to obtain reliable local pseudo labels within each camera, and then +refine global labels generated by inter-camera clustering and train the +discriminative model using more reliable global pseudo labels in a self-paced +manner. Meanwhile, we develop a camera-alignment module to align feature +distributions under different cameras, which could help deal with the camera +variance further. Extensive experiments validate the superiority of our +proposed method over state-of-the-art approaches. The code is accessible at +https://github.com/leeBooMla/CALR. + +
+
+ comment: submitted to IEEE TMM +
+
+
+
+
+ + ☆ If CLIP Could Talk: Understanding Vision-Language Model Representations + Through Their Preferred Concept Descriptions + + +
+ Recent works often assume that Vision-Language Model (VLM) representations +are based on visual attributes like shape. However, it is unclear to what +extent VLMs prioritize this information to represent concepts. We propose +Extract and Explore (EX2), a novel approach to characterize important textual +features for VLMs. EX2 uses reinforcement learning to align a large language +model with VLM preferences and generates descriptions that incorporate the +important features for the VLM. Then, we inspect the descriptions to identify +the features that contribute to VLM representations. We find that spurious +descriptions have a major role in VLM representations despite providing no +helpful information, e.g., Click to enlarge photo of CONCEPT. More importantly, +among informative descriptions, VLMs rely significantly on non-visual +attributes like habitat to represent visual concepts. Also, our analysis +reveals that different VLMs prioritize different attributes in their +representations. Overall, we show that VLMs do not simply match images to scene +descriptions and that non-visual or even spurious descriptions significantly +influence their representations. + +
+
+ comment: Code: https://github.com/BatsResearch/ex2 +
+
+
+
+
+ + ☆ RCBEVDet: Radar-camera Fusion in Bird's Eye View for 3D Object Detection CVPR2024 + + +
+ Three-dimensional object detection is one of the key tasks in autonomous +driving. To reduce costs in practice, low-cost multi-view cameras for 3D object +detection are proposed to replace the expensive LiDAR sensors. However, it is +difficult to achieve highly accurate and robust 3D object +detection relying solely on cameras. An effective solution to this issue is combining multi-view cameras +with the economical millimeter-wave radar sensor to achieve more reliable +multi-modal 3D object detection. In this paper, we introduce RCBEVDet, a +radar-camera fusion 3D object detection method in the bird's eye view (BEV). +Specifically, we first design RadarBEVNet for radar BEV feature extraction. +RadarBEVNet consists of a dual-stream radar backbone and a Radar Cross-Section +(RCS) aware BEV encoder. In the dual-stream radar backbone, a point-based +encoder and a transformer-based encoder are proposed to extract radar features, +with an injection and extraction module to facilitate communication between the +two encoders. The RCS-aware BEV encoder uses RCS as an object-size prior when +scattering the point features in BEV. Besides, we present the Cross-Attention +Multi-layer Fusion module to automatically align the multi-modal BEV feature +from radar and camera with the deformable attention mechanism, and then fuse +the feature with channel and spatial fusion layers. Experimental results show +that RCBEVDet achieves new state-of-the-art radar-camera fusion results on +nuScenes and view-of-delft (VoD) 3D object detection benchmarks. Furthermore, +RCBEVDet achieves better 3D detection results than all real-time camera-only +and radar-camera 3D object detectors with a faster inference speed at 21~28 +FPS. The source code will be released at https://github.com/VDIGPKU/RCBEVDet. + +
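+ A rough sketch of the scattering idea (not the RCBEVDet implementation): each
+ radar point writes its feature into a BEV grid, with the RCS used as a crude
+ size prior that widens the footprint. The grid size, cell size, and RCS
+ scaling are arbitrary choices for the example.
+
+ import torch
+
+ def scatter_to_bev(xy, feats, rcs, grid=128, cell=0.5):
+     """xy: (N, 2) metres, feats: (N, C), rcs: (N,) radar cross-section."""
+     bev = torch.zeros(feats.shape[1], grid, grid)
+     for p, f, r in zip(xy, feats, rcs):
+         cx, cy = [int(v) for v in (p / cell + grid / 2).long().clamp(0, grid - 1)]
+         radius = int(r.clamp(0, 3))                     # larger RCS -> wider footprint
+         x0, x1 = max(cx - radius, 0), min(cx + radius + 1, grid)
+         y0, y1 = max(cy - radius, 0), min(cy + radius + 1, grid)
+         bev[:, y0:y1, x0:x1] += f[:, None, None]
+     return bev
+
+ bev = scatter_to_bev(torch.randn(50, 2) * 20, torch.randn(50, 16), torch.rand(50) * 4)
+ print(bev.shape)                                        # torch.Size([16, 128, 128])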
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Producing and Leveraging Online Map Uncertainty in Trajectory Prediction CVPR 2024 + + +
+ High-definition (HD) maps have played an integral role in the development of +modern autonomous vehicle (AV) stacks, albeit with high associated labeling and +maintenance costs. As a result, many recent works have proposed methods for +estimating HD maps online from sensor data, enabling AVs to operate outside of +previously-mapped regions. However, current online map estimation approaches +are developed in isolation from their downstream tasks, complicating their +integration into AV stacks. In particular, they do not produce uncertainty or +confidence estimates. In this work, we extend multiple state-of-the-art online +map estimation methods to additionally estimate uncertainty and show how this +enables more tightly integrating online mapping with trajectory forecasting. In +doing so, we find that incorporating uncertainty yields up to 50% faster +training convergence and up to 15% better prediction performance on the +real-world nuScenes driving dataset. + +
+
+ comment: 14 pages, 14 figures, 6 tables. CVPR 2024 +
+
+
+
+
+ + ☆ Real-time Neuron Segmentation for Voltage Imaging + + +
+ In voltage imaging, where the membrane potentials of individual neurons are +recorded at hundreds to thousands of frames per second using fluorescence +microscopy, data processing presents a challenge. Even a fraction of a minute +of recording with a limited image size yields gigabytes of video data +consisting of tens of thousands of frames, which can be time-consuming to +process. Moreover, millisecond-level short exposures lead to noisy video +frames, obscuring neuron footprints especially in deep-brain samples where +noisy signals are buried in background fluorescence. To address this challenge, +we propose a fast neuron segmentation method able to detect multiple, +potentially overlapping, spiking neurons from noisy video frames, and implement +a data processing pipeline incorporating the proposed segmentation method along +with GPU-accelerated motion correction. By testing on existing datasets as well +as on new datasets we introduce, we show that our pipeline extracts neuron +footprints that agree well with human annotation even from cluttered datasets, +and demonstrate real-time processing of voltage imaging data on a single +desktop computer for the first time. + +
+
+
+
+
+ + ☆ DOCTR: Disentangled Object-Centric Transformer for Point Scene + Understanding + + +
+ Point scene understanding is a challenging task of processing a real-world scene +point cloud, which aims at segmenting each object, estimating its pose, and +reconstructing its mesh simultaneously. The recent state-of-the-art method first +segments each object and then processes each one independently with multiple stages +for the different sub-tasks. This leads to a complex pipeline to optimize and +makes it hard to leverage the relationship constraints between multiple +objects. In this work, we propose a novel Disentangled Object-Centric +TRansformer (DOCTR) that explores object-centric representation to facilitate +learning with multiple objects for the multiple sub-tasks in a unified manner. +Each object is represented as a query, and a Transformer decoder is adapted to +iteratively optimize all the queries involving their relationship. In +particular, we introduce a semantic-geometry disentangled query (SGDQ) design +that enables the query features to attend separately to semantic information +and geometric information relevant to the corresponding sub-tasks. A hybrid +bipartite matching module is employed to make full use of the supervision from all the +sub-tasks during training. Qualitative and quantitative experimental results +demonstrate that our method achieves state-of-the-art performance on the +challenging ScanNet dataset. Code is available at +https://github.com/SAITPublic/DOCTR. + +
+
+
+
+
+ + ☆ Benchmarks and Challenges in Pose Estimation for Egocentric Hand + Interactions with Objects + + +
+ We interact with the world with our hands and see it through our own +(egocentric) perspective. A holistic 3D understanding of such interactions from +egocentric views is important for tasks in robotics, AR/VR, action recognition +and motion generation. Accurately reconstructing such interactions in 3D is +challenging due to heavy occlusion, viewpoint bias, camera distortion, and +motion blur from the head movement. To this end, we designed the HANDS23 +challenge based on the AssemblyHands and ARCTIC datasets with carefully +designed training and testing splits. Based on the results of the top submitted +methods and more recent baselines on the leaderboards, we perform a thorough +analysis on 3D hand(-object) reconstruction tasks. Our analysis demonstrates +the effectiveness of addressing distortion specific to egocentric cameras, +adopting high-capacity transformers to learn complex hand-object interactions, +and fusing predictions from different views. Our study further reveals +challenging scenarios intractable with state-of-the-art methods, such as fast +hand motion, object reconstruction from narrow egocentric views, and close +contact between two hands and objects. Our efforts will enrich the community's +knowledge foundation and facilitate future hand studies on egocentric +hand-object interactions. + +
+
+
+
+
+ + ☆ Enhancing Visual Place Recognition via Fast and Slow Adaptive Biasing in + Event Cameras + + +
+ Event cameras are increasingly popular in robotics due to their beneficial +features, such as low latency, energy efficiency, and high dynamic range. +Nevertheless, their downstream task performance is greatly influenced by the +optimization of bias parameters. These parameters, for instance, regulate the +necessary change in light intensity to trigger an event, which in turn depends +on factors such as the environment lighting and camera motion. This paper +introduces feedback control algorithms that automatically tune the bias +parameters through two interacting methods: 1) An immediate, on-the-fly fast +adaptation of the refractory period, which sets the minimum interval between +consecutive events, and 2) if the event rate exceeds the specified bounds even +after changing the refractory period repeatedly, the controller adapts the +pixel bandwidth and event thresholds, which stabilizes after a short period of +noise events across all pixels (slow adaptation). Our evaluation focuses on the +visual place recognition task, where incoming query images are compared to a +given reference database. We conducted comprehensive evaluations of our +algorithms' adaptive feedback control in real-time. To do so, we collected the +QCR-Fast-and-Slow dataset that contains DAVIS346 event camera streams from 366 +repeated traversals of a Scout Mini robot navigating through a 100 meter long +indoor lab setting (totaling over 35km distance traveled) in varying brightness +conditions with ground truth location information. Our proposed feedback +controllers result in superior performance when compared to the standard bias +settings and prior feedback control methods. Our findings also detail the +impact of bias adjustments on task performance and feature ablation studies on +the fast and slow adaptation mechanisms. + +
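+ In the spirit of the two-speed adaptation described above, a schematic control
+ loop (the bias names, bounds, and step sizes below are invented for
+ illustration and are not the DAVIS346 driver interface): the refractory period
+ is adjusted first, and only after repeated saturation are pixel bandwidth and
+ event thresholds touched.
+
+ def adapt_biases(event_rate, biases, low=1e5, high=1e6, max_fast_steps=8):
+     """Return updated biases given the measured event rate (events/s)."""
+     if low <= event_rate <= high:
+         biases["fast_steps"] = 0                       # within bounds: nothing to do
+         return biases
+     direction = 1 if event_rate > high else -1
+     if biases["fast_steps"] < max_fast_steps:          # fast path: refractory period only
+         biases["refractory_period_us"] *= 1.5 if direction > 0 else 0.75
+         biases["fast_steps"] += 1
+     else:                                              # slow path: bandwidth and thresholds
+         biases["pixel_bandwidth"] -= direction
+         biases["event_threshold"] += direction
+         biases["fast_steps"] = 0
+     return biases
+
+ biases = {"refractory_period_us": 100.0, "fast_steps": 0,
+           "pixel_bandwidth": 10, "event_threshold": 20}
+ for rate in (5e6, 4e6, 2e6, 8e5):                      # toy stream of measured rates
+     biases = adapt_biases(rate, biases)
+ print(biases)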
+
+ comment: 8 pages, 9 figures, paper under review +
+
+
+
+
+ + ☆ Refining Text-to-Image Generation: Towards Accurate Training-Free + Glyph-Enhanced Image Generation + + +
+ Over the past few years, Text-to-Image (T2I) generation approaches based on +diffusion models have gained significant attention. However, vanilla diffusion +models often suffer from spelling inaccuracies in the text displayed within the +generated images. The capability to generate visual text is crucial, offering +both academic interest and a wide range of practical applications. To produce +accurate visual text images, state-of-the-art techniques adopt a +glyph-controlled image generation approach, consisting of a text layout +generator followed by an image generator that is conditioned on the generated +text layout. Nevertheless, our study reveals that these models still face three +primary challenges, prompting us to develop a testbed to facilitate future +research. We introduce a benchmark, LenCom-Eval, specifically designed for +testing models' capability in generating images with Lengthy and Complex visual +text. Subsequently, we introduce a training-free framework to enhance the +two-stage generation approaches. We examine the effectiveness of our approach +on both LenCom-Eval and MARIO-Eval benchmarks and demonstrate notable +improvements across a range of evaluation metrics, including CLIPScore, OCR +precision, recall, F1 score, accuracy, and edit distance scores. For instance, +our proposed framework improves the backbone model, TextDiffuser, by more than +23\% and 13.5\% in terms of OCR word F1 on LenCom-Eval and MARIO-Eval, +respectively. Our work makes a unique contribution to the field by focusing on +generating images with long and rare text sequences, a niche previously +unexplored by the existing literature. + +
+
+
+
+
+ + ☆ Unsupervised Template-assisted Point Cloud Shape Correspondence Network CVPR2024 + + +
+ Unsupervised point cloud shape correspondence aims to establish point-wise +correspondences between source and target point clouds. Existing methods obtain +correspondences directly by computing point-wise feature similarity between +point clouds. However, non-rigid objects possess strong deformability and +unusual shapes, making it a longstanding challenge to directly establish +correspondences between point clouds with unconventional shapes. To address +this challenge, we propose an unsupervised Template-Assisted point cloud shape +correspondence Network, termed TANet, including a template generation module +and a template assistance module. The proposed TANet enjoys several merits. +Firstly, the template generation module establishes a set of learnable +templates with explicit structures. Secondly, we introduce a template +assistance module that extensively leverages the generated templates to +establish more accurate shape correspondences from multiple perspectives. +Extensive experiments on four human and animal datasets demonstrate that TANet +achieves favorable performance against state-of-the-art methods. + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ Spike-NeRF: Neural Radiance Field Based On Spike Camera ICME2024 + + +
+ As a neuromorphic sensor with high temporal resolution, spike cameras offer +notable advantages over traditional cameras in high-speed vision applications +such as high-speed optical estimation, depth estimation, and object tracking. +Inspired by the success of the spike camera, we propose Spike-NeRF, the first +Neural Radiance Field derived from spike data, to achieve 3D reconstruction and +novel viewpoint synthesis of high-speed scenes. Instead of the simultaneously captured multi-view +images used by NeRF, the inputs of Spike-NeRF are continuous spike +streams captured by a moving spike camera within a very short time. To reconstruct +a correct and stable 3D scene from high-frequency but unstable spike data, we +devise spike masks along with a distinctive loss function. We evaluate our +method qualitatively and numerically on several challenging synthetic scenes +generated by Blender with the spike camera simulator. Our results demonstrate +that Spike-NeRF produces more visually appealing results than the existing +methods and the baseline we proposed in high-speed scenes. Our code and data +will be released soon. + +
+
+ comment: This paper is accepted by ICME2024 +
+
+
+
+
+ + ☆ A Survey on Long Video Generation: Challenges, Methods, and Prospects + + +
+ Video generation is a rapidly advancing research area, garnering significant +attention due to its broad range of applications. One critical aspect of this +field is the generation of long-duration videos, which presents unique +challenges and opportunities. This paper presents the first survey of recent +advancements in long video generation and summarises them into two key +paradigms: divide and conquer, and temporal autoregressive. + We delve into the common models employed in each paradigm, including aspects +of network design and conditioning techniques. Furthermore, we offer a +comprehensive overview and classification of the datasets and evaluation +metrics, which are crucial for advancing long video generation research. +Concluding with a summary of existing studies, we also discuss the emerging +challenges and future directions in this dynamic field. We hope that this +survey will serve as an essential reference for researchers and practitioners +in the realm of long video generation. + +
+
+
+
+
+ + ☆ Ensemble Adversarial Defense via Integration of Multiple Dispersed Low + Curvature Models IJCNN + + +
+ The integration of an ensemble of deep learning models has been extensively +explored to enhance defense against adversarial attacks. The diversity among +sub-models increases the attack cost required to deceive the majority of the +ensemble, thereby improving the adversarial robustness. While existing +approaches mainly center on increasing diversity in feature representations or +dispersion of first-order gradients with respect to input, the limited +correlation between these diversity metrics and adversarial robustness +constrains the performance of ensemble adversarial defense. In this work, we +aim to enhance ensemble diversity by reducing attack transferability. We +identify second-order gradients, which depict the loss curvature, as a key +factor in adversarial robustness. Computing the Hessian matrix involved in +second-order gradients is computationally expensive. To address this, we +approximate the Hessian-vector product using differential approximation. Given +that low curvature provides better robustness, our ensemble model was designed +to consider the influence of curvature among different sub-models. We introduce +a novel regularizer to train multiple more-diverse low-curvature network +models. Extensive experiments across various datasets demonstrate that our +ensemble model exhibits superior robustness against a range of attacks, +underscoring the effectiveness of our approach. + +
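+ The Hessian-vector approximation mentioned above can be sketched in a few
+ lines (a generic finite-difference version on a toy quadratic loss, not the
+ authors' training code; epsilon is an arbitrary example value):
+
+ import torch
+
+ def hvp_finite_difference(loss_fn, p, v, eps=1e-3):
+     """Approximate H @ v as (grad L(p + eps*v) - grad L(p)) / eps."""
+     p0 = p.detach().clone().requires_grad_(True)
+     g0 = torch.autograd.grad(loss_fn(p0), p0)[0]
+     p1 = (p.detach() + eps * v).requires_grad_(True)
+     g1 = torch.autograd.grad(loss_fn(p1), p1)[0]
+     return (g1 - g0) / eps
+
+ # Toy quadratic loss L(p) = 0.5 * p^T A p, whose Hessian is exactly A.
+ A = torch.tensor([[3.0, 1.0], [1.0, 2.0]])
+ loss = lambda p: 0.5 * p @ A @ p
+ p, v = torch.tensor([1.0, -1.0]), torch.tensor([0.5, 2.0])
+ print(hvp_finite_difference(loss, p, v))               # close to A @ v = [3.5, 4.5]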
+
+ comment: Accepted to The 2024 International Joint Conference on Neural + Networks (IJCNN) +
+
+
+
+
+ + ☆ ASDF: Assembly State Detection Utilizing Late Fusion by Integrating 6D + Pose Estimation + + +
+ In medical and industrial domains, providing guidance for assembly processes +is critical to ensure efficiency and safety. Errors in assembly can lead to +significant consequences such as extended surgery times and prolonged +manufacturing or maintenance times in industry. Assembly scenarios can benefit +from in-situ AR visualization to provide guidance, reduce assembly times and +minimize errors. To enable in-situ visualization, 6D pose estimation can be +leveraged. Existing 6D pose estimation techniques primarily focus on individual +objects and static captures. However, assembly scenarios have various dynamics, +including occlusion during assembly and dynamics in the assembly objects' +appearance. Existing work combining object detection/6D pose estimation and +assembly state detection either focuses on pure deep learning-based approaches +or limits the assembly state detection to building blocks. To address the +challenges of 6D pose estimation in combination with assembly state detection, +our approach ASDF builds upon the strengths of YOLOv8, a real-time capable +object detection framework. We extend this framework, refine the object pose +and fuse pose knowledge with network-detected pose information. Utilizing our +late fusion in our Pose2State module results in refined 6D pose estimation and +assembly state detection. By combining both pose and state information, our +Pose2State module predicts the final assembly state with precision. Our +evaluation on our ASDF dataset shows that our Pose2State module leads to an +improved assembly state detection and that the improvement of the assembly +state further leads to a more robust 6D pose estimation. Moreover, on the GBOT +dataset, we outperform the pure deep learning-based network, and even +outperform the hybrid and pure tracking-based approaches. + +
+
+
+
+
+ + ☆ Multi-attention Associate Prediction Network for Visual Tracking + + +
+ Classification-regression prediction networks have realized impressive +success in several modern deep trackers. However, there is an inherent +difference between classification and regression tasks, so they have diverse, +even opposite, demands for feature matching. Existing models always ignore this +key issue and only employ a unified matching block in the two task branches, +degrading the decision quality. Besides, these models also struggle with +decision misalignment. In this paper, we propose a multi-attention +associate prediction network (MAPNet) to tackle the above problems. Concretely, +two novel matchers, i.e., a category-aware matcher and a spatial-aware matcher, are +first designed for feature comparison by integrating self, cross, channel or +spatial attentions organically. They are capable of fully capturing the +category-related semantics for classification and the local spatial contexts +for regression, respectively. Then, we present a dual alignment module to +enhance the correspondences between the two branches, which is useful to find the +optimal tracking solution. Finally, we describe a Siamese tracker built upon +the proposed prediction network, which achieves the leading performance on five +tracking benchmarks, consisting of LaSOT, TrackingNet, GOT-10k, TNL2k and +UAV123, and surpasses other state-of-the-art approaches. + +
+
+
+
+
+ + ☆ Text-IF: Leveraging Semantic Text Guidance for Degradation-Aware and + Interactive Image Fusion CVPR 2024 + + +
+ Image fusion aims to combine information from different source images to +create a comprehensively representative image. Existing fusion methods typically +struggle to deal with degradations in low-quality source images +and are non-interactive with respect to diverse subjective and objective needs. To address these issues, +we introduce a novel approach that leverages a semantic text-guided image +fusion model for the degradation-aware and interactive image fusion task, termed +Text-IF. It innovatively extends classical image fusion to text-guided +image fusion along with the ability to harmoniously address the degradation and +interaction issues during fusion. Through the text semantic encoder and the +semantic interaction fusion decoder, Text-IF enables all-in-one +degradation-aware processing of infrared and visible images and produces interactive, +flexible fusion outcomes. In this way, Text-IF achieves not only multi-modal +image fusion, but also multi-modal information fusion. Extensive experiments +show that our proposed text-guided image fusion strategy has obvious +advantages over SOTA methods in image fusion performance and degradation +treatment. The code is available at https://github.com/XunpengYi/Text-IF. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Dia-LLaMA: Towards Large Language Model-driven CT Report Generation + + +
+ Medical report generation has achieved remarkable advancements yet has still +been faced with several challenges. First, the inherent imbalance in the +distribution of normal and abnormal cases may lead models to exhibit a biased +focus on normal samples, resulting in unreliable diagnoses. Second, the +frequent occurrence of common template sentences in the reports may overwhelm +the critical abnormal information. Moreover, existing works focus on 2D chest +X-rays, leaving CT report generation underexplored due to the high-dimensional +nature of CT images and the limited availability of CT-report pairs. Recently, +LLMs have shown a great ability to generate reliable answers with appropriate +prompts, which sheds light on addressing the aforementioned challenges. In this +paper, we propose Dia-LLaMA, a framework to adapt the LLaMA2-7B for CT report +generation by incorporating diagnostic information as guidance prompts. +Considering the high dimension of CT, we leverage a pre-trained ViT3D with a +perceiver to extract the visual information. To tailor the LLM for report +generation and emphasize abnormality, we extract additional diagnostic +information by referring to a disease prototype memory bank, which is updated +during training to capture common disease representations. Furthermore, we +introduce disease-aware attention to enable the model to adjust attention for +different diseases. Experiments on the chest CT dataset demonstrated that our +proposed method outperformed previous methods and achieved state-of-the-art on +both clinical efficacy performance and natural language generation metrics. The +code will be made publicly available. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Synthesize Step-by-Step: Tools, Templates and LLMs as Data Generators + for Reasoning-Based Chart VQA CVPR 2024 + + +
+ Understanding data visualizations like charts and plots requires reasoning +about both visual elements and numerics. Although strong in extractive +questions, current chart visual question answering (chart VQA) models struggle with +complex reasoning questions. In this work, we address the lack of reasoning +ability by data augmentation. We leverage Large Language Models (LLMs), which +have been shown to have strong reasoning ability, as an automatic data annotator +that generates question-answer annotations for chart images. The key innovation +in our method lies in the Synthesize Step-by-Step strategy: our LLM-based data +generator learns to decompose the complex question into step-by-step +sub-questions (rationales), which are then used to derive the final answer +using external tools, i.e., Python. This step-wise generation procedure is +trained on synthetic data generated using a template-based QA generation +pipeline. Experimental results highlight the significance of the proposed +step-by-step generation. By training with the LLM-augmented data (LAMENDA), we +significantly enhance the chart VQA models, achieving the state-of-the-art +accuracy on the ChartQA and PlotQA datasets. In particular, our approach +improves the accuracy of the previous state-of-the-art approach from 38% to 54% +on the human-written questions in the ChartQA dataset, which require strong +reasoning. We hope our work underscores the potential of synthetic data and +encourages further exploration of data augmentation using LLMs for +reasoning-heavy tasks. + +
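+ A toy flavour of the template-plus-tool idea (the table and templates are
+ invented and this is not the paper's generation pipeline): a question is
+ decomposed into read-off steps and the final answer is computed in Python
+ rather than asked of the model directly.
+
+ chart_table = {"2019": 12.0, "2020": 18.0, "2021": 27.0}
+
+ def make_ratio_question(table, year_a, year_b):
+     question = f"How many times larger is the value in {year_b} than in {year_a}?"
+     rationale = [
+         f"Step 1: read the value for {year_a}: {table[year_a]}",
+         f"Step 2: read the value for {year_b}: {table[year_b]}",
+         "Step 3: divide the second value by the first",
+     ]
+     answer = round(table[year_b] / table[year_a], 2)    # the 'external tool' is plain Python
+     return question, rationale, answer
+
+ q, steps, a = make_ratio_question(chart_table, "2019", "2021")
+ print(q, *steps, f"Answer: {a}", sep="\n")              # Answer: 2.25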
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Residual Dense Swin Transformer for Continuous Depth-Independent + Ultrasound Imaging ICASSP2024 + + +
+ Ultrasound imaging is crucial for evaluating organ morphology and function, +yet depth adjustment can degrade image quality and field-of-view, presenting a +depth-dependent dilemma. Traditional interpolation-based zoom-in techniques +often sacrifice detail and introduce artifacts. Motivated by the potential of +arbitrary-scale super-resolution to naturally address these inherent +challenges, we present the Residual Dense Swin Transformer Network (RDSTN), +designed to capture the non-local characteristics and long-range dependencies +intrinsic to ultrasound images. It comprises a linear embedding module for +feature enhancement, an encoder with shifted-window attention for modeling +non-locality, and an MLP decoder for continuous detail reconstruction. This +strategy streamlines balancing image quality and field-of-view, which offers +superior textures over traditional methods. Experimentally, RDSTN outperforms +existing approaches while requiring fewer parameters. In conclusion, RDSTN +shows promising potential for ultrasound image enhancement by overcoming the +limitations of conventional interpolation-based methods and achieving +depth-independent imaging. + +
+
+ comment: Accepted by ICASSP2024, https://ieeexplore.ieee.org/document/10447712 +
+
+
+
+
+ + ☆ FlashEval: Towards Fast and Accurate Evaluation of Text-to-image + Diffusion Generative Models CVPR 2024 + + +
+ In recent years, there has been significant progress in the development of +text-to-image generative models. Evaluating the quality of the generative +models is one essential step in the development process. Unfortunately, the +evaluation process could consume a significant amount of computational +resources, making the required periodic evaluation of model performance (e.g., +monitoring training progress) impractical. Therefore, we seek to improve the +evaluation efficiency by selecting the representative subset of the text-image +dataset. We systematically investigate the design choices, including the +selection criteria (textual features or image-based metrics) and the selection +granularity (prompt-level or set-level). We find that the insights from prior +work on subset selection for training data do not generalize to this problem, +and we propose FlashEval, an iterative search algorithm tailored to evaluation +data selection. We demonstrate the effectiveness of FlashEval on ranking +diffusion models with various configurations, including architectures, +quantization levels, and sampler schedules on COCO and DiffusionDB datasets. +Our searched 50-item subset could achieve comparable evaluation quality to the +randomly sampled 500-item subset for COCO annotations on unseen models, +achieving a 10x evaluation speedup. We release the condensed subset of these +commonly used datasets to help facilitate diffusion algorithm design and +evaluation, and open-source FlashEval as a tool for condensing future datasets, +accessible at https://github.com/thu-nics/FlashEval. + +
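+ A bare-bones stand-in for evaluation-subset search (synthetic scores, a plain
+ random search, and Kendall's tau as the ranking-agreement measure; the actual
+ FlashEval algorithm is a more structured iterative search):
+
+ import numpy as np
+ from scipy.stats import kendalltau
+
+ rng = np.random.default_rng(0)
+ num_models, num_prompts, subset_size = 8, 500, 50
+ scores = rng.normal(size=(num_models, num_prompts))     # per-prompt quality scores
+ full_scores = scores.mean(axis=1)                       # "ground-truth" model ranking
+
+ best_subset, best_tau = None, -1.0
+ for _ in range(200):                                    # random restarts as a simple search
+     cand = rng.choice(num_prompts, size=subset_size, replace=False)
+     tau, _ = kendalltau(full_scores, scores[:, cand].mean(axis=1))
+     if tau > best_tau:
+         best_subset, best_tau = cand, tau
+ print(f"best ranking agreement on a {subset_size}-item subset: tau = {best_tau:.3f}")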
+
+ comment: The paper is accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Elite360D: Towards Efficient 360 Depth Estimation via Semantic- and + Distance-Aware Bi-Projection Fusion CVPR2024 + + +
+ 360 depth estimation has recently received great attention for 3D +reconstruction owing to its omnidirectional field of view (FoV). Recent +approaches are predominantly focused on cross-projection fusion with +geometry-based re-projection: they fuse 360 images with equirectangular +projection (ERP) and another projection type, e.g., cubemap projection, to +estimate depth with the ERP format. However, these methods suffer from 1) +limited local receptive fields, making it hard to capture large-FoV +scenes, and 2) prohibitive computational cost, caused by the complex +cross-projection fusion module design. In this paper, we propose Elite360D, a +novel framework that inputs the ERP image and icosahedron projection (ICOSAP) +point set, which is undistorted and spatially continuous. Elite360D is superior +in its capacity to learn a representation from a local-with-global +perspective. Besides a flexible ERP image encoder, it includes an ICOSAP point +encoder and a Bi-projection Bi-attention Fusion (B2F) module (~1M +parameters in total). Specifically, the ERP image encoder can take various perspective +image-trained backbones (e.g., ResNet, Transformer) to extract local features. +The point encoder extracts the global features from the ICOSAP. Then, the B2F +module captures the semantic- and distance-aware dependencies between each +pixel of the ERP feature and the entire ICOSAP feature set. Without a specific +backbone design or an obvious increase in computational cost, Elite360D outperforms +the prior arts on several benchmark datasets. + +
+
+ comment: 8 pages, accepted by CVPR2024 +
+
+
+
+
+ + ☆ GoodSAM: Bridging Domain and Capacity Gaps via Segment Anything Model + for Distortion-aware Panoramic Semantic Segmentation CVPR 2024 + + +
+ This paper tackles a novel yet challenging problem: how to transfer knowledge +from the emerging Segment Anything Model (SAM) -- which reveals impressive +zero-shot instance segmentation capacity -- to learn a compact panoramic +semantic segmentation model, i.e., student, without requiring any labeled data. +This poses considerable challenges due to SAM's inability to provide semantic +labels and the large capacity gap between SAM and the student. To this end, we +propose a novel framework, called GoodSAM, that introduces a teacher assistant +(TA) to provide semantic information, integrated with SAM to generate ensemble +logits to achieve knowledge transfer. Specifically, we propose a +Distortion-Aware Rectification (DAR) module that first addresses the distortion +problem of panoramic images by imposing prediction-level consistency and +boundary enhancement. This subtly enhances TA's prediction capacity on +panoramic images. DAR then incorporates a cross-task complementary fusion block +to adaptively merge the predictions of SAM and TA to obtain more reliable +ensemble logits. Moreover, we introduce a Multi-level Knowledge Adaptation +(MKA) module to efficiently transfer the multi-level feature knowledge from TA +and ensemble logits to learn a compact student model. Extensive experiments on +two benchmarks show that our GoodSAM achieves a remarkable +3.75\% mIoU +improvement over the state-of-the-art (SOTA) domain adaptation methods. Also, +our most lightweight model achieves comparable performance to the SOTA methods +with only 3.7M parameters. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Distilling Semantic Priors from SAM to Efficient Image Restoration + Models + + +
+ In image restoration (IR), leveraging semantic priors from segmentation +models has been a common approach to improve performance. The recent segment +anything model (SAM) has emerged as a powerful tool for extracting advanced +semantic priors to enhance IR tasks. However, the computational cost of SAM is +prohibitive for IR, compared to existing smaller IR models. The incorporation +of SAM for extracting semantic priors considerably hampers the model inference +efficiency. To address this issue, we propose a general framework to distill +SAM's semantic knowledge to boost existing IR models without interfering with +their inference process. Specifically, our proposed framework consists of the +semantic priors fusion (SPF) scheme and the semantic priors distillation (SPD) +scheme. SPF fuses two kinds of information, the restored image predicted +by the original IR model and the semantic mask predicted by SAM, to obtain the refined +restored image. SPD uses self-distillation to distill the fused +semantic priors and boost the performance of the original IR models. Additionally, +we design a semantic-guided relation (SGR) module for SPD, which ensures +semantic feature representation space consistency to fully distill the priors. +We demonstrate the effectiveness of our framework across multiple IR models and +tasks, including deraining, deblurring, and denoising. + +
+
+
+
+
+ + ☆ Generating Potent Poisons and Backdoors from Scratch with Guided + Diffusion + + +
+ Modern neural networks are often trained on massive datasets that are web +scraped with minimal human inspection. As a result of this insecure curation +pipeline, an adversary can poison or backdoor the resulting model by uploading +malicious data to the internet and waiting for a victim to scrape and train on +it. Existing approaches for creating poisons and backdoors start with randomly +sampled clean data, called base samples, and then modify those samples to craft +poisons. However, some base samples may be significantly more amenable to +poisoning than others. As a result, we may be able to craft more potent poisons +by carefully choosing the base samples. In this work, we use guided diffusion +to synthesize base samples from scratch that lead to significantly more potent +poisons and backdoors than previous state-of-the-art attacks. Our Guided +Diffusion Poisoning (GDP) base samples can be combined with any downstream +poisoning or backdoor attack to boost its effectiveness. Our implementation +code is publicly available at: https://github.com/hsouri/GDP . + +
+
+
+
+
+ + ☆ RSTAR: Rotational Streak Artifact Reduction in 4D CBCT using Separable + and Circular Convolutions + + +
+ Four-dimensional cone-beam computed tomography (4D CBCT) provides +respiration-resolved images and can be used for image-guided radiation therapy. +However, the ability to reveal respiratory motion comes at the cost of image +artifacts. As raw projection data are sorted into multiple respiratory phases, +there is a limited number of cone-beam projections available for image +reconstruction. Consequently, the 4D CBCT images are covered by severe streak +artifacts. Although several deep learning-based methods have been proposed to +address this issue, most algorithms employ ordinary network models, neglecting +the intrinsic structural prior within 4D CBCT images. In this paper, we first +explore the origin and appearance of streak artifacts in 4D CBCT +images. Specifically, we find that streak artifacts exhibit a periodic +rotational motion along with the patient's respiration. This unique motion +pattern inspires us to distinguish the artifacts from the desired anatomical +structures in the spatiotemporal domain. Thereafter, we propose a +spatiotemporal neural network named RSTAR-Net with separable and circular +convolutions for Rotational Streak Artifact Reduction. The specially designed +model effectively encodes dynamic image features, facilitating the recovery of +4D CBCT images. Moreover, RSTAR-Net is also lightweight and computationally +efficient. Extensive experiments substantiate the effectiveness of our proposed +method, and RSTAR-Net shows superior performance to the comparison methods. + +
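+ The "separable and circular" ingredient can be illustrated with standard
+ PyTorch layers (a schematic block, not RSTAR-Net itself; channel counts and
+ shapes are arbitrary): a 2D spatial convolution applied per phase, followed by
+ a 1D convolution over the respiratory-phase axis with circular padding, since
+ the phases are cyclic.
+
+ import torch
+ import torch.nn as nn
+
+ class SeparableCircularBlock(nn.Module):
+     def __init__(self, ch):
+         super().__init__()
+         self.spatial = nn.Conv2d(ch, ch, kernel_size=3, padding=1)
+         self.temporal = nn.Conv1d(ch, ch, kernel_size=3, padding=1, padding_mode="circular")
+     def forward(self, x):                               # x: (B, C, P, H, W)
+         b, c, p, h, w = x.shape
+         y = self.spatial(x.permute(0, 2, 1, 3, 4).reshape(b * p, c, h, w))
+         y = y.reshape(b, p, c, h, w).permute(0, 2, 1, 3, 4)
+         z = y.permute(0, 3, 4, 1, 2).reshape(b * h * w, c, p)   # fold space into batch
+         z = self.temporal(z).reshape(b, h, w, c, p).permute(0, 3, 4, 1, 2)
+         return z
+
+ x = torch.randn(1, 8, 10, 32, 32)                       # batch, channels, phases, H, W
+ print(SeparableCircularBlock(8)(x).shape)               # torch.Size([1, 8, 10, 32, 32])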
+
+
+
+
+ + ☆ ChebMixer: Efficient Graph Representation Learning with MLP Mixer + + +
+ Graph neural networks have achieved remarkable success in learning graph +representations, especially the graph Transformer, which has recently shown +superior performance on various graph mining tasks. However, the graph Transformer +generally treats nodes as tokens, which results in quadratic complexity +regarding the number of nodes during self-attention computation. The graph MLP +Mixer addresses this challenge by using the efficient MLP Mixer technique from +computer vision. However, the time-consuming process of extracting graph tokens +limits its performance. In this paper, we present a novel architecture named +ChebMixer, a new graph MLP Mixer that uses fast Chebyshev polynomial-based +spectral filtering to extract a sequence of tokens. Firstly, we produce +multiscale representations of graph nodes via fast Chebyshev polynomial-based +spectral filtering. Next, we consider each node's multiscale representations as +a sequence of tokens and refine the node representation with an effective MLP +Mixer. Finally, we aggregate the multiscale representations of nodes through +Chebyshev interpolation. Owing to the powerful representation capabilities and +fast computational properties of MLP Mixer, we can quickly extract more +informative node representations to improve the performance of downstream +tasks. The experimental results demonstrate significant improvements in a variety +of scenarios ranging from graph node classification to medical image +segmentation. + +
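+ The token-extraction ingredient is easy to show concretely: Chebyshev
+ filtering uses the recurrence T_0(X) = X, T_1(X) = L~X, T_k(X) = 2 L~ T_{k-1}(X) - T_{k-2}(X)
+ on a rescaled graph Laplacian. A minimal sketch follows (toy graph, lambda_max
+ assumed to be about 2; this is not the ChebMixer code itself):
+
+ import torch
+
+ def chebyshev_tokens(adj, feats, K=3):
+     """Return [T_0 X, ..., T_K X] as multiscale node representations."""
+     n = adj.shape[0]
+     deg = adj.sum(dim=1).clamp(min=1e-9)
+     d_inv_sqrt = torch.diag(deg.pow(-0.5))
+     lap = torch.eye(n) - d_inv_sqrt @ adj @ d_inv_sqrt   # normalized Laplacian
+     lap_tilde = lap - torch.eye(n)                       # 2L/lambda_max - I with lambda_max ~= 2
+     tokens = [feats, lap_tilde @ feats]
+     for _ in range(2, K + 1):
+         tokens.append(2 * lap_tilde @ tokens[-1] - tokens[-2])
+     return tokens
+
+ adj = torch.tensor([[0., 1., 0.], [1., 0., 1.], [0., 1., 0.]])   # 3-node path graph
+ x = torch.randn(3, 4)
+ print([t.shape for t in chebyshev_tokens(adj, x)])               # 4 scales of shape (3, 4)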
+
+
+
+
+ + ☆ 3D-EffiViTCaps: 3D Efficient Vision Transformer with Capsule for Medical + Image Segmentation ICPR2024 + + +
+ Medical image segmentation (MIS) aims to finely segment various organs. It +requires grasping global information from both local parts and the entire image for +better segmentation, and clinically there are often certain requirements for +segmentation efficiency. Convolutional neural networks (CNNs) have made +considerable achievements in MIS. However, they struggle to fully collect +global context information, and their pooling layer may cause information loss. +Capsule networks, which combine the benefits of CNNs while taking into account +additional information such as relative location that CNNs do not, have lately +demonstrated some advantages in MIS. Vision Transformer (ViT) employs +transformers in visual tasks. The Transformer, based on the attention mechanism, has +excellent global inductive modeling capabilities and is expected to capture +long-range information. Moreover, there have been recent studies on making ViT +more lightweight to minimize model complexity and increase efficiency. In this +paper, we propose a U-shaped 3D encoder-decoder network named 3D-EffiViTCaps, +which combines 3D capsule blocks with 3D EfficientViT blocks for MIS. Our +encoder uses capsule blocks and EfficientViT blocks to jointly capture local +and global semantic information more effectively and efficiently with less +information loss, while the decoder employs CNN blocks and EfficientViT blocks +to capture finer details for segmentation. We conduct experiments on various +datasets, including iSeg-2017, Hippocampus, and Cardiac, to verify the +performance and efficiency of 3D-EffiViTCaps, which performs better than +previous 3D CNN-based, 3D Capsule-based and 3D Transformer-based models. We +further implement a series of ablation experiments on the main blocks. Our code +is available at: https://github.com/HidNeuron/3D-EffiViTCaps. + +
+
+ comment: 15 pages, 4 figures, submitted to ICPR2024 +
+
+
+
+
+ + ☆ Impact of Video Compression Artifacts on Fisheye Camera Visual + Perception Tasks + + +
+ Autonomous driving systems require extensive data collection schemes to cover
+the diverse scenarios needed for building a robust and safe system. The data
+volumes are in the order of Exabytes and have to be stored for a long period of
+time (i.e., more than 10 years of the vehicle's life cycle). Lossless
+compression doesn't provide sufficient compression ratios, hence, lossy video
+compression has been explored. It is essential to prove that lossy video
+compression artifacts do not impact the performance of the perception
+algorithms. However, there is limited work in this area to provide a solid
+conclusion. In particular, there is no such work for fisheye cameras, which
+have high radial distortion and where compression may have higher artifacts.
+Fisheye cameras are commonly used in automotive systems for the 3D object
+detection task. In this work, we provide the first analysis of the impact of
+standard video compression codecs on wide FOV fisheye camera images. We
+demonstrate that the achievable compression with negligible impact depends on
+the dataset and temporal prediction of the video codec. We propose a radial
+distortion-aware zonal metric to evaluate the impact of artifacts in fisheye
+images. In addition, we present a novel method for estimating affine mode
+parameters of the latest VVC codec, and suggest some areas for improvement in
+video codecs for the application to fisheye imagery.
+
+
+
+
+
+
+ + ☆ MEDDAP: Medical Dataset Enhancement via Diversified Augmentation + Pipeline MICCAI-2024 + + +
+ The effectiveness of Deep Neural Networks (DNNs) heavily relies on the +abundance and accuracy of available training data. However, collecting and +annotating data on a large scale is often both costly and time-intensive, +particularly in medical cases where practitioners are already occupied with +their duties. Moreover, ensuring that the model remains robust across various +scenarios of image capture is crucial in medical domains, especially when +dealing with ultrasound images that vary based on the settings of different +devices and the manual operation of the transducer. To address this challenge, +we introduce a novel pipeline called MEDDAP, which leverages Stable Diffusion +(SD) models to augment existing small datasets by automatically generating new +informative labeled samples. Pretrained checkpoints for SD are typically based +on natural images, and training them for medical images requires significant +GPU resources due to their heavy parameters. To overcome this challenge, we +introduce USLoRA (Ultrasound Low-Rank Adaptation), a novel fine-tuning method +tailored specifically for ultrasound applications. USLoRA allows for selective +fine-tuning of weights within SD, requiring fewer than 0.1\% of parameters +compared to fully fine-tuning only the UNet portion of SD. To enhance dataset +diversity, we incorporate different adjectives into the generation process +prompts, thereby desensitizing the classifiers to intensity changes across +different images. This approach is inspired by clinicians' decision-making +processes regarding breast tumors, where tumor shape often plays a more crucial +role than intensity. In conclusion, our pipeline not only outperforms +classifiers trained on the original dataset but also demonstrates superior +performance when encountering unseen datasets. The source code is available at +https://github.com/yasamin-med/MEDDAP. + +
+
+ comment: Submitted to MICCAI-2024
+
+
+
+
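+ The abstract does not spell out USLoRA, but the low-rank adaptation idea it
+builds on can be illustrated with a generic LoRA-style adapter for one linear
+layer (a sketch of the general technique, not the USLoRA code; rank and
+scaling values are placeholders):
+
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    """y = W0 x + (alpha / r) * B (A x); only A and B are trained."""
+    def __init__(self, base: nn.Linear, r=4, alpha=4.0):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad = False                   # freeze pretrained weights
+        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
+        self.B = nn.Parameter(torch.zeros(base.out_features, r))
+        self.scale = alpha / r
+
+    def forward(self, x):
+        return self.base(x) + self.scale * (x @ self.A.T) @ self.B.T
+
+layer = LoRALinear(nn.Linear(320, 320), r=4)
+trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
+print(trainable)   # a tiny fraction of the frozen base layer's parameters
+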
+ + ☆ Decoding the visual attention of pathologists to reveal their level of + expertise + + +
+ We present a method for classifying the expertise of a pathologist based on
+how they allocated their attention during a cancer reading. We engage this
+decoding task by developing a novel method for predicting the attention of
+pathologists as they read whole-slide images (WSIs) of prostate and make cancer
+grade classifications. Our ground truth measure of a pathologist's attention is
+the x, y and z (magnification) movement of their viewport as they navigated
+through WSIs during readings, and to date we have the attention behavior of 43
+pathologists reading 123 WSIs. These data revealed that specialists have higher
+agreement in both their attention and cancer grades compared to general
+pathologists and residents, suggesting that sufficient information may exist in
+their attention behavior to classify their expertise level. To attempt this, we
+trained a transformer-based model to predict the visual attention heatmaps of
+resident, general, and specialist (GU) pathologists during Gleason grading.
+Based solely on a pathologist's attention during a reading, our model was able
+to predict their level of expertise with 75.3%, 56.1%, and 77.2% accuracy,
+respectively, better than chance and baseline models. Our model therefore
+enables a pathologist's expertise level to be easily and objectively evaluated,
+important for pathology training and competency assessment. Tools developed
+from our model could also be used to help pathology trainees learn how to read
+WSIs like an expert.
+
+
+
+
+
+
+ + ☆ DreamPolisher: Towards High-Quality Text-to-3D Generation via Geometric + Diffusion + + +
+ We present DreamPolisher, a novel Gaussian Splatting based method with
+geometric guidance, tailored to learn cross-view consistency and intricate
+detail from textual descriptions. While recent progress on text-to-3D
+generation methods has been promising, prevailing methods often fail to ensure
+view-consistency and textural richness. This problem becomes particularly
+noticeable for methods that work with text input alone. To address this, we
+propose a two-stage Gaussian Splatting based approach that enforces geometric
+consistency among views. Initially, a coarse 3D generation undergoes refinement
+via geometric optimization. Subsequently, we use a ControlNet driven refiner
+coupled with the geometric consistency term to improve both texture fidelity
+and overall consistency of the generated 3D asset. Empirical evaluations across
+diverse textual prompts spanning various object categories demonstrate the
+efficacy of DreamPolisher in generating consistent and realistic 3D objects,
+aligning closely with the semantics of the textual instructions.
+
+
+
+ comment: Project webpage: https://yuanze-lin.me/DreamPolisher_page/ +
+
+
+
+
+ + ☆ Co-Occurring of Object Detection and Identification towards unlabeled + object discovery + + +
+ In this paper, we propose a novel deep learning-based approach for
+identifying co-occurring objects in conjunction with base objects in multilabel
+object categories. With the advancement of computer vision-based techniques,
+knowledge of the objects that co-occur with a base object is needed for various
+purposes. The pipeline of the proposed work is composed of two stages: in the
+first stage of the proposed model we detect all the bounding boxes present in
+the image and their corresponding labels, then in the second stage we perform
+co-occurrence matrix analysis. In co-occurrence matrix analysis, we set base
+classes based on the maximum occurrences of the labels, build association
+rules, and generate frequent patterns. These frequent patterns will show base
+classes and their corresponding co-occurring classes. We performed our
+experiments on two publicly available datasets: Pascal VOC and MS-COCO. The
+experimental results on the public benchmark datasets are reported in Section
+4. Further, we extend this work by considering all frequently co-occurring
+objects as unlabeled and examining the case in which they are occluded as well.
+
+
+
+ comment: 6 pages, 2 figures, +
+
+
+
+
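+ The second-stage analysis described above can be illustrated with a simple
+pairwise co-occurrence count over detector outputs (class names are made up;
+the paper's full association-rule mining is reduced here to pair counting):
+
+from collections import Counter
+from itertools import combinations
+
+def cooccurrence(image_labels):
+    """Count per-class frequencies and same-image class pairs."""
+    pair_counts, class_counts = Counter(), Counter()
+    for labels in image_labels:
+        uniq = sorted(set(labels))
+        class_counts.update(uniq)
+        pair_counts.update(combinations(uniq, 2))
+    return class_counts, pair_counts
+
+detections = [["person", "dog", "ball"], ["person", "bicycle"], ["person", "dog"]]
+classes, pairs = cooccurrence(detections)
+base = classes.most_common(1)[0][0]     # most frequent label becomes the base class
+co = {b if a == base else a: n for (a, b), n in pairs.items() if base in (a, b)}
+print(base, co)                         # base class and its co-occurring classes
+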
+ + ☆ DiffusionAct: Controllable Diffusion Autoencoder for One-shot Face + Reenactment + + +
+ Video-driven neural face reenactment aims to synthesize realistic facial +images that successfully preserve the identity and appearance of a source face, +while transferring the target head pose and facial expressions. Existing +GAN-based methods suffer from either distortions and visual artifacts or poor +reconstruction quality, i.e., the background and several important appearance +details, such as hair style/color, glasses and accessories, are not faithfully +reconstructed. Recent advances in Diffusion Probabilistic Models (DPMs) enable +the generation of high-quality realistic images. To this end, in this paper we +present DiffusionAct, a novel method that leverages the photo-realistic image +generation of diffusion models to perform neural face reenactment. +Specifically, we propose to control the semantic space of a Diffusion +Autoencoder (DiffAE), in order to edit the facial pose of the input images, +defined as the head pose orientation and the facial expressions. Our method +allows one-shot, self, and cross-subject reenactment, without requiring +subject-specific fine-tuning. We compare against state-of-the-art GAN-, +StyleGAN2-, and diffusion-based methods, showing better or on-par reenactment +performance. + +
+
+ comment: Project page: https://stelabou.github.io/diffusionact/ +
+
+
+
+
+ + ☆ AnimateMe: 4D Facial Expressions via Diffusion Models + + +
+ The field of photorealistic 3D avatar reconstruction and generation has +garnered significant attention in recent years; however, animating such avatars +remains challenging. Recent advances in diffusion models have notably enhanced +the capabilities of generative models in 2D animation. In this work, we +directly utilize these models within the 3D domain to achieve controllable and +high-fidelity 4D facial animation. By integrating the strengths of diffusion +processes and geometric deep learning, we employ Graph Neural Networks (GNNs) +as denoising diffusion models in a novel approach, formulating the diffusion +process directly on the mesh space and enabling the generation of 3D facial +expressions. This facilitates the generation of facial deformations through a +mesh-diffusion-based model. Additionally, to ensure temporal coherence in our +animations, we propose a consistent noise sampling method. Under a series of +both quantitative and qualitative experiments, we showcase that the proposed +method outperforms prior work in 4D expression synthesis by generating +high-fidelity extreme expressions. Furthermore, we applied our method to +textured 4D facial expression generation, implementing a straightforward +extension that involves training on a large-scale textured 4D facial expression +database. + +
+
+
+
+
+ + ☆ Strategies to Improve Real-World Applicability of Laparoscopic Anatomy + Segmentation Models + + +
+ Accurate identification and localization of anatomical structures of varying +size and appearance in laparoscopic imaging are necessary to leverage the +potential of computer vision techniques for surgical decision support. +Segmentation performance of such models is traditionally reported using metrics +of overlap such as IoU. However, imbalanced and unrealistic representation of +classes in the training data and suboptimal selection of reported metrics have +the potential to skew nominal segmentation performance and thereby ultimately +limit clinical translation. In this work, we systematically analyze the impact +of class characteristics (i.e., organ size differences), training and test data +composition (i.e., representation of positive and negative examples), and +modeling parameters (i.e., foreground-to-background class weight) on eight +segmentation metrics: accuracy, precision, recall, IoU, F1 score, specificity, +Hausdorff Distance, and Average Symmetric Surface Distance. Based on our +findings, we propose two simple yet effective strategies to improve real-world +applicability of image segmentation models in laparoscopic surgical data: (1) +inclusion of negative examples in the training process and (2) adaptation of +foreground-background weights in segmentation models to maximize model +performance with respect to specific metrics of interest, depending on the +clinical use case. + +
+
+ comment: 13 pages, 5 figures, 4 tables +
+
+
+
+
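+ Strategy (2) above amounts to re-weighting the foreground class in the
+segmentation loss. A minimal PyTorch sketch (the weight value is an arbitrary
+placeholder to be tuned against the metric of interest, per the abstract):
+
+import torch
+import torch.nn as nn
+
+fg_weight = 5.0                                # foreground-to-background weight
+criterion = nn.CrossEntropyLoss(weight=torch.tensor([1.0, fg_weight]))
+
+logits = torch.randn(2, 2, 128, 128)           # (batch, classes, H, W)
+target = torch.randint(0, 2, (2, 128, 128))    # 0 = background, 1 = organ
+print(criterion(logits, target).item())
+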
+ + ☆ LOTUS: Evasive and Resilient Backdoor Attacks through Sub-Partitioning CVPR + 2024 + + +
+ Backdoor attacks pose a significant security threat to Deep Learning
+applications. Existing attacks are often not evasive to established backdoor
+detection techniques. This susceptibility primarily stems from the fact that
+these attacks typically leverage a universal trigger pattern or transformation
+function, such that the trigger can cause misclassification for any input. In
+response to this, recent papers have introduced attacks using sample-specific
+invisible triggers crafted through special transformation functions. While
+these approaches manage to evade detection to some extent, they reveal
+vulnerability to existing backdoor mitigation techniques. To address and
+enhance both evasiveness and resilience, we introduce a novel backdoor attack
+LOTUS. Specifically, it leverages a secret function to separate samples in the
+victim class into a set of partitions and applies unique triggers to different
+partitions. Furthermore, LOTUS incorporates an effective trigger focusing
+mechanism, ensuring only the trigger corresponding to the partition can induce
+the backdoor behavior. Extensive experimental results show that LOTUS can
+achieve high attack success rates across 4 datasets and 7 model structures, and
+effectively evades 13 backdoor detection and mitigation techniques. The code
+is available at https://github.com/Megum1/LOTUS.
+
+
+
+ comment: IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR + 2024) +
+
+
+
+
+ + ☆ Brain Stroke Segmentation Using Deep Learning Models: A Comparative + Study + + +
+ Stroke segmentation plays a crucial role in the diagnosis and treatment of
+stroke patients by providing spatial information about affected brain regions
+and the extent of damage. Segmenting stroke lesions accurately is a challenging
+task, given that conventional manual techniques are time-consuming and prone to
+errors. Recently, advanced deep models have been introduced for general medical
+image segmentation, demonstrating promising results that surpass many
+state-of-the-art networks when evaluated on specific datasets. With the advent
+of the vision Transformers, several models have been introduced based on them,
+while others have aimed to design better modules based on traditional
+convolutional layers to extract long-range dependencies like Transformers. The
+question of whether such high-level designs are necessary for all segmentation
+cases to achieve the best results remains unanswered. In this study, we
+selected four types of deep models that were recently proposed and evaluated
+their performance for stroke segmentation: a pure Transformer-based
+architecture (DAE-Former), two advanced CNN-based models (LKA and DLKA) with
+attention mechanisms in their design, an advanced hybrid model that
+incorporates CNNs with Transformers (FCT), and the well-known self-adaptive
+nnUNet framework with its configuration based on given data. We examined their
+performance on two publicly available datasets, and found that the nnUNet
+achieved the best results with the simplest design among all. The revealed
+lack of robustness of Transformers to such variabilities is a potential reason
+for their weaker performance. Furthermore, nnUNet's success underscores the
+significant impact of preprocessing and postprocessing techniques in enhancing
+segmentation results, surpassing the focus solely on architectural designs.
+
+
+
+
+
+
+ + ☆ Histogram Layers for Neural Engineered Features + + +
+ In the computer vision literature, many effective histogram-based features
+have been developed. These engineered features include local binary patterns
+and edge histogram descriptors, among others, and they have been shown to be
+informative features for a variety of computer vision tasks. In this paper, we
+explore whether these features can be learned through histogram layers embedded
+in a neural network and, therefore, be leveraged within deep learning
+frameworks. By using histogram features, local statistics of the feature maps
+from convolutional neural networks can be used to better represent the data.
+We present neural versions of local binary pattern and edge histogram
+descriptors that jointly improve the feature representation and perform image
+classification. Experiments are presented on benchmark and real-world datasets.
+
+
+
+ comment: 11 pages, 7 figures, submitted for review +
+
+
+
+
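+ A generic differentiable histogram layer of the kind discussed above can be
+written with RBF soft binning; this is a common formulation shown for
+illustration, not necessarily the authors' exact layer:
+
+import torch
+import torch.nn as nn
+
+class SoftHistogram(nn.Module):
+    """Per-channel soft histogram of feature-map values with learnable
+    bin centers and widths, so it can be trained end-to-end."""
+    def __init__(self, bins=8, vmin=0.0, vmax=1.0):
+        super().__init__()
+        self.centers = nn.Parameter(torch.linspace(vmin, vmax, bins))
+        self.width = nn.Parameter(torch.full((bins,), (vmax - vmin) / bins))
+
+    def forward(self, x):                      # x: (batch, channels, H, W)
+        d = x.unsqueeze(-1) - self.centers     # (batch, C, H, W, bins)
+        weights = torch.exp(-(d / self.width) ** 2)
+        return weights.mean(dim=(2, 3))        # local statistics per channel
+
+feat = torch.rand(1, 4, 16, 16)
+print(SoftHistogram(bins=8)(feat).shape)       # torch.Size([1, 4, 8])
+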
+ + ☆ Engagement Measurement Based on Facial Landmarks and Spatial-Temporal + Graph Convolutional Networks + + +
+ Engagement in virtual learning is crucial for a variety of factors including
+learner satisfaction, performance, and compliance with learning programs, but
+measuring it is a challenging task. There is therefore considerable interest in
+utilizing artificial intelligence and affective computing to measure engagement
+in natural settings as well as on a large scale. This paper introduces a novel,
+privacy-preserving method for engagement measurement from videos. It uses
+facial landmarks, which carry no personally identifiable information, extracted
+from videos via the MediaPipe deep learning solution. The extracted facial
+landmarks are fed to a Spatial-Temporal Graph Convolutional Network (ST-GCN) to
+output the engagement level of the learner in the video. To integrate the
+ordinal nature of the engagement variable into the training process, ST-GCNs
+undergo training in a novel ordinal learning framework based on transfer
+learning. Experimental results on two video student engagement measurement
+datasets show the superiority of the proposed method compared to previous
+methods with improved state-of-the-art on the EngageNet dataset with a 3.1%
+improvement in four-class engagement level classification accuracy and on the
+Online Student Engagement dataset with a 1.5% improvement in binary engagement
+classification accuracy. The relatively lightweight ST-GCN and its integration
+with the real-time MediaPipe deep learning solution make the proposed approach
+capable of being deployed on virtual learning platforms and measuring
+engagement in real time.
+
+
+
+
+
+
+ + ☆ Task2Box: Box Embeddings for Modeling Asymmetric Task Relationships + + +
+ Modeling and visualizing relationships between tasks or datasets is an
+important step towards solving various meta-tasks such as dataset discovery,
+multi-tasking, and transfer learning. However, many relationships, such as
+containment and transferability, are naturally asymmetric and current
+approaches for representation and visualization (e.g., t-SNE) do not readily
+support this. We propose Task2Box, an approach to represent tasks using box
+embeddings -- axis-aligned hyperrectangles in low dimensional spaces -- that
+can capture asymmetric relationships between them through volumetric overlaps.
+We show that Task2Box accurately predicts unseen hierarchical relationships
+between nodes in ImageNet and iNaturalist datasets, as well as transferability
+between tasks in the Taskonomy benchmark. We also show that box embeddings
+estimated from task representations (e.g., CLIP, Task2Vec, or attribute-based)
+can be used to predict relationships between unseen tasks more accurately than
+classifiers trained on the same representations, as well as handcrafted
+asymmetric distances (e.g., KL divergence). This suggests that low-dimensional
+box embeddings can effectively capture these task relationships and have the
+added advantage of being interpretable. We use the approach to visualize
+relationships among publicly available image classification datasets on the
+popular dataset hosting platform Hugging Face.
+
+
+
+
+
+
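+ The asymmetry mentioned above comes directly from box geometry: the fraction
+of one box's volume covered by another is not symmetric. A small NumPy sketch
+of that overlap score (Task2Box learns the boxes themselves; the boxes below
+are hand-picked for illustration):
+
+import numpy as np
+
+def overlap_score(box_a, box_b):
+    """Fraction of box_a's volume covered by box_b (axis-aligned boxes
+    given as (lower, upper) corner arrays)."""
+    lo_a, hi_a = box_a
+    lo_b, hi_b = box_b
+    inter = np.clip(np.minimum(hi_a, hi_b) - np.maximum(lo_a, lo_b), 0, None)
+    vol_a = np.prod(hi_a - lo_a)
+    return float(np.prod(inter) / vol_a) if vol_a > 0 else 0.0
+
+birds   = (np.array([0.2, 0.2]), np.array([0.4, 0.4]))   # contained task
+animals = (np.array([0.0, 0.0]), np.array([1.0, 1.0]))   # containing task
+print(overlap_score(birds, animals))   # 1.0  -> "birds" lies inside "animals"
+print(overlap_score(animals, birds))   # 0.04 -> the reverse relation is weak
+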
+ + ☆ Benchmarking Video Frame Interpolation + + +
+ Video frame interpolation, the task of synthesizing new frames in between two +or more given ones, is becoming an increasingly popular research target. +However, the current evaluation of frame interpolation techniques is not ideal. +Due to the plethora of test datasets available and inconsistent computation of +error metrics, a coherent and fair comparison across papers is very +challenging. Furthermore, new test sets have been proposed as part of method +papers so they are unable to provide the in-depth evaluation of a dedicated +benchmarking paper. Another severe downside is that these test sets violate the +assumption of linearity when given two input frames, making it impossible to +solve without an oracle. We hence strongly believe that the community would +greatly benefit from a benchmarking paper, which is what we propose. +Specifically, we present a benchmark which establishes consistent error metrics +by utilizing a submission website that computes them, provides insights by +analyzing the interpolation quality with respect to various per-pixel +attributes such as the motion magnitude, contains a carefully designed test set +adhering to the assumption of linearity by utilizing synthetic data, and +evaluates the computational efficiency in a coherent manner. + +
+
+ comment: http://sniklaus.com/vfibench +
+
+
+
+
+ + ☆ Animal Avatars: Reconstructing Animatable 3D Animals from Casual Videos + + +
+ We present a method to build animatable dog avatars from monocular videos.
+This is challenging as animals display a range of (unpredictable) non-rigid
+movements and have a variety of appearance details (e.g., fur, spots, tails).
+We develop an approach that links the video frames via a 4D solution that
+jointly solves for the animal's pose variation and its appearance (in a
+canonical pose). To this end, we significantly improve the quality of
+template-based shape fitting by endowing the SMAL parametric model with
+Continuous Surface Embeddings, which brings image-to-mesh reprojection
+constraints that are denser, and thus stronger, than the previously used sparse
+semantic keypoint correspondences. To model appearance, we propose an implicit
+duplex-mesh texture that is defined in the canonical pose, but can be deformed
+using SMAL pose coefficients and later rendered to enforce a photometric
+compatibility with the input video frames. On the challenging CoP3D and APTv2
+datasets, we demonstrate superior results (both in terms of pose estimates and
+predicted appearance) to existing template-free (RAC) and template-based
+approaches (BARC, BITE).
+
+
+
+
+
+
+ + ☆ SynFog: A Photo-realistic Synthetic Fog Dataset based on End-to-end + Imaging Simulation for Advancing Real-World Defogging in Autonomous Driving + + +
+ To advance research in learning-based defogging algorithms, various synthetic +fog datasets have been developed. However, existing datasets created using the +Atmospheric Scattering Model (ASM) or real-time rendering engines often +struggle to produce photo-realistic foggy images that accurately mimic the +actual imaging process. This limitation hinders the effective generalization of +models from synthetic to real data. In this paper, we introduce an end-to-end +simulation pipeline designed to generate photo-realistic foggy images. This +pipeline comprehensively considers the entire physically-based foggy scene +imaging process, closely aligning with real-world image capture methods. Based +on this pipeline, we present a new synthetic fog dataset named SynFog, which +features both sky light and active lighting conditions, as well as three levels +of fog density. Experimental results demonstrate that models trained on SynFog +exhibit superior performance in visual perception and detection accuracy +compared to others when applied to real-world foggy images. + +
+
+
+
+
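+ For reference, the Atmospheric Scattering Model that the abstract contrasts
+against is the simple per-pixel blend below; SynFog replaces this with full
+physically-based rendering of the imaging process (beta and airlight values
+here are arbitrary):
+
+import numpy as np
+
+def asm_fog(clear, depth, beta=0.05, airlight=0.9):
+    """I = J * t + A * (1 - t), with transmission t = exp(-beta * depth)."""
+    t = np.exp(-beta * depth)[..., None]        # per-pixel transmission
+    return clear * t + airlight * (1.0 - t)
+
+clear = np.random.rand(240, 320, 3)             # clear RGB image in [0, 1]
+depth = np.random.uniform(5, 80, (240, 320))    # depth map in metres
+foggy = asm_fog(clear, depth, beta=0.08)
+print(foggy.shape, float(foggy.min()), float(foggy.max()))
+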
+ + ☆ A Comparative Analysis of Visual Odometry in Virtual and Real-World + Railways Environments + + +
+ Perception tasks play a crucial role in the development of automated
+operations and systems across multiple application fields. In the railway
+transportation domain, these tasks can improve the safety, reliability, and
+efficiency of various operations, including train localization, signal
+recognition, and track discrimination. However, collecting considerable and
+precisely labeled datasets for testing such novel algorithms poses extreme
+challenges in the railway environment due to the severe restrictions in
+accessing the infrastructures and the practical difficulties associated with
+properly equipping trains with the required sensors, such as cameras and
+LiDARs. The remarkable innovations of graphic engine tools offer new solutions
+to craft realistic synthetic datasets. To illustrate the advantages of
+employing graphic simulation for early-stage testing of perception tasks in the
+railway domain, this paper presents a comparative analysis of the performance
+of a SLAM algorithm applied both in a virtual synthetic environment and a
+real-world scenario. The analysis leverages virtual railway environments
+created with the latest version of Unreal Engine, facilitating data collection
+and allowing the examination of challenging scenarios, including
+low-visibility, dangerous operational modes, and complex environments. The
+results highlight the feasibility and potential of graphic simulation to
+advance perception tasks in the railway domain.
+
+
+
+
+
+
+ + ☆ A Study in Dataset Pruning for Image Super-Resolution + + +
+ In image Super-Resolution (SR), relying on large datasets for training is a +double-edged sword. While offering rich training material, they also demand +substantial computational and storage resources. In this work, we analyze +dataset pruning as a solution to these challenges. We introduce a novel +approach that reduces a dataset to a core-set of training samples, selected +based on their loss values as determined by a simple pre-trained SR model. By +focusing the training on just 50% of the original dataset, specifically on the +samples characterized by the highest loss values, we achieve results comparable +to or even surpassing those obtained from training on the entire dataset. +Interestingly, our analysis reveals that the top 5% of samples with the highest +loss values negatively affect the training process. Excluding these samples and +adjusting the selection to favor easier samples further enhances training +outcomes. Our work opens new perspectives to the untapped potential of dataset +pruning in image SR. It suggests that careful selection of training data based +on loss-value metrics can lead to better SR models, challenging the +conventional wisdom that more data inevitably leads to better performance. + +
+
+
+
+
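+ The selection rule described above (keep the hardest 50%, but drop the top 5%
+of loss values first) is easy to reproduce once per-sample losses from a small
+pre-trained SR model are available; a sketch with placeholder numbers:
+
+import numpy as np
+
+def prune_by_loss(losses, keep_frac=0.5, drop_top_frac=0.05):
+    """Return indices of the core-set: hardest samples, minus loss outliers."""
+    order = np.argsort(losses)[::-1]                    # hardest first
+    order = order[int(drop_top_frac * len(order)):]     # drop top-loss outliers
+    return order[: int(keep_frac * len(losses))]
+
+losses = np.random.rand(1000)        # stand-in for per-sample losses
+core_set = prune_by_loss(losses)
+print(len(core_set))                 # 500 of 1000 samples kept for training
+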
+ + ☆ Continuous, Subject-Specific Attribute Control in T2I Models by + Identifying Semantic Directions + + +
+ In recent years, advances in text-to-image (T2I) diffusion models have +substantially elevated the quality of their generated images. However, +achieving fine-grained control over attributes remains a challenge due to the +limitations of natural language prompts (such as no continuous set of +intermediate descriptions existing between ``person'' and ``old person''). Even +though many methods were introduced that augment the model or generation +process to enable such control, methods that do not require a fixed reference +image are limited to either enabling global fine-grained attribute expression +control or coarse attribute expression control localized to specific subjects, +not both simultaneously. We show that there exist directions in the commonly +used token-level CLIP text embeddings that enable fine-grained subject-specific +control of high-level attributes in text-to-image models. Based on this +observation, we introduce one efficient optimization-free and one robust +optimization-based method to identify these directions for specific attributes +from contrastive text prompts. We demonstrate that these directions can be used +to augment the prompt text input with fine-grained control over attributes of +specific subjects in a compositional manner (control over multiple attributes +of a single subject) without having to adapt the diffusion model. Project page: +https://compvis.github.io/attribute-control. Code is available at +https://github.com/CompVis/attribute-control. + +
+
+ comment: Project page: https://compvis.github.io/attribute-control +
+
+
+
+
+ + ☆ Calib3D: Calibrating Model Preferences for Reliable 3D Scene + Understanding + + +
+ Safety-critical 3D scene understanding tasks necessitate not only accurate +but also confident predictions from 3D perception models. This study introduces +Calib3D, a pioneering effort to benchmark and scrutinize the reliability of 3D +scene understanding models from an uncertainty estimation viewpoint. We +comprehensively evaluate 28 state-of-the-art models across 10 diverse 3D +datasets, uncovering insightful phenomena that cope with both the aleatoric and +epistemic uncertainties in 3D scene understanding. We discover that despite +achieving impressive levels of accuracy, existing models frequently fail to +provide reliable uncertainty estimates -- a pitfall that critically undermines +their applicability in safety-sensitive contexts. Through extensive analysis of +key factors such as network capacity, LiDAR representations, rasterization +resolutions, and 3D data augmentation techniques, we correlate these aspects +directly with the model calibration efficacy. Furthermore, we introduce DeptS, +a novel depth-aware scaling approach aimed at enhancing 3D model calibration. +Extensive experiments across a wide range of configurations validate the +superiority of our method. We hope this work could serve as a cornerstone for +fostering reliable 3D scene understanding. Code and benchmark toolkits are +publicly available. + +
+
+ comment: Preprint; 37 pages, 8 figures, 11 tables; Code at + https://github.com/ldkong1205/Calib3D +
+
+
+
+
+ + ☆ Optimizing LiDAR Placements for Robust Driving Perception in Adverse + Conditions + + +
+ The robustness of driving perception systems under unprecedented conditions
+is crucial for safety-critical usages. The latest advancements have prompted
+increasing interest in multi-LiDAR perception. However, prevailing
+driving datasets predominantly utilize single-LiDAR systems and collect data
+devoid of adverse conditions, failing to capture the complexities of real-world
+environments accurately. Addressing these gaps, we propose Place3D, a
+full-cycle pipeline that encompasses LiDAR placement optimization, data
+generation, and downstream evaluations. Our framework makes three appealing
+contributions. 1) To identify the most effective configurations for multi-LiDAR
+systems, we introduce a Surrogate Metric of the Semantic Occupancy Grids
+(M-SOG) to evaluate LiDAR placement quality. 2) Leveraging the M-SOG metric, we
+propose a novel optimization strategy to refine multi-LiDAR placements. 3)
+Centered around the theme of multi-condition multi-LiDAR perception, we collect
+a 364,000-frame dataset from both clean and adverse conditions. Extensive
+experiments demonstrate that LiDAR placements optimized using our approach
+outperform various baselines. We showcase exceptional robustness in both 3D
+object detection and LiDAR semantic segmentation tasks, under diverse adverse
+weather and sensor failure conditions. Code and benchmark toolkit are publicly
+available.
+
+
+
+ comment: Preprint; 40 pages, 11 figures, 15 tables; Code at + https://github.com/ywyeli/Place3D +
+
+
+
+
+ + ☆ FlashFace: Human Image Personalization with High-fidelity Identity + Preservation + + +
+ This work presents FlashFace, a practical tool with which users can easily +personalize their own photos on the fly by providing one or a few reference +face images and a text prompt. Our approach is distinguishable from existing +human photo customization methods by higher-fidelity identity preservation and +better instruction following, benefiting from two subtle designs. First, we +encode the face identity into a series of feature maps instead of one image +token as in prior arts, allowing the model to retain more details of the +reference faces (e.g., scars, tattoos, and face shape ). Second, we introduce a +disentangled integration strategy to balance the text and image guidance during +the text-to-image generation process, alleviating the conflict between the +reference faces and the text prompts (e.g., personalizing an adult into a +"child" or an "elder"). Extensive experimental results demonstrate the +effectiveness of our method on various applications, including human image +personalization, face swapping under language prompts, making virtual +characters into real people, etc. Project Page: +https://jshilong.github.io/flashface-page. + +
+
+ comment: Project Page:https://jshilong.github.io/flashface-page +
+
+
+
+
+ + ☆ DreamLIP: Language-Image Pre-training with Long Captions + + +
+ Language-image pre-training largely relies on how precisely and thoroughly a
+text describes its paired image. In practice, however, the contents of an image
+can be so rich that well describing them requires lengthy captions (e.g., with
+10 sentences), which are usually missing in existing datasets. Consequently,
+there is currently no clear evidence on whether and how language-image
+pre-training could benefit from long captions. To figure this out, we first
+re-caption 30M images with detailed descriptions using a pre-trained
+Multi-modality Large Language Model (MLLM), and then study the usage of the
+resulting captions under a contrastive learning framework. We observe that
+each sentence within a long caption is very likely to describe the image
+partially (e.g., an object). Motivated by this, we propose to dynamically
+sample sub-captions from the text label to construct multiple positive pairs,
+and introduce a grouping loss to match the embeddings of each sub-caption with
+its corresponding local image patches in a self-supervised manner. Experimental
+results on a wide range of downstream tasks demonstrate the consistent
+superiority of our method, termed DreamLIP, over previous alternatives,
+highlighting its fine-grained representational capacity. It is noteworthy that,
+on the tasks of image-text retrieval and semantic segmentation, our model
+trained with 30M image-text pairs achieves on par or even better performance
+than CLIP trained with 400M pairs. Project page is available at
+https://zyf0619sjtu.github.io/dream-lip.
+
+
+
+
+
+
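+ The sub-caption sampling step above can be approximated by splitting a long
+generated caption into sentences and drawing a few of them as extra positive
+texts for the paired image (a simplification of the paper's sampling scheme;
+the caption is invented for illustration):
+
+import random
+
+def sample_subcaptions(long_caption, k=3):
+    """Sample k sentences from a long caption as separate positive texts."""
+    sentences = [s.strip() for s in long_caption.split('.') if s.strip()]
+    return random.sample(sentences, min(k, len(sentences)))
+
+caption = ("A wooden table near a window. A blue ceramic mug sits on the table. "
+           "Sunlight falls across an open notebook. A small plant is in the corner.")
+for sub in sample_subcaptions(caption, k=2):
+    print(sub)   # each sub-caption forms its own (image, text) positive pair
+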
+ + ☆ Invertible Diffusion Models for Compressed Sensing + + +
+ While deep neural networks (NN) significantly advance image compressed +sensing (CS) by improving reconstruction quality, the necessity of training +current CS NNs from scratch constrains their effectiveness and hampers rapid +deployment. Although recent methods utilize pre-trained diffusion models for +image reconstruction, they struggle with slow inference and restricted +adaptability to CS. To tackle these challenges, this paper proposes Invertible +Diffusion Models (IDM), a novel efficient, end-to-end diffusion-based CS +method. IDM repurposes a large-scale diffusion sampling process as a +reconstruction model, and finetunes it end-to-end to recover original images +directly from CS measurements, moving beyond the traditional paradigm of +one-step noise estimation learning. To enable such memory-intensive end-to-end +finetuning, we propose a novel two-level invertible design to transform both +(1) the multi-step sampling process and (2) the noise estimation U-Net in each +step into invertible networks. As a result, most intermediate features are +cleared during training to reduce up to 93.8% GPU memory. In addition, we +develop a set of lightweight modules to inject measurements into noise +estimator to further facilitate reconstruction. Experiments demonstrate that +IDM outperforms existing state-of-the-art CS networks by up to 2.64dB in PSNR. +Compared to the recent diffusion model-based approach DDNM, our IDM achieves up +to 10.09dB PSNR gain and 14.54 times faster inference. + +
+
+
+
+
+ + ☆ TRIP: Temporal Residual Learning with Image Noise Prior for + Image-to-Video Diffusion Models CVPR 2024 + + +
+ Recent advances in text-to-video generation have demonstrated the utility of +powerful diffusion models. Nevertheless, the problem is not trivial when +shaping diffusion models to animate static image (i.e., image-to-video +generation). The difficulty originates from the aspect that the diffusion +process of subsequent animated frames should not only preserve the faithful +alignment with the given image but also pursue temporal coherence among +adjacent frames. To alleviate this, we present TRIP, a new recipe of +image-to-video diffusion paradigm that pivots on image noise prior derived from +static image to jointly trigger inter-frame relational reasoning and ease the +coherent temporal modeling via temporal residual learning. Technically, the +image noise prior is first attained through one-step backward diffusion process +based on both static image and noised video latent codes. Next, TRIP executes a +residual-like dual-path scheme for noise prediction: 1) a shortcut path that +directly takes image noise prior as the reference noise of each frame to +amplify the alignment between the first frame and subsequent frames; 2) a +residual path that employs 3D-UNet over noised video and static image latent +codes to enable inter-frame relational reasoning, thereby easing the learning +of the residual noise for each frame. Furthermore, both reference and residual +noise of each frame are dynamically merged via attention mechanism for final +video generation. Extensive experiments on WebVid-10M, DTDB and MSR-VTT +datasets demonstrate the effectiveness of our TRIP for image-to-video +generation. Please see our project page at https://trip-i2v.github.io/TRIP/. + +
+
+ comment: CVPR 2024; Project page: https://trip-i2v.github.io/TRIP/ +
+
+
+
+
+ + ☆ SD-DiT: Unleashing the Power of Self-supervised Discrimination in + Diffusion Transformer CVPR 2024 + + +
+ Diffusion Transformer (DiT) has emerged as the new trend of generative +diffusion models on image generation. In view of extremely slow convergence in +typical DiT, recent breakthroughs have been driven by mask strategy that +significantly improves the training efficiency of DiT with additional +intra-image contextual learning. Despite this progress, mask strategy still +suffers from two inherent limitations: (a) training-inference discrepancy and +(b) fuzzy relations between mask reconstruction & generative diffusion process, +resulting in sub-optimal training of DiT. In this work, we address these +limitations by novelly unleashing the self-supervised discrimination knowledge +to boost DiT training. Technically, we frame our DiT in a teacher-student +manner. The teacher-student discriminative pairs are built on the diffusion +noises along the same Probability Flow Ordinary Differential Equation (PF-ODE). +Instead of applying mask reconstruction loss over both DiT encoder and decoder, +we decouple DiT encoder and decoder to separately tackle discriminative and +generative objectives. In particular, by encoding discriminative pairs with +student and teacher DiT encoders, a new discriminative loss is designed to +encourage the inter-image alignment in the self-supervised embedding space. +After that, student samples are fed into student DiT decoder to perform the +typical generative diffusion task. Extensive experiments are conducted on +ImageNet dataset, and our method achieves a competitive balance between +training cost and generative capacity. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ VP3D: Unleashing 2D Visual Prompt for Text-to-3D Generation CVPR 2024 + + +
+ Recent innovations on text-to-3D generation have featured Score Distillation +Sampling (SDS), which enables the zero-shot learning of implicit 3D models +(NeRF) by directly distilling prior knowledge from 2D diffusion models. +However, current SDS-based models still struggle with intricate text prompts +and commonly result in distorted 3D models with unrealistic textures or +cross-view inconsistency issues. In this work, we introduce a novel Visual +Prompt-guided text-to-3D diffusion model (VP3D) that explicitly unleashes the +visual appearance knowledge in 2D visual prompt to boost text-to-3D generation. +Instead of solely supervising SDS with text prompt, VP3D first capitalizes on +2D diffusion model to generate a high-quality image from input text, which +subsequently acts as visual prompt to strengthen SDS optimization with explicit +visual appearance. Meanwhile, we couple the SDS optimization with additional +differentiable reward function that encourages rendering images of 3D models to +better visually align with 2D visual prompt and semantically match with text +prompt. Through extensive experiments, we show that the 2D Visual Prompt in our +VP3D significantly eases the learning of visual appearance of 3D models and +thus leads to higher visual fidelity with more detailed textures. It is also +appealing in view that when replacing the self-generating visual prompt with a +given reference image, VP3D is able to trigger a new task of stylized +text-to-3D generation. Our project page is available at +https://vp3d-cvpr24.github.io. + +
+
+ comment: CVPR 2024; Project page: https://vp3d-cvpr24.github.io +
+
+
+
+
+ + ☆ Learning Spatial Adaptation and Temporal Coherence in Diffusion Models + for Video Super-Resolution CVPR 2024 + + +
+ Diffusion models are just at a tipping point for image super-resolution task. +Nevertheless, it is not trivial to capitalize on diffusion models for video +super-resolution which necessitates not only the preservation of visual +appearance from low-resolution to high-resolution videos, but also the temporal +consistency across video frames. In this paper, we propose a novel approach, +pursuing Spatial Adaptation and Temporal Coherence (SATeCo), for video +super-resolution. SATeCo pivots on learning spatial-temporal guidance from +low-resolution videos to calibrate both latent-space high-resolution video +denoising and pixel-space video reconstruction. Technically, SATeCo freezes all +the parameters of the pre-trained UNet and VAE, and only optimizes two +deliberately-designed spatial feature adaptation (SFA) and temporal feature +alignment (TFA) modules, in the decoder of UNet and VAE. SFA modulates frame +features via adaptively estimating affine parameters for each pixel, +guaranteeing pixel-wise guidance for high-resolution frame synthesis. TFA +delves into feature interaction within a 3D local window (tubelet) through +self-attention, and executes cross-attention between tubelet and its +low-resolution counterpart to guide temporal feature alignment. Extensive +experiments conducted on the REDS4 and Vid4 datasets demonstrate the +effectiveness of our approach. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Visual CoT: Unleashing Chain-of-Thought Reasoning in Multi-Modal + Language Models + + +
+ This paper presents Visual CoT, a novel pipeline that leverages the reasoning +capabilities of multi-modal large language models (MLLMs) by incorporating +visual Chain-of-Thought (CoT) reasoning. While MLLMs have shown promise in +various visual tasks, they often lack interpretability and struggle with +complex visual inputs. To address these challenges, we propose a multi-turn +processing pipeline that dynamically focuses on visual inputs and provides +interpretable thoughts. We collect and introduce the Visual CoT dataset +comprising 373k question-answer pairs, annotated with intermediate bounding +boxes highlighting key regions essential for answering the questions. +Importantly, the introduced benchmark is capable of evaluating MLLMs in +scenarios requiring specific local region identification. Extensive experiments +demonstrate the effectiveness of our framework and shed light on better +inference strategies. The Visual CoT dataset, benchmark, and pre-trained models +are available to foster further research in this direction. + +
+
+ comment: Code: https://github.com/deepcs233/Visual-CoT +
+
+
+
+
+ + ☆ Understanding Long Videos in One Multimodal Language Model Pass + + +
+ Large Language Models (LLMs), known to contain a strong awareness of world +knowledge, have allowed recent approaches to achieve excellent performance on +Long-Video Understanding benchmarks, but at high inference costs. In this work, +we first propose Likelihood Selection, a simple technique that unlocks faster +inference in autoregressive LLMs for multiple-choice tasks common in long-video +benchmarks. In addition to faster inference, we discover the resulting models +to yield surprisingly good accuracy on long-video tasks, even with no video +specific information. Building on this, we inject video-specific object-centric +information extracted from off-the-shelf pre-trained models and utilize natural +language as a medium for information fusion. Our resulting Multimodal Video +Understanding (MVU) framework demonstrates state-of-the-art performance across +long-video and fine-grained action recognition benchmarks. Code available at: +https://github.com/kahnchana/mvu + +
+
+ comment: 24 pages +
+
+
+
+
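+ Likelihood Selection as described above can be sketched with any
+autoregressive LM: score each candidate answer by the language-model loss of
+the question-answer concatenation and pick the lowest. The snippet uses "gpt2"
+purely as a stand-in model and scores the full concatenation for simplicity,
+which is a simplification of the paper's procedure:
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tok = AutoTokenizer.from_pretrained("gpt2")
+model = AutoModelForCausalLM.from_pretrained("gpt2").eval()
+
+def score(question, answer):
+    ids = tok(question + " " + answer, return_tensors="pt").input_ids
+    with torch.no_grad():
+        return model(ids, labels=ids).loss.item()   # mean token NLL, lower = more likely
+
+question = "What is the person in the video doing?"
+options = ["riding a bicycle", "playing a piano", "baking bread"]
+print(min(options, key=lambda a: score(question, a)))
+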
+ + ☆ Composed Video Retrieval via Enriched Context and Discriminative + Embeddings CVPR-2024 + + +
+ Composed video retrieval (CoVR) is a challenging problem in computer vision
+which has recently highlighted the integration of modification text with visual
+queries for more sophisticated video search in large databases. Existing works
+predominantly rely on visual queries combined with modification text to
+distinguish relevant videos. However, such a strategy struggles to fully
+preserve the rich query-specific context in retrieved target videos and only
+represents the target video using visual embedding. We introduce a novel CoVR
+framework that leverages detailed language descriptions to explicitly encode
+query-specific contextual information and learns discriminative embeddings of
+vision only, text only and vision-text for better alignment to accurately
+retrieve matched target videos. Our proposed framework can be flexibly employed
+for both composed video (CoVR) and image (CoIR) retrieval tasks. Experiments on
+three datasets show that our approach obtains state-of-the-art performance for
+both CoVR and zero-shot CoIR tasks, achieving gains as high as around 7% in
+terms of recall@K=1 score. Our code, models, and detailed language descriptions
+for the WebViD-CoVR dataset are available at
+\url{https://github.com/OmkarThawakar/composed-video-retrieval}
+
+
+
+ comment: CVPR-2024 +
+
+
+
+
+ + ☆ DriveCoT: Integrating Chain-of-Thought Reasoning with End-to-End Driving + + +
+ End-to-end driving has made significant progress in recent years, +demonstrating benefits such as system simplicity and competitive driving +performance under both open-loop and closed-loop settings. Nevertheless, the +lack of interpretability and controllability in its driving decisions hinders +real-world deployment for end-to-end driving systems. In this paper, we collect +a comprehensive end-to-end driving dataset named DriveCoT, leveraging the CARLA +simulator. It contains sensor data, control decisions, and chain-of-thought +labels to indicate the reasoning process. We utilize the challenging driving +scenarios from the CARLA leaderboard 2.0, which involve high-speed driving and +lane-changing, and propose a rule-based expert policy to control the vehicle +and generate ground truth labels for its reasoning process across different +driving aspects and the final decisions. This dataset can serve as an open-loop +end-to-end driving benchmark, enabling the evaluation of accuracy in various +chain-of-thought aspects and the final decision. In addition, we propose a +baseline model called DriveCoT-Agent, trained on our dataset, to generate +chain-of-thought predictions and final decisions. The trained model exhibits +strong performance in both open-loop and closed-loop evaluations, demonstrating +the effectiveness of our proposed dataset. + +
+
+
+
+
+ + ☆ Mapping Image Transformations Onto Pixel Processor Arrays + + +
+ Pixel Processor Arrays (PPA) present a new vision sensor/processor +architecture consisting of a SIMD array of processor elements, each capable of +light capture, storage, processing and local communication. Such a device +allows visual data to be efficiently stored and manipulated directly upon the +focal plane, but also demands the invention of new approaches and algorithms, +suitable for the massively-parallel fine-grain processor arrays. In this paper +we demonstrate how various image transformations, including shearing, rotation +and scaling, can be performed directly upon a PPA. The implementation details +are presented using the SCAMP-5 vision chip, that contains a 256x256 +pixel-parallel array. Our approaches for performing the image transformations +efficiently exploit the parallel computation in a cellular processor array, +minimizing the number of SIMD instructions required. These fundamental image +transformations are vital building blocks for many visual tasks. This paper +aims to serve as a reference for future PPA research while demonstrating the +flexibility of PPA architectures. + +
+
+
+
+
+ + ☆ Comp4D: LLM-Guided Compositional 4D Scene Generation + + +
+ Recent advancements in diffusion models for 2D and 3D content creation have +sparked a surge of interest in generating 4D content. However, the scarcity of +3D scene datasets constrains current methodologies to primarily object-centric +generation. To overcome this limitation, we present Comp4D, a novel framework +for Compositional 4D Generation. Unlike conventional methods that generate a +singular 4D representation of the entire scene, Comp4D innovatively constructs +each 4D object within the scene separately. Utilizing Large Language Models +(LLMs), the framework begins by decomposing an input text prompt into distinct +entities and maps out their trajectories. It then constructs the compositional +4D scene by accurately positioning these objects along their designated paths. +To refine the scene, our method employs a compositional score distillation +technique guided by the pre-defined trajectories, utilizing pre-trained +diffusion models across text-to-image, text-to-video, and text-to-3D domains. +Extensive experiments demonstrate our outstanding 4D content creation +capability compared to prior arts, showcasing superior visual quality, motion +fidelity, and enhanced object interactions. + +
+
+ comment: Project page: https://vita-group.github.io/Comp4D/ +
+
+
+
+
+ + ☆ Be Yourself: Bounded Attention for Multi-Subject Text-to-Image + Generation + + +
+ Text-to-image diffusion models have an unprecedented ability to generate +diverse and high-quality images. However, they often struggle to faithfully +capture the intended semantics of complex input prompts that include multiple +subjects. Recently, numerous layout-to-image extensions have been introduced to +improve user control, aiming to localize subjects represented by specific +tokens. Yet, these methods often produce semantically inaccurate images, +especially when dealing with multiple semantically or visually similar +subjects. In this work, we study and analyze the causes of these limitations. +Our exploration reveals that the primary issue stems from inadvertent semantic +leakage between subjects in the denoising process. This leakage is attributed +to the diffusion model's attention layers, which tend to blend the visual +features of different subjects. To address these issues, we introduce Bounded +Attention, a training-free method for bounding the information flow in the +sampling process. Bounded Attention prevents detrimental leakage among subjects +and enables guiding the generation to promote each subject's individuality, +even with complex multi-subject conditioning. Through extensive +experimentation, we demonstrate that our method empowers the generation of +multiple subjects that better align with given prompts and layouts. + +
+
+ comment: Project page: https://omer11a.github.io/bounded-attention/ +
+
+
+
+
+ + ☆ Self-STORM: Deep Unrolled Self-Supervised Learning for Super-Resolution + Microscopy + + +
+ The use of fluorescent molecules to create long sequences of low-density,
+diffraction-limited images enables highly-precise molecule localization.
+However, this methodology requires lengthy imaging times, which limits the
+ability to view dynamic interactions of live cells on short time scales. Many
+techniques have been developed to reduce the number of frames needed for
+localization, from classic iterative optimization to deep neural networks.
+Particularly, deep algorithm unrolling utilizes both the structure of iterative
+sparse recovery algorithms and the performance gains of supervised deep
+learning. However, the robustness of this approach is highly dependent on
+having sufficient training data. In this paper we introduce deep unrolled
+self-supervised learning, which alleviates the need for such data by training a
+sequence-specific, model-based autoencoder that learns only from given
+measurements. Our proposed method exceeds the performance of its supervised
+counterparts, thus allowing for robust, dynamic imaging well below the
+diffraction limit without any labeled training samples. Furthermore, the
+suggested model-based autoencoder scheme can be utilized to enhance
+generalization in any sparse recovery framework, without the need for external
+training data.
+
+
+
+
+
+
+ + ☆ Joint chest X-ray diagnosis and clinical visual attention prediction + with multi-stage cooperative learning: enhancing interpretability + + +
+ As deep learning has become the state-of-the-art for computer-assisted +diagnosis, interpretability of the automatic decisions is crucial for clinical +deployment. While various methods were proposed in this domain, visual +attention maps of clinicians during radiological screening offer a unique asset +to provide important insights and can potentially enhance the quality of +computer-assisted diagnosis. With this paper, we introduce a novel +deep-learning framework for joint disease diagnosis and prediction of +corresponding visual saliency maps for chest X-ray scans. Specifically, we +designed a novel dual-encoder multi-task UNet, which leverages both a +DenseNet201 backbone and a Residual and Squeeze-and-Excitation block-based +encoder to extract diverse features for saliency map prediction, and a +multi-scale feature-fusion classifier to perform disease classification. To +tackle the issue of asynchronous training schedules of individual tasks in +multi-task learning, we proposed a multi-stage cooperative learning strategy, +with contrastive learning for feature encoder pretraining to boost performance. +Experiments show that our proposed method outperformed existing techniques for +chest X-ray diagnosis and the quality of visual saliency map prediction. + +
+
+
+
+
+ + ☆ Visual Whole-Body Control for Legged Loco-Manipulation + + +
+ We study the problem of mobile manipulation using legged robots equipped with
+an arm, namely legged loco-manipulation. The robot legs, while usually utilized
+for mobility, offer an opportunity to amplify the manipulation capabilities by
+conducting whole-body control. That is, the robot can control the legs and the
+arm at the same time to extend its workspace. We propose a framework that can
+conduct the whole-body control autonomously with visual observations. Our
+approach is composed of a low-level policy using all degrees of freedom to
+track the end-effector manipulator position and a high-level policy proposing
+the end-effector position based on visual inputs. We train both levels of
+policies in simulation and perform Sim2Real transfer for real robot deployment.
+We perform extensive experiments and show significant improvements over
+baselines in picking up diverse objects in different configurations (heights,
+locations, orientations) and environments. Project page:
+https://wholebody-b1.github.io
+
+
+
+ comment: The first two authors contribute equally. Project page: + https://wholebody-b1.github.io +
+
+
+
+
+ + ☆ GSDF: 3DGS Meets SDF for Improved Rendering and Reconstruction + + +
+ Presenting a 3D scene from multiview images remains a core and long-standing
+challenge in computer vision and computer graphics. Two main requirements lie
+in rendering and reconstruction. Notably, SOTA rendering quality is usually
+achieved with neural volumetric rendering techniques, which rely on aggregated
+point/primitive-wise color and neglect the underlying scene geometry. Learning
+of neural implicit surfaces was sparked by the success of neural rendering.
+Current works either constrain the distribution of density fields or the shape
+of primitives, resulting in degraded rendering quality and flaws on the learned
+scene surfaces. The efficacy of such methods is limited by the inherent
+constraints of the chosen neural representation, which struggles to capture
+fine surface details, especially for larger, more intricate scenes. To address
+these issues, we introduce GSDF, a novel dual-branch architecture that combines
+the benefits of a flexible and efficient 3D Gaussian Splatting (3DGS)
+representation with neural Signed Distance Fields (SDF). The core idea is to
+leverage and enhance the strengths of each branch while alleviating their
+limitations through mutual guidance and joint supervision. We show on diverse
+scenes that our design unlocks the potential for more accurate and detailed
+surface reconstructions, and at the same time benefits 3DGS rendering with
+structures that are more aligned with the underlying geometry.
+
+
+
+ comment: Project page: https://city-super.github.io/GSDF +
+
+
+
+
+ + ☆ TwinLiteNetPlus: A Stronger Model for Real-time Drivable Area and Lane + Segmentation + + +
+ Semantic segmentation is crucial for autonomous driving, particularly for +Drivable Area and Lane Segmentation, ensuring safety and navigation. To address +the high computational costs of current state-of-the-art (SOTA) models, this +paper introduces TwinLiteNetPlus (TwinLiteNet$^+$), a model adept at balancing +efficiency and accuracy. TwinLiteNet$^+$ incorporates standard and depth-wise +separable dilated convolutions, reducing complexity while maintaining high +accuracy. It is available in four configurations, from the robust 1.94 +million-parameter TwinLiteNet$^+_{\text{Large}}$ to the ultra-compact +34K-parameter TwinLiteNet$^+_{\text{Nano}}$. Notably, +TwinLiteNet$^+_{\text{Large}}$ attains a 92.9\% mIoU for Drivable Area +Segmentation and a 34.2\% IoU for Lane Segmentation. These results notably +outperform those of current SOTA models while requiring a computational cost +that is approximately 11 times lower in terms of Floating Point Operations +(FLOPs) compared to the existing SOTA model. Extensively tested on various +embedded devices, TwinLiteNet$^+$ demonstrates promising latency and power +efficiency, underscoring its suitability for real-world autonomous vehicle +applications. + +
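+
+ The efficiency gains described above rest largely on depth-wise separable
+dilated convolutions. A minimal PyTorch sketch of such a block follows; the
+channel counts and dilation rate are chosen arbitrarily for illustration and
+are not the TwinLiteNet$^+$ configuration:
+
+import torch
+import torch.nn as nn
+
+class DepthwiseSeparableDilatedConv(nn.Module):
+    """Depth-wise dilated convolution followed by a 1x1 point-wise convolution."""
+    def __init__(self, in_ch, out_ch, kernel_size=3, dilation=2):
+        super().__init__()
+        padding = dilation * (kernel_size - 1) // 2   # keep spatial size
+        self.depthwise = nn.Conv2d(in_ch, in_ch, kernel_size, padding=padding,
+                                   dilation=dilation, groups=in_ch, bias=False)
+        self.pointwise = nn.Conv2d(in_ch, out_ch, 1, bias=False)
+        self.bn = nn.BatchNorm2d(out_ch)
+        self.act = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        return self.act(self.bn(self.pointwise(self.depthwise(x))))
+
+x = torch.randn(1, 32, 128, 128)
+block = DepthwiseSeparableDilatedConv(32, 64)
+print(block(x).shape)   # torch.Size([1, 64, 128, 128])
+
+ Splitting a standard convolution into depth-wise and point-wise stages is
+what keeps the parameter and FLOP counts low, which is the design choice the
+abstract emphasizes.
+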
+
+
+
+
+ + ☆ Isolated Diffusion: Optimizing Multi-Concept Text-to-Image Generation + Training-Freely with Isolated Diffusion Guidance + + +
+ Large-scale text-to-image diffusion models have achieved great success in
+synthesizing high-quality and diverse images given target text prompts. Despite
+the revolutionary image generation ability, current state-of-the-art models
+still struggle to deal with multi-concept generation accurately in many cases.
+This phenomenon is known as "concept bleeding" and manifests as the unexpected
+overlapping or merging of various concepts. This paper presents a general
+approach for text-to-image diffusion models to address the mutual interference
+between different subjects and their attachments in complex scenes, pursuing
+better text-image consistency. The core idea is to isolate the synthesizing
+processes of different concepts. We propose to bind each attachment to the
+corresponding subjects separately with split text prompts. Besides, we
+introduce a revision method to fix the concept bleeding problem in
+multi-subject synthesis. We first rely on pre-trained object detection and
+segmentation models to obtain the layouts of subjects. Then we isolate and
+resynthesize each subject individually with corresponding text prompts to avoid
+mutual interference. Overall, we achieve a training-free strategy, named
+Isolated Diffusion, to optimize multi-concept text-to-image synthesis. It is
+compatible with the latest Stable Diffusion XL (SDXL) and prior Stable
+Diffusion (SD) models. We compare our approach with alternative methods using a
+variety of multi-concept text prompts and demonstrate its effectiveness, with
+clear advantages in text-image consistency and in a user study.
+
+
+
+
+
+ + ☆ Hyperspherical Classification with Dynamic Label-to-Prototype Assignment CVPR 2024 + + +
+ Aiming to enhance the utilization of metric space by the parametric softmax
+classifier, recent studies suggest replacing it with a non-parametric
+alternative. Although a non-parametric classifier may provide better metric
+space utilization, it introduces the challenge of capturing inter-class
+relationships. A shared characteristic among prior non-parametric classifiers
+is the static assignment of labels to prototypes during training, i.e., each
+prototype consistently represents a class throughout the training course.
+Orthogonal to previous works, we present a simple yet effective method to
+optimize the category assigned to each prototype (label-to-prototype
+assignment) during the training. To this aim, we formalize the problem as a
+two-step optimization objective over network parameters and the
+label-to-prototype assignment mapping. We solve this optimization using a
+sequential combination of gradient descent and bipartite matching. We
+demonstrate the benefits of the proposed approach by conducting experiments on
+balanced and long-tail classification problems using different backbone network
+architectures. In particular, our method outperforms its competitors by 1.22\%
+accuracy on CIFAR-100, and 2.15\% on ImageNet-200, using a metric space
+dimension half the size of that used by its competitors. Code:
+https://github.com/msed-Ebrahimi/DL2PA_CVPR24
+
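+
+ The assignment step described above can be illustrated with a standard
+Hungarian-matching call. A minimal sketch in Python, assuming the assignment
+cost is the negative cosine similarity between per-class feature means and the
+fixed prototypes (the exact cost used by the paper may differ):
+
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+def assign_labels_to_prototypes(class_means, prototypes):
+    """Map each class to a prototype by maximising total cosine similarity,
+    solved as a minimum-cost bipartite matching."""
+    c = class_means / np.linalg.norm(class_means, axis=1, keepdims=True)
+    p = prototypes / np.linalg.norm(prototypes, axis=1, keepdims=True)
+    cost = -c @ p.T                        # maximise similarity = minimise -sim
+    rows, cols = linear_sum_assignment(cost)
+    return dict(zip(rows, cols))
+
+rng = np.random.default_rng(0)
+class_means = rng.normal(size=(10, 64))    # one mean feature per class
+prototypes = rng.normal(size=(10, 64))     # fixed hyperspherical prototypes
+print(assign_labels_to_prototypes(class_means, prototypes))
+
+ In a training loop, such a matching step would alternate with ordinary
+gradient updates of the network parameters, which is the two-step structure the
+abstract describes.
+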
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ PropTest: Automatic Property Testing for Improved Visual Programming + + +
+ Visual Programming has emerged as an alternative to end-to-end black-box
+visual reasoning models. These methods leverage Large Language Models (LLMs) to
+decompose a problem and generate the source code for an executable computer
+program. This strategy has the advantage of offering an interpretable reasoning
+path and does not require finetuning a model with task-specific data. We
+propose PropTest, a general strategy that improves visual programming by
+further using an LLM to generate code that tests for visual properties in an
+initial round of proposed solutions. In particular, our method tests for
+data-type consistency, as well as syntactic and semantic properties in the
+generated solutions. Our proposed solution outperforms baselines and achieves
+comparable results to state-of-the-art methods while using smaller and publicly
+available LLMs (CodeLlama-7B and WizardCoder-15B). This is demonstrated across
+different benchmarks on visual question answering and referring expression
+comprehension, showing the efficacy of our approach in enhancing the
+performance and generalization of visual reasoning tasks. Specifically,
+PropTest improves ViperGPT by obtaining 48.66% accuracy (+8.3%) on the A-OKVQA
+benchmark and 52.8% (+3.3%) on the RefCOCO+ benchmark using CodeLlama-7B.
+
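+
+ The control flow of property testing on top of visual programming can be
+sketched as follows. Both generator functions below are placeholders standing
+in for LLM calls; they are not the PropTest prompts themselves:
+
+def generate_program(question):
+    # Placeholder for an LLM call that writes executable code for `question`.
+    return "answer = 2"
+
+def generate_property_tests(question):
+    # Placeholder for an LLM call that writes checks on the expected answer,
+    # e.g. data-type consistency and simple semantic constraints.
+    return ["isinstance(answer, int)", "answer >= 0"]
+
+def solve_with_property_tests(question, max_rounds=3):
+    tests = generate_property_tests(question)
+    for _ in range(max_rounds):
+        program = generate_program(question)
+        scope = {}
+        exec(program, scope)                    # run the candidate program
+        if all(eval(t, scope) for t in tests):  # accept only if all tests pass
+            return scope["answer"]
+    return None                                 # give up after max_rounds
+
+print(solve_with_property_tests("How many dogs are in the image?"))
+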
+
+ comment: Project Page: https://jaywonkoo17.github.io/PropTest/ +
+
+
+
+
+ + ☆ Make-It-Vivid: Dressing Your Animatable Biped Cartoon Characters from + Text + + +
+ Creating and animating 3D biped cartoon characters is crucial and valuable in
+various applications. Compared with geometry, the diverse texture design plays
+an important role in making 3D biped cartoon characters vivid and charming.
+Therefore, we focus on automatic texture design for cartoon characters based on
+input instructions. This is challenging due to domain-specific requirements and
+a lack of high-quality data. To address this challenge, we propose
+Make-It-Vivid, the first attempt to enable high-quality texture generation from
+text in UV space. We prepare a detailed text-texture paired dataset for 3D
+characters using vision-question-answering agents. Then we customize a
+pretrained text-to-image model to generate texture maps with template structure
+while preserving the natural 2D image knowledge. Furthermore, to enhance
+fine-grained details, we propose a novel adversarial learning scheme to narrow
+the domain gap between the original dataset and the realistic texture domain.
+Extensive experiments show that our approach outperforms current texture
+generation methods, resulting in efficient character texturing and faithful
+generation with prompts. Besides, we showcase various applications such as
+out-of-domain generation and texture stylization. We also provide an efficient
+generation system for automatic text-guided textured character generation and
+animation.
+
+
+ comment: Project page: https://make-it-vivid.github.io/ +
+
+
+
+
+ + ☆ Provably Robust Score-Based Diffusion Posterior Sampling for + Plug-and-Play Image Reconstruction + + +
+ In a great number of tasks in science and engineering, the goal is to infer
+an unknown image from a small number of measurements collected from a known
+forward model describing a certain sensing or imaging modality. Due to resource
+constraints, this task is often extremely ill-posed, which necessitates the
+adoption of expressive prior information to regularize the solution space.
+Score-based diffusion models, due to their impressive empirical success, have
+emerged as an appealing candidate for an expressive prior in image
+reconstruction. In order to accommodate diverse tasks at once, it is of great
+interest to develop efficient, consistent and robust algorithms that
+incorporate {\em unconditional} score functions of an image prior distribution
+in conjunction with flexible choices of forward models.
+ This work develops an algorithmic framework for employing score-based
+diffusion models as an expressive data prior in general nonlinear inverse
+problems. Motivated by the plug-and-play framework in the imaging community, we
+introduce a diffusion plug-and-play method (\textsf{DPnP}) that alternately
+calls two samplers, a proximal consistency sampler based solely on the
+likelihood function of the forward model, and a denoising diffusion sampler
+based solely on the score functions of the image prior. The key insight is that
+denoising under white Gaussian noise can be solved {\em rigorously} via both
+stochastic (i.e., DDPM-type) and deterministic (i.e., DDIM-type) samplers using
+the unconditional score functions. We establish both asymptotic and
+non-asymptotic performance guarantees of \textsf{DPnP}, and provide numerical
+experiments to illustrate its promise in solving both linear and nonlinear
+image reconstruction tasks. To the best of our knowledge, \textsf{DPnP} is the
+first provably-robust posterior sampling method for nonlinear inverse problems
+using unconditional diffusion priors.
+
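+
+ The alternation between a likelihood-driven consistency step and a
+prior-driven denoising step can be sketched on a toy linear problem. In the
+sketch below the diffusion-based denoising sampler is replaced by a simple
+moving-average smoother purely for illustration; it is not the DPnP sampler
+itself, and all problem sizes are arbitrary:
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+n, m = 64, 32
+A = rng.normal(size=(m, n)) / np.sqrt(m)        # known forward model
+x_true = np.convolve(rng.normal(size=n), np.ones(5) / 5, mode="same")
+y = A @ x_true + 0.01 * rng.normal(size=m)      # noisy measurements
+
+def proximal_consistency(x, y, A, rho):
+    # Solve min_z ||A z - y||^2 + rho ||z - x||^2  (data-consistency prox).
+    return np.linalg.solve(A.T @ A + rho * np.eye(A.shape[1]),
+                           A.T @ y + rho * x)
+
+def toy_denoiser(x):
+    # Stand-in for the score-based denoising sampler: a moving-average filter.
+    return np.convolve(x, np.ones(5) / 5, mode="same")
+
+x = np.zeros(n)
+for _ in range(50):                              # alternate the two "samplers"
+    x = proximal_consistency(x, y, A, rho=1.0)
+    x = toy_denoiser(x)
+
+print("relative error:", np.linalg.norm(x - x_true) / np.linalg.norm(x_true))
+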
+
+
+
+
+ + ☆ Towards Balanced RGB-TSDF Fusion for Consistent Semantic Scene + Completion by 3D RGB Feature Completion and a Classwise Entropy Loss Function + + +
+ Semantic Scene Completion (SSC) aims to jointly infer semantics and
+occupancies of 3D scenes. Truncated Signed Distance Function (TSDF), a 3D
+encoding of depth, has been a common input for SSC. Furthermore, RGB-TSDF
+fusion seems promising, since these two modalities provide color and geometry
+information, respectively. Nevertheless, RGB-TSDF fusion has been considered
+nontrivial, and the commonly used naive addition yields inconsistent results.
+We argue that the inconsistency comes from the sparsity of RGB features upon
+projection into 3D space, while TSDF features are dense, leading to imbalanced
+feature maps when summed up. To address this RGB-TSDF distribution difference,
+we propose a two-stage network with a 3D RGB feature completion module that
+completes RGB features with meaningful values for occluded areas. Moreover, we
+propose an effective classwise entropy loss function to penalize inconsistency.
+Extensive experiments on public datasets verify that our method achieves
+state-of-the-art performance among methods that do not adopt extra data.
+
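+
+ A classwise entropy loss can take the form below: the prediction entropy is
+averaged within each class before averaging over classes, so that rare classes
+are not swamped by frequent ones. This is a minimal sketch of one plausible
+formulation, not necessarily the exact loss used in the paper:
+
+import torch
+import torch.nn.functional as F
+
+def classwise_entropy_loss(logits, labels, num_classes, eps=1e-8):
+    """Mean over classes of the mean prediction entropy within each class."""
+    probs = F.softmax(logits, dim=1)
+    entropy = -(probs * torch.log(probs + eps)).sum(dim=1)  # per-voxel entropy
+    per_class = []
+    for c in range(num_classes):
+        mask = labels == c
+        if mask.any():
+            per_class.append(entropy[mask].mean())
+    return torch.stack(per_class).mean()
+
+logits = torch.randn(1000, 12)               # e.g. flattened voxel predictions
+labels = torch.randint(0, 12, (1000,))
+print(classwise_entropy_loss(logits, labels, num_classes=12))
+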
+
+
+
+
+ + ☆ CVT-xRF: Contrastive In-Voxel Transformer for 3D Consistent Radiance + Fields from Sparse Inputs CVPR 2024 + + +
+ Neural Radiance Fields (NeRF) have shown impressive capabilities for +photorealistic novel view synthesis when trained on dense inputs. However, when +trained on sparse inputs, NeRF typically encounters issues of incorrect density +or color predictions, mainly due to insufficient coverage of the scene causing +partial and sparse supervision, thus leading to significant performance +degradation. While existing works mainly consider ray-level consistency to +construct 2D learning regularization based on rendered color, depth, or +semantics on image planes, in this paper we propose a novel approach that +models 3D spatial field consistency to improve NeRF's performance with sparse +inputs. Specifically, we first adopt a voxel-based ray sampling strategy to +ensure that the sampled rays intersect with a certain voxel in 3D space. We +then randomly sample additional points within the voxel and apply a Transformer +to infer the properties of other points on each ray, which are then +incorporated into the volume rendering. By backpropagating through the +rendering loss, we enhance the consistency among neighboring points. +Additionally, we propose to use a contrastive loss on the encoder output of the +Transformer to further improve consistency within each voxel. Experiments +demonstrate that our method yields significant improvement over different +radiance fields in the sparse inputs setting, and achieves comparable +performance with current works. + +
+
+ comment: The paper is accepted by CVPR 2024. Project page is available at + https://zhongyingji.github.io/CVT-xRF +
+
+
+
+
+ + ☆ INPC: Implicit Neural Point Clouds for Radiance Field Rendering + + +
+ We introduce a new approach for reconstruction and novel-view synthesis of +unbounded real-world scenes. In contrast to previous methods using either +volumetric fields, grid-based models, or discrete point cloud proxies, we +propose a hybrid scene representation, which implicitly encodes a point cloud +in a continuous octree-based probability field and a multi-resolution hash +grid. In doing so, we combine the benefits of both worlds by retaining +favorable behavior during optimization: Our novel implicit point cloud +representation and differentiable bilinear rasterizer enable fast rendering +while preserving fine geometric detail without depending on initial priors like +structure-from-motion point clouds. Our method achieves state-of-the-art image +quality on several common benchmark datasets. Furthermore, we achieve fast +inference at interactive frame rates, and can extract explicit point clouds to +further enhance performance. + +
+
+ comment: Project page: https://fhahlbohm.github.io/inpc/ +
+
+
+
+
+ + ☆ Multiple Object Tracking as ID Prediction + + +
+ In Multiple Object Tracking (MOT), tracking-by-detection methods have stood
+the test of time; they split the process into two parts following the task
+definition: object detection and association. They leverage robust single-frame
+detectors and treat object association as a post-processing step through
+hand-crafted heuristic algorithms and surrogate tasks. However, the nature of
+heuristic techniques prevents end-to-end exploitation of training data, leading
+to increasingly cumbersome and challenging manual modifications when facing
+complicated or novel scenarios. In this paper, we regard this object
+association task as an End-to-End in-context ID prediction problem and propose
+a streamlined baseline called MOTIP. Specifically, we form the target
+embeddings into historical trajectory information while considering the
+corresponding IDs as in-context prompts, then directly predict the ID labels
+for the objects in the current frame. Thanks to this end-to-end process, MOTIP
+can learn tracking capabilities straight from training data, freeing itself
+from burdensome hand-crafted algorithms. Without bells and whistles, our method
+achieves impressive state-of-the-art performance in complex scenarios like
+DanceTrack and SportsMOT, and it performs competitively with other
+transformer-based methods on MOT17. We believe that MOTIP demonstrates
+remarkable potential and can serve as a starting point for future research. The
+code is available at https://github.com/MCG-NJU/MOTIP.
+
+
+ comment: 71.4 HOTA on DanceTrack (with CrowdHuman), 67.5/70.0 HOTA on + DanceTrack built upon Deformable DETR and DAB-Deformable DETR respectively + (without additional data). The code repository will be created within several + days +
+
+
+
+
+ + ☆ From Two Stream to One Stream: Efficient RGB-T Tracking via Mutual + Prompt Learning and Knowledge Distillation + + +
+ Due to the complementary nature of visible light and thermal infrared
+modalities, object tracking based on the fusion of visible light images and
+thermal images (referred to as RGB-T tracking) has received increasing
+attention from researchers in recent years. How to achieve more comprehensive
+fusion of information from the two modalities at a lower cost has been an issue
+that researchers have been exploring. Inspired by visual prompt learning, we
+designed a novel two-stream RGB-T tracking architecture based on cross-modal
+mutual prompt learning, and used this model as a teacher to guide a one-stream
+student model for rapid learning through knowledge distillation techniques.
+Extensive experiments have shown that, compared to similar RGB-T trackers, our
+designed teacher model achieved the highest precision rate, while the student
+model, with a precision rate comparable to the teacher model, realized an
+inference speed more than three times faster than the teacher model. (Codes
+will be available if accepted.)
+
+
+
+
+
+ + ☆ UrbanVLP: A Multi-Granularity Vision-Language Pre-Trained Foundation + Model for Urban Indicator Prediction + + +
+ Urban indicator prediction aims to infer socio-economic metrics in diverse +urban landscapes using data-driven methods. However, prevalent pre-trained +models, particularly those reliant on satellite imagery, face dual challenges. +Firstly, concentrating solely on macro-level patterns from satellite data may +introduce bias, lacking nuanced details at micro levels, such as architectural +details at a place. Secondly, the lack of interpretability in pre-trained +models limits their utility in providing transparent evidence for urban +planning. In response to these issues, we devise a novel Vision-Language +Pre-Trained Model (UrbanVLP) in this paper. Our UrbanVLP seamlessly integrates +multi-granularity information from both macro (satellite) and micro +(street-view) levels, overcoming the limitations of prior pre-trained models. +Moreover, it introduces automatic text generation and calibration, elevating +interpretability in downstream applications by producing high-quality text +descriptions of urban imagery. Rigorous experiments conducted across six +socio-economic tasks underscore UrbanVLP's superior performance. We also deploy +a web platform to verify its practicality. + +
+
+
+
+
+ + ☆ One-Shot Domain Incremental Learning IJCNN + + +
+ Domain incremental learning (DIL) has been discussed in previous studies on
+deep neural network models for classification. In DIL, we assume that samples
+on new domains are observed over time. The models must classify inputs on all
+domains. In practice, however, we may encounter a situation where we need to
+perform DIL under the constraint that samples on the new domain are observed
+only infrequently. Therefore, in this study, we consider the extreme case where
+we have only one sample from the new domain, which we call one-shot DIL. We
+first empirically show that existing DIL methods do not work well in one-shot
+DIL. We then analyze the reason for this failure through various
+investigations, and our analysis clarifies that the difficulty of one-shot DIL
+is caused by the statistics in the batch normalization layers. Therefore, we
+propose a technique that addresses these statistics and demonstrate its
+effectiveness through experiments on open datasets.
+
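+
+ The abstract attributes the failure of existing DIL methods to the batch
+normalization statistics. A minimal illustration of one plausible remedy,
+re-estimating the BN running statistics from the single new-domain sample, is
+sketched below; it is not necessarily the technique proposed in the paper:
+
+import torch
+import torch.nn as nn
+
+def adapt_bn_stats(model, single_sample, momentum=0.5):
+    """Blend BN running statistics toward the single new-domain sample."""
+    model.train()                    # BN updates running stats in train mode
+    for m in model.modules():
+        if isinstance(m, nn.BatchNorm2d):
+            m.momentum = momentum    # how strongly the new domain is weighted
+    with torch.no_grad():
+        model(single_sample)         # one forward pass updates the statistics
+    model.eval()
+    return model
+
+model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.BatchNorm2d(8), nn.ReLU())
+sample = torch.randn(1, 3, 32, 32)   # the single new-domain image
+adapt_bn_stats(model, sample)
+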
+
+ comment: accepted at IEEE International Joint Conference on Neural Networks + (IJCNN) 2024 +
+
+
+
+
+ + ☆ Learning from Reduced Labels for Long-Tailed Data + + +
+ Long-tailed data is prevalent in real-world classification tasks and heavily
+relies on supervised information, which makes the annotation process
+exceptionally labor-intensive and time-consuming. Unfortunately, despite being
+a common approach to mitigate labeling costs, existing weakly supervised
+learning methods struggle to adequately preserve supervised information for
+tail samples, resulting in a decline in accuracy for the tail classes. To
+alleviate this problem, we introduce a novel weakly supervised labeling setting
+called Reduced Label. The proposed labeling setting not only avoids the decline
+of supervised information for the tail samples, but also decreases the labeling
+costs associated with long-tailed data. Additionally, we propose a
+straightforward and highly efficient unbiased framework with strong theoretical
+guarantees to learn from these Reduced Labels. Extensive experiments conducted
+on benchmark datasets including ImageNet validate the effectiveness of our
+approach, surpassing the performance of state-of-the-art weakly supervised
+methods.
+
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ☆ Resolution Limit of Single-Photon LiDAR + + +
+ Single-photon Light Detection and Ranging (LiDAR) systems are often equipped +with an array of detectors for improved spatial resolution and sensing speed. +However, given a fixed amount of flux produced by the laser transmitter across +the scene, the per-pixel Signal-to-Noise Ratio (SNR) will decrease when more +pixels are packed in a unit space. This presents a fundamental trade-off +between the spatial resolution of the sensor array and the SNR received at each +pixel. Theoretical characterization of this fundamental limit is explored. By +deriving the photon arrival statistics and introducing a series of new +approximation techniques, the Mean Squared Error (MSE) of the +maximum-likelihood estimator of the time delay is derived. The theoretical +predictions align well with simulations and real data. + +
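+
+ For intuition on the estimator analyzed above: with a Gaussian pulse shape
+and background-free Poisson photon arrivals, the maximum-likelihood time delay
+can be found by maximizing the log-likelihood over candidate delays. A toy
+simulation, with the pulse width and photon count chosen arbitrarily for
+illustration:
+
+import numpy as np
+
+rng = np.random.default_rng(1)
+sigma = 0.5e-9                   # pulse width (s), illustrative value
+tau_true = 12.3e-9               # true time delay (s)
+num_photons = 20                 # photons detected at this pixel
+
+# Photon timestamps: true delay plus pulse-width jitter (no background counts).
+timestamps = tau_true + sigma * rng.normal(size=num_photons)
+
+def log_likelihood(tau, t, sigma):
+    # Gaussian pulse, no dark counts: log p(t | tau) up to a constant.
+    return -0.5 * np.sum((t - tau) ** 2) / sigma ** 2
+
+candidates = np.linspace(0, 50e-9, 5001)
+scores = [log_likelihood(tau, timestamps, sigma) for tau in candidates]
+tau_hat = candidates[int(np.argmax(scores))]
+print(f"estimated delay: {tau_hat * 1e9:.2f} ns (true {tau_true * 1e9:.2f} ns)")
+
+ The paper's full analysis of the photon arrival statistics is more involved;
+this toy model omits background photons and detector effects.
+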
+
+
+
+
+ + ☆ ProIn: Learning to Predict Trajectory Based on Progressive Interactions + for Autonomous Driving + + +
+ Accurate motion prediction of pedestrians, cyclists, and other surrounding
+vehicles (all called agents) is very important for autonomous driving. Most
+existing works capture map information through a one-stage interaction with the
+map via vector-based attention, to provide map constraints for social
+interaction and multi-modal differentiation. However, these methods have to
+encode all required map rules into the focal agent's feature, so as to retain
+the paths of all possible intentions while also adapting to potential social
+interaction. In this work, a progressive interaction network is proposed to
+enable the agent's feature to progressively focus on relevant maps, in order to
+better learn agents' feature representations capturing the relevant map
+constraints. The network progressively encodes the complex influence of map
+constraints into the agent's feature through graph convolutions at the
+following three stages: after the historical trajectory encoder, after social
+interaction, and after multi-modal differentiation. In addition, a weight
+allocation mechanism is proposed for multi-modal training, so that each mode
+can obtain learning opportunities from a single-mode ground truth. Experiments
+have validated the superiority of progressive interactions over the existing
+one-stage interaction, and demonstrate the effectiveness of each component.
+Encouraging results were obtained on challenging benchmarks.
+
+
+
+
+
+ + ☆ Brain Stroke Segmentation Using Deep Learning Models: A Comparative + Study + + +
+ Stroke segmentation plays a crucial role in the diagnosis and treatment of
+stroke patients by providing spatial information about affected brain regions
+and the extent of damage. Segmenting stroke lesions accurately is a challenging
+task, given that conventional manual techniques are time-consuming and prone to
+errors. Recently, advanced deep models have been introduced for general medical
+image segmentation, demonstrating promising results that surpass many
+state-of-the-art networks when evaluated on specific datasets. With the advent
+of vision Transformers, several models have been introduced based on them,
+while others have aimed to design better modules based on traditional
+convolutional layers to extract long-range dependencies like Transformers. The
+question of whether such high-level designs are necessary for all segmentation
+cases to achieve the best results remains unanswered. In this study, we
+selected four types of recently proposed deep models and evaluated their
+performance for stroke segmentation: a pure Transformer-based architecture
+(DAE-Former), two advanced CNN-based models (LKA and DLKA) with attention
+mechanisms in their design, an advanced hybrid model that incorporates CNNs
+with Transformers (FCT), and the well-known self-adaptive nnUNet framework with
+its configuration based on given data. We examined their performance on two
+publicly available datasets, and found that nnUNet achieved the best results
+with the simplest design among all. The limited robustness of Transformers to
+such variability is a potential reason for their weaker performance.
+Furthermore, nnUNet's success underscores the significant impact of
+preprocessing and postprocessing techniques in enhancing segmentation results,
+surpassing the focus solely on architectural designs.
+
+
+
+
+
+ + ♻ ☆ HAIFIT: Human-Centered AI for Fashion Image Translation + + +
+ In the realm of fashion design, sketches serve as the canvas for expressing +an artist's distinctive drawing style and creative vision, capturing intricate +details like stroke variations and texture nuances. The advent of +sketch-to-image cross-modal translation technology has notably aided designers. +However, existing methods often compromise these sketch details during image +generation, resulting in images that deviate from the designer's intended +concept. This limitation hampers the ability to offer designers a precise +preview of the final output. To overcome this challenge, we introduce HAIFIT, a +novel approach that transforms sketches into high-fidelity, lifelike clothing +images by integrating multi-scale features and capturing extensive feature map +dependencies from diverse perspectives. Through extensive qualitative and +quantitative evaluations conducted on our self-collected dataset, our method +demonstrates superior performance compared to existing methods in generating +photorealistic clothing images. Our method excels in preserving the distinctive +style and intricate details essential for fashion design applications. + +
+
+ comment: 8 pages,8 figures +
+
+
+
+
+ + ♻ ☆ SeMoLi: What Moves Together Belongs Together CVPR 2024 + + +
+ We tackle semi-supervised object detection based on motion cues. Recent
+results suggest that heuristic-based clustering methods in conjunction with
+object trackers can be used to pseudo-label instances of moving objects and use
+these as supervisory signals to train 3D object detectors in Lidar data without
+manual supervision. We re-think this approach and suggest that both object
+detection and motion-inspired pseudo-labeling can be tackled in a data-driven
+manner. We leverage recent advances in scene flow estimation to obtain point
+trajectories from which we extract long-term, class-agnostic motion patterns.
+Revisiting correlation clustering in the context of message passing networks,
+we learn to group those motion patterns to cluster points into object
+instances. By estimating the full extent of the objects, we obtain per-scan 3D
+bounding boxes that we use to supervise a Lidar object detection network. Our
+method not only outperforms prior heuristic-based approaches (57.5 AP, +14
+improvement over prior work); more importantly, we show that we can
+pseudo-label and train object detectors across datasets.
+
+
+ comment: Accepted to CVPR 2024! +
+
+
+
+
+ + ♻ ☆ Geometric Generative Models based on Morphological Equivariant PDEs and + GANs + + +
+ Content and image generation consists of creating or generating data from
+noisy information by extracting specific features such as texture, edges, and
+other thin image structures. We are interested here in generative models, and
+two main problems are addressed. The first is improving specific feature
+extraction while accounting for intrinsic geometric features at multiple
+scales; the second is making the network equivariant in order to reduce its
+complexity and provide geometric interpretability. To proceed, we propose a
+geometric generative model based on an equivariant partial differential
+equation (PDE) for group convolution neural networks (G-CNNs), so-called
+PDE-G-CNNs, built on morphology operators and generative adversarial networks
+(GANs). Equivariant morphological PDE layers are composed of multiscale
+dilations and erosions formulated in Riemannian manifolds, while group
+symmetries are defined on a Lie group. We take advantage of the Lie group
+structure to properly integrate the equivariance in layers, and are able to use
+the Riemannian metric to solve the multiscale morphological operations. Each
+point of the Lie group is associated with a unique point in the manifold, which
+helps us derive a metric on the Riemannian manifold from a tensor field
+invariant under the Lie group, so that the induced metric has the same
+symmetries. The proposed geometric morphological GAN (GM-GAN) is obtained by
+using the proposed morphological equivariant convolutions in PDE-G-CNNs to
+bring nonlinearity into classical CNNs. GM-GAN is evaluated on MNIST data and
+compared with GANs. Preliminary results show that the GM-GAN model outperforms
+classical GANs.
+
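+
+ In the flat (Euclidean, unweighted) special case, the multiscale dilations
+and erosions that make up such morphological layers reduce to standard
+grey-scale morphology. A small sketch of that special case follows; the
+Riemannian and Lie-group machinery of the paper is not reproduced here:
+
+import numpy as np
+from scipy.ndimage import grey_dilation, grey_erosion
+
+def multiscale_morphology(image, scales=(1, 2, 4)):
+    """Stack of flat dilations and erosions at several structuring-element
+    sizes, producing one feature channel per (operation, scale) pair."""
+    outputs = []
+    for s in scales:
+        size = 2 * s + 1                      # structuring element side length
+        outputs.append(grey_dilation(image, size=(size, size)))
+        outputs.append(grey_erosion(image, size=(size, size)))
+    return np.stack(outputs)                  # shape: (2 * len(scales), H, W)
+
+image = np.random.default_rng(0).random((28, 28))
+features = multiscale_morphology(image)
+print(features.shape)                         # (6, 28, 28)
+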
+
+
+
+
+ + ♻ ☆ Towards Precise 3D Human Pose Estimation with Multi-Perspective + Spatial-Temporal Relational Transformers IJCNN 2024 + + +
+ 3D human pose estimation captures the human joint points in three-dimensional +space while keeping the depth information and physical structure. That is +essential for applications that require precise pose information, such as +human-computer interaction, scene understanding, and rehabilitation training. +Due to the challenges in data collection, mainstream datasets of 3D human pose +estimation are primarily composed of multi-view video data collected in +laboratory environments, which contains rich spatial-temporal correlation +information besides the image frame content. Given the remarkable +self-attention mechanism of transformers, capable of capturing the +spatial-temporal correlation from multi-view video datasets, we propose a +multi-stage framework for 3D sequence-to-sequence (seq2seq) human pose +detection. Firstly, the spatial module represents the human pose feature by +intra-image content, while the frame-image relation module extracts temporal +relationships and 3D spatial positional relationship features between the +multi-perspective images. Secondly, the self-attention mechanism is adopted to +eliminate the interference from non-human body parts and reduce computing +resources. Our method is evaluated on Human3.6M, a popular 3D human pose +detection dataset. Experimental results demonstrate that our approach achieves +state-of-the-art performance on this dataset. The source code will be available +at https://github.com/WUJINHUAN/3D-human-pose. + +
+
+ comment: Accepted to IJCNN 2024. The source code will be available at + https://github.com/WUJINHUAN/3D-human-pose +
+
+
+
+
+ + ♻ ☆ Meet JEANIE: a Similarity Measure for 3D Skeleton Sequences via + Temporal-Viewpoint Alignment ACCV'22 + + +
+ Video sequences exhibit significant nuisance variations (undesired effects) +of speed of actions, temporal locations, and subjects' poses, leading to +temporal-viewpoint misalignment when comparing two sets of frames or evaluating +the similarity of two sequences. Thus, we propose Joint tEmporal and cAmera +viewpoiNt alIgnmEnt (JEANIE) for sequence pairs. In particular, we focus on 3D +skeleton sequences whose camera and subjects' poses can be easily manipulated +in 3D. We evaluate JEANIE on skeletal Few-shot Action Recognition (FSAR), where +matching well temporal blocks (temporal chunks that make up a sequence) of +support-query sequence pairs (by factoring out nuisance variations) is +essential due to limited samples of novel classes. Given a query sequence, we +create its several views by simulating several camera locations. For a support +sequence, we match it with view-simulated query sequences, as in the popular +Dynamic Time Warping (DTW). Specifically, each support temporal block can be +matched to the query temporal block with the same or adjacent (next) temporal +index, and adjacent camera views to achieve joint local temporal-viewpoint +warping. JEANIE selects the smallest distance among matching paths with +different temporal-viewpoint warping patterns, an advantage over DTW which only +performs temporal alignment. We also propose an unsupervised FSAR akin to +clustering of sequences with JEANIE as a distance measure. JEANIE achieves +state-of-the-art results on NTU-60, NTU-120, Kinetics-skeleton and UWA3D +Multiview Activity II on supervised and unsupervised FSAR, and their +meta-learning inspired fusion. + +
+
+
+ comment: Accepted by the International Journal of Computer Vision (IJCV). An
+ extension of our ACCV'22 paper [arXiv:2210.16820] which was distinguished by
+ the Sang Uk Lee Best Student Paper Award
+
+
+
+
+
+ + ♻ ☆ An Image is Worth 1/2 Tokens After Layer 2: Plug-and-Play Inference + Acceleration for Large Vision-Language Models + + +
+ In this study, we identify the inefficient attention phenomena in Large
+Vision-Language Models (LVLMs), notably within prominent models like LLaVA-1.5,
+QwenVL-Chat and Video-LLaVA. We find that the attention computation over visual
+tokens is extremely inefficient in the deep layers of popular LVLMs, suggesting
+a need for a sparser approach compared to textual data handling. To this end,
+we introduce FastV, a versatile plug-and-play method designed to optimize
+computational efficiency by learning adaptive attention patterns in early
+layers and pruning visual tokens in subsequent ones. Our evaluations
+demonstrate FastV's ability to dramatically reduce computational costs (e.g., a
+45% reduction in FLOPs for LLaVA-1.5-13B) without sacrificing performance in a
+wide range of image and video understanding tasks. The computational efficiency
+and performance trade-off of FastV is highly customizable and pareto-efficient.
+It can compress the FLOPs of a 13B-parameter model to achieve a lower budget
+than that of a 7B-parameter model, while still maintaining superior
+performance. We believe FastV has practical value for the deployment of LVLMs
+in edge devices and commercial models. Code is released at
+https://github.com/pkunlp-icler/FastV.
+
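+
+ The pruning step itself is simple to sketch: rank the visual tokens by how
+much attention they receive at an early layer and keep only the top fraction.
+The specific ranking signal below (attention from the final token, averaged
+over heads) is an assumption made for illustration rather than the exact FastV
+criterion:
+
+import torch
+
+def prune_visual_tokens(hidden, attn, visual_slice, keep_ratio=0.5):
+    """hidden: (B, T, D) activations; attn: (B, heads, T, T) attention weights.
+    Keeps the top `keep_ratio` visual tokens by received attention."""
+    v0, v1 = visual_slice                        # visual tokens occupy [v0, v1)
+    scores = attn[:, :, -1, v0:v1].mean(dim=1)   # attention to each visual token
+    k = max(1, int(keep_ratio * (v1 - v0)))
+    keep = scores.topk(k, dim=1).indices.sort(dim=1).values + v0
+    kept = torch.gather(hidden, 1,
+                        keep.unsqueeze(-1).expand(-1, -1, hidden.size(-1)))
+    return torch.cat([hidden[:, :v0], kept, hidden[:, v1:]], dim=1)
+
+hidden = torch.randn(2, 600, 768)                # 2 sequences, 600 tokens each
+attn = torch.softmax(torch.randn(2, 12, 600, 600), dim=-1)
+print(prune_visual_tokens(hidden, attn, visual_slice=(5, 581)).shape)
+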
+
+
+ comment: 21 pages, 8 figures, code is released at
+ https://github.com/pkunlp-icler/FastV
+
+
+
+
+
+ + ♻ ☆ MambaIR: A Simple Baseline for Image Restoration with State-Space Model + + +
+ Recent years have seen significant advancements in image restoration, largely +attributed to the development of modern deep neural networks, such as CNNs and +Transformers. However, existing restoration backbones often face the dilemma +between global receptive fields and efficient computation, hindering their +application in practice. Recently, the Selective Structured State Space Model, +especially the improved version Mamba, has shown great potential for long-range +dependency modeling with linear complexity, which offers a way to resolve the +above dilemma. However, the standard Mamba still faces certain challenges in +low-level vision such as local pixel forgetting and channel redundancy. In this +work, we introduce a simple but effective baseline, named MambaIR, which +introduces both local enhancement and channel attention to improve the vanilla +Mamba. In this way, our MambaIR takes advantage of the local pixel similarity +and reduces the channel redundancy. Extensive experiments demonstrate the +superiority of our method, for example, MambaIR outperforms SwinIR by up to +0.45dB on image SR, using similar computational cost but with a global +receptive field. Code is available at \url{https://github.com/csguoh/MambaIR}. + +
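+
+ The channel-attention component mentioned above is commonly realized as a
+squeeze-and-excitation style block. A generic version is sketched below; it is
+illustrative and not copied from the MambaIR code:
+
+import torch
+import torch.nn as nn
+
+class ChannelAttention(nn.Module):
+    """Squeeze-and-excitation style channel attention."""
+    def __init__(self, channels, reduction=4):
+        super().__init__()
+        self.pool = nn.AdaptiveAvgPool2d(1)        # squeeze: global average
+        self.fc = nn.Sequential(
+            nn.Conv2d(channels, channels // reduction, 1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(channels // reduction, channels, 1),
+            nn.Sigmoid(),
+        )
+
+    def forward(self, x):
+        return x * self.fc(self.pool(x))           # excite: rescale channels
+
+x = torch.randn(1, 64, 48, 48)
+print(ChannelAttention(64)(x).shape)               # torch.Size([1, 64, 48, 48])
+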
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ Text-Conditioned Resampler For Long Form Video Understanding + + +
+ In this paper we present a text-conditioned video resampler (TCR) module that +uses a pre-trained and frozen visual encoder and large language model (LLM) to +process long video sequences for a task. TCR localises relevant visual features +from the video given a text condition and provides them to a LLM to generate a +text response. Due to its lightweight design and use of cross-attention, TCR +can process more than 100 frames at a time with plain attention and without +optimised implementations. We make the following contributions: (i) we design a +transformer-based sampling architecture that can process long videos +conditioned on a task, together with a training method that enables it to +bridge pre-trained visual and language models; (ii) we identify tasks that +could benefit from longer video perception; and (iii) we empirically validate +its efficacy on a wide variety of evaluation tasks including NextQA, EgoSchema, +and the EGO4D-LTA challenge. + +
+
+
+
+
+ + ♻ ☆ EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via + Expressive Masked Audio Gesture Modeling CVPR + + +
+ We propose EMAGE, a framework to generate full-body human gestures from audio +and masked gestures, encompassing facial, local body, hands, and global +movements. To achieve this, we first introduce BEAT2 (BEAT-SMPLX-FLAME), a new +mesh-level holistic co-speech dataset. BEAT2 combines MoShed SMPLX body with +FLAME head parameters and further refines the modeling of head, neck, and +finger movements, offering a community-standardized, high-quality 3D motion +captured dataset. EMAGE leverages masked body gesture priors during training to +boost inference performance. It involves a Masked Audio Gesture Transformer, +facilitating joint training on audio-to-gesture generation and masked gesture +reconstruction to effectively encode audio and body gesture hints. Encoded body +hints from masked gestures are then separately employed to generate facial and +body movements. Moreover, EMAGE adaptively merges speech features from the +audio's rhythm and content and utilizes four compositional VQ-VAEs to enhance +the results' fidelity and diversity. Experiments demonstrate that EMAGE +generates holistic gestures with state-of-the-art performance and is flexible +in accepting predefined spatial-temporal gesture inputs, generating complete, +audio-synchronized results. Our code and dataset are available at +https://pantomatrix.github.io/EMAGE/ + +
+
+ comment: CVPR Camera Ready; Project Page: https://pantomatrix.github.io/EMAGE/ +
+
+
+
+
+ + ♻ ☆ BioNeRF: Biologically Plausible Neural Radiance Fields for View + Synthesis + + +
+ This paper presents BioNeRF, a biologically plausible architecture that +models scenes in a 3D representation and synthesizes new views through radiance +fields. Since NeRF relies on the network weights to store the scene's +3-dimensional representation, BioNeRF implements a cognitive-inspired mechanism +that fuses inputs from multiple sources into a memory-like structure, improving +the storing capacity and extracting more intrinsic and correlated information. +BioNeRF also mimics a behavior observed in pyramidal cells concerning +contextual information, in which the memory is provided as the context and +combined with the inputs of two subsequent neural models, one responsible for +producing the volumetric densities and the other the colors used to render the +scene. Experimental results show that BioNeRF outperforms state-of-the-art +results concerning a quality measure that encodes human perception in two +datasets: real-world images and synthetic data. + +
+
+
+
+
+ + ♻ ☆ Multi-modal Instruction Tuned LLMs with Fine-grained Visual Perception CVPR 2024 + + +
+ Multimodal Large Language Models (MLLMs) leverage Large Language Models as a
+cognitive framework for diverse visual-language tasks. Recent efforts have been
+made to equip MLLMs with visual perceiving and grounding capabilities. However,
+a gap remains in providing fine-grained, pixel-level perception and in
+extending interactions beyond text-specific inputs. In this work, we propose
+{\bf{AnyRef}}, a general MLLM model that can generate pixel-wise object
+perceptions and natural language descriptions from multi-modality references,
+such as texts, boxes, images, or audio. This innovation empowers users with
+greater flexibility to engage with the model beyond textual and regional
+prompts, without modality-specific designs. Through our proposed refocusing
+mechanism, the generated grounding output is guided to better focus on the
+referenced object, implicitly incorporating additional pixel-level supervision.
+This simple modification utilizes attention scores generated during the
+inference of the LLM, eliminating the need for extra computations while
+exhibiting performance enhancements in both grounding masks and referring
+expressions. With only publicly available training data, our model achieves
+state-of-the-art results across multiple benchmarks, including diverse modality
+referring segmentation and region-level referring expression generation.
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Boosting Adversarial Transferability by Block Shuffle and Rotation CVPR 2024 + + +
+ Adversarial examples mislead deep neural networks with imperceptible +perturbations and have brought significant threats to deep learning. An +important aspect is their transferability, which refers to their ability to +deceive other models, thus enabling attacks in the black-box setting. Though +various methods have been proposed to boost transferability, the performance +still falls short compared with white-box attacks. In this work, we observe +that existing input transformation based attacks, one of the mainstream +transfer-based attacks, result in different attention heatmaps on various +models, which might limit the transferability. We also find that breaking the +intrinsic relation of the image can disrupt the attention heatmap of the +original image. Based on this finding, we propose a novel input transformation +based attack called block shuffle and rotation (BSR). Specifically, BSR splits +the input image into several blocks, then randomly shuffles and rotates these +blocks to construct a set of new images for gradient calculation. Empirical +evaluations on the ImageNet dataset demonstrate that BSR could achieve +significantly better transferability than the existing input transformation +based methods under single-model and ensemble-model settings. Combining BSR +with the current input transformation method can further improve the +transferability, which significantly outperforms the state-of-the-art methods. +Code is available at https://github.com/Trustworthy-AI-Group/BSR + +
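+
+ The input transformation itself is straightforward to reproduce: split the
+image into a grid of blocks, shuffle the blocks, and rotate each one by a small
+random angle before computing gradients. The grid size and angle range below
+are illustrative defaults, not necessarily those used in the paper:
+
+import torch
+import torchvision.transforms.functional as TF
+
+def block_shuffle_rotate(image, grid=2, max_angle=24.0):
+    """image: (C, H, W) tensor; returns one shuffled-and-rotated copy."""
+    _, h, w = image.shape
+    bh, bw = h // grid, w // grid
+    blocks = [image[:, i * bh:(i + 1) * bh, j * bw:(j + 1) * bw]
+              for i in range(grid) for j in range(grid)]
+    order = torch.randperm(len(blocks)).tolist()       # shuffle the blocks
+    rows = []
+    for i in range(grid):
+        row = []
+        for j in range(grid):
+            block = blocks[order[i * grid + j]]
+            angle = float(torch.empty(1).uniform_(-max_angle, max_angle))
+            row.append(TF.rotate(block, angle))        # rotate each block
+        rows.append(torch.cat(row, dim=2))
+    return torch.cat(rows, dim=1)
+
+image = torch.rand(3, 224, 224)
+print(block_shuffle_rotate(image).shape)               # torch.Size([3, 224, 224])
+
+ In the attack, gradients would be computed over a set of such transformed
+copies of the adversarial example at each iteration, as the abstract describes.
+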
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Word4Per: Zero-shot Composed Person Retrieval + + +
+ Searching for a specific person has great social benefits and security value,
+and it often involves a combination of visual and textual information.
+Conventional person retrieval methods, whether image-based or text-based,
+usually fall short in effectively harnessing both types of information, leading
+to a loss of accuracy. In this paper, a whole new task called Composed Person
+Retrieval (CPR) is proposed to jointly utilize both image and text information
+for target person retrieval. However, supervised CPR requires a very costly
+manually annotated dataset, and there are currently no available resources. To
+mitigate this issue, we first introduce Zero-shot Composed Person Retrieval
+(ZS-CPR), which leverages existing domain-related data to resolve the CPR
+problem without expensive annotations. Second, to learn the ZS-CPR model, we
+propose a two-stage learning framework, Word4Per, where a lightweight Textual
+Inversion Network (TINet) and a text-based person retrieval model based on a
+fine-tuned Contrastive Language-Image Pre-training (CLIP) network are learned
+without utilizing any CPR data. Third, a finely annotated Image-Text Composed
+Person Retrieval (ITCPR) dataset is built as the benchmark to assess the
+performance of the proposed Word4Per framework. Extensive experiments under
+both Rank-1 and mAP demonstrate the effectiveness of Word4Per for the ZS-CPR
+task, surpassing the comparative methods by over 10\%. The code and ITCPR
+dataset will be publicly available at
+https://github.com/Delong-liu-bupt/Word4Per.
+
+
+
+
+
+ + ♻ ☆ Knowledge Distillation for Road Detection based on cross-model + Semi-Supervised Learning + + +
+ The advancement of knowledge distillation has played a crucial role in +enabling the transfer of knowledge from larger teacher models to smaller and +more efficient student models, and is particularly beneficial for online and +resource-constrained applications. The effectiveness of the student model +heavily relies on the quality of the distilled knowledge received from the +teacher. Given the accessibility of unlabelled remote sensing data, +semi-supervised learning has become a prevalent strategy for enhancing model +performance. However, relying solely on semi-supervised learning with smaller +models may be insufficient due to their limited capacity for feature +extraction. This limitation restricts their ability to exploit training data. +To address this issue, we propose an integrated approach that combines +knowledge distillation and semi-supervised learning methods. This hybrid +approach leverages the robust capabilities of large models to effectively +utilise large unlabelled data whilst subsequently providing the small student +model with rich and informative features for enhancement. The proposed +semi-supervised learning-based knowledge distillation (SSLKD) approach +demonstrates a notable improvement in the performance of the student model, in +the application of road segmentation, surpassing the effectiveness of +traditional semi-supervised learning methods. + +
+
+
+
+
+ + ♻ ☆ Unveiling the Blind Spots: A Critical Examination of Fairness in + Autonomous Driving Systems + + +
+ Autonomous driving systems have extended the spectrum of the Web of Things
+for intelligent vehicles and have become an important component of the Web
+ecosystem. Similar to traditional Web-based applications, fairness is an
+essential aspect for ensuring the high quality of autonomous driving systems,
+particularly in the context of the pedestrian detectors within them. However,
+there is an absence in the literature of a comprehensive assessment of the
+fairness of current Deep Learning (DL)-based pedestrian detectors. To fill the
+gap, we evaluate eight widely-explored DL-based pedestrian detectors across
+demographic groups on large-scale real-world datasets. To enable a thorough
+fairness evaluation, we provide extensive annotations for the datasets,
+resulting in 8,311 images with 16,070 gender labels, 20,115 age labels, and
+3,513 skin tone labels. Our findings reveal significant fairness issues related
+to age. The undetected proportions for adults are 20.14% lower compared to
+children. Furthermore, we explore how various driving scenarios affect the
+fairness of pedestrian detectors. We find that the bias may be exacerbated for
+children and females under low brightness and low contrast.
+
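+
+ The headline metric, the proportion of ground-truth pedestrians left
+undetected per demographic group, can be computed as below once each annotated
+person has been matched (or not) to a detection. The matching itself (typically
+an IoU threshold) is assumed to have been done beforehand, and the toy records
+are hypothetical:
+
+from collections import defaultdict
+
+def undetected_rate_by_group(ground_truths):
+    """ground_truths: list of dicts with keys 'group' (e.g. 'adult'/'child')
+    and 'detected' (bool: whether any prediction matched this person)."""
+    total, missed = defaultdict(int), defaultdict(int)
+    for gt in ground_truths:
+        total[gt["group"]] += 1
+        missed[gt["group"]] += int(not gt["detected"])
+    return {g: missed[g] / total[g] for g in total}
+
+records = [{"group": "adult", "detected": True},
+           {"group": "adult", "detected": False},
+           {"group": "child", "detected": False},
+           {"group": "child", "detected": False}]
+print(undetected_rate_by_group(records))   # {'adult': 0.5, 'child': 1.0}
+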
+
+ comment: Update the models evaluated and the experimental results +
+
+
+
+
+ + ♻ ☆ HiFi-123: Towards High-fidelity One Image to 3D Content Generation + + +
+ Recent advances in diffusion models have enabled 3D generation from a single +image. However, current methods often produce suboptimal results for novel +views, with blurred textures and deviations from the reference image, limiting +their practical applications. In this paper, we introduce HiFi-123, a method +designed for high-fidelity and multi-view consistent 3D generation. Our +contributions are twofold: First, we propose a Reference-Guided Novel View +Enhancement (RGNV) technique that significantly improves the fidelity of +diffusion-based zero-shot novel view synthesis methods. Second, capitalizing on +the RGNV, we present a novel Reference-Guided State Distillation (RGSD) loss. +When incorporated into the optimization-based image-to-3D pipeline, our method +significantly improves 3D generation quality, achieving state-of-the-art +performance. Comprehensive evaluations demonstrate the effectiveness of our +approach over existing methods, both qualitatively and quantitatively. Video +results are available on the project page. + +
+
+ comment: Project Page: https://drexubery.github.io/HiFi-123/ +
+
+
+
+
+ + ♻ ☆ SVGDreamer: Text Guided SVG Generation with Diffusion Model CVPR 2024 + + +
+ Recently, text-guided scalable vector graphics (SVGs) synthesis has shown
+promise in domains such as iconography and sketch. However, existing
+text-to-SVG generation methods lack editability and struggle with visual
+quality and result diversity. To address these limitations, we propose a novel
+text-guided vector graphics synthesis method called SVGDreamer. SVGDreamer
+incorporates a semantic-driven image vectorization (SIVE) process that enables
+the decomposition of synthesis into foreground objects and background, thereby
+enhancing editability. Specifically, the SIVE process introduces
+attention-based primitive control and an attention-mask loss function for
+effective control and manipulation of individual elements. Additionally, we
+propose a Vectorized Particle-based Score Distillation (VPSD) approach to
+tackle the challenges of shape over-smoothing, color over-saturation, limited
+diversity in results, and slow convergence in existing text-to-SVG generation
+methods. VPSD models SVGs as distributions of control points and colors to
+counteract over-smoothing and over-saturation. Furthermore, VPSD leverages a
+reward model to reweight vector particles, which improves aesthetic appeal and
+accelerates convergence. Extensive experiments have been conducted to validate
+the effectiveness of SVGDreamer, demonstrating its superiority over baseline
+methods in terms of editability, visual quality, and diversity. The code and
+demo of SVGDreamer can be found at https://ximinng.github.io/SVGDreamer-project/
+
+
+ comment: Accepted by CVPR 2024. project link: + https://ximinng.github.io/SVGDreamer-project/ +
+
+
+
+
+ + ♻ ☆ Variational Bayes image restoration with compressive autoencoders + + +
+ Regularization of inverse problems is of paramount importance in
+computational imaging. The ability of neural networks to learn efficient image
+representations has been recently exploited to design powerful data-driven
+regularizers. While state-of-the-art plug-and-play methods rely on an implicit
+regularization provided by neural denoisers, alternative Bayesian approaches
+consider Maximum A Posteriori (MAP) estimation in the latent space of a
+generative model, thus with an explicit regularization. However,
+state-of-the-art deep generative models require a huge amount of training data
+compared to denoisers. Besides, their complexity hampers the optimization
+involved in latent MAP derivation. In this work, we first propose to use
+compressive autoencoders instead. These networks, which can be seen as
+variational autoencoders with a flexible latent prior, are smaller and easier
+to train than state-of-the-art generative models. As a second contribution, we
+introduce the Variational Bayes Latent Estimation (VBLE) algorithm, which
+performs latent estimation within the framework of variational inference.
+Thanks to a simple yet efficient parameterization of the variational posterior,
+VBLE allows for fast and easy (approximate) posterior sampling. Experimental
+results on the BSD and FFHQ image datasets demonstrate that VBLE reaches
+performance similar to state-of-the-art plug-and-play methods, while being able
+to quantify uncertainties faster than other existing posterior sampling
+techniques.
+
+
+
+
+
+ + ♻ ☆ Mask Grounding for Referring Image Segmentation CVPR2024 + + +
+ Referring Image Segmentation (RIS) is a challenging task that requires an +algorithm to segment objects referred by free-form language expressions. +Despite significant progress in recent years, most state-of-the-art (SOTA) +methods still suffer from considerable language-image modality gap at the pixel +and word level. These methods generally 1) rely on sentence-level language +features for language-image alignment and 2) lack explicit training supervision +for fine-grained visual grounding. Consequently, they exhibit weak object-level +correspondence between visual and language features. Without well-grounded +features, prior methods struggle to understand complex expressions that require +strong reasoning over relationships among multiple objects, especially when +dealing with rarely used or ambiguous clauses. To tackle this challenge, we +introduce a novel Mask Grounding auxiliary task that significantly improves +visual grounding within language features, by explicitly teaching the model to +learn fine-grained correspondence between masked textual tokens and their +matching visual objects. Mask Grounding can be directly used on prior RIS +methods and consistently bring improvements. Furthermore, to holistically +address the modality gap, we also design a cross-modal alignment loss and an +accompanying alignment module. These additions work synergistically with Mask +Grounding. With all these techniques, our comprehensive approach culminates in +MagNet (Mask-grounded Network), an architecture that significantly outperforms +prior arts on three key benchmarks (RefCOCO, RefCOCO+ and G-Ref), demonstrating +our method's effectiveness in addressing current limitations of RIS algorithms. +Our code and pre-trained weights will be released. + +
+
+ comment: Accepted by CVPR2024; Project page: + https://yxchng.github.io/projects/mask-grounding +
+
+
+
+
+ + ♻ ☆ Multimodal-Conditioned Latent Diffusion Models for Fashion Image Editing + + +
+ Fashion illustration is a crucial medium for designers to convey their +creative vision and transform design concepts into tangible representations +that showcase the interplay between clothing and the human body. In the context +of fashion design, computer vision techniques have the potential to enhance and +streamline the design process. Departing from prior research primarily focused +on virtual try-on, this paper tackles the task of multimodal-conditioned +fashion image editing. Our approach aims to generate human-centric fashion +images guided by multimodal prompts, including text, human body poses, garment +sketches, and fabric textures. To address this problem, we propose extending +latent diffusion models to incorporate these multiple modalities and modifying +the structure of the denoising network, taking multimodal prompts as input. To +condition the proposed architecture on fabric textures, we employ textual +inversion techniques and let diverse cross-attention layers of the denoising +network attend to textual and texture information, thus incorporating different +granularity conditioning details. Given the lack of datasets for the task, we +extend two existing fashion datasets, Dress Code and VITON-HD, with multimodal +annotations. Experimental evaluations demonstrate the effectiveness of our +proposed approach in terms of realism and coherence concerning the provided +multimodal inputs. + +
+
+
+
+
+ + ♻ ☆ LightIt: Illumination Modeling and Control for Diffusion Models + + +
+ We introduce LightIt, a method for explicit illumination control for image +generation. Recent generative methods lack lighting control, which is crucial +to numerous artistic aspects of image generation such as setting the overall +mood or cinematic appearance. To overcome these limitations, we propose to +condition the generation on shading and normal maps. We model the lighting with +single bounce shading, which includes cast shadows. We first train a shading +estimation module to generate a dataset of real-world images and shading pairs. +Then, we train a control network using the estimated shading and normals as +input. Our method demonstrates high-quality image generation and lighting +control in numerous scenes. Additionally, we use our generated dataset to train +an identity-preserving relighting model, conditioned on an image and a target +shading. Our method is the first that enables the generation of images with +controllable, consistent lighting and performs on par with specialized +relighting state-of-the-art methods. + +
+
+ comment: Project page: https://peter-kocsis.github.io/LightIt/ Video: + https://youtu.be/cCfSBD5aPLI +
+
+
+
+
+ + ♻ ☆ Fully automated workflow for the design of patient-specific orthopaedic + implants: application to total knee arthroplasty + + +
+ Arthroplasty is commonly performed to treat joint osteoarthritis, reducing
+pain and improving mobility. While arthroplasty has seen several technical
+improvements, a significant share of patients are still unsatisfied with their
+surgery. Personalised arthroplasty improves surgical outcomes; however, current
+solutions introduce delays, making them difficult to integrate into clinical
+routine. We propose a fully automated workflow to design patient-specific
+implants, presented for total knee arthroplasty (TKA), the most widely
+performed arthroplasty in the world today.
+ The proposed pipeline first uses artificial neural networks to segment the
+proximal and distal extremities of the femur and tibia. Then the full bones are
+reconstructed using augmented statistical shape models, combining shape and
+landmark information. Finally, 77 morphological parameters are computed to
+design patient-specific implants. The developed workflow has been trained using
+91 CT scans of the lower limb and evaluated on 41 manually segmented CT scans,
+in terms of accuracy and execution time.
+ The workflow accuracy was $0.4\pm0.2mm$ for the segmentation, $1.2\pm0.4mm$
+for the full bones reconstruction, and $2.8\pm2.2mm$ for the anatomical
+landmarks determination. The custom implants fitted the patients' anatomy with
+$0.6\pm0.2mm$ accuracy. The whole process from segmentation to implant design
+lasted about 5 minutes.
+ The proposed workflow allows for a fast and reliable personalisation of knee
+implants, directly from the patient CT image without requiring any manual
+intervention. It establishes a patient-specific pre-operative planning for TKA
+in a very short time, making it easily available for all patients. Combined
+with efficient implant manufacturing techniques, this solution could help
+answer the growing number of arthroplasties while reducing complications and
+improving the patients' satisfaction.
+
+
+
+
+
+ + ♻ ☆ denoiSplit: a method for joint image splitting and unsupervised + denoising + + +
+ In this work we present denoiSplit, a method to tackle a new analysis task,
+i.e. the challenge of joint semantic image splitting and unsupervised
+denoising. This dual approach has important applications in fluorescence
+microscopy, where semantic image splitting is widely needed but noise
+generally hinders the downstream analysis of image content. Image splitting
+involves dissecting an image into its distinguishable semantic structures. We
+show that the current state-of-the-art method for this task struggles in the
+presence of image noise, inadvertently also distributing the noise across the
+predicted outputs. The method we present here can deal with image noise by
+integrating an unsupervised denoising sub-task. This integration results in
+improved semantic image unmixing, even in the presence of notable and realistic
+levels of imaging noise. A key innovation in denoiSplit is the use of
+specifically formulated noise models and a suitable adjustment of the
+KL-divergence loss for the high-dimensional hierarchical latent space we are
+training. We showcase the performance of denoiSplit across 4 tasks on
+real-world microscopy images. Additionally, we perform qualitative and
+quantitative evaluations and compare results to existing benchmarks,
+demonstrating the effectiveness of using denoiSplit: a single Variational
+Splitting Encoder-Decoder (VSE) Network using two suitable noise models to
+jointly perform semantic splitting and denoising.
+
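+ As a rough illustration of the training objective such a joint task implies,
+the sketch below combines a per-pixel Gaussian noise-model likelihood with a
+down-weighted KL term. The fixed Gaussian noise model, the single latent, and
+the weights sigma and beta are placeholder assumptions; denoiSplit's actual
+noise models and hierarchical KL adjustment are defined in the paper and code.
+
+```python
+import math
+import torch
+
+def noise_model_nll(pred_clean, noisy_obs, sigma=0.1):
+    """Negative log-likelihood of the noisy observation under a Gaussian
+    noise model centred on the predicted clean signal (placeholder for the
+    specifically formulated noise models used by denoiSplit)."""
+    var = sigma ** 2
+    return 0.5 * (((noisy_obs - pred_clean) ** 2) / var
+                  + math.log(2 * math.pi * var)).mean()
+
+def joint_split_denoise_loss(pred_ch1, pred_ch2, noisy_input, mu, logvar,
+                             beta=1e-3):
+    """The two predicted clean channels should explain the noisy superimposed
+    input through the noise model; a down-weighted KL term stands in for the
+    adjusted KL of the hierarchical latent space."""
+    recon = noise_model_nll(pred_ch1 + pred_ch2, noisy_input)
+    kl = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
+    return recon + beta * kl
+
+# Illustrative shapes: two unmixed channels predicted from one noisy input.
+mu, logvar = torch.zeros(4, 32), torch.zeros(4, 32)
+loss = joint_split_denoise_loss(torch.rand(4, 1, 64, 64),
+                                torch.rand(4, 1, 64, 64),
+                                torch.rand(4, 1, 64, 64), mu, logvar)
+print(float(loss))
+```
+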
+
+
+
+
+ + ♻ ☆ Unraveling Instance Associations: A Closer Look for Audio-Visual + Segmentation + + +
+ Audio-visual segmentation (AVS) is a challenging task that involves +accurately segmenting sounding objects based on audio-visual cues. The +effectiveness of audio-visual learning critically depends on achieving accurate +cross-modal alignment between sound and visual objects. Successful audio-visual +learning requires two essential components: 1) a challenging dataset with +high-quality pixel-level multi-class annotated images associated with audio +files, and 2) a model that can establish strong links between audio information +and its corresponding visual object. However, these requirements are only +partially addressed by current methods, with training sets containing biased +audio-visual data, and models that generalise poorly beyond this biased +training set. In this work, we propose a new cost-effective strategy to build +challenging and relatively unbiased high-quality audio-visual segmentation +benchmarks. We also propose a new informative sample mining method for +audio-visual supervised contrastive learning to leverage discriminative +contrastive samples to enforce cross-modal understanding. We show empirical +results that demonstrate the effectiveness of our benchmark. Furthermore, +experiments conducted on existing AVS datasets and on our new benchmark show +that our method achieves state-of-the-art (SOTA) segmentation accuracy. + +
+
+ comment: Code is available at https://github.com/cyh-0/CAVP +
+
+
+
+
+ + ♻ ☆ FocusCLIP: Multimodal Subject-Level Guidance for Zero-Shot Transfer in + Human-Centric Tasks + + +
+ We propose FocusCLIP, integrating subject-level guidance--a specialized +mechanism for target-specific supervision--into the CLIP framework for improved +zero-shot transfer on human-centric tasks. Our novel contributions enhance CLIP +on both the vision and text sides. On the vision side, we incorporate ROI +heatmaps emulating human visual attention mechanisms to emphasize +subject-relevant image regions. On the text side, we introduce human pose +descriptions to provide rich contextual information. For human-centric tasks, +FocusCLIP is trained with images from the MPII Human Pose dataset. The proposed +approach surpassed CLIP by an average of 8.61% across five previously unseen +datasets covering three human-centric tasks. FocusCLIP achieved an average +accuracy of 33.65% compared to 25.04% by CLIP. We observed a 3.98% improvement +in activity recognition, a 14.78% improvement in age classification, and a +7.06% improvement in emotion recognition. Moreover, using our proposed +single-shot LLM prompting strategy, we release a high-quality MPII Pose +Descriptions dataset to encourage further research in multimodal learning for +human-centric tasks. Furthermore, we also demonstrate the effectiveness of our +subject-level supervision on non-human-centric tasks. FocusCLIP shows a 2.47% +improvement over CLIP in zero-shot bird classification using the CUB dataset. +Our findings emphasize the potential of integrating subject-level guidance with +general pretraining methods for enhanced downstream performance. + +
+
+
+
+
+ + ♻ ☆ Unleashing the Power of Self-Supervised Image Denoising: A Comprehensive + Review + + +
+ The advent of deep learning has brought a revolutionary transformation to +image denoising techniques. However, the persistent challenge of acquiring +noise-clean pairs for supervised methods in real-world scenarios remains +formidable, necessitating the exploration of more practical self-supervised +image denoising. This paper focuses on self-supervised image denoising methods +that offer effective solutions to address this challenge. Our comprehensive +review thoroughly analyzes the latest advancements in self-supervised image +denoising approaches, categorizing them into three distinct classes: General +methods, Blind Spot Network (BSN)-based methods, and Transformer-based methods. +For each class, we provide a concise theoretical analysis along with their +practical applications. To assess the effectiveness of these methods, we +present both quantitative and qualitative experimental results on various +datasets, utilizing classical algorithms as benchmarks. Additionally, we +critically discuss the current limitations of these methods and propose +promising directions for future research. By offering a detailed overview of +recent developments in self-supervised image denoising, this review serves as +an invaluable resource for researchers and practitioners in the field, +facilitating a deeper understanding of this emerging domain and inspiring +further advancements. + +
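+ To make the Blind Spot Network family covered by the review concrete, the
+sketch below shows the masking-based (Noise2Void-style) realization of the
+blind-spot idea: a few pixels are replaced by random neighbours and the loss is
+evaluated only at those positions, so the network cannot copy the noisy input.
+This is a generic illustration, not a particular method from the review.
+
+```python
+import torch
+import torch.nn as nn
+
+def blind_spot_batch(noisy, n_mask=64):
+    """Replace a few random pixels with random neighbours and remember where;
+    the loss is computed only at the masked positions."""
+    b, c, h, w = noisy.shape
+    inp, mask = noisy.clone(), torch.zeros_like(noisy)
+    ys = torch.randint(0, h, (b, n_mask))
+    xs = torch.randint(0, w, (b, n_mask))
+    for i in range(b):
+        ny = (ys[i] + torch.randint(-2, 3, (n_mask,))).clamp(0, h - 1)
+        nx = (xs[i] + torch.randint(-2, 3, (n_mask,))).clamp(0, w - 1)
+        inp[i, :, ys[i], xs[i]] = noisy[i, :, ny, nx]
+        mask[i, :, ys[i], xs[i]] = 1.0
+    return inp, mask
+
+net = nn.Sequential(nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(),
+                    nn.Conv2d(32, 1, 3, padding=1))  # toy denoiser
+noisy = torch.rand(4, 1, 64, 64)
+inp, mask = blind_spot_batch(noisy)
+pred = net(inp)
+loss = ((pred - noisy) ** 2 * mask).sum() / mask.sum()
+loss.backward()
+```
+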
+
+ comment: 24 pages +
+
+
+
+
+ + ♻ ☆ BiTT: Bi-directional Texture Reconstruction of Interacting Two Hands + from a Single Image CVPR 2024 + + +
+ Creating personalized hand avatars is important to offer a realistic
+experience to users on AR / VR platforms. While most prior studies focused on
+reconstructing 3D hand shapes, some recent work has tackled the reconstruction
+of hand textures on top of shapes. However, these methods are often limited to
+capturing pixels on the visible side of a hand, requiring diverse views of the
+hand in a video or multiple images as input. In this paper, we propose a novel
+method, BiTT (Bi-directional Texture reconstruction of Two hands), which is the
+first end-to-end trainable method for relightable, pose-free texture
+reconstruction of two interacting hands taking only a single RGB image as
+input, via three novel components: 1) bi-directional (left $\leftrightarrow$
+right) texture reconstruction using the texture symmetry of left / right hands,
+2) utilizing a texture parametric model for hand texture recovery, and 3) the
+overall coarse-to-fine stage pipeline for reconstructing personalized texture
+of two interacting hands. BiTT first estimates the scene light condition and
+albedo image from an input image, then reconstructs the texture of both hands
+through the texture parametric model and bi-directional texture reconstructor.
+In experiments using InterHand2.6M and RGB2Hands datasets, our method
+significantly outperforms state-of-the-art hand texture reconstruction methods
+quantitatively and qualitatively. The code is available at
+https://github.com/yunminjin2/BiTT
+
+
+ comment: Accepted by CVPR 2024, Project Page: + https://yunminjin2.github.io/projects/bitt/ +
+
+
+
+
+ + ♻ ☆ Toulouse Hyperspectral Data Set: a benchmark data set to assess + semi-supervised spectral representation learning and pixel-wise + classification techniques + + +
+ Airborne hyperspectral images can be used to map the land cover in large +urban areas, thanks to their very high spatial and spectral resolutions on a +wide spectral domain. While the spectral dimension of hyperspectral images is +highly informative of the chemical composition of the land surface, the use of +state-of-the-art machine learning algorithms to map the land cover has been +dramatically limited by the availability of training data. To cope with the +scarcity of annotations, semi-supervised and self-supervised techniques have +lately raised a lot of interest in the community. Yet, the publicly available +hyperspectral data sets commonly used to benchmark machine learning models are +not totally suited to evaluate their generalization performances due to one or +several of the following properties: a limited geographical coverage (which +does not reflect the spectral diversity in metropolitan areas), a small number +of land cover classes and a lack of appropriate standard train / test splits +for semi-supervised and self-supervised learning. Therefore, we release in this +paper the Toulouse Hyperspectral Data Set that stands out from other data sets +in the above-mentioned respects in order to meet key issues in spectral +representation learning and classification over large-scale hyperspectral +images with very few labeled pixels. Besides, we discuss and experiment +self-supervised techniques for spectral representation learning, including the +Masked Autoencoder, and establish a baseline for pixel-wise classification +achieving 85% overall accuracy and 77% F1 score. The Toulouse Hyperspectral +Data Set and our code are publicly available at +https://www.toulouse-hyperspectral-data-set.com and +https://www.github.com/Romain3Ch216/tlse-experiments, respectively. + +
+
+ comment: 17 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Geometric Prior Based Deep Human Point Cloud Geometry Compression + + +
+ The emergence of digital avatars has driven an exponential increase in the
+demand for human point clouds with realistic and intricate details. The
+compression of such data becomes challenging with overwhelming data amounts
+comprising millions of points. Herein, we leverage the human geometric prior
+for geometry redundancy removal in point clouds, greatly improving the
+compression performance. More specifically, the prior provides topological
+constraints as geometry initialization, allowing adaptive adjustments with a
+compact parameter set that could be represented with only a few bits.
+Therefore, we can envisage high-resolution human point clouds as a combination
+of geometric priors and structural deviations. The priors could first be
+derived with an aligned point cloud, and subsequently the difference of
+features is compressed into a compact latent code. The proposed framework can
+operate in a plug-and-play fashion with existing learning-based point cloud
+compression methods. Extensive experimental results show that our approach
+significantly improves the compression performance without deteriorating the
+quality, demonstrating its promise in a variety of applications.
+
+
+ comment: Accepted by TCSVT 2024 +
+
+
+
+
+ + ♻ ☆ Explaining CLIP's performance disparities on data from blind/low vision + users CVPR + + +
+ Large multi-modal models (LMMs) hold the potential to usher in a new era of +automated visual assistance for people who are blind or low vision (BLV). Yet, +these models have not been systematically evaluated on data captured by BLV +users. We address this by empirically assessing CLIP, a widely-used LMM likely +to underpin many assistive technologies. Testing 25 CLIP variants in a +zero-shot classification task, we find that their accuracy is 15 percentage +points lower on average for images captured by BLV users than web-crawled +images. This disparity stems from CLIP's sensitivities to 1) image content +(e.g. not recognizing disability objects as well as other objects); 2) image +quality (e.g. not being robust to lighting variation); and 3) text content +(e.g. not recognizing objects described by tactile adjectives as well as visual +ones). We delve deeper with a textual analysis of three common pre-training +datasets: LAION-400M, LAION-2B and DataComp-1B, showing that disability content +is rarely mentioned. We then provide three examples that illustrate how the +performance disparities extend to three downstream models underpinned by CLIP: +OWL-ViT, CLIPSeg and DALL-E2. We find that few-shot learning with as few as 5 +images can mitigate CLIP's quality-of-service disparities for BLV users in some +scenarios, which we discuss alongside a set of other possible mitigations. + +
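+ For reference, the standard zero-shot classification protocol being evaluated
+looks like the sketch below, using the openai `clip` package; the class names
+and prompt template are illustrative and do not reproduce the paper's
+25-variant setup or its BLV-specific data.
+
+```python
+import torch
+import clip  # pip install git+https://github.com/openai/CLIP.git
+from PIL import Image
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model, preprocess = clip.load("ViT-B/32", device=device)
+
+class_names = ["a can of soda", "a white cane", "a pill bottle"]  # illustrative
+text = clip.tokenize([f"a photo of {c}" for c in class_names]).to(device)
+image = preprocess(Image.open("example.jpg")).unsqueeze(0).to(device)
+
+with torch.no_grad():
+    image_feat = model.encode_image(image)
+    text_feat = model.encode_text(text)
+    image_feat = image_feat / image_feat.norm(dim=-1, keepdim=True)
+    text_feat = text_feat / text_feat.norm(dim=-1, keepdim=True)
+    probs = (100.0 * image_feat @ text_feat.T).softmax(dim=-1)
+
+print(class_names[probs.argmax().item()])
+```
+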
+
+ comment: Accepted at 2024 IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR) +
+
+
+
+
+ + ♻ ☆ Distributionally Generative Augmentation for Fair Facial Attribute + Classification CVPR 2024 + + +
+ Facial Attribute Classification (FAC) holds substantial promise in widespread
+applications. However, FAC models trained by traditional methodologies can be
+unfair by exhibiting accuracy inconsistencies across varied data
+subpopulations. This unfairness is largely attributed to bias in data, where
+some spurious attributes (e.g., Male) statistically correlate with the target
+attribute (e.g., Smiling). Most existing fairness-aware methods rely on the
+labels of spurious attributes, which may be unavailable in practice. This work
+proposes a novel, generation-based two-stage framework to train a fair FAC
+model on biased data without additional annotation. Initially, we identify the
+potential spurious attributes based on generative models. Notably, this step
+enhances interpretability by explicitly showing the spurious attributes in
+image space. Following this, for each image, we first edit the spurious
+attributes with a random degree sampled from a uniform distribution, while
+keeping the target attribute unchanged. Then we train a fair FAC model by
+fostering model invariance to these augmentations. Extensive experiments on
+three common datasets demonstrate the effectiveness of our method in promoting
+fairness in FAC without compromising accuracy. Code is available at
+https://github.com/heqianpei/DiGA.
+
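+ The second-stage idea, editing only the spurious attribute to a random degree
+and penalizing prediction changes, can be sketched as below. The generative
+editor is left as a hypothetical `edit_spurious` callable and the loss weights
+are assumptions; the actual augmentation and training recipe are in the DiGA
+repository.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def fairness_invariance_loss(model, images, labels, edit_spurious):
+    """`edit_spurious(images, degree)` is a hypothetical generative editor
+    that alters only the identified spurious attribute by `degree` in [0, 1],
+    keeping the target attribute unchanged."""
+    degree = torch.rand(images.size(0), device=images.device)
+    edited = edit_spurious(images, degree)
+    logits_orig, logits_edit = model(images), model(edited)
+    cls_loss = (F.cross_entropy(logits_orig, labels)
+                + F.cross_entropy(logits_edit, labels))
+    # Encourage predictions to be invariant to the spurious edit.
+    invariance = F.kl_div(F.log_softmax(logits_edit, dim=1),
+                          F.softmax(logits_orig, dim=1),
+                          reduction="batchmean")
+    return cls_loss + invariance
+```
+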
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Contrastive Pre-Training with Multi-View Fusion for No-Reference Point + Cloud Quality Assessment + + +
+ No-reference point cloud quality assessment (NR-PCQA) aims to automatically
+evaluate the perceptual quality of distorted point clouds without an available
+reference; such methods have achieved tremendous improvements due to the
+utilization of deep neural networks. However, learning-based NR-PCQA methods
+suffer from the scarcity of labeled data and usually perform suboptimally in
+terms of generalization. To solve the problem, we propose a novel contrastive
+pre-training framework tailored for PCQA (CoPA), which enables the pre-trained
+model to learn quality-aware representations from unlabeled data. To obtain
+anchors in the representation space, we project point clouds with different
+distortions into images and randomly mix their local patches to form mixed
+images with multiple distortions. Utilizing the generated anchors, we constrain
+the pre-training process via a quality-aware contrastive loss following the
+philosophy that perceptual quality is closely related to both content and
+distortion. Furthermore, in the model fine-tuning stage, we propose a
+semantic-guided multi-view fusion module to effectively integrate the features
+of projected images from multiple perspectives. Extensive experiments show that
+our method outperforms the state-of-the-art PCQA methods on popular benchmarks.
+Further investigations demonstrate that CoPA can also benefit existing
+learning-based PCQA models.
+
+
+
+
+
+ + ♻ ☆ Differentiable Point-based Inverse Rendering + + +
+ We present differentiable point-based inverse rendering, DPIR, an +analysis-by-synthesis method that processes images captured under diverse +illuminations to estimate shape and spatially-varying BRDF. To this end, we +adopt point-based rendering, eliminating the need for multiple samplings per +ray, typical of volumetric rendering, thus significantly enhancing the speed of +inverse rendering. To realize this idea, we devise a hybrid point-volumetric +representation for geometry and a regularized basis-BRDF representation for +reflectance. The hybrid geometric representation enables fast rendering through +point-based splatting while retaining the geometric details and stability +inherent to SDF-based representations. The regularized basis-BRDF mitigates the +ill-posedness of inverse rendering stemming from limited light-view angular +samples. We also propose an efficient shadow detection method using point-based +shadow map rendering. Our extensive evaluations demonstrate that DPIR +outperforms prior works in terms of reconstruction accuracy, computational +efficiency, and memory footprint. Furthermore, our explicit point-based +representation and rendering enables intuitive geometry and reflectance +editing. + +
+
+
+
+
+ + ♻ ☆ HallusionBench: An Advanced Diagnostic Suite for Entangled Language + Hallucination and Visual Illusion in Large Vision-Language Models CVPR 2024 + + +
+ We introduce HallusionBench, a comprehensive benchmark designed for the +evaluation of image-context reasoning. This benchmark presents significant +challenges to advanced large visual-language models (LVLMs), such as +GPT-4V(Vision), Gemini Pro Vision, Claude 3, and LLaVA-1.5, by emphasizing +nuanced understanding and interpretation of visual data. The benchmark +comprises 346 images paired with 1129 questions, all meticulously crafted by +human experts. We introduce a novel structure for these visual questions +designed to establish control groups. This structure enables us to conduct a +quantitative analysis of the models' response tendencies, logical consistency, +and various failure modes. In our evaluation on HallusionBench, we benchmarked +15 different models, highlighting a 31.42% question-pair accuracy achieved by +the state-of-the-art GPT-4V. Notably, all other evaluated models achieve +accuracy below 16%. Moreover, our analysis not only highlights the observed +failure modes, including language hallucination and visual illusion, but also +deepens an understanding of these pitfalls. Our comprehensive case studies +within HallusionBench shed light on the challenges of hallucination and +illusion in LVLMs. Based on these insights, we suggest potential pathways for +their future improvement. The benchmark and codebase can be accessed at +https://github.com/tianyi-lab/HallusionBench. + +
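+ A small sketch of a question-pair (control-group) accuracy metric of this
+kind is given below: a model is credited only if it answers every question in
+a group correctly. The record fields are assumed names, not the benchmark's
+actual schema, which is defined in the HallusionBench repository.
+
+```python
+from collections import defaultdict
+
+def question_group_accuracy(records):
+    """records: iterable of dicts like
+    {"group_id": str, "prediction": str, "answer": str}.
+    A group counts as correct only if all of its questions are correct."""
+    groups = defaultdict(list)
+    for r in records:
+        groups[r["group_id"]].append(r["prediction"] == r["answer"])
+    return sum(all(v) for v in groups.values()) / max(len(groups), 1)
+
+demo = [
+    {"group_id": "g1", "prediction": "yes", "answer": "yes"},
+    {"group_id": "g1", "prediction": "no", "answer": "yes"},
+    {"group_id": "g2", "prediction": "no", "answer": "no"},
+]
+print(question_group_accuracy(demo))  # 0.5
+```
+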
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Time-Efficient and Identity-Consistent Virtual Try-On Using A Variant of + Altered Diffusion Models + + +
+ This study discusses the critical issues of Virtual Try-On in contemporary
+e-commerce and the prospective metaverse, emphasizing the challenges of
+preserving intricate texture details and distinctive features of the target
+person and the clothes in various scenarios, such as clothing texture and
+identity characteristics like tattoos or accessories. In addition to the
+fidelity of the synthesized images, the efficiency of the synthesis process
+presents a significant hurdle. Various existing approaches are explored,
+highlighting the limitations and unresolved aspects, e.g., identity information
+omission, uncontrollable artifacts, and low synthesis speed. It then proposes a
+novel diffusion-based solution that addresses garment texture preservation and
+user identity retention during virtual try-on. The proposed network comprises
+two primary modules: a warping module that aligns clothing with individual
+features, and a try-on module that refines the attire and generates missing
+parts, integrated with a mask-aware post-processing technique that ensures the
+integrity of the individual's identity. It demonstrates impressive results,
+surpassing the state-of-the-art in speed by nearly 20 times during inference,
+with superior fidelity in qualitative assessments. Quantitative evaluations
+confirm comparable performance with the recent SOTA method on the VITON-HD and
+Dresscode datasets.
+
+
+
+
+
+ + ♻ ☆ Mipha: A Comprehensive Overhaul of Multimodal Assistant with Small + Language Models + + +
+ Multimodal Large Language Models (MLLMs) have showcased impressive skills in +tasks related to visual understanding and reasoning. Yet, their widespread +application faces obstacles due to the high computational demands during both +the training and inference phases, restricting their use to a limited audience +within the research and user communities. In this paper, we investigate the +design aspects of Multimodal Small Language Models (MSLMs) and propose an +efficient multimodal assistant named Mipha, which is designed to create synergy +among various aspects: visual representation, language models, and optimization +strategies. We show that without increasing the volume of training data, our +Mipha-3B outperforms the state-of-the-art large MLLMs, especially +LLaVA-1.5-13B, on multiple benchmarks. Through detailed discussion, we provide +insights and guidelines for developing strong MSLMs that rival the capabilities +of MLLMs. Our code is available at https://github.com/zhuyiche/llava-phi. + +
+
+
+
+
+ + ♻ ☆ Dispersed Structured Light for Hyperspectral 3D Imaging + + +
+ Hyperspectral 3D imaging aims to acquire both depth and spectral information
+of a scene. However, existing methods are either prohibitively expensive and
+bulky or compromise on spectral and depth accuracy. In this work, we present
+Dispersed Structured Light (DSL), a cost-effective and compact method for
+accurate hyperspectral 3D imaging. DSL modifies a traditional projector-camera
+system by placing a sub-millimeter thick diffraction grating film in front of
+the projector. The grating disperses structured light based on light
+wavelength. To utilize the dispersed structured light, we devise a model for
+dispersive projection image formation and a per-pixel hyperspectral 3D
+reconstruction method. We validate DSL by instantiating a compact experimental
+prototype. DSL achieves spectral accuracy of 18.8nm full-width half-maximum
+(FWHM) and depth error of 1mm. We demonstrate that DSL outperforms prior work
+on practical hyperspectral 3D imaging. DSL promises accurate and practical
+hyperspectral 3D imaging for diverse application domains, including computer
+vision and graphics, cultural heritage, geology, and biology.
+
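+ The wavelength-dependent dispersion that DSL exploits follows the standard
+grating equation $d\sin\theta_m = m\lambda$. The short sketch below, with an
+assumed 1 micron grating pitch (not the prototype's actual film), shows how the
+first diffraction order spreads across the visible range.
+
+```python
+import numpy as np
+
+def first_order_angle(wavelength_nm, pitch_um=1.0, order=1):
+    """Diffraction angle (degrees) from d * sin(theta) = m * lambda at
+    normal incidence; the 1 micron pitch is an assumed value."""
+    s = order * (wavelength_nm * 1e-9) / (pitch_um * 1e-6)
+    return np.degrees(np.arcsin(np.clip(s, -1.0, 1.0)))
+
+for wl in (450, 550, 650):  # blue, green, red
+    print(wl, "nm ->", round(float(first_order_angle(wl)), 2), "deg")
+# Each wavelength lands at a different angle, so every projected
+# structured-light pattern is dispersed across the scene by wavelength.
+```
+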
+
+
+
+
+ + ♻ ☆ PIA: Your Personalized Image Animator via Plug-and-Play Modules in + Text-to-Image Models + + +
+ Recent advancements in personalized text-to-image (T2I) models have
+revolutionized content creation, empowering non-experts to generate stunning
+images with unique styles. While promising, adding realistic motions into these
+personalized images by text poses significant challenges in preserving distinct
+styles, high-fidelity details, and achieving motion controllability by text. In
+this paper, we present PIA, a Personalized Image Animator that excels in
+aligning with condition images, achieving motion controllability by text, and
+compatibility with various personalized T2I models without specific tuning. To
+achieve these goals, PIA builds upon a base T2I model with well-trained
+temporal alignment layers, allowing for the seamless transformation of any
+personalized T2I model into an image animation model. A key component of PIA is
+the introduction of the condition module, which utilizes the condition frame
+and inter-frame affinity as input to transfer appearance information guided by
+the affinity hint for individual frame synthesis in the latent space. This
+design mitigates the challenges of appearance-related image alignment and
+allows for a stronger focus on aligning with motion-related guidance.
+
+
+ comment: Project page: https://pi-animator.github.io/ +
+
+
+
+
+ + ♻ ☆ I-PHYRE: Interactive Physical Reasoning ICLR 2024 + + +
+ Current evaluation protocols predominantly assess physical reasoning in +stationary scenes, creating a gap in evaluating agents' abilities to interact +with dynamic events. While contemporary methods allow agents to modify initial +scene configurations and observe consequences, they lack the capability to +interact with events in real time. To address this, we introduce I-PHYRE, a +framework that challenges agents to simultaneously exhibit intuitive physical +reasoning, multi-step planning, and in-situ intervention. Here, intuitive +physical reasoning refers to a quick, approximate understanding of physics to +address complex problems; multi-step denotes the need for extensive sequence +planning in I-PHYRE, considering each intervention can significantly alter +subsequent choices; and in-situ implies the necessity for timely object +manipulation within a scene, where minor timing deviations can result in task +failure. We formulate four game splits to scrutinize agents' learning and +generalization of essential principles of interactive physical reasoning, +fostering learning through interaction with representative scenarios. Our +exploration involves three planning strategies and examines several supervised +and reinforcement agents' zero-shot generalization proficiency on I-PHYRE. The +outcomes highlight a notable gap between existing learning algorithms and human +performance, emphasizing the imperative for more research in enhancing agents +with interactive physical reasoning capabilities. The environment and baselines +will be made publicly available. + +
+
+ comment: 21 pages, ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Solving the bongard-logo problem by modeling a probabilistic model + + +
+ Abstract reasoning problems challenge the perceptual and cognitive abilities +of AI algorithms, demanding deeper pattern discernment and inductive reasoning +beyond explicit image features. This study introduces PMoC, a tailored +probability model for the Bongard-Logo problem, achieving high reasoning +accuracy by constructing independent probability models. Additionally, we +present Pose-Transformer, an enhanced Transformer-Encoder designed for complex +abstract reasoning tasks, including Bongard-Logo, RAVEN, I-RAVEN, and PGM. +Pose-Transformer incorporates positional information learning, inspired by +capsule networks' pose matrices, enhancing its focus on local positional +relationships in image data processing. When integrated with PMoC, it further +improves reasoning accuracy. Our approach effectively addresses reasoning +difficulties associated with abstract entities' positional changes, +outperforming previous models on the OIG, D3$\times$3 subsets of RAVEN, and PGM +databases. This research contributes to advancing AI's capabilities in abstract +reasoning and cognitive pattern recognition. + +
+
+ comment: 14 pages, 11 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Triple-CFN: Restructuring Conceptual Spaces for Enhancing Abstract + Reasoning process + + +
+ Abstract reasoning problems pose significant challenges to artificial +intelligence algorithms, demanding cognitive capabilities beyond those required +for perception tasks. This study introduces the Triple-CFN approach to tackle +the Bongard-Logo problem, achieving notable reasoning accuracy by implicitly +reorganizing the concept space of conflicting instances. Additionally, the +Triple-CFN paradigm proves effective for the RPM problem with necessary +modifications, yielding competitive results. To further enhance performance on +the RPM issue, we develop the Meta Triple-CFN network, which explicitly +structures the problem space while maintaining interpretability on progressive +patterns. The success of Meta Triple-CFN is attributed to its paradigm of +modeling the conceptual space, equivalent to normalizing reasoning information. +Based on this ideology, we introduce the Re-space layer, enhancing the +performance of both Meta Triple-CFN and Triple-CFN. This paper aims to +contribute to advancements in machine intelligence by exploring innovative +network designs for addressing abstract reasoning problems, paving the way for +further breakthroughs in this domain. + +
+
+ comment: 14 pages, 14 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ D4C glove-train: solving the RPM and Bongard-logo problem by + distributing and Circumscribing concepts + + +
+ This paper achieves noteworthy progress in the realm of abstract reasoning, +particularly in addressing Raven's Progressive Matrices (RPM) and Bongard-Logo +challenges. Initially, we introduce Lico-Net, a novel baseline model that +resolves RPM problems with remarkable accuracy. Leveraging this foundation, we +advance with the D3C approach, which advocates representing the underlying +concepts in abstract reasoning problems through distributions. This perspective +enhances the performance of both Lico-Net and a baseline model excelling in +Bongard-Logo tasks. To bolster the computational efficiency of D3C, we present +the D3C-cos variant, offering a streamlined yet precise solution. Furthermore, +we propose the D2C method, redefining conceptual boundaries within these +domains and bridging the divide between high-level abstractions and their +lower-dimensional counterparts. Finally, we extend our methodology to D4C, +employing adversarial techniques to refine conceptual boundaries further and +demonstrate substantial improvements in both RPM and Bongard-Logo challenges. +Overall, our contributions present a fresh outlook and practical advancements +in the field of abstract reasoning. + +
+
+ comment: 18 pages, 19 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ CiPR: An Efficient Framework with Cross-instance Positive Relations for + Generalized Category Discovery + + +
+ We tackle the issue of generalized category discovery (GCD). GCD considers +the open-world problem of automatically clustering a partially labelled +dataset, in which the unlabelled data may contain instances from both novel +categories and labelled classes. In this paper, we address the GCD problem with +an unknown category number for the unlabelled data. We propose a framework, +named CiPR, to bootstrap the representation by exploiting Cross-instance +Positive Relations in the partially labelled data for contrastive learning, +which have been neglected in existing methods. To obtain reliable +cross-instance relations to facilitate representation learning, we introduce a +semi-supervised hierarchical clustering algorithm, named selective neighbor +clustering (SNC), which can produce a clustering hierarchy directly from the +connected components of a graph constructed from selective neighbors. We +further present a method to estimate the unknown class number using SNC with a +joint reference score that considers clustering indexes of both labelled and +unlabelled data, and extend SNC to allow label assignment for the unlabelled +instances with a given class number. We thoroughly evaluate our framework on +public generic image recognition datasets and challenging fine-grained +datasets, and establish a new state-of-the-art. Code: +https://github.com/haoosz/CiPR + +
+
+ comment: Accepted to TMLR. Code: https://github.com/haoosz/CiPR +
+
+
+
+
+ + ♻ ☆ HalluciDoctor: Mitigating Hallucinatory Toxicity in Visual Instruction + Data CVPR 2024 + + +
+ Multi-modal Large Language Models (MLLMs) tuned on machine-generated +instruction-following data have demonstrated remarkable performance in various +multi-modal understanding and generation tasks. However, the hallucinations +inherent in machine-generated data, which could lead to hallucinatory outputs +in MLLMs, remain under-explored. This work aims to investigate various +hallucinations (i.e., object, relation, attribute hallucinations) and mitigate +those hallucinatory toxicities in large-scale machine-generated visual +instruction datasets. Drawing on the human ability to identify factual errors, +we present a novel hallucination detection and elimination framework, +HalluciDoctor, based on the cross-checking paradigm. We use our framework to +identify and eliminate hallucinations in the training data automatically. +Interestingly, HalluciDoctor also indicates that spurious correlations arising +from long-tail object co-occurrences contribute to hallucinations. Based on +that, we execute counterfactual visual instruction expansion to balance data +distribution, thereby enhancing MLLMs' resistance to hallucinations. +Comprehensive experiments on hallucination evaluation benchmarks show that our +method successfully mitigates 44.6% hallucinations relatively and maintains +competitive performance compared to LLaVA. The data and code for this paper are +publicly available. \url{https://github.com/Yuqifan1117/HalluciDoctor}. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ W-HMR: Human Mesh Recovery in World Space with Weak-supervised Camera + Calibration and Orientation Correction + + +
+ For a long time, in reconstructing 3D human bodies from monocular images, +most methods opted to simplify the task by minimizing the influence of the +camera. Using a coarse focal length setting results in the reconstructed bodies +not aligning well with distorted images. Ignoring camera rotation leads to an +unrealistic reconstructed body pose in world space. Consequently, the +application scenarios of existing methods are confined to controlled +environments. When confronted with complex and diverse in-the-wild images, they +struggle to achieve accurate and reasonable reconstruction in world space. To +address the above issues, we propose W-HMR, which decouples global body +recovery into camera calibration, local body recovery, and global body +orientation correction. We design the first weak-supervised camera calibration +method for body distortion, eliminating dependence on focal length labels and +achieving finer mesh-image alignment. We propose a novel orientation correction +module to allow the reconstructed human body to remain normal in world space. +Decoupling body orientation and body pose enables our model to consider the +accuracy in camera coordinate and the reasonableness in world coordinate +simultaneously, expanding the range of applications. As a result, W-HMR +achieves high-quality reconstruction in dual coordinate systems, particularly +in challenging scenes. Codes and demos have been released on the project page +https://yw0208.github.io/w-hmr/. + +
+
+ comment: Project Page: https://yw0208.github.io/w-hmr/ +
+
+
+
+
+ + ♻ ☆ When Semantic Segmentation Meets Frequency Aliasing ICLR 2024 + + +
+ Despite recent advancements in semantic segmentation, where and what pixels +are hard to segment remains largely unexplored. Existing research only +separates an image into easy and hard regions and empirically observes the +latter are associated with object boundaries. In this paper, we conduct a +comprehensive analysis of hard pixel errors, categorizing them into three +types: false responses, merging mistakes, and displacements. Our findings +reveal a quantitative association between hard pixels and aliasing, which is +distortion caused by the overlapping of frequency components in the Fourier +domain during downsampling. To identify the frequencies responsible for +aliasing, we propose using the equivalent sampling rate to calculate the +Nyquist frequency, which marks the threshold for aliasing. Then, we introduce +the aliasing score as a metric to quantify the extent of aliasing. While +positively correlated with the proposed aliasing score, three types of hard +pixels exhibit different patterns. Here, we propose two novel de-aliasing +filter (DAF) and frequency mixing (FreqMix) modules to alleviate aliasing +degradation by accurately removing or adjusting frequencies higher than the +Nyquist frequency. The DAF precisely removes the frequencies responsible for +aliasing before downsampling, while the FreqMix dynamically selects +high-frequency components within the encoder block. Experimental results +demonstrate consistent improvements in semantic segmentation and low-light +instance segmentation tasks. The code is available at: +https://github.com/Linwei-Chen/Seg-Aliasing. + +
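+ The underlying signal-processing fact is that downsampling by a factor s
+lowers the Nyquist limit to 1/(2s) cycles per original pixel, and any energy
+above it folds back as aliasing. The sketch below applies a fixed
+Fourier-domain low-pass at that threshold before subsampling; it conveys only
+the intuition that the learned DAF module refines, and uses the plain
+downsampling factor where the paper derives the threshold from an equivalent
+sampling rate.
+
+```python
+import torch
+
+def lowpass_before_downsample(x, factor=2):
+    """x: (B, C, H, W). Zero out Fourier components above the Nyquist
+    frequency of the downsampled grid, then subsample."""
+    b, c, h, w = x.shape
+    X = torch.fft.fftshift(torch.fft.fft2(x), dim=(-2, -1))
+    fy = torch.fft.fftshift(torch.fft.fftfreq(h)).view(1, 1, h, 1)
+    fx = torch.fft.fftshift(torch.fft.fftfreq(w)).view(1, 1, 1, w)
+    nyquist = 0.5 / factor  # cycles/pixel representable after downsampling
+    keep = ((fy.abs() <= nyquist) & (fx.abs() <= nyquist)).float()
+    x_filt = torch.fft.ifft2(
+        torch.fft.ifftshift(X * keep, dim=(-2, -1))).real
+    return x_filt[..., ::factor, ::factor]
+
+x = torch.randn(1, 3, 64, 64)
+print(lowpass_before_downsample(x).shape)  # torch.Size([1, 3, 32, 32])
+```
+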
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Cell Variational Information Bottleneck Network + + +
+ In this work, we propose Cell Variational Information Bottleneck Network
+(cellVIB), a convolutional neural network using an information bottleneck
+mechanism, which can be combined with the latest feedforward network
+architectures in an end-to-end training scheme. Our Cell Variational
+Information Bottleneck Network is constructed by stacking VIB cells, which
+generate feature maps with uncertainty. As layers go deeper, the regularization
+effect gradually increases, instead of directly adding excessive regularization
+constraints to the output layer of the model as in Deep VIB. Under each VIB
+cell, the feedforward process learns an independent mean term and a standard
+deviation term, and predicts the Gaussian distribution based on them. The
+feedback process is based on the reparameterization trick for effective
+training. This work performs an extensive analysis on the MNIST dataset to
+verify the effectiveness of each VIB cell, and provides an insightful analysis
+on how the VIB cells affect mutual information. Experiments conducted on
+CIFAR-10 also show that our cellVIB is robust against noisy labels during
+training and against corrupted images during testing. Then, we validate our
+method on the PACS dataset, whose results show that the VIB cells can
+significantly improve the generalization performance of the basic model.
+Finally, in a more complex representation learning task, face recognition, our
+network structure has also achieved very competitive results.
+
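+ A single VIB-style cell as described above can be sketched as follows: the
+feedforward path predicts a mean and a standard deviation per feature map, a
+sample is drawn with the reparameterization trick, and the KL divergence to a
+standard normal acts as the per-cell regularizer. Layer sizes, the clamp range,
+and the KL weight are illustrative, not the paper's settings.
+
+```python
+import torch
+import torch.nn as nn
+
+class VIBCell(nn.Module):
+    """One information-bottleneck cell: feature map -> (mu, sigma) -> sample.
+    Returns the stochastic feature and its KL term against N(0, I)."""
+    def __init__(self, channels: int):
+        super().__init__()
+        self.mu = nn.Conv2d(channels, channels, 3, padding=1)
+        self.log_sigma = nn.Conv2d(channels, channels, 3, padding=1)
+
+    def forward(self, x):
+        mu = self.mu(x)
+        log_sigma = self.log_sigma(x).clamp(-6, 2)   # keep sigma well-behaved
+        sigma = log_sigma.exp()
+        z = mu + sigma * torch.randn_like(mu)        # reparameterization trick
+        kl = 0.5 * (mu.pow(2) + sigma.pow(2) - 2 * log_sigma - 1).mean()
+        return z, kl
+
+cell = VIBCell(16)
+z, kl = cell(torch.randn(2, 16, 32, 32))
+loss = z.mean() + 1e-3 * kl   # downstream loss + weighted KL (illustrative)
+loss.backward()
+```
+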
+
+
+
+
+ + ♻ ☆ Don't Judge by the Look: Towards Motion Coherent Video Representation ICLR2024 + + +
+ Current training pipelines in object recognition neglect Hue Jittering when +doing data augmentation as it not only brings appearance changes that are +detrimental to classification, but also the implementation is inefficient in +practice. In this study, we investigate the effect of hue variance in the +context of video understanding and find this variance to be beneficial since +static appearances are less important in videos that contain motion +information. Based on this observation, we propose a data augmentation method +for video understanding, named Motion Coherent Augmentation (MCA), that +introduces appearance variation in videos and implicitly encourages the model +to prioritize motion patterns, rather than static appearances. Concretely, we +propose an operation SwapMix to efficiently modify the appearance of video +samples, and introduce Variation Alignment (VA) to resolve the distribution +shift caused by SwapMix, enforcing the model to learn appearance invariant +representations. Comprehensive empirical evaluation across various +architectures and different datasets solidly validates the effectiveness and +generalization ability of MCA, and the application of VA in other augmentation +methods. Code is available at https://github.com/BeSpontaneous/MCA-pytorch. + +
+
+ comment: Accepted by ICLR2024 +
+
+
+
+
+ + ♻ ☆ Cartoon Hallucinations Detection: Pose-aware In Context Visual Learning + + +
+ Large-scale Text-to-Image (TTI) models have become a common approach for +generating training data in various generative fields. However, visual +hallucinations, which contain perceptually critical defects, remain a concern, +especially in non-photorealistic styles like cartoon characters. We propose a +novel visual hallucination detection system for cartoon character images +generated by TTI models. Our approach leverages pose-aware in-context visual +learning (PA-ICVL) with Vision-Language Models (VLMs), utilizing both RGB +images and pose information. By incorporating pose guidance from a fine-tuned +pose estimator, we enable VLMs to make more accurate decisions. Experimental +results demonstrate significant improvements in identifying visual +hallucinations compared to baseline methods relying solely on RGB images. This +research advances TTI models by mitigating visual hallucinations, expanding +their potential in non-photorealistic domains. + +
+
+ comment: 11 pages, 12 figures, 1 table, Project page: + https://gh-bumsookim.github.io/Cartoon-Hallucinations-Detection/ +
+
+
+
+
+ + ♻ ☆ MMA-Diffusion: MultiModal Attack on Diffusion Models CVPR 2024 + + +
+ In recent years, Text-to-Image (T2I) models have seen remarkable +advancements, gaining widespread adoption. However, this progress has +inadvertently opened avenues for potential misuse, particularly in generating +inappropriate or Not-Safe-For-Work (NSFW) content. Our work introduces +MMA-Diffusion, a framework that presents a significant and realistic threat to +the security of T2I models by effectively circumventing current defensive +measures in both open-source models and commercial online services. Unlike +previous approaches, MMA-Diffusion leverages both textual and visual modalities +to bypass safeguards like prompt filters and post-hoc safety checkers, thus +exposing and highlighting the vulnerabilities in existing defense mechanisms. + +
+
+ comment: CVPR 2024. Code is available at + https://github.com/yangyijune/MMA-Diffusion +
+
+
+
+
+ + ♻ ☆ Noisy-Correspondence Learning for Text-to-Image Person Re-identification + + +
+ Text-to-image person re-identification (TIReID) is a compelling topic in the
+cross-modal community, which aims to retrieve the target person based on a
+textual query. Although numerous TIReID methods have been proposed and achieved
+promising performance, they implicitly assume the training image-text pairs are
+correctly aligned, which is not always the case in real-world scenarios. In
+practice, image-text pairs are inevitably under-correlated or even falsely
+correlated, a.k.a. noisy correspondence (NC), due to the low quality of the
+images and annotation errors. To address this problem, we propose a novel
+Robust Dual Embedding method (RDE) that can learn robust visual-semantic
+associations even with NC. Specifically, RDE consists of two main components:
+1) A Confident Consensus Division (CCD) module that leverages the dual-grained
+decisions of dual embedding modules to obtain a consensus set of clean training
+data, which enables the model to learn correct and reliable visual-semantic
+associations. 2) A Triplet Alignment Loss (TAL) that relaxes the conventional
+Triplet Ranking loss with the hardest negative samples to a log-exponential
+upper bound over all negative ones, thus preventing model collapse under NC
+while still focusing on hard negative samples for promising performance. We
+conduct extensive experiments on three public benchmarks, namely CUHK-PEDES,
+ICFG-PEDES, and RSTPReID, to evaluate the performance and robustness of our
+RDE. Our method achieves state-of-the-art results both with and without
+synthetic noisy correspondences on all three datasets. Code is available at
+https://github.com/QinYang79/RDE.
+
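+ The general idea behind relaxing a hardest-negative triplet loss to a
+log-exponential bound over all negatives can be sketched as below; the margin
+and temperature values are illustrative and this is not TAL's exact
+formulation, which is given in the paper and repository.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def hardest_triplet(sim, margin=0.2):
+    """sim: (B, B) image-text similarities with positives on the diagonal;
+    conventional triplet ranking loss with the hardest in-batch negative."""
+    pos = sim.diag().unsqueeze(1)
+    neg = sim.masked_fill(torch.eye(sim.size(0), dtype=torch.bool), -1e4)
+    hardest = neg.max(dim=1, keepdim=True).values
+    return (margin - pos + hardest).clamp(min=0).mean()
+
+def log_exp_triplet(sim, margin=0.2, tau=10.0):
+    """Smooth upper bound over all negatives:
+    (1/tau) * log(1 + sum_n exp(tau * (margin - pos + neg_n)))
+    >= max(0, margin - pos + max_n neg_n)."""
+    pos = sim.diag().unsqueeze(1)
+    gaps = tau * (margin - pos + sim)
+    gaps = gaps.masked_fill(torch.eye(sim.size(0), dtype=torch.bool),
+                            float("-inf"))
+    return (F.softplus(torch.logsumexp(gaps, dim=1)) / tau).mean()
+
+sim = torch.randn(8, 8) * 0.1 + torch.eye(8) * 0.5
+print(hardest_triplet(sim).item(), log_exp_triplet(sim).item())
+```
+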
+
+
+
+
+ + ♻ ☆ CRS-Diff: Controllable Generative Remote Sensing Foundation Model + + +
+ The emergence of diffusion models has revolutionized the field of image +generation, providing new methods for creating high-quality, high-resolution +images across various applications. However, the potential of these models for +generating domain-specific images, particularly remote sensing (RS) images, +remains largely untapped. RS images that are notable for their high resolution, +extensive coverage, and rich information content, bring new challenges that +general diffusion models may not adequately address. This paper proposes +CRS-Diff, a pioneering diffusion modeling framework specifically tailored for +generating remote sensing imagery, leveraging the inherent advantages of +diffusion models while integrating advanced control mechanisms to ensure that +the imagery is not only visually clear but also enriched with geographic and +temporal information. The model integrates global and local control inputs, +enabling precise combinations of generation conditions to refine the generation +process. A comprehensive evaluation of CRS-Diff has demonstrated its superior +capability to generate RS imagery both in a single condition and multiple +conditions compared with previous methods in terms of image quality and +diversity. + +
+
+
+
+
+ + ♻ ☆ Telling Left from Right: Identifying Geometry-Aware Semantic + Correspondence CVPR 24 + + +
+ While pre-trained large-scale vision models have shown significant promise
+for semantic correspondence, their features often struggle to grasp the
+geometry and orientation of instances. This paper identifies the importance of
+being geometry-aware for semantic correspondence and reveals a limitation of
+the features of current foundation models under simple post-processing. We show
+that incorporating this information can markedly enhance semantic
+correspondence performance with simple but effective solutions in both
+zero-shot and supervised settings. We also construct a new challenging
+benchmark for semantic correspondence built from an existing animal pose
+estimation dataset, for both pre-training and validating models. Our method
+achieves a PCK@0.10 score of 65.4 (zero-shot) and 85.6 (supervised) on the
+challenging SPair-71k dataset, outperforming the state of the art by absolute
+gains of 5.5 and 11.0 points, respectively. Our code and datasets are publicly
+available at: https://telling-left-from-right.github.io/.
+
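+ For completeness, the PCK@alpha metric reported above counts a predicted
+keypoint as correct when it falls within alpha times a reference size
+(commonly the larger side of the object bounding box for SPair-71k) of the
+ground truth. A minimal sketch:
+
+```python
+import numpy as np
+
+def pck(pred, gt, ref_size, alpha=0.10):
+    """pred, gt: (N, 2) keypoint coordinates; ref_size: scalar or (N,)
+    reference size. Returns the fraction of keypoints within the threshold."""
+    dist = np.linalg.norm(pred - gt, axis=1)
+    return float(np.mean(dist <= alpha * np.asarray(ref_size)))
+
+pred = np.array([[10.0, 12.0], [40.0, 41.0], [90.0, 10.0]])
+gt = np.array([[11.0, 12.0], [50.0, 45.0], [91.0, 12.0]])
+print(pck(pred, gt, ref_size=100.0))  # 2 of 3 within 10 px -> 0.666...
+```
+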
+
+ comment: Accepted by CVPR 24, project page: + https://telling-left-from-right.github.io/ +
+
+
+
+
+ + ♻ ☆ VURF: A General-purpose Reasoning and Self-refinement Framework for + Video Understanding + + +
+ Recent studies have demonstrated the effectiveness of Large Language Models
+(LLMs) as reasoning modules that can deconstruct complex tasks into more
+manageable sub-tasks, particularly when applied to visual reasoning tasks for
+images. In contrast, this paper introduces a Video Understanding and Reasoning
+Framework (VURF) based on the reasoning power of LLMs. Ours is a novel approach
+to extend the utility of LLMs in the context of video tasks, leveraging their
+capacity to generalize from minimal input and output demonstrations within a
+contextual framework. By presenting LLMs with pairs of instructions and their
+corresponding high-level programs, we harness their contextual learning
+capabilities to generate executable visual programs for video understanding. To
+enhance the programs' accuracy and robustness, we implement two important
+strategies. Firstly, we employ a feedback-generation approach, powered by
+GPT-3.5, to rectify errors in programs utilizing unsupported functions.
+Secondly, taking motivation from recent works on self-refinement of LLM
+outputs, we introduce an iterative procedure for improving the quality of the
+in-context examples by aligning the initial outputs to the outputs that would
+have been generated had the LLM not been bound by the structure of the
+in-context examples. Our results on several video-specific tasks, including
+visual QA, video anticipation, pose estimation and multi-video QA, illustrate
+the efficacy of these enhancements in improving the performance of visual
+programming approaches for video tasks.
+
+
+
+
+
+ + ♻ ☆ URS-NeRF: Unordered Rolling Shutter Bundle Adjustment for Neural + Radiance Fields + + +
+ We propose a novel rolling shutter bundle adjustment method for neural
+radiance fields (NeRF), which utilizes unordered rolling shutter (RS) images to
+obtain the implicit 3D representation. Existing NeRF methods suffer from
+low-quality images and inaccurate initial camera poses due to the RS effect in
+the image, whereas the previous method that incorporates the RS into NeRF
+requires strict sequential data input, limiting its widespread applicability.
+In contrast, our method recovers the physical formation of RS images by
+estimating camera poses and velocities, thereby removing the input constraints
+on sequential data. Moreover, we adopt a coarse-to-fine training strategy, in
+which the RS epipolar constraints of the pairwise frames in the scene graph are
+used to detect the camera poses that fall into local minima. The poses detected
+as outliers are corrected by interpolation with neighboring poses. The
+experimental results validate the effectiveness of our method over
+state-of-the-art works and demonstrate that the reconstruction of 3D
+representations is not constrained by the requirement of video sequence input.
+
+
+
+
+
+ + ♻ ☆ Improving White-box Robustness of Pre-processing Defenses via Joint + Adversarial Training + + +
+ Deep neural networks (DNNs) are vulnerable to adversarial noise. A range of
+adversarial defense techniques have been proposed to mitigate the interference
+of adversarial noise, among which the input pre-processing methods are scalable
+and show great potential to safeguard DNNs. However, pre-processing methods may
+suffer from the robustness degradation effect, in which the defense reduces
+rather than improves the adversarial robustness of a target model in a
+white-box setting. A potential cause of this negative effect is that
+adversarial training examples are static and independent of the pre-processing
+model. To solve this problem, we investigate the influence of full adversarial
+examples, which are crafted against the full model, and find that they indeed
+have a positive impact on the robustness of defenses. Furthermore, we find that
+simply changing the adversarial training examples in pre-processing methods
+does not completely alleviate the robustness degradation effect. This is due to
+the adversarial risk of the pre-processed model being neglected, which is
+another cause of the robustness degradation effect. Motivated by the above
+analyses, we propose a method called Joint Adversarial Training based
+Pre-processing (JATP) defense. Specifically, we formulate a feature similarity
+based adversarial risk for the pre-processing model by using full adversarial
+examples found in a feature space. Unlike standard adversarial training, we
+only update the pre-processing model, which prompts us to introduce a
+pixel-wise loss to improve its cross-model transferability. We then conduct
+joint adversarial training on the pre-processing model to minimize this overall
+risk. Empirical results show that our method effectively mitigates the
+robustness degradation effect across different target models in comparison to
+previous state-of-the-art approaches.
+
+
+
+
+
+ + ♻ ☆ Masked Vector Quantization + + +
+ Generative models with discrete latent representations have recently
+demonstrated an impressive ability to learn complex high-dimensional data
+distributions. However, their performance relies on a long sequence of tokens
+per instance and a large number of codebook entries, resulting in long sampling
+times and considerable computation to fit the categorical posterior. To address
+these issues, we propose the Masked Vector Quantization (MVQ) framework, which
+increases the representational capacity of each code vector by learning mask
+configurations via a stochastic winner-takes-all training regime called
+Multiple Hypothesis Dropout (MH-Dropout). On ImageNet 64$\times$64, MVQ reduces
+FID in existing vector quantization architectures by up to $68\%$ at 2 tokens
+per instance and $57\%$ at 5 tokens. These improvements widen as the number of
+codebook entries is reduced and allow for a $7\textit{--}45\times$ speed-up in
+token sampling during inference. As an additional benefit, we find that smaller
+latent spaces lead to MVQ identifying transferable visual representations,
+multiple of which can be smoothly combined.
+
+
+ comment: A newer version of this manuscript was archived under 2312.11735 +
+
+
+
+
+ + ♻ ☆ LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal + Models + + +
+ Large Multimodal Models (LMMs) have shown significant reasoning capabilities +by connecting a visual encoder and a large language model. LMMs typically use a +fixed amount of visual tokens, such as the penultimate layer features in the +CLIP visual encoder, as the prefix content. Recent LMMs incorporate more +complex visual inputs, such as high-resolution images and videos, which +increase the number of visual tokens significantly. However, due to the design +of the Transformer architecture, computational costs associated with these +models tend to increase quadratically with the number of input tokens. To +tackle this problem, we explore a token reduction mechanism and find, similar +to prior work, that many visual tokens are spatially redundant. Based on this, +we propose PruMerge, a novel adaptive visual token reduction approach, which +largely reduces the number of visual tokens while maintaining comparable model +performance. We first select the unpruned visual tokens based on their +similarity to class tokens and spatial tokens. We then cluster the pruned +tokens based on key similarity and merge the clustered tokens with the unpruned +tokens to supplement their information. Empirically, when applied to LLaVA-1.5, +our approach can compress the visual tokens by 18 times on average, and achieve +comparable performance across diverse visual question-answering and reasoning +tasks. Code and checkpoints are at https://llava-prumerge.github.io/. + +
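+ A simplified sketch of the prune-then-merge idea follows: rank visual tokens
+by an importance score, keep the top fraction, assign each pruned token to its
+most similar kept token, and average it in. The scoring rule (cosine similarity
+to the class token) and fixed keep ratio are simplifications of the paper's
+adaptive selection, not its exact procedure.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def prune_and_merge(tokens, cls_token, keep_ratio=0.25):
+    """tokens: (N, D) visual tokens, cls_token: (D,). Keeps the tokens most
+    aligned with the class token and merges each pruned token into its
+    nearest kept token."""
+    n, _ = tokens.shape
+    k = max(1, int(n * keep_ratio))
+    score = F.cosine_similarity(tokens, cls_token.unsqueeze(0), dim=1)
+    keep_idx = score.topk(k).indices
+    keep_mask = torch.zeros(n, dtype=torch.bool)
+    keep_mask[keep_idx] = True
+    kept, pruned = tokens[keep_idx], tokens[~keep_mask]
+    if pruned.numel() == 0:
+        return kept
+    # Assign each pruned token to its most similar kept token and average.
+    assign = F.cosine_similarity(pruned.unsqueeze(1), kept.unsqueeze(0),
+                                 dim=2).argmax(dim=1)
+    merged = kept.clone()
+    for j in range(k):
+        members = pruned[assign == j]
+        if members.numel() > 0:
+            merged[j] = (kept[j] + members.sum(dim=0)) / (1 + members.shape[0])
+    return merged
+
+tokens = torch.randn(576, 1024)   # e.g. a 24x24 grid of ViT patch tokens
+cls_tok = torch.randn(1024)
+print(prune_and_merge(tokens, cls_tok).shape)  # torch.Size([144, 1024])
+```
+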
+
+ comment: Project page: https://llava-prumerge.github.io/ +
+
+
+
+
+ + ♻ ☆ Point-DETR3D: Leveraging Imagery Data with Spatial Point Prior for + Weakly Semi-supervised 3D Object Detection AAAI2024 + + +
+ Training high-accuracy 3D detectors necessitates massive labeled 3D
+annotations with 7 degrees of freedom, which is laborious and time-consuming.
+Therefore, the form of point annotations is proposed to offer significant
+prospects for practical applications in 3D detection, which is not only more
+accessible and less expensive but also provides strong spatial information for
+object localization. In this paper, we empirically discover that it is
+non-trivial to merely adapt Point-DETR to its 3D form, encountering two main
+bottlenecks: 1) it fails to encode strong 3D prior into the model, and 2) it
+generates low-quality pseudo labels in distant regions due to the extreme
+sparsity of LiDAR points. To overcome these challenges, we introduce
+Point-DETR3D, a teacher-student framework for weakly semi-supervised 3D
+detection, designed to fully capitalize on point-wise supervision within a
+constrained instance-wise annotation budget. Different from Point-DETR, which
+encodes 3D positional information solely through a point encoder, we propose an
+explicit positional query initialization strategy to enhance the positional
+prior. Considering the low quality of pseudo labels at distant regions produced
+by the teacher model, we enhance the detector's perception by incorporating
+dense imagery data through a novel Cross-Modal Deformable RoI Fusion (D-RoI).
+Moreover, an innovative point-guided self-supervised learning technique is
+proposed to allow for fully exploiting point priors, even in student models.
+Extensive experiments on the representative nuScenes dataset demonstrate that
+our Point-DETR3D obtains significant improvements compared to previous works.
+Notably, with only 5% of labeled data, Point-DETR3D achieves over 90%
+performance of its fully supervised counterpart.
+
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ Cell Tracking according to Biological Needs -- Strong Mitosis-aware + Random-finite Sets Tracker with Aleatoric Uncertainty + + +
+ Cell tracking and segmentation assist biologists in extracting insights from +large-scale microscopy time-lapse data. Driven by local accuracy metrics, +current tracking approaches often suffer from a lack of long-term consistency. +To address this issue, we introduce an uncertainty estimation technique for +neural tracking-by-regression frameworks and incorporate it into our novel +extended Poisson multi-Bernoulli mixture tracker. Our uncertainty estimation +identifies uncertain associations within high-performing tracking-by-regression +methods using problem-specific test-time augmentations. Leveraging this +uncertainty, along with a novel mitosis-aware assignment problem formulation, +our tracker resolves false associations and mitosis detections stemming from +long-term conflicts. We evaluate our approach on nine competitive datasets and +demonstrate that it outperforms the current state-of-the-art on biologically +relevant metrics substantially, achieving improvements by a factor of +approximately $5.75$. Furthermore, we uncover new insights into the behavior of +tracking-by-regression uncertainty. + +
+
+ comment: 23 pages, 10 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ MEDPSeg: Hierarchical polymorphic multitask learning for the + segmentation of ground-glass opacities, consolidation, and pulmonary + structures on computed tomography + + +
+ The COVID-19 pandemic response highlighted the potential of deep learning
+methods in facilitating the diagnosis, prognosis and understanding of lung
+diseases through automated segmentation of pulmonary structures and lesions in
+chest computed tomography (CT). Automated separation of lung lesions into
+ground-glass opacity (GGO) and consolidation is hindered by the
+labor-intensive and subjective nature of this task, resulting in scarce
+availability of ground truth for supervised learning. To tackle this problem,
+we propose MEDPSeg. MEDPSeg learns from heterogeneous chest CT targets through
+hierarchical polymorphic multitask learning (HPML). HPML explores the
+hierarchical nature of GGO and consolidation, lung lesions, and the lungs, with
+further benefits achieved through multitasking airway and pulmonary artery
+segmentation. Over 6000 volumetric CT scans from different partially labeled
+sources were used for training and testing. Experiments show HPML enabling new
+state-of-the-art performance for GGO and consolidation segmentation tasks. In
+addition, MEDPSeg simultaneously performs segmentation of the lung parenchyma,
+airways, pulmonary artery, and lung lesions, all in a single forward
+prediction, with performance comparable to state-of-the-art methods specialized
+in each of those targets. Finally, we provide an open-source implementation
+with a graphical user interface at https://github.com/MICLab-Unicamp/medpseg.
+
+
+ comment: This manuscript is under review and might change in the future +
+
+
+
+
+ + ♻ ☆ SCHEME: Scalable Channel Mixer for Vision Transformers + + +
+ Vision Transformers have received significant attention due to their
+impressive performance in many vision tasks. While the token mixer or attention
+block has been studied in great detail, the channel mixer or feature mixing
+block (FFN or MLP) has not been explored in depth, even though it accounts for
+the bulk of the parameters and computation in a model. In this work, we study
+whether sparse feature mixing can replace the dense connections and confirm this
+with a block diagonal MLP structure that improves the accuracy by supporting
+larger expansion ratios. To improve the feature clusters formed by this
+structure and thereby further improve the accuracy, a lightweight,
+parameter-free, channel covariance attention (CCA) mechanism is introduced as a
+parallel branch during training. This design of CCA enables gradual feature
+mixing across channel groups during training, with a contribution that decays to
+zero as training progresses to convergence. This allows the CCA block to be
+discarded during inference, thus enabling enhanced performance with no
+additional computational cost. The resulting $\textit{Scalable CHannEl MixEr}$
+(SCHEME) can be plugged into any ViT architecture to obtain a gamut of models
+with different trade-offs between complexity and performance by controlling the
+block diagonal structure size in the MLP. This is demonstrated by a new family
+of SCHEMEformer models that establish new Pareto frontiers for accuracy vs
+FLOPS, accuracy vs model size, and accuracy vs throughput, especially for fast
+transformers of small model size. For example, the SCHEMEformer establishes a
+new SOTA of 79.7% accuracy for ViTs using pure attention mixers on ImageNet-1K
+at 1.77G FLOPs.
+
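A minimal sketch of the block-diagonal channel mixer described above, assuming the block-diagonal FFN weights can be realized with grouped 1x1 convolutions; the parallel CCA branch used during training is omitted, and the class and argument names are illustrative, not the official SCHEME code.

```python
import torch
import torch.nn as nn

class BlockDiagonalMLP(nn.Module):
    """Block-diagonal channel mixer (illustrative sketch).

    Splitting the FFN weight matrices into `groups` independent blocks cuts
    parameters/FLOPs by roughly 1/groups, which is what makes larger
    expansion ratios affordable.
    """
    def __init__(self, dim, expansion=8, groups=4):
        super().__init__()
        hidden = dim * expansion
        assert dim % groups == 0 and hidden % groups == 0
        # Grouped 1x1 convolutions realize a block-diagonal weight matrix.
        self.fc1 = nn.Conv1d(dim, hidden, kernel_size=1, groups=groups)
        self.act = nn.GELU()
        self.fc2 = nn.Conv1d(hidden, dim, kernel_size=1, groups=groups)

    def forward(self, x):            # x: (B, N, C) token sequence
        x = x.transpose(1, 2)        # (B, C, N) for Conv1d
        x = self.fc2(self.act(self.fc1(x)))
        return x.transpose(1, 2)

tokens = torch.randn(2, 196, 256)
print(BlockDiagonalMLP(256)(tokens).shape)   # torch.Size([2, 196, 256])
```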
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Context-Aware Meta-Learning ICLR 2024 + + +
+ Large Language Models like ChatGPT demonstrate a remarkable capacity to learn +new concepts during inference without any fine-tuning. However, visual models +trained to detect new objects during inference have been unable to replicate +this ability, and instead either perform poorly or require meta-training and/or +fine-tuning on similar objects. In this work, we propose a meta-learning +algorithm that emulates Large Language Models by learning new visual concepts +during inference without fine-tuning. Our approach leverages a frozen +pre-trained feature extractor, and analogous to in-context learning, recasts +visual meta-learning as sequence modeling over datapoints with known labels and +a test datapoint with an unknown label. On 8 out of 11 meta-learning +benchmarks, our approach -- without meta-training or fine-tuning -- exceeds or +matches the state-of-the-art algorithm, P>M>F, which is meta-trained on these +benchmarks. Our code is available at https://github.com/cfifty/CAML. + +
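The sequence-modeling recipe described above can be sketched roughly as follows: frozen image features are summed with label embeddings, the query receives a special "unknown" label embedding, and a Transformer encoder predicts the query's class. Class and argument names here are my own assumptions for illustration, not the CAML release.

```python
import torch
import torch.nn as nn

class InContextClassifier(nn.Module):
    """Few-shot classification recast as sequence modeling (illustrative sketch)."""
    def __init__(self, feat_dim=512, n_classes=5, n_layers=4, n_heads=8):
        super().__init__()
        self.label_emb = nn.Embedding(n_classes + 1, feat_dim)  # last index = "unknown"
        layer = nn.TransformerEncoderLayer(feat_dim, n_heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, n_layers)
        self.head = nn.Linear(feat_dim, n_classes)
        self.unknown = n_classes

    def forward(self, support_feats, support_labels, query_feat):
        # support_feats: (B, S, D) frozen features; support_labels: (B, S); query_feat: (B, D)
        B = query_feat.size(0)
        query_label = torch.full((B, 1), self.unknown, device=query_feat.device)
        feats = torch.cat([support_feats, query_feat.unsqueeze(1)], dim=1)
        labels = torch.cat([support_labels, query_label], dim=1)
        seq = feats + self.label_emb(labels)        # datapoint = feature + label embedding
        out = self.encoder(seq)
        return self.head(out[:, -1])                # predict the query's label

model = InContextClassifier()
logits = model(torch.randn(2, 25, 512), torch.randint(0, 5, (2, 25)), torch.randn(2, 512))
print(logits.shape)  # torch.Size([2, 5])
```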
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ MetaSegNet: Metadata-collaborative Vision-Language Representation + Learning for Semantic Segmentation of Remote Sensing Images + + +
+ Semantic segmentation of remote sensing images plays a vital role in a wide +range of Earth Observation (EO) applications, such as land use land cover +mapping, environment monitoring, and sustainable development. Driven by rapid +developments in Artificial Intelligence (AI), deep learning (DL) has emerged as +the mainstream tool for semantic segmentation and has achieved many +breakthroughs in the field of remote sensing. However, the existing DL-based +methods mainly focus on unimodal visual data while ignoring the rich multimodal +information involved in the real world, usually demonstrating weak reliability +and generlization. Inspired by the success of Vision Transformers and large +language models, we propose a novel metadata-collaborative multimodal +segmentation network (MetaSegNet) that applies vision-language representation +learning for semantic segmentation of remote sensing images. Unlike the common +model structure that only uses unimodal visual data, we extract the key +characteristic (e.g. the climate zone) from freely available remote sensing +image metadata and transfer it into knowledge-based text prompts via the +generic ChatGPT. Then, we construct an image encoder, a text encoder and a +cross-modal attention fusion subnetwork to extract the image and text feature +and apply image-text interaction. Benefiting from such a design, the proposed +MetaSegNet demonstrates superior generalization and achieves competitive +accuracy with the state-of-the-art semantic segmentation methods on the +large-scale OpenEarthMap dataset (68.6% mIoU) and Potsdam dataset (93.3% mean +F1 score) as well as LoveDA dataset (52.2% mIoU). + +
+
+
+
+
+ + ♻ ☆ Fix-Con: Automatic Fault Localization and Repair of Deep Learning Model + Conversions between Frameworks + + +
+ Converting deep learning models between frameworks is a common step to +maximize model compatibility across devices and leverage optimization features +that may be exclusively provided in one deep learning framework. However, this +conversion process may be riddled with bugs, making the converted models either +undeployable or problematic, considerably degrading their prediction +correctness. + In this paper we propose an automated approach for fault localization and +repair, Fix-Con, during model conversion between deep learning frameworks. +Fix-Con is capable of detecting and fixing faults introduced in model input, +parameters, hyperparameters, and the model graph during conversion. + Fix-Con uses a set of fault types (mined from surveying conversion issues +reported \nick{in code repositories and forums}) to localize potential +conversion faults in the converted target model and then repair them +appropriately, e.g., replacing the parameters of the target model with those +from the source model. This is done iteratively for every image in the dataset, +comparing output label differences between the source model and the converted +target model until all differences are resolved. We evaluate the effectiveness +of Fix-Con in fixing model conversion bugs of three widely used image +recognition models converted across four different deep learning frameworks. +Overall, Fix-Con was able to fix $462$ out of $755$ detected conversion faults, +either completely repairing or significantly improving the performance of $14$ +out of the $15$ erroneous conversion cases. + +
+
+ comment: 12 pages, 4 figures, 3 tables, 1 algorithm +
+
+
+
+
+ + ♻ ☆ DISN: Deep Implicit Surface Network for High-quality Single-view 3D + Reconstruction + + +
+ Reconstructing 3D shapes from single-view images has been a long-standing
+research problem. In this paper, we present DISN, a Deep Implicit Surface
+Network which can generate a high-quality detail-rich 3D mesh from a 2D image
+by predicting the underlying signed distance fields. In addition to utilizing
+global image features, DISN predicts the projected location for each 3D point
+on the 2D image, and extracts local features from the image feature maps.
+Combining global and local features significantly improves the accuracy of the
+signed distance field prediction, especially for the detail-rich areas. To the
+best of our knowledge, DISN is the first method that consistently captures
+details such as holes and thin structures present in 3D shapes from single-view
+images. DISN achieves the state-of-the-art single-view reconstruction
+performance on a variety of shape categories reconstructed from both synthetic
+and real images. Code is available at https://github.com/xharlie/DISN and the
+supplementary material can be found at
+https://xharlie.github.io/images/neurips_2019_supp.pdf
+
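The local-feature step described above (projecting each 3D point into the image and sampling the feature map at that location) can be sketched as below; the camera convention and interfaces are assumptions for illustration, not the released DISN code.

```python
import torch
import torch.nn.functional as F

def sample_local_features(points, feat_map, K):
    """Project camera-space 3D points with intrinsics K and bilinearly sample
    the 2D feature map at the projected pixels (illustrative sketch).

    points:   (B, N, 3) 3D points in camera coordinates
    feat_map: (B, C, H, W) image feature map
    K:        (B, 3, 3) camera intrinsics
    """
    H, W = feat_map.shape[-2:]
    uvw = torch.matmul(points, K.transpose(1, 2))         # (B, N, 3)
    uv = uvw[..., :2] / uvw[..., 2:].clamp_min(1e-6)      # pixel coordinates
    # Normalize to [-1, 1] for grid_sample (x = column, y = row).
    grid = torch.stack([uv[..., 0] / (W - 1) * 2 - 1,
                        uv[..., 1] / (H - 1) * 2 - 1], dim=-1)              # (B, N, 2)
    local = F.grid_sample(feat_map, grid.unsqueeze(2), align_corners=True)  # (B, C, N, 1)
    return local.squeeze(-1).transpose(1, 2)              # (B, N, C) per-point local features

pts = torch.rand(2, 1024, 3) - 0.5
pts[..., 2] += 2.0                                        # keep points in front of the camera
K = torch.tensor([[60., 0., 32.], [0., 60., 32.], [0., 0., 1.]]).expand(2, 3, 3)
feats = torch.randn(2, 64, 64, 64)
print(sample_local_features(pts, feats, K).shape)         # torch.Size([2, 1024, 64])
```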
+
+ comment: This project was in part supported by the gift funding to the + University of Southern California from Adobe Research +
+
+
+
+
+ + ♻ ☆ Fault Localization for Buggy Deep Learning Framework Conversions in + Image Recognition + + +
+ When deploying Deep Neural Networks (DNNs), developers often convert models +from one deep learning framework to another (e.g., TensorFlow to PyTorch). +However, this process is error-prone and can impact target model accuracy. To +identify the extent of such impact, we perform and briefly present a +differential analysis against three DNNs widely used for image recognition +(MobileNetV2, ResNet101, and InceptionV3) converted across four well-known deep +learning frameworks (PyTorch, Keras, TensorFlow (TF), and TFLite), which +revealed numerous model crashes and output label discrepancies of up to 100%. +To mitigate such errors, we present a novel approach towards fault localization +and repair of buggy deep learning framework conversions, focusing on +pre-trained image recognition models. Our technique consists of four stages of +analysis: 1) conversion tools, 2) model parameters, 3) model hyperparameters, +and 4) graph representation. In addition, we propose various strategies towards +fault repair of the faults detected. We implement our technique on top of the +Apache TVM deep learning compiler, and we test it by conducting a preliminary +fault localization analysis for the conversion of InceptionV3 from TF to +TFLite. Our approach detected a fault in a common DNN converter tool, which +introduced precision errors in weights, reducing model accuracy. After our +fault localization, we repaired the issue, reducing our conversion error to +zero. + +
+
+ comment: 5 pages, 3 figures, 1 table +
+
+
+
+
+ + ♻ ☆ DeltaNN: Assessing the Impact of Computational Environment Parameters on + the Performance of Image Recognition Models + + +
+ Image recognition tasks typically use deep learning and require enormous +processing power, thus relying on hardware accelerators like GPUs and TPUs for +fast, timely processing. Failure in real-time image recognition tasks can occur +due to sub-optimal mapping on hardware accelerators during model deployment, +which may lead to timing uncertainty and erroneous behavior. Mapping on +hardware accelerators is done using multiple software components like deep +learning frameworks, compilers, and device libraries, that we refer to as the +computational environment. Owing to the increased use of image recognition +tasks in safety-critical applications like autonomous driving and medical +imaging, it is imperative to assess their robustness to changes in the +computational environment, as the impact of parameters like deep learning +frameworks, compiler optimizations, and hardware devices on model performance +and correctness is not yet well understood. + In this paper we present a differential testing framework, DeltaNN, that +allows us to assess the impact of different computational environment +parameters on the performance of image recognition models during deployment, +post training. DeltaNN generates different implementations of a given image +recognition model for variations in environment parameters, namely, deep +learning frameworks, compiler optimizations and hardware devices and analyzes +differences in model performance as a result. Using DeltaNN, we conduct an +empirical study of robustness analysis of three popular image recognition +models using the ImageNet dataset. We report the impact in terms of +misclassifications and inference time differences across different settings. In +total, we observed up to 100% output label differences across deep learning +frameworks, and up to 81% unexpected performance degradation in terms of +inference time, when applying compiler optimizations. + +
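A minimal differential-testing loop in the spirit of the label-discrepancy analysis described above (a sketch, not DeltaNN itself): run two builds of nominally the same model on identical inputs and count label disagreements. The second model here simply copies the first model's weights as a stand-in for a faithfully converted build; a recent torchvision is assumed.

```python
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, models, transforms

def label_discrepancy(model_a, model_b, loader):
    """Percentage of inputs on which two builds of the same model disagree
    about the predicted label (illustrative sketch)."""
    model_a.eval(); model_b.eval()
    mismatches, total = 0, 0
    with torch.no_grad():
        for images, _ in loader:
            pred_a = model_a(images).argmax(dim=1)
            pred_b = model_b(images).argmax(dim=1)
            mismatches += (pred_a != pred_b).sum().item()
            total += images.size(0)
    return 100.0 * mismatches / max(total, 1)

# Toy usage: the second model stands in for a converted/recompiled build.
m1 = models.mobilenet_v2(weights=None)
m2 = models.mobilenet_v2(weights=None)
m2.load_state_dict(m1.state_dict())      # identical weights, i.e. a faithful "conversion"
data = datasets.FakeData(size=32, transform=transforms.ToTensor())
print(f"{label_discrepancy(m1, m2, DataLoader(data, batch_size=8)):.1f}% label differences")
```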
+
+ comment: 11 pages, 10 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Few-Shot Class Incremental Learning with Attention-Aware Self-Adaptive + Prompt + + +
+ Few-Shot Class-Incremental Learning (FSCIL) models aim to incrementally learn +new classes with scarce samples while preserving knowledge of old ones. +Existing FSCIL methods usually fine-tune the entire backbone, leading to +overfitting and hindering the potential to learn new classes. On the other +hand, recent prompt-based CIL approaches alleviate forgetting by training +prompts with sufficient data in each task. In this work, we propose a novel +framework named Attention-aware Self-adaptive Prompt (ASP). ASP encourages +task-invariant prompts to capture shared knowledge by reducing specific +information from the attention aspect. Additionally, self-adaptive +task-specific prompts in ASP provide specific information and transfer +knowledge from old classes to new classes with an Information Bottleneck +learning objective. In summary, ASP prevents overfitting on base task and does +not require enormous data in few-shot incremental tasks. Extensive experiments +on three benchmark datasets validate that ASP consistently outperforms +state-of-the-art FSCIL and prompt-based CIL methods in terms of both learning +new classes and mitigating forgetting. + +
+
+
+
+
+ + ♻ ☆ Pre-Trained Masked Image Model for Mobile Robot Navigation ICRA 2024 + + +
+ 2D top-down maps are commonly used for the navigation and exploration of +mobile robots through unknown areas. Typically, the robot builds the navigation +maps incrementally from local observations using onboard sensors. Recent works +have shown that predicting the structural patterns in the environment through +learning-based approaches can greatly enhance task efficiency. While many such +works build task-specific networks using limited datasets, we show that the +existing foundational vision networks can accomplish the same without any +fine-tuning. Specifically, we use Masked Autoencoders, pre-trained on street +images, to present novel applications for field-of-view expansion, single-agent +topological exploration, and multi-agent exploration for indoor mapping, across +different input modalities. Our work motivates the use of foundational vision +models for generalized structure prediction-driven applications, especially in +the dearth of training data. For more qualitative results see +https://raaslab.org/projects/MIM4Robots. + +
+
+ comment: Accepted at ICRA 2024 +
+
+
+
+
+ + ♻ ☆ LEOD: Label-Efficient Object Detection for Event Cameras CVPR 2024 + + +
+ Object detection with event cameras benefits from the sensor's low latency +and high dynamic range. However, it is costly to fully label event streams for +supervised training due to their high temporal resolution. To reduce this cost, +we present LEOD, the first method for label-efficient event-based detection. +Our approach unifies weakly- and semi-supervised object detection with a +self-training mechanism. We first utilize a detector pre-trained on limited +labels to produce pseudo ground truth on unlabeled events. Then, the detector +is re-trained with both real and generated labels. Leveraging the temporal +consistency of events, we run bi-directional inference and apply tracking-based +post-processing to enhance the quality of pseudo labels. To stabilize training +against label noise, we further design a soft anchor assignment strategy. We +introduce new experimental protocols to evaluate the task of label-efficient +event-based detection on Gen1 and 1Mpx datasets. LEOD consistently outperforms +supervised baselines across various labeling ratios. For example, on Gen1, it +improves mAP by 8.6% and 7.8% for RVT-S trained with 1% and 2% labels. On 1Mpx, +RVT-S with 10% labels even surpasses its fully-supervised counterpart using +100% labels. LEOD maintains its effectiveness even when all labeled data are +available, reaching new state-of-the-art results. Finally, we show that our +method readily scales to improve larger detectors as well. Code is released at +https://github.com/Wuziyi616/LEOD + +
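The self-training step described above hinges on turning confident teacher detections into pseudo labels; a toy sketch of such a confidence-based filter is shown below. The threshold and dictionary keys are illustrative, not LEOD's actual interface, and the tracking-based post-processing and soft anchor assignment are omitted.

```python
def filter_pseudo_labels(detections, score_thr=0.6, keep_low_as_ignore=True):
    """Split teacher detections into confident pseudo labels and ignore regions
    (illustrative sketch)."""
    pseudo, ignore = [], []
    for det in detections:                 # det = {"bbox": [...], "score": float, "label": int}
        if det["score"] >= score_thr:
            pseudo.append(det)             # confident: train on it as a hard label
        elif keep_low_as_ignore:
            ignore.append(det)             # uncertain: mark as ignore instead of negative
    return pseudo, ignore

dets = [{"bbox": [10, 20, 50, 80], "score": 0.91, "label": 0},
        {"bbox": [5, 5, 30, 40], "score": 0.35, "label": 1}]
print(filter_pseudo_labels(dets))
```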
+
+ comment: CVPR 2024. Code: https://github.com/Wuziyi616/LEOD +
+
+
+
+
+ + ♻ ☆ ContextSeg: Sketch Semantic Segmentation by Querying the Context with + Attention + + +
+ Sketch semantic segmentation is a well-explored and pivotal problem in +computer vision involving the assignment of pre-defined part labels to +individual strokes. This paper presents ContextSeg - a simple yet highly +effective approach to tackling this problem with two stages. In the first +stage, to better encode the shape and positional information of strokes, we +propose to predict an extra dense distance field in an autoencoder network to +reinforce structural information learning. In the second stage, we treat an +entire stroke as a single entity and label a group of strokes within the same +semantic part using an auto-regressive Transformer with the default attention +mechanism. By group-based labeling, our method can fully leverage the context +information when making decisions for the remaining groups of strokes. Our +method achieves the best segmentation accuracy compared with state-of-the-art +approaches on two representative datasets and has been extensively evaluated +demonstrating its superior performance. Additionally, we offer insights into +solving part imbalance in training data and the preliminary experiment on +cross-category training, which can inspire future research in this field. + +
+
+
+
+
+ + ♻ ☆ Fusing Domain-Specific Content from Large Language Models into Knowledge + Graphs for Enhanced Zero Shot Object State Classification AAAI + + +
+ Domain-specific knowledge can significantly contribute to addressing a wide +variety of vision tasks. However, the generation of such knowledge entails +considerable human labor and time costs. This study investigates the potential +of Large Language Models (LLMs) in generating and providing domain-specific +information through semantic embeddings. To achieve this, an LLM is integrated +into a pipeline that utilizes Knowledge Graphs and pre-trained semantic vectors +in the context of the Vision-based Zero-shot Object State Classification task. +We thoroughly examine the behavior of the LLM through an extensive ablation +study. Our findings reveal that the integration of LLM-based embeddings, in +combination with general-purpose pre-trained embeddings, leads to substantial +performance improvements. Drawing insights from this ablation study, we conduct +a comparative analysis against competing models, thereby highlighting the +state-of-the-art performance achieved by the proposed approach. + +
+
+ comment: Accepted at the AAAI-MAKE 24 +
+
+
+
+
+ + ♻ ☆ A Closer Look at the Few-Shot Adaptation of Large Vision-Language Models CVPR 2024 + + +
+ Efficient transfer learning (ETL) is receiving increasing attention to adapt
+large pre-trained language-vision models to downstream tasks with a few labeled
+samples. While significant progress has been made, we reveal that
+state-of-the-art ETL approaches exhibit strong performance only in
+narrowly-defined experimental setups, and with a careful adjustment of
+hyperparameters based on a large corpus of labeled samples. In particular, we
+make two interesting and surprising empirical observations. First, to
+outperform a simple Linear Probing baseline, these methods need to optimize
+their hyperparameters on each target task. Second, they typically
+underperform -- sometimes dramatically -- standard zero-shot predictions in the
+presence of distributional drifts. Motivated by the unrealistic assumptions
+made in the existing literature, i.e., access to a large validation set and
+case-specific grid-search for optimal hyperparameters, we propose a novel
+approach that meets the requirements of real-world scenarios. More concretely,
+we introduce a CLass-Adaptive linear Probe (CLAP) objective, whose balancing
+term is optimized via an adaptation of the general Augmented Lagrangian method
+tailored to this context. We comprehensively evaluate CLAP on a broad span of
+datasets and scenarios, demonstrating that it consistently outperforms SoTA
+approaches, while being a much more efficient alternative.
+
+
+ comment: CVPR 2024. Code: https://github.com/jusiro/CLAP +
+
+
+
+
+ + ♻ ☆ Testing MediaPipe Holistic for Linguistic Analysis of Nonmanual Markers + in Sign Languages + + +
+ Advances in Deep Learning have made possible reliable landmark tracking of +human bodies and faces that can be used for a variety of tasks. We test a +recent Computer Vision solution, MediaPipe Holistic (MPH), to find out if its +tracking of the facial features is reliable enough for a linguistic analysis of +data from sign languages, and compare it to an older solution (OpenFace, OF). +We use an existing data set of sentences in Kazakh-Russian Sign Language and a +newly created small data set of videos with head tilts and eyebrow movements. +We find that MPH does not perform well enough for linguistic analysis of +eyebrow movement - but in a different way from OF, which is also performing +poorly without correction. We reiterate a previous proposal to train additional +correction models to overcome these limitations. + +
+
+
+
+
+ + ♻ ☆ CADTalk: An Algorithm and Benchmark for Semantic Commenting of CAD + Programs + + +
+ CAD programs are a popular way to compactly encode shapes as a sequence of +operations that are easy to parametrically modify. However, without sufficient +semantic comments and structure, such programs can be challenging to +understand, let alone modify. We introduce the problem of semantic commenting +CAD programs, wherein the goal is to segment the input program into code blocks +corresponding to semantically meaningful shape parts and assign a semantic +label to each block. We solve the problem by combining program parsing with +visual-semantic analysis afforded by recent advances in foundational language +and vision models. Specifically, by executing the input programs, we create +shapes, which we use to generate conditional photorealistic images to make use +of semantic annotators for such images. We then distill the information across +the images and link back to the original programs to semantically comment on +them. Additionally, we collected and annotated a benchmark dataset, CADTalk, +consisting of 5,288 machine-made programs and 45 human-made programs with +ground truth semantic comments. We extensively evaluated our approach, compared +it to a GPT-based baseline, and an open-set shape segmentation baseline, and +reported an 83.24% accuracy on the new CADTalk dataset. Code and data: +https://enigma-li.github.io/CADTalk/. + +
+
+
+
+
+ + ♻ ☆ TetraSphere: A Neural Descriptor for O(3)-Invariant Point Cloud Analysis CVPR 2024 + + +
+ In many practical applications, 3D point cloud analysis requires rotation +invariance. In this paper, we present a learnable descriptor invariant under 3D +rotations and reflections, i.e., the O(3) actions, utilizing the recently +introduced steerable 3D spherical neurons and vector neurons. Specifically, we +propose an embedding of the 3D spherical neurons into 4D vector neurons, which +leverages end-to-end training of the model. In our approach, we perform +TetraTransform--an equivariant embedding of the 3D input into 4D, constructed +from the steerable neurons--and extract deeper O(3)-equivariant features using +vector neurons. This integration of the TetraTransform into the VN-DGCNN +framework, termed TetraSphere, negligibly increases the number of parameters by +less than 0.0002%. TetraSphere sets a new state-of-the-art performance +classifying randomly rotated real-world object scans of the challenging subsets +of ScanObjectNN. Additionally, TetraSphere outperforms all equivariant methods +on randomly rotated synthetic data: classifying objects from ModelNet40 and +segmenting parts of the ShapeNet shapes. Thus, our results reveal the practical +value of steerable 3D spherical neurons for learning in 3D Euclidean space. The +code is available at https://github.com/pavlo-melnyk/tetrasphere. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Check, Locate, Rectify: A Training-Free Layout Calibration System for + Text-to-Image Generation + + +
+ Diffusion models have recently achieved remarkable progress in generating +realistic images. However, challenges remain in accurately understanding and +synthesizing the layout requirements in the textual prompts. To align the +generated image with layout instructions, we present a training-free layout +calibration system SimM that intervenes in the generative process on the fly +during inference time. Specifically, following a "check-locate-rectify" +pipeline, the system first analyses the prompt to generate the target layout +and compares it with the intermediate outputs to automatically detect errors. +Then, by moving the located activations and making intra- and inter-map +adjustments, the rectification process can be performed with negligible +computational overhead. To evaluate SimM over a range of layout requirements, +we present a benchmark SimMBench that compensates for the lack of superlative +spatial relations in existing datasets. And both quantitative and qualitative +results demonstrate the effectiveness of the proposed SimM in calibrating the +layout inconsistencies. Our project page is at https://simm-t2i.github.io/SimM. + +
+
+
+
+
+ + ♻ ☆ From Pixels to Insights: A Survey on Automatic Chart Understanding in + the Era of Large Foundation Models + + +
+ Data visualization in the form of charts plays a pivotal role in data +analysis, offering critical insights and aiding in informed decision-making. +Automatic chart understanding has witnessed significant advancements with the +rise of large foundation models in recent years. Foundation models, such as +large language models, have revolutionized various natural language processing +tasks and are increasingly being applied to chart understanding tasks. This +survey paper provides a comprehensive overview of the recent developments, +challenges, and future directions in chart understanding within the context of +these foundation models. We review fundamental building blocks crucial for +studying chart understanding tasks. Additionally, we explore various tasks and +their evaluation metrics and sources of both charts and textual inputs. Various +modeling strategies are then examined, encompassing both classification-based +and generation-based approaches, along with tool augmentation techniques that +enhance chart understanding performance. Furthermore, we discuss the +state-of-the-art performance of each task and discuss how we can improve the +performance. Challenges and future directions are addressed, highlighting the +importance of several topics, such as domain-specific charts, lack of efforts +in developing evaluation metrics, and agent-oriented settings. This survey +paper serves as a comprehensive resource for researchers and practitioners in +the fields of natural language processing, computer vision, and data analysis, +providing valuable insights and directions for future research in chart +understanding leveraging large foundation models. The studies mentioned in this +paper, along with emerging new research, will be continually updated at: +https://github.com/khuangaf/Awesome-Chart-Understanding. + +
+
+
+
+
+ + ♻ ☆ Learning Disentangled Identifiers for Action-Customized Text-to-Image + Generation + + +
+ This study focuses on a novel task in text-to-image (T2I) generation, namely +action customization. The objective of this task is to learn the co-existing +action from limited data and generalize it to unseen humans or even animals. +Experimental results show that existing subject-driven customization methods +fail to learn the representative characteristics of actions and struggle in +decoupling actions from context features, including appearance. To overcome the +preference for low-level features and the entanglement of high-level features, +we propose an inversion-based method Action-Disentangled Identifier (ADI) to +learn action-specific identifiers from the exemplar images. ADI first expands +the semantic conditioning space by introducing layer-wise identifier tokens, +thereby increasing the representational richness while distributing the +inversion across different features. Then, to block the inversion of +action-agnostic features, ADI extracts the gradient invariance from the +constructed sample triples and masks the updates of irrelevant channels. To +comprehensively evaluate the task, we present an ActionBench that includes a +variety of actions, each accompanied by meticulously selected samples. Both +quantitative and qualitative results show that our ADI outperforms existing +baselines in action-customized T2I generation. Our project page is at +https://adi-t2i.github.io/ADI. + +
+
+
+
+
+ + ♻ ☆ TACO: Benchmarking Generalizable Bimanual Tool-ACtion-Object + Understanding + + +
+ Humans commonly work with multiple objects in daily life and can intuitively +transfer manipulation skills to novel objects by understanding object +functional regularities. However, existing technical approaches for analyzing +and synthesizing hand-object manipulation are mostly limited to handling a +single hand and object due to the lack of data support. To address this, we +construct TACO, an extensive bimanual hand-object-interaction dataset spanning +a large variety of tool-action-object compositions for daily human activities. +TACO contains 2.5K motion sequences paired with third-person and egocentric +views, precise hand-object 3D meshes, and action labels. To rapidly expand the +data scale, we present a fully automatic data acquisition pipeline combining +multi-view sensing with an optical motion capture system. With the vast +research fields provided by TACO, we benchmark three generalizable +hand-object-interaction tasks: compositional action recognition, generalizable +hand-object motion forecasting, and cooperative grasp synthesis. Extensive +experiments reveal new insights, challenges, and opportunities for advancing +the studies of generalizable hand-object motion analysis and synthesis. Our +data and code are available at https://taco2024.github.io. + +
+
+
+
+
+ + ♻ ☆ Estimating Uncertainty in Landslide Segmentation Models + + +
+ Landslides are a recurring, widespread hazard. Preparation and mitigation +efforts can be aided by a high-quality, large-scale dataset that covers global +at-risk areas. Such a dataset currently does not exist and is impossible to +construct manually. Recent automated efforts focus on deep learning models for +landslide segmentation (pixel labeling) from satellite imagery. However, it is +also important to characterize the uncertainty or confidence levels of such +segmentations. Accurate and robust uncertainty estimates can enable low-cost +(in terms of manual labor) oversight of auto-generated landslide databases to +resolve errors, identify hard negative examples, and increase the size of +labeled training data. In this paper, we evaluate several methods for assessing +pixel-level uncertainty of the segmentation. Three methods that do not require +architectural changes were compared, including Pre-Threshold activations, +Monte-Carlo Dropout and Test-Time Augmentation -- a method that measures the +robustness of predictions in the face of data augmentation. Experimentally, the +quality of the latter method was consistently higher than the others across a +variety of models and metrics in our dataset. + +
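Of the three uncertainty estimates compared above, test-time augmentation is the simplest to sketch: run the model on several flipped copies of the input, undo the flips, and use the disagreement across predictions as per-pixel uncertainty. The flip set and interface below are illustrative assumptions, not the paper's exact setup.

```python
import torch

@torch.no_grad()
def tta_uncertainty(model, image, flips=((), (3,), (2,), (2, 3))):
    """Per-pixel mean probability and disagreement (std) over flip-based
    test-time augmentations for a binary segmentation model (illustrative sketch).

    image: (B, C, H, W); model(image) returns per-pixel logits of shape (B, 1, H, W).
    """
    probs = []
    for dims in flips:
        aug = torch.flip(image, dims) if dims else image
        p = torch.sigmoid(model(aug))
        probs.append(torch.flip(p, dims) if dims else p)   # undo the flip
    probs = torch.stack(probs)                             # (T, B, 1, H, W)
    return probs.mean(dim=0), probs.std(dim=0)             # uncertainty = disagreement

# Toy usage with a stand-in "model".
net = torch.nn.Conv2d(3, 1, kernel_size=3, padding=1)
mean_prob, uncertainty = tta_uncertainty(net, torch.randn(2, 3, 64, 64))
print(mean_prob.shape, uncertainty.shape)                  # both torch.Size([2, 1, 64, 64])
```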
+
+
+
+
+ + ♻ ☆ Point Transformer V3: Simpler, Faster, Stronger CVPR 2024 + + +
+ This paper is not motivated to seek innovation within the attention +mechanism. Instead, it focuses on overcoming the existing trade-offs between +accuracy and efficiency within the context of point cloud processing, +leveraging the power of scale. Drawing inspiration from recent advances in 3D +large-scale representation learning, we recognize that model performance is +more influenced by scale than by intricate design. Therefore, we present Point +Transformer V3 (PTv3), which prioritizes simplicity and efficiency over the +accuracy of certain mechanisms that are minor to the overall performance after +scaling, such as replacing the precise neighbor search by KNN with an efficient +serialized neighbor mapping of point clouds organized with specific patterns. +This principle enables significant scaling, expanding the receptive field from +16 to 1024 points while remaining efficient (a 3x increase in processing speed +and a 10x improvement in memory efficiency compared with its predecessor, +PTv2). PTv3 attains state-of-the-art results on over 20 downstream tasks that +span both indoor and outdoor scenarios. Further enhanced with multi-dataset +joint training, PTv3 pushes these results to a higher level. + +
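The serialized neighbor mapping mentioned above relies on ordering points along a space-filling curve so that neighbors in the ordering are (mostly) neighbors in space. A plain Morton/Z-order encoding, shown below, illustrates the idea; PTv3's actual serialization patterns differ in detail, and the grid size and bit width here are arbitrary choices.

```python
import torch

def morton_order(points, grid_size=0.05, bits=16):
    """Indices that sort a point cloud along a Morton (Z-order) curve
    (illustrative sketch). points: (N, 3) float coordinates."""
    # Voxelize to non-negative integer grid coordinates.
    coords = ((points - points.min(0).values) / grid_size).long().clamp_(0, 2**bits - 1)

    def spread(v):            # interleave: insert two zero bits between the bits of v
        code = torch.zeros_like(v)
        for i in range(bits):
            code |= ((v >> i) & 1) << (3 * i)
        return code

    key = spread(coords[:, 0]) | (spread(coords[:, 1]) << 1) | (spread(coords[:, 2]) << 2)
    return torch.argsort(key)

pts = torch.rand(4096, 3)
serialized = pts[morton_order(pts)]   # neighbors in this order are (mostly) neighbors in space
print(serialized.shape)               # torch.Size([4096, 3])
```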
+
+ comment: CVPR 2024, code available at Pointcept + (https://github.com/Pointcept/PointTransformerV3) +
+
+
+
+
+ + ♻ ☆ A Forward and Backward Compatible Framework for Few-shot + Class-incremental Pill Recognition + + +
+ Automatic Pill Recognition (APR) systems are crucial for enhancing hospital +efficiency, assisting visually impaired individuals, and preventing +cross-infection. However, most existing deep learning-based pill recognition +systems can only perform classification on classes with sufficient training +data. In practice, the high cost of data annotation and the continuous increase +in new pill classes necessitate the development of a few-shot class-incremental +pill recognition system. This paper introduces the first few-shot +class-incremental pill recognition framework, named Discriminative and +Bidirectional Compatible Few-Shot Class-Incremental Learning (DBC-FSCIL). It +encompasses forward-compatible and backward-compatible learning components. In +forward-compatible learning, we propose an innovative virtual class synthesis +strategy and a Center-Triplet (CT) loss to enhance discriminative feature +learning. These virtual classes serve as placeholders in the feature space for +future class updates, providing diverse semantic knowledge for model training. +For backward-compatible learning, we develop a strategy to synthesize reliable +pseudo-features of old classes using uncertainty quantification, facilitating +Data Replay (DR) and Knowledge Distillation (KD). This approach allows for the +flexible synthesis of features and effectively reduces additional storage +requirements for samples and models. Additionally, we construct a new pill +image dataset for FSCIL and assess various mainstream FSCIL methods, +establishing new benchmarks. Our experimental results demonstrate that our +framework surpasses existing State-of-the-art (SOTA) methods. The code is +available at https://github.com/zhang-jinghua/DBC-FSCIL. + +
+
+
+
+
+ + ♻ ☆ Efficient Dataset Distillation via Minimax Diffusion CVPR 2024 + + +
+ Dataset distillation reduces the storage and computational consumption of +training a network by generating a small surrogate dataset that encapsulates +rich information of the original large-scale one. However, previous +distillation methods heavily rely on the sample-wise iterative optimization +scheme. As the images-per-class (IPC) setting or image resolution grows larger, +the necessary computation will demand overwhelming time and resources. In this +work, we intend to incorporate generative diffusion techniques for computing +the surrogate dataset. Observing that key factors for constructing an effective +surrogate dataset are representativeness and diversity, we design additional +minimax criteria in the generative training to enhance these facets for the +generated images of diffusion models. We present a theoretical model of the +process as hierarchical diffusion control demonstrating the flexibility of the +diffusion process to target these criteria without jeopardizing the +faithfulness of the sample to the desired distribution. The proposed method +achieves state-of-the-art validation performance while demanding much less +computational resources. Under the 100-IPC setting on ImageWoof, our method +requires less than one-twentieth the distillation time of previous methods, yet +yields even better performance. Source code and generated data are available in +https://github.com/vimar-gu/MinimaxDiffusion. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Zero-BEV: Zero-shot Projection of Any First-Person Modality to BEV Maps + + +
+ Bird's-eye view (BEV) maps are an important geometrically structured
+representation widely used in robotics, in particular self-driving vehicles and
+terrestrial robots. Existing algorithms either require depth information for
+the geometric projection, which is not always reliably available, or are
+trained end-to-end in a fully supervised way to map visual first-person
+observations to BEV representation, and are therefore restricted to the output
+modality they have been trained for. In contrast, we propose a new model
+capable of performing zero-shot projections of any modality available in a
+first person view to the corresponding BEV map. This is achieved by
+disentangling the geometric inverse perspective projection from the modality
+transformation, e.g., RGB to occupancy. The method is general and we showcase
+experiments projecting to BEV three different modalities: semantic
+segmentation, motion vectors and object bounding boxes detected in first
+person. We experimentally show that the model outperforms competing methods, in
+particular the widely used baseline resorting to monocular depth estimation.
+
+
+
+
+
+ + ♻ ☆ LAENeRF: Local Appearance Editing for Neural Radiance Fields CVPR 2024 + + +
+ Due to the omnipresence of Neural Radiance Fields (NeRFs), the interest +towards editable implicit 3D representations has surged over the last years. +However, editing implicit or hybrid representations as used for NeRFs is +difficult due to the entanglement of appearance and geometry encoded in the +model parameters. Despite these challenges, recent research has shown first +promising steps towards photorealistic and non-photorealistic appearance edits. +The main open issues of related work include limited interactivity, a lack of +support for local edits and large memory requirements, rendering them less +useful in practice. We address these limitations with LAENeRF, a unified +framework for photorealistic and non-photorealistic appearance editing of +NeRFs. To tackle local editing, we leverage a voxel grid as starting point for +region selection. We learn a mapping from expected ray terminations to final +output color, which can optionally be supervised by a style loss, resulting in +a framework which can perform photorealistic and non-photorealistic appearance +editing of selected regions. Relying on a single point per ray for our mapping, +we limit memory requirements and enable fast optimization. To guarantee +interactivity, we compose the output color using a set of learned, modifiable +base colors, composed with additive layer mixing. Compared to concurrent work, +LAENeRF enables recoloring and stylization while keeping processing time low. +Furthermore, we demonstrate that our approach surpasses baseline methods both +quantitatively and qualitatively. + +
+
+ comment: Accepted to CVPR 2024! Project website: + https://r4dl.github.io/LAENeRF/ +
+
+
+
+
+ + ♻ ☆ A Call to Reflect on Evaluation Practices for Age Estimation: + Comparative Analysis of the State-of-the-Art and a Unified Benchmark CVPR 2024 + + +
+ Comparing different age estimation methods poses a challenge due to the +unreliability of published results stemming from inconsistencies in the +benchmarking process. Previous studies have reported continuous performance +improvements over the past decade using specialized methods; however, our +findings challenge these claims. This paper identifies two trivial, yet +persistent issues with the currently used evaluation protocol and describes how +to resolve them. We offer an extensive comparative analysis for +state-of-the-art facial age estimation methods. Surprisingly, we find that the +performance differences between the methods are negligible compared to the +effect of other factors, such as facial alignment, facial coverage, image +resolution, model architecture, or the amount of data used for pretraining. We +use the gained insights to propose using FaRL as the backbone model and +demonstrate its effectiveness on all public datasets. We make the source code +and exact data splits public on GitHub. + +
+
+ comment: CVPR 2024 Camera-Ready +
+
+
+
+
+ + ♻ ☆ Investigating and Mitigating the Side Effects of Noisy Views for + Self-Supervised Clustering Algorithms in Practical Multi-View Scenarios + + +
+ Multi-view clustering (MVC) aims at exploring category structures among +multi-view data in self-supervised manners. Multiple views provide more +information than single views and thus existing MVC methods can achieve +satisfactory performance. However, their performance might seriously degenerate +when the views are noisy in practical multi-view scenarios. In this paper, we +formally investigate the drawback of noisy views and then propose a +theoretically grounded deep MVC method (namely MVCAN) to address this issue. +Specifically, we propose a novel MVC objective that enables un-shared +parameters and inconsistent clustering predictions across multiple views to +reduce the side effects of noisy views. Furthermore, a two-level multi-view +iterative optimization is designed to generate robust learning targets for +refining individual views' representation learning. Theoretical analysis +reveals that MVCAN works by achieving the multi-view consistency, +complementarity, and noise robustness. Finally, experiments on extensive public +datasets demonstrate that MVCAN outperforms state-of-the-art methods and is +robust against the existence of noisy views. + +
+
+
+
+
+ + ♻ ☆ V4D: Voxel for 4D Novel View Synthesis + + +
+ Neural radiance fields have made a remarkable breakthrough in the novel view
+synthesis task for 3D static scenes. However, for the 4D case (e.g., dynamic
+scenes), the performance of existing methods is still limited by the capacity
+of the neural network, typically a multilayer perceptron network (MLP). In this
+paper, we utilize 3D voxels to model the 4D neural radiance field, V4D for
+short, where the 3D voxel has two formats. The first one is to regularly model
+the 3D space and then use the sampled local 3D feature with the time index to
+model the density field and the texture field by a tiny MLP. The second is a
+look-up table (LUT) format for pixel-level refinement, where the pseudo-surface
+produced by the volume rendering is utilized as the guidance information to
+learn a 2D pixel-level refinement mapping. The proposed LUT-based refinement
+module achieves the performance gain with little computational cost and could
+serve as a plug-and-play module in the novel view synthesis task. Moreover, we
+propose a more effective conditional positional encoding for the 4D data that
+achieves performance gain with negligible computational burdens. Extensive
+experiments demonstrate that the proposed method achieves state-of-the-art
+performance at a low computational cost.
+
+
+ comment: Code released. Accepted by IEEE TVCG 2023 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 111 + +
+
+
+ + ☆ AutoInst: Automatic Instance-Based Segmentation of LiDAR 3D Scans + + +
+ Recently, progress in acquisition equipment such as LiDAR sensors has enabled +sensing increasingly spacious outdoor 3D environments. Making sense of such 3D +acquisitions requires fine-grained scene understanding, such as constructing +instance-based 3D scene segmentations. Commonly, a neural network is trained +for this task; however, this requires access to a large, densely annotated +dataset, which is widely known to be challenging to obtain. To address this +issue, in this work we propose to predict instance segmentations for 3D scenes +in an unsupervised way, without relying on ground-truth annotations. To this +end, we construct a learning framework consisting of two components: (1) a +pseudo-annotation scheme for generating initial unsupervised pseudo-labels; and +(2) a self-training algorithm for instance segmentation to fit robust, accurate +instances from initial noisy proposals. To enable generating 3D instance mask +proposals, we construct a weighted proxy-graph by connecting 3D points with +edges integrating multi-modal image- and point-based self-supervised features, +and perform graph-cuts to isolate individual pseudo-instances. We then build on +a state-of-the-art point-based architecture and train a 3D instance +segmentation model, resulting in significant refinement of initial proposals. +To scale to arbitrary complexity 3D scenes, we design our algorithm to operate +on local 3D point chunks and construct a merging step to generate scene-level +instance segmentations. Experiments on the challenging SemanticKITTI benchmark +demonstrate the potential of our approach, where it attains 13.3% higher +Average Precision and 9.1% higher F1 score compared to the best-performing +baseline. The code will be made publicly available at +https://github.com/artonson/autoinst. + +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ latentSplat: Autoencoding Variational Gaussians for Fast Generalizable + 3D Reconstruction + + +
+ We present latentSplat, a method to predict semantic Gaussians in a 3D latent +space that can be splatted and decoded by a light-weight generative 2D +architecture. Existing methods for generalizable 3D reconstruction either do +not enable fast inference of high resolution novel views due to slow volume +rendering, or are limited to interpolation of close input views, even in +simpler settings with a single central object, where 360-degree generalization +is possible. In this work, we combine a regression-based approach with a +generative model, moving towards both of these capabilities within the same +method, trained purely on readily available real video data. The core of our +method are variational 3D Gaussians, a representation that efficiently encodes +varying uncertainty within a latent space consisting of 3D feature Gaussians. +From these Gaussians, specific instances can be sampled and rendered via +efficient Gaussian splatting and a fast, generative decoder network. We show +that latentSplat outperforms previous works in reconstruction quality and +generalization, while being fast and scalable to high-resolution data. + +
+
+ comment: Project website: https://geometric-rl.mpi-inf.mpg.de/latentsplat/ +
+
+
+
+
+ + ☆ HemoSet: The First Blood Segmentation Dataset for Automation of + Hemostasis Management + + +
+ Hemorrhaging occurs in surgeries of all types, forcing surgeons to quickly +adapt to the visual interference that results from blood rapidly filling the +surgical field. Introducing automation into the crucial surgical task of +hemostasis management would offload mental and physical tasks from the surgeon +and surgical assistants while simultaneously increasing the efficiency and +safety of the operation. The first step in automation of hemostasis management +is detection of blood in the surgical field. To propel the development of blood +detection algorithms in surgeries, we present HemoSet, the first blood +segmentation dataset based on bleeding during a live animal robotic surgery. +Our dataset features vessel hemorrhage scenarios where turbulent flow leads to +abnormal pooling geometries in surgical fields. These pools are formed in +conditions endemic to surgical procedures -- uneven heterogeneous tissue, under +glossy lighting conditions and rapid tool movement. We benchmark several +state-of-the-art segmentation models and provide insight into the difficulties +specific to blood detection. We intend for HemoSet to spur development of +autonomous blood suction tools by providing a platform for training and +refining blood segmentation models, addressing the precision needed for such +robotics. + +
+
+
+
+
+ + ☆ AVicuna: Audio-Visual LLM with Interleaver and Context-Boundary + Alignment for Temporal Referential Dialogue + + +
+ In everyday communication, humans frequently use speech and gestures to refer +to specific areas or objects, a process known as Referential Dialogue (RD). +While prior studies have investigated RD through Large Language Models (LLMs) +or Large Multimodal Models (LMMs) in static contexts, the exploration of +Temporal Referential Dialogue (TRD) within audio-visual media remains limited. +Two primary challenges hinder progress in this field: (1) the absence of +comprehensive, untrimmed audio-visual video datasets with precise temporal +annotations, and (2) the need for methods to integrate complex temporal +auditory and visual cues effectively. To address these challenges, we introduce +a novel framework to generate PU-VALOR, an extensive audio-visual dataset +comprising over 114,000 untrimmed videos with accurate temporal demarcations. +We also present AVicuna, featuring an Audio-Visual Tokens Interleaver (AVTI) +that ensures the temporal alignment of audio-visual information. Additionally, +we develop the A5-222K dataset, encompassing more than 200,000 audio-text +pairings, to facilitate the audio and text alignments. Our experiments +demonstrate that AVicuna can effectively handle TRD in audio-visual videos and +achieve state-of-the-art performance on various audio-visual video +understanding tasks, particularly in untrimmed videos. We further investigate +the optimal audio-interleaving rate for interleaved audio-visual inputs, which +maximizes performance on the Audio-Visual Event Dense Localization task. + +
+
+
+
+
+ + ☆ L-MAE: Longitudinal masked auto-encoder with time and severity-aware + encoding for diabetic retinopathy progression prediction + + +
+ Pre-training strategies based on self-supervised learning (SSL) have proven +to be effective pretext tasks for many downstream tasks in computer vision. Due +to the significant disparity between medical and natural images, the +application of typical SSL is not straightforward in medical imaging. +Additionally, those pretext tasks often lack context, which is critical for +computer-aided clinical decision support. In this paper, we developed a +longitudinal masked auto-encoder (MAE) based on the well-known +Transformer-based MAE. In particular, we explored the importance of time-aware +position embedding as well as disease progression-aware masking. Taking into +account the time between examinations instead of just scheduling them offers +the benefit of capturing temporal changes and trends. The masking strategy, for +its part, evolves during follow-up to better capture pathological changes, +ensuring a more accurate assessment of disease progression. Using OPHDIAT, a +large follow-up screening dataset targeting diabetic retinopathy (DR), we +evaluated the pre-trained weights on a longitudinal task, which is to predict +the severity label of the next visit within 3 years based on the past time +series examinations. Our results demonstrated the relevancy of both time-aware +position embedding and masking strategies based on disease progression +knowledge. Compared to popular baseline models and standard longitudinal +Transformers, these simple yet effective extensions significantly enhance the +predictive ability of deep classification models. + +
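A time-aware position embedding of the kind discussed above can be sketched by feeding elapsed days, rather than the visit index, into a standard sinusoidal encoding; the exact formulation and period used in the paper may differ.

```python
import torch

def time_aware_encoding(days, dim=128, max_period=3650.0):
    """Sinusoidal encoding of elapsed time in days rather than visit index
    (illustrative sketch). days: (B, T) days since the first examination."""
    half = dim // 2
    freqs = torch.exp(-torch.arange(half, dtype=torch.float32)
                      * torch.log(torch.tensor(max_period)) / half)      # (half,)
    angles = days.unsqueeze(-1).float() * freqs                          # (B, T, half)
    return torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)     # (B, T, dim)

visits = torch.tensor([[0.0, 180.0, 400.0, 790.0]])   # irregular follow-up intervals
print(time_aware_encoding(visits).shape)              # torch.Size([1, 4, 128])
```

Because the encoding depends on the actual gap between examinations, two visits six months apart and two visits two years apart receive distinguishable embeddings even though their sequence positions are the same.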
+
+
+
+
+ + ☆ Object Detectors in the Open Environment: Challenges, Solutions, and + Outlook + + +
+ With the emergence of foundation models, deep learning-based object detectors
+have shown practical usability in closed set scenarios. However, for real-world
+tasks, object detectors often operate in open environments, where crucial
+factors (e.g., data distribution, objective) that influence model learning are
+often changing. The dynamic and intricate nature of the open environment poses
+novel and formidable challenges to object detectors. Unfortunately, current
+research on object detectors in open environments lacks a comprehensive
+analysis of their distinctive characteristics, challenges, and corresponding
+solutions, which hinders their secure deployment in critical real-world
+scenarios. This paper aims to bridge this gap by conducting a comprehensive
+review and analysis of object detectors in open environments. We initially
+identified limitations of key structural components within the existing
+detection pipeline and propose the open environment object detector challenge
+framework that includes four quadrants (i.e., out-of-domain, out-of-category,
+robust learning, and incremental learning) based on the dimensions of the data
+/ target changes. For each quadrant of challenges in the proposed framework, we
+present a detailed description and systematic analysis of the overarching goals
+and core difficulties, systematically review the corresponding solutions, and
+benchmark their performance over multiple widely adopted datasets. In addition,
+we engage in a discussion of open problems and potential avenues for future
+research. This paper aims to provide a fresh, comprehensive, and systematic
+understanding of the challenges and solutions associated with open-environment
+object detectors, thus catalyzing the development of more solid applications in
+real-world scenarios.
+
+
+ comment: 32 pages, 17 figures +
+
+
+
+
+ + ☆ Constricting Normal Latent Space for Anomaly Detection with Normal-only + Training Data ICLR + + +
+ In order to devise an anomaly detection model using only normal training +data, an autoencoder (AE) is typically trained to reconstruct the data. As a +result, the AE can extract normal representations in its latent space. During +test time, since AE is not trained using real anomalies, it is expected to +poorly reconstruct the anomalous data. However, several researchers have +observed that it is not the case. In this work, we propose to limit the +reconstruction capability of AE by introducing a novel latent constriction +loss, which is added to the existing reconstruction loss. By using our method, +no extra computational cost is added to the AE during test time. Evaluations +using three video anomaly detection benchmark datasets, i.e., Ped2, Avenue, and +ShanghaiTech, demonstrate the effectiveness of our method in limiting the +reconstruction capability of AE, which leads to a better anomaly detection +model. + +
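A minimal sketch of adding a latent constriction term to a plain autoencoder loss is given below; the concrete constriction used here (pulling latent codes toward their batch centroid) is an assumption for illustration, not the paper's exact loss, but it shows how such a term is train-time only and adds nothing at inference.

```python
import torch
import torch.nn as nn

class ConstrictedAE(nn.Module):
    """Autoencoder trained with reconstruction + latent constriction
    (illustrative sketch)."""
    def __init__(self, in_dim=1024, latent_dim=64, lam=0.1):
        super().__init__()
        self.enc = nn.Sequential(nn.Linear(in_dim, 256), nn.ReLU(), nn.Linear(256, latent_dim))
        self.dec = nn.Sequential(nn.Linear(latent_dim, 256), nn.ReLU(), nn.Linear(256, in_dim))
        self.lam = lam

    def loss(self, x):
        z = self.enc(x)
        recon = self.dec(z)
        rec_loss = (recon - x).pow(2).mean()
        # Constrict the normal latent space: pull codes toward the batch centroid.
        constrict = (z - z.mean(dim=0, keepdim=True)).pow(2).mean()
        return rec_loss + self.lam * constrict

model = ConstrictedAE()
print(model.loss(torch.randn(32, 1024)).item())
```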
+
+ comment: ICLR Workshop 2024 (PML4LRS) +
+
+
+
+
+ + ☆ Emotion Recognition from the perspective of Activity Recognition + + +
+ Applications of an efficient emotion recognition system can be found in +several domains such as medicine, driver fatigue surveillance, social robotics, +and human-computer interaction. Appraising human emotional states, behaviors, +and reactions displayed in real-world settings can be accomplished using latent +continuous dimensions. Continuous dimensional models of human affect, such as +those based on valence and arousal are more accurate in describing a broad +range of spontaneous everyday emotions than more traditional models of discrete +stereotypical emotion categories (e.g. happiness, surprise). Most of the prior +work on estimating valence and arousal considers laboratory settings and acted +data. But, for emotion recognition systems to be deployed and integrated into +real-world mobile and computing devices, we need to consider data collected in +the world. Action recognition is a domain of Computer Vision that involves +capturing complementary information on appearance from still frames and motion +between frames. In this paper, we treat emotion recognition from the +perspective of action recognition by exploring the application of deep learning +architectures specifically designed for action recognition, for continuous +affect recognition. We propose a novel three-stream end-to-end deep learning +regression pipeline with an attention mechanism, which is an ensemble design +based on sub-modules of multiple state-of-the-art action recognition systems. +The pipeline constitutes a novel data pre-processing approach with a spatial +self-attention mechanism to extract keyframes. The optical flow of +high-attention regions of the face is extracted to capture temporal context. +AFEW-VA in-the-wild dataset has been used to conduct comparative experiments. +Quantitative analysis shows that the proposed model outperforms multiple +standard baselines of both emotion recognition and action recognition models. + +
+
+
+
+
+ + ☆ Out-of-Distribution Detection via Deep Multi-Comprehension Ensemble + + +
+ Recent research underscores the pivotal role of the Out-of-Distribution (OOD) +feature representation field scale in determining the efficacy of models in OOD +detection. Consequently, the adoption of model ensembles has emerged as a +prominent strategy to augment this feature representation field, capitalizing +on anticipated model diversity. + However, our introduction of novel qualitative and quantitative model +ensemble evaluation methods, specifically Loss Basin/Barrier Visualization and +the Self-Coupling Index, reveals a critical drawback in existing ensemble +methods. We find that these methods incorporate weights that are +affine-transformable, exhibiting limited variability and thus failing to +achieve the desired diversity in feature representation. + To address this limitation, we elevate the dimensions of traditional model +ensembles, incorporating various factors such as different weight +initializations, data holdout, etc., into distinct supervision tasks. This +innovative approach, termed Multi-Comprehension (MC) Ensemble, leverages +diverse training tasks to generate distinct comprehensions of the data and +labels, thereby extending the feature representation field. + Our experimental results demonstrate the superior performance of the MC +Ensemble strategy in OOD detection compared to both the naive Deep Ensemble +method and a standalone model of comparable size. This underscores the +effectiveness of our proposed approach in enhancing the model's capability to +detect instances outside its training distribution. + +
+
+
+
+
+ + ☆ Laplacian-guided Entropy Model in Neural Codec with Blur-dissipated + Synthesis CVPR2024 + + +
+ While replacing Gaussian decoders with a conditional diffusion model enhances +the perceptual quality of reconstructions in neural image compression, their +lack of inductive bias for image data restricts their ability to achieve +state-of-the-art perceptual levels. To address this limitation, we adopt a +non-isotropic diffusion model at the decoder side. This model imposes an +inductive bias aimed at distinguishing between frequency contents, thereby +facilitating the generation of high-quality images. Moreover, our framework is +equipped with a novel entropy model that accurately models the probability +distribution of latent representation by exploiting spatio-channel correlations +in latent space, while accelerating the entropy decoding step. This +channel-wise entropy model leverages both local and global spatial contexts +within each channel chunk. The global spatial context is built upon the +Transformer, which is specifically designed for image compression tasks. The +designed Transformer employs a Laplacian-shaped positional encoding, the +learnable parameters of which are adaptively adjusted for each channel cluster. +Our experiments demonstrate that our proposed framework yields better +perceptual quality compared to cutting-edge generative-based codecs, and the +proposed entropy model contributes to notable bitrate savings. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Unlearning Backdoor Threats: Enhancing Backdoor Defense in Multimodal + Contrastive Learning via Local Token Unlearning + + +
+ Multimodal contrastive learning has emerged as a powerful paradigm for +building high-quality features using the complementary strengths of various +data modalities. However, the open nature of such systems inadvertently +increases the possibility of backdoor attacks. These attacks subtly embed +malicious behaviors within the model during training, which can be activated by +specific triggers in the inference phase, posing significant security risks. +Despite existing countermeasures through fine-tuning that reduce the adverse +impacts of such attacks, these defenses often degrade the clean accuracy and +necessitate the construction of extensive clean training pairs. In this paper, +we explore the possibility of a less-cost defense from the perspective of model +unlearning, that is, whether the model can be made to quickly \textbf{u}nlearn +\textbf{b}ackdoor \textbf{t}hreats (UBT) by constructing a small set of +poisoned samples. Specifically, we strengthen the backdoor shortcuts to +discover suspicious samples through overfitting training prioritized by weak +similarity samples. Building on the initial identification of suspicious +samples, we introduce an innovative token-based localized forgetting training +regime. This technique specifically targets the poisoned aspects of the model, +applying a focused effort to unlearn the backdoor associations and trying not +to damage the integrity of the overall model. Experimental results show that +our method not only ensures a minimal success rate for attacks, but also +preserves the model's high clean accuracy. + +
+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ☆ Partially Blinded Unlearning: Class Unlearning for Deep Networks a + Bayesian Perspective + + +
+ In order to adhere to regulatory standards governing individual data privacy +and safety, machine learning models must systematically eliminate information +derived from specific subsets of a user's training data that can no longer be +utilized. The emerging discipline of Machine Unlearning has arisen as a pivotal +area of research, facilitating the process of selectively discarding +information designated to specific sets or classes of data from a pre-trained +model, thereby eliminating the necessity for extensive retraining from scratch. +The principal aim of this study is to formulate a methodology tailored for the +purposeful elimination of information linked to a specific class of data from a +pre-trained classification network. This intentional removal is crafted to +degrade the model's performance specifically concerning the unlearned data +class while concurrently minimizing any detrimental impacts on the model's +performance in other classes. To achieve this goal, we frame the class +unlearning problem from a Bayesian perspective, which yields a loss function +that minimizes the log-likelihood associated with the unlearned data with a +stability regularization in parameter space. This stability regularization +incorporates the Mahalanobis distance with respect to the Fisher Information matrix +and the $l_2$ distance from the pre-trained model parameters. Our novel approach, +termed \textbf{Partially-Blinded Unlearning (PBU)}, surpasses existing +state-of-the-art class unlearning methods, demonstrating superior +effectiveness. Notably, PBU achieves this efficacy without requiring awareness +of the entire training dataset but only of the unlearned data points, marking a +distinctive feature of its performance. + 
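A rough sketch of the loss structure the abstract describes: a term that lowers the log-likelihood of the forget-class data, a diagonal-Fisher (Mahalanobis-style) penalty in parameter space, and an $l_2$ anchor to the pre-trained weights. The weights `alpha`/`beta`, the diagonal Fisher approximation, and the function name are assumptions; this is not the authors' implementation.

```python
import torch
import torch.nn.functional as F

def pbu_style_loss(model, pretrained_params, fisher_diag, x_forget, y_forget,
                   alpha=1.0, beta=1.0):
    """Illustrative unlearning objective on forget-class samples only.

    pretrained_params / fisher_diag: lists of tensors matching
    model.parameters() in order (a stated assumption of this sketch).
    """
    logits = model(x_forget)
    # Minimizing -CE maximizes cross-entropy, i.e. lowers the forget-class log-likelihood.
    forget_term = -F.cross_entropy(logits, y_forget)

    stability, l2_anchor = 0.0, 0.0
    for (_, p), p0, f in zip(model.named_parameters(), pretrained_params, fisher_diag):
        stability = stability + (f * (p - p0) ** 2).sum()   # Fisher-weighted distance
        l2_anchor = l2_anchor + ((p - p0) ** 2).sum()       # plain l2 to pretrained weights

    return forget_term + alpha * stability + beta * l2_anchor
```

In practice the forget term would typically be clipped or scheduled, since driving the forget-class likelihood down is unbounded; the sketch omits such details.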
+
+
+
+
+ + ☆ On the Equivalency, Substitutability, and Flexibility of Synthetic Data + + +
+ We study, from an empirical standpoint, the efficacy of synthetic data in +real-world scenarios. Leveraging synthetic data for training perception models +has become a key strategy embraced by the community due to its efficiency, +scalability, perfect annotations, and low costs. Despite proven advantages, few +studies put their stress on how to efficiently generate synthetic datasets to +solve real-world problems and to what extent synthetic data can reduce the +effort for real-world data collection. To answer the questions, we +systematically investigate several interesting properties of synthetic data -- +the equivalency of synthetic data to real-world data, the substitutability of +synthetic data for real data, and the flexibility of synthetic data generators +to close up domain gaps. Leveraging the M3Act synthetic data generator, we +conduct experiments on DanceTrack and MOT17. Our results suggest that synthetic +data not only enhances model performance but also demonstrates substitutability +for real data, with 60% to 80% replacement without performance loss. In +addition, our study of the impact of synthetic data distributions on downstream +performance reveals the importance of flexible data generators in narrowing +domain gaps for improved model adaptability. + +
+
+
+
+
+ + ☆ Adversarially Masked Video Consistency for Unsupervised Domain + Adaptation + + +
+ We study the problem of unsupervised domain adaptation for egocentric videos. +We propose a transformer-based model to learn class-discriminative and +domain-invariant feature representations. It consists of two novel designs. The +first module is called Generative Adversarial Domain Alignment Network with the +aim of learning domain-invariant representations. It simultaneously learns a +mask generator and a domain-invariant encoder in an adversarial way. The +domain-invariant encoder is trained to minimize the distance between the source +and target domain. The masking generator, conversely, aims at producing +challenging masks by maximizing the domain distance. The second is a Masked +Consistency Learning module to learn class-discriminative representations. It +enforces the prediction consistency between the masked target videos and their +full forms. To better evaluate the effectiveness of domain adaptation methods, +we construct a more challenging benchmark for egocentric videos, U-Ego4D. Our +method achieves state-of-the-art performance on the Epic-Kitchen and the +proposed U-Ego4D benchmark. + +
+
+
+
+
+ + ☆ Low Rank Groupwise Deformations for Motion Tracking in Cardiac Cine MRI + + +
+ Diffeomorphic image registration is a commonly used method to deform one +image to resemble another. While warping a single image to another is useful, +it can be advantageous to warp multiple images simultaneously, such as in +tracking the motion of the heart across a sequence of images. In this paper, +our objective is to propose a novel method capable of registering a group or +sequence of images to a target image, resulting in registered images that +appear identical and therefore have a low rank. Moreover, we aim for these +registered images to closely resemble the target image. Through experimental +evidence, we will demonstrate our method's superior efficacy in producing +low-rank groupwise deformations compared to other state-of-the-art approaches. + +
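One common way to express the "registered images should have low rank" idea is a nuclear-norm penalty on the stacked warped images. The snippet below is only an illustration of that generic penalty, not the authors' loss; shapes and the choice of a plain nuclear norm are assumptions.

```python
import torch

def low_rank_penalty(warped_images):
    """Nuclear-norm surrogate encouraging a group of registered images to be low rank.

    warped_images: tensor of shape (N, H, W). Each image is flattened into one row,
    so perfectly aligned, identical images give a rank-1 matrix.
    """
    n = warped_images.shape[0]
    m = warped_images.reshape(n, -1)            # (N, H*W)
    singular_values = torch.linalg.svdvals(m)   # differentiable singular values
    return singular_values.sum()                # nuclear norm
```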
+
+ comment: A thesis submitted to the University of Birmingham for MSc Degree +
+
+
+
+
+ + ☆ Dual-modal Prior Semantic Guided Infrared and Visible Image Fusion for + Intelligent Transportation System + + +
+ Infrared and visible image fusion (IVF) plays an important role in +intelligent transportation systems (ITS). Early works predominantly focus on +boosting the visual appeal of the fused result, and only several recent +approaches have tried to combine the high-level vision task with IVF. However, +they prioritize the design of a cascaded structure to seek unified suitable +features and fit different tasks. Thus, they typically tend to bias toward +reconstructing raw pixels without considering the significance of semantic +features. Therefore, we propose a novel prior semantic guided image fusion +method based on the dual-modality strategy, improving the performance of IVF in +ITS. Specifically, to explore the independent, significant semantics of each +modality, we first design two parallel semantic segmentation branches with a +refined feature adaptive-modulation (RFaM) mechanism. RFaM can perceive the +features that are semantically distinct enough in each semantic segmentation +branch. Then, two pilot experiments based on the two branches are conducted to +capture the significant prior semantics of the two images, which are then applied to +guide the fusion task in the integration of semantic segmentation branches and +fusion branches. In addition, to aggregate both high-level semantics and +impressive visual effects, we further investigate the frequency response of the +prior semantics, and propose a multi-level representation-adaptive fusion +(MRaF) module to explicitly integrate the low-frequency prior semantics with the +high-frequency details. Extensive experiments on two public datasets demonstrate +the superiority of our method over the state-of-the-art image fusion +approaches, in terms of both the visual appeal and the high-level semantics. + 
+
+
+
+
+ + ☆ Inverse Rendering of Glossy Objects via the Neural Plenoptic Function + and Radiance Fields CVPR 2024 + + +
+ Inverse rendering aims at recovering both geometry and materials of objects. +It provides a more compatible reconstruction for conventional rendering +engines, compared with the neural radiance fields (NeRFs). On the other hand, +existing NeRF-based inverse rendering methods cannot handle glossy objects with +local light interactions well, as they typically oversimplify the illumination +as a 2D environmental map, which assumes infinite lights only. Observing the +superiority of NeRFs in recovering radiance fields, we propose a novel 5D +Neural Plenoptic Function (NeP) based on NeRFs and ray tracing, such that more +accurate lighting-object interactions can be formulated via the rendering +equation. We also design a material-aware cone sampling strategy to efficiently +integrate lights inside the BRDF lobes with the help of pre-filtered radiance +fields. Our method has two stages: the geometry of the target object and the +pre-filtered environmental radiance fields are reconstructed in the first +stage, and materials of the target object are estimated in the second stage +with the proposed NeP and material-aware cone sampling strategy. Extensive +experiments on the proposed real-world and synthetic datasets demonstrate that +our method can reconstruct high-fidelity geometry/materials of challenging +glossy objects with complex lighting interactions from nearby objects. Project +webpage: https://whyy.site/paper/nep + +
+
+ comment: CVPR 2024 paper. Project webpage https://whyy.site/paper/nep +
+
+
+
+
+ + ☆ Exemplar-Free Class Incremental Learning via Incremental Representation + + +
+ Exemplar-Free Class Incremental Learning (efCIL) aims to continuously +incorporate the knowledge from new classes while retaining previously learned +information, without storing any old-class exemplars (i.e., samples). For this +purpose, various efCIL methods have been proposed over the past few years, +generally with elaborately constructed old pseudo-features, increasing the +difficulty of model development and interpretation. In contrast, we propose a +\textbf{simple Incremental Representation (IR) framework} for efCIL without +constructing old pseudo-features. IR utilizes dataset augmentation to cover a +suitable feature space and prevents the model from forgetting by using a single +L2 space maintenance loss. We discard the transient classifier trained on each +of the sequential tasks and instead replace it with a 1-nearest-neighbor +classifier for inference, ensuring the representation is incrementally updated +during CIL. Extensive experiments demonstrate that our proposed IR achieves +comparable performance while significantly preventing the model from forgetting +on CIFAR100, TinyImageNet, and ImageNetSubset datasets. + 
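The inference step described above (replacing the transient linear head with a 1-nearest-neighbor classifier over the learned representation) can be sketched as follows. How the reference embeddings per class are obtained is not specified in the abstract, so `stored_feats`/`stored_labels` are placeholders.

```python
import torch
import torch.nn.functional as F

@torch.no_grad()
def one_nn_predict(backbone, x, stored_feats, stored_labels):
    """Illustrative 1-nearest-neighbor inference for exemplar-free CIL.

    stored_feats: (M, d) reference embeddings for classes seen so far.
    stored_labels: (M,) class id of each reference embedding.
    """
    feats = F.normalize(backbone(x), dim=1)       # (B, d) query embeddings
    refs = F.normalize(stored_feats, dim=1)       # (M, d) reference embeddings
    sims = feats @ refs.T                         # cosine similarity to every reference
    nearest = sims.argmax(dim=1)                  # index of the closest reference
    return stored_labels[nearest]
```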
+
+
+
+
+ + ☆ Leveraging Deep Learning and Xception Architecture for High-Accuracy MRI + Classification in Alzheimer Diagnosis + + +
+ Exploring the application of deep learning technologies in the field of +medical diagnostics, Magnetic Resonance Imaging (MRI) provides a unique +perspective for observing and diagnosing complex neurodegenerative diseases +such as Alzheimer Disease (AD). With advancements in deep learning, +particularly in Convolutional Neural Networks (CNNs) and the Xception network +architecture, we are now able to analyze and classify vast amounts of MRI data +with unprecedented accuracy. The progress of this technology not only enhances +our understanding of brain structural changes but also opens up new avenues for +monitoring disease progression through non-invasive means and potentially +allows for precise diagnosis in the early stages of the disease. + This study aims to classify MRI images using deep learning models to identify +different stages of Alzheimer Disease through a series of innovative data +processing and model construction steps. Our experimental results show that the +deep learning framework based on the Xception model achieved a 99.6% accuracy +rate in the multi-class MRI image classification task, demonstrating its +potential application value in assistive diagnosis. Future research will focus +on expanding the dataset, improving model interpretability, and clinical +validation to further promote the application of deep learning technology in +the medical field, with the hope of bringing earlier diagnosis and more +personalized treatment plans to Alzheimer Disease patients. + +
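For readers who want a concrete starting point, a typical Xception transfer-learning setup for multi-class MRI classification looks like the sketch below. It mirrors only the general recipe implied by the abstract (ImageNet-pretrained Xception backbone plus a small classification head); the authors' exact head, preprocessing, class count, and hyperparameters are not specified, so those values here are assumptions.

```python
import tensorflow as tf

def build_xception_classifier(num_classes=4, input_shape=(299, 299, 3)):
    """ImageNet-pretrained Xception backbone with a simple softmax head."""
    base = tf.keras.applications.Xception(
        include_top=False, weights="imagenet", input_shape=input_shape)
    base.trainable = False  # optionally unfreeze later for fine-tuning

    inputs = tf.keras.Input(shape=input_shape)
    x = tf.keras.applications.xception.preprocess_input(inputs)
    x = base(x, training=False)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    outputs = tf.keras.layers.Dense(num_classes, activation="softmax")(x)

    model = tf.keras.Model(inputs, outputs)
    model.compile(optimizer="adam",
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])
    return model
```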
+
+
+
+
+ + ☆ Frankenstein: Generating Semantic-Compositional 3D Scenes in One + Tri-Plane + + +
+ We present Frankenstein, a diffusion-based framework that can generate +semantic-compositional 3D scenes in a single pass. Unlike existing methods that +output a single, unified 3D shape, Frankenstein simultaneously generates +multiple separated shapes, each corresponding to a semantically meaningful +part. The 3D scene information is encoded in a single tri-plane tensor, from +which multiple Signed Distance Function (SDF) fields can be decoded to +represent the compositional shapes. During training, an auto-encoder compresses +tri-planes into a latent space, and then the denoising diffusion process is +employed to approximate the distribution of the compositional scenes. +Frankenstein demonstrates promising results in generating room interiors as +well as human avatars with automatically separated parts. The generated scenes +facilitate many downstream applications, such as part-wise re-texturing, object +rearrangement in the room or avatar cloth re-targeting. + 
+
+ comment: Video: https://youtu.be/lRn-HqyCrLI +
+
+
+
+
+ + ☆ Image Captioning in news report scenario + + +
+ Image captioning strives to generate pertinent captions for specified images, +situating itself at the crossroads of Computer Vision (CV) and Natural Language +Processing (NLP). This endeavor is of paramount importance with far-reaching +applications in recommendation systems, news outlets, social media, and beyond. +Particularly within the realm of news reporting, captions are expected to +encompass detailed information, such as the identities of celebrities captured +in the images. However, much of the existing body of work primarily centers +around understanding scenes and actions. In this paper, we explore the realm of +image captioning specifically tailored for celebrity photographs, illustrating +its broad potential for enhancing news industry practices. This exploration +aims to augment automated news content generation, thereby facilitating a more +nuanced dissemination of information. Our endeavor shows a broader horizon, +enriching the narrative in news reporting through a more intuitive image +captioning framework. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ Skull-to-Face: Anatomy-Guided 3D Facial Reconstruction and Editing + + +
+ Deducing the 3D face from a skull is an essential but challenging task in +forensic science and archaeology. Existing methods for automated facial +reconstruction yield inaccurate results, suffering from the non-determinative +nature of the problem that a skull with a sparse set of tissue depth cannot +fully determine the skinned face. Additionally, their texture-less results +require further post-processing stages to achieve a photo-realistic appearance. +This paper proposes an end-to-end 3D face reconstruction and exploration tool, +providing textured 3D faces for reference. With the help of state-of-the-art +text-to-image diffusion models and image-based facial reconstruction +techniques, we generate an initial reference 3D face, whose biological profile +aligns with the given skull. We then adapt these initial faces to meet the +statistical expectations of extruded anatomical landmarks on the skull through +an optimization process. The joint statistical distribution of tissue depths is +learned on a small set of anatomical landmarks on the skull. To support further +adjustment, we propose an efficient face adaptation tool to assist users in +tuning tissue depths, either globally or at local regions, while observing +plausible visual feedback. Experiments conducted on a real skull-face dataset +demonstrated the effectiveness of our proposed pipeline in terms of +reconstruction accuracy, diversity, and stability. + +
+
+
+
+
+ + ☆ Blur2Blur: Blur Conversion for Unsupervised Image Deblurring on Unknown + Domains CVPR 2024 + + +
+ This paper presents an innovative framework designed to train an image +deblurring algorithm tailored to a specific camera device. This algorithm works +by transforming a blurry input image, which is challenging to deblur, into +another blurry image that is more amenable to deblurring. The transformation +process, from one blurry state to another, leverages unpaired data consisting +of sharp and blurry images captured by the target camera device. Learning this +blur-to-blur transformation is inherently simpler than direct blur-to-sharp +conversion, as it primarily involves modifying blur patterns rather than the +intricate task of reconstructing fine image details. The efficacy of the +proposed approach has been demonstrated through comprehensive experiments on +various benchmarks, where it significantly outperforms state-of-the-art methods +both quantitatively and qualitatively. Our code and data are available at +https://zero1778.github.io/blur2blur/ + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ FH-SSTNet: Forehead Creases based User Verification using Spatio-Spatial + Temporal Network + + +
+ Biometric authentication, which utilizes contactless features, such as +forehead patterns, has become increasingly important for identity verification +and access management. The proposed method is based on learning a 3D +spatio-spatial temporal convolution to create detailed pictures of forehead +patterns. We introduce a new CNN model called the Forehead Spatio-Spatial +Temporal Network (FH-SSTNet), which utilizes a 3D CNN architecture with triplet +loss to capture distinguishing features. We enhance the model's discrimination +capability using Arcloss in the network's head. Experimentation on the Forehead +Creases version 1 (FH-V1) dataset, containing 247 unique subjects, demonstrates +the superior performance of FH-SSTNet compared to existing methods and +pre-trained CNNs like ResNet50, especially for forehead-based user +verification, confirming its effectiveness in identity +authentication. + 
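The metric-learning part of the description (an embedding network trained with a triplet loss) can be illustrated with the standard PyTorch triplet margin loss. The backbone, margin, and normalization below are assumptions, not the paper's 3D spatio-spatial temporal architecture.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

triplet_loss = nn.TripletMarginLoss(margin=0.2)  # margin value is illustrative

def train_step(backbone, optimizer, anchor, positive, negative):
    """One triplet-loss training step: pull same-user samples together,
    push different-user samples apart in the embedding space."""
    optimizer.zero_grad()
    emb_a = F.normalize(backbone(anchor), dim=1)
    emb_p = F.normalize(backbone(positive), dim=1)
    emb_n = F.normalize(backbone(negative), dim=1)
    loss = triplet_loss(emb_a, emb_p, emb_n)
    loss.backward()
    optimizer.step()
    return loss.item()
```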
+
+ comment: 6 pages, 5 Figure, IWBF conference +
+
+
+
+
+ + ☆ From Discrete to Continuous: Deep Fair Clustering With Transferable + Representations + + +
+ We consider the problem of deep fair clustering, which partitions data into +clusters via the representations extracted by deep neural networks while hiding +sensitive data attributes. To achieve fairness, existing methods present a +variety of fairness-related objective functions based on the group fairness +criterion. However, these works typically assume that the sensitive attributes +are discrete and do not work for continuous sensitive variables, such as the +proportion of the female population in an area. Besides, the potential of the +representations learned from clustering tasks to improve performance on other +tasks is ignored by existing works. In light of these limitations, we propose a +flexible deep fair clustering method that can handle discrete and continuous +sensitive attributes simultaneously. Specifically, we design an information +bottleneck style objective function to learn fair and clustering-friendly +representations. Furthermore, we explore for the first time the transferability +of the extracted representations to other downstream tasks. Unlike existing +works, we impose fairness at the representation level, which could guarantee +fairness for the transferred task regardless of clustering results. To verify +the effectiveness of the proposed method, we perform extensive experiments on +datasets with discrete and continuous sensitive attributes, demonstrating the +advantage of our method in comparison with state-of-the-art methods. + +
+
+
+
+
+ + ☆ Diffusion Model is a Good Pose Estimator from 3D RF-Vision + + +
+ Human pose estimation (HPE) from Radio Frequency vision (RF-vision) performs +human sensing using RF signals that penetrate obstacles without revealing +privacy (e.g., facial information). Recently, mmWave radar has emerged as a +promising RF-vision sensor, providing radar point clouds by processing RF +signals. However, the mmWave radar has a limited resolution with severe noise, +leading to inaccurate and inconsistent human pose estimation. This work +proposes mmDiff, a novel diffusion-based pose estimator tailored for noisy +radar data. Our approach aims to provide reliable guidance as conditions to +diffusion models. Two key challenges are addressed by mmDiff: (1) +miss-detection of parts of human bodies, which is addressed by a module that +isolates feature extraction from different body parts, and (2) signal +inconsistency due to environmental interference, which is tackled by +incorporating prior knowledge of body structure and motion. Several modules are +designed to achieve these goals, whose features work as the conditions for the +subsequent diffusion model, eliminating the miss-detection and instability of +HPE based on RF-vision. Extensive experiments demonstrate that mmDiff +outperforms existing methods significantly, achieving state-of-the-art +performances on public datasets. + +
+
+
+
+
+ + ☆ Pose-Guided Self-Training with Two-Stage Clustering for Unsupervised + Landmark Discovery CVPR 2024 + + +
+ Unsupervised landmarks discovery (ULD) for an object category is a +challenging computer vision problem. In pursuit of developing a robust ULD +framework, we explore the potential of a recent paradigm of self-supervised +learning algorithms, known as diffusion models. Some recent works have shown +that these models implicitly contain important correspondence cues. Towards +harnessing the potential of diffusion models for the ULD task, we make the +following core contributions. First, we propose a ZeroShot ULD baseline based +on simple clustering of random pixel locations with nearest neighbour matching. +It delivers better results than existing ULD methods. Second, motivated by the +ZeroShot performance, we develop a ULD algorithm based on diffusion features +using self-training and clustering which also outperforms prior methods by +notable margins. Third, we introduce a new proxy task based on generating +latent pose codes and also propose a two-stage clustering mechanism to +facilitate effective pseudo-labeling, resulting in a significant performance +improvement. Overall, our approach consistently outperforms state-of-the-art +methods on four challenging benchmarks AFLW, MAFL, CatHeads and LS3D by +significant margins. + +
+
+ comment: Accepted in CVPR 2024 +
+
+
+
+
+ + ☆ Cross-domain Multi-modal Few-shot Object Detection via Rich Text + + +
+ Cross-modal feature extraction and integration have led to steady performance +improvements in few-shot learning tasks by generating richer features. +However, existing multi-modal object detection (MM-OD) methods degrade when +facing significant domain shift and insufficient samples. We hypothesize +that rich text information could more effectively help the model to build a +knowledge relationship between the vision instance and its language description +and can help mitigate domain shift. Specifically, we study the Cross-Domain +few-shot generalization of MM-OD (CDMM-FSOD) and propose a meta-learning based +multi-modal few-shot object detection method that utilizes rich text semantic +information as an auxiliary modality to achieve domain adaptation in the +context of FSOD. Our proposed network contains (i) a multi-modal feature +aggregation module that aligns the vision and language support feature +embeddings and (ii) a rich text semantic rectify module that utilizes +bidirectional text feature generation to reinforce multi-modal feature +alignment and thus enhance the model's language understanding capability. We +evaluate our model on common standard cross-domain object detection datasets +and demonstrate that our approach considerably outperforms existing FSOD +methods. + 
+
+
+
+
+ + ☆ Improving Scene Graph Generation with Relation Words' Debiasing in + Vision-Language Models + + +
+ Scene Graph Generation (SGG) provides a basic language representation of visual +scenes, requiring models to grasp complex and diverse semantics between various +objects. However, this complexity and diversity in SGG also leads to +underrepresentation, where some test triplets are rare or even unseen during +training, resulting in imprecise predictions. To tackle this, we propose using +the SGG models with pretrained vision-language models (VLMs) to enhance +representation. However, due to the gap between the pretraining and SGG, +directly ensembling the pretrained VLMs leads to severe biases across relation +words. Thus, we introduce LM Estimation to approximate the word distribution +underlying the pretraining language sets, and then use the distribution for +debiasing. After that, we ensemble VLMs with SGG models to enhance +representation. Considering that each model may perform better on different +samples, we use a certainty-aware indicator to score each sample and +dynamically adjust the ensemble weights. Our method effectively addresses the +word biases, enhances SGG's representation, and achieves notable performance +enhancements. It is training-free and integrates well with existing SGG models. + 
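The debiasing idea can be pictured as rescaling the VLM's relation-word probabilities by an estimated language-model prior and renormalizing. The snippet below captures only that high-level intuition under stated assumptions; it is not the paper's LM Estimation procedure.

```python
import numpy as np

def debias_relation_probs(vlm_probs, lm_prior, eps=1e-8):
    """Rescale relation-word probabilities by an estimated word prior.

    vlm_probs, lm_prior: 1-D arrays over the relation vocabulary that each sum to 1.
    """
    adjusted = vlm_probs / (lm_prior + eps)   # down-weight words the prior over-favors
    return adjusted / adjusted.sum()          # renormalize to a distribution
```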
+
+
+
+
+ + ☆ EgoExoLearn: A Dataset for Bridging Asynchronous Ego- and Exo-centric + View of Procedural Activities in Real World CVPR 2024 + + +
+ Being able to map the activities of others into one's own point of view is +one fundamental human skill even from a very early age. Taking a step toward +understanding this human ability, we introduce EgoExoLearn, a large-scale +dataset that emulates the human demonstration following process, in which +individuals record egocentric videos as they execute tasks guided by +demonstration videos. Focusing on the potential applications in daily +assistance and professional support, EgoExoLearn contains egocentric and +demonstration video data spanning 120 hours captured in daily life scenarios +and specialized laboratories. Along with the videos we record high-quality gaze +data and provide detailed multimodal annotations, formulating a playground for +modeling the human ability to bridge asynchronous procedural actions from +different viewpoints. To this end, we present benchmarks such as cross-view +association, cross-view action planning, and cross-view referenced skill +assessment, along with detailed analysis. We expect EgoExoLearn can serve as an +important resource for bridging the actions across views, thus paving the way +for creating AI agents capable of seamlessly learning by observing humans in +the real world. Code and data can be found at: +https://github.com/OpenGVLab/EgoExoLearn + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Enhancing MRI-Based Classification of Alzheimer's Disease with + Explainable 3D Hybrid Compact Convolutional Transformers + + +
+ Alzheimer's disease (AD), characterized by progressive cognitive decline and +memory loss, presents a formidable global health challenge, underscoring the +critical importance of early and precise diagnosis for timely interventions and +enhanced patient outcomes. While MRI scans provide valuable insights into brain +structures, traditional analysis methods often struggle to discern intricate 3D +patterns crucial for AD identification. Addressing this challenge, we introduce +an alternative end-to-end deep learning model, the 3D Hybrid Compact +Convolutional Transformer (3D HCCT). By synergistically combining +convolutional neural networks (CNNs) and vision transformers (ViTs), the 3D +HCCT adeptly captures both local features and long-range relationships within +3D MRI scans. Extensive evaluations on the prominent AD benchmark dataset, ADNI, +demonstrate the 3D HCCT's superior performance, surpassing state-of-the-art CNN +and transformer-based methods in classification accuracy. Its robust +generalization capability and interpretability mark a significant stride in AD +classification from 3D MRI scans, promising more accurate and reliable +diagnoses for improved patient care and superior clinical outcomes. + 
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ Fusion of Minutia Cylinder Codes and Minutia Patch Embeddings for Latent + Fingerprint Recognition + + +
+ Latent fingerprints are one of the most widely used types of forensic evidence by law +enforcement agencies. However, latent recognition performance is far from the +exemplary performance of sensor fingerprint recognition due to deformations and +artifacts within these images. In this study, we propose a fusion-based local +matching approach towards latent fingerprint recognition. Recent latent +recognition studies have typically relied on local descriptor generation methods, in +which either handcrafted minutiae features or deep neural network features are +extracted around a minutia of interest, in the latent recognition process. +The proposed approach integrates these handcrafted features with recently +proposed deep neural network embedding features in a multi-stage fusion +approach to significantly improve latent recognition results. The effectiveness of +the proposed approach has been shown on several public and private data sets. +As demonstrated in our experimental results, the proposed method improves rank-1 +identification accuracy considerably on real-world datasets when compared +to either the single usage of these features or existing state-of-the-art +methods in the literature. + 
+
+ comment: 9 pages,7 figures, 4 tables +
+
+
+
+
+ + ☆ Gaze-guided Hand-Object Interaction Synthesis: Benchmark and Method + + +
+ Gaze plays a crucial role in revealing human attention and intention, +shedding light on the cognitive processes behind human actions. The integration +of gaze guidance with the dynamics of hand-object interactions boosts the +accuracy of human motion prediction. However, the lack of datasets that capture +the intricate relationship and consistency among gaze, hand, and object +movements remains a substantial hurdle. In this paper, we introduce the first +Gaze-guided Hand-Object Interaction dataset, GazeHOI, and present a novel task +for synthesizing gaze-guided hand-object interactions. Our dataset, GazeHOI, +features simultaneous 3D modeling of gaze, hand, and object interactions, +comprising 479 sequences with an average duration of 19.1 seconds, 812 +sub-sequences, and 33 objects of various sizes. We propose a hierarchical +framework centered on a gaze-guided hand-object interaction diffusion model, +named GHO-Diffusion. In the pre-diffusion phase, we separate gaze conditions +into spatial-temporal features and goal pose conditions at different levels of +information granularity. During the diffusion phase, two gaze-conditioned +diffusion models are stacked to simplify the complex synthesis of hand-object +motions. Here, the object motion diffusion model generates sequences of object +motions based on gaze conditions, while the hand motion diffusion model +produces hand motions based on the generated object motion. To improve +fine-grained goal pose alignment, we introduce a Spherical Gaussian constraint +to guide the denoising step. In the subsequent post-diffusion phase, we +optimize the generated hand motions using contact consistency. Our extensive +experiments highlight the uniqueness of our dataset and the effectiveness of +our approach. + +
+
+
+
+
+ + ☆ Exploiting Semantic Reconstruction to Mitigate Hallucinations in + Vision-Language Models + + +
+ Hallucinations in vision-language models pose a significant challenge to +their reliability, particularly in the generation of long captions. Current +methods fall short of accurately identifying and mitigating these +hallucinations. To address this issue, we introduce ESREAL, a novel +unsupervised learning framework designed to suppress the generation of +hallucinations through accurate localization and penalization of hallucinated +tokens. Initially, ESREAL creates a reconstructed image based on the generated +caption and aligns its corresponding regions with those of the original image. +This semantic reconstruction aids in identifying both the presence and type of +token-level hallucinations within the generated caption. Subsequently, ESREAL +computes token-level hallucination scores by assessing the semantic similarity +of aligned regions based on the type of hallucination. Finally, ESREAL employs +a proximal policy optimization algorithm, where it selectively penalizes +hallucinated tokens according to their token-level hallucination scores. Our +framework notably reduces hallucinations in LLaVA, InstructBLIP, and mPLUG-Owl2 +by 32.81%, 27.08%, and 7.46% on the CHAIR metric. This improvement is achieved +solely through signals derived from the image itself, without the need for any +image-text pairs. + +
+
+
+
+
+ + ☆ Towards Online Real-Time Memory-based Video Inpainting Transformers + + +
+ Video inpainting tasks have seen significant improvements in recent years +with the rise of deep neural networks and, in particular, vision transformers. +Although these models show promising reconstruction quality and temporal +consistency, they are still unsuitable for live videos, one of the last steps +to make them completely convincing and usable. The main limitations are that +these state-of-the-art models inpaint using the whole video (offline +processing) and show an insufficient frame rate. In our approach, we propose a +framework to adapt existing inpainting transformers to these constraints by +memorizing and refining redundant computations while maintaining a decent +inpainting quality. Using this framework with some of the most recent +inpainting models, we show great online results with a consistent throughput +above 20 frames per second. The code and pretrained models will be made +available upon acceptance. + +
+
+
+
+
+ + ☆ Realtime Robust Shape Estimation of Deformable Linear Object ICRA 2024 + + +
+ Realtime shape estimation of continuum objects and manipulators is essential +for developing accurate planning and control paradigms. The existing methods +that create dense point clouds from camera images, and/or use distinguishable +markers on a deformable body have limitations in realtime tracking of large +continuum objects/manipulators. The physical occlusion of markers can often +compromise accurate shape estimation. We propose a robust method to estimate +the shape of linear deformable objects in realtime using scattered and +unordered key points. By utilizing a robust probability-based labeling +algorithm, our approach identifies the true order of the detected key points +and then reconstructs the shape using piecewise spline interpolation. The +approach only relies on knowing the number of the key points and the interval +between two neighboring points. We demonstrate the robustness of the method +when key points are partially occluded. The proposed method is also integrated +into a simulation in Unity for tracking the shape of a cable with a length of +1m and a radius of 5mm. The simulation results show that our proposed approach +achieves an average length error of 1.07% over the continuum's centerline and +an average cross-section error of 2.11mm. The real-world experiments of +tracking and estimating a heavy-load cable prove that the proposed approach is +robust under occlusion and complex entanglement scenarios. + +
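Once the key points are correctly ordered, the centerline reconstruction step can be done with a standard piecewise spline fit. The sketch below uses SciPy's parametric spline routines; the ordering/labeling algorithm, which is the paper's contribution, is assumed to have already run, and the sampling density and smoothing value are illustrative.

```python
import numpy as np
from scipy.interpolate import splprep, splev

def reconstruct_centerline(ordered_keypoints, num_samples=200, smoothing=0.0):
    """Fit a parametric spline through ordered 3D key points and resample it.

    ordered_keypoints: (N, 3) array already sorted along the object
    (needs at least 4 points for the default cubic spline).
    """
    pts = np.asarray(ordered_keypoints, dtype=float)
    # splprep expects a list of coordinate arrays: [x, y, z]
    tck, _ = splprep([pts[:, 0], pts[:, 1], pts[:, 2]], s=smoothing)
    u = np.linspace(0.0, 1.0, num_samples)
    x, y, z = splev(u, tck)
    return np.stack([x, y, z], axis=1)   # densely sampled centerline
```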
+
+ comment: This paper has been accepted to IEEE ICRA 2024 as a contributed paper +
+
+
+
+
+ + ☆ CFAT: Unleashing TriangularWindows for Image Super-resolution CVPR 2024 + + +
+ Transformer-based models have revolutionized the field of image +super-resolution (SR) by harnessing their inherent ability to capture complex +contextual features. The overlapping rectangular shifted window technique used +in transformer architectures nowadays is a common practice in super-resolution +models to improve the quality and robustness of image upscaling. However, it +suffers from distortion at the boundaries and has limited unique shifting +modes. To overcome these weaknesses, we propose a non-overlapping triangular +window technique that synchronously works with the rectangular one to mitigate +boundary-level distortion and allows the model to access more unique shifting +modes. In this paper, we propose a Composite Fusion Attention Transformer +(CFAT) that incorporates triangular-rectangular window-based local attention +with a channel-based global attention technique in image super-resolution. As a +result, CFAT enables attention mechanisms to be activated on more image pixels +and captures long-range, multi-scale features to improve SR performance. The +extensive experimental results and ablation study demonstrate the effectiveness +of CFAT in the SR domain. Our proposed model shows a significant 0.7 dB +performance improvement over other state-of-the-art SR architectures. + 
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Entity-NeRF: Detecting and Removing Moving Entities in Urban Scenes CVPR 2024 + + +
+ Recent advancements in the study of Neural Radiance Fields (NeRF) for dynamic +scenes often involve explicit modeling of scene dynamics. However, this +approach faces challenges in modeling scene dynamics in urban environments, +where moving objects of various categories and scales are present. In such +settings, it becomes crucial to effectively eliminate moving objects to +accurately reconstruct static backgrounds. Our research introduces an +innovative method, termed here as Entity-NeRF, which combines the strengths of +knowledge-based and statistical strategies. This approach utilizes entity-wise +statistics, leveraging entity segmentation and stationary entity classification +through thing/stuff segmentation. To assess our methodology, we created an +urban scene dataset masked with moving objects. Our comprehensive experiments +demonstrate that Entity-NeRF notably outperforms existing techniques in +removing moving objects and reconstructing static urban backgrounds, both +quantitatively and qualitatively. + +
+
+ comment: Accepted by IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR 2024), Project website: + https://otonari726.github.io/entitynerf/ +
+
+
+
+
+ + ☆ Salience DETR: Enhancing Detection Transformer with Hierarchical + Salience Filtering Refinement CVPR 2024 + + +
+ DETR-like methods have significantly increased detection performance in an +end-to-end manner. The mainstream two-stage frameworks of them perform dense +self-attention and select a fraction of queries for sparse cross-attention, +which is proven effective for improving performance but also introduces a heavy +computational burden and high dependence on stable query selection. This paper +demonstrates that suboptimal two-stage selection strategies result in scale +bias and redundancy due to the mismatch between selected queries and objects in +two-stage initialization. To address these issues, we propose hierarchical +salience filtering refinement, which performs transformer encoding only on +filtered discriminative queries, for a better trade-off between computational +efficiency and precision. The filtering process overcomes scale bias through a +novel scale-independent salience supervision. To compensate for the semantic +misalignment among queries, we introduce elaborate query refinement modules for +stable two-stage initialization. Based on above improvements, the proposed +Salience DETR achieves significant improvements of +4.0% AP, +0.2% AP, +4.4% AP +on three challenging task-specific detection datasets, as well as 49.2% AP on +COCO 2017 with less FLOPs. The code is available at +https://github.com/xiuqhou/Salience-DETR. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Enhancing Video Transformers for Action Understanding with VLM-aided + Training + + +
+ Owing to their ability to extract relevant spatio-temporal video embeddings, +Vision Transformers (ViTs) are currently the best performing models in video +action understanding. However, their generalization over domains or datasets is +somewhat limited. In contrast, Visual Language Models (VLMs) have demonstrated +exceptional generalization performance, but are currently unable to process +videos. Consequently, they cannot extract spatio-temporal patterns that are +crucial for action understanding. In this paper, we propose the Four-tiered +Prompts (FTP) framework that takes advantage of the complementary strengths of +ViTs and VLMs. We retain ViTs' strong spatio-temporal representation ability +but improve the visual encodings to be more comprehensive and general by +aligning them with VLM outputs. The FTP framework adds four feature processors +that focus on specific aspects of human action in videos: action category, +action components, action description, and context information. The VLMs are +only employed during training, and inference incurs a minimal computation cost. +Our approach consistently yields state-of-the-art performance. For instance, we +achieve remarkable top-1 accuracy of 93.8% on Kinetics-400 and 83.4% on +Something-Something V2, surpassing VideoMAEv2 by 2.8% and 2.6%, respectively. + +
+
+
+
+
+ + ☆ Enhancing Visual Continual Learning with Language-Guided Supervision CVPR 2024 + + +
+ Continual learning (CL) aims to empower models to learn new tasks without +forgetting previously acquired knowledge. Most prior works concentrate on the +techniques of architectures, replay data, regularization, \etc. However, the +category name of each class is largely neglected. Existing methods commonly +utilize the one-hot labels and randomly initialize the classifier head. We +argue that the scarce semantic information conveyed by the one-hot labels +hampers the effective knowledge transfer across tasks. In this paper, we +revisit the role of the classifier head within the CL paradigm and replace the +classifier with semantic knowledge from pretrained language models (PLMs). +Specifically, we use PLMs to generate semantic targets for each class, which +are frozen and serve as supervision signals during training. Such targets fully +consider the semantic correlation between all classes across tasks. Empirical +studies show that our approach mitigates forgetting by alleviating +representation drifting and facilitating knowledge transfer across tasks. The +proposed method is simple to implement and can seamlessly be plugged into +existing methods with negligible adjustments. Extensive experiments based on +eleven mainstream baselines demonstrate the effectiveness and generalizability +of our approach to various protocols. For example, under the class-incremental +learning setting on ImageNet-100, our method significantly improves the Top-1 +accuracy by 3.2\% to 6.1\% while reducing the forgetting rate by 2.6\% to +13.1\%. + +
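The core mechanism (replacing one-hot targets with frozen semantic targets derived from class names) reduces, at training time, to classifying features against a fixed matrix of text embeddings. The sketch below assumes the class-name embeddings are pre-computed and frozen; the temperature and similarity choice are illustrative and not taken from the paper.

```python
import torch
import torch.nn.functional as F

def language_guided_loss(features, labels, class_text_embeds, tau=0.07):
    """Cross-entropy against frozen semantic class targets.

    features: (B, d) visual features from the backbone.
    class_text_embeds: (num_classes, d) frozen embeddings of the class names
    (e.g., produced once by a pretrained language model and never updated).
    """
    f = F.normalize(features, dim=1)
    t = F.normalize(class_text_embeds, dim=1)
    logits = f @ t.T / tau                 # cosine similarity to every class target
    return F.cross_entropy(logits, labels)
```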
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Self-Supervised Multi-Frame Neural Scene Flow + + +
+ Neural Scene Flow Prior (NSFP) and Fast Neural Scene Flow (FNSF) have shown +remarkable adaptability in the context of large out-of-distribution autonomous +driving. Despite their success, the underlying reasons for their astonishing +generalization capabilities remain unclear. Our research addresses this gap by +examining the generalization capabilities of NSFP through the lens of uniform +stability, revealing that its performance is inversely proportional to the +number of input point clouds. This finding sheds light on NSFP's effectiveness +in handling large-scale point cloud scene flow estimation tasks. Motivated by +such theoretical insights, we further explore the improvement of scene flow +estimation by leveraging historical point clouds across multiple frames, which +inherently increases the number of point clouds. Consequently, we propose a +simple and effective method for multi-frame point cloud scene flow estimation, +along with a theoretical evaluation of its generalization abilities. Our +analysis confirms that the proposed method maintains a limited generalization +error, suggesting that adding multiple frames to the scene flow optimization +process does not detract from its generalizability. Extensive experimental +results on large-scale autonomous driving Waymo Open and Argoverse lidar +datasets demonstrate that the proposed method achieves state-of-the-art +performance. + +
+
+
+
+
+ + ☆ Opportunities and challenges in the application of large artificial + intelligence models in radiology + + +
+ Influenced by ChatGPT, large artificial intelligence (AI) models have +witnessed a global upsurge in research and development. As people +enjoy the convenience brought by these large AI models, more and more large models in +subdivided fields are gradually being proposed, especially large models in the +radiology imaging field. This article first introduces the development history +of large models, their technical details, workflow, the working principles of multimodal +large models, and the working principles of video generation large models. Secondly, +we summarize the latest research progress of large AI models in radiology +education, radiology report generation, and applications of unimodal and multimodal +radiology. Finally, this paper also summarizes some of the challenges of large +AI models in radiology, with the aim of better promoting the rapid revolution +in the field of radiography. + 
+
+
+
+
+ + ☆ EVA: Zero-shot Accurate Attributes and Multi-Object Video Editing + + +
+ Current diffusion-based video editing primarily focuses on local editing +(\textit{e.g.,} object/background editing) or global style editing by utilizing +various dense correspondences. However, these methods often fail to accurately +edit the foreground and background simultaneously while preserving the original +layout. We find that the crux of the issue stems from the imprecise +distribution of attention weights across designated regions, including +inaccurate text-to-attribute control and attention leakage. To tackle this +issue, we introduce EVA, a \textbf{zero-shot} and \textbf{multi-attribute} +video editing framework tailored for human-centric videos with complex motions. +We incorporate a Spatial-Temporal Layout-Guided Attention mechanism that +leverages the intrinsic positive and negative correspondences of cross-frame +diffusion features. To avoid attention leakage, we utilize these +correspondences to boost the attention scores of tokens within the same +attribute across all video frames while limiting interactions between tokens of +different attributes in the self-attention layer. For precise text-to-attribute +manipulation, we use discrete text embeddings focused on specific layout areas +within the cross-attention layer. Benefiting from the precise attention weight +distribution, EVA can be easily generalized to multi-object editing scenarios +and achieves accurate identity mapping. Extensive experiments demonstrate EVA +achieves state-of-the-art results in real-world scenarios. Full results are +provided at https://knightyxp.github.io/EVA/ + +
+
+ comment: Project page: https://knightyxp.github.io/EVA +
+
+
+
+
+ + ☆ CG-SLAM: Efficient Dense RGB-D SLAM in a Consistent Uncertainty-aware 3D + Gaussian Field + + +
+ Recently neural radiance fields (NeRF) have been widely exploited as 3D +representations for dense simultaneous localization and mapping (SLAM). Despite +their notable successes in surface modeling and novel view synthesis, existing +NeRF-based methods are hindered by their computationally intensive and +time-consuming volume rendering pipeline. This paper presents an efficient +dense RGB-D SLAM system, i.e., CG-SLAM, based on a novel uncertainty-aware 3D +Gaussian field with high consistency and geometric stability. Through an +in-depth analysis of Gaussian Splatting, we propose several techniques to +construct a consistent and stable 3D Gaussian field suitable for tracking and +mapping. Additionally, a novel depth uncertainty model is proposed to ensure +the selection of valuable Gaussian primitives during optimization, thereby +improving tracking efficiency and accuracy. Experiments on various datasets +demonstrate that CG-SLAM achieves superior tracking and mapping performance +with a notable tracking speed of up to 15 Hz. We will make our source code +publicly available. Project page: https://zju3dv.github.io/cg-slam. + +
+
+ comment: Project Page: https://zju3dv.github.io/cg-slam +
+
+
+
+
+ + ☆ Are NeRFs ready for autonomous driving? Towards closing the + real-to-simulation gap + + +
+ Neural Radiance Fields (NeRFs) have emerged as promising tools for advancing +autonomous driving (AD) research, offering scalable closed-loop simulation and +data augmentation capabilities. However, to trust the results achieved in +simulation, one needs to ensure that AD systems perceive real and rendered data +in the same way. Although the performance of rendering methods is increasing, +many scenarios will remain inherently challenging to reconstruct faithfully. To +this end, we propose a novel perspective for addressing the real-to-simulated +data gap. Rather than solely focusing on improving rendering fidelity, we +explore simple yet effective methods to enhance perception model robustness to +NeRF artifacts without compromising performance on real data. Moreover, we +conduct the first large-scale investigation into the real-to-simulated data gap +in an AD setting using a state-of-the-art neural rendering technique. +Specifically, we evaluate object detectors and an online mapping model on real +and simulated data, and study the effects of different pre-training strategies. +Our results show notable improvements in model robustness to simulated data, +even improving real-world performance in some cases. Last, we delve into the +correlation between the real-to-simulated gap and image reconstruction metrics, +identifying FID and LPIPS as strong indicators. + +
+
+
+
+
+ + ☆ PKU-DyMVHumans: A Multi-View Video Benchmark for High-Fidelity Dynamic + Human Modeling + + +
+ High-quality human reconstruction and photo-realistic rendering of a dynamic +scene is a long-standing problem in computer vision and graphics. Despite +considerable efforts invested in developing various capture systems and +reconstruction algorithms, recent advancements still struggle with loose or +oversized clothing and overly complex poses. In part, this is due to the +challenges of acquiring high-quality human datasets. To facilitate the +development of these fields, in this paper, we present PKU-DyMVHumans, a +versatile human-centric dataset for high-fidelity reconstruction and rendering +of dynamic human scenarios from dense multi-view videos. It comprises 8.2 +million frames captured by more than 56 synchronized cameras across diverse +scenarios. These sequences comprise 32 human subjects across 45 different +scenarios, each with a high-detailed appearance and realistic human motion. +Inspired by recent advancements in neural radiance field (NeRF)-based scene +representations, we carefully set up an off-the-shelf framework that is easy to +provide those state-of-the-art NeRF-based implementations and benchmark on +PKU-DyMVHumans dataset. It is paving the way for various applications like +fine-grained foreground/background decomposition, high-quality human +reconstruction and photo-realistic novel view synthesis of a dynamic scene. +Extensive studies are performed on the benchmark, demonstrating new +observations and challenges that emerge from using such high-fidelity dynamic +data. The dataset is available at: https://pku-dymvhumans.github.io. + +
+
+
+
+
+ + ☆ Landmark-Guided Cross-Speaker Lip Reading with Mutual Information + Regularization LREC + + +
+ Lip reading, the process of interpreting silent speech from visual lip +movements, has gained rising attention for its wide range of realistic +applications. Deep learning approaches greatly improve current lip reading +systems. However, lip reading in cross-speaker scenarios where the speaker +identity changes, poses a challenging problem due to inter-speaker variability. +A well-trained lip reading system may perform poorly when handling a brand new +speaker. To learn a speaker-robust lip reading model, a key insight is to +reduce visual variations across speakers, avoiding the model overfitting to +specific speakers. In this work, in view of both input visual clues and latent +representations based on a hybrid CTC/attention architecture, we propose to +exploit the lip landmark-guided fine-grained visual clues instead of +frequently-used mouth-cropped images as input features, diminishing +speaker-specific appearance characteristics. Furthermore, a max-min mutual +information regularization approach is proposed to capture speaker-insensitive +latent representations. Experimental evaluations on public lip reading datasets +demonstrate the effectiveness of the proposed approach under the intra-speaker +and inter-speaker conditions. + +
+
+ comment: To appear in LREC-COLING 2024 +
+
+
+
+
+ + ☆ Robust Diffusion Models for Adversarial Purification + + +
+ Diffusion model (DM)-based adversarial purification (AP) has been shown to be
+the most powerful alternative to adversarial training (AT). However, these
+methods neglect the fact that pre-trained diffusion models themselves are not
+robust to adversarial attacks either. Additionally, the diffusion process can
+easily destroy semantic information and, after the reverse process, generate a
+high-quality image that is nonetheless very different from the original input
+image, leading to degraded standard accuracy. To overcome these issues, a
+natural idea is to harness an adversarial training strategy to retrain or
+fine-tune the pre-trained diffusion model, which is computationally
+prohibitive. We propose a novel robust reverse process with adversarial
+guidance, which is independent of the given pre-trained DMs and avoids
+retraining or fine-tuning them. This robust guidance not only ensures that the
+purified examples retain more semantic content but also, for the first time,
+mitigates the accuracy-robustness trade-off of DMs, which also provides
+DM-based AP with an efficient adaptive ability to new attacks. Extensive
+experiments demonstrate that our method achieves state-of-the-art results and
+exhibits generalization against different attacks.
+
+&#x0D;
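+
+ As a hedged illustration of the general idea of steering a reverse diffusion
+step toward the observed input so that semantic content is preserved (a
+generic guidance sketch, not the paper's exact robust reverse process;
+denoise_step and the guidance scale are hypothetical placeholders):
+
+import torch
+
+def guided_reverse_step(x_t, x_in, t, denoise_step, guidance_scale=0.5):
+    # denoise_step: any callable returning the un-guided estimate of x_{t-1}
+    # (hypothetical interface; the paper's robust guidance differs in detail).
+    x_prev = denoise_step(x_t, t)
+    # Pull the sample toward the observed input to retain its semantics.
+    with torch.enable_grad():
+        x = x_prev.detach().requires_grad_(True)
+        dist = ((x - x_in) ** 2).mean()
+        grad = torch.autograd.grad(dist, x)[0]
+    return x_prev - guidance_scale * grad
+
+x0 = guided_reverse_step(torch.randn(1, 3, 32, 32), torch.randn(1, 3, 32, 32),
+                         t=10, denoise_step=lambda x, t: 0.9 * x)
+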
+
+
+
+
+ + ☆ Segment Anything Model for Road Network Graph Extraction + + +
+ We propose SAM-Road, an adaptation of the Segment Anything Model (SAM) for +extracting large-scale, vectorized road network graphs from satellite imagery. +To predict graph geometry, we formulate it as a dense semantic segmentation +task, leveraging the inherent strengths of SAM. The image encoder of SAM is +fine-tuned to produce probability masks for roads and intersections, from which +the graph vertices are extracted via simple non-maximum suppression. To predict +graph topology, we designed a lightweight transformer-based graph neural +network, which leverages the SAM image embeddings to estimate the edge +existence probabilities between vertices. Our approach directly predicts the +graph vertices and edges for large regions without expensive and complex +post-processing heuristics, and is capable of building complete road network +graphs spanning multiple square kilometers in a matter of seconds. With its +simple, straightforward, and minimalist design, SAM-Road achieves comparable +accuracy with the state-of-the-art method RNGDet++, while being 40 times faster +on the City-scale dataset. We thus demonstrate the power of a foundational +vision model when applied to a graph learning task. The code is available at +https://github.com/htcr/sam_road. + +
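+
+ The vertex-extraction step described above (thresholding the probability
+mask and applying simple non-maximum suppression) can be sketched as follows;
+the threshold and suppression radius are illustrative assumptions, not the
+authors' settings:
+
+import numpy as np
+from scipy.ndimage import maximum_filter
+
+def extract_vertices(prob_mask, thresh=0.5, nms_radius=4):
+    """Keep pixels that are local maxima of the probability mask."""
+    local_max = maximum_filter(prob_mask, size=2 * nms_radius + 1)
+    keep = (prob_mask >= thresh) & (prob_mask == local_max)
+    ys, xs = np.nonzero(keep)
+    return np.stack([xs, ys], axis=1)          # (N, 2) vertex coordinates
+
+vertices = extract_vertices(np.random.rand(256, 256))
+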
+
+
+
+
+ + ☆ A General and Efficient Federated Split Learning with Pre-trained Image + Transformers for Heterogeneous Data + + +
+ Federated Split Learning (FSL) is a promising distributed learning paradigm
+in practice, which gathers the strengths of both the Federated Learning (FL)
+and Split Learning (SL) paradigms to ensure model privacy while diminishing
+the resource overhead of each client, especially for large transformer models
+in resource-constrained environments, e.g., the Internet of Things (IoT).
+However, almost all existing works merely investigate the performance of FSL
+with simple neural network models. The few efforts that incorporate Vision
+Transformers (ViT) as the model architecture train ViT from scratch, leading
+to enormous training overhead on each resource-limited device. Therefore, in
+this paper, we harness Pre-trained Image Transformers (PITs) as the initial
+model, coined FES-PIT, to accelerate the training process and improve model
+robustness. Furthermore, we propose FES-PTZO to hinder gradient inversion
+attacks; it is notably compatible with black-box scenarios, where gradient
+information is unavailable. Concretely, FES-PTZO approximates the server
+gradient by utilizing zeroth-order (ZO) optimization, which replaces the
+backward propagation with just one forward process. Empirically, we are the
+first to provide a systematic evaluation of FSL methods with PITs on
+real-world datasets, with different partial device participation and
+heterogeneous data splits. Our experiments verify the effectiveness of our
+algorithms.
+
+&#x0D;
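+
+ The zeroth-order gradient approximation can be sketched as a
+finite-difference estimate along random directions; the multi-direction
+averaging below is an assumption for illustration, not the exact FES-PTZO
+estimator:
+
+import torch
+
+def zo_gradient(loss_fn, params, mu=1e-3, n_dirs=10):
+    """Estimate the gradient of loss_fn at params using forward passes only."""
+    grad = torch.zeros_like(params)
+    base = loss_fn(params)
+    for _ in range(n_dirs):
+        u = torch.randn_like(params)
+        # One-sided finite difference along a random direction u.
+        grad += (loss_fn(params + mu * u) - base) / mu * u
+    return grad / n_dirs
+
+# Example: quadratic loss, whose true gradient is 2 * w.
+w = torch.tensor([1.0, -2.0, 0.5])
+g = zo_gradient(lambda p: (p ** 2).sum(), w)
+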
+
+
+
+
+ + ☆ Edit3K: Universal Representation Learning for Video Editing Components + + +
+ This paper focuses on understanding the predominant video creation pipeline, +i.e., compositional video editing with six main types of editing components, +including video effects, animation, transition, filter, sticker, and text. In +contrast to existing visual representation learning of visual materials (i.e., +images/videos), we aim to learn visual representations of editing +actions/components that are generally applied on raw materials. We start by +proposing the first large-scale dataset for editing components of video +creation, which covers about $3,094$ editing components with $618,800$ videos. +Each video in our dataset is rendered by various image/video materials with a +single editing component, which supports atomic visual understanding of +different editing components. It can also benefit several downstream tasks, +e.g., editing component recommendation, editing component +recognition/retrieval, etc. Existing visual representation methods perform +poorly because it is difficult to disentangle the visual appearance of editing +components from raw materials. To that end, we benchmark popular alternative +solutions and propose a novel method that learns to attend to the appearance of +editing components regardless of raw materials. Our method achieves favorable +results on editing component retrieval/recognition compared to the alternative +solutions. A user study is also conducted to show that our representations +cluster visually similar editing components better than other alternatives. +Furthermore, our learned representations used to transition recommendation +tasks achieve state-of-the-art results on the AutoTransition dataset. The code +and dataset will be released for academic use. + +
+
+
+
+
+ + ☆ Semantic Is Enough: Only Semantic Information For NeRF Reconstruction + + +
+ Recent research that combines implicit 3D representation with semantic
+information, like Semantic-NeRF, has proven that the NeRF model can perform
+excellently in rendering 3D structures with semantic labels. This research aims
+to extend the Semantic Neural Radiance Fields (Semantic-NeRF) model by focusing
+solely on semantic output and removing the RGB output component. We reformulate
+the model and its training procedure to leverage only the cross-entropy loss
+between the model's semantic output and the ground truth semantic images,
+removing the colour data traditionally used in the original Semantic-NeRF
+approach. We then conduct a series of identical experiments using the original
+and the modified Semantic-NeRF model. Our primary objective is to observe the
+impact of this modification on the performance of Semantic-NeRF, focusing
+on tasks such as scene understanding, object detection, and segmentation. The
+results offer valuable insights into the new way of rendering the scenes and
+provide an avenue for further research and development in semantic-focused 3D
+scene understanding.
+
+&#x0D;
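+
+ In code, the reformulated objective reduces to a cross-entropy loss between
+the rendered semantic logits and the ground-truth labels (a minimal sketch;
+tensor shapes are illustrative):
+
+import torch
+import torch.nn.functional as F
+
+def semantic_only_loss(rendered_logits, gt_labels):
+    """rendered_logits: (num_rays, num_classes) accumulated along each ray;
+    gt_labels: (num_rays,) integer class ids from the semantic ground truth."""
+    return F.cross_entropy(rendered_logits, gt_labels)
+
+loss = semantic_only_loss(torch.randn(1024, 20), torch.randint(0, 20, (1024,)))
+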
+
+
+
+
+
+ ☆ V2X-Real: a Large-Scale Dataset for Vehicle-to-Everything Cooperative
+ Perception
+
+&#x0D;
+ Recent advancements in Vehicle-to-Everything (V2X) technologies have enabled +autonomous vehicles to share sensing information to see through occlusions, +greatly boosting the perception capability. However, there are no real-world +datasets to facilitate the real V2X cooperative perception research -- existing +datasets either only support Vehicle-to-Infrastructure cooperation or +Vehicle-to-Vehicle cooperation. In this paper, we propose a dataset that has a +mixture of multiple vehicles and smart infrastructure simultaneously to +facilitate the V2X cooperative perception development with multi-modality +sensing data. Our V2X-Real is collected using two connected automated vehicles +and two smart infrastructures, which are all equipped with multi-modal sensors +including LiDAR sensors and multi-view cameras. The whole dataset contains 33K +LiDAR frames and 171K camera data with over 1.2M annotated bounding boxes of 10 +categories in very challenging urban scenarios. According to the collaboration +mode and ego perspective, we derive four types of datasets for Vehicle-Centric, +Infrastructure-Centric, Vehicle-to-Vehicle, and +Infrastructure-to-Infrastructure cooperative perception. Comprehensive +multi-class multi-agent benchmarks of SOTA cooperative perception methods are +provided. The V2X-Real dataset and benchmark codes will be released. + +
+
+
+
+
+ + ☆ Exploring the Impact of Dataset Bias on Dataset Distillation + + +
+ Dataset Distillation (DD) is a promising technique to synthesize a smaller
+dataset that preserves essential information from the original dataset. This
+synthetic dataset can serve as a substitute for the original large-scale one,
+and help alleviate the training workload. However, current DD methods typically
+operate under the assumption that the dataset is unbiased, overlooking
+potential bias issues within the dataset itself. To fill this gap, we
+systematically investigate the influence of dataset bias on DD. To the best of
+our knowledge, this is the first such exploration in the DD domain. Given that
+there are no suitable biased datasets for DD, we first construct two biased
+datasets, CMNIST-DD and CCIFAR10-DD, to establish a foundation for subsequent
+analysis. Then we utilize existing DD methods to generate synthetic datasets on
+CMNIST-DD and CCIFAR10-DD, and evaluate their performance following the
+standard process. Experiments demonstrate that biases present in the original
+dataset significantly impact the performance of the synthetic dataset in most
+cases, which highlights the necessity of identifying and mitigating biases in
+the original datasets during DD. Finally, we reformulate DD within the context
+of a biased dataset. Our code, along with the biased datasets, is available at
+https://github.com/yaolu-zjut/Biased-DD.
+
+&#x0D;
+
+
+
+
+ + ☆ A Unified Module for Accelerating STABLE-DIFFUSION: LCM-LORA + + +
+ This paper presents a comprehensive study on the unified module for +accelerating stable-diffusion processes, specifically focusing on the lcm-lora +module. Stable-diffusion processes play a crucial role in various scientific +and engineering domains, and their acceleration is of paramount importance for +efficient computational performance. The standard iterative procedures for +solving fixed-source discrete ordinates problems often exhibit slow +convergence, particularly in optically thick scenarios. To address this +challenge, unconditionally stable diffusion-acceleration methods have been +developed, aiming to enhance the computational efficiency of transport +equations and discrete ordinates problems. This study delves into the +theoretical foundations and numerical results of unconditionally stable +diffusion synthetic acceleration methods, providing insights into their +stability and performance for model discrete ordinates problems. Furthermore, +the paper explores recent advancements in diffusion model acceleration, +including on device acceleration of large diffusion models via gpu aware +optimizations, highlighting the potential for significantly improved inference +latency. The results and analyses in this study provide important insights into +stable diffusion processes and have important ramifications for the creation +and application of acceleration methods specifically, the lcm-lora module in a +variety of computing environments. + +
+
+
+
+
+ + ☆ RPMArt: Towards Robust Perception and Manipulation for Articulated + Objects IROS 2024 + + +
+ Articulated objects are commonly found in daily life. It is essential that +robots can exhibit robust perception and manipulation skills for articulated +objects in real-world robotic applications. However, existing methods for +articulated objects insufficiently address noise in point clouds and struggle +to bridge the gap between simulation and reality, thus limiting the practical +deployment in real-world scenarios. To tackle these challenges, we propose a +framework towards Robust Perception and Manipulation for Articulated Objects +(RPMArt), which learns to estimate the articulation parameters and manipulate +the articulation part from the noisy point cloud. Our primary contribution is a +Robust Articulation Network (RoArtNet) that is able to predict both joint +parameters and affordable points robustly by local feature learning and point +tuple voting. Moreover, we introduce an articulation-aware classification +scheme to enhance its ability for sim-to-real transfer. Finally, with the +estimated affordable point and articulation joint constraint, the robot can +generate robust actions to manipulate articulated objects. After learning only +from synthetic data, RPMArt is able to transfer zero-shot to real-world +articulated objects. Experimental results confirm our approach's effectiveness, +with our framework achieving state-of-the-art performance in both noise-added +simulation and real-world environments. The code and data will be open-sourced +for reproduction. More results are published on the project website at +https://r-pmart.github.io . + +
+
+ comment: 8 pages, 7 figures, submitted to 2024 IEEE/RSJ International + Conference on Intelligent Robots and Systems (IROS 2024), project website at + https://r-pmart.github.io +
+
+
+
+
+ + ☆ PaPr: Training-Free One-Step Patch Pruning with Lightweight ConvNets for + Faster Inference + + +
+ As deep neural networks evolve from convolutional neural networks (ConvNets)
+to advanced vision transformers (ViTs), there is an increased need to eliminate
+redundant data for faster processing without compromising accuracy. Previous
+methods are often architecture-specific or necessitate re-training, restricting
+their applicability with frequent model updates. To solve this, we first
+introduce a novel property of lightweight ConvNets: their ability to identify
+key discriminative patch regions in images, irrespective of the model's final
+accuracy or size. We demonstrate that fully-connected layers are the primary
+bottleneck for ConvNet performance, and their suppression with simple weight
+recalibration markedly enhances discriminative patch localization performance.
+Using this insight, we introduce PaPr, a method for substantially pruning
+redundant patches with minimal accuracy loss using lightweight ConvNets across
+a variety of deep learning architectures, including ViTs, ConvNets, and hybrid
+transformers, without any re-training. Moreover, the simple early-stage
+one-step patch pruning with PaPr enhances existing patch reduction methods.
+Through extensive testing on diverse architectures, PaPr achieves significantly
+higher accuracy than state-of-the-art patch reduction methods with a similar
+FLOP count reduction. More specifically, PaPr removes about 70% of redundant
+patches in videos with less than a 0.8% drop in accuracy and up to a 3.7x FLOPs
+reduction, which is 15% more reduction with 2.5% higher accuracy than these
+methods.
+
+&#x0D;
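+
+ A minimal sketch of the underlying idea, ranking ViT patches with a
+lightweight ConvNet's spatial activations and keeping only the top fraction
+(the pooling choice and keep ratio are assumptions, not the paper's exact
+recipe):
+
+import torch
+import torch.nn.functional as F
+
+def prune_patches(patch_tokens, conv_feat, keep_ratio=0.3):
+    """patch_tokens: (B, N, D) ViT patch embeddings on an HxW grid (N = H*W);
+    conv_feat: (B, C, h, w) feature map from a lightweight ConvNet."""
+    B, N, D = patch_tokens.shape
+    H = W = int(N ** 0.5)
+    saliency = conv_feat.mean(dim=1, keepdim=True)              # (B, 1, h, w)
+    saliency = F.interpolate(saliency, size=(H, W), mode="bilinear",
+                             align_corners=False).flatten(1)    # (B, N)
+    k = max(1, int(keep_ratio * N))
+    idx = saliency.topk(k, dim=1).indices                       # (B, k)
+    return patch_tokens.gather(1, idx.unsqueeze(-1).expand(-1, -1, D))
+
+kept = prune_patches(torch.randn(2, 196, 768), torch.randn(2, 64, 7, 7))
+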
+
+
+
+
+ + ☆ Fill in the ____ (a Diffusion-based Image Inpainting Pipeline) + + +
+ Image inpainting is the process of taking an image and generating lost or +intentionally occluded portions. Inpainting has countless applications +including restoring previously damaged pictures, restoring the quality of +images that have been degraded due to compression, and removing unwanted +objects/text. Modern inpainting techniques have shown remarkable ability in +generating sensible completions for images with mask occlusions. In our paper, +an overview of the progress of inpainting techniques will be provided, along +with identifying current leading approaches, focusing on their strengths and +weaknesses. A critical gap in these existing models will be addressed, focusing +on the ability to prompt and control what exactly is generated. We will +additionally justify why we think this is the natural next progressive step +that inpainting models must take, and provide multiple approaches to +implementing this functionality. Finally, we will evaluate the results of our +approaches by qualitatively checking whether they generate high-quality images +that correctly inpaint regions with the objects that they are instructed to +produce. + +
+
+
+
+
+ + ☆ SM2C: Boost the Semi-supervised Segmentation for Medical Image by using + Meta Pseudo Labels and Mixed Images + + +
+ Recently, machine learning-based semantic segmentation algorithms have +demonstrated their potential to accurately segment regions and contours in +medical images, allowing the precise location of anatomical structures and +abnormalities. Although medical images are difficult to acquire and annotate, +semi-supervised learning methods are efficient in dealing with the scarcity of +labeled data. However, overfitting is almost inevitable due to the limited +images for training. Furthermore, the intricate shapes of organs and lesions in +medical images introduce additional complexity in different cases, preventing +networks from acquiring a strong ability to generalize. To this end, we +introduce a novel method called Scaling-up Mix with Multi-Class (SM2C). This +method uses three strategies - scaling-up image size, multi-class mixing, and +object shape jittering - to improve the ability to learn semantic features +within medical images. By diversifying the shape of the segmentation objects +and enriching the semantic information within each sample, the SM2C +demonstrates its potential, especially in the training of unlabelled data. +Extensive experiments demonstrate the effectiveness of the SM2C on three +benchmark medical image segmentation datasets. The proposed framework shows +significant improvements over state-of-the-art counterparts. + +
+
+
+
+
+ + ☆ Knowledge-Enhanced Dual-stream Zero-shot Composed Image Retrieval CVPR 2024 + + +
+ We study the zero-shot Composed Image Retrieval (ZS-CIR) task, which is to +retrieve the target image given a reference image and a description without +training on the triplet datasets. Previous works generate pseudo-word tokens by +projecting the reference image features to the text embedding space. However, +they focus on the global visual representation, ignoring the representation of +detailed attributes, e.g., color, object number and layout. To address this +challenge, we propose a Knowledge-Enhanced Dual-stream zero-shot composed image +retrieval framework (KEDs). KEDs implicitly models the attributes of the +reference images by incorporating a database. The database enriches the +pseudo-word tokens by providing relevant images and captions, emphasizing +shared attribute information in various aspects. In this way, KEDs recognizes +the reference image from diverse perspectives. Moreover, KEDs adopts an extra +stream that aligns pseudo-word tokens with textual concepts, leveraging +pseudo-triplets mined from image-text pairs. The pseudo-word tokens generated +in this stream are explicitly aligned with fine-grained semantics in the text +embedding space. Extensive experiments on widely used benchmarks, i.e. +ImageNet-R, COCO object, Fashion-IQ and CIRR, show that KEDs outperforms +previous zero-shot composed image retrieval methods. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Diverse Representation Embedding for Lifelong Person Re-Identification + + +
+ Lifelong Person Re-Identification (LReID) aims to continuously learn from
+successive data streams, matching individuals across multiple cameras. The key
+challenge for LReID is how to effectively preserve old knowledge while learning
+new information incrementally. Task-level domain gaps and limited old task
+datasets are key factors leading to catastrophic forgetting in LReID, which are
+overlooked in existing methods. To alleviate this problem, we propose a novel
+Diverse Representation Embedding (DRE) framework for LReID. The proposed DRE
+preserves old knowledge while adapting to new information based on
+instance-level and task-level layouts. Concretely, an Adaptive Constraint
+Module (ACM) is proposed to implement integration and push-away operations
+between multiple representations, obtaining a dense embedding subspace for each
+instance to improve matching ability on limited old task datasets. Based on the
+processed diverse representation, we exchange knowledge between the adjustment
+model and the learner model through Knowledge Update (KU) and Knowledge
+Preservation (KP) strategies at the task-level layout, which reduce the
+task-wise domain gap on both old and new tasks, and exploit the diverse
+representations of each instance in limited datasets from old tasks, improving
+model performance for extended periods. Extensive experiments were conducted on
+eleven Re-ID datasets, including five seen datasets for training in order-1 and
+order-2 orders and six unseen datasets for inference. Compared to
+state-of-the-art methods, our method achieves significantly improved
+performance on holistic, large-scale, and occluded datasets.
+
+&#x0D;
+
+ comment: 11 pages,7 Tables,3 Figures +
+
+
+
+
+ + ☆ SDSTrack: Self-Distillation Symmetric Adapter Learning for Multi-Modal + Visual Object Tracking CVPR2024 + + +
+ Multimodal Visual Object Tracking (VOT) has recently gained significant +attention due to its robustness. Early research focused on fully fine-tuning +RGB-based trackers, which was inefficient and lacked generalized representation +due to the scarcity of multimodal data. Therefore, recent studies have utilized +prompt tuning to transfer pre-trained RGB-based trackers to multimodal data. +However, the modality gap limits pre-trained knowledge recall, and the +dominance of the RGB modality persists, preventing the full utilization of +information from other modalities. To address these issues, we propose a novel +symmetric multimodal tracking framework called SDSTrack. We introduce +lightweight adaptation for efficient fine-tuning, which directly transfers the +feature extraction ability from RGB to other domains with a small number of +trainable parameters and integrates multimodal features in a balanced, +symmetric manner. Furthermore, we design a complementary masked patch +distillation strategy to enhance the robustness of trackers in complex +environments, such as extreme weather, poor imaging, and sensor failure. +Extensive experiments demonstrate that SDSTrack outperforms state-of-the-art +methods in various multimodal tracking scenarios, including RGB+Depth, +RGB+Thermal, and RGB+Event tracking, and exhibits impressive results in extreme +conditions. Our source code is available at https://github.com/hoqolo/SDSTrack. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Multi-Scale Spatio-Temporal Graph Convolutional Network for Facial + Expression Spotting + + +
+ Facial expression spotting is a significant but challenging task in facial +expression analysis. The accuracy of expression spotting is affected not only +by irrelevant facial movements but also by the difficulty of perceiving subtle +motions in micro-expressions. In this paper, we propose a Multi-Scale +Spatio-Temporal Graph Convolutional Network (SpoT-GCN) for facial expression +spotting. To extract more robust motion features, we track both short- and +long-term motion of facial muscles in compact sliding windows whose window +length adapts to the temporal receptive field of the network. This strategy, +termed the receptive field adaptive sliding window strategy, effectively +magnifies the motion features while alleviating the problem of severe head +movement. The subtle motion features are then converted to a facial graph +representation, whose spatio-temporal graph patterns are learned by a graph +convolutional network. This network learns both local and global features from +multiple scales of facial graph structures using our proposed facial local +graph pooling (FLGP). Furthermore, we introduce supervised contrastive learning +to enhance the discriminative capability of our model for difficult-to-classify +frames. The experimental results on the SAMM-LV and CAS(ME)^2 datasets +demonstrate that our method achieves state-of-the-art performance, particularly +in micro-expression spotting. Ablation studies further verify the effectiveness +of our proposed modules. + +
+
+ comment: Accepted by FG2024 +
+
+
+
+
+ + ☆ BIMCV-R: A Landmark Dataset for 3D CT Text-Image Retrieval + + +
+ The burgeoning integration of 3D medical imaging into healthcare has led to a +substantial increase in the workload of medical professionals. To assist +clinicians in their diagnostic processes and alleviate their workload, the +development of a robust system for retrieving similar case studies presents a +viable solution. While the concept holds great promise, the field of 3D medical +text-image retrieval is currently limited by the absence of robust evaluation +benchmarks and curated datasets. To remedy this, our study presents a +groundbreaking dataset, BIMCV-R (This dataset will be released upon +acceptance.), which includes an extensive collection of 8,069 3D CT volumes, +encompassing over 2 million slices, paired with their respective radiological +reports. Expanding upon the foundational work of our dataset, we craft a +retrieval strategy, MedFinder. This approach employs a dual-stream network +architecture, harnessing the potential of large language models to advance the +field of medical image retrieval beyond existing text-image retrieval +solutions. It marks our preliminary step towards developing a system capable of +facilitating text-to-image, image-to-text, and keyword-based retrieval tasks. + +
+
+
+
+
+ + ☆ Mars Spectrometry 2: Gas Chromatography -- Second place solution + + +
+ The Mars Spectrometry 2: Gas Chromatography challenge was sponsored by NASA +and run on the DrivenData competition platform in 2022. This report describes +the solution which achieved the second-best score on the competition's test +dataset. The solution utilized two-dimensional, image-like representations of +the competition's chromatography data samples. A number of different +Convolutional Neural Network models were trained and ensembled for the final +submission. + +
+
+
+
+
+ + ☆ Exploring Accurate 3D Phenotyping in Greenhouse through Neural Radiance + Fields + + +
+ Accurate collection of plant phenotyping data is critical to optimising
+sustainable farming practices in precision agriculture. Traditional phenotyping
+in controlled laboratory environments, while valuable, falls short in
+understanding plant growth under real-world conditions. Emerging sensor and
+digital technologies offer a promising approach for direct phenotyping of
+plants in farm environments. This study investigates a learning-based
+phenotyping method using the Neural Radiance Field to achieve accurate in-situ
+phenotyping of pepper plants in greenhouse environments. To quantitatively
+evaluate the performance of this method, traditional point cloud registration
+on 3D scanning data is implemented for comparison. Experimental results show
+that NeRF (Neural Radiance Fields) achieves competitive accuracy compared to
+the 3D scanning methods. The mean distance error between the scanner-based
+method and the NeRF-based method is 0.865 mm. This study shows that the
+learning-based NeRF method achieves similar accuracy to 3D scanning-based
+methods but with improved scalability and robustness.
+
+&#x0D;
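+
+ The reported mean distance error can be computed as an average
+nearest-neighbour distance between the scanner-based and NeRF-based point
+clouds; the symmetric variant below is an assumption about the exact protocol:
+
+import numpy as np
+from scipy.spatial import cKDTree
+
+def mean_distance_error(pts_a, pts_b):
+    """Symmetric mean nearest-neighbour distance between two (N, 3) clouds."""
+    d_ab = cKDTree(pts_b).query(pts_a)[0].mean()
+    d_ba = cKDTree(pts_a).query(pts_b)[0].mean()
+    return 0.5 * (d_ab + d_ba)
+
+err = mean_distance_error(np.random.rand(1000, 3), np.random.rand(1200, 3))
+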
+
+
+
+
+ + ☆ Towards Two-Stream Foveation-based Active Vision Learning + + +
+ Deep neural network (DNN) based machine perception frameworks process the +entire input in a one-shot manner to provide answers to both "what object is +being observed" and "where it is located". In contrast, the "two-stream +hypothesis" from neuroscience explains the neural processing in the human +visual cortex as an active vision system that utilizes two separate regions of +the brain to answer the what and the where questions. In this work, we propose +a machine learning framework inspired by the "two-stream hypothesis" and +explore the potential benefits that it offers. Specifically, the proposed +framework models the following mechanisms: 1) ventral (what) stream focusing on +the input regions perceived by the fovea part of an eye (foveation), 2) dorsal +(where) stream providing visual guidance, and 3) iterative processing of the +two streams to calibrate visual focus and process the sequence of focused image +patches. The training of the proposed framework is accomplished by label-based +DNN training for the ventral stream model and reinforcement learning for the +dorsal stream model. We show that the two-stream foveation-based learning is +applicable to the challenging task of weakly-supervised object localization +(WSOL), where the training data is limited to the object class or its +attributes. The framework is capable of both predicting the properties of an +object and successfully localizing it by predicting its bounding box. We also +show that, due to the independent nature of the two streams, the dorsal model +can be applied on its own to unseen images to localize objects from different +datasets. + +
+
+ comment: 18 pages, 14 figures, Under consideration at IEEE Transactions on + Cognitive and Developmental Systems +
+
+
+
+
+ + ☆ CBGT-Net: A Neuromimetic Architecture for Robust Classification of + Streaming Data + + +
+ This paper describes CBGT-Net, a neural network model inspired by the
+cortico-basal ganglia-thalamic (CBGT) circuits found in mammalian brains.
+Unlike traditional neural network models, which either generate an output for
+each provided input, or an output after a fixed sequence of inputs, the
+CBGT-Net learns to produce an output once a sufficient evidence criterion is
+met from a stream of observed data. For each observation, the CBGT-Net
+generates a vector that explicitly represents the amount of evidence the
+observation provides for each potential decision, accumulates the evidence over
+time, and generates a decision when the accumulated evidence exceeds a
+pre-defined threshold. We evaluate the proposed model on two image
+classification tasks, where models need to predict image categories based on a
+stream of small patches extracted from the image. We show that the CBGT-Net
+provides improved accuracy and robustness compared to models trained to
+classify from a single patch, and models leveraging an LSTM layer to classify
+from a fixed-length sequence of patches.
+
+&#x0D;
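+
+ The accumulate-until-threshold decision rule is straightforward to sketch;
+the evidence encoder and threshold below are hypothetical placeholders:
+
+import torch
+
+def accumulate_and_decide(patches, evidence_net, threshold=3.0):
+    """patches: non-empty list of observation tensors; evidence_net maps an
+    observation to a non-negative per-class evidence vector (assumed API)."""
+    evidence = None
+    for t, patch in enumerate(patches):
+        e = evidence_net(patch)
+        evidence = e if evidence is None else evidence + e
+        if evidence.max() >= threshold:
+            return evidence.argmax().item(), t + 1   # decision, #observations
+    return evidence.argmax().item(), len(patches)    # forced decision
+
+net = torch.nn.Sequential(torch.nn.Flatten(0), torch.nn.Linear(64, 10),
+                          torch.nn.Softplus())
+decision, steps = accumulate_and_decide(
+    [torch.randn(8, 8) for _ in range(5)], net)
+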
+
+
+
+
+ + ♻ ☆ Ghost on the Shell: An Expressive Representation of General 3D Shapes ICLR 2024 + + +
+ The creation of photorealistic virtual worlds requires the accurate modeling +of 3D surface geometry for a wide range of objects. For this, meshes are +appealing since they 1) enable fast physics-based rendering with realistic +material and lighting, 2) support physical simulation, and 3) are +memory-efficient for modern graphics pipelines. Recent work on reconstructing +and statistically modeling 3D shape, however, has critiqued meshes as being +topologically inflexible. To capture a wide range of object shapes, any 3D +representation must be able to model solid, watertight, shapes as well as thin, +open, surfaces. Recent work has focused on the former, and methods for +reconstructing open surfaces do not support fast reconstruction with material +and lighting or unconditional generative modelling. Inspired by the observation +that open surfaces can be seen as islands floating on watertight surfaces, we +parameterize open surfaces by defining a manifold signed distance field on +watertight templates. With this parameterization, we further develop a +grid-based and differentiable representation that parameterizes both watertight +and non-watertight meshes of arbitrary topology. Our new representation, called +Ghost-on-the-Shell (G-Shell), enables two important applications: +differentiable rasterization-based reconstruction from multiview images and +generative modelling of non-watertight meshes. We empirically demonstrate that +G-Shell achieves state-of-the-art performance on non-watertight mesh +reconstruction and generation tasks, while also performing effectively for +watertight meshes. + +
+
+ comment: ICLR 2024 Oral (v3: 30 pages, 19 figures, Project Page: + https://gshell3d.github.io/) +
+
+
+
+
+ + ♻ ☆ VQPy: An Object-Oriented Approach to Modern Video Analytics + + +
+ Video analytics is widely used in contemporary systems and services. At the +forefront of video analytics are video queries that users develop to find +objects of particular interest. Building upon the insight that video objects +(e.g., human, animals, cars, etc.), the center of video analytics, are similar +in spirit to objects modeled by traditional object-oriented languages, we +propose to develop an object-oriented approach to video analytics. This +approach, named VQPy, consists of a frontend$\unicode{x2015}$a Python variant +with constructs that make it easy for users to express video objects and their +interactions$\unicode{x2015}$as well as an extensible backend that can +automatically construct and optimize pipelines based on video objects. We have +implemented and open-sourced VQPy, which has been productized in Cisco as part +of its DeepVision framework. + +
+
+ comment: MLSys'24 +
+
+
+
+
+ + ♻ ☆ Latent Dataset Distillation with Diffusion Models + + +
+ The efficacy of machine learning has traditionally relied on the availability
+of increasingly larger datasets. However, large datasets pose storage
+challenges and contain non-influential samples, which could be ignored during
+training without impacting the final accuracy of the model. In response to
+these limitations, the concept of distilling the information of a dataset into
+a condensed set of (synthetic) samples, namely a distilled dataset, emerged.
+One crucial aspect is the selected architecture (usually ConvNet) for linking
+the original and synthetic datasets. However, the final accuracy is lower if
+the employed model architecture differs from the model used during
+distillation. Another challenge is the generation of high-resolution images,
+e.g., 128x128 and higher. In this paper, we propose Latent Dataset Distillation
+with Diffusion Models (LD3M), which combines diffusion in latent space with
+dataset distillation to tackle both challenges. LD3M incorporates a novel
+diffusion process tailored for dataset distillation, which improves the
+gradient norms for learning synthetic images. By adjusting the number of
+diffusion steps, LD3M also offers a straightforward way of controlling the
+trade-off between speed and accuracy. We evaluate our approach on several
+ImageNet subsets and for high-resolution images (128x128 and 256x256). As a
+result, LD3M consistently outperforms state-of-the-art distillation techniques
+by up to 4.8 p.p. and 4.2 p.p. for 1 and 10 images per class, respectively.
+
+&#x0D;
+
+
+
+
+ + ♻ ☆ BAGS: Blur Agnostic Gaussian Splatting through Multi-Scale Kernel + Modeling + + +
+ Recent efforts in using 3D Gaussians for scene reconstruction and novel view
+synthesis can achieve impressive results on curated benchmarks; however, images
+captured in real life are often blurry. In this work, we analyze the robustness
+of Gaussian-Splatting-based methods against various types of image blur, such
+as motion blur, defocus blur, and downscaling blur. Under these degradations,
+Gaussian-Splatting-based methods tend to overfit and produce worse results than
+Neural-Radiance-Field-based methods. To address this issue, we propose Blur
+Agnostic Gaussian Splatting (BAGS). BAGS introduces additional 2D modeling
+capacities such that a 3D-consistent and high-quality scene can be
+reconstructed despite image-wise blur. Specifically, we model blur by
+estimating per-pixel convolution kernels from a Blur Proposal Network (BPN).
+BPN is designed to consider spatial, color, and depth variations of the scene
+to maximize modeling capacity. Additionally, BPN also proposes a
+quality-assessing mask, which indicates regions where blur occurs. Finally, we
+introduce a coarse-to-fine kernel optimization scheme; this optimization scheme
+is fast and avoids sub-optimal solutions due to a sparse point cloud
+initialization, which often occurs when we apply Structure-from-Motion on
+blurry images. We demonstrate that BAGS achieves photorealistic renderings
+under various challenging blur conditions and imaging geometry, while
+significantly improving upon existing approaches.
+
+&#x0D;
+
+
+
+
+ + ♻ ☆ Detection of diabetic retinopathy using longitudinal self-supervised + learning MICCAI + + +
+ Longitudinal imaging is able to capture both static anatomical structures and
+dynamic changes in disease progression, enabling earlier and better
+patient-specific pathology management. However, conventional approaches for
+detecting diabetic retinopathy (DR) rarely take advantage of longitudinal
+information to improve DR analysis. In this work, we investigate the benefit of
+exploiting self-supervised learning with a longitudinal nature for DR diagnosis
+purposes. We compare different longitudinal self-supervised learning (LSSL)
+methods to model the disease progression from longitudinal retinal color fundus
+photographs (CFP) to detect early DR severity changes using a pair of
+consecutive exams. The experiments were conducted on a longitudinal DR
+screening dataset, with or without encoders trained via a longitudinal pretext
+task (LSSL). Results achieve an AUC of 0.875 for the baseline (model trained
+from scratch) and an AUC of 0.96 (95% CI: 0.9593-0.9655 DeLong test) with a
+p-value < 2.2e-16 on early fusion using a simple ResNet-like architecture with
+frozen LSSL weights, suggesting that the LSSL latent space enables encoding of
+the dynamics of DR progression.
+
+&#x0D;
+
+ comment: Accepted preprint for presentation at MICCAI-OMIA +
+
+
+
+
+ + ♻ ☆ Influencer Backdoor Attack on Semantic Segmentation + + +
+ When a small number of poisoned samples are injected into the training
+dataset of a deep neural network, the network can be induced to exhibit
+malicious behavior during inference, which poses potential threats to
+real-world applications. While backdoor attacks have been intensively studied
+in classification, backdoor attacks on semantic segmentation have been largely
+overlooked. Unlike classification, semantic segmentation aims to classify every
+pixel within a given image. In this work, we explore backdoor attacks on
+segmentation models to misclassify all pixels of a victim class by injecting a
+specific trigger on non-victim pixels during inference, which is dubbed
+Influencer Backdoor Attack (IBA). IBA is expected to maintain the
+classification accuracy of non-victim pixels and mislead classifications of all
+victim pixels in every single inference and could be easily applied to
+real-world scenes. Based on the context aggregation ability of segmentation
+models, we propose a simple yet effective Nearest-Neighbor trigger injection
+strategy. We also introduce an innovative Pixel Random Labeling strategy which
+maintains optimal performance even when the trigger is placed far from the
+victim pixels. Our extensive experiments reveal that current segmentation
+models do suffer from backdoor attacks, demonstrate IBA's real-world
+applicability, and show that our proposed techniques can further increase
+attack performance.
+
+&#x0D;
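+
+ A minimal sketch of the nearest-neighbor idea, pasting a trigger patch on the
+non-victim pixel closest to the victim-class region (the placement rule and
+trigger size are illustrative assumptions, not the paper's exact strategy):
+
+import numpy as np
+from scipy.ndimage import distance_transform_edt
+
+def inject_nearest_neighbor_trigger(image, label_map, victim_class, trigger):
+    """Paste `trigger` (h, w, 3) centred on the non-victim pixel closest to
+    the victim-class region (illustrative placement rule)."""
+    victim = label_map == victim_class
+    # Distance of each non-victim pixel to the victim region (victim gets 0).
+    dist = distance_transform_edt(~victim)
+    dist[victim] = np.inf                      # only non-victim candidates
+    cy, cx = np.unravel_index(np.argmin(dist), dist.shape)
+    h, w = trigger.shape[:2]
+    y0 = np.clip(cy - h // 2, 0, image.shape[0] - h)
+    x0 = np.clip(cx - w // 2, 0, image.shape[1] - w)
+    out = image.copy()
+    out[y0:y0 + h, x0:x0 + w] = trigger
+    return out
+
+poisoned = inject_nearest_neighbor_trigger(
+    np.zeros((64, 64, 3)), np.random.randint(0, 5, (64, 64)),
+    victim_class=3, trigger=np.ones((8, 8, 3)))
+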
+
+
+
+
+ + ♻ ☆ DNGaussian: Optimizing Sparse-View 3D Gaussian Radiance Fields with + Global-Local Depth Normalization CVPR 2024 + + +
+ Radiance fields have demonstrated impressive performance in synthesizing
+novel views from sparse input views, yet prevailing methods suffer from high
+training costs and slow inference speed. This paper introduces DNGaussian, a
+depth-regularized framework based on 3D Gaussian radiance fields, offering
+real-time and high-quality few-shot novel view synthesis at low costs. Our
+motivation stems from the highly efficient representation and surprising
+quality of the recent 3D Gaussian Splatting, even though it encounters geometry
+degradation when input views decrease. In the Gaussian radiance fields, we find
+that this degradation in scene geometry is primarily linked to the positioning
+of Gaussian primitives and can be mitigated by a depth constraint.
+Consequently, we propose a Hard and Soft Depth Regularization to restore
+accurate scene geometry under coarse monocular depth supervision while
+maintaining a fine-grained color appearance. To further refine detailed
+geometry reshaping, we introduce Global-Local Depth Normalization, enhancing
+the focus on small local depth changes. Extensive experiments on LLFF, DTU, and
+Blender datasets demonstrate that DNGaussian outperforms state-of-the-art
+methods, achieving comparable or better results with significantly reduced
+memory cost, a $25 \times$ reduction in training time, and over $3000 \times$
+faster rendering speed.
+
+&#x0D;
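+
+ A hedged sketch of depth regularization with global and patch-local
+normalization, comparing rendered depth against monocular depth after removing
+scale and shift (the patch size, loss weights, and L1 form are assumptions,
+not the paper's exact Hard/Soft formulation):
+
+import torch
+import torch.nn.functional as F
+
+def normalized_depth_loss(rendered, mono, patch=32, eps=1e-6):
+    """rendered, mono: (B, 1, H, W) depth maps. Compare after normalizing
+    globally and inside local patches (assumed scheme)."""
+    def norm(d):
+        mean = d.mean(dim=(2, 3), keepdim=True)
+        std = d.std(dim=(2, 3), keepdim=True) + eps
+        return (d - mean) / std
+    global_loss = F.l1_loss(norm(rendered), norm(mono))
+    # Local normalization: per-patch statistics emphasize small depth changes.
+    r = F.unfold(rendered, patch, stride=patch)   # (B, patch*patch, L)
+    m = F.unfold(mono, patch, stride=patch)
+    def norm_local(d):
+        return (d - d.mean(1, keepdim=True)) / (d.std(1, keepdim=True) + eps)
+    local_loss = F.l1_loss(norm_local(r), norm_local(m))
+    return global_loss + local_loss
+
+loss = normalized_depth_loss(torch.rand(1, 1, 128, 128),
+                             torch.rand(1, 1, 128, 128))
+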
+
+ comment: Accepted at CVPR 2024. Project page: + https://fictionarry.github.io/DNGaussian/ +
+
+
+
+
+ + ♻ ☆ DGC-GNN: Leveraging Geometry and Color Cues for Visual Descriptor-Free + 2D-3D Matching CVPR 2024 + + +
+ Matching 2D keypoints in an image to a sparse 3D point cloud of the scene +without requiring visual descriptors has garnered increased interest due to its +low memory requirements, inherent privacy preservation, and reduced need for +expensive 3D model maintenance compared to visual descriptor-based methods. +However, existing algorithms often compromise on performance, resulting in a +significant deterioration compared to their descriptor-based counterparts. In +this paper, we introduce DGC-GNN, a novel algorithm that employs a +global-to-local Graph Neural Network (GNN) that progressively exploits +geometric and color cues to represent keypoints, thereby improving matching +accuracy. Our procedure encodes both Euclidean and angular relations at a +coarse level, forming the geometric embedding to guide the point matching. We +evaluate DGC-GNN on both indoor and outdoor datasets, demonstrating that it not +only doubles the accuracy of the state-of-the-art visual descriptor-free +algorithm but also substantially narrows the performance gap between +descriptor-based and descriptor-free methods. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ DemoCaricature: Democratising Caricature Generation with a Rough Sketch + + +
+ In this paper, we democratise caricature generation, empowering individuals +to effortlessly craft personalised caricatures with just a photo and a +conceptual sketch. Our objective is to strike a delicate balance between +abstraction and identity, while preserving the creativity and subjectivity +inherent in a sketch. To achieve this, we present Explicit Rank-1 Model Editing +alongside single-image personalisation, selectively applying nuanced edits to +cross-attention layers for a seamless merge of identity and style. +Additionally, we propose Random Mask Reconstruction to enhance robustness, +directing the model to focus on distinctive identity and style features. +Crucially, our aim is not to replace artists but to eliminate accessibility +barriers, allowing enthusiasts to engage in the artistry. + +
+
+
+
+
+ + ♻ ☆ SG-Bot: Object Rearrangement via Coarse-to-Fine Robotic Imagination on + Scene Graphs ICRA 2024 + + +
+ Object rearrangement is pivotal in robotic-environment interactions, +representing a significant capability in embodied AI. In this paper, we present +SG-Bot, a novel rearrangement framework that utilizes a coarse-to-fine scheme +with a scene graph as the scene representation. Unlike previous methods that +rely on either known goal priors or zero-shot large models, SG-Bot exemplifies +lightweight, real-time, and user-controllable characteristics, seamlessly +blending the consideration of commonsense knowledge with automatic generation +capabilities. SG-Bot employs a three-fold procedure--observation, imagination, +and execution--to adeptly address the task. Initially, objects are discerned +and extracted from a cluttered scene during the observation. These objects are +first coarsely organized and depicted within a scene graph, guided by either +commonsense or user-defined criteria. Then, this scene graph subsequently +informs a generative model, which forms a fine-grained goal scene considering +the shape information from the initial scene and object semantics. Finally, for +execution, the initial and envisioned goal scenes are matched to formulate +robotic action policies. Experimental results demonstrate that SG-Bot +outperforms competitors by a large margin. + +
+
+ comment: ICRA 2024 accepted. Project website: + https://sites.google.com/view/sg-bot +
+
+
+
+
+ + ♻ ☆ C-TPT: Calibrated Test-Time Prompt Tuning for Vision-Language Models via + Text Feature Dispersion ICLR 2024 + + +
+ In deep learning, test-time adaptation has gained attention as a method for +model fine-tuning without the need for labeled data. A prime exemplification is +the recently proposed test-time prompt tuning for large-scale vision-language +models such as CLIP. Unfortunately, these prompts have been mainly developed to +improve accuracy, overlooking the importance of calibration, which is a crucial +aspect for quantifying prediction uncertainty. However, traditional calibration +methods rely on substantial amounts of labeled data, making them impractical +for test-time scenarios. To this end, this paper explores calibration during +test-time prompt tuning by leveraging the inherent properties of CLIP. Through +a series of observations, we find that the prompt choice significantly affects +the calibration in CLIP, where the prompts leading to higher text feature +dispersion result in better-calibrated predictions. Introducing the Average +Text Feature Dispersion (ATFD), we establish its relationship with calibration +error and present a novel method, Calibrated Test-time Prompt Tuning (C-TPT), +for optimizing prompts during test-time with enhanced calibration. Through +extensive experiments on different CLIP architectures and datasets, we show +that C-TPT can effectively improve the calibration of test-time prompt tuning +without needing labeled data. The code is publicly accessible at +https://github.com/hee-suk-yoon/C-TPT. + +
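+
+ The Average Text Feature Dispersion is simple to compute from the class-wise
+text features produced by a single prompt; the L2-distance-to-centroid form
+below is the natural reading of the definition:
+
+import torch
+
+def average_text_feature_dispersion(text_feats):
+    """text_feats: (num_classes, D) L2-normalised text features for one prompt.
+    ATFD = mean distance of each class feature from the prompt's centroid."""
+    centroid = text_feats.mean(dim=0, keepdim=True)
+    return (text_feats - centroid).norm(dim=1).mean()
+
+atfd = average_text_feature_dispersion(
+    torch.nn.functional.normalize(torch.randn(100, 512), dim=1))
+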
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ UCM-Net: A Lightweight and Efficient Solution for Skin Lesion + Segmentation using MLP and CNN + + +
+ Skin cancer is a significant public health problem, and computer-aided
+diagnosis can help to prevent and treat it. A crucial step for computer-aided
+diagnosis is accurately segmenting skin lesions in images, which allows for
+lesion detection, classification, and analysis. However, this task is
+challenging due to the diverse characteristics of lesions, such as appearance,
+shape, size, color, texture, and location, as well as image quality issues like
+noise, artifacts, and occlusions. Deep learning models have recently been
+applied to skin lesion segmentation, but they have high parameter counts and
+computational demands, making them unsuitable for mobile health applications.
+To address this challenge, we propose UCM-Net, a novel, efficient, and
+lightweight solution that integrates Multi-Layer Perceptrons (MLP) and
+Convolutional Neural Networks (CNN). Unlike conventional UNet architectures,
+our UCMNet-Block reduces parameter overhead and enhances UCM-Net's learning
+capabilities, leading to robust segmentation performance. We validate UCM-Net's
+competitiveness through extensive experiments on the PH2, ISIC2017 and ISIC2018
+datasets. Remarkably, UCM-Net has less than 50KB parameters and less than 0.05
+Giga-Operations Per Second (GLOPs), setting a new possible standard for
+efficiency in skin lesion segmentation. The source code will be publicly
+available.
+
+&#x0D;
+
+ comment: 17 pages, under review +
+
+
+
+
+ + ♻ ☆ CEIMVEN: An Approach of Cutting Edge Implementation of Modified Versions + of EfficientNet (V1-V2) Architecture for Breast Cancer Detection and + Classification from Ultrasound Images + + +
+ Undoubtedly, breast cancer is one of the most widespread and terrifying
+cancers across the globe. Millions of women are affected by it each year.
+Breast cancer also remains the leading cause of cancer deaths among women. In
+recent research, Medical Image Computing and Processing, together with deep
+neural networks, has played a significant role in detecting and classifying
+breast cancers from ultrasound images and mammograms. In this research, we
+focused mostly on our rigorous implementations and iterative result analysis of
+different cutting-edge modified versions of EfficientNet architectures, namely
+EfficientNet-V1 (b0-b7) and EfficientNet-V2 (b0-b3), on ultrasound images, an
+approach named CEIMVEN. We utilized a transfer learning approach, using the
+pre-trained models of the EfficientNet versions. We applied hyper-parameter
+tuning procedures, added fully connected layers, discarded outliers, and
+recorded the accuracy results from our custom modified EfficientNet
+architectures. Our training approach involved both identifying the
+cancer-affected areas with region-of-interest (ROI) techniques and multi-class
+classification (benign, malignant and normal). The approximate testing
+accuracies we obtained from the modified versions of EfficientNet-V1 (b0-
+99.15%, b1- 98.58%, b2- 98.43%, b3- 98.01%, b4- 98.86%, b5- 97.72%, b6-
+97.72%, b7- 98.72%) and EfficientNet-V2 (b0- 99.29%, b1- 99.01%, b2- 98.72%,
+b3- 99.43%) demonstrate the strong potential of deep learning approaches for
+the successful detection and classification of breast cancers from ultrasound
+images at a very early stage. The code for this research is available here:
+https://github.com/ac005sheekar/CEIMVEN-Cutting-Edge-Implementation-of-Modified-EfficientNet-V1-V2-for-BreastCancer-Detection.
+
+&#x0D;
+
+
+
+
+ + ♻ ☆ CARZero: Cross-Attention Alignment for Radiology Zero-Shot + Classification + + +
+ The advancement of Zero-Shot Learning in the medical domain has been driven +forward by using pre-trained models on large-scale image-text pairs, focusing +on image-text alignment. However, existing methods primarily rely on cosine +similarity for alignment, which may not fully capture the complex relationship +between medical images and reports. To address this gap, we introduce a novel +approach called Cross-Attention Alignment for Radiology Zero-Shot +Classification (CARZero). Our approach innovatively leverages cross-attention +mechanisms to process image and report features, creating a Similarity +Representation that more accurately reflects the intricate relationships in +medical semantics. This representation is then linearly projected to form an +image-text similarity matrix for cross-modality alignment. Additionally, +recognizing the pivotal role of prompt selection in zero-shot learning, CARZero +incorporates a Large Language Model-based prompt alignment strategy. This +strategy standardizes diverse diagnostic expressions into a unified format for +both training and inference phases, overcoming the challenges of manual prompt +design. Our approach is simple yet effective, demonstrating state-of-the-art +performance in zero-shot classification on five official chest radiograph +diagnostic test sets, including remarkable results on datasets with long-tail +distributions of rare diseases. This achievement is attributed to our new +image-text alignment strategy, which effectively addresses the complex +relationship between medical images and reports. Code and models are available +at https://github.com/laihaoran/CARZero. + +
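+
+ A minimal sketch of replacing cosine similarity with a cross-attention-based
+similarity matrix between image token features and report token features (the
+dimensions and the single attention layer are assumptions about the general
+mechanism, not the exact CARZero architecture):
+
+import torch
+import torch.nn as nn
+
+class CrossAttentionSimilarity(nn.Module):
+    def __init__(self, dim=256, heads=4):
+        super().__init__()
+        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+        self.score = nn.Linear(dim, 1)
+
+    def forward(self, img_tokens, txt_tokens):
+        """img_tokens: (B_i, N, D) image patch features;
+        txt_tokens: (B_t, M, D) report token features.
+        Returns a (B_i, B_t) similarity matrix."""
+        sims = []
+        for t in txt_tokens:                        # one report at a time
+            q = t.unsqueeze(0).expand(img_tokens.size(0), -1, -1)
+            fused, _ = self.attn(q, img_tokens, img_tokens)
+            sims.append(self.score(fused.mean(dim=1)))   # (B_i, 1)
+        return torch.cat(sims, dim=1)               # (B_i, B_t)
+
+sim = CrossAttentionSimilarity()(torch.randn(2, 49, 256), torch.randn(3, 16, 256))
+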
+
+
+
+
+ + ♻ ☆ DGL-GAN: Discriminator Guided Learning for GAN Compression + + +
+ Generative Adversarial Networks (GANs) with high computation costs, e.g., +BigGAN and StyleGAN2, have achieved remarkable results in synthesizing +high-resolution images from random noise. Reducing the computation cost of GANs +while keeping generating photo-realistic images is a challenging field. In this +work, we propose a novel yet simple {\bf D}iscriminator {\bf G}uided {\bf +L}earning approach for compressing vanilla {\bf GAN}, dubbed {\bf DGL-GAN}. +Motivated by the phenomenon that the teacher discriminator may contain some +meaningful information about both real images and fake images, we merely +transfer the knowledge from the teacher discriminator via the adversarial +interaction between the teacher discriminator and the student generator. We +apply DGL-GAN to compress the two most representative large-scale vanilla GANs, +i.e., StyleGAN2 and BigGAN. Experiments show that DGL-GAN achieves +state-of-the-art (SOTA) results on both StyleGAN2 and BigGAN. Moreover, DGL-GAN +is also effective in boosting the performance of original uncompressed GANs. +Original uncompressed StyleGAN2 boosted with DGL-GAN achieves FID 2.65 on FFHQ, +which achieves a new state-of-the-art performance. Code and models are +available at \url{https://github.com/yuesongtian/DGL-GAN} + +
+
+
+
+
+ + ♻ ☆ MAS: Multi-view Ancestral Sampling for 3D motion generation using 2D + diffusion + + +
+ We introduce Multi-view Ancestral Sampling (MAS), a method for 3D motion +generation, using 2D diffusion models that were trained on motions obtained +from in-the-wild videos. As such, MAS opens opportunities to exciting and +diverse fields of motion previously under-explored as 3D data is scarce and +hard to collect. MAS works by simultaneously denoising multiple 2D motion +sequences representing different views of the same 3D motion. It ensures +consistency across all views at each diffusion step by combining the individual +generations into a unified 3D sequence, and projecting it back to the original +views. We demonstrate MAS on 2D pose data acquired from videos depicting +professional basketball maneuvers, rhythmic gymnastic performances featuring a +ball apparatus, and horse races. In each of these domains, 3D motion capture is +arduous, and yet, MAS generates diverse and realistic 3D sequences. Unlike the +Score Distillation approach, which optimizes each sample by repeatedly applying +small fixes, our method uses a sampling process that was constructed for the +diffusion framework. As we demonstrate, MAS avoids common issues such as +out-of-domain sampling and mode-collapse. https://guytevet.github.io/mas-page/ + +
+
+
+
+
+ + ♻ ☆ BEVNeXt: Reviving Dense BEV Frameworks for 3D Object Detection + + +
+ Recently, the rise of query-based Transformer decoders is reshaping +camera-based 3D object detection. These query-based decoders are surpassing the +traditional dense BEV (Bird's Eye View)-based methods. However, we argue that +dense BEV frameworks remain important due to their outstanding abilities in +depth estimation and object localization, depicting 3D scenes accurately and +comprehensively. This paper aims to address the drawbacks of the existing dense +BEV-based 3D object detectors by introducing our proposed enhanced components, +including a CRF-modulated depth estimation module enforcing object-level +consistencies, a long-term temporal aggregation module with extended receptive +fields, and a two-stage object decoder combining perspective techniques with +CRF-modulated depth embedding. These enhancements lead to a "modernized" dense +BEV framework dubbed BEVNeXt. On the nuScenes benchmark, BEVNeXt outperforms +both BEV-based and query-based frameworks under various settings, achieving a +state-of-the-art result of 64.2 NDS on the nuScenes test set. Code will be +available at \url{https://github.com/woxihuanjiangguo/BEVNeXt}. + +
+
+
+
+
+ + ♻ ☆ Training-free Zero-shot Composed Image Retrieval with Local Concept + Reranking + + +
+ Composed image retrieval attempts to retrieve an image of interest from
+gallery images through a composed query of a reference image and its
+corresponding modified text. It has recently attracted attention due to the
+collaboration of information-rich images and concise language to precisely
+express the requirements of target images. Most current composed image
+retrieval methods follow a supervised learning approach, training on a costly
+triplet dataset composed of a reference image, modified text, and a
+corresponding target image. To avoid the difficult-to-obtain labeled triplet
+training data, zero-shot composed image retrieval (ZS-CIR) has been introduced,
+which aims to retrieve the target image by learning from image-text pairs
+(self-supervised triplets), without the need for human-labeled triplets.
+However, this self-supervised triplet learning approach is computationally less
+effective and less understandable as it assumes the interaction between image
+and text is conducted with an implicit query embedding, without explicit
+semantic interpretation. In this work, we present a new training-free zero-shot
+composed image retrieval method which translates the query into explicit
+human-understandable text. This helps improve model learning efficiency to
+enhance the generalization capacity of foundation models. Further, we introduce
+a Local Concept Re-ranking (LCR) mechanism to focus on discriminative local
+information extracted from the modified instructions. Extensive experiments on
+four ZS-CIR benchmarks show that our method achieves performance comparable to
+that of state-of-the-art triplet-training-based methods, but significantly
+outperforms other training-free methods on the open domain datasets (CIRR,
+CIRCO and COCO), as well as the fashion domain dataset (FashionIQ).
+
+&#x0D;
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ Reality's Canvas, Language's Brush: Crafting 3D Avatars from Monocular + Video + + +
+ Recent advancements in 3D avatar generation excel with multi-view supervision +for photorealistic models. However, monocular counterparts lag in quality +despite broader applicability. We propose ReCaLaB to close this gap. ReCaLaB is +a fully-differentiable pipeline that learns high-fidelity 3D human avatars from +just a single RGB video. A pose-conditioned deformable NeRF is optimized to +volumetrically represent a human subject in canonical T-pose. The canonical +representation is then leveraged to efficiently associate neural textures using +2D-3D correspondences. This enables the separation of diffused color generation +and lighting correction branches that jointly compose an RGB prediction. The +design allows controlling intermediate results for human pose, body shape, +texture, and lighting with text prompts. An image-conditioned diffusion model +thereby helps to animate appearance and pose of the 3D avatar to create video +sequences with previously unseen human motion. Extensive experiments show that +ReCaLaB outperforms previous monocular approaches in terms of image quality for +image synthesis tasks. Moreover, natural language offers an intuitive user +interface for creative manipulation of 3D human avatars. + 
+
+ comment: Video link: https://youtu.be/Oz83z1es2J4 +
+
+
+
+
+ + ♻ ☆ AGFSync: Leveraging AI-Generated Feedback for Preference Optimization in + Text-to-Image Generation + + +
+ Text-to-Image (T2I) diffusion models have achieved remarkable success in +image generation. Despite their progress, challenges remain in prompt-following +ability and image quality, as well as a lack of the high-quality datasets that +are essential for refining these models. As acquiring labeled data is +costly, we introduce AGFSync, a framework that enhances T2I diffusion models +through Direct Preference Optimization (DPO) in a fully AI-driven approach. +AGFSync utilizes Vision-Language Models (VLM) to assess image quality across +style, coherence, and aesthetics, generating feedback data within an AI-driven +loop. By applying AGFSync to leading T2I models such as SD v1.4, v1.5, and +SDXL, our extensive experiments on the TIFA dataset demonstrate notable +improvements in VQA scores, aesthetic evaluations, and performance on the HPSv2 +benchmark, consistently outperforming the base models. AGFSync's method of +refining T2I diffusion models paves the way for scalable alignment techniques. + 
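The AI-driven feedback loop described above can be sketched as follows. This is only an illustration under assumed interfaces: `generate`, `vlm_score`, and `dpo_update` are placeholders, not the AGFSync API.

```python
def agfsync_round(prompts, generate, vlm_score, dpo_update, n_candidates=4):
    """One AI-driven preference-collection round (hypothetical interfaces)."""
    preference_pairs = []
    for prompt in prompts:
        # Sample several candidate images from the current T2I model.
        candidates = [generate(prompt) for _ in range(n_candidates)]
        # Let a VLM score each candidate for style, coherence and aesthetics.
        scores = [vlm_score(prompt, img) for img in candidates]
        ranked = sorted(zip(scores, candidates), key=lambda pair: pair[0])
        # Best vs. worst candidate forms one (prompt, chosen, rejected) DPO pair.
        preference_pairs.append((prompt, ranked[-1][1], ranked[0][1]))
    # Fine-tune the diffusion model on the collected preferences with DPO.
    dpo_update(preference_pairs)
    return preference_pairs
```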
+
+
+
+
+ + ♻ ☆ Video Editing via Factorized Diffusion Distillation + + +
+ We introduce Emu Video Edit (EVE), a model that establishes a new +state-of-the-art in video editing without relying on any supervised video +editing data. To develop EVE we separately train an image editing adapter and a +video generation adapter, and attach both to the same text-to-image model. +Then, to align the adapters towards video editing we introduce a new +unsupervised distillation procedure, Factorized Diffusion Distillation. This +procedure distills knowledge from one or more teachers simultaneously, without +any supervised data. We utilize this procedure to teach EVE to edit videos by +jointly distilling knowledge to (i) precisely edit each individual frame from +the image editing adapter, and (ii) ensure temporal consistency among the +edited frames using the video generation adapter. Finally, to demonstrate the +potential of our approach in unlocking other capabilities, we align additional +combinations of adapters. + 
+
+
+
+
+ + ♻ ☆ HyMNet: a Multimodal Deep Learning System for Hypertension + Classification using Fundus Photographs and Cardiometabolic Risk Factors + + +
+ In recent years, deep learning has shown promise in predicting hypertension +(HTN) from fundus images. However, most prior research has primarily focused on +analyzing a single type of data, which may not capture the full complexity of +HTN risk. To address this limitation, this study introduces a multimodal deep +learning (MMDL) system, dubbed HyMNet, which combines fundus images and +cardiometabolic risk factors, specifically age and gender, to improve +hypertension detection capabilities. Our MMDL system uses RETFound, a +foundation model pre-trained on 1.6 million retinal images, for the fundus path +and a fully connected neural network for the age and gender path. The two paths +are jointly trained by concatenating the feature vectors from each path, which +are then fed into a fusion network. The system was trained on 5,016 retinal +images from 1,243 individuals collected from the Saudi Ministry of National +Guard Health Affairs. The results show that the multimodal model that +integrates fundus images along with age and gender outperforms the unimodal +system trained solely on fundus photographs, with an F1 score of 0.771 [0.747, +0.796] versus 0.745 [0.719, 0.772] for hypertension detection, respectively. +Additionally, we studied the effect that underlying diabetes mellitus has on the +model's predictive ability, concluding that diabetes acts as a confounding +variable when distinguishing hypertensive cases. Our code and model weights are +publicly available at https://github.com/MohammedSB/HyMNet. + 
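The two-path, concatenate-then-fuse design lends itself to a short PyTorch sketch. Module names and feature dimensions below are assumptions for illustration; the fundus encoder is a placeholder for a RETFound-like backbone, not the released HyMNet code.

```python
import torch
import torch.nn as nn

class TwoPathFusionSketch(nn.Module):
    """Illustrative two-path fusion model (sizes are assumptions)."""

    def __init__(self, fundus_encoder: nn.Module, fundus_dim: int = 1024):
        super().__init__()
        self.fundus_encoder = fundus_encoder             # e.g. a RETFound-like ViT
        self.demo_path = nn.Sequential(                  # age + gender path
            nn.Linear(2, 32), nn.ReLU(), nn.Linear(32, 32), nn.ReLU())
        self.fusion = nn.Sequential(                     # joint classifier head
            nn.Linear(fundus_dim + 32, 128), nn.ReLU(), nn.Linear(128, 1))

    def forward(self, fundus, age_gender):
        img_feat = self.fundus_encoder(fundus)           # (B, fundus_dim)
        demo_feat = self.demo_path(age_gender)           # (B, 32)
        fused = torch.cat([img_feat, demo_feat], dim=1)  # concatenate both paths
        return self.fusion(fused)                        # hypertension logit
```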
+
+
+
+
+ + ♻ ☆ Few-shot Object Localization + + +
+ Existing object localization methods are tailored to locate a specific class +of objects, relying on abundant labeled data for model optimization. However, +in numerous real-world scenarios, acquiring large labeled data can be arduous, +significantly constraining the broader application of localization models. To +bridge this research gap, this paper proposes the novel task of Few-Shot Object +Localization (FSOL), which seeks to achieve precise localization with limited +samples available. This task achieves generalized object localization by +leveraging a small number of labeled support samples to query the positional +information of objects within corresponding images. To advance this research +field, we propose an innovative high-performance baseline model. Our model +integrates a dual-path feature augmentation module to enhance shape association +and gradient differences between support and query images, alongside a +self-query module designed to explore the association between feature maps and +query images. Experimental results demonstrate a significant performance +improvement of our approach in the FSOL task, establishing an efficient +benchmark for further research. All codes and data are available at +https://github.com/Ryh1218/FSOL. + 
+
+
+
+
+ + ♻ ☆ Less is More: A Closer Look at Semantic-based Few-Shot Learning + + +
+ Few-shot Learning aims to learn and distinguish new categories with a very +limited number of available images, presenting a significant challenge in the +realm of deep learning. Recent researchers have sought to leverage the +additional textual or linguistic information of these rare categories with a +pre-trained language model to facilitate learning, thus partially alleviating +the problem of insufficient supervision signals. However, the full potential of +the textual information and pre-trained language models has been underestimated +in few-shot learning until now, resulting in limited performance enhancements. +To address this, we propose a simple but effective framework for few-shot +learning tasks, specifically designed to exploit the textual information and +language model. In more detail, we explicitly exploit the zero-shot capability +of the pre-trained language model with a learnable prompt, and we simply add +the visual feature to the textual feature for inference directly, without the +intricately designed fusion modules of previous works. Additionally, we apply +self-ensemble and distillation to further enhance these components. Our +extensive experiments conducted across four widely used few-shot datasets +demonstrate that our simple framework achieves impressive results. Particularly +noteworthy is its outstanding performance in the 1-shot learning task, +surpassing state-of-the-art methods by an average of 3.0\% in classification +accuracy. \footnote{We will make the source codes of the proposed framework +publicly available upon acceptance.} + 
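One plausible reading of the "just add the features" fusion is nearest-prototype classification where each class prototype is the plain sum of a visual prototype and a textual class embedding. The sketch below illustrates that reading only; shapes are assumed and the paper's exact inference procedure may differ.

```python
import torch
import torch.nn.functional as F

def fused_prototype_classify(support_feats, support_labels, text_feats, query_feats):
    """Classify queries against prototypes formed by a direct sum of visual
    and textual features -- no learned fusion module (illustrative sketch).

    support_feats : (S, D) embeddings of the few labeled support images.
    support_labels: (S,)  integer class labels of the support images.
    text_feats    : (C, D) prompt-derived class embeddings.
    query_feats   : (Q, D) embeddings of the query images.
    """
    n_classes = text_feats.shape[0]
    # Visual prototype per class: mean of its (few) support embeddings.
    visual_proto = torch.stack(
        [support_feats[support_labels == c].mean(0) for c in range(n_classes)])
    # Fusion is a plain element-wise addition of the two modalities.
    proto = F.normalize(visual_proto + text_feats, dim=-1)
    query = F.normalize(query_feats, dim=-1)
    return query @ proto.t()          # (Q, C) cosine-similarity logits
```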
+
+
+
+
+ + ♻ ☆ Unsupervised Cross-Domain Image Retrieval via Prototypical Optimal + Transport AAAI2024 + + +
+ Unsupervised cross-domain image retrieval (UCIR) aims to retrieve images +sharing the same category across diverse domains without relying on labeled +data. Prior approaches have typically decomposed the UCIR problem into two +distinct tasks: intra-domain representation learning and cross-domain feature +alignment. However, these segregated strategies overlook the potential +synergies between these tasks. This paper introduces ProtoOT, a novel Optimal +Transport formulation explicitly tailored for UCIR, which integrates +intra-domain feature representation learning and cross-domain alignment into a +unified framework. ProtoOT leverages the strengths of the K-means clustering +method to effectively manage distribution imbalances inherent in UCIR. By +utilizing K-means for generating initial prototypes and approximating class +marginal distributions, we modify the constraints in Optimal Transport +accordingly, significantly enhancing its performance in UCIR scenarios. +Furthermore, we incorporate contrastive learning into the ProtoOT framework to +further improve representation learning. This encourages local semantic +consistency among features with similar semantics, while also explicitly +enforcing separation between features and unmatched prototypes, thereby +enhancing global discriminativeness. ProtoOT surpasses existing +state-of-the-art methods by a notable margin across benchmark datasets. +Notably, on DomainNet, ProtoOT achieves an average P@200 enhancement of 24.44%, +and on Office-Home, it demonstrates a P@15 improvement of 12.12%. Code is +available at https://github.com/HCVLAB/ProtoOT. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models + + +
+ The popularity of pre-trained large models has revolutionized downstream +tasks across diverse fields, such as language, vision, and multi-modality. To +minimize the adaptation cost for downstream tasks, many Parameter-Efficient +Fine-Tuning (PEFT) techniques are proposed for language and 2D image +pre-trained models. However, the specialized PEFT method for 3D pre-trained +models is still under-explored. To this end, we introduce Point-PEFT, a novel +framework for adapting point cloud pre-trained models with minimal learnable +parameters. Specifically, for a pre-trained 3D model, we freeze most of its +parameters, and only tune the newly added PEFT modules on downstream tasks, +which consist of a Point-prior Prompt and a Geometry-aware Adapter. The +Point-prior Prompt adopts a set of learnable prompt tokens, for which we +propose to construct a memory bank with domain-specific knowledge, and utilize +parameter-free attention to enhance the prompt tokens. The Geometry-aware +Adapter aims to aggregate point cloud features within spatial neighborhoods to +capture fine-grained geometric information through local interactions. +Extensive experiments indicate that our Point-PEFT can achieve better +performance than full fine-tuning on various downstream tasks, while using +only 5% of the trainable parameters, demonstrating the efficiency and +effectiveness of our approach. Code is released at +https://github.com/Ivan-Tang-3D/Point-PEFT. + 
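The "freeze the backbone, tune only the inserted modules" recipe is generic and easy to sketch. The keyword-based selection below is an assumption for illustration; Point-PEFT's real module names and structure live in its released code.

```python
import torch.nn as nn

def mark_peft_trainable(model: nn.Module, peft_keywords=("prompt", "adapter")):
    """Freeze a pre-trained backbone and leave only the inserted PEFT
    modules trainable (generic sketch, not the Point-PEFT implementation).
    """
    trainable, frozen = 0, 0
    for name, param in model.named_parameters():
        if any(key in name.lower() for key in peft_keywords):
            param.requires_grad = True     # prompt tokens / adapter weights
            trainable += param.numel()
        else:
            param.requires_grad = False    # frozen backbone weights
            frozen += param.numel()
    print(f"trainable: {trainable:,} / total: {trainable + frozen:,}")
    return model
```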
+
+ comment: A specialized PEFT framework for 3D pre-trained models that + achieves performance competitive with full fine-tuning while significantly + reducing computational resources. Project page: + https://github.com/Ivan-Tang-3D/Point-PEFT
+
+
+
+
+ + ♻ ☆ A Literature Review of Literature Reviews in Pattern Analysis and + Machine Intelligence + + +
+ By consolidating scattered knowledge, the literature review provides a +comprehensive understanding of the investigated topic. However, reading, +conducting, or peer-reviewing review papers generally demands a significant +investment of time and effort from researchers. To improve efficiency, this +paper aims to provide a thorough review of reviews in the PAMI field from +diverse perspectives. First, this paper proposes several article-level, +field-normalized, and large language model-empowered bibliometric indicators to +evaluate reviews. To facilitate this, a meta-data database dubbed RiPAMI, and a +topic dataset are constructed. Second, based on these indicators, the study +presents comparative analyses of representative reviews, unveiling the +characteristics of publications across various fields, periods, and journals. +The newly emerging AI-generated literature reviews are also appraised, and the +observed differences suggest that most AI-generated reviews still lag behind +human-authored reviews in multiple aspects. Third, we briefly provide a +subjective evaluation of representative PAMI reviews and introduce a paper +structure-based typology of literature reviews. This typology may improve the +clarity and effectiveness for scholars in reading and writing reviews, while +also serving as a guide for AI systems in generating well-organized reviews. +Finally, this work offers insights into the current challenges of literature +reviews and envisions future directions for their development. + +
+
+ comment: IEEE version v1. [February 19, 2024] IEEE version v2 with typos + fixed. [February 23, 2024] IEEE version v3 with errors fixed. [February 29, + 2024] IEEE version v4 with improved quality. [February 29, 2024]
+
+
+
+
+ + ♻ ☆ See, Imagine, Plan: Discovering and Hallucinating Tasks from a Single + Image + + +
+ Humans can not only recognize and understand the world in its current state +but also envision future scenarios that extend beyond immediate perception. To +emulate this profound human capacity, we introduce zero-shot task +hallucination -- given a single RGB image of any scene comprising unknown +environments and objects, our model can identify potential tasks and imagine +their execution in a vivid narrative, realized as a video. We develop a modular +pipeline that progressively enhances scene decomposition, comprehension, and +reconstruction, incorporating VLM for dynamic interaction and 3D motion +planning for object trajectories. Our model can discover diverse tasks, with +the generated task videos demonstrating realistic and compelling visual +outcomes that are understandable by both machines and humans. Project Page: +https://dannymcy.github.io/zeroshot_task_hallucination/ + 
+
+ comment: Project Page: https://dannymcy.github.io/zeroshot_task_hallucination/ +
+
+
+
+
+ + ♻ ☆ Improving Online Source-free Domain Adaptation for Object Detection by + Unsupervised Data Acquisition + + +
+ Effective object detection in mobile robots is challenged by deployment in +diverse and unfamiliar environments. Online Source-Free Domain Adaptation +(O-SFDA) offers model adaptation using a stream of unlabeled data from a target +domain in an online manner. However, not all captured frames contain information +that is beneficial for adaptation, particularly when there is a strong class +imbalance. This paper introduces a novel approach to enhance O-SFDA for +adaptive object detection in mobile robots via unsupervised data acquisition. +Our methodology prioritizes the most informative unlabeled frames for inclusion +in the online training process. Empirical evaluation on a real-world dataset +reveals that our method outperforms existing state-of-the-art O-SFDA +techniques, demonstrating the viability of unsupervised data acquisition for +improving adaptive object detection in mobile robots. + 
+
+
+
+
+ + ♻ ☆ Frequency Decoupling for Motion Magnification via Multi-Level Isomorphic + Architecture CVPR2024 + + +
+ Video Motion Magnification (VMM) aims to reveal subtle and imperceptible +motion information of objects in the macroscopic world. Prior methods directly +model the motion field from the Eulerian perspective by Representation Learning +that separates shape and texture or Multi-domain Learning from phase +fluctuations. Inspired by the frequency spectrum, we observe that the +low-frequency components with stable energy always possess spatial structure +and less noise, making them suitable for modeling the subtle motion field. To +this end, we present FD4MM, a new paradigm of Frequency Decoupling for Motion +Magnification with a Multi-level Isomorphic Architecture to capture multi-level +high-frequency details and a stable low-frequency structure (motion field) in +video space. Since high-frequency details and subtle motions are susceptible to +information degradation due to their inherent subtlety and unavoidable external +interference from noise, we carefully design Sparse High/Low-pass Filters to +enhance the integrity of details and motion structures, and a Sparse Frequency +Mixer to promote seamless recoupling. Besides, we innovatively design a +contrastive regularization for this task to strengthen the model's ability to +discriminate irrelevant features, reducing undesired motion magnification. +Extensive experiments on both Real-world and Synthetic Datasets show that our +FD4MM outperforms SOTA methods. Meanwhile, FD4MM reduces FLOPs by 1.63$\times$ +and boosts inference speed by 1.68$\times$ compared with the latest method. Our +code is available at https://github.com/Jiafei127/FD4MM. + 
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ SynerMix: Synergistic Mixup Solution for Enhanced Intra-Class Cohesion + and Inter-Class Separability in Image Classification + + +
+ To address the issues of MixUp and its variants (e.g., Manifold MixUp) in +image classification tasks, namely their neglect of mixing within the same +class (intra-class mixup) and their inadequacy in enhancing intra-class +cohesion through their mixing operations, we propose a novel mixup method named +SynerMix-Intra and, building upon this, introduce a synergistic mixup solution +named SynerMix. SynerMix-Intra specifically targets intra-class mixup to +bolster intra-class cohesion, a feature not addressed by current mixup methods. +For each mini-batch, it leverages feature representations of unaugmented +original images from each class to generate a synthesized feature +representation through random linear interpolation. All synthesized +representations are then fed into the classification and loss layers to +calculate an average classification loss that significantly enhances +intra-class cohesion. Furthermore, SynerMix combines SynerMix-Intra with an +existing mixup approach (e.g., MixUp, Manifold MixUp), which primarily focuses +on inter-class mixup and has the benefit of enhancing inter-class separability. +In doing so, it integrates both inter- and intra-class mixup in a balanced way +while concurrently improving intra-class cohesion and inter-class separability. +Experimental results on six datasets show that SynerMix achieves a 0.1% to +3.43% higher accuracy than the best of either MixUp or SynerMix-Intra alone, +averaging a 1.16% gain. It also surpasses the top-performer of either Manifold +MixUp or SynerMix-Intra by 0.12% to 5.16%, with an average gain of 1.11%. Given +that SynerMix is model-agnostic, it holds significant potential for application +in other domains where mixup methods have shown promise, such as speech and +text classification. Our code is publicly available at: +https://github.com/wxitxy/synermix.git. + 
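The intra-class interpolation step can be sketched directly from the description above. This is an illustration of the idea, not the reference implementation: the per-class mixing scheme and loss weighting are assumptions.

```python
import torch
import torch.nn.functional as F

def intra_class_mixup_loss(features, labels, classifier):
    """For every class in the batch, randomly interpolate between the
    un-augmented features of that class and average the classification loss
    of the synthesized representations (sketch of the SynerMix-Intra idea).
    """
    losses = []
    for c in labels.unique():
        feats = features[labels == c]
        if feats.shape[0] < 2:
            continue  # need at least two samples of a class to interpolate
        perm = torch.randperm(feats.shape[0])
        lam = torch.rand(feats.shape[0], 1, device=feats.device)
        # Random linear interpolation between two shuffles of the class features.
        mixed = lam * feats + (1.0 - lam) * feats[perm]
        target = torch.full((mixed.shape[0],), int(c), device=feats.device)
        losses.append(F.cross_entropy(classifier(mixed), target))
    return torch.stack(losses).mean() if losses else features.new_zeros(())
```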
+
+ comment: 25 pages,12 figures +
+
+
+
+
+ + ♻ ☆ SAM-DA: UAV Tracks Anything at Night with SAM-Powered Domain Adaptation + + +
+ Domain adaptation (DA) has demonstrated significant promise for real-time +nighttime unmanned aerial vehicle (UAV) tracking. However, state-of-the-art +(SOTA) DA still lacks potential objects with accurate pixel-level locations +and boundaries for generating high-quality target-domain training samples. This +key issue constrains the transfer learning of real-time daytime SOTA +trackers for challenging nighttime UAV tracking. Recently, the notable Segment +Anything Model (SAM) has achieved a remarkable zero-shot generalization ability +to discover abundant potential objects due to its huge data-driven training +approach. To solve the aforementioned issue, this work proposes a novel +SAM-powered DA framework for real-time nighttime UAV tracking, i.e., SAM-DA. +Specifically, an innovative SAM-powered target domain training sample swelling +is designed to determine enormous high-quality target domain training samples +from every single raw nighttime image. This novel one-to-many generation +significantly expands the high-quality target domain training samples for DA. +Comprehensive experiments on extensive nighttime UAV videos prove the +robustness and domain adaptability of SAM-DA for nighttime UAV tracking. +Especially, compared to the SOTA DA, SAM-DA can achieve better performance with +fewer raw nighttime images, i.e., the fewer-better training. This economized +training approach facilitates the quick validation and deployment of algorithms +for UAVs. The code is available at https://github.com/vision4robotics/SAM-DA. + 
+
+
+
+
+ + ♻ ☆ SAI3D: Segment Any Instance in 3D Scenes CVPR 2024 + + +
+ Advancements in 3D instance segmentation have traditionally been tethered to +the availability of annotated datasets, limiting their application to a narrow +spectrum of object categories. Recent efforts have sought to harness +vision-language models like CLIP for open-set semantic reasoning, yet these +methods struggle to distinguish between objects of the same categories and rely +on specific prompts that are not universally applicable. In this paper, we +introduce SAI3D, a novel zero-shot 3D instance segmentation approach that +synergistically leverages geometric priors and semantic cues derived from the +Segment Anything Model (SAM). Our method partitions a 3D scene into geometric +primitives, which are then progressively merged into 3D instance segmentations +that are consistent with the multi-view SAM masks. Moreover, we design a +hierarchical region-growing algorithm with a dynamic thresholding mechanism, +which largely improves the robustness of fine-grained 3D scene parsing. +Empirical evaluations on ScanNet, Matterport3D and the more challenging +ScanNet++ datasets demonstrate the superiority of our approach. Notably, SAI3D +outperforms existing open-vocabulary baselines and even surpasses +fully-supervised methods in class-agnostic segmentation on ScanNet++. Our +project page is at https://yd-yin.github.io/SAI3D. + 
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Exploiting Auxiliary Caption for Video Grounding + + +
+ Video grounding aims to locate a moment of interest matching the given query +sentence from an untrimmed video. Previous works ignore the sparsity dilemma +in video annotations, which fails to provide the context information between +potential events and query sentences in the dataset. In this paper, we contend +that exploiting easily available captions which describe general actions, i.e., +auxiliary captions defined in our paper, will significantly boost the +performance. To this end, we propose an Auxiliary Caption Network (ACNet) for +video grounding. Specifically, we first introduce dense video captioning to +generate dense captions and then obtain auxiliary captions by Non-Auxiliary +Caption Suppression (NACS). To capture the potential information in auxiliary +captions, we propose Caption Guided Attention (CGA) to project the semantic +relations between auxiliary captions and query sentences into temporal space +and fuse them into visual representations. Considering the gap between +auxiliary captions and ground truth, we propose Asymmetric Cross-modal +Contrastive Learning (ACCL) for constructing more negative pairs to maximize +cross-modal mutual information. Extensive experiments on three public datasets +(i.e., ActivityNet Captions, TACoS and ActivityNet-CG) demonstrate that our +method significantly outperforms state-of-the-art methods. + 
+
+
+
+
+ + ♻ ☆ InsertNeRF: Instilling Generalizability into NeRF with HyperNet Modules ICLR 2024 + + +
+ Generalizing Neural Radiance Fields (NeRF) to new scenes is a significant +challenge that existing approaches struggle to address without extensive +modifications to the vanilla NeRF framework. We introduce InsertNeRF, a method +for INStilling gEneRalizabiliTy into NeRF. By utilizing multiple plug-and-play +HyperNet modules, InsertNeRF dynamically tailors NeRF's weights to specific +reference scenes, transforming multi-scale sampling-aware features into +scene-specific representations. This novel design allows for more accurate and +efficient representations of complex appearances and geometries. Experiments +show that this method not only achieves superior generalization performance but +also provides a flexible pathway for integration with other NeRF-like systems, +even in sparse input settings. Code will be available at +https://github.com/bbbbby-99/InsertNeRF. + 
+
+ comment: This work was accepted at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Task-Customized Mixture of Adapters for General Image Fusion CVPR 2024 + + +
+ General image fusion aims at integrating important information from +multi-source images. However, due to the significant cross-task gap, the +respective fusion mechanism varies considerably in practice, resulting in +limited performance across subtasks. To handle this problem, we propose a novel +task-customized mixture of adapters (TC-MoA) for general image fusion, +adaptively prompting various fusion tasks in a unified model. We borrow the +insight from the mixture of experts (MoE), taking the experts as efficient +tuning adapters to prompt a pre-trained foundation model. These adapters are +shared across different tasks and constrained by mutual information +regularization, ensuring compatibility with different tasks while preserving +complementarity for multi-source images. The task-specific routing networks +customize these adapters to extract task-specific information from different +sources with dynamic dominant intensity, performing adaptive visual feature +prompt fusion. Notably, our TC-MoA controls the dominant intensity bias for +different fusion tasks, successfully unifying multiple fusion tasks in a single +model. Extensive experiments show that TC-MoA outperforms the competing +approaches in learning commonalities while retaining compatibility for general +image fusion (multi-modal, multi-exposure, and multi-focus), and also +demonstrates striking controllability in further generalization experiments. The +code is available at https://github.com/YangSun22/TC-MoA . + 
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ GPS-Gaussian: Generalizable Pixel-wise 3D Gaussian Splatting for + Real-time Human Novel View Synthesis CVPR 2024 + + +
+ We present a new approach, termed GPS-Gaussian, for synthesizing novel views +of a character in a real-time manner. The proposed method enables 2K-resolution +rendering under a sparse-view camera setting. Unlike the original Gaussian +Splatting or neural implicit rendering methods that necessitate per-subject +optimizations, we introduce Gaussian parameter maps defined on the source views +and directly regress Gaussian Splatting properties for instant novel view +synthesis without any fine-tuning or optimization. To this end, we train our +Gaussian parameter regression module on a large amount of human scan data, +jointly with a depth estimation module to lift 2D parameter maps to 3D space. +The proposed framework is fully differentiable and experiments on several +datasets demonstrate that our method outperforms state-of-the-art methods while +achieving a much higher rendering speed. + 
+
+ comment: Accepted by CVPR 2024. Project page: + https://shunyuanzheng.github.io/GPS-Gaussian +
+
+
+
+
+ + ♻ ☆ EHRDiff: Exploring Realistic EHR Synthesis with Diffusion Models + + +
+ Electronic health records (EHR) contain a wealth of biomedical information, +serving as valuable resources for the development of precision medicine +systems. However, privacy concerns have resulted in limited access to +high-quality and large-scale EHR data for researchers, impeding progress in +methodological development. Recent research has delved into synthesizing +realistic EHR data through generative modeling techniques, where a majority of +proposed methods relied on generative adversarial networks (GAN) and their +variants for EHR synthesis. Despite GAN-based methods attaining +state-of-the-art performance in generating EHR data, these approaches are +difficult to train and prone to mode collapse. Recently introduced in +generative modeling, diffusion models have established cutting-edge performance +in image generation, but their efficacy in EHR data synthesis remains largely +unexplored. In this study, we investigate the potential of diffusion models for +EHR data synthesis and introduce a novel method, EHRDiff. Through extensive +experiments, EHRDiff establishes new state-of-the-art quality for synthetic EHR +data while protecting private information. + 
+
+ comment: Accepted by TMLR, preprint of camera-ready version +
+
+
+
+
+ + ♻ ☆ A Large-Scale Empirical Study on Improving the Fairness of Image + Classification Models ISSTA 2024 + + +
+ Fairness has been a critical issue that affects the adoption of deep learning +models in real practice. To improve model fairness, many existing methods have +been proposed and shown to be effective in their own contexts. However, +there is still no systematic evaluation among them for a comprehensive +comparison under the same context, which makes it hard to understand the +performance distinction among them, hindering their research progress and +practical adoption. To fill this gap, this paper endeavours to conduct +the first large-scale empirical study to comprehensively compare the +performance of existing state-of-the-art fairness-improving techniques. +Specifically, we target the widely-used application scenario of image +classification, and utilize three different datasets and five commonly-used +performance metrics to assess a total of 13 methods from diverse categories. Our +findings reveal substantial variations in the performance of each method across +different datasets and sensitive attributes, indicating over-fitting on +specific datasets by many existing methods. Furthermore, different fairness +evaluation metrics, due to their distinct focuses, yield significantly +different assessment results. Overall, we observe that pre-processing methods +and in-processing methods outperform post-processing methods, with +pre-processing methods exhibiting the best performance. Our empirical study +offers comprehensive recommendations for enhancing fairness in deep learning +models. We approach the problem from multiple dimensions, aiming to provide a +uniform evaluation platform and inspire researchers to explore more effective +fairness solutions via a set of implications. + 
+
+ comment: Accepted by the 33rd ACM SIGSOFT International Symposium on Software + Testing and Analysis (ISSTA 2024). Please include ISSTA in any citations +
+
+
+
+
+ + ♻ ☆ A Number Sense as an Emergent Property of the Manipulating Brain + + +
+ The ability to understand and manipulate numbers and quantities emerges +during childhood, but the mechanism through which humans acquire and develop +this ability is still poorly understood. We explore this question through a +model, assuming that the learner is able to pick up and place small objects +from, and to, locations of its choosing, and will spontaneously engage in such +undirected manipulation. We further assume that the learner's visual system +will monitor the changing arrangements of objects in the scene and will learn +to predict the effects of each action by comparing perception with a +supervisory signal from the motor system. We model perception using standard +deep networks for feature extraction and classification, and gradient descent +learning. Our main finding is that, from learning the task of action +prediction, an unexpected image representation emerges exhibiting regularities +that foreshadow the perception and representation of numbers and quantity. +These include distinct categories for zero and the first few natural numbers, a +strict ordering of the numbers, and a one-dimensional signal that correlates +with numerical quantity. As a result, our model acquires the ability to +estimate numerosity, i.e. the number of objects in the scene, as well as +subitization, i.e. the ability to recognize at a glance the exact number of +objects in small scenes. Remarkably, subitization and numerosity estimation +extrapolate to scenes containing many objects, far beyond the three objects +used during training. We conclude that important aspects of a facility with +numbers and quantities may be learned with supervision from a simple +pre-training task. Our observations suggest that cross-modal learning is a +powerful learning mechanism that may be harnessed in artificial intelligence. + +
+
+ comment: 16 pages, 5 figures, 15 supplemental figures +
+
+
+
+
+ + ♻ ☆ To Generate or Not? Safety-Driven Unlearned Diffusion Models Are Still + Easy To Generate Unsafe Images ... For Now + + +
+ The recent advances in diffusion models (DMs) have revolutionized the +generation of realistic and complex images. However, these models also +introduce potential safety hazards, such as producing harmful content and +infringing data copyrights. Despite the development of safety-driven unlearning +techniques to counteract these challenges, doubts about their efficacy persist. +To tackle this issue, we introduce an evaluation framework that leverages +adversarial prompts to discern the trustworthiness of these safety-driven DMs +after they have undergone the process of unlearning harmful concepts. +Specifically, we investigated the adversarial robustness of DMs, assessed by +adversarial prompts, when eliminating unwanted concepts, styles, and objects. +We develop an effective and efficient adversarial prompt generation approach +for DMs, termed UnlearnDiffAtk. This method capitalizes on the intrinsic +classification abilities of DMs to simplify the creation of adversarial +prompts, thereby eliminating the need for auxiliary classification or diffusion +models. Through extensive benchmarking, we evaluate the robustness of five +widely-used safety-driven unlearned DMs (i.e., DMs after unlearning undesirable +concepts, styles, or objects) across a variety of tasks. Our results +demonstrate the effectiveness and efficiency merits of UnlearnDiffAtk over the +state-of-the-art adversarial prompt generation method and reveal the lack of +robustness of current safety-driven unlearning techniques when applied to DMs. +Codes are available at https://github.com/OPTML-Group/Diffusion-MU-Attack. +WARNING: This paper contains model outputs that may be offensive in nature. + 
+
+ comment: Codes are available at + https://github.com/OPTML-Group/Diffusion-MU-Attack +
+
+
+
+
+ + ♻ ☆ Multi-view Deep Subspace Clustering Networks + + +
+ Multi-view subspace clustering aims to discover the inherent structure of +data by fusing multiple views of complementary information. Most existing +methods first extract multiple types of handcrafted features and then learn a +joint affinity matrix for clustering. The disadvantage of this approach lies in +two aspects: 1) multi-view relations are not embedded into feature learning, +and 2) the end-to-end learning manner of deep learning is not suitable for +multi-view clustering. Even when deep features have been extracted, it is a +nontrivial problem to choose a proper backbone for clustering on different +datasets. To address these issues, we propose the Multi-view Deep Subspace +Clustering Networks (MvDSCN), which learns a multi-view self-representation +matrix in an end-to-end manner. The MvDSCN consists of two sub-networks, i.e., a +diversity network (Dnet) and a universality network (Unet). A latent space is +built using deep convolutional autoencoders, and a self-representation matrix +is learned in the latent space using a fully connected layer. Dnet learns +view-specific self-representation matrices, whereas Unet learns a common +self-representation matrix for all views. To exploit the complementarity of +multi-view representations, the Hilbert--Schmidt independence criterion (HSIC) +is introduced as a diversity regularizer that captures the nonlinear, +high-order inter-view relations. Because different views share the same label +space, the self-representation matrices of each view are aligned to the common +one by universality regularization. The MvDSCN also unifies multiple backbones +to boost clustering performance and avoid the need for model selection. +Experiments demonstrate the superiority of the MvDSCN. + 
+
+ comment: Accepted by T-CYB +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 73 + +
+
+
+ + ☆ Finding needles in a haystack: A Black-Box Approach to Invisible + Watermark Detection + + +
+ In this paper, we propose WaterMark Detection (WMD), the first invisible +watermark detection method under a black-box and annotation-free setting. WMD +is capable of detecting arbitrary watermarks within a given reference dataset +using a clean non-watermarked dataset as a reference, without relying on +specific decoding methods or prior knowledge of the watermarking techniques. We +develop WMD using foundations of offset learning, where a clean non-watermarked +dataset enables us to isolate the influence of only watermarked samples in the +reference dataset. Our comprehensive evaluations demonstrate the effectiveness +of WMD, significantly outperforming naive detection methods, which only yield +AUC scores around 0.5. In contrast, WMD consistently achieves impressive +detection AUC scores, surpassing 0.9 in most single-watermark datasets and +exceeding 0.7 in more challenging multi-watermark scenarios across diverse +datasets and watermarking methods. As invisible watermarks become increasingly +prevalent, while specific decoding techniques remain undisclosed, our approach +provides a versatile solution and establishes a path toward increasing +accountability, transparency, and trust in our digital visual content. + +
+
+
+
+
+ + ☆ IllusionVQA: A Challenging Optical Illusion Dataset for Vision Language + Models + + +
+ The advent of Vision Language Models (VLM) has allowed researchers to +investigate the visual understanding of a neural network using natural +language. Beyond object classification and detection, VLMs are capable of +visual comprehension and common-sense reasoning. This naturally led to the +question: How do VLMs respond when the image itself is inherently unreasonable? +To this end, we present IllusionVQA: a diverse dataset of challenging optical +illusions and hard-to-interpret scenes to test the capability of VLMs in two +distinct multiple-choice VQA tasks - comprehension and soft localization. +GPT4V, the best-performing VLM, achieves 62.99% accuracy (4-shot) on the +comprehension task and 49.7% on the localization task (4-shot and +Chain-of-Thought). Human evaluation reveals that humans achieve 91.03% and 100% +accuracy in comprehension and localization. We discover that In-Context +Learning (ICL) and Chain-of-Thought reasoning substantially degrade the +performance of GeminiPro on the localization task. Tangentially, we discover a +potential weakness in the ICL capabilities of VLMs: they fail to locate optical +illusions even when the correct answer is in the context window as a few-shot +example. + +
+
+
+
+
+ + ☆ MapTracker: Tracking with Strided Memory Fusion for Consistent Vector HD + Mapping + + +
+ This paper presents a vector HD-mapping algorithm that formulates the mapping +as a tracking task and uses a history of memory latents to ensure consistent +reconstructions over time. Our method, MapTracker, accumulates a sensor stream +into memory buffers of two latent representations: 1) Raster latents in the +bird's-eye-view (BEV) space and 2) Vector latents over the road elements (i.e., +pedestrian-crossings, lane-dividers, and road-boundaries). The approach borrows +the query propagation paradigm from the tracking literature that explicitly +associates tracked road elements from the previous frame to the current, while +fusing a subset of memory latents selected with distance strides to further +enhance temporal consistency. A vector latent is decoded to reconstruct the +geometry of a road element. The paper further makes benchmark contributions by +1) Improving processing code for existing datasets to produce consistent ground +truth with temporal alignments and 2) Augmenting existing mAP metrics with +consistency checks. MapTracker significantly outperforms existing methods on +both the nuScenes and Argoverse 2 datasets by over 8% and 19% on the conventional +and the new consistency-aware metrics, respectively. The code will be available +on our project page: https://map-tracker.github.io. + 
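The strided selection of past memory latents can be illustrated in a few lines. The stride set below is an assumption for illustration; the actual strides used by MapTracker are a detail of its released code.

```python
def strided_memory_indices(current_frame, strides=(1, 2, 4, 8, 16)):
    """Indices of past frames whose memory latents are fused with the
    current frame, selected with growing distance strides (sketch only).
    """
    return [max(current_frame - s, 0) for s in strides]

# e.g. at frame 20 this fuses memories from frames 19, 18, 16, 12 and 4.
print(strided_memory_indices(20))
```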
+
+ comment: Project page: https://map-tracker.github.io +
+
+
+
+
+ + ☆ Deep Domain Adaptation: A Sim2Real Neural Approach for Improving + Eye-Tracking Systems + + +
+ Eye image segmentation is a critical step in eye tracking that has great +influence over the final gaze estimate. Segmentation models trained using +supervised machine learning can excel at this task, but their effectiveness is +determined by the degree of overlap between the narrow distributions of image +properties defined by the target dataset and highly specific training datasets, +of which there are few. Attempts to broaden the distribution of existing eye +image datasets through the inclusion of synthetic eye images have found that a +model trained on synthetic images will often fail to generalize back to +real-world eye images. As a remedy, we use dimensionality-reduction techniques to +measure the overlap between the target eye images and synthetic training data, +and to prune the training dataset in a manner that maximizes distribution +overlap. We demonstrate that our methods result in robust, improved performance +when tackling the discrepancy between simulation and real-world data samples. + 
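One simple way to realize overlap-based pruning is to embed both sets in a low-dimensional space and keep only the synthetic samples closest to the real distribution. The sketch below uses PCA and a nearest-neighbor distance as stand-ins; the paper's exact embedding and pruning criterion may differ.

```python
import numpy as np
from sklearn.decomposition import PCA

def prune_synthetic(real_feats, synth_feats, keep_fraction=0.5, n_components=16):
    """Keep synthetic samples lying close to the real-image distribution in a
    low-dimensional space (illustrative sketch of overlap-based pruning).
    """
    pca = PCA(n_components=n_components).fit(real_feats)
    real_z = pca.transform(real_feats)
    synth_z = pca.transform(synth_feats)
    # Distance of each synthetic sample to its nearest real sample.
    dists = np.linalg.norm(
        synth_z[:, None, :] - real_z[None, :, :], axis=-1).min(axis=1)
    n_keep = int(keep_fraction * len(synth_feats))
    keep_idx = np.argsort(dists)[:n_keep]   # closest = most "real-like"
    return keep_idx
```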
+
+ comment: 14 pages, 8 figures, accepted to ETRA 2024 +
+
+
+
+
+ + ☆ Adaptive Super Resolution For One-Shot Talking-Head Generation + + +
+ The one-shot talking-head generation learns to synthesize a talking-head +video with one source portrait image under the driving of the same or a different +identity video. Usually these methods require plane-based pixel transformations +via Jacobian matrices or facial image warps for novel pose generation. The +constraints of using a single image source and pixel displacements often +compromise the clarity of the synthesized images. Some methods try to improve +the quality of synthesized videos by introducing additional super-resolution +modules, but this will undoubtedly increase computational consumption and +destroy the original data distribution. In this work, we propose an adaptive +high-quality talking-head video generation method, which synthesizes +high-resolution video without additional pre-trained modules. Specifically, +inspired by existing super-resolution methods, we down-sample the one-shot +source image, and then adaptively reconstruct high-frequency details via an +encoder-decoder module, resulting in enhanced video clarity. Our method +consistently improves the quality of generated videos through a straightforward +yet effective strategy, substantiated by quantitative and qualitative +evaluations. The code and demo video are available on: +\url{https://github.com/Songluchuan/AdaSR-TalkingHead/}. + 
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ☆ Feature Manipulation for DDPM based Change Detection + + +
+ Change Detection is a classic task of computer vision that receives a +bi-temporal image pair as input and separates the semantically changed and +unchanged regions of it. The diffusion model is used in image synthesis and as +a feature extractor and has been applied to various downstream tasks. Using +this, a feature map is extracted from a diffusion model pre-trained on a +large-scale dataset, and changes are detected through an additional network. +However, the current diffusion-based change detection approach focuses +only on extracting a good feature map using the diffusion model, obtaining and +using differences without further adjustment to the created feature map. Our +method focuses on manipulating the feature map extracted from the Diffusion +Model to be more semantically useful, and for this, we propose two methods: +Feature Attention and FDAF. Our model with Feature Attention achieved a +state-of-the-art F1 score (90.18) and IoU (83.86) on the LEVIR-CD dataset. + 
+
+ comment: This paper has been accepted by the 2024 5th International Conference + on Computer Vision, Image and Deep Learning +
+
+
+
+
+ + ☆ Explore until Confident: Efficient Exploration for Embodied Question + Answering + + +
+ We consider the problem of Embodied Question Answering (EQA), which refers to +settings where an embodied agent such as a robot needs to actively explore an +environment to gather information until it is confident about the answer to a +question. In this work, we leverage the strong semantic reasoning capabilities +of large vision-language models (VLMs) to efficiently explore and answer such +questions. However, there are two main challenges when using VLMs in EQA: they +do not have an internal memory for mapping the scene to be able to plan how to +explore over time, and their confidence can be miscalibrated and can cause the +robot to prematurely stop exploration or over-explore. We propose a method that +first builds a semantic map of the scene based on depth information and via +visual prompting of a VLM - leveraging its vast knowledge of relevant regions +of the scene for exploration. Next, we use conformal prediction to calibrate +the VLM's question answering confidence, allowing the robot to know when to +stop exploration - leading to a more calibrated and efficient exploration +strategy. To test our framework in simulation, we also contribute a new EQA +dataset with diverse, realistic human-robot scenarios and scenes built upon the +Habitat-Matterport 3D Research Dataset (HM3D). Both simulated and real robot +experiments show our proposed approach improves the performance and efficiency +over baselines that do not leverage the VLM for exploration or do not calibrate +its confidence. Webpage with experiment videos and code: +https://explore-eqa.github.io/ + 
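A calibrated stopping rule can be built from held-out question-answering data. The sketch below is one simplified, split-conformal-style construction of such a threshold; the paper's exact calibration procedure differs in detail, and the nonconformity score here is an assumption.

```python
import numpy as np

def confidence_threshold(cal_scores, cal_correct, alpha=0.1):
    """Calibrate a stop threshold from calibration questions (simplified sketch;
    assumes at least one correctly answered calibration question).

    cal_scores  : (N,) VLM confidence for its answer on calibration questions.
    cal_correct : (N,) 1 if that answer was correct, else 0.
    """
    scores = np.asarray(cal_scores, dtype=float)
    correct = np.asarray(cal_correct, dtype=bool)
    # Nonconformity: low confidence on a correct answer counts as "bad".
    nonconf = 1.0 - scores[correct]
    n = len(nonconf)
    level = min(1.0, np.ceil((n + 1) * (1 - alpha)) / n)
    return 1.0 - np.quantile(nonconf, level)

def should_stop(vlm_confidence, threshold):
    """Stop exploring once the VLM's answer confidence clears the threshold."""
    return vlm_confidence >= threshold
```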
+
+ comment: Under review +
+
+
+
+
+ + ☆ X-Portrait: Expressive Portrait Animation with Hierarchical Motion + Attention + + +
+ We propose X-Portrait, an innovative conditional diffusion model tailored for +generating expressive and temporally coherent portrait animation. Specifically, +given a single portrait as appearance reference, we aim to animate it with +motion derived from a driving video, capturing both highly dynamic and subtle +facial expressions along with wide-range head movements. At its core, we +leverage the generative prior of a pre-trained diffusion model as the rendering +backbone, while achieving fine-grained head pose and expression control with +novel controlling signals within the framework of ControlNet. In contrast to +conventional coarse explicit controls such as facial landmarks, our motion +control module is learned to interpret the dynamics directly from the original +driving RGB inputs. The motion accuracy is further enhanced with a patch-based +local control module that effectively enhances the motion attention to +small-scale nuances like eyeball positions. Notably, to mitigate the identity +leakage from the driving signals, we train our motion control modules with +scaling-augmented cross-identity images, ensuring maximized disentanglement +from the appearance reference modules. Experimental results demonstrate the +universal effectiveness of X-Portrait across a diverse range of facial +portraits and expressive driving sequences, and showcase its proficiency in +generating captivating portrait animations with consistently maintained +identity characteristics. + 
+
+
+
+
+ + ☆ An Embarrassingly Simple Defense Against Backdoor Attacks On SSL + + +
+ Self Supervised Learning (SSL) has emerged as a powerful paradigm to tackle +data landscapes with absence of human supervision. The ability to learn +meaningful tasks without the use of labeled data makes SSL a popular method to +manage large chunks of data in the absence of labels. However, recent work +indicates SSL to be vulnerable to backdoor attacks, wherein models can be +controlled, possibly maliciously, to suit an adversary's motives. Li et al. +(2022) introduce a novel frequency-based backdoor attack: CTRL. They show that +CTRL can be used to efficiently and stealthily gain control over a victim's +model trained using SSL. In this work, we devise two defense strategies against +frequency-based attacks in SSL: one applicable before model training and the +second to be applied during model inference. Our first contribution utilizes +the invariance property of the downstream task to defend against backdoor +attacks in a generalizable fashion. We observe the ASR (Attack Success Rate) to +reduce by over 60% across experiments. Our inference-time defense relies on the +evasiveness of the attack and uses the luminance channel to defend against +attacks. Using object classification as the downstream task for SSL, we +demonstrate successful defense strategies that do not require re-training of +the model. Code is available at https://github.com/Aryan-Satpathy/Backdoor. + 
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Towards Low-Energy Adaptive Personalization for Resource-Constrained + Devices + + +
+ The personalization of machine learning (ML) models to address data drift is +a significant challenge in the context of Internet of Things (IoT) +applications. Presently, most approaches focus on fine-tuning either the full +base model or its last few layers to adapt to new data, while often neglecting +energy costs. However, various types of data drift exist, and fine-tuning the +full base model or the last few layers may not result in optimal performance in +certain scenarios. We propose Target Block Fine-Tuning (TBFT), a low-energy +adaptive personalization framework designed for resource-constrained devices. +We categorize data drift and personalization into three types: input-level, +feature-level, and output-level. For each type, we fine-tune different blocks +of the model to achieve optimal performance with reduced energy costs. +Specifically, input-, feature-, and output-level correspond to fine-tuning the +front, middle, and rear blocks of the model. We evaluate TBFT on a ResNet +model, three datasets, three different training sizes, and a Raspberry Pi. +Compared with $Block Avg$, where each block is fine-tuned individually and the +performance improvements are averaged, TBFT improves model accuracy by an +average of 15.30%, while saving 41.57% of the energy consumption on average +compared with full fine-tuning. + 
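The drift-type-to-block mapping can be sketched on a standard torchvision ResNet. The particular grouping of layers into front, middle, and rear blocks below is an assumption for illustration, not the split used in the paper.

```python
import torch.nn as nn
from torchvision.models import resnet18

def select_target_block(model: nn.Module, drift_type: str):
    """Freeze the whole model, then unfreeze only the block matched to the
    drift type (illustrative sketch; the layer grouping is an assumption).
    """
    blocks = {
        "input":   [model.conv1, model.bn1, model.layer1],   # front block
        "feature": [model.layer2, model.layer3],              # middle block
        "output":  [model.layer4, model.fc],                  # rear block
    }
    for p in model.parameters():          # freeze everything first
        p.requires_grad = False
    params = []
    for module in blocks[drift_type]:
        for p in module.parameters():
            p.requires_grad = True        # unfreeze only the target block
            params.append(p)
    return params

# Example: fine-tune only the rear block for output-level drift.
model = resnet18(weights=None)
trainable = select_target_block(model, "output")
```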
+
+ comment: Accepetd to The 4th Workshop on Machine Learning and Systems + (EuroMLSys '24) +
+
+
+
+
+ + ☆ MatchSeg: Towards Better Segmentation via Reference Image Matching + + +
+ Recently, automated medical image segmentation methods based on deep learning +have achieved great success. However, they heavily rely on large annotated +datasets, which are costly and time-consuming to acquire. Few-shot learning +aims to overcome the need for annotated data by using a small labeled dataset, +known as a support set, to guide predicting labels for new, unlabeled images, +known as the query set. Inspired by this paradigm, we introduce MatchSeg, a +novel framework that enhances medical image segmentation through strategic +reference image matching. We leverage contrastive language-image pre-training +(CLIP) to select highly relevant samples when defining the support set. +Additionally, we design a joint attention module to strengthen the interaction +between support and query features, facilitating a more effective knowledge +transfer between support and query sets. We validated our method across four +public datasets. Experimental results demonstrate superior segmentation +performance and powerful domain generalization ability of MatchSeg against +existing methods for domain-specific and cross-domain segmentation tasks. Our +code is made available at https://github.com/keeplearning-again/MatchSeg + +
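Support-set selection by embedding similarity is the part of the pipeline that is easiest to sketch. The snippet below is only an illustration of picking the most relevant labeled candidates for a query in a CLIP-style embedding space; MatchSeg's actual selection and joint attention modules are in the released code.

```python
import torch
import torch.nn.functional as F

def select_support(query_emb, candidate_embs, k=5):
    """Pick the k candidates most similar to the query (illustrative sketch).

    query_emb      : (D,) image embedding of the query.
    candidate_embs : (N, D) embeddings of the labeled candidate pool.
    """
    q = F.normalize(query_emb, dim=-1)
    c = F.normalize(candidate_embs, dim=-1)
    sims = c @ q                         # cosine similarity to the query
    return torch.topk(sims, k).indices   # indices of the selected support set
```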
+
+
+
+
+ + ☆ Human Motion Prediction under Unexpected Perturbation + + +
+ We investigate a new task in human motion prediction, which is predicting +motions under unexpected physical perturbation potentially involving multiple +people. Compared with existing research, this task involves predicting less +controlled, unpremeditated and purely reactive motions in response to external +impact and how such motions can propagate through people. It brings new +challenges such as data scarcity and predicting complex interactions. To this +end, we propose a new method capitalizing on differential physics and deep neural +networks, leading to an explicit Latent Differential Physics (LDP) model. +Through experiments, we demonstrate that LDP has high data efficiency, +outstanding prediction accuracy, strong generalizability and good +explainability. Since there is no similar research, a comprehensive comparison +with 11 adapted baselines from several relevant domains is conducted, showing +LDP outperforming existing research both quantitatively and qualitatively, +improving prediction accuracy by as much as 70%, and demonstrating +significantly stronger generalization. + 
+
+
+
+
+ + ☆ Diffusion-based Aesthetic QR Code Generation via Scanning-Robust + Perceptual Guidance + + +
+ QR codes, prevalent in daily applications, lack visual appeal due to their +conventional black-and-white design. Integrating aesthetics while maintaining +scannability poses a challenge. In this paper, we introduce a novel +diffusion-model-based aesthetic QR code generation pipeline, utilizing a +pre-trained ControlNet and guided iterative refinement via a novel classifier +guidance (SRG) based on the proposed Scanning-Robust Loss (SRL) tailored to +QR code mechanisms, which ensures both aesthetics and scannability. To further +improve the scannability while preserving aesthetics, we propose a two-stage +pipeline with Scanning-Robust Perceptual Guidance (SRPG). Moreover, we can +further enhance the scannability of the generated QR code by post-processing it +through the proposed Scanning-Robust Projected Gradient Descent (SRPGD) +post-processing technique based on SRL with proven convergence. With extensive +quantitative, qualitative, and subjective experiments, the results demonstrate +that the proposed approach can generate diverse aesthetic QR codes with +flexibility in detail. In addition, our pipeline outperforms existing models +in terms of Scanning Success Rate (SSR), reaching 86.67% (+40%) with comparable +aesthetic scores. The pipeline combined with SRPGD further achieves 96.67% +(+50%). Our code will be available at https://github.com/jwliao1209/DiffQRCode. + 
+
+
+
+
+ + ☆ Cognitive resilience: Unraveling the proficiency of image-captioning + models to interpret masked visual content ICLR 2024 + + +
+ This study explores the ability of Image Captioning (IC) models to decode +masked visual content sourced from diverse datasets. Our findings reveal the IC +model's capability to generate captions from masked images, closely resembling +the original content. Notably, even in the presence of masks, the model adeptly +crafts descriptive textual information that goes beyond what is observable in +the original image-generated captions. While the decoding performance of the IC +model experiences a decline with an increase in the masked region's area, the +model still performs well when important regions of the image are not masked at +high coverage. + +
+
+ comment: Accepted as tiny paper in ICLR 2024 +
+
+
+
+
+ + ☆ An edge detection-based deep learning approach for tear meniscus height + measurement + + +
+ Automatic measurements of tear meniscus height (TMH) have been achieved by +using deep learning techniques; however, annotation is significantly influenced +by subjective factors and is both time-consuming and labor-intensive. In this +paper, we introduce an automatic TMH measurement technique based on edge +detection-assisted annotation within a deep learning framework. This method +generates mask labels less affected by subjective factors with enhanced +efficiency compared to previous annotation approaches. For improved +segmentation of the pupil and tear meniscus areas, the convolutional neural +network Inceptionv3 was first implemented as an image quality assessment model, +effectively identifying higher-quality images with an accuracy of 98.224%. +Subsequently, by using the generated labels, various algorithms, including +Unet, ResUnet, Deeplabv3+FcnResnet101, Deeplabv3+FcnResnet50, FcnResnet50, and +FcnResnet101 were trained, with Unet demonstrating the best performance. +Finally, Unet was used for automatic pupil and tear meniscus segmentation to +locate the center of the pupil and calculate TMH, respectively. An evaluation of +the mask quality predicted by Unet indicated a Mean Intersection over Union of +0.9362, a recall of 0.9261, a precision of 0.9423, and an F1-Score of 0.9326. +Additionally, the TMH predicted by the model was assessed, with the fitting +curve represented as y = 0.982x - 0.862, an overall correlation coefficient of +r^2 = 0.961, and an accuracy of 94.80% (237/250). In summary, the algorithm can +automatically screen images based on their quality, segment the pupil and tear +meniscus areas, and automatically measure TMH. Measurement results using the AI +algorithm demonstrate a high level of consistency with manual measurements, +offering significant support to clinical doctors in diagnosing dry eye disease. + 
+
+ comment: 22 pages, 5 figures +
+
+
+
+
+ + ☆ Inpainting-Driven Mask Optimization for Object Removal IJCNN 2024 + + +
+ This paper proposes a mask optimization method for improving the quality of +object removal using image inpainting. While many inpainting methods are +trained with a set of random masks, a target for inpainting may be an object, +such as a person, in many realistic scenarios. This domain gap between masks in +training and inference images increases the difficulty of the inpainting task. +In our method, this domain gap is resolved by training the inpainting network +with object masks extracted by segmentation, and such object masks are also +used in the inference step. Furthermore, to optimize the object masks for +inpainting, the segmentation network is connected to the inpainting network and +end-to-end trained to improve the inpainting performance. The effect of this +end-to-end training is further enhanced by our mask expansion loss for +achieving the trade-off between large and small masks. Experimental results +demonstrate the effectiveness of our method for better object removal using +image inpainting. + +
+
+ comment: Accepted to IJCNN 2024 (International Joint Conference on Neural + Networks) +
+
+
+
+
+ + ☆ Centered Masking for Language-Image Pre-Training + + +
+ We introduce Gaussian masking for Language-Image Pre-Training (GLIP), a novel, +straightforward, and effective technique for masking image patches during +pre-training of a vision-language model. GLIP builds on Fast Language-Image +Pre-Training (FLIP), which randomly masks image patches while training a CLIP +model. GLIP replaces random masking with centered masking, which uses a Gaussian +distribution and is inspired by the importance of image patches at the center +of the image. GLIP retains the same computational savings as FLIP, while +improving performance across a range of downstream datasets and tasks, as +demonstrated by our experimental results. We show the benefits of GLIP to be +easy to obtain, requiring no delicate tuning of the Gaussian, and also +applicable to data sets containing images without an obvious center focus. + +
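+
+ A minimal numpy sketch of centered masking under stated assumptions: the grid
+ size, the Gaussian scale sigma, and the sample-without-replacement scheme are
+ illustrative choices, not the paper's exact settings.
+
+ import numpy as np
+
+ def centered_mask(grid=14, keep_ratio=0.25, sigma=0.35, rng=np.random.default_rng(0)):
+     # keep probability is highest for patches near the image centre and
+     # falls off with a Gaussian in normalized patch coordinates
+     ys, xs = np.mgrid[0:grid, 0:grid]
+     cy = cx = (grid - 1) / 2.0
+     dist2 = ((ys - cy) ** 2 + (xs - cx) ** 2) / grid ** 2
+     weights = np.exp(-dist2 / (2 * sigma ** 2)).ravel()
+     weights /= weights.sum()
+     n_keep = int(keep_ratio * grid * grid)
+     keep_idx = rng.choice(grid * grid, size=n_keep, replace=False, p=weights)
+     mask = np.zeros(grid * grid, dtype=bool)
+     mask[keep_idx] = True          # True = patch is kept (fed to the encoder)
+     return mask.reshape(grid, grid)
+
+ print(centered_mask().sum(), "patches kept out of", 14 * 14)
+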
+
+
+
+
+ + ☆ VLM-CPL: Consensus Pseudo Labels from Vision-Language Models for Human + Annotation-Free Pathological Image Classification + + +
+ Although deep learning methods have achieved remarkable performance in +pathology image classification, they heavily rely on labeled data, demanding +extensive human annotation efforts. In this study, we present a novel human +annotation-free method for pathology image classification by leveraging +pre-trained Vision-Language Models (VLMs). Without human annotation, pseudo +labels of the training set are obtained by utilizing the zero-shot inference +capabilities of VLM, which may contain considerable noise due to the domain shift +between the pre-training data and the target dataset. To address this issue, we +introduce VLM-CPL, a novel approach based on consensus pseudo labels that +integrates two noisy label filtering techniques with a semi-supervised learning +strategy. Specifically, we first obtain prompt-based pseudo labels with +uncertainty estimation by zero-shot inference with the VLM using multiple +augmented views of an input. Then, by leveraging the feature representation +ability of VLM, we obtain feature-based pseudo labels via sample clustering in +the feature space. Prompt-feature consensus is introduced to select reliable +samples based on the consensus between the two types of pseudo labels. By +rejecting low-quality pseudo labels, we further propose High-confidence Cross +Supervision (HCS) to learn from samples with reliable pseudo labels and the +remaining unlabeled samples. Experimental results showed that our method +obtained an accuracy of 87.1% and 95.1% on the HPH and LC25K datasets, +respectively, and it largely outperformed existing zero-shot classification and +noisy label learning methods. The code is available at +https://github.com/lanfz2000/VLM-CPL. + +
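+
+ An illustrative sketch of the consensus idea under stated assumptions: the
+ majority vote over views, the KMeans clustering, and the cluster-naming rule
+ are stand-ins, not the authors' exact VLM-CPL procedure.
+
+ import numpy as np
+ from sklearn.cluster import KMeans
+
+ def consensus_pseudo_labels(view_logits, features, n_classes, rng_seed=0):
+     # view_logits: (N, V, C) zero-shot logits over V augmented views
+     # features:    (N, D) image embeddings from the same VLM
+     # 1) prompt-based pseudo labels: majority vote over augmented views
+     view_preds = view_logits.argmax(-1)
+     prompt_labels = np.array([np.bincount(v, minlength=n_classes).argmax()
+                               for v in view_preds])
+     # 2) feature-based pseudo labels: cluster, then name each cluster by the
+     #    majority prompt label of its members
+     clusters = KMeans(n_clusters=n_classes, n_init=10,
+                       random_state=rng_seed).fit_predict(features)
+     cluster_to_label = {c: np.bincount(prompt_labels[clusters == c],
+                                        minlength=n_classes).argmax()
+                         for c in range(n_classes)}
+     feature_labels = np.array([cluster_to_label[c] for c in clusters])
+     # 3) prompt-feature consensus: keep samples where the two labels agree
+     reliable = np.where(prompt_labels == feature_labels)[0]
+     return reliable, prompt_labels[reliable]
+
+ # toy usage with random logits and embeddings
+ rng = np.random.default_rng(0)
+ idx, labels = consensus_pseudo_labels(rng.normal(size=(100, 5, 4)),
+                                       rng.normal(size=(100, 16)), n_classes=4)
+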
+
+ comment: Under review +
+
+
+
+
+ + ☆ Once for Both: Single Stage of Importance and Sparsity Search for Vision + Transformer Compression CVPR 2024 + + +
+ Recent Vision Transformer Compression (VTC) works mainly follow a two-stage +scheme, where the importance score of each model unit is first evaluated or +preset in each submodule, followed by the sparsity score evaluation according +to the target sparsity constraint. Such a separate evaluation process induces +a gap between importance and sparsity score distributions, thus causing high +search costs for VTC. In this work, for the first time, we investigate how to +integrate the evaluations of importance and sparsity scores into a single +stage, searching the optimal subnets in an efficient manner. Specifically, we +present Once for Both (OFB), a cost-efficient approach that simultaneously evaluates both +importance and sparsity scores for VTC. First, a +bi-mask scheme is developed by entangling the importance score and the +differentiable sparsity score to jointly determine the pruning potential +(prunability) of each unit. Such a bi-mask search strategy is further used +together with a proposed adaptive one-hot loss to realize the +progressive-and-efficient search for the most important subnet. Finally, +Progressive Masked Image Modeling (PMIM) is proposed to regularize the feature +space to be more representative during the search process, which may be +degraded by the dimension reduction. Extensive experiments demonstrate that OFB +can achieve superior compression performance over state-of-the-art +searching-based and pruning-based methods under various Vision Transformer +architectures, meanwhile promoting search efficiency significantly, e.g., +costing one GPU search day for the compression of DeiT-S on ImageNet-1K. + +
+
+ comment: Accepted by CVPR 2024. Our code will be available at + www.github.com/HankYe/Once-for-Both +
+
+
+
+
+ + ☆ Time-series Initialization and Conditioning for Video-agnostic + Stabilization of Video Super-Resolution using Recurrent Networks IJCNN 2024 + + +
+ A Recurrent Neural Network (RNN) for Video Super Resolution (VSR) is +generally trained with randomly clipped and cropped short videos extracted from +original training videos due to various challenges in learning RNNs. However, +since this RNN is optimized to super-resolve short videos, VSR of long videos +is degraded due to the domain gap. Our preliminary experiments reveal that such +degradation changes depending on the video properties, such as the video length +and dynamics. To avoid this degradation, this paper proposes a training +strategy for RNN-based VSR that works efficiently and stably regardless of +the video length and dynamics. The proposed training strategy stabilizes VSR by +training a VSR network with various RNN hidden states changed depending on the +video properties. Since computing such a variety of hidden states is +time-consuming, this computational cost is reduced by reusing the hidden states +for efficient training. In addition, training stability is further improved +with frame-number conditioning. Our experimental results demonstrate that the +proposed method performed better than base methods in videos with various +lengths and dynamics. + +
+
+ comment: Accepted to IJCNN 2024 (International Joint Conference on Neural + Networks) +
+
+
+
+
+ + ☆ Spatio-Temporal Bi-directional Cross-frame Memory for Distractor + Filtering Point Cloud Single Object Tracking + + +
+ 3D single object tracking within LIDAR point clouds is a pivotal task in +computer vision, with profound implications for autonomous driving and +robotics. However, existing methods, which depend solely on appearance matching +via Siamese networks or utilize motion information from successive frames, +encounter significant challenges. Issues such as similar objects nearby or +occlusions can result in tracker drift. To mitigate these challenges, we design +an innovative spatio-temporal bi-directional cross-frame distractor filtering +tracker, named STMD-Tracker. Our first step involves the creation of a 4D +multi-frame spatio-temporal graph convolution backbone. This design separates +KNN graph spatial embedding and incorporates 1D temporal convolution, +effectively capturing temporal fluctuations and spatio-temporal information. +Subsequently, we devise a novel bi-directional cross-frame memory procedure. +This integrates future and synthetic past frame memory to enhance the current +memory, thereby improving the accuracy of iteration-based tracking. This +iterative memory update mechanism allows our tracker to dynamically compensate +for information in the current frame, effectively reducing tracker drift. +Lastly, we construct spatially reliable Gaussian masks on the fused features to +eliminate distractor points. This is further supplemented by an object-aware +sampling strategy, which bolsters the efficiency and precision of object +localization, thereby reducing tracking errors caused by distractors. Our +extensive experiments on KITTI, NuScenes and Waymo datasets demonstrate that +our approach significantly surpasses the current state-of-the-art methods. + +
+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ☆ Innovative Quantitative Analysis for Disease Progression Assessment in + Familial Cerebral Cavernous Malformations + + +
+ Familial cerebral cavernous malformation (FCCM) is a hereditary disorder +characterized by abnormal vascular structures within the central nervous +system. The FCCM lesions are often numerous and intricate, making quantitative +analysis of the lesions a labor-intensive task. Consequently, clinicians face +challenges in quantitatively assessing the severity of lesions and determining +whether lesions have progressed. To alleviate this problem, we propose a +quantitative statistical framework for FCCM, comprising an efficient annotation +module, an FCCM lesion segmentation module, and an FCCM lesion quantitative +statistics module. Our framework demonstrates precise segmentation of the FCCM +lesion based on efficient data annotation, achieving a Dice coefficient of +93.22\%. More importantly, we focus on quantitative statistics of lesions, +which is combined with image registration to realize the quantitative +comparison of lesions between different examinations of patients, and a +visualization framework has been established for doctors to comprehensively +compare and analyze lesions. The experimental results have demonstrated that +our proposed framework not only obtains objective, accurate, and comprehensive +quantitative statistical information, which provides a quantitative assessment +method for disease progression and drug efficacy study, but also considerably +reduces the manual measurement and statistical workload of lesions, assisting +clinical decision-making for FCCM and accelerating progress in FCCM clinical +research. This highlights the potential of practical application of the +framework in FCCM clinical research and clinical decision-making. The codes are +available at https://github.com/6zrg/Quantitative-Statistics-of-FCCM. + +
+
+
+
+
+ + ☆ In-Context Matting CVPR 2024 + + +
+ We introduce in-context matting, a novel task setting of image matting. Given +a reference image of a certain foreground and guided priors such as points, +scribbles, and masks, in-context matting enables automatic alpha estimation on +a batch of target images of the same foreground category, without additional +auxiliary input. This setting marries good performance in auxiliary input-based +matting and ease of use in automatic matting, which finds a good trade-off +between customization and automation. To overcome the key challenge of accurate +foreground matching, we introduce IconMatting, an in-context matting model +built upon a pre-trained text-to-image diffusion model. Conditioned on inter- +and intra-similarity matching, IconMatting can make full use of reference +context to generate accurate target alpha mattes. To benchmark the task, we +also introduce a novel testing dataset ICM-$57$, covering 57 groups of +real-world images. Quantitative and qualitative results on the ICM-57 testing +set show that IconMatting rivals the accuracy of trimap-based matting while +retaining the automation level akin to automatic matting. Code is available at +https://github.com/tiny-smart/in-context-matting + +
+
+ comment: Accepted to CVPR 2024. Code is available at + https://github.com/tiny-smart/in-context-matting +
+
+
+
+
+ + ☆ Depth Estimation fusing Image and Radar Measurements with Uncertain + Directions IJCNN 2024 + + +
+ This paper proposes a depth estimation method using radar-image fusion by +addressing the uncertain vertical directions of sparse radar measurements. In +prior radar-image fusion work, image features are merged with the uncertain +sparse depths measured by radar through convolutional layers. This approach is +disturbed by the features computed with the uncertain radar depths. +Furthermore, since the features are computed with a fully convolutional +network, the uncertainty of each depth corresponding to a pixel is spread out +over its surrounding pixels. Our method avoids this problem by computing +features only with an image and conditioning the features pixelwise with the +radar depth. Furthermore, the set of possibly correct radar directions is +identified with reliable LiDAR measurements, which are available only in the +training stage. Our method improves training data by learning only these +possibly correct radar directions, while the previous method is trained on raw radar +measurements, including erroneous ones. Experimental results +demonstrate that our method can improve the quantitative and qualitative +results compared with its base method using radar-image fusion. + +
+
+ comment: Accepted to IJCNN 2024 (International Joint Conference on Neural + Networks) +
+
+
+
+
+ + ☆ Adversarial Defense Teacher for Cross-Domain Object Detection under Poor + Visibility Conditions + + +
+ Existing object detectors encounter challenges in handling domain shifts +between training and real-world data, particularly under poor visibility +conditions like fog and night. Cutting-edge cross-domain object detection +methods use teacher-student frameworks and compel teacher and student models to +produce consistent predictions under weak and strong augmentations, +respectively. In this paper, we reveal that manually crafted augmentations are +insufficient for optimal teaching and present a simple yet effective framework +named Adversarial Defense Teacher (ADT), leveraging adversarial defense to +enhance teaching quality. Specifically, we employ adversarial attacks, +encouraging the model to generalize on subtly perturbed inputs that effectively +deceive the model. To address small objects under poor visibility conditions, +we propose a Zoom-in Zoom-out strategy, which zooms-in images for better +pseudo-labels and zooms-out images and pseudo-labels to learn refined features. +Our results demonstrate that ADT achieves superior performance, reaching 54.5% +mAP on Foggy Cityscapes, surpassing the previous state-of-the-art by 2.6% mAP. + +
+
+
+
+
+ + ☆ Graph Image Prior for Unsupervised Dynamic MRI Reconstruction + + +
+ The inductive bias of the convolutional neural network (CNN) can act as a +strong prior for image restoration, which is known as the Deep Image Prior +(DIP). In recent years, DIP has been utilized in unsupervised dynamic MRI +reconstruction, which adopts a generative model from the latent space to the +image space. However, existing methods usually utilize a single pyramid-shaped +CNN architecture to parameterize the generator, which cannot effectively +exploit the spatio-temporal correlations within the dynamic data. In this work, +we propose a novel scheme to exploit the DIP prior for dynamic MRI +reconstruction, named ``Graph Image Prior'' (GIP). The generative model is +decomposed into two stages: image recovery and manifold discovery, which are +bridged by a graph convolutional network to exploit the spatio-temporal +correlations. In addition, we devise an ADMM algorithm to alternately optimize +the images and the network parameters to further improve the reconstruction +performance. Experimental results demonstrate that GIP outperforms compressed +sensing methods and unsupervised methods over different sampling trajectories, +and significantly reduces the performance gap with the state-of-the-art supervised +deep-learning methods. Moreover, GIP displays superior generalization ability +when transferred to a different reconstruction setting, without the need for +any additional data. + +
+
+
+
+
+ + ☆ FusionINN: Invertible Image Fusion for Brain Tumor Monitoring + + +
+ Image fusion typically employs non-invertible neural networks to merge +multiple source images into a single fused image. However, for clinical +experts, solely relying on fused images may be insufficient for making +diagnostic decisions, as the fusion mechanism blends features from source +images, thereby making it difficult to interpret the underlying tumor +pathology. We introduce FusionINN, a novel invertible image fusion framework, +capable of efficiently generating fused images and also decomposing them back +to the source images by solving the inverse of the fusion process. FusionINN +guarantees lossless one-to-one pixel mapping by integrating a normally +distributed latent image alongside the fused image to facilitate the generative +modeling of the decomposition process. To the best of our knowledge, we are the +first to investigate the decomposability of fused images, which is particularly +crucial for life-sensitive applications such as medical image fusion compared +to other tasks like multi-focus or multi-exposure image fusion. Our extensive +experimentation validates FusionINN over existing discriminative and generative +fusion methods, both subjectively and objectively. Moreover, compared to a +recent denoising diffusion-based fusion model, our approach offers faster and +qualitatively better fusion results. We also exhibit the clinical utility of +our results in aiding disease prognosis. + +
+
+ comment: Source code coming soon +
+
+
+
+
+ + ☆ Towards Human-Like Machine Comprehension: Few-Shot Relational Learning + in Visually-Rich Documents COLING2024 + + +
+ Key-value relations are prevalent in Visually-Rich Documents (VRDs), often +depicted in distinct spatial regions accompanied by specific color and font +styles. These non-textual cues serve as important indicators that greatly +enhance human comprehension and acquisition of such relation triplets. However, +current document AI approaches often fail to consider this valuable prior +information related to visual and spatial features, resulting in suboptimal +performance, particularly when dealing with limited examples. To address this +limitation, our research focuses on few-shot relational learning, specifically +targeting the extraction of key-value relation triplets in VRDs. Given the +absence of a suitable dataset for this task, we introduce two new few-shot +benchmarks built upon existing supervised benchmark datasets. Furthermore, we +propose a variational approach that incorporates relational 2D-spatial priors +and prototypical rectification techniques. This approach aims to generate +relation representations that are more aware of the spatial context and unseen +relation in a manner similar to human perception. Experimental results +demonstrate the effectiveness of our proposed method by showcasing its ability +to outperform existing methods. This study also opens up new possibilities for +practical applications. + +
+
+ comment: 13 pages, 7 figures, accepted by LREC-COLING 2024 +
+
+
+
+
+ + ☆ AOCIL: Exemplar-free Analytic Online Class Incremental Learning with Low + Time and Resource Consumption + + +
+ Online Class Incremental Learning (OCIL) aims to train the model in a +task-by-task manner, where data arrive in mini-batches at a time while previous +data are not accessible. A significant challenge is known as Catastrophic +Forgetting, i.e., loss of the previous knowledge on old data. To address this, +replay-based methods show competitive results but invade data privacy, while +exemplar-free methods protect data privacy but struggle for accuracy. In this +paper, we propose an exemplar-free approach -- Analytic Online Class +Incremental Learning (AOCIL). Instead of back-propagation, we design the +Analytic Classifier (AC) updated by recursive least squares, cooperating with a +frozen backbone. AOCIL simultaneously achieves high accuracy, low resource +consumption and data privacy protection. We conduct extensive experiments on four +existing benchmark datasets, and the results demonstrate the strong capability +of handling OCIL scenarios. Code will be released. + +
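+
+ A sketch of a recursive-least-squares analytic classifier on frozen features,
+ under stated assumptions: the regularization gamma and the regularized-inverse
+ recursion below follow the standard ridge/RLS derivation, not necessarily the
+ exact AOCIL formulation.
+
+ import numpy as np
+
+ class AnalyticClassifier:
+     # closed-form linear classifier on frozen-backbone features, updated
+     # mini-batch by mini-batch with recursive least squares (no exemplars,
+     # no back-propagation); illustrative sketch only
+     def __init__(self, feat_dim, n_classes, gamma=1.0):
+         self.R = np.eye(feat_dim) / gamma     # regularized inverse covariance
+         self.W = np.zeros((feat_dim, n_classes))
+
+     def update(self, X, Y):
+         # X: (B, D) frozen features, Y: (B, C) one-hot targets
+         K = np.linalg.inv(np.eye(len(X)) + X @ self.R @ X.T)   # (B, B)
+         self.R -= self.R @ X.T @ K @ X @ self.R                # Woodbury update
+         self.W += self.R @ X.T @ (Y - X @ self.W)
+         return self
+
+     def predict(self, X):
+         return (X @ self.W).argmax(-1)
+
+ # toy usage: mini-batches arriving one at a time
+ rng = np.random.default_rng(0)
+ clf = AnalyticClassifier(feat_dim=32, n_classes=4)
+ for _ in range(5):
+     X = rng.normal(size=(16, 32))
+     y = rng.integers(0, 4, size=16)
+     clf.update(X, np.eye(4)[y])
+ print(clf.predict(rng.normal(size=(3, 32))))
+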
+
+
+
+
+ + ☆ iDAT: inverse Distillation Adapter-Tuning ICME + 2024 + + +
+ The Adapter-Tuning (AT) method involves freezing a pre-trained model and +introducing trainable adapter modules to acquire downstream knowledge, thereby +calibrating the model for better adaptation to downstream tasks. This paper +proposes a distillation framework for the AT method instead of crafting a +carefully designed adapter module, which aims to improve fine-tuning +performance. For the first time, we explore the possibility of combining the AT +method with knowledge distillation. Via statistical analysis, we observe +significant differences in the knowledge acquisition between adapter modules of +different models. Leveraging these differences, we propose a simple yet +effective framework called inverse Distillation Adapter-Tuning (iDAT). +Specifically, we designate the smaller model as the teacher and the larger +model as the student. The two are jointly trained, and online knowledge +distillation is applied to inject knowledge from a different perspective into the student +model, significantly enhancing the fine-tuning performance on downstream +tasks. Extensive experiments on the VTAB-1K benchmark with 19 image +classification tasks demonstrate the effectiveness of iDAT. The results show +that using an existing AT method within our iDAT framework can further yield a +2.66% performance gain, with only an additional 0.07M trainable parameters. Our +approach compares favorably with state-of-the-art methods without bells and whistles. +Our code is available at https://github.com/JCruan519/iDAT.  + +
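+
+ A sketch of the inverse-distillation step under stated assumptions: the
+ temperature T, loss weight alpha, and the linear "models" in the usage snippet
+ are illustrative; adapter plumbing and the exact iDAT losses are omitted.
+
+ import torch
+ import torch.nn.functional as F
+
+ def idat_step(small_model, large_model, optimizer, images, labels, T=4.0, alpha=0.5):
+     # both models see the batch; the *smaller* model's soft predictions are
+     # distilled online into the larger one (inverse distillation)
+     logits_s = small_model(images)          # teacher (smaller model)
+     logits_l = large_model(images)          # student (larger model)
+     ce = F.cross_entropy(logits_s, labels) + F.cross_entropy(logits_l, labels)
+     kd = F.kl_div(F.log_softmax(logits_l / T, dim=-1),
+                   F.softmax(logits_s.detach() / T, dim=-1),
+                   reduction="batchmean") * T * T
+     loss = ce + alpha * kd
+     optimizer.zero_grad()
+     loss.backward()
+     optimizer.step()
+     return loss.item()
+
+ # toy usage with linear heads over pre-extracted features
+ small = torch.nn.Linear(64, 10)
+ large = torch.nn.Sequential(torch.nn.Linear(64, 256), torch.nn.ReLU(), torch.nn.Linear(256, 10))
+ opt = torch.optim.AdamW(list(small.parameters()) + list(large.parameters()), lr=1e-3)
+ idat_step(small, large, opt, torch.randn(8, 64), torch.randint(0, 10, (8,)))
+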
+
+ comment: 10 pages, 9 figures, 13 tables. This paper has been accepted by ICME + 2024 +
+
+
+
+
+ + ☆ 3D-TransUNet for Brain Metastases Segmentation in the BraTS2023 + Challenge + + +
+ Segmenting brain tumors is complex due to their diverse appearances and +scales. Brain metastases, the most common type of brain tumor, are a frequent +complication of cancer. Therefore, an effective segmentation model for brain +metastases must adeptly capture local intricacies to delineate small tumor +regions while also integrating global context to understand broader scan +features. The TransUNet model, which combines Transformer self-attention with +U-Net's localized information, emerges as a promising solution for this task. +In this report, we address brain metastases segmentation by training the +3D-TransUNet model on the Brain Tumor Segmentation (BraTS-METS) 2023 challenge +dataset. Specifically, we explored two architectural configurations: the +Encoder-only 3D-TransUNet, employing Transformers solely in the encoder, and +the Decoder-only 3D-TransUNet, utilizing Transformers exclusively in the +decoder. For Encoder-only 3D-TransUNet, we note that Masked-Autoencoder +pre-training is required for a better initialization of the Transformer Encoder +and thus accelerates the training process. We identify that the Decoder-only +3D-TransUNet model should offer enhanced efficacy in the segmentation of brain +metastases, as indicated by our 5-fold cross-validation on the training set. +However, our use of the Encoder-only 3D-TransUNet model already yields notable +results, with an average lesion-wise Dice score of 59.8\% on the test set, +securing second place in the BraTS-METS 2023 challenge. + +
+
+
+
+
+ + ☆ Ev-Edge: Efficient Execution of Event-based Vision Algorithms on + Commodity Edge Platforms + + +
+ Event cameras have emerged as a promising sensing modality for autonomous +navigation systems, owing to their high temporal resolution, high dynamic range +and negligible motion blur. To process the asynchronous temporal event streams +from such sensors, recent research has shown that a mix of Artificial Neural +Networks (ANNs), Spiking Neural Networks (SNNs) as well as hybrid SNN-ANN +algorithms are necessary to achieve high accuracies across a range of +perception tasks. However, we observe that executing such workloads on +commodity edge platforms which feature heterogeneous processing elements such +as CPUs, GPUs and neural accelerators results in inferior performance. This is +due to the mismatch between the irregular nature of event streams and diverse +characteristics of algorithms on the one hand and the underlying hardware +platform on the other. We propose Ev-Edge, a framework that contains three key +optimizations to boost the performance of event-based vision systems on edge +platforms: (1) An Event2Sparse Frame converter directly transforms raw event +streams into sparse frames, enabling the use of sparse libraries with minimal +encoding overheads; (2) A Dynamic Sparse Frame Aggregator merges sparse frames +at runtime by trading off the temporal granularity of events and computational +demand, thereby improving hardware utilization; and (3) A Network Mapper maps +concurrently executing tasks to different processing elements while also +selecting layer precision by considering both compute and communication +overheads. On several state-of-the-art networks for a range of autonomous +navigation tasks, Ev-Edge achieves 1.28x-2.05x improvements in latency and +1.23x-2.15x in energy over an all-GPU implementation on the NVIDIA Jetson +Xavier AGX platform for single-task execution scenarios. Ev-Edge also achieves +1.43x-1.81x latency improvements over round-robin scheduling methods in +multi-task execution scenarios. + +
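+
+ A sketch of an event-to-sparse-frame conversion under stated assumptions: the
+ (x, y, polarity, timestamp) event layout, the time window, and the signed
+ polarity accumulation are illustrative choices, not Ev-Edge's exact converter.
+
+ import numpy as np
+ from scipy.sparse import coo_matrix
+
+ def events_to_sparse_frame(events, height, width, t_start, t_end):
+     # accumulate signed polarities inside [t_start, t_end) into one sparse frame
+     x, y, p, t = (np.asarray(events[k]) for k in ("x", "y", "p", "t"))
+     keep = (t >= t_start) & (t < t_end)
+     signed = np.where(p[keep] > 0, 1.0, -1.0)
+     frame = coo_matrix((signed, (y[keep], x[keep])), shape=(height, width))
+     frame.sum_duplicates()          # multiple events at a pixel accumulate
+     return frame.tocsr()
+
+ # toy usage: 1000 random events on a 260x346 sensor
+ rng = np.random.default_rng(0)
+ ev = {"x": rng.integers(0, 346, 1000), "y": rng.integers(0, 260, 1000),
+       "p": rng.integers(0, 2, 1000), "t": rng.uniform(0, 1e5, 1000)}
+ sparse_frame = events_to_sparse_frame(ev, 260, 346, 0, 5e4)
+ print(sparse_frame.nnz, "non-zero pixels")
+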
+
+
+
+
+ + ☆ PNAS-MOT: Multi-Modal Object Tracking with Pareto Neural Architecture + Search + + +
+ Multiple object tracking is a critical task in autonomous driving. Existing +works primarily focus on the heuristic design of neural networks to obtain high +accuracy. As tracking accuracy improves, however, neural networks become +increasingly complex, posing challenges for their practical application in real +driving scenarios due to the high level of latency. In this paper, we explore +the use of neural architecture search (NAS) methods to search for efficient +architectures for tracking, aiming for low real-time latency while maintaining +relatively high accuracy. Another challenge for object tracking is the +unreliability of a single sensor; therefore, we propose a multi-modal framework +to improve the robustness. Experiments demonstrate that our algorithm can run +on edge devices under low-latency constraints, thus greatly reducing the +computational requirements for multi-modal object tracking while maintaining low +latency. + +
+
+ comment: IEEE Robotics and Automation Letters 2024. Code is available at + https://github.com/PholyPeng/PNAS-MOT +
+
+
+
+
+ + ☆ Contact-aware Human Motion Generation from Textual Descriptions + + +
+ This paper addresses the problem of generating 3D interactive human motion +from text. Given a textual description depicting the actions of different body +parts in contact with objects, we synthesize sequences of 3D body poses that +are visually natural and physically plausible. Yet, this task poses a +significant challenge due to the inadequate consideration of interactions by +physical contacts in both motion and textual descriptions, leading to unnatural +and implausible sequences. To tackle this challenge, we create a novel dataset +named RICH-CAT, representing ``Contact-Aware Texts'' constructed from the RICH +dataset. RICH-CAT comprises high-quality motion, accurate human-object contact +labels, and detailed textual descriptions, encompassing over 8,500 motion-text +pairs across 26 indoor/outdoor actions. Leveraging RICH-CAT, we propose a novel +approach named CATMO for text-driven interactive human motion synthesis that +explicitly integrates human body contacts as evidence. We employ two VQ-VAE +models to encode motion and body contact sequences into distinct yet +complementary latent spaces and an intertwined GPT for generating human motions +and contacts in a mutually conditioned manner. Additionally, we introduce a +pre-trained text encoder to learn textual embeddings that better discriminate +among various contact types, allowing for more precise control over synthesized +motions and contacts. Our experiments demonstrate the superior performance of +our approach compared to existing text-to-motion methods, producing stable, +contact-aware motion sequences. Code and data will be available for research +purposes. + +
+
+ comment: Project page: https://xymsh.github.io/RICH-CAT/ +
+
+
+
+
+ + ☆ G-ACIL: Analytic Learning for Exemplar-Free Generalized Class + Incremental Learning + + +
+ Class incremental learning (CIL) trains a network on sequential tasks with +separated categories but suffers from catastrophic forgetting, where models +quickly lose previously learned knowledge when acquiring new tasks. The +generalized CIL (GCIL) aims to address the CIL problem in a more real-world +scenario, where incoming data have mixed data categories and unknown sample +size distribution, leading to intensified forgetting. Existing attempts for the +GCIL either have poor performance or invade data privacy by saving historical +exemplars. To address this, in this paper, we propose an exemplar-free +generalized analytic class incremental learning (G-ACIL). The G-ACIL adopts +analytic learning (a gradient-free training technique), and delivers an +analytical solution (i.e., closed-form) to the GCIL scenario. This solution is +derived via decomposing the incoming data into exposed and unexposed classes, +allowing an equivalence between the incremental learning and its joint +training, i.e., the weight-invariant property. Such an equivalence is +theoretically validated through matrix analysis tools, and hence contributes +interpretability to GCIL. It is also empirically evidenced by experiments on +various datasets and settings of GCIL. The results show that the G-ACIL +exhibits leading performance with high robustness compared with existing +competitive GCIL methods. Code will be available at +https://github.com/ZHUANGHP/Analytic-continual-learning. + +
+
+
+
+
+ + ☆ UPNeRF: A Unified Framework for Monocular 3D Object Reconstruction and + Pose Estimation + + +
+ Monocular 3D reconstruction for categorical objects heavily relies on +accurately perceiving each object's pose. While gradient-based optimization +within a NeRF framework updates initially given poses, this paper highlights +that such a scheme fails when the initial pose even moderately deviates from +the true pose. Consequently, existing methods often depend on a third-party 3D +object detector to provide an initial object pose, leading to increased complexity and +generalization issues. To address these challenges, we present UPNeRF, a +Unified framework integrating Pose estimation and NeRF-based reconstruction, +bringing us closer to real-time monocular 3D object reconstruction. UPNeRF +decouples the object's dimension estimation and pose refinement to resolve the +scale-depth ambiguity, and introduces an effective projected-box representation +that generalizes well across different domains. While using a dedicated pose +estimator that smoothly integrates into an object-centric NeRF, UPNeRF is free +from external 3D detectors. UPNeRF achieves state-of-the-art results in both +reconstruction and pose estimation tasks on the nuScenes dataset. Furthermore, +UPNeRF exhibits exceptional cross-dataset generalization on the KITTI and Waymo +datasets, surpassing prior methods with up to 50% reduction in rotation and +translation error. + +
+
+
+
+
+ + ☆ Gaussian in the Wild: 3D Gaussian Splatting for Unconstrained Image + Collections + + +
+ Novel view synthesis from unconstrained in-the-wild images remains a +meaningful but challenging task. The photometric variation and transient +occluders in those unconstrained images make it difficult to reconstruct the +original scene accurately. Previous approaches tackle the problem by +introducing a global appearance feature in Neural Radiance Fields (NeRF). +However, in the real world, the unique appearance of each tiny point in a scene +is determined by its independent intrinsic material attributes and the varying +environmental impacts it receives. Inspired by this fact, we propose Gaussian +in the wild (GS-W), a method that uses 3D Gaussian points to reconstruct the +scene and introduces separate intrinsic and dynamic appearance features for +each point, capturing the unchanged scene appearance along with dynamic +variation like illumination and weather. Additionally, an adaptive sampling +strategy is presented to allow each Gaussian point to focus on the local and +detailed information more effectively. We also reduce the impact of transient +occluders using a 2D visibility map. Extensive experiments demonstrate better +reconstruction quality and detail for GS-W compared to previous methods, with a +$1000\times$ increase in rendering speed. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ SceneX: Procedural Controllable Large-scale Scene Generation via + Large-language Models + + +
+ Due to its great application potential, large-scale scene generation has +drawn extensive attention in academia and industry. Recent research employs +powerful generative models to create desired scenes and achieves promising +results. However, most of these methods represent the scene using 3D primitives +(e.g. point cloud or radiance field) incompatible with the industrial pipeline, +which leads to a substantial gap between academic research and industrial +deployment. Procedural Controllable Generation (PCG) is an efficient technique +for creating scalable and high-quality assets, but it is unfriendly for +ordinary users as it demands profound domain expertise. To address these +issues, we resort to using a large language model (LLM) to drive the +procedural modeling. In this paper, we introduce a large-scale scene generation +framework, SceneX, which can automatically produce high-quality procedural +models according to designers' textual descriptions. Specifically, the proposed +method comprises two components, PCGBench and PCGPlanner. The former +encompasses an extensive collection of accessible procedural assets and +thousands of hand-crafted API documents. The latter aims to generate executable +actions for Blender to produce controllable and precise 3D assets guided by the +user's instructions. Our SceneX can generate a city spanning 2.5 km x 2.5 +km with delicate layout and geometric structures, drastically reducing the time +cost from several weeks for professional PCG engineers to just a few hours for +an ordinary user. Extensive experiments demonstrate the capability of our +method in controllable large-scale scene generation and editing, including +asset placement and season translation. + +
+
+
+
+
+ + ☆ Technical Report: Masked Skeleton Sequence Modeling for Learning Larval + Zebrafish Behavior Latent Embeddings + + +
+ In this report, we introduce a novel self-supervised learning method for +extracting latent embeddings from behaviors of larval zebrafish. Drawing +inspiration from Masked Modeling techniques utilized in image processing with +Masked Autoencoders (MAE) \cite{he2022masked} and in natural language +processing with Generative Pre-trained Transformer (GPT) +\cite{radford2018improving}, we treat behavior sequences as a blend of images +and language. For the skeletal sequences of swimming zebrafish, we propose a +pioneering Transformer-CNN architecture, the Sequence Spatial-Temporal +Transformer (SSTFormer), designed to capture the inter-frame correlation of +different joints. This correlation is particularly valuable, as it reflects the +coordinated movement of various parts of the fish body across adjacent frames. +To handle the high frame rate, we segment the skeleton sequence into distinct +time slices, analogous to "words" in a sentence, and employ self-attention +transformer layers to encode the consecutive frames within each slice, +capturing the spatial correlation among different joints. Furthermore, we +incorporate a CNN-based attention module to enhance the representations +output by the transformer layers. Lastly, we introduce a temporal feature +aggregation operation between time slices to improve the discrimination of +similar behaviors. + +
+
+
+
+
+ + ☆ Temporal-Spatial Object Relations Modeling for Vision-and-Language + Navigation + + +
+ Vision-and-Language Navigation (VLN) is a challenging task where an agent is +required to navigate to a natural language described location via vision +observations. The navigation abilities of the agent can be enhanced by the +relations between objects, which are usually learned using internal objects or +external datasets. The relationships between internal objects are modeled +employing graph convolutional network (GCN) in traditional studies. However, +GCN tends to be shallow, limiting its modeling ability. To address this issue, +we utilize a cross attention mechanism to learn the connections between objects +over a trajectory, which takes temporal continuity into account, termed as +Temporal Object Relations (TOR). The external datasets have a gap with the +navigation environment, leading to inaccurate modeling of relations. To avoid +this problem, we construct object connections based on observations from all +viewpoints in the navigational environment, which ensures complete spatial +coverage and eliminates the gap, called Spatial Object Relations (SOR). +Additionally, we observe that agents may repeatedly visit the same location +during navigation, significantly hindering their performance. For resolving +this matter, we introduce the Turning Back Penalty (TBP) loss function, which +penalizes the agent's repetitive visiting behavior, substantially reducing the +navigational distance. Experimental results on the REVERIE, SOON, and R2R +datasets demonstrate the effectiveness of the proposed method. + +
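+
+ A sketch of one plausible form of the Turning Back Penalty under stated
+ assumptions: the revisit radius and the hinge penalty below are illustrative
+ choices, not the paper's exact TBP loss.
+
+ import torch
+
+ def turning_back_penalty(positions, radius=0.5):
+     # positions: (T, 2 or 3) agent positions along one trajectory; add a
+     # penalty whenever the agent comes back within `radius` of a location it
+     # has already visited (the immediately preceding position is excluded)
+     penalty = positions.new_zeros(())
+     for t in range(2, positions.shape[0]):
+         dists = torch.linalg.norm(positions[t] - positions[: t - 1], dim=-1)
+         penalty = penalty + torch.clamp(radius - dists, min=0).sum()
+     return penalty / positions.shape[0]
+
+ traj = torch.tensor([[0., 0.], [1., 0.], [2., 0.], [1., 0.1], [0., 0.]])
+ print(turning_back_penalty(traj))   # > 0 because the agent doubles back
+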
+
+
+
+
+ + ☆ The Limits of Perception: Analyzing Inconsistencies in Saliency Maps in + XAI + + +
+ Explainable artificial intelligence (XAI) plays an indispensable role in +demystifying the decision-making processes of AI, especially within the +healthcare industry. Clinicians rely heavily on detailed reasoning when making +a diagnosis, often examining CT scans for specific features that distinguish between +benign and malignant lesions. A comprehensive diagnostic approach includes an +evaluation of imaging results, patient observations, and clinical tests. The +surge in deploying deep learning models as support systems in medical +diagnostics has been significant, offering advances that traditional methods +could not. However, the complexity and opacity of these models present a +double-edged sword. As they operate as "black boxes," with their reasoning +obscured and inaccessible, there's an increased risk of misdiagnosis, which can +lead to patient harm. Hence, there is a pressing need to cultivate transparency +within AI systems, ensuring that the rationale behind an AI's diagnostic +recommendations is clear and understandable to medical practitioners. This +shift towards transparency is not just beneficial -- it's a critical step +towards responsible AI integration in healthcare, ensuring that AI aids rather +than hinders medical professionals in their crucial work. + +
+
+ comment: 7 pages, 1 figure, 2 tables +
+
+
+
+
+ + ☆ DS-NeRV: Implicit Neural Video Representation with Decomposed Static and + Dynamic Codes CVPR 2024 + + +
+ Implicit neural representations for video (NeRV) have recently become a novel +way for high-quality video representation. However, existing works employ a +single network to represent the entire video, which implicitly confuses static +and dynamic information. This leads to an inability to effectively compress the +redundant static information and a lack of explicit modeling of globally +temporal-coherent dynamic details. To solve the above problems, we propose DS-NeRV, +which decomposes videos into sparse learnable static codes and dynamic codes +without the need for explicit optical flow or residual supervision. By setting +different sampling rates for the two codes and applying weighted sum and +interpolation sampling methods, DS-NeRV efficiently utilizes redundant static +information while maintaining high-frequency details. Additionally, we design a +cross-channel attention-based (CCA) fusion module to efficiently fuse these two +codes for frame decoding. Our approach achieves a high-quality reconstruction +of 31.2 dB PSNR with only 0.35M parameters thanks to the separate static and dynamic +code representations and outperforms existing NeRV methods in many downstream +tasks. Our project website is at https://haoyan14.github.io/DS-NeRV. + +
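+
+ A sketch of the static/dynamic code decomposition under stated assumptions:
+ the code dimensions, the linear interpolation of static codes, and the learned
+ scalar fusion weight are illustrative; the CCA fusion module and the decoder
+ are omitted.
+
+ import torch
+ import torch.nn as nn
+
+ class StaticDynamicCodes(nn.Module):
+     # sparse learnable static codes (interpolated over time) plus dense
+     # per-frame dynamic codes, fused by a learnable weighted sum
+     def __init__(self, n_frames, n_static=8, dim=64):
+         super().__init__()
+         self.static = nn.Parameter(torch.randn(n_static, dim) * 0.02)
+         self.dynamic = nn.Parameter(torch.randn(n_frames, dim) * 0.02)
+         self.alpha = nn.Parameter(torch.tensor(0.5))
+         self.n_frames = n_frames
+
+     def forward(self, frame_idx):
+         # linearly interpolate between the two nearest static codes
+         pos = frame_idx / max(self.n_frames - 1, 1) * (self.static.shape[0] - 1)
+         lo, hi = int(pos), min(int(pos) + 1, self.static.shape[0] - 1)
+         w = pos - lo
+         s = (1 - w) * self.static[lo] + w * self.static[hi]
+         d = self.dynamic[frame_idx]
+         return self.alpha * s + (1 - self.alpha) * d   # fused code for the decoder
+
+ codes = StaticDynamicCodes(n_frames=120)
+ print(codes(37).shape)   # torch.Size([64])
+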
+
+ comment: CVPR 2024. Project page at https://haoyan14.github.io/DS-NeRV +
+
+
+
+
+ + ☆ An active learning model to classify animal species in Hong Kong + + +
+ Camera traps are used by ecologists globally as an efficient and non-invasive +method to monitor animals. While it is time-consuming to manually label the +collected images, recent advances in deep learning and computer vision have made +it possible to automate this process [1]. A major obstacle to this is the +generalisability of these models when applied to independently +collected data from other parts of the world [2]. Here, we use a deep active +learning workflow [3], and train a model that is applicable to camera trap +images collected in Hong Kong. + +
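+
+ A generic pool-based active learning loop, as a sketch under stated
+ assumptions: least-confidence sampling, a logistic-regression stand-in model,
+ and synthetic features replace the deep workflow of [3] and the human
+ annotation step of a real camera-trap pipeline.
+
+ import numpy as np
+ from sklearn.linear_model import LogisticRegression
+
+ def active_learning_loop(X, y, n_rounds=5, n_query=20, seed=0):
+     rng = np.random.default_rng(seed)
+     labeled = list(rng.choice(len(X), size=n_query, replace=False))
+     pool = [i for i in range(len(X)) if i not in labeled]
+     model = LogisticRegression(max_iter=1000)
+     for _ in range(n_rounds):
+         model.fit(X[labeled], y[labeled])                # train on labeled set
+         probs = model.predict_proba(X[pool])
+         uncertainty = 1.0 - probs.max(axis=1)            # least confidence
+         query = np.argsort(uncertainty)[-n_query:]       # most uncertain samples
+         newly = [pool[i] for i in query]                 # "send to annotators"
+         labeled += newly
+         pool = [i for i in pool if i not in set(newly)]
+     return model, labeled
+
+ # toy usage with synthetic features in place of CNN embeddings
+ X = np.random.default_rng(1).normal(size=(500, 8))
+ y = (X[:, 0] + X[:, 1] > 0).astype(int)
+ model, labeled_idx = active_learning_loop(X, y)
+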
+
+ comment: 6 pages, 2 figures, 1 table +
+
+
+
+
+ + ☆ What Do You See in Vehicle? Comprehensive Vision Solution for In-Vehicle + Gaze Estimation CVPR24 + + +
+ A driver's eye gaze holds a wealth of cognitive and intentional cues crucial +for intelligent vehicles. Despite its significance, research on in-vehicle gaze +estimation remains limited due to the scarcity of comprehensive and +well-annotated datasets in real driving scenarios. In this paper, we present +three novel elements to advance in-vehicle gaze research. First, we introduce +IVGaze, a pioneering dataset capturing in-vehicle gaze, collected from 125 +subjects and covering a large range of gaze and head poses within vehicles. +Conventional gaze collection systems are inadequate for in-vehicle use. In this +dataset, we propose a new vision-based solution for in-vehicle gaze collection, +introducing a refined gaze target calibration method to tackle annotation +challenges. Second, our research focuses on in-vehicle gaze estimation +leveraging the IVGaze. In-vehicle face images often suffer from low resolution, +prompting our introduction of a gaze pyramid transformer that leverages +transformer-based multilevel feature integration. Expanding upon this, we +introduce the dual-stream gaze pyramid transformer (GazeDPTR). Employing +perspective transformation, we rotate virtual cameras to normalize images, +utilizing camera pose to merge normalized and original images for accurate gaze +estimation. GazeDPTR shows state-of-the-art performance on the IVGaze dataset. +Third, we explore a novel strategy for gaze zone classification by extending +the GazeDPTR. A foundational tri-plane is newly defined, and gaze is projected onto these planes. +Leveraging both positional features from the projection points +and visual attributes from images, we achieve superior performance compared to +relying solely on visual features, substantiating the advantage of gaze +estimation. Our project is available at https://yihua.zone/work/ivgaze. + +
+
+ comment: CVPR24 +
+
+
+
+
+ + ☆ Boosting Few-Shot Learning via Attentive Feature Regularization AAAI 2024 + + +
+ Few-shot learning (FSL) based on manifold regularization aims to improve the +recognition capacity of novel objects with limited training samples by mixing +two samples from different categories with a blending factor. However, this +mixing operation weakens the feature representation due to the linear +interpolation and the overlooking of the importance of specific channels. To +solve these issues, this paper proposes attentive feature regularization (AFR) +which aims to improve the feature representativeness and discriminability. In +our approach, we first calculate the relations between different categories of +semantic labels to pick out the related features used for regularization. Then, +we design two attention-based calculations at both the instance and channel +levels. These calculations enable the regularization procedure to focus on two +crucial aspects: the feature complementarity through adaptive interpolation in +related categories and the emphasis on specific feature channels. Finally, we +combine these regularization strategies to significantly improve the classifier +performance. Empirical studies on several popular FSL benchmarks demonstrate +the effectiveness of AFR, which improves the recognition accuracy of novel +categories without the need to retrain any feature extractor, especially in the +1-shot setting. Furthermore, the proposed AFR can seamlessly integrate into +other FSL methods to improve classification performance. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ Role of Locality and Weight Sharing in Image-Based Tasks: A Sample + Complexity Separation between CNNs, LCNs, and FCNs ICLR 2024 + + +
+ Vision tasks are characterized by the properties of locality and translation +invariance. The superior performance of convolutional neural networks (CNNs) on +these tasks is widely attributed to the inductive bias of locality and weight +sharing baked into their architecture. Existing attempts to quantify the +statistical benefits of these biases in CNNs over locally connected +convolutional neural networks (LCNs) and fully connected neural networks (FCNs) +fall into one of the following categories: either they disregard the optimizer +and only provide uniform convergence upper bounds with no separating lower +bounds, or they consider simplistic tasks that do not truly mirror the locality +and translation invariance as found in real-world vision tasks. To address +these deficiencies, we introduce the Dynamic Signal Distribution (DSD) +classification task that models an image as consisting of $k$ patches, each of +dimension $d$, and the label is determined by a $d$-sparse signal vector that +can freely appear in any one of the $k$ patches. On this task, for any +orthogonally equivariant algorithm like gradient descent, we prove that CNNs +require $\tilde{O}(k+d)$ samples, whereas LCNs require $\Omega(kd)$ samples, +establishing the statistical advantages of weight sharing in translation +invariant tasks. Furthermore, LCNs need $\tilde{O}(k(k+d))$ samples, compared +to $\Omega(k^2d)$ samples for FCNs, showcasing the benefits of locality in +local tasks. Additionally, we develop information theoretic tools for analyzing +randomized algorithms, which may be of interest for statistical research. + +
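+
+ A sketch of one way to generate the Dynamic Signal Distribution task described
+ above, under stated assumptions: the noise model, the signal normalization,
+ and the sign-based label rule are illustrative readings, not necessarily the
+ paper's exact construction.
+
+ import numpy as np
+
+ def sample_dsd(n, k, d, noise=0.1, seed=0):
+     # each example is k patches of dimension d; a fixed signal vector w is
+     # planted (with a random sign, which gives the label) in one randomly
+     # chosen patch, so the signal is d-sparse in the full k*d-dim input
+     rng = np.random.default_rng(seed)
+     w = rng.normal(size=d)
+     w /= np.linalg.norm(w)
+     X = noise * rng.normal(size=(n, k, d))
+     y = rng.choice([-1, 1], size=n)
+     patch = rng.integers(0, k, size=n)
+     X[np.arange(n), patch] += y[:, None] * w       # signal can appear in any patch
+     return X.reshape(n, k * d), y
+
+ X, y = sample_dsd(n=1000, k=9, d=16)
+ print(X.shape, y[:10])
+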
+
+ comment: 40 pages, 4 figures, Accepted to ICLR 2024, Spotlight +
+
+
+
+
+ + ♻ ☆ Fast-DiM: Towards Fast Diffusion Morphs + + +
+ Diffusion Morphs (DiM) are a recent state-of-the-art method for creating high-quality face morphs; however, they require a high number of network function +evaluations (NFE) to create the morphs. We propose a new DiM pipeline, Fast-DiM, +which can create morphs of a similar quality but with lower NFE. We investigate +the ODE solvers used to solve the Probability Flow ODE and the impact they have +on the creation of face morphs. Additionally, we employ an alternative +method for encoding images into the latent space of the Diffusion model by +solving the Probability Flow ODE as time runs forwards. Our experiments show +that we can reduce the NFE by upwards of 85% in the encoding process while +experiencing only 1.6% reduction in Mated Morph Presentation Match Rate +(MMPMR). Likewise, we show that we can cut the NFE in the sampling process in half +with only a maximal reduction of 0.23% in MMPMR. + +
+
+ comment: Revised manuscript. Under review for publication +
+
+
+
+
+ + ♻ ☆ Structure-Aware Sparse-View X-ray 3D Reconstruction CVPR 2024 + + +
+ X-ray, known for its ability to reveal internal structures of objects, is +expected to provide richer information for 3D reconstruction than visible +light. Yet, existing neural radiance fields (NeRF) algorithms overlook this +important nature of X-ray, leading to their limitations in capturing structural +contents of imaged objects. In this paper, we propose a framework, +Structure-Aware X-ray Neural Radiodensity Fields (SAX-NeRF), for sparse-view +X-ray 3D reconstruction. Firstly, we design a Line Segment-based Transformer +(Lineformer) as the backbone of SAX-NeRF. Lineformer captures internal +structures of objects in 3D space by modeling the dependencies within each line +segment of an X-ray. Secondly, we present a Masked Local-Global (MLG) ray +sampling strategy to extract contextual and geometric information in 2D +projection. In addition, we collect a larger-scale dataset X3D covering wider X-ray +applications. Experiments on X3D show that SAX-NeRF surpasses previous +NeRF-based methods by 12.56 and 2.49 dB on novel view synthesis and CT +reconstruction. Code, models, and data are released at +https://github.com/caiyuanhao1998/SAX-NeRF + +
+
+ comment: CVPR 2024; The first Transformer-based method for X-ray and CT 3D + reconstruction +
+
+
+
+
+ + ♻ ☆ NaVid: Video-based VLM Plans the Next Step for Vision-and-Language + Navigation + + +
+ Vision-and-Language Navigation (VLN) stands as a key research problem of +Embodied AI, aiming at enabling agents to navigate in unseen environments +following linguistic instructions. In this field, generalization is a +long-standing challenge, either to out-of-distribution scenes or from Sim to +Real. In this paper, we propose NaVid, a video-based large vision language +model (VLM), to mitigate such a generalization gap. NaVid makes the first +endeavour to showcase the capability of VLMs to achieve state-of-the-art level +navigation performance without any maps, odometer and depth inputs. Following +human instruction, NaVid only requires an on-the-fly video stream from a +monocular RGB camera equipped on the robot to output the next-step action. Our +formulation mimics how humans navigate and naturally gets rid of the problems +introduced by odometer noises, and the Sim2Real gaps from map or depth inputs. +Moreover, our video-based approach can effectively encode the historical +observations of robots as spatio-temporal contexts for decision-making and +instruction following. We train NaVid with 550k navigation samples collected +from VLN-CE trajectories, including action-planning and instruction-reasoning +samples, along with 665k large-scale web data. Extensive experiments show that +NaVid achieves SOTA performance in simulation environments and the real world, +demonstrating superior cross-dataset and Sim2Real transfer. We thus believe our +proposed VLM approach plans the next step for not only the navigation agents +but also this research field. + +
+
+
+
+
+ + ♻ ☆ VT-Former: An Exploratory Study on Vehicle Trajectory Prediction for + Highway Surveillance through Graph Isomorphism and Transformer + + +
+ Enhancing roadway safety has become an essential computer vision focus area +for Intelligent Transportation Systems (ITS). As a part of ITS, Vehicle +Trajectory Prediction (VTP) aims to forecast a vehicle's future positions based +on its past and current movements. VTP is a pivotal element for road safety, +aiding in applications such as traffic management, accident prevention, +work-zone safety, and energy optimization. While most works in this field focus +on autonomous driving, with the growing number of surveillance cameras, another +sub-field emerges for surveillance VTP with its own set of challenges. In this +paper, we introduce VT-Former, a novel transformer-based VTP approach for +highway safety and surveillance. In addition to utilizing transformers to +capture long-range temporal patterns, a new Graph Attentive Tokenization (GAT) +module has been proposed to capture intricate social interactions among +vehicles. This study seeks to explore both the advantages and the limitations +inherent in combining transformer architecture with graphs for VTP. Our +investigation, conducted across three benchmark datasets from diverse +surveillance viewpoints, showcases the State-of-the-Art (SotA) or comparable +performance of VT-Former in predicting vehicle trajectories. This study +underscores the potentials of VT-Former and its architecture, opening new +avenues for future research and exploration. + +
+
+ comment: Completely updated based on the reviews received for the paper +
+
+
+
+
+ + ♻ ☆ Deep Point Cloud Normal Estimation via Triplet Learning ICME 2022 + + +
+ Normal estimation on 3D point clouds is a fundamental problem in 3D vision +and graphics. Current methods often show limited accuracy in predicting normals +at sharp features (e.g., edges and corners) and less robustness to noise. In +this paper, we propose a novel normal estimation method for point clouds. It +consists of two phases: (a) feature encoding which learns representations of +local patches, and (b) normal estimation that takes the learned representation +as input and regresses the normal vector. We are motivated by the observation that local patches +on isotropic and anisotropic surfaces have similar or distinct normals, and +that separable features or representations can be learned to facilitate normal +estimation. To realise this, we first construct triplets of local patches on 3D +point cloud data, and design a triplet network with a triplet loss for feature +encoding. We then design a simple network with several MLPs and a loss function +to regress the normal vector. Despite having a smaller network size compared to +most other methods, experimental results show that our method preserves sharp +features and achieves better normal estimation results on CAD-like shapes. + +
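+
+ A sketch of the two-phase idea under stated assumptions: the toy PointNet-style
+ encoder, the unoriented-normal loss, and the joint weighting w are placeholders;
+ the paper trains the two phases with its own networks and losses.
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class PatchEncoder(nn.Module):
+     # toy encoder for a local patch of 3D points (placeholder network)
+     def __init__(self, dim=64):
+         super().__init__()
+         self.mlp = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, dim))
+     def forward(self, pts):                      # pts: (B, N, 3)
+         return self.mlp(pts).max(dim=1).values   # (B, dim) max-pooled feature
+
+ encoder = PatchEncoder()
+ normal_head = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 3))
+ triplet = nn.TripletMarginLoss(margin=0.2)
+
+ def training_step(anchor, positive, negative, gt_normal, w=0.1):
+     # phase (a): pull patches with similar normals together, push patches
+     # across sharp features apart; phase (b): regress a unit normal
+     fa, fp, fneg = encoder(anchor), encoder(positive), encoder(negative)
+     loss_trip = triplet(fa, fp, fneg)
+     pred = F.normalize(normal_head(fa), dim=-1)
+     loss_norm = 1.0 - (pred * gt_normal).sum(-1).abs().mean()   # unoriented normal loss
+     return loss_trip + w * loss_norm
+
+ a, p, n = (torch.randn(4, 128, 3) for _ in range(3))
+ gt = F.normalize(torch.randn(4, 3), dim=-1)
+ print(training_step(a, p, n, gt))
+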
+
+ comment: Accepted by ICME 2022. Supplementary material available at + https://ieeexplore.ieee.org/document/9859844/media#media +
+
+
+
+
+ + ♻ ☆ InstaFlow: One Step is Enough for High-Quality Diffusion-Based + Text-to-Image Generation ICLR 2024 + + +
+ Diffusion models have revolutionized text-to-image generation with their +exceptional quality and creativity. However, their multi-step sampling process is +known to be slow, often requiring tens of inference steps to obtain +satisfactory results. Previous attempts to improve the sampling speed and +reduce computational costs through distillation have been unsuccessful in +achieving a functional one-step model. In this paper, we explore a recent +method called Rectified Flow, which, thus far, has only been applied to small +datasets. The core of Rectified Flow lies in its \emph{reflow} procedure, which +straightens the trajectories of probability flows, refines the coupling between +noises and images, and facilitates the distillation process with student +models. We propose a novel text-conditioned pipeline to turn Stable Diffusion +(SD) into an ultra-fast one-step model, in which we find reflow plays a +critical role in improving the assignment between noise and images. Leveraging +our new pipeline, we create, to the best of our knowledge, the first one-step +diffusion-based text-to-image generator with SD-level image quality, achieving +an FID (Frechet Inception Distance) of $23.3$ on MS COCO 2017-5k, surpassing +the previous state-of-the-art technique, progressive distillation, by a +significant margin ($37.2$ $\rightarrow$ $23.3$ in FID). By utilizing an +expanded network with 1.7B parameters, we further improve the FID to $22.4$. We +call our one-step models \emph{InstaFlow}. On MS COCO 2014-30k, InstaFlow +yields an FID of $13.1$ in just $0.09$ second, the best in the $\leq 0.1$ second +regime, outperforming the recent StyleGAN-T ($13.9$ in $0.1$ second). Notably, +the training of InstaFlow only costs 199 A100 GPU days. Codes and pre-trained +models are available at \url{github.com/gnobitab/InstaFlow}. + +
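+
+ A minimal 2D sketch of the rectified-flow objective and the reflow step under
+ stated assumptions: the tiny MLP velocity field, the Euler sampler, and the toy
+ data are placeholders for the text-conditioned Stable Diffusion pipeline.
+
+ import torch
+ import torch.nn as nn
+
+ velocity = nn.Sequential(nn.Linear(2 + 1, 128), nn.ReLU(), nn.Linear(128, 2))
+ opt = torch.optim.Adam(velocity.parameters(), lr=1e-3)
+
+ def rf_loss(x0, x1):
+     # regress the straight-line velocity x1 - x0 at a random time t along
+     # the interpolation x_t = (1 - t) x0 + t x1
+     t = torch.rand(x0.shape[0], 1)
+     xt = (1 - t) * x0 + t * x1
+     pred = velocity(torch.cat([xt, t], dim=-1))
+     return ((pred - (x1 - x0)) ** 2).mean()
+
+ @torch.no_grad()
+ def sample(x0, steps=20):
+     # Euler integration of the learned ODE from noise x0 toward data
+     x, dt = x0.clone(), 1.0 / steps
+     for i in range(steps):
+         t = torch.full((x.shape[0], 1), i * dt)
+         x = x + dt * velocity(torch.cat([x, t], dim=-1))
+     return x
+
+ # stage 1: train on independent (noise, data) pairs
+ data = torch.randn(256, 2) * 0.3 + torch.tensor([2.0, 0.0])
+ for step in range(200):
+     x0 = torch.randn(256, 2)
+     opt.zero_grad(); rf_loss(x0, data).backward(); opt.step()
+ # stage 2 ("reflow"): couple fresh noise with the model's own samples and
+ # retrain on these straightened pairs, which is what enables few-step sampling
+ x0_new = torch.randn(256, 2)
+ x1_new = sample(x0_new)
+ for step in range(200):
+     opt.zero_grad(); rf_loss(x0_new, x1_new).backward(); opt.step()
+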
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ GaussianAvatar: Towards Realistic Human Avatar Modeling from a Single + Video via Animatable 3D Gaussians + + +
+ We present GaussianAvatar, an efficient approach to creating realistic human
+avatars with dynamic 3D appearances from a single video. We start by
+introducing animatable 3D Gaussians to explicitly represent humans in various
+poses and clothing styles. Such an explicit and animatable representation can
+fuse 3D appearances more efficiently and consistently from 2D observations. Our
+representation is further augmented with dynamic properties to support
+pose-dependent appearance modeling, where a dynamic appearance network along
+with an optimizable feature tensor is designed to learn the
+motion-to-appearance mapping. Moreover, by leveraging the differentiable motion
+condition, our method enables a joint optimization of motions and appearances
+during avatar modeling, which helps to tackle the long-standing issue of
+inaccurate motion estimation in monocular settings. The efficacy of
+GaussianAvatar is validated on both a public dataset and our collected
+dataset, demonstrating its superior performance in terms of appearance quality
+and rendering efficiency.
+
+
+
+ comment: Project Page: https://huliangxiao.github.io/GaussianAvatar +
+
+
+
+
+ + ♻ ☆ Exploring 3D Human Pose Estimation and Forecasting from the Robot's + Perspective: The HARPER Dataset + + +
+ We introduce HARPER, a novel dataset for 3D body pose estimation and forecasting
+in dyadic interactions between users and Spot, the quadruped robot manufactured
+by Boston Dynamics. The key novelty is the focus on the robot's perspective,
+i.e., on the data captured by the robot's sensors. These data make 3D body pose
+analysis challenging because the robot's viewpoint, close to the ground, captures
+humans only partially. The scenario underlying HARPER includes 15 actions, of which 10
+involve physical contact between the robot and users. The Corpus contains not
+only the recordings of the built-in stereo cameras of Spot, but also those of a
+6-camera OptiTrack system (all recordings are synchronized). This leads to
+ground-truth skeletal representations with sub-millimeter precision.
+In addition, the Corpus includes reproducible benchmarks on 3D Human Pose
+Estimation, Human Pose Forecasting, and Collision Prediction, all based on
+publicly available baseline approaches. This enables future HARPER users to
+rigorously compare their results with those we provide in this work.
+
+
+
+
+
+
+ + ♻ ☆ Generalizing to Unseen Domains with Wasserstein Distributional + Robustness under Limited Source Knowledge + + +
+ Domain generalization aims at learning a universal model that performs well +on unseen target domains, incorporating knowledge from multiple source domains. +In this research, we consider the scenario where different domain shifts occur +among conditional distributions of different classes across domains. When +labeled samples in the source domains are limited, existing approaches are not +sufficiently robust. To address this problem, we propose a novel domain +generalization framework called {Wasserstein Distributionally Robust Domain +Generalization} (WDRDG), inspired by the concept of distributionally robust +optimization. We encourage robustness over conditional distributions within +class-specific Wasserstein uncertainty sets and optimize the worst-case +performance of a classifier over these uncertainty sets. We further develop a +test-time adaptation module leveraging optimal transport to quantify the +relationship between the unseen target domain and source domains to make +adaptive inference for target data. Experiments on the Rotated MNIST, PACS and +the VLCS datasets demonstrate that our method could effectively balance the +robustness and discriminability in challenging generalization scenarios. + +
+
+
+
+
+ + ♻ ☆ Learning without Exact Guidance: Updating Large-scale High-resolution + Land Cover Maps from Low-resolution Historical Labels CVPR 2024 + + +
+ Large-scale high-resolution (HR) land-cover mapping is a vital task to survey +the Earth's surface and resolve many challenges facing humanity. However, it is +still a non-trivial task hindered by complex ground details, various landforms, +and the scarcity of accurate training labels over a wide-span geographic area. +In this paper, we propose an efficient, weakly supervised framework +(Paraformer) to guide large-scale HR land-cover mapping with easy-access +historical land-cover data of low resolution (LR). Specifically, existing +land-cover mapping approaches reveal the dominance of CNNs in preserving local +ground details but still suffer from insufficient global modeling in various +landforms. Therefore, we design a parallel CNN-Transformer feature extractor in +Paraformer, consisting of a downsampling-free CNN branch and a Transformer +branch, to jointly capture local and global contextual information. Besides, +facing the spatial mismatch of training data, a pseudo-label-assisted training +(PLAT) module is adopted to reasonably refine LR labels for weakly supervised +semantic segmentation of HR images. Experiments on two large-scale datasets +demonstrate the superiority of Paraformer over other state-of-the-art methods +for automatically updating HR land-cover maps from LR historical labels. + +
+
+ comment: 11 pages, 9 figures, accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ TLIC: Learned Image Compression with ROI-Weighted Distortion and Bit + Allocation + + +
+ This short paper describes our method for the image compression track. To
+achieve better perceptual quality, we use an adversarial loss to generate
+realistic textures and a region of interest (ROI) mask to guide bit
+allocation across regions. Our team name is TLIC.
+
+
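+
+ A minimal sketch of an ROI-weighted distortion term of the kind mentioned above; the weighting scheme is our assumption, not the authors' exact formulation.
+
+ import torch
+
+ def roi_weighted_distortion(x, x_hat, roi_mask, roi_weight=4.0):
+     """Per-pixel weighted MSE that emphasizes regions of interest.
+     roi_mask: 1 inside the ROI, 0 elsewhere (broadcast over channels)."""
+     weights = 1.0 + (roi_weight - 1.0) * roi_mask   # ROI pixels count roi_weight times
+     return (weights * (x - x_hat) ** 2).mean()
+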
+
+ comment: 2nd Place in the Image Compression Track, CLIC 2024, DCC 2024 +
+
+
+
+
+ + ♻ ☆ Feature Completion Transformer for Occluded Person Re-identification + + +
+ Occluded person re-identification (Re-ID) is a challenging problem due to the
+corruption caused by occluders. Most existing methods focus on visible human body
+parts through some prior information. However, when complementary occlusions
+occur, features in occluded regions can interfere with matching, which affects
+performance severely. In this paper, different from most previous works that
+discard the occluded region, we propose a Feature Completion Transformer
+(FCFormer) to implicitly complement the semantic information of occluded parts
+in the feature space. Specifically, Occlusion Instance Augmentation (OIA) is
+proposed to simulate real and diverse occlusion situations on the holistic
+image. These augmented images not only increase the number of occlusion samples
+in the training set, but also form pairs with the holistic images.
+Subsequently, a dual-stream architecture with a shared encoder is proposed to
+learn paired discriminative features from pairs of inputs. Without additional
+semantic information, an occluded-holistic feature sample-label pair can be
+automatically created. Then, a Feature Completion Decoder (FCD) is designed to
+complement the features of occluded regions by using learnable tokens to
+aggregate possible information from self-generated occluded features. Finally,
+we propose the Cross Hard Triplet (CHT) loss to further bridge the gap between
+complementing features and extracting features under the same ID. In addition,
+a Feature Completion Consistency (FC$^2$) loss is introduced to pull the
+generated completion feature distribution closer to the real holistic
+feature distribution. Extensive experiments over five challenging datasets
+demonstrate that the proposed FCFormer achieves superior performance and
+outperforms the state-of-the-art methods by significant margins on occluded
+datasets.
+
+
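+
+ A minimal sketch of the occlusion-augmentation idea (pasting an occluder crop onto a holistic image to form an occluded-holistic pair); the sizes and placement policy are our assumptions, not the exact OIA procedure.
+
+ import random
+ import torch
+ import torch.nn.functional as F
+
+ def occlusion_augment(image, occluder, max_frac=0.5):
+     """Paste a resized occluder patch onto a holistic image (C, H, W).
+     The returned occluded copy and the original form a training pair
+     that shares the same identity label."""
+     _, H, W = image.shape
+     oc_h = random.randint(H // 4, int(H * max_frac))
+     oc_w = random.randint(W // 4, int(W * max_frac))
+     occ = F.interpolate(occluder.unsqueeze(0), size=(oc_h, oc_w),
+                         mode="bilinear", align_corners=False).squeeze(0)
+     top, left = random.randint(0, H - oc_h), random.randint(0, W - oc_w)
+     out = image.clone()
+     out[:, top:top + oc_h, left:left + oc_w] = occ
+     return out
+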
+
+ comment: Published on IEEE Transactions on Multimedia (TMM) +
+
+
+
+
+ + ♻ ☆ Rethinking the Evaluation Protocol of Domain Generalization + + +
+ Domain generalization aims to solve the challenge of Out-of-Distribution
+(OOD) generalization by leveraging common knowledge learned from multiple
+training domains to generalize to unseen test domains. To accurately evaluate
+the OOD generalization ability, it is required that test data information is
+unavailable. However, the current domain generalization protocol may still have
+potential test data information leakage. This paper examines the risks of test
+data information leakage from two aspects of the current evaluation protocol:
+supervised pretraining on ImageNet and oracle model selection. We propose two
+modifications to the current protocol: employing self-supervised pretraining or
+training from scratch instead of the current supervised pretraining, and using
+multiple test domains. These changes would result in a more precise evaluation
+of OOD generalization ability. We also rerun the algorithms with the modified
+protocol and introduce new leaderboards to encourage future research in domain
+generalization with a fairer comparison.
+
+
+
+
+
+
+ + ♻ ☆ MoSA: Mixture of Sparse Adapters for Visual Efficient Tuning + + +
+ With the rapid growth in the scale of pre-trained foundation models, +parameter-efficient fine-tuning techniques have gained significant attention, +among which Adapter Tuning is the most widely used. Despite achieving +efficiency, it still underperforms full fine-tuning, and the performance +improves at the cost of an increase in parameters. Recent efforts have either +focused on training multiple adapter experts to increase model capacity or on +pruning adapters to achieve parameter efficiency. However, both approaches +introduce more parameters compared to the original adapter, hence are not +computationally efficient. Motivated by this, we propose Mixture of Sparse +Adapters, or MoSA, as a novel Adapter Tuning method to fully unleash the +potential of each parameter in the adapter. We first split the standard adapter +into multiple non-overlapping modules, then stochastically activate them for +sparse training, and finally merge them to form a complete adapter after +tuning. In this way, MoSA can achieve significantly better performance than +standard adapters without any additional computational or storage overhead. +Furthermore, we propose a hierarchical sparse strategy to better leverage +limited training data. Extensive experiments on a series of 27 visual tasks +demonstrate that MoSA consistently outperforms other Adapter Tuning methods as +well as other baselines by a large margin. Furthermore, MoSA brings consistent +improvements across various model scales, architectures, and different PEFT +methods. Code will be released. + +
+
+ comment: 16 pages, 7 figures. Official code: + https://github.com/Theia-4869/MoSA +
+
+
+
+
+ + ♻ ☆ SimA: Simple Softmax-free Attention for Vision Transformers + + +
+ Recently, vision transformers have become very popular. However, deploying
+them in many applications is computationally expensive partly due to the
+Softmax layer in the attention block. We introduce a simple but effective
+Softmax-free attention block, SimA, which normalizes query and key matrices
+with a simple $\ell_1$-norm instead of using a Softmax layer. Then, the attention
+block in SimA is a simple multiplication of three matrices, so SimA can
+dynamically change the ordering of the computation at test time to achieve
+computation linear in the number of tokens or the number of channels. We
+empirically show that SimA applied to three SOTA variations of transformers,
+DeiT, XCiT, and CvT, results in on-par accuracy compared to the SOTA models,
+without any need for a Softmax layer. Interestingly, changing SimA from
+multi-head to single-head has only a small effect on the accuracy, which
+simplifies the attention block further. The code is available here:
+https://github.com/UCDvision/sima
+
+
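+
+ A minimal sketch of the softmax-free attention described here; the normalization axis and epsilon are our assumptions (see the released code for the exact definition).
+
+ import torch
+
+ def sima_attention(q, k, v, eps=1e-6):
+     """l1-normalize q and k, then multiply the three matrices.
+     q, k, v: (batch, tokens, dim). Computing k^T v first costs O(N * d^2),
+     i.e. linear in the number of tokens N; the other order is linear in d."""
+     q = q / (q.abs().sum(dim=-1, keepdim=True) + eps)
+     k = k / (k.abs().sum(dim=-1, keepdim=True) + eps)
+     context = k.transpose(-2, -1) @ v        # (batch, dim, dim)
+     return q @ context                       # (batch, tokens, dim)
+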
+
+ comment: Code is available here: https://github.com/UCDvision/sima +
+
+
+
+
+ + ♻ ☆ Real3D-Portrait: One-shot Realistic 3D Talking Portrait Synthesis ICLR 2024 + + +
+ One-shot 3D talking portrait generation aims to reconstruct a 3D avatar from
+an unseen image, and then animate it with a reference video or audio to
+generate a talking portrait video. The existing methods fail to simultaneously
+achieve the goals of accurate 3D avatar reconstruction and stable talking face
+animation. Besides, while the existing works mainly focus on synthesizing the
+head part, it is also vital to generate natural torso and background segments
+to obtain a realistic talking portrait video. To address these limitations, we
+present Real3D-Portrait, a framework that (1) improves the one-shot 3D
+reconstruction power with a large image-to-plane model that distills 3D prior
+knowledge from a 3D face generative model; (2) facilitates accurate
+motion-conditioned animation with an efficient motion adapter; (3) synthesizes
+realistic video with natural torso movement and switchable background using a
+head-torso-background super-resolution model; and (4) supports one-shot
+audio-driven talking face generation with a generalizable audio-to-motion
+model. Extensive experiments show that Real3D-Portrait generalizes well to
+unseen identities and generates more realistic talking portrait videos compared
+to previous methods. Video samples and source code are available at
+https://real3dportrait.github.io .
+
+
+
+ comment: ICLR 2024 (Spotlight). Project page: https://real3dportrait.github.io +
+
+
+
+
+ + ♻ ☆ GeNIe: Generative Hard Negative Images Through Diffusion + + +
+ Data augmentation is crucial in training deep models, preventing them from
+overfitting to limited data. Recent advances in generative AI, e.g., diffusion
+models, have enabled more sophisticated augmentation techniques that produce
+data resembling natural images. We introduce GeNIe, a novel augmentation method
+that leverages a latent diffusion model conditioned on a text prompt to merge
+contrasting data points (an image from the source category and a text prompt
+from the target category) to generate challenging samples. To achieve this,
+inspired by recent diffusion-based image editing techniques, we limit the
+number of diffusion iterations to ensure the generated image retains low-level
+and background features from the source image while representing the target
+category, resulting in a hard negative sample for the source category. We
+further enhance the proposed approach by finding the appropriate noise level
+adaptively for each image (coined GeNIe-Ada), leading to further performance
+improvement. Our extensive experiments, in both few-shot and long-tail
+distribution settings, demonstrate the effectiveness of our novel augmentation
+method and its superior performance over the prior art. Our code is available
+here: https://github.com/UCDvision/GeNIe
+
+
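+
+ The "limit the number of diffusion iterations" idea can be approximated with an off-the-shelf img2img pipeline by using a low strength, as in this sketch; the model id, prompt, and strength value are illustrative and this is not the authors' implementation.
+
+ import torch
+ from diffusers import StableDiffusionImg2ImgPipeline
+ from PIL import Image
+
+ pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
+
+ source = Image.open("source_category_image.jpg").convert("RGB")  # e.g. an image of a cat
+ target_prompt = "a photo of a dog"                               # target category
+
+ # A low strength runs only the last fraction of the denoising steps, so the result
+ # keeps the source image's background and low-level structure while the prompt
+ # pushes semantics toward the target class -> a hard negative for the source class.
+ hard_negative = pipe(prompt=target_prompt, image=source,
+                      strength=0.4, guidance_scale=7.5).images[0]
+ hard_negative.save("hard_negative.png")
+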
+
+ comment: Our code is available https://github.com/UCDvision/GeNIe +
+
+
+
+
+ + ♻ ☆ Emotion Recognition Using Transformers with Masked Learning + + +
+ In recent years, deep learning has achieved innovative advancements in +various fields, including the analysis of human emotions and behaviors. +Initiatives such as the Affective Behavior Analysis in-the-wild (ABAW) +competition have been particularly instrumental in driving research in this +area by providing diverse and challenging datasets that enable precise +evaluation of complex emotional states. This study leverages the Vision +Transformer (ViT) and Transformer models to focus on the estimation of +Valence-Arousal (VA), which signifies the positivity and intensity of emotions, +recognition of various facial expressions, and detection of Action Units (AU) +representing fundamental muscle movements. This approach transcends traditional +Convolutional Neural Networks (CNNs) and Long Short-Term Memory (LSTM) based +methods, proposing a new Transformer-based framework that maximizes the +understanding of temporal and spatial features. The core contributions of this +research include the introduction of a learning technique through random frame +masking and the application of Focal loss adapted for imbalanced data, +enhancing the accuracy and applicability of emotion and behavior analysis in +real-world settings. This approach is expected to contribute to the advancement +of emotional computing and deep learning methodologies. + +
+
+
+
+
+ + ♻ ☆ Using Human Feedback to Fine-tune Diffusion Models without Any Reward + Model CVPR 2024 + + +
+ Using reinforcement learning with human feedback (RLHF) has shown significant
+promise in fine-tuning diffusion models. Previous methods start by training a
+reward model that aligns with human preferences, then leverage RL techniques to
+fine-tune the underlying models. However, crafting an efficient reward model
+demands extensive datasets, optimal architecture, and manual hyperparameter
+tuning, making the process both time- and cost-intensive. The direct preference
+optimization (DPO) method, effective in fine-tuning large language models,
+eliminates the necessity for a reward model. However, the extensive GPU memory
+requirement of the diffusion model's denoising process hinders the direct
+application of the DPO method. To address this issue, we introduce the Direct
+Preference for Denoising Diffusion Policy Optimization (D3PO) method to
+directly fine-tune diffusion models. The theoretical analysis demonstrates that
+although D3PO omits training a reward model, it effectively functions as the
+optimal reward model trained using human feedback data to guide the learning
+process. This approach requires no training of a reward model, making it more
+direct and cost-effective while minimizing computational overhead. In
+experiments, our method uses the relative scale of objectives as a proxy for
+human preference, delivering comparable results to methods using ground-truth
+rewards. Moreover, D3PO demonstrates the ability to reduce image distortion
+rates and generate safer images, overcoming challenges in settings that lack
+robust reward models. Our code is publicly available at
+https://github.com/yk7333/D3PO.
+
+
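+
+ For intuition, a generic DPO-style preference loss looks like the sketch below; for a diffusion model the log-probabilities would be accumulated over denoising steps, so this is a simplification, not the exact D3PO objective.
+
+ import torch.nn.functional as F
+
+ def dpo_style_loss(logp_w, logp_l, ref_logp_w, ref_logp_l, beta=0.1):
+     """Preference loss from log-probabilities of the preferred (w) and rejected (l)
+     generations under the current model and a frozen reference model."""
+     advantage = beta * ((logp_w - ref_logp_w) - (logp_l - ref_logp_l))
+     return -F.logsigmoid(advantage).mean()
+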
+
+ comment: CVPR 2024 accepted; huggingface daily paper +
+
+
+
+
+ + ♻ ☆ Visual Programming for Zero-shot Open-Vocabulary 3D Visual Grounding CVPR 2024 + + +
+ 3D Visual Grounding (3DVG) aims at localizing 3D objects based on textual
+descriptions. Conventional supervised methods for 3DVG often necessitate
+extensive annotations and a predefined vocabulary, which can be restrictive. To
+address this issue, we propose a novel visual programming approach for
+zero-shot open-vocabulary 3DVG, leveraging the capabilities of large language
+models (LLMs). Our approach begins with a unique dialog-based method, engaging
+with LLMs to establish a foundational understanding of zero-shot 3DVG. Building
+on this, we design a visual program that consists of three types of modules,
+i.e., view-independent, view-dependent, and functional modules. These modules,
+specifically tailored for 3D scenarios, work collaboratively to perform complex
+reasoning and inference. Furthermore, we develop an innovative language-object
+correlation module to extend the scope of existing 3D object detectors into
+open-vocabulary scenarios. Extensive experiments demonstrate that our zero-shot
+approach can outperform some supervised baselines, marking a significant stride
+towards effective 3DVG.
+
+
+
+ comment: Accepted by CVPR 2024, project website: + https://curryyuan.github.io/ZSVG3D/ +
+
+
+
+
+ + ♻ ☆ EvalCrafter: Benchmarking and Evaluating Large Video Generation Models + + +
+ Vision and language generative models have grown rapidly in recent
+years. For video generation, various open-sourced models and publicly available
+services have been developed to generate high-quality videos. However, these
+methods often use a few metrics, e.g., FVD or IS, to evaluate the performance.
+We argue that it is hard to judge the large conditional generative models from
+the simple metrics since these models are often trained on very large datasets
+with multi-aspect abilities. Thus, we propose a novel framework and pipeline
+for exhaustively evaluating the performance of the generated videos. Our
+approach involves generating a diverse and comprehensive list of 700 prompts
+for text-to-video generation, which is based on an analysis of real-world user
+data and generated with the assistance of a large language model. Then, we
+evaluate the state-of-the-art video generative models on our carefully designed
+benchmark, in terms of visual quality, content quality, motion quality,
+and text-video alignment with 17 well-selected objective metrics. To obtain the
+final leaderboard of the models, we further fit a series of coefficients to
+align the objective metrics to the users' opinions. Based on the proposed human
+alignment method, our final score shows a higher correlation with human judgment
+than simply averaging the metrics, showing the effectiveness of the proposed
+evaluation method.
+
+
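+
+ The "fit a series of coefficients to align the objective metrics to the users' opinions" step can be realized with ordinary least squares, as in this sketch; the fitting procedure shown is our assumption, for illustration only.
+
+ import numpy as np
+
+ def fit_metric_weights(metric_scores, human_scores):
+     """metric_scores: (num_models, num_metrics); human_scores: (num_models,).
+     Returns weights (plus a bias term) whose weighted sum of metrics best
+     matches human opinion in the least-squares sense."""
+     X = np.hstack([metric_scores, np.ones((metric_scores.shape[0], 1))])
+     w, *_ = np.linalg.lstsq(X, human_scores, rcond=None)
+     return w
+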
+
+ comment: Technical Report, Project page: https://evalcrafter.github.io/ +
+
+
+
+
+ + ♻ ☆ DPoser: Diffusion Model as Robust 3D Human Pose Prior + + +
+ This work aims to construct a robust human pose prior, which remains
+a persistent challenge due to biomechanical constraints and diverse human
+movements. Traditional priors like VAEs and NDFs often exhibit shortcomings in
+realism and generalization, notably with unseen noisy poses. To address these
+issues, we introduce DPoser, a robust and versatile human pose prior built upon
+diffusion models. DPoser regards various pose-centric tasks as inverse problems
+and employs variational diffusion sampling for efficient solving. Accordingly,
+designed to work with optimization frameworks, DPoser seamlessly benefits human mesh
+recovery, pose generation, pose completion, and motion denoising tasks.
+Furthermore, due to the disparity between articulated poses and structured
+images, we propose truncated timestep scheduling to enhance the effectiveness
+of DPoser. Our approach demonstrates considerable enhancements over the common
+uniform scheduling used in image domains, boasting improvements of 5.4%, 17.2%,
+and 3.8% across human mesh recovery, pose completion, and motion denoising,
+respectively. Comprehensive experiments demonstrate the superiority of DPoser
+over existing state-of-the-art pose priors across multiple tasks.
+
+
+
+ comment: Project Page: https://dposer.github.io; Code Released: + https://github.com/moonbow721/DPoser +
+
+
+
+
+ + ♻ ☆ On the Privacy Effect of Data Enhancement via the Lens of Memorization + + +
+ Machine learning poses severe privacy concerns as it has been shown that the +learned models can reveal sensitive information about their training data. Many +works have investigated the effect of widely adopted data augmentation and +adversarial training techniques, termed data enhancement in the paper, on the +privacy leakage of machine learning models. Such privacy effects are often +measured by membership inference attacks (MIAs), which aim to identify whether +a particular example belongs to the training set or not. We propose to +investigate privacy from a new perspective called memorization. Through the +lens of memorization, we find that previously deployed MIAs produce misleading +results as they are less likely to identify samples with higher privacy risks +as members compared to samples with low privacy risks. To solve this problem, +we deploy a recent attack that can capture individual samples' memorization +degrees for evaluation. Through extensive experiments, we unveil several +findings about the connections between three essential properties of machine +learning models, including privacy, generalization gap, and adversarial +robustness. We demonstrate that the generalization gap and privacy leakage are +less correlated than those of the previous results. Moreover, there is not +necessarily a trade-off between adversarial robustness and privacy as stronger +adversarial robustness does not make the model more susceptible to privacy +attacks. + +
+
+ comment: Accepted by IEEE TIFS, 17 pages +
+
+
+
+
+ + ♻ ☆ SurgicalPart-SAM: Part-to-Whole Collaborative Prompting for Surgical + Instrument Segmentation + + +
+ The Segment Anything Model (SAM) exhibits promise in generic object +segmentation and offers potential for various applications. Existing methods +have applied SAM to surgical instrument segmentation (SIS) by tuning SAM-based +frameworks with surgical data. However, they fall short in two crucial aspects: +(1) Straightforward model tuning with instrument masks treats each instrument +as a single entity, neglecting their complex structures and fine-grained +details; and (2) Instrument category-based prompts are not flexible and +informative enough to describe instrument structures. To address these +problems, in this paper, we investigate text promptable SIS and propose +SurgicalPart-SAM (SP-SAM), a novel SAM efficient-tuning approach that +explicitly integrates instrument structure knowledge with SAM's generic +knowledge, guided by expert knowledge on instrument part compositions. +Specifically, we achieve this by proposing (1) Collaborative Prompts that +describe instrument structures via collaborating category-level and part-level +texts; (2) Cross-Modal Prompt Encoder that encodes text prompts jointly with +visual embeddings into discriminative part-level representations; and (3) +Part-to-Whole Adaptive Fusion and Hierarchical Decoding that adaptively fuse +the part-level representations into a whole for accurate instrument +segmentation in surgical scenarios. Built upon them, SP-SAM acquires a better +capability to comprehend surgical instruments in terms of both overall +structure and part-level details. Extensive experiments on both the EndoVis2018 +and EndoVis2017 datasets demonstrate SP-SAM's state-of-the-art performance with +minimal tunable parameters. The code will be available at +https://github.com/wenxi-yue/SurgicalPart-SAM. + +
+
+ comment: Technical Report. The source code will be released at + https://github.com/wenxi-yue/SurgicalPart-SAM +
+
+
+
+
+ + ♻ ☆ LCV2: An Efficient Pretraining-Free Framework for Grounded Visual + Question Answering + + +
+ In this paper, the LCV2 modular method is proposed for the Grounded Visual
+Question Answering task in the vision-language multimodal domain. This approach
+relies on a frozen large language model (LLM) as an intermediate mediator between
+an off-the-shelf VQA model and an off-the-shelf visual grounding (VG) model,
+where the LLM transforms and conveys textual information between the two
+modules based on a designed prompt. LCV2 establishes an integrated plug-and-play
+framework without the need for any pre-training process. This framework can be
+deployed for VQA Grounding tasks under low computational resources. The
+modularized model within the framework allows application with various
+state-of-the-art pre-trained models, exhibiting significant potential to
+advance with the times. Experimental implementations were conducted under
+constrained computational and memory resources, evaluating the proposed
+method's performance on benchmark datasets including GQA, CLEVR, and
+VizWiz-VQA-Grounding. Comparative analyses with baseline methods demonstrate
+the robust competitiveness of LCV2.
+
+
+
+ comment: 21 pages,9 figures +
+
+
+
+
+ + ♻ ☆ Surf-D: Generating High-Quality Surfaces of Arbitrary Topologies Using + Diffusion Models + + +
+ We present Surf-D, a novel method for generating high-quality 3D shapes as +Surfaces with arbitrary topologies using Diffusion models. Previous methods +explored shape generation with different representations and they suffer from +limited topologies and poor geometry details. To generate high-quality surfaces +of arbitrary topologies, we use the Unsigned Distance Field (UDF) as our +surface representation to accommodate arbitrary topologies. Furthermore, we +propose a new pipeline that employs a point-based AutoEncoder to learn a +compact and continuous latent space for accurately encoding UDF and support +high-resolution mesh extraction. We further show that our new pipeline +significantly outperforms the prior approaches to learning the distance fields, +such as the grid-based AutoEncoder, which is not scalable and incapable of +learning accurate UDF. In addition, we adopt a curriculum learning strategy to +efficiently embed various surfaces. With the pretrained shape latent space, we +employ a latent diffusion model to acquire the distribution of various shapes. +Extensive experiments are presented on using Surf-D for unconditional +generation, category conditional generation, image conditional generation, and +text-to-shape tasks. The experiments demonstrate the superior performance of +Surf-D in shape generation across multiple modalities as conditions. Visit our +project page at https://yzmblog.github.io/projects/SurfD/. + +
+
+ comment: Project Page: https://yzmblog.github.io/projects/SurfD/ +
+
+
+
+
+ + ♻ ☆ Supporting Vision-Language Model Inference with Confounder-pruning + Knowledge Prompt + + +
+ Vision-language models are pre-trained by aligning image-text pairs in a +common space to deal with open-set visual concepts. To boost the +transferability of the pre-trained models, recent works adopt fixed or +learnable prompts, i.e., classification weights are synthesized from natural +language describing task-relevant categories, to reduce the gap between tasks +in the training and test phases. However, how and what prompts can improve +inference performance remains unclear. In this paper, we explicitly clarify the +importance of including semantic information in prompts, while existing +prompting methods generate prompts without exploring the semantic information +of textual labels. Manually constructing prompts with rich semantics requires +domain expertise and is extremely time-consuming. To cope with this issue, we +propose a semantic-aware prompt learning method, namely CPKP, which retrieves +an ontological knowledge graph by treating the textual label as a query to +extract task-relevant semantic information. CPKP further introduces a +double-tier confounder-pruning procedure to refine the derived semantic +information. The graph-tier confounders are gradually identified and phased +out, inspired by the principle of Granger causality. The feature-tier +confounders are demolished by following the maximum entropy principle in +information theory. Empirically, the evaluations demonstrate the effectiveness +of CPKP, e.g., with two shots, CPKP outperforms the manual-prompt method by +4.64% and the learnable-prompt method by 1.09% on average, and the superiority +of CPKP in domain generalization compared to benchmark approaches. Our +implementation is available at https://github.com/Mowenyii/CPKP. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 183 + +
+
+
+ + ☆ DiffusionMTL: Learning Multi-Task Denoising Diffusion Model from + Partially Annotated Data CVPR 2024 + + +
+ Recently, there has been an increased interest in the practical problem of
+learning multiple dense scene understanding tasks from partially annotated
+data, where each training sample is only labeled for a subset of the tasks. The
+absence of task labels in training leads to low-quality and noisy predictions,
+as can be observed in state-of-the-art methods. To tackle this issue, we
+reformulate partially-labeled multi-task dense prediction as a pixel-level
+denoising problem, and propose a novel multi-task denoising diffusion framework
+coined as DiffusionMTL. It designs a joint diffusion and denoising paradigm to
+model a potential noisy distribution in the task prediction or feature maps and
+generate rectified outputs for different tasks. To exploit multi-task
+consistency in denoising, we further introduce a Multi-Task Conditioning
+strategy, which can implicitly utilize the complementary nature of the tasks to
+help learn the unlabeled tasks, leading to an improvement in the denoising
+performance of the different tasks. Extensive quantitative and qualitative
+experiments demonstrate that the proposed multi-task denoising diffusion model
+can significantly improve multi-task prediction maps, and outperform the
+state-of-the-art methods on three challenging multi-task benchmarks, under two
+different partial-labeling evaluation settings. The code is available at
+https://prismformore.github.io/diffusionmtl/.
+
+
+
+ comment: The paper is accepted by CVPR 2024 +
+
+
+
+
+ + ☆ LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal + Models + + +
+ Large Multimodal Models (LMMs) have shown significant reasoning capabilities +by connecting a visual encoder and a large language model. LMMs typically use a +fixed amount of visual tokens, such as the penultimate layer features in the +CLIP visual encoder, as the prefix content. Recent LMMs incorporate more +complex visual inputs, such as high-resolution images and videos, which +increase the number of visual tokens significantly. However, due to the design +of the Transformer architecture, computational costs associated with these +models tend to increase quadratically with the number of input tokens. To +tackle this problem, we explore a token reduction mechanism and find, similar +to prior work, that many visual tokens are spatially redundant. Based on this, +we propose PruMerge, a novel adaptive visual token reduction approach, which +largely reduces the number of visual tokens while maintaining comparable model +performance. We first select the unpruned visual tokens based on their +similarity to class tokens and spatial tokens. We then cluster the pruned +tokens based on key similarity and merge the clustered tokens with the unpruned +tokens to supplement their information. Empirically, when applied to LLaVA-1.5, +our approach can compress the visual tokens by 14.4 times on average, and +achieve comparable performance across diverse visual question-answering and +reasoning tasks. Code and checkpoints are at https://llava-prumerge.github.io/. + +
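+
+ A simplified sketch of this select-then-merge idea; the selection rule, similarity measure, and merging rule here are our assumptions, not the exact PruMerge algorithm.
+
+ import torch
+ import torch.nn.functional as F
+
+ def prune_and_merge(tokens, cls_attn, keys, keep_ratio=0.25):
+     """tokens: (N, D) visual tokens, cls_attn: (N,) attention of the class
+     token to each visual token, keys: (N, D) key vectors."""
+     n_keep = max(1, int(keep_ratio * tokens.shape[0]))
+     keep = torch.zeros(tokens.shape[0], dtype=torch.bool, device=tokens.device)
+     keep[cls_attn.topk(n_keep).indices] = True
+
+     kept, pruned = tokens[keep].clone(), tokens[~keep]
+     sim = F.normalize(keys[~keep], dim=-1) @ F.normalize(keys[keep], dim=-1).T
+     assign = sim.argmax(dim=-1)              # nearest kept token for each pruned token
+     for j in range(n_keep):                  # fold pruned tokens into their kept token
+         group = pruned[assign == j]
+         if len(group) > 0:
+             kept[j] = (kept[j] + group.sum(0)) / (1 + len(group))
+     return kept
+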
+
+ comment: Project page: https://llava-prumerge.github.io/ +
+
+
+
+
+ + ☆ LATTE3D: Large-scale Amortized Text-To-Enhanced3D Synthesis + + +
+ Recent text-to-3D generation approaches produce impressive 3D results but +require time-consuming optimization that can take up to an hour per prompt. +Amortized methods like ATT3D optimize multiple prompts simultaneously to +improve efficiency, enabling fast text-to-3D synthesis. However, they cannot +capture high-frequency geometry and texture details and struggle to scale to +large prompt sets, so they generalize poorly. We introduce LATTE3D, addressing +these limitations to achieve fast, high-quality generation on a significantly +larger prompt set. Key to our method is 1) building a scalable architecture and +2) leveraging 3D data during optimization through 3D-aware diffusion priors, +shape regularization, and model initialization to achieve robustness to diverse +and complex training prompts. LATTE3D amortizes both neural field and textured +surface generation to produce highly detailed textured meshes in a single +forward pass. LATTE3D generates 3D objects in 400ms, and can be further +enhanced with fast test-time optimization. + +
+
+ comment: See the project website at + https://research.nvidia.com/labs/toronto-ai/LATTE3D/ +
+
+
+
+
+ + ☆ ThemeStation: Generating Theme-Aware 3D Assets from Few Exemplars + + +
+ Real-world applications often require a large gallery of 3D assets that share
+a consistent theme. While remarkable advances have been made in general 3D
+content creation from text or images, synthesizing customized 3D assets
+following the shared theme of input 3D exemplars remains an open and
+challenging problem. In this work, we present ThemeStation, a novel approach
+for theme-aware 3D-to-3D generation. ThemeStation synthesizes customized 3D
+assets based on a few given exemplars with two goals: 1) unity for generating 3D
+assets that thematically align with the given exemplars and 2) diversity for
+generating 3D assets with a high degree of variation. To this end, we design a
+two-stage framework that draws a concept image first, followed by a
+reference-informed 3D modeling stage. We propose a novel dual score
+distillation (DSD) loss to jointly leverage priors from both the input
+exemplars and the synthesized concept image. Extensive experiments and user
+studies confirm that ThemeStation surpasses prior works in producing diverse
+theme-aware 3D models with impressive quality. ThemeStation also enables
+various applications such as controllable 3D-to-3D generation.
+
+
+
+ comment: Project page: https://3dthemestation.github.io/ +
+
+
+
+
+ + ☆ DragAPart: Learning a Part-Level Motion Prior for Articulated Objects + + +
+ We introduce DragAPart, a method that, given an image and a set of drags as +input, can generate a new image of the same object in a new state, compatible +with the action of the drags. Differently from prior works that focused on +repositioning objects, DragAPart predicts part-level interactions, such as +opening and closing a drawer. We study this problem as a proxy for learning a +generalist motion model, not restricted to a specific kinematic structure or +object category. To this end, we start from a pre-trained image generator and +fine-tune it on a new synthetic dataset, Drag-a-Move, which we introduce. +Combined with a new encoding for the drags and dataset randomization, the new +model generalizes well to real images and different categories. Compared to +prior motion-controlled generators, we demonstrate much better part-level +motion understanding. + +
+
+ comment: Project page: https://dragapart.github.io/ +
+
+
+
+
+ + ☆ Long-CLIP: Unlocking the Long-Text Capability of CLIP + + +
+ Contrastive Language-Image Pre-training (CLIP) has been the cornerstone for
+zero-shot classification, text-image retrieval, and text-image generation by
+aligning image and text modalities. Despite its widespread adoption, a
+significant limitation of CLIP lies in the inadequate length of text input. The
+text token length is restricted to 77, and an empirical study shows the
+actual effective length is even less than 20. This prevents CLIP from handling
+detailed descriptions, limiting its applications for image retrieval and
+text-to-image generation with extensive prerequisites. To this end, we propose
+Long-CLIP as a plug-and-play alternative to CLIP that supports long-text input,
+retains or even surpasses its zero-shot generalizability, and aligns the CLIP
+latent space, allowing it to readily replace CLIP without any further adaptation in
+downstream frameworks. Nevertheless, achieving this goal is far from
+straightforward, as simplistic fine-tuning can result in a significant
+degradation of CLIP's performance. Moreover, substituting the text encoder with
+a language model supporting longer contexts necessitates pretraining with vast
+amounts of data, incurring significant expenses. Accordingly, Long-CLIP
+introduces an efficient fine-tuning solution on CLIP with two novel strategies
+designed to maintain the original capabilities, including (1) a
+knowledge-preserved stretching of positional embedding and (2) a primary
+component matching of CLIP features. By leveraging just one million extra
+long text-image pairs, Long-CLIP outperforms CLIP by about
+20% in long-caption text-image retrieval and by 6% in traditional text-image
+retrieval tasks, e.g., COCO and Flickr30k. Furthermore, Long-CLIP offers
+enhanced capabilities for generating images from detailed text descriptions by
+replacing CLIP in a plug-and-play manner.
+
+
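+
+ A minimal sketch of what "knowledge-preserved stretching of positional embedding" could look like, keeping the first well-trained positions fixed and interpolating the rest; the split point and interpolation mode are our assumptions, not the paper's exact recipe.
+
+ import torch
+ import torch.nn.functional as F
+
+ def stretch_positional_embedding(pos_embed, new_len, keep=20):
+     """pos_embed: (old_len, dim). Keep the first `keep` positions untouched
+     (roughly the effective length noted above) and linearly interpolate the
+     remaining positions to fill the longer context."""
+     head, tail = pos_embed[:keep], pos_embed[keep:]
+     tail = F.interpolate(tail.T.unsqueeze(0), size=new_len - keep,
+                          mode="linear", align_corners=True)
+     return torch.cat([head, tail.squeeze(0).T], dim=0)   # (new_len, dim)
+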
+
+ comment: All codes and models are publicly available at + https://github.com/beichenzbc/Long-CLIP +
+
+
+
+
+ + ☆ InternVideo2: Scaling Video Foundation Models for Multimodal Video + Understanding + + +
+ We introduce InternVideo2, a new video foundation model (ViFM) that achieves +the state-of-the-art performance in action recognition, video-text tasks, and +video-centric dialogue. Our approach employs a progressive training paradigm +that unifies the different self- or weakly-supervised learning frameworks of +masked video token reconstruction, cross-modal contrastive learning, and next +token prediction. Different training stages would guide our model to capture +different levels of structure and semantic information through different +pretext tasks. At the data level, we prioritize the spatiotemporal consistency +by semantically segmenting videos and generating video-audio-speech captions. +This improves the alignment between video and text. We scale both data and +model size for our InternVideo2. Through extensive experiments, we validate our +designs and demonstrate the state-of-the-art performance on over 60 video and +audio tasks. Notably, our model outperforms others on various video-related +captioning, dialogue, and long video understanding benchmarks, highlighting its +ability to reason and comprehend long temporal contexts. Code and models are +available at https://github.com/OpenGVLab/InternVideo2/. + +
+
+ comment: a technical report about video understanding +
+
+
+
+
+ + ☆ Augmented Reality based Simulated Data (ARSim) with multi-view + consistency for AV perception networks + + +
+ Detecting a diverse range of objects under various driving scenarios is +essential for the effectiveness of autonomous driving systems. However, the +real-world data collected often lacks the necessary diversity presenting a +long-tail distribution. Although synthetic data has been utilized to overcome +this issue by generating virtual scenes, it faces hurdles such as a significant +domain gap and the substantial efforts required from 3D artists to create +realistic environments. To overcome these challenges, we present ARSim, a fully +automated, comprehensive, modular framework designed to enhance real multi-view +image data with 3D synthetic objects of interest. The proposed method +integrates domain adaptation and randomization strategies to address covariate +shift between real and simulated data by inferring essential domain attributes +from real data and employing simulation-based randomization for other +attributes. We construct a simplified virtual scene using real data and +strategically place 3D synthetic assets within it. Illumination is achieved by +estimating light distribution from multiple images capturing the surroundings +of the vehicle. Camera parameters from real data are employed to render +synthetic assets in each frame. The resulting augmented multi-view consistent +dataset is used to train a multi-camera perception network for autonomous +vehicles. Experimental results on various AV perception tasks demonstrate the +superior performance of networks trained on the augmented dataset. + +
+
+ comment: 17 pages, 15 figures, 7 tables +
+
+
+
+
+ + ☆ Learning Topological Representations for Deep Image Understanding + + +
+ In many scenarios, especially biomedical applications, the correct +delineation of complex fine-scaled structures such as neurons, tissues, and +vessels is critical for downstream analysis. Despite the strong predictive +power of deep learning methods, they do not provide a satisfactory +representation of these structures, thus creating significant barriers in +scalable annotation and downstream analysis. In this dissertation, we tackle +such challenges by proposing novel representations of these topological +structures in a deep learning framework. We leverage the mathematical tools +from topological data analysis, i.e., persistent homology and discrete Morse +theory, to develop principled methods for better segmentation and uncertainty +estimation, which will become powerful tools for scalable annotation. + +
+
+ comment: Ph.D. thesis from Stony Brook University. This thesis includes works + arXiv:1906.05404, arXiv:2110.08335, arXiv:2112.07812, arXiv:2103.09992, + arXiv:2206.01742 +
+
+
+
+
+ + ☆ SiMBA: Simplified Mamba-Based Architecture for Vision and Multivariate + Time series + + +
+ Transformers have widely adopted attention networks for sequence mixing and
+MLPs for channel mixing, playing a pivotal role in achieving breakthroughs
+across domains. However, recent literature highlights issues with attention
+networks, including low inductive bias and quadratic complexity concerning
+input sequence length. State Space Models (SSMs) like S4 and others (Hippo,
+Global Convolutions, liquid S4, LRU, Mega, and Mamba) have emerged to address
+the above issues and help handle longer sequence lengths. Mamba, while being the
+state-of-the-art SSM, has a stability issue when scaled to large networks for
+computer vision datasets. We propose SiMBA, a new architecture that introduces
+Einstein FFT (EinFFT) for channel modeling by specific eigenvalue computations
+and uses the Mamba block for sequence modeling. Extensive performance studies
+across image and time-series benchmarks demonstrate that SiMBA outperforms
+existing SSMs, bridging the performance gap with state-of-the-art transformers.
+Notably, SiMBA establishes itself as the new state-of-the-art SSM on ImageNet,
+transfer learning benchmarks such as Stanford Car and Flower, task learning
+benchmarks, and seven time series benchmark datasets. The project page is
+available at \url{https://github.com/badripatro/Simba}.
+
+
+
+
+
+
+ + ☆ Neural Plasticity-Inspired Foundation Model for Observing the Earth + Crossing Modalities + + +
+ The development of foundation models has revolutionized our ability to +interpret the Earth's surface using satellite observational data. Traditional +models have been siloed, tailored to specific sensors or data types like +optical, radar, and hyperspectral, each with its own unique characteristics. +This specialization hinders the potential for a holistic analysis that could +benefit from the combined strengths of these diverse data sources. Our novel +approach introduces the Dynamic One-For-All (DOFA) model, leveraging the +concept of neural plasticity in brain science to integrate various data +modalities into a single framework adaptively. This dynamic hypernetwork, +adjusting to different wavelengths, enables a single versatile Transformer +jointly trained on data from five sensors to excel across 12 distinct Earth +observation tasks, including sensors never seen during pretraining. DOFA's +innovative design offers a promising leap towards more accurate, efficient, and +unified Earth observation analysis, showcasing remarkable adaptability and +performance in harnessing the potential of multimodal Earth observation data. + +
+
+ comment: 33 pages, 10 figures +
+
+
+
+
+ + ☆ Fully automated workflow for the design of patient-specific orthopaedic + implants: application to total knee arthroplasty + + +
+ Arthroplasty is commonly performed to treat joint osteoarthritis, reducing
+pain and improving mobility. While arthroplasty has seen several technical
+improvements, a significant share of patients are still unsatisfied with their
+surgery. Personalised arthroplasty improves surgical outcomes; however, current
+solutions introduce delays, making them difficult to integrate into clinical routine.
+We propose a fully automated workflow to design patient-specific implants,
+presented for total knee arthroplasty, currently the most widely performed
+arthroplasty in the world.
+ The proposed pipeline first uses artificial neural networks to segment the
+proximal and distal extremities of the femur and tibia. Then the full bones are
+reconstructed using augmented statistical shape models, combining shape and
+landmark information. Finally, 77 morphological parameters are computed to
+design patient-specific implants. The developed workflow has been trained using
+91 CT scans of the lower limb and evaluated on 41 manually segmented CT scans, in
+terms of accuracy and execution time.
+ The workflow accuracy was $0.4\pm0.2mm$ for the segmentation, $1.2\pm0.4mm$
+for the full bone reconstruction, and $2.8\pm2.2mm$ for the anatomical
+landmark determination. The custom implants fitted the patients' anatomy with
+$0.6\pm0.2mm$ accuracy. The whole process from segmentation to implant design
+lasted about 5 minutes.
+ The proposed workflow allows for a fast and reliable personalisation of knee
+implants, directly from the patient CT image without requiring any manual
+intervention. It establishes a patient-specific pre-operative plan for TKA
+in a very short time, making it easily available for all patients. Combined with
+efficient implant manufacturing techniques, this solution could help meet the
+growing demand for arthroplasties while reducing complications and improving
+patients' satisfaction.
+
+
+
+
+
+
+ + ☆ Selectively Informative Description can Reduce Undesired Embedding + Entanglements in Text-to-Image Personalization CVPR 2024 + + +
+ In text-to-image personalization, a timely and crucial challenge is the +tendency of generated images overfitting to the biases present in the reference +images. We initiate our study with a comprehensive categorization of the biases +into background, nearby-object, tied-object, substance (in style +re-contextualization), and pose biases. These biases manifest in the generated +images due to their entanglement into the subject embedding. This undesired +embedding entanglement not only results in the reflection of biases from the +reference images into the generated images but also notably diminishes the +alignment of the generated images with the given generation prompt. To address +this challenge, we propose SID~(Selectively Informative Description), a text +description strategy that deviates from the prevalent approach of only +characterizing the subject's class identification. SID is generated utilizing +multimodal GPT-4 and can be seamlessly integrated into optimization-based +models. We present comprehensive experimental results along with analyses of +cross-attention maps, subject-alignment, non-subject-disentanglement, and +text-alignment. + +
+
+ comment: Published at CVPR 2024 +
+
+
+
+
+ + ☆ Point-DETR3D: Leveraging Imagery Data with Spatial Point Prior for + Weakly Semi-supervised 3D Object Detection AAAI2024 + + +
+ Training high-accuracy 3D detectors necessitates massive labeled 3D
+annotations with 7 degrees of freedom, which is laborious and time-consuming.
+Therefore, the form of point annotations is proposed to offer significant
+prospects for practical applications in 3D detection, which is not only more
+accessible and less expensive but also provides strong spatial information for
+object localization. In this paper, we empirically discover that it is
+non-trivial to merely adapt Point-DETR to its 3D form, encountering two main
+bottlenecks: 1) it fails to encode strong 3D prior into the model, and 2) it
+generates low-quality pseudo labels in distant regions due to the extreme
+sparsity of LiDAR points. To overcome these challenges, we introduce
+Point-DETR3D, a teacher-student framework for weakly semi-supervised 3D
+detection, designed to fully capitalize on point-wise supervision within a
+constrained instance-wise annotation budget. Different from Point-DETR, which
+encodes 3D positional information solely through a point encoder, we propose an
+explicit positional query initialization strategy to enhance the positional
+prior. Considering the low quality of pseudo labels at distant regions produced
+by the teacher model, we enhance the detector's perception by incorporating
+dense imagery data through a novel Cross-Modal Deformable RoI Fusion
+(D-RoI). Moreover, an innovative point-guided self-supervised learning technique
+is proposed to allow for fully exploiting point priors, even in student
+models. Extensive experiments on the representative nuScenes dataset demonstrate
+that our Point-DETR3D obtains significant improvements compared to previous works.
+Notably, with only 5% of labeled data, Point-DETR3D achieves over 90% of the
+performance of its fully supervised counterpart.
+
+
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ☆ Ultrasound Imaging based on the Variance of a Diffusion Restoration + Model + + +
+ Despite today's prevalence of ultrasound imaging in medicine, ultrasound +signal-to-noise ratio is still affected by several sources of noise and +artefacts. Moreover, enhancing ultrasound image quality involves balancing +concurrent factors like contrast, resolution, and speckle preservation. +Recently, there has been progress in both model-based and learning-based +approaches addressing the problem of ultrasound image reconstruction. Bringing +the best from both worlds, we propose a hybrid reconstruction method combining +an ultrasound linear direct model with a learning-based prior coming from a +generative Denoising Diffusion model. More specifically, we rely on the +unsupervised fine-tuning of a pre-trained Denoising Diffusion Restoration Model +(DDRM). Given the nature of multiplicative noise inherent to ultrasound, this +paper proposes an empirical model to characterize the stochasticity of +diffusion reconstruction of ultrasound images, and shows the interest of its +variance as an echogenicity map estimator. We conduct experiments on synthetic, +in-vitro, and in-vivo data, demonstrating the efficacy of our variance imaging +approach in achieving high-quality image reconstructions from single plane-wave +acquisitions and in comparison to state-of-the-art methods. + +
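+
+ The variance-based echogenicity estimate can be summarized by the following sketch, where restore_fn stands for one stochastic diffusion reconstruction (a hypothetical callable, not the authors' API).
+
+ import torch
+
+ def diffusion_variance_image(restore_fn, y, n_samples=16):
+     """Run several stochastic restorations of the same measurement y and use
+     the pixel-wise variance across them as an echogenicity map estimator."""
+     samples = torch.stack([restore_fn(y) for _ in range(n_samples)], dim=0)
+     return samples.mean(dim=0), samples.var(dim=0)   # (restored image, variance map)
+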
+
+ comment: 5 pages; submitted to EUSIPCO 2024. arXiv admin note: text overlap + with arXiv:2310.20618 +
+
+
+
+
+ + ☆ Global Control for Local SO(3)-Equivariant Scale-Invariant Vessel + Segmentation + + +
+ Personalized 3D vascular models can aid in a range of diagnostic, prognostic, +and treatment-planning tasks relevant to cardiovascular disease management. +Deep learning provides a means to automatically obtain such models. Ideally, a +user should have control over the exact region of interest (ROI) to be included +in a vascular model, and the model should be watertight and highly accurate. To +this end, we propose a combination of a global controller leveraging voxel mask +segmentations to provide boundary conditions for vessels of interest to a +local, iterative vessel segmentation model. We introduce the preservation of +scale- and rotational symmetries in the local segmentation model, leading to +generalisation to vessels of unseen sizes and orientations. Combined with the +global controller, this enables flexible 3D vascular model building, without +additional retraining. We demonstrate the potential of our method on a dataset +containing abdominal aortic aneurysms (AAAs). Our method performs on par with a +state-of-the-art segmentation model in the segmentation of AAAs, iliac arteries +and renal arteries, while providing a watertight, smooth surface segmentation. +Moreover, we demonstrate that by adapting the global controller, we can easily +extend vessel sections in the 3D model. + +
+
+
+
+
+ + ☆ CR3DT: Camera-RADAR Fusion for 3D Detection and Tracking + + +
+ Accurate detection and tracking of surrounding objects is essential to enable +self-driving vehicles. While Light Detection and Ranging (LiDAR) sensors have +set the benchmark for high performance, the appeal of camera-only solutions +lies in their cost-effectiveness. Notably, despite the prevalent use of Radio +Detection and Ranging (RADAR) sensors in automotive systems, their potential in +3D detection and tracking has been largely disregarded due to data sparsity and +measurement noise. As a recent development, the combination of RADARs and +cameras is emerging as a promising solution. This paper presents Camera-RADAR +3D Detection and Tracking (CR3DT), a camera-RADAR fusion model for 3D object +detection, and Multi-Object Tracking (MOT). Building upon the foundations of +the State-of-the-Art (SotA) camera-only BEVDet architecture, CR3DT demonstrates +substantial improvements in both detection and tracking capabilities, by +incorporating the spatial and velocity information of the RADAR sensor. +Experimental results demonstrate an absolute improvement in detection +performance of 5.3% in mean Average Precision (mAP) and a 14.9% increase in +Average Multi-Object Tracking Accuracy (AMOTA) on the nuScenes dataset when +leveraging both modalities. CR3DT bridges the gap between high-performance and +cost-effective perception systems in autonomous driving, by capitalizing on the +ubiquitous presence of RADAR in automotive applications. + +
+
+
+
+
+ + ☆ Controlled Training Data Generation with Diffusion Models + + +
+ In this work, we present a method to control a text-to-image generative model +to produce training data specifically "useful" for supervised learning. Unlike +previous works that employ an open-loop approach and pre-define prompts to +generate new data using either a language model or human expertise, we develop +an automated closed-loop system which involves two feedback mechanisms. The +first mechanism uses feedback from a given supervised model and finds +adversarial prompts that result in image generations that maximize the model +loss. While these adversarial prompts result in diverse data informed by the +model, they are not informed of the target distribution, which can be +inefficient. Therefore, we introduce the second feedback mechanism that guides +the generation process towards a certain target distribution. We call the +method combining these two mechanisms Guided Adversarial Prompts. We perform +our evaluations on different tasks, datasets and architectures, with different +types of distribution shifts (spuriously correlated data, unseen domains) and +demonstrate the efficiency of the proposed feedback mechanisms compared to +open-loop approaches. + +
+
+ comment: Project page at https://adversarial-prompts.epfl.ch/ +
+
+
+
+
+ + ☆ WSCLoc: Weakly-Supervised Sparse-View Camera Relocalization + + +
+ Despite the advancements in deep learning for camera relocalization tasks, +obtaining ground truth pose labels required for the training process remains a +costly endeavor. While current weakly supervised methods excel in lightweight +label generation, their performance notably declines in scenarios with sparse +views. In response to this challenge, we introduce WSCLoc, a system capable of +being customized to various deep learning-based relocalization models to +enhance their performance under weakly-supervised and sparse view conditions. +This is realized with two stages. In the initial stage, WSCLoc employs a +multilayer perceptron-based structure called WFT-NeRF to co-optimize image +reconstruction quality and initial pose information. To ensure a stable +learning process, we incorporate temporal information as input. Furthermore, +instead of optimizing SE(3), we opt for $\mathfrak{sim}(3)$ optimization to +explicitly enforce a scale constraint. In the second stage, we co-optimize the +pre-trained WFT-NeRF and WFT-Pose. This optimization is enhanced by +Time-Encoding based Random View Synthesis and supervised by inter-frame +geometric constraints that consider pose, depth, and RGB information. We +validate our approaches on two publicly available datasets, one outdoor and one +indoor. Our experimental results demonstrate that our weakly-supervised +relocalization solutions achieve superior pose estimation accuracy in +sparse-view scenarios, comparable to state-of-the-art camera relocalization +methods. We will make our code publicly available. + +
+
+
+
+
+ + ☆ Hyperbolic Metric Learning for Visual Outlier Detection + + +
+ Out-Of-Distribution (OOD) detection is critical for deploying deep learning +models in safety-critical applications. However, the inherent hierarchical +concept structure of visual data, which is instrumental to OOD detection, is +often poorly captured by conventional methods based on Euclidean geometry. This +work proposes a metric framework that leverages the strengths of Hyperbolic +geometry for OOD detection. Inspired by previous works that refine the decision +boundary for OOD data with synthetic outliers, we extend this method to +Hyperbolic space. Interestingly, we find that synthetic outliers do not benefit +OOD detection in Hyperbolic space as they do in Euclidean space. Furthermore, we +explore the relationship between OOD detection performance and Hyperbolic +embedding dimension, addressing practical concerns in resource-constrained +environments. Extensive experiments show that our framework improves the FPR95 +for OOD detection from 22% to 15% and from 49% to 28% on CIFAR-10 and +CIFAR-100, respectively, compared to Euclidean methods. + +
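+ A minimal sketch of the Poincare-ball geodesic distance that hyperbolic metric learning methods typically build on (plain NumPy; the function name and the choice of the Poincare ball model are illustrative assumptions, not the paper's exact formulation):
+ import numpy as np
+
+ def poincare_distance(u, v, eps=1e-9):
+     # d(u, v) = arccosh(1 + 2 * ||u - v||^2 / ((1 - ||u||^2) * (1 - ||v||^2)))
+     # u, v: points strictly inside the unit ball, shape (D,)
+     sq_dist = np.sum((u - v) ** 2)
+     denom = (1.0 - np.sum(u * u)) * (1.0 - np.sum(v * v))
+     return np.arccosh(1.0 + 2.0 * sq_dist / (denom + eps))
+
+ # Points near the boundary of the ball are geodesically far apart:
+ print(poincare_distance(np.array([0.0, 0.0]), np.array([0.9, 0.0])))
+ Distances grow rapidly toward the ball boundary, which is what lets hyperbolic embeddings separate hierarchical concepts that Euclidean distances compress together.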
+
+
+
+
+ + ☆ Spectral Motion Alignment for Video Motion Transfer using Diffusion + Models + + +
+ The evolution of diffusion models has greatly impacted video generation and +understanding. Particularly, text-to-video diffusion models (VDMs) have +significantly facilitated the customization of input video with target +appearance, motion, etc. Despite these advances, challenges persist in +accurately distilling motion information from video frames. While existing +works leverage the consecutive frame residual as the target motion vector, they +inherently lack global motion context and are vulnerable to frame-wise +distortions. To address this, we present Spectral Motion Alignment (SMA), a +novel framework that refines and aligns motion vectors using Fourier and +wavelet transforms. SMA learns motion patterns by incorporating +frequency-domain regularization, facilitating the learning of whole-frame +global motion dynamics, and mitigating spatial artifacts. Extensive experiments +demonstrate SMA's efficacy in improving motion transfer while maintaining +computational efficiency and compatibility across various video customization +frameworks. + +
+
+ comment: Project page: + https://geonyeong-park.github.io/spectral-motion-alignment/ +
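+ A rough, hedged illustration of comparing frame residuals in the frequency domain with extra weight on low frequencies that carry global motion (NumPy FFT; the radial cutoff and weighting scheme are assumptions for illustration, not the paper's actual regularization):
+ import numpy as np
+
+ def spectral_residual_loss(residual_a, residual_b, cutoff=0.1, low_freq_weight=2.0):
+     # residual_a, residual_b: consecutive-frame differences, shape (H, W)
+     fa = np.fft.rfft2(residual_a)
+     fb = np.fft.rfft2(residual_b)
+     err = np.abs(fa - fb)                            # spectral magnitude error
+     fy = np.fft.fftfreq(residual_a.shape[0])[:, None]
+     fx = np.fft.rfftfreq(residual_a.shape[1])[None, :]
+     radius = np.sqrt(fy ** 2 + fx ** 2)              # radial frequency per bin
+     weight = np.where(radius < cutoff, low_freq_weight, 1.0)
+     return float(np.mean(weight * err))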
+
+
+
+
+ + ☆ Self-Supervised Backbone Framework for Diverse Agricultural Vision Tasks + + +
+ Computer vision in agriculture is game-changing with its ability to transform +farming into a data-driven, precise, and sustainable industry. Deep learning +has empowered agriculture vision to analyze vast, complex visual data, but +relies heavily on the availability of large annotated datasets. This remains a +bottleneck as manual labeling is error-prone, time-consuming, and expensive. +The lack of efficient labeling approaches inspired us to consider +self-supervised learning as a paradigm shift, learning meaningful feature +representations from raw agricultural image data. In this work, we explore how +self-supervised representation learning unlocks the potential applicability to +diverse agriculture vision tasks by eliminating the need for large-scale +annotated datasets. We propose a lightweight framework utilizing SimCLR, a +contrastive learning approach, to pre-train a ResNet-50 backbone on a large, +unannotated dataset of real-world agriculture field images. Our experimental +analysis and results indicate that the model learns robust features applicable +to a broad range of downstream agriculture tasks discussed in the paper. +Additionally, the reduced reliance on annotated data makes our approach more +cost-effective and accessible, paving the way for broader adoption of computer +vision in agriculture. + +
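+ A minimal NumPy sketch of the NT-Xent contrastive objective that SimCLR pre-training is built around, for a batch of paired embeddings from two augmented views (the temperature value is an illustrative default, not the paper's training setting):
+ import numpy as np
+
+ def nt_xent_loss(z1, z2, temperature=0.5):
+     # z1, z2: embeddings of two augmented views, shape (N, D); rows are paired.
+     z = np.concatenate([z1, z2], axis=0)
+     z = z / np.linalg.norm(z, axis=1, keepdims=True)       # cosine similarity prep
+     sim = z @ z.T / temperature                            # (2N, 2N) similarity matrix
+     np.fill_diagonal(sim, -np.inf)                         # mask self-similarity
+     n = z1.shape[0]
+     positives = np.concatenate([np.arange(n, 2 * n), np.arange(n)])
+     log_prob = sim - np.log(np.exp(sim).sum(axis=1, keepdims=True))
+     return float(-np.mean(log_prob[np.arange(2 * n), positives]))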
+
+
+
+
+ + ☆ Reasoning-Enhanced Object-Centric Learning for Videos + + +
+ Object-centric learning aims to break down complex visual scenes into more +manageable object representations, enhancing the understanding and reasoning +abilities of machine learning systems toward the physical world. Recently, +slot-based video models have demonstrated remarkable proficiency in segmenting +and tracking objects, but they overlook the importance of an effective +reasoning module. In the real world, reasoning and predictive abilities play a +crucial role in human perception and object tracking; in particular, these +abilities are closely related to human intuitive physics. Inspired by this, we +designed a novel reasoning module called the Slot-based Time-Space Transformer +with Memory buffer (STATM) to enhance the model's perception ability in complex +scenes. The memory buffer primarily serves as storage for slot information from +upstream modules, while the Slot-based Time-Space Transformer makes predictions +through slot-based spatiotemporal attention computation and fusion. Our +experimental results on various datasets show that STATM can significantly +enhance the object-centric learning capabilities of slot-based video models. + +
+
+
+
+
+ + ☆ IS-Fusion: Instance-Scene Collaborative Fusion for Multimodal 3D Object + Detection CVPR 2024 + + +
+ Bird's eye view (BEV) representation has emerged as a dominant solution for +describing 3D space in autonomous driving scenarios. However, objects in the +BEV representation typically exhibit small sizes, and the associated point +cloud context is inherently sparse, which leads to great challenges for +reliable 3D perception. In this paper, we propose IS-Fusion, an innovative +multimodal fusion framework that jointly captures the Instance- and Scene-level +contextual information. IS-Fusion essentially differs from existing approaches +that only focus on the BEV scene-level fusion by explicitly incorporating +instance-level multimodal information, thus facilitating the instance-centric +tasks like 3D object detection. It comprises a Hierarchical Scene Fusion (HSF) +module and an Instance-Guided Fusion (IGF) module. HSF applies Point-to-Grid +and Grid-to-Region transformers to capture the multimodal scene context at +different granularities. IGF mines instance candidates, explores their +relationships, and aggregates the local multimodal context for each instance. +These instances then serve as guidance to enhance the scene feature and yield +an instance-aware BEV representation. On the challenging nuScenes benchmark, +IS-Fusion outperforms all the published multimodal works to date. Code is +available at: https://github.com/yinjunbo/IS-Fusion. + +
+
+ comment: Accepted to CVPR 2024; Code: https://github.com/yinjunbo/IS-Fusion +
+
+
+
+
+ + ☆ WEEP: A method for spatial interpretation of weakly supervised CNN + models in computational pathology + + +
+ Deep learning enables the modelling of high-resolution histopathology +whole-slide images (WSI). Weakly supervised learning of tile-level data is +typically applied for tasks where labels only exist on the patient or WSI level +(e.g. patient outcomes or histological grading). In this context, there is a +need for improved spatial interpretability of predictions from such models. We +propose a novel method, Wsi rEgion sElection aPproach (WEEP), for model +interpretation. It provides a principled yet straightforward way to establish +the spatial area of WSI required for assigning a particular prediction label. +We demonstrate WEEP on a binary classification task in the area of breast +cancer computational pathology. WEEP is easy to implement, is directly +connected to the model-based decision process, and offers information relevant +to both research and diagnostic applications. + +
+
+
+
+
+ + ☆ Shadow Generation for Composite Image Using Diffusion model CVPR2024 + + +
+ In the realm of image composition, generating realistic shadows for the +inserted foreground remains a formidable challenge. Previous works have +developed image-to-image translation models which are trained on paired +training data. However, they struggle to generate shadows with accurate +shapes and intensities, hindered by data scarcity and inherent task complexity. +In this paper, we resort to a foundation model with rich prior knowledge of +natural shadow images. Specifically, we first adapt ControlNet to our task and +then propose intensity modulation modules to improve the shadow intensity. +Moreover, we extend the small-scale DESOBA dataset to DESOBAv2 using a novel +data acquisition pipeline. Experimental results on both DESOBA and DESOBAv2 +datasets as well as real composite images demonstrate the superior capability +of our model for the shadow generation task. The dataset, code, and model are +released at https://github.com/bcmi/Object-Shadow-Generation-Dataset-DESOBAv2. + +
+
+ comment: accepted by CVPR2024 +
+
+
+
+
+ + ☆ LeGO: Leveraging a Surface Deformation Network for Animatable Stylized + Face Generation with One Example + + +
+ Recent advances in 3D face stylization have made significant strides in few- +to zero-shot settings. However, the degree of stylization achieved by existing +methods is often not sufficient for practical applications because they are +mostly based on statistical 3D Morphable Models (3DMM) with limited variations. +To this end, we propose a method that can produce a highly stylized 3D face +model with the desired topology. Our method trains a surface deformation network +with 3DMM and translates its domain to the target style using a paired exemplar. +The network achieves stylization of the 3D face mesh by mimicking the style of +the target using a differentiable renderer and directional CLIP losses. +Additionally, during the inference process, we utilize a Mesh Agnostic Encoder +(MAGE) that takes the deformation target, a mesh of potentially diverse topology, as input to +the stylization process and encodes its shape into our latent space. The +resulting stylized face model can be animated by commonly used 3DMM blend +shapes. Quantitative and qualitative evaluations demonstrate that our +method can produce highly stylized face meshes according to a given style and +output them in a desired topology. We also demonstrate example applications of +our method, including image-based stylized avatar generation, linear +interpolation of geometric styles, and facial animation of stylized avatars. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Anytime, Anywhere, Anyone: Investigating the Feasibility of Segment + Anything Model for Crowd-Sourcing Medical Image Annotations + + +
+ Curating annotations for medical image segmentation is a labor-intensive and +time-consuming task that requires domain expertise, resulting in "narrowly" +focused deep learning (DL) models with limited translational utility. Recently, +foundation models like the Segment Anything Model (SAM) have revolutionized +semantic segmentation with exceptional zero-shot generalizability across +various domains, including medical imaging, and hold a lot of promise for +streamlining the annotation process. However, SAM has yet to be evaluated in a +crowd-sourced setting to curate annotations for training 3D DL segmentation +models. In this work, we explore the potential of SAM for crowd-sourcing +"sparse" annotations from non-experts to generate "dense" segmentation masks +for training 3D nnU-Net models, a state-of-the-art DL segmentation model. Our +results indicate that while SAM-generated annotations exhibit high mean Dice +scores compared to ground-truth annotations, nnU-Net models trained on +SAM-generated annotations perform significantly worse than nnU-Net models +trained on ground-truth annotations ($p<0.001$, all). + +
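+ For reference, the Dice overlap used to compare SAM-generated masks against ground-truth masks reduces to a few lines (plain NumPy sketch; binary masks assumed):
+ import numpy as np
+
+ def dice_score(pred_mask, gt_mask, eps=1e-7):
+     # pred_mask, gt_mask: boolean or {0, 1} arrays of the same shape
+     pred = pred_mask.astype(bool)
+     gt = gt_mask.astype(bool)
+     intersection = np.logical_and(pred, gt).sum()
+     return (2.0 * intersection + eps) / (pred.sum() + gt.sum() + eps)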
+
+
+
+
+ + ☆ GCN-DevLSTM: Path Development for Skeleton-Based Action Recognition + + +
+ Skeleton-based action recognition (SAR) in videos is an important but +challenging task in computer vision. The recent state-of-the-art models for SAR +are primarily based on graph convolutional neural networks (GCNs), which are +powerful in extracting the spatial information of skeleton data. However, it is +not yet clear whether such GCN-based models can effectively capture the temporal +dynamics of human action sequences. To this end, we propose the DevLSTM module, +which exploits the path development -- a principled and parsimonious +representation for sequential data that leverages the Lie group structure. The +path development, which originates from rough path theory, can effectively capture +the order of events in high-dimensional stream data with massive dimension +reduction and consequently enhance the LSTM module substantially. Our proposed +G-DevLSTM module can be conveniently plugged into the temporal graph, +complementing existing advanced GCN-based models. Our empirical studies on the +NTU60, NTU120 and Chalearn2013 datasets demonstrate that our proposed hybrid +model significantly outperforms the current best-performing methods in SAR +tasks. The code is available at https://github.com/DeepIntoStreams/GCN-DevLSTM. + +
+
+
+
+
+ + ☆ MSCoTDet: Language-driven Multi-modal Fusion for Improved Multispectral + Pedestrian Detection + + +
+ Multispectral pedestrian detection is attractive for around-the-clock +applications due to the complementary information between RGB and thermal +modalities. However, current models often fail to detect pedestrians in obvious +cases, especially due to the modality bias learned from statistically biased +datasets. These problems suggest that understanding the +complementary information itself may be difficult to achieve with vision-only +models. Accordingly, we propose a novel Multispectral Chain-of-Thought +Detection (MSCoTDet) framework, which incorporates Large Language Models (LLMs) +to understand the complementary information at the semantic level and further +enhance the fusion process. Specifically, we generate text descriptions of the +pedestrian in each RGB and thermal modality and design Multispectral +Chain-of-Thought (MSCoT) prompting, which models a step-by-step process to +facilitate cross-modal reasoning at the semantic level and perform accurate +detection. Moreover, we design a Language-driven Multi-modal Fusion (LMF) +strategy that enables fusing vision-driven and language-driven detections. +Extensive experiments validate that MSCoTDet improves multispectral pedestrian +detection. + +
+
+
+
+
+ + ☆ DITTO: Demonstration Imitation by Trajectory Transformation IROS 2024 + + +
+ Teaching robots new skills quickly and conveniently is crucial for the +broader adoption of robotic systems. In this work, we address the problem of +one-shot imitation from a single human demonstration, given by an RGB-D video +recording, through a two-stage process. In the first, offline stage, we +extract the trajectory of the demonstration. This entails segmenting +manipulated objects and determining their relative motion in relation to +secondary objects such as containers. Subsequently, in the live online +trajectory generation stage, we first re-detect all objects, then we +warp the demonstration trajectory to the current scene, and finally, we trace +the trajectory with the robot. To complete these steps, our method +leverages several ancillary models, including those for segmentation, relative +object pose estimation, and grasp prediction. We systematically evaluate +different combinations of correspondence and re-detection methods to validate +our design decisions across a diverse range of tasks. Specifically, we collect +demonstrations of ten different tasks, including pick-and-place tasks as well as +articulated object manipulation. Finally, we perform extensive evaluations on a +real robot system to demonstrate the effectiveness and utility of our approach +in real-world scenarios. We make the code publicly available at +http://ditto.cs.uni-freiburg.de. + +
+
+ comment: 8 pages, 4 figures, 3 tables, submitted to IROS 2024 +
+
+
+
+
+ + ☆ Your Image is My Video: Reshaping the Receptive Field via Image-To-Video + Differentiable AutoAugmentation and Fusion + + +
+ The landscape of deep learning research is moving towards innovative +strategies to harness the true potential of data. Traditionally, emphasis has +been on scaling model architectures, resulting in large and complex neural +networks, which can be difficult to train with limited computational resources. +However, independently of the model size, data quality (i.e., amount and +variability) is still a major factor that affects model generalization. In this +work, we propose a novel technique to exploit available data through the use of +automatic data augmentation for the tasks of image classification and semantic +segmentation. We introduce the first Differentiable Augmentation Search method +(DAS) to generate variations of images that can be processed as videos. +Compared to previous approaches, DAS is extremely fast and flexible, allowing +searches over very large search spaces in less than a GPU day. Our intuition is +that the increased receptive field in the temporal dimension provided by DAS +could also benefit the spatial receptive field. More specifically, +we leverage DAS to guide the reshaping of the spatial receptive field by +selecting task-dependent transformations. As a result, compared to standard +augmentation alternatives, we improve accuracy on the ImageNet, +Cifar10, Cifar100, Tiny-ImageNet, Pascal-VOC-2012 and CityScapes datasets when +plugging our DAS into different lightweight video backbones. + +
+
+
+
+
+ + ☆ SFOD: Spiking Fusion Object Detector CVPR2024 + + +
+ Event cameras, characterized by high temporal resolution, high dynamic range, +low power consumption, and high pixel bandwidth, offer unique capabilities for +object detection in specialized contexts. Despite these advantages, the +inherent sparsity and asynchrony of event data pose challenges to existing +object detection algorithms. Spiking Neural Networks (SNNs), inspired by the +way the human brain codes and processes information, offer a potential solution +to these difficulties. However, their performance in object detection using +event cameras is limited in current implementations. In this paper, we propose +the Spiking Fusion Object Detector (SFOD), a simple and efficient approach to +SNN-based object detection. Specifically, we design a Spiking Fusion Module, +achieving the first-time fusion of feature maps from different scales in SNNs +applied to event cameras. Additionally, through integrating our analysis and +experiments conducted during the pretraining of the backbone network on the +NCAR dataset, we delve deeply into the impact of spiking decoding strategies +and loss functions on model performance. Thereby, we establish state-of-the-art +classification results based on SNNs, achieving 93.7\% accuracy on the NCAR +dataset. Experimental results on the GEN1 detection dataset demonstrate that +the SFOD achieves a state-of-the-art mAP of 32.1\%, outperforming existing +SNN-based approaches. Our research not only underscores the potential of SNNs +in object detection with event cameras but also propels the advancement of +SNNs. Code is available at https://github.com/yimeng-fan/SFOD. + +
+
+ comment: Accepted by CVPR2024 +
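+ As background, the basic unit behind SNN-based detectors like the one above is the leaky integrate-and-fire (LIF) neuron; a toy simulation follows (the time constant, threshold and hard-reset rule are generic textbook choices, not SFOD's exact neuron model):
+ import numpy as np
+
+ def lif_neuron(input_current, tau=2.0, v_threshold=1.0, v_reset=0.0):
+     # input_current: shape (T,) input at each discrete timestep
+     v = v_reset
+     spikes = np.zeros_like(input_current, dtype=float)
+     for t, x in enumerate(input_current):
+         v = v + (x - (v - v_reset)) / tau       # leaky membrane integration
+         if v >= v_threshold:
+             spikes[t] = 1.0                     # emit a binary spike
+             v = v_reset                         # hard reset after spiking
+     return spikes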
+
+
+
+
+ + ☆ PDE-CNNs: Axiomatic Derivations and Applications + + +
+ PDE-based Group Convolutional Neural Networks (PDE-G-CNNs) utilize solvers of +geometrically meaningful evolution PDEs as substitutes for the conventional +components in G-CNNs. PDE-G-CNNs offer several key benefits all at once: fewer +parameters, inherent equivariance, better performance, data efficiency, and +geometric interpretability. In this article we focus on Euclidean equivariant +PDE-G-CNNs where the feature maps are two dimensional throughout. We call this +variant of the framework a PDE-CNN. We list several practically desirable +axioms and derive from these which PDEs should be used in a PDE-CNN. Here our +approach to geometric learning via PDEs is inspired by the axioms of classical +linear and morphological scale-space theory, which we generalize by introducing +semifield-valued signals. Furthermore, we experimentally confirm for small +networks that PDE-CNNs offer fewer parameters, better performance, and data +efficiency in comparison to CNNs. We also investigate what effect the use of +different semifields has on the performance of the models. + +
+
+
+
+
+ + ☆ LSK3DNet: Towards Effective and Efficient 3D Perception with Large + Sparse Kernels CVPR 2024 + + +
+ Autonomous systems need to process large-scale, sparse, and irregular point +clouds with limited compute resources. Consequently, it is essential to develop +LiDAR perception methods that are both efficient and effective. Although +naively enlarging 3D kernel size can enhance performance, it will also lead to +a cubically-increasing overhead. Therefore, it is crucial to develop +streamlined 3D large kernel designs that eliminate redundant weights and work +effectively with larger kernels. In this paper, we propose an efficient and +effective Large Sparse Kernel 3D Neural Network (LSK3DNet) that leverages +dynamic pruning to amplify the 3D kernel size. Our method comprises two core +components: Spatial-wise Dynamic Sparsity (SDS) and Channel-wise Weight +Selection (CWS). SDS dynamically prunes and regrows volumetric weights from the +beginning to learn a large sparse 3D kernel. It not only boosts performance but +also significantly reduces model size and computational cost. Moreover, CWS +selects the most important channels for 3D convolution during training and +subsequently prunes the redundant channels to accelerate inference for 3D +vision tasks. We demonstrate the effectiveness of LSK3DNet on three benchmark +datasets and five tracks compared with classical models and large kernel +designs. Notably, LSK3DNet achieves the state-of-the-art performance on +SemanticKITTI (i.e., 75.6% on single-scan and 63.4% on multi-scan), with +roughly 40% model size reduction and 60% computing operations reduction +compared to the naive large 3D kernel model. + +
+
+ comment: Accepted at CVPR 2024; Project page: + https://github.com/FengZicai/LSK3DNet +
+
+
+
+
+ + ☆ FastCAD: Real-Time CAD Retrieval and Alignment from Scans and Videos + + +
+ Digitising the 3D world into a clean, CAD model-based representation has +important applications for augmented reality and robotics. Current +state-of-the-art methods are computationally intensive as they individually +encode each detected object and optimise CAD alignments in a second stage. In +this work, we propose FastCAD, a real-time method that simultaneously retrieves +and aligns CAD models for all objects in a given scene. In contrast to previous +works, we directly predict alignment parameters and shape embeddings. We +achieve high-quality shape retrievals by learning CAD embeddings in a +contrastive learning framework and distilling those into FastCAD. Our +single-stage method accelerates the inference time by a factor of 50 compared +to other methods operating on RGB-D scans while outperforming them on the +challenging Scan2CAD alignment benchmark. Further, our approach collaborates +seamlessly with online 3D reconstruction techniques. This enables the real-time +generation of precise CAD model-based reconstructions from videos at 10 FPS. +Doing so, we significantly improve the Scan2CAD alignment accuracy in the video +setting from 43.0% to 48.2% and the reconstruction accuracy from 22.9% to +29.6%. + +
+
+
+
+
+ + ☆ Infrastructure-Assisted Collaborative Perception in Automated Valet + Parking: A Safety Perspective + + +
+ Environmental perception in Automated Valet Parking (AVP) has been a +challenging task due to severe occlusions in parking garages. Although +Collaborative Perception (CP) can be applied to broaden the field of view of +connected vehicles, the limited bandwidth of vehicular communications restricts +its application. In this work, we propose a BEV feature-based CP network +architecture for infrastructure-assisted AVP systems. The model takes the +roadside camera and LiDAR as optional inputs and adaptively fuses them with +onboard sensors in a unified BEV representation. Autoencoder and downsampling +are applied for channel-wise and spatial-wise dimension reduction, while +sparsification and quantization further compress the feature map with little +loss in data precision. Combining these techniques, the size of a BEV feature +map is effectively compressed to fit in the feasible data rate of the NR-V2X +network. With the synthetic AVP dataset, we observe that CP can effectively +increase perception performance, especially for pedestrians. Moreover, the +advantage of infrastructure-assisted CP is demonstrated in two typical +safety-critical scenarios in the AVP setting, increasing the maximum safe +cruising speed by up to 3m/s in both scenarios. + +
+
+ comment: 7 pages, 7 figures, 4 tables, accepted by IEEE VTC2024-Spring +
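+ A simplified sketch of the kind of sparsification and uniform quantization used to shrink a feature map before transmission (the keep ratio and bit width are illustrative; the paper's autoencoder and spatial downsampling stages are omitted here):
+ import numpy as np
+
+ def compress_feature_map(feat, keep_ratio=0.1, num_bits=8):
+     # Keep only the largest-magnitude activations, then quantize them uniformly.
+     flat = feat.astype(np.float32).ravel()
+     k = max(1, int(keep_ratio * flat.size))
+     idx = np.argpartition(np.abs(flat), -k)[-k:]         # indices of the kept values
+     vals = flat[idx]
+     vmin, vmax = float(vals.min()), float(vals.max())
+     scale = (vmax - vmin) / (2 ** num_bits - 1) or 1.0
+     codes = np.round((vals - vmin) / scale).astype(np.uint8)
+     return idx, codes, vmin, scale                       # payload to transmit
+
+ def decompress_feature_map(idx, codes, vmin, scale, shape):
+     flat = np.zeros(int(np.prod(shape)), dtype=np.float32)
+     flat[idx] = codes.astype(np.float32) * scale + vmin
+     return flat.reshape(shape)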
+
+
+
+
+ + ☆ A Multimodal Approach for Cross-Domain Image Retrieval + + +
+ Image generators are gaining vast amount of popularity and have rapidly +changed how digital content is created. With the latest AI technology, millions +of high quality images are being generated by the public, which are constantly +motivating the research community to push the limits of generative models to +create more complex and realistic images. This paper focuses on Cross-Domain +Image Retrieval (CDIR) which can be used as an additional tool to inspect +collections of generated images by determining the level of similarity between +images in a dataset. An ideal retrieval system would be able to generalize to +unseen complex images from multiple domains (e.g., photos, drawings and +paintings). To address this goal, we propose a novel caption-matching approach +that leverages multimodal language-vision architectures pre-trained on large +datasets. The method is tested on DomainNet and Office-Home datasets and +consistently achieves state-of-the-art performance over the latest approaches +in the literature for cross-domain image retrieval. In order to verify the +effectiveness with AI-generated images, the method was also put to test with a +database composed by samples collected from Midjourney, which is a widely used +generative platform for content creation. + +
+
+
+
+
+ + ☆ An In-Depth Analysis of Data Reduction Methods for Sustainable Deep + Learning + + +
+ In recent years, Deep Learning has gained popularity for its ability to solve +complex classification tasks, increasingly delivering better results thanks to +the development of more accurate models, the availability of huge volumes of +data and the improved computational capabilities of modern computers. However, +these improvements in performance also bring efficiency problems, related to +the storage of datasets and models, and to the waste of energy and time +involved in both the training and inference processes. In this context, data +reduction can help reduce energy consumption when training a deep learning +model. In this paper, we present up to eight different methods to reduce the +size of a tabular training dataset, and we develop a Python package to apply +them. We also introduce a representativeness metric based on topology to +measure how similar the reduced datasets are to the full training dataset. +Additionally, we develop a methodology to apply these data reduction methods to +image datasets for object detection tasks. Finally, we experimentally compare +how these data reduction methods affect the representativeness of the reduced +dataset, the energy consumption and the predictive performance of the model. + +
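+ One of the simplest reduction baselines in this family is class-stratified random subsampling; a NumPy sketch follows (the function name is illustrative and not tied to the authors' Python package):
+ import numpy as np
+
+ def stratified_reduce(X, y, fraction=0.1, seed=0):
+     # Keep roughly `fraction` of the rows of X while preserving class balance.
+     rng = np.random.default_rng(seed)
+     kept = []
+     for c in np.unique(y):
+         idx = np.flatnonzero(y == c)
+         n_keep = max(1, int(round(fraction * idx.size)))
+         kept.append(rng.choice(idx, size=n_keep, replace=False))
+     kept = np.concatenate(kept)
+     return X[kept], y[kept]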
+
+
+
+
+ + ☆ Modular Deep Active Learning Framework for Image Annotation: A Technical + Report for the Ophthalmo-AI Project + + +
+ Image annotation is one of the most essential tasks for guaranteeing proper +treatment for patients and tracking progress over the course of therapy in the +field of medical imaging and disease diagnosis. However, manually annotating a +lot of 2D and 3D imaging data can be extremely tedious. Deep Learning (DL) +based segmentation algorithms have completely transformed this process and made +it possible to automate image segmentation. By accurately segmenting medical +images, these algorithms can greatly minimize the time and effort necessary for +manual annotation. Additionally, by incorporating Active Learning (AL) methods, +these segmentation algorithms can perform far more effectively with a smaller +amount of ground truth data. We introduce MedDeepCyleAL, an end-to-end +framework implementing the complete AL cycle. It provides researchers with the +flexibility to choose the type of deep learning model they wish to employ and +includes an annotation tool that supports the classification and segmentation +of medical images. The user-friendly interface allows for easy alteration of +the AL and DL model settings through a configuration file, requiring no prior +programming experience. While MedDeepCyleAL can be applied to any kind of image +data, we have specifically applied it to ophthalmology data in this project. + +
+
+ comment: DFKI Technical Report +
+
+
+
+
+ + ☆ Deep Generative Model based Rate-Distortion for Image Downscaling + Assessment CVPR 2024 + + +
+ In this paper, we propose Image Downscaling Assessment by Rate-Distortion +(IDA-RD), a novel measure to quantitatively evaluate image downscaling +algorithms. In contrast to image-based methods that measure the quality of +downscaled images, ours is process-based that draws ideas from rate-distortion +theory to measure the distortion incurred during downscaling. Our main idea is +that downscaling and super-resolution (SR) can be viewed as the encoding and +decoding processes in the rate-distortion model, respectively, and that a +downscaling algorithm that preserves more details in the resulting +low-resolution (LR) images should lead to less distorted high-resolution (HR) +images in SR. In other words, the distortion should increase as the downscaling +algorithm deteriorates. However, it is non-trivial to measure this distortion +as it requires the SR algorithm to be blind and stochastic. Our key insight is +that such requirements can be met by recent SR algorithms based on deep +generative models that can find all matching HR images for a given LR image on +their learned image manifolds. Extensive experimental results show the +effectiveness of our IDA-RD measure. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Transfer CLIP for Generalizable Image Denoising CVPR2024 + + +
+ Image denoising is a fundamental task in computer vision. While prevailing +deep learning-based supervised and self-supervised methods have excelled in +eliminating in-distribution noise, their susceptibility to out-of-distribution +(OOD) noise remains a significant challenge. The recent emergence of +contrastive language-image pre-training (CLIP) model has showcased exceptional +capabilities in open-world image recognition and segmentation. Yet, the +potential for leveraging CLIP to enhance the robustness of low-level tasks +remains largely unexplored. This paper uncovers that certain dense features +extracted from the frozen ResNet image encoder of CLIP exhibit +distortion-invariant and content-related properties, which are highly desirable +for generalizable denoising. Leveraging these properties, we devise an +asymmetrical encoder-decoder denoising network, which incorporates dense +features including the noisy image and its multi-scale features from the frozen +ResNet encoder of CLIP into a learnable image decoder to achieve generalizable +denoising. The progressive feature augmentation strategy is further proposed to +mitigate feature overfitting and improve the robustness of the learnable +decoder. Extensive experiments and comparisons conducted across diverse OOD +noises, including synthetic noise, real-world sRGB noise, and low-dose CT image +noise, demonstrate the superior generalization ability of our method. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ Gradient-based Sampling for Class Imbalanced Semi-supervised Object + Detection ICCV2023 + + +
+ Current semi-supervised object detection (SSOD) algorithms typically assume +class balanced datasets (PASCAL VOC etc.) or slightly class imbalanced datasets +(MS-COCO, etc). This assumption can be easily violated since real world +datasets can be extremely class imbalanced in nature, thus making the +performance of semi-supervised object detectors far from satisfactory. Besides, +the research for this problem in SSOD is severely under-explored. To bridge +this research gap, we comprehensively study the class imbalance problem for +SSOD under more challenging scenarios, thus forming the first experimental +setting for class imbalanced SSOD (CI-SSOD). Moreover, we propose a simple yet +effective gradient-based sampling framework that tackles the class imbalance +problem from the perspective of two types of confirmation biases. To tackle +confirmation bias towards majority classes, the gradient-based reweighting and +gradient-based thresholding modules leverage the gradients from each class to +fully balance the influence of the majority and minority classes. To tackle the +confirmation bias from incorrect pseudo labels of minority classes, the +class-rebalancing sampling module resamples unlabeled data following the +guidance of the gradient-based reweighting module. Experiments on three +proposed sub-tasks, namely MS-COCO, MS-COCO to Object365 and LVIS, suggest that +our method outperforms current class imbalanced object detectors by clear +margins, serving as a baseline for future research in CI-SSOD. Code will be +available at https://github.com/nightkeepers/CI-SSOD. + +
+
+ comment: Accepted by ICCV2023 +
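+ The general flavour of gradient-based reweighting can be sketched as deriving per-class loss weights inversely proportional to accumulated per-class gradient magnitudes, so minority classes with weak gradients are up-weighted (an illustrative simplification, not the authors' module):
+ import numpy as np
+
+ def gradient_based_class_weights(per_class_grad_norm, eps=1e-6):
+     # per_class_grad_norm: accumulated gradient magnitude per class, shape (C,)
+     g = np.asarray(per_class_grad_norm, dtype=np.float64) + eps
+     w = g.mean() / g                  # weaker gradients -> larger weights
+     return w / w.mean()               # normalise so the average weight is 1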
+
+
+
+
+ + ☆ EndoGSLAM: Real-Time Dense Reconstruction and Tracking in Endoscopic + Surgeries using Gaussian Splatting + + +
+ Precise camera tracking, high-fidelity 3D tissue reconstruction, and +real-time online visualization are critical for intrabody medical imaging +devices such as endoscopes and capsule robots. However, existing SLAM +(Simultaneous Localization and Mapping) methods often struggle to achieve both +complete high-quality surgical field reconstruction and efficient computation, +restricting their intraoperative applications among endoscopic surgeries. In +this paper, we introduce EndoGSLAM, an efficient SLAM approach for endoscopic +surgeries, which integrates streamlined Gaussian representation and +differentiable rasterization to facilitate over 100 fps rendering speed during +online camera tracking and tissue reconstructing. Extensive experiments show +that EndoGSLAM achieves a better trade-off between intraoperative availability +and reconstruction quality than traditional or neural SLAM approaches, showing +tremendous potential for endoscopic surgeries. The project page is at +https://EndoGSLAM.loping151.com + +
+
+
+
+
+ + ☆ SYNCS: Synthetic Data and Contrastive Self-Supervised Training for + Central Sulcus Segmentation + + +
+ Bipolar disorder (BD) and schizophrenia (SZ) are severe mental disorders with +profound societal impact. Identifying risk markers early is crucial for +understanding disease progression and enabling preventive measures. The Danish +High Risk and Resilience Study (VIA) focuses on understanding early disease +processes, particularly in children with familial high risk (FHR). +Understanding structural brain changes associated with these diseases during +early stages is essential for effective interventions. The central sulcus (CS) +is a prominent brain landmark related to brain regions involved in motor and +sensory processing. Analyzing CS morphology can provide valuable insights into +neurodevelopmental abnormalities in the FHR group. However, segmenting the +central sulcus (CS) presents challenges due to its variability, especially in +adolescents. This study introduces two novel approaches to improve CS +segmentation: synthetic data generation to model CS variability and +self-supervised pre-training with multi-task learning to adapt models to new +cohorts. These methods aim to enhance segmentation performance across diverse +populations, eliminating the need for extensive preprocessing. + +
+
+
+
+
+ + ☆ An Open-World, Diverse, Cross-Spatial-Temporal Benchmark for Dynamic + Wild Person Re-Identification + + +
+ Person re-identification (ReID) has made great strides thanks to the +data-driven deep learning techniques. However, the existing benchmark datasets +lack diversity, and models trained on these data cannot generalize well to +dynamic wild scenarios. To meet the goal of improving the explicit +generalization of ReID models, we develop a new Open-World, Diverse, +Cross-Spatial-Temporal dataset named OWD with several distinct features. 1) +Diverse collection scenes: multiple independent open-world and highly dynamic +collecting scenes, including streets, intersections, shopping malls, etc. 2) +Diverse lighting variations: long time spans from daytime to nighttime with +abundant illumination changes. 3) Diverse person status: multiple camera +networks in all seasons with normal/adverse weather conditions and diverse +pedestrian appearances (e.g., clothes, personal belongings, poses, etc.). 4) +Protected privacy: invisible faces for privacy critical applications. To +improve the implicit generalization of ReID, we further propose a Latent Domain +Expansion (LDE) method to develop the potential of source data, which decouples +discriminative identity-relevant and trustworthy domain-relevant features and +implicitly enforces domain-randomized identity feature space expansion with +richer domain diversity to facilitate domain invariant representations. Our +comprehensive evaluations with most benchmark datasets in the community are +crucial for progress, although this work is far from the grand goal toward +open-world and dynamic wild applications. + +
+
+ comment: Accepted by IJCV in 2024 +
+
+
+
+
+ + ☆ PseudoTouch: Efficiently Imaging the Surface Feel of Objects for Robotic + Manipulation IROS2024 + + +
+ Humans seemingly incorporate potential touch signals in their perception. Our +goal is to equip robots with a similar capability, which we term PseudoTouch. +PseudoTouch aims to predict the expected touch signal based on a visual patch +representing the touched area. We frame this problem as the task of learning a +low-dimensional visual-tactile embedding, wherein we encode a depth patch from +which we decode the tactile signal. To accomplish this task, we employ ReSkin, +an inexpensive and replaceable magnetic-based tactile sensor. Using ReSkin, we +collect and train PseudoTouch on a dataset comprising aligned tactile and +visual data pairs obtained through random touching of eight basic geometric +shapes. We demonstrate the efficacy of PseudoTouch through its application to +two downstream tasks: object recognition and grasp stability prediction. In the +object recognition task, we evaluate the learned embedding's performance on a +set of five basic geometric shapes and five household objects. Using +PseudoTouch, we achieve an object recognition accuracy of 84% after just ten +touches, surpassing a proprioception baseline. For the grasp stability task, we +use ACRONYM labels to train and evaluate a grasp success predictor using +PseudoTouch's predictions derived from virtual depth information. Our approach +yields an impressive 32% absolute improvement in accuracy compared to the +baseline relying on partial point cloud data. We make the data, code, and +trained models publicly available at http://pseudotouch.cs.uni-freiburg.de. + +
+
+ comment: 8 pages, 7 figures, 2 tables, submitted to IROS2024 +
+
+
+
+
+ + ☆ Improving cross-domain brain tissue segmentation in fetal MRI with + synthetic data + + +
+ Segmentation of fetal brain tissue from magnetic resonance imaging (MRI) +plays a crucial role in the study of in utero neurodevelopment. However, +automated tools face substantial domain shift challenges as they must be robust +to highly heterogeneous clinical data, often limited in number and lacking +annotations. Indeed, high variability of the fetal brain morphology, MRI +acquisition parameters, and super-resolution reconstruction (SR) algorithms +adversely affect the model's performance when evaluated out-of-domain. In this +work, we introduce FetalSynthSeg, a domain randomization method to segment +fetal brain MRI, inspired by SynthSeg. Our results show that models trained +solely on synthetic data outperform models trained on real data in out-of-domain +settings, validated on a 120-subject cross-domain dataset. Furthermore, we +extend our evaluation to 40 subjects acquired using low-field (0.55T) MRI and +reconstructed with novel SR models, showcasing robustness across different +magnetic field strengths and SR algorithms. Leveraging a generative synthetic +approach, we tackle the domain shift problem in fetal brain MRI and offer +compelling prospects for applications in fields with limited and highly +heterogeneous data. + +
+
+ comment: 10 pages, 5 figures, 1 table +
+
+
+
+
+ + ☆ UniTraj: A Unified Framework for Scalable Vehicle Trajectory Prediction + + +
+ Vehicle trajectory prediction has increasingly relied on data-driven +solutions, but their ability to scale to different data domains and the impact +of larger dataset sizes on their generalization remain under-explored. While +these questions can be studied by employing multiple datasets, it is +challenging due to several discrepancies, e.g., in data formats, map +resolution, and semantic annotation types. To address these challenges, we +introduce UniTraj, a comprehensive framework that unifies various datasets, +models, and evaluation criteria, presenting new opportunities for the vehicle +trajectory prediction field. In particular, using UniTraj, we conduct extensive +experiments and find that model performance significantly drops when +transferred to other datasets. However, enlarging data size and diversity can +substantially improve performance, leading to a new state-of-the-art result for +the nuScenes dataset. We provide insights into dataset characteristics to +explain these findings. The code can be found here: +https://github.com/vita-epfl/UniTraj. + +
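+ For context, the field's standard trajectory-prediction metrics, average and final displacement error, reduce to a few lines (a generic sketch, not UniTraj's evaluation code):
+ import numpy as np
+
+ def ade_fde(pred, gt):
+     # pred, gt: predicted and ground-truth future positions, shape (T, 2), in metres
+     per_step_error = np.linalg.norm(pred - gt, axis=-1)
+     return float(per_step_error.mean()), float(per_step_error[-1])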
+
+
+
+
+ + ☆ IFSENet : Harnessing Sparse Iterations for Interactive Few-shot + Segmentation Excellence + + +
+ Training a computer vision system to segment a novel class typically requires +collecting and painstakingly annotating lots of images with objects from that +class. Few-shot segmentation techniques reduce the required number of images to +learn to segment a new class, but careful annotations of object boundaries are +still required. On the other hand, interactive segmentation techniques only +focus on incrementally improving the segmentation of one object at a time +(typically, using clicks given by an expert) in a class-agnostic manner. We +combine the two concepts to drastically reduce the effort required to train +segmentation models for novel classes. Instead of trivially feeding interactive +segmentation masks as ground truth to a few-shot segmentation model, we propose +IFSENet, which can accept sparse supervision on a single or few support images +in the form of clicks to generate masks on support (training, at least clicked +upon once) as well as query (test, never clicked upon) images. To trade-off +effort for accuracy flexibly, the number of images and clicks can be +incrementally added to the support set to further improve the segmentation of +support as well as query images. The proposed model approaches the accuracy of +previous state-of-the-art few-shot segmentation models with considerably lower +annotation effort (clicks instead of maps), when tested on Pascal and SBD +datasets on query images. It also works well as an interactive segmentation +method on support images. + +
+
+
+
+
+ + ☆ Cell Variational Information Bottleneck Network + + +
+ In this work, we propose the Cell Variational Information Bottleneck Network +(cellVIB), a convolutional neural network using an information bottleneck +mechanism, which can be combined with the latest feedforward network +architectures in an end-to-end training method. Our Cell Variational Information +Bottleneck Network is constructed by stacking VIB cells, which generate feature +maps with uncertainty. As layers go deeper, the regularization effect will +gradually increase, instead of directly adding excessive regularization constraints to +the output layer of the model as in Deep VIB. Under each VIB cell, the +feedforward process learns an independent mean term and a standard deviation +term, and predicts the Gaussian distribution based on them. The feedback +process is based on the reparameterization trick for effective training. This work +performs an extensive analysis on the MNIST dataset to verify the effectiveness of +each VIB cell, and provides an insightful analysis on how the VIB cells affect +mutual information. Experiments conducted on CIFAR-10 also prove that our +cellVIB is robust against noisy labels during training and against corrupted +images during testing. Then, we validate our method on the PACS dataset, whose +results show that the VIB cells can significantly improve the generalization +performance of the basic model. Finally, in a more complex representation +learning task, face recognition, our network structure has also achieved very +competitive results. + +
+
+
+
+
+ + ☆ Integrating multiscale topology in digital pathology with pyramidal + graph convolutional networks + + +
+ Graph convolutional networks (GCNs) have emerged as a powerful alternative to +multiple instance learning with convolutional neural networks in digital +pathology, offering superior handling of structural information across various +spatial ranges - a crucial aspect of learning from gigapixel H&E-stained whole +slide images (WSI). However, graph message-passing algorithms often suffer from +oversmoothing when aggregating a large neighborhood. Hence, effective modeling +of multi-range interactions relies on the careful construction of the graph. +Our proposed multi-scale GCN (MS-GCN) tackles this issue by leveraging +information across multiple magnification levels in WSIs. MS-GCN enables the +simultaneous modeling of long-range structural dependencies at lower +magnifications and high-resolution cellular details at higher magnifications, +akin to analysis pipelines usually conducted by pathologists. The +architecture's unique configuration allows for the concurrent modeling of +structural patterns at lower magnifications and detailed cellular features at +higher ones, while also quantifying the contribution of each magnification +level to the prediction. Through testing on different datasets, MS-GCN +demonstrates superior performance over existing single-magnification GCN +methods. The enhancement in performance and interpretability afforded by our +method holds promise for advancing computational pathology models, especially +in tasks requiring extensive spatial context. + +
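+ For reference, a single graph-convolution (message-passing) layer of the kind stacked in such models, written in the standard Kipf-and-Welling symmetric-normalisation form (plain NumPy; MS-GCN's multi-magnification graph construction is not shown):
+ import numpy as np
+
+ def gcn_layer(adjacency, features, weights):
+     # adjacency: (N, N) binary graph, features: (N, F_in), weights: (F_in, F_out)
+     a_hat = adjacency + np.eye(adjacency.shape[0])        # add self-loops
+     d_inv_sqrt = np.diag(1.0 / np.sqrt(a_hat.sum(axis=1)))
+     propagated = d_inv_sqrt @ a_hat @ d_inv_sqrt @ features @ weights
+     return np.maximum(propagated, 0.0)                    # ReLU activation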
+
+
+
+
+ + ☆ Recent Trends in 3D Reconstruction of General Non-Rigid Scenes + + +
+ Reconstructing models of the real world, including 3D geometry, appearance, +and motion of real scenes, is essential for computer graphics and computer +vision. It enables the synthesizing of photorealistic novel views, useful for +the movie industry and AR/VR applications. It also facilitates the content +creation necessary in computer games and AR/VR by avoiding laborious manual +design processes. Further, such models are fundamental for intelligent +computing systems that need to interpret real-world scenes and actions to act +and interact safely with the human world. Notably, the world surrounding us is +dynamic, and reconstructing models of dynamic, non-rigidly moving scenes is a +severely underconstrained and challenging problem. This state-of-the-art report +(STAR) offers the reader a comprehensive summary of state-of-the-art techniques +with monocular and multi-view inputs such as data from RGB and RGB-D sensors, +among others, conveying an understanding of different approaches, their +potential applications, and promising further research directions. The report +covers 3D reconstruction of general non-rigid scenes and further addresses the +techniques for scene decomposition, editing and controlling, and generalizable +and generative modeling. More specifically, we first review the common and +fundamental concepts necessary to understand and navigate the field and then +discuss the state-of-the-art techniques by reviewing recent approaches that use +traditional and machine-learning-based neural representations, including a +discussion on the newly enabled applications. The STAR is concluded with a +discussion of the remaining limitations and open challenges. + +
+
+ comment: 42 pages, 18 figures, 5 tables; State-of-the-Art Report at + EUROGRAPHICS 2024 +
+
+
+
+
+ + ☆ Towards a Comprehensive, Efficient and Promptable Anatomic Structure + Segmentation Model using 3D Whole-body CT Scans + + +
+ The Segment Anything Model (SAM) demonstrates strong generalization ability on +natural image segmentation. However, its direct adaptation to medical image +segmentation tasks shows significant performance drops with inferior accuracy +and unstable results. It may also require an excessive number of prompt points +to obtain a reasonable accuracy. For segmenting 3D radiological CT or MRI +scans, a 2D SAM model has to separately handle hundreds of 2D slices. Although +quite a few studies explore adapting SAM into medical image volumes, the +efficiency of 2D adaptation methods is unsatisfactory, and 3D adaptation methods +are only capable of segmenting specific organs/tumors. In this work, we propose a +comprehensive and scalable 3D SAM model for whole-body CT segmentation, named +CT-SAM3D. Instead of adapting SAM, we propose a 3D promptable segmentation +model using a (nearly) fully labeled CT dataset. To train CT-SAM3D effectively, +ensuring the model's accurate responses to higher-dimensional spatial prompts +is crucial, and 3D patch-wise training is required due to GPU memory +constraints. For this purpose, we propose two key technical developments: 1) a +progressively and spatially aligned prompt encoding method to effectively +encode click prompts in local 3D space; and 2) a cross-patch prompt learning +scheme to capture more 3D spatial context, which is beneficial for reducing the +editing workload when interactively prompting on large organs. CT-SAM3D is +trained and validated using a curated dataset of 1204 CT scans containing 107 +whole-body anatomies, reporting significantly better quantitative performance +than all previous SAM-derived models by a large margin, with far fewer click +prompts. Our model can also handle segmenting unseen organs. Code, data, and +our 3D interactive segmentation tool with quasi-real-time responses will be +made publicly available. + +
+
+
+
+
+ + ☆ Subjective Quality Assessment of Compressed Tone-Mapped High Dynamic + Range Videos + + +
+ High Dynamic Range (HDR) videos are able to represent wider ranges of +contrasts and colors than Standard Dynamic Range (SDR) videos, giving more +vivid experiences. Due to this, HDR videos are expected to grow into the +dominant video modality of the future. However, HDR videos are incompatible +with existing SDR displays, which form the majority of affordable consumer +displays on the market. Because of this, HDR videos must be processed by +tone-mapping them to reduced bit-depths to service a broad swath of SDR-limited +video consumers. Here, we analyze the impact of tone-mapping operators on the +visual quality of streaming HDR videos. To this end, we built the first +large-scale subjectively annotated open-source database of compressed +tone-mapped HDR videos, containing 15,000 tone-mapped sequences derived from 40 +unique HDR source contents. The videos in the database were labeled with more +than 750,000 subjective quality annotations, collected from more than 1,600 +unique human observers. We demonstrate the usefulness of the new subjective +database by benchmarking objective models of visual quality on it. We envision +that the new LIVE Tone-Mapped HDR (LIVE-TMHDR) database will enable significant +progress on HDR video tone mapping and quality assessment in the future. To +this end, we make the database freely available to the community at +https://live.ece.utexas.edu/research/LIVE_TMHDR/index.html + +
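+ As a point of reference, the classic global Reinhard operator is one of the simplest tone-mapping operators studied in this kind of work; a sketch follows (the key value 0.18 is the usual textbook default, an assumption rather than a database setting):
+ import numpy as np
+
+ def reinhard_tonemap(hdr_rgb, key=0.18, eps=1e-6):
+     # hdr_rgb: linear HDR image of shape (H, W, 3)
+     lum = 0.2126 * hdr_rgb[..., 0] + 0.7152 * hdr_rgb[..., 1] + 0.0722 * hdr_rgb[..., 2]
+     log_avg = np.exp(np.mean(np.log(lum + eps)))          # log-average luminance
+     scaled = key / log_avg * lum
+     mapped = scaled / (1.0 + scaled)                      # compress luminance to [0, 1)
+     ratio = mapped / (lum + eps)
+     return np.clip(hdr_rgb * ratio[..., None], 0.0, 1.0)  # rescale colour channels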
+
+
+
+
+ + ☆ MM-Diff: High-Fidelity Image Personalization via Multi-Modal Condition + Integration + + +
+ Recent advances in tuning-free personalized image generation based on +diffusion models are impressive. However, to improve subject fidelity, existing +methods either retrain the diffusion model or infuse it with dense visual +embeddings, both of which suffer from poor generalization and efficiency. Also, +these methods falter in multi-subject image generation due to the unconstrained +cross-attention mechanism. In this paper, we propose MM-Diff, a unified and +tuning-free image personalization framework capable of generating high-fidelity +images of both single and multiple subjects in seconds. Specifically, to +simultaneously enhance text consistency and subject fidelity, MM-Diff employs a +vision encoder to transform the input image into CLS and patch embeddings. CLS +embeddings are used on the one hand to augment the text embeddings, and on the +other hand together with patch embeddings to derive a small number of +detail-rich subject embeddings, both of which are efficiently integrated into +the diffusion model through the well-designed multimodal cross-attention +mechanism. Additionally, MM-Diff introduces cross-attention map constraints +during the training phase, ensuring flexible multi-subject image sampling +during inference without any predefined inputs (e.g., layout). Extensive +experiments demonstrate the superior performance of MM-Diff over other leading +methods. + +
+
+
+
+
+ + ☆ Continual Vision-and-Language Navigation + + +
+ Vision-and-Language Navigation (VLN) agents navigate to a destination using +natural language instructions and the visual information they observe. Existing +methods for training VLN agents presuppose fixed datasets, leading to a +significant limitation: the introduction of new environments necessitates +retraining with previously encountered environments to preserve their +knowledge. This makes it difficult to train VLN agents that operate in the +ever-changing real world. To address this limitation, we present the Continual +Vision-and-Language Navigation (CVLN) paradigm, designed to evaluate agents +trained through a continual learning process. For the training and evaluation +of CVLN agents, we re-arrange existing VLN datasets to propose two datasets: +CVLN-I, focused on navigation via initial-instruction interpretation, and +CVLN-D, aimed at navigation through dialogue with other agents. Furthermore, we +propose two novel rehearsal-based methods for CVLN, Perplexity Replay (PerpR) +and Episodic Self-Replay (ESR). PerpR prioritizes replaying challenging +episodes based on action perplexity, while ESR replays previously predicted +action logits to preserve learned behaviors. We demonstrate the effectiveness +of the proposed methods on CVLN through extensive experiments. + +
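+ A rough sketch of a perplexity-prioritised replay buffer in the spirit of PerpR: keep the episodes whose action distributions were hardest (highest perplexity) and rehearse those first (the capacity and selection rule here are illustrative assumptions, not the paper's implementation):
+ import heapq
+
+ class PerplexityReplayBuffer:
+     def __init__(self, capacity=100):
+         self.capacity = capacity
+         self._heap = []          # min-heap of (perplexity, insertion_id, episode)
+         self._counter = 0
+
+     def add(self, episode, perplexity):
+         item = (perplexity, self._counter, episode)
+         self._counter += 1
+         if len(self._heap) < self.capacity:
+             heapq.heappush(self._heap, item)
+         else:
+             heapq.heappushpop(self._heap, item)   # evict the easiest stored episode
+
+     def sample_hardest(self, k):
+         return [ep for _, _, ep in heapq.nlargest(k, self._heap)]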
+
+
+
+
+ + ☆ Cartoon Hallucinations Detection: Pose-aware In Context Visual Learning + + +
+ Large-scale Text-to-Image (TTI) models have become a common approach for +generating training data in various generative fields. However, visual +hallucinations, which contain perceptually critical defects, remain a concern, +especially in non-photorealistic styles like cartoon characters. We propose a +novel visual hallucination detection system for cartoon character images +generated by TTI models. Our approach leverages pose-aware in-context visual +learning (PA-ICVL) with Vision-Language Models (VLMs), utilizing both RGB +images and pose information. By incorporating pose guidance from a fine-tuned +pose estimator, we enable VLMs to make more accurate decisions. Experimental +results demonstrate significant improvements in identifying visual +hallucinations compared to baseline methods relying solely on RGB images. This +research advances TTI models by mitigating visual hallucinations, expanding +their potential in non-photorealistic domains. + +
+
+ comment: 11 pages, 12 figures, 1 table, Project page: + https://gh-bumsookim.github.io/Cartoon-Hallucinations-Detection/ +
+
+
+
+
+ + ☆ Multimodal Fusion with Pre-Trained Model Features in Affective Behaviour + Analysis In-the-wild + + +
+ Multimodal fusion is a significant method for most multimodal tasks. With the
+recent surge in the number of large pre-trained models, combining both
+multimodal fusion methods and pre-trained model features can achieve
+outstanding performance in many multimodal tasks. In this paper, we present our
+approach, which leverages both advantages for addressing the task of Expression
+(Expr) Recognition and Valence-Arousal (VA) Estimation. We apply pre-trained
+models to the Aff-Wild2 database and extract their final hidden layers as
+features. Following preprocessing and interpolation or convolution to align the
+extracted features, different models are employed for modal fusion. Our code is
+available at GitHub - FulgenceWen/ABAW6th.
+
+
+
+
+
+ + ☆ Toward Tiny and High-quality Facial Makeup with Data Amplify Learning + + +
+ Contemporary makeup approaches primarily hinge on unpaired learning
+paradigms, yet they grapple with the challenges of inaccurate supervision
+(e.g., face misalignment) and sophisticated facial prompts (including face
+parsing and landmark detection). These challenges prohibit low-cost deployment
+of facial makeup models, especially on mobile devices. To solve the above
+problems, we propose a brand-new learning paradigm, termed "Data Amplify
+Learning (DAL)," alongside a compact makeup model named "TinyBeauty." The core
+idea of DAL lies in employing a Diffusion-based Data Amplifier (DDA) to
+"amplify" limited images for the model training, thereby enabling accurate
+pixel-to-pixel supervision with merely a handful of annotations. Two pivotal
+innovations in DDA facilitate the above training approach: (1) A Residual
+Diffusion Model (RDM) is designed to generate high-fidelity detail and
+circumvent the detail vanishing problem in the vanilla diffusion models; (2) A
+Fine-Grained Makeup Module (FGMM) is proposed to achieve precise makeup control
+and combination while retaining face identity. Coupled with DAL, TinyBeauty
+necessitates merely 80K parameters to achieve a state-of-the-art performance
+without intricate face prompts. Meanwhile, TinyBeauty achieves a remarkable
+inference speed of up to 460 fps on the iPhone 13. Extensive experiments show
+that DAL can produce highly competitive makeup models using only 5 image pairs.
+
+
+
+
+
+ + ☆ An Integrated Neighborhood and Scale Information Network for Open-Pit + Mine Change Detection in High-Resolution Remote Sensing Images + + +
+ Open-pit mine change detection (CD) in high-resolution (HR) remote sensing +images plays a crucial role in mineral development and environmental +protection. Significant progress has been made in this field in recent years, +largely due to the advancement of deep learning techniques. However, existing +deep-learning-based CD methods encounter challenges in effectively integrating +neighborhood and scale information, resulting in suboptimal performance. +Therefore, by exploring the influence patterns of neighborhood and scale +information, this paper proposes an Integrated Neighborhood and Scale +Information Network (INSINet) for open-pit mine CD in HR remote sensing images. +Specifically, INSINet introduces 8-neighborhood-image information to acquire a +larger receptive field, improving the recognition of center image boundary +regions. Drawing on techniques of skip connection, deep supervision, and +attention mechanism, the multi-path deep supervised attention (MDSA) module is +designed to enhance multi-scale information fusion and change feature +extraction. Experimental analysis reveals that incorporating neighborhood and +scale information enhances the F1 score of INSINet by 6.40%, with improvements +of 3.08% and 3.32% respectively. INSINet outperforms existing methods with an +Overall Accuracy of 97.69%, Intersection over Union of 71.26%, and F1 score of +83.22%. INSINet shows significance for open-pit mine CD in HR remote sensing +images. + +
+
+
+
+
+ + ☆ Image Classification with Rotation-Invariant Variational Quantum + Circuits + + +
+ Variational quantum algorithms are gaining attention as an early application +of Noisy Intermediate-Scale Quantum (NISQ) devices. One of the main problems of +variational methods lies in the phenomenon of Barren Plateaus, present in the +optimization of variational parameters. Adding geometric inductive bias to the +quantum models has been proposed as a potential solution to mitigate this +problem, leading to a new field called Geometric Quantum Machine Learning. In +this work, an equivariant architecture for variational quantum classifiers is +introduced to create a label-invariant model for image classification with +$C_4$ rotational label symmetry. The equivariant circuit is benchmarked against +two different architectures, and it is experimentally observed that the +geometric approach boosts the model's performance. Finally, a classical +equivariant convolution operation is proposed to extend the quantum model for +the processing of larger images, employing the resources available in NISQ +devices. + +
+
+ comment: 9 pages, 9 figures +
+
+
+
+
+ + ☆ VRSO: Visual-Centric Reconstruction for Static Object Annotation + + +
+ As a part of the perception results of intelligent driving systems, static
+object detection (SOD) in 3D space provides crucial cues for driving
+environment understanding. With the rapid deployment of deep neural networks
+for SOD tasks, the demand for high-quality training samples soars. The
+traditional and reliable way is manual labeling over the dense LiDAR point
+clouds and reference images. Though most public driving datasets adopt this
+strategy to provide SOD ground truth (GT), it is still expensive (it requires
+LiDAR scanners) and inefficient (time-consuming and unscalable) in practice.
+This paper introduces VRSO, a visual-centric approach for static object
+annotation. VRSO is distinguished by its low cost, high efficiency, and high
+quality: (1) It recovers static objects in 3D space with only camera images as
+input, and (2) manual labeling is barely involved since GT for SOD tasks is
+generated based on an automatic reconstruction and annotation pipeline. (3)
+Experiments on the Waymo Open Dataset show that the mean reprojection error
+from VRSO annotation is only 2.6 pixels, around four times lower than the Waymo
+labeling (10.6 pixels). Source code is available at:
+https://github.com/CaiYingFeng/VRSO.
+
+
+ comment: submitted to iros 2024 +
+
+
+
+
+ + ☆ BSNet: Box-Supervised Simulation-assisted Mean Teacher for 3D Instance + Segmentation + + +
+ 3D instance segmentation (3DIS) is a crucial task, but point-level
+annotations are tedious in fully supervised settings. Thus, using bounding
+boxes (bboxes) as annotations has shown great potential. The current mainstream
+approach is a two-step process, involving the generation of pseudo-labels from
+box annotations and the training of a 3DIS network with the pseudo-labels.
+However, due to the presence of intersections among bboxes, not every point has
+a determined instance label, especially in overlapping areas. To generate
+higher quality pseudo-labels and achieve more precise weakly supervised 3DIS
+results, we propose the Box-Supervised Simulation-assisted Mean Teacher for 3D
+Instance Segmentation (BSNet), which devises a novel pseudo-labeler called
+Simulation-assisted Transformer. The labeler consists of two main components.
+The first is Simulation-assisted Mean Teacher, which introduces Mean Teacher
+for the first time in this task and constructs simulated samples to assist the
+labeler in acquiring prior knowledge about overlapping areas. To better model
+local-global structure, we also propose Local-Global Aware Attention as the
+decoder for teacher and student labelers. Extensive experiments conducted on
+the ScanNetV2 and S3DIS datasets verify the superiority of our designs. Code is
+available at https://github.com/peoplelu/BSNet.
+
+
+
+
+
+ + ☆ Vehicle Detection Performance in Nordic Region ICPR2024 + + +
+ This paper addresses the critical challenge of vehicle detection in the harsh
+winter conditions of the Nordic regions, characterized by heavy snowfall,
+reduced visibility, and low lighting. Due to their susceptibility to
+environmental distortions and occlusions, traditional vehicle detection methods
+have struggled in these adverse conditions. Recently proposed advanced deep
+learning architectures have brought promise, yet the unique difficulties of
+detecting vehicles in Nordic winters remain inadequately addressed. This study
+uses the Nordic Vehicle Dataset (NVD), which contains UAV images from northern
+Sweden, to evaluate the performance of state-of-the-art vehicle detection
+algorithms under challenging weather conditions. Our methodology includes a
+comprehensive evaluation of single-stage, two-stage, and transformer-based
+detectors against the NVD. We propose a series of enhancements tailored to each
+detection framework, including data augmentation, hyperparameter tuning,
+transfer learning, and novel strategies designed explicitly for the DETR model.
+Our findings not only highlight the limitations of current detection systems in
+the Nordic environment but also offer promising directions for enhancing these
+algorithms for improved robustness and accuracy in vehicle detection amidst the
+complexities of winter landscapes. The code and the dataset are available at
+https://nvd.ltu-ai.dev
+
+
+ comment: submitted to ICPR2024 +
+
+
+
+
+ + ☆ Extracting Human Attention through Crowdsourced Patch Labeling + + +
+ In image classification, a significant problem arises from bias in the +datasets. When it contains only specific types of images, the classifier begins +to rely on shortcuts - simplistic and erroneous rules for decision-making. This +leads to high performance on the training dataset but inferior results on new, +varied images, as the classifier's generalization capability is reduced. For +example, if the images labeled as mustache consist solely of male figures, the +model may inadvertently learn to classify images by gender rather than the +presence of a mustache. One approach to mitigate such biases is to direct the +model's attention toward the target object's location, usually marked using +bounding boxes or polygons for annotation. However, collecting such annotations +requires substantial time and human effort. Therefore, we propose a novel +patch-labeling method that integrates AI assistance with crowdsourcing to +capture human attention from images, which can be a viable solution for +mitigating bias. Our method consists of two steps. First, we extract the +approximate location of a target using a pre-trained saliency detection model +supplemented by human verification for accuracy. Then, we determine the +human-attentive area in the image by iteratively dividing the image into +smaller patches and employing crowdsourcing to ascertain whether each patch can +be classified as the target object. We demonstrated the effectiveness of our +method in mitigating bias through improved classification accuracy and the +refined focus of the model. Also, crowdsourced experiments validate that our +method collects human annotation up to 3.4 times faster than annotating object +locations with polygons, significantly reducing the need for human resources. +We conclude the paper by discussing the advantages of our method in a +crowdsourcing context, mainly focusing on aspects of human errors and +accessibility. + +
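+ A minimal sketch of the patch-splitting idea described above, assuming a
+quadtree-style subdivision and a placeholder oracle standing in for real crowd
+judgments; the function extract_attention, the min_size parameter, and the toy
+oracle are illustrative assumptions, not the authors' implementation.
+
+import numpy as np
+
+def extract_attention(image: np.ndarray, contains_target, min_size: int = 16) -> np.ndarray:
+    """Return a binary human-attention mask by recursively splitting patches."""
+    h, w = image.shape[:2]
+    mask = np.zeros((h, w), dtype=bool)
+
+    def recurse(y0, y1, x0, x1):
+        if not contains_target(image[y0:y1, x0:x1]):
+            return                      # crowd says the patch misses the object
+        if (y1 - y0) <= min_size or (x1 - x0) <= min_size:
+            mask[y0:y1, x0:x1] = True   # small enough: keep as attentive area
+            return
+        ym, xm = (y0 + y1) // 2, (x0 + x1) // 2
+        for ys, ye, xs, xe in [(y0, ym, x0, xm), (y0, ym, xm, x1),
+                               (ym, y1, x0, xm), (ym, y1, xm, x1)]:
+            recurse(ys, ye, xs, xe)
+
+    recurse(0, h, 0, w)
+    return mask
+
+# Example with a toy oracle: the "object" is the bright region of the image.
+img = np.zeros((64, 64)); img[20:40, 30:50] = 1.0
+attention = extract_attention(img, lambda patch: patch.max() > 0.5)
+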
+
+ comment: 21 pages, 11 figures +
+
+
+
+
+ + ☆ Cell Tracking according to Biological Needs -- Strong Mitosis-aware + Random-finite Sets Tracker with Aleatoric Uncertainty + + +
+ Cell tracking and segmentation assist biologists in extracting insights from +large-scale microscopy time-lapse data. Driven by local accuracy metrics, +current tracking approaches often suffer from a lack of long-term consistency. +To address this issue, we introduce an uncertainty estimation technique for +neural tracking-by-regression frameworks and incorporate it into our novel +extended Poisson multi-Bernoulli mixture tracker. Our uncertainty estimation +identifies uncertain associations within high-performing tracking-by-regression +methods using problem-specific test-time augmentations. Leveraging this +uncertainty, along with a novel mitosis-aware assignment problem formulation, +our tracker resolves false associations and mitosis detections stemming from +long-term conflicts. We evaluate our approach on nine competitive datasets and +demonstrate that it outperforms the current state-of-the-art on biologically +relevant metrics substantially, achieving improvements by a factor of +approximately $5.75$. Furthermore, we uncover new insights into the behavior of +tracking-by-regression uncertainty. + +
+
+ comment: 23 pages, 10 figures, 5 tables +
+
+
+
+
+ + ☆ Clean-image Backdoor Attacks + + +
+ To gather a significant quantity of annotated training data for
+high-performance image classification models, numerous companies opt to enlist
+third-party providers to label their unlabeled data. This practice is widely
+regarded as secure, even in cases where some annotated errors occur, as the
+impact of these minor inaccuracies on the final performance of the models is
+negligible and existing backdoor attacks require the attacker to be able to
+poison the training images. Nevertheless, in this paper, we propose clean-image
+backdoor attacks which uncover that backdoors can still be injected via a
+fraction of incorrect labels without modifying the training images.
+Specifically, in our attacks, the attacker first seeks a trigger feature to
+divide the training images into two parts: those with the feature and those
+without it. Subsequently, the attacker falsifies the labels of the former part
+to a backdoor class. The backdoor will be finally implanted into the target
+model after it is trained on the poisoned data. During the inference phase, the
+attacker can activate the backdoor in two ways: slightly modifying the input
+image to obtain the trigger feature, or taking an image that naturally has the
+trigger feature as input. We conduct extensive experiments to demonstrate the
+effectiveness and practicality of our attacks. According to the experimental
+results, we conclude that our attacks seriously jeopardize the fairness and
+robustness of image classification models, and it is necessary to be vigilant
+about the incorrect labels in outsourced labeling.
+
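+ A toy sketch of the labeling step described above (illustrative only, not the
+paper's code): samples carrying a chosen trigger feature have their labels
+falsified to the backdoor class, while the images themselves stay untouched.
+The function name poison_labels and the rate parameter are assumptions.
+
+import numpy as np
+
+def poison_labels(labels: np.ndarray, has_trigger: np.ndarray,
+                  backdoor_class: int, rate: float = 1.0,
+                  seed: int = 0) -> np.ndarray:
+    """Relabel a fraction `rate` of trigger-bearing samples to the backdoor class."""
+    rng = np.random.default_rng(seed)
+    poisoned = labels.copy()
+    idx = np.flatnonzero(has_trigger)
+    chosen = rng.choice(idx, size=int(rate * len(idx)), replace=False)
+    poisoned[chosen] = backdoor_class   # images are never modified, only labels
+    return poisoned
+
+# Toy usage: 10 samples, samples 2, 5 and 7 carry the trigger feature.
+y = np.arange(10) % 3
+trigger = np.isin(np.arange(10), [2, 5, 7])
+y_poisoned = poison_labels(y, trigger, backdoor_class=0)
+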
+
+
+
+
+ + ☆ TexRO: Generating Delicate Textures of 3D Models by Recursive + Optimization + + +
+ This paper presents TexRO, a novel method for generating delicate textures of
+a known 3D mesh by optimizing its UV texture. The key contributions are
+two-fold. First, we propose an optimal viewpoint selection strategy that finds
+the smallest set of viewpoints covering all the faces of a mesh. Our viewpoint
+selection strategy guarantees the completeness of a generated result. Second,
+we propose a recursive optimization pipeline that optimizes a UV texture at
+increasing resolutions, with an adaptive denoising method that re-uses existing
+textures for new texture generation. Through extensive experimentation, we
+demonstrate the superior performance of TexRO in terms of texture quality,
+detail preservation, visual consistency, and, notably, runtime speed,
+outperforming other current methods. The broad applicability of TexRO is
+further confirmed through its successful use on diverse 3D models.
+
+
+ comment: Technical report. Project page: https://3d-aigc.github.io/TexRO
+
+
+
+
+ + ☆ Tri-Perspective View Decomposition for Geometry-Aware Depth Completion CVPR 2024 + + +
+ Depth completion is a vital task for autonomous driving, as it involves +reconstructing the precise 3D geometry of a scene from sparse and noisy depth +measurements. However, most existing methods either rely only on 2D depth +representations or directly incorporate raw 3D point clouds for compensation, +which are still insufficient to capture the fine-grained 3D geometry of the +scene. To address this challenge, we introduce Tri-Perspective view +Decomposition (TPVD), a novel framework that can explicitly model 3D geometry. +In particular, (1) TPVD ingeniously decomposes the original point cloud into +three 2D views, one of which corresponds to the sparse depth input. (2) We +design TPV Fusion to update the 2D TPV features through recurrent 2D-3D-2D +aggregation, where a Distance-Aware Spherical Convolution (DASC) is applied. +(3) By adaptively choosing TPV affinitive neighbors, the newly proposed +Geometric Spatial Propagation Network (GSPN) further improves the geometric +consistency. As a result, our TPVD outperforms existing methods on KITTI, +NYUv2, and SUN RGBD. Furthermore, we build a novel depth completion dataset +named TOFDC, which is acquired by the time-of-flight (TOF) sensor and the color +camera on smartphones. Project page: +https://yanzq95.github.io/projectpage/TOFDC/index.html + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ ParFormer: Vision Transformer Baseline with Parallel Local Global Token + Mixer and Convolution Attention Patch Embedding + + +
+ This work presents ParFormer as an enhanced transformer architecture that
+allows the incorporation of different token mixers into a single stage, hence
+improving feature extraction capabilities. Integrating both local and global
+data allows for precise representation of short- and long-range spatial
+relationships without the need for computationally intensive methods such as
+shifting windows. Along with the parallel token mixer encoder, we offer the
+Convolutional Attention Patch Embedding (CAPE) as an enhancement of standard
+patch embedding to improve token mixer extraction with a convolutional
+attention module. Our comprehensive evaluation demonstrates that our ParFormer
+outperforms CNN-based and state-of-the-art transformer-based architectures in
+image classification and several complex tasks such as object recognition. The
+proposed CAPE has been demonstrated to benefit the overall MetaFormer
+architecture, even while utilizing the Identity Mapping Token Mixer, resulting
+in a 0.5\% increase in accuracy. In accuracy, the ParFormer models outperform
+ConvNeXt and Swin Transformer, the representative pure convolutional and pure
+transformer models. Furthermore, our model surpasses the current leading hybrid
+transformer by reaching competitive Top-1 scores in the ImageNet-1K
+classification test. Specifically, our model variants with 11M, 23M, and 34M
+parameters achieve scores of 80.4\%, 82.1\%, and 83.1\%, respectively. Code:
+https://github.com/novendrastywn/ParFormer-CAPE-2024
+
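+ A rough sketch of a parallel local/global token mixer in the spirit of the
+abstract, assuming a depthwise-convolution branch for local mixing and a
+self-attention branch for global mixing whose outputs are summed; the channel
+sizes, merge rule, and class name are assumptions, not ParFormer's actual
+design.
+
+import torch
+import torch.nn as nn
+
+class ParallelTokenMixer(nn.Module):
+    def __init__(self, dim: int = 96, heads: int = 4):
+        super().__init__()
+        # Local branch: depthwise convolution over the spatial grid.
+        self.local = nn.Conv2d(dim, dim, kernel_size=3, padding=1, groups=dim)
+        # Global branch: standard multi-head self-attention over tokens.
+        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+        self.norm = nn.LayerNorm(dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:   # x: (B, C, H, W)
+        b, c, h, w = x.shape
+        local = self.local(x)
+        tokens = self.norm(x.flatten(2).transpose(1, 2))   # (B, HW, C)
+        global_, _ = self.attn(tokens, tokens, tokens)
+        return x + local + global_.transpose(1, 2).reshape(b, c, h, w)
+
+mixer = ParallelTokenMixer(dim=96)
+out = mixer(torch.randn(2, 96, 14, 14))                    # (2, 96, 14, 14)
+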
+
+
+
+
+ + ☆ Magic for the Age of Quantized DNNs + + +
+ Recently, the number of parameters in DNNs has explosively increased, as
+exemplified by LLMs (Large Language Models), making inference on small-scale
+computers more difficult. Model compression technology is, therefore, essential
+for integration into products. In this paper, we propose a method of
+quantization-aware training. We introduce a novel normalization (Layer-Batch
+Normalization) that is independent of the mini-batch size and does not require
+any additional computation cost during inference. Then, we quantize the weights
+by the scaled round-clip function with the weight standardization. We also
+quantize activation functions using the same function and apply surrogate
+gradients to train the model with both quantized weights and the quantized
+activation functions. We call this method Magic for the Age of Quantized DNNs
+(MaQD). Experimental results show that our quantization method incurs only
+minimal accuracy degradation.
+
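+ A minimal PyTorch-style sketch of quantization-aware training with a
+round-clip quantizer and a straight-through surrogate gradient, in the spirit
+of the description above; the bit-width, scale choice, and class names are
+assumptions, not the authors' MaQD code.
+
+import torch
+
+class RoundClipSTE(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x, scale, n_bits):
+        q = (2 ** (n_bits - 1)) - 1
+        return torch.clamp(torch.round(x / scale), -q, q) * scale
+
+    @staticmethod
+    def backward(ctx, grad_out):
+        # Straight-through: pass the gradient as if quantization were identity.
+        return grad_out, None, None
+
+def quantize_weight(w: torch.Tensor, n_bits: int = 4) -> torch.Tensor:
+    # Weight standardization before the scaled round-clip quantizer.
+    w_std = (w - w.mean()) / (w.std() + 1e-5)
+    scale = w_std.abs().max() / (2 ** (n_bits - 1) - 1)
+    return RoundClipSTE.apply(w_std, scale, n_bits)
+
+w = torch.randn(64, 64, requires_grad=True)
+loss = quantize_weight(w).pow(2).sum()
+loss.backward()          # gradients reach w through the surrogate gradient
+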
+
+ comment: 14 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ Improve Cross-domain Mixed Sampling with Guidance Training for Adaptive + Segmentation + + +
+ Unsupervised Domain Adaptation (UDA) endeavors to adjust models trained on a
+source domain to perform well on a target domain without requiring additional
+annotations. In the context of domain adaptive semantic segmentation, which
+tackles UDA for dense prediction, the goal is to circumvent the need for costly
+pixel-level annotations. Typically, prevailing baseline methods rely on
+constructing intermediate domains via cross-domain mixed sampling techniques to
+mitigate the performance decline caused by domain gaps. However, such
+approaches generate synthetic data that diverge from real-world distributions,
+potentially leading the model astray from the true target distribution. To
+address this challenge, we propose a novel auxiliary task called Guidance
+Training. This task facilitates the effective utilization of cross-domain mixed
+sampling techniques while mitigating distribution shifts from the real world.
+Specifically, Guidance Training guides the model to extract and reconstruct the
+target-domain feature distribution from mixed data, followed by decoding the
+reconstructed target-domain features to make pseudo-label predictions.
+Importantly, integrating Guidance Training incurs minimal training overhead and
+imposes no additional inference burden. We demonstrate the efficacy of our
+approach by integrating it with existing methods, consistently improving
+performance. The implementation will be available at
+https://github.com/Wenlve-Zhou/Guidance-Training.
+
+
+
+
+
+ + ☆ Generative Active Learning for Image Synthesis Personalization + + +
+ This paper presents a pilot study that explores the application of active +learning, traditionally studied in the context of discriminative models, to +generative models. We specifically focus on image synthesis personalization +tasks. The primary challenge in conducting active learning on generative models +lies in the open-ended nature of querying, which differs from the closed form +of querying in discriminative models that typically target a single concept. We +introduce the concept of anchor directions to transform the querying process +into a semi-open problem. We propose a direction-based uncertainty sampling +strategy to enable generative active learning and tackle the +exploitation-exploration dilemma. Extensive experiments are conducted to +validate the effectiveness of our approach, demonstrating that an open-source +model can achieve superior performance compared to closed-source models +developed by large companies, such as Google's StyleDrop. The source code is +available at https://github.com/zhangxulu1996/GAL4Personalization. + +
+
+
+
+
+ + ☆ Piecewise-Linear Manifolds for Deep Metric Learning + + +
+ Unsupervised deep metric learning (UDML) focuses on learning a semantic +representation space using only unlabeled data. This challenging problem +requires accurately estimating the similarity between data points, which is +used to supervise a deep network. For this purpose, we propose to model the +high-dimensional data manifold using a piecewise-linear approximation, with +each low-dimensional linear piece approximating the data manifold in a small +neighborhood of a point. These neighborhoods are used to estimate similarity +between data points. We empirically show that this similarity estimate +correlates better with the ground truth than the similarity estimates of +current state-of-the-art techniques. We also show that proxies, commonly used +in supervised metric learning, can be used to model the piecewise-linear +manifold in an unsupervised setting, helping improve performance. Our method +outperforms existing unsupervised metric learning approaches on standard +zero-shot image retrieval benchmarks. + +
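+ A small sketch of the general idea, assuming each local linear piece is
+obtained by PCA over a point's nearest neighbors and that similarity is scored
+by how well that subspace reconstructs other points; the estimator below is
+illustrative, not the paper's exact formulation, and all names and parameter
+values are assumptions.
+
+import numpy as np
+
+def local_subspace_similarity(X: np.ndarray, anchor: int,
+                              k: int = 10, dim: int = 3) -> np.ndarray:
+    d = np.linalg.norm(X - X[anchor], axis=1)
+    neigh = np.argsort(d)[1:k + 1]              # k nearest neighbors of the anchor
+    mu = X[neigh].mean(axis=0)
+    _, _, vt = np.linalg.svd(X[neigh] - mu, full_matrices=False)
+    basis = vt[:dim]                            # local linear piece (PCA basis)
+    centered = X - mu
+    residual = centered - (centered @ basis.T) @ basis
+    return -np.linalg.norm(residual, axis=1)    # higher = closer to the local piece
+
+X = np.random.randn(200, 64)
+sim_to_0 = local_subspace_similarity(X, anchor=0)
+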
+
+ comment: Accepted at CPAL 2024 (Oral) +
+
+
+
+
+ + ☆ AVT2-DWF: Improving Deepfake Detection with Audio-Visual Fusion and + Dynamic Weighting Strategies + + +
+ With the continuous improvements of deepfake methods, forgery messages have
+transitioned from single-modality to multi-modal fusion, posing new challenges
+for existing forgery detection algorithms. In this paper, we propose AVT2-DWF,
+the Audio-Visual dual Transformers grounded in Dynamic Weight Fusion, which
+aims to amplify both intra- and cross-modal forgery cues, thereby enhancing
+detection capabilities. AVT2-DWF adopts a dual-stage approach to capture both
+spatial characteristics and temporal dynamics of facial expressions. This is
+achieved through a face transformer with an n-frame-wise tokenization strategy
+encoder and an audio transformer encoder. Subsequently, it uses multi-modal
+conversion with dynamic weight fusion to address the challenge of heterogeneous
+information fusion between audio and visual modalities. Experiments on
+DeepfakeTIMIT, FakeAVCeleb, and DFDC datasets indicate that AVT2-DWF achieves
+state-of-the-art performance in intra- and cross-dataset Deepfake detection.
+Code is available at https://github.com/raining-dev/AVT2-DWF.
+
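+ A minimal sketch of a dynamic weight fusion layer in the spirit described
+above: per-sample softmax weights decide how much the audio and visual
+embeddings contribute to the fused representation. The gating network,
+dimensions, and class name are assumptions, not the authors' architecture.
+
+import torch
+import torch.nn as nn
+
+class DynamicWeightFusion(nn.Module):
+    def __init__(self, dim: int = 256):
+        super().__init__()
+        # Small gate that predicts two fusion weights from the concatenated modalities.
+        self.gate = nn.Sequential(nn.Linear(2 * dim, 64), nn.ReLU(),
+                                  nn.Linear(64, 2))
+
+    def forward(self, audio: torch.Tensor, visual: torch.Tensor) -> torch.Tensor:
+        w = torch.softmax(self.gate(torch.cat([audio, visual], dim=-1)), dim=-1)
+        return w[..., :1] * audio + w[..., 1:] * visual
+
+fusion = DynamicWeightFusion(dim=256)
+fused = fusion(torch.randn(8, 256), torch.randn(8, 256))   # (8, 256)
+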
+
+
+
+
+ + ☆ Trajectory Regularization Enhances Self-Supervised Geometric + Representation + + +
+ Self-supervised learning (SSL) has proven effective in learning high-quality +representations for various downstream tasks, with a primary focus on semantic +tasks. However, its application in geometric tasks remains underexplored, +partially due to the absence of a standardized evaluation method for geometric +representations. To address this gap, we introduce a new pose-estimation +benchmark for assessing SSL geometric representations, which demands training +without semantic or pose labels and achieving proficiency in both semantic and +geometric downstream tasks. On this benchmark, we study enhancing SSL geometric +representations without sacrificing semantic classification accuracy. We find +that leveraging mid-layer representations improves pose-estimation performance +by 10-20%. Further, we introduce an unsupervised trajectory-regularization +loss, which improves performance by an additional 4% and improves +generalization ability on out-of-distribution data. We hope the proposed +benchmark and methods offer new insights and improvements in self-supervised +geometric representation learning. + +
+
+
+
+
+ + ☆ DreamFlow: High-Quality Text-to-3D Generation by Approximating + Probability Flow ICLR 2024 + + +
+ Recent progress in text-to-3D generation has been achieved through the
+utilization of score distillation methods: they make use of the pre-trained
+text-to-image (T2I) diffusion models by distilling via the diffusion model
+training objective. However, such an approach inevitably results in the use of
+random timesteps at each update, which increases the variance of the gradient
+and ultimately prolongs the optimization process. In this paper, we propose to
+enhance the text-to-3D optimization by leveraging the T2I diffusion prior in
+the generative sampling process with a predetermined timestep schedule. To this
+end, we interpret text-to-3D optimization as a multi-view image-to-image
+translation problem, and propose a solution by approximating the probability
+flow. By leveraging the proposed novel optimization algorithm, we design
+DreamFlow, a practical three-stage coarse-to-fine text-to-3D optimization
+framework that enables fast generation of high-quality and high-resolution
+(i.e., 1024x1024) 3D contents. For example, we demonstrate that DreamFlow is 5
+times faster than the existing state-of-the-art text-to-3D method, while
+producing more photorealistic 3D contents. Visit our project page
+(https://kyungmnlee.github.io/dreamflow.github.io/) for visualizations.
+
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ GPT-Connect: Interaction between Text-Driven Human Motion Generator and + 3D Scenes in a Training-free Manner + + +
+ Recently, while text-driven human motion generation has received massive
+research attention, most existing text-driven motion generators are designed
+only to generate motion sequences against a blank background. In practice,
+however, human beings naturally perform their motions in 3D scenes rather than
+in a blank background. Considering this, we aim to perform scene-aware
+text-driven motion generation instead. Yet, naively training a separate
+scene-aware motion generator in a supervised way would require a large number
+of motion samples to be laboriously collected and annotated across a large
+variety of different 3D scenes. To handle this task in a more convenient
+manner, in this paper, we propose a novel GPT-Connect framework. In
+GPT-Connect, scene-aware motion sequences are generated directly using an
+existing blank-background human motion generator, by leveraging ChatGPT to
+connect that motion generator with the 3D scene in a totally training-free
+manner. Extensive experiments demonstrate the efficacy and generalizability of
+our proposed framework.
+
+
+
+
+
+
+ ☆ CLIP-VQDiffusion : Language Free Training of Text To Image generation
+ using CLIP and vector quantized diffusion model
+
+
+ There has been significant progress in text conditional image generation
+models. Recent advancements in this field depend not only on improvements in
+model structures, but also on vast quantities of text-image paired datasets.
+However, creating these kinds of datasets is very costly and requires a
+substantial amount of labor. Famous face datasets don't have corresponding text
+captions, making it difficult to develop text conditional image generation
+models on these datasets. Some research has focused on developing text to image
+generation models using only images without text captions. Here, we propose
+CLIP-VQDiffusion, which leverages the pretrained CLIP model to provide
+multimodal text-image representations and strong image generation capabilities.
+On the FFHQ dataset, our model outperformed previous state-of-the-art methods
+by 4.4% in clipscore and generated very realistic images even when the text was
+both in and out of distribution. The pretrained models and codes will soon be
+available at https://github.com/INFINIQ-AI1/CLIPVQDiffusion
+
+
+ comment: 15 pages, 9 figures +
+
+
+
+
+ + ☆ STAG4D: Spatial-Temporal Anchored Generative 4D Gaussians + + +
+ Recent progress in pre-trained diffusion models and 3D generation have
+spurred interest in 4D content creation. However, achieving high-fidelity 4D
+generation with spatial-temporal consistency remains a challenge. In this work,
+we propose STAG4D, a novel framework that combines pre-trained diffusion models
+with dynamic 3D Gaussian splatting for high-fidelity 4D generation. Drawing
+inspiration from 3D generation techniques, we utilize a multi-view diffusion
+model to initialize multi-view images anchoring on the input video frames,
+where the video can be either real-world captured or generated by a video
+diffusion model. To ensure the temporal consistency of the multi-view sequence
+initialization, we introduce a simple yet effective fusion strategy to leverage
+the first frame as a temporal anchor in the self-attention computation. With
+the almost consistent multi-view sequences, we then apply score distillation
+sampling to optimize the 4D Gaussian point cloud. The 4D Gaussian splatting is
+specially crafted for the generation task, where an adaptive densification
+strategy is proposed to mitigate the unstable Gaussian gradient for robust
+optimization. Notably, the proposed pipeline does not require any pre-training
+or fine-tuning of diffusion networks, offering a more accessible and practical
+solution for the 4D generation task. Extensive experiments demonstrate that our
+method outperforms prior 4D generation works in rendering quality,
+spatial-temporal consistency, and generation robustness, setting a new
+state-of-the-art for 4D generation from diverse inputs, including text, image,
+and video.
+
+
+
+
+
+ + ☆ Survey on Modeling of Articulated Objects + + +
+ 3D modeling of articulated objects is a research problem within computer +vision, graphics, and robotics. Its objective is to understand the shape and +motion of the articulated components, represent the geometry and mobility of +object parts, and create realistic models that reflect articulated objects in +the real world. This survey provides a comprehensive overview of the current +state-of-the-art in 3D modeling of articulated objects, with a specific focus +on the task of articulated part perception and articulated object creation +(reconstruction and generation). We systematically review and discuss the +relevant literature from two perspectives: geometry processing and articulation +modeling. Through this survey, we highlight the substantial progress made in +these areas, outline the ongoing challenges, and identify gaps for future +research. Our survey aims to serve as a foundational reference for researchers +and practitioners in computer vision and graphics, offering insights into the +complexities of articulated object modeling. + +
+
+
+
+
+ + ☆ Defying Imbalanced Forgetting in Class Incremental Learning AAAI2024 + + +
+ We observe a high level of imbalance in the accuracy of different classes in +the same old task for the first time. This intriguing phenomenon, discovered in +replay-based Class Incremental Learning (CIL), highlights the imbalanced +forgetting of learned classes, as their accuracy is similar before the +occurrence of catastrophic forgetting. This discovery remains previously +unidentified due to the reliance on average incremental accuracy as the +measurement for CIL, which assumes that the accuracy of classes within the same +task is similar. However, this assumption is invalid in the face of +catastrophic forgetting. Further empirical studies indicate that this +imbalanced forgetting is caused by conflicts in representation between +semantically similar old and new classes. These conflicts are rooted in the +data imbalance present in replay-based CIL methods. Building on these insights, +we propose CLass-Aware Disentanglement (CLAD) to predict the old classes that +are more likely to be forgotten and enhance their accuracy. Importantly, CLAD +can be seamlessly integrated into existing CIL methods. Extensive experiments +demonstrate that CLAD consistently improves current replay-based methods, +resulting in performance gains of up to 2.56%. + +
+
+ comment: AAAI2024 +
+
+
+
+
+ + ☆ Web-based Melanoma Detection + + +
+ Melanoma is the most aggressive form of skin cancer, and early detection can
+significantly increase survival rates and prevent cancer spread. However,
+developing reliable automated detection techniques is difficult due to the lack
+of standardized datasets and evaluation methods. This study introduces a
+unified melanoma classification approach that supports 54 combinations of 11
+datasets and 24 state-of-the-art deep learning architectures. It enables a fair
+comparison of 1,296 experiments and results in a lightweight model deployable
+to the web-based MeshNet architecture named Mela-D. This approach can run up to
+33x faster by reducing parameters 24x, while yielding 88.8\% accuracy,
+comparable with ResNet50 on previously unseen images. This allows efficient and
+accurate melanoma detection in real-world settings that can run on
+consumer-level hardware.
+
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ☆ Geometric Generative Models based on Morphological Equivariant PDEs and + GANs + + +
+ Content and image generation consist in creating or generating data from
+noisy information by extracting specific features such as texture, edges, and
+other thin image structures. We are interested here in generative models, and
+two main problems are addressed: first, improving specific feature extraction
+while accounting for intrinsic geometric features at multiple scales; and
+second, making the network equivariant in order to reduce its complexity and
+provide a geometric interpretability. To proceed, we propose a geometric
+generative model based on an equivariant partial differential equation (PDE)
+for group convolution neural networks (G-CNNs), so called PDE-G-CNNs, built on
+morphology operators and generative adversarial networks (GANs). Equivariant
+morphological PDE layers are composed of multiscale dilations and erosions
+formulated in Riemannian manifolds, while group symmetries are defined on a Lie
+group. We take advantage of the Lie group structure to properly integrate the
+equivariance in layers, and are able to use the Riemannian metric to solve the
+multiscale morphological operations. Each point of the Lie group is associated
+with a unique point in the manifold, which helps us derive a metric on the
+Riemannian manifold from a tensor field invariant under the Lie group so that
+the induced metric has the same symmetries. The proposed geometric
+morphological GAN (GM-GAN) is obtained by using the proposed morphological
+equivariant convolutions in PDE-G-CNNs to bring nonlinearity in classical CNNs.
+GM-GAN is evaluated on MNIST data and compared with GANs. Preliminary results
+show that the GM-GAN model outperforms classical GANs.
+
+
+
+
+
+ + ☆ GaNI: Global and Near Field Illumination Aware Neural Inverse Rendering + + +
+ In this paper, we present GaNI, a Global and Near-field Illumination-aware +neural inverse rendering technique that can reconstruct geometry, albedo, and +roughness parameters from images of a scene captured with co-located light and +camera. Existing inverse rendering techniques with co-located light-camera +focus on single objects only, without modeling global illumination and +near-field lighting more prominent in scenes with multiple objects. We +introduce a system that solves this problem in two stages; we first reconstruct +the geometry powered by neural volumetric rendering NeuS, followed by inverse +neural radiosity that uses the previously predicted geometry to estimate albedo +and roughness. However, such a naive combination fails and we propose multiple +technical contributions that enable this two-stage approach. We observe that +NeuS fails to handle near-field illumination and strong specular reflections +from the flashlight in a scene. We propose to implicitly model the effects of +near-field illumination and introduce a surface angle loss function to handle +specular reflections. Similarly, we observe that invNeRad assumes constant +illumination throughout the capture and cannot handle moving flashlights during +capture. We propose a light position-aware radiance cache network and +additional smoothness priors on roughness to reconstruct reflectance. +Experimental evaluation on synthetic and real data shows that our method +outperforms the existing co-located light-camera-based inverse rendering +techniques. Our approach produces significantly better reflectance and slightly +better geometry than capture strategies that do not require a dark room. + +
+
+
+
+
+ + ☆ RetiGen: A Framework for Generalized Retinal Diagnosis Using Multi-View + Fundus Images + + +
+ This study introduces a novel framework for enhancing domain generalization +in medical imaging, specifically focusing on utilizing unlabelled multi-view +colour fundus photographs. Unlike traditional approaches that rely on +single-view imaging data and face challenges in generalizing across diverse +clinical settings, our method leverages the rich information in the unlabelled +multi-view imaging data to improve model robustness and accuracy. By +incorporating a class balancing method, a test-time adaptation technique and a +multi-view optimization strategy, we address the critical issue of domain shift +that often hampers the performance of machine learning models in real-world +applications. Experiments comparing various state-of-the-art domain +generalization and test-time optimization methodologies show that our approach +consistently outperforms when combined with existing baseline and +state-of-the-art methods. We also show our online method improves all existing +techniques. Our framework demonstrates improvements in domain generalization +capabilities and offers a practical solution for real-world deployment by +facilitating online adaptation to new, unseen datasets. Our code is available +at https://github.com/zgy600/RetiGen . + +
+
+
+
+
+ + ☆ Semantic Gaussians: Open-Vocabulary Scene Understanding with 3D Gaussian + Splatting + + +
+ Open-vocabulary 3D scene understanding presents a significant challenge in
+computer vision, with wide-ranging applications in embodied agents and
+augmented reality systems. Previous approaches have adopted Neural Radiance
+Fields (NeRFs) to analyze 3D scenes. In this paper, we introduce Semantic
+Gaussians, a novel open-vocabulary scene understanding approach based on 3D
+Gaussian Splatting. Our key idea is distilling pre-trained 2D semantics into 3D
+Gaussians. We design a versatile projection approach that maps various 2D
+semantic features from pre-trained image encoders into a novel semantic
+component of 3D Gaussians, without the additional training required by NeRFs.
+We further build a 3D semantic network that directly predicts the semantic
+component from raw 3D Gaussians for fast inference. We explore several
+applications of Semantic Gaussians: semantic segmentation on ScanNet-20, where
+our approach attains a 4.2% mIoU and 4.0% mAcc improvement over prior
+open-vocabulary scene understanding counterparts; object part segmentation,
+scene editing, and spatial-temporal segmentation with better qualitative
+results over 2D and 3D baselines, highlighting its versatility and
+effectiveness on supporting diverse downstream tasks.
+
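+ A small sketch of the projection idea, assuming per-Gaussian features are
+obtained by projecting Gaussian centers with a pinhole camera, sampling the 2D
+feature map at the projected pixel, and averaging over views; the camera
+conventions and the function name lift_features are illustrative assumptions,
+not the paper's pipeline.
+
+import numpy as np
+
+def lift_features(centers, feats_2d, K, w2c):
+    """centers: (N,3) world points; feats_2d: list of (H,W,C) feature maps;
+    K: (3,3) intrinsics; w2c: list of (4,4) world-to-camera matrices."""
+    n, c = centers.shape[0], feats_2d[0].shape[-1]
+    acc, cnt = np.zeros((n, c)), np.zeros(n)
+    homo = np.concatenate([centers, np.ones((n, 1))], axis=1)
+    for fmap, T in zip(feats_2d, w2c):
+        cam = (T @ homo.T).T[:, :3]
+        vis = cam[:, 2] > 0.1                      # keep points in front of the camera
+        proj = (K @ cam.T).T
+        uv = (proj[:, :2] / np.maximum(proj[:, 2:3], 1e-6)).round().astype(int)
+        h, w = fmap.shape[:2]
+        inside = vis & (uv[:, 0] >= 0) & (uv[:, 0] < w) & (uv[:, 1] >= 0) & (uv[:, 1] < h)
+        acc[inside] += fmap[uv[inside, 1], uv[inside, 0]]
+        cnt[inside] += 1
+    return acc / np.maximum(cnt[:, None], 1)       # per-Gaussian averaged feature
+
+# Toy usage with one random view and a hypothetical 16-channel feature map.
+feats = lift_features(np.random.randn(100, 3), [np.random.rand(48, 64, 16)],
+                      np.array([[60., 0, 32], [0, 60., 24], [0, 0, 1]]), [np.eye(4)])
+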
+
+ comment: Project page: see https://semantic-gaussians.github.io +
+
+
+
+
+ + ☆ InterFusion: Text-Driven Generation of 3D Human-Object Interaction + + +
+ In this study, we tackle the complex task of generating 3D human-object +interactions (HOI) from textual descriptions in a zero-shot text-to-3D manner. +We identify and address two key challenges: the unsatisfactory outcomes of +direct text-to-3D methods in HOI, largely due to the lack of paired +text-interaction data, and the inherent difficulties in simultaneously +generating multiple concepts with complex spatial relationships. To effectively +address these issues, we present InterFusion, a two-stage framework +specifically designed for HOI generation. InterFusion involves human pose +estimations derived from text as geometric priors, which simplifies the +text-to-3D conversion process and introduces additional constraints for +accurate object generation. At the first stage, InterFusion extracts 3D human +poses from a synthesized image dataset depicting a wide range of interactions, +subsequently mapping these poses to interaction descriptions. The second stage +of InterFusion capitalizes on the latest developments in text-to-3D generation, +enabling the production of realistic and high-quality 3D HOI scenes. This is +achieved through a local-global optimization process, where the generation of +human body and object is optimized separately, and jointly refined with a +global optimization of the entire scene, ensuring a seamless and contextually +coherent integration. Our experimental results affirm that InterFusion +significantly outperforms existing state-of-the-art methods in 3D HOI +generation. + +
+
+
+
+
+ + ☆ Towards Automatic Abdominal MRI Organ Segmentation: Leveraging + Synthesized Data Generated From CT Labels + + +
+ Deep learning has shown great promise in the ability to automatically +annotate organs in magnetic resonance imaging (MRI) scans, for example, of the +brain. However, despite advancements in the field, the ability to accurately +segment abdominal organs remains difficult across MR. In part, this may be +explained by the much greater variability in image appearance and severely +limited availability of training labels. The inherent nature of computed +tomography (CT) scans makes it easier to annotate, resulting in a larger +availability of expert annotations for the latter. We leverage a +modality-agnostic domain randomization approach, utilizing CT label maps to +generate synthetic images on-the-fly during training, further used to train a +U-Net segmentation network for abdominal organs segmentation. Our approach +shows comparable results compared to fully-supervised segmentation methods +trained on MR data. Our method results in Dice scores of 0.90 (0.08) and 0.91 +(0.08) for the right and left kidney respectively, compared to a pretrained +nnU-Net model yielding 0.87 (0.20) and 0.91 (0.03). We will make our code +publicly available. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Efficiently Assemble Normalization Layers and Regularization for + Federated Domain Generalization + + +
+ Domain shift is a formidable issue in Machine Learning that causes a model to +suffer from performance degradation when tested on unseen domains. Federated +Domain Generalization (FedDG) attempts to train a global model using +collaborative clients in a privacy-preserving manner that can generalize well +to unseen clients possibly with domain shift. However, most existing FedDG +methods either cause additional privacy risks of data leakage or induce +significant costs in client communication and computation, which are major +concerns in the Federated Learning paradigm. To circumvent these challenges, +here we introduce a novel architectural method for FedDG, namely gPerXAN, which +relies on a normalization scheme working with a guiding regularizer. In +particular, we carefully design Personalized eXplicitly Assembled Normalization +to enforce client models selectively filtering domain-specific features that +are biased towards local data while retaining discrimination of those features. +Then, we incorporate a simple yet effective regularizer to guide these models +in directly capturing domain-invariant representations that the global model's +classifier can leverage. Extensive experimental results on two benchmark +datasets, i.e., PACS and Office-Home, and a real-world medical dataset, +Camelyon17, indicate that our proposed method outperforms other existing +methods in addressing this particular problem. + +
+
+
+
+
+ + ☆ Forward Learning for Gradient-based Black-box Saliency Map Generation + + +
+ Gradient-based saliency maps are widely used to explain deep neural network
+decisions. However, as models become deeper and more black-box, such as in
+closed-source APIs like ChatGPT, computing gradients becomes challenging,
+hindering conventional explanation methods. In this work, we introduce a novel
+unified framework for estimating gradients in black-box settings and generating
+saliency maps to interpret model decisions. We employ the likelihood ratio
+method to estimate output-to-input gradients and utilize them for saliency map
+generation. Additionally, we propose blockwise computation techniques to
+enhance estimation accuracy. Extensive experiments in black-box settings
+validate the effectiveness of our method, demonstrating accurate gradient
+estimation and explainability of generated saliency maps. Furthermore, we
+showcase the scalability of our approach by applying it to explain GPT-Vision,
+revealing the continued relevance of gradient-based explanation methods in the
+era of large, closed-source, and black-box models.
+
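+ A generic sketch of a score-function (likelihood-ratio style) gradient
+estimate for a black-box scorer: perturb the input with Gaussian noise,
+correlate score changes with the noise, and take magnitudes as a saliency map.
+The sample count, noise scale, and baseline subtraction are assumptions; the
+paper's blockwise procedure is not reproduced here.
+
+import numpy as np
+
+def blackbox_saliency(f, x: np.ndarray, sigma: float = 0.05,
+                      n_samples: int = 256, seed: int = 0) -> np.ndarray:
+    rng = np.random.default_rng(seed)
+    base = f(x)
+    grad = np.zeros_like(x, dtype=float)
+    for _ in range(n_samples):
+        eps = rng.standard_normal(x.shape)
+        grad += (f(x + sigma * eps) - base) * eps / sigma   # variance-reduced LR term
+    return np.abs(grad / n_samples)                          # saliency = |estimated gradient|
+
+# Toy black box: the score is a weighted pixel sum (true gradient = weights).
+w = np.random.rand(8, 8)
+saliency = blackbox_saliency(lambda img: float((img * w).sum()), np.random.rand(8, 8))
+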
+
+
+
+
+ + ☆ FairerCLIP: Debiasing CLIP's Zero-Shot Predictions using Functions in + RKHSs ICLR + + +
+ Large pre-trained vision-language models such as CLIP provide compact and +general-purpose representations of text and images that are demonstrably +effective across multiple downstream zero-shot prediction tasks. However, owing +to the nature of their training process, these models have the potential to 1) +propagate or amplify societal biases in the training data and 2) learn to rely +on spurious features. This paper proposes FairerCLIP, a general approach for +making zero-shot predictions of CLIP more fair and robust to spurious +correlations. We formulate the problem of jointly debiasing CLIP's image and +text representations in reproducing kernel Hilbert spaces (RKHSs), which +affords multiple benefits: 1) Flexibility: Unlike existing approaches, which +are specialized to either learn with or without ground-truth labels, FairerCLIP +is adaptable to learning in both scenarios. 2) Ease of Optimization: FairerCLIP +lends itself to an iterative optimization involving closed-form solvers, which +leads to $4\times$-$10\times$ faster training than the existing methods. 3) +Sample Efficiency: Under sample-limited conditions, FairerCLIP significantly +outperforms baselines when they fail entirely. And, 4) Performance: +Empirically, FairerCLIP achieves appreciable accuracy gains on benchmark +fairness and spurious correlation datasets over their respective baselines. + +
+
+ comment: The Twelfth International Conference on Learning Representations + (ICLR) 2024 +
+
+
+
+
+ + ☆ MedPromptX: Grounded Multimodal Prompting for Chest X-ray Diagnosis + + +
+ Chest X-ray images are commonly used for predicting acute and chronic
+cardiopulmonary conditions, but efforts to integrate them with structured
+clinical data face challenges due to incomplete electronic health records
+(EHR). This paper introduces MedPromptX, the first model to integrate
+multimodal large language models (MLLMs), few-shot prompting (FP) and visual
+grounding (VG) to combine imagery with EHR data for chest X-ray diagnosis. A
+pre-trained MLLM is utilized to complement the missing EHR information,
+providing a comprehensive understanding of patients' medical history.
+Additionally, FP reduces the necessity for extensive training of MLLMs while
+effectively tackling the issue of hallucination. Nevertheless, the process of
+determining the optimal number of few-shot examples and selecting high-quality
+candidates can be burdensome, yet it profoundly influences model performance.
+Hence, we propose a new technique that dynamically refines few-shot data for
+real-time adjustment to new patient scenarios. Moreover, VG aids in focusing
+the model's attention on relevant regions of interest in X-ray images,
+enhancing the identification of abnormalities. We release MedPromptX-VQA, a new
+in-context visual question answering dataset encompassing interleaved image and
+EHR data derived from MIMIC-IV and MIMIC-CXR databases. Results demonstrate the
+SOTA performance of MedPromptX, achieving an 11% improvement in F1-score
+compared to the baselines. Code and data are available at
+https://github.com/BioMedIA-MBZUAI/MedPromptX.
+
+
+
+
+
+ + ☆ U-ARE-ME: Uncertainty-Aware Rotation Estimation in Manhattan + Environments + + +
+ Camera rotation estimation from a single image is a challenging task, often +requiring depth data and/or camera intrinsics, which are generally not +available for in-the-wild videos. Although external sensors such as inertial +measurement units (IMUs) can help, they often suffer from drift and are not +applicable in non-inertial reference frames. We present U-ARE-ME, an algorithm +that estimates camera rotation along with uncertainty from uncalibrated RGB +images. Using a Manhattan World assumption, our method leverages the per-pixel +geometric priors encoded in single-image surface normal predictions and +performs optimisation over the SO(3) manifold. Given a sequence of images, we +can use the per-frame rotation estimates and their uncertainty to perform +multi-frame optimisation, achieving robustness and temporal consistency. Our +experiments demonstrate that U-ARE-ME performs comparably to RGB-D methods and +is more robust than sparse feature-based SLAM methods. We encourage the reader +to view the accompanying video at https://callum-rhodes.github.io/U-ARE-ME for +a visual overview of our method. + +
+
+ comment: For the project page and video see + https://callum-rhodes.github.io/U-ARE-ME +
+
+
+
+
+ + ☆ Augmented Reality Warnings in Roadway Work Zones: Evaluating the Effect + of Modality on Worker Reaction Times + + +
+ Given the aging highway infrastructure requiring extensive rebuilding and
+enhancements, and the consequent rise in the number of work zones, there is an
+urgent need to develop advanced safety systems to protect workers. While
+Augmented Reality (AR) holds significant potential for delivering warnings to
+workers, its integration into roadway work zones remains relatively unexplored.
+The primary objective of this study is to improve safety measures within
+roadway work zones by conducting an extensive analysis of how different
+combinations of multimodal AR warnings influence the reaction times of workers.
+This paper addresses this gap through a series of experiments that aim to
+replicate the distinctive conditions of roadway work zones, both in real-world
+and virtual reality environments. Our approach comprises three key components:
+an advanced AR system prototype, a VR simulation of AR functionality within the
+work zone environment, and the Wizard of Oz technique to synchronize user
+experiences across experiments. To assess reaction times, we leverage both the
+simple reaction time (SRT) technique and an innovative vision-based metric that
+utilizes real-time pose estimation. By conducting five experiments in
+controlled outdoor work zones and indoor VR settings, our study provides
+valuable information on how various multimodal AR warnings impact workers'
+reaction times. Furthermore, our findings reveal the disparities in reaction
+times between VR simulations and real-world scenarios, thereby gauging VR's
+capability to mirror the dynamics of roadway work zones. Finally, our results
+substantiate the potential and reliability of vision-based reaction time
+measurements. These insights resonate well with those derived using the SRT
+technique, underscoring the viability of this approach for tangible real-world
+uses.
+
+
+
+
+
+ + ☆ Do not trust what you trust: Miscalibration in Semi-supervised Learning + + +
+ State-of-the-art semi-supervised learning (SSL) approaches rely on highly +confident predictions to serve as pseudo-labels that guide the training on +unlabeled samples. An inherent drawback of this strategy stems from the quality +of the uncertainty estimates, as pseudo-labels are filtered only based on their +degree of uncertainty, regardless of the correctness of their predictions. +Thus, assessing and enhancing the uncertainty of network predictions is of +paramount importance in the pseudo-labeling process. In this work, we +empirically demonstrate that SSL methods based on pseudo-labels are +significantly miscalibrated, and formally demonstrate the minimization of the +min-entropy, a lower bound of the Shannon entropy, as a potential cause for +miscalibration. To alleviate this issue, we integrate a simple penalty term, +which enforces the logit distances of the predictions on unlabeled samples to +remain low, preventing the network predictions to become overconfident. +Comprehensive experiments on a variety of SSL image classification benchmarks +demonstrate that the proposed solution systematically improves the calibration +performance of relevant SSL models, while also enhancing their discriminative +power, being an appealing addition to tackle SSL tasks. + +
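+ A minimal sketch of the kind of penalty described above, assuming a
+margin-style term that keeps the gap between the top logit and the remaining
+logits small on unlabeled samples; the margin value, weighting, and function
+name are assumptions, and the paper's exact penalty may differ.
+
+import torch
+
+def logit_distance_penalty(logits: torch.Tensor, margin: float = 10.0) -> torch.Tensor:
+    top = logits.max(dim=1, keepdim=True).values
+    gaps = top - logits                              # distance of every logit to the max
+    return torch.clamp(gaps - margin, min=0).mean()  # only penalize gaps above the margin
+
+unlab_logits = torch.randn(32, 10) * 8
+loss_ssl = torch.tensor(0.0)                         # placeholder for the usual SSL loss
+loss = loss_ssl + 0.1 * logit_distance_penalty(unlab_logits)
+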
+
+
+
+
+ + ☆ A2DMN: Anatomy-Aware Dilated Multiscale Network for Breast Ultrasound + Semantic Segmentation + + +
+ In recent years, convolutional neural networks for semantic segmentation of +breast ultrasound (BUS) images have shown great success; however, two major +challenges still exist. 1) Most current approaches inherently lack the ability +to utilize tissue anatomy, resulting in misclassified image regions. 2) They +struggle to produce accurate boundaries due to the repeated down-sampling +operations. To address these issues, we propose a novel breast anatomy-aware +network for capturing fine image details and a new smoothness term that encodes +breast anatomy. It incorporates context information across multiple spatial +scales to generate more accurate semantic boundaries. Extensive experiments are +conducted to compare the proposed method and eight state-of-the-art approaches +using a BUS dataset with 325 images. The results demonstrate the proposed +method significantly improves the segmentation of the muscle, mammary, and +tumor classes and produces more accurate fine details of tissue boundaries. + +
+
+
+
+
+ + ☆ An Optimization Framework to Enforce Multi-View Consistency for + Texturing 3D Meshes Using Pre-Trained Text-to-Image Models + + +
+ A fundamental problem in the texturing of 3D meshes using pre-trained +text-to-image models is to ensure multi-view consistency. State-of-the-art +approaches typically use diffusion models to aggregate multi-view inputs, where +common issues are the blurriness caused by the averaging operation in the +aggregation step or inconsistencies in local features. This paper introduces an +optimization framework that proceeds in four stages to achieve multi-view +consistency. Specifically, the first stage generates an over-complete set of 2D +textures from a predefined set of viewpoints using an MV-consistent diffusion +process. The second stage selects a subset of views that are mutually +consistent while covering the underlying 3D model. We show how to achieve this +goal by solving semi-definite programs. The third stage performs non-rigid +alignment to align the selected views across overlapping regions. The fourth +stage solves an MRF problem to associate each mesh face with a selected view. +In particular, the third and fourth stages are iterated, with the cuts obtained +in the fourth stage encouraging non-rigid alignment in the third stage to focus +on regions close to the cuts. Experimental results show that our approach +significantly outperforms baseline approaches both qualitatively and +quantitatively. + +
+
+
+
+
+ + ☆ Language-Based Depth Hints for Monocular Depth Estimation + + +
+ Monocular depth estimation (MDE) is inherently ambiguous, as a given image +may result from many different 3D scenes and vice versa. To resolve this +ambiguity, an MDE system must make assumptions about the most likely 3D scenes +for a given input. These assumptions can be either explicit or implicit. In +this work, we demonstrate the use of natural language as a source of an +explicit prior about the structure of the world. The assumption is made that +human language encodes the likely distribution in depth-space of various +objects. We first show that a language model encodes this implicit bias during +training, and that it can be extracted using a very simple learned approach. We +then show that this prediction can be provided as an explicit source of +assumption to an MDE system, using an off-the-shelf instance segmentation model +that provides the labels used as the input to the language model. We +demonstrate the performance of our method on the NYUD2 dataset, showing +improvement compared to the baseline and to random controls. + +
+
+ comment: 8 pages, 1 figure. Work originally done in June 2022 +
+
+
+
+
+ + ☆ Pixel-GS: Density Control with Pixel-aware Gradient for 3D Gaussian + Splatting + + +
+ 3D Gaussian Splatting (3DGS) has demonstrated impressive novel view synthesis
+results while advancing real-time rendering performance. However, it relies
+heavily on the quality of the initial point cloud, resulting in blurring and
+needle-like artifacts in areas with insufficient initializing points. This is
+mainly attributed to the point cloud growth condition in 3DGS, which only
+considers the average gradient magnitude of points from observable views and
+thereby fails to grow large Gaussians that are observable from many viewpoints
+but covered only near the boundaries in many of those views. To this end, we
+propose a novel method, named Pixel-GS, to take into account the number of
+pixels covered by the Gaussian in each view during the computation of the
+growth condition. We regard the covered pixel numbers as the weights to
+dynamically average the gradients from different views, such that the growth of
+large Gaussians can be promoted. As a result, points within the areas with
+insufficient initializing points can be grown more effectively, leading to a
+more accurate and detailed reconstruction. In addition, we propose a simple yet
+effective strategy to scale the gradient field according to the distance to the
+camera, to suppress the growth of floaters near the camera. Extensive
+experiments both qualitatively and quantitatively demonstrate that our method
+achieves state-of-the-art rendering quality while maintaining real-time
+rendering speed, on the challenging Mip-NeRF 360 and Tanks & Temples datasets.
+
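+ The weighting idea can be summarized in a short sketch; the shapes, threshold
+value, and function names below are assumptions for illustration, not the
+official Pixel-GS code:
+
+import torch
+
+def pixel_weighted_grad_norm(view_grads, pixel_counts):
+    # Average the per-view screen-space gradient magnitudes of one Gaussian,
+    # weighting each view by the number of pixels the Gaussian covers there.
+    #   view_grads:   (V, 2) accumulated positional gradients per view
+    #   pixel_counts: (V,)   pixels covered by the Gaussian in each view
+    norms = view_grads.norm(dim=-1)                          # (V,)
+    weights = pixel_counts / pixel_counts.sum().clamp(min=1)
+    return (weights * norms).sum()
+
+def should_densify(view_grads, pixel_counts, grad_threshold=2e-4):
+    # Replace the plain average used by vanilla 3DGS with the weighted one.
+    return pixel_weighted_grad_norm(view_grads, pixel_counts) > grad_threshold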
+
+
+
+
+ + ☆ Evaluating GPT-4 with Vision on Detection of Radiological Findings on + Chest Radiographs + + +
+ The study examines the application of GPT-4V, a multi-modal large language +model equipped with visual recognition, in detecting radiological findings from +a set of 100 chest radiographs and suggests that GPT-4V is currently not ready +for real-world diagnostic usage in interpreting chest radiographs. + +
+
+
+
+
+ + ☆ Medical Image Data Provenance for Medical Cyber-Physical System + + +
+ Continuous advancements in medical technology have led to the creation of +affordable mobile imaging devices suitable for telemedicine and remote +monitoring. However, the rapid examination of large populations poses +challenges, including the risk of fraudulent practices by healthcare +professionals and social workers exchanging unverified images via mobile +applications. To mitigate these risks, this study proposes using watermarking +techniques to embed a device fingerprint (DFP) into captured images, ensuring +data provenance. The DFP, representing the unique attributes of the capturing +device and raw image, is embedded into raw images before storage, thus enabling +verification of image authenticity and source. Moreover, a robust remote +validation method is introduced to authenticate images, enhancing the integrity +of medical image data in interconnected healthcare systems. Through a case +study on mobile fundus imaging, the effectiveness of the proposed framework is +evaluated in terms of computational efficiency, image quality, security, and +trustworthiness. This approach is suitable for a range of applications, +including telemedicine, the Internet of Medical Things (IoMT), eHealth, and +Medical Cyber-Physical Systems (MCPS) applications, providing a reliable means +to maintain data provenance in diagnostic settings utilizing medical images or +videos. + +
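+ As a simplified illustration of embedding a device fingerprint (DFP) into an
+image, the following sketch hashes device attributes together with the raw
+pixels and hides the digest in the least-significant bits; the paper's actual
+watermarking scheme is presumably more robust, so treat this as a toy example:
+
+import hashlib
+import numpy as np
+
+def device_fingerprint(device_info: str, raw_image: np.ndarray) -> bytes:
+    # Hash of device attributes and raw pixel data (illustrative DFP).
+    return hashlib.sha256(device_info.encode() + raw_image.tobytes()).digest()
+
+def embed_lsb(image: np.ndarray, payload: bytes) -> np.ndarray:
+    # Embed the payload into the least-significant bits of a uint8 image.
+    bits = np.unpackbits(np.frombuffer(payload, dtype=np.uint8))
+    flat = image.flatten().copy()
+    assert bits.size <= flat.size, "image too small for payload"
+    flat[: bits.size] = (flat[: bits.size] & 0xFE) | bits
+    return flat.reshape(image.shape)
+
+def extract_lsb(image: np.ndarray, n_bytes: int) -> bytes:
+    # Recover the embedded payload for remote validation.
+    bits = image.flatten()[: n_bytes * 8] & 1
+    return np.packbits(bits).tobytes()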
+
+
+
+
+ + ☆ Improving Forward Compatibility in Class Incremental Learning by + Increasing Representation Rank and Feature Richness + + +
+ Class Incremental Learning (CIL) constitutes a pivotal subfield within +continual learning, aimed at enabling models to progressively learn new +classification tasks while retaining knowledge obtained from prior tasks. +Although previous studies have predominantly focused on backward compatible +approaches to mitigate catastrophic forgetting, recent investigations have +introduced forward compatible methods to enhance performance on novel tasks and +complement existing backward compatible methods. In this study, we introduce an +effective-Rank based Feature Richness enhancement (RFR) method, designed for +improving forward compatibility. Specifically, this method increases the +effective rank of representations during the base session, thereby facilitating +the incorporation of more informative features pertinent to unseen novel tasks. +Consequently, RFR achieves dual objectives in backward and forward +compatibility: minimizing feature extractor modifications and enhancing novel +task performance, respectively. To validate the efficacy of our approach, we +establish a theoretical connection between effective rank and the Shannon +entropy of representations. Subsequently, we conduct comprehensive experiments +by integrating RFR into eleven well-known CIL methods. Our results demonstrate +the effectiveness of our approach in enhancing novel-task performance while +mitigating catastrophic forgetting. Furthermore, our method notably improves +the average incremental accuracy across all eleven cases examined. + +
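+ The effective rank that RFR increases can be computed from the Shannon
+entropy of the normalized singular values (Roy & Vetterli, 2007); a minimal
+sketch, assuming a batch of features as input:
+
+import torch
+
+def effective_rank(features: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
+    # features: (N, D) representation matrix from the base session.
+    s = torch.linalg.svdvals(features)      # singular values
+    p = s / (s.sum() + eps)                 # normalize to a distribution
+    entropy = -(p * torch.log(p + eps)).sum()
+    return torch.exp(entropy)               # exp(entropy) = effective rank
+
+# Illustrative use as a regularizer (the exact loss used by RFR may differ):
+# loss = task_loss - lambda_rfr * torch.log(effective_rank(feats))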
+
+
+
+
+ + ☆ Data-centric Prediction Explanation via Kernelized Stein Discrepancy + + +
+ Existing example-based prediction explanation methods often bridge test and +training data points through the model's parameters or latent representations. +While these methods offer clues to the causes of model predictions, they often +exhibit innate shortcomings, such as incurring significant computational +overhead or producing coarse-grained explanations. This paper presents a +Highly-precise and Data-centric Explanation (HD-Explain), a straightforward +prediction explanation method exploiting properties of Kernelized Stein +Discrepancy (KSD). Specifically, the KSD uniquely defines a parameterized +kernel function for a trained model that encodes model-dependent data +correlation. By leveraging the kernel function, one can identify training +samples that provide the best predictive support to a test point efficiently. +We conducted thorough analyses and experiments across multiple classification +domains, where we show that HD-Explain outperforms existing methods from +various aspects, including 1) preciseness (fine-grained explanation), 2) +consistency, and 3) computation efficiency, leading to a surprisingly simple, +effective, and robust prediction explanation solution. + +
+
+
+
+
+ + ♻ ☆ Gaussian-SLAM: Photo-realistic Dense SLAM with Gaussian Splatting + + +
+ We present a dense simultaneous localization and mapping (SLAM) method that +uses 3D Gaussians as a scene representation. Our approach enables +interactive-time reconstruction and photo-realistic rendering from real-world +single-camera RGBD videos. To this end, we propose a novel effective strategy +for seeding new Gaussians for newly explored areas and their effective online +optimization that is independent of the scene size and thus scalable to larger +scenes. This is achieved by organizing the scene into sub-maps which are +independently optimized and do not need to be kept in memory. We further +accomplish frame-to-model camera tracking by minimizing photometric and +geometric losses between the input and rendered frames. The Gaussian +representation allows for high-quality photo-realistic real-time rendering of +real-world scenes. Evaluation on synthetic and real-world datasets demonstrates +competitive or superior performance in mapping, tracking, and rendering +compared to existing neural dense SLAM methods. + +
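+ Frame-to-model tracking of the kind described above boils down to minimizing
+rendering residuals with respect to the camera pose; a hedged sketch (the
+masking and weighting used by Gaussian-SLAM are assumptions):
+
+import torch
+
+def tracking_loss(render_rgb, render_depth, gt_rgb, gt_depth,
+                  valid_mask, lambda_depth=0.5):
+    # render_* are differentiable renders of the Gaussian sub-map under the
+    # current pose estimate; gt_* come from the RGBD frame.
+    # render_rgb/gt_rgb: (H, W, 3); depths: (H, W); valid_mask: (H, W) bool.
+    photometric = (render_rgb - gt_rgb).abs()[valid_mask].mean()
+    geometric = (render_depth - gt_depth).abs()[valid_mask].mean()
+    return photometric + lambda_depth * geometric
+
+# The pose parameters are then updated by gradient descent on this loss.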
+
+
+
+
+ + ♻ ☆ Videoshop: Localized Semantic Video Editing with Noise-Extrapolated + Diffusion Inversion + + +
+ We introduce Videoshop, a training-free video editing algorithm for localized +semantic edits. Videoshop allows users to use any editing software, including +Photoshop and generative inpainting, to modify the first frame; it +automatically propagates those changes, with semantic, spatial, and temporally +consistent motion, to the remaining frames. Unlike existing methods that enable +edits only through imprecise textual instructions, Videoshop allows users to +add or remove objects, semantically change objects, insert stock photos into +videos, etc. with fine-grained control over locations and appearance. We +achieve this through image-based video editing by inverting latents with noise +extrapolation, from which we generate videos conditioned on the edited image. +Videoshop produces higher quality edits against 6 baselines on 2 editing +benchmarks using 10 evaluation metrics. + +
+
+ comment: Project page at https://videoshop-editing.github.io/ +
+
+
+
+
+ + ♻ ☆ VideoPoet: A Large Language Model for Zero-Shot Video Generation + + +
+ We present VideoPoet, a language model capable of synthesizing high-quality +video, with matching audio, from a large variety of conditioning signals. +VideoPoet employs a decoder-only transformer architecture that processes +multimodal inputs -- including images, videos, text, and audio. The training +protocol follows that of Large Language Models (LLMs), consisting of two +stages: pretraining and task-specific adaptation. During pretraining, VideoPoet +incorporates a mixture of multimodal generative objectives within an +autoregressive Transformer framework. The pretrained LLM serves as a foundation +that can be adapted for a range of video generation tasks. We present empirical +results demonstrating the model's state-of-the-art capabilities in zero-shot +video generation, specifically highlighting VideoPoet's ability to generate +high-fidelity motions. Project page: http://sites.research.google/videopoet/ + +
+
+ comment: Project page: http://sites.research.google/videopoet/ +
+
+
+
+
+ + ♻ ☆ MM1: Methods, Analysis & Insights from Multimodal LLM Pre-training + + +
+ In this work, we discuss building performant Multimodal Large Language Models
+(MLLMs). In particular, we study the importance of various architecture
+components and data choices. Through careful and comprehensive ablations of the
+image encoder, the vision language connector, and various pre-training data
+choices, we identified several crucial design lessons. For example, we
+demonstrate that, for large-scale multimodal pre-training, using a careful mix
+of image-caption, interleaved image-text, and text-only data is crucial for
+achieving state-of-the-art (SOTA) few-shot results across multiple benchmarks,
+compared to other published pre-training results. Further, we show that the
+image encoder, together with the image resolution and the image token count,
+has a substantial impact, while the vision-language connector design is of
+comparatively negligible importance. By scaling up the presented recipe, we
+build MM1, a family of multimodal models up to 30B parameters, including both
+dense models and mixture-of-experts (MoE) variants, that are SOTA in
+pre-training metrics and achieve competitive performance after supervised
+fine-tuning on a range of established multimodal benchmarks. Thanks to
+large-scale pre-training, MM1 enjoys appealing properties such as enhanced
+in-context learning and multi-image reasoning, enabling few-shot
+chain-of-thought prompting.
+
+
+
+
+
+ + ♻ ☆ SkySense: A Multi-Modal Remote Sensing Foundation Model Towards + Universal Interpretation for Earth Observation Imagery CVPR2024 + + +
+ Prior studies on Remote Sensing Foundation Model (RSFM) reveal immense +potential towards a generic model for Earth Observation. Nevertheless, these +works primarily focus on a single modality without temporal and geo-context +modeling, hampering their capabilities for diverse tasks. In this study, we +present SkySense, a generic billion-scale model, pre-trained on a curated +multi-modal Remote Sensing Imagery (RSI) dataset with 21.5 million temporal +sequences. SkySense incorporates a factorized multi-modal spatiotemporal +encoder taking temporal sequences of optical and Synthetic Aperture Radar (SAR) +data as input. This encoder is pre-trained by our proposed Multi-Granularity +Contrastive Learning to learn representations across different modal and +spatial granularities. To further enhance the RSI representations by the +geo-context clue, we introduce Geo-Context Prototype Learning to learn +region-aware prototypes upon RSI's multi-modal spatiotemporal features. To our +best knowledge, SkySense is the largest Multi-Modal RSFM to date, whose modules +can be flexibly combined or used individually to accommodate various tasks. It +demonstrates remarkable generalization capabilities on a thorough evaluation +encompassing 16 datasets over 7 tasks, from single- to multi-modal, static to +temporal, and classification to localization. SkySense surpasses 18 recent +RSFMs in all test scenarios. Specifically, it outperforms the latest models +such as GFM, SatLas and Scale-MAE by a large margin, i.e., 2.76%, 3.67% and +3.61% on average respectively. We will release the pre-trained weights to +facilitate future research and Earth Observation applications. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Fast ODE-based Sampling for Diffusion Models in Around 5 Steps CVPR 2024 + + +
+ Sampling from diffusion models can be treated as solving the corresponding
+ordinary differential equations (ODEs), with the aim of obtaining an accurate
+solution with as few function evaluations (NFE) as possible. Recently, various
+fast samplers utilizing higher-order ODE solvers have emerged and achieved
+better performance than the initial first-order one. However, these numerical
+methods inherently result in certain approximation errors, which significantly
+degrade sample quality with extremely small NFE (e.g., around 5). In contrast,
+based on the geometric observation that each sampling trajectory almost lies in
+a two-dimensional subspace embedded in the ambient space, we propose the
+Approximate MEan-Direction Solver (AMED-Solver) that eliminates truncation
+errors by directly learning the mean direction for fast diffusion sampling.
+Besides, our method can be easily used as a plugin to further improve existing
+ODE-based samplers. Extensive experiments on image synthesis with resolutions
+ranging from 32 to 512 demonstrate the effectiveness of our method. With only 5
+NFE, we achieve 6.61 FID on CIFAR-10, 10.74 FID on ImageNet 64$\times$64, and
+13.20 FID on LSUN Bedroom. Our code is available at
+https://github.com/zju-pi/diff-sampler.
+
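+ For context, the first-order baseline that such solvers improve on can be
+written in a few lines; this sketch uses the EDM parameterization of the
+probability-flow ODE, and the denoiser interface and noise schedule are
+assumptions rather than the AMED-Solver implementation:
+
+import torch
+
+@torch.no_grad()
+def euler_ode_sampler(denoiser, x, sigmas):
+    # Euler solver for dx/dsigma = (x - D(x, sigma)) / sigma.
+    # AMED-Solver instead learns the mean direction of each step, which is
+    # why it stays accurate with around 5 function evaluations.
+    # sigmas: decreasing noise levels, e.g. [80.0, ..., 0.002, 0.0].
+    for sigma, sigma_next in zip(sigmas[:-1], sigmas[1:]):
+        d = (x - denoiser(x, sigma)) / sigma    # ODE direction at this noise level
+        x = x + (sigma_next - sigma) * d        # step toward lower noise
+    return x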
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Cobra: Extending Mamba to Multi-Modal Large Language Model for Efficient + Inference + + +
+ In recent years, the application of multimodal large language models (MLLM) +in various fields has achieved remarkable success. However, as the foundation +model for many downstream tasks, current MLLMs are composed of the well-known +Transformer network, which has a less efficient quadratic computation +complexity. To improve the efficiency of such basic models, we propose Cobra, a +linear computational complexity MLLM. Specifically, Cobra integrates the +efficient Mamba language model into the visual modality. Moreover, we explore +and study various modal fusion schemes to create an effective multi-modal +Mamba. Extensive experiments demonstrate that (1) Cobra achieves extremely +competitive performance with current computationally efficient state-of-the-art +methods, e.g., LLaVA-Phi, TinyLLaVA, and MobileVLM v2, and has faster speed due +to Cobra's linear sequential modeling. (2) Interestingly, the results of +closed-set challenging prediction benchmarks show that Cobra performs well in +overcoming visual illusions and spatial relationship judgments. (3) Notably, +Cobra even achieves comparable performance to LLaVA with about 43% of the +number of parameters. We will make all codes of Cobra open-source and hope that +the proposed method can facilitate future research on complexity problems in +MLLM. Our project page is available at: https://sites.google.com/view/cobravlm. + +
+
+
+
+
+ + ♻ ☆ Faster Neighborhood Attention: Reducing the O(n^2) Cost of Self + Attention at the Threadblock Level + + +
+ Neighborhood attention reduces the cost of self attention by restricting each
+token's attention span to its nearest neighbors. This restriction,
+parameterized by a window size and dilation factor, draws a spectrum of
+possible attention patterns between linear projection and self attention.
+Neighborhood attention, and more generally sliding window attention patterns,
+have long been bounded by infrastructure, particularly in higher-rank spaces
+(2-D and 3-D), calling for the development of custom kernels, which have been
+limited in either functionality or performance, if not both. In this work, we
+first show that neighborhood attention can be represented as a batched GEMM
+problem, similar to standard attention, and implement it for 1-D and 2-D
+neighborhood attention. These kernels on average provide 895% and 272%
+improvement in full precision latency compared to existing naive kernels for
+1-D and 2-D neighborhood attention respectively. We find certain inherent
+inefficiencies in all unfused neighborhood attention kernels that bound their
+performance and lower-precision scalability. We also develop fused
+neighborhood attention, an adaptation of fused dot-product attention kernels
+that allows fine-grained control over attention across different spatial axes.
+Known for reducing the quadratic time complexity of self attention to a linear
+complexity, neighborhood attention can now enjoy a reduced and constant memory
+footprint, and record-breaking half precision latency. We observe that our
+fused kernels successfully circumvent some of the unavoidable inefficiencies in
+unfused implementations. While our unfused GEMM-based kernels only improve half
+precision performance compared to naive kernels by an average of 496% and 113%
+in 1-D and 2-D problems respectively, our fused kernels improve naive kernels
+by an average of 1607% and 581% in 1-D and 2-D problems respectively.
+
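+ For readers unfamiliar with the operation itself, a naive 1-D reference
+implementation of neighborhood attention is short; this captures only the
+semantics (window clamping at boundaries), not the batched-GEMM or fused
+kernels the paper contributes:
+
+import torch
+import torch.nn.functional as F
+
+def neighborhood_attention_1d(q, k, v, window=7):
+    # q, k, v: (B, T, D); each token attends to its `window` nearest tokens.
+    B, T, D = q.shape
+    half = window // 2
+    out = torch.empty_like(q)
+    for t in range(T):
+        # Clamp the window at the sequence boundaries so every token still
+        # attends to `window` neighbors when T >= window.
+        start = min(max(t - half, 0), max(T - window, 0))
+        end = min(start + window, T)
+        attn = (q[:, t : t + 1] @ k[:, start:end].transpose(1, 2)) / D ** 0.5
+        out[:, t] = (F.softmax(attn, dim=-1) @ v[:, start:end]).squeeze(1)
+    return out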
+
+ comment: Project page: https://github.com/SHI-Labs/NATTEN +
+
+
+
+
+ + ♻ ☆ Semantics, Distortion, and Style Matter: Towards Source-free UDA for + Panoramic Segmentation CVPR 2024 + + +
+ This paper addresses an interesting yet challenging problem -- source-free +unsupervised domain adaptation (SFUDA) for pinhole-to-panoramic semantic +segmentation -- given only a pinhole image-trained model (i.e., source) and +unlabeled panoramic images (i.e., target). Tackling this problem is nontrivial +due to the semantic mismatches, style discrepancies, and inevitable distortion +of panoramic images. To this end, we propose a novel method that utilizes +Tangent Projection (TP) as it has less distortion and meanwhile slits the +equirectangular projection (ERP) with a fixed FoV to mimic the pinhole images. +Both projections are shown effective in extracting knowledge from the source +model. However, the distinct projection discrepancies between source and target +domains impede the direct knowledge transfer; thus, we propose a panoramic +prototype adaptation module (PPAM) to integrate panoramic prototypes from the +extracted knowledge for adaptation. We then impose the loss constraints on both +predictions and prototypes and propose a cross-dual attention module (CDAM) at +the feature level to better align the spatial and channel characteristics +across the domains and projections. Both knowledge extraction and transfer +processes are synchronously updated to reach the best performance. Extensive +experiments on the synthetic and real-world benchmarks, including outdoor and +indoor scenarios, demonstrate that our method achieves significantly better +performance than prior SFUDA methods for pinhole-to-panoramic adaptation. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Win-Win: Training High-Resolution Vision Transformers from Two Windows ICLR 2024 + + +
+ Transformers have become the standard in state-of-the-art vision +architectures, achieving impressive performance on both image-level and dense +pixelwise tasks. However, training vision transformers for high-resolution +pixelwise tasks has a prohibitive cost. Typical solutions boil down to +hierarchical architectures, fast and approximate attention, or training on +low-resolution crops. This latter solution does not constrain architectural +choices, but it leads to a clear performance drop when testing at resolutions +significantly higher than that used for training, thus requiring ad-hoc and +slow post-processing schemes. In this paper, we propose a novel strategy for +efficient training and inference of high-resolution vision transformers. The +key principle is to mask out most of the high-resolution inputs during +training, keeping only N random windows. This allows the model to learn local +interactions between tokens inside each window, and global interactions between +tokens from different windows. As a result, the model can directly process the +high-resolution input at test time without any special trick. We show that this +strategy is effective when using relative positional embedding such as rotary +embeddings. It is 4 times faster to train than a full-resolution network, and +it is straightforward to use at test time compared to existing approaches. We +apply this strategy to three dense prediction tasks with high-resolution data. +First, we show on the task of semantic segmentation that a simple setting with +2 windows performs best, hence the name of our method: Win-Win. Second, we +confirm this result on the task of monocular depth prediction. Third, we +further extend it to the binocular task of optical flow, reaching +state-of-the-art performance on the Spring benchmark that contains Full-HD +images with an order of magnitude faster inference than the best competitor. + +
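+ The core training trick above can be sketched in a few lines: keep only the
+tokens inside N random windows of the high-resolution token grid (N = 2 works
+best per the paper). The window sizes, names, and sampling details below are
+assumptions for illustration:
+
+import torch
+
+def sample_window_tokens(tokens, grid_hw, num_windows=2, win_hw=(24, 24)):
+    # tokens: (B, H*W, D) patch tokens of the full-resolution image.
+    B, N, D = tokens.shape
+    H, W = grid_hw
+    wh, ww = win_hw
+    keep = torch.zeros(H, W, dtype=torch.bool)
+    for _ in range(num_windows):
+        y = torch.randint(0, H - wh + 1, (1,)).item()
+        x = torch.randint(0, W - ww + 1, (1,)).item()
+        keep[y : y + wh, x : x + ww] = True
+    idx = keep.flatten().nonzero(as_tuple=True)[0]
+    # The transformer then runs on tokens[:, idx] with relative (e.g. rotary)
+    # positional embeddings, so full-resolution inference needs no changes.
+    return tokens[:, idx], idx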
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Inducing High Energy-Latency of Large Vision-Language Models with + Verbose Images ICLR 2024 + + +
+ Large vision-language models (VLMs) such as GPT-4 have achieved exceptional +performance across various multi-modal tasks. However, the deployment of VLMs +necessitates substantial energy consumption and computational resources. Once +attackers maliciously induce high energy consumption and latency time +(energy-latency cost) during inference of VLMs, it will exhaust computational +resources. In this paper, we explore this attack surface about availability of +VLMs and aim to induce high energy-latency cost during inference of VLMs. We +find that high energy-latency cost during inference of VLMs can be manipulated +by maximizing the length of generated sequences. To this end, we propose +verbose images, with the goal of crafting an imperceptible perturbation to +induce VLMs to generate long sentences during inference. Concretely, we design +three loss objectives. First, a loss is proposed to delay the occurrence of +end-of-sequence (EOS) token, where EOS token is a signal for VLMs to stop +generating further tokens. Moreover, an uncertainty loss and a token diversity +loss are proposed to increase the uncertainty over each generated token and the +diversity among all tokens of the whole generated sequence, respectively, which +can break output dependency at token-level and sequence-level. Furthermore, a +temporal weight adjustment algorithm is proposed, which can effectively balance +these losses. Extensive experiments demonstrate that our verbose images can +increase the length of generated sequences by 7.87 times and 8.56 times +compared to original images on MS-COCO and ImageNet datasets, which presents +potential challenges for various applications. Our code is available at +https://github.com/KuofengGao/Verbose_Images. + +
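+ A hedged sketch of the first of the three objectives, the EOS-delay loss:
+push down the end-of-sequence probability at every decoding step so the VLM
+keeps generating. The interface below is an assumption; the uncertainty and
+token-diversity losses and the temporal weighting are omitted:
+
+import torch
+import torch.nn.functional as F
+
+def delayed_eos_loss(step_logits: torch.Tensor, eos_token_id: int) -> torch.Tensor:
+    # step_logits: (T, V) logits collected while decoding with the current
+    # adversarial image perturbation applied.
+    probs = F.softmax(step_logits, dim=-1)
+    return probs[:, eos_token_id].mean()   # minimized w.r.t. the perturbation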
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Residual Denoising Diffusion Models CVPR2024 + + +
+ We propose residual denoising diffusion models (RDDM), a novel dual diffusion +process that decouples the traditional single denoising diffusion process into +residual diffusion and noise diffusion. This dual diffusion framework expands +the denoising-based diffusion models, initially uninterpretable for image +restoration, into a unified and interpretable model for both image generation +and restoration by introducing residuals. Specifically, our residual diffusion +represents directional diffusion from the target image to the degraded input +image and explicitly guides the reverse generation process for image +restoration, while noise diffusion represents random perturbations in the +diffusion process. The residual prioritizes certainty, while the noise +emphasizes diversity, enabling RDDM to effectively unify tasks with varying +certainty or diversity requirements, such as image generation and restoration. +We demonstrate that our sampling process is consistent with that of DDPM and +DDIM through coefficient transformation, and propose a partially +path-independent generation process to better understand the reverse process. +Notably, our RDDM enables a generic UNet, trained with only an L1 loss and a +batch size of 1, to compete with state-of-the-art image restoration methods. We +provide code and pre-trained models to encourage further exploration, +application, and development of our innovative framework +(https://github.com/nachifur/RDDM). + +
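+ A minimal sketch of the dual forward process described above, with the
+schedule names and interfaces assumed for illustration rather than taken from
+the released code:
+
+import torch
+
+def rddm_forward(x0, x_degraded, t, alpha_bar, beta_bar):
+    # The state drifts toward the degraded input via the residual (certainty)
+    # while Gaussian noise (diversity) is injected on top.
+    residual = x_degraded - x0              # directional residual diffusion
+    noise = torch.randn_like(x0)            # random noise diffusion
+    x_t = x0 + alpha_bar[t] * residual + beta_bar[t] * noise
+    return x_t, residual, noise             # targets the network learns to predict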
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ VisionGPT-3D: A Generalized Multimodal Agent for Enhanced 3D Vision + Understanding + + +
+ The evolution from text to visual components facilitates people's daily
+lives, for example by generating images and videos from text and identifying
+the desired elements within images. Earlier computer vision models with
+multimodal abilities focused on image detection and classification of
+well-defined objects. Large language models (LLMs) introduce the transformation
+from natural language to visual objects, presenting a visual layout for textual
+contexts. OpenAI GPT-4 has emerged as the pinnacle in LLMs, while the computer
+vision (CV) domain boasts a plethora of state-of-the-art (SOTA) models and
+algorithms to convert 2D images to their 3D representations. However, a
+mismatch between the algorithms and the problem can lead to undesired results.
+In response to this challenge, we propose a unified VisionGPT-3D framework to
+consolidate the state-of-the-art vision models, thereby facilitating the
+development of vision-oriented AI. VisionGPT-3D provides a versatile multimodal
+framework building upon the strengths of multimodal foundation models. It
+seamlessly integrates various SOTA vision models, automates the selection of
+SOTA vision models, identifies the suitable 3D mesh creation algorithms
+corresponding to 2D depth map analysis, and generates optimal results based on
+diverse multimodal inputs such as text prompts.
+ Keywords: VisionGPT-3D, 3D vision understanding, Multimodal agent
+
+
+ comment: 12 pages, 7 figures, pending conference +
+
+
+
+
+ + ♻ ☆ Toulouse Hyperspectral Data Set: a benchmark data set to assess + semi-supervised spectral representation learning and pixel-wise + classification techniques + + +
+ Airborne hyperspectral images can be used to map the land cover in large +urban areas, thanks to their very high spatial and spectral resolutions on a +wide spectral domain. While the spectral dimension of hyperspectral images is +highly informative of the chemical composition of the land surface, the use of +state-of-the-art machine learning algorithms to map the land cover has been +dramatically limited by the availability of training data. To cope with the +scarcity of annotations, semi-supervised and self-supervised techniques have +lately raised a lot of interest in the community. Yet, the publicly available +hyperspectral data sets commonly used to benchmark machine learning models are +not totally suited to evaluate their generalization performances due to one or +several of the following properties: a limited geographical coverage (which +does not reflect the spectral diversity in metropolitan areas), a small number +of land cover classes and a lack of appropriate standard train / test splits +for semi-supervised and self-supervised learning. Therefore, we release in this +paper the Toulouse Hyperspectral Data Set that stands out from other data sets +in the above-mentioned respects in order to meet key issues in spectral +representation learning and classification over large-scale hyperspectral +images with very few labeled pixels. Besides, we discuss and experiment +self-supervised techniques for spectral representation learning, including the +Masked Autoencoder, and establish a baseline for pixel-wise classification +achieving 85% overall accuracy and 77% F1 score. The Toulouse Hyperspectral +Data Set and our code are publicly available at +https://www.toulouse-hyperspectral-data-set.com and +https://www.github.com/Romain3Ch216/tlse-experiments, respectively. + +
+
+ comment: 17 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ ShapeFormer: Shape Prior Visible-to-Amodal Transformer-based Amodal + Instance Segmentation IJCNN 2024 + + +
+ Amodal Instance Segmentation (AIS) presents a challenging task as it involves
+predicting both visible and occluded parts of objects within images. Existing
+AIS methods rely on a bidirectional approach, encompassing both the transition
+from amodal features to visible features (amodal-to-visible) and from visible
+features to amodal features (visible-to-amodal). Our observation shows that
+utilizing amodal features through the amodal-to-visible transition can confuse
+the visible features, due to the extra information from occluded/hidden
+segments that is not present in the visible display. Consequently, this
+compromises the quality of the visible features during the subsequent
+visible-to-amodal transition. To tackle this issue, we introduce ShapeFormer, a
+decoupled Transformer-based model with a visible-to-amodal transition. It
+facilitates the explicit relationship between output segmentations and avoids
+the need for amodal-to-visible transitions. ShapeFormer comprises three key
+modules: (i) a Visible-Occluding Mask Head for predicting visible segmentation
+with occlusion awareness, (ii) a Shape-Prior Amodal Mask Head for predicting
+amodal and occluded masks, and (iii) a Category-Specific Shape Prior Retriever
+that provides shape prior knowledge. Comprehensive experiments and extensive
+ablation studies across various AIS benchmarks demonstrate the effectiveness of
+our ShapeFormer. The code is available at:
+https://github.com/UARK-AICV/ShapeFormer
+
+
+ comment: Accepted to IJCNN 2024 +
+
+
+
+
+ + ♻ ☆ S-DyRF: Reference-Based Stylized Radiance Fields for Dynamic Scenes CVPR 2024 + + +
+ Current 3D stylization methods often assume static scenes, which violates the +dynamic nature of our real world. To address this limitation, we present +S-DyRF, a reference-based spatio-temporal stylization method for dynamic neural +radiance fields. However, stylizing dynamic 3D scenes is inherently challenging +due to the limited availability of stylized reference images along the temporal +axis. Our key insight lies in introducing additional temporal cues besides the +provided reference. To this end, we generate temporal pseudo-references from +the given stylized reference. These pseudo-references facilitate the +propagation of style information from the reference to the entire dynamic 3D +scene. For coarse style transfer, we enforce novel views and times to mimic the +style details present in pseudo-references at the feature level. To preserve +high-frequency details, we create a collection of stylized temporal pseudo-rays +from temporal pseudo-references. These pseudo-rays serve as detailed and +explicit stylization guidance for achieving fine style transfer. Experiments on +both synthetic and real-world datasets demonstrate that our method yields +plausible stylized results of space-time view synthesis on dynamic 3D scenes. + +
+
+ comment: Accepted by CVPR 2024. Project page: + https://xingyi-li.github.io/s-dyrf/ +
+
+
+
+
+ + ♻ ☆ Beyond Inserting: Learning Identity Embedding for Semantic-Fidelity + Personalized Diffusion Generation + + +
+ Advanced diffusion-based Text-to-Image (T2I) models, such as the Stable +Diffusion Model, have made significant progress in generating diverse and +high-quality images using text prompts alone. However, when non-famous users +require personalized image generation for their identities (IDs), the T2I +models fail to accurately generate their ID-related images. The main problem is +that pre-trained T2I models do not learn the mapping between the new ID prompts +and their corresponding visual content. The previous methods either failed to +accurately fit the face region or lost the interactive generative ability with +other existing concepts in T2I models. In other words, they are unable to +generate T2I-aligned and semantic-fidelity images for the given prompts with +other concepts such as scenes (``Eiffel Tower''), actions (``holding a +basketball''), and facial attributes (``eyes closed''). In this paper, we focus +on inserting accurate and interactive ID embedding into the Stable Diffusion +Model for semantic-fidelity personalized generation. We address this challenge +from two perspectives: face-wise region fitting and semantic-fidelity token +optimization. Specifically, we first visualize the attention overfit problem +and propose a face-wise attention loss to fit the face region instead of +entangling ID-unrelated information, such as face layout and background. This +key trick significantly enhances the ID accuracy and interactive generative +ability with other existing concepts. Then, we optimize one ID representation +as multiple per-stage tokens where each token contains two disentangled +features. This expansion of the textual conditioning space improves +semantic-fidelity control. Extensive experiments validate that our results +exhibit superior ID accuracy, text-based manipulation ability, and +generalization compared to previous methods. + +
+
+ comment: 14 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Multi-conditioned Graph Diffusion for Neural Architecture Search + + +
+ Neural architecture search automates the design of neural network +architectures usually by exploring a large and thus complex architecture search +space. To advance the architecture search, we present a graph diffusion-based +NAS approach that uses discrete conditional graph diffusion processes to +generate high-performing neural network architectures. We then propose a +multi-conditioned classifier-free guidance approach applied to graph diffusion +networks to jointly impose constraints such as high accuracy and low hardware +latency. Unlike the related work, our method is completely differentiable and +requires only a single model training. In our evaluations, we show promising +results on six standard benchmarks, yielding novel and unique architectures at +a fast speed, i.e. less than 0.2 seconds per architecture. Furthermore, we +demonstrate the generalisability and efficiency of our method through +experiments on ImageNet dataset. + +
+
+ comment: Accepted at Transactions on Machine Learning Research (TMLR) +
+
+
+
+
+ + ♻ ☆ PIA: Your Personalized Image Animator via Plug-and-Play Modules in + Text-to-Image Models + + +
+ Recent advancements in personalized text-to-image (T2I) models have
+revolutionized content creation, empowering non-experts to generate stunning
+images with unique styles. While promising, adding realistic motions into these
+personalized images by text poses significant challenges in preserving distinct
+styles, high-fidelity details, and achieving motion controllability by text. In
+this paper, we present PIA, a Personalized Image Animator that excels in
+aligning with condition images, achieving motion controllability by text, and
+the compatibility with various personalized T2I models without specific tuning.
+To achieve these goals, PIA builds upon a base T2I model with well-trained
+temporal alignment layers, allowing for the seamless transformation of any
+personalized T2I model into an image animation model. A key component of PIA is
+the introduction of the condition module, which utilizes the condition frame
+and inter-frame affinity as input to transfer appearance information guided by
+the affinity hint for individual frame synthesis in the latent space. This
+design mitigates the challenges of appearance-related image alignment and
+allows for a stronger focus on aligning with motion-related guidance.
+
+
+ comment: Project page: https://pi-animator.github.io/ +
+
+
+
+
+ + ♻ ☆ FunQA: Towards Surprising Video Comprehension + + +
+ Surprising videos, such as funny clips, creative performances, or visual +illusions, attract significant attention. Enjoyment of these videos is not +simply a response to visual stimuli; rather, it hinges on the human capacity to +understand (and appreciate) commonsense violations depicted in these videos. We +introduce FunQA, a challenging video question-answering (QA) dataset +specifically designed to evaluate and enhance the depth of video reasoning +based on counter-intuitive and fun videos. Unlike most video QA benchmarks +which focus on less surprising contexts, e.g., cooking or instructional videos, +FunQA covers three previously unexplored types of surprising videos: 1) +HumorQA, 2) CreativeQA, and 3) MagicQA. For each subset, we establish rigorous +QA tasks designed to assess the model's capability in counter-intuitive +timestamp localization, detailed video description, and reasoning around +counter-intuitiveness. We also pose higher-level tasks, such as attributing a +fitting and vivid title to the video and scoring the video creativity. In +total, the FunQA benchmark consists of 312K free-text QA pairs derived from +4.3K video clips, spanning a total of 24 video hours. Moreover, we propose +FunMentor, an agent designed for Vision-Language Models (VLMs) that uses +multi-turn dialogues to enhance models' understanding of counter-intuitiveness. +Extensive experiments with existing VLMs demonstrate the effectiveness of +FunMentor and reveal significant performance gaps for the FunQA videos across +spatial-temporal reasoning, visual-centered reasoning, and free-text +generation. + +
+
+ comment: Project Page: https://funqa-benchmark.github.io/ Codebase: + https://github.com/Jingkang50/FunQA +
+
+
+
+
+ + ♻ ☆ You Only Need Two Detectors to Achieve Multi-Modal 3D Multi-Object + Tracking + + +
+ In the classical tracking-by-detection (TBD) paradigm, detection and tracking +are separately and sequentially conducted, and data association must be +properly performed to achieve satisfactory tracking performance. In this paper, +a new end-to-end multi-object tracking framework is proposed, which integrates +object detection and multi-object tracking into a single model. The proposed +tracking framework eliminates the complex data association process in the +classical TBD paradigm, and requires no additional training. Secondly, the +regression confidence of historical trajectories is investigated, and the +possible states of a trajectory (weak object or strong object) in the current +frame are predicted. Then, a confidence fusion module is designed to guide +non-maximum suppression for trajectories and detections to achieve ordered and +robust tracking. Thirdly, by integrating historical trajectory features, the +regression performance of the detector is enhanced, which better reflects the +occlusion and disappearance patterns of objects in real world. Lastly, +extensive experiments are conducted on the commonly used KITTI and Waymo +datasets. The results show that the proposed framework can achieve robust +tracking by using only a 2D detector and a 3D detector, and it is proven more +accurate than many of the state-of-the-art TBD-based multi-modal tracking +methods. The source codes of the proposed method are available at +https://github.com/wangxiyang2022/YONTD-MOT. + +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Mora: Enabling Generalist Video Generation via A Multi-Agent Framework + + +
+ Sora is the first large-scale generalist video generation model that garnered
+significant attention across society. Since its launch by OpenAI in February
+2024, no other video generation models have paralleled Sora's performance or
+its capacity to support a broad spectrum of video generation tasks.
+Additionally, there are only a few fully published video generation models,
+with the majority being closed-source. To address this gap, this paper proposes
+a new multi-agent framework, Mora, which incorporates several advanced visual
+AI agents to replicate the generalist video generation demonstrated by Sora. In
+particular, Mora can utilize multiple visual agents and successfully mimic
+Sora's video generation capabilities in various tasks, such as (1)
+text-to-video generation, (2) text-conditional image-to-video generation, (3)
+extending generated videos, (4) video-to-video editing, (5) connecting videos,
+and (6) simulating digital worlds. Our extensive experimental results show that
+Mora achieves performance that is proximate to that of Sora in various tasks.
+However, there exists an obvious performance gap between our work and Sora when
+assessed holistically. In summary, we hope this project can guide the future
+trajectory of video generation through collaborative AI agents.
+
+
+
+
+
+ + ♻ ☆ MC-NeRF: Multi-Camera Neural Radiance Fields for Multi-Camera Image + Acquisition Systems + + +
+ Neural Radiance Fields (NeRF) use multi-view images for 3D scene
+representation, demonstrating remarkable performance. As one of the primary
+sources of multi-view images, multi-camera systems encounter challenges such as
+varying intrinsic parameters and frequent pose changes. Most previous
+NeRF-based methods assume a unique camera and rarely consider multi-camera
+scenarios. Besides, some NeRF methods that can optimize intrinsic and extrinsic
+parameters still remain susceptible to suboptimal solutions when these
+parameters are poorly initialized. In this paper, we propose MC-NeRF, a method
+that enables joint optimization of both intrinsic and extrinsic parameters
+alongside NeRF. The method also supports each image corresponding to
+independent camera parameters. First, we tackle the coupling issue and the
+degenerate case that arise from the joint optimization of intrinsic and
+extrinsic parameters. Second, based on the proposed solutions, we introduce an
+efficient calibration image acquisition scheme for multi-camera systems,
+including the design of the calibration object. Finally, we present an
+end-to-end network with a training sequence that enables the estimation of
+intrinsic and extrinsic parameters, along with the rendering network.
+Furthermore, recognizing that most existing datasets are designed for a unique
+camera, we construct a real multi-camera image acquisition system and create a
+corresponding new dataset, which includes both simulated data and real-world
+captured images. Experiments confirm the effectiveness of our method when each
+image corresponds to different camera parameters. Specifically, we use multiple
+cameras, each with different intrinsic and extrinsic parameters, in a
+real-world system to achieve 3D scene representation without providing initial
+poses.
+
+
+ comment: This manuscript is currently under review +
+
+
+
+
+ + ♻ ☆ ZePT: Zero-Shot Pan-Tumor Segmentation via Query-Disentangling and + Self-Prompting CVPR 2024 + + +
+ The long-tailed distribution problem in medical image analysis reflects a +high prevalence of common conditions and a low prevalence of rare ones, which +poses a significant challenge in developing a unified model capable of +identifying rare or novel tumor categories not encountered during training. In +this paper, we propose a new zero-shot pan-tumor segmentation framework (ZePT) +based on query-disentangling and self-prompting to segment unseen tumor +categories beyond the training set. ZePT disentangles the object queries into +two subsets and trains them in two stages. Initially, it learns a set of +fundamental queries for organ segmentation through an object-aware feature +grouping strategy, which gathers organ-level visual features. Subsequently, it +refines the other set of advanced queries that focus on the auto-generated +visual prompts for unseen tumor segmentation. Moreover, we introduce +query-knowledge alignment at the feature level to enhance each query's +discriminative representation and generalizability. Extensive experiments on +various tumor segmentation tasks demonstrate the performance superiority of +ZePT, which surpasses the previous counterparts and evidence the promising +ability for zero-shot tumor segmentation in real-world settings. + +
+
+ comment: This paper has been accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ FSC: Few-point Shape Completion CVPR 2024 + + +
+ While previous studies have demonstrated successful 3D object shape +completion with a sufficient number of points, they often fail in scenarios +when a few points, e.g. tens of points, are observed. Surprisingly, via entropy +analysis, we find that even a few points, e.g. 64 points, could retain +substantial information to help recover the 3D shape of the object. To address +the challenge of shape completion with very sparse point clouds, we then +propose Few-point Shape Completion (FSC) model, which contains a novel +dual-branch feature extractor for handling extremely sparse inputs, coupled +with an extensive branch for maximal point utilization with a saliency branch +for dynamic importance assignment. This model is further bolstered by a +two-stage revision network that refines both the extracted features and the +decoder output, enhancing the detail and authenticity of the completed point +cloud. Our experiments demonstrate the feasibility of recovering 3D shapes from +a few points. The proposed Few-point Shape Completion (FSC) model outperforms +previous methods on both few-point inputs and many-point inputs, and shows good +generalizability to different object categories. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ RGBD GS-ICP SLAM + + +
+ Simultaneous Localization and Mapping (SLAM) with dense representation plays +a key role in robotics, Virtual Reality (VR), and Augmented Reality (AR) +applications. Recent advancements in dense representation SLAM have highlighted +the potential of leveraging neural scene representation and 3D Gaussian +representation for high-fidelity spatial representation. In this paper, we +propose a novel dense representation SLAM approach with a fusion of Generalized +Iterative Closest Point (G-ICP) and 3D Gaussian Splatting (3DGS). In contrast +to existing methods, we utilize a single Gaussian map for both tracking and +mapping, resulting in mutual benefits. Through the exchange of covariances +between tracking and mapping processes with scale alignment techniques, we +minimize redundant computations and achieve an efficient system. Additionally, +we enhance tracking accuracy and mapping quality through our keyframe selection +methods. Experimental results demonstrate the effectiveness of our approach, +showing an incredibly fast speed up to 107 FPS (for the entire system) and +superior quality of the reconstructed map. + +
+
+
+
+
+ + ♻ ☆ CPA-Enhancer: Chain-of-Thought Prompted Adaptive Enhancer for Object + Detection under Unknown Degradations + + +
+ Object detection methods under known single degradations have been
+extensively investigated. However, existing approaches require prior knowledge
+of the degradation type and train a separate model for each, limiting their
+practical applications in unpredictable environments. To address this
+challenge, we propose a chain-of-thought (CoT) prompted adaptive enhancer,
+CPA-Enhancer, for object detection under unknown degradations. Specifically,
+CPA-Enhancer progressively adapts its enhancement strategy under the
+step-by-step guidance of CoT prompts that encode degradation-related
+information. To the best of our knowledge, it is the first work that exploits
+CoT prompting for object detection tasks. Overall, CPA-Enhancer is a
+plug-and-play enhancement model that can be integrated into any generic
+detector to achieve substantial gains on degraded images, without prior
+knowledge of the degradation type. Experimental results demonstrate that
+CPA-Enhancer not only sets the new state of the art for object detection but
+also boosts the performance of other downstream vision tasks under unknown
+degradations.
+
+
+
+
+
+ + ♻ ☆ S2DM: Sector-Shaped Diffusion Models for Video Generation + + +
+ Diffusion models have achieved great success in image generation. However,
+when leveraging this idea for video generation, we face significant challenges
+in maintaining the consistency and continuity across video frames. This is
+mainly caused by the lack of an effective framework to align frames of videos
+with desired temporal features while preserving consistent semantic and
+stochastic features. In this work, we propose a novel Sector-Shaped Diffusion
+Model (S2DM) whose sector-shaped diffusion region is formed by a set of
+ray-shaped reverse diffusion processes starting at the same noise point. S2DM
+can generate a group of intrinsically related data sharing the same semantic
+and stochastic features while varying on temporal features with appropriate
+guided conditions. We apply S2DM to video generation tasks, and explore the use
+of optical flow as temporal conditions. Our experimental results show that S2DM
+outperforms many existing methods in the task of video generation without any
+temporal-feature modelling modules. For text-to-video generation tasks where
+temporal conditions are not explicitly given, we propose a two-stage generation
+strategy which can decouple the generation of temporal features from
+semantic-content features. We show that, without additional training, our model
+integrated with another temporal conditions generative model can still achieve
+comparable performance with existing works. Our results can be viewed at
+https://s2dm.github.io/S2DM/.
+
+
+ comment: 17 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ AI-Dentify: Deep learning for proximal caries detection on bitewing + x-ray -- HUNT4 Oral Health Study + + +
+ Background: Dental caries diagnosis requires the manual inspection of
+diagnostic bitewing images of the patient, followed by a visual inspection and
+probing of the identified dental pieces with potential lesions. Yet the use of
+artificial intelligence, and in particular deep learning, has the potential to
+aid in the diagnosis by providing a quick and informative analysis of the
+bitewing images.
+ Methods: A dataset of 13,887 bitewings from the HUNT4 Oral Health Study was
+annotated individually by six different experts, and used to train three
+different object detection deep-learning architectures: RetinaNet (ResNet50),
+YOLOv5 (M size), and EfficientDet (D0 and D1 sizes). A consensus dataset of 197
+images, annotated jointly by the same six dentists, was used for evaluation. A
+five-fold cross validation scheme was used to evaluate the performance of the
+AI models.
+ Results: The trained models show an increase in average precision and
+F1-score, and a decrease in false negative rate, with respect to the dental
+clinicians. When compared against the dental clinicians, the YOLOv5 model shows
+the largest improvement, reporting 0.647 mean average precision, 0.548 mean
+F1-score, and 0.149 mean false negative rate, whereas the best annotators on
+each of these metrics reported 0.299, 0.495, and 0.164, respectively.
+ Conclusion: Deep-learning models have shown the potential to assist dental
+professionals in the diagnosis of caries. Yet, the task remains challenging due
+to the artifacts inherent to bitewing images.
+
+
+ comment: 24 pages, 5 figure, 7 tables +
+
+
+
+
+ + ♻ ☆ Event-based Simultaneous Localization and Mapping: A Comprehensive + Survey + + +
+ In recent decades, visual simultaneous localization and mapping (vSLAM) has +gained significant interest in both academia and industry. It estimates camera +motion and reconstructs the environment concurrently using visual sensors on a +moving robot. However, conventional cameras are limited by hardware, including +motion blur and low dynamic range, which can negatively impact performance in +challenging scenarios like high-speed motion and high dynamic range +illumination. Recent studies have demonstrated that event cameras, a new type +of bio-inspired visual sensor, offer advantages such as high temporal +resolution, dynamic range, low power consumption, and low latency. This paper +presents a timely and comprehensive review of event-based vSLAM algorithms that +exploit the benefits of asynchronous and irregular event streams for +localization and mapping tasks. The review covers the working principle of +event cameras and various event representations for preprocessing event data. +It also categorizes event-based vSLAM methods into four main categories: +feature-based, direct, motion-compensation, and deep learning methods, with +detailed discussions and practical guidance for each approach. Furthermore, the +paper evaluates the state-of-the-art methods on various benchmarks, +highlighting current challenges and future opportunities in this emerging +research area. A public repository will be maintained to keep track of the +rapid developments in this field at +{\url{https://github.com/kun150kun/ESLAM-survey}}. + +
+
+
+
+
+ + ♻ ☆ SyncTweedies: A General Generative Framework Based on Synchronized + Diffusions + + +
+ We introduce a general framework for generating diverse visual content,
+including ambiguous images, panorama images, mesh textures, and Gaussian splat
+textures, by synchronizing multiple diffusion processes. We present an
+exhaustive investigation into all possible scenarios for synchronizing multiple
+diffusion processes through a canonical space and analyze their characteristics
+across applications. In doing so, we reveal a previously unexplored case:
+averaging the outputs of Tweedie's formula while conducting denoising in
+multiple instance spaces. This case also provides the best quality with the
+widest applicability to downstream tasks. We name this case SyncTweedies. In
+our experiments generating the visual content mentioned above, we demonstrate
+the superior quality of generation by SyncTweedies compared to other
+synchronization methods, including optimization-based and
+iterative-update-based methods.
+
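+ The SyncTweedies case itself is compact enough to sketch: estimate the clean
+sample in every instance space with Tweedie's formula, average those estimates
+in the canonical space, and map the consensus back. The projection operators
+below are placeholders that depend on the application (e.g. mesh texturing):
+
+import torch
+
+def tweedie_x0(x_t, eps_pred, alpha_bar_t):
+    # DDPM-style posterior-mean estimate of the clean sample.
+    return (x_t - (1 - alpha_bar_t) ** 0.5 * eps_pred) / alpha_bar_t ** 0.5
+
+def synchronized_x0(instance_x0s, to_canonical, from_canonical):
+    # Average the per-instance Tweedie outputs in the shared canonical space,
+    # then project the consensus back into every instance space.
+    canon = torch.stack(
+        [to_canonical(i, x0) for i, x0 in enumerate(instance_x0s)]
+    ).mean(dim=0)
+    return [from_canonical(i, canon) for i in range(len(instance_x0s))]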
+
+ comment: Project page: https://synctweedies.github.io/ +
+
+
+
+
+ + ♻ ☆ Detection Is Tracking: Point Cloud Multi-Sweep Deep Learning Models + Revisited + + +
+ Conventional tracking paradigm takes in instantaneous measurements such as +range and bearing, and produces object tracks across time. In applications such +as autonomous driving, lidar measurements in the form of point clouds are +usually passed through a "virtual sensor" realized by a deep learning model, to +produce "measurements" such as bounding boxes, which are in turn ingested by a +tracking module to produce object tracks. Very often multiple lidar sweeps are +accumulated in a buffer to merge and become the input to the virtual sensor. We +argue in this paper that such an input already contains temporal information, +and therefore the virtual sensor output should also contain temporal +information, not just instantaneous values for the time corresponding to the +end of the buffer. In particular, we present the deep learning model called +MULti-Sweep PAired Detector (MULSPAD) that produces, for each detected object, +a pair of bounding boxes at both the end time and the beginning time of the +input buffer. This is achieved with fairly straightforward changes in commonly +used lidar detection models, and with only marginal extra processing, but the +resulting symmetry is satisfying. Such paired detections make it possible not +only to construct rudimentary trackers fairly easily, but also to construct +more sophisticated trackers that can exploit the extra information conveyed by +the pair and be robust to choices of motion models and object birth/death +models. We have conducted preliminary training and experimentation using Waymo +Open Dataset, which shows the efficacy of our proposed method. + +
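+ A rudimentary tracker of the kind mentioned above can be sketched by matching the begin-time box of each newly detected pair to the end-time box of an existing track; the IoU threshold, the box format, and the container choices are illustrative assumptions, not the MULSPAD implementation.
+
+ def iou(a, b):
+     # Axis-aligned boxes as (x1, y1, x2, y2).
+     x1, y1 = max(a[0], b[0]), max(a[1], b[1])
+     x2, y2 = min(a[2], b[2]), min(a[3], b[3])
+     inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
+     area = lambda r: (r[2] - r[0]) * (r[3] - r[1])
+     return inter / (area(a) + area(b) - inter + 1e-9)
+
+ def update_tracks(tracks, paired_detections, thr=0.3):
+     # tracks: list of box sequences; paired_detections: (begin_box, end_box)
+     # pairs produced by the detector for the current sweep buffer.
+     for begin_box, end_box in paired_detections:
+         best = max(tracks, key=lambda t: iou(t[-1], begin_box), default=None)
+         if best is not None and iou(best[-1], begin_box) >= thr:
+             best.append(end_box)                       # continue an existing track
+         else:
+             tracks.append([begin_box, end_box])        # birth of a new track
+     return tracks
+
+ tracks = []
+ tracks = update_tracks(tracks, [((0, 0, 2, 2), (1, 0, 3, 2))])
+ tracks = update_tracks(tracks, [((1, 0, 3, 2), (2, 0, 4, 2)),
+                                 ((10, 10, 12, 12), (11, 10, 13, 12))])
+ print(tracks)   # one continued track and one newly born track
+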
+
+ comment: My previous employer Motional is requiring a review and approval + process before I can publish this paper +
+
+
+
+
+ + ♻ ☆ Mixture of Cluster-conditional LoRA Experts for Vision-language + Instruction Tuning + + +
+ Instruction tuning of Large Vision-language Models (LVLMs) has revolutionized
+the development of versatile models with zero-shot generalization across a wide
+range of downstream vision-language tasks. However, the diversity of training
+tasks of different sources and formats would lead to inevitable task conflicts,
+where different tasks conflict for the same set of model parameters, resulting
+in sub-optimal instruction-following abilities. To address that, we propose the
+Mixture of Cluster-conditional LoRA Experts (MoCLE), a novel Mixture of Experts
+(MoE) architecture designed to activate the task-customized model parameters
+based on the instruction clusters. A separate universal expert is further
+incorporated to improve generalization capabilities of MoCLE for novel
+instructions. Extensive experiments on 11 zero-shot tasks demonstrate the
+effectiveness of MoCLE.
+
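+ A minimal sketch of cluster-conditional expert routing in the spirit of the description above: instruction embeddings are assigned to clusters, each cluster activates its own LoRA-style low-rank expert, and a universal expert is always added; all shapes, the nearest-centroid gating rule, and the numbers are assumptions, not MoCLE's actual configuration.
+
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ d, r, n_clusters = 16, 4, 3
+
+ def assign_cluster(e, centroids):
+     # Offline clustering stand-in: route to the nearest instruction centroid.
+     return int(np.argmin(((centroids - e) ** 2).sum(axis=1)))
+
+ centroids = rng.standard_normal((n_clusters, d))
+
+ # One low-rank expert (B @ A) per cluster, plus an always-on universal expert.
+ experts = [(rng.standard_normal((d, r)) * 0.01, rng.standard_normal((r, d)) * 0.01)
+            for _ in range(n_clusters)]
+ universal = (rng.standard_normal((d, r)) * 0.01, rng.standard_normal((r, d)) * 0.01)
+ W = rng.standard_normal((d, d))          # frozen base weight
+
+ def forward(h, instruction_emb):
+     k = assign_cluster(instruction_emb, centroids)
+     Bk, Ak = experts[k]
+     Bu, Au = universal
+     # Base path + cluster-conditional expert + universal expert.
+     return h @ W + h @ Bk @ Ak + h @ Bu @ Au
+
+ h = rng.standard_normal((2, d))                     # token hidden states
+ print(forward(h, rng.standard_normal(d)).shape)     # (2, 16)
+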
+
+ comment: Project website: https://gyhdog99.github.io/projects/mocle/ +
+
+
+
+
+ + ♻ ☆ ToonAging: Face Re-Aging upon Artistic Portrait Style Transfer + + +
+ Face re-aging is a prominent field in computer vision and graphics, with +significant applications in photorealistic domains such as movies, advertising, +and live streaming. Recently, the need to apply face re-aging to +non-photorealistic images, like comics, illustrations, and animations, has +emerged as an extension in various entertainment sectors. However, the lack of +a network that can seamlessly edit the apparent age in NPR images has limited +these tasks to a naive, sequential approach. This often results in unpleasant +artifacts and a loss of facial attributes due to domain discrepancies. In this +paper, we introduce a novel one-stage method for face re-aging combined with +portrait style transfer, executed in a single generative step. We leverage +existing face re-aging and style transfer networks, both trained within the +same PR domain. Our method uniquely fuses distinct latent vectors, each +responsible for managing aging-related attributes and NPR appearance. By +adopting an exemplar-based approach, our method offers greater flexibility +compared to domain-level fine-tuning approaches, which typically require +separate training or fine-tuning for each domain. This effectively addresses +the limitation of requiring paired datasets for re-aging and domain-level, +data-driven approaches for stylization. Our experiments show that our model can +effortlessly generate re-aged images while simultaneously transferring the +style of examples, maintaining both natural appearance and controllability. + +
+
+ comment: 14 pages, 15 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Eyes Closed, Safety On: Protecting Multimodal LLMs via Image-to-Text + Transformation + + +
+ Multimodal large language models (MLLMs) have shown impressive reasoning
+abilities, but they are also more vulnerable to jailbreak attacks than their
+LLM predecessors. Although the pre-aligned LLMs in MLLMs remain capable of
+detecting unsafe responses, we observe that their safety mechanisms can be
+easily bypassed due to the introduction of image features. To construct robust
+MLLMs, we propose ECSO (Eyes Closed, Safety On), a novel training-free
+protecting approach that exploits the inherent safety awareness of MLLMs, and
+generates safer responses via adaptively transforming unsafe images into texts
+to activate the intrinsic safety mechanism of pre-aligned LLMs in MLLMs.
+Experiments on five state-of-the-art (SoTA) MLLMs demonstrate that our ECSO
+enhances model safety significantly (e.g., a 37.6% improvement on
+MM-SafetyBench (SD+OCR), and 71.3% on VLSafe for LLaVA-1.5-7B), while
+consistently maintaining utility results on common MLLM benchmarks.
+Furthermore, we show that ECSO can be used as a data engine to generate
+supervised-finetuning (SFT) data for MLLM alignment without extra human
+intervention.
+
+
+ comment: Project Page: https://gyhdog99.github.io/projects/ecso/ +
+
+
+
+
+ + ♻ ☆ MV-ROPE: Multi-view Constraints for Robust Category-level Object Pose + and Size Estimation + + +
+ Recently there has been a growing interest in category-level object pose and +size estimation, and prevailing methods commonly rely on single view RGB-D +images. However, one disadvantage of such methods is that they require accurate +depth maps which cannot be produced by consumer-grade sensors. Furthermore, +many practical real-world situations involve a moving camera that continuously +observes its surroundings, and the temporal information of the input video +streams is simply overlooked by single-view methods. We propose a novel +solution that makes use of RGB video streams. Our framework consists of three +modules: a scale-aware monocular dense SLAM solution, a lightweight object pose +predictor, and an object-level pose graph optimizer. The SLAM module utilizes a +video stream and additional scale-sensitive readings to estimate camera poses +and metric depth. The object pose predictor then generates canonical object +representations from RGB images. The object pose is estimated through geometric +registration of these canonical object representations with estimated object +depth points. All per-view estimates finally undergo optimization within a pose +graph, culminating in the output of robust and accurate canonical object poses. +Our experimental results demonstrate that when utilizing public dataset +sequences with high-quality depth information, the proposed method exhibits +comparable performance to state-of-the-art RGB-D methods. We also collect and +evaluate on new datasets containing depth maps of varying quality to further +quantitatively benchmark the proposed method alongside previous RGB-D based +methods. We demonstrate a significant advantage in scenarios where depth input +is absent or the quality of depth sensing is limited. + +
+
+
+
+
+ + ♻ ☆ D-SCo: Dual-Stream Conditional Diffusion for Monocular Hand-Held Object + Reconstruction + + +
+ Reconstructing hand-held objects from a single RGB image is a challenging
+task in computer vision. In contrast to prior works that utilize deterministic
+modeling paradigms, we employ a point cloud denoising diffusion model to
+account for the probabilistic nature of this problem. At its core, we introduce
+centroid-fixed dual-stream conditional diffusion for monocular hand-held object
+reconstruction (D-SCo), tackling two predominant challenges. First, to prevent
+the object centroid from deviating, we utilize a novel hand-constrained
+centroid fixing paradigm, enhancing the stability of the diffusion and reverse
+processes and the precision of feature projection. Second, we introduce a
+dual-stream denoiser to semantically and geometrically model hand-object
+interactions with a novel unified hand-object semantic embedding, enhancing the
+reconstruction performance of the hand-occluded region of the object.
+Experiments on the synthetic ObMan dataset and three real-world datasets (HO3D,
+MOW and DexYCB) demonstrate that our approach can surpass all other
+state-of-the-art methods. Codes will be released.
+
+
+
+
+
+ + ♻ ☆ Listen to Look into the Future: Audio-Visual Egocentric Gaze + Anticipation + + +
+ Egocentric gaze anticipation serves as a key building block for the emerging
+capability of Augmented Reality. Notably, gaze behavior is driven by both
+visual cues and audio signals during daily activities. Motivated by this
+observation, we introduce the first model that leverages both the video and
+audio modalities for egocentric gaze anticipation. Specifically, we propose a
+Contrastive Spatial-Temporal Separable (CSTS) fusion approach that adopts two
+modules to separately capture audio-visual correlations in spatial and temporal
+dimensions, and applies a contrastive loss on the re-weighted audio-visual
+features from the fusion modules for representation learning. We conduct
+extensive ablation studies and thorough analysis using two egocentric video
+datasets, Ego4D and Aria, to validate our model design. We demonstrate that
+audio improves the performance by +2.5% and +2.4% on the two datasets. Our
+model also outperforms the prior state-of-the-art methods by at least +1.9% and
++1.6%. Moreover, we provide visualizations to show the gaze anticipation
+results and provide additional insights into audio-visual representation
+learning. The code and data split are available on our website
+(https://bolinlai.github.io/CSTS-EgoGazeAnticipation/).
+
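+ The contrastive objective mentioned above can be illustrated with a standard symmetric InfoNCE loss over paired visual and audio features; the temperature, feature shapes, and pooling are assumptions for this sketch, not the paper's exact formulation.
+
+ import numpy as np
+
+ def symmetric_info_nce(v, a, tau=0.07):
+     # v, a: (batch, dim) pooled visual / audio features of matching pairs.
+     v = v / np.linalg.norm(v, axis=1, keepdims=True)
+     a = a / np.linalg.norm(a, axis=1, keepdims=True)
+     logits = v @ a.T / tau                         # pairwise similarities
+     labels = np.arange(len(v))                     # the i-th pair is the positive
+     log_softmax = lambda z: z - np.log(np.exp(z).sum(axis=1, keepdims=True))
+     loss_v2a = -log_softmax(logits)[labels, labels].mean()
+     loss_a2v = -log_softmax(logits.T)[labels, labels].mean()
+     return 0.5 * (loss_v2a + loss_a2v)
+
+ rng = np.random.default_rng(0)
+ print(symmetric_info_nce(rng.standard_normal((8, 32)), rng.standard_normal((8, 32))))
+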
+
+ comment: 30 pages +
+
+
+
+
+ + ♻ ☆ 6D-Diff: A Keypoint Diffusion Framework for 6D Object Pose Estimation CVPR 2024 + + +
+ Estimating the 6D object pose from a single RGB image often involves noise +and indeterminacy due to challenges such as occlusions and cluttered +backgrounds. Meanwhile, diffusion models have shown appealing performance in +generating high-quality images from random noise with high indeterminacy +through step-by-step denoising. Inspired by their denoising capability, we +propose a novel diffusion-based framework (6D-Diff) to handle the noise and +indeterminacy in object pose estimation for better performance. In our +framework, to establish accurate 2D-3D correspondence, we formulate 2D +keypoints detection as a reverse diffusion (denoising) process. To facilitate +such a denoising process, we design a Mixture-of-Cauchy-based forward diffusion +process and condition the reverse process on the object features. Extensive +experiments on the LM-O and YCB-V datasets demonstrate the effectiveness of our +framework. + +
+
+ comment: CVPR 2024 CAMERA-READY +
+
+
+
+
+ + ♻ ☆ Promoting Segment Anything Model towards Highly Accurate Dichotomous + Image Segmentation + + +
+ The Segment Anything Model (SAM) represents a significant breakthrough in
+foundation models for computer vision, providing a large-scale image
+segmentation model. However, despite SAM's zero-shot performance, its
+segmentation masks lack fine-grained details, particularly in accurately
+delineating object boundaries. This raises the question of whether SAM, as a
+foundation model, can be improved towards highly accurate object segmentation,
+which is known as dichotomous image segmentation (DIS). To address this issue,
+we propose DIS-SAM, which advances SAM towards DIS with extremely accurate
+details. DIS-SAM is a framework specifically tailored for highly accurate
+segmentation, maintaining SAM's promptable design. DIS-SAM employs a two-stage
+approach, integrating SAM with a modified IS-Net dedicated to DIS. Despite its
+simplicity, DIS-SAM demonstrates significantly enhanced segmentation accuracy
+compared to SAM and HQ-SAM.
+
+
+
+
+
+ + ♻ ☆ Instance-aware Exploration-Verification-Exploitation for Instance + ImageGoal Navigation + + +
+ As a new embodied vision task, Instance ImageGoal Navigation (IIN) aims to
+navigate to a specified object depicted by a goal image in an unexplored
+environment.
+ The main challenge of this task lies in identifying the target object from
+different viewpoints while rejecting similar distractors.
+ Existing ImageGoal Navigation methods usually adopt the simple
+Exploration-Exploitation framework and ignore the identification of the
+specific instance during navigation.
+ In this work, we propose to imitate the human behaviour of "getting closer to
+confirm" when distinguishing objects from a distance.
+ Specifically, we design a new modular navigation framework named
+Instance-aware Exploration-Verification-Exploitation (IEVE) for instance-level
+image goal navigation.
+ Our method allows for active switching among the exploration, verification,
+and exploitation actions, thereby facilitating the agent in making reasonable
+decisions under different situations.
+ On the challenging Habitat-Matterport 3D Semantic (HM3D-SEM) dataset, our
+method surpasses previous state-of-the-art work, with a classical segmentation
+model (0.684 vs. 0.561 success) or a robust model (0.702 vs. 0.561 success).
+
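+ The exploration-verification-exploitation switching described above can be caricatured as a small state machine; the confidence thresholds and score names below are illustrative assumptions, not IEVE's actual decision rule.
+
+ from enum import Enum, auto
+
+ class Mode(Enum):
+     EXPLORE = auto()    # search the environment for candidate objects
+     VERIFY = auto()     # move closer to confirm a candidate against the goal image
+     EXPLOIT = auto()    # navigate to the confirmed target instance
+
+ def next_mode(mode, candidate_score, match_score, t_candidate=0.4, t_match=0.8):
+     # candidate_score: detector confidence that some object matches the goal class.
+     # match_score: instance-level similarity to the goal image after closer inspection.
+     if mode is Mode.EXPLORE and candidate_score >= t_candidate:
+         return Mode.VERIFY
+     if mode is Mode.VERIFY:
+         if match_score >= t_match:
+             return Mode.EXPLOIT            # "getting closer" confirmed the instance
+         if candidate_score < t_candidate:
+             return Mode.EXPLORE            # candidate rejected, resume exploration
+     return mode
+
+ mode = Mode.EXPLORE
+ for cand, match in [(0.1, 0.0), (0.6, 0.2), (0.7, 0.9)]:
+     mode = next_mode(mode, cand, match)
+     print(mode)    # EXPLORE -> VERIFY -> EXPLOIT
+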
+
+
+
+
+ + ♻ ☆ mPLUG-Owl: Modularization Empowers Large Language Models with + Multimodality + + +
+ Large language models (LLMs) have demonstrated impressive zero-shot abilities +on a variety of open-ended tasks, while recent research has also explored the +use of LLMs for multi-modal generation. In this study, we introduce mPLUG-Owl, +a novel training paradigm that equips LLMs with multi-modal abilities through +modularized learning of foundation LLM, a visual knowledge module, and a visual +abstractor module. This approach can support multiple modalities and facilitate +diverse unimodal and multimodal abilities through modality collaboration. The +training paradigm of mPLUG-Owl involves a two-stage method for aligning image +and text, which learns visual knowledge with the assistance of LLM while +maintaining and even improving the generation abilities of LLM. In the first +stage, the visual knowledge module and abstractor module are trained with a +frozen LLM module to align the image and text. In the second stage, +language-only and multi-modal supervised datasets are used to jointly fine-tune +a low-rank adaption (LoRA) module on LLM and the abstractor module by freezing +the visual knowledge module. We carefully build a visually-related instruction +evaluation set OwlEval. Experimental results show that our model outperforms +existing multi-modal models, demonstrating mPLUG-Owl's impressive instruction +and visual understanding ability, multi-turn conversation ability, and +knowledge reasoning ability. Besides, we observe some unexpected and exciting +abilities such as multi-image correlation and scene text understanding, which +makes it possible to leverage it for harder real scenarios, such as vision-only +document comprehension. Our code, pre-trained model, instruction-tuned models, +and evaluation set are available at https://github.com/X-PLUG/mPLUG-Owl. The +online demo is available at https://www.modelscope.cn/studios/damo/mPLUG-Owl. + +
+
+ comment: Work in progress
+
+
+
+
+
+ + ♻ ☆ ID-like Prompt Learning for Few-Shot Out-of-Distribution Detection + + +
+ Out-of-distribution (OOD) detection methods often exploit auxiliary outliers
+to train models to identify OOD samples, especially by discovering challenging
+outliers from an auxiliary outlier dataset to improve OOD detection. However,
+they may still face limitations in effectively distinguishing between the most
+challenging OOD samples that are much like in-distribution (ID) data, i.e.,
+ID-like samples. To this end, we propose a novel OOD detection framework that
+discovers ID-like outliers using CLIP from the vicinity space of the ID
+samples, thus helping to identify these most challenging OOD samples. Then a
+prompt learning framework is proposed that utilizes the identified ID-like
+outliers to further leverage the capabilities of CLIP for OOD detection.
+Benefiting from the powerful CLIP, we only need a small number of ID samples to
+learn the prompts of the model without exposing other auxiliary outlier
+datasets. By focusing on the most challenging ID-like OOD samples and elegantly
+exploiting the capabilities of CLIP, our method achieves superior few-shot
+learning performance on various real-world image datasets (e.g., in 4-shot OOD
+detection on the ImageNet-1k dataset, our method reduces the average FPR95 by
+12.16% and improves the average AUROC by 2.76%, compared to state-of-the-art
+methods). Code is available at https://github.com/ycfate/ID-like.
+
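+ For reference, the two metrics quoted above can be computed from per-sample scores as sketched below, assuming higher scores indicate ID; this is a generic implementation, not the paper's evaluation code.
+
+ import numpy as np
+
+ def fpr_at_95_tpr(id_scores, ood_scores):
+     # Threshold chosen so that 95% of ID samples are accepted; report the
+     # fraction of OOD samples that are (wrongly) accepted at that threshold.
+     thr = np.quantile(id_scores, 0.05)
+     return float((ood_scores >= thr).mean())
+
+ def auroc(id_scores, ood_scores):
+     # Probability that a random ID sample scores higher than a random OOD sample
+     # (rank-sum / Mann-Whitney formulation).
+     labels = np.concatenate([np.ones_like(id_scores), np.zeros_like(ood_scores)])
+     scores = np.concatenate([id_scores, ood_scores])
+     order = np.argsort(scores)
+     ranks = np.empty_like(order, dtype=float)
+     ranks[order] = np.arange(1, len(scores) + 1)
+     n_id, n_ood = len(id_scores), len(ood_scores)
+     return float((ranks[labels == 1].sum() - n_id * (n_id + 1) / 2) / (n_id * n_ood))
+
+ rng = np.random.default_rng(0)
+ id_s, ood_s = rng.normal(1.0, 1.0, 5000), rng.normal(0.0, 1.0, 5000)
+ print(fpr_at_95_tpr(id_s, ood_s), auroc(id_s, ood_s))
+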
+
+
+
+
+ + ♻ ☆ BigGait: Learning Gait Representation You Want by Large Vision Models + + +
+ Gait recognition stands as one of the most pivotal remote identification
+technologies and progressively expands across research and industry
+communities. However, existing gait recognition methods heavily rely on
+task-specific upstream models driven by supervised learning to provide explicit
+gait representations like silhouette sequences, which inevitably introduce
+expensive annotation costs and potential error accumulation. Escaping from this
+trend, this work explores effective gait representations based on the
+all-purpose knowledge produced by task-agnostic Large Vision Models (LVMs) and
+proposes a simple yet efficient gait framework, termed BigGait. Specifically,
+the Gait Representation Extractor (GRE) within BigGait draws upon design
+principles from established gait representations, effectively transforming
+all-purpose knowledge into implicit gait representations without requiring
+third-party supervision signals. Experiments on CCPG, CASIA-B* and SUSTech1K
+indicate that BigGait significantly outperforms the previous methods in both
+within-domain and cross-domain tasks in most cases, and provides a more
+practical paradigm for learning the next-generation gait representation.
+Finally, we delve into prospective challenges and promising directions in
+LVMs-based gait recognition, aiming to inspire future work in this emerging
+topic. The source code is available at https://github.com/ShiqiYu/OpenGait.
+
+
+
+
+
+ + ♻ ☆ Image super-resolution via dynamic network + + +
+ Convolutional neural networks (CNNs) depend on deep network architectures to
+extract accurate information for image super-resolution. However, the
+information obtained by these CNNs cannot completely express predicted
+high-quality images for complex scenes. In this paper, we present a dynamic
+network for image super-resolution (DSRNet), which contains a residual
+enhancement block, wide enhancement block, feature refinement block and
+construction block. The residual enhancement block is composed of a residual
+enhanced architecture to facilitate hierarchical features for image
+super-resolution. To enhance robustness of the obtained super-resolution model
+for complex scenes, a wide enhancement block achieves a dynamic architecture to
+learn more robust information to enhance applicability of an obtained
+super-resolution model for varying scenes. To prevent interference of
+components in a wide enhancement block, a refinement block utilizes a stacked
+architecture to accurately learn obtained features. Also, a residual learning
+operation is embedded in the refinement block to prevent the long-term
+dependency problem. Finally, a construction block is responsible for
+reconstructing high-quality images. The designed heterogeneous architecture can
+not only facilitate richer structural information, but is also lightweight,
+which is suitable for mobile digital devices. Experimental results show that
+our method is more competitive in terms of performance, recovery time, and
+complexity for image super-resolution. The code of DSRNet can be obtained at
+https://github.com/hellloxiaotian/DSRNet.
+
+
+
+
+
+ + ♻ ☆ Open-sourced Data Ecosystem in Autonomous Driving: the Present and + Future + + +
+ With the continuous maturation and application of autonomous driving
+technology, a systematic examination of open-source autonomous driving datasets
+becomes instrumental in fostering the robust evolution of the industry
+ecosystem. Current autonomous driving datasets can broadly be categorized into
+two generations. The first-generation autonomous driving datasets are
+characterized by relatively simpler sensor modalities, smaller data scale, and
+are limited to perception-level tasks. KITTI, introduced in 2012, serves as a
+prominent representative of this initial wave. In contrast, the
+second-generation datasets exhibit heightened complexity in sensor modalities,
+greater data scale and diversity, and an expansion of tasks from perception to
+encompass prediction and control. Leading examples of the second generation
+include nuScenes and Waymo, introduced around 2019. This comprehensive review,
+conducted in collaboration with esteemed colleagues from both academia and
+industry, systematically assesses over seventy open-source autonomous driving
+datasets from domestic and international sources. It offers insights into
+various aspects, such as the principles underlying the creation of high-quality
+datasets, the pivotal role of data engine systems, and the utilization of
+generative foundation models to facilitate scalable data generation.
+Furthermore, this review undertakes an exhaustive analysis and discourse
+regarding the characteristics and data scales that future third-generation
+autonomous driving datasets should possess. It also delves into the scientific
+and technical challenges that warrant resolution. These endeavors are pivotal
+in advancing autonomous innovation and fostering technological enhancement in
+critical domains. For further details, please refer to
+https://github.com/OpenDriveLab/DriveAGI.
+
+
+ comment: This article is a simplified English translation of the
+ corresponding Chinese article. Please refer to the Chinese version for the
+ complete content
+
+
+
+
+
+ + ♻ ☆ LSKNet: A Foundation Lightweight Backbone for Remote Sensing + + +
+ Remote sensing images pose distinct challenges for downstream tasks due to +their inherent complexity. While a considerable amount of research has been +dedicated to remote sensing classification, object detection and semantic +segmentation, most of these studies have overlooked the valuable prior +knowledge embedded within remote sensing scenarios. Such prior knowledge can be +useful because remote sensing objects may be mistakenly recognized without +referencing a sufficiently long-range context, which can vary for different +objects. This paper considers these priors and proposes a lightweight Large +Selective Kernel Network (LSKNet) backbone. LSKNet can dynamically adjust its +large spatial receptive field to better model the ranging context of various +objects in remote sensing scenarios. To our knowledge, large and selective +kernel mechanisms have not been previously explored in remote sensing images. +Without bells and whistles, our lightweight LSKNet sets new state-of-the-art +scores on standard remote sensing classification, object detection and semantic +segmentation benchmarks. Our comprehensive analysis further validated the +significance of the identified priors and the effectiveness of LSKNet. The code +is available at https://github.com/zcablii/LSKNet. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2303.09030 +
+
+
+
+
+ + ♻ ☆ Unified Language-Vision Pretraining in LLM with Dynamic Discrete Visual + Tokenization ICLR 2024 + + +
+ Recently, the remarkable advance of the Large Language Model (LLM) has
+inspired researchers to transfer its extraordinary reasoning capability to both
+vision and language data. However, the prevailing approaches primarily regard
+the visual input as a prompt and focus exclusively on optimizing the text
+generation process conditioned upon vision content by a frozen LLM. Such an
+inequitable treatment of vision and language heavily constrains the model's
+potential. In this paper, we break through this limitation by representing both
+vision and language in a unified form. Specifically, we introduce a
+well-designed visual tokenizer to translate the non-linguistic image into a
+sequence of discrete tokens like a foreign language that LLM can read. The
+resulting visual tokens encompass high-level semantics worthy of a word and
+also support a dynamic sequence length that varies with the image. Equipped
+with this tokenizer, the presented foundation model called LaVIT can handle
+both image and text indiscriminately under the same generative learning
+paradigm. This unification empowers LaVIT to serve as an impressive generalist
+interface to understand and generate multi-modal content simultaneously.
+Extensive experiments further showcase that it outperforms the existing models
+by a large margin on massive vision-language tasks. Our code and models are
+available at https://github.com/jy0205/LaVIT.
+
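+ The idea of translating an image into discrete tokens can be illustrated with a plain vector-quantization step: patch features are snapped to their nearest codebook entries and the resulting indices become "foreign-language" tokens for the LLM; the codebook size, feature dimension, and the toy dynamic-length rule are assumptions, not LaVIT's actual tokenizer.
+
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ codebook = rng.standard_normal((256, 64))          # 256 discrete visual "words"
+
+ def tokenize(patch_features, keep_score_thr=0.0):
+     # patch_features: (num_patches, 64) continuous features from a visual encoder.
+     # Dynamic length: keep only informative patches (toy rule: feature norm above a threshold).
+     keep = np.linalg.norm(patch_features, axis=1) > keep_score_thr
+     kept = patch_features[keep]
+     # Nearest codebook entry per kept patch -> discrete token ids.
+     d = ((kept[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)
+     return d.argmin(axis=1)
+
+ patches = rng.standard_normal((196, 64))           # e.g. a 14x14 grid of patch features
+ tokens = tokenize(patches, keep_score_thr=7.5)
+ print(tokens[:10], len(tokens))                    # variable-length token ids readable by the LLM
+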
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ LEGO: Learning EGOcentric Action Frame Generation via Visual Instruction + Tuning + + +
+ Generating instructional images of human daily actions from an egocentric +viewpoint serves as a key step towards efficient skill transfer. In this paper, +we introduce a novel problem -- egocentric action frame generation. The goal is +to synthesize an image depicting an action in the user's context (i.e., action +frame) by conditioning on a user prompt and an input egocentric image. Notably, +existing egocentric action datasets lack the detailed annotations that describe +the execution of actions. Additionally, existing diffusion-based image +manipulation models are sub-optimal in controlling the state transition of an +action in egocentric image pixel space because of the domain gap. To this end, +we propose to Learn EGOcentric (LEGO) action frame generation via visual +instruction tuning. First, we introduce a prompt enhancement scheme to generate +enriched action descriptions from a visual large language model (VLLM) by +visual instruction tuning. Then we propose a novel method to leverage image and +text embeddings from the VLLM as additional conditioning to improve the +performance of a diffusion model. We validate our model on two egocentric +datasets -- Ego4D and Epic-Kitchens. Our experiments show substantial +improvement over prior image manipulation models in both quantitative and +qualitative evaluation. We also conduct detailed ablation studies and analysis +to provide insights in our method. More details of the dataset and code are +available on the website (https://bolinlai.github.io/Lego_EgoActGen/). + +
+
+ comment: 34 pages +
+
+
+
+
+ + ♻ ☆ Predicting Generalization of AI Colonoscopy Models to Unseen Data + + +
+ Background: Generalizability of AI colonoscopy algorithms is important for
+wider adoption in clinical practice. However, current techniques for evaluating
+performance on unseen data require expensive and time-intensive labels.
+ Methods: We use a "Masked Siamese Network" (MSN) to identify novel phenomena
+in unseen data and predict polyp detector performance. MSN is trained to
+predict masked out regions of polyp images, without any labels. We test MSN's
+ability to be trained on data only from Israel and detect unseen techniques,
+narrow-band imaging (NBI) and chromoendoscopy (CE), on colonoscopes from Japan
+(354 videos, 128 hours). We also test MSN's ability to predict performance of
+Computer Aided Detection (CADe) of polyps on colonoscopies from both countries,
+even though MSN is not trained on data from Japan.
+ Results: MSN correctly identifies NBI and CE as less similar to Israel
+whitelight than Japan whitelight (bootstrapped z-test, |z| > 496, p < 10^-8 for
+both) using the label-free Frechet distance. MSN detects NBI with 99% accuracy,
+predicts CE better than our heuristic (90% vs 79% accuracy) despite being
+trained only on whitelight, and is the only method that is robust to noisy
+labels. MSN predicts CADe polyp detector performance on in-domain Israel and
+out-of-domain Japan colonoscopies (r=0.79, 0.37 respectively). With few
+examples of Japan detector performance to train on, MSN prediction of Japan
+performance improves (r=0.56).
+ Conclusion: Our technique can identify distribution shifts in clinical data
+and can predict CADe detector performance on unseen data, without labels. Our
+self-supervised approach can aid in detecting when data in practice is
+different from training, such as between hospitals, or when data has
+meaningfully shifted from training. MSN has potential for application to
+medical image domains beyond colonoscopy.
+
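+ The label-free Frechet distance used above is, in the FID sense, a distance between Gaussian fits of two embedding sets; a generic implementation is sketched below, with synthetic stand-ins for the MSN embeddings.
+
+ import numpy as np
+ from scipy import linalg
+
+ def frechet_distance(feats_a, feats_b):
+     # Gaussian fit of each embedding set, then the Frechet (Wasserstein-2) distance:
+     # ||mu_a - mu_b||^2 + Tr(Sa + Sb - 2 (Sa Sb)^(1/2))
+     mu_a, mu_b = feats_a.mean(0), feats_b.mean(0)
+     cov_a = np.cov(feats_a, rowvar=False)
+     cov_b = np.cov(feats_b, rowvar=False)
+     covmean = linalg.sqrtm(cov_a @ cov_b)
+     covmean = covmean.real if np.iscomplexobj(covmean) else covmean
+     diff = mu_a - mu_b
+     return float(diff @ diff + np.trace(cov_a + cov_b - 2.0 * covmean))
+
+ rng = np.random.default_rng(0)
+ whitelight = rng.normal(0.0, 1.0, (2000, 32))      # stand-in embeddings of one modality
+ nbi_like = rng.normal(0.5, 1.2, (2000, 32))        # a shifted modality
+ print(frechet_distance(whitelight, nbi_like))
+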
+
+
+
+
+ + ♻ ☆ Retrieval-Augmented Layout Transformer for Content-Aware Layout + Generation CVPR 2024 + + +
+ Content-aware graphic layout generation aims to automatically arrange visual +elements along with a given content, such as an e-commerce product image. In +this paper, we argue that the current layout generation approaches suffer from +the limited training data for the high-dimensional layout structure. We show +that a simple retrieval augmentation can significantly improve the generation +quality. Our model, which is named Retrieval-Augmented Layout Transformer +(RALF), retrieves nearest neighbor layout examples based on an input image and +feeds these results into an autoregressive generator. Our model can apply +retrieval augmentation to various controllable generation tasks and yield +high-quality layouts within a unified architecture. Our extensive experiments +show that RALF successfully generates content-aware layouts in both constrained +and unconstrained settings and significantly outperforms the baselines. + +
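+ The retrieval-augmentation step described above boils down to a nearest-neighbour lookup on image embeddings whose results are handed to the generator as extra conditioning; the embedding dimension, the cosine-similarity index, and the placeholder layout records below are assumptions, not RALF's actual interfaces.
+
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ db_embeddings = rng.standard_normal((10_000, 128))    # embeddings of training images
+ db_layouts = [f"layout_{i}" for i in range(10_000)]   # their associated layout annotations
+
+ def retrieve_layouts(query_embedding, k=3):
+     # Cosine-similarity nearest neighbours over the layout database.
+     q = query_embedding / np.linalg.norm(query_embedding)
+     db = db_embeddings / np.linalg.norm(db_embeddings, axis=1, keepdims=True)
+     top = np.argsort(db @ q)[::-1][:k]
+     return [db_layouts[i] for i in top]
+
+ query = rng.standard_normal(128)                      # embedding of the input canvas image
+ neighbours = retrieve_layouts(query, k=3)
+ # The retrieved layouts would then be fed, together with the image features,
+ # into an autoregressive layout generator as additional conditioning tokens.
+ print(neighbours)
+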
+
+ comment: Accepted to CVPR 2024, Project website: + https://udonda.github.io/RALF/ +
+
+
+
+
+ + ♻ ☆ BiTT: Bi-directional Texture Reconstruction of Interacting Two Hands + from a Single Image CVPR 2024 + + +
+ Creating personalized hand avatars is important to offer a realistic
+experience to users on AR / VR platforms. While most prior studies focused on
+reconstructing 3D hand shapes, some recent work has tackled the reconstruction
+of hand textures on top of shapes. However, these methods are often limited to
+capturing pixels on the visible side of a hand, requiring diverse views of the
+hand in a video or multiple images as input. In this paper, we propose a novel
+method, BiTT (Bi-directional Texture reconstruction of Two hands), which is the
+first end-to-end trainable method for relightable, pose-free texture
+reconstruction of two interacting hands taking only a single RGB image, by
+three novel components: 1) bi-directional (left-right) texture reconstruction
+using the texture symmetry of left / right hands, 2) utilizing a texture
+parametric model for hand texture recovery, and 3) the overall coarse-to-fine
+stage pipeline for reconstructing personalized texture of two interacting
+hands. BiTT first estimates the scene light condition and albedo image from an
+input image, then reconstructs the texture of both hands through the texture
+parametric model and bi-directional texture reconstructor. In experiments using
+InterHand2.6M and RGB2Hands datasets, our method significantly outperforms
+state-of-the-art hand texture reconstruction methods quantitatively and
+qualitatively. The code is available at https://github.com/yunminjin2/BiTT
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ On Image Search in Histopathology + + +
+ Histopathology images can be acquired from camera-mounted microscopes or
+whole-slide scanners. Utilizing similarity calculations to match patients based
+on these images holds significant potential in research and clinical contexts.
+Recent advancements in search technologies allow for implicit quantification of
+tissue morphology across diverse primary sites, facilitating comparisons and
+enabling inferences about diagnosis, and potentially prognosis and prediction,
+for new patients when compared against a curated database of diagnosed and
+treated cases. In this paper, we comprehensively review the latest developments
+in image search technologies for histopathology, offering a concise overview
+tailored for computational pathology researchers seeking effective, fast and
+efficient image search methods in their work.
+
+
+ comment: A chapter in the book "Artificial Intelligence in Digital Pathology"
+ by Cohen and Chauhan, 2024
+
+
+
+
+
+ + ♻ ☆ Video Super-Resolution Transformer with Masked Inter&Intra-Frame + Attention CVPR 2024 + + +
+ Recently, Vision Transformer has achieved great success in recovering missing +details in low-resolution sequences, i.e., the video super-resolution (VSR) +task. Despite its superiority in VSR accuracy, the heavy computational burden +as well as the large memory footprint hinder the deployment of +Transformer-based VSR models on constrained devices. In this paper, we address +the above issue by proposing a novel feature-level masked processing framework: +VSR with Masked Intra and inter frame Attention (MIA-VSR). The core of MIA-VSR +is leveraging feature-level temporal continuity between adjacent frames to +reduce redundant computations and make more rational use of previously enhanced +SR features. Concretely, we propose an intra-frame and inter-frame attention +block which takes the respective roles of past features and input features into +consideration and only exploits previously enhanced features to provide +supplementary information. In addition, an adaptive block-wise mask prediction +module is developed to skip unimportant computations according to feature +similarity between adjacent frames. We conduct detailed ablation studies to +validate our contributions and compare the proposed method with recent +state-of-the-art VSR approaches. The experimental results demonstrate that +MIA-VSR improves the memory and computation efficiency over state-of-the-art +methods, without trading off PSNR accuracy. The code is available at +https://github.com/LabShuHangGU/MIA-VSR. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SwiftBrush: One-Step Text-to-Image Diffusion Model with Variational + Score Distillation CVPR 2024 + + +
+ Despite their ability to generate high-resolution and diverse images from
+text prompts, text-to-image diffusion models often suffer from slow iterative
+sampling processes. Model distillation is one of the most effective directions
+to accelerate these models. However, previous distillation methods fail to
+retain the generation quality while requiring a significant amount of images
+for training, either from real data or synthetically generated by the teacher
+model. In response to this limitation, we present a novel image-free
+distillation scheme named SwiftBrush. Drawing inspiration from text-to-3D
+synthesis, in which a 3D neural radiance field that aligns with the input
+prompt can be obtained from a 2D text-to-image diffusion prior via a
+specialized loss without the use of any 3D data ground-truth, our approach
+re-purposes that same loss for distilling a pretrained multi-step text-to-image
+model to a student network that can generate high-fidelity images with just a
+single inference step. In spite of its simplicity, our model stands as one of
+the first one-step text-to-image generators that can produce images of
+comparable quality to Stable Diffusion without reliance on any training image
+data. Remarkably, SwiftBrush achieves an FID score of 16.67 and a CLIP score of
+0.29 on the COCO-30K benchmark, achieving competitive results or even
+substantially surpassing existing state-of-the-art distillation techniques.
+
+
+ comment: Accepted to CVPR 2024; Project Page: + https://thuanz123.github.io/swiftbrush/ +
+
+
+
+
+ + ♻ ☆ Physics-Enhanced Multi-fidelity Learning for Optical Surface Imprint + + +
+ Human fingerprints serve as a unique and powerful characteristic of each
+person, from which the police can recognize a person's identity. Similar to
+humans, many natural bodies and intrinsic mechanical qualities can also be
+uniquely identified from surface characteristics. To measure the elasto-plastic
+properties of a material, a sharp indenter is pushed into the measured body
+under constant force and then retracted, leaving a unique residual imprint of
+minute size, from several micrometers down to nanometers. However, one great
+challenge is how to map the optical image of this residual imprint into the
+desired mechanical properties, i.e., the tensile force curve. In this paper, we
+propose a novel method that uses multi-fidelity neural networks (MFNN) to solve
+this inverse problem. We first build up the NN model via pure simulation data,
+and then bridge the sim-to-real gap via transfer learning. Considering the
+difficulty of collecting real experimental data, we use the NN to dig out the
+unknown physics and also implant the known physics into the transfer learning
+framework, thus highly improving the model stability and decreasing the data
+requirement. The final constructed model only needs three-shot calibration of
+real materials. We tested the final model across 20 real materials and achieved
+satisfactory accuracy. This work serves as a good example of applying machine
+learning to scientific research, especially under the constraints of data
+limitation and fidelity variance.
+
+
+ comment: 15 pages, 11 figures
+
+
+
+
+
+ + ♻ ☆ Rethinking Boundary Discontinuity Problem for Oriented Object Detection + + +
+ Oriented object detection has been developed rapidly in the past few years,
+where rotation equivariance is crucial for detectors to predict rotated boxes.
+It is expected that the prediction can maintain the corresponding rotation when
+objects rotate, but severe mutation in angular prediction is sometimes observed
+when objects rotate near the boundary angle, which is the well-known boundary
+discontinuity problem. The problem has long been believed to be caused by the
+sharp loss increase at the angular boundary, and widely used joint-optim
+IoU-like methods deal with this problem by loss-smoothing. However, we
+experimentally find that even state-of-the-art IoU-like methods actually fail
+to solve the problem. On further analysis, we find that the key to the solution
+lies in the encoding mode of the smoothing function rather than in joint or
+independent optimization. In existing IoU-like methods, the model essentially
+attempts to fit the angular relationship between box and object, where the
+break point at the angular boundary makes the predictions highly unstable. To
+deal with this issue, we propose a dual-optimization paradigm for angles. We
+decouple reversibility and joint-optim from a single smoothing function into
+two distinct entities, which for the first time achieves the objectives of both
+correcting the angular boundary and blending the angle with other parameters.
+Extensive experiments on multiple datasets show that the boundary discontinuity
+problem is well addressed. Moreover, typical IoU-like methods are improved to
+the same level without an obvious performance gap. The code is available at
+https://github.com/hangxu-cv/cvpr24acm.
+
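+ The discontinuity itself is easy to reproduce: a raw angle-regression target jumps by a full period when a box rotates across the boundary, whereas a periodic encoding such as (cos 2θ, sin 2θ) stays continuous; the snippet only illustrates this generic observation, not the dual-optimization paradigm proposed in the paper.
+
+ import numpy as np
+
+ def raw_target(theta, period=np.pi):
+     # Naive regression target: the angle wrapped into [-period/2, period/2).
+     return (theta + period / 2) % period - period / 2
+
+ def periodic_target(theta):
+     # Boundary-free encoding for a pi-periodic box angle.
+     return np.cos(2 * theta), np.sin(2 * theta)
+
+ eps = 1e-3
+ just_below, just_above = np.pi / 2 - eps, np.pi / 2 + eps       # rotate across the boundary
+ print(raw_target(just_below), raw_target(just_above))           # jumps from ~+pi/2 to ~-pi/2
+ print(periodic_target(just_below), periodic_target(just_above)) # nearly identical values
+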
+
+ comment: CVPR 2024
+
+
+
+
+
+ + ♻ ☆ BadCLIP: Trigger-Aware Prompt Learning for Backdoor Attacks on CLIP + + +
+ Contrastive Vision-Language Pre-training, known as CLIP, has shown promising +effectiveness in addressing downstream image recognition tasks. However, recent +works revealed that the CLIP model can be implanted with a downstream-oriented +backdoor. On downstream tasks, one victim model performs well on clean samples +but predicts a specific target class whenever a specific trigger is present. +For injecting a backdoor, existing attacks depend on a large amount of +additional data to maliciously fine-tune the entire pre-trained CLIP model, +which makes them inapplicable to data-limited scenarios. In this work, +motivated by the recent success of learnable prompts, we address this problem +by injecting a backdoor into the CLIP model in the prompt learning stage. Our +method named BadCLIP is built on a novel and effective mechanism in backdoor +attacks on CLIP, i.e., influencing both the image and text encoders with the +trigger. It consists of a learnable trigger applied to images and a +trigger-aware context generator, such that the trigger can change text features +via trigger-aware prompts, resulting in a powerful and generalizable attack. +Extensive experiments conducted on 11 datasets verify that the clean accuracy +of BadCLIP is similar to those of advanced prompt learning methods and the +attack success rate is higher than 99% in most cases. BadCLIP is also +generalizable to unseen classes, and shows a strong generalization capability +under cross-dataset and cross-domain settings. + +
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Hulk: A Universal Knowledge Translator for Human-Centric Tasks + + +
+ Human-centric perception tasks, e.g., pedestrian detection, skeleton-based
+action recognition, and pose estimation, have wide industrial applications,
+such as metaverse and sports analysis. There is a recent surge to develop
+human-centric foundation models that can benefit a broad range of human-centric
+perception tasks. While many human-centric foundation models have achieved
+success, they did not explore 3D and vision-language tasks for human-centric
+perception and required task-specific finetuning. These limitations restrict
+their application to more downstream tasks and situations. To tackle these
+problems, we present Hulk, the first multimodal human-centric generalist model,
+capable of addressing 2D vision, 3D vision, skeleton-based, and vision-language
+tasks without task-specific finetuning. The key to achieving this is condensing
+various task-specific heads into two general heads, one for discrete
+representations, e.g., languages, and the other for continuous representations,
+e.g., location coordinates. The outputs of the two heads can be further stacked
+into four distinct input and output modalities. This uniform representation
+enables Hulk to treat diverse human-centric tasks as modality translation,
+integrating knowledge across a wide range of tasks. Comprehensive evaluations
+of Hulk on 12 benchmarks covering 8 human-centric tasks demonstrate the
+superiority of our proposed method, achieving state-of-the-art performance in
+11 benchmarks. The code is available on https://github.com/OpenGVLab/Hulk.
+
+
+ comment: 24 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ CBNet: A Plug-and-Play Network for Segmentation-Based Scene Text + Detection + + +
+ Recently, segmentation-based methods are quite popular in scene text
+detection, and they mainly contain two steps: text kernel segmentation and
+expansion. However, the segmentation process only considers each pixel
+independently, and it is difficult for the expansion process to achieve a
+favorable accuracy-speed trade-off. In this paper, we propose a Context-aware
+and Boundary-guided Network (CBN) to tackle these problems. In CBN, a basic
+text detector is first used to predict initial segmentation results. Then, we
+propose a context-aware module to enhance text kernel feature representations,
+which considers both global and local contexts. Finally, we introduce a
+boundary-guided module to expand enhanced text kernels adaptively with only the
+pixels on the contours, which not only obtains accurate text boundaries but
+also keeps high speed, especially on high-resolution output maps. In
+particular, with a lightweight backbone, the basic detector equipped with our
+proposed CBN achieves state-of-the-art results on several popular benchmarks,
+and our proposed CBN can be plugged into several segmentation-based methods.
+Code is available at https://github.com/XiiZhao/cbn.pytorch.
+
+
+ comment: Accepted by IJCV 2024. Code is available at + https://github.com/XiiZhao/cbn.pytorch +
+
+
+
+
+ + ♻ ☆ Large Multilingual Models Pivot Zero-Shot Multimodal Learning across + Languages + + +
+ Recently there has been a significant surge in multimodal learning in terms +of both image-to-text and text-to-image generation. However, the success is +typically limited to English, leaving other languages largely behind. Building +a competitive counterpart in other languages is highly challenging due to the +low-resource nature of non-English multimodal data (i.e., lack of large-scale, +high-quality image-text data). In this work, we propose MPM, an effective +training paradigm for training large multimodal models in non-English +languages. MPM demonstrates that Multilingual language models can Pivot +zero-shot Multimodal learning across languages. Specifically, based on a strong +multilingual large language model, multimodal models pretrained on English-only +image-text data can well generalize to other languages in a (quasi)-zero-shot +manner, even surpassing models trained on image-text data in native languages. +Taking Chinese as a practice of MPM, we build large multimodal models VisCPM in +image-to-text and text-to-image generation, which achieve state-of-the-art +(open-source) performance in Chinese. To facilitate future research, we +open-source codes and model weights at https://github.com/OpenBMB/VisCPM.git. + +
+
+ comment: https://github.com/OpenBMB/VisCPM.git +
+
+
+
+
+ + ♻ ☆ Bidirectional Temporal Diffusion Model for Temporally Consistent Human + Animation + + +
+ We introduce a method to generate temporally coherent human animation from a
+single image, a video, or random noise. This problem has often been formulated
+as auto-regressive generation, i.e., regressing past frames to decode future
+frames. However, such unidirectional generation is highly prone to motion
+drifting over time, generating unrealistic human animation with significant
+artifacts such as appearance distortion. We claim that bidirectional temporal
+modeling enforces temporal coherence on a generative network by largely
+suppressing the motion ambiguity of human appearance. To prove our claim, we
+design a novel human animation framework using a denoising diffusion model: a
+neural network learns to generate the image of a person by denoising temporal
+Gaussian noises whose intermediate results are cross-conditioned
+bidirectionally between consecutive frames. In the experiments, our method
+demonstrates strong performance compared to existing unidirectional approaches
+with realistic temporal coherence.
+
+
+ comment: Project page: see https://typest.github.io/btdm +
+
+
+
+
+ + ♻ ☆ AnyV2V: A Plug-and-Play Framework For Any Video-to-Video Editing Tasks + + +
+ Video-to-video editing involves editing a source video along with additional
+control (such as text prompts, subjects, or styles) to generate a new video
+that aligns with the source video and the provided control. Traditional methods
+have been constrained to certain editing types, limiting their ability to meet
+the wide range of user demands. In this paper, we introduce AnyV2V, a novel
+training-free framework designed to simplify video editing into two primary
+steps: (1) employing an off-the-shelf image editing model (e.g.
+InstructPix2Pix, InstantID, etc.) to modify the first frame, (2) utilizing an
+existing image-to-video generation model (e.g. I2VGen-XL) for DDIM inversion
+and feature injection. In the first stage, AnyV2V can plug in any existing
+image editing tools to support an extensive array of video editing tasks.
+Beyond the traditional prompt-based editing methods, AnyV2V can also support
+novel video editing tasks, including reference-based style transfer,
+subject-driven editing, and identity manipulation, which were unattainable by
+previous methods. In the second stage, AnyV2V can plug in any existing
+image-to-video models to perform DDIM inversion and intermediate feature
+injection to maintain the appearance and motion consistency with the source
+video. On prompt-based editing, we show that AnyV2V can outperform the previous
+best approach by 35% on prompt alignment and 25% on human preference. On the
+three novel tasks, we show that AnyV2V also achieves a high success rate. We
+believe AnyV2V will continue to thrive due to its ability to seamlessly
+integrate the fast-evolving image editing methods. Such compatibility can help
+AnyV2V increase its versatility to cater to diverse user demands.
+
+
+ comment: preprint +
+
+
+
+
+ + ♻ ☆ iSLAM: Imperative SLAM + + +
+ Simultaneous Localization and Mapping (SLAM) stands as one of the critical
+challenges in robot navigation. A SLAM system often consists of a front-end
+component for motion estimation and a back-end system for eliminating
+estimation drifts. Recent advancements suggest that data-driven methods are
+highly effective for front-end tasks, while geometry-based methods continue to
+be essential in the back-end processes. However, such a decoupled paradigm
+between the data-driven front-end and geometry-based back-end can lead to
+sub-optimal performance, consequently reducing the system's capabilities and
+generalization potential. To solve this problem, we propose a novel
+self-supervised imperative learning framework, named imperative SLAM (iSLAM),
+which fosters reciprocal correction between the front-end and back-end, thus
+enhancing performance without necessitating any external supervision.
+Specifically, we formulate the SLAM problem as a bilevel optimization so that
+the front-end and back-end are bidirectionally connected. As a result, the
+front-end model can learn global geometric knowledge obtained through pose
+graph optimization by back-propagating the residuals from the back-end
+component. We showcase the effectiveness of this new framework through an
+application of stereo-inertial SLAM. The experiments show that the iSLAM
+training strategy achieves an accuracy improvement of 22% on average over a
+baseline model. To the best of our knowledge, iSLAM is the first SLAM system
+showing that the front-end and back-end components can mutually correct each
+other in a self-supervised manner.
+
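+ The bilevel front-end/back-end coupling described above can be caricatured on a 1-D toy: a learnable front-end correction is updated only from the residual left after a closed-form "pose graph" loop-closure refinement, with no external labels; the scalar state, the residual definition, and the step size are deliberate simplifications, not the iSLAM formulation.
+
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ true_motions = np.array([1.0, 2.0, -1.5, -1.5])    # relative motions around a closed loop (sum = 0)
+ sensor_bias = 0.3                                   # unknown systematic error of the raw "front-end"
+ b = 0.0                                             # learnable front-end correction parameter
+
+ for step in range(100):
+     meas = true_motions + sensor_bias + 0.02 * rng.standard_normal(4)
+     front = meas - b                                # front-end: corrected relative-motion estimates
+     # Back-end: enforce loop closure by spreading the accumulated loop error
+     # evenly over the segments (closed form for this 1-D toy).
+     back = front - front.sum() / len(front)
+     # Coupling: the back-end residual alone supervises the front-end parameter.
+     residual = front - back                         # each entry equals the per-segment loop error
+     b += 0.5 * residual.mean()                      # descent step on mean(residual**2) w.r.t. b
+     # (No ground-truth motion is used anywhere in this loop.)
+
+ print(round(b, 3))    # converges near the true sensor bias of 0.3
+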
+
+ comment: The paper has been accepted by IEEE Robotics and Automation Letters + (RA-L) +
+
+
+
+
+ + ♻ ☆ NAYER: Noisy Layer Data Generation for Efficient and Effective Data-free + Knowledge Distillation CVPR 2024 + + +
+ Data-Free Knowledge Distillation (DFKD) has made significant recent strides +by transferring knowledge from a teacher neural network to a student neural +network without accessing the original data. Nonetheless, existing approaches +encounter a significant challenge when attempting to generate samples from +random noise inputs, which inherently lack meaningful information. +Consequently, these models struggle to effectively map this noise to the +ground-truth sample distribution, resulting in prolonging training times and +low-quality outputs. In this paper, we propose a novel Noisy Layer Generation +method (NAYER) which relocates the random source from the input to a noisy +layer and utilizes the meaningful constant label-text embedding (LTE) as the +input. LTE is generated by using the language model once, and then it is stored +in memory for all subsequent training processes. The significance of LTE lies +in its ability to contain substantial meaningful inter-class information, +enabling the generation of high-quality samples with only a few training steps. +Simultaneously, the noisy layer plays a key role in addressing the issue of +diversity in sample generation by preventing the model from overemphasizing the +constrained label information. By reinitializing the noisy layer in each +iteration, we aim to facilitate the generation of diverse samples while still +retaining the method's efficiency, thanks to the ease of learning provided by +LTE. Experiments carried out on multiple datasets demonstrate that our NAYER +not only outperforms the state-of-the-art methods but also achieves speeds 5 to +15 times faster than previous approaches. The code is available at +https://github.com/tmtuan1307/nayer. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ UniChest: Conquer-and-Divide Pre-training for Multi-Source Chest X-Ray + Classification + + +
+ Vision-Language Pre-training (VLP), which utilizes multi-modal information to
+promote training efficiency and effectiveness, has achieved great success in
+vision recognition of natural domains and shown promise in medical imaging
+diagnosis for Chest X-Rays (CXRs). However, current works mainly pay attention
+to the exploration on a single dataset of CXRs, which locks the potential of
+this powerful paradigm on larger hybrids of multi-source CXR datasets. We
+identify that although blending samples from diverse sources offers advantages
+for improving model generalization, it is still challenging to maintain
+consistent superiority for the task of each source due to the existing
+heterogeneity among sources. To handle this dilemma, we design a
+Conquer-and-Divide pre-training framework, termed UniChest, aiming to make full
+use of the collaboration benefit of multiple sources of CXRs while reducing the
+negative influence of source heterogeneity. Specifically, the "Conquer" stage
+in UniChest encourages the model to sufficiently capture multi-source common
+patterns, and the "Divide" stage helps squeeze personalized patterns into
+different small experts (query networks). We conduct thorough experiments on
+many benchmarks, e.g., ChestX-ray14, CheXpert, Vindr-CXR, Shenzhen, Open-I and
+SIIM-ACR Pneumothorax, verifying the effectiveness of UniChest over a range of
+baselines, and release our codes and pre-training models at
+https://github.com/Elfenreigen/UniChest.
+
+
+ comment: Accepted at IEEE Transactions on Medical Imaging +
+
+
+
+
+ + ♻ ☆ Spacewalk-18: A Benchmark for Multimodal and Long-form Procedural Video + Understanding + + +
+ Learning from videos is an emerging research area that enables robots to +acquire skills from human demonstrations, such as procedural videos. To do +this, video-language models must be able to obtain structured understandings, +such as the temporal segmentation of a demonstration into sequences of actions +and skills, and to generalize the understandings to novel domains. In pursuit +of this goal, we introduce Spacewalk-18, a benchmark containing two tasks: (1) +step recognition and (2) intra-video retrieval over a dataset of temporally +segmented and labeled tasks in International Space Station spacewalk +recordings. In tandem, the two tasks quantify a model's ability to make use of: +(1) out-of-domain visual information; (2) a high temporal context window; and +(3) multimodal (e.g. visual and speech) domains. This departs from existing +benchmarks for procedural video understanding, which typically deal with short +context lengths and can be solved with a single modality. Spacewalk-18, with +its inherent multimodal and long-form complexity, exposes the high difficulty +of task recognition and segmentation. We find that state-of-the-art methods +perform poorly on our benchmark, but improvements can be obtained by +incorporating information from longer-range temporal context across different +modalities. Our experiments underscore the need to develop new approaches to +these tasks. Data, model, and code will be released at +https://brown-palm.github.io/Spacewalk-18/. + +
+
+ comment: Under submission. Code and models will be released at + https://brown-palm.github.io/Spacewalk-18/ +
+
+
+
+
+ + ♻ ☆ Few-shot Adaption to Distribution Shifts By Mixing Source and Target + Embeddings + + +
+ Pretrained machine learning models need to be adapted to distribution shifts
+when deployed in new target environments. When obtaining labeled data from the
+target distribution is expensive, few-shot adaptation with only a few examples
+from the target distribution becomes essential. In this work, we propose
+MixPro, a lightweight and highly data-efficient approach for few-shot
+adaptation. MixPro first generates a relatively large dataset by mixing
+(linearly combining) pre-trained embeddings of large source data with those of
+the few target examples. This process preserves important features of both
+source and target distributions, while mitigating the specific noise in the
+small target data. Then, it trains a linear classifier on the mixed embeddings
+to effectively adapt the model to the target distribution without overfitting
+the small target data. Theoretically, we demonstrate the advantages of MixPro
+over previous methods. Our experiments, conducted across various model
+architectures on 8 datasets featuring different types of distribution shifts,
+reveal that MixPro can outperform baselines by up to 7%, with only 2-4 target
+examples.
+
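+ A minimal sketch of the mix-then-linear-probe recipe described above on synthetic embeddings; the mixing coefficient, soft-label rule, embedding dimension, and the hand-rolled logistic-regression probe are assumptions for illustration only.
+
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ d, n_src, n_tgt = 32, 2000, 4                      # few-shot: only 4 labeled target examples
+ src_x = rng.standard_normal((n_src, d))            # pre-trained embeddings of source data
+ src_y = (src_x[:, 0] > 0).astype(float)
+ tgt_x = rng.standard_normal((n_tgt, d)) + 0.8      # shifted target distribution
+ tgt_y = (tgt_x[:, 0] > 0.8).astype(float)
+
+ # Mix: linearly combine each source embedding with a random target example,
+ # and mix the labels with the same coefficient (soft targets).
+ lam = 0.5
+ idx = rng.integers(0, n_tgt, size=n_src)
+ mix_x = lam * src_x + (1 - lam) * tgt_x[idx]
+ mix_y = lam * src_y + (1 - lam) * tgt_y[idx]
+
+ # Train a linear classifier on the mixed embeddings (plain logistic regression).
+ w, b, lr = np.zeros(d), 0.0, 0.1
+ for _ in range(300):
+     p = 1 / (1 + np.exp(-(mix_x @ w + b)))
+     g = p - mix_y                                  # cross-entropy gradient with soft targets
+     w -= lr * mix_x.T @ g / n_src
+     b -= lr * g.mean()
+
+ test_x = rng.standard_normal((1000, d)) + 0.8      # held-out target-like data
+ test_y = (test_x[:, 0] > 0.8).astype(float)
+ acc = ((test_x @ w + b > 0).astype(float) == test_y).mean()
+ print(round(float(acc), 3))                        # accuracy of the adapted linear head
+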
+
+
+
+
+ + ♻ ☆ Online Open-set Semi-supervised Object Detection with Dual Competing + Head + + +
+ Open-set semi-supervised object detection (OSSOD) task leverages practical +open-set unlabeled datasets that comprise both in-distribution (ID) and +out-of-distribution (OOD) instances for conducting semi-supervised object +detection (SSOD). The main challenge in OSSOD is distinguishing and filtering +the OOD instances (i.e., outliers) during pseudo-labeling since OODs will +affect the performance. The only OSSOD work employs an additional offline OOD +detection network trained solely with labeled data to solve this problem. +However, the limited labeled data restricts the potential for improvement. +Meanwhile, the offline strategy results in low efficiency. To alleviate these +issues, this paper proposes an end-to-end online OSSOD framework that improves +performance and efficiency: 1) We propose a semi-supervised outlier filtering +method that more effectively filters the OOD instances using both labeled and +unlabeled data. 2) We propose a threshold-free Dual Competing OOD head that +further improves the performance by suppressing the error accumulation during +semi-supervised outlier filtering. 3) Our proposed method is an online +end-to-end trainable OSSOD framework. Experimental results show that our method +achieves state-of-the-art performance on several OSSOD benchmarks compared to +existing methods. Moreover, additional experiments show that our method is more +efficient and can be easily applied to different SSOD frameworks to boost their +performance. + +
+
+
+
+
+ + ♻ ☆ SecondPose: SE(3)-Consistent Dual-Stream Feature Fusion for + Category-Level Pose Estimation CVPR 2024 + + +
+ Category-level object pose estimation, aiming to predict the 6D pose and 3D +size of objects from known categories, typically struggles with large +intra-class shape variation. Existing works utilizing mean shapes often fall +short of capturing this variation. To address this issue, we present +SecondPose, a novel approach integrating object-specific geometric features +with semantic category priors from DINOv2. Leveraging the advantage of DINOv2 +in providing SE(3)-consistent semantic features, we hierarchically extract two +types of SE(3)-invariant geometric features to further encapsulate +local-to-global object-specific information. These geometric features are then +point-aligned with DINOv2 features to establish a consistent object +representation under SE(3) transformations, facilitating the mapping from +camera space to the pre-defined canonical space, thus further enhancing pose +estimation. Extensive experiments on NOCS-REAL275 demonstrate that SecondPose +achieves a 12.4% leap forward over the state-of-the-art. Moreover, on a more +complex dataset HouseCat6D which provides photometrically challenging objects, +SecondPose still surpasses other competitors by a large margin. + +
+
+ comment: CVPR 2024 accepted. Code is available at: + https://github.com/NOrangeeroli/SecondPose +
+
+
+
+
+ + ♻ ☆ A Unified Model for Longitudinal Multi-Modal Multi-View Prediction with + Missingness + + +
+ Medical records often consist of different modalities, such as images, text, +and tabular information. Integrating all modalities offers a holistic view of a +patient's condition, while analyzing them longitudinally provides a better +understanding of disease progression. However, real-world longitudinal medical +records present challenges: 1) patients may lack some or all of the data for a +specific timepoint, and 2) certain modalities or views might be absent for all +patients during a particular period. In this work, we introduce a unified model +for longitudinal multi-modal multi-view prediction with missingness. Our method +allows as many timepoints as desired for input, and aims to leverage all +available data, regardless of their availability. We conduct extensive +experiments on the knee osteoarthritis dataset from the Osteoarthritis +Initiative for pain and Kellgren-Lawrence grade prediction at a future +timepoint. We demonstrate the effectiveness of our method by comparing results +from our unified model to specific models that use the same modality and view +combinations during training and evaluation. We also show the benefit of having +extended temporal data and provide post-hoc analysis for a deeper understanding +of each modality/view's importance for different tasks. + +
+
+
+
+
+ + ♻ ☆ Learning for Transductive Threshold Calibration in Open-World + Recognition + + +
+ In deep metric learning for visual recognition, the calibration of distance +thresholds is crucial for achieving desired model performance in the true +positive rates (TPR) or true negative rates (TNR). However, calibrating this +threshold presents challenges in open-world scenarios, where the test classes +can be entirely disjoint from those encountered during training. We define the +problem of finding distance thresholds for a trained embedding model to achieve +target performance metrics over unseen open-world test classes as open-world +threshold calibration. Existing posthoc threshold calibration methods, reliant +on inductive inference and requiring a calibration dataset with a similar +distance distribution as the test data, often prove ineffective in open-world +scenarios. To address this, we introduce OpenGCN, a Graph Neural Network-based +transductive threshold calibration method with enhanced adaptability and +robustness. OpenGCN learns to predict pairwise connectivity for the unlabeled +test instances embedded in a graph to determine its TPR and TNR at various +distance thresholds, allowing for transductive inference of the distance +thresholds which also incorporates test-time information. Extensive experiments +across open-world visual recognition benchmarks validate OpenGCN's superiority +over existing posthoc calibration methods for open-world threshold calibration. + +
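+ The transductive calibration step can be pictured with a few lines of NumPy:
+given pairwise embedding distances for unlabeled test pairs and a model's
+predicted probability that each pair shares an identity, TPR and TNR can be
+estimated at every candidate threshold. This is an illustrative sketch of the
+idea in the abstract, not OpenGCN's graph-based predictor itself.
+```python
+import numpy as np
+
+def estimate_tpr_tnr(dists, p_connect, thresholds):
+    dists = np.asarray(dists, dtype=float)
+    p_connect = np.asarray(p_connect, dtype=float)   # predicted pairwise connectivity
+    pos_mass, neg_mass = p_connect.sum(), (1.0 - p_connect).sum()
+    results = []
+    for t in thresholds:
+        accept = dists <= t                          # pairs accepted as "same class"
+        tpr = p_connect[accept].sum() / max(pos_mass, 1e-8)
+        tnr = (1.0 - p_connect)[~accept].sum() / max(neg_mass, 1e-8)
+        results.append((t, tpr, tnr))
+    return results                                   # pick t meeting the target TPR/TNR
+```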
+
+
+
+
+ + ♻ ☆ Generative deep learning-enabled ultra-large field-of-view lens-free + imaging + + +
+ Advancements in high-throughput biomedical applications necessitate
+real-time, large field-of-view (FOV) imaging capabilities. Conventional
+lens-free imaging (LFI) systems, while addressing the limitations of physical
+lenses, have been constrained by dynamic, hard-to-model optical fields,
+resulting in a limited one-shot FOV of approximately 20 $mm^2$. This
+restriction has been a major bottleneck in applications like live-cell imaging
+and automation of microfluidic systems for biomedical research. Here, we
+present a deep-learning (DL)-based imaging framework, GenLFI, leveraging
+generative artificial intelligence (AI) for holographic image reconstruction.
+We demonstrate that GenLFI can achieve a real-time FOV of over 550 $mm^2$,
+surpassing current LFI systems by more than 20-fold and exceeding the FOV of
+the world's largest confocal microscope by a factor of 1.76. The resolution
+reaches the sub-pixel level of 5.52 $\mu m$ without the need for a shifting
+light source. The unsupervised learning-based reconstruction does not require
+optical field modeling, making it possible to image dynamic 3D samples (e.g.,
+droplet-based microfluidics and 3D cell models) in complex optical fields.
+This GenLFI framework unlocks the potential of LFI systems, offering a robust
+tool to tackle new frontiers in high-throughput biomedical applications such
+as drug discovery.
+ 
+
+
+
+
+
+ + ♻ ☆ HalluciDet: Hallucinating RGB Modality for Person Detection Through + Privileged Information WACV + + +
+ A powerful way to adapt a visual recognition model to a new domain is through
+image translation. However, common image translation approaches only focus on
+generating data from the same distribution as the target domain. Given a
+cross-modal application, such as pedestrian detection from aerial images, with
+a considerable shift in data distribution from infrared (IR) to visible (RGB)
+images, a translation focused purely on generation might lead to poor
+performance, as the loss focuses on details that are irrelevant for the task.
+In this paper, we propose HalluciDet, an IR-RGB image translation model for
+object detection. Instead of focusing on reconstructing the original image in
+the IR modality, it seeks to reduce the detection loss of an RGB detector, and
+therefore avoids the need to access RGB data. This model produces a new image
+representation that enhances objects of interest in the scene and greatly
+improves detection performance. We empirically compare our approach against
+state-of-the-art methods for image translation and for fine-tuning on IR, and
+show that our HalluciDet improves detection accuracy in most cases by
+exploiting the privileged information encoded in a pre-trained RGB detector.
+Code: https://github.com/heitorrapela/HalluciDet
+ 
+
+
+ comment: IEEE/CVF Winter Conference on Applications of Computer Vision (WACV) + 2024 +
+
+
+
+
+ + ♻ ☆ Decoupled Data Consistency with Diffusion Purification for Image + Restoration + + +
+ Diffusion models have recently gained traction as a powerful class of deep +generative priors, excelling in a wide range of image restoration tasks due to +their exceptional ability to model data distributions. To solve image +restoration problems, many existing techniques achieve data consistency by +incorporating additional likelihood gradient steps into the reverse sampling +process of diffusion models. However, the additional gradient steps pose a +challenge for real-world practical applications as they incur a large +computational overhead, thereby increasing inference time. They also present +additional difficulties when using accelerated diffusion model samplers, as the +number of data consistency steps is limited by the number of reverse sampling +steps. In this work, we propose a novel diffusion-based image restoration +solver that addresses these issues by decoupling the reverse process from the +data consistency steps. Our method involves alternating between a +reconstruction phase to maintain data consistency and a refinement phase that +enforces the prior via diffusion purification. Our approach demonstrates +versatility, making it highly adaptable for efficient problem-solving in latent +space. Additionally, it reduces the necessity for numerous sampling steps +through the integration of consistency models. The efficacy of our approach is +validated through comprehensive experiments across various image restoration +tasks, including image denoising, deblurring, inpainting, and super-resolution. + +
+
+
+
+
+ + ♻ ☆ Scene-LLM: Extending Language Model for 3D Visual Understanding and + Reasoning + + +
+ This paper introduces Scene-LLM, a 3D-visual-language model that enhances +embodied agents' abilities in interactive 3D indoor environments by integrating +the reasoning strengths of Large Language Models (LLMs). Scene-LLM adopts a +hybrid 3D visual feature representation, that incorporates dense spatial +information and supports scene state updates. The model employs a projection +layer to efficiently project these features in the pre-trained textual +embedding space, enabling effective interpretation of 3D visual information. +Unique to our approach is the integration of both scene-level and ego-centric +3D information. This combination is pivotal for interactive planning, where +scene-level data supports global planning and ego-centric data is important for +localization. Notably, we use ego-centric 3D frame features for feature +alignment, an efficient technique that enhances the model's ability to align +features of small objects within the scene. Our experiments with Scene-LLM +demonstrate its strong capabilities in dense captioning, question answering, +and interactive planning. We believe Scene-LLM advances the field of 3D visual +understanding and reasoning, offering new possibilities for sophisticated agent +interactions in indoor settings. + +
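+ The projection layer mentioned above is conceptually a small network that maps
+3D visual features into the LLM's token-embedding space so they can be
+prepended to the text tokens. The sketch below is an assumed minimal form
+(dimensions and layer choices are illustrative, not the Scene-LLM
+implementation).
+```python
+import torch
+import torch.nn as nn
+
+class SceneFeatureProjector(nn.Module):
+    def __init__(self, visual_dim=1024, llm_embed_dim=4096):
+        super().__init__()
+        self.proj = nn.Sequential(
+            nn.Linear(visual_dim, llm_embed_dim),
+            nn.GELU(),
+            nn.Linear(llm_embed_dim, llm_embed_dim),
+        )
+
+    def forward(self, scene_feats):        # (B, N_tokens, visual_dim)
+        # output lives in the LLM embedding space, concatenated with text tokens
+        return self.proj(scene_feats)
+
+# e.g. SceneFeatureProjector()(torch.randn(1, 256, 1024)).shape -> (1, 256, 4096)
+```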
+
+
+
+
+ + ♻ ☆ Do Vision and Language Encoders Represent the World Similarly? CVPR 2024 + + +
+ Aligned text-image encoders such as CLIP have become the de facto model for
+vision-language tasks. Furthermore, modality-specific encoders achieve
+impressive performance in their respective domains. This raises a central
+question: does an alignment exist between uni-modal vision and language
+encoders, since they fundamentally represent the same physical world? Analyzing
+the latent space structures of vision and language models on image-caption
+benchmarks using Centered Kernel Alignment (CKA), we find that the
+representation spaces of unaligned and aligned encoders are semantically
+similar. In the absence of statistical similarity in aligned encoders like
+CLIP, we show that a possible matching of unaligned encoders exists without any
+training. We frame this as a seeded graph-matching problem exploiting the
+semantic similarity between graphs and propose two methods: a Fast Quadratic
+Assignment Problem optimization, and a novel localized CKA metric-based
+matching/retrieval. We demonstrate the effectiveness of this approach on
+several downstream tasks, including cross-lingual and cross-domain caption
+matching and image classification. Code available at
+github.com/mayug/0-shot-llm-vision.
+ 
+
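+ For readers unfamiliar with the metric, linear CKA between two representation
+matrices computed on the same batch of image-caption pairs can be written in a
+few lines; this is the standard formulation, shown here only to make the
+analysis above concrete.
+```python
+import numpy as np
+
+def linear_cka(X, Y):
+    # X: (n, d1) features from one encoder, Y: (n, d2) from the other,
+    # rows aligned on the same n inputs.
+    X = X - X.mean(axis=0, keepdims=True)
+    Y = Y - Y.mean(axis=0, keepdims=True)
+    hsic = np.linalg.norm(X.T @ Y, "fro") ** 2
+    return hsic / (np.linalg.norm(X.T @ X, "fro") * np.linalg.norm(Y.T @ Y, "fro"))
+```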
+
+ comment: Accepted CVPR 2024 +
+
+
+
+
+ + ♻ ☆ An Audio-Visual Speech Separation Model Inspired by + Cortico-Thalamo-Cortical Circuits + + +
+ Audio-visual approaches involving visual inputs have laid the foundation for +recent progress in speech separation. However, the optimization of the +concurrent usage of auditory and visual inputs is still an active research +area. Inspired by the cortico-thalamo-cortical circuit, in which the sensory +processing mechanisms of different modalities modulate one another via the +non-lemniscal sensory thalamus, we propose a novel cortico-thalamo-cortical +neural network (CTCNet) for audio-visual speech separation (AVSS). First, the +CTCNet learns hierarchical auditory and visual representations in a bottom-up +manner in separate auditory and visual subnetworks, mimicking the functions of +the auditory and visual cortical areas. Then, inspired by the large number of +connections between cortical regions and the thalamus, the model fuses the +auditory and visual information in a thalamic subnetwork through top-down +connections. Finally, the model transmits this fused information back to the +auditory and visual subnetworks, and the above process is repeated several +times. The results of experiments on three speech separation benchmark datasets +show that CTCNet remarkably outperforms existing AVSS methods with considerably +fewer parameters. These results suggest that mimicking the anatomical +connectome of the mammalian brain has great potential for advancing the +development of deep neural networks. Project repo is +https://github.com/JusperLee/CTCNet. + +
+
+ comment: Accepted by TPAMI 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 241 + +
+
+
+ + ☆ Zero-Shot Multi-Object Shape Completion + + +
+ We present a 3D shape completion method that recovers the complete geometry
+of multiple objects in complex scenes from a single RGB-D image. Despite
+notable advancements in single-object 3D shape completion, high-quality
+reconstruction in highly cluttered real-world multi-object scenes remains a
+challenge. To address this issue, we propose OctMAE, an architecture that
+leverages an Octree U-Net and a latent 3D MAE to achieve high-quality and near
+real-time multi-object shape completion through both local and global
+geometric reasoning. Because a na\"ive 3D MAE can be computationally
+intractable and memory intensive even in the latent space, we introduce a
+novel occlusion masking strategy and adopt 3D rotary embeddings, which
+significantly improve the runtime and shape completion quality. To generalize
+to a wide range of objects in diverse scenes, we create a large-scale
+photorealistic dataset featuring a diverse set of 12K 3D object models from
+the Objaverse dataset, which are rendered in multi-object scenes with
+physics-based positioning. Our method outperforms the current state-of-the-art
+on both synthetic and real-world datasets and demonstrates a strong zero-shot
+capability.
+ 
+
+
+ comment: 21 pages, 8 figures
+
+
+
+
+
+ + ☆ MVSplat: Efficient 3D Gaussian Splatting from Sparse Multi-View Images + + +
+ We propose MVSplat, an efficient feed-forward 3D Gaussian Splatting model +learned from sparse multi-view images. To accurately localize the Gaussian +centers, we propose to build a cost volume representation via plane sweeping in +the 3D space, where the cross-view feature similarities stored in the cost +volume can provide valuable geometry cues to the estimation of depth. We learn +the Gaussian primitives' opacities, covariances, and spherical harmonics +coefficients jointly with the Gaussian centers while only relying on +photometric supervision. We demonstrate the importance of the cost volume +representation in learning feed-forward Gaussian Splatting models via extensive +experimental evaluations. On the large-scale RealEstate10K and ACID benchmarks, +our model achieves state-of-the-art performance with the fastest feed-forward +inference speed (22 fps). Compared to the latest state-of-the-art method +pixelSplat, our model uses $10\times $ fewer parameters and infers more than +$2\times$ faster while providing higher appearance and geometry quality as well +as better cross-dataset generalization. + +
+
+ comment: Project page: https://donydchen.github.io/mvsplat Code: + https://github.com/donydchen/mvsplat +
+
+
+
+
+ + ☆ LiFT: A Surprisingly Simple Lightweight Feature Transform for Dense ViT + Descriptors + + +
+ We present a simple self-supervised method to enhance the performance of ViT +features for dense downstream tasks. Our Lightweight Feature Transform (LiFT) +is a straightforward and compact postprocessing network that can be applied to +enhance the features of any pre-trained ViT backbone. LiFT is fast and easy to +train with a self-supervised objective, and it boosts the density of ViT +features for minimal extra inference cost. Furthermore, we demonstrate that +LiFT can be applied with approaches that use additional task-specific +downstream modules, as we integrate LiFT with ViTDet for COCO detection and +segmentation. Despite the simplicity of LiFT, we find that it is not simply +learning a more complex version of bilinear interpolation. Instead, our LiFT +training protocol leads to several desirable emergent properties that benefit +ViT features in dense downstream tasks. This includes greater scale invariance +for features, and better object boundary maps. By simply training LiFT for a +few epochs, we show improved performance on keypoint correspondence, detection, +segmentation, and object discovery tasks. Overall, LiFT provides an easy way to +unlock the benefits of denser feature arrays for a fraction of the +computational cost. For more details, refer to our project page at +https://www.cs.umd.edu/~sakshams/LiFT/. + +
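+ To make the "compact postprocessing network" idea concrete, the sketch below
+shows one plausible form: a small module that doubles the density of ViT patch
+features. The architecture here is an assumption for illustration and is not
+the released LiFT network.
+```python
+import torch
+import torch.nn as nn
+
+class LiteFeatureUpsampler(nn.Module):
+    def __init__(self, dim=768):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2),  # 2x denser grid
+            nn.GELU(),
+            nn.Conv2d(dim // 2, dim, kernel_size=3, padding=1),
+        )
+
+    def forward(self, patch_tokens, grid_hw):
+        b, n, c = patch_tokens.shape
+        h, w = grid_hw
+        feat = patch_tokens.transpose(1, 2).reshape(b, c, h, w)
+        return self.net(feat)              # (B, C, 2H, 2W) dense feature map
+
+# e.g. LiteFeatureUpsampler()(torch.randn(1, 14 * 14, 768), (14, 14))
+```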
+
+
+
+
+ + ☆ ODTFormer: Efficient Obstacle Detection and Tracking with Stereo Cameras + Based on Transformer + + +
+ Obstacle detection and tracking represent a critical component in robot +autonomous navigation. In this paper, we propose ODTFormer, a Transformer-based +model to address both obstacle detection and tracking problems. For the +detection task, our approach leverages deformable attention to construct a 3D +cost volume, which is decoded progressively in the form of voxel occupancy +grids. We further track the obstacles by matching the voxels between +consecutive frames. The entire model can be optimized in an end-to-end manner. +Through extensive experiments on DrivingStereo and KITTI benchmarks, our model +achieves state-of-the-art performance in the obstacle detection task. We also +report comparable accuracy to state-of-the-art obstacle tracking models while +requiring only a fraction of their computation cost, typically ten-fold to +twenty-fold less. The code and model weights will be publicly released. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ MathVerse: Does Your Multi-modal LLM Truly See the Diagrams in Visual + Math Problems? + + +
+ The remarkable progress of Multi-modal Large Language Models (MLLMs) has
+garnered unparalleled attention, due to their superior performance in visual
+contexts. However, their capabilities in visual math problem-solving remain
+insufficiently evaluated and understood. We investigate current benchmarks and
+find that they incorporate excessive visual content within textual questions,
+which potentially assists MLLMs in deducing answers without truly interpreting
+the input diagrams. To this end, we introduce MathVerse, an all-around visual
+math benchmark designed for an equitable and in-depth evaluation of MLLMs. We
+meticulously collect 2,612 high-quality, multi-subject math problems with
+diagrams from publicly available sources. Each problem is then transformed by
+human annotators into six distinct versions, each offering varying degrees of
+information content in multi-modality, contributing to 15K test samples in
+total. This approach allows MathVerse to comprehensively assess whether and how
+much MLLMs can truly understand the visual diagrams for mathematical reasoning.
+In addition, we propose a Chain-of-Thought (CoT) evaluation strategy for a
+fine-grained assessment of the output answers. Rather than naively judging True
+or False, we employ GPT-4(V) to adaptively extract crucial reasoning steps, and
+then score each step with detailed error analysis, which can reveal the
+intermediate CoT reasoning quality of MLLMs. We hope the MathVerse benchmark
+may provide unique insights to guide the future development of MLLMs. Project
+page: https://mathverse-cuhk.github.io
+ 
+
+
+ comment: 46 Pages, Work in Progress, Benchmark Project Page: + https://mathverse-cuhk.github.io +
+
+
+
+
+ + ☆ Simplified Diffusion Schrödinger Bridge + + +
+ This paper introduces a novel theoretical simplification of the Diffusion +Schr\"odinger Bridge (DSB) that facilitates its unification with Score-based +Generative Models (SGMs), addressing the limitations of DSB in complex data +generation and enabling faster convergence and enhanced performance. By +employing SGMs as an initial solution for DSB, our approach capitalizes on the +strengths of both frameworks, ensuring a more efficient training process and +improving the performance of SGM. We also propose a reparameterization +technique that, despite theoretical approximations, practically improves the +network's fitting capabilities. Our extensive experimental evaluations confirm +the effectiveness of the simplified DSB, demonstrating its significant +improvements. We believe the contributions of this work pave the way for +advanced generative modeling. The code is available at +https://github.com/tzco/Simplified-Diffusion-Schrodinger-Bridge. + +
+
+
+
+
+ + ☆ Language Repository for Long Video Understanding + + +
+ Language has become a prominent modality in computer vision with the rise of +multi-modal LLMs. Despite supporting long context-lengths, their effectiveness +in handling long-term information gradually declines with input length. This +becomes critical, especially in applications such as long-form video +understanding. In this paper, we introduce a Language Repository (LangRepo) for +LLMs, that maintains concise and structured information as an interpretable +(i.e., all-textual) representation. Our repository is updated iteratively based +on multi-scale video chunks. We introduce write and read operations that focus +on pruning redundancies in text, and extracting information at various temporal +scales. The proposed framework is evaluated on zero-shot visual +question-answering benchmarks including EgoSchema, NExT-QA, IntentQA and +NExT-GQA, showing state-of-the-art performance at its scale. Our code is +available at https://github.com/kkahatapitiya/LangRepo. + +
+
+
+
+
+ + ☆ GRM: Large Gaussian Reconstruction Model for Efficient 3D Reconstruction + and Generation + + +
+ We introduce GRM, a large-scale reconstructor capable of recovering a 3D +asset from sparse-view images in around 0.1s. GRM is a feed-forward +transformer-based model that efficiently incorporates multi-view information to +translate the input pixels into pixel-aligned Gaussians, which are unprojected +to create a set of densely distributed 3D Gaussians representing a scene. +Together, our transformer architecture and the use of 3D Gaussians unlock a +scalable and efficient reconstruction framework. Extensive experimental results +demonstrate the superiority of our method over alternatives regarding both +reconstruction quality and efficiency. We also showcase the potential of GRM in +generative tasks, i.e., text-to-3D and image-to-3D, by integrating it with +existing multi-view diffusion models. Our project website is at: +https://justimyhxu.github.io/projects/grm/. + +
+
+ comment: Project page: https://justimyhxu.github.io/projects/grm/ Code: + https://github.com/justimyhxu/GRM +
+
+
+
+
+ + ☆ ClusteringSDF: Self-Organized Neural Implicit Surfaces for 3D + Decomposition + + +
+ 3D decomposition/segmentation still remains a challenge as large-scale 3D
+annotated data is not readily available. Contemporary approaches typically
+leverage 2D machine-generated segments, integrating them for 3D consistency.
+While the majority of these methods are based on NeRFs, they face a potential
+weakness: the instance/semantic embedding features are derived from independent
+MLPs, which prevents the segmentation network from learning the geometric
+details of the objects directly through radiance and density. In this paper, we
+propose ClusteringSDF, a novel approach to achieve both segmentation and
+reconstruction in 3D via the neural implicit surface representation,
+specifically the Signed Distance Function (SDF), where the segmentation
+rendering is directly integrated with the volume rendering of neural implicit
+surfaces. Although based on ObjectSDF++, ClusteringSDF no longer requires
+ground-truth segments for supervision while maintaining the capability of
+reconstructing individual object surfaces, relying purely on the noisy and
+inconsistent labels from pre-trained models. As the core of ClusteringSDF, we
+introduce a highly efficient clustering mechanism for lifting the 2D labels to
+3D, and the experimental results on the challenging scenes from the ScanNet and
+Replica datasets show that ClusteringSDF can achieve competitive performance
+compared against the state-of-the-art with significantly reduced training time.
+ 
+
+
+ comment: Project Page: https://sm0kywu.github.io/ClusteringSDF/ +
+
+
+
+
+ + ☆ Videoshop: Localized Semantic Video Editing with Noise-Extrapolated + Diffusion Inversion + + +
+ We introduce Videoshop, a training-free video editing algorithm for localized +semantic edits. Videoshop allows users to use any editing software, including +Photoshop and generative inpainting, to modify the first frame; it +automatically propagates those changes, with semantic, spatial, and temporally +consistent motion, to the remaining frames. Unlike existing methods that enable +edits only through imprecise textual instructions, Videoshop allows users to +add or remove objects, semantically change objects, insert stock photos into +videos, etc. with fine-grained control over locations and appearance. We +achieve this through image-based video editing by inverting latents with noise +extrapolation, from which we generate videos conditioned on the edited image. +Videoshop produces higher quality edits against 6 baselines on 2 editing +benchmarks using 10 evaluation metrics. + +
+
+
+
+
+ + ☆ Hierarchical Text-to-Vision Self Supervised Alignment for Improved + Histopathology Representation Learning + + +
+ Self-supervised representation learning has been highly promising for +histopathology image analysis with numerous approaches leveraging their +patient-slide-patch hierarchy to learn better representations. In this paper, +we explore how the combination of domain specific natural language information +with such hierarchical visual representations can benefit rich representation +learning for medical image tasks. Building on automated language description +generation for features visible in histopathology images, we present a novel +language-tied self-supervised learning framework, Hierarchical Language-tied +Self-Supervision (HLSS) for histopathology images. We explore contrastive +objectives and granular language description based text alignment at multiple +hierarchies to inject language modality information into the visual +representations. Our resulting model achieves state-of-the-art performance on +two medical imaging benchmarks, OpenSRH and TCGA datasets. Our framework also +provides better interpretability with our language aligned representation +space. Code is available at https://github.com/Hasindri/HLSS. + +
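+ At a single hierarchy level, the text-to-vision alignment reduces to a
+standard symmetric contrastive (InfoNCE) objective between paired embeddings;
+the sketch below shows only that generic building block, not the
+hierarchy-specific objectives of HLSS.
+```python
+import torch
+import torch.nn.functional as F
+
+def symmetric_infonce(img_emb, txt_emb, temperature=0.07):
+    img_emb = F.normalize(img_emb, dim=-1)
+    txt_emb = F.normalize(txt_emb, dim=-1)
+    logits = img_emb @ txt_emb.t() / temperature      # (B, B) similarity matrix
+    targets = torch.arange(len(img_emb), device=img_emb.device)
+    return 0.5 * (F.cross_entropy(logits, targets) +
+                  F.cross_entropy(logits.t(), targets))
+```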
+
+ comment: 13 pages and 5 figures +
+
+
+
+
+ + ☆ AdaIR: Adaptive All-in-One Image Restoration via Frequency Mining and + Modulation + + +
+ In the image acquisition process, various forms of degradation, including +noise, haze, and rain, are frequently introduced. These degradations typically +arise from the inherent limitations of cameras or unfavorable ambient +conditions. To recover clean images from degraded versions, numerous +specialized restoration methods have been developed, each targeting a specific +type of degradation. Recently, all-in-one algorithms have garnered significant +attention by addressing different types of degradations within a single model +without requiring prior information of the input degradation type. However, +these methods purely operate in the spatial domain and do not delve into the +distinct frequency variations inherent to different degradation types. To +address this gap, we propose an adaptive all-in-one image restoration network +based on frequency mining and modulation. Our approach is motivated by the +observation that different degradation types impact the image content on +different frequency subbands, thereby requiring different treatments for each +restoration task. Specifically, we first mine low- and high-frequency +information from the input features, guided by the adaptively decoupled spectra +of the degraded image. The extracted features are then modulated by a +bidirectional operator to facilitate interactions between different frequency +components. Finally, the modulated features are merged into the original input +for a progressively guided restoration. With this approach, the model achieves +adaptive reconstruction by accentuating the informative frequency subbands +according to different input degradations. Extensive experiments demonstrate +that the proposed method achieves state-of-the-art performance on different +image restoration tasks, including denoising, dehazing, deraining, motion +deblurring, and low-light image enhancement. Our code is available at +https://github.com/c-yn/AdaIR. + +
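+ The frequency-mining step can be pictured as splitting a feature map into low-
+and high-frequency parts in the Fourier domain. The snippet below uses a fixed
+square cutoff purely for illustration; AdaIR decouples the spectra adaptively
+and modulates the two branches, which this sketch does not reproduce.
+```python
+import torch
+
+def split_frequency_bands(feat, cutoff_ratio=0.25):
+    # feat: (B, C, H, W) real-valued feature map
+    fft = torch.fft.fftshift(torch.fft.fft2(feat, norm="ortho"), dim=(-2, -1))
+    _, _, h, w = feat.shape
+    cy, cx = h // 2, w // 2
+    ry, rx = int(h * cutoff_ratio), int(w * cutoff_ratio)
+    mask = torch.zeros_like(feat)
+    mask[..., cy - ry:cy + ry, cx - rx:cx + rx] = 1.0   # keep central (low) frequencies
+    low = torch.fft.ifft2(torch.fft.ifftshift(fft * mask, dim=(-2, -1)),
+                          norm="ortho").real
+    return low, feat - low                               # low- and high-frequency parts
+```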
+
+ comment: 28 pages,15 figures +
+
+
+
+
+ + ☆ DreamReward: Text-to-3D Generation with Human Preference + + +
+ 3D content creation from text prompts has shown remarkable success recently. +However, current text-to-3D methods often generate 3D results that do not align +well with human preferences. In this paper, we present a comprehensive +framework, coined DreamReward, to learn and improve text-to-3D models from +human preference feedback. To begin with, we collect 25k expert comparisons +based on a systematic annotation pipeline including rating and ranking. Then, +we build Reward3D -- the first general-purpose text-to-3D human preference +reward model to effectively encode human preferences. Building upon the 3D +reward model, we finally perform theoretical analysis and present the Reward3D +Feedback Learning (DreamFL), a direct tuning algorithm to optimize the +multi-view diffusion models with a redefined scorer. Grounded by theoretical +proof and extensive experiment comparisons, our DreamReward successfully +generates high-fidelity and 3D consistent results with significant boosts in +prompt alignment with human intention. Our results demonstrate the great +potential for learning from human feedback to improve text-to-3D models. + +
+
+ comment: Project page: https://jamesyjl.github.io/DreamReward +
+
+
+
+
+ + ☆ Explorative Inbetweening of Time and Space + + +
+ We introduce bounded generation as a generalized task to control video +generation to synthesize arbitrary camera and subject motion based only on a +given start and end frame. Our objective is to fully leverage the inherent +generalization capability of an image-to-video model without additional +training or fine-tuning of the original model. This is achieved through the +proposed new sampling strategy, which we call Time Reversal Fusion, that fuses +the temporally forward and backward denoising paths conditioned on the start +and end frame, respectively. The fused path results in a video that smoothly +connects the two frames, generating inbetweening of faithful subject motion, +novel views of static scenes, and seamless video looping when the two bounding +frames are identical. We curate a diverse evaluation dataset of image pairs and +compare against the closest existing methods. We find that Time Reversal Fusion +outperforms related work on all subtasks, exhibiting the ability to generate +complex motions and 3D-consistent views guided by bounded frames. See project +page at https://time-reversal.github.io. + +
+
+ comment: project page at https://time-reversal.github.io +
+
+
+
+
+ + ☆ T-Rex2: Towards Generic Object Detection via Text-Visual Prompt Synergy + + +
+ We present T-Rex2, a highly practical model for open-set object detection. +Previous open-set object detection methods relying on text prompts effectively +encapsulate the abstract concept of common objects, but struggle with rare or +complex object representation due to data scarcity and descriptive limitations. +Conversely, visual prompts excel in depicting novel objects through concrete +visual examples, but fall short in conveying the abstract concept of objects as +effectively as text prompts. Recognizing the complementary strengths and +weaknesses of both text and visual prompts, we introduce T-Rex2 that synergizes +both prompts within a single model through contrastive learning. T-Rex2 accepts +inputs in diverse formats, including text prompts, visual prompts, and the +combination of both, so that it can handle different scenarios by switching +between the two prompt modalities. Comprehensive experiments demonstrate that +T-Rex2 exhibits remarkable zero-shot object detection capabilities across a +wide spectrum of scenarios. We show that text prompts and visual prompts can +benefit from each other within the synergy, which is essential to cover massive +and complicated real-world scenarios and pave the way towards generic object +detection. Model API is now available at +\url{https://github.com/IDEA-Research/T-Rex}. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ ReNoise: Real Image Inversion Through Iterative Noising + + +
+ Recent advancements in text-guided diffusion models have unlocked powerful +image manipulation capabilities. However, applying these methods to real images +necessitates the inversion of the images into the domain of the pretrained +diffusion model. Achieving faithful inversion remains a challenge, particularly +for more recent models trained to generate images with a small number of +denoising steps. In this work, we introduce an inversion method with a high +quality-to-operation ratio, enhancing reconstruction accuracy without +increasing the number of operations. Building on reversing the diffusion +sampling process, our method employs an iterative renoising mechanism at each +inversion sampling step. This mechanism refines the approximation of a +predicted point along the forward diffusion trajectory, by iteratively applying +the pretrained diffusion model, and averaging these predictions. We evaluate +the performance of our ReNoise technique using various sampling algorithms and +models, including recent accelerated diffusion models. Through comprehensive +evaluations and comparisons, we show its effectiveness in terms of both +accuracy and speed. Furthermore, we confirm that our method preserves +editability by demonstrating text-driven image editing on real images. + +
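+ Read literally, the renoising mechanism amounts to re-evaluating the noise
+prediction at the candidate inverted latent and averaging the estimates. The
+sketch below assumes a generic epsilon-prediction model `eps_model(x, t)`, a
+tensor of cumulative alphas, and a DDIM-style update; it follows the abstract's
+description, not the released ReNoise code.
+```python
+import torch
+
+@torch.no_grad()
+def renoised_inversion_step(eps_model, z_t, t, t_next, alphas_cumprod, n_renoise=4):
+    a_t, a_next = alphas_cumprod[t], alphas_cumprod[t_next]
+
+    def invert_with(eps):
+        x0 = (z_t - (1 - a_t).sqrt() * eps) / a_t.sqrt()
+        return a_next.sqrt() * x0 + (1 - a_next).sqrt() * eps
+
+    z_next = invert_with(eps_model(z_t, t))    # first guess: noise predicted at t
+    eps_estimates = []
+    for _ in range(n_renoise):                 # renoise: re-predict at the candidate point
+        eps_estimates.append(eps_model(z_next, t_next))
+        z_next = invert_with(torch.stack(eps_estimates).mean(dim=0))
+    return z_next
+```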
+
+ comment: project page at: https://garibida.github.io/ReNoise-Inversion/ +
+
+
+
+
+ + ☆ MyVLM: Personalizing VLMs for User-Specific Queries + + +
+ Recent large-scale vision-language models (VLMs) have demonstrated remarkable +capabilities in understanding and generating textual descriptions for visual +content. However, these models lack an understanding of user-specific concepts. +In this work, we take a first step toward the personalization of VLMs, enabling +them to learn and reason over user-provided concepts. For example, we explore +whether these models can learn to recognize you in an image and communicate +what you are doing, tailoring the model to reflect your personal experiences +and relationships. To effectively recognize a variety of user-specific +concepts, we augment the VLM with external concept heads that function as +toggles for the model, enabling the VLM to identify the presence of specific +target concepts in a given image. Having recognized the concept, we learn a new +concept embedding in the intermediate feature space of the VLM. This embedding +is tasked with guiding the language model to naturally integrate the target +concept in its generated response. We apply our technique to BLIP-2 and LLaVA +for personalized image captioning and further show its applicability for +personalized visual question-answering. Our experiments demonstrate our ability +to generalize to unseen images of learned concepts while preserving the model +behavior on unrelated inputs. + +
+
+ comment: Project page: https://snap-research.github.io/MyVLM/ +
+
+
+
+
+ + ☆ PSALM: Pixelwise SegmentAtion with Large Multi-Modal Model + + +
+ PSALM is a powerful extension of the Large Multi-modal Model (LMM) to address +the segmentation task challenges. To overcome the limitation of the LMM being +limited to textual output, PSALM incorporates a mask decoder and a +well-designed input schema to handle a variety of segmentation tasks. This +schema includes images, task instructions, conditional prompts, and mask +tokens, which enable the model to generate and classify segmentation masks +effectively. The flexible design of PSALM supports joint training across +multiple datasets and tasks, leading to improved performance and task +generalization. PSALM achieves superior results on several benchmarks, such as +RefCOCO/RefCOCO+/RefCOCOg, COCO Panoptic Segmentation, and COCO-Interactive, +and further exhibits zero-shot capabilities on unseen tasks, such as +open-vocabulary segmentation, generalized referring expression segmentation and +video object segmentation, making a significant step towards a GPT moment in +computer vision. Through extensive experiments, PSALM demonstrates its +potential to transform the domain of image segmentation, leveraging the robust +visual understanding capabilities of LMMs as seen in natural language +processing. Code and models are available at https://github.com/zamling/PSALM. + +
+
+
+
+
+ + ☆ VXP: Voxel-Cross-Pixel Large-scale Image-LiDAR Place Recognition + + +
+ Recent works on global place recognition treat the task as a retrieval
+problem, where an off-the-shelf global descriptor is commonly designed for
+image-based and LiDAR-based modalities. However, it is non-trivial to perform
+accurate image-LiDAR global place recognition since extracting consistent and
+robust global descriptors from different domains (2D images and 3D point
+clouds) is challenging. To address this issue, we propose a novel
+Voxel-Cross-Pixel (VXP) approach, which establishes voxel and pixel
+correspondences in a self-supervised manner and brings them into a shared
+feature space. Specifically, VXP is trained in a two-stage manner that first
+explicitly exploits local feature correspondences and then enforces similarity
+of global descriptors. Extensive experiments on three benchmarks (Oxford
+RobotCar, ViViD++ and KITTI) demonstrate that our method surpasses
+state-of-the-art cross-modal retrieval approaches by a large margin.
+ 
+
+
+ comment: Project page https://yunjinli.github.io/projects-vxp/ +
+
+
+
+
+ + ☆ Implicit Style-Content Separation using B-LoRA + + +
+ Image stylization involves manipulating the visual appearance and texture +(style) of an image while preserving its underlying objects, structures, and +concepts (content). The separation of style and content is essential for +manipulating the image's style independently from its content, ensuring a +harmonious and visually pleasing result. Achieving this separation requires a +deep understanding of both the visual and semantic characteristics of images, +often necessitating the training of specialized models or employing heavy +optimization. In this paper, we introduce B-LoRA, a method that leverages LoRA +(Low-Rank Adaptation) to implicitly separate the style and content components +of a single image, facilitating various image stylization tasks. By analyzing +the architecture of SDXL combined with LoRA, we find that jointly learning the +LoRA weights of two specific blocks (referred to as B-LoRAs) achieves +style-content separation that cannot be achieved by training each B-LoRA +independently. Consolidating the training into only two blocks and separating +style and content allows for significantly improving style manipulation and +overcoming overfitting issues often associated with model fine-tuning. Once +trained, the two B-LoRAs can be used as independent components to allow various +image stylization tasks, including image style transfer, text-based image +stylization, consistent style generation, and style-content mixing. + +
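+ The building block behind B-LoRA is the ordinary low-rank adapter attached to
+a frozen linear layer; which two SDXL blocks receive such adapters is the
+paper's contribution and is not encoded below. This is only the generic LoRA
+layer, shown for context.
+```python
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    def __init__(self, base: nn.Linear, rank=4, alpha=4.0):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad_(False)            # keep pretrained weights frozen
+        self.down = nn.Linear(base.in_features, rank, bias=False)
+        self.up = nn.Linear(rank, base.out_features, bias=False)
+        nn.init.zeros_(self.up.weight)         # adapter starts as a no-op
+        self.scale = alpha / rank
+
+    def forward(self, x):
+        return self.base(x) + self.scale * self.up(self.down(x))
+```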
+
+
+
+
+ + ☆ Visibility-Aware Keypoint Localization for 6DoF Object Pose Estimation + + +
+ Localizing predefined 3D keypoints in a 2D image is an effective way to +establish 3D-2D correspondences for 6DoF object pose estimation. However, +unreliable localization results of invisible keypoints degrade the quality of +correspondences. In this paper, we address this issue by localizing the +important keypoints in terms of visibility. Since keypoint visibility +information is currently missing in dataset collection process, we propose an +efficient way to generate binary visibility labels from available object-level +annotations, for keypoints of both asymmetric objects and symmetric objects. We +further derive real-valued visibility-aware importance from binary labels based +on PageRank algorithm. Taking advantage of the flexibility of our +visibility-aware importance, we construct VAPO (Visibility-Aware POse +estimator) by integrating the visibility-aware importance with a +state-of-the-art pose estimation algorithm, along with additional positional +encoding. Extensive experiments are conducted on popular pose estimation +benchmarks including Linemod, Linemod-Occlusion, and YCB-V. The results show +that, VAPO improves both the keypoint correspondences and final estimated +poses, and clearly achieves state-of-the-art performances. + +
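+ One way to picture the PageRank-derived importance is sketched below: binary
+per-image visibility labels induce a co-visibility graph over keypoints, and
+power iteration yields a real-valued score per keypoint. The graph construction
+here is an assumption made for illustration and may differ from the one used by
+VAPO.
+```python
+import numpy as np
+
+def visibility_pagerank(visibility, damping=0.85, n_iter=100):
+    # visibility: (n_images, n_keypoints) binary matrix
+    v = np.asarray(visibility, dtype=float)
+    co_vis = v.T @ v                                   # keypoints frequently visible together
+    np.fill_diagonal(co_vis, 0.0)
+    transition = co_vis / np.maximum(co_vis.sum(axis=1, keepdims=True), 1e-8)
+    n = co_vis.shape[0]
+    rank = np.full(n, 1.0 / n)
+    for _ in range(n_iter):
+        rank = (1 - damping) / n + damping * transition.T @ rank
+    return rank / rank.sum()                           # visibility-aware importance per keypoint
+```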
+
+
+
+
+ + ☆ Gaussian Frosting: Editable Complex Radiance Fields with Real-Time + Rendering + + +
+ We propose Gaussian Frosting, a novel mesh-based representation for +high-quality rendering and editing of complex 3D effects in real-time. Our +approach builds on the recent 3D Gaussian Splatting framework, which optimizes +a set of 3D Gaussians to approximate a radiance field from images. We propose +first extracting a base mesh from Gaussians during optimization, then building +and refining an adaptive layer of Gaussians with a variable thickness around +the mesh to better capture the fine details and volumetric effects near the +surface, such as hair or grass. We call this layer Gaussian Frosting, as it +resembles a coating of frosting on a cake. The fuzzier the material, the +thicker the frosting. We also introduce a parameterization of the Gaussians to +enforce them to stay inside the frosting layer and automatically adjust their +parameters when deforming, rescaling, editing or animating the mesh. Our +representation allows for efficient rendering using Gaussian splatting, as well +as editing and animation by modifying the base mesh. We demonstrate the +effectiveness of our method on various synthetic and real scenes, and show that +it outperforms existing surface-based approaches. We will release our code and +a web-based viewer as additional contributions. Our project page is the +following: https://anttwo.github.io/frosting/ + +
+
+ comment: Project Webpage: https://anttwo.github.io/frosting/ +
+
+
+
+
+ + ☆ Token Transformation Matters: Towards Faithful Post-hoc Explanation for + Vision Transformer CVPR 2024 + + +
+ While Transformers have rapidly gained popularity in various computer vision +applications, post-hoc explanations of their internal mechanisms remain largely +unexplored. Vision Transformers extract visual information by representing +image regions as transformed tokens and integrating them via attention weights. +However, existing post-hoc explanation methods merely consider these attention +weights, neglecting crucial information from the transformed tokens, which +fails to accurately illustrate the rationales behind the models' predictions. +To incorporate the influence of token transformation into interpretation, we +propose TokenTM, a novel post-hoc explanation method that utilizes our +introduced measurement of token transformation effects. Specifically, we +quantify token transformation effects by measuring changes in token lengths and +correlations in their directions pre- and post-transformation. Moreover, we +develop initialization and aggregation rules to integrate both attention +weights and token transformation effects across all layers, capturing holistic +token contributions throughout the model. Experimental results on segmentation +and perturbation tests demonstrate the superiority of our proposed TokenTM +compared to state-of-the-art Vision Transformer explanation methods. + +
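+ The measurement itself is simple to state: compare each token's length and
+direction before and after a Transformer block. The snippet below computes one
+plausible per-token effect; how TokenTM combines these with attention weights
+across layers follows its initialization and aggregation rules, which are not
+reproduced here.
+```python
+import torch
+import torch.nn.functional as F
+
+def token_transformation_effect(tokens_in, tokens_out):
+    # tokens_in, tokens_out: (N, D) token embeddings before/after a block
+    length_change = tokens_out.norm(dim=-1) / tokens_in.norm(dim=-1).clamp_min(1e-8)
+    direction_corr = F.cosine_similarity(tokens_in, tokens_out, dim=-1)
+    return length_change * direction_corr   # one possible combination of the two factors
+```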
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ DINO-Tracker: Taming DINO for Self-Supervised Point Tracking in a Single + Video + + +
+ We present DINO-Tracker -- a new framework for long-term dense tracking in +video. The pillar of our approach is combining test-time training on a single +video, with the powerful localized semantic features learned by a pre-trained +DINO-ViT model. Specifically, our framework simultaneously adopts DINO's +features to fit to the motion observations of the test video, while training a +tracker that directly leverages the refined features. The entire framework is +trained end-to-end using a combination of self-supervised losses, and +regularization that allows us to retain and benefit from DINO's semantic prior. +Extensive evaluation demonstrates that our method achieves state-of-the-art +results on known benchmarks. DINO-tracker significantly outperforms +self-supervised methods and is competitive with state-of-the-art supervised +trackers, while outperforming them in challenging cases of tracking under +long-term occlusions. + +
+
+
+
+
+ + ☆ Estimating Physical Information Consistency of Channel Data Augmentation + for Remote Sensing Images + + +
+ The application of data augmentation for deep learning (DL) methods plays an +important role in achieving state-of-the-art results in supervised, +semi-supervised, and self-supervised image classification. In particular, +channel transformations (e.g., solarize, grayscale, brightness adjustments) are +integrated into data augmentation pipelines for remote sensing (RS) image +classification tasks. However, contradicting beliefs exist about their proper +applications to RS images. A common point of critique is that the application +of channel augmentation techniques may lead to physically inconsistent spectral +data (i.e., pixel signatures). To shed light on the open debate, we propose an +approach to estimate whether a channel augmentation technique affects the +physical information of RS images. To this end, the proposed approach estimates +a score that measures the alignment of a pixel signature within a time series +that can be naturally subject to deviations caused by factors such as +acquisition conditions or phenological states of vegetation. We compare the +scores associated with original and augmented pixel signatures to evaluate the +physical consistency. Experimental results on a multi-label image +classification task show that channel augmentations yielding a score that +exceeds the expected deviation of original pixel signatures can not improve the +performance of a baseline model trained without augmentation. + +
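+ A hedged sketch of such a score: measure how far a (possibly augmented) pixel
+signature lies from the natural temporal variability of the same pixel, in
+units of the per-band standard deviation. The exact formulation in the paper
+may differ; this only illustrates comparing original versus augmented
+signatures.
+```python
+import numpy as np
+
+def signature_deviation_score(time_series, candidate):
+    # time_series: (n_dates, n_bands) pixel signatures over time
+    ts = np.asarray(time_series, dtype=float)
+    mean, std = ts.mean(axis=0), ts.std(axis=0) + 1e-8
+    z = np.abs((np.asarray(candidate, dtype=float) - mean) / std)
+    return z.mean()     # larger score -> candidate exceeds natural deviation
+```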
+
+ comment: Accepted at the IEEE International Geoscience and Remote Sensing + Symposium +
+
+
+
+
+ + ☆ Object-Centric Domain Randomization for 3D Shape Reconstruction in the + Wild + + +
+ One of the biggest challenges in single-view 3D shape reconstruction in the +wild is the scarcity of <3D shape, 2D image>-paired data from real-world +environments. Inspired by remarkable achievements via domain randomization, we +propose ObjectDR which synthesizes such paired data via a random simulation of +visual variations in object appearances and backgrounds. Our data synthesis +framework exploits a conditional generative model (e.g., ControlNet) to +generate images conforming to spatial conditions such as 2.5D sketches, which +are obtainable through a rendering process of 3D shapes from object collections +(e.g., Objaverse-XL). To simulate diverse variations while preserving object +silhouettes embedded in spatial conditions, we also introduce a disentangled +framework which leverages an initial object guidance. After synthesizing a wide +range of data, we pre-train a model on them so that it learns to capture a +domain-invariant geometry prior which is consistent across various domains. We +validate its effectiveness by substantially improving 3D shape reconstruction +models on a real-world benchmark. In a scale-up evaluation, our pre-training +achieves 23.6% superior results compared with the pre-training on high-quality +computer graphics renderings. + +
+
+ comment: Project Page: https://ObjectDR.github.io +
+
+
+
+
+ + ☆ Transfer Learning for Cross-dataset Isolated Sign Language Recognition + in Under-Resourced Datasets + + +
+ Sign language recognition (SLR) has recently achieved a breakthrough in +performance thanks to deep neural networks trained on large annotated sign +datasets. Of the many different sign languages, these annotated datasets are +only available for a select few. Since acquiring gloss-level labels on sign +language videos is difficult, learning by transferring knowledge from existing +annotated sources is useful for recognition in under-resourced sign languages. +This study provides a publicly available cross-dataset transfer learning +benchmark from two existing public Turkish SLR datasets. We use a temporal +graph convolution-based sign language recognition approach to evaluate five +supervised transfer learning approaches and experiment with closed-set and +partial-set cross-dataset transfer learning. Experiments demonstrate that +improvement over finetuning based transfer learning is possible with +specialized supervised transfer learning methods. + +
+
+ comment: Accepted to The 18th IEEE International Conference on Automatic Face + and Gesture Recognition 2024, Code available in + https://github.com/alpk/tid-supervised-transfer-learning-dataset +
+
+
+
+
+ + ☆ HAC: Hash-grid Assisted Context for 3D Gaussian Splatting Compression + + +
+ 3D Gaussian Splatting (3DGS) has emerged as a promising framework for novel +view synthesis, boasting rapid rendering speed with high fidelity. However, the +substantial Gaussians and their associated attributes necessitate effective +compression techniques. Nevertheless, the sparse and unorganized nature of the +point cloud of Gaussians (or anchors in our paper) presents challenges for +compression. To address this, we make use of the relations between the +unorganized anchors and the structured hash grid, leveraging their mutual +information for context modeling, and propose a Hash-grid Assisted Context +(HAC) framework for highly compact 3DGS representation. Our approach introduces +a binary hash grid to establish continuous spatial consistencies, allowing us +to unveil the inherent spatial relations of anchors through a carefully +designed context model. To facilitate entropy coding, we utilize Gaussian +distributions to accurately estimate the probability of each quantized +attribute, where an adaptive quantization module is proposed to enable +high-precision quantization of these attributes for improved fidelity +restoration. Additionally, we incorporate an adaptive masking strategy to +eliminate invalid Gaussians and anchors. Importantly, our work is the pioneer +to explore context-based compression for 3DGS representation, resulting in a +remarkable size reduction of over $75\times$ compared to vanilla 3DGS, while +simultaneously improving fidelity, and achieving over $11\times$ size reduction +over SOTA 3DGS compression approach Scaffold-GS. Our code is available here: +https://github.com/YihangChen-ee/HAC + +
+
+ comment: Project Page: https://yihangchen-ee.github.io/project_hac/ Code: + https://github.com/YihangChen-ee/HAC +
+
+
+
+
+ + ☆ Click to Grasp: Zero-Shot Precise Manipulation via Visual Diffusion + Descriptors + + +
+ Precise manipulation that is generalizable across scenes and objects remains +a persistent challenge in robotics. Current approaches for this task heavily +depend on having a significant number of training instances to handle objects +with pronounced visual and/or geometric part ambiguities. Our work explores the +grounding of fine-grained part descriptors for precise manipulation in a +zero-shot setting by utilizing web-trained text-to-image diffusion-based +generative models. We tackle the problem by framing it as a dense semantic part +correspondence task. Our model returns a gripper pose for manipulating a +specific part, using as reference a user-defined click from a source image of a +visually different instance of the same object. We require no manual grasping +demonstrations as we leverage the intrinsic object geometry and features. +Practical experiments in a real-world tabletop scenario validate the efficacy +of our approach, demonstrating its potential for advancing semantic-aware +robotics manipulation. Web page: https://tsagkas.github.io/click2grasp + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ Invisible Needle Detection in Ultrasound: Leveraging Mechanism-Induced + Vibration + + +
+ In clinical applications that involve ultrasound-guided intervention, the +visibility of the needle can be severely impeded due to steep insertion and +strong distractors such as speckle noise and anatomical occlusion. To address +this challenge, we propose VibNet, a learning-based framework tailored to +enhance the robustness and accuracy of needle detection in ultrasound images, +even when the target becomes invisible to the naked eye. Inspired by Eulerian +Video Magnification techniques, we utilize an external step motor to induce +low-amplitude periodic motion on the needle. These subtle vibrations offer the +potential to generate robust frequency features for detecting the motion +patterns around the needle. To robustly and precisely detect the needle +leveraging these vibrations, VibNet integrates learning-based +Short-Time-Fourier-Transform and Hough-Transform modules to achieve successive +sub-goals, including motion feature extraction in the spatiotemporal space, +frequency feature aggregation, and needle detection in the Hough space. Based +on the results obtained on distinct ex vivo porcine and bovine tissue samples, +the proposed algorithm exhibits superior detection performance with efficient +computation and generalization capability. + +
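+ The core intuition is that a vibrating needle imprints a periodic signal on
+the pixels it crosses. A fixed (non-learned) version of the frequency feature
+can be computed with an STFT over each pixel's intensity profile, as sketched
+below; VibNet learns these modules end-to-end, so this is only an illustration.
+```python
+import torch
+
+def pixelwise_stft_features(frames, n_fft=16, hop=4):
+    # frames: (T, H, W) grayscale ultrasound clip with T >= n_fft
+    t, h, w = frames.shape
+    signals = frames.reshape(t, h * w).t()             # one temporal signal per pixel
+    spec = torch.stft(signals, n_fft=n_fft, hop_length=hop,
+                      window=torch.hann_window(n_fft), return_complex=True)
+    mag = spec.abs()                                    # (H*W, n_fft//2+1, n_windows)
+    return mag.mean(dim=-1).reshape(h, w, -1)           # per-pixel frequency profile
+```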
+
+
+
+
+ + ☆ Cobra: Extending Mamba to Multi-Modal Large Language Model for Efficient + Inference + + +
+ In recent years, the application of multimodal large language models (MLLM) +in various fields has achieved remarkable success. However, as the foundation +model for many downstream tasks, current MLLMs are composed of the well-known +Transformer network, which has a less efficient quadratic computation +complexity. To improve the efficiency of such basic models, we propose Cobra, a +linear computational complexity MLLM. Specifically, Cobra integrates the +efficient Mamba language model into the visual modality. Moreover, we explore +and study various modal fusion schemes to create an effective multi-modal +Mamba. Extensive experiments demonstrate that (1) Cobra achieves extremely +competitive performance with current computationally efficient state-of-the-art +methods, \textit{e.g.}, LLaVA-Phi, TinyLLaVA, and MobileVLM v2, and has faster +speed due to Cobra's linear sequential modeling. (2) Interestingly, the results +of closed-set challenging prediction benchmarks show that Cobra performs well +in overcoming visual illusions and spatial relationship judgments. (3) Notably, +Cobra even achieves comparable performance to LLaVA with about 43% of the +number of parameters. We will make all codes of Cobra open-source and hope that +the proposed method can facilitate future research on complexity problems in +MLLM. Our project page is available at: https://sites.google.com/view/cobravlm. + +
+
+
+
+
+ + ☆ View-decoupled Transformer for Person Re-identification under + Aerial-ground Camera Network CVPR 2024 + + +
+ Existing person re-identification methods have achieved remarkable advances +in appearance-based identity association across homogeneous cameras, such as +ground-ground matching. However, as a more practical scenario, aerial-ground +person re-identification (AGPReID) among heterogeneous cameras has received +minimal attention. To alleviate the disruption of discriminative identity +representation by dramatic view discrepancy as the most significant challenge +in AGPReID, the view-decoupled transformer (VDT) is proposed as a simple yet +effective framework. Two major components are designed in VDT to decouple +view-related and view-unrelated features, namely hierarchical subtractive +separation and orthogonal loss, where the former separates these two features +inside the VDT, and the latter constrains these two to be independent. In +addition, we contribute a large-scale AGPReID dataset called CARGO, consisting +of five/eight aerial/ground cameras, 5,000 identities, and 108,563 images. +Experiments on two datasets show that VDT is a feasible and effective solution +for AGPReID, surpassing the previous method on mAP/Rank1 by up to 5.0%/2.7% on +CARGO and 3.7%/5.2% on AG-ReID, keeping the same magnitude of computational +complexity. Our project is available at https://github.com/LinlyAC/VDT-AGPReID + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Denoising Diffusion Models for 3D Healthy Brain Tissue Inpainting + + +
+ Monitoring diseases that affect the brain's structural integrity requires +automated analysis of magnetic resonance (MR) images, e.g., for the evaluation +of volumetric changes. However, many of the evaluation tools are optimized for +analyzing healthy tissue. To enable the evaluation of scans containing +pathological tissue, it is therefore required to restore healthy tissue in the +pathological areas. In this work, we explore and extend denoising diffusion +models for consistent inpainting of healthy 3D brain tissue. We modify +state-of-the-art 2D, pseudo-3D, and 3D methods working in the image space, as +well as 3D latent and 3D wavelet diffusion models, and train them to synthesize +healthy brain tissue. Our evaluation shows that the pseudo-3D model performs +best regarding the structural-similarity index, peak signal-to-noise ratio, and +mean squared error. To emphasize the clinical relevance, we fine-tune this +model on data containing synthetic MS lesions and evaluate it on a downstream +brain tissue segmentation task, whereby it outperforms the established FMRIB +Software Library (FSL) lesion-filling method. + +
+
+
+
+
+ + ☆ MULDE: Multiscale Log-Density Estimation via Denoising Score Matching + for Video Anomaly Detection + + +
+ We propose a novel approach to video anomaly detection: we treat feature +vectors extracted from videos as realizations of a random variable with a fixed +distribution and model this distribution with a neural network. This lets us +estimate the likelihood of test videos and detect video anomalies by +thresholding the likelihood estimates. We train our video anomaly detector +using a modification of denoising score matching, a method that injects +training data with noise to facilitate modeling its distribution. To eliminate +hyperparameter selection, we model the distribution of noisy video features +across a range of noise levels and introduce a regularizer that tends to align +the models for different levels of noise. At test time, we combine anomaly +indications at multiple noise scales with a Gaussian mixture model. Running our +video anomaly detector induces minimal delays as inference requires merely +extracting the features and forward-propagating them through a shallow neural +network and a Gaussian mixture model. Our experiments on five popular video +anomaly detection benchmarks demonstrate state-of-the-art performance, both in +the object-centric and in the frame-centric setup. + +
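+ A minimal sketch of the central training ingredient, denoising score matching over a range
+of noise levels applied to pre-extracted feature vectors, is given below. The MLP score
+network, the feature dimension, and the noise-level range are assumptions for illustration
+and not the paper's architecture or hyperparameters.
+<pre>
+import torch
+import torch.nn as nn
+
+class ScoreNet(nn.Module):
+    """Toy score network s(x, sigma) over feature vectors; sigma is fed as an extra input."""
+    def __init__(self, dim, hidden=128):
+        super().__init__()
+        self.net = nn.Sequential(nn.Linear(dim + 1, hidden), nn.SiLU(),
+                                 nn.Linear(hidden, hidden), nn.SiLU(),
+                                 nn.Linear(hidden, dim))
+
+    def forward(self, x, sigma):
+        return self.net(torch.cat([x, sigma], dim=-1))
+
+def dsm_loss(model, x, sigmas):
+    """Denoising score matching across randomly sampled noise levels."""
+    sigma = sigmas[torch.randint(len(sigmas), (x.shape[0], 1))]
+    noise = torch.randn_like(x) * sigma
+    target = -noise / sigma**2                 # score of the Gaussian perturbation kernel
+    pred = model(x + noise, sigma)
+    return ((sigma**2) * (pred - target) ** 2).mean()   # sigma^2 weighting balances scales
+
+dim = 512                                      # e.g. a pre-extracted video feature vector
+model = ScoreNet(dim)
+sigmas = torch.logspace(-2, 0, steps=16)       # assumed range of noise levels
+x = torch.randn(32, dim)
+print(dsm_loss(model, x, sigmas).item())
+</pre>
+ At test time the trained network yields noise-level-dependent (log-)density surrogates whose
+per-scale anomaly indications can then be combined, e.g. with a Gaussian mixture model as the
+abstract describes.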
+
+
+
+
+ + ☆ Learning to Project for Cross-Task Knowledge Distillation + + +
+ Traditional knowledge distillation (KD) relies on a proficient teacher +trained on the target task, which is not always available. In this setting, +cross-task distillation can be used, enabling the use of any teacher model +trained on a different task. However, many KD methods prove ineffective when +applied to this cross-task setting. To address this limitation, we propose a +simple modification: the use of an inverted projection. We show that this +drop-in replacement for a standard projector is effective by learning to +disregard any task-specific features which might degrade the student's +performance. We find that this simple modification is sufficient for extending +many KD methods to the cross-task setting, where the teacher and student tasks +can be very different. In doing so, we obtain up to a 1.9% improvement in the +cross-task setting compared to the traditional projection, at no additional +cost. Our method can obtain significant performance improvements (up to 7%) +when using even a randomly-initialised teacher on various tasks such as depth +estimation, image translation, and semantic segmentation, despite the lack of +any learned knowledge to transfer. To provide conceptual and analytical +insights into this result, we show that using an inverted projection allows the +distillation loss to be decomposed into a knowledge transfer and a spectral +regularisation component. Through this analysis we are additionally able to +propose a novel regularisation loss that allows teacher-free distillation, +enabling performance improvements of up to 8.57% on ImageNet with no additional +training costs. + +
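+ One plausible reading of the "inverted projection" is to place the learned projector on the
+teacher side, mapping teacher features into the student's space before applying the
+distillation loss, so that task-specific teacher directions can be discarded by the
+projection instead of being forced onto the student. The sketch below illustrates only that
+reading with arbitrary feature dimensions and an MSE feature loss; it is not the authors'
+implementation.
+<pre>
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+d_student, d_teacher = 256, 1024
+
+# Standard KD projector: student features -> teacher space.
+proj_std = nn.Linear(d_student, d_teacher)
+# "Inverted" projector (assumed reading): teacher features -> student space.
+proj_inv = nn.Linear(d_teacher, d_student)
+
+f_s = torch.randn(8, d_student)              # student features
+f_t = torch.randn(8, d_teacher).detach()     # frozen, possibly cross-task, teacher features
+
+loss_standard = F.mse_loss(proj_std(f_s), f_t)
+loss_inverted = F.mse_loss(f_s, proj_inv(f_t))
+print(loss_standard.item(), loss_inverted.item())
+</pre>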
+
+
+
+
+ + ☆ Adversary-Robust Graph-Based Learning of WSIs + + +
+ Enhancing the robustness of deep learning models against adversarial attacks +is crucial, especially in critical domains like healthcare where significant +financial interests heighten the risk of such attacks. Whole slide images +(WSIs) are high-resolution, digitized versions of tissue samples mounted on +glass slides, scanned using sophisticated imaging equipment. The digital +analysis of WSIs presents unique challenges due to their gigapixel size and +multi-resolution storage format. In this work, we aim to improve the +robustness of cancer Gleason grading classification systems against adversarial +attacks, addressing challenges at both the image and graph levels. We develop +a novel graph-based model that utilizes a GNN to extract features from the +graph representation of WSIs. A +denoising module, along with a pooling layer, is incorporated to manage the +impact of adversarial attacks on the WSIs. The process concludes with a +transformer module that classifies various grades of prostate cancer based on +the processed data. To assess the effectiveness of the proposed method, we +conducted a comparative analysis using two scenarios. Initially, we trained and +tested the model without the denoiser using WSIs that had not been exposed to +any attack. We then introduced a range of attacks at either the image or graph +level and processed them through the proposed network. The performance of the +model was evaluated in terms of accuracy and kappa scores. The results from +this comparison showed a significant improvement in cancer diagnosis accuracy, +highlighting the robustness and efficiency of the proposed method in handling +adversarial challenges in the context of medical imaging. +
+
+
+
+
+ + ☆ DesignEdit: Multi-Layered Latent Decomposition and Fusion for Unified & + Accurate Image Editing + + +
+ Recently, how to achieve precise image editing has attracted increasing +attention, especially given the remarkable success of text-to-image generation +models. To unify various spatial-aware image editing abilities into one +framework, we adopt the concept of layers from the design domain to manipulate +objects flexibly with various operations. The key insight is to transform the +spatial-aware image editing task into a combination of two sub-tasks: +multi-layered latent decomposition and multi-layered latent fusion. First, we +segment the latent representations of the source images into multiple layers, +which include several object layers and one incomplete background layer that +necessitates reliable inpainting. To avoid extra tuning, we further explore the +inherent inpainting ability of the self-attention mechanism. We introduce a +key-masking self-attention scheme that can propagate the surrounding context +information into the masked region while mitigating its impact on the regions +outside the mask. Second, we propose an instruction-guided latent fusion that +pastes the multi-layered latent representations onto a canvas latent. We also +introduce an artifact suppression scheme in the latent space to enhance the +inpainting quality. Due to the inherent modular advantages of such +multi-layered representations, we can achieve accurate image editing, and we +demonstrate that our approach consistently surpasses the latest spatial editing +methods, including Self-Guidance and DiffEditor. Last, we show that our +approach is a unified framework that supports more than six different accurate +image editing tasks. +
+
+ comment: technical report, 15 pages, webpage: https://design-edit.github.io/ +
+
+
+
+
+ + ☆ HyperGALE: ASD Classification via Hypergraph Gated Attention with + Learnable Hyperedges IJCNN 2024 + + +
+ Autism Spectrum Disorder (ASD) is a neurodevelopmental condition +characterized by varied social cognitive challenges and repetitive behavioral +patterns. Identifying reliable brain imaging-based biomarkers for ASD has been +a persistent challenge due to the spectrum's diverse symptomatology. Existing +baselines in the field have made significant strides in this direction, yet +there remains room for improvement in both performance and interpretability. We +propose \emph{HyperGALE}, which builds upon the hypergraph by incorporating +learned hyperedges and gated attention mechanisms. This approach has led to +substantial improvements in the model's ability to interpret complex brain +graph data, offering deeper insights into ASD biomarker characterization. +Evaluated on the extensive ABIDE II dataset, \emph{HyperGALE} not only improves +interpretability but also demonstrates statistically significant enhancements +in key performance metrics compared to both previous baselines and the +foundational hypergraph model. The advancement \emph{HyperGALE} brings to ASD +research highlights the potential of sophisticated graph-based techniques in +neurodevelopmental studies. The source code and implementation instructions are +available at GitHub:https://github.com/mehular0ra/HyperGALE. + +
+
+ comment: Accepted to IJCNN 2024 +
+
+
+
+
+ + ☆ Detoxifying Large Language Models via Knowledge Editing + + +
+ This paper investigates using knowledge editing techniques to detoxify Large +Language Models (LLMs). We construct a benchmark, SafeEdit, which covers nine +unsafe categories with various powerful attack prompts and provides comprehensive +metrics for systematic evaluation. We conduct experiments to compare knowledge +editing approaches with previous baselines, indicating that knowledge editing +has the potential to efficiently detoxify LLMs with limited impact on general +performance. Then, we propose a simple yet effective baseline, dubbed +Detoxifying with Intraoperative Neural Monitoring (DINM), to diminish the +toxicity of LLMs within a few tuning steps via only one instance. We further +provide an in-depth analysis of the internal mechanisms of various detoxifying +approaches, demonstrating that previous methods like SFT and DPO may merely +suppress the activations of toxic parameters, while DINM mitigates the toxicity +of the toxic parameters to a certain extent, making permanent adjustments. We +hope that these insights could shed light on future work on developing +detoxifying approaches and the underlying knowledge mechanisms of LLMs. Code +and benchmark are available at https://github.com/zjunlp/EasyEdit. +
+
+ comment: Ongoing work. Project website: + https://zjunlp.github.io/project/SafeEdit Benchmark: + https://huggingface.co/datasets/zjunlp/SafeEdit Code: + https://github.com/zjunlp/EasyEdit +
+
+
+
+
+ + ☆ AnyV2V: A Plug-and-Play Framework For Any Video-to-Video Editing Tasks + + +
+ Video-to-video editing involves editing a source video along with additional +control (such as text prompts, subjects, or styles) to generate a new video +that aligns with the source video and the provided control. Traditional methods +have been constrained to certain editing types, limiting their ability to meet +the wide range of user demands. In this paper, we introduce AnyV2V, a novel +training-free framework designed to simplify video editing into two primary +steps: (1) employing an off-the-shelf image editing model (e.g. +InstructPix2Pix, InstantID, etc) to modify the first frame, (2) utilizing an +existing image-to-video generation model (e.g. I2VGen-XL) for DDIM inversion +and feature injection. In the first stage, AnyV2V can plug in any existing +image editing tools to support an extensive array of video editing tasks. +Beyond the traditional prompt-based editing methods, AnyV2V also can support +novel video editing tasks, including reference-based style transfer, +subject-driven editing, and identity manipulation, which were unattainable by +previous methods. In the second stage, AnyV2V can plug in any existing +image-to-video models to perform DDIM inversion and intermediate feature +injection to maintain the appearance and motion consistency with the source +video. On the prompt-based editing, we show that AnyV2V can outperform the +previous best approach by 35\% on prompt alignment, and 25\% on human +preference. On the three novel tasks, we show that AnyV2V also achieves a high +success rate. We believe AnyV2V will continue to thrive due to its ability to +seamlessly integrate the fast-evolving image editing methods. Such +compatibility can help AnyV2V to increase its versatility to cater to diverse +user demands. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ CathFlow: Self-Supervised Segmentation of Catheters in Interventional + Ultrasound Using Optical Flow and Transformers + + +
+ In minimally invasive endovascular procedures, contrast-enhanced angiography +remains the most robust imaging technique. However, it is at the expense of the +patient and clinician's health due to prolonged radiation exposure. As an +alternative, interventional ultrasound has notable benefits such as being +radiation-free, fast to deploy, and having a small footprint in the operating +room. Yet, ultrasound is hard to interpret, and highly prone to artifacts and +noise. Additionally, interventional radiologists must undergo extensive +training before they become qualified to diagnose and treat patients +effectively, leading to a shortage of staff, and a lack of open-source +datasets. In this work, we seek to address both problems by introducing a +self-supervised deep learning architecture to segment catheters in longitudinal +ultrasound images, without demanding any labeled data. The network architecture +builds upon AiAReSeg, a segmentation transformer built with the Attention in +Attention mechanism, and is capable of learning feature changes across time and +space. To facilitate training, we used synthetic ultrasound data based on +physics-driven catheter insertion simulations, and translated the data into a +unique CT-Ultrasound common domain, CACTUSS, to improve the segmentation +performance. We generated ground truth segmentation masks by computing the +optical flow between adjacent frames using FlowNet2, and performed thresholding +to obtain a binary map estimate. Finally, we validated our model on a test +dataset, consisting of unseen synthetic data and images collected from silicon +aorta phantoms, thus demonstrating its potential for applications to clinical +data in the future. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Exploring 3D Human Pose Estimation and Forecasting from the Robot's + Perspective: The HARPER Dataset + + +
+ We introduce HARPER, a novel dataset for 3D body pose estimation and forecasting +in dyadic interactions between users and Spot, the quadruped robot +manufactured by Boston Dynamics. The key novelty is the focus on the robot's +perspective, i.e., on the data captured by the robot's sensors. This makes 3D +body pose analysis challenging because the robot's low viewpoint captures +humans only partially. The scenario underlying HARPER includes 15 actions, of +which 10 involve physical contact between the robot and users. The Corpus +contains not only the recordings of the built-in stereo cameras of Spot, but +also those of a 6-camera OptiTrack system (all recordings are synchronized). +This leads to ground-truth skeletal representations with sub-millimeter +precision. In addition, the Corpus includes reproducible benchmarks on 3D +Human Pose Estimation, Human Pose Forecasting, and Collision Prediction, all +based on publicly available baseline approaches. This enables future HARPER +users to rigorously compare their results with those we provide in this work. +
+
+
+
+
+ + ☆ RoDLA: Benchmarking the Robustness of Document Layout Analysis Models CVPR 2024 + + +
+ Before developing a Document Layout Analysis (DLA) model in real-world +applications, conducting comprehensive robustness testing is essential. +However, the robustness of DLA models remains underexplored in the literature. +To address this, we are the first to introduce a robustness benchmark for DLA +models, which includes 450K document images of three datasets. To cover +realistic corruptions, we propose a perturbation taxonomy with 36 common +document perturbations inspired by real-world document processing. +Additionally, to better understand document perturbation impacts, we propose +two metrics, Mean Perturbation Effect (mPE) for perturbation assessment and +Mean Robustness Degradation (mRD) for robustness evaluation. Furthermore, we +introduce a self-titled model, i.e., Robust Document Layout Analyzer (RoDLA), +which improves attention mechanisms to boost extraction of robust features. +Experiments on the proposed benchmarks (PubLayNet-P, DocLayNet-P, and +M$^6$Doc-P) demonstrate that RoDLA obtains state-of-the-art mRD scores of +115.7, 135.4, and 150.4, respectively. Compared to previous methods, RoDLA +achieves notable improvements in mAP of +3.8%, +7.1% and +12.1%, respectively. + +
+
+ comment: Accepted by CVPR 2024. Project page: + https://yufanchen96.github.io/projects/RoDLA +
+
+
+
+
+ + ☆ Analysing Diffusion Segmentation for Medical Images + + +
+ Denoising Diffusion Probabilistic models have become increasingly popular due +to their ability to offer probabilistic modeling and generate diverse outputs. +This versatility inspired their adaptation for image segmentation, where +multiple predictions of the model can produce segmentation results that not +only achieve high quality but also capture the uncertainty inherent in the +model. In this context, powerful architectures have been proposed for improving diffusion +segmentation performance. However, there is a notable lack of analysis and +discussion of the differences between diffusion segmentation and image +generation, and thorough evaluations are missing that distinguish the +improvements these architectures provide for segmentation in general from their +benefit for diffusion segmentation specifically. In this work, we critically +analyse and discuss how diffusion segmentation for medical images differs from +diffusion image generation, with a particular focus on the training behavior. +Furthermore, we assess how the proposed diffusion segmentation +architectures perform when trained directly for segmentation. Lastly, we +explore how different medical segmentation tasks influence the diffusion +segmentation behavior and how the diffusion process could be adapted accordingly. +With these analyses, we aim to provide in-depth insights into the behavior of +diffusion segmentation that allow for a better design and evaluation of +diffusion segmentation methods in the future. +
+
+
+
+
+ + ☆ Raw Instinct: Trust Your Classifiers and Skip the Conversion + + +
+ Using RAW images in computer vision problems is surprisingly underexplored +considering that converting from RAW to RGB does not introduce any new capture +information. In this paper, we show that a sufficiently advanced classifier can +yield equivalent results on RAW input compared to RGB and present a new public +dataset consisting of RAW images and the corresponding converted RGB images. +Classifying images directly from RAW is attractive, as it allows for skipping +the conversion to RGB, lowering computation time significantly. Two CNN +classifiers are used to classify the images in both formats, confirming that +classification performance can indeed be preserved. We furthermore show that +the total computation time from RAW image data to classification results for +RAW images can be up to 8.46 times faster than RGB. These results add to the +evidence from related works that using RAW images as direct input to +computer vision algorithms is very promising. +
+
+ comment: https://www.kaggle.com/datasets/mathiasviborg/raw-instinct +
+
+
+
+
+ + ☆ Biased Binary Attribute Classifiers Ignore the Majority Classes + + +
+ To visualize the regions of interest that classifiers base their decisions +on, different Class Activation Mapping (CAM) methods have been developed. +However, all of these techniques target categorical classifiers only, though +most real-world tasks are binary classification. In this paper, we extend +gradient-based CAM techniques to work with binary classifiers and visualize the +active regions for binary facial attribute classifiers. When training an +unbalanced binary classifier on an imbalanced dataset, it is well-known that +the majority class, i.e. the class with many training samples, is mostly +predicted much better than the minority class with few training instances. In our +experiments on the CelebA dataset, we verify these results, when training an +unbalanced classifier to extract 40 facial attributes simultaneously. One would +expect that the biased classifier has learned to extract features mainly for +the majority classes and that the proportional energy of the activations mainly +resides in certain specific regions of the image where the attribute is located. +However, we find very little regular activation for samples of majority +classes, while the active regions for minority classes seem mostly reasonable +and overlap with our expectations. These results suggest that biased +classifiers mainly rely on bias activation for majority classes. When training +a balanced classifier on the imbalanced data by employing attribute-specific +class weights, majority and minority classes are classified similarly well and +show expected activations for almost all attributes. +
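+ A Grad-CAM-style map for a single sigmoid output can be computed by back-propagating the
+scalar logit to the last convolutional feature maps; back-propagating the negated logit gives
+the map for the negative decision. The sketch below uses a torchvision ResNet-18 with one
+output as a stand-in backbone, which is an assumption, not the paper's model.
+<pre>
+import torch
+import torch.nn.functional as F
+from torchvision.models import resnet18
+
+model = resnet18(num_classes=1).eval()     # single-logit binary attribute classifier (stand-in)
+features = {}
+
+def hook(_, __, output):
+    features["maps"] = output              # keep the last conv block's feature maps
+
+model.layer4.register_forward_hook(hook)
+
+x = torch.randn(1, 3, 224, 224)
+logit = model(x)[0, 0]
+
+# For the negative class, back-propagate -logit instead of logit.
+grads = torch.autograd.grad(logit, features["maps"])[0]
+weights = grads.mean(dim=(2, 3), keepdim=True)            # global-average-pooled gradients
+cam = F.relu((weights * features["maps"]).sum(dim=1))     # weighted sum over channels
+cam = F.interpolate(cam[None], size=x.shape[-2:], mode="bilinear", align_corners=False)[0]
+print(cam.shape)   # (1, 224, 224) activation map for the positive decision
+</pre>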
+
+
+
+
+ + ☆ Ranking Distillation for Open-Ended Video Question Answering with + Insufficient Labels CVPR 2024 + + +
+ This paper focuses on open-ended video question answering, which aims to find +the correct answers from a large answer set in response to a video-related +question. This is essentially a multi-label classification task, since a +question may have multiple answers. However, due to annotation costs, the +labels in existing benchmarks are always extremely insufficient, typically one +answer per question. As a result, existing works tend to directly treat all the +unlabeled answers as negative labels, leading to limited ability for +generalization. In this work, we introduce a simple yet effective ranking +distillation framework (RADI) to mitigate this problem without additional +manual annotation. RADI employs a teacher model trained with incomplete labels +to generate rankings for potential answers, which contain rich knowledge about +label priority as well as label-associated visual cues, thereby enriching the +insufficient labeling information. To avoid overconfidence in the imperfect +teacher model, we further present two robust and parameter-free ranking +distillation approaches: a pairwise approach which introduces adaptive soft +margins to dynamically refine the optimization constraints on various pairwise +rankings, and a listwise approach which adopts sampling-based partial listwise +learning to resist the bias in teacher ranking. Extensive experiments on five +popular benchmarks consistently show that both our pairwise and listwise RADIs +outperform state-of-the-art methods. Further analysis demonstrates the +effectiveness of our methods on the insufficient labeling problem. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Style-Extracting Diffusion Models for Semi-Supervised Histopathology + Segmentation + + +
+ Deep learning-based image generation has seen significant advancements with +diffusion models, notably improving the quality of generated images. Despite +these developments, generating images with unseen characteristics beneficial +for downstream tasks has received limited attention. To bridge this gap, we +propose Style-Extracting Diffusion Models, featuring two conditioning +mechanisms. Specifically, we utilize 1) a style conditioning mechanism which +allows style information from previously unseen images to be injected during image +generation and 2) a content conditioning mechanism which can be targeted to a downstream +task, e.g., layout for segmentation. We introduce a trainable style encoder to +extract style information from images, and an aggregation block that merges +style information from multiple style inputs. This architecture enables the +generation of images with unseen styles in a zero-shot manner, by leveraging +styles from unseen images, resulting in more diverse generations. In this work, +we use the image layout as the target condition and first show the capability of +our method on a natural image dataset as a proof-of-concept. We further +demonstrate its versatility in histopathology, where we combine prior knowledge +about tissue composition and unannotated data to create diverse synthetic +images with known layouts. This allows us to generate additional synthetic data +to train a segmentation network in a semi-supervised fashion. We verify the +added value of the generated images by showing improved segmentation results +and lower performance variability between patients when synthetic images are +included during segmentation training. Our code will be made publicly available +at [LINK]. +
+
+
+
+
+ + ☆ DP-RDM: Adapting Diffusion Models to Private Domains Without Fine-Tuning + + +
+ Text-to-image diffusion models have been shown to suffer from sample-level +memorization, possibly reproducing near-perfect replica of images that they are +trained on, which may be undesirable. To remedy this issue, we develop the +first differentially private (DP) retrieval-augmented generation algorithm that +is capable of generating high-quality image samples while providing provable +privacy guarantees. Specifically, we assume access to a text-to-image diffusion +model trained on a small amount of public data, and design a DP retrieval +mechanism to augment the text prompt with samples retrieved from a private +retrieval dataset. Our \emph{differentially private retrieval-augmented +diffusion model} (DP-RDM) requires no fine-tuning on the retrieval dataset to +adapt to another domain, and can use state-of-the-art generative models to +generate high-quality image samples while satisfying rigorous DP guarantees. +For instance, when evaluated on MS-COCO, our DP-RDM can generate samples with a +privacy budget of $\epsilon=10$, while providing a $3.5$ point improvement in +FID compared to public-only retrieval for up to $10,000$ queries. + +
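+ The abstract does not spell out the retrieval mechanism, but the generic idea of privately
+aggregating retrieved neighbors can be sketched with the Gaussian mechanism: clip each
+retrieved embedding, sum, and add noise calibrated to the per-record sensitivity. The
+clipping norm, noise multiplier, and nearest-neighbor search below are illustrative
+assumptions, not the paper's design.
+<pre>
+import numpy as np
+
+def dp_aggregate_neighbors(query, private_db, k=8, clip_norm=1.0, noise_multiplier=1.0, rng=None):
+    """Privately aggregate the k nearest private embeddings for one query.
+
+    Each embedding is clipped to `clip_norm`, summed, and perturbed with Gaussian noise
+    scaled to the per-record sensitivity (clip_norm), bounding any single record's influence.
+    """
+    rng = rng or np.random.default_rng(0)
+    dists = np.linalg.norm(private_db - query, axis=1)
+    neighbors = private_db[np.argsort(dists)[:k]]
+
+    norms = np.linalg.norm(neighbors, axis=1, keepdims=True)
+    clipped = neighbors * np.minimum(1.0, clip_norm / norms)
+    noisy_sum = clipped.sum(axis=0) + rng.normal(0.0, noise_multiplier * clip_norm, size=query.shape)
+    return noisy_sum / k     # noisy aggregate used to augment the generation prompt
+
+db = np.random.randn(1000, 64)      # stand-in private retrieval dataset of embeddings
+q = np.random.randn(64)             # stand-in query (text-prompt) embedding
+print(dp_aggregate_neighbors(q, db).shape)
+</pre>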
+
+
+
+
+ + ☆ OA-CNNs: Omni-Adaptive Sparse CNNs for 3D Semantic Segmentation CVPR 2024 + + +
+ The booming of 3D recognition in the 2020s began with the introduction of +point cloud transformers. They quickly overwhelmed sparse CNNs and became +state-of-the-art models, especially in 3D semantic segmentation. However, +sparse CNNs remain valuable networks due to their efficiency and +ease of application. In this work, we reexamine the design distinctions and +test the limits of what a sparse CNN can achieve. We discover that the key +factor behind the performance difference is adaptivity. Specifically, we propose +two key components, i.e., adaptive receptive fields (spatially) and adaptive +relation, to bridge the gap. This exploration led to the creation of +Omni-Adaptive 3D CNNs (OA-CNNs), a family of networks that integrates a +lightweight module to greatly enhance the adaptivity of sparse CNNs at minimal +computational cost. Without any self-attention modules, OA-CNNs favorably +surpass point transformers in terms of accuracy in both indoor and outdoor +scenes, with much less latency and memory cost. Notably, they achieve 76.1%, +78.9%, and 70.6% mIoU on the ScanNet v2, nuScenes, and SemanticKITTI validation +benchmarks respectively, while running up to 5x faster than +transformer counterparts. This revelation highlights the potential of pure +sparse CNNs to outperform transformer-related networks. +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ CombiNeRF: A Combination of Regularization Techniques for Few-Shot + Neural Radiance Field View Synthesis 3DV + + +
+ Neural Radiance Fields (NeRFs) have shown impressive results for novel view +synthesis when a sufficiently large amount of views are available. When dealing +with few-shot settings, i.e. with a small set of input views, the training +could overfit those views, leading to artifacts and geometric and chromatic +inconsistencies in the resulting rendering. Regularization is a valid solution +that helps NeRF generalization. On the other hand, each of the most recent NeRF +regularization techniques aims to mitigate a specific rendering problem. +Starting from this observation, in this paper we propose CombiNeRF, a framework +that synergistically combines several regularization techniques, some of them +novel, in order to unify the benefits of each. In particular, we regularize +single and neighboring ray distributions and we add a smoothness term to +regularize near geometries. After these geometric approaches, we propose to +exploit Lipschitz regularization on both the NeRF density and color networks and to +use encoding masks for input feature regularization. We show that CombiNeRF +outperforms the state-of-the-art methods in few-shot settings on several +publicly available datasets. We also present an ablation study on the LLFF and +NeRF-Synthetic datasets that supports the choices made. We release with this +paper the open-source implementation of our framework. +
+
+ comment: This paper has been accepted for publication at the 2024 + International Conference on 3D Vision (3DV) +
+
+
+
+
+ + ☆ GLC++: Source-Free Universal Domain Adaptation through Global-Local + Clustering and Contrastive Affinity Learning CVPR 2023 + + +
+ Deep neural networks often exhibit sub-optimal performance under covariate +and category shifts. Source-Free Domain Adaptation (SFDA) presents a promising +solution to this dilemma, yet most SFDA approaches are restricted to closed-set +scenarios. In this paper, we explore Source-Free Universal Domain Adaptation +(SF-UniDA) aiming to accurately classify "known" data belonging to common +categories and segregate them from target-private "unknown" data. We propose a +novel Global and Local Clustering (GLC) technique, which comprises an adaptive +one-vs-all global clustering algorithm to discern between target classes, +complemented by a local k-NN clustering strategy to mitigate negative transfer. +Despite the effectiveness, the inherent closed-set source architecture leads to +uniform treatment of "unknown" data, impeding the identification of distinct +"unknown" categories. To address this, we evolve GLC to GLC++, integrating a +contrastive affinity learning strategy. We examine the superiority of GLC and +GLC++ across multiple benchmarks and category shift scenarios. Remarkably, in +the most challenging open-partial-set scenarios, GLC and GLC++ surpass GATE by +16.7% and 18.6% in H-score on VisDA, respectively. GLC++ enhances the novel +category clustering accuracy of GLC by 4.3% in open-set scenarios on +Office-Home. Furthermore, the introduced contrastive learning strategy not only +enhances GLC but also significantly facilitates existing methodologies. + +
+
+ comment: This is a substantial extension of the CVPR 2023 paper "Upcycling + Models under Domain and Category Shift" +
+
+
+
+
+ + ☆ Pensieve: Retrospect-then-Compare Mitigates Visual Hallucination + + +
+ Multi-modal Large Language Models (MLLMs) demonstrate remarkable success +across various vision-language tasks. However, they suffer from visual +hallucination, where the generated responses diverge from the provided image. +Are MLLMs completely oblivious to accurate visual cues when they hallucinate? +Our investigation reveals that the visual branch may simultaneously advocate +both accurate and non-existent content. To address this issue, we propose +Pensieve, a training-free method inspired by our observation that analogous +visual hallucinations can arise among images sharing common semantic and +appearance characteristics. During inference, Pensieve enables MLLMs to +retrospect relevant images as references and compare them with the test image. +This paradigm assists MLLMs in downgrading hallucinatory content mistakenly +supported by the visual input. Experiments on Whoops, MME, POPE, and LLaVA +Bench demonstrate the efficacy of Pensieve in mitigating visual hallucination, +surpassing other advanced decoding strategies. Additionally, Pensieve aids +MLLMs in identifying details in the image and enhancing the specificity of +image descriptions. + +
+
+
+
+
+ + ☆ A Bag of Tricks for Few-Shot Class-Incremental Learning + + +
+ We present a bag of tricks framework for few-shot class-incremental learning +(FSCIL), which is a challenging form of continual learning that involves +continuous adaptation to new tasks with limited samples. FSCIL requires both +stability and adaptability, i.e., preserving proficiency in previously learned +tasks while learning new ones. Our proposed bag of tricks brings together eight +key and highly influential techniques that improve stability, adaptability, and +overall performance under a unified framework for FSCIL. We organize these +tricks into three categories: stability tricks, adaptability tricks, and +training tricks. Stability tricks aim to mitigate the forgetting of previously +learned classes by enhancing the separation between the embeddings of learned +classes and minimizing interference when learning new ones. On the other hand, +adaptability tricks focus on the effective learning of new classes. Finally, +training tricks improve the overall performance without compromising stability +or adaptability. We perform extensive experiments on three benchmark datasets, +CIFAR-100, CUB-200, and miniImageNet, to evaluate the impact of our proposed +framework. Our detailed analysis shows that our approach substantially improves +both stability and adaptability, establishing a new state-of-the-art by +outperforming prior works in the area. We believe our method provides a go-to +solution and establishes a robust baseline for future research in this area. +
+
+
+
+
+ + ☆ Tensor network compressibility of convolutional models + + +
+ Convolutional neural networks (CNNs) represent one of the most widely used +neural network architectures, showcasing state-of-the-art performance in +computer vision tasks. Although larger CNNs generally exhibit higher accuracy, +their size can be effectively reduced by "tensorization" while maintaining +accuracy. Tensorization consists of replacing the convolution kernels with +compact decompositions such as Tucker, Canonical Polyadic decompositions, or +quantum-inspired decompositions such as matrix product states, and directly +training the factors in the decompositions to bias the learning towards +low-rank decompositions. But why doesn't tensorization seem to impact the +accuracy adversely? We explore this by assessing how truncating the convolution +kernels of dense (untensorized) CNNs impacts their accuracy. Specifically, we +truncated the kernels of (i) a vanilla four-layer CNN and (ii) ResNet-50 +pre-trained for image classification on the CIFAR-10 and CIFAR-100 datasets. We +found that kernels (especially those inside deeper layers) could often be +truncated along several cuts, resulting in a significant loss in kernel norm but +not in classification accuracy. This suggests that such ``correlation +compression'' (underlying tensorization) is an intrinsic feature of how +information is encoded in dense CNNs. We also found that aggressively truncated +models could often recover the pre-truncation accuracy after only a few epochs +of re-training, suggesting that compressing the internal correlations of +convolution layers does not often transport the model to a worse minimum. Our +results can be applied to tensorize and compress CNN models more effectively. +
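+ The kind of truncation experiment described, cutting a dense convolution kernel along one
+reshaping and keeping only the leading singular components, can be illustrated in a few
+lines. The chosen layer, cut, and truncation rank below are illustrative assumptions; the
+model is built without pre-trained weights purely to obtain a realistically shaped kernel.
+<pre>
+import torch
+from torchvision.models import resnet50
+
+model = resnet50(weights=None)                     # architecture only; random weights
+kernel = model.layer3[0].conv2.weight.detach()     # a (256, 256, 3, 3) kernel
+out_c, in_c, kh, kw = kernel.shape
+
+# One "cut": separate output channels from (input channels x spatial) and truncate the SVD.
+mat = kernel.reshape(out_c, in_c * kh * kw)
+U, S, Vh = torch.linalg.svd(mat, full_matrices=False)
+
+rank = 32                                          # illustrative truncation rank
+mat_trunc = U[:, :rank] @ torch.diag(S[:rank]) @ Vh[:rank]
+kernel_trunc = mat_trunc.reshape(out_c, in_c, kh, kw)
+
+norm_kept = kernel_trunc.norm() / kernel.norm()
+print(f"rank {rank}/{min(mat.shape)} keeps {norm_kept:.1%} of the kernel norm")
+</pre>
+ The study's observation is that large drops in this kernel norm often translate into only
+small drops in classification accuracy, especially for deeper layers.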
+
+ comment: 20 pages, 21 images +
+
+
+
+
+ + ☆ InfNeRF: Towards Infinite Scale NeRF Rendering with O(log n) Space + Complexity + + +
+ The conventional mesh-based Level of Detail (LoD) technique, exemplified by +applications such as Google Earth and many game engines, exhibits the +capability to holistically represent a large scene even the Earth, and achieves +rendering with a space complexity of O(log n). This constrained data +requirement not only enhances rendering efficiency but also facilitates dynamic +data fetching, thereby enabling a seamless 3D navigation experience for users. +In this work, we extend this proven LoD technique to Neural Radiance Fields +(NeRF) by introducing an octree structure to represent the scenes in different +scales. This innovative approach provides a mathematically simple and elegant +representation with a rendering space complexity of O(log n), aligned with the +efficiency of mesh-based LoD techniques. We also present a novel training +strategy that maintains a complexity of O(n). This strategy allows for parallel +training with minimal overhead, ensuring the scalability and efficiency of our +proposed method. Our contribution is not only in extending the capabilities of +existing techniques but also in establishing a foundation for scalable and +efficient large-scale scene representation using NeRF and octree structures. + +
+
+
+
+
+ + ☆ SyncTweedies: A General Generative Framework Based on Synchronized + Diffusions + + +
+ We introduce a general framework for generating diverse visual content, +including ambiguous images, panorama images, mesh textures, and Gaussian splat +textures, by synchronizing multiple diffusion processes. We present an exhaustive +investigation into all possible scenarios for synchronizing multiple diffusion +processes through a canonical space and analyze their characteristics across +applications. In doing so, we reveal a previously unexplored case: averaging +the outputs of Tweedie's formula while conducting denoising in multiple +instance spaces. This case also provides the best quality with the widest +applicability to downstream tasks. We name this case SyncTweedies. In our +experiments generating the aforementioned visual content, we demonstrate the +superior quality of generation by SyncTweedies compared to other +synchronization methods as well as optimization-based and iterative-update-based methods. +
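+ Tweedie's formula gives a clean-sample estimate from a noisy diffusion state, and the
+"averaging in a canonical space" idea can be sketched as: compute per-view x0 estimates,
+project them to a shared canonical space, average, and broadcast back. In the toy usage
+below the projection operators are identity stand-ins, and the noise-prediction tensors are
+random placeholders rather than outputs of a real diffusion model.
+<pre>
+import torch
+
+def tweedie_x0(x_t, eps_pred, alpha_bar_t):
+    """Tweedie / DDPM posterior-mean estimate of the clean sample from a noisy state."""
+    return (x_t - torch.sqrt(1 - alpha_bar_t) * eps_pred) / torch.sqrt(alpha_bar_t)
+
+def synced_x0(x_t_views, eps_views, alpha_bar_t, to_canonical, from_canonical):
+    """Average per-view x0 estimates in a shared canonical space, then map back to each view."""
+    x0_views = [tweedie_x0(x, e, alpha_bar_t) for x, e in zip(x_t_views, eps_views)]
+    canon = torch.stack([to_canonical(v) for v in x0_views]).mean(dim=0)
+    return [from_canonical(canon) for _ in x_t_views]
+
+# Toy usage with identity projections standing in for real view <-> canonical mappings.
+alpha_bar_t = torch.tensor(0.5)
+views = [torch.randn(4, 3, 8, 8) for _ in range(3)]
+eps = [torch.randn_like(v) for v in views]
+out = synced_x0(views, eps, alpha_bar_t, lambda v: v, lambda c: c.clone())
+print(len(out), out[0].shape)
+</pre>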
+
+ comment: Project page: https://synctweedies.github.io/ +
+
+
+
+
+ + ☆ Enabling Visual Composition and Animation in Unsupervised Video + Generation + + +
+ In this work we propose a novel method for unsupervised controllable video +generation. Once trained on a dataset of unannotated videos, at inference our +model is capable of both composing scenes of predefined object parts and +animating them in a plausible and controlled way. This is achieved by +conditioning video generation on a randomly selected subset of local +pre-trained self-supervised features during training. We call our model CAGE +for visual Composition and Animation for video GEneration. We conduct a series +of experiments to demonstrate capabilities of CAGE in various settings. Project +website: https://araachie.github.io/cage. + +
+
+ comment: Project website: https://araachie.github.io/cage +
+
+
+
+
+ + ☆ SurroundSDF: Implicit 3D Scene Understanding Based on Signed Distance + Field + + +
+ Vision-centric 3D environment understanding is both vital and challenging for +autonomous driving systems. Recently, object-free methods have attracted +considerable attention. Such methods perceive the world by predicting the +semantics of discrete voxel grids but fail to construct continuous and accurate +obstacle surfaces. To this end, in this paper, we propose SurroundSDF to +implicitly predict the signed distance field (SDF) and semantic field for the +continuous perception from surround images. Specifically, we introduce a +query-based approach and utilize SDF constrained by the Eikonal formulation to +accurately describe the surfaces of obstacles. Furthermore, considering the +absence of precise SDF ground truth, we propose a novel weakly supervised +paradigm for SDF, referred to as the Sandwich Eikonal formulation, which +emphasizes applying correct and dense constraints on both sides of the surface, +thereby enhancing the perceptual accuracy of the surface. Experiments suggest +that our method achieves SOTA for both occupancy prediction and 3D scene +reconstruction tasks on the nuScenes dataset. + +
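+ The Eikonal constraint mentioned here is the standard requirement that a signed distance
+field have unit gradient norm. A minimal autograd sketch of that loss is shown below; the
+tiny MLP and the sampling of points in a unit cube are illustrative stand-ins, not the
+paper's network or its Sandwich Eikonal formulation.
+<pre>
+import torch
+import torch.nn as nn
+
+sdf = nn.Sequential(nn.Linear(3, 64), nn.Softplus(beta=100),
+                    nn.Linear(64, 64), nn.Softplus(beta=100),
+                    nn.Linear(64, 1))
+
+def eikonal_loss(model, points):
+    """Penalize deviation of the SDF gradient norm from 1 at sampled 3D points."""
+    points = points.requires_grad_(True)
+    values = model(points)
+    grads = torch.autograd.grad(values, points,
+                                grad_outputs=torch.ones_like(values),
+                                create_graph=True)[0]
+    return ((grads.norm(dim=-1) - 1.0) ** 2).mean()
+
+pts = torch.rand(1024, 3) * 2 - 1     # points sampled in a cube around the scene
+print(eikonal_loss(sdf, pts).item())
+</pre>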
+
+
+
+
+ + ☆ Less but Better: Enabling Generalized Zero-shot Learning Towards Unseen + Domains by Intrinsic Learning from Redundant LLM Semantics + + +
+ Generalized zero-shot learning (GZSL) focuses on recognizing seen and unseen +classes against domain shift problem (DSP) where data of unseen classes may be +misclassified as seen classes. However, existing GZSL is still limited to seen +domains. In the current work, we pioneer cross-domain GZSL (CDGZSL) which +addresses GZSL towards unseen domains. Different from existing GZSL methods +which alleviate DSP by generating features of unseen classes with semantics, +CDGZSL needs to construct a common feature space across domains and acquire the +corresponding intrinsic semantics shared among domains to transfer from seen to +unseen domains. Considering the information asymmetry problem caused by +redundant class semantics annotated with large language models (LLMs), we +present Meta Domain Alignment Semantic Refinement (MDASR). Technically, MDASR +consists of two parts: Inter-class Similarity Alignment (ISA), which eliminates +the non-intrinsic semantics not shared across all domains under the guidance of +inter-class feature relationships, and Unseen-class Meta Generation (UMG), +which preserves intrinsic semantics to maintain connectivity between seen and +unseen classes by simulating feature generation. MDASR effectively aligns the +redundant semantic space with the common feature space, mitigating the +information asymmetry in CDGZSL. The effectiveness of MDASR is demonstrated on +the Office-Home and Mini-DomainNet, and we have shared the LLM-based semantics +for these datasets as the benchmark. + +
+
+ comment: This work is submitted to IEEE TNNLS and is subject to IEEE copyright +
+
+
+
+
+ + ☆ Varroa destructor detection on honey bees using hyperspectral imagery + + +
+ Hyperspectral (HS) imagery in agriculture is becoming increasingly common. +These images have the advantage of higher spectral resolution. Advanced +spectral processing techniques are required to unlock the information potential +in these HS images. The present paper introduces a method rooted in +multivariate statistics designed to detect parasitic Varroa destructor mites on +the body of western honey bee Apis mellifera, enabling easier and continuous +monitoring of the bee hives. The methodology explores unsupervised (K-means++) +and recently developed supervised (Kernel Flows - Partial Least-Squares, +KF-PLS) methods for parasitic identification. Additionally, in light of the +emergence of custom-band multispectral cameras, the present research outlines a +strategy for identifying the specific wavelengths necessary for effective +bee-mite separation, suitable for implementation in a custom-band camera. +Illustrated with a real-case dataset, our findings demonstrate that as few as +four spectral bands are sufficient for accurate parasite identification. + +
+
+
+
+
+ + ☆ LDTR: Transformer-based Lane Detection with Anchor-chain Representation + + +
+ Despite recent advances in lane detection methods, scenarios with limited- or +no-visual-clue of lanes due to factors such as lighting conditions and +occlusion remain challenging and crucial for automated driving. Moreover, +current lane representations require complex post-processing and struggle with +specific instances. Inspired by the DETR architecture, we propose LDTR, a +transformer-based model to address these issues. Lanes are modeled with a novel +anchor-chain, regarding a lane as a whole from the beginning, which enables +LDTR to handle special lanes inherently. To enhance lane instance perception, +LDTR incorporates a novel multi-referenced deformable attention module to +distribute attention around the object. Additionally, LDTR incorporates two +line IoU algorithms to improve convergence efficiency and employs a Gaussian +heatmap auxiliary branch to enhance model representation capability during +training. To evaluate lane detection models, we rely on Frechet distance, +parameterized F1-score, and additional synthetic metrics. Experimental results +demonstrate that LDTR achieves state-of-the-art performance on well-known +datasets. + +
+
+ comment: Accepted by CVM 2024 and CVMJ. 16 pages, 14 figures +
+
+
+
+
+ + ☆ Annotation-Efficient Polyp Segmentation via Active Learning + + +
+ Deep learning-based techniques have proven effective in polyp segmentation +tasks when provided with sufficient pixel-wise labeled data. However, the high +cost of manual annotation has created a bottleneck for model generalization. To +minimize annotation costs, we propose a deep active learning framework for +annotation-efficient polyp segmentation. In practice, we measure the +uncertainty of each sample by examining the similarity between features masked +by the prediction map of the polyp and the background area. Since the +segmentation model tends to perform weakly on samples with indistinguishable +foreground and background features, uncertainty sampling facilitates +the fitting of under-learned data. Furthermore, clustering image-level +features weighted by uncertainty identifies samples that are both uncertain and +representative. To enhance the selectivity of the active selection strategy, we +propose a novel unsupervised feature discrepancy learning mechanism. The +selection strategy and feature optimization work in tandem to achieve optimal +performance with a limited annotation budget. Extensive experimental results +have demonstrated that our proposed method achieved state-of-the-art +performance compared to other competitors on both a public dataset and a +large-scale in-house dataset. +
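+ The described uncertainty measure, comparing features under the predicted-polyp mask with
+features under the background mask, might look roughly like the sketch below. The encoder
+feature map, the masked mean pooling, and the cosine similarity are assumptions made for
+illustration, not the paper's exact formulation.
+<pre>
+import torch
+import torch.nn.functional as F
+
+def masked_mean(features, mask, eps=1e-6):
+    """Average C-dim features over the pixels selected by a soft mask."""
+    # features: (C, H, W), mask: (H, W) with values in [0, 1]
+    weighted = (features * mask.unsqueeze(0)).sum(dim=(1, 2))
+    return weighted / (mask.sum() + eps)
+
+def uncertainty_score(features, prob_map):
+    """High foreground/background feature similarity => hard, uncertain sample."""
+    fg = masked_mean(features, prob_map)
+    bg = masked_mean(features, 1.0 - prob_map)
+    return F.cosine_similarity(fg, bg, dim=0).item()
+
+feats = torch.randn(256, 32, 32)          # stand-in encoder feature map
+prob = torch.rand(32, 32)                 # stand-in predicted polyp probability map
+print(uncertainty_score(feats, prob))
+</pre>
+ Samples with the highest scores would then be clustered together with representative ones
+before being sent for annotation.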
+
+ comment: 2024 IEEE 21th International Symposium on Biomedical Imaging (ISBI) +
+
+
+
+
+ + ☆ On the Concept Trustworthiness in Concept Bottleneck Models + + +
+ Concept Bottleneck Models (CBMs), which break down the reasoning process into +the input-to-concept mapping and the concept-to-label prediction, have garnered +significant attention due to their remarkable interpretability achieved by the +interpretable concept bottleneck. However, despite the transparency of the +concept-to-label prediction, the mapping from the input to the intermediate +concept remains a black box, giving rise to concerns about the trustworthiness +of the learned concepts (i.e., these concepts may be predicted based on +spurious cues). The issue of concept untrustworthiness greatly hampers the +interpretability of CBMs, thereby hindering their further advancement. To +conduct a comprehensive analysis on this issue, in this study we establish a +benchmark to assess the trustworthiness of concepts in CBMs. A pioneering +metric, referred to as concept trustworthiness score, is proposed to gauge +whether the concepts are derived from relevant regions. Additionally, an +enhanced CBM is introduced, enabling concept predictions to be made +specifically from distinct parts of the feature map, thereby facilitating the +exploration of their related regions. Besides, we introduce three modules, +namely the cross-layer alignment (CLA) module, the cross-image alignment (CIA) +module, and the prediction alignment (PA) module, to further enhance the +concept trustworthiness within the elaborated CBM. The experiments on five +datasets across ten architectures demonstrate that without using any concept +localization annotations during training, our model improves the concept +trustworthiness by a large margin, meanwhile achieving superior accuracy to the +state-of-the-arts. Our code is available at https://github.com/hqhQAQ/ProtoCBM. + +
+
+
+
+
+ + ☆ Towards Efficient Information Fusion: Concentric Dual Fusion Attention + Based Multiple Instance Learning for Whole Slide Images + + +
+ In the realm of digital pathology, multi-magnification Multiple Instance +Learning (multi-mag MIL) has proven effective in leveraging the hierarchical +structure of Whole Slide Images (WSIs) to reduce information loss and redundant +data. However, current methods fall short in bridging the domain gap between +pretrained models and medical imaging, and often fail to account for spatial +relationships across different magnifications. Addressing these challenges, we +introduce the Concentric Dual Fusion Attention-MIL (CDFA-MIL) framework, which +innovatively combines point-to-area feature-column attention and point-to-point +concentric-row attention using concentric patches. This approach is designed to +effectively fuse correlated information, enhancing feature representation and +providing stronger correlation guidance for WSI analysis. CDFA-MIL +distinguishes itself by offering a robust fusion strategy that leads to +superior WSI recognition. Its application has demonstrated exceptional +performance, significantly surpassing existing MIL methods in accuracy and F1 +scores on prominent datasets like Camelyon16 and TCGA-NSCLC. Specifically, +CDFA-MIL achieved an average accuracy and F1-score of 93.7\% and 94.1\% +respectively on these datasets, marking a notable advancement over traditional +MIL approaches. +
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ☆ $\nabla τ$: Gradient-based and Task-Agnostic machine Unlearning + + +
+ Machine Unlearning, the process of selectively eliminating the influence of +certain data examples used during a model's training, has gained significant +attention as a means for practitioners to comply with recent data protection +regulations. However, existing unlearning methods face critical drawbacks, +including their prohibitively high cost, often associated with a large number +of hyperparameters, and the limitation of forgetting only relatively small data +portions. This often makes retraining the model from scratch a quicker and more +effective solution. In this study, we introduce Gradient-based and +Task-Agnostic machine Unlearning ($\nabla \tau$), an optimization framework +designed to remove the influence of a subset of training data efficiently. It +applies adaptive gradient ascent to the data to be forgotten while using +standard gradient descent for the remaining data. $\nabla \tau$ offers multiple +benefits over existing approaches. It enables the unlearning of large sections +of the training dataset (up to 30%). It is versatile, supporting various +unlearning tasks (such as subset forgetting or class removal) and applicable +across different domains (images, text, etc.). Importantly, $\nabla \tau$ +requires no hyperparameter adjustments, making it a more appealing option than +retraining the model from scratch. We evaluate our framework's effectiveness +using a set of well-established Membership Inference Attack metrics, +demonstrating up to 10% enhancements in performance compared to +state-of-the-art methods without compromising the original model's accuracy. + +
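+ The core update rule described, gradient ascent on the data to be forgotten combined with
+standard gradient descent on the retained data, can be sketched in a few lines. The tiny
+classifier, the fixed ascent scale, and the single combined objective are simplifications;
+the adaptive scheduling of the actual method is omitted.
+<pre>
+import torch
+import torch.nn as nn
+
+model = nn.Sequential(nn.Linear(20, 64), nn.ReLU(), nn.Linear(64, 2))
+criterion = nn.CrossEntropyLoss()
+opt = torch.optim.SGD(model.parameters(), lr=1e-2)
+
+def unlearning_step(forget_batch, retain_batch, ascent_scale=1.0):
+    xf, yf = forget_batch
+    xr, yr = retain_batch
+    opt.zero_grad()
+    # Ascend on the forget data (negated loss) while descending on the retained data.
+    loss = -ascent_scale * criterion(model(xf), yf) + criterion(model(xr), yr)
+    loss.backward()
+    opt.step()
+    return loss.item()
+
+forget = (torch.randn(16, 20), torch.randint(2, (16,)))
+retain = (torch.randn(64, 20), torch.randint(2, (64,)))
+for _ in range(3):
+    print(unlearning_step(forget, retain))
+</pre>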
+
+ comment: 14 pages, 2 figures +
+
+
+
+
+ + ☆ FFT-based Selection and Optimization of Statistics for Robust + Recognition of Severely Corrupted Images ICASSP 2024 + + +
+ Improving model robustness in case of corrupted images is among the key +challenges to enable robust vision systems on smart devices, such as robotic +agents. Particularly, robust test-time performance is imperative for most of +the applications. This paper presents a novel approach to improve robustness of +any classification model, especially on severely corrupted images. Our method +(FROST) employs high-frequency features to detect input image corruption type, +and select layer-wise feature normalization statistics. FROST provides the +state-of-the-art results for different models and datasets, outperforming +competitors on ImageNet-C by up to 37.1% relative gain, improving baseline of +40.9% mCE on severe corruptions. + +
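+ The frequency-domain intuition, that different corruptions leave characteristic
+high-frequency signatures, can be illustrated with a simple radial high-pass energy
+statistic computed via a 2D FFT. The cutoff value, the grayscale conversion, and the
+synthetic corruption below are illustrative assumptions; the actual method additionally uses
+such features to select layer-wise normalization statistics.
+<pre>
+import numpy as np
+
+def high_frequency_energy(image, cutoff=0.25):
+    """Fraction of spectral energy above a radial frequency cutoff (cycles per pixel)."""
+    gray = image.mean(axis=-1) if image.ndim == 3 else image
+    spectrum = np.abs(np.fft.fftshift(np.fft.fft2(gray))) ** 2
+    h, w = gray.shape
+    fy = np.fft.fftshift(np.fft.fftfreq(h))[:, None]
+    fx = np.fft.fftshift(np.fft.fftfreq(w))[None, :]
+    radius = np.sqrt(fx**2 + fy**2)
+    return spectrum[radius > cutoff].sum() / spectrum.sum()
+
+clean = np.random.rand(224, 224, 3)
+noisy = np.clip(clean + np.random.normal(0, 0.2, clean.shape), 0, 1)   # synthetic noise corruption
+print(high_frequency_energy(clean), high_frequency_energy(noisy))
+# A simple corruption detector could compare such statistics against per-corruption
+# references and pick the matching set of layer-wise feature normalization statistics.
+</pre>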
+
+ comment: ICASSP 2024. Copyright 2024 IEEE. Personal use of this material is + permitted. Permission from IEEE must be obtained for all other uses, in any + current or future media, including reprinting/republishing this material for + advertising or promotional purposes, creating new collective works, for + resale or redistribution to servers or lists, or reuse of any copyrighted + component of this work in other +
+
+
+
+
+ + ☆ CFPL-FAS: Class Free Prompt Learning for Generalizable Face + Anti-spoofing + + +
+ Domain generalization (DG) based Face Anti-Spoofing (FAS) aims to improve the +model's performance on unseen domains. Existing methods either rely on domain +labels to align domain-invariant feature spaces, or disentangle generalizable +features from the whole sample, which inevitably lead to the distortion of +semantic feature structures and achieve limited generalization. In this work, +we make use of large-scale VLMs like CLIP and leverage the textual feature to +dynamically adjust the classifier's weights for exploring generalizable visual +features. Specifically, we propose a novel Class Free Prompt Learning (CFPL) +paradigm for DG FAS, which utilizes two lightweight transformers, namely +Content Q-Former (CQF) and Style Q-Former (SQF), to learn the different +semantic prompts conditioned on content and style features by using a set of +learnable query vectors, respectively. Thus, the generalizable prompt can be +learned by two improvements: (1) A Prompt-Text Matched (PTM) supervision is +introduced to ensure CQF learns visual representation that is most informative +of the content description. (2) A Diversified Style Prompt (DSP) technology is +proposed to diversify the learning of style prompts by mixing feature +statistics between instance-specific styles. Finally, the learned text features +modulate visual features to generalization through the designed Prompt +Modulation (PM). Extensive experiments show that the CFPL is effective and +outperforms the state-of-the-art methods on several cross-domain datasets. + +
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ☆ Neural Network-Based Processing and Reconstruction of Compromised + Biophotonic Image Data + + +
+ The integration of deep learning techniques with biophotonic setups has +opened new horizons in bioimaging. A compelling trend in this field involves +deliberately compromising certain measurement metrics to engineer better +bioimaging tools in terms of cost, speed, and form-factor, followed by +compensating for the resulting defects through the utilization of deep learning +models trained on a large amount of ideal, superior or alternative data. This +strategic approach has found increasing popularity due to its potential to +enhance various aspects of biophotonic imaging. One of the primary motivations +for employing this strategy is the pursuit of higher temporal resolution or +increased imaging speed, critical for capturing fine dynamic biological +processes. This approach also offers the prospect of simplifying hardware +requirements/complexities, thereby making advanced imaging standards more +accessible in terms of cost and/or size. This article provides an in-depth +review of the diverse measurement aspects that researchers intentionally impair +in their biophotonic setups, including the point spread function, +signal-to-noise ratio, sampling density, and pixel resolution. By deliberately +compromising these metrics, researchers aim to not only recuperate them through +the application of deep learning networks, but also bolster in return other +crucial parameters, such as the field-of-view, depth-of-field, and +space-bandwidth product. Here, we discuss various biophotonic methods that have +successfully employed this strategic approach. These techniques span broad +applications and showcase the versatility and effectiveness of deep learning in +the context of compromised biophotonic data. Finally, by offering our +perspectives on the future possibilities of this rapidly evolving concept, we +hope to motivate our readers to explore novel ways of balancing hardware +compromises with compensation via AI. + +
+
+ comment: 17 Pages, 4 Figures, 1 Table +
+
+
+
+
+ + ☆ Exosense: A Vision-Centric Scene Understanding System For Safe + Exoskeleton Navigation + + +
+ Exoskeletons for daily use by those with mobility impairments are being
+developed, and they will require accurate and robust scene understanding
+systems. Current research has used vision to identify immediate terrain and
+geometric obstacles; however, these approaches are constrained to detections
+directly in front of the user and are limited to classifying a finite range of
+terrain types (e.g., stairs, ramps, and level ground). This paper presents
+Exosense, a vision-centric scene understanding system that is capable of
+generating rich, globally-consistent elevation maps, incorporating both
+semantic and terrain traversability information. It features an elastic Atlas
+mapping framework associated with a visual SLAM pose graph, embedded with
+open-vocabulary room labels from a Vision-Language Model (VLM). The device's
+design includes a wide field-of-view (FoV) fisheye multi-camera system to
+mitigate the challenges introduced by the exoskeleton walking pattern. We
+demonstrate the system's robustness to the challenges of typical periodic
+walking gaits, and its ability to construct accurate semantically-rich maps in
+indoor settings. Additionally, we showcase its potential for motion planning --
+providing a step towards safe navigation for exoskeletons.
+
+
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ☆ A Lightweight Attention-based Deep Network via Multi-Scale Feature + Fusion for Multi-View Facial Expression Recognition + + +
+ Convolutional neural networks (CNNs) and their variations have shown +effectiveness in facial expression recognition (FER). However, they face +challenges when dealing with high computational complexity and multi-view head +poses in real-world scenarios. We introduce a lightweight attentional network +incorporating multi-scale feature fusion (LANMSFF) to tackle these issues. For +the first challenge, we have carefully designed a lightweight fully +convolutional network (FCN). We address the second challenge by presenting two +novel components, namely mass attention (MassAtt) and point wise feature +selection (PWFS) blocks. The MassAtt block simultaneously generates channel and +spatial attention maps to recalibrate feature maps by emphasizing important +features while suppressing irrelevant ones. On the other hand, the PWFS block +employs a feature selection mechanism that discards less meaningful features +prior to the fusion process. This mechanism distinguishes it from previous +methods that directly fuse multi-scale features. Our proposed approach achieved +results comparable to state-of-the-art methods in terms of parameter counts and +robustness to pose variation, with accuracy rates of 90.77% on KDEF, 70.44% on +FER-2013, and 86.96% on FERPlus datasets. The code for LANMSFF is available at +https://github.com/AE-1129/LANMSFF. + +
+
+ comment: 9 pages, two-column, submitted to journal +
+
+
+
+
+ + ☆ SpikingResformer: Bridging ResNet and Vision Transformer in Spiking + Neural Networks CVPR + + +
+ The remarkable success of Vision Transformers in Artificial Neural Networks +(ANNs) has led to a growing interest in incorporating the self-attention +mechanism and transformer-based architecture into Spiking Neural Networks +(SNNs). While existing methods propose spiking self-attention mechanisms that +are compatible with SNNs, they lack reasonable scaling methods, and the overall +architectures proposed by these methods suffer from a bottleneck in effectively +extracting local features. To address these challenges, we propose a novel +spiking self-attention mechanism named Dual Spike Self-Attention (DSSA) with a +reasonable scaling method. Based on DSSA, we propose a novel spiking Vision +Transformer architecture called SpikingResformer, which combines the +ResNet-based multi-stage architecture with our proposed DSSA to improve both +performance and energy efficiency while reducing parameters. Experimental +results show that SpikingResformer achieves higher accuracy with fewer +parameters and lower energy consumption than other spiking Vision Transformer +counterparts. Notably, our SpikingResformer-L achieves 79.40% top-1 accuracy on +ImageNet with 4 time-steps, which is the state-of-the-art result in the SNN +field. + +
+
+ comment: To be published in the 2024 IEEE/CVF Conference on Computer Vision + and Pattern Recognition (CVPR) +
+
+
+
+
+ + ☆ Impact Assessment of Missing Data in Model Predictions for Earth + Observation Applications + + +
+ Earth observation (EO) applications involving complex and heterogeneous data
+sources are commonly approached with machine learning models. However, there is
+a common assumption that data sources will be persistently available. Several
+situations can affect the availability of EO sources, such as noise, clouds, or
+satellite mission failures. In this work, we assess the impact of missing
+temporal and static EO sources in trained models across four datasets with
+classification and regression tasks. We compare the predictive quality of
+different methods and find that some are naturally more robust to missing data.
+The Ensemble strategy, in particular, achieves prediction robustness of up to
+100%. We show that missing-data scenarios are significantly more challenging in
+regression than in classification tasks. Finally, we find that the optical view
+is the most critical view when it is missing individually.
+
+
+
+ comment: Accepted at IEEE International Geoscience and Remote Sensing + Symposium 2024 +
+
+
+
+
+ + ☆ HySim: An Efficient Hybrid Similarity Measure for Patch Matching in + Image Inpainting + + +
+ Inpainting, the task of filling missing image regions, is crucial in various
+applications, such as medical imaging and remote sensing. The efficiency of
+trending data-driven approaches to image inpainting often requires extensive
+data preprocessing. In this sense, there is still a need for model-driven
+approaches when an application is constrained by data availability and quality,
+especially for applications related to time series forecasting using image
+inpainting techniques. This paper proposes an improved model-driven approach
+relying on patch-based techniques. Our approach deviates from the standard Sum
+of Squared Differences (SSD) similarity measure by introducing a Hybrid
+Similarity (HySim), which combines the strengths of the Chebyshev and Minkowski
+distances. This hybridization enhances patch selection, leading to high-quality
+inpainting results with reduced mismatch errors. Experimental results prove the
+effectiveness of our approach against other model-driven techniques, such as
+diffusion-based or patch-based approaches, showcasing its ability to achieve
+visually pleasing restorations.
+
+
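+ As a rough illustration of the hybrid-distance idea above (not the authors'
+implementation; the blending weight and the Minkowski order below are
+assumptions), a patch matcher can combine a Chebyshev term with a Minkowski
+term as follows:
+
+```python
+import numpy as np
+
+def hybrid_distance(patch_a, patch_b, p=3, alpha=0.5):
+    """Blend a Chebyshev (L-infinity) term with a Minkowski (L-p) term."""
+    diff = np.abs(patch_a.astype(float) - patch_b.astype(float))
+    chebyshev = diff.max()
+    minkowski = (diff ** p).sum() ** (1.0 / p)
+    return alpha * chebyshev + (1.0 - alpha) * minkowski
+
+def best_match(target, candidates, **kwargs):
+    """Return the index of the candidate patch closest to the target."""
+    scores = [hybrid_distance(target, c, **kwargs) for c in candidates]
+    return int(np.argmin(scores))
+
+rng = np.random.default_rng(0)
+target = rng.random((7, 7, 3))
+candidates = [rng.random((7, 7, 3)) for _ in range(20)] + [target + 0.01]
+print(best_match(target, candidates))  # the near-duplicate wins: index 20
+```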
+
+
+
+
+ + ☆ Open-Vocabulary Attention Maps with Token Optimization for Semantic + Segmentation in Diffusion Models + + +
+ Diffusion models represent a new paradigm in text-to-image generation. Beyond +generating high-quality images from text prompts, models such as Stable +Diffusion have been successfully extended to the joint generation of semantic +segmentation pseudo-masks. However, current extensions primarily rely on +extracting attentions linked to prompt words used for image synthesis. This +approach limits the generation of segmentation masks derived from word tokens +not contained in the text prompt. In this work, we introduce Open-Vocabulary +Attention Maps (OVAM)-a training-free method for text-to-image diffusion models +that enables the generation of attention maps for any word. In addition, we +propose a lightweight optimization process based on OVAM for finding tokens +that generate accurate attention maps for an object class with a single +annotation. We evaluate these tokens within existing state-of-the-art Stable +Diffusion extensions. The best-performing model improves its mIoU from 52.1 to +86.6 for the synthetic images' pseudo-masks, demonstrating that our optimized +tokens are an efficient way to improve the performance of existing methods +without architectural changes or retraining. + +
+
+
+
+
+ + ☆ Exploring Green AI for Audio Deepfake Detection + + +
+ The state-of-the-art audio deepfake detectors leveraging deep neural networks
+exhibit impressive recognition performance. Nonetheless, this advantage is
+accompanied by a significant carbon footprint, mainly due to the use of
+high-performance computing with accelerators and long training times. Studies
+show that an average deep NLP model produces around 626k lbs of CO2, which is
+roughly five times the lifetime emissions of an average US car. This is
+certainly a massive threat to the environment. To tackle this challenge, this
+study presents a novel framework for audio deepfake detection that can be
+seamlessly trained using standard CPU resources. Our proposed framework
+utilizes off-the-shelf self-supervised learning (SSL) based models which are
+pre-trained and available in public repositories. In contrast to existing
+methods that fine-tune SSL models and employ additional deep neural networks
+for downstream tasks, we exploit classical machine learning algorithms such as
+logistic regression and shallow neural networks on the SSL embeddings extracted
+with the pre-trained model. Our approach shows competitive results compared to
+the commonly used high-carbon-footprint approaches. In experiments with the
+ASVspoof 2019 LA dataset, we achieve a 0.90% equal error rate (EER) with less
+than 1k trainable model parameters. To encourage further research in this
+direction and support reproducible results, the Python code will be made
+publicly accessible following acceptance. Github:
+https://github.com/sahasubhajit/Speech-Spoofing-
+
+
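+ A minimal sketch of this kind of low-compute pipeline (illustrative only; the
+embeddings and labels below are synthetic stand-ins, not ASVspoof features),
+pairing a scikit-learn logistic regression with a standard equal-error-rate
+computation:
+
+```python
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import roc_curve
+
+def equal_error_rate(labels, scores):
+    """EER: the operating point where false accepts equal false rejects."""
+    fpr, tpr, _ = roc_curve(labels, scores)
+    fnr = 1.0 - tpr
+    idx = np.nanargmin(np.abs(fnr - fpr))
+    return (fpr[idx] + fnr[idx]) / 2.0
+
+# X_*: (N, D) embeddings from a frozen SSL model; y_*: 1 = spoof, 0 = bona fide
+rng = np.random.default_rng(0)
+X_train, y_train = rng.normal(size=(200, 256)), rng.integers(0, 2, 200)
+X_eval, y_eval = rng.normal(size=(100, 256)), rng.integers(0, 2, 100)
+
+clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
+scores = clf.decision_function(X_eval)
+print(f"EER: {100 * equal_error_rate(y_eval, scores):.2f}%")
+```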
+
+ comment: This manuscript is under review in a conference +
+
+
+
+
+ + ☆ Enhancing Historical Image Retrieval with Compositional Cues + + +
+ In analyzing vast amounts of digitally stored historical image data, existing +content-based retrieval methods often overlook significant non-semantic +information, limiting their effectiveness for flexible exploration across +varied themes. To broaden the applicability of image retrieval methods for +diverse purposes and uncover more general patterns, we innovatively introduce a +crucial factor from computational aesthetics, namely image composition, into +this topic. By explicitly integrating composition-related information extracted +by CNN into the designed retrieval model, our method considers both the image's +composition rules and semantic information. Qualitative and quantitative +experiments demonstrate that the image retrieval network guided by composition +information outperforms those relying solely on content information, +facilitating the identification of images in databases closer to the target +image in human perception. Please visit https://github.com/linty5/CCBIR to try +our codes. + +
+
+
+
+
+ + ☆ Assessing the Robustness of Spectral Clustering for Deep Speaker + Diarization + + +
+ Clustering speaker embeddings is crucial in speaker diarization but has not
+received as much focus as other components. Moreover, the robustness of speaker
+diarization across various datasets has not been explored when the development
+and evaluation data are from different domains. To bridge this gap, this study
+thoroughly examines spectral clustering for both same-domain and cross-domain
+speaker diarization. Our extensive experiments on two widely used corpora, AMI
+and DIHARD, reveal the performance trend of speaker diarization in the presence
+of domain mismatch. We observe that the performance difference between the two
+domain conditions can be attributed to the role of spectral clustering. In
+particular, keeping other modules unchanged, we show that differences in
+optimal tuning parameters, as well as in speaker count estimation, originate
+from the mismatch. This study opens several future directions for speaker
+diarization research.
+
+
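+ For readers unfamiliar with the clustering module under study, a toy spectral
+clustering pass over speaker embeddings looks roughly like the following (the
+cosine affinity and all parameters are assumptions, not the study's
+configuration):
+
+```python
+import numpy as np
+from sklearn.cluster import SpectralClustering
+
+def cluster_speakers(embeddings, n_speakers):
+    # Cosine similarity shifted into [0, 1] so it is a valid affinity matrix.
+    normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+    affinity = (normed @ normed.T + 1.0) / 2.0
+    return SpectralClustering(
+        n_clusters=n_speakers, affinity="precomputed", random_state=0
+    ).fit_predict(affinity)
+
+# Two synthetic "speakers", ten segment embeddings each.
+rng = np.random.default_rng(0)
+emb = np.vstack([rng.normal(loc=c, scale=0.05, size=(10, 32)) for c in (1.0, -1.0)])
+print(cluster_speakers(emb, n_speakers=2))
+```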
+
+ comment: Manuscript Under Review +
+
+
+
+
+ + ☆ Zero123-6D: Zero-shot Novel View Synthesis for RGB Category-level 6D + Pose Estimation + + +
+ Estimating the pose of objects through vision is essential to make robotic +platforms interact with the environment. Yet, it presents many challenges, +often related to the lack of flexibility and generalizability of +state-of-the-art solutions. Diffusion models are a cutting-edge neural +architecture transforming 2D and 3D computer vision, outlining remarkable +performances in zero-shot novel-view synthesis. Such a use case is particularly +intriguing for reconstructing 3D objects. However, localizing objects in +unstructured environments is rather unexplored. To this end, this work presents +Zero123-6D to demonstrate the utility of Diffusion Model-based +novel-view-synthesizers in enhancing RGB 6D pose estimation at category-level +by integrating them with feature extraction techniques. The outlined method +exploits such a novel view synthesizer to expand a sparse set of RGB-only +reference views for the zero-shot 6D pose estimation task. Experiments are +quantitatively analyzed on the CO3D dataset, showcasing increased performance +over baselines, a substantial reduction in data requirements, and the removal +of the necessity of depth information. + +
+
+ comment: 6 pages, 2 reference pages, 4 figures +
+
+
+
+
+ + ☆ Scene-Graph ViT: End-to-End Open-Vocabulary Visual Relationship + Detection + + +
+ Visual relationship detection aims to identify objects and their +relationships in images. Prior methods approach this task by adding separate +relationship modules or decoders to existing object detection architectures. +This separation increases complexity and hinders end-to-end training, which +limits performance. We propose a simple and highly efficient decoder-free +architecture for open-vocabulary visual relationship detection. Our model +consists of a Transformer-based image encoder that represents objects as tokens +and models their relationships implicitly. To extract relationship information, +we introduce an attention mechanism that selects object pairs likely to form a +relationship. We provide a single-stage recipe to train this model on a mixture +of object and relationship detection data. Our approach achieves +state-of-the-art relationship detection performance on Visual Genome and on the +large-vocabulary GQA benchmark at real-time inference speeds. We provide +analyses of zero-shot performance, ablations, and real-world qualitative +examples. + +
+
+
+
+
+ + ☆ A Framework for Portrait Stylization with Skin-Tone Awareness and Nudity + Identification ICASSP 2024 + + +
+ Portrait stylization is a challenging task involving the transformation of an +input portrait image into a specific style while preserving its inherent +characteristics. The recent introduction of Stable Diffusion (SD) has +significantly improved the quality of outcomes in this field. However, a +practical stylization framework that can effectively filter harmful input +content and preserve the distinct characteristics of an input, such as +skin-tone, while maintaining the quality of stylization remains lacking. These +challenges have hindered the wide deployment of such a framework. To address +these issues, this study proposes a portrait stylization framework that +incorporates a nudity content identification module (NCIM) and a +skin-tone-aware portrait stylization module (STAPSM). In experiments, NCIM +showed good performance in enhancing explicit content filtering, and STAPSM +accurately represented a diverse range of skin tones. Our proposed framework +has been successfully deployed in practice, and it has effectively satisfied +critical requirements of real-world applications. + +
+
+ comment: Accepted to ICASSP 2024 +
+
+
+
+
+ + ☆ Diffusion Models with Ensembled Structure-Based Anomaly Scoring for + Unsupervised Anomaly Detection + + +
+ Supervised deep learning techniques show promise in medical image analysis. +However, they require comprehensive annotated data sets, which poses +challenges, particularly for rare diseases. Consequently, unsupervised anomaly +detection (UAD) emerges as a viable alternative for pathology segmentation, as +only healthy data is required for training. However, recent UAD anomaly scoring +functions often focus on intensity only and neglect structural differences, +which impedes the segmentation performance. This work investigates the +potential of Structural Similarity (SSIM) to bridge this gap. SSIM captures +both intensity and structural disparities and can be advantageous over the +classical $l1$ error. However, we show that there is more than one optimal +kernel size for the SSIM calculation for different pathologies. Therefore, we +investigate an adaptive ensembling strategy for various kernel sizes to offer a +more pathology-agnostic scoring mechanism. We demonstrate that this ensembling +strategy can enhance the performance of DMs and mitigate the sensitivity to +different kernel sizes across varying pathologies, highlighting its promise for +brain MRI anomaly detection. + +
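+ A rough sketch of the kernel-size ensembling idea (the window sizes and the
+plain averaging here are assumptions; the paper's adaptive strategy is more
+involved):
+
+```python
+import numpy as np
+from skimage.metrics import structural_similarity
+
+def ensembled_ssim_anomaly_map(image, reconstruction, win_sizes=(7, 11, 15)):
+    maps = []
+    for w in win_sizes:
+        _, ssim_map = structural_similarity(
+            image, reconstruction, win_size=w, data_range=1.0, full=True
+        )
+        maps.append(1.0 - ssim_map)  # high value = structurally dissimilar
+    return np.mean(maps, axis=0)
+
+rng = np.random.default_rng(0)
+image = rng.random((64, 64))
+reconstruction = image.copy()                      # "healthy" reconstruction
+image[20:30, 20:30] = np.clip(image[20:30, 20:30] + 0.5, 0, 1)  # simulated lesion
+amap = ensembled_ssim_anomaly_map(image, reconstruction)
+print(amap[22:28, 22:28].mean() > amap[:10, :10].mean())  # lesion scores higher
+```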
+
+ comment: Accepted at IEEE ISBI 2024 +
+
+
+
+
+ + ☆ LayoutLLM: Large Language Model Instruction Tuning for Visually Rich + Document Understanding LREC + + +
+ This paper proposes LayoutLLM, a more flexible document analysis method for +understanding imaged documents. Visually Rich Document Understanding tasks, +such as document image classification and information extraction, have gained +significant attention due to their importance. Existing methods have been +developed to enhance document comprehension by incorporating pre-training +awareness of images, text, and layout structure. However, these methods require +fine-tuning for each task and dataset, and the models are expensive to train +and operate. To overcome this limitation, we propose a new LayoutLLM that +integrates these with large-scale language models (LLMs). By leveraging the +strengths of existing research in document image understanding and LLMs' +superior language understanding capabilities, the proposed model, fine-tuned +with multimodal instruction datasets, performs an understanding of document +images in a single model. Our experiments demonstrate improvement over the +baseline model in various document analysis tasks. + +
+
+ comment: LREC-COLING 2024 +
+
+
+
+
+ + ☆ Safeguarding Medical Image Segmentation Datasets against Unauthorized + Training via Contour- and Texture-Aware Perturbations + + +
+ The widespread availability of publicly accessible medical images has +significantly propelled advancements in various research and clinical fields. +Nonetheless, concerns regarding unauthorized training of AI systems for +commercial purposes and the duties of patient privacy protection have led +numerous institutions to hesitate to share their images. This is particularly +true for medical image segmentation (MIS) datasets, where the processes of +collection and fine-grained annotation are time-intensive and laborious. +Recently, Unlearnable Examples (UEs) methods have shown the potential to +protect images by adding invisible shortcuts. These shortcuts can prevent +unauthorized deep neural networks from generalizing. However, existing UEs are +designed for natural image classification and fail to protect MIS datasets +imperceptibly as their protective perturbations are less learnable than +important prior knowledge in MIS, e.g., contour and texture features. To this +end, we propose an Unlearnable Medical image generation method, termed UMed. +UMed integrates the prior knowledge of MIS by injecting contour- and +texture-aware perturbations to protect images. Given that our target is to only +poison features critical to MIS, UMed requires only minimal perturbations +within the ROI and its contour to achieve greater imperceptibility (average +PSNR is 50.03) and protective performance (clean average DSC degrades from +82.18% to 6.80%). + +
+
+
+
+
+ + ☆ ResNet101 and DAE for Enhance Quality and Classification Accuracy in + Skin Cancer Imaging + + +
+ Skin cancer is a crucial health issue that requires timely detection for
+higher survival rates. Traditional computer vision techniques face challenges
+in addressing the advanced variability of skin lesion features, a gap partially
+bridged by convolutional neural networks (CNNs). To overcome the existing
+issues, we introduce an innovative convolutional ensemble network approach that
+combines a deep autoencoder (DAE) with ResNet101. This method utilizes
+convolution-based deep neural networks for the detection of skin cancer. The
+public ISIC-2018 dataset is used for the experiments, which demonstrate
+remarkable performance across several metrics. The method achieves 96.03%
+accuracy, 95.40% precision, 96.05% recall, an F-measure of 0.9576, and an AUC
+of 0.98.
+
+
+
+ comment: 6 Pages; 14 figures; 3 tables +
+
+
+
+
+ + ☆ Isotropic Gaussian Splatting for Real-Time Radiance Field Rendering + + +
+ The 3D Gaussian splatting method has drawn a lot of attention thanks to its
+high training performance and the high quality of the rendered images. However,
+it uses anisotropic Gaussian kernels to represent the scene. Although such
+anisotropic kernels have advantages in representing the geometry, they lead to
+computational difficulties, such as splitting or merging two kernels. In this
+paper, we propose to use isotropic Gaussian kernels to avoid such difficulties,
+leading to a higher-performance method. The experiments confirm that the
+proposed method is about 100X faster without losing the geometry representation
+accuracy. The proposed method can be applied in a wide range of applications
+where the radiance field is needed, such as 3D reconstruction, view synthesis,
+and dynamic object modeling.
+
+
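+ The computational contrast between the two kernel types can be seen in a tiny
+density-evaluation sketch (illustrative only; this is not the paper's
+renderer):
+
+```python
+import numpy as np
+
+def anisotropic_gaussian(x, mu, cov):
+    d = x - mu
+    return np.exp(-0.5 * d @ np.linalg.inv(cov) @ d)
+
+def isotropic_gaussian(x, mu, sigma):
+    # A single scalar scale: no covariance inverse, and kernels are trivially
+    # comparable, which simplifies operations such as splitting or merging.
+    d = x - mu
+    return np.exp(-0.5 * (d @ d) / sigma ** 2)
+
+x, mu = np.array([0.3, 0.1, -0.2]), np.zeros(3)
+print(anisotropic_gaussian(x, mu, 0.2 ** 2 * np.eye(3)))
+print(isotropic_gaussian(x, mu, 0.2))  # identical when cov = sigma^2 * I
+```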
+
+
+
+
+ + ☆ Dermacen Analytica: A Novel Methodology Integrating Multi-Modal Large + Language Models with Machine Learning in tele-dermatology + + +
+ The rise of Artificial Intelligence creates great promise in the field of
+medical discovery, diagnostics and patient management. However, the vast
+complexity of all medical domains requires a more complex approach that
+combines machine learning algorithms, classifiers, segmentation algorithms and,
+lately, large language models. In this paper, we describe, implement and assess
+an Artificial Intelligence-empowered system and methodology aimed at assisting
+the diagnosis of skin lesions and other skin conditions within the field of
+dermatology, holistically addressing the diagnostic process in this domain. The
+workflow integrates large language models, transformer-based vision models and
+sophisticated machine learning tools. This holistic approach achieves a nuanced
+interpretation of dermatological conditions that simulates and facilitates a
+dermatologist's workflow. We assess our proposed methodology through a thorough
+cross-model validation technique embedded in an evaluation pipeline that
+utilizes publicly available medical case studies of skin conditions and
+relevant images. To quantitatively score the system performance, advanced
+machine learning and natural language processing tools are employed which focus
+on similarity comparison and natural language inference. Additionally, we
+incorporate a human expert evaluation process based on a structured checklist
+to further validate our results. We implemented the proposed methodology in a
+system which achieved approximate (weighted) scores of 0.87 for both contextual
+understanding and diagnostic accuracy, demonstrating the efficacy of our
+approach in enhancing dermatological analysis. The proposed methodology is
+expected to prove useful in the development of next-generation tele-dermatology
+applications, enhancing remote consultation capabilities and access to care,
+especially in underserved areas.
+
+
+
+
+
+
+ + ☆ Weak Supervision with Arbitrary Single Frame for Micro- and + Macro-expression Spotting + + +
+ Frame-level micro- and macro-expression spotting methods require
+time-consuming frame-by-frame observation during annotation. Meanwhile,
+video-level spotting lacks sufficient information about the location and number
+of expressions during training, resulting in significantly inferior performance
+compared with fully-supervised spotting. To bridge this gap, we propose a
+point-level weakly-supervised expression spotting (PWES) framework, where each
+expression needs to be annotated with only one random frame (i.e., a point). To
+mitigate the issue of sparse label distribution, the prevailing solution is
+pseudo-label mining, which, however, introduces new problems: localizing
+contextual background snippets results in inaccurate boundaries, and discarding
+foreground snippets leads to fragmentary predictions. Therefore, we design the
+strategies of multi-refined pseudo label generation (MPLG) and
+distribution-guided feature contrastive learning (DFCL) to address these
+problems. Specifically, MPLG generates more reliable pseudo labels by merging
+class-specific probabilities, attention scores, fused features, and point-level
+labels. DFCL is utilized to enhance feature similarity for the same categories
+and feature variability for different categories while capturing global
+representations across entire datasets. Extensive experiments on the CAS(ME)^2,
+CAS(ME)^3, and SAMM-LV datasets demonstrate that PWES achieves promising
+performance comparable to that of recent fully-supervised methods.
+
+
+
+
+
+
+ + ☆ RG-CAT: Detection Pipeline and Catalogue of Radio Galaxies in the EMU + Pilot Survey + + +
+ We present source detection and catalogue construction pipelines to build the +first catalogue of radio galaxies from the 270 $\rm deg^2$ pilot survey of the +Evolutionary Map of the Universe (EMU-PS) conducted with the Australian Square +Kilometre Array Pathfinder (ASKAP) telescope. The detection pipeline uses +Gal-DINO computer-vision networks (Gupta et al., 2024) to predict the +categories of radio morphology and bounding boxes for radio sources, as well as +their potential infrared host positions. The Gal-DINO network is trained and +evaluated on approximately 5,000 visually inspected radio galaxies and their +infrared hosts, encompassing both compact and extended radio morphologies. We +find that the Intersection over Union (IoU) for the predicted and ground truth +bounding boxes is larger than 0.5 for 99% of the radio sources, and 98% of +predicted host positions are within $3^{\prime \prime}$ of the ground truth +infrared host in the evaluation set. The catalogue construction pipeline uses +the predictions of the trained network on the radio and infrared image cutouts +based on the catalogue of radio components identified using the Selavy source +finder algorithm. Confidence scores of the predictions are then used to +prioritize Selavy components with higher scores and incorporate them first into +the catalogue. This results in identifications for a total of 211,625 radio +sources, with 201,211 classified as compact and unresolved. The remaining +10,414 are categorized as extended radio morphologies, including 582 FR-I, +5,602 FR-II, 1,494 FR-x (uncertain whether FR-I or FR-II), 2,375 R (single-peak +resolved) radio galaxies, and 361 with peculiar and other rare morphologies. We +cross-match the radio sources in the catalogue with the infrared and optical +catalogues, finding infrared cross-matches for 73% and photometric redshifts +for 36% of the radio galaxies. + +
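+ For reference, the IoU criterion quoted above for axis-aligned bounding boxes
+is the standard one, sketched below (purely illustrative; the box format is
+assumed to be (x_min, y_min, x_max, y_max)):
+
+```python
+def iou(box_a, box_b):
+    """Intersection-over-Union of two axis-aligned boxes (x0, y0, x1, y1)."""
+    ax0, ay0, ax1, ay1 = box_a
+    bx0, by0, bx1, by1 = box_b
+    ix0, iy0 = max(ax0, bx0), max(ay0, by0)
+    ix1, iy1 = min(ax1, bx1), min(ay1, by1)
+    inter = max(0.0, ix1 - ix0) * max(0.0, iy1 - iy0)
+    union = (ax1 - ax0) * (ay1 - ay0) + (bx1 - bx0) * (by1 - by0) - inter
+    return inter / union if union > 0 else 0.0
+
+predicted, ground_truth = (10, 10, 50, 50), (12, 8, 52, 48)
+print(iou(predicted, ground_truth) > 0.5)  # True: a match under the 0.5 cut
+```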
+
+ comment: Accepted for publication in PASA. The paper has 22 pages, 12 figures + and 5 tables +
+
+
+
+
+ + ☆ SoftPatch: Unsupervised Anomaly Detection with Noisy Data + + +
+ Although mainstream unsupervised anomaly detection (AD) algorithms perform +well in academic datasets, their performance is limited in practical +application due to the ideal experimental setting of clean training data. +Training with noisy data is an inevitable problem in real-world anomaly +detection but is seldom discussed. This paper considers label-level noise in +image sensory anomaly detection for the first time. To solve this problem, we +proposed a memory-based unsupervised AD method, SoftPatch, which efficiently +denoises the data at the patch level. Noise discriminators are utilized to +generate outlier scores for patch-level noise elimination before coreset +construction. The scores are then stored in the memory bank to soften the +anomaly detection boundary. Compared with existing methods, SoftPatch maintains +a strong modeling ability of normal data and alleviates the overconfidence +problem in coreset. Comprehensive experiments in various noise scenes +demonstrate that SoftPatch outperforms the state-of-the-art AD methods on the +MVTecAD and BTAD benchmarks and is comparable to those methods under the +setting without noise. + +
+
+ comment: 36th Conference on Neural Information Processing Systems +
+
+
+
+
+ + ☆ Toward Multi-class Anomaly Detection: Exploring Class-aware Unified + Model against Inter-class Interference + + +
+ In the context of high usability in single-class anomaly detection models, +recent academic research has become concerned about the more complex +multi-class anomaly detection. Although several papers have designed unified +models for this task, they often overlook the utility of class labels, a potent +tool for mitigating inter-class interference. To address this issue, we +introduce a Multi-class Implicit Neural representation Transformer for unified +Anomaly Detection (MINT-AD), which leverages the fine-grained category +information in the training stage. By learning the multi-class distributions, +the model generates class-aware query embeddings for the transformer decoder, +mitigating inter-class interference within the reconstruction model. Utilizing +such an implicit neural representation network, MINT-AD can project category +and position information into a feature embedding space, further supervised by +classification and prior probability loss functions. Experimental results on +multiple datasets demonstrate that MINT-AD outperforms existing unified +training models. + +
+
+
+
+
+ + ☆ Unsupervised Audio-Visual Segmentation with Modality Alignment + + +
+ Audio-Visual Segmentation (AVS) aims to identify, at the pixel level, the +object in a visual scene that produces a given sound. Current AVS methods rely +on costly fine-grained annotations of mask-audio pairs, making them impractical +for scalability. To address this, we introduce unsupervised AVS, eliminating +the need for such expensive annotation. To tackle this more challenging +problem, we propose an unsupervised learning method, named Modality +Correspondence Alignment (MoCA), which seamlessly integrates off-the-shelf +foundation models like DINO, SAM, and ImageBind. This approach leverages their +knowledge complementarity and optimizes their joint usage for multi-modality +association. Initially, we estimate positive and negative image pairs in the +feature space. For pixel-level association, we introduce an audio-visual +adapter and a novel pixel matching aggregation strategy within the image-level +contrastive learning framework. This allows for a flexible connection between +object appearance and audio signal at the pixel level, with tolerance to +imaging variations such as translation and rotation. Extensive experiments on +the AVSBench (single and multi-object splits) and AVSS datasets demonstrate +that our MoCA outperforms strongly designed baseline methods and approaches +supervised counterparts, particularly in complex scenarios with multiple +auditory objects. Notably when comparing mIoU, MoCA achieves a substantial +improvement over baselines in both the AVSBench (S4: +17.24%; MS3: +67.64%) and +AVSS (+19.23%) audio-visual segmentation challenges. + +
+
+
+
+
+ + ☆ Debiasing surgeon: fantastic weights and how to find them + + +
+ The emergence of algorithmic biases that can lead to unfair models is an
+ever-growing concern. Several debiasing approaches have been proposed in the
+realm of deep learning, employing more or less sophisticated techniques to
+discourage deep models from relying heavily on these biases. However, a
+question emerges: is this extra complexity really necessary? Does a
+vanilla-trained model already embody some ``unbiased sub-networks'' that can be
+used in isolation and provide a solution without relying on the algorithmic
+biases? In this work, we show that such a sub-network typically exists, and can
+be extracted from a vanilla-trained model without requiring additional
+training. We further validate that such a specific architecture is incapable of
+learning a specific bias, suggesting that there are possible architectural
+countermeasures to the problem of biases in deep neural networks.
+
+
+
+
+
+
+ + ☆ Unleashing Unlabeled Data: A Paradigm for Cross-View Geo-Localization CVPR2024 + + +
+ This paper investigates the effective utilization of unlabeled data for
+large-area cross-view geo-localization (CVGL), encompassing both unsupervised
+and semi-supervised settings. Common approaches to CVGL rely on
+ground-satellite image pairs and employ label-driven supervised training.
+However, the cost of collecting precise cross-view image pairs hinders the
+deployment of CVGL in real-life scenarios. Without such pairs, it becomes more
+challenging for CVGL to handle the significant imaging and spatial gaps between
+ground and satellite images. To this end, we propose an unsupervised framework
+including a cross-view projection to guide the model for retrieving initial
+pseudo-labels and a fast re-ranking mechanism to refine the pseudo-labels by
+leveraging the fact that ``the perfectly paired ground-satellite image is
+located in a unique and identical scene''. The framework exhibits competitive
+performance compared with supervised works on three open-source benchmarks. Our
+code and models will be released on https://github.com/liguopeng0923/UCVGL.
+
+
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ PECI-Net: Bolus segmentation from video fluoroscopic swallowing study + images using preprocessing ensemble and cascaded inference + + +
+ Bolus segmentation is crucial for the automated detection of swallowing +disorders in videofluoroscopic swallowing studies (VFSS). However, it is +difficult for the model to accurately segment a bolus region in a VFSS image +because VFSS images are translucent, have low contrast and unclear region +boundaries, and lack color information. To overcome these challenges, we +propose PECI-Net, a network architecture for VFSS image analysis that combines +two novel techniques: the preprocessing ensemble network (PEN) and the cascaded +inference network (CIN). PEN enhances the sharpness and contrast of the VFSS +image by combining multiple preprocessing algorithms in a learnable way. CIN +reduces ambiguity in bolus segmentation by using context from other regions +through cascaded inference. Moreover, CIN prevents undesirable side effects +from unreliably segmented regions by referring to the context in an asymmetric +way. In experiments, PECI-Net exhibited higher performance than four recently +developed baseline models, outperforming TernausNet, the best among the +baseline models, by 4.54\% and the widely used UNet by 10.83\%. The results of +the ablation studies confirm that CIN and PEN are effective in improving bolus +segmentation performance. + +
+
+ comment: 20 pages, 8 figures, +
+
+
+
+
+ + ☆ StyleCineGAN: Landscape Cinemagraph Generation using a Pre-trained + StyleGAN + + +
+ We propose a method that can generate cinemagraphs automatically from a still +landscape image using a pre-trained StyleGAN. Inspired by the success of recent +unconditional video generation, we leverage a powerful pre-trained image +generator to synthesize high-quality cinemagraphs. Unlike previous approaches +that mainly utilize the latent space of a pre-trained StyleGAN, our approach +utilizes its deep feature space for both GAN inversion and cinemagraph +generation. Specifically, we propose multi-scale deep feature warping (MSDFW), +which warps the intermediate features of a pre-trained StyleGAN at different +resolutions. By using MSDFW, the generated cinemagraphs are of high resolution +and exhibit plausible looping animation. We demonstrate the superiority of our +method through user studies and quantitative comparisons with state-of-the-art +cinemagraph generation methods and a video generation method that uses a +pre-trained StyleGAN. + +
+
+ comment: Project website: https://jeolpyeoni.github.io/stylecinegan_project/ +
+
+
+
+
+ + ☆ OTSeg: Multi-prompt Sinkhorn Attention for Zero-Shot Semantic + Segmentation + + +
+ The recent success of CLIP has demonstrated promising results in zero-shot
+semantic segmentation by transferring multimodal knowledge to pixel-level
+classification. However, leveraging pre-trained CLIP knowledge to closely align
+text embeddings with pixel embeddings still has limitations in existing
+approaches. To address this issue, we propose OTSeg, a novel multimodal
+attention mechanism aimed at enhancing the potential of multiple text prompts
+for matching associated pixel embeddings. We first propose Multi-Prompts
+Sinkhorn (MPS) based on the Optimal Transport (OT) algorithm, which leads
+multiple text prompts to selectively focus on various semantic features within
+image pixels. Moreover, inspired by the success of Sinkformers in unimodal
+settings, we introduce an extension of MPS, called Multi-Prompts Sinkhorn
+Attention (MPSA), which effectively replaces cross-attention mechanisms within
+the Transformer framework in multimodal settings. Through extensive
+experiments, we demonstrate that OTSeg achieves state-of-the-art (SOTA)
+performance with significant gains on Zero-Shot Semantic Segmentation (ZS3)
+tasks across three benchmark datasets.
+
+
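+ To make the Sinkhorn ingredient concrete, here is a generic entropic-OT
+iteration between text-prompt and pixel embeddings (a textbook sketch with
+uniform marginals; OTSeg's exact formulation, cost, and normalization may
+differ):
+
+```python
+import torch
+
+def sinkhorn(cost, n_iters=50, eps=0.05):
+    """Entropic optimal transport with uniform marginals; returns a (P, N) plan."""
+    P, N = cost.shape
+    K = torch.exp(-cost / eps)
+    r = torch.full((P,), 1.0 / P)   # marginal over prompts
+    c = torch.full((N,), 1.0 / N)   # marginal over pixels
+    v = torch.ones(N) / N
+    for _ in range(n_iters):
+        u = r / (K @ v)
+        v = c / (K.T @ u)
+    return u[:, None] * K * v[None, :]
+
+prompts = torch.nn.functional.normalize(torch.randn(4, 512), dim=-1)    # P text prompts
+pixels = torch.nn.functional.normalize(torch.randn(1024, 512), dim=-1)  # N pixel embeddings
+plan = sinkhorn(1.0 - prompts @ pixels.T)   # cosine distance as the transport cost
+print(plan.shape, float(plan.sum()))        # total mass ~1, split over prompt-pixel pairs
+```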
+
+ comment: 22 pages, 7 figures +
+
+
+
+
+ + ☆ Unified Static and Dynamic Network: Efficient Temporal Filtering for + Video Grounding + + +
+ Inspired by the activity-silent and persistent activity mechanisms in human +visual perception biology, we design a Unified Static and Dynamic Network +(UniSDNet), to learn the semantic association between the video and text/audio +queries in a cross-modal environment for efficient video grounding. For static +modeling, we devise a novel residual structure (ResMLP) to boost the global +comprehensive interaction between the video segments and queries, achieving +more effective semantic enhancement/supplement. For dynamic modeling, we +effectively exploit three characteristics of the persistent activity mechanism +in our network design for a better video context comprehension. Specifically, +we construct a diffusely connected video clip graph on the basis of 2D sparse +temporal masking to reflect the "short-term effect" relationship. We +innovatively consider the temporal distance and relevance as the joint +"auxiliary evidence clues" and design a multi-kernel Temporal Gaussian Filter +to expand the context clue into high-dimensional space, simulating the "complex +visual perception", and then conduct element level filtering convolution +operations on neighbour clip nodes in message passing stage for finally +generating and ranking the candidate proposals. Our UniSDNet is applicable to +both Natural Language Video Grounding (NLVG) and Spoken Language Video +Grounding (SLVG) tasks. Our UniSDNet achieves SOTA performance on three widely +used datasets for NLVG, as well as three datasets for SLVG, e.g., reporting new +records at 38.88% R@1,IoU@0.7 on ActivityNet Captions and 40.26% R@1,IoU@0.5 on +TACoS. To facilitate this field, we collect two new datasets (Charades-STA +Speech and TACoS Speech) for SLVG task. Meanwhile, the inference speed of our +UniSDNet is 1.56$\times$ faster than the strong multi-query benchmark. Code is +available at: https://github.com/xian-sh/UniSDNet. + +
+
+
+
+
+ + ☆ Mini-Splatting: Representing Scenes with a Constrained Number of + Gaussians + + +
+ In this study, we explore the challenge of efficiently representing scenes +with a constrained number of Gaussians. Our analysis shifts from traditional +graphics and 2D computer vision to the perspective of point clouds, +highlighting the inefficient spatial distribution of Gaussian representation as +a key limitation in model performance. To address this, we introduce strategies +for densification including blur split and depth reinitialization, and +simplification through Gaussian binarization and sampling. These techniques +reorganize the spatial positions of the Gaussians, resulting in significant +improvements across various datasets and benchmarks in terms of rendering +quality, resource consumption, and storage compression. Our proposed +Mini-Splatting method integrates seamlessly with the original rasterization +pipeline, providing a strong baseline for future research in +Gaussian-Splatting-based works. + +
+
+
+
+
+ + ☆ Leveraging Large Language Model-based Room-Object Relationships + Knowledge for Enhancing Multimodal-Input Object Goal Navigation + + +
+ Object-goal navigation is a crucial engineering task for the community of +embodied navigation; it involves navigating to an instance of a specified +object category within unseen environments. Although extensive investigations +have been conducted on both end-to-end and modular-based, data-driven +approaches, fully enabling an agent to comprehend the environment through +perceptual knowledge and perform object-goal navigation as efficiently as +humans remains a significant challenge. Recently, large language models have +shown potential in this task, thanks to their powerful capabilities for +knowledge extraction and integration. In this study, we propose a data-driven, +modular-based approach, trained on a dataset that incorporates common-sense +knowledge of object-to-room relationships extracted from a large language +model. We utilize the multi-channel Swin-Unet architecture to conduct +multi-task learning incorporating with multimodal inputs. The results in the +Habitat simulator demonstrate that our framework outperforms the baseline by an +average of 10.6% in the efficiency metric, Success weighted by Path Length +(SPL). The real-world demonstration shows that the proposed approach can +efficiently conduct this task by traversing several rooms. For more details and +real-world demonstrations, please check our project webpage +(https://sunleyuan.github.io/ObjectNav). + +
+
+ comment: will soon submit to the Elsevier journal, Advanced Engineering + Informatics +
+
+
+
+
+ + ☆ Volumetric Environment Representation for Vision-Language Navigation CVPR 2024 + + +
+ Vision-language navigation (VLN) requires an agent to navigate through a 3D
+environment based on visual observations and natural language instructions. It
+is clear that the pivotal factor for successful navigation lies in
+comprehensive scene understanding. Previous VLN agents employ monocular
+frameworks to extract 2D features of perspective views directly. Though
+straightforward, they struggle to capture 3D geometry and semantics, leading to
+a partial and incomplete environment representation. To achieve a comprehensive
+3D representation with fine-grained details, we introduce a Volumetric
+Environment Representation (VER), which voxelizes the physical world into
+structured 3D cells. For each cell, VER aggregates multi-view 2D features into
+such a unified 3D space via 2D-3D sampling. Through coarse-to-fine feature
+extraction and multi-task learning for VER, our agent predicts 3D occupancy, 3D
+room layout, and 3D bounding boxes jointly. Based on online collected VERs, our
+agent performs volume state estimation and builds episodic memory for
+predicting the next step. Experimental results show that our environment
+representations from multi-task learning lead to evident performance gains on
+VLN. Our model achieves state-of-the-art performance across VLN benchmarks
+(R2R, REVERIE, and R4R).
+
+
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Harmonizing Visual and Textual Embeddings for Zero-Shot Text-to-Image + Customization + + +
+ Amid a surge of text-to-image (T2I) models and their customization methods
+that generate new images of a user-provided subject, current works focus on
+alleviating the costs incurred by a lengthy per-subject optimization. These
+zero-shot customization methods encode the image of a specified subject into a
+visual embedding which is then utilized alongside the textual embedding for
+diffusion guidance. The visual embedding incorporates intrinsic information
+about the subject, while the textual embedding provides a new, transient
+context. However, the existing methods often 1) are significantly affected by
+the input images, e.g., generating images with the same pose, and 2) exhibit
+deterioration in the subject's identity. We first pin down the problem and show
+that redundant pose information in the visual embedding interferes with the
+textual embedding containing the desired pose information. To address this
+issue, we propose an orthogonal visual embedding which effectively harmonizes
+with the given textual embedding. We also adopt the visual-only embedding and
+inject the subject's clear features utilizing a self-attention swap. Our
+results demonstrate the effectiveness and robustness of our method, which
+offers highly flexible zero-shot generation while effectively maintaining the
+subject's identity.
+
+
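+ One simple way to make a visual embedding orthogonal to a text embedding is a
+per-sample projection removal, sketched below (an assumption for illustration;
+the paper's actual formulation may differ):
+
+```python
+import torch
+
+def orthogonalize(visual, text, eps=1e-8):
+    """Remove from `visual` its component along `text`, sample by sample."""
+    text_dir = text / (text.norm(dim=-1, keepdim=True) + eps)
+    proj = (visual * text_dir).sum(dim=-1, keepdim=True) * text_dir
+    return visual - proj
+
+visual = torch.randn(2, 768)   # stand-ins for subject (visual) embeddings
+text = torch.randn(2, 768)     # stand-ins for textual embeddings
+ortho = orthogonalize(visual, text)
+print((ortho * text).sum(dim=-1))  # ~0: no remaining overlap with the text embedding
+```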
+
+ comment: Project page: https://ldynx.github.io/harmony-zero-t2i/ +
+
+
+
+
+ + ☆ Efficient Video Diffusion Models via Content-Frame Motion-Latent + Decomposition ICLR 2024 + + +
+ Video diffusion models have recently made great progress in generation
+quality, but are still limited by their high memory and computational
+requirements. This is because current video diffusion models often attempt to
+process high-dimensional videos directly. To tackle this issue, we propose the
+content-motion latent diffusion model (CMD), a novel efficient extension of
+pretrained image diffusion models for video generation. Specifically, we
+propose an autoencoder that succinctly encodes a video as a combination of a
+content frame (like an image) and a low-dimensional motion latent
+representation. The former represents the common content, while the latter
+represents the underlying motion in the video. We generate the content frame by
+fine-tuning a pretrained image diffusion model, and we generate the motion
+latent representation by training a new lightweight diffusion model. A key
+innovation here is the design of a compact latent space that can directly
+utilize a pretrained image diffusion model, which has not been done in previous
+latent video diffusion models. This leads to considerably better quality
+generation and reduced computational costs. For instance, CMD can sample a
+video 7.7$\times$ faster than prior approaches by generating a video of
+512$\times$1024 resolution and length 16 in 3.1 seconds. Moreover, CMD achieves
+an FVD score of 212.7 on WebVid-10M, 27.3% better than the previous
+state-of-the-art of 292.4.
+
+
+
+ comment: ICLR 2024. Project page: https://sihyun.me/CMD +
+
+
+
+
+ + ☆ Empowering Segmentation Ability to Multi-modal Large Language Models + + +
+ Multi-modal large language models (MLLMs) can understand image-language +prompts and demonstrate impressive reasoning ability. In this paper, we extend +MLLMs' output by empowering MLLMs with the segmentation ability. The extended +MLLMs can both output language responses to the image-language prompts and +segment the regions that the complex question or query in the language prompts +focuses on. To this end, the existing work, LISA, enlarges the original word +embeddings with an additional segment token and fine-tunes dialogue generation +and query-focused segmentation together, where the feature of the segment token +is used to prompt the segment-anything model. Although they achieve superior +segmentation performance, we observe that the dialogue ability decreases by a +large margin compared to the original MLLMs. To maintain the original MLLMs' +dialogue ability, we propose a novel MLLMs framework, coined as LLaVASeg, which +leverages a chain-of-thought prompting strategy to instruct the MLLMs to +segment the target region queried by the user. The MLLMs are first prompted to +reason about the simple description of the target region from the complicated +user query, then extract the visual attributes of the target region according +to the understanding of MLLMs to the image. These visual attributes, such as +color and relative locations, are utilized to prompt the downstream +segmentation model. Experiments show that the proposed method keeps the +original dialogue ability and equips the MLLMs' model with strong reasoning +segmentation ability. The code is available at +https://github.com/YuqiYang213/LLaVASeg. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ Learning Decomposable and Debiased Representations via Attribute-Centric + Information Bottlenecks + + +
+ Biased attributes, spuriously correlated with target labels in a dataset, can +problematically lead to neural networks that learn improper shortcuts for +classifications and limit their capabilities for out-of-distribution (OOD) +generalization. Although many debiasing approaches have been proposed to ensure +correct predictions from biased datasets, few studies have considered learning +latent embedding consisting of intrinsic and biased attributes that contribute +to improved performance and explain how the model pays attention to attributes. +In this paper, we propose a novel debiasing framework, Debiasing Global +Workspace, introducing attention-based information bottlenecks for learning +compositional representations of attributes without defining specific bias +types. Based on our observation that learning shape-centric representation +helps robust performance on OOD datasets, we adopt those abilities to learn +robust and generalizable representations of decomposable latent embeddings +corresponding to intrinsic and biasing attributes. We conduct comprehensive +evaluations on biased datasets, along with both quantitative and qualitative +analyses, to showcase our approach's efficacy in attribute-centric +representation learning and its ability to differentiate between intrinsic and +bias-related features. + +
+
+ comment: 24 pages, 16 figures, 3 tables +
+
+
+
+
+ + ☆ Evidential Semantic Mapping in Off-road Environments with + Uncertainty-aware Bayesian Kernel Inference + + +
+ Robotic mapping with Bayesian Kernel Inference (BKI) has shown promise in +creating semantic maps by effectively leveraging local spatial information. +However, existing semantic mapping methods face challenges in constructing +reliable maps in unstructured outdoor scenarios due to unreliable semantic +predictions. To address this issue, we propose an evidential semantic mapping, +which can enhance reliability in perceptually challenging off-road +environments. We integrate Evidential Deep Learning into the semantic +segmentation network to obtain the uncertainty estimate of semantic prediction. +Subsequently, this semantic uncertainty is incorporated into an +uncertainty-aware BKI, tailored to prioritize more confident semantic +predictions when accumulating semantic information. By adaptively handling +semantic uncertainties, the proposed framework constructs robust +representations of the surroundings even in previously unseen environments. +Comprehensive experiments across various off-road datasets demonstrate that our +framework enhances accuracy and robustness, consistently outperforming existing +methods in scenes with high perceptual uncertainties. + +
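+ As background for the uncertainty estimate mentioned above, the usual
+evidential (Dirichlet-based) formulation looks like this (a generic sketch;
+whether the paper uses exactly this parameterization is not assumed):
+
+```python
+import torch
+
+def dirichlet_uncertainty(logits):
+    """Map network outputs to expected class probabilities and a vacuity score."""
+    evidence = torch.relu(logits)               # non-negative per-class evidence
+    alpha = evidence + 1.0                      # Dirichlet concentration parameters
+    strength = alpha.sum(dim=-1)                # total evidence + num_classes
+    probs = alpha / strength.unsqueeze(-1)      # expected class probabilities
+    uncertainty = logits.shape[-1] / strength   # high when evidence is scarce
+    return probs, uncertainty
+
+confident = torch.tensor([[10.0, 0.0, 0.0, 0.0]])
+unsure = torch.tensor([[0.1, 0.1, 0.1, 0.1]])
+print(dirichlet_uncertainty(confident)[1], dirichlet_uncertainty(unsure)[1])
+```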
+
+ comment: Our project website can be found at + https://kjyoung.github.io/Homepage/#/Projects/Evidential-Semantic-Mapping +
+
+
+
+
+ + ☆ Improving Image Classification Accuracy through Complementary + Intra-Class and Inter-Class Mixup + + +
+ MixUp and its variants, such as Manifold MixUp, have two key limitations in
+image classification tasks. First, they often neglect mixing within the same
+class (intra-class mixup), leading to an underutilization of the relationships
+among samples within the same class. Second, although these methods effectively
+enhance inter-class separability by mixing between different classes
+(inter-class mixup), they fall short in improving intra-class cohesion through
+their mixing operations, limiting their classification performance. To tackle
+these issues, we propose a novel mixup method and a comprehensive integrated
+solution. Our mixup approach specifically targets intra-class mixup, an aspect
+commonly overlooked, to strengthen intra-class cohesion, a feature not provided
+by current mixup techniques. For each mini-batch, our method utilizes feature
+representations of unaugmented original images from each class within the
+mini-batch to generate a single synthesized feature representation through
+random linear interpolation. All synthesized representations for this
+mini-batch are then fed into the classification and loss layers to calculate an
+average classification loss that can markedly enhance intra-class cohesion.
+Moreover, our integrated solution seamlessly combines our intra-class mixup
+method with an existing mixup approach such as MixUp or Manifold MixUp. This
+comprehensive solution incorporates inter- and intra-class mixup in a balanced
+manner while concurrently improving intra-class cohesion and inter-class
+separability. Experimental results on six public datasets demonstrate that our
+integrated solution achieves a 0.1% to 3.43% higher accuracy than the best of
+either MixUp or our intra-class mixup method, averaging a 1.16% gain. It also
+outperforms the better performer of either Manifold MixUp or our intra-class
+mixup method by 0.12% to 5.16%, with an average gain of 1.11%.
+
+
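+ A compact sketch of the per-class interpolation described above (the random
+convex combination and the plain cross-entropy loss below are assumptions; the
+authors' exact sampling and loss weighting are not reproduced):
+
+```python
+import torch
+import torch.nn.functional as F
+
+def intra_class_mixup_loss(features, labels, classifier):
+    """Synthesize one feature per class by randomly interpolating its members."""
+    losses = []
+    for cls in labels.unique():
+        feats = features[labels == cls]
+        if feats.shape[0] < 2:
+            continue                          # nothing to interpolate
+        weights = torch.rand(feats.shape[0])
+        weights = weights / weights.sum()     # random convex combination
+        mixed = (weights[:, None] * feats).sum(dim=0, keepdim=True)
+        losses.append(F.cross_entropy(classifier(mixed), cls.view(1)))
+    return torch.stack(losses).mean()
+
+classifier = torch.nn.Linear(128, 10)
+features = torch.randn(32, 128)               # stand-ins for backbone features
+labels = torch.randint(0, 10, (32,))
+print(intra_class_mixup_loss(features, labels, classifier))
+```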
+
+ comment: 25 pages,12 figures +
+
+
+
+
+ + ☆ Powerful Lossy Compression for Noisy Images ICME 2024 + + +
+ Image compression and denoising represent fundamental challenges in image
+processing with many real-world applications. To address practical demands,
+current solutions can be categorized into two main strategies: 1) sequential
+methods; and 2) joint methods. However, sequential methods have the
+disadvantage of error accumulation, as there is information loss between
+multiple individual models. Recently, the academic community began to make some
+attempts to tackle this problem through end-to-end joint methods. Most of them
+ignore that different regions of noisy images have different characteristics.
+To solve these problems, in this paper, our proposed signal-to-noise ratio
+(SNR) aware joint solution exploits local and non-local features for image
+compression and denoising simultaneously. We design an end-to-end trainable
+network, which includes the main encoder branch, the guidance branch, and the
+SNR-aware branch. We conducted extensive experiments on both synthetic and
+real-world datasets, demonstrating that our joint solution outperforms existing
+state-of-the-art methods.
+
+
+
+ comment: Accepted by ICME 2024
+
+
+
+
+
+ + ☆ 3D Object Detection from Point Cloud via Voting Step Diffusion + + +
+ 3D object detection is a fundamental task in scene understanding. Numerous +research efforts have been dedicated to better incorporate Hough voting into +the 3D object detection pipeline. However, due to the noisy, cluttered, and +partial nature of real 3D scans, existing voting-based methods often receive +votes from the partial surfaces of individual objects together with severe +noises, leading to sub-optimal detection performance. In this work, we focus on +the distributional properties of point clouds and formulate the voting process +as generating new points in the high-density region of the distribution of +object centers. To achieve this, we propose a new method to move random 3D +points toward the high-density region of the distribution by estimating the +score function of the distribution with a noise conditioned score network. +Specifically, we first generate a set of object center proposals to coarsely +identify the high-density region of the object center distribution. To estimate +the score function, we perturb the generated object center proposals by adding +normalized Gaussian noise, and then jointly estimate the score function of all +perturbed distributions. Finally, we generate new votes by moving random 3D +points to the high-density region of the object center distribution according +to the estimated score function. Extensive experiments on two large scale +indoor 3D scene datasets, SUN RGB-D and ScanNet V2, demonstrate the superiority +of our proposed method. The code will be released at +https://github.com/HHrEtvP/DiffVote. + +
+
+
+
+
+ + ☆ Soft Masked Transformer for Point Cloud Processing with Skip + Attention-Based Upsampling + + +
+ Point cloud processing methods leverage local and global point features at
+the feature level to cater to downstream tasks, yet they often overlook the
+task-level context inherent in point clouds during the encoding stage. We argue
+that integrating task-level information into the encoding stage significantly
+enhances performance. To that end, we propose SMTransformer, which incorporates
+task-level information into a vector-based transformer by utilizing a soft mask
+generated from task-level queries and keys to learn the attention weights.
+Additionally, to facilitate effective communication between features from the
+encoding and decoding layers in high-level tasks such as segmentation, we
+introduce a skip-attention-based up-sampling block. This block dynamically
+fuses features from various resolution points across the encoding and decoding
+layers. To mitigate the increase in network parameters and training time
+resulting from the complexity of the aforementioned blocks, we propose a novel
+shared position encoding strategy. This strategy allows various transformer
+blocks to share the same position information over the same resolution points,
+thereby reducing network parameters and training time without compromising
+accuracy. Experimental comparisons with existing methods on multiple datasets
+demonstrate the efficacy of SMTransformer and skip-attention-based up-sampling
+for point cloud processing tasks, including semantic segmentation and
+classification. In particular, we achieve state-of-the-art semantic
+segmentation results of 73.4% mIoU on S3DIS Area 5 and 62.4% mIoU on the SWAN
+dataset.
+
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ☆ External Knowledge Enhanced 3D Scene Generation from Sketch + + +
+ Generating realistic 3D scenes is challenging due to the complexity of room
+layouts and object geometries. We propose a sketch-based knowledge-enhanced
+diffusion architecture (SEK) for generating customized, diverse, and plausible
+3D scenes. SEK conditions the denoising process with a hand-drawn sketch of the
+target scene and cues from an object relationship knowledge base. We first
+construct an external knowledge base containing object relationships and then
+leverage knowledge-enhanced graph reasoning to assist our model in
+understanding hand-drawn sketches. A scene is represented as a combination of
+3D objects and their relationships, and then incrementally diffused to reach a
+Gaussian distribution. We propose a 3D denoising scene transformer that learns
+to reverse the diffusion process, conditioned by a hand-drawn sketch along with
+knowledge cues, to regressively generate the scene including the 3D object
+instances as well as their layout. Experiments on the 3D-FRONT dataset show
+that our model improves FID and CKL by 17.41% and 37.18% in 3D scene generation
+and FID and KID by 19.12% and 20.06% in 3D scene completion compared to the
+nearest competitor DiffuScene.
+
+
+
+
+
+ + ☆ C-TPT: Calibrated Test-Time Prompt Tuning for Vision-Language Models via + Text Feature Dispersion ICLR 2024 + + +
+ In deep learning, test-time adaptation has gained attention as a method for
+model fine-tuning without the need for labeled data. A prime exemplification is
+the recently proposed test-time prompt tuning for large-scale vision-language
+models such as CLIP. Unfortunately, these prompts have been mainly developed to
+improve accuracy, overlooking the importance of calibration, a crucial aspect
+of quantifying prediction uncertainty. However, traditional calibration
+methods rely on substantial amounts of labeled data, making them impractical
+for test-time scenarios. To this end, this paper explores calibration during
+test-time prompt tuning by leveraging the inherent properties of CLIP. Through
+a series of observations, we find that the prompt choice significantly affects
+the calibration in CLIP, where the prompts leading to higher text feature
+dispersion result in better-calibrated predictions. Introducing the Average
+Text Feature Dispersion (ATFD), we establish its relationship with calibration
+error and present a novel method, Calibrated Test-time Prompt Tuning (C-TPT),
+for optimizing prompts at test time with enhanced calibration. Through
+extensive experiments on different CLIP architectures and datasets, we show
+that C-TPT can effectively improve the calibration of test-time prompt tuning
+without needing labeled data.
+
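+ One plausible way to score text feature dispersion in the spirit of ATFD
+(the exact definition is given in the paper and is not reproduced here): take
+the CLIP text embeddings of all class prompts under a candidate prompt and
+measure their average distance to the centroid.
+
+import torch
+
+def text_feature_dispersion(text_features):
+    # text_features: (num_classes, dim) L2-normalized text embeddings for one prompt.
+    centroid = text_features.mean(dim=0, keepdim=True)
+    return (text_features - centroid).norm(dim=1).mean()   # higher = more dispersed
+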
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ Training point-based deep learning networks for forest segmentation with + synthetic data ICPR + + +
+ Remote sensing through unmanned aerial systems (UAS) has been increasing in +forestry in recent years, along with using machine learning for data +processing. Deep learning architectures, extensively applied in natural +language and image processing, have recently been extended to the point cloud +domain. However, the availability of point cloud datasets for training and +testing remains limited. Creating forested environment point cloud datasets is +expensive, requires high-precision sensors, and is time-consuming as manual +point classification is required. Moreover, forest areas could be inaccessible +or dangerous for humans, further complicating data collection. Then, a question +arises whether it is possible to use synthetic data to train deep learning +networks without the need to rely on large volumes of real forest data. To +answer this question, we developed a realistic simulator that procedurally +generates synthetic forest scenes. Thanks to this, we have conducted a +comparative study of different state-of-the-art point-based deep learning +networks for forest segmentation. Using created datasets, we determined the +feasibility of using synthetic data to train deep learning networks to classify +point clouds from real forest datasets. Both the simulator and the datasets are +released as part of this work. + +
+
+ comment: 15 pages, 4 figures. Submitted to the International Conference on + Pattern Recognition (ICPR) 2024 +
+
+
+
+
+ + ☆ Test-time Similarity Modification for Person Re-identification toward + Temporal Distribution Shift IJCNN2024 + + +
+ Person re-identification (re-id), which aims to retrieve images of the same +person in a given image from a database, is one of the most practical image +recognition applications. In the real world, however, the environments that the +images are taken from change over time. This causes a distribution shift +between training and testing and degrades the performance of re-id. To maintain +re-id performance, models should continue adapting to the test environment's +temporal changes. Test-time adaptation (TTA), which aims to adapt models to the +test environment with only unlabeled test data, is a promising way to handle +this problem because TTA can adapt models instantly in the test environment. +However, the previous TTA methods are designed for classification and cannot be +directly applied to re-id. This is because the set of people's identities in +the dataset differs between training and testing in re-id, whereas the set of +classes is fixed in the current TTA methods designed for classification. To +improve re-id performance in changing test environments, we propose TEst-time +similarity Modification for Person re-identification (TEMP), a novel TTA method +for re-id. TEMP is the first fully TTA method for re-id, which does not require +any modification to pre-training. Inspired by TTA methods that refine the +prediction uncertainty in classification, we aim to refine the uncertainty in +re-id. However, the uncertainty cannot be computed in the same way as +classification in re-id since it is an open-set task, which does not share +person labels between training and testing. Hence, we propose re-id entropy, an +alternative uncertainty measure for re-id computed based on the similarity +between the feature vectors. Experiments show that the re-id entropy can +measure the uncertainty on re-id and TEMP improves the performance of re-id in +online settings where the distribution changes over time. + +
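+ A sketch of an entropy measure over query-gallery similarities in the spirit
+of the re-id entropy described above; the softmax over cosine similarities and
+the temperature value are assumptions for illustration.
+
+import torch
+import torch.nn.functional as F
+
+def reid_entropy(query_feats, gallery_feats, temperature=0.1):
+    # query_feats: (Q, D), gallery_feats: (G, D); returns one entropy value per query.
+    q = F.normalize(query_feats, dim=-1)
+    g = F.normalize(gallery_feats, dim=-1)
+    probs = F.softmax(q @ g.t() / temperature, dim=-1)
+    return -(probs * probs.clamp_min(1e-12).log()).sum(dim=-1)
+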
+
+ comment: Accepted to IJCNN2024 +
+
+
+
+
+ + ☆ Spatio-Temporal Proximity-Aware Dual-Path Model for Panoramic Activity + Recognition + + +
+ Panoramic Activity Recognition (PAR) seeks to identify diverse human
+activities across different scales, from individual actions to social group and
+global activities in crowded panoramic scenes. PAR presents two major
+challenges: 1) recognizing the nuanced interactions among numerous individuals
+and 2) understanding multi-granular human activities. To address these, we
+propose the Social Proximity-aware Dual-Path Network (SPDP-Net) based on two
+key design principles. First, while previous works often focus on spatial
+distance among individuals within an image, we argue for considering
+spatio-temporal proximity, which is crucial for individual relation encoding to
+correctly understand social dynamics. Secondly, deviating from existing
+hierarchical approaches (individual-to-social-to-global activity), we introduce
+a dual-path architecture for multi-granular activity recognition. This
+architecture comprises individual-to-global and individual-to-social paths,
+mutually reinforcing each other's task with global-local context through
+multiple layers. Through extensive experiments, we validate the effectiveness
+of the spatio-temporal proximity among individuals and the dual-path
+architecture in PAR. Furthermore, SPDP-Net achieves new state-of-the-art
+performance with a 46.5% overall F1 score on the JRDB-PAR dataset.
+
+
+
+
+
+ + ☆ Existence Is Chaos: Enhancing 3D Human Motion Prediction with + Uncertainty Consideration AAAI2024 + + +
+ Human motion prediction consists of forecasting future body poses from
+historically observed sequences. It is a longstanding challenge due to motion's
+complex dynamics and uncertainty. Existing methods focus on building up
+complicated neural networks to model the motion dynamics, and the predicted
+results are required to be strictly similar to the training samples under an L2
+loss in the current training pipeline. However, little attention has been paid
+to the uncertainty property, which is crucial to the prediction task. We argue
+that the recorded motion in training data could be one observation of many
+possible futures, rather than a predetermined result. In addition, existing
+works weight the prediction error on each future frame equally during training,
+while recent work indicated that different frames could play different roles.
+In this work, a novel computationally efficient encoder-decoder model with
+uncertainty consideration is proposed, which can learn proper characteristics
+for future frames through a dynamic function. Experimental results on benchmark
+datasets demonstrate that our uncertainty-aware approach has clear advantages
+both quantitatively and qualitatively. Moreover, the proposed method produces
+motion sequences of much better quality that avoid the intractable shaking
+artefacts. We believe our work provides a novel perspective on modeling
+uncertainty for the general motion prediction task and will encourage further
+studies in this field. The code will be available at
+https://github.com/Motionpre/Adaptive-Salient-Loss-SAGGB.
+
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ☆ MaskSAM: Towards Auto-prompt SAM with Mask Classification for Medical + Image Segmentation + + +
+ Segment Anything Model (SAM), a prompt-driven foundation model for natural
+image segmentation, has demonstrated impressive zero-shot performance. However,
+SAM does not work when directly applied to medical image segmentation tasks,
+since it lacks the functionality to predict semantic labels for predicted masks
+and requires extra prompts, such as points or boxes, to segment target regions.
+Meanwhile, there is a huge gap between 2D natural images and 3D medical images,
+so the performance of SAM is imperfect for medical image segmentation tasks. To
+address these issues, we propose MaskSAM, a novel mask classification
+prompt-free SAM adaptation framework for medical image segmentation. We design
+a prompt generator combined with the image encoder in SAM to generate a set of
+auxiliary classifier tokens, auxiliary binary masks, and auxiliary bounding
+boxes. Each pair of auxiliary mask and box prompts, which removes the need for
+extra prompts, is associated with class label predictions by summing the
+auxiliary classifier token and the learnable global classifier tokens in the
+mask decoder of SAM, thereby providing semantic label predictions. Meanwhile,
+we design a 3D depth-convolution adapter for image embeddings and a 3D
+depth-MLP adapter for prompt embeddings. We inject one of them into each
+transformer block in the image encoder and mask decoder to enable pre-trained
+2D SAM models to extract 3D information and adapt to 3D medical images. Our
+method achieves state-of-the-art performance on AMOS2022 with 90.52% Dice, a
+2.7% improvement over nnUNet. Our method also surpasses nnUNet by 1.7% on ACDC
+and 1.0% on the Synapse dataset.
+
+
+
+
+
+ + ☆ Text-Enhanced Data-free Approach for Federated Class-Incremental + Learning CVPR 2024 + + +
+ Federated Class-Incremental Learning (FCIL) is an underexplored yet pivotal +issue, involving the dynamic addition of new classes in the context of +federated learning. In this field, Data-Free Knowledge Transfer (DFKT) plays a +crucial role in addressing catastrophic forgetting and data privacy problems. +However, prior approaches lack the crucial synergy between DFKT and the model +training phases, causing DFKT to encounter difficulties in generating +high-quality data from a non-anchored latent space of the old task model. In +this paper, we introduce LANDER (Label Text Centered Data-Free Knowledge +Transfer) to address this issue by utilizing label text embeddings (LTE) +produced by pretrained language models. Specifically, during the model training +phase, our approach treats LTE as anchor points and constrains the feature +embeddings of corresponding training samples around them, enriching the +surrounding area with more meaningful information. In the DFKT phase, by using +these LTE anchors, LANDER can synthesize more meaningful samples, thereby +effectively addressing the forgetting problem. Additionally, instead of tightly +constraining embeddings toward the anchor, the Bounding Loss is introduced to +encourage sample embeddings to remain flexible within a defined radius. This +approach preserves the natural differences in sample embeddings and mitigates +the embedding overlap caused by heterogeneous federated settings. Extensive +experiments conducted on CIFAR100, Tiny-ImageNet, and ImageNet demonstrate that +LANDER significantly outperforms previous methods and achieves state-of-the-art +performance in FCIL. The code is available at +https://github.com/tmtuan1307/lander. + +
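+ A minimal sketch of a radius-bounded anchor loss consistent with the
+Bounding Loss described above (penalize only the distance beyond a fixed
+radius around the label-text embedding); the hinge form and radius value are
+assumptions.
+
+import torch
+
+def bounding_loss(sample_embeddings, lte_anchors, radius=1.0):
+    # sample_embeddings, lte_anchors: (B, D); anchors are the label text embeddings (LTE).
+    dist = (sample_embeddings - lte_anchors).norm(dim=-1)
+    return torch.clamp(dist - radius, min=0.0).mean()   # zero penalty inside the radius
+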
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Science based AI model certification for untrained operational + environments with application in traffic state estimation + + +
+ The expanding role of Artificial Intelligence (AI) in diverse engineering +domains highlights the challenges associated with deploying AI models in new +operational environments, involving substantial investments in data collection +and model training. Rapid application of AI necessitates evaluating the +feasibility of utilizing pre-trained models in unobserved operational settings +with minimal or no additional data. However, interpreting the opaque nature of +AI's black-box models remains a persistent challenge. Addressing this issue, +this paper proposes a science-based certification methodology to assess the +viability of employing pre-trained data-driven models in untrained operational +environments. The methodology advocates a profound integration of domain +knowledge, leveraging theoretical and analytical models from physics and +related disciplines, with data-driven AI models. This novel approach introduces +tools to facilitate the development of secure engineering systems, providing +decision-makers with confidence in the trustworthiness and safety of AI-based +models across diverse environments characterized by limited training data and +dynamic, uncertain conditions. The paper demonstrates the efficacy of this +methodology in real-world safety-critical scenarios, particularly in the +context of traffic state estimation. Through simulation results, the study +illustrates how the proposed methodology efficiently quantifies physical +inconsistencies exhibited by pre-trained AI models. By utilizing analytical +models, the methodology offers a means to gauge the applicability of +pre-trained AI models in new operational environments. This research +contributes to advancing the understanding and deployment of AI models, +offering a robust certification framework that enhances confidence in their +reliability and safety across a spectrum of operational conditions. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ Unsupervised Intrinsic Image Decomposition with LiDAR Intensity Enhanced + Training + + +
+ Unsupervised intrinsic image decomposition (IID) is the process of separating
+a natural image into albedo and shade without these ground truths. A recent
+model employing light detection and ranging (LiDAR) intensity demonstrated
+impressive performance, though the necessity of LiDAR intensity during
+inference restricts its practicality. Thus, IID models that require only a
+single image during inference, while matching the IID quality achieved with an
+image plus LiDAR intensity, are highly desirable. To address this challenge, we
+propose a novel approach that utilizes only an image during inference while
+utilizing an image and LiDAR intensity during training. Specifically, we
+introduce a partially-shared model that accepts an image and LiDAR intensity
+individually through separate modality-specific encoders but processes them
+together in specific components to learn shared representations. In addition,
+to enhance IID quality, we propose an albedo-alignment loss and image-LiDAR
+conversion (ILC) paths. The albedo-alignment loss aligns the gray-scale albedo
+from an image to that inferred from LiDAR intensity, thereby reducing cast
+shadows in the albedo from an image due to the absence of cast shadows in LiDAR
+intensity. Furthermore, to translate the input image into albedo and shade
+style while keeping the image contents, the input image is separated into style
+code and content code by encoders. The ILC path mutually translates the image
+and LiDAR intensity, which share content but differ in style, contributing to
+the distinct differentiation of style from content. Consequently, LIET achieves
+comparable IID quality to the existing model with LiDAR intensity, while
+utilizing only an image without LiDAR intensity during inference.
+
+
+
+
+
+ + ☆ Surface Reconstruction from Point Clouds via Grid-based Intersection + Prediction + + +
+ Surface reconstruction from point clouds is a crucial task in the fields of +computer vision and computer graphics. SDF-based methods excel at +reconstructing smooth meshes with minimal error and artifacts but struggle with +representing open surfaces. On the other hand, UDF-based methods can +effectively represent open surfaces but often introduce noise near the surface, +leading to artifacts in the mesh. In this work, we propose a novel approach +that directly predicts the intersection points between sampled line segments of +point pairs and implicit surfaces. This method not only preserves the ability +to represent open surfaces but also eliminates artifacts in the mesh. Our +approach demonstrates state-of-the-art performance on three datasets: ShapeNet, +MGN, and ScanNet. The code will be made available upon acceptance. + +
+
+
+
+
+ + ☆ EventDance: Unsupervised Source-free Cross-modal Adaptation for + Event-based Object Recognition CVPR2024 + + +
+ In this paper, we make the first attempt at achieving the cross-modal (i.e.,
+image-to-events) adaptation for event-based object recognition without
+accessing any labeled source image data owing to privacy and commercial
+issues. Tackling this novel problem is non-trivial due to the novelty of event
+cameras and the distinct modality gap between images and events. In particular,
+as only the source model is available, a hurdle is how to extract the knowledge
+from the source model by only using the unlabeled target event data while
+achieving knowledge transfer. To this end, we propose a novel framework, dubbed
+EventDance, for this unsupervised source-free cross-modal adaptation problem.
+Importantly, inspired by event-to-video reconstruction methods, we propose a
+reconstruction-based modality bridging (RMB) module, which reconstructs
+intensity frames from events in a self-supervised manner. This makes it
+possible to build up the surrogate images to extract the knowledge (i.e.,
+labels) from the source model. We then propose a multi-representation knowledge
+adaptation (MKA) module that transfers the knowledge to target models learning
+events with multiple representation types for fully exploring the
+spatiotemporal information of events. The two modules connecting the source and
+target models are mutually updated so as to achieve the best performance.
+Experiments on three benchmark datasets with two adaptation settings show that
+EventDance is on par with prior methods utilizing the source data.
+
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ QSMDiff: Unsupervised 3D Diffusion Models for Quantitative + Susceptibility Mapping + + +
+ Quantitative Susceptibility Mapping (QSM) dipole inversion is an ill-posed +inverse problem for quantifying magnetic susceptibility distributions from MRI +tissue phases. While supervised deep learning methods have shown success in +specific QSM tasks, their generalizability across different acquisition +scenarios remains constrained. Recent developments in diffusion models have +demonstrated potential for solving 2D medical imaging inverse problems. +However, their application to 3D modalities, such as QSM, remains challenging +due to high computational demands. In this work, we developed a 3D image +patch-based diffusion model, namely QSMDiff, for robust QSM reconstruction +across different scan parameters, alongside simultaneous super-resolution and +image-denoising tasks. QSMDiff adopts unsupervised 3D image patch training and +full-size measurement guidance during inference for controlled image +generation. Evaluation on simulated and in-vivo human brains, using +gradient-echo and echo-planar imaging sequences across different acquisition +parameters, demonstrates superior performance. The method proposed in QSMDiff +also holds promise for impacting other 3D medical imaging applications beyond +QSM. + +
+
+
+
+
+ + ☆ LeFusion: Synthesizing Myocardial Pathology on Cardiac MRI via + Lesion-Focus Diffusion Models + + +
+ Data generated in clinical practice often exhibits biases, such as long-tail
+imbalance and algorithmic unfairness. This study aims to mitigate these
+challenges through data synthesis. Previous efforts in medical imaging
+synthesis have struggled with separating lesion information from background
+context, leading to difficulties in generating high-quality backgrounds and
+limited control over the synthetic output. Inspired by diffusion-based image
+inpainting, we propose LeFusion, lesion-focused diffusion models. By
+redesigning the diffusion learning objectives to concentrate on lesion areas,
+LeFusion simplifies the model learning process and enhances the controllability
+of the synthetic output, while preserving the background by integrating
+forward-diffused background contexts into the reverse diffusion process.
+Furthermore, we generalize it to jointly handle multi-class lesions, and
+further introduce a generative model for lesion masks to increase synthesis
+diversity. Validated on the DE-MRI cardiac lesion segmentation dataset
+(Emidec), our methodology employs the popular nnUNet to demonstrate that the
+synthetic data make it possible to effectively enhance a state-of-the-art
+model. Code and model are available at https://github.com/M3DV/LeFusion.
+
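+ A minimal sketch of restricting a diffusion noise-prediction loss to the
+lesion region, as described above; the epsilon-prediction parameterization and
+binary masking are generic inpainting-style assumptions, and the compositing of
+forward-diffused background context is omitted.
+
+import torch
+import torch.nn.functional as F
+
+def lesion_focused_loss(eps_pred, eps_true, lesion_mask):
+    # eps_pred, eps_true: (B, C, H, W); lesion_mask: (B, 1, H, W) binary lesion region.
+    per_pixel = F.mse_loss(eps_pred, eps_true, reduction='none')
+    mask = lesion_mask.float()
+    return (per_pixel * mask).sum() / mask.sum().clamp_min(1.0)
+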
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Semantics from Space: Satellite-Guided Thermal Semantic Segmentation + Annotation for Aerial Field Robots + + +
+ We present a new method to automatically generate semantic segmentation +annotations for thermal imagery captured from an aerial vehicle by utilizing +satellite-derived data products alongside onboard global positioning and +attitude estimates. This new capability overcomes the challenge of developing +thermal semantic perception algorithms for field robots due to the lack of +annotated thermal field datasets and the time and costs of manual annotation, +enabling precise and rapid annotation of thermal data from field collection +efforts at a massively-parallelizable scale. By incorporating a +thermal-conditioned refinement step with visual foundation models, our approach +can produce highly-precise semantic segmentation labels using low-resolution +satellite land cover data for little-to-no cost. It achieves 98.5% of the +performance from using costly high-resolution options and demonstrates between +70-160% improvement over popular zero-shot semantic segmentation methods based +on large vision-language models currently used for generating annotations for +RGB imagery. Code will be available at: +https://github.com/connorlee77/aerial-auto-segment. + +
+
+
+
+
+ + ☆ Leveraging Thermal Modality to Enhance Reconstruction in Low-Light + Conditions + + +
+ Neural Radiance Fields (NeRF) accomplishes photo-realistic novel view +synthesis by learning the implicit volumetric representation of a scene from +multi-view images, which faithfully convey the colorimetric information. +However, sensor noises will contaminate low-value pixel signals, and the lossy +camera image signal processor will further remove near-zero intensities in +extremely dark situations, deteriorating the synthesis performance. Existing +approaches reconstruct low-light scenes from raw images but struggle to recover +texture and boundary details in dark regions. Additionally, they are unsuitable +for high-speed models relying on explicit representations. To address these +issues, we present Thermal-NeRF, which takes thermal and visible raw images as +inputs, considering the thermal camera is robust to the illumination variation +and raw images preserve any possible clues in the dark, to accomplish visible +and thermal view synthesis simultaneously. Also, the first multi-view thermal +and visible dataset (MVTV) is established to support the research on multimodal +NeRF. Thermal-NeRF achieves the best trade-off between detail preservation and +noise smoothing and provides better synthesis performance than previous work. +Finally, we demonstrate that both modalities are beneficial to each other in 3D +reconstruction. + +
+
+ comment: 25 pages, 13 figures +
+
+
+
+
+ + ☆ Accelerating ViT Inference on FPGA through Static and Dynamic Pruning + + +
+ Vision Transformers (ViTs) have achieved state-of-the-art accuracy on various
+computer vision tasks. However, their high computational complexity prevents
+them from being applied to many real-world applications. Weight and token
+pruning are two well-known methods for reducing complexity: weight pruning
+reduces the model size and associated computational demands, while token
+pruning further dynamically reduces the computation based on the input.
+Combining these two techniques should significantly reduce computation
+complexity and model size; however, naively integrating them results in
+irregular computation patterns, leading to significant accuracy drops and
+difficulties in hardware acceleration.
+ Addressing the above challenges, we propose a comprehensive
+algorithm-hardware codesign for accelerating ViT on FPGA through simultaneous
+pruning, combining static weight pruning and dynamic token pruning. For
+algorithm design, we systematically combine a hardware-aware structured
+block-pruning method for pruning model parameters and a dynamic token pruning
+method for removing unimportant token vectors. Moreover, we design a novel
+training algorithm to recover the model's accuracy. For hardware design, we
+develop a novel hardware accelerator for executing the pruned model. The
+proposed hardware design employs multi-level parallelism with a load-balancing
+strategy to efficiently handle the irregular computation pattern introduced by
+the two pruning approaches. Moreover, we develop an efficient hardware
+mechanism for executing on-the-fly token pruning.
+
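+ A small software-level sketch of the dynamic token pruning half of the
+codesign described above: keep the top-scoring tokens per image and always
+retain the class token. The scoring rule and keep ratio are illustrative
+assumptions and say nothing about the FPGA accelerator itself.
+
+import torch
+
+def prune_tokens(tokens, scores, keep_ratio=0.5):
+    # tokens: (B, N, D); scores: (B, N) per-token importance; index 0 is the CLS token.
+    b, n, d = tokens.shape
+    k = max(1, int((n - 1) * keep_ratio))
+    keep = scores[:, 1:].topk(k, dim=1).indices + 1        # rank only non-CLS tokens
+    cls = torch.zeros(b, 1, dtype=torch.long, device=tokens.device)
+    idx = torch.cat([cls, keep], dim=1)
+    return tokens.gather(1, idx.unsqueeze(-1).expand(-1, -1, d))
+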
+
+ comment: FCCM 2024 +
+
+
+
+
+ + ☆ DSGG: Dense Relation Transformer for an End-to-end Scene Graph + Generation CVPR 2024 + + +
+ Scene graph generation aims to capture detailed spatial and semantic
+relationships between objects in an image, which is challenging due to
+incomplete labelling, long-tailed relationship categories, and relational
+semantic overlap. Existing Transformer-based methods either employ distinct
+queries for objects and predicates or utilize holistic queries for relation
+triplets and hence often suffer from limited capacity in learning low-frequency
+relationships. In this paper, we present a new Transformer-based method, called
+DSGG, that views scene graph detection as a direct graph prediction problem
+based on a unique set of graph-aware queries. In particular, each graph-aware
+query encodes a compact representation of both the node and all of its
+relations in the graph, acquired through the utilization of a relaxed sub-graph
+matching during the training process. Moreover, to address the problem of
+relational semantic overlap, we utilize a strategy for relation distillation,
+aiming to efficiently learn multiple instances of semantic relationships.
+Extensive experiments on the VG and the PSG datasets show that our model
+achieves state-of-the-art results, showing a significant improvement of 3.5%
+and 6.7% in mR@50 and mR@100 for the scene-graph generation task and achieves
+an even more substantial improvement of 8.5% and 10.3% in mR@50 and mR@100
+for the panoptic scene graph generation task. Code is available at
+https://github.com/zeeshanhayder/DSGG.
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ WeatherProof: Leveraging Language Guidance for Semantic Segmentation in + Adverse Weather + + +
+ We propose a method to infer semantic segmentation maps from images captured +under adverse weather conditions. We begin by examining existing models on +images degraded by weather conditions such as rain, fog, or snow, and found +that they exhibit a large performance drop as compared to those captured under +clear weather. To control for changes in scene structures, we propose +WeatherProof, the first semantic segmentation dataset with accurate clear and +adverse weather image pairs that share an underlying scene. Through this +dataset, we analyze the error modes in existing models and found that they were +sensitive to the highly complex combination of different weather effects +induced on the image during capture. To improve robustness, we propose a way to +use language as guidance by identifying contributions of adverse weather +conditions and injecting that as "side information". Models trained using our +language guidance exhibit performance gains by up to 10.2% in mIoU on +WeatherProof, up to 8.44% in mIoU on the widely used ACDC dataset compared to +standard training techniques, and up to 6.21% in mIoU on the ACDC dataset as +compared to previous SOTA methods. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2312.09534 +
+
+
+
+
+ + ☆ VidLA: Video-Language Alignment at Scale CVPR 2024 + + +
+ In this paper, we propose VidLA, an approach for video-language alignment at +scale. There are two major limitations of previous video-language alignment +approaches. First, they do not capture both short-range and long-range temporal +dependencies and typically employ complex hierarchical deep network +architectures that are hard to integrate with existing pretrained image-text +foundation models. To effectively address this limitation, we instead keep the +network architecture simple and use a set of data tokens that operate at +different temporal resolutions in a hierarchical manner, accounting for the +temporally hierarchical nature of videos. By employing a simple two-tower +architecture, we are able to initialize our video-language model with +pretrained image-text foundation models, thereby boosting the final +performance. Second, existing video-language alignment works struggle due to +the lack of semantically aligned large-scale training data. To overcome it, we +leverage recent LLMs to curate the largest video-language dataset to date with +better visual grounding. Furthermore, unlike existing video-text datasets which +only contain short clips, our dataset is enriched with video clips of varying +durations to aid our temporally hierarchical data tokens in extracting better +representations at varying temporal scales. Overall, empirical results show +that our proposed approach surpasses state-of-the-art methods on multiple +retrieval benchmarks, especially on longer videos, and performs competitively +on classification benchmarks. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Distribution-informed and wavelength-flexible data-driven photoacoustic + oximetry + + +
+ Significance: Photoacoustic imaging (PAI) promises to measure +spatially-resolved blood oxygen saturation, but suffers from a lack of accurate +and robust spectral unmixing methods to deliver on this promise. Accurate blood +oxygenation estimation could have important clinical applications, from cancer +detection to quantifying inflammation. + Aim: This study addresses the inflexibility of existing data-driven methods +for estimating blood oxygenation in PAI by introducing a recurrent neural +network architecture. + Approach: We created 25 simulated training dataset variations to assess +neural network performance. We used a long short-term memory network to +implement a wavelength-flexible network architecture and proposed the +Jensen-Shannon divergence to predict the most suitable training dataset. + Results: The network architecture can handle arbitrary input wavelengths and +outperforms linear unmixing and the previously proposed learned spectral +decolouring method. Small changes in the training data significantly affect the +accuracy of our method, but we find that the Jensen-Shannon divergence +correlates with the estimation error and is thus suitable for predicting the +most appropriate training datasets for any given application. + Conclusions: A flexible data-driven network architecture combined with the +Jensen-Shannon Divergence to predict the best training data set provides a +promising direction that might enable robust data-driven photoacoustic oximetry +for clinical use cases. + +
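+ A rough sketch of comparing a candidate training set with target data via the
+Jensen-Shannon divergence, in the spirit described above; the histogram binning
+over raw spectral values is an assumption (SciPy returns the JS distance, the
+square root of the divergence).
+
+import numpy as np
+from scipy.spatial.distance import jensenshannon
+
+def js_divergence(train_values, target_values, bins=64):
+    # train_values, target_values: 1D arrays of spectral measurements.
+    lo = min(train_values.min(), target_values.min())
+    hi = max(train_values.max(), target_values.max())
+    p, _ = np.histogram(train_values, bins=bins, range=(lo, hi), density=True)
+    q, _ = np.histogram(target_values, bins=bins, range=(lo, hi), density=True)
+    return jensenshannon(p + 1e-12, q + 1e-12) ** 2        # square the JS distance
+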
+
+ comment: 37 pages, 7 figures +
+
+
+
+
+ + ☆ KeyPoint Relative Position Encoding for Face Recognition CVPR2024 + + +
+ In this paper, we address the challenge of making ViT models more robust to
+unseen affine transformations. Such robustness becomes useful in various
+recognition tasks such as face recognition when image alignment failures occur.
+We propose a novel method called KP-RPE, which leverages key points
+(e.g., facial landmarks) to make ViT more resilient to scale, translation, and
+pose variations. We begin with the observation that Relative Position Encoding
+(RPE) is a good way to bring affine transform generalization to ViTs. RPE,
+however, can only inject the model with prior knowledge that nearby pixels are
+more important than far pixels. Keypoint RPE (KP-RPE) is an extension of this
+principle, where the significance of pixels is not solely dictated by their
+proximity but also by their relative positions to specific keypoints within the
+image. By anchoring the significance of pixels around keypoints, the model can
+more effectively retain spatial relationships, even when those relationships
+are disrupted by affine transformations. We show the merit of KP-RPE in face
+and gait recognition. The experimental results demonstrate its effectiveness in
+improving face recognition performance from low-quality images, particularly
+where alignment is prone to failure. Code and pre-trained models are available.
+
+
+ comment: To appear in CVPR2024 +
+
+
+
+
+ + ☆ Hyperspectral Neural Radiance Fields + + +
+ Hyperspectral Imagery (HSI) has been used in many applications to +non-destructively determine the material and/or chemical compositions of +samples. There is growing interest in creating 3D hyperspectral +reconstructions, which could provide both spatial and spectral information +while also mitigating common HSI challenges such as non-Lambertian surfaces and +translucent objects. However, traditional 3D reconstruction with HSI is +difficult due to technological limitations of hyperspectral cameras. In recent +years, Neural Radiance Fields (NeRFs) have seen widespread success in creating +high quality volumetric 3D representations of scenes captured by a variety of +camera models. Leveraging recent advances in NeRFs, we propose computing a +hyperspectral 3D reconstruction in which every point in space and view +direction is characterized by wavelength-dependent radiance and transmittance +spectra. To evaluate our approach, a dataset containing nearly 2000 +hyperspectral images across 8 scenes and 2 cameras was collected. We perform +comparisons against traditional RGB NeRF baselines and apply ablation testing +with alternative spectra representations. Finally, we demonstrate the potential +of hyperspectral NeRFs for hyperspectral super-resolution and imaging sensor +simulation. We show that our hyperspectral NeRF approach enables creating fast, +accurate volumetric 3D hyperspectral scenes and enables several new +applications and areas for future study. + +
+
+ comment: Main paper: 15 pages + 2 pages references. Supplemental/Appendix: 6 + pages +
+
+
+
+
+ + ☆ Osmosis: RGBD Diffusion Prior for Underwater Image Restoration + + +
+ Underwater image restoration is a challenging task because of strong water +effects that increase dramatically with distance. This is worsened by lack of +ground truth data of clean scenes without water. Diffusion priors have emerged +as strong image restoration priors. However, they are often trained with a +dataset of the desired restored output, which is not available in our case. To +overcome this critical issue, we show how to leverage in-air images to train +diffusion priors for underwater restoration. We also observe that only color +data is insufficient, and augment the prior with a depth channel. We train an +unconditional diffusion model prior on the joint space of color and depth, +using standard RGBD datasets of natural outdoor scenes in air. Using this prior +together with a novel guidance method based on the underwater image formation +model, we generate posterior samples of clean images, removing the water +effects. Even though our prior did not see any underwater images during +training, our method outperforms state-of-the-art baselines for image +restoration on very challenging scenes. Data, models and code are published in +the project page. + +
+
+
+
+
+ + ☆ Evaluating Panoramic 3D Estimation in Indoor Lighting Analysis + + +
+ This paper presents the use of panoramic 3D estimation in lighting +simulation. Conventional lighting simulation necessitates detailed modeling as +input, resulting in significant labor effort and time cost. The 3D layout +estimation method directly takes a single panorama as input and generates a +lighting simulation model with room geometry and window aperture. We evaluate +the simulation results by comparing the luminance errors between on-site High +Dynamic Range (HDR) photographs, 3D estimation model, and detailed model in +panoramic representation and fisheye perspective. Given the selected scene, the +results demonstrate the estimated room layout is reliable for lighting +simulation. + +
+
+ comment: Annual Modeling and Simulation Conference (ANNSIM), May 20-23, 2024, + Washington D.C., USA +
+
+
+
+
+ + ☆ Multimodal-Conditioned Latent Diffusion Models for Fashion Image Editing + + +
+ Fashion illustration is a crucial medium for designers to convey their +creative vision and transform design concepts into tangible representations +that showcase the interplay between clothing and the human body. In the context +of fashion design, computer vision techniques have the potential to enhance and +streamline the design process. Departing from prior research primarily focused +on virtual try-on, this paper tackles the task of multimodal-conditioned +fashion image editing. Our approach aims to generate human-centric fashion +images guided by multimodal prompts, including text, human body poses, garment +sketches, and fabric textures. To address this problem, we propose extending +latent diffusion models to incorporate these multiple modalities and modifying +the structure of the denoising network, taking multimodal prompts as input. To +condition the proposed architecture on fabric textures, we employ textual +inversion techniques and let diverse cross-attention layers of the denoising +network attend to textual and texture information, thus incorporating different +granularity conditioning details. Given the lack of datasets for the task, we +extend two existing fashion datasets, Dress Code and VITON-HD, with multimodal +annotations. Experimental evaluations demonstrate the effectiveness of our +proposed approach in terms of realism and coherence concerning the provided +multimodal inputs. + +
+
+
+
+
+ + ☆ Learning Gaussian Representation for Eye Fixation Prediction + + +
+ Existing eye fixation prediction methods perform the mapping from input +images to the corresponding dense fixation maps generated from raw fixation +points. However, due to the stochastic nature of human fixation, the generated +dense fixation maps may be a less-than-ideal representation of human fixation. +To provide a robust fixation model, we introduce Gaussian Representation for +eye fixation modeling. Specifically, we propose to model the eye fixation map +as a mixture of probability distributions, namely a Gaussian Mixture Model. In +this new representation, we use several Gaussian distribution components as an +alternative to the provided fixation map, which makes the model more robust to +the randomness of fixation. Meanwhile, we design our framework upon some +lightweight backbones to achieve real-time fixation prediction. Experimental +results on three public fixation prediction datasets (SALICON, MIT1003, +TORONTO) demonstrate that our method is fast and effective. + +
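+ A minimal sketch of representing fixations with a Gaussian Mixture Model
+rather than a blurred dense map, as described above; the number of components,
+image size, and random fixation points are placeholders.
+
+import numpy as np
+from sklearn.mixture import GaussianMixture
+
+fixations = np.random.rand(200, 2) * [640, 480]            # placeholder (x, y) fixation points
+gmm = GaussianMixture(n_components=5, covariance_type='full').fit(fixations)
+
+# Evaluate the mixture density over the image grid to obtain a fixation map.
+xs, ys = np.meshgrid(np.arange(640), np.arange(480))
+grid = np.stack([xs.ravel(), ys.ravel()], axis=1)
+fixation_map = np.exp(gmm.score_samples(grid)).reshape(480, 640)
+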
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ Deep Active Learning: A Reality Check + + +
+ We conduct a comprehensive evaluation of state-of-the-art deep active +learning methods. Surprisingly, under general settings, no single-model method +decisively outperforms entropy-based active learning, and some even fall short +of random sampling. We delve into overlooked aspects like starting budget, +budget step, and pretraining's impact, revealing their significance in +achieving superior results. Additionally, we extend our evaluation to other +tasks, exploring the active learning effectiveness in combination with +semi-supervised learning, and object detection. Our experiments provide +valuable insights and concrete recommendations for future active learning +studies. By uncovering the limitations of current methods and understanding the +impact of different experimental settings, we aim to inspire more efficient +training of deep learning models in real-world scenarios with limited +annotation budgets. This work contributes to advancing active learning's +efficacy in deep learning and empowers researchers to make informed decisions +when applying active learning to their tasks. + +
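+ For reference, the entropy-based baseline mentioned above amounts to ranking
+unlabeled samples by predictive entropy and labeling the most uncertain ones;
+the function name and budget handling below are illustrative.
+
+import numpy as np
+
+def select_by_entropy(probs, budget):
+    # probs: (num_unlabeled, num_classes) softmax outputs of the current model.
+    entropy = -(probs * np.log(probs + 1e-12)).sum(axis=1)
+    return np.argsort(-entropy)[:budget]                   # indices to send for labeling
+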
+
+
+
+
+ + ☆ Preventing Catastrophic Forgetting through Memory Networks in Continuous + Detection + + +
+ Modern pre-trained architectures struggle to retain previous information
+while undergoing continuous fine-tuning on new tasks. Despite notable progress
+in continual classification, systems designed for complex vision tasks such as
+detection or segmentation still struggle to attain satisfactory performance. In
+this work, we introduce a memory-based detection transformer architecture to
+adapt a pre-trained DETR-style detector to new tasks while preserving knowledge
+from previous tasks. We propose a novel localized query function for efficient
+information retrieval from memory units, aiming to minimize forgetting.
+Furthermore, we identify a fundamental challenge in continual detection
+referred to as background relegation. This arises when object categories from
+earlier tasks reappear in future tasks, potentially without labels, leading
+them to be implicitly treated as background. This is an inevitable issue in
+continual detection or segmentation. The introduced continual optimization
+technique effectively tackles this challenge. Finally, we assess the
+performance of our proposed system on continual detection benchmarks and
+demonstrate that our approach surpasses existing state-of-the-art methods,
+resulting in 5-7% improvements on MS-COCO and PASCAL-VOC for continual
+detection.
+
+
+
+
+
+ + ☆ Latent Diffusion Models for Attribute-Preserving Image Anonymization + + +
+ Generative techniques for image anonymization have great potential to
+generate datasets that protect the privacy of those depicted in the images,
+while achieving high data fidelity and utility. Existing methods have focused
+extensively on preserving facial attributes, but failed to embrace a more
+comprehensive perspective that brings the scene and background into the
+anonymization process. This paper presents, to the best of our knowledge, the
+first approach to image anonymization based on Latent Diffusion Models (LDMs).
+Every element of a scene is maintained to convey the same meaning, yet
+manipulated in a way that makes re-identification difficult. We propose two
+LDMs for this purpose: CAMOUFLaGE-Base exploits a combination of pre-trained
+ControlNets, and a new controlling mechanism designed to increase the distance
+between the real and anonymized images. CAMOUFLaGE-Light is based on the
+Adapter technique, coupled with an encoding designed to efficiently represent
+the attributes of different persons in a scene. The former solution achieves
+superior performance on most metrics and benchmarks, while the latter cuts the
+inference time in half at the cost of fine-tuning a lightweight module. We show
+through extensive experimental comparison that the proposed method is
+competitive with the state-of-the-art concerning identity obfuscation whilst
+better preserving the original content of the image and tackling unresolved
+challenges that current solutions fail to address.
+
+
+
+
+
+ + ☆ On the exploitation of DCT statistics for cropping detectors + + +
+ The study of frequency components derived from the Discrete Cosine Transform
+(DCT) has been widely used in image analysis. In recent years it has been
+observed that significant information about the lifecycle of an image can be
+extrapolated from them, but no study has focused on the relationship between
+them and the source resolution of the image. In this work, we investigated a
+novel image resolution classifier that employs DCT statistics with the goal of
+detecting the original resolution of images; in particular, this insight was
+exploited to address the challenge of identifying cropped images. By training a
+Machine Learning (ML) classifier on entire (uncropped) images, the generated
+model can leverage this information to detect cropping. The results demonstrate
+the classifier's reliability in distinguishing between cropped and uncropped
+images, providing a dependable estimation of their original resolution. This
+advancement has significant implications for image processing applications,
+including digital security, authenticity verification, and visual quality
+analysis, by offering a new tool for detecting image manipulations and
+enhancing qualitative image assessment. This work opens new perspectives in the
+field, with potential to transform image analysis and usage across multiple
+domains.
+
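+ An illustrative sketch of extracting block-wise DCT statistics that a
+classifier of the kind described above could consume; the 8x8 blocking and the
+mean/std summary are assumptions, not the paper's exact feature set.
+
+import numpy as np
+from scipy.fftpack import dct
+
+def block_dct_stats(gray, block=8):
+    # gray: 2D float array; returns mean and std of each DCT coefficient over all blocks.
+    h = gray.shape[0] // block * block
+    w = gray.shape[1] // block * block
+    blocks = gray[:h, :w].reshape(h // block, block, w // block, block)
+    blocks = blocks.transpose(0, 2, 1, 3).reshape(-1, block, block)
+    coeffs = dct(dct(blocks, axis=1, norm='ortho'), axis=2, norm='ortho')
+    coeffs = coeffs.reshape(-1, block * block)
+    return np.concatenate([coeffs.mean(axis=0), coeffs.std(axis=0)])
+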
+
+ comment: 8 pages, 3 figures, conference +
+
+
+
+
+ + ☆ Multi-Agent VQA: Exploring Multi-Agent Foundation Models in Zero-Shot + Visual Question Answering + + +
+ This work explores the zero-shot capabilities of foundation models in Visual +Question Answering (VQA) tasks. We propose an adaptive multi-agent system, +named Multi-Agent VQA, to overcome the limitations of foundation models in +object detection and counting by using specialized agents as tools. Unlike +existing approaches, our study focuses on the system's performance without +fine-tuning it on specific VQA datasets, making it more practical and robust in +the open world. We present preliminary experimental results under zero-shot +scenarios and highlight some failure cases, offering new directions for future +research. + +
+
+ comment: A full version of the paper will be released soon. The codes are + available at https://github.com/bowen-upenn/Multi-Agent-VQA +
+
+
+
+
+ + ☆ Champ: Controllable and Consistent Human Image Animation with 3D + Parametric Guidance + + +
+ In this study, we introduce a methodology for human image animation by
+leveraging a 3D human parametric model within a latent diffusion framework to
+enhance shape alignment and motion guidance in current human generative
+techniques. The methodology utilizes the SMPL (Skinned Multi-Person Linear)
+model as the 3D human parametric model to establish a unified representation of
+body shape and pose. This facilitates the accurate capture of intricate human
+geometry and motion characteristics from source videos. Specifically, we
+incorporate rendered depth images, normal maps, and semantic maps obtained from
+SMPL sequences, alongside skeleton-based motion guidance, to enrich the
+conditioning of the latent diffusion model with comprehensive 3D shape and
+detailed pose attributes. A multi-layer motion fusion module, integrating
+self-attention mechanisms, is employed to fuse the shape and motion latent
+representations in the spatial domain. By representing the 3D human parametric
+model as the motion guidance, we can perform parametric shape alignment of the
+human body between the reference image and the source video motion.
+Experimental evaluations conducted on benchmark datasets demonstrate the
+methodology's superior ability to generate high-quality human animations that
+accurately capture both pose and shape variations. Furthermore, our approach
+also exhibits superior generalization capabilities on the proposed wild
+dataset. Project page: https://fudan-generative-vision.github.io/champ.
+
+
+
+
+
+ + ☆ Diffusion Attack: Leveraging Stable Diffusion for Naturalistic Image + Attacking + + +
+ In Virtual Reality (VR), adversarial attack remains a significant security +threat. Most deep learning-based methods for physical and digital adversarial +attacks focus on enhancing attack performance by crafting adversarial examples +that contain large printable distortions that are easy for human observers to +identify. However, attackers rarely impose limitations on the naturalness and +comfort of the appearance of the generated attack image, resulting in a +noticeable and unnatural attack. To address this challenge, we propose a +framework to incorporate style transfer to craft adversarial inputs of natural +styles that exhibit minimal detectability and maximum natural appearance, while +maintaining superior attack capabilities. + +
+
+ comment: Accepted to IEEE VRW +
+
+
+
+
+ + ☆ Few-Shot Adversarial Prompt Learning on Vision-Language Models + + +
+ The vulnerability of deep neural networks to imperceptible adversarial
+perturbations has attracted widespread attention. Inspired by the success of
+vision-language foundation models, previous efforts achieved zero-shot
+adversarial robustness by aligning adversarial visual features with text
+supervision. However, in practice, they are still unsatisfactory due to several
+issues, including heavy adaptation cost, suboptimal text supervision, and
+uncontrolled natural generalization capacity. In this paper, to address these
+issues, we propose a few-shot adversarial prompt framework where adapting input
+sequences with limited data yields significant adversarial robustness
+improvements. Specifically, we achieve this by providing adversarially
+correlated text supervision that is end-to-end learned from adversarial
+examples. We also propose a novel training objective that enhances the
+consistency of multi-modal features while encouraging differentiated uni-modal
+features between natural and adversarial examples. The proposed framework makes
+it possible to learn adversarial text supervision, which provides superior
+cross-modal adversarial alignment and matches state-of-the-art zero-shot
+adversarial robustness with only 1% training data.
+
+
+ comment: 25 pages, 13 tables, 8 figures +
+
+
+
+
+ + ☆ StreamingT2V: Consistent, Dynamic, and Extendable Long Video Generation + from Text + + +
+ Text-to-video diffusion models enable the generation of high-quality videos
+that follow text instructions, making it easy to create diverse and individual
+content. However, existing approaches mostly focus on high-quality short video
+generation (typically 16 or 24 frames), ending up with hard cuts when naively
+extended to the case of long video synthesis. To overcome these limitations, we
+introduce StreamingT2V, an autoregressive approach for long video generation of
+80, 240, 600, 1200 or more frames with smooth transitions. The key components
+are: (i) a short-term memory block called the conditional attention module
+(CAM), which conditions the current generation on the features extracted from
+the previous chunk via an attentional mechanism, leading to consistent chunk
+transitions, (ii) a long-term memory block called the appearance preservation
+module, which extracts high-level scene and object features from the first
+video chunk to prevent the model from forgetting the initial scene, and (iii) a
+randomized blending approach that enables applying a video enhancer
+autoregressively for infinitely long videos without inconsistencies between
+chunks. Experiments show that StreamingT2V generates videos with a high amount
+of motion. In contrast, all competing image-to-video methods are prone to video
+stagnation when applied naively in an autoregressive manner. Thus, with
+StreamingT2V we propose a high-quality, seamless text-to-long-video generator
+that outperforms competitors in consistency and motion. Our code will be
+available at: https://github.com/Picsart-AI-Research/StreamingT2V
+
+
+ comment: https://github.com/Picsart-AI-Research/StreamingT2V +
+
+
+
+
+ + ☆ Improving Robustness to Model Inversion Attacks via Sparse Coding + Architectures + + +
+ Recent model inversion attack algorithms permit adversaries to reconstruct a +neural network's private training data just by repeatedly querying the network +and inspecting its outputs. In this work, we develop a novel network +architecture that leverages sparse-coding layers to obtain superior robustness +to this class of attacks. Three decades of computer science research has +studied sparse coding in the context of image denoising, object recognition, +and adversarial misclassification settings, but to the best of our knowledge, +its connection to state-of-the-art privacy vulnerabilities remains unstudied. +However, sparse coding architectures suggest an advantageous means to defend +against model inversion attacks because they allow us to control the amount of +irrelevant private information encoded in a network's intermediate +representations in a manner that can be computed efficiently during training +and that is known to have little effect on classification accuracy. +Specifically, compared to networks trained with a variety of state-of-the-art +defenses, our sparse-coding architectures maintain comparable or higher +classification accuracy while degrading state-of-the-art training data +reconstructions by factors of 1.1 to 18.3 across a variety of reconstruction +quality metrics (PSNR, SSIM, FID). This performance advantage holds across 5 +datasets ranging from CelebA faces to medical images and CIFAR-10, and across +various state-of-the-art SGD-based and GAN-based inversion attacks, including +Plug-&-Play attacks. We provide a cluster-ready PyTorch codebase to promote +research and standardize defense evaluations. + +
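+ A minimal sketch of a sparse-coding layer via iterative soft thresholding
+(ISTA), the generic mechanism behind the sparse-coding architectures described
+above; the dictionary shape, objective weighting, and iteration count are
+illustrative assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def ista_sparse_codes(x, dictionary, lam=0.1, n_iter=10):
+    # x: (B, D) inputs; dictionary: (K, D) atoms. Minimizes 0.5*||x - z D||^2 + lam*||z||_1.
+    lipschitz = torch.linalg.matrix_norm(dictionary @ dictionary.t(), ord=2)
+    step = 1.0 / lipschitz
+    z = torch.zeros(x.size(0), dictionary.size(0), device=x.device)
+    for _ in range(n_iter):
+        grad = (z @ dictionary - x) @ dictionary.t()
+        z = F.softshrink(z - step * grad, lambd=float(lam * step))
+    return z   # sparse codes passed on to subsequent layers
+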
+
+ comment: 32 pages, 15 Tables, and 9 Figures +
+
+
+
+
+ + ☆ Can 3D Vision-Language Models Truly Understand Natural Language? + + +
+ Rapid advancements in 3D vision-language (3D-VL) tasks have opened up new +avenues for human interaction with embodied agents or robots using natural +language. Despite this progress, we find a notable limitation: existing 3D-VL +models exhibit sensitivity to the styles of language input, struggling to +understand sentences with the same semantic meaning but written in different +variants. This observation raises a critical question: Can 3D vision-language +models truly understand natural language? To test the language +understandability of 3D-VL models, we first propose a language robustness task +for systematically assessing 3D-VL models across various tasks, benchmarking +their performance when presented with different language style variants. +Importantly, these variants are commonly encountered in applications requiring +direct interaction with humans, such as embodied robotics, given the diversity +and unpredictability of human language. We propose a 3D Language Robustness +Dataset, designed based on the characteristics of human language, to facilitate +the systematic study of robustness. Our comprehensive evaluation uncovers a +significant drop in the performance of all existing models across various 3D-VL +tasks. Even the state-of-the-art 3D-LLM fails to understand some variants of +the same sentences. Further in-depth analysis suggests that the existing models +have a fragile and biased fusion module, which stems from the low diversity of +the existing dataset. Finally, we propose a training-free module driven by LLM, +which improves language robustness. Datasets and code will be available at +github. + +
+
+ comment: https://github.com/VincentDENGP/3D-LR +
+
+
+
+
+ + ☆ VURF: A General-purpose Reasoning and Self-refinement Framework for + Video Understanding + + +
+ Recent studies have demonstrated the effectiveness of Large Language Models +(LLMs) as reasoning modules that can deconstruct complex tasks into more +manageable sub-tasks, particularly when applied to visual reasoning tasks for +images. In contrast, this paper introduces a Video Understanding and Reasoning +Framework (VURF) based on the reasoning power of LLMs. Ours is a novel approach +to extend the utility of LLMs in the context of video tasks, leveraging their +capacity to generalize from minimal input and output demonstrations within a +contextual framework. By presenting LLMs with pairs of instructions and their +corresponding high-level programs, we harness their contextual learning +capabilities to generate executable visual programs for video understanding. To +enhance the programs' accuracy and robustness, we implement two important +strategies. Firstly, we employ a feedback-generation approach, powered by +GPT-3.5, to rectify errors in programs utilizing unsupported functions. +Secondly, taking motivation from recent works on self-refinement of LLM +outputs, we introduce an iterative procedure for improving the quality of the +in-context examples by aligning the initial outputs to the outputs that would +have been generated had the LLM not been bound by the structure of the +in-context examples. Our results on several video-specific tasks, including +visual QA, video anticipation, pose estimation and multi-video QA, illustrate +the efficacy of these enhancements in improving the performance of visual +programming approaches for video tasks. Our code and data will be publicly +released.
+
+
+
+
+ + ☆ Auto-Train-Once: Controller Network Guided Automatic Network Pruning + from Scratch + + +
+ Current techniques for deep neural network (DNN) pruning often involve +intricate multi-step processes that require domain-specific expertise, making +their widespread adoption challenging. To address this limitation, Only-Train-Once (OTO) +and OTOv2 were proposed to eliminate the need for +additional fine-tuning steps by directly training and compressing a general DNN +from scratch. Nevertheless, the static design of the optimizers (in OTO) can cause +convergence to poor local optima. In this paper, we propose +Auto-Train-Once (ATO), an innovative network pruning algorithm designed to +automatically reduce the computational and storage costs of DNNs. During the +model training phase, our approach not only trains the target model but also +leverages a controller network as an architecture generator to guide the +learning of target model weights. Furthermore, we developed a novel stochastic +gradient algorithm that enhances the coordination between model training and +controller network training, thereby improving pruning performance. We provide +a comprehensive convergence analysis as well as extensive experiments, and the +results show that our approach achieves state-of-the-art performance across +various model architectures (including ResNet18, ResNet34, ResNet50, ResNet56, +and MobileNetv2) on standard benchmark datasets (CIFAR-10, CIFAR-100, and +ImageNet).
+
+
+
+
+ + ☆ On the Detection of Anomalous or Out-Of-Distribution Data in Vision + Models Using Statistical Techniques + + +
+ Out-of-distribution data and anomalous inputs are vulnerabilities of machine +learning systems today, often causing systems to make incorrect predictions. +The diverse range of data on which these models are used makes detecting +atypical inputs a difficult and important task. We assess a tool, Benford's +law, as a method used to quantify the difference between real and corrupted +inputs. We believe that in many settings, it could function as a filter for +anomalous data points and for signalling out-of-distribution data. We hope to +open a discussion on these applications and further areas where this technique +is underexplored. + +
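+ As a rough illustration of the idea, one can compare the leading-digit distribution of an input's values against Benford's law; the choice of statistic (an L1 distance here) and of what to take digits from (raw values, DCT coefficients, gradients) is an assumption for this sketch, not the paper's protocol.

```python
import numpy as np

def leading_digit_histogram(values):
    """Empirical distribution of the leading digits 1-9 of the absolute values."""
    v = np.abs(values[values != 0]).astype(float)
    digits = (v / 10.0 ** np.floor(np.log10(v))).astype(int)  # first significant digit
    counts = np.bincount(digits, minlength=10)[1:10]
    return counts / counts.sum()

def benford_distance(values):
    """L1 distance between the empirical leading-digit distribution and Benford's law."""
    benford = np.log10(1.0 + 1.0 / np.arange(1, 10))
    return np.abs(leading_digit_histogram(values) - benford).sum()

rng = np.random.default_rng(0)
clean = rng.lognormal(mean=0.0, sigma=2.0, size=50_000)   # spans magnitudes -> near-Benford
corrupted = rng.uniform(1.0, 9.99, size=50_000)           # uniform leading digits -> far off
print(f"clean: {benford_distance(clean):.3f}  corrupted: {benford_distance(corrupted):.3f}")
```

+ Inputs whose distance exceeds a calibrated threshold would then be flagged as anomalous or out-of-distribution.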
+
+
+
+
+ + ♻ ☆ Unveiling Typographic Deceptions: Insights of the Typographic + Vulnerability in Large Vision-Language Model + + +
+ Large Vision-Language Models (LVLMs) rely on vision encoders and Large +Language Models (LLMs) to exhibit remarkable capabilities on various +multi-modal tasks in the joint space of vision and language. However, the +Typographic Attack, which disrupts vision-language models (VLMs) such as +Contrastive Language-Image Pretraining (CLIP), has also been expected to be a +security threat to LVLMs. Firstly, we verify typographic attacks on current +well-known commercial and open-source LVLMs and uncover the widespread +existence of this threat. Secondly, to better assess this vulnerability, we +propose the most comprehensive and largest-scale Typographic Dataset to date. +The Typographic Dataset not only considers the evaluation of typographic +attacks under various multi-modal tasks but also evaluates the effects of +typographic attacks, influenced by texts generated with diverse factors. Based +on the evaluation results, we investigate the causes why typographic attacks +may impact VLMs and LVLMs, leading to three highly insightful discoveries. By +the examination of our discoveries and experimental validation in the +Typographic Dataset, we reduce the performance degradation from $42.07\%$ to +$13.90\%$ when LVLMs confront typographic attacks. + +
+
+
+
+
+ + ♻ ☆ The All-Seeing Project V2: Towards General Relation Comprehension of the + Open World + + +
+ We present the All-Seeing Project V2: a new model and dataset designed for +understanding object relations in images. Specifically, we propose the +All-Seeing Model V2 (ASMv2) that integrates the formulation of text generation, +object localization, and relation comprehension into a relation conversation +(ReC) task. Leveraging this unified task, our model excels not only in +perceiving and recognizing all objects within the image but also in grasping +the intricate relation graph between them, diminishing the relation +hallucination often encountered by Multi-modal Large Language Models (MLLMs). +To facilitate training and evaluation of MLLMs in relation understanding, we +created the first high-quality ReC dataset (AS-V2) which is aligned with the +format of standard instruction tuning data. In addition, we design a new +benchmark, termed Circular-based Relation Probing Evaluation (CRPE) for +comprehensively evaluating the relation comprehension capabilities of MLLMs. +Notably, our ASMv2 achieves an overall accuracy of 52.04 on this relation-aware +benchmark, surpassing the 43.14 of LLaVA-1.5 by a large margin. We hope that +our work can inspire more future research and contribute to the evolution +towards artificial general intelligence. Our project is released at +https://github.com/OpenGVLab/all-seeing.
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ m&m's: A Benchmark to Evaluate Tool-Use for multi-step multi-modal Tasks + + +
+ Real-world multi-modal problems are rarely solved by a single machine +learning model, and often require multi-step computational plans that involve +stitching several models. Tool-augmented LLMs hold tremendous promise for +automating the generation of such computational plans. However, the lack of +standardized benchmarks for evaluating LLMs as planners for multi-step +multi-modal tasks has prevented a systematic study of planner design decisions. +Should LLMs generate a full plan in a single shot or step-by-step? Should they +invoke tools directly with Python code or through structured data formats like +JSON? Does feedback improve planning? To answer these questions and more, we +introduce m&m's: a benchmark containing 4K+ multi-step multi-modal tasks +involving 33 tools that include multi-modal models, (free) public APIs, and +image processing modules. For each of these task queries, we provide +automatically generated plans using this realistic toolset. We further provide +a high-quality subset of 1,565 task plans that are human-verified and correctly +executable. With m&m's, we evaluate 6 popular LLMs with 2 planning strategies +(multi-step vs. step-by-step planning), 2 plan formats (JSON vs. code), and 3 +types of feedback (parsing/verification/execution). Finally, we summarize +takeaways from our extensive experiments. Our dataset and code are available on +HuggingFace (https://huggingface.co/datasets/zixianma/mnms) and Github +(https://github.com/RAIVNLab/mnms). + +
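+ To make the two plan formats concrete, here is a hypothetical multi-step plan written as structured JSON; the tool names and argument schema are invented for illustration and do not reproduce m&m's actual specification.

```python
import json

# A hypothetical 3-step plan: caption an image, translate the caption, synthesize speech.
plan = [
    {"id": 1, "tool": "image_captioning", "args": {"image": "photo.jpg"}},
    {"id": 2, "tool": "text_translation",
     "args": {"text": "<node-1>.caption", "target_lang": "fr"}},
    {"id": 3, "tool": "text_to_speech", "args": {"text": "<node-2>.translation"}},
]
print(json.dumps(plan, indent=2))

# The same plan in "code" format would instead be emitted as executable calls, e.g.:
#   caption = image_captioning(image="photo.jpg")
#   french  = text_translation(text=caption, target_lang="fr")
#   audio   = text_to_speech(text=french)
```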
+
+
+
+
+ + ♻ ☆ MedCycle: Unpaired Medical Report Generation via Cycle-Consistency + + +
+ Generating medical reports for X-ray images presents a significant challenge, +particularly in unpaired scenarios where access to paired image-report data for +training is unavailable. Previous works have typically learned a joint +embedding space for images and reports, necessitating a specific labeling +schema for both. We introduce an innovative approach that eliminates the need +for consistent labeling schemas, thereby enhancing data accessibility and +enabling the use of incompatible datasets. This approach is based on +cycle-consistent mapping functions that transform image embeddings into report +embeddings, coupled with report auto-encoding for medical report generation. +Our model and objectives consider intricate local details and the overarching +semantic context within images and reports. This approach facilitates the +learning of effective mapping functions, resulting in the generation of +coherent reports. It outperforms state-of-the-art results in unpaired chest +X-ray report generation, demonstrating improvements in both language and +clinical metrics. + +
+
+
+
+
+ + ♻ ☆ A Geospatial Approach to Predicting Desert Locust Breeding Grounds in + Africa + + +
+ Desert locust swarms present a major threat to agriculture and food security. +Addressing this challenge, our study develops an operationally-ready model for +predicting locust breeding grounds, which has the potential to enhance early +warning systems and targeted control measures. We curated a dataset from the +United Nations Food and Agriculture Organization's (UN-FAO) locust observation +records and analyzed it using two types of spatio-temporal input features: +remotely-sensed environmental and climate data as well as multi-spectral earth +observation images. Our approach employed custom deep learning models +(three-dimensional and LSTM-based recurrent convolutional networks), along with +the geospatial foundational model Prithvi recently released by Jakubik et al., +2023. These models notably outperformed existing baselines, with the +Prithvi-based model, fine-tuned on multi-spectral images from NASA's Harmonized +Landsat and Sentinel-2 (HLS) dataset, achieving the highest accuracy, F1 and +ROC-AUC scores (83.03%, 81.53% and 87.69%, respectively). A significant finding +from our research is that multi-spectral earth observation images alone are +sufficient for effective locust breeding ground prediction without the need to +explicitly incorporate climatic or environmental features. + +
+
+
+
+
+ + ♻ ☆ Towards Flexible, Scalable, and Adaptive Multi-Modal Conditioned Face + Synthesis + + +
+ Recent progress in multi-modal conditioned face synthesis has enabled the +creation of visually striking and accurately aligned facial images. Yet, +current methods still face issues with scalability, limited flexibility, and a +one-size-fits-all approach to control strength, not accounting for the +differing levels of conditional entropy, a measure of unpredictability in data +given some condition, across modalities. To address these challenges, we +introduce a novel uni-modal training approach with modal surrogates, coupled +with an entropy-aware modal-adaptive modulation, to support a flexible, scalable, +and adaptive multi-modal conditioned face synthesis network. Our uni-modal +training leverages only uni-modal data and uses modal surrogates to decorate the +conditions with modal-specific characteristics and to serve as links for +inter-modal collaboration; it thereby fully learns each modality's control over the +face synthesis process as well as inter-modal collaboration. The entropy-aware +modal-adaptive modulation finely adjusts diffusion noise according to +modal-specific characteristics and the given conditions, enabling well-informed +steps along the denoising trajectory and ultimately leading to synthesis results of +high fidelity and quality. Our framework improves multi-modal face synthesis +under various conditions, surpassing current methods in image quality and +fidelity, as demonstrated by our thorough experimental results.
+
+
+
+
+ + ♻ ☆ MedMamba: Vision Mamba for Medical Image Classification + + +
+ Medical image classification is a very fundamental and crucial task in the +field of computer vision. In recent years, CNN-based and Transformer-based models +have been widely used to classify various medical images. Unfortunately, the +limitation of CNNs in long-range modeling capabilities prevents them from +effectively extracting features in medical images, while Transformers are +hampered by their quadratic computational complexity. Recent research has shown +that the state space model (SSM) represented by Mamba can efficiently model +long-range interactions while maintaining linear computational complexity. +Inspired by this, we propose Vision Mamba for medical image classification +(MedMamba). More specifically, we introduce a novel Conv-SSM module. Conv-SSM +combines the local feature extraction ability of convolutional layers with the +ability of SSM to capture long-range dependency, thereby modeling medical +images with different modalities. To demonstrate the potential of MedMamba, we +conducted extensive experiments using 14 publicly available medical datasets +with different imaging techniques and two private datasets built by ourselves. +Extensive experimental results demonstrate that the proposed MedMamba performs +well in detecting lesions in various medical images. To the best of our +knowledge, this is the first Vision Mamba tailored for medical image +classification. The purpose of this work is to establish a new baseline for +medical image classification tasks and provide valuable insights for the future +development of more efficient and effective SSM-based artificial intelligence +algorithms and application systems in the medical field. Source code has been made +available at https://github.com/YubiaoYue/MedMamba.
+
+
+
+
+ + ♻ ☆ Instance-aware Exploration-Verification-Exploitation for Instance + ImageGoal Navigation + + +
+ As a new embodied vision task, Instance ImageGoal Navigation (IIN) aims to +navigate to a specified object depicted by a goal image in an unexplored +environment. + The main challenge of this task lies in identifying the target object from +different viewpoints while rejecting similar distractors. + Existing ImageGoal Navigation methods usually adopt the simple +Exploration-Exploitation framework and ignore the identification of the specific +instance during navigation. + In this work, we propose to imitate the human behaviour of ``getting closer +to confirm" when distinguishing objects from a distance. + Specifically, we design a new modular navigation framework named +Instance-aware Exploration-Verification-Exploitation (IEVE) for instance-level +image goal navigation. + Our method allows for active switching among the exploration, verification, +and exploitation actions, thereby facilitating the agent in making reasonable +decisions under different situations. + On the challenging Habitat-Matterport 3D semantic (HM3D-SEM) dataset, our +method surpasses previous state-of-the-art work, with a classical segmentation +model (0.684 vs. 0.561 success) or a robust model (0.702 vs. 0.561 success). +Our code will be made publicly available at https://github.com/XiaohanLei/IEVE.
+
+
+
+
+ + ♻ ☆ Generalizing deep learning models for medical image classification + + +
+ Numerous Deep Learning (DL) models have been developed for a large spectrum +of medical image analysis applications, which promises to reshape various +facets of medical practice. Despite early advances in DL model validation and +implementation, which encourage healthcare institutions to adopt them, some +fundamental questions remain: are DL models capable of generalizing? What +causes a drop in DL model performance? How can the DL model performance drop be +overcome? Medical data are dynamic and prone to domain shift: multiple factors, +such as updates to medical equipment, new imaging workflows, +and shifts in patient demographics or populations, can induce this drift over +time. In this paper, we review recent developments in generalization methods +for DL-based classification models. We also discuss future challenges, +including the need for improved evaluation protocols and benchmarks, and +envisioned future developments to achieve robust, generalized models for +medical image classification.
+
+
+
+
+ + ♻ ☆ Chain-of-Spot: Interactive Reasoning Improves Large Vision-Language + Models + + +
+ In the realm of vision-language understanding, the proficiency of models in +interpreting and reasoning over visual content has become a cornerstone for +numerous applications. However, it is challenging for the visual encoder in +Large Vision-Language Models (LVLMs) to extract useful features tailored to +questions that aid the language model's response. Furthermore, a common +practice among existing LVLMs is to utilize lower-resolution images, which +restricts the ability for visual recognition. Our work introduces the +Chain-of-Spot (CoS) method, which we describe as Interactive Reasoning, a novel +approach that enhances feature extraction by focusing on key regions of +interest (ROI) within the image, corresponding to the posed questions or +instructions. This technique allows LVLMs to access more detailed visual +information without altering the original image resolution, thereby offering +multi-granularity image features. By integrating Chain-of-Spot with +instruct-following LLaVA-1.5 models, the process of image reasoning +consistently improves performance across a wide range of multimodal datasets +and benchmarks without bells and whistles and achieves new state-of-the-art +results. Our empirical findings demonstrate a significant improvement in LVLMs' +ability to understand and reason about visual content, paving the way for more +sophisticated visual instruction-following applications. Code and models are +available at https://github.com/dongyh20/Chain-of-Spot + +
+
+ comment: Project Page: https://sites.google.com/view/chain-of-spot/ +
+
+
+
+
+ + ♻ ☆ Neural Radiance Fields in Medical Imaging: Challenges and Next Steps + + +
+ Neural Radiance Fields (NeRF), as a pioneering technique in computer vision, +offer great potential to revolutionize medical imaging by synthesizing +three-dimensional representations from the projected two-dimensional image +data. However, they face unique challenges when applied to medical +applications. This paper presents a comprehensive examination of applications +of NeRFs in medical imaging, highlighting four imminent challenges, including +fundamental imaging principles, inner structure requirement, object boundary +definition, and color density significance. We discuss current methods on +different organs and discuss related limitations. We also review several +datasets and evaluation metrics and propose several promising directions for +future research. + +
+
+
+
+
+ + ♻ ☆ Learning a Depth Covariance Function CVPR 2023 + + +
+ We propose learning a depth covariance function with applications to +geometric vision tasks. Given RGB images as input, the covariance function can +be flexibly used to define priors over depth functions, predictive +distributions given observations, and methods for active point selection. We +leverage these techniques for a selection of downstream tasks: depth +completion, bundle adjustment, and monocular dense visual odometry. + +
+
+ comment: CVPR 2023. Project page: https://edexheim.github.io/DepthCov/ +
+
+
+
+
+ + ♻ ☆ T-MAE: Temporal Masked Autoencoders for Point Cloud Representation + Learning + + +
+ The scarcity of annotated data in LiDAR point cloud understanding hinders +effective representation learning. Consequently, scholars have been actively +investigating efficacious self-supervised pre-training paradigms. Nevertheless, +temporal information, which is inherent in the LiDAR point cloud sequence, is +consistently disregarded. To better utilize this property, we propose an +effective pre-training strategy, namely Temporal Masked Auto-Encoders (T-MAE), +which takes as input temporally adjacent frames and learns temporal dependency. +A SiamWCA backbone, containing a Siamese encoder and a windowed cross-attention +(WCA) module, is established for the two-frame input. Considering that the +movement of an ego-vehicle alters the view of the same instance, temporal +modeling also serves as a robust and natural data augmentation, enhancing the +comprehension of target objects. SiamWCA is a powerful architecture but heavily +relies on annotated data. Our T-MAE pre-training strategy alleviates its demand +for annotated data. Comprehensive experiments demonstrate that T-MAE achieves +the best performance on both Waymo and ONCE datasets among competitive +self-supervised approaches. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Ins-HOI: Instance Aware Human-Object Interactions Recovery + + +
+ Accurately modeling detailed interactions between human/hand and object is an +appealing yet challenging task. Current multi-view capture systems are only +capable of reconstructing multiple subjects into a single, unified mesh, which +fails to model the states of each instance individually during interactions. To +address this, previous methods use template-based representations to track +human/hand and object. However, the quality of the reconstructions is limited +by the descriptive capabilities of the templates, so these methods +inherently struggle with geometry details, pressing deformations and invisible +contact surfaces. In this work, we propose an end-to-end Instance-aware +Human-Object Interactions recovery (Ins-HOI) framework by introducing an +instance-level occupancy field representation. However, the real-captured data +is presented as a holistic mesh, unable to provide instance-level supervision. +To address this, we further propose a complementary training strategy that +leverages synthetic data to introduce instance-level shape priors, enabling the +disentanglement of occupancy fields for different instances. Specifically, +synthetic data, created by randomly combining individual scans of humans/hands +and objects, guides the network to learn a coarse prior of instances. +Meanwhile, real-captured data helps in learning the overall geometry and +restricting interpenetration in contact areas. As demonstrated in experiments, +our method Ins-HOI supports instance-level reconstruction and provides +reasonable and realistic invisible contact surfaces even in cases of extremely +close interaction. To facilitate the research of this task, we collect a +large-scale, high-fidelity 3D scan dataset, including 5.2k high-quality scans +with real-world human-chair and hand-object interactions. The code and data +will be public for research purposes.
+
+ comment: Project Page: https://jiajunzhang16.github.io/ins-hoi/ , Code and + Dataset Page: https://github.com/jiajunzhang16/ins-hoi +
+
+
+
+
+ + ♻ ☆ GIVT: Generative Infinite-Vocabulary Transformers + + +
+ We introduce generative infinite-vocabulary transformers (GIVT) which +generate vector sequences with real-valued entries, instead of discrete tokens +from a finite vocabulary. To this end, we propose two surprisingly simple +modifications to decoder-only transformers: 1) at the input, we replace the +finite-vocabulary lookup table with a linear projection of the input vectors; +and 2) at the output, we replace the logits prediction (usually mapped to a +categorical distribution) with the parameters of a multivariate Gaussian +mixture model. Inspired by the image-generation paradigm of VQ-GAN and MaskGIT, +where transformers are used to model the discrete latent sequences of a VQ-VAE, +we use GIVT to model the unquantized real-valued latent sequences of a +$\beta$-VAE. In class-conditional image generation GIVT outperforms VQ-GAN (and +improved variants thereof) as well as MaskGIT, and achieves performance +competitive with recent latent diffusion models. Finally, we obtain strong +results outside of image generation when applying GIVT to panoptic segmentation +and depth estimation with a VAE variant of the UViM framework + +
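+ The two modifications can be sketched as a small PyTorch module: a linear projection replaces the token-embedding lookup at the input, and the output head predicts mixture weights, means, and scales of a Gaussian mixture instead of vocabulary logits. The dimensions and number of mixture components below are arbitrary choices for the sketch, not GIVT's configuration.

```python
import torch
import torch.nn as nn

class ContinuousTokenIO(nn.Module):
    """Input/output layers for a GIVT-style decoder over real-valued token vectors."""
    def __init__(self, latent_dim=16, d_model=256, n_components=8):
        super().__init__()
        self.in_proj = nn.Linear(latent_dim, d_model)          # replaces the embedding lookup
        self.out_head = nn.Linear(d_model, n_components * (1 + 2 * latent_dim))
        self.k, self.d = n_components, latent_dim

    def embed(self, x):                      # x: (batch, seq, latent_dim)
        return self.in_proj(x)

    def mixture(self, h):                    # h: (batch, seq, d_model)
        params = self.out_head(h)
        logits, mu, log_sigma = params.split(
            [self.k, self.k * self.d, self.k * self.d], dim=-1)
        mix = torch.distributions.Categorical(logits=logits)
        comp = torch.distributions.Independent(
            torch.distributions.Normal(
                mu.view(*mu.shape[:-1], self.k, self.d),
                log_sigma.view(*mu.shape[:-1], self.k, self.d).exp()), 1)
        return torch.distributions.MixtureSameFamily(mix, comp)

io = ContinuousTokenIO()
x = torch.randn(2, 10, 16)                   # unquantized VAE latent sequence
h = io.embed(x)                              # would be fed to a transformer decoder
gmm = io.mixture(h)                          # per-position distribution over next vectors
loss = -gmm.log_prob(x).mean()               # toy teacher-forced NLL (no sequence shifting)
print(loss.item())
```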
+
+ comment: v2: add related NLP work, loss details. v3: Improved GMM formulation, + added adapter module, larger models, better image generation results. Code + and model checkpoints are available at: + https://github.com/google-research/big_vision +
+
+
+
+
+ + ♻ ☆ Closing the Gap: Achieving Better Accuracy-Robustness Tradeoffs against + Query-Based Attacks AAAI + + +
+ Although promising, existing defenses against query-based attacks share a +common limitation: they offer increased robustness against attacks at the price +of a considerable accuracy drop on clean samples. In this work, we show how to +efficiently establish, at test-time, a solid tradeoff between robustness and +accuracy when mitigating query-based attacks. Given that these attacks +necessarily explore low-confidence regions, our insight is that activating +dedicated defenses, such as random noise defense and random image +transformations, only for low-confidence inputs is sufficient to prevent them. +Our approach is independent of training and supported by theory. We verify the +effectiveness of our approach for various existing defenses by conducting +extensive experiments on CIFAR-10, CIFAR-100, and ImageNet. Our results confirm +that our proposal can indeed enhance these defenses by providing better +tradeoffs between robustness and accuracy when compared to state-of-the-art +approaches while being completely training-free. + +
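+ The test-time gating idea can be sketched as follows: a dedicated defense (simple Gaussian input noise here) is activated only when the classifier's confidence falls below a threshold, leaving high-confidence clean inputs untouched. The classifier, threshold, and noise scale are placeholders, not the paper's settings.

```python
import numpy as np

rng = np.random.default_rng(0)

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def classifier(x, W):
    # Stand-in linear classifier; in practice this is the deployed network.
    return softmax(x @ W)

def gated_predict(x, W, tau=0.7, noise_sigma=0.1, n_samples=8):
    """Apply the noise defense only for low-confidence inputs that query-based attacks explore."""
    probs = classifier(x, W)
    if probs.max() >= tau:
        return probs                                    # confident region: plain prediction
    noisy = x + noise_sigma * rng.standard_normal((n_samples, x.size))
    return classifier(noisy, W).mean(axis=0)            # randomized output frustrates the attacker

W = rng.standard_normal((32, 10))
x = rng.standard_normal(32)
print(gated_predict(x, W).round(3))
```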
+
+ comment: To appear in the Proceedings of the AAAI Conference on Artificial + Intelligence (AAAI) 2024 +
+
+
+
+
+ + ♻ ☆ Self-Supervised Class-Agnostic Motion Prediction with Spatial and + Temporal Consistency Regularizations CVPR2024 + + +
+ The perception of motion behavior in a dynamic environment holds significant +importance for autonomous driving systems, wherein class-agnostic motion +prediction methods directly predict the motion of the entire point cloud. While +most existing methods rely on fully-supervised learning, the manual labeling of +point cloud data is laborious and time-consuming. Therefore, several +annotation-efficient methods have been proposed to address this challenge. +Although effective, these methods rely on weak annotations or additional +multi-modal data like images, and the potential benefits inherent in the point +cloud sequence are still underexplored. To this end, we explore the feasibility +of self-supervised motion prediction with only unlabeled LiDAR point clouds. +Initially, we employ an optimal transport solver to establish coarse +correspondences between current and future point clouds as the coarse pseudo +motion labels. Training models directly using such coarse labels leads to +noticeable spatial and temporal prediction inconsistencies. To mitigate these +issues, we introduce three simple spatial and temporal regularization losses, +which facilitate the self-supervised training process effectively. Experimental +results demonstrate the significant superiority of our approach over the +state-of-the-art self-supervised methods. + +
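+ A rough sketch of the pseudo-labeling step: match points of the current frame to the next frame and take each match's displacement as a coarse motion label. Hungarian assignment on pairwise distances is used here as a simple stand-in for the paper's optimal transport solver.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

def coarse_motion_labels(points_t, points_t1):
    """Pseudo motion vectors for points_t by matching them to points_t1 (both: N x 3)."""
    cost = np.linalg.norm(points_t[:, None, :] - points_t1[None, :, :], axis=-1)
    rows, cols = linear_sum_assignment(cost)       # one-to-one matching, minimal total distance
    flow = np.zeros_like(points_t)
    flow[rows] = points_t1[cols] - points_t[rows]  # displacement = coarse pseudo motion label
    return flow

rng = np.random.default_rng(0)
pts = rng.uniform(-10, 10, size=(100, 3))
true_motion = np.array([0.5, 0.0, 0.1])
pts_next = pts + true_motion + 0.05 * rng.standard_normal(pts.shape)
flow = coarse_motion_labels(pts, pts_next)
print(flow.mean(axis=0).round(2))                  # close to the injected motion
```

+ The paper's spatial and temporal regularization losses would then be applied on top of models trained with such noisy labels.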
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ ColonNeRF: High-Fidelity Neural Reconstruction of Long Colonoscopy + + +
+ Colonoscopy reconstruction is pivotal for diagnosing colorectal cancer. +However, accurate long-sequence colonoscopy reconstruction faces three major +challenges: (1) dissimilarity among segments of the colon due to its meandering +and convoluted shape; (2) co-existence of simple and intricately folded +geometry structures; (3) sparse viewpoints due to constrained camera +trajectories. To tackle these challenges, we introduce a new reconstruction +framework based on neural radiance field (NeRF), named ColonNeRF, which +leverages neural rendering for novel view synthesis of long-sequence +colonoscopy. Specifically, to reconstruct the entire colon in a piecewise +manner, our ColonNeRF introduces a region division and integration module, +effectively reducing shape dissimilarity and ensuring geometric consistency in +each segment. To learn both the simple and complex geometry in a unified +framework, our ColonNeRF incorporates a multi-level fusion module that +progressively models the colon regions from easy to hard. Additionally, to +overcome the challenges from sparse views, we devise a DensiNet module for +densifying camera poses under the guidance of semantic consistency. We conduct +extensive experiments on both synthetic and real-world datasets to evaluate our +ColonNeRF. Quantitatively, ColonNeRF exhibits a 67%-85% increase in LPIPS-ALEX +scores. Qualitatively, our reconstruction visualizations show much clearer +textures and more accurate geometric details. These sufficiently demonstrate +our superior performance over the state-of-the-art methods. + +
+
+ comment: for Project Page, see https://showlab.github.io/ColonNeRF/ +
+
+
+
+
+ + ♻ ☆ Neuromorphic Imaging and Classification with Graph Learning + + +
+ Bio-inspired neuromorphic cameras asynchronously record pixel brightness +changes and generate sparse event streams. They can capture dynamic scenes with +little motion blur and more details in extreme illumination conditions. Due to +the multidimensional address-event structure, most existing vision algorithms +cannot properly handle asynchronous event streams. While several event +representations and processing methods have been developed to address such an +issue, they are typically driven by a large number of events, leading to +substantial overheads in runtime and memory. In this paper, we propose a new +graph representation of the event data and couple it with a Graph Transformer +to perform accurate neuromorphic classification. Extensive experiments show +that our approach leads to better results and excels at the challenging +realistic situations where only a small number of events and limited +computational resources are available, paving the way for neuromorphic +applications embedded into mobile facilities. + +
+
+ comment: 15 pages, 4 figures, and 7 tables. Accepted by Elsevier + Neurocomputing +
+
+
+
+
+ + ♻ ☆ BiTT: Bi-directional Texture Reconstruction of Interacting Two Hands + from a Single Image CVPR 2024 + + +
+ Creating personalized hand avatars is important to offer a realistic +experience to users on AR / VR platforms. While most prior studies focused on +reconstructing 3D hand shapes, some recent work has tackled the reconstruction +of hand textures on top of shapes. However, these methods are often limited to +capturing pixels on the visible side of a hand, requiring diverse views of the +hand in a video or multiple images as input. In this paper, we propose a novel +method, BiTT(Bi-directional Texture reconstruction of Two hands), which is the +first end-to-end trainable method for relightable, pose-free texture +reconstruction of two interacting hands taking only a single RGB image, by +three novel components: 1) bi-directional (left $\leftrightarrow$ right) +texture reconstruction using the texture symmetry of left / right hands, 2) +utilizing a texture parametric model for hand texture recovery, and 3) the +overall coarse-to-fine stage pipeline for reconstructing personalized texture +of two interacting hands. BiTT first estimates the scene light condition and +albedo image from an input image, then reconstructs the texture of both hands +through the texture parametric model and bi-directional texture reconstructor. +In experiments using InterHand2.6M and RGB2Hands datasets, our method +significantly outperforms state-of-the-art hand texture reconstruction methods +quantitatively and qualitatively. The code is available at +https://github.com/yunminjin2/BiTT + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ An explainable three dimension framework to uncover learning patterns: A + unified look in variable sulci recognition + + +
+ Explainable AI is crucial in medical imaging. In the challenging field of +neuroscience, visual topics present a high level of complexity, particularly +within three-dimensional space. The application considered here, identifying brain +sulcal features from MRI, faces significant hurdles due to +varying annotation protocols among experts and the intricate three-dimensional +functionality of the brain. Consequently, traditional explainability approaches +fall short in effectively validating and evaluating these networks. To address +this, we first present a mathematical formulation delineating various +categories of explanation needs across diverse computer vision tasks, +categorized into self-explanatory, semi-explanatory, non-explanatory, and +new-pattern learning applications based on the reliability of the validation +protocol. With respect to this mathematical formulation, we propose a 3D +explainability framework aimed at validating the outputs of deep learning +networks in detecting the paracingulate sulcus, an essential brain anatomical +feature. The framework integrates local 3D explanations, global explanations +through dimensionality reduction, concatenated global explanations, and +statistical shape features, unveiling new insights into pattern learning. We +trained and tested two advanced 3D deep learning networks on the challenging +TOP-OSLO dataset, significantly improving sulcus detection accuracy, +particularly on the left hemisphere. During evaluation with diverse annotation +protocols for this dataset, we highlighted the crucial role of an unbiased +annotation process in achieving precise predictions and effective pattern +learning within our proposed 3D framework. The proposed framework not only +annotates the variable sulcus but also uncovers hidden AI knowledge, promising +to advance our understanding of brain anatomy and function.
+
+
+
+
+ + ♻ ☆ A Generative Approach for Wikipedia-Scale Visual Entity Recognition CVPR2024 + + +
+ In this paper, we address web-scale visual entity recognition, specifically +the task of mapping a given query image to one of the 6 million existing +entities in Wikipedia. One way of approaching a problem of such scale is using +dual-encoder models (eg CLIP), where all the entity names and query images are +embedded into a unified space, paving the way for an approximate k-NN search. +Alternatively, it is also possible to re-purpose a captioning model to directly +generate the entity names for a given image. In contrast, we introduce a novel +Generative Entity Recognition (GER) framework, which given an input image +learns to auto-regressively decode a semantic and discriminative ``code'' +identifying the target entity. Our experiments demonstrate the efficacy of this +GER paradigm, showcasing state-of-the-art performance on the challenging OVEN +benchmark. GER surpasses strong captioning, dual-encoder, visual matching and +hierarchical classification baselines, affirming its advantage in tackling the +complexities of web-scale recognition. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ Analyzing Local Representations of Self-supervised Vision Transformers + + +
+ In this paper, we present a comparative analysis of various self-supervised +Vision Transformers (ViTs), focusing on their local representative power. +Inspired by large language models, we examine the abilities of ViTs to perform +various computer vision tasks with little to no fine-tuning. We design an +evaluation framework to analyze the quality of local, i.e. patch-level, +representations in the context of few-shot semantic segmentation, instance +identification, object retrieval and tracking. We discover that contrastive +learning based methods like DINO produce more universal patch representations +that can be immediately applied for downstream tasks with no parameter tuning, +compared to masked image modeling. The embeddings learned using the latter +approach, e.g. in masked autoencoders, have high-variance features that harm +distance-based algorithms, such as k-NN, and do not contain useful information +for most downstream tasks. Furthermore, we demonstrate that removing these +high-variance features enhances k-NN for MAE, as well as for its recent +extension Scale-MAE. Finally, we find an object instance retrieval setting +where DINOv2, a model pretrained on two orders of magnitude more data, falls +short of its less compute-intensive counterpart DINO.
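+ The variance-filtering observation is easy to reproduce in a few lines: estimate per-dimension variance of the embeddings on a reference set, drop the highest-variance dimensions, and run a cosine k-NN on what remains. The fraction of dimensions removed here is an arbitrary choice for the sketch, not the paper's setting.

```python
import numpy as np

def drop_high_variance_dims(train_feats, test_feats, drop_frac=0.05):
    """Remove the dimensions with the largest variance (estimated on the train set)."""
    var = train_feats.var(axis=0)
    keep = np.argsort(var)[: int(round(train_feats.shape[1] * (1 - drop_frac)))]
    return train_feats[:, keep], test_feats[:, keep]

def knn_predict(train_feats, train_labels, test_feats, k=5):
    """Cosine-similarity k-NN with majority vote."""
    a = train_feats / np.linalg.norm(train_feats, axis=1, keepdims=True)
    b = test_feats / np.linalg.norm(test_feats, axis=1, keepdims=True)
    nn = np.argsort(-(b @ a.T), axis=1)[:, :k]
    votes = train_labels[nn]
    return np.array([np.bincount(v).argmax() for v in votes])

rng = np.random.default_rng(0)
train = rng.standard_normal((500, 768))   # e.g. MAE patch/CLS embeddings (random stand-in)
labels = rng.integers(0, 10, size=500)
test = rng.standard_normal((50, 768))
tr, te = drop_high_variance_dims(train, test)
print(knn_predict(tr, labels, te)[:10])
```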
+
+
+
+
+ + ♻ ☆ Enhanced Few-Shot Class-Incremental Learning via Ensemble Models + + +
+ Few-shot class-incremental learning (FSCIL) aims to continually fit new +classes with limited training data, while maintaining the performance of +previously learned classes. The main challenges are overfitting the rare new +training samples and forgetting old classes. While catastrophic forgetting has +been extensively studied, the overfitting problem has attracted less attention +in FSCIL. To tackle the overfitting challenge, we design a new ensemble model +framework combined with data augmentation to boost generalization. In this +way, the enhanced model works as a library storing abundant features to +guarantee fast adaptation to downstream tasks. Specifically, the multi-input +multi-output ensemble structure is applied with a spatial-aware data +augmentation strategy, aiming at diversifying the feature extractor and +alleviating overfitting in incremental sessions. Moreover, self-supervised +learning is also integrated to further improve the model generalization. +Comprehensive experimental results show that the proposed method can indeed +mitigate the overfitting problem in FSCIL, and outperform the state-of-the-art +methods.
+
+
+
+
+ + ♻ ☆ Separate and Conquer: Decoupling Co-occurrence via Decomposition and + Representation for Weakly Supervised Semantic Segmentation CVPR 2024 + + +
+ Weakly supervised semantic segmentation (WSSS) with image-level labels aims +to achieve segmentation tasks without dense annotations. However, attributed to +the frequent coupling of co-occurring objects and the limited supervision from +image-level labels, the challenging co-occurrence problem is widely present and +leads to false activation of objects in WSSS. In this work, we devise a +'Separate and Conquer' scheme SeCo to tackle this issue from dimensions of +image space and feature space. In the image space, we propose to 'separate' the +co-occurring objects with image decomposition by subdividing images into +patches. Importantly, we assign each patch a category tag from Class Activation +Maps (CAMs), which spatially helps remove the co-context bias and guide the +subsequent representation. In the feature space, we propose to 'conquer' the +false activation by enhancing semantic representation with multi-granularity +knowledge contrast. To this end, a dual-teacher-single-student architecture is +designed and tag-guided contrast is conducted, which guarantee the correctness +of knowledge and further facilitate the discrepancy among co-contexts. We +streamline the multi-staged WSSS pipeline end-to-end and tackle this issue +without external supervision. Extensive experiments are conducted, validating +the efficiency of our method and the superiority over previous single-staged +and even multi-staged competitors on PASCAL VOC and MS COCO. Code is available +at https://github.com/zwyang6/SeCo.git. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Visually-Aware Context Modeling for News Image Captioning NAACL 2024 + + +
+ News Image Captioning aims to create captions from news articles and images, +emphasizing the connection between textual context and visual elements. +Recognizing the significance of human faces in news images and the face-name +co-occurrence pattern in existing datasets, we propose a face-naming module for +learning better name embeddings. Apart from names, which can be directly linked +to an image area (faces), news image captions mostly contain context +information that can only be found in the article. We design a retrieval +strategy using CLIP to retrieve sentences that are semantically close to the +image, mimicking human thought process of linking articles to images. +Furthermore, to tackle the problem of the imbalanced proportion of article +context and image context in captions, we introduce a simple yet effective +method Contrasting with Language Model backbone (CoLaM) to the training +pipeline. We conduct extensive experiments to demonstrate the efficacy of our +framework. We out-perform the previous state-of-the-art (without external data) +by 7.97/5.80 CIDEr scores on GoodNews/NYTimes800k. Our code is available at +https://github.com/tingyu215/VACNIC. + +
+
+ comment: Accepted at NAACL 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ An Active Contour Model Driven By the Hybrid Signed Pressure Function + + +
+ Due to the influence of imaging equipment and complex imaging environments, +most images in daily life suffer from intensity inhomogeneity and noise. +Therefore, many scholars have designed image segmentation algorithms to +address these issues. Among them, the active contour model is one of the most +effective image segmentation algorithms. This paper proposes an active contour +model driven by a hybrid signed pressure function that combines global and +local information. First, a new global region-based signed +pressure function is introduced by combining the average intensity of the inner +and outer regions of the curve with the median intensity of the inner region of +the evolution curve. Then, the energy differences between the +inner and outer regions of the curve within a local region are used to design the signed +pressure function of the local term. The two SPF functions are then combined to obtain +a new signed pressure function, from which the evolution equation of the new model is derived. +Finally, experiments and numerical analysis show that the model has excellent +segmentation performance for both intensity-inhomogeneous images and noisy +images.
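+ For orientation, a common global signed pressure function in this family of models has the form below; the paper's hybrid additionally mixes in the inner-region median intensity and a local energy-based term, and its exact weighting is not reproduced here.

```latex
% A representative global SPF (SBGFRLS-style); c_1, c_2 are the mean intensities
% inside/outside the evolving curve and I(x) is the image intensity.
\[
  \mathrm{spf}_{\text{global}}(x) \;=\;
  \frac{I(x) - \tfrac{c_1 + c_2}{2}}
       {\max_{x}\bigl|\,I(x) - \tfrac{c_1 + c_2}{2}\,\bigr|},
  \qquad
  \mathrm{spf}_{\text{hybrid}}(x) \;=\;
  \omega\,\mathrm{spf}_{\text{global}}(x) \;+\; (1-\omega)\,\mathrm{spf}_{\text{local}}(x),
  \quad \omega \in [0,1].
\]
```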
+
+
+
+
+ + ♻ ☆ Direct2.5: Diverse Text-to-3D Generation via Multi-view 2.5D Diffusion CVPR 2024 + + +
+ Recent advances in generative AI have unveiled significant potential for the +creation of 3D content. However, current methods either apply a pre-trained 2D +diffusion model with the time-consuming score distillation sampling (SDS), or a +direct 3D diffusion model trained on limited 3D data losing generation +diversity. In this work, we approach the problem by employing a multi-view 2.5D +diffusion fine-tuned from a pre-trained 2D diffusion model. The multi-view 2.5D +diffusion directly models the structural distribution of 3D data, while still +maintaining the strong generalization ability of the original 2D diffusion +model, filling the gap between 2D diffusion-based and direct 3D diffusion-based +methods for 3D content generation. During inference, multi-view normal maps are +generated using the 2.5D diffusion, and a novel differentiable rasterization +scheme is introduced to fuse the almost consistent multi-view normal maps into +a consistent 3D model. We further design a normal-conditioned multi-view image +generation module for fast appearance generation given the 3D geometry. Our +method is a one-pass diffusion process and does not require any SDS +optimization as post-processing. We demonstrate through extensive experiments +that, our direct 2.5D generation with the specially-designed fusion scheme can +achieve diverse, mode-seeking-free, and high-fidelity 3D content generation in +only 10 seconds. Project page: https://nju-3dv.github.io/projects/direct25. + +
+
+ comment: CVPR 2024 camera ready, including more evaluations and discussions. + Project webpage: https://nju-3dv.github.io/projects/direct25 +
+
+
+
+
+ + ♻ ☆ Alleviating Exposure Bias in Diffusion Models through Sampling with + Shifted Time Steps ICLR2024 + + +
+ Diffusion Probabilistic Models (DPM) have shown remarkable efficacy in the +synthesis of high-quality images. However, their inference process +characteristically requires numerous, potentially hundreds, of iterative steps, +which could exaggerate the problem of exposure bias due to the training and +inference discrepancy. Previous work has attempted to mitigate this issue by +perturbing inputs during training, which consequently mandates the retraining +of the DPM. In this work, we conduct a systematic study of exposure bias in DPM +and, intriguingly, we find that the exposure bias could be alleviated with a +novel sampling method that we propose, without retraining the model. We +empirically and theoretically show that, during inference, for each backward +time step $t$ and corresponding state $\hat{x}_t$, there might exist another +time step $t_s$ which exhibits superior coupling with $\hat{x}_t$. Based on +this finding, we introduce a sampling method named Time-Shift Sampler. Our +framework can be seamlessly integrated to existing sampling algorithms, such as +DDPM, DDIM and other high-order solvers, inducing merely minimal additional +computations. Experimental results show our method brings significant and +consistent improvements in FID scores on different datasets and sampling +methods. For example, integrating Time-Shift Sampler to F-PNDM yields a +FID=3.88, achieving 44.49\% improvements as compared to F-PNDM, on CIFAR-10 +with 10 sampling steps, which is more performant than the vanilla DDIM with 100 +sampling steps. Our code is available at https://github.com/Mingxiao-Li/TS-DPM. + +
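+ A heavily simplified sketch of sampling with shifted time steps: at each backward step, instead of always using the scheduled step t, the sampler searches a small window of candidate steps and picks the one whose nominal noise level best matches the current state, then denoises with that step's coefficients. The variance-matching criterion, assumed data variance, and window size below are illustrative assumptions; see the paper and repository for the actual Time-Shift Sampler.

```python
import numpy as np

T = 1000
betas = np.linspace(1e-4, 0.02, T)
alpha_bar = np.cumprod(1.0 - betas)               # standard DDPM noise schedule

def expected_var(t, data_var=0.25):
    # Marginal variance of x_t under the forward process, assuming Var(x_0) = data_var.
    return alpha_bar[t] * data_var + (1.0 - alpha_bar[t])

def shifted_time_step(x_t, t, window=10, data_var=0.25):
    """Pick the step in [t-window, t+window] whose scheduled variance best matches x_t."""
    candidates = np.arange(max(0, t - window), min(T - 1, t + window) + 1)
    gap = np.abs(expected_var(candidates, data_var) - x_t.var())
    return int(candidates[np.argmin(gap)])

rng = np.random.default_rng(0)
x_t = rng.standard_normal(3 * 32 * 32) * 0.9      # a slightly "off-schedule" sample
print(shifted_time_step(x_t, t=700))              # may differ from 700; use its coefficients
```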
+
+ comment: Accepted at International Conference on Learning Representations + (ICLR2024) +
+
+
+
+
+ + ♻ ☆ To use or not to use proprietary street view images in (health and + place) research? That is the question + + +
+ Computer vision-based analysis of street view imagery has transformative +impacts on environmental assessments. Interactive web services, particularly +Google Street View, play an ever-important role in making imagery data +ubiquitous. Despite the technical ease of harnessing millions of Google Street +View images, this article questions the current practices in using this +proprietary data source from a European viewpoint. Our concern lies with +Google's terms of service, which restrict bulk image downloads and the +generation of street view image-based indices. To reconcile the challenge of +advancing society through groundbreaking research while maintaining data +license agreements and legal integrity, we believe it is crucial to 1) include +an author's statement on using proprietary street view data and the directives +it entails, 2) negotiate academic-specific license to democratize Google Street +View data access, and 3) adhere to open data principles and utilize open image +sources for future research. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Video Domain Adaptation with Masked Pre-Training and + Collaborative Self-Training CVPR 2024 + + +
+ In this work, we tackle the problem of unsupervised domain adaptation (UDA) +for video action recognition. Our approach, which we call UNITE, uses an image +teacher model to adapt a video student model to the target domain. UNITE first +employs self-supervised pre-training to promote discriminative feature learning +on target domain videos using a teacher-guided masked distillation objective. +We then perform self-training on masked target data, using the video student +model and image teacher model together to generate improved pseudolabels for +unlabeled target videos. Our self-training process successfully leverages the +strengths of both models to achieve strong transfer performance across domains. +We evaluate our approach on multiple video domain adaptation benchmarks and +observe significant improvements upon previously reported results. + +
+
+ comment: Accepted at CVPR 2024. 13 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ AI-KD: Adversarial learning and Implicit regularization for + self-Knowledge Distillation + + +
+ We present a novel adversarially penalized self-knowledge distillation method, +named adversarial learning and implicit regularization for self-knowledge +distillation (AI-KD), which regularizes the training procedure by adversarial +learning and implicit distillation. Our model not only distills the +deterministic and progressive knowledge obtained from the pre-trained model and the +previous epoch's predictive probabilities but also transfers knowledge of the +deterministic predictive distributions using adversarial learning. The +motivation is that self-knowledge distillation methods regularize the +predictive probabilities with soft targets, but the exact distributions may be +hard to predict. Our method deploys a discriminator to distinguish the +distributions of the pre-trained and student models, while the student +model is trained to fool the discriminator during training. Thus, the +student model can not only learn the pre-trained model's predictive +probabilities but also align the distributions between the pre-trained and +student models. We demonstrate the effectiveness of the proposed method with +various network architectures on multiple datasets and show that the proposed method +achieves better performance than state-of-the-art methods.
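+ The overall objective can be pictured as a standard soft-target distillation term plus an adversarial term in which a discriminator tries to tell teacher and student output distributions apart. The architecture sizes, temperature, and loss weights below are illustrative placeholders, not the paper's settings.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

num_classes, tau = 10, 4.0
student = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, num_classes))
teacher = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, num_classes))
discriminator = nn.Sequential(nn.Linear(num_classes, 32), nn.ReLU(), nn.Linear(32, 1))

x = torch.randn(16, 32)
y = torch.randint(0, num_classes, (16,))
with torch.no_grad():
    t_logits = teacher(x)                          # pre-trained model, kept frozen
s_logits = student(x)

# 1) Task loss + soft-target distillation (standard self-KD ingredients).
kd = F.kl_div(F.log_softmax(s_logits / tau, dim=1),
              F.softmax(t_logits / tau, dim=1), reduction="batchmean") * tau ** 2
task = F.cross_entropy(s_logits, y)

# 2) Adversarial term: the student tries to make its predictive distribution
#    indistinguishable from the teacher's, as judged by the discriminator.
real = discriminator(F.softmax(t_logits, dim=1))
fake = discriminator(F.softmax(s_logits, dim=1))
d_loss = F.binary_cross_entropy_with_logits(real, torch.ones_like(real)) + \
         F.binary_cross_entropy_with_logits(fake.detach(), torch.zeros_like(fake))
g_loss = F.binary_cross_entropy_with_logits(fake, torch.ones_like(fake))

student_loss = task + kd + 0.1 * g_loss            # optimized w.r.t. the student
print(float(student_loss), float(d_loss))          # d_loss is optimized w.r.t. the discriminator
```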
+
+ comment: Accepted to KBS +
+
+
+
+
+ + ♻ ☆ Consistency Enhancement-Based Deep Multiview Clustering via Contrastive + Learning + + +
+ Multiview clustering (MVC) segregates data samples into meaningful clusters +by synthesizing information across multiple views. Moreover, deep +learning-based methods have demonstrated their strong feature learning +capabilities in MVC scenarios. However, effectively generalizing feature +representations while maintaining consistency is still an intractable problem. +In addition, most existing deep clustering methods based on contrastive +learning overlook the consistency of the clustering representations during the +clustering process. In this paper, we show how the above problems can be +overcome and propose a consistent enhancement-based deep MVC method via +contrastive learning (CCEC). Specifically, semantic connection blocks are +incorporated into a feature representation to preserve the consistent +information among multiple views. Furthermore, the representation process for +clustering is enhanced through spectral clustering, and the consistency across +multiple views is improved. Experiments conducted on five datasets demonstrate +the effectiveness and superiority of our method in comparison with the +state-of-the-art (SOTA) methods. The code for this method can be accessed at +https://anonymous.4open.science/r/CCEC-E84E/. + +
+
+ comment: There are multiple errors that need to be corrected, including some + formulas and concept descriptions. We will re upload the paper after the + modifications are completed +
+
+
+
+
+ + ♻ ☆ LaserHuman: Language-guided Scene-aware Human Motion Generation in Free + Environment + + +
+ Language-guided scene-aware human motion generation has great significance +for entertainment and robotics. In response to the limitations of existing +datasets, we introduce LaserHuman, a pioneering dataset engineered to +revolutionize Scene-Text-to-Motion research. LaserHuman stands out with its +inclusion of genuine human motions within 3D environments, unbounded free-form +natural language descriptions, a blend of indoor and outdoor scenarios, and +dynamic, ever-changing scenes. Diverse modalities of capture data and rich +annotations present great opportunities for the research of conditional motion +generation, and can also facilitate the development of real-life applications. +Moreover, to generate semantically consistent and physically plausible human +motions, we propose a multi-conditional diffusion model, which is simple but +effective, achieving state-of-the-art performance on existing datasets. + +
+
+
+
+
+ + ♻ ☆ LLM4SGG: Large Language Model for Weakly Supervised Scene Graph + Generation CVPR 2024 + + +
+ Weakly-Supervised Scene Graph Generation (WSSGG) research has recently +emerged as an alternative to the fully-supervised approach that heavily relies +on costly annotations. In this regard, studies on WSSGG have utilized image +captions to obtain unlocalized triplets while primarily focusing on grounding +the unlocalized triplets over image regions. However, they have overlooked the +two issues involved in the triplet formation process from the captions: 1) +Semantic over-simplification issue arises when extracting triplets from +captions, where fine-grained predicates in captions are undesirably converted +into coarse-grained predicates, resulting in a long-tailed predicate +distribution, and 2) Low-density scene graph issue arises when aligning the +triplets in the caption with entity/predicate classes of interest, where many +triplets are discarded and not used in training, leading to insufficient +supervision. To tackle the two issues, we propose a new approach, i.e., Large +Language Model for weakly-supervised SGG (LLM4SGG), where we mitigate the two +issues by leveraging the LLM's in-depth understanding of language and reasoning +ability during the extraction of triplets from captions and alignment of +entity/predicate classes with target data. To further engage the LLM in these +processes, we adopt the idea of Chain-of-Thought and the in-context few-shot +learning strategy. To validate the effectiveness of LLM4SGG, we conduct +extensive experiments on Visual Genome and GQA datasets, showing significant +improvements in both Recall@K and mean Recall@K compared to the +state-of-the-art WSSGG methods. A further appeal is that LLM4SGG is +data-efficient, enabling effective model training with a small amount of +training images. + +
+
+ comment: 8 pages; CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Intrinsic Image Diffusion for Indoor Single-view Material Estimation + + +
+ We present Intrinsic Image Diffusion, a generative model for appearance +decomposition of indoor scenes. Given a single input view, we sample multiple +possible material explanations represented as albedo, roughness, and metallic +maps. Appearance decomposition poses a considerable challenge in computer +vision due to the inherent ambiguity between lighting and material properties +and the lack of real datasets. To address this issue, we advocate for a +probabilistic formulation, where instead of attempting to directly predict the +true material properties, we employ a conditional generative model to sample +from the solution space. Furthermore, we show that utilizing the strong learned +prior of recent diffusion models trained on large-scale real-world images can +be adapted to material estimation and highly improves the generalization to +real images. Our method produces significantly sharper, more consistent, and +more detailed materials, outperforming state-of-the-art methods by $1.5dB$ on +PSNR and by $45\%$ better FID score on albedo prediction. We demonstrate the +effectiveness of our approach through experiments on both synthetic and +real-world datasets. + +
+
+ comment: Project page: https://peter-kocsis.github.io/IntrinsicImageDiffusion/ + Video: https://youtu.be/lz0meJlj5cA +
+
+
+
+
+ + ♻ ☆ Point2RBox: Combine Knowledge from Synthetic Visual Patterns for + End-to-end Oriented Object Detection with Single Point Supervision + + +
+ With the rapidly increasing demand for oriented object detection (OOD),
+recent research on weakly-supervised detectors that learn rotated boxes
+(RBox) from horizontal boxes (HBox) has attracted increasing attention. In
+this paper, we explore a more challenging yet label-efficient setting, namely
+single point-supervised OOD, and present our approach called Point2RBox.
+Specifically, we propose to leverage two principles: 1) Synthetic pattern
+knowledge combination: By sampling around each labeled point on the image, we
+spread the object feature to synthetic visual patterns with known boxes to
+provide the knowledge for box regression. 2) Transform self-supervision: With
+a transformed input image (e.g., scaled/rotated), the output RBoxes are
+trained to follow the same transformation so that the network can perceive
+the relative size/rotation between objects. The detector is further enhanced
+by a few devised techniques to cope with peripheral issues, e.g., anchor/layer
+assignment, since the size of the object is not available in our point
+supervision setting. To the best of our knowledge, Point2RBox is the first
+end-to-end solution for point-supervised OOD. In particular, our method uses a
+lightweight paradigm, yet it achieves competitive performance among
+point-supervised alternatives, 41.05%/27.62%/80.01% on the DOTA/DIOR/HRSC
+datasets.
+
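+ The transform self-supervision principle can be read as a simple consistency
+constraint: boxes predicted on a rotated image should equal the rotated boxes
+predicted on the original image. A minimal PyTorch sketch (rotation only; the
+detector itself and the loss weighting are assumed) follows.
+
+import math
+import torch
+
+def rotate_rboxes(rboxes: torch.Tensor, angle: float, center: torch.Tensor) -> torch.Tensor:
+    """Rotate (cx, cy, w, h, theta) boxes by `angle` radians around `center`."""
+    cos_a, sin_a = math.cos(angle), math.sin(angle)
+    rot = rboxes.new_tensor([[cos_a, -sin_a], [sin_a, cos_a]])
+    xy = (rboxes[:, :2] - center) @ rot.T + center
+    return torch.cat([xy, rboxes[:, 2:4], rboxes[:, 4:5] + angle], dim=1)
+
+def transform_consistency_loss(pred_original, pred_rotated, angle, center):
+    """Self-supervised loss: predictions on the rotated image should match the
+    rotated predictions from the original image (no box labels required)."""
+    expected = rotate_rboxes(pred_original, angle, center)
+    return torch.nn.functional.l1_loss(pred_rotated, expected)
+
+if __name__ == "__main__":
+    center = torch.tensor([256.0, 256.0])
+    boxes = torch.tensor([[100.0, 120.0, 40.0, 20.0, 0.3]])
+    # A perfectly rotation-equivariant detector would incur zero loss.
+    print(transform_consistency_loss(boxes, rotate_rboxes(boxes, 0.5, center), 0.5, center))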
+
+ comment: 10 pages, 3 figures, 5 tables, code: + https://github.com/yuyi1005/point2rbox-mmrotate +
+
+
+
+
+ + ♻ ☆ Driving Animatronic Robot Facial Expression From Speech + + +
+ Animatronic robots aim to enable natural human-robot interaction through +lifelike facial expressions. However, generating realistic, speech-synchronized +robot expressions is challenging due to the complexities of facial biomechanics +and responsive motion synthesis. This paper presents a principled, +skinning-centric approach to drive animatronic robot facial expressions from +speech. The proposed approach employs linear blend skinning (LBS) as the core +representation to guide tightly integrated innovations in embodiment design and +motion synthesis. LBS informs the actuation topology, enables human expression +retargeting, and allows speech-driven facial motion generation. The proposed +approach is capable of generating highly realistic, real-time facial +expressions from speech on an animatronic face, significantly advancing robots' +ability to replicate nuanced human expressions for natural interaction. + +
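+ Linear blend skinning itself is a standard operation: each vertex is moved by
+a weight-blended combination of per-bone rigid transforms. The sketch below is
+a generic NumPy LBS routine for reference, not the paper's implementation.
+
+import numpy as np
+
+def linear_blend_skinning(rest_vertices, weights, bone_transforms):
+    """Deform rest-pose vertices with linear blend skinning (LBS).
+
+    rest_vertices:   (V, 3) rest-pose positions.
+    weights:         (V, B) skinning weights, each row summing to 1.
+    bone_transforms: (B, 4, 4) homogeneous bone transforms.
+    Returns the (V, 3) deformed positions: sum_b w[v, b] * (T_b @ v_rest).
+    """
+    V = rest_vertices.shape[0]
+    homo = np.concatenate([rest_vertices, np.ones((V, 1))], axis=1)   # (V, 4)
+    per_bone = np.einsum("bij,vj->bvi", bone_transforms, homo)        # (B, V, 4)
+    blended = np.einsum("vb,bvi->vi", weights, per_bone)              # (V, 4)
+    return blended[:, :3]
+
+if __name__ == "__main__":
+    # Two vertices, two "bones": identity and a +1 translation along x.
+    verts = np.array([[0.0, 0.0, 0.0], [1.0, 0.0, 0.0]])
+    w = np.array([[1.0, 0.0], [0.5, 0.5]])
+    T0 = np.eye(4)
+    T1 = np.eye(4); T1[0, 3] = 1.0
+    print(linear_blend_skinning(verts, w, np.stack([T0, T1])))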
+
+ comment: Under review. For associated project page, see + https://library87.github.io/animatronic-face-iros24 +
+
+
+
+
+ + ♻ ☆ Mpox-AISM: AI-Mediated Super Monitoring for Mpox and Like-Mpox + + +
+ The key to preventing the spread of mpox (monkeypox) lies in a timely,
+convenient, and accurate diagnosis for earlier-stage infected individuals.
+Unfortunately, the resemblance between common skin diseases and mpox, together
+with the need for professional diagnosis, has inevitably hampered the
+diagnosis of earlier-stage mpox patients and contributed to its widespread
+outbreak in crowded areas. Here, we propose a real-time visualization strategy
+called "Super Monitoring" that uses artificial intelligence and Internet
+technology to perform a low-cost, convenient, timely, and unspecialized
+diagnosis for earlier-stage mpox. Specifically, this AI-mediated "super
+monitoring" (Mpox-AISM) invokes a framework assembled from deep learning
+models, data augmentation, self-supervised learning, and cloud services.
+Verified on publicly available datasets, the precision, recall, specificity,
+and F1-score of Mpox-AISM in diagnosing mpox reach 99.3%, 94.1%, 99.9%, and
+96.6%, respectively. Furthermore, Mpox-AISM's overall accuracy reaches 94.51%
+in diagnosing mpox, six mpox-like skin diseases, and normal skin. We also
+employ gradient-weighted class activation mapping to explain the
+decision-making process of Mpox-AISM, making it easier to understand the
+specific characteristics that may indicate the onset of mpox and improving its
+reliability. With the help of the Internet and communication terminals,
+Mpox-AISM can perform a real-time, low-cost, and convenient diagnosis for
+earlier-stage mpox in various real-world settings, thereby effectively curbing
+the spread of the mpox virus.
+
+
+
+
+
+ + ♻ ☆ Open-Vocabulary Camouflaged Object Segmentation + + +
+ Recently, the emergence of the large-scale vision-language model (VLM), such +as CLIP, has opened the way towards open-world object perception. Many works +have explored the utilization of pre-trained VLM for the challenging +open-vocabulary dense prediction task that requires perceiving diverse objects +with novel classes at inference time. Existing methods construct experiments +based on the public datasets of related tasks, which are not tailored for open +vocabulary and rarely involve imperceptible objects camouflaged in complex +scenes due to data collection bias and annotation costs. To fill in the gaps, +we introduce a new task, open-vocabulary camouflaged object segmentation +(OVCOS), and construct a large-scale complex scene dataset (\textbf{OVCamo}) +containing 11,483 hand-selected images with fine annotations and corresponding +object classes. Further, we build a strong single-stage open-vocabulary +\underline{c}amouflaged \underline{o}bject \underline{s}egmentation +transform\underline{er} baseline \textbf{OVCoser} attached to the +parameter-fixed CLIP with iterative semantic guidance and structure +enhancement. By integrating the guidance of class semantic knowledge and the +supplement of visual structure cues from the edge and depth information, the +proposed method can efficiently capture camouflaged objects. Moreover, this +effective framework also surpasses previous state-of-the-arts of +open-vocabulary semantic image segmentation by a large margin on our OVCamo +dataset. With the proposed dataset and baseline, we hope that this new task +with more practical value can further expand the research on open-vocabulary +dense prediction tasks. The code and data will be available in the future. + +
+
+ comment: Update the style and add details +
+
+
+
+
+ + ♻ ☆ Semantics Meets Temporal Correspondence: Self-supervised Object-centric + Learning in Videos ICCV 2023 + + +
+ Self-supervised methods have shown remarkable progress in learning high-level +semantics and low-level temporal correspondence. Building on these results, we +take one step further and explore the possibility of integrating these two +features to enhance object-centric representations. Our preliminary experiments +indicate that query slot attention can extract different semantic components +from the RGB feature map, while random sampling based slot attention can +exploit temporal correspondence cues between frames to assist instance +identification. Motivated by this, we propose a novel semantic-aware masked +slot attention on top of the fused semantic features and correspondence maps. +It comprises two slot attention stages with a set of shared learnable Gaussian +distributions. In the first stage, we use the mean vectors as slot +initialization to decompose potential semantics and generate semantic +segmentation masks through iterative attention. In the second stage, for each +semantics, we randomly sample slots from the corresponding Gaussian +distribution and perform masked feature aggregation within the semantic area to +exploit temporal correspondence patterns for instance identification. We adopt +semantic- and instance-level temporal consistency as self-supervision to +encourage temporally coherent object-centric representations. Our model +effectively identifies multiple object instances with semantic structure, +reaching promising results on unsupervised video object discovery. Furthermore, +we achieve state-of-the-art performance on dense label propagation tasks, +demonstrating the potential for object-centric analysis. The code is released +at https://github.com/shvdiwnkozbw/SMTC. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ ICP-Flow: LiDAR Scene Flow Estimation with ICP CVPR 2024 + + +
+ Scene flow characterizes the 3D motion between two LiDAR scans captured by an +autonomous vehicle at nearby timesteps. Prevalent methods consider scene flow +as point-wise unconstrained flow vectors that can be learned by either +large-scale training beforehand or time-consuming optimization at inference. +However, these methods do not take into account that objects in autonomous +driving often move rigidly. We incorporate this rigid-motion assumption into +our design, where the goal is to associate objects over scans and then estimate +the locally rigid transformations. We propose ICP-Flow, a learning-free flow +estimator. The core of our design is the conventional Iterative Closest Point +(ICP) algorithm, which aligns the objects over time and outputs the +corresponding rigid transformations. Crucially, to aid ICP, we propose a +histogram-based initialization that discovers the most likely translation, thus +providing a good starting point for ICP. The complete scene flow is then +recovered from the rigid transformations. We outperform state-of-the-art +baselines, including supervised models, on the Waymo dataset and perform +competitively on Argoverse-v2 and nuScenes. Further, we train a feedforward +neural network, supervised by the pseudo labels from our model, and achieve top +performance among all models capable of real-time inference. We validate the +advantage of our model on scene flow estimation with longer temporal gaps, up +to 0.4 seconds where other models fail to deliver meaningful results. + +
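+ The histogram-based initialization can be pictured with a small NumPy sketch
+that votes pairwise point differences into a coarse 3D histogram and returns
+the mode as the starting translation for ICP; the bin size and search range
+below are assumptions, not the paper's settings.
+
+import numpy as np
+
+def histogram_translation_init(src, dst, bin_size=0.5, max_shift=10.0):
+    """Guess the dominant translation between two point sets of one object by
+    voting all pairwise point differences into a coarse 3D histogram; the
+    fullest bin is returned as the initial translation handed to ICP."""
+    diffs = (dst[None, :, :] - src[:, None, :]).reshape(-1, 3)
+    diffs = diffs[np.all(np.abs(diffs) < max_shift, axis=1)]
+    bins = np.floor(diffs / bin_size).astype(np.int64)
+    uniq, counts = np.unique(bins, axis=0, return_counts=True)
+    return (uniq[np.argmax(counts)] + 0.5) * bin_size   # bin centre of the mode
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    src = rng.uniform(-5, 5, size=(200, 3))
+    dst = src + np.array([2.0, -1.0, 0.0])
+    print(histogram_translation_init(src, dst))   # close to (2, -1, 0), within one bin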
+
+ comment: CVPR 2024, camera-ready. Code: https://github.com/yanconglin/ICP-Flow +
+
+
+
+
+ + ♻ ☆ GSVA: Generalized Segmentation via Multimodal Large Language Models CVPR2024 + + +
+ Generalized Referring Expression Segmentation (GRES) extends the scope of +classic RES to refer to multiple objects in one expression or identify the +empty targets absent in the image. GRES poses challenges in modeling the +complex spatial relationships of the instances in the image and identifying +non-existing referents. Multimodal Large Language Models (MLLMs) have recently +shown tremendous progress in these complicated vision-language tasks. +Connecting Large Language Models (LLMs) and vision models, MLLMs are proficient +in understanding contexts with visual inputs. Among them, LISA, as a +representative, adopts a special [SEG] token to prompt a segmentation mask +decoder, e.g., SAM, to enable MLLMs in the RES task. However, existing +solutions to GRES remain unsatisfactory since current segmentation MLLMs cannot +correctly handle the cases where users might reference multiple subjects in a +singular prompt or provide descriptions incongruent with any image target. In +this paper, we propose Generalized Segmentation Vision Assistant (GSVA) to +address this gap. Specifically, GSVA reuses the [SEG] token to prompt the +segmentation model towards supporting multiple mask references simultaneously +and innovatively learns to generate a [REJ] token to reject the null targets +explicitly. Experiments validate GSVA's efficacy in resolving the GRES issue, +marking a notable enhancement and setting a new record on the GRES benchmark +gRefCOCO dataset. GSVA also proves effective across various classic referring +segmentation and comprehension tasks. + +
+
+ comment: Accepted by CVPR2024 (19 pages, 9 figures, 11 tables) +
+
+
+
+
+ + ♻ ☆ RTFS-Net: Recurrent Time-Frequency Modelling for Efficient Audio-Visual + Speech Separation ICLR + + +
+ Audio-visual speech separation methods aim to integrate different modalities +to generate high-quality separated speech, thereby enhancing the performance of +downstream tasks such as speech recognition. Most existing state-of-the-art +(SOTA) models operate in the time domain. However, their overly simplistic +approach to modeling acoustic features often necessitates larger and more +computationally intensive models in order to achieve SOTA performance. In this +paper, we present a novel time-frequency domain audio-visual speech separation +method: Recurrent Time-Frequency Separation Network (RTFS-Net), which applies +its algorithms on the complex time-frequency bins yielded by the Short-Time +Fourier Transform. We model and capture the time and frequency dimensions of +the audio independently using a multi-layered RNN along each dimension. +Furthermore, we introduce a unique attention-based fusion technique for the +efficient integration of audio and visual information, and a new mask +separation approach that takes advantage of the intrinsic spectral nature of +the acoustic features for a clearer separation. RTFS-Net outperforms the prior +SOTA method in both inference speed and separation quality while reducing the +number of parameters by 90% and MACs by 83%. This is the first time-frequency +domain audio-visual speech separation method to outperform all contemporary +time-domain counterparts. + +
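+ The idea of modelling the time and frequency axes independently over STFT
+bins can be sketched with a toy dual-path block; the use of LSTMs, the layer
+sizes, and the STFT settings below are assumptions for illustration, not the
+RTFS-Net architecture.
+
+import torch
+import torch.nn as nn
+
+class DualPathTF(nn.Module):
+    """Toy dual-path block: one RNN scans the time axis independently for each
+    frequency bin, another scans the frequency axis independently per frame."""
+    def __init__(self, channels=2, hidden=16):
+        super().__init__()
+        self.time_rnn = nn.LSTM(channels, hidden, batch_first=True)
+        self.freq_rnn = nn.LSTM(hidden, hidden, batch_first=True)
+
+    def forward(self, spec):                 # spec: (freq, time, channels)
+        x, _ = self.time_rnn(spec)           # frequency bins act as the batch
+        x = x.transpose(0, 1)                # (time, freq, hidden)
+        x, _ = self.freq_rnn(x)              # frames act as the batch
+        return x.transpose(0, 1)             # (freq, time, hidden)
+
+if __name__ == "__main__":
+    wav = torch.randn(16000)                 # one second of dummy audio
+    stft = torch.stft(wav, n_fft=512, hop_length=128,
+                      window=torch.hann_window(512), return_complex=True)
+    spec = torch.view_as_real(stft)          # complex bins -> (freq, time, 2)
+    print(DualPathTF()(spec).shape)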
+
+ comment: Accepted by The Twelfth International Conference on Learning + Representations (ICLR) 2024, see https://openreview.net/forum?id=PEuDO2EiDr +
+
+
+
+
+ + ♻ ☆ Unveiling Parts Beyond Objects:Towards Finer-Granularity Referring + Expression Segmentation CVPR 2024 + + +
+ Referring expression segmentation (RES) aims at segmenting the foreground
+masks of the entities that match the descriptive natural language expression.
+Previous datasets and methods for the classic RES task heavily rely on the
+prior assumption that one expression must refer to object-level targets. In
+this paper, we take a step further to a finer-grained part-level RES task. To
+promote the object-level RES task towards finer-grained vision-language
+understanding, we put forward a new multi-granularity referring expression
+segmentation (MRES) task and construct an evaluation benchmark called RefCOCOm
+through manual annotation. By employing our automatic model-assisted data
+engine, we build the largest visual grounding dataset, namely MRES-32M, which
+comprises over 32.2M high-quality masks and captions on the provided 1M
+images. Besides, a simple yet strong model named UniRES is designed to
+accomplish the unified object-level and part-level grounding task. Extensive
+experiments on our RefCOCOm for MRES and three datasets (i.e., RefCOCO(+/g))
+for the classic RES task demonstrate the superiority of our method over
+previous state-of-the-art methods. To foster future research into fine-grained
+visual grounding, our benchmark RefCOCOm, the MRES-32M dataset and model
+UniRES will be publicly available at https://github.com/Rubics-Xuan/MRES.
+
+
+ comment: This work is accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ FMM-Attack: A Flow-based Multi-modal Adversarial Attack on Video-based + LLMs + + +
+ Despite the remarkable performance of video-based large language models +(LLMs), their adversarial threat remains unexplored. To fill this gap, we +propose the first adversarial attack tailored for video-based LLMs by crafting +flow-based multi-modal adversarial perturbations on a small fraction of frames +within a video, dubbed FMM-Attack. Extensive experiments show that our attack +can effectively induce video-based LLMs to generate incorrect answers when +videos are added with imperceptible adversarial perturbations. Intriguingly, +our FMM-Attack can also induce garbling in the model output, prompting +video-based LLMs to hallucinate. Overall, our observations inspire a further +understanding of multi-modal robustness and safety-related feature alignment +across different modalities, which is of great importance for various large +multi-modal models. Our code is available at +https://github.com/THU-Kingmin/FMM-Attack. + +
+
+
+
+
+ + ♻ ☆ CBNet: A Plug-and-Play Network for Segmentation-Based Scene Text + Detection + + +
+ Recently, segmentation-based methods are quite popular in scene text +detection, which mainly contain two steps: text kernel segmentation and +expansion. However, the segmentation process only considers each pixel +independently, and the expansion process is difficult to achieve a favorable +accuracy-speed trade-off. In this paper, we propose a Context-aware and +Boundary-guided Network (CBN) to tackle these problems. In CBN, a basic text +detector is firstly used to predict initial segmentation results. Then, we +propose a context-aware module to enhance text kernel feature representations, +which considers both global and local contexts. Finally, we introduce a +boundary-guided module to expand enhanced text kernels adaptively with only the +pixels on the contours, which not only obtains accurate text boundaries but +also keeps high speed, especially on high-resolution output maps. In +particular, with a lightweight backbone, the basic detector equipped with our +proposed CBN achieves state-of-the-art results on several popular benchmarks, +and our proposed CBN can be plugged into several segmentation-based methods. +Code is available at https://github.com/XiiZhao/cbn.pytorch. + +
+
+ comment: Accepted by IJCV 2024. Code is available at this https URL: + https://github.com/XiiZhao/cbn.pytorch +
+
+
+
+
+ + ♻ ☆ Conditional Tuning Network for Few-Shot Adaptation of Segmentation + Anything Model + + +
+ The recent Segment Anything Model (SAM) has demonstrated remarkable zero-shot
+capability and flexible geometric prompting in general image segmentation.
+However, SAM often struggles when handling various unconventional images, such
+as aerial, medical, and non-RGB images. This paper presents CAT-SAM, a
+ConditionAl Tuning network that adapts SAM toward various unconventional
+target tasks with just a few target samples. CAT-SAM freezes the entire SAM
+and adapts its mask decoder and image encoder simultaneously with a small
+number of learnable parameters. The core design is a prompt bridge structure
+that enables decoder-conditioned joint tuning of the heavyweight image encoder
+and the lightweight mask decoder. The bridging maps the prompt token of the
+mask decoder to the image encoder, fostering synergic adaptation of the
+encoder and the decoder with mutual benefits. We develop two representative
+tuning strategies for the image encoder, which lead to two CAT-SAM variants:
+one injecting learnable prompt tokens in the input space and the other
+inserting lightweight adapter networks. Extensive experiments over 11
+unconventional tasks show that both CAT-SAM variants consistently achieve
+superior target segmentation performance, even under the very challenging
+one-shot adaptation setup. Project page:
+https://xiaoaoran.github.io/projects/CAT-SAM
+
+
+ comment: Project page: https://xiaoaoran.github.io/projects/CAT-SAM +
+
+
+
+
+ + ♻ ☆ ShaDocFormer: A Shadow-Attentive Threshold Detector With Cascaded Fusion + Refiner for Document Shadow Removal IJCNN 2024 + + +
+ Document shadow is a common issue that arises when capturing documents using +mobile devices, which significantly impacts readability. Current methods +encounter various challenges, including inaccurate detection of shadow masks +and estimation of illumination. In this paper, we propose ShaDocFormer, a +Transformer-based architecture that integrates traditional methodologies and +deep learning techniques to tackle the problem of document shadow removal. The +ShaDocFormer architecture comprises two components: the Shadow-attentive +Threshold Detector (STD) and the Cascaded Fusion Refiner (CFR). The STD module +employs a traditional thresholding technique and leverages the attention +mechanism of the Transformer to gather global information, thereby enabling +precise detection of shadow masks. The cascaded and aggregative structure of +the CFR module facilitates a coarse-to-fine restoration process for the entire +image. As a result, ShaDocFormer excels in accurately detecting and capturing +variations in both shadow and illumination, thereby enabling effective removal +of shadows. Extensive experiments demonstrate that ShaDocFormer outperforms +current state-of-the-art methods in both qualitative and quantitative +measurements. + +
+
+ comment: Accepted by IJCNN 2024 +
+
+
+
+
+ + ♻ ☆ MinD-3D: Reconstruct High-quality 3D objects in Human Brain + + +
+ In this paper, we introduce Recon3DMind, an innovative task aimed at +reconstructing 3D visuals from Functional Magnetic Resonance Imaging (fMRI) +signals, marking a significant advancement in the fields of cognitive +neuroscience and computer vision. To support this pioneering task, we present +the fMRI-Shape dataset, which includes data from 14 participants and features +360-degree videos of 3D objects to enable comprehensive fMRI signal capture +across various settings, thereby laying a foundation for future research. +Furthermore, we propose MinD-3D, a novel and effective three-stage framework +specifically designed to decode the brain's 3D visual information from fMRI +signals, demonstrating the feasibility of this challenging task. The framework +begins by extracting and aggregating features from fMRI frames through a +neuro-fusion encoder, subsequently employs a feature bridge diffusion model to +generate visual features, and ultimately recovers the 3D object via a +generative transformer decoder. We assess the performance of MinD-3D using a +suite of semantic and structural metrics and analyze the correlation between +the features extracted by our model and the visual regions of interest (ROIs) +in fMRI signals. Our findings indicate that MinD-3D not only reconstructs 3D +objects with high semantic relevance and spatial similarity but also +significantly enhances our understanding of the human brain's capabilities in +processing 3D visual information. Project page at: +https://jianxgao.github.io/MinD-3D. + +
+
+ comment: 26 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Neural Markov Random Field for Stereo Matching CVPR 2024 + + +
+ Stereo matching is a core task for many computer vision and robotics
+applications. Despite their dominance in traditional stereo methods, the
+hand-crafted Markov Random Field (MRF) models lack sufficient modeling
+accuracy compared to end-to-end deep models. While deep learning
+representations have greatly improved the unary terms of the MRF models, the
+overall accuracy is still severely limited by the hand-crafted pairwise terms
+and message passing. To address these issues, we propose a neural MRF model,
+where both potential functions and message passing are designed using
+data-driven neural networks. Our fully data-driven model is built on the
+foundation of variational inference theory, to prevent convergence issues and
+retain stereo MRF's graph inductive bias. To make the inference tractable and
+scale well to high-resolution images, we also propose a Disparity Proposal
+Network (DPN) to adaptively prune the search space of disparity. The proposed
+approach ranks $1^{st}$ on both KITTI 2012 and 2015 leaderboards among all
+published methods while running faster than 100 ms. This approach
+significantly outperforms prior global methods, e.g., lowering the D1 metric
+by more than 50% on KITTI 2015. In addition, our method exhibits strong
+cross-domain generalization and can recover sharp edges. The code is available
+at https://github.com/aeolusguan/NMRF.
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Active Prompt Learning in Vision Language Models CVPR 2024 + + +
+ Pre-trained Vision Language Models (VLMs) have demonstrated notable progress
+in various zero-shot tasks, such as classification and retrieval. Despite this
+performance, adaptation remains essential because improving performance on new
+tasks requires task-specific knowledge. While labels are needed for the
+adaptation, acquiring them is typically expensive. To overcome this challenge,
+active learning, a method of achieving high performance by obtaining labels
+for a small number of samples from experts, has been studied. Active learning
+primarily focuses on selecting unlabeled samples for labeling and leveraging
+them to train models. In this study, we pose the question, "how can the
+pre-trained VLMs be adapted under the active learning framework?" In response
+to this inquiry, we observe that (1) simply applying a conventional active
+learning framework to pre-trained VLMs may even degrade performance compared
+to random selection because of the class imbalance in labeling candidates, and
+(2) the knowledge of VLMs can provide hints for achieving the balance before
+labeling. Based on these observations, we devise a novel active learning
+framework for VLMs, denoted as PCB. To assess the effectiveness of our
+approach, we conduct experiments on seven different real-world datasets, and
+the results demonstrate that PCB surpasses conventional active learning and
+random sampling methods. Code will be available at
+https://github.com/kaist-dmlab/pcb.
+
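+ A minimal sketch of the class-balancing idea: treat the VLM's zero-shot
+predictions as pseudo-labels and fill the annotation budget round-robin over
+the predicted classes, preferring uncertain samples within each class. This is
+a generic heuristic in the spirit of the description above, not the exact PCB
+algorithm.
+
+import numpy as np
+
+def class_balanced_selection(probs, budget):
+    """Select `budget` unlabeled samples so that predicted classes stay
+    balanced; within a class, pick the highest-entropy (least confident)
+    predictions first.  probs: (N, C) zero-shot softmax scores."""
+    pseudo = probs.argmax(axis=1)
+    entropy = -(probs * np.log(probs + 1e-12)).sum(axis=1)
+    n_classes = probs.shape[1]
+    pools = [np.where(pseudo == c)[0] for c in range(n_classes)]
+    pools = [idx[np.argsort(-entropy[idx])] for idx in pools]  # uncertain first
+    selected, cursor = [], [0] * n_classes
+    while len(selected) < budget:
+        progressed = False
+        for c in range(n_classes):                             # round-robin over classes
+            if len(selected) >= budget:
+                break
+            if cursor[c] < len(pools[c]):
+                selected.append(int(pools[c][cursor[c]]))
+                cursor[c] += 1
+                progressed = True
+        if not progressed:                                     # every pool exhausted
+            break
+    return np.array(selected)
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    logits = rng.normal(size=(100, 5))
+    probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
+    print(class_balanced_selection(probs, budget=10))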
+
+ comment: accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ LMM-Assisted Breast Cancer Treatment Target Segmentation with + Consistency Embedding + + +
+ Recent advancements in Artificial Intelligence (AI) have profoundly
+influenced medical fields by providing tools to reduce clinical workloads.
+However, most AI models are constrained to execute unimodal tasks, in stark
+contrast to the comprehensive approaches utilized by medical professionals. To
+address this, here we present RO-LMM, a multi-purpose large multimodal model
+(LMM) tailored for the field of radiation oncology. This model covers a series
+of tasks within the clinical workflow, and is adept at clinical report
+summarization, radiation treatment plan suggestion, and plan-guided target
+volume segmentation. In particular, to perform consecutive clinical tasks, we
+further present a novel Consistency Embedding Fine-Tuning (CEFTune) technique,
+which boosts the LMM's robustness to noisy inputs while preserving its
+capability to handle clean inputs, and extend this concept into an LMM-driven
+segmentation framework, Consistency Embedding Segmentation (CESEG).
+Experimental results on multi-centre cohorts demonstrate RO-LMM's promising
+performance on multiple clinical tasks, together with its generalization
+capabilities.
+
+
+ comment: 30 pages, 16 table, 5 figures +
+
+
+
+
+ + ♻ ☆ NocPlace: Nocturnal Visual Place Recognition via Generative and + Inherited Knowledge Transfer + + +
+ Visual Place Recognition (VPR) is crucial in computer vision, aiming to +retrieve database images similar to a query image from an extensive collection +of known images. However, like many vision tasks, VPR always degrades at night +due to the scarcity of nighttime images. Moreover, VPR needs to address the +cross-domain problem of night-to-day rather than just the issue of a single +nighttime domain. In response to these issues, we present NocPlace, which +leverages generative and inherited knowledge transfer to embed resilience +against dazzling lights and extreme darkness in the global descriptor. First, +we establish a day-night urban scene dataset called NightCities, capturing +diverse lighting variations and dark scenarios across 60 cities globally. Then, +an image generation network is trained on this dataset and processes a +large-scale VPR dataset, obtaining its nighttime version. Finally, VPR models +are fine-tuned using descriptors inherited from themselves and night-style +images, which builds explicit cross-domain contrastive relationships. +Comprehensive experiments on various datasets demonstrate our contributions and +the superiority of NocPlace. Without adding any real-time computing resources, +NocPlace improves the performance of Eigenplaces by 7.6% on Tokyo 24/7 Night +and 16.8% on SVOX Night. + +
+
+ comment: 28 pages,9 figures +
+
+
+
+
+ + ♻ ☆ ED-NeRF: Efficient Text-Guided Editing of 3D Scene with Latent Space + NeRF ICLR 2024 + + +
+ Recently, there has been a significant advancement in text-to-image diffusion +models, leading to groundbreaking performance in 2D image generation. These +advancements have been extended to 3D models, enabling the generation of novel +3D objects from textual descriptions. This has evolved into NeRF editing +methods, which allow the manipulation of existing 3D objects through textual +conditioning. However, existing NeRF editing techniques have faced limitations +in their performance due to slow training speeds and the use of loss functions +that do not adequately consider editing. To address this, here we present a +novel 3D NeRF editing approach dubbed ED-NeRF by successfully embedding +real-world scenes into the latent space of the latent diffusion model (LDM) +through a unique refinement layer. This approach enables us to obtain a NeRF +backbone that is not only faster but also more amenable to editing compared to +traditional image space NeRF editing. Furthermore, we propose an improved loss +function tailored for editing by migrating the delta denoising score (DDS) +distillation loss, originally used in 2D image editing to the three-dimensional +domain. This novel loss function surpasses the well-known score distillation +sampling (SDS) loss in terms of suitability for editing purposes. Our +experimental results demonstrate that ED-NeRF achieves faster editing speed +while producing improved output quality compared to state-of-the-art 3D editing +models. + +
+
+ comment: ICLR 2024; Project Page: https://jhq1234.github.io/ed-nerf.github.io/ +
+
+
+
+
+ + ♻ ☆ Less is More: Data Value Estimation for Visual Instruction Tuning + + +
+ Visual instruction tuning is the key to building multimodal large language
+models (MLLMs), as it greatly improves the reasoning capabilities of large
+language models (LLMs) in vision scenarios. However, existing MLLMs mostly
+rely on a mixture of multiple highly diverse visual instruction datasets for
+training (even more than a million instructions), which may introduce data
+redundancy. To investigate this issue, we conduct a series of empirical
+studies, which reveal significant redundancy within the visual instruction
+datasets and show that greatly reducing the size of several instruction
+datasets does not even affect performance. Based on these findings, we propose
+a new data selection approach, TIVE, to eliminate redundancy within visual
+instruction data. TIVE first estimates the task-level and instance-level value
+of the visual instructions based on computed gradients. Then, according to the
+estimated values, TIVE determines the task proportions within the visual
+instructions and selects representative instances to compose a smaller visual
+instruction subset for training. Experiments on LLaVA-1.5 show that our
+approach, using only about 7.5% of the data, achieves performance comparable
+to the full-data fine-tuned model across seven benchmarks, even surpassing it
+on four of them. Our code and data will be publicly released.
+
+
+
+
+
+ + ♻ ☆ A Fourier Transform Framework for Domain Adaptation + + +
+ By using unsupervised domain adaptation (UDA), knowledge can be transferred
+from a label-rich source domain to a target domain that contains relevant
+information but lacks labels. Many existing UDA algorithms suffer from
+directly using raw images as input, resulting in models that overly focus on
+redundant information and exhibit poor generalization capability. To address
+this issue, we attempt to improve the performance of unsupervised domain
+adaptation by employing the Fourier method (FTF). Specifically, FTF is
+inspired by the amplitude of Fourier spectra, which primarily preserves
+low-level statistical information. In FTF, we effectively incorporate
+low-level information from the target domain into the source domain by fusing
+the amplitudes of both domains in the Fourier domain. Additionally, we observe
+that extracting features from batches of images can eliminate redundant
+information while retaining class-specific features relevant to the task.
+Building upon this observation, we apply the Fourier Transform at the data
+stream level for the first time. To further align multiple sources of data, we
+introduce the concept of correlation alignment. To evaluate the effectiveness
+of our FTF method, we conducted evaluations on four benchmark datasets for
+domain adaptation, including Office-31, Office-Home, ImageCLEF-DA, and
+Office-Caltech. Our results demonstrate superior performance.
+
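+ The amplitude-fusion step can be illustrated with a short NumPy sketch that
+keeps the source phase and blends in the target amplitude; the blending
+coefficient and the absence of a low-frequency mask are assumptions rather
+than the paper's exact recipe.
+
+import numpy as np
+
+def fuse_fourier_amplitude(src_img, tgt_img, alpha=0.3):
+    """Blend the Fourier amplitude of a target-domain image into a source
+    image while keeping the source phase, injecting the target's low-level
+    statistics without altering semantic content.  Images: (H, W) or (H, W, C)."""
+    src_fft = np.fft.fft2(src_img, axes=(0, 1))
+    tgt_fft = np.fft.fft2(tgt_img, axes=(0, 1))
+    fused_amp = (1.0 - alpha) * np.abs(src_fft) + alpha * np.abs(tgt_fft)
+    fused = fused_amp * np.exp(1j * np.angle(src_fft))   # keep the source phase
+    return np.real(np.fft.ifft2(fused, axes=(0, 1)))
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    src = rng.uniform(size=(64, 64, 3))
+    tgt = rng.uniform(size=(64, 64, 3))
+    print(fuse_fourier_amplitude(src, tgt).shape)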
+
+ comment: The paper contains significant errors and the experimental + methodology is not rigorous. The experimental section and methodology need to + be rewritten +
+
+
+
+
+ + ♻ ☆ EfficientDreamer: High-Fidelity and Robust 3D Creation via + Orthogonal-view Diffusion Prior + + +
+ While image diffusion models have made significant progress in text-driven 3D +content creation, they often fail to accurately capture the intended meaning of +text prompts, especially for view information. This limitation leads to the +Janus problem, where multi-faced 3D models are generated under the guidance of +such diffusion models. In this paper, we propose a robust high-quality 3D +content generation pipeline by exploiting orthogonal-view image guidance. +First, we introduce a novel 2D diffusion model that generates an image +consisting of four orthogonal-view sub-images based on the given text prompt. +Then, the 3D content is created using this diffusion model. Notably, the +generated orthogonal-view image provides strong geometric structure priors and +thus improves 3D consistency. As a result, it effectively resolves the Janus +problem and significantly enhances the quality of 3D content creation. +Additionally, we present a 3D synthesis fusion network that can further improve +the details of the generated 3D contents. Both quantitative and qualitative +evaluations demonstrate that our method surpasses previous text-to-3D +techniques. Project page: https://efficientdreamer.github.io. + +
+
+
+
+
+ + ♻ ☆ SwIPE: Efficient and Robust Medical Image Segmentation with Implicit + Patch Embeddings MICCAI'23 + + +
+ Modern medical image segmentation methods primarily use discrete +representations in the form of rasterized masks to learn features and generate +predictions. Although effective, this paradigm is spatially inflexible, scales +poorly to higher-resolution images, and lacks direct understanding of object +shapes. To address these limitations, some recent works utilized implicit +neural representations (INRs) to learn continuous representations for +segmentation. However, these methods often directly adopted components designed +for 3D shape reconstruction. More importantly, these formulations were also +constrained to either point-based or global contexts, lacking contextual +understanding or local fine-grained details, respectively--both critical for +accurate segmentation. To remedy this, we propose a novel approach, SwIPE +(Segmentation with Implicit Patch Embeddings), that leverages the advantages of +INRs and predicts shapes at the patch level--rather than at the point level or +image level--to enable both accurate local boundary delineation and global +shape coherence. Extensive evaluations on two tasks (2D polyp segmentation and +3D abdominal organ segmentation) show that SwIPE significantly improves over +recent implicit approaches and outperforms state-of-the-art discrete methods +with over 10x fewer parameters. Our method also demonstrates superior data +efficiency and improved robustness to data shifts across image resolutions and +datasets. Code is available on Github +(https://github.com/charzharr/miccai23-swipe-implicit-segmentation). + +
+
+ comment: Accepted to the 2023 International Conference on Medical Image + Computing and Computer Assisted Intervention (MICCAI'23) +
+
+
+
+
+ + ♻ ☆ M3FAS: An Accurate and Robust MultiModal Mobile Face Anti-Spoofing + System + + +
+ Face presentation attacks (FPA), also known as face spoofing, have brought
+increasing concerns to the public through various malicious applications, such
+as financial fraud and privacy leakage. Therefore, safeguarding face
+recognition systems against FPA is of utmost importance. Although existing
+learning-based face anti-spoofing (FAS) models can achieve outstanding
+detection performance, they lack generalization capability and suffer
+significant performance drops in unforeseen environments. Many methodologies
+seek to use auxiliary modality data (e.g., depth and infrared maps) during
+presentation attack detection (PAD) to address this limitation. However, these
+methods can be limited since (1) they require specific sensors such as depth
+and infrared cameras for data capture, which are rarely available on commodity
+mobile devices, and (2) they cannot work properly in practical scenarios when
+either modality is missing or of poor quality. In this paper, we devise an
+accurate and robust MultiModal Mobile Face Anti-Spoofing system named M3FAS to
+overcome the issues above. The primary innovation of this work lies in the
+following aspects: (1) to achieve robust PAD, our system combines visual and
+auditory modalities using three commonly available sensors: camera, speaker,
+and microphone; (2) we design a novel two-branch neural network with three
+hierarchical feature aggregation modules to perform cross-modal feature
+fusion; and (3) we propose a multi-head training strategy, allowing the model
+to output predictions from the vision, acoustic, and fusion heads, resulting
+in more flexible PAD. Extensive experiments have demonstrated the accuracy,
+robustness, and flexibility of M3FAS under various challenging experimental
+settings. The source code and dataset are available at:
+https://github.com/ChenqiKONG/M3FAS/
+
+
+
+
+
+ + ♻ ☆ See, Imagine, Plan: Discovering and Hallucinating Tasks from a Single + Image + + +
+ Humans can not only recognize and understand the world in its current state +but also envision future scenarios that extend beyond immediate perception. To +resemble this profound human capacity, we introduce zero-shot task +hallucination -- given a single RGB image of any scene comprising unknown +environments and objects, our model can identify potential tasks and imagine +their execution in a vivid narrative, realized as a video. We develop a modular +pipeline that progressively enhances scene decomposition, comprehension, and +reconstruction, incorporating VLM for dynamic interaction and 3D motion +planning for object trajectories. Our model can discover diverse tasks, with +the generated task videos demonstrating realistic and compelling visual +outcomes that are understandable by both machines and humans. Project Page: +https://dannymcy.github.io/zeroshot_task_hallucination/ + +
+
+ comment: Project Page: https://dannymcy.github.io/zeroshot_task_hallucination/ +
+
+
+
+
+ + ♻ ☆ Federated Learning for Large-Scale Scene Modeling with Neural Radiance + Fields + + +
+ We envision a system to continuously build and maintain a map based on +earth-scale neural radiance fields (NeRF) using data collected from vehicles +and drones in a lifelong learning manner. However, existing large-scale +modeling by NeRF has problems in terms of scalability and maintainability when +modeling earth-scale environments. Therefore, to address these problems, we +propose a federated learning pipeline for large-scale modeling with NeRF. We +tailor the model aggregation pipeline in federated learning for NeRF, thereby +allowing local updates of NeRF. In the aggregation step, the accuracy of the +clients' global pose is critical. Thus, we also propose global pose alignment +to align the noisy global pose of clients before the aggregation step. In +experiments, we show the effectiveness of the proposed pose alignment and the +federated learning pipeline on the large-scale scene dataset, Mill19. + +
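+ The aggregation step can be pictured as a generic FedAvg-style weighted
+average of client parameters; the NeRF-specific tailoring and the global pose
+alignment described above are not shown, and uniform client weights are an
+assumption.
+
+import torch
+
+def federated_average(client_state_dicts, client_weights=None):
+    """Weighted average of client model parameters (FedAvg-style aggregation).
+    All state_dicts must share the same keys and tensor shapes."""
+    n = len(client_state_dicts)
+    if client_weights is None:
+        client_weights = [1.0 / n] * n
+    return {key: sum(w * sd[key].float()
+                     for w, sd in zip(client_weights, client_state_dicts))
+            for key in client_state_dicts[0]}
+
+if __name__ == "__main__":
+    make = lambda: torch.nn.Linear(4, 4)        # stand-in for a NeRF MLP
+    clients = [make().state_dict() for _ in range(3)]
+    server = make()
+    server.load_state_dict(federated_average(clients))
+    print(server.weight.mean())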
+
+ comment: Our subsequent work is available at arXiv:2403.11460 +
+
+
+
+
+ + ♻ ☆ Chat-UniVi: Unified Visual Representation Empowers Large Language Models + with Image and Video Understanding CVPR 2024 + + +
+ Large language models have demonstrated impressive universal capabilities +across a wide range of open-ended tasks and have extended their utility to +encompass multimodal conversations. However, existing methods encounter +challenges in effectively handling both image and video understanding, +particularly with limited visual tokens. In this work, we introduce Chat-UniVi, +a Unified Vision-language model capable of comprehending and engaging in +conversations involving images and videos through a unified visual +representation. Specifically, we employ a set of dynamic visual tokens to +uniformly represent images and videos. This representation framework empowers +the model to efficiently utilize a limited number of visual tokens to +simultaneously capture the spatial details necessary for images and the +comprehensive temporal relationship required for videos. Moreover, we leverage +a multi-scale representation, enabling the model to perceive both high-level +semantic concepts and low-level visual details. Notably, Chat-UniVi is trained +on a mixed dataset containing both images and videos, allowing direct +application to tasks involving both mediums without requiring any +modifications. Extensive experimental results demonstrate that Chat-UniVi +consistently outperforms even existing methods exclusively designed for either +images or videos. Code is available at +https://github.com/PKU-YuanGroup/Chat-UniVi. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Deep Learning for Inertial Positioning: A Survey + + +
+ Inertial sensors are widely utilized in smartphones, drones, robots, and IoT +devices, playing a crucial role in enabling ubiquitous and reliable +localization. Inertial sensor-based positioning is essential in various +applications, including personal navigation, location-based security, and +human-device interaction. However, low-cost MEMS inertial sensors' measurements +are inevitably corrupted by various error sources, leading to unbounded drifts +when integrated doubly in traditional inertial navigation algorithms, +subjecting inertial positioning to the problem of error drifts. In recent +years, with the rapid increase in sensor data and computational power, deep +learning techniques have been developed, sparking significant research into +addressing the problem of inertial positioning. Relevant literature in this +field spans across mobile computing, robotics, and machine learning. In this +article, we provide a comprehensive review of deep learning-based inertial +positioning and its applications in tracking pedestrians, drones, vehicles, and +robots. We connect efforts from different fields and discuss how deep learning +can be applied to address issues such as sensor calibration, positioning error +drift reduction, and multi-sensor fusion. This article aims to attract readers +from various backgrounds, including researchers and practitioners interested in +the potential of deep learning-based techniques to solve inertial positioning +problems. Our review demonstrates the exciting possibilities that deep learning +brings to the table and provides a roadmap for future research in this field. + +
+
+ comment: Accepted by IEEE Transactions on Intelligent Transportation Systems +
+
+
+
+
+ + ♻ ☆ Frequency-Aware Transformer for Learned Image Compression ICLR2024 + + +
+ Learned image compression (LIC) has gained traction as an effective solution
+for image storage and transmission in recent years. However, existing LIC
+methods have redundant latent representations due to limitations in capturing
+anisotropic frequency components and preserving directional details. To
+overcome these challenges, we propose a novel frequency-aware transformer
+(FAT) block that for the first time achieves multiscale directional analysis
+for LIC. The FAT block comprises frequency-decomposition window attention
+(FDWA) modules to capture multiscale and directional frequency components of
+natural images. Additionally, we introduce a frequency-modulation feed-forward
+network (FMFFN) to adaptively modulate different frequency components,
+improving rate-distortion performance. Furthermore, we present a
+transformer-based channel-wise autoregressive (T-CA) model that effectively
+exploits channel dependencies. Experiments show that our method achieves
+state-of-the-art rate-distortion performance compared to existing LIC methods,
+and evidently outperforms the latest standardized codec VTM-12.1 by 14.5%,
+15.1%, and 13.0% in BD-rate on the Kodak, Tecnick, and CLIC datasets.
+
+
+ comment: ICLR2024 poster +
+
+
+
+
+ + ♻ ☆ TiC-CLIP: Continual Training of CLIP Models ICLR 2024 + + +
+ Keeping large foundation models up to date on latest data is inherently +expensive. To avoid the prohibitive costs of constantly retraining, it is +imperative to continually train these models. This problem is exacerbated by +the lack of any large scale continual learning benchmarks or baselines. We +introduce the first set of web-scale Time-Continual (TiC) benchmarks for +training vision-language models: TiC-DataComp, TiC-YFCC, and TiC-Redcaps. +TiC-DataComp, our largest dataset, contains over 12.7B timestamped image-text +pairs spanning 9 years (2014-2022). We first use our benchmarks to curate +various dynamic evaluations to measure temporal robustness of existing models. +We show OpenAI's CLIP (trained on data up to 2020) loses $\approx 8\%$ +zero-shot accuracy on our curated retrieval task from 2021-2022 compared with +more recently trained models in OpenCLIP repository. We then study how to +efficiently train models on time-continuous data. We demonstrate that a simple +rehearsal-based approach that continues training from the last checkpoint and +replays old data reduces compute by $2.5\times$ when compared to the standard +practice of retraining from scratch. Code is available at +https://github.com/apple/ml-tic-clip. + +
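+ The rehearsal baseline amounts to warm-starting from the last checkpoint and
+training on a mixture of new data and replayed old data. The toy classifier,
+batch size, and replay ratio below are stand-ins for illustration, not the
+paper's CLIP training setup.
+
+import torch
+from torch.utils.data import ConcatDataset, DataLoader, Subset, TensorDataset
+
+def continual_update(model, old_data, new_data, replay_ratio=0.5, steps=100, lr=1e-4):
+    """Continue training the latest checkpoint on new data plus a replayed
+    subset of old data, instead of retraining from scratch."""
+    n_replay = int(len(new_data) * replay_ratio)
+    replay_idx = torch.randperm(len(old_data))[:n_replay].tolist()
+    loader = DataLoader(ConcatDataset([new_data, Subset(old_data, replay_idx)]),
+                        batch_size=32, shuffle=True)
+    opt = torch.optim.AdamW(model.parameters(), lr=lr)
+    loss_fn = torch.nn.CrossEntropyLoss()
+    it = iter(loader)
+    for _ in range(steps):
+        try:
+            x, y = next(it)
+        except StopIteration:       # restart the mixed loader when exhausted
+            it = iter(loader)
+            x, y = next(it)
+        opt.zero_grad()
+        loss_fn(model(x), y).backward()
+        opt.step()
+    return model
+
+if __name__ == "__main__":
+    old = TensorDataset(torch.randn(256, 16), torch.randint(0, 4, (256,)))
+    new = TensorDataset(torch.randn(256, 16), torch.randint(0, 4, (256,)))
+    continual_update(torch.nn.Linear(16, 4), old, new, steps=10)
+    print("continual update finished")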
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Vision-Language Models can Identify Distracted Driver Behavior from + Naturalistic Videos + + +
+ Recognizing the activities causing distraction in real-world driving +scenarios is critical for ensuring the safety and reliability of both drivers +and pedestrians on the roadways. Conventional computer vision techniques are +typically data-intensive and require a large volume of annotated training data +to detect and classify various distracted driving behaviors, thereby limiting +their efficiency and scalability. We aim to develop a generalized framework +that showcases robust performance with access to limited or no annotated +training data. Recently, vision-language models have offered large-scale +visual-textual pretraining that can be adapted to task-specific learning like +distracted driving activity recognition. Vision-language pretraining models, +such as CLIP, have shown significant promise in learning natural +language-guided visual representations. This paper proposes a CLIP-based driver +activity recognition approach that identifies driver distraction from +naturalistic driving images and videos. CLIP's vision embedding offers +zero-shot transfer and task-based finetuning, which can classify distracted +activities from driving video data. Our results show that this framework offers +state-of-the-art performance on zero-shot transfer and video-based CLIP for +predicting the driver's state on two public datasets. We propose both +frame-based and video-based frameworks developed on top of the CLIP's visual +representation for distracted driving detection and classification tasks and +report the results. + +
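+ Zero-shot transfer of the kind described can be reproduced with an
+off-the-shelf CLIP checkpoint; the prompt wording and the
+"openai/clip-vit-base-patch32" checkpoint below are assumptions, and the
+paper's video-based variant and fine-tuning are not shown.
+
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+# Candidate driver states phrased as natural-language prompts.
+PROMPTS = [
+    "a photo of a driver paying attention to the road",
+    "a photo of a driver texting on a phone",
+    "a photo of a driver drinking a beverage",
+    "a photo of a driver talking to a passenger",
+]
+
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+frame = Image.new("RGB", (224, 224))      # stand-in for a dashcam frame
+inputs = processor(text=PROMPTS, images=frame, return_tensors="pt", padding=True)
+probs = model(**inputs).logits_per_image.softmax(dim=-1)
+
+for prompt, p in zip(PROMPTS, probs[0].tolist()):
+    print(f"{p:.3f}  {prompt}")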
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Weighted Ensemble Models Are Strong Continual Learners + + +
+ In this work, we study the problem of continual learning (CL) where the goal +is to learn a model on a sequence of tasks, such that the data from the +previous tasks becomes unavailable while learning on the current task data. CL +is essentially a balancing act between being able to learn on the new task +(i.e., plasticity) and maintaining the performance on the previously learned +concepts (i.e., stability). Intending to address the stability-plasticity +trade-off, we propose to perform weight-ensembling of the model parameters of +the previous and current tasks. This weighted-ensembled model, which we call +Continual Model Averaging (or CoMA), attains high accuracy on the current task +by leveraging plasticity, while not deviating too far from the previous weight +configuration, ensuring stability. We also propose an improved variant of CoMA, +named Continual Fisher-weighted Model Averaging (or CoFiMA), that selectively +weighs each parameter in the weights ensemble by leveraging the Fisher +information of the weights of the model. Both variants are conceptually simple, +easy to implement, and effective in attaining state-of-the-art performance on +several standard CL benchmarks. Code is available at: +https://github.com/IemProg/CoFiMA. + +
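+ Weight ensembling here reduces to per-parameter averaging of checkpoints. The
+sketch below shows a plain interpolation and a Fisher-weighted variant; the
+exact weighting used by CoMA/CoFiMA may differ, so treat this as a generic
+illustration.
+
+import torch
+
+def model_average(prev_state, curr_state, alpha=0.5):
+    """Plain weight-space interpolation of previous- and current-task models."""
+    return {k: alpha * prev_state[k] + (1 - alpha) * curr_state[k] for k in curr_state}
+
+def fisher_weighted_average(states, fishers, eps=1e-8):
+    """Per-parameter average where each model's vote is scaled by its diagonal
+    Fisher information estimate (confident parameters pull harder)."""
+    merged = {}
+    for k in states[0]:
+        num = sum(f[k] * s[k] for f, s in zip(fishers, states))
+        merged[k] = num / (sum(f[k] for f in fishers) + eps)
+    return merged
+
+if __name__ == "__main__":
+    make = lambda: torch.nn.Linear(8, 2)
+    prev, curr = make().state_dict(), make().state_dict()
+    fishers = [{k: torch.rand_like(v) for k, v in s.items()} for s in (prev, curr)]
+    m = make()
+    m.load_state_dict(fisher_weighted_average([prev, curr], fishers))
+    print(m.weight.mean())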
+
+ comment: Code: https://github.com/IemProg/CoFiMA +
+
+
+
+
+ + ♻ ☆ AMP: Autoregressive Motion Prediction Revisited with Next Token + Prediction for Autonomous Driving + + +
+ As an essential task in autonomous driving (AD), motion prediction aims to
+predict the future states of surrounding objects for navigation. One natural
+solution is to estimate the positions of other agents in a step-by-step
+manner, where each predicted time-step is conditioned on both observed
+time-steps and previously predicted time-steps, i.e., autoregressive
+prediction. Pioneering works like SocialLSTM and MFP design their decoders
+based on this intuition. However, almost all state-of-the-art works assume
+that all predicted time-steps are independent conditioned on observed
+time-steps, and they use a single linear layer to generate the positions of
+all time-steps simultaneously. They dominate most motion prediction
+leaderboards due to the simplicity of training MLPs compared to autoregressive
+networks.
+ In this paper, we introduce GPT-style next-token prediction into motion
+forecasting. In this way, the input and output can be represented in a unified
+space and thus autoregressive prediction becomes more feasible. However,
+different from language data, which is composed of homogeneous units (words),
+the elements in a driving scene can have complex spatial-temporal and semantic
+relations. To this end, we propose to adopt three factorized attention modules
+with different neighbors for information aggregation and different position
+encoding styles to capture their relations, e.g., encoding the transformation
+between coordinate systems for spatial relativity while adopting RoPE for
+temporal relativity. Empirically, by equipping the model with the
+aforementioned tailored designs, the proposed method achieves state-of-the-art
+performance on the Waymo Open Motion and Waymo Interaction datasets. Notably,
+AMP outperforms other recent autoregressive motion prediction methods:
+MotionLM and StateTransformer, which demonstrates the effectiveness of the
+proposed designs.
+
+
+
+
+
+ + ♻ ☆ Soft-Label Anonymous Gastric X-ray Image Distillation ICIP 2020 + + +
+ This paper presents a soft-label anonymous gastric X-ray image distillation +method based on a gradient descent approach. The sharing of medical data is +demanded to construct high-accuracy computer-aided diagnosis (CAD) systems. +However, the large size of the medical dataset and privacy protection are +remaining problems in medical data sharing, which hindered the research of CAD +systems. The idea of our distillation method is to extract the valid +information of the medical dataset and generate a tiny distilled dataset that +has a different data distribution. Different from model distillation, our +method aims to find the optimal distilled images, distilled labels and the +optimized learning rate. Experimental results show that the proposed method can +not only effectively compress the medical dataset but also anonymize medical +images to protect the patient's private information. The proposed approach can +improve the efficiency and security of medical data sharing. + +
+
+ comment: The first paper to explore real-world dataset distillation; Work was + done in 2019 and published as a conference paper at ICIP 2020 +
+
+
+
+
+ + ♻ ☆ Enhancing Multimodal Cooperation via Fine-grained Modality Valuation CVPR 2024 + + +
+ One primary topic of multimodal learning is to jointly incorporate
+heterogeneous information from different modalities. However, most models
+often suffer from unsatisfactory multimodal cooperation and cannot jointly
+utilize all modalities well. Some methods have been proposed to identify and
+enhance the worse-learnt modality, but they often struggle to provide
+fine-grained, theoretically supported observations of multimodal cooperation
+at the sample level. Hence, it is essential to reasonably observe and improve
+the fine-grained cooperation between modalities, especially when facing
+realistic scenarios where the modality discrepancy could vary across different
+samples. To this end, we introduce a sample-level modality valuation metric to
+evaluate the contribution of each modality for each sample. Via modality
+valuation, we observe that the modality discrepancy indeed can differ at the
+sample level, beyond the global contribution discrepancy at the dataset level.
+We further analyze this issue and improve cooperation between modalities at
+the sample level by enhancing the discriminative ability of low-contributing
+modalities in a targeted manner. Overall, our methods reasonably observe
+fine-grained uni-modal contributions and achieve considerable improvement. The
+source code and dataset are available at
+https://github.com/GeWu-Lab/Valuate-and-Enhance-Multimodal-Cooperation.
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ PrPSeg: Universal Proposition Learning for Panoramic Renal Pathology + Segmentation + + +
+ Understanding the anatomy of renal pathology is crucial for advancing disease
+diagnostics, treatment evaluation, and clinical research. The complex kidney
+system comprises various components across multiple levels, including regions
+(cortex, medulla), functional units (glomeruli, tubules), and cells
+(podocytes, mesangial cells in the glomerulus). Prior studies have
+predominantly overlooked the intricate spatial interrelations among objects
+from clinical knowledge. In this research, we introduce a novel universal
+proposition learning approach, called panoramic renal pathology segmentation
+(PrPSeg), designed to comprehensively segment panoramic structures within the
+kidney by integrating extensive knowledge of kidney anatomy.
+ In this paper, we propose (1) the design of a comprehensive universal
+proposition matrix for renal pathology, facilitating the incorporation of
+classification and spatial relationships into the segmentation process; (2) a
+token-based dynamic-head single-network architecture that improves
+partial-label image segmentation and provides the capability for future data
+enlargement; and (3) an anatomy loss function quantifying the inter-object
+relationships across the kidney.
+
+
+ comment: IEEE / CVF Computer Vision and Pattern Recognition Conference 2024 +
+
+
+
+
+ + ♻ ☆ Implicit Discriminative Knowledge Learning for Visible-Infrared Person + Re-Identification CVPR 2024 + + +
+ Visible-Infrared Person Re-identification (VI-ReID) is a challenging +cross-modal pedestrian retrieval task, due to significant intra-class +variations and cross-modal discrepancies among different cameras. Existing +works mainly focus on embedding images of different modalities into a unified +space to mine modality-shared features. They only seek distinctive information +within these shared features, while ignoring the identity-aware useful +information that is implicit in the modality-specific features. To address this +issue, we propose a novel Implicit Discriminative Knowledge Learning (IDKL) +network to uncover and leverage the implicit discriminative information +contained within the modality-specific. First, we extract modality-specific and +modality-shared features using a novel dual-stream network. Then, the +modality-specific features undergo purification to reduce their modality style +discrepancies while preserving identity-aware discriminative knowledge. +Subsequently, this kind of implicit knowledge is distilled into the +modality-shared feature to enhance its distinctiveness. Finally, an alignment +loss is proposed to minimize modality discrepancy on enhanced modality-shared +features. Extensive experiments on multiple public datasets demonstrate the +superiority of IDKL network over the state-of-the-art methods. Code is +available at https://github.com/1KK077/IDKL. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ IDAdapter: Learning Mixed Features for Tuning-Free Personalization of + Text-to-Image Models + + +
+ Leveraging Stable Diffusion for the generation of personalized portraits has +emerged as a powerful and noteworthy tool, enabling users to create +high-fidelity, custom character avatars based on their specific prompts. +However, existing personalization methods face challenges, including test-time +fine-tuning, the requirement of multiple input images, low preservation of +identity, and limited diversity in generated outcomes. To overcome these +challenges, we introduce IDAdapter, a tuning-free approach that enhances the +diversity and identity preservation in personalized image generation from a +single face image. IDAdapter integrates a personalized concept into the +generation process through a combination of textual and visual injections and a +face identity loss. During the training phase, we incorporate mixed features +from multiple reference images of a specific identity to enrich +identity-related content details, guiding the model to generate images with +more diverse styles, expressions, and angles compared to previous works. +Extensive evaluations demonstrate the effectiveness of our method, achieving +both diversity and identity fidelity in generated images. + +
+
+ comment: 14 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Aligning Step-by-Step Instructional Diagrams to Video Demonstrations + + +
+ Multimodal alignment facilitates the retrieval of instances from one modality +when queried using another. In this paper, we consider a novel setting where +such an alignment is between (i) instruction steps that are depicted as +assembly diagrams (commonly seen in Ikea assembly manuals) and (ii) video +segments from in-the-wild videos; these videos comprise an enactment of the +assembly actions in the real world. To learn this alignment, we introduce a +novel supervised contrastive learning method that learns to align videos with +the subtle details in the assembly diagrams, guided by a set of novel losses. +To study this problem and demonstrate the effectiveness of our method, we +introduce a novel dataset, IAW, for Ikea assembly in the wild, consisting of 183 +hours of videos from diverse furniture assembly collections and nearly 8,300 +illustrations from their associated instruction manuals, annotated for their +ground truth alignments. We define two tasks on this dataset: first, nearest +neighbor retrieval between video segments and illustrations, and, second, +alignment of instruction steps and the segments for each video. Extensive +experiments on IAW demonstrate the superior performance of our approach against +alternatives. + +
+
+ comment: Project website: + https://academic.davidz.cn/en/publication/zhang-cvpr-2023/ +
+
+
+
+
+ + ♻ ☆ R2Human: Real-Time 3D Human Appearance Rendering from a Single Image + + +
+ Rendering 3D human appearance in different views is crucial for achieving +holographic communication and immersive VR/AR. Existing methods either rely on +multi-camera setups or have low-quality rendered images from a single image. In +this paper, we propose R2Human, the first approach for real-time inference and +rendering of photorealistic 3D human appearance from a single image. The core +of our approach is to combine the strengths of implicit texture fields and +explicit neural rendering with our novel representation, namely Z-map. Based on +this, we present an end-to-end network that performs high-fidelity color +reconstruction of visible areas and provides reliable color inference for +occluded regions. To further enhance the 3D perception ability of our network, +we leverage the Fourier occupancy field as a prior for generating the texture +field and providing a sampling surface in the rendering stage. We also propose +a consistency loss and a spatio-temporal fusion strategy to ensure the +multi-view coherence. Experimental results show that our method outperforms the +state-of-the-art methods on both synthetic data and challenging real-world +images, in real time. + +
+
+
+
+
+ + ♻ ☆ V2X-DGW: Domain Generalization for Multi-agent Perception under Adverse + Weather Conditions + + +
+ Current LiDAR-based Vehicle-to-Everything (V2X) multi-agent perception +systems have shown significant success in 3D object detection. While these +models perform well in the clean weather they were trained on, they struggle in unseen +adverse weather conditions due to the real-world domain gap. In this paper, we +propose a domain generalization approach, named V2X-DGW, for LiDAR-based 3D +object detection on multi-agent perception systems under adverse weather +conditions. Our goal is to ensure favorable multi-agent performance not only in +clean weather but also in unseen adverse weather conditions, while learning only on +clean weather data. To advance research in +this area, we have simulated the impact of three prevalent adverse weather +conditions on two widely-used multi-agent datasets, resulting in the creation +of two novel benchmark datasets: OPV2V-w and V2XSet-w. + To this end, we first introduce the Adaptive Weather Augmentation (AWA) to +mimic the unseen adverse weather conditions, and then propose two alignments +for generalizable representation learning: Trust-region Weather-invariant +Alignment (TWA) and Agent-aware Contrastive Alignment (ACA). Extensive +experimental results demonstrate that our V2X-DGW achieves improvements in the +unseen adverse weather conditions. + +
+
+
+
+
+ + ♻ ☆ ComCLIP: Training-Free Compositional Image and Text Matching + + +
+ Contrastive Language-Image Pretraining (CLIP) has demonstrated great +zero-shot performance for matching images and text. However, it is still +challenging to adapt vision-language pretrained models like CLIP to +compositional image and text matching -- a more challenging image and text +matching task requiring the model to understand compositional word concepts +and visual components. Towards better compositional generalization in zero-shot +image and text matching, in this paper, we study the problem from a causal +perspective: the erroneous semantics of individual entities are essentially +confounders that cause the matching failure. Therefore, we propose a novel +\textbf{\textit{training-free}} compositional CLIP model (ComCLIP). ComCLIP +disentangles input images into subjects, objects, and action sub-images and +composes CLIP's vision encoder and text encoder to perform evolving matching +over compositional text embedding and sub-image embeddings. In this way, +ComCLIP can mitigate spurious correlations introduced by the pretrained CLIP +models and dynamically evaluate the importance of each component. Experiments +on four compositional image-text matching datasets: SVO, ComVG, Winoground, and +VL-checklist, and two general image-text retrieval datasets: Flickr30K and +MSCOCO demonstrate the effectiveness of our plug-and-play method, which boosts +the \textbf{\textit{zero-shot}} inference ability of CLIP, SLIP, and BLIP2 even +without further training or fine-tuning. Our codes can be found at +https://github.com/eric-ai-lab/ComCLIP. + +
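+ A hedged sketch of the component-wise matching idea: score the full image and its entity sub-images against a text embedding and fuse the similarities. The simple mean fusion and the pre-extracted sub-images are assumptions, not ComCLIP's actual evolving-matching rule.
+ ```python
+ import torch
+
+ @torch.no_grad()
+ def compositional_score(clip_model, full_image, sub_images, text_emb):
+     """clip_model: an OpenAI-CLIP-style model exposing encode_image(); text_emb: a
+     pre-computed, L2-normalized text embedding of shape (1, D)."""
+     views = [full_image] + list(sub_images)            # subject/object/action crops
+     embs = [clip_model.encode_image(v) for v in views]
+     embs = [e / e.norm(dim=-1, keepdim=True) for e in embs]
+     sims = torch.stack([(e @ text_emb.T).squeeze() for e in embs])
+     return sims.mean()                                 # naive fusion of component scores
+ ```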
+
+
+
+
+ + ♻ ☆ FourCastNeXt: Optimizing FourCastNet Training for Limited Compute + + +
+ FourCastNeXt is an optimization of FourCastNet - a global machine learning +weather forecasting model - that performs with a comparable level of accuracy +and can be trained using around 5% of the original FourCastNet computational +requirements. This technical report presents strategies for model optimization +that maintain similar performance as measured by the root-mean-square error +(RMSE) of the modelled variables. By providing a model with very low +comparative training costs, FourCastNeXt makes Neural Earth System Modelling +much more accessible to researchers looking to conduct training experiments and +ablation studies. FourCastNeXt training and inference code are available at +https://github.com/nci/FourCastNeXt + +
+
+ comment: Major revision. All prior content (text, figures, table) has been + updated. Additionally, new text, tables and figures have been added. Updated + title. Updated author list +
+
+
+
+
+ + ♻ ☆ VQPy: An Object-Oriented Approach to Modern Video Analytics + + +
+ Video analytics is widely used in contemporary systems and services. At the +forefront of video analytics are video queries that users develop to find +objects of particular interest. Building upon the insight that video objects +(e.g., human, animals, cars, etc.), the center of video analytics, are similar +in spirit to objects modeled by traditional object-oriented languages, we +propose to develop an object-oriented approach to video analytics. This +approach, named VQPy, consists of a frontend$\unicode{x2015}$a Python variant +with constructs that make it easy for users to express video objects and their +interactions$\unicode{x2015}$as well as an extensible backend that can +automatically construct and optimize pipelines based on video objects. We have +implemented and open-sourced VQPy, which has been productized in Cisco as part +of its DeepVision framework. + +
+
+
+
+
+ + ♻ ☆ TD-MPC2: Scalable, Robust World Models for Continuous Control ICLR 2024 + + +
+ TD-MPC is a model-based reinforcement learning (RL) algorithm that performs +local trajectory optimization in the latent space of a learned implicit +(decoder-free) world model. In this work, we present TD-MPC2: a series of +improvements upon the TD-MPC algorithm. We demonstrate that TD-MPC2 improves +significantly over baselines across 104 online RL tasks spanning 4 diverse task +domains, achieving consistently strong results with a single set of +hyperparameters. We further show that agent capabilities increase with model +and data size, and successfully train a single 317M parameter agent to perform +80 tasks across multiple task domains, embodiments, and action spaces. We +conclude with an account of lessons, opportunities, and risks associated with +large TD-MPC2 agents. Explore videos, models, data, code, and more at +https://tdmpc2.com + +
+
+ comment: ICLR 2024. Explore videos, models, data, code, and more at + https://tdmpc2.com +
+
+
+
+
+ + ♻ ☆ Dodging DeepFake Detection via Implicit Spatial-Domain Notch Filtering + + +
+ The current high-fidelity generation and high-precision detection of DeepFake +images are at an arms race. We believe that producing DeepFakes that are highly +realistic and 'detection evasive' can serve the ultimate goal of improving +future generation DeepFake detection capabilities. In this paper, we propose a +simple yet powerful pipeline to reduce the artifact patterns of fake images +without hurting image quality by performing implicit spatial-domain notch +filtering. We first demonstrate that frequency-domain notch filtering, although +famously shown to be effective in removing periodic noise in the spatial +domain, is infeasible for our task at hand due to the manual designs required +for the notch filters. We, therefore, resort to a learning-based approach to +reproduce the notch filtering effects, but solely in the spatial domain. We +adopt a combination of adding overwhelming spatial noise for breaking the +periodic noise pattern and deep image filtering to reconstruct the noise-free +fake images, and we name our method DeepNotch. Deep image filtering provides a +specialized filter for each pixel in the noisy image, producing filtered images +with high fidelity compared to their DeepFake counterparts. Moreover, we also +use the semantic information of the image to generate an adversarial guidance +map to add noise intelligently. Our large-scale evaluation on 3 representative +state-of-the-art DeepFake detection methods (tested on 16 types of DeepFakes) +has demonstrated that our technique significantly reduces the accuracy of these +3 fake image detection methods, 36.79% on average and up to 97.02% in the best +case. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ RGNet: A Unified Clip Retrieval and Grounding Network for Long Videos + + +
+ Locating specific moments within long videos (20-120 minutes) presents a +significant challenge, akin to finding a needle in a haystack. Adapting +existing short video (5-30 seconds) grounding methods to this problem yields +poor performance. Since most real life videos, such as those on YouTube and +AR/VR, are lengthy, addressing this issue is crucial. Existing methods +typically operate in two stages: clip retrieval and grounding. However, this +disjoint process limits the retrieval module's fine-grained event +understanding, crucial for specific moment detection. We propose RGNet which +deeply integrates clip retrieval and grounding into a single network capable of +processing long videos into multiple granular levels, e.g., clips and frames. +Its core component is a novel transformer encoder, RG-Encoder, that unifies the +two stages through shared features and mutual optimization. The encoder +incorporates a sparse attention mechanism and an attention loss to model both +granularity jointly. Moreover, we introduce a contrastive clip sampling +technique to mimic the long video paradigm closely during training. RGNet +surpasses prior methods, showcasing state-of-the-art performance on long video +temporal grounding (LVTG) datasets MAD and Ego4D. + +
+
+ comment: The code is released at https://github.com/Tanveer81/RGNet +
+
+
+
+
+ + ♻ ☆ Tur[k]ingBench: A Challenge Benchmark for Web Agents + + +
+ Recent chatbots have demonstrated impressive ability to understand and +communicate in raw-text form. However, there is more to the world than raw +text. For example, humans spend long hours of their time on web pages, where +text is intertwined with other modalities and tasks are accomplished in the +form of various complex interactions. Can state-of-the-art multi-modal models +generalize to such complex domains? + To address this question, we introduce TurkingBench, a benchmark of tasks +formulated as web pages containing textual instructions with multi-modal +context. Unlike existing work which employs artificially synthesized web pages, +here we use natural HTML pages that were originally designed for crowdsourcing +workers for various annotation purposes. The HTML instructions of each task are +also instantiated with various values (obtained from the crowdsourcing tasks) +to form new instances of the task. This benchmark contains 32.2K instances +distributed across 158 tasks. + Additionally, to facilitate the evaluation on TurkingBench, we develop an +evaluation framework that connects the responses of chatbots to modifications +on web pages (modifying a text box, checking a radio, etc.). We evaluate the +performance of state-of-the-art models, including language-only, vision-only, +and layout-only models, and their combinations, on this benchmark. Our findings +reveal that these models perform significantly better than random chance, yet +considerable room exists for improvement. We hope this benchmark will help +facilitate the evaluation and development of web-based agents. + +
+
+
+
+
+ + ♻ ☆ Short-Form Videos and Mental Health: A Knowledge-Guided Neural Topic + Model + + +
+ While short-form videos are reshaping the entire social media landscape, +experts are increasingly worried about their depressive impact on viewers, as +evidenced by medical studies. To prevent widespread consequences, platforms are +eager to predict these videos' impact on viewers' mental health. Subsequently, +they can take intervention measures, such as revising recommendation algorithms +and displaying viewer discretion advisories. Nevertheless, applicable predictive methods +lack relevance to well-established medical knowledge, which outlines clinically +proven external and environmental factors of depression. To account for such +medical knowledge, we resort to an emergent methodological discipline, seeded +Neural Topic Models (NTMs). However, existing seeded NTMs suffer from the +limitations of single-origin topics, unknown topic sources, unclear seed +supervision, and suboptimal convergence. To address those challenges, we +develop a novel Knowledge-guided Multimodal NTM to predict a short-form video's +depressive impact on viewers. Extensive empirical analyses using TikTok and +Douyin datasets prove that our method outperforms state-of-the-art benchmarks. +Our method also discovers medically relevant topics from videos that are linked +to depressive impact. We contribute to IS with a novel video analytics method +that is generalizable to other video classification problems. Practically, our +method can help platforms understand videos' mental impacts, thus adjusting +recommendations and video topic disclosure. + +
+
+
+
+
+ + ♻ ☆ PointNeRF++: A multi-scale, point-based Neural Radiance Field + + +
+ Point clouds offer an attractive source of information to complement images +in neural scene representations, especially when few images are available. +Neural rendering methods based on point clouds do exist, but they do not +perform well when the point cloud quality is low -- e.g., sparse or incomplete, +which is often the case with real-world data. We overcome these problems with a +simple representation that aggregates point clouds at multiple scale levels +with sparse voxel grids at different resolutions. To deal with point cloud +sparsity, we average across multiple scale levels -- but only among those that +are valid, i.e., that have enough neighboring points in proximity to the ray of +a pixel. To help model areas without points, we add a global voxel at the +coarsest scale, thus unifying ``classical'' and point-based NeRF formulations. +We validate our method on the NeRF Synthetic, ScanNet, and KITTI-360 datasets, +outperforming the state of the art, with a significant gap compared to other +NeRF-based methods, especially on more challenging scenes. + +
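+ The multi-scale aggregation can be pictured with a small masked-average sketch; the validity test and the fallback below are simplified assumptions rather than the paper's exact formulation.
+ ```python
+ import torch
+
+ def aggregate_scales(feats, neighbor_counts, min_neighbors=3):
+     """feats: (S, C) per-scale features for one ray sample; neighbor_counts: (S,).
+     Average only over scales with enough nearby points; fall back to the coarsest
+     (global) level when no scale is valid. Threshold is illustrative."""
+     valid = neighbor_counts >= min_neighbors
+     if valid.any():
+         return feats[valid].mean(dim=0)
+     return feats[-1]  # global voxel at the coarsest scale
+ ```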
+
+ comment: Project website: https://pointnerfpp.github.io/ +
+
+
+
+
+ + ♻ ☆ MiKASA: Multi-Key-Anchor & Scene-Aware Transformer for 3D Visual + Grounding + + +
+ 3D visual grounding involves matching natural language descriptions with +their corresponding objects in 3D spaces. Existing methods often face +challenges with accuracy in object recognition and struggle in interpreting +complex linguistic queries, particularly with descriptions that involve +multiple anchors or are view-dependent. In response, we present the MiKASA +(Multi-Key-Anchor Scene-Aware) Transformer. Our novel end-to-end trained model +integrates a self-attention-based scene-aware object encoder and an original +multi-key-anchor technique, enhancing object recognition accuracy and the +understanding of spatial relationships. Furthermore, MiKASA improves the +explainability of decision-making, facilitating error diagnosis. Our model +achieves the highest overall accuracy in the Referit3D challenge for both the +Sr3D and Nr3D datasets, particularly excelling by a large margin in categories +that require viewpoint-dependent descriptions. + +
+
+
+
+
+ + ♻ ☆ FERGI: Automatic Annotation of User Preferences for Text-to-Image + Generation from Spontaneous Facial Expression Reaction + + +
+ Researchers have proposed to use data of human preference feedback to +fine-tune text-to-image generative models. However, the scalability of human +feedback collection has been limited by its reliance on manual annotation. +Therefore, we develop and test a method to automatically annotate user +preferences from their spontaneous facial expression reaction to the generated +images. We collect a dataset of Facial Expression Reaction to Generated Images +(FERGI) and show that the activations of multiple facial action units (AUs) are +highly correlated with user evaluations of the generated images. Specifically, +AU4 (brow lowerer) is reflective of negative evaluations of the generated image +whereas AU12 (lip corner puller) is reflective of positive evaluations. These +can be useful in two ways. Firstly, we can automatically annotate user +preferences between image pairs with substantial difference in these AU +responses with an accuracy significantly outperforming state-of-the-art scoring +models. Secondly, directly integrating the AU responses with the scoring models +improves their consistency with human preferences. Finally, this method of +automatic annotation with facial expression analysis can be potentially +generalized to other generation tasks. The code is available at +https://github.com/ShuangquanFeng/FERGI, and the dataset is also available at +the same link for research purposes. + +
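+ The AU-based annotation rule can be sketched as follows; the AU4/AU12 weighting and the margin are illustrative assumptions, not the released FERGI pipeline.
+ ```python
+ def annotate_preference(au_a, au_b, margin=0.1):
+     """au_a, au_b: dicts of facial action unit activations recorded while viewing
+     images A and B. AU12 (lip corner puller) ~ positive evaluation, AU4 (brow
+     lowerer) ~ negative. Only pairs with a substantial difference are auto-labeled."""
+     score = lambda au: au.get("AU12", 0.0) - au.get("AU4", 0.0)
+     diff = score(au_a) - score(au_b)
+     if abs(diff) < margin:
+         return None          # too ambiguous to annotate automatically
+     return "A" if diff > 0 else "B"
+ ```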
+
+
+
+
+ + ♻ ☆ The Manga Whisperer: Automatically Generating Transcriptions for Comics CVPR'24 + + +
+ In the past few decades, Japanese comics, commonly referred to as Manga, have +transcended both cultural and linguistic boundaries to become a true worldwide +sensation. Yet, the inherent reliance on visual cues and illustration within +manga renders it largely inaccessible to individuals with visual impairments. +In this work, we seek to address this substantial barrier, with the aim of +ensuring that manga can be appreciated and actively engaged by everyone. +Specifically, we tackle the problem of diarisation i.e. generating a +transcription of who said what and when, in a fully automatic way. + To this end, we make the following contributions: (1) we present a unified +model, Magi, that is able to (a) detect panels, text boxes and character boxes, +(b) cluster characters by identity (without knowing the number of clusters +apriori), and (c) associate dialogues to their speakers; (2) we propose a novel +approach that is able to sort the detected text boxes in their reading order +and generate a dialogue transcript; (3) we annotate an evaluation benchmark for +this task using publicly available [English] manga pages. The code, evaluation +datasets and the pre-trained model can be found at: +https://github.com/ragavsachdeva/magi. + +
+
+ comment: Accepted at CVPR'24 +
+
+
+
+
+ + ♻ ☆ Generalizing deep learning models for medical image classification + + +
+ Numerous Deep Learning (DL) models have been developed for a large spectrum +of medical image analysis applications, which promises to reshape various +facets of medical practice. Despite early advances in DL model validation and +implementation, which encourage healthcare institutions to adopt them, some +fundamental questions remain: are the DL models capable of generalizing? What +causes a drop in DL model performance? How can the DL model performance drop be +overcome? Medical data are dynamic and prone to domain shift: multiple factors +such as updates to medical equipment, new imaging workflows, +and shifts in patient demographics or populations can induce this drift over +time. In this paper, we review recent developments in generalization methods +for DL-based classification models. We also discuss future challenges, +including the need for improved evaluation protocols and benchmarks, and +envisioned future developments to achieve robust, generalized models for +medical image classification. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 214 + +
+
+
+ + ☆ On Pretraining Data Diversity for Self-Supervised Learning + + +
+ We explore the impact of training with more diverse datasets, characterized +by the number of unique samples, on the performance of self-supervised learning +(SSL) under a fixed computational budget. Our findings consistently demonstrate +that increasing pretraining data diversity enhances SSL performance, albeit +only when the distribution distance to the downstream data is minimal. Notably, +even with an exceptionally large pretraining data diversity achieved through +methods like web crawling or diffusion-generated data, among other ways, the +distribution shift remains a challenge. Our experiments are comprehensive with +seven SSL methods using large-scale datasets such as ImageNet and YFCC100M +amounting to over 200 GPU days. Code and trained models will be available at +https://github.com/hammoudhasan/DiversitySSL . + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Editing Massive Concepts in Text-to-Image Diffusion Models + + +
+ Text-to-image diffusion models suffer from the risk of generating outdated, +copyrighted, incorrect, and biased content. While previous methods have +mitigated the issues on a small scale, it is essential to handle them +simultaneously in larger-scale real-world scenarios. We propose a two-stage +method, Editing Massive Concepts In Diffusion Models (EMCID). The first stage +performs memory optimization for each individual concept with dual +self-distillation from text alignment loss and diffusion noise prediction loss. +The second stage conducts massive concept editing with multi-layer, closed form +model editing. We further propose a comprehensive benchmark, named ImageNet +Concept Editing Benchmark (ICEB), for evaluating massive concept editing for +T2I models with two subtasks, free-form prompts, massive concept categories, +and extensive evaluation metrics. Extensive experiments conducted on our +proposed benchmark and previous benchmarks demonstrate the superior scalability +of EMCID for editing up to 1,000 concepts, providing a practical approach for +fast adjustment and re-deployment of T2I diffusion models in real-world +applications. + +
+
+ comment: Project page: https://silentview.github.io/EMCID/ . Code: + https://github.com/SilentView/EMCID +
+
+
+
+
+ + ☆ RAR: Retrieving And Ranking Augmented MLLMs for Visual Recognition + + +
+ CLIP (Contrastive Language-Image Pre-training) uses contrastive learning from +noisy image-text pairs to excel at recognizing a wide array of candidates, yet +its focus on broad associations hinders the precision in distinguishing subtle +differences among fine-grained items. Conversely, Multimodal Large Language +Models (MLLMs) excel at classifying fine-grained categories, thanks to their +substantial knowledge from pre-training on web-level corpora. However, the +performance of MLLMs declines with an increase in category numbers, primarily +due to growing complexity and constraints of limited context window size. To +synergize the strengths of both approaches and enhance the few-shot/zero-shot +recognition abilities for datasets characterized by extensive and fine-grained +vocabularies, this paper introduces RAR, a Retrieving And Ranking augmented +method for MLLMs. We initially establish a multi-modal retriever based on CLIP +to create and store explicit memory for different categories beyond the +immediate context window. During inference, RAR retrieves the top-k similar +results from the memory and uses MLLMs to rank and make the final predictions. +Our proposed approach not only addresses the inherent limitations in +fine-grained recognition but also preserves the model's comprehensive knowledge +base, significantly boosting accuracy across a range of vision-language +recognition tasks. Notably, our approach demonstrates a significant improvement +in performance on 5 fine-grained visual recognition benchmarks, 11 few-shot +image recognition datasets, and 2 object detection datasets under the +zero-shot recognition setting. + +
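+ The retrieve-then-rank inference step can be sketched as below; the memory layout and the `mllm_rank` callable are hypothetical stand-ins for the components described in the abstract.
+ ```python
+ import numpy as np
+
+ def retrieve_and_rank(image_emb, memory_embs, memory_labels, mllm_rank, k=10):
+     """image_emb: (D,) normalized CLIP embedding; memory_embs: (N, D) normalized
+     category memory; mllm_rank: placeholder callable that asks an MLLM to pick the
+     best label among the retrieved candidates."""
+     sims = memory_embs @ image_emb               # cosine similarities
+     topk = np.argsort(-sims)[:k]
+     candidates = [memory_labels[i] for i in topk]
+     return mllm_rank(candidates)                 # final fine-grained prediction
+ ```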
+
+ comment: Project: https://github.com/Liuziyu77/RAR +
+
+
+
+
+ + ☆ RadSplat: Radiance Field-Informed Gaussian Splatting for Robust + Real-Time Rendering with 900+ FPS + + +
+ Recent advances in view synthesis and real-time rendering have achieved +photorealistic quality at impressive rendering speeds. While Radiance +Field-based methods achieve state-of-the-art quality in challenging scenarios +such as in-the-wild captures and large-scale scenes, they often suffer from +excessively high compute requirements linked to volumetric rendering. Gaussian +Splatting-based methods, on the other hand, rely on rasterization and naturally +achieve real-time rendering but suffer from brittle optimization heuristics +that underperform on more challenging scenes. In this work, we present +RadSplat, a lightweight method for robust real-time rendering of complex +scenes. Our main contributions are threefold. First, we use radiance fields as +a prior and supervision signal for optimizing point-based scene +representations, leading to improved quality and more robust optimization. +Next, we develop a novel pruning technique reducing the overall point count +while maintaining high quality, leading to smaller and more compact scene +representations with faster inference speeds. Finally, we propose a novel +test-time filtering approach that further accelerates rendering and allows to +scale to larger, house-sized scenes. We find that our method enables +state-of-the-art synthesis of complex captures at 900+ FPS. + +
+
+ comment: Project page at https://m-niemeyer.github.io/radsplat/ +
+
+
+
+
+ + ☆ Learning from Models and Data for Visual Grounding + + +
+ We introduce SynGround, a novel framework that combines data-driven learning +and knowledge transfer from various large-scale pretrained models to enhance +the visual grounding capabilities of a pretrained vision-and-language model. +The knowledge transfer from the models initiates the generation of image +descriptions through an image description generator. These descriptions serve +dual purposes: they act as prompts for synthesizing images through a +text-to-image generator, and as queries for synthesizing text, from which +phrases are extracted using a large language model. Finally, we leverage an +open-vocabulary object detector to generate synthetic bounding boxes for the +synthetic images and texts. We finetune a pretrained vision-and-language model +on this dataset by optimizing a mask-attention consistency objective that +aligns region annotations with gradient-based model explanations. The resulting +model improves the grounding capabilities of an off-the-shelf +vision-and-language model. Particularly, SynGround improves the pointing game +accuracy of ALBEF on the Flickr30k dataset from 79.38% to 87.26%, and on +RefCOCO+ Test A from 69.35% to 79.06% and on RefCOCO+ Test B from 53.77% to +63.67%. + +
+
+ comment: Project Page: https://catherine-r-he.github.io/SynGround/ +
+
+
+
+
+ + ☆ Bounding Box Stability against Feature Dropout Reflects Detector + Generalization across Environments ICLR 2024 + + +
+ Bounding boxes uniquely characterize object detection, where a good detector +gives accurate bounding boxes of categories of interest. However, in the +real-world where test ground truths are not provided, it is non-trivial to find +out whether bounding boxes are accurate, thus preventing us from assessing the +detector generalization ability. In this work, we find under feature map +dropout, good detectors tend to output bounding boxes whose locations do not +change much, while bounding boxes of poor detectors will undergo noticeable +position changes. We compute the box stability score (BoS score) to reflect +this stability. Specifically, given an image, we compute a normal set of +bounding boxes and a second set after feature map dropout. To obtain BoS score, +we use bipartite matching to find the corresponding boxes between the two sets +and compute the average Intersection over Union (IoU) across the entire test +set. We contribute to finding that BoS score has a strong, positive correlation +with detection accuracy measured by mean average precision (mAP) under various +test environments. This relationship allows us to predict the accuracy of +detectors on various real-world test sets without accessing test ground truths, +verified on canonical detection tasks such as vehicle detection and pedestrian +detection. Code and data are available at https://github.com/YangYangGirl/BoS. + +
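+ The per-image stability computation described above is concrete enough to sketch directly; Hungarian matching via SciPy is an implementation assumption (the paper only specifies bipartite matching), and the dataset-level BoS score averages this value over the test set.
+ ```python
+ import numpy as np
+ from scipy.optimize import linear_sum_assignment
+
+ def iou_matrix(a, b):
+     """Pairwise IoU between (N, 4) and (M, 4) arrays of [x1, y1, x2, y2] boxes."""
+     x1 = np.maximum(a[:, None, 0], b[None, :, 0]); y1 = np.maximum(a[:, None, 1], b[None, :, 1])
+     x2 = np.minimum(a[:, None, 2], b[None, :, 2]); y2 = np.minimum(a[:, None, 3], b[None, :, 3])
+     inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
+     area_a = (a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1])
+     area_b = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
+     return inter / (area_a[:, None] + area_b[None, :] - inter + 1e-9)
+
+ def box_stability(boxes_normal, boxes_dropout):
+     """Mean IoU of bipartite-matched boxes from the normal and dropout passes."""
+     if len(boxes_normal) == 0 or len(boxes_dropout) == 0:
+         return 0.0
+     iou = iou_matrix(boxes_normal, boxes_dropout)
+     rows, cols = linear_sum_assignment(-iou)     # maximize total matched IoU
+     return float(iou[rows, cols].mean())
+ ```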
+
+ comment: ICLR 2024 spotlight +
+
+
+
+
+ + ☆ ZigMa: Zigzag Mamba Diffusion Model + + +
+ The diffusion model has long been plagued by scalability and quadratic +complexity issues, especially within transformer-based structures. In this +study, we aim to leverage the long sequence modeling capability of a +State-Space Model called Mamba to extend its applicability to visual data +generation. Firstly, we identify a critical oversight in most current +Mamba-based vision methods, namely the lack of consideration for spatial +continuity in the scan scheme of Mamba. Secondly, building upon this insight, +we introduce a simple, plug-and-play, zero-parameter method named Zigzag Mamba, +which outperforms Mamba-based baselines and demonstrates improved speed and +memory utilization compared to transformer-based baselines. Lastly, we +integrate Zigzag Mamba with the Stochastic Interpolant framework to investigate +the scalability of the model on large-resolution visual datasets, such as +FacesHQ $1024\times 1024$ and UCF101, MultiModal-CelebA-HQ, and MS COCO +$256\times 256$. Code will be released at https://taohu.me/zigma/ + +
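+ A simple serpentine scan conveys the spatial-continuity idea; this is a hedged stand-in for the paper's zigzag schemes, which cycle several such paths across layers.
+ ```python
+ def zigzag_order(h, w):
+     """Boustrophedon scan of an h x w token grid: alternate the direction of each row
+     so consecutive tokens in the 1D sequence stay spatially adjacent."""
+     order = []
+     for row in range(h):
+         cols = range(w) if row % 2 == 0 else range(w - 1, -1, -1)
+         order.extend(row * w + c for c in cols)
+     return order
+
+ # Usage: reorder (B, H*W, C) tokens before a Mamba block, e.g. tokens[:, zigzag_order(H, W), :]
+ ```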
+
+ comment: Project Page: https://taohu.me/zigma/ +
+
+
+
+
+ + ☆ TimeRewind: Rewinding Time with Image-and-Events Video Diffusion + + +
+ This paper addresses the novel challenge of ``rewinding'' time from a single +captured image to recover the fleeting moments missed just before the shutter +button is pressed. This problem poses a significant challenge in computer +vision and computational photography, as it requires predicting plausible +pre-capture motion from a single static frame, an inherently ill-posed task due +to the high degree of freedom in potential pixel movements. We overcome this +challenge by leveraging the emerging technology of neuromorphic event cameras, +which capture motion information with high temporal resolution, and integrating +this data with advanced image-to-video diffusion models. Our proposed framework +introduces an event motion adaptor conditioned on event camera data, guiding +the diffusion model to generate videos that are visually coherent and +physically grounded in the captured events. Through extensive experimentation, +we demonstrate the capability of our approach to synthesize high-quality videos +that effectively ``rewind'' time, showcasing the potential of combining event +camera technology with generative models. Our work opens new avenues for +research at the intersection of computer vision, computational photography, and +generative modeling, offering a forward-thinking solution to capturing missed +moments and enhancing future consumer cameras and smartphones. Please see the +project page at https://timerewind.github.io/ for video results and code +release. + +
+
+
+
+
+ + ☆ Hierarchical NeuroSymbolic Approach for Action Quality Assessment + + +
+ Action quality assessment (AQA) applies computer vision to quantitatively +assess the performance or execution of a human action. Current AQA approaches +are end-to-end neural models, which lack transparency and tend to be biased +because they are trained on subjective human judgements as ground-truth. To +address these issues, we introduce a neuro-symbolic paradigm for AQA, which +uses neural networks to abstract interpretable symbols from video data and +makes quality assessments by applying rules to those symbols. We take diving as +the case study. We found that domain experts prefer our system and find it more +informative than purely neural approaches to AQA in diving. Our system also +achieves state-of-the-art action recognition and temporal segmentation, and +automatically generates a detailed report that breaks the dive down into its +elements and provides objective scoring with visual evidence. As verified by a +group of domain experts, this report may be used to assist judges in scoring, +help train judges, and provide feedback to divers. We will open-source all of +our annotated training data and code for ease of reproducibility. + +
+
+
+
+
+ + ☆ Bridge the Modality and Capacity Gaps in Vision-Language Model Selection + + +
+ Vision Language Models (VLMs) excel in zero-shot image classification by +pairing images with textual category names. The expanding variety of +Pre-Trained VLMs enhances the likelihood of identifying a suitable VLM for +specific tasks. Thus, a promising zero-shot image classification strategy is +selecting the most appropriate Pre-Trained VLM from the VLM Zoo, relying solely +on the text data of the target dataset without access to the dataset's images. +In this paper, we analyze two inherent challenges in assessing the ability of a +VLM in this Language-Only VLM selection: the "Modality Gap" -- the disparity in +VLM's embeddings across two different modalities, making text a less reliable +substitute for images; and the "Capability Gap" -- the discrepancy between the +VLM's overall ranking and its ranking for target dataset, hindering direct +prediction of a model's dataset-specific performance from its general +performance. We propose VLM Selection With gAp Bridging (SWAB) to mitigate the +negative impact of these two gaps. SWAB first adopts optimal transport to +capture the relevance between open-source datasets and target dataset with a +transportation matrix. It then uses this matrix to transfer useful statistics +of VLMs from open-source datasets to the target dataset for bridging those two +gaps and enhancing the VLM's capacity estimation for VLM selection. Experiments +across various VLMs and image classification datasets validate SWAB's +effectiveness. + +
+
+
+
+
+ + ☆ DepthFM: Fast Monocular Depth Estimation with Flow Matching + + +
+ Monocular depth estimation is crucial for numerous downstream vision tasks +and applications. Current discriminative approaches to this problem are limited +due to blurry artifacts, while state-of-the-art generative methods suffer from +slow sampling due to their SDE nature. Rather than starting from noise, we seek +a direct mapping from input image to depth map. We observe that this can be +effectively framed using flow matching, since its straight trajectories through +solution space offer efficiency and high quality. Our study demonstrates that a +pre-trained image diffusion model can serve as an adequate prior for a flow +matching depth model, allowing efficient training on only synthetic data to +generalize to real images. We find that an auxiliary surface normals loss +further improves the depth estimates. Due to the generative nature of our +approach, our model reliably predicts the confidence of its depth estimates. On +standard benchmarks of complex natural scenes, our lightweight approach +exhibits state-of-the-art performance at favorable low computational cost +despite only being trained on little synthetic data. + +
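+ A generic conditional flow-matching training step looks roughly as follows; the latent-space setup and the choice of source endpoint are simplifying assumptions (DepthFM maps from the image rather than pure noise), so treat this as a sketch of the objective, not the paper's code.
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def flow_matching_step(model, cond, z_source, z_depth):
+     """Regress the constant velocity along the straight path from a source latent
+     toward the depth latent; `model` is an image-conditioned velocity network."""
+     t = torch.rand(z_depth.shape[0], 1, 1, 1, device=z_depth.device)
+     zt = (1 - t) * z_source + t * z_depth      # straight-line interpolant
+     target_v = z_depth - z_source              # its (constant) velocity
+     pred_v = model(zt, t.flatten(), cond)
+     return F.mse_loss(pred_v, target_v)
+ ```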
+
+
+
+
+ + ☆ Certified Human Trajectory Prediction + + +
+ Trajectory prediction plays an essential role in autonomous vehicles. While +numerous strategies have been developed to enhance the robustness of trajectory +prediction models, these methods are predominantly heuristic and do not offer +guaranteed robustness against adversarial attacks and noisy observations. In +this work, we propose a certification approach tailored for the task of +trajectory prediction. To this end, we address the inherent challenges +associated with trajectory prediction, including unbounded outputs and +multi-modality, resulting in a model that provides guaranteed robustness. +Furthermore, we integrate a denoiser into our method to further improve the +performance. Through comprehensive evaluations, we demonstrate the +effectiveness of the proposed technique across various baselines and using +standard trajectory prediction datasets. The code will be made available +online: https://s-attack.github.io/ + +
+
+
+
+
+ + ☆ Describe-and-Dissect: Interpreting Neurons in Vision Networks with + Language Models + + +
+ In this paper, we propose Describe-and-Dissect (DnD), a novel method to +describe the roles of hidden neurons in vision networks. DnD utilizes recent +advancements in multimodal deep learning to produce complex natural language +descriptions, without the need for labeled training data or a predefined set of +concepts to choose from. Additionally, DnD is training-free, meaning we don't +train any new models and can easily leverage more capable general purpose +models in the future. We have conducted extensive qualitative and quantitative +analysis to show that DnD outperforms prior work by providing higher quality +neuron descriptions. Specifically, our method on average provides the highest +quality labels and is more than 2 times as likely to be selected as the best +explanation for a neuron than the best baseline. + +
+
+
+
+
+ + ☆ Towards Principled Representation Learning from Videos for Reinforcement + Learning ICLR 2024 + + +
+ We study pre-training representations for decision-making using video data, +which is abundantly available for tasks such as game agents and software +testing. Even though significant empirical advances have been made on this +problem, a theoretical understanding remains absent. We initiate the +theoretical investigation into principled approaches for representation +learning and focus on learning the latent state representations of the +underlying MDP using video data. We study two types of settings: one where +there is iid noise in the observation, and a more challenging setting where +there is also the presence of exogenous noise, which is non-iid noise that is +temporally correlated, such as the motion of people or cars in the background. +We study three commonly used approaches: autoencoding, temporal contrastive +learning, and forward modeling. We prove upper bounds for temporal contrastive +learning and forward modeling in the presence of only iid noise. We show that +these approaches can learn the latent state and use it to do efficient +downstream RL with polynomial sample complexity. When exogenous noise is also +present, we establish a lower bound result showing that the sample complexity +of learning from video data can be exponentially worse than learning from +action-labeled trajectory data. This partially explains why reinforcement +learning with video pre-training is hard. We evaluate these representational +learning methods in two visual domains, yielding results that are consistent +with our theoretical findings. + +
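+ Of the three approaches studied, temporal contrastive learning is the easiest to sketch: treat temporally adjacent observations as positives and the rest of the batch as negatives. The InfoNCE form below is a generic illustration, not the paper's exact setup.
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def temporal_contrastive_loss(encoder, frames_t, frames_tp1, temperature=0.1):
+     """frames_t / frames_tp1: (B, ...) batches of consecutive observations."""
+     z_t = F.normalize(encoder(frames_t), dim=-1)      # (B, D)
+     z_tp1 = F.normalize(encoder(frames_tp1), dim=-1)  # (B, D)
+     logits = z_t @ z_tp1.T / temperature              # positives on the diagonal
+     labels = torch.arange(z_t.shape[0], device=z_t.device)
+     return F.cross_entropy(logits, labels)
+ ```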
+
+ comment: ICLR 2024 Spotlight Conference Paper +
+
+
+
+
+ + ☆ Practical End-to-End Optical Music Recognition for Pianoform Music + + +
+ The majority of recent progress in Optical Music Recognition (OMR) has been +achieved with Deep Learning methods, especially models following the end-to-end +paradigm, reading input images and producing a linear sequence of tokens. +Unfortunately, many music scores, especially piano music, cannot be easily +converted to a linear sequence. This has led OMR researchers to use custom +linearized encodings, instead of broadly accepted structured formats for music +notation. Their diversity makes it difficult to compare the performance of OMR +systems directly. To bring recent OMR model progress closer to useful results: +(a) We define a sequential format called Linearized MusicXML, allowing to train +an end-to-end model directly and maintaining close cohesion and compatibility +with the industry-standard MusicXML format. (b) We create a dev and test set +for benchmarking typeset OMR with MusicXML ground truth based on the OpenScore +Lieder corpus. They contain 1,438 and 1,493 pianoform systems, each with an +image from IMSLP. (c) We train and fine-tune an end-to-end model to serve as a +baseline on the dataset and employ the TEDn metric to evaluate the model. We +also test our model against the recently published synthetic pianoform dataset +GrandStaff and surpass the state-of-the-art results. + +
+
+ comment: 15+4 pages, 6 figures +
+
+
+
+
+ + ☆ HierCode: A Lightweight Hierarchical Codebook for Zero-shot Chinese Text + Recognition + + +
+ Text recognition, especially for complex scripts like Chinese, faces unique +challenges due to its intricate character structures and vast vocabulary. +Traditional one-hot encoding methods struggle with the representation of +hierarchical radicals, recognition of Out-Of-Vocabulary (OOV) characters, and +on-device deployment due to their computational intensity. To address these +challenges, we propose HierCode, a novel and lightweight codebook that exploits +the innate hierarchical nature of Chinese characters. HierCode employs a +multi-hot encoding strategy, leveraging hierarchical binary tree encoding and +prototype learning to create distinctive, informative representations for each +character. This approach not only facilitates zero-shot recognition of OOV +characters by utilizing shared radicals and structures but also excels in +line-level recognition tasks by computing similarity with visual features, a +notable advantage over existing methods. Extensive experiments across diverse +benchmarks, including handwritten, scene, document, web, and ancient text, have +showcased HierCode's superiority for both conventional and zero-shot Chinese +character or text recognition, exhibiting state-of-the-art performance with +significantly fewer parameters and fast inference speed. + +
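+ The tree-based multi-hot idea can be illustrated with a root-to-leaf path code over an array-encoded binary tree; the tree layout is an assumption, and HierCode additionally learns prototypes on top of such codes.
+ ```python
+ def path_multi_hot(leaf_index, num_nodes):
+     """Set a bit for every node on the root-to-leaf path of a character, so characters
+     sharing radicals/structures (shared path prefixes) get overlapping codes."""
+     code = [0] * num_nodes
+     node = leaf_index
+     while node >= 0:
+         code[node] = 1
+         node = (node - 1) // 2 if node > 0 else -1   # parent in an array-encoded tree
+     return code
+ ```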
+
+
+
+
+ + ☆ When Cars meet Drones: Hyperbolic Federated Learning for Source-Free + Domain Adaptation in Adverse Weather + + +
+ In Federated Learning (FL), multiple clients collaboratively train a global +model without sharing private data. In semantic segmentation, the Federated +source Free Domain Adaptation (FFreeDA) setting is of particular interest, +where clients undergo unsupervised training after supervised pretraining at the +server side. While few recent works address FL for autonomous vehicles, +intrinsic real-world challenges such as the presence of adverse weather +conditions and the existence of different autonomous agents are still +unexplored. To bridge this gap, we address both problems and introduce a new +federated semantic segmentation setting where both car and drone clients +co-exist and collaborate. Specifically, we propose a novel approach for this +setting which exploits a batch-norm weather-aware strategy to dynamically adapt +the model to the different weather conditions, while hyperbolic space +prototypes are used to align the heterogeneous client representations. Finally, +we introduce FLYAWARE, the first semantic segmentation dataset with adverse +weather data for aerial vehicles. + +
+
+
+
+
+ + ☆ Enhancing Gait Video Analysis in Neurodegenerative Diseases by Knowledge + Augmentation in Vision Language Model + + +
+ We present a knowledge augmentation strategy for assessing the diagnostic +groups and gait impairment from monocular gait videos. Based on a large-scale +pre-trained Vision Language Model (VLM), our model learns and improves visual, +textual, and numerical representations of patient gait videos, through a +collective learning across three distinct modalities: gait videos, +class-specific descriptions, and numerical gait parameters. Our specific +contributions are two-fold: First, we adopt a knowledge-aware prompt tuning +strategy to utilize the class-specific medical description in guiding the text +prompt learning. Second, we integrate the paired gait parameters in the form of +numerical texts to enhance the numeracy of the textual representation. Results +demonstrate that our model not only significantly outperforms state-of-the-art +(SOTA) in video-based classification tasks but also adeptly decodes the learned +class-specific text features into natural language descriptions using the +vocabulary of quantitative gait parameters. The code and the model will be made +available at our project page. + +
+
+
+
+
+ + ☆ Leveraging High-Resolution Features for Improved Deep Hashing-based + Image Retrieval + + +
+ Deep hashing techniques have emerged as the predominant approach for +efficient image retrieval. Traditionally, these methods utilize pre-trained +convolutional neural networks (CNNs) such as AlexNet and VGG-16 as feature +extractors. However, the increasing complexity of datasets poses challenges for +these backbone architectures in capturing meaningful features essential for +effective image retrieval. In this study, we explore the efficacy of employing +high-resolution features learned through state-of-the-art techniques for image +retrieval tasks. Specifically, we propose a novel methodology that utilizes +High-Resolution Networks (HRNets) as the backbone for the deep hashing task, +termed High-Resolution Hashing Network (HHNet). Our approach demonstrates +superior performance compared to existing methods across all tested benchmark +datasets, including CIFAR-10, NUS-WIDE, MS COCO, and ImageNet. This performance +improvement is more pronounced for complex datasets, which highlights the need +to learn high-resolution features for intricate image retrieval tasks. +Furthermore, we conduct a comprehensive analysis of different HRNet +configurations and provide insights into the optimal architecture for the deep +hashing task + +
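+ A minimal hashing head on a high-resolution backbone might look like the sketch below; the backbone (e.g. an HRNet variant from timm) and the plain tanh relaxation are assumptions, not HHNet's exact architecture or losses.
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class HashingHead(nn.Module):
+     def __init__(self, backbone, feat_dim, bits=64):
+         super().__init__()
+         self.backbone = backbone            # pooled feature extractor (placeholder)
+         self.fc = nn.Linear(feat_dim, bits)
+
+     def forward(self, x):
+         return torch.tanh(self.fc(self.backbone(x)))   # relaxed codes in (-1, 1)
+
+     @torch.no_grad()
+     def binary_code(self, x):
+         return torch.sign(self.forward(x))              # {-1, +1} retrieval code
+ ```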
+
+
+
+
+ + ☆ Be-Your-Outpainter: Mastering Video Outpainting through Input-Specific + Adaptation + + +
+ Video outpainting is a challenging task, aiming at generating video content +outside the viewport of the input video while maintaining inter-frame and +intra-frame consistency. Existing methods fall short in either generation +quality or flexibility. We introduce MOTIA (Mastering Video Outpainting Through +Input-Specific Adaptation), a diffusion-based pipeline that leverages both the +intrinsic data-specific patterns of the source video and the image/video +generative prior for effective outpainting. MOTIA comprises two main phases: +input-specific adaptation and pattern-aware outpainting. The input-specific +adaptation phase involves conducting efficient and effective pseudo outpainting +learning on the single-shot source video. This process encourages the model to +identify and learn patterns within the source video, as well as bridging the +gap between standard generative processes and outpainting. The subsequent +phase, pattern-aware outpainting, is dedicated to the generalization of these +learned patterns to generate outpainting outcomes. Additional strategies +including spatial-aware insertion and noise travel are proposed to better +leverage the diffusion model's generative prior and the acquired video patterns +from source videos. Extensive evaluations underscore MOTIA's superiority, +outperforming existing state-of-the-art methods in widely recognized +benchmarks. Notably, these advancements are achieved without necessitating +extensive, task-specific tuning. + +
+
+ comment: Code will be available at https://github.com/G-U-N/Be-Your-Outpainter +
+
+
+
+
+ + ☆ DBA-Fusion: Tightly Integrating Deep Dense Visual Bundle Adjustment with + Multiple Sensors for Large-Scale Localization and Mapping + + +
+ Visual simultaneous localization and mapping (VSLAM) has broad applications, +with state-of-the-art methods leveraging deep neural networks for better +robustness and applicability. However, there is a lack of research in fusing +these learning-based methods with multi-sensor information, which could be +indispensable to push related applications to large-scale and complex +scenarios. In this paper, we tightly integrate the trainable deep dense bundle +adjustment (DBA) with multi-sensor information through a factor graph. In the +framework, recurrent optical flow and DBA are performed among sequential +images. The Hessian information derived from DBA is fed into a generic factor +graph for multi-sensor fusion, which employs a sliding window and supports +probabilistic marginalization. A pipeline for visual-inertial integration is +firstly developed, which provides the minimum ability of metric-scale +localization and mapping. Furthermore, other sensors (e.g., global navigation +satellite system) are integrated for driftless and geo-referencing +functionality. Extensive tests are conducted on both public datasets and +self-collected datasets. The results validate the superior localization +performance of our approach, which enables real-time dense mapping in +large-scale environments. The code has been made open-source +(https://github.com/GREAT-WHU/DBA-Fusion). + +
+
+
+
+
+ + ☆ Fostc3net:A Lightweight YOLOv5 Based On the Network Structure + Optimization + + +
+ Transmission line detection technology is crucial for automatic monitoring +and ensuring the safety of electrical facilities. The YOLOv5 series is +currently one of the most advanced and widely used methods for object +detection. However, it faces inherent challenges, such as high computational +load on devices and insufficient detection accuracy. To address these concerns, +this paper presents an enhanced lightweight YOLOv5 technique customized for +mobile devices, specifically intended for identifying objects associated with +transmission lines. The C3Ghost module is integrated into the convolutional +network of YOLOv5 to reduce floating point operations (FLOPs) in the +feature channel fusion process and improve feature expression performance. In +addition, a FasterNet module is introduced to replace the C3 module in the +YOLOv5 backbone. The FasterNet module uses Partial Convolutions to process only +a portion of the input channels, improving feature extraction efficiency and +reducing computational overhead. To address the imbalance between simple and +challenging samples in the dataset and the diversity of aspect ratios of +bounding boxes, the wIoU v3 loss is adopted as the loss function. To validate +the performance of the proposed approach, experiments are conducted on a custom +dataset of transmission line poles. The results show that the proposed model +achieves a 1% increase in detection accuracy, a 13% reduction in FLOPs, and a +26% decrease in model parameters compared to the existing YOLOv5. In the +ablation experiment, it was also discovered that while the FasterNet module and +the C3Ghost module improved the precision of the original YOLOv5 baseline +model, they caused a decrease in the mAP@.5-.95 metric. However, the +wIoU v3 loss function significantly mitigated this decline in +the mAP@.5-.95 metric. + +
+
+
+
+
+ + ☆ Insight Into the Collocation of Multi-Source Satellite Imagery for + Multi-Scale Vessel Detection + + +
+ Ship detection from satellite imagery using Deep Learning (DL) is an +indispensable solution for maritime surveillance. However, applying DL models +trained on one dataset to others having differences in spatial resolution and +radiometric features requires many adjustments. To overcome this issue, this +paper focused on DL models trained on datasets that consist of different +optical images and a combination of radar and optical data. When dealing with a +limited number of training images, the performance of DL models trained via this +approach was satisfactory, improving average precision by 5-20% depending on +the optical images tested. Likewise, DL models trained on the +combined optical and radar dataset could be applied to both optical and radar +images. Our experiments showed that the models trained on an optical dataset +could be used for radar images, while those trained on a radar dataset offered +very poor scores when applied to optical images. + +
+
+ comment: 5 pages, accepted to IGARSS 2024 +
+
+
+
+
+ + ☆ MotorEase: Automated Detection of Motor Impairment Accessibility Issues + in Mobile App UIs ICSE 2024 + + +
+ Recent research has begun to examine the potential of automatically finding +and fixing accessibility issues that manifest in software. However, while +recent work makes important progress, it has generally been skewed toward +identifying issues that affect users with certain disabilities, such as those +with visual or hearing impairments. However, there are other groups of users +with different types of disabilities that also need software tooling support to +improve their experience. As such, this paper aims to automatically identify +accessibility issues that affect users with motor-impairments. + To move toward this goal, this paper introduces a novel approach, called +MotorEase, capable of identifying accessibility issues in mobile app UIs that +impact motor-impaired users. Motor-impaired users often have limited ability to +interact with touch-based devices, and instead may make use of a switch or +other assistive mechanism -- hence UIs must be designed to support both limited +touch gestures and the use of assistive devices. MotorEase adapts computer +vision and text processing techniques to enable a semantic understanding of app +UI screens, enabling the detection of violations related to four popular, +previously unexplored UI design guidelines that support motor-impaired users, +including: (i) visual touch target size, (ii) expanding sections, (iii) +persisting elements, and (iv) adjacent icon visual distance. We evaluate +MotorEase on a newly derived benchmark, called MotorCheck, that contains 555 +manually annotated examples of violations to the above accessibility +guidelines, across 1599 screens collected from 70 applications via a mobile app +testing tool. Our experiments illustrate that MotorEase is able to identify +violations with an average accuracy of ~90%, and a false positive rate of less +than 9%, outperforming baseline techniques. + +
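+ The visual touch target guideline, for instance, reduces to a simple geometric check; the 48dp threshold is a common accessibility recommendation used here as an assumption, not necessarily MotorEase's exact rule.
+ ```python
+ def undersized_touch_targets(elements, min_dp=48, density=2.0):
+     """Flag clickable UI elements whose rendered size is below min_dp x min_dp.
+     `elements` are dicts with pixel bounds (x1, y1, x2, y2) and a `clickable` flag."""
+     min_px = min_dp * density
+     violations = []
+     for el in elements:
+         if not el.get("clickable"):
+             continue
+         if (el["x2"] - el["x1"]) < min_px or (el["y2"] - el["y1"]) < min_px:
+             violations.append(el)
+     return violations
+ ```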
+
+ comment: Accepted to ICSE 2024 Research Track, 13 pages +
+
+
+
+
+ + ☆ SPTNet: An Efficient Alternative Framework for Generalized Category + Discovery with Spatial Prompt Tuning ICLR 2024 + + +
+ Generalized Category Discovery (GCD) aims to classify unlabelled images from +both `seen' and `unseen' classes by transferring knowledge from a set of +labelled `seen' class images. A key theme in existing GCD approaches is +adapting large-scale pre-trained models for the GCD task. An alternate +perspective, however, is to adapt the data representation itself for better +alignment with the pre-trained model. As such, in this paper, we introduce a +two-stage adaptation approach termed SPTNet, which iteratively optimizes model +parameters (i.e., model-finetuning) and data parameters (i.e., prompt +learning). Furthermore, we propose a novel spatial prompt tuning method (SPT) +which considers the spatial property of image data, enabling the method to +better focus on object parts, which can transfer between seen and unseen +classes. We thoroughly evaluate our SPTNet on standard benchmarks and +demonstrate that our method outperforms existing GCD methods. Notably, we find +our method achieves an average accuracy of 61.4% on the SSB, surpassing prior +state-of-the-art methods by approximately 10%. The improvement is particularly +remarkable as our method yields extra parameters amounting to only 0.117% of +those in the backbone architecture. Project page: +https://visual-ai.github.io/sptnet. + +
+
+ comment: Accepted as a conference paper at ICLR 2024; Project page: + https://visual-ai.github.io/sptnet +
+
+
+
+
+ + ☆ DVMNet: Computing Relative Pose for Unseen Objects Beyond Hypotheses CVPR 2024 + + +
+ Determining the relative pose of an object between two images is pivotal to +the success of generalizable object pose estimation. Existing approaches +typically approximate the continuous pose representation with a large number of +discrete pose hypotheses, which incurs a computationally expensive process of +scoring each hypothesis at test time. By contrast, we present a Deep Voxel +Matching Network (DVMNet) that eliminates the need for pose hypotheses and +computes the relative object pose in a single pass. To this end, we map the two +input RGB images, reference and query, to their respective voxelized 3D +representations. We then pass the resulting voxels through a pose estimation +module, where the voxels are aligned and the pose is computed in an end-to-end +fashion by solving a least-squares problem. To enhance robustness, we introduce +a weighted closest voxel algorithm capable of mitigating the impact of noisy +voxels. We conduct extensive experiments on the CO3D, LINEMOD, and Objaverse +datasets, demonstrating that our method delivers more accurate relative pose +estimates for novel objects at a lower computational cost compared to +state-of-the-art methods. Our code is released at: +https://github.com/sailor-z/DVMNet/. + +
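+ For context, the closed-form pose solve referred to above can be illustrated
+with a generic weighted least-squares (Kabsch) alignment between two
+corresponding 3D point sets. This is only a sketch of the underlying math; the
+paper's weighted closest voxel algorithm operates on learned voxel features
+and is not reproduced here.
+
+import numpy as np
+
+def weighted_rigid_align(src, dst, w):
+    """Return R, t minimising sum_i w_i * ||R @ src_i + t - dst_i||^2."""
+    w = w / w.sum()
+    mu_s = (w[:, None] * src).sum(axis=0)
+    mu_d = (w[:, None] * dst).sum(axis=0)
+    cov = (w[:, None] * (src - mu_s)).T @ (dst - mu_d)
+    U, _, Vt = np.linalg.svd(cov)
+    d = np.sign(np.linalg.det(Vt.T @ U.T))      # guard against reflections
+    R = Vt.T @ np.diag([1.0, 1.0, d]) @ U.T
+    t = mu_d - R @ mu_s
+    return R, t
+
+rng = np.random.default_rng(0)
+src = rng.normal(size=(100, 3))
+theta = 0.3
+R_true = np.array([[np.cos(theta), -np.sin(theta), 0.0],
+                   [np.sin(theta),  np.cos(theta), 0.0],
+                   [0.0, 0.0, 1.0]])
+dst = src @ R_true.T + np.array([0.1, -0.2, 0.3])
+R, t = weighted_rigid_align(src, dst, np.ones(100))
+print(np.allclose(R, R_true, atol=1e-6))  # True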
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Step-Calibrated Diffusion for Biomedical Optical Image Restoration + + +
+ High-quality, high-resolution medical imaging is essential for clinical care. +Raman-based biomedical optical imaging uses non-ionizing infrared radiation to +evaluate human tissues in real time and is used for early cancer detection, +brain tumor diagnosis, and intraoperative tissue analysis. Unfortunately, +optical imaging is vulnerable to image degradation due to laser scattering and +absorption, which can result in diagnostic errors and misguided treatment. +Restoration of optical images is a challenging computer vision task because the +sources of image degradation are multi-factorial, stochastic, and +tissue-dependent, preventing a straightforward method to obtain paired +low-quality/high-quality data. Here, we present Restorative Step-Calibrated +Diffusion (RSCD), an unpaired image restoration method that views the image +restoration problem as completing the finishing steps of a diffusion-based +image generation task. RSCD uses a step calibrator model to dynamically +determine the severity of image degradation and the number of steps required to +complete the reverse diffusion process for image restoration. RSCD outperforms +other widely used unpaired image restoration methods on both image quality and +perceptual evaluation metrics for restoring optical images. Medical imaging +experts consistently prefer images restored using RSCD in blinded comparison +experiments and report minimal to no hallucinations. Finally, we show that RSCD +improves performance on downstream clinical imaging tasks, including automated +brain tumor diagnosis and deep tissue imaging. Our code is available at +https://github.com/MLNeurosurg/restorative_step-calibrated_diffusion. + +
+
+
+
+
+ + ☆ AUD-TGN: Advancing Action Unit Detection with Temporal Convolution and + GPT-2 in Wild Audiovisual Contexts + + +
+ Leveraging the synergy of both audio data and visual data is essential for
+understanding human emotions and behaviors, especially in in-the-wild
+settings. Traditional methods for integrating such multimodal information
+often stumble, leading to less-than-ideal outcomes in the task of facial
+action unit detection. To overcome these shortcomings, we propose a novel
+approach utilizing audio-visual multimodal data. This method enhances audio
+feature extraction by leveraging Mel Frequency Cepstral Coefficients (MFCC)
+and Log-Mel spectrogram features alongside a pre-trained VGGish network.
+Moreover, this paper adaptively captures fusion features across modalities by
+modeling the temporal relationships, and utilizes a pre-trained GPT-2 model
+for sophisticated context-aware fusion of multimodal information. Our method
+notably improves the accuracy of AU detection by understanding the temporal
+and contextual nuances of the data, showcasing significant advancements in the
+comprehension of intricate scenarios. These findings underscore the potential
+of integrating temporal dynamics and contextual interpretation, paving the way
+for future research endeavors.
+
+
+
+
+
+
+ + ☆ Retina Vision Transformer (RetinaViT): Introducing Scaled Patches into + Vision Transformers + + +
+ Humans see low and high spatial frequency components at the same time, and +combine the information from both to form a visual scene. Drawing on this +neuroscientific inspiration, we propose an altered Vision Transformer +architecture where patches from scaled down versions of the input image are +added to the input of the first Transformer Encoder layer. We name this model +Retina Vision Transformer (RetinaViT) due to its inspiration from the human +visual system. Our experiments show that when trained on the ImageNet-1K +dataset with a moderate configuration, RetinaViT achieves a 3.3% performance +improvement over the original ViT. We hypothesize that this improvement can be +attributed to the inclusion of low spatial frequency components in the input, +which improves the ability to capture structural features, and to select and +forward important features to deeper layers. RetinaViT thereby opens doors to +further investigations into vertical pathways and attention patterns. + +
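+ The core input modification can be sketched as follows: patch tokens are
+extracted from several scaled-down copies of the image and concatenated before
+the first encoder layer. The patch size, scales, and plain unfold-based
+embedding below are assumptions for illustration, not RetinaViT's exact
+pipeline.
+
+import torch
+import torch.nn.functional as F
+
+def multi_scale_patches(img, patch=16, scales=(1.0, 0.5, 0.25)):
+    tokens = []
+    for s in scales:
+        # resize to a multiple of the patch size, never below one patch
+        size = max(patch, int(img.shape[-1] * s) // patch * patch)
+        x = F.interpolate(img, size=(size, size), mode="bilinear", align_corners=False)
+        # non-overlapping patch vectors: (B, N_patches, patch*patch*C)
+        p = F.unfold(x, kernel_size=patch, stride=patch).transpose(1, 2)
+        tokens.append(p)
+    return torch.cat(tokens, dim=1)
+
+img = torch.randn(2, 3, 224, 224)
+print(multi_scale_patches(img).shape)  # torch.Size([2, 254, 768])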
+
+
+
+
+ + ☆ DanceCamera3D: 3D Camera Movement Synthesis with Music and Dance CVPR 2024 + + +
+ Choreographers determine what the dances look like, while cameramen determine +the final presentation of dances. Recently, various methods and datasets have +showcased the feasibility of dance synthesis. However, camera movement +synthesis with music and dance remains an unsolved challenging problem due to +the scarcity of paired data. Thus, we present DCM, a new multi-modal 3D +dataset, which for the first time combines camera movement with dance motion +and music audio. This dataset encompasses 108 dance sequences (3.2 hours) of +paired dance-camera-music data from the anime community, covering 4 music +genres. With this dataset, we uncover that dance camera movement is +multifaceted and human-centric, and possesses multiple influencing factors, +making dance camera synthesis a more challenging task compared to camera or +dance synthesis alone. To overcome these difficulties, we propose +DanceCamera3D, a transformer-based diffusion model that incorporates a novel +body attention loss and a condition separation strategy. For evaluation, we +devise new metrics measuring camera movement quality, diversity, and dancer +fidelity. Utilizing these metrics, we conduct extensive experiments on our DCM +dataset, providing both quantitative and qualitative evidence showcasing the +effectiveness of our DanceCamera3D model. Code and video demos are available at +https://github.com/Carmenw1203/DanceCamera3D-Official. + +
+
+ comment: Accepted to CVPR 2024
+
+
+
+
+ + ☆ T-Pixel2Mesh: Combining Global and Local Transformer for 3D Mesh + Generation from a Single Image ICASSP 2024 + + +
+ Pixel2Mesh (P2M) is a classical approach for reconstructing 3D shapes from a +single color image through coarse-to-fine mesh deformation. Although P2M is +capable of generating plausible global shapes, its Graph Convolution Network +(GCN) often produces overly smooth results, causing the loss of fine-grained +geometry details. Moreover, P2M generates non-credible features for occluded +regions and struggles with the domain gap from synthetic data to real-world +images, which is a common challenge for single-view 3D reconstruction methods. +To address these challenges, we propose a novel Transformer-boosted +architecture, named T-Pixel2Mesh, inspired by the coarse-to-fine approach of +P2M. Specifically, we use a global Transformer to control the holistic shape +and a local Transformer to progressively refine the local geometry details with +graph-based point upsampling. To enhance real-world reconstruction, we present +the simple yet effective Linear Scale Search (LSS), which serves as prompt +tuning during the input preprocessing. Our experiments on ShapeNet demonstrate +state-of-the-art performance, while results on real-world data show the +generalization capability. + +
+
+ comment: Accepted by ICASSP 2024
+
+
+
+
+ + ☆ ProMamba: Prompt-Mamba for polyp segmentation + + +
+ Detecting polyps through colonoscopy is an important task in medical image +segmentation, which provides significant assistance and reference value for +clinical surgery. However, accurate segmentation of polyps is a challenging +task due to two main reasons. Firstly, polyps exhibit various shapes and +colors. Secondly, the boundaries between polyps and their normal surroundings +are often unclear. Additionally, significant differences between different +datasets lead to limited generalization capabilities of existing methods. To +address these issues, we propose a segmentation model based on Prompt-Mamba, +which incorporates the latest Vision-Mamba and prompt technologies. Compared to +previous models trained on the same dataset, our model not only maintains high +segmentation accuracy on the validation part of the same dataset but also +demonstrates superior accuracy on unseen datasets, exhibiting excellent +generalization capabilities. Notably, we are the first to apply the +Vision-Mamba architecture to polyp segmentation and the first to utilize prompt +technology in a polyp segmentation model. Our model efficiently accomplishes +segmentation tasks, surpassing previous state-of-the-art methods by an average +of 5% across six datasets. Furthermore, we have developed multiple versions of +our model with scaled parameter counts, achieving better performance than +previous models even with fewer parameters. Our code and trained weights will +be released soon. + +
+
+ comment: 10 pages, 2 figures, 3 tables
+
+
+
+
+ + ☆ Recursive Cross-Modal Attention for Multimodal Fusion in Dimensional + Emotion Recognition + + +
+ Multi-modal emotion recognition has recently gained a lot of attention since
+it can leverage diverse and complementary relationships over multiple
+modalities, such as audio, visual, and text. Most state-of-the-art methods for
+multimodal fusion rely on recurrent networks or conventional attention
+mechanisms that do not effectively leverage the complementary nature of the
+modalities. In this paper, we focus on dimensional emotion recognition based
+on the fusion of facial, vocal, and text modalities extracted from videos.
+Specifically, we propose a recursive cross-modal attention (RCMA) to
+effectively capture the complementary relationships across the modalities in a
+recursive fashion. The proposed model is able to effectively capture the
+inter-modal relationships by computing the cross-attention weights across the
+individual modalities and the joint representation of the other two
+modalities. To further improve the inter-modal relationships, the obtained
+attended features of the individual modalities are again fed as input to the
+cross-modal attention to refine the feature representations of the individual
+modalities. In addition to that, we have used Temporal Convolution Networks
+(TCNs) to model the temporal dynamics (intra-modal relationships) of the
+individual modalities. By deploying the TCNs as well as cross-modal attention
+in a recursive fashion, we are able to effectively capture both intra- and
+inter-modal relationships across the audio, visual, and text modalities.
+Experimental results on validation-set videos from the AffWild2 dataset
+indicate that our proposed fusion model is able to achieve significant
+improvement over the baseline for the sixth challenge of Affective Behavior
+Analysis in-the-Wild 2024 (ABAW6) competition.
+
+
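+ A toy sketch of cross-modal attention applied recursively between two
+modality streams, to make the recursion concrete. The paper fuses three
+modalities together with a joint representation; the two-stream setup,
+dimensions, and number of recursion steps below are illustrative assumptions.
+
+import torch
+import torch.nn as nn
+
+class RecursiveCrossAttention(nn.Module):
+    def __init__(self, dim=128, heads=4, steps=2):
+        super().__init__()
+        self.steps = steps
+        self.a2v = nn.MultiheadAttention(dim, heads, batch_first=True)
+        self.v2a = nn.MultiheadAttention(dim, heads, batch_first=True)
+
+    def forward(self, audio, visual):
+        for _ in range(self.steps):
+            # each modality attends to the other; the refined features are
+            # fed back in at the next recursion step
+            audio_ref, _ = self.a2v(audio, visual, visual)
+            visual_ref, _ = self.v2a(visual, audio, audio)
+            audio, visual = audio + audio_ref, visual + visual_ref
+        return audio, visual
+
+a, v = torch.randn(8, 50, 128), torch.randn(8, 50, 128)
+audio_out, visual_out = RecursiveCrossAttention()(a, v)
+print(audio_out.shape, visual_out.shape)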
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2209.09068; + text overlap with arXiv:2203.14779 by other authors +
+
+
+
+
+ + ☆ Multimodal Variational Autoencoder for Low-cost Cardiac Hemodynamics + Instability Detection + + +
+ Recent advancements in non-invasive detection of cardiac hemodynamic +instability (CHDI) primarily focus on applying machine learning techniques to a +single data modality, e.g. cardiac magnetic resonance imaging (MRI). Despite +their potential, these approaches often fall short especially when the size of +labeled patient data is limited, a common challenge in the medical domain. +Furthermore, only a few studies have explored multimodal methods to study CHDI, +which mostly rely on costly modalities such as cardiac MRI and echocardiogram. +In response to these limitations, we propose a novel multimodal variational +autoencoder ($\text{CardioVAE}_\text{X,G}$) to integrate low-cost chest X-ray +(CXR) and electrocardiogram (ECG) modalities with pre-training on a large +unlabeled dataset. Specifically, $\text{CardioVAE}_\text{X,G}$ introduces a +novel tri-stream pre-training strategy to learn both shared and +modality-specific features, thus enabling fine-tuning with both unimodal and +multimodal datasets. We pre-train $\text{CardioVAE}_\text{X,G}$ on a large, +unlabeled dataset of $50,982$ subjects from a subset of MIMIC database and then +fine-tune the pre-trained model on a labeled dataset of $795$ subjects from the +ASPIRE registry. Comprehensive evaluations against existing methods show that +$\text{CardioVAE}_\text{X,G}$ offers promising performance (AUROC $=0.79$ and +Accuracy $=0.77$), representing a significant step forward in non-invasive +prediction of CHDI. Our model also excels in producing fine interpretations of +predictions directly associated with clinical features, thereby supporting +clinical decision-making. + +
+
+
+
+
+ + ☆ Learning User Embeddings from Human Gaze for Personalised Saliency + Prediction + + +
+ Reusable embeddings of user behaviour have shown significant performance +improvements for the personalised saliency prediction task. However, prior +works require explicit user characteristics and preferences as input, which are +often difficult to obtain. We present a novel method to extract user embeddings +from pairs of natural images and corresponding saliency maps generated from a +small amount of user-specific eye tracking data. At the core of our method is a +Siamese convolutional neural encoder that learns the user embeddings by +contrasting the image and personal saliency map pairs of different users. +Evaluations on two public saliency datasets show that the generated embeddings +have high discriminative power, are effective at refining universal saliency +maps to the individual users, and generalise well across users and images. +Finally, based on our model's ability to encode individual user +characteristics, our work points towards other applications that can benefit +from reusable embeddings of gaze behaviour. + +
+
+
+
+
+ + ☆ ZoDi: Zero-Shot Domain Adaptation with Diffusion-Based Image Transfer + + +
+ Deep learning models achieve high accuracy in segmentation tasks among
+others, yet domain shift often degrades the models' performance, which can be
+critical in real-world scenarios where no target images are available. This
+paper proposes a zero-shot domain adaptation method based on diffusion models,
+called ZoDi, which is two-fold by design: zero-shot image transfer and model
+adaptation. First, we utilize an off-the-shelf diffusion model to synthesize
+target-like images by transferring the domain of source images to the target
+domain. In this, we specifically try to maintain the layout and content by
+utilising layout-to-image diffusion models with stochastic inversion.
+Secondly, we train the model using both source images and synthesized images
+with the original segmentation maps while maximizing the feature similarity of
+images from the two domains to learn domain-robust representations. Through
+experiments, we show the benefits of ZoDi in the task of image segmentation
+over state-of-the-art methods. It is also more applicable than existing
+CLIP-based methods because it assumes no specific backbone or models, and it
+enables estimating the model's performance without target images by inspecting
+generated images. Our implementation will be publicly available.
+
+
+
+
+
+
+ + ☆ Meta-Point Learning and Refining for Category-Agnostic Pose Estimation CVPR 2024 + + +
+ Category-agnostic pose estimation (CAPE) aims to predict keypoints for
+arbitrary classes given a few support images annotated with keypoints.
+Existing methods only rely on the features extracted at support keypoints to
+predict or refine the keypoints on the query image, but a few support feature
+vectors are local and inadequate for CAPE. Considering that humans can quickly
+perceive potential keypoints of arbitrary objects, we propose a novel
+framework for CAPE based on such potential keypoints (named meta-points).
+Specifically, we maintain learnable embeddings to capture inherent information
+of various keypoints, which interact with image feature maps to produce
+meta-points without any support. The produced meta-points could serve as
+meaningful potential keypoints for CAPE. Due to the inevitable gap between
+inherency and annotation, we finally utilize the identities and details
+offered by support keypoints to assign and refine meta-points to desired
+keypoints in the query image. In addition, we propose a progressive deformable
+point decoder and a slacked regression loss for better prediction and
+supervision. Our novel framework not only reveals the inherency of keypoints
+but also outperforms existing methods of CAPE. Comprehensive experiments and
+in-depth studies on the large-scale MP-100 dataset demonstrate the
+effectiveness of our framework.
+
+
+
+ comment: Published in CVPR 2024 +
+
+
+
+
+ + ☆ H-vmunet: High-order Vision Mamba UNet for Medical Image Segmentation + + +
+ In the field of medical image segmentation, variant models based on
+Convolutional Neural Networks (CNNs) and Visual Transformers (ViTs) as the
+base modules have been very widely developed and applied. However, CNNs are
+often limited in their ability to deal with long sequences of information,
+while the low sensitivity of ViTs to local feature information and their
+quadratic computational complexity limit their development. Recently, the
+emergence of state-space models (SSMs), especially 2D-selective-scan (SS2D),
+has had an impact on the longtime dominance of traditional CNNs and ViTs as
+the foundational modules of visual neural networks. In this paper, we extend
+the adaptability of SS2D by proposing a High-order Vision Mamba UNet
+(H-vmunet) for medical image segmentation. Among them, the proposed High-order
+2D-selective-scan (H-SS2D) progressively reduces the introduction of redundant
+information during SS2D operations through higher-order interactions. In
+addition, the proposed Local-SS2D module improves the learning ability of
+local features of SS2D at each order of interaction. We conducted comparison
+and ablation experiments on three publicly available medical image datasets
+(ISIC2017, Spleen, and CVC-ClinicDB), and the results all demonstrate the
+strong competitiveness of H-vmunet in medical image segmentation tasks. The
+code is available from https://github.com/wurenkai/H-vmunet.
+
+
+
+
+
+
+ + ☆ VL-Mamba: Exploring State Space Models for Multimodal Learning + + +
+ Multimodal large language models (MLLMs) have attracted widespread interest +and have rich applications. However, the inherent attention mechanism in its +Transformer structure requires quadratic complexity and results in expensive +computational overhead. Therefore, in this work, we propose VL-Mamba, a +multimodal large language model based on state space models, which have been +shown to have great potential for long-sequence modeling with fast inference +and linear scaling in sequence length. Specifically, we first replace the +transformer-based backbone language model such as LLama or Vicuna with the +pre-trained Mamba language model. Then, we empirically explore how to +effectively apply the 2D vision selective scan mechanism for multimodal +learning and the combinations of different vision encoders and variants of +pretrained Mamba language models. The extensive experiments on diverse +multimodal benchmarks with competitive performance show the effectiveness of +our proposed VL-Mamba and demonstrate the great potential of applying state +space models for multimodal learning tasks. + +
+
+
+
+
+ + ☆ ReGround: Improving Textual and Spatial Grounding at No Cost + + +
+ When an image generation process is guided by both a text prompt and spatial +cues, such as a set of bounding boxes, do these elements work in harmony, or +does one dominate the other? Our analysis of a pretrained image diffusion model +that integrates gated self-attention into the U-Net reveals that spatial +grounding often outweighs textual grounding due to the sequential flow from +gated self-attention to cross-attention. We demonstrate that such bias can be +significantly mitigated without sacrificing accuracy in either grounding by +simply rewiring the network architecture, changing from sequential to parallel +for gated self-attention and cross-attention. This surprisingly simple yet +effective solution does not require any fine-tuning of the network but +significantly reduces the trade-off between the two groundings. Our experiments +demonstrate significant improvements from the original GLIGEN to the rewired +version in the trade-off between textual grounding and spatial grounding. + +
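+ The rewiring can be illustrated schematically: in the sequential wiring,
+cross-attention to text reads features that have already passed through gated
+self-attention, whereas in the parallel wiring both branches read the same
+input and their outputs are summed. The attention modules below are generic
+placeholders, not GLIGEN's actual layers.
+
+import torch
+import torch.nn as nn
+
+dim = 64
+gated_self_attn = nn.MultiheadAttention(dim, 4, batch_first=True)
+cross_attn = nn.MultiheadAttention(dim, 4, batch_first=True)
+
+def sequential_block(x, grounding_tokens, text_tokens):
+    # spatial grounding first; text cross-attention then sees grounded features
+    h = torch.cat([x, grounding_tokens], dim=1)
+    x = x + gated_self_attn(h, h, h)[0][:, : x.shape[1]]
+    return x + cross_attn(x, text_tokens, text_tokens)[0]
+
+def parallel_block(x, grounding_tokens, text_tokens):
+    # both branches read the same input; neither grounding dominates by position
+    h = torch.cat([x, grounding_tokens], dim=1)
+    spatial = gated_self_attn(h, h, h)[0][:, : x.shape[1]]
+    textual = cross_attn(x, text_tokens, text_tokens)[0]
+    return x + spatial + textual
+
+x, g, t = torch.randn(2, 16, dim), torch.randn(2, 4, dim), torch.randn(2, 8, dim)
+print(sequential_block(x, g, t).shape, parallel_block(x, g, t).shape)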
+
+ comment: Project page: https://re-ground.github.io/ +
+
+
+
+
+ + ☆ Leveraging feature communication in federated learning for remote + sensing image classification + + +
+ In the realm of Federated Learning (FL) applied to remote sensing image +classification, this study introduces and assesses several innovative +communication strategies. Our exploration includes feature-centric +communication, pseudo-weight amalgamation, and a combined method utilizing both +weights and features. Experiments conducted on two public scene classification +datasets unveil the effectiveness of these strategies, showcasing accelerated +convergence, heightened privacy, and reduced network information exchange. This +research provides valuable insights into the implications of feature-centric +communication in FL, offering potential applications tailored for remote +sensing scenarios. + +
+
+ comment: 5 pages, to appear in IGARSS 2024 +
+
+
+
+
+ + ☆ Portrait4D-v2: Pseudo Multi-View Data Creates Better 4D Head Synthesizer + + +
+ In this paper, we propose a novel learning approach for feed-forward one-shot +4D head avatar synthesis. Different from existing methods that often learn from +reconstructing monocular videos guided by 3DMM, we employ pseudo multi-view +videos to learn a 4D head synthesizer in a data-driven manner, avoiding +reliance on inaccurate 3DMM reconstruction that could be detrimental to the +synthesis performance. The key idea is to first learn a 3D head synthesizer +using synthetic multi-view images to convert monocular real videos into +multi-view ones, and then utilize the pseudo multi-view videos to learn a 4D +head synthesizer via cross-view self-reenactment. By leveraging a simple vision +transformer backbone with motion-aware cross-attentions, our method exhibits +superior performance compared to previous methods in terms of reconstruction +fidelity, geometry consistency, and motion control accuracy. We hope our method +offers novel insights into integrating 3D priors with 2D supervisions for +improved 4D head avatar creation. + +
+
+ comment: Project page: https://yudeng.github.io/Portrait4D-v2/ +
+
+
+
+
+ + ☆ Find n' Propagate: Open-Vocabulary 3D Object Detection in Urban + Environments + + +
+ In this work, we tackle the limitations of current LiDAR-based 3D object +detection systems, which are hindered by a restricted class vocabulary and the +high costs associated with annotating new object classes. Our exploration of +open-vocabulary (OV) learning in urban environments aims to capture novel +instances using pre-trained vision-language models (VLMs) with multi-sensor +data. We design and benchmark a set of four potential solutions as baselines, +categorizing them into either top-down or bottom-up approaches based on their +input data strategies. While effective, these methods exhibit certain +limitations, such as missing novel objects in 3D box estimation or applying +rigorous priors, leading to biases towards objects near the camera or of +rectangular geometries. To overcome these limitations, we introduce a universal +\textsc{Find n' Propagate} approach for 3D OV tasks, aimed at maximizing the +recall of novel objects and propagating this detection capability to more +distant areas thereby progressively capturing more. In particular, we utilize a +greedy box seeker to search against 3D novel boxes of varying orientations and +depth in each generated frustum and ensure the reliability of newly identified +boxes by cross alignment and density ranker. Additionally, the inherent bias +towards camera-proximal objects is alleviated by the proposed remote simulator, +which randomly diversifies pseudo-labeled novel instances in the self-training +process, combined with the fusion of base samples in the memory bank. Extensive +experiments demonstrate a 53% improvement in novel recall across diverse OV +settings, VLMs, and 3D detectors. Notably, we achieve up to a 3.97-fold +increase in Average Precision (AP) for novel object classes. The source code is +made available in the supplementary material. + +
+
+
+
+
+ + ☆ Ground-A-Score: Scaling Up the Score Distillation for Multi-Attribute + Editing + + +
+ Despite recent advancements in text-to-image diffusion models facilitating +various image editing techniques, complex text prompts often lead to an +oversight of some requests due to a bottleneck in processing text information. +To tackle this challenge, we present Ground-A-Score, a simple yet powerful +model-agnostic image editing method by incorporating grounding during score +distillation. This approach ensures a precise reflection of intricate prompt +requirements in the editing outcomes, taking into account the prior knowledge +of the object locations within the image. Moreover, the selective application +with a new penalty coefficient and contrastive loss helps to precisely target +editing areas while preserving the integrity of the objects in the source +image. Both qualitative assessments and quantitative analyses confirm that +Ground-A-Score successfully adheres to the intricate details of extended and +multifaceted prompts, ensuring high-quality outcomes that respect the original +image attributes. + +
+
+
+
+
+ + ☆ Diversity-aware Channel Pruning for StyleGAN Compression CVPR 2024 + + +
+ StyleGAN has shown remarkable performance in unconditional image generation. +However, its high computational cost poses a significant challenge for +practical applications. Although recent efforts have been made to compress +StyleGAN while preserving its performance, existing compressed models still lag +behind the original model, particularly in terms of sample diversity. To +overcome this, we propose a novel channel pruning method that leverages varying +sensitivities of channels to latent vectors, which is a key factor in sample +diversity. Specifically, by assessing channel importance based on their +sensitivities to latent vector perturbations, our method enhances the diversity +of samples in the compressed model. Since our method solely focuses on the +channel pruning stage, it has complementary benefits with prior training +schemes without additional training cost. Extensive experiments demonstrate +that our method significantly enhances sample diversity across various +datasets. Moreover, in terms of FID scores, our method not only surpasses +state-of-the-art by a large margin but also achieves comparable scores with +only half training iterations. + +
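+ A simplified sketch of the sensitivity signal described above: measure how
+much each output channel changes when the latent vector is perturbed, and keep
+the most sensitive channels. The toy generator, perturbation scale, and keep
+ratio are illustrative assumptions, not the paper's StyleGAN setup.
+
+import torch
+import torch.nn as nn
+
+generator = nn.Sequential(nn.Linear(64, 128), nn.ReLU(), nn.Linear(128, 32))  # toy stand-in
+
+@torch.no_grad()
+def channel_sensitivity(gen, latent_dim=64, n_samples=256, eps=0.1):
+    z = torch.randn(n_samples, latent_dim)
+    base = gen(z)
+    perturbed = gen(z + eps * torch.randn_like(z))
+    # mean absolute response of each output channel to latent perturbations
+    return (perturbed - base).abs().mean(dim=0)
+
+scores = channel_sensitivity(generator)
+keep = scores.argsort(descending=True)[:16]  # keep the most latent-sensitive channels
+print(scores.shape, keep.tolist())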
+
+ comment: Accepted to CVPR 2024. Project page: + https://jiwoogit.github.io/DCP-GAN_site +
+
+
+
+
+ + ☆ Next day fire prediction via semantic segmentation ACL + + +
+ In this paper, we present a deep learning pipeline for next day fire
+prediction. The next day fire prediction task consists in learning models that
+receive as input the available information for an area up until a certain day,
+in order to predict the occurrence of fire for the next day. Starting from our
+previous problem formulation as a binary classification task on instances
+(daily snapshots of each area) represented by tabular feature vectors, we
+reformulate the problem as a semantic segmentation task on images; there, each
+pixel corresponds to a daily snapshot of an area, while its channels represent
+the formerly tabular training features. We demonstrate that this problem
+formulation, built within a thorough pipeline, achieves state-of-the-art
+results.
+
+
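+ The reformulation can be made concrete with a small sketch: daily tabular
+feature vectors for a grid of areas are reshaped into an image whose channels
+are the former tabular features, with a per-pixel fire/no-fire mask as the
+segmentation target. The grid size and feature count are made up for
+illustration.
+
+import numpy as np
+
+n_rows, n_cols, n_features = 32, 48, 10
+# one day's snapshot: a tabular feature vector per area (row-major over the grid)
+tabular = np.random.rand(n_rows * n_cols, n_features)
+fire_next_day = np.random.randint(0, 2, size=n_rows * n_cols)
+
+image = tabular.reshape(n_rows, n_cols, n_features).transpose(2, 0, 1)  # (C, H, W)
+mask = fire_next_day.reshape(n_rows, n_cols)                            # (H, W) target
+print(image.shape, mask.shape)  # (10, 32, 48) (32, 48)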
+
+ comment: Accepted in MACLEAN@ECML/PKDD 2023 +
+
+
+
+
+ + ☆ What explains the success of cross-modal fine-tuning with ORCA? + + +
+ ORCA (Shen et al., 2023) is a recent technique for cross-modal fine-tuning, +i.e., applying pre-trained transformer models to modalities beyond their +training data. The technique consists primarily of training an embedder and +fine-tuning the embedder and model. Despite its high performance on a variety +of downstream tasks, we do not understand precisely how each of these +components contribute to ORCA's success. Therefore, we run a series of +ablations and find that embedder training does not help 2D tasks at all, +contrary to what the original paper posits. In 1D tasks, some amount of +embedder training is necessary but more is not better. In 4 out of 6 datasets +we experiment with, it is model fine-tuning that makes the biggest difference. +Through our ablations and baselines, we contribute a better understanding of +the individual components of ORCA. + +
+
+
+
+
+ + ☆ IDAdapter: Learning Mixed Features for Tuning-Free Personalization of + Text-to-Image Models + + +
+ Leveraging Stable Diffusion for the generation of personalized portraits has +emerged as a powerful and noteworthy tool, enabling users to create +high-fidelity, custom character avatars based on their specific prompts. +However, existing personalization methods face challenges, including test-time +fine-tuning, the requirement of multiple input images, low preservation of +identity, and limited diversity in generated outcomes. To overcome these +challenges, we introduce IDAdapter, a tuning-free approach that enhances the +diversity and identity preservation in personalized image generation from a +single face image. IDAdapter integrates a personalized concept into the +generation process through a combination of textual and visual injections and a +face identity loss. During the training phase, we incorporate mixed features +from multiple reference images of a specific identity to enrich +identity-related content details, guiding the model to generate images with +more diverse styles, expressions, and angles compared to previous works. +Extensive evaluations demonstrate the effectiveness of our method, achieving +both diversity and identity fidelity in generated images. + +
+
+ comment: 14 pages, 15 figures +
+
+
+
+
+ + ☆ Compress3D: a Compressed Latent Space for 3D Generation from a Single + Image + + +
+ 3D generation has witnessed significant advancements, yet efficiently +producing high-quality 3D assets from a single image remains challenging. In +this paper, we present a triplane autoencoder, which encodes 3D models into a +compact triplane latent space to effectively compress both the 3D geometry and +texture information. Within the autoencoder framework, we introduce a 3D-aware +cross-attention mechanism, which utilizes low-resolution latent representations +to query features from a high-resolution 3D feature volume, thereby enhancing +the representation capacity of the latent space. Subsequently, we train a +diffusion model on this refined latent space. In contrast to solely relying on +image embedding for 3D generation, our proposed method advocates for the +simultaneous utilization of both image embedding and shape embedding as +conditions. Specifically, the shape embedding is estimated via a diffusion +prior model conditioned on the image embedding. Through comprehensive +experiments, we demonstrate that our method outperforms state-of-the-art +algorithms, achieving superior performance while requiring less training data +and time. Our approach enables the generation of high-quality 3D assets in +merely 7 seconds on a single A100 GPU. + +
+
+
+
+
+ + ☆ REAL: Representation Enhanced Analytic Learning for Exemplar-free + Class-incremental Learning + + +
+ Exemplar-free class-incremental learning (EFCIL) aims to mitigate
+catastrophic forgetting in class-incremental learning without available
+historical data. Compared with its counterpart (replay-based CIL) that stores
+historical samples, EFCIL suffers more from forgetting issues under the
+exemplar-free constraint. In this paper, inspired by the recently developed
+analytic learning (AL) based CIL, we propose a representation enhanced
+analytic learning (REAL) for EFCIL. REAL constructs a dual-stream base
+pretraining (DS-BPT) and a representation enhancing distillation (RED) process
+to enhance the representation of the extractor. The DS-BPT pretrains the model
+in streams of both supervised learning and self-supervised contrastive
+learning (SSCL) for base knowledge extraction. The RED process distills the
+supervised knowledge to the SSCL-pretrained backbone and facilitates a
+subsequent AL-based CIL that converts the CIL to a recursive least-squares
+problem. Our method addresses the issue of insufficient discriminability in
+representations of unseen data caused by a frozen backbone in the existing
+AL-based CIL. Empirical results on various datasets, including CIFAR-100,
+ImageNet-100 and ImageNet-1k, demonstrate that our REAL outperforms
+state-of-the-art methods in EFCIL, and achieves comparable or even superior
+performance compared with the replay-based methods.
+
+
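+ For background, the analytic-learning component the abstract builds on
+amounts to solving the classifier head in closed form over frozen features, as
+in ridge regression; a minimal sketch is below. The recursive,
+class-incremental update and the distillation steps of REAL are not
+reproduced.
+
+import numpy as np
+
+def analytic_head(features, one_hot_labels, reg=1e-1):
+    """Closed-form ridge regression: W = (X^T X + reg * I)^-1 X^T Y."""
+    d = features.shape[1]
+    gram = features.T @ features + reg * np.eye(d)
+    return np.linalg.solve(gram, features.T @ one_hot_labels)
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(500, 64))        # features from a frozen backbone
+y = rng.integers(0, 10, size=500)
+W = analytic_head(X, np.eye(10)[y])   # one weight column per class
+print(W.shape, ((X @ W).argmax(1) == y).mean())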
+
+
+
+
+ + ☆ Motion Generation from Fine-grained Textual Descriptions + + +
+ The task of text2motion is to generate motion sequences from given textual +descriptions, where a model should explore the interactions between natural +language instructions and human body movements. While most existing works are +confined to coarse-grained motion descriptions (e.g., "A man squats."), +fine-grained ones specifying movements of relevant body parts are barely +explored. Models trained with coarse texts may not be able to learn mappings +from fine-grained motion-related words to motion primitives, resulting in the +failure in generating motions from unseen descriptions. In this paper, we build +a large-scale language-motion dataset with fine-grained textual descriptions, +FineHumanML3D, by feeding GPT-3.5-turbo with delicate prompts. Accordingly, we +design a new text2motion model, FineMotionDiffuse, which makes full use of +fine-grained textual information. Our experiments show that FineMotionDiffuse +trained on FineHumanML3D acquires good results in quantitative evaluation. We +also find this model can better generate spatially/chronologically composite +motions by learning the implicit mappings from simple descriptions to the +corresponding basic motions. + +
+
+
+
+
+ + ☆ What if...?: Counterfactual Inception to Mitigate Hallucination Effects + in Large Multimodal Models + + +
+ This paper presents a way of enhancing the reliability of Large Multimodal +Models (LMMs) in addressing hallucination effects, where models generate +incorrect or unrelated responses. Without additional instruction tuning +paradigm, we introduce Counterfactual Inception, a novel method that implants +counterfactual thoughts into LMMs using carefully chosen, misaligned +counterfactual keywords. This method is grounded in the concept of +counterfactual thinking, a cognitive process where humans consider alternative +realities and outcomes. By applying this human-like reasoning mechanism to +LMMs, we aim to reduce hallucination effects and improve the models' +trustworthiness. We also propose Dual-modality Verification Process (DVP), a +rigorous framework for selecting optimal counterfactual keywords to trigger +counterfactual thinking into LMMs, concurrently considering visual and +linguistic context. Our extensive experiments across various LMMs, including +both open-source and proprietary models, corroborate that our method +significantly mitigates hallucination phenomena across different datasets. + +
+
+ comment: under review, code available: + https://github.com/IVY-LVLM/Counterfactual-Inception +
+
+
+
+
+ + ☆ Scale Decoupled Distillation CVPR2024 + + +
+ Logit knowledge distillation attracts increasing attention due to its +practicality in recent studies. However, it often suffers inferior performance +compared to the feature knowledge distillation. In this paper, we argue that +existing logit-based methods may be sub-optimal since they only leverage the +global logit output that couples multiple semantic knowledge. This may transfer +ambiguous knowledge to the student and mislead its learning. To this end, we +propose a simple but effective method, i.e., Scale Decoupled Distillation +(SDD), for logit knowledge distillation. SDD decouples the global logit output +into multiple local logit outputs and establishes distillation pipelines for +them. This helps the student to mine and inherit fine-grained and unambiguous +logit knowledge. Moreover, the decoupled knowledge can be further divided into +consistent and complementary logit knowledge that transfers the semantic +information and sample ambiguity, respectively. By increasing the weight of +complementary parts, SDD can guide the student to focus more on ambiguous +samples, improving its discrimination ability. Extensive experiments on several +benchmark datasets demonstrate the effectiveness of SDD for wide +teacher-student pairs, especially in the fine-grained classification task. Code +is available at: https://github.com/shicaiwei123/SDD-CVPR2024 + +
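+ A rough sketch of the decoupling idea: pool the final feature map over
+sub-regions, obtain one logit vector per region in addition to the global one,
+and distill each of them. The pooling grid, temperature, and layer sizes are
+illustrative assumptions, not SDD's exact formulation.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+def local_logits(feat, head, grid=2):
+    # feat: (B, C, H, W) -> one logit vector per grid cell plus a global one
+    cells = F.adaptive_avg_pool2d(feat, grid).flatten(2).transpose(1, 2)
+    global_vec = feat.mean(dim=(2, 3)).unsqueeze(1)
+    return head(torch.cat([global_vec, cells], dim=1))  # (B, 1 + grid*grid, classes)
+
+def decoupled_kd_loss(s_feat, t_feat, s_head, t_head, T=4.0):
+    s = local_logits(s_feat, s_head)
+    t = local_logits(t_feat, t_head).detach()
+    return F.kl_div(F.log_softmax(s / T, dim=-1),
+                    F.softmax(t / T, dim=-1), reduction="batchmean") * T * T
+
+s_head, t_head = nn.Linear(64, 100), nn.Linear(128, 100)
+loss = decoupled_kd_loss(torch.randn(4, 64, 8, 8), torch.randn(4, 128, 8, 8),
+                         s_head, t_head)
+print(loss.item())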
+
+ comment: Accepted to CVPR 2024; 10 pages, 6 figures
+
+
+
+
+ + ☆ High-confidence pseudo-labels for domain adaptation in COVID-19 + detection + + +
+ This paper outlines our submission for the 4th COV19D competition as part of
+the `Domain adaptation, Explainability, Fairness in AI for Medical Image
+Analysis' (DEF-AI-MIA) workshop at the Computer Vision and Pattern Recognition
+Conference (CVPR). The competition consists of two challenges. The first is to
+train a classifier to detect the presence of COVID-19 from over one thousand
+CT scans from the COV19-CT-DB database. The second challenge is to perform
+domain adaptation by taking the dataset from Challenge 1 and adding a small
+number of scans (some annotated and others not) from a different distribution.
+We preprocessed the CT scans to segment the lungs, and output volumes with the
+lungs individually and together. We then trained 3D ResNet and Swin
+Transformer models on these inputs. We annotated the unlabeled CT scans using
+an ensemble of these models and chose the high-confidence predictions as
+pseudo-labels for fine-tuning. This resulted in a best cross-validation mean
+F1 score of 93.39\% for Challenge 1 and a mean F1 score of 92.15\% for
+Challenge 2.
+
+
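+ The pseudo-labelling step can be sketched as follows: average the ensemble's
+softmax outputs, keep only scans whose top-class probability exceeds a
+threshold, and use those predictions as labels for fine-tuning. The threshold
+and array shapes are illustrative assumptions.
+
+import numpy as np
+
+def select_pseudo_labels(prob_per_model, threshold=0.9):
+    """prob_per_model: (n_models, n_scans, n_classes) softmax outputs."""
+    mean_prob = prob_per_model.mean(axis=0)
+    confidence = mean_prob.max(axis=1)
+    labels = mean_prob.argmax(axis=1)
+    keep = confidence >= threshold
+    return np.nonzero(keep)[0], labels[keep]
+
+rng = np.random.default_rng(0)
+probs = rng.dirichlet([0.3, 0.3], size=(5, 100))  # 5 models, 100 scans, 2 classes
+scan_ids, pseudo = select_pseudo_labels(probs)
+print(len(scan_ids), pseudo[:10])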
+
+
+
+
+ + ☆ FMM-Attack: A Flow-based Multi-modal Adversarial Attack on Video-based + LLMs + + +
+ Despite the remarkable performance of video-based large language models +(LLMs), their adversarial threat remains unexplored. To fill this gap, we +propose the first adversarial attack tailored for video-based LLMs by crafting +flow-based multi-modal adversarial perturbations on a small fraction of frames +within a video, dubbed FMM-Attack. Extensive experiments show that our attack +can effectively induce video-based LLMs to generate incorrect answers when +videos are added with imperceptible adversarial perturbations. Intriguingly, +our FMM-Attack can also induce garbling in the model output, prompting +video-based LLMs to hallucinate. Overall, our observations inspire a further +understanding of multi-modal robustness and safety-related feature alignment +across different modalities, which is of great importance for various large +multi-modal models. Our code is available at +https://github.com/THU-Kingmin/FMM-Attack. + +
+
+
+
+
+ + ☆ VSTAR: Generative Temporal Nursing for Longer Dynamic Video Synthesis + + +
+ Despite tremendous progress in the field of text-to-video (T2V) synthesis, +open-sourced T2V diffusion models struggle to generate longer videos with +dynamically varying and evolving content. They tend to synthesize quasi-static +videos, ignoring the necessary visual change-over-time implied in the text +prompt. At the same time, scaling these models to enable longer, more dynamic +video synthesis often remains computationally intractable. To address this +challenge, we introduce the concept of Generative Temporal Nursing (GTN), where +we aim to alter the generative process on the fly during inference to improve +control over the temporal dynamics and enable generation of longer videos. We +propose a method for GTN, dubbed VSTAR, which consists of two key ingredients: +1) Video Synopsis Prompting (VSP) - automatic generation of a video synopsis +based on the original single prompt leveraging LLMs, which gives accurate +textual guidance to different visual states of longer videos, and 2) Temporal +Attention Regularization (TAR) - a regularization technique to refine the +temporal attention units of the pre-trained T2V diffusion models, which enables +control over the video dynamics. We experimentally showcase the superiority of +the proposed approach in generating longer, visually appealing videos over +existing open-sourced T2V models. We additionally analyze the temporal +attention maps realized with and without VSTAR, demonstrating the importance of +applying our method to mitigate neglect of the desired visual change over time. + +
+
+ comment: Project page: https://yumengli007.github.io/VSTAR +
+
+
+
+
+ + ☆ Improved Baselines for Data-efficient Perceptual Augmentation of LLMs + + +
+ The abilities of large language models (LLMs) have recently progressed to
+unprecedented levels, paving the way to novel applications in a wide variety
+of areas. In computer vision, LLMs can be used to prime vision-language tasks
+such as image captioning and visual question answering when coupled with
+pre-trained vision backbones. While different approaches have been explored to
+interface LLMs with ``perceptual backbones'' that process, e.g., visual or
+audio data, they are often explored for different tasks, different datasets,
+and using different perceptual backbones and language models, hindering direct
+comparison of the interfacing mechanisms. To remedy this lack of comparability
+between methods, we present an extensive experimental evaluation of different
+interfacing mechanisms, across multiple tasks (including image, video, and
+audio captioning as well as visual question answering), datasets and
+backbones, paying special attention to low-data settings. We find improved
+performance using existing mechanisms over state-of-the-art results, and
+identify a new interfacing mechanism that yields (near) optimal results across
+different tasks, while obtaining a 4x reduction in training time.
+
+
+
+
+
+
+ + ☆ A Unified Optimal Transport Framework for Cross-Modal Retrieval with + Noisy Labels + + +
+ Cross-modal retrieval (CMR) aims to establish interaction between different +modalities, among which supervised CMR is emerging due to its flexibility in +learning semantic category discrimination. Despite the remarkable performance +of previous supervised CMR methods, much of their success can be attributed to +the well-annotated data. However, even for unimodal data, precise annotation is +expensive and time-consuming, and it becomes more challenging with the +multimodal scenario. In practice, massive multimodal data are collected from +the Internet with coarse annotation, which inevitably introduces noisy labels. +Training with such misleading labels would bring two key challenges -- +enforcing the multimodal samples to \emph{align incorrect semantics} and +\emph{widen the heterogeneous gap}, resulting in poor retrieval performance. To +tackle these challenges, this work proposes UOT-RCL, a Unified framework based +on Optimal Transport (OT) for Robust Cross-modal Retrieval. First, we propose a +semantic alignment based on partial OT to progressively correct the noisy +labels, where a novel cross-modal consistent cost function is designed to blend +different modalities and provide precise transport cost. Second, to narrow the +discrepancy in multi-modal data, an OT-based relation alignment is proposed to +infer the semantic-level cross-modal matching. Both of these two components +leverage the inherent correlation among multi-modal data to facilitate +effective cost function. The experiments on three widely-used cross-modal +retrieval datasets demonstrate that our UOT-RCL surpasses the state-of-the-art +approaches and significantly improves the robustness against noisy labels. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Deepfake Detection without Deepfakes: Generalization via Synthetic + Frequency Patterns Injection + + +
+ Deepfake detectors are typically trained on large sets of pristine and
+generated images, resulting in limited generalization capacity; they excel at
+identifying deepfakes created through methods encountered during training but
+struggle with those generated by unknown techniques. This paper introduces a
+learning approach aimed at significantly enhancing the generalization
+capabilities of deepfake detectors. Our method takes inspiration from the
+unique "fingerprints" that image generation processes consistently introduce
+into the frequency domain. These fingerprints manifest as structured and
+distinctly recognizable frequency patterns. We propose to train detectors
+using only pristine images, injecting crafted frequency patterns into part of
+them to simulate the effects of various deepfake generation techniques without
+being specific to any. These synthetic patterns are based on generic shapes,
+grids, or auras. We evaluated our approach using diverse architectures across
+25 different generation methods. The models trained with our approach were
+able to perform state-of-the-art deepfake detection, also demonstrating
+superior generalization capabilities in comparison with previous methods.
+Indeed, they are not tied to any specific generation technique and can
+effectively identify deepfakes regardless of how they were made.
+
+
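+ A toy sketch of injecting a structured frequency-domain pattern into a
+pristine image, mimicking the periodic "fingerprints" that generators leave
+behind. The grid spacing and amplitude are arbitrary assumptions; the paper's
+patterns (generic shapes, grids, auras) are richer than this.
+
+import numpy as np
+
+def inject_frequency_grid(img, spacing=8, amplitude=5.0):
+    """img: 2D grayscale array; add energy at a regular grid of frequencies."""
+    spec = np.fft.fftshift(np.fft.fft2(img))
+    boost = np.zeros_like(spec)
+    boost[::spacing, ::spacing] = amplitude * np.abs(spec).mean()
+    out = np.real(np.fft.ifft2(np.fft.ifftshift(spec + boost)))
+    return np.clip(out, 0.0, 255.0)
+
+pristine = np.random.rand(256, 256) * 255.0
+print(inject_frequency_grid(pristine).shape)  # (256, 256)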
+
+
+
+
+ + ☆ Scaling Diffusion Models to Real-World 3D LiDAR Scene Completion + + +
+ Computer vision techniques play a central role in the perception stack of +autonomous vehicles. Such methods are employed to perceive the vehicle +surroundings given sensor data. 3D LiDAR sensors are commonly used to collect +sparse 3D point clouds from the scene. However, compared to human perception, +such systems struggle to deduce the unseen parts of the scene given those +sparse point clouds. In this matter, the scene completion task aims at +predicting the gaps in the LiDAR measurements to achieve a more complete scene +representation. Given the promising results of recent diffusion models as +generative models for images, we propose extending them to achieve scene +completion from a single 3D LiDAR scan. Previous works used diffusion models +over range images extracted from LiDAR data, directly applying image-based +diffusion methods. Distinctly, we propose to directly operate on the points, +reformulating the noising and denoising diffusion process such that it can +efficiently work at scene scale. Together with our approach, we propose a +regularization loss to stabilize the noise predicted during the denoising +process. Our experimental evaluation shows that our method can complete the +scene given a single LiDAR scan as input, producing a scene with more details +compared to state-of-the-art scene completion methods. We believe that our +proposed diffusion process formulation can support further research in +diffusion models applied to scene-scale point cloud data. + +
+
+
+
+
+ + ☆ Progressive trajectory matching for medical dataset distillation + + +
+ It is essential but challenging to share medical image datasets due to +privacy issues, which prohibit building foundation models and knowledge +transfer. In this paper, we propose a novel dataset distillation method to +condense the original medical image datasets into a synthetic one that +preserves useful information for building an analysis model without accessing +the original datasets. Existing methods tackle only natural images by randomly +matching parts of the training trajectories of the model parameters trained by +the whole real datasets. However, through extensive experiments on medical +image datasets, the training process is extremely unstable and achieves +inferior distillation results. To solve these barriers, we propose to design a +novel progressive trajectory matching strategy to improve the training +stability for medical image dataset distillation. Additionally, it is observed +that improved stability prevents the synthetic dataset diversity and final +performance improvements. Therefore, we propose a dynamic overlap mitigation +module that improves the synthetic dataset diversity by dynamically eliminating +the overlap across different images and retraining parts of the synthetic +images for better convergence. Finally, we propose a new medical image dataset +distillation benchmark of various modalities and configurations to promote fair +evaluations. It is validated that our proposed method achieves 8.33% +improvement over previous state-of-the-art methods on average, and 11.7% +improvement when ipc=2 (i.e., image per class is 2). Codes and benchmarks will +be released. + +
+
+
+
+
+ + ☆ CLIPSwarm: Generating Drone Shows from Text Prompts with Vision-Language + Models + + +
+ This paper introduces CLIPSwarm, a new algorithm designed to automate the +modeling of swarm drone formations based on natural language. The algorithm +begins by enriching a provided word, to compose a text prompt that serves as +input to an iterative approach to find the formation that best matches the +provided word. The algorithm iteratively refines formations of robots to align +with the textual description, employing different steps for "exploration" and +"exploitation". Our framework is currently evaluated on simple formation +targets, limited to contour shapes. A formation is visually represented through +alpha-shape contours and the most representative color is automatically found +for the input word. To measure the similarity between the description and the +visual representation of the formation, we use CLIP [1], encoding text and +images into vectors and assessing their similarity. Subsequently, the algorithm +rearranges the formation to visually represent the word more effectively, +within the given constraints of available drones. Control actions are then +assigned to the drones, ensuring robotic behavior and collision-free movement. +Experimental results demonstrate the system's efficacy in accurately modeling +robot formations from natural language descriptions. The algorithm's +versatility is showcased through the execution of drone shows in photorealistic +simulation with varying shapes. We refer the reader to the supplementary video +for a visual reference of the results. + +
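+ The scoring loop can be sketched with the Hugging Face CLIP implementation:
+render the current formation to an image and measure its similarity to the
+text prompt. The dot-based renderer below is a placeholder assumption; only
+the CLIP call reflects how such text-image similarity is typically computed
+(downloading the pretrained weights on first use).
+
+import numpy as np
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+def render_formation(positions, size=224):
+    """Placeholder renderer: draw drone positions as white dots on black."""
+    canvas = np.zeros((size, size, 3), dtype=np.uint8)
+    for x, y in (positions * (size - 1)).astype(int):
+        canvas[y, x] = 255
+    return Image.fromarray(canvas)
+
+def clip_score(positions, prompt):
+    inputs = processor(text=[prompt], images=render_formation(positions),
+                       return_tensors="pt", padding=True)
+    return model(**inputs).logits_per_image.item()
+
+positions = np.random.rand(20, 2)  # 20 drones in the unit square
+print(clip_score(positions, "a heart shape made of lights"))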
+
+
+
+
+ + ☆ An AI-Assisted Skincare Routine Recommendation System in XR + + +
+ In recent years, there has been an increasing interest in the use of +artificial intelligence (AI) and extended reality (XR) in the beauty industry. +In this paper, we present an AI-assisted skin care recommendation system +integrated into an XR platform. The system uses a convolutional neural network +(CNN) to analyse an individual's skin type and recommend personalised skin care +products in an immersive and interactive manner. Our methodology involves +collecting data from individuals through a questionnaire and conducting skin +analysis using a provided facial image in an immersive environment. This data +is then used to train the CNN model, which recognises the skin type and +existing issues and allows the recommendation engine to suggest personalised +skin care products. We evaluate our system in terms of the accuracy of the CNN +model, which achieves an average score of 93% in correctly classifying existing +skin issues. Being integrated into an XR system, this approach has the +potential to significantly enhance the beauty industry by providing immersive +and engaging experiences to users, leading to more efficient and consistent +skincare routines. + +
+
+
+
+
+ + ☆ HyperLLaVA: Dynamic Visual and Language Expert Tuning for Multimodal + Large Language Models + + +
+ Recent advancements indicate that scaling up Multimodal Large Language Models +(MLLMs) effectively enhances performance on downstream multimodal tasks. The +prevailing MLLM paradigm, \emph{e.g.}, LLaVA, transforms visual features into +text-like tokens using a \emph{static} vision-language mapper, thereby enabling +\emph{static} LLMs to develop the capability to comprehend visual information +through visual instruction tuning. Although promising, the \emph{static} tuning +strategy~\footnote{The static tuning refers to the trained model with static +parameters.} that shares the same parameters may constrain performance across +different downstream multimodal tasks. In light of this, we introduce +HyperLLaVA, which involves adaptive tuning of the projector and LLM parameters, +in conjunction with a dynamic visual expert and language expert, respectively. +These experts are derived from HyperNetworks, which generates adaptive +parameter shifts through visual and language guidance, enabling dynamic +projector and LLM modeling in two-stage training. + Our experiments demonstrate that our solution significantly surpasses LLaVA +on existing MLLM benchmarks, including MME, MMBench, SEED-Bench, and +LLaVA-Bench. ~\footnote{Our project is available on the link +https://github.com/DCDmllm/HyperLLaVA}. + +
+
+
+
+
+ + ☆ MedCycle: Unpaired Medical Report Generation via Cycle-Consistency + + +
+ Generating medical reports for X-ray images presents a significant challenge, +particularly in unpaired scenarios where access to paired image-report data for +training is unavailable. Previous works have typically learned a joint +embedding space for images and reports, necessitating a specific labeling +schema for both. We introduce an innovative approach that eliminates the need +for consistent labeling schemas, thereby enhancing data accessibility and +enabling the use of incompatible datasets. This approach is based on +cycle-consistent mapping functions that transform image embeddings into report +embeddings, coupled with report auto-encoding for medical report generation. +Our model and objectives consider intricate local details and the overarching +semantic context within images and reports. This approach facilitates the +learning of effective mapping functions, resulting in the generation of +coherent reports. It outperforms state-of-the-art results in unpaired chest +X-ray report generation, demonstrating improvements in both language and +clinical metrics. + +
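+ A compact sketch of a cycle-consistency objective between image and report
+embedding spaces: map image embeddings to report embeddings and back, and
+penalise the round trip (and vice versa). The plain linear mappers and L1
+penalty are illustrative assumptions, not the paper's mapping functions.
+
+import torch
+import torch.nn as nn
+
+img2rep = nn.Linear(256, 256)   # image-embedding -> report-embedding mapper
+rep2img = nn.Linear(256, 256)   # report-embedding -> image-embedding mapper
+
+def cycle_loss(img_emb, rep_emb):
+    # round-trip both ways and penalise how far each embedding drifts
+    img_cycle = rep2img(img2rep(img_emb))
+    rep_cycle = img2rep(rep2img(rep_emb))
+    return (img_cycle - img_emb).abs().mean() + (rep_cycle - rep_emb).abs().mean()
+
+print(cycle_loss(torch.randn(8, 256), torch.randn(8, 256)).item())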
+
+
+
+
+ + ☆ Fast-Poly: A Fast Polyhedral Framework For 3D Multi-Object Tracking + + +
+ 3D Multi-Object Tracking (MOT) captures stable and comprehensive motion
+states of surrounding obstacles, essential for robotic perception. However,
+current 3D trackers face issues with accuracy and latency consistency. In this
+paper, we propose Fast-Poly, a fast and effective filter-based method for 3D
+MOT. Building upon our previous work Poly-MOT, Fast-Poly addresses object
+rotational anisotropy in 3D space, enhances local computation densification,
+and leverages parallelization techniques, improving inference speed and
+precision. Fast-Poly is extensively tested on two large-scale tracking
+benchmarks with a Python implementation. On the nuScenes dataset, Fast-Poly
+achieves new state-of-the-art performance with 75.8% AMOTA among all methods
+and can run at 34.2 FPS on a personal CPU. On the Waymo dataset, Fast-Poly
+exhibits competitive accuracy with 63.6% MOTA and impressive inference speed
+(35.5 FPS). The source code is publicly available at
+https://github.com/lixiaoyu2000/FastPoly.
+
+
+ comment: 1st on the NuScenes Tracking benchmark with 75.8 AMOTA and 34.2 FPS +
+
+
+
+
+ + ☆ Stochastic Geometry Models for Texture Synthesis of Machined Metallic + Surfaces: Sandblasting and Milling + + +
+ Training defect detection algorithms for visual surface inspection systems +requires a large and representative set of training data. Often there is not +enough real data available which additionally cannot cover the variety of +possible defects. Synthetic data generated by a synthetic visual surface +inspection environment can overcome this problem. Therefore, a digital twin of +the object is needed, whose micro-scale surface topography is modeled by +texture synthesis models. We develop stochastic texture models for sandblasted +and milled surfaces based on topography measurements of such surfaces. As the +surface patterns differ significantly, we use separate modeling approaches for +the two cases. Sandblasted surfaces are modeled by a combination of data-based +texture synthesis methods that rely entirely on the measurements. In contrast, +the model for milled surfaces is procedural and includes all process-related +parameters known from the machine settings. + +
+
+
+
+
+ + ☆ Advancing 6D Pose Estimation in Augmented Reality -- Overcoming + Projection Ambiguity with Uncontrolled Imagery + + +
+ This study addresses the challenge of accurate 6D pose estimation in
+Augmented Reality (AR), a critical component for seamlessly integrating virtual
+objects into real-world environments. Our research primarily addresses the
+difficulty of estimating 6D poses from uncontrolled RGB images, which lack
+metadata such as focal length, a common scenario in AR applications. We
+propose a novel approach that strategically decomposes the estimation of z-axis
+translation and focal length, leveraging the neural render-and-compare strategy
+inherent in the FocalPose architecture. This methodology not only streamlines
+the 6D pose estimation process but also significantly enhances the accuracy of
+3D object overlaying in AR settings. Our experimental results demonstrate a
+marked improvement in 6D pose estimation accuracy, with promising applications
+in manufacturing and robotics. Here, the precise overlay of AR visualizations
+and the advancement of robotic vision systems stand to benefit substantially
+from our findings.
+
+
+
+
+
+ + ☆ MTP: Advancing Remote Sensing Foundation Model via Multi-Task + Pretraining + + +
+ Foundation models have reshaped the landscape of Remote Sensing (RS) by +enhancing various image interpretation tasks. Pretraining is an active research +topic, encompassing supervised and self-supervised learning methods to +initialize model weights effectively. However, transferring the pretrained +models to downstream tasks may encounter task discrepancy due to their +formulation of pretraining as image classification or object discrimination +tasks. In this study, we explore the Multi-Task Pretraining (MTP) paradigm for +RS foundation models to address this issue. Using a shared encoder and +task-specific decoder architecture, we conduct multi-task supervised +pretraining on the SAMRS dataset, encompassing semantic segmentation, instance +segmentation, and rotated object detection. MTP supports both convolutional +neural networks and vision transformer foundation models with over 300 million +parameters. The pretrained models are finetuned on various RS downstream tasks, +such as scene classification, horizontal and rotated object detection, semantic +segmentation, and change detection. Extensive experiments across 14 datasets +demonstrate the superiority of our models over existing ones of similar size +and their competitive performance compared to larger state-of-the-art models, +thus validating the effectiveness of MTP. + +
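+ As a rough illustration of the shared-encoder, task-specific-decoder pattern
+described above, the following sketch shows how such multi-task supervised
+pretraining could be wired up in PyTorch. All module names, the task set, and
+the loss weighting are illustrative assumptions, not the released MTP code.
+
+import torch.nn as nn
+
+class MultiTaskPretrainer(nn.Module):
+    """Toy shared-encoder / per-task-decoder wrapper (illustrative only)."""
+    def __init__(self, encoder: nn.Module, decoders: dict):
+        super().__init__()
+        self.encoder = encoder                    # shared backbone (CNN or ViT)
+        self.decoders = nn.ModuleDict(decoders)   # e.g. segmentation / detection heads
+
+    def forward(self, images):
+        features = self.encoder(images)           # one shared representation
+        return {task: head(features) for task, head in self.decoders.items()}
+
+def multi_task_loss(outputs, targets, criteria, weights):
+    # Weighted sum of per-task losses; each pretraining task keeps its own criterion.
+    return sum(weights[t] * criteria[t](outputs[t], targets[t]) for t in outputs)
+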
+
+ comment: The codes and pretrained models will be released at + https://github.com/ViTAE-Transformer/MTP +
+
+
+
+
+ + ☆ Diversified and Personalized Multi-rater Medical Image Segmentation CVPR 2024 + + +
+ Annotation ambiguity due to inherent data uncertainties such as blurred +boundaries in medical scans and different observer expertise and preferences +has become a major obstacle for training deep-learning based medical image +segmentation models. To address it, the common practice is to gather multiple +annotations from different experts, leading to the setting of multi-rater +medical image segmentation. Existing works aim to either merge different +annotations into the "groundtruth" that is often unattainable in numerous +medical contexts, or generate diverse results, or produce personalized results +corresponding to individual expert raters. Here, we bring up a more ambitious +goal for multi-rater medical image segmentation, i.e., obtaining both +diversified and personalized results. Specifically, we propose a two-stage +framework named D-Persona (first Diversification and then Personalization). In +Stage I, we exploit multiple given annotations to train a Probabilistic U-Net +model, with a bound-constrained loss to improve the prediction diversity. In +this way, a common latent space is constructed in Stage I, where different +latent codes denote diversified expert opinions. Then, in Stage II, we design +multiple attention-based projection heads to adaptively query the corresponding +expert prompts from the shared latent space, and then perform the personalized +medical image segmentation. We evaluated the proposed model on our in-house +Nasopharyngeal Carcinoma dataset and the public lung nodule dataset (i.e., +LIDC-IDRI). Extensive experiments demonstrated our D-Persona can provide +diversified and personalized results at the same time, achieving new SOTA +performance for multi-rater medical image segmentation. Our code will be +released at https://github.com/ycwu1997/D-Persona. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Cell Tracking in C. elegans with Cell Position Heatmap-Based Alignment + and Pairwise Detection + + +
+ 3D cell tracking in a living organism plays a crucial role in live cell image
+analysis. Cell tracking in C. elegans poses two difficulties. First, cell
+movement between consecutive frames is large because the worm moves its head
+during scanning. Second, cell detection is often inconsistent in consecutive
+frames due to touching cells and low-contrast images, and these inconsistent
+detections further degrade tracking performance. In this paper, we propose a
+cell tracking method to address these issues, which has two main contributions.
+First, we introduce cell position heatmap-based non-rigid alignment with
+test-time fine-tuning, which warps the detected points close to their positions
+in the next frame. Second, we propose a pairwise detection method, which uses
+the detection results from the previous frame to detect cells in the current
+frame. The experimental results demonstrate the effectiveness of each module,
+and the proposed method achieved the best performance among the compared
+methods.
+
+
+ comment: 4 pages, 5 figures, Accepted in EMBC 2023 +
+
+
+
+
+ + ☆ S2DM: Sector-Shaped Diffusion Models for Video Generation + + +
+ Diffusion models have achieved great success in image generation. However,
+when leveraging this idea for video generation, we face significant challenges
+in maintaining consistency and continuity across video frames. This is
+mainly caused by the lack of an effective framework to align frames of videos
+with desired temporal features while preserving consistent semantic and
+stochastic features. In this work, we propose a novel Sector-Shaped Diffusion
+Model (S2DM) whose sector-shaped diffusion region is formed by a set of
+ray-shaped reverse diffusion processes starting at the same noise point. S2DM
+can generate a group of intrinsically related data sharing the same semantic
+and stochastic features while varying in temporal features under appropriate
+guided conditions. We apply S2DM to video generation tasks, and explore the use
+of optical flow as temporal conditions. Our experimental results show that S2DM
+outperforms many existing methods in the task of video generation without any
+temporal-feature modelling modules. For text-to-video generation tasks where
+temporal conditions are not explicitly given, we propose a two-stage generation
+strategy which can decouple the generation of temporal features from
+semantic-content features. We show that, without additional training, our model
+integrated with another temporal-condition generative model can still achieve
+performance comparable to existing works. Our results can be viewed at
+https://s2dm.github.io/S2DM/.
+
+
+ comment: 17 pages, 6 figures +
+
+
+
+
+ + ☆ DOR3D-Net: Dense Ordinal Regression Network for 3D Hand Pose Estimation + + +
+ Depth-based 3D hand pose estimation is an important but challenging research
+task in the human-machine interaction community. Recently, dense regression
+methods have attracted increasing attention in 3D hand pose estimation, as they
+offer accurate regression at a low computational cost by densely regressing
+hand joint offset maps. However, large-scale regression offset values are often
+affected by noise and outliers, leading to a significant drop in accuracy. To
+tackle this, we re-formulate 3D hand pose estimation as a dense ordinal
+regression problem and propose a novel Dense Ordinal Regression 3D Pose
+Network (DOR3D-Net). Specifically, we first decompose offset value regression
+into sub-tasks of binary classification with ordinal constraints. Then, each
+binary classifier predicts the probability of a binary spatial relationship
+relative to a joint, which is easier to train and yields a much lower level of
+noise. The estimated hand joint positions are inferred by aggregating the
+ordinal regression results at local positions with a weighted sum. Furthermore,
+both a joint regression loss and an ordinal regression loss are used to train
+our DOR3D-Net in an end-to-end manner. Extensive experiments on public datasets
+(ICVL, MSRA, NYU and HANDS2017) show that our design provides significant
+improvements over SOTA methods.
+
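+ To make the ordinal decomposition above concrete, a minimal decode step is
+sketched below: each offset value is scored by K binary classifiers ("is the
+offset larger than threshold k?"), and a soft offset estimate is recovered by
+summing the sigmoid probabilities over uniformly spaced thresholds. The offset
+range, bin count, and shapes are made-up assumptions, not the actual DOR3D-Net
+parameterisation.
+
+import torch
+
+def decode_ordinal_offset(binary_logits: torch.Tensor,
+                          offset_min: float = -1.0,
+                          offset_max: float = 1.0) -> torch.Tensor:
+    """binary_logits: (..., K); entry k scores P(offset > k-th threshold)."""
+    num_bins = binary_logits.shape[-1]
+    bin_width = (offset_max - offset_min) / num_bins
+    probs = torch.sigmoid(binary_logits)   # soft ordinal comparisons in [0, 1]
+    # Expected offset: start at the minimum and add one bin width per "greater" vote.
+    return offset_min + probs.sum(dim=-1) * bin_width
+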
+
+
+
+
+ + ☆ Unifying Local and Global Multimodal Features for Place Recognition in + Aliased and Low-Texture Environments ICRA + + +
+ Perceptual aliasing and weak textures pose significant challenges to the task
+of place recognition, hindering the performance of Simultaneous Localization
+and Mapping (SLAM) systems. This paper presents a novel model, called UMF
+(standing for Unifying Local and Global Multimodal Features), that 1) leverages
+multi-modality via cross-attention blocks between vision and LiDAR features,
+and 2) includes a re-ranking stage that re-orders, based on local feature
+matching, the top-k candidates retrieved using a global representation. Our
+experiments, particularly on sequences captured in a planetary-analogous
+environment, show that UMF significantly outperforms previous baselines in
+those challenging aliased environments. Since our work aims to enhance the
+reliability of SLAM in all situations, we also explore its performance on the
+widely used RobotCar dataset, for broader applicability. Code and models are
+available at https://github.com/DLR-RM/UMF
+
+
+ comment: Accepted submission to International Conference on Robotics and + Automation (ICRA), 2024 +
+
+
+
+
+ + ☆ Robust image segmentation model based on binary level set SC + + +
+ In order to improve the robustness of traditional image segmentation models
+to noise, this paper models the illumination term in intensity inhomogeneity
+images. Additionally, to enhance the model's robustness to noisy images, we
+incorporate the binary level set model into the proposed model. Compared to the
+traditional level set, the binary level set eliminates the need for continuous
+reinitialization. Moreover, by introducing the variational operator GL, our
+model demonstrates better capability in segmenting noisy images. Finally, we
+employ the three-step splitting operator method to solve the model, and its
+effectiveness is demonstrated on various images.
+
+
+ comment: SCI +
+
+
+
+
+ + ☆ IIDM: Image-to-Image Diffusion Model for Semantic Image Synthesis + + +
+ Semantic image synthesis aims to generate high-quality images given semantic +conditions, i.e. segmentation masks and style reference images. Existing +methods widely adopt generative adversarial networks (GANs). GANs take all +conditional inputs and directly synthesize images in a single forward step. In +this paper, semantic image synthesis is treated as an image denoising task and +is handled with a novel image-to-image diffusion model (IIDM). Specifically, +the style reference is first contaminated with random noise and then +progressively denoised by IIDM, guided by segmentation masks. Moreover, three +techniques, refinement, color-transfer and model ensembles, are proposed to +further boost the generation quality. They are plug-in inference modules and do +not require additional training. Extensive experiments show that our IIDM +outperforms existing state-of-the-art methods by clear margins. Further +analysis is provided via detailed demonstrations. We have implemented IIDM +based on the Jittor framework; code is available at +https://github.com/ader47/jittor-jieke-semantic_images_synthesis. + +
+
+
 comment: 6 pages, 7 figures, accepted by CVMJ 2024 +
+
+
+
+
+ + ☆ Correlation Clustering of Organoid Images + + +
+ In biological and medical research, scientists now routinely acquire +microscopy images of hundreds of morphologically heterogeneous organoids and +are then faced with the task of finding patterns in the image collection, i.e., +subsets of organoids that appear similar and potentially represent the same +morphological class. We adopt models and algorithms for correlating organoid +images, i.e., for quantifying the similarity in appearance and geometry of the +organoids they depict, and for clustering organoid images by consolidating +conflicting correlations. For correlating organoid images, we adopt and compare +two alternatives, a partial quadratic assignment problem and a twin network. +For clustering organoid images, we employ the correlation clustering problem. +Empirically, we learn the parameters of these models, infer a clustering of +organoid images, and quantify the accuracy of the inferred clusters, with +respect to a training set and a test set we contribute of state-of-the-art +light microscopy images of organoids clustered manually by biologists. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ Few-shot Oriented Object Detection with Memorable Contrastive Learning + in Remote Sensing Images + + +
+ Few-shot object detection (FSOD) has garnered significant research attention +in the field of remote sensing due to its ability to reduce the dependency on +large amounts of annotated data. However, two challenges persist in this area: +(1) axis-aligned proposals, which can result in misalignment for arbitrarily +oriented objects, and (2) the scarcity of annotated data still limits the +performance for unseen object categories. To address these issues, we propose a +novel FSOD method for remote sensing images called Few-shot Oriented object +detection with Memorable Contrastive learning (FOMC). Specifically, we employ +oriented bounding boxes instead of traditional horizontal bounding boxes to +learn a better feature representation for arbitrary-oriented aerial objects, +leading to enhanced detection performance. To the best of our knowledge, we are +the first to address oriented object detection in the few-shot setting for +remote sensing images. To address the challenging issue of object +misclassification, we introduce a supervised contrastive learning module with a +dynamically updated memory bank. This module enables the use of large batches +of negative samples and enhances the model's capability to learn discriminative +features for unseen classes. We conduct comprehensive experiments on the DOTA +and HRSC2016 datasets, and our model achieves state-of-the-art performance on +the few-shot oriented object detection task. Code and pretrained models will be +released. + +
+
+ comment: 13 pages, 8 tables, 10 figures +
+
+
+
+
+ + ☆ Counting Network for Learning from Majority Label ICASSP 2024 + + +
+ This paper proposes a novel problem in multi-class Multiple-Instance Learning
+(MIL) called Learning from the Majority Label (LML). In LML, the majority class
+of instances in a bag is assigned as the bag's label. LML aims to classify
+instances using bag-level majority classes. This problem is valuable in various
+applications. Existing MIL methods are unsuitable for LML because they
+aggregate confidences, which may cause inconsistency between the bag-level
+label and the label obtained by counting the number of instances of each class,
+and this may in turn lead to incorrect instance-level classification. We
+propose a counting network trained to produce the bag-level majority labels
+estimated by counting the number of instances of each class. This ensures that
+the majority class of the network outputs is consistent with the one obtained
+by counting instances. Experimental results show that our counting network
+outperforms conventional MIL methods on four datasets. The code is publicly
+available at
+https://github.com/Shiku-Kaito/Counting-Network-for-Learning-from-Majority-Label.
+
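+ The bag-level consistency idea can be illustrated in a few lines: instance
+predictions are hard-counted per class and the bag label is the class with the
+largest count, so the supervision signal matches the counting rule. Shapes and
+names below are illustrative, not taken from the released code.
+
+import torch
+
+def bag_majority_label(instance_logits: torch.Tensor) -> torch.Tensor:
+    """instance_logits: (num_instances, num_classes) for a single bag."""
+    instance_preds = instance_logits.argmax(dim=-1)      # per-instance class decisions
+    counts = torch.bincount(instance_preds, minlength=instance_logits.shape[-1])
+    return counts.argmax()                                # majority class = bag label
+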
+
+ comment: 5 pages, 4 figures, Accepted in ICASSP 2024 +
+
+
+
+
+ + ☆ ManiPose: A Comprehensive Benchmark for Pose-aware Object Manipulation + in Robotics IROS 2024 + + +
+ Robotic manipulation in everyday scenarios, especially in unstructured +environments, requires skills in pose-aware object manipulation (POM), which +adapts robots' grasping and handling according to an object's 6D pose. +Recognizing an object's position and orientation is crucial for effective +manipulation. For example, if a mug is lying on its side, it's more effective +to grasp it by the rim rather than the handle. Despite its importance, research +in POM skills remains limited, because learning manipulation skills requires +pose-varying simulation environments and datasets. This paper introduces +ManiPose, a pioneering benchmark designed to advance the study of pose-varying +manipulation tasks. ManiPose encompasses: 1) Simulation environments for POM +feature tasks ranging from 6D pose-specific pick-and-place of single objects to +cluttered scenes, further including interactions with articulated objects. 2) A +comprehensive dataset featuring geometrically consistent and +manipulation-oriented 6D pose labels for 2936 real-world scanned rigid objects +and 100 articulated objects across 59 categories. 3) A baseline for POM, +leveraging the inferencing abilities of LLM (e.g., ChatGPT) to analyze the +relationship between 6D pose and task-specific requirements, offers enhanced +pose-aware grasp prediction and motion planning capabilities. Our benchmark +demonstrates notable advancements in pose estimation, pose-aware manipulation, +and real-robot skill transfer, setting new standards for POM research. We will +open-source the ManiPose benchmark with the final version paper, inviting the +community to engage with our resources, available at our +website:https://sites.google.com/view/manipose. + +
+
+ comment: 8 pages, 7 figures, submitted to 2024 IEEE/RSJ International + Conference on Intelligent Robots and Systems (IROS 2024) +
+
+
+
+
+ + ☆ AGFSync: Leveraging AI-Generated Feedback for Preference Optimization in + Text-to-Image Generation + + +
+ Text-to-Image (T2I) diffusion models have achieved remarkable success in
+image generation. Despite their progress, challenges remain in prompt-following
+ability and image quality, as well as in the lack of high-quality datasets,
+which are essential for refining these models. As acquiring labeled data is
+costly, we introduce AGFSync, a framework that enhances T2I diffusion models
+through Direct Preference Optimization (DPO) in a fully AI-driven approach.
+AGFSync utilizes Vision-Language Models (VLM) to assess image quality across
+style, coherence, and aesthetics, generating feedback data within an AI-driven
+loop. By applying AGFSync to leading T2I models such as SD v1.4, v1.5, and
+SDXL, our extensive experiments on the TIFA dataset demonstrate notable
+improvements in VQA scores, aesthetic evaluations, and performance on the HPSv2
+benchmark, consistently outperforming the base models. AGFSync's method of
+refining T2I diffusion models paves the way for scalable alignment techniques.
+
+
+
+
+
+ + ☆ OrthCaps: An Orthogonal CapsNet with Sparse Attention Routing and + Pruning + + +
+ Redundancy is a persistent challenge in Capsule Networks (CapsNet), leading to
+high computational costs and parameter counts. Although previous works have
+introduced pruning after the initial capsule layer, dynamic routing's fully
+connected nature and non-orthogonal weight matrices reintroduce redundancy in
+deeper layers. In addition, dynamic routing requires iterations to converge,
+further increasing computational demands. In this paper, we propose an
+Orthogonal Capsule Network (OrthCaps) to reduce redundancy, improve routing
+performance and decrease parameter counts. Firstly, an efficient pruned capsule
+layer is introduced to discard redundant capsules. Secondly, dynamic routing is
+replaced with orthogonal sparse attention routing, eliminating the need for
+iterations and fully connected structures. Lastly, weight matrices during
+routing are orthogonalized to sustain low capsule similarity, which is, as far
+as we know, the first approach to introduce orthogonality into CapsNet. Our
+experiments on baseline datasets affirm the efficiency and robustness of
+OrthCaps in classification tasks, and ablation studies validate the criticality
+of each component. Remarkably, OrthCaps-Shallow outperforms other Capsule
+Network benchmarks on four datasets, utilizing only 110k parameters, which is a
+mere 1.25% of a standard Capsule Network's total. To the best of our knowledge,
+it achieves the smallest parameter count among existing Capsule Networks.
+Similarly, OrthCaps-Deep demonstrates competitive performance across four
+datasets, utilizing only 1.2% of the parameters required by its counterparts.
+
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Hierarchical Gaussian Mixture Normalizing Flow Modeling for Unified + Anomaly Detection + + +
+ Unified anomaly detection (AD) is one of the most challenging settings in
+anomaly detection: a single unified model is trained with normal samples from
+multiple classes with the objective of detecting anomalies in these classes.
+For such a challenging task, popular normalizing flow (NF) based AD methods may
+fall into a "homogeneous mapping" issue, where the NF-based AD models are
+biased to generate similar latent representations for both normal and abnormal
+features, thereby leading to a high miss rate for anomalies. In this paper, we
+propose a novel Hierarchical Gaussian mixture normalizing flow modeling method
+for accomplishing unified Anomaly Detection, which we call HGAD. Our HGAD
+consists of two key components: inter-class Gaussian mixture modeling and
+intra-class mixed class centers learning. Compared to previous NF-based AD
+methods, the hierarchical Gaussian mixture modeling approach brings stronger
+representation capability to the latent space of normalizing flows, so that
+even complex multi-class distributions can be well represented and learned in
+the latent space. In this way, we can avoid mapping different class
+distributions into the same single Gaussian prior, thus effectively avoiding or
+mitigating the "homogeneous mapping" issue. We further show that the more
+distinguishable the different class centers are, the easier it is to avoid the
+bias issue. Thus, we further propose a mutual information maximization loss for
+better structuring the latent feature space. We evaluate our method on four
+real-world AD benchmarks, where we significantly improve upon previous NF-based
+AD methods and also outperform the SOTA unified AD methods.
+
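+ The inter-class Gaussian mixture idea can be pictured as replacing the flow's
+single Gaussian prior with a mixture whose components act as class centers; a
+latent feature is then scored by its log-likelihood under that mixture. The
+sketch below only illustrates such a mixture prior with torch.distributions and
+made-up shapes; it is not the HGAD training objective.
+
+import torch
+from torch.distributions import Categorical, Independent, MixtureSameFamily, Normal
+
+def mixture_log_prob(z: torch.Tensor, means: torch.Tensor,
+                     log_stds: torch.Tensor, logits: torch.Tensor) -> torch.Tensor:
+    """z: (batch, dim); means/log_stds: (num_classes, dim); logits: (num_classes,)."""
+    mixture = MixtureSameFamily(
+        Categorical(logits=logits),                     # mixture weights over class centers
+        Independent(Normal(means, log_stds.exp()), 1),  # one Gaussian per class center
+    )
+    return mixture.log_prob(z)   # higher log-prob = more "normal" under the learned prior
+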
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ vid-TLDR: Training Free Token merging for Light-weight Video Transformer CVPR + + +
+ Video Transformers have become the prevalent solution for various video +downstream tasks with superior expressive power and flexibility. However, these +video transformers suffer from heavy computational costs induced by the massive +number of tokens across the entire video frames, which has been the major +barrier to training the model. Further, the patches irrelevant to the main +contents, e.g., backgrounds, degrade the generalization performance of models. +To tackle these issues, we propose training free token merging for lightweight +video Transformer (vid-TLDR) that aims to enhance the efficiency of video +Transformers by merging the background tokens without additional training. For +vid-TLDR, we introduce a novel approach to capture the salient regions in +videos only with the attention map. Further, we introduce the saliency-aware +token merging strategy by dropping the background tokens and sharpening the +object scores. Our experiments show that vid-TLDR significantly mitigates the +computational complexity of video Transformers while achieving competitive +performance compared to the base model without vid-TLDR. Code is available at +https://github.com/mlvlab/vid-TLDR. + +
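+ A stripped-down version of the attention-based saliency step is sketched
+below: a token's saliency is taken as the average attention it receives, and
+only the most-attended tokens are kept. This only illustrates the
+drop-background-tokens idea; the actual vid-TLDR merging and score-sharpening
+rules are more involved, and the keep ratio here is an arbitrary assumption.
+
+import torch
+
+def drop_background_tokens(tokens: torch.Tensor, attn: torch.Tensor,
+                           keep_ratio: float = 0.5) -> torch.Tensor:
+    """tokens: (B, N, D); attn: (B, num_heads, N, N) attention weights."""
+    saliency = attn.mean(dim=1).mean(dim=1)            # (B, N): attention each token receives
+    k = max(1, int(tokens.shape[1] * keep_ratio))
+    idx = saliency.topk(k, dim=-1).indices             # indices of the most salient tokens
+    idx = idx.unsqueeze(-1).expand(-1, -1, tokens.shape[-1])
+    return torch.gather(tokens, 1, idx)                # keep salient tokens, drop the rest
+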
+
+ comment: Conference on Computer Vision and Pattern Recognition (CVPR), 2024 +
+
+
+
+
+ + ☆ TiBiX: Leveraging Temporal Information for Bidirectional X-ray and + Report Generation + + +
+ With the emergence of vision language models in the medical imaging domain,
+numerous studies have focused on two dominant research activities: (1) report
+generation from Chest X-rays (CXR), and (2) synthetic scan generation from text
+or reports. Despite some research incorporating multi-view CXRs into the
+generative process, prior patient scans and reports have been generally
+disregarded. This can inadvertently lead to the omission of important medical
+information, thus affecting generation quality. To address this, we propose
+TiBiX: Leveraging Temporal information for Bidirectional X-ray and Report
+Generation. Considering previous scans, our approach facilitates bidirectional
+generation, primarily addressing two challenging problems: (1) generating the
+current image from the previous image and current report and (2) generating the
+current report based on both the previous and current images. Moreover, we
+extract and release a curated temporal benchmark dataset derived from the
+MIMIC-CXR dataset, which focuses on temporal data. Our comprehensive
+experiments and ablation studies explore the merits of incorporating prior CXRs
+and achieve state-of-the-art (SOTA) results on the report generation task.
+Furthermore, we attain on-par performance with SOTA image generation efforts,
+thus serving as a new baseline in longitudinal bidirectional CXR-to-report
+generation. The code is available at https://github.com/BioMedIA-MBZUAI/TiBiX.
+
+
+
+
+
+ + ☆ FissionFusion: Fast Geometric Generation and Hierarchical Souping for + Medical Image Analysis + + +
+ The scarcity of well-annotated medical datasets requires leveraging transfer
+learning from broader datasets like ImageNet or pre-trained models like CLIP.
+Model soups average multiple fine-tuned models, aiming to improve performance
+on In-Domain (ID) tasks and enhance robustness against Out-of-Distribution
+(OOD) datasets. However, applying these methods to the medical imaging domain
+faces challenges and results in suboptimal performance. This is primarily due
+to differences in error surface characteristics that stem from data
+complexities such as heterogeneity, domain shift, class imbalance, and
+distributional shifts between training and testing phases. To address this
+issue, we propose a hierarchical merging approach that involves local and
+global aggregation of models at various levels based on models' hyperparameter
+configurations. Furthermore, to alleviate the need for training a large number
+of models in the hyperparameter search, we introduce a computationally
+efficient method using a cyclical learning rate scheduler to produce multiple
+models for aggregation in the weight space. Our method demonstrates significant
+improvements over the model souping approach across multiple datasets (around
+a 6% gain on the HAM10000 and CheXpert datasets) while maintaining low
+computational costs for model generation and selection. Moreover, we achieve
+better results on OOD datasets than model soups. The code is available at
+https://github.com/BioMedIA-MBZUAI/FissionFusion.
+
+
+
+
+
+ + ☆ Adaptive Critical Subgraph Mining for Cognitive Impairment Conversion + Prediction with T1-MRI-based Brain Network + + +
+ Predicting the conversion to early-stage dementia is critical for mitigating
+its progression but remains challenging due to subtle cognitive impairments and
+structural brain changes. Traditional T1-weighted magnetic resonance imaging
+(T1-MRI) research focuses on identifying brain atrophy regions but often fails
+to address the intricate connectivity between them. This limitation underscores
+the necessity of focusing on inter-regional connectivity for a comprehensive
+understanding of the brain's complex network. Moreover, there is a pressing
+demand for methods that adaptively preserve and extract critical information,
+particularly specialized subgraph mining techniques for brain networks. These
+are essential for developing high-quality feature representations that reveal
+critical spatial impacts of structural brain changes and their topology. In
+this paper, we propose Brain-SubGNN, a novel graph representation network to
+mine and enhance critical subgraphs based on T1-MRI. This network provides a
+subgraph-level interpretation, enhancing interpretability and insights for
+graph analysis. The process begins by extracting node features and a
+correlation matrix between nodes to construct a task-oriented brain network.
+Brain-SubGNN then adaptively identifies and enhances critical subgraphs,
+capturing both loop and neighbor subgraphs. This method reflects the loop
+topology and local changes, indicative of long-range connections, and maintains
+local and global brain attributes. Extensive experiments validate the
+effectiveness and advantages of Brain-SubGNN, demonstrating its potential as a
+powerful tool for understanding and diagnosing early-stage dementia. Source
+code is available at https://github.com/Leng-10/Brain-SubGNN.
+
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Learning Novel View Synthesis from Heterogeneous Low-light Captures + + +
+ Neural radiance fields have achieved great success in novel view synthesis
+from input views with the same brightness level captured under fixed normal
+lighting. Unfortunately, synthesizing novel views remains a challenge for input
+views with heterogeneous brightness levels captured under low-light conditions.
+This condition is common in the real world. It causes low-contrast images where
+details are concealed in the darkness, and camera sensor noise significantly
+degrades the image quality. To tackle this problem, we propose to learn to
+decompose illumination, reflectance, and noise from input views based on the
+observation that reflectance remains invariant across heterogeneous views. To
+cope with heterogeneous brightness and noise levels across multiple views, we
+learn an illumination embedding and optimize a noise map individually for each
+view. To allow intuitive editing of the illumination, we design an illumination
+adjustment module that enables either brightening or darkening of the
+illumination component. Comprehensive experiments demonstrate that this
+approach enables effective intrinsic decomposition for low-light multi-view
+noisy images and achieves superior visual quality and numerical performance for
+synthesizing novel views compared to state-of-the-art methods.
+
+
+
+
+
+ + ☆ AMP: Autoregressive Motion Prediction Revisited with Next Token + Prediction for Autonomous Driving + + +
+ As an essential task in autonomous driving (AD), motion prediction aims to
+predict the future states of surrounding objects for navigation. One natural
+solution is to estimate the position of other agents in a step-by-step manner
+where each predicted time-step is conditioned on both observed time-steps and
+previously predicted time-steps, i.e., autoregressive prediction. Pioneering
+works like SocialLSTM and MFP design their decoders based on this intuition.
+However, almost all state-of-the-art works assume that all predicted time-steps
+are independent conditioned on observed time-steps, where they use a single
+linear layer to generate positions of all time-steps simultaneously. They
+dominate most motion prediction leaderboards due to the simplicity of training
+MLPs compared to autoregressive networks.
+ In this paper, we introduce GPT-style next token prediction into motion
+forecasting. In this way, the input and output can be represented in a unified
+space, which makes autoregressive prediction more feasible. However, unlike
+language data, which is composed of homogeneous units (words), the elements in
+a driving scene can have complex spatial-temporal and semantic relations. To
+this end, we propose to adopt three factorized attention modules with different
+neighbors for information aggregation and different position encoding styles to
+capture their relations, e.g., encoding the transformation between coordinate
+systems for spatial relativity while adopting RoPE for temporal relativity.
+Empirically, equipped with the aforementioned tailored designs, the proposed
+method achieves state-of-the-art performance on the Waymo Open Motion and Waymo
+Interaction datasets. Notably, AMP outperforms other recent autoregressive
+motion prediction methods: MotionLM and StateTransformer, which demonstrates
+the effectiveness of the proposed designs.
+
+
+
+
+
+ + ☆ Efficient scene text image super-resolution with semantic guidance + + +
+ Scene text image super-resolution has significantly improved the accuracy of
+scene text recognition. However, many existing methods emphasize performance
+over efficiency and ignore the practical need for lightweight solutions in
+deployment scenarios. To address these issues, we propose an efficient
+framework called SGENet to facilitate deployment on resource-limited platforms.
+SGENet contains two branches: a super-resolution branch and a semantic guidance
+branch. We apply a lightweight pre-trained recognizer as a semantic extractor
+to enhance the understanding of text information. Meanwhile, we design a
+visual-semantic alignment module to achieve bidirectional alignment between
+image features and semantics, resulting in the generation of high-quality prior
+guidance. We conduct extensive experiments on benchmark datasets, and the
+proposed SGENet achieves excellent performance with lower computational costs.
+Code is available at https://github.com/SijieLiu518/SGENet
+
+
+
+
+
+ + ☆ Gaussian Splatting on the Move: Blur and Rolling Shutter Compensation + for Natural Camera Motion + + +
+ High-quality scene reconstruction and novel view synthesis based on Gaussian +Splatting (3DGS) typically require steady, high-quality photographs, often +impractical to capture with handheld cameras. We present a method that adapts +to camera motion and allows high-quality scene reconstruction with handheld +video data suffering from motion blur and rolling shutter distortion. Our +approach is based on detailed modelling of the physical image formation process +and utilizes velocities estimated using visual-inertial odometry (VIO). Camera +poses are considered non-static during the exposure time of a single image +frame and camera poses are further optimized in the reconstruction process. We +formulate a differentiable rendering pipeline that leverages screen space +approximation to efficiently incorporate rolling-shutter and motion blur +effects into the 3DGS framework. Our results with both synthetic and real data +demonstrate superior performance in mitigating camera motion over existing +methods, thereby advancing 3DGS in naturalistic settings. + +
+
+ comment: Source code available at https://github.com/SpectacularAI/3dgs-deblur +
+
+
+
+
+ + ☆ Out-of-Distribution Detection Using Peer-Class Generated by Large + Language Model + + +
+ Out-of-distribution (OOD) detection is a critical task to ensure the
+reliability and security of machine learning models deployed in real-world
+applications. Conventional methods for OOD detection, which rely on
+single-modal information, often struggle to capture the rich variety of OOD
+instances. The primary difficulty in OOD detection arises when an input image
+has numerous similarities to a particular class in the in-distribution (ID)
+dataset, e.g., a wolf to a dog, causing the model to misclassify it.
+Nevertheless, it may be easy to distinguish these classes in the semantic
+domain. To this end, in this paper, a novel method called ODPC is proposed, in
+which specific prompts to generate OOD peer classes of ID semantics are
+designed by a large language model as an auxiliary modality to facilitate
+detection. Moreover, a contrastive loss based on OOD peer classes is devised to
+learn compact representations of ID classes and improve the clarity of
+boundaries between different classes. Extensive experiments on five benchmark
+datasets show that the proposed method yields state-of-the-art results.
+
+
+
+
+
+ + ☆ DD-RobustBench: An Adversarial Robustness Benchmark for Dataset + Distillation + + +
+ Dataset distillation is an advanced technique aimed at compressing datasets
+into significantly smaller counterparts while preserving strong training
+performance. Significant efforts have been devoted to promoting evaluation
+accuracy under limited compression ratios, while the robustness of the
+distilled datasets has been overlooked. In this work, we introduce a
+comprehensive benchmark that, to the best of our knowledge, is the most
+extensive to date for evaluating the adversarial robustness of distilled
+datasets in a unified way. Our benchmark significantly expands upon prior
+efforts by incorporating a wider range of dataset distillation methods,
+including the latest advancements such as TESLA and SRe2L, a diverse array of
+adversarial attack methods, and evaluations across a broader and more extensive
+collection of datasets such as ImageNet-1K. Moreover, we assessed the
+robustness of these distilled datasets against representative adversarial
+attack algorithms like PGD and AutoAttack, while exploring their resilience
+from a frequency perspective. We also discovered that incorporating distilled
+data into the training batches of the original dataset can improve robustness.
+
+
+
+
+
+ + ☆ HyperFusion: A Hypernetwork Approach to Multimodal Integration of + Tabular and Medical Imaging Data for Predictive Modeling + + +
+ The integration of diverse clinical modalities such as medical imaging and
+the tabular data obtained from the patients' Electronic Health Records (EHRs)
+is a crucial aspect of modern healthcare. The integrative analysis of multiple
+sources can provide a comprehensive understanding of a patient's condition and
+can enhance diagnoses and treatment decisions. Deep Neural Networks (DNNs)
+consistently showcase outstanding performance in a wide range of multimodal
+tasks in the medical domain. However, the complex endeavor of effectively
+merging medical imaging with clinical, demographic and genetic information
+represented as numerical tabular data remains a highly active and ongoing
+research pursuit.
+ We present a novel framework based on hypernetworks to fuse clinical imaging
+and tabular data by conditioning the image processing on the EHR's values and
+measurements. This approach aims to leverage the complementary information
+present in these modalities to enhance the accuracy of various medical
+applications. We demonstrate the strength and generality of our method on two
+different brain Magnetic Resonance Imaging (MRI) analysis tasks, namely, brain
+age prediction conditioned on the subject's sex, and multiclass Alzheimer's
+Disease (AD) classification conditioned on tabular data. We show that our
+framework outperforms both single-modality models and state-of-the-art
+MRI-tabular data fusion methods. The code, enclosed with this manuscript, will
+be made publicly available.
+
+
+ comment: 17 pages, 8 figures +
+
+
+
+
+ + ☆ PuzzleVQA: Diagnosing Multimodal Reasoning Challenges of Language Models + with Abstract Visual Patterns + + +
+ Large multimodal models extend the impressive capabilities of large language +models by integrating multimodal understanding abilities. However, it is not +clear how they can emulate the general intelligence and reasoning ability of +humans. As recognizing patterns and abstracting concepts are key to general +intelligence, we introduce PuzzleVQA, a collection of puzzles based on abstract +patterns. With this dataset, we evaluate large multimodal models with abstract +patterns based on fundamental concepts, including colors, numbers, sizes, and +shapes. Through our experiments on state-of-the-art large multimodal models, we +find that they are not able to generalize well to simple abstract patterns. +Notably, even GPT-4V cannot solve more than half of the puzzles. To diagnose +the reasoning challenges in large multimodal models, we progressively guide the +models with our ground truth reasoning explanations for visual perception, +inductive reasoning, and deductive reasoning. Our systematic analysis finds +that the main bottlenecks of GPT-4V are weaker visual perception and inductive +reasoning abilities. Through this work, we hope to shed light on the +limitations of large multimodal models and how they can better emulate human +cognitive processes in the future (Our data and code will be released publicly +at https://github.com/declare-lab/LLM-PuzzleTest). + +
+
+
+
+
+ + ☆ LaserHuman: Language-guided Scene-aware Human Motion Generation in Free + Environment + + +
+ Language-guided scene-aware human motion generation has great significance +for entertainment and robotics. In response to the limitations of existing +datasets, we introduce LaserHuman, a pioneering dataset engineered to +revolutionize Scene-Text-to-Motion research. LaserHuman stands out with its +inclusion of genuine human motions within 3D environments, unbounded free-form +natural language descriptions, a blend of indoor and outdoor scenarios, and +dynamic, ever-changing scenes. Diverse modalities of capture data and rich +annotations present great opportunities for the research of conditional motion +generation, and can also facilitate the development of real-life applications. +Moreover, to generate semantically consistent and physically plausible human +motions, we propose a multi-conditional diffusion model, which is simple but +effective, achieving state-of-the-art performance on existing datasets. + +
+
+
+
+
+ + ☆ DetDiffusion: Synergizing Generative and Perceptive Models for Enhanced + Data Generation and Perception CVPR 2024 + + +
+ Current perceptive models heavily depend on resource-intensive datasets, +prompting the need for innovative solutions. Leveraging recent advances in +diffusion models, synthetic data, by constructing image inputs from various +annotations, proves beneficial for downstream tasks. While prior methods have +separately addressed generative and perceptive models, DetDiffusion, for the +first time, harmonizes both, tackling the challenges in generating effective +data for perceptive models. To enhance image generation with perceptive models, +we introduce perception-aware loss (P.A. loss) through segmentation, improving +both quality and controllability. To boost the performance of specific +perceptive models, our method customizes data augmentation by extracting and +utilizing perception-aware attribute (P.A. Attr) during generation. +Experimental results from the object detection task highlight DetDiffusion's +superior performance, establishing a new state-of-the-art in layout-guided +generation. Furthermore, image syntheses from DetDiffusion can effectively +augment training data, significantly enhancing downstream detection +performance. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Rotary Position Embedding for Vision Transformer + + +
+ Rotary Position Embedding (RoPE) performs remarkably on language models, +especially for length extrapolation of Transformers. However, the impacts of +RoPE on computer vision domains have been underexplored, even though RoPE +appears capable of enhancing Vision Transformer (ViT) performance in a way +similar to the language domain. This study provides a comprehensive analysis of +RoPE when applied to ViTs, utilizing practical implementations of RoPE for 2D +vision data. The analysis reveals that RoPE demonstrates impressive +extrapolation performance, i.e., maintaining precision while increasing image +resolution at inference. It eventually leads to performance improvement for +ImageNet-1k, COCO detection, and ADE-20k segmentation. We believe this study +provides thorough guidelines to apply RoPE into ViT, promising improved +backbone performance with minimal extra computational overhead. Our code and +pre-trained models are available at https://github.com/naver-ai/rope-vit + +
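+ For readers who want the gist of applying rotary embeddings to 2D patch
+grids, the sketch below uses the common axial construction: half of the channel
+dimension is rotated by the patch's x index and the other half by its y index,
+with a standard RoPE frequency schedule. The base frequency and the axial split
+are assumptions for illustration, not the rope-vit implementation.
+
+import torch
+
+def rope_1d(x: torch.Tensor, pos: torch.Tensor, base: float = 100.0) -> torch.Tensor:
+    """x: (..., num_tokens, dim) with dim even; pos: (num_tokens,) axis positions."""
+    dim = x.shape[-1]
+    freqs = base ** (-torch.arange(0, dim, 2, dtype=torch.float32) / dim)  # (dim/2,)
+    angles = pos.float()[:, None] * freqs[None, :]                         # (num_tokens, dim/2)
+    cos, sin = angles.cos(), angles.sin()
+    x1, x2 = x[..., 0::2], x[..., 1::2]
+    out = torch.empty_like(x)
+    out[..., 0::2] = x1 * cos - x2 * sin    # rotate each (x1, x2) pair by its angle
+    out[..., 1::2] = x1 * sin + x2 * cos
+    return out
+
+def rope_2d(x: torch.Tensor, xs: torch.Tensor, ys: torch.Tensor) -> torch.Tensor:
+    """Axial 2D RoPE: the first half of channels encodes x, the second half encodes y
+    (dim must be divisible by 4)."""
+    half = x.shape[-1] // 2
+    return torch.cat([rope_1d(x[..., :half], xs), rope_1d(x[..., half:], ys)], dim=-1)
+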
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+ + ☆ EcoSense: Energy-Efficient Intelligent Sensing for In-Shore Ship + Detection through Edge-Cloud Collaboration + + +
+ Detecting marine objects inshore presents challenges owing to algorithmic
+intricacies and complexities in system deployment. We propose a
+difficulty-aware edge-cloud collaborative sensing system that splits the task
+into object localization and fine-grained classification. Objects are
+classified either at the edge or within the cloud, based on their estimated
+difficulty. The framework comprises a low-power device-tailored front-end model
+for object localization, classification, and difficulty estimation, along with
+a transformer-graph convolutional network-based back-end model for fine-grained
+classification. Our system demonstrates superior performance (mAP@0.5 +4.3%)
+on widely used marine object detection datasets, significantly reducing both
+data transmission volume (by 95.43%) and energy consumption (by 72.7%) at the
+system level. We validate the proposed system across various embedded system
+platforms and in real-world scenarios involving drone deployment.
+
+
+
+
+
+ + ☆ Multi-Modal Hallucination Control by Visual Information Grounding + + +
+ Generative Vision-Language Models (VLMs) are prone to generate +plausible-sounding textual answers that, however, are not always grounded in +the input image. We investigate this phenomenon, usually referred to as +"hallucination" and show that it stems from an excessive reliance on the +language prior. In particular, we show that as more tokens are generated, the +reliance on the visual prompt decreases, and this behavior strongly correlates +with the emergence of hallucinations. To reduce hallucinations, we introduce +Multi-Modal Mutual-Information Decoding (M3ID), a new sampling method for +prompt amplification. M3ID amplifies the influence of the reference image over +the language prior, hence favoring the generation of tokens with higher mutual +information with the visual prompt. M3ID can be applied to any pre-trained +autoregressive VLM at inference time without necessitating further training and +with minimal computational overhead. If training is an option, we show that +M3ID can be paired with Direct Preference Optimization (DPO) to improve the +model's reliance on the prompt image without requiring any labels. Our +empirical findings show that our algorithms maintain the fluency and linguistic +capabilities of pre-trained VLMs while reducing hallucinations by mitigating +visually ungrounded answers. Specifically, for the LLaVA 13B model, M3ID and +M3ID+DPO reduce the percentage of hallucinated objects in captioning tasks by +25% and 28%, respectively, and improve the accuracy on VQA benchmarks such as +POPE by 21% and 24%. + +
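+ The contrast between image-conditioned and text-only predictions can be
+written as a simple logit adjustment, shown below with a fixed weighting; the
+actual M3ID weighting schedule and stopping criteria are richer than this toy
+variant, and the parameter lam is an arbitrary assumption.
+
+import torch
+
+def visually_grounded_logits(logits_with_image: torch.Tensor,
+                             logits_text_only: torch.Tensor,
+                             lam: float = 0.5) -> torch.Tensor:
+    """Favor next tokens whose probability increases when the image is attended."""
+    log_p_cond = torch.log_softmax(logits_with_image, dim=-1)
+    log_p_prior = torch.log_softmax(logits_text_only, dim=-1)
+    return log_p_cond + lam * (log_p_cond - log_p_prior)  # amplify visually informative tokens
+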
+
+
+
+
+ + ☆ Uncertainty Driven Active Learning for Image Segmentation in Underwater + Inspection + + +
+ Active learning aims to select the minimum amount of data to train a model +that performs similarly to a model trained with the entire dataset. We study +the potential of active learning for image segmentation in underwater +infrastructure inspection tasks, where large amounts of data are typically +collected. The pipeline inspection images are usually semantically repetitive +but with great variations in quality. We use mutual information as the +acquisition function, calculated using Monte Carlo dropout. To assess the +effectiveness of the framework, DenseNet and HyperSeg are trained with the +CamVid dataset using active learning. In addition, HyperSeg is trained with a +pipeline inspection dataset of over 50,000 images. For the pipeline dataset, +HyperSeg with active learning achieved 67.5% meanIoU using 12.5% of the data, +and 61.4% with the same amount of randomly selected images. This shows that +using active learning for segmentation models in underwater inspection tasks +can lower the cost significantly. + +
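+ The acquisition score described here, mutual information estimated with Monte
+Carlo dropout, can be computed per pixel as in the standard BALD formulation
+sketched below; shapes and the number of passes are chosen for illustration
+rather than taken from the paper's pipeline.
+
+import torch
+
+def mutual_information(mc_probs: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
+    """mc_probs: (T, num_classes, H, W) softmax outputs from T dropout-enabled passes."""
+    mean_probs = mc_probs.mean(dim=0)
+    predictive_entropy = -(mean_probs * (mean_probs + eps).log()).sum(dim=0)
+    expected_entropy = -(mc_probs * (mc_probs + eps).log()).sum(dim=1).mean(dim=0)
+    return predictive_entropy - expected_entropy   # high values mark informative samples
+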
+
+ comment: 16 pages, 8 figures, to be published in the Proceedings of the 4th + International Conference on Robotics, Computer Vision and Intelligent + Systems, Springer Nature, Feb 2024 +
+
+
+
+
+ + ☆ P-Count: Persistence-based Counting of White Matter Hyperintensities in + Brain MRI + + +
+ White matter hyperintensities (WMH) are a hallmark of cerebrovascular disease +and multiple sclerosis. Automated WMH segmentation methods enable quantitative +analysis via estimation of total lesion load, spatial distribution of lesions, +and number of lesions (i.e., number of connected components after +thresholding), all of which are correlated with patient outcomes. While the two +former measures can generally be estimated robustly, the number of lesions is +highly sensitive to noise and segmentation mistakes -- even when small +connected components are eroded or disregarded. In this article, we present +P-Count, an algebraic WMH counting tool based on persistent homology that +accounts for the topological features of WM lesions in a robust manner. Using +computational geometry, P-Count takes the persistence of connected components +into consideration, effectively filtering out the noisy WMH positives, +resulting in a more accurate count of true lesions. We validated P-Count on the +ISBI2015 longitudinal lesion segmentation dataset, where it produces +significantly more accurate results than direct thresholding. + +
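+ As a rough intuition for why persistence helps counting, the toy sketch below
+filters connected components of a WMH probability map by how far their peak
+probability rises above the binarisation threshold, a crude stand-in for
+topological persistence. It is not the P-Count algorithm, and the threshold and
+margin values are invented.
+
+import numpy as np
+from scipy import ndimage
+
+def count_persistent_lesions(prob_map: np.ndarray, threshold: float = 0.5,
+                             tau: float = 0.2) -> int:
+    labels, num = ndimage.label(prob_map > threshold)        # candidate components
+    if num == 0:
+        return 0
+    peaks = ndimage.maximum(prob_map, labels=labels, index=np.arange(1, num + 1))
+    # Keep only components whose peak rises at least tau above the threshold.
+    return int(np.sum(np.atleast_1d(peaks) - threshold >= tau))
+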
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ☆ SeFFeC: Semantic Facial Feature Control for Fine-grained Face Editing + + +
+ We propose Semantic Facial Feature Control (SeFFeC) - a novel method for +fine-grained face shape editing. Our method enables the manipulation of +human-understandable, semantic face features, such as nose length or mouth +width, which are defined by different groups of facial landmarks. In contrast +to existing methods, the use of facial landmarks enables precise measurement of +the facial features, which then enables training SeFFeC without any manually +annotated labels. SeFFeC consists of a transformer-based encoder network that +takes a latent vector of a pre-trained generative model and a facial feature +embedding as input, and learns to modify the latent vector to perform the +desired face edit operation. To ensure that the desired feature measurement is +changed towards the target value without altering uncorrelated features, we +introduced a novel semantic face feature loss. Qualitative and quantitative +results show that SeFFeC enables precise and fine-grained control of 23 facial +features, some of which could not previously be controlled by other methods, +without requiring manual annotations. Unlike existing methods, SeFFeC also +provides deterministic control over the exact values of the facial features and +more localised and disentangled face edits. + +
+
+
+
+
+ + ☆ ConGeo: Robust Cross-view Geo-localization across Ground View Variations + + +
+ Cross-view geo-localization aims at localizing a ground-level query image by +matching it to its corresponding geo-referenced aerial view. In real-world +scenarios, the task requires accommodating diverse ground images captured by +users with varying orientations and reduced field of views (FoVs). However, +existing learning pipelines are orientation-specific or FoV-specific, demanding +separate model training for different ground view variations. Such models +heavily depend on the North-aligned spatial correspondence and predefined FoVs +in the training data, compromising their robustness across different settings. +To tackle this challenge, we propose ConGeo, a single- and cross-modal +Contrastive method for Geo-localization: it enhances robustness and consistency +in feature representations to improve a model's invariance to orientation and +its resilience to FoV variations, by enforcing proximity between ground view +variations of the same location. As a generic learning objective for cross-view +geo-localization, when integrated into state-of-the-art pipelines, ConGeo +significantly boosts the performance of three base models on four +geo-localization benchmarks for diverse ground view variations and outperforms +competing methods that train separate models for each ground view variation. + +
+
+ comment: Project page at https://chasel-tsui.github.io/ConGeo/ +
+
+
+
+
+ + ☆ ACDG-VTON: Accurate and Contained Diffusion Generation for Virtual + Try-On + + +
+ Virtual Try-on (VTON) involves generating images of a person wearing selected
+garments. Diffusion-based methods, in particular, can create high-quality
+images, but they struggle to maintain the identities of the input garments. We
+find that this problem stems from specifics of the diffusion training
+formulation. To address this, we propose a unique training scheme that limits
+the scope in which diffusion is trained. We use a control image that perfectly
+aligns with the target image during training. In turn, this accurately
+preserves garment details during inference. We demonstrate that our method not
+only effectively preserves garment details but also allows for layering,
+styling, and shoe try-on. Our method runs multi-garment try-on in a single
+inference cycle and can support high-quality zoomed-in generations without
+training at higher resolutions. Finally, we show that our method surpasses
+prior methods in accuracy and quality.
+
+
+
+
+
+ + ☆ Enhancing Fingerprint Image Synthesis with GANs, Diffusion Models, and + Style Transfer Techniques + + +
+ We present novel approaches involving generative adversarial networks and +diffusion models in order to synthesize high quality, live and spoof +fingerprint images while preserving features such as uniqueness and diversity. +We generate live fingerprints from noise with a variety of methods, and we use +image translation techniques to translate live fingerprint images to spoof. To +generate different types of spoof images based on limited training data we +incorporate style transfer techniques through a cycle autoencoder equipped with +a Wasserstein metric along with Gradient Penalty (CycleWGAN-GP) in order to +avoid mode collapse and instability. We find that when the spoof training data +includes distinct spoof characteristics, it leads to improved live-to-spoof +translation. We assess the diversity and realism of the generated live +fingerprint images mainly through the Fr\'echet Inception Distance (FID) and +the False Acceptance Rate (FAR). Our best diffusion model achieved a FID of +15.78. The comparable WGAN-GP model achieved slightly higher FID while +performing better in the uniqueness assessment due to a slightly lower FAR when +matched against the training data, indicating better creativity. Moreover, we +give example images showing that a DDPM model clearly can generate realistic +fingerprint images. + +
+
+
+
+
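The fingerprint-synthesis abstract above relies on a Wasserstein GAN with Gradient Penalty (WGAN-GP) to avoid mode collapse and instability. The snippet below is a generic, self-contained sketch of the standard WGAN-GP gradient penalty term, not the paper's model; the toy critic is only a placeholder.

```python
# Generic WGAN-GP gradient penalty (illustrative; critic is a stand-in).
import torch

def gradient_penalty(critic, real, fake, gp_weight=10.0):
    b = real.size(0)
    eps = torch.rand(b, 1, 1, 1, device=real.device)
    interp = (eps * real + (1 - eps) * fake).requires_grad_(True)
    scores = critic(interp)
    grads = torch.autograd.grad(outputs=scores.sum(), inputs=interp,
                                create_graph=True)[0]
    grad_norm = grads.view(b, -1).norm(2, dim=1)
    return gp_weight * ((grad_norm - 1.0) ** 2).mean()

# Toy usage with a placeholder critic on fake "fingerprint" tensors.
critic = torch.nn.Sequential(torch.nn.Conv2d(1, 8, 4, 2, 1), torch.nn.Flatten(),
                             torch.nn.LazyLinear(1))
real = torch.randn(4, 1, 64, 64)
fake = torch.randn(4, 1, 64, 64)
gp = gradient_penalty(critic, real, fake)
```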
+ + ☆ CoMo: Controllable Motion Generation through Language Guided Pose Code + Editing + + +
+ Text-to-motion models excel at efficient human motion generation, but +existing approaches lack fine-grained controllability over the generation +process. Consequently, modifying subtle postures within a motion or inserting +new actions at specific moments remains a challenge, limiting the applicability +of these methods in diverse scenarios. In light of these challenges, we +introduce CoMo, a Controllable Motion generation model, adept at accurately +generating and editing motions by leveraging the knowledge priors of large +language models (LLMs). Specifically, CoMo decomposes motions into discrete and +semantically meaningful pose codes, with each code encapsulating the semantics +of a body part, representing elementary information such as "left knee slightly +bent". Given textual inputs, CoMo autoregressively generates sequences of pose +codes, which are then decoded into 3D motions. Leveraging pose codes as +interpretable representations, an LLM can directly intervene in motion editing +by adjusting the pose codes according to editing instructions. Experiments +demonstrate that CoMo achieves competitive performance in motion generation +compared to state-of-the-art models while, in human studies, CoMo substantially +surpasses previous work in motion editing abilities. + +
+
+
+
+
+ + ☆ Towards Learning Contrast Kinetics with Multi-Condition Latent Diffusion + Models + + +
+ Contrast agents in dynamic contrast enhanced magnetic resonance imaging make
+it possible to localize tumors and observe their contrast kinetics, which is
+essential for cancer characterization and respective treatment
+decision-making. However, contrast agent administration is not only associated
+with adverse health risks but is also restricted for patients during pregnancy
+and for those with kidney malfunction or other adverse reactions. With contrast
+uptake as a key biomarker for lesion malignancy, cancer recurrence risk, and
+treatment response, it becomes pivotal to reduce the dependency on intravenous
+contrast agent administration. To this end, we propose a multi-conditional
+latent diffusion model capable of acquisition time-conditioned image synthesis
+of DCE-MRI temporal sequences. To evaluate medical image synthesis, we
+additionally propose and validate the Fréchet radiomics distance as an image
+quality measure based on biomarker variability between synthetic and real
+imaging data. Our results demonstrate our method's ability to generate
+realistic multi-sequence fat-saturated breast DCE-MRI and uncover the emerging
+potential of deep learning-based contrast kinetics simulation. We publicly
+share our accessible codebase at https://github.com/RichardObi/ccnet.
+
+
+
+
+
+
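The proposed Fréchet radiomics distance follows the usual Fréchet distance between Gaussians fitted to two feature sets. As a rough illustration (using generic feature vectors rather than radiomics features, and not the authors' code), the distance can be computed as follows.

```python
# Illustrative Frechet distance between two feature distributions.
import numpy as np
from scipy.linalg import sqrtm

def frechet_distance(feats_real, feats_synth):
    mu1, mu2 = feats_real.mean(0), feats_synth.mean(0)
    c1 = np.cov(feats_real, rowvar=False)
    c2 = np.cov(feats_synth, rowvar=False)
    covmean = sqrtm(c1 @ c2)
    if np.iscomplexobj(covmean):        # drop tiny imaginary parts from sqrtm
        covmean = covmean.real
    return float(((mu1 - mu2) ** 2).sum() + np.trace(c1 + c2 - 2 * covmean))

# Toy usage with random stand-ins for real and synthetic feature sets.
rng = np.random.default_rng(0)
print(frechet_distance(rng.normal(size=(200, 16)), rng.normal(size=(200, 16))))
```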
+ + ☆ ExMap: Leveraging Explainability Heatmaps for Unsupervised Group + Robustness to Spurious Correlations + + +
+ Group robustness strategies aim to mitigate learned biases in deep learning
+models that arise from spurious correlations present in their training
+datasets. However, most existing methods rely on access to the label
+distribution of the groups, which is time-consuming and expensive to obtain. As
+a result, unsupervised group robustness strategies are sought. Based on the
+insight that a trained model's classification strategies can be inferred
+accurately based on explainability heatmaps, we introduce ExMap, an
+unsupervised two-stage mechanism designed to enhance group robustness in
+traditional classifiers. ExMap utilizes a clustering module to infer
+pseudo-labels based on a model's explainability heatmaps, which are then used
+during training in lieu of actual labels. Our empirical studies validate the
+efficacy of ExMap: we demonstrate that it bridges the performance gap with its
+supervised counterparts and outperforms existing partially supervised and
+unsupervised methods. Additionally, ExMap can be seamlessly integrated with
+existing group robustness learning strategies. Finally, we demonstrate its
+potential in tackling the emerging issue of multiple shortcut mitigation. Code
+is available at https://github.com/rwchakra/exmap.
+
+
+
+
+
+
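ExMap's first stage, as described above, clusters explainability heatmaps into pseudo-group labels that replace the unavailable group annotations. A bare-bones sketch of that stage is shown below; the random heatmaps, the number of groups, and the choice of k-means are illustrative assumptions, not the authors' exact pipeline.

```python
# Sketch: pseudo-group labels from clustering saliency/explainability heatmaps.
import numpy as np
from sklearn.cluster import KMeans

def pseudo_groups_from_heatmaps(heatmaps, n_groups=2):
    """heatmaps: (N, H, W) saliency maps produced by a trained classifier."""
    flat = heatmaps.reshape(len(heatmaps), -1)
    flat = flat / (np.linalg.norm(flat, axis=1, keepdims=True) + 1e-8)
    return KMeans(n_clusters=n_groups, n_init=10, random_state=0).fit_predict(flat)

# Toy usage; real heatmaps would come from e.g. a GradCAM-style explainer.
heatmaps = np.random.rand(100, 7, 7)
groups = pseudo_groups_from_heatmaps(heatmaps)   # used in lieu of group labels
```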
+ + ☆ Building Optimal Neural Architectures using Interpretable Knowledge CVPR'24 + + +
+ Neural Architecture Search is a costly practice. The fact that a search space +can span a vast number of design choices with each architecture evaluation +taking nontrivial overhead makes it hard for an algorithm to sufficiently +explore candidate networks. In this paper, we propose AutoBuild, a scheme which +learns to align the latent embeddings of operations and architecture modules +with the ground-truth performance of the architectures they appear in. By doing +so, AutoBuild is capable of assigning interpretable importance scores to +architecture modules, such as individual operation features and larger macro +operation sequences such that high-performance neural networks can be +constructed without any need for search. Through experiments performed on +state-of-the-art image classification, segmentation, and Stable Diffusion +models, we show that by mining a relatively small set of evaluated +architectures, AutoBuild can learn to build high-quality architectures directly +or help to reduce search space to focus on relevant areas, finding better +architectures that outperform both the original labeled ones and ones found by +search baselines. Code available at +https://github.com/Ascend-Research/AutoBuild + +
+
+ comment: CVPR'24; 18 Pages, 18 Figures, 3 Tables +
+
+
+
+
+ + ☆ Text-to-3D Shape Generation + + +
+ Recent years have seen an explosion of work and interest in text-to-3D shape +generation. Much of the progress is driven by advances in 3D representations, +large-scale pretraining and representation learning for text and image data +enabling generative AI models, and differentiable rendering. Computational +systems that can perform text-to-3D shape generation have captivated the +popular imagination as they enable non-expert users to easily create 3D content +directly from text. However, there are still many limitations and challenges +remaining in this problem space. In this state-of-the-art report, we provide a +survey of the underlying technology and methods enabling text-to-3D shape +generation to summarize the background literature. We then derive a systematic +categorization of recent work on text-to-3D shape generation based on the type +of supervision data required. Finally, we discuss limitations of the existing +categories of methods, and delineate promising directions for future work. + +
+
+
+
+
+ + ☆ AdaViPro: Region-based Adaptive Visual Prompt for Large-Scale Models + Adapting ICIP 2024 + + +
+ Recently, prompt-based methods have emerged as a new alternative
+'parameter-efficient fine-tuning' paradigm, which only fine-tunes a small
+number of additional parameters while keeping the original model frozen.
+However, despite achieving notable results, existing prompt methods mainly
+focus on 'what to add', while overlooking the equally important aspect of
+'where to add', typically relying on manually crafted placement. To this
+end, we propose a region-based Adaptive Visual Prompt, named AdaViPro, which
+integrates the 'where to add' optimization of the prompt into the learning
+process. Specifically, we reconceptualize the 'where to add' optimization as a
+problem of regional decision-making. During inference, AdaViPro generates a
+regionalized mask map for the whole image, composed of 0s and 1s, to
+designate whether to apply or discard the prompt in each specific area. Since
+these regional decisions are discrete, we employ Gumbel-Softmax sampling to
+enable AdaViPro's end-to-end learning through standard back-propagation.
+Extensive experiments demonstrate that our AdaViPro yields new efficiency and
+accuracy trade-offs for adapting pre-trained models.
+
+
+
+ comment: Submitted to ICIP 2024 +
+
+
+
+
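The abstract above frames prompt placement as a per-region keep/discard decision made differentiable with Gumbel-Softmax. The following sketch shows the general pattern of such a region gate in PyTorch; the module structure, feature sizes, and region count are hypothetical, not AdaViPro's actual design.

```python
# Minimal sketch of a Gumbel-Softmax region gate for binary keep/drop masks.
import torch
import torch.nn.functional as F

class RegionGate(torch.nn.Module):
    def __init__(self, in_dim, n_regions):
        super().__init__()
        # Two logits per region: "keep prompt" vs. "discard prompt".
        self.logits = torch.nn.Linear(in_dim, n_regions * 2)
        self.n_regions = n_regions

    def forward(self, feat, tau=1.0):
        logits = self.logits(feat).view(-1, self.n_regions, 2)
        # Hard one-hot decisions in the forward pass, soft gradients backward.
        onehot = F.gumbel_softmax(logits, tau=tau, hard=True)
        return onehot[..., 0]               # (B, n_regions) binary mask

# Toy usage: gate a 4x4 grid of prompt regions from a global image feature.
gate = RegionGate(in_dim=128, n_regions=16)
mask = gate(torch.randn(4, 128))
```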
+ + ☆ SC-Tune: Unleashing Self-Consistent Referential Comprehension in Large + Vision Language Models CVPR2024 + + +
+ Recent trends in Large Vision Language Models (LVLMs) research have been +increasingly focusing on advancing beyond general image understanding towards +more nuanced, object-level referential comprehension. In this paper, we present +and delve into the self-consistency capability of LVLMs, a crucial aspect that +reflects the models' ability to both generate informative captions for specific +objects and subsequently utilize these captions to accurately re-identify the +objects in a closed-loop process. This capability significantly mirrors the +precision and reliability of fine-grained visual-language understanding. Our +findings reveal that the self-consistency level of existing LVLMs falls short +of expectations, posing limitations on their practical applicability and +potential. To address this gap, we introduce a novel fine-tuning paradigm named +Self-Consistency Tuning (SC-Tune). It features the synergistic learning of a +cyclic describer-locator system. This paradigm is not only data-efficient but +also exhibits generalizability across multiple LVLMs. Through extensive +experiments, we demonstrate that SC-Tune significantly elevates performance +across a spectrum of object-level vision-language benchmarks and maintains +competitive or improved performance on image-level vision-language benchmarks. +Both our model and code will be publicly available at +https://github.com/ivattyue/SC-Tune. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ SAMCT: Segment Any CT Allowing Labor-Free Task-Indicator Prompts + + +
+ Segment anything model (SAM), a foundation model with superior versatility
+and generalization across diverse segmentation tasks, has attracted widespread
+attention in medical imaging. However, it has been shown that SAM suffers
+severe performance degradation due to its lack of medical knowledge in training
+and its limited local feature encoding. Though several SAM-based models have
+been proposed for tuning SAM in medical imaging, they still suffer from
+insufficient feature extraction and rely heavily on high-quality prompts. In
+this paper, we construct a large CT dataset consisting of 1.1M CT images and 5M
+masks from public datasets and propose a powerful foundation model, SAMCT,
+allowing labor-free prompts. Specifically, based on SAM, SAMCT is further
+equipped with a U-shaped CNN image encoder, a cross-branch interaction module,
+and a task-indicator prompt encoder. The U-shaped CNN image encoder works in
+parallel with the ViT image encoder in SAM to supplement local features.
+Cross-branch interaction enhances the feature expression capability of the CNN
+image encoder and the ViT image encoder by exchanging global perception and
+local features from one to the other. The task-indicator prompt encoder is a
+plug-and-play component to effortlessly encode task-related indicators into
+prompt embeddings. In this way, SAMCT can work in an automatic manner in
+addition to the semi-automatic interactive strategy in SAM. Extensive
+experiments demonstrate the superiority of SAMCT against the state-of-the-art
+task-specific and SAM-based medical foundation models on various tasks. The
+code, data, and models are released at https://github.com/xianlin7/SAMCT.
+
+
+
+
+
+
+ + ☆ A Unified and General Framework for Continual Learning ICLR 2024 + + +
+ Continual Learning (CL) focuses on learning from dynamic and changing data
+distributions while retaining previously acquired knowledge. Various methods
+have been developed to address the challenge of catastrophic forgetting,
+including regularization-based, Bayesian-based, and memory-replay-based
+techniques. However, these methods lack a unified framework and common
+terminology for describing their approaches. This research aims to bridge this
+gap by introducing a comprehensive and overarching framework that encompasses
+and reconciles these existing methodologies. Notably, this new framework is
+capable of encompassing established CL approaches as special instances within a
+unified and general optimization objective. An intriguing finding is that
+despite their diverse origins, these methods share common mathematical
+structures. This observation highlights the compatibility of these seemingly
+distinct techniques, revealing their interconnectedness through a shared
+underlying optimization objective. Moreover, the proposed general framework
+introduces an innovative concept called refresh learning, specifically designed
+to enhance the CL performance. This novel approach draws inspiration from
+neuroscience, where the human brain often sheds outdated information to improve
+the retention of crucial knowledge and facilitate the acquisition of new
+information. In essence, refresh learning operates by initially unlearning
+current data and subsequently relearning it. It serves as a versatile plug-in
+that seamlessly integrates with existing CL methods, offering an adaptable and
+effective enhancement to the learning process. Extensive experiments on CL
+benchmarks and theoretical analysis demonstrate the effectiveness of the
+proposed refresh learning. Code is available at
+https://github.com/joey-wang123/CL-refresh-learning.
+
+
+
+ comment: ICLR 2024 +
+
+
+
+
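Refresh learning, as summarized above, first unlearns the current data and then relearns it. A speculative reading of that recipe is sketched below as a gradient-ascent step followed by a standard descent step on the same batch; the unlearning rule and step size are assumptions, not the paper's exact procedure (see the linked repository for the official one).

```python
# Hedged sketch of an "unlearn, then relearn" training step.
import torch

def refresh_step(model, loss_fn, x, y, opt, unlearn_lr=1e-4):
    params = [p for p in model.parameters() if p.requires_grad]
    # 1) Unlearn: take a small gradient-*ascent* step on the current batch.
    grads = torch.autograd.grad(loss_fn(model(x), y), params)
    with torch.no_grad():
        for p, g in zip(params, grads):
            p.add_(unlearn_lr * g)
    # 2) Relearn: a standard gradient-descent step on the same batch.
    opt.zero_grad()
    loss_fn(model(x), y).backward()
    opt.step()

# Toy usage with a linear classifier.
model = torch.nn.Linear(10, 2)
opt = torch.optim.SGD(model.parameters(), lr=1e-2)
refresh_step(model, torch.nn.functional.cross_entropy,
             torch.randn(16, 10), torch.randint(0, 2, (16,)), opt)
```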
+ + ☆ Mora: Enabling Generalist Video Generation via A Multi-Agent Framework + + +
+ Sora is the first large-scale generalist video generation model that garnered
+significant attention across society. Since its launch by OpenAI in February
+2024, no other video generation models have paralleled Sora's performance or
+its capacity to support a broad spectrum of video generation tasks.
+Additionally, there are only a few fully published video generation models,
+with the majority being closed-source. To address this gap, this paper proposes
+a new multi-agent framework, Mora, which incorporates several advanced visual
+AI agents to replicate the generalist video generation demonstrated by Sora. In
+particular, Mora can utilize multiple visual agents and successfully mimic
+Sora's video generation capabilities in various tasks, such as (1)
+text-to-video generation, (2) text-conditional image-to-video generation, (3)
+extending generated videos, (4) video-to-video editing, (5) connecting videos,
+and (6) simulating digital worlds. Our extensive experimental results show that
+Mora achieves performance close to that of Sora in various tasks. However,
+there exists an obvious performance gap between our work and Sora when assessed
+holistically. In summary, we hope this project can guide the future trajectory
+of video generation through collaborative AI agents.
+
+
+
+
+
+
+ + ☆ Beyond Skeletons: Integrative Latent Mapping for Coherent 4D Sequence + Generation + + +
+ Directly learning to model 4D content, including shape, color and motion, is
+challenging. Existing methods depend on skeleton-based motion control and offer
+limited continuity in detail. To address this, we propose a novel framework
+that, through integrative latent mapping, generates coherent 4D sequences
+animating 3D shapes under given conditions, with shape and color evolving
+dynamically over time. We first employ an integrative latent unified
+representation to encode shape and color information of each detailed 3D
+geometry frame. The proposed skeleton-free latent 4D sequence joint
+representation allows us to leverage diffusion models in a low-dimensional
+space to control the generation of 4D sequences. Finally, temporally coherent
+4D sequences are generated that conform well to the input images and text
+prompts. Extensive experiments on the ShapeNet, 3DBiCar and DeformingThings4D
+datasets for several tasks demonstrate that our method effectively learns to
+generate quality 3D shapes with color and 4D mesh animations, improving over
+the current state-of-the-art. Source code will be released.
+
+
+
+
+
+
+ + ☆ Self-Attention Based Semantic Decomposition in Vector Symbolic + Architectures + + +
+ Vector Symbolic Architectures (VSAs) have emerged as a novel framework for +enabling interpretable machine learning algorithms equipped with the ability to +reason and explain their decision processes. The basic idea is to represent +discrete information through high dimensional random vectors. Complex data +structures can be built up with operations over vectors such as the "binding" +operation involving element-wise vector multiplication, which associates data +together. The reverse task of decomposing the associated elements is a +combinatorially hard task, with an exponentially large search space. The main +algorithm for performing this search is the resonator network, inspired by +Hopfield network-based memory search operations. + In this work, we introduce a new variant of the resonator network, based on +self-attention based update rules in the iterative search problem. This update +rule, based on the Hopfield network with log-sum-exp energy function and +norm-bounded states, is shown to substantially improve the performance and rate +of convergence. As a result, our algorithm enables a larger capacity for +associative memory, enabling applications in many tasks like perception based +pattern recognition, scene decomposition, and object reasoning. We substantiate +our algorithm with a thorough evaluation and comparisons to baselines. + +
+
+
+
+
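For readers unfamiliar with VSAs, the toy example below shows element-wise binding of bipolar hypervectors and a conventional resonator-style iterative decomposition (not the self-attention-based variant proposed above); the dimension, codebook sizes, and iteration count are arbitrary choices.

```python
# Toy VSA binding and resonator-style factorization with bipolar hypervectors.
import numpy as np

rng = np.random.default_rng(0)
D, K = 2048, 20                                   # dimension, codebook size
A = rng.choice([-1, 1], size=(K, D))              # codebook for factor 1
B = rng.choice([-1, 1], size=(K, D))              # codebook for factor 2
s = A[3] * B[7]                                   # "binding": element-wise product

x_hat, y_hat = A.mean(0), B.mean(0)               # superposition initial guesses
for _ in range(30):
    # Unbind with the current estimate of the other factor, then clean up by
    # projecting onto the codebook and re-superposing (sign nonlinearity).
    x_hat = np.sign(A.T @ (A @ (s * y_hat)))
    y_hat = np.sign(B.T @ (B @ (s * x_hat)))

# Typically recovers the bound codebook indices (here 3 and 7).
print(int(np.argmax(A @ x_hat)), int(np.argmax(B @ y_hat)))
```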
+ + ☆ Nellie: Automated organelle segmentation, tracking, and hierarchical + feature extraction in 2D/3D live-cell microscopy + + +
+ The analysis of dynamic organelles remains a formidable challenge, though key +to understanding biological processes. We introduce Nellie, an automated and +unbiased pipeline for segmentation, tracking, and feature extraction of diverse +intracellular structures. Nellie adapts to image metadata, eliminating user +input. Nellie's preprocessing pipeline enhances structural contrast on multiple +intracellular scales allowing for robust hierarchical segmentation of +sub-organellar regions. Internal motion capture markers are generated and +tracked via a radius-adaptive pattern matching scheme, and used as guides for +sub-voxel flow interpolation. Nellie extracts a plethora of features at +multiple hierarchical levels for deep and customizable analysis. Nellie +features a Napari-based GUI that allows for code-free operation and +visualization, while its modular open-source codebase invites customization by +experienced users. We demonstrate Nellie's wide variety of use cases with two +examples: unmixing multiple organelles from a single channel using +feature-based classification and training an unsupervised graph autoencoder on +mitochondrial multi-mesh graphs to quantify latent space embedding changes +following ionomycin treatment. + +
+
+ comment: for associated code, see https://github.com/aelefebv/nellie; 82 + pages, 5 main figures, 11 extended figures +
+
+
+
+
+ + ☆ Learning to Infer Generative Template Programs for Visual Concepts + + +
+ People grasp flexible visual concepts from a few examples. We explore a +neurosymbolic system that learns how to infer programs that capture visual +concepts in a domain-general fashion. We introduce Template Programs: +programmatic expressions from a domain-specific language that specify +structural and parametric patterns common to an input concept. Our framework +supports multiple concept-related tasks, including few-shot generation and +co-segmentation through parsing. We develop a learning paradigm that allows us +to train networks that infer Template Programs directly from visual datasets +that contain concept groupings. We run experiments across multiple visual +domains: 2D layouts, Omniglot characters, and 3D shapes. We find that our +method outperforms task-specific alternatives, and performs competitively +against domain-specific approaches for the limited domains where they exist. + +
+
+
+
+
+ + ☆ EC-IoU: Orienting Safety for Object Detectors via Ego-Centric + Intersection-over-Union IROS 2024 + + +
+ This paper presents safety-oriented object detection via a novel Ego-Centric +Intersection-over-Union (EC-IoU) measure, addressing practical concerns when +applying state-of-the-art learning-based perception models in safety-critical +domains such as autonomous driving. Concretely, we propose a weighting +mechanism to refine the widely used IoU measure, allowing it to assign a higher +score to a prediction that covers closer points of a ground-truth object from +the ego agent's perspective. The proposed EC-IoU measure can be used in typical +evaluation processes to select object detectors with higher safety-related +performance for downstream tasks. It can also be integrated into common loss +functions for model fine-tuning. While geared towards safety, our experiment +with the KITTI dataset demonstrates the performance of a model trained on +EC-IoU can be better than that of a variant trained on IoU in terms of mean +Average Precision as well. + +
+
+ comment: 8 pages (IEEE double column format), 7 figures, 2 tables, submitted + to IROS 2024 +
+
+
+
+
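EC-IoU, as described above, reweights the IoU so that covering ground-truth points closer to the ego vehicle counts more. The sketch below is a simplified, discretized 2D interpretation with an assumed distance-based weighting; it is not the paper's formulation and uses axis-aligned BEV boxes only for illustration.

```python
# Simplified ego-centric weighted IoU over a discretized bird's-eye-view grid.
import numpy as np

def ec_iou(pred, gt, ego=(0.0, 0.0), res=0.05, alpha=2.0):
    """pred, gt: axis-aligned boxes (x1, y1, x2, y2) in ego/BEV coordinates."""
    x1, y1 = min(pred[0], gt[0]), min(pred[1], gt[1])
    x2, y2 = max(pred[2], gt[2]), max(pred[3], gt[3])
    xs, ys = np.meshgrid(np.arange(x1, x2, res), np.arange(y1, y2, res))
    pts = np.stack([xs.ravel(), ys.ravel()], axis=1)
    # Cells nearer to the ego position get a larger weight (assumed scheme).
    w = 1.0 / (1.0 + alpha * np.linalg.norm(pts - np.asarray(ego), axis=1))
    in_p = ((pts[:, 0] >= pred[0]) & (pts[:, 0] <= pred[2]) &
            (pts[:, 1] >= pred[1]) & (pts[:, 1] <= pred[3]))
    in_g = ((pts[:, 0] >= gt[0]) & (pts[:, 0] <= gt[2]) &
            (pts[:, 1] >= gt[1]) & (pts[:, 1] <= gt[3]))
    inter = w[in_p & in_g].sum()
    union = w[in_p | in_g].sum()
    return inter / union if union > 0 else 0.0

# A prediction shifted away from the ego scores lower than plain IoU suggests.
print(ec_iou(pred=(4.0, -1.0, 8.0, 1.0), gt=(5.0, -1.0, 9.0, 1.0)))
```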
+ + ☆ Using Super-Resolution Imaging for Recognition of Low-Resolution Blurred + License Plates: A Comparative Study of Real-ESRGAN, A-ESRGAN, and StarSRGAN + + +
+ License plate recognition is now widely applied in scenarios such as road
+monitoring, tracking of stolen vehicles, and detection at parking lot entrances
+and exits. However, these applications work reliably only when the license
+plate is clear enough for the system to recognize the correct plate number; if
+the plate is blurred by external factors, recognition accuracy drops sharply.
+Although Taiwan has many road surveillance cameras, most are of limited
+quality, and license plate numbers often cannot be recognized due to low photo
+resolution. This study therefore focuses on using super-resolution technology
+to process blurred license plates. We fine-tune three super-resolution models,
+Real-ESRGAN, A-ESRGAN, and StarSRGAN, and compare their effectiveness in
+enhancing the resolution of license plate photos and enabling accurate license
+plate recognition. By comparing these models, we aim to identify the most
+suitable one for this task and provide a useful reference for future
+researchers.
+
+
+
+ comment: Master's thesis +
+
+
+
+
+ + ♻ ☆ AnyHome: Open-Vocabulary Generation of Structured and Textured 3D Homes + + +
+ Inspired by cognitive theories, we introduce AnyHome, a framework that +translates any text into well-structured and textured indoor scenes at a +house-scale. By prompting Large Language Models (LLMs) with designed templates, +our approach converts provided textual narratives into amodal structured +representations. These representations guarantee consistent and realistic +spatial layouts by directing the synthesis of a geometry mesh within defined +constraints. A Score Distillation Sampling process is then employed to refine +the geometry, followed by an egocentric inpainting process that adds lifelike +textures to it. AnyHome stands out with its editability, customizability, +diversity, and realism. The structured representations for scenes allow for +extensive editing at varying levels of granularity. Capable of interpreting +texts ranging from simple labels to detailed narratives, AnyHome generates +detailed geometries and textures that outperform existing methods in both +quantitative and qualitative measures. + +
+
+
+
+
+ + ♻ ☆ Magic-Me: Identity-Specific Video Customized Diffusion + + +
+ Creating content with specified identities (ID) has attracted significant
+interest in the field of generative models. In the field of text-to-image
+generation (T2I), subject-driven creation has achieved great progress with the
+identity controlled via reference images. However, its extension to video
+generation is not well explored. In this work, we propose a simple yet
+effective subject identity controllable video generation framework, termed
+Video Custom Diffusion (VCD). With a specified identity defined by a few
+images, VCD reinforces the identity characteristics and injects frame-wise
+correlation at the initialization stage for stable video outputs. To achieve
+this, we propose three novel components that are essential for high-quality
+identity preservation and stable video generation: 1) a noise initialization
+method with 3D Gaussian Noise Prior for better inter-frame stability; 2) an ID
+module based on extended Textual Inversion trained with the cropped identity to
+disentangle the ID information from the background; and 3) Face VCD and Tiled
+VCD modules to reinforce faces and upscale the video to higher resolution while
+preserving the identity's features. We conducted extensive experiments to
+verify that VCD is able to generate stable videos with better identity
+preservation than the baselines. Besides, with the transferability of the
+encoded identity in the ID module, VCD also works well with publicly available
+personalized text-to-image models. The codes are available at
+https://github.com/Zhen-Dong/Magic-Me.
+
+
+
+ comment: Project Page at https://magic-me-webpage.github.io +
+
+
+
+
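One ingredient listed above is a noise initialization that injects frame-wise correlation for stable video outputs. The snippet below illustrates the general idea of mixing a shared clip-level noise with per-frame noise while keeping unit variance per frame; the mixing rule and the `rho` coefficient are assumptions, not the paper's exact 3D Gaussian Noise Prior.

```python
# Hedged sketch: correlated noise initialization across video frames.
import torch

def correlated_video_noise(frames, channels, h, w, rho=0.5):
    base = torch.randn(1, channels, h, w)              # shared across the clip
    per_frame = torch.randn(frames, channels, h, w)    # independent per frame
    # Convex-in-variance mix keeps each frame's noise at unit variance.
    return rho * base + (1 - rho ** 2) ** 0.5 * per_frame

# Toy usage: initial latents for a 16-frame clip in a latent-diffusion setting.
eps = correlated_video_noise(frames=16, channels=4, h=64, w=64)
```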
+ + ♻ ☆ m&m's: A Benchmark to Evaluate Tool-Use for multi-step multi-modal Tasks + + +
+ Real-world multi-modal problems are rarely solved by a single machine +learning model, and often require multi-step computational plans that involve +stitching several models. Tool-augmented LLMs hold tremendous promise for +automating the generation of such computational plans. However, the lack of +standardized benchmarks for evaluating LLMs as planners for multi-step +multi-modal tasks has prevented a systematic study of planner design decisions. +Should LLMs generate a full plan in a single shot or step-by-step? Should they +invoke tools directly with Python code or through structured data formats like +JSON? Does feedback improve planning? To answer these questions and more, we +introduce m&m's: a benchmark containing 4K+ multi-step multi-modal tasks +involving 33 tools that include multi-modal models, (free) public APIs, and +image processing modules. For each of these task queries, we provide +automatically generated plans using this realistic toolset. We further provide +a high-quality subset of 1,565 task plans that are human-verified and correctly +executable. With m&m's, we evaluate 6 popular LLMs with 2 planning strategies +(multi-step vs. step-by-step planning), 2 plan formats (JSON vs. code), and 3 +types of feedback (parsing/verification/execution). Finally, we summarize +takeaways from our extensive experiments. Our dataset and code are available on +HuggingFace (https://huggingface.co/datasets/zixianma/mnms) and Github +(https://github.com/RAIVNLab/mnms). + +
+
+
+
+
+ + ♻ ☆ TrackDiffusion: Tracklet-Conditioned Video Generation via Diffusion + Models + + +
+ Despite remarkable achievements in video synthesis, achieving granular +control over complex dynamics, such as nuanced movement among multiple +interacting objects, still presents a significant hurdle for dynamic world +modeling, compounded by the necessity to manage appearance and disappearance, +drastic scale changes, and ensure consistency for instances across frames. +These challenges hinder the development of video generation that can faithfully +mimic real-world complexity, limiting utility for applications requiring +high-level realism and controllability, including advanced scene simulation and +training of perception systems. To address that, we propose TrackDiffusion, a +novel video generation framework affording fine-grained trajectory-conditioned +motion control via diffusion models, which facilitates the precise manipulation +of the object trajectories and interactions, overcoming the prevalent +limitation of scale and continuity disruptions. A pivotal component of +TrackDiffusion is the instance enhancer, which explicitly ensures inter-frame +consistency of multiple objects, a critical factor overlooked in the current +literature. Moreover, we demonstrate that generated video sequences by our +TrackDiffusion can be used as training data for visual perception models. To +the best of our knowledge, this is the first work to apply video diffusion +models with tracklet conditions and demonstrate that generated frames can be +beneficial for improving the performance of object trackers. + +
+
+
+
+
+ + ♻ ☆ PathMMU: A Massive Multimodal Expert-Level Benchmark for Understanding + and Reasoning in Pathology + + +
+ The emergence of large multimodal models has unlocked remarkable potential in
+AI, particularly in pathology. However, the lack of specialized, high-quality
+benchmarks has impeded their development and precise evaluation. To address
+this, we introduce PathMMU, the largest and highest-quality expert-validated
+pathology benchmark for Large Multimodal Models (LMMs). It comprises 33,428
+multimodal multi-choice questions and 24,067 images from various sources, each
+accompanied by an explanation for the correct answer. The construction of
+PathMMU harnesses GPT-4V's advanced capabilities, utilizing over 30,000
+image-caption pairs to enrich captions and generate corresponding Q&As in a
+cascading process. Significantly, to maximize PathMMU's authority, we invite
+seven pathologists to scrutinize each question under strict standards in
+PathMMU's validation and test sets, while simultaneously setting an
+expert-level performance benchmark for PathMMU. We conduct extensive
+evaluations, including zero-shot assessments of 14 open-sourced and 4
+closed-sourced LMMs and their robustness to image corruption. We also fine-tune
+representative LMMs to assess their adaptability to PathMMU. The empirical
+findings indicate that advanced LMMs struggle with the challenging PathMMU
+benchmark, with the top-performing LMM, GPT-4V, achieving only a 49.8%
+zero-shot performance, significantly lower than the 71.8% demonstrated by human
+pathologists. After fine-tuning, significantly smaller open-sourced LMMs can
+outperform GPT-4V but still fall short of the expertise shown by pathologists.
+We hope that PathMMU will offer valuable insights and foster the development of
+more specialized, next-generation LMMs for pathology.
+
+
+
+ comment: 27 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Jaccard Metric Losses: Optimizing the Jaccard Index with Soft Labels NeurIPS 2023 + + +
+ Intersection over Union (IoU) losses are surrogates that directly optimize
+the Jaccard index. Leveraging IoU losses as part of the loss function has
+demonstrated superior performance in semantic segmentation tasks compared to
+optimizing pixel-wise losses such as the cross-entropy loss alone. However, we
+identify a lack of flexibility in these losses to support vital training
+techniques like label smoothing, knowledge distillation, and semi-supervised
+learning, mainly due to their inability to process soft labels. To address
+this, we introduce Jaccard Metric Losses (JMLs), which are identical to the
+soft Jaccard loss in standard settings with hard labels but are fully
+compatible with soft labels. We apply JMLs to three prominent use cases of soft
+labels: label smoothing, knowledge distillation and semi-supervised learning,
+and demonstrate their potential to enhance model accuracy and calibration. Our
+experiments show consistent improvements over the cross-entropy loss across 4
+semantic segmentation datasets (Cityscapes, PASCAL VOC, ADE20K, DeepGlobe Land)
+and 13 architectures, including classic CNNs and recent vision transformers.
+Remarkably, our straightforward approach significantly outperforms
+state-of-the-art knowledge distillation and semi-supervised learning methods.
+The code is available at https://github.com/zifuwanggg/JDTLosses.
+
+
+
+ comment: NeurIPS 2023 +
+
+
+
+
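A soft-label-compatible Jaccard loss of the kind described above can be written with L1 norms so that it reduces to the usual soft Jaccard loss when labels are hard. The sketch below shows one such formulation for reference only; the JDTLosses repository linked in the abstract is the authoritative implementation.

```python
# One L1-based Jaccard-style loss that accepts soft labels (illustrative).
import torch

def jaccard_metric_style_loss(probs, targets, eps=1e-6):
    """probs, targets: (B, C, H, W); targets may be soft (e.g., smoothed) labels.

    For hard one-hot targets this reduces to 1 - |intersection| / |union|.
    """
    dims = (0, 2, 3)
    x, y = probs.sum(dims), targets.sum(dims)
    diff = (probs - targets).abs().sum(dims)
    jaccard = (x + y - diff + eps) / (x + y + diff + eps)
    return 1.0 - jaccard.mean()

# Toy usage with random predictions and stand-in soft labels.
probs = torch.softmax(torch.randn(2, 3, 32, 32), dim=1)
soft_t = torch.softmax(torch.randn(2, 3, 32, 32), dim=1)
loss = jaccard_metric_style_loss(probs, soft_t)
```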
+ + ♻ ☆ Periodic Vibration Gaussian: Dynamic Urban Scene Reconstruction and + Real-time Rendering + + +
+ Modeling dynamic, large-scale urban scenes is challenging due to their highly +intricate geometric structures and unconstrained dynamics in both space and +time. Prior methods often employ high-level architectural priors, separating +static and dynamic elements, resulting in suboptimal capture of their +synergistic interactions. To address this challenge, we present a unified +representation model, called Periodic Vibration Gaussian (PVG). PVG builds upon +the efficient 3D Gaussian splatting technique, originally designed for static +scene representation, by introducing periodic vibration-based temporal +dynamics. This innovation enables PVG to elegantly and uniformly represent the +characteristics of various objects and elements in dynamic urban scenes. To +enhance temporally coherent and large scene representation learning with sparse +training data, we introduce a novel temporal smoothing mechanism and a +position-aware adaptive control strategy respectively. Extensive experiments on +Waymo Open Dataset and KITTI benchmarks demonstrate that PVG surpasses +state-of-the-art alternatives in both reconstruction and novel view synthesis +for both dynamic and static scenes. Notably, PVG achieves this without relying +on manually labeled object bounding boxes or expensive optical flow estimation. +Moreover, PVG exhibits 900-fold acceleration in rendering over the best +alternative. + +
+
+ comment: Project page: https://fudan-zvg.github.io/PVG/ +
+
+
+
+
+ + ♻ ☆ Normalizing flow-based deep variational Bayesian network for seismic + multi-hazards and impacts estimation from InSAR imagery + + +
+ Onsite disasters like earthquakes can trigger cascading hazards and impacts, +such as landslides and infrastructure damage, leading to catastrophic losses; +thus, rapid and accurate estimates are crucial for timely and effective +post-disaster responses. Interferometric Synthetic aperture radar (InSAR) data +is important in providing high-resolution onsite information for rapid hazard +estimation. Most recent methods using InSAR imagery signals predict a single +type of hazard and thus often suffer low accuracy due to noisy and complex +signals induced by co-located hazards, impacts, and irrelevant environmental +changes (e.g., vegetation changes, human activities). We introduce a novel +stochastic variational inference with normalizing flows derived to jointly +approximate posteriors of multiple unobserved hazards and impacts from noisy +InSAR imagery. + +
+
+ comment: This paper needs to be reviewed by the USGS +
+
+
+
+
+ + ♻ ☆ Uncertainty-Aware Source-Free Adaptive Image Super-Resolution with + Wavelet Augmentation Transformer + + +
+ Unsupervised Domain Adaptation (UDA) can effectively address domain gap
+issues in real-world image Super-Resolution (SR) by accessing both the source
+and target data. Considering privacy policies or transmission restrictions of
+source data in practical scenarios, we propose a SOurce-free Domain Adaptation
+framework for image SR (SODA-SR) to address this issue, i.e., adapt a
+source-trained model to a target domain with only unlabeled target data.
+SODA-SR leverages the source-trained model to generate refined pseudo-labels
+for teacher-student learning. To better utilize pseudo-labels, we propose a
+novel wavelet-based augmentation method, named Wavelet Augmentation Transformer
+(WAT), which can be flexibly incorporated with existing networks, to implicitly
+produce useful augmented data. WAT learns low-frequency information of varying
+levels across diverse samples, which is aggregated efficiently via deformable
+attention. Furthermore, an uncertainty-aware self-training mechanism is
+proposed to improve the accuracy of pseudo-labels, with inaccurate predictions
+being rectified by uncertainty estimation. To acquire better SR results and
+avoid overfitting pseudo-labels, several regularization losses are proposed to
+constrain target LR and SR images in the frequency domain. Experiments show
+that without accessing source data, SODA-SR outperforms state-of-the-art UDA
+methods in both synthetic-to-real and real-to-real adaptation settings, and is
+not constrained by specific network architectures.
+
+
+
+ comment: 11 pages, 7 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Towards Architecture-Agnostic Untrained Network Priors for Image + Reconstruction with Frequency Regularization + + +
+ Untrained networks inspired by deep image prior have shown promising +capabilities in recovering a high-quality image from noisy or partial +measurements, without requiring training data. Their success has been widely +attributed to the spectral bias acting as an implicit regularization induced by +suitable network architectures. However, applications of such network-based +priors often entail superfluous architectural decisions, overfitting risks, and +slow optimization, all of which hinder their practicality. In this work, we +propose efficient, architecture-agnostic methods for a more direct frequency +control over the network priors: 1) constraining the bandwidth of the +white-noise input, 2) controlling the bandwidth of the interpolation-based +upsamplers, and 3) regularizing the Lipschitz constants of the layers. We show +that even with just one extra line of code, the overfitting issues in +underperforming architectures can be alleviated such that their performance +gaps with the high-performing counterparts can be largely closed despite their +distinct configurations, mitigating the need for architecture tuning. This then +makes it possible to employ a more compact model to achieve similar or superior +performance to larger models with greater efficiency. Our regularized network +priors compare favorably with current supervised and self-supervised methods on +MRI reconstruction and image inpainting tasks, serving as a stronger zero-shot +baseline reconstructor. Our code will be made publicly available. + +
+
+
+
+
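The first control knob mentioned above, constraining the bandwidth of the white-noise input, can be illustrated by low-pass filtering the random input before it is fed to an untrained network prior. The FFT-based filter and the cutoff below are illustrative choices under that assumption, not necessarily the paper's exact method.

```python
# Sketch: band-limited white-noise input for an untrained (deep-image-prior-style) network.
import torch

def lowpass_noise(shape, cutoff=0.1):
    """shape: (B, C, H, W); keep only spatial frequencies below `cutoff` (0..0.5)."""
    noise = torch.randn(shape)
    freq = torch.fft.rfft2(noise)
    fy = torch.fft.fftfreq(shape[-2]).abs().view(-1, 1)
    fx = torch.fft.rfftfreq(shape[-1]).abs().view(1, -1)
    mask = ((fy ** 2 + fx ** 2).sqrt() <= cutoff).float()
    return torch.fft.irfft2(freq * mask, s=shape[-2:])

# Toy usage: a smoother, band-limited input tensor for the untrained prior.
z = lowpass_noise((1, 32, 128, 128), cutoff=0.05)
```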
+ + ♻ ☆ Simple Semantic-Aided Few-Shot Learning CVPR 2024 + + +
+ Learning from a limited amount of data, namely Few-Shot Learning, stands out
+as a challenging computer vision task. Several works exploit semantics and
+design complicated semantic fusion mechanisms to compensate for rare
+representative features within restricted data. However, relying on naive
+semantics such as class names introduces biases due to their brevity, while
+acquiring extensive semantics from external knowledge takes substantial time
+and effort. This limitation severely constrains the potential of semantics in
+few-shot learning. In this paper, we design an automatic way called Semantic
+Evolution to generate high-quality semantics. The incorporation of high-quality
+semantics alleviates the need for complex network structures and learning
+algorithms used in previous works. Hence, we employ a simple two-layer network
+termed Semantic Alignment Network to transform semantics and visual features
+into robust class prototypes with rich discriminative features for few-shot
+classification. The experimental results show that our framework outperforms
+all previous methods on six benchmarks, demonstrating that a simple network
+with high-quality semantics can beat intricate multi-modal modules on few-shot
+classification tasks. Code is available at
+https://github.com/zhangdoudou123/SemFew.
+
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ MMICL: Empowering Vision-language Model with Multi-Modal In-Context + Learning ICLR2024 + + +
+ Since the resurgence of deep learning, vision-language models (VLMs) enhanced
+by large language models (LLMs) have grown exponentially in popularity.
+However, while LLMs can utilize extensive background knowledge and task
+information with in-context learning, most VLMs still struggle with
+understanding complex multi-modal prompts with multiple images, making VLMs
+less effective in downstream vision-language tasks. In this paper, we address
+the limitation above by 1) introducing a vision-language Model with Multi-Modal
+In-Context Learning (MMICL), a new approach to allow the VLM to deal with
+multi-modal inputs efficiently; 2) proposing a novel context scheme to augment
+the in-context learning ability of the VLM; 3) constructing the Multi-modal
+In-Context Learning (MIC) dataset, designed to enhance the VLM's ability to
+understand complex multi-modal prompts. Our experiments confirm that MMICL
+achieves new state-of-the-art zero-shot performance on a wide range of general
+vision-language tasks, especially for complex benchmarks, including MME and
+MMBench. Our analysis demonstrates that MMICL effectively tackles the challenge
+of complex multi-modal prompt understanding and exhibits impressive in-context
+learning ability. Furthermore, we observe that MMICL successfully alleviates
+language bias in VLMs, a common issue for VLMs that often leads to
+hallucination when faced with extensive textual context. Our code, dataset,
+dataset tool, and model are available at https://github.com/PKUnlp-icler/MIC
+
+
+
+ comment: Accepted by ICLR2024 +
+
+
+
+
+ + ♻ ☆ Multimodal Prompt Perceiver: Empower Adaptiveness, Generalizability and + Fidelity for All-in-One Image Restoration + + +
+ Despite substantial progress, all-in-one image restoration (IR) grapples with +persistent challenges in handling intricate real-world degradations. This paper +introduces MPerceiver: a novel multimodal prompt learning approach that +harnesses Stable Diffusion (SD) priors to enhance adaptiveness, +generalizability and fidelity for all-in-one image restoration. Specifically, +we develop a dual-branch module to master two types of SD prompts: textual for +holistic representation and visual for multiscale detail representation. Both +prompts are dynamically adjusted by degradation predictions from the CLIP image +encoder, enabling adaptive responses to diverse unknown degradations. Moreover, +a plug-in detail refinement module improves restoration fidelity via direct +encoder-to-decoder information transformation. To assess our method, MPerceiver +is trained on 9 tasks for all-in-one IR and outperforms state-of-the-art +task-specific methods across most tasks. Post multitask pre-training, +MPerceiver attains a generalized representation in low-level vision, exhibiting +remarkable zero-shot and few-shot capabilities in unseen tasks. Extensive +experiments on 16 IR tasks underscore the superiority of MPerceiver in terms of +adaptiveness, generalizability and fidelity. + +
+
+ comment: 13 pages, 8 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ Auto-Vocabulary Semantic Segmentation + + +
+ Open-ended image understanding tasks have gained significant attention from
+the research community, particularly with the emergence of Vision-Language
+Models. Open-Vocabulary Segmentation (OVS) methods are capable of performing
+semantic segmentation without relying on a fixed vocabulary, and in some cases,
+they operate without the need for training or fine-tuning. However, OVS methods
+typically require users to specify the vocabulary based on the task or dataset
+at hand. In this paper, we introduce Auto-Vocabulary Semantic Segmentation
+(AVS), advancing open-ended image understanding by eliminating the necessity to
+predefine object categories for segmentation. Our approach presents a framework
+that autonomously identifies relevant class names using enhanced BLIP
+embeddings, which are utilized for segmentation afterwards. Given that
+open-ended object category predictions cannot be directly compared with a fixed
+ground truth, we develop a Large Language Model-based Auto-Vocabulary Evaluator
+(LAVE) to efficiently evaluate the automatically generated class names and
+their corresponding segments. Our method sets new benchmarks on datasets such
+as PASCAL VOC and Context, ADE20K, and Cityscapes for AVS and showcases
+competitive performance to OVS methods that require specified class names.
+
+
+
+
+
+
+ + ♻ ☆ CoNeS: Conditional neural fields with shift modulation for + multi-sequence MRI translation + + +
+ Multi-sequence magnetic resonance imaging (MRI) has found wide applications +in both modern clinical studies and deep learning research. However, in +clinical practice, it frequently occurs that one or more of the MRI sequences +are missing due to different image acquisition protocols or contrast agent +contraindications of patients, limiting the utilization of deep learning models +trained on multi-sequence data. One promising approach is to leverage +generative models to synthesize the missing sequences, which can serve as a +surrogate acquisition. State-of-the-art methods tackling this problem are based +on convolutional neural networks (CNN) which usually suffer from spectral +biases, resulting in poor reconstruction of high-frequency fine details. In +this paper, we propose Conditional Neural fields with Shift modulation (CoNeS), +a model that takes voxel coordinates as input and learns a representation of +the target images for multi-sequence MRI translation. The proposed model uses a +multi-layer perceptron (MLP) instead of a CNN as the decoder for pixel-to-pixel +mapping. Hence, each target image is represented as a neural field that is +conditioned on the source image via shift modulation with a learned latent +code. Experiments on BraTS 2018 and an in-house clinical dataset of vestibular +schwannoma patients showed that the proposed method outperformed +state-of-the-art methods for multi-sequence MRI translation both visually and +quantitatively. Moreover, we conducted spectral analysis, showing that CoNeS +was able to overcome the spectral bias issue common in conventional CNN models. +To further evaluate the usage of synthesized images in clinical downstream +tasks, we tested a segmentation network using the synthesized images at +inference. + +
+
+ comment: Accepted for publication at the Journal of Machine Learning for + Biomedical Imaging (MELBA) https://melba-journal.org/2024:004 +
+
+
+
+
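CoNeS, per the abstract above, replaces the CNN decoder with a coordinate MLP whose hidden activations are shifted by a latent code derived from the source image. A minimal shift-modulated field is sketched below; the layer sizes, depth, and conditioning pathway are assumptions, not the paper's architecture.

```python
# Minimal coordinate MLP with shift modulation from a latent code.
import torch
import torch.nn as nn

class ShiftModulatedField(nn.Module):
    def __init__(self, coord_dim=2, latent_dim=64, hidden=128, depth=3, out_dim=1):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(coord_dim if i == 0 else hidden, hidden)
                                     for i in range(depth)])
        self.shifts = nn.ModuleList([nn.Linear(latent_dim, hidden) for _ in range(depth)])
        self.head = nn.Linear(hidden, out_dim)

    def forward(self, coords, latent):
        h = coords
        for layer, shift in zip(self.layers, self.shifts):
            # Shift modulation: add a latent-dependent offset to each hidden layer.
            h = torch.relu(layer(h) + shift(latent))
        return self.head(h)

# Toy usage: predict target-sequence intensities at sampled pixel coordinates.
field = ShiftModulatedField()
coords = torch.rand(1024, 2)
latent = torch.randn(1, 64).expand(1024, 64)   # code from the source sequence
intensity = field(coords, latent)
```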
+ + ♻ ☆ Dice Semimetric Losses: Optimizing the Dice Score with Soft Labels MICCAI 2023 + + +
+ The soft Dice loss (SDL) has taken a pivotal role in numerous automated +segmentation pipelines in the medical imaging community. Over the last years, +some reasons behind its superior functioning have been uncovered and further +optimizations have been explored. However, there is currently no implementation +that supports its direct utilization in scenarios involving soft labels. Hence, +a synergy between the use of SDL and research leveraging the use of soft +labels, also in the context of model calibration, is still missing. In this +work, we introduce Dice semimetric losses (DMLs), which (i) are by design +identical to SDL in a standard setting with hard labels, but (ii) can be +employed in settings with soft labels. Our experiments on the public QUBIQ, +LiTS and KiTS benchmarks confirm the potential synergy of DMLs with soft labels +(e.g. averaging, label smoothing, and knowledge distillation) over hard labels +(e.g. majority voting and random selection). As a result, we obtain superior +Dice scores and model calibration, which supports the wider adoption of DMLs in +practice. The code is available at https://github.com/zifuwanggg/JDTLosses + +
+
+ comment: MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Poly Kernel Inception Network for Remote Sensing Detection + + +
+ Object detection in remote sensing images (RSIs) often suffers from several +increasing challenges, including the large variation in object scales and the +diverse-ranging context. Prior methods tried to address these challenges by +expanding the spatial receptive field of the backbone, either through +large-kernel convolution or dilated convolution. However, the former typically +introduces considerable background noise, while the latter risks generating +overly sparse feature representations. In this paper, we introduce the Poly +Kernel Inception Network (PKINet) to handle the above challenges. PKINet +employs multi-scale convolution kernels without dilation to extract object +features of varying scales and capture local context. In addition, a Context +Anchor Attention (CAA) module is introduced in parallel to capture long-range +contextual information. These two components work jointly to advance the +performance of PKINet on four challenging remote sensing detection benchmarks, +namely DOTA-v1.0, DOTA-v1.5, HRSC2016, and DIOR-R. + +
+
+ comment: accepted by IEEE Conference on Computer Vision and Pattern + Recognition, 2024 +
+
+
+
+
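The abstract above describes mixing convolution kernels of several sizes, without dilation, to cover objects at different scales. The block below is a generic inception-style sketch of that idea; the channel counts, kernel sizes, and depthwise design are illustrative, not PKINet's exact architecture (which also adds the CAA attention module).

```python
# Generic multi-kernel (inception-style) block without dilation.
import torch
import torch.nn as nn

class PolyKernelBlock(nn.Module):
    def __init__(self, channels, kernel_sizes=(3, 5, 7, 9)):
        super().__init__()
        # One depthwise branch per kernel size; padding keeps spatial size.
        self.branches = nn.ModuleList([
            nn.Conv2d(channels, channels, k, padding=k // 2, groups=channels)
            for k in kernel_sizes])
        self.fuse = nn.Conv2d(channels * len(kernel_sizes), channels, 1)

    def forward(self, x):
        return self.fuse(torch.cat([b(x) for b in self.branches], dim=1))

# Toy usage: fused multi-scale features at the input resolution.
block = PolyKernelBlock(64)
out = block(torch.randn(2, 64, 32, 32))
```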
+ + ♻ ☆ Weakly supervised segmentation of intracranial aneurysms using a novel + 3D focal modulation UNet + + +
+ Accurate identification and quantification of unruptured intracranial +aneurysms (UIAs) is crucial for the risk assessment and treatment of this +cerebrovascular disorder. Current 2D manual assessment on 3D magnetic resonance +angiography (MRA) is suboptimal and time-consuming. In addition, one major +issue in medical image segmentation is the need for large well-annotated data, +which can be expensive to obtain. Techniques that mitigate this requirement, +such as weakly supervised learning with coarse labels are highly desirable. In +the paper, we propose FocalSegNet, a novel 3D focal modulation UNet, to detect +an aneurysm and offer an initial, coarse segmentation of it from time-of-flight +MRA image patches, which is further refined with a dense conditional random +field (CRF) post-processing layer to produce a final segmentation map. We +trained and evaluated our model on a public dataset, and in terms of UIA +detection, our model showed a low false-positive rate of 0.21 and a high +sensitivity of 0.80. For voxel-wise aneurysm segmentation, we achieved a Dice +score of 0.68 and a 95% Hausdorff distance of ~0.95 mm, demonstrating its +strong performance. We evaluated our algorithms against the state-of-the-art 3D +Residual-UNet and Swin-UNETR, and illustrated the superior performance of our +proposed FocalSegNet, highlighting the advantages of employing focal modulation +for this task. + +
+
+
+
+
+ + ♻ ☆ View-Consistent 3D Editing with Gaussian Splatting + + +
+ The advent of 3D Gaussian Splatting (3DGS) has revolutionized 3D editing, +offering efficient, high-fidelity rendering and enabling precise local +manipulations. Currently, diffusion-based 2D editing models are harnessed to +modify multi-view rendered images, which then guide the editing of 3DGS models. +However, this approach faces a critical issue of multi-view inconsistency, +where the guidance images exhibit significant discrepancies across views, +leading to mode collapse and visual artifacts of 3DGS. To this end, we +introduce View-consistent Editing (VcEdit), a novel framework that seamlessly +incorporates 3DGS into image editing processes, ensuring multi-view consistency +in edited guidance images and effectively mitigating mode collapse issues. +VcEdit employs two innovative consistency modules: the Cross-attention +Consistency Module and the Editing Consistency Module, both designed to reduce +inconsistencies in edited images. By incorporating these consistency modules +into an iterative pattern, VcEdit proficiently resolves the issue of multi-view +inconsistency, facilitating high-quality 3DGS editing across a diverse range of +scenes. + +
+
+
+
+
+ + ♻ ☆ DiffMOT: A Real-time Diffusion-based Multiple Object Tracker with + Non-linear Prediction CVPR2024 + + +
+ In Multiple Object Tracking, objects often exhibit non-linear motion of
+acceleration and deceleration, with irregular direction changes.
+Tracking-by-detection (TBD) trackers with Kalman Filter motion prediction work
+well in pedestrian-dominant scenarios but fall short in complex situations when
+multiple objects perform non-linear and diverse motion simultaneously. To
+tackle the complex non-linear motion, we propose a real-time diffusion-based
+MOT approach named DiffMOT. Specifically, for the motion predictor component,
+we propose a novel Decoupled Diffusion-based Motion Predictor (D^2MP). It
+models the entire distribution of the various motions presented by the data as
+a whole. It also predicts an individual object's motion conditioned on that
+object's historical motion information. Furthermore, it optimizes the diffusion
+process with much fewer sampling steps. As a MOT tracker, DiffMOT runs in real
+time at 22.7 FPS and also outperforms the state-of-the-art on the DanceTrack
+and SportsMOT datasets with 62.3% and 76.2% HOTA, respectively. To the best of
+our knowledge, DiffMOT is the first to introduce a diffusion probabilistic
+model into MOT to tackle non-linear motion prediction.
+
+
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ OSCaR: Object State Captioning and State Change Representation NAACL 2024 + + +
+ The capability of intelligent models to extrapolate and comprehend changes in +object states is a crucial yet demanding aspect of AI research, particularly +through the lens of human interaction in real-world settings. This task +involves describing complex visual environments, identifying active objects, +and interpreting their changes as conveyed through language. Traditional +methods, which isolate object captioning and state change detection, offer a +limited view of dynamic environments. Moreover, relying on a small set of +symbolic words to represent changes has restricted the expressiveness of the +language. To address these challenges, in this paper, we introduce the Object +State Captioning and State Change Representation (OSCaR) dataset and benchmark. +OSCaR consists of 14,084 annotated video segments with nearly 1,000 unique +objects from various egocentric video collections. It sets a new testbed for +evaluating multimodal large language models (MLLMs). Our experiments +demonstrate that while MLLMs show some skill, they lack a full understanding of +object state changes. The benchmark includes a fine-tuned model that, despite +initial capabilities, requires significant improvements in accuracy and +generalization ability for effective understanding of these changes. Our code +and dataset are available at https://github.com/nguyennm1024/OSCaR. + +
+
+ comment: NAACL 2024 +
+
+
+
+
+ + ♻ ☆ On the Privacy Effect of Data Enhancement via the Lens of Memorization + + +
+ Machine learning poses severe privacy concerns as it has been shown that the +learned models can reveal sensitive information about their training data. Many +works have investigated the effect of widely adopted data augmentation and +adversarial training techniques, termed data enhancement in the paper, on the +privacy leakage of machine learning models. Such privacy effects are often +measured by membership inference attacks (MIAs), which aim to identify whether +a particular example belongs to the training set or not. We propose to +investigate privacy from a new perspective called memorization. Through the +lens of memorization, we find that previously deployed MIAs produce misleading +results as they are less likely to identify samples with higher privacy risks +as members compared to samples with low privacy risks. To solve this problem, +we deploy a recent attack that can capture individual samples' memorization +degrees for evaluation. Through extensive experiments, we unveil several +findings about the connections between three essential properties of machine +learning models, including privacy, generalization gap, and adversarial +robustness. We demonstrate that the generalization gap and privacy leakage are +less correlated than those of the previous results. Moreover, there is not +necessarily a trade-off between adversarial robustness and privacy as stronger +adversarial robustness does not make the model more susceptible to privacy +attacks. + +
+
+ comment: Accepted by IEEE TIFS, 17 pages +
+
+
+
+
+ + ♻ ☆ Multimodal Fusion Method with Spatiotemporal Sequences and Relationship + Learning for Valence-Arousal Estimation + + +
+ This paper presents our approach for the VA (Valence-Arousal) estimation task +in the ABAW6 competition. We devised a comprehensive model by preprocessing +video frames and audio segments to extract visual and audio features. Through +the utilization of Temporal Convolutional Network (TCN) modules, we effectively +captured the temporal and spatial correlations between these features. +Subsequently, we employed a Transformer encoder structure to learn long-range +dependencies, thereby enhancing the model's performance and generalization +ability. Our method leverages a multimodal data fusion approach, integrating +pre-trained audio and video backbones for feature extraction, followed by +TCN-based spatiotemporal encoding and Transformer-based temporal information +capture. Experimental results demonstrate the effectiveness of our approach, +achieving competitive performance in VA estimation on the AffWild2 dataset. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Surfer: Progressive Reasoning with World Models for Robotic Manipulation + + +
+ Making a model accurately understand and follow natural +language instructions while performing actions consistent with world knowledge is a +key challenge in robot manipulation. This mainly involves reasoning over fuzzy human +instructions and adhering to physical knowledge. Therefore, the +embodied intelligence agent must have the ability to model world knowledge from +training data. However, most existing vision and language robot manipulation +methods mainly operate in less realistic simulator and language settings and +lack explicit modeling of world knowledge. To bridge this gap, we introduce a +novel and simple robot manipulation framework, called Surfer. It is based on +the world model, treats robot manipulation as a state transfer of the visual +scene, and decouples it into two parts: action and scene. Then, the +generalization ability of the model on new instructions and new scenes is +enhanced by explicit modeling of the action and scene prediction in multi-modal +information. In addition to the framework, we also built a robot manipulation +simulator that supports full physics execution based on the MuJoCo physics +engine. It can automatically generate demonstration training data and test +data, effectively reducing labor costs. To conduct a comprehensive and +systematic evaluation of the robot manipulation model in terms of language +understanding and physical execution, we also created a robotic manipulation +benchmark with progressive reasoning tasks, called SeaWave. It contains 4 +levels of progressive reasoning tasks and can provide a standardized testing +platform for embodied AI agents in multi-modal environments. On average, Surfer +achieved a success rate of 54.74% on the defined four levels of manipulation +tasks, exceeding the best baseline performance of 47.64%. +
+
+
+
+
+ + ♻ ☆ Learning Spatiotemporal Inconsistency via Thumbnail Layout for Face + Deepfake Detection + + +
+ The deepfake threats to society and cybersecurity have provoked significant +public apprehension, driving intensified efforts within the realm of deepfake +video detection. Current video-level methods are mostly based on 3D CNNs, +which achieve good performance but result in high computational +demands. This paper introduces an elegantly simple yet effective strategy +named Thumbnail Layout (TALL), which transforms a video clip into a pre-defined +layout to realize the preservation of spatial and temporal dependencies. This +transformation process involves sequentially masking frames at the same +positions within each frame. These frames are then resized into sub-frames and +reorganized into the predetermined layout, forming thumbnails. TALL is +model-agnostic and has remarkable simplicity, necessitating only minimal code +modifications. Furthermore, we introduce a graph reasoning block (GRB) and +semantic consistency (SC) loss to strengthen TALL, culminating in TALL++. GRB +enhances interactions between different semantic regions to capture +semantic-level inconsistency clues. The semantic consistency loss imposes +consistency constraints on semantic features to improve model generalization +ability. Extensive experiments on intra-dataset, cross-dataset, +diffusion-generated image detection, and deepfake generation method recognition +show that TALL++ achieves results surpassing or comparable to the +state-of-the-art methods, demonstrating the effectiveness of our approaches for +various deepfake detection problems. The code is available at +https://github.com/rainy-xu/TALL4Deepfake. +
+
+ comment: Accepted by IJCV +
+
+
+
+
+ + ♻ ☆ Vulnerability analysis of captcha using Deep learning + + +
+ Several websites improve their security and avoid dangerous Internet attacks +by implementing CAPTCHAs (Completely Automated Public Turing test to tell +Computers and Humans Apart), a type of verification to identify whether the +end-user is human or a robot. The most prevalent type of CAPTCHA is text-based, +designed to be easily recognized by humans while being difficult for +machines or robots to solve. However, as deep learning technology progresses, it has +become easier to develop convolutional neural network (CNN) models that solve +text-based CAPTCHAs. The purpose of this research is to +investigate the flaws and vulnerabilities in CAPTCHA-generating systems in +order to design more resilient CAPTCHAs. To achieve this, we created CapNet, a +Convolutional Neural Network. The proposed platform can evaluate both numerical +and alphanumerical CAPTCHAs. +
+
+
+
+
+ + ♻ ☆ Analyzing and Improving the Training Dynamics of Diffusion Models + + +
+ Diffusion models currently dominate the field of data-driven image synthesis +with their unparalleled scaling to large datasets. In this paper, we identify +and rectify several causes for uneven and ineffective training in the popular +ADM diffusion model architecture, without altering its high-level structure. +Observing uncontrolled magnitude changes and imbalances in both the network +activations and weights over the course of training, we redesign the network +layers to preserve activation, weight, and update magnitudes on expectation. We +find that systematic application of this philosophy eliminates the observed +drifts and imbalances, resulting in considerably better networks at equal +computational complexity. Our modifications improve the previous record FID of +2.41 in ImageNet-512 synthesis to 1.81, achieved using fast deterministic +sampling. + As an independent contribution, we present a method for setting the +exponential moving average (EMA) parameters post-hoc, i.e., after completing +the training run. This allows precise tuning of EMA length without the cost of +performing several training runs, and reveals its surprising interactions with +network architecture, training time, and guidance. + +
+
+
+
+
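+ The magnitude-preservation idea above can be illustrated with a minimal sketch (a simplification under stated assumptions, not the paper's actual layer design): normalizing each output unit's weight vector to unit L2 norm keeps the output of a unit-variance, uncorrelated input at roughly unit variance, so activation magnitudes no longer drift with the weight scale.
+
+ ```python
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class MPLinear(nn.Module):
+     """Toy magnitude-preserving linear layer: weights are re-normalized on the
+     fly so each output unit's weight vector has unit L2 norm."""
+     def __init__(self, in_features, out_features):
+         super().__init__()
+         self.weight = nn.Parameter(torch.randn(out_features, in_features))
+
+     def forward(self, x):
+         w = self.weight / self.weight.norm(dim=1, keepdim=True).clamp_min(1e-8)
+         return F.linear(x, w)
+
+ x = torch.randn(4096, 256)              # unit-variance input
+ y = MPLinear(256, 128)(x)
+ print(x.std().item(), y.std().item())   # output std stays close to 1
+ ```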
+ + ♻ ☆ Style Injection in Diffusion: A Training-free Approach for Adapting + Large-scale Diffusion Models for Style Transfer CVPR 2024 + + +
+ Despite the impressive generative capabilities of diffusion models, existing +diffusion model-based style transfer methods require inference-stage +optimization (e.g. fine-tuning or textual inversion of style) which is +time-consuming, or fail to leverage the generative ability of large-scale +diffusion models. To address these issues, we introduce a novel artistic style +transfer method based on a pre-trained large-scale diffusion model without any +optimization. Specifically, we manipulate the features of self-attention layers +in the way the cross-attention mechanism works: during the generation process, we +substitute the key and value of the content with those of the style image. This +approach provides several desirable characteristics for style transfer +including 1) preservation of content by transferring similar styles into +similar image patches and 2) transfer of style based on similarity of local +texture (e.g. edge) between content and style images. Furthermore, we introduce +query preservation and attention temperature scaling to mitigate the issue of +disruption of original content, and initial latent Adaptive Instance +Normalization (AdaIN) to deal with the disharmonious color (failure to transfer +the colors of style). Our experimental results demonstrate that our proposed +method surpasses state-of-the-art methods in both conventional and +diffusion-based style transfer baselines. +
+
+ comment: Accepted to CVPR 2024. Project page: + https://jiwoogit.github.io/StyleID_site +
+
+
+
+
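+ The key/value substitution described above can be sketched with plain attention on placeholder tensors; the output blending and the temperature value below are rough stand-ins for the paper's query preservation and attention temperature scaling, not its exact formulation:
+
+ ```python
+ import torch
+
+ def style_injected_attention(q_content, k_style, v_style, k_content, v_content,
+                              gamma=0.75, temperature=1.5):
+     """Queries come from the content pass, keys/values are swapped for those of
+     the style pass; `gamma` blends the stylized output with the original
+     content path. All tensors are (batch, tokens, dim); names are illustrative."""
+     d = q_content.shape[-1]
+     attn_style = torch.softmax(
+         q_content @ k_style.transpose(-2, -1) / (temperature * d ** 0.5), dim=-1)
+     stylized = attn_style @ v_style
+     attn_content = torch.softmax(
+         q_content @ k_content.transpose(-2, -1) / d ** 0.5, dim=-1)
+     original = attn_content @ v_content
+     return gamma * stylized + (1.0 - gamma) * original
+
+ q = k_c = v_c = torch.randn(1, 64, 320)     # features from the content pass
+ k_s = v_s = torch.randn(1, 64, 320)         # features from the style pass
+ print(style_injected_attention(q, k_s, v_s, k_c, v_c).shape)
+ ```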
+ + ♻ ☆ Joint Person Identity, Gender and Age Estimation from Hand Images using + Deep Multi-Task Representation Learning + + +
+ In this paper, we propose a multi-task representation learning framework to +jointly estimate the identity, gender and age of individuals from their hand +images for the purpose of criminal investigations, since hand images are +often the only available information in cases of serious crime such as sexual +abuse. We investigate different up-to-date deep learning architectures and +compare their performance for joint estimation of identity, gender and age from +hand images of perpetrators of serious crime. To simplify the age prediction, +we create age groups for the age estimation. We make extensive evaluations and +comparisons of both convolution-based and transformer-based deep learning +architectures on a publicly available 11k hands dataset. Our experimental +analysis shows that it is possible to efficiently estimate not only identity +but also other attributes such as gender and age of suspects jointly from hand +images for criminal investigations, which is crucial in assisting international +police forces in court to identify and convict abusers. +
+
+ comment: arXiv admin note: text overlap with arXiv:2209.04821 +
+
+
+
+
+ + ♻ ☆ iComMa: Inverting 3D Gaussian Splatting for Camera Pose Estimation via + Comparing and Matching + + +
+ We present a method named iComMa to address the 6D camera pose estimation +problem in computer vision. Conventional pose estimation methods typically rely +on the target's CAD model or necessitate specific network training tailored to +particular object classes. Some existing methods have achieved promising +results in mesh-free object and scene pose estimation by inverting the Neural +Radiance Fields (NeRF). However, they still struggle with adverse +initializations such as large rotations and translations. To address this +issue, we propose an efficient method for accurate camera pose estimation by +inverting 3D Gaussian Splatting (3DGS). Specifically, a gradient-based +differentiable framework optimizes camera pose by minimizing the residual +between the query image and the rendered image, requiring no training. An +end-to-end matching module is designed to enhance the model's robustness +against adverse initializations, while minimizing pixel-level comparing loss +aids in precise pose estimation. Experimental results on synthetic and complex +real-world data demonstrate the effectiveness of the proposed approach in +challenging conditions and the accuracy of camera pose estimation. + +
+
+
+
+
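+ The render-and-compare pose refinement described above can be sketched generically as follows; `render_fn` is a hypothetical stand-in for a differentiable renderer (e.g. a 3DGS rasterizer), the naive 6-vector pose parameterization is an assumption of this sketch, and the toy example only demonstrates that the loop runs:
+
+ ```python
+ import torch
+
+ def optimize_camera_pose(render_fn, query_image, init_pose, steps=200, lr=1e-2):
+     """Gradient-based pose refinement: minimize the pixel residual between the
+     rendered and query images with respect to the pose parameters."""
+     pose = init_pose.clone().requires_grad_(True)
+     opt = torch.optim.Adam([pose], lr=lr)
+     for _ in range(steps):
+         opt.zero_grad()
+         rendered = render_fn(pose)                        # differentiable render
+         loss = torch.nn.functional.l1_loss(rendered, query_image)
+         loss.backward()
+         opt.step()
+     return pose.detach(), loss.item()
+
+ # Toy stand-in renderer whose output brightness depends on the pose vector.
+ toy_render = lambda p: torch.ones(32, 32, 3) * p.sum() / 6
+ target_pose = torch.tensor([0.1, -0.2, 0.05, 0.3, 0.0, 1.0])
+ pose, err = optimize_camera_pose(toy_render, toy_render(target_pose), torch.zeros(6))
+ print(err)
+ ```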
+ + ♻ ☆ IVAC-P2L: Leveraging Irregular Repetition Priors for Improving Video + Action Counting + + +
+ Video Action Counting (VAC) is crucial in analyzing sports, fitness, and +everyday activities by quantifying repetitive actions in videos. However, +traditional VAC methods have overlooked the complexity of action repetitions, +such as interruptions and the variability in cycle duration. Our research +addresses the shortfall by introducing a novel approach to VAC, called +Irregular Video Action Counting (IVAC). IVAC prioritizes modeling irregular +repetition patterns in videos, which we define through two primary aspects: +Inter-cycle Consistency and Cycle-interval Inconsistency. Inter-cycle +Consistency ensures homogeneity in the spatial-temporal representations of +cycle segments, signifying action uniformity within cycles. Cycle-interval +inconsistency highlights the importance of distinguishing between cycle +segments and intervals based on their inherent content differences. To +encapsulate these principles, we propose a new methodology that includes +consistency and inconsistency modules, supported by a unique pull-push loss +(P2L) mechanism. The IVAC-P2L model applies a pull loss to promote coherence +among cycle segment features and a push loss to clearly distinguish features of +cycle segments from interval segments. Empirical evaluations conducted on the +RepCount dataset demonstrate that the IVAC-P2L model sets a new benchmark in +VAC task performance. Furthermore, the model demonstrates exceptional +adaptability and generalization across various video contents, outperforming +existing models on two additional datasets, UCFRep and Countix, without the +need for dataset-specific optimization. These results confirm the efficacy of +our approach in addressing irregular repetitions in videos and pave the way for +further advancements in video analysis and understanding. + +
+
+ comment: Source code: https://github.com/hwang-cs-ime/IVAC-P2L +
+
+
+
+
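+ A minimal sketch of a pull-push objective in the spirit of the description above (not the exact P2L formulation): cycle-segment features are pulled toward their mean, while interval features are pushed beyond a margin from that mean.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def pull_push_loss(cycle_feats, interval_feats, margin=0.5):
+     """cycle_feats: (Nc, D) features of cycle segments; interval_feats: (Ni, D)
+     features of interval segments. Illustrative sketch only."""
+     cycle_feats = F.normalize(cycle_feats, dim=1)
+     interval_feats = F.normalize(interval_feats, dim=1)
+     center = cycle_feats.mean(dim=0, keepdim=True)
+     pull = (cycle_feats - center).pow(2).sum(dim=1).mean()       # coherence
+     dist = (interval_feats - center).pow(2).sum(dim=1)
+     push = F.relu(margin - dist).mean()                          # separation
+     return pull + push
+
+ print(pull_push_loss(torch.randn(8, 128), torch.randn(5, 128)))
+ ```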
+ + ♻ ☆ Enhanced Face Authentication With Separate Loss Functions + + +
+ The overall objective of this project is to propose and develop a facial +authentication system for unlocking phones or in-phone applications using +facial recognition. The system will include four separate architectures: face +detection, face recognition, face spoofing, and classification of closed eyes. +Among these, we consider face recognition to be the most important: +determining the true identity of the person standing in front of the screen +with absolute accuracy is what facial recognition systems need to achieve. +Alongside face recognition, the problem of detecting fake faces is also +gradually becoming popular and equally important. Our +goal is to propose and develop two loss functions, LMCot and Double Loss, and then +apply them to the face authentication process. +
+
+ comment: in Vietnamese language +
+
+
+
+
+ + ♻ ☆ Impact of Synthetic Images on Morphing Attack Detection Using a Siamese + Network + + +
+ This paper evaluated the impact of synthetic images on Morphing Attack +Detection (MAD) using a Siamese network with a semi-hard-loss function. Intra- +and cross-dataset evaluations were performed to measure the generalisation +capabilities of synthetic images. Three +different pre-trained networks were used as feature extractors: MobileNetV2, +MobileNetV3 and EfficientNetB0. Our results show that MAD trained +on EfficientNetB0 from FERET, FRGCv2, and FRLL can reach a lower error rate in +comparison with SOTA. Conversely, worse performances were reached when the +system was trained only with synthetic images. A mixed (synthetic + +digital) training database may help to improve MAD and reduce the error rate. This +shows that further effort is still needed to include synthetic images in the +training process. +
+
+ comment: Arxiv version of CIARP2023 - fixed typo errors +
+
+
+
+
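+ For reference, a compact batch version of a semi-hard triplet loss is sketched below; it illustrates the general recipe (prefer negatives that are farther than the positive but still within the margin) rather than the paper's training setup:
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def semi_hard_triplet_loss(embeddings, labels, margin=0.2):
+     """embeddings: (N, D), labels: (N,). For each anchor-positive pair, pick a
+     semi-hard negative if one exists, otherwise the hardest negative."""
+     emb = F.normalize(embeddings, dim=1)
+     dist = torch.cdist(emb, emb)
+     losses = []
+     for a in range(len(labels)):
+         pos_mask = (labels == labels[a]) & (torch.arange(len(labels)) != a)
+         neg_mask = labels != labels[a]
+         if not pos_mask.any() or not neg_mask.any():
+             continue
+         for p in torch.where(pos_mask)[0]:
+             d_ap = dist[a, p]
+             neg_d = dist[a][neg_mask]
+             semi = neg_d[(neg_d > d_ap) & (neg_d < d_ap + margin)]
+             d_an = semi.min() if len(semi) > 0 else neg_d.min()
+             losses.append(F.relu(d_ap - d_an + margin))
+     return torch.stack(losses).mean() if losses else emb.sum() * 0.0
+
+ print(semi_hard_triplet_loss(torch.randn(16, 64), torch.randint(0, 4, (16,))))
+ ```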
+ + ♻ ☆ Immunohistochemistry guided segmentation of benign epithelial cells, in + situ lesions, and invasive epithelial cells in breast cancer slides + + +
+ Digital pathology enables automatic analysis of histopathological sections +using artificial intelligence (AI). Automatic evaluation could improve +diagnostic efficiency and help find associations between morphological features +and clinical outcome. For development of such prediction models, identifying +invasive epithelial cells, and separating these from benign epithelial cells +and in situ lesions would be the first step. In this study, we aimed to develop +an AI model for segmentation of epithelial cells in sections from breast +cancer. We generated epithelial ground truth masks by restaining hematoxylin +and eosin (HE) sections with cytokeratin (CK) AE1/AE3, and by pathologists' +annotations. HE/CK image pairs were used to train a convolutional neural +network, and data augmentation was used to make the model more robust. Tissue +microarrays (TMAs) from 839 patients, and whole slide images from two patients +were used for training and evaluation of the models. The sections were derived +from four cohorts of breast cancer patients. TMAs from 21 patients from a fifth +cohort were used as a second test set. In quantitative evaluation, mean Dice +scores of 0.70, 0.79, and 0.75 for invasive epithelial cells, benign epithelial +cells, and in situ lesions, respectively, were achieved. In qualitative scoring +(0-5) by pathologists, results were best for all epithelium and invasive +epithelium, with scores of 4.7 and 4.4. Scores for benign epithelium and in +situ lesions were 3.7 and 2.0. The proposed model segmented epithelial cells in +HE stained breast cancer slides well, but further work is needed for accurate +division between the classes. Immunohistochemistry, together with pathologists' +annotations, enabled the creation of accurate ground truths. The model is made +freely available in FastPathology and the code is available at +https://github.com/AICAN-Research/breast-epithelium-segmentation +
+
+ comment: 19 pages, 6 figures. Submitted to a scientific journal +
+
+
+
+
+ + ♻ ☆ MoST: Motion Style Transformer between Diverse Action Contents CVPR 2024 + + +
+ While existing motion style transfer methods are effective between two +motions with identical content, their performance significantly diminishes when +transferring style between motions with different contents. This challenge lies +in the lack of clear separation between content and style of a motion. To +tackle this challenge, we propose a novel motion style transformer that +effectively disentangles style from content and generates a plausible motion +with transferred style from a source motion. Our distinctive approach to +achieving the goal of disentanglement is twofold: (1) a new architecture for +motion style transformer with `part-attentive style modulator across body +parts' and `Siamese encoders that encode style and content features +separately'; (2) style disentanglement loss. Our method outperforms existing +methods and demonstrates exceptionally high quality, particularly in motion +pairs with different contents, without the need for heuristic post-processing. +Codes are available at https://github.com/Boeun-Kim/MoST. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ AttriCLIP: A Non-Incremental Learner for Incremental Knowledge Learning + + +
+ Continual learning aims to enable a model to incrementally learn knowledge +from sequentially arrived data. Previous works adopt the conventional +classification architecture, which consists of a feature extractor and a +classifier. The feature extractor is shared across sequentially arrived tasks +or classes, but one specific group of weights of the classifier corresponding +to one new class should be incrementally expanded. Consequently, the parameters +of a continual learner gradually increase. Moreover, as the classifier contains +all historically arrived classes, a certain size of the memory is usually +required to store rehearsal data to mitigate classifier bias and catastrophic +forgetting. In this paper, we propose a non-incremental learner, named +AttriCLIP, to incrementally extract knowledge of new classes or tasks. +Specifically, AttriCLIP is built upon the pre-trained visual-language model +CLIP. Its image encoder and text encoder are fixed to extract features from +both images and text. Text consists of a category name and a fixed number of +learnable parameters which are selected from our designed attribute word bank +and serve as attributes. As we compute the visual and textual similarity for +classification, AttriCLIP is a non-incremental learner. The attribute prompts, +which encode the common knowledge useful for classification, can effectively +mitigate the catastrophic forgetting and avoid constructing a replay memory. We +evaluate our AttriCLIP and compare it with CLIP-based and previous +state-of-the-art continual learning methods in realistic settings with +domain-shift and long-sequence learning. The results show that our method +performs favorably against previous state-of-the-arts. The implementation code +is available at https://github.com/bhrqw/AttriCLIP. +
+
+
+
+
+ + ♻ ☆ Camera Height Doesn't Change: Unsupervised Training for Metric Monocular + Road-Scene Depth Estimation + + +
+ In this paper, we introduce a novel training method for making any monocular +depth network learn absolute scale and estimate metric road-scene depth just +from regular training data, i.e., driving videos. We refer to this training +framework as StableCamH. The key idea is to leverage cars found on the road as +sources of scale supervision but to incorporate them in the training robustly. +StableCamH detects and estimates the sizes of cars in the frame and aggregates +scale information extracted from them into a camera height estimate whose +consistency across the entire video sequence is enforced as scale supervision. +This realizes robust unsupervised training of any, otherwise scale-oblivious, +monocular depth network to become not only scale-aware but also metric-accurate +without the need for auxiliary sensors and extra supervision. Extensive +experiments on the KITTI and Cityscapes datasets show the effectiveness of +StableCamH and its state-of-the-art accuracy compared with related methods. We +also show that StableCamH enables training on mixed datasets of different +camera heights, which leads to larger-scale training and thus higher +generalization. Metric depth reconstruction is essential in any road-scene +visual modeling, and StableCamH democratizes its deployment by establishing the +means to train any model as a metric depth estimator. + +
+
+
+
+
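+ The camera-height-as-scale-supervision idea can be illustrated with a toy sketch; the per-frame height values, the median aggregation, and the final rescaling step are assumptions made for illustration, not the paper's actual estimator:
+
+ ```python
+ import numpy as np
+
+ def aggregate_camera_height(per_frame_heights):
+     """Aggregate noisy per-frame camera-height estimates (e.g. derived from
+     detected cars of roughly known real-world size) into one sequence-level
+     value; a robust median stands in for the paper's aggregation scheme."""
+     return float(np.median(np.asarray(per_frame_heights)))
+
+ def metric_scale(unscaled_cam_height, metric_cam_height):
+     """Factor converting an up-to-scale depth map into metres, given the camera
+     height recovered in the network's arbitrary scale and the aggregated one."""
+     return metric_cam_height / unscaled_cam_height
+
+ # Hypothetical per-frame camera-height estimates, in metres (one outlier frame).
+ frames = [1.68, 1.72, 1.65, 2.10, 1.70]
+ h = aggregate_camera_height(frames)
+ print(h, metric_scale(unscaled_cam_height=0.43, metric_cam_height=h))
+ ```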
+ + ♻ ☆ End-to-end Learned Visual Odometry with Events and Frames + + +
+ Visual Odometry (VO) is crucial for autonomous robotic navigation, especially +in GPS-denied environments like planetary terrains. To improve robustness, +recent model-based VO systems have begun combining standard and event-based +cameras. Event cameras excel in low-light and high-speed motion, while standard +cameras provide dense and easier-to-track features, even in low-textured areas. +However, the field of image- and event-based VO still predominantly relies on +model-based methods and is yet to fully integrate recent image-only +advancements leveraging end-to-end learning-based architectures. Seamlessly +integrating the two modalities remains challenging due to their different +nature, one asynchronous, the other not, limiting the potential for a more +effective image- and event-based VO. We introduce RAMP-VO, the first end-to-end +learned image- and event-based VO system. It leverages novel Recurrent, +Asynchronous, and Massively Parallel (RAMP) encoders capable of fusing +asynchronous events with image data, providing 8x faster inference and 33% more +accurate predictions than existing solutions. Despite being trained only in +simulation, RAMP-VO outperforms image- and event-based methods by 46% and 60%, +respectively, on traditional, real-world benchmarks as well as newly introduced +Apollo and Malapert landing sequences, paving the way for robust and +asynchronous VO in space. + +
+
+ comment: 8 pages, 5 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ QUASAR: QUality and Aesthetics Scoring with Advanced Representations + + +
+ This paper introduces a new data-driven, non-parametric method for image +quality and aesthetics assessment, surpassing existing approaches and requiring +no prompt engineering or fine-tuning. We eliminate the need for expressive +textual embeddings by proposing efficient image anchors in the data. Through +extensive evaluations of 7 state-of-the-art self-supervised models, our method +demonstrates superior performance and robustness across various datasets and +benchmarks. Notably, it achieves high agreement with human assessments even +with limited data and shows high robustness to the nature of data and their +pre-processing pipeline. Our contributions offer a streamlined solution for +assessment of images while providing insights into the perception of visual +information. + +
+
+
+
+
+ + ♻ ☆ A Hybrid Transformer-Sequencer approach for Age and Gender + classification from in-wild facial images + + +
+ The advancements in computer vision and image processing techniques have led +to the emergence of new applications in domains such as visual surveillance, targeted +advertisement, content-based searching, and human-computer interaction. Out +of the various techniques in computer vision, face analysis, in particular, has +gained much attention. Several previous studies have tried to explore different +applications of facial feature processing for a variety of tasks, including age +and gender classification. However, despite several previous studies having +explored the problem, the age and gender classification of in-wild human faces +is still far from achieving the desired levels of accuracy required for +real-world applications. This paper, therefore, attempts to bridge this gap by +proposing a hybrid model that combines self-attention and BiLSTM approaches for +age and gender classification problems. The proposed model's performance is +compared with several state-of-the-art models proposed so far. Improvements of +approximately 10 percent and 6 percent over the state-of-the-art implementations +for age and gender classification, respectively, are noted for the proposed +model. The proposed model thus achieves superior performance and +provides more generalized learning. The model can, therefore, be +applied as a core classification component in various image processing and +computer vision problems. +
+
+ comment: 22 pages +
+
+
+
+
+ + ♻ ☆ DrivingGaussian: Composite Gaussian Splatting for Surrounding Dynamic + Autonomous Driving Scenes + + +
+ We present DrivingGaussian, an efficient and effective framework for +surrounding dynamic autonomous driving scenes. For complex scenes with moving +objects, we first sequentially and progressively model the static background of +the entire scene with incremental static 3D Gaussians. We then leverage a +composite dynamic Gaussian graph to handle multiple moving objects, +individually reconstructing each object and restoring their accurate positions +and occlusion relationships within the scene. We further use a LiDAR prior for +Gaussian Splatting to reconstruct scenes with greater details and maintain +panoramic consistency. DrivingGaussian outperforms existing methods in dynamic +driving scene reconstruction and enables photorealistic surround-view synthesis +with high-fidelity and multi-camera consistency. Our project page is at: +https://github.com/VDIGPKU/DrivingGaussian. + +
+
+
+
+
+ + ♻ ☆ AdjointDPM: Adjoint Sensitivity Method for Gradient Backpropagation of + Diffusion Probabilistic Models + + +
+ Existing customization methods require access to multiple reference examples +to align pre-trained diffusion probabilistic models (DPMs) with user-provided +concepts. This paper aims to address the challenge of DPM customization when +the only available supervision is a differentiable metric defined on the +generated contents. Since the sampling procedure of DPMs involves recursive +calls to the denoising UNet, na\"ive gradient backpropagation requires storing +the intermediate states of all iterations, resulting in extremely high memory +consumption. To overcome this issue, we propose a novel method AdjointDPM, +which first generates new samples from diffusion models by solving the +corresponding probability-flow ODEs. It then uses the adjoint sensitivity +method to backpropagate the gradients of the loss to the models' parameters +(including conditioning signals, network weights, and initial noises) by +solving another augmented ODE. To reduce numerical errors in both the forward +generation and gradient backpropagation processes, we further reparameterize +the probability-flow ODE and augmented ODE as simple non-stiff ODEs using +exponential integration. Finally, we demonstrate the effectiveness of +AdjointDPM on three interesting tasks: converting visual effects into +identification text embeddings, finetuning DPMs for specific types of +stylization, and optimizing initial noise to generate adversarial samples for +security auditing. + +
+
+
+
+
+ + ♻ ☆ KP-RED: Exploiting Semantic Keypoints for Joint 3D Shape Retrieval and + Deformation CVPR 2024 + + +
+ In this paper, we present KP-RED, a unified KeyPoint-driven REtrieval and +Deformation framework that takes object scans as input and jointly retrieves +and deforms the most geometrically similar CAD models from a pre-processed +database to tightly match the target. Unlike existing dense matching based +methods that typically struggle with noisy partial scans, we propose to +leverage category-consistent sparse keypoints to naturally handle both full and +partial object scans. Specifically, we first employ a lightweight retrieval +module to establish a keypoint-based embedding space, measuring the similarity +among objects by dynamically aggregating deformation-aware local-global +features around extracted keypoints. Objects that are close in the embedding +space are considered similar in geometry. Then we introduce the neural +cage-based deformation module that estimates the influence vector of each +keypoint upon cage vertices inside its local support region to control the +deformation of the retrieved shape. Extensive experiments on the synthetic +dataset PartNet and the real-world dataset Scan2CAD demonstrate that KP-RED +surpasses existing state-of-the-art approaches by a large margin. Codes and +trained models will be released in https://github.com/lolrudy/KP-RED. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Learning to Produce Semi-dense Correspondences for Visual Localization CVPR 2024 + + +
+ This study addresses the challenge of performing visual localization in +demanding conditions such as night-time scenarios, adverse weather, and +seasonal changes. While many prior studies have focused on improving +image-matching performance to facilitate reliable dense keypoint matching +between images, existing methods often heavily rely on predefined feature +points on a reconstructed 3D model. Consequently, they tend to overlook +unobserved keypoints during the matching process. Therefore, dense keypoint +matches are not fully exploited, leading to a notable reduction in accuracy, +particularly in noisy scenes. To tackle this issue, we propose a novel +localization method that extracts reliable semi-dense 2D-3D matching points +based on dense keypoint matches. This approach involves regressing semi-dense +2D keypoints into 3D scene coordinates using a point inference network. The +network utilizes both geometric and visual cues to effectively infer 3D +coordinates for unobserved keypoints from the observed ones. The abundance of +matching information significantly enhances the accuracy of camera pose +estimation, even in scenarios involving noisy or sparse 3D models. +Comprehensive evaluations demonstrate that the proposed method outperforms +other methods in challenging scenes and achieves competitive results in +large-scale visual localization benchmarks. The code will be available. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Genixer: Empowering Multimodal Large Language Models as a Powerful Data + Generator + + +
+ Instruction tuning data is essential for training the Multimodal Large +Language Models (MLLMs). However, the creation of high-quality instruction +tuning data presents significant challenges. Prior methods that depended on +GPT-4 for data generation were not only costly but also lacked satisfactory +performance in complex tasks (i.e., grounding-based reasoning tasks). To +address these issues, we developed an innovative data generation pipeline, +Genixer, to generate various high-quality instruction tuning data, including +nine representative tasks, e.g., Common VQA, REC, REG, and PointQ. +Specifically, Genixer provides a unified solution with four key steps for +alleviating the difficulty of data generation: (i) instruction data collection, +(ii) instruction template design, (iii) empowering MLLM, and (iv) data +generation and filtering. Subsequently, the superior qualitative results of our +Genixer demonstrate that current MLLMs have a strong potential to evolve into +powerful data generators. Additionally, to validate the efficacy of generated +data quantitatively, we add the instruction tuning data produced by Genixer +into the training of two representative MLLMs and observe the consistent +improvements on various VQA tasks and multimodal benchmarks. + +
+
+ comment: Technical report +
+
+
+
+
+ + ♻ ☆ Modality-missing RGBT Tracking: Invertible Prompt Learning and + High-quality Benchmarks + + +
+ Current RGBT tracking research relies on the complete multi-modal input, but +modal information might be missing due to factors such as thermal sensor +self-calibration and data transmission errors, which we call the modality-missing challenge +in this work. To address this challenge, we propose a novel invertible prompt +learning approach, which integrates the content-preserving prompts into a +well-trained tracking model to adapt to various modality-missing scenarios, for +robust RGBT tracking. Given one modality-missing scenario, we propose to +utilize the available modality to generate the prompt of the missing modality +to adapt to the RGBT tracking model. However, the cross-modality gap between +available and missing modalities usually causes semantic distortion and +information loss in prompt generation. To handle this issue, we design the +invertible prompter by incorporating the full reconstruction of the input +available modality from the generated prompt. To provide a comprehensive +evaluation platform, we construct several high-quality benchmark datasets, in +which various modality-missing scenarios are considered to simulate real-world +challenges. Extensive experiments on three modality-missing benchmark datasets +show that our method achieves significant performance improvements compared +with state-of-the-art methods. We have released the code and simulation +datasets at: +\href{https://github.com/Alexadlu/Modality-missing-RGBT-Tracking.git}{https://github.com/Alexadlu/Modality-missing-RGBT-Tracking.git}. +
+
+
+
+
+ + ♻ ☆ EAGLE: Eigen Aggregation Learning for Object-Centric Unsupervised + Semantic Segmentation + + +
+ Semantic segmentation has innately relied on extensive pixel-level annotated +data, leading to the emergence of unsupervised methodologies. Among them, +leveraging self-supervised Vision Transformers for unsupervised semantic +segmentation (USS) has been making steady progress with expressive deep +features. Yet, for semantically segmenting images with complex objects, a +predominant challenge remains: the lack of explicit object-level semantic +encoding in patch-level features. This technical limitation often leads to +inadequate segmentation of complex objects with diverse structures. To address +this gap, we present a novel approach, EAGLE, which emphasizes object-centric +representation learning for unsupervised semantic segmentation. Specifically, +we introduce EiCue, a spectral technique providing semantic and structural cues +through an eigenbasis derived from the semantic similarity matrix of deep image +features and color affinity from an image. Further, by incorporating our +object-centric contrastive loss with EiCue, we guide our model to learn +object-level representations with intra- and inter-image object-feature +consistency, thereby enhancing semantic accuracy. Extensive experiments on +COCO-Stuff, Cityscapes, and Potsdam-3 datasets demonstrate the state-of-the-art +USS results of EAGLE with accurate and consistent semantic segmentation across +complex scenes. + +
+
+
+
+
+ + ♻ ☆ Towards Effective Multiple-in-One Image Restoration: A Sequential and + Prompt Learning Strategy + + +
+ While single task image restoration (IR) has achieved significant successes, +it remains a challenging issue to train a single model which can tackle +multiple IR tasks. In this work, we investigate in-depth the multiple-in-one +(MiO) IR problem, which comprises seven popular IR tasks. We point out that MiO +IR faces two pivotal challenges: the optimization of diverse objectives and the +adaptation to multiple tasks. To tackle these challenges, we present two simple +yet effective strategies. The first strategy, referred to as sequential +learning, attempts to address how to optimize the diverse objectives, which +guides the network to incrementally learn individual IR tasks in a sequential +manner rather than mixing them together. The second strategy, i.e., prompt +learning, attempts to address how to adapt to the different IR tasks, which +assists the network to understand the specific task and improves the +generalization ability. By evaluating on 19 test sets, we demonstrate that the +sequential and prompt learning strategies can significantly enhance the MiO +performance of commonly used CNN and Transformer backbones. Our experiments +also reveal that the two strategies can supplement each other to learn better +degradation representations and enhance the model robustness. It is expected +that our proposed MiO IR formulation and strategies could facilitate the +research on how to train IR models with higher generalization capabilities. + +
+
+
+
+
+ + ♻ ☆ Posterior Distillation Sampling + + +
+ We introduce Posterior Distillation Sampling (PDS), a novel optimization +method for parametric image editing based on diffusion models. Existing +optimization-based methods, which leverage the powerful 2D prior of diffusion +models to handle various parametric images, have mainly focused on generation. +Unlike generation, editing requires a balance between conforming to the target +attribute and preserving the identity of the source content. Recent 2D image +editing methods have achieved this balance by leveraging the stochastic latent +encoded in the generative process of diffusion models. To extend the editing +capabilities of diffusion models shown in pixel space to parameter space, we +reformulate the 2D image editing method into an optimization form named PDS. +PDS matches the stochastic latents of the source and the target, enabling the +sampling of targets in diverse parameter spaces that align with a desired +attribute while maintaining the source's identity. We demonstrate that this +optimization resembles running a generative process with the target attribute, +but aligning this process with the trajectory of the source's generative +process. Extensive editing results in Neural Radiance Fields and Scalable +Vector Graphics representations demonstrate that PDS is capable of sampling +targets to fulfill the aforementioned balance across various parameter spaces. + +
+
+ comment: Project page: https://posterior-distillation-sampling.github.io/ +
+
+
+
+
+ + ♻ ☆ StyleHumanCLIP: Text-guided Garment Manipulation for StyleGAN-Human + + +
+ This paper tackles text-guided control of StyleGAN for editing garments in +full-body human images. Existing StyleGAN-based methods suffer from handling +the rich diversity of garments and body shapes and poses. We propose a +framework for text-guided full-body human image synthesis via an +attention-based latent code mapper, which enables more disentangled control of +StyleGAN than existing mappers. Our latent code mapper adopts an attention +mechanism that adaptively manipulates individual latent codes on different +StyleGAN layers under text guidance. In addition, we introduce feature-space +masking at inference time to avoid unwanted changes caused by text inputs. Our +quantitative and qualitative evaluations reveal that our method can control +generated images more faithfully to given texts than existing methods. + +
+
+ comment: VISAPP 2024, project page: + https://www.cgg.cs.tsukuba.ac.jp/~yoshikawa/pub/style_human_clip/ +
+
+
+
+
+ + ♻ ☆ MEDBind: Unifying Language and Multimodal Medical Data Embeddings + + +
+ Medical vision-language pretraining models (VLPM) have achieved remarkable +progress in fusing chest X-rays (CXR) with clinical texts, introducing +image-text data binding approaches that enable zero-shot learning and +downstream clinical tasks. However, the current landscape lacks the holistic +integration of additional medical modalities, such as electrocardiograms (ECG). +We present MEDBind (Medical Electronic patient recorD), which learns joint +embeddings across CXR, ECG, and medical text. Using text data as the central +anchor, MEDBind features tri-modality binding, delivering competitive +performance in top-K retrieval, zero-shot, and few-shot benchmarks against +established VLPM, and the ability for CXR-to-ECG zero-shot classification and +retrieval. This seamless integration is achieved through combination of +contrastive loss on modality-text pairs with our proposed contrastive loss +function, Edge-Modality Contrastive Loss, fostering a cohesive embedding space +for CXR, ECG, and text. Finally, we demonstrate that MEDBind can improve +downstream tasks by directly integrating CXR and ECG embeddings into a +large-language model for multimodal prompt tuning. + +
+
+
+
+
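+ A minimal sketch of text-anchored tri-modality binding using a symmetric InfoNCE loss; the extra CXR-ECG term loosely stands in for the proposed Edge-Modality Contrastive Loss, and the weighting is an illustrative assumption:
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def info_nce(a, b, temperature=0.07):
+     """Symmetric InfoNCE between two batches of paired embeddings (N, D)."""
+     a, b = F.normalize(a, dim=1), F.normalize(b, dim=1)
+     logits = a @ b.t() / temperature
+     targets = torch.arange(len(a))
+     return 0.5 * (F.cross_entropy(logits, targets) +
+                   F.cross_entropy(logits.t(), targets))
+
+ def tri_modality_loss(cxr, ecg, text, lam=1.0):
+     """Contrast each imaging modality against the text anchor, plus an edge
+     term directly tying CXR to ECG."""
+     return info_nce(cxr, text) + info_nce(ecg, text) + lam * info_nce(cxr, ecg)
+
+ cxr, ecg, text = (torch.randn(32, 512) for _ in range(3))
+ print(tri_modality_loss(cxr, ecg, text))
+ ```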
+ + ♻ ☆ SALAD: Part-Level Latent Diffusion for 3D Shape Generation and + Manipulation + + +
+ We present a cascaded diffusion model based on a part-level implicit 3D +representation. Our model achieves state-of-the-art generation quality and also +enables part-level shape editing and manipulation without any additional +training in conditional setup. Diffusion models have demonstrated impressive +capabilities in data generation as well as zero-shot completion and editing via +a guided reverse process. Recent research on 3D diffusion models has focused on +improving their generation capabilities with various data representations, +while the absence of structural information has limited their capability in +completion and editing tasks. We thus propose our novel diffusion model using a +part-level implicit representation. To effectively learn diffusion with +high-dimensional embedding vectors of parts, we propose a cascaded framework, +learning diffusion first on a low-dimensional subspace encoding extrinsic +parameters of parts and then on the other high-dimensional subspace encoding +intrinsic attributes. In the experiments, we demonstrate the outperformance of +our method compared with the previous ones both in generation and part-level +completion and manipulation tasks. + +
+
+ comment: Project page: https://salad3d.github.io +
+
+
+
+
+ + ♻ ☆ Prompt Highlighter: Interactive Control for Multi-Modal LLMs CVPR 2024 + + +
+ This study targets a critical aspect of multi-modal LLMs' (LLMs&VLMs) +inference: explicit controllable text generation. Multi-modal LLMs empower +multi-modality understanding with the capability of semantic generation yet +bring less explainability and heavier reliance on prompt contents due to their +autoregressive generative nature. While manipulating prompt formats could +improve outputs, designing specific and precise prompts per task can be +challenging and ineffective. To tackle this issue, we introduce a novel +inference method, Prompt Highlighter, which enables users to highlight specific +prompt spans to interactively control the focus during generation. Motivated by +the classifier-free diffusion guidance, we form regular and unconditional +context pairs based on highlighted tokens, demonstrating that the +autoregressive generation in models can be guided in a classifier-free way. +Notably, we find that, during inference, guiding the models with highlighted +tokens through the attention weights leads to more desired outputs. Our +approach is compatible with current LLMs and VLMs, achieving impressive +customized generation results without training. Experiments confirm its +effectiveness in focusing on input contexts and generating reliable content. +Without tuning on LLaVA-v1.5, our method secured 70.7 in the MMBench test and +1552.5 in MME-perception. The code is available at: +https://github.com/dvlab-research/Prompt-Highlighter/ + +
+
+ comment: CVPR 2024; Project Page: + https://julianjuaner.github.io/projects/PromptHighlighter +
+
+
+
+
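+ The classifier-free-style guidance over next-token logits can be sketched as below; the two-pass setup (full prompt versus a pass with the highlighted span suppressed) and the scale value are assumptions of this sketch, not the paper's exact procedure:
+
+ ```python
+ import torch
+
+ def highlighted_guidance(cond_logits, uncond_logits, guidance_scale=1.5):
+     """Extrapolate between logits from the full prompt and logits from a
+     highlight-suppressed prompt to steer generation toward the highlight."""
+     return uncond_logits + guidance_scale * (cond_logits - uncond_logits)
+
+ vocab = 32000
+ cond, uncond = torch.randn(vocab), torch.randn(vocab)
+ print(int(torch.argmax(highlighted_guidance(cond, uncond))))  # next-token id
+ ```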
+ + ♻ ☆ Training Small Multimodal Models to Bridge Biomedical Competency Gap: A + Case Study in Radiology Imaging + + +
+ The scaling laws and extraordinary performance of large foundation models +motivate the development and utilization of such large models in biomedicine. +However, despite early promising results on some biomedical benchmarks, there +are still major challenges that need to be addressed before these models can be +used in real-world applications. Frontier models such as GPT-4V still have +major competency gaps in multimodal capabilities for biomedical applications. +Moreover, pragmatic issues such as access, cost, latency, and compliance make +it hard for clinicians to use privately-hosted state-of-the-art large models +directly on private patient data. In this paper, we explore training +open-source small multimodal models (SMMs) to bridge biomedical competency gaps +for unmet clinical needs. To maximize data efficiency, we adopt a modular +approach by incorporating state-of-the-art pre-trained models for image and +text modalities, and focusing on training a lightweight adapter to ground each +modality to the text embedding space. We conduct a comprehensive study of this +approach on radiology imaging. For training, we assemble a large dataset with +over 1 million image-text pairs. For evaluation, we propose a clinically driven +novel approach using GPT-4 and demonstrate its parity with expert evaluation. +We also study grounding qualitatively using attention. For best practice, we +conduct a systematic ablation study on various choices in data engineering and +multimodal training. The resulting LLaVA-Rad (7B) model attains +state-of-the-art results on radiology tasks such as report generation and +cross-modal retrieval, even outperforming much larger models such as GPT-4V and +Med-PaLM M (84B). LLaVA-Rad is fast and can be run on a single V100 GPU in +private settings, offering a promising state-of-the-art tool for real-world +clinical applications. + +
+
+
+
+
+ + ♻ ☆ OmniCount: Multi-label Object Counting with Semantic-Geometric Priors + + +
+ Object counting is pivotal for understanding the composition of scenes. +Previously, this task was dominated by class-specific methods, which have +gradually evolved into more adaptable class-agnostic strategies. However, these +strategies come with their own set of limitations, such as the need for manual +exemplar input and multiple passes for multiple categories, resulting in +significant inefficiencies. This paper introduces a new, more practical +approach enabling simultaneous counting of multiple object categories using an +open vocabulary framework. Our solution, OmniCount, stands out by using +semantic and geometric insights from pre-trained models to count multiple +categories of objects as specified by users, all without additional training. +OmniCount distinguishes itself by generating precise object masks and +leveraging point prompts via the Segment Anything Model for efficient counting. +To evaluate OmniCount, we created the OmniCount-191 benchmark, a +first-of-its-kind dataset with multi-label object counts, including points, +bounding boxes, and VQA annotations. Our comprehensive evaluation in +OmniCount-191, alongside other leading benchmarks, demonstrates OmniCount's +exceptional performance, significantly outpacing existing solutions and +heralding a new era in object counting technology. + +
+
+
+
+
+ + ♻ ☆ An Image-based Typology for Visualization + + +
+ We present and discuss the results of a qualitative analysis of visual +representations from images. We labeled each image's essential stimuli, the +removal of which would render a visualization uninterpretable. As a result, we +derive a typology of 10 visualization types of defined groups. We describe the +typology derivation process in which we engaged. The resulting typology and +image analysis can serve a number of purposes: enabling researchers to study +the evolution of the community and its research output over time, facilitating +the categorization of visualization images for the purpose of research and +teaching, allowing researchers and practitioners to identify visual design +styles to further align the quantification of any visual information processor, +be that a person or an algorithm observer, and it facilitates a discussion of +standardization in visualization. In addition to the visualization typology +from images, we provide a dataset of 6,833 tagged images and an online tool +that can be used to explore and analyze the large set of labeled images. The +tool and data set enable scholars to closely examine the diverse visual designs +used and how they are published and communicated in our community. A +pre-registration, a free copy of this paper, and all supplemental materials are +available via osf.io/dxjwt. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2209.07533 +
+
+
+
+
+ + ♻ ☆ AVID: Any-Length Video Inpainting with Diffusion Model + + +
+ Recent advances in diffusion models have successfully enabled text-guided +image inpainting. While it seems straightforward to extend such editing +capability into the video domain, there have been fewer works regarding +text-guided video inpainting. Given a video, a masked region at its initial +frame, and an editing prompt, it requires a model to do infilling at each frame +following the editing guidance while keeping the out-of-mask region intact. +There are three main challenges in text-guided video inpainting: ($i$) temporal +consistency of the edited video, ($ii$) supporting different inpainting types +at different structural fidelity levels, and ($iii$) dealing with variable +video length. To address these challenges, we introduce Any-Length Video +Inpainting with Diffusion Model, dubbed as AVID. At its core, our model is +equipped with effective motion modules and adjustable structure guidance, for +fixed-length video inpainting. Building on top of that, we propose a novel +Temporal MultiDiffusion sampling pipeline with a middle-frame attention +guidance mechanism, facilitating the generation of videos with any desired +duration. Our comprehensive experiments show our model can robustly deal with +various inpainting types at different video duration ranges, with high quality. +More visualization results are made publicly available at +https://zhang-zx.github.io/AVID/ . + +
+
+ comment: Project website: https://zhang-zx.github.io/AVID/ +
+
+
+
+
+ + ♻ ☆ Beyond Specialization: Assessing the Capabilities of MLLMs in Age and + Gender Estimation + + +
+ Multimodal Large Language Models (MLLMs) have recently gained immense +popularity. Powerful commercial models like ChatGPT-4V and Gemini, as well as +open-source ones such as LLaVA, are essentially general-purpose models and are +applied to solve a wide variety of tasks, including those in computer vision. +These neural networks possess such strong general knowledge and reasoning +abilities that they have proven capable of working even on tasks for which they +were not specifically trained. We compared the capabilities of the most +powerful MLLMs to date: ShareGPT4V, ChatGPT, LLaVA-Next in a specialized task +of age and gender estimation with our state-of-the-art specialized model, +MiVOLO. We also updated MiVOLO and provide details and new metrics in this +article. This comparison has yielded some interesting results and insights +about the strengths and weaknesses of the participating models. Furthermore, we +attempted various ways to fine-tune the ShareGPT4V model for this specific +task, aiming to achieve state-of-the-art results in this particular challenge. +Although such a model would not be practical in production, as it is incredibly +expensive compared to a specialized model like MiVOLO, it could be very useful +in some tasks, like data annotation. + +
+
+
+
+
+ + ♻ ☆ PhotoBot: Reference-Guided Interactive Photography via Natural Language IROS'24 + + +
+ We introduce PhotoBot, a framework for fully automated photo acquisition +based on an interplay between high-level human language guidance and a robot +photographer. We propose to communicate photography suggestions to the user via +reference images that are selected from a curated gallery. We leverage a visual +language model (VLM) and an object detector to characterize the reference +images via textual descriptions and then use a large language model (LLM) to +retrieve relevant reference images based on a user's language query through +text-based reasoning. To correspond the reference image and the observed scene, +we exploit pre-trained features from a vision transformer capable of capturing +semantic similarity across marked appearance variations. Using these features, +we compute pose adjustments for an RGB-D camera by solving a +perspective-n-point (PnP) problem. We demonstrate our approach using a +manipulator equipped with a wrist camera. Our user studies show that photos +taken by PhotoBot are often more aesthetically pleasing than those taken by +users themselves, as measured by human feedback. We also show that PhotoBot can +generalize to other reference sources such as paintings. + +
+
+ comment: Submitted to the IEEE/RSJ International Conference on Intelligent + Robotics and Systems (IROS'24), Abu Dhabi, UAE, Oct 14-18, 2024 +
+
+
+
+
+ + ♻ ☆ MRC-Net: 6-DoF Pose Estimation with MultiScale Residual Correlation CVPR 2024 + + +
+ We propose a single-shot approach to determining 6-DoF pose of an object with +available 3D computer-aided design (CAD) model from a single RGB image. Our +method, dubbed MRC-Net, comprises two stages. The first performs pose +classification and renders the 3D object in the classified pose. The second +stage performs regression to predict fine-grained residual pose within class. +Connecting the two stages is a novel multi-scale residual correlation (MRC) +layer that captures high-and-low level correspondences between the input image +and rendering from first stage. MRC-Net employs a Siamese network with shared +weights between both stages to learn embeddings for input and rendered images. +To mitigate ambiguity when predicting discrete pose class labels on symmetric +objects, we use soft probabilistic labels to define pose class in the first +stage. We demonstrate state-of-the-art accuracy, outperforming all competing +RGB-based methods on four challenging BOP benchmark datasets: T-LESS, LM-O, +YCB-V, and ITODD. Our method is non-iterative and requires no complex +post-processing. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ How to Handle Sketch-Abstraction in Sketch-Based Image Retrieval? CVPR 2024 + + +
+ In this paper, we propose a novel abstraction-aware sketch-based image +retrieval framework capable of handling sketch abstraction at varied levels. +While prior works mainly focused on tackling sub-factors such as drawing style +and order, we instead attempt to model abstraction as a whole, and propose +feature-level and retrieval granularity-level designs so that the system builds +into its DNA the necessary means to interpret abstraction. On learning +abstraction-aware features, we for the first time harness the rich semantic +embedding of a pre-trained StyleGAN model, together with a novel +abstraction-level mapper that deciphers the level of abstraction and +dynamically selects appropriate dimensions in the feature matrix +correspondingly, to construct a feature matrix embedding that can be freely +traversed to accommodate different levels of abstraction. For granularity-level +abstraction understanding, we dictate that the retrieval model should not treat +all abstraction-levels equally and introduce a differentiable surrogate Acc.@q +loss to inject that understanding into the system. Different to the +gold-standard triplet loss, our Acc.@q loss uniquely allows a sketch to +narrow/broaden its focus in terms of how stringent the evaluation should be - +the more abstract a sketch, the less stringent (higher q). Extensive +experiments show that our method outperforms existing state-of-the-art methods in +standard SBIR tasks along with challenging scenarios like early retrieval, +forensic sketch-photo matching, and style-invariant retrieval. +
+
+ comment: Accepted in CVPR 2024. Project page available at + https://subhadeepkoley.github.io/AbstractAway +
+
+
+
+
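+ A generic differentiable surrogate in the spirit of the Acc.@q idea (not the paper's exact loss): the soft rank of the matching photo is obtained with a sigmoid relaxation of the hard count and only penalized once it exceeds q, so a larger q (a more abstract sketch) is judged less stringently.
+
+ ```python
+ import torch
+
+ def soft_acc_at_q_loss(sim_pos, sim_negs, q, tau=0.05):
+     """sim_pos: similarity to the true photo; sim_negs: similarities to the
+     distractor gallery. Sketch of a smooth-ranking surrogate."""
+     soft_rank = torch.sigmoid((sim_negs - sim_pos) / tau).sum()
+     return torch.relu(soft_rank - (q - 1))       # zero once within the top-q
+
+ sim_pos = torch.tensor(0.62, requires_grad=True)
+ sim_negs = torch.rand(99) * 0.8                  # 99 hypothetical distractors
+ print(soft_acc_at_q_loss(sim_pos, sim_negs, q=5))
+ ```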
+ + ♻ ☆ Text-to-Image Diffusion Models are Great Sketch-Photo Matchmakers CVPR 2024 + + +
+ This paper, for the first time, explores text-to-image diffusion models for +Zero-Shot Sketch-based Image Retrieval (ZS-SBIR). We highlight a pivotal +discovery: the capacity of text-to-image diffusion models to seamlessly bridge +the gap between sketches and photos. This proficiency is underpinned by their +robust cross-modal capabilities and shape bias, findings that are substantiated +through our pilot studies. In order to harness pre-trained diffusion models +effectively, we introduce a straightforward yet powerful strategy focused on +two key aspects: selecting optimal feature layers and utilising visual and +textual prompts. For the former, we identify which layers are most enriched +with information and are best suited for the specific retrieval requirements +(category-level or fine-grained). Then we employ visual and textual prompts to +guide the model's feature extraction process, enabling it to generate more +discriminative and contextually relevant cross-modal representations. Extensive +experiments on several benchmark datasets validate significant performance +improvements. + +
+
+ comment: Accepted in CVPR 2024. Project page available at + https://subhadeepkoley.github.io/DiffusionZSSBIR +
+
+
+
+
+ + ♻ ☆ You'll Never Walk Alone: A Sketch and Text Duet for Fine-Grained Image + Retrieval CVPR 2024 + + +
+ Two primary input modalities prevail in image retrieval: sketch and text. +While text is widely used for inter-category retrieval tasks, sketches have +been established as the sole preferred modality for fine-grained image +retrieval due to their ability to capture intricate visual details. In this +paper, we question the reliance on sketches alone for fine-grained image +retrieval by simultaneously exploring the fine-grained representation +capabilities of both sketch and text, orchestrating a duet between the two. The +end result enables precise retrievals previously unattainable, allowing users +to pose ever-finer queries and incorporate attributes like colour and +contextual cues from text. For this purpose, we introduce a novel +compositionality framework, effectively combining sketches and text using +pre-trained CLIP models, while eliminating the need for extensive fine-grained +textual descriptions. Last but not least, our system extends to novel +applications in composed image retrieval, domain attribute transfer, and +fine-grained generation, providing solutions for various real-world scenarios. + +
+
+ comment: Accepted in CVPR 2024. Project page available at + https://subhadeepkoley.github.io/Sketch2Word +
+
+
+
+
+ + ♻ ☆ It's All About Your Sketch: Democratising Sketch Control in Diffusion + Models CVPR 2024 + + +
+ This paper unravels the potential of sketches for diffusion models, +addressing the deceptive promise of direct sketch control in generative AI. We +importantly democratise the process, enabling amateur sketches to generate +precise images, living up to the commitment of "what you sketch is what you +get". A pilot study underscores the necessity, revealing that deformities in +existing models stem from spatial-conditioning. To rectify this, we propose an +abstraction-aware framework, utilising a sketch adapter, adaptive time-step +sampling, and discriminative guidance from a pre-trained fine-grained +sketch-based image retrieval model, working synergistically to reinforce +fine-grained sketch-photo association. Our approach operates seamlessly during +inference without the need for textual prompts; a simple, rough sketch akin to +what you and I can create suffices! We welcome everyone to examine results +presented in the paper and its supplementary. Contributions include +democratising sketch control, introducing an abstraction-aware framework, and +leveraging discriminative guidance, validated through extensive experiments. + +
+
+ comment: Accepted in CVPR 2024. Project page available at + https://subhadeepkoley.github.io/StableSketching +
+
+
+
+
+ + ♻ ☆ Discover and Mitigate Multiple Biased Subgroups in Image Classifiers CVPR 2024 + + +
+ Machine learning models can perform well on in-distribution data but often +fail on biased subgroups that are underrepresented in the training data, +hindering the robustness of models for reliable applications. Such subgroups +are typically unknown due to the absence of subgroup labels. Discovering biased +subgroups is the key to understanding models' failure modes and further +improving models' robustness. Most previous works of subgroup discovery make an +implicit assumption that models only underperform on a single biased subgroup, +which does not hold on in-the-wild data where multiple biased subgroups exist. + In this work, we propose Decomposition, Interpretation, and Mitigation (DIM), +a novel method to address a more challenging but also more practical problem of +discovering multiple biased subgroups in image classifiers. Our approach +decomposes the image features into multiple components that represent multiple +subgroups. This decomposition is achieved via a bilinear dimension reduction +method, Partial Least Square (PLS), guided by useful supervision from the image +classifier. We further interpret the semantic meaning of each subgroup +component by generating natural language descriptions using vision-language +foundation models. Finally, DIM mitigates multiple biased subgroups +simultaneously via two strategies, including the data- and model-centric +strategies. Extensive experiments on CIFAR-100 and Breeds datasets demonstrate +the effectiveness of DIM in discovering and mitigating multiple biased +subgroups. Furthermore, DIM uncovers the failure modes of the classifier on +Hard ImageNet, showcasing its broader applicability to understanding model bias +in image classifiers. The code is available at +https://github.com/ZhangAIPI/DIM. + +
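+
+ The PLS-guided decomposition described above can be approximated in a few lines
+with scikit-learn; the feature source, the supervision signal, and the number of
+components below are placeholder assumptions for illustration only, not the
+authors' exact pipeline.
+
+    import numpy as np
+    from sklearn.cross_decomposition import PLSRegression
+
+    # Placeholder data: penultimate-layer features and a supervision signal from
+    # the classifier (e.g. per-class logits); in practice both come from the model.
+    features = np.random.randn(1000, 512)
+    supervision = np.random.randn(1000, 10)
+
+    pls = PLSRegression(n_components=4)     # one latent direction per hypothesised subgroup
+    pls.fit(features, supervision)
+    directions = pls.x_weights_             # (512, 4) subgroup directions in feature space
+    scores = features @ directions          # (1000, 4) per-sample subgroup scores
+    candidates = np.argsort(scores[:, 0])[-50:]  # samples most aligned with the first subgroup
+
+ Each component's top-scoring samples would then be described with a
+vision-language model and used for the data- or model-centric mitigation step.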
+
+ comment: CVPR 2024. Code is available at https://github.com/ZhangAIPI/DIM +
+
+
+
+
+ + ♻ ☆ Align before Adapt: Leveraging Entity-to-Region Alignments for + Generalizable Video Action Recognition CVPR 2024 + + +
+ Large-scale visual-language pre-trained models have achieved significant +success in various video tasks. However, most existing methods follow an "adapt +then align" paradigm, which adapts pre-trained image encoders to model +video-level representations and utilizes one-hot or text embedding of the +action labels for supervision. This paradigm overlooks the challenge of mapping +from static images to complicated activity concepts. In this paper, we propose +a novel "Align before Adapt" (ALT) paradigm. Prior to adapting to video +representation learning, we exploit the entity-to-region alignments for each +frame. The alignments are fulfilled by matching the region-aware image +embeddings to an offline-constructed text corpus. With the aligned entities, we +feed their text embeddings to a transformer-based video adapter as the queries, +which can help extract the semantics of the most important entities from a +video to a vector. This paradigm reuses the visual-language alignment of VLP +during adaptation and tries to explain an action by the underlying entities. +This helps understand actions by bridging the gap with complex activity +semantics, particularly when facing unfamiliar or unseen categories. ALT +demonstrates competitive performance while maintaining remarkably low +computational costs. In fully supervised experiments, it achieves 88.1% top-1 +accuracy on Kinetics-400 with only 4947 GFLOPs. Moreover, ALT outperforms the +previous state-of-the-art methods in both zero-shot and few-shot experiments, +emphasizing its superior generalizability across various learning scenarios. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Towards a Perceptual Evaluation Framework for Lighting Estimation + + +
+ Progress in lighting estimation is tracked by computing existing image +quality assessment (IQA) metrics on images from standard datasets. While this +may appear to be a reasonable approach, we demonstrate that doing so does not +correlate to human preference when the estimated lighting is used to relight a +virtual scene into a real photograph. To study this, we design a controlled +psychophysical experiment where human observers must choose their preference +amongst rendered scenes lit using a set of lighting estimation algorithms +selected from the recent literature, and use it to analyse how these algorithms +perform according to human perception. Then, we demonstrate that none of the +most popular IQA metrics from the literature, taken individually, correctly +represent human perception. Finally, we show that by learning a combination of +existing IQA metrics, we can more accurately represent human preference. This +provides a new perceptual framework to help evaluate future lighting estimation +algorithms. + +
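+
+ The final step, learning a combination of existing IQA metrics that predicts
+human preference, can be pictured with a simple fitted model; the metric count,
+the data, and the choice of logistic regression below are placeholders, not the
+authors' exact procedure.
+
+    import numpy as np
+    from sklearn.linear_model import LogisticRegression
+
+    # Per-pair differences of several IQA metric scores between two renders,
+    # and whether observers preferred the first render (placeholder data).
+    X = np.random.rand(200, 5)
+    y = np.random.randint(0, 2, 200)
+
+    model = LogisticRegression().fit(X, y)
+    print(model.coef_)   # learned weighting of the individual IQA metrics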
+
+
+
+
+ + ♻ ☆ Deep Feature Consistent Variational Autoencoder WACV + + +
+ We present a novel method for constructing a Variational Autoencoder (VAE).
+Instead of using a pixel-by-pixel loss, we enforce deep feature consistency
+between the input and the output of the VAE, which ensures that the VAE's output
+preserves the spatial correlation characteristics of the input, leading to a
+more natural visual appearance and better perceptual quality. Based on recent
+deep learning work such as style transfer, we employ a pre-trained deep
+convolutional neural network (CNN) and use its hidden features to define a
+feature perceptual loss for VAE training. Evaluated on the CelebA face dataset,
+we show that our model produces better results than other methods in the
+literature. We also show that our method can produce latent vectors that capture
+the semantic information of face expressions and can be used to achieve
+state-of-the-art performance in facial attribute prediction.
+
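+
+ A minimal sketch of a feature perceptual loss of this kind, using torchvision's
+VGG-19 as the pre-trained CNN; the chosen layers and equal weighting are
+assumptions rather than the paper's configuration.
+
+    import torch
+    import torch.nn.functional as F
+    from torchvision.models import vgg19
+
+    vgg = vgg19(weights="IMAGENET1K_V1").features.eval()
+    for p in vgg.parameters():
+        p.requires_grad_(False)
+
+    def feature_perceptual_loss(x, x_rec, layers=(3, 8, 17)):
+        # Sum of MSEs between hidden VGG activations of input and reconstruction.
+        loss, hx, hr = 0.0, x, x_rec
+        for i, layer in enumerate(vgg):
+            hx, hr = layer(hx), layer(hr)
+            if i in layers:
+                loss = loss + F.mse_loss(hr, hx)
+            if i >= max(layers):
+                break
+        return loss
+
+ The total VAE objective would then combine this term with the usual KL
+divergence regulariser on the latent distribution.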
+
+ comment: WACV +
+
+
+
+
+ + ♻ ☆ LISNeRF Mapping: LiDAR-based Implicit Mapping via Semantic Neural Fields + for Large-Scale 3D Scenes + + +
+ Large-scale semantic mapping is crucial for outdoor autonomous agents to
+fulfill high-level tasks such as planning and navigation. This paper proposes a
+novel method for large-scale 3D semantic reconstruction through implicit
+representations from posed LiDAR measurements alone. We first leverage an
+octree-based, hierarchical structure to store implicit features; these implicit
+features are then decoded into semantic information and signed distance values
+through shallow Multilayer Perceptrons (MLPs). We adopt off-the-shelf algorithms
+to predict the semantic labels and instance IDs of point clouds. We then jointly
+optimize the feature embeddings and MLP parameters with a self-supervision
+paradigm for point cloud geometry and a pseudo-supervision paradigm for semantic
+and panoptic labels. Subsequently, categories and geometric structures for novel
+points are regressed, and marching cubes is used to subdivide and visualize the
+scenes at inference time. For scenarios with memory constraints, a map-stitching
+strategy is also developed to merge sub-maps into a complete map. Experiments on
+two real-world datasets, SemanticKITTI and SemanticPOSS, demonstrate the
+superior segmentation efficiency and mapping effectiveness of our framework
+compared to current state-of-the-art 3D LiDAR mapping methods.
+
+
+
+
+
+ + ♻ ☆ PanoDiffusion: 360-degree Panorama Outpainting via Diffusion + + +
+ Generating complete 360-degree panoramas from narrow field of view images is +ongoing research as omnidirectional RGB data is not readily available. Existing +GAN-based approaches face some barriers to achieving higher quality output, and +have poor generalization performance over different mask types. In this paper, +we present our 360-degree indoor RGB-D panorama outpainting model using latent +diffusion models (LDM), called PanoDiffusion. We introduce a new bi-modal +latent diffusion structure that utilizes both RGB and depth panoramic data +during training, which works surprisingly well to outpaint depth-free RGB +images during inference. We further propose a novel technique of introducing +progressive camera rotations during each diffusion denoising step, which leads +to substantial improvement in achieving panorama wraparound consistency. +Results show that our PanoDiffusion not only significantly outperforms +state-of-the-art methods on RGB-D panorama outpainting by producing diverse +well-structured results for different types of masks, but can also synthesize +high-quality depth panoramas to provide realistic 3D indoor models. + +
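+
+ One way to picture the progressive-rotation technique is as a horizontal roll
+of the equirectangular latent at every denoising step, so the seam keeps landing
+in different positions. The scheduler/UNet interface below is an assumed,
+simplified one, not PanoDiffusion's actual code.
+
+    import torch
+
+    def denoise_with_rotation(latent, unet, scheduler):
+        # latent: (B, C, H, W) equirectangular latent; rolling along W rotates the camera
+        num_steps = len(scheduler.timesteps)
+        shift = max(1, latent.shape[-1] // num_steps)
+        for t in scheduler.timesteps:
+            latent = torch.roll(latent, shifts=shift, dims=-1)   # progressive camera rotation
+            noise_pred = unet(latent, t)                          # assumed call signature
+            latent = scheduler.step(noise_pred, t, latent).prev_sample
+        return latent
+
+ Because the left and right borders are denoised in many different relative
+positions, the generated panorama is encouraged to wrap around consistently.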
+
+ comment: Project Page: https://sm0kywu.github.io/panodiffusion/ +
+
+
+
+
+ + ♻ ☆ CAGE: Controllable Articulation GEneration CVPR 2024 + + +
+ We address the challenge of generating 3D articulated objects in a +controllable fashion. Currently, modeling articulated 3D objects is either +achieved through laborious manual authoring, or using methods from prior work +that are hard to scale and control directly. We leverage the interplay between +part shape, connectivity, and motion using a denoising diffusion-based method +with attention modules designed to extract correlations between part +attributes. Our method takes an object category label and a part connectivity +graph as input and generates an object's geometry and motion parameters. The +generated objects conform to user-specified constraints on the object category, +part shape, and part articulation. Our experiments show that our method +outperforms the state-of-the-art in articulated object generation, producing +more realistic objects while conforming better to user constraints. + Video Summary at: http://youtu.be/cH_rbKbyTpE + +
+
+ comment: CVPR 2024. Project page: https://3dlg-hcvc.github.io/cage/ +
+
+
+
+
+ + ♻ ☆ Modeling Collaborator: Enabling Subjective Vision Classification With + Minimal Human Effort via LLM Tool-Use + + +
+ From content moderation to wildlife conservation, the number of applications +that require models to recognize nuanced or subjective visual concepts is +growing. Traditionally, developing classifiers for such concepts requires +substantial manual effort measured in hours, days, or even months to identify +and annotate data needed for training. Even with recently proposed Agile +Modeling techniques, which enable rapid bootstrapping of image classifiers, +users are still required to spend 30 minutes or more of monotonous, repetitive +data labeling just to train a single classifier. Drawing on Fiske's Cognitive +Miser theory, we propose a new framework that alleviates manual effort by +replacing human labeling with natural language interactions, reducing the total +effort required to define a concept by an order of magnitude: from labeling +2,000 images to only 100 plus some natural language interactions. Our framework +leverages recent advances in foundation models, both large language models and +vision-language models, to carve out the concept space through conversation and +by automatically labeling training data points. Most importantly, our framework +eliminates the need for crowd-sourced annotations. Moreover, our framework +ultimately produces lightweight classification models that are deployable in +cost-sensitive scenarios. Across 15 subjective concepts and across 2 public +image classification datasets, our trained models outperform traditional Agile +Modeling as well as state-of-the-art zero-shot classification models like +ALIGN, CLIP, CuPL, and large visual question-answering models like PaLI-X. + +
+
+
+
+
+ + ♻ ☆ A Dataset and Benchmark for Copyright Protection from Text-to-Image + Diffusion Models + + +
+ Copyright is a legal right that grants creators the exclusive authority to
+reproduce, distribute, and profit from their creative works. However, recent
+advancements in text-to-image generation techniques have posed significant
+challenges to copyright protection, as these methods facilitate the learning of
+unauthorized content, artistic creations, and portraits, which are subsequently
+used to generate and disseminate uncontrolled content. In particular, the use of
+Stable Diffusion, an emerging model for text-to-image generation, poses an
+increased risk of unauthorized copyright infringement and distribution.
+Currently, there is a lack of systematic studies evaluating the potential
+correlation between content generated by Stable Diffusion and content under
+copyright protection. Conducting such studies faces several challenges,
+including i) the intrinsic ambiguity related to copyright infringement in
+text-to-image models, ii) the absence of a comprehensive large-scale dataset,
+and iii) the lack of standardized metrics for defining copyright infringement.
+This work provides the first large-scale standardized dataset and benchmark on
+copyright protection. Specifically, we propose a pipeline to coordinate CLIP,
+ChatGPT, and diffusion models to generate a dataset that contains anchor images,
+corresponding prompts, and images generated by text-to-image models, reflecting
+the potential abuses of copyright. Furthermore, we explore a suite of evaluation
+metrics to judge the effectiveness of copyright protection methods. The proposed
+dataset, benchmark library, and evaluation metrics will be open-sourced to
+facilitate future research and application. The website and dataset are
+publicly accessible.
+
+
+ comment: Improve experimental content +
+
+
+
+
+ + ♻ ☆ Tackling the Singularities at the Endpoints of Time Intervals in + Diffusion Models CVPR2024 + + +
+ Most diffusion models assume that the reverse process adheres to a Gaussian
+distribution. However, this approximation has not been rigorously validated,
+especially at the singularities at t=0 and t=1. Improperly dealing with such
+singularities leads to an average-brightness issue in applications and limits
+the generation of images with extreme brightness or darkness. We primarily focus
+on tackling these singularities from both theoretical and practical
+perspectives. Initially, we establish error bounds for the reverse-process
+approximation and showcase its Gaussian characteristics at the singular time
+steps. Based on this theoretical insight, we confirm that the singularity at t=1
+is conditionally removable, while the one at t=0 is an inherent property.
+Building on these conclusions, we propose a novel plug-and-play method,
+SingDiffusion, to address sampling at the initial singular time step, which not
+only effectively resolves the average-brightness issue for a wide range of
+diffusion models without extra training effort, but also enhances their
+generation capability, achieving notably lower FID scores.
+
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ Don't Blame the Annotator: Bias Already Starts in the Annotation + Instructions EACL 2023 + + +
+ In recent years, progress in NLU has been driven by benchmarks. These +benchmarks are typically collected by crowdsourcing, where annotators write +examples based on annotation instructions crafted by dataset creators. In this +work, we hypothesize that annotators pick up on patterns in the crowdsourcing +instructions, which bias them to write many similar examples that are then +over-represented in the collected data. We study this form of bias, termed +instruction bias, in 14 recent NLU benchmarks, showing that instruction +examples often exhibit concrete patterns, which are propagated by crowdworkers +to the collected data. This extends previous work (Geva et al., 2019) and +raises a new concern of whether we are modeling the dataset creator's +instructions, rather than the task. Through a series of experiments, we show +that, indeed, instruction bias can lead to overestimation of model performance, +and that models struggle to generalize beyond biases originating in the +crowdsourcing instructions. We further analyze the influence of instruction +bias in terms of pattern frequency and model size, and derive concrete +recommendations for creating future NLU benchmarks. + +
+
+ comment: EACL 2023 (Outstanding Paper Award) +
+
+
+
+
+ + ♻ ☆ LDM-ISP: Enhancing Neural ISP for Low Light with Latent Diffusion Models + + +
+ Enhancing a low-light noisy RAW image into a well-exposed and clean sRGB +image is a significant challenge for modern digital cameras. Prior approaches +have difficulties in recovering fine-grained details and true colors of the +scene under extremely low-light environments due to near-to-zero SNR. +Meanwhile, diffusion models have shown significant progress towards general +domain image generation. In this paper, we propose to leverage the pre-trained +latent diffusion model to perform the neural ISP for enhancing extremely +low-light images. Specifically, to tailor the pre-trained latent diffusion +model to operate on the RAW domain, we train a set of lightweight taming +modules to inject the RAW information into the diffusion denoising process via +modulating the intermediate features of UNet. We further observe different +roles of UNet denoising and decoder reconstruction in the latent diffusion +model, which inspires us to decompose the low-light image enhancement task into +latent-space low-frequency content generation and decoding-phase high-frequency +detail maintenance. Through extensive experiments on representative datasets, +we demonstrate our simple design not only achieves state-of-the-art performance +in quantitative evaluations but also shows significant superiority in visual +comparisons over strong baselines, which highlight the effectiveness of +powerful generative priors for neural ISP under extremely low-light +environments. The project page is available at +https://csqiangwen.github.io/projects/ldm-isp/ + +
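+
+ The "lightweight taming modules" are described as injecting RAW information by
+modulating intermediate UNet features; a scale-and-shift (FiLM-style) modulation
+is one plausible reading, sketched below with assumed module and channel names
+rather than the paper's actual design.
+
+    import torch
+    import torch.nn as nn
+
+    class TamingModule(nn.Module):
+        # Modulates an intermediate UNet feature map with features from the RAW input.
+        def __init__(self, raw_channels, feat_channels):
+            super().__init__()
+            self.to_scale = nn.Conv2d(raw_channels, feat_channels, kernel_size=1)
+            self.to_shift = nn.Conv2d(raw_channels, feat_channels, kernel_size=1)
+
+        def forward(self, unet_feat, raw_feat):
+            # resize RAW features to the UNet feature resolution, then scale-and-shift
+            raw_feat = nn.functional.interpolate(raw_feat, size=unet_feat.shape[-2:])
+            return unet_feat * (1 + self.to_scale(raw_feat)) + self.to_shift(raw_feat)
+
+ Only these small modules would be trained, while the pre-trained latent
+diffusion UNet and decoder stay frozen.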
+
+
+
+
+ + ♻ ☆ View while Moving: Efficient Video Recognition in Long-untrimmed Videos ACM MM 2023 + + +
+ Recent adaptive methods for efficient video recognition mostly follow the
+two-stage paradigm of "preview-then-recognition" and have achieved great success
+on multiple video benchmarks. However, this two-stage paradigm involves two
+passes over the raw frames, from coarse-grained to fine-grained, during
+inference (which cannot be parallelized), and the captured spatiotemporal
+features cannot be reused in the second stage (due to varying granularity),
+which is unfriendly to efficiency and computation optimization. To this end,
+inspired by human cognition, we propose a novel recognition paradigm of "View
+while Moving" for efficient long-untrimmed video recognition. In contrast to the
+two-stage paradigm, our paradigm only needs to access the raw frames once. The
+two phases of coarse-grained sampling and fine-grained recognition are combined
+into unified spatiotemporal modeling, showing great performance. Moreover, we
+investigate the properties of semantic units in video and propose a hierarchical
+mechanism to efficiently capture and reason about unit-level and video-level
+temporal semantics in long-untrimmed videos, respectively. Extensive experiments
+on both long-untrimmed and short-trimmed videos demonstrate that our approach
+outperforms state-of-the-art methods in terms of accuracy as well as efficiency,
+yielding new efficiency-accuracy trade-offs for video spatiotemporal modeling.
+
+
+ comment: Published on ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ D-YOLO a robust framework for object detection in adverse weather + conditions + + +
+ Adverse weather conditions, including haze, snow, and rain, degrade image
+quality, which often causes a drop in the performance of deep-learning-based
+detection networks. Most existing approaches attempt to rectify hazy images
+before performing object detection, which increases the complexity of the
+network and may result in the loss of latent information. To better integrate
+the image restoration and object detection tasks, we design a double-route
+network with an attention feature fusion module that takes both hazy and dehazed
+features into consideration. We also propose a subnetwork to provide haze-free
+features to the detection network. Specifically, our D-YOLO improves the
+performance of the detection network by minimizing the distance between the
+clear-feature-extraction subnetwork and the detection network. Experiments on
+the RTTS and FoggyCityscapes datasets show that D-YOLO performs better than
+state-of-the-art methods. It is a robust detection framework for bridging the
+gap between low-level dehazing and high-level detection.
+
+
+ comment: Object detection in adverse weather conditions. arXiv admin note: + text overlap with arXiv:2209.01373 by other authors +
+
+
+
+
+ + ♻ ☆ Content-aware Masked Image Modeling Transformer for Stereo Image + Compression + + +
+ Existing learning-based stereo image codecs adopt sophisticated transforms but
+use simple entropy models derived from single-image codecs to encode latent
+representations. However, those entropy models struggle to effectively capture
+the spatial-disparity characteristics inherent in stereo images, which leads to
+suboptimal rate-distortion results. In this paper, we propose a stereo image
+compression framework, named CAMSIC. CAMSIC independently transforms each image
+into a latent representation and employs a powerful decoder-free Transformer
+entropy model to capture both spatial and disparity dependencies, by introducing
+a novel content-aware masked image modeling (MIM) technique. Our content-aware
+MIM facilitates efficient bidirectional interaction between prior information
+and estimated tokens, which naturally obviates the need for an extra Transformer
+decoder. Experiments show that our stereo image codec achieves state-of-the-art
+rate-distortion performance on two stereo image datasets, Cityscapes and
+InStereo2K, with fast encoding and decoding speeds.
+
+
+
+
+
+ + ♻ ☆ Leveraging Neural Radiance Field in Descriptor Synthesis for Keypoints + Scene Coordinate Regression + + +
+ Classical structure-based visual localization methods offer high accuracy but
+face trade-offs in terms of storage, speed, and privacy. A recent innovation,
+the keypoint scene coordinate regression (KSCR) method named D2S, addresses
+these issues by leveraging graph attention networks to enhance keypoint
+relationships and predict their 3D coordinates using a simple multilayer
+perceptron (MLP). The camera pose is then determined via PnP+RANSAC, using the
+established 2D-3D correspondences. While KSCR achieves competitive results,
+rivaling state-of-the-art image-retrieval methods such as HLoc across multiple
+benchmarks, its performance is hindered when data samples are limited, owing to
+the deep learning model's reliance on extensive data. This paper proposes a
+solution to this challenge by introducing a pipeline for keypoint descriptor
+synthesis using Neural Radiance Fields (NeRF). By generating novel poses and
+feeding them into a trained NeRF model to create new views, our approach
+enhances KSCR's generalization capabilities in data-scarce environments. The
+proposed system can improve localization accuracy by up to 50% while requiring
+only a fraction of the time for data synthesis. Furthermore, its modular design
+allows for the integration of multiple NeRFs, offering a versatile and efficient
+solution for visual localization. The implementation is publicly available at:
+https://github.com/ais-lab/DescriptorSynthesis4Feat2Map.
+
+
+
+
+
+ + ♻ ☆ UWFormer: Underwater Image Enhancement via a Semi-Supervised Multi-Scale + Transformer IJCNN 2024 + + +
+ Underwater images often exhibit poor quality, distorted color balance and low +contrast due to the complex and intricate interplay of light, water, and +objects. Despite the significant contributions of previous underwater +enhancement techniques, there exist several problems that demand further +improvement: (i) The current deep learning methods rely on Convolutional Neural +Networks (CNNs) that lack the multi-scale enhancement, and global perception +field is also limited. (ii) The scarcity of paired real-world underwater +datasets poses a significant challenge, and the utilization of synthetic image +pairs could lead to overfitting. To address the aforementioned problems, this +paper introduces a Multi-scale Transformer-based Network called UWFormer for +enhancing images at multiple frequencies via semi-supervised learning, in which +we propose a Nonlinear Frequency-aware Attention mechanism and a Multi-Scale +Fusion Feed-forward Network for low-frequency enhancement. Besides, we +introduce a special underwater semi-supervised training strategy, where we +propose a Subaqueous Perceptual Loss function to generate reliable pseudo +labels. Experiments using full-reference and non-reference underwater +benchmarks demonstrate that our method outperforms state-of-the-art methods in +terms of both quantity and visual quality. + +
+
+ comment: Accepted by IJCNN 2024 +
+
+
+
+
+ + ♻ ☆ Human Mesh Recovery from Arbitrary Multi-view Images + + +
+ Human mesh recovery from arbitrary multi-view images involves two
+characteristics: arbitrary camera poses and an arbitrary number of camera views.
+Because of this variability, designing a unified framework to tackle the task is
+challenging. The challenge can be summarized as the dilemma of simultaneously
+estimating arbitrary camera poses and recovering the human mesh from arbitrary
+multi-view images while maintaining flexibility. To solve this dilemma, we
+propose a divide-and-conquer framework for Unified Human Mesh Recovery (U-HMR)
+from arbitrary multi-view images. In particular, U-HMR consists of a decoupled
+structure, camera and body decoupling (CBD), and two main components: camera
+pose estimation (CPE) and arbitrary view fusion (AVF). As camera poses and the
+human body mesh are independent of each other, CBD splits their estimation into
+two sub-tasks handled by two individual sub-networks (i.e., CPE and AVF), so the
+two sub-tasks are disentangled. In CPE, since each camera pose is unrelated to
+the others, we adopt a shared MLP to process all views in parallel. In AVF, in
+order to fuse multi-view information and make the fusion operation independent
+of the number of views, we introduce a transformer decoder with an SMPL
+parameters query token to extract cross-view features for mesh recovery. To
+demonstrate the efficacy and flexibility of the proposed framework and the
+effect of each component, we conduct extensive experiments on three public
+datasets: Human3.6M, MPI-INF-3DHP, and TotalCapture.
+
+
+
+
+
+ + ♻ ☆ BA-SAM: Scalable Bias-Mode Attention Mask for Segment Anything Model CVPR + + +
+ In this paper, we address the challenge of image resolution variation for the +Segment Anything Model (SAM). SAM, known for its zero-shot generalizability, +exhibits a performance degradation when faced with datasets with varying image +sizes. Previous approaches tend to resize the image to a fixed size or adopt +structure modifications, hindering the preservation of SAM's rich prior +knowledge. Besides, such task-specific tuning necessitates a complete +retraining of the model, which is cost-expensive and unacceptable for +deployment in the downstream tasks. In this paper, we reformulate this issue as +a length extrapolation problem, where token sequence length varies while +maintaining a consistent patch size for images of different sizes. To this end, +we propose Scalable Bias-Mode Attention Mask (BA-SAM) to enhance SAM's +adaptability to varying image resolutions while eliminating the need for +structure modifications. Firstly, we introduce a new scaling factor to ensure +consistent magnitude in the attention layer's dot product values when the token +sequence length changes. Secondly, we present a bias-mode attention mask that +allows each token to prioritize neighboring information, mitigating the impact +of untrained distant information. Our BA-SAM demonstrates efficacy in two +scenarios: zero-shot and fine-tuning. Extensive evaluation on diverse datasets, +including DIS5K, DUTS, ISIC, COD10K, and COCO, reveals its ability to +significantly mitigate performance degradation in the zero-shot setting and +achieve state-of-the-art performance with minimal fine-tuning. Furthermore, we +propose a generalized model and benchmark, showcasing BA-SAM's generalizability +across all four datasets simultaneously. Code is available at +https://github.com/zongzi13545329/BA-SAM + +
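+
+ The two ingredients named in the abstract, a length-aware rescaling of the
+attention dot product and a neighbour-favouring additive mask, can be pictured
+with the rough sketch below. The specific scaling formula and bias shape are
+illustrative assumptions and do not reproduce BA-SAM's exact definitions.
+
+    import math
+    import torch
+
+    def ba_style_attention(q, k, v, train_len):
+        # q, k, v: (B, L, D); both tweaks below are illustrative approximations.
+        B, L, D = q.shape
+        # length-aware rescaling so dot-product magnitude matches the training length
+        scale = math.log(train_len) / (math.log(L) * math.sqrt(D))
+        attn = (q @ k.transpose(-2, -1)) * scale
+        # bias-mode mask: each token prioritises nearby tokens over distant ones
+        idx = torch.arange(L, device=q.device)
+        dist = (idx[None, :] - idx[:, None]).abs().float()
+        attn = attn - torch.log1p(dist)
+        return torch.softmax(attn, dim=-1) @ v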
+
+ comment: Accepted to IEEE/CVF Conference on Computer Vision and Pattern + Recognition (CVPR), 2024 +
+
+
+
+
+ + ♻ ☆ GaussNav: Gaussian Splatting for Visual Navigation + + +
+ In embodied vision, Instance ImageGoal Navigation (IIN) requires an agent to +locate a specific object depicted in a goal image within an unexplored +environment. The primary difficulty of IIN stems from the necessity of +recognizing the target object across varying viewpoints and rejecting potential +distractors. + Existing map-based navigation methods largely adopt the representation form +of Bird's Eye View (BEV) maps, which, however, lack the representation of +detailed textures in a scene. + To address the above issues, we propose a new Gaussian Splatting Navigation +(abbreviated as GaussNav) framework for IIN task, which constructs a novel map +representation based on 3D Gaussian Splatting (3DGS). + The proposed framework enables the agent to not only memorize the geometry +and semantic information of the scene, but also retain the textural features of +objects. + Our GaussNav framework demonstrates a significant leap in performance, +evidenced by an increase in Success weighted by Path Length (SPL) from 0.252 to +0.578 on the challenging Habitat-Matterport 3D (HM3D) dataset. + Our code will be made publicly available. + +
+
+ comment: conference +
+
+
+
+
+ + ♻ ☆ Data-Efficient Contrastive Language-Image Pretraining: Prioritizing Data + Quality over Quantity AISTATS 2024 + + +
+ Contrastive Language-Image Pre-training (CLIP) on large-scale image-caption
+datasets learns representations that can achieve remarkable zero-shot
+generalization. However, such models require a massive amount of pre-training
+data. Improving the quality of the pre-training data has been shown to be much
+more effective in improving CLIP's performance than increasing its volume.
+Nevertheless, finding small subsets of training data that provably generalize
+the best has remained an open question. In this work, we propose the first
+theoretically rigorous data selection method for CLIP. We show that subsets that
+closely preserve the cross-covariance of the images and captions of the full
+data provably achieve superior generalization performance. Our extensive
+experiments on ConceptualCaptions3M and ConceptualCaptions12M demonstrate that
+subsets found by our method achieve over 2.7x and 1.4x the accuracy of the next
+best baseline on ImageNet and its shifted versions. Moreover, we show that our
+subsets obtain 1.5x the average accuracy of the next best baseline across 11
+downstream datasets. The code is available at:
+https://github.com/BigML-CS-UCLA/clipcov-data-efficient-clip.
+
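+
+ The selection criterion, keeping a subset whose image-caption cross-covariance
+stays close to that of the full data, can be pictured with a naive greedy
+procedure. This sketch only illustrates the objective; it is not the paper's
+provably optimal algorithm, and all names and sizes are placeholders.
+
+    import numpy as np
+
+    def greedy_crosscov_subset(img_emb, txt_emb, k):
+        # img_emb, txt_emb: (N, D) paired, normalised image / caption embeddings
+        n = img_emb.shape[0]
+        target = img_emb.T @ txt_emb / n          # cross-covariance of the full data
+        chosen, running = [], np.zeros_like(target)
+        for _ in range(k):
+            best, best_err = -1, np.inf
+            for i in range(n):
+                if i in chosen:
+                    continue
+                cand = (running + np.outer(img_emb[i], txt_emb[i])) / (len(chosen) + 1)
+                err = np.linalg.norm(cand - target)   # distance to the full cross-covariance
+                if err < best_err:
+                    best, best_err = i, err
+            chosen.append(best)
+            running += np.outer(img_emb[best], txt_emb[best])
+        return chosen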
+
+ comment: AISTATS 2024, Code: + https://github.com/BigML-CS-UCLA/clipcov-data-efficient-clip +
+
+
+
+
+ + ♻ ☆ A Dual-Augmentor Framework for Domain Generalization in 3D Human Pose + Estimation CVPR 2024 + + +
+ 3D human pose data collected in controlled laboratory settings present
+challenges for pose estimators that generalize across diverse scenarios. To
+address this, domain generalization is employed. Current methodologies in
+domain generalization for 3D human pose estimation typically utilize
+adversarial training to generate synthetic poses for training. Nonetheless,
+these approaches exhibit several limitations. First, the lack of prior
+information about the target domain complicates the application of suitable
+augmentation through a single pose augmentor, affecting generalization on
+target domains. Moreover, adversarial training's discriminator tends to enforce
+similarity between source and synthesized poses, impeding the exploration of
+out-of-source distributions. Furthermore, the pose estimator's optimization is
+not exposed to domain shifts, limiting its overall generalization ability.
+ To address these limitations, we propose a novel framework featuring two pose
+augmentors: the weak and the strong augmentors. Our framework employs
+differential strategies for generation and discrimination processes,
+facilitating the preservation of knowledge related to source poses and the
+exploration of out-of-source distributions without prior information about
+target poses. Besides, we leverage meta-optimization to simulate domain shifts
+in the optimization process of the pose estimator, thereby improving its
+generalization ability. Our proposed approach significantly outperforms
+existing methods, as demonstrated through comprehensive experiments on various
+benchmark datasets. Our code will be released at
+https://github.com/davidpengucf/DAF-DG.
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Just Add $100 More: Augmenting NeRF-based Pseudo-LiDAR Point Cloud for + Resolving Class-imbalance Problem + + +
+ Typical LiDAR-based 3D object detection models are trained in a supervised +manner with real-world data collection, which is often imbalanced over classes +(or long-tailed). To deal with it, augmenting minority-class examples by +sampling ground truth (GT) LiDAR points from a database and pasting them into a +scene of interest is often used, but challenges still remain: inflexibility in +locating GT samples and limited sample diversity. In this work, we propose to +leverage pseudo-LiDAR point clouds generated (at a low cost) from videos +capturing a surround view of miniatures or real-world objects of minor classes. +Our method, called Pseudo Ground Truth Augmentation (PGT-Aug), consists of +three main steps: (i) volumetric 3D instance reconstruction using a 2D-to-3D +view synthesis model, (ii) object-level domain alignment with LiDAR intensity +estimation and (iii) a hybrid context-aware placement method from ground and +map information. We demonstrate the superiority and generality of our method +through performance improvements in extensive experiments conducted on three +popular benchmarks, i.e., nuScenes, KITTI, and Lyft, especially for the +datasets with large domain gaps captured by different LiDAR configurations. Our +code and data will be publicly available upon publication. + +
+
+ comment: 28 pages, 12 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ GeoScaler: Geometry and Rendering-Aware Downsampling of 3D Mesh Textures + + +
+ High-resolution texture maps are necessary for representing real-world +objects accurately with 3D meshes. The large sizes of textures can bottleneck +the real-time rendering of high-quality virtual 3D scenes on devices having low +computational budgets and limited memory. Downsampling the texture maps +directly addresses the issue, albeit at the cost of visual fidelity. +Traditionally, downsampling of texture maps is performed using methods like +bicubic interpolation and the Lanczos algorithm. These methods ignore the +geometric layout of the mesh and its UV parametrization and also do not account +for the rendering process used to obtain the final visualization that the users +will experience. Towards filling these gaps, we introduce GeoScaler, which is a +method of downsampling texture maps of 3D meshes while incorporating geometric +cues, and by maximizing the visual fidelity of the rendered views of the +textured meshes. We show that the textures generated by GeoScaler deliver +significantly better quality rendered images compared to those generated by +traditional downsampling methods + +
+
+
+
+
+ + ♻ ☆ Mamba-ND: Selective State Space Modeling for Multi-Dimensional Data + + +
+ In recent years, Transformers have become the de-facto architecture for +sequence modeling on text and a variety of multi-dimensional data, such as +images and video. However, the use of self-attention layers in a Transformer +incurs prohibitive compute and memory complexity that scales quadratically +w.r.t. the sequence length. A recent architecture, Mamba, based on state space +models has been shown to achieve comparable performance for modeling text +sequences, while scaling linearly with the sequence length. In this work, we +present Mamba-ND, a generalized design extending the Mamba architecture to +arbitrary multi-dimensional data. Our design alternatively unravels the input +data across different dimensions following row-major orderings. We provide a +systematic comparison of Mamba-ND with several other alternatives, based on +prior multi-dimensional extensions such as Bi-directional LSTMs and S4ND. +Empirically, we show that Mamba-ND demonstrates performance competitive with +the state-of-the-art on a variety of multi-dimensional benchmarks, including +ImageNet-1K classification, HMDB-51 action recognition, and ERA5 weather +forecasting. + +
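+
+ The alternating-ordering idea, flattening the same multi-dimensional input
+along different dimensions so successive 1D state-space blocks see it in
+different orders, can be pictured with the small sketch below; the exact set of
+orderings and how their outputs are combined are assumptions for illustration.
+
+    import torch
+
+    def alternate_orderings(x):
+        # x: (B, H, W, C) input; each sequence is a different row-major flattening
+        B, H, W, C = x.shape
+        seq_hw = x.reshape(B, H * W, C)                          # rows first
+        seq_wh = x.permute(0, 2, 1, 3).reshape(B, H * W, C)      # columns first
+        return [seq_hw, seq_hw.flip(1), seq_wh, seq_wh.flip(1)]  # forward/backward per axis
+
+    # Each sequence would be fed through a 1D selective state-space block, then the
+    # outputs reshaped back to (B, H, W, C) and combined across orderings.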
+
+ comment: 22 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ LingoQA: Video Question Answering for Autonomous Driving + + +
+ Autonomous driving has long faced a challenge with public acceptance due to +the lack of explainability in the decision-making process. Video +question-answering (QA) in natural language provides the opportunity for +bridging this gap. Nonetheless, evaluating the performance of Video QA models +has proved particularly tough due to the absence of comprehensive benchmarks. +To fill this gap, we introduce LingoQA, a benchmark specifically for autonomous +driving Video QA. The LingoQA trainable metric demonstrates a 0.95 Spearman +correlation coefficient with human evaluations. We introduce a Video QA dataset +of central London consisting of 419k samples that we release with the paper. We +establish a baseline vision-language model and run extensive ablation studies +to understand its performance. + +
+
+ comment: Benchmark and dataset are available at + https://github.com/wayveai/LingoQA/ +
+
+
+
+
+ + ♻ ☆ Malaria Parasitic Detection using a New Deep Boosted and Ensemble + Learning Framework + + +
+ Malaria is a potentially fatal disease caused by Plasmodium parasites, which
+are transmitted by female Anopheles mosquitoes, infect red blood cells, and
+affect millions of people worldwide every year. However, manual screening by
+specialists in clinical practice is laborious and prone to error. Therefore, a
+novel Deep Boosted and Ensemble Learning (DBEL) framework, comprising the
+stacking of new Boosted-BR-STM convolutional neural networks (CNNs) and an
+ensemble of ML classifiers, is developed to screen malaria parasite images. The
+proposed Boosted-BR-STM is based on new dilated-convolutional-block-based split
+transform merge (STM) and feature-map Squeezing-Boosting (SB) ideas. Moreover,
+the new STM block uses regional and boundary operations to learn the malaria
+parasite's homogeneity, heterogeneity, and boundary patterns. Furthermore,
+diverse boosted channels are attained by employing transfer-learning-based new
+feature-map SB in STM blocks at the abstract, medium, and conclusion levels to
+learn minute intensity and texture variations of the parasitic pattern. The
+proposed DBEL framework involves the stacking of prominent and diverse boosted
+channels and provides the generated discriminative features of the developed
+Boosted-BR-STM to the ensemble of ML classifiers. The proposed framework
+improves the discrimination ability and generalization of ensemble learning.
+Moreover, the deep feature spaces of the developed Boosted-BR-STM and customized
+CNNs are fed into ML classifiers for comparative analysis. The proposed DBEL
+framework outperforms existing techniques on the NIH malaria dataset, which is
+enhanced using the discrete wavelet transform to enrich the feature space. The
+proposed DBEL framework achieved accuracy (98.50%), sensitivity (0.9920),
+F-score (0.9850), and AUC (0.997), suggesting its suitability for malaria
+parasite screening.
+
+
+ comment: 26 pages, 10 figures, 9 Tables +
+
+
+
+
+ + ♻ ☆ Precipitation Downscaling with Spatiotemporal Video Diffusion + + +
+ In climate science and meteorology, high-resolution local precipitation (rain +and snowfall) predictions are limited by the computational costs of +simulation-based methods. Statistical downscaling, or super-resolution, is a +common workaround where a low-resolution prediction is improved using +statistical approaches. Unlike traditional computer vision tasks, weather and +climate applications require capturing the accurate conditional distribution of +high-resolution given low-resolution patterns to assure reliable ensemble +averages and unbiased estimates of extreme events, such as heavy rain. This +work extends recent video diffusion models to precipitation super-resolution, +employing a deterministic downscaler followed by a temporally-conditioned +diffusion model to capture noise characteristics and high-frequency patterns. +We test our approach on FV3GFS output, an established large-scale global +atmosphere model, and compare it against five state-of-the-art baselines. Our +analysis, capturing CRPS, MSE, precipitation distributions, and qualitative +aspects using California and the Himalayas as examples, establishes our method +as a new standard for data-driven precipitation downscaling. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 243 + +
+
+
+ + ☆ Wear-Any-Way: Manipulable Virtual Try-on via Sparse Correspondence + Alignment + + +
+ This paper introduces a novel framework for virtual try-on, termed +Wear-Any-Way. Different from previous methods, Wear-Any-Way is a customizable +solution. Besides generating high-fidelity results, our method supports users +to precisely manipulate the wearing style. To achieve this goal, we first +construct a strong pipeline for standard virtual try-on, supporting +single/multiple garment try-on and model-to-model settings in complicated +scenarios. To make it manipulable, we propose sparse correspondence alignment +which involves point-based control to guide the generation for specific +locations. With this design, Wear-Any-Way gets state-of-the-art performance for +the standard setting and provides a novel interaction form for customizing the +wearing style. For instance, it supports users to drag the sleeve to make it +rolled up, drag the coat to make it open, and utilize clicks to control the +style of tuck, etc. Wear-Any-Way enables more liberated and flexible +expressions of the attires, holding profound implications in the fashion +industry. + +
+
+ comment: Project Page: https://mengtingchen.github.io/wear-any-way-page/ +
+
+
+
+
+ + ☆ Chain-of-Spot: Interactive Reasoning Improves Large Vision-Language + Models + + +
+ In the realm of vision-language understanding, the proficiency of models in +interpreting and reasoning over visual content has become a cornerstone for +numerous applications. However, it is challenging for the visual encoder in +Large Vision-Language Models (LVLMs) to extract useful features tailored to +questions that aid the language model's response. Furthermore, a common +practice among existing LVLMs is to utilize lower-resolution images, which +restricts the ability for visual recognition. Our work introduces the +Chain-of-Spot (CoS) method, which we describe as Interactive Reasoning, a novel +approach that enhances feature extraction by focusing on key regions of +interest (ROI) within the image, corresponding to the posed questions or +instructions. This technique allows LVLMs to access more detailed visual +information without altering the original image resolution, thereby offering +multi-granularity image features. By integrating Chain-of-Spot with +instruct-following LLaVA-1.5 models, the process of image reasoning +consistently improves performance across a wide range of multimodal datasets +and benchmarks without bells and whistles and achieves new state-of-the-art +results. Our empirical findings demonstrate a significant improvement in LVLMs' +ability to understand and reason about visual content, paving the way for more +sophisticated visual instruction-following applications. Code and models are +available at https://github.com/dongyh20/Chain-of-Spot + +
+
+ comment: Project Page: https://sites.google.com/view/chain-of-spot/ +
+
+
+
+
+ + ☆ Negative Yields Positive: Unified Dual-Path Adapter for Vision-Language + Models + + +
+ Recently, large-scale pre-trained Vision-Language Models (VLMs) have +demonstrated great potential in learning open-world visual representations, and +exhibit remarkable performance across a wide range of downstream tasks through +efficient fine-tuning. In this work, we innovatively introduce the concept of +dual learning into fine-tuning VLMs, i.e., we not only learn what an image is, +but also what an image isn't. Building on this concept, we introduce a novel +DualAdapter approach to enable dual-path adaptation of VLMs from both positive +and negative perspectives with only limited annotated samples. In the inference +stage, our DualAdapter performs unified predictions by simultaneously +conducting complementary positive selection and negative exclusion across +target classes, thereby enhancing the overall recognition accuracy of VLMs in +downstream tasks. Our extensive experimental results across 15 datasets +validate that the proposed DualAdapter outperforms existing state-of-the-art +methods on both few-shot learning and domain generalization tasks while +achieving competitive computational efficiency. Code is available at +https://github.com/zhangce01/DualAdapter. + +
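+
+ The dual-path prediction rule, favouring a class when the image matches its
+positive prompt and does not match its negative prompt, can be sketched as
+below. The prompt wording, the linear combination, and the weighting factor are
+assumptions for illustration, not DualAdapter's exact inference rule.
+
+    import torch
+
+    def dual_path_logits(image_feat, pos_text_feats, neg_text_feats, alpha=1.0):
+        # pos_text_feats: (C, D) embeddings of prompts like "a photo of a <class>"
+        # neg_text_feats: (C, D) embeddings of prompts like "a photo without a <class>"
+        image_feat = image_feat / image_feat.norm(dim=-1, keepdim=True)
+        pos = image_feat @ pos_text_feats.T      # similarity to positive prompts
+        neg = image_feat @ neg_text_feats.T      # similarity to negative prompts
+        return pos - alpha * neg                 # positive selection + negative exclusion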
+
+
+
+
+ + ☆ FouriScale: A Frequency Perspective on Training-Free High-Resolution + Image Synthesis + + +
+ In this study, we delve into the generation of high-resolution images from +pre-trained diffusion models, addressing persistent challenges, such as +repetitive patterns and structural distortions, that emerge when models are +applied beyond their trained resolutions. To address this issue, we introduce +an innovative, training-free approach FouriScale from the perspective of +frequency domain analysis. We replace the original convolutional layers in +pre-trained diffusion models by incorporating a dilation technique along with a +low-pass operation, intending to achieve structural consistency and scale +consistency across resolutions, respectively. Further enhanced by a +padding-then-crop strategy, our method can flexibly handle text-to-image +generation of various aspect ratios. By using the FouriScale as guidance, our +method successfully balances the structural integrity and fidelity of generated +images, achieving an astonishing capacity of arbitrary-size, high-resolution, +and high-quality generation. With its simplicity and compatibility, our method +can provide valuable insights for future explorations into the synthesis of +ultra-high-resolution images. The code will be released at +https://github.com/LeonHLJ/FouriScale. + +
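+
+ The low-pass ingredient of this recipe amounts to keeping only the lowest
+spatial frequencies of a feature map; the sketch below shows one simple way to
+do that with an FFT mask. The cutoff ratio and where the operation is inserted
+are assumptions, and FouriScale additionally pairs it with dilated convolutions
+whose rate matches the upscaling factor.
+
+    import torch
+    import torch.fft as fft
+
+    def low_pass(feat, keep_ratio=0.25):
+        # feat: (B, C, H, W); zero out all but a central block of the spectrum
+        H, W = feat.shape[-2:]
+        spec = fft.fftshift(fft.fft2(feat), dim=(-2, -1))
+        mask = torch.zeros(H, W, device=feat.device)
+        h, w = int(H * keep_ratio), int(W * keep_ratio)
+        mask[H // 2 - h // 2: H // 2 + h // 2, W // 2 - w // 2: W // 2 + w // 2] = 1.0
+        spec = spec * mask
+        return fft.ifft2(fft.ifftshift(spec, dim=(-2, -1))).real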
+
+
+
+
+ + ☆ FRESCO: Spatial-Temporal Correspondence for Zero-Shot Video Translation CVPR 24 + + +
+ The remarkable efficacy of text-to-image diffusion models has motivated
+extensive exploration of their potential application in video domains.
+Zero-shot methods seek to extend image diffusion models to videos without
+necessitating model training. Recent methods mainly focus on incorporating
+inter-frame correspondence into attention mechanisms. However, the soft
+constraint imposed on determining where to attend to valid features can
+sometimes be insufficient, resulting in temporal inconsistency. In this paper,
+we introduce FRESCO, which combines intra-frame correspondence with inter-frame
+correspondence to establish a more robust spatial-temporal constraint. This
+enhancement ensures a more consistent transformation of semantically similar
+content across frames. Beyond mere attention guidance, our approach involves an
+explicit update of features to achieve high spatial-temporal consistency with
+the input video, significantly improving the visual coherence of the resulting
+translated videos. Extensive experiments demonstrate the effectiveness of our
+proposed framework in producing high-quality, coherent videos, marking a
+notable improvement over existing zero-shot methods.
+
+
+ comment: CVPR 24, Code: https://github.com/williamyang1991/FRESCO, Project: + https://www.mmlab-ntu.com/project/fresco/ +
+
+
+
+
+ + ☆ TexTile: A Differentiable Metric for Texture Tileability CVPR 2024 + + +
+ We introduce TexTile, a novel differentiable metric to quantify the degree to
+which a texture image can be concatenated with itself without introducing
+repeating artifacts (i.e., its tileability). Existing methods for tileable
+texture synthesis focus on general texture quality but lack explicit analysis of
+the intrinsic repeatability properties of a texture. In contrast, our TexTile
+metric effectively evaluates the tileable properties of a texture, opening the
+door to more informed synthesis and analysis of tileable textures. Under the
+hood, TexTile is formulated as a binary classifier carefully built from a large
+dataset of textures of different styles, semantics, regularities, and human
+annotations. Key to our method is a set of architectural modifications to
+baseline pre-trained image classifiers to overcome their shortcomings at
+measuring tileability, along with a custom data augmentation and training regime
+aimed at increasing robustness and accuracy. We demonstrate that TexTile can be
+plugged into different state-of-the-art texture synthesis methods, including
+diffusion-based strategies, and generate tileable textures while keeping or even
+improving the overall texture quality. Furthermore, we show that TexTile can
+objectively evaluate any tileable texture synthesis method, whereas the current
+mix of existing metrics produces uncorrelated scores, which heavily hinders
+progress in the field.
+
+
+ comment: CVPR 2024. Project page: https://mslab.es/projects/TexTile/ +
+
+
+
+
+ + ☆ FaceXFormer: A Unified Transformer for Facial Analysis + + +
+ In this work, we introduce FaceXformer, an end-to-end unified transformer +model for a comprehensive range of facial analysis tasks such as face parsing, +landmark detection, head pose estimation, attributes recognition, and +estimation of age, gender, race, and landmarks visibility. Conventional methods +in face analysis have often relied on task-specific designs and preprocessing +techniques, which limit their approach to a unified architecture. Unlike these +conventional methods, our FaceXformer leverages a transformer-based +encoder-decoder architecture where each task is treated as a learnable token, +enabling the integration of multiple tasks within a single framework. Moreover, +we propose a parameter-efficient decoder, FaceX, which jointly processes face +and task tokens, thereby learning generalized and robust face representations +across different tasks. To the best of our knowledge, this is the first work to +propose a single model capable of handling all these facial analysis tasks +using transformers. We conducted a comprehensive analysis of effective +backbones for unified face task processing and evaluated different task queries +and the synergy between them. We conduct experiments against state-of-the-art +specialized models and previous multi-task models in both intra-dataset and +cross-dataset evaluations across multiple benchmarks. Additionally, our model +effectively handles images "in-the-wild," demonstrating its robustness and +generalizability across eight different tasks, all while maintaining the +real-time performance of 37 FPS. + +
+
+ comment: Project page: https://kartik-3004.github.io/facexformer_web/ +
+
+
+
+
+ + ☆ WHAC: World-grounded Humans and Cameras + + +
+ Estimating human and camera trajectories with accurate scale in the world +coordinate system from a monocular video is a highly desirable yet challenging +and ill-posed problem. In this study, we aim to recover expressive parametric +human models (i.e., SMPL-X) and corresponding camera poses jointly, by +leveraging the synergy between three critical players: the world, the human, +and the camera. Our approach is founded on two key observations. Firstly, +camera-frame SMPL-X estimation methods readily recover absolute human depth. +Secondly, human motions inherently provide absolute spatial cues. By +integrating these insights, we introduce a novel framework, referred to as +WHAC, to facilitate world-grounded expressive human pose and shape estimation +(EHPS) alongside camera pose estimation, without relying on traditional +optimization techniques. Additionally, we present a new synthetic dataset, +WHAC-A-Mole, which includes accurately annotated humans and cameras, and +features diverse interactive human motions as well as realistic camera +trajectories. Extensive experiments on both standard and newly established +benchmarks highlight the superiority and efficacy of our framework. We will +make the code and dataset publicly available. + +
+
+ comment: Homepage: https://wqyin.github.io/projects/WHAC/ +
+
+
+
+
+ + ☆ GVGEN: Text-to-3D Generation with Volumetric Representation + + +
+ In recent years, 3D Gaussian splatting has emerged as a powerful technique +for 3D reconstruction and generation, known for its fast and high-quality +rendering capabilities. To address these shortcomings, this paper introduces a +novel diffusion-based framework, GVGEN, designed to efficiently generate 3D +Gaussian representations from text input. We propose two innovative +techniques:(1) Structured Volumetric Representation. We first arrange +disorganized 3D Gaussian points as a structured form GaussianVolume. This +transformation allows the capture of intricate texture details within a volume +composed of a fixed number of Gaussians. To better optimize the representation +of these details, we propose a unique pruning and densifying method named the +Candidate Pool Strategy, enhancing detail fidelity through selective +optimization. (2) Coarse-to-fine Generation Pipeline. To simplify the +generation of GaussianVolume and empower the model to generate instances with +detailed 3D geometry, we propose a coarse-to-fine pipeline. It initially +constructs a basic geometric structure, followed by the prediction of complete +Gaussian attributes. Our framework, GVGEN, demonstrates superior performance in +qualitative and quantitative assessments compared to existing 3D generation +methods. Simultaneously, it maintains a fast generation speed ($\sim$7 +seconds), effectively striking a balance between quality and efficiency. + +
+
+ comment: project page: https://gvgen.github.io/ +
+
+
+
+
+ + ☆ FutureDepth: Learning to Predict the Future Improves Video Depth + Estimation + + +
+ In this paper, we propose a novel video depth estimation approach,
+FutureDepth, which enables the model to implicitly leverage multi-frame and
+motion cues to improve depth estimation by making it learn to predict the
+future during training. More specifically, we propose a future prediction
+network, F-Net, which takes the features of multiple consecutive frames and is
+trained to predict multi-frame features one time step ahead iteratively. In
+this way, F-Net learns the underlying motion and correspondence information,
+and we incorporate its features into the depth decoding process. Additionally,
+to enrich the learning of multi-frame correspondence cues, we further leverage
+a reconstruction network, R-Net, which is trained via adaptively masked
+auto-encoding of multi-frame feature volumes. At inference time, both F-Net and
+R-Net are used to produce queries to work with the depth decoder, as well as a
+final refinement network. Through extensive experiments on several benchmarks,
+i.e., NYUDv2, KITTI, DDAD, and Sintel, which cover indoor, driving, and
+open-domain scenarios, we show that FutureDepth significantly improves upon
+baseline models, outperforms existing video depth estimation methods, and sets
+new state-of-the-art (SOTA) accuracy. Furthermore, FutureDepth is more
+efficient than existing SOTA video depth estimation models and has similar
+latencies when compared to monocular models.
+
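+ To make the future-prediction idea concrete, below is a minimal PyTorch sketch:
+a small network takes backbone features of several consecutive frames and is
+trained to regress the next frame's features, which forces it to pick up motion
+and correspondence cues. The architecture (a two-layer conv stack over
+concatenated frames) is an illustrative stand-in, not the paper's F-Net.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class FuturePredictorSketch(nn.Module):
+    """Predict features of frame t+1 from frames t-k+1..t (stand-in for F-Net)."""
+    def __init__(self, channels, num_frames):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Conv2d(channels * num_frames, channels, 3, padding=1), nn.ReLU(),
+            nn.Conv2d(channels, channels, 3, padding=1),
+        )
+
+    def forward(self, frame_feats):               # (B, T, C, H, W)
+        b, t, c, h, w = frame_feats.shape
+        return self.net(frame_feats.reshape(b, t * c, h, w))
+
+B, T, C, H, W = 2, 3, 64, 32, 32
+feats = torch.randn(B, T + 1, C, H, W)           # stand-in backbone features for T+1 frames
+model = FuturePredictorSketch(C, T)
+
+pred_next = model(feats[:, :T])                  # predict step t+1 from the first T frames
+loss = F.mse_loss(pred_next, feats[:, T])        # supervised by the actual next-frame features
+loss.backward()
+print(loss.item())
+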
+
+
+
+
+ + ☆ Just Shift It: Test-Time Prototype Shifting for Zero-Shot Generalization + with Vision-Language Models + + +
+ Advancements in vision-language models (VLMs) have propelled the field of +computer vision, particularly in the zero-shot learning setting. Despite their +promise, the effectiveness of these models often diminishes due to domain +shifts in test environments. To address this, we introduce the Test-Time +Prototype Shifting (TPS) framework, a pioneering approach designed to adapt +VLMs to test datasets using unlabeled test inputs. Our method is based on the +notion of modulating per-class prototypes in the shared embedding space. By +pre-computing and caching prototypes generated with the pre-trained text +encoder, TPS not only facilitates optimization-free prototype reuse for +subsequent predictions but also enables seamless integration with current +advancements in prompt engineering. At test-time, TPS dynamically learns shift +vectors for each prototype based solely on the given test sample, effectively +bridging the domain gap and enhancing classification accuracy. A notable aspect +of our framework is its significantly reduced memory and computational demands +when compared to conventional text-prompt tuning methods. Extensive evaluations +across 15 datasets involving natural distribution shifts and cross-dataset +generalization demonstrate TPS's superior performance, achieving +state-of-the-art results while reducing resource requirements. + +
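+ A minimal PyTorch sketch of the prototype-shifting idea as described: class
+prototypes from the text encoder are precomputed and cached, and a learnable
+shift vector per prototype is optimized on a single unlabeled test sample. The
+objective below (entropy minimization, a common test-time adaptation loss) and
+the random stand-ins for CLIP-style features are assumptions for illustration,
+not the paper's exact formulation.
+
+import torch
+import torch.nn.functional as F
+
+def tps_step(prototypes, image_feat, lr=1e-2, steps=10, tau=0.01):
+    """Shift cached class prototypes toward a single unlabeled test sample."""
+    shifts = torch.zeros_like(prototypes, requires_grad=True)  # one shift vector per class
+    opt = torch.optim.SGD([shifts], lr=lr)
+    img = F.normalize(image_feat, dim=-1)
+    for _ in range(steps):
+        protos = F.normalize(prototypes + shifts, dim=-1)
+        logits = img @ protos.t() / tau                        # cosine-similarity logits
+        probs = logits.softmax(dim=-1)
+        entropy = -(probs * probs.clamp_min(1e-8).log()).sum(dim=-1).mean()
+        opt.zero_grad()
+        entropy.backward()                                     # only the shift vectors are updated
+        opt.step()
+    with torch.no_grad():
+        protos = F.normalize(prototypes + shifts, dim=-1)
+        return img @ protos.t() / tau
+
+C, D = 10, 512
+cached_prototypes = F.normalize(torch.randn(C, D), dim=-1)     # would come from the frozen text encoder
+test_feature = torch.randn(1, D)                               # would come from the frozen image encoder
+print(tps_step(cached_prototypes, test_feature).argmax(dim=-1))
+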
+
+
+
+
+ + ☆ Segment Anything for comprehensive analysis of grapevine cluster + architecture and berry properties + + +
+ Grape cluster architecture and compactness are complex traits influencing +disease susceptibility, fruit quality, and yield. Evaluation methods for these +traits include visual scoring, manual methodologies, and computer vision, with +the latter being the most scalable approach. Most of the existing computer +vision approaches for processing cluster images often rely on conventional +segmentation or machine learning with extensive training and limited +generalization. The Segment Anything Model (SAM), a novel foundation model +trained on a massive image dataset, enables automated object segmentation +without additional training. This study demonstrates out-of-the-box SAM's high +accuracy in identifying individual berries in 2D cluster images. Using this +model, we managed to segment approximately 3,500 cluster images, generating +over 150,000 berry masks, each linked with spatial coordinates within their +clusters. The correlation between human-identified berries and SAM predictions +was very strong (Pearson r2=0.96). Although the visible berry count in images +typically underestimates the actual cluster berry count due to visibility +issues, we demonstrated that this discrepancy could be adjusted using a linear +regression model (adjusted R2=0.87). We emphasized the critical importance of +the angle at which the cluster is imaged, noting its substantial effect on +berry counts and architecture. We proposed different approaches in which berry +location information facilitated the calculation of complex features related to +cluster architecture and compactness. Finally, we discussed SAM's potential +integration into currently available pipelines for image generation and +processing in vineyard conditions. + +
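+ The adjustment from visible to actual berry counts is a plain linear
+regression, sketched below. The berry masks themselves would come from SAM's
+automatic mask generation (not run here), and all numbers are synthetic
+illustrations, not the study's data.
+
+import numpy as np
+from sklearn.linear_model import LinearRegression
+
+# Synthetic illustration: visible berries per image (e.g., counted from SAM masks)
+# versus ground-truth berries per cluster.
+rng = np.random.default_rng(0)
+true_count = rng.integers(40, 160, size=200)
+visible_count = (0.7 * true_count + rng.normal(0, 5, size=200)).round()  # occlusion hides ~30%
+
+reg = LinearRegression().fit(visible_count.reshape(-1, 1), true_count)
+r2 = reg.score(visible_count.reshape(-1, 1), true_count)
+print(f"adjusted count = {reg.coef_[0]:.2f} * visible + {reg.intercept_:.1f}  (R^2 = {r2:.2f})")
+
+# Predict the full berry count for a new cluster image with 85 visible berries.
+print(reg.predict(np.array([[85.0]])))
+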
+
+
+
+
+ + ☆ Zero-Reference Low-Light Enhancement via Physical Quadruple Priors CVPR-2024 + + +
+ Understanding illumination and reducing the need for supervision pose a +significant challenge in low-light enhancement. Current approaches are highly +sensitive to data usage during training and illumination-specific +hyper-parameters, limiting their ability to handle unseen scenarios. In this +paper, we propose a new zero-reference low-light enhancement framework +trainable solely with normal light images. To accomplish this, we devise an +illumination-invariant prior inspired by the theory of physical light transfer. +This prior serves as the bridge between normal and low-light images. Then, we +develop a prior-to-image framework trained without low-light data. During +testing, this framework is able to restore our illumination-invariant prior +back to images, automatically achieving low-light enhancement. Within this +framework, we leverage a pretrained generative diffusion model for model +ability, introduce a bypass decoder to handle detail distortion, as well as +offer a lightweight version for practicality. Extensive experiments demonstrate +our framework's superiority in various scenarios as well as good +interpretability, robustness, and efficiency. Code is available on our project +homepage: http://daooshee.github.io/QuadPrior-Website/ + +
+
+ comment: Accepted by CVPR-2024 +
+
+
+
+
+ + ☆ You Only Sample Once: Taming One-Step Text-To-Image Synthesis by + Self-Cooperative Diffusion GANs + + +
+ We introduce YOSO, a novel generative model designed for rapid, scalable, and +high-fidelity one-step image synthesis. This is achieved by integrating the +diffusion process with GANs. Specifically, we smooth the distribution by the +denoising generator itself, performing self-cooperative learning. We show that +our method can serve as a one-step generation model training from scratch with +competitive performance. Moreover, we show that our method can be extended to +finetune pre-trained text-to-image diffusion for high-quality one-step +text-to-image synthesis even with LoRA fine-tuning. In particular, we provide +the first diffusion transformer that can generate images in one step trained on +512 resolution, with the capability of adapting to 1024 resolution without +explicit training. Our code is provided at https://github.com/Luo-Yihong/YOSO. + +
+
+ comment: Early version +
+
+
+
+
+ + ☆ Contextual AD Narration with Interleaved Multimodal Sequence + + +
+ The Audio Description (AD) task aims to generate descriptions of visual
+elements for visually impaired individuals to help them access long-form video
+content, such as movies. With video features, text, a character bank and
+context information as inputs, the generated ADs are able to refer to the
+characters by name and provide reasonable, contextual descriptions that help
+the audience understand the storyline of the movie. To achieve this goal, we
+propose to leverage pre-trained foundation models through a simple and unified
+framework to generate ADs with an interleaved multimodal sequence as input,
+termed Uni-AD. To enhance the alignment of features across various modalities
+with finer granularity, we introduce a simple and lightweight module that maps
+video features into the textual feature space. Moreover, we also propose a
+character-refinement module to provide more precise information by identifying
+the main characters who play a more significant role in the video context. With
+these unique designs, we further incorporate contextual information and a
+contrastive loss into our architecture to generate smoother and more contextual
+ADs. Experiments on the MAD-eval dataset show that Uni-AD achieves
+state-of-the-art performance on AD generation, which demonstrates the
+effectiveness of our approach. Code will be available at
+https://github.com/MCG-NJU/Uni-AD.
+
+
+
+
+
+ + ☆ Semantic Layering in Room Segmentation via LLMs + + +
+ In this paper, we introduce Semantic Layering in Room Segmentation via LLMs +(SeLRoS), an advanced method for semantic room segmentation by integrating +Large Language Models (LLMs) with traditional 2D map-based segmentation. Unlike +previous approaches that solely focus on the geometric segmentation of indoor +environments, our work enriches segmented maps with semantic data, including +object identification and spatial relationships, to enhance robotic navigation. +By leveraging LLMs, we provide a novel framework that interprets and organizes +complex information about each segmented area, thereby improving the accuracy +and contextual relevance of room segmentation. Furthermore, SeLRoS overcomes +the limitations of existing algorithms by using a semantic evaluation method to +accurately distinguish true room divisions from those erroneously generated by +furniture and segmentation inaccuracies. The effectiveness of SeLRoS is +verified through its application across 30 different 3D environments. Source +code and experiment videos for this work are available at: +https://sites.google.com/view/selros. + +
+
+
+
+
+ + ☆ Ultra-High-Resolution Image Synthesis with Pyramid Diffusion Model + + +
+ We introduce the Pyramid Diffusion Model (PDM), a novel architecture designed
+for ultra-high-resolution image synthesis. PDM utilizes a pyramid latent
+representation, providing a broader design space that enables more flexible,
+structured, and efficient perceptual compression, and allows the AutoEncoder
+and the Diffusion Network to be equipped with branches and deeper layers. To
+enhance PDM's capabilities for generative tasks, we propose the integration of
+Spatial-Channel Attention and Res-Skip Connection, along with the utilization
+of Spectral Norm and a Decreasing Dropout Strategy for the Diffusion Network
+and AutoEncoder. In summary, PDM achieves the synthesis of images with 2K
+resolution for the first time, demonstrated on two new datasets comprising
+images of sizes 2048x2048 pixels and 2048x1024 pixels, respectively. We believe
+that this work offers an alternative approach to designing scalable image
+generative models, while also providing incremental reinforcement for existing
+frameworks.
+
+
+ comment: Preprint Version +
+
+
+
+
+ + ☆ TexDreamer: Towards Zero-Shot High-Fidelity 3D Human Texture Generation + + +
+ Texturing 3D humans with semantic UV maps remains a challenge due to the +difficulty of acquiring reasonably unfolded UV. Despite recent text-to-3D +advancements in supervising multi-view renderings using large text-to-image +(T2I) models, issues persist with generation speed, text consistency, and +texture quality, resulting in data scarcity among existing datasets. We present +TexDreamer, the first zero-shot multimodal high-fidelity 3D human texture +generation model. Utilizing an efficient texture adaptation finetuning +strategy, we adapt large T2I model to a semantic UV structure while preserving +its original generalization capability. Leveraging a novel feature translator +module, the trained model is capable of generating high-fidelity 3D human +textures from either text or image within seconds. Furthermore, we introduce +ArTicuLated humAn textureS (ATLAS), the largest high-resolution (1024 X 1024) +3D human texture dataset which contains 50k high-fidelity textures with text +descriptions. + +
+
+ comment: Project Page: https://ggxxii.github.io/texdreamer/ +
+
+
+
+
+ + ☆ mPLUG-DocOwl 1.5: Unified Structure Learning for OCR-free Document + Understanding + + +
+ Structure information is critical for understanding the semantics of +text-rich images, such as documents, tables, and charts. Existing Multimodal +Large Language Models (MLLMs) for Visual Document Understanding are equipped +with text recognition ability but lack general structure understanding +abilities for text-rich document images. In this work, we emphasize the +importance of structure information in Visual Document Understanding and +propose the Unified Structure Learning to boost the performance of MLLMs. Our +Unified Structure Learning comprises structure-aware parsing tasks and +multi-grained text localization tasks across 5 domains: document, webpage, +table, chart, and natural image. To better encode structure information, we +design a simple and effective vision-to-text module H-Reducer, which can not +only maintain the layout information but also reduce the length of visual +features by merging horizontal adjacent patches through convolution, enabling +the LLM to understand high-resolution images more efficiently. Furthermore, by +constructing structure-aware text sequences and multi-grained pairs of texts +and bounding boxes for publicly available text-rich images, we build a +comprehensive training set DocStruct4M to support structure learning. Finally, +we construct a small but high-quality reasoning tuning dataset DocReason25K to +trigger the detailed explanation ability in the document domain. Our model +DocOwl 1.5 achieves state-of-the-art performance on 10 visual document +understanding benchmarks, improving the SOTA performance of MLLMs with a 7B LLM +by more than 10 points in 5/10 benchmarks. Our codes, models, and datasets are +publicly available at +https://github.com/X-PLUG/mPLUG-DocOwl/tree/main/DocOwl1.5. + +
+
+ comment: 21 pages, 15 figures +
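+ The core of H-Reducer, as described, is a convolution that merges horizontally
+adjacent visual patches so the token sequence shrinks while layout is kept. A
+minimal sketch follows; the 1x4 kernel and stride-4 merging ratio are
+assumptions made here for illustration, not taken from the paper.
+
+import torch
+import torch.nn as nn
+
+class HReducerSketch(nn.Module):
+    """Merge horizontally adjacent patch features with a 1x4 convolution (assumed ratio)."""
+    def __init__(self, dim):
+        super().__init__()
+        self.conv = nn.Conv2d(dim, dim, kernel_size=(1, 4), stride=(1, 4))
+
+    def forward(self, patch_grid):                # (B, dim, H, W) grid of ViT patch features
+        return self.conv(patch_grid)
+
+feats = torch.randn(1, 1024, 32, 32)              # e.g. a 32x32 patch grid from a high-res crop
+reduced = HReducerSketch(1024)(feats)
+print(feats.shape, "->", reduced.shape)           # width shrinks 4x: 1024 -> 256 visual tokens
+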
+
+
+
+
+ + ☆ MEDBind: Unifying Language and Multimodal Medical Data Embeddings + + +
+ Medical vision-language pretraining models (VLPM) have achieved remarkable +progress in fusing chest X-rays (CXR) with clinical texts, introducing +image-text data binding approaches that enable zero-shot learning and +downstream clinical tasks. However, the current landscape lacks the holistic +integration of additional medical modalities, such as electrocardiograms (ECG). +We present MEDBind (Medical Electronic patient recorD), which learns joint +embeddings across CXR, ECG, and medical text. Using text data as the central +anchor, MEDBind features tri-modality binding, delivering competitive +performance in top-K retrieval, zero-shot, and few-shot benchmarks against +established VLPM, and the ability for CXR-to-ECG zero-shot classification and +retrieval. This seamless integration is achieved through combination of +contrastive loss on modality-text pairs with our proposed contrastive loss +function, Edge-Modality Contrastive Loss, fostering a cohesive embedding space +for CXR, ECG, and text. Finally, we demonstrate that MEDBind can improve +downstream tasks by directly integrating CXR and ECG embeddings into a +large-language model for multimodal prompt tuning. + +
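+ For intuition, here is a minimal sketch of binding two non-text modalities
+with a symmetric InfoNCE-style contrastive loss alongside the usual
+modality-text terms. This is a generic CLIP-style formulation standing in for
+the paper's Edge-Modality Contrastive Loss, and the embeddings are random
+stand-ins for encoder outputs.
+
+import torch
+import torch.nn.functional as F
+
+def symmetric_infonce(z_a, z_b, temperature=0.07):
+    """CLIP-style contrastive loss between two batches of paired embeddings."""
+    z_a, z_b = F.normalize(z_a, dim=-1), F.normalize(z_b, dim=-1)
+    logits = z_a @ z_b.t() / temperature
+    targets = torch.arange(z_a.size(0), device=z_a.device)
+    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))
+
+B, D = 16, 256
+cxr, ecg, text = torch.randn(B, D), torch.randn(B, D), torch.randn(B, D)  # encoder outputs (stand-ins)
+
+# Text as the central anchor, plus a direct CXR-ECG ("edge") term.
+loss = (symmetric_infonce(cxr, text)
+        + symmetric_infonce(ecg, text)
+        + symmetric_infonce(cxr, ecg))
+print(loss)
+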
+
+
+
+
+ + ☆ Adaptive Visual Imitation Learning for Robotic Assisted Feeding Across + Varied Bowl Configurations and Food Types + + +
+ In this study, we introduce a novel visual imitation network with a spatial +attention module for robotic assisted feeding (RAF). The goal is to acquire +(i.e., scoop) food items from a bowl. However, achieving robust and adaptive +food manipulation is particularly challenging. To deal with this, we propose a +framework that integrates visual perception with imitation learning to enable +the robot to handle diverse scenarios during scooping. Our approach, named AVIL +(adaptive visual imitation learning), exhibits adaptability and robustness +across different bowl configurations in terms of material, size, and position, +as well as diverse food types including granular, semi-solid, and liquid, even +in the presence of distractors. We validate the effectiveness of our approach +by conducting experiments on a real robot. We also compare its performance with +a baseline. The results demonstrate improvement over the baseline across all +scenarios, with an enhancement of up to 2.5 times in terms of a success metric. +Notably, our model, trained solely on data from a transparent glass bowl +containing granular cereals, showcases generalization ability when tested +zero-shot on other bowl configurations with different types of food. + +
+
+
+
+
+ + ☆ EmoVOCA: Speech-Driven Emotional 3D Talking Heads + + +
+ The domain of 3D talking head generation has witnessed significant progress +in recent years. A notable challenge in this field consists in blending +speech-related motions with expression dynamics, which is primarily caused by +the lack of comprehensive 3D datasets that combine diversity in spoken +sentences with a variety of facial expressions. Whereas literature works +attempted to exploit 2D video data and parametric 3D models as a workaround, +these still show limitations when jointly modeling the two motions. In this +work, we address this problem from a different perspective, and propose an +innovative data-driven technique that we used for creating a synthetic dataset, +called EmoVOCA, obtained by combining a collection of inexpressive 3D talking +heads and a set of 3D expressive sequences. To demonstrate the advantages of +this approach, and the quality of the dataset, we then designed and trained an +emotional 3D talking head generator that accepts a 3D face, an audio file, an +emotion label, and an intensity value as inputs, and learns to animate the +audio-synchronized lip movements with expressive traits of the face. +Comprehensive experiments, both quantitative and qualitative, using our data +and generator evidence superior ability in synthesizing convincing animations, +when compared with the best performing methods in the literature. Our code and +pre-trained model will be made available. + +
+
+
+
+
+ + ☆ HYDRA: A Hyper Agent for Dynamic Compositional Visual Reasoning + + +
+ Recent advances in visual reasoning (VR), particularly with the aid of Large +Vision-Language Models (VLMs), show promise but require access to large-scale +datasets and face challenges such as high computational costs and limited +generalization capabilities. Compositional visual reasoning approaches have +emerged as effective strategies; however, they heavily rely on the commonsense +knowledge encoded in Large Language Models (LLMs) to perform planning, +reasoning, or both, without considering the effect of their decisions on the +visual reasoning process, which can lead to errors or failed procedures. To +address these challenges, we introduce HYDRA, a multi-stage dynamic +compositional visual reasoning framework designed for reliable and +incrementally progressive general reasoning. HYDRA integrates three essential +modules: a planner, a Reinforcement Learning (RL) agent serving as a cognitive +controller, and a reasoner. The planner and reasoner modules utilize an LLM to +generate instruction samples and executable code from the selected instruction, +respectively, while the RL agent dynamically interacts with these modules, +making high-level decisions on selection of the best instruction sample given +information from the historical state stored through a feedback loop. This +adaptable design enables HYDRA to adjust its actions based on previous feedback +received during the reasoning process, leading to more reliable reasoning +outputs and ultimately enhancing its overall effectiveness. Our framework +demonstrates state-of-the-art performance in various VR tasks on four different +widely-used datasets. + +
+
+
+
+
+ + ☆ Confusing Pair Correction Based on Category Prototype for Domain + Adaptation under Noisy Environments AAAI 2024 + + +
+ In this paper, we address unsupervised domain adaptation under noisy
+environments, which is more challenging and practical than traditional domain
+adaptation. In this scenario, the model is prone to overfitting noisy labels,
+resulting in a more pronounced domain shift and a notable decline in the
+overall model performance. Previous methods employed prototype methods for
+domain adaptation on robust feature spaces. However, these approaches struggle
+to effectively classify classes with similar features under noisy environments.
+To address this issue, we propose a new method to detect and correct confusing
+class pairs. We first divide classes into easy and hard classes based on the
+small-loss criterion. We then leverage the top-2 predictions for each sample
+after aligning the source and target domains to find the confusing pairs among
+the hard classes. We apply label correction to the noisy samples within these
+confusing pairs. With the proposed label correction method, we can train our
+model with more accurate labels. Extensive experiments confirm the
+effectiveness of our method and demonstrate its favorable performance compared
+with existing state-of-the-art methods. Our codes are publicly available at
+https://github.com/Hehxcf/CPC/.
+
+
+ comment: AAAI 2024 +
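+ A simplified sketch of the detection step under stated assumptions: per-class
+mean loss splits classes into easy and hard (the small-loss criterion), and
+co-occurrence of top-2 predictions among hard classes flags the most confused
+pair. Random predictions stand in for the aligned model's outputs; the actual
+correction rule from the paper is not reproduced.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+C, N = 10, 2000
+probs = rng.dirichlet(np.ones(C), size=N)          # model predictions (stand-in)
+labels = rng.integers(0, C, size=N)                # noisy target-domain labels (stand-in)
+losses = -np.log(probs[np.arange(N), labels] + 1e-8)
+
+# Small-loss criterion: classes with high mean loss are treated as "hard".
+class_loss = np.array([losses[labels == c].mean() for c in range(C)])
+hard = set(np.argsort(class_loss)[-C // 2:])
+
+# Count how often two hard classes appear together as a sample's top-2 predictions.
+top2 = np.argsort(probs, axis=1)[:, -2:]
+pair_counts = {}
+for a, b in top2:
+    if a in hard and b in hard:
+        key = tuple(sorted((int(a), int(b))))
+        pair_counts[key] = pair_counts.get(key, 0) + 1
+
+confusing_pair = max(pair_counts, key=pair_counts.get)
+print(confusing_pair)   # label correction would then be applied to samples in this pair
+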
+
+
+
+
+ + ☆ PoNQ: a Neural QEM-based Mesh Representation + + +
+ Although polygon meshes have been a standard representation in geometry +processing, their irregular and combinatorial nature hinders their suitability +for learning-based applications. In this work, we introduce a novel learnable +mesh representation through a set of local 3D sample Points and their +associated Normals and Quadric error metrics (QEM) w.r.t. the underlying shape, +which we denote PoNQ. A global mesh is directly derived from PoNQ by +efficiently leveraging the knowledge of the local quadric errors. Besides +marking the first use of QEM within a neural shape representation, our +contribution guarantees both topological and geometrical properties by ensuring +that a PoNQ mesh does not self-intersect and is always the boundary of a +volume. Notably, our representation does not rely on a regular grid, is +supervised directly by the target surface alone, and also handles open surfaces +with boundaries and/or sharp features. We demonstrate the efficacy of PoNQ +through a learning-based mesh prediction from SDF grids and show that our +method surpasses recent state-of-the-art techniques in terms of both surface +and edge-based metrics. + +
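+ For readers unfamiliar with quadric error metrics: a sample point p with unit
+normal n defines the plane n.x + d = 0 with d = -n.p, its quadric is Q = q q^T
+with q = [n; d], the QEM of a candidate vertex v is [v; 1]^T Q [v; 1], and
+summed quadrics admit a closed-form optimal vertex. A short numpy illustration
+of that bookkeeping (only the classical QEM machinery, not the learned part of
+PoNQ):
+
+import numpy as np
+
+def quadric(point, normal):
+    n = normal / np.linalg.norm(normal)
+    q = np.append(n, -n @ point)       # plane [n, d] with d = -n.p
+    return np.outer(q, q)              # 4x4 quadric of this sample
+
+def qem(Q, v):
+    vh = np.append(v, 1.0)
+    return vh @ Q @ vh                 # sum of squared distances to the planes
+
+# Three noisy samples of the plane z = 0, plus one tilted sample.
+pts = np.array([[0, 0, 0.0], [1, 0, 0.01], [0, 1, -0.01], [1, 1, 0.2]])
+nrm = np.array([[0, 0, 1.0], [0, 0, 1], [0, 0, 1], [0.1, 0, 1]])
+Q = sum(quadric(p, n) for p, n in zip(pts, nrm))
+
+# Optimal vertex minimizing the summed QEM: solve A v = -b, where Q = [[A, b], [b^T, c]].
+# lstsq handles the rank-deficient case (here the y direction is unconstrained).
+A, b = Q[:3, :3], Q[:3, 3]
+v_opt = np.linalg.lstsq(A, -b, rcond=None)[0]
+print(v_opt, qem(Q, v_opt))
+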
+
+
+
+
+ + ☆ Generative Enhancement for 3D Medical Images + + +
+ The limited availability of 3D medical image datasets, due to privacy +concerns and high collection or annotation costs, poses significant challenges +in the field of medical imaging. While a promising alternative is the use of +synthesized medical data, there are few solutions for realistic 3D medical +image synthesis due to difficulties in backbone design and fewer 3D training +samples compared to 2D counterparts. In this paper, we propose GEM-3D, a novel +generative approach to the synthesis of 3D medical images and the enhancement +of existing datasets using conditional diffusion models. Our method begins with +a 2D slice, noted as the informed slice to serve the patient prior, and +propagates the generation process using a 3D segmentation mask. By decomposing +the 3D medical images into masks and patient prior information, GEM-3D offers a +flexible yet effective solution for generating versatile 3D images from +existing datasets. GEM-3D can enable dataset enhancement by combining informed +slice selection and generation at random positions, along with editable mask +volumes to introduce large variations in diffusion sampling. Moreover, as the +informed slice contains patient-wise information, GEM-3D can also facilitate +counterfactual image synthesis and dataset-level de-enhancement with desired +control. Experiments on brain MRI and abdomen CT images demonstrate that GEM-3D +is capable of synthesizing high-quality 3D medical images with volumetric +consistency, offering a straightforward solution for dataset enhancement during +inference. The code is available at https://github.com/HKU-MedAI/GEM-3D. + +
+
+ comment: 19 pages, 4 figures +
+
+
+
+
+ + ☆ Compositional 3D Scene Synthesis with Scene Graph Guided Layout-Shape + Generation + + +
+ Compositional 3D scene synthesis has diverse applications across a spectrum
+of industries such as robotics, films, and video games, as it closely mirrors
+the complexity of real-world multi-object environments. Early works typically
+employ shape-retrieval-based frameworks, which naturally suffer from limited
+shape diversity. Recent progress has been made in shape generation with
+powerful generative models, such as diffusion models, which increases shape
+fidelity. However, these approaches treat 3D shape generation and layout
+generation separately. The synthesized scenes are usually hampered by layout
+collisions, which implies that scene-level fidelity is still under-explored. In
+this paper, we aim at generating realistic and reasonable 3D scenes from scene
+graphs. To enrich the representation capability of the given scene graph
+inputs, a large language model is utilized to explicitly aggregate the global
+graph features with local relationship features. With a unified graph
+convolution network (GCN), graph features are extracted from scene graphs
+updated via a joint layout-shape distribution. During scene generation, an
+IoU-based regularization loss is introduced to constrain the predicted 3D
+layouts. Benchmarked on the SG-FRONT dataset, our method achieves better 3D
+scene synthesis, especially in terms of scene-level fidelity. The source code
+will be released after publication.
+
+
+
+
+
+ + ☆ Global-guided Focal Neural Radiance Field for Large-scale Scene + Rendering + + +
+ Neural radiance fields (NeRF) have recently been applied to render
+large-scale scenes. However, their limited model capacity typically results in
+blurred rendering results. Existing large-scale NeRFs primarily address this
+limitation by partitioning the scene into blocks, which are subsequently
+handled by separate sub-NeRFs. These sub-NeRFs, trained from scratch and
+processed independently, lead to inconsistencies in geometry and appearance
+across the scene. Consequently, the rendering quality fails to exhibit
+significant improvement despite the expansion of model capacity. In this work,
+we present the global-guided focal neural radiance field (GF-NeRF), which
+achieves high-fidelity rendering of large-scale scenes. Our proposed GF-NeRF
+utilizes a two-stage (Global and Focal) architecture and a global-guided
+training strategy. The global stage obtains a continuous representation of the
+entire scene, while the focal stage decomposes the scene into multiple blocks
+and further processes them with distinct sub-encoders. Leveraging this
+two-stage architecture, sub-encoders only need fine-tuning based on the global
+encoder, thus reducing training complexity in the focal stage while maintaining
+scene-wide consistency. Spatial information and error information from the
+global stage also help the sub-encoders focus on crucial areas and effectively
+capture more details of large-scale scenes. Notably, our approach does not rely
+on any prior knowledge about the target scene, making GF-NeRF adaptable to
+various large-scale scene types, including street-view and aerial-view scenes.
+We demonstrate that our method achieves high-fidelity, natural rendering
+results on various types of large-scale datasets. Our project page:
+https://shaomq2187.github.io/GF-NeRF/
+
+
+
+
+
+ + ☆ AnySkill: Learning Open-Vocabulary Physical Skill for Interactive Agents + + +
+ Traditional approaches in physics-based motion generation, centered around +imitation learning and reward shaping, often struggle to adapt to new +scenarios. To tackle this limitation, we propose AnySkill, a novel hierarchical +method that learns physically plausible interactions following open-vocabulary +instructions. Our approach begins by developing a set of atomic actions via a +low-level controller trained via imitation learning. Upon receiving an +open-vocabulary textual instruction, AnySkill employs a high-level policy that +selects and integrates these atomic actions to maximize the CLIP similarity +between the agent's rendered images and the text. An important feature of our +method is the use of image-based rewards for the high-level policy, which +allows the agent to learn interactions with objects without manual reward +engineering. We demonstrate AnySkill's capability to generate realistic and +natural motion sequences in response to unseen instructions of varying lengths, +marking it the first method capable of open-vocabulary physical skill learning +for interactive humanoid agents. + +
+
+
+
+
+ + ☆ Embarrassingly Simple Scribble Supervision for 3D Medical Segmentation + + +
+ Traditionally, segmentation algorithms require dense annotations for +training, demanding significant annotation efforts, particularly within the 3D +medical imaging field. Scribble-supervised learning emerges as a possible +solution to this challenge, promising a reduction in annotation efforts when +creating large-scale datasets. Recently, a plethora of methods for optimized +learning from scribbles have been proposed, but have so far failed to position +scribble annotation as a beneficial alternative. We relate this shortcoming to +two major issues: 1) the complex nature of many methods which deeply ties them +to the underlying segmentation model, thus preventing a migration to more +powerful state-of-the-art models as the field progresses and 2) the lack of a +systematic evaluation to validate consistent performance across the broader +medical domain, resulting in a lack of trust when applying these methods to new +segmentation problems. To address these issues, we propose a comprehensive +scribble supervision benchmark consisting of seven datasets covering a diverse +set of anatomies and pathologies imaged with varying modalities. We furthermore +propose the systematic use of partial losses, i.e. losses that are only +computed on annotated voxels. Contrary to most existing methods, these losses +can be seamlessly integrated into state-of-the-art segmentation methods, +enabling them to learn from scribble annotations while preserving their +original loss formulations. Our evaluation using nnU-Net reveals that while +most existing methods suffer from a lack of generalization, the proposed +approach consistently delivers state-of-the-art performance. Thanks to its +simplicity, our approach presents an embarrassingly simple yet effective +solution to the challenges of scribble supervision. Source code as well as our +extensive scribble benchmarking suite will be made publicly available upon +publication. + +
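+ The proposed partial loss is conceptually simple: compute the loss only on
+voxels that carry a scribble label and ignore everything else. A minimal
+PyTorch sketch (2D for brevity; the same ignore_index mechanism applies to 3D
+patches, and the scribble pattern below is an arbitrary illustration):
+
+import torch
+import torch.nn.functional as F
+
+IGNORE = -1  # unannotated voxels
+
+B, C, H, W = 2, 4, 64, 64
+logits = torch.randn(B, C, H, W, requires_grad=True)
+
+# Scribble annotation: most voxels are unlabeled (IGNORE), a few carry class ids.
+labels = torch.full((B, H, W), IGNORE, dtype=torch.long)
+labels[:, 30:34, :] = 2                 # a horizontal scribble of class 2
+labels[:, :, 10:12] = 0                 # a vertical background scribble
+
+# Partial cross-entropy: gradients flow only from annotated voxels.
+loss = F.cross_entropy(logits, labels, ignore_index=IGNORE)
+loss.backward()
+print(loss.item())
+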
+
+
+
+
+ + ☆ Re-identification from histopathology images + + +
+ In numerous studies, deep learning algorithms have proven their potential for +the analysis of histopathology images, for example, for revealing the subtypes +of tumors or the primary origin of metastases. These models require large +datasets for training, which must be anonymized to prevent possible patient +identity leaks. This study demonstrates that even relatively simple deep +learning algorithms can re-identify patients in large histopathology datasets +with substantial accuracy. We evaluated our algorithms on two TCIA datasets +including lung squamous cell carcinoma (LSCC) and lung adenocarcinoma (LUAD). +We also demonstrate the algorithm's performance on an in-house dataset of +meningioma tissue. We predicted the source patient of a slide with F1 scores of +50.16 % and 52.30 % on the LSCC and LUAD datasets, respectively, and with 62.31 +% on our meningioma dataset. Based on our findings, we formulated a risk +assessment scheme to estimate the risk to the patient's privacy prior to +publication. + +
+
+ comment: 20 pages, 7 figures, 2 tables +
+
+
+
+
+ + ☆ VisualCritic: Making LMMs Perceive Visual Quality Like Humans + + +
+ At present, large multimodal models (LMMs) have exhibited impressive +generalization capabilities in understanding and generating visual signals. +However, they currently still lack sufficient capability to perceive low-level +visual quality akin to human perception. Can LMMs achieve this and show the +same degree of generalization in this regard? If so, not only could the +versatility of LMMs be further enhanced, but also the challenge of poor +cross-dataset performance in the field of visual quality assessment could be +addressed. In this paper, we explore this question and provide the answer +"Yes!". As the result of this initial exploration, we present VisualCritic, the +first LMM for broad-spectrum image subjective quality assessment. VisualCritic +can be used across diverse data right out of box, without any requirements of +dataset-specific adaptation operations like conventional specialist models. As +an instruction-following LMM, VisualCritic enables new capabilities of (1) +quantitatively measuring the perceptual quality of given images in terms of +their Mean Opinion Score (MOS), noisiness, colorfulness, sharpness, and other +numerical indicators, (2) qualitatively evaluating visual quality and providing +explainable descriptions, (3) discerning whether a given image is AI-generated +or photographic. Extensive experiments demonstrate the efficacy of VisualCritic +by comparing it with other open-source LMMs and conventional specialist models +over both AI-generated and photographic images. + +
+
+
+
+
+ + ☆ DreamDA: Generative Data Augmentation with Diffusion Models + + +
+ The acquisition of large-scale, high-quality data is a resource-intensive and +time-consuming endeavor. Compared to conventional Data Augmentation (DA) +techniques (e.g. cropping and rotation), exploiting prevailing diffusion models +for data generation has received scant attention in classification tasks. +Existing generative DA methods either inadequately bridge the domain gap +between real-world and synthesized images, or inherently suffer from a lack of +diversity. To solve these issues, this paper proposes a new +classification-oriented framework DreamDA, which enables data synthesis and +label generation by way of diffusion models. DreamDA generates diverse samples +that adhere to the original data distribution by considering training images in +the original data as seeds and perturbing their reverse diffusion process. In +addition, since the labels of the generated data may not align with the labels +of their corresponding seed images, we introduce a self-training paradigm for +generating pseudo labels and training classifiers using the synthesized data. +Extensive experiments across four tasks and five datasets demonstrate +consistent improvements over strong baselines, revealing the efficacy of +DreamDA in synthesizing high-quality and diverse images with accurate labels. +Our code will be available at https://github.com/yunxiangfu2001/DreamDA. + +
+
+ comment: 14 pages, 8 tables, 3 figures +
+
+
+
+
+ + ☆ RelationVLM: Making Large Vision-Language Models Understand Visual + Relations + + +
+ The development of Large Vision-Language Models (LVLMs) is striving to catch +up with the success of Large Language Models (LLMs), yet it faces more +challenges to be resolved. Very recent works enable LVLMs to localize +object-level visual contents and ground text to them. Nonetheless, current +LVLMs still struggle to precisely understand visual relations due to the lack +of relevant data. In this work, we present RelationVLM, a large vision-language +model capable of comprehending various levels and types of relations whether +across multiple images or within a video. Specifically, we devise a multi-stage +relation-aware training scheme and a series of corresponding data configuration +strategies to bestow RelationVLM with the capabilities of understanding +semantic relations, temporal associations and geometric transforms. Extensive +case studies and quantitative evaluations show RelationVLM has strong +capability in understanding such relations and emerges impressive in-context +capability of reasoning from few-shot examples by comparison. This work fosters +the advancements of LVLMs by enabling them to support a wider range of +downstream applications toward artificial general intelligence. + +
+
+
+
+
+ + ☆ Learning Neural Volumetric Pose Features for Camera Localization + + +
+ We introduce a novel neural volumetric pose feature, termed PoseMap, designed +to enhance camera localization by encapsulating the information between images +and the associated camera poses. Our framework leverages an Absolute Pose +Regression (APR) architecture, together with an augmented NeRF module. This +integration not only facilitates the generation of novel views to enrich the +training dataset but also enables the learning of effective pose features. +Additionally, we extend our architecture for self-supervised online alignment, +allowing our method to be used and fine-tuned for unlabelled images within a +unified framework. Experiments demonstrate that our method achieves 14.28% and +20.51% performance gain on average in indoor and outdoor benchmark scenes, +outperforming existing APR methods with state-of-the-art accuracy. + +
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ☆ DDSB: An Unsupervised and Training-free Method for Phase Detection in + Echocardiography + + +
+ Accurate identification of End-Diastolic (ED) and End-Systolic (ES) frames is
+key for cardiac function assessment through echocardiography. However,
+traditional methods face several limitations: they require extensive amounts of
+data, extensive annotations by medical experts, significant training resources,
+and often lack robustness. Addressing these challenges, we propose an
+unsupervised and training-free method. Our approach leverages unsupervised
+segmentation to enhance fault tolerance against segmentation inaccuracies. By
+identifying anchor points and analyzing directional deformation, we effectively
+reduce dependence on the accuracy of the initial segmentation and improve
+robustness. Tested on the Echo-dynamic and CAMUS datasets, our method achieves
+accuracy comparable to learning-based models without their associated
+drawbacks. The code is available at https://github.com/MRUIL/DDSB.
+
+
+
+
+
+ + ☆ Total Disentanglement of Font Images into Style and Character Class + Features + + +
+ In this paper, we demonstrate a total disentanglement of font images. Total
+disentanglement is a neural network-based method for decomposing each font
+image nonlinearly and completely into its style and content (i.e., character
+class) features. It uses a simple but careful training procedure to extract the
+common style feature from all `A'-`Z' images in the same font and the common
+content feature from all `A' (or another class) images in different fonts.
+These disentangled features guarantee the reconstruction of the original font
+image. Various experiments have been conducted to understand the performance of
+total disentanglement. First, it is demonstrated that total disentanglement is
+achievable with very high accuracy; this offers an experimental answer to the
+long-standing open question, ``Does `A'-ness exist?'' (Hofstadter, 1985).
+Second, it is demonstrated that the disentangled features produced by total
+disentanglement apply to a variety of tasks, including font recognition,
+character recognition, and one-shot font image generation.
+
+
+
+
+
+ + ☆ ViTGaze: Gaze Following with Interaction Features in Vision Transformers + + +
+ Gaze following aims to interpret human-scene interactions by predicting the
+person's focal point of gaze. Prevailing approaches often use multi-modality
+inputs, most of which adopt a two-stage framework. Hence, their performance
+depends heavily on the accuracy of the preceding predictions. Others use a
+single-modality approach with complex decoders, increasing network
+computational load. Inspired by the remarkable success of pre-trained plain
+Vision Transformers (ViTs), we introduce a novel single-modality gaze following
+framework, ViTGaze. In contrast to previous methods, ViTGaze creates a brand
+new gaze following framework based mainly on powerful encoders (decoder
+parameters: less than 1%). Our principal insight is that the inter-token
+interactions within self-attention can be transferred to interactions between
+humans and scenes. Leveraging this insight, we formulate a framework consisting
+of a 4D interaction encoder and a 2D spatial guidance module to extract
+human-scene interaction information from self-attention maps. Furthermore, our
+investigation reveals that a ViT with self-supervised pre-training exhibits an
+enhanced ability to extract correlated information. Extensive experiments have
+been conducted to demonstrate the performance of the proposed method. Our
+method achieves state-of-the-art (SOTA) performance among all single-modality
+methods (3.4% improvement on AUC, 5.1% improvement on AP) and highly comparable
+performance to multi-modality methods while using 59% fewer parameters.
+
+
+
+
+
+ + ☆ Discover and Mitigate Multiple Biased Subgroups in Image Classifiers + + +
+ Machine learning models can perform well on in-distribution data but often +fail on biased subgroups that are underrepresented in the training data, +hindering the robustness of models for reliable applications. Such subgroups +are typically unknown due to the absence of subgroup labels. Discovering biased +subgroups is the key to understanding models' failure modes and further +improving models' robustness. Most previous works of subgroup discovery make an +implicit assumption that models only underperform on a single biased subgroup, +which does not hold on in-the-wild data where multiple biased subgroups exist. + In this work, we propose Decomposition, Interpretation, and Mitigation (DIM), +a novel method to address a more challenging but also more practical problem of +discovering multiple biased subgroups in image classifiers. Our approach +decomposes the image features into multiple components that represent multiple +subgroups. This decomposition is achieved via a bilinear dimension reduction +method, Partial Least Square (PLS), guided by useful supervision from the image +classifier. We further interpret the semantic meaning of each subgroup +component by generating natural language descriptions using vision-language +foundation models. Finally, DIM mitigates multiple biased subgroups +simultaneously via two strategies, including the data- and model-centric +strategies. Extensive experiments on CIFAR-100 and Breeds datasets demonstrate +the effectiveness of DIM in discovering and mitigating multiple biased +subgroups. Furthermore, DIM uncovers the failure modes of the classifier on +Hard ImageNet, showcasing its broader applicability to understanding model bias +in image classifiers. The code is available at +https://github.com/ZhangAIPI/DIM. + +
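+ A rough sketch of the decomposition step under stated assumptions: image
+features are regressed against a supervision signal from the classifier (here
+its logits on random stand-in data) with Partial Least Squares, and each
+learned PLS direction is treated as one candidate subgroup component. This
+only illustrates the PLS mechanics, not the authors' full pipeline.
+
+import numpy as np
+from sklearn.cross_decomposition import PLSRegression
+
+rng = np.random.default_rng(0)
+N, D, C = 500, 128, 10
+features = rng.normal(size=(N, D))        # image features of one class (stand-in)
+logits = rng.normal(size=(N, C))          # classifier outputs used as supervision (stand-in)
+
+pls = PLSRegression(n_components=3)       # 3 candidate subgroup directions
+pls.fit(features, logits)
+
+components = pls.x_weights_.T             # (3, D): each row is one subgroup direction
+scores = features @ components.T          # (N, 3): how strongly each image expresses each subgroup
+subgroup_assignment = scores.argmax(axis=1)
+print(components.shape, np.bincount(subgroup_assignment))
+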
+
+
+
+
+ + ☆ Multispectral Image Restoration by Generalized Opponent Transformation + Total Variation + + +
+ Multispectral images (MSI) contain light information in different wavelengths +of objects, which convey spectral-spatial information and help improve the +performance of various image processing tasks. Numerous techniques have been +created to extend the application of total variation regularization in +restoring multispectral images, for example, based on channel coupling and +adaptive total variation regularization. The primary contribution of this paper +is to propose and develop a new multispectral total variation regularization in +a generalized opponent transformation domain instead of the original +multispectral image domain. Here opponent transformations for multispectral +images are generalized from a well-known opponent transformation for color +images. We will explore the properties of generalized opponent transformation +total variation (GOTTV) regularization and the corresponding optimization +formula for multispectral image restoration. To evaluate the effectiveness of +the new GOTTV method, we provide numerical examples that showcase its superior +performance compared to existing multispectral image total variation methods, +using criteria such as MPSNR and MSSIM. + +
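+ A hedged sketch of the general construction: apply an orthonormal "opponent"
+transform across the spectral channels (here a basis whose first vector is the
+channel mean, orthonormalized via QR, which is one plausible generalization of
+the color opponent transform; the paper's exact matrix may differ), then
+measure the anisotropic total variation of the transformed channels.
+
+import numpy as np
+
+def opponent_basis(k):
+    """Orthonormal basis whose first vector is the channel mean ("intensity")."""
+    M = np.zeros((k, k))
+    M[0] = 1.0 / np.sqrt(k)
+    M[1:, :] = np.eye(k)[: k - 1] - 1.0 / k      # difference-from-mean rows
+    q, _ = np.linalg.qr(M.T)                     # orthonormalize; columns span the same space
+    return q.T
+
+def gottv(msi, basis):
+    """Anisotropic TV of the opponent-transformed multispectral image (H, W, K)."""
+    opp = msi @ basis.T                          # per-pixel channel mixing
+    dx = np.abs(np.diff(opp, axis=1)).sum()
+    dy = np.abs(np.diff(opp, axis=0)).sum()
+    return dx + dy
+
+msi = np.random.rand(64, 64, 6)                  # 6-band stand-in image
+B = opponent_basis(6)
+print(np.allclose(B @ B.T, np.eye(6)), gottv(msi, B))
+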
+
+
+
+
+ + ☆ Inter- and intra-uncertainty based feature aggregation model for + semi-supervised histopathology image segmentation + + +
+ Acquiring pixel-level annotations is often limited in applications such as
+histology studies that require domain expertise. Various semi-supervised
+learning approaches have been developed to work with limited ground-truth
+annotations, such as the popular teacher-student models. However, hierarchical
+prediction uncertainty within the student model (intra-uncertainty) and image
+prediction uncertainty (inter-uncertainty) have not been fully utilized by
+existing methods. To address these issues, we first propose a novel inter- and
+intra-uncertainty regularization method to measure and constrain both inter-
+and intra-inconsistencies in the teacher-student architecture. We also propose
+a new two-stage network with pseudo-mask guided feature aggregation (PG-FANet)
+as the segmentation model. The two-stage structure complements the uncertainty
+regularization strategy, avoiding the introduction of extra modules for
+handling uncertainties, while the aggregation mechanisms enable multi-scale and
+multi-stage feature integration. Comprehensive experimental results on the
+MoNuSeg and CRAG datasets show that our PG-FANet outperforms other
+state-of-the-art methods and that our semi-supervised learning framework yields
+competitive performance with a limited amount of labeled data.
+
+
+
+
+
+ + ☆ WaveFace: Authentic Face Restoration with Efficient Frequency Recovery + + +
+ Although diffusion models are rising as a powerful solution for blind face
+restoration, they are criticized for two problems: 1) slow training and
+inference speed, and 2) failure in preserving identity and recovering
+fine-grained facial details. In this work, we propose WaveFace to solve these
+problems in the frequency domain, where low- and high-frequency components
+decomposed by wavelet transformation are considered individually to maximize
+authenticity as well as efficiency. The diffusion model is applied to recover
+the low-frequency component only, which carries the general information of the
+original image but is only 1/16 of its size. To preserve the original identity,
+the generation is conditioned on the low-frequency component of low-quality
+images at each denoising step. Meanwhile, high-frequency components at multiple
+decomposition levels are handled by a unified network, which recovers complex
+facial details in a single step. Evaluations on four benchmark datasets show
+that: 1) WaveFace outperforms state-of-the-art methods in authenticity,
+especially in terms of identity preservation, and 2) authentic images are
+restored 10x faster than with existing diffusion model-based BFR methods.
+
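+ The frequency split itself is a standard 2-level discrete wavelet transform:
+after two levels, the low-frequency band has 1/4 of the resolution per axis,
+i.e. 1/16 of the pixels, which is what the diffusion model operates on. A small
+sketch of that decomposition with PyWavelets (the restoration networks
+themselves are not reproduced here):
+
+import numpy as np
+import pywt
+
+img = np.random.rand(512, 512)                    # stand-in for one channel of a face image
+
+# Two levels of 2D DWT: coeffs[0] is the low-frequency band, the rest are detail bands.
+coeffs = pywt.wavedec2(img, wavelet="haar", level=2)
+low = coeffs[0]
+print(low.shape, low.size / img.size)             # (128, 128) -> 1/16 of the original pixels
+
+# After "restoring" the bands (identity here), the image is rebuilt losslessly.
+recon = pywt.waverec2(coeffs, wavelet="haar")
+print(np.allclose(recon, img))
+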
+
+
+
+
+ + ☆ Building Brain Tumor Segmentation Networks with User-Assisted Filter + Estimation and Selection + + +
+ Brain tumor image segmentation is a challenging research topic in which +deep-learning models have presented the best results. However, the traditional +way of training those models from many pre-annotated images leaves several +unanswered questions. Hence methodologies, such as Feature Learning from Image +Markers (FLIM), have involved an expert in the learning loop to reduce human +effort in data annotation and build models sufficiently deep for a given +problem. FLIM has been successfully used to create encoders, estimating the +filters of all convolutional layers from patches centered at marker voxels. In +this work, we present Multi-Step (MS) FLIM - a user-assisted approach to +estimating and selecting the most relevant filters from multiple FLIM +executions. MS-FLIM is used only for the first convolutional layer, and the +results already indicate improvement over FLIM. For evaluation, we build a +simple U-shaped encoder-decoder network, named sU-Net, for glioblastoma +segmentation using T1Gd and FLAIR MRI scans, varying the encoder's training +method, using FLIM, MS-FLIM, and backpropagation algorithm. Also, we compared +these sU-Nets with two State-Of-The-Art (SOTA) deep-learning models using two +datasets. The results show that the sU-Net based on MS-FLIM outperforms the +other training methods and achieves effectiveness within the standard +deviations of the SOTA models. + +
+
+ comment: 10 pages, 5 figures, 2 tables, 24 references, manuscript of + conference paper +
+
+
+
+
+ + ☆ Towards Controllable Face Generation with Semantic Latent Diffusion + Models + + +
+ Semantic Image Synthesis (SIS) is among the most popular and effective
+techniques in the field of face generation and editing, thanks to its good
+generation quality and the versatility it brings along. Recent works have
+attempted to go beyond the standard GAN-based framework and started to explore
+Diffusion Models (DMs) for this task, as these stand out with respect to GANs
+in terms of both quality and diversity. On the other hand, DMs lack
+fine-grained controllability and reproducibility. To address that, in this
+paper we propose a SIS framework based on a novel Latent Diffusion Model
+architecture for human face generation and editing that is able both to
+reproduce and manipulate a real reference image and to generate
+diversity-driven results. The proposed system utilizes both SPADE normalization
+and cross-attention layers to merge shape and style information and, by doing
+so, allows for precise control over each of the semantic parts of the human
+face, which was not possible with previous state-of-the-art methods. Finally,
+we performed an extensive set of experiments to show that our model surpasses
+the current state of the art, both qualitatively and quantitatively.
+
+
+
+
+
+ + ☆ Towards Multimodal In-Context Learning for Vision & Language Models + + +
+ Inspired by the emergence of Large Language Models (LLMs) that can truly +understand human language, significant progress has been made in aligning +other, non-language, modalities to be `understandable' by an LLM, primarily via +converting their samples into a sequence of embedded language-like tokens +directly fed into the LLM (decoder) input stream. However, so far limited +attention has been given to transferring (and evaluating) one of the core LLM +capabilities to the emerging VLMs, namely the In-Context Learning (ICL) +ability, or in other words to guide VLMs to desired target downstream tasks or +output structure using in-context image+text demonstrations. In this work, we +dive deeper into analyzing the capabilities of some of the state-of-the-art +VLMs to follow ICL instructions, discovering them to be somewhat lacking. We +discover that even models that underwent large-scale mixed modality +pre-training and were implicitly guided to make use of interleaved image and +text information (intended to consume helpful context from multiple images) +under-perform when prompted with few-shot (ICL) demonstrations, likely due to +their lack of `direct' ICL instruction tuning. To test this conjecture, we +propose a simple, yet surprisingly effective, strategy of extending a common +VLM alignment framework with ICL support, methodology, and curriculum. We +explore, analyze, and provide insights into effective data mixes, leading up to +a significant 21.03% (and 11.3% on average) ICL performance boost over the +strongest VLM baselines and a variety of ICL benchmarks. We also contribute new +benchmarks for ICL evaluation in VLMs and discuss their advantages over the +prior art. + +
+
+
+
+
+ + ☆ Diffusion-Driven Self-Supervised Learning for Shape Reconstruction and + Pose Estimation + + +
+ Fully-supervised category-level pose estimation aims to determine the 6-DoF
+poses of unseen instances from known categories, which requires expensive
+manual labeling. Recently, various self-supervised category-level pose
+estimation methods have been proposed to reduce the reliance on annotated
+datasets. However, most methods rely on synthetic data or 3D CAD models for
+self-supervised training, and they are typically limited to addressing
+single-object pose problems without considering multi-objective tasks or shape
+reconstruction. To overcome these challenges and limitations, we introduce a
+diffusion-driven self-supervised network for multi-object shape reconstruction
+and categorical pose estimation, leveraging only the shape priors.
+Specifically, to capture the SE(3)-equivariant pose features and 3D
+scale-invariant shape information, we present a Prior-Aware Pyramid 3D Point
+Transformer in our network. This module adopts a point convolutional layer with
+radial kernels for pose-aware learning and a 3D scale-invariant graph
+convolution layer for object-level shape representation, respectively.
+Furthermore, we introduce a pretrain-to-refine self-supervised training
+paradigm to train our network. It enables the proposed network to capture the
+associations between shape priors and observations, addressing the challenge of
+intra-class shape variations by utilising the diffusion mechanism. Extensive
+experiments conducted on four public datasets and a self-built dataset
+demonstrate that our method significantly outperforms state-of-the-art
+self-supervised category-level baselines and even surpasses some
+fully-supervised instance-level and category-level methods.
+
+
+
+
+
+ + ☆ HUGS: Holistic Urban 3D Scene Understanding via Gaussian Splatting + + +
+ Holistic understanding of urban scenes based on RGB images is a challenging
+yet important problem. It encompasses understanding both the geometry and
+appearance to enable novel view synthesis, parsing semantic labels, and
+tracking moving objects. Despite considerable progress, existing approaches
+often focus on specific aspects of this task and require additional inputs such
+as LiDAR scans or manually annotated 3D bounding boxes. In this paper, we
+introduce a novel pipeline that utilizes 3D Gaussian Splatting for holistic
+urban scene understanding. Our main idea involves the joint optimization of
+geometry, appearance, semantics, and motion using a combination of static and
+dynamic 3D Gaussians, where moving object poses are regularized via physical
+constraints. Our approach offers the ability to render new viewpoints in real
+time, yielding 2D and 3D semantic information with high accuracy, and to
+reconstruct dynamic scenes, even in scenarios where 3D bounding box detections
+are highly noisy. Experimental results on KITTI, KITTI-360, and Virtual KITTI 2
+demonstrate the effectiveness of our approach.
+
+
+ comment: Our project page is at https://xdimlab.github.io/hugs_website +
+
+
+
+
+ + ☆ Addressing Source Scale Bias via Image Warping for Domain Adaptation + + +
+ In visual recognition, scale bias is a key challenge due to the imbalance of
+object and image size distribution inherent in real scene datasets.
+Conventional solutions involve injecting scale invariance priors, oversampling
+the dataset at different scales during training, or adjusting scale at
+inference. While these strategies mitigate scale bias to some extent, their
+ability to adapt across diverse datasets is limited. Besides, they increase
+computational load during training and latency during inference. In this work,
+we use adaptive attentional processing -- oversampling salient object regions
+by warping images in-place during training. Discovering that shifting the
+source scale distribution improves backbone features, we developed an
+instance-level warping guidance aimed at object region sampling to mitigate
+source scale bias in domain adaptation. Our approach improves adaptation across
+geographies, lighting and weather conditions, and is agnostic to the task,
+domain adaptation algorithm, saliency guidance, and underlying model
+architecture. Highlights include +6.1 mAP50 for BDD100K Clear $\rightarrow$
+DENSE Foggy, +3.7 mAP50 for BDD100K Day $\rightarrow$ Night, +3.0 mAP50 for
+BDD100K Clear $\rightarrow$ Rainy, and +6.3 mIoU for Cityscapes $\rightarrow$
+ACDC. Our approach adds minimal memory during training and has no additional
+latency at inference time. Please see the Appendix for more results and
+analysis.
+
+
+
+
+
+ + ☆ Selective, Interpretable, and Motion Consistent Privacy Attribute + Obfuscation for Action Recognition + + +
+ Concerns for the privacy of individuals captured in public imagery have led
+to privacy-preserving action recognition. Existing approaches often suffer from
+issues arising from obfuscation being applied globally and from a lack of
+interpretability. Global obfuscation hides privacy-sensitive regions, but also
+contextual regions important for action recognition, while a lack of
+interpretability erodes trust in these new technologies. We highlight the
+limitations of current paradigms and propose a solution: human-selected privacy
+templates that yield interpretability by design, together with an obfuscation
+scheme that selectively hides attributes and induces temporal consistency,
+which is important in action recognition. Our approach is architecture agnostic
+and directly modifies input imagery, while existing approaches generally
+require architecture training. Our approach offers more flexibility, as no
+retraining is required, and outperforms alternatives on three widely used
+datasets.
+
+
+
+
+
+ + ☆ Selective Domain-Invariant Feature for Generalizable Deepfake Detection ICASSP 2024 + + +
+ With diverse presentation forgery methods emerging continually, detecting the
+authenticity of images has drawn growing attention. Although existing methods
+have achieved impressive accuracy on the datasets they are trained on, they
+still perform poorly in unseen domains and are affected by forgery-irrelevant
+information such as background and identity, which harms generalizability. To
+solve this problem, we propose a novel framework, Selective Domain-Invariant
+Feature (SDIF), which reduces the sensitivity to face forgery by fusing content
+features and styles. Specifically, we first use a Farthest-Point Sampling (FPS)
+training strategy to construct a task-relevant style sample representation
+space for fusing with content features. Then, we propose a dynamic feature
+extraction module to generate features with diverse styles to improve the
+performance and effectiveness of the feature extractor. Finally, a domain
+separation strategy is used to retain domain-related features to help
+distinguish between real and fake faces. Both qualitative and quantitative
+results on existing benchmarks and protocols demonstrate the effectiveness of
+our approach.
+
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
+ + ☆ AnimateDiff-Lightning: Cross-Model Diffusion Distillation + + +
+ We present AnimateDiff-Lightning for lightning-fast video generation. Our +model uses progressive adversarial diffusion distillation to achieve new +state-of-the-art in few-step video generation. We discuss our modifications to +adapt it for the video modality. Furthermore, we propose to simultaneously +distill the probability flow of multiple base diffusion models, resulting in a +single distilled motion module with broader style compatibility. We are pleased +to release our distilled AnimateDiff-Lightning model for the community's use. + +
+
+
+
+
+ + ☆ Learning Cross-view Visual Geo-localization without Ground Truth + + +
+ Cross-View Geo-Localization (CVGL) involves determining the geographical +location of a query image by matching it with a corresponding GPS-tagged +reference image. Current state-of-the-art methods predominantly rely on +training models with labeled paired images, incurring substantial annotation +costs and training burdens. In this study, we investigate the adaptation of +frozen models for CVGL without requiring ground truth pair labels. We observe +that training on unlabeled cross-view images presents significant challenges, +including the need to establish relationships within unlabeled data and +reconcile view discrepancies between uncertain queries and references. To +address these challenges, we propose a self-supervised learning framework to +train a learnable adapter for a frozen Foundation Model (FM). This adapter is +designed to map feature distributions from diverse views into a uniform space +using unlabeled data exclusively. To establish relationships within unlabeled +data, we introduce an Expectation-Maximization-based Pseudo-labeling module, +which iteratively estimates associations between cross-view features and +optimizes the adapter. To maintain the robustness of the FM's representation, +we incorporate an information consistency module with a reconstruction loss, +ensuring that adapted features retain strong discriminative ability across +views. Experimental results demonstrate that our proposed method achieves +significant improvements over vanilla FMs and competitive accuracy compared to +supervised methods, while necessitating fewer training parameters and relying +solely on unlabeled data. Evaluation of our adaptation for task-specific models +further highlights its broad applicability. + +
+
+
+
+
+ + ☆ Federated Semi-supervised Learning for Medical Image Segmentation with + intra-client and inter-client Consistency + + +
+ Medical image segmentation plays a vital role in clinical disease diagnosis
+and medical image analysis. However, labeling medical images for the
+segmentation task is difficult due to the indispensable domain expertise of
+radiologists. Furthermore, considering the privacy and sensitivity of medical
+images, it is impractical to build a centralized segmentation dataset from
+different medical institutions. Federated learning aims to train a shared model
+across isolated clients without local data exchange, which aligns well with the
+scarcity and privacy characteristics of medical data. To address the labeling
+difficulty, many advanced semi-supervised methods have been proposed in a
+centralized data setting. As for federated learning, how to conduct
+semi-supervised learning under this distributed scenario is worth
+investigating. In this work, we propose a novel federated semi-supervised
+learning framework for medical image segmentation. Intra-client and
+inter-client consistency learning are introduced to smooth predictions at the
+data level and avoid the confirmation bias of local models. They are achieved
+with the assistance of a Variational Autoencoder (VAE) trained collaboratively
+by the clients. The added VAE model plays three roles: 1) extracting latent
+low-dimensional features of all labeled and unlabeled data; 2) performing a
+novel type of data augmentation when calculating the intra-client consistency
+loss; and 3) using its own generative ability to conduct inter-client
+consistency distillation. The proposed framework is compared with other
+federated semi-supervised and self-supervised learning methods. The
+experimental results illustrate that our method outperforms the
+state-of-the-art methods while avoiding substantial computation and
+communication overhead.
+
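+
+ To make the intra-client consistency idea above concrete, here is a minimal,
+hypothetical PyTorch sketch; the `vae.encode`/`vae.decode` interface, the MSE
+consistency form, and all hyperparameters are assumptions, not the paper's
+implementation.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def intra_client_consistency(seg_model, vae, images):
+    """Hypothetical sketch: perturb inputs in the VAE latent space and ask the
+    segmentation model to produce consistent predictions (one plausible wiring
+    of the VAE-based augmentation, not the authors' exact loss)."""
+    mu, logvar = vae.encode(images)                       # assumed VAE interface
+    z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()  # reparameterized sample
+    augmented = vae.decode(z)                             # latent-space augmentation
+    p_orig = seg_model(images).softmax(dim=1)
+    p_aug = seg_model(augmented).softmax(dim=1)
+    # penalize disagreement; the original prediction acts as a detached target
+    return F.mse_loss(p_aug, p_orig.detach())
+```
+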
+
+
+ comment: Work in progress
+
+
+
+
+
+ + ☆ As Firm As Their Foundations: Can open-sourced foundation models be used + to create adversarial examples for downstream tasks? + + +
+ Foundation models pre-trained on web-scale vision-language data, such as +CLIP, are widely used as cornerstones of powerful machine learning systems. +While pre-training offers clear advantages for downstream learning, it also +endows downstream models with shared adversarial vulnerabilities that can be +easily identified through the open-sourced foundation model. In this work, we +expose such vulnerabilities in CLIP's downstream models and show that +foundation models can serve as a basis for attacking their downstream systems. +In particular, we propose a simple yet effective adversarial attack strategy +termed Patch Representation Misalignment (PRM). Solely based on open-sourced +CLIP vision encoders, this method produces adversaries that simultaneously fool +more than 20 downstream models spanning 4 common vision-language tasks +(semantic segmentation, object detection, image captioning and visual +question-answering). Our findings highlight the concerning safety risks +introduced by the extensive usage of public foundational models in the +development of downstream systems, calling for extra caution in these +scenarios. + +
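+
+ A minimal, hypothetical sketch of the patch-representation-misalignment idea
+described above, written as a generic PGD-style loop; the `patch_encoder`
+interface, the cosine-similarity objective, and the step sizes are assumptions
+rather than the paper's implementation.
+
+```python
+import torch
+
+def prm_attack(patch_encoder, images, eps=8 / 255, alpha=2 / 255, steps=10):
+    """Push adversarial patch features away from their clean counterparts.
+    `patch_encoder` is assumed to map images (B,3,H,W) to patch tokens (B,N,D),
+    e.g. a frozen CLIP-like vision encoder exposing its patch embeddings."""
+    images = images.clone().detach()
+    with torch.no_grad():
+        clean_feats = patch_encoder(images)
+    delta = torch.empty_like(images).uniform_(-eps, eps).requires_grad_(True)
+    for _ in range(steps):
+        adv_feats = patch_encoder((images + delta).clamp(0, 1))
+        # maximize misalignment = minimize cosine similarity to clean features
+        loss = -torch.nn.functional.cosine_similarity(
+            adv_feats, clean_feats, dim=-1).mean()
+        loss.backward()
+        with torch.no_grad():
+            delta += alpha * delta.grad.sign()
+            delta.clamp_(-eps, eps)
+            delta.grad = None
+    return (images + delta).clamp(0, 1).detach()
+```
+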
+
+
+
+
+ + ☆ Audio-Visual Compound Expression Recognition Method based on Late + Modality Fusion and Rule-based Decision + + +
+ This paper presents the results of the SUN team for the Compound Expressions
+Recognition Challenge of the 6th ABAW Competition. We propose a novel
+audio-visual method for compound expression recognition. Our method relies on
+emotion recognition models that fuse modalities at the emotion probability
+level, while decisions regarding the prediction of compound expressions are
+based on predefined rules. Notably, our method does not use any training data
+specific to the target task. The method is evaluated in multi-corpus training
+and cross-corpus validation setups. Our findings from the challenge demonstrate
+that the proposed method can potentially form a basis for the development of
+intelligent tools for annotating audio-visual data in the context of humans'
+basic and compound emotions. The source code is publicly available.
+
+
+ comment: 7 pages, 3 figures +
+
+
+
+
+ + ☆ WaterVG: Waterway Visual Grounding based on Text-Guided Vision and + mmWave Radar + + +
+ The perception of waterways based on human intent holds significant
+importance for autonomous navigation and operations of Unmanned Surface
+Vehicles (USVs) in water environments. Inspired by visual grounding, in this
+paper, we introduce WaterVG, the first visual grounding dataset designed for
+USV-based waterway perception based on human intention prompts. WaterVG
+encompasses prompts describing multiple targets, with annotations at the
+instance level including bounding boxes and masks. Notably, WaterVG includes
+11,568 samples with 34,950 referred targets, and integrates both visual and
+radar characteristics captured by a monocular camera and a millimeter-wave
+(mmWave) radar, enabling a finer granularity of text prompts. Furthermore, we
+propose a novel multi-modal visual grounding model, Potamoi, which is a
+multi-modal and multi-task model based on the one-stage paradigm with a
+designed Phased Heterogeneous Modality Fusion (PHMF) structure, including
+Adaptive Radar Weighting (ARW) and Multi-Head Slim Cross Attention (MHSCA).
+Specifically, MHSCA is a low-cost and efficient fusion module with a remarkably
+small parameter count and FLOP budget, elegantly aligning and fusing scenario
+context information captured by the two sensors with linguistic features, which
+can effectively address tasks of referring expression comprehension and
+segmentation based on fine-grained prompts. Comprehensive experiments and
+evaluations have been conducted on WaterVG, where our Potamoi achieves
+state-of-the-art performance compared with its counterparts.
+
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ☆ IFFNeRF: Initialisation Free and Fast 6DoF pose estimation from a single + image and a NeRF model ICRA 2024 + + +
+ We introduce IFFNeRF to estimate the six degrees-of-freedom (6DoF) camera
+pose of a given image, building on the Neural Radiance Fields (NeRF)
+formulation. IFFNeRF is specifically designed to operate in real-time and
+eliminates the need for an initial pose guess that is proximate to the sought
+solution. IFFNeRF utilizes the Metropolis-Hastings algorithm to sample surface
+points from within the NeRF model. From these sampled points, we cast rays and
+deduce the color for each ray through pixel-level view synthesis. The camera
+pose can then be estimated as the solution to a Least Squares problem by
+selecting correspondences between the query image and the resulting bundle. We
+facilitate this process through a learned attention mechanism, bridging the
+query image embedding with the embedding of parameterized rays, thereby
+matching rays pertinent to the image. Through synthetic and real evaluation
+settings, we show that our method improves angular and translation accuracy by
+80.1% and 67.3%, respectively, compared to iNeRF, while running at 34 fps on
+consumer hardware and not requiring an initial pose guess.
+
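+
+ As an illustration of the Metropolis-Hastings surface sampling step mentioned
+above, the following is a generic, hypothetical sketch; the `density_fn`
+interface (points in, volume density out), the Gaussian proposal, and all
+constants are assumptions, not the paper's procedure.
+
+```python
+import torch
+
+@torch.no_grad()
+def metropolis_hastings_surface_samples(density_fn, n_samples=1024,
+                                        n_steps=200, step_size=0.05, bound=1.0):
+    """Sample 3D points roughly proportional to a NeRF's density.
+    `density_fn(points)` is assumed to return non-negative sigma of shape (N,)
+    for an (N, 3) tensor of points inside [-bound, bound]^3."""
+    pts = (torch.rand(n_samples, 3) * 2 - 1) * bound      # random initialization
+    dens = density_fn(pts)
+    for _ in range(n_steps):
+        proposal = (pts + step_size * torch.randn_like(pts)).clamp(-bound, bound)
+        prop_dens = density_fn(proposal)
+        # symmetric proposal -> acceptance ratio is the density ratio
+        accept = torch.rand(n_samples) < (prop_dens / (dens + 1e-8)).clamp(max=1.0)
+        pts = torch.where(accept[:, None], proposal, pts)
+        dens = torch.where(accept, prop_dens, dens)
+    return pts
+```
+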
+
+ comment: Accepted ICRA 2024, Project page: + https://mbortolon97.github.io/iffnerf/ +
+
+
+
+
+ + ☆ Driving Animatronic Robot Facial Expression From Speech + + +
+ Animatronic robots aim to enable natural human-robot interaction through +lifelike facial expressions. However, generating realistic, speech-synchronized +robot expressions is challenging due to the complexities of facial biomechanics +and responsive motion synthesis. This paper presents a principled, +skinning-centric approach to drive animatronic robot facial expressions from +speech. The proposed approach employs linear blend skinning (LBS) as the core +representation to guide tightly integrated innovations in embodiment design and +motion synthesis. LBS informs the actuation topology, enables human expression +retargeting, and allows speech-driven facial motion generation. The proposed +approach is capable of generating highly realistic, real-time facial +expressions from speech on an animatronic face, significantly advancing robots' +ability to replicate nuanced human expressions for natural interaction. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Tuning-Free Image Customization with Image and Text Guidance + + +
+ Despite significant advancements in image customization with diffusion +models, current methods still have several limitations: 1) unintended changes +in non-target areas when regenerating the entire image; 2) guidance solely by a +reference image or text descriptions; and 3) time-consuming fine-tuning, which +limits their practical application. In response, we introduce a tuning-free +framework for simultaneous text-image-guided image customization, enabling +precise editing of specific image regions within seconds. Our approach +preserves the semantic features of the reference image subject while allowing +modification of detailed attributes based on text descriptions. To achieve +this, we propose an innovative attention blending strategy that blends +self-attention features in the UNet decoder during the denoising process. To +our knowledge, this is the first tuning-free method that concurrently utilizes +text and image guidance for image customization in specific regions. Our +approach outperforms previous methods in both human and quantitative +evaluations, providing an efficient solution for various practical +applications, such as image synthesis, design, and creative photography. + +
+
+ comment: 17 pages, 8 figures +
+
+
+
+
+ + ☆ LASPA: Latent Spatial Alignment for Fast Training-free Single Image + Editing + + +
+ We present a novel, training-free approach for textual editing of real images +using diffusion models. Unlike prior methods that rely on computationally +expensive finetuning, our approach leverages LAtent SPatial Alignment (LASPA) +to efficiently preserve image details. We demonstrate how the diffusion process +is amenable to spatial guidance using a reference image, leading to +semantically coherent edits. This eliminates the need for complex optimization +and costly model finetuning, resulting in significantly faster editing compared +to previous methods. Additionally, our method avoids the storage requirements +associated with large finetuned models. These advantages make our approach +particularly well-suited for editing on mobile devices and applications +demanding rapid response times. While simple and fast, our method achieves +62-71\% preference in a user-study and significantly better model-based editing +strength and image preservation scores. + +
+
+
+
+
+ + ☆ Real-IAD: A Real-World Multi-View Dataset for Benchmarking Versatile + Industrial Anomaly Detection CVPR2024 + + +
+ Industrial anomaly detection (IAD) has garnered significant attention and
+experienced rapid development. However, the recent development of IAD
+approaches has encountered certain difficulties due to dataset limitations. On
+the one hand, most state-of-the-art methods have reached saturation (over 99%
+in AUROC) on mainstream datasets such as MVTec, so the differences between
+methods cannot be well distinguished, leading to a significant gap between
+public datasets and actual application scenarios. On the other hand, research
+on various new practical anomaly detection settings is limited by the scale of
+existing datasets, posing a risk of overfitting in evaluation results.
+Therefore, we propose a large-scale, Real-world, and multi-view Industrial
+Anomaly Detection dataset, named Real-IAD, which contains 150K high-resolution
+images of 30 different objects, an order of magnitude larger than existing
+datasets. It covers a wider range of defect areas and defect ratios, making it
+more challenging than previous datasets. To make the dataset closer to real
+application scenarios, we adopted a multi-view shooting method and proposed
+sample-level evaluation metrics. In addition, beyond the general unsupervised
+anomaly detection setting, we propose a new setting for Fully Unsupervised
+Industrial Anomaly Detection (FUIAD) based on the observation that the yield
+rate in industrial production is usually greater than 60%, which has more
+practical application value. Finally, we report the results of popular IAD
+methods on the Real-IAD dataset, providing a highly challenging benchmark to
+promote the development of the IAD field.
+
+
+ comment: It is accepted by CVPR2024 +
+
+
+
+
+ + ☆ EAS-SNN: End-to-End Adaptive Sampling and Representation for Event-based + Detection with Recurrent Spiking Neural Networks + + +
+ Event cameras, with their high dynamic range and temporal resolution, are +ideally suited for object detection, especially under scenarios with motion +blur and challenging lighting conditions. However, while most existing +approaches prioritize optimizing spatiotemporal representations with advanced +detection backbones and early aggregation functions, the crucial issue of +adaptive event sampling remains largely unaddressed. Spiking Neural Networks +(SNNs), which operate on an event-driven paradigm through sparse spike +communication, emerge as a natural fit for addressing this challenge. In this +study, we discover that the neural dynamics of spiking neurons align closely +with the behavior of an ideal temporal event sampler. Motivated by this +insight, we propose a novel adaptive sampling module that leverages recurrent +convolutional SNNs enhanced with temporal memory, facilitating a fully +end-to-end learnable framework for event-based detection. Additionally, we +introduce Residual Potential Dropout (RPD) and Spike-Aware Training (SAT) to +regulate potential distribution and address performance degradation encountered +in spike-based sampling modules. Through rigorous testing on neuromorphic +datasets for event-based detection, our approach demonstrably surpasses +existing state-of-the-art spike-based methods, achieving superior performance +with significantly fewer parameters and time steps. For instance, our method +achieves a 4.4\% mAP improvement on the Gen1 dataset, while requiring 38\% +fewer parameters and three time steps. Moreover, the applicability and +effectiveness of our adaptive sampling methodology extend beyond SNNs, as +demonstrated through further validation on conventional non-spiking detection +models. + +
+
+
+
+
+ + ☆ Lifting Multi-View Detection and Tracking to the Bird's Eye View + + +
+ Taking advantage of multi-view aggregation presents a promising solution to
+tackle challenges such as occlusion and missed detection in multi-object
+tracking and detection. Recent advancements in multi-view detection and 3D
+object recognition have significantly improved performance by strategically
+projecting all views onto the ground plane and conducting detection analysis
+from a Bird's Eye View. In this paper, we compare modern lifting methods, both
+parameter-free and parameterized, to multi-view aggregation. Additionally, we
+present an architecture that aggregates the features of multiple time steps to
+learn robust detection and combines appearance- and motion-based cues for
+tracking. Most current tracking approaches focus on either pedestrians or
+vehicles. In our work, we combine both branches and add new challenges to
+multi-view detection with cross-scene setups. Our method generalizes to three
+public datasets across two domains: (1) pedestrian: Wildtrack and MultiviewX,
+and (2) roadside perception: Synthehicle, achieving state-of-the-art
+performance in detection and tracking. https://github.com/tteepe/TrackTacular
+
+
+
+
+
+ + ☆ Compound Expression Recognition via Multi Model Ensemble + + +
+ Compound Expression Recognition (CER) plays a crucial role in interpersonal
+interactions. Due to the existence of compound expressions, human emotional
+expressions are complex, requiring consideration of both local and global
+facial expressions to make judgments. In this paper, to address this issue, we
+propose a solution based on ensemble learning methods for Compound Expression
+Recognition. Specifically, our task is classification, where we train three
+expression classification models based on convolutional networks, Vision
+Transformers, and multi-scale local attention networks. Then, through model
+ensemble using late fusion, we merge the outputs of multiple models to predict
+the final result. Our method achieves high accuracy on RAF-DB and is able to
+recognize expressions in a zero-shot manner on certain portions of C-EXPR-DB.
+
+
+
+
+
+ + ☆ Adapting Visual-Language Models for Generalizable Anomaly Detection in + Medical Images CVPR 2024 + + +
+ Recent advancements in large-scale visual-language pre-trained models have +led to significant progress in zero-/few-shot anomaly detection within natural +image domains. However, the substantial domain divergence between natural and +medical images limits the effectiveness of these methodologies in medical +anomaly detection. This paper introduces a novel lightweight multi-level +adaptation and comparison framework to repurpose the CLIP model for medical +anomaly detection. Our approach integrates multiple residual adapters into the +pre-trained visual encoder, enabling a stepwise enhancement of visual features +across different levels. This multi-level adaptation is guided by multi-level, +pixel-wise visual-language feature alignment loss functions, which recalibrate +the model's focus from object semantics in natural imagery to anomaly +identification in medical images. The adapted features exhibit improved +generalization across various medical data types, even in zero-shot scenarios +where the model encounters unseen medical modalities and anatomical regions +during training. Our experiments on medical anomaly detection benchmarks +demonstrate that our method significantly surpasses current state-of-the-art +models, with an average AUC improvement of 6.24% and 7.33% for anomaly +classification, 2.03% and 2.37% for anomaly segmentation, under the zero-shot +and few-shot settings, respectively. Source code is available at: +https://github.com/MediaBrain-SJTU/MVFA-AD + +
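+
+ The residual adapters described above can be pictured with a small,
+hypothetical module like the one below; the bottleneck width, activation, and
+placement after an encoder stage are assumptions, not the paper's exact design.
+
+```python
+import torch
+import torch.nn as nn
+
+class ResidualAdapter(nn.Module):
+    """Lightweight bottleneck adapter added residually to frozen encoder tokens."""
+    def __init__(self, dim, bottleneck=64):
+        super().__init__()
+        self.down = nn.Linear(dim, bottleneck)   # project down
+        self.up = nn.Linear(bottleneck, dim)     # project back up
+        self.act = nn.GELU()
+
+    def forward(self, tokens):                   # tokens: (B, N, dim)
+        # residual connection keeps the pre-trained features intact
+        return tokens + self.up(self.act(self.down(tokens)))
+```
+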
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Confidence Self-Calibration for Multi-Label Class-Incremental Learning + + +
+ The partial label challenge in Multi-Label Class-Incremental Learning (MLCIL)
+arises when only the new classes are labeled during training, while past and
+future labels remain unavailable. This issue leads to a proliferation of
+false-positive errors due to erroneously high-confidence multi-label
+predictions, exacerbating catastrophic forgetting within the disjoint label
+space. In this paper, we aim to refine multi-label confidence calibration in
+MLCIL and propose a Confidence Self-Calibration (CSC) approach. First, for
+label relationship calibration, we introduce a class-incremental graph
+convolutional network that bridges the isolated label spaces by constructing a
+learnable, dynamically extended label relationship graph. Then, for confidence
+calibration, we present a max-entropy regularization for each multi-label
+increment, facilitating confidence self-calibration through the penalization of
+over-confident output distributions. Our approach attains new state-of-the-art
+results in MLCIL tasks on both the MS-COCO and PASCAL VOC datasets, with the
+calibration of label confidences confirmed through our methodology.
+
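+
+ A minimal sketch of a max-entropy regularizer of the kind referred to above,
+assuming independent sigmoid outputs per label; the exact weighting and where
+it is applied per increment are assumptions.
+
+```python
+import torch
+
+def max_entropy_regularizer(logits, eps=1e-8):
+    """Penalize over-confident multi-label predictions by encouraging high
+    per-label binary entropy. Add the returned value to the task loss;
+    minimizing it maximizes the entropy of the sigmoid outputs."""
+    p = torch.sigmoid(logits)
+    entropy = -(p * torch.log(p + eps) + (1 - p) * torch.log(1 - p + eps))
+    return -entropy.mean()
+```
+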
+
+
+
+
+ + ☆ M2DA: Multi-Modal Fusion Transformer Incorporating Driver Attention for + Autonomous Driving + + +
+ End-to-end autonomous driving has witnessed remarkable progress. However, the
+extensive deployment of autonomous vehicles has yet to be realized, primarily
+due to 1) inefficient multi-modal environment perception: how to integrate data
+from multi-modal sensors more efficiently; 2) non-human-like scene
+understanding: how to effectively locate and predict critical risky agents in
+traffic scenarios like an experienced driver. To overcome these challenges, in
+this paper, we propose a Multi-Modal fusion transformer incorporating Driver
+Attention (M2DA) for autonomous driving. To better fuse multi-modal data and
+achieve higher alignment between different modalities, a novel
+Lidar-Vision-Attention-based Fusion (LVAFusion) module is proposed. By
+incorporating driver attention, we endow autonomous vehicles with human-like
+scene understanding, enabling them to precisely identify crucial areas within
+complex scenarios and ensure safety. We conduct experiments on the CARLA
+simulator and achieve state-of-the-art performance with less data in
+closed-loop benchmarks. Source codes are available at
+https://anonymous.4open.science/r/M2DA-4772.
+
+
+
+
+
+ + ☆ RGBD GS-ICP SLAM + + +
+ Simultaneous Localization and Mapping (SLAM) with dense representation plays
+a key role in robotics, Virtual Reality (VR), and Augmented Reality (AR)
+applications. Recent advancements in dense representation SLAM have highlighted
+the potential of leveraging neural scene representation and 3D Gaussian
+representation for high-fidelity spatial representation. In this paper, we
+propose a novel dense representation SLAM approach with a fusion of Generalized
+Iterative Closest Point (G-ICP) and 3D Gaussian Splatting (3DGS). In contrast
+to existing methods, we utilize a single Gaussian map for both tracking and
+mapping, resulting in mutual benefits. Through the exchange of covariances
+between tracking and mapping processes with scale alignment techniques, we
+minimize redundant computations and achieve an efficient system. Additionally,
+we enhance tracking accuracy and mapping quality through our keyframe selection
+methods. Experimental results demonstrate the effectiveness of our approach,
+showing speeds of up to 107 FPS for the entire system and superior quality of
+the reconstructed map.
+
+
+
+
+
+ + ☆ HCPM: Hierarchical Candidates Pruning for Efficient Detector-Free + Matching + + +
+ Deep learning-based image matching methods play a crucial role in computer +vision, yet they often suffer from substantial computational demands. To tackle +this challenge, we present HCPM, an efficient and detector-free local +feature-matching method that employs hierarchical pruning to optimize the +matching pipeline. In contrast to recent detector-free methods that depend on +an exhaustive set of coarse-level candidates for matching, HCPM selectively +concentrates on a concise subset of informative candidates, resulting in fewer +computational candidates and enhanced matching efficiency. The method comprises +a self-pruning stage for selecting reliable candidates and an +interactive-pruning stage that identifies correlated patches at the coarse +level. Our results reveal that HCPM significantly surpasses existing methods in +terms of speed while maintaining high accuracy. The source code will be made +available upon publication. + +
+
+
+
+
+ + ☆ Prompt-Guided Adaptive Model Transformation for Whole Slide Image + Classification + + +
+ Multiple instance learning (MIL) has emerged as a popular method for +classifying histopathology whole slide images (WSIs). Existing approaches +typically rely on frozen pre-trained models to extract instance features, +neglecting the substantial domain shift between pre-training natural and +histopathological images. To address this issue, we propose PAMT, a novel +Prompt-guided Adaptive Model Transformation framework that enhances MIL +classification performance by seamlessly adapting pre-trained models to the +specific characteristics of histopathology data. To capture the intricate +histopathology distribution, we introduce Representative Patch Sampling (RPS) +and Prototypical Visual Prompt (PVP) to reform the input data, building a +compact while informative representation. Furthermore, to narrow the domain +gap, we introduce Adaptive Model Transformation (AMT) that integrates adapter +blocks within the feature extraction pipeline, enabling the pre-trained models +to learn domain-specific features. We rigorously evaluate our approach on two +publicly available datasets, Camelyon16 and TCGA-NSCLC, showcasing substantial +improvements across various MIL models. Our findings affirm the potential of +PAMT to set a new benchmark in WSI classification, underscoring the value of a +targeted reprogramming approach. + +
+
+
+
+
+ + ☆ Vox-Fusion++: Voxel-based Neural Implicit Dense Tracking and Mapping + with Multi-maps + + +
+ In this paper, we introduce Vox-Fusion++, a multi-maps-based robust dense +tracking and mapping system that seamlessly fuses neural implicit +representations with traditional volumetric fusion techniques. Building upon +the concept of implicit mapping and positioning systems, our approach extends +its applicability to real-world scenarios. Our system employs a voxel-based +neural implicit surface representation, enabling efficient encoding and +optimization of the scene within each voxel. To handle diverse environments +without prior knowledge, we incorporate an octree-based structure for scene +division and dynamic expansion. To achieve real-time performance, we propose a +high-performance multi-process framework. This ensures the system's suitability +for applications with stringent time constraints. Additionally, we adopt the +idea of multi-maps to handle large-scale scenes, and leverage loop detection +and hierarchical pose optimization strategies to reduce long-term pose drift +and remove duplicate geometry. Through comprehensive evaluations, we +demonstrate that our method outperforms previous methods in terms of +reconstruction quality and accuracy across various scenarios. We also show that +our Vox-Fusion++ can be used in augmented reality and collaborative mapping +applications. Our source code will be publicly available at +\url{https://github.com/zju3dv/Vox-Fusion_Plus_Plus} + +
+
+ comment: 14 pages. arXiv admin note: text overlap with arXiv:2210.15858 +
+
+
+
+
+ + ☆ High-Fidelity SLAM Using Gaussian Splatting with Rendering-Guided + Densification and Regularized Optimization IROS24 + + +
+ We propose a dense RGBD SLAM system based on 3D Gaussian Splatting that +provides metrically accurate pose tracking and visually realistic +reconstruction. To this end, we first propose a Gaussian densification strategy +based on the rendering loss to map unobserved areas and refine reobserved +areas. Second, we introduce extra regularization parameters to alleviate the +forgetting problem in the continuous mapping problem, where parameters tend to +overfit the latest frame and result in decreasing rendering quality for +previous frames. Both mapping and tracking are performed with Gaussian +parameters by minimizing re-rendering loss in a differentiable way. Compared to +recent neural and concurrently developed gaussian splatting RGBD SLAM +baselines, our method achieves state-of-the-art results on the synthetic +dataset Replica and competitive results on the real-world dataset TUM. + +
+
+ comment: submitted to IROS24 +
+
+
+
+
+ + ☆ ExACT: Language-guided Conceptual Reasoning and Uncertainty Estimation + for Event-based Action Recognition and More CVPR2024 + + +
+ Event cameras have recently been shown beneficial for practical vision tasks, +such as action recognition, thanks to their high temporal resolution, power +efficiency, and reduced privacy concerns. However, current research is hindered +by 1) the difficulty in processing events because of their prolonged duration +and dynamic actions with complex and ambiguous semantics and 2) the redundant +action depiction of the event frame representation with fixed stacks. We find +language naturally conveys abundant semantic information, rendering it +stunningly superior in reducing semantic uncertainty. In light of this, we +propose ExACT, a novel approach that, for the first time, tackles event-based +action recognition from a cross-modal conceptualizing perspective. Our ExACT +brings two technical contributions. Firstly, we propose an adaptive +fine-grained event (AFE) representation to adaptively filter out the repeated +events for the stationary objects while preserving dynamic ones. This subtly +enhances the performance of ExACT without extra computational cost. Then, we +propose a conceptual reasoning-based uncertainty estimation module, which +simulates the recognition process to enrich the semantic representation. In +particular, conceptual reasoning builds the temporal relation based on the +action semantics, and uncertainty estimation tackles the semantic uncertainty +of actions based on the distributional representation. Experiments show that +our ExACT achieves superior recognition accuracy of 94.83%(+2.23%), +90.10%(+37.47%) and 67.24% on PAF, HARDVS and our SeAct datasets respectively. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ UniBind: LLM-Augmented Unified and Balanced Representation Space to Bind + Them All CVPR2024 + + +
+ We present UniBind, a flexible and efficient approach that learns a unified
+representation space for seven diverse modalities -- images, text, audio, point
+cloud, thermal, video, and event data. Existing works, e.g., ImageBind, treat
+the image as the central modality and build an image-centered representation
+space; however, this space may be sub-optimal as it leads to an unbalanced
+representation space among all modalities. Moreover, the category names are
+directly used to extract text embeddings for the downstream tasks, making it
+difficult to represent the semantics of multi-modal data. The 'out-of-the-box'
+insight of our UniBind is to make the alignment center modality-agnostic and
+further learn a unified and balanced representation space, empowered by large
+language models (LLMs). UniBind is superior in its flexible application to all
+CLIP-style models and delivers remarkable performance boosts. To make this
+possible, we 1) construct a knowledge base of text embeddings with the help of
+LLMs and multi-modal LLMs; 2) adaptively build LLM-augmented class-wise
+embedding centers on top of the knowledge base and encoded visual embeddings;
+and 3) align all the embeddings to the LLM-augmented embedding centers via
+contrastive learning to achieve a unified and balanced representation space.
+UniBind shows strong zero-shot recognition performance gains over prior arts by
+an average of 6.36%. Finally, we achieve new state-of-the-art performance,
+e.g., a 6.75% gain on ImageNet, in the multi-modal fine-tuning setting while
+reducing 90% of the learnable parameters.
+
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ PCT: Perspective Cue Training Framework for Multi-Camera BEV + Segmentation + + +
+ Generating annotations for bird's-eye-view (BEV) segmentation presents +significant challenges due to the scenes' complexity and the high manual +annotation cost. In this work, we address these challenges by leveraging the +abundance of unlabeled data available. We propose the Perspective Cue Training +(PCT) framework, a novel training framework that utilizes pseudo-labels +generated from unlabeled perspective images using publicly available semantic +segmentation models trained on large street-view datasets. PCT applies a +perspective view task head to the image encoder shared with the BEV +segmentation head, effectively utilizing the unlabeled data to be trained with +the generated pseudo-labels. Since image encoders are present in nearly all +camera-based BEV segmentation architectures, PCT is flexible and applicable to +various existing BEV architectures. PCT can be applied to various settings +where unlabeled data is available. In this paper, we applied PCT for +semi-supervised learning (SSL) and unsupervised domain adaptation (UDA). +Additionally, we introduce strong input perturbation through Camera Dropout +(CamDrop) and feature perturbation via BEV Feature Dropout (BFD), which are +crucial for enhancing SSL capabilities using our teacher-student framework. Our +comprehensive approach is simple and flexible but yields significant +improvements over various baselines for SSL and UDA, achieving competitive +performances even against the current state-of-the-art. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ Dynamic Spatial-Temporal Aggregation for Skeleton-Aware Sign Language + Recognition + + +
+ Skeleton-aware sign language recognition (SLR) has gained popularity due to
+its ability to remain unaffected by background information and its lower
+computational requirements. Current methods utilize spatial graph modules and
+temporal modules to capture spatial and temporal features, respectively.
+However, their spatial graph modules are typically built on fixed graph
+structures such as graph convolutional networks or a single learnable graph,
+which only partially explore joint relationships. Additionally, a simple
+temporal convolution kernel is used to capture temporal information, which may
+not fully capture the complex movement patterns of different signers. To
+overcome these limitations, we propose a new spatial architecture consisting of
+two concurrent branches, which build input-sensitive joint relationships and
+incorporate specific domain knowledge for recognition, respectively. These two
+branches are followed by an aggregation process to distinguish important joint
+connections. We then propose a new temporal module to model multi-scale
+temporal information to capture complex human dynamics. Our method achieves
+state-of-the-art accuracy compared to previous skeleton-aware methods on four
+large-scale SLR benchmarks. Moreover, our method demonstrates superior accuracy
+compared to RGB-based methods in most cases while requiring much fewer
+computational resources, yielding a better accuracy-computation trade-off. Code
+is available at https://github.com/hulianyuyy/DSTA-SLR.
+
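+
+ One way to picture an input-sensitive joint-relationship branch combined with
+a fixed, domain-knowledge graph is the hypothetical layer below; the
+attention-style affinity, the identity initialization, and the simple additive
+aggregation are all assumptions, not the proposed architecture.
+
+```python
+import torch
+import torch.nn as nn
+
+class InputSensitiveGraphLayer(nn.Module):
+    """Aggregate an input-conditioned joint graph with a learnable static graph."""
+    def __init__(self, in_dim, out_dim, num_joints):
+        super().__init__()
+        self.q = nn.Linear(in_dim, out_dim)
+        self.k = nn.Linear(in_dim, out_dim)
+        self.v = nn.Linear(in_dim, out_dim)
+        # static branch: a learnable adjacency seeded with the identity
+        self.static_adj = nn.Parameter(torch.eye(num_joints))
+
+    def forward(self, x):                        # x: (B, J, in_dim)
+        affinity = torch.softmax(
+            self.q(x) @ self.k(x).transpose(1, 2) / x.size(-1) ** 0.5, dim=-1)
+        adj = affinity + self.static_adj          # combine both branches
+        return adj @ self.v(x)                    # (B, J, out_dim)
+```
+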
+
+
+
+
+ + ☆ Generalized Consistency Trajectory Models for Image Manipulation + + +
+ Diffusion-based generative models excel in unconditional generation, as well +as on applied tasks such as image editing and restoration. The success of +diffusion models lies in the iterative nature of diffusion: diffusion breaks +down the complex process of mapping noise to data into a sequence of simple +denoising tasks. Moreover, we are able to exert fine-grained control over the +generation process by injecting guidance terms into each denoising step. +However, the iterative process is also computationally intensive, often taking +from tens up to thousands of function evaluations. Although consistency +trajectory models (CTMs) enable traversal between any time points along the +probability flow ODE (PFODE) and score inference with a single function +evaluation, CTMs only allow translation from Gaussian noise to data. Thus, this +work aims to unlock the full potential of CTMs by proposing generalized CTMs +(GCTMs), which translate between arbitrary distributions via ODEs. We discuss +the design space of GCTMs and demonstrate their efficacy in various image +manipulation tasks such as image-to-image translation, restoration, and +editing. Code: \url{https://github.com/1202kbs/GCTM} + +
+
+
+
+
+ + ☆ Semantics, Distortion, and Style Matter: Towards Source-free UDA for + Panoramic Segmentation CVPR 2024 + + +
+ This paper addresses an interesting yet challenging problem -- source-free
+unsupervised domain adaptation (SFUDA) for pinhole-to-panoramic semantic
+segmentation -- given only a pinhole image-trained model (i.e., source) and
+unlabeled panoramic images (i.e., target). Tackling this problem is nontrivial
+due to the semantic mismatches, style discrepancies, and inevitable distortion
+of panoramic images. To this end, we propose a novel method that utilizes
+Tangent Projection (TP), as it has less distortion, and meanwhile splits the
+equirectangular projection (ERP) with a fixed FoV to mimic pinhole images. Both
+projections are shown to be effective in extracting knowledge from the source
+model. However, the distinct projection discrepancies between source and target
+domains impede direct knowledge transfer; thus, we propose a panoramic
+prototype adaptation module (PPAM) to integrate panoramic prototypes from the
+extracted knowledge for adaptation. We then impose loss constraints on both
+predictions and prototypes and propose a cross-dual attention module (CDAM) at
+the feature level to better align the spatial and channel characteristics
+across the domains and projections. Both knowledge extraction and transfer
+processes are synchronously updated to reach the best performance. Extensive
+experiments on synthetic and real-world benchmarks, including outdoor and
+indoor scenarios, demonstrate that our method achieves significantly better
+performance than prior SFUDA methods for pinhole-to-panoramic adaptation.
+
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Task-Customized Mixture of Adapters for General Image Fusion CVPR 2024 + + +
+ General image fusion aims at integrating important information from
+multi-source images. However, due to the significant cross-task gap, the
+respective fusion mechanism varies considerably in practice, resulting in
+limited performance across subtasks. To handle this problem, we propose a novel
+task-customized mixture of adapters (TC-MoA) for general image fusion,
+adaptively prompting various fusion tasks in a unified model. We borrow the
+insight from the mixture of experts (MoE), taking the experts as efficient
+tuning adapters to prompt a pre-trained foundation model. These adapters are
+shared across different tasks and constrained by mutual information
+regularization, ensuring compatibility across different tasks while maintaining
+complementarity for multi-source images. The task-specific routing networks
+customize these adapters to extract task-specific information from different
+sources with dynamic dominant intensity, performing adaptive visual feature
+prompt fusion. Notably, our TC-MoA controls the dominant intensity bias for
+different fusion tasks, successfully unifying multiple fusion tasks in a single
+model. Extensive experiments show that TC-MoA outperforms the competing
+approaches in learning commonalities while retaining compatibility for general
+image fusion (multi-modal, multi-exposure, and multi-focus), and also
+demonstrates striking controllability in further generalization experiments.
+The code is available at https://github.com/YangSun22/TC-MoA .
+
+
+ comment: 19 pages, 17 figures, CVPR 2024 +
+
+
+
+
+ + ☆ A Trainable Feature Extractor Module for Deep Neural Networks and + Scanpath Classification + + +
+ Scanpath classification is an area of eye tracking research with possible
+applications in medicine and manufacturing, as well as in training systems for
+students in various domains. In this paper we propose a trainable feature
+extraction module for deep neural networks. The purpose of this module is to
+transform a scanpath into a feature vector that is directly usable by the deep
+neural network architecture. Based on the backpropagated error of the deep
+neural network, the feature extraction module adapts its parameters to improve
+the classification performance. Therefore, our feature extraction module is
+jointly trainable with the deep neural network. The motivation for this feature
+extraction module comes from classical histogram-based approaches, which
+usually compute distributions over a scanpath. We evaluated our module on three
+public datasets and compared it to state-of-the-art approaches.
+
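+
+ Since the module is motivated by histogram-based features, a common trainable
+analogue is a soft histogram with learnable bin centers and widths; the sketch
+below is an illustrative assumption of that idea, not the authors' module.
+
+```python
+import torch
+import torch.nn as nn
+
+class SoftHistogram(nn.Module):
+    """Differentiable histogram over per-fixation scalar features of a scanpath."""
+    def __init__(self, n_bins=32, value_range=(0.0, 1.0)):
+        super().__init__()
+        lo, hi = value_range
+        self.centers = nn.Parameter(torch.linspace(lo, hi, n_bins))  # learnable bins
+        self.width = nn.Parameter(torch.tensor((hi - lo) / n_bins))  # learnable width
+
+    def forward(self, scanpath):                  # scanpath: (B, T) scalar values
+        diff = scanpath.unsqueeze(-1) - self.centers         # (B, T, n_bins)
+        weights = torch.exp(-(diff / self.width) ** 2)       # soft bin assignment
+        return weights.mean(dim=1)                # (B, n_bins) feature vector
+```
+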
+
+
+
+
+ + ☆ DetToolChain: A New Prompting Paradigm to Unleash Detection Ability of + MLLM + + +
+ We present DetToolChain, a novel prompting paradigm, to unleash the zero-shot +object detection ability of multimodal large language models (MLLMs), such as +GPT-4V and Gemini. Our approach consists of a detection prompting toolkit +inspired by high-precision detection priors and a new Chain-of-Thought to +implement these prompts. Specifically, the prompts in the toolkit are designed +to guide the MLLM to focus on regional information (e.g., zooming in), read +coordinates according to measure standards (e.g., overlaying rulers and +compasses), and infer from the contextual information (e.g., overlaying scene +graphs). Building upon these tools, the new detection chain-of-thought can +automatically decompose the task into simple subtasks, diagnose the +predictions, and plan for progressive box refinements. The effectiveness of our +framework is demonstrated across a spectrum of detection tasks, especially hard +cases. Compared to existing state-of-the-art methods, GPT-4V with our +DetToolChain improves state-of-the-art object detectors by +21.5% AP50 on MS +COCO Novel class set for open-vocabulary detection, +24.23% Acc on RefCOCO val +set for zero-shot referring expression comprehension, +14.5% AP on D-cube +describe object detection FULL setting. + +
+
+
+
+
+ + ☆ A Hybrid Transformer-Sequencer approach for Age and Gender + classification from in-wild facial images + + +
+ The advancements in computer vision and image processing techniques have led
+to the emergence of new applications in domains such as visual surveillance,
+targeted advertisement, content-based searching, and human-computer
+interaction. Among the various techniques in computer vision, face analysis in
+particular has gained much attention. Several previous studies have explored
+different applications of facial feature processing for a variety of tasks,
+including age and gender classification. However, despite these efforts, the
+age and gender classification of in-the-wild human faces is still far from
+achieving the levels of accuracy required for real-world applications. This
+paper, therefore, attempts to bridge this gap by proposing a hybrid model that
+combines self-attention and BiLSTM approaches for the age and gender
+classification problem. The proposed model's performance is compared with
+several state-of-the-art models proposed so far. Improvements of approximately
+10 percent and 6 percent over the state-of-the-art implementations are noted
+for age and gender classification, respectively. The proposed model thus
+achieves superior performance and provides more generalized learning. The model
+can, therefore, be applied as a core classification component in various image
+processing and computer vision problems.
+
+
+ comment: 22 pages +
+
+
+
+
+ + ☆ TT-BLIP: Enhancing Fake News Detection Using BLIP and Tri-Transformer + + +
+ Detecting fake news has received a lot of attention. Many previous methods +concatenate independently encoded unimodal data, ignoring the benefits of +integrated multimodal information. Also, the absence of specialized feature +extraction for text and images further limits these methods. This paper +introduces an end-to-end model called TT-BLIP that applies the bootstrapping +language-image pretraining for unified vision-language understanding and +generation (BLIP) for three types of information: BERT and +BLIP\textsubscript{Txt} for text, ResNet and BLIP\textsubscript{Img} for +images, and bidirectional BLIP encoders for multimodal information. The +Multimodal Tri-Transformer fuses tri-modal features using three types of +multi-head attention mechanisms, ensuring integrated modalities for enhanced +representations and improved multimodal data analysis. The experiments are +performed using two fake news datasets, Weibo and Gossipcop. The results +indicate TT-BLIP outperforms the state-of-the-art models. + +
+
+ comment: 8 pages, submitted to conference +
+
+
+
+
+ + ☆ PostoMETRO: Pose Token Enhanced Mesh Transformer for Robust 3D Human + Mesh Recovery + + +
+ With the recent advancements in single-image-based human mesh recovery, there +is a growing interest in enhancing its performance in certain extreme +scenarios, such as occlusion, while maintaining overall model accuracy. +Although obtaining accurately annotated 3D human poses under occlusion is +challenging, there is still a wealth of rich and precise 2D pose annotations +that can be leveraged. However, existing works mostly focus on directly +leveraging 2D pose coordinates to estimate 3D pose and mesh. In this paper, we +present PostoMETRO($\textbf{Pos}$e $\textbf{to}$ken enhanced $\textbf{ME}$sh +$\textbf{TR}$ansf$\textbf{O}$rmer), which integrates occlusion-resilient 2D +pose representation into transformers in a token-wise manner. Utilizing a +specialized pose tokenizer, we efficiently condense 2D pose data to a compact +sequence of pose tokens and feed them to the transformer together with the +image tokens. This process not only ensures a rich depiction of texture from +the image but also fosters a robust integration of pose and image information. +Subsequently, these combined tokens are queried by vertex and joint tokens to +decode 3D coordinates of mesh vertices and human joints. Facilitated by the +robust pose token representation and the effective combination, we are able to +produce more precise 3D coordinates, even under extreme scenarios like +occlusion. Experiments on both standard and occlusion-specific benchmarks +demonstrate the effectiveness of PostoMETRO. Qualitative results further +illustrate the clarity of how 2D pose can help 3D reconstruction. Code will be +made available. + +
+
+
+
+
+ + ☆ SC-Diff: 3D Shape Completion with Latent Diffusion Models + + +
+ This paper introduces a 3D shape completion approach using a 3D latent +diffusion model optimized for completing shapes, represented as Truncated +Signed Distance Functions (TSDFs), from partial 3D scans. Our method combines +image-based conditioning through cross-attention and spatial conditioning +through the integration of 3D features from captured partial scans. This dual +guidance enables high-fidelity, realistic shape completions at superior +resolutions. At the core of our approach is the compression of 3D data into a +low-dimensional latent space using an auto-encoder inspired by 2D latent +diffusion models. This compression facilitates the processing of +higher-resolution shapes and allows us to apply our model across multiple +object classes, a significant improvement over other existing diffusion-based +shape completion methods, which often require a separate diffusion model for +each class. We validated our approach against two common benchmarks in the +field of shape completion, demonstrating competitive performance in terms of +accuracy and realism and performing on par with state-of-the-art methods +despite operating at a higher resolution with a single model for all object +classes. We present a comprehensive evaluation of our model, showcasing its +efficacy in handling diverse shape completion challenges, even on unseen object +classes. The code will be released upon acceptance. + +
+
+ comment: 22 pages +
+
+
+
+
+ + ☆ Few-shot Object Localization + + +
+ Existing few-shot object counting tasks primarily focus on quantifying the +number of objects in an image, neglecting precise positional information. To +bridge this research gap, this paper introduces the novel task of Few-Shot +Object Localization (FSOL), which aims to provide accurate object positional +information. This task achieves generalized object localization by leveraging a +small number of labeled support samples to query the positional information of +objects within corresponding images. To advance this research field, we propose +an innovative high-performance baseline model. Our model integrates a dual-path +feature augmentation module to enhance shape association and gradient +differences between supports and query images, alongside a self-query module +designed to explore the association between feature maps and query images. +Experimental results demonstrate a significant performance improvement of our +approach in the FSOL task, establishing an efficient benchmark for further +research. + +
+
+
+
+
+ + ☆ Non-negative Contrastive Learning ICLR 2024 + + +
+ Deep representations have shown promising performance when transferred to +downstream tasks in a black-box manner. Yet, their inherent lack of +interpretability remains a significant challenge, as these features are often +opaque to human understanding. In this paper, we propose Non-negative +Contrastive Learning (NCL), a renaissance of Non-negative Matrix Factorization +(NMF) aimed at deriving interpretable features. The power of NCL lies in its +enforcement of non-negativity constraints on features, reminiscent of NMF's +capability to extract features that align closely with sample clusters. NCL not +only aligns mathematically well with an NMF objective but also preserves NMF's +interpretability attributes, resulting in a more sparse and disentangled +representation compared to standard contrastive learning (CL). Theoretically, +we establish guarantees on the identifiability and downstream generalization of +NCL. Empirically, we show that these advantages enable NCL to outperform CL +significantly on feature disentanglement, feature selection, as well as +downstream classification tasks. At last, we show that NCL can be easily +extended to other learning scenarios and benefit supervised learning as well. +Code is available at https://github.com/PKU-ML/non_neg. + +
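+
+ A minimal sketch of a non-negative contrastive objective as described above,
+assuming the non-negativity is imposed with a ReLU on the projected features;
+the paper's exact parameterization and temperature may differ.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def nncl_loss(encoder, x1, x2, temperature=0.5):
+    """InfoNCE over non-negative, L2-normalized features of two augmented views."""
+    z1 = F.relu(encoder(x1))                    # enforce non-negative features
+    z2 = F.relu(encoder(x2))
+    z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
+    logits = z1 @ z2.t() / temperature          # (B, B) similarity matrix
+    labels = torch.arange(z1.size(0), device=z1.device)
+    return F.cross_entropy(logits, labels)      # positives sit on the diagonal
+```
+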
+
+ comment: 22 pages. Accepted by ICLR 2024 +
+
+
+
+
+ + ☆ Privacy-Preserving Face Recognition Using Trainable Feature Subtraction CVPR 2024 + + +
+ The widespread adoption of face recognition has led to increasing privacy +concerns, as unauthorized access to face images can expose sensitive personal +information. This paper explores face image protection against viewing and +recovery attacks. Inspired by image compression, we propose creating a visually +uninformative face image through feature subtraction between an original face +and its model-produced regeneration. Recognizable identity features within the +image are encouraged by co-training a recognition model on its high-dimensional +feature representation. To enhance privacy, the high-dimensional representation +is crafted through random channel shuffling, resulting in randomized +recognizable images devoid of attacker-leverageable texture details. We distill +our methodologies into a novel privacy-preserving face recognition method, +MinusFace. Experiments demonstrate its high recognition accuracy and effective +privacy protection. Its code is available at https://github.com/Tencent/TFace. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ CLIP-VIS: Adapting CLIP for Open-Vocabulary Video Instance Segmentation + + +
+ Open-vocabulary video instance segmentation strives to segment and track
+instances belonging to an open set of categories in a video. The
+vision-language model Contrastive Language-Image Pre-training (CLIP) has shown
+strong zero-shot classification ability in image-level open-vocabulary tasks.
+In this paper, we propose a simple encoder-decoder network, called CLIP-VIS, to
+adapt CLIP for open-vocabulary video instance segmentation. Our CLIP-VIS adopts
+a frozen CLIP image encoder and introduces three modules: class-agnostic mask
+generation, temporal topK-enhanced matching, and weighted open-vocabulary
+classification. Given a set of initial queries, class-agnostic mask generation
+employs a transformer decoder to predict query masks and corresponding object
+scores and mask IoU scores. Then, temporal topK-enhanced matching performs
+query matching across frames by using the K best-matched frames. Finally,
+weighted open-vocabulary classification first generates query visual features
+with mask pooling and then performs weighted classification using the object
+scores and mask IoU scores. Our CLIP-VIS does not require annotations of
+instance categories and identities. Experiments are performed on various video
+instance segmentation datasets and demonstrate the effectiveness of our
+proposed method, especially on novel categories. When using ConvNeXt-B as the
+backbone, our CLIP-VIS achieves AP and APn scores of 32.1% and 40.3% on the
+validation set of the LV-VIS dataset, outperforming OV2Seg by 11.0% and 24.0%,
+respectively. We will release the source code and models at
+https://github.com/zwq456/CLIP-VIS.git.
+
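+
+ The weighted open-vocabulary classification step can be sketched roughly as
+below; the tensor shapes, temperature, and multiplicative score weighting are
+assumptions made for illustration, not the released implementation.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def weighted_open_vocab_classification(pixel_feats, query_masks, text_embeds,
+                                       obj_scores, mask_iou_scores, tau=0.01):
+    """Mask-pool per-query visual features, match them to text embeddings, and
+    weight the class probabilities by objectness and predicted mask quality.
+    Assumed shapes: pixel_feats (D,H,W), query_masks (Q,H,W) logits,
+    text_embeds (C,D), obj_scores / mask_iou_scores (Q,)."""
+    masks = query_masks.sigmoid()                              # (Q, H, W)
+    pooled = torch.einsum('dhw,qhw->qd', pixel_feats, masks)   # mask pooling
+    pooled = pooled / masks.sum(dim=(1, 2)).clamp(min=1e-6)[:, None]
+    pooled = F.normalize(pooled, dim=-1)
+    text = F.normalize(text_embeds, dim=-1)
+    logits = pooled @ text.t() / tau                           # (Q, C)
+    weights = (obj_scores * mask_iou_scores).unsqueeze(-1)     # (Q, 1)
+    return logits.softmax(dim=-1) * weights                    # weighted class probs
+```
+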
+
+
+
+
+ + ☆ Intention Action Anticipation Model with Guide-Feedback Loop Mechanism + + +
+ Anticipating human intention from videos has broad applications, such as +automatic driving, robot assistive technology, and virtual reality. This study +addresses the problem of intention action anticipation using egocentric video +sequences to estimate actions that indicate human intention. We propose a +Hierarchical Complete-Recent (HCR) information fusion model that makes full use +of the features of the entire video sequence (i.e., complete features) and the +features of the video tail sequence (i.e., recent features). The HCR model has +two primary mechanisms. The Guide-Feedback Loop (GFL) mechanism is proposed to +model the relation between one recent feature and one complete feature. Based +on GFL, the MultiComplete-Recent Feature Aggregation (MCRFA) module is proposed +to model the relation of one recent feature with multiscale complete features. +Based on GFL and MCRFA, the HCR model can hierarchically explore the rich +interrelationships between multiscale complete features and multiscale recent +features. Through comparative and ablation experiments, we validate the +effectiveness of our model on two well-known public datasets: EPIC-Kitchens and +EGTEA Gaze+. + +
+
+
+
+
+ + ☆ Do Generated Data Always Help Contrastive Learning? ICLR 2024 + + +
+ Contrastive Learning (CL) has emerged as one of the most successful paradigms +for unsupervised visual representation learning, yet it often depends on +intensive manual data augmentations. With the rise of generative models, +especially diffusion models, the ability to generate realistic images close to +the real data distribution has been well recognized. These generated +high-quality images have been successfully applied to enhance contrastive +representation learning, a technique termed ``data inflation''. However, we +find that the generated data (even from a good diffusion model like DDPM) may +sometimes even harm contrastive learning. We investigate the causes behind this +failure from the perspective of both data inflation and data augmentation. For +the first time, we reveal a complementary relationship: stronger data inflation +should be accompanied by weaker augmentations, and vice versa. We also provide +rigorous theoretical explanations for these phenomena by deriving +generalization bounds under data inflation. Drawing from these insights, we +propose Adaptive Inflation (AdaInf), a purely data-centric strategy without +introducing any extra computation cost. On benchmark datasets, AdaInf can bring +significant improvements for various contrastive learning methods. Notably, +without using external data, AdaInf obtains 94.70% linear accuracy on CIFAR-10 +with SimCLR, setting a new record that surpasses many sophisticated methods. +Code is available at https://github.com/PKU-ML/adainf. + +
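+ A rough, data-centric sketch of the relationship stated above, under the assumption that one reasonable rule is to weaken augmentations as the ratio of generated to real data grows; the exact AdaInf schedule is not reproduced here.
+ from torch.utils.data import ConcatDataset
+ from torchvision import transforms
+
+ def build_training_set(real_ds, generated_ds):
+     inflation_ratio = len(generated_ds) / max(len(real_ds), 1)
+     strength = 1.0 / (1.0 + inflation_ratio)     # weaker augmentation for stronger inflation
+     augment = transforms.Compose([
+         transforms.RandomResizedCrop(32, scale=(1.0 - 0.9 * strength, 1.0)),
+         transforms.ColorJitter(0.8 * strength, 0.8 * strength, 0.8 * strength, 0.2 * strength),
+         transforms.RandomHorizontalFlip(),
+         transforms.ToTensor(),
+     ])
+     # The augmentation pipeline would be attached to both datasets before training.
+     return ConcatDataset([real_ds, generated_ds]), augment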
+
+ comment: 19 pages. Accepted by ICLR 2024 +
+
+
+
+
+ + ☆ Boosting Transferability in Vision-Language Attacks via Diversification + along the Intersection Region of Adversarial Trajectory + + +
+ Vision-language pre-training (VLP) models exhibit remarkable capabilities in +comprehending both images and text, yet they remain susceptible to multimodal +adversarial examples (AEs). Strengthening adversarial attacks and uncovering +vulnerabilities, especially common issues in VLP models (e.g., high +transferable AEs), can stimulate further research on constructing reliable and +practical VLP models. A recent work (i.e., Set-level guidance attack) indicates +that augmenting image-text pairs to increase AE diversity along the +optimization path enhances the transferability of adversarial examples +significantly. However, this approach predominantly emphasizes diversity around +the online adversarial examples (i.e., AEs in the optimization period), leading +to the risk of overfitting the victim model and affecting the transferability. +In this study, we posit that the diversity of adversarial examples towards the +clean input and online AEs are both pivotal for enhancing transferability +across VLP models. Consequently, we propose using diversification along the +intersection region of adversarial trajectory to expand the diversity of AEs. +To fully leverage the interaction between modalities, we introduce text-guided +adversarial example selection during optimization. Furthermore, to further +mitigate the potential overfitting, we direct the adversarial text deviating +from the last intersection region along the optimization path, rather than +adversarial images as in existing methods. Extensive experiments affirm the +effectiveness of our method in improving transferability across various VLP +models and downstream vision-and-language tasks (e.g., Image-Text +Retrieval(ITR), Visual Grounding(VG), Image Captioning(IC)). + +
+
+
+
+
+ + ☆ Self-learning Canonical Space for Multi-view 3D Human Pose Estimation + + +
+ Multi-view 3D human pose estimation is naturally superior to single-view +estimation, benefiting from more comprehensive information provided by images of multiple +views. The information includes camera poses, 2D/3D human poses, and 3D +geometry. However, accurate annotations of this information are hard to +obtain, making it challenging to predict accurate 3D human pose from multi-view +images. To deal with this issue, we propose a fully self-supervised framework, +named cascaded multi-view aggregating network (CMANet), to construct a +canonical parameter space to holistically integrate and exploit multi-view +information. In our framework, the multi-view information is grouped into two +categories: 1) intra-view information, 2) inter-view information. Accordingly, +CMANet consists of two components: intra-view module (IRV) and inter-view +module (IEV). IRV extracts the initial camera pose and 3D human pose +of each view; IEV fuses complementary pose information and cross-view 3D +geometry for a final 3D human pose. To facilitate the aggregation of the intra- +and inter-view information, we define a canonical parameter space, depicted by per-view +camera pose and human pose and shape parameters ($\theta$ and $\beta$) of the SMPL +model, and propose a two-stage learning procedure. At the first stage, IRV learns +to estimate camera pose and view-dependent 3D human pose supervised by +confident output of an off-the-shelf 2D keypoint detector. At the second stage, IRV +is frozen and IEV further refines the camera pose and optimizes the 3D human +pose by implicitly encoding the cross-view complement and 3D geometry +constraint, achieved by jointly fitting predicted multi-view 2D keypoints. The +proposed framework, modules, and learning strategy are demonstrated to be +effective by comprehensive experiments and CMANet is superior to +state-of-the-art methods in extensive quantitative and qualitative analysis. + +
+
+
+
+
+ + ☆ Precise-Physics Driven Text-to-3D Generation + + +
+ Text-to-3D generation has shown great promise in generating novel 3D content +based on given text prompts. However, existing generative methods mostly focus +on geometric or visual plausibility while ignoring precise physics perception +for the generated 3D shapes. This greatly hinders the practicality of generated +3D shapes in real-world applications. In this work, we propose Phy3DGen, a +precise-physics-driven text-to-3D generation method. By analyzing the solid +mechanics of generated 3D shapes, we reveal that the 3D shapes generated by +existing text-to-3D generation methods are impractical for real-world +applications as the generated 3D shapes do not conform to the laws of physics. +To this end, we leverage 3D diffusion models to provide 3D shape priors and +design a data-driven differentiable physics layer to optimize 3D shape priors +with solid mechanics. This allows us to optimize geometry efficiently and learn +precise physics information about 3D shapes at the same time. Experimental +results demonstrate that our method can consider both geometric plausibility +and precise physics perception, further bridging 3D virtual modeling and +precise physical worlds. + +
+
+
+
+
+ + ☆ Human Mesh Recovery from Arbitrary Multi-view Images + + +
+ Human mesh recovery from arbitrary multi-view images involves two +characteristics: the arbitrary camera poses and arbitrary number of camera +views. Because of the variability, designing a unified framework to tackle this +task is challenging. The challenges can be summarized as the dilemma of being +able to simultaneously estimate arbitrary camera poses and recover human mesh +from arbitrary multi-view images while maintaining flexibility. To solve this +dilemma, we propose a divide and conquer framework for Unified Human Mesh +Recovery (U-HMR) from arbitrary multi-view images. In particular, U-HMR +consists of a decoupled structure and two main components: camera and body +decoupling (CBD), camera pose estimation (CPE), and arbitrary view fusion +(AVF). As camera poses and human body mesh are independent of each other, CBD +splits the estimation of them into two sub-tasks for two individual +sub-networks (i.e., CPE and AVF) to handle respectively, thus the two sub-tasks +are disentangled. In CPE, since each camera pose is unrelated to the others, we +adopt a shared MLP to process all views in a parallel way. In AVF, in order to +fuse multi-view information and make the fusion operation independent of the +number of views, we introduce a transformer decoder with a SMPL parameters +query token to extract cross-view features for mesh recovery. To demonstrate +the efficacy and flexibility of the proposed framework and effect of each +component, we conduct extensive experiments on three public datasets: +Human3.6M, MPI-INF-3DHP, and TotalCapture. + +
+
+
+
+
+ + ☆ Prototype of an Active Video Game Based on a 3D Camera to Motivate + Physical Activity in Children and Older Adults + + +
+ This document describes the development of a video game prototype designed to +encourage physical activity among children and older adults. The prototype +consists of a laptop, a camera with 3D sensors, and optionally requires an LCD +screen or a projector. The programming component of this prototype was +developed in Scratch, a programming language geared towards children, which +greatly facilitates the creation of a game tailored to the users' preferences. +The idea to create such a prototype originated from the desire to offer an +option that promotes physical activity among children and adults, given that a +lack of physical exercise is a predominant factor in the development of chronic +degenerative diseases such as diabetes and hypertension, to name the most +common. As a result of this initiative, an active video game prototype was +successfully developed, based on a ping-pong game, which allows both children +and adults to interact in a fun way while encouraging the performance of +physical activities that can positively impact the users' health. + +
+
+ comment: 13 pages, in Spanish language, 11 figures +
+
+
+
+
+ + ☆ Geometric Constraints in Deep Learning Frameworks: A Survey + + +
+ Stereophotogrammetry is an emerging technique of scene understanding. Its +origins go back to at least the 1800s when people first started to investigate +using photographs to measure the physical properties of the world. Since then, +thousands of approaches have been explored. The classic geometric techniques of +Shape from Stereo are built on using geometry to define constraints on scene and +camera geometry and then solving the non-linear systems of equations. More +recent work has taken an entirely different approach, using end-to-end deep +learning without any attempt to explicitly model the geometry. In this survey, +we explore the overlap between geometry-based and deep learning-based frameworks. +We compare and contrast geometry enforcing constraints integrated into a deep +learning framework for depth estimation or other closely related problems. We +present a new taxonomy for prevalent geometry enforcing constraints used in +modern deep learning frameworks. We also present insightful observations and +potential future research directions. + +
+
+ comment: A preprint +
+
+
+
+
+ + ☆ TransformMix: Learning Transformation and Mixing Strategies from Data + + +
+ Data augmentation improves the generalization power of deep learning models +by synthesizing more training samples. Sample-mixing is a popular data +augmentation approach that creates additional data by combining existing +samples. Recent sample-mixing methods, like Mixup and Cutmix, adopt simple +mixing operations to blend multiple inputs. Although such a heuristic approach +shows certain performance gains in some computer vision tasks, it mixes the +images blindly and does not adapt to different datasets automatically. A mixing +strategy that is effective for a particular dataset often does not generalize +well to other datasets. If not properly configured, the methods may create +misleading mixed images, which jeopardize the effectiveness of sample-mixing +augmentations. In this work, we propose an automated approach, TransformMix, to +learn better transformation and mixing augmentation strategies from data. In +particular, TransformMix applies learned transformations and mixing masks to +create compelling mixed images that contain correct and important information +for the target tasks. We demonstrate the effectiveness of TransformMix on +multiple datasets in transfer learning, classification, object detection, and +knowledge distillation settings. Experimental results show that our method +achieves better performance as well as efficiency when compared with strong +sample-mixing baselines. + +
+
+ comment: 17 pages, 9 figures +
+
+
+
+
+ + ☆ Multimodal Fusion Method with Spatiotemporal Sequences and Relationship + Learning for Valence-Arousal Estimation + + +
+ This paper presents our approach for the VA (Valence-Arousal) estimation task +in the ABAW6 competition. We devised a comprehensive model by preprocessing +video frames and audio segments to extract visual and audio features. Through +the utilization of Temporal Convolutional Network (TCN) modules, we effectively +captured the temporal and spatial correlations between these features. +Subsequently, we employed a Transformer encoder structure to learn long-range +dependencies, thereby enhancing the model's performance and generalization +ability. Our method leverages a multimodal data fusion approach, integrating +pre-trained audio and video backbones for feature extraction, followed by +TCN-based spatiotemporal encoding and Transformer-based temporal information +capture. Experimental results demonstrate the effectiveness of our approach, +achieving competitive performance in VA estimation on the AffWild2 dataset. + +
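+ A minimal sketch of the pipeline described above (per-frame audio-visual features passed through temporal convolutions, a Transformer encoder, and a valence-arousal head); the layer sizes and shapes are assumptions, not the ABAW6 entry itself.
+ import torch
+ import torch.nn as nn
+
+ class TCNTransformerVA(nn.Module):
+     def __init__(self, feat_dim=512, hidden=256, heads=4, layers=2):
+         super().__init__()
+         self.tcn = nn.Sequential(                # small dilated temporal conv stack
+             nn.Conv1d(feat_dim, hidden, 3, padding=1, dilation=1), nn.ReLU(),
+             nn.Conv1d(hidden, hidden, 3, padding=2, dilation=2), nn.ReLU(),
+         )
+         enc = nn.TransformerEncoderLayer(d_model=hidden, nhead=heads, batch_first=True)
+         self.encoder = nn.TransformerEncoder(enc, num_layers=layers)
+         self.head = nn.Linear(hidden, 2)         # valence and arousal
+
+     def forward(self, x):                        # x: (batch, time, feat_dim)
+         h = self.tcn(x.transpose(1, 2)).transpose(1, 2)
+         return torch.tanh(self.head(self.encoder(h)))   # per-frame VA in [-1, 1]
+
+ va = TCNTransformerVA()(torch.rand(2, 100, 512))        # -> (2, 100, 2)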
+
+ comment: 6 pages, 1 figure +
+
+
+
+
+ + ☆ Eye-gaze Guided Multi-modal Alignment Framework for Radiology + + +
+ In multi-modal frameworks, the alignment of cross-modal features presents a +significant challenge. The predominant approach in multi-modal pre-training +emphasizes either global or local alignment between modalities, utilizing +extensive datasets. This bottom-up driven method often suffers from a lack of +interpretability, a critical concern in radiology. Previous studies have +integrated high-level labels in medical images or text, but these still rely on +manual annotation, a costly and labor-intensive process. Our work introduces a +novel approach by using eye-gaze data, collected synchronously by radiologists +during diagnostic evaluations. This data, indicating radiologists' focus areas, +naturally links chest X-rays to diagnostic texts. We propose the Eye-gaze +Guided Multi-modal Alignment (EGMA) framework to harness eye-gaze data for +better alignment of image and text features, aiming to reduce reliance on +manual annotations and thus cut training costs. Our model demonstrates robust +performance, outperforming other state-of-the-art methods in zero-shot +classification and retrieval tasks. The incorporation of easily-obtained +eye-gaze data during routine radiological diagnoses signifies a step towards +minimizing manual annotation dependency. Additionally, we explore the impact of +varying amounts of eye-gaze data on model performance, highlighting the +feasibility and utility of integrating this auxiliary data into multi-modal +pre-training. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ VisionGPT: LLM-Assisted Real-Time Anomaly Detection for Safe Visual + Navigation + + +
+ This paper explores the potential of Large Language Models (LLMs) in zero-shot +anomaly detection for safe visual navigation. With the assistance of the +state-of-the-art real-time open-world object detection model Yolo-World and +specialized prompts, the proposed framework can identify anomalies within +camera-captured frames that include any possible obstacles, then generate +concise, audio-delivered descriptions emphasizing abnormalities to assist safe +visual navigation in complex circumstances. Moreover, our proposed framework +leverages the advantages of LLMs and the open-vocabulary object detection model +to achieve dynamic scenario switching, which allows users to transition +smoothly from scene to scene and addresses the limitation of traditional +visual navigation. Furthermore, this paper explores the performance +contribution of different prompt components, provides a vision for future +improvement in visual accessibility, and paves the way for LLMs in video +anomaly detection and vision-language understanding. + +
+
+
+
+
+ + ☆ ComboVerse: Compositional 3D Assets Creation Using Spatially-Aware + Diffusion Guidance + + +
+ Generating high-quality 3D assets from a given image is highly desirable in +various applications such as AR/VR. Recent advances in single-image 3D +generation explore feed-forward models that learn to infer the 3D model of an +object without optimization. Though promising results have been achieved in +single object generation, these methods often struggle to model complex 3D +assets that inherently contain multiple objects. In this work, we present +ComboVerse, a 3D generation framework that produces high-quality 3D assets with +complex compositions by learning to combine multiple models. 1) We first +perform an in-depth analysis of this ``multi-object gap'' from both model and +data perspectives. 2) Next, with reconstructed 3D models of different objects, +we seek to adjust their sizes, rotation angles, and locations to create a 3D +asset that matches the given image. 3) To automate this process, we apply +spatially-aware score distillation sampling (SSDS) from pretrained diffusion +models to guide the positioning of objects. Our proposed framework emphasizes +spatial alignment of objects, compared with standard score distillation +sampling, and thus achieves more accurate results. Extensive experiments +validate ComboVerse achieves clear improvements over existing methods in +generating compositional 3D assets. + +
+
+ comment: https://cyw-3d.github.io/ComboVerse/ +
+
+
+
+
+ + ☆ Understanding Training-free Diffusion Guidance: Mechanisms and + Limitations + + +
+ Adding additional control to pretrained diffusion models has become an +increasingly popular research area, with extensive applications in computer +vision, reinforcement learning, and AI for science. Recently, several studies +have proposed training-free diffusion guidance by using off-the-shelf networks +pretrained on clean images. This approach enables zero-shot conditional +generation for universal control formats, which appears to offer a free lunch +in diffusion guidance. In this paper, we aim to develop a deeper understanding +of the operational mechanisms and fundamental limitations of training-free +guidance. We offer a theoretical analysis that supports training-free guidance +from the perspective of optimization, distinguishing it from classifier-based +(or classifier-free) guidance. To elucidate their drawbacks, we theoretically +demonstrate that training-free methods are more susceptible to adversarial +gradients and exhibit slower convergence rates compared to classifier guidance. +We then introduce a collection of techniques designed to overcome the +limitations, accompanied by theoretical rationale and empirical evidence. Our +experiments in image and motion generation confirm the efficacy of these +techniques. + +
+
+
+
+
+ + ☆ VQ-NeRV: A Vector Quantized Neural Representation for Videos + + +
+ Implicit neural representations (INR) excel in encoding videos within neural +networks, showcasing promise in computer vision tasks like video compression +and denoising. INR-based approaches reconstruct video frames from +content-agnostic embeddings, which hampers their efficacy in video frame +regression and restricts their generalization ability for video interpolation. +To address these deficiencies, Hybrid Neural Representation for Videos (HNeRV) +was introduced with content-adaptive embeddings. Nevertheless, HNeRV's +compression ratios remain relatively low, attributable to an oversight in +leveraging the network's shallow features and inter-frame residual information. +In this work, we introduce an advanced U-shaped architecture, Vector +Quantized-NeRV (VQ-NeRV), which integrates a novel component--the VQ-NeRV +Block. This block incorporates a codebook mechanism to discretize the network's +shallow residual features and inter-frame residual information effectively. +This approach proves particularly advantageous in video compression, as it +results in smaller size compared to quantized features. Furthermore, we +introduce an original codebook optimization technique, termed shallow codebook +optimization, designed to refine the utility and efficiency of the codebook. +The experimental evaluations indicate that VQ-NeRV outperforms HNeRV on video +regression tasks, delivering superior reconstruction quality (with an increase +of 1-2 dB in Peak Signal-to-Noise Ratio (PSNR)), better bit per pixel (bpp) +efficiency, and improved video inpainting outcomes. + +
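+ A generic vector-quantization sketch (not the VQ-NeRV block itself) showing the codebook mechanism the abstract refers to: features are snapped to their nearest codebook entries with a straight-through gradient and a commitment-style loss. Sizes are placeholders.
+ import torch
+ import torch.nn as nn
+
+ class VectorQuantizer(nn.Module):
+     def __init__(self, num_codes=512, dim=64):
+         super().__init__()
+         self.codebook = nn.Embedding(num_codes, dim)
+
+     def forward(self, z):                        # z: (batch, dim) residual features
+         d = torch.cdist(z, self.codebook.weight)             # distance to every code
+         idx = d.argmin(dim=1)                                # nearest code index
+         z_q = self.codebook(idx)
+         commit = ((z_q.detach() - z) ** 2).mean() + 0.25 * ((z_q - z.detach()) ** 2).mean()
+         z_q = z + (z_q - z).detach()                         # straight-through estimator
+         return z_q, idx, commit
+
+ z_q, idx, loss = VectorQuantizer()(torch.rand(8, 64))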
+
+ comment: Under Review +
+
+
+
+
+ + ☆ OV9D: Open-Vocabulary Category-Level 9D Object Pose and Size Estimation + + +
+ This paper studies a new open-set problem, the open-vocabulary category-level +object pose and size estimation. Given human text descriptions of arbitrary +novel object categories, the robot agent seeks to predict the position, +orientation, and size of the target object in the observed scene image. To +enable such generalizability, we first introduce OO3D-9D, a large-scale +photorealistic dataset for this task. Derived from OmniObject3D, OO3D-9D is the +largest and most diverse dataset in the field of category-level object pose and +size estimation. It includes additional annotations for the symmetry axis of +each category, which help resolve symmetric ambiguity. Apart from the +large-scale dataset, we find another key to enabling such generalizability is +leveraging the strong prior knowledge in pre-trained visual-language foundation +models. We then propose a framework built on pre-trained DinoV2 and +text-to-image stable diffusion models to infer the normalized object coordinate +space (NOCS) maps of the target instances. This framework fully leverages the +visual semantic prior from DinoV2 and the aligned visual and language knowledge +within the text-to-image diffusion model, which enables generalization to +various text descriptions of novel categories. Comprehensive quantitative and +qualitative experiments demonstrate that the proposed open-vocabulary method, +trained on our large-scale synthesized data, significantly outperforms the +baseline and can effectively generalize to real-world images of unseen +categories. The project page is at https://ov9d.github.io. + +
+
+
+
+
+ + ☆ VideoBadminton: A Video Dataset for Badminton Action Recognition + + +
+ In the dynamic and evolving field of computer vision, action recognition has +become a key focus, especially with the advent of sophisticated methodologies +like Convolutional Neural Networks (CNNs), Convolutional 3D, Transformer, and +spatial-temporal feature fusion. These technologies have shown promising +results on well-established benchmarks but face unique challenges in real-world +applications, particularly in sports analysis, where the precise decomposition +of activities and the distinction of subtly different actions are crucial. +Existing datasets like UCF101, HMDB51, and Kinetics have offered a diverse +range of video data for various scenarios. However, there's an increasing need +for fine-grained video datasets that capture detailed categorizations and +nuances within broader action categories. In this paper, we introduce the +VideoBadminton dataset derived from high-quality badminton footage. Through an +exhaustive evaluation of leading methodologies on this dataset, this study aims +to advance the field of action recognition, particularly in badminton sports. +The introduction of VideoBadminton could not only serve for badminton action +recognition but also provide a dataset for recognizing fine-grained actions. +The insights gained from these evaluations are expected to catalyze further +research in action comprehension, especially within sports contexts. + +
+
+
+
+
+ + ☆ Low-Trace Adaptation of Zero-shot Self-supervised Blind Image Denoising + + +
+ Deep learning-based denoisers have been the focus of recent developments in +image denoising. In the past few years, there has been increasing interest in +developing self-supervised denoising networks that only require noisy images, +without the need for clean ground truth for training. However, a performance +gap remains between current self-supervised methods and their supervised +counterparts. Additionally, these methods commonly depend on assumptions about +noise characteristics, thereby constraining their applicability in real-world +scenarios. Inspired by the properties of the Frobenius norm expansion, we +discover that incorporating a trace term reduces the optimization goal +disparity between self-supervised and supervised methods, thereby enhancing the +performance of self-supervised learning. To exploit this insight, we propose a +trace-constraint loss function and design the low-trace adaptation Noise2Noise +(LoTA-N2N) model that bridges the gap between self-supervised and supervised +learning. Furthermore, we have discovered that several existing self-supervised +denoising frameworks naturally fall within the proposed trace-constraint loss +as subcases. Extensive experiments conducted on natural and confocal image +datasets indicate that our method achieves state-of-the-art performance within +the realm of zero-shot self-supervised image denoising approaches, without +relying on any assumptions regarding the noise. + +
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ☆ XPose: eXplainable Human Pose Estimation + + +
+ Current approaches in pose estimation primarily concentrate on enhancing +model architectures, often overlooking the importance of comprehensively +understanding the rationale behind model decisions. In this paper, we propose +XPose, a novel framework that incorporates Explainable AI (XAI) principles into +pose estimation. This integration aims to elucidate the individual contribution +of each keypoint to the final prediction, thereby elevating the model's +transparency and interpretability. Conventional XAI techniques have +predominantly addressed single-target tasks like classification. +Additionally, the application of the Shapley value, a common measure in XAI, to +pose estimation has been hindered by prohibitive computational demands. + To address these challenges, this work introduces an innovative concept +called Group Shapley Value (GSV). This approach strategically organizes +keypoints into clusters based on their interdependencies. Within these +clusters, GSV meticulously calculates Shapley values for keypoints, while for +inter-cluster keypoints, it opts for a more holistic group-level valuation. +This dual-level computation framework meticulously assesses keypoint +contributions to the final outcome, optimizing computational efficiency. +Building on the insights into keypoint interactions, we devise a novel data +augmentation technique known as Group-based Keypoint Removal (GKR). This method +ingeniously removes individual keypoints during training phases, deliberately +preserving those with strong mutual connections, thereby refining the model's +predictive prowess for non-visible keypoints. The empirical validation of GKR +across a spectrum of standard approaches attests to its efficacy. GKR's success +demonstrates how using Explainable AI (XAI) can directly enhance pose +estimation models. + +
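+ A hedged sketch of estimating group-level Shapley values by Monte Carlo permutation sampling, in the spirit of the GSV idea above; the grouping and the value function (e.g., a pose score computed with some keypoints masked out) are assumptions, not the paper's exact formulation.
+ import random
+
+ def group_shapley(groups, value_fn, num_samples=200):
+     # groups: list of keypoint-index lists; value_fn: set of keypoints -> score.
+     contrib = {g: 0.0 for g in range(len(groups))}
+     for _ in range(num_samples):
+         order = list(range(len(groups)))
+         random.shuffle(order)
+         present, prev = set(), value_fn(set())
+         for g in order:
+             present |= set(groups[g])
+             cur = value_fn(present)
+             contrib[g] += cur - prev             # marginal contribution of this group
+             prev = cur
+     return {g: c / num_samples for g, c in contrib.items()}
+
+ # Toy usage: the "model" simply counts visible keypoints.
+ print(group_shapley([[0, 1, 2], [3, 4], [5]], value_fn=len))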
+
+
+
+
+ + ☆ GaussianFlow: Splatting Gaussian Dynamics for 4D Content Creation + + +
+ Creating 4D fields of Gaussian Splatting from images or videos is a +challenging task due to its under-constrained nature. While the optimization +can draw photometric reference from the input videos or be regulated by +generative models, directly supervising Gaussian motions remains underexplored. +In this paper, we introduce a novel concept, Gaussian flow, which connects the +dynamics of 3D Gaussians and pixel velocities between consecutive frames. The +Gaussian flow can be efficiently obtained by splatting Gaussian dynamics into +the image space. This differentiable process enables direct dynamic supervision +from optical flow. Our method significantly benefits 4D dynamic content +generation and 4D novel view synthesis with Gaussian Splatting, especially for +content with rich motion that is hard to handle with existing methods. The +common color drifting issue that happens in 4D generation is also resolved with +improved Gaussian dynamics. Superior visual quality on extensive experiments +demonstrates our method's effectiveness. Quantitative and qualitative +evaluations show that our method achieves state-of-the-art results on both +tasks of 4D generation and 4D novel view synthesis. Project page: +https://zerg-overmind.github.io/GaussianFlow.github.io/ + +
+
+
+
+
+ + ☆ Class and Region-Adaptive Constraints for Network Calibration + + +
+ In this work, we present a novel approach to calibrate segmentation networks +that considers the inherent challenges posed by different categories and object +regions. In particular, we present a formulation that integrates class and +region-wise constraints into the learning objective, with multiple penalty +weights to account for class and region differences. Finding the optimal +penalty weights manually, however, might be infeasible and can potentially hinder +the optimization process. To overcome this limitation, we propose an approach +based on Class and Region-Adaptive constraints (CRaC), which allows the class- and +region-wise penalty weights to be learned during training. CRaC is based on a +general Augmented Lagrangian method, a well-established technique in +constrained optimization. Experimental results on two popular segmentation +benchmarks, and two well-known segmentation networks, demonstrate the +superiority of CRaC compared to existing approaches. The code is available at: +https://github.com/Bala93/CRac/ + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ DMAD: Dual Memory Bank for Real-World Anomaly Detection + + +
+ Training a unified model is considered to be more suitable for practical +industrial anomaly detection scenarios due to its generalization ability and +storage efficiency. However, this multi-class setting, which exclusively uses +normal data, overlooks the few but important accessible annotated anomalies in +the real world. To address the challenge of real-world anomaly detection, we +propose a new framework named Dual Memory bank enhanced representation learning +for Anomaly Detection (DMAD). This framework handles both unsupervised and +semi-supervised scenarios in a unified (multi-class) setting. DMAD employs a +dual memory bank to calculate feature distance and feature attention between +normal and abnormal patterns, thereby encapsulating knowledge about normal and +abnormal instances. This knowledge is then used to construct an enhanced +representation for anomaly score learning. We evaluated DMAD on the MVTec-AD +and VisA datasets. The results show that DMAD surpasses current +state-of-the-art methods, highlighting DMAD's capability in handling the +complexities of real-world anomaly detection scenarios. + +
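+ An illustrative sketch of scoring with two memory banks of normal and abnormal prototypes, as described above; the particular distance-based combination below is an assumption about one reasonable scoring rule, not the paper's exact formulation.
+ import torch
+
+ def dual_bank_score(feat, normal_bank, abnormal_bank):
+     # feat: (N, D) patch features; banks: (M, D) stored prototype features.
+     d_norm = torch.cdist(feat, normal_bank).min(dim=1).values    # nearest normal pattern
+     d_abn = torch.cdist(feat, abnormal_bank).min(dim=1).values   # nearest known anomaly
+     # Higher score: far from normal patterns and close to known anomalies.
+     return d_norm - d_abn
+
+ score = dual_bank_score(torch.rand(16, 128), torch.rand(100, 128), torch.rand(20, 128))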
+
+
+
+
+ + ☆ Entity6K: A Large Open-Domain Evaluation Dataset for Real-World Entity + Recognition + + +
+ Open-domain real-world entity recognition is essential yet challenging, +involving identifying various entities in diverse environments. The lack of a +suitable evaluation dataset has been a major obstacle in this field due to the +vast number of entities and the extensive human effort required for data +curation. We introduce Entity6K, a comprehensive dataset for real-world entity +recognition, featuring 5,700 entities across 26 categories, each supported by 5 +human-verified images with annotations. Entity6K offers a diverse range of +entity names and categorizations, addressing a gap in existing datasets. We +conducted benchmarks with existing models on tasks like image captioning, +object detection, zero-shot classification, and dense captioning to demonstrate +Entity6K's effectiveness in evaluating models' entity recognition capabilities. +We believe Entity6K will be a valuable resource for advancing accurate entity +recognition in open-domain settings. + +
+
+
+
+
+ + ☆ Deep Few-view High-resolution Photon-counting Extremity CT at Halved + Dose for a Clinical Trial + + +
+ The latest X-ray photon-counting computed tomography (PCCT) for extremity +allows multi-energy high-resolution (HR) imaging for tissue characterization +and material decomposition. However, both radiation dose and imaging speed need +improvement for contrast-enhanced and other studies. Despite the success of +deep learning methods for 2D few-view reconstruction, applying them to HR +volumetric reconstruction of extremity scans for clinical diagnosis has been +limited due to GPU memory constraints, training data scarcity, and domain gap +issues. In this paper, we propose a deep learning-based approach for PCCT image +reconstruction at halved dose and doubled speed in a New Zealand clinical +trial. Particularly, we present a patch-based volumetric refinement network to +alleviate the GPU memory limitation, train the network with synthetic data, and use +model-based iterative refinement to bridge the gap between synthetic and +real-world data. The simulation and phantom experiments demonstrate +consistently improved results under different acquisition conditions on both +in- and off-domain structures using a fixed network. The image quality of 8 +patients from the clinical trial is evaluated by three radiologists in +comparison with the standard image reconstruction with a full-view dataset. It +is shown that our proposed approach is essentially identical to or better than +the clinical benchmark in terms of diagnostic image quality scores. Our +approach has great potential to improve the safety and efficiency of PCCT +without compromising image quality. + +
+
+ comment: 9 figures, 5 tables +
+
+
+
+
+ + ☆ Depth-guided NeRF Training via Earth Mover's Distance + + +
+ Neural Radiance Fields (NeRFs) are trained to minimize the rendering loss of +predicted viewpoints. However, the photometric loss often does not provide +enough information to disambiguate between different possible geometries +yielding the same image. Previous work has thus incorporated depth supervision +during NeRF training, leveraging dense predictions from pre-trained depth +networks as pseudo-ground truth. While these depth priors are assumed to be +perfect once filtered for noise, in practice, their accuracy is more +challenging to capture. This work proposes a novel approach to uncertainty in +depth priors for NeRF supervision. Instead of using custom-trained depth or +uncertainty priors, we use off-the-shelf pretrained diffusion models to predict +depth and capture uncertainty during the denoising process. Because we know +that depth priors are prone to errors, we propose to supervise the ray +termination distance distribution with Earth Mover's Distance instead of +enforcing the rendered depth to replicate the depth prior exactly through +L2-loss. Our depth-guided NeRF outperforms all baselines on standard depth +metrics by a large margin while maintaining performance on photometric +measures. + +
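+ A minimal sketch of a one-dimensional Earth Mover's Distance between the rendered ray-termination distribution and a depth-prior distribution defined over the same ray samples (for discrete 1D distributions, EMD reduces to an integrated CDF difference). The binning and the toy prior are placeholder assumptions.
+ import torch
+
+ def emd_1d(weights, prior, t_vals):
+     # weights, prior: (num_samples,) non-negative and summing to 1 along the ray.
+     cdf_gap = torch.cumsum(weights, dim=0) - torch.cumsum(prior, dim=0)
+     deltas = t_vals[1:] - t_vals[:-1]                    # spacing between ray samples
+     return (cdf_gap[:-1].abs() * deltas).sum()
+
+ t_vals = torch.linspace(2.0, 6.0, 64)                    # sample depths along one ray
+ weights = torch.softmax(torch.randn(64), dim=0)          # rendered termination weights
+ prior = torch.softmax(-((t_vals - 4.0) ** 2) / 0.1, dim=0)   # prior peaked near depth 4
+ loss = emd_1d(weights, prior, t_vals)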
+
+ comment: Preprint. Under review +
+
+
+
+
+ + ☆ Diversity-Aware Agnostic Ensemble of Sharpness Minimizers + + +
+ There has long been plenty of theoretical and empirical evidence supporting +the success of ensemble learning. Deep ensembles in particular take advantage +of training randomness and expressivity of individual neural networks to gain +prediction diversity, ultimately leading to better generalization, robustness +and uncertainty estimation. With respect to generalization, it is found that +pursuing wider local minima results in models being more robust to shifts +between training and testing sets. A natural research question arises out of +these two approaches as to whether a boost in generalization ability can be +achieved if ensemble learning and loss sharpness minimization are integrated. +Our work investigates this connection and proposes DASH - a learning algorithm +that promotes diversity and flatness within deep ensembles. More concretely, +DASH encourages base learners to move divergently towards low-loss regions of +minimal sharpness. We provide a theoretical backbone for our method along with +extensive empirical evidence demonstrating an improvement in ensemble +generalizability. + +
+
+
+
+
+ + ☆ DecentNeRFs: Decentralized Neural Radiance Fields from Crowdsourced + Images + + +
+ Neural radiance fields (NeRFs) show potential for transforming images +captured worldwide into immersive 3D visual experiences. However, most of this +captured visual data remains siloed in our camera rolls as these images contain +personal details. Even if made public, the problem of learning 3D +representations of billions of scenes captured daily in a centralized manner is +computationally intractable. Our approach, DecentNeRF, is the first attempt at +decentralized, crowd-sourced NeRFs that require $\sim 10^4\times$ less server +computing for a scene than a centralized approach. Instead of sending the raw +data, our approach requires users to send a 3D representation, distributing the +high computation cost of training centralized NeRFs between the users. It +learns photorealistic scene representations by decomposing users' 3D views into +personal and global NeRFs and a novel optimally weighted aggregation of only +the latter. We validate the advantage of our approach to learn NeRFs with +photorealism and minimal server computation cost on structured synthetic and +real-world photo tourism datasets. We further analyze how secure aggregation of +global NeRFs in DecentNeRF minimizes the undesired reconstruction of personal +content by the server. + +
+
+
+
+
+ + ☆ ADAPT to Robustify Prompt Tuning Vision Transformers + + +
+ The performance of deep models, including Vision Transformers, is known to be +vulnerable to adversarial attacks. Many existing defenses against these +attacks, such as adversarial training, rely on full-model fine-tuning to induce +robustness in the models. These defenses require storing a copy of the entire +model, which can have billions of parameters, for each task. At the same time, +parameter-efficient prompt tuning is used to adapt large transformer-based +models to downstream tasks without the need to save large copies. In this +paper, we examine parameter-efficient prompt tuning of Vision Transformers for +downstream tasks through the lens of robustness. We show that previous +adversarial defense methods, when applied to the prompt tuning paradigm, suffer +from gradient obfuscation and are vulnerable to adaptive attacks. We introduce +ADAPT, a novel framework for performing adaptive adversarial training in the +prompt tuning paradigm. Our method achieves competitive robust accuracy of ~40% +w.r.t. SOTA robustness methods using full-model fine-tuning, by tuning only ~1% +of the number of parameters. + +
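+ A generic sketch of adversarially training only a prompt while the backbone stays frozen, which is the setting discussed above; the toy linear "backbone", shapes, and PGD budget are assumptions, and the full ADAPT objective with its adaptive-attack handling is not reproduced.
+ import torch
+ import torch.nn.functional as F
+
+ torch.manual_seed(0)
+ W = torch.randn(3 * 8 * 8 + 4, 10)                  # frozen toy "backbone" weights
+ prompt = torch.zeros(4, requires_grad=True)         # the only trainable parameters
+
+ def model(x, p):                                    # concatenate image pixels with prompt
+     return torch.cat([x.flatten(1), p.expand(x.size(0), -1)], dim=1) @ W
+
+ def pgd_attack(x, y, eps=8 / 255, alpha=2 / 255, steps=3):
+     delta = torch.zeros_like(x, requires_grad=True)
+     for _ in range(steps):
+         F.cross_entropy(model(x + delta, prompt), y).backward()
+         delta.data = (delta.data + alpha * delta.grad.sign()).clamp(-eps, eps)
+         delta.grad.zero_()
+     return (x + delta).detach()
+
+ opt = torch.optim.SGD([prompt], lr=0.1)             # optimizer sees only the prompt
+ x, y = torch.rand(4, 3, 8, 8), torch.randint(0, 10, (4,))
+ x_adv = pgd_attack(x, y)
+ opt.zero_grad()
+ F.cross_entropy(model(x_adv, prompt), y).backward()
+ opt.step()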
+
+
+
+
+ + ☆ Hermite coordinate interpolation kernels: application to image zooming + + +
+ A number of basic image processing tasks, such as any geometric +transformation, require interpolation at subpixel image values. In this work we +utilize the multidimensional coordinate Hermite spline interpolation defined on +non-equal spaced, rectilinear grids and apply it to a very common image +processing task, image zooming. Since Hermite interpolation utilizes function +values, as well as partial derivative values, it is natural to apply it to +image processing tasks as a special case of equi-spaced grid, using numerical +approximations of the image partial derivatives at each pixel. Furthermore, the +task of image interpolation requires the calculation of image values at +positions with non-zero fractional part. Thus, any spline interpolation can be +written as convolution with an appropriate kernel. In this context we generate +the Hermite kernels according to the derived $n$-dimensional interpolant of +Theorem 2 in [1]. We show that despite the increased complexity of the +interpolant, once the kernels are constructed, the Hermite spline interpolation +can be applied to images as efficiently as any other less complicated method. +Finally, we perform illustrative numerical examples to showcase the +applicability and high accuracy of the proposed Hermite kernels for image +zooming, compared to other interpolation methods, both traditional +convolution-based, as well as employing deep learning, in terms of PSNR, as +well as SSIM error metrics. The proposed Hermite spline kernels outperform all +other methods in the majority of the test images, in experiments using many +cascaded repetitions of the zoom operation. Interesting conclusions can be +drawn considering all methods under comparison. + +
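+ A simple 1D cubic Hermite interpolation sketch, using pixel values plus finite-difference derivatives at the two neighbouring samples, to illustrate the kind of interpolation discussed above; it is not the n-dimensional kernel construction of the paper.
+ import numpy as np
+
+ def hermite_zoom_1d(signal, factor):
+     n = len(signal)
+     deriv = np.gradient(signal)                  # numerical derivative at each pixel
+     xs = np.linspace(0, n - 1, int(n * factor))
+     out = np.empty_like(xs)
+     for k, x in enumerate(xs):
+         i = min(int(np.floor(x)), n - 2)
+         t = x - i                                # fractional position in [0, 1]
+         h00, h10 = 2 * t**3 - 3 * t**2 + 1, t**3 - 2 * t**2 + t
+         h01, h11 = -2 * t**3 + 3 * t**2, t**3 - t**2
+         out[k] = (h00 * signal[i] + h10 * deriv[i] +
+                   h01 * signal[i + 1] + h11 * deriv[i + 1])
+     return out
+
+ print(hermite_zoom_1d(np.array([0.0, 1.0, 4.0, 9.0, 16.0]), factor=2))  # 2x zoom of a row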
+
+
+
+
+ + ☆ 3D Semantic MapNet: Building Maps for Multi-Object Re-Identification in + 3D + + +
+ We study the task of 3D multi-object re-identification from embodied tours. +Specifically, an agent is given two tours of an environment (e.g. an apartment) +under two different layouts (e.g. arrangements of furniture). Its task is to +detect and re-identify objects in 3D - e.g. a "sofa" moved from location A to +B, a new "chair" in the second layout at location C, or a "lamp" from location +D in the first layout missing in the second. To support this task, we create an +automated infrastructure to generate paired egocentric tours of +initial/modified layouts in the Habitat simulator using Matterport3D scenes, +YCB and Google-scanned objects. We present 3D Semantic MapNet (3D-SMNet) - a +two-stage re-identification model consisting of (1) a 3D object detector that +operates on RGB-D videos with known pose, and (2) a differentiable object +matching module that solves correspondence estimation between two sets of 3D +bounding boxes. Overall, 3D-SMNet builds object-based maps of each layout and +then uses a differentiable matcher to re-identify objects across the tours. +After training 3D-SMNet on our generated episodes, we demonstrate zero-shot +transfer to real-world rearrangement scenarios by instantiating our task in +Replica, Active Vision, and RIO environments depicting rearrangements. On all +datasets, we find 3D-SMNet outperforms competitive baselines. Further, we show +jointly training on real and generated episodes can lead to significant +improvements over training on real data alone. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Reflectivity Is All You Need!: Advancing LiDAR Semantic Segmentation + + +
+ LiDAR semantic segmentation frameworks predominantly leverage geometry-based +features to differentiate objects within a scan. While these methods excel in +scenarios with clear boundaries and distinct shapes, their performance declines +in environments where boundaries are blurred, particularly in off-road +contexts. To address this, recent strides in 3D segmentation algorithms have +focused on harnessing raw LiDAR intensity measurements to improve prediction +accuracy. Despite these efforts, current learning-based models struggle to +model the intricate connections between raw intensity and factors such as +distance, incidence angle, material reflectivity, and atmospheric conditions. +Building upon our prior work, this paper delves into the advantages of +employing calibrated intensity (also referred to as reflectivity) within +learning-based LiDAR semantic segmentation frameworks. We initially establish +that incorporating reflectivity as an input enhances the existing LiDAR +semantic segmentation model. Furthermore, we present findings that enabling the +model to learn to calibrate intensity can boost its performance. Through +extensive experimentation on the off-road dataset Rellis-3D, we demonstrate +notable improvements. Specifically, converting intensity to reflectivity +results in a 4% increase in mean Intersection over Union (mIoU) when compared +to using raw intensity in off-road scenarios. Additionally, we investigate +the possible benefits of using calibrated intensity in semantic segmentation in +urban environments (SemanticKITTI) and cross-sensor domain adaptation. + +
+
+
+
+
+ + ☆ Castor: Competing shapelets for fast and accurate time series + classification + + +
+ Shapelets are discriminative subsequences that were originally embedded in +shapelet-based decision trees but have since been extended to shapelet-based +transformations. We propose Castor, a simple, efficient, and accurate time +series classification algorithm that utilizes shapelets to transform time +series. The transformation organizes shapelets into groups with varying +dilation and allows the shapelets to compete over the time context to construct +a diverse feature representation. By organizing the shapelets into groups, we +enable the transformation to transition between levels of competition, +resulting in methods that more closely resemble distance-based transformations +or dictionary-based transformations. We demonstrate, through an extensive +empirical investigation, that Castor yields transformations that result in +classifiers that are significantly more accurate than several state-of-the-art +classifiers. In an extensive ablation study, we examine the effect of choosing +hyperparameters and suggest accurate and efficient default values. + +
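+ A toy sketch of the core primitive described above: the minimum distance between a dilated shapelet and all subsequences of a time series, used as one transformed feature. The grouping and competition logic of Castor is omitted, and the shapelet here is arbitrary.
+ import numpy as np
+
+ def dilated_shapelet_distance(series, shapelet, dilation=1):
+     span = (len(shapelet) - 1) * dilation + 1    # length covered by the dilated shapelet
+     best = np.inf
+     for start in range(len(series) - span + 1):
+         window = series[start:start + span:dilation]
+         best = min(best, float(np.linalg.norm(window - shapelet)))
+     return best
+
+ series = np.sin(np.linspace(0, 6 * np.pi, 200))
+ shapelet = np.sin(np.linspace(0, np.pi, 10))
+ feature = dilated_shapelet_distance(series, shapelet, dilation=3)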
+
+ comment: Submitted to Data Mining and Knowledge Discovery Journal +
+
+
+
+
+ + ☆ A conditional latent autoregressive recurrent model for generation and + forecasting of beam dynamics in particle accelerators + + +
+ Particle accelerators are complex systems that focus, guide, and accelerate +intense charged particle beams to high energy. Beam diagnostics present a +challenging problem due to limited non-destructive measurements, +computationally demanding simulations, and inherent uncertainties in the +system. We propose a two-step unsupervised deep learning framework named as +Conditional Latent Autoregressive Recurrent Model (CLARM) for learning the +spatiotemporal dynamics of charged particles in accelerators. CLARM consists of +a Conditional Variational Autoencoder (CVAE) transforming six-dimensional phase +space into a lower-dimensional latent distribution and a Long Short-Term Memory +(LSTM) network capturing temporal dynamics in an autoregressive manner. The +CLARM can generate projections at various accelerator modules by sampling and +decoding the latent space representation. The model also forecasts future +states (downstream locations) of charged particles from past states (upstream +locations). The results demonstrate that the generative and forecasting ability +of the proposed approach is promising when tested against a variety of +evaluation metrics. + +
+
+
+
+
+ + ☆ LUWA Dataset: Learning Lithic Use-Wear Analysis on Microscopic Images CVPR + + +
+ Lithic Use-Wear Analysis (LUWA) using microscopic images is an underexplored +vision-for-science research area. It seeks to distinguish the worked material, +which is critical for understanding archaeological artifacts, material +interactions, tool functionalities, and dental records. However, this +challenging task goes beyond the well-studied image classification problem for +common objects. It is affected by many confounders owing to the complex wear +mechanism and microscopic imaging, which makes it difficult even for human +experts to identify the worked material successfully. In this paper, we +investigate the following three questions on this unique vision task for the +first time: (i) How well can state-of-the-art pre-trained models (like DINOv2) +generalize to the rarely seen domain? (ii) How can few-shot learning be +exploited for scarce microscopic images? (iii) How do the ambiguous +magnification and sensing modality influence the classification accuracy? To +study these, we collaborated with archaeologists and built the first +open-source and the largest LUWA dataset containing 23,130 microscopic images +with different magnifications and sensing modalities. Extensive experiments +show that existing pre-trained models notably outperform human experts but +still leave a large gap for improvements. Most importantly, the LUWA dataset +provides an underexplored opportunity for vision and learning communities and +complements existing image classification problems on common objects. + +
+
+ comment: CVPR +
+
+
+
+
+ + ☆ Improved EATFormer: A Vision Transformer for Medical Image + Classification + + +
+ The accurate analysis of medical images is vital for diagnosing and +predicting medical conditions. Traditional approaches relying on radiologists +and clinicians suffer from inconsistencies and missed diagnoses. Computer-aided +diagnosis systems can assist in achieving early, accurate, and efficient +diagnoses. This paper presents an improved Evolutionary Algorithm-based +Transformer architecture for medical image classification using Vision +Transformers. The proposed EATFormer architecture combines the strengths of +Convolutional Neural Networks and Vision Transformers, leveraging their ability +to identify patterns in data and adapt to specific characteristics. The +architecture incorporates novel components, including the Enhanced EA-based +Transformer block with Feed-Forward Network, Global and Local Interaction, and +Multi-Scale Region Aggregation modules. It also introduces the Modulated +Deformable MSA module for dynamic modeling of irregular locations. The paper +discusses the Vision Transformer (ViT) model's key features, such as +patch-based processing, positional context incorporation, and Multi-Head +Attention mechanism. It introduces the Multi-Scale Region Aggregation module, +which aggregates information from different receptive fields to provide an +inductive bias. The Global and Local Interaction module enhances the MSA-based +global module by introducing a local path for extracting discriminative local +information. Experimental results on the Chest X-ray and Kvasir datasets +demonstrate that the proposed EATFormer significantly improves prediction speed +and accuracy compared to baseline models. + +
+
+
+
+
+ + ☆ DeblurDiNAT: A Lightweight and Effective Transformer for Image + Deblurring + + +
+ Blurry images may contain local and global non-uniform artifacts, which +complicate the deblurring process and make it more challenging to achieve +satisfactory results. Recently, Transformers have produced better deblurring +outcomes than existing CNN architectures. However, the large model size and +long inference time are still two bothersome issues which have not been fully +explored. To this end, we propose DeblurDiNAT, a compact encoder-decoder +Transformer which efficiently restores clean images from real-world blurry +ones. We adopt an alternating dilation factor structure with the aim of +global-local feature learning. Also, we observe that simply using +self-attention layers in networks does not always produce good deblurred +results. To solve this problem, we propose a channel modulation self-attention +(CMSA) block, where a cross-channel learner (CCL) is utilized to capture +channel relationships. In addition, we present a divide and multiply +feed-forward network (DMFN) allowing fast feature propagation. Moreover, we +design a lightweight gated feature fusion (LGFF) module, which performs +controlled feature merging. Comprehensive experimental results show that the +proposed model, named DeblurDiNAT, provides a favorable performance boost +without introducing noticeable computational costs over the baseline, and +achieves state-of-the-art (SOTA) performance on several image deblurring +datasets. Compared to nearest competitors, our space-efficient and time-saving +method demonstrates a stronger generalization ability with 3%-68% fewer +parameters and produces deblurred images that are visually closer to the ground +truth. + +
+
+
+
+
+ + ☆ SIFT-DBT: Self-supervised Initialization and Fine-Tuning for Imbalanced + Digital Breast Tomosynthesis Image Classification + + +
+ Digital Breast Tomosynthesis (DBT) is a widely used medical imaging modality +for breast cancer screening and diagnosis, offering higher spatial resolution +and greater detail through its 3D-like breast volume imaging capability. +However, the increased data volume also introduces pronounced data imbalance +challenges, where only a small fraction of the volume contains suspicious +tissue. This further exacerbates the data imbalance due to the case-level +distribution in real-world data and leads to learning a trivial classification +model that only predicts the majority class. To address this, we propose a +novel method using view-level contrastive Self-supervised Initialization and +Fine-Tuning for identifying abnormal DBT images, namely SIFT-DBT. We further +introduce a patch-level multi-instance learning method to preserve spatial +resolution. The proposed method achieves 92.69% volume-wise AUC on an +evaluation of 970 unique studies. + +
+
+ comment: Accepted by IEEE ISBI 2024 +
+
+
+
+
+ + ☆ A Parallel Workflow for Polar Sea-Ice Classification using Auto-labeling + of Sentinel-2 Imagery + + +
+ The observation of the advancing and retreating pattern of polar sea ice +cover stands as a vital indicator of global warming. This research aims to +develop a robust, effective, and scalable system for classifying polar sea ice +as thick/snow-covered, young/thin, or open water using Sentinel-2 (S2) images. +Since the S2 satellite is actively capturing high-resolution imagery over the +earth's surface, there are lots of images that need to be classified. One major +obstacle is the absence of labeled S2 training data (images) to act as the +ground truth. We demonstrate a scalable and accurate method for segmenting and +automatically labeling S2 images using carefully determined color thresholds. +We employ a parallel workflow using PySpark to scale and achieve 9-fold data +loading and 16-fold map-reduce speedup on auto-labeling S2 images based on thin +cloud and shadow-filtered color-based segmentation to generate label data. The +auto-labeled data generated from this process are then employed to train a +U-Net machine learning model, resulting in good classification accuracy. As +training the U-Net classification model is computationally heavy and +time-consuming, we distribute the U-Net model training to scale it over 8 GPUs +using the Horovod framework over a DGX cluster with a 7.21x speedup without +affecting the accuracy of the model. Using the Antarctic's Ross Sea region as +an example, the U-Net model trained on auto-labeled data achieves a +classification accuracy of 98.97% for auto-labeled training datasets when the +thin clouds and shadows from the S2 images are filtered out. + +
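+ A hedged sketch of the color-threshold auto-labeling idea for Sentinel-2 pixels; the brightness thresholds and class encoding below are placeholders for illustration, not the values used in the paper, and the cloud/shadow filtering step is omitted.
+ import numpy as np
+
+ def auto_label(rgb):
+     # rgb: (H, W, 3) surface reflectance in [0, 1]; returns 0 = open water,
+     # 1 = young/thin ice, 2 = thick/snow-covered ice (assumed thresholds).
+     brightness = rgb.mean(axis=2)
+     labels = np.zeros(brightness.shape, dtype=np.uint8)
+     labels[brightness > 0.35] = 1
+     labels[brightness > 0.65] = 2
+     return labels
+
+ tile = np.random.rand(64, 64, 3)
+ print(np.bincount(auto_label(tile).ravel(), minlength=3))   # pixel count per class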
+
+ comment: Accepted in the 25th IEEE International Workshop on Parallel and + Distributed Scientific and Engineering Computing (PDSEC 2024), May 2024. + arXiv admin note: substantial text overlap with arXiv:2303.12719 +
+
+
+
+
+ + ☆ Better Call SAL: Towards Learning to Segment Anything in Lidar + + +
+ We propose $\texttt{SAL}$ ($\texttt{S}$egment $\texttt{A}$nything in +$\texttt{L}$idar) method consisting of a text-promptable zero-shot model for +segmenting and classifying any object in Lidar, and a pseudo-labeling engine +that facilitates model training without manual supervision. While the +established paradigm for $\textit{Lidar Panoptic Segmentation}$ (LPS) relies on +manual supervision for a handful of object classes defined a priori, we utilize +2D vision foundation models to generate 3D supervision "for free". Our +pseudo-labels consist of instance masks and corresponding CLIP tokens, which we +lift to Lidar using calibrated multi-modal data. By training our model on these +labels, we distill the 2D foundation models into our Lidar $\texttt{SAL}$ +model. Even without manual labels, our model achieves $91\%$ in terms of +class-agnostic segmentation and $44\%$ in terms of zero-shot LPS of the fully +supervised state-of-the-art. Furthermore, we outperform several baselines that +do not distill but only lift image features to 3D. More importantly, we +demonstrate that $\texttt{SAL}$ supports arbitrary class prompts, can be easily +extended to new datasets, and shows significant potential to improve with +increasing amounts of self-labeled data. + +
+
+
+
+
+ + ☆ Trustworthiness of Pretrained Transformers for Lung Cancer Segmentation + + +
+ We assessed the trustworthiness of two self-supervised, pretrained +transformer models, Swin UNETR and SMIT, for fine-tuned lung cancer (LC) tumor +segmentation using 670 CT and MRI scans. We measured segmentation accuracy on +two public 3D-CT datasets, robustness on CT scans of patients with COVID-19, CT +scans of patients with ovarian cancer and T2-weighted MRI of men with prostate +cancer, and zero-shot generalization of LC for T2-weighted MRIs. Both models +demonstrated high accuracy on in-distribution data (Dice 0.80 for SMIT and 0.78 +for Swin UNETR). SMIT showed similar near-out-of-distribution performance on CT +scans (AUROC 89.85% vs. 89.19%) but significantly better +far-out-of-distribution accuracy on CT (AUROC 97.2% vs. 87.1%) and MRI (92.15% +vs. 73.8%). SMIT outperformed Swin UNETR in zero-shot segmentation on MRI (Dice +0.78 vs. 0.69). We expect these findings to guide the safe development and +deployment of current and future pretrained models in routine clinical use. + +
+
+
+
+
+ + ☆ Knowing Your Nonlinearities: Shapley Interactions Reveal the Underlying + Structure of Data + + +
+ Measuring nonlinear feature interaction is an established approach to +understanding complex patterns of attribution in many models. In this paper, we +use Shapley Taylor interaction indices (STII) to analyze the impact of +underlying data structure on model representations in a variety of modalities, +tasks, and architectures. Considering linguistic structure in masked and +auto-regressive language models (MLMs and ALMs), we find that STII increases +within idiomatic expressions and that MLMs scale STII with syntactic distance, +relying more on syntax in their nonlinear structure than ALMs do. Our speech +model findings reflect the phonetic principle that the openness of the oral +cavity determines how much a phoneme varies based on its context. Finally, we +study image classifiers and illustrate that feature interactions intuitively +reflect object boundaries. Our wide range of results illustrates the benefits +of interdisciplinary work and domain expertise in interpretability research. + +
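+ For readers unfamiliar with interaction indices, the sketch below computes the
+ pairwise Shapley interaction index of a toy set function by exhaustive subset
+ enumeration; it is a simplified relative of the Shapley Taylor indices used in the
+ paper, not the authors' implementation:
+
+ from itertools import combinations
+ from math import factorial
+
+ def shapley_interaction(f, n, i, j):
+     # Pairwise Shapley interaction of features i, j for a set function f
+     # defined on subsets of {0, ..., n-1}; exhaustive, so only for small n.
+     rest = [k for k in range(n) if k not in (i, j)]
+     total = 0.0
+     for size in range(len(rest) + 1):
+         for S in combinations(rest, size):
+             S = set(S)
+             weight = factorial(size) * factorial(n - size - 2) / factorial(n - 1)
+             total += weight * (f(S | {i, j}) - f(S | {i}) - f(S | {j}) + f(S))
+     return total
+
+ def f(S):  # toy model with an explicit interaction between features 0 and 1
+     return 2.0 * (0 in S and 1 in S) + 0.5 * (2 in S)
+
+ print(shapley_interaction(f, n=3, i=0, j=1))  # ~2.0: strong interaction
+ print(shapley_interaction(f, n=3, i=0, j=2))  # ~0.0: no interaction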
+
+
+
+
+ + ☆ Train Ego-Path Detection on Railway Tracks Using End-to-End Deep + Learning + + +
+ This paper introduces the task of "train ego-path detection", a refined +approach to railway track detection designed for intelligent onboard vision +systems. Whereas existing research lacks precision and often considers all +tracks within the visual field uniformly, our proposed task specifically aims +to identify the train's immediate path, or "ego-path", within potentially +complex and dynamic railway environments. Building on this, we extend the +RailSem19 dataset with ego-path annotations, facilitating further research in +this direction. At the heart of our study lies TEP-Net, an end-to-end deep +learning framework tailored for ego-path detection, featuring a configurable +model architecture, a dynamic data augmentation strategy, and a domain-specific +loss function. Leveraging a regression-based approach, TEP-Net outperforms +SOTA: while addressing the track detection problem in a more nuanced way than +previously, our model achieves 97.5% IoU on the test set and is faster than all +existing methods. Further comparative analysis highlights the relevance of the +conceptual choices behind TEP-Net, demonstrating its inherent propensity for +robustness across diverse environmental conditions and operational dynamics. +This work opens promising avenues for the development of intelligent driver +assistance systems and autonomous train operations, paving the way toward safer +and more efficient railway transportation. + +
+
+
+
+
+ + ☆ HuLP: Human-in-the-Loop for Prognosis + + +
+ This paper introduces HuLP, a Human-in-the-Loop for Prognosis model designed +to enhance the reliability and interpretability of prognostic models in +clinical contexts, especially when faced with the complexities of missing +covariates and outcomes. HuLP offers an innovative approach that enables human +expert intervention, empowering clinicians to interact with and correct models' +predictions, thus fostering collaboration between humans and AI models to +produce more accurate prognoses. Additionally, HuLP addresses the challenges of +missing data with a tailored, neural network-based methodology that handles +missing values effectively. Traditional methods often struggle to +capture the nuanced variations within patient populations, leading to +compromised prognostic predictions. HuLP imputes missing covariates based on +imaging features, aligning more closely with clinician workflows and enhancing +reliability. We conduct our experiments on two real-world, publicly available +medical datasets to demonstrate the superiority of HuLP. + +
+
+
+
+
+ + ☆ SceneScript: Reconstructing Scenes With An Autoregressive Structured + Language Model + + +
+ We introduce SceneScript, a method that directly produces full scene models +as a sequence of structured language commands using an autoregressive, +token-based approach. Our proposed scene representation is inspired by recent +successes in transformers & LLMs, and departs from more traditional methods +which commonly describe scenes as meshes, voxel grids, point clouds or radiance +fields. Our method infers the set of structured language commands directly from +encoded visual data using a scene language encoder-decoder architecture. To +train SceneScript, we generate and release a large-scale synthetic dataset +called Aria Synthetic Environments consisting of 100k high-quality indoor +scenes, with photorealistic and ground-truth annotated renders of egocentric +scene walkthroughs. Our method gives state-of-the-art results in architectural +layout estimation, and competitive results in 3D object detection. Lastly, we +explore a key advantage of SceneScript: its ability to readily adapt to +new commands via simple additions to the structured language, which we +illustrate for tasks such as coarse 3D object part reconstruction. + +
+
+ comment: see project page, https://projectaria.com/scenescript +
+
+
+
+
+ + ☆ Magic Fixup: Streamlining Photo Editing by Watching Dynamic Videos + + +
+ We propose a generative model that, given a coarsely edited image, +synthesizes a photorealistic output that follows the prescribed layout. Our +method transfers fine details from the original image and preserves the +identity of its parts, yet adapts them to the lighting and context defined by +the new layout. Our key insight is that videos are a powerful source of +supervision for this task: objects and camera motions provide many observations +of how the world changes with viewpoint, lighting, and physical interactions. +We construct an image dataset in which each sample is a pair of source and +target frames extracted from the same video at randomly chosen time intervals. +We warp the source frame toward the target using two motion models that mimic +the expected test-time user edits. We supervise our model to translate the +warped image into the ground truth, starting from a pretrained diffusion model. +Our model design explicitly enables fine detail transfer from the source frame +to the generated image, while closely following the user-specified layout. We +show that by using simple segmentations and coarse 2D manipulations, we can +synthesize a photorealistic edit faithful to the user's input while addressing +second-order effects like harmonizing the lighting and physical interactions +between edited objects. + +
+
+ comment: Project page: https://magic-fixup.github.io/ +
+
+
+
+
+ + ☆ When Do We Not Need Larger Vision Models? + + +
+ Scaling up the size of vision models has been the de facto standard to obtain +more powerful visual representations. In this work, we discuss the point beyond +which larger vision models are not necessary. First, we demonstrate the power +of Scaling on Scales (S$^2$), whereby a pre-trained and frozen smaller vision +model (e.g., ViT-B or ViT-L), run over multiple image scales, can outperform +larger models (e.g., ViT-H or ViT-G) on classification, segmentation, depth +estimation, Multimodal LLM (MLLM) benchmarks, and robotic manipulation. +Notably, S$^2$ achieves state-of-the-art performance in detailed understanding +of MLLM on the V* benchmark, surpassing models such as GPT-4V. We examine the +conditions under which S$^2$ is a preferred scaling approach compared to +scaling on model size. While larger models have the advantage of better +generalization on hard examples, we show that features of larger vision models +can be well approximated by those of multi-scale smaller models. This suggests +most, if not all, of the representations learned by current large pre-trained +models can also be obtained from multi-scale smaller models. Our results show +that a multi-scale smaller model has comparable learning capacity to a larger +model, and pre-training smaller models with S$^2$ can match or even exceed the +advantage of larger models. We release a Python package that can apply S$^2$ on +any vision model with one line of code: +https://github.com/bfshi/scaling_on_scales. + +
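+ The core of S$^2$ can be sketched in a few lines: run one frozen backbone at several
+ image scales and concatenate the pooled features per scale (the full method also
+ splits up-scaled images into base-resolution crops; the backbone below is a
+ stand-in, not the models evaluated in the paper):
+
+ import torch
+ import torch.nn.functional as F
+
+ @torch.no_grad()
+ def multi_scale_features(backbone, image, scales=(1.0, 2.0)):
+     # image: (B, 3, H, W) -> (B, D * len(scales)) pooled multi-scale features.
+     feats = []
+     for s in scales:
+         x = image if s == 1.0 else F.interpolate(
+             image, scale_factor=s, mode="bilinear", align_corners=False)
+         feats.append(backbone(x))            # (B, D) embedding at this scale
+     return torch.cat(feats, dim=-1)          # channel-wise concatenation
+
+ backbone = torch.nn.Sequential(              # stand-in "frozen" vision model
+     torch.nn.Conv2d(3, 64, 3, stride=2, padding=1),
+     torch.nn.AdaptiveAvgPool2d(1), torch.nn.Flatten()).eval()
+ print(multi_scale_features(backbone, torch.randn(2, 3, 224, 224)).shape)  # (2, 128)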
+
+ comment: Code: https://github.com/bfshi/scaling_on_scales +
+
+
+
+
+ + ☆ TAPTR: Tracking Any Point with Transformers as Detection + + +
+ In this paper, we propose a simple and strong framework for Tracking Any +Point with TRansformers (TAPTR). Based on the observation that point tracking +bears a great resemblance to object detection and tracking, we borrow designs +from DETR-like algorithms to address the task of TAP. In the proposed +framework, in each video frame, each tracking point is represented as a point +query, which consists of a positional part and a content part. As in DETR, each +query (its position and content feature) is naturally updated layer by layer. +Its visibility is predicted by its updated content feature. Queries belonging +to the same tracking point can exchange information through self-attention +along the temporal dimension. As all such operations are well-designed in +DETR-like algorithms, the model is conceptually very simple. We also adopt some +useful designs such as cost volume from optical flow models and develop simple +designs to provide long temporal information while mitigating the feature +drifting issue. Our framework demonstrates state-of-the-art performance on +various TAP datasets with faster inference speed. + +
+
+
+
+
+ + ☆ Physics-Guided Neural Networks for Intraventricular Vector Flow Mapping + + +
+ Intraventricular vector flow mapping (iVFM) seeks to enhance and quantify +color Doppler in cardiac imaging. In this study, we propose novel alternatives +to the traditional iVFM optimization scheme by utilizing physics-informed +neural networks (PINNs) and a physics-guided nnU-Net-based supervised approach. +Through rigorous evaluation on simulated color Doppler images derived from a +patient-specific computational fluid dynamics model and in vivo Doppler +acquisitions, both approaches demonstrate comparable reconstruction performance +to the original iVFM algorithm. The efficiency of PINNs is boosted through +dual-stage optimization and pre-optimized weights. On the other hand, the +nnU-Net method excels in generalizability and real time capabilities. Notably, +nnU-Net shows superior robustness on sparse and truncated Doppler data while +maintaining independence from explicit boundary conditions. Overall, our +results highlight the effectiveness of these methods in reconstructing +intraventricular vector blood flow. The study also suggests potential +applications of PINNs in ultrafast color Doppler imaging and the incorporation +of fluid dynamics equations to derive biomarkers for cardiovascular diseases +based on blood flow. + +
+
+ comment: 11 pages, submitted to IEEE TUFFC +
+
+
+
+
+ + ☆ Emotic Masked Autoencoder with Attention Fusion for Facial Expression + Recognition + + +
+ Facial Expression Recognition (FER) is a critical task within computer vision +with diverse applications across various domains. Addressing the challenge of +limited FER datasets, which hampers the generalization capability of expression +recognition models, is imperative for enhancing performance. Our paper presents +an innovative approach integrating the MAE-Face self-supervised learning (SSL) +method and Fusion Attention mechanism for expression classification, +particularly showcased in the 6th Affective Behavior Analysis in-the-wild +(ABAW) competition. Additionally, we propose preprocessing techniques to +emphasize essential facial features, thereby enhancing model performance on +both training and validation sets, notably demonstrated on the Aff-wild2 +dataset. + +
+
+
+
+
+ + ☆ Emotion Recognition Using Transformers with Masked Learning + + +
+ In recent years, deep learning has achieved innovative advancements in +various fields, including the analysis of human emotions and behaviors. +Initiatives such as the Affective Behavior Analysis in-the-wild (ABAW) +competition have been particularly instrumental in driving research in this +area by providing diverse and challenging datasets that enable precise +evaluation of complex emotional states. This study leverages the Vision +Transformer (ViT) and Transformer models to focus on the estimation of +Valence-Arousal (VA), which signifies the positivity and intensity of emotions, +recognition of various facial expressions, and detection of Action Units (AU) +representing fundamental muscle movements. This approach transcends traditional +Convolutional Neural Networks (CNNs) and Long Short-Term Memory (LSTM) based +methods, proposing a new Transformer-based framework that maximizes the +understanding of temporal and spatial features. The core contributions of this +research include the introduction of a learning technique through random frame +masking and the application of Focal loss adapted for imbalanced data, +enhancing the accuracy and applicability of emotion and behavior analysis in +real-world settings. This approach is expected to contribute to the advancement +of emotional computing and deep learning methodologies. + +
+
+
+
+
+ + ☆ Super-High-Fidelity Image Compression via Hierarchical-ROI and Adaptive + Quantization + + +
+ Learned Image Compression (LIC) has achieved dramatic progress regarding +objective and subjective metrics. MSE-based models aim to improve objective +metrics while generative models are leveraged to improve visual quality +measured by subjective metrics. However, they all suffer from blurring or +deformation at low bit rates, especially at below $0.2bpp$. Besides, +deformation on human faces and text is unacceptable for visual quality +assessment, and the problem becomes more prominent on small faces and text. To +solve this problem, we combine the advantage of MSE-based models and generative +models by utilizing region of interest (ROI). We propose Hierarchical-ROI +(H-ROI), to split images into several foreground regions and one background +region to improve the reconstruction of regions containing faces, text, and +complex textures. Further, we propose adaptive quantization by non-linear +mapping within the channel dimension to constrain the bit rate while +maintaining the visual quality. Exhaustive experiments demonstrate that our +methods achieve better visual quality on small faces and text with lower bit +rates, e.g., $0.7X$ bits of HiFiC and $0.5X$ bits of BPG. + +
+
+
+
+
+ + ☆ Bypassing LLM Watermarks with Color-Aware Substitutions + + +
+ Watermarking approaches are proposed to identify if text being circulated is +human or large language model (LLM) generated. The state-of-the-art +watermarking strategy of Kirchenbauer et al. (2023a) biases the LLM to generate +specific (``green'') tokens. However, determining the robustness of this +watermarking method is an open problem. Existing attack methods fail to evade +detection for longer text segments. We overcome this limitation, and propose +{\em Self Color Testing-based Substitution (SCTS)}, the first ``color-aware'' +attack. SCTS obtains color information by strategically prompting the +watermarked LLM and comparing output token frequencies. It uses this +information to determine token colors, and substitutes green tokens with +non-green ones. In our experiments, SCTS successfully evades watermark +detection using fewer edits than related work. Additionally, we show +both theoretically and empirically that SCTS can remove the watermark for +arbitrarily long watermarked text. + +
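+ A toy illustration of the frequency-comparison idea behind a color-aware test
+ (the sampler, prompt, and decision margin are assumptions for illustration and not
+ the SCTS procedure itself):
+
+ import random
+ from collections import Counter
+
+ def classify_colors(sample_next, context, candidates, n_samples=2000, margin=0.05):
+     # Sample continuations of a context in which the candidates would be roughly
+     # equally likely without a watermark; tokens sampled noticeably more often
+     # than the uniform rate are flagged as likely "green".
+     counts = Counter(sample_next(context) for _ in range(n_samples))
+     uniform = 1.0 / len(candidates)
+     return {t: ("green" if counts[t] / n_samples > uniform + margin else "red/unknown")
+             for t in candidates}
+
+ def fake_watermarked_lm(context):            # stand-in LM: "blue" is green-listed
+     return random.choices(["red", "blue"], weights=[0.35, 0.65])[0]
+
+ print(classify_colors(fake_watermarked_lm, "the sky is", ["red", "blue"]))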
+
+
+
+
+ + ☆ Understanding Why Label Smoothing Degrades Selective Classification and + How to Fix It + + +
+ Label smoothing (LS) is a popular regularisation method for training deep +neural network classifiers due to its effectiveness in improving test accuracy +and its simplicity in implementation. "Hard" one-hot labels are "smoothed" by +uniformly distributing probability mass to other classes, reducing overfitting. +In this work, we reveal that LS negatively affects selective classification +(SC) - where the aim is to reject misclassifications using a model's predictive +uncertainty. We first demonstrate empirically across a range of tasks and +architectures that LS leads to a consistent degradation in SC. We then explain +this by analysing logit-level gradients, showing that LS exacerbates +overconfidence and underconfidence by regularising the max logit more when the +probability of error is low, and less when the probability of error is high. +This elucidates previously reported experimental results where strong +classifiers underperform in SC. We then demonstrate the empirical effectiveness +of logit normalisation for recovering lost SC performance caused by LS. +Furthermore, based on our gradient analysis, we explain why such normalisation +is effective. We will release our code shortly. + +
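+ A compact illustration of the two ingredients discussed above: label-smoothed
+ cross-entropy, and a logit-normalised confidence score for selective classification
+ (plain L2 logit normalisation here, which may differ in detail from the paper's):
+
+ import torch
+ import torch.nn.functional as F
+
+ def smoothed_cross_entropy(logits, targets, eps=0.1):
+     # (1 - eps) mass on the true class, eps spread uniformly over all classes.
+     n = logits.size(-1)
+     log_probs = F.log_softmax(logits, dim=-1)
+     smooth = torch.full_like(log_probs, eps / n)
+     smooth.scatter_(-1, targets.unsqueeze(-1), 1.0 - eps + eps / n)
+     return -(smooth * log_probs).sum(dim=-1).mean()
+
+ def selective_confidence(logits, normalise=True):
+     # Confidence used to accept/reject a prediction: max softmax probability,
+     # optionally computed on L2-normalised logits.
+     if normalise:
+         logits = logits / (logits.norm(dim=-1, keepdim=True) + 1e-8)
+     return F.softmax(logits, dim=-1).max(dim=-1).values
+
+ logits, targets = torch.randn(4, 10), torch.randint(0, 10, (4,))
+ print(smoothed_cross_entropy(logits, targets).item(), selective_confidence(logits))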
+
+
+
+
+ + ☆ Unveiling the Anomalies in an Ever-Changing World: A Benchmark for + Pixel-Level Anomaly Detection in Continual Learning + + +
+ Anomaly Detection is a relevant problem in numerous real-world applications, +especially when dealing with images. However, little attention has been paid to +the issue of changes over time in the input data distribution, which may cause +a significant decrease in performance. In this study, we investigate the +problem of Pixel-Level Anomaly Detection in the Continual Learning setting, +where new data arrives over time and the goal is to perform well on new and old +data. We implement several state-of-the-art techniques to solve the Anomaly +Detection problem in the classic setting and adapt them to work in the +Continual Learning setting. To validate the approaches, we use a real-world +dataset of images with pixel-based anomalies to provide a reliable benchmark +and serve as a foundation for further advancements in the field. We provide a +comprehensive analysis, discussing which Anomaly Detection methods and which +families of approaches seem more suitable for the Continual Learning setting. + +
+
+
+
+
+ + ♻ ☆ Resolution- and Stimulus-agnostic Super-Resolution of Ultra-High-Field + Functional MRI: Application to Visual Studies + + +
+ High-resolution fMRI provides a window into the brain's mesoscale +organization. Yet, higher spatial resolution requires longer scan times to +compensate for the low signal- and contrast-to-noise ratios. This work introduces +a deep learning-based 3D super-resolution (SR) method for fMRI. By +incorporating a resolution-agnostic image augmentation framework, our method +adapts to varying voxel sizes without retraining. We apply this innovative +technique to localize fine-scale motion-selective sites in the early visual +areas. Detection of these sites typically requires a resolution higher than 1 +mm isotropic, whereas here, we visualize them based on lower resolution (2-3 mm +isotropic) fMRI data. Remarkably, the super-resolved fMRI is able to recover +high-frequency detail of the interdigitated organization of these sites +(relative to the color-selective sites), even with training data sourced from +different subjects and experimental paradigms -- including non-visual +resting-state fMRI, underscoring its robustness and versatility. Quantitative +and qualitative results indicate that our method has the potential to enhance +the spatial resolution of fMRI, leading to a drastic reduction in acquisition +time. + +
+
+ comment: ISBI2024 final version +
+
+
+
+
+ + ♻ ☆ DRESS: Instructing Large Vision-Language Models to Align and Interact + with Humans via Natural Language Feedback CVPR 2024 + + +
+ We present DRESS, a large vision language model (LVLM) that innovatively +exploits Natural Language feedback (NLF) from Large Language Models to enhance +its alignment and interactions by addressing two key limitations in the +state-of-the-art LVLMs. First, prior LVLMs generally rely only on the +instruction finetuning stage to enhance alignment with human preferences. +Without incorporating extra feedback, they are still prone to generating +unhelpful, hallucinated, or harmful responses. Second, while the visual +instruction tuning data is generally structured in a multi-turn dialogue +format, the connections and dependencies among consecutive conversational turns +are weak. This reduces the capacity for effective multi-turn interactions. To +tackle these, we propose a novel categorization of the NLF into two key types: +critique and refinement. The critique NLF identifies the strengths and +weaknesses of the responses and is used to align the LVLMs with human +preferences. The refinement NLF offers concrete suggestions for improvement and +is adopted to improve the interaction ability of the LVLMs -- which focuses on +LVLMs' ability to refine responses by incorporating feedback in multi-turn +interactions. To address the non-differentiable nature of NLF, we generalize +conditional reinforcement learning for training. Our experimental results +demonstrate that DRESS can generate more helpful (9.76%), honest (11.52%), and +harmless (21.03%) responses, and more effectively learn from feedback during +multi-turn interactions compared to SOTA LVLMs. + +
+
+ comment: CVPR 2024. The feedback datasets are released at: + https://huggingface.co/datasets/YangyiYY/LVLM_NLF +
+
+
+
+
+ + ♻ ☆ EscherNet: A Generative Model for Scalable View Synthesis CVPR2024 + + +
+ We introduce EscherNet, a multi-view conditioned diffusion model for view +synthesis. EscherNet learns implicit and generative 3D representations coupled +with a specialised camera positional encoding, allowing precise and continuous +relative control of the camera transformation between an arbitrary number of +reference and target views. EscherNet offers exceptional generality, +flexibility, and scalability in view synthesis -- it can generate more than 100 +consistent target views simultaneously on a single consumer-grade GPU, despite +being trained with a fixed number of 3 reference views to 3 target views. As a +result, EscherNet not only addresses zero-shot novel view synthesis, but also +naturally unifies single- and multi-image 3D reconstruction, combining these +diverse tasks into a single, cohesive framework. Our extensive experiments +demonstrate that EscherNet achieves state-of-the-art performance in multiple +benchmarks, even when compared to methods specifically tailored for each +individual problem. This remarkable versatility opens up new directions for +designing scalable neural architectures for 3D vision. Project page: +https://kxhit.github.io/EscherNet. + +
+
+ comment: CVPR2024 Project Page: https://kxhit.github.io/EscherNet +
+
+
+
+
+ + ♻ ☆ Best of Both Worlds: Hybrid SNN-ANN Architecture for Event-based Optical + Flow Estimation + + +
+ In the field of robotics, event-based cameras are emerging as a promising +low-power alternative to traditional frame-based cameras for capturing +high-speed motion and high dynamic range scenes. This is due to their sparse +and asynchronous event outputs. Spiking Neural Networks (SNNs) with their +asynchronous event-driven compute, show great potential for extracting the +spatio-temporal features from these event streams. In contrast, the standard +Analog Neural Networks (ANNs) fail to process event data effectively. However, +training SNNs is difficult due to additional trainable parameters (thresholds +and leaks), vanishing spikes at deeper layers, and a non-differentiable binary +activation function. Furthermore, an additional data structure, membrane +potential, responsible for keeping track of temporal information, must be +fetched and updated at every timestep in SNNs. To overcome these challenges, we +propose a novel SNN-ANN hybrid architecture that combines the strengths of +both. Specifically, we leverage the asynchronous compute capabilities of SNN +layers to effectively extract the input temporal information. Concurrently, the +ANN layers facilitate training and efficient hardware deployment on traditional +machine learning hardware such as GPUs. We provide extensive experimental +analysis for assigning each layer to be spiking or analog, leading to a network +configuration optimized for performance and ease of training. We evaluate our +hybrid architecture for optical flow estimation on DSEC-flow and Multi-Vehicle +Stereo Event-Camera (MVSEC) datasets. On the DSEC-flow dataset, the hybrid +SNN-ANN architecture achieves a 40% reduction in average endpoint error (AEE) +with 22% lower energy consumption compared to Full-SNN, and 48% lower AEE +compared to Full-ANN, while maintaining comparable energy usage. + +
+
+
+
+
+ + ♻ ☆ Exploring Facial Expression Recognition through Semi-Supervised + Pretraining and Temporal Modeling + + +
+ Facial Expression Recognition (FER) plays a crucial role in computer vision +and finds extensive applications across various fields. This paper aims to +present our approach for the upcoming 6th Affective Behavior Analysis +in-the-Wild (ABAW) competition, scheduled to be held at CVPR2024. In the facial +expression recognition task, the limited size of the FER dataset poses a +challenge to the expression recognition model's generalization ability, +resulting in subpar recognition performance. To address this problem, we employ +a semi-supervised learning technique to generate expression category +pseudo-labels for unlabeled face data. At the same time, we uniformly sample +the labeled facial expression samples and implement a debiased feedback +learning strategy to address the problem of category imbalance in the dataset +and the possible data bias in semi-supervised learning. Moreover, to further +compensate for the limitation and bias of features obtained only from static +images, we introduce a Temporal Encoder to learn and capture temporal +relationships between neighbouring expression image features. In the 6th ABAW +competition, our method achieved outstanding results on the official validation +set, a result that fully confirms the effectiveness and competitiveness of our +proposed method. + +
+
+
+
+
+ + ♻ ☆ Align before Adapt: Leveraging Entity-to-Region Alignments for + Generalizable Video Action Recognition CVPR 2024 + + +
+ Large-scale visual-language pre-trained models have achieved significant +success in various video tasks. However, most existing methods follow an "adapt +then align" paradigm, which adapts pre-trained image encoders to model +video-level representations and utilizes one-hot or text embedding of the +action labels for supervision. This paradigm overlooks the challenge of mapping +from static images to complicated activity concepts. In this paper, we propose +a novel "Align before Adapt" (ALT) paradigm. Prior to adapting to video +representation learning, we exploit the entity-to-region alignments for each +frame. The alignments are fulfilled by matching the region-aware image +embeddings to an offline-constructed text corpus. With the aligned entities, we +feed their text embeddings to a transformer-based video adapter as the queries, +which can help extract the semantics of the most important entities from a +video to a vector. This paradigm reuses the visual-language alignment of VLP +during adaptation and tries to explain an action by the underlying entities. +This helps understand actions by bridging the gap with complex activity +semantics, particularly when facing unfamiliar or unseen categories. ALT +demonstrates competitive performance while maintaining remarkably low +computational costs. In fully supervised experiments, it achieves 88.1% top-1 +accuracy on Kinetics-400 with only 4947 GFLOPs. Moreover, ALT outperforms the +previous state-of-the-art methods in both zero-shot and few-shot experiments, +emphasizing its superior generalizability across various learning scenarios. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Vertical Federated Image Segmentation + + +
+ With the popularization of AI solutions for image-based problems, there has +been a growing concern for both data privacy and acquisition. In a large number +of cases, information is located on separate data silos and it can be difficult +for a developer to consolidate all of it in a fashion that is appropriate for +machine learning model development. Alongside this, a portion of these +localized data regions may not have access to a labelled ground truth. This +indicates that they have the capacity to reach conclusions numerically, but are +not able to assign classifications amid a lack of pertinent information. Such a +determination is often negligible, especially when attempting to develop image-based +solutions that often necessitate this capability. With this being the +case, we propose an innovative vertical federated learning (VFL) model +architecture that can operate under this common set of conditions. This is the +first (and currently the only) implementation of a system that can work under +the constraints of a VFL environment and perform image segmentation while +maintaining nominal accuracies. We achieved this by utilizing an FCN that +boasts the ability to operate on federates that lack labelled data and +privately share the respective weights with a central server, which +hosts the necessary features for classification. Tests were conducted on the +CamVid dataset in order to determine the impact of heavy feature compression +required for the transfer of information between federates, as well as to reach +nominal conclusions about the overall performance metrics when working under +such constraints. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Self-Supervised Learning for Image Super-Resolution and Deblurring + + +
+ Self-supervised methods have recently proved to be nearly as effective as +supervised methods in various imaging inverse problems, paving the way for +learning-based methods in scientific and medical imaging applications where +ground truth data is hard or expensive to obtain. This is the case in magnetic +resonance imaging and computed tomography. These methods critically rely on +invariance to translations and/or rotations of the image distribution to learn +from incomplete measurement data alone. However, existing approaches fail to +obtain competitive performance in the problems of image super-resolution and +deblurring, which play a key role in most imaging systems. In this work, we +show that invariance to translations and rotations is insufficient to learn +from measurements that only contain low-frequency information. Instead, we +propose a new self-supervised approach that leverages the fact that many image +distributions are approximately scale-invariant, and that enables recovering +high-frequency information lost in the measurement process. We demonstrate +through a series of experiments on real datasets that the proposed method +outperforms other self-supervised approaches, and obtains performance on par +with fully supervised learning. + +
+
+
+
+
+ + ♻ ☆ SmartRefine: A Scenario-Adaptive Refinement Framework for Efficient + Motion Prediction CVPR 2024 + + +
+ Predicting the future motion of surrounding agents is essential for +autonomous vehicles (AVs) to operate safely in dynamic, human-robot-mixed +environments. Context information, such as road maps and surrounding agents' +states, provides crucial geometric and semantic information for motion behavior +prediction. To this end, recent works explore two-stage prediction frameworks +where coarse trajectories are first proposed, and then used to select critical +context information for trajectory refinement. However, they either incur a +large amount of computation or bring limited improvement, if not both. In this +paper, we introduce a novel scenario-adaptive refinement strategy, named +SmartRefine, to refine prediction with minimal additional computation. +Specifically, SmartRefine can comprehensively adapt refinement configurations +based on each scenario's properties, and smartly chooses the number of +refinement iterations by introducing a quality score to measure the prediction +quality and remaining refinement potential of each scenario. SmartRefine is +designed as a generic and flexible approach that can be seamlessly integrated +into most state-of-the-art motion prediction models. Experiments on Argoverse +(1 & 2) show that our method consistently improves the prediction accuracy of +multiple state-of-the-art prediction models. Specifically, by adding +SmartRefine to QCNet, we outperform all published ensemble-free works on the +Argoverse 2 leaderboard (single agent track) at submission. Comprehensive +studies are also conducted to ablate design choices and explore the mechanism +behind multi-iteration refinement. Codes are available at +https://github.com/opendilab/SmartRefine/ + +
+
+ comment: Camera-ready version for CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SynCDR : Training Cross Domain Retrieval Models with Synthetic Data + + +
+ In cross-domain retrieval, a model is required to identify images from the +same semantic category across two visual domains. For instance, given a sketch +of an object, a model needs to retrieve a real image of it from an online +store's catalog. A standard approach for such a problem is learning a feature +space of images where Euclidean distances reflect similarity. Even without +human annotations, which may be expensive to acquire, prior methods function +reasonably well using unlabeled images for training. Our problem constraint +takes this further to scenarios where the two domains do not necessarily share +any common categories in training data. This can occur when the two domains in +question come from different versions of some biometric sensor recording +identities of different people. We posit a simple solution, which is to +generate synthetic data to fill in these missing category examples across +domains. This, we do via category preserving translation of images from one +visual domain to another. We compare approaches specifically trained for this +translation for a pair of domains, as well as those that can use large-scale +pre-trained text-to-image diffusion models via prompts, and find that the +latter can generate better replacement synthetic data, leading to more accurate +cross-domain retrieval models. Our best SynCDR model can outperform prior art +by up to 15\%. Code for our work is available at +https://github.com/samarth4149/SynCDR . + +
+
+ comment: Pre-print +
+
+
+
+
+ + ♻ ☆ Generic 3D Diffusion Adapter Using Controlled Multi-View Editing + + +
+ Open-domain 3D object synthesis has been lagging behind image synthesis due +to limited data and higher computational complexity. To bridge this gap, recent +works have investigated multi-view diffusion but often fall short in either 3D +consistency, visual quality, or efficiency. This paper proposes MVEdit, which +functions as a 3D counterpart of SDEdit, employing ancestral sampling to +jointly denoise multi-view images and output high-quality textured meshes. +Built on off-the-shelf 2D diffusion models, MVEdit achieves 3D consistency +through a training-free 3D Adapter, which lifts the 2D views of the last +timestep into a coherent 3D representation, then conditions the 2D views of the +next timestep using rendered views, without compromising visual quality. With +an inference time of only 2-5 minutes, this framework achieves a better trade-off +between quality and speed than score distillation. MVEdit is highly versatile +and extendable, with a wide range of applications including text/image-to-3D +generation, 3D-to-3D editing, and high-quality texture synthesis. In +particular, evaluations demonstrate state-of-the-art performance in both +image-to-3D and text-guided texture generation tasks. Additionally, we +introduce a method for fine-tuning 2D latent diffusion models on small 3D +datasets with limited resources, enabling fast low-resolution text-to-3D +initialization. + +
+
+ comment: V2 note: Fix missing acknowledgements. Project page: + https://lakonik.github.io/mvedit +
+
+
+
+
+ + ♻ ☆ BugNIST - a Large Volumetric Dataset for Object Detection under Domain + Shift + + +
+ Domain shift significantly influences the performance of deep learning +algorithms, particularly for object detection within volumetric 3D images. +Annotated training data is essential for deep learning-based object detection. +However, annotating densely packed objects is time-consuming and costly. +Instead, we suggest training models on individually scanned objects, causing a +domain shift between training and detection data. To address this challenge, we +introduce the BugNIST dataset, comprising 9154 micro-CT volumes of 12 bug types +and 388 volumes of tightly packed bug mixtures. This dataset is characterized +by having objects with the same appearance in the source and target domain, +which is uncommon for other benchmark datasets for domain shift. During +training, individual bug volumes labeled by class are utilized, while testing +employs mixtures with center point annotations and bug type labels. Together +with the dataset, we provide a baseline detection analysis, aiming at advancing +the field of 3D object detection methods. + +
+
+ comment: 20 pages, 6 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ MM1: Methods, Analysis & Insights from Multimodal LLM Pre-training + + +
+ In this work, we discuss building performant Multimodal Large Language Models +(MLLMs). In particular, we study the importance of various architecture +components and data choices. Through careful and comprehensive ablations of the +image encoder, the vision language connector, and various pre-training data +choices, we identified several crucial design lessons. For example, we +demonstrate that, for large-scale multimodal pre-training, using a careful mix of +image-caption, interleaved image-text, and text-only data is crucial for +achieving state-of-the-art (SOTA) few-shot results across multiple benchmarks, +compared to other published pre-training results. Further, we show that the +image encoder, together with the image resolution and the image token count, has a +substantial impact, while the vision-language connector design is of +comparatively negligible importance. By scaling up the presented recipe, we +build MM1, a family of multimodal models up to 30B parameters, including both +dense models and mixture-of-experts (MoE) variants, that are SOTA in +pre-training metrics and achieve competitive performance after supervised +fine-tuning on a range of established multimodal benchmarks. Thanks to +large-scale pre-training, MM1 enjoys appealing properties such as enhanced +in-context learning and multi-image reasoning, enabling few-shot +chain-of-thought prompting. + +
+
+
+
+
+ + ♻ ☆ Cross-Domain Few-Shot Object Detection via Enhanced Open-Set Object + Detector + + +
+ This paper studies the challenging cross-domain few-shot object detection +(CD-FSOD), aiming to develop an accurate object detector for novel domains with +minimal labeled examples. While transformer-based open-set detectors, such as +DE-ViT, show promise in traditional few-shot object detection, their +generalization to CD-FSOD remains unclear: 1) can such open-set detection +methods easily generalize to CD-FSOD? 2) If not, how can models be enhanced +when facing huge domain gaps? To answer the first question, we employ measures +including style, inter-class variance (ICV), and indefinable boundaries (IB) to +understand the domain gap. Based on these measures, we establish a new +benchmark named CD-FSOD to evaluate object detection methods, revealing that +most of the current approaches fail to generalize across domains. Technically, +we observe that the performance decline is associated with our proposed +measures: style, ICV, and IB. Consequently, we propose several novel modules to +address these issues. First, the learnable instance features align initial +fixed instances with target categories, enhancing feature distinctiveness. +Second, the instance reweighting module assigns higher importance to +high-quality instances with slight IB. Third, the domain prompter encourages +features resilient to different styles by synthesizing imaginary domains +without altering semantic contents. These techniques collectively contribute to +the development of the Cross-Domain Vision Transformer for CD-FSOD (CD-ViTO), +significantly improving upon the base DE-ViT. Experimental results validate the +efficacy of our model. All datasets, codes, and models will be released to the +community. + +
+
+
+
+
+ + ♻ ☆ MambaMIR: An Arbitrary-Masked Mamba for Joint Medical Image + Reconstruction and Uncertainty Estimation + + +
+ The recent Mamba model has shown remarkable adaptability for visual +representation learning, including in medical imaging tasks. This study +introduces MambaMIR, a Mamba-based model for medical image reconstruction, as +well as its Generative Adversarial Network-based variant, MambaMIR-GAN. Our +proposed MambaMIR inherits several advantages, such as linear complexity, +global receptive fields, and dynamic weights, from the original Mamba model. +The innovative arbitrary-mask mechanism effectively adapts Mamba to our image +reconstruction task, providing randomness for subsequent Monte Carlo-based +uncertainty estimation. Experiments conducted on various medical image +reconstruction tasks, including fast MRI and SVCT, which cover anatomical +regions such as the knee, chest, and abdomen, have demonstrated that MambaMIR +and MambaMIR-GAN achieve comparable or superior reconstruction results relative +to state-of-the-art methods. Additionally, the estimated uncertainty maps offer +further insights into the reliability of the reconstruction quality. The code +is publicly available at https://github.com/ayanglab/MambaMIR. + +
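+ The Monte Carlo-style uncertainty estimate mentioned above can be sketched
+ generically: reconstruct the same input several times under independently drawn
+ random masks and take the pixel-wise standard deviation (the model and masking
+ scheme below are placeholders, not MambaMIR itself):
+
+ import torch
+
+ @torch.no_grad()
+ def mc_uncertainty(model, x, n_samples=8, mask_ratio=0.3):
+     # Returns (mean reconstruction, pixel-wise std) over n_samples random masks.
+     recons = []
+     for _ in range(n_samples):
+         mask = (torch.rand_like(x) > mask_ratio).float()  # keep ~70% of entries
+         recons.append(model(x * mask))
+     recons = torch.stack(recons)
+     return recons.mean(0), recons.std(0)
+
+ model = torch.nn.Conv2d(1, 1, kernel_size=3, padding=1)  # placeholder network
+ mean, std = mc_uncertainty(model, torch.randn(2, 1, 64, 64))
+ print(mean.shape, std.mean().item())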
+
+
+
+
+ + ♻ ☆ Low-power, Continuous Remote Behavioral Localization with Event Cameras + + +
+ Researchers in natural science need reliable methods for quantifying animal +behavior. Recently, numerous computer vision methods emerged to automate the +process. However, observing wild species at remote locations remains a +challenging task due to difficult lighting conditions and constraints on power +supply and data storage. Event cameras offer unique advantages for +battery-dependent remote monitoring due to their low power consumption and high +dynamic range capabilities. We use this novel sensor to quantify a behavior in +Chinstrap penguins called ecstatic display. We formulate the problem as a +temporal action detection task, determining the start and end times of the +behavior. For this purpose, we recorded a colony of breeding penguins in +Antarctica for several weeks and labeled event data on 16 nests. The developed +method consists of a generator of candidate time intervals (proposals) and a +classifier of the actions within them. The experiments show that the event +cameras' natural response to motion is effective for continuous behavior +monitoring and detection, reaching a mean average precision (mAP) of 58% (which +increases to 63% in good weather conditions). The results also demonstrate the +robustness against various lighting conditions contained in the challenging +dataset. The low-power capabilities of the event camera allow it to record +significantly longer than with a conventional camera. This work pioneers the +use of event cameras for remote wildlife observation, opening new +interdisciplinary opportunities. https://tub-rip.github.io/eventpenguins/ + +
+
+ comment: 13 pages, 8 figures, 12 tables, Project page: + https://tub-rip.github.io/eventpenguins/ +
+
+
+
+
+ + ♻ ☆ GCT: Graph Co-Training for Semi-Supervised Few-Shot Learning + + +
+ Few-shot learning (FSL), which aims to resolve the problem of data scarcity, has +attracted considerable attention in recent years. A popular FSL framework +contains two phases: (i) the pre-training phase employs the base data to train a +CNN-based feature extractor; (ii) the meta-test phase applies the frozen +feature extractor to novel data (novel data has different categories from base +data) and designs a classifier for recognition. To correct few-shot data +distribution, researchers propose Semi-Supervised Few-Shot Learning (SSFSL) by +introducing unlabeled data. Although SSFSL has been shown to achieve +outstanding performance in the FSL community, there still exists a fundamental +problem: the pre-trained feature extractor cannot adapt to the novel data +flawlessly due to the cross-category setting. Usually, large amounts of noise +are introduced into the novel features. We dub this the Feature-Extractor-Maladaptive +(FEM) problem. To tackle FEM, we make two efforts in this paper. First, we +propose a novel label prediction method, Isolated Graph Learning (IGL). IGL +introduces the Laplacian operator to encode the raw data to graph space, which +helps reduce the dependence on features when classifying, and then projects the +graph representation to label space for prediction. The key point is that IGL +can weaken the negative influence of noise from the feature representation +perspective, and is also flexible enough to independently complete training and +testing procedures, which makes it suitable for SSFSL. Second, we propose Graph +Co-Training (GCT) to tackle this challenge from a multi-modal fusion +perspective by extending the proposed IGL to the co-training framework. GCT is +a semi-supervised method that exploits the unlabeled samples with two modal +features to mutually strengthen the IGL classifier. + +
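+ As a rough illustration of graph-based label prediction in the spirit of IGL (a
+ generic closed-form label-propagation sketch, not the authors' exact formulation):
+
+ import numpy as np
+
+ def propagate_labels(X, y, n_labeled, alpha=0.9, sigma=1.0):
+     # X: (N, D) features; y: integer labels for the first n_labeled rows.
+     d2 = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
+     W = np.exp(-d2 / (2 * sigma ** 2))
+     np.fill_diagonal(W, 0.0)
+     D_inv_sqrt = np.diag(1.0 / np.sqrt(W.sum(1) + 1e-12))
+     S = D_inv_sqrt @ W @ D_inv_sqrt                 # normalised affinity
+     Y = np.zeros((X.shape[0], y.max() + 1))
+     Y[np.arange(n_labeled), y] = 1.0                # one-hot seed labels
+     F = np.linalg.solve(np.eye(X.shape[0]) - alpha * S, Y)
+     return F.argmax(1)
+
+ rng = np.random.default_rng(0)
+ c0, c1 = rng.normal(0.0, 0.3, (20, 2)), rng.normal(3.0, 0.3, (20, 2))
+ X = np.vstack([c0[:1], c1[:1], c0[1:], c1[1:]])     # one labeled shot per class
+ print(propagate_labels(X, np.array([0, 1]), n_labeled=2))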
+
+
+
+
+ + ♻ ☆ BA-SAM: Scalable Bias-Mode Attention Mask for Segment Anything Model + + +
+ In this paper, we address the challenge of image resolution variation for the +Segment Anything Model (SAM). SAM, known for its zero-shot generalizability, +exhibits a performance degradation when faced with datasets with varying image +sizes. Previous approaches tend to resize the image to a fixed size or adopt +structure modifications, hindering the preservation of SAM's rich prior +knowledge. Besides, such task-specific tuning necessitates a complete +retraining of the model, which is costly and unacceptable for +deployment in downstream tasks. In this paper, we reformulate this issue as +a length extrapolation problem, where token sequence length varies while +maintaining a consistent patch size for images of different sizes. To this end, +we propose Scalable Bias-Mode Attention Mask (BA-SAM) to enhance SAM's +adaptability to varying image resolutions while eliminating the need for +structure modifications. Firstly, we introduce a new scaling factor to ensure +consistent magnitude in the attention layer's dot product values when the token +sequence length changes. Secondly, we present a bias-mode attention mask that +allows each token to prioritize neighboring information, mitigating the impact +of untrained distant information. Our BA-SAM demonstrates efficacy in two +scenarios: zero-shot and fine-tuning. Extensive evaluation on diverse datasets, +including DIS5K, DUTS, ISIC, COD10K, and COCO, reveals its ability to +significantly mitigate performance degradation in the zero-shot setting and +achieve state-of-the-art performance with minimal fine-tuning. Furthermore, we +propose a generalized model and benchmark, showcasing BA-SAM's generalizability +across all four datasets simultaneously. + +
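+ One plausible reading of the two components above, sketched as generic attention
+ with a length-aware scale and a neighbour-favouring additive bias (both the
+ log-length scaling and the linear distance penalty are illustrative choices, not
+ necessarily BA-SAM's exact formulation):
+
+ import math
+ import torch
+
+ def biased_attention(q, k, v, slope=0.05):
+     # q, k, v: (B, N, D). Length-aware scaling keeps score magnitudes comparable
+     # across sequence lengths; the bias penalises attention to distant tokens.
+     n, d = q.shape[1], q.shape[2]
+     scale = math.log(n) / (math.log(512) * math.sqrt(d))   # ~1/sqrt(d) at n=512
+     scores = (q @ k.transpose(-2, -1)) * scale
+     idx = torch.arange(n)
+     bias = -slope * (idx[None, :] - idx[:, None]).abs().float()
+     return torch.softmax(scores + bias, dim=-1) @ v
+
+ x = torch.randn(2, 196, 64)                  # e.g. 14x14 patch tokens
+ print(biased_attention(x, x, x).shape)       # (2, 196, 64)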
+
+ comment: Code:https://github.com/zongzi13545329/BA-SAM +
+
+
+
+
+ + ♻ ☆ WoVoGen: World Volume-aware Diffusion for Controllable Multi-camera + Driving Scene Generation + + +
+ Generating multi-camera street-view videos is critical for augmenting +autonomous driving datasets, addressing the urgent demand for extensive and +varied data. Due to the limitations in diversity and challenges in handling +lighting conditions, traditional rendering-based methods are increasingly being +supplanted by diffusion-based methods. However, a significant challenge in +diffusion-based methods is ensuring that the generated sensor data preserve +both intra-world consistency and inter-sensor coherence. To address these +challenges, we incorporate an additional explicit world volume and propose the +World Volume-aware Multi-camera Driving Scene Generator (WoVoGen). This system +is specifically designed to leverage 4D world volume as a foundational element +for video generation. Our model operates in two distinct phases: (i) +envisioning the future 4D temporal world volume based on vehicle control +sequences, and (ii) generating multi-camera videos, informed by this envisioned +4D temporal world volume and sensor interconnectivity. The incorporation of the +4D world volume empowers WoVoGen not only to generate high-quality street-view +videos in response to vehicle control inputs but also to facilitate scene +editing tasks. + +
+
+
+
+
+ + ♻ ☆ MuSHRoom: Multi-Sensor Hybrid Room Dataset for Joint 3D Reconstruction + and Novel View Synthesis + + +
+ Metaverse technologies demand accurate, real-time, and immersive modeling on +consumer-grade hardware for both non-human perception (e.g., +drone/robot/autonomous car navigation) and immersive technologies like AR/VR, +requiring both structural accuracy and photorealism. However, there exists a +knowledge gap in how to apply geometric reconstruction and photorealism +modeling (novel view synthesis) in a unified framework. To address this gap and +promote the development of robust and immersive modeling and rendering with +consumer-grade devices, we propose a real-world Multi-Sensor Hybrid Room +Dataset (MuSHRoom). Our dataset presents exciting challenges and requires +state-of-the-art methods to be cost-effective, robust to noisy data and +devices, and able to jointly learn 3D reconstruction and novel view synthesis +instead of treating them as separate tasks, making them ideal for real-world +applications. We benchmark several well-known pipelines on our dataset for joint 3D +mesh reconstruction and novel view synthesis. Our dataset and benchmark show +great potential for promoting improvements in fusing 3D reconstruction and +high-quality rendering in a robust and computationally efficient end-to-end +fashion. The dataset and code are available at the project website: +https://xuqianren.github.io/publications/MuSHRoom/. + +
+
+
+
+
+ + ♻ ☆ 3D Scene Creation and Rendering via Rough Meshes: A Lighting Transfer + Avenue + + +
+ This paper studies how to flexibly integrate reconstructed 3D models into +practical 3D modeling pipelines such as 3D scene creation and rendering. Due to +the technical difficulty, one can only obtain rough 3D models (R3DMs) for most +real objects using existing 3D reconstruction techniques. As a result, +physically-based rendering (PBR) would render low-quality images or videos for +scenes that are constructed by R3DMs. One promising solution would be +representing real-world objects as Neural Fields such as NeRFs, which are able +to generate photo-realistic renderings of an object under desired viewpoints. +However, a drawback is that the synthesized views through Neural Fields +Rendering (NFR) cannot reflect the simulated lighting details on R3DMs in PBR +pipelines, especially when object interactions in the 3D scene creation cause +local shadows. To solve this dilemma, we propose a lighting transfer network +(LighTNet) to bridge NFR and PBR, such that they can benefit from each other. +LighTNet reasons about a simplified image composition model, remedies the +uneven surface issue caused by R3DMs, and is empowered by several +perceptual-motivated constraints and a new Lab angle loss which enhances the +contrast between lighting strength and colors. Comparisons demonstrate that +LighTNet is superior in synthesizing impressive lighting, and is promising in +pushing NFR further in practical 3D modeling workflows. + +
+
+ comment: Accepted by IEEE Transactions on Pattern Analysis and Machine + Intelligence (T-PAMI), project page: + http://3d-front-future.github.io/LighTNet +
+
+
+
+
+ + ♻ ☆ P-MapNet: Far-seeing Map Generator Enhanced by both SDMap and HDMap + Priors + + +
+ Autonomous vehicles are gradually entering city roads today, with the help of +high-definition maps (HDMaps). However, the reliance on HDMaps prevents +autonomous vehicles from stepping into regions without this expensive digital +infrastructure. This fact drives many researchers to study online HDMap +generation algorithms, but the performance of these algorithms at far regions +is still unsatisfactory. We present P-MapNet, in which the letter P highlights +the fact that we focus on incorporating map priors to improve model +performance. Specifically, we exploit priors in both SDMap and HDMap. On one +hand, we extract weakly aligned SDMap from OpenStreetMap, and encode it as an +additional conditioning branch. Despite the misalignment challenge, our +attention-based architecture adaptively attends to relevant SDMap skeletons and +significantly improves performance. On the other hand, we exploit a masked +autoencoder to capture the prior distribution of HDMap, which can serve as a +refinement module to mitigate occlusions and artifacts. We benchmark on the +nuScenes and Argoverse2 datasets. Through comprehensive experiments, we show +that: (1) our SDMap prior can improve online map generation performance, using +both rasterized (by up to $+18.73$ $\rm mIoU$) and vectorized (by up to $+8.50$ +$\rm mAP$) output representations. (2) our HDMap prior can improve map +perceptual metrics by up to $6.34\%$. (3) P-MapNet can be switched into +different inference modes that cover different regions of the +accuracy-efficiency trade-off landscape. (4) P-MapNet is a far-seeing solution +that brings larger improvements on longer ranges. Codes and models are publicly +available at https://jike5.github.io/P-MapNet. + +
+
+ comment: Code: https://jike5.github.io/P-MapNet +
+
+
+
+
+ + ♻ ☆ MineDreamer: Learning to Follow Instructions via Chain-of-Imagination + for Simulated-World Control + + +
+ It is a long-standing goal to design a generalist embodied agent that can +follow diverse instructions in human-like ways. However, existing approaches +often fail to steadily follow instructions due to difficulties in understanding +abstract and sequential natural language instructions. To this end, we +introduce MineDreamer, an open-ended embodied agent built upon the challenging +Minecraft simulator with an innovative paradigm that enhances +instruction-following ability in low-level control signal generation. +Specifically, MineDreamer is developed on top of recent advances in Multimodal +Large Language Models (MLLMs) and diffusion models, and we employ a +Chain-of-Imagination (CoI) mechanism to envision the step-by-step process of +executing instructions and translating imaginations into more precise visual +prompts tailored to the current state; subsequently, the agent generates +keyboard-and-mouse actions to efficiently achieve these imaginations, steadily +following the instructions at each step. Extensive experiments demonstrate that +MineDreamer follows single and multi-step instructions steadily, significantly +outperforming the best generalist agent baseline and nearly doubling its +performance. Moreover, qualitative analysis of the agent's imaginative ability +reveals its generalization and comprehension of the open world. + +
+
+ comment: Project page: https://sites.google.com/view/minedreamer/main +
+
+
+
+
+ + ♻ ☆ TiC-CLIP: Continual Training of CLIP Models ICLR 2024 + + +
+ Keeping large foundation models up to date on the latest data is inherently +expensive. To avoid the prohibitive costs of constantly retraining, it is +imperative to \emph{continually} train these models. This problem is +exacerbated by the lack of any large-scale continual learning benchmarks or +baselines. We introduce the first set of web-scale Time-Continual (TiC) +benchmarks for training vision-language models: TiC-DataComp, TiC-YFCC, and +TiC-Redcaps. TiC-DataComp, our largest dataset, contains over 12.7B timestamped +image-text pairs spanning 9 years (2014--2022). We first use our benchmarks to +curate various \emph{dynamic} evaluations to measure temporal robustness of +existing models. We show OpenAI's CLIP (trained on data up to 2020) loses +$\approx 8\%$ zero-shot accuracy on our curated retrieval task from 2021--2022 +compared with more recently trained models in the OpenCLIP repository. We then +study how to efficiently train models on time-continuous data. We demonstrate +that a simple rehearsal-based approach that continues training from the last +checkpoint and replays old data reduces compute by $2.5\times$ when compared to +the standard practice of retraining from scratch. Code is available at +https://github.com/apple/ml-tic-clip. + +
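+ The rehearsal recipe described above (warm-start from the latest checkpoint and mix
+ replayed old data with new data) can be sketched generically; the toy model, loss,
+ and mixing ratio are assumptions for illustration, not the paper's configuration:
+
+ import random
+ import torch
+ import torch.nn.functional as F
+
+ def continual_update(model, opt, new_data, replay_buffer,
+                      replay_ratio=0.5, steps=20, batch_size=32):
+     # Resume from the current model state; train on a mix of new and replayed data.
+     model.train()
+     for _ in range(steps):
+         n_replay = int(batch_size * replay_ratio) if replay_buffer else 0
+         batch = random.sample(new_data, batch_size - n_replay)
+         if n_replay:
+             batch += random.sample(replay_buffer, n_replay)
+         x = torch.stack([b[0] for b in batch])
+         y = torch.stack([b[1] for b in batch])
+         opt.zero_grad()
+         F.cross_entropy(model(x), y).backward()
+         opt.step()
+     replay_buffer.extend(new_data)           # keep old data for later rounds
+
+ model = torch.nn.Linear(16, 4)               # toy stand-in for the real model
+ opt = torch.optim.SGD(model.parameters(), lr=0.1)
+ make = lambda n: [(torch.randn(16), torch.randint(0, 4, ())) for _ in range(n)]
+ buffer = []
+ for timestep_data in (make(200), make(200)): # two "time steps" of data
+     continual_update(model, opt, timestep_data, buffer)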
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Divide and not forget: Ensemble of selectively trained experts in + Continual Learning ICLR 2024 + + +
+ Class-incremental learning is becoming more popular as it helps models widen
+their applicability while not forgetting what they already know. A trend in
+this area is to use a mixture-of-expert technique, where different models work
+together to solve the task. However, the experts are usually trained all at
+once using whole task data, which makes them all prone to forgetting and
+increases the computational burden. To address this limitation, we introduce a
+novel approach named SEED. SEED selects a single, most suitable expert for the
+task at hand and uses data from this task to fine-tune only this expert. For
+this purpose, each expert represents each class with a Gaussian distribution,
+and the optimal expert is selected based on the similarity of those
+distributions. Consequently, SEED increases diversity and heterogeneity within
+the experts while maintaining the high stability of this ensemble method.
+Extensive experiments demonstrate that SEED achieves state-of-the-art
+performance in exemplar-free settings across various scenarios, showing the
+potential of expert diversification through data in continual learning.
+
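+
+ The expert-selection step described above can be sketched roughly as follows:
+fit a diagonal Gaussian per class in each expert's feature space and pick the
+expert whose class distributions are most separable. The separability criterion
+(mean pairwise KL here) is an assumption for illustration and may differ from
+the paper's exact rule.
+ <pre><code>
+import torch
+
+def gaussian_kl(mu1, var1, mu2, var2):
+    # KL divergence between two diagonal Gaussians
+    return 0.5 * (torch.log(var2 / var1) + (var1 + (mu1 - mu2) ** 2) / var2 - 1).sum()
+
+def select_expert(experts, task_loader, device="cpu"):
+    # Hedged sketch: choose the expert whose feature space keeps the new task's
+    # classes most separated; only that expert is then fine-tuned on the task data.
+    best_idx, best_score = 0, -float("inf")
+    for idx, expert in enumerate(experts):
+        feats, labels = [], []
+        with torch.no_grad():
+            for x, y in task_loader:
+                feats.append(expert(x.to(device)).cpu())
+                labels.append(y)
+        feats, labels = torch.cat(feats), torch.cat(labels)
+        stats = [(feats[labels == c].mean(0), feats[labels == c].var(0) + 1e-6)
+                 for c in labels.unique()]
+        score = sum(gaussian_kl(*a, *b) for i, a in enumerate(stats)
+                    for j, b in enumerate(stats) if i != j)
+        if score > best_score:
+            best_idx, best_score = idx, score
+    return best_idx
+ </code></pre>
+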
+
+ comment: Accepted for ICLR 2024 (main track), code is available at: + https://github.com/grypesc/SEED +
+
+
+
+
+
+ ♻ ☆ Subjective-Aligned Dataset and Metric for Text-to-Video Quality
+ Assessment
+
+
+
+ With the rapid development of generative models, Artificial
+Intelligence-Generated Content (AIGC) has exponentially increased in daily
+life. Among them, Text-to-Video (T2V) generation has received widespread
+attention. Though many T2V models have been released for generating high
+perceptual quality videos, there is still a lack of methods to quantitatively
+evaluate the quality of these videos. To solve this issue, we establish the
+largest-scale Text-to-Video Quality Assessment DataBase (T2VQA-DB) to date. The
+dataset is composed of 10,000 videos generated by 9 different T2V models. We
+also conduct a subjective study to obtain each video's corresponding mean
+opinion score. Based on T2VQA-DB, we propose a novel transformer-based model
+for subjective-aligned Text-to-Video Quality Assessment (T2VQA). The model
+extracts features from text-video alignment and video fidelity perspectives,
+and then leverages the ability of a large language model to give the prediction
+score. Experimental results show that T2VQA outperforms existing T2V metrics
+and SOTA video quality assessment models. Quantitative analysis indicates that
+T2VQA is capable of giving subjective-aligned predictions, validating its
+effectiveness. The dataset and code will be released at
+https://github.com/QMME/T2VQA.
+
+
+
+
+
+ + ♻ ☆ PopulAtion Parameter Averaging (PAPA) + + +
+ Ensemble methods combine the predictions of multiple models to improve
+performance, but they require significantly higher computation costs at
+inference time. To avoid these costs, multiple neural networks can be combined
+into one by averaging their weights. However, this usually performs
+significantly worse than ensembling. Weight averaging is only beneficial when
+the models are different enough to benefit from combining them, but similar
+enough to average well. Based on this idea, we propose PopulAtion Parameter
+Averaging (PAPA): a method that combines the generality of ensembling with the
+efficiency of weight averaging. PAPA leverages a population of diverse models
+(trained on different data orders, augmentations, and regularizations) while
+slowly pushing the weights of the networks toward the population average of the
+weights. We also propose PAPA variants (PAPA-all and PAPA-2) that average
+weights rarely rather than continuously; all methods increase generalization,
+but PAPA tends to perform best. PAPA reduces the performance gap between
+averaging and ensembling, increasing the average accuracy of a population of
+models by up to 0.8% on CIFAR-10, 1.9% on CIFAR-100, and 1.6% on ImageNet when
+compared to training independent (non-averaged) models.
+
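+
+ The core update can be sketched in a few lines: every step (or every few
+steps), each member of the population is nudged toward the parameter-wise
+population average. The push rate and schedule below are illustrative
+assumptions, not the paper's hyperparameters.
+ <pre><code>
+import torch
+
+@torch.no_grad()
+def papa_step(models, rate=1e-3):
+    # Nudge every network's parameters toward the population average of the weights.
+    params = [list(m.parameters()) for m in models]
+    for layer in zip(*params):
+        avg = torch.stack([p.data for p in layer]).mean(dim=0)
+        for p in layer:
+            p.data.lerp_(avg, rate)  # p <- (1 - rate) * p + rate * avg
+
+# Usage: after each optimizer step on each model (trained with its own data order
+# and augmentations), call papa_step(models) to slowly pull the population together.
+ </code></pre>
+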
+
+ comment: Blog post: https://ajolicoeur.wordpress.com/papa/, Code: + https://github.com/SamsungSAILMontreal/PAPA, TMLR journal publication: + https://openreview.net/forum?id=cPDVjsOytS +
+
+
+
+
+ + ♻ ☆ Bidirectional Temporal Diffusion Model for Temporally Consistent Human + Animation + + +
+ We introduce a method to generate temporally coherent human animation from a
+single image, a video, or random noise. This problem has often been formulated
+as auto-regressive generation, i.e., regressing past frames to decode future
+frames. However, such unidirectional generation is highly prone to motion
+drifting over time, generating unrealistic human animation with significant
+artifacts such as appearance distortion. We claim that bidirectional temporal
+modeling enforces temporal coherence on a generative network by largely
+suppressing the motion ambiguity of human appearance. To prove our claim, we
+design a novel human animation framework using a denoising diffusion model: a
+neural network learns to generate the image of a person by denoising temporal
+Gaussian noises whose intermediate results are cross-conditioned
+bidirectionally between consecutive frames. In the experiments, our method
+demonstrates strong performance compared to existing unidirectional approaches,
+with realistic temporal coherence.
+
+
+
+
+
+ + ♻ ☆ Meta-Prompting for Automating Zero-shot Visual Recognition with LLMs + + +
+ Prompt ensembling of Large Language Model (LLM) generated category-specific
+prompts has emerged as an effective method to enhance the zero-shot recognition
+ability of Vision-Language Models (VLMs). To obtain these category-specific
+prompts, current methods rely on hand-crafting the prompts given to the LLMs
+for generating VLM prompts for the downstream tasks. However, this requires
+manually composing task-specific prompts, and even then they might not cover
+the diverse set of visual concepts and task-specific styles associated with the
+categories of interest. To effectively take humans out of the loop and
+completely automate the prompt generation process for zero-shot recognition, we
+propose Meta-Prompting for Visual Recognition (MPVR). Taking as input only
+minimal information about the target task, in the form of its short natural
+language description, and a list of associated class labels, MPVR automatically
+produces a diverse set of category-specific prompts resulting in a strong
+zero-shot classifier. MPVR generalizes effectively across various popular
+zero-shot image recognition benchmarks belonging to widely different domains
+when tested with multiple LLMs and VLMs. For example, MPVR obtains a zero-shot
+recognition improvement over CLIP by up to 19.8% and 18.2% (5.0% and 4.5% on
+average over 20 datasets) leveraging GPT and Mixtral LLMs, respectively.
+
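+
+ The overall recipe can be sketched as follows, where llm_generate and
+encode_text are placeholders for whichever LLM and VLM text encoder are used;
+the meta-prompt wording is an illustrative assumption, not MPVR's actual
+template.
+ <pre><code>
+import torch
+import torch.nn.functional as F
+
+def build_zero_shot_classifier(task_description, class_names, llm_generate, encode_text):
+    # llm_generate: str -> list[str] (LLM call); encode_text: list[str] -> (N, dim) tensor.
+    weights = []
+    for name in class_names:
+        meta_prompt = (f"Task: {task_description}\n"
+                       f"Write diverse visual descriptions of the category '{name}'.")
+        prompts = llm_generate(meta_prompt)                      # category-specific prompts
+        emb = F.normalize(encode_text(prompts), dim=-1).mean(0)  # ensemble by averaging
+        weights.append(F.normalize(emb, dim=-1))
+    return torch.stack(weights)                                  # (num_classes, dim)
+
+def classify(image_features, classifier_weights):
+    # Cosine similarity between normalized image features and the prompt ensemble.
+    return (F.normalize(image_features, dim=-1) @ classifier_weights.T).argmax(dim=-1)
+ </code></pre>
+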
+
+ comment: Project Page (Code and Data): + https://jmiemirza.github.io/Meta-Prompting/ +
+
+
+
+
+ + ♻ ☆ CPA-Enhancer: Chain-of-Thought Prompted Adaptive Enhancer for Object + Detection under Unknown Degradations + + +
+ Object detection methods under known single degradations have been
+extensively investigated. However, existing approaches require prior knowledge
+of the degradation type and train a separate model for each, limiting their
+practical applications in unpredictable environments. To address this
+challenge, we propose a chain-of-thought (CoT) prompted adaptive enhancer,
+CPA-Enhancer, for object detection under unknown degradations. Specifically,
+CPA-Enhancer progressively adapts its enhancement strategy under the
+step-by-step guidance of CoT prompts that encode degradation-related
+information. To the best of our knowledge, it is the first work that exploits
+CoT prompting for object detection tasks. Overall, CPA-Enhancer is a
+plug-and-play enhancement model that can be integrated into any generic
+detector to achieve substantial gains on degraded images, without prior
+knowledge of the degradation type. Experimental results demonstrate that
+CPA-Enhancer not only sets the new state of the art for object detection but
+also boosts the performance of other downstream vision tasks under unknown
+degradations.
+
+
+
+
+
+ + ♻ ☆ Dysen-VDM: Empowering Dynamics-aware Text-to-Video Diffusion with LLMs CVPR 2024 + + +
+ Text-to-video (T2V) synthesis has gained increasing attention in the
+community, in which the recently emerged diffusion models (DMs) have
+promisingly shown stronger performance than the past approaches. While existing
+state-of-the-art DMs are competent to achieve high-resolution video generation,
+they may largely suffer from key limitations (e.g., action occurrence
+disorders, crude video motions) with respect to intricate temporal dynamics
+modeling, a crux of video synthesis. In this work, we investigate
+strengthening the awareness of video dynamics for DMs, for high-quality T2V
+generation. Inspired by human intuition, we design an innovative dynamic scene
+manager (dubbed Dysen) module, which includes (step-1) extracting from input
+text the key actions with proper time-order arrangement, (step-2) transforming
+the action schedules into the dynamic scene graph (DSG) representations, and
+(step-3) enriching the scenes in the DSG with sufficient and reasonable
+details. Taking advantage of the existing powerful LLMs (e.g., ChatGPT) via
+in-context learning, Dysen realizes (nearly) human-level temporal dynamics
+understanding. Finally, the resulting video DSG with rich action scene details
+is encoded as fine-grained spatio-temporal features, integrated into the
+backbone T2V DM for video generation. Experiments on popular T2V datasets
+suggest that our Dysen-VDM consistently outperforms prior art by significant
+margins, especially in scenarios with complex actions. Codes at
+https://haofei.vip/Dysen-VDM
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ SAMAug: Point Prompt Augmentation for Segment Anything Model + + +
+ This paper introduces SAMAug, a novel visual point augmentation method for +the Segment Anything Model (SAM) that enhances interactive image segmentation +performance. SAMAug generates augmented point prompts to provide more +information about the user's intention to SAM. Starting with an initial point +prompt, SAM produces an initial mask, which is then fed into our proposed +SAMAug to generate augmented point prompts. By incorporating these extra +points, SAM can generate augmented segmentation masks based on both the +augmented point prompts and the initial prompt, resulting in improved +segmentation performance. We conducted evaluations using four different point +augmentation strategies: random sampling, sampling based on maximum difference +entropy, maximum distance, and saliency. Experiment results on the COCO, +Fundus, COVID QUEx, and ISIC2018 datasets show that SAMAug can boost SAM's +segmentation results, especially using the maximum distance and saliency. +SAMAug demonstrates the potential of visual prompt augmentation for computer +vision. Codes of SAMAug are available at github.com/yhydhx/SAMAug + +
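+
+ A hedged sketch of the 'maximum distance' strategy mentioned above: given
+SAM's initial mask, pick the foreground pixel farthest from the initial click
+as the augmented prompt. Tie-breaking and multi-point variants are simplified
+away, so treat this only as an illustration of the idea.
+ <pre><code>
+import numpy as np
+
+def max_distance_point(mask, initial_point):
+    # mask: binary (H, W) array from SAM's first prediction; initial_point: (x, y) click.
+    ys, xs = np.nonzero(mask)                      # foreground pixel coordinates
+    if len(ys) == 0:
+        return initial_point
+    d2 = (ys - initial_point[1]) ** 2 + (xs - initial_point[0]) ** 2
+    i = int(d2.argmax())
+    return (int(xs[i]), int(ys[i]))                # (x, y) of the augmented prompt
+
+# Usage: run SAM once with the initial point to get `mask`, compute the augmented
+# point with max_distance_point, then re-run SAM with both points as prompts.
+ </code></pre>
+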
+
+
+
+
+ + ♻ ☆ EasyEdit: An Easy-to-use Knowledge Editing Framework for Large Language + Models + + +
+ Large Language Models (LLMs) usually suffer from knowledge cutoff or fallacy +issues, which means they are unaware of unseen events or generate text with +incorrect facts owing to outdated/noisy data. To this end, many knowledge +editing approaches for LLMs have emerged -- aiming to subtly inject/edit +updated knowledge or adjust undesired behavior while minimizing the impact on +unrelated inputs. Nevertheless, due to significant differences among various +knowledge editing methods and the variations in task setups, there is no +standard implementation framework available for the community, which hinders +practitioners from applying knowledge editing to applications. To address these +issues, we propose EasyEdit, an easy-to-use knowledge editing framework for +LLMs. It supports various cutting-edge knowledge editing approaches and can be +readily applied to many well-known LLMs such as T5, GPT-J, LlaMA, etc. +Empirically, we report the knowledge editing results on LlaMA-2 with EasyEdit, +demonstrating that knowledge editing surpasses traditional fine-tuning in terms +of reliability and generalization. We have released the source code on GitHub, +along with Google Colab tutorials and comprehensive documentation for beginners +to get started. Besides, we present an online system for real-time knowledge +editing, and a demo video. + +
+
+ comment: Code: https://github.com/zjunlp/EasyEdit HF Demo: + https://huggingface.co/spaces/zjunlp/EasyEdit Video: + https://youtu.be/Gm6T0QaaskU Docs: https://zjunlp.gitbook.io/easyedit +
+
+
+
+
+ + ♻ ☆ DePT: Decoupled Prompt Tuning + + +
+ This work breaks through the Base-New Tradeoff (BNT) dilemma in prompt tuning,
+i.e., the better the tuned model generalizes to the base (or target) task, the
+worse it generalizes to new tasks, and vice versa. Specifically, through an
+in-depth analysis of the learned features of the base and new tasks, we observe
+that the BNT stems from a channel bias issue, i.e., the vast majority of
+feature channels are occupied by base-specific knowledge, resulting in the
+collapse of task-shared knowledge important to new tasks. To address this, we
+propose the Decoupled Prompt Tuning (DePT) framework, which decouples
+base-specific knowledge from feature channels into an isolated feature space
+during prompt tuning, so as to maximally preserve task-shared knowledge in the
+original feature space for achieving better zero-shot generalization on new
+tasks. Importantly, our DePT is orthogonal to existing prompt tuning methods,
+hence it can improve all of them. Extensive experiments on 11 datasets show the
+strong flexibility and effectiveness of DePT. Our code and pretrained models
+are available at https://github.com/Koorye/DePT.
+
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ Urban Sound Propagation: a Benchmark for 1-Step Generative Modeling of + Complex Physical Systems + + +
+ Data-driven modeling of complex physical systems is receiving a growing
+amount of attention in the simulation and machine learning communities. Since
+most physical simulations are based on compute-intensive, iterative
+implementations of differential equation systems, a (partial) replacement with
+learned, 1-step inference models has the potential for significant speedups in
+a wide range of application areas. In this context, we present a novel
+benchmark for the evaluation of 1-step generative learning models in terms of
+speed and physical correctness. Our Urban Sound Propagation benchmark is based
+on the physically complex and practically relevant, yet intuitively
+easy-to-grasp task of modeling the 2D propagation of waves from a sound source
+in an urban environment. We provide a dataset with 100k samples, where each
+sample consists of pairs of real 2D building maps drawn from OpenStreetmap, a
+parameterized sound source, and a simulated ground truth sound propagation for
+the given scene. The dataset provides four different simulation tasks with
+increasing complexity regarding reflection, diffraction and source variance. A
+first baseline evaluation of common generative U-Net, GAN and Diffusion models
+shows that, while these models are well capable of modeling sound propagation
+in simple cases, the approximation of sub-systems represented by higher-order
+equations systematically fails. Information about the dataset, download
+instructions and source code are provided on our website:
+https://www.urban-sound-data.org.
+
+
+
+
+
+ + ♻ ☆ BAD-Gaussians: Bundle Adjusted Deblur Gaussian Splatting + + +
+ While neural rendering has demonstrated impressive capabilities in 3D scene +reconstruction and novel view synthesis, it heavily relies on high-quality +sharp images and accurate camera poses. Numerous approaches have been proposed +to train Neural Radiance Fields (NeRF) with motion-blurred images, commonly +encountered in real-world scenarios such as low-light or long-exposure +conditions. However, the implicit representation of NeRF struggles to +accurately recover intricate details from severely motion-blurred images and +cannot achieve real-time rendering. In contrast, recent advancements in 3D +Gaussian Splatting achieve high-quality 3D scene reconstruction and real-time +rendering by explicitly optimizing point clouds as Gaussian spheres. + In this paper, we introduce a novel approach, named BAD-Gaussians (Bundle +Adjusted Deblur Gaussian Splatting), which leverages explicit Gaussian +representation and handles severe motion-blurred images with inaccurate camera +poses to achieve high-quality scene reconstruction. Our method models the +physical image formation process of motion-blurred images and jointly learns +the parameters of Gaussians while recovering camera motion trajectories during +exposure time. + In our experiments, we demonstrate that BAD-Gaussians not only achieves +superior rendering quality compared to previous state-of-the-art deblur neural +rendering methods on both synthetic and real datasets but also enables +real-time rendering capabilities. + Our project page and source code is available at +https://lingzhezhao.github.io/BAD-Gaussians/ + +
+
+ comment: Project Page and Source Code: + https://lingzhezhao.github.io/BAD-Gaussians/ +
+
+
+
+
+ + ♻ ☆ ScanTalk: 3D Talking Heads from Unregistered Scans + + +
+ Speech-driven 3D talking heads generation has emerged as a significant area +of interest among researchers, presenting numerous challenges. Existing methods +are constrained by animating faces with fixed topologies, wherein point-wise +correspondence is established, and the number and order of points remains +consistent across all identities the model can animate. In this work, we +present ScanTalk, a novel framework capable of animating 3D faces in arbitrary +topologies including scanned data. Our approach relies on the DiffusionNet +architecture to overcome the fixed topology constraint, offering promising +avenues for more flexible and realistic 3D animations. By leveraging the power +of DiffusionNet, ScanTalk not only adapts to diverse facial structures but also +maintains fidelity when dealing with scanned data, thereby enhancing the +authenticity and versatility of generated 3D talking heads. Through +comprehensive comparisons with state-of-the-art methods, we validate the +efficacy of our approach, demonstrating its capacity to generate realistic +talking heads comparable to existing techniques. While our primary objective is +to develop a generic method free from topological constraints, all +state-of-the-art methodologies are bound by such limitations. Code for +reproducing our results, and the pre-trained model will be made available. + +
+
+
+
+
+ + ♻ ☆ PGA: Personalizing Grasping Agents with Single Human-Robot Interaction + + +
+ Language-Conditioned Robotic Grasping (LCRG) aims to develop robots that
+comprehend and grasp objects based on natural language instructions. While the
+ability to understand personal objects like my wallet facilitates more natural
+interaction with human users, current LCRG systems only allow generic language
+instructions, e.g., the black-colored wallet next to the laptop. To this end,
+we introduce a task scenario GraspMine alongside a novel dataset aimed at
+pinpointing and grasping personal objects given personal indicators via
+learning from a single human-robot interaction, rather than a large labeled
+dataset. Our proposed method, Personalized Grasping Agent (PGA), addresses
+GraspMine by leveraging the unlabeled image data of the user's environment,
+called Reminiscence. Specifically, PGA acquires personal object information by
+a user presenting a personal object with its associated indicator, followed by
+PGA inspecting the object by rotating it. Based on the acquired information,
+PGA pseudo-labels objects in the Reminiscence by our proposed label propagation
+algorithm. Harnessing the information acquired from the interactions and the
+pseudo-labeled objects in the Reminiscence, PGA adapts the object grounding
+model to grasp personal objects. This results in significant efficiency,
+whereas previous LCRG systems rely on resource-intensive human annotations --
+necessitating hundreds of labeled examples to learn my wallet. Moreover, PGA
+outperforms baseline methods across all metrics and even shows performance
+comparable to the fully-supervised method, which learns from 9k annotated data
+samples. We further validate PGA's real-world applicability by employing a
+physical robot to execute GraspMine. Code and data are publicly
+available at https://github.com/JHKim-snu/PGA.
+
+
+ comment: 8 pages, under review +
+
+
+
+
+ + ♻ ☆ ECAMP: Entity-centered Context-aware Medical Vision Language + Pre-training + + +
+ Despite significant advancements in medical vision-language pre-training, +existing methods have largely overlooked the inherent entity-specific context +within radiology reports and the complex cross-modality contextual +relationships between text and images. To close this gap, we propose a novel +Entity-centered Context-aware Medical Vision-language Pre-training (ECAMP) +framework, which is designed to enable a more entity-centered and +context-sensitive interpretation of medical data. Utilizing the recent powerful +large language model, we distill entity-centered context from medical reports, +which enables ECAMP to gain more effective supervision from the text modality. +By further pre-training our model with carefully designed entity-aware, +context-enhanced masked language modeling and context-guided super-resolution +tasks, ECAMP significantly refines the interplay between text and image +modalities, leading to an enhanced ability to extract entity-centered +contextual features. Besides, our proposed multi-scale context fusion design +also improves the semantic integration of both coarse and fine-level image +representations, prompting better performance for multi-scale downstream +applications. Combining these components leads to significant performance leaps +over current state-of-the-art methods and establishes a new standard for +cross-modality learning in medical imaging, whose effectiveness is demonstrated +by our extensive experiments on various tasks including classification, +segmentation, and detection across several public datasets. Code and models are +available at https://github.com/ToniChopp/ECAMP. + +
+
+
+
+
+ + ♻ ☆ Goal-conditioned dual-action imitation learning for dexterous dual-arm + robot manipulation + + +
+ Long-horizon dexterous robot manipulation of deformable objects, such as
+banana peeling, is a problematic task because of the difficulties in object
+modeling and a lack of knowledge about stable and dexterous manipulation
+skills. This paper presents a goal-conditioned dual-action (GC-DA) deep
+imitation learning (DIL) approach that can learn dexterous manipulation skills
+using human demonstration data. Previous DIL methods map the current sensory
+input to a reactive action, which often fails because of compounding errors in
+imitation learning caused by the recurrent computation of actions. Our method
+predicts a reactive action only when the precise manipulation of the target
+object is required (local action) and generates the entire trajectory when
+precise manipulation is not required (global action). This dual-action
+formulation effectively prevents compounding errors in imitation learning by
+using the trajectory-based global action while responding to unexpected changes
+in the target object during the reactive local action. The proposed method was
+tested on a real dual-arm robot and successfully accomplished the
+banana-peeling task.
+
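+
+ The dual-action control loop described above can be sketched as below; the
+switching criterion and the policy interface (requires_precision, local_action,
+global_action, done) are hypothetical names used only to illustrate the idea,
+not the paper's implementation.
+ <pre><code>
+def dual_action_control(robot, policy, goal):
+    # Sketch: use an open-loop trajectory (global action) away from the target to
+    # avoid compounding errors, and switch to step-by-step reactive prediction
+    # (local action) when precise manipulation is required.
+    obs = robot.observe()
+    if policy.requires_precision(obs, goal):          # e.g. gripper close to the target
+        while not policy.done(obs, goal):
+            action = policy.local_action(obs, goal)   # reactive, one step at a time
+            obs = robot.step(action)
+    else:
+        trajectory = policy.global_action(obs, goal)  # whole trajectory at once
+        for action in trajectory:
+            obs = robot.step(action)
+    return obs
+ </code></pre>
+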
+
+ comment: 19 pages, published in Transactions on Robotics (T-RO) +
+
+
+
+
+ + ♻ ☆ Towards image compression with perfect realism at ultra-low bitrates + + +
+ Image codecs are typically optimized to trade off bitrate vs. distortion
+metrics. At low bitrates, this leads to compression artefacts which are easily
+perceptible, even when training with perceptual or adversarial losses. To
+improve image quality and remove dependency on the bitrate, we propose to
+decode with iterative diffusion models. We condition the decoding process on a
+vector-quantized image representation, as well as a global image description to
+provide additional context. We dub our model PerCo for 'perceptual
+compression', and compare it to state-of-the-art codecs at rates from 0.1 down
+to 0.003 bits per pixel. The latter rate is more than an order of magnitude
+smaller than those considered in most prior work, compressing a 512x768 Kodak
+image with less than 153 bytes. Despite this ultra-low bitrate, our approach
+maintains the ability to reconstruct realistic images. We find that our model
+leads to reconstructions with state-of-the-art visual quality as measured by
+FID and KID. As predicted by rate-distortion-perception theory, visual quality
+is less dependent on the bitrate than in previous methods.
+
+
+
+
+
+ + ♻ ☆ OCTDL: Optical Coherence Tomography Dataset for Image-Based Deep + Learning Methods + + +
+ Optical coherence tomography (OCT) is a non-invasive imaging technique with +extensive clinical applications in ophthalmology. OCT enables the visualization +of the retinal layers, playing a vital role in the early detection and +monitoring of retinal diseases. OCT uses the principle of light wave +interference to create detailed images of the retinal microstructures, making +it a valuable tool for diagnosing ocular conditions. This work presents an +open-access OCT dataset (OCTDL) comprising over 2000 OCT images labeled +according to disease group and retinal pathology. The dataset consists of OCT +records of patients with Age-related Macular Degeneration (AMD), Diabetic +Macular Edema (DME), Epiretinal Membrane (ERM), Retinal Artery Occlusion (RAO), +Retinal Vein Occlusion (RVO), and Vitreomacular Interface Disease (VID). The +images were acquired with an Optovue Avanti RTVue XR using raster scanning +protocols with dynamic scan length and image resolution. Each retinal b-scan +was acquired by centering on the fovea and interpreted and cataloged by an +experienced retinal specialist. In this work, we applied Deep Learning +classification techniques to this new open-access dataset. + +
+
+
+
+
+ + ♻ ☆ Genixer: Empowering Multimodal Large Language Models as a Powerful Data + Generator + + +
+ Instruction tuning data is essential for training the Multimodal Large +Language Models (MLLMs). However, the creation of high-quality instruction +tuning data presents significant challenges. Prior methods that depended on +GPT-4 for data generation were not only costly but also lacked satisfactory +performance in complex tasks (i.e., grounding-based reasoning tasks). To +address these issues, we developed an innovative data generation pipeline, +Genixer, to generate various high-quality instruction tuning data, including +nine representative tasks, e.g., Common VQA, REC, REG, and PointQ. +Specifically, Genixer provides a unified solution with four key steps for +alleviating the difficulty of data generation: (i) instruction data collection, +(ii) instruction template design, (iii) empowering MLLM, and (iv) data +generation and filtering. Subsequently, the superior qualitative results of our +Genixer demonstrate that current MLLMs have a strong potential to evolve into +powerful data generators. Additionally, to validate the efficacy of generated +data quantitatively, we add the instruction tuning data produced by Genixer +into the training of two representative MLLMs and observe the consistent +improvements on various VQA tasks and multimodal benchmarks. + +
+
+ comment: Technical report +
+
+
+
+
+ + ♻ ☆ GSVA: Generalized Segmentation via Multimodal Large Language Models CVPR2024 + + +
+ Generalized Referring Expression Segmentation (GRES) extends the scope of +classic RES to refer to multiple objects in one expression or identify the +empty targets absent in the image. GRES poses challenges in modeling the +complex spatial relationships of the instances in the image and identifying +non-existing referents. Multimodal Large Language Models (MLLMs) have recently +shown tremendous progress in these complicated vision-language tasks. +Connecting Large Language Models (LLMs) and vision models, MLLMs are proficient +in understanding contexts with visual inputs. Among them, LISA, as a +representative, adopts a special [SEG] token to prompt a segmentation mask +decoder, e.g., SAM, to enable MLLMs in the RES task. However, existing +solutions to GRES remain unsatisfactory since current segmentation MLLMs cannot +correctly handle the cases where users might reference multiple subjects in a +singular prompt or provide descriptions incongruent with any image target. In +this paper, we propose Generalized Segmentation Vision Assistant (GSVA) to +address this gap. Specifically, GSVA reuses the [SEG] token to prompt the +segmentation model towards supporting multiple mask references simultaneously +and innovatively learns to generate a [REJ] token to reject the null targets +explicitly. Experiments validate GSVA's efficacy in resolving the GRES issue, +marking a notable enhancement and setting a new record on the GRES benchmark +gRefCOCO dataset. GSVA also proves effective across various classic referring +segmentation and comprehension tasks. + +
+
+ comment: Accepted by CVPR2024 (19 pages, 9 figures, 11 tables) +
+
+
+
+
+ + ♻ ☆ DynamicGlue: Epipolar and Time-Informed Data Association in Dynamic + Environments using Graph Neural Networks + + +
+ The assumption of a static environment is common in many geometric computer +vision tasks like SLAM but limits their applicability in highly dynamic scenes. +Since these tasks rely on identifying point correspondences between input +images within the static part of the environment, we propose a graph neural +network-based sparse feature matching network designed to perform robust +matching under challenging conditions while excluding keypoints on moving +objects. We employ a similar scheme of attentional aggregation over graph edges +to enhance keypoint representations as state-of-the-art feature-matching +networks but augment the graph with epipolar and temporal information and +vastly reduce the number of graph edges. Furthermore, we introduce a +self-supervised training scheme to extract pseudo labels for image pairs in +dynamic environments from exclusively unprocessed visual-inertial data. A +series of experiments show the superior performance of our network as it +excludes keypoints on moving objects compared to state-of-the-art feature +matching networks while still achieving similar results regarding conventional +matching metrics. When integrated into a SLAM system, our network significantly +improves performance, especially in highly dynamic scenes. + +
+
+
+
+
+ + ♻ ☆ GVA: Reconstructing Vivid 3D Gaussian Avatars from Monocular Videos + + +
+ In this paper, we present a novel method that facilitates the creation of +vivid 3D Gaussian avatars from monocular video inputs (GVA). Our innovation +lies in addressing the intricate challenges of delivering high-fidelity human +body reconstructions and aligning 3D Gaussians with human skin surfaces +accurately. The key contributions of this paper are twofold. Firstly, we +introduce a pose refinement technique to improve hand and foot pose accuracy by +aligning normal maps and silhouettes. Precise pose is crucial for correct shape +and appearance reconstruction. Secondly, we address the problems of unbalanced +aggregation and initialization bias that previously diminished the quality of +3D Gaussian avatars, through a novel surface-guided re-initialization method +that ensures accurate alignment of 3D Gaussian points with avatar surfaces. +Experimental results demonstrate that our proposed method achieves +high-fidelity and vivid 3D Gaussian avatar reconstruction. Extensive +experimental analyses validate the performance qualitatively and +quantitatively, demonstrating that it achieves state-of-the-art performance in +photo-realistic novel view synthesis while offering fine-grained control over +the human body and hand pose. Project page: https://3d-aigc.github.io/GVA/. + +
+
+
+
+
+ + ♻ ☆ DyBluRF: Dynamic Neural Radiance Fields from Blurry Monocular Video CVPR 2024 + + +
+ Recent advancements in dynamic neural radiance field methods have yielded +remarkable outcomes. However, these approaches rely on the assumption of sharp +input images. When faced with motion blur, existing dynamic NeRF methods often +struggle to generate high-quality novel views. In this paper, we propose +DyBluRF, a dynamic radiance field approach that synthesizes sharp novel views +from a monocular video affected by motion blur. To account for motion blur in +input images, we simultaneously capture the camera trajectory and object +Discrete Cosine Transform (DCT) trajectories within the scene. Additionally, we +employ a global cross-time rendering approach to ensure consistent temporal +coherence across the entire scene. We curate a dataset comprising diverse +dynamic scenes that are specifically tailored for our task. Experimental +results on our dataset demonstrate that our method outperforms existing +approaches in generating sharp novel views from motion-blurred inputs while +maintaining spatial-temporal consistency of the scene. + +
+
+ comment: Accepted by CVPR 2024. Project page: + https://huiqiang-sun.github.io/dyblurf/ +
+
+
+
+
+ + ♻ ☆ AV-SUPERB: A Multi-Task Evaluation Benchmark for Audio-Visual + Representation Models ICASSP 2024 + + +
+ Audio-visual representation learning aims to develop systems with human-like +perception by utilizing correlation between auditory and visual information. +However, current models often focus on a limited set of tasks, and +generalization abilities of learned representations are unclear. To this end, +we propose the AV-SUPERB benchmark that enables general-purpose evaluation of +unimodal audio/visual and bimodal fusion representations on 7 datasets covering +5 audio-visual tasks in speech and audio processing. We evaluate 5 recent +self-supervised models and show that none of these models generalize to all +tasks, emphasizing the need for future study on improving universal model +performance. In addition, we show that representations may be improved with +intermediate-task fine-tuning and audio event classification with AudioSet +serves as a strong intermediate task. We release our benchmark with evaluation +code and a model submission platform to encourage further research in +audio-visual learning. + +
+
+ comment: Accepted to ICASSP 2024; Evaluation Code: + https://github.com/roger-tseng/av-superb Submission Platform: + https://av.superbbenchmark.org +
+
+
+
+
+ + ♻ ☆ Towards Plastic and Stable Exemplar-Free Incremental Learning: A + Dual-Learner Framework with Cumulative Parameter Averaging + + +
+ The dilemma between plasticity and stability presents a significant challenge +in Incremental Learning (IL), especially in the exemplar-free scenario where +accessing old-task samples is strictly prohibited during the learning of a new +task. A straightforward solution to this issue is learning and storing an +independent model for each task, known as Single Task Learning (STL). Despite +the linear growth in model storage with the number of tasks in STL, we +empirically discover that averaging these model parameters can potentially +preserve knowledge across all tasks. Inspired by this observation, we propose a +Dual-Learner framework with Cumulative Parameter Averaging (DLCPA). DLCPA +employs a dual-learner design: a plastic learner focused on acquiring new-task +knowledge and a stable learner responsible for accumulating all learned +knowledge. The knowledge from the plastic learner is transferred to the stable +learner via cumulative parameter averaging. Additionally, several task-specific +classifiers work in cooperation with the stable learner to yield the final +prediction. Specifically, when learning a new task, these modules are updated +in a cyclic manner: i) the plastic learner is initially optimized using a +self-supervised loss besides the supervised loss to enhance the feature +extraction robustness; ii) the stable learner is then updated with respect to +the plastic learner in a cumulative parameter averaging manner to maintain its +task-wise generalization; iii) the task-specific classifier is accordingly +optimized to align with the stable learner. Experimental results on CIFAR-100 +and Tiny-ImageNet show that DLCPA outperforms several state-of-the-art +exemplar-free baselines in both Task-IL and Class-IL settings. + +
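+
+ The cumulative parameter averaging step can be sketched as a running mean over
+tasks; the exact DLCPA update may include further details, so treat this as an
+assumption-laden illustration of the idea only.
+ <pre><code>
+import torch
+
+@torch.no_grad()
+def cumulative_average_update(stable_model, plastic_model, task_index):
+    # After learning task t (1-indexed), the stable learner becomes the running mean
+    # of all plastic learners seen so far: stable <- ((t - 1) * stable + plastic) / t.
+    t = float(task_index)
+    for p_stable, p_plastic in zip(stable_model.parameters(), plastic_model.parameters()):
+        p_stable.data.mul_((t - 1.0) / t).add_(p_plastic.data / t)
+ </code></pre>
+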
+
+
+
+
+ + ♻ ☆ Enhancing Quality of Compressed Images by Mitigating Enhancement Bias + Towards Compression Domain CVPR 2024 + + +
+ Existing quality enhancement methods for compressed images focus on aligning +the enhancement domain with the raw domain to yield realistic images. However, +these methods exhibit a pervasive enhancement bias towards the compression +domain, inadvertently regarding it as more realistic than the raw domain. This +bias makes enhanced images closely resemble their compressed counterparts, thus +degrading their perceptual quality. In this paper, we propose a simple yet +effective method to mitigate this bias and enhance the quality of compressed +images. Our method employs a conditional discriminator with the compressed +image as a key condition, and then incorporates a domain-divergence +regularization to actively distance the enhancement domain from the compression +domain. Through this dual strategy, our method enables the discrimination +against the compression domain, and brings the enhancement domain closer to the +raw domain. Comprehensive quality evaluations confirm the superiority of our +method over other state-of-the-art methods without incurring inference +overheads. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Mask-Based Modeling for Neural Radiance Fields + + +
+ Most Neural Radiance Fields (NeRFs) exhibit limited generalization +capabilities, which restrict their applicability in representing multiple +scenes using a single model. To address this problem, existing generalizable +NeRF methods simply condition the model on image features. These methods still +struggle to learn precise global representations over diverse scenes since they +lack an effective mechanism for interacting among different points and views. +In this work, we unveil that 3D implicit representation learning can be +significantly improved by mask-based modeling. Specifically, we propose masked +ray and view modeling for generalizable NeRF (MRVM-NeRF), which is a +self-supervised pretraining target to predict complete scene representations +from partially masked features along each ray. With this pretraining target, +MRVM-NeRF enables better use of correlations across different points and views +as the geometry priors, which thereby strengthens the capability of capturing +intricate details within the scenes and boosts the generalization capability +across different scenes. Extensive experiments demonstrate the effectiveness of +our proposed MRVM-NeRF on both synthetic and real-world datasets, qualitatively +and quantitatively. Besides, we also conduct experiments to show the +compatibility of our proposed method with various backbones and its superiority +under few-shot cases. + +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey on 3D Content Generation + + +
+ Recent years have witnessed remarkable advances in artificial
+intelligence-generated content (AIGC), with diverse input modalities, e.g.,
+text, image, video, audio and 3D. 3D is the visual modality closest to the
+real-world 3D environment and carries enormous knowledge. 3D content generation
+has both academic and practical value while also presenting formidable
+technical challenges. This review aims to consolidate developments within the
+burgeoning domain of 3D content generation. Specifically, a new taxonomy is
+proposed that categorizes existing approaches into three types: 3D native
+generative methods, 2D prior-based 3D generative methods, and hybrid 3D
+generative methods. The survey covers approximately 60 papers spanning the
+major techniques. Besides, we discuss the limitations of current 3D content
+generation techniques and point out open challenges as well as promising
+directions for future work. Accompanying this survey, we have established a
+project website where the resources on 3D content generation research are
+provided. The project page is available at
+https://github.com/hitcslj/Awesome-AIGC-3D.
+
+
+ comment: under review +
+
+
+
+
+ + ♻ ☆ Zippo: Zipping Color and Transparency Distributions into a Single + Diffusion Model + + +
+ Beyond the superiority of the text-to-image diffusion model in generating +high-quality images, recent studies have attempted to uncover its potential for +adapting the learned semantic knowledge to visual perception tasks. In this +work, instead of translating a generative diffusion model into a visual +perception model, we explore to retain the generative ability with the +perceptive adaptation. To accomplish this, we present Zippo, a unified +framework for zipping the color and transparency distributions into a single +diffusion model by expanding the diffusion latent into a joint representation +of RGB images and alpha mattes. By alternatively selecting one modality as the +condition and then applying the diffusion process to the counterpart modality, +Zippo is capable of generating RGB images from alpha mattes and predicting +transparency from input images. In addition to single-modality prediction, we +propose a modality-aware noise reassignment strategy to further empower Zippo +with jointly generating RGB images and its corresponding alpha mattes under the +text guidance. Our experiments showcase Zippo's ability of efficient +text-conditioned transparent image generation and present plausible results of +Matte-to-RGB and RGB-to-Matte translation. + +
+
+
+
+
+ + ♻ ☆ On the Diversity and Realism of Distilled Dataset: An Efficient Dataset + Distillation Paradigm + + +
+ Contemporary machine learning requires training large neural networks on
+massive datasets and thus faces the challenge of high computational demands.
+Dataset distillation, as a recently emerging strategy, aims to compress
+real-world datasets for efficient training. However, this line of research
+currently struggles with large-scale and high-resolution datasets, hindering
+its practicality and feasibility. To this end, we re-examine the existing
+dataset distillation methods and identify three properties required for
+large-scale real-world applications, namely, realism, diversity, and
+efficiency. As a remedy, we propose RDED, a novel computationally-efficient yet
+effective data distillation paradigm, to enable both diversity and realism of
+the distilled data. Extensive empirical results over various neural
+architectures and datasets demonstrate the advancement of RDED: we can distill
+the full ImageNet-1K to a small dataset comprising 10 images per class within 7
+minutes, achieving a notable 42% top-1 accuracy with ResNet-18 on a single
+RTX-4090 GPU (while the SOTA only achieves 21% but requires 6 hours).
+
+
+ comment: 17 pages, 20 figures +
+
+
+
+
+ + ♻ ☆ Shared and Private Information Learning in Multimodal Sentiment Analysis + with Deep Modal Alignment and Self-supervised Multi-Task Learning + + +
+ Designing an effective representation learning method for multimodal +sentiment analysis tasks is a crucial research direction. The challenge lies in +learning both shared and private information in a complete modal +representation, which is difficult with uniform multimodal labels and a raw +feature fusion approach. In this work, we propose a deep modal shared +information learning module based on the covariance matrix to capture the +shared information between modalities. Additionally, we use a label generation +module based on a self-supervised learning strategy to capture the private +information of the modalities. Our module is plug-and-play in multimodal tasks, +and by changing the parameterization, it can adjust the information exchange +relationship between the modes and learn the private or shared information +between the specified modes. We also employ a multi-task learning strategy to +help the model focus its attention on the modal differentiation training data. +We provide a detailed formulation derivation and feasibility proof for the +design of the deep modal shared information learning module. We conduct +extensive experiments on three common multimodal sentiment analysis baseline +datasets, and the experimental results validate the reliability of our model. +Furthermore, we explore more combinatorial techniques for the use of the +module. Our approach outperforms current state-of-the-art methods on most of +the metrics of the three public datasets. + +
+
+
+
+
+ + ♻ ☆ Approximation and bounding techniques for the Fisher-Rao distances + + +
+ The Fisher-Rao distance between two probability distributions of a +statistical model is defined as the Riemannian geodesic distance induced by the +Fisher information metric. In order to calculate the Fisher-Rao distance in +closed-form, we need (1) to elicit a formula for the Fisher-Rao geodesics, and +(2) to integrate the Fisher length element along those geodesics. We consider +several numerically robust approximation and bounding techniques for the +Fisher-Rao distances: First, we report generic upper bounds on Fisher-Rao +distances based on closed-form 1D Fisher-Rao distances of submodels. Second, we +describe several generic approximation schemes depending on whether the +Fisher-Rao geodesics or pregeodesics are available in closed-form or not. In +particular, we obtain a generic method to guarantee an arbitrarily small +additive error on the approximation provided that Fisher-Rao pregeodesics and +tight lower and upper bounds are available. Third, we consider the case of +Fisher metrics being Hessian metrics, and report generic tight upper bounds on +the Fisher-Rao distances using techniques of information geometry. +Uniparametric and biparametric statistical models always have Fisher Hessian +metrics, and in general a simple test allows to check whether the Fisher +information matrix yields a Hessian metric or not. Fourth, we consider +elliptical distribution families and show how to apply the above techniques to +these models. We also propose two new distances based either on the Fisher-Rao +lengths of curves serving as proxies of Fisher-Rao geodesics, or based on the +Birkhoff/Hilbert projective cone distance. Last, we consider an alternative +group-theoretic approach for statistical transformation models based on the +notion of maximal invariant which yields insights on the structures of the +Fisher-Rao distance formula which may be used fruitfully in applications. + +
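+
+ A simple numerical illustration of the curve-length approximations discussed
+above: discretize any path joining the two parameter vectors and accumulate the
+Fisher length element; since the path need not be a geodesic, the result
+upper-bounds the Fisher-Rao distance. The univariate-Gaussian example below
+uses the standard Fisher information matrix diag(1/sigma^2, 2/sigma^2); the
+straight-line path is a proxy chosen for illustration.
+ <pre><code>
+import numpy as np
+
+def fisher_rao_length(curve, fisher_info):
+    # curve: (T, d) array of parameter vectors; fisher_info: theta -> (d, d) FIM.
+    length = 0.0
+    for a, b in zip(curve[:-1], curve[1:]):
+        d = b - a
+        mid = 0.5 * (a + b)                       # evaluate the metric at the midpoint
+        length += np.sqrt(d @ fisher_info(mid) @ d)
+    return length
+
+# Example: univariate Gaussian with theta = (mu, sigma).
+fim = lambda th: np.diag([1.0 / th[1] ** 2, 2.0 / th[1] ** 2])
+path = np.linspace([0.0, 1.0], [1.0, 2.0], 200)   # straight-line proxy, not the geodesic
+print(fisher_rao_length(path, fim))               # upper bound on the Fisher-Rao distance
+ </code></pre>
+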
+
+ comment: 43 pages +
+
+
+
+
+ + ♻ ☆ Cameras as Rays: Pose Estimation via Ray Diffusion ICLR 2024 + + +
+ Estimating camera poses is a fundamental task for 3D reconstruction and +remains challenging given sparsely sampled views (<10). In contrast to existing +approaches that pursue top-down prediction of global parametrizations of camera +extrinsics, we propose a distributed representation of camera pose that treats +a camera as a bundle of rays. This representation allows for a tight coupling +with spatial image features improving pose precision. We observe that this +representation is naturally suited for set-level transformers and develop a +regression-based approach that maps image patches to corresponding rays. To +capture the inherent uncertainties in sparse-view pose inference, we adapt this +approach to learn a denoising diffusion model which allows us to sample +plausible modes while improving performance. Our proposed methods, both +regression- and diffusion-based, demonstrate state-of-the-art performance on +camera pose estimation on CO3D while generalizing to unseen object categories +and in-the-wild captures. + +
+
+ comment: In ICLR 2024 (oral). v2: updated references. Project webpage: + https://jasonyzhang.com/RayDiffusion +
+
+
+
+
+ + ♻ ☆ Leveraging Spatial and Semantic Feature Extraction for Skin Cancer + Diagnosis with Capsule Networks and Graph Neural Networks + + +
+ In the realm of skin lesion image classification, the intricate spatial and +semantic features pose significant challenges for conventional Convolutional +Neural Network (CNN)-based methodologies. These challenges are compounded by +the imbalanced nature of skin lesion datasets, which hampers the ability of +models to learn minority class features effectively. Despite augmentation +strategies, such as those using Generative Adversarial Networks (GANs), +previous attempts have not fully addressed these complexities. This study +introduces an innovative approach by integrating Graph Neural Networks (GNNs) +with Capsule Networks to enhance classification performance. GNNs, known for +their proficiency in handling graph-structured data, offer an advanced +mechanism for capturing complex patterns and relationships beyond the +capabilities of traditional CNNs. Capsule Networks further contribute by +providing superior recognition of spatial hierarchies within images. Our +research focuses on evaluating and enhancing the Tiny Pyramid Vision GNN (Tiny +Pyramid ViG) architecture by incorporating it with a Capsule Network. This +hybrid model was applied to the MNIST:HAM10000 dataset, a comprehensive skin +lesion dataset designed for benchmarking classification models. After 75 epochs +of training, our model achieved a significant accuracy improvement, reaching +89.23% and 95.52%, surpassing established benchmarks such as GoogLeNet +(83.94%), InceptionV3 (86.82%), MobileNet V3 (89.87%), EfficientNet-B7 +(92.07%), ResNet18 (92.22%), ResNet34 (91.90%), ViT-Base (73.70%), and IRv2-SA +(93.47%) on the same dataset. This outcome underscores the potential of our +approach in overcoming the inherent challenges of skin lesion classification, +contributing to the advancement of image-based diagnosis in dermatology. + +
+
+ comment: This is the first version of our paper, we gladly expect feedback and + corrections if there is any mistake within our paper +
+
+
+
+
+ + ♻ ☆ Motion Mamba: Efficient and Long Sequence Motion Generation with + Hierarchical and Bidirectional Selective SSM + + +
+ Human motion generation stands as a significant pursuit in generative
+computer vision, while achieving long-sequence and efficient motion generation
+remains challenging. Recent advancements in state space models (SSMs), notably
+Mamba, have showcased considerable promise in long sequence modeling with an
+efficient hardware-aware design, which makes them a promising direction to
+build a motion generation model upon. Nevertheless, adapting SSMs to motion
+generation faces hurdles due to the lack of a specialized architecture designed
+for modeling motion sequences. To address these challenges, we propose Motion
+Mamba, a simple and efficient approach that presents a pioneering motion
+generation model built upon SSMs. Specifically, we design a Hierarchical
+Temporal Mamba (HTM) block to process temporal data by ensembling varying
+numbers of isolated SSM modules across a symmetric U-Net architecture aimed at
+preserving motion consistency between frames. We also design a Bidirectional
+Spatial Mamba (BSM) block to bidirectionally process latent poses, to enhance
+accurate motion generation within a temporal frame. Our proposed method
+achieves up to a 50% FID improvement and up to 4 times faster inference on the
+HumanML3D and KIT-ML datasets compared to the previous best diffusion-based
+method, which demonstrates strong capabilities of high-quality long sequence
+motion modeling and real-time human motion generation. See project website
+https://steve-zeyu-zhang.github.io/MotionMamba/
+
+
+
+
+
+ + ♻ ☆ OMG: Towards Open-vocabulary Motion Generation via Mixture of + Controllers CVPR 2024 + + +
+ We have recently seen tremendous progress in realistic text-to-motion +generation. Yet, the existing methods often fail or produce implausible motions +with unseen text inputs, which limits the applications. In this paper, we +present OMG, a novel framework, which enables compelling motion generation from +zero-shot open-vocabulary text prompts. Our key idea is to carefully tailor the +pretrain-then-finetune paradigm into the text-to-motion generation. At the +pre-training stage, our model improves the generation ability by learning the +rich out-of-domain inherent motion traits. To this end, we scale up a large +unconditional diffusion model up to 1B parameters, so as to utilize the massive +unlabeled motion data up to over 20M motion instances. At the subsequent +fine-tuning stage, we introduce motion ControlNet, which incorporates text +prompts as conditioning information, through a trainable copy of the +pre-trained model and the proposed novel Mixture-of-Controllers (MoC) block. +MoC block adaptively recognizes various ranges of the sub-motions with a +cross-attention mechanism and processes them separately with the +text-token-specific experts. Such a design effectively aligns the CLIP token +embeddings of text prompts to various ranges of compact and expressive motion +features. Extensive experiments demonstrate that our OMG achieves significant +improvements over the state-of-the-art methods on zero-shot text-to-motion +generation. Project page: https://tr3e.github.io/omg-page. + +
+
+ comment: accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ O$^2$-Recon: Completing 3D Reconstruction of Occluded Objects in the + Scene with a Pre-trained 2D Diffusion Model AAAI 2024 + + +
+ Occlusion is a common issue in 3D reconstruction from RGB-D videos, often +blocking the complete reconstruction of objects and presenting an ongoing +problem. In this paper, we propose a novel framework, empowered by a 2D +diffusion-based in-painting model, to reconstruct complete surfaces for the +hidden parts of objects. Specifically, we utilize a pre-trained diffusion model +to fill in the hidden areas of 2D images. Then we use these in-painted images +to optimize a neural implicit surface representation for each instance for 3D +reconstruction. Since creating the in-painting masks needed for this process is +tricky, we adopt a human-in-the-loop strategy that involves very little human +engagement to generate high-quality masks. Moreover, some parts of objects can +be totally hidden because the videos are usually shot from limited +perspectives. To ensure recovering these invisible areas, we develop a cascaded +network architecture for predicting signed distance field, making use of +different frequency bands of positional encoding and maintaining overall +smoothness. Besides the commonly used rendering loss, Eikonal loss, and +silhouette loss, we adopt a CLIP-based semantic consistency loss to guide the +surface from unseen camera angles. Experiments on ScanNet scenes show that our +proposed framework achieves state-of-the-art accuracy and completeness in +object-level reconstruction from scene-level RGB-D videos. Code: +https://github.com/THU-LYJ-Lab/O2-Recon. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Predicated Diffusion: Predicate Logic-Based Attention Guidance for + Text-to-Image Diffusion Models + + +
+ Diffusion models have achieved remarkable results in generating high-quality, +diverse, and creative images. However, when it comes to text-based image +generation, they often fail to capture the intended meaning presented in the +text. For instance, a specified object may not be generated, an unnecessary +object may be generated, and an adjective may alter objects it was not intended +to modify. Moreover, we found that relationships indicating possession between +objects are often overlooked. While users' intentions in text are diverse, +existing methods tend to specialize in only some aspects of these. In this +paper, we propose Predicated Diffusion, a unified framework to express users' +intentions. We consider that the root of the above issues lies in the text +encoder, which often focuses only on individual words and neglects the logical +relationships between them. The proposed method does not solely rely on the +text encoder, but instead, represents the intended meaning in the text as +propositions using predicate logic and treats the pixels in the attention maps +as the fuzzy predicates. This enables us to obtain a differentiable loss +function that makes the image fulfill the proposition by minimizing it. When +compared to several existing methods, we demonstrated that Predicated Diffusion +can generate images that are more faithful to various text prompts, as verified +by human evaluators and pretrained image-text models. + +
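+
+ To make the fuzzy-predicate idea concrete, one could score cross-attention
+maps with standard fuzzy-logic connectives as below; the specific t-norm and
+loss shapes are assumptions for illustration and may differ from the paper's
+formulation.
+ <pre><code>
+import torch
+
+def exists_loss(attn_a):
+    # Fuzzy "there exists a pixel where concept A appears": push the map's max up.
+    return -torch.log(attn_a.max() + 1e-8)
+
+def implication_loss(attn_a, attn_b):
+    # Fuzzy "wherever A appears, B should appear too" (e.g. possession), using the
+    # pointwise residuum of the product t-norm: A -> B = min(1, B / A).
+    truth = torch.clamp(attn_b / (attn_a + 1e-8), max=1.0)
+    return -torch.log(truth + 1e-8).mean()
+
+# During denoising, attention maps for the relevant tokens are normalized to [0, 1],
+# the proposition losses are summed, and their gradient with respect to the latent
+# nudges generation toward satisfying the logical structure of the prompt.
+ </code></pre>
+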
+
+ comment: 20 pages, 16 figures, 6 tables, ~500 images, ~30MB +
+
+
+
+
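+ A minimal sketch of the fuzzy-predicate idea above: attention values in [0, 1] act as
+per-pixel truth values, existence of a concept becomes a max over pixels, and a relation
+such as possession can be written as a fuzzy implication. The operator choices (max for
+"exists", the Reichenbach implication) are assumptions; the paper's exact formulation may
+differ.
+```python
+import torch
+
+def exists_loss(attn):
+    """Proposition 'some pixel shows the concept': truth ~ max attention."""
+    return 1.0 - attn.flatten(1).max(dim=1).values.mean()
+
+def implication_loss(attn_a, attn_b):
+    """Proposition 'wherever A appears, B appears' (A -> B), scored per pixel
+    with the Reichenbach fuzzy implication 1 - a + a*b."""
+    impl = 1.0 - attn_a + attn_a * attn_b
+    return (1.0 - impl).mean()
+
+# Cross-attention maps normalized to [0, 1] for the tokens "dog" and "hat"
+# (random here just so the sketch runs).
+attn_dog = torch.rand(2, 16, 16)
+attn_hat = torch.rand(2, 16, 16)
+
+loss = exists_loss(attn_dog) + exists_loss(attn_hat) \
+     + implication_loss(attn_hat, attn_dog)   # 'the hat belongs to the dog'
+print(loss.item())
+```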
+ + ♻ ☆ CarbonNet: How Computer Vision Plays a Role in Climate Change? + Application: Learning Geomechanics from Subsurface Geometry of CCS to + Mitigate Global Warming + + +
+ We introduce a new approach using computer vision to predict the land surface
+displacement from subsurface geometry images for Carbon Capture and
+Sequestration (CCS). CCS has proven to be a key component for a carbon neutral
+society. However, challenges remain, including the high computational cost of
+large-scale models and the difficulty of generalizing a pre-trained model to
+complex physics. We tackle those challenges by training models directly from
+the subsurface geometry images. The goal is to understand the response of land
+surface displacement due to carbon injection and utilize our trained models to
+inform decision making in CCS projects.
+ We implement multiple models (CNN, ResNet, and ResNetUNet) for the static
+mechanics problem, which is an image prediction problem. Next, we use an LSTM
+and a transformer for the transient mechanics scenario, which is a video
+prediction problem. ResNetUNet outperforms the others in the static mechanics
+problem thanks to its architecture, and the LSTM shows performance comparable
+to the transformer in the transient problem. This report first outlines our
+dataset in detail, followed by model descriptions in the method section. The
+results and discussion present the key learnings and observations, and a
+conclusion with future work rounds out the paper.
+
+
+
+
+
+ + ♻ ☆ Multi-View Video-Based Learning: Leveraging Weak Labels for Frame-Level + Perception + + +
+ For training a video-based action recognition model that accepts multi-view
+video, annotating frame-level labels is tedious and difficult. However, it is
+relatively easy to annotate sequence-level labels. Such coarse annotations are
+called weak labels. Nevertheless, training a multi-view video-based action
+recognition model with weak labels for frame-level perception is challenging.
+In this paper, we propose a novel learning framework, where the weak labels are
+first used to train a multi-view video-based base model, which is subsequently
+used for downstream frame-level perception tasks. The base model is trained to
+obtain individual latent embeddings for each view in the multi-view input. For
+training the model using the weak labels, we propose a novel latent loss
+function. We also propose a model that uses the view-specific latent embeddings
+for downstream frame-level action recognition and detection tasks. The proposed
+framework is evaluated using the MM Office dataset by comparing several
+baseline algorithms. The results show that the proposed base model is
+effectively trained using weak labels and the latent embeddings help the
+downstream models improve accuracy.
+
+
+
+
+
+ + ♻ ☆ EventBind: Learning a Unified Representation to Bind Them All for + Event-based Open-world Understanding + + +
+ In this paper, we propose EventBind, a novel and effective framework that
+unleashes the potential of vision-language models (VLMs) for event-based
+recognition to compensate for the lack of large-scale event-based datasets. In
+particular, due to the distinct modality gap with the image-text data and the
+lack of large-scale datasets, learning a common representation space for
+images, texts, and events is non-trivial. Intuitively, we need to address two
+key challenges: 1) how to generalize CLIP's visual encoder to event data while
+fully leveraging events' unique properties, e.g., sparsity and high temporal
+resolution; 2) how to effectively align the multi-modal embeddings, i.e.,
+image, text, and events. Accordingly, we first introduce a novel event encoder
+that subtly models the temporal information from events and, meanwhile,
+generates event prompts for modality bridging. We then design a text encoder
+that generates content prompts and utilizes hybrid text prompts to enhance
+EventBind's generalization ability across diverse datasets. With the proposed
+event encoder, text encoder, and image encoder, a novel Hierarchical Triple
+Contrastive Alignment (HTCA) module is introduced to jointly optimize the
+correlation and enable efficient knowledge transfer among the three modalities.
+We evaluate various settings, including fine-tuning and few-shot on three
+benchmarks, and our EventBind achieves new state-of-the-art accuracy compared
+with the previous methods, such as on N-Caltech101 (+5.34% and +1.70%) and
+N-Imagenet (+5.65% and +1.99%) with fine-tuning and 20-shot settings,
+respectively. Moreover, our EventBind can be flexibly extended to the event
+retrieval task using text or image queries, showing plausible performance. Our
+project code will be made publicly available.
+
+
+ comment: Conference version with supplementary +
+
+
+
+
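+ As a simplified stand-in for the Hierarchical Triple Contrastive Alignment (HTCA)
+objective, the sketch below applies a symmetric InfoNCE loss to every pair of image,
+text, and event embeddings. The hierarchy and weighting of the real HTCA module are not
+modeled; this only shows the basic three-way alignment.
+```python
+import torch
+import torch.nn.functional as F
+
+def info_nce(a, b, temperature=0.07):
+    """Symmetric InfoNCE between two batches of embeddings (matched by index)."""
+    a, b = F.normalize(a, dim=-1), F.normalize(b, dim=-1)
+    logits = a @ b.t() / temperature
+    labels = torch.arange(a.size(0), device=a.device)
+    return 0.5 * (F.cross_entropy(logits, labels) +
+                  F.cross_entropy(logits.t(), labels))
+
+def triple_contrastive_loss(img_emb, txt_emb, evt_emb):
+    # Align every modality pair; the real HTCA weights these hierarchically.
+    return (info_nce(img_emb, txt_emb) +
+            info_nce(img_emb, evt_emb) +
+            info_nce(txt_emb, evt_emb)) / 3.0
+
+B, D = 8, 512
+print(triple_contrastive_loss(torch.randn(B, D), torch.randn(B, D),
+                              torch.randn(B, D)).item())
+```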
+ + ♻ ☆ Towards Dense and Accurate Radar Perception Via Efficient Cross-Modal + Diffusion Model + + +
+ Millimeter wave (mmWave) radars have attracted significant attention from
+both academia and industry due to their capability to operate in extreme
+weather conditions. However, they face challenges in terms of sparsity and
+noise interference, which hinder their application in the field of micro aerial
+vehicle (MAV) autonomous navigation. To this end, this paper proposes a novel
+approach to dense and accurate mmWave radar point cloud construction via
+cross-modal learning. Specifically, we introduce diffusion models, which
+possess state-of-the-art performance in generative modeling, to predict
+LiDAR-like point clouds from paired raw radar data. We also incorporate the
+most recent diffusion model inference acceleration techniques to ensure that
+the proposed method can be implemented on MAVs with limited computing
+resources. We validate the proposed method through extensive benchmark
+comparisons and real-world experiments, demonstrating its superior performance
+and generalization ability. Code and pretrained models will be available at
+https://github.com/ZJU-FAST-Lab/Radar-Diffusion.
+
+
+ comment: 8 pages, 6 figures, submitted to RA-L +
+
+
+
+
+ + ♻ ☆ Controllable Text-to-3D Generation via Surface-Aligned Gaussian + Splatting + + +
+ While text-to-3D and image-to-3D generation tasks have received considerable +attention, one important but under-explored field between them is controllable +text-to-3D generation, which we mainly focus on in this work. To address this +task, 1) we introduce Multi-view ControlNet (MVControl), a novel neural network +architecture designed to enhance existing pre-trained multi-view diffusion +models by integrating additional input conditions, such as edge, depth, normal, +and scribble maps. Our innovation lies in the introduction of a conditioning +module that controls the base diffusion model using both local and global +embeddings, which are computed from the input condition images and camera +poses. Once trained, MVControl is able to offer 3D diffusion guidance for +optimization-based 3D generation. And, 2) we propose an efficient multi-stage +3D generation pipeline that leverages the benefits of recent large +reconstruction models and score distillation algorithm. Building upon our +MVControl architecture, we employ a unique hybrid diffusion guidance method to +direct the optimization process. In pursuit of efficiency, we adopt 3D +Gaussians as our representation instead of the commonly used implicit +representations. We also pioneer the use of SuGaR, a hybrid representation that +binds Gaussians to mesh triangle faces. This approach alleviates the issue of +poor geometry in 3D Gaussians and enables the direct sculpting of fine-grained +geometry on the mesh. Extensive experiments demonstrate that our method +achieves robust generalization and enables the controllable generation of +high-quality 3D content. + +
+
+ comment: Project page: https://lizhiqi49.github.io/MVControl/ +
+
+
+
+
+ + ♻ ☆ OctreeOcc: Efficient and Multi-Granularity Occupancy Prediction Using + Octree Queries + + +
+ Occupancy prediction has increasingly garnered attention in recent years for +its fine-grained understanding of 3D scenes. Traditional approaches typically +rely on dense, regular grid representations, which often leads to excessive +computational demands and a loss of spatial details for small objects. This +paper introduces OctreeOcc, an innovative 3D occupancy prediction framework +that leverages the octree representation to adaptively capture valuable +information in 3D, offering variable granularity to accommodate object shapes +and semantic regions of varying sizes and complexities. In particular, we +incorporate image semantic information to improve the accuracy of initial +octree structures and design an effective rectification mechanism to refine the +octree structure iteratively. Our extensive evaluations show that OctreeOcc not +only surpasses state-of-the-art methods in occupancy prediction, but also +achieves a 15%-24% reduction in computational overhead compared to +dense-grid-based methods. + +
+
+
+
+
+ + ♻ ☆ Layered 3D Human Generation via Semantic-Aware Diffusion Model + + +
+ The generation of 3D clothed humans has attracted increasing attention in +recent years. However, existing work cannot generate layered high-quality 3D +humans with consistent body structures. As a result, these methods are unable +to arbitrarily and separately change and edit the body and clothing of the +human. In this paper, we propose a text-driven layered 3D human generation +framework based on a novel physically-decoupled semantic-aware diffusion model. +To keep the generated clothing consistent with the target text, we propose a +semantic-confidence strategy for clothing that can eliminate the non-clothing +content generated by the model. To match the clothing with different body +shapes, we propose a SMPL-driven implicit field deformation network that +enables the free transfer and reuse of clothing. Besides, we introduce uniform +shape priors based on the SMPL model for body and clothing, respectively, which +generates more diverse 3D content without being constrained by specific +templates. The experimental results demonstrate that the proposed method not +only generates 3D humans with consistent body structures but also allows free +editing in a layered manner. The source code will be made public. + +
+
+
+
+
+ + ♻ ☆ Improving Diffusion Models for Virtual Try-on + + +
+ This paper considers image-based virtual try-on, which renders an image of a +person wearing a curated garment, given a pair of images depicting the person +and the garment, respectively. Previous works adapt existing exemplar-based +inpainting diffusion models for virtual try-on to improve the naturalness of +the generated visuals compared to other methods (e.g., GAN-based), but they +fail to preserve the identity of the garments. To overcome this limitation, we +propose a novel diffusion model that improves garment fidelity and generates +authentic virtual try-on images. Our method, coined IDM-VTON, uses two +different modules to encode the semantics of garment image; given the base UNet +of the diffusion model, 1) the high-level semantics extracted from a visual +encoder are fused to the cross-attention layer, and then 2) the low-level +features extracted from parallel UNet are fused to the self-attention layer. In +addition, we provide detailed textual prompts for both garment and person +images to enhance the authenticity of the generated visuals. Finally, we +present a customization method using a pair of person-garment images, which +significantly improves fidelity and authenticity. Our experimental results show +that our method outperforms previous approaches (both diffusion-based and +GAN-based) in preserving garment details and generating authentic virtual +try-on images, both qualitatively and quantitatively. Furthermore, the proposed +customization method demonstrates its effectiveness in a real-world scenario. +More visualizations are available in our project page: +https://idm-vton.github.io + +
+
+
+
+
+ + ♻ ☆ Boosting Image Restoration via Priors from Pre-trained Models CVPR2024 + + +
+ Pre-trained models with large-scale training data, such as CLIP and Stable +Diffusion, have demonstrated remarkable performance in various high-level +computer vision tasks such as image understanding and generation from language +descriptions. Yet, their potential for low-level tasks such as image +restoration remains relatively unexplored. In this paper, we explore such +models to enhance image restoration. As off-the-shelf features (OSF) from +pre-trained models do not directly serve image restoration, we propose to learn +an additional lightweight module called Pre-Train-Guided Refinement Module +(PTG-RM) to refine restoration results of a target restoration network with +OSF. PTG-RM consists of two components, Pre-Train-Guided Spatial-Varying +Enhancement (PTG-SVE), and Pre-Train-Guided Channel-Spatial Attention +(PTG-CSA). PTG-SVE enables optimal short- and long-range neural operations, +while PTG-CSA enhances spatial-channel attention for restoration-related +learning. Extensive experiments demonstrate that PTG-RM, with its compact size +($<$1M parameters), effectively enhances restoration performance of various +models across different tasks, including low-light enhancement, deraining, +deblurring, and denoising. + +
+
+ comment: CVPR2024 +
+
+
+
+
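+ To give a concrete sense of what a channel-spatial attention refinement can look like,
+here is a generic CBAM-style block. It is not the paper's PTG-CSA implementation; the
+reduction ratio and kernel size are illustrative.
+```python
+import torch
+import torch.nn as nn
+
+class ChannelSpatialAttention(nn.Module):
+    """Generic channel + spatial attention (CBAM-style), for illustration only."""
+    def __init__(self, channels, reduction=8):
+        super().__init__()
+        self.channel_mlp = nn.Sequential(
+            nn.AdaptiveAvgPool2d(1),
+            nn.Conv2d(channels, channels // reduction, 1), nn.ReLU(inplace=True),
+            nn.Conv2d(channels // reduction, channels, 1), nn.Sigmoid())
+        self.spatial_conv = nn.Sequential(
+            nn.Conv2d(2, 1, kernel_size=7, padding=3), nn.Sigmoid())
+
+    def forward(self, x):
+        x = x * self.channel_mlp(x)                 # channel re-weighting
+        avg = x.mean(dim=1, keepdim=True)
+        mx, _ = x.max(dim=1, keepdim=True)
+        attn = self.spatial_conv(torch.cat([avg, mx], dim=1))
+        return x * attn                             # spatial re-weighting
+
+feat = torch.randn(1, 64, 32, 32)
+print(ChannelSpatialAttention(64)(feat).shape)   # torch.Size([1, 64, 32, 32])
+```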
+ + ♻ ☆ RadarCam-Depth: Radar-Camera Fusion for Depth Estimation with Learned + Metric Scale + + +
+ We present a novel approach for metric dense depth estimation based on the +fusion of a single-view image and a sparse, noisy Radar point cloud. The direct +fusion of heterogeneous Radar and image data, or their encodings, tends to +yield dense depth maps with significant artifacts, blurred boundaries, and +suboptimal accuracy. To circumvent this issue, we learn to augment versatile +and robust monocular depth prediction with the dense metric scale induced from +sparse and noisy Radar data. We propose a Radar-Camera framework for highly +accurate and fine-detailed dense depth estimation with four stages, including +monocular depth prediction, global scale alignment of monocular depth with +sparse Radar points, quasi-dense scale estimation through learning the +association between Radar points and image patches, and local scale refinement +of dense depth using a scale map learner. Our proposed method significantly +outperforms the state-of-the-art Radar-Camera depth estimation methods by +reducing the mean absolute error (MAE) of depth estimation by 25.6% and 40.2% +on the challenging nuScenes dataset and our self-collected ZJU-4DRadarCam +dataset, respectively. Our code and dataset will be released at +\url{https://github.com/MMOCKING/RadarCam-Depth}. + +
+
+
+
+
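+ The global scale alignment stage can be illustrated in a few lines: given a monocular
+depth map (correct only up to scale) and sparse metric radar depths projected into the
+image, estimate a single scale factor robustly. A median-of-ratios estimator is assumed
+here; the paper may use a different robust fit.
+```python
+import torch
+
+def global_scale_alignment(mono_depth, radar_depth, radar_mask):
+    """Scale a relative monocular depth map to metric units with sparse radar depths.
+
+    mono_depth:  (H, W) monocular depth prediction (up to scale)
+    radar_depth: (H, W) metric radar depths splatted onto the image plane
+    radar_mask:  (H, W) bool mask of pixels that received a radar return
+    """
+    ratios = radar_depth[radar_mask] / mono_depth[radar_mask].clamp(min=1e-6)
+    scale = ratios.median()
+    return scale * mono_depth, scale
+
+H, W = 64, 96
+mono = torch.rand(H, W) + 0.1
+mask = torch.rand(H, W) < 0.05            # sparse radar hits
+radar = torch.zeros(H, W)
+radar[mask] = 7.5 * mono[mask]            # pretend the true metric scale is 7.5
+scaled, s = global_scale_alignment(mono, radar, mask)
+print(float(s))                           # ~7.5
+```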
+ + ♻ ☆ UFineBench: Towards Text-based Person Retrieval with Ultra-fine + Granularity + + +
+ Existing text-based person retrieval datasets often have relatively +coarse-grained text annotations. This hinders the model to comprehend the +fine-grained semantics of query texts in real scenarios. To address this +problem, we contribute a new benchmark named \textbf{UFineBench} for text-based +person retrieval with ultra-fine granularity. + Firstly, we construct a new \textbf{dataset} named UFine6926. We collect a +large number of person images and manually annotate each image with two +detailed textual descriptions, averaging 80.8 words each. The average word +count is three to four times that of the previous datasets. In addition of +standard in-domain evaluation, we also propose a special \textbf{evaluation +paradigm} more representative of real scenarios. It contains a new evaluation +set with cross domains, cross textual granularity and cross textual styles, +named UFine3C, and a new evaluation metric for accurately measuring retrieval +ability, named mean Similarity Distribution (mSD). Moreover, we propose CFAM, a +more efficient \textbf{algorithm} especially designed for text-based person +retrieval with ultra fine-grained texts. It achieves fine granularity mining by +adopting a shared cross-modal granularity decoder and hard negative match +mechanism. + With standard in-domain evaluation, CFAM establishes competitive performance +across various datasets, especially on our ultra fine-grained UFine6926. +Furthermore, by evaluating on UFine3C, we demonstrate that training on our +UFine6926 significantly improves generalization to real scenarios compared with +other coarse-grained datasets. The dataset and code will be made publicly +available at \url{https://github.com/Zplusdragon/UFineBench}. + +
+
+
+
+
+ + ♻ ☆ GRA: Detecting Oriented Objects through Group-wise Rotating and + Attention + + +
+ Oriented object detection, an emerging task in recent years, aims to identify
+and locate objects across varied orientations. This requires the detector to
+accurately capture the orientation information, which varies significantly
+within and across images. Despite substantial existing efforts,
+simultaneously ensuring model effectiveness and parameter efficiency remains
+challenging in this scenario. In this paper, we propose a lightweight yet
+effective Group-wise Rotating and Attention (GRA) module to replace the
+convolution operations in backbone networks for oriented object detection. GRA
+can adaptively capture fine-grained features of objects with diverse
+orientations, comprising two key components: Group-wise Rotating and Group-wise
+Attention. Group-wise Rotating first divides the convolution kernel into
+groups, where each group extracts different object features by rotating at a
+specific angle according to the object orientation. Subsequently, Group-wise
+Attention is employed to adaptively enhance the object-related regions in the
+feature. The collaborative effort of these components enables GRA to
+effectively capture various orientation information while maintaining
+parameter efficiency. Extensive experimental results demonstrate the
+superiority of our method. For example, GRA achieves a new state-of-the-art
+(SOTA) on the DOTA-v2.0 benchmark, while reducing parameters by nearly 50%
+compared to the previous SOTA method. Code will be released.
+
+
+ comment: tech report +
+
+
+
+
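+ A toy sketch of the Group-wise Rotating component: split the convolution kernel into
+groups and rotate each group before convolving. For brevity the angles are fixed
+multiples of 90 degrees via torch.rot90; the actual GRA module predicts continuous,
+orientation-dependent angles, which is omitted here.
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class GroupwiseRotatingConvSketch(nn.Module):
+    """Simplified group-wise rotating convolution (not the official GRA module)."""
+    def __init__(self, in_ch, out_ch, k=3, n_groups=4):
+        super().__init__()
+        assert out_ch % n_groups == 0
+        self.weight = nn.Parameter(torch.randn(out_ch, in_ch, k, k) * 0.02)
+        self.n_groups = n_groups
+
+    def forward(self, x):
+        groups = self.weight.chunk(self.n_groups, dim=0)
+        # Rotate each kernel group by 0, 90, 180, or 270 degrees.
+        rotated = [torch.rot90(w, k=i % 4, dims=(-2, -1))
+                   for i, w in enumerate(groups)]
+        return F.conv2d(x, torch.cat(rotated, dim=0),
+                        padding=self.weight.shape[-1] // 2)
+
+x = torch.randn(1, 16, 32, 32)
+print(GroupwiseRotatingConvSketch(16, 32)(x).shape)  # torch.Size([1, 32, 32, 32])
+```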
+ + ♻ ☆ Urban Scene Diffusion through Semantic Occupancy Map + + +
+ Generating unbounded 3D scenes is crucial for large-scale scene understanding +and simulation. Urban scenes, unlike natural landscapes, consist of various +complex man-made objects and structures such as roads, traffic signs, vehicles, +and buildings. To create a realistic and detailed urban scene, it is crucial to +accurately represent the geometry and semantics of the underlying objects, +going beyond their visual appearance. In this work, we propose UrbanDiffusion, +a 3D diffusion model that is conditioned on a Bird's-Eye View (BEV) map and +generates an urban scene with geometry and semantics in the form of semantic +occupancy map. Our model introduces a novel paradigm that learns the data +distribution of scene-level structures within a latent space and further +enables the expansion of the synthesized scene into an arbitrary scale. After +training on real-world driving datasets, our model can generate a wide range of +diverse urban scenes given the BEV maps from the held-out set and also +generalize to the synthesized maps from a driving simulator. We further +demonstrate its application to scene image synthesis with a pretrained image +generator as a prior. + +
+
+ comment: The project website is https://metadriverse.github.io/urbandiff/ +
+
+
+
+
+ + ♻ ☆ FTIC: Frequency-Aware Transformer for Learned Image Compression ICLR2024 + + +
+ Learned image compression (LIC) has gained traction as an effective solution
+for image storage and transmission in recent years. However, existing LIC
+methods are redundant in latent representation due to limitations in capturing
+anisotropic frequency components and preserving directional details. To
+overcome these challenges, we propose a novel frequency-aware transformer (FAT)
+block that, for the first time, achieves multiscale directional analysis for
+LIC. The FAT block comprises frequency-decomposition window attention (FDWA)
+modules to capture multiscale and directional frequency components of natural
+images. Additionally, we introduce a frequency-modulation feed-forward network
+(FMFFN) to adaptively modulate different frequency components, improving
+rate-distortion performance. Furthermore, we present a transformer-based
+channel-wise autoregressive (T-CA) model that effectively exploits channel
+dependencies. Experiments show that our method achieves state-of-the-art
+rate-distortion performance compared to existing LIC methods, and clearly
+outperforms the latest standardized codec VTM-12.1 by 14.5%, 15.1%, 13.0% in
+BD-rate on the Kodak, Tecnick, and CLIC datasets.
+
+
+ comment: ICLR2024 poster +
+
+
+
+
+ + ♻ ☆ Exploiting Optical Flow Guidance for Transformer-Based Video Inpainting ECCV + 2022 + + +
+ Transformers have been widely used for video processing owing to the
+multi-head self attention (MHSA) mechanism. However, the MHSA mechanism
+encounters an intrinsic difficulty for video inpainting, since the features
+associated with the corrupted regions are degraded and incur inaccurate self
+attention. This problem, termed query degradation, may be mitigated by first
+completing optical flows and then using the flows to guide the self attention,
+which was verified in our previous work, the flow-guided transformer (FGT). We
+further exploit the flow guidance and propose FGT++ to pursue more effective
+and efficient video inpainting. First, we design a lightweight flow completion
+network using local aggregation and an edge loss. Second, to address the query
+degradation, we propose a flow guidance feature integration module, which uses
+the motion discrepancy to enhance the features, together with a flow-guided
+feature propagation module that warps the features according to the flows.
+Third, we decouple the transformer along the temporal and spatial dimensions,
+where flows are used to select the tokens through a temporally deformable MHSA
+mechanism, and global tokens are combined with the inner-window local tokens
+through a dual perspective MHSA mechanism. FGT++ is experimentally shown to
+outperform existing video inpainting networks both qualitatively and
+quantitatively.
+
+
+ comment: Accepted to TPAMI. This manuscript is a journal extension of our ECCV + 2022 paper (arXiv:2208.06768) +
+
+
+
+
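+ The flow-guided feature propagation step amounts to backward-warping neighboring-frame
+features to the current frame with optical flow. Below is a minimal, generic warping
+routine based on F.grid_sample; it is a standard building block, not the FGT++ code.
+```python
+import torch
+import torch.nn.functional as F
+
+def warp_with_flow(feat, flow):
+    """Backward-warp features with optical flow using bilinear sampling.
+
+    feat: (B, C, H, W) features of a neighboring frame
+    flow: (B, 2, H, W) flow in pixels; channel 0 is x (width), channel 1 is y
+    """
+    B, _, H, W = feat.shape
+    ys, xs = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
+    base = torch.stack((xs, ys), dim=0).float().to(feat.device)   # (2, H, W)
+    coords = base.unsqueeze(0) + flow                             # sample positions
+    # Normalize to [-1, 1] for grid_sample (x first, then y).
+    grid_x = 2.0 * coords[:, 0] / (W - 1) - 1.0
+    grid_y = 2.0 * coords[:, 1] / (H - 1) - 1.0
+    grid = torch.stack((grid_x, grid_y), dim=-1)                  # (B, H, W, 2)
+    return F.grid_sample(feat, grid, mode="bilinear",
+                         padding_mode="border", align_corners=True)
+
+feat = torch.randn(1, 64, 32, 32)
+flow = torch.zeros(1, 2, 32, 32)   # zero flow: output equals the input features
+print(torch.allclose(warp_with_flow(feat, flow), feat, atol=1e-5))   # True
+```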
+ + ♻ ☆ Fully Authentic Visual Question Answering Dataset from Online + Communities + + +
+ Visual Question Answering (VQA) entails answering questions about images. We
+introduce the first VQA dataset in which all contents originate from an
+authentic use case. Sourced from online question answering community forums, we
+call it VQAonline. We characterize this dataset and how it relates to eight
+mainstream VQA datasets. Observing that answers in our dataset tend to be much
+longer (i.e., a mean of 173 words) and thus incompatible with standard VQA
+evaluation metrics, we instead utilize popular metrics for long-text evaluation
+to evaluate six state-of-the-art VQA models on VQAonline and report where they
+struggle most. Finally, we analyze which evaluation metrics align best with
+human judgments. To facilitate future extensions, we publicly share the dataset
+at: https://vqaonline.github.io/.
+
+
+
+
+
+ + ♻ ☆ Key-point Guided Deformable Image Manipulation Using Diffusion Model + + +
+ In this paper, we introduce a Key-point-guided Diffusion probabilistic Model
+(KDM) that gains precise control over images by manipulating the object's
+key-point. We propose a two-stage generative model incorporating an optical
+flow map as an intermediate output. By doing so, a dense pixel-wise
+understanding of the semantic relation between the image and sparse key points
+is established, leading to more realistic image generation. Additionally, the
+integration of optical flow helps regulate the inter-frame variance of
+sequential images, enabling authentic sequential image generation. The KDM is
+evaluated with diverse key-point conditioned image synthesis tasks, including
+facial image generation, human pose synthesis, and echocardiography video
+prediction, demonstrating that KDM produces consistency-enhanced and
+photo-realistic images compared with state-of-the-art models.
+
+
+ comment: 24 pages +
+
+
+
+
+ + ♻ ☆ Towards Generalizing to Unseen Domains with Few Labels CVPR 2024 + + +
+ We address the challenge of semi-supervised domain generalization (SSDG).
+Specifically, our aim is to obtain a model that learns domain-generalizable
+features by leveraging a limited subset of labelled data alongside a
+substantially larger pool of unlabeled data. Existing domain generalization
+(DG) methods, which are unable to exploit unlabeled data, perform poorly
+compared to semi-supervised learning (SSL) methods under the SSDG setting.
+Nevertheless, SSL methods have considerable room for performance improvement
+when compared to fully-supervised DG training. To tackle this underexplored,
+yet highly practical problem of SSDG, we make the following core contributions.
+First, we propose a feature-based conformity technique that matches the
+posterior distributions from the feature space with the pseudo-label from the
+model's output space. Second, we develop a semantics alignment loss to learn
+semantically-compatible representations by regularizing the semantic structure
+in the feature space. Our method is plug-and-play and can be readily integrated
+with different SSL-based SSDG baselines without introducing any additional
+parameters. Extensive experimental results across five challenging DG
+benchmarks with four strong SSL baselines suggest that our method provides
+consistent and notable gains in two different SSDG settings.
+
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Chasing Day and Night: Towards Robust and Efficient All-Day Object + Detection Guided by an Event Camera ICRA 2024 + + +
+ The ability to detect objects in all lighting (i.e., normal-, over-, and
+under-exposed) conditions is crucial for real-world applications, such as
+self-driving. Traditional RGB-based detectors often fail under such varying
+lighting conditions. Therefore, recent works utilize novel event cameras to
+supplement or guide the RGB modality; however, these methods typically adopt
+asymmetric network structures that rely predominantly on the RGB modality,
+resulting in limited robustness for all-day detection. In this paper, we
+propose EOLO, a novel object detection framework that achieves robust and
+efficient all-day detection by fusing both RGB and event modalities. Our EOLO
+framework is built on a lightweight spiking neural network (SNN) to
+efficiently leverage the asynchronous property of events. Buttressed by it, we
+first introduce an Event Temporal Attention (ETA) module to learn the high
+temporal information from events while preserving crucial edge information.
+Secondly, as different modalities exhibit varying levels of importance under
+diverse lighting conditions, we propose a novel Symmetric RGB-Event Fusion
+(SREF) module to effectively fuse RGB-Event features without relying on a
+specific modality, thus ensuring a balanced and adaptive fusion for all-day
+detection. In addition, to compensate for the lack of paired RGB-Event datasets
+for all-day training and evaluation, we propose an event synthesis approach
+based on the randomized optical flow that allows for directly generating the
+event frame from a single exposure image. We further build two new datasets,
+E-MSCOCO and E-VOC, based on the popular benchmarks MSCOCO and PASCAL VOC.
+Extensive experiments demonstrate that our EOLO outperforms the
+state-of-the-art detectors, e.g., RENet, by a substantial margin (+3.74% mAP50)
+in all lighting conditions. Our code and datasets will be available at
+https://vlislab22.github.io/EOLO/
+
+
+ comment: Accepted by ICRA 2024 +
+
+
+
+
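+ A crude sketch of synthesizing an event frame from a single exposure image: perturb the
+log-intensity image with a randomized motion (a random translation here, standing in for
+the paper's randomized optical flow) and threshold the log-intensity change into positive
+and negative polarities. The threshold and motion model are illustrative assumptions.
+```python
+import torch
+
+def synthesize_event_frame(img, max_shift=3, threshold=0.2):
+    """Toy event synthesis from one grayscale image with values in [0, 1].
+    Returns a (2, H, W) tensor of positive / negative event indicators."""
+    log_img = torch.log(img.clamp(min=1e-3))
+    dx, dy = [int(torch.randint(-max_shift, max_shift + 1, (1,))) for _ in range(2)]
+    shifted = torch.roll(log_img, shifts=(dy, dx), dims=(-2, -1))
+    diff = shifted - log_img                    # log-intensity change
+    pos = (diff > threshold).float()
+    neg = (diff < -threshold).float()
+    return torch.stack([pos, neg], dim=0)
+
+img = torch.rand(128, 128)
+events = synthesize_event_frame(img)
+print(events.shape, int(events.sum()))   # torch.Size([2, 128, 128]), event count
+```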
+ + ♻ ☆ GGRt: Towards Pose-free Generalizable 3D Gaussian Splatting in Real-time + + +
+ This paper presents GGRt, a novel approach to generalizable novel view +synthesis that alleviates the need for real camera poses, complexity in +processing high-resolution images, and lengthy optimization processes, thus +facilitating stronger applicability of 3D Gaussian Splatting (3D-GS) in +real-world scenarios. Specifically, we design a novel joint learning framework +that consists of an Iterative Pose Optimization Network (IPO-Net) and a +Generalizable 3D-Gaussians (G-3DG) model. With the joint learning mechanism, +the proposed framework can inherently estimate robust relative pose information +from the image observations and thus primarily alleviate the requirement of +real camera poses. Moreover, we implement a deferred back-propagation mechanism +that enables high-resolution training and inference, overcoming the resolution +constraints of previous methods. To enhance the speed and efficiency, we +further introduce a progressive Gaussian cache module that dynamically adjusts +during training and inference. As the first pose-free generalizable 3D-GS +framework, GGRt achieves inference at $\ge$ 5 FPS and real-time rendering at +$\ge$ 100 FPS. Through extensive experimentation, we demonstrate that our +method outperforms existing NeRF-based pose-free techniques in terms of +inference speed and effectiveness. It can also approach the real pose-based +3D-GS methods. Our contributions provide a significant leap forward for the +integration of computer vision and computer graphics into practical +applications, offering state-of-the-art results on LLFF, KITTI, and Waymo Open +datasets and enabling real-time rendering for immersive experiences. + +
+
+ comment: Project page: + \href{https://3d-aigc.github.io/GGRt}{https://3d-aigc.github.io/GGRt} +
+
+
+
+
+ + ♻ ☆ Aria-NeRF: Multimodal Egocentric View Synthesis + + +
+ We seek to accelerate research in developing rich, multimodal scene models +trained from egocentric data, based on differentiable volumetric ray-tracing +inspired by Neural Radiance Fields (NeRFs). The construction of a NeRF-like +model from an egocentric image sequence plays a pivotal role in understanding +human behavior and holds diverse applications within the realms of VR/AR. Such +egocentric NeRF-like models may be used as realistic simulations, contributing +significantly to the advancement of intelligent agents capable of executing +tasks in the real-world. The future of egocentric view synthesis may lead to +novel environment representations going beyond today's NeRFs by augmenting +visual data with multimodal sensors such as IMU for egomotion tracking, audio +sensors to capture surface texture and human language context, and eye-gaze +trackers to infer human attention patterns in the scene. To support and +facilitate the development and evaluation of egocentric multimodal scene +modeling, we present a comprehensive multimodal egocentric video dataset. This +dataset offers a comprehensive collection of sensory data, featuring RGB +images, eye-tracking camera footage, audio recordings from a microphone, +atmospheric pressure readings from a barometer, positional coordinates from +GPS, connectivity details from Wi-Fi and Bluetooth, and information from +dual-frequency IMU datasets (1kHz and 800Hz) paired with a magnetometer. The +dataset was collected with the Meta Aria Glasses wearable device platform. The +diverse data modalities and the real-world context captured within this dataset +serve as a robust foundation for furthering our understanding of human behavior +and enabling more immersive and intelligent experiences in the realms of VR, +AR, and robotics. + +
+
+
+
+
+ + ♻ ☆ Securely Fine-tuning Pre-trained Encoders Against Adversarial Examples + + +
+ With the evolution of self-supervised learning, the pre-training paradigm has +emerged as a predominant solution within the deep learning landscape. Model +providers furnish pre-trained encoders designed to function as versatile +feature extractors, enabling downstream users to harness the benefits of +expansive models with minimal effort through fine-tuning. Nevertheless, recent +works have exposed a vulnerability in pre-trained encoders, highlighting their +susceptibility to downstream-agnostic adversarial examples (DAEs) meticulously +crafted by attackers. The lingering question pertains to the feasibility of +fortifying the robustness of downstream models against DAEs, particularly in +scenarios where the pre-trained encoders are publicly accessible to the +attackers. + In this paper, we initially delve into existing defensive mechanisms against +adversarial examples within the pre-training paradigm. Our findings reveal that +the failure of current defenses stems from the domain shift between +pre-training data and downstream tasks, as well as the sensitivity of encoder +parameters. In response to these challenges, we propose Genetic +Evolution-Nurtured Adversarial Fine-tuning (Gen-AF), a two-stage adversarial +fine-tuning approach aimed at enhancing the robustness of downstream models. +Our extensive experiments, conducted across ten self-supervised training +methods and six datasets, demonstrate that Gen-AF attains high testing accuracy +and robust testing accuracy against state-of-the-art DAEs. + +
+
+
+
+
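+ Gen-AF itself is a two-stage, genetic-evolution-guided procedure that is not reproduced
+here; the sketch below only shows the plain PGD-based adversarial fine-tuning loop that
+such defenses build on: craft adversarial examples against the current downstream model,
+then take a training step on them. The toy encoder and head are placeholders.
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+def pgd_attack(model, x, y, eps=8 / 255, alpha=2 / 255, steps=5):
+    """Standard L-infinity PGD for crafting training-time adversarial examples."""
+    x_adv = (x + torch.empty_like(x).uniform_(-eps, eps)).clamp(0, 1)
+    for _ in range(steps):
+        x_adv = x_adv.detach().requires_grad_(True)
+        loss = F.cross_entropy(model(x_adv), y)
+        grad = torch.autograd.grad(loss, x_adv)[0]
+        x_adv = x_adv + alpha * grad.sign()
+        x_adv = (x + (x_adv - x).clamp(-eps, eps)).clamp(0, 1)
+    return x_adv.detach()
+
+# Toy stand-ins for a pre-trained encoder and a downstream head.
+encoder = nn.Sequential(nn.Conv2d(3, 16, 3, 2, 1), nn.ReLU(), nn.Flatten())
+head = nn.Linear(16 * 16 * 16, 10)
+model = nn.Sequential(encoder, head)
+opt = torch.optim.SGD(model.parameters(), lr=1e-3)
+
+x, y = torch.rand(4, 3, 32, 32), torch.randint(0, 10, (4,))
+x_adv = pgd_attack(model, x, y)
+opt.zero_grad()
+F.cross_entropy(model(x_adv), y).backward()   # adversarial fine-tuning step
+opt.step()
+```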
+ + ♻ ☆ GenCorres: Consistent Shape Matching via Coupled Implicit-Explicit Shape + Generative Models ICLR 2024 + + +
+ This paper introduces GenCorres, a novel unsupervised joint shape matching
+(JSM) approach. Our key idea is to learn a mesh generator to fit an unorganized
+deformable shape collection while constraining deformations between adjacent
+synthetic shapes to preserve geometric structures such as local rigidity and
+local conformality. GenCorres presents three appealing advantages over existing
+JSM techniques. First, GenCorres performs JSM among a synthetic shape
+collection whose size is much bigger than that of the input shape collection
+and fully leverages the data-driven power of JSM. Second, GenCorres unifies
+consistent shape matching and pairwise matching (i.e., by enforcing deformation
+priors between adjacent synthetic shapes). Third, the generator provides a
+concise encoding of consistent shape correspondences. However, learning a mesh
+generator from an unorganized shape collection is challenging, requiring a good
+initialization. GenCorres addresses this issue by learning an implicit
+generator from the input shapes, which provides intermediate shapes between two
+arbitrary shapes. We introduce a novel approach for computing correspondences
+between adjacent implicit surfaces, which we use to regularize the implicit
+generator. Synthetic shapes of the implicit generator then guide initial
+fittings (i.e., via template-based deformation) for learning the mesh
+generator. Experimental results show that GenCorres considerably outperforms
+state-of-the-art JSM techniques. The synthetic shapes of GenCorres also achieve
+salient performance gains against state-of-the-art deformable shape generators.
+
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Efficient Diffusion-Driven Corruption Editor for Test-Time Adaptation + + +
+ Test-time adaptation (TTA) addresses the unforeseen distribution shifts
+occurring during test time. In TTA, performance as well as memory and time
+consumption are crucial considerations. A recent diffusion-based TTA approach
+for restoring corrupted images involves image-level updates. However, using
+pixel space diffusion significantly increases resource requirements compared to
+conventional model updating TTA approaches, revealing limitations as a TTA
+method. To address this, we propose a novel TTA method by leveraging a latent
+diffusion model (LDM) based image editing model and fine-tuning it with our
+newly introduced corruption modeling scheme. This scheme enhances the
+robustness of the diffusion model against distribution shifts by creating
+(clean, corrupted) image pairs and fine-tuning the model to edit corrupted
+images into clean ones. Moreover, we introduce a distilled variant to
+accelerate the model for corruption editing using only 4 network function
+evaluations (NFEs). We extensively validated our method across various
+architectures and datasets including image and video domains. Our model
+achieves the best performance with a 100 times faster runtime than that of a
+diffusion-based baseline. Furthermore, it is three times faster than the
+data-augmentation-based model-updating TTA method, rendering an image-level
+updating approach more practical.
+
+
+
+
+
+ + ♻ ☆ Interactive Continual Learning: Fast and Slow Thinking CVPR 2024 + + +
+ Advanced life forms, sustained by the synergistic interaction of neural +cognitive mechanisms, continually acquire and transfer knowledge throughout +their lifespan. In contrast, contemporary machine learning paradigms exhibit +limitations in emulating the facets of continual learning (CL). Nonetheless, +the emergence of large language models (LLMs) presents promising avenues for +realizing CL via interactions with these models. Drawing on Complementary +Learning System theory, this paper presents a novel Interactive Continual +Learning (ICL) framework, enabled by collaborative interactions among models of +various sizes. Specifically, we assign the ViT model as System1 and multimodal +LLM as System2. To enable the memory module to deduce tasks from class +information and enhance Set2Set retrieval, we propose the Class-Knowledge-Task +Multi-Head Attention (CKT-MHA). Additionally, to improve memory retrieval in +System1 through enhanced geometric representation, we introduce the CL-vMF +mechanism, based on the von Mises-Fisher (vMF) distribution. Meanwhile, we +introduce the von Mises-Fisher Outlier Detection and Interaction (vMF-ODI) +strategy to identify hard examples, thus enhancing collaboration between +System1 and System2 for complex reasoning realization. Comprehensive evaluation +of our proposed ICL demonstrates significant resistance to forgetting and +superior performance relative to existing methods. Code is available at +github.com/ICL. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ MoreStyle: Relax Low-frequency Constraint of Fourier-based Image + Reconstruction in Generalizable Medical Image Segmentation + + +
+ The task of single-source domain generalization (SDG) in medical image +segmentation is crucial due to frequent domain shifts in clinical image +datasets. To address the challenge of poor generalization across different +domains, we introduce a Plug-and-Play module for data augmentation called +MoreStyle. MoreStyle diversifies image styles by relaxing low-frequency +constraints in Fourier space, guiding the image reconstruction network. With +the help of adversarial learning, MoreStyle further expands the style range and +pinpoints the most intricate style combinations within latent features. To +handle significant style variations, we introduce an uncertainty-weighted loss. +This loss emphasizes hard-to-classify pixels resulting only from style shifts +while mitigating true hard-to-classify pixels in both MoreStyle-generated and +original images. Extensive experiments on two widely used benchmarks +demonstrate that the proposed MoreStyle effectively helps to achieve good +domain generalization ability, and has the potential to further boost the +performance of some state-of-the-art SDG methods. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
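+ The core recipe of relaxing low-frequency constraints in Fourier space can be sketched
+as follows: split an image into amplitude and phase, randomly perturb the centered
+low-frequency amplitude region, and reconstruct. The mask size and noise model are
+illustrative; MoreStyle additionally learns the perturbation adversarially together with
+the reconstruction network.
+```python
+import torch
+
+def fourier_low_freq_perturb(img, beta=0.1, strength=0.5):
+    """Perturb the low-frequency amplitude of an image (C, H, W) in [0, 1].
+    beta sets the relative size of the centered low-frequency square and
+    strength the multiplicative noise amplitude (illustrative values)."""
+    C, H, W = img.shape
+    spec = torch.fft.fftshift(torch.fft.fft2(img), dim=(-2, -1))
+    amp, phase = spec.abs(), spec.angle()
+    h, w = int(beta * H), int(beta * W)
+    cy, cx = H // 2, W // 2
+    noise = 1.0 + strength * (torch.rand(C, 2 * h, 2 * w) - 0.5)
+    amp[:, cy - h:cy + h, cx - w:cx + w] *= noise
+    spec_new = torch.polar(amp, phase)
+    out = torch.fft.ifft2(torch.fft.ifftshift(spec_new, dim=(-2, -1))).real
+    return out.clamp(0, 1)
+
+img = torch.rand(3, 128, 128)
+print(fourier_low_freq_perturb(img).shape)   # torch.Size([3, 128, 128])
+```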
+ + ♻ ☆ WIA-LD2ND: Wavelet-based Image Alignment for Self-supervised Low-Dose CT + Denoising + + +
+ In clinical examinations and diagnoses, low-dose computed tomography (LDCT)
+is crucial for minimizing health risks compared with normal-dose computed
+tomography (NDCT). However, reducing the radiation dose compromises the
+signal-to-noise ratio, leading to degraded quality of CT images. To address
+this, we analyze the LDCT denoising task from a frequency perspective based on
+experimental results, and then introduce a novel self-supervised CT image
+denoising method called WIA-LD2ND, only using NDCT data. The proposed WIA-LD2ND
+comprises two modules: Wavelet-based Image Alignment (WIA) and Frequency-Aware
+Multi-scale Loss (FAM). First, WIA is introduced to align NDCT with LDCT by
+mainly adding noise to the high-frequency components, which is the main
+difference between LDCT and NDCT. Second, to better capture high-frequency
+components and detailed information, the Frequency-Aware Multi-scale Loss (FAM)
+is proposed to effectively utilize the multi-scale feature space. Extensive
+experiments on two public LDCT denoising datasets demonstrate that our
+WIA-LD2ND, which only uses NDCT, outperforms several existing state-of-the-art
+weakly-supervised and self-supervised methods.
+
+
+ comment: 12 pages, 5 figures +
+
+
+
+
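+ The wavelet-based alignment idea (making NDCT resemble LDCT by adding noise mainly to
+high-frequency components) can be sketched with PyWavelets: decompose a slice, add
+Gaussian noise to the detail subbands only, and reconstruct. The wavelet choice and noise
+model are assumptions, not the paper's exact scheme.
+```python
+import numpy as np
+import pywt
+
+def wavelet_high_freq_noise(ndct, wavelet="haar", sigma=0.05, seed=0):
+    """Pseudo-LDCT from an NDCT slice (2D array in [0, 1]) by injecting noise
+    into the detail (high-frequency) wavelet subbands only."""
+    rng = np.random.default_rng(seed)
+    cA, (cH, cV, cD) = pywt.dwt2(ndct, wavelet)
+    noisy_details = tuple(c + rng.normal(0.0, sigma, c.shape) for c in (cH, cV, cD))
+    return np.clip(pywt.idwt2((cA, noisy_details), wavelet), 0.0, 1.0)
+
+ndct = np.random.rand(256, 256)
+print(wavelet_high_freq_noise(ndct).shape)   # (256, 256)
+```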
+ + ♻ ☆ Uncertainty-Aware Adapter: Adapting Segment Anything Model (SAM) for + Ambiguous Medical Image Segmentation + + +
+ The Segment Anything Model (SAM) gained significant success in natural image
+segmentation, and many methods have tried to fine-tune it for medical image
+segmentation. An efficient way to do so is by using Adapters, specialized
+modules that learn just a few parameters to tailor SAM specifically for medical
+images. However, unlike natural images, many tissues and lesions in medical
+images have blurry boundaries and may be ambiguous. Previous efforts to adapt
+SAM ignore this challenge and can only predict distinct segmentations. This may
+mislead clinicians or cause misdiagnosis, especially when encountering rare
+variants or situations with low model confidence. In this work, we propose a
+novel module called the Uncertainty-aware Adapter, which efficiently fine-tunes
+SAM for uncertainty-aware medical image segmentation. Utilizing a conditional
+variational autoencoder, we encoded stochastic samples to effectively represent
+the inherent uncertainty in medical imaging. We designed a new module on top of
+a standard adapter that utilizes a condition-based strategy to interact with
+samples to help SAM integrate uncertainty. We evaluated our method on two
+multi-annotated datasets with different modalities: LIDC-IDRI (lung
+abnormalities segmentation) and REFUGE2 (optic-cup segmentation). The
+experimental results show that the proposed model outperforms all the previous
+methods and achieves the new state-of-the-art (SOTA) on both benchmarks. We
+also demonstrated that our method can generate diverse segmentation hypotheses
+that are more realistic as well as heterogeneous.
+
+
+
+
+
+ + ♻ ☆ Improving Visual Quality and Transferability of Adversarial Attacks on + Face Recognition Simultaneously with Adversarial Restoration + + +
+ Adversarial face examples possess two critical properties: Visual Quality and +Transferability. However, existing approaches rarely address these properties +simultaneously, leading to subpar results. To address this issue, we propose a +novel adversarial attack technique known as Adversarial Restoration +(AdvRestore), which enhances both visual quality and transferability of +adversarial face examples by leveraging a face restoration prior. In our +approach, we initially train a Restoration Latent Diffusion Model (RLDM) +designed for face restoration. Subsequently, we employ the inference process of +RLDM to generate adversarial face examples. The adversarial perturbations are +applied to the intermediate features of RLDM. Additionally, by treating RLDM +face restoration as a sibling task, the transferability of the generated +adversarial face examples is further improved. Our experimental results +validate the effectiveness of the proposed attack method. + +
+
+ comment: \copyright 2023 IEEE. Personal use of this material is permitted. + Permission from IEEE must be obtained for all other uses, in any current or + future media, including reprinting/republishing this material for advertising + or promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works +
+
+
+
+
+ + ♻ ☆ DiffPortrait3D: Controllable Diffusion for Zero-Shot Portrait View + Synthesis + + +
+ We present DiffPortrait3D, a conditional diffusion model that is capable of
+synthesizing 3D-consistent photo-realistic novel views from as few as a single
+in-the-wild portrait. Specifically, given a single RGB input, we aim to
+synthesize plausible but consistent facial details rendered from novel camera
+views while retaining both identity and facial expression. In lieu of
+time-consuming optimization and fine-tuning, our zero-shot method generalizes
+well to arbitrary face portraits with unposed camera views, extreme facial
+expressions, and diverse artistic depictions. At its core, we leverage the
+generative prior of 2D diffusion models pre-trained on large-scale image
+datasets as our rendering backbone, while the denoising is guided with
+disentangled attentive control of appearance and camera pose. To achieve this,
+we first inject the appearance context from the reference image into the
+self-attention layers of the frozen UNets. The rendering view is then
+manipulated with a novel conditional control module that interprets the camera
+pose by watching a condition image of a crossed subject from the same view.
+Furthermore, we insert a trainable cross-view attention module to enhance view
+consistency, which is further strengthened with a novel 3D-aware noise
+generation process during inference. We demonstrate state-of-the-art results
+both qualitatively and quantitatively on our challenging in-the-wild and
+multi-view benchmarks.
+
+
+
+
+
+ + ♻ ☆ Mitigating Hallucination in Large Multi-Modal Models via Robust + Instruction Tuning ICLR 2024 + + +
+ Despite the promising progress in multi-modal tasks, current large +multi-modal models (LMMs) are prone to hallucinating inconsistent descriptions +with respect to the associated image and human instructions. This paper +addresses this issue by introducing the first large and diverse visual +instruction tuning dataset, named Large-scale Robust Visual (LRV)-Instruction. +Our dataset comprises 400k visual instructions generated by GPT4, covering 16 +vision-and-language tasks with open-ended instructions and answers. Unlike +existing studies that primarily focus on positive instruction samples, we +design LRV-Instruction to include both positive and negative instructions for +more robust visual instruction tuning. Our negative instructions are designed +at three semantic levels: (i) Nonexistent Object Manipulation, (ii) Existent +Object Manipulation and (iii) Knowledge Manipulation. To efficiently measure +the hallucination generated by LMMs, we propose GPT4-Assisted Visual +Instruction Evaluation (GAVIE), a stable approach to evaluate visual +instruction tuning like human experts. GAVIE does not require human-annotated +groundtruth answers and can adapt to diverse instruction formats. We conduct +comprehensive experiments to investigate the hallucination of LMMs. Our results +demonstrate existing LMMs exhibit significant hallucinations when presented +with our negative instructions, particularly Existent Object and Knowledge +Manipulation instructions. Moreover, we successfully mitigate hallucination by +finetuning MiniGPT4 and mPLUG-Owl on LRV-Instruction while improving +performance on several public datasets compared to state-of-the-art methods. +Additionally, we observed that a balanced ratio of positive and negative +instances in the training data leads to a more robust model. Code and data are +available at https://github.com/FuxiaoLiu/LRV-Instruction. + +
+
+ comment: 40 pages, 32 figures, ICLR 2024 +
+
+
+
+
+ + ♻ ☆ DREAM: Diffusion Rectification and Estimation-Adaptive Models + + +
+ We present DREAM, a novel training framework representing Diffusion +Rectification and Estimation Adaptive Models, requiring minimal code changes +(just three lines) yet significantly enhancing the alignment of training with +sampling in diffusion models. DREAM features two components: diffusion +rectification, which adjusts training to reflect the sampling process, and +estimation adaptation, which balances perception against distortion. When +applied to image super-resolution (SR), DREAM adeptly navigates the tradeoff +between minimizing distortion and preserving high image quality. Experiments +demonstrate DREAM's superiority over standard diffusion-based SR methods, +showing a $2$ to $3\times $ faster training convergence and a $10$ to +$20\times$ reduction in sampling steps to achieve comparable results. We hope +DREAM will inspire a rethinking of diffusion model training paradigms. + +
+
+ comment: 16 pages, 22 figures, 5 tables; the first two authors contributed to + this work equally +
+
+
+
+
+ + ♻ ☆ Instant Uncertainty Calibration of NeRFs Using a Meta-calibrator + + +
+ Although Neural Radiance Fields (NeRFs) have markedly improved novel view +synthesis, accurate uncertainty quantification in their image predictions +remains an open problem. The prevailing methods for estimating uncertainty, +including the state-of-the-art Density-aware NeRF Ensembles (DANE) [29], +quantify uncertainty without calibration. This frequently leads to over- or +under-confidence in image predictions, which can undermine their real-world +applications. In this paper, we propose a method which, for the first time, +achieves calibrated uncertainties for NeRFs. To accomplish this, we overcome a +significant challenge in adapting existing calibration techniques to NeRFs: a +need to hold out ground truth images from the target scene, reducing the number +of images left to train the NeRF. This issue is particularly problematic in +sparse-view settings, where we can operate with as few as three images. To +address this, we introduce the concept of a meta-calibrator that performs +uncertainty calibration for NeRFs with a single forward pass without the need +for holding out any images from the target scene. Our meta-calibrator is a +neural network that takes as input the NeRF images and uncalibrated uncertainty +maps and outputs a scene-specific calibration curve that corrects the NeRF's +uncalibrated uncertainties. We show that the meta-calibrator can generalize on +unseen scenes and achieves well-calibrated and state-of-the-art uncertainty for +NeRFs, significantly beating DANE and other approaches. This opens +opportunities to improve applications that rely on accurate NeRF uncertainty +estimates such as next-best view planning and potentially more trustworthy +image reconstruction for medical diagnosis. + +
+
+
+
+
+ + ♻ ☆ Measuring and Improving Chain-of-Thought Reasoning in Vision-Language + Models NAACL 2024 + + +
+ Vision-language models (VLMs) have recently demonstrated strong efficacy as +visual assistants that can parse natural queries about the visual content and +generate human-like outputs. In this work, we explore the ability of these +models to demonstrate human-like reasoning based on the perceived information. +To address a crucial concern regarding the extent to which their reasoning +capabilities are fully consistent and grounded, we also measure the reasoning +consistency of these models. We achieve this by proposing a chain-of-thought +(CoT) based consistency measure. However, such an evaluation requires a +benchmark that encompasses both high-level inference and detailed reasoning +chains, which is costly. We tackle this challenge by proposing a +LLM-Human-in-the-Loop pipeline, which notably reduces cost while simultaneously +ensuring the generation of a high-quality dataset. Based on this pipeline and +the existing coarse-grained annotated dataset, we build the CURE benchmark to +measure both the zero-shot reasoning performance and consistency of VLMs. We +evaluate existing state-of-the-art VLMs, and find that even the best-performing +model is unable to demonstrate strong visual reasoning capabilities and +consistency, indicating that substantial efforts are required to enable VLMs to +perform visual reasoning as systematically and consistently as humans. As an +early step, we propose a two-stage training framework aimed at improving both +the reasoning performance and consistency of VLMs. The first stage involves +employing supervised fine-tuning of VLMs using step-by-step reasoning samples +automatically generated by LLMs. In the second stage, we further augment the +training process by incorporating feedback provided by LLMs to produce +reasoning chains that are highly consistent and grounded. We empirically +highlight the effectiveness of our framework in both reasoning performance and +consistency. + +
+
+ comment: NAACL 2024 Main Conference. The data is released at + https://github.com/Yangyi-Chen/CoTConsistency +
+
+
+
+
+ + ♻ ☆ DermSynth3D: Synthesis of in-the-wild Annotated Dermatology Images + + +
+ In recent years, deep learning (DL) has shown great potential in the field of +dermatological image analysis. However, existing datasets in this domain have +significant limitations, including a small number of image samples, limited +disease conditions, insufficient annotations, and non-standardized image +acquisitions. To address these shortcomings, we propose a novel framework +called DermSynth3D. DermSynth3D blends skin disease patterns onto 3D textured +meshes of human subjects using a differentiable renderer and generates 2D +images from various camera viewpoints under chosen lighting conditions in +diverse background scenes. Our method adheres to top-down rules that constrain +the blending and rendering process to create 2D images with skin conditions +that mimic in-the-wild acquisitions, ensuring more meaningful results. The +framework generates photo-realistic 2D dermoscopy images and the corresponding +dense annotations for semantic segmentation of the skin, skin conditions, body +parts, bounding boxes around lesions, depth maps, and other 3D scene +parameters, such as camera position and lighting conditions. DermSynth3D allows +for the creation of custom datasets for various dermatology tasks. We +demonstrate the effectiveness of data generated using DermSynth3D by training +DL models on synthetic data and evaluating them on various dermatology tasks +using real 2D dermatological images. We make our code publicly available at +https://github.com/sfu-mial/DermSynth3D. + +
+
+ comment: Accepted to Medical Image Analysis (MedIA) 2024 +
+
+
+
+
+ + ♻ ☆ OSDaR23: Open Sensor Data for Rail 2023 + + +
+ To achieve a driverless train operation on mainline railways, actual and +potential obstacles for the train's driveway must be detected automatically by +appropriate sensor systems. Machine learning algorithms have proven to be +powerful tools for this task during the last years. However, these algorithms +require large amounts of high-quality annotated data containing +railway-specific objects as training data. Unfortunately, all of the publicly +available datasets that tackle this requirement are restricted in some way. +Therefore, this paper presents OSDaR23, a multi-sensor dataset of 45 +subsequences acquired in Hamburg, Germany, in September 2021, that was created +to foster driverless train operation on mainline railways. The sensor setup +consists of multiple calibrated and synchronized infrared (IR) and visual (RGB) +cameras, lidars, a radar, and position and acceleration sensors mounted on the +front of a rail vehicle. In addition to the raw data, the dataset contains +204091 polyline, polygonal, rectangle, and cuboid annotations in total for 20 +different object classes. It is the first publicly available multi-sensor +dataset annotated with a variety of object classes that are relevant for the +railway context. OSDaR23, available at data.fid-move.de/dataset/osdar23, can +also be used for tasks beyond collision prediction, which are listed in this +paper. + +
+
+ comment: 7 pages, 11 images, 5 tables +
+
+
+
+
+ + ♻ ☆ Semi-supervised Active Learning for Video Action Detection AAAI + + +
+ In this work, we focus on label efficient learning for video action
+detection. We develop a novel semi-supervised active learning approach which
+utilizes both labeled and unlabeled data along with informative sample
+selection for action detection. Video action detection requires spatio-temporal
+localization along with classification, which poses several challenges for both
+active learning informative sample selection as well as semi-supervised
+learning pseudo label generation. First, we propose NoiseAug, a simple
+augmentation strategy which effectively selects informative samples for video
+action detection. Next, we propose fft-attention, a novel technique based on
+high-pass filtering which enables effective utilization of pseudo labels for
+SSL in video action detection by emphasizing relevant activity regions within a
+video. We evaluate the proposed approach on three different benchmark datasets,
+UCF-101-24, JHMDB-21, and Youtube-VOS. First, we demonstrate its effectiveness
+on video action detection where the proposed approach outperforms prior works
+in semi-supervised and weakly-supervised learning along with several baseline
+approaches in both UCF101-24 and JHMDB-21. Next, we also show its effectiveness
+on Youtube-VOS for video object segmentation, demonstrating its generalization
+capability to other dense prediction tasks in videos.
+
+
+ comment: AAAI Conference on Artificial Intelligence, Main Technical Track + (AAAI), 2024 +
+
+
+
+
+ + ♻ ☆ Open Stamped Parts Dataset + + +
+ We present the Open Stamped Parts Dataset (OSPD), featuring synthetic and +real images of stamped metal sheets for auto manufacturing. The real part +images, captured from 7 cameras, consist of 7,980 unlabeled images and 1,680 +labeled images. In addition, we have compiled a defect dataset by overlaying +synthetically generated masks on 10% of the holes. The synthetic dataset +replicates the real manufacturing environment in terms of lighting and part +placement relative to the cameras. The synthetic data includes 7,980 training +images, 1,680 validation images and 1,680 test images, each with bounding box +and segmentation mask annotations around all holes. 10% of the holes in the +synthetic data mimic defects generated in the real image dataset. We trained a +hole-detection model on the synthetic-OSPD, achieving a modified recall score +of 67.2% and a precision of 94.4%. We anticipate that researchers in the auto +manufacturing and broader machine learning and computer vision communities +will use OSPD to advance the state of the art in defect detection of stamped holes +in the metal-sheet stamping process. The dataset is available for download at: +https://tinyurl.com/hm6xatd7 + +
+
+ comment: 6 pages, 7 figures, 2 tables +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 270 + +
+
+
+ + ☆ Implicit Discriminative Knowledge Learning for Visible-Infrared Person + Re-Identification CVPR 2024 + + +
+ Visible-Infrared Person Re-identification (VI-ReID) is a challenging +cross-modal pedestrian retrieval task, due to significant intra-class +variations and cross-modal discrepancies among different cameras. Existing +works mainly focus on embedding images of different modalities into a unified +space to mine modality-shared features. They only seek distinctive information +within these shared features, while ignoring the identity-aware useful +information that is implicit in the modality-specific features. To address this +issue, we propose a novel Implicit Discriminative Knowledge Learning (IDKL) +network to uncover and leverage the implicit discriminative information +contained within the modality-specific. First, we extract modality-specific and +modality-shared features using a novel dual-stream network. Then, the +modality-specific features undergo purification to reduce their modality style +discrepancies while preserving identity-aware discriminative knowledge. +Subsequently, this kind of implicit knowledge is distilled into the +modality-shared feature to enhance its distinctiveness. Finally, an alignment +loss is proposed to minimize modality discrepancy on enhanced modality-shared +features. Extensive experiments on multiple public datasets demonstrate the +superiority of IDKL network over the state-of-the-art methods. Code is +available at https://github.com/1KK077/IDKL. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ LLaVA-UHD: an LMM Perceiving Any Aspect Ratio and High-Resolution Images + + +
+ Visual encoding constitutes the basis of large multimodal models (LMMs) in +understanding the visual world. Conventional LMMs process images in fixed sizes +and limited resolutions, while recent explorations in this direction are +limited in adaptivity, efficiency, and even correctness. In this work, we first +take GPT-4V and LLaVA-1.5 as representative examples and expose systematic +flaws rooted in their visual encoding strategy. To address the challenges, we +present LLaVA-UHD, a large multimodal model that can efficiently perceive +images in any aspect ratio and high resolution. LLaVA-UHD includes three key +components: (1) An image modularization strategy that divides native-resolution +images into smaller variable-sized slices for efficient and extensible +encoding, (2) a compression module that further condenses image tokens from +visual encoders, and (3) a spatial schema to organize slice tokens for LLMs. +Comprehensive experiments show that LLaVA-UHD outperforms established LMMs +trained with 2-3 orders of magnitude more data on 9 benchmarks. Notably, our +model built on LLaVA-1.5 336x336 supports 6 times larger (i.e., 672x1088) +resolution images using only 94% inference computation, and achieves 6.4 +accuracy improvement on TextVQA. Moreover, the model can be efficiently trained +in academic settings, within 23 hours on 8 A100 GPUs (vs. 26 hours of +LLaVA-1.5). We make the data and code publicly available at +https://github.com/thunlp/LLaVA-UHD. + +
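A toy illustration of the kind of image modularization described above: pick a slice grid for a native-resolution image so that each variable-sized slice stays close to the encoder's base input resolution. The log-ratio cost heuristic and the 336 / 6-slice defaults are assumptions for illustration, not LLaVA-UHD's exact strategy.

```python
import math

def choose_grid(width: int, height: int, base: int = 336, max_slices: int = 6):
    """Pick a (cols, rows) grid so each slice is as close to base x base
    as possible while covering the full image (toy heuristic)."""
    best, best_cost = (1, 1), float("inf")
    for cols in range(1, max_slices + 1):
        for rows in range(1, max_slices + 1):
            if cols * rows > max_slices:
                continue
            slice_w, slice_h = width / cols, height / rows
            # cost: deviation of the slice shape from the encoder's square input
            cost = abs(math.log(slice_w / base)) + abs(math.log(slice_h / base))
            if cost < best_cost:
                best, best_cost = (cols, rows), cost
    return best

print(choose_grid(1088, 672))  # (3, 2) -> slices of roughly 363 x 336
```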
+
+ comment: Preprint +
+
+
+
+
+ + ☆ A Spatial-Temporal Progressive Fusion Network for Breast Lesion + Segmentation in Ultrasound Videos + + +
+ Ultrasound video-based breast lesion segmentation provides valuable +assistance in early breast lesion detection and treatment. However, existing +works mainly focus on lesion segmentation based on ultrasound breast images, +which usually cannot be adapted well to obtain desirable results on ultrasound +videos. The main challenge for ultrasound video-based breast lesion +segmentation is how to exploit both intra-frame and +inter-frame lesion cues simultaneously. To address this problem, we propose a novel +Spatial-Temporal Progressive Fusion Network (STPFNet) for the video-based breast +lesion segmentation problem. The main aspects of the proposed STPFNet are +threefold. First, we propose to adopt a unified network architecture to capture +both spatial dependencies within each ultrasound frame and temporal correlations +between different frames together for ultrasound data representation. Second, +we propose a new fusion module, termed Multi-Scale Feature Fusion (MSFF), to +fuse spatial and temporal cues together for lesion detection. MSFF can help to +determine the boundary contour of the lesion region to overcome the issue of lesion +boundary blurring. Third, we propose to exploit the segmentation result of the +previous frame as prior knowledge to suppress the noisy background and +learn a more robust representation. In particular, we introduce a new publicly +available ultrasound video breast lesion segmentation dataset, termed UVBLS200, +which is specifically dedicated to breast lesion segmentation. It contains 200 +videos, including 80 videos of benign lesions and 120 videos of malignant +lesions. Experiments on the proposed dataset demonstrate that the proposed +STPFNet achieves better breast lesion detection performance than +state-of-the-art methods. + +
+
+
+
+
+ + ☆ Urban Scene Diffusion through Semantic Occupancy Map + + +
+ Generating unbounded 3D scenes is crucial for large-scale scene understanding +and simulation. Urban scenes, unlike natural landscapes, consist of various +complex man-made objects and structures such as roads, traffic signs, vehicles, +and buildings. To create a realistic and detailed urban scene, it is crucial to +accurately represent the geometry and semantics of the underlying objects, +going beyond their visual appearance. In this work, we propose UrbanDiffusion, +a 3D diffusion model that is conditioned on a Bird's-Eye View (BEV) map and +generates an urban scene with geometry and semantics in the form of semantic +occupancy map. Our model introduces a novel paradigm that learns the data +distribution of scene-level structures within a latent space and further +enables the expansion of the synthesized scene into an arbitrary scale. After +training on real-world driving datasets, our model can generate a wide range of +diverse urban scenes given the BEV maps from the held-out set and also +generalize to the synthesized maps from a driving simulator. We further +demonstrate its application to scene image synthesis with a pretrained image +generator as a prior. + +
+
+ comment: The project website is https://metadriverse.github.io/urbandiff/ +
+
+
+
+
+ + ☆ TrajectoryNAS: A Neural Architecture Search for Trajectory Prediction + + +
+ Autonomous driving systems are a rapidly evolving technology that enables +driverless car production. Trajectory prediction is a critical component of +autonomous driving systems, enabling cars to anticipate the movements of +surrounding objects for safe navigation. Trajectory prediction using Lidar +point-cloud data performs better than using 2D images because it provides 3D +information. However, processing point-cloud data is more complicated and +time-consuming than 2D images. Hence, state-of-the-art 3D trajectory +predictions using point-cloud data suffer from slow and erroneous predictions. +This paper introduces TrajectoryNAS, a pioneering method that focuses on +utilizing point cloud data for trajectory prediction. By leveraging Neural +Architecture Search (NAS), TrajectoryNAS automates the design of trajectory +prediction models, encompassing object detection, tracking, and forecasting in +a cohesive manner. This approach not only addresses the complex +interdependencies among these tasks but also emphasizes the importance of +accuracy and efficiency in trajectory modeling. Through empirical studies, +TrajectoryNAS demonstrates its effectiveness in enhancing the performance of +autonomous driving systems, marking a significant advancement in the +field. Experimental results reveal that TrajectoryNAS yields a minimum of 4.8 +higher accuracy and 1.1x lower latency over competing methods on the NuScenes +dataset. + +
+
+
+
+
+ + ☆ Object Segmentation-Assisted Inter Prediction for Versatile Video Coding + + +
+ In modern video coding standards, block-based inter prediction is widely +adopted, which brings high compression efficiency. However, in natural videos, +there are usually multiple moving objects of arbitrary shapes, resulting in +complex motion fields that are difficult to compactly represent. This problem +has been tackled by more flexible block partitioning methods in the Versatile +Video Coding (VVC) standard, but the more flexible partitions require more +overhead bits to signal and still cannot be made arbitrarily shaped. To address +this limitation, we propose an object segmentation-assisted inter prediction +method (SAIP), where objects in the reference frames are segmented using +advanced techniques. With a proper indication, the object segmentation mask +is translated from the reference frame to the current frame as the +arbitrary-shaped partition of different regions without any extra signal. Using +the segmentation mask, motion compensation is separately performed for +different regions, achieving higher prediction accuracy. The segmentation mask +is further used to code the motion vectors of different regions more +efficiently. Moreover, the segmentation mask is considered in the joint +rate-distortion optimization for motion estimation and partition estimation to +derive the motion vectors and partitions of different regions more accurately. +The proposed method is implemented in the VVC reference software, VTM version +12.0. Experimental results show that the proposed method achieves up to 1.98%, +1.14%, 0.79%, and on average 0.82%, 0.49%, 0.37% BD-rate reduction for common +test sequences, under the Low-delay P, Low-delay B, and Random Access +configurations, respectively. + +
+
+ comment: 22 pages, 15 figures +
+
+
+
+
+ + ☆ TTT-KD: Test-Time Training for 3D Semantic Segmentation through + Knowledge Distillation from Foundation Models + + +
+ Test-Time Training (TTT) proposes to adapt a pre-trained network to changing +data distributions on-the-fly. In this work, we propose the first TTT method +for 3D semantic segmentation, TTT-KD, which models Knowledge Distillation (KD) +from foundation models (e.g. DINOv2) as a self-supervised objective for +adaptation to distribution shifts at test-time. Given access to paired +image-pointcloud (2D-3D) data, we first optimize a 3D segmentation backbone for +the main task of semantic segmentation using the pointclouds and the task of 2D +$\to$ 3D KD by using an off-the-shelf 2D pre-trained foundation model. At +test-time, our TTT-KD updates the 3D segmentation backbone for each test +sample, by using the self-supervised task of knowledge distillation, before +performing the final prediction. Extensive evaluations on multiple indoor and +outdoor 3D segmentation benchmarks show the utility of TTT-KD, as it improves +performance for both in-distribution (ID) and out-of-distribution (OOD) test +datasets. We achieve a gain of up to 13% mIoU (7% on average) when the train +and test distributions are similar and up to 45% (20% on average) when adapting +to OOD test samples. + +
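A schematic of the test-time adaptation loop described above: one (or a few) self-supervised 2D-to-3D distillation steps on the single test sample before predicting. The cosine distillation loss, the point-pixel pairing format, and the placeholder modules are assumptions; the paper's actual objective and backbone may differ.

```python
import torch
import torch.nn.functional as F

def ttt_kd_step(backbone3d, head, optimizer, points, feats2d, pix2point, steps: int = 1):
    """One test-time adaptation cycle for a single sample.

    points:    (N, F) input point cloud
    feats2d:   (M, D) frozen 2D foundation-model features, precomputed
    pix2point: (M,) indices mapping each 2D feature to its paired 3D point
    """
    backbone3d.train()
    for _ in range(steps):
        optimizer.zero_grad()
        feats3d = backbone3d(points)                        # (N, D) per-point features
        paired = feats3d[pix2point]                         # (M, D) features of paired points
        # self-supervised distillation: match the frozen 2D features
        loss = 1.0 - F.cosine_similarity(paired, feats2d, dim=-1).mean()
        loss.backward()
        optimizer.step()
    backbone3d.eval()
    with torch.no_grad():
        return head(backbone3d(points))                     # final semantic prediction

# Toy usage with placeholder modules standing in for the real networks.
backbone = torch.nn.Linear(3, 64)
head = torch.nn.Linear(64, 20)                              # 20 semantic classes
opt = torch.optim.SGD(backbone.parameters(), lr=1e-3)
pred = ttt_kd_step(backbone, head, opt,
                   torch.randn(1024, 3), torch.randn(256, 64),
                   torch.randint(0, 1024, (256,)))
print(pred.shape)  # torch.Size([1024, 20])
```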
+
+
+
+
+ + ☆ MoreStyle: Relax Low-frequency Constraint of Fourier-based Image + Reconstruction in Generalizable Medical Image Segmentation + + +
+ The task of single-source domain generalization (SDG) in medical image +segmentation is crucial due to frequent domain shifts in clinical image +datasets. To address the challenge of poor generalization across different +domains, we introduce a Plug-and-Play module for data augmentation called +MoreStyle. MoreStyle diversifies image styles by relaxing low-frequency +constraints in Fourier space, guiding the image reconstruction network. With +the help of adversarial learning, MoreStyle further expands the style range and +pinpoints the most intricate style combinations within latent features. To +handle significant style variations, we introduce an uncertainty-weighted loss. +This loss emphasizes hard-to-classify pixels resulting only from style shifts +while mitigating true hard-to-classify pixels in both MoreStyle-generated and +original images. Extensive experiments on two widely used benchmarks +demonstrate that the proposed MoreStyle effectively helps to achieve good +domain generalization ability, and has the potential to further boost the +performance of some state-of-the-art SDG methods. + +
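A generic Fourier-domain style augmentation in the spirit of relaxing low-frequency constraints: randomly rescale the low-frequency amplitude of an image while keeping its phase. This is a minimal sketch of the underlying mechanism, not MoreStyle's full adversarial style search; the radius and strength values are illustrative.

```python
import torch

def perturb_low_freq_amplitude(img: torch.Tensor, radius: int = 8, strength: float = 0.5) -> torch.Tensor:
    """Randomly rescale the low-frequency amplitude of a (B, C, H, W) image, keeping phase."""
    spec = torch.fft.fftshift(torch.fft.fft2(img), dim=(-2, -1))
    amp, phase = spec.abs(), spec.angle()
    B, C, H, W = img.shape
    cy, cx = H // 2, W // 2
    # random per-sample, per-channel scaling of the central (low-frequency) amplitude block
    scale = 1.0 + strength * (2 * torch.rand(B, C, 1, 1, device=img.device) - 1)
    amp[..., cy - radius:cy + radius, cx - radius:cx + radius] *= scale
    spec = torch.polar(amp, phase)
    return torch.fft.ifft2(torch.fft.ifftshift(spec, dim=(-2, -1))).real

styled = perturb_low_freq_amplitude(torch.rand(2, 1, 64, 64))
print(styled.shape)  # torch.Size([2, 1, 64, 64])
```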
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ MASSTAR: A Multi-Modal and Large-Scale Scene Dataset with a Versatile + Toolchain for Surface Prediction and Completion IROS2024 + + +
+ Surface prediction and completion have been widely studied in various +applications. Recently, research in surface completion has evolved from small +objects to complex large-scale scenes. As a result, researchers have begun +increasing the volume of data and leveraging a greater variety of data +modalities including rendered RGB images, descriptive texts, depth images, etc, +to enhance algorithm performance. However, existing datasets suffer from a +deficiency in the amounts of scene-level models along with the corresponding +multi-modal information. Therefore, a method to scale the datasets and generate +multi-modal information in them efficiently is essential. To bridge this +research gap, we propose MASSTAR: a Multi-modal lArge-scale Scene dataset with +a verSatile Toolchain for surfAce pRediction and completion. We develop a +versatile and efficient toolchain for processing the raw 3D data from the +environments. It screens out a set of fine-grained scene models and generates +the corresponding multi-modal data. Utilizing the toolchain, we then generate +an example dataset composed of over a thousand scene-level models with partial +real-world data added. We compare MASSTAR with the existing datasets, which +validates its superiority: the ability to efficiently extract high-quality +models from complex scenarios to expand the dataset. Additionally, several +representative surface completion algorithms are benchmarked on MASSTAR, which +reveals that existing algorithms can hardly deal with scene-level completion. +We will release the source code of our toolchain and the dataset. For more +details, please see our project page at https://sysu-star.github.io/MASSTAR. + +
+
+ comment: Submitted to IROS2024. Code: https://github.com/SYSU-STAR/MASSTAR. + Project Page: https://github.com/SYSU-STAR/MASSTAR +
+
+
+
+
+ + ☆ NEDS-SLAM: A Novel Neural Explicit Dense Semantic SLAM Framework using + 3D Gaussian Splatting + + +
+ We propose NEDS-SLAM, an Explicit Dense semantic SLAM system based on 3D +Gaussian representation, that enables robust 3D semantic mapping, accurate +camera tracking, and high-quality rendering in real-time. In the system, we +propose a Spatially Consistent Feature Fusion model to reduce the effect of +erroneous estimates from pre-trained segmentation head on semantic +reconstruction, achieving robust 3D semantic Gaussian mapping. Additionally, we +employ a lightweight encoder-decoder to compress the high-dimensional semantic +features into a compact 3D Gaussian representation, mitigating the burden of +excessive memory consumption. Furthermore, we leverage the advantage of 3D +Gaussian splatting, which enables efficient and differentiable novel view +rendering, and propose a Virtual Camera View Pruning method to eliminate +outlier GS points, thereby effectively enhancing the quality of scene +representations. Our NEDS-SLAM method demonstrates competitive performance over +existing dense semantic SLAM methods in terms of mapping and tracking accuracy +on Replica and ScanNet datasets, while also showing excellent capabilities in +3D dense semantic mapping. + +
+
+
+
+
+ + ☆ Exploring 3D-aware Latent Spaces for Efficiently Learning Numerous + Scenes + + +
+ We present a method enabling the scaling of NeRFs to learn a large number of +semantically-similar scenes. We combine two techniques to improve the required +training time and memory cost per scene. First, we learn a 3D-aware latent +space in which we train Tri-Plane scene representations, hence reducing the +resolution at which scenes are learned. Moreover, we present a way to share +common information across scenes, hence allowing for a reduction of model +complexity to learn a particular scene. Our method reduces effective per-scene +memory costs by 44% and per-scene time costs by 86% when training 1000 scenes. +Our project page can be found at https://3da-ae.github.io . + +
+
+
+
+
+ + ☆ Better (pseudo-)labels for semi-supervised instance segmentation ICLR 2024 + + +
+ Despite the availability of large datasets for tasks like image +classification and image-text alignment, labeled data for more complex +recognition tasks, such as detection and segmentation, is less abundant. In +particular, for instance segmentation annotations are time-consuming to +produce, and the distribution of instances is often highly skewed across +classes. While semi-supervised teacher-student distillation methods show +promise in leveraging vast amounts of unlabeled data, they suffer from +miscalibration, resulting in overconfidence in frequently represented classes +and underconfidence in rarer ones. Additionally, these methods encounter +difficulties in efficiently learning from a limited set of examples. We +introduce a dual-strategy to enhance the teacher model's training process, +substantially improving the performance on few-shot learning. Secondly, we +propose a calibration correction mechanism that that enables the student model +to correct the teacher's calibration errors. Using our approach, we observed +marked improvements over a state-of-the-art supervised baseline performance on +the LVIS dataset, with an increase of 2.8% in average precision (AP) and 10.3% +gain in AP for rare classes. + +
+
+ comment: Appeared at the Practical ML for Low Resource Settings workshop at + ICLR 2024 +
+
+
+
+
+ + ☆ Towards Generalizing to Unseen Domains with Few Labels CVPR 2024 + + +
+ We approach the challenge of addressing semi-supervised domain generalization +(SSDG). Specifically, our aim is to obtain a model that learns +domain-generalizable features by leveraging a limited subset of labelled data +alongside a substantially larger pool of unlabeled data. Existing domain +generalization (DG) methods which are unable to exploit unlabeled data perform +poorly compared to semi-supervised learning (SSL) methods under SSDG setting. +Nevertheless, SSL methods have considerable room for performance improvement +when compared to fully-supervised DG training. To tackle this underexplored, +yet highly practical problem of SSDG, we make the following core contributions. +First, we propose a feature-based conformity technique that matches the +posterior distributions from the feature space with the pseudo-label from the +model's output space. Second, we develop a semantics alignment loss to learn +semantically-compatible representations by regularizing the semantic structure +in the feature space. Our method is plug-and-play and can be readily integrated +with different SSL-based SSDG baselines without introducing any additional +parameters. Extensive experimental results across five challenging DG +benchmarks with four strong SSL baselines suggest that our method provides +consistent and notable gains in two different SSDG settings. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ WIA-LD2ND: Wavelet-based Image Alignment for Self-supervised Low-Dose CT + Denoising + + +
+ In clinical examinations and diagnoses, low-dose computed tomography (LDCT) +is crucial for minimizing health risks compared with normal-dose computed +tomography (NDCT). However, reducing the radiation dose compromises the +signal-to-noise ratio, leading to degraded quality of CT images. To address +this, we analyze the LDCT denoising task based on experimental results from the +frequency perspective, and then introduce a novel self-supervised CT image +denoising method called WIA-LD2ND, only using NDCT data. The proposed WIA-LD2ND +comprises two modules: Wavelet-based Image Alignment (WIA) and Frequency-Aware +Multi-scale Loss (FAM). First, WIA is introduced to align NDCT with LDCT by +mainly adding noise to the high-frequency components, which is the main +difference between LDCT and NDCT. Second, to better capture high-frequency +components and detailed information, Frequency-Aware Multi-scale Loss (FAM) is +proposed to effectively utilize the multi-scale feature space. Extensive +experiments on two public LDCT denoising datasets demonstrate that our +WIA-LD2ND, which uses only NDCT, outperforms several existing state-of-the-art +weakly-supervised and self-supervised methods. + +
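A toy PyWavelets sketch of the wavelet-based alignment idea: perturb only the detail (high-frequency) subbands of an NDCT image, since that is where LDCT and NDCT mainly differ. The Gaussian noise model and the single-level Haar transform are simplifying assumptions, not the paper's exact alignment procedure.

```python
import numpy as np
import pywt

def add_highfreq_noise(ndct: np.ndarray, sigma: float = 0.05, wavelet: str = "haar") -> np.ndarray:
    """Add Gaussian noise to the detail (high-frequency) subbands of a 2D image."""
    cA, (cH, cV, cD) = pywt.dwt2(ndct, wavelet)          # approximation + detail subbands
    rng = np.random.default_rng(0)
    cH, cV, cD = (c + rng.normal(0.0, sigma, c.shape) for c in (cH, cV, cD))
    return pywt.idwt2((cA, (cH, cV, cD)), wavelet)       # reconstruct the perturbed image

noisy = add_highfreq_noise(np.random.rand(128, 128))
print(noisy.shape)  # (128, 128)
```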
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ☆ Binary Noise for Binary Tasks: Masked Bernoulli Diffusion for + Unsupervised Anomaly Detection + + +
+ The high performance of denoising diffusion models for image generation has +paved the way for their application in unsupervised medical anomaly detection. +As diffusion-based methods require a lot of GPU memory and have long sampling +times, we present a novel and fast unsupervised anomaly detection approach +based on latent Bernoulli diffusion models. We first apply an autoencoder to +compress the input images into a binary latent representation. Next, a +diffusion model that follows a Bernoulli noise schedule is employed to this +latent space and trained to restore binary latent representations from +perturbed ones. The binary nature of this diffusion model allows us to identify +entries in the latent space that have a high probability of flipping their +binary code during the denoising process, which indicates out-of-distribution +data. We propose a masking algorithm based on these probabilities, which +improves the anomaly detection scores. We achieve state-of-the-art performance +compared to other diffusion-based unsupervised anomaly detection algorithms +while significantly reducing sampling time and memory consumption. The code is +available at https://github.com/JuliaWolleb/Anomaly_berdiff. + +
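A schematic of how per-entry flip probabilities from a Bernoulli denoiser could be turned into anomaly maps and scores, as described above. The denoiser interface, the channel aggregation, and the threshold are placeholders rather than the paper's exact masking algorithm.

```python
import torch

@torch.no_grad()
def bernoulli_anomaly_score(denoiser, z_binary: torch.Tensor, t: torch.Tensor, threshold: float = 0.5):
    """Score anomalies from the predicted probability that each latent bit flips.

    denoiser:  callable (z, t) -> logits of per-entry flip probability (placeholder)
    z_binary:  (B, C, H, W) binary latent codes from the autoencoder
    t:         (B,) diffusion timesteps
    """
    flip_prob = torch.sigmoid(denoiser(z_binary, t))     # (B, C, H, W) in [0, 1]
    anomaly_map = flip_prob.mean(dim=1)                  # aggregate over latent channels
    mask = anomaly_map > threshold                       # candidate out-of-distribution regions
    score = anomaly_map.flatten(1).mean(dim=1)           # one scalar score per sample
    return anomaly_map, mask, score

# Toy usage with a random placeholder network.
denoiser = lambda z, t: torch.randn_like(z)
amap, mask, score = bernoulli_anomaly_score(
    denoiser, torch.randint(0, 2, (2, 4, 16, 16)).float(), torch.zeros(2))
print(amap.shape, score.shape)  # torch.Size([2, 16, 16]) torch.Size([2])
```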
+
+
+
+
+ + ☆ Normalized Validity Scores for DNNs in Regression based Eye Feature + Extraction + + +
+ We propose an improvement to the landmark validity loss. Landmark detection +is widely used in head pose estimation, eyelid shape extraction, as well as +pupil and iris segmentation. There are numerous additional applications where +landmark detection is used to estimate the shape of complex objects. One part +of this process is the accurate and fine-grained detection of the shape. The +other part is the validity or inaccuracy per landmark, which can be used to +detect unreliable areas, where the shape possibly does not fit, and to improve +the accuracy of the entire shape extraction by excluding inaccurate landmarks. +We propose a normalization in the loss formulation, which improves the accuracy +of the entire approach due to the numerical balance of the normalized +inaccuracy. In addition, we propose a margin for the inaccuracy to reduce the +impact of gradients, which are produced by negligible errors close to the +ground truth. + +
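One plausible reading of the proposed normalization and margin, written as a PyTorch loss: the per-landmark error is normalized per sample, errors smaller than the margin are zeroed out so they contribute no gradient, and the validity head regresses what remains. The exact normalization and regression target in the paper may differ.

```python
import torch

def validity_loss(pred_lm: torch.Tensor, gt_lm: torch.Tensor, pred_validity: torch.Tensor,
                  margin: float = 0.02, eps: float = 1e-6) -> torch.Tensor:
    """Regress a per-landmark validity (inaccuracy) score toward the normalized landmark error.

    pred_lm, gt_lm: (B, K, 2) landmark coordinates
    pred_validity:  (B, K) predicted inaccuracy per landmark
    """
    err = (pred_lm - gt_lm).norm(dim=-1)                         # (B, K) raw error
    # normalize per sample so large shapes do not dominate the loss numerically
    err = err / (err.mean(dim=1, keepdim=True) + eps)
    # margin: ignore negligible errors close to the ground truth
    err = torch.clamp(err - margin, min=0.0)
    return torch.nn.functional.smooth_l1_loss(pred_validity, err)

loss = validity_loss(torch.rand(4, 17, 2), torch.rand(4, 17, 2), torch.rand(4, 17))
print(float(loss))
```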
+
+
+
+
+ + ☆ LocalStyleFool: Regional Video Style Transfer Attack Using Segment + Anything Model SP + + +
+ Previous work has shown that well-crafted adversarial perturbations can +threaten the security of video recognition systems. Attackers can invade such +models with a low query budget when the perturbations are semantic-invariant, +such as StyleFool. Despite the query efficiency, the naturalness of the minutia +areas still requires amelioration, since StyleFool applies style transfer to +all pixels in each frame. To close the gap, we propose LocalStyleFool, an +improved black-box video adversarial attack that superimposes regional +style-transfer-based perturbations on videos. Benefiting from the popularity +and scalable usability of the Segment Anything Model (SAM), we first extract +different regions according to semantic information and then track them through +the video stream to maintain the temporal consistency. Then, we add +style-transfer-based perturbations to several regions selected based on the +associative criterion of transfer-based gradient information and regional area. +Fine adjustment of the perturbations then makes the stylized videos adversarial. +We demonstrate that LocalStyleFool can improve both intra-frame and inter-frame +naturalness through a human-assessed survey, while maintaining competitive +fooling rate and query efficiency. Successful experiments on the +high-resolution dataset also showcase that scrupulous segmentation of SAM helps +to improve the scalability of adversarial attacks under high-resolution data. + +
+
+ comment: Accepted to 2024 IEEE Security and Privacy Workshops (SPW) +
+
+
+
+
+ + ☆ Prioritized Semantic Learning for Zero-shot Instance Navigation + + +
+ We study zero-shot instance navigation, in which the agent navigates to a +specific object without using object annotations for training. Previous object +navigation approaches apply the image-goal navigation (ImageNav) task (go to +the location of an image) for pretraining, and transfer the agent to achieve +object goals using a vision-language model. However, these approaches lead to +issues of semantic neglect, where the model fails to learn meaningful semantic +alignments. In this paper, we propose a Prioritized Semantic Learning (PSL) +method to improve the semantic understanding ability of navigation agents. +Specifically, a semantic-enhanced PSL agent is proposed and a prioritized +semantic training strategy is introduced to select goal images that exhibit +clear semantic supervision and relax the reward function from strict exact view +matching. At inference time, a semantic expansion inference scheme is designed +to preserve the same granularity level of the goal-semantic as training. +Furthermore, for the popular HM3D environment, we present an Instance +Navigation (InstanceNav) task that requires going to a specific object instance +with detailed descriptions, as opposed to the Object Navigation (ObjectNav) +task where the goal is defined merely by the object category. Our PSL agent +outperforms the previous state-of-the-art by 66% on zero-shot ObjectNav in +terms of success rate and is also superior on the new InstanceNav task. Code +will be released at https://anonymous.4open.science/r/PSL/. + +
+
+
+
+
+ + ☆ Gridless 2D Recovery of Lines using the Sliding Frank-Wolfe Algorithm + + +
+ We present a new approach leveraging the Sliding Frank--Wolfe algorithm to +address the challenge of line recovery in degraded images. Building upon +advances in conditional gradient methods for sparse inverse problems with +differentiable measurement models, we propose two distinct models tailored for +line detection tasks within the realm of blurred line deconvolution and ridge +detection of linear chirps in spectrogram images. + +
+
+
+
+
+ + ☆ MedMerge: Merging Models for Effective Transfer Learning to Medical + Imaging Tasks + + +
+ Transfer learning has become a powerful tool to initialize deep learning +models to achieve faster convergence and higher performance. This is especially +useful in the medical imaging analysis domain, where data scarcity limits +possible performance gains for deep learning models. Some advancements have +been made in boosting the transfer learning performance gain by merging models +starting from the same initialization. However, in the medical imaging analysis +domain, there is an opportunity in merging models starting from different +initializations, thus combining the features learned from different tasks. In +this work, we propose MedMerge, a method whereby the weights of different +models can be merged, and their features can be effectively utilized to boost +performance on a new task. With MedMerge, we learn kernel-level weights that +can later be used to merge the models into a single model, even when starting +from different initializations. Testing on various medical imaging analysis +tasks, we show that our merged model can achieve significant performance gains, +with up to 3% improvement on the F1 score. The code implementation of this work +will be available at www.github.com/BioMedIA-MBZUAI/MedMerge. + +
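A minimal sketch of kernel-level model merging under the stated idea: interpolate two checkpoints tensor-by-tensor with learnable mixing weights, here one weight per output kernel. The sigmoid parameterization and the way the weights would be trained are assumptions, not the exact MedMerge formulation.

```python
import torch

def merge_state_dicts(sd_a: dict, sd_b: dict, alphas: dict) -> dict:
    """Interpolate two state dicts tensor-by-tensor with learned weights in [0, 1]."""
    merged = {}
    for name, wa in sd_a.items():
        a = torch.sigmoid(alphas[name])          # per-kernel mixing weight
        merged[name] = a * wa + (1.0 - a) * sd_b[name]
    return merged

# Toy usage with two tiny "checkpoints" sharing one conv layer.
sd_a = {"conv.weight": torch.randn(8, 3, 3, 3)}
sd_b = {"conv.weight": torch.randn(8, 3, 3, 3)}
alphas = {"conv.weight": torch.zeros(8, 1, 1, 1, requires_grad=True)}  # one weight per output kernel
merged = merge_state_dicts(sd_a, sd_b, alphas)
print(merged["conv.weight"].shape)  # torch.Size([8, 3, 3, 3])
```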
+
+
+
+
+ + ☆ Diffusion-Based Environment-Aware Trajectory Prediction + + +
+ The ability to predict the future trajectories of traffic participants is +crucial for the safe and efficient operation of autonomous vehicles. In this +paper, a diffusion-based generative model for multi-agent trajectory prediction +is proposed. The model is capable of capturing the complex interactions between +traffic participants and the environment, accurately learning the multimodal +nature of the data. The effectiveness of the approach is assessed on +large-scale datasets of real-world traffic scenarios, showing that our model +outperforms several well-established methods in terms of prediction accuracy. +By the incorporation of differential motion constraints on the model output, we +illustrate that our model is capable of generating a diverse set of realistic +future trajectories. Through the use of an interaction-aware guidance signal, +we further demonstrate that the model can be adapted to predict the behavior of +less cooperative agents, emphasizing its practical applicability under +uncertain traffic conditions. + +
+
+
+
+
+ + ☆ Arc2Face: A Foundation Model of Human Faces + + +
+ This paper presents Arc2Face, an identity-conditioned face foundation model, +which, given the ArcFace embedding of a person, can generate diverse +photo-realistic images with a degree of face similarity unmatched by +existing models. Despite previous attempts to decode face recognition features +into detailed images, we find that common high-resolution datasets (e.g. FFHQ) +lack sufficient identities to reconstruct any subject. To that end, we +meticulously upsample a significant portion of the WebFace42M database, the +largest public dataset for face recognition (FR). Arc2Face builds upon a +pretrained Stable Diffusion model, yet adapts it to the task of ID-to-face +generation, conditioned solely on ID vectors. Deviating from recent works that +combine ID with text embeddings for zero-shot personalization of text-to-image +models, we emphasize the compactness of FR features, which can fully capture +the essence of the human face, as opposed to hand-crafted prompts. Crucially, +text-augmented models struggle to decouple identity and text, usually +necessitating some description of the given face to achieve satisfactory +similarity. Arc2Face, however, only needs the discriminative features of +ArcFace to guide the generation, offering a robust prior for a plethora of +tasks where ID consistency is of paramount importance. As an example, we train +an FR model on synthetic images from our model and achieve superior performance +to existing synthetic datasets. + +
+
+ comment: 29 pages, 20 figures. Project page: https://arc2face.github.io/ +
+
+
+
+
+ + ☆ An Accurate and Real-time Relative Pose Estimation from Triple + Point-line Images by Decoupling Rotation and Translation + + +
+ Line features are valid complements for point features in man-made +environments. 3D-2D constraints provided by line features have been widely used +in Visual Odometry (VO) and Structure-from-Motion (SfM) systems. However, how +to accurately solve three-view relative motion only with 2D observations of +points and lines in real time has not been fully explored. In this paper, we +propose a novel three-view pose solver based on rotation-translation decoupled +estimation. First, a high-precision rotation estimation method based on normal +vector coplanarity constraints that consider the uncertainty of observations is +proposed, which can be solved by Levenberg-Marquardt (LM) algorithm +efficiently. Second, a robust linear translation constraint that minimizes the +degree of the rotation components and feature observation components in +equations is elaborately designed for estimating translations accurately. +Experiments on synthetic data and real-world data show that the proposed +approach improves both rotation and translation accuracy compared to the +classical trifocal-tensor-based method and the state-of-the-art two-view +algorithm in outdoor and indoor environments. + +
+
+
+
+
+ + ☆ Personalized 3D Human Pose and Shape Refinement ICCV + + +
+ Recently, regression-based methods have dominated the field of 3D human pose +and shape estimation. Despite their promising results, a common issue is the +misalignment between predictions and image observations, often caused by minor +joint rotation errors that accumulate along the kinematic chain. To address +this issue, we propose to construct dense correspondences between initial human +model estimates and the corresponding images that can be used to refine the +initial predictions. To this end, we utilize renderings of the 3D models to +predict per-pixel 2D displacements between the synthetic renderings and the RGB +images. This allows us to effectively integrate and exploit appearance +information of the persons. Our per-pixel displacements can be efficiently +transformed to per-visible-vertex displacements and then used for 3D model +refinement by minimizing a reprojection loss. To demonstrate the effectiveness +of our approach, we refine the initial 3D human mesh predictions of multiple +models using different refinement procedures on 3DPW and RICH. We show that our +approach not only consistently leads to better image-model alignment, but also +to improved 3D accuracy. + +
+
+ comment: Accepted to 2023 IEEE/CVF International Conference on Computer Vision + Workshops (ICCVW) +
+
+
+
+
+ + ☆ Compositional Kronecker Context Optimization for Vision-Language Models + + +
+ Context Optimization (CoOp) has emerged as a simple yet effective technique +for adapting CLIP-like vision-language models to downstream image recognition +tasks. Nevertheless, learning compact context with satisfactory base-to-new, +domain and cross-task generalization ability while adapting to new tasks is +still a challenge. To tackle such a challenge, we propose a lightweight yet +generalizable approach termed Compositional Kronecker Context Optimization +(CK-CoOp). Technically, the prompt's context words in CK-CoOp are learnable +vectors, which are crafted by linearly combining base vectors sourced from a +dictionary. These base vectors consist of a non-learnable component obtained by +quantizing the weights in the token embedding layer, and a learnable component +constructed by applying the Kronecker product to several small learnable matrices. +Intuitively, the compositional structure mitigates the risk of overfitting on +training data by remembering more pre-trained knowledge. Meanwhile, the +Kronecker product breaks the non-learnable restrictions of the dictionary, +thereby enhancing representation ability with minimal additional parameters. +Extensive experiments confirm that CK-CoOp achieves state-of-the-art +performance under base-to-new, domain and cross-task generalization evaluation, +while also requiring fewer learnable parameters and offering efficient training +and inference speed. + +
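A compact sketch of the compositional dictionary described above: context tokens are linear combinations of base vectors, each base vector being a frozen (quantized-embedding) part plus a learnable part built with a Kronecker product. Here a Kronecker product of two small vectors stands in for the paper's tiny matrices, and all dimensions and initializations are illustrative; CK-CoOp's exact construction may differ.

```python
import torch
import torch.nn as nn

class KroneckerContext(nn.Module):
    """Context vectors = coeffs @ (frozen base + Kronecker-built learnable base)."""

    def __init__(self, n_ctx: int = 4, n_base: int = 16, d_a: int = 32, d_b: int = 16):
        super().__init__()
        dim = d_a * d_b                                   # token embedding dim (e.g. 512)
        # frozen component, e.g. obtained by quantizing token-embedding weights
        self.register_buffer("base_frozen", torch.randn(n_base, dim))
        # tiny learnable factors, expanded to full size via Kronecker product
        self.factor_a = nn.Parameter(torch.randn(n_base, d_a) * 0.02)
        self.factor_b = nn.Parameter(torch.randn(n_base, d_b) * 0.02)
        self.coeffs = nn.Parameter(torch.randn(n_ctx, n_base) * 0.02)

    def forward(self) -> torch.Tensor:
        learnable = torch.stack(
            [torch.kron(a, b) for a, b in zip(self.factor_a, self.factor_b)]
        )                                                 # (n_base, d_a * d_b)
        base = self.base_frozen + learnable
        return self.coeffs @ base                         # (n_ctx, dim) context tokens

ctx = KroneckerContext()()
print(ctx.shape)  # torch.Size([4, 512])
```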
+
+
+
+
+ + ☆ LoRA-Composer: Leveraging Low-Rank Adaptation for Multi-Concept + Customization in Training-Free Diffusion Models + + +
+ Customization generation techniques have significantly advanced the synthesis +of specific concepts across varied contexts. Multi-concept customization +emerges as a particularly challenging task within this domain. Existing approaches often +rely on training a fusion matrix over multiple Low-Rank Adaptations (LoRAs) +to merge various concepts into a single image. However, we identify that this +straightforward method faces two major challenges: 1) concept confusion, which +occurs when the model cannot preserve distinct individual characteristics, and +2) concept vanishing, where the model fails to generate the intended subjects. +To address these issues, we introduce LoRA-Composer, a training-free framework +designed for seamlessly integrating multiple LoRAs, thereby enhancing the +harmony among different concepts within generated images. LoRA-Composer +addresses concept vanishing through Concept Injection Constraints, enhancing +concept visibility via an expanded cross-attention mechanism. To combat concept +confusion, Concept Isolation Constraints are introduced, refining the +self-attention computation. Furthermore, Latent Re-initialization is proposed +to effectively stimulate concept-specific latents within designated regions. Our +extensive testing showcases a notable enhancement in LoRA-Composer's +performance compared to standard baselines, especially when eliminating +image-based conditions like Canny edges or pose estimates. Code is released at +https://github.com/Young98CN/LoRA_Composer. + +
+
+
+
+
+ + ☆ QEAN: Quaternion-Enhanced Attention Network for Visual Dance Generation + + +
+ The study of music-generated dance is a novel and challenging image +generation task. Given a piece of music and seed motions, the aim is to +generate natural dance movements for the subsequent music. Transformer-based +methods face challenges in time series prediction tasks related to human +movements and music due to their difficulty in capturing nonlinear +relationships and temporal aspects. This can lead to issues like joint +deformation, role deviation, floating, and inconsistencies in dance movements +generated in response to the music. In this paper, we propose a +Quaternion-Enhanced Attention Network (QEAN) for visual dance synthesis from a +quaternion perspective, which consists of a Spin Position Embedding (SPE) +module and a Quaternion Rotary Attention (QRA) module. First, SPE embeds +position information into self-attention in a rotational manner, leading to +better learning of features of movement sequences and audio sequences, and +improved understanding of the connection between music and dance. Second, QRA +represents and fuses 3D motion features and audio features in the form of a +series of quaternions, enabling the model to better learn the temporal +coordination of music and dance under the complex temporal cycle conditions of +dance generation. Finally, we conducted experiments on the dataset AIST++, and +the results show that our approach achieves better and more robust performance +in generating accurate, high-quality dance movements. Our source code and +dataset are available at https://github.com/MarasyZZ/QEAN and +https://google.github.io/aistplusplus_dataset respectively. + +
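For reference, the standard rotary position embedding (RoPE) shows what "embedding position into self-attention in a rotational manner" looks like in code. SPE's quaternion formulation differs, so treat this purely as background, not as the paper's module.

```python
import torch

def rotary_embed(x: torch.Tensor, base: float = 10000.0) -> torch.Tensor:
    """Apply rotary position embedding to (B, T, D) features, with D even."""
    _, T, D = x.shape
    half = D // 2
    freqs = base ** (-torch.arange(0, half, dtype=torch.float32) / half)   # (half,)
    angles = torch.arange(T, dtype=torch.float32)[:, None] * freqs[None]   # (T, half)
    cos, sin = angles.cos(), angles.sin()
    x1, x2 = x[..., :half], x[..., half:]
    # rotate each (x1, x2) pair by a position-dependent angle
    return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)

q = rotary_embed(torch.randn(2, 10, 64))
print(q.shape)  # torch.Size([2, 10, 64])
```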
+
+ comment: Accepted by The Visual Computer Journal +
+
+
+
+
+ + ☆ GaussNav: Gaussian Splatting for Visual Navigation + + +
+ In embodied vision, Instance ImageGoal Navigation (IIN) requires an agent to +locate a specific object depicted in a goal image within an unexplored +environment. The primary difficulty of IIN stems from the necessity of +recognizing the target object across varying viewpoints and rejecting potential +distractors. + Existing map-based navigation methods largely adopt the representation form +of Bird's Eye View (BEV) maps, which, however, lack the representation of +detailed textures in a scene. + To address the above issues, we propose a new Gaussian Splatting Navigation +(abbreviated as GaussNav) framework for IIN task, which constructs a novel map +representation based on 3D Gaussian Splatting (3DGS). + The proposed framework enables the agent to not only memorize the geometry +and semantic information of the scene, but also retain the textural features of +objects. + Our GaussNav framework demonstrates a significant leap in performance, +evidenced by an increase in Success weighted by Path Length (SPL) from 0.252 to +0.578 on the challenging Habitat-Matterport 3D (HM3D) dataset. + Our code will be made publicly available. + +
+
+ comment: conference +
+
+
+
+
+ + ☆ Multi-View Video-Based Learning: Leveraging Weak Labels for Frame-Level + Perception + + +
+ For training a video-based action recognition model that accepts multi-view +video, annotating frame-level labels is tedious and difficult. However, it is +relatively easy to annotate sequence-level labels. Such coarse +annotations are called weak labels. Nevertheless, training a multi-view +video-based action recognition model with weak labels for frame-level +perception is challenging. In this paper, we propose a novel learning +framework, where the weak labels are first used to train a multi-view +video-based base model, which is subsequently used for downstream frame-level +perception tasks. The base model is trained to obtain individual latent +embeddings for each view in the multi-view input. For training the model using +the weak labels, we propose a novel latent loss function. We also propose a +model that uses the view-specific latent embeddings for downstream frame-level +action recognition and detection tasks. The proposed framework is evaluated +using the MM Office dataset by comparing against several baseline algorithms. The +results show that the proposed base model is effectively trained using weak +labels and the latent embeddings help the downstream models improve accuracy. + +
+
+
+
+
+ + ☆ CRS-Diff: Controllable Generative Remote Sensing Foundation Model + + +
+ The emergence of diffusion models has revolutionized the field of image +generation, providing new methods for creating high-quality, high-resolution +images across various applications. However, the potential of these models for +generating domain-specific images, particularly remote sensing (RS) images, +remains largely untapped. RS images, which are notable for their high resolution, +extensive coverage, and rich information content, bring new challenges that +general diffusion models may not adequately address. This paper proposes +CRS-Diff, a pioneering diffusion modeling framework specifically tailored for +generating remote sensing imagery, leveraging the inherent advantages of +diffusion models while integrating advanced control mechanisms to ensure that +the imagery is not only visually clear but also enriched with geographic and +temporal information. The model integrates global and local control inputs, +enabling precise combinations of generation conditions to refine the generation +process. A comprehensive evaluation of CRS-Diff has demonstrated its superior +capability to generate RS imagery under both single and multiple +conditions compared with previous methods in terms of image quality and +diversity. + +
+
+
+
+
+ + ☆ End-to-end multi-modal product matching in fashion e-commerce KDD + + +
+ Product matching, the task of identifying different representations of the +same product for better discoverability, curation, and pricing, is a key +capability for online marketplace and e-commerce companies. We present a robust +multi-modal product matching system in an industry setting, where large +datasets, data distribution shifts and unseen domains pose challenges. We +compare different approaches and conclude that a relatively straightforward +projection of pretrained image and text encoders, trained through contrastive +learning, yields state-of-the-art results, while balancing cost and +performance. Our solution outperforms single-modality matching systems and +large pretrained models, such as CLIP. Furthermore, we show how a +human-in-the-loop process can be combined with model-based predictions to +achieve near-perfect precision in a production system. + +
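A minimal sketch of the recipe the abstract describes: small projection heads on top of frozen image and text encoders, trained with a symmetric InfoNCE (CLIP-style) contrastive loss over matched product pairs. Embedding dimensions, head architecture, and temperature are assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ProjectionHead(nn.Module):
    """Small trainable head on top of a frozen encoder's output embedding."""
    def __init__(self, in_dim: int, out_dim: int = 256):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(in_dim, out_dim), nn.ReLU(), nn.Linear(out_dim, out_dim))

    def forward(self, x):
        return F.normalize(self.net(x), dim=-1)

def contrastive_loss(img_emb, txt_emb, temperature: float = 0.07):
    """Symmetric InfoNCE over a batch of matched (image, text) product pairs."""
    logits = img_emb @ txt_emb.t() / temperature
    targets = torch.arange(img_emb.size(0), device=img_emb.device)
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))

# Toy usage with frozen-encoder outputs of different dimensionality.
img_head, txt_head = ProjectionHead(768), ProjectionHead(512)
loss = contrastive_loss(img_head(torch.randn(8, 768)), txt_head(torch.randn(8, 512)))
loss.backward()
print(float(loss))
```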
+
+ comment: 9 pages, submitted to SIGKDD +
+
+
+
+
+ + ☆ HSEmotion Team at the 6th ABAW Competition: Facial Expressions, + Valence-Arousal and Emotion Intensity Prediction + + +
+ This article presents our results for the sixth Affective Behavior Analysis +in-the-wild (ABAW) competition. To improve the trustworthiness of facial +analysis, we study the possibility of using pre-trained deep models that +extract reliable emotional features without the need to fine-tune the neural +networks for a downstream task. In particular, we introduce several lightweight +models based on MobileViT, MobileFaceNet, EfficientNet, and DDAMFN +architectures trained in multi-task scenarios to recognize facial expressions, +valence, and arousal on static photos. These neural networks extract +frame-level features fed into a simple classifier, e.g., linear feed-forward +neural network, to predict emotion intensity, compound expressions, action +units, facial expressions, and valence/arousal. Experimental results for five +tasks from the sixth ABAW challenge demonstrate that our approach lets us +significantly improve quality metrics on validation sets compared to existing +non-ensemble techniques. + +
+
+ comment: 10 pages, 1 figure, 8 tables +
+
+
+
+
+ + ☆ UV Gaussians: Joint Learning of Mesh Deformation and Gaussian Textures + for Human Avatar Modeling + + +
+ Reconstructing photo-realistic drivable human avatars from multi-view image +sequences has been a popular and challenging topic in the field of computer +vision and graphics. While existing NeRF-based methods can achieve high-quality +novel view rendering of human models, both training and inference processes are +time-consuming. Recent approaches have utilized 3D Gaussians to represent the +human body, enabling faster training and rendering. However, they undermine the +importance of the mesh guidance and directly predict Gaussians in 3D space with +coarse mesh guidance. This hinders the learning procedure of the Gaussians and +tends to produce blurry textures. Therefore, we propose UV Gaussians, which +models the 3D human body by jointly learning mesh deformations and 2D UV-space +Gaussian textures. We utilize the embedding of UV map to learn Gaussian +textures in 2D space, leveraging the capabilities of powerful 2D networks to +extract features. Additionally, through an independent Mesh network, we +optimize pose-dependent geometric deformations, thereby guiding Gaussian +rendering and significantly enhancing rendering quality. We collect and process +a new dataset of human motion, which includes multi-view images, scanned +models, parametric model registration, and corresponding texture maps. +Experimental results demonstrate that our method achieves state-of-the-art +synthesis of novel view and novel pose. The code and data will be made +available on the homepage https://alex-jyj.github.io/UV-Gaussians/ once the +paper is accepted. + +
+
+
+
+
+ + ☆ DynoSurf: Neural Deformation-based Temporally Consistent Dynamic Surface + Reconstruction + + +
+ This paper explores the problem of reconstructing temporally consistent +surfaces from a 3D point cloud sequence without correspondence. To address this +challenging task, we propose DynoSurf, an unsupervised learning framework +integrating a template surface representation with a learnable deformation +field. Specifically, we design a coarse-to-fine strategy for learning the +template surface based on the deformable tetrahedron representation. +Furthermore, we propose a learnable deformation representation based on the +learnable control points and blending weights, which can deform the template +surface non-rigidly while maintaining the consistency of the local shape. +Experimental results demonstrate the significant superiority of DynoSurf over +current state-of-the-art approaches, showcasing its potential as a powerful +tool for dynamic mesh reconstruction. The code is publicly available at +https://github.com/yaoyx689/DynoSurf. + +
+
+
+
+
+ + ☆ OurDB: Ouroboric Domain Bridging for Multi-Target Domain Adaptive + Semantic Segmentation + + +
+ Multi-target domain adaptation (MTDA) for semantic segmentation poses a +significant challenge, as it involves multiple target domains with varying +distributions. The goal of MTDA is to minimize the domain discrepancies among a +single source and multi-target domains, aiming to train a single model that +excels across all target domains. Previous MTDA approaches typically employ +multiple teacher architectures, where each teacher specializes in one target +domain to simplify the task. However, these architectures hinder the student +model from fully assimilating comprehensive knowledge from all target-specific +teachers and escalate training costs with increasing target domains. In this +paper, we propose an ouroboric domain bridging (OurDB) framework, offering an +efficient solution to the MTDA problem using a single teacher architecture. +This framework dynamically cycles through multiple target domains, aligning +each domain individually to restrain the biased alignment problem, and utilizes +Fisher information to minimize the forgetting of knowledge from previous target +domains. We also propose a context-guided class-wise mixup (CGMix) that +leverages contextual information tailored to diverse target contexts in MTDA. +Experimental evaluations conducted on four urban driving datasets (i.e., GTA5, +Cityscapes, IDD, and Mapillary) demonstrate the superiority of our method over +existing state-of-the-art approaches. + +
+
+
+
+
+ + ☆ 3DGS-Calib: 3D Gaussian Splatting for Multimodal SpatioTemporal + Calibration + + +
+ Reliable multimodal sensor fusion algorithms require accurate +spatiotemporal calibration. Recently, targetless calibration techniques based +on implicit neural representations have proven to provide precise and robust +results. Nevertheless, such methods are inherently slow to train given the high +computational overhead caused by the large number of sampled points required +for volume rendering. With the recent introduction of 3D Gaussian Splatting as +a faster alternative to implicit representation methods, we propose to leverage +this new rendering approach to achieve faster multi-sensor calibration. We +introduce 3DGS-Calib, a new calibration method that relies on the speed and +rendering accuracy of 3D Gaussian Splatting to achieve multimodal +spatiotemporal calibration that is accurate, robust, and with a substantial +speed-up compared to methods relying on implicit neural representations. We +demonstrate the superiority of our proposal with experimental results on +sequences from KITTI-360, a widely used driving dataset. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ MISS: Memory-efficient Instance Segmentation Framework By Visual + Inductive Priors Flow Propagation + + +
+ Instance segmentation, a cornerstone task in computer vision, has +wide-ranging applications in diverse industries. The advent of deep learning +and artificial intelligence has underscored the criticality of training +effective models, particularly in data-scarce scenarios - a concern that +resonates in both academic and industrial circles. A significant impediment in +this domain is the resource-intensive nature of procuring high-quality, +annotated data for instance segmentation, a hurdle that amplifies the challenge +of developing robust models under resource constraints. In this context, the +strategic integration of a visual prior into the training dataset emerges as a +potential solution to enhance congruity with the testing data distribution, +consequently reducing the dependency on computational resources and the need +for highly complex models. However, effectively embedding a visual prior into +the learning process remains a complex endeavor. Addressing this challenge, we +introduce the MISS (Memory-efficient Instance Segmentation System) framework. +MISS leverages visual inductive prior flow propagation, integrating intrinsic +prior knowledge from the Synergy-basketball dataset at various stages: data +preprocessing, augmentation, training, and inference. Our empirical evaluations +underscore the efficacy of MISS, demonstrating commendable performance in +scenarios characterized by limited data availability and memory constraints. + +
+
+
+
+
+ + ☆ Just Add $100 More: Augmenting NeRF-based Pseudo-LiDAR Point Cloud for + Resolving Class-imbalance Problem ECCV 2024 + + +
+ Typical LiDAR-based 3D object detection models are trained in a supervised +manner with real-world data collection, which is often imbalanced over classes +(or long-tailed). To deal with it, augmenting minority-class examples by +sampling ground truth (GT) LiDAR points from a database and pasting them into a +scene of interest is often used, but challenges still remain: inflexibility in +locating GT samples and limited sample diversity. In this work, we propose to +leverage pseudo-LiDAR point clouds generated (at a low cost) from videos +capturing a surround view of miniatures or real-world objects of minor classes. +Our method, called Pseudo Ground Truth Augmentation (PGT-Aug), consists of +three main steps: (i) volumetric 3D instance reconstruction using a 2D-to-3D +view synthesis model, (ii) object-level domain alignment with LiDAR intensity +estimation and (iii) a hybrid context-aware placement method from ground and +map information. We demonstrate the superiority and generality of our method +through performance improvements in extensive experiments conducted on three +popular benchmarks, i.e., nuScenes, KITTI, and Lyft, especially for the +datasets with large domain gaps captured by different LiDAR configurations. Our +code and data will be publicly available upon publication. + +
+
+ comment: 28 pages, 12 figures, 11 tables; Submitted to ECCV 2024 +
+
+
+
+
+ + ☆ Augment Before Copy-Paste: Data and Memory Efficiency-Oriented Instance + Segmentation Framework for Sport-scenes + + +
+ Instance segmentation is a fundamental task in computer vision with broad +applications across various industries. In recent years, with the proliferation +of deep learning and artificial intelligence applications, how to train +effective models with limited data has become a pressing issue for both +academia and industry. In the Visual Inductive Priors challenge (VIPriors2023), +participants must train a model capable of precisely locating individuals on a +basketball court, all while working with limited data and without the use of +transfer learning or pre-trained models. We propose Memory effIciency inStance +Segmentation framework based on visual inductive prior flow propagation that +effectively incorporates inherent prior information from the dataset into both +the data preprocessing and data augmentation stages, as well as the inference +phase. Our team (ACVLAB) experiments demonstrate that our model achieves +promising performance (0.509 AP@0.50:0.95) even under limited data and memory +constraints. + +
+
+
+
+
+ + ☆ LogicalDefender: Discovering, Extracting, and Utilizing Common-Sense + Knowledge + + +
+ Large text-to-image models have achieved astonishing performance in +synthesizing diverse and high-quality images guided by texts. With +detail-oriented conditioning control, even finer-grained spatial control can be +achieved. However, some generated images still appear unreasonable, even with +plentiful object features and a harmonious style. In this paper, we delve into +the underlying causes and find that deep-level logical information, serving as +common-sense knowledge, plays a significant role in understanding and +processing images. Nonetheless, almost all models have neglected the importance +of logical relations in images, resulting in poor performance in this aspect. +Following this observation, we propose LogicalDefender, which combines images +with the logical knowledge already summarized by humans in text. This +encourages models to learn logical knowledge faster and better, and +concurrently, extracts the widely applicable logical knowledge from both images +and human knowledge. Experiments show that our model has achieved better +logical performance, and the extracted logical knowledge can be effectively +applied to other scenarios. + +
+
+
+
+
+ + ☆ EffiVED: Efficient Video Editing via Text-instruction Diffusion Models
+ Large-scale text-to-video models have shown remarkable abilities, but their +direct application in video editing remains challenging due to limited +available datasets. Current video editing methods commonly require per-video +fine-tuning of diffusion models or specific inversion optimization to ensure +high-fidelity edits. In this paper, we introduce EffiVED, an efficient +diffusion-based model that directly supports instruction-guided video editing. +To achieve this, we present two efficient workflows to gather video editing +pairs, utilizing augmentation and fundamental vision-language techniques. These +workflows transform vast image editing datasets and open-world videos into a +high-quality dataset for training EffiVED. Experimental results reveal that +EffiVED not only generates high-quality editing videos but also executes +rapidly. Finally, we demonstrate that our data collection method significantly +improves editing performance and can potentially tackle the scarcity of video +editing data. The datasets will be made publicly available upon publication. + +
+
+
+
+
+ + ☆ Learning Unified Reference Representation for Unsupervised Multi-class + Anomaly Detection + + +
+ In the field of multi-class anomaly detection, reconstruction-based methods derived from single-class anomaly detection face the well-known challenge of ``learning shortcuts'', wherein the model fails to learn the patterns of normal samples as it should, opting instead for shortcuts such as identity mapping or artificial noise elimination. Consequently, the model becomes unable to reconstruct genuine anomalies as normal instances, resulting in a failure of anomaly detection. To counter this issue, we present a novel unified feature reconstruction-based anomaly detection framework termed RLR (Reconstruct features from a Learnable Reference representation). Unlike previous methods, RLR utilizes learnable reference representations to compel the model to learn normal feature patterns explicitly, thereby preventing the model from succumbing to the ``learning shortcuts'' issue. Additionally, RLR incorporates locality constraints into the learnable reference to facilitate more effective normal pattern capture and utilizes a masked learnable key attention mechanism to enhance robustness. Evaluation of RLR on the 15-category MVTec-AD dataset and the 12-category VisA dataset shows superior performance compared to state-of-the-art methods under the unified setting. The code of RLR will be publicly available.
+
+
+
+
+ + ☆ Hierarchical Frequency-based Upsampling and Refining for Compressed + Video Quality Enhancement + + +
+ Video compression artifacts arise due to the quantization operation in the frequency domain. The goal of video quality enhancement is to reduce compression artifacts and reconstruct a visually pleasing result. In this work, we propose a hierarchical frequency-based upsampling and refining neural network (HFUR) for compressed video quality enhancement. HFUR consists of two modules: an implicit frequency upsampling module (ImpFreqUp) and a hierarchical and iterative refinement module (HIR). ImpFreqUp exploits a DCT-domain prior derived through an implicit DCT transform, and accurately reconstructs the DCT-domain loss via a coarse-to-fine transfer. HIR is then introduced to facilitate cross-collaboration and information compensation between the scales, thus further refining the feature maps and promoting the visual quality of the final output. We demonstrate the effectiveness of the proposed modules via ablation experiments and visualized results. Extensive experiments on public benchmarks show that HFUR achieves state-of-the-art performance for both constant bit rate and constant QP modes.
+
+
+
+
+ + ☆ TARN-VIST: Topic Aware Reinforcement Network for Visual Storytelling + + +
+ As a cross-modal task, visual storytelling aims to automatically generate a story for an ordered image sequence. Different from the image captioning task, visual storytelling requires not only modeling the relationships between objects in the image but also mining the connections between adjacent images. Recent approaches primarily utilize either end-to-end frameworks or multi-stage frameworks to generate relevant stories, but they usually overlook latent topic information. In this paper, in order to generate a more coherent and relevant story, we propose a novel method, Topic Aware Reinforcement Network for VIsual StoryTelling (TARN-VIST). In particular, we pre-extract the topic information of stories from both visual and linguistic perspectives. Then we apply two topic-consistent reinforcement learning rewards to identify the discrepancy between the generated story and the human-labeled story so as to refine the whole generation process. Extensive experimental results on the VIST dataset and human evaluation demonstrate that our proposed model outperforms most of the competitive models across multiple evaluation metrics.
+
+
+
+
+ + ☆ Boosting Continual Learning of Vision-Language Models via + Mixture-of-Experts Adapters CVPR2024 + + +
+ Continual learning can empower vision-language models to continuously acquire new knowledge, without the need for access to the entire historical dataset. However, mitigating the performance degradation in large-scale models is non-trivial due to (i) parameter shifts throughout lifelong learning and (ii) significant computational burdens associated with full-model tuning. In this work, we present a parameter-efficient continual learning framework to alleviate long-term forgetting in incremental learning with vision-language models. Our approach involves the dynamic expansion of a pre-trained CLIP model through the integration of Mixture-of-Experts (MoE) adapters in response to new tasks. To preserve the zero-shot recognition capability of vision-language models, we further introduce a Distribution Discriminative Auto-Selector (DDAS) that automatically routes in-distribution and out-of-distribution inputs to the MoE adapters and the original CLIP, respectively. Through extensive experiments across various settings, our proposed method consistently outperforms previous state-of-the-art approaches while concurrently reducing parameter training burdens by 60%. Our code is available at https://github.com/JiazuoYu/MoE-Adapters4CL
+
+ comment: This work is accepted by CVPR2024. More modifications may be + performed +
+
+
+
+
+ + ☆ Hierarchical Spatial Proximity Reasoning for Vision-and-Language + Navigation + + +
+ Most Vision-and-Language Navigation (VLN) algorithms tend to make decision +errors, primarily due to a lack of visual common sense and insufficient +reasoning capabilities. To address this issue, this paper proposes a +Hierarchical Spatial Proximity Reasoning (HSPR) model. Firstly, we design a +Scene Understanding Auxiliary Task (SUAT) to assist the agent in constructing a +knowledge base of hierarchical spatial proximity for reasoning navigation. +Specifically, this task utilizes panoramic views and object features to +identify regions in the navigation environment and uncover the adjacency +relationships between regions, objects, and region-object pairs. Secondly, we +dynamically construct a semantic topological map through agent-environment +interactions and propose a Multi-step Reasoning Navigation Algorithm (MRNA) +based on the map. This algorithm continuously plans various feasible paths from +one region to another, utilizing the constructed proximity knowledge base, +enabling more efficient exploration. Additionally, we introduce a Proximity +Adaptive Attention Module (PAAM) and Residual Fusion Method (RFM) to enable the +model to obtain more accurate navigation decision confidence. Finally, we +conduct experiments on publicly available datasets including REVERIE, SOON, +R2R, and R4R to validate the effectiveness of the proposed approach. + +
+
+
+
+
+ + ☆ Semantic Prompting with Image-Token for Continual Learning + + +
+ Continual learning aims to refine model parameters for new tasks while retaining knowledge from previous tasks. Recently, prompt-based learning has emerged, leveraging pre-trained models that are prompted to learn subsequent tasks without relying on a rehearsal buffer. Although this approach has demonstrated outstanding results, existing methods depend on a preceding task-selection process to choose appropriate prompts. However, imperfect task selection may negatively impact performance, particularly in scenarios where the number of tasks is large or task distributions are imbalanced. To address this issue, we introduce I-Prompt, a task-agnostic approach that focuses on the visual semantic information of image tokens to eliminate task prediction. Our method consists of semantic prompt matching, which determines prompts based on similarities between tokens, and image token-level prompting, which applies prompts directly to image tokens in the intermediate layers. Consequently, our method achieves competitive performance on four benchmarks while significantly reducing training time compared to state-of-the-art methods. Moreover, we demonstrate the superiority of our method across various scenarios through extensive experiments.
+
+
+
+
+ + ☆ OCR is All you need: Importing Multi-Modality into Image-based Defect + Detection System + + +
+ Automatic optical inspection (AOI) plays a pivotal role in the manufacturing +process, predominantly leveraging high-resolution imaging instruments for +scanning purposes. It detects anomalies by analyzing image textures or +patterns, making it an essential tool in industrial manufacturing and quality +control. Despite its importance, the deployment of models for AOI often faces +challenges. These include limited sample sizes, which hinder effective feature +learning, variations among source domains, and sensitivities to changes in +lighting and camera positions during imaging. These factors collectively +compromise the accuracy of model predictions. Traditional AOI often fails to +capitalize on the rich mechanism-parameter information from machines or inside +images, including statistical parameters, which typically benefit AOI +classification. To address this, we introduce an external modality-guided data +mining framework, primarily rooted in optical character recognition (OCR), to +extract statistical features from images as a second modality to enhance +performance, termed OANet (Ocr-Aoi-Net). A key aspect of our approach is the +alignment of external modality features, extracted using a single +modality-aware model, with image features encoded by a convolutional neural +network. This synergy enables a more refined fusion of semantic representations +from different modalities. We further introduce feature refinement and a gating +function in our OANet to optimize the combination of these features, enhancing +inference and decision-making capabilities. Experimental outcomes show that our +methodology considerably boosts the recall rate of the defect detection model +and maintains high robustness even in challenging scenarios. + +
+
+
+
+
+ + ☆ EchoReel: Enhancing Action Generation of Existing Video Diffusion Models + + +
+ Recent large-scale video datasets have facilitated the generation of diverse open-domain videos by Video Diffusion Models (VDMs). Nonetheless, the efficacy of VDMs in assimilating complex knowledge from these datasets remains constrained by their inherent scale, leading to suboptimal comprehension and synthesis of numerous actions. In this paper, we introduce EchoReel, a novel approach to augment the capability of VDMs in generating intricate actions by emulating motions from pre-existing videos, which are readily accessible from databases or online repositories. EchoReel seamlessly integrates with existing VDMs, enhancing their ability to produce realistic motions without compromising their fundamental capabilities. Specifically, the Action Prism (AP) is introduced to distill motion information from reference videos, which requires training on only a small dataset. Leveraging the knowledge from pre-trained VDMs, EchoReel incorporates new action features into VDMs through additional layers, eliminating the need for any further fine-tuning on untrained actions. Extensive experiments demonstrate that EchoReel does not merely replicate the content of its references, and it significantly improves the generation of realistic actions, even in situations where existing VDMs fail outright.
+
+ comment: 22 pages, 10 figures +
+
+
+
+
+ + ☆ Out-of-Distribution Detection Should Use Conformal Prediction (and + Vice-versa?) + + +
+ Research on Out-Of-Distribution (OOD) detection focuses mainly on building scores that efficiently distinguish OOD data from In-Distribution (ID) data. On the other hand, Conformal Prediction (CP) uses non-conformity scores to construct prediction sets with probabilistic coverage guarantees. In this work, we propose to use CP to better assess the efficiency of OOD scores. Specifically, we emphasize that in standard OOD benchmark settings, evaluation metrics can be overly optimistic due to the finite sample size of the test dataset. Based on the work of (Bates et al., 2022), we define new conformal AUROC and conformal FPR@TPR95 metrics, which are corrections that provide probabilistic conservativeness guarantees on the variability of these metrics. We show the effect of these corrections on two reference OOD and anomaly detection benchmarks, OpenOOD (Yang et al., 2022) and ADBench (Han et al., 2022). We also show that the benefits of combining OOD and CP apply the other way around: using OOD scores as non-conformity scores improves upon current CP methods. One of the key messages of these contributions is that since OOD is concerned with designing scores and CP with interpreting these scores, the two fields may be inherently intertwined.
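To make the OOD-CP connection concrete, here is a minimal split-conformal sketch (not taken from the paper) that treats an OOD score as a non-conformity score: the threshold is calibrated on ID data so that the false-positive rate is roughly `alpha`. The synthetic scores, the choice of `alpha`, and the quantile rule are illustrative assumptions.

```python
import numpy as np

def conformal_ood_threshold(cal_scores: np.ndarray, alpha: float = 0.05) -> float:
    """Split-conformal threshold from in-distribution calibration scores.

    Flagging test points whose OOD score exceeds this threshold yields a
    marginal false-positive rate of roughly alpha, assuming exchangeability
    of calibration and ID test data.
    """
    n = len(cal_scores)
    # Finite-sample corrected quantile level.
    level = np.ceil((n + 1) * (1 - alpha)) / n
    return np.quantile(cal_scores, min(level, 1.0), method="higher")

# Toy usage with synthetic scores (higher score = "more OOD").
rng = np.random.default_rng(0)
cal = rng.normal(0.0, 1.0, size=1000)        # ID calibration scores
test_id = rng.normal(0.0, 1.0, size=500)     # ID test scores
test_ood = rng.normal(3.0, 1.0, size=500)    # OOD test scores

tau = conformal_ood_threshold(cal, alpha=0.05)
print("FPR on ID test:", float(np.mean(test_id > tau)))   # close to 0.05
print("TPR on OOD test:", float(np.mean(test_ood > tau)))
```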
+
+
+
+
+ + ☆ Continual Forgetting for Pre-trained Vision Models CVPR 2024 + + +
+ For privacy and security concerns, the need to erase unwanted information +from pre-trained vision models is becoming evident nowadays. In real-world +scenarios, erasure requests originate at any time from both users and model +owners. These requests usually form a sequence. Therefore, under such a +setting, selective information is expected to be continuously removed from a +pre-trained model while maintaining the rest. We define this problem as +continual forgetting and identify two key challenges. (i) For unwanted +knowledge, efficient and effective deleting is crucial. (ii) For remaining +knowledge, the impact brought by the forgetting procedure should be minimal. To +address them, we propose Group Sparse LoRA (GS-LoRA). Specifically, towards +(i), we use LoRA modules to fine-tune the FFN layers in Transformer blocks for +each forgetting task independently, and towards (ii), a simple group sparse +regularization is adopted, enabling automatic selection of specific LoRA groups +and zeroing out the others. GS-LoRA is effective, parameter-efficient, +data-efficient, and easy to implement. We conduct extensive experiments on face +recognition, object detection and image classification and demonstrate that +GS-LoRA manages to forget specific classes with minimal impact on other +classes. Codes will be released on \url{https://github.com/bjzhb666/GS-LoRA}. + +
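As a rough illustration of the group-sparse idea (a sketch under assumptions, not the authors' implementation), the snippet below treats each LoRA module's A/B matrices as one group and sums their L2 norms, the standard group-lasso form that can drive whole adapters to zero and thereby select which LoRA groups stay active.

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """A frozen linear layer with a low-rank (LoRA) update: W x + B A x."""
    def __init__(self, base: nn.Linear, rank: int = 4):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)                      # base weights stay frozen
        self.lora_A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.lora_B = nn.Parameter(torch.zeros(base.out_features, rank))

    def forward(self, x):
        return self.base(x) + x @ self.lora_A.T @ self.lora_B.T

def group_sparse_penalty(lora_layers) -> torch.Tensor:
    """Group-lasso penalty: sum of L2 norms, one group per LoRA module.

    Driving a group's norm to zero effectively deactivates that adapter,
    which is the kind of automatic group selection GS-LoRA relies on.
    """
    return sum(torch.cat([m.lora_A.flatten(), m.lora_B.flatten()]).norm(p=2)
               for m in lora_layers)

# Usage sketch: total loss = forgetting objective + lambda * group sparsity.
layer = LoRALinear(nn.Linear(768, 768))
reg = group_sparse_penalty([layer])
```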
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Video Object Segmentation with Dynamic Query Modulation ICME2024 + + +
+ Storing intermediate frame segmentations as memory for long-range context +modeling, spatial-temporal memory-based methods have recently showcased +impressive results in semi-supervised video object segmentation (SVOS). +However, these methods face two key limitations: 1) relying on non-local +pixel-level matching to read memory, resulting in noisy retrieved features for +segmentation; 2) segmenting each object independently without interaction. +These shortcomings make the memory-based methods struggle in similar object and +multi-object segmentation. To address these issues, we propose a query +modulation method, termed QMVOS. This method summarizes object features into +dynamic queries and then treats them as dynamic filters for mask prediction, +thereby providing high-level descriptions and object-level perception for the +model. Efficient and effective multi-object interactions are realized through +inter-query attention. Extensive experiments demonstrate that our method can +bring significant improvements to the memory-based SVOS method and achieve +competitive performance on standard SVOS benchmarks. The code is available at +https://github.com/zht8506/QMVOS. + +
+
+ comment: Accepted by ICME2024 +
+
+
+
+
+ + ☆ SSAP: A Shape-Sensitive Adversarial Patch for Comprehensive Disruption + of Monocular Depth Estimation in Autonomous Navigation Applications + + +
+ Monocular depth estimation (MDE) has advanced significantly, primarily +through the integration of convolutional neural networks (CNNs) and more +recently, Transformers. However, concerns about their susceptibility to +adversarial attacks have emerged, especially in safety-critical domains like +autonomous driving and robotic navigation. Existing approaches for assessing +CNN-based depth prediction methods have fallen short in inducing comprehensive +disruptions to the vision system, often limited to specific local areas. In +this paper, we introduce SSAP (Shape-Sensitive Adversarial Patch), a novel +approach designed to comprehensively disrupt monocular depth estimation (MDE) +in autonomous navigation applications. Our patch is crafted to selectively +undermine MDE in two distinct ways: by distorting estimated distances or by +creating the illusion of an object disappearing from the system's perspective. +Notably, our patch is shape-sensitive, meaning it considers the specific shape +and scale of the target object, thereby extending its influence beyond +immediate proximity. Furthermore, our patch is trained to effectively address +different scales and distances from the camera. Experimental results +demonstrate that our approach induces a mean depth estimation error surpassing +0.5, impacting up to 99% of the targeted region for CNN-based MDE models. +Additionally, we investigate the vulnerability of Transformer-based MDE models +to patch-based attacks, revealing that SSAP yields a significant error of 0.59 +and exerts substantial influence over 99% of the target region on these models. + +
+
+
+
+
+ + ☆ Sim-to-Real Grasp Detection with Global-to-Local RGB-D Adaptation ICRA 2024 + + +
+ This paper focuses on the sim-to-real issue of RGB-D grasp detection and +formulates it as a domain adaptation problem. In this case, we present a +global-to-local method to address hybrid domain gaps in RGB and depth data and +insufficient multi-modal feature alignment. First, a self-supervised rotation +pre-training strategy is adopted to deliver robust initialization for RGB and +depth networks. We then propose a global-to-local alignment pipeline with +individual global domain classifiers for scene features of RGB and depth images +as well as a local one specifically working for grasp features in the two +modalities. In particular, we propose a grasp prototype adaptation module, +which aims to facilitate fine-grained local feature alignment by dynamically +updating and matching the grasp prototypes from the simulation and real-world +scenarios throughout the training process. Due to such designs, the proposed +method substantially reduces the domain shift and thus leads to consistent +performance improvements. Extensive experiments are conducted on the +GraspNet-Planar benchmark and physical environment, and superior results are +achieved which demonstrate the effectiveness of our method. + +
+
+ comment: Accepted at ICRA 2024 +
+
+
+
+
+ + ☆ GenFlow: Generalizable Recurrent Flow for 6D Pose Refinement of Novel + Objects + + +
+ Despite the progress of learning-based methods for 6D object pose estimation, +the trade-off between accuracy and scalability for novel objects still exists. +Specifically, previous methods for novel objects do not make good use of the +target object's 3D shape information since they focus on generalization by +processing the shape indirectly, making them less effective. We present +GenFlow, an approach that enables both accuracy and generalization to novel +objects with the guidance of the target object's shape. Our method predicts +optical flow between the rendered image and the observed image and refines the +6D pose iteratively. It boosts the performance by a constraint of the 3D shape +and the generalizable geometric knowledge learned from an end-to-end +differentiable system. We further improve our model by designing a cascade +network architecture to exploit the multi-scale correlations and coarse-to-fine +refinement. GenFlow ranked first on the unseen object pose estimation +benchmarks in both the RGB and RGB-D cases. It also achieves performance +competitive with existing state-of-the-art methods for the seen object pose +estimation without any fine-tuning. + +
+
+
+
+
+ + ☆ Circle Representation for Medical Instance Object Segmentation + + +
+ Recently, circle representation has been introduced for medical imaging, +designed specifically to enhance the detection of instance objects that are +spherically shaped (e.g., cells, glomeruli, and nuclei). Given its outstanding +effectiveness in instance detection, it is compelling to consider the +application of circle representation for segmenting instance medical objects. +In this study, we introduce CircleSnake, a simple end-to-end segmentation +approach that utilizes circle contour deformation for segmenting ball-shaped +medical objects at the instance level. The innovation of CircleSnake lies in +these three areas: (1) It substitutes the complex bounding box-to-octagon +contour transformation with a more consistent and rotation-invariant bounding +circle-to-circle contour adaptation. This adaptation specifically targets +ball-shaped medical objects. (2) The circle representation employed in +CircleSnake significantly reduces the degrees of freedom to two, compared to +eight in the octagon representation. This reduction enhances both the +robustness of the segmentation performance and the rotational consistency of +the method. (3) CircleSnake is the first end-to-end deep instance segmentation +pipeline to incorporate circle representation, encompassing consistent circle +detection, circle contour proposal, and circular convolution in a unified +framework. This integration is achieved through the novel application of +circular graph convolution within the context of circle detection and instance +segmentation. In practical applications, such as the detection of glomeruli, +nuclei, and eosinophils in pathological images, CircleSnake has demonstrated +superior performance and greater rotation invariance when compared to +benchmarks. The code has been made publicly available: +https://github.com/hrlblab/CircleSnake. + +
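To see why the circle representation has so few degrees of freedom, here is a tiny helper (an illustrative sketch, not the CircleSnake code) that samples an initial contour from a detected bounding circle for subsequent snake-style deformation; the vertex count is an arbitrary choice.

```python
import numpy as np

def circle_contour(cx: float, cy: float, r: float, n_points: int = 128) -> np.ndarray:
    """Initial contour proposal from a bounding circle.

    The circle is fully described by its center and radius, and the contour is
    simply n_points vertices sampled on it, which the deformation network then refines.
    """
    theta = np.linspace(0.0, 2.0 * np.pi, n_points, endpoint=False)
    return np.stack([cx + r * np.cos(theta), cy + r * np.sin(theta)], axis=1)

contour = circle_contour(120.0, 85.0, 30.0)   # (128, 2) array of (x, y) vertices
```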
+
+
+
+
+ + ☆ End-To-End Underwater Video Enhancement: Dataset and Model + + +
+ Underwater video enhancement (UVE) aims to improve the visibility and frame +quality of underwater videos, which has significant implications for marine +research and exploration. However, existing methods primarily focus on +developing image enhancement algorithms to enhance each frame independently. +There is a lack of supervised datasets and models specifically tailored for UVE +tasks. To fill this gap, we construct the Synthetic Underwater Video +Enhancement (SUVE) dataset, comprising 840 diverse underwater-style videos +paired with ground-truth reference videos. Based on this dataset, we train a +novel underwater video enhancement model, UVENet, which utilizes inter-frame +relationships to achieve better enhancement performance. Through extensive +experiments on both synthetic and real underwater videos, we demonstrate the +effectiveness of our approach. This study represents the first comprehensive +exploration of UVE to our knowledge. The code is available at +https://anonymous.4open.science/r/UVENet. + +
+
+
+
+
+ + ☆ Covid-19 detection from CT scans using EfficientNet and Attention + mechanism + + +
+ Manual diagnosis and analysis of COVID-19 through the examination of lung +Computed Tomography (CT) scan images by physicians tends to result in +inefficiency, especially with high patient volumes and numerous images per +patient. We address the need for automation by developing a deep learning +model-based pipeline for COVID-19 detection from CT scan images of the lungs. +The Domain adaptation, Explainability, and Fairness in AI for Medical Image +Analysis Workshop and COVID-19 Diagnosis Competition (DEF-AI-MIA COV19D) +provides an opportunity to assess our designed pipeline for COVID-19 detection +from CT scan images. The proposed pipeline incorporates EfficientNet with an +Attention mechanism with a pre-processing step. Our pipeline outperforms last +year's teams on the validation set of the competition dataset. + +
+
+
+
+
+ + ☆ MLVICX: Multi-Level Variance-Covariance Exploration for Chest X-ray + Self-Supervised Representation Learning + + +
+ Self-supervised learning (SSL) is potentially useful in reducing the need for manual annotation and making deep learning models accessible for medical image analysis tasks. By leveraging the representations learned from unlabeled data, self-supervised models perform well on tasks that require little to no fine-tuning. However, for medical images like chest X-rays, which are characterized by complex anatomical structures and diverse clinical conditions, there arises a need for representation learning techniques that can encode fine-grained details while preserving the broader contextual information. In this context, we introduce MLVICX (Multi-Level Variance-Covariance Exploration for Chest X-ray Self-Supervised Representation Learning), an approach to capture rich representations in the form of embeddings from chest X-ray images. Central to our approach is a novel multi-level variance and covariance exploration strategy that empowers the model to detect diagnostically meaningful patterns while reducing redundancy effectively. By enhancing the variance and covariance of the learned embeddings, MLVICX promotes the retention of critical medical insights by adapting both global and local contextual details. We demonstrate the performance of MLVICX in advancing self-supervised chest X-ray representation learning through comprehensive experiments. The performance enhancements we observe across various downstream tasks highlight the significance of the proposed approach in enhancing the utility of chest X-ray embeddings for precision medical diagnosis and comprehensive image analysis. For pretraining, we used the NIH-Chest X-ray dataset, while for downstream tasks we utilized the NIH-Chest X-ray, Vinbig-CXR, RSNA pneumonia, and SIIM-ACR Pneumothorax datasets. Overall, we observe more than 3% performance gains over SOTA SSL approaches on various downstream tasks.
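The variance-covariance idea can be grounded with the standard regularizers shown below, a generic sketch in the spirit of VICReg-style objectives; MLVICX's exact multi-level formulation and loss weights are not reproduced here.

```python
import torch

def variance_covariance_terms(z: torch.Tensor, eps: float = 1e-4):
    """Variance/covariance regularizers on a batch of embeddings z of shape (N, D).

    The variance term keeps every embedding dimension active (std >= 1),
    and the covariance term decorrelates dimensions to reduce redundancy.
    """
    z = z - z.mean(dim=0)
    std = torch.sqrt(z.var(dim=0) + eps)
    var_loss = torch.relu(1.0 - std).mean()          # hinge on per-dimension std
    n, d = z.shape
    cov = (z.T @ z) / (n - 1)
    off_diag = cov - torch.diag(torch.diag(cov))
    cov_loss = (off_diag ** 2).sum() / d             # penalize off-diagonal covariance
    return var_loss, cov_loss

v, c = variance_covariance_terms(torch.randn(256, 512))
```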
+
+
+
+
+ + ☆ Diffusion Models are Geometry Critics: Single Image 3D Editing Using + Pre-Trained Diffusion Priors + + +
+ We propose a novel image editing technique that enables 3D manipulations on single images, such as object rotation and translation. Existing 3D-aware image editing approaches typically rely on synthetic multi-view datasets for training specialized models, thus constraining their effectiveness on open-domain images featuring significantly more varied layouts and styles. In contrast, our method directly leverages powerful image diffusion models trained on a broad spectrum of text-image pairs and thus retains their exceptional generalization abilities. This objective is realized through the development of an iterative novel view synthesis and geometry alignment algorithm. The algorithm harnesses diffusion models for dual purposes: they provide an appearance prior by predicting novel views of the selected object using estimated depth maps, and they act as a geometry critic by correcting misalignments in 3D shapes across the sampled views. Our method can generate high-quality 3D-aware image edits with large viewpoint transformations and high appearance and shape consistency with the input image, pushing the boundaries of what is possible with single-image 3D-aware editing.
+
+ comment: Project page: https://wangrc.site/DiffCriticEdit/ +
+
+
+
+
+ + ☆ Domain Adaptation Using Pseudo Labels for COVID-19 Detection + + +
+ In response to the need for rapid and accurate COVID-19 diagnosis during the +global pandemic, we present a two-stage framework that leverages pseudo labels +for domain adaptation to enhance the detection of COVID-19 from CT scans. By +utilizing annotated data from one domain and non-annotated data from another, +the model overcomes the challenge of data scarcity and variability, common in +emergent health crises. The innovative approach of generating pseudo labels +enables the model to iteratively refine its learning process, thereby improving +its accuracy and adaptability across different hospitals and medical centres. +Experimental results on COV19-CT-DB database showcase the model's potential to +achieve high diagnostic precision, significantly contributing to efficient +patient management and alleviating the strain on healthcare systems. Our method +achieves 0.92 Macro F1 Score on the validation set of Covid-19 domain +adaptation challenge. + +
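A minimal pseudo-labeling sketch is shown below; it is generic, with a hypothetical confidence threshold and loader, and does not reproduce the paper's two-stage schedule or refinement details.

```python
import torch
import torch.nn.functional as F

@torch.no_grad()
def make_pseudo_labels(model, target_loader, threshold: float = 0.9, device: str = "cuda"):
    """Generate pseudo labels for unlabeled target-domain CT scans.

    Only predictions above a confidence threshold are kept; the resulting
    (input, pseudo-label) pairs would then be mixed into the next training round,
    and the process can be iterated to progressively refine the labels.
    """
    model.eval()
    pseudo = []
    for x in target_loader:                      # assumed to yield unlabeled image batches
        x = x.to(device)
        probs = F.softmax(model(x), dim=1)
        conf, label = probs.max(dim=1)
        keep = conf > threshold
        pseudo.extend(zip(x[keep].cpu(), label[keep].cpu()))
    return pseudo
```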
+
+
+
+
+ + ☆ Do CLIPs Always Generalize Better than ImageNet Models? + + +
+ Large vision-language models, such as CLIPs, have revolutionized modern machine learning. CLIPs have demonstrated great generalizability under distribution shifts, supported by an increasing body of literature. However, the evaluation datasets for CLIPs are variations primarily designed for ImageNet benchmarks, which may not fully reflect the extent to which CLIPs, e.g., those pre-trained on LAION, are robust to spurious correlations. To bridge the gap, we collect a real-world dataset called CounterAnimal that contains realistic spurious features found in animal photos. CounterAnimal consists of a) the common group, comprising animals on common backgrounds, and b) the counter group, including animals on unusual backgrounds. The performance drops from the common to the counter group quantify the reliance of models on spurious features (i.e., backgrounds) to predict the animals. We find that CLIPs trained on either LAION or the OpenAI data exhibit notable performance drops on the counter group. Surprisingly, we observe that single-modal models trained on ImageNet are more robust than CLIPs. We provide both theoretical and empirical explanations for why CLIPs still learn spurious features. Our findings suggest that distribution shifts remain an open problem for CLIPs, and one needs to be cautious about test setups when evaluating foundation models pre-trained on a significantly different scale and distribution.
+
+ comment: Qizhou Wang, Yong Lin, and Yongqiang Chen contributed equally. + Project page: https://counteranimal.github.io +
+
+
+
+
+ + ☆ CCC++: Optimized Color Classified Colorization with Segment Anything + Model (SAM) Empowered Object Selective Color Harmonization + + +
+ In this paper, we formulate the colorization problem as a multinomial classification problem and then apply a weighted function to the classes. We propose a set of formulas to transform color values into color classes and vice versa. To optimize the classes, we experiment with different bin sizes for the color class transformation. Observing class appearance, standard deviation, and model parameters on various extremely large-scale real-time images in practice, we propose 532 color classes for our classification task. During training, we propose a class-weighted function based on true class appearance in each batch to ensure proper saturation of individual objects. We adjust the weights of the major classes, which are more frequently observed, by lowering them, while escalating the weights of the minor classes, which are less commonly observed. In our class re-weight formula, we propose a hyper-parameter for finding the optimal trade-off between the major and minor classes. As we apply regularization to enhance the stability of the minor classes, occasional minor noise may appear at object edges. We propose a novel object-selective color harmonization method empowered by the Segment Anything Model (SAM) to refine and enhance these edges. We also propose two new color image evaluation metrics, the Color Class Activation Ratio (CCAR) and the True Activation Ratio (TAR), to quantify the richness of color components. We compare our proposed model with state-of-the-art models on six different datasets: Place, ADE, Celeba, COCO, Oxford 102 Flower, and ImageNet, in both qualitative and quantitative terms. The experimental results show that our proposed model outstrips other models in visualization, CNR, and our proposed CCAR and TAR measurement criteria, while maintaining satisfactory performance in regression (MSE, PSNR), similarity (SSIM, LPIPS, UIUI), and generative criteria (FID).
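To illustrate the kind of class re-weighting described (the exact CCC++ formula and its hyper-parameter are not given here, so `gamma` below is a hypothetical stand-in), a common inverse-frequency weighting with a trade-off exponent looks like this:

```python
import numpy as np

def class_weights(class_counts: np.ndarray, gamma: float = 0.5) -> np.ndarray:
    """Down-weight frequent (major) color classes and up-weight rare (minor) ones.

    gamma in [0, 1] trades off between uniform weights (gamma = 0) and full
    inverse-frequency weights (gamma = 1); the formula used in CCC++ may differ.
    """
    freq = class_counts / class_counts.sum()
    w = freq ** (-gamma)
    return w / w.mean()                          # normalize so the average weight is 1

counts = np.array([50000, 12000, 800, 50])       # toy per-batch class appearances
print(class_weights(counts, gamma=0.5))
```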
+
+ comment: arXiv admin note: text overlap with arXiv:2403.01476 +
+
+
+
+
+ + ☆ SmartRefine: A Scenario-Adaptive Refinement Framework for Efficient Motion Prediction CVPR 2024
+ Predicting the future motion of surrounding agents is essential for +autonomous vehicles (AVs) to operate safely in dynamic, human-robot-mixed +environments. Context information, such as road maps and surrounding agents' +states, provides crucial geometric and semantic information for motion behavior +prediction. To this end, recent works explore two-stage prediction frameworks +where coarse trajectories are first proposed, and then used to select critical +context information for trajectory refinement. However, they either incur a +large amount of computation or bring limited improvement, if not both. In this +paper, we introduce a novel scenario-adaptive refinement strategy, named +SmartRefine, to refine prediction with minimal additional computation. +Specifically, SmartRefine can comprehensively adapt refinement configurations +based on each scenario's properties, and smartly chooses the number of +refinement iterations by introducing a quality score to measure the prediction +quality and remaining refinement potential of each scenario. SmartRefine is +designed as a generic and flexible approach that can be seamlessly integrated +into most state-of-the-art motion prediction models. Experiments on Argoverse +(1 & 2) show that our method consistently improves the prediction accuracy of +multiple state-of-the-art prediction models. Specifically, by adding +SmartRefine to QCNet, we outperform all published ensemble-free works on the +Argoverse 2 leaderboard (single agent track) at submission. Comprehensive +studies are also conducted to ablate design choices and explore the mechanism +behind multi-iteration refinement. Codes are available at +https://github.com/opendilab/SmartRefine/ + +
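The scenario-adaptive stopping rule can be pictured with a small loop like the one below; this is purely a sketch, where `refine_step`, `quality_score`, `max_iters`, and `tau` are hypothetical callables and constants rather than the paper's exact quality score or schedule.

```python
def refine_trajectories(proposals, context, refine_step, quality_score,
                        max_iters: int = 5, tau: float = 0.02):
    """Iteratively refine coarse trajectory proposals, stopping adaptively.

    refine_step(trajs, context) returns improved trajectories using retrieved
    context; quality_score(trajs, context) estimates the remaining refinement
    potential, and iteration stops early once that estimate falls below tau.
    """
    trajs = proposals
    for _ in range(max_iters):
        gain = quality_score(trajs, context)
        if gain < tau:                 # little remaining potential: stop refining
            break
        trajs = refine_step(trajs, context)
    return trajs
```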
+
+ comment: Camera-ready version for CVPR 2024 +
+
+
+
+
+ + ☆ VideoAgent: A Memory-augmented Multimodal Agent for Video Understanding + + +
+ We explore how reconciling several foundation models (large language models +and vision-language models) with a novel unified memory mechanism could tackle +the challenging video understanding problem, especially capturing the long-term +temporal relations in lengthy videos. In particular, the proposed multimodal +agent VideoAgent: 1) constructs a structured memory to store both the generic +temporal event descriptions and object-centric tracking states of the video; 2) +given an input task query, it employs tools including video segment +localization and object memory querying along with other visual foundation +models to interactively solve the task, utilizing the zero-shot tool-use +ability of LLMs. VideoAgent demonstrates impressive performances on several +long-horizon video understanding benchmarks, an average increase of 6.6% on +NExT-QA and 26.0% on EgoSchema over baselines, closing the gap between +open-sourced models and private counterparts including Gemini 1.5 Pro. + +
+
+ comment: Project page: videoagent.github.io; First two authors contributed + equally +
+
+
+
+
+ + ☆ Towards understanding the nature of direct functional connectivity in + visual brain network + + +
+ Recent advances in neuroimaging have enabled studies of the functional connectivity (FC) of the human brain, alongside investigation of the neuronal basis of cognition. One important FC study is the representation of vision in the human brain. The release of the publicly available dataset BOLD5000 has made it possible to study brain dynamics during visual tasks in greater detail. In this paper, a comprehensive analysis of fMRI time series (TS) has been performed to explore different types of visual brain networks (VBN). The novelty of this work lies in (1) constructing VBN with consistently significant direct connectivity using both marginal and partial correlation, which is further analyzed using graph theoretic measures, and (2) classification of VBNs as formed by image complexity-specific TS, using graphical features. In image complexity-specific VBN classification, XGBoost yields average accuracy in the range of 86.5% to 91.5% for positively correlated VBN, which is 2% greater than that using negative correlation. This result not only reflects the distinguishing graphical characteristics of each image complexity-specific VBN, but also highlights the importance of studying both positively and negatively correlated VBN to understand how differently the brain functions while viewing real-world images of different complexities.
+
+
+
+
+ + ☆ Generative Motion Stylization within Canonical Motion Space + + +
+ Stylized motion breathes life into characters. However, the fixed skeleton +structure and style representation hinder existing data-driven motion synthesis +methods from generating stylized motion for various characters. In this work, +we propose a generative motion stylization pipeline, named MotionS, for +synthesizing diverse and stylized motion on cross-structure characters using +cross-modality style prompts. Our key insight is to embed motion style into a +cross-modality latent space and perceive the cross-structure skeleton +topologies, allowing for motion stylization within a canonical motion space. +Specifically, the large-scale Contrastive-Language-Image-Pre-training (CLIP) +model is leveraged to construct the cross-modality latent space, enabling +flexible style representation within this space. Additionally, two +topology-encoded tokens are learned to capture the canonical and specific +skeleton topologies, facilitating cross-structure topology shifting. +Subsequently, the topology-shifted stylization diffusion is designed to +generate motion content for the specific skeleton and stylize it in the shifted +canonical motion space using multi-modality style descriptions. Through an +extensive set of examples, we demonstrate the flexibility and generalizability +of our pipeline across various characters and style descriptions. Qualitative +and quantitative experiments underscore the superiority of our pipeline over +state-of-the-art methods, consistently delivering high-quality stylized motion +across a broad spectrum of skeletal structures. + +
+
+
+
+
+ + ☆ Collage Prompting: Budget-Friendly Visual Recognition with GPT-4V + + +
+ Recent advancements in generative AI have suggested that, by taking visual prompts, GPT-4V can demonstrate significant proficiency in image recognition tasks. Despite its impressive capabilities, the financial cost associated with GPT-4V's inference presents a substantial barrier to its wide use. To address this challenge, our work introduces Collage Prompting, a budget-friendly prompting approach that concatenates multiple images into a single visual input. With a collage prompt, GPT-4V is able to perform image recognition on several images simultaneously. Based on the observation that the accuracy of GPT-4V's image recognition varies significantly with the order of images within the collage prompt, our method further learns to optimize the arrangement of images for maximum recognition accuracy. A graph predictor is trained to estimate the accuracy of each collage prompt, and we then propose an optimization method to navigate the search space of possible image arrangements. Experimental results across various datasets demonstrate that the cost-efficiency score of collage prompting is much higher than that of standard prompting. Additionally, a collage prompt with a learned arrangement achieves clearly better accuracy than one with a random arrangement in GPT-4V's visual recognition.
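A minimal sketch of assembling a collage prompt with PIL is shown below; the grid size, cell resolution, and ordering are illustrative assumptions, and the learned arrangement optimizer is not shown.

```python
from PIL import Image

def make_collage(paths, grid=(2, 2), cell=(336, 336)) -> Image.Image:
    """Concatenate several images into one grid so a single GPT-4V call
    can be asked to recognize all of them at once."""
    cols, rows = grid
    canvas = Image.new("RGB", (cols * cell[0], rows * cell[1]), "white")
    for i, path in enumerate(paths[: cols * rows]):
        img = Image.open(path).convert("RGB").resize(cell)
        canvas.paste(img, ((i % cols) * cell[0], (i // cols) * cell[1]))
    return canvas

# The ordering of `paths` matters: the abstract reports that recognition accuracy
# depends on the arrangement, which is what the learned graph predictor optimizes.
```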
+
+
+
+
+ + ☆ Siamese Learning with Joint Alignment and Regression for + Weakly-Supervised Video Paragraph Grounding CVPR 2024 + + +
+ Video Paragraph Grounding (VPG) is an emerging task in video-language +understanding, which aims at localizing multiple sentences with semantic +relations and temporal order from an untrimmed video. However, existing VPG +approaches are heavily reliant on a considerable number of temporal labels that +are laborious and time-consuming to acquire. In this work, we introduce and +explore Weakly-Supervised Video Paragraph Grounding (WSVPG) to eliminate the +need of temporal annotations. Different from previous weakly-supervised +grounding frameworks based on multiple instance learning or reconstruction +learning for two-stage candidate ranking, we propose a novel siamese learning +framework that jointly learns the cross-modal feature alignment and temporal +coordinate regression without timestamp labels to achieve concise one-stage +localization for WSVPG. Specifically, we devise a Siamese Grounding TRansformer +(SiamGTR) consisting of two weight-sharing branches for learning complementary +supervision. An Augmentation Branch is utilized for directly regressing the +temporal boundaries of a complete paragraph within a pseudo video, and an +Inference Branch is designed to capture the order-guided feature correspondence +for localizing multiple sentences in a normal video. We demonstrate by +extensive experiments that our paradigm has superior practicability and +flexibility to achieve efficient weakly-supervised or semi-supervised learning, +outperforming state-of-the-art methods trained with the same or stronger +supervision. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Fed3DGS: Scalable 3D Gaussian Splatting with Federated Learning + + +
+ In this work, we present Fed3DGS, a scalable 3D reconstruction framework +based on 3D Gaussian splatting (3DGS) with federated learning. Existing +city-scale reconstruction methods typically adopt a centralized approach, which +gathers all data in a central server and reconstructs scenes. The approach +hampers scalability because it places a heavy load on the server and demands +extensive data storage when reconstructing scenes on a scale beyond city-scale. +In pursuit of a more scalable 3D reconstruction, we propose a federated +learning framework with 3DGS, which is a decentralized framework and can +potentially use distributed computational resources across millions of clients. +We tailor a distillation-based model update scheme for 3DGS and introduce +appearance modeling for handling non-IID data in the scenario of 3D +reconstruction with federated learning. We simulate our method on several +large-scale benchmarks, and our method demonstrates rendered image quality +comparable to centralized approaches. In addition, we also simulate our method +with data collected in different seasons, demonstrating that our framework can +reflect changes in the scenes and our appearance modeling captures changes due +to seasonal variations. + +
+
+ comment: Code: https://github.com/DensoITLab/Fed3DGS +
+
+
+
+
+ + ☆ Bridging 3D Gaussian and Mesh for Freeview Video Rendering + + +
+ This is only a preview version of GauMesh. Recently, primitive-based rendering has been proven to achieve convincing results in modeling and rendering 3D dynamic scenes from 2D images. Despite this, in the context of novel view synthesis, each type of primitive has its inherent defects in terms of representation ability. It is difficult to exploit a mesh to depict fuzzy geometry. Meanwhile, point-based splatting methods (e.g. 3D Gaussian Splatting) usually produce artifacts or blurry pixels in areas with smooth geometry and sharp textures. As a result, it is difficult, if not impossible, to represent a complex and dynamic scene with a single type of primitive. To this end, we propose a novel approach, GauMesh, to bridge 3D Gaussians and meshes for modeling and rendering dynamic scenes. Given a sequence of tracked meshes as initialization, our goal is to simultaneously optimize the mesh geometry, color texture, opacity maps, a set of 3D Gaussians, and the deformation field. At a specific time, we perform $\alpha$-blending on the RGB and opacity values based on the merged and re-ordered z-buffers from the mesh and 3D Gaussian rasterizations. This produces the final rendering, which is supervised by the ground-truth image. Experiments demonstrate that our approach adapts the appropriate type of primitives to represent the different parts of the dynamic scene and outperforms all the baseline methods in both quantitative and qualitative comparisons without losing rendering speed.
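The merged z-buffer compositing step can be sketched as plain front-to-back alpha blending over depth-sorted fragments coming from both rasterizers; this is a simplified per-pixel illustration, not the actual CUDA implementation.

```python
import numpy as np

def composite_fragments(fragments):
    """Front-to-back alpha compositing of per-pixel fragments.

    `fragments` is a list of (depth, rgb, alpha) tuples contributed by both the
    mesh rasterizer and the 3D Gaussian rasterizer for one pixel; merging and
    re-sorting them by depth before blending is the key step described above.
    """
    color = np.zeros(3)
    transmittance = 1.0
    for depth, rgb, alpha in sorted(fragments, key=lambda f: f[0]):
        color += transmittance * alpha * np.asarray(rgb, dtype=float)
        transmittance *= (1.0 - alpha)
        if transmittance < 1e-4:       # early termination once effectively opaque
            break
    return color

pixel = composite_fragments([(2.0, (1, 0, 0), 0.6),   # Gaussian fragment
                             (1.5, (0, 1, 0), 0.4)])  # mesh fragment
```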
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ CasSR: Activating Image Power for Real-World Image Super-Resolution + + +
+ The objective of image super-resolution is to generate clean and +high-resolution images from degraded versions. Recent advancements in diffusion +modeling have led to the emergence of various image super-resolution techniques +that leverage pretrained text-to-image (T2I) models. Nevertheless, due to the +prevalent severe degradation in low-resolution images and the inherent +characteristics of diffusion models, achieving high-fidelity image restoration +remains challenging. Existing methods often exhibit issues including semantic +loss, artifacts, and the introduction of spurious content not present in the +original image. To tackle this challenge, we propose Cascaded diffusion for +Super-Resolution, CasSR , a novel method designed to produce highly detailed +and realistic images. In particular, we develop a cascaded controllable +diffusion model that aims to optimize the extraction of information from +low-resolution images. This model generates a preliminary reference image to +facilitate initial information extraction and degradation mitigation. +Furthermore, we propose a multi-attention mechanism to enhance the T2I model's +capability in maximizing the restoration of the original image content. Through +a comprehensive blend of qualitative and quantitative analyses, we substantiate +the efficacy and superiority of our approach. + +
+
+
+
+
+ + ☆ Zero-shot Compound Expression Recognition with Visual Language Model at + the 6th ABAW Challenge + + +
+ Conventional approaches to facial expression recognition primarily focus on the classification of six basic facial expressions. Nevertheless, real-world situations present a wider range of complex compound expressions that consist of combinations of these basic ones, for which comprehensive training datasets are of limited availability. The 6th Workshop and Competition on Affective Behavior Analysis in-the-wild (ABAW) offered unlabeled datasets containing compound expressions. In this study, we propose a zero-shot approach for recognizing compound expressions by leveraging a pretrained visual language model integrated with traditional CNNs.
+
+ comment: USTC-AC's paper for Compound Expression (CE) Recognition Challenge in + 6th Workshop and Competition on Affective Behavior Analysis in-the-wild + (ABAW) +
+
+
+
+
+ + ☆ Robust Overfitting Does Matter: Test-Time Adversarial Purification With + FGSM CVPR 2024 + + +
+ Numerous studies have demonstrated the susceptibility of deep neural networks +(DNNs) to subtle adversarial perturbations, prompting the development of many +advanced adversarial defense methods aimed at mitigating adversarial attacks. +Current defense strategies usually train DNNs for a specific adversarial attack +method and can achieve good robustness in defense against this type of +adversarial attack. Nevertheless, when subjected to evaluations involving +unfamiliar attack modalities, empirical evidence reveals a pronounced +deterioration in the robustness of DNNs. Meanwhile, there is a trade-off +between the classification accuracy of clean examples and adversarial examples. +Most defense methods often sacrifice the accuracy of clean examples in order to +improve the adversarial robustness of DNNs. To alleviate these problems and +enhance the overall robust generalization of DNNs, we propose the Test-Time +Pixel-Level Adversarial Purification (TPAP) method. This approach is based on +the robust overfitting characteristic of DNNs to the fast gradient sign method +(FGSM) on training and test datasets. It utilizes FGSM for adversarial +purification, to process images for purifying unknown adversarial perturbations +from pixels at testing time in a "counter changes with changelessness" manner, +thereby enhancing the defense capability of DNNs against various unknown +adversarial attacks. Extensive experimental results show that our method can +effectively improve both overall robust generalization of DNNs, notably over +previous methods. + +
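A hedged sketch of a single FGSM-style purification step at test time is given below; the sign of the step, the epsilon, and the use of the model's own predicted label are illustrative assumptions, so the paper should be consulted for TPAP's exact procedure.

```python
import torch
import torch.nn.functional as F

def fgsm_purify(model, x, eps: float = 8 / 255):
    """One signed-gradient step on the input pixels at test time (a purification sketch).

    The gradient is taken w.r.t. the model's own predicted label, and the image
    is nudged in the direction that decreases that loss before re-classifying;
    whether the step is added or subtracted, and with what eps, follows the paper.
    """
    x = x.clone().detach().requires_grad_(True)
    logits = model(x)
    pred = logits.argmax(dim=1)
    loss = F.cross_entropy(logits, pred)
    grad = torch.autograd.grad(loss, x)[0]
    x_pur = (x - eps * grad.sign()).clamp(0.0, 1.0).detach()
    return model(x_pur).argmax(dim=1)
```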
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Motion-aware 3D Gaussian Splatting for Efficient Dynamic Scene + Reconstruction + + +
+ 3D Gaussian Splatting (3DGS) has become an emerging tool for dynamic scene +reconstruction. However, existing methods focus mainly on extending static 3DGS +into a time-variant representation, while overlooking the rich motion +information carried by 2D observations, thus suffering from performance +degradation and model redundancy. To address the above problem, we propose a +novel motion-aware enhancement framework for dynamic scene reconstruction, +which mines useful motion cues from optical flow to improve different paradigms +of dynamic 3DGS. Specifically, we first establish a correspondence between 3D +Gaussian movements and pixel-level flow. Then a novel flow augmentation method +is introduced with additional insights into uncertainty and loss collaboration. +Moreover, for the prevalent deformation-based paradigm that presents a harder +optimization problem, a transient-aware deformation auxiliary module is +proposed. We conduct extensive experiments on both multi-view and monocular +scenes to verify the merits of our work. Compared with the baselines, our +method shows significant superiority in both rendering quality and efficiency. + +
+
+
+
+
+ + ☆ Boosting Continuous Emotion Recognition with Self-Pretraining using + Masked Autoencoders, Temporal Convolutional Networks, and Transformers + + +
+ Human emotion recognition holds a pivotal role in facilitating seamless +human-computer interaction. This paper delineates our methodology in tackling +the Valence-Arousal (VA) Estimation Challenge, Expression (Expr) Classification +Challenge, and Action Unit (AU) Detection Challenge within the ambit of the 6th +Workshop and Competition on Affective Behavior Analysis in-the-wild (ABAW). Our +study advocates a novel approach aimed at refining continuous emotion +recognition. We achieve this by initially harnessing pre-training with Masked +Autoencoders (MAE) on facial datasets, followed by fine-tuning on the aff-wild2 +dataset annotated with expression (Expr) labels. The pre-trained model serves +as an adept visual feature extractor, thereby enhancing the model's robustness. +Furthermore, we bolster the performance of continuous emotion recognition by +integrating Temporal Convolutional Network (TCN) modules and Transformer +Encoder modules into our framework. + +
+
+
+
+
+ + ☆ BAGS: Building Animatable Gaussian Splatting from a Monocular Video with + Diffusion Priors + + +
+ Animatable 3D reconstruction has significant applications across various +fields, primarily relying on artists' handcraft creation. Recently, some +studies have successfully constructed animatable 3D models from monocular +videos. However, these approaches require sufficient view coverage of the +object within the input video and typically necessitate significant time and +computational costs for training and rendering. This limitation restricts the +practical applications. In this work, we propose a method to build animatable +3D Gaussian Splatting from monocular video with diffusion priors. The 3D +Gaussian representations significantly accelerate the training and rendering +process, and the diffusion priors allow the method to learn 3D models with +limited viewpoints. We also present the rigid regularization to enhance the +utilization of the priors. We perform an extensive evaluation across various +real-world videos, demonstrating its superior performance compared to the +current state-of-the-art methods. + +
+
+ comment: https://talegqz.github.io/BAGS/ +
+
+
+
+
+ + ☆ Benchmarking the Robustness of UAV Tracking Against Common Corruptions + + +
+ The robustness of unmanned aerial vehicle (UAV) tracking is crucial in many tasks such as surveillance and robotics. Despite its importance, little attention has been paid to the performance of UAV trackers under common corruptions due to the lack of a dedicated platform. Addressing this, we propose UAV-C, a large-scale benchmark for assessing the robustness of UAV trackers under common corruptions. Specifically, UAV-C is built upon two popular UAV datasets by introducing 18 common corruptions from 4 representative categories, including adversarial, sensor, blur, and composite corruptions, at different levels. Finally, UAV-C contains more than 10K sequences. To understand the robustness of existing UAV trackers against corruptions, we extensively evaluate 12 representative algorithms on UAV-C. Our study reveals several key findings: 1) current trackers are vulnerable to corruptions, indicating that more attention is needed to enhance the robustness of UAV trackers; 2) when corruptions are combined, the resulting composite corruptions degrade trackers more severely; and 3) while each tracker has its unique performance profile, some trackers may be more sensitive to specific corruptions. By releasing UAV-C, we hope that it, along with our comprehensive analysis, serves as a valuable resource for advancing the robustness of UAV tracking against corruption. UAV-C will be available at https://github.com/Xiaoqiong-Liu/UAV-C.
+
+
+
+
+ + ☆ VmambaIR: Visual State Space Model for Image Restoration + + +
+ Image restoration is a critical task in low-level computer vision, aiming to +restore high-quality images from degraded inputs. Various models, such as +convolutional neural networks (CNNs), generative adversarial networks (GANs), +transformers, and diffusion models (DMs), have been employed to address this +problem with significant impact. However, CNNs have limitations in capturing +long-range dependencies. DMs require large prior models and computationally +intensive denoising steps. Transformers have powerful modeling capabilities but +face challenges due to quadratic complexity with input image size. To address +these challenges, we propose VmambaIR, which introduces State Space Models +(SSMs) with linear complexity into comprehensive image restoration tasks. We +utilize a Unet architecture to stack our proposed Omni Selective Scan (OSS) +blocks, consisting of an OSS module and an Efficient Feed-Forward Network +(EFFN). Our proposed omni selective scan mechanism overcomes the unidirectional +modeling limitation of SSMs by efficiently modeling image information flows in +all six directions. Furthermore, we conducted a comprehensive evaluation of our +VmambaIR across multiple image restoration tasks, including image deraining, +single image super-resolution, and real-world image super-resolution. Extensive +experimental results demonstrate that our proposed VmambaIR achieves +state-of-the-art (SOTA) performance with much fewer computational resources and +parameters. Our research highlights the potential of state space models as +promising alternatives to the transformer and CNN architectures in serving as +foundational frameworks for next-generation low-level visual tasks. + +
+
+ comment: 23 pages +
+
+
+
+
+ + ☆ DreamSampler: Unifying Diffusion Sampling and Score Distillation for + Image Manipulation + + +
+ Reverse sampling and score-distillation have emerged as the main workhorses in recent years for image manipulation using latent diffusion models (LDMs). While reverse diffusion sampling often requires adjustments of the LDM architecture or feature engineering, score distillation offers a simple yet powerful model-agnostic approach, but it is often prone to mode-collapsing. To address these limitations and leverage the strengths of both approaches, here we introduce a novel framework called DreamSampler, which seamlessly integrates these two distinct approaches through the lens of regularized latent optimization. Similar to score-distillation, DreamSampler is a model-agnostic approach applicable to any LDM architecture, but it allows both distillation and reverse sampling with additional guidance for image editing and reconstruction. Through experiments involving image editing, SVG reconstruction, and more, we demonstrate the competitive performance of DreamSampler compared to existing approaches, while providing new applications. + +
+
+
+
+
+ + ☆ Scene-LLM: Extending Language Model for 3D Visual Understanding and + Reasoning + + +
+ This paper introduces Scene-LLM, a 3D-visual-language model that enhances embodied agents' abilities in interactive 3D indoor environments by integrating the reasoning strengths of Large Language Models (LLMs). Scene-LLM adopts a hybrid 3D visual feature representation that incorporates dense spatial information and supports scene state updates. The model employs a projection layer to efficiently project these features into the pre-trained textual embedding space, enabling effective interpretation of 3D visual information. Unique to our approach is the integration of both scene-level and ego-centric 3D information. This combination is pivotal for interactive planning, where scene-level data supports global planning and ego-centric data is important for localization. Notably, we use ego-centric 3D frame features for feature alignment, an efficient technique that enhances the model's ability to align features of small objects within the scene. Our experiments with Scene-LLM demonstrate its strong capabilities in dense captioning, question answering, and interactive planning. We believe Scene-LLM advances the field of 3D visual understanding and reasoning, offering new possibilities for sophisticated agent interactions in indoor settings. + +
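+ 
+ As a rough illustration of the projection step described above, the sketch below maps 3D visual tokens into a frozen LLM's token-embedding space with a single linear layer and prepends them to the text prompt embeddings; the class name, feature dimensions, and frozen-LLM interface are illustrative assumptions, not Scene-LLM's actual code.
+ ```python
+ import torch
+ import torch.nn as nn
+ 
+ class SceneTokenProjector(nn.Module):
+     """Maps 3D visual features into an LLM token-embedding space (a sketch).
+ 
+     Dimensions and the frozen-LLM interface are assumed for illustration.
+     """
+ 
+     def __init__(self, visual_dim: int = 1024, llm_dim: int = 4096):
+         super().__init__()
+         self.proj = nn.Linear(visual_dim, llm_dim)
+ 
+     def forward(self, scene_feats, text_embeds):
+         # scene_feats: (B, N_3d_tokens, visual_dim); text_embeds: (B, T, llm_dim)
+         scene_tokens = self.proj(scene_feats)
+         # Prepend projected scene tokens so the frozen LLM can attend to the
+         # 3D visual information alongside the textual prompt.
+         return torch.cat([scene_tokens, text_embeds], dim=1)
+ ```
+ 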
+
+
+
+
+ + ☆ Defense Against Adversarial Attacks on No-Reference Image Quality Models + with Gradient Norm Regularization CVPR 2024 + + +
+ The task of No-Reference Image Quality Assessment (NR-IQA) is to estimate the +quality score of an input image without additional information. NR-IQA models +play a crucial role in the media industry, aiding in performance evaluation and +optimization guidance. However, these models are found to be vulnerable to +adversarial attacks, which introduce imperceptible perturbations to input +images, resulting in significant changes in predicted scores. In this paper, we +propose a defense method to improve the stability in predicted scores when +attacked by small perturbations, thus enhancing the adversarial robustness of +NR-IQA models. To be specific, we present theoretical evidence showing that the +magnitude of score changes is related to the $\ell_1$ norm of the model's +gradient with respect to the input image. Building upon this theoretical +foundation, we propose a norm regularization training strategy aimed at +reducing the $\ell_1$ norm of the gradient, thereby boosting the robustness of +NR-IQA models. Experiments conducted on four NR-IQA baseline models demonstrate +the effectiveness of our strategy in reducing score changes in the presence of +adversarial attacks. To the best of our knowledge, this work marks the first +attempt to defend against adversarial attacks on NR-IQA models. Our study +offers valuable insights into the adversarial robustness of NR-IQA models and +provides a foundation for future research in this area. + +
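+ 
+ The core idea above can be illustrated with a small training-loss sketch that adds an $\ell_1$ penalty on the input gradient to a standard quality-regression loss. The model handle, loss choice, and weighting factor are assumptions for illustration, not the authors' exact training code.
+ ```python
+ import torch
+ import torch.nn.functional as F
+ 
+ def gradient_norm_regularized_loss(iqa_model, images, mos_targets, lam=0.01):
+     """Quality-regression loss plus an L1 penalty on the input gradient.
+ 
+     Penalizing ||d score / d image||_1 encourages predicted scores to change
+     little under small input perturbations (a sketch of the idea only).
+     """
+     images = images.clone().requires_grad_(True)
+     scores = iqa_model(images).squeeze(-1)            # predicted quality scores
+     task_loss = F.mse_loss(scores, mos_targets)
+ 
+     # Gradient of the summed scores with respect to the input pixels.
+     grad = torch.autograd.grad(scores.sum(), images, create_graph=True)[0]
+     reg = grad.abs().sum(dim=(1, 2, 3)).mean()        # batch-averaged L1 norm (NCHW)
+ 
+     return task_loss + lam * reg
+ ```
+ 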
+
+ comment: accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Investigating the Benefits of Projection Head for Representation + Learning + + +
+ An effective technique for obtaining high-quality representations is adding a projection head on top of the encoder during training, then discarding it and using the pre-projection representations. Despite its proven practical effectiveness, the reason behind the success of this technique is poorly understood. The pre-projection representations are not directly optimized by the loss function, raising the question: what makes them better? In this work, we provide a rigorous theoretical answer to this question. We start by examining linear models trained with self-supervised contrastive loss. We reveal that the implicit bias of training algorithms leads to layer-wise progressive feature weighting, where features become increasingly unequal as we go deeper into the layers. Consequently, lower layers tend to have more normalized and less specialized representations. We theoretically characterize scenarios where such representations are more beneficial, highlighting the intricate interplay between data augmentation and input features. Additionally, we demonstrate that introducing non-linearity into the network allows lower layers to learn features that are completely absent in higher layers. Finally, we show how this mechanism improves the robustness in supervised contrastive learning and supervised learning. We empirically validate our results through various experiments on CIFAR-10/100, UrbanCars and shifted versions of ImageNet. We also introduce a potential alternative to the projection head, which offers a more interpretable and controllable design. + +
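+ 
+ For context, the setup analyzed above is the common contrastive pre-training recipe sketched below: a projection head is trained on top of the encoder and discarded afterwards, with downstream tasks using the pre-projection features. The two-layer MLP head and dimensions are generic assumptions, not the paper's proposed alternative design.
+ ```python
+ import torch
+ import torch.nn as nn
+ 
+ class ContrastiveModel(nn.Module):
+     """Encoder plus a projection head; only the head's output feeds the loss."""
+ 
+     def __init__(self, encoder: nn.Module, feat_dim: int, proj_dim: int = 128):
+         super().__init__()
+         self.encoder = encoder
+         # Non-linear projection head, discarded after pre-training.
+         self.projector = nn.Sequential(
+             nn.Linear(feat_dim, feat_dim), nn.ReLU(), nn.Linear(feat_dim, proj_dim)
+         )
+ 
+     def forward(self, x):
+         h = self.encoder(x)       # pre-projection representation (kept)
+         z = self.projector(h)     # post-projection representation (trained on)
+         return h, z
+ 
+ # During pre-training the contrastive loss is applied to z; for downstream
+ # tasks a linear probe or classifier is fitted on the detached features h.
+ ```
+ 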
+
+
+
+
+ + ☆ Boosting Order-Preserving and Transferability for Neural Architecture + Search: a Joint Architecture Refined Search and Fine-tuning Approach CVPR2024 + + +
+ Supernet is a core component in many recent Neural Architecture Search (NAS) methods. It not only helps embody the search space but also provides a (relative) estimation of the final performance of candidate architectures. Thus, it is critical that the top architectures ranked by a supernet should be consistent with those ranked by true performance, which is known as the order-preserving ability. In this work, we analyze the order-preserving ability on the whole search space (global) and a sub-space of top architectures (local), and empirically show that the local order-preserving ability of current two-stage NAS methods still needs to be improved. To rectify this, we propose a novel concept of Supernet Shifting, a refined search strategy combining architecture searching with supernet fine-tuning. Specifically, apart from evaluation, the training loss is also accumulated during the search, and the supernet is updated every iteration. Since superior architectures are sampled more frequently in evolutionary searching, the supernet is encouraged to focus on top architectures, thus improving local order-preserving. Besides, a pre-trained supernet is often not reusable for one-shot methods. We show that Supernet Shifting can transfer a pre-trained supernet to a new dataset. Specifically, the last classifier layer is reset and trained through evolutionary searching. Comprehensive experiments show that our method has better order-preserving ability and can find a dominating architecture. Moreover, the pre-trained supernet can be easily transferred into a new dataset with no loss of performance. + +
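+ 
+ A generic sketch of the search loop described above is given below: candidate architectures are scored on validation data while their training loss is accumulated to fine-tune the supernet every iteration. The `evaluate` and `evolve` helpers are caller-supplied placeholders, and `supernet(x, arch)` is an assumed interface; this illustrates the idea rather than the paper's implementation.
+ ```python
+ import itertools
+ import torch.nn.functional as F
+ 
+ def supernet_shifting_search(supernet, population, train_loader, val_loader,
+                              optimizer, evaluate, evolve, iterations=100):
+     """Joint evolutionary search and supernet fine-tuning (a generic sketch).
+ 
+     `evaluate(supernet, arch, val_loader)` returns a validation score and
+     `evolve(scored)` produces the next population; both are caller-supplied
+     stand-ins, not the paper's exact procedures.
+     """
+     train_iter = itertools.cycle(train_loader)
+     scored = []
+     for _ in range(iterations):
+         # Score candidates; evolutionary sampling means top architectures
+         # appear more and more often in later populations.
+         scored = [(evaluate(supernet, arch, val_loader), arch) for arch in population]
+ 
+         # Supernet Shifting: accumulate training loss of the sampled
+         # architectures and update the supernet weights every iteration,
+         # biasing the supernet toward the frequently sampled (top) candidates.
+         optimizer.zero_grad()
+         for _, arch in scored:
+             x, y = next(train_iter)
+             loss = F.cross_entropy(supernet(x, arch), y)
+             loss.backward()
+         optimizer.step()
+ 
+         population = evolve(scored)  # mutate / recombine the best candidates
+     return max(scored, key=lambda s: s[0])[1]
+ ```
+ 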
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ ShapeFormer: Shape Prior Visible-to-Amodal Transformer-based Amodal + Instance Segmentation IJCNN 2024 + + +
+ Amodal Instance Segmentation (AIS) presents a challenging task as it involves predicting both visible and occluded parts of objects within images. Existing AIS methods rely on a bidirectional approach, encompassing both the transition from amodal features to visible features (amodal-to-visible) and from visible features to amodal features (visible-to-amodal). We observe that using amodal features in the amodal-to-visible transition can contaminate the visible features with extra information about occluded/hidden segments that is not present in the visible region. Consequently, this compromises the quality of the visible features during the subsequent visible-to-amodal transition. To tackle this issue, we introduce ShapeFormer, a decoupled Transformer-based model with a visible-to-amodal transition. It facilitates the explicit relationship between output segmentations and avoids the need for amodal-to-visible transitions. ShapeFormer comprises three key modules: (i) a Visible-Occluding Mask Head for predicting visible segmentation with occlusion awareness, (ii) a Shape-Prior Amodal Mask Head for predicting amodal and occluded masks, and (iii) a Category-Specific Shape Prior Retriever that provides shape prior knowledge. Comprehensive experiments and extensive ablation studies across various AIS benchmarks demonstrate the effectiveness of our ShapeFormer. The code is available at: https://github.com/UARK-AICV/ShapeFormer + +
+
+ comment: Accepted to IJCNN 2024 +
+
+
+
+
+ + ☆ Path-GPTOmic: A Balanced Multi-modal Learning Framework for Survival + Outcome Prediction + + +
+ For predicting cancer survival outcomes, standard approaches in clinical research are often based on two main modalities: pathology images for observing cell morphology features, and genomic data (e.g., bulk RNA-seq) for quantifying gene expression. However, existing pathology-genomic multi-modal algorithms face significant challenges: (1) valuable biological insights regarding genes and gene-gene interactions are frequently overlooked; (2) one modality often dominates the optimization process, causing inadequate training for the other modality. In this paper, we introduce a new multi-modal ``Path-GPTOmic" framework for cancer survival outcome prediction. First, to extract valuable biological insights, we regulate the embedding space of a foundation model, scGPT, initially trained on single-cell RNA-seq data, making it adaptable for bulk RNA-seq data. Second, to address the imbalance-between-modalities problem, we propose a gradient modulation mechanism tailored to the Cox partial likelihood loss for survival prediction. The contributions of the modalities are dynamically monitored and adjusted during the training process, ensuring that both modalities are sufficiently trained. Evaluated on two TCGA (The Cancer Genome Atlas) datasets, our model achieves substantially improved survival prediction accuracy. + +
+
+ comment: Accepted by IEEE International Symposium on Biomedical Imaging (ISBI + 2024) +
+
+
+
+
+ + ☆ GT-Rain Single Image Deraining Challenge Report + + +
+ This report reviews the results of the GT-Rain challenge on single image deraining at the UG2+ workshop at CVPR 2023. The aim of this competition is to study the rainy weather phenomenon in real-world scenarios, to provide a novel real-world rainy image dataset, and to spark innovative ideas that will further the development of single image deraining methods on real images. Submissions were trained on the GT-Rain dataset and evaluated on an extension of the dataset consisting of 15 additional scenes. Scenes in GT-Rain comprise real rainy images and a ground-truth image captured moments after the rain had stopped. 275 participants were registered in the challenge and 55 competed in the final testing phase. + +
+
+
+
+
+ + ☆ Removing Undesirable Concepts in Text-to-Image Generative Models with + Learnable Prompts + + +
+ Generative models have demonstrated remarkable potential in generating visually impressive content from textual descriptions. However, training these models on unfiltered internet data poses the risk of learning and subsequently propagating undesirable concepts, such as copyrighted or unethical content. In this paper, we propose a novel method to remove undesirable concepts from text-to-image generative models by incorporating a learnable prompt into the cross-attention module. This learnable prompt acts as additional memory into which the knowledge of undesirable concepts is transferred, reducing the dependency of these concepts on the model parameters and corresponding textual inputs. Because of this knowledge transfer into the prompt, erasing these undesirable concepts is more stable and has minimal negative impact on other concepts. We demonstrate the effectiveness of our method on the Stable Diffusion model, showcasing its superiority over state-of-the-art erasure methods in terms of removing undesirable content while preserving other unrelated elements. + +
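+ 
+ One plausible way to realize such a learnable prompt is sketched below: a frozen cross-attention layer whose keys and values are extended with trainable prompt tokens, so that only the prompt absorbs the undesirable concept during erasure. The module is a simplified stand-in (a plain `nn.MultiheadAttention` instead of a diffusion UNet layer), not the paper's implementation.
+ ```python
+ import torch
+ import torch.nn as nn
+ 
+ class PromptedCrossAttention(nn.Module):
+     """Cross-attention whose keys/values are extended with learnable prompt
+     tokens; only the prompt is trained so it can absorb the knowledge of an
+     undesirable concept. A simplified stand-in, not the paper's code."""
+ 
+     def __init__(self, dim: int = 768, n_heads: int = 8, n_prompt: int = 8):
+         super().__init__()
+         # Stand-in for a frozen cross-attention layer of the diffusion UNet.
+         self.attn = nn.MultiheadAttention(dim, n_heads, batch_first=True)
+         for p in self.attn.parameters():
+             p.requires_grad_(False)
+         # The learnable prompt acting as extra memory.
+         self.prompt = nn.Parameter(0.02 * torch.randn(n_prompt, dim))
+ 
+     def forward(self, hidden_states, text_embeds):
+         # hidden_states: (B, L_img, dim); text_embeds: (B, L_txt, dim)
+         prompt = self.prompt.unsqueeze(0).expand(text_embeds.size(0), -1, -1)
+         context = torch.cat([text_embeds, prompt], dim=1)   # extend keys/values
+         out, _ = self.attn(hidden_states, context, context, need_weights=False)
+         return out
+ ```
+ 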
+
+
+
+
+ + ☆ EffiPerception: an Efficient Framework for Various Perception Tasks + + +
+ The accuracy-speed-memory trade-off is a primary consideration in many computer vision perception tasks. Previous methods mainly focus on a single task or a small set of tasks, for example by designing effective data augmentation, feature extractors, or learning strategies. These approaches, however, can be inherently task-specific: their proposed model's performance may depend on a specific perception task or dataset. Aiming to explore common learning patterns and increase module robustness, we propose the EffiPerception framework. It achieves strong accuracy-speed performance with relatively low memory cost on several perception tasks: 2D Object Detection, 3D Object Detection, 2D Instance Segmentation, and 3D Point Cloud Segmentation. Overall, the framework consists of three parts: (1) Efficient Feature Extractors, which extract the input features for each modality; (2) Efficient Layers, plug-in plug-out layers that further process the feature representation, aggregating core learned information while pruning noisy proposals; and (3) EffiOptim, an 8-bit optimizer that further cuts down the computational cost and facilitates performance stability. Extensive experiments on the KITTI, semantic-KITTI, and COCO datasets show that EffiPerception delivers a strong overall accuracy-speed-memory improvement across the four detection and segmentation tasks compared to earlier, well-established methods. + +
+
+
+
+
+ + ☆ Prototype of an Automatic Bidirectional People Counter Based on 3D Vision Sensors + + +
+ 3D sensors, also known as RGB-D sensors, utilize depth images where each +pixel measures the distance from the camera to objects, using principles like +structured light or time-of-flight. Advances in artificial vision have led to +affordable 3D cameras capable of real-time object detection without object +movement, surpassing 2D cameras in information depth. These cameras can +identify objects of varying colors and reflectivities and are less affected by +lighting changes. The described prototype uses RGB-D sensors for bidirectional +people counting in venues, aiding security and surveillance in spaces like +stadiums or airports. It determines real-time occupancy and checks against +maximum capacity, crucial during emergencies. The system includes a RealSense +D415 depth camera and a mini-computer running object detection algorithms to +count people and a 2D camera for identity verification. The system supports +statistical analysis and uses C++, Python, and PHP with OpenCV for image +processing, demonstrating a comprehensive approach to monitoring venue +occupancy. + +
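+ 
+ The bidirectional counting logic can be illustrated with the small sketch below, which counts tracked person centroids crossing a virtual line in either direction to maintain an occupancy estimate. The function, its inputs, and the line convention are generic assumptions about such systems, not the prototype's actual code.
+ ```python
+ def update_counts(tracks, prev_positions, line_y, counts):
+     """Bidirectional line-crossing counter (a generic sketch).
+ 
+     `tracks` maps a track id to its current (x, y) centroid from the person
+     detector; crossing the virtual line `line_y` downward counts as an entry,
+     upward as an exit. Detection and tracking themselves are not shown.
+     """
+     for track_id, (x, y) in tracks.items():
+         prev = prev_positions.get(track_id)
+         if prev is not None:
+             _, prev_y = prev
+             if prev_y < line_y <= y:
+                 counts["in"] += 1       # crossed the line into the venue
+             elif prev_y >= line_y > y:
+                 counts["out"] += 1      # crossed the line out of the venue
+         prev_positions[track_id] = (x, y)
+     return counts["in"] - counts["out"]  # current occupancy estimate
+ 
+ # Example (hypothetical centroid for track id 1):
+ # counts, prev = {"in": 0, "out": 0}, {}
+ # occupancy = update_counts({1: (320, 250)}, prev, line_y=240, counts=counts)
+ ```
+ 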
+
+ comment: 8 pages, in Spanish language, 8 figures +
+
+
+
+
+ + ☆ R3DS: Reality-linked 3D Scenes for Panoramic Scene Understanding + + +
+ We introduce the Reality-linked 3D Scenes (R3DS) dataset of synthetic 3D +scenes mirroring the real-world scene arrangements from Matterport3D panoramas. +Compared to prior work, R3DS has more complete and densely populated scenes +with objects linked to real-world observations in panoramas. R3DS also provides +an object support hierarchy, and matching object sets (e.g., same chairs around +a dining table) for each scene. Overall, R3DS contains 19K objects represented +by 3,784 distinct CAD models from over 100 object categories. We demonstrate +the effectiveness of R3DS on the Panoramic Scene Understanding task. We find +that: 1) training on R3DS enables better generalization; 2) support relation +prediction trained with R3DS improves performance compared to heuristically +calculated support; and 3) R3DS offers a challenging benchmark for future work +on panoramic scene understanding. + +
+
+
+
+
+ + ☆ Estimation and Analysis of Slice Propagation Uncertainty in 3D Anatomy + Segmentation + + +
+ Supervised methods for 3D anatomy segmentation demonstrate superior performance but are often limited by the availability of annotated data. This limitation has led to a growing interest in self-supervised approaches in tandem with the abundance of available un-annotated data. Slice propagation has emerged as a self-supervised approach that leverages slice registration as a self-supervised task to achieve full anatomy segmentation with minimal supervision. This approach significantly reduces the need for domain expertise, time, and the cost associated with building fully annotated datasets required for training segmentation networks. However, this shift toward reduced supervision via deterministic networks raises concerns about the trustworthiness and reliability of predictions, especially when compared with more accurate supervised approaches. To address this concern, we propose the integration of calibrated uncertainty quantification (UQ) into slice propagation methods, providing insights into the model's predictive reliability and confidence levels. Incorporating uncertainty measures enhances user confidence in self-supervised approaches, thereby improving their practical applicability. We conducted experiments on three datasets for 3D abdominal segmentation using five UQ methods. The results illustrate that incorporating UQ improves not only model trustworthiness, but also segmentation accuracy. Furthermore, our analysis reveals various failure modes of slice propagation methods that might not be immediately apparent to end-users. This study opens up new research avenues to improve the accuracy and trustworthiness of slice propagation methods. + +
+
+ comment: 13 pages including Supplementary, 4 figures +
+
+
+
+
+ + ☆ Data-Efficient Contrastive Language-Image Pretraining: Prioritizing Data + Quality over Quantity AISTATS 2024 + + +
+ Contrastive Language-Image Pre-training (CLIP) on large-scale image-caption datasets learns representations that can achieve remarkable zero-shot generalization. However, such models require a massive amount of pre-training data. Improving the quality of the pre-training data has been shown to be much more effective in improving CLIP's performance than increasing its volume. Nevertheless, finding small subsets of training data that provably generalize the best has remained an open question. In this work, we propose the first theoretically rigorous data selection method for CLIP. We show that subsets that closely preserve the cross-covariance of the images and captions of the full data provably achieve a superior generalization performance. Our extensive experiments on ConceptualCaptions3M and ConceptualCaptions12M demonstrate that subsets found by our method achieve over 2.7x and 1.4x the accuracy of the next best baseline on ImageNet and its shifted versions. Moreover, we show that our subsets obtain 1.5x the average accuracy of the next best baseline across 11 downstream datasets. The code is available at: https://github.com/BigML-CS-UCLA/clipcov-data-efficient-clip. + +
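+ 
+ The selection criterion above can be illustrated with a simplified greedy procedure that keeps the subset's image-caption cross-covariance close to that of the full dataset. This brute-force sketch only conveys the criterion; it is not the paper's provably near-optimal algorithm and would be far too slow at ConceptualCaptions scale.
+ ```python
+ import numpy as np
+ 
+ def cross_covariance(img_feats, txt_feats):
+     """Cross-covariance between centered image and caption embeddings."""
+     img_c = img_feats - img_feats.mean(axis=0)
+     txt_c = txt_feats - txt_feats.mean(axis=0)
+     return img_c.T @ txt_c / len(img_feats)
+ 
+ def greedy_subset(img_feats, txt_feats, k):
+     """Greedily pick k pairs whose cross-covariance stays close (in Frobenius
+     norm) to that of the full data. An illustrative, naive selection loop."""
+     target = cross_covariance(img_feats, txt_feats)
+     selected, remaining = [], list(range(len(img_feats)))
+     for _ in range(k):
+         best, best_err = None, np.inf
+         for i in remaining:
+             idx = selected + [i]
+             err = np.linalg.norm(
+                 cross_covariance(img_feats[idx], txt_feats[idx]) - target)
+             if err < best_err:
+                 best, best_err = i, err
+         selected.append(best)
+         remaining.remove(best)
+     return selected
+ ```
+ 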
+
+ comment: AISTATS 2024, Code: + https://github.com/BigML-CS-UCLA/clipcov-data-efficient-clip +
+
+
+
+
+ + ☆ Improving Generalization via Meta-Learning on Hard Samples CVPR 2024 + + +
+ Learned reweighting (LRW) approaches to supervised learning use an +optimization criterion to assign weights for training instances, in order to +maximize performance on a representative validation dataset. We pose and +formalize the problem of optimized selection of the validation set used in LRW +training, to improve classifier generalization. In particular, we show that +using hard-to-classify instances in the validation set has both a theoretical +connection to, and strong empirical evidence of generalization. We provide an +efficient algorithm for training this meta-optimized model, as well as a simple +train-twice heuristic for careful comparative study. We demonstrate that LRW +with easy validation data performs consistently worse than LRW with hard +validation data, establishing the validity of our meta-optimization problem. +Our proposed algorithm outperforms a wide range of baselines on a range of +datasets and domain shift challenges (Imagenet-1K, CIFAR-100, Clothing-1M, +CAMELYON, WILDS, etc.), with ~1% gains using VIT-B on Imagenet. We also show +that using naturally hard examples for validation (Imagenet-R / Imagenet-A) in +LRW training for Imagenet improves performance on both clean and naturally hard +test instances by 1-2%. Secondary analyses show that using hard validation data +in an LRW framework improves margins on test data, hinting at the mechanism +underlying our empirical gains. We believe this work opens up new research +directions for the meta-optimization of meta-learning in a supervised learning +context. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ Fusion Transformer with Object Mask Guidance for Image Forgery Analysis + + +
+ In this work, we introduce OMG-Fuser, a fusion transformer-based network +designed to extract information from various forensic signals to enable robust +image forgery detection and localization. Our approach can operate with an +arbitrary number of forensic signals and leverages object information for their +analysis -- unlike previous methods that rely on fusion schemes with few +signals and often disregard image semantics. To this end, we design a forensic +signal stream composed of a transformer guided by an object attention +mechanism, associating patches that depict the same objects. In that way, we +incorporate object-level information from the image. Each forensic signal is +processed by a different stream that adapts to its peculiarities. Subsequently, +a token fusion transformer efficiently aggregates the outputs of an arbitrary +number of network streams and generates a fused representation for each image +patch. These representations are finally processed by a long-range dependencies +transformer that captures the intrinsic relations between the image patches. We +assess two fusion variants on top of the proposed approach: (i) score-level +fusion that fuses the outputs of multiple image forensics algorithms and (ii) +feature-level fusion that fuses low-level forensic traces directly. Both +variants exceed state-of-the-art performance on seven datasets for image +forgery detection and localization, with a relative average improvement of +12.1% and 20.4% in terms of F1. Our network demonstrates robustness against +traditional and novel forgery attacks and can be expanded with new signals +without training from scratch. + +
+
+
+
+
+ + ☆ Large-scale flood modeling and forecasting with FloodCast + + +
+ Large-scale hydrodynamic models generally rely on fixed-resolution spatial grids and model parameters and incur a high computational cost. This limits their ability to accurately forecast flood crests and issue time-critical hazard warnings. In this work, we build a fast, stable, accurate, resolution-invariant, and geometry-adaptive flood modeling and forecasting framework that can perform at large scales, namely FloodCast. The framework comprises two main modules: multi-satellite observation and hydrodynamic modeling. In the multi-satellite observation module, a real-time unsupervised change detection method and a rainfall processing and analysis tool are proposed to harness the full potential of multi-satellite observations in large-scale flood prediction. In the hydrodynamic modeling module, a geometry-adaptive physics-informed neural solver (GeoPINS) is introduced, which benefits from the fact that physics-informed neural networks require no training data and features a fast, accurate, and resolution-invariant architecture with Fourier neural operators. GeoPINS demonstrates impressive performance on popular PDEs across regular and irregular domains. Building upon GeoPINS, we propose a sequence-to-sequence GeoPINS model to handle long-term temporal series and extensive spatial domains in large-scale flood modeling. Next, we establish a benchmark dataset for the 2022 Pakistan flood to assess various flood prediction methods. Finally, we validate the model in three dimensions - flood inundation range, depth, and transferability of spatiotemporal downscaling. Traditional hydrodynamics and sequence-to-sequence GeoPINS exhibit exceptional agreement during high water levels, while comparative assessments with SAR-based flood depth data show that sequence-to-sequence GeoPINS outperforms traditional hydrodynamics, with smaller prediction errors. + +
+
+ comment: 40 pages, 16 figures, under review +
+
+
+
+
+ + ☆ A Unified Model for Longitudinal Multi-Modal Multi-View Prediction with + Missingness + + +
+ Medical records often consist of different modalities, such as images, text, +and tabular information. Integrating all modalities offers a holistic view of a +patient's condition, while analyzing them longitudinally provides a better +understanding of disease progression. However, real-world longitudinal medical +records present challenges: 1) patients may lack some or all of the data for a +specific timepoint, and 2) certain modalities or views might be absent for all +patients during a particular period. In this work, we introduce a unified model +for longitudinal multi-modal multi-view (MMMV) prediction with missingness. Our +method allows as many timepoints as desired for input, and aims to leverage all +available data, regardless of their availability. We conduct extensive +experiments on the knee osteoarthritis dataset from the Osteoarthritis +Initiative (OAI) for pain and Kellgren-Lawrence grade (KLG) prediction at a +future timepoint. We demonstrate the effectiveness of our method by comparing +results from our unified model to specific models that use the same modality +and view combinations during training and evaluation. We also show the benefit +of having extended temporal data and provide post-hoc analysis for a deeper +understanding of each modality/view's importance for different tasks. + +
+
+
+
+
+ + ☆ Synthetic Image Generation in Cyber Influence Operations: An Emergent + Threat? + + +
+ The evolution of artificial intelligence (AI) has catalyzed a transformation +in digital content generation, with profound implications for cyber influence +operations. This report delves into the potential and limitations of generative +deep learning models, such as diffusion models, in fabricating convincing +synthetic images. We critically assess the accessibility, practicality, and +output quality of these tools and their implications in threat scenarios of +deception, influence, and subversion. Notably, the report generates content for +several hypothetical cyber influence operations to demonstrate the current +capabilities and limitations of these AI-driven methods for threat actors. +While generative models excel at producing illustrations and non-realistic +imagery, creating convincing photo-realistic content remains a significant +challenge, limited by computational resources and the necessity for +human-guided refinement. Our exploration underscores the delicate balance +between technological advancement and its potential for misuse, prompting +recommendations for ongoing research, defense mechanisms, multi-disciplinary +collaboration, and policy development. These recommendations aim to leverage +AI's potential for positive impact while safeguarding against its risks to the +integrity of information, especially in the context of cyber influence. + +
+
+ comment: 44 pages, 56 figures +
+
+
+
+
+ + ☆ Bootstrapping Reinforcement Learning with Imitation for Vision-Based + Agile Flight + + +
+ We combine the effectiveness of Reinforcement Learning (RL) and the efficiency of Imitation Learning (IL) in the context of vision-based, autonomous drone racing. We focus on directly processing visual input without explicit state estimation. While RL offers a general framework for learning complex controllers through trial and error, it faces challenges regarding sample efficiency and computational demands due to the high dimensionality of visual inputs. Conversely, IL demonstrates efficiency in learning from visual demonstrations but is limited by the quality of those demonstrations and faces issues like covariate shift. To overcome these limitations, we propose a novel training framework combining the advantages of RL and IL. Our framework involves three stages: initial training of a teacher policy using privileged state information, distilling this policy into a student policy using IL, and performance-constrained adaptive RL fine-tuning. Our experiments in both simulated and real-world environments demonstrate that our approach achieves superior performance and robustness compared to IL or RL alone in navigating a quadrotor through a racing course using only visual information without explicit state estimation. + +
+
+
+
+
+ + ☆ DeCoTR: Enhancing Depth Completion with 2D and 3D Attentions CVPR 2024 + + +
+ In this paper, we introduce a novel approach that harnesses both 2D and 3D +attentions to enable highly accurate depth completion without requiring +iterative spatial propagations. Specifically, we first enhance a baseline +convolutional depth completion model by applying attention to 2D features in +the bottleneck and skip connections. This effectively improves the performance +of this simple network and sets it on par with the latest, complex +transformer-based models. Leveraging the initial depths and features from this +network, we uplift the 2D features to form a 3D point cloud and construct a 3D +point transformer to process it, allowing the model to explicitly learn and +exploit 3D geometric features. In addition, we propose normalization techniques +to process the point cloud, which improves learning and leads to better +accuracy than directly using point transformers off the shelf. Furthermore, we +incorporate global attention on downsampled point cloud features, which enables +long-range context while still being computationally feasible. We evaluate our +method, DeCoTR, on established depth completion benchmarks, including NYU Depth +V2 and KITTI, showcasing that it sets new state-of-the-art performance. We +further conduct zero-shot evaluations on ScanNet and DDAD benchmarks and +demonstrate that DeCoTR has superior generalizability compared to existing +approaches. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ☆ FLex: Joint Pose and Dynamic Radiance Fields Optimization for Stereo + Endoscopic Videos + + +
+ Reconstruction of endoscopic scenes is an important asset for various medical applications, from post-surgery analysis to educational training. Neural rendering has recently shown promising results in endoscopic reconstruction with deforming tissue. However, the setup has been restricted to a static endoscope, limited deformation, or required an external tracking device to retrieve camera pose information of the endoscopic camera. With FLex we address the challenging setup of a moving endoscope within a highly dynamic environment of deforming tissue. We propose an implicit scene separation into multiple overlapping 4D neural radiance fields (NeRFs) and a progressive optimization scheme jointly optimizing for reconstruction and camera poses from scratch. This improves ease of use and allows reconstruction capabilities to scale in time to surgical videos of 5,000 frames and more, an improvement of more than ten times compared to the state of the art, while being agnostic to external tracking information. Extensive evaluations on the StereoMIS dataset show that FLex significantly improves the quality of novel view synthesis while maintaining competitive pose accuracy. + +
+
+
+
+
+ + ☆ E2F-Net: Eyes-to-Face Inpainting via StyleGAN Latent Space + + +
+ Face inpainting, the technique of restoring missing or damaged regions in facial images, is pivotal for applications like face recognition in occluded scenarios and image analysis with poor-quality captures. This process not only needs to produce realistic visuals but also preserve individual identity characteristics. The aim of this paper is to inpaint a face given the periocular region (eyes-to-face) through a proposed new Generative Adversarial Network (GAN)-based model called Eyes-to-Face Network (E2F-Net). The proposed approach extracts identity and non-identity features from the periocular region using two dedicated encoders. The extracted features are then mapped to the latent space of a pre-trained StyleGAN generator to benefit from its state-of-the-art performance and its rich, diverse and expressive latent space without any additional training. We further improve the StyleGAN output to find the optimal code in the latent space using a new optimization technique for GAN inversion. Our E2F-Net requires minimal training, reducing the computational complexity as a secondary benefit. Through extensive experiments, we show that our method successfully reconstructs the whole face with high quality, surpassing current techniques despite significantly less training and supervision effort. We have generated seven eyes-to-face datasets based on well-known public face datasets for training and verifying our proposed methods. The code and datasets are publicly available. + +
+
+
+
+
+ + ☆ The POLAR Traverse Dataset: A Dataset of Stereo Camera Images Simulating + Traverses across Lunar Polar Terrain under Extreme Lighting Conditions + + +
+ We present the POLAR Traverse Dataset: a dataset of high-fidelity stereo pair +images of lunar-like terrain under polar lighting conditions designed to +simulate a straight-line traverse. Images from individual traverses with +different camera heights and pitches were recorded at 1 m intervals by moving a +suspended stereo bar across a test bed filled with regolith simulant and shaped +to mimic lunar south polar terrain. Ground truth geometry and camera position +information was also recorded. This dataset is intended for developing and +testing software algorithms that rely on stereo or monocular camera images, +such as visual odometry, for use in the lunar polar environment, as well as to +provide insight into the expected lighting conditions in lunar polar regions. + +
+
+ comment: 6 pages, 5 figures, 3 tables. Associated dataset can be found at + https://ti.arc.nasa.gov/dataset/PolarTrav/ +
+
+
+
+
+ + ☆ Graph-Jigsaw Conditioned Diffusion Model for Skeleton-based Video + Anomaly Detection + + +
+ Skeleton-based video anomaly detection (SVAD) is a crucial task in computer +vision. Accurately identifying abnormal patterns or events enables operators to +promptly detect suspicious activities, thereby enhancing safety. Achieving this +demands a comprehensive understanding of human motions, both at body and region +levels, while also accounting for the wide variations of performing a single +action. However, existing studies fail to simultaneously address these crucial +properties. This paper introduces a novel, practical and lightweight framework, +namely Graph-Jigsaw Conditioned Diffusion Model for Skeleton-based Video +Anomaly Detection (GiCiSAD) to overcome the challenges associated with SVAD. +GiCiSAD consists of three novel modules: the Graph Attention-based Forecasting +module to capture the spatio-temporal dependencies inherent in the data, the +Graph-level Jigsaw Puzzle Maker module to distinguish subtle region-level +discrepancies between normal and abnormal motions, and the Graph-based +Conditional Diffusion model to generate a wide spectrum of human motions. +Extensive experiments on four widely used skeleton-based video datasets show +that GiCiSAD outperforms existing methods with significantly fewer training +parameters, establishing it as the new state-of-the-art. + +
+
+ comment: 18 pages, 2 figures, 6 tables +
+
+
+
+
+ + ☆ Generalizing deep learning models for medical image classification + + +
+ Numerous Deep Learning (DL) models have been developed for a large spectrum of medical image analysis applications, which promises to reshape various facets of medical practice. Despite early advances in DL model validation and implementation, which encourage healthcare institutions to adopt them, some fundamental questions remain: are DL models capable of generalizing? What causes a drop in DL model performance? How can such a performance drop be overcome? Medical data are dynamic and prone to domain shift; multiple factors, such as updates to medical equipment, new imaging workflows, and shifts in patient demographics or populations, can induce this drift over time. In this paper, we review recent developments in generalization methods for DL-based classification models. We also discuss future challenges, including the need for improved evaluation protocols and benchmarks, and envisioned future developments to achieve robust, generalized models for medical image classification. + +
+
+
+
+
+ + ☆ ThermoNeRF: Multimodal Neural Radiance Fields for Thermal Novel View + Synthesis + + +
+ Thermal scene reconstruction exhibits great potential for applications across a broad spectrum of fields, including building energy consumption analysis and non-destructive testing. However, existing methods typically require dense scene measurements and often rely on RGB images for 3D geometry reconstruction, with thermal information being projected post-reconstruction. This two-step strategy, adopted due to the lack of texture in thermal images, can lead to disparities between the geometry and temperatures of the reconstructed objects and those of the actual scene. To address this challenge, we propose ThermoNeRF, a novel multimodal approach based on Neural Radiance Fields, capable of rendering new RGB and thermal views of a scene jointly. To overcome the lack of texture in thermal images, we use paired RGB and thermal images to learn scene density, while distinct networks estimate color and temperature information. Furthermore, we introduce ThermoScenes, a new dataset to mitigate the lack of available RGB+thermal datasets for scene reconstruction. Experimental results validate that ThermoNeRF achieves accurate thermal image synthesis, with an average mean absolute error of 1.5$^\circ$C, an improvement of over 50% compared to using concatenated RGB+thermal data with Nerfacto, a state-of-the-art NeRF method. + +
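+ 
+ A minimal sketch of the multimodal field idea is shown below: a shared trunk predicts density for both modalities while separate heads output color and temperature. Positional encoding, view directions, and the Nerfacto machinery are omitted, and the heads here stand in for the paper's distinct networks; this is not ThermoNeRF's actual architecture.
+ ```python
+ import torch
+ import torch.nn as nn
+ 
+ class ThermoField(nn.Module):
+     """Shared density trunk with separate RGB and temperature heads (a sketch)."""
+ 
+     def __init__(self, in_dim: int = 3, hidden: int = 128):
+         super().__init__()
+         self.trunk = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU(),
+                                    nn.Linear(hidden, hidden), nn.ReLU())
+         self.density_head = nn.Linear(hidden, 1)
+         self.rgb_head = nn.Sequential(nn.Linear(hidden, hidden), nn.ReLU(),
+                                       nn.Linear(hidden, 3), nn.Sigmoid())
+         self.temp_head = nn.Sequential(nn.Linear(hidden, hidden), nn.ReLU(),
+                                        nn.Linear(hidden, 1))
+ 
+     def forward(self, xyz):
+         h = self.trunk(xyz)
+         sigma = torch.relu(self.density_head(h))  # shared geometry/density
+         rgb = self.rgb_head(h)                    # supervised by RGB images
+         temp = self.temp_head(h)                  # supervised by thermal images
+         return sigma, rgb, temp
+ ```
+ 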
+
+
+
+
+ + ☆ Development of Automated Neural Network Prediction for Echocardiographic + Left ventricular Ejection Fraction + + +
+ The echocardiographic measurement of left ventricular ejection fraction +(LVEF) is fundamental to the diagnosis and classification of patients with +heart failure (HF). In order to quantify LVEF automatically and accurately, +this paper proposes a new pipeline method based on deep neural networks and +ensemble learning. Within the pipeline, an Atrous Convolutional Neural Network +(ACNN) was first trained to segment the left ventricle (LV), before employing +the area-length formulation based on the ellipsoid single-plane model to +calculate LVEF values. This formulation required inputs of LV area, derived +from segmentation using an improved Jeffrey's method, as well as LV length, +derived from a novel ensemble learning model. To further improve the pipeline's +accuracy, an automated peak detection algorithm was used to identify +end-diastolic and end-systolic frames, avoiding issues with human error. +Subsequently, single-beat LVEF values were averaged across all cardiac cycles +to obtain the final LVEF. This method was developed and internally validated in +an open-source dataset containing 10,030 echocardiograms. The Pearson's +correlation coefficient was 0.83 for LVEF prediction compared to expert human +analysis (p<0.001), with a subsequent area under the receiver operator curve +(AUROC) of 0.98 (95% confidence interval 0.97 to 0.99) for categorisation of HF +with reduced ejection (HFrEF; LVEF<40%). In an external dataset with 200 +echocardiograms, this method achieved an AUC of 0.90 (95% confidence interval +0.88 to 0.91) for HFrEF assessment. This study demonstrates that an automated +neural network-based calculation of LVEF is comparable to expert clinicians +performing time-consuming, frame-by-frame manual evaluation of cardiac systolic +function. + +
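+ 
+ Downstream of segmentation, the single-plane area-length (ellipsoid) formulation referenced above computes volume as $V = 8A^2 / (3\pi L)$, and LVEF follows from the end-diastolic and end-systolic volumes. The sketch below shows that arithmetic with assumed example measurements; the segmentation, length estimation, and peak-detection steps of the pipeline are not shown.
+ ```python
+ import math
+ 
+ def area_length_volume(area_cm2: float, length_cm: float) -> float:
+     """Single-plane area-length (ellipsoid) LV volume: V = 8 * A^2 / (3 * pi * L)."""
+     return 8.0 * area_cm2 ** 2 / (3.0 * math.pi * length_cm)
+ 
+ def ejection_fraction(ed_area, ed_length, es_area, es_length) -> float:
+     """LVEF (%) from end-diastolic and end-systolic area/length measurements."""
+     edv = area_length_volume(ed_area, ed_length)   # end-diastolic volume (mL)
+     esv = area_length_volume(es_area, es_length)   # end-systolic volume (mL)
+     return 100.0 * (edv - esv) / edv
+ 
+ # Example with assumed measurements (areas in cm^2, lengths in cm): roughly 60% EF.
+ # print(ejection_fraction(ed_area=35.0, ed_length=8.5, es_area=20.0, es_length=7.0))
+ ```
+ 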
+
+ comment: Accepted to Frontiers in Medicine +
+
+
+
+
+ + ☆ Fusing Domain-Specific Content from Large Language Models into Knowledge + Graphs for Enhanced Zero Shot Object State Classification AAAI + + +
+ Domain-specific knowledge can significantly contribute to addressing a wide +variety of vision tasks. However, the generation of such knowledge entails +considerable human labor and time costs. This study investigates the potential +of Large Language Models (LLMs) in generating and providing domain-specific +information through semantic embeddings. To achieve this, an LLM is integrated +into a pipeline that utilizes Knowledge Graphs and pre-trained semantic vectors +in the context of the Vision-based Zero-shot Object State Classification task. +We thoroughly examine the behavior of the LLM through an extensive ablation +study. Our findings reveal that the integration of LLM-based embeddings, in +combination with general-purpose pre-trained embeddings, leads to substantial +performance improvements. Drawing insights from this ablation study, we conduct +a comparative analysis against competing models, thereby highlighting the +state-of-the-art performance achieved by the proposed approach. + +
+
+ comment: Accepted at the AAAI-MAKE 24 +
+
+
+
+
+ + ☆ Exploring Pre-trained Text-to-Video Diffusion Models for Referring Video + Object Segmentation + + +
+ In this paper, we explore the visual representations produced by a pre-trained text-to-video (T2V) diffusion model for video understanding tasks. We hypothesize that the latent representation learned from a pretrained generative T2V model encapsulates rich semantics and coherent temporal correspondences, thereby naturally facilitating video understanding. Our hypothesis is validated through the classic referring video object segmentation (R-VOS) task. We introduce a novel framework, termed ``VD-IT'', tailored with dedicatedly designed components built upon a fixed pretrained T2V model. Specifically, VD-IT uses textual information as a conditional input, ensuring semantic consistency across time for precise temporal instance matching. It further incorporates image tokens as supplementary textual inputs, enriching the feature set to generate detailed and nuanced masks. Besides, instead of using the standard Gaussian noise, we propose to predict the video-specific noise with an extra noise prediction module, which helps preserve the feature fidelity and elevates segmentation quality. Through extensive experiments, we surprisingly observe that fixed generative T2V diffusion models, unlike commonly used video backbones (e.g., Video Swin Transformer) pretrained with discriminative image/video pre-tasks, exhibit better potential to maintain semantic alignment and temporal consistency. On existing standard benchmarks, our VD-IT achieves highly competitive results, surpassing many existing state-of-the-art methods. The code will be available at \url{https://github.com/buxiangzhiren/VD-IT} + +
+
+ comment: The code will be available at + \url{https://github.com/buxiangzhiren/VD-IT} +
+
+
+
+
+ + ☆ Distilling Datasets Into Less Than One Image + + +
+ Dataset distillation aims to compress a dataset into a much smaller one so that a model trained on the distilled dataset achieves high accuracy. Current methods frame this as maximizing the distilled classification accuracy for a budget of K distilled images-per-class, where K is a positive integer. In this paper, we push the boundaries of dataset distillation, compressing the dataset into less than an image-per-class. It is important to realize that the meaningful quantity is not the number of distilled images-per-class but the number of distilled pixels-per-dataset. We therefore propose Poster Dataset Distillation (PoDD), a new approach that distills the entire original dataset into a single poster. The poster approach motivates new technical solutions for creating training images and learnable labels. Our method can achieve comparable or better performance with less than an image-per-class compared to existing methods that use one image-per-class. Specifically, our method establishes a new state-of-the-art performance on CIFAR-10, CIFAR-100, and CUB200 using as little as 0.3 images-per-class. + +
+
+
+
+
+ + ☆ Zero-Shot Image Feature Consensus with Deep Functional Maps + + +
+ Correspondences emerge from large-scale vision models trained for generative +and discriminative tasks. This has been revealed and benchmarked by computing +correspondence maps between pairs of images, using nearest neighbors on the +feature grids. Existing work has attempted to improve the quality of these +correspondence maps by carefully mixing features from different sources, such +as by combining the features of different layers or networks. We point out that +a better correspondence strategy is available, which directly imposes structure +on the correspondence field: the functional map. Wielding this simple +mathematical tool, we lift the correspondence problem from the pixel space to +the function space and directly optimize for mappings that are globally +coherent. We demonstrate that our technique yields correspondences that are not +only smoother but also more accurate, with the possibility of better reflecting +the knowledge embedded in the large-scale vision models that we are studying. +Our approach sets a new state-of-the-art on various dense correspondence tasks. +We also demonstrate our effectiveness in keypoint correspondence and affordance +map transfer. + +
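+ 
+ The functional-map step can be sketched as a small least-squares problem: project deep features of each image into a reduced basis and solve for the linear map between the two coefficient spaces. The basis construction and the paper's regularizers are omitted; this is an illustration of the mathematical tool, not the authors' pipeline.
+ ```python
+ import numpy as np
+ 
+ def fit_functional_map(desc_a, desc_b, basis_a, basis_b):
+     """Least-squares functional map between two images' feature grids.
+ 
+     desc_*: (n_pixels, d) deep features; basis_*: (n_pixels, k) reduced bases
+     (assumed given, e.g. leading eigenvectors of a feature-graph Laplacian).
+     Solves C = argmin ||C A - B||_F for the k x k map C.
+     """
+     A = basis_a.T @ desc_a                    # (k, d) descriptor coefficients
+     B = basis_b.T @ desc_b
+     C_t, *_ = np.linalg.lstsq(A.T, B.T, rcond=None)   # solves A^T C^T = B^T
+     return C_t.T                              # (k, k) functional map
+ 
+ def transfer(values_on_a, basis_a, basis_b, C):
+     """Map a function defined on image A's pixels to image B via C."""
+     coeffs = basis_a.T @ values_on_a          # project into A's basis
+     return basis_b @ (C @ coeffs)             # map and reconstruct on B
+ ```
+ 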
+
+
+
+
+ + ☆ One-Step Image Translation with Text-to-Image Models + + +
+ In this work, we address two limitations of existing conditional diffusion +models: their slow inference speed due to the iterative denoising process and +their reliance on paired data for model fine-tuning. To tackle these issues, we +introduce a general method for adapting a single-step diffusion model to new +tasks and domains through adversarial learning objectives. Specifically, we +consolidate various modules of the vanilla latent diffusion model into a single +end-to-end generator network with small trainable weights, enhancing its +ability to preserve the input image structure while reducing overfitting. We +demonstrate that, for unpaired settings, our model CycleGAN-Turbo outperforms +existing GAN-based and diffusion-based methods for various scene translation +tasks, such as day-to-night conversion and adding/removing weather effects like +fog, snow, and rain. We extend our method to paired settings, where our model +pix2pix-Turbo is on par with recent works like Control-Net for Sketch2Photo and +Edge2Image, but with a single-step inference. This work suggests that +single-step diffusion models can serve as strong backbones for a range of GAN +learning objectives. Our code and models are available at +https://github.com/GaParmar/img2img-turbo. + +
+
+ comment: Github: https://github.com/GaParmar/img2img-turbo +
+
+
+
+
+ + ☆ CoCoCo: Improving Text-Guided Video Inpainting for Better Consistency, + Controllability and Compatibility + + +
+ Recent advancements in video generation have been remarkable, yet many +existing methods struggle with issues of consistency and poor text-video +alignment. Moreover, the field lacks effective techniques for text-guided video +inpainting, a stark contrast to the well-explored domain of text-guided image +inpainting. To this end, this paper proposes a novel text-guided video +inpainting model that achieves better consistency, controllability and +compatibility. Specifically, we introduce a simple but efficient motion capture +module to preserve motion consistency, and design an instance-aware region +selection instead of a random region selection to obtain better textual +controllability, and utilize a novel strategy to inject some personalized +models into our CoCoCo model and thus obtain better model compatibility. +Extensive experiments show that our model can generate high-quality video +clips. Meanwhile, our model shows better motion consistency, textual +controllability and model compatibility. More details are shown in +[cococozibojia.github.io](cococozibojia.github.io). + +
+
+
+
+
+ + ☆ VFusion3D: Learning Scalable 3D Generative Models from Video Diffusion + Models + + +
+ This paper presents a novel paradigm for building scalable 3D generative +models utilizing pre-trained video diffusion models. The primary obstacle in +developing foundation 3D generative models is the limited availability of 3D +data. Unlike images, texts, or videos, 3D data are not readily accessible and +are difficult to acquire. This results in a significant disparity in scale +compared to the vast quantities of other types of data. To address this issue, +we propose using a video diffusion model, trained with extensive volumes of +text, images, and videos, as a knowledge source for 3D data. By unlocking its +multi-view generative capabilities through fine-tuning, we generate a +large-scale synthetic multi-view dataset to train a feed-forward 3D generative +model. The proposed model, VFusion3D, trained on nearly 3M synthetic multi-view +data, can generate a 3D asset from a single image in seconds and achieves +superior performance when compared to current SOTA feed-forward 3D generative +models, with users preferring our results over 70% of the time. + +
+
+ comment: Project page: https://junlinhan.github.io/projects/vfusion3d.html +
+
+
+
+
+ + ☆ HiKER-SGG: Hierarchical Knowledge Enhanced Robust Scene Graph Generation CVPR 2024 + + +
+ Being able to understand visual scenes is a precursor for many downstream tasks, including autonomous driving, robotics, and other vision-based approaches. A common approach enabling the ability to reason over visual data is Scene Graph Generation (SGG); however, many existing approaches assume undisturbed vision, i.e., the absence of real-world corruptions such as fog, snow, smoke, as well as non-uniform perturbations like sun glare or water drops. In this work, we propose a novel SGG benchmark containing procedurally generated weather corruptions and other transformations over the Visual Genome dataset. Further, we introduce a corresponding approach, Hierarchical Knowledge Enhanced Robust Scene Graph Generation (HiKER-SGG), providing a strong baseline for scene graph generation under such a challenging setting. At its core, HiKER-SGG utilizes a hierarchical knowledge graph in order to refine its predictions from coarse initial estimates to detailed predictions. In our extensive experiments, we show that HiKER-SGG not only demonstrates superior performance on corrupted images in a zero-shot manner, but also outperforms current state-of-the-art methods on uncorrupted SGG tasks. Code is available at https://github.com/zhangce01/HiKER-SGG. + +
+
+ comment: Accepted by CVPR 2024. Project page: + https://zhangce01.github.io/HiKER-SGG +
+
+
+
+
+ + ☆ Expandable Subspace Ensemble for Pre-Trained Model-Based + Class-Incremental Learning CVPR 2024 + + +
+ Class-Incremental Learning (CIL) requires a learning system to continually learn new classes without forgetting. Despite the strong performance of Pre-Trained Models (PTMs) in CIL, a critical issue persists: learning new classes often results in the overwriting of old ones. Excessive modification of the network causes forgetting, while minimal adjustments lead to an inadequate fit for new classes. As a result, it is desirable to find a way to update the model efficiently without harming former knowledge. In this paper, we propose ExpAndable Subspace Ensemble (EASE) for PTM-based CIL. To enable model updating without conflict, we train a distinct lightweight adapter module for each new task, aiming to create task-specific subspaces. These adapters span a high-dimensional feature space, enabling joint decision-making across multiple subspaces. As data evolves, the expanding subspaces render the old class classifiers incompatible with new-stage spaces. Correspondingly, we design a semantic-guided prototype complement strategy that synthesizes old classes' new features without using any old class instance. Extensive experiments on seven benchmark datasets verify EASE's state-of-the-art performance. Code is available at: https://github.com/sun-hailong/CVPR24-Ease + +
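+ 
+ A simplified sketch of the expandable-subspace idea is given below: a frozen pre-trained backbone, one lightweight adapter per task, concatenated adapter features, and nearest-prototype classification. The adapter form and dimensions are assumptions, and the semantic-guided prototype complement strategy is omitted, so this is an illustration rather than EASE's implementation.
+ ```python
+ import torch
+ import torch.nn as nn
+ 
+ class SubspaceEnsemble(nn.Module):
+     """Frozen pre-trained backbone plus one lightweight adapter per task.
+ 
+     Adapter outputs are concatenated into an expanding joint feature space
+     and classified against class prototypes (a simplified sketch).
+     """
+ 
+     def __init__(self, backbone: nn.Module, feat_dim: int, adapter_dim: int = 64):
+         super().__init__()
+         self.backbone = backbone
+         for p in self.backbone.parameters():    # keep the PTM frozen
+             p.requires_grad_(False)
+         self.adapters = nn.ModuleList()
+         self.feat_dim, self.adapter_dim = feat_dim, adapter_dim
+ 
+     def add_task(self):
+         # Each new task gets its own small adapter spanning a new subspace.
+         self.adapters.append(nn.Linear(self.feat_dim, self.adapter_dim))
+ 
+     def extract(self, x):
+         with torch.no_grad():
+             h = self.backbone(x)                # shared frozen features
+         return torch.cat([a(h) for a in self.adapters], dim=-1)
+ 
+     def classify(self, x, prototypes):
+         # prototypes: (n_classes, n_tasks * adapter_dim), assumed already
+         # complemented to the current joint dimension for old classes.
+         return torch.cdist(self.extract(x), prototypes).argmin(dim=1)
+ ```
+ 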
+
+ comment: Accepted to CVPR 2024. Code is available at: + https://github.com/sun-hailong/CVPR24-Ease +
+
+
+
+
+ + ☆ Align and Distill: Unifying and Improving Domain Adaptive Object + Detection + + +
+ Object detectors often perform poorly on data that differs from their +training set. Domain adaptive object detection (DAOD) methods have recently +demonstrated strong results on addressing this challenge. Unfortunately, we +identify systemic benchmarking pitfalls that call past results into question +and hamper further progress: (a) Overestimation of performance due to +underpowered baselines, (b) Inconsistent implementation practices preventing +transparent comparisons of methods, and (c) Lack of generality due to outdated +backbones and lack of diversity in benchmarks. We address these problems by +introducing: (1) A unified benchmarking and implementation framework, Align and +Distill (ALDI), enabling comparison of DAOD methods and supporting future +development, (2) A fair and modern training and evaluation protocol for DAOD +that addresses benchmarking pitfalls, (3) A new DAOD benchmark dataset, +CFC-DAOD, enabling evaluation on diverse real-world data, and (4) A new method, +ALDI++, that achieves state-of-the-art results by a large margin. ALDI++ +outperforms the previous state-of-the-art by +3.5 AP50 on Cityscapes to Foggy +Cityscapes, +5.7 AP50 on Sim10k to Cityscapes (where ours is the only method to +outperform a fair baseline), and +2.0 AP50 on CFC Kenai to Channel. Our +framework, dataset, and state-of-the-art method offer a critical reset for DAOD +and provide a strong foundation for future research. Code and data are +available: https://github.com/justinkay/aldi and +https://github.com/visipedia/caltech-fish-counting. + +
+
+ comment: 30 pages, 10 figures +
+
+
+
+
+ + ☆ Ultraman: Single Image 3D Human Reconstruction with Ultra Speed and + Detail + + +
+ 3D human body reconstruction has been a challenge in the field of computer vision. Previous methods are often time-consuming and struggle to capture the detailed appearance of the human body. In this paper, we propose a new method called \emph{Ultraman} for fast reconstruction of textured 3D human models from a single image. Compared to existing techniques, \emph{Ultraman} greatly improves the reconstruction speed and accuracy while preserving high-quality texture details. We present a new framework for human reconstruction consisting of three parts: geometric reconstruction, texture generation and texture mapping. Firstly, a mesh reconstruction framework is used, which accurately extracts 3D human shapes from a single image. At the same time, we propose a method to generate a multi-view consistent image of the human body based on a single image. This is finally combined with a novel texture mapping method to optimize texture details and ensure color consistency during reconstruction. Through extensive experiments and evaluations, we demonstrate the superior performance of \emph{Ultraman} on various standard datasets. In addition, \emph{Ultraman} outperforms state-of-the-art methods in terms of human rendering quality and speed. Upon acceptance of the article, we will make the code and data publicly available. + +
+
+ comment: Project Page: https://air-discover.github.io/Ultraman/ +
+
+
+
+
+ + ☆ From Pixels to Insights: A Survey on Automatic Chart Understanding in + the Era of Large Foundation Models + + +
+ Data visualization in the form of charts plays a pivotal role in data +analysis, offering critical insights and aiding in informed decision-making. +Automatic chart understanding has witnessed significant advancements with the +rise of large foundation models in recent years. Foundation models, such as +large language models (LLMs), have revolutionized various natural language +processing (NLP) tasks and are increasingly being applied to chart +understanding tasks. This survey paper provides a comprehensive overview of the +recent developments, challenges, and future directions in chart understanding +within the context of these foundation models. The paper begins by defining +chart understanding, outlining problem formulations, and discussing fundamental +building blocks crucial for studying chart understanding tasks. In the section +on tasks and datasets, we explore various tasks within chart understanding and +discuss their evaluation metrics and sources of both charts and textual inputs. +Modeling strategies are then examined, encompassing both classification-based +and generation-based approaches, along with tool augmentation techniques that +enhance chart understanding performance. Furthermore, we discuss the +state-of-the-art performance of each task and discuss how we can improve the +performance. Challenges and future directions are addressed in a dedicated +section, highlighting issues such as domain-specific charts, lack of efforts in +evaluation, and agent-oriented settings. This survey paper serves to provide +valuable insights and directions for future research in chart understanding +leveraging large foundation models. The studies mentioned in this paper, along +with emerging new research, will be continually updated at: +https://github.com/khuangaf/Awesome-Chart-Understanding. + +
+
+
+
+
+ + ☆ FlexCap: Generating Rich, Localized, and Flexible Captions in Images + + +
+ We introduce a versatile $\textit{flexible-captioning}$ vision-language model
+(VLM) capable of generating region-specific descriptions of varying lengths.
+The model, FlexCap, is trained to produce length-conditioned captions for input
+bounding boxes, and this allows control over the information density of its
+output, with descriptions ranging from concise object labels to detailed
+captions. To achieve this, we create large-scale training datasets of image
+region descriptions of varying length, starting from captioned images. This
+flexible-captioning capability has several valuable applications.
+ First, FlexCap demonstrates superior performance in dense captioning tasks on
+the Visual Genome dataset. Second, a visual question answering (VQA) system can
+be built by employing FlexCap to generate localized descriptions as inputs to a
+large language model. The resulting system achieves state-of-the-art zero-shot
+performance on a number of VQA datasets. We also demonstrate that a
+$\textit{localize-then-describe}$ approach with FlexCap can be better at
+open-ended object detection than a $\textit{describe-then-localize}$ approach
+with other VLMs. We highlight a novel characteristic of FlexCap, which is its
+ability to extract diverse visual information through prefix conditioning.
+Finally, we qualitatively demonstrate FlexCap's broad applicability in tasks
+such as image labeling, object attribute recognition, and visual dialog.
+Project webpage: https://flex-cap.github.io .
+
+
+
+
+
+ + ☆ LN3Diff: Scalable Latent Neural Fields Diffusion for Speedy 3D + Generation + + +
+ The field of neural rendering has witnessed significant progress with +advancements in generative models and differentiable rendering techniques. +Though 2D diffusion has achieved success, a unified 3D diffusion pipeline +remains unsettled. This paper introduces a novel framework called LN3Diff to +address this gap and enable fast, high-quality, and generic conditional 3D +generation. Our approach harnesses a 3D-aware architecture and variational +autoencoder (VAE) to encode the input image into a structured, compact, and 3D +latent space. The latent is decoded by a transformer-based decoder into a +high-capacity 3D neural field. Through training a diffusion model on this +3D-aware latent space, our method achieves state-of-the-art performance on +ShapeNet for 3D generation and demonstrates superior performance in monocular +3D reconstruction and conditional 3D generation across various datasets. +Moreover, it surpasses existing 3D diffusion methods in terms of inference +speed, requiring no per-instance optimization. Our proposed LN3Diff presents a +significant advancement in 3D generative modeling and holds promise for various +applications in 3D vision and graphics tasks. + +
+
+ comment: project webpage: https://nirvanalan.github.io/projects/ln3diff/ +
+
+
+
+
+ + ☆ Fast High-Resolution Image Synthesis with Latent Adversarial Diffusion + Distillation + + +
+ Diffusion models are the main driver of progress in image and video +synthesis, but suffer from slow inference speed. Distillation methods, like the +recently introduced adversarial diffusion distillation (ADD) aim to shift the +model from many-shot to single-step inference, albeit at the cost of expensive +and difficult optimization due to its reliance on a fixed pretrained DINOv2 +discriminator. We introduce Latent Adversarial Diffusion Distillation (LADD), a +novel distillation approach overcoming the limitations of ADD. In contrast to +pixel-based ADD, LADD utilizes generative features from pretrained latent +diffusion models. This approach simplifies training and enhances performance, +enabling high-resolution multi-aspect ratio image synthesis. We apply LADD to +Stable Diffusion 3 (8B) to obtain SD3-Turbo, a fast model that matches the +performance of state-of-the-art text-to-image generators using only four +unguided sampling steps. Moreover, we systematically investigate its scaling +behavior and demonstrate LADD's effectiveness in various applications such as +image editing and inpainting. + +
+
+
+
+
+ + ☆ GeoWizard: Unleashing the Diffusion Priors for 3D Geometry Estimation + from a Single Image + + +
+ We introduce GeoWizard, a new generative foundation model designed for +estimating geometric attributes, e.g., depth and normals, from single images. +While significant research has already been conducted in this area, the +progress has been substantially limited by the low diversity and poor quality +of publicly available datasets. As a result, the prior works either are +constrained to limited scenarios or suffer from the inability to capture +geometric details. In this paper, we demonstrate that generative models, as +opposed to traditional discriminative models (e.g., CNNs and Transformers), can +effectively address the inherently ill-posed problem. We further show that +leveraging diffusion priors can markedly improve generalization, detail +preservation, and efficiency in resource usage. Specifically, we extend the +original stable diffusion model to jointly predict depth and normal, allowing +mutual information exchange and high consistency between the two +representations. More importantly, we propose a simple yet effective strategy +to segregate the complex data distribution of various scenes into distinct +sub-distributions. This strategy enables our model to recognize different scene +layouts, capturing 3D geometry with remarkable fidelity. GeoWizard sets new +benchmarks for zero-shot depth and normal prediction, significantly enhancing +many downstream applications such as 3D reconstruction, 2D content creation, +and novel viewpoint synthesis. + +
+
+ comment: Project page: https://fuxiao0719.github.io/projects/geowizard/ +
+
+
+
+
+ + ☆ HOIDiffusion: Generating Realistic 3D Hand-Object Interaction Data + + +
+ 3D hand-object interaction data is scarce due to the hardware constraints in +scaling up the data collection process. In this paper, we propose HOIDiffusion +for generating realistic and diverse 3D hand-object interaction data. Our model +is a conditional diffusion model that takes both the 3D hand-object geometric +structure and text description as inputs for image synthesis. This offers a +more controllable and realistic synthesis as we can specify the structure and +style inputs in a disentangled manner. HOIDiffusion is trained by leveraging a +diffusion model pre-trained on large-scale natural images and a few 3D human +demonstrations. Beyond controllable image synthesis, we adopt the generated 3D +data for learning 6D object pose estimation and show its effectiveness in +improving perception systems. Project page: +https://mq-zhang1.github.io/HOIDiffusion + +
+
+ comment: Project page: https://mq-zhang1.github.io/HOIDiffusion +
+
+
+
+
+ + ☆ VideoMV: Consistent Multi-View Generation Based on Large Video + Generative Model + + +
+ Generating multi-view images based on text or single-image prompts is a +critical capability for the creation of 3D content. Two fundamental questions +on this topic are what data we use for training and how to ensure multi-view +consistency. This paper introduces a novel framework that makes fundamental +contributions to both questions. Unlike leveraging images from 2D diffusion +models for training, we propose a dense consistent multi-view generation model +that is fine-tuned from off-the-shelf video generative models. Images from +video generative models are more suitable for multi-view generation because the +underlying network architecture that generates them employs a temporal module +to enforce frame consistency. Moreover, the video data sets used to train these +models are abundant and diverse, leading to a reduced train-finetuning domain +gap. To enhance multi-view consistency, we introduce a 3D-Aware Denoising +Sampling, which first employs a feed-forward reconstruction module to get an +explicit global 3D model, and then adopts a sampling strategy that effectively +involves images rendered from the global 3D model into the denoising sampling +loop to improve the multi-view consistency of the final images. As a +by-product, this module also provides a fast way to create 3D assets +represented by 3D Gaussians within a few seconds. Our approach can generate 24 +dense views and converges much faster in training than state-of-the-art +approaches (4 GPU hours versus many thousand GPU hours) with comparable visual +quality and consistency. By further fine-tuning, our approach outperforms +existing state-of-the-art methods in both quantitative metrics and visual +effects. Our project page is aigc3d.github.io/VideoMV. + +
+
+ comment: Project page: aigc3d.github.io/VideoMV/ +
+
+
+
+
+ + ☆ SV3D: Novel Multi-view Synthesis and 3D Generation from a Single Image + using Latent Video Diffusion + + +
+ We present Stable Video 3D (SV3D) -- a latent video diffusion model for
+high-resolution, image-to-multi-view generation of orbital videos around a 3D
+object. Recent work on 3D generation proposes techniques to adapt 2D generative
+models for novel view synthesis (NVS) and 3D optimization. However, these
+methods have several disadvantages due to either limited views or inconsistent
+NVS, thereby affecting the performance of 3D object generation. In this work,
+we propose SV3D, which adapts an image-to-video diffusion model for novel
+multi-view synthesis and 3D generation, thereby leveraging the generalization
+and multi-view consistency of the video models, while further adding explicit
+camera control for NVS. We also propose improved 3D optimization techniques to
+use SV3D and its NVS outputs for image-to-3D generation. Extensive experimental
+results on multiple datasets with 2D and 3D metrics as well as a user study
+demonstrate SV3D's state-of-the-art performance on NVS as well as 3D
+reconstruction compared to prior works.
+
+
+ comment: Project page: https://sv3d.github.io/ +
+
+
+
+
+ + ☆ GenView: Enhancing View Quality with Pretrained Generative Model for + Self-Supervised Learning + + +
+ Self-supervised learning has achieved remarkable success in acquiring +high-quality representations from unlabeled data. The widely adopted +contrastive learning framework aims to learn invariant representations by +minimizing the distance between positive views originating from the same image. +However, existing techniques to construct positive views highly rely on manual +transformations, resulting in limited diversity and potentially false positive +pairs. To tackle these challenges, we present GenView, a controllable framework +that augments the diversity of positive views leveraging the power of +pretrained generative models while preserving semantics. We develop an adaptive +view generation method that dynamically adjusts the noise level in sampling to +ensure the preservation of essential semantic meaning while introducing +variability. Additionally, we introduce a quality-driven contrastive loss, +which assesses the quality of positive pairs by considering both foreground +similarity and background diversity. This loss prioritizes the high-quality +positive pairs we construct while reducing the influence of low-quality pairs, +thereby mitigating potential semantic inconsistencies introduced by generative +models and aggressive data augmentation. Thanks to the improved positive view +quality and the quality-driven contrastive loss, GenView significantly improves +self-supervised learning across various tasks. For instance, GenView improves +MoCov2 performance by 2.5%/2.2% on ImageNet linear/semi-supervised +classification. Moreover, GenView even performs much better than naively +augmenting the ImageNet dataset with Laion400M or ImageNet21K. Code is +available at https://github.com/xiaojieli0903/genview. + +
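+ A sketch of the quality-driven weighting idea described above, assuming
+per-pair quality scores (e.g. foreground similarity times background
+diversity) are already computed; this is an illustration, not the paper's
+exact loss.
+
+import torch
+import torch.nn.functional as F
+
+def quality_weighted_infonce(z1, z2, quality, temperature=0.2):
+    """z1, z2: (N, D) embeddings of two positive views; quality: (N,) scores."""
+    z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
+    logits = z1 @ z2.t() / temperature            # (N, N) similarity matrix
+    targets = torch.arange(z1.size(0), device=z1.device)
+    per_pair = F.cross_entropy(logits, targets, reduction="none")
+    weights = quality / quality.sum().clamp_min(1e-8)
+    return (weights * per_pair).sum()             # down-weights low-quality pairs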
+
+ comment: Code: https://github.com/xiaojieli0903/genview +
+
+
+
+
+ + ☆ DreamMotion: Space-Time Self-Similarity Score Distillation for Zero-Shot + Video Editing + + +
+ Text-driven diffusion-based video editing presents a unique challenge not +encountered in image editing literature: establishing real-world motion. Unlike +existing video editing approaches, here we focus on score distillation sampling +to circumvent the standard reverse diffusion process and initiate optimization +from videos that already exhibit natural motion. Our analysis reveals that +while video score distillation can effectively introduce new content indicated +by target text, it can also cause significant structure and motion deviation. +To counteract this, we propose to match space-time self-similarities of the +original video and the edited video during the score distillation. Thanks to +the use of score distillation, our approach is model-agnostic, which can be +applied for both cascaded and non-cascaded video diffusion frameworks. Through +extensive comparisons with leading methods, our approach demonstrates its +superiority in altering appearances while accurately preserving the original +structure and motion. + +
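+ An illustrative sketch of matching space-time self-similarities between the
+original and edited video features during optimization; the tensor shapes and
+the MSE penalty are assumptions, not the paper's exact formulation.
+
+import torch
+import torch.nn.functional as F
+
+def self_similarity(feats):
+    """feats: (T, N, D) per-frame token features -> (T*N, T*N) cosine matrix."""
+    x = F.normalize(feats.reshape(-1, feats.shape[-1]), dim=-1)
+    return x @ x.t()
+
+def self_similarity_loss(orig_feats, edit_feats):
+    # penalize deviation of the edited video's space-time structure
+    return F.mse_loss(self_similarity(edit_feats), self_similarity(orig_feats))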
+
+ comment: Project page: https://hyeonho99.github.io/dreammotion/ +
+
+
+
+
+ + ☆ HIRI-ViT: Scaling Vision Transformer with High Resolution Inputs + + +
+ The hybrid deep models of Vision Transformer (ViT) and Convolutional Neural
+Network (CNN) have emerged as a powerful class of backbones for vision tasks.
+Scaling up the input resolution of such hybrid backbones naturally strengthens
+model capacity, but inevitably suffers from heavy computational cost that
+scales quadratically. Instead, we present a new hybrid backbone with
+HIgh-Resolution Inputs (namely HIRI-ViT) that upgrades the prevalent four-stage
+ViT to a five-stage ViT tailored for high-resolution inputs. HIRI-ViT is built
+upon the seminal idea of decomposing the typical CNN operations into two
+parallel CNN branches in a cost-efficient manner. One high-resolution branch
+directly takes primary high-resolution features as inputs, but uses fewer
+convolution operations. The other low-resolution branch first performs
+down-sampling and then utilizes more convolution operations over such
+low-resolution features. Experiments on both the recognition task (ImageNet-1K
+dataset) and dense prediction tasks (COCO and ADE20K datasets) demonstrate the
+superiority of HIRI-ViT. More remarkably, under comparable computational cost
+($\sim$5.0 GFLOPs), HIRI-ViT achieves to-date the best published Top-1 accuracy
+of 84.3% on ImageNet with 448$\times$448 inputs, an absolute improvement of
+0.9% over the 83.4% of iFormer-S with 224$\times$224 inputs.
+
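+ A toy PyTorch sketch of the two parallel branches described above: a cheap
+high-resolution path and a heavier low-resolution path whose outputs are fused
+at the same scale. Layer widths and depths are illustrative assumptions.
+
+import torch
+import torch.nn as nn
+
+class TwoBranchBlock(nn.Module):
+    def __init__(self, in_ch, out_ch):
+        super().__init__()
+        # high-resolution branch: few, cheap convolutions
+        self.high = nn.Conv2d(in_ch, out_ch, 3, stride=2, padding=1)
+        # low-resolution branch: downsample first, then more convolutions
+        self.low = nn.Sequential(
+            nn.AvgPool2d(2),
+            nn.Conv2d(in_ch, out_ch, 3, padding=1),
+            nn.GELU(),
+            nn.Conv2d(out_ch, out_ch, 3, padding=1),
+        )
+
+    def forward(self, x):
+        # both branches end at half the input resolution and are summed
+        return self.high(x) + self.low(x)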
+
+ comment: IEEE Transactions on Pattern Analysis and Machine Intelligence + (TPAMI) +
+
+
+
+
+ + ☆ GetMesh: A Controllable Model for High-quality Mesh Generation and + Manipulation + + +
+ Mesh is a fundamental representation of 3D assets in various industrial
+applications, and is widely supported by professional software. However, due
+to its irregular structure, mesh creation and manipulation are often
+time-consuming and labor-intensive. In this paper, we propose a highly
+controllable generative model, GetMesh, for mesh generation and manipulation
+across different categories. By taking a varying number of points as the latent
+representation, and re-organizing them as a triplane representation, GetMesh
+generates meshes with rich and sharp details, outperforming both
+single-category and multi-category counterparts. Moreover, it also enables
+fine-grained control over the generation process that previous mesh generative
+models cannot achieve, where changing global/local mesh topologies,
+adding/removing mesh parts, and combining mesh parts across categories can be
+intuitively, efficiently, and robustly accomplished by adjusting the number,
+positions or features of latent points. Project page is
+https://getmesh.github.io.
+
+
+
+
+
+ + ☆ Diffusion Denoising as a Certified Defense against Clean-label Poisoning + + +
+ We present a certified defense to clean-label poisoning attacks. These +attacks work by injecting a small number of poisoning samples (e.g., 1%) that +contain $p$-norm bounded adversarial perturbations into the training data to +induce a targeted misclassification of a test-time input. Inspired by the +adversarial robustness achieved by $denoised$ $smoothing$, we show how an +off-the-shelf diffusion model can sanitize the tampered training data. We +extensively test our defense against seven clean-label poisoning attacks and +reduce their attack success to 0-16% with only a negligible drop in the test +time accuracy. We compare our defense with existing countermeasures against +clean-label poisoning, showing that the defense reduces the attack success the +most and offers the best model utility. Our results highlight the need for +future work on developing stronger clean-label attacks and using our certified +yet practical defense as a strong baseline to evaluate these attacks. + +
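+ A conceptual sketch of the sanitization step described above: every training
+image is noised and passed through an off-the-shelf diffusion denoiser before
+training. Here `denoise` is a hypothetical placeholder for a "noise then
+denoise" call to a pretrained diffusion model, and the noise level is an
+assumption.
+
+import torch
+
+def sanitize_dataset(images, denoise, sigma=0.25):
+    """images: (N, C, H, W) in [0, 1]; returns the denoised training set."""
+    cleaned = []
+    for x in images:
+        noisy = x + sigma * torch.randn_like(x)   # drown small perturbations
+        # `denoise` is a hypothetical wrapper around a pretrained diffusion model
+        cleaned.append(denoise(noisy.clamp(0, 1), sigma))
+    return torch.stack(cleaned)                   # train on the sanitized set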
+
+
+
+
+ + ☆ Pedestrian Tracking with Monocular Camera using Unconstrained 3D Motion + Model + + +
+ A first-principle single-object model is proposed for pedestrian tracking. It +is assumed that the extent of the moving object can be described via known +statistics in 3D, such as pedestrian height. The proposed model thus need not +constrain the object motion in 3D to a common ground plane, which is usual in +3D visual tracking applications. A nonlinear filter for this model is +implemented using the unscented Kalman filter (UKF) and tested using the +publicly available MOT-17 dataset. The proposed solution yields promising +results in 3D while maintaining perfect results when projected into the 2D +image. Moreover, the estimation error covariance matches the true one. Unlike +conventional methods, the introduced model parameters have convenient meaning +and can readily be adjusted for a problem. + +
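+ A minimal FilterPy-based sketch of an unconstrained 3D constant-velocity
+model observed through a pinhole camera, in the spirit of the description
+above; the intrinsics, noise levels, and state layout are assumptions, not the
+paper's exact parameterization.
+
+import numpy as np
+from filterpy.kalman import UnscentedKalmanFilter, MerweScaledSigmaPoints
+
+dt, f, cx, cy = 1 / 30, 1000.0, 640.0, 360.0      # frame time and intrinsics (assumed)
+
+def fx(x, dt):                                     # state: [X, Y, Z, vX, vY, vZ]
+    F = np.eye(6)
+    F[:3, 3:] = dt * np.eye(3)                     # constant-velocity motion in 3D
+    return F @ x
+
+def hx(x):                                         # pinhole projection to pixels
+    X, Y, Z = x[:3]
+    return np.array([f * X / Z + cx, f * Y / Z + cy])
+
+points = MerweScaledSigmaPoints(n=6, alpha=0.1, beta=2.0, kappa=0.0)
+ukf = UnscentedKalmanFilter(dim_x=6, dim_z=2, dt=dt, hx=hx, fx=fx, points=points)
+ukf.x = np.array([0.0, 0.0, 10.0, 0.5, 0.0, 0.0])  # start 10 m in front of camera
+ukf.R = np.diag([4.0, 4.0])                        # pixel measurement noise
+ukf.Q = np.eye(6) * 1e-2                           # process noise
+
+for z in [(690.0, 362.0), (695.0, 363.0)]:         # toy pixel detections
+    ukf.predict()
+    ukf.update(np.array(z))
+    print(ukf.x[:3])                               # estimated 3D position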
+
+ comment: Submitted to FUSION2024 conference +
+
+
+
+
+ + ☆ OUCopula: Bi-Channel Multi-Label Copula-Enhanced Adapter-Based CNN for + Myopia Screening Based on OU-UWF Images + + +
+ Myopia screening using cutting-edge ultra-widefield (UWF) fundus imaging is +potentially significant for ophthalmic outcomes. Current multidisciplinary +research between ophthalmology and deep learning (DL) concentrates primarily on +disease classification and diagnosis using single-eye images, largely ignoring +joint modeling and prediction for Oculus Uterque (OU, both eyes). Inspired by +the complex relationships between OU and the high correlation between the +(continuous) outcome labels (Spherical Equivalent and Axial Length), we propose +a framework of copula-enhanced adapter convolutional neural network (CNN) +learning with OU UWF fundus images (OUCopula) for joint prediction of multiple +clinical scores. We design a novel bi-channel multi-label CNN that can (1) take +bi-channel image inputs subject to both high correlation and heterogeneity (by +sharing the same backbone network and employing adapters to parameterize the +channel-wise discrepancy), and (2) incorporate correlation information between +continuous output labels (using a copula). Solid experiments show that OUCopula +achieves satisfactory performance in myopia score prediction compared to +backbone models. Moreover, OUCopula can far exceed the performance of models +constructed for single-eye inputs. Importantly, our study also hints at the +potential extension of the bi-channel model to a multi-channel paradigm and the +generalizability of OUCopula across various backbone CNNs. + +
+
+
+
+
+ + ☆ Enhanced Event-Based Video Reconstruction with Motion Compensation + + +
+ Deep neural networks for event-based video reconstruction often suffer from a +lack of interpretability and have high memory demands. A lightweight network +called CISTA-LSTC has recently been introduced showing that high-quality +reconstruction can be achieved through the systematic design of its +architecture. However, its modelling assumption that input signals and output +reconstructed frame share the same sparse representation neglects the +displacement caused by motion. To address this, we propose warping the input +intensity frames and sparse codes to enhance reconstruction quality. A +CISTA-Flow network is constructed by integrating a flow network with CISTA-LSTC +for motion compensation. The system relies solely on events, in which predicted +flow aids in reconstruction and then reconstructed frames are used to +facilitate flow estimation. We also introduce an iterative training framework +for this combined system. Results demonstrate that our approach achieves +state-of-the-art reconstruction accuracy and simultaneously provides reliable +dense flow estimation. Furthermore, our model exhibits flexibility in that it +can integrate different flow networks, suggesting its potential for further +performance enhancement. + +
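+ A generic flow-warping helper of the kind needed to align the previous
+intensity frame (or its sparse codes) with the current one before
+reconstruction; the interface is an assumption, not the CISTA-Flow
+implementation.
+
+import torch
+import torch.nn.functional as F
+
+def warp(img, flow):
+    """img: (B, C, H, W); flow: (B, 2, H, W) displacement in pixels."""
+    b, _, h, w = img.shape
+    ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
+    grid = torch.stack((xs, ys), dim=0).float().to(img.device)      # (2, H, W)
+    coords = grid.unsqueeze(0) + flow                               # sampling locations
+    # normalize coordinates to [-1, 1] as expected by grid_sample
+    coords[:, 0] = 2.0 * coords[:, 0] / (w - 1) - 1.0
+    coords[:, 1] = 2.0 * coords[:, 1] / (h - 1) - 1.0
+    return F.grid_sample(img, coords.permute(0, 2, 3, 1), align_corners=True)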
+
+ comment: 22 pages, 8 figures (supplementary material included) +
+
+
+
+
+ + ☆ IVAC-P2L: Enhancing Video Action Counting through Irregular Repetition + Priors + + +
+ Video Action Counting (VAC) is crucial in analyzing sports, fitness, and +everyday activities by quantifying repetitive actions in videos. However, +traditional VAC methods have overlooked the complexity of action repetitions, +such as interruptions and the variability in cycle duration. Our research +addresses the shortfall by introducing a novel approach to VAC, called +Irregular Video Action Counting (IVAC). IVAC prioritizes modeling irregular +repetition patterns in videos, which we define through two primary aspects: +Inter-cycle Consistency and Cycle-interval Inconsistency. Inter-cycle +Consistency ensures homogeneity in the spatial-temporal representations of +cycle segments, signifying action uniformity within cycles. Cycle-interval +inconsistency highlights the importance of distinguishing between cycle +segments and intervals based on their inherent content differences. To +encapsulate these principles, we propose a new methodology that includes +consistency and inconsistency modules, supported by a unique pull-push loss +(P2L) mechanism. The IVAC-P2L model applies a pull loss to promote coherence +among cycle segment features and a push loss to clearly distinguish features of +cycle segments from interval segments. Empirical evaluations conducted on the +RepCount dataset demonstrate that the IVAC-P2L model sets a new benchmark in +VAC task performance. Furthermore, the model demonstrates exceptional +adaptability and generalization across various video contents, outperforming +existing models on two additional datasets, UCFRep and Countix, without the +need for dataset-specific optimization. These results confirm the efficacy of +our approach in addressing irregular repetitions in videos and pave the way for +further advancements in video analysis and understanding. + +
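+ A sketch of a pull-push objective over cycle-segment and interval features in
+the spirit of the description above; the margin and feature shapes are
+illustrative assumptions rather than the paper's exact P2L loss.
+
+import torch
+import torch.nn.functional as F
+
+def pull_push_loss(cycle_feats, interval_feats, margin=0.5):
+    """cycle_feats: (Nc, D); interval_feats: (Ni, D), both L2-normalized."""
+    center = cycle_feats.mean(dim=0, keepdim=True)
+    # pull: cycle-segment features cohere around their center
+    pull = (1 - F.cosine_similarity(cycle_feats, center)).mean()
+    # push: interval features are kept away from the cycle center by a margin
+    push = F.relu(F.cosine_similarity(interval_feats, center) - margin).mean()
+    return pull + push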
+
+ comment: Under continuous updates. Modified for arXiv +
+
+
+
+
+ + ☆ Advancing COVID-19 Detection in 3D CT Scans + + +
+ To make a more accurate diagnosis of COVID-19, we propose a straightforward +yet effective model. Firstly, we analyse the characteristics of 3D CT scans and +remove the non-lung parts, facilitating the model to focus on lesion-related +areas and reducing computational cost. We use ResNeSt50 as the strong feature +extractor, initializing it with pretrained weights which have COVID-19-specific +prior knowledge. Our model achieves a Macro F1 Score of 0.94 on the validation +set of the 4th COV19D Competition Challenge $\mathrm{I}$, surpassing the +baseline by 16%. This indicates its effectiveness in distinguishing between +COVID-19 and non-COVID-19 cases, making it a robust method for COVID-19 +detection. + +
+
+
+
+
+ + ☆ AI-Assisted Cervical Cancer Screening + + +
+ Visual Inspection with Acetic Acid (VIA) remains the most feasible cervical
+cancer screening test in resource-constrained settings of low- and
+middle-income countries (LMICs), where it is often performed in screening camps
+or primary/community health centers by nurses instead of the preferred but
+unavailable expert gynecologist. To address the highly subjective nature of the
+test, various handheld devices integrating cameras or smartphones have been
+recently explored to capture cervical images during VIA and aid decision-making
+via telemedicine or AI models. Most studies proposing AI models retrospectively
+use a relatively small number of already collected images from specific
+devices, digital cameras, or smartphones; the challenges and protocol for
+quality image acquisition during VIA in resource-constrained camp settings,
+difficulties in obtaining gold-standard labels, data imbalance, etc. are often
+overlooked. We present a novel approach and describe the end-to-end design
+process to build a robust smartphone-based AI-assisted system that does not
+require buying a separate integrated device: the proposed protocol for quality
+image acquisition in resource-constrained settings, a dataset collected from
+1,430 women during VIA performed by nurses in screening camps, a preprocessing
+pipeline, and the training and evaluation of a deep-learning-based
+classification model aimed at identifying (pre)cancerous lesions. Our work
+shows that readily available smartphones and a suitable protocol can capture
+cervix images with the detail required for the VIA test; that the
+deep-learning-based classification model provides promising results to assist
+nurses in VIA screening; and that this approach provides a direction for
+large-scale data collection and validation in resource-constrained settings.
+
+
+
+
+
+ + ☆ HyperColorization: Propagating spatially sparse noisy spectral clues for + reconstructing hyperspectral images + + +
+ Hyperspectral cameras face challenging spatial-spectral resolution trade-offs +and are more affected by shot noise than RGB photos taken over the same total +exposure time. Here, we present a colorization algorithm to reconstruct +hyperspectral images from a grayscale guide image and spatially sparse spectral +clues. We demonstrate that our algorithm generalizes to varying spectral +dimensions for hyperspectral images, and show that colorizing in a low-rank +space reduces compute time and the impact of shot noise. To enhance robustness, +we incorporate guided sampling, edge-aware filtering, and dimensionality +estimation techniques. Our method surpasses previous algorithms in various +performance metrics, including SSIM, PSNR, GFC, and EMD, which we analyze as +metrics for characterizing hyperspectral image quality. Collectively, these +findings provide a promising avenue for overcoming the time-space-wavelength +resolution trade-off by reconstructing a dense hyperspectral image from samples +obtained by whisk or push broom scanners, as well as hybrid spatial-spectral +computational imaging systems. + +
+
+ comment: 16 Pages, 13 Figures, 3 Tables, for more information: + https://mehmetkeremaydin.github.io/hypercolorization/ +
+
+
+
+
+ + ☆ High-energy physics image classification: A Survey of Jet Applications + + +
+ In recent times, the fields of high-energy physics (HEP) experimentation and +phenomenological studies have seen the integration of machine learning (ML) and +its specialized branch, deep learning (DL). This survey offers a comprehensive +assessment of these applications within the realm of various DL approaches. The +initial segment of the paper introduces the fundamentals encompassing diverse +particle physics types and establishes criteria for evaluating particle physics +in tandem with learning models. Following this, a comprehensive taxonomy is +presented for representing HEP images, encompassing accessible datasets, +intricate details of preprocessing techniques, and methods of feature +extraction and selection. Subsequently, the focus shifts to an exploration of +available artificial intelligence (AI) models tailored to HEP images, along +with a concentrated examination of HEP image classification pertaining to Jet +particles. Within this review, a profound investigation is undertaken into +distinct ML and DL proposed state-of-the art (SOTA) techniques, underscoring +their implications for HEP inquiries. The discussion delves into specific +applications in substantial detail, including Jet tagging, Jet tracking, +particle classification, and more. The survey culminates with an analysis +concerning the present status of HEP grounded in DL methodologies, encompassing +inherent challenges and prospective avenues for future research endeavors. + +
+
+
+
+
+ + ☆ LayerDiff: Exploring Text-guided Multi-layered Composable Image + Synthesis via Layer-Collaborative Diffusion Model + + +
+ Despite the success of generating high-quality images given any text prompts +by diffusion-based generative models, prior works directly generate the entire +images, but cannot provide object-wise manipulation capability. To support +wider real applications like professional graphic design and digital artistry, +images are frequently created and manipulated in multiple layers to offer +greater flexibility and control. Therefore in this paper, we propose a +layer-collaborative diffusion model, named LayerDiff, specifically designed for +text-guided, multi-layered, composable image synthesis. The composable image +consists of a background layer, a set of foreground layers, and associated mask +layers for each foreground element. To enable this, LayerDiff introduces a +layer-based generation paradigm incorporating multiple layer-collaborative +attention modules to capture inter-layer patterns. Specifically, an inter-layer +attention module is designed to encourage information exchange and learning +between layers, while a text-guided intra-layer attention module incorporates +layer-specific prompts to direct the specific-content generation for each +layer. A layer-specific prompt-enhanced module better captures detailed textual +cues from the global prompt. Additionally, a self-mask guidance sampling +strategy further unleashes the model's ability to generate multi-layered +images. We also present a pipeline that integrates existing perceptual and +generative models to produce a large dataset of high-quality, text-prompted, +multi-layered images. Extensive experiments demonstrate that our LayerDiff +model can generate high-quality multi-layered images with performance +comparable to conventional whole-image generation methods. Moreover, LayerDiff +enables a broader range of controllable generative applications, including +layer-specific image editing and style transfer. + +
+
+
+
+
+ + ☆ RoGUENeRF: A Robust Geometry-Consistent Universal Enhancer for NeRF + + +
+ Recent advances in neural rendering have enabled highly photorealistic 3D +scene reconstruction and novel view synthesis. Despite this progress, current +state-of-the-art methods struggle to reconstruct high frequency detail, due to +factors such as a low-frequency bias of radiance fields and inaccurate camera +calibration. One approach to mitigate this issue is to enhance images +post-rendering. 2D enhancers can be pre-trained to recover some detail but are +agnostic to scene geometry and do not easily generalize to new distributions of +image degradation. Conversely, existing 3D enhancers are able to transfer +detail from nearby training images in a generalizable manner, but suffer from +inaccurate camera calibration and can propagate errors from the geometry into +rendered images. We propose a neural rendering enhancer, RoGUENeRF, which +exploits the best of both paradigms. Our method is pre-trained to learn a +general enhancer while also leveraging information from nearby training images +via robust 3D alignment and geometry-aware fusion. Our approach restores +high-frequency textures while maintaining geometric consistency and is also +robust to inaccurate camera calibration. We show that RoGUENeRF substantially +enhances the rendering quality of a wide range of neural rendering baselines, +e.g. improving the PSNR of MipNeRF360 by 0.63dB and Nerfacto by 1.34dB on the +real world 360v2 dataset. + +
+
+
+
+
+ + ☆ Tur[k]ingBench: A Challenge Benchmark for Web Agents + + +
+ Recent chatbots have demonstrated an impressive ability to understand and
+communicate in raw-text form. However, there is more to the world than raw
+text. For example, humans spend long hours of their time on web pages, where
+text is intertwined with other modalities and tasks are accomplished in the
+form of various complex interactions. Can state-of-the-art multi-modal models
+generalize to such complex domains?
+ To address this question, we introduce TurkingBench, a benchmark of tasks
+formulated as web pages containing textual instructions with multi-modal
+context. Unlike existing work, which employs artificially synthesized web
+pages, here we use natural HTML pages that were originally designed for
+crowdsourcing workers for various annotation purposes. The HTML instructions of
+each task are also instantiated with various values (obtained from the
+crowdsourcing tasks) to form new instances of the task. This benchmark contains
+32.2K instances distributed across 158 tasks.
+ Additionally, to facilitate the evaluation on TurkingBench, we develop an
+evaluation framework that connects the responses of chatbots to modifications
+on web pages (modifying a text box, checking a radio button, etc.). We evaluate
+the performance of state-of-the-art models, including language-only,
+vision-only, and layout-only models, and their combinations, on this benchmark.
+Our findings reveal that these models perform significantly better than random
+chance, yet considerable room exists for improvement. We hope this benchmark
+will help facilitate the evaluation and development of web-based agents.
+
+
+
+
+
+ + ☆ GNeRP: Gaussian-guided Neural Reconstruction of Reflective Objects with + Noisy Polarization Priors ICLR 2024 + + +
+ Learning surfaces from neural radiance field (NeRF) became a rising topic in +Multi-View Stereo (MVS). Recent Signed Distance Function (SDF)-based methods +demonstrated their ability to reconstruct accurate 3D shapes of Lambertian +scenes. However, their results on reflective scenes are unsatisfactory due to +the entanglement of specular radiance and complicated geometry. To address the +challenges, we propose a Gaussian-based representation of normals in SDF +fields. Supervised by polarization priors, this representation guides the +learning of geometry behind the specular reflection and captures more details +than existing methods. Moreover, we propose a reweighting strategy in the +optimization process to alleviate the noise issue of polarization priors. To +validate the effectiveness of our design, we capture polarimetric information, +and ground truth meshes in additional reflective scenes with various geometry. +We also evaluated our framework on the PANDORA dataset. Comparisons prove our +method outperforms existing neural 3D reconstruction methods in reflective +scenes by a large margin. + +
+
+ comment: Accepted to ICLR 2024 Poster. For the Appendix, please see + http://yukiumi13.github.io/gnerp_page +
+
+
+
+
+ + ☆ N-Modal Contrastive Losses with Applications to Social Media Data in + Trimodal Space + + +
+ The social media landscape of conflict dynamics has grown increasingly
+multi-modal. Recent advancements in model architectures such as CLIP have
+enabled researchers to begin studying the interplay between the modalities of
+text and images in a shared latent space. However, CLIP models fail to handle
+situations on social media when the number of modalities present in a post
+exceeds two. Social media dynamics often require understanding the interplay
+between not only text and images, but video as well. In this paper, we explore
+an extension of the contrastive loss function to allow for any number of
+modalities, and demonstrate its usefulness in trimodal spaces on social media.
+By extending CLIP into three dimensions we can further aid understanding of
+social media landscapes where all three modalities are present (an increasingly
+common situation). We use a newly collected public data set of Telegram posts
+containing all three modalities to train, and then demonstrate the usefulness
+of, a trimodal model in two OSINT scenarios: classifying a social media
+artifact post as either pro-Russian or pro-Ukrainian and identifying which
+account a given artifact originated from. While trimodal CLIP models have been
+explored before (though not on social media data), we also display a novel
+quadmodal CLIP model. This model can learn the interplay between text, image,
+video, and audio. We demonstrate new state-of-the-art baseline results on
+retrieval for quadmodal models moving forward.
+
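+ One natural way to extend the pairwise CLIP objective to N modalities, as
+described above, is to sum a symmetric InfoNCE term over every modality pair;
+the sketch below is an illustration under that assumption, not necessarily the
+paper's exact formulation.
+
+import itertools
+import torch
+import torch.nn.functional as F
+
+def n_modal_contrastive(embeddings, temperature=0.07):
+    """embeddings: list of (B, D) tensors, one per modality, row-aligned."""
+    loss, count = 0.0, 0
+    targets = torch.arange(embeddings[0].size(0), device=embeddings[0].device)
+    for a, b in itertools.combinations(range(len(embeddings)), 2):
+        za = F.normalize(embeddings[a], dim=1)
+        zb = F.normalize(zb := embeddings[b], dim=1)
+        logits = za @ zb.t() / temperature
+        # symmetric InfoNCE over this modality pair
+        loss = loss + 0.5 * (F.cross_entropy(logits, targets)
+                             + F.cross_entropy(logits.t(), targets))
+        count += 1
+    return loss / max(count, 1)
+
+# e.g. text/image/video/audio embeddings -> a single quadmodal training loss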
+
+
+
+
+ + ☆ Deep learning automates Cobb angle measurement compared with + multi-expert observers + + +
+ Scoliosis, a prevalent condition characterized by abnormal spinal curvature +leading to deformity, requires precise assessment methods for effective +diagnosis and management. The Cobb angle is a widely used scoliosis +quantification method that measures the degree of curvature between the tilted +vertebrae. Yet, manual measuring of Cobb angles is time-consuming and +labor-intensive, fraught with significant interobserver and intraobserver +variability. To address these challenges and the lack of interpretability found +in certain existing automated methods, we have created fully automated software +that not only precisely measures the Cobb angle but also provides clear +visualizations of these measurements. This software integrates deep neural +network-based spine region detection and segmentation, spine centerline +identification, pinpointing the most significantly tilted vertebrae, and direct +visualization of Cobb angles on the original images. Upon comparison with the +assessments of 7 expert readers, our algorithm exhibited a mean deviation in +Cobb angle measurements of 4.17 degrees, notably surpassing the manual +approach's average intra-reader discrepancy of 5.16 degrees. The algorithm also +achieved intra-class correlation coefficients (ICC) exceeding 0.96 and Pearson +correlation coefficients above 0.944, reflecting robust agreement with expert +assessments and superior measurement reliability. Through the comprehensive +reader study and statistical analysis, we believe this algorithm not only +ensures a higher consensus with expert readers but also enhances +interpretability and reproducibility during assessments. It holds significant +promise for clinical application, potentially aiding physicians in more +accurate scoliosis assessment and diagnosis, thereby improving patient care. + +
+
+ comment: 17 pages, 5 figures +
+
+
+
+
+ + ☆ SuperLoRA: Parameter-Efficient Unified Adaptation of Multi-Layer + Attention Modules + + +
+ Low-rank adaptation (LoRA) and its variants are widely employed in
+fine-tuning large models, including large language models for natural language
+processing and diffusion models for computer vision. This paper proposes a
+generalized framework called SuperLoRA that unifies and extends different LoRA
+variants, which can be realized under different hyper-parameter settings.
+Introducing grouping, folding, shuffling, projecting, and tensor factoring,
+SuperLoRA offers high flexibility compared with other LoRA variants and
+demonstrates superior performance on transfer learning tasks, especially in
+extremely low-parameter regimes.
+
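+ For context, a plain LoRA linear layer, the baseline that SuperLoRA's
+grouping, folding, shuffling, projecting, and tensor-factoring variants
+generalize; this is the standard construction, not the paper's full method.
+
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    def __init__(self, base: nn.Linear, rank=8, alpha=16.0):
+        super().__init__()
+        self.base = base                            # frozen pretrained layer
+        for p in self.base.parameters():
+            p.requires_grad_(False)
+        # low-rank update W + (alpha/rank) * B @ A
+        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
+        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
+        self.scale = alpha / rank
+
+    def forward(self, x):
+        return self.base(x) + self.scale * (x @ self.A.t() @ self.B.t())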
+
+ comment: 33 pages, 29 figures +
+
+
+
+
+ + ☆ ReGenNet: Towards Human Action-Reaction Synthesis CVPR 2024 + + +
+ Humans constantly interact with their surrounding environments. Current +human-centric generative models mainly focus on synthesizing humans plausibly +interacting with static scenes and objects, while the dynamic human +action-reaction synthesis for ubiquitous causal human-human interactions is +less explored. Human-human interactions can be regarded as asymmetric with +actors and reactors in atomic interaction periods. In this paper, we +comprehensively analyze the asymmetric, dynamic, synchronous, and detailed +nature of human-human interactions and propose the first multi-setting human +action-reaction synthesis benchmark to generate human reactions conditioned on +given human actions. To begin with, we propose to annotate the actor-reactor +order of the interaction sequences for the NTU120, InterHuman, and Chi3D +datasets. Based on them, a diffusion-based generative model with a Transformer +decoder architecture called ReGenNet together with an explicit distance-based +interaction loss is proposed to predict human reactions in an online manner, +where the future states of actors are unavailable to reactors. Quantitative and +qualitative results show that our method can generate instant and plausible +human reactions compared to the baselines, and can generalize to unseen actor +motions and viewpoint changes. + +
+
+ comment: Accepted by CVPR 2024, Project Page: + https://liangxuy.github.io/ReGenNet/ +
+
+
+
+
+ + ☆ InTeX: Interactive Text-to-texture Synthesis via Unified Depth-aware + Inpainting + + +
+ Text-to-texture synthesis has become a new frontier in 3D content creation +thanks to the recent advances in text-to-image models. Existing methods +primarily adopt a combination of pretrained depth-aware diffusion and +inpainting models, yet they exhibit shortcomings such as 3D inconsistency and +limited controllability. To address these challenges, we introduce InteX, a +novel framework for interactive text-to-texture synthesis. 1) InteX includes a +user-friendly interface that facilitates interaction and control throughout the +synthesis process, enabling region-specific repainting and precise texture +editing. 2) Additionally, we develop a unified depth-aware inpainting model +that integrates depth information with inpainting cues, effectively mitigating +3D inconsistencies and improving generation speed. Through extensive +experiments, our framework has proven to be both practical and effective in +text-to-texture synthesis, paving the way for high-quality 3D content creation. + +
+
+ comment: Project Page: https://me.kiui.moe/intex/ +
+
+
+
+
+ + ☆ Deep Bayesian Future Fusion for Self-Supervised, High-Resolution, + Off-Road Mapping + + +
+ The limited sensing resolution of resource-constrained off-road vehicles +poses significant challenges towards reliable off-road autonomy. To overcome +this limitation, we propose a general framework based on fusing the future +information (i.e. future fusion) for self-supervision. Recent approaches +exploit this future information alongside the hand-crafted heuristics to +directly supervise the targeted downstream tasks (e.g. traversability +estimation). However, in this paper, we opt for a more general line of +development - time-efficient completion of the highest resolution (i.e. 2cm per +pixel) BEV map in a self-supervised manner via future fusion, which can be used +for any downstream tasks for better longer range prediction. To this end, +first, we create a high-resolution future-fusion dataset containing pairs of +(RGB / height) raw sparse and noisy inputs and map-based dense labels. Next, to +accommodate the noise and sparsity of the sensory information, especially in +the distal regions, we design an efficient realization of the Bayes filter onto +the vanilla convolutional network via the recurrent mechanism. Equipped with +the ideas from SOTA generative models, our Bayesian structure effectively +predicts high-quality BEV maps in the distal regions. Extensive evaluation on +both the quality of completion and downstream task on our future-fusion dataset +demonstrates the potential of our approach. + +
+
+
+
+
+ + ☆ Towards Real-Time Fast Unmanned Aerial Vehicle Detection Using Dynamic + Vision Sensors + + +
+ Unmanned Aerial Vehicles (UAVs) are gaining popularity in civil and military
+applications. However, uncontrolled access to restricted areas threatens
+privacy and security. Thus, prevention and detection of UAVs are pivotal to
+guarantee confidentiality and safety. Although active scanning, mainly based on
+radars, is one of the most accurate technologies, it can be expensive and less
+versatile than passive inspections, e.g., object recognition. Dynamic vision
+sensors (DVS) are bio-inspired event-based vision models that leverage
+timestamped pixel-level brightness changes in fast-moving scenes and adapt
+well to low-latency object detection. This paper presents F-UAV-D (Fast
+Unmanned Aerial Vehicle Detector), an embedded system that enables fast-moving
+drone detection. In particular, we propose a setup to exploit DVS as an
+alternative to RGB cameras in a real-time and low-power configuration. Our
+approach leverages the high-dynamic range (HDR) and background suppression of
+DVS and, when trained with various fast-moving drones, outperforms RGB input in
+suboptimal ambient conditions such as low illumination and fast-moving scenes.
+Our results show that F-UAV-D can (i) detect drones using less than 15 W on
+average and (ii) perform real-time inference (i.e., <50 ms) by leveraging the
+CPU and GPU nodes of our edge computer.
+
+
+ comment: Accepted at 2024 IEEE International Instrumentation and Measurement + Technology Conference (I2MTC) +
+
+
+
+
+ + ☆ IDF-CR: Iterative Diffusion Process for Divide-and-Conquer Cloud Removal + in Remote-sensing Images + + +
+ Deep learning technologies have demonstrated their effectiveness in removing
+cloud cover from optical remote-sensing images. Convolutional Neural Networks
+(CNNs) dominate cloud removal tasks. However, constrained by the inherent
+limitations of convolutional operations, CNNs can address only a modest
+fraction of cloud occlusion. In recent years, diffusion models have achieved
+state-of-the-art (SOTA) proficiency in image generation and reconstruction due
+to their formidable generative capabilities. Inspired by the rapid development
+of diffusion models, we first present an iterative diffusion process for cloud
+removal (IDF-CR), which exhibits strong generative capabilities to achieve
+component divide-and-conquer cloud removal. IDF-CR consists of a pixel space
+cloud removal module (Pixel-CR) and a latent space iterative noise diffusion
+network (IND). Specifically, IDF-CR is divided into two-stage models that
+address pixel space and latent space. The two-stage model facilitates a
+strategic transition from preliminary cloud reduction to meticulous detail
+refinement. In the pixel space stage, Pixel-CR initiates the processing of
+cloudy images, yielding a suboptimal cloud removal result and providing the
+diffusion model with prior cloud removal knowledge. In the latent space stage,
+the diffusion model transforms low-quality cloud removal into high-quality
+clean output. We refine Stable Diffusion by implementing ControlNet. In
+addition, an unsupervised iterative noise refinement (INR) module is introduced
+for the diffusion model to optimize the distribution of the predicted noise,
+thereby enhancing advanced detail recovery. Our model performs best compared
+with other SOTA methods, including image reconstruction and optical
+remote-sensing cloud removal approaches, on optical remote-sensing datasets.
+
+
+ comment: Accepted by IEEE TGRS, we first present an iterative diffusion + process for cloud removal, the code is available at: + https://github.com/SongYxing/IDF-CR +
+
+
+
+
+ + ☆ View-Consistent 3D Editing with Gaussian Splatting + + +
+ The advent of 3D Gaussian Splatting (3DGS) has revolutionized 3D editing, +offering efficient, high-fidelity rendering and enabling precise local +manipulations. Currently, diffusion-based 2D editing models are harnessed to +modify multi-view rendered images, which then guide the editing of 3DGS models. +However, this approach faces a critical issue of multi-view inconsistency, +where the guidance images exhibit significant discrepancies across views, +leading to mode collapse and visual artifacts of 3DGS. To this end, we +introduce View-consistent Editing (VcEdit), a novel framework that seamlessly +incorporates 3DGS into image editing processes, ensuring multi-view consistency +in edited guidance images and effectively mitigating mode collapse issues. +VcEdit employs two innovative consistency modules: the Cross-attention +Consistency Module and the Editing Consistency Module, both designed to reduce +inconsistencies in edited images. By incorporating these consistency modules +into an iterative pattern, VcEdit proficiently resolves the issue of multi-view +inconsistency, facilitating high-quality 3DGS editing across a diverse range of +scenes. + +
+
+
+
+
+ + ☆ Exploring Multi-modal Neural Scene Representations With Applications on + Thermal Imaging + + +
+ Neural Radiance Fields (NeRFs) quickly evolved as the new de-facto standard +for the task of novel view synthesis when trained on a set of RGB images. In +this paper, we conduct a comprehensive evaluation of neural scene +representations, such as NeRFs, in the context of multi-modal learning. +Specifically, we present four different strategies of how to incorporate a +second modality, other than RGB, into NeRFs: (1) training from scratch +independently on both modalities; (2) pre-training on RGB and fine-tuning on +the second modality; (3) adding a second branch; and (4) adding a separate +component to predict (color) values of the additional modality. We chose +thermal imaging as second modality since it strongly differs from RGB in terms +of radiosity, making it challenging to integrate into neural scene +representations. For the evaluation of the proposed strategies, we captured a +new publicly available multi-view dataset, ThermalMix, consisting of six common +objects and about 360 RGB and thermal images in total. We employ cross-modality +calibration prior to data capturing, leading to high-quality alignments between +RGB and thermal images. Our findings reveal that adding a second branch to NeRF +performs best for novel view synthesis on thermal images while also yielding +compelling results on RGB. Finally, we also show that our analysis generalizes +to other modalities, including near-infrared images and depth maps. Project +page: https://mert-o.github.io/ThermalNeRF/. + +
+
+ comment: 24 pages, 14 figures +
+
+
+
+
+ + ☆ denoiSplit: a method for joint image splitting and unsupervised + denoising + + +
+ In this work, we present denoiSplit, a method to tackle a new analysis task,
+i.e. the challenge of joint semantic image splitting and unsupervised
+denoising. This dual approach has important applications in fluorescence
+microscopy, where semantic image splitting is a valuable tool but noise
+generally hinders the downstream analysis of image content. Image splitting
+involves dissecting an image into its distinguishable semantic structures. We
+show that the current state-of-the-art method for this task struggles in the
+presence of image noise, inadvertently also distributing the noise across the
+predicted outputs. The method we present here can deal with image noise by
+integrating an unsupervised denoising sub-task. This integration results in
+improved semantic image unmixing, even in the presence of notable and realistic
+levels of imaging noise. A key innovation in denoiSplit is the use of
+specifically formulated noise models and a suitable adjustment of the
+KL-divergence loss for the high-dimensional hierarchical latent space we are
+training. We showcase the performance of denoiSplit across 4 tasks on
+real-world microscopy images. Additionally, we perform qualitative and
+quantitative evaluations and compare results to existing benchmarks,
+demonstrating the effectiveness of using denoiSplit: a single Variational
+Splitting Encoder-Decoder (VSE) Network using two suitable noise models to
+jointly perform semantic splitting and denoising.
+
+
+
+
+
+ + ☆ GraphBEV: Towards Robust BEV Feature Alignment for Multi-Modal 3D Object + Detection + + +
+ Integrating LiDAR and camera information into Bird's-Eye-View (BEV) +representation has emerged as a crucial aspect of 3D object detection in +autonomous driving. However, existing methods are susceptible to the inaccurate +calibration relationship between LiDAR and the camera sensor. Such inaccuracies +result in errors in depth estimation for the camera branch, ultimately causing +misalignment between LiDAR and camera BEV features. In this work, we propose a +robust fusion framework called Graph BEV. Addressing errors caused by +inaccurate point cloud projection, we introduce a Local Align module that +employs neighbor-aware depth features via Graph matching. Additionally, we +propose a Global Align module to rectify the misalignment between LiDAR and +camera BEV features. Our Graph BEV framework achieves state-of-the-art +performance, with an mAP of 70.1\%, surpassing BEV Fusion by 1.6\% on the +nuscenes validation set. Importantly, our Graph BEV outperforms BEV Fusion by +8.3\% under conditions with misalignment noise. + +
+
+
+
+
+ + ☆ Agent3D-Zero: An Agent for Zero-shot 3D Understanding + + +
+ The ability to understand and reason about the 3D real world is a crucial
+milestone towards artificial general intelligence. The current common practice
+is to finetune Large Language Models (LLMs) with 3D data and texts to enable 3D
+understanding. Despite their effectiveness, these approaches are inherently
+limited by the scale and diversity of the available 3D data. Alternatively, in
+this work, we introduce Agent3D-Zero, an innovative 3D-aware agent framework
+addressing 3D scene understanding in a zero-shot manner. The essence of our
+approach centers on reconceptualizing the challenge of 3D scene perception as a
+process of understanding and synthesizing insights from multiple images,
+inspired by how human beings attempt to understand 3D scenes. By consolidating
+this idea, we propose a novel way to make use of a Large Visual Language Model
+(VLM) via actively selecting and analyzing a series of viewpoints for 3D
+understanding. Specifically, given an input 3D scene, Agent3D-Zero first
+processes a bird's-eye view image with custom-designed visual prompts, then
+iteratively chooses the next viewpoints to observe and summarize the underlying
+knowledge. A distinctive advantage of Agent3D-Zero is the introduction of novel
+visual prompts, which significantly unleash the VLMs' ability to identify the
+most informative viewpoints and thus facilitate observing 3D scenes. Extensive
+experiments demonstrate the effectiveness of the proposed framework in
+understanding diverse and previously unseen 3D environments.
+
+
+ comment: project page: https://zhangsha1024.github.io/Agent3D-Zero/ +
+
+
+
+
+ + ☆ Evaluating Text to Image Synthesis: Survey and Taxonomy of Image Quality + Metrics + + +
+ Recent advances in text-to-image synthesis have been enabled by exploiting a +combination of language and vision through foundation models. These models are +pre-trained on tremendous amounts of text-image pairs sourced from the World +Wide Web or other large-scale databases. As the demand for high-quality image +generation shifts towards ensuring content alignment between text and image, +novel evaluation metrics have been developed with the aim of mimicking human +judgments. Thus, researchers have started to collect datasets with increasingly +complex annotations to study the compositionality of vision-language models and +their incorporation as a quality measure of compositional alignment between +text and image contents. In this work, we provide a comprehensive overview of +existing text-to-image evaluation metrics and propose a new taxonomy for +categorizing these metrics. We also review frequently adopted text-image +benchmark datasets before discussing techniques to optimize text-to-image +synthesis models towards quality and human preferences. Ultimately, we derive +guidelines for improving text-to-image evaluation and discuss the open +challenges and current limitations. + +
+
+ comment: preprint, 18 pages, 2 figures, 2 tables +
+
+
+
+
+ + ☆ TCNet: Continuous Sign Language Recognition from Trajectories and + Correlated Regions + + +
+  A key challenge in continuous sign language recognition (CSLR) is to
+efficiently capture long-range spatial interactions over time from the video
+input. To address this challenge, we propose TCNet, a hybrid network that
+effectively models spatio-temporal information from Trajectories and Correlated
+regions. TCNet's trajectory module transforms frames into aligned trajectories
+composed of continuous visual tokens. In addition, for a query token,
+self-attention is learned along the trajectory. As such, our network can also
+focus on fine-grained spatio-temporal patterns, such as finger movements, of a
+specific region in motion. TCNet's correlation module uses a novel dynamic
+attention mechanism that filters out irrelevant frame regions. Additionally, it
+assigns dynamic key-value tokens from correlated regions to each query. Both
+innovations significantly reduce computation cost and memory usage. We perform
+experiments on four large-scale datasets: PHOENIX14, PHOENIX14-T, CSL, and
+CSL-Daily. Our results demonstrate that TCNet consistently achieves
+state-of-the-art performance. For example, we improve over the previous
+state-of-the-art by 1.5% and 1.0% word error rate on PHOENIX14 and
+PHOENIX14-T, respectively.
+
+
+
+
+
+ + ☆ HVDistill: Transferring Knowledge from Images to Point Clouds via + Unsupervised Hybrid-View Distillation + + +
+ We present a hybrid-view-based knowledge distillation framework, termed +HVDistill, to guide the feature learning of a point cloud neural network with a +pre-trained image network in an unsupervised manner. By exploiting the +geometric relationship between RGB cameras and LiDAR sensors, the +correspondence between the two modalities based on both image-plane view and +bird-eye view can be established, which facilitates representation learning. +Specifically, the image-plane correspondences can be simply obtained by +projecting the point clouds, while the bird-eye-view correspondences can be +achieved by lifting pixels to the 3D space with the predicted depths under the +supervision of projected point clouds. The image teacher networks provide rich +semantics from the image-plane view and meanwhile acquire geometric information +from the bird-eye view. Indeed, image features from the two views naturally +complement each other and together can ameliorate the learned feature +representation of the point cloud student networks. Moreover, with a +self-supervised pre-trained 2D network, HVDistill requires neither 2D nor 3D +annotations. We pre-train our model on nuScenes dataset and transfer it to +several downstream tasks on nuScenes, SemanticKITTI, and KITTI datasets for +evaluation. Extensive experimental results show that our method achieves +consistent improvements over the baseline trained from scratch and +significantly outperforms the existing schemes. Codes are available at +git@github.com:zhangsha1024/HVDistill.git. + +
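+ A small, hypothetical sketch of the image-plane correspondence step mentioned above: projecting LiDAR points through the calibration pairs each visible 3D point with a pixel location where an image feature can be sampled (names and conventions are assumptions, not the authors' code).
+```python
+import numpy as np
+
+def image_plane_correspondences(points, K, T_cam_lidar, h, w):
+    # points: (N, 3) LiDAR points; K: (3, 3) intrinsics; T_cam_lidar: (4, 4)
+    # extrinsics mapping LiDAR coordinates into the camera frame.
+    homo = np.concatenate([points, np.ones((points.shape[0], 1))], axis=1)
+    cam = (T_cam_lidar @ homo.T).T[:, :3]
+    valid = cam[:, 2] > 0.1                              # in front of the camera
+    uv = (K @ cam[valid].T).T
+    uv = uv[:, :2] / uv[:, 2:3]
+    inside = (uv[:, 0] >= 0) & (uv[:, 0] < w) & (uv[:, 1] >= 0) & (uv[:, 1] < h)
+    return np.where(valid)[0][inside], uv[inside]        # point indices, pixels
+```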
+
+
+
+
+ + ☆ Aerial Lifting: Neural Urban Semantic and Building Instance Lifting from + Aerial Imagery CVPR 2024 + + +
+ We present a neural radiance field method for urban-scale semantic and +building-level instance segmentation from aerial images by lifting noisy 2D +labels to 3D. This is a challenging problem due to two primary reasons. +Firstly, objects in urban aerial images exhibit substantial variations in size, +including buildings, cars, and roads, which pose a significant challenge for +accurate 2D segmentation. Secondly, the 2D labels generated by existing +segmentation methods suffer from the multi-view inconsistency problem, +especially in the case of aerial images, where each image captures only a small +portion of the entire scene. To overcome these limitations, we first introduce +a scale-adaptive semantic label fusion strategy that enhances the segmentation +of objects of varying sizes by combining labels predicted from different +altitudes, harnessing the novel-view synthesis capabilities of NeRF. We then +introduce a novel cross-view instance label grouping strategy based on the 3D +scene representation to mitigate the multi-view inconsistency problem in the 2D +instance labels. Furthermore, we exploit multi-view reconstructed depth priors +to improve the geometric quality of the reconstructed radiance field, resulting +in enhanced segmentation results. Experiments on multiple real-world +urban-scale datasets demonstrate that our approach outperforms existing +methods, highlighting its effectiveness. + +
+
+ comment: CVPR 2024: https://zyqz97.github.io/Aerial_Lifting/ +
+
+
+
+
+ + ☆ Dynamic Tuning Towards Parameter and Inference Efficiency for ViT + Adaptation + + +
+  Existing parameter-efficient fine-tuning (PEFT) methods have achieved
+significant success on vision transformer (ViT) adaptation by improving
+parameter efficiency. However, enhancing inference efficiency during
+adaptation remains underexplored. This limits the broader application of
+pre-trained ViT models, especially when the model is computationally
+expensive. In this paper, we propose Dynamic Tuning (DyT), a novel approach to
+improve both parameter and inference efficiency for ViT adaptation.
+Specifically, besides using the lightweight adapter modules, we propose a
+token dispatcher to distinguish informative tokens from less important ones,
+allowing the latter to dynamically skip the original block, thereby reducing
+the redundant computation during inference. Additionally, we explore multiple
+design variants to find the best practice of DyT. Finally, inspired by the
+mixture-of-experts (MoE) mechanism, we introduce an enhanced adapter to
+further boost the adaptation performance. We validate DyT across various
+tasks, including image/video recognition and semantic segmentation. For
+instance, DyT achieves comparable or even superior performance compared to
+existing PEFT methods while using only 71%-85% of their FLOPs on the VTAB-1K
+benchmark.
+
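+ A hypothetical sketch of a token dispatcher in this spirit (not the paper's implementation): a tiny scorer picks the most informative tokens, only those pass through the original block, and the rest skip it. The hard top-k selection here is a simplification of whatever routing the method actually uses.
+```python
+import torch
+import torch.nn as nn
+
+class TokenDispatcher(nn.Module):
+    def __init__(self, dim, keep_ratio=0.7):
+        super().__init__()
+        self.scorer = nn.Linear(dim, 1)   # lightweight per-token importance score
+        self.keep_ratio = keep_ratio
+
+    def forward(self, tokens, block):
+        # tokens: (B, N, D); block: the original (frozen) transformer block.
+        scores = self.scorer(tokens).squeeze(-1)               # (B, N)
+        k = max(1, int(tokens.shape[1] * self.keep_ratio))
+        idx = scores.topk(k, dim=1).indices                    # informative tokens
+        gather_idx = idx.unsqueeze(-1).expand(-1, -1, tokens.shape[-1])
+        picked = torch.gather(tokens, 1, gather_idx)
+        updated = block(picked)                                # only these pay FLOPs
+        out = tokens.clone()                                   # others skip the block
+        out.scatter_(1, gather_idx, updated)
+        return out
+```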
+
+
+
+
+ + ☆ Federated Modality-specific Encoders and Multimodal Anchors for + Personalized Brain Tumor Segmentation AAAI 2024 + + +
+  Most existing federated learning (FL) methods for medical image analysis
+have only considered intra-modal heterogeneity, limiting their applicability to
+multimodal imaging applications. In practice, it is not uncommon that some FL
+participants only possess a subset of the complete imaging modalities, posing
+inter-modal heterogeneity as a challenge to effectively training a global model
+on all participants' data. In addition, each participant would expect to obtain
+a personalized model tailored for its local data characteristics from the FL in
+such a scenario. In this work, we propose a new FL framework with federated
+modality-specific encoders and multimodal anchors (FedMEMA) to simultaneously
+address the two concurrent issues. Above all, FedMEMA employs an exclusive
+encoder for each modality to account for the inter-modal heterogeneity in the
+first place. In the meantime, while the encoders are shared by the
+participants, the decoders are personalized to meet individual needs.
+Specifically, a server with full-modal data employs a fusion decoder to
+aggregate and fuse representations from all modality-specific encoders, thus
+bridging the modalities and, in turn, optimizing the encoders via
+backpropagation. Meanwhile, multiple anchors are extracted from the fused
+multimodal representations and distributed to the clients in addition to the
+encoder parameters. On the other end, the clients with incomplete modalities
+calibrate their missing-modal representations toward the global full-modal
+anchors via scaled dot-product cross-attention, making up for the information
+lost due to absent modalities while adapting the representations of present
+ones. FedMEMA is validated on the BraTS 2020 benchmark for multimodal brain
+tumor segmentation. Results show that it outperforms various up-to-date
+methods for multimodal and personalized FL and that its novel designs are
+effective. Our code is available.
+
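+ A minimal, hypothetical sketch of the anchor-based calibration described above: a client representation attends to the server-provided full-modal anchors with scaled dot-product attention and is corrected by the attended mixture. The shapes and the residual form are assumptions, not the paper's exact design.
+```python
+import torch
+import torch.nn.functional as F
+
+def calibrate_with_anchors(local_feat, anchors):
+    # local_feat: (B, D) client representation with missing modalities;
+    # anchors: (K, D) global full-modal anchors distributed by the server.
+    d = local_feat.shape[-1]
+    attn = F.softmax(local_feat @ anchors.T / d ** 0.5, dim=-1)   # (B, K)
+    return local_feat + attn @ anchors                             # residual calibration
+```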
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ OpenOcc: Open Vocabulary 3D Scene Reconstruction via Occupancy + Representation + + +
+  3D reconstruction has been widely used in autonomous navigation for mobile
+robotics. However, prior research can only provide the basic geometric
+structure without the capability of open-world scene understanding, limiting
+advanced tasks like human interaction and visual navigation. Moreover,
+traditional 3D scene understanding approaches rely on expensive labeled 3D
+datasets to train a model for a single task with supervision. Thus, geometric
+reconstruction with zero-shot scene understanding, i.e., open-vocabulary 3D
+understanding and reconstruction, is crucial for the future development of
+mobile robots. In this paper, we propose OpenOcc, a novel framework unifying
+3D scene reconstruction and open-vocabulary understanding with neural
+radiance fields. We model the geometric structure of the scene with occupancy
+representation and distill the pre-trained open vocabulary model into a 3D
+language field via volume rendering for zero-shot inference. Furthermore, we
+propose a novel semantic-aware confidence propagation (SCP) method to relieve
+the issue of language field representation degeneracy caused by inconsistent
+measurements in distilled features. Experimental results show that our
+approach achieves competitive performance in 3D scene understanding tasks,
+especially for small and long-tail objects.
+
+
+
+
+
+ + ☆ SETA: Semantic-Aware Token Augmentation for Domain Generalization + + +
+ Domain generalization (DG) aims to enhance the model robustness against +domain shifts without accessing target domains. A prevalent category of methods +for DG is data augmentation, which focuses on generating virtual samples to +simulate domain shifts. However, existing augmentation techniques in DG are +mainly tailored for convolutional neural networks (CNNs), with limited +exploration in token-based architectures, i.e., vision transformer (ViT) and +multi-layer perceptrons (MLP) models. In this paper, we study the impact of +prior CNN-based augmentation methods on token-based models, revealing their +performance is suboptimal due to the lack of incentivizing the model to learn +holistic shape information. To tackle the issue, we propose the SEmantic-aware +Token Augmentation (SETA) method. SETA transforms token features by perturbing +local edge cues while preserving global shape features, thereby enhancing the +model learning of shape information. To further enhance the generalization +ability of the model, we introduce two stylized variants of our method combined +with two state-of-the-art style augmentation methods in DG. We provide a +theoretical insight into our method, demonstrating its effectiveness in +reducing the generalization risk bound. Comprehensive experiments on five +benchmarks prove that our method achieves SOTA performances across various ViT +and MLP architectures. Our code is available at +https://github.com/lingeringlight/SETA. + +
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ☆ PAON: A New Neuron Model using Padé Approximants ICIP 2024 + + +
+  Convolutional neural networks (CNNs) are built upon the classical
+McCulloch-Pitts neuron model, which is essentially a linear model, where the
+nonlinearity is provided by a separate activation function. Several researchers
+have proposed enhanced neuron models, including quadratic neurons, generalized
+operational neurons, generative neurons, and super neurons, with stronger
+nonlinearity than that provided by the pointwise activation function. There has
+also been a proposal to use Padé approximation as a generalized activation
+function. In this paper, we introduce a brand new neuron model called Pade
+neurons (Paons), inspired by Padé approximants, which provide the best
+mathematical approximation of a transcendental function as a ratio of
+polynomials of given orders. We show that Paons are a superset of all
+other proposed neuron models. Hence, the basic neuron in any known CNN model
+can be replaced by Paons. In this paper, we extend the well-known ResNet to
+PadeNet (built by Paons) to demonstrate the concept. Our experiments on the
+single-image super-resolution task show that PadeNets can obtain better results
+than competing architectures.
+
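+ A hypothetical sketch of a Padé-style unit: the output is a learned ratio of polynomials applied to a linear (e.g., convolutional) pre-activation. The polynomial orders and the absolute-value stabilisation of the denominator are assumptions for illustration, not the paper's exact formulation.
+```python
+import torch
+import torch.nn as nn
+
+class PadeActivation(nn.Module):
+    def __init__(self, m=3, n=2):
+        super().__init__()
+        self.a = nn.Parameter(torch.randn(m + 1) * 0.1)   # numerator coefficients
+        self.b = nn.Parameter(torch.randn(n) * 0.1)       # denominator coefficients
+
+    def forward(self, x):
+        num = sum(a_k * x ** k for k, a_k in enumerate(self.a))
+        # abs() keeps the denominator away from zero (a common stabilisation)
+        den = 1.0 + sum((b_k * x ** (k + 1)).abs() for k, b_k in enumerate(self.b))
+        return num / den                                   # P_m(x) / Q_n(x)
+
+# One way such a unit could replace the pointwise activation after a convolution:
+pade_conv = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), PadeActivation())
+```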
+
+ comment: Submitted to IEEE ICIP 2024 +
+
+
+
+
+ + ☆ Deep Medial Voxels: Learned Medial Axis Approximations for Anatomical + Shape Modeling + + +
+  Shape reconstruction from imaging volumes is a recurring need in medical
+image analysis. Common workflows start with a segmentation step, followed by
+careful post-processing and, finally, ad hoc meshing algorithms. As this
+sequence can be time-consuming, neural networks are trained to reconstruct
+shapes through template deformation. These networks deliver state-of-the-art
+results without manual intervention, but, so far, they have primarily been
+evaluated on anatomical shapes with little topological variety between
+individuals. In contrast, other works favor learning implicit shape models,
+which have multiple benefits for meshing and visualization. Our work follows
+this direction by introducing deep medial voxels, a semi-implicit
+representation that faithfully approximates the topological skeleton from
+imaging volumes and eventually leads to shape reconstruction via convolution
+surfaces. Our reconstruction technique shows potential for both visualization
+and computer simulations.
+
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ EMIE-MAP: Large-Scale Road Surface Reconstruction Based on Explicit Mesh + and Implicit Encoding + + +
+ Road surface reconstruction plays a vital role in autonomous driving systems, +enabling road lane perception and high-precision mapping. Recently, neural +implicit encoding has achieved remarkable results in scene representation, +particularly in the realistic rendering of scene textures. However, it faces +challenges in directly representing geometric information for large-scale +scenes. To address this, we propose EMIE-MAP, a novel method for large-scale +road surface reconstruction based on explicit mesh and implicit encoding. The +road geometry is represented using explicit mesh, where each vertex stores +implicit encoding representing the color and semantic information. To overcome +the difficulty in optimizing road elevation, we introduce a trajectory-based +elevation initialization and an elevation residual learning method based on +Multi-Layer Perceptron (MLP). Additionally, by employing implicit encoding and +multi-camera color MLPs decoding, we achieve separate modeling of scene +physical properties and camera characteristics, allowing surround-view +reconstruction compatible with different camera models. Our method achieves +remarkable road surface reconstruction performance in a variety of real-world +challenging scenarios. + +
+
+
+
+
+ + ☆ Infinite-ID: Identity-preserved Personalization via ID-semantics + Decoupling Paradigm + + +
+ Drawing on recent advancements in diffusion models for text-to-image +generation, identity-preserved personalization has made significant progress in +accurately capturing specific identities with just a single reference image. +However, existing methods primarily integrate reference images within the text +embedding space, leading to a complex entanglement of image and text +information, which poses challenges for preserving both identity fidelity and +semantic consistency. To tackle this challenge, we propose Infinite-ID, an +ID-semantics decoupling paradigm for identity-preserved personalization. +Specifically, we introduce identity-enhanced training, incorporating an +additional image cross-attention module to capture sufficient ID information +while deactivating the original text cross-attention module of the diffusion +model. This ensures that the image stream faithfully represents the identity +provided by the reference image while mitigating interference from textual +input. Additionally, we introduce a feature interaction mechanism that combines +a mixed attention module with an AdaIN-mean operation to seamlessly merge the +two streams. This mechanism not only enhances the fidelity of identity and +semantic consistency but also enables convenient control over the styles of the +generated images. Extensive experimental results on both raw photo generation +and style image generation demonstrate the superior performance of our proposed +method. + +
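+ A speculative sketch of what an "AdaIN-mean"-style merge could look like: shift only the channel-wise mean statistics of one stream toward the other. This is a guess at the operation named above, not the authors' definition.
+```python
+import torch
+
+def adain_mean(content, style):
+    # content, style: (B, C, H, W); align only the per-channel means,
+    # leaving the content stream's variance untouched.
+    c_mean = content.mean(dim=(2, 3), keepdim=True)
+    s_mean = style.mean(dim=(2, 3), keepdim=True)
+    return content - c_mean + s_mean
+```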
+
+
+
+
+ + ☆ DVN-SLAM: Dynamic Visual Neural SLAM Based on Local-Global Encoding + + +
+ Recent research on Simultaneous Localization and Mapping (SLAM) based on +implicit representation has shown promising results in indoor environments. +However, there are still some challenges: the limited scene representation +capability of implicit encodings, the uncertainty in the rendering process from +implicit representations, and the disruption of consistency by dynamic objects. +To address these challenges, we propose a real-time dynamic visual SLAM system +based on local-global fusion neural implicit representation, named DVN-SLAM. To +improve the scene representation capability, we introduce a local-global fusion +neural implicit representation that enables the construction of an implicit map +while considering both global structure and local details. To tackle +uncertainties arising from the rendering process, we design an information +concentration loss for optimization, aiming to concentrate scene information on +object surfaces. The proposed DVN-SLAM achieves competitive performance in +localization and mapping across multiple datasets. More importantly, DVN-SLAM +demonstrates robustness in dynamic scenes, a trait that sets it apart from +other NeRF-based methods. + +
+
+
+
+
+ + ☆ Modality-Agnostic fMRI Decoding of Vision and Language ICLR 2024 + + +
+  Previous studies have shown that it is possible to map brain activation data
+of subjects viewing images onto the feature representation space of not only
+vision models (modality-specific decoding) but also language models
+(cross-modal decoding). In this work, we introduce and use a new large-scale
+fMRI dataset (~8,500 trials per subject) of people watching both images and
+text descriptions of such images. This novel dataset enables the development of
+modality-agnostic decoders: a single decoder that can predict which stimulus a
+subject is seeing, irrespective of the modality (image or text) in which the
+stimulus is presented. We train and evaluate such decoders to map brain signals
+onto stimulus representations from a large range of publicly available vision,
+language and multimodal (vision+language) models. Our findings reveal that (1)
+modality-agnostic decoders perform as well as (and sometimes even better than)
+modality-specific decoders; (2) modality-agnostic decoders mapping brain data
+onto representations from unimodal models perform as well as decoders relying
+on multimodal representations; and (3) while language and low-level visual
+(occipital) brain regions are best at decoding text and image stimuli,
+respectively, high-level visual (temporal) regions perform well on both
+stimulus types.
+
+
+ comment: To appear at ICLR 2024 workshop on Representational Alignment + (Re-Align) +
+
+
+
+
+ + ☆ BEVCar: Camera-Radar Fusion for BEV Map and Object Segmentation + + +
+  Semantic scene segmentation from a bird's-eye-view (BEV) perspective plays a
+crucial role in facilitating planning and decision-making for mobile robots.
+Although recent vision-only methods have demonstrated notable advancements in
+performance, they often struggle under adverse illumination conditions such as
+rain or nighttime. While active sensors offer a solution to this challenge, the
+prohibitively high cost of LiDARs remains a limiting factor. Fusing camera data
+with automotive radars offers a less expensive alternative but has received
+less attention in prior research. In this work, we aim to advance this
+promising avenue by introducing BEVCar, a novel approach for joint BEV object
+and map segmentation. The core novelty of our approach lies in first learning a
+point-based encoding of raw radar data, which is then leveraged to efficiently
+initialize the lifting of image features into the BEV space. We perform
+extensive experiments on the nuScenes dataset and demonstrate that BEVCar
+outperforms the current state of the art. Moreover, we show that incorporating
+radar information significantly enhances robustness in challenging
+environmental conditions and improves segmentation performance for distant
+objects. To foster future research, we provide the weather split of the
+nuScenes dataset used in our experiments, along with our code and trained
+models at http://bevcar.cs.uni-freiburg.de.
+
+
+
+
+
+ + ☆ 3R-INN: How to be climate friendly while consuming/delivering videos? + + +
+  The consumption of a video requires a considerable amount of energy during
+the various stages of its life-cycle. With a billion hours of video consumed
+daily, this contributes significantly to greenhouse gas emissions.
+Therefore, reducing the end-to-end carbon footprint of the video chain, while
+preserving the quality of experience at the user side, is of high importance.
+To contribute in an impactful manner, we propose 3R-INN, a single lightweight
+invertible network that performs three tasks at once: given a high-resolution
+grainy image, it Rescales it to a lower resolution, Removes film grain and
+Reduces its power consumption when displayed. Providing such
+minimum-viable-quality content helps reduce the energy consumption during
+encoding, transmission, decoding and display. 3R-INN also offers the
+possibility to restore either the high-resolution grainy original image or a
+grain-free version, thanks to its invertibility and the disentanglement of
+high-frequency content, without transmitting auxiliary data. Experiments show
+that, while enabling significant energy savings for encoding (78%), decoding
+(77%) and rendering (5% to 20%), 3R-INN outperforms state-of-the-art film grain
+synthesis and energy-aware methods and achieves state-of-the-art performance on
+the rescaling task on different test-sets.
+
+
+
+
+
+ + ☆ Relational Representation Learning Network for Cross-Spectral Image + Patch Matching + + +
+ Recently, feature relation learning has drawn widespread attention in +cross-spectral image patch matching. However, existing related research focuses +on extracting diverse relations between image patch features and ignores +sufficient intrinsic feature representations of individual image patches. +Therefore, an innovative relational representation learning idea is proposed +for the first time, which simultaneously focuses on sufficiently mining the +intrinsic features of individual image patches and the relations between image +patch features. Based on this, we construct a lightweight Relational +Representation Learning Network (RRL-Net). Specifically, we innovatively +construct an autoencoder to fully characterize the individual intrinsic +features, and introduce a Feature Interaction Learning (FIL) module to extract +deep-level feature relations. To further fully mine individual intrinsic +features, a lightweight Multi-dimensional Global-to-Local Attention (MGLA) +module is constructed to enhance the global feature extraction of individual +image patches and capture local dependencies within global features. By +combining the MGLA module, we further explore the feature extraction network +and construct an Attention-based Lightweight Feature Extraction (ALFE) network. +In addition, we propose a Multi-Loss Post-Pruning (MLPP) optimization strategy, +which greatly promotes network optimization while avoiding increases in +parameters and inference time. Extensive experiments demonstrate that our +RRL-Net achieves state-of-the-art (SOTA) performance on multiple public +datasets. Our code will be made public later. + +
+
+
+
+
+ + ☆ LSKNet: A Foundation Lightweight Backbone for Remote Sensing + + +
+ Remote sensing images pose distinct challenges for downstream tasks due to +their inherent complexity. While a considerable amount of research has been +dedicated to remote sensing classification, object detection and semantic +segmentation, most of these studies have overlooked the valuable prior +knowledge embedded within remote sensing scenarios. Such prior knowledge can be +useful because remote sensing objects may be mistakenly recognized without +referencing a sufficiently long-range context, which can vary for different +objects. This paper considers these priors and proposes a lightweight Large +Selective Kernel Network (LSKNet) backbone. LSKNet can dynamically adjust its +large spatial receptive field to better model the ranging context of various +objects in remote sensing scenarios. To our knowledge, large and selective +kernel mechanisms have not been previously explored in remote sensing images. +Without bells and whistles, our lightweight LSKNet sets new state-of-the-art +scores on standard remote sensing classification, object detection and semantic +segmentation benchmarks. Our comprehensive analysis further validated the +significance of the identified priors and the effectiveness of LSKNet. The code +is available at https://github.com/zcablii/LSKNet. + +
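+ A hypothetical sketch of a large selective kernel block in this spirit (kernel sizes, dilation, and the selection head are assumptions): two depth-wise branches with different receptive fields are blended by a spatial selection map, so the effective context range can vary per location.
+```python
+import torch
+import torch.nn as nn
+
+class LargeSelectiveKernel(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.small = nn.Conv2d(dim, dim, 5, padding=2, groups=dim)
+        self.large = nn.Conv2d(dim, dim, 7, padding=9, groups=dim, dilation=3)
+        self.select = nn.Conv2d(2, 2, 7, padding=3)     # spatial selection map
+
+    def forward(self, x):
+        a, b = self.small(x), self.large(x)
+        pooled = torch.cat([(a + b).mean(1, keepdim=True),
+                            (a + b).amax(1, keepdim=True)], dim=1)   # (B, 2, H, W)
+        w = torch.sigmoid(self.select(pooled))
+        return a * w[:, :1] + b * w[:, 1:]              # per-pixel branch mixing
+```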
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2303.09030 +
+
+
+
+
+ + ☆ 3DGS-Calib: 3D Gaussian Splatting for Multimodal SpatioTemporal + Calibration + + +
+ Reliable multimodal sensor fusion algorithms require accurate spatiotemporal +calibration. Recently, targetless calibration techniques based on implicit +neural representations have proven to provide precise and robust results. +Nevertheless, such methods are inherently slow to train given the high +computational overhead caused by the large number of sampled points required +for volume rendering. With the recent introduction of 3D Gaussian Splatting as +a faster alternative to implicit representation methods, we propose to leverage +this new rendering approach to achieve faster multi-sensor calibration. We +introduce 3DGS-Calib, a new calibration method that relies on the speed and +rendering accuracy of 3D Gaussian Splatting to achieve multimodal +spatiotemporal calibration that is accurate, robust, and with a substantial +speed-up compared to methods relying on implicit neural representations. We +demonstrate the superiority of our proposal with experimental results on +sequences from KITTI-360, a widely used driving dataset. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ GCAM: Gaussian and causal-attention model of food fine-grained + recognition + + +
+  Currently, most food recognition relies on deep learning for category
+classification. However, these approaches struggle to effectively distinguish
+between visually similar food samples, highlighting the pressing need to
+address fine-grained issues in food recognition. To mitigate these challenges,
+we propose the adoption of a Gaussian and causal-attention model for
+fine-grained object recognition. In particular, we train the model to obtain
+Gaussian features over target regions, followed by the extraction of
+fine-grained features from the objects, thereby enhancing the feature mapping
+capabilities of the target regions. To counteract data drift resulting from
+uneven data distributions, we employ a counterfactual reasoning approach. By
+using counterfactual interventions, we analyze the impact of the learned image
+attention mechanism on network predictions, enabling the network to acquire
+more useful attention weights for fine-grained image recognition. Finally, we
+design a learnable loss strategy to balance training stability across various
+modules, ultimately improving the accuracy of the final target recognition. We
+validate our approach on four relevant datasets, demonstrating its excellent
+performance across all of them. We experimentally show that GCAM surpasses
+state-of-the-art methods on the ETH-FOOD101, UECFOOD256, and Vireo-FOOD172
+datasets. Furthermore, our approach also achieves state-of-the-art performance
+on the CUB-200 dataset.
+
+
+ comment: 23 pages, 11 figures +
+
+
+
+
+ + ☆ Boosting Continual Learning of Vision-Language Models via + Mixture-of-Experts Adapters CVPR2024 + + +
+  Continual learning can empower vision-language models to continuously acquire
+new knowledge, without the need for access to the entire historical dataset.
+However, mitigating the performance degradation in large-scale models is
+non-trivial due to (i) parameter shifts throughout lifelong learning and (ii)
+significant computational burdens associated with full-model tuning. In this
+work, we present a parameter-efficient continual learning framework to
+alleviate long-term forgetting in incremental learning with vision-language
+models. Our approach involves the dynamic expansion of a pre-trained CLIP
+model, through the integration of Mixture-of-Experts (MoE) adapters in response
+to new tasks. To preserve the zero-shot recognition capability of
+vision-language models, we further introduce a Distribution Discriminative
+Auto-Selector (DDAS) that automatically routes in-distribution and
+out-of-distribution inputs to the MoE Adapter and the original CLIP,
+respectively. Through extensive experiments across various settings, our
+proposed method consistently outperforms previous state-of-the-art approaches
+while concurrently reducing parameter training burdens by 60%. Our code is
+available at https://github.com/JiazuoYu/MoE-Adapters4CL
+
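+ A minimal, hypothetical sketch of the routing idea behind such an auto-selector: compare an input feature against prototypes of the tasks seen so far and fall back to the frozen zero-shot model when nothing is close enough. The prototype-plus-threshold mechanism is an assumption, not the paper's DDAS.
+```python
+import torch
+
+def ddas_route(feat, task_prototypes, threshold=0.6):
+    # feat: (D,) input feature; task_prototypes: (T, D), one prototype per
+    # task observed during continual learning.
+    sims = torch.nn.functional.cosine_similarity(feat.unsqueeze(0), task_prototypes)
+    return "moe_adapter" if sims.max() >= threshold else "zero_shot_clip"
+```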
+
+ comment: This work is accepted by CVPR2024. More modifications may be + performed +
+
+
+
+
+ + ♻ ☆ Unsupervised Modality-Transferable Video Highlight Detection with + Representation Activation Sequence Learning + + +
+ Identifying highlight moments of raw video materials is crucial for improving +the efficiency of editing videos that are pervasive on internet platforms. +However, the extensive work of manually labeling footage has created obstacles +to applying supervised methods to videos of unseen categories. The absence of +an audio modality that contains valuable cues for highlight detection in many +videos also makes it difficult to use multimodal strategies. In this paper, we +propose a novel model with cross-modal perception for unsupervised highlight +detection. The proposed model learns representations with visual-audio level +semantics from image-audio pair data via a self-reconstruction task. To achieve +unsupervised highlight detection, we investigate the latent representations of +the network and propose the representation activation sequence learning (RASL) +module with k-point contrastive learning to learn significant representation +activations. To connect the visual modality with the audio modality, we use the +symmetric contrastive learning (SCL) module to learn the paired visual and +audio representations. Furthermore, an auxiliary task of masked feature vector +sequence (FVS) reconstruction is simultaneously conducted during pretraining +for representation enhancement. During inference, the cross-modal pretrained +model can generate representations with paired visual-audio semantics given +only the visual modality. The RASL module is used to output the highlight +scores. The experimental results show that the proposed framework achieves +superior performance compared to other state-of-the-art approaches. + +
+
+ comment: Accepted by IEEE Transactions on Image Processing, 2024 +
+
+
+
+
+ + ♻ ☆ Histo-Genomic Knowledge Distillation For Cancer Prognosis From + Histopathology Whole Slide Images + + +
+ Histo-genomic multi-modal methods have recently emerged as a powerful +paradigm, demonstrating significant potential for improving cancer prognosis. +However, genome sequencing, unlike histopathology imaging, is still not widely +accessible in underdeveloped regions, limiting the application of these +multi-modal approaches in clinical settings. To address this, we propose a +novel Genome-informed Hyper-Attention Network, termed G-HANet, which is capable +of effectively distilling the histo-genomic knowledge during training to +elevate uni-modal whole slide image (WSI)-based inference for the first time. +Compared with traditional knowledge distillation methods (i.e., teacher-student +architecture) in other tasks, our end-to-end model is superior in terms of +training efficiency and learning cross-modal interactions. Specifically, the +network comprises the cross-modal associating branch (CAB) and hyper-attention +survival branch (HSB). Through the genomic data reconstruction from WSIs, CAB +effectively distills the associations between functional genotypes and +morphological phenotypes and offers insights into the gene expression profiles +in the feature space. Subsequently, HSB leverages the distilled histo-genomic +associations as well as the generated morphology-based weights to achieve the +hyper-attention modeling of the patients from both histopathology and genomic +perspectives to improve cancer prognosis. Extensive experiments are conducted +on five TCGA benchmarking datasets and the results demonstrate that G-HANet +significantly outperforms the state-of-the-art WSI-based methods and achieves +competitive performance with genome-based and multi-modal methods. G-HANet is +expected to be explored as a useful tool by the research community to address +the current bottleneck of insufficient histo-genomic data pairing in the +context of cancer prognosis and precision oncology. + +
+
+
+
+
+ + ♻ ☆ Stop Reasoning! When Multimodal LLMs with Chain-of-Thought Reasoning + Meets Adversarial Images + + +
+  Recently, Multimodal LLMs (MLLMs) have shown a great ability to understand
+images. However, like traditional vision models, they are still vulnerable to
+adversarial images. Meanwhile, Chain-of-Thought (CoT) reasoning has been widely
+explored on MLLMs, which not only improves a model's performance but also
+enhances its explainability by giving intermediate reasoning steps.
+Nevertheless, there is still a lack of study regarding MLLMs' adversarial
+robustness with CoT and an understanding of what the rationale looks like when
+MLLMs infer wrong answers with adversarial images. Our research evaluates the
+adversarial robustness of MLLMs when employing CoT reasoning, finding that CoT
+marginally improves adversarial robustness against existing attack methods.
+Moreover, we introduce a novel stop-reasoning attack technique that effectively
+bypasses the CoT-induced robustness enhancements. Finally, we demonstrate the
+alterations in CoT reasoning when MLLMs confront adversarial images, shedding
+light on their reasoning process under adversarial attacks.
+
+
+
+
+
+ + ♻ ☆ ECAMP: Entity-centered Context-aware Medical Vision Language + Pre-training + + +
+ Despite significant advancements in medical vision-language pre-training, +existing methods have largely overlooked the inherent entity-specific context +within radiology reports and the complex cross-modality contextual +relationships between text and images. To close this gap, we propose a novel +Entity-centered Context-aware Medical Vision-language Pre-training (ECAMP) +framework, which is designed to enable a more entity-centered and +context-sensitive interpretation of medical data. Utilizing the recent powerful +large language model, we distill entity-centered context from medical reports, +which enables ECAMP to gain more effective supervision from the text modality. +By further pre-training our model with carefully designed entity-aware, +context-enhanced masked language modeling and context-guided super-resolution +tasks, ECAMP significantly refines the interplay between text and image +modalities, leading to an enhanced ability to extract entity-centered +contextual features. Besides, our proposed multi-scale context fusion design +also improves the semantic integration of both coarse and fine-level image +representations, prompting better performance for multi-scale downstream +applications. Combining these components leads to significant performance leaps +over current state-of-the-art methods and establishes a new standard for +cross-modality learning in medical imaging, whose effectiveness is demonstrated +by our extensive experiments on various tasks including classification, +segmentation, and detection across several public datasets. Code and models are +available at https://github.com/ToniChopp/ECAMP. + +
+
+
+
+
+ + ♻ ☆ Matching Non-Identical Objects + + +
+  Similar but not identical objects are everywhere in the world. Examples
+include four-legged animals such as dogs and cats, cars of different models,
+similar flowers in various colors, and countless others. In this study, we
+address a novel task of matching such non-identical objects. We propose a
+simple weighting scheme of descriptors that enhances various sparse image
+matching methods, which were originally designed for matching identical objects
+captured from different perspectives, and achieve semantically robust matching.
+The experiments show successful matching between non-identical objects in
+various cases including domain shift. Further, we present a first evaluation of
+the robustness of the image matching methods under common corruptions, which
+are a form of domain shift, and show that the proposed method improves matching
+in this case as well.
+
+
+ comment: 10+7 pages, 10 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Hybrid Reasoning Based on Large Language Models for Autonomous Car + Driving + + +
+  Large Language Models (LLMs) have garnered significant attention for their
+ability to understand text and images, generate human-like text, and perform
+complex reasoning tasks. However, their ability to generalize this advanced
+reasoning with a combination of natural language text for decision-making in
+dynamic situations requires further exploration. In this study, we investigate
+how well LLMs can adapt and apply a combination of arithmetic and common-sense
+reasoning, particularly in autonomous driving scenarios. We hypothesize that
+LLMs' hybrid reasoning abilities can improve autonomous driving by enabling them
+to analyze detected objects and sensor data, understand driving regulations and
+physical laws, and offer additional context. This addresses complex scenarios,
+like decisions in low visibility (due to weather conditions), where traditional
+methods might fall short. We evaluated LLMs based on accuracy by comparing
+their answers with human-generated ground truth inside CARLA. The results
+showed that when a combination of images (detected objects) and sensor data is
+fed into the LLM, it can offer precise information for brake and throttle
+control in autonomous vehicles across various weather conditions. This
+formulation and its answers can assist decision-making for autopilot systems.
+
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ GRAM: Global Reasoning for Multi-Page VQA + + +
+  The increasing use of transformer-based large language models brings forward
+the challenge of processing long sequences. In document visual question
+answering (DocVQA), leading methods focus on the single-page setting, while
+documents can span hundreds of pages. We present GRAM, a method that seamlessly
+extends pre-trained single-page models to the multi-page setting, without
+requiring computationally-heavy pretraining. To do so, we leverage a
+single-page encoder for local page-level understanding, and enhance it with
+document-level designated layers and learnable tokens, facilitating the flow of
+information across pages for global reasoning. To ensure our model utilizes
+the newly introduced document tokens, we propose a tailored bias adaptation
+method. For additional computational savings during decoding, we introduce an
+optional compression stage using our compression-transformer
+(C-Former), reducing the encoded sequence length, thereby allowing a tradeoff
+between quality and latency. Extensive experiments showcase GRAM's
+state-of-the-art performance on the benchmarks for multi-page DocVQA,
+demonstrating the effectiveness of our approach.
+
+
+
+
+
+ + ♻ ☆ BraSyn 2023 challenge: Missing MRI synthesis and the effect of different + learning objectives MICCAI + + +
+ This work addresses the Brain Magnetic Resonance Image Synthesis for Tumor +Segmentation (BraSyn) challenge, which was hosted as part of the Brain Tumor +Segmentation (BraTS) challenge in 2023. In this challenge, researchers are +invited to synthesize a missing magnetic resonance image sequence, given other +available sequences, to facilitate tumor segmentation pipelines trained on +complete sets of image sequences. This problem can be tackled using deep +learning within the framework of paired image-to-image translation. In this +study, we propose investigating the effectiveness of a commonly used deep +learning framework, such as Pix2Pix, trained under the supervision of different +image-quality loss functions. Our results indicate that the use of different +loss functions significantly affects the synthesis quality. We systematically +study the impact of various loss functions in the multi-sequence MR image +synthesis setting of the BraSyn challenge. Furthermore, we demonstrate how +image synthesis performance can be optimized by combining different learning +objectives beneficially. + +
+
+ comment: minor changes, to be published as part of the 9th BrainLes: + International MICCAI Brain Lesion Workshop +
+
+
+
+
+ + ♻ ☆ Deep Homography Estimation for Visual Place Recognition AAAI2024 + + +
+  Visual place recognition (VPR) is a fundamental task for many applications
+such as robot localization and augmented reality. Recently, the hierarchical
+VPR methods have received considerable attention due to the trade-off between
+accuracy and efficiency. They usually first use global features to retrieve the
+candidate images, then verify the spatial consistency of matched local features
+for re-ranking. However, the latter typically relies on the RANSAC algorithm
+for fitting homography, which is time-consuming and non-differentiable. This
+forces existing methods to compromise by training the network only for global
+feature extraction. Here, we propose a transformer-based deep homography
+estimation (DHE) network that takes the dense feature map extracted by a
+backbone network as input and fits homography for fast and learnable geometric
+verification. Moreover, we design a re-projection error of inliers loss to
+train the DHE network without additional homography labels, which can also be
+jointly trained with the backbone network to help it extract the features that
+are more suitable for local matching. Extensive experiments on benchmark
+datasets show that our method can outperform several state-of-the-art methods.
+It is also more than one order of magnitude faster than the mainstream
+hierarchical VPR methods using RANSAC. The code is released at
+https://github.com/Lu-Feng/DHE-VPR.
+
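+ A hypothetical sketch of a re-projection-error-of-inliers loss of the kind described above: warp matched keypoints with the predicted homography and average the error over matches that re-project consistently, so no homography labels are needed. The inlier threshold and shapes are assumptions, not the authors' exact formulation.
+```python
+import torch
+
+def reprojection_inlier_loss(H, pts_a, pts_b, inlier_px=3.0):
+    # H: (3, 3) predicted homography; pts_a, pts_b: (N, 2) matched keypoints (float).
+    ones = torch.ones(pts_a.shape[0], 1)
+    proj = torch.cat([pts_a, ones], dim=1) @ H.T          # homogeneous warp
+    proj = proj[:, :2] / proj[:, 2:].clamp(min=1e-8)
+    err = (proj - pts_b).norm(dim=1)
+    inliers = err < inlier_px                             # keep consistent matches
+    return err[inliers].mean() if inliers.any() else err.mean()
+```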
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ Ricci flow-based brain surface covariance descriptors for diagnosing + Alzheimer's disease + + +
+ Automated feature extraction from MRI brain scans and diagnosis of +Alzheimer's disease are ongoing challenges. With advances in 3D imaging +technology, 3D data acquisition is becoming more viable and efficient than its +2D counterpart. Rather than using feature-based vectors, in this paper, for the +first time, we suggest a pipeline to extract novel covariance-based descriptors +from the cortical surface using the Ricci energy optimization. The covariance +descriptors are components of the nonlinear manifold of symmetric +positive-definite matrices, thus we focus on using the Gaussian radial basis +function to apply manifold-based classification to the 3D shape problem. +Applying this novel signature to the analysis of abnormal cortical brain +morphometry allows for diagnosing Alzheimer's disease. Experimental studies +performed on about two hundred 3D MRI brain models, gathered from Alzheimer's +Disease Neuroimaging Initiative (ADNI) dataset demonstrate the effectiveness of +our descriptors in achieving remarkable classification accuracy. + +
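+ A small, hypothetical sketch of the manifold-aware kernel such covariance descriptors call for: a Gaussian RBF built on the log-Euclidean distance between symmetric positive-definite matrices, which could then feed a kernel classifier. The specific distance and bandwidth are assumptions, not necessarily the paper's exact choice.
+```python
+import numpy as np
+from scipy.linalg import logm
+
+def log_euclidean_rbf(C1, C2, gamma=0.1):
+    # C1, C2: SPD covariance descriptors; distance measured between their
+    # matrix logarithms, then passed through a Gaussian RBF.
+    d = np.linalg.norm(np.real(logm(C1)) - np.real(logm(C2)), "fro")
+    return np.exp(-gamma * d ** 2)
+```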
+
+ comment: Accepted for publication in Biomedical Signal Processing and Control + journal +
+
+
+
+
+ + ♻ ☆ EmoCLIP: A Vision-Language Method for Zero-Shot Video Facial Expression + Recognition + + +
+ Facial Expression Recognition (FER) is a crucial task in affective computing, +but its conventional focus on the seven basic emotions limits its applicability +to the complex and expanding emotional spectrum. To address the issue of new +and unseen emotions present in dynamic in-the-wild FER, we propose a novel +vision-language model that utilises sample-level text descriptions (i.e. +captions of the context, expressions or emotional cues) as natural language +supervision, aiming to enhance the learning of rich latent representations, for +zero-shot classification. To test this, we evaluate using zero-shot +classification of the model trained on sample-level descriptions on four +popular dynamic FER datasets. Our findings show that this approach yields +significant improvements when compared to baseline methods. Specifically, for +zero-shot video FER, we outperform CLIP by over 10\% in terms of Weighted +Average Recall and 5\% in terms of Unweighted Average Recall on several +datasets. Furthermore, we evaluate the representations obtained from the +network trained using sample-level descriptions on the downstream task of +mental health symptom estimation, achieving performance comparable or superior +to state-of-the-art methods and strong agreement with human experts. Namely, we +achieve a Pearson's Correlation Coefficient of up to 0.85 on schizophrenia +symptom severity estimation, which is comparable to human experts' agreement. +The code is publicly available at: https://github.com/NickyFot/EmoCLIP. + +
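+ A minimal sketch of the standard CLIP-style objective that sample-level natural-language supervision builds on: a symmetric InfoNCE loss between video and caption embeddings. The temperature and normalisation follow common practice, not necessarily the paper's settings.
+```python
+import torch
+import torch.nn.functional as F
+
+def clip_contrastive_loss(video_emb, text_emb, temperature=0.07):
+    # video_emb, text_emb: (B, D) paired embeddings from the two encoders.
+    v = F.normalize(video_emb, dim=-1)
+    t = F.normalize(text_emb, dim=-1)
+    logits = v @ t.T / temperature                    # (B, B) similarity matrix
+    labels = torch.arange(v.shape[0])
+    return 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.T, labels))
+```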
+
+ comment: Accepted at FG'2024 +
+
+
+
+
+ + ♻ ☆ Learning Triangular Distribution in Visual World CVPR 2024 + + +
+  Convolutional neural networks are successful in pervasive vision tasks,
+including label distribution learning, which usually takes the form of learning
+an injection from the non-linear visual features to the well-defined labels.
+However, how the discrepancy between features is mapped to the label
+discrepancy is ambiguous, and its correctness is not guaranteed. To address
+these problems, we study the mathematical connection between a feature and its
+label, presenting a general and simple framework for label distribution
+learning. We propose a so-called Triangular Distribution Transform (TDT) to
+build an injective function between feature and label, guaranteeing that any
+symmetric feature discrepancy linearly reflects the difference between labels.
+The proposed TDT can be used as a plug-in in mainstream backbone networks to
+address different label distribution learning tasks. Experiments on Facial Age
+Recognition, Illumination Chromaticity Estimation, and Aesthetics assessment
+show that TDT achieves on-par or better results than prior arts.
+
+
+
  comment: Accepted by CVPR 2024 (11 pages, 5 figures)
+
+
+
+
+ + ♻ ☆ Learning Exhaustive Correlation for Spectral Super-Resolution: Where + Spatial-Spectral Attention Meets Linear Dependence + + +
+ Spectral super-resolution that aims to recover hyperspectral image (HSI) from +easily obtainable RGB image has drawn increasing interest in the field of +computational photography. The crucial aspect of spectral super-resolution lies +in exploiting the correlation within HSIs. However, two types of bottlenecks in +existing Transformers limit performance improvement and practical applications. +First, existing Transformers often separately emphasize either spatial-wise or +spectral-wise correlation, disrupting the 3D features of HSI and hindering the +exploitation of unified spatial-spectral correlation. Second, existing +self-attention mechanism always establishes full-rank correlation matrix by +learning the correlation between pairs of tokens, leading to its inability to +describe linear dependence widely existing in HSI among multiple tokens. To +address these issues, we propose a novel Exhaustive Correlation Transformer +(ECT) for spectral super-resolution. First, we propose a Spectral-wise +Discontinuous 3D (SD3D) splitting strategy, which models unified +spatial-spectral correlation by integrating spatial-wise continuous splitting +strategy and spectral-wise discontinuous splitting strategy. Second, we propose +a Dynamic Low-Rank Mapping (DLRM) model, which captures linear dependence among +multiple tokens through a dynamically calculated low-rank dependence map. By +integrating unified spatial-spectral attention and linear dependence, our ECT +can model exhaustive correlation within HSI. The experimental results on both +simulated and real data indicate that our method achieves state-of-the-art +performance. Codes and pretrained models will be available later. + +
+
+
+
+
+ + ♻ ☆ CMMMU: A Chinese Massive Multi-discipline Multimodal Understanding + Benchmark + + +
+ As the capabilities of large multimodal models (LMMs) continue to advance, +evaluating the performance of LMMs emerges as an increasing need. Additionally, +there is an even larger gap in evaluating the advanced knowledge and reasoning +abilities of LMMs in non-English contexts such as Chinese. We introduce CMMMU, +a new Chinese Massive Multi-discipline Multimodal Understanding benchmark +designed to evaluate LMMs on tasks demanding college-level subject knowledge +and deliberate reasoning in a Chinese context. CMMMU is inspired by and +strictly follows the annotation and analysis pattern of MMMU. + CMMMU includes 12k manually collected multimodal questions from college +exams, quizzes, and textbooks, covering six core disciplines: Art & Design, +Business, Science, Health & Medicine, Humanities & Social Science, and Tech & +Engineering, like its companion, MMMU. These questions span 30 subjects and +comprise 39 highly heterogeneous image types, such as charts, diagrams, maps, +tables, music sheets, and chemical structures. + CMMMU focuses on complex perception and reasoning with domain-specific +knowledge in the Chinese context. We evaluate 11 open-source LLMs and one +proprietary GPT-4V(ision). Even GPT-4V only achieves accuracies of 42%, +indicating a large space for improvement. CMMMU will boost the community to +build the next-generation LMMs towards expert artificial intelligence and +promote the democratization of LMMs by providing diverse language contexts. + +
+
+
+
+
+ + ♻ ☆ Is it Really Negative? Evaluating Natural Language Video Localization + Performance on Multiple Reliable Videos Pool + + +
+ With the explosion of multimedia content in recent years, Video Corpus Moment +Retrieval (VCMR), which aims to detect a video moment that matches a given +natural language query from multiple videos, has become a critical problem. +However, existing VCMR studies have a significant limitation since they have +regarded all videos not paired with a specific query as negative, neglecting +the possibility of including false negatives when constructing the negative +video set. In this paper, we propose an MVMR (Massive Videos Moment Retrieval) +task that aims to localize video frames within a massive video set, mitigating +the possibility of falsely distinguishing positive and negative videos. For +this task, we suggest an automatic dataset construction framework by employing +textual and visual semantic matching evaluation methods on the existing video +moment search datasets and introduce three MVMR datasets. To solve MVMR task, +we further propose a strong method, CroCs, which employs cross-directional +contrastive learning that selectively identifies the reliable and informative +negatives, enhancing the robustness of a model on MVMR task. Experimental +results on the introduced datasets reveal that existing video moment search +models are easily distracted by negative video frames, whereas our model shows +significant performance. + +
+
+ comment: 15 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Exposure Bracketing is All You Need for Unifying Image Restoration and + Enhancement Tasks + + +
+  It is highly desired but challenging to acquire high-quality photos with
+clear content in low-light environments. Although multi-image processing
+methods (using burst, dual-exposure, or multi-exposure images) have made
+significant progress in addressing this issue, they typically focus on specific
+restoration or enhancement problems and are insufficient in exploiting multiple
+images. Motivated by the fact that multi-exposure images are complementary in
+denoising, deblurring, high dynamic range imaging, and super-resolution, we
+propose to utilize exposure bracketing photography to unify restoration and
+enhancement tasks in this work. Due to the difficulty in collecting real-world
+pairs, we suggest a solution that first pre-trains the model with synthetic
+paired data and then adapts it to real-world unlabeled images. In particular, a
+temporally modulated recurrent network (TMRNet) and a self-supervised
+adaptation method are proposed. Moreover, we construct a data simulation
+pipeline to synthesize pairs and collect real-world images from 200 nighttime
+scenarios. Experiments on both datasets show that our method performs favorably
+against the state-of-the-art multi-image processing ones. The dataset, code,
+and pre-trained models are available at
+https://github.com/cszhilu1998/BracketIRE.
+
+
+ comment: 28 pages +
+
+
+
+
+ + ♻ ☆ Enhancing the Reliability of Segment Anything Model for Auto-Prompting + Medical Image Segmentation with Uncertainty Rectification + + +
+  The Segment Anything Model (SAM) has recently emerged as a groundbreaking
+foundation model for prompt-driven image segmentation tasks. However, both the
+original SAM and its medical variants require slice-by-slice manual prompting
+of target structures, which directly increases the burden in applications.
+Despite attempts to make SAM fully automatic through auto-prompting, it still
+exhibits subpar performance and lacks reliability, especially in the field of
+medical imaging. In this paper, we propose UR-SAM, an uncertainty rectified SAM
+framework to enhance the reliability of auto-prompting medical image
+segmentation. Building upon a localization framework for automatic prompt
+generation, our method incorporates a prompt augmentation module to obtain a
+series of input prompts for SAM for uncertainty estimation and an
+uncertainty-based rectification module to further utilize the distribution of
+estimated uncertainty to improve the segmentation performance. Extensive
+experiments on two public 3D medical datasets covering the segmentation of 35
+organs demonstrate that without supplementary training or fine-tuning, our
+method further improves the segmentation performance by up to 10.7% and 13.8%
+in Dice similarity coefficient, demonstrating efficiency and broad
+capabilities for medical image segmentation without manual prompting.
+
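+ A hypothetical sketch of prompt-augmented uncertainty estimation in this spirit: jitter the automatically generated box prompt, run the frozen segmenter several times, and treat the per-pixel variance of the resulting masks as an uncertainty map. Here `segment` is a stand-in callable, not the real SAM API.
+```python
+import numpy as np
+
+def prompt_uncertainty(segment, image, box, n=8, jitter=5):
+    # box: array-like [x0, y0, x1, y1]; segment(image, box) -> binary mask (H, W).
+    rng = np.random.default_rng(0)
+    masks = []
+    for _ in range(n):
+        noisy_box = np.asarray(box) + rng.integers(-jitter, jitter + 1, size=4)
+        masks.append(segment(image, noisy_box).astype(np.float32))
+    masks = np.stack(masks)                 # (n, H, W)
+    return masks.mean(0), masks.var(0)      # mean prediction, per-pixel uncertainty
+```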
+
+
+
+
+ + ♻ ☆ Multimodal Pathway: Improve Transformers with Irrelevant Data from Other + Modalities CVPR 2024 + + +
+ We propose to improve transformers of a specific modality with irrelevant +data from other modalities, e.g., improve an ImageNet model with audio or point +cloud datasets. We would like to highlight that the data samples of the target +modality are irrelevant to the other modalities, which distinguishes our method +from other works utilizing paired (e.g., CLIP) or interleaved data of different +modalities. We propose a methodology named Multimodal Pathway - given a target +modality and a transformer designed for it, we use an auxiliary transformer +trained with data of another modality and construct pathways to connect +components of the two models so that data of the target modality can be +processed by both models. In this way, we utilize the universal +sequence-to-sequence modeling abilities of transformers obtained from two +modalities. As a concrete implementation, we use a modality-specific tokenizer +and task-specific head as usual but utilize the transformer blocks of the +auxiliary model via a proposed method named Cross-Modal Re-parameterization, +which exploits the auxiliary weights without any inference costs. On the image, +point cloud, video, and audio recognition tasks, we observe significant and +consistent performance improvements with irrelevant data from other modalities. +The code and models are available at https://github.com/AILab-CVC/M2PT. + +
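+ A minimal, hypothetical sketch of cross-modal re-parameterization for a single linear layer: the frozen auxiliary-modality weight is added to the target weight with a learnable scale, and because the two matrices can be summed once after training, inference incurs no extra cost. The class name and zero-initialised scale are assumptions, not the authors' exact implementation.
+```python
+import torch
+import torch.nn as nn
+
+class CrossModalLinear(nn.Module):
+    def __init__(self, target_linear: nn.Linear, aux_weight: torch.Tensor):
+        super().__init__()
+        self.linear = target_linear
+        self.register_buffer("aux_weight", aux_weight)   # frozen auxiliary weight
+        self.lam = nn.Parameter(torch.zeros(1))          # learnable mixing scalar
+
+    def forward(self, x):
+        # Equivalent to a single merged matrix (weight + lam * aux_weight),
+        # so the layer can be collapsed after training for zero-cost inference.
+        w = self.linear.weight + self.lam * self.aux_weight
+        return nn.functional.linear(x, w, self.linear.bias)
+```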
+
+ comment: CVPR 2024. Code and models are available at + https://github.com/AILab-CVC/M2PT +
+
+
+
+
+ + ♻ ☆ ProMISe: Promptable Medical Image Segmentation using SAM + + +
+ With the proposal of the Segment Anything Model (SAM), fine-tuning SAM for
+medical image segmentation (MIS) has become popular. However, due to the large
+size of the SAM model and the significant domain gap between natural and
+medical images, fine-tuning-based strategies are costly, with potential risks
+of instability, feature damage, and catastrophic forgetting. Furthermore, some
+methods of transferring SAM to a domain-specific MIS through fine-tuning
+strategies disable the model's prompting capability, severely limiting its
+utilization scenarios. In this paper, we propose an Auto-Prompting Module
+(APM), which provides the SAM-based foundation model with Euclidean adaptive
+prompts in the target domain. Our experiments demonstrate that such adaptive
+prompts significantly improve SAM's non-fine-tuned performance in MIS. In
+addition, we propose a novel non-invasive method called Incremental Pattern
+Shifting (IPS) to adapt SAM to specific medical domains. Experimental results
+show that IPS enables SAM to achieve state-of-the-art or competitive
+performance in MIS without the need for fine-tuning. By coupling these two
+methods, we propose ProMISe, an end-to-end non-fine-tuned framework for
+Promptable Medical Image Segmentation. Our experiments demonstrate that using
+our methods either individually or in combination achieves satisfactory
+performance in low-cost pattern shifting, with all of SAM's parameters frozen.
+ 
+
+
+
+
+ + ♻ ☆ UniRepLKNet: A Universal Perception Large-Kernel ConvNet for Audio, + Video, Point Cloud, Time-Series and Image Recognition CVPR 2024 + + +
+ Large-kernel convolutional neural networks (ConvNets) have recently received +extensive research attention, but two unresolved and critical issues demand +further investigation. 1) The architectures of existing large-kernel ConvNets +largely follow the design principles of conventional ConvNets or transformers, +while the architectural design for large-kernel ConvNets remains +under-addressed. 2) As transformers have dominated multiple modalities, it +remains to be investigated whether ConvNets also have a strong universal +perception ability in domains beyond vision. In this paper, we contribute from +two aspects. 1) We propose four architectural guidelines for designing +large-kernel ConvNets, the core of which is to exploit the essential +characteristics of large kernels that distinguish them from small kernels - +they can see wide without going deep. Following such guidelines, our proposed +large-kernel ConvNet shows leading performance in image recognition (ImageNet +accuracy of 88.0%, ADE20K mIoU of 55.6%, and COCO box AP of 56.4%), +demonstrating better performance and higher speed than the recent powerful +competitors. 2) We discover large kernels are the key to unlocking the +exceptional performance of ConvNets in domains where they were originally not +proficient. With certain modality-related preprocessing approaches, the +proposed model achieves state-of-the-art performance on time-series forecasting +and audio recognition tasks even without modality-specific customization to the +architecture. All the code and models are publicly available on GitHub and +Huggingface. + +
+
+ comment: CVPR 2024. Code, all the models, reproducible training scripts at + https://github.com/AILab-CVC/UniRepLKNet +
+
+
+
+
+ + ♻ ☆ Object-aware Inversion and Reassembly for Image Editing + + +
+ By comparing the original and target prompts, we can obtain numerous editing +pairs, each comprising an object and its corresponding editing target. To allow +editability while maintaining fidelity to the input image, existing editing +methods typically involve a fixed number of inversion steps that project the +whole input image to its noisier latent representation, followed by a denoising +process guided by the target prompt. However, we find that the optimal number +of inversion steps for achieving ideal editing results varies significantly +among different editing pairs, owing to varying editing difficulties. +Therefore, the current literature, which relies on a fixed number of inversion +steps, produces sub-optimal generation quality, especially when handling +multiple editing pairs in a natural image. To this end, we propose a new image +editing paradigm, dubbed Object-aware Inversion and Reassembly (OIR), to enable +object-level fine-grained editing. Specifically, we design a new search metric, +which determines the optimal inversion steps for each editing pair, by jointly +considering the editability of the target and the fidelity of the non-editing +region. We use our search metric to find the optimal inversion step for each +editing pair when editing an image. We then edit these editing pairs separately +to avoid concept mismatch. Subsequently, we propose an additional reassembly +step to seamlessly integrate the respective editing results and the non-editing +region to obtain the final edited image. To systematically evaluate the +effectiveness of our method, we collect two datasets called OIRBench for +benchmarking single- and multi-object editing, respectively. Experiments +demonstrate that our method achieves superior performance in editing object +shapes, colors, materials, categories, etc., especially in multi-object editing +scenarios. + +
+
+ comment: Project Page: https://aim-uofa.github.io/OIR-Diffusion/ +
+
+
+
+
+ + ♻ ☆ CRISP: Hybrid Structured Sparsity for Class-aware Model Pruning DATE + + +
+ Machine learning pipelines for classification tasks often train a universal +model to achieve accuracy across a broad range of classes. However, a typical +user encounters only a limited selection of classes regularly. This disparity +provides an opportunity to enhance computational efficiency by tailoring models +to focus on user-specific classes. Existing works rely on unstructured pruning, +which introduces randomly distributed non-zero values in the model, making it +unsuitable for hardware acceleration. Alternatively, some approaches employ +structured pruning, such as channel pruning, but these tend to provide only +minimal compression and may lead to reduced model accuracy. In this work, we +propose CRISP, a novel pruning framework leveraging a hybrid structured +sparsity pattern that combines both fine-grained N:M structured sparsity and +coarse-grained block sparsity. Our pruning strategy is guided by a +gradient-based class-aware saliency score, allowing us to retain weights +crucial for user-specific classes. CRISP achieves high accuracy with minimal +memory consumption for popular models like ResNet-50, VGG-16, and MobileNetV2 +on ImageNet and CIFAR-100 datasets. Moreover, CRISP delivers up to 14$\times$ +reduction in latency and energy consumption compared to existing pruning +methods while maintaining comparable accuracy. Our code is available at +https://github.com/shivmgg/CRISP/. + +
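+ A small illustration of the fine-grained N:M part of the sparsity pattern (not CRISP itself): keep the n most salient weights in every group of m, where the saliency could be any class-aware score; the weight count is assumed divisible by m:
+
+     import torch
+
+     def nm_sparsity_mask(saliency: torch.Tensor, n: int = 2, m: int = 4) -> torch.Tensor:
+         """Return a binary mask keeping the n largest saliency values in each
+         contiguous group of m weights (e.g. 2:4 structured sparsity)."""
+         groups = saliency.reshape(-1, m)                     # groups of m weights
+         keep = groups.topk(n, dim=1).indices                 # n winners per group
+         mask = torch.zeros_like(groups).scatter_(1, keep, 1.0)
+         return mask.reshape(saliency.shape)
+
+     # usage sketch: weight.data *= nm_sparsity_mask((weight * weight.grad).abs())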
+
+ comment: 6 pages, accepted in Design, Automation & Test in Europe Conference & + Exhibition (DATE) 2024 +
+
+
+
+
+ + ♻ ☆ Robust Domain Adaptive Object Detection with Unified Multi-Granularity + Alignment + + +
+ Domain adaptive detection aims to improve the generalization of detectors on
+the target domain. To reduce the discrepancy in feature distributions between
+two domains, recent approaches achieve domain adaptation through feature
+alignment at different granularities via adversarial learning. However, they
+neglect the relationship between multiple granularities and different features
+in alignment, degrading detection performance. Addressing this, we introduce a
+unified multi-granularity alignment (MGA)-based detection framework for
+domain-invariant feature learning. The key is to encode the dependencies across
+different granularities, including pixel-, instance-, and category-levels,
+simultaneously to align two domains. Specifically, based on pixel-level
+features, we first develop an omni-scale gated fusion (OSGF) module to
+aggregate discriminative representations of instances with scale-aware
+convolutions, leading to robust multi-scale detection. Besides, we introduce
+multi-granularity discriminators to identify which domain, source or target,
+samples of different granularities come from. Note that MGA not only leverages
+instance discriminability in different categories but also exploits category
+consistency between two domains for detection. Furthermore, we present an
+adaptive exponential moving average (AEMA) strategy that explores model
+assessments for model update to improve pseudo labels and alleviate the local
+misalignment problem, boosting detection robustness. Extensive experiments on
+multiple domain adaptation scenarios validate the superiority of MGA over other
+approaches on FCOS and Faster R-CNN detectors. Code will be released at
+https://github.com/tiankongzhang/MGA.
+ 
+
+
+
+
+ + ♻ ☆ Flooding Regularization for Stable Training of Generative Adversarial + Networks + + +
+ Generative Adversarial Networks (GANs) have shown remarkable performance in +image generation. However, GAN training suffers from the problem of +instability. One of the main approaches to address this problem is to modify +the loss function, often using regularization terms in addition to changing the +type of adversarial losses. This paper focuses on directly regularizing the +adversarial loss function. We propose a method that applies flooding, an +overfitting suppression method in supervised learning, to GANs to directly +prevent the discriminator's loss from becoming excessively low. Flooding +requires tuning the flood level, but when applied to GANs, we propose that the +appropriate range of flood level settings is determined by the adversarial loss +function, supported by theoretical analysis of GANs using the binary cross +entropy loss. We experimentally verify that flooding stabilizes GAN training +and can be combined with other stabilization techniques. We also show that by +restricting the discriminator's loss to be no less than the flood level, the +training proceeds stably even when the flood level is somewhat high. + +
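+ The flooding operation itself is simple to state; a minimal sketch applied to a discriminator loss, where the flood level value in the usage line is a hypothetical hyperparameter:
+
+     import torch
+
+     def flooded(loss: torch.Tensor, flood_level: float) -> torch.Tensor:
+         """Flooding: once the loss drops below the flood level b, gradients
+         push it back up, i.e. the effective objective is |loss - b| + b."""
+         b = flood_level
+         return (loss - b).abs() + b
+
+     # e.g. d_loss = flooded(bce(d(real), ones) + bce(d(fake), zeros), flood_level=0.3)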
+
+ comment: 25 pages, 9 figures, 18 tables +
+
+
+
+
+ + ♻ ☆ Effectiveness Assessment of Recent Large Vision-Language Models + + +
+ The advent of large vision-language models (LVLMs) represents a noteworthy +advancement towards the pursuit of artificial general intelligence. However, +the extent of their efficacy across both specialized and general tasks warrants +further investigation. This article endeavors to evaluate the competency of +popular LVLMs in specialized and general tasks, respectively, aiming to offer a +comprehensive comprehension of these innovative methodologies. To gauge their +efficacy in specialized tasks, we tailor a comprehensive testbed comprising +three distinct scenarios: natural, healthcare, and industrial, encompassing six +challenging tasks. These tasks include salient, camouflaged, and transparent +object detection, as well as polyp and skin lesion detection, alongside +industrial anomaly detection. We examine the performance of three recent +open-source LVLMs -- MiniGPT-v2, LLaVA-1.5, and Shikra -- in the realm of +visual recognition and localization. Moreover, we conduct empirical +investigations utilizing the aforementioned models alongside GPT-4V, assessing +their multi-modal understanding capacities in general tasks such as object +counting, absurd question answering, affordance reasoning, attribute +recognition, and spatial relation reasoning. Our investigations reveal that +these models demonstrate limited proficiency not only in specialized tasks but +also in general tasks. We delve deeper into this inadequacy and suggest several +potential factors, including limited cognition in specialized tasks, object +hallucination, text-to-image interference, and decreased robustness in complex +problems. We hope this study would provide valuable insights for the future +development of LVLMs, augmenting their power in coping with both general and +specialized applications. + +
+
+
+
+
+ + ♻ ☆ SparseDFF: Sparse-View Feature Distillation for One-Shot Dexterous + Manipulation + + +
+ Humans demonstrate remarkable skill in transferring manipulation abilities +across objects of varying shapes, poses, and appearances, a capability rooted +in their understanding of semantic correspondences between different instances. +To equip robots with a similar high-level comprehension, we present SparseDFF, +a novel DFF for 3D scenes utilizing large 2D vision models to extract semantic +features from sparse RGBD images, a domain where research is limited despite +its relevance to many tasks with fixed-camera setups. SparseDFF generates +view-consistent 3D DFFs, enabling efficient one-shot learning of dexterous +manipulations by mapping image features to a 3D point cloud. Central to +SparseDFF is a feature refinement network, optimized with a contrastive loss +between views and a point-pruning mechanism for feature continuity. This +facilitates the minimization of feature discrepancies w.r.t. end-effector +parameters, bridging demonstrations and target manipulations. Validated in +real-world scenarios with a dexterous hand, SparseDFF proves effective in +manipulating both rigid and deformable objects, demonstrating significant +generalization capabilities across object and scene variations. + +
+
+
+
+
+ + ♻ ☆ Phased Data Augmentation for Training a Likelihood-Based Generative + Model with Limited Data + + +
+ Generative models excel in creating realistic images, yet their dependency on +extensive datasets for training presents significant challenges, especially in +domains where data collection is costly or challenging. Current data-efficient +methods largely focus on GAN architectures, leaving a gap in training other +types of generative models. Our study introduces "phased data augmentation" as +a novel technique that addresses this gap by optimizing training in limited +data scenarios without altering the inherent data distribution. By limiting the +augmentation intensity throughout the learning phases, our method enhances the +model's ability to learn from limited data, thus maintaining fidelity. Applied +to a model integrating PixelCNNs with VQ-VAE-2, our approach demonstrates +superior performance in both quantitative and qualitative evaluations across +diverse datasets. This represents an important step forward in the efficient +training of likelihood-based models, extending the usefulness of data +augmentation techniques beyond just GANs. + +
+
+
+
+
+ + ♻ ☆ Multimodal Transformer Distillation for Audio-Visual Synchronization ICASSP 2024 + + +
+ Audio-visual synchronization aims to determine whether the mouth movements
+and speech in a video are synchronized. VocaLiST reaches state-of-the-art
+performance by incorporating multimodal Transformers to model audio-visual
+interaction information. However, it requires high computing resources, making
+it impractical for real-world applications. This paper proposes the MTDVocaLiST
+model, which is trained by our proposed multimodal Transformer distillation
+(MTD) loss. The MTD loss enables the MTDVocaLiST model to deeply mimic the
+cross-attention distribution and value-relation in the Transformer of VocaLiST.
+Additionally, we harness uncertainty weighting to fully exploit the interaction
+information across all layers. Our proposed method is effective in two aspects:
+From the distillation method perspective, the MTD loss outperforms other strong
+distillation baselines. From the distilled model's performance perspective: 1)
+MTDVocaLiST outperforms the similar-size SOTA models SyncNet and Perfect Match
+by 15.65% and 3.35%, respectively; 2) MTDVocaLiST reduces the model size of
+VocaLiST by 83.52% while still maintaining similar performance.
+ 
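+ A generic sketch of distilling attention distributions with a KL term, in the spirit of the MTD loss described above; the value-relation and uncertainty-weighting parts of the paper are not reproduced, and the input layout is an assumption:
+
+     import torch.nn.functional as F
+
+     def attention_kd_loss(student_attn, teacher_attn, tau: float = 1.0):
+         """KL divergence between teacher and student cross-attention maps.
+         Inputs are raw attention logits of shape (batch, heads, query, key)."""
+         s = F.log_softmax(student_attn / tau, dim=-1)
+         t = F.softmax(teacher_attn / tau, dim=-1)
+         return F.kl_div(s, t, reduction="batchmean") * (tau * tau)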
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Self-supervised Video Object Segmentation with Distillation Learning of + Deformable Attention + + +
+ Video object segmentation is a fundamental research problem in computer
+vision. Recent techniques have often applied attention mechanisms to object
+representation learning from video sequences. However, due to temporal changes
+in the video data, attention maps may not align well with the objects of
+interest across video frames, causing accumulated errors in long-term video
+processing. In addition, existing techniques have utilised complex
+architectures, incurring high computational complexity and hence limiting the
+ability to integrate video object segmentation into low-powered devices. To
+address these issues, we propose a new method for self-supervised video object
+segmentation based on distillation learning of deformable attention.
+Specifically, we devise a lightweight architecture for video object
+segmentation that is effectively adapted to temporal changes. This is enabled
+by a deformable attention mechanism, where the keys and values capturing the
+memory of a video sequence in the attention module have flexible locations
+updated across frames. The learnt object representations are thus adaptive to
+both the spatial and temporal dimensions. We train the proposed architecture in
+a self-supervised fashion through a new knowledge distillation paradigm where
+deformable attention maps are integrated into the distillation loss. We
+qualitatively and quantitatively evaluate our method and compare it with
+existing methods on benchmark datasets including DAVIS 2016/2017 and
+YouTube-VOS 2018/2019. Experimental results verify the superiority of our
+method via its achieved state-of-the-art performance and optimal memory usage.
+ 
+
+ comment: under review +
+
+
+
+
+ + ♻ ☆ Hierarchical Auto-Organizing System for Open-Ended Multi-Agent + Navigation ICLR 2024 + + +
+ Due to the dynamic and unpredictable open-world setting, navigating complex +environments in Minecraft poses significant challenges for multi-agent systems. +Agents must interact with the environment and coordinate their actions with +other agents to achieve common objectives. However, traditional approaches +often struggle to efficiently manage inter-agent communication and task +distribution, crucial for effective multi-agent navigation. Furthermore, +processing and integrating multi-modal information (such as visual, textual, +and auditory data) is essential for agents to comprehend their goals and +navigate the environment successfully and fully. To address this issue, we +design the HAS framework to auto-organize groups of LLM-based agents to +complete navigation tasks. In our approach, we devise a hierarchical +auto-organizing navigation system, which is characterized by 1) a hierarchical +system for multi-agent organization, ensuring centralized planning and +decentralized execution; 2) an auto-organizing and intra-communication +mechanism, enabling dynamic group adjustment under subtasks; 3) a multi-modal +information platform, facilitating multi-modal perception to perform the three +navigation tasks with one system. To assess organizational behavior, we design +a series of navigation tasks in the Minecraft environment, which includes +searching and exploring. We aim to develop embodied organizations that push the +boundaries of embodied AI, moving it towards a more human-like organizational +structure. + +
+
+ comment: ICLR 2024 Workshop on LLM Agents +
+
+
+
+
+ + ♻ ☆ Diffeomorphic Mesh Deformation via Efficient Optimal Transport for + Cortical Surface Reconstruction ICLR 2024 + + +
+ Mesh deformation plays a pivotal role in many 3D vision tasks including
+dynamic simulations, rendering, and reconstruction. However, defining an
+efficient discrepancy between predicted and target meshes remains an open
+problem. A prevalent approach in current deep learning is the set-based
+approach, which measures the discrepancy between two surfaces by comparing two
+randomly sampled point-clouds from the two meshes with the Chamfer
+pseudo-distance. Nevertheless, the set-based approach still has limitations
+such as lacking a theoretical guarantee for choosing the number of points in
+sampled point-clouds, and the pseudo-metricity and the quadratic complexity of
+the Chamfer divergence. To address these issues, we propose a novel metric for
+learning mesh deformation. The metric is defined by the sliced Wasserstein
+distance on meshes represented as probability measures that generalize the
+set-based approach. By leveraging probability measure space, we gain
+flexibility in encoding meshes using diverse forms of probability measures,
+such as continuous, empirical, and discrete measures via varifold
+representation. After having encoded probability measures, we can compare
+meshes by using the sliced Wasserstein distance, which is an effective optimal
+transport distance with linear computational complexity and can provide a fast
+statistical rate for approximating the surface of meshes. Finally, we employ a
+neural ordinary differential equation (ODE) to deform the input surface into
+the target shape by modeling the trajectories of the points on the surface. Our
+experiments on cortical surface reconstruction demonstrate that our approach
+surpasses other competing methods in multiple datasets and metrics.
+ 
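+ For concreteness, a minimal sketch of the sliced Wasserstein-2 distance between two equal-size point sets sampled from the meshes (empirical measures only; the paper's continuous and varifold variants are not covered):
+
+     import torch
+
+     def sliced_wasserstein(x: torch.Tensor, y: torch.Tensor, n_proj: int = 128):
+         """x, y: (n_points, 3). Project onto random directions, sort the 1D
+         projections, and average the squared differences."""
+         dirs = torch.randn(n_proj, x.shape[1])
+         dirs = dirs / dirs.norm(dim=1, keepdim=True)
+         px = (x @ dirs.T).sort(dim=0).values
+         py = (y @ dirs.T).sort(dim=0).values
+         return ((px - py) ** 2).mean()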
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ EC-Depth: Exploring the consistency of self-supervised monocular depth + estimation in challenging scenes + + +
+ Self-supervised monocular depth estimation holds significant importance in +the fields of autonomous driving and robotics. However, existing methods are +typically trained and tested on standard datasets, overlooking the impact of +various adverse conditions prevalent in real-world applications, such as rainy +days. As a result, it is commonly observed that these methods struggle to +handle these challenging scenarios. To address this issue, we present EC-Depth, +a novel self-supervised two-stage framework to achieve a robust depth +estimation. In the first stage, we propose depth consistency regularization to +propagate reliable supervision from standard to challenging scenes. In the +second stage, we adopt the Mean Teacher paradigm and propose a novel +consistency-based pseudo-label filtering strategy to improve the quality of +pseudo-labels, further improving both the accuracy and robustness of our model. +Extensive experiments demonstrate that our method achieves accurate and +consistent depth predictions in both standard and challenging scenarios, +surpassing existing state-of-the-art methods on KITTI, KITTI-C, DrivingStereo, +and NuScenes-Night benchmarks. + +
+
+ comment: Project page: https://ruijiezhu94.github.io/ECDepth_page +
+
+
+
+
+ + ♻ ☆ MotionGPT: Finetuned LLMs Are General-Purpose Motion Generators AAAI 2024 + + +
+ Generating realistic human motion from given action descriptions has +experienced significant advancements because of the emerging requirement of +digital humans. While recent works have achieved impressive results in +generating motion directly from textual action descriptions, they often support +only a single modality of the control signal, which limits their application in +the real digital human industry. This paper presents a Motion General-Purpose +generaTor (MotionGPT) that can use multimodal control signals, e.g., text and +single-frame poses, for generating consecutive human motions by treating +multimodal signals as special input tokens in large language models (LLMs). +Specifically, we first quantize multimodal control signals into discrete codes +and then formulate them in a unified prompt instruction to ask the LLMs to +generate the motion answer. Our MotionGPT demonstrates a unified human motion +generation model with multimodal control signals by tuning a mere 0.4% of LLM +parameters. To the best of our knowledge, MotionGPT is the first method to +generate human motion by multimodal control signals, which we hope can shed +light on this new direction. Visit our webpage at +https://qiqiapink.github.io/MotionGPT/. + +
+
+ comment: 18 pages, 8 figures, accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Diffusion Reward: Learning Rewards via Conditional Video Diffusion + + +
+ Learning rewards from expert videos offers an affordable and effective +solution to specify the intended behaviors for reinforcement learning tasks. In +this work, we propose Diffusion Reward, a novel framework that learns rewards +from expert videos via conditional video diffusion models for solving complex +visual RL problems. Our key insight is that lower generative diversity is +observed when conditioned on expert trajectories. Diffusion Reward is +accordingly formalized by the negative of conditional entropy that encourages +productive exploration of expert-like behaviors. We show the efficacy of our +method over 10 robotic manipulation tasks from MetaWorld and Adroit with visual +input and sparse reward. Moreover, Diffusion Reward could even solve unseen +tasks successfully and effectively, largely surpassing baseline methods. +Project page and code: https://diffusion-reward.github.io/. + +
+
+ comment: Project page and code: https://diffusion-reward.github.io/ +
+
+
+
+
+ + ♻ ☆ Shifting to Machine Supervision: Annotation-Efficient Semi and + Self-Supervised Learning for Automatic Medical Image Segmentation and + Classification + + +
+ Advancements in clinical treatment are increasingly constrained by the +limitations of supervised learning techniques, which depend heavily on large +volumes of annotated data. The annotation process is not only costly but also +demands substantial time from clinical specialists. Addressing this issue, we +introduce the S4MI (Self-Supervision and Semi-Supervision for Medical Imaging) +pipeline, a novel approach that leverages the advancements in self-supervised +and semi-supervised learning. These techniques engage in auxiliary tasks that +do not require labeling, thus simplifying the scaling of machine supervision +compared to fully-supervised methods. Our study benchmarks these techniques on +three distinct medical imaging datasets to evaluate their effectiveness in +classification and segmentation tasks. Notably, we observed that +self-supervised learning significantly surpassed the performance of supervised +methods in the classification of all evaluated datasets. Remarkably, the +semi-supervised approach demonstrated superior outcomes in segmentation, +outperforming fully-supervised methods while using 50% fewer labels across all +datasets. In line with our commitment to contributing to the scientific +community, we have made the S4MI code openly accessible, allowing for broader +application and further development of these methods. + +
+
+ comment: Seventeen pages (incl. references), five figures, and one table. + (Under Review) +
+
+
+
+
+ + ♻ ☆ LLM-CXR: Instruction-Finetuned LLM for CXR Image Understanding and + Generation ICLR 2024 + + +
+ Following the impressive development of LLMs, vision-language alignment in +LLMs is actively being researched to enable multimodal reasoning and visual IO. +This direction of research is particularly relevant to medical imaging because +medical image analysis and generation consist of reasoning based on a +combination of visual features and prior knowledge. Many recent works have +focused on training adapter networks that serve as an information bridge +between image processing networks and LLMs; but presumably, in order to achieve +maximum reasoning potential of LLMs on visual information as well, visual and +language features should be allowed to interact more freely. This is especially +important in the medical domain because understanding and generating medical +images such as chest X-rays (CXR) require not only accurate visual and +language-based reasoning but also a more intimate mapping between the two +modalities. Thus, taking inspiration from previous work on the transformer and +VQ-GAN combination for bidirectional image and text generation, we build upon +this approach and develop a method for instruction-tuning an LLM pre-trained +only on text to gain vision-language capabilities for medical images. +Specifically, we leverage a pretrained LLM's existing question-answering and +instruction-following abilities to teach it to understand visual inputs by +instructing it to answer questions about image inputs and, symmetrically, +output both text and image responses appropriate to a given query by tuning the +LLM with diverse tasks that encompass image-based text-generation and +text-based image-generation. We show that our model, LLM-CXR, trained in this +approach shows better image-text alignment in both CXR understanding and +generation tasks while being smaller in size compared to previously developed +models that perform a narrower range of tasks. The code is at +https://github.com/hyn2028/llm-cxr. + +
+
+ comment: 21 pages, 8 figures; ICLR 2024 (poster) +
+
+
+
+
+ + ♻ ☆ When Semantic Segmentation Meets Frequency Aliasing ICLR 2024 + + +
+ Despite recent advancements in semantic segmentation, where and what pixels +are hard to segment remains largely unexplored. Existing research only +separates an image into easy and hard regions and empirically observes the +latter are associated with object boundaries. In this paper, we conduct a +comprehensive analysis of hard pixel errors, categorizing them into three +types: false responses, merging mistakes, and displacements. Our findings +reveal a quantitative association between hard pixels and aliasing, which is +distortion caused by the overlapping of frequency components in the Fourier +domain during downsampling. To identify the frequencies responsible for +aliasing, we propose using the equivalent sampling rate to calculate the +Nyquist frequency, which marks the threshold for aliasing. Then, we introduce +the aliasing score as a metric to quantify the extent of aliasing. While +positively correlated with the proposed aliasing score, three types of hard +pixels exhibit different patterns. Here, we propose two novel de-aliasing +filter (DAF) and frequency mixing (FreqMix) modules to alleviate aliasing +degradation by accurately removing or adjusting frequencies higher than the +Nyquist frequency. The DAF precisely removes the frequencies responsible for +aliasing before downsampling, while the FreqMix dynamically selects +high-frequency components within the encoder block. Experimental results +demonstrate consistent improvements in semantic segmentation and low-light +instance segmentation tasks. The code is available at: +https://github.com/Linwei-Chen/Seg-Aliasing. + +
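+ A simplified stand-in for the aliasing measure discussed above: the fraction of spectral energy beyond the Nyquist frequency implied by a downsampling stride (the paper's equivalent-sampling-rate formulation may differ):
+
+     import torch
+
+     def aliasing_score(feat: torch.Tensor, stride: int = 2) -> torch.Tensor:
+         """feat: (B, C, H, W). Energy at normalized frequencies above
+         1/(2*stride) would fold back (alias) after downsampling by `stride`."""
+         spec = torch.fft.fftshift(torch.fft.fft2(feat, norm="ortho"), dim=(-2, -1))
+         power = spec.abs() ** 2
+         H, W = feat.shape[-2:]
+         fy = torch.fft.fftshift(torch.fft.fftfreq(H))      # cycles per pixel
+         fx = torch.fft.fftshift(torch.fft.fftfreq(W))
+         radius = (fy[:, None] ** 2 + fx[None, :] ** 2).sqrt()
+         above = radius > 1.0 / (2 * stride)                # beyond Nyquist
+         return (power * above).sum() / power.sum()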
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ TivNe-SLAM: Dynamic Mapping and Tracking via Time-Varying Neural + Radiance Fields + + +
+ Previous attempts to integrate Neural Radiance Fields (NeRF) into the
+Simultaneous Localization and Mapping (SLAM) framework either rely on the
+assumption of static scenes or require the ground truth camera poses, which
+impedes their application in real-world scenarios. In this paper, we propose a
+time-varying representation to track and reconstruct dynamic scenes. Firstly,
+two processes, a tracking process and a mapping process, are simultaneously
+maintained in our framework. For the tracking process, all input images are
+uniformly sampled and then progressively trained in a self-supervised paradigm.
+For the mapping process, we leverage motion masks to distinguish dynamic
+objects from the static background and sample more pixels from dynamic areas.
+Secondly, the parameter optimization for both processes consists of two stages:
+the first stage associates time with 3D positions to convert the deformation
+field to the canonical field, and the second stage associates time with the
+embeddings of the canonical field to obtain colors and the Signed Distance
+Function (SDF). Lastly, we propose a novel keyframe selection strategy based on
+the overlapping rate. We evaluate our approach on two synthetic datasets and
+one real-world dataset, and the experiments validate that our method achieves
+competitive results in both tracking and mapping when compared to existing
+state-of-the-art NeRF-based methods.
+ 
+
+
+
+
+ + ♻ ☆ Effective Message Hiding with Order-Preserving Mechanisms + + +
+ Message hiding, a technique that conceals secret message bits within a cover +image, aims to achieve an optimal balance among message capacity, recovery +accuracy, and imperceptibility. While convolutional neural networks have +notably improved message capacity and imperceptibility, achieving high recovery +accuracy remains challenging. This challenge arises because convolutional +operations struggle to preserve the sequential order of message bits and +effectively address the discrepancy between these two modalities. To address +this, we propose StegaFormer, an innovative MLP-based framework designed to +preserve bit order and enable global fusion between modalities. Specifically, +StegaFormer incorporates three crucial components: Order-Preserving Message +Encoder (OPME), Decoder (OPMD) and Global Message-Image Fusion (GMIF). OPME and +OPMD aim to preserve the order of message bits by segmenting the entire +sequence into equal-length segments and incorporating sequential information +during encoding and decoding. Meanwhile, GMIF employs a cross-modality fusion +mechanism to effectively fuse the features from the two uncorrelated +modalities. Experimental results on the COCO and DIV2K datasets demonstrate +that StegaFormer surpasses existing state-of-the-art methods in terms of +recovery accuracy, message capacity, and imperceptibility. We will make our +code publicly available. + +
+
+ comment: 7 Pages +
+
+
+
+
+ + ♻ ☆ CURSOR: Scalable Mixed-Order Hypergraph Matching with CUR Decomposition CVPR 2024 + + +
+ To achieve greater accuracy, hypergraph matching algorithms require +exponential increases in computational resources. Recent kd-tree-based +approximate nearest neighbor (ANN) methods, despite the sparsity of their +compatibility tensor, still require exhaustive calculations for large-scale +graph matching. This work utilizes CUR tensor decomposition and introduces a +novel cascaded second and third-order hypergraph matching framework (CURSOR) +for efficient hypergraph matching. A CUR-based second-order graph matching +algorithm is used to provide a rough match, and then the core of CURSOR, a +fiber-CUR-based tensor generation method, directly calculates entries of the +compatibility tensor by leveraging the initial second-order match result. This +significantly decreases the time complexity and tensor density. A probability +relaxation labeling (PRL)-based matching algorithm, especially suitable for +sparse tensors, is developed. Experiment results on large-scale synthetic +datasets and widely-adopted benchmark sets demonstrate the superiority of +CURSOR over existing methods. The tensor generation method in CURSOR can be +integrated seamlessly into existing hypergraph matching methods to improve +their performance and lower their computational costs. + +
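+ A toy sketch of a randomized CUR decomposition of a plain matrix (CURSOR applies the idea to compatibility tensors, so this is only for intuition); norm-proportional sampling and the pseudo-inverse construction of U are standard choices, not the paper's exact scheme:
+
+     import numpy as np
+
+     def cur_decomposition(a: np.ndarray, k: int, seed: int = 0):
+         """A ~= C @ U @ R with k columns and k rows sampled in proportion to
+         their squared norms, and U = pinv(C) @ A @ pinv(R)."""
+         rng = np.random.default_rng(seed)
+         col_p = (a ** 2).sum(axis=0); col_p /= col_p.sum()
+         row_p = (a ** 2).sum(axis=1); row_p /= row_p.sum()
+         cols = rng.choice(a.shape[1], size=k, replace=False, p=col_p)
+         rows = rng.choice(a.shape[0], size=k, replace=False, p=row_p)
+         C, R = a[:, cols], a[rows, :]
+         U = np.linalg.pinv(C) @ a @ np.linalg.pinv(R)
+         return C, U, R
+
+     # C, U, R = cur_decomposition(A, k=20); A_approx = C @ U @ R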
+
+ comment: Accepted to CVPR 2024. The final camera-ready version. 15 pages with + supplementary materials and 11 figures. Minor grammarly and syntax errors + fixed. Irrelavant hyperrefs removed. Authorship information amended +
+
+
+
+
+ + ♻ ☆ Biophysics Informed Pathological Regularisation for Brain Tumour + Segmentation + + +
+ Recent advancements in deep learning have significantly improved brain tumour
+segmentation techniques; however, the results still lack confidence and
+robustness as they solely consider image data without biophysical priors or
+pathological information. Integrating biophysics-informed regularisation is one
+effective way to change this situation, as it provides a prior regularisation
+for automated end-to-end learning. In this paper, we propose a novel approach
+that designs brain tumour growth Partial Differential Equation (PDE) models as
+a regularisation with deep learning, operational with any network model. Our
+method introduces tumour growth PDE models directly into the segmentation
+process, improving accuracy and robustness, especially in data-scarce
+scenarios. This system estimates tumour cell density using a periodic
+activation function. By effectively integrating this estimation with
+biophysical models, we achieve a better capture of tumour characteristics. This
+approach not only aligns the segmentation closer to actual biological behaviour
+but also strengthens the model's performance under limited data conditions. We
+demonstrate the effectiveness of our framework through extensive experiments on
+the BraTS 2023 dataset, showcasing significant improvements in both the
+precision and reliability of tumour segmentation.
+ 
+
+ comment: 11 pages, 4 figures and 1 table +
+
+
+
+
+ + ♻ ☆ Key-point Guided Deformable Image Manipulation Using Diffusion Model + + +
+ In this paper, we introduce a Key-point-guided Diffusion probabilistic Model
+(KDM) that gains precise control over images by manipulating the object's
+key-points. We propose a two-stage generative model incorporating an optical
+flow map as an intermediate output. By doing so, a dense pixel-wise
+understanding of the semantic relation between the image and sparse key points
+is established, leading to more realistic image generation. Additionally, the
+integration of optical flow helps regulate the inter-frame variance of
+sequential images, enabling authentic sequential image generation. The KDM is
+evaluated on diverse key-point conditioned image synthesis tasks, including
+facial image generation, human pose synthesis, and echocardiography video
+prediction, demonstrating that the KDM produces consistency-enhanced and
+photo-realistic images compared with state-of-the-art models.
+ 
+
+ comment: 1. The ideas and approaches for the existing network have undergone + significant revisions, along with changes in the dataset, resulting in an + overall overhaul. I am planning to upload the newly written paper. 2. All + authors have agreed to these decisions +
+
+
+
+
+ + ♻ ☆ Understanding Domain Generalization: A Noise Robustness Perspective ICLR 2024 + + +
+ Despite the rapid development of machine learning algorithms for domain +generalization (DG), there is no clear empirical evidence that the existing DG +algorithms outperform the classic empirical risk minimization (ERM) across +standard benchmarks. To better understand this phenomenon, we investigate +whether there are benefits of DG algorithms over ERM through the lens of label +noise. Specifically, our finite-sample analysis reveals that label noise +exacerbates the effect of spurious correlations for ERM, undermining +generalization. Conversely, we illustrate that DG algorithms exhibit implicit +label-noise robustness during finite-sample training even when spurious +correlation is present. Such desirable property helps mitigate spurious +correlations and improve generalization in synthetic experiments. However, +additional comprehensive experiments on real-world benchmark datasets indicate +that label-noise robustness does not necessarily translate to better +performance compared to ERM. We conjecture that the failure mode of ERM arising +from spurious correlations may be less pronounced in practice. + +
+
+ comment: Accepted to the 12th International Conference on Learning + Representations (ICLR 2024). Code is available at + https://github.com/qiaoruiyt/NoiseRobustDG +
+
+
+
+
+ + ♻ ☆ Point Mamba: A Novel Point Cloud Backbone Based on State Space Model + with Octree-Based Ordering Strategy + + +
+ Recently, state space model (SSM) has gained great attention due to its +promising performance, linear complexity, and long sequence modeling ability in +both language and image domains. However, it is non-trivial to extend SSM to +the point cloud field, because of the causality requirement of SSM and the +disorder and irregularity nature of point clouds. In this paper, we propose a +novel SSM-based point cloud processing backbone, named Point Mamba, with a +causality-aware ordering mechanism. To construct the causal dependency +relationship, we design an octree-based ordering strategy on raw irregular +points, globally sorting points in a z-order sequence and also retaining their +spatial proximity. Our method achieves state-of-the-art performance compared +with transformer-based counterparts, with 93.4% accuracy and 75.7 mIOU +respectively on the ModelNet40 classification dataset and ScanNet semantic +segmentation dataset. Furthermore, our Point Mamba has linear complexity, which +is more efficient than transformer-based methods. Our method demonstrates the +great potential that SSM can serve as a generic backbone in point cloud +understanding. Codes are released at https://github.com/IRMVLab/Point-Mamba. + +
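+ The octree/z-order intuition can be illustrated with Morton codes: quantize coordinates, interleave their bits, and sort, so that spatially close points stay close in the 1D sequence (a simplified sketch; the paper's exact ordering strategy may differ):
+
+     import numpy as np
+
+     def morton_order(points: np.ndarray, bits: int = 10) -> np.ndarray:
+         """Return indices sorting 3D points along a z-order (Morton) curve."""
+         p = points - points.min(axis=0)
+         p = (p / (p.max() + 1e-9) * (2 ** bits - 1)).astype(np.int64)
+         codes = np.zeros(len(points), dtype=np.int64)
+         for b in range(bits):
+             for axis in range(3):
+                 codes |= ((p[:, axis] >> b) & 1) << (3 * b + axis)
+         return np.argsort(codes)
+
+     # ordered = points[morton_order(points)]  # feed to the causal model in this order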
+
+
+
+
+ + ♻ ☆ AuG-KD: Anchor-Based Mixup Generation for Out-of-Domain Knowledge + Distillation ICLR 2024 + + +
+ Due to privacy or patent concerns, a growing number of large models are +released without granting access to their training data, making transferring +their knowledge inefficient and problematic. In response, Data-Free Knowledge +Distillation (DFKD) methods have emerged as direct solutions. However, simply +adopting models derived from DFKD for real-world applications suffers +significant performance degradation, due to the discrepancy between teachers' +training data and real-world scenarios (student domain). The degradation stems +from the portions of teachers' knowledge that are not applicable to the student +domain. They are specific to the teacher domain and would undermine students' +performance. Hence, selectively transferring teachers' appropriate knowledge +becomes the primary challenge in DFKD. In this work, we propose a simple but +effective method AuG-KD. It utilizes an uncertainty-guided and sample-specific +anchor to align student-domain data with the teacher domain and leverages a +generative method to progressively trade off the learning process between OOD +knowledge distillation and domain-specific information learning via mixup +learning. Extensive experiments in 3 datasets and 8 settings demonstrate the +stability and superiority of our approach. Code available at +https://github.com/IshiKura-a/AuG-KD . + +
+
+ comment: Accepted to ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Improving Neural Radiance Field using Near-Surface Sampling with Point + Cloud Generation + + +
+ Neural radiance field (NeRF) is an emerging view synthesis method that
+samples points in a three-dimensional (3D) space and estimates their existence
+and color probabilities. The disadvantage of NeRF is that it requires a long
+training time since it samples many 3D points. In addition, if one samples
+points from occluded regions or in the space where an object is unlikely to
+exist, the rendering quality of NeRF can be degraded. These issues can be
+solved by estimating the geometry of the 3D scene. This paper proposes a
+near-surface sampling framework to improve the rendering quality of NeRF. To
+this end, the proposed method estimates the surface of a 3D object using depth
+images of the training set, and sampling is performed only around the estimated
+surface. To obtain depth information on a novel view, the paper proposes a 3D
+point cloud generation method and a simple refining method for the depth
+projected from the point cloud. Experimental results show that the proposed
+near-surface sampling NeRF framework can significantly improve the rendering
+quality compared to the original NeRF and three state-of-the-art NeRF variants.
+In addition, the training time of a NeRF model can be significantly accelerated
+with the proposed near-surface sampling framework.
+ 
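+ A rough sketch of the sampling idea only: concentrate per-ray samples around a surface depth estimated from the point cloud instead of spreading them uniformly along the ray (the Gaussian width and clamping are assumptions):
+
+     import torch
+
+     def near_surface_samples(depth: torch.Tensor, n_samples: int = 32, sigma: float = 0.05):
+         """depth: (n_rays,) estimated surface distance per ray. Returns sorted
+         sample distances (n_rays, n_samples) clustered around the surface."""
+         eps = torch.randn(depth.shape[0], n_samples, device=depth.device) * sigma
+         t = depth.unsqueeze(-1) + eps
+         return t.clamp(min=1e-3).sort(dim=-1).values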
+
+ comment: 14 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ HallusionBench: An Advanced Diagnostic Suite for Entangled Language + Hallucination and Visual Illusion in Large Vision-Language Models + + +
+ We introduce HallusionBench, a comprehensive benchmark designed for the +evaluation of image-context reasoning. This benchmark presents significant +challenges to advanced large visual-language models (LVLMs), such as +GPT-4V(Vision), Gemini Pro Vision, Claude 3, and LLaVA-1.5, by emphasizing +nuanced understanding and interpretation of visual data. The benchmark +comprises 346 images paired with 1129 questions, all meticulously crafted by +human experts. We introduce a novel structure for these visual questions +designed to establish control groups. This structure enables us to conduct a +quantitative analysis of the models' response tendencies, logical consistency, +and various failure modes. In our evaluation on HallusionBench, we benchmarked +15 different models, highlighting a 31.42% question-pair accuracy achieved by +the state-of-the-art GPT-4V. Notably, all other evaluated models achieve +accuracy below 16%. Moreover, our analysis not only highlights the observed +failure modes, including language hallucination and visual illusion, but also +deepens an understanding of these pitfalls. Our comprehensive case studies +within HallusionBench shed light on the challenges of hallucination and +illusion in LVLMs. Based on these insights, we suggest potential pathways for +their future improvement. The benchmark and codebase can be accessed at +https://github.com/tianyi-lab/HallusionBench. + +
+
+
+
+
+ + ♻ ☆ SAM-Lightening: A Lightweight Segment Anything Model with Dilated Flash + Attention to Achieve 30 times Acceleration + + +
+ The Segment Anything Model (SAM) has garnered significant attention in
+segmentation tasks due to its zero-shot generalization ability. However, a
+broader application of SAM to real-world practice has been restricted by its
+low inference speed and high computational memory demands, which mainly stem
+from the attention mechanism. Existing work has concentrated on optimizing the
+encoder, yet has not adequately addressed the inefficiency of the attention
+mechanism itself, even when distilled to a smaller model, which thus leaves
+space for further improvement. In response, we introduce SAM-Lightening, a
+variant of SAM that features a re-engineered attention mechanism, termed
+Dilated Flash Attention. It not only facilitates higher parallelism, enhancing
+processing efficiency, but also retains compatibility with the existing
+FlashAttention. Correspondingly, we propose a progressive distillation to
+enable an efficient knowledge transfer from the vanilla SAM without costly
+training from scratch. Experiments on COCO and LVIS reveal that SAM-Lightening
+significantly outperforms the state-of-the-art methods in both run-time
+efficiency and segmentation accuracy. Specifically, it can achieve an inference
+speed of 7 milliseconds (ms) per image, for images of size 1024x1024 pixels,
+which is 30.1 times faster than the vanilla SAM and 2.1 times faster than the
+state-of-the-art. Moreover, it takes only 244 MB of memory, which is 3.5% of
+that of the vanilla SAM. The code and weights are available at
+https://anonymous.4open.science/r/SAM-LIGHTENING-BC25/.
+ 
+
+
+
+
+ + ♻ ☆ LOSTU: Fast, Scalable, and Uncertainty-Aware Triangulation + + +
+ This work proposes a non-iterative, scalable, and statistically optimal way +to triangulate called \texttt{LOSTU}. Unlike triangulation algorithms that +minimize the reprojection ($L_2$) error, LOSTU will still provide the maximum +likelihood estimate when there are errors in camera pose or parameters. This +generic framework is used to contextualize other triangulation methods like the +direct linear transform (DLT) or the midpoint. Synthetic experiments show that +LOSTU can be substantially faster than using uncertainty-aware +Levenberg-Marquardt (or similar) optimization schemes, while providing results +of comparable precision. Finally, LOSTU is implemented in sequential +reconstruction in conjunction with uncertainty-aware pose estimation, where it +yields better reconstruction metrics. + +
+
+ comment: 19 pages, 5 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Lifelong Person Re-Identification with Backward-Compatibility + + +
+ Lifelong person re-identification (LReID) assumes a practical scenario where
+the model is sequentially trained on continuously incoming datasets while
+alleviating catastrophic forgetting on the old datasets. However, not only the
+training datasets but also the gallery images are incrementally accumulated,
+which requires a huge amount of computation and storage space to extract the
+features at the inference phase. In this paper, we address the above-mentioned
+problem by incorporating backward-compatibility into LReID for the first time.
+We train the model using the continuously incoming datasets while maintaining
+the model's compatibility toward the previously trained old models without
+re-computing the features of the old gallery images. To this end, we devise a
+cross-model compatibility loss based on contrastive learning with respect to
+the replay features across all the old datasets. Moreover, we also develop a
+knowledge consolidation method based on part classification to learn the shared
+representation across different datasets for backward-compatibility. We suggest
+a more practical methodology for performance evaluation as well, where all the
+gallery and query images are considered together. Experimental results
+demonstrate that the proposed method achieves significantly higher
+backward-compatibility performance than the existing methods. It is a promising
+tool for more practical scenarios of LReID.
+ 
+
+ comment: 17 pages, 5 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ Frequency-Adaptive Dilated Convolution for Semantic Segmentation + + +
+ Dilated convolution, which expands the receptive field by inserting gaps +between its consecutive elements, is widely employed in computer vision. In +this study, we propose three strategies to improve individual phases of dilated +convolution from the view of spectrum analysis. Departing from the conventional +practice of fixing a global dilation rate as a hyperparameter, we introduce +Frequency-Adaptive Dilated Convolution (FADC), which dynamically adjusts +dilation rates spatially based on local frequency components. Subsequently, we +design two plug-in modules to directly enhance effective bandwidth and +receptive field size. The Adaptive Kernel (AdaKern) module decomposes +convolution weights into low-frequency and high-frequency components, +dynamically adjusting the ratio between these components on a per-channel +basis. By increasing the high-frequency part of convolution weights, AdaKern +captures more high-frequency components, thereby improving effective bandwidth. +The Frequency Selection (FreqSelect) module optimally balances high- and +low-frequency components in feature representations through spatially variant +reweighting. It suppresses high frequencies in the background to encourage FADC +to learn a larger dilation, thereby increasing the receptive field for an +expanded scope. Extensive experiments on segmentation and object detection +consistently validate the efficacy of our approach. The code is publicly +available at \url{https://github.com/Linwei-Chen/FADC}. + +
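+ One simplified reading of the AdaKern idea (not the released implementation): treat each kernel's spatial mean as its low-frequency (DC) component, the residual as its high-frequency component, and re-mix them with learnable per-channel gains:
+
+     import torch
+     import torch.nn as nn
+     import torch.nn.functional as F
+
+     class AdaKernConv2d(nn.Module):
+         def __init__(self, conv: nn.Conv2d):
+             super().__init__()
+             self.conv = conv
+             self.low_gain = nn.Parameter(torch.ones(conv.out_channels, 1, 1, 1))
+             self.high_gain = nn.Parameter(torch.ones(conv.out_channels, 1, 1, 1))
+
+         def forward(self, x):
+             w = self.conv.weight
+             low = w.mean(dim=(2, 3), keepdim=True).expand_as(w)   # DC part
+             high = w - low                                        # detail part
+             w_mix = self.low_gain * low + self.high_gain * high
+             return F.conv2d(x, w_mix, self.conv.bias, self.conv.stride,
+                             self.conv.padding, self.conv.dilation, self.conv.groups)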
+
+
+
+
+ + ♻ ☆ GPT-4V with Emotion: A Zero-shot Benchmark for Generalized Emotion + Recognition + + +
+ Recently, GPT-4 with Vision (GPT-4V) has demonstrated remarkable visual +capabilities across various tasks, but its performance in emotion recognition +has not been fully evaluated. To bridge this gap, we present the quantitative +evaluation results of GPT-4V on 21 benchmark datasets covering 6 tasks: visual +sentiment analysis, tweet sentiment analysis, micro-expression recognition, +facial emotion recognition, dynamic facial emotion recognition, and multimodal +emotion recognition. This paper collectively refers to these tasks as +``Generalized Emotion Recognition (GER)''. Through experimental analysis, we +observe that GPT-4V exhibits strong visual understanding capabilities in GER +tasks. Meanwhile, GPT-4V shows the ability to integrate multimodal clues and +exploit temporal information, which is also critical for emotion recognition. +However, it's worth noting that GPT-4V is primarily designed for general +domains and cannot recognize micro-expressions that require specialized +knowledge. To the best of our knowledge, this paper provides the first +quantitative assessment of GPT-4V for GER tasks. We have open-sourced the code +and encourage subsequent researchers to broaden the evaluation scope by +including more tasks and datasets. Our code and evaluation results are +available at: https://github.com/zeroQiaoba/gpt4v-emotion. + +
+
+
+
+
+ + ♻ ☆ DynaMoN: Motion-Aware Fast and Robust Camera Localization for Dynamic + Neural Radiance Fields + + +
+ The accurate reconstruction of dynamic scenes with neural radiance fields is +significantly dependent on the estimation of camera poses. Widely used +structure-from-motion pipelines encounter difficulties in accurately tracking +the camera trajectory when faced with separate dynamics of the scene content +and the camera movement. To address this challenge, we propose DynaMoN. DynaMoN +utilizes semantic segmentation and generic motion masks to handle dynamic +content for initial camera pose estimation and statics-focused ray sampling for +fast and accurate novel-view synthesis. Our novel iterative learning scheme +switches between training the NeRF and updating the pose parameters for an +improved reconstruction and trajectory estimation quality. The proposed +pipeline shows significant acceleration of the training process. We extensively +evaluate our approach on two real-world dynamic datasets, the TUM RGB-D and the +BONN RGB-D Dynamic dataset. DynaMoN improves over the state-of-the-art both in +terms of reconstruction quality and trajectory accuracy. We plan to make our +code public to enhance research in this area. + +
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ A survey of synthetic data augmentation methods in computer vision + + +
+ The standard approach to tackling computer vision problems is to train deep +convolutional neural network (CNN) models using large-scale image datasets +which are representative of the target task. However, in many scenarios, it is +often challenging to obtain sufficient image data for the target task. Data +augmentation is a way to mitigate this challenge. A common practice is to +explicitly transform existing images in desired ways so as to create the +required volume and variability of training data necessary to achieve good +generalization performance. In situations where data for the target domain is +not accessible, a viable workaround is to synthesize training data from +scratch--i.e., synthetic data augmentation. This paper presents an extensive +review of synthetic data augmentation techniques. It covers data synthesis +approaches based on realistic 3D graphics modeling, neural style transfer +(NST), differential neural rendering, and generative artificial intelligence +(AI) techniques such as generative adversarial networks (GANs) and variational +autoencoders (VAEs). For each of these classes of methods, we focus on the +important data generation and augmentation techniques, general scope of +application and specific use-cases, as well as existing limitations and +possible workarounds. Additionally, we provide a summary of common synthetic +datasets for training computer vision models, highlighting the main features, +application domains and supported tasks. Finally, we discuss the effectiveness +of synthetic data augmentation methods. Since this is the first paper to +explore synthetic data augmentation methods in great detail, we are hoping to +equip readers with the necessary background information and in-depth knowledge +of existing methods and their attendant issues. + +
+
+
+
+
+ + ♻ ☆ On Robust Cross-View Consistency in Self-Supervised Monocular Depth + Estimation + + +
+ Remarkable progress has been made in self-supervised monocular depth +estimation (SS-MDE) by exploring cross-view consistency, e.g., photometric +consistency and 3D point cloud consistency. However, they are very vulnerable +to illumination variance, occlusions, texture-less regions, as well as moving +objects, making them not robust enough to deal with various scenes. To address +this challenge, we study two kinds of robust cross-view consistency in this +paper. Firstly, the spatial offset field between adjacent frames is obtained by +reconstructing the reference frame from its neighbors via deformable alignment, +which is used to align the temporal depth features via a Depth Feature +Alignment (DFA) loss. Secondly, the 3D point clouds of each reference frame and +its nearby frames are calculated and transformed into voxel space, where the +point density in each voxel is calculated and aligned via a Voxel Density +Alignment (VDA) loss. In this way, we exploit the temporal coherence in both +depth feature space and 3D voxel space for SS-MDE, shifting the +"point-to-point" alignment paradigm to the "region-to-region" one. Compared +with the photometric consistency loss as well as the rigid point cloud +alignment loss, the proposed DFA and VDA losses are more robust owing to the +strong representation power of deep features as well as the high tolerance of +voxel density to the aforementioned challenges. Experimental results on several +outdoor benchmarks show that our method outperforms current state-of-the-art +techniques. Extensive ablation study and analysis validate the effectiveness of +the proposed losses, especially in challenging scenes. The code and models are +available at https://github.com/sunnyHelen/RCVC-depth. + +
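+ The Voxel Density Alignment term can be pictured with a simple counting step: voxelize the reference point cloud and a neighbouring cloud warped into the same coordinates, then compare per-voxel counts (a rough sketch; grid size, origin handling, and normalization are assumptions):
+
+     import torch
+
+     def voxel_density(points: torch.Tensor, origin: torch.Tensor,
+                       voxel_size: float, grid: int = 64) -> torch.Tensor:
+         """points: (N, 3). Count points per cell of a fixed cubic grid."""
+         idx = torch.floor((points - origin) / voxel_size).long().clamp(0, grid - 1)
+         flat = (idx[:, 0] * grid + idx[:, 1]) * grid + idx[:, 2]
+         density = torch.zeros(grid ** 3, device=points.device)
+         density.index_add_(0, flat, torch.ones(len(points), device=points.device))
+         return density.view(grid, grid, grid)
+
+     # vda = (voxel_density(pc_ref, o, 0.1) - voxel_density(pc_warp, o, 0.1)).abs().mean()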
+
+
+
+
+ + ♻ ☆ Control and Automation for Industrial Production Storage Zone: + Generation of Optimal Route Using Image Processing + + +
+ Digital image processing (DIP) is of great importance in validating and +guaranteeing parameters that ensure the quality of mass-produced products. +Therefore, this article focuses on developing an industrial automation method +for a zone of a production line model using DIP. The neo-cascade +methodology employed allowed each of the stages to be defined in an adequate +way, ensuring the inclusion of the relevant methods for its development, which +culminated in the modeling, design, implementation, and testing of an +optimal route generation system for a warehouse area, using DIP with +optimization guidelines, in conjunction with an embedded platform and the +connection to programmable logic controllers (PLCs) for its execution. The +system was based on the OpenCV library, a tool focused on artificial vision, +which was implemented on an object-oriented programming (OOP) platform based on +the Java language. It generated the optimal route for the automation of processes +in a scale warehouse area, using the segmentation of objects and the +optimization of flow in networks as pillars, ending with the connection to PLCs +as a method of action, which, if implemented, would eliminate +constraints such as process inefficiency, the use of manpower to perform these +tasks, and inadequate use of resources, among others. +
+
+ comment: 17 figures, 17 tables, from a thesis (2017) +
+
+
+
+
+ + ♻ ☆ DyRoNet: Dynamic Routing and Low-Rank Adapters for Autonomous Driving + Streaming Perception + + +
+ The advancement of autonomous driving systems hinges on the ability to +achieve low-latency and high-accuracy perception. To address this critical +need, this paper introduces the Dynamic Routing Network (DyRoNet), a low-rank +enhanced dynamic routing framework designed for streaming perception in +autonomous driving systems. DyRoNet integrates a suite of pre-trained branch +networks, each meticulously fine-tuned to function under distinct environmental +conditions. At its core, the framework offers a speed router module, developed +to assess and route input data to the most suitable branch for processing. This +approach not only addresses the inherent limitations of conventional models in +adapting to diverse driving conditions but also ensures the balance between +performance and efficiency. Extensive experimental evaluations demonstrate +the adaptability of DyRoNet to diverse branch selection strategies, resulting +in significant performance enhancements across different scenarios. This work +not only establishes a new benchmark for streaming perception but also provides +valuable engineering insights for future work. +
+
+ comment: Project: https://tastevision.github.io/DyRoNet/ +
+
+
+
+
+ + ♻ ☆ Boundary Attention: Learning to Localize Boundaries under High Noise + + +
+ We present a differentiable model that infers explicit boundaries, including +curves, corners and junctions, using a mechanism that we call boundary +attention. Boundary attention is a boundary-aware local attention operation +that, when applied densely and repeatedly, progressively refines a field of +variables that specify an unrasterized description of the local boundary +structure in every overlapping patch within an image. It operates in a +bottom-up fashion, similar to classical methods for sub-pixel edge localization +and edge-linking, but with a higher-dimensional description of local boundary +structure, a notion of spatial consistency that is learned instead of designed, +and a sequence of operations that is end-to-end differentiable. We train our +model using simple synthetic data and then evaluate it using photographs that +were captured under low-light conditions with variable amounts of noise. We +find that our method generalizes to natural images corrupted by real sensor +noise, and predicts consistent boundaries under increasingly noisy conditions +where other state-of-the-art methods fail. + +
+
+ comment: Project website at boundaryattention.github.io: + http://boundaryattention.github.io +
+
+
+
+
+ + ♻ ☆ ANIM: Accurate Neural Implicit Model for Human Reconstruction from a + single RGB-D image CVPR24 + + +
+ Recent progress in human shape learning shows that neural implicit models +are effective in generating 3D human surfaces from a limited number of views, and +even from a single RGB image. However, existing monocular approaches still +struggle to recover fine geometric details such as the face, hands, or cloth +wrinkles. They are also easily prone to depth ambiguities that result in +distorted geometries along the camera optical axis. In this paper, we explore +the benefits of incorporating depth observations in the reconstruction process +by introducing ANIM, a novel method that reconstructs arbitrary 3D human shapes +from single-view RGB-D images with an unprecedented level of accuracy. Our +model learns geometric details from both multi-resolution pixel-aligned and +voxel-aligned features to leverage depth information and enable spatial +relationships, mitigating depth ambiguities. We further enhance the quality of +the reconstructed shape by introducing a depth-supervision strategy, which +improves the accuracy of the signed distance field estimation of points that +lie on the reconstructed surface. Experiments demonstrate that ANIM outperforms +state-of-the-art works that use RGB, surface normals, point cloud or RGB-D data +as input. In addition, we introduce ANIM-Real, a new multi-modal dataset +comprising high-quality scans paired with consumer-grade RGB-D camera captures, and our +protocol to fine-tune ANIM, enabling high-quality reconstruction from +real-world human capture. +
+
+ comment: Accepted to CVPR24; Project page: + https://marcopesavento.github.io/ANIM/ +
+
+
+
+
+ + ♻ ☆ T-MARS: Improving Visual Representations by Circumventing Text Feature + Learning ICLR 2024 + + +
+ Large web-sourced multimodal datasets have powered a slew of new methods for +learning general-purpose visual representations, advancing the state of the art +in computer vision and revolutionizing zero- and few-shot recognition. One +crucial decision facing practitioners is how, if at all, to curate these +ever-larger datasets. For example, the creators of the LAION-5B dataset chose +to retain only image-caption pairs whose CLIP similarity score exceeded a +designated threshold. In this paper, we propose a new state-of-the-art data +filtering approach motivated by our observation that nearly 40% of LAION's +images contain text that overlaps significantly with the caption. Intuitively, +such data could be wasteful as it incentivizes models to perform optical +character recognition rather than learning visual features. However, naively +removing all such data could also be wasteful, as it throws away images that +contain visual features (in addition to overlapping text). Our simple and +scalable approach, T-MARS (Text Masking and Re-Scoring), filters out only those +pairs where the text dominates the remaining visual features -- by first +masking out the text and then filtering out those with a low CLIP similarity +score of the masked image. Experimentally, T-MARS outperforms the top-ranked +method on the "medium scale" of DataComp (a data filtering benchmark) by a +margin of 6.5% on ImageNet and 4.7% on VTAB. Additionally, our systematic +evaluation on various data pool sizes from 2M to 64M shows that the accuracy +gains enjoyed by T-MARS linearly increase as data and compute are scaled +exponentially. Code is available at https://github.com/locuslab/T-MARS. + +
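+ The masking-then-rescoring step can be pictured with a short Python sketch; here
+text_boxes_fn and clip_score_fn are hypothetical stand-ins for an OCR text detector
+and a CLIP image-text scorer, and the threshold is arbitrary rather than the value
+used in the paper:
+ <pre>
+ from typing import Callable, List, Sequence, Tuple
+ import numpy as np
+
+ def t_mars_filter(
+     images: Sequence[np.ndarray],                        # H x W x 3 uint8 arrays
+     captions: Sequence[str],
+     text_boxes_fn: Callable[[np.ndarray], List[Tuple[int, int, int, int]]],
+     clip_score_fn: Callable[[np.ndarray, str], float],
+     threshold: float = 0.28,                             # arbitrary cutoff for this sketch
+ ) -> List[int]:
+     # Keep only pairs whose *masked* image still matches its caption.
+     kept = []
+     for i, (img, cap) in enumerate(zip(images, captions)):
+         masked = img.copy()
+         for (x0, y0, x1, y1) in text_boxes_fn(img):
+             masked[y0:y1, x0:x1] = masked.mean(axis=(0, 1))  # paint over detected text
+         if clip_score_fn(masked, cap) >= threshold:          # re-score the masked image
+             kept.append(i)
+     return kept
+ </pre>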
+
+ comment: Accepted to ICLR 2024. Oral at ICCV Datacomp 2023 +
+
+
+
+
+ + ♻ ☆ COLE: A Hierarchical Generation Framework for Multi-Layered and Editable + Graphic Design + + +
+ Graphic design, which has been evolving since the 15th century, plays a +crucial role in advertising. The creation of high-quality designs demands +design-oriented planning, reasoning, and layer-wise generation. Unlike the +recent CanvaGPT, which integrates GPT-4 with existing design templates to build +a custom GPT, this paper introduces the COLE system - a hierarchical generation +framework designed to comprehensively address these challenges. This COLE +system can transform a vague intention prompt into a high-quality multi-layered +graphic design, while also supporting flexible editing based on user input. +Examples of such input might include directives like ``design a poster for +Hisaishi's concert.'' The key insight is to dissect the complex task of +text-to-design generation into a hierarchy of simpler sub-tasks, each addressed +by specialized models working collaboratively. The results from these models +are then consolidated to produce a cohesive final output. Our hierarchical task +decomposition can streamline the complex process and significantly enhance +generation reliability. Our COLE system comprises multiple fine-tuned Large +Language Models (LLMs), Large Multimodal Models (LMMs), and Diffusion Models +(DMs), each specifically tailored for design-aware layer-wise captioning, +layout planning, reasoning, and the task of generating images and text. +Furthermore, we construct the DESIGNINTENTION benchmark to demonstrate the +superiority of our COLE system over existing methods in generating high-quality +graphic designs from user intent. Last, we present a Canva-like multi-layered +image editing tool to support flexible editing of the generated multi-layered +graphic design images. We perceive our COLE system as an important step towards +addressing more complex and multi-layered graphic design generation tasks in +the future. + +
+
+ comment: Technical report. Project page: + https://graphic-design-generation-github-io.vercel.app/ +
+
+
+
+
+ + ♻ ☆ ImageNet-OOD: Deciphering Modern Out-of-Distribution Detection + Algorithms ICLR 2024 + + +
+ The task of out-of-distribution (OOD) detection is notoriously ill-defined. +Earlier works focused on new-class detection, aiming to identify label-altering +data distribution shifts, also known as "semantic shift." However, recent works +argue for a focus on failure detection, expanding the OOD evaluation framework +to account for label-preserving data distribution shifts, also known as +"covariate shift." Intriguingly, under this new framework, complex OOD +detectors that were previously considered state-of-the-art now perform +similarly to, or even worse than, the simple maximum softmax probability +baseline. This raises the question: what are the latest OOD detectors actually +detecting? Deciphering the behavior of OOD detection algorithms requires +evaluation datasets that decouple semantic shift and covariate shift. To aid +our investigations, we present ImageNet-OOD, a clean semantic shift dataset +that minimizes the interference of covariate shift. Through comprehensive +experiments, we show that OOD detectors are more sensitive to covariate shift +than to semantic shift, and that the benefits of recent OOD detection algorithms on +semantic shift detection are minimal. Our dataset and analyses provide important +insights for guiding the design of future OOD detectors. +
+
+ comment: ICLR 2024. Code and dataset at + https://github.com/princetonvisualai/imagenetood +
+
+
+
+
+ + ♻ ☆ Evaluating the Utility of Conformal Prediction Sets for AI-Advised Image + Labeling + + +
+ As deep neural networks are more commonly deployed in high-stakes domains, +their black-box nature makes uncertainty quantification challenging. We +investigate the effects of presenting conformal prediction sets--a +distribution-free class of methods for generating prediction sets with +specified coverage--to express uncertainty in AI-advised decision-making. +Through a large online experiment, we compare the utility of conformal +prediction sets to displays of Top-1 and Top-k predictions for AI-advised image +labeling. In a pre-registered analysis, we find that the utility of prediction +sets for accuracy varies with the difficulty of the task: while they result in +accuracy on par with or less than Top-1 and Top-k displays for easy images, +prediction sets excel at assisting humans in labeling out-of-distribution (OOD) +images, especially when the set size is small. Our results empirically pinpoint +practical challenges of conformal prediction sets and provide implications on +how to incorporate them for real-world decision-making. + +
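+ For readers unfamiliar with how such sets are built, a generic split-conformal
+sketch in NumPy follows (the standard recipe, not the specific configuration
+studied in this experiment):
+ <pre>
+ import numpy as np
+
+ def conformal_threshold(cal_probs, cal_labels, alpha=0.1):
+     # Split conformal: score = 1 - softmax prob of the true class; take the
+     # ceil((n+1)(1-alpha))-th smallest calibration score as the threshold.
+     n = len(cal_labels)
+     scores = 1.0 - cal_probs[np.arange(n), cal_labels]
+     k = min(int(np.ceil((n + 1) * (1 - alpha))) - 1, n - 1)
+     return np.sort(scores)[k]
+
+ def prediction_set(probs, qhat):
+     # All classes whose score (1 - prob) does not exceed the threshold.
+     return np.where(1.0 - probs <= qhat)[0]
+
+ rng = np.random.default_rng(1)                      # toy 3-class calibration data
+ cal_probs = rng.dirichlet(np.ones(3), size=500)
+ cal_labels = rng.integers(0, 3, size=500)
+ qhat = conformal_threshold(cal_probs, cal_labels, alpha=0.1)
+ print(prediction_set(np.array([0.6, 0.3, 0.1]), qhat))
+ </pre>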
+
+ comment: 19 pages, 11 figures, 10 tables. Accepted by ACM CHI 2024 +
+
+
+
+
+ + ♻ ☆ Touch-GS: Visual-Tactile Supervised 3D Gaussian Splatting + + +
+ In this work, we propose a novel method to supervise 3D Gaussian Splatting +(3DGS) scenes using optical tactile sensors. Optical tactile sensors have +become widespread in their use in robotics for manipulation and object +representation; however, raw optical tactile sensor data is unsuitable to +directly supervise a 3DGS scene. Our representation leverages a Gaussian +Process Implicit Surface to implicitly represent the object, combining many +touches into a unified representation with uncertainty. We merge this model +with a monocular depth estimation network, which is aligned in a two stage +process, coarsely aligning with a depth camera and then finely adjusting to +match our touch data. For every training image, our method produces a +corresponding fused depth and uncertainty map. Utilizing this additional +information, we propose a new loss function, variance weighted depth supervised +loss, for training the 3DGS scene model. We leverage the DenseTact optical +tactile sensor and RealSense RGB-D camera to show that combining touch and +vision in this manner leads to quantitatively and qualitatively better results +than vision or touch alone in a few-view scene syntheses on opaque as well as +on reflective and transparent objects. Please see our project page at +http://armlabstanford.github.io/touch-gs + +
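+ A plausible reading of the variance-weighted depth term, sketched in NumPy
+(illustrative only; the paper's exact weighting and its integration into 3DGS
+training are not reproduced here):
+ <pre>
+ import numpy as np
+
+ def variance_weighted_depth_loss(pred_depth, fused_depth, variance, eps=1e-6):
+     # Down-weight the depth residual wherever the fused touch/vision depth is
+     # uncertain (high variance); a plausible reading, not the exact formulation.
+     weights = 1.0 / (variance + eps)
+     return float(np.mean(weights * (pred_depth - fused_depth) ** 2))
+
+ rng = np.random.default_rng(0)                       # toy 64x64 depth maps
+ pred, fused = rng.random((64, 64)), rng.random((64, 64))
+ print(variance_weighted_depth_loss(pred, fused, variance=0.05 + rng.random((64, 64))))
+ </pre>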
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ BeyondPixels: A Comprehensive Review of the Evolution of Neural Radiance + Fields + + +
+ Neural rendering combines ideas from classical computer graphics and machine +learning to synthesize images from real-world observations. NeRF, short for +Neural Radiance Fields, is a recent innovation that uses AI algorithms to +create 3D objects from 2D images. By leveraging an interpolation approach, NeRF +can produce new 3D reconstructed views of complicated scenes. Rather than +directly restoring the whole 3D scene geometry, NeRF generates a volumetric +representation called a ``radiance field,'' which is capable of creating color +and density for every point within the relevant 3D space. The broad appeal and +notoriety of NeRF make it imperative to examine the existing research on the +topic comprehensively. While previous surveys on 3D rendering have primarily +focused on traditional computer vision-based or deep learning-based approaches, +only a handful of them discuss the potential of NeRF. However, such surveys +have predominantly focused on NeRF's early contributions and have not explored +its full potential. NeRF is a relatively new technique continuously being +investigated for its capabilities and limitations. This survey reviews recent +advances in NeRF and categorizes them according to their architectural designs, +especially in the field of novel view synthesis. + +
+
+ comment: 33 pages, 7 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Predicting Generalization of AI Colonoscopy Models to Unseen Data + + +
+ Background: Generalizability of AI colonoscopy algorithms is important for +wider adoption in clinical practice. However, current techniques for evaluating +performance on unseen data require expensive and time-intensive labels. + Methods: We use a "Masked Siamese Network" (MSN) to identify novel phenomena +in unseen data and predict polyp detector performance. MSN is trained to +predict masked out regions of polyp images, without any labels. We test MSN's +ability to be trained on data only from Israel and detect unseen techniques, +narrow-band imaging (NBI) and chromoendoscopy (CE), on colonoscopes from Japan +(354 videos, 128 hours). We also test MSN's ability to predict performance of +Computer Aided Detection (CADe) of polyps on colonoscopies from both countries, +even though MSN is not trained on data from Japan. + Results: MSN correctly identifies NBI and CE as less similar to Israel +whitelight than Japan whitelight (bootstrapped z-test, |z| > 496, p < 10^-8 for +both) using the label-free Frechet distance. MSN detects NBI with 99% accuracy, +predicts CE better than our heuristic (90% vs 79% accuracy) despite being +trained only on whitelight, and is the only method that is robust to noisy +labels. MSN predicts CADe polyp detector performance on in-domain Israel and +out-of-domain Japan colonoscopies (r=0.79, 0.37 respectively). With few +examples of Japan detector performance to train on, MSN prediction of Japan +performance improves (r=0.56). + Conclusion: Our technique can identify distribution shifts in clinical data +and can predict CADe detector performance on unseen data, without labels. Our +self-supervised approach can aid in detecting when data in practice is +different from training, such as between hospitals or when data has meaningfully +shifted from training. MSN has potential for application to medical image +domains beyond colonoscopy. +
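+ The label-free Frechet distance mentioned in the results is, in spirit, the same
+Gaussian-fit Frechet distance used by FID; a small NumPy/SciPy sketch of that
+generic formula (not the paper's code):
+ <pre>
+ import numpy as np
+ from scipy.linalg import sqrtm
+
+ def frechet_distance(feats_a, feats_b):
+     # Frechet distance between Gaussians fitted to two sets of embeddings
+     # (the FID formula); feats_* are N x D feature arrays.
+     mu_a, mu_b = feats_a.mean(axis=0), feats_b.mean(axis=0)
+     cov_a = np.cov(feats_a, rowvar=False)
+     cov_b = np.cov(feats_b, rowvar=False)
+     covmean = sqrtm(cov_a @ cov_b)
+     if np.iscomplexobj(covmean):
+         covmean = covmean.real                       # drop tiny imaginary parts
+     diff = mu_a - mu_b
+     return float(diff @ diff + np.trace(cov_a + cov_b - 2.0 * covmean))
+
+ rng = np.random.default_rng(0)                       # toy 16-D embeddings
+ print(frechet_distance(rng.normal(size=(500, 16)), rng.normal(1.0, 1.0, size=(500, 16))))
+ </pre>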
+
+
+
+
+ + ♻ ☆ DCVNet: Dilated Cost Volume Networks for Fast Optical Flow + + +
+ The cost volume, capturing the similarity of possible correspondences across +two input images, is a key ingredient in state-of-the-art optical flow +approaches. When sampling correspondences to build the cost volume, a large +neighborhood radius is required to deal with large displacements, introducing a +significant computational burden. To address this, coarse-to-fine or recurrent +processing of the cost volume is usually adopted, where correspondence sampling +in a local neighborhood with a small radius suffices. In this paper, we propose +an alternative by constructing cost volumes with different dilation factors to +capture small and large displacements simultaneously. A U-Net with skip +connections is employed to convert the dilated cost volumes into interpolation +weights between all possible captured displacements to get the optical flow. +Our proposed model DCVNet only needs to process the cost volume once in a +simple feedforward manner and does not rely on the sequential processing +strategy. DCVNet obtains comparable accuracy to existing approaches and +achieves real-time inference (30 fps on a mid-end 1080ti GPU). The code and +model weights are available at https://github.com/neu-vi/ezflow. + +
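+ A rough NumPy sketch of a single dilated correlation cost volume (illustrative
+only; per the abstract, DCVNet builds several such volumes with different dilation
+factors and converts them with a U-Net, which is not shown here):
+ <pre>
+ import numpy as np
+
+ def dilated_cost_volume(feat1, feat2, radius=3, dilation=2):
+     # Correlation cost volume: for each pixel of feat1, dot-product with feat2
+     # at displacements {-radius..radius} * dilation.  feat* are C x H x W arrays.
+     C, H, W = feat1.shape
+     pad = radius * dilation
+     padded = np.pad(feat2, ((0, 0), (pad, pad), (pad, pad)))
+     offsets = [(dy * dilation, dx * dilation)
+                for dy in range(-radius, radius + 1)
+                for dx in range(-radius, radius + 1)]
+     cost = np.zeros((len(offsets), H, W), dtype=feat1.dtype)
+     for k, (dy, dx) in enumerate(offsets):
+         shifted = padded[:, pad + dy: pad + dy + H, pad + dx: pad + dx + W]
+         cost[k] = (feat1 * shifted).sum(axis=0) / C
+     return cost   # stack such volumes for several dilation factors
+
+ rng = np.random.default_rng(0)                       # toy 32-channel feature maps
+ f1, f2 = rng.normal(size=(32, 48, 64)), rng.normal(size=(32, 48, 64))
+ print(dilated_cost_volume(f1, f2).shape)             # (49, 48, 64)
+ </pre>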
+
+
+
+
+ + ♻ ☆ MVDiffusion++: A Dense High-resolution Multi-view Diffusion Model for + Single or Sparse-view 3D Object Reconstruction + + +
+ This paper presents a neural architecture MVDiffusion++ for 3D object +reconstruction that synthesizes dense and high-resolution views of an object +given one or a few images without camera poses. MVDiffusion++ achieves superior +flexibility and scalability with two surprisingly simple ideas: 1) A +``pose-free architecture'' where standard self-attention among 2D latent +features learns 3D consistency across an arbitrary number of conditional and +generation views without explicitly using camera pose information; and 2) A +``view dropout strategy'' that discards a substantial number of output views +during training, which reduces the training-time memory footprint and enables +dense and high-resolution view synthesis at test time. We use the Objaverse dataset for +training and the Google Scanned Objects dataset for evaluation with standard novel view +synthesis and 3D reconstruction metrics, where MVDiffusion++ significantly +outperforms the current state of the art. We also demonstrate a text-to-3D +application example by combining MVDiffusion++ with a text-to-image generative +model. The project page is at https://mvdiffusion-plusplus.github.io. +
+
+ comment: 3D generation, project page: https://mvdiffusion-plusplus.github.io/ +
+
+
+
+
+ + ♻ ☆ HD-Painter: High-Resolution and Prompt-Faithful Text-Guided Image + Inpainting with Diffusion Models + + +
+ Recent progress in text-guided image inpainting, based on the unprecedented +success of text-to-image diffusion models, has led to exceptionally realistic +and visually plausible results. However, there is still significant potential +for improvement in current text-to-image inpainting models, particularly in +better aligning the inpainted area with user prompts and performing +high-resolution inpainting. Therefore, we introduce HD-Painter, a training free +approach that accurately follows prompts and coherently scales to high +resolution image inpainting. To this end, we design the Prompt-Aware +Introverted Attention (PAIntA) layer enhancing self-attention scores by prompt +information resulting in better text aligned generations. To further improve +the prompt coherence we introduce the Reweighting Attention Score Guidance +(RASG) mechanism seamlessly integrating a post-hoc sampling strategy into the +general form of DDIM to prevent out-of-distribution latent shifts. Moreover, +HD-Painter allows extension to larger scales by introducing a specialized +super-resolution technique customized for inpainting, enabling the completion +of missing regions in images of up to 2K resolution. Our experiments +demonstrate that HD-Painter surpasses existing state-of-the-art approaches +quantitatively and qualitatively across multiple metrics and a user study. Code +is publicly available at: https://github.com/Picsart-AI-Research/HD-Painter + +
+
+
+
+
+ + ♻ ☆ StillFast: An End-to-End Approach for Short-Term Object Interaction + Anticipation + + +
+ The anticipation problem has been studied considering different aspects such as +predicting humans' locations, predicting hand and object trajectories, and +forecasting actions and human-object interactions. In this paper, we studied +the short-term object interaction anticipation problem from the egocentric +point of view, proposing a new end-to-end architecture named StillFast. Our +approach simultaneously processes a still image and a video, detecting and +localizing next-active objects, predicting the verb which describes the future +interaction and determining when the interaction will start. Experiments on the +large-scale egocentric dataset EGO4D show that our method outperformed +state-of-the-art approaches on the considered task. Our method is ranked first +on the public leaderboard of the EGO4D short term object interaction +anticipation challenge 2022. Please see the project web page for code and +additional details: https://iplab.dmi.unict.it/stillfast/. +
+
+
+
+
+ + ♻ ☆ PerceptionCLIP: Visual Classification by Inferring and Conditioning on + Contexts ICLR 2024 + + +
+ Vision-language models like CLIP are widely used in zero-shot image +classification due to their ability to understand various visual concepts and +natural language descriptions. However, how to fully leverage CLIP's +unprecedented human-like understanding capabilities to achieve better +performance is still an open question. This paper draws inspiration from the +human visual perception process: when classifying an object, humans first infer +contextual attributes (e.g., background and orientation) which help separate +the foreground object from the background, and then classify the object based +on this information. Inspired by this, we observe that providing CLIP with +contextual attributes improves zero-shot image classification and mitigates +reliance on spurious features. We also observe that CLIP itself can reasonably +infer the attributes from an image. With these observations, we propose a +training-free, two-step zero-shot classification method PerceptionCLIP. Given +an image, it first infers contextual attributes (e.g., background) and then +performs object classification conditioning on them. Our experiments show that +PerceptionCLIP achieves better generalization, group robustness, and +interpretability. Our code is available at +https://github.com/umd-huang-lab/perceptionCLIP +
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ DIG In: Evaluating Disparities in Image Generations with Indicators for + Geographic Diversity + + +
+ The unprecedented photorealistic results achieved by recent text-to-image +generative systems and their increasing use as plug-and-play content creation +solutions make it crucial to understand their potential biases. In this work, +we introduce three indicators to evaluate the realism, diversity and +prompt-generation consistency of text-to-image generative systems when prompted +to generate objects from across the world. Our indicators complement +qualitative analysis of the broader impact of such systems by enabling +automatic and efficient benchmarking of geographic disparities, an important +step towards building responsible visual content creation systems. We use our +proposed indicators to analyze potential geographic biases in state-of-the-art +visual content creation systems and find that: (1) models have less realism and +diversity of generations when prompting for Africa and West Asia than Europe, +(2) prompting with geographic information comes at a cost to prompt-consistency +and diversity of generated images, and (3) models exhibit more region-level +disparities for some objects than others. Perhaps most interestingly, our +indicators suggest that progress in image generation quality has come at the +cost of real-world geographic representation. Our comprehensive evaluation +constitutes a crucial step towards ensuring a positive experience of visual +content creation for everyone. + +
+
+
+
+
+ + ♻ ☆ Farm3D: Learning Articulated 3D Animals by Distilling 2D Diffusion 3DV 2024 + + +
+ We present Farm3D, a method for learning category-specific 3D reconstructors +for articulated objects, relying solely on "free" virtual supervision from a +pre-trained 2D diffusion-based image generator. Recent approaches can learn a +monocular network that predicts the 3D shape, albedo, illumination, and +viewpoint of any object occurrence, given a collection of single-view images of +an object category. However, these approaches heavily rely on manually curated +clean training data, which are expensive to obtain. We propose a framework that +uses an image generator, such as Stable Diffusion, to generate synthetic +training data that are sufficiently clean and do not require further manual +curation, enabling the learning of such a reconstruction network from scratch. +Additionally, we incorporate the diffusion model as a score to enhance the +learning process. The idea involves randomizing certain aspects of the +reconstruction, such as viewpoint and illumination, generating virtual views of +the reconstructed 3D object, and allowing the 2D network to assess the quality +of the resulting image, thus providing feedback to the reconstructor. Unlike +work based on distillation, which produces a single 3D asset for each textual +prompt, our approach yields a monocular reconstruction network capable of +outputting a controllable 3D asset from any given image, whether real or +generated, in a single forward pass in a matter of seconds. Our network can be +used for analysis, including monocular reconstruction, or for synthesis, +generating articulated assets for real-time applications such as video games. + +
+
+ comment: In 3DV 2024, Project page: http://farm3d.github.io +
+
+
+
+
+ + ♻ ☆ Multi-Tailed Vision Transformer for Efficient Inference + + +
+ Recently, Vision Transformer (ViT) has achieved promising performance in +image recognition and gradually serves as a powerful backbone in various vision +tasks. To satisfy the sequential input of the Transformer, the tail of ViT first +splits each image into a sequence of visual tokens with a fixed length. Then +the following self-attention layers construct the global relationship between +tokens to produce useful representations for the downstream tasks. Empirically, +representing the image with more tokens leads to better performance, yet the +quadratic computational complexity of the self-attention layer with respect to the number of +tokens could seriously influence the efficiency of ViT's inference. For +computational reduction, a few pruning methods progressively prune +uninformative tokens in the Transformer encoder, while leaving the number of +tokens before the Transformer untouched. In fact, fewer tokens as the input for +the Transformer encoder can directly reduce the following computational cost. +In this spirit, we propose a Multi-Tailed Vision Transformer (MT-ViT) in this +paper. MT-ViT adopts multiple tails to produce visual sequences of different +lengths for the following Transformer encoder. A tail predictor is introduced +to decide which tail is the most efficient for the image to produce an accurate +prediction. Both modules are optimized in an end-to-end fashion, with the +Gumbel-Softmax trick. Experiments on ImageNet-1K demonstrate that MT-ViT can +achieve a significant reduction in FLOPs with no degradation of accuracy +and outperform other compared methods in both accuracy and FLOPs. +
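+ The Gumbel-Softmax trick referenced above can be sketched as follows (a generic
+illustration, not the MT-ViT code; the tail logits and temperature are made up):
+ <pre>
+ import numpy as np
+
+ def gumbel_softmax(logits, tau=1.0, rng=None):
+     # Sample soft selection weights over candidate tails; at low temperature
+     # the weights approach a one-hot choice (standard trick, illustrative only).
+     rng = rng or np.random.default_rng()
+     gumbel = -np.log(-np.log(rng.uniform(1e-9, 1.0, size=logits.shape)))
+     y = (logits + gumbel) / tau
+     y = np.exp(y - y.max())
+     return y / y.sum()
+
+ weights = gumbel_softmax(np.array([1.2, 0.3, -0.5]), tau=0.5)   # 3 hypothetical tails
+ print("selected tail:", int(weights.argmax()), weights.round(3))
+ </pre>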
+
+
+
+
+ + ♻ ☆ A Theoretical and Practical Framework for Evaluating Uncertainty + Calibration in Object Detection + + +
+ The proliferation of Deep Neural Networks has resulted in machine learning +systems becoming increasingly more present in various real-world applications. +Consequently, there is a growing demand for highly reliable models in many +domains, making the problem of uncertainty calibration pivotal when considering +the future of deep learning. This is especially true when considering object +detection systems, that are commonly present in safety-critical applications +such as autonomous driving, robotics and medical diagnosis. For this reason, +this work presents a novel theoretical and practical framework to evaluate +object detection systems in the context of uncertainty calibration. This +encompasses a new comprehensive formulation of this concept through distinct +formal definitions, and also three novel evaluation metrics derived from such +theoretical foundation. The robustness of the proposed uncertainty calibration +metrics is shown through a series of representative experiments. + +
+
+ comment: Pre-print +
+
+
+
+
+ + ♻ ☆ LHRS-Bot: Empowering Remote Sensing with VGI-Enhanced Large Multimodal + Language Model + + +
+ The revolutionary capabilities of large language models (LLMs) have paved the +way for multimodal large language models (MLLMs) and fostered diverse +applications across various specialized domains. In the remote sensing (RS) +field, however, the diverse geographical landscapes and varied objects in RS +imagery are not adequately considered in recent MLLM endeavors. To bridge this +gap, we construct a large-scale RS image-text dataset, LHRS-Align, and an +informative RS-specific instruction dataset, LHRS-Instruct, leveraging the +extensive volunteered geographic information (VGI) and globally available RS +images. Building on this foundation, we introduce LHRS-Bot, an MLLM tailored +for RS image understanding through a novel multi-level vision-language +alignment strategy and a curriculum learning method. Additionally, we introduce +LHRS-Bench, a benchmark for thoroughly evaluating MLLMs' abilities in RS image +understanding. Comprehensive experiments demonstrate that LHRS-Bot exhibits a +profound understanding of RS images and the ability to perform nuanced +reasoning within the RS domain. + +
+
+ comment: 36 pages, 10 figures. Github https://github.com/NJU-LHRS/LHRS-Bot +
+
+
+
+
+ + ♻ ☆ SlimSAM: 0.1% Data Makes Segment Anything Slim + + +
+ Current approaches for compressing the Segment Anything Model (SAM) yield +commendable results, yet necessitate extensive data to train a new network from +scratch. Employing conventional pruning techniques can remarkably reduce data +requirements but would suffer from a degradation in performance. To address +this challenging trade-off, we introduce SlimSAM, a novel data-efficient SAM +compression method that achieves superior performance with far less +training data. The essence of SlimSAM is encapsulated in the alternate slimming +framework, which effectively enhances knowledge inheritance under severely +limited training data availability and an exceptional pruning ratio. Diverging +from prior techniques, our framework progressively compresses the model by +alternately pruning and distilling distinct, decoupled sub-structures. +Disturbed Taylor pruning is also proposed to address the misalignment between +the pruning objective and training target, thereby boosting the +post-distillation performance after pruning. SlimSAM yields significant performance +improvements while demanding over 10 times less training data than any other +existing compression methods. Even when compared to the original SAM, SlimSAM +achieves comparable performance while reducing parameter counts to merely 1.4% +(9.1M), MACs to 0.8% (23G), and requiring only 0.1% (10k) of the SAM training +data. The code is available at http://github.com/czg1225/SlimSAM. +
+
+ comment: Work in progress. Code reposity: http://github.com/czg1225/SlimSAM +
+
+
+
+
+ + ♻ ☆ EHRDiff: Exploring Realistic EHR Synthesis with Diffusion Models + + +
+ Electronic health records (EHR) contain a wealth of biomedical information, +serving as valuable resources for the development of precision medicine +systems. However, privacy concerns have resulted in limited access to +high-quality and large-scale EHR data for researchers, impeding progress in +methodological development. Recent research has delved into synthesizing +realistic EHR data through generative modeling techniques, where a majority of +proposed methods relied on generative adversarial networks (GAN) and their +variants for EHR synthesis. Despite GAN-based methods attaining +state-of-the-art performance in generating EHR data, these approaches are +difficult to train and prone to mode collapse. Recently introduced in +generative modeling, diffusion models have established cutting-edge performance +in image generation, but their efficacy in EHR data synthesis remains largely +unexplored. In this study, we investigate the potential of diffusion models for +EHR data synthesis and introduce a novel method, EHRDiff. Through extensive +experiments, EHRDiff establishes new state-of-the-art quality for synthetic EHR +data while protecting private information. +
+
+ comment: Accepted by TMLR, preprint of camera-ready version +
+
+
+
+
+ + ♻ ☆ DeepSolo++: Let Transformer Decoder with Explicit Points Solo for + Multilingual Text Spotting CVPR 2023 + + +
+ End-to-end text spotting aims to integrate scene text detection and +recognition into a unified framework. Dealing with the relationship between the +two sub-tasks plays a pivotal role in designing effective spotters. Although +Transformer-based methods eliminate the heuristic post-processing, they still +suffer from the synergy issue between the sub-tasks and low training +efficiency. Besides, they overlook multilingual text spotting, +which requires an extra script identification task. In this paper, we present +DeepSolo++, a simple DETR-like baseline that lets a single decoder with +explicit points solo for text detection, recognition, and script identification +simultaneously. Technically, for each text instance, we represent the character +sequence as ordered points and model them with learnable explicit point +queries. After passing a single decoder, the point queries have encoded +requisite text semantics and locations, and thus can be further decoded to the +center line, boundary, script, and confidence of text via very simple +prediction heads in parallel. Furthermore, we show the surprisingly good +extensibility of our method, in terms of character class, language type, and +task. On the one hand, our method not only performs well in English scenes but +also masters transcription of scripts with complex font structures and +thousand-level character classes, such as Chinese. On the other hand, our DeepSolo++ achieves +better performance on the additionally introduced script identification task +with a simpler training pipeline compared with previous methods. In addition, +our models are also compatible with line annotations, which require much less +annotation cost than polygons. The code is available at +\url{https://github.com/ViTAE-Transformer/DeepSolo}. +
+
+ comment: The extension of the CVPR 2023 paper (DeepSolo: Let Transformer + Decoder with Explicit Points Solo for Text Spotting). arXiv admin note: + substantial text overlap with arXiv:2211.10772 +
+
+
+
+
+ + ♻ ☆ Privacy Protection in MRI Scans Using 3D Masked Autoencoders + + +
+ MRI scans provide valuable medical information, however they also contain +sensitive and personally identifiable information that needs to be protected. +Whereas MRI metadata is easily sanitized, MRI image data is a privacy risk +because it contains information to render highly-realistic 3D visualizations of +a patient's head, enabling malicious actors to possibly identify the subject by +cross-referencing a database. Data anonymization and de-identification is +concerned with ensuring the privacy and confidentiality of individuals' +personal information. Traditional MRI de-identification methods remove +privacy-sensitive parts (e.g. eyes, nose etc.) from a given scan. This comes at +the expense of introducing a domain shift that can throw off downstream +analyses. In this work, we propose CP-MAE, a model that de-identifies the face +by remodeling it (e.g. changing the face) rather than by removing parts using +masked autoencoders. CP-MAE outperforms all previous approaches in terms of +downstream task performance as well as de-identification. With our method we +are able to synthesize high-fidelity scans of resolution up to $256^3$ -- +compared to $128^3$ with previous approaches -- which constitutes an eight-fold +increase in the number of voxels. + +
+
+
+
+
+ + ♻ ☆ Hidden in Plain Sight: Undetectable Adversarial Bias Attacks on + Vulnerable Patient Populations + + +
+ The proliferation of artificial intelligence (AI) in radiology has shed light +on the risk of deep learning (DL) models exacerbating clinical biases towards +vulnerable patient populations. While prior literature has focused on +quantifying biases exhibited by trained DL models, demographically targeted +adversarial bias attacks on DL models and its implication in the clinical +environment remains an underexplored field of research in medical imaging. In +this work, we demonstrate that demographically targeted label poisoning attacks +can introduce undetectable underdiagnosis bias in DL models. Our results across +multiple performance metrics and demographic groups like sex, age, and their +intersectional subgroups show that adversarial bias attacks demonstrate +high-selectivity for bias in the targeted group by degrading group model +performance without impacting overall model performance. Furthermore, our +results indicate that adversarial bias attacks result in biased DL models that +propagate prediction bias even when evaluated with external datasets. + +
+
+ comment: 29 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Depth- and Semantics-aware Multi-modal Domain Translation: Generating 3D + Panoramic Color Images from LiDAR Point Clouds + + +
+ This work presents a new depth- and semantics-aware conditional generative +model, named TITAN-Next, for cross-domain image-to-image translation in a +multi-modal setup between LiDAR and camera sensors. The proposed model +leverages scene semantics as a mid-level representation and is able to +translate raw LiDAR point clouds to RGB-D camera images by solely relying on +semantic scene segments. We claim that this is the first framework of its kind +and it has practical applications in autonomous vehicles such as providing a +fail-safe mechanism and augmenting available data in the target image domain. +The proposed model is evaluated on the large-scale and challenging +Semantic-KITTI dataset, and experimental findings show that it considerably +outperforms the original TITAN-Net and other strong baselines by a 23.7% +margin in terms of IoU. +
+
+
+
+
+ + ♻ ☆ Tag2Text: Guiding Vision-Language Model via Image Tagging ICLR 2024 + + +
+ This paper presents Tag2Text, a vision language pre-training (VLP) framework, +which introduces image tagging into vision-language models to guide the +learning of visual-linguistic features. In contrast to prior works which +utilize object tags either manually labeled or automatically detected with an +off-the-shelf detector with limited performance, our approach explicitly learns +an image tagger using tags parsed from image-paired text and thus provides a +strong semantic guidance to vision-language models. In this way, Tag2Text can +utilize large-scale annotation-free image tags in accordance with image-text +pairs, and provides more diverse tag categories beyond objects. As a result, +Tag2Text demonstrates the ability of a foundational image tagging model, with +superior zero-shot performance even comparable to fully supervised models. +Moreover, by leveraging the tagging guidance, Tag2Text effectively enhances the +performance of vision-language models on both generation-based and +alignment-based tasks. Across a wide range of downstream benchmarks, Tag2Text +achieves state-of-the-art results with similar model sizes and data scales, +demonstrating the efficacy of the proposed tagging guidance. Code, demo and +pre-trained models are available at +https://github.com/xinyu1205/recognize-anything. + +
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Towards Seamless Adaptation of Pre-trained Models for Visual Place + Recognition ICLR2024 + + +
+ Recent studies show that vision models pre-trained in generic visual learning +tasks with large-scale data can provide useful feature representations for a +wide range of visual perception problems. However, few attempts have been made +to exploit pre-trained foundation models in visual place recognition (VPR). Due +to the inherent difference in training objectives and data between the tasks of +model pre-training and VPR, how to bridge the gap and fully unleash the +capability of pre-trained models for VPR is still a key issue to address. To +this end, we propose a novel method to realize seamless adaptation of +pre-trained models for VPR. Specifically, to obtain both global and local +features that focus on salient landmarks for discriminating places, we design a +hybrid adaptation method to achieve both global and local adaptation +efficiently, in which only lightweight adapters are tuned without adjusting the +pre-trained model. Besides, to guide effective adaptation, we propose a mutual +nearest neighbor local feature loss, which ensures proper dense local features +are produced for local matching and avoids time-consuming spatial verification +in re-ranking. Experimental results show that our method outperforms the +state-of-the-art methods with less training data and training time, and uses +only about 3% of the retrieval runtime of two-stage VPR methods with RANSAC-based +spatial verification. It ranks 1st on the MSLS challenge leaderboard (at the +time of submission). The code is released at +https://github.com/Lu-Feng/SelaVPR. +
+
+ comment: ICLR2024 +
+
+
+
+
+ + ♻ ☆ Towards Real-World Blind Face Restoration with Generative Diffusion + Prior + + +
+ Blind face restoration is an important task in computer vision and has gained +significant attention due to its wide-range applications. Previous works mainly +exploit facial priors to restore face images and have demonstrated high-quality +results. However, generating faithful facial details remains a challenging +problem due to the limited prior knowledge obtained from finite data. In this +work, we delve into the potential of leveraging the pretrained Stable Diffusion +for blind face restoration. We propose BFRffusion which is thoughtfully +designed to effectively extract features from low-quality face images and could +restore realistic and faithful facial details with the generative prior of the +pretrained Stable Diffusion. In addition, we build a privacy-preserving face +dataset called PFHQ with balanced attributes like race, gender, and age. This +dataset can serve as a viable alternative for training blind face restoration +networks, effectively addressing privacy and bias concerns usually associated +with the real face datasets. Through an extensive series of experiments, we +demonstrate that our BFRffusion achieves state-of-the-art performance on both +synthetic and real-world public testing datasets for blind face restoration and +our PFHQ dataset is an available resource for training blind face restoration +networks. The codes, pretrained models, and dataset are released at +https://github.com/chenxx89/BFRffusion. + +
+
+
+
+
+ + ♻ ☆ Quality and Quantity: Unveiling a Million High-Quality Images for + Text-to-Image Synthesis in Fashion Design + + +
+ The fusion of AI and fashion design has emerged as a promising research area. +However, the lack of extensive, interrelated data on clothing and try-on stages +has hindered the full potential of AI in this domain. Addressing this, we +present the Fashion-Diffusion dataset, a product of multiple years' rigorous +effort. This dataset, the first of its kind, comprises over a million +high-quality fashion images, paired with detailed text descriptions. Sourced +from a diverse range of geographical locations and cultural backgrounds, the +dataset encapsulates global fashion trends. The images have been meticulously +annotated with fine-grained attributes related to clothing and humans, +simplifying the fashion design process into a Text-to-Image (T2I) task. The +Fashion-Diffusion dataset not only provides high-quality text-image pairs and +diverse human-garment pairs but also serves as a large-scale resource about +humans, thereby facilitating research in T2I generation. Moreover, to foster +standardization in the T2I-based fashion design field, we propose a new +benchmark comprising multiple datasets for evaluating the performance of +fashion design models. This work represents a significant leap forward in the +realm of AI-driven fashion design, setting a new standard for future research +in this field. + +
+
+
+
+
+ + ♻ ☆ Approaching Test Time Augmentation in the Context of Uncertainty + Calibration for Deep Neural Networks + + +
+ With the rise of Deep Neural Networks, machine learning systems are nowadays +ubiquitous in a number of real-world applications, which bears the need for +highly reliable models. This requires a thorough look not only at the accuracy +of such systems, but also at their predictive uncertainty. Hence, we propose a +novel technique (with two different variations, named M-ATTA and V-ATTA) based +on test time augmentation, to improve the uncertainty calibration of deep +models for image classification. By leveraging an adaptive weighting system, +M/V-ATTA improves uncertainty calibration without affecting the model's +accuracy. The performance of these techniques is evaluated by considering +diverse metrics related to uncertainty calibration, demonstrating their +robustness. Empirical results, obtained on CIFAR-10, CIFAR-100, Aerial Image +Dataset, as well as in two different scenarios under distribution-shift, +indicate that the proposed methods outperform several state-of-the-art post-hoc +calibration techniques. Furthermore, the methods proposed also show +improvements in terms of predictive entropy on out-of-distribution samples. +Code for M/V-ATTA available at: https://github.com/pedrormconde/MV-ATTA +
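+ A simplified sketch of weighted test-time augmentation (illustrative only; in
+M/V-ATTA the weights are adaptive and learned, whereas here they are simply
+supplied and normalized):
+ <pre>
+ import numpy as np
+
+ def weighted_tta_probs(probs_per_aug, weights):
+     # probs_per_aug: A x N x K softmax outputs for A augmented copies of N
+     # images; weights: length-A non-negative vector (fixed here, not learned).
+     w = np.asarray(weights, dtype=float)
+     w = w / w.sum()
+     return np.tensordot(w, probs_per_aug, axes=1)    # N x K averaged probabilities
+
+ rng = np.random.default_rng(2)                       # 4 augmentations, 2 images, 3 classes
+ p = rng.dirichlet(np.ones(3), size=(4, 2))
+ print(weighted_tta_probs(p, [0.4, 0.3, 0.2, 0.1]))
+ </pre>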
+
+ comment: Submitted to IEEE Transactions on Pattern Analysis and Machine + Intelligence +
+
+
+
+
+ + ♻ ☆ Towards Lossless Dataset Distillation via Difficulty-Aligned Trajectory + Matching ICLR 2024 + + +
+ The ultimate goal of Dataset Distillation is to synthesize a small synthetic +dataset such that a model trained on this synthetic set will perform equally +well as a model trained on the full, real dataset. Until now, no method of +Dataset Distillation has reached this completely lossless goal, in part due to +the fact that previous methods only remain effective when the total number of +synthetic samples is extremely small. Since only so much information can be +contained in such a small number of samples, it seems that to achieve truly +lossless dataset distillation, we must develop a distillation method that remains +effective as the size of the synthetic dataset grows. In this work, we present +such an algorithm and elucidate why existing methods fail to generate larger, +high-quality synthetic sets. Current state-of-the-art methods rely on +trajectory-matching, or optimizing the synthetic data to induce similar +long-term training dynamics as the real data. We empirically find that the +training stage of the trajectories we choose to match (i.e., early or late) +greatly affects the effectiveness of the distilled dataset. Specifically, early +trajectories (where the teacher network learns easy patterns) work well for a +low-cardinality synthetic set since there are fewer examples wherein to +distribute the necessary information. Conversely, late trajectories (where the +teacher network learns hard patterns) provide better signals for larger +synthetic sets since there are now enough samples to represent the necessary +complex patterns. Based on our findings, we propose to align the difficulty of +the generated patterns with the size of the synthetic dataset. In doing so, we +successfully scale trajectory matching-based methods to larger synthetic +datasets, achieving lossless dataset distillation for the very first time. Code +and distilled datasets are available at https://gzyaftermath.github.io/DATM. +
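+ The trajectory-matching objective referred to above is commonly written as a
+normalized parameter distance; a small NumPy sketch of that generic form (the
+paper's contribution, aligning which expert segments are matched with the
+synthetic set size, is not shown):
+ <pre>
+ import numpy as np
+
+ def trajectory_matching_loss(theta_student_end, theta_expert_target, theta_expert_start):
+     # Normalized parameter-matching objective used by trajectory-matching
+     # distillation: how close the student, trained on synthetic data starting
+     # from theta_expert_start, gets to a later expert checkpoint.
+     num = np.sum((theta_student_end - theta_expert_target) ** 2)
+     den = np.sum((theta_expert_start - theta_expert_target) ** 2) + 1e-12
+     return float(num / den)
+
+ rng = np.random.default_rng(0)                       # toy flattened parameter vectors
+ start, target = rng.normal(size=1000), rng.normal(size=1000)
+ student = 0.5 * start + 0.5 * target                 # halfway toward the expert target
+ print(trajectory_matching_loss(student, target, start))
+ </pre>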
+
+ comment: First lossless dataset distillation method, accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ D-SCo: Dual-Stream Conditional Diffusion for Monocular Hand-Held Object + Reconstruction + + +
+ Reconstructing hand-held objects from a single RGB image is a challenging +task in computer vision. In contrast to prior works that utilize deterministic +modeling paradigms, we employ a point cloud denoising diffusion model to +account for the probabilistic nature of this problem. At its core, we introduce +centroid-fixed dual-stream conditional diffusion for monocular hand-held object +reconstruction (D-SCo), tackling two predominant challenges. First, to prevent +the object centroid from deviating, we utilize a novel hand-constrained +centroid fixing paradigm, enhancing the stability of the diffusion and reverse +processes and the precision of feature projection. Second, we introduce a +dual-stream denoiser to semantically and geometrically model hand-object +interactions with a novel unified hand-object semantic embedding, enhancing the +reconstruction performance of the hand-occluded region of the object. +Experiments on the synthetic ObMan dataset and three real-world datasets HO3D, +MOW and DexYCB demonstrate that our approach can surpass all other +state-of-the-art methods. Codes will be released. +
+
+
+
+
+ + ♻ ☆ Interaction Replica: Tracking Human-Object Interaction and Scene Changes + From Human Motion 3DV'24 + + +
+ Our world is not static and humans naturally cause changes in their +environments through interactions, e.g., opening doors or moving furniture. +Modeling changes caused by humans is essential for building digital twins, +e.g., in the context of shared physical-virtual spaces (metaverses) and +robotics. In order for widespread adoption of such emerging applications, the +sensor setup used to capture the interactions needs to be inexpensive and +easy-to-use for non-expert users. I.e., interactions should be captured and +modeled by simple ego-centric sensors such as a combination of cameras and IMU +sensors, not relying on any external cameras or object trackers. Yet, to the +best of our knowledge, no work tackling the challenging problem of modeling +human-scene interactions via such an ego-centric sensor setup exists. This +paper closes this gap in the literature by developing a novel approach that +combines visual localization of humans in the scene with contact-based +reasoning about human-scene interactions from IMU data. Interestingly, we can +show that even without visual observations of the interactions, human-scene +contacts and interactions can be realistically predicted from human pose +sequences. Our method, iReplica (Interaction Replica), is an essential first +step towards the egocentric capture of human interactions and modeling of +dynamic scenes, which is required for future AR/VR applications in immersive +virtual universes and for training machines to behave like humans. Our code, +data and model are available on our project page at +http://virtualhumans.mpi-inf.mpg.de/ireplica/ + +
+
+ comment: International Conference on 3D Vision 2024 (3DV'24) +
+
+
+
+
+ + ♻ ☆ STREAM: Spatio-TempoRal Evaluation and Analysis Metric for Video + Generative Models ICLR 2024 + + +
+ Image generative models have made significant progress in generating +realistic and diverse images, supported by comprehensive guidance from various +evaluation metrics. However, current video generative models struggle to +generate even short video clips, with limited tools that provide insights for +improvements. Current video evaluation metrics are simple adaptations of image +metrics by switching the embeddings with video embedding networks, which may +underestimate the unique characteristics of video. Our analysis reveals that +the widely used Frechet Video Distance (FVD) has a stronger emphasis on the +spatial aspect than the temporal naturalness of video and is inherently +constrained by the input size of the embedding networks used, limiting it to 16 +frames. Additionally, it demonstrates considerable instability and diverges +from human evaluations. To address the limitations, we propose STREAM, a new +video evaluation metric uniquely designed to independently evaluate spatial and +temporal aspects. This feature allows comprehensive analysis and evaluation of +video generative models from various perspectives, unconstrained by video +length. We provide analytical and experimental evidence demonstrating that +STREAM provides an effective evaluation tool for both visual and temporal +quality of videos, offering insights into areas of improvement for video +generative models. To the best of our knowledge, STREAM is the first evaluation +metric that can separately assess the temporal and spatial aspects of videos. +Our code is available at https://github.com/pro2nit/STREAM. +
+
+ comment: Our work is accepted to ICLR 2024 +
+
+
+
+
+ + ♻ ☆ ChildCI Framework: Analysis of Motor and Cognitive Development in + Children-Computer Interaction for Age Detection + + +
+ This article presents a comprehensive analysis of the different tests +proposed in the recent ChildCI framework, proving its potential for generating +a better understanding of children's neuromotor and cognitive development over +time, as well as their possible application in other research areas such as +e-Health and e-Learning. In particular, we propose a set of over 100 global +features related to motor and cognitive aspects of the children's interaction +with mobile devices, some of them collected and adapted from the literature. + Furthermore, we analyse the robustness and discriminative power of the +proposed feature set, including experimental results for the task of children's +age group detection based on their motor and cognitive behaviours. Two +different scenarios are considered in this study: i) single-test scenario, and +ii) multiple-test scenario. Accuracies of over 93% are achieved using the +publicly available ChildCIdb_v1 database (over 400 children from 18 months to 8 +years old), proving the high correlation of children's age with the way they +interact with mobile devices. +
+
+ comment: 12 pages, 3 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ MDU-Net: Multi-scale Densely Connected U-Net for biomedical image + segmentation + + +
+ Biomedical image segmentation plays a central role in quantitative analysis, +clinical diagnosis, and medical intervention. In the light of fully +convolutional networks (FCN) and U-Net, deep convolutional networks (DCNNs) have +made significant contributions to biomedical image segmentation applications. +In this paper, we propose three different multi-scale dense connections (MDC) +for the encoder, the decoder of U-shaped architectures, and across them. Based +on three dense connections, we propose a multi-scale densely connected U-Net +(MDU-Net) for biomedical image segmentation. MDU-Net directly fuses the +neighboring feature maps with different scales from both higher layers and +lower layers to strengthen feature propagation in the current layer. +Multi-scale dense connections, which contain shorter connections between layers +close to the input and output, also make a much deeper U-Net possible. Besides, +we introduce quantization to alleviate the potential overfitting in dense +connections, and further improve the segmentation performance. We evaluate our +proposed model on the MICCAI 2015 Gland Segmentation (GlaS) dataset. The three +MDC improve U-Net performance by up to 1.8% on test A and 3.5% on test B in the +MICCAI Gland dataset. Meanwhile, the MDU-Net with quantization clearly +improves the segmentation performance of the original U-Net. +
+
+ comment: 10 pages, 5 figures, 6 tables, published in the Health Information + Science and Systems journal +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 119 + +
+
+
+ + ☆ Reconstruct before Query: Continual Missing Modality Learning with + Decomposed Prompt Collaboration + + +
+ Pre-trained large multi-modal models (LMMs) exploit fine-tuning to adapt to
+diverse user applications. Nevertheless, fine-tuning may face challenges due to
+deactivated sensors (e.g., cameras turned off for privacy or technical issues),
+yielding modality-incomplete data and leading to inconsistency between the
+training data and the data used for inference. Additionally, continuous
+training leads to catastrophic forgetting, diluting the knowledge in
+pre-trained LMMs. To overcome these challenges, we introduce a novel task,
+Continual Missing Modality Learning (CMML), to investigate how models can
+generalize when data of certain modalities is missing during continual
+fine-tuning. Our preliminary benchmarks reveal that existing methods suffer
+from a significant performance drop in CMML, even with the aid of advanced
+continual learning techniques. Therefore, we devise a framework termed
+Reconstruct before Query (RebQ). It decomposes prompts into modality-specific
+ones and breaks them into components stored in pools accessible via a key-query
+mechanism, which facilitates Parameter-Efficient Fine-Tuning and enhances
+knowledge transferability for subsequent tasks. Meanwhile, our RebQ leverages
+extensive multi-modal knowledge from pre-trained LMMs to reconstruct the data
+of the missing modality. Comprehensive experiments demonstrate that RebQ
+effectively reconstructs the missing modality information and retains
+pre-trained knowledge. Specifically, compared with the baseline, RebQ improves
+average precision from 20.00 to 50.92 and decreases average forgetting from
+75.95 to 8.56. Code and datasets are available at
+https://github.com/Tree-Shu-Zhao/RebQ.pytorch
+
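The key-query prompt-pool mechanism mentioned above can be illustrated with a small retrieval sketch in the spirit of L2P-style prompt learning; the pool size, shapes, and function names are assumptions for illustration and do not reproduce the RebQ code.

```python
# Hedged sketch of key-query prompt retrieval from a pool, assumed to be
# similar in spirit to the modality-specific pools described above.
# Pool size, shapes, and names are illustrative; this is not the RebQ code.
import numpy as np

rng = np.random.default_rng(0)
pool_size, prompt_len, dim, top_k = 10, 5, 64, 3
keys = rng.normal(size=(pool_size, dim))                 # learnable keys
prompts = rng.normal(size=(pool_size, prompt_len, dim))  # learnable prompt components

def retrieve_prompts(query):
    """Select the top-k prompt components by cosine similarity between query and keys."""
    q = query / np.linalg.norm(query)
    k = keys / np.linalg.norm(keys, axis=1, keepdims=True)
    idx = np.argsort(k @ q)[::-1][:top_k]
    return np.concatenate(prompts[idx], axis=0)          # (top_k * prompt_len, dim)

query_feature = rng.normal(size=dim)  # e.g., a pooled feature from a frozen pre-trained model
print(retrieve_prompts(query_feature).shape)
```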
+
+
+
+
+ + ☆ V2X-DGW: Domain Generalization for Multi-agent Perception under Adverse + Weather Conditions + + +
+ Current LiDAR-based Vehicle-to-Everything (V2X) multi-agent perception
+systems have shown significant success in 3D object detection. While these
+models perform well in the clean weather they were trained on, they struggle in
+unseen adverse weather conditions due to the real-world domain gap. In this
+paper, we propose a domain generalization approach, named V2X-DGW, for
+LiDAR-based 3D object detection on multi-agent perception systems under adverse
+weather conditions. Our research aims to ensure favorable multi-agent
+performance not only in clean weather but also in unseen adverse weather
+conditions, by learning only on clean weather data. To advance research in this
+area, we have simulated the impact of three prevalent adverse weather
+conditions on two widely-used multi-agent datasets, resulting in the creation
+of two novel benchmark datasets: OPV2V-w and V2XSet-w.
+ To this end, we first introduce the Adaptive Weather Augmentation (AWA) to
+mimic the unseen adverse weather conditions, and then propose two alignments
+for generalizable representation learning: Trust-region Weather-invariant
+Alignment (TWA) and Agent-aware Contrastive Alignment (ACA). Extensive
+experimental results demonstrate that our V2X-DGW achieves improvements under
+unseen adverse weather conditions.
+
+
+
+
+
+ + ☆ DynamicGlue: Epipolar and Time-Informed Data Association in Dynamic + Environments using Graph Neural Networks + + +
+ The assumption of a static environment is common in many geometric computer
+vision tasks like SLAM but limits their applicability in highly dynamic scenes.
+Since these tasks rely on identifying point correspondences between input
+images within the static part of the environment, we propose a graph neural
+network-based sparse feature matching network designed to perform robust
+matching under challenging conditions while excluding keypoints on moving
+objects. Like state-of-the-art feature-matching networks, we employ attentional
+aggregation over graph edges to enhance keypoint representations, but we
+augment the graph with epipolar and temporal information and vastly reduce the
+number of graph edges. Furthermore, we introduce a self-supervised training
+scheme to extract pseudo labels for image pairs in dynamic environments from
+exclusively unprocessed visual-inertial data. A series of experiments shows the
+superior performance of our network as it excludes keypoints on moving objects
+compared to state-of-the-art feature matching networks while still achieving
+similar results on conventional matching metrics. When integrated into a SLAM
+system, our network significantly improves performance, especially in highly
+dynamic scenes.
+
+
+
+
+
+ + ☆ 3DGS-ReLoc: 3D Gaussian Splatting for Map Representation and Visual + ReLocalization + + +
+ This paper presents a novel system designed for 3D mapping and visual +relocalization using 3D Gaussian Splatting. Our proposed method uses LiDAR and +camera data to create accurate and visually plausible representations of the +environment. By leveraging LiDAR data to initiate the training of the 3D +Gaussian Splatting map, our system constructs maps that are both detailed and +geometrically accurate. To mitigate excessive GPU memory usage and facilitate +rapid spatial queries, we employ a combination of a 2D voxel map and a KD-tree. +This preparation makes our method well-suited for visual localization tasks, +enabling efficient identification of correspondences between the query image +and the rendered image from the Gaussian Splatting map via normalized +cross-correlation (NCC). Additionally, we refine the camera pose of the query +image using feature-based matching and the Perspective-n-Point (PnP) technique. +The effectiveness, adaptability, and precision of our system are demonstrated +through extensive evaluation on the KITTI360 dataset. + +
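The normalized cross-correlation (NCC) matching step named above can be sketched in a few lines; the toy arrays below stand in for a query image and a rendering from the Gaussian Splatting map, and the snippet is not the authors' implementation.

```python
# Minimal sketch of zero-mean normalized cross-correlation (NCC) between a
# query image and a rendered image, the matching score named in the abstract.
# Illustrative only; not the 3DGS-ReLoc implementation.
import numpy as np

def ncc(a, b, eps=1e-8):
    """Zero-mean NCC between two equally sized image arrays, in [-1, 1]."""
    a = (a - a.mean()) / (a.std() + eps)
    b = (b - b.mean()) / (b.std() + eps)
    return float((a * b).mean())

rng = np.random.default_rng(0)
query = rng.random((120, 160))                             # stand-in query image
rendered = query + 0.05 * rng.standard_normal((120, 160))  # near-duplicate rendering
print(ncc(query, rendered))                                # close to 1 for well-aligned views
```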
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ Creating Seamless 3D Maps Using Radiance Fields + + +
+ It is desirable to create 3D object models and 3D maps from 2D input images +for applications such as navigation, virtual tourism, and urban planning. The +traditional methods of creating 3D maps, (such as photogrammetry), require a +large number of images and odometry. Additionally, traditional methods have +difficulty with reflective surfaces and specular reflections; windows and +chrome in the scene can be problematic. Google Road View is a familiar +application, which uses traditional methods to fuse a collection of 2D input +images into the illusion of a 3D map. However, Google Road View does not create +an actual 3D object model, only a collection of views. The objective of this +work is to create an actual 3D object model using updated techniques. Neural +Radiance Fields (NeRF[1]) has emerged as a potential solution, offering the +capability to produce more precise and intricate 3D maps. Gaussian Splatting[4] +is another contemporary technique. This investigation compares Neural Radiance +Fields to Gaussian Splatting, and describes some of their inner workings. Our +primary contribution is a method for improving the results of the 3D +reconstructed models. Our results indicate that Gaussian Splatting was superior +to the NeRF technique. + +
+
+ comment: 10 pages with figures +
+
+
+
+
+ + ☆ StainDiffuser: MultiTask Dual Diffusion Model for Virtual Staining + + +
+ Hematoxylin and Eosin (H&E) staining is the most commonly used stain for
+disease diagnosis and tumor recurrence tracking. Hematoxylin excels at
+highlighting nuclei, whereas eosin stains the cytoplasm. However, the H&E stain
+lacks the detail needed for differentiating different types of cells relevant
+to identifying the grade of the disease or response to specific treatment
+variations. Pathologists require special immunohistochemical (IHC) stains that
+highlight different cell types. These stains help in accurately identifying
+different regions of disease growth and their interactions with the cell's
+microenvironment. The advent of deep learning models has made Image-to-Image
+(I2I) translation a key research area, reducing the need for expensive physical
+staining processes. Pix2Pix and CycleGAN are still the most commonly used
+methods for virtual staining applications. However, both suffer from
+hallucinations or staining irregularities when the H&E stain has less
+discriminative information about the underlying cells that IHC needs to
+highlight (e.g., CD3 lymphocytes). Diffusion models are currently the
+state-of-the-art models for image generation and conditional generation tasks.
+However, they require extensive and diverse datasets (millions of samples) to
+converge, which is less feasible for virtual staining applications. Inspired by
+the success of multitask deep learning models for limited dataset sizes, we
+propose StainDiffuser, a novel multitask dual diffusion architecture for
+virtual staining that converges under a limited training budget. StainDiffuser
+trains two diffusion processes simultaneously: (a) generation of cell-specific
+IHC stain from H&E and (b) H&E-based cell segmentation using coarse
+segmentation only during training. Our results show that StainDiffuser produces
+high-quality results for easier (CK8/18, epithelial marker) and difficult
+stains (CD3, lymphocytes).
+
+
+
+
+
+ + ☆ Ensembling and Test Augmentation for Covid-19 Detection and Covid-19 + Domain Adaptation from 3D CT-Scans + + +
+ Since the emergence of Covid-19 in late 2019, medical image analysis using
+artificial intelligence (AI) has become a crucial research area, particularly
+with the utility of CT-scan imaging for disease diagnosis. This paper
+contributes to the 4th COV19D competition, focusing on the Covid-19 Detection
+and Covid-19 Domain Adaptation Challenges. Our approach centers on lung
+segmentation and Covid-19 infection segmentation employing the recent CNN-based
+segmentation architecture PDAtt-Unet, which simultaneously segments lung
+regions and infections. Departing from traditional methods, we concatenate the
+input slice (grayscale) with the segmented lung and infection, generating three
+input channels akin to color channels. Additionally, we employ three 3D CNN
+backbones, a customized Hybrid-DeCoVNet along with pretrained 3D-Resnet-18 and
+3D-Resnet-50 models, to train Covid-19 recognition for both challenges.
+Furthermore, we explore ensemble approaches and test augmentation to enhance
+performance. Comparison with baseline results underscores the substantial
+efficiency of our approach, with a significant margin in terms of F1-score
+(14%). This study advances the field by presenting a comprehensive methodology
+for accurate Covid-19 detection and adaptation, leveraging cutting-edge AI
+techniques in medical image analysis.
+
+
+
+
+
+ + ☆ Enhancing Bandwidth Efficiency for Video Motion Transfer Applications + using Deep Learning Based Keypoint Prediction + + +
+ We propose a novel deep learning-based prediction framework for enhanced
+bandwidth reduction in motion transfer enabled video applications such as video
+conferencing, virtual reality gaming and privacy preservation for patient
+health monitoring. To model complex motion, we use the First Order Motion Model
+(FOMM) that represents dynamic objects using learned keypoints along with their
+local affine transformations. Keypoints are extracted by a self-supervised
+keypoint detector and organized in a time series corresponding to the video
+frames. Prediction of keypoints, to enable transmission using lower frames per
+second on the source device, is performed using a Variational Recurrent Neural
+Network (VRNN). The predicted keypoints are then synthesized to video frames
+using an optical flow estimator and a generator network. The efficacy of
+leveraging keypoint-based representations in conjunction with VRNN-based
+prediction for both video animation and reconstruction is demonstrated on three
+diverse datasets. For real-time applications, our results show the
+effectiveness of our proposed architecture by enabling up to 2x additional
+bandwidth reduction over existing keypoint based video motion transfer
+frameworks without significantly compromising video quality.
+
+
+
+
+
+ + ☆ Domain-Guided Masked Autoencoders for Unique Player Identification + + +
+ Unique player identification is a fundamental module in vision-driven sports +analytics. Identifying players from broadcast videos can aid with various +downstream tasks such as player assessment, in-game analysis, and broadcast +production. However, automatic detection of jersey numbers using deep features +is challenging primarily due to: a) motion blur, b) low resolution video feed, +and c) occlusions. With their recent success in various vision tasks, masked +autoencoders (MAEs) have emerged as a superior alternative to conventional +feature extractors. However, most MAEs simply zero-out image patches either +randomly or focus on where to mask rather than how to mask. Motivated by human +vision, we devise a novel domain-guided masking policy for MAEs termed d-MAE to +facilitate robust feature extraction in the presence of motion blur for player +identification. We further introduce a new spatio-temporal network leveraging +our novel d-MAE for unique player identification. We conduct experiments on +three large-scale sports datasets, including a curated baseball dataset, the +SoccerNet dataset, and an in-house ice hockey dataset. We preprocess the +datasets using an upgraded keyframe identification (KfID) module by focusing on +frames containing jersey numbers. Additionally, we propose a keyframe-fusion +technique to augment keyframes, preserving spatial and temporal context. Our +spatio-temporal network showcases significant improvements, surpassing the +current state-of-the-art by 8.58%, 4.29%, and 1.20% in the test set accuracies, +respectively. Rigorous ablations highlight the effectiveness of our +domain-guided masking approach and the refined KfID module, resulting in +performance enhancements of 1.48% and 1.84% respectively, compared to original +architectures. + +
+
+ comment: Submitted to 21st International Conference on Robots and Vision + (CRV'24), Guelph, Ontario, Canada +
+
+
+
+
+ + ☆ GeoGaussian: Geometry-aware Gaussian Splatting for Scene Rendering + + +
+ During the Gaussian Splatting optimization process, the scene's geometry can +gradually deteriorate if its structure is not deliberately preserved, +especially in non-textured regions such as walls, ceilings, and furniture +surfaces. This degradation significantly affects the rendering quality of novel +views that deviate significantly from the viewpoints in the training data. To +mitigate this issue, we propose a novel approach called GeoGaussian. Based on +the smoothly connected areas observed from point clouds, this method introduces +a novel pipeline to initialize thin Gaussians aligned with the surfaces, where +the characteristic can be transferred to new generations through a carefully +designed densification strategy. Finally, the pipeline ensures that the scene's +geometry and texture are maintained through constrained optimization processes +with explicit geometry constraints. Benefiting from the proposed architecture, +the generative ability of 3D Gaussians is enhanced, especially in structured +regions. Our proposed pipeline achieves state-of-the-art performance in novel +view synthesis and geometric reconstruction, as evaluated qualitatively and +quantitatively on public datasets. + +
+
+
+
+
+ + ☆ Few-Shot VQA with Frozen LLMs: A Tale of Two Approaches + + +
+ Two approaches have emerged to input images into large language models +(LLMs). The first is to caption images into natural language. The second is to +map image feature embeddings into the domain of the LLM and pass the mapped +embeddings directly to the LLM. The majority of recent few-shot multimodal work +reports performance using architectures that employ variations of one of these +two approaches. But they overlook an important comparison between them. We +design a controlled and focused experiment to compare these two approaches to +few-shot visual question answering (VQA) with LLMs. Our findings indicate that +for Flan-T5 XL, a 3B parameter LLM, connecting visual embeddings directly to +the LLM embedding space does not guarantee improved performance over using +image captions. In the zero-shot regime, we find using textual image captions +is better. In the few-shot regimes, how the in-context examples are selected +determines which is better. + +
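The second approach described above, mapping image feature embeddings into the LLM's input space, can be sketched as a learned linear projection whose output is prepended to the text embeddings; the dimensions, token counts, and names below are assumptions chosen to keep the toy example small, not any particular model's configuration.

```python
# Hedged sketch of projecting frozen image features into an LLM's
# token-embedding space so they can be prepended to text embeddings.
# Dimensions are intentionally small for the toy example.
import numpy as np

rng = np.random.default_rng(0)
d_vision, d_llm, n_visual_tokens = 768, 512, 8

W = rng.normal(scale=0.02, size=(d_vision, n_visual_tokens * d_llm))  # trainable projection

def project_image(image_features):
    """image_features: (d_vision,) pooled visual embedding -> (n_visual_tokens, d_llm)."""
    return (image_features @ W).reshape(n_visual_tokens, d_llm)

visual_tokens = project_image(rng.normal(size=d_vision))
text_tokens = rng.normal(size=(12, d_llm))               # embedded question tokens (stand-in)
llm_input = np.concatenate([visual_tokens, text_tokens], axis=0)
print(llm_input.shape)                                   # visual tokens followed by text tokens
```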
+
+
+
+
+ + ☆ A Dual-Augmentor Framework for Domain Generalization in 3D Human Pose + Estimation CVPR 2024 + + +
+ 3D human pose data collected in controlled laboratory settings present +challenges for pose estimators that generalize across diverse scenarios. To +address this, domain generalization is employed. Current methodologies in +domain generalization for 3D human pose estimation typically utilize +adversarial training to generate synthetic poses for training. Nonetheless, +these approaches exhibit several limitations. First, the lack of prior +information about the target domain complicates the application of suitable +augmentation through a single pose augmentor, affecting generalization on +target domains. Moreover, adversarial training's discriminator tends to enforce +similarity between source and synthesized poses, impeding the exploration of +out-of-source distributions. Furthermore, the pose estimator's optimization is +not exposed to domain shifts, limiting its overall generalization ability. + To address these limitations, we propose a novel framework featuring two pose +augmentors: the weak and the strong augmentors. Our framework employs +differential strategies for generation and discrimination processes, +facilitating the preservation of knowledge related to source poses and the +exploration of out-of-source distributions without prior information about +target poses. Besides, we leverage meta-optimization to simulate domain shifts +in the optimization process of the pose estimator, thereby improving its +generalization ability. Our proposed approach significantly outperforms +existing methods, as demonstrated through comprehensive experiments on various +benchmark datasets. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ SQ-LLaVA: Self-Questioning for Large Vision-Language Assistant + + +
+ Recent advancements in vision-language models have shown notable
+generalization in vision-language tasks after visual instruction tuning.
+However, bridging the gap between the pre-trained vision encoder and the large
+language model becomes the whole network's bottleneck. To improve
+cross-modality alignment, existing works usually consider more visual
+instruction data covering a broader range of vision tasks to fine-tune the
+model for question answering, which is costly to obtain. However, the image
+contains rich contextual information that has been largely under-explored. This
+paper first attempts to harness this overlooked context within visual
+instruction data, training the model in a self-supervised manner to learn how
+to ask high-quality questions. In this way, we introduce a novel framework
+named SQ-LLaVA: Self-Questioning for Large Vision-Language Assistant. SQ-LLaVA
+exhibits proficiency in generating flexible and meaningful image-related
+questions while analyzing the visual clues and prior language knowledge,
+signifying an advanced level of generalized visual understanding. Moreover,
+fine-tuning SQ-LLaVA on higher-quality instruction data shows a consistent
+performance improvement compared with traditional visual-instruction tuning
+methods. This improvement highlights the efficacy of self-questioning
+techniques in achieving a deeper and more nuanced comprehension of visual
+content across various contexts.
+
+
+
+
+
+ + ☆ Order-One Rolling Shutter Cameras + + +
+ Rolling shutter (RS) cameras dominate consumer and smartphone markets. +Several methods for computing the absolute pose of RS cameras have appeared in +the last 20 years, but the relative pose problem has not been fully solved yet. +We provide a unified theory for the important class of order-one rolling +shutter (RS$_1$) cameras. These cameras generalize the perspective projection +to RS cameras, projecting a generic space point to exactly one image point via +a rational map. We introduce a new back-projection RS camera model, +characterize RS$_1$ cameras, construct explicit parameterizations of such +cameras, and determine the image of a space line. We classify all minimal +problems for solving the relative camera pose problem with linear RS$_1$ +cameras and discover new practical cases. Finally, we show how the theory can +be used to explain RS models previously used for absolute pose computation. + +
+
+ comment: 36 pages, 6 figures, 3 ancillary files +
+
+
+
+
+ + ☆ Advanced Knowledge Extraction of Physical Design Drawings, Translation + and conversion to CAD formats using Deep Learning + + +
+ The maintenance, archiving and usage of design drawings in physical form is
+cumbersome across industries over long periods, and it is hard to extract
+information by simply scanning drawing sheets. Converting them to digital
+formats such as Computer-Aided Design (CAD), together with the needed knowledge
+extraction, can solve this problem. The conversion of these machine drawings to
+their digital form is a crucial challenge which requires advanced techniques.
+This research proposes an innovative methodology utilizing Deep Learning
+methods. The approach employs object detection models, such as YOLOv7 and
+Faster R-CNN, to detect the physical drawing objects present in the images,
+followed by edge detection algorithms such as the Canny filter to extract and
+refine the identified lines from the drawing region, and curve detection
+techniques to detect circles. Ornaments (complex shapes) within the drawings
+are also extracted. To ensure comprehensive conversion, an Optical Character
+Recognition (OCR) tool is integrated to identify and extract the text elements
+from the drawings. The extracted data, which includes the lines, shapes and
+text, is consolidated and stored in a structured comma-separated values (.csv)
+file format. The accuracy and the efficiency of the conversion are evaluated.
+Through this, conversion can be automated to help organizations enhance their
+productivity, facilitate seamless collaboration and preserve valuable design
+information in an easily accessible digital format. Overall, this study
+contributes to the advancement of CAD conversion, providing accurate results
+from the translation process. Future research can focus on handling diverse
+drawing types and on enhancing the accuracy of shape and line detection and
+extraction.
+
+
+
+
+
+ + ☆ Fast Personalized Text-to-Image Syntheses With Attention Injection + + +
+ Currently, personalized image generation methods mostly require considerable +time to finetune and often overfit the concept resulting in generated images +that are similar to custom concepts but difficult to edit by prompts. We +propose an effective and fast approach that could balance the text-image +consistency and identity consistency of the generated image and reference +image. Our method can generate personalized images without any fine-tuning +while maintaining the inherent text-to-image generation ability of diffusion +models. Given a prompt and a reference image, we merge the custom concept into +generated images by manipulating cross-attention and self-attention layers of +the original diffusion model to generate personalized images that match the +text description. Comprehensive experiments highlight the superiority of our +method. + +
+
+
+
+
+ + ☆ BrightDreamer: Generic 3D Gaussian Generative Framework for Fast + Text-to-3D Synthesis + + +
+ Text-to-3D synthesis has recently seen intriguing advances by combining the +text-to-image models with 3D representation methods, e.g., Gaussian Splatting +(GS), via Score Distillation Sampling (SDS). However, a hurdle of existing +methods is the low efficiency, per-prompt optimization for a single 3D object. +Therefore, it is imperative for a paradigm shift from per-prompt optimization +to one-stage generation for any unseen text prompts, which yet remains +challenging. A hurdle is how to directly generate a set of millions of 3D +Gaussians to represent a 3D object. This paper presents BrightDreamer, an +end-to-end single-stage approach that can achieve generalizable and fast (77 +ms) text-to-3D generation. Our key idea is to formulate the generation process +as estimating the 3D deformation from an anchor shape with predefined +positions. For this, we first propose a Text-guided Shape Deformation (TSD) +network to predict the deformed shape and its new positions, used as the +centers (one attribute) of 3D Gaussians. To estimate the other four attributes +(i.e., scaling, rotation, opacity, and SH coefficient), we then design a novel +Text-guided Triplane Generator (TTG) to generate a triplane representation for +a 3D object. The center of each Gaussian enables us to transform the triplane +feature into the four attributes. The generated 3D Gaussians can be finally +rendered at 705 frames per second. Extensive experiments demonstrate the +superiority of our method over existing methods. Also, BrightDreamer possesses +a strong semantic understanding capability even for complex text prompts. The +project code is available at https://vlislab22.github.io/BrightDreamer. + +
+
+
+
+
+ + ☆ Bilateral Propagation Network for Depth Completion CVPR 2024 + + +
+ Depth completion aims to derive a dense depth map from sparse depth
+measurements with a synchronized color image. Current state-of-the-art (SOTA)
+methods are predominantly propagation-based, which work as an iterative
+refinement on the initial estimated dense depth. However, the initial depth
+estimations mostly result from direct applications of convolutional layers on
+the sparse depth map. In this paper, we present a Bilateral Propagation Network
+(BP-Net) that propagates depth at the earliest stage to avoid directly
+convolving on sparse data. Specifically, our approach propagates the target
+depth from nearby depth measurements via a non-linear model, whose coefficients
+are generated through a multi-layer perceptron conditioned on both the
+radiometric difference and the spatial distance. By integrating bilateral
+propagation with multi-modal fusion and depth refinement in a multi-scale
+framework, our BP-Net demonstrates outstanding performance on both indoor and
+outdoor scenes. It achieves SOTA on the NYUv2 dataset and ranks 1st on the
+KITTI depth completion benchmark at the time of submission. Experimental
+results not only show the effectiveness of bilateral propagation but also
+emphasize the significance of early-stage propagation in contrast to the
+refinement stage. Our code and trained models will be available on the project
+page.
+
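To make the propagation idea concrete, the sketch below fills each pixel from nearby sparse measurements with weights driven by spatial distance and radiometric (intensity) difference; a fixed Gaussian kernel stands in for BP-Net's learned MLP coefficients, so this is only an assumption-laden illustration of the idea, not the paper's method.

```python
# Hedged sketch of bilateral depth propagation: each target pixel's depth is a
# weighted combination of nearby sparse measurements, weighted by spatial
# distance and color difference. A fixed Gaussian kernel replaces the MLP that
# BP-Net learns; this is illustrative only.
import numpy as np

def propagate_depth(sparse_depth, image, radius=5, sigma_s=3.0, sigma_r=0.1):
    H, W = sparse_depth.shape
    out = np.zeros_like(sparse_depth)           # pixels with no nearby measurement stay 0
    ys, xs = np.nonzero(sparse_depth)           # coordinates of valid sparse measurements
    for i in range(H):
        for j in range(W):
            m = (np.abs(ys - i) <= radius) & (np.abs(xs - j) <= radius)
            if not m.any():
                continue
            yy, xx = ys[m], xs[m]
            w_s = np.exp(-((yy - i) ** 2 + (xx - j) ** 2) / (2 * sigma_s ** 2))
            w_r = np.exp(-((image[yy, xx] - image[i, j]) ** 2) / (2 * sigma_r ** 2))
            w = w_s * w_r
            out[i, j] = np.sum(w * sparse_depth[yy, xx]) / (np.sum(w) + 1e-8)
    return out

rng = np.random.default_rng(0)
img = rng.random((32, 32))                      # stand-in grayscale guidance image
sd = np.zeros((32, 32))
sd[rng.integers(0, 32, 50), rng.integers(0, 32, 50)] = rng.uniform(1, 10, 50)
dense = propagate_depth(sd, img)
```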
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Stylized Face Sketch Extraction via Generative Prior with Limited Data + + +
+ Facial sketches are both a concise way of showing the identity of a person +and a means to express artistic intention. While a few techniques have recently +emerged that allow sketches to be extracted in different styles, they typically +rely on a large amount of data that is difficult to obtain. Here, we propose +StyleSketch, a method for extracting high-resolution stylized sketches from a +face image. Using the rich semantics of the deep features from a pretrained +StyleGAN, we are able to train a sketch generator with 16 pairs of face and the +corresponding sketch images. The sketch generator utilizes part-based losses +with two-stage learning for fast convergence during training for high-quality +sketch extraction. Through a set of comparisons, we show that StyleSketch +outperforms existing state-of-the-art sketch extraction methods and few-shot +image adaptation methods for the task of extracting high-resolution abstract +face sketches. We further demonstrate the versatility of StyleSketch by +extending its use to other domains and explore the possibility of semantic +editing. The project page can be found in +https://kwanyun.github.io/stylesketch_project. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Uncertainty-Aware Pseudo-Label Filtering for Source-Free Unsupervised + Domain Adaptation + + +
+ Source-free unsupervised domain adaptation (SFUDA) aims to enable the
+utilization of a pre-trained source model in an unlabeled target domain without
+access to source data. Self-training is a way to solve SFUDA, where confident
+target samples are iteratively selected as pseudo-labeled samples to guide
+target model learning. However, prior heuristic noisy pseudo-label filtering
+methods all involve introducing extra models, which are sensitive to model
+assumptions and may introduce additional errors or mislabeling. In this work,
+we propose a method called Uncertainty-aware Pseudo-label-filtering Adaptation
+(UPA) to efficiently address this issue in a coarse-to-fine manner.
+Specifically, we first introduce a sample selection module named Adaptive
+Pseudo-label Selection (APS), which is responsible for filtering noisy pseudo
+labels. APS estimates sample uncertainty with a simple method that aggregates
+knowledge from neighboring samples, and confident samples are selected as clean
+pseudo-labeled samples. Additionally, we incorporate Class-Aware Contrastive
+Learning (CACL) to mitigate the memorization of pseudo-label noise by learning
+robust pair-wise representations supervised by pseudo labels. Through extensive
+experiments conducted on three widely used benchmarks, we demonstrate that our
+proposed method achieves competitive performance on par with state-of-the-art
+SFUDA methods. Code is available at https://github.com/chenxi52/UPA.
+
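One simple way to realize the neighbor-based uncertainty estimate described above is to keep only samples whose nearest neighbours in feature space agree with their pseudo-label; the snippet below is a hedged sketch of that idea with illustrative thresholds, not the UPA/APS implementation.

```python
# Hedged sketch of selecting "clean" pseudo-labels via agreement with nearest
# neighbours in feature space. Thresholds and names are illustrative only.
import numpy as np

def select_confident(features, pseudo_labels, k=5, agree_ratio=0.8):
    """Keep indices whose k nearest neighbours mostly share the same pseudo-label."""
    f = features / np.linalg.norm(features, axis=1, keepdims=True)
    sim = f @ f.T                                      # cosine similarity matrix
    np.fill_diagonal(sim, -np.inf)                     # exclude self-matches
    nn_idx = np.argsort(-sim, axis=1)[:, :k]
    agree = (pseudo_labels[nn_idx] == pseudo_labels[:, None]).mean(axis=1)
    return np.nonzero(agree >= agree_ratio)[0]

rng = np.random.default_rng(0)
feats = rng.normal(size=(100, 32))                     # stand-in target features
labels = rng.integers(0, 5, size=100)                  # stand-in pseudo-labels
clean_idx = select_confident(feats, labels)
print(len(clean_idx), "samples kept as clean pseudo-labeled")
```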
+
+ comment: Neurocomputing 2024 +
+
+
+
+
+ + ☆ NeoNeXt: Novel neural network operator and architecture based on the + patch-wise matrix multiplications + + +
+ Most computer vision architectures today are built upon well-known foundation
+operations: fully-connected layers, convolutions and multi-head self-attention
+blocks. In this paper we propose a novel foundation operation - NeoCell - which
+learns matrix patterns and performs patch-wise matrix multiplications with the
+input data. The main advantages of the proposed operator are (1) simple
+implementation without the need for operations like im2col, (2) low
+computational complexity (especially for large matrices) and (3) simple and
+flexible implementation of up-/down-sampling. We validate the NeoNeXt family of
+models based on this operation on the ImageNet-1K classification task and show
+that they achieve competitive quality.
+
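The core operation, multiplying each image patch by learned matrices, can be sketched directly; the patch size, left/right factor layout, and function name below are assumptions for illustration and may not match the actual NeoCell design.

```python
# Hedged sketch of a patch-wise matrix-multiplication operator: the input is
# split into non-overlapping patches and each patch is multiplied by learned
# matrices from the left and right. Illustrative of the idea only.
import numpy as np

def patchwise_matmul(x, left, right, p=4):
    """x: (H, W) feature map; left, right: (p, p) learned matrices; returns (H, W)."""
    H, W = x.shape
    out = np.empty_like(x)
    for i in range(0, H, p):
        for j in range(0, W, p):
            out[i:i+p, j:j+p] = left @ x[i:i+p, j:j+p] @ right
    return out

rng = np.random.default_rng(0)
x = rng.normal(size=(16, 16))
left_mat, right_mat = rng.normal(size=(4, 4)), rng.normal(size=(4, 4))
y = patchwise_matmul(x, left_mat, right_mat)
print(y.shape)
```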
+
+
+
+
+ + ☆ YOLOv9 for Fracture Detection in Pediatric Wrist Trauma X-ray Images + + +
+ The introduction of YOLOv9, the latest version of the You Only Look Once +(YOLO) series, has led to its widespread adoption across various scenarios. +This paper is the first to apply the YOLOv9 algorithm model to the fracture +detection task as computer-assisted diagnosis (CAD) to help radiologists and +surgeons to interpret X-ray images. Specifically, this paper trained the model +on the GRAZPEDWRI-DX dataset and extended the training set using data +augmentation techniques to improve the model performance. Experimental results +demonstrate that compared to the mAP 50-95 of the current state-of-the-art +(SOTA) model, the YOLOv9 model increased the value from 42.16% to 43.73%, with +an improvement of 3.7%. The implementation code is publicly available at +https://github.com/RuiyangJu/YOLOv9-Fracture-Detection. + +
+
+
+
+
+ + ☆ Compact 3D Gaussian Splatting For Dense Visual SLAM + + +
+ Recent work has shown that 3D Gaussian-based SLAM enables high-quality
+reconstruction, accurate pose estimation, and real-time rendering of scenes.
+However, these approaches are built on a tremendous number of redundant 3D
+Gaussian ellipsoids, leading to high memory and storage costs, and slow
+training speed. To address this limitation, we propose a compact 3D Gaussian
+Splatting SLAM system that reduces the number and the parameter size of
+Gaussian ellipsoids. A sliding window-based masking strategy is first proposed
+to reduce the redundant ellipsoids. Then we observe that the covariance
+matrices (geometry) of most 3D Gaussian ellipsoids are extremely similar, which
+motivates a novel geometry codebook to compress the 3D Gaussian geometric
+attributes, i.e., the parameters. Robust and accurate pose estimation is
+achieved by a global bundle adjustment method with reprojection loss. Extensive
+experiments demonstrate that our method achieves faster training and rendering
+speed while maintaining the state-of-the-art (SOTA) quality of the scene
+representation.
+
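The geometry codebook idea, storing a small set of shared geometric parameters plus a per-Gaussian index, can be sketched with ordinary vector quantization; the parameter layout and codebook size below are assumptions, and the snippet is not the authors' compression scheme.

```python
# Hedged sketch of a geometry codebook: cluster the geometric parameters of
# many Gaussians and keep only a small codebook plus one index per Gaussian.
# Parameter layout (3 scale + 3 rotation values) is an assumption.
import numpy as np
from scipy.cluster.vq import kmeans2

rng = np.random.default_rng(0)
geom = rng.normal(size=(10000, 6))                    # stand-in geometric attributes per Gaussian
codebook, codes = kmeans2(geom, 64, minit='points')   # 64-entry codebook, one index per Gaussian
reconstructed = codebook[codes]                       # decoded (approximate) geometry

# Rough storage comparison: full parameters vs. codebook + 8-bit indices
print(geom.nbytes, codebook.nbytes + codes.astype(np.uint8).nbytes)
```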
+
+
+
+
+ + ☆ FORCE: Dataset and Method for Intuitive Physics Guided Human-object + Interaction + + +
+ Interactions between humans and objects are influenced not only by the
+object's pose and shape, but also by physical attributes such as object mass
+and surface friction. They introduce important motion nuances that are
+essential for diversity and realism. Despite advancements in recent
+kinematics-based methods, this aspect has been overlooked. Generating nuanced
+human motion presents two challenges. First, it is non-trivial to learn from
+multi-modal human and object information derived from both the physical and
+non-physical attributes. Second, there exists no dataset capturing nuanced
+human interactions with objects of varying physical properties, hampering model
+development. This work addresses the gap by introducing the FORCE model, a
+kinematic approach for synthesizing diverse, nuanced human-object interactions
+by modeling physical attributes. Our key insight is that human motion is
+dictated by the interrelation between the force exerted by the human and the
+perceived resistance. Guided by a novel intuitive physics encoding, the model
+captures the interplay between human force and resistance. Experiments also
+demonstrate that incorporating human force facilitates learning multi-class
+motion. Accompanying our model, we contribute the FORCE dataset. It features
+diverse, differently styled motions arising from interactions with varying
+resistances.
+
+
+ comment: 24 pages, 9 figures +
+
+
+
+
+ + ☆ Universal Semi-Supervised Domain Adaptation by Mitigating Common-Class + Bias CVPR 2024 + + +
+ Domain adaptation is a critical task in machine learning that aims to improve +model performance on a target domain by leveraging knowledge from a related +source domain. In this work, we introduce Universal Semi-Supervised Domain +Adaptation (UniSSDA), a practical yet challenging setting where the target +domain is partially labeled, and the source and target label space may not +strictly match. UniSSDA is at the intersection of Universal Domain Adaptation +(UniDA) and Semi-Supervised Domain Adaptation (SSDA): the UniDA setting does +not allow for fine-grained categorization of target private classes not +represented in the source domain, while SSDA focuses on the restricted +closed-set setting where source and target label spaces match exactly. Existing +UniDA and SSDA methods are susceptible to common-class bias in UniSSDA +settings, where models overfit to data distributions of classes common to both +domains at the expense of private classes. We propose a new prior-guided +pseudo-label refinement strategy to reduce the reinforcement of common-class +bias due to pseudo-labeling, a common label propagation strategy in domain +adaptation. We demonstrate the effectiveness of the proposed strategy on +benchmark datasets Office-Home, DomainNet, and VisDA. The proposed strategy +attains the best performance across UniSSDA adaptation settings and establishes +a new baseline for UniSSDA. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ STAIR: Semantic-Targeted Active Implicit Reconstruction + + +
+ Many autonomous robotic applications require object-level understanding when +deployed. Actively reconstructing objects of interest, i.e. objects with +specific semantic meanings, is therefore relevant for a robot to perform +downstream tasks in an initially unknown environment. In this work, we propose +a novel framework for semantic-targeted active reconstruction using posed RGB-D +measurements and 2D semantic labels as input. The key components of our +framework are a semantic implicit neural representation and a compatible +planning utility function based on semantic rendering and uncertainty +estimation, enabling adaptive view planning to target objects of interest. Our +planning approach achieves better reconstruction performance in terms of mesh +and novel view rendering quality compared to implicit reconstruction baselines +that do not consider semantics for view planning. Our framework further +outperforms a state-of-the-art semantic-targeted active reconstruction pipeline +based on explicit maps, justifying our choice of utilising implicit neural +representations to tackle semantic-targeted active reconstruction problems. + +
+
+
+
+
+ + ☆ Simple 2D Convolutional Neural Network-based Approach for COVID-19 + Detection + + +
+ This study explores the use of deep learning techniques for analyzing lung +Computed Tomography (CT) images. Classic deep learning approaches face +challenges with varying slice counts and resolutions in CT images, a diversity +arising from the utilization of assorted scanning equipment. Typically, +predictions are made on single slices which are then combined for a +comprehensive outcome. Yet, this method does not incorporate learning features +specific to each slice, leading to a compromise in effectiveness. To address +these challenges, we propose an advanced Spatial-Slice Feature Learning +(SSFL++) framework specifically tailored for CT scans. It aims to filter out +out-of-distribution (OOD) data within the entire CT scan, allowing us to select +essential spatial-slice features for analysis by reducing data redundancy by +70\%. Additionally, we introduce a Kernel-Density-based slice Sampling (KDS) +method to enhance stability during training and inference phases, thereby +accelerating convergence and enhancing overall performance. Remarkably, our +experiments reveal that our model achieves promising results with a simple +EfficientNet-2D (E2D) model. The effectiveness of our approach is confirmed on +the COVID-19-CT-DB datasets provided by the DEF-AI-MIA workshop. + +
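The Kernel-Density-based slice Sampling (KDS) step named above lends itself to a small illustration: fit a kernel density over a per-slice relevance score and sample slice indices in proportion to the smoothed density. This is a loose reading of the idea with illustrative inputs, not the paper's KDS module.

```python
# Loose, illustrative sketch of kernel-density-based slice sampling: smooth a
# per-slice relevance profile with a KDE and sample slices proportionally.
# Not the paper's KDS implementation.
import numpy as np
from scipy.stats import gaussian_kde

def sample_slices(slice_scores, n_samples=8, seed=0):
    idx = np.arange(len(slice_scores))
    kde = gaussian_kde(idx, weights=slice_scores)      # smooth the score profile
    density = kde(idx)
    p = density / density.sum()
    rng = np.random.default_rng(seed)
    return np.sort(rng.choice(idx, size=n_samples, replace=False, p=p))

# Stand-in relevance per slice, e.g., how much lung each CT slice covers
scores = np.exp(-0.5 * ((np.arange(64) - 32) / 8.0) ** 2)
print(sample_slices(scores))
```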
+
+
+
+
+ + ☆ Concatenate, Fine-tuning, Re-training: A SAM-enabled Framework for + Semi-supervised 3D Medical Image Segmentation + + +
+ Segment Anything Model (SAM) fine-tuning has shown remarkable performance in
+medical image segmentation in a fully supervised manner, but it requires
+precise annotations. To reduce the annotation cost and maintain satisfactory
+performance, in this work, we leverage the capabilities of SAM for establishing
+semi-supervised medical image segmentation models. Rethinking the requirements
+of effectiveness, efficiency, and compatibility, we propose a three-stage
+framework, i.e., Concatenate, Fine-tuning, and Re-training (CFR). Current
+fine-tuning approaches mostly involve 2D slice-wise fine-tuning that disregards
+the contextual information between adjacent slices. Our concatenation strategy
+mitigates the mismatch between natural and 3D medical images. The concatenated
+images are then used for fine-tuning SAM, providing robust initial
+pseudo-labels. Afterwards, we train a 3D semi-supervised segmentation model
+while maintaining the same parameter size as a conventional segmenter such as
+V-Net. Our CFR framework is plug-and-play, and easily compatible with various
+popular semi-supervised methods. Extensive experiments validate that our CFR
+achieves significant improvements under both moderate annotation and scarce
+annotation across four datasets. In particular, the CFR framework improves the
+Dice score of Mean Teacher from 29.68% to 74.40% with only one labeled sample
+of the LA dataset.
+
+
+
+
+
+ + ☆ SpikeNeRF: Learning Neural Radiance Fields from Continuous Spike Stream CVPR 2024 + + +
+ Spike cameras, leveraging spike-based integration sampling and high temporal +resolution, offer distinct advantages over standard cameras. However, existing +approaches reliant on spike cameras often assume optimal illumination, a +condition frequently unmet in real-world scenarios. To address this, we +introduce SpikeNeRF, the first work that derives a NeRF-based volumetric scene +representation from spike camera data. Our approach leverages NeRF's multi-view +consistency to establish robust self-supervision, effectively eliminating +erroneous measurements and uncovering coherent structures within exceedingly +noisy input amidst diverse real-world illumination scenarios. The framework +comprises two core elements: a spike generation model incorporating an +integrate-and-fire neuron layer and parameters accounting for non-idealities, +such as threshold variation, and a spike rendering loss capable of generalizing +across varying illumination conditions. We describe how to effectively optimize +neural radiance fields to render photorealistic novel views from the novel +continuous spike stream, demonstrating advantages over other vision sensors in +certain scenes. Empirical evaluations conducted on both real and novel +realistically simulated sequences affirm the efficacy of our methodology. The +dataset and source code are released at +https://github.com/BIT-Vision/SpikeNeRF. + +
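The spike generation model described above builds on an integrate-and-fire mechanism; the snippet below is a minimal sketch of that forward model (accumulate intensity, fire when a threshold is crossed, reset by subtraction) and omits the non-idealities such as threshold variation that SpikeNeRF explicitly models.

```python
# Minimal sketch of an integrate-and-fire spike generation model: radiance is
# accumulated over time and a spike is emitted when the accumulator crosses a
# threshold. Illustrative only; real spike cameras and SpikeNeRF also model
# non-idealities such as threshold variation.
import numpy as np

def integrate_and_fire(intensity, n_steps=100, theta=1.0):
    """intensity: (H, W) scene radiance; returns binary spikes of shape (n_steps, H, W)."""
    acc = np.zeros_like(intensity)
    spikes = np.zeros((n_steps,) + intensity.shape, dtype=np.uint8)
    for t in range(n_steps):
        acc += intensity
        fired = acc >= theta
        spikes[t][fired] = 1
        acc[fired] -= theta            # reset by subtraction after firing
    return spikes

rng = np.random.default_rng(0)
img = rng.uniform(0.0, 0.2, size=(8, 8))
s = integrate_and_fire(img)
print(s.mean(axis=0))                  # spike rate approximates the input intensity
```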
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ CPA-Enhancer: Chain-of-Thought Prompted Adaptive Enhancer for Object + Detection under Unknown Degradations + + +
+ Object detection methods under known single degradations have been
+extensively investigated. However, existing approaches require prior knowledge
+of the degradation type and train a separate model for each, limiting their
+practical applications in unpredictable environments. To address this
+challenge, we propose a chain-of-thought (CoT) prompted adaptive enhancer,
+CPA-Enhancer, for object detection under unknown degradations. Specifically,
+CPA-Enhancer progressively adapts its enhancement strategy under the
+step-by-step guidance of CoT prompts that encode degradation-related
+information. To the best of our knowledge, it is the first work to exploit CoT
+prompting for object detection tasks. Overall, CPA-Enhancer is a plug-and-play
+enhancement model that can be integrated into any generic detector to achieve
+substantial gains on degraded images, without prior knowledge of the
+degradation type. Experimental results demonstrate that CPA-Enhancer not only
+sets the new state of the art for object detection but also boosts the
+performance of other downstream vision tasks under unknown degradations.
+
+
+
+
+
+ + ☆ RCdpia: A Renal Carcinoma Digital Pathology Image Annotation dataset + based on pathologists + + +
+ The annotation of digital pathology slide data for renal cell carcinoma is of
+paramount importance for correct diagnosis by artificial intelligence models,
+owing to the heterogeneous nature of the tumor. This process not only
+facilitates a deeper understanding of renal cell cancer heterogeneity but also
+aims to minimize noise in the data for more accurate studies. To enhance the
+applicability of the data, two pathologists were enlisted to meticulously
+curate, screen, and label a kidney cancer pathology image dataset from The
+Cancer Genome Atlas Program (TCGA) database. Subsequently, a ResNet model was
+developed to validate the annotated dataset against an additional dataset from
+the First Affiliated Hospital of Zhejiang University. Based on these results,
+we compiled the TCGA digital pathology dataset with independent labeling of
+tumor regions and adjacent areas (RCdpia), which includes 109 cases of kidney
+chromophobe cell carcinoma, 486 cases of kidney clear cell carcinoma, and 292
+cases of kidney papillary cell carcinoma. This dataset is now publicly
+accessible at http://39.171.241.18:8888/RCdpia/. Furthermore, model analysis
+has revealed significant discrepancies in predictive outcomes when applying the
+same model to datasets from different centers. Leveraging the RCdpia, we can
+now develop more precise digital pathology artificial intelligence models for
+tasks such as normalization, classification, and segmentation. These
+advancements underscore the potential for more nuanced and accurate AI
+applications in the field of digital pathology.
+
+
+ comment: 8 pages, 3 figures, 1 table +
+
+
+
+
+ + ☆ THOR: Text to Human-Object Interaction Diffusion via Relation + Intervention + + +
+ This paper addresses new methodologies to deal with the challenging task of
+generating dynamic Human-Object Interactions from textual descriptions
+(Text2HOI). While most existing works assume interactions with limited body
+parts or static objects, our task involves addressing the variation in human
+motion, the diversity of object shapes, and the semantic vagueness of object
+motion simultaneously. To tackle this, we propose a novel Text-guided
+Human-Object Interaction diffusion model with Relation Intervention (THOR).
+THOR is a cohesive diffusion model equipped with a relation intervention
+mechanism. In each diffusion step, we initiate text-guided human and object
+motion and then leverage human-object relations to intervene in object motion.
+This intervention enhances the spatial-temporal relations between humans and
+objects, with human-centric interaction representation providing additional
+guidance for synthesizing consistent motion from text. To achieve more
+reasonable and realistic results, interaction losses are introduced at
+different levels of motion granularity. Moreover, we construct Text-BEHAVE, a
+Text2HOI dataset that seamlessly integrates textual descriptions with the
+currently largest publicly available 3D HOI dataset. Both quantitative and
+qualitative experiments demonstrate the effectiveness of our proposed model.
+
+
+
+
+
+ + ☆ MindEye2: Shared-Subject Models Enable fMRI-To-Image With 1 Hour of Data + + +
+ Reconstructions of visual perception from brain activity have improved +tremendously, but the practical utility of such methods has been limited. This +is because such models are trained independently per subject where each subject +requires dozens of hours of expensive fMRI training data to attain high-quality +results. The present work showcases high-quality reconstructions using only 1 +hour of fMRI training data. We pretrain our model across 7 subjects and then +fine-tune on minimal data from a new subject. Our novel functional alignment +procedure linearly maps all brain data to a shared-subject latent space, +followed by a shared non-linear mapping to CLIP image space. We then map from +CLIP space to pixel space by fine-tuning Stable Diffusion XL to accept CLIP +latents as inputs instead of text. This approach improves out-of-subject +generalization with limited training data and also attains state-of-the-art +image retrieval and reconstruction metrics compared to single-subject +approaches. MindEye2 demonstrates how accurate reconstructions of perception +are possible from a single visit to the MRI facility. All code is available on +GitHub. + +
+
+ comment: Code at https://github.com/MedARC-AI/MindEyeV2/tree/main +
+
+
+
+
+ + ☆ TAG: Guidance-free Open-Vocabulary Semantic Segmentation + + +
+ Semantic segmentation is a crucial task in computer vision, where each pixel
+in an image is classified into a category. However, traditional methods face
+significant challenges, including the need for pixel-level annotations and
+extensive training. Furthermore, because supervised learning uses a limited set
+of predefined categories, models typically struggle with rare classes and
+cannot recognize new ones. Unsupervised and open-vocabulary segmentation,
+proposed to tackle these issues, faces challenges, including the inability to
+assign specific class labels to clusters and the necessity of user-provided
+text queries for guidance. In this context, we propose a novel approach, TAG,
+which achieves Training, Annotation, and Guidance-free open-vocabulary semantic
+segmentation. TAG utilizes pre-trained models such as CLIP and DINO to segment
+images into meaningful categories without additional training or dense
+annotations. It retrieves class labels from an external database, providing
+flexibility to adapt to new scenarios. Our TAG achieves state-of-the-art
+results on PascalVOC, PascalContext and ADE20K for open-vocabulary segmentation
+without given class names, i.e., an improvement of +15.3 mIoU on PascalVOC. All
+code and data will be released at https://github.com/Valkyrja3607/TAG.
+
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ MaskDiffusion: Exploiting Pre-trained Diffusion Models for Semantic + Segmentation + + +
+ Semantic segmentation is essential in computer vision for various +applications, yet traditional approaches face significant challenges, including +the high cost of annotation and extensive training for supervised learning. +Additionally, due to the limited predefined categories in supervised learning, +models typically struggle with infrequent classes and are unable to predict +novel classes. To address these limitations, we propose MaskDiffusion, an +innovative approach that leverages pretrained frozen Stable Diffusion to +achieve open-vocabulary semantic segmentation without the need for additional +training or annotation, leading to improved performance compared to similar +methods. We also demonstrate the superior performance of MaskDiffusion in +handling open vocabularies, including fine-grained and proper noun-based +categories, thus expanding the scope of segmentation applications. Overall, our +MaskDiffusion shows significant qualitative and quantitative improvements in +contrast to other comparable unsupervised segmentation methods, i.e. on the +Potsdam dataset (+10.5 mIoU compared to GEM) and COCO-Stuff (+14.8 mIoU +compared to DiffSeg). All code and data will be released at +https://github.com/Valkyrja3607/MaskDiffusion. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ Neural Markov Random Field for Stereo Matching CVPR 2024 + + +
+ Stereo matching is a core task for many computer vision and robotics +applications. Despite their dominance in traditional stereo methods, the +hand-crafted Markov Random Field (MRF) models lack sufficient modeling accuracy +compared to end-to-end deep models. While deep learning representations have +greatly improved the unary terms of the MRF models, the overall accuracy is +still severely limited by the hand-crafted pairwise terms and message passing. +To address these issues, we propose a neural MRF model, where both potential +functions and message passing are designed using data-driven neural networks. +Our fully data-driven model is built on the foundation of variational inference +theory, to prevent convergence issues and retain stereo MRF's graph inductive +bias. To make the inference tractable and scale well to high-resolution images, +we also propose a Disparity Proposal Network (DPN) to adaptively prune the +search space of disparity. The proposed approach ranks $1^{st}$ on both KITTI +2012 and 2015 leaderboards among all published methods while running faster +than 100 ms. This approach significantly outperforms prior global methods, +e.g., lowering D1 metric by more than 50% on KITTI 2015. In addition, our +method exhibits strong cross-domain generalization and can recover sharp edges. +The codes at https://github.com/aeolusguan/NMRF . + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Self-Supervised Video Desmoking for Laparoscopic Surgery + + +
+ Due to the difficulty of collecting real paired data, most existing desmoking
+methods train the models by synthesizing smoke, generalizing poorly to real
+surgical scenarios. Although a few works have explored single-image real-world
+desmoking in an unpaired learning manner, they still encounter challenges in
+handling dense smoke. In this work, we address these issues together by
+introducing self-supervised surgery video desmoking (SelfSVD). On the one hand,
+we observe that the frame captured before the activation of high-energy devices
+is generally clear (named the pre-smoke frame, PS frame), thus it can serve as
+supervision for other smoky frames, making real-world self-supervised video
+desmoking practically feasible. On the other hand, in order to enhance the
+desmoking performance, we further feed the valuable information from the PS
+frame into the models, where a masking strategy and a regularization term are
+presented to avoid trivial solutions. In addition, we construct a real surgery
+video dataset for desmoking, which covers a variety of smoky scenes. Extensive
+experiments on the dataset show that our SelfSVD can remove smoke more
+effectively and efficiently while recovering more photo-realistic details than
+the state-of-the-art methods. The dataset, codes, and pre-trained models are
+available at https://github.com/ZcsrenlongZ/SelfSVD.
+
+
+ comment: 28 pages +
+
+
+
+
+ + ☆ Boosting Semi-Supervised Temporal Action Localization by Learning from + Non-Target Classes + + +
+ The crux of semi-supervised temporal action localization (SS-TAL) lies in
+excavating valuable information from abundant unlabeled videos. However,
+current approaches predominantly focus on building models that are robust to
+the error-prone target class (i.e., the predicted class with the highest
+confidence) while ignoring informative semantics within non-target classes.
+This paper approaches SS-TAL from a novel perspective by advocating for
+learning from non-target classes, transcending the conventional focus solely on
+the target class. The proposed approach involves partitioning the label space
+of the predicted class distribution into distinct subspaces: target class,
+positive classes, negative classes, and ambiguous classes, aiming to mine both
+positive and negative semantics that are absent in the target class, while
+excluding ambiguous classes. To this end, we first devise innovative strategies
+to adaptively select high-quality positive and negative classes from the label
+space, by modeling both the confidence and rank of a class in relation to those
+of the target class. Then, we introduce novel positive and negative losses
+designed to guide the learning process, pushing predictions closer to positive
+classes and away from negative classes. Finally, the positive and negative
+processes are integrated into a hybrid positive-negative learning framework,
+facilitating the utilization of non-target classes in both labeled and
+unlabeled videos. Experimental results on THUMOS14 and ActivityNet v1.3
+demonstrate the superiority of the proposed method over prior state-of-the-art
+approaches.
+
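The partitioning of the predicted class distribution into target, positive, negative, and ambiguous classes can be illustrated with a simple confidence-and-rank rule; the thresholds and function below are assumptions made for the sketch and are not the paper's selection strategy.

```python
# Hedged sketch of partitioning a predicted class distribution into target,
# positive, negative and ambiguous classes using confidence and rank.
# Thresholds are illustrative only.
import numpy as np

def partition_classes(probs, pos_thresh=0.15, neg_rank=0.5):
    order = np.argsort(-probs)                        # classes sorted by confidence
    target = order[0]                                 # highest-confidence (target) class
    rest = order[1:]
    positives = [c for c in rest if probs[c] >= pos_thresh]
    negatives = [c for c in rest[int(len(rest) * neg_rank):] if c not in positives]
    ambiguous = [c for c in rest if c not in positives and c not in negatives]
    return target, positives, negatives, ambiguous

p = np.array([0.42, 0.25, 0.18, 0.06, 0.05, 0.04])    # a toy class distribution
print(partition_classes(p))
```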
+
+
+
+
+ + ☆ NetTrack: Tracking Highly Dynamic Objects with a Net CVPR 2024 + + +
+ The complex dynamicity of open-world objects presents non-negligible +challenges for multi-object tracking (MOT), often manifested as severe +deformations, fast motion, and occlusions. Most methods that solely depend on +coarse-grained object cues, such as boxes and the overall appearance of the +object, are susceptible to degradation due to distorted internal relationships +of dynamic objects. To address this problem, this work proposes NetTrack, an +efficient, generic, and affordable tracking framework to introduce fine-grained +learning that is robust to dynamicity. Specifically, NetTrack constructs a +dynamicity-aware association with a fine-grained Net, leveraging point-level +visual cues. Correspondingly, a fine-grained sampler and matching method have +been incorporated. Furthermore, NetTrack learns object-text correspondence for +fine-grained localization. To evaluate MOT in extremely dynamic open-world +scenarios, a bird flock tracking (BFT) dataset is constructed, which exhibits +high dynamicity with diverse species and open-world scenarios. Comprehensive +evaluation on BFT validates the effectiveness of fine-grained learning on +object dynamicity, and thorough transfer experiments on challenging open-world +benchmarks, i.e., TAO, TAO-OW, AnimalTrack, and GMOT-40, validate the strong +generalization ability of NetTrack even without finetuning. Project page: +https://george-zhuang.github.io/nettrack/. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ DuPL: Dual Student with Trustworthy Progressive Learning for Robust + Weakly Supervised Semantic Segmentation CVPR 2024 + + +
+ Recently, One-stage Weakly Supervised Semantic Segmentation (WSSS) with +image-level labels has gained increasing interest due to simplification over +its cumbersome multi-stage counterpart. Limited by the inherent ambiguity of +Class Activation Map (CAM), we observe that one-stage pipelines often encounter +confirmation bias caused by incorrect CAM pseudo-labels, impairing their final +segmentation performance. Although recent works discard many unreliable +pseudo-labels to implicitly alleviate this issue, they fail to exploit +sufficient supervision for their models. To this end, we propose a dual student +framework with trustworthy progressive learning (DuPL). Specifically, we +propose a dual student network with a discrepancy loss to yield diverse CAMs +for each sub-net. The two sub-nets generate supervision for each other, +mitigating the confirmation bias caused by learning their own incorrect +pseudo-labels. In this process, we progressively introduce more trustworthy +pseudo-labels to be involved in the supervision through dynamic threshold +adjustment with an adaptive noise filtering strategy. Moreover, we believe that +every pixel, even discarded from supervision due to its unreliability, is +important for WSSS. Thus, we develop consistency regularization on these +discarded regions, providing supervision of every pixel. Experiment results +demonstrate the superiority of the proposed DuPL over the recent +state-of-the-art alternatives on PASCAL VOC 2012 and MS COCO datasets. Code is +available at https://github.com/Wu0409/DuPL. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Quality-Aware Image-Text Alignment for Real-World Image Quality + Assessment + + +
+ No-Reference Image Quality Assessment (NR-IQA) focuses on designing methods +to measure image quality in alignment with human perception when a high-quality +reference image is unavailable. The reliance on annotated Mean Opinion Scores +(MOS) in the majority of state-of-the-art NR-IQA approaches limits their +scalability and broader applicability to real-world scenarios. To overcome this +limitation, we propose QualiCLIP (Quality-aware CLIP), a CLIP-based +self-supervised opinion-unaware method that does not require labeled MOS. In +particular, we introduce a quality-aware image-text alignment strategy to make +CLIP generate representations that correlate with the inherent quality of the +images. Starting from pristine images, we synthetically degrade them with +increasing levels of intensity. Then, we train CLIP to rank these degraded +images based on their similarity to quality-related antonym text prompts, while +guaranteeing consistent representations for images with comparable quality. Our +method achieves state-of-the-art performance on several datasets with authentic +distortions. Moreover, despite not requiring MOS, QualiCLIP outperforms +supervised methods when their training dataset differs from the testing one, +thus proving to be more suitable for real-world scenarios. Furthermore, our +approach demonstrates greater robustness and improved explainability than +competing methods. The code and the model are publicly available at +https://github.com/miccunifi/QualiCLIP. + +
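A minimal sketch of the ranking idea described above, assuming a hypothetical vector of CLIP-style similarity scores between one image at increasing degradation levels and a positive quality prompt; the scores and the margin value are illustrative placeholders, not the actual CLIP pipeline or antonym-prompt handling.

```python
# Pairwise ranking over quality scores of increasingly degraded images: a margin
# loss encourages score(level i) > score(level i+1). The "scores" tensor stands
# in for CLIP similarities to a positive quality prompt; values are illustrative.
import torch

# One image, 5 degradation levels (0 = pristine), hypothetical similarity scores.
scores = torch.tensor([0.71, 0.69, 0.66, 0.60, 0.52], requires_grad=True)
margin = 0.02

less_degraded = scores[:-1]               # levels 0..3
more_degraded = scores[1:]                # levels 1..4
target = torch.ones_like(less_degraded)   # "first argument should rank higher"
loss = torch.nn.functional.margin_ranking_loss(
    less_degraded, more_degraded, target, margin=margin)
loss.backward()
print("ranking loss:", loss.item())
```

In the actual method these scores would come from CLIP image-text similarities against antonym prompt pairs; only the ranking constraint itself is illustrated here.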
+
+
+
+
+ + ☆ Artifact Feature Purification for Cross-domain Detection of AI-generated + Images + + +
+ In the era of AIGC, the fast development of visual content generation +technologies, such as diffusion models, brings potential security risks to our +society. Existing generated image detection methods suffer from a performance +drop when faced with out-of-domain generators and image scenes. To relieve this +problem, we propose the Artifact Purification Network (APN) to facilitate the +artifact extraction from generated images through the explicit and implicit +purification processes. For the explicit one, a suspicious frequency-band +proposal method and a spatial feature decomposition method are proposed to +extract artifact-related features. For the implicit one, a training strategy +based on mutual information estimation is proposed to further purify the +artifact-related features. Experiments show that for cross-generator detection, +the average accuracy of APN is 5.6% ~ 16.4% higher than the previous 10 methods +on the GenImage dataset and 1.7% ~ 50.1% on the DiffusionForensics dataset. For +cross-scene detection, APN maintains its high performance. Via visualization +analysis, we find that the proposed method extracts flexible forgery patterns +and condenses the forgery information diluted in irrelevant features. We also +find that the artifact features APN focuses on across generators and scenes are +global and diverse. The code will be available on GitHub. +
+
+ comment: This work is under consideration at Computer Vision and Image + Understanding +
+
+
+
+
+ + ☆ CGI-DM: Digital Copyright Authentication for Diffusion Models via + Contrasting Gradient Inversion CVPR 2024 + + +
+ Diffusion Models (DMs) have evolved into advanced image generation tools, +especially for few-shot generation where a pretrained model is fine-tuned on a +small set of images to capture a specific style or object. Despite their +success, concerns exist about potential copyright violations stemming from the +use of unauthorized data in this process. In response, we present Contrasting +Gradient Inversion for Diffusion Models (CGI-DM), a novel method featuring +vivid visual representations for digital copyright authentication. Our approach +involves removing partial information of an image and recovering missing +details by exploiting conceptual differences between the pretrained and +fine-tuned models. We formulate the differences as KL divergence between latent +variables of the two models when given the same input image, which can be +maximized through Monte Carlo sampling and Projected Gradient Descent (PGD). +The similarity between original and recovered images serves as a strong +indicator of potential infringements. Extensive experiments on the WikiArt and +Dreambooth datasets demonstrate the high accuracy of CGI-DM in digital +copyright authentication, surpassing alternative validation techniques. Code +implementation is available at https://github.com/Nicholas0228/Revelio. + +
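A minimal sketch, under toy assumptions, of the contrasting-gradient-inversion step the abstract outlines: projected gradient ascent on an input perturbation to maximize a KL-style divergence between a "pretrained" and a "fine-tuned" network. The two small MLPs, epsilon, and step size are placeholders, not the paper's latent diffusion models or its Monte Carlo estimator.

```python
# Minimal PGD sketch: maximize a KL divergence between the outputs of a
# "pretrained" and a "fine-tuned" model w.r.t. an input perturbation.
# Both tiny MLPs and all hyper-parameters are illustrative placeholders.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
pretrained = torch.nn.Sequential(torch.nn.Linear(16, 8), torch.nn.ReLU(), torch.nn.Linear(8, 4))
finetuned = torch.nn.Sequential(torch.nn.Linear(16, 8), torch.nn.ReLU(), torch.nn.Linear(8, 4))

x = torch.randn(1, 16)                 # stand-in for the (partially masked) image
delta = torch.zeros_like(x, requires_grad=True)
eps, step, n_steps = 0.1, 0.02, 20

for _ in range(n_steps):
    p_log = F.log_softmax(finetuned(x + delta), dim=-1)
    q = F.softmax(pretrained(x + delta), dim=-1)
    kl = F.kl_div(p_log, q, reduction="batchmean")   # divergence to maximize
    kl.backward()
    with torch.no_grad():
        delta += step * delta.grad.sign()            # gradient ascent step
        delta.clamp_(-eps, eps)                      # projection onto L_inf ball
    delta.grad.zero_()

print("final KL divergence:", kl.item())
```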
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Selective Hourglass Mapping for Universal Image Restoration Based on + Diffusion Model CVPR2024 + + +
+ Universal image restoration is a practical and potential computer vision task +for real-world applications. The main challenge of this task is handling the +different degradation distributions at once. Existing methods mainly utilize +task-specific conditions (e.g., prompt) to guide the model to learn different +distributions separately, named multi-partite mapping. However, it is not +suitable for universal model learning as it ignores the shared information +between different tasks. In this work, we propose an advanced selective +hourglass mapping strategy based on diffusion model, termed DiffUIR. Two novel +considerations make our DiffUIR non-trivial. Firstly, we equip the model with +strong condition guidance to obtain accurate generation direction of diffusion +model (selective). More importantly, DiffUIR integrates a flexible shared +distribution term (SDT) into the diffusion algorithm elegantly and naturally, +which gradually maps different distributions into a shared one. In the reverse +process, combined with SDT and strong condition guidance, DiffUIR iteratively +guides the shared distribution to the task-specific distribution with high +image quality (hourglass). Without bells and whistles, by only modifying the +mapping strategy, we achieve state-of-the-art performance on five image +restoration tasks, 22 benchmarks in the universal setting and zero-shot +generalization setting. Surprisingly, by only using a lightweight model (only +0.89M), we could achieve outstanding performance. The source code and +pre-trained models are available at https://github.com/iSEE-Laboratory/DiffUIR + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ Training A Small Emotional Vision Language Model for Visual Art + Comprehension + + +
+ This paper develops small vision language models to understand visual art, +which, given an artwork, aims to identify its emotion category and explain +this prediction with natural language. While small models are computationally +efficient, their capacity is much more limited than that of large models. To break +this trade-off, this paper builds a small emotional vision language model +(SEVLM) by emotion modeling and input-output feature alignment. On the one +hand, based on valence-arousal-dominance (VAD) knowledge annotated by +psychology experts, we introduce and fuse emotional features derived from a +VAD dictionary and a VAD head to align the VAD vectors of the predicted emotion +explanation and the ground truth. This allows the vision language model to +better understand and generate emotional texts, compared with using traditional +text embeddings alone. On the other hand, we design a contrastive head to pull +close embeddings of the image, its emotion class, and explanation, which aligns +model outputs and inputs. On two public affective explanation datasets, we show +that the proposed techniques consistently improve the visual art understanding +performance of baseline SEVLMs. Importantly, the proposed model can be trained +and evaluated on a single RTX 2080 Ti while exhibiting very strong performance: +it not only outperforms the state-of-the-art small models but is also +competitive with LLaVA 7B after fine-tuning and with GPT4(V). +
+
+
+
+
+ + ☆ A lightweight deep learning pipeline with DRDA-Net and MobileNet for + breast cancer classification + + +
+ Accurate and early detection of breast cancer is essential for successful +treatment. This paper introduces a novel deep-learning approach for improved +breast cancer classification in histopathological images, a crucial step in +diagnosis. Our method hinges on the Dense Residual Dual-Shuffle Attention +Network (DRDA-Net), inspired by ShuffleNet's efficient architecture. DRDA-Net +achieves exceptional accuracy across various magnification levels on the +BreaKHis dataset, a breast cancer histopathology analysis benchmark. However, +for real-world deployment, computational efficiency is paramount. We integrate +a pre-trained MobileNet model, renowned for its lightweight design, to address +computational constraints. MobileNet ensures fast execution even on devices with limited +resources without sacrificing performance. This combined approach offers a +promising solution for accurate breast cancer diagnosis, paving the way for +faster and more accessible screening procedures. +
+
+ comment: 4 pages, 3 figures +
+
+
+
+
+ + ☆ Recent Advances in 3D Gaussian Splatting + + +
+ The emergence of 3D Gaussian Splatting (3DGS) has greatly accelerated the +rendering speed of novel view synthesis. Unlike neural implicit representations +like Neural Radiance Fields (NeRF) that represent a 3D scene with position and +viewpoint-conditioned neural networks, 3D Gaussian Splatting utilizes a set of +Gaussian ellipsoids to model the scene so that efficient rendering can be +accomplished by rasterizing Gaussian ellipsoids into images. Apart from the +fast rendering speed, the explicit representation of 3D Gaussian Splatting +facilitates editing tasks like dynamic reconstruction, geometry editing, and +physical simulation. Considering the rapid change and growing number of works +in this field, we present a literature review of recent 3D Gaussian Splatting +methods, which can be roughly classified into 3D reconstruction, 3D editing, +and other downstream applications by functionality. Traditional point-based +rendering methods and the rendering formulation of 3D Gaussian Splatting are +also illustrated for a better understanding of this technique. This survey aims +to help beginners get into this field quickly and provide experienced +researchers with a comprehensive overview, which can stimulate the future +development of the 3D Gaussian Splatting representation. + +
+
+
+
+
+ + ☆ Omni-Recon: Towards General-Purpose Neural Radiance Fields for Versatile + 3D Applications + + +
+ Recent breakthroughs in Neural Radiance Fields (NeRFs) have sparked +significant demand for their integration into real-world 3D applications. +However, the varied functionalities required by different 3D applications often +necessitate diverse NeRF models with various pipelines, leading to tedious NeRF +training for each target task and cumbersome trial-and-error experiments. +Drawing inspiration from the generalization capability and adaptability of +emerging foundation models, our work aims to develop one general-purpose NeRF +for handling diverse 3D tasks. We achieve this by proposing a framework called +Omni-Recon, which is capable of (1) generalizable 3D reconstruction and +zero-shot multitask scene understanding, and (2) adaptability to diverse +downstream 3D applications such as real-time rendering and scene editing. Our +key insight is that an image-based rendering pipeline, with accurate geometry +and appearance estimation, can lift 2D image features into their 3D +counterparts, thus extending widely explored 2D tasks to the 3D world in a +generalizable manner. Specifically, our Omni-Recon features a general-purpose +NeRF model using image-based rendering with two decoupled branches: one complex +transformer-based branch that progressively fuses geometry and appearance +features for accurate geometry estimation, and one lightweight branch for +predicting blending weights of source views. This design achieves +state-of-the-art (SOTA) generalizable 3D surface reconstruction quality with +blending weights reusable across diverse tasks for zero-shot multitask scene +understanding. In addition, it can enable real-time rendering after baking the +complex geometry branch into meshes, swift adaptation to achieve SOTA +generalizable 3D understanding performance, and seamless integration with 2D +diffusion models for text-guided 3D editing. + +
+
+
+
+
+ + ☆ GRA: Detecting Oriented Objects through Group-wise Rotating and + Attention + + +
+ Oriented object detection, an emerging task in recent years, aims to identify +and locate objects across varied orientations. This requires the detector to +accurately capture the orientation information, which varies significantly +within and across images. Despite the existing substantial efforts, +simultaneously ensuring model effectiveness and parameter efficiency remains +challenging in this scenario. In this paper, we propose a lightweight yet +effective \textbf{G}roup-wise \textbf{R}otating and \textbf{A}ttention (GRA) +module to replace the convolution operations in backbone networks for oriented +object detection. GRA can adaptively capture fine-grained features of objects +with diverse orientations, comprising two key components: Group-wise Rotating +and Group-wise Attention. Group-wise Rotating first divides the convolution +kernel into groups, where each group extracts different object features by +rotating at a specific angle according to the object orientation. Subsequently, +Group-wise Attention is employed to adaptively enhance the object-related +regions in the feature. The collaborative effort of these components enables +GRA to effectively capture the various orientation information while +maintaining parameter efficiency. Extensive experimental results demonstrate +the superiority of our method. For example, GRA achieves a new state-of-the-art +(SOTA) on the DOTA-v2.0 benchmark, while saving the parameters by nearly 50\% +compared to the previous SOTA method. Code will be released. + +
+
+
+
+
+ + ☆ LERENet: Eliminating Intra-class Differences for Metal Surface Defect + Few-shot Semantic Segmentation + + +
+ Few-shot segmentation models excel in metal defect detection due to their +rapid generalization ability to new classes and pixel-level segmentation, +rendering them ideal for addressing data scarcity issues and achieving refined +object delineation in industrial applications. Existing works neglect the +\textit{Intra-Class Differences}, inherent in metal surface defect data, which +hinders the model from learning sufficient knowledge from the support set to +guide the query set segmentation. Specifically, it can be categorized into two +types: the \textit{Semantic Difference} induced by internal factors in metal +samples and the \textit{Distortion Difference} caused by external factors of +surroundings. To address these differences, we introduce a \textbf{L}ocal +d\textbf{E}scriptor based \textbf{R}easoning and \textbf{E}xcitation +\textbf{Net}work (\textbf{LERENet}) to learn the two-view guidance, i.e., local +and global information from the graph and feature space, and fuse them to +segment precisely. Since the relation structure of local features embedded in +graph space will help to eliminate \textit{Semantic Difference}, we employ +Multi-Prototype Reasoning (MPR) module, extracting local descriptors based +prototypes and analyzing local-view feature relevance in support-query pairs. +Besides, due to the global information that will assist in countering the +\textit{Distortion Difference} in observations, we utilize Multi-Prototype +Excitation (MPE) module to capture the global-view relations in support-query +pairs. Finally, we employ an Information Fusion Module (IFM) to fuse learned +prototypes in local and global views to generate pixel-level masks. Our +comprehensive experiments on defect datasets demonstrate that it outperforms +existing benchmarks, establishing a new state-of-the-art. + +
+
+
+
+
+ + ☆ A Versatile Framework for Multi-scene Person Re-identification + + +
+ Person Re-identification (ReID) has been extensively developed for a decade +in order to learn the association of images of the same person across +non-overlapping camera views. To overcome significant variations between images +across camera views, mountains of variants of ReID models were developed for +solving a number of challenges, such as resolution change, clothing change, +occlusion, modality change, and so on. Despite the impressive performance of +many ReID variants, these variants typically function distinctly and cannot be +applied to other challenges. To our best knowledge, there is no versatile ReID +model that can handle various ReID challenges at the same time. This work +contributes to the first attempt at learning a versatile ReID model to solve +such a problem. Our main idea is to form a two-stage prompt-based twin modeling +framework called VersReID. Our VersReID firstly leverages the scene label to +train a ReID Bank that contains abundant knowledge for handling various scenes, +where several groups of scene-specific prompts are used to encode different +scene-specific knowledge. In the second stage, we distill a V-Branch model with +versatile prompts from the ReID Bank for adaptively solving the ReID of +different scenes, eliminating the demand for scene labels during the inference +stage. To facilitate training VersReID, we further introduce the multi-scene +properties into self-supervised learning of ReID via a multi-scene prioris data +augmentation (MPDA) strategy. Through extensive experiments, we demonstrate the +success of learning an effective and versatile ReID model for handling ReID +tasks under multi-scene conditions without manual assignment of scene labels in +the inference stage, including general, low-resolution, clothing change, +occlusion, and cross-modality scenes. Codes and models are available at +https://github.com/iSEE-Laboratory/VersReID. + +
+
+ comment: To appear in TPAMI +
+
+
+
+
+ + ☆ Unifying Feature and Cost Aggregation with Transformers for Semantic and + Visual Correspondence ICLR'24 + + +
+ This paper introduces a Transformer-based integrative feature and cost +aggregation network designed for dense matching tasks. In the context of dense +matching, many works benefit from one of two forms of aggregation: feature +aggregation, which pertains to the alignment of similar features, or cost +aggregation, a procedure aimed at instilling coherence in the flow estimates +across neighboring pixels. In this work, we first show that feature aggregation +and cost aggregation exhibit distinct characteristics and reveal the potential +for substantial benefits stemming from the judicious use of both aggregation +processes. We then introduce a simple yet effective architecture that harnesses +self- and cross-attention mechanisms to show that our approach unifies feature +aggregation and cost aggregation and effectively harnesses the strengths of +both techniques. Within the proposed attention layers, the features and cost +volume both complement each other, and the attention layers are interleaved +through a coarse-to-fine design to further promote accurate correspondence +estimation. Finally at inference, our network produces multi-scale predictions, +computes their confidence scores, and selects the most confident flow for final +prediction. Our framework is evaluated on standard benchmarks for semantic +matching, and also applied to geometric matching, where we show that our +approach achieves significant improvements compared to existing methods. + +
+
+ comment: Accepted by ICLR'24 (camera ready version) Code and weights can be + found here: https://github.com/KU-CVLAB/UFC +
+
+
+
+
+ + ☆ PhD: A Prompted Visual Hallucination Evaluation Dataset + + +
+ The rapid growth of Large Language Models (LLMs) has driven the development +of Large Vision-Language Models (LVLMs). The challenge of hallucination, +prevalent in LLMs, also emerges in LVLMs. However, most existing efforts mainly +focus on object hallucination in LVLMs, ignoring diverse types of LVLM +hallucinations. In this study, we delve into the Intrinsic Vision-Language +Hallucination (IVL-Hallu) issue, thoroughly analyzing different types of +IVL-Hallu in terms of their causes and manifestations. Specifically, we propose several +novel IVL-Hallu tasks and categorize them into four types: (a) object +hallucination, which arises from the misidentification of objects, (b) +attribute hallucination, which is caused by the misidentification of +attributes, (c) multi-modal conflicting hallucination, which derives from the +contradictions between textual and visual information, and (d) +counter-common-sense hallucination, which stems from the contradictions between +the LVLM knowledge and actual images. Based on these taxonomies, we propose a +more challenging benchmark named PhD to evaluate and explore IVL-Hallu. An +automated pipeline is proposed for generating different types of IVL-Hallu +data. Extensive experiments on five SOTA LVLMs reveal their inability to +effectively tackle our proposed IVL-Hallu tasks, with detailed analyses and +insights on the origins and possible solutions of these new challenging +IVL-Hallu tasks, facilitating future research on IVL-Hallu and LVLMs. The +benchmark can be accessed at +\href{https://github.com/jiazhen-code/IntrinsicHallu}{this https URL}. +
+
+
+
+
+ + ☆ Local-consistent Transformation Learning for Rotation-invariant Point + Cloud Analysis CVPR 2024 + + +
+ Rotation invariance is an important requirement for point shape analysis. To +achieve this, current state-of-the-art methods attempt to construct the local +rotation-invariant representation through learning or defining the local +reference frame (LRF). Although efficient, these LRF-based methods suffer from +perturbation of local geometric relations, resulting in suboptimal local +rotation invariance. To alleviate this issue, we propose a Local-consistent +Transformation (LocoTrans) learning strategy. Specifically, we first construct +the local-consistent reference frame (LCRF) by considering the symmetry of the +two axes in LRF. In comparison with previous LRFs, our LCRF is able to preserve +local geometric relationships better through performing local-consistent +transformation. However, as the consistency only exists in local regions, the +relative pose information is still lost in the intermediate layers of the +network. We mitigate such a relative pose issue by developing a relative pose +recovery (RPR) module. RPR aims to restore the relative pose between adjacent +transformed patches. Equipped with LCRF and RPR, our LocoTrans is capable of +learning local-consistent transformation and preserving local geometry, which +benefits rotation invariance learning. Competitive performance under arbitrary +rotations on both shape classification and part segmentation tasks and +ablations can demonstrate the effectiveness of our method. Code will be +available publicly at https://github.com/wdttt/LocoTrans. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ 3D Human Reconstruction in the Wild with Synthetic Data Using Generative + Models + + +
+ In this work, we show that synthetic data created by generative models is +complementary to computer graphics (CG) rendered data for achieving remarkable +generalization performance on diverse real-world scenes for 3D human pose and +shape estimation (HPS). Specifically, we propose an effective approach based on +recent diffusion models, termed HumanWild, which can effortlessly generate +human images and corresponding 3D mesh annotations. We first collect a +large-scale human-centric dataset with comprehensive annotations, e.g., text +captions and surface normal images. Then, we train a customized ControlNet +model upon this dataset to generate diverse human images and initial +ground-truth labels. At the core of this step is that we can easily obtain +numerous surface normal images from a 3D human parametric model, e.g., SMPL-X, +by rendering the 3D mesh onto the image plane. As there exists inevitable noise +in the initial labels, we then apply an off-the-shelf foundation segmentation +model, i.e., SAM, to filter negative data samples. Our data generation pipeline +is flexible and customizable to facilitate different real-world tasks, e.g., +ego-centric scenes and perspective-distortion scenes. The generated dataset +comprises 0.79M images with corresponding 3D annotations, covering versatile +viewpoints, scenes, and human identities. We train various HPS regressors on +top of the generated data and evaluate them on a wide range of benchmarks +(3DPW, RICH, EgoBody, AGORA, SSP-3D) to verify the effectiveness of the +generated data. By exclusively employing generative models, we generate +large-scale in-the-wild human images and high-quality annotations, eliminating +the need for real-world data collection. + +
+
+ comment: project page: https://yongtaoge.github.io/projects/humanwild +
+
+
+
+
+ + ☆ Self-supervised co-salient object detection via feature correspondence + at multiple scales + + +
+ Our paper introduces a novel two-stage self-supervised approach for detecting +co-occurring salient objects (CoSOD) in image groups without requiring +segmentation annotations. Unlike existing unsupervised methods that rely solely +on patch-level information (e.g. clustering patch descriptors) or on +computation heavy off-the-shelf components for CoSOD, our lightweight model +leverages feature correspondences at both patch and region levels, +significantly improving prediction performance. In the first stage, we train a +self-supervised network that detects co-salient regions by computing local +patch-level feature correspondences across images. We obtain the segmentation +predictions using confidence-based adaptive thresholding. In the next stage, we +refine these intermediate segmentations by eliminating the detected regions +(within each image) whose averaged feature representations are dissimilar to +the foreground feature representation averaged across all the cross-attention +maps (from the previous stage). Extensive experiments on three CoSOD benchmark +datasets show that our self-supervised model outperforms the corresponding +state-of-the-art models by a huge margin (e.g. on the CoCA dataset, our model +has a 13.7% F-measure gain over the SOTA unsupervised CoSOD model). Notably, +our self-supervised model also outperforms several recent fully supervised +CoSOD models on the three test datasets (e.g., on the CoCA dataset, our model +has a 4.6% F-measure gain over a recent supervised CoSOD model). + +
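A small sketch of confidence-based adaptive thresholding in the spirit of the first stage described above; the mean-plus-k-standard-deviations rule, the value of k, and the random confidence map are assumptions, not the paper's exact criterion.

```python
# Confidence-based adaptive thresholding sketch: a co-saliency confidence map is
# binarized at a per-image threshold derived from its own confidence statistics.
# The mean + k*std rule and the k value are illustrative assumptions.
import numpy as np

def adaptive_threshold(conf_map, k=0.5):
    thr = conf_map.mean() + k * conf_map.std()
    return (conf_map >= thr).astype(np.uint8)

rng = np.random.default_rng(0)
conf = rng.random((8, 8))        # stand-in for a patch-level confidence map
mask = adaptive_threshold(conf)
print("threshold-selected patches:", int(mask.sum()), "of", mask.size)
```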
+
+
+
+
+ + ☆ Self-Supervised Quantization-Aware Knowledge Distillation + + +
+ Quantization-aware training (QAT) and Knowledge Distillation (KD) are +combined to achieve competitive performance in creating low-bit deep learning +models. However, existing works applying KD to QAT require tedious +hyper-parameter tuning to balance the weights of different loss terms, assume +the availability of labeled training data, and require complex, computationally +intensive training procedures for good performance. To address these +limitations, this paper proposes a novel Self-Supervised Quantization-Aware +Knowledge Distillation (SQAKD) framework. SQAKD first unifies the forward and +backward dynamics of various quantization functions, making it flexible for +incorporating various QAT works. Then it formulates QAT as a co-optimization +problem that simultaneously minimizes the KL-Loss between the full-precision +and low-bit models for KD and the discretization error for quantization, +without supervision from labels. A comprehensive evaluation shows that SQAKD +substantially outperforms the state-of-the-art QAT and KD works for a variety +of model architectures. Our code is at: https://github.com/kaiqi123/SQAKD.git. + +
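A minimal label-free QAT-plus-KD sketch in the spirit of the framework described above: a straight-through uniform weight quantizer, a KL term against a full-precision teacher, and a discretization-error term. The toy linear models, the 4-bit setting, and the plain unweighted sum of the two losses are simplifying assumptions rather than the paper's formulation.

```python
# Label-free QAT + KD sketch: a straight-through uniform quantizer on the
# student's weights, a KL loss against the full-precision teacher, and the
# quantizer's discretization error. Networks and hyper-parameters are toys.
import torch
import torch.nn.functional as F

class STEQuantize(torch.autograd.Function):
    @staticmethod
    def forward(ctx, w, n_bits):
        scale = w.abs().max() / (2 ** (n_bits - 1) - 1) + 1e-8
        return torch.round(w / scale) * scale
    @staticmethod
    def backward(ctx, grad_out):
        return grad_out, None          # straight-through estimator

teacher = torch.nn.Linear(16, 4)       # frozen full-precision model
student = torch.nn.Linear(16, 4)
opt = torch.optim.SGD(student.parameters(), lr=0.1)

x = torch.randn(32, 16)                # unlabeled batch
for _ in range(100):
    w_q = STEQuantize.apply(student.weight, 4)
    logits_s = F.linear(x, w_q, student.bias)
    with torch.no_grad():
        logits_t = teacher(x)
    kd = F.kl_div(F.log_softmax(logits_s, -1), F.softmax(logits_t, -1),
                  reduction="batchmean")
    disc = F.mse_loss(student.weight, w_q.detach())  # discretization error
    loss = kd + disc                                  # unweighted sum (simplification)
    opt.zero_grad()
    loss.backward()
    opt.step()

print("final loss:", loss.item())
```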
+
+
+
+
+ + ☆ Source Prompt Disentangled Inversion for Boosting Image Editability with + Diffusion Models + + +
+ Text-driven diffusion models have significantly advanced image editing +performance by using text prompts as inputs. One crucial step in text-driven +image editing is to invert the original image into a latent noise code +conditioned on the source prompt. While previous methods have achieved +promising results by refactoring the image synthesizing process, the inverted +latent noise code is tightly coupled with the source prompt, limiting the image +editability by target text prompts. To address this issue, we propose a novel +method called Source Prompt Disentangled Inversion (SPDInv), which aims at +reducing the impact of the source prompt, thereby enhancing the text-driven image +editing performance by employing diffusion models. To make the inverted noise +code as independent of the given source prompt as possible, we argue +that the iterative inversion process should satisfy a fixed-point constraint. +Consequently, we transform the inversion problem into a search problem to +find the fixed-point solution, and utilize the pre-trained diffusion models to +facilitate the search process. The experimental results show that our +proposed SPDInv method can effectively mitigate the conflicts between the +target editing prompt and the source prompt, leading to a significant decrease +in editing artifacts. In addition to text-driven image editing, with SPDInv we +can easily adapt customized image generation models to localized editing tasks +and produce promising performance. The source code is available at +https://github.com/leeruibin/SPDInv. +
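A toy illustration of the fixed-point view taken above: the inverted code should satisfy z = F(z), so inversion becomes a search that iterates until the residual ||F(z) - z|| vanishes. F here is an arbitrary contraction, not an actual diffusion inversion step.

```python
# Toy fixed-point search: iterate z <- F(z) and monitor the residual ||F(z) - z||,
# mirroring the fixed-point constraint placed on the inverted noise code. F is an
# arbitrary contraction mapping here, not a diffusion inversion step.
import numpy as np

def F(z):
    return 0.5 * z + np.array([1.0, -2.0])   # contraction with a unique fixed point

z = np.zeros(2)
for step in range(30):
    z_next = F(z)
    residual = np.linalg.norm(z_next - z)
    z = z_next
    if residual < 1e-8:
        break

print(f"converged in {step + 1} steps, fixed point = {z}, residual = {residual:.2e}")
```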
+
+
+
+
+ + ☆ Hierarchical Generative Network for Face Morphing Attacks + + +
+ Face morphing attacks circumvent face recognition systems (FRSs) by creating +a morphed image that contains multiple identities. However, existing face +morphing attack methods either sacrifice image quality or compromise the +identity preservation capability. Consequently, these attacks fail to bypass +FRSs verification well while still managing to deceive human observers. These +methods typically rely on global information from contributing images, ignoring +the detailed information from effective facial regions. To address the above +issues, we propose a novel morphing attack method to improve the quality of +morphed images and better preserve the contributing identities. Our proposed +method leverages the hierarchical generative network to capture both local +detailed and global consistency information. Additionally, a mask-guided image +blending module is dedicated to removing artifacts from areas outside the face +to improve the image's visual quality. The proposed attack method is compared +to state-of-the-art methods on three public datasets in terms of FRSs' +vulnerability, attack detectability, and image quality. The results show our +method's potential threat of deceiving FRSs while being capable of passing +multiple morphing attack detection (MAD) scenarios. + +
+
+ comment: Accepted by FG2024 +
+
+
+
+
+ + ☆ Graph Expansion in Pruned Recurrent Neural Network Layers Preserve + Performance ICLR 2024 + + +
+ Expansion property of a graph refers to its strong connectivity as well as +sparseness. It has been reported that deep neural networks can be pruned to a +high degree of sparsity while maintaining their performance. Such pruning is +essential for performing real time sequence learning tasks using recurrent +neural networks in resource constrained platforms. We prune recurrent networks +such as RNNs and LSTMs, maintaining a large spectral gap of the underlying +graphs and ensuring their layerwise expansion properties. We also study the +time unfolded recurrent network graphs in terms of the properties of their +bipartite layers. Experimental results for the benchmark sequence MNIST, +CIFAR-10, and Google speech command data show that expander graph properties +are key to preserving classification accuracy of RNN and LSTM. + +
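A short sketch of the quantity tracked above, assuming a randomly pruned layer: the spectral gap of the bipartite connectivity graph induced by a sparse weight mask. The random mask and 80% sparsity are illustrative, not the paper's pruning scheme.

```python
# Spectral gap of a pruned layer's bipartite connectivity graph: build the
# adjacency matrix from a sparse weight mask and compare the two largest
# adjacency eigenvalues. The random mask and 80% sparsity are illustrative.
import numpy as np

rng = np.random.default_rng(0)
n_in, n_out, sparsity = 32, 32, 0.8
mask = (rng.random((n_out, n_in)) > sparsity).astype(float)   # kept connections

# Bipartite adjacency: input units on one side, output units on the other.
adj = np.zeros((n_in + n_out, n_in + n_out))
adj[:n_in, n_in:] = mask.T
adj[n_in:, :n_in] = mask

eigs = np.linalg.eigvalsh(adj)           # ascending order
print(f"spectral gap lambda_1 - lambda_2 = {eigs[-1] - eigs[-2]:.4f}")
```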
+
+ comment: Accepted as tiny paper in ICLR 2024 +
+
+
+
+
+ + ☆ Lost in Translation? Translation Errors and Challenges for Fair + Assessment of Text-to-Image Models on Multilingual Concepts NAACL 2024 + + +
+ Benchmarks of the multilingual capabilities of text-to-image (T2I) models +compare generated images prompted in a test language to an expected image +distribution over a concept set. One such benchmark, "Conceptual Coverage +Across Languages" (CoCo-CroLa), assesses the tangible noun inventory of T2I +models by prompting them to generate pictures from a concept list translated to +seven languages and comparing the output image populations. Unfortunately, we +find that this benchmark contains translation errors of varying severity in +Spanish, Japanese, and Chinese. We provide corrections for these errors and +analyze how impactful they are on the utility and validity of CoCo-CroLa as a +benchmark. We reassess multiple baseline T2I models with the revisions, compare +the outputs elicited under the new translations to those conditioned on the +old, and show that a correction's impactfulness on the image-domain benchmark +results can be predicted in the text domain with similarity scores. Our +findings will guide the future development of T2I multilinguality metrics by +providing analytical tools for practical translation decisions. + +
+
+ comment: NAACL 2024 Main Conference +
+
+
+
+
+ + ☆ Multitask frame-level learning for few-shot sound event detection + + +
+ This paper focuses on few-shot Sound Event Detection (SED), which aims to +automatically recognize and classify sound events with limited samples. +However, prevailing methods in few-shot SED predominantly rely on +segment-level predictions, which often fail to provide detailed, fine-grained +predictions, particularly for events of brief duration. Although frame-level +prediction strategies have been proposed to overcome these limitations, these +strategies commonly face difficulties with prediction truncation caused by +background noise. To alleviate this issue, we introduce an innovative +multitask frame-level SED framework. In addition, we introduce TimeFilterAug, a +linear timing mask for data augmentation, to increase the model's robustness +and adaptability to diverse acoustic environments. The proposed method achieves +an F-score of 63.8%, securing the 1st rank in the few-shot bioacoustic event +detection category of the Detection and Classification of Acoustic Scenes and +Events Challenge 2023. +
+
+ comment: 6 pages, 4 figures, conference +
+
+
+
+
+ + ☆ m&m's: A Benchmark to Evaluate Tool-Use for multi-step multi-modal Tasks + + +
+ Real-world multi-modal problems are rarely solved by a single machine +learning model, and often require multi-step computational plans that involve +stitching several models. Tool-augmented LLMs hold tremendous promise for +automating the generation of such computational plans. However, the lack of +standardized benchmarks for evaluating LLMs as planners for multi-step +multi-modal tasks has prevented a systematic study of planner design decisions. +Should LLMs generate a full plan in a single shot or step-by-step? Should they +invoke tools directly with Python code or through structured data formats like +JSON? Does feedback improve planning? To answer these questions and more, we +introduce m&m's: a benchmark containing 4K+ multi-step multi-modal tasks +involving 33 tools that include multi-modal models, (free) public APIs, and +image processing modules. For each of these task queries, we provide +automatically generated plans using this realistic toolset. We further provide +a high-quality subset of 1,565 task plans that are human-verified and correctly +executable. With m&m's, we evaluate 6 popular LLMs with 2 planning strategies +(multi-step vs. step-by-step planning), 2 plan formats (JSON vs. code), and 3 +types of feedback (parsing/verification/execution). Finally, we summarize +takeaways from our extensive experiments. Our dataset and code are available on +HuggingFace (https://huggingface.co/datasets/zixianma/mnms) and Github +(https://github.com/RAIVNLab/mnms). + +
+
+
+
+
+ + ☆ Customizing Visual-Language Foundation Models for Multi-modal Anomaly + Detection and Reasoning + + +
+ Anomaly detection is vital in various industrial scenarios, including the +identification of unusual patterns in production lines and the detection of +manufacturing defects for quality control. Existing techniques tend to be +specialized in individual scenarios and lack generalization capacities. In this +study, we aim to develop a generic anomaly detection model applicable across +multiple scenarios. To achieve this, we customize generic visual-language +foundation models that possess extensive knowledge and robust reasoning +abilities into anomaly detectors and reasoners. Specifically, we introduce a +multi-modal prompting strategy that incorporates domain knowledge from experts +as conditions to guide the models. Our approach considers multi-modal prompt +types, including task descriptions, class context, normality rules, and +reference images. In addition, we unify the input representation of +multi-modality into a 2D image format, enabling multi-modal anomaly detection +and reasoning. Our preliminary studies demonstrate that combining visual and +language prompts as conditions for customizing the models enhances anomaly +detection performance. The customized models showcase the ability to detect +anomalies across different data modalities such as images and point clouds. +Qualitative case studies further highlight the anomaly detection and reasoning +capabilities, particularly for multi-object scenes and temporal data. Our code +is available at https://github.com/Xiaohao-Xu/Customizable-VLM. + +
+
+
+
+
+ + ☆ Adaptive Semantic-Enhanced Denoising Diffusion Probabilistic Model for + Remote Sensing Image Super-Resolution + + +
+ Remote sensing image super-resolution (SR) is a crucial task to restore +high-resolution (HR) images from low-resolution (LR) observations. Recently, +the Denoising Diffusion Probabilistic Model (DDPM) has shown promising +performance in image reconstructions by overcoming problems inherent in +generative models, such as over-smoothing and mode collapse. However, the +high-frequency details generated by DDPM often suffer from misalignment with HR +images due to the model's tendency to overlook long-range semantic contexts. +This is attributed to the widely used U-Net decoder in the conditional noise +predictor, which tends to overemphasize local information, leading to the +generation of noises with significant variances during the prediction process. +To address these issues, an adaptive semantic-enhanced DDPM (ASDDPM) is +proposed to enhance the detail-preserving capability of the DDPM by +incorporating low-frequency semantic information provided by the Transformer. +Specifically, a novel adaptive diffusion Transformer decoder (ADTD) is +developed to bridge the semantic gap between the encoder and decoder through +regulating the noise prediction with the global contextual relationships and +long-range dependencies in the diffusion process. Additionally, a residual +feature fusion strategy establishes information exchange between the two +decoders at multiple levels. As a result, the predicted noise generated by our +approach closely approximates that of the real noise distribution. Extensive +experiments on two SR and two semantic segmentation datasets confirm the +superior performance of the proposed ASDDPM in both SR and the subsequent +downstream applications. The source code will be available at +https://github.com/littlebeen/ASDDPM-Adaptive-Semantic-Enhanced-DDPM. +
+
+
+
+
+ + ☆ Zippo: Zipping Color and Transparency Distributions into a Single + Diffusion Model + + +
+ Beyond the superiority of the text-to-image diffusion model in generating +high-quality images, recent studies have attempted to uncover its potential for +adapting the learned semantic knowledge to visual perception tasks. In this +work, instead of translating a generative diffusion model into a visual +perception model, we explore to retain the generative ability with the +perceptive adaptation. To accomplish this, we present Zippo, a unified +framework for zipping the color and transparency distributions into a single +diffusion model by expanding the diffusion latent into a joint representation +of RGB images and alpha mattes. By alternatively selecting one modality as the +condition and then applying the diffusion process to the counterpart modality, +Zippo is capable of generating RGB images from alpha mattes and predicting +transparency from input images. In addition to single-modality prediction, we +propose a modality-aware noise reassignment strategy to further empower Zippo +with jointly generating RGB images and its corresponding alpha mattes under the +text guidance. Our experiments showcase Zippo's ability of efficient +text-conditioned transparent image generation and present plausible results of +Matte-to-RGB and RGB-to-Matte translation. + +
+
+
+
+
+ + ☆ Audio-Visual Segmentation via Unlabeled Frame Exploitation CVPR 2024 + + +
+ Audio-visual segmentation (AVS) aims to segment the sounding objects in video +frames. Although great progress has been witnessed, we experimentally reveal +that current methods reach marginal performance gain within the use of the +unlabeled frames, leading to the underutilization issue. To fully explore the +potential of the unlabeled frames for AVS, we explicitly divide them into two +categories based on their temporal characteristics, i.e., neighboring frame +(NF) and distant frame (DF). NFs, temporally adjacent to the labeled frame, +often contain rich motion information that assists in the accurate localization +of sounding objects. Contrary to NFs, DFs have long temporal distances from the +labeled frame, which share semantic-similar objects with appearance variations. +Considering their unique characteristics, we propose a versatile framework that +effectively leverages them to tackle AVS. Specifically, for NFs, we exploit the +motion cues as the dynamic guidance to improve the objectness localization. +Besides, we exploit the semantic cues in DFs by treating them as valid +augmentations to the labeled frames, which are then used to enrich data +diversity in a self-training manner. Extensive experimental results demonstrate +the versatility and superiority of our method, unleashing the power of the +abundant unlabeled frames. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Tokensome: Towards a Genetic Vision-Language GPT for Explainable and + Cognitive Karyotyping + + +
+ Automatic karyotype analysis is often defined as a visual perception task +focused solely on chromosomal object-level modeling. This definition has led +most existing methods to overlook componential and holistic information, +significantly constraining model performance. Moreover, the lack of +interpretability in current technologies hinders clinical adoption. In this +paper, we introduce Tokensome, a novel vision-language model based on +chromosome tokenization for explainable and cognitive karyotyping. Tokensome +elevates the method from the conventional visual perception layer to the +cognitive decision-making layer. This elevation enables the integration of +domain knowledge and cognitive reasoning via knowledge graphs and LLMs, +markedly enhancing the model's explainability and facilitating abnormality +detection. +
+
+ comment: Preprint. Work in progress +
+
+
+
+
+ + ☆ Controllable Relation Disentanglement for Few-Shot Class-Incremental + Learning + + +
+ In this paper, we propose to tackle Few-Shot Class-Incremental Learning +(FSCIL) from a new perspective, i.e., relation disentanglement, which means +enhancing FSCIL via disentangling spurious relation between categories. The +challenge of disentangling spurious correlations lies in the poor +controllability of FSCIL. On one hand, an FSCIL model is required to be trained +in an incremental manner and thus it is very hard to directly control +relationships between categories of different sessions. On the other hand, +training samples per novel category are only in the few-shot setting, which +increases the difficulty of alleviating spurious relation issues as well. To +overcome this challenge, in this paper, we propose a new simple-yet-effective +method, called ConTrollable Relation-disentangLed Few-Shot Class-Incremental +Learning (CTRL-FSCIL). Specifically, during the base session, we propose to +anchor base category embeddings in feature space and construct disentanglement +proxies to bridge gaps between the learning for category representations in +different sessions, thereby making category relation controllable. During +incremental learning, the parameters of the backbone network are frozen in +order to relieve the negative impact of data scarcity. Moreover, a +disentanglement loss is designed to effectively guide a relation +disentanglement controller to disentangle spurious correlations between the +embeddings encoded by the backbone. In this way, the spurious correlation issue +in FSCIL can be suppressed. Extensive experiments on CIFAR-100, mini-ImageNet, +and CUB-200 datasets demonstrate the effectiveness of our CTRL-FSCIL method. + +
+
+
+
+
+ + ☆ Intelligent Railroad Grade Crossing: Leveraging Semantic Segmentation + and Object Detection for Enhanced Safety + + +
+ Crashes and delays at Railroad Highway Grade Crossings (RHGC), where highways +and railroads intersect, pose significant safety concerns for the U.S. Federal +Railroad Administration (FRA). Despite the critical importance of addressing +accidents and traffic delays at highway-railroad intersections, there is a +notable dearth of research on practical solutions for managing these issues. In +response to this gap in the literature, our study introduces an intelligent +system that leverages machine learning and computer vision techniques to +enhance safety at Railroad Highway Grade Crossings (RHGC). This research +proposes a Non-Maximum Suppression (NMS)-based ensemble model that integrates +a variety of YOLO variants, specifically YOLOv5S, YOLOv5M, and YOLOv5L, for +grade-crossing object detection, and utilizes segmentation techniques from the UNet +architecture for detecting the approaching rail at a grade crossing. Both methods +are implemented on a Raspberry Pi. Moreover, the strategy employs +high-definition cameras installed at the RHGC. This framework enables the +system to monitor objects within the Region of Interest (ROI) at crossings, +detect the approach of trains, and clear the crossing area before a train +arrives. Regarding accuracy, precision, recall, and Intersection over Union +(IoU), the proposed state-of-the-art NMS-based object detection ensemble model +achieved 96% precision. In addition, the UNet segmentation model obtained a 98% +IoU value. This automated railroad grade crossing system powered by artificial +intelligence represents a promising solution for enhancing safety at +highway-railroad intersections. +
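A compact sketch of the NMS-based ensembling step described above: detections from several detectors are pooled and reduced with standard non-maximum suppression. The per-detector boxes and the 0.5 IoU threshold are stand-ins, not the actual YOLOv5 outputs.

```python
# Pool detections from several detectors and reduce them with standard NMS.
# The per-detector outputs here are hard-coded stand-ins for YOLO predictions.
import numpy as np

def iou(box, boxes):
    """IoU between one box and an array of boxes, all in (x1, y1, x2, y2)."""
    x1 = np.maximum(box[0], boxes[:, 0]); y1 = np.maximum(box[1], boxes[:, 1])
    x2 = np.minimum(box[2], boxes[:, 2]); y2 = np.minimum(box[3], boxes[:, 3])
    inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
    area_a = (box[2] - box[0]) * (box[3] - box[1])
    area_b = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    return inter / (area_a + area_b - inter + 1e-9)

def nms(boxes, scores, iou_thr=0.5):
    order = np.argsort(scores)[::-1]
    keep = []
    while order.size:
        i = order[0]
        keep.append(i)
        rest = order[1:]
        order = rest[iou(boxes[i], boxes[rest]) < iou_thr]
    return keep

# Detections (x1, y1, x2, y2) and confidences from three hypothetical detectors.
det_a = (np.array([[10, 10, 50, 50]]), np.array([0.90]))
det_b = (np.array([[12, 11, 52, 49]]), np.array([0.85]))
det_c = (np.array([[100, 80, 140, 120]]), np.array([0.70]))

boxes = np.vstack([d[0] for d in (det_a, det_b, det_c)])
scores = np.concatenate([d[1] for d in (det_a, det_b, det_c)])
kept = nms(boxes, scores, iou_thr=0.5)
print("kept boxes:\n", boxes[kept], "\nscores:", scores[kept])
```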
+
+ comment: 11 pages, 11 figures, conference +
+
+
+
+
+ + ☆ Large Language Models Powered Context-aware Motion Prediction + + +
+ Motion prediction is among the most fundamental tasks in autonomous driving. +Traditional methods of motion forecasting primarily encode vector information +of maps and historical trajectory data of traffic participants, lacking a +comprehensive understanding of overall traffic semantics, which in turn affects +the performance of prediction tasks. In this paper, we utilized Large Language +Models (LLMs) to enhance the global traffic context understanding for motion +prediction tasks. We first conducted systematic prompt engineering, visualizing +complex traffic environments and historical trajectory information of traffic +participants into image prompts -- Transportation Context Map (TC-Map), +accompanied by corresponding text prompts. Through this approach, we obtained +rich traffic context information from the LLM. By integrating this information +into the motion prediction model, we demonstrate that such context can enhance +the accuracy of motion predictions. Furthermore, considering the cost +associated with LLMs, we propose a cost-effective deployment strategy: +enhancing the accuracy of motion prediction tasks at scale with 0.7\% +LLM-augmented datasets. Our research offers valuable insights into enhancing +the understanding of traffic scenes of LLMs and the motion prediction +performance of autonomous driving. + +
+
+ comment: 6 pages,4 figures +
+
+
+
+
+ + ☆ Analytic-Splatting: Anti-Aliased 3D Gaussian Splatting via Analytic + Integration + + +
+ The 3D Gaussian Splatting (3DGS) gained its popularity recently by combining +the advantages of both primitive-based and volumetric 3D representations, +resulting in improved quality and efficiency for 3D scene rendering. However, +3DGS is not alias-free, and its rendering at varying resolutions could produce +severe blurring or jaggies. This is because 3DGS treats each pixel as an +isolated, single point rather than as an area, causing insensitivity to changes +in the footprints of pixels. Consequently, this discrete sampling scheme +inevitably results in aliasing, owing to the restricted sampling bandwidth. In +this paper, we derive an analytical solution to address this issue. More +specifically, we use a conditioned logistic function as the analytic +approximation of the cumulative distribution function (CDF) in a +one-dimensional Gaussian signal and calculate the Gaussian integral by +subtracting the CDFs. We then introduce this approximation in the +two-dimensional pixel shading, and present Analytic-Splatting, which +analytically approximates the Gaussian integral within the 2D-pixel window area +to better capture the intensity response of each pixel. Moreover, we use the +approximated response of the pixel window integral area to participate in the +transmittance calculation of volume rendering, making Analytic-Splatting +sensitive to the changes in pixel footprint at different resolutions. +Experiments on various datasets validate that our approach has better +anti-aliasing capability that gives more details and better fidelity. + +
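A small numerical sketch of the core approximation described above: a logistic (sigmoid) function standing in for the 1D Gaussian CDF, with the integral over a pixel window obtained by subtracting the CDF at the window edges. The 1.702 scaling constant is the classical logistic approximation to the standard normal CDF, used here as an assumption rather than the paper's exact conditioned form.

```python
# Approximate the integral of a 1D Gaussian over a pixel window [a, b] by
# subtracting a logistic (sigmoid) approximation of its CDF at the two edges,
# and compare against the exact erf-based value. The 1.702 constant is the
# classical logistic approximation to the standard normal CDF.
import math

def sigmoid(x):
    return 1.0 / (1.0 + math.exp(-x))

def cdf_logistic(x, mu, sigma):
    return sigmoid(1.702 * (x - mu) / sigma)

def cdf_exact(x, mu, sigma):
    return 0.5 * (1.0 + math.erf((x - mu) / (sigma * math.sqrt(2.0))))

mu, sigma = 0.3, 0.8          # 1D Gaussian parameters (illustrative)
a, b = -0.5, 0.5              # a 1-pixel window centered at 0

approx = cdf_logistic(b, mu, sigma) - cdf_logistic(a, mu, sigma)
exact = cdf_exact(b, mu, sigma) - cdf_exact(a, mu, sigma)
print(f"logistic-CDF integral: {approx:.5f}  exact: {exact:.5f}  "
      f"abs error: {abs(approx - exact):.5f}")
```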
+
+ comment: 29 pages +
+
+
+
+
+ + ☆ OSTAF: A One-Shot Tuning Method for Improved Attribute-Focused T2I + Personalization + + +
+ Personalized text-to-image (T2I) models not only produce lifelike and varied +visuals but also allow users to tailor the images to fit their personal taste. +These personalization techniques can grasp the essence of a concept through a +collection of images, or adjust a pre-trained text-to-image model with a +specific image input for subject-driven or attribute-aware guidance. Yet, +accurately capturing the distinct visual attributes of an individual image +poses a challenge for these methods. To address this issue, we introduce OSTAF, +a novel parameter-efficient one-shot fine-tuning method which only utilizes one +reference image for T2I personalization. A novel hypernetwork-powered +attribute-focused fine-tuning mechanism is employed to achieve the precise +learning of various attribute features (e.g., appearance, shape or drawing +style) from the reference image. Compared to existing image customization +methods, our method shows significant superiority in attribute identification +and application, and achieves a good balance between efficiency and +output quality. +
+
+
+
+
+ + ☆ Unveiling and Mitigating Memorization in Text-to-image Diffusion Models + through Cross Attention + + +
+ Recent advancements in text-to-image diffusion models have demonstrated their +remarkable capability to generate high-quality images from textual prompts. +However, increasing research indicates that these models memorize and replicate +images from their training data, raising tremendous concerns about potential +copyright infringement and privacy risks. In our study, we provide a novel +perspective to understand this memorization phenomenon by examining its +relationship with cross-attention mechanisms. We reveal that during +memorization, the cross-attention tends to focus disproportionately on the +embeddings of specific tokens. The diffusion model is overfitted to these token +embeddings, memorizing corresponding training images. To elucidate this +phenomenon, we further identify and discuss various intrinsic findings of +cross-attention that contribute to memorization. Building on these insights, we +introduce an innovative approach to detect and mitigate memorization in +diffusion models. The advantage of our proposed method is that it will not +compromise the speed of either the training or the inference processes in these +models while preserving the quality of generated images. Our code is available +at https://github.com/renjie3/MemAttn . + +
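A tiny sketch of the detection signal suggested above: measure how concentrated a cross-attention distribution is over prompt tokens and flag generations whose concentration is extreme. The normalized-entropy statistic and the 0.5 cutoff are assumptions, not the paper's actual detector.

```python
# Flag potential memorization by measuring how concentrated a cross-attention
# distribution is over prompt tokens (low entropy = attention piled onto a few
# tokens). The statistic and the 0.5 cutoff are illustrative assumptions.
import numpy as np

def normalized_entropy(attn):
    """Entropy of a token-attention distribution, scaled to [0, 1]."""
    attn = attn / attn.sum()
    ent = -(attn * np.log(attn + 1e-12)).sum()
    return ent / np.log(len(attn))

diffuse = np.array([0.14, 0.13, 0.12, 0.15, 0.16, 0.15, 0.15])  # spread out
peaked = np.array([0.90, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01])   # piled on one token

for name, attn in [("diffuse", diffuse), ("peaked", peaked)]:
    h = normalized_entropy(attn)
    print(f"{name}: normalized entropy = {h:.3f}",
          "-> possible memorization" if h < 0.5 else "-> looks normal")
```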
+
+
+
+
+ + ☆ Endora: Video Generation Models as Endoscopy Simulators + + +
+ Generative models hold promise for revolutionizing medical education, +robot-assisted surgery, and data augmentation for machine learning. Despite +progress in generating 2D medical images, the complex domain of clinical video +generation has largely remained untapped. This paper introduces Endora, an +innovative approach to generate medical videos that simulate clinical endoscopy +scenes. We present a novel generative model design that integrates a +meticulously crafted spatial-temporal video transformer with advanced 2D vision +foundation model priors, explicitly modeling spatial-temporal dynamics during +video generation. We also pioneer the first public benchmark for endoscopy +simulation with video generation models, adapting existing state-of-the-art +methods for this endeavor. Endora demonstrates exceptional visual quality in +generating endoscopy videos, surpassing state-of-the-art methods in extensive +testing. Moreover, we explore how this endoscopy simulator can empower +downstream video analysis tasks and even generate 3D medical scenes with +multi-view consistency. In a nutshell, Endora marks a notable breakthrough in +the deployment of generative AI for clinical endoscopy research, setting a +substantial stage for further advances in medical content generation. For more +details, please visit our project page: https://endora-medvidgen.github.io/. +
+
+ comment: Project page: https://endora-medvidgen.github.io/ +
+
+
+
+
+ + ☆ From Pixels to Predictions: Spectrogram and Vision Transformer for + Better Time Series Forecasting + + +
+ Time series forecasting plays a crucial role in decision-making across +various domains, but it presents significant challenges. Recent studies have +explored image-driven approaches using computer vision models to address these +challenges, often employing lineplots as the visual representation of time +series data. In this paper, we propose a novel approach that uses +time-frequency spectrograms as the visual representation of time series data. +We introduce the use of a vision transformer for multimodal learning, +showcasing the advantages of our approach across diverse datasets from +different domains. To evaluate its effectiveness, we compare our method against +statistical baselines (EMA and ARIMA), a state-of-the-art deep learning-based +approach (DeepAR), other visual representations of time series data (lineplot +images), and an ablation study on using only the time series as input. Our +experiments demonstrate the benefits of utilizing spectrograms as a visual +representation for time series data, along with the advantages of employing a +vision transformer for simultaneous learning in both the time and frequency +domains. + +
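A brief sketch of the spectrogram step the approach above relies on: a magnitude STFT turns a 1D series into the image-like input a vision transformer can consume. The synthetic signal, window length, and hop size are arbitrary choices for illustration.

```python
# Turn a 1D time series into a magnitude spectrogram (the image-like input the
# abstract feeds to a vision transformer). Signal, window, and hop are toy values.
import numpy as np

def spectrogram(x, win=64, hop=16):
    window = np.hanning(win)
    frames = [x[i:i + win] * window for i in range(0, len(x) - win + 1, hop)]
    stft = np.fft.rfft(np.stack(frames), axis=1)
    return np.abs(stft).T        # shape: (freq_bins, time_frames)

t = np.linspace(0, 4, 1024)
series = np.sin(2 * np.pi * 5 * t) + 0.5 * np.sin(2 * np.pi * 20 * t)
spec = spectrogram(series)
print("spectrogram shape (freq x time):", spec.shape)
```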
+
+ comment: Published at ACM ICAIF 2023 +
+
+
+
+
+ + ☆ PhD: A Prompted Visual Hallucination Evaluation Dataset + + +
+ The rapid growth of Large Language Models (LLMs) has driven the development
+of Large Vision-Language Models (LVLMs). The challenge of hallucination,
+prevalent in LLMs, also emerges in LVLMs. However, most existing efforts mainly
+focus on object hallucination in LVLMs, ignoring diverse types of LVLM
+hallucinations. In this study, we delve into the Intrinsic Vision-Language
+Hallucination (IVL-Hallu) issue, thoroughly analyzing different types of
+IVL-Hallu in terms of their causes and manifestations. Specifically, we propose
+several novel IVL-Hallu tasks and categorize them into four types: (a) object
+hallucination, which arises from the misidentification of objects, (b)
+attribute hallucination, which is caused by the misidentification of
+attributes, (c) multi-modal conflicting hallucination, which derives from the
+contradictions between textual and visual information, and (d)
+counter-common-sense hallucination, which stems from the contradictions between
+the LVLM knowledge and actual images. Based on these taxonomies, we propose a
+more challenging benchmark named PhD to evaluate and explore IVL-Hallu. An
+automated pipeline is proposed for generating different types of IVL-Hallu
+data. Extensive experiments on five SOTA LVLMs reveal their inability to
+effectively tackle our proposed IVL-Hallu tasks, with detailed analyses and
+insights on the origins and possible solutions of these new challenging
+IVL-Hallu tasks, facilitating future research on IVL-Hallu and LVLMs. The
+benchmark can be accessed at https://github.com/jiazhen-code/IntrinsicHallu
+
+
+
+
+
+ + ♻ ☆ FaceTalk: Audio-Driven Motion Diffusion for Neural Parametric Head + Models + + +
+ We introduce FaceTalk, a novel generative approach designed for synthesizing
+high-fidelity 3D motion sequences of talking human heads from an input audio
+signal. To capture the expressive, detailed nature of human heads, including
+hair, ears, and finer-scale eye movements, we propose to couple the speech
+signal with the latent space of neural parametric head models to create
+high-fidelity, temporally coherent motion sequences. We propose a new latent
+diffusion model for this task, operating in the expression space of neural
+parametric head models, to synthesize audio-driven realistic head sequences. In
+the absence of a dataset of NPHM expressions corresponding to audio, we
+optimize for these correspondences to produce a dataset of temporally-optimized
+NPHM expressions fit to audio-video recordings of people talking. To the best
+of our knowledge, this is the first work to propose a generative approach for
+realistic and high-quality motion synthesis of volumetric human heads,
+representing a significant advancement in the field of audio-driven 3D
+animation. Notably, our approach stands out in its ability to generate
+plausible motion sequences that can produce high-fidelity head animation
+coupled with the NPHM shape space. Our experimental results substantiate the
+effectiveness of FaceTalk, consistently achieving superior and visually natural
+motion, encompassing diverse facial expressions and styles, and outperforming
+existing methods by 75% in a perceptual user study evaluation.
+
+
+ comment: Paper Video: https://youtu.be/7Jf0kawrA3Q Project Page: + https://shivangi-aneja.github.io/projects/facetalk/ +
+
+
+
+
+ + ♻ ☆ Cross-Shaped Windows Transformer with Self-supervised Pretraining for + Clinically Significant Prostate Cancer Detection in Bi-parametric MRI + + +
+ Biparametric magnetic resonance imaging (bpMRI) has demonstrated promising
+results in prostate cancer (PCa) detection using convolutional neural networks
+(CNNs). Recently, transformers have achieved competitive performance compared
+to CNNs in computer vision. Large-scale transformers need abundant annotated
+data for training, which are difficult to obtain in medical imaging.
+Self-supervised learning (SSL) utilizes unlabeled data to generate meaningful
+semantic representations without the need for costly annotations, enhancing
+model performance on tasks with limited labeled data. We introduce a novel
+end-to-end Cross-Shaped windows (CSwin) transformer UNet model, CSwin UNet, to
+detect clinically significant prostate cancer (csPCa) in prostate bi-parametric
+MR imaging (bpMRI) and demonstrate the effectiveness of our proposed
+self-supervised pre-training framework. Using a large prostate bpMRI dataset
+with 1500 patients, we first pretrain the CSwin transformer using multi-task
+self-supervised learning to improve data-efficiency and network
+generalizability. We then finetune using lesion annotations to perform csPCa
+detection. Five-fold cross validation shows that self-supervised CSwin UNet
+achieves 0.888 AUC and 0.545 Average Precision (AP), significantly
+outperforming four comparable models (Swin UNETR, DynUNet, Attention UNet,
+UNet). Using a separate bpMRI dataset with 158 patients, we evaluate our
+method's robustness to external hold-out data. Self-supervised CSwin UNet
+achieves 0.79 AUC and 0.45 AP, still outperforming all other comparable methods
+and demonstrating good generalization to external data.
+
+
+
+
+
+ + ♻ ☆ AISFormer: Amodal Instance Segmentation with Transformer BMVC2022 + + +
+ Amodal Instance Segmentation (AIS) aims to segment the region of both visible
+and possibly occluded parts of an object instance. While Mask R-CNN-based AIS
+approaches have shown promising results, they are unable to model high-level
+feature coherence due to the limited receptive field. The most recent
+transformer-based models show impressive performance on vision tasks, even
+better than Convolutional Neural Networks (CNNs). In this work, we present
+AISFormer, an AIS framework, with a Transformer-based mask head. AISFormer
+explicitly models the complex coherence between occluder, visible, amodal, and
+invisible masks within an object's regions of interest by treating them as
+learnable queries. Specifically, AISFormer contains four modules: (i) feature
+encoding, which extracts the ROI and learns both short-range and long-range
+visual features; (ii) mask transformer decoding, which generates the occluder,
+visible, and amodal mask query embeddings with a transformer decoder; (iii)
+invisible mask embedding, which models the coherence between the amodal and
+visible masks; and (iv) mask predicting, which estimates the output masks,
+including occluder, visible, amodal, and invisible. We conduct extensive
+experiments and ablation studies on three challenging benchmarks, i.e., KINS,
+D2SA, and COCOA-cls, to evaluate the effectiveness of AISFormer. The code is
+available at: https://github.com/UARK-AICV/AISFormer
+
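+ The query-based mask head can be illustrated with a small PyTorch sketch,
+assuming ROI features of shape (batch, dim, H, W): three learnable queries
+attend to the ROI tokens through a standard transformer decoder, and mask
+logits are read out with dot products. Module names, dimensions, and the number
+of layers are placeholders rather than the released AISFormer code.
+
+import torch
+import torch.nn as nn
+
+class MaskQueryHead(nn.Module):
+    """Minimal sketch of a query-based mask head with learnable occluder,
+    visible, and amodal queries."""
+    def __init__(self, dim=256, num_queries=3, num_layers=2):
+        super().__init__()
+        self.queries = nn.Embedding(num_queries, dim)
+        layer = nn.TransformerDecoderLayer(d_model=dim, nhead=8, batch_first=True)
+        self.decoder = nn.TransformerDecoder(layer, num_layers=num_layers)
+
+    def forward(self, roi_feats):
+        # roi_feats: (batch, dim, H, W) features pooled from each region of interest
+        b, c, h, w = roi_feats.shape
+        memory = roi_feats.flatten(2).transpose(1, 2)        # (b, H*W, dim)
+        tgt = self.queries.weight.unsqueeze(0).expand(b, -1, -1)
+        q = self.decoder(tgt, memory)                        # (b, num_queries, dim)
+        masks = torch.einsum("bqc,bchw->bqhw", q, roi_feats) # mask logits per query
+        return masks
+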
+
+ comment: Accepted to BMVC2022 +
+
+
+
+
+ + ♻ ☆ Single-Model and Any-Modality for Video Object Tracking CVPR2024 + + +
+ In the realm of video object tracking, auxiliary modalities such as depth, +thermal, or event data have emerged as valuable assets to complement the RGB +trackers. In practice, most existing RGB trackers learn a single set of +parameters to use them across datasets and applications. However, a similar +single-model unification for multi-modality tracking presents several +challenges. These challenges stem from the inherent heterogeneity of inputs -- +each with modality-specific representations, the scarcity of multi-modal +datasets, and the absence of all the modalities at all times. In this work, we +introduce Un-Track, a Unified Tracker of a single set of parameters for any +modality. To handle any modality, our method learns their common latent space +through low-rank factorization and reconstruction techniques. More importantly, +we use only the RGB-X pairs to learn the common latent space. This unique +shared representation seamlessly binds all modalities together, enabling +effective unification and accommodating any missing modality, all within a +single transformer-based architecture. Our Un-Track achieves +8.1 absolute +F-score gain, on the DepthTrack dataset, by introducing only +2.14 (over 21.50) +GFLOPs with +6.6M (over 93M) parameters, through a simple yet efficient +prompting strategy. Extensive comparisons on five benchmark datasets with +different modalities show that Un-Track surpasses both SOTA unified trackers +and modality-specific counterparts, validating our effectiveness and +practicality. The source code is publicly available at +https://github.com/Zongwei97/UnTrack. + +
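+ The low-rank factorization and reconstruction idea can be sketched as a small
+adapter that squeezes any modality's features through a shared rank-r
+bottleneck; the class name, rank, and reconstruction loss below are
+illustrative assumptions, not Un-Track's exact design.
+
+import torch
+import torch.nn as nn
+
+class LowRankSharedSpace(nn.Module):
+    """Projects an auxiliary modality (depth, thermal, event, ...) into a shared
+    latent space through a low-rank bottleneck, then reconstructs it."""
+    def __init__(self, dim=256, rank=16):
+        super().__init__()
+        self.down = nn.Linear(dim, rank)   # low-rank factorization
+        self.up = nn.Linear(rank, dim)     # reconstruction back to feature space
+
+    def forward(self, x_feat):
+        shared = self.down(x_feat)         # compact modality-agnostic code
+        recon = self.up(shared)
+        return shared, recon
+
+layer = LowRankSharedSpace()
+feat = torch.randn(2, 1024, 256)           # (batch, tokens, dim) from any modality
+shared, recon = layer(feat)
+recon_loss = (recon - feat).pow(2).mean()  # encourages the shared code to keep content
+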
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ TMT-VIS: Taxonomy-aware Multi-dataset Joint Training for Video Instance + Segmentation NeurIPS 2023 + + +
+ Training on large-scale datasets can boost the performance of video instance
+segmentation (VIS), but the annotated datasets for VIS are hard to scale up due
+to the high labor cost. What we possess are numerous isolated field-specific
+datasets; thus, it is appealing to jointly train models across the aggregation
+of datasets to enhance data volume and diversity. However, although mask
+precision increases with data volume, the heterogeneity in category space means
+that simply utilizing multiple datasets dilutes the attention of models on
+different taxonomies. Thus, increasing the data scale and enriching the
+taxonomy space while improving classification precision is important. In this
+work, we show that providing extra taxonomy information can help models
+concentrate on specific taxonomies, and propose our model, Taxonomy-aware
+Multi-dataset Joint Training for Video Instance Segmentation (TMT-VIS), to
+address this vital challenge. Specifically, we design a two-stage taxonomy
+aggregation module that first compiles taxonomy information from input videos
+and then aggregates these taxonomy priors into instance queries before the
+transformer decoder, as sketched below. We conduct extensive experimental
+evaluations on four popular and challenging benchmarks, including YouTube-VIS
+2019, YouTube-VIS 2021, OVIS, and UVO. Our model shows significant improvement
+over the baseline solutions, and sets new state-of-the-art records on all
+benchmarks. These appealing and encouraging results demonstrate the
+effectiveness and generality of our approach. The code is available at
+https://github.com/rkzheng99/TMT-VIS .
+
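+ A hedged sketch of the taxonomy-injection idea: a soft taxonomy prior
+predicted for the input video is pooled into an embedding and added to the
+instance queries before the transformer decoder. The aggregation is
+deliberately simplified to a single soft pooling step and does not reproduce
+the paper's two-stage module.
+
+import torch
+import torch.nn as nn
+
+class TaxonomyInjection(nn.Module):
+    """Adds a pooled taxonomy prior to instance queries before decoding."""
+    def __init__(self, dim=256, num_categories=40):
+        super().__init__()
+        self.category_embed = nn.Embedding(num_categories, dim)
+
+    def forward(self, queries, category_logits):
+        # queries: (batch, num_queries, dim); category_logits: (batch, num_categories)
+        weights = category_logits.softmax(dim=-1)            # soft taxonomy prior
+        prior = weights @ self.category_embed.weight          # (batch, dim)
+        return queries + prior.unsqueeze(1)                   # broadcast over queries
+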
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Large Language Models and Foundation Models in Smart Agriculture: + Basics, Opportunities, and Challenges + + +
+ The past decade has witnessed the rapid development and adoption of ML & DL +methodologies in agricultural systems, showcased by great successes in +agricultural applications. However, these conventional ML/DL models have +certain limitations: they heavily rely on large, costly-to-acquire labeled +datasets for training, require specialized expertise for development and +maintenance, and are mostly tailored for specific tasks, thus lacking +generalizability. Recently, large pre-trained models, also known as FMs, have +demonstrated remarkable successes in language, vision, and decision-making +tasks across various domains. These models are trained on a large amount of +data from multiple domains and modalities. Once trained, they can accomplish +versatile tasks with just minor fine-tuning and minimal task-specific labeled +data. Despite their proven effectiveness and huge potential, there has been +little exploration of applying FMs to agriculture AI. Thus, this study aims to +explore the potential of FMs in the field of smart agriculture. In particular, +conceptual tools and technical background are presented to help the +understanding of the problem space and uncover new research directions. To this +end, recent FMs in the general CS domain are reviewed, and the models are +categorized into four categories: language FMs, vision FMs, multimodal FMs, and +reinforcement learning FMs. Then, the steps of developing agriculture FMs +(AFMs) are outlined and potential applications in smart agriculture are +discussed. Moreover, challenges and risks associated with developing AFMs are +discussed, including model training, validation, and deployment. In summary, +the advancement of AI in agriculture is explored by introducing AFMs as a +promising paradigm that can significantly mitigate the reliance on extensive +labeled datasets and enhance the efficiency, effectiveness, and generalization +of agricultural AI systems. + +
+
+ comment: 18 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ PixArt-Σ: Weak-to-Strong Training of Diffusion Transformer for 4K + Text-to-Image Generation + + +
+ In this paper, we introduce PixArt-\Sigma, a Diffusion Transformer +model~(DiT) capable of directly generating images at 4K resolution. +PixArt-\Sigma represents a significant advancement over its predecessor, +PixArt-\alpha, offering images of markedly higher fidelity and improved +alignment with text prompts. A key feature of PixArt-\Sigma is its training +efficiency. Leveraging the foundational pre-training of PixArt-\alpha, it +evolves from the `weaker' baseline to a `stronger' model via incorporating +higher quality data, a process we term "weak-to-strong training". The +advancements in PixArt-\Sigma are twofold: (1) High-Quality Training Data: +PixArt-\Sigma incorporates superior-quality image data, paired with more +precise and detailed image captions. (2) Efficient Token Compression: we +propose a novel attention module within the DiT framework that compresses both +keys and values, significantly improving efficiency and facilitating +ultra-high-resolution image generation. Thanks to these improvements, +PixArt-\Sigma achieves superior image quality and user prompt adherence +capabilities with significantly smaller model size (0.6B parameters) than +existing text-to-image diffusion models, such as SDXL (2.6B parameters) and SD +Cascade (5.1B parameters). Moreover, PixArt-\Sigma's capability to generate 4K +images supports the creation of high-resolution posters and wallpapers, +efficiently bolstering the production of high-quality visual content in +industries such as film and gaming. + +
+
+ comment: Project Page: https://pixart-alpha.github.io/PixArt-sigma-project/ +
+
+
+
+
+ + ♻ ☆ Forgedit: Text Guided Image Editing via Learning and Forgetting + + +
+ Text-guided image editing on real or synthetic images, given only the
+original image itself and the target text prompt as inputs, is a very general
+and challenging task. It requires an editing model to estimate by itself which
+part of the image should be edited, and then perform either rigid or non-rigid
+editing while preserving the characteristics of the original image. In this
+paper, we design a novel text-guided image editing method named Forgedit.
+First, we propose a vision-language joint optimization framework capable of
+reconstructing the original image in 30 seconds, much faster than the previous
+SOTA and with much less overfitting. Then we propose a novel vector projection
+mechanism in the text embedding space of Diffusion Models, which can control
+identity similarity and editing strength separately. Finally, we discover a
+general property of the UNet in Diffusion Models, i.e., the UNet encoder learns
+space and structure, while the UNet decoder learns appearance and identity.
+With such a property, we design forgetting mechanisms to successfully tackle
+the fatal and inevitable overfitting issues when fine-tuning Diffusion Models
+on one image, thus significantly boosting the editing capability of Diffusion
+Models. Our method, Forgedit, built on Stable Diffusion, achieves new
+state-of-the-art results on the challenging text-guided image editing
+benchmark: TEdBench, surpassing the previous SOTA methods such as Imagic with
+Imagen, in terms of both CLIP score and LPIPS score. Codes are available at
+https://github.com/witcherofresearch/Forgedit
+
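+ The vector projection mechanism can be illustrated as follows: the optimized
+source embedding is split into its component along the target prompt embedding
+and the orthogonal remainder, which are rescaled separately to trade identity
+preservation against editing strength. The scale parameters and shapes are
+assumptions for illustration only, not the paper's exact formulation.
+
+import torch
+
+def project_edit(e_source, e_target, identity_scale=1.0, edit_scale=1.0):
+    """Decompose e_source along e_target and rescale the two parts separately."""
+    direction = e_target / e_target.norm(dim=-1, keepdim=True)
+    parallel = (e_source * direction).sum(dim=-1, keepdim=True) * direction
+    orthogonal = e_source - parallel
+    return edit_scale * parallel + identity_scale * orthogonal
+
+e_src = torch.randn(77, 768)   # e.g., token embeddings of the optimized prompt
+e_tgt = torch.randn(77, 768)   # token embeddings of the target text prompt
+edited = project_edit(e_src, e_tgt, identity_scale=0.8, edit_scale=1.2)
+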
+
+ comment: Codes are available at https://github.com/witcherofresearch/Forgedit +
+
+
+
+
+ + ♻ ☆ RDA-INR: Riemannian Diffeomorphic Autoencoding via Implicit Neural + Representations + + +
+ Diffeomorphic registration frameworks such as Large Deformation Diffeomorphic
+Metric Mapping (LDDMM) are used in computer graphics and the medical domain for
+atlas building, statistical latent modeling, and pairwise and groupwise
+registration. In recent years, researchers have developed neural network-based
+approaches to diffeomorphic registration to improve the accuracy and
+computational efficiency of traditional methods. In this work, we focus on a
+limitation of neural network-based atlas building and statistical latent
+modeling methods, namely that they either (i) are resolution dependent or (ii)
+disregard any data/problem-specific geometry needed for proper mean-variance
+analysis. In particular, we overcome this limitation by designing a novel
+encoder based on resolution-independent implicit neural representations. The
+encoder achieves resolution invariance for LDDMM-based statistical latent
+modeling. Additionally, the encoder adds LDDMM Riemannian geometry to
+resolution-independent deep learning models for statistical latent modeling. We
+showcase that the Riemannian geometry aspect improves latent modeling and is
+required for a proper mean-variance analysis. Furthermore, to showcase the
+benefit of resolution independence for LDDMM-based data variability modeling,
+we show that our approach outperforms another neural network-based LDDMM latent
+code model. Our work paves the way for more research into how Riemannian
+geometry, shape/image analysis, and deep learning can be combined.
+
+
+ comment: 34 pages, 27 figures (including subfigures) +
+
+
+
+
+ + ♻ ☆ SemCity: Semantic Scene Generation with Triplane Diffusion CVPR 2024 + + +
+ We present "SemCity," a 3D diffusion model for semantic scene generation in +real-world outdoor environments. Most 3D diffusion models focus on generating a +single object, synthetic indoor scenes, or synthetic outdoor scenes, while the +generation of real-world outdoor scenes is rarely addressed. In this paper, we +concentrate on generating a real-outdoor scene through learning a diffusion +model on a real-world outdoor dataset. In contrast to synthetic data, +real-outdoor datasets often contain more empty spaces due to sensor +limitations, causing challenges in learning real-outdoor distributions. To +address this issue, we exploit a triplane representation as a proxy form of +scene distributions to be learned by our diffusion model. Furthermore, we +propose a triplane manipulation that integrates seamlessly with our triplane +diffusion model. The manipulation improves our diffusion model's applicability +in a variety of downstream tasks related to outdoor scene generation such as +scene inpainting, scene outpainting, and semantic scene completion refinements. +In experimental results, we demonstrate that our triplane diffusion model shows +meaningful generation results compared with existing work in a real-outdoor +dataset, SemanticKITTI. We also show our triplane manipulation facilitates +seamlessly adding, removing, or modifying objects within a scene. Further, it +also enables the expansion of scenes toward a city-level scale. Finally, we +evaluate our method on semantic scene completion refinements where our +diffusion model enhances predictions of semantic scene completion networks by +learning scene distribution. Our code is available at +https://github.com/zoomin-lee/SemCity. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Swin UNETR++: Advancing Transformer-Based Dense Dose Prediction Towards + Fully Automated Radiation Oncology Treatments ML4H + + +
+ The field of Radiation Oncology is uniquely positioned to benefit from the
+use of artificial intelligence to fully automate the creation of radiation
+treatment plans for cancer therapy. This time-consuming and specialized task
+combines patient imaging with organ and tumor segmentation to generate a 3D
+radiation dose distribution to meet clinical treatment goals, similar to
+voxel-level dense prediction. In this work, we propose Swin UNETR++, which
+contains a lightweight 3D Dual Cross-Attention (DCA) module to capture the
+intra- and inter-volume relationships of each patient's unique anatomy, which
+fully convolutional neural networks lack. Our model was trained, validated, and
+tested on the Open Knowledge-Based Planning dataset. In addition to metrics of
+Dose Score $\overline{S_{\text{Dose}}}$ and DVH Score
+$\overline{S_{\text{DVH}}}$ that quantitatively measure the difference between
+the predicted and ground-truth 3D radiation dose distribution, we propose the
+qualitative metrics of average volume-wise acceptance rate
+$\overline{R_{\text{VA}}}$ and average patient-wise clinical acceptance rate
+$\overline{R_{\text{PA}}}$ to assess the clinical reliability of the
+predictions. Swin UNETR++ demonstrates near-state-of-the-art performance on the
+validation and test datasets (validation: $\overline{S_{\text{DVH}}}$=1.492 Gy,
+$\overline{S_{\text{Dose}}}$=2.649 Gy, $\overline{R_{\text{VA}}}$=88.58%,
+$\overline{R_{\text{PA}}}$=100.0%; test: $\overline{S_{\text{DVH}}}$=1.634 Gy,
+$\overline{S_{\text{Dose}}}$=2.757 Gy, $\overline{R_{\text{VA}}}$=90.50%,
+$\overline{R_{\text{PA}}}$=98.0%), establishing a basis for future studies to
+translate 3D dose predictions into a deliverable treatment plan, facilitating
+full automation.
+
+
+ comment: Extended Abstract presented at Machine Learning for Health (ML4H) + symposium 2023, December 10th, 2023, New Orleans, United States, 16 pages +
+
+
+
+
+ + ♻ ☆ Curriculum Learning for ab initio Deep Learned Refractive Optics + + +
+ Deep optical optimization has recently emerged as a new paradigm for +designing computational imaging systems using only the output image as the +objective. However, it has been limited to either simple optical systems +consisting of a single element such as a diffractive optical element (DOE) or +metalens, or the fine-tuning of compound lenses from good initial designs. Here +we present a DeepLens design method based on curriculum learning, which is able +to learn optical designs of compound lenses ab initio from randomly initialized +surfaces without human intervention, therefore overcoming the need for a good +initial design. We demonstrate the effectiveness of our approach by fully +automatically designing both classical imaging lenses and a large field-of-view +extended depth-of-field computational lens in a cellphone-style form factor, +with highly aspheric surfaces and a short back focal length. + +
+
+ comment: Automatically design computational lenses from scratch with + differentiable ray tracing +
+
+
+
+
+ + ♻ ☆ Unifying Global-Local Representations in Salient Object Detection with + Transformer + + +
+ The fully convolutional network (FCN) has dominated salient object detection
+for a long period. However, the locality of CNNs requires the model to be deep
+enough to have a global receptive field, and such a deep model always leads to
+the loss of local details. In this paper, we introduce a new attention-based
+encoder, the vision transformer, into salient object detection to ensure the
+globalization of the representations from shallow to deep layers. With the
+global view in very shallow layers, the transformer encoder preserves more
+local representations to recover the spatial details in the final saliency
+maps. Besides, as each layer can capture a global view of its previous layer,
+adjacent layers can implicitly maximize the representation differences and
+minimize the redundant features, ensuring that every output feature of the
+transformer layers contributes uniquely to the final prediction. To decode
+features from the transformer, we propose a simple yet effective
+deeply-transformed decoder. The decoder densely decodes and upsamples the
+transformer features, generating the final saliency map with less noise
+injection. Experimental results demonstrate that our method significantly
+outperforms other FCN-based and transformer-based methods on five benchmarks,
+with an average improvement of 12.17% in terms of Mean Absolute Error (MAE).
+Code will be available at https://github.com/OliverRensu/GLSTR.
+
+
+ comment: accepted by IEEE TETCI +
+
+
+
+
+ + ♻ ☆ WSI-SAM: Multi-resolution Segment Anything Model (SAM) for + histopathology whole-slide images + + +
+ The Segment Anything Model (SAM) marks a significant advancement in
+segmentation models, offering robust zero-shot abilities and dynamic prompting.
+However, existing medical SAMs are not suitable for the multi-scale nature of
+whole-slide images (WSIs), restricting their effectiveness. To resolve this
+drawback, we present WSI-SAM, enhancing SAM with precise object segmentation
+capabilities for histopathology images using multi-resolution patches, while
+preserving its efficient, prompt-driven design and zero-shot abilities. To
+fully exploit pretrained knowledge while minimizing training overhead, we keep
+SAM frozen, introducing only minimal extra parameters and computational
+overhead. In particular, we introduce a High-Resolution (HR) token, a
+Low-Resolution (LR) token, and a dual mask decoder. This decoder combines the
+original SAM mask decoder with a lightweight fusion module that integrates
+features at multiple scales. Instead of predicting masks independently, we
+integrate the HR and LR tokens at an intermediate layer to jointly learn
+features of the same object across multiple resolutions. Experiments show that
+our WSI-SAM outperforms the state-of-the-art SAM and its variants. In
+particular, our model outperforms SAM by 4.1 and 2.5 percentage points on a
+ductal carcinoma in situ (DCIS) segmentation task and a breast cancer
+metastasis segmentation task (CAMELYON16 dataset), respectively. The code will
+be available at https://github.com/HongLiuuuuu/WSI-SAM.
+
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ DCNFIS: Deep Convolutional Neuro-Fuzzy Inference System + + +
+ A key challenge in eXplainable Artificial Intelligence is the well-known
+tradeoff between the transparency of an algorithm (i.e., how easily a human can
+directly understand the algorithm, as opposed to receiving a post-hoc
+explanation) and its accuracy. We report on the design of a new deep network
+that achieves improved transparency without sacrificing accuracy. We design a
+deep convolutional neuro-fuzzy inference system (DCNFIS) by hybridizing fuzzy
+logic and deep learning models and show that DCNFIS performs as accurately as
+existing convolutional neural networks on four well-known datasets and three
+widely used architectures. Our performance comparison with available fuzzy
+methods shows that, to the best of our knowledge, DCNFIS is now the
+state-of-the-art fuzzy system, outperforming other shallow and deep fuzzy
+methods. Finally, we exploit the transparency of fuzzy logic by deriving
+explanations, in the form of saliency maps, from the fuzzy rules encoded in the
+network, thereby gaining the benefits of fuzzy logic over regular deep learning
+methods. We investigate the properties of these explanations in greater depth
+using the Fashion-MNIST dataset.
+
+
+
+
+
+ + ♻ ☆ rFaceNet: An End-to-End Network for Enhanced Physiological Signal + Extraction through Identity-Specific Facial Contours + + +
+ Remote photoplethysmography (rPPG) technique extracts blood volume pulse +(BVP) signals from subtle pixel changes in video frames. This study introduces +rFaceNet, an advanced rPPG method that enhances the extraction of facial BVP +signals with a focus on facial contours. rFaceNet integrates identity-specific +facial contour information and eliminates redundant data. It efficiently +extracts facial contours from temporally normalized frame inputs through a +Temporal Compressor Unit (TCU) and steers the model focus to relevant facial +regions by using the Cross-Task Feature Combiner (CTFC). Through elaborate +training, the quality and interpretability of facial physiological signals +extracted by rFaceNet are greatly improved compared to previous methods. +Moreover, our novel approach demonstrates superior performance than SOTA +methods in various heart rate estimation benchmarks. + +
+
+ comment: under-review +
+
+
+
+
+ + ♻ ☆ D4C glove-train: solving the RPM and Bongard-logo problem by + distributing and Circumscribing concepts + + +
+ This paper achieves significant progress in the field of abstract reasoning, +particularly in addressing Raven's Progressive Matrices (RPM) and Bongard-Logo +problems. We propose the D2C approach, which redefines conceptual boundaries in +these domains and bridges the gap between high-level concepts and their +low-dimensional representations. Based on this, we further introduce the D3C +method that handles Bongard-Logo problems and significantly improves reasoning +accuracy by estimating the distribution of image representations and measuring +their Sinkhorn distance. To enhance computational efficiency, we introduce the +D3C-cos variant, which provides an efficient and accurate solution for RPM +problems by constraining distribution distances. Additionally, we present +Lico-Net, a network that combines D3C and D3C-cos to achieve state-of-the-art +performance in both problem-solving and interpretability. Finally, we extend +our approach to D4C, employing adversarial strategies to further refine +conceptual boundaries and demonstrate notable improvements for both RPM and +Bongard-Logo problems. Overall, our contributions offer a new perspective and +practical solutions to the field of abstract reasoning. + +
+
+ comment: 16 pages, 14 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Towards Effective Multiple-in-One Image Restoration: A Sequential and + Prompt Learning Strategy + + +
+ While single task image restoration (IR) has achieved significant successes, +it remains a challenging issue to train a single model which can tackle +multiple IR tasks. In this work, we investigate in-depth the multiple-in-one +(MiO) IR problem, which comprises seven popular IR tasks. We point out that MiO +IR faces two pivotal challenges: the optimization of diverse objectives and the +adaptation to multiple tasks. To tackle these challenges, we present two simple +yet effective strategies. The first strategy, referred to as sequential +learning, attempts to address how to optimize the diverse objectives, which +guides the network to incrementally learn individual IR tasks in a sequential +manner rather than mixing them together. The second strategy, i.e., prompt +learning, attempts to address how to adapt to the different IR tasks, which +assists the network to understand the specific task and improves the +generalization ability. By evaluating on 19 test sets, we demonstrate that the +sequential and prompt learning strategies can significantly enhance the MiO +performance of commonly used CNN and Transformer backbones. Our experiments +also reveal that the two strategies can supplement each other to learn better +degradation representations and enhance the model robustness. It is expected +that our proposed MiO IR formulation and strategies could facilitate the +research on how to train IR models with higher generalization capabilities. + +
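+ A minimal sketch of the sequential-learning strategy, assuming a generic
+restoration model and one dataloader per IR task: tasks are visited one after
+another instead of being mixed in a single stream. The task list, epoch budget,
+and loss are placeholders, not the paper's training schedule.
+
+import torch
+
+def sequential_training(model, optimizer, task_loaders, loss_fn, epochs_per_task=1):
+    """Train on IR tasks one after another rather than mixing them."""
+    for task_name, loader in task_loaders:   # e.g., [("denoise", dl1), ("derain", dl2), ...]
+        for _ in range(epochs_per_task):
+            for degraded, clean in loader:
+                optimizer.zero_grad()
+                restored = model(degraded)
+                loss = loss_fn(restored, clean)
+                loss.backward()
+                optimizer.step()
+        print(f"finished task: {task_name}")
+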
+
+
+
+
+ + ♻ ☆ Intention-driven Ego-to-Exo Video Generation + + +
+ Ego-to-exo video generation refers to generating the corresponding exocentric
+video according to the egocentric video, providing valuable applications in
+AR/VR and embodied AI. Benefiting from advancements in diffusion model
+techniques, notable progress has been achieved in video generation. However,
+existing methods build upon the spatiotemporal consistency assumptions between
+adjacent frames, which cannot be satisfied in the ego-to-exo scenarios due to
+drastic changes in views. To this end, this paper proposes an Intention-Driven
+Ego-to-exo video generation framework (IDE) that leverages action intention
+consisting of human movement and action description as a view-independent
+representation to guide video generation, preserving the consistency of content
+and motion. Specifically, the egocentric head trajectory is first estimated
+through multi-view stereo matching. Then, a cross-view feature perception
+module is introduced to establish correspondences between the exo- and
+ego-views, guiding the trajectory transformation module to infer human
+full-body movement from the head trajectory. Meanwhile, we present an action
+description unit that maps the action semantics into the feature space
+consistent with the exocentric image. Finally, the inferred human movement and
+high-level action descriptions jointly guide the generation of exocentric
+motion and interaction content (i.e., corresponding optical flow and occlusion
+maps) in the backward process of the diffusion model, ultimately warping them
+into the corresponding exocentric video. We conduct extensive experiments on
+the relevant dataset with diverse exo-ego video pairs, and our IDE outperforms
+state-of-the-art models in both subjective and objective assessments,
+demonstrating its efficacy in ego-to-exo video generation.
+
+
+
+
+
+ + ♻ ☆ Repositioning the Subject within Image + + +
+ Current image manipulation primarily centers on static manipulation, such as +replacing specific regions within an image or altering its overall style. In +this paper, we introduce an innovative dynamic manipulation task, subject +repositioning. This task involves relocating a user-specified subject to a +desired position while preserving the image's fidelity. Our research reveals +that the fundamental sub-tasks of subject repositioning, which include filling +the void left by the repositioned subject, reconstructing obscured portions of +the subject and blending the subject to be consistent with surrounding areas, +can be effectively reformulated as a unified, prompt-guided inpainting task. +Consequently, we can employ a single diffusion generative model to address +these sub-tasks using various task prompts learned through our proposed task +inversion technique. Additionally, we integrate pre-processing and +post-processing techniques to further enhance the quality of subject +repositioning. These elements together form our SEgment-gEnerate-and-bLEnd +(SEELE) framework. To assess SEELE's effectiveness in subject repositioning, we +assemble a real-world subject repositioning dataset called ReS. Results of +SEELE on ReS demonstrate its efficacy. + +
+
+ comment: Project page: https://yikai-wang.github.io/seele/. Dataset: + https://github.com/Yikai-Wang/ReS. Arxiv version uses small size images for + fast preview. Full size PDF is available at project page +
+
+
+
+
+ + ♻ ☆ How Physics and Background Attributes Impact Video Transformers in + Robotic Manipulation: A Case Study on Planar Pushing IROS 2024 + + +
+ As model and dataset sizes continue to scale in robot learning, the need to
+understand which specific factors in a dataset affect model performance becomes
+increasingly urgent to ensure cost-effective data collection and strong model
+performance. In this work, we empirically investigate how physics attributes
+(color, friction coefficient, shape) and scene background characteristics, such
+as the complexity and dynamics of interactions with background objects,
+influence the performance of Video Transformers in predicting planar pushing
+trajectories. We aim to investigate three primary questions: How do physics
+attributes and background scene characteristics influence model performance?
+What kinds of changes in attributes are most detrimental to model
+generalization? What proportion of fine-tuning data is required to adapt models
+to novel scenarios? To facilitate this research, we present
+CloudGripper-Push-1K, a large real-world vision-based robot pushing dataset
+comprising 1278 hours and 460,000 videos of planar pushing interactions with
+objects with different physics and background attributes. We also propose Video
+Occlusion Transformer (VOT), a generic modular video-transformer-based
+trajectory prediction framework which features three choices of 2D-spatial
+encoders as the subject of our case study. Dataset and code will be available
+at https://cloudgripper.org.
+
+
+ comment: Under review at IEEE/RSJ IROS 2024 +
+
+
+
+
+ + ♻ ☆ SVGDreamer: Text Guided SVG Generation with Diffusion Model + + +
+ Recently, text-guided scalable vector graphics (SVG) synthesis has shown
+promise in domains such as iconography and sketching. However, existing
+text-to-SVG generation methods lack editability and struggle with visual
+quality and result diversity. To address these limitations, we propose a novel
+text-guided vector graphics synthesis method called SVGDreamer. SVGDreamer
+incorporates a semantic-driven image vectorization (SIVE) process that enables
+the decomposition of synthesis into foreground objects and background, thereby
+enhancing editability. Specifically, the SIVE process introduces
+attention-based primitive control and an attention-mask loss function for
+effective control and manipulation of individual elements. Additionally, we
+propose a Vectorized Particle-based Score Distillation (VPSD) approach to
+tackle the challenges of color over-saturation, vector primitives
+over-smoothing, and limited result diversity in existing text-to-SVG generation
+methods. Furthermore, on the basis of VPSD, we introduce Reward Feedback
+Learning (ReFL) to accelerate VPSD convergence and improve aesthetic appeal.
+Extensive experiments have been conducted to validate the effectiveness of
+SVGDreamer, demonstrating its superiority over baseline methods in terms of
+editability, visual quality, and diversity. The code and demo of SVGDreamer can
+be found at https://ximinng.github.io/SVGDreamer-project/
+
+
+ comment: 19 pages, 16 figures, project link: + https://ximinng.github.io/SVGDreamer-project/ +
+
+
+
+
+ + ♻ ☆ 4DGen: Grounded 4D Content Generation with Spatial-temporal Consistency + + +
+ Aided by text-to-image and text-to-video diffusion models, existing 4D +content creation pipelines utilize score distillation sampling to optimize the +entire dynamic 3D scene. However, as these pipelines generate 4D content from +text or image inputs, they incur significant time and effort in prompt +engineering through trial and error. This work introduces 4DGen, a novel, +holistic framework for grounded 4D content creation that decomposes the 4D +generation task into multiple stages. We identify static 3D assets and +monocular video sequences as key components in constructing the 4D content. Our +pipeline facilitates conditional 4D generation, enabling users to specify +geometry (3D assets) and motion (monocular videos), thus offering superior +control over content creation. Furthermore, we construct our 4D representation +using dynamic 3D Gaussians, which permits efficient, high-resolution +supervision through rendering during training, thereby facilitating +high-quality 4D generation. Additionally, we employ spatial-temporal pseudo +labels on anchor frames, along with seamless consistency priors implemented +through 3D-aware score distillation sampling and smoothness regularizations. +Compared to existing baselines, our approach yields competitive results in +faithfully reconstructing input signals and realistically inferring renderings +from novel viewpoints and timesteps. Most importantly, our method supports +grounded generation, offering users enhanced control, a feature difficult to +achieve with previous methods. Project page: +https://vita-group.github.io/4DGen/ + +
+
+ comment: Project page: https://vita-group.github.io/4DGen/ +
+
+
+
+
+ + ♻ ☆ Multi-modal vision-language model for generalizable annotation-free + pathological lesions localization + + +
+ Defining pathologies automatically from medical images aids the understanding
+of the emergence and progression of diseases, and such an ability is crucial in
+clinical diagnostics. However, existing deep learning models heavily rely on
+expert annotations and lack generalization capabilities in open clinical
+environments. In this study, we present a generalizable vision-language
+pre-training model for Annotation-Free pathological lesions Localization
+(AFLoc). The core strength of AFLoc lies in its extensive multi-level semantic
+structure-based contrastive learning, which comprehensively aligns
+multi-granularity medical concepts from reports with abundant image features,
+to adapt to the diverse expressions of pathologies and unseen pathologies
+without relying on image annotations from experts. We demonstrate a proof of
+concept on CXR images, with extensive experimental validation across 4 distinct
+external datasets, encompassing 11 types of chest pathologies. The results
+demonstrate that AFLoc surpasses 6 state-of-the-art methods and even
+outperforms the human benchmark in locating 5 different pathologies. We further
+verify its generalization ability in retinal fundus image pathological lesions
+localization. Our approach showcases AFLoc's versatility and underscores its
+suitability for complex clinical environments.
+
+
+
+
+
+ + ♻ ☆ Multimodal self-supervised learning for lesion localization + + +
+ Multimodal deep learning utilizing imaging and diagnostic reports has made +impressive progress in the field of medical imaging diagnostics, demonstrating +a particularly strong capability for auxiliary diagnosis in cases where +sufficient annotation information is lacking. Nonetheless, localizing diseases +accurately without detailed positional annotations remains a challenge. +Although existing methods have attempted to utilize local information to +achieve fine-grained semantic alignment, their capability in extracting the +fine-grained semantics of the comprehensive context within reports is limited. +To address this problem, a new method is introduced that takes full sentences +from textual reports as the basic units for local semantic alignment. This +approach combines chest X-ray images with their corresponding textual reports, +performing contrastive learning at both global and local levels. The leading +results obtained by this method on multiple datasets confirm its efficacy in +the task of lesion localization. + +
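+ A hedged sketch of sentence-level local alignment: every report sentence is
+matched to its best image region, the matches are pooled into a report-image
+score, and a symmetric InfoNCE loss contrasts paired and unpaired samples
+across the batch. Shapes, the max-pooling choice, and the temperature are
+illustrative assumptions rather than the paper's exact objective.
+
+import torch
+import torch.nn.functional as F
+
+def sentence_region_infonce(sent_emb, region_emb, temperature=0.07):
+    # sent_emb: (batch, num_sentences, dim); region_emb: (batch, num_regions, dim)
+    sent_emb = F.normalize(sent_emb, dim=-1)
+    region_emb = F.normalize(region_emb, dim=-1)
+    # for every report-image pair in the batch: match each sentence to its best
+    # region, then average over sentences to get a pairwise local alignment score
+    sim = torch.einsum("bsd,crd->bcsr", sent_emb, region_emb)
+    logits = sim.max(dim=-1).values.mean(dim=-1) / temperature   # (batch, batch)
+    labels = torch.arange(logits.size(0), device=logits.device)
+    return 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.t(), labels))
+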
+
+
+
+
+ + ♻ ☆ OpenSUN3D: 1st Workshop Challenge on Open-Vocabulary 3D Scene + Understanding ICCV 2023 + + +
+ This report provides an overview of the challenge hosted at the OpenSUN3D +Workshop on Open-Vocabulary 3D Scene Understanding held in conjunction with +ICCV 2023. The goal of this workshop series is to provide a platform for +exploration and discussion of open-vocabulary 3D scene understanding tasks, +including but not limited to segmentation, detection and mapping. We provide an +overview of the challenge hosted at the workshop, present the challenge +dataset, the evaluation methodology, and brief descriptions of the winning +methods. For additional details, please see +https://opensun3d.github.io/index_iccv23.html. + +
+
+ comment: Our OpenSUN3D workshop website for ICCV 2023: + https://opensun3d.github.io/index_iccv23.html +
+
+
+
+
+ + ♻ ☆ Modeling Continuous Motion for 3D Point Cloud Object Tracking + + +
+ The task of 3D single object tracking (SOT) with LiDAR point clouds is +crucial for various applications, such as autonomous driving and robotics. +However, existing approaches have primarily relied on appearance matching or +motion modeling within only two successive frames, thereby overlooking the +long-range continuous motion property of objects in 3D space. To address this +issue, this paper presents a novel approach that views each tracklet as a +continuous stream: at each timestamp, only the current frame is fed into the +network to interact with multi-frame historical features stored in a memory +bank, enabling efficient exploitation of sequential information. To achieve +effective cross-frame message passing, a hybrid attention mechanism is designed +to account for both long-range relation modeling and local geometric feature +extraction. Furthermore, to enhance the utilization of multi-frame features for +robust tracking, a contrastive sequence enhancement strategy is proposed, which +uses ground truth tracklets to augment training sequences and promote +discrimination against false positives in a contrastive manner. Extensive +experiments demonstrate that the proposed method outperforms the +state-of-the-art method by significant margins on multiple benchmarks. + +
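+ The stream-style tracking with a memory bank can be sketched as below:
+current-frame features cross-attend to features of past frames kept in a
+fixed-size bank, approximating the paper's hybrid attention with a single
+standard multi-head attention layer. Bank size, dimensions, and the residual
+fusion are assumptions for illustration.
+
+import torch
+import torch.nn as nn
+from collections import deque
+
+class MemoryBankAttention(nn.Module):
+    """Fuses current-frame features with historical features stored in a bank."""
+    def __init__(self, dim=128, max_frames=8, nhead=4):
+        super().__init__()
+        self.bank = deque(maxlen=max_frames)
+        self.attn = nn.MultiheadAttention(dim, nhead, batch_first=True)
+
+    def forward(self, cur_feat):
+        # cur_feat: (batch, num_points, dim) features of the current frame only
+        if self.bank:
+            memory = torch.cat(list(self.bank), dim=1)   # (batch, T*num_points, dim)
+            fused, _ = self.attn(cur_feat, memory, memory)
+            cur_feat = cur_feat + fused                  # residual fusion
+        self.bank.append(cur_feat.detach())              # store for future frames
+        return cur_feat
+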
+
+
+
+
+ + ♻ ☆ MiM-ISTD: Mamba-in-Mamba for Efficient Infrared Small Target Detection + + +
+ Recently, infrared small target detection (ISTD) has made significant
+progress, thanks to the development of basic models. Specifically, the
+structures combining convolutional networks with transformers can successfully
+extract both local and global features. However, the disadvantage of the
+transformer is also inherited, i.e., the quadratic computational complexity
+with respect to the sequence length. Inspired by the recent basic model with
+linear complexity for long-distance modeling, called Mamba, we explore the
+potential of this state space model for the ISTD task in terms of effectiveness
+and efficiency in this paper. However, directly applying Mamba achieves poor
+performance since local features, which are critical to detecting small
+targets, cannot be fully exploited. Instead, we tailor a Mamba-in-Mamba
+(MiM-ISTD) structure for efficient ISTD. Specifically, we treat the local
+patches as "visual sentences" and use the Outer Mamba to explore the global
+information. We then decompose each visual sentence into sub-patches as "visual
+words" and use the Inner Mamba to further explore the local information among
+words in the visual sentence with negligible computational costs. By
+aggregating the word and sentence features, the MiM-ISTD can effectively
+explore both global and local information. Experiments on NUAA-SIRST and
+IRSTD-1k show the superior accuracy and efficiency of our method. Specifically,
+MiM-ISTD is $10 \times$ faster than the SOTA method and reduces GPU memory
+usage by 73.4$\%$ when testing on a $2048 \times 2048$ image, overcoming the
+computation and memory constraints on high-resolution infrared images. Source
+code is available at https://github.com/txchen-USTC/MiM-ISTD.
+
+
+ comment: The first Mamba-based model for infrared small target detection +
+
+
+
+
+ + ♻ ☆ TempCompass: Do Video LLMs Really Understand Videos? + + +
+ Recently, there has been a surge of interest in video large language models
+(Video LLMs). However, existing benchmarks fail to provide comprehensive
+feedback on the temporal perception ability of Video LLMs. On the one hand,
+most of them are unable to distinguish between different temporal aspects
+(e.g., speed, direction) and thus cannot reflect the nuanced performance on
+these specific aspects. On the other hand, they are limited in the diversity of
+task formats (e.g., only multi-choice QA), which hinders the understanding of
+how temporal perception performance may vary across different types of tasks.
+Motivated by these two problems, we propose the \textbf{TempCompass} benchmark,
+which introduces a diversity of temporal aspects and task formats. To collect
+high-quality test data, we devise two novel strategies: (1) In video
+collection, we construct conflicting videos that share the same static content
+but differ in a specific temporal aspect, which prevents Video LLMs from
+leveraging single-frame bias or language priors. (2) To collect the task
+instructions, we propose a paradigm where humans first annotate
+meta-information for a video and then an LLM generates the instruction. We also
+design an LLM-based approach to automatically and accurately evaluate the
+responses from Video LLMs. Based on TempCompass, we comprehensively evaluate 8
+state-of-the-art (SOTA) Video LLMs and 3 Image LLMs, and reveal that these
+models exhibit notably poor temporal perception ability. The data and
+evaluation code are available at https://github.com/llyx97/TempCompass.
+
+
+
+
+
+ + ♻ ☆ MOFI: Learning Image Representations from Noisy Entity Annotated Images ICLR 2024 + + +
+ We present MOFI, Manifold OF Images, a new vision foundation model designed +to learn image representations from noisy entity annotated images. MOFI differs +from previous work in two key aspects: (i) pre-training data, and (ii) training +recipe. Regarding data, we introduce a new approach to automatically assign +entity labels to images from noisy image-text pairs. Our approach involves +employing a named entity recognition model to extract entities from the +alt-text, and then using a CLIP model to select the correct entities as labels +of the paired image. It's a simple, cost-effective method that can scale to +handle billions of web-mined image-text pairs. Through this method, we have +created Image-to-Entities (I2E), a new dataset with 1 billion images and 2 +million distinct entities, covering rich visual concepts in the wild. Building +upon the I2E dataset, we study different training recipes like supervised +pre-training, contrastive pre-training, and multi-task learning. For +contrastive pre-training, we treat entity names as free-form text, and further +enrich them with entity descriptions. Experiments show that supervised +pre-training with large-scale fine-grained entity labels is highly effective +for image retrieval tasks, and multi-task training further improves the +performance. The final MOFI model achieves 86.66% mAP on the challenging +GPR1200 dataset, surpassing the previous state-of-the-art performance of 72.19% +from OpenAI's CLIP model. Further experiments on zero-shot and linear probe +image classification also show that MOFI outperforms a CLIP model trained on +the original image-text data, demonstrating the effectiveness of the I2E +dataset in learning strong image representations. We release our code and model +weights at https://github.com/apple/ml-mofi. + +
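+ A minimal sketch of the entity-assignment step, using the public CLIP
+checkpoint from Hugging Face to score NER-extracted candidate entities against
+the image and keep the best match; the NER stage is omitted and the checkpoint
+choice is an assumption, not necessarily the model used to build I2E.
+
+import torch
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+def select_entity(image_path, candidate_entities):
+    """Score each candidate entity against the image and keep the best match."""
+    image = Image.open(image_path)
+    inputs = processor(text=candidate_entities, images=image,
+                       return_tensors="pt", padding=True)
+    with torch.no_grad():
+        logits = model(**inputs).logits_per_image   # (1, num_candidates)
+    return candidate_entities[logits.argmax(dim=-1).item()]
+
+# e.g., select_entity("photo.jpg", ["Golden Gate Bridge", "Bay Bridge", "Tower Bridge"])
+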
+
+ comment: Accepted to ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Towards Context-Stable and Visual-Consistent Image Inpainting + + +
+ Recent progress in inpainting increasingly relies on generative models,
+leveraging their strong generation capabilities for addressing large irregular
+masks. However, this enhanced generation often introduces context instability,
+leading to arbitrary object generation within masked regions. This paper
+proposes a balanced solution, emphasizing the importance of unmasked regions in
+guiding inpainting while preserving generation capacity. Our approach, Aligned
+Stable Inpainting with UnKnown Areas Prior (ASUKA), employs a Masked
+Auto-Encoder (MAE) to produce a reconstruction-based prior. Aligned with the
+powerful Stable Diffusion inpainting model (SD), ASUKA significantly improves
+context stability. ASUKA further adopts an inpainting-specialized decoder,
+greatly reducing the color inconsistency issue of SD and thus ensuring more
+visually consistent inpainting. We validate the effectiveness of inpainting
+algorithms on the benchmark dataset Places 2 and a collection of several
+existing datasets, dubbed MISATO, across diverse domains and masking scenarios.
+Results on these benchmark datasets confirm ASUKA's efficacy in both context
+stability and visual consistency compared to SD and other inpainting
+algorithms.
+
+
+ comment: Project page: https://yikai-wang.github.io/asuka/ where full-size PDF + with appendix is available. Dataset: + https://github.com/Yikai-Wang/asuka-misato. Yikai Wang and Chenjie Cao + contribute equally +
+
+
+
+
+ + ♻ ☆ Anomaly Heterogeneity Learning for Open-set Supervised Anomaly Detection CVPR2024 + + +
+ Open-set supervised anomaly detection (OSAD) - a recently emerging anomaly +detection area - aims at utilizing a few samples of anomaly classes seen during +training to detect unseen anomalies (i.e., samples from open-set anomaly +classes), while effectively identifying the seen anomalies. Benefiting from the +prior knowledge illustrated by the seen anomalies, current OSAD methods can +often largely reduce false positive errors. However, these methods are trained +in a closed-set setting and treat the anomaly examples as from a homogeneous +distribution, rendering them less effective in generalizing to unseen anomalies +that can be drawn from any distribution. This paper proposes to learn +heterogeneous anomaly distributions using the limited anomaly examples to +address this issue. To this end, we introduce a novel approach, namely Anomaly +Heterogeneity Learning (AHL), that simulates a diverse set of heterogeneous +anomaly distributions and then utilizes them to learn a unified heterogeneous +abnormality model in surrogate open-set environments. Further, AHL is a generic +framework that existing OSAD models can plug and play for enhancing their +abnormality modeling. Extensive experiments on nine real-world anomaly +detection datasets show that AHL can 1) substantially enhance different +state-of-the-art OSAD models in detecting seen and unseen anomalies, and 2) +effectively generalize to unseen anomalies in new domains. Code is available at +https://github.com/mala-lab/AHL. + +
+
+ comment: Accepted by CVPR2024; 15 pages; 4 figures +
+
+
+
+
+ + ♻ ☆ WeatherDepth: Curriculum Contrastive Learning for Self-Supervised Depth + Estimation under Adverse Weather Conditions ICRA 2024 + + +
+ Depth estimation models have shown promising performance on clear scenes but
+fail to generalize to adverse weather conditions due to illumination
+variations, weather particles, etc. In this paper, we propose WeatherDepth, a
+self-supervised robust depth estimation model with curriculum contrastive
+learning, to tackle performance degradation in complex weather conditions.
+Concretely, we first present a progressive curriculum learning scheme with
+three simple-to-complex curricula to gradually adapt the model from clear to
+relatively adverse, and then to adverse weather scenes. It encourages the model
+to gradually grasp beneficial depth cues against weather effects, yielding
+smoother and better domain adaptation. Meanwhile, to prevent the model from
+forgetting previous curricula, we integrate contrastive learning into different
+curricula. By drawing reference knowledge from the previous course, our
+strategy establishes a depth consistency constraint between different courses
+toward robust depth estimation in diverse weather. Besides, to reduce manual
+intervention and better adapt to different models, we design an adaptive
+curriculum scheduler to automatically search for the best timing for course
+switching. In the experiments, the proposed solution is shown to be easily
+incorporated into various architectures and demonstrates state-of-the-art
+(SoTA) performance on both synthetic and real weather datasets. Source code and
+data are available at \url{https://github.com/wangjiyuan9/WeatherDepth}.
+
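+ A hedged sketch of an adaptive curriculum scheduler: it advances from the
+clear course to progressively more adverse courses once the validation loss
+plateaus. The plateau criterion, patience, and course names are placeholders;
+the paper's switching criterion may differ.
+
+class CurriculumScheduler:
+    """Switches to the next (harder) course when validation loss stops improving."""
+    def __init__(self, courses=("clear", "relative_adverse", "adverse"), patience=3):
+        self.courses = list(courses)
+        self.patience = patience
+        self.idx = 0
+        self.best = float("inf")
+        self.stale = 0
+
+    @property
+    def current(self):
+        return self.courses[self.idx]
+
+    def step(self, val_loss):
+        if val_loss < self.best:
+            self.best, self.stale = val_loss, 0
+        else:
+            self.stale += 1
+        if self.stale >= self.patience and self.idx < len(self.courses) - 1:
+            self.idx += 1            # advance to the next course
+            self.best, self.stale = float("inf"), 0
+        return self.current
+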
+
+ comment: 6 pages, accept by ICRA 2024 +
+
+
+
+
+ + ♻ ☆ CalibFormer: A Transformer-based Automatic LiDAR-Camera Calibration + Network + + +
+ The fusion of LiDARs and cameras has been increasingly adopted in autonomous +driving for perception tasks. The performance of such fusion-based algorithms +largely depends on the accuracy of sensor calibration, which is challenging due +to the difficulty of identifying common features across different data +modalities. Previously, many calibration methods involved specific targets +and/or manual intervention, which has proven to be cumbersome and costly. +Learning-based online calibration methods have been proposed, but their +performance is barely satisfactory in most cases. These methods usually suffer +from issues such as sparse feature maps, unreliable cross-modality association, +inaccurate calibration parameter regression, etc. In this paper, to address +these issues, we propose CalibFormer, an end-to-end network for automatic +LiDAR-camera calibration. We aggregate multiple layers of camera and LiDAR +image features to achieve high-resolution representations. A multi-head +correlation module is utilized to identify correlations between features more +accurately. Lastly, we employ transformer architectures to estimate accurate +calibration parameters from the correlation information. Our method achieved a +mean translation error of $0.8751 \mathrm{cm}$ and a mean rotation error of +$0.0562 ^{\circ}$ on the KITTI dataset, surpassing existing state-of-the-art +methods and demonstrating strong robustness, accuracy, and generalization +capabilities. + +
+
+
+
+
+ + ♻ ☆ Visual Object Tracking on Multi-modal RGB-D Videos: A Review + + +
+ The development of visual object tracking has continued for decades. In
+recent years, with the wide accessibility of low-cost RGB-D sensors, the task
+of visual object tracking on RGB-D videos has drawn much attention. Compared to
+conventional RGB-only tracking, RGB-D videos can provide more information that
+facilitates object tracking in some complicated scenarios. The goal of this
+review is to summarize the relevant knowledge of the research field of RGB-D
+tracking. To be specific, we cover the related RGB-D tracking benchmark
+datasets as well as the corresponding performance measurements. Besides, the
+existing RGB-D tracking methods are summarized in the paper. Moreover, we
+discuss possible future directions in the field of RGB-D tracking.
+
+
+ comment: I prefer not to present this paper due to its subpar quality +
+
+
+
+
+ + ♻ ☆ TensoIR: Tensorial Inverse Rendering + + +
+ We propose TensoIR, a novel inverse rendering approach based on tensor +factorization and neural fields. Unlike previous works that use purely +MLP-based neural fields, thus suffering from low capacity and high computation +costs, we extend TensoRF, a state-of-the-art approach for radiance field +modeling, to estimate scene geometry, surface reflectance, and environment +illumination from multi-view images captured under unknown lighting conditions. +Our approach jointly achieves radiance field reconstruction and +physically-based model estimation, leading to photo-realistic novel view +synthesis and relighting results. Benefiting from the efficiency and +extensibility of the TensoRF-based representation, our method can accurately +model secondary shading effects (like shadows and indirect lighting) and +generally support input images captured under single or multiple unknown +lighting conditions. The low-rank tensor representation allows us to not only +achieve fast and compact reconstruction but also better exploit shared +information under an arbitrary number of capturing lighting conditions. We +demonstrate the superiority of our method to baseline methods qualitatively and +quantitatively on various challenging synthetic and real-world scenes. + +
+
+ comment: Project page: https://haian-jin.github.io/TensoIR +
+
+
+
+
+ + ♻ ☆ Implicit Event-RGBD Neural SLAM CVPR 2024 + + +
+ Implicit neural SLAM has achieved remarkable progress recently. Nevertheless, +existing methods face significant challenges in non-ideal scenarios, such as +motion blur or lighting variation, which often leads to issues like convergence +failures, localization drifts, and distorted mapping. To address these +challenges, we propose EN-SLAM, the first event-RGBD implicit neural SLAM +framework, which effectively leverages the high rate and high dynamic range +advantages of event data for tracking and mapping. Specifically, EN-SLAM +proposes a differentiable CRF (Camera Response Function) rendering technique to +generate distinct RGB and event camera data via a shared radiance field, which +is optimized by learning a unified implicit representation with the captured +event and RGBD supervision. Moreover, based on the temporal difference property +of events, we propose a temporal aggregating optimization strategy for the +event joint tracking and global bundle adjustment, capitalizing on the +consecutive difference constraints of events, significantly enhancing tracking +accuracy and robustness. Finally, we construct the simulated dataset +DEV-Indoors and real captured dataset DEV-Reals containing 6 scenes, 17 +sequences with practical motion blur and lighting changes for evaluations. +Experimental results show that our method outperforms the SOTA methods in both +tracking ATE and mapping ACC with a real-time 17 FPS in various challenging +environments. Project page: https://delinqu.github.io/EN-SLAM. + +
+
+ comment: Accept at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ PubDef: Defending Against Transfer Attacks From Public Models ICLR 2024 + + +
+ Adversarial attacks have been a looming and unaddressed threat in the +industry. However, through a decade-long history of the robustness evaluation +literature, we have learned that mounting a strong or optimal attack is +challenging. It requires both machine learning and domain expertise. In other +words, the white-box threat model, religiously assumed by a large majority of +the past literature, is unrealistic. In this paper, we propose a new practical +threat model where the adversary relies on transfer attacks through publicly +available surrogate models. We argue that this setting will become the most +prevalent for security-sensitive applications in the future. We evaluate the +transfer attacks in this setting and propose a specialized defense method based +on a game-theoretic perspective. The defenses are evaluated under 24 public +models and 11 attack algorithms across three datasets (CIFAR-10, CIFAR-100, and +ImageNet). Under this threat model, our defense, PubDef, outperforms the +state-of-the-art white-box adversarial training by a large margin with almost +no loss in the normal accuracy. For instance, on ImageNet, our defense achieves +62% accuracy under the strongest transfer attack vs only 36% of the best +adversarially trained model. Its accuracy when not under attack is only 2% +lower than that of an undefended model (78% vs 80%). We release our code at +https://github.com/wagner-group/pubdef. + +
+
+ comment: ICLR 2024. Code available at https://github.com/wagner-group/pubdef +
+
+
+
+
+ + ♻ ☆ Generalized Large-Scale Data Condensation via Various Backbone and + Statistical Matching CVPR2024 + + +
+ The lightweight "local-match-global" matching introduced by SRe2L +successfully creates a distilled dataset with comprehensive information on the +full 224x224 ImageNet-1k. However, this one-sided approach is limited to a +particular backbone, layer, and statistics, which limits the improvement of the +generalization of a distilled dataset. We suggest that sufficient and various +"local-match-global" matching are more precise and effective than a single one +and has the ability to create a distilled dataset with richer information and +better generalization. We call this perspective "generalized matching" and +propose Generalized Various Backbone and Statistical Matching (G-VBSM) in this +work, which aims to create a synthetic dataset with densities, ensuring +consistency with the complete dataset across various backbones, layers, and +statistics. As experimentally demonstrated, G-VBSM is the first algorithm to +obtain strong performance across both small-scale and large-scale datasets. +Specifically, G-VBSM achieves a performance of 38.7% on CIFAR-100 with +128-width ConvNet, 47.6% on Tiny-ImageNet with ResNet18, and 31.4% on the full +224x224 ImageNet-1k with ResNet18, under images per class (IPC) 10, 50, and 10, +respectively. These results surpass all SOTA methods by margins of 3.9%, 6.5%, +and 10.1%, respectively. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language + Models + + +
+ Multimodal Large Language Models (MLLMs) rely on a powerful LLM to perform multimodal tasks, showing amazing emergent abilities in recent studies, such as writing poems based on an image. However, such case studies alone cannot fully reflect the performance of MLLMs without a comprehensive evaluation. In this paper, we fill this gap by presenting the first comprehensive MLLM evaluation benchmark, MME. It measures both perception and cognition abilities on a total of 14 subtasks. To avoid data leakage that may arise from directly using public datasets for evaluation, all instruction-answer annotations are manually designed. The concise instruction design allows us to compare MLLMs fairly, instead of struggling with prompt engineering. Besides, with such instructions we can also easily carry out quantitative statistics. A total of 30 advanced MLLMs are comprehensively evaluated on MME, which not only suggests that existing MLLMs still have large room for improvement, but also reveals potential directions for subsequent model optimization. The data application procedure and online leaderboards are released at https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation.
+
+
+ comment: Project page: + https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models +
+
+
+
+
+ + ♻ ☆ MaxQ: Multi-Axis Query for N:M Sparsity Network CVPR2024 + + +
+ N:M sparsity has received increasing attention due to its remarkable +performance and latency trade-off compared with structured and unstructured +sparsity. However, existing N:M sparsity methods do not differentiate the +relative importance of weights among blocks and leave important weights +underappreciated. Besides, they directly apply N:M sparsity to the whole +network, which will cause severe information loss. Thus, they are still +sub-optimal. In this paper, we propose an efficient and effective Multi-Axis +Query methodology, dubbed as MaxQ, to rectify these problems. During the +training, MaxQ employs a dynamic approach to generate soft N:M masks, +considering the weight importance across multiple axes. This method enhances +the weights with more importance and ensures more effective updates. Meanwhile, +a sparsity strategy that gradually increases the percentage of N:M weight +blocks is applied, which allows the network to heal from the pruning-induced +damage progressively. During the runtime, the N:M soft masks can be precomputed +as constants and folded into weights without causing any distortion to the +sparse pattern and incurring additional computational overhead. Comprehensive +experiments demonstrate that MaxQ achieves consistent improvements across +diverse CNN architectures in various computer vision tasks, including image +classification, object detection and instance segmentation. For ResNet50 with +1:16 sparse pattern, MaxQ can achieve 74.6\% top-1 accuracy on ImageNet and +improve by over 2.8\% over the state-of-the-art. Codes and checkpoints are +available at \url{https://github.com/JingyangXiang/MaxQ}. + +
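The core of the abstract is the soft N:M mask: within every block of M consecutive weights, the N most important entries are kept (softly) active. A minimal sketch of that per-block soft masking, assuming magnitude-based importance and a sigmoid relaxation, is given below; MaxQ additionally queries importance across multiple axes and gradually increases the fraction of sparse blocks, which is omitted here.

```python
import torch

def nm_soft_mask(weight: torch.Tensor, n: int = 2, m: int = 4, temperature: float = 0.05):
    """Illustrative soft N:M mask for a 2D weight (out_features, in_features).

    Within every block of M consecutive input weights, entries above the N-th
    largest magnitude get a mask value near 1, the rest near 0.
    """
    out_f, in_f = weight.shape
    blocks = weight.abs().view(out_f, in_f // m, m)
    # The N-th largest magnitude in each block acts as a soft threshold.
    thresh = blocks.topk(n, dim=-1).values[..., -1:]
    soft = torch.sigmoid((blocks - thresh) / temperature)
    return soft.view(out_f, in_f)

w = torch.randn(8, 16)
mask = nm_soft_mask(w)   # applied as w * mask during training
```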
+
+ comment: Accepted by the IEEE/CVF Conference on Computer Vision and Pattern + Recognition 2024 (CVPR2024) +
+
+
+
+
+ + ♻ ☆ Instance-Level Trojan Attacks on Visual Question Answering via + Adversarial Learning in Neuron Activation Space IJCNN 2024 + + +
+ Trojan attacks embed perturbations in input data leading to malicious +behavior in neural network models. A combination of various Trojans in +different modalities enables an adversary to mount a sophisticated attack on +multimodal learning such as Visual Question Answering (VQA). However, +multimodal Trojans in conventional methods are susceptible to parameter +adjustment during processes such as fine-tuning. To this end, we propose an +instance-level multimodal Trojan attack on VQA that efficiently adapts to +fine-tuned models through a dual-modality adversarial learning method. This +method compromises two specific neurons in a specific perturbation layer in the +pretrained model to produce overly large neuron activations. Then, a malicious +correlation between these overactive neurons and the malicious output of a +fine-tuned model is established through adversarial learning. Extensive +experiments are conducted using the VQA-v2 dataset, based on a wide range of +metrics including sample efficiency, stealthiness, and robustness. The proposed +attack demonstrates enhanced performance with diverse vision and text Trojans +tailored for each sample. We demonstrate that the proposed attack can be +efficiently adapted to different fine-tuned models, by injecting only a few +shots of Trojan samples. Moreover, we investigate the attack performance under +conventional defenses, where the defenses cannot effectively mitigate the +attack. + +
+
+ comment: Accepted for IJCNN 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 89 + +
+
+
+ + ☆ Texture Edge detection by Patch consensus (TEP) + + +
+ We propose Texture Edge detection using Patch consensus (TEP) which is a +training-free method to detect the boundary of texture. We propose a new simple +way to identify the texture edge location, using the consensus of segmented +local patch information. While on the boundary, even using local patch +information, the distinction between textures are typically not clear, but +using neighbor consensus give a clear idea of the boundary. We utilize local +patch, and its response against neighboring regions, to emphasize the +similarities and the differences across different textures. The step of +segmentation of response further emphasizes the edge location, and the +neighborhood voting gives consensus and stabilize the edge detection. We +analyze texture as a stationary process to give insight into the patch width +parameter verses the quality of edge detection. We derive the necessary +condition for textures to be distinguished, and analyze the patch width with +respect to the scale of textures. Various experiments are presented to validate +the proposed model. + +
+
+
+
+
+ + ☆ Multiplane Quantitative Phase Imaging Using a Wavelength-Multiplexed + Diffractive Optical Processor + + +
+ Quantitative phase imaging (QPI) is a label-free technique that provides +optical path length information for transparent specimens, finding utility in +biology, materials science, and engineering. Here, we present quantitative +phase imaging of a 3D stack of phase-only objects using a +wavelength-multiplexed diffractive optical processor. Utilizing multiple +spatially engineered diffractive layers trained through deep learning, this +diffractive processor can transform the phase distributions of multiple 2D +objects at various axial positions into intensity patterns, each encoded at a +unique wavelength channel. These wavelength-multiplexed patterns are projected +onto a single field-of-view (FOV) at the output plane of the diffractive +processor, enabling the capture of quantitative phase distributions of input +objects located at different axial planes using an intensity-only image sensor. +Based on numerical simulations, we show that our diffractive processor could +simultaneously achieve all-optical quantitative phase imaging across several +distinct axial planes at the input by scanning the illumination wavelength. A +proof-of-concept experiment with a 3D-fabricated diffractive processor further +validated our approach, showcasing successful imaging of two distinct phase +objects at different axial positions by scanning the illumination wavelength in +the terahertz spectrum. Diffractive network-based multiplane QPI designs can +open up new avenues for compact on-chip phase imaging and sensing devices. + +
+
+ comment: 27 Pages, 9 Figures +
+
+
+
+
+ + ☆ FH-TabNet: Multi-Class Familial Hypercholesterolemia Detection via a + Multi-Stage Tabular Deep Learning + + +
+ Familial Hypercholesterolemia (FH) is a genetic disorder characterized by elevated levels of Low-Density Lipoprotein (LDL) cholesterol or its associated genes. Early-stage and accurate categorization of FH is significant, allowing for timely interventions to mitigate the risk of life-threatening conditions. The conventional diagnostic approach, however, is complex, costly, and a challenging interpretation task even for experienced clinicians, resulting in high underdiagnosis rates. Although there has been a recent surge of interest in using Machine Learning (ML) models for early FH detection, existing solutions only consider a binary classification task and rely solely on classical ML models. Despite its significance, the application of Deep Learning (DL) to FH detection is in its infancy, possibly due to the categorical nature of the underlying clinical data. This paper addresses this gap by introducing FH-TabNet, a multi-stage tabular DL network for multi-class (Definite, Probable, Possible, and Unlikely) FH detection. FH-TabNet initially applies a deep tabular data learning architecture (TabNet) for primary categorization into healthy (Possible/Unlikely) and patient (Probable/Definite) classes. Subsequently, independent TabNet classifiers are applied to each subgroup, enabling refined classification. The model's performance is evaluated through 5-fold cross-validation, illustrating superior performance in categorizing FH patients, particularly in the challenging low-prevalence subcategories.
+
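The two-stage routing described above (healthy vs. patient first, then refinement within each subgroup) can be sketched with any tabular classifier standing in for the TabNet blocks. Below is a minimal illustration using scikit-learn gradient boosting as a placeholder; the label grouping follows the abstract, while everything else (class and parameter names) is assumed.

```python
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

class TwoStageFHClassifier:
    """Hierarchical sketch: stage 1 separates healthy vs. patient,
    stage 2 refines each subgroup. Labels: 0=Unlikely, 1=Possible, 2=Probable, 3=Definite."""

    def __init__(self):
        self.stage1 = GradientBoostingClassifier()        # healthy vs. patient
        self.healthy_clf = GradientBoostingClassifier()   # Unlikely vs. Possible
        self.patient_clf = GradientBoostingClassifier()   # Probable vs. Definite

    def fit(self, X: np.ndarray, y: np.ndarray):
        coarse = (y >= 2).astype(int)
        self.stage1.fit(X, coarse)
        self.healthy_clf.fit(X[y < 2], y[y < 2])
        self.patient_clf.fit(X[y >= 2], y[y >= 2])
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        coarse = self.stage1.predict(X)
        out = np.empty(len(X), dtype=int)
        healthy_idx, patient_idx = coarse == 0, coarse == 1
        if healthy_idx.any():
            out[healthy_idx] = self.healthy_clf.predict(X[healthy_idx])
        if patient_idx.any():
            out[patient_idx] = self.patient_clf.predict(X[patient_idx])
        return out
```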
+
+
+
+
+ + ☆ Reward Guided Latent Consistency Distillation + + +
+ Latent Consistency Distillation (LCD) has emerged as a promising paradigm for +efficient text-to-image synthesis. By distilling a latent consistency model +(LCM) from a pre-trained teacher latent diffusion model (LDM), LCD facilitates +the generation of high-fidelity images within merely 2 to 4 inference steps. +However, the LCM's efficient inference is obtained at the cost of the sample +quality. In this paper, we propose compensating the quality loss by aligning +LCM's output with human preference during training. Specifically, we introduce +Reward Guided LCD (RG-LCD), which integrates feedback from a reward model (RM) +into the LCD process by augmenting the original LCD loss with the objective of +maximizing the reward associated with LCM's single-step generation. As +validated through human evaluation, when trained with the feedback of a good +RM, the 2-step generations from our RG-LCM are favored by humans over the +50-step DDIM samples from the teacher LDM, representing a 25 times inference +acceleration without quality loss. + As directly optimizing towards differentiable RMs can suffer from +over-optimization, we overcome this difficulty by proposing the use of a latent +proxy RM (LRM). This novel component serves as an intermediary, connecting our +LCM with the RM. Empirically, we demonstrate that incorporating the LRM into +our RG-LCD successfully avoids high-frequency noise in the generated images, +contributing to both improved FID on MS-COCO and a higher HPSv2.1 score on +HPSv2's test set, surpassing those achieved by the baseline LCM. + +
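The training objective described above amounts to adding a reward-maximization term to the usual consistency-distillation loss. A minimal sketch of that combination is shown below, assuming a differentiable reward model (the paper further introduces a latent proxy RM, which is not shown); the weighting is illustrative.

```python
import torch

def rg_lcd_loss(lcd_loss: torch.Tensor, images: torch.Tensor, reward_model, reward_weight: float = 1.0):
    """Combine a consistency-distillation loss with a reward term on the
    single-step generations, in the spirit of the abstract. `reward_model`
    is any differentiable scorer mapping images to per-sample rewards.
    """
    reward = reward_model(images).mean()
    # Minimizing this objective maximizes the expected reward of the generations.
    return lcd_loss - reward_weight * reward
```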
+
+ comment: Project page: https://rg-lcd.github.io/ +
+
+
+
+
+ + ☆ EfficientMorph: Parameter-Efficient Transformer-Based Architecture for + 3D Image Registration + + +
+ Transformers have emerged as the state-of-the-art architecture in medical +image registration, outperforming convolutional neural networks (CNNs) by +addressing their limited receptive fields and overcoming gradient instability +in deeper models. Despite their success, transformer-based models require +substantial resources for training, including data, memory, and computational +power, which may restrict their applicability for end users with limited +resources. In particular, existing transformer-based 3D image registration +architectures face three critical gaps that challenge their efficiency and +effectiveness. Firstly, while mitigating the quadratic complexity of full +attention by focusing on local regions, window-based attention mechanisms often +fail to adequately integrate local and global information. Secondly, feature +similarities across attention heads that were recently found in multi-head +attention architectures indicate a significant computational redundancy, +suggesting that the capacity of the network could be better utilized to enhance +performance. Lastly, the granularity of tokenization, a key factor in +registration accuracy, presents a trade-off; smaller tokens improve detail +capture at the cost of higher computational complexity, increased memory +demands, and a risk of overfitting. Here, we propose EfficientMorph, a +transformer-based architecture for unsupervised 3D image registration. It +optimizes the balance between local and global attention through a plane-based +attention mechanism, reduces computational redundancy via cascaded group +attention, and captures fine details without compromising computational +efficiency, thanks to a Hi-Res tokenization strategy complemented by merging +operations. Notably, EfficientMorph sets a new benchmark for performance on the +OASIS dataset with 16-27x fewer parameters. + +
+
+
+
+
+ + ☆ Fast Sparse View Guided NeRF Update for Object Reconfigurations + + +
+ Neural Radiance Field (NeRF), as an implicit 3D scene representation, lacks the inherent ability to accommodate changes made to the initial static scene. If objects are reconfigured, it is difficult to update the NeRF to reflect the new state of the scene without time-consuming data re-capturing and NeRF re-training. To address this limitation, we develop the first method for updating NeRFs in response to physical changes. Our method takes only sparse new images (e.g. 4) of the altered scene as extra inputs and updates the pre-trained NeRF in around 1 to 2 minutes. In particular, we develop a pipeline to identify scene changes and update the NeRF accordingly. Our core idea is the use of a second, helper NeRF to learn the local geometry and appearance changes, which sidesteps the optimization difficulties of direct NeRF fine-tuning. The interpolation power of the helper NeRF is key to accurately reconstructing the un-occluded object regions under sparse view supervision. Our method imposes no constraints on NeRF pre-training, and requires no extra user input or explicit semantic priors. It is an order of magnitude faster than re-training NeRF from scratch while maintaining on-par and even superior performance.
+
+
+
+
+
+ + ☆ Neuro-Symbolic Video Search + + +
+ The unprecedented surge in video data production in recent years necessitates +efficient tools to extract meaningful frames from videos for downstream tasks. +Long-term temporal reasoning is a key desideratum for frame retrieval systems. +While state-of-the-art foundation models, like VideoLLaMA and ViCLIP, are +proficient in short-term semantic understanding, they surprisingly fail at +long-term reasoning across frames. A key reason for this failure is that they +intertwine per-frame perception and temporal reasoning into a single deep +network. Hence, decoupling but co-designing semantic understanding and temporal +reasoning is essential for efficient scene identification. We propose a system +that leverages vision-language models for semantic understanding of individual +frames but effectively reasons about the long-term evolution of events using +state machines and temporal logic (TL) formulae that inherently capture memory. +Our TL-based reasoning improves the F1 score of complex event identification by +9-15% compared to benchmarks that use GPT4 for reasoning on state-of-the-art +self-driving datasets such as Waymo and NuScenes. + +
+
+
+
+
+ + ☆ MASSM: An End-to-End Deep Learning Framework for Multi-Anatomy + Statistical Shape Modeling Directly From Images + + +
+ Statistical Shape Modeling (SSM) is an effective method for quantitatively +analyzing anatomical variations within populations. However, its utility is +limited by the need for manual segmentations of anatomies, a task that relies +on the scarce expertise of medical professionals. Recent advances in deep +learning have provided a promising approach that automatically generates +statistical representations from unsegmented images. Once trained, these deep +learning-based models eliminate the need for manual segmentation for new +subjects. Nonetheless, most current methods still require manual pre-alignment +of image volumes and specifying a bounding box around the target anatomy prior +for inference, resulting in a partially manual inference process. Recent +approaches facilitate anatomy localization but only estimate statistical +representations at the population level. However, they cannot delineate anatomy +directly in images and are limited to modeling a single anatomy. Here, we +introduce MASSM, a novel end-to-end deep learning framework that simultaneously +localizes multiple anatomies in an image, estimates population-level +statistical representations, and delineates each anatomy. Our findings +emphasize the crucial role of local correspondences, showcasing their +indispensability in providing superior shape information for medical imaging +tasks. + +
+
+
+
+
+ + ☆ Topologically faithful multi-class segmentation in medical images + + +
+ Topological accuracy in medical image segmentation is a highly important +property for downstream applications such as network analysis and flow modeling +in vessels or cell counting. Recently, significant methodological advancements +have brought well-founded concepts from algebraic topology to binary +segmentation. However, these approaches have been underexplored in multi-class +segmentation scenarios, where topological errors are common. We propose a +general loss function for topologically faithful multi-class segmentation +extending the recent Betti matching concept, which is based on induced +matchings of persistence barcodes. We project the N-class segmentation problem +to N single-class segmentation tasks, which allows us to use 1-parameter +persistent homology making training of neural networks computationally +feasible. We validate our method on a comprehensive set of four medical +datasets with highly variant topological characteristics. Our loss formulation +significantly enhances topological correctness in cardiac, cell, artery-vein, +and Circle of Willis segmentation. + +
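The projection of the N-class problem onto N single-class tasks can be sketched directly: take per-class probabilities, build one-vs-rest ground-truth masks, and sum a single-class topology-aware loss over the classes. The snippet below illustrates this decomposition, assuming a `binary_topo_loss` callable (e.g. a Betti-matching loss) that is not implemented here.

```python
import torch
import torch.nn.functional as F

def multiclass_topo_loss(logits: torch.Tensor, target: torch.Tensor, binary_topo_loss):
    """One-vs-rest decomposition of an N-class segmentation problem.

    logits: (B, N, H, W) raw scores; target: (B, H, W) integer labels.
    binary_topo_loss: callable (pred_prob, gt_mask) -> scalar, e.g. a
    single-class Betti-matching loss supplied by the user.
    """
    probs = logits.softmax(dim=1)
    num_classes = probs.shape[1]
    onehot = F.one_hot(target.long(), num_classes).permute(0, 3, 1, 2).float()
    # Sum the single-class topology loss over the N one-vs-rest problems.
    return sum(binary_topo_loss(probs[:, c], onehot[:, c]) for c in range(num_classes))
```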
+
+
+
+
+ + ☆ N2F2: Hierarchical Scene Understanding with Nested Neural Feature Fields + + +
+ Understanding complex scenes at multiple levels of abstraction remains a +formidable challenge in computer vision. To address this, we introduce Nested +Neural Feature Fields (N2F2), a novel approach that employs hierarchical +supervision to learn a single feature field, wherein different dimensions +within the same high-dimensional feature encode scene properties at varying +granularities. Our method allows for a flexible definition of hierarchies, +tailored to either the physical dimensions or semantics or both, thereby +enabling a comprehensive and nuanced understanding of scenes. We leverage a 2D +class-agnostic segmentation model to provide semantically meaningful pixel +groupings at arbitrary scales in the image space, and query the CLIP +vision-encoder to obtain language-aligned embeddings for each of these +segments. Our proposed hierarchical supervision method then assigns different +nested dimensions of the feature field to distill the CLIP embeddings using +deferred volumetric rendering at varying physical scales, creating a +coarse-to-fine representation. Extensive experiments show that our approach +outperforms the state-of-the-art feature field distillation methods on tasks +such as open-vocabulary 3D segmentation and localization, demonstrating the +effectiveness of the learned nested feature field. + +
+
+
+
+
+ + ☆ Boosting Flow-based Generative Super-Resolution Models via Learned Prior CVPR2024 + + +
+ Flow-based super-resolution (SR) models have demonstrated astonishing +capabilities in generating high-quality images. However, these methods +encounter several challenges during image generation, such as grid artifacts, +exploding inverses, and suboptimal results due to a fixed sampling temperature. +To overcome these issues, this work introduces a conditional learned prior to +the inference phase of a flow-based SR model. This prior is a latent code +predicted by our proposed latent module conditioned on the low-resolution +image, which is then transformed by the flow model into an SR image. Our +framework is designed to seamlessly integrate with any contemporary flow-based +SR model without modifying its architecture or pre-trained weights. We evaluate +the effectiveness of our proposed framework through extensive experiments and +ablation analyses. The proposed framework successfully addresses all the +inherent issues in flow-based SR models and enhances their performance in +various SR scenarios. Our code is available at: +https://github.com/liyuantsao/FlowSR-LP + +
+
+ comment: Accepted to CVPR2024 +
+
+
+
+
+ + ☆ OMG: Occlusion-friendly Personalized Multi-concept Generation in + Diffusion Models + + +
+ Personalization is an important topic in text-to-image generation, especially +the challenging multi-concept personalization. Current multi-concept methods +are struggling with identity preservation, occlusion, and the harmony between +foreground and background. In this work, we propose OMG, an occlusion-friendly +personalized generation framework designed to seamlessly integrate multiple +concepts within a single image. We propose a novel two-stage sampling solution. +The first stage takes charge of layout generation and visual comprehension +information collection for handling occlusions. The second one utilizes the +acquired visual comprehension information and the designed noise blending to +integrate multiple concepts while considering occlusions. We also observe that +the initiation denoising timestep for noise blending is the key to identity +preservation and layout. Moreover, our method can be combined with various +single-concept models, such as LoRA and InstantID without additional tuning. +Especially, LoRA models on civitai.com can be exploited directly. Extensive +experiments demonstrate that OMG exhibits superior performance in multi-concept +personalization. + +
+
+ comment: Homepage: https://kongzhecn.github.io/omg-project/ Github: + https://github.com/kongzhecn/OMG/ +
+
+
+
+
+ + ☆ Automatic Spatial Calibration of Near-Field MIMO Radar With Respect to + Optical Sensors + + +
+ Despite an emerging interest in MIMO radar, the utilization of its complementary strengths in combination with optical sensors has so far been limited to far-field applications, due to the challenges that arise from mutual sensor calibration in the near field. In fact, most related approaches in the autonomous industry propose target-based calibration methods using corner reflectors that have proven to be unsuitable for the near field. In contrast, we propose a novel, joint calibration approach for optical RGB-D sensors and MIMO radars that is designed to operate in the radar's near-field range, within decimeters from the sensors. Our pipeline consists of a bespoke calibration target, allowing for automatic target detection and localization, followed by the spatial calibration of the two sensor coordinate systems through target registration. We validate our approach using two different depth sensing technologies from the optical domain. The experiments show the efficiency and accuracy of our calibration for various target displacements, as well as the robustness of our localization against signal ambiguities.
+
+
+ comment: 8 pages, 9 figures +
+
+
+
+
+ + ☆ Task-Aware Low-Rank Adaptation of Segment Anything Model + + +
+ The Segment Anything Model (SAM), with its remarkable zero-shot capability, +has been proven to be a powerful foundation model for image segmentation tasks, +which is an important task in computer vision. However, the transfer of its +rich semantic information to multiple different downstream tasks remains +unexplored. In this paper, we propose the Task-Aware Low-Rank Adaptation +(TA-LoRA) method, which enables SAM to work as a foundation model for +multi-task learning. Specifically, TA-LoRA injects an update parameter tensor +into each layer of the encoder in SAM and leverages a low-rank tensor +decomposition method to incorporate both task-shared and task-specific +information. Furthermore, we introduce modified SAM (mSAM) for multi-task +learning where we remove the prompt encoder of SAM and use task-specific no +mask embeddings and mask decoder for each task. Extensive experiments conducted +on benchmark datasets substantiate the efficacy of TA-LoRA in enhancing the +performance of mSAM across multiple downstream tasks. + +
+
+
+
+
+ + ☆ Exploiting Topological Prior for Boosting Point Cloud Generation + + +
+ This paper presents an innovative enhancement to the Sphere as Prior +Generative Adversarial Network (SP-GAN) model, a state-of-the-art GAN designed +for point cloud generation. A novel method is introduced for point cloud +generation that elevates the structural integrity and overall quality of the +generated point clouds by incorporating topological priors into the training +process of the generator. Specifically, this work utilizes the K-means +algorithm to segment a point cloud from the repository into clusters and +extract centroids, which are then used as priors in the generation process of +the SP-GAN. Furthermore, the discriminator component of the SP-GAN utilizes the +identical point cloud that contributed the centroids, ensuring a coherent and +consistent learning environment. This strategic use of centroids as intuitive +guides not only boosts the efficiency of global feature learning but also +substantially improves the structural coherence and fidelity of the generated +point clouds. By applying the K-means algorithm to generate centroids as the +prior, the work intuitively and experimentally demonstrates that such a prior +enhances the quality of generated point clouds. + +
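Extracting the centroid prior is straightforward; the sketch below shows the K-means step on a reference point cloud, with an arbitrary cluster count. How the centroids are injected into SP-GAN's generator and discriminator is not reproduced here.

```python
import numpy as np
from sklearn.cluster import KMeans

def centroid_prior(point_cloud: np.ndarray, k: int = 32) -> np.ndarray:
    """Cluster a reference point cloud (N, 3) and return the K centroids
    that serve as the topological prior for the generator."""
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(point_cloud)
    return km.cluster_centers_   # (k, 3) prior points
```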
+
+ comment: 7 pages, 3 figures, AIDML 2024 +
+
+
+
+
+ + ☆ Ctrl123: Consistent Novel View Synthesis via Closed-Loop Transcription + + +
+ Large image diffusion models have demonstrated zero-shot capability in novel +view synthesis (NVS). However, existing diffusion-based NVS methods struggle to +generate novel views that are accurately consistent with the corresponding +ground truth poses and appearances, even on the training set. This consequently +limits the performance of downstream tasks, such as image-to-multiview +generation and 3D reconstruction. We realize that such inconsistency is largely +due to the fact that it is difficult to enforce accurate pose and appearance +alignment directly in the diffusion training, as mostly done by existing +methods such as Zero123. To remedy this problem, we propose Ctrl123, a +closed-loop transcription-based NVS diffusion method that enforces alignment +between the generated view and ground truth in a pose-sensitive feature space. +Our extensive experiments demonstrate the effectiveness of Ctrl123 on the tasks +of NVS and 3D reconstruction, achieving significant improvements in both +multiview-consistency and pose-consistency over existing methods. + +
+
+
+
+
+ + ☆ ScanTalk: 3D Talking Heads from Unregistered Scans + + +
+ Speech-driven 3D talking heads generation has emerged as a significant area +of interest among researchers, presenting numerous challenges. Existing methods +are constrained by animating faces with fixed topologies, wherein point-wise +correspondence is established, and the number and order of points remains +consistent across all identities the model can animate. In this work, we +present ScanTalk, a novel framework capable of animating 3D faces in arbitrary +topologies including scanned data. Our approach relies on the DiffusionNet +architecture to overcome the fixed topology constraint, offering promising +avenues for more flexible and realistic 3D animations. By leveraging the power +of DiffusionNet, ScanTalk not only adapts to diverse facial structures but also +maintains fidelity when dealing with scanned data, thereby enhancing the +authenticity and versatility of generated 3D talking heads. Through +comprehensive comparisons with state-of-the-art methods, we validate the +efficacy of our approach, demonstrating its capacity to generate realistic +talking heads comparable to existing techniques. While our primary objective is +to develop a generic method free from topological constraints, all +state-of-the-art methodologies are bound by such limitations. Code for +reproducing our results, and the pre-trained model will be made available. + +
+
+
+
+
+ + ☆ Channel-wise Feature Decorrelation for Enhanced Learned Image + Compression + + +
+ The emerging Learned Compression (LC) replaces the traditional codec modules with Deep Neural Networks (DNNs), which are trained end-to-end for rate-distortion performance. This approach is considered the future of image/video compression, and major efforts have been dedicated to improving its compression efficiency. However, most proposed works target compression efficiency by employing more complex DNNs, which leads to higher computational complexity. Alternatively, this paper proposes to improve compression by fully exploiting the existing DNN capacity. To do so, the latent features are guided to learn a richer and more diverse set of features, which corresponds to better reconstruction. A channel-wise feature decorrelation loss is designed and integrated into the LC optimization. Three strategies are proposed and evaluated, which optimize (1) the transformation network, (2) the context model, and (3) both networks. Experimental results on two established LC methods show that the proposed method improves the compression with a BD-Rate gain of up to 8.06%, with no added complexity. The proposed solution can be applied as a plug-and-play solution to optimize any similar LC method.
+
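A channel-wise decorrelation loss of the kind described can be written as a penalty on the off-diagonal entries of the correlation matrix of the latent channels. The sketch below is a generic version of that idea; the exact formulation and weighting used in the paper may differ.

```python
import torch

def channel_decorrelation_loss(latent: torch.Tensor) -> torch.Tensor:
    """Penalize off-diagonal channel correlations of a (B, C, H, W) latent tensor,
    encouraging the channels to carry diverse (decorrelated) features."""
    b, c, h, w = latent.shape
    x = latent.reshape(b, c, -1)
    x = x - x.mean(dim=-1, keepdim=True)
    x = x / (x.std(dim=-1, keepdim=True) + 1e-6)
    corr = torch.bmm(x, x.transpose(1, 2)) / x.shape[-1]          # (B, C, C)
    off_diag = corr - torch.diag_embed(torch.diagonal(corr, dim1=1, dim2=2))
    return off_diag.pow(2).mean()
```

In practice such a term would be added to the rate-distortion objective with a small weight so it guides the transform without hurting compression.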
+
+
+
+
+ + ☆ Understanding Robustness of Visual State Space Models for Image + Classification + + +
+ Visual State Space Model (VMamba) has recently emerged as a promising +architecture, exhibiting remarkable performance in various computer vision +tasks. However, its robustness has not yet been thoroughly studied. In this +paper, we delve into the robustness of this architecture through comprehensive +investigations from multiple perspectives. Firstly, we investigate its +robustness to adversarial attacks, employing both whole-image and +patch-specific adversarial attacks. Results demonstrate superior adversarial +robustness compared to Transformer architectures while revealing scalability +weaknesses. Secondly, the general robustness of VMamba is assessed against +diverse scenarios, including natural adversarial examples, out-of-distribution +data, and common corruptions. VMamba exhibits exceptional generalizability with +out-of-distribution data but shows scalability weaknesses against natural +adversarial examples and common corruptions. Additionally, we explore VMamba's +gradients and back-propagation during white-box attacks, uncovering unique +vulnerabilities and defensive capabilities of its novel components. Lastly, the +sensitivity of VMamba to image structure variations is examined, highlighting +vulnerabilities associated with the distribution of disturbance areas and +spatial information, with increased susceptibility closer to the image center. +Through these comprehensive studies, we contribute to a deeper understanding of +VMamba's robustness, providing valuable insights for refining and advancing the +capabilities of deep neural networks in computer vision applications. + +
+
+ comment: 27 pages +
+
+
+
+
+ + ☆ Uncertainty-Aware Adapter: Adapting Segment Anything Model (SAM) for + Ambiguous Medical Image Segmentation + + +
+ The Segment Anything Model (SAM) has achieved significant success in natural image segmentation, and many methods have tried to fine-tune it for medical image segmentation. An efficient way to do so is by using Adapters, specialized modules that learn just a few parameters to tailor SAM specifically for medical images. However, unlike natural images, many tissues and lesions in medical images have blurry boundaries and may be ambiguous. Previous efforts to adapt SAM ignore this challenge and can only predict a single distinct segmentation. This may mislead clinicians or cause misdiagnosis, especially when encountering rare variants or situations with low model confidence. In this work, we propose a novel module called the Uncertainty-aware Adapter, which efficiently fine-tunes SAM for uncertainty-aware medical image segmentation. Utilizing a conditional variational autoencoder, we encode stochastic samples to effectively represent the inherent uncertainty in medical imaging. We design a new module on top of a standard adapter that uses a condition-based strategy to interact with these samples and help SAM integrate uncertainty. We evaluate our method on two multi-annotated datasets with different modalities: LIDC-IDRI (lung abnormality segmentation) and REFUGE2 (optic-cup segmentation). The experimental results show that the proposed model outperforms all previous methods and achieves a new state-of-the-art (SOTA) on both benchmarks. We also demonstrate that our method can generate diverse segmentation hypotheses that are more realistic as well as heterogeneous.
+
+
+
+
+
+ + ☆ Learning Dual-Level Deformable Implicit Representation for Real-World + Scale Arbitrary Super-Resolution + + +
+ Scale arbitrary super-resolution based on implicit image function gains +increasing popularity since it can better represent the visual world in a +continuous manner. However, existing scale arbitrary works are trained and +evaluated on simulated datasets, where low-resolution images are generated from +their ground truths by the simplest bicubic downsampling. These models exhibit +limited generalization to real-world scenarios due to the greater complexity of +real-world degradations. To address this issue, we build a RealArbiSR dataset, +a new real-world super-resolution benchmark with both integer and non-integer +scaling factors for the training and evaluation of real-world scale arbitrary +super-resolution. Moreover, we propose a Dual-level Deformable Implicit +Representation (DDIR) to solve real-world scale arbitrary super-resolution. +Specifically, we design the appearance embedding and deformation field to +handle both image-level and pixel-level deformations caused by real-world +degradations. The appearance embedding models the characteristics of +low-resolution inputs to deal with photometric variations at different scales, +and the pixel-based deformation field learns RGB differences which result from +the deviations between the real-world and simulated degradations at arbitrary +coordinates. Extensive experiments show our trained model achieves +state-of-the-art performance on the RealArbiSR and RealSR benchmarks for +real-world scale arbitrary super-resolution. Our dataset as well as source code +will be publicly available. + +
+
+
+
+
+ + ☆ FishNet: Deep Neural Networks for Low-Cost Fish Stock Estimation + + +
+ Fish stock assessment often involves manual fish counting by taxonomy specialists, which is both time-consuming and costly. We propose an automated computer vision system that performs both taxonomic classification and fish size estimation from images taken with a low-cost digital camera. The system first performs object detection and segmentation using a Mask R-CNN to identify individual fish in images containing multiple fish, possibly of different species. Each fish is then classified by species and its length is predicted using separate machine learning models. These models are trained on a dataset of 50,000 hand-annotated images containing 163 different fish species, ranging in length from 10cm to 250cm. Evaluated on held-out test data, our system achieves a $92\%$ intersection over union on the fish segmentation task, an $89\%$ top-1 classification accuracy on single fish species classification, and a $2.3$~cm mean error on the fish length estimation task.
+
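The pipeline reduces to three stages: instance detection/segmentation, per-fish species classification, and per-fish length regression. The placeholder sketch below only shows how those stages compose; the detector, classifier, and regressor objects are stand-ins, not the authors' models.

```python
def estimate_stock(image, detector, species_clf, length_reg):
    """Compose the three stages of the described system.

    detector(image) is assumed to yield (crop, mask) pairs per detected fish;
    species_clf and length_reg are per-fish models trained separately.
    """
    results = []
    for crop, mask in detector(image):
        species = species_clf(crop)          # per-fish species label
        length_cm = length_reg(crop, mask)   # per-fish length estimate in cm
        results.append({"species": species, "length_cm": length_cm})
    return results
```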
+
+ comment: Under review +
+
+
+
+
+ + ☆ Automatic location detection based on deep learning + + +
+ The proliferation of digital images and the advancements in deep learning have paved the way for innovative solutions in various domains, especially in the field of image classification. Our project presents an in-depth study and implementation of an image classification system specifically tailored to identify and classify images of Indian cities. Drawing on an extensive dataset, our model classifies images into five major Indian cities/states: Ahmedabad, Delhi, Kerala, Kolkata, and Mumbai, recognizing the distinct features and characteristics of each city/state. To achieve high precision and recall rates, we adopted two approaches: first, a vanilla Convolutional Neural Network (CNN), and then transfer learning leveraging the VGG16 model. The vanilla CNN achieved commendable accuracy, and the VGG16 model achieved a test accuracy of 63.6%. Evaluations highlighted the strengths and potential areas of improvement, positioning our model as not only competitive but also scalable for broader applications. With an emphasis on an open-source ethos, our work aims to contribute to the community, encouraging further development and diverse applications. Our findings demonstrate potential applications in tourism, urban planning, and even real-time location identification systems, among others.
+
+
+
+
+
+ + ☆ Efficient Diffusion-Driven Corruption Editor for Test-Time Adaptation + + +
+ Test-time adaptation (TTA) addresses unforeseen distribution shifts occurring during test time. In TTA, performance as well as memory and time consumption are crucial considerations. A recent diffusion-based TTA approach for restoring corrupted images involves image-level updates. However, using pixel-space diffusion significantly increases resource requirements compared to conventional model-updating TTA approaches, revealing limitations as a TTA method. To address this, we propose a novel TTA method that leverages a latent diffusion model (LDM) based image editing model and fine-tunes it with our newly introduced corruption modeling scheme. This scheme enhances the robustness of the diffusion model against distribution shifts by creating (clean, corrupted) image pairs and fine-tuning the model to edit corrupted images into clean ones. Moreover, we introduce a distilled variant to accelerate the model for corruption editing using only 4 network function evaluations (NFEs). We extensively validated our method across various architectures and datasets including image and video domains. Our model achieves the best performance with a 100 times faster runtime than that of a diffusion-based baseline. Furthermore, it is three times faster than the model-updating TTA method based on data augmentation, making an image-level updating approach more practical.
+
+
+
+
+
+ + ☆ HourglassNeRF: Casting an Hourglass as a Bundle of Rays for Few-shot + Neural Rendering + + +
+ Recent advancements in the Neural Radiance Field (NeRF) have bolstered its +capabilities for novel view synthesis, yet its reliance on dense multi-view +training images poses a practical challenge. Addressing this, we propose +HourglassNeRF, an effective regularization-based approach with a novel +hourglass casting strategy. Our proposed hourglass is conceptualized as a +bundle of additional rays within the area between the original input ray and +its corresponding reflection ray, by featurizing the conical frustum via +Integrated Positional Encoding (IPE). This design expands the coverage of +unseen views and enables an adaptive high-frequency regularization based on +target pixel photo-consistency. Furthermore, we propose luminance consistency +regularization based on the Lambertian assumption, which is known to be +effective for training a set of augmented rays under the few-shot setting. +Leveraging the inherent property of a Lambertian surface, which retains +consistent luminance irrespective of the viewing angle, we assume our proposed +hourglass as a collection of flipped diffuse reflection rays and enhance the +luminance consistency between the original input ray and its corresponding +hourglass, resulting in more physically grounded training framework and +performance improvement. Our HourglassNeRF outperforms its baseline and +achieves competitive results on multiple benchmarks with sharply rendered fine +details. The code will be available. + +
+
+ comment: 21 pages, 11 figures +
+
+
+
+
+ + ☆ Urban Sound Propagation: a Benchmark for 1-Step Generative Modeling of + Complex Physical Systems + + +
+ Data-driven modeling of complex physical systems is receiving a growing +amount of attention in the simulation and machine learning communities. Since +most physical simulations are based on compute-intensive, iterative +implementations of differential equation systems, a (partial) replacement with +learned, 1-step inference models has the potential for significant speedups in +a wide range of application areas. In this context, we present a novel +benchmark for the evaluation of 1-step generative learning models in terms of +speed and physical correctness. Our Urban Sound Propagation benchmark is based +on the physically complex and practically relevant, yet intuitively easy to +grasp task of modeling the 2d propagation of waves from a sound source in an +urban environment. We provide a dataset with 100k samples, where each sample +consists of pairs of real 2d building maps drawn from OpenStreetmap, a +parameterized sound source, and a simulated ground truth sound propagation for +the given scene. The dataset provides four different simulation tasks with +increasing complexity regarding reflection, diffraction and source variance. A +first baseline evaluation of common generative U-Net, GAN and Diffusion models +shows, that while these models are very well capable of modeling sound +propagations in simple cases, the approximation of sub-systems represented by +higher order equations systematically fails. Information about the dataset, +download instructions and source codes are provided on our anonymous website: +https://www.urban-sound-data.org. + +
+
+
+
+
+ + ☆ Rethinking Multi-view Representation Learning via Distilled + Disentangling CVPR 2024 + + +
+ Multi-view representation learning aims to derive robust representations that +are both view-consistent and view-specific from diverse data sources. This +paper presents an in-depth analysis of existing approaches in this domain, +highlighting a commonly overlooked aspect: the redundancy between +view-consistent and view-specific representations. To this end, we propose an +innovative framework for multi-view representation learning, which incorporates +a technique we term 'distilled disentangling'. Our method introduces the +concept of masked cross-view prediction, enabling the extraction of compact, +high-quality view-consistent representations from various sources without +incurring extra computational overhead. Additionally, we develop a distilled +disentangling module that efficiently filters out consistency-related +information from multi-view representations, resulting in purer view-specific +representations. This approach significantly reduces redundancy between +view-consistent and view-specific representations, enhancing the overall +efficiency of the learning process. Our empirical evaluations reveal that +higher mask ratios substantially improve the quality of view-consistent +representations. Moreover, we find that reducing the dimensionality of +view-consistent representations relative to that of view-specific +representations further refines the quality of the combined representations. +Our code is accessible at: https://github.com/Guanzhou-Ke/MRDD. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ LuoJiaHOG: A Hierarchy Oriented Geo-aware Image Caption Dataset for + Remote Sensing Image-Text Retrival + + +
+ Image-text retrieval (ITR) plays a significant role in making informed decisions for various remote sensing (RS) applications. Nonetheless, creating ITR datasets that contain vision and language modalities requires not only a significant geo-spatial sampling area but also varying categories and detailed descriptions. To this end, we introduce an image caption dataset, LuojiaHOG, which is geospatial-aware, label-extension-friendly and comprehensively captioned. LuojiaHOG involves hierarchical spatial sampling, an extensible classification system aligned with Open Geospatial Consortium (OGC) standards, and detailed caption generation. In addition, we propose a CLIP-based Image Semantic Enhancement Network (CISEN) to promote sophisticated ITR. CISEN consists of two components, namely dual-path knowledge transfer and progressive cross-modal feature fusion. Comprehensive statistics on LuojiaHOG reveal its richness in sampling diversity, label quantity and description granularity. The evaluation on LuojiaHOG is conducted across various state-of-the-art ITR models, including ALBEF, ALIGN, CLIP, FILIP, Wukong, GeoRSCLIP and CISEN. We use second- and third-level labels to evaluate these vision-language models through adapter-tuning, and CISEN demonstrates superior performance. For instance, it achieves the highest scores, with WMAP@5 of 88.47\% and 87.28\% on the second- and third-level ITR tasks, respectively. In particular, CISEN exhibits an improvement of approximately 1.3\% and 0.9\% in terms of WMAP@5 compared to its baseline. These findings highlight the advances of CISEN in accurately retrieving pertinent information across image and text. LuojiaHOG and CISEN can serve as a foundational resource for future RS image-text alignment research, facilitating a wide range of vision-language applications.
+
+
+
+
+
+ + ☆ Could We Generate Cytology Images from Histopathology Images? An + Empirical Study + + +
+ Automation in medical imaging is quite challenging due to the unavailability of annotated datasets and the scarcity of domain experts. In recent years, deep learning techniques have solved some complex medical imaging tasks like disease classification, important object localization, segmentation, etc. However, most of these tasks require a large amount of annotated data for successful implementation. To mitigate the shortage of data, different generative models have been proposed for data augmentation, which can boost classification performance. To this end, different synthetic medical image generation models have been developed to enlarge datasets, with unpaired image-to-image translation models shifting the source domain to the target domain. In the breast malignancy identification domain, FNAC is one of the low-cost, low-invasive modalities normally used by medical practitioners, but the availability of public datasets in this domain is very poor. For the automation of cytology image analysis, however, a large amount of annotated data is needed. Therefore, synthetic cytology images are generated by translating breast histopathology samples that are publicly available. In this study, we have explored traditional image-to-image transfer models like CycleGAN and Neural Style Transfer. Furthermore, we observe that the generated cytology images are quite similar to real breast cytology samples, as measured by FID and KID scores.
+
+
+ comment: Accepted at the International Conference on Advanced Computing and
  Applications (ICACA-2024)
+
+
+
+
+
+ + ☆ Fuzzy Rank-based Late Fusion Technique for Cytology image Segmentation ICDE + + +
+ Cytology image segmentation is quite challenging due to complex cellular structures and multiple overlapping regions. On the other hand, supervised machine learning techniques need a large amount of annotated data, which is costly. In recent years, late fusion techniques have shown promising performance in the field of image classification. In this paper, we explore a fuzzy-based late fusion technique for cytology image segmentation. This fusion rule integrates three traditional semantic segmentation models: UNet, SegNet, and PSPNet. The technique is applied to two cytology image datasets, i.e., the cervical cytology (HErlev) and breast cytology (JUCYT-v1) image datasets. With the proposed late fusion technique, we achieve maximum MeanIoU scores of 84.27% and 83.79% on the HErlev and JUCYT-v1 datasets, respectively, which are better than those of traditional fusion rules such as average probability, geometric mean, Borda count, etc. The codes of the proposed model are available on GitHub.
+
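At a high level, the late fusion combines per-pixel class confidences from the three segmentation networks after passing them through a fuzzy membership, so that no single over-confident model dominates the final decision. The sketch below is one generic way to write such a pixel-wise fuzzy fusion; the paper's exact rank-generation functions may differ.

```python
import numpy as np

def fuzzy_rank_fusion(prob_maps):
    """Fuse per-pixel class probabilities from several segmentation models
    (e.g. UNet, SegNet, PSPNet outputs).

    prob_maps: list of arrays shaped (C, H, W) with per-class probabilities.
    Returns the fused label map of shape (H, W).
    """
    # Fuzzy "penalty" per model: small when the model is confident about a class.
    penalties = [1.0 - np.tanh(p / 0.5) for p in prob_maps]
    fused_penalty = np.sum(penalties, axis=0)      # (C, H, W)
    return fused_penalty.argmin(axis=0)            # class with the least total penalty
```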
+
+ comment: Accepted at the International Conference on Data, Electronics and Computing
  (ICDEC-2023)
+
+
+
+
+
+ + ☆ Improving Adversarial Transferability of Visual-Language Pre-training + Models through Collaborative Multimodal Interaction + + +
+ Despite the substantial advancements in Vision-Language Pre-training (VLP) models, their susceptibility to adversarial attacks poses a significant challenge. Existing work rarely studies the transferability of attacks on VLP models, resulting in a substantial performance gap from white-box attacks. We observe that prior work overlooks the interaction mechanisms between modalities, which play a crucial role in understanding the intricacies of VLP models. In response, we propose a novel attack, called Collaborative Multimodal Interaction Attack (CMI-Attack), which leverages modality interaction through embedding guidance and interaction enhancement. Specifically, it attacks text at the embedding level while preserving semantics, and utilizes interaction image gradients to enhance constraints on perturbations of texts and images. Significantly, in the image-text retrieval task on the Flickr30K dataset, CMI-Attack raises the transfer success rates from ALBEF to TCL, $\text{CLIP}_{\text{ViT}}$ and $\text{CLIP}_{\text{CNN}}$ by 8.11%-16.75% over state-of-the-art methods. Moreover, CMI-Attack also demonstrates superior performance in cross-task generalization scenarios. Our work addresses the underexplored realm of transfer attacks on VLP models, shedding light on the importance of modality interaction for enhanced adversarial robustness.
+
+
+
+
+
+ + ☆ Regularizing CNNs using Confusion Penalty Based Label Smoothing for + Histopathology Images + + +
+ Deep learning, particularly Convolutional Neural Networks (CNNs), has been
+ successful in computer vision tasks and medical image analysis. However,
+ modern CNNs can be overconfident, making them difficult to deploy in
+ real-world scenarios. Researchers have proposed regularizing techniques such
+ as Label Smoothing (LS), which introduces soft labels for the training data;
+ by capturing disagreement or lack of confidence during training, LS makes the
+ classifier more regularized. Although LS is simple and effective, traditional
+ LS techniques utilize a weighted average of the target distribution and a
+ uniform distribution across the classes, which limits both the objective of
+ LS and its performance. This paper introduces a novel LS technique based on a
+ confusion penalty, which gives more importance to the classes that the model
+ confuses than to the others. We have performed extensive experiments with
+ well-known CNN architectures using this technique on publicly available
+ Colorectal Histology datasets and obtained satisfactory results. We have also
+ compared our findings with the state of the art and demonstrated our method's
+ efficacy with reliability diagrams and t-distributed Stochastic Neighbor
+ Embedding (t-SNE) plots of the feature space.
+
+
+
+ comment: Accepted at CICBA 2024: 6th International Conference on
+ Computational Intelligence in Communications and Business Analytics +
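+
+ To make the contrast concrete, here is a minimal sketch of uniform label
+ smoothing next to a confusion-matrix-based variant. The exact confusion
+ penalty used in the paper is not given in the abstract, so the second
+ function is only one plausible reading of the idea.
+
+ import numpy as np
+
+ def uniform_label_smoothing(y, num_classes, eps=0.1):
+     # Classic LS: mix the one-hot target with a uniform distribution.
+     one_hot = np.eye(num_classes)[y]
+     return (1.0 - eps) * one_hot + eps / num_classes
+
+ def confusion_penalty_smoothing(y, confusion, eps=0.1):
+     # confusion[i, j]: how often class i is predicted as class j (e.g., from a
+     # validation pass); rows are normalized into distributions.
+     one_hot = np.eye(confusion.shape[0])[y]
+     conf_dist = confusion / (confusion.sum(axis=1, keepdims=True) + 1e-12)
+     return (1.0 - eps) * one_hot + eps * conf_dist[y]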
+
+
+
+
+ + ☆ COVID-CT-H-UNet: a novel COVID-19 CT segmentation network based on + attention mechanism and Bi-category Hybrid loss + + +
+ Since 2019, the global COVID-19 outbreak has emerged as a crucial focus in
+ healthcare research. Although RT-PCR stands as the primary method for
+ COVID-19 detection, its extended detection time poses a significant
+ challenge. Consequently, supplementing RT-PCR with the pathological study of
+ COVID-19 through CT imaging has become imperative. The current segmentation
+ approach based on TVLoss enhances the connectivity of afflicted areas;
+ nevertheless, it tends to misclassify normal pixels between certain adjacent
+ diseased regions as diseased pixels. The typical binary cross-entropy
+ (BCE)-based U-shaped network only concentrates on the entire CT image without
+ emphasizing the affected regions, which results in hazy borders and low
+ contrast in the predicted output. In addition, the fraction of infected
+ pixels in CT images is very small, which makes it challenging for
+ segmentation models to make accurate predictions. In this paper, we propose
+ COVID-CT-H-UNet, a COVID-19 CT segmentation network, to solve these problems.
+ To recognize the unaffected pixels between neighbouring diseased regions,
+ extra visual layer information is captured by combining an attention module
+ on the skip connections with the proposed composite Bi-category Hybrid Loss.
+ The issue of hazy boundaries and poor contrast caused by the BCE loss in
+ conventional techniques is resolved by this composite Bi-category Hybrid
+ Loss, which concentrates on the pixels of the diseased area. Experiments show
+ that, compared with previous COVID-19 segmentation networks, the proposed
+ COVID-CT-H-UNet greatly improves segmentation quality and may be used to
+ identify and study clinical COVID-19.
+
+
+
+ comment: Accepted at CICBA 2024: 6th International Conference on
+ Computational Intelligence in Communications and Business Analytics +
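+
+ The abstract does not define the Bi-category Hybrid Loss itself; purely as an
+ illustration of pairing a pixel-wise term with a region-focused term that
+ emphasizes the diseased area, a generic BCE-plus-Dice composite might look as
+ follows (the paper's actual composite function may differ).
+
+ import numpy as np
+
+ def bce_dice_loss(pred, target, alpha=0.5, eps=1e-7):
+     # pred: predicted foreground probabilities in (0, 1); target: binary mask.
+     pred = np.clip(pred, eps, 1.0 - eps)
+     bce = -np.mean(target * np.log(pred) + (1 - target) * np.log(1 - pred))
+     inter = np.sum(pred * target)
+     dice = 1.0 - (2.0 * inter + eps) / (pred.sum() + target.sum() + eps)
+     return alpha * bce + (1.0 - alpha) * dice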
+
+
+
+
+ + ☆ Efficient Domain Adaptation for Endoscopic Visual Odometry + + +
+ Visual odometry plays a crucial role in endoscopic imaging, yet the scarcity
+ of realistic images with ground-truth poses presents a significant challenge.
+ Therefore, domain adaptation offers a promising approach to bridge the
+ pre-operative planning domain with the intra-operative real domain for
+ learning odometry information. However, existing methodologies suffer from
+ inefficient training. In this work, an efficient neural style transfer
+ framework for endoscopic visual odometry is proposed, which compresses the
+ time from pre-operative planning to the testing phase to less than five
+ minutes. For efficient training, this work focuses on training modules with
+ only a limited number of real images, and we exploit pre-operative prior
+ information to dramatically reduce training duration. Moreover, during the
+ testing phase, we propose a novel Test Time Adaptation (TTA) method to
+ mitigate the gap in lighting conditions between the training and testing
+ datasets. Experimental evaluations conducted on two public endoscope datasets
+ show that our method achieves state-of-the-art accuracy in visual odometry
+ tasks while boasting the fastest training speed. These results demonstrate
+ significant promise for intra-operative surgery applications.
+
+
+
+
+
+
+ + ☆ RetMIL: Retentive Multiple Instance Learning for Histopathological Whole + Slide Image Classification + + +
+ Histopathological whole slide image (WSI) analysis with deep learning has
+ become a research focus in computational pathology. The current paradigm is
+ mainly based on multiple instance learning (MIL), in which approaches with a
+ Transformer as the backbone are well discussed. These methods convert WSI
+ tasks into sequence tasks by representing patches as tokens in the WSI
+ sequence. However, the feature complexity brought by high heterogeneity and
+ the ultra-long sequences brought by gigapixel size make Transformer-based MIL
+ suffer from high memory consumption, slow inference speed, and limited
+ performance. To this end, we propose a retentive MIL method called RetMIL,
+ which processes WSI sequences through a hierarchical feature propagation
+ structure. At the local level, the WSI sequence is divided into multiple
+ subsequences; tokens of each subsequence are updated through a parallel
+ linear retention mechanism and aggregated using an attention layer. At the
+ global level, subsequences are fused into a global sequence, updated through
+ a serial retention mechanism, and finally the slide-level representation is
+ obtained through global attention pooling. We conduct experiments on the two
+ public CAMELYON and BRACS datasets and a public-internal LUNG dataset,
+ confirming that RetMIL not only achieves state-of-the-art performance but
+ also significantly reduces computational overhead. Our code will be released
+ shortly.
+
+
+
+ comment: under review +
+
+
+
+
+ + ☆ A Comprehensive Study of Multimodal Large Language Models for Image + Quality Assessment + + +
+ While Multimodal Large Language Models (MLLMs) have experienced significant
+ advancement in visual understanding and reasoning, their potential to serve
+ as powerful, flexible, interpretable, and text-driven models for Image
+ Quality Assessment (IQA) remains largely unexplored. In this paper, we
+ conduct a comprehensive and systematic study of prompting MLLMs for IQA.
+ Specifically, we first investigate nine prompting systems for MLLMs as the
+ combinations of three standardized testing procedures in psychophysics (i.e.,
+ the single-stimulus, double-stimulus, and multiple-stimulus methods) and
+ three popular prompting strategies in natural language processing (i.e.,
+ standard, in-context, and chain-of-thought prompting). We then present a
+ difficult-sample selection procedure, taking into account sample diversity
+ and uncertainty, to further challenge MLLMs equipped with the respective
+ optimal prompting systems. We assess three open-source and one closed-source
+ MLLM on several visual attributes of image quality (e.g., structural and
+ textural distortions, color differences, and geometric transformations) in
+ both full-reference and no-reference scenarios. Experimental results show
+ that only the closed-source GPT-4V provides a reasonable account of human
+ perception of image quality, but it is weak at discriminating fine-grained
+ quality variations (e.g., color differences) and at comparing the visual
+ quality of multiple images, tasks humans can perform effortlessly.
+
+
+
+
+
+
+ + ☆ Just Say the Name: Online Continual Learning with Category Names Only + via Data Generation + + +
+ In real-world scenarios, extensive manual annotation for continual learning +is impractical due to prohibitive costs. Although prior arts, influenced by +large-scale webly supervised training, suggest leveraging web-scraped data in +continual learning, this poses challenges such as data imbalance, usage +restrictions, and privacy concerns. Addressing the risks of continual webly +supervised training, we present an online continual learning framework - +Generative Name only Continual Learning (G-NoCL). The proposed G-NoCL uses a +set of generators G along with the learner. When encountering new concepts +(i.e., classes), G-NoCL employs the novel sample complexity-guided data +ensembling technique DIverSity and COmplexity enhancing ensemBlER (DISCOBER) to +optimally sample training data from generated data. Through extensive +experimentation, we demonstrate superior performance of DISCOBER in G-NoCL +online CL benchmarks, covering both In-Distribution (ID) and +Out-of-Distribution (OOD) generalization evaluations, compared to naive +generator-ensembling, web-supervised, and manually annotated data. + +
+
+
+
+
+ + ☆ MSI-NeRF: Linking Omni-Depth with View Synthesis through Multi-Sphere + Image aided Generalizable Neural Radiance Field + + +
+ Panoramic observation using fisheye cameras is significant in robot +perception, reconstruction, and remote operation. However, panoramic images +synthesized by traditional methods lack depth information and can only provide +three degrees-of-freedom (3DoF) rotation rendering in virtual reality +applications. To fully preserve and exploit the parallax information within the +original fisheye cameras, we introduce MSI-NeRF, which combines deep learning +omnidirectional depth estimation and novel view rendering. We first construct a +multi-sphere image as a cost volume through feature extraction and warping of +the input images. It is then processed by geometry and appearance decoders, +respectively. Unlike methods that regress depth maps directly, we further build +an implicit radiance field using spatial points and interpolated 3D feature +vectors as input. In this way, we can simultaneously realize omnidirectional +depth estimation and 6DoF view synthesis. Our method is trained in a +semi-self-supervised manner. It does not require target view images and only +uses depth data for supervision. Our network has the generalization ability to +reconstruct unknown scenes efficiently using only four images. Experimental +results show that our method outperforms existing methods in depth estimation +and novel view synthesis tasks. + +
+
+ comment: 8 pages, 7 figures, Submitted to IEEE/RSJ International Conference on + Intelligent Robots and Systems 2024 +
+
+
+
+
+ + ☆ SF(DA)$^2$: Source-free Domain Adaptation Through the Lens of Data + Augmentation ICLR 2024 + + +
+ In the face of the deep learning model's vulnerability to domain shift, +source-free domain adaptation (SFDA) methods have been proposed to adapt models +to new, unseen target domains without requiring access to source domain data. +Although the potential benefits of applying data augmentation to SFDA are +attractive, several challenges arise such as the dependence on prior knowledge +of class-preserving transformations and the increase in memory and +computational requirements. In this paper, we propose Source-free Domain +Adaptation Through the Lens of Data Augmentation (SF(DA)$^2$), a novel approach +that leverages the benefits of data augmentation without suffering from these +challenges. We construct an augmentation graph in the feature space of the +pretrained model using the neighbor relationships between target features and +propose spectral neighborhood clustering to identify partitions in the +prediction space. Furthermore, we propose implicit feature augmentation and +feature disentanglement as regularization loss functions that effectively +utilize class semantic information within the feature space. These regularizers +simulate the inclusion of an unlimited number of augmented target features into +the augmentation graph while minimizing computational and memory demands. Our +method shows superior adaptation performance in SFDA scenarios, including 2D +image and 3D point cloud datasets and a highly imbalanced dataset. + +
+
+ comment: ICLR 2024. Code: https://github.com/shinyflight/SFDA2 +
+
+
+
+
+ + ☆ DUE: Dynamic Uncertainty-Aware Explanation Supervision via 3D Imputation + + +
+ Explanation supervision aims to enhance deep learning models by integrating
+ additional signals to guide the generation of model explanations, showcasing
+ notable improvements in both the predictability and explainability of the
+ model. However, the application of explanation supervision to
+ higher-dimensional data, such as 3D medical images, remains an under-explored
+ domain. Challenges associated with supervising visual explanations in the
+ presence of an additional dimension include: 1) changed spatial correlations,
+ 2) the lack of direct 3D annotations, and 3) uncertainty that varies across
+ different parts of the explanation. To address these challenges, we propose a
+ Dynamic Uncertainty-aware Explanation supervision (DUE) framework for 3D
+ explanation supervision that ensures uncertainty-aware explanation guidance
+ when dealing with sparsely annotated 3D data, using diffusion-based 3D
+ interpolation. Our proposed framework is validated through comprehensive
+ experiments on diverse real-world medical imaging datasets. The results
+ demonstrate the effectiveness of our framework in enhancing the
+ predictability and explainability of deep learning models in the context of
+ medical imaging diagnosis applications.
+
+
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ View-Centric Multi-Object Tracking with Homographic Matching in Moving + UAV + + +
+ In this paper, we address the challenge of multi-object tracking (MOT) in +moving Unmanned Aerial Vehicle (UAV) scenarios, where irregular flight +trajectories, such as hovering, turning left/right, and moving up/down, lead to +significantly greater complexity compared to fixed-camera MOT. Specifically, +changes in the scene background not only render traditional frame-to-frame +object IOU association methods ineffective but also introduce significant view +shifts in the objects, which complicates tracking. To overcome these issues, we +propose a novel universal HomView-MOT framework, which for the first time, +harnesses the view Homography inherent in changing scenes to solve MOT +challenges in moving environments, incorporating Homographic Matching and +View-Centric concepts. We introduce a Fast Homography Estimation (FHE) +algorithm for rapid computation of Homography matrices between video frames, +enabling object View-Centric ID Learning (VCIL) and leveraging multi-view +Homography to learn cross-view ID features. Concurrently, our Homographic +Matching Filter (HMF) maps object bounding boxes from different frames onto a +common view plane for a more realistic physical IOU association. Extensive +experiments have proven that these innovations allow HomView-MOT to achieve +state-of-the-art performance on prominent UAV MOT datasets VisDrone and UAVDT. + +
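+
+ A hedged sketch of the core Homographic Matching idea, warping boxes from one
+ frame onto the view plane of the next before computing IoU. The paper's Fast
+ Homography Estimation is replaced here by OpenCV's generic routines, and the
+ keypoint matching step is assumed to be done elsewhere.
+
+ import cv2
+ import numpy as np
+
+ def warp_boxes(boxes, pts_src, pts_dst):
+     # boxes: (N, 4) [x1, y1, x2, y2] in the source frame; pts_src / pts_dst:
+     # matched keypoints (float32) between the two frames.
+     H, _ = cv2.findHomography(pts_src, pts_dst, cv2.RANSAC)
+     corners = np.array([[[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
+                         for x1, y1, x2, y2 in boxes], dtype=np.float32)
+     warped = cv2.perspectiveTransform(corners.reshape(-1, 1, 2), H).reshape(-1, 4, 2)
+     # Re-fit axis-aligned boxes on the common view plane for IoU association.
+     return np.stack([warped[:, :, 0].min(1), warped[:, :, 1].min(1),
+                      warped[:, :, 0].max(1), warped[:, :, 1].max(1)], axis=1)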
+
+
+
+
+ + ☆ Exploring Learning-based Motion Models in Multi-Object Tracking + + +
+ In the field of multi-object tracking (MOT), traditional methods often rely
+ on the Kalman Filter for motion prediction, leveraging its strengths in
+ linear motion scenarios. However, the inherent limitations of these methods
+ become evident when confronted with complex, nonlinear motions and occlusions
+ prevalent in dynamic environments like sports and dance. This paper explores
+ the possibility of replacing the Kalman Filter with various learning-based
+ motion models that effectively enhance tracking accuracy and adaptability
+ beyond the constraints of Kalman Filter-based systems. In this paper, we
+ propose MambaTrack, an online motion-based tracker that outperforms all
+ existing motion-based trackers on the challenging DanceTrack and SportsMOT
+ datasets. Moreover, we further exploit the potential of the state-space model
+ in trajectory feature extraction to boost tracking performance and propose
+ MambaTrack+, which achieves state-of-the-art performance on the DanceTrack
+ dataset with 56.1 HOTA and 54.9 IDF1.
+
+
+
+
+
+
+ + ☆ Affective Behaviour Analysis via Integrating Multi-Modal Knowledge + + +
+ Affective Behavior Analysis aims to make technology emotionally smart,
+ creating a world where devices can understand and react to our emotions as
+ humans do. To comprehensively evaluate the authenticity and applicability of
+ emotional behavior analysis techniques in natural environments, the 6th
+ competition on Affective Behavior Analysis in-the-wild (ABAW) utilizes the
+ Aff-Wild2, Hume-Vidmimic2, and C-EXPR-DB datasets to set up five competitive
+ tracks, i.e., Valence-Arousal (VA) Estimation, Expression (EXPR) Recognition,
+ Action Unit (AU) Detection, Compound Expression (CE) Recognition, and
+ Emotional Mimicry Intensity (EMI) Estimation. In this paper, we present our
+ method designs for the five tasks. Specifically, our design mainly includes
+ three aspects: 1) utilizing a transformer-based feature fusion module to
+ fully integrate emotional information provided by audio signals, visual
+ images, and transcripts, offering high-quality expression features for the
+ downstream tasks; 2) to achieve high-quality facial feature representations,
+ employing a Masked Autoencoder as the visual feature extraction model and
+ fine-tuning it with our facial dataset; and 3) considering the complexity of
+ the video collection scenes, conducting a more detailed dataset division
+ based on scene characteristics and training a classifier for each scene.
+ Extensive experiments demonstrate the superiority of our designs.
+
+
+
+ comment: 11 pages, 1 figure +
+
+
+
+
+ + ☆ VisionCLIP: An Med-AIGC based Ethical Language-Image Foundation Model + for Generalizable Retina Image Analysis + + +
+ Generalist foundation models have ushered in newfound capabilities in the
+ medical domain. However, the tension between the growing demand for
+ high-quality annotated data and patient privacy continues to intensify. The
+ utilization of medical artificial intelligence generated content (Med-AIGC)
+ as an inexhaustible resource repository arises as a potential solution to
+ address this challenge. Here we harness 1 million open-source synthetic
+ fundus images paired with natural language descriptions to curate an ethical
+ language-image foundation model for retina image analysis named VisionCLIP.
+ VisionCLIP achieves competitive performance on three external datasets, in a
+ zero-shot fashion, compared with existing methods pre-trained on real-world
+ data. The use of artificially synthesized images alongside corresponding
+ textual data for training enables the medical foundation model to
+ successfully assimilate knowledge of disease symptomatology, thereby
+ circumventing potential breaches of patient confidentiality.
+
+
+
+
+
+
+ + ☆ Active Label Correction for Semantic Segmentation with Foundation Models + + +
+ Training and validating models for semantic segmentation require datasets
+ with pixel-wise annotations, which are notoriously labor-intensive. Although
+ useful priors such as foundation models or crowdsourced datasets are
+ available, they are error-prone. We hence propose an effective framework of
+ active label correction (ALC) based on a correction-query design that
+ rectifies the pseudo labels of pixels; according to our theoretical analysis
+ and user study, this query is more annotator-friendly than the standard one
+ that asks annotators to classify a pixel directly. Specifically, leveraging
+ foundation models that provide useful zero-shot predictions on pseudo labels
+ and superpixels, our method comprises two key techniques: (i) an
+ annotator-friendly design of the correction query with the pseudo labels, and
+ (ii) an acquisition function that looks ahead to label expansions based on
+ the superpixels. Experimental results on the PASCAL, Cityscapes, and
+ Kvasir-SEG datasets demonstrate the effectiveness of our ALC framework,
+ outperforming prior methods for active semantic segmentation and label
+ correction. Notably, utilizing our method, we obtained a revised version of
+ PASCAL by rectifying errors in 2.6 million pixels of the PASCAL dataset.
+
+
+
+
+
+
+ + ☆ MicroDiffusion: Implicit Representation-Guided Diffusion for 3D + Reconstruction from Limited 2D Microscopy Projections CVPR2024 + + +
+ Volumetric optical microscopy using non-diffracting beams enables rapid +imaging of 3D volumes by projecting them axially to 2D images but lacks crucial +depth information. Addressing this, we introduce MicroDiffusion, a pioneering +tool facilitating high-quality, depth-resolved 3D volume reconstruction from +limited 2D projections. While existing Implicit Neural Representation (INR) +models often yield incomplete outputs and Denoising Diffusion Probabilistic +Models (DDPM) excel at capturing details, our method integrates INR's +structural coherence with DDPM's fine-detail enhancement capabilities. We +pretrain an INR model to transform 2D axially-projected images into a +preliminary 3D volume. This pretrained INR acts as a global prior guiding +DDPM's generative process through a linear interpolation between INR outputs +and noise inputs. This strategy enriches the diffusion process with structured +3D information, enhancing detail and reducing noise in localized 2D images. By +conditioning the diffusion model on the closest 2D projection, MicroDiffusion +substantially enhances fidelity in resulting 3D reconstructions, surpassing INR +and standard DDPM outputs with unparalleled image quality and structural +fidelity. Our code and dataset are available at +https://github.com/UCSC-VLAA/MicroDiffusion. + +
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ☆ DarkGS: Learning Neural Illumination and 3D Gaussians Relighting for + Robotic Exploration in the Dark + + +
+ Humans have the remarkable ability to construct consistent mental models of +an environment, even under limited or varying levels of illumination. We wish +to endow robots with this same capability. In this paper, we tackle the +challenge of constructing a photorealistic scene representation under poorly +illuminated conditions and with a moving light source. We approach the task of +modeling illumination as a learning problem, and utilize the developed +illumination model to aid in scene reconstruction. We introduce an innovative +framework that uses a data-driven approach, Neural Light Simulators (NeLiS), to +model and calibrate the camera-light system. Furthermore, we present DarkGS, a +method that applies NeLiS to create a relightable 3D Gaussian scene model +capable of real-time, photorealistic rendering from novel viewpoints. We show +the applicability and robustness of our proposed simulator and system in a +variety of real-world environments. + +
+
+ comment: 8 pages, 9 figures +
+
+
+
+
+ + ☆ Speech-driven Personalized Gesture Synthetics: Harnessing Automatic + Fuzzy Feature Inference + + +
+ Speech-driven gesture generation is an emerging field within virtual human +creation. However, a significant challenge lies in accurately determining and +processing the multitude of input features (such as acoustic, semantic, +emotional, personality, and even subtle unknown features). Traditional +approaches, reliant on various explicit feature inputs and complex multimodal +processing, constrain the expressiveness of resulting gestures and limit their +applicability. To address these challenges, we present Persona-Gestor, a novel +end-to-end generative model designed to generate highly personalized 3D +full-body gestures solely relying on raw speech audio. The model combines a +fuzzy feature extractor and a non-autoregressive Adaptive Layer Normalization +(AdaLN) transformer diffusion architecture. The fuzzy feature extractor +harnesses a fuzzy inference strategy that automatically infers implicit, +continuous fuzzy features. These fuzzy features, represented as a unified +latent feature, are fed into the AdaLN transformer. The AdaLN transformer +introduces a conditional mechanism that applies a uniform function across all +tokens, thereby effectively modeling the correlation between the fuzzy features +and the gesture sequence. This module ensures a high level of gesture-speech +synchronization while preserving naturalness. Finally, we employ the diffusion +model to train and infer various gestures. Extensive subjective and objective +evaluations on the Trinity, ZEGGS, and BEAT datasets confirm our model's +superior performance to the current state-of-the-art approaches. Persona-Gestor +improves the system's usability and generalization capabilities, setting a new +benchmark in speech-driven gesture synthesis and broadening the horizon for +virtual human technology. Supplementary videos and code can be accessed at +https://zf223669.github.io/Diffmotion-v2-website/ + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Enhancing Out-of-Distribution Detection with Multitesting-based + Layer-wise Feature Fusion + + +
+ Deploying machine learning in open environments presents the challenge of
+ encountering diverse test inputs that differ significantly from the training
+ data. These out-of-distribution samples may exhibit shifts in local or global
+ features compared to the training distribution. The machine learning (ML)
+ community has responded with a number of methods aimed at distinguishing
+ anomalous inputs from the original training data. However, the majority of
+ previous studies have primarily focused on the output layer or penultimate
+ layer of pre-trained deep neural networks. In this paper, we propose a novel
+ framework, Multitesting-based Layer-wise Out-of-Distribution (OOD) Detection
+ (MLOD), to identify distributional shifts in test samples at different levels
+ of features through a rigorous multiple testing procedure. Our approach
+ distinguishes itself from existing methods as it does not require modifying
+ the structure or fine-tuning the pre-trained classifier. Through extensive
+ experiments, we demonstrate that our proposed framework can seamlessly
+ integrate with any existing distance-based inspection method while
+ efficiently utilizing feature extractors of varying depths. Our scheme
+ effectively enhances the performance of out-of-distribution detection
+ compared to baseline methods. In particular, MLOD-Fisher achieves superior
+ performance in general: when trained using KNN on CIFAR10, MLOD-Fisher
+ significantly lowers the false positive rate (FPR) from 24.09% to 7.47% on
+ average compared to merely utilizing the features of the last layer.
+
+
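+
+ A minimal sketch of the layer-wise multiple-testing idea (MLOD-Fisher
+ flavour): per-layer OOD scores are converted to empirical p-values against
+ held-out in-distribution data and combined with Fisher's method. The
+ per-layer detectors themselves (e.g., KNN distances) are abstracted away, and
+ the details below are assumptions, not the paper's implementation.
+
+ import numpy as np
+ from scipy.stats import chi2
+
+ def fisher_combined_pvalue(layer_scores, calib_scores):
+     # layer_scores: one OOD score per layer for a single test sample.
+     # calib_scores: per-layer 1-D arrays of the same score on ID calibration data.
+     pvals = [(np.sum(c >= s) + 1.0) / (len(c) + 1.0)   # right-tail empirical p-value
+              for s, c in zip(layer_scores, calib_scores)]
+     stat = -2.0 * np.sum(np.log(pvals))
+     return chi2.sf(stat, df=2 * len(pvals))            # small value -> flag as OOD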
+
+
+
+
+ + ☆ Securely Fine-tuning Pre-trained Encoders Against Adversarial Examples + + +
+ With the evolution of self-supervised learning, the pre-training paradigm has +emerged as a predominant solution within the deep learning landscape. Model +providers furnish pre-trained encoders designed to function as versatile +feature extractors, enabling downstream users to harness the benefits of +expansive models with minimal effort through fine-tuning. Nevertheless, recent +works have exposed a vulnerability in pre-trained encoders, highlighting their +susceptibility to downstream-agnostic adversarial examples (DAEs) meticulously +crafted by attackers. The lingering question pertains to the feasibility of +fortifying the robustness of downstream models against DAEs, particularly in +scenarios where the pre-trained encoders are publicly accessible to the +attackers. + In this paper, we initially delve into existing defensive mechanisms against +adversarial examples within the pre-training paradigm. Our findings reveal that +the failure of current defenses stems from the domain shift between +pre-training data and downstream tasks, as well as the sensitivity of encoder +parameters. In response to these challenges, we propose Genetic +Evolution-Nurtured Adversarial Fine-tuning (Gen-AF), a two-stage adversarial +fine-tuning approach aimed at enhancing the robustness of downstream models. +Our extensive experiments, conducted across ten self-supervised training +methods and six datasets, demonstrate that Gen-AF attains high testing accuracy +and robust testing accuracy against state-of-the-art DAEs. + +
+
+
+
+
+ + ☆ Unsupervised Collaborative Metric Learning with Mixed-Scale Groups for + General Object Retrieval + + +
+ The task of searching for visual objects in a large image dataset is
+ difficult because it requires efficient matching and accurate localization of
+ objects that can vary in size. Although the Segment Anything Model (SAM)
+ offers a potential solution for extracting object spatial context, learning
+ embeddings for local objects remains a challenging problem. This paper
+ presents a novel unsupervised deep metric learning approach, termed
+ unsupervised collaborative metric learning with mixed-scale groups
+ (MS-UGCML), devised to learn embeddings for objects of varying scales.
+ Following this, a benchmark of challenges is assembled by utilizing the COCO
+ 2017 and VOC 2007 datasets to facilitate the training and evaluation of
+ general object retrieval models. Finally, we conduct comprehensive ablation
+ studies and discuss the complexities faced within the domain of general
+ object retrieval. Our object retrieval evaluations span a range of datasets,
+ including BelgaLogos, Visual Genome, and LVIS, in addition to a challenging
+ evaluation set that we have assembled ourselves for open-vocabulary
+ evaluation. These comprehensive evaluations effectively highlight the
+ robustness of our unsupervised MS-UGCML approach, with object-level and
+ image-level mAP improvements of up to 6.69% and 10.03%, respectively. The
+ code is publicly available at https://github.com/dengyuhai/MS-UGCML.
+
+
+
+ comment: 13 pages, 10 figures +
+
+
+
+
+ + ☆ ContourDiff: Unpaired Image Translation with Contour-Guided Diffusion + Models + + +
+ Accurately translating medical images across different modalities (e.g., CT +to MRI) has numerous downstream clinical and machine learning applications. +While several methods have been proposed to achieve this, they often prioritize +perceptual quality with respect to output domain features over preserving +anatomical fidelity. However, maintaining anatomy during translation is +essential for many tasks, e.g., when leveraging masks from the input domain to +develop a segmentation model with images translated to the output domain. To +address these challenges, we propose ContourDiff, a novel framework that +leverages domain-invariant anatomical contour representations of images. These +representations are simple to extract from images, yet form precise spatial +constraints on their anatomical content. We introduce a diffusion model that +converts contour representations of images from arbitrary input domains into +images in the output domain of interest. By applying the contour as a +constraint at every diffusion sampling step, we ensure the preservation of +anatomical content. We evaluate our method by training a segmentation model on +images translated from CT to MRI with their original CT masks and testing its +performance on real MRIs. Our method outperforms other unpaired image +translation methods by a significant margin, furthermore without the need to +access any input domain information during training. + +
+
+ comment: Code will be released on GitHub +
+
+
+
+
+ + ☆ StableGarment: Garment-Centric Generation via Stable Diffusion + + +
+ In this paper, we introduce StableGarment, a unified framework to tackle +garment-centric(GC) generation tasks, including GC text-to-image, controllable +GC text-to-image, stylized GC text-to-image, and robust virtual try-on. The +main challenge lies in retaining the intricate textures of the garment while +maintaining the flexibility of pre-trained Stable Diffusion. Our solution +involves the development of a garment encoder, a trainable copy of the +denoising UNet equipped with additive self-attention (ASA) layers. These ASA +layers are specifically devised to transfer detailed garment textures, also +facilitating the integration of stylized base models for the creation of +stylized images. Furthermore, the incorporation of a dedicated try-on +ControlNet enables StableGarment to execute virtual try-on tasks with +precision. We also build a novel data engine that produces high-quality +synthesized data to preserve the model's ability to follow prompts. Extensive +experiments demonstrate that our approach delivers state-of-the-art (SOTA) +results among existing virtual try-on methods and exhibits high flexibility +with broad potential applications in various garment-centric image generation. + +
+
+
+
+
+ + ☆ Bidirectional Multi-Step Domain Generalization for Visible-Infrared + Person Re-Identification + + +
+ A key challenge in visible-infrared person re-identification (V-I ReID) is +training a backbone model capable of effectively addressing the significant +discrepancies across modalities. State-of-the-art methods that generate a +single intermediate bridging domain are often less effective, as this generated +domain may not adequately capture sufficient common discriminant information. +This paper introduces the Bidirectional Multi-step Domain Generalization +(BMDG), a novel approach for unifying feature representations across diverse +modalities. BMDG creates multiple virtual intermediate domains by finding and +aligning body part features extracted from both I and V modalities. Indeed, +BMDG aims to reduce the modality gaps in two steps. First, it aligns modalities +in feature space by learning shared and modality-invariant body part prototypes +from V and I images. Then, it generalizes the feature representation by +applying bidirectional multi-step learning, which progressively refines feature +representations in each step and incorporates more prototypes from both +modalities. In particular, our method minimizes the cross-modal gap by +identifying and aligning shared prototypes that capture key discriminative +features across modalities, then uses multiple bridging steps based on this +information to enhance the feature representation. Experiments conducted on +challenging V-I ReID datasets indicate that our BMDG approach outperforms +state-of-the-art part-based models or methods that generate an intermediate +domain from V-I person ReID. + +
+
+
+
+
+ + ☆ Segment Any Object Model (SAOM): Real-to-Simulation Fine-Tuning Strategy + for Multi-Class Multi-Instance Segmentation + + +
+ Multi-class multi-instance segmentation is the task of identifying masks for +multiple object classes and multiple instances of the same class within an +image. The foundational Segment Anything Model (SAM) is designed for promptable +multi-class multi-instance segmentation but tends to output part or sub-part +masks in the "everything" mode for various real-world applications. Whole +object segmentation masks play a crucial role for indoor scene understanding, +especially in robotics applications. We propose a new domain invariant +Real-to-Simulation (Real-Sim) fine-tuning strategy for SAM. We use object +images and ground truth data collected from Ai2Thor simulator during +fine-tuning (real-to-sim). To allow our Segment Any Object Model (SAOM) to work +in the "everything" mode, we propose the novel nearest neighbour assignment +method, updating point embeddings for each ground-truth mask. SAOM is evaluated +on our own dataset collected from Ai2Thor simulator. SAOM significantly +improves on SAM, with a 28% increase in mIoU and a 25% increase in mAcc for 54 +frequently-seen indoor object classes. Moreover, our Real-to-Simulation +fine-tuning strategy demonstrates promising generalization performance in real +environments without being trained on the real-world data (sim-to-real). The +dataset and the code will be released after publication. + +
+
+
+
+
+ + ☆ HCF-Net: Hierarchical Context Fusion Network for Infrared Small Object + Detection + + +
+ Infrared small object detection is an important computer vision task +involving the recognition and localization of tiny objects in infrared images, +which usually contain only a few pixels. However, it encounters difficulties +due to the diminutive size of the objects and the generally complex backgrounds +in infrared images. In this paper, we propose a deep learning method, HCF-Net, +that significantly improves infrared small object detection performance through +multiple practical modules. Specifically, it includes the parallelized +patch-aware attention (PPA) module, dimension-aware selective integration +(DASI) module, and multi-dilated channel refiner (MDCR) module. The PPA module +uses a multi-branch feature extraction strategy to capture feature information +at different scales and levels. The DASI module enables adaptive channel +selection and fusion. The MDCR module captures spatial features of different +receptive field ranges through multiple depth-separable convolutional layers. +Extensive experimental results on the SIRST infrared single-frame image dataset +show that the proposed HCF-Net performs well, surpassing other traditional and +deep learning models. Code is available at +https://github.com/zhengshuchen/HCFNet. + +
+
+
+
+
+ + ☆ DPPE: Dense Pose Estimation in a Plenoxels Environment using Gradient + Approximation + + +
+ We present DPPE, a dense pose estimation algorithm that functions over a +Plenoxels environment. Recent advances in neural radiance field techniques have +shown that it is a powerful tool for environment representation. More recent +neural rendering algorithms have significantly improved both training duration +and rendering speed. Plenoxels introduced a fully-differentiable radiance field +technique that uses Plenoptic volume elements contained in voxels for +rendering, offering reduced training times and better rendering accuracy, while +also eliminating the neural net component. In this work, we introduce a 6-DoF +monocular RGB-only pose estimation procedure for Plenoxels, which seeks to +recover the ground truth camera pose after a perturbation. We employ a +variation on classical template matching techniques, using stochastic gradient +descent to optimize the pose by minimizing errors in re-rendering. In +particular, we examine an approach that takes advantage of the rapid rendering +speed of Plenoxels to numerically approximate part of the pose gradient, using +a central differencing technique. We show that such methods are effective in +pose estimation. Finally, we perform ablations over key components of the +problem space, with a particular focus on image subsampling and Plenoxel grid +resolution. Project website: https://sites.google.com/view/dppe + +
+
+ comment: 8 pages, 4 figures, conference +
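+
+ The central-differencing step described above can be sketched in a few lines;
+ `render` and `photometric_loss` are hypothetical placeholders for the
+ Plenoxels renderer and the re-rendering error, not the paper's API.
+
+ import numpy as np
+
+ def central_diff_pose_grad(pose, target_img, render, photometric_loss, h=1e-3):
+     # pose: 6-vector (e.g., translation + axis-angle rotation).
+     grad = np.zeros_like(pose, dtype=np.float64)
+     for i in range(len(pose)):
+         e = np.zeros_like(pose, dtype=np.float64)
+         e[i] = h
+         f_plus = photometric_loss(render(pose + e), target_img)
+         f_minus = photometric_loss(render(pose - e), target_img)
+         grad[i] = (f_plus - f_minus) / (2.0 * h)   # central difference
+     return grad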
+
+
+
+
+ + ☆ Match-Stereo-Videos: Bidirectional Alignment for Consistent Dynamic + Stereo Matching + + +
+ Dynamic stereo matching is the task of estimating consistent disparities from +stereo videos with dynamic objects. Recent learning-based methods prioritize +optimal performance on a single stereo pair, resulting in temporal +inconsistencies. Existing video methods apply per-frame matching and +window-based cost aggregation across the time dimension, leading to +low-frequency oscillations at the scale of the window size. Towards this +challenge, we develop a bidirectional alignment mechanism for adjacent frames +as a fundamental operation. We further propose a novel framework, BiDAStereo, +that achieves consistent dynamic stereo matching. Unlike the existing methods, +we model this task as local matching and global aggregation. Locally, we +consider correlation in a triple-frame manner to pool information from adjacent +frames and improve the temporal consistency. Globally, to exploit the entire +sequence's consistency and extract dynamic scene cues for aggregation, we +develop a motion-propagation recurrent unit. Extensive experiments demonstrate +the performance of our method, showcasing improvements in prediction quality +and achieving state-of-the-art results on various commonly used benchmarks. + +
+
+
+
+
+ + ☆ Vector search with small radiuses + + +
+ In recent years, the dominant accuracy metric for vector search is the recall +of a result list of fixed size (top-k retrieval), considering as ground truth +the exact vector retrieval results. Although convenient to compute, this metric +is distantly related to the end-to-end accuracy of a full system that +integrates vector search. In this paper we focus on the common case where a +hard decision needs to be taken depending on the vector retrieval results, for +example, deciding whether a query image matches a database image or not. We +solve this as a range search task, where all vectors within a certain radius +from the query are returned. + We show that the value of a range search result can be modeled rigorously +based on the query-to-vector distance. This yields a metric for range search, +RSM, that is both principled and easy to compute without running an end-to-end +evaluation. We apply this metric to the case of image retrieval. We show that +indexing methods that are adapted for top-k retrieval do not necessarily +maximize the RSM. In particular, for inverted file based indexes, we show that +visiting a limited set of clusters and encoding vectors compactly yields near +optimal results. + +
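+
+ For concreteness, a toy version of the range-search setting studied above:
+ instead of returning the top-k nearest vectors, every database vector within
+ a radius of the query is returned. The proposed RSM metric itself is not
+ reproduced here.
+
+ import numpy as np
+
+ def range_search(queries, database, radius):
+     # queries: (Q, D), database: (N, D); plain L2 distances.
+     results = []
+     for q in queries:
+         d = np.linalg.norm(database - q, axis=1)
+         idx = np.where(d <= radius)[0]
+         results.append(idx[np.argsort(d[idx])])   # hits sorted by distance
+     return results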
+
+
+
+
+ + ☆ Deep Generative Design for Mass Production + + +
+ Generative Design (GD) has evolved as a transformative design approach,
+ employing advanced algorithms and AI to create diverse and innovative
+ solutions beyond traditional constraints. Despite its success, GD faces
+ significant challenges regarding the manufacturability of complex designs,
+ often necessitating extensive manual modifications due to limitations in
+ standard manufacturing processes and the reliance on additive manufacturing,
+ which is not ideal for mass production. Our research introduces an innovative
+ framework addressing these manufacturability concerns by integrating
+ constraints pertinent to die casting and injection molding into GD through
+ the utilization of 2D depth images. This method simplifies intricate 3D
+ geometries into manufacturable profiles, removing unfeasible features such as
+ non-manufacturable overhangs and allowing for the direct consideration of
+ essential manufacturing aspects like thickness and rib design. Consequently,
+ designs previously unsuitable for mass production are transformed into viable
+ solutions. We further enhance this approach by adopting an advanced 2D
+ generative model, which offers a more efficient alternative to traditional 3D
+ shape generation methods. Our results substantiate the efficacy of this
+ framework, demonstrating the production of innovative and, importantly,
+ manufacturable designs. This shift towards integrating practical
+ manufacturing considerations into GD represents a pivotal advancement,
+ transitioning from purely inspirational concepts to actionable,
+ production-ready solutions. Our findings underscore the usefulness and
+ potential of GD for broader industry adoption, marking a significant step
+ forward in aligning GD with the demands of manufacturing.
+
+
+
+
+
+
+ + ♻ ☆ Learning Degradation-Independent Representations for Camera ISP + Pipelines CVPR 2024 + + +
+ Image signal processing (ISP) pipeline plays a fundamental role in digital +cameras, which converts raw Bayer sensor data to RGB images. However, +ISP-generated images usually suffer from imperfections due to the compounded +degradations that stem from sensor noises, demosaicing noises, compression +artifacts, and possibly adverse effects of erroneous ISP hyperparameter +settings such as ISO and gamma values. In a general sense, these ISP +imperfections can be considered as degradations. The highly complex mechanisms +of ISP degradations, some of which are even unknown, pose great challenges to +the generalization capability of deep neural networks (DNN) for image +restoration and to their adaptability to downstream tasks. To tackle the +issues, we propose a novel DNN approach to learn degradation-independent +representations (DiR) through the refinement of a self-supervised learned +baseline representation. The proposed DiR learning technique has remarkable +domain generalization capability and consequently, it outperforms +state-of-the-art methods across various downstream tasks, including blind image +restoration, object detection, and instance segmentation, as verified in our +experiments. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Analyzing and Mitigating Object Hallucination in Large Vision-Language + Models ICLR 2024 + + +
+ Large vision-language models (LVLMs) have shown remarkable abilities in +understanding visual information with human languages. However, LVLMs still +suffer from object hallucination, which is the problem of generating +descriptions that include objects that do not actually exist in the images. +This can negatively impact many vision-language tasks, such as visual +summarization and reasoning. To address this issue, we propose a simple yet +powerful algorithm, LVLM Hallucination Revisor (LURE), to post-hoc rectify +object hallucination in LVLMs by reconstructing less hallucinatory +descriptions. LURE is grounded in a rigorous statistical analysis of the key +factors underlying object hallucination, including co-occurrence (the frequent +appearance of certain objects alongside others in images), uncertainty (objects +with higher uncertainty during LVLM decoding), and object position +(hallucination often appears in the later part of the generated text). LURE can +also be seamlessly integrated with any LVLMs. We evaluate LURE on six +open-source LVLMs, achieving a 23% improvement in general object hallucination +evaluation metrics over the previous best approach. In both GPT and human +evaluations, LURE consistently ranks at the top. Our data and code are +available at https://github.com/YiyangZhou/LURE. + +
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Creating Image Datasets in Agricultural Environments using DALL.E: + Generative AI-Powered Large Language Model + + +
+ This research investigated the role of artificial intelligence (AI),
+ specifically the DALL.E model by OpenAI, in advancing data generation and
+ visualization techniques in agriculture. DALL.E, an advanced AI image
+ generator, works alongside ChatGPT's language processing to transform text
+ descriptions and image clues into realistic visual representations of the
+ content. The study used both approaches of image generation: text-to-image
+ and image-to-image (variation). Six types of datasets depicting fruit crop
+ environments were generated. These AI-generated images were then compared
+ against ground truth images captured by sensors in real agricultural fields.
+ The comparison was based on the Peak Signal-to-Noise Ratio (PSNR) and Feature
+ Similarity Index (FSIM) metrics. The image-to-image generation exhibited a
+ 5.78% increase in average PSNR over text-to-image methods, signifying
+ superior image clarity and quality. However, this method also resulted in a
+ 10.23% decrease in average FSIM, indicating a diminished structural and
+ textural similarity to the original images. Consistent with these measures,
+ human evaluation also showed that images generated using the image-to-image
+ method were more realistic than those generated with the text-to-image
+ approach. The results highlighted DALL.E's potential in generating realistic
+ agricultural image datasets, thus accelerating the development and adoption
+ of imaging-based precision agriculture solutions.
+
+
+
+ comment: 9 figures, 1 table, 17 pages +
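+
+ A small sketch of the PSNR part of the comparison described above (FSIM is
+ omitted, since it needs phase-congruency and gradient-similarity maps);
+ images are assumed to be aligned uint8 arrays of identical shape.
+
+ import numpy as np
+
+ def psnr(img_a, img_b, max_val=255.0):
+     mse = np.mean((img_a.astype(np.float64) - img_b.astype(np.float64)) ** 2)
+     if mse == 0:
+         return float("inf")
+     return 10.0 * np.log10(max_val ** 2 / mse)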
+
+
+
+
+ + ♻ ☆ Navigation as Attackers Wish? Towards Building Robust Embodied Agents + under Federated Learning + + +
+ Federated embodied agent learning protects the data privacy of individual +visual environments by keeping data locally at each client (the individual +environment) during training. However, since the local data is inaccessible to +the server under federated learning, attackers may easily poison the training +data of the local client to build a backdoor in the agent without notice. +Deploying such an agent raises the risk of potential harm to humans, as the +attackers may easily navigate and control the agent as they wish via the +backdoor. Towards Byzantine-robust federated embodied agent learning, in this +paper, we study the attack and defense for the task of vision-and-language +navigation (VLN), where the agent is required to follow natural language +instructions to navigate indoor environments. First, we introduce a simple but +effective attack strategy, Navigation as Wish (NAW), in which the malicious +client manipulates local trajectory data to implant a backdoor into the global +model. Results on two VLN datasets (R2R and RxR) show that NAW can easily +navigate the deployed VLN agent regardless of the language instruction, without +affecting its performance on normal test sets. Then, we propose a new +Prompt-Based Aggregation (PBA) to defend against the NAW attack in federated +VLN, which provides the server with a ''prompt'' of the vision-and-language +alignment variance between the benign and malicious clients so that they can be +distinguished during training. We validate the effectiveness of the PBA method +on protecting the global model from the NAW attack, which outperforms other +state-of-the-art defense methods by a large margin in the defense metrics on +R2R and RxR. + +
+
+
+
+
+ + ♻ ☆ Toward Generalist Anomaly Detection via In-context Residual Learning + with Few-shot Sample Prompts CVPR 2024 + + +
+ This paper explores the problem of Generalist Anomaly Detection (GAD), aiming
+ to train one single detection model that can generalize to detect anomalies
+ in diverse datasets from different application domains without any further
+ training on the target data. Some recent studies have shown that large
+ pre-trained Visual-Language Models (VLMs) like CLIP have strong
+ generalization capabilities for detecting industrial defects from various
+ datasets, but their methods rely heavily on handcrafted text prompts about
+ defects, making them difficult to generalize to anomalies in other
+ applications, e.g., medical image anomalies or semantic anomalies in natural
+ images. In this work, we propose to train a GAD model with few-shot normal
+ images as sample prompts for AD on diverse datasets on the fly. To this end,
+ we introduce a novel approach that learns an in-context residual learning
+ model for GAD, termed InCTRL. It is trained on an auxiliary dataset to
+ discriminate anomalies from normal samples based on a holistic evaluation of
+ the residuals between query images and few-shot normal sample prompts.
+ Regardless of the dataset, by the definition of anomaly, larger residuals are
+ expected for anomalies than for normal samples, thereby enabling InCTRL to
+ generalize across different domains without further training. Comprehensive
+ experiments on nine AD datasets are performed to establish a GAD benchmark
+ that encapsulates the detection of industrial defect anomalies, medical
+ anomalies, and semantic anomalies in both one-vs-all and multi-class
+ settings, on which InCTRL is the best performer and significantly outperforms
+ state-of-the-art competing methods. Code is available at
+ https://github.com/mala-lab/InCTRL.
+
+
+
+ comment: Accepted to CVPR 2024; 17 pages; 5 figures +
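+
+ A stripped-down, hedged sketch of the in-context residual idea: the anomaly
+ score of a query is derived from its feature residuals against a few normal
+ image prompts. InCTRL's learned residual model and multi-level evaluation are
+ not reproduced here; the embeddings are assumed to come from a frozen encoder
+ such as CLIP.
+
+ import numpy as np
+
+ def residual_anomaly_score(query_feat, normal_prompt_feats):
+     # query_feat: (D,) embedding; normal_prompt_feats: (K, D) few-shot normal embeddings.
+     residuals = normal_prompt_feats - query_feat      # (K, D) residual vectors
+     dists = np.linalg.norm(residuals, axis=1)
+     return dists.min()                                # larger -> more anomalous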
+
+
+
+
+ + ♻ ☆ Adaptive Calibration: A Unified Conversion Framework of Spiking Neural + Networks + + +
+ Spiking Neural Networks (SNNs) have emerged as a promising energy-efficient
+ alternative to traditional Artificial Neural Networks (ANNs). Despite this,
+ bridging the performance gap with ANNs in practical scenarios remains a
+ significant challenge. This paper focuses on addressing the dual objectives
+ of enhancing the performance and efficiency of SNNs through the established
+ SNN Calibration conversion framework. Inspired by the biological nervous
+ system, we propose a novel Adaptive-Firing Neuron Model (AdaFire) that
+ dynamically adjusts firing patterns across different layers, substantially
+ reducing conversion errors within limited timesteps. Moreover, to meet our
+ efficiency objectives, we propose two novel strategies: a Sensitivity Spike
+ Compression (SSC) technique and an Input-aware Adaptive Timesteps (IAT)
+ technique. These techniques synergistically reduce both energy consumption
+ and latency during the conversion process, thereby enhancing the overall
+ efficiency of SNNs. Extensive experiments demonstrate that our approach
+ outperforms state-of-the-art SNN methods, showcasing superior performance and
+ efficiency in 2D, 3D, and event-driven classification, as well as object
+ detection and segmentation tasks.
+
+
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ VLLaVO: Mitigating Visual Gap through LLMs + + +
+ Recent advances achieved by deep learning models rely on the independent and +identically distributed assumption, hindering their applications in real-world +scenarios with domain shifts. To tackle this issue, cross-domain learning aims +at extracting domain-invariant knowledge to reduce the domain shift between +training and testing data. However, in visual cross-domain learning, +traditional methods concentrate solely on the image modality, disregarding the +potential benefits of incorporating the text modality. In this work, we propose +VLLaVO, combining Vision language models and Large Language models as Visual +cross-dOmain learners. VLLaVO uses vision-language models to convert images +into detailed textual descriptions. A large language model is then finetuned on +textual descriptions of the source/target domain generated by a designed +instruction template. Extensive experimental results under domain +generalization and unsupervised domain adaptation settings demonstrate the +effectiveness of the proposed method. + +
+
+
+
+
+ + ♻ ☆ Towards More Unified In-context Visual Understanding CVPR 2024 + + +
+ The rapid advancement of large language models (LLMs) has accelerated the
+ emergence of in-context learning (ICL) as a cutting-edge approach in the
+ natural language processing domain. Recently, ICL has been employed in visual
+ understanding tasks, such as semantic segmentation and image captioning,
+ yielding promising results. However, existing visual ICL frameworks cannot
+ produce content across multiple modalities, which limits their potential
+ usage scenarios. To address this issue, we present a new ICL framework for
+ visual understanding with multi-modal output enabled. First, we quantize and
+ embed both text and visual prompts into a unified representational space,
+ structured as interleaved in-context sequences. Then a decoder-only sparse
+ transformer architecture is employed to perform generative modeling on them,
+ facilitating in-context learning. Thanks to this design, the model is capable
+ of handling in-context vision understanding tasks with multimodal output in a
+ unified pipeline. Experimental results demonstrate that our model achieves
+ competitive performance compared with specialized models and previous ICL
+ baselines. Overall, our research takes a further step toward unified
+ multimodal in-context learning.
+
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ EpiDiff: Enhancing Multi-View Synthesis via Localized + Epipolar-Constrained Diffusion + + +
+ Generating multiview images from a single view facilitates the rapid +generation of a 3D mesh conditioned on a single image. Recent methods that +introduce 3D global representation into diffusion models have shown the +potential to generate consistent multiviews, but they have reduced generation +speed and face challenges in maintaining generalizability and quality. To +address this issue, we propose EpiDiff, a localized interactive multiview +diffusion model. At the core of the proposed approach is to insert a +lightweight epipolar attention block into the frozen diffusion model, +leveraging epipolar constraints to enable cross-view interaction among feature +maps of neighboring views. The newly initialized 3D modeling module preserves +the original feature distribution of the diffusion model, exhibiting +compatibility with a variety of base diffusion models. Experiments show that +EpiDiff generates 16 multiview images in just 12 seconds, and it surpasses +previous methods in quality evaluation metrics, including PSNR, SSIM and LPIPS. +Additionally, EpiDiff can generate a more diverse distribution of views, +improving the reconstruction quality from generated multiviews. Please see our +project page at https://huanngzh.github.io/EpiDiff/. + +
+
+ comment: Project page: https://huanngzh.github.io/EpiDiff/ +
+
+
+
+
+ + ♻ ☆ Rotation Augmented Distillation for Exemplar-Free Class Incremental + Learning with Detailed Analysis + + +
+ Class incremental learning (CIL) aims to recognize both the old and new +classes along the increment tasks. Deep neural networks in CIL suffer from +catastrophic forgetting and some approaches rely on saving exemplars from +previous tasks, known as the exemplar-based setting, to alleviate this problem. +On the contrary, this paper focuses on the Exemplar-Free setting with no old +class sample preserved. Balancing the plasticity and stability in deep feature +learning with only supervision from new classes is more challenging. Most +existing Exemplar-Free CIL methods report the overall performance only and lack +further analysis. In this work, different methods are examined with +complementary metrics in greater detail. Moreover, we propose a simple CIL +method, Rotation Augmented Distillation (RAD), which achieves one of the +top-tier performances under the Exemplar-Free setting. Detailed analysis shows +our RAD benefits from the superior balance between plasticity and stability. +Finally, more challenging exemplar-free settings with fewer initial classes are +undertaken for further demonstrations and comparisons among the +state-of-the-art methods. + +
+
+ comment: Accepted by PRCV2023 +
+
+
+
+
+ + ♻ ☆ Large Content And Behavior Models To Understand, Simulate, And Optimize + Content And Behavior + + +
+ Shannon and Weaver's seminal information theory divides communication into +three levels: technical, semantic, and effectiveness. While the technical level +deals with the accurate reconstruction of transmitted symbols, the semantic and +effectiveness levels deal with the inferred meaning and its effect on the +receiver. Large Language Models (LLMs), with their wide generalizability, make +some progress towards the second level. However, LLMs and other communication +models are not conventionally designed for predicting and optimizing +communication for desired receiver behaviors and intents. As a result, the +effectiveness level remains largely untouched by modern communication systems. +In this paper, we introduce the receivers' "behavior tokens," such as shares, +likes, clicks, purchases, and retweets, in the LLM's training corpora to +optimize content for the receivers and predict their behaviors. Other than +showing similar performance to LLMs on content understanding tasks, our trained +models show generalization capabilities on the behavior dimension for behavior +simulation, content simulation, behavior understanding, and behavior domain +adaptation. We show results on all these capabilities using a wide range of +tasks on three corpora. We call these models Large Content and Behavior Models +(LCBMs). Further, to spur more research on LCBMs, we release our new Content +Behavior Corpus (CBC), a repository containing communicator, message, and +corresponding receiver behavior (https://behavior-in-the-wild.github.io/LCBM). + +
+
+
+
+
+ + ♻ ☆ HyperPredict: Estimating Hyperparameter Effects for Instance-Specific + Regularization in Deformable Image Registration + + +
+ Methods for medical image registration infer geometric transformations that align pairs/groups of images by maximising an image similarity metric. This problem is ill-posed, as several solutions may have equivalent likelihoods, and optimising purely for image similarity can yield implausible transformations. For these reasons, regularization terms are essential to obtain meaningful registration results. However, this requires the introduction of at least one hyperparameter, often termed $\lambda$, that serves as a trade-off between loss terms. In some situations, the quality of the estimated transformation greatly depends on hyperparameter choice, and different choices may be required depending on the characteristics of the data. Analyzing the effect of these hyperparameters requires labelled data, which is not commonly available at test-time. In this paper, we propose a method for evaluating the influence of hyperparameters and subsequently selecting an optimal value for given image pairs. Our approach, which we call HyperPredict, implements a Multi-Layer Perceptron that learns the effect of selecting particular hyperparameters for registering an image pair by predicting the resulting segmentation overlap and a measure of deformation smoothness. This approach enables us to select optimal hyperparameters at test time without requiring labelled data, removing the need for a one-size-fits-all cross-validation approach. Furthermore, the criterion used to define the optimal hyperparameter is flexible post-training, allowing us to efficiently select for specific properties. We evaluate our proposed method on the OASIS brain MR dataset using a recent deep learning approach (cLapIRN) and an algorithmic method (Niftyreg). Our results demonstrate good performance in predicting the effects of regularization hyperparameters and highlight the benefits of our image-pair specific approach to hyperparameter selection. + +
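To make the mechanism above concrete, here is a minimal PyTorch sketch (not the authors' implementation) of an MLP that maps image-pair features plus a candidate regularization weight to a predicted overlap and smoothness, together with a helper that scores a grid of candidate values. The feature extractor, hidden sizes, and the 0.1 trade-off weight in the selection criterion are assumptions for illustration.

import torch
import torch.nn as nn

class HyperPredictMLP(nn.Module):
    """Toy stand-in: given features of an image pair and a candidate lambda,
    predict the resulting segmentation overlap (Dice) and a smoothness score."""
    def __init__(self, feat_dim: int, hidden: int = 128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(feat_dim + 1, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 2),  # [predicted Dice, predicted smoothness score]
        )

    def forward(self, pair_feat: torch.Tensor, lam: torch.Tensor) -> torch.Tensor:
        x = torch.cat([pair_feat, lam.unsqueeze(-1)], dim=-1)
        return self.net(x)

def select_lambda(model, pair_feat, candidates):
    """Score a grid of candidate lambdas for one image pair and pick the one
    with the best (user-defined, post-training) trade-off."""
    lams = torch.tensor(candidates, dtype=torch.float32)
    feats = pair_feat.unsqueeze(0).expand(len(candidates), -1)
    with torch.no_grad():
        pred = model(feats, lams)            # (num_candidates, 2)
    dice, smooth = pred[:, 0], pred[:, 1]
    score = dice - 0.1 * smooth              # example criterion; weighting is an assumption
    return candidates[int(score.argmax())]

Because the predictor, not the registration itself, is queried per candidate, sweeping a grid of lambdas at test time stays cheap.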
+
+ comment: Accepted for publication at the Journal of Machine Learning for + Biomedical Imaging (MELBA) https://melba-journal.org/2024:005 +
+
+
+
+
+ + ♻ ☆ Boosting Neural Representations for Videos with a Conditional Decoder CVPR 2024 + + +
+ Implicit neural representations (INRs) have emerged as a promising approach +for video storage and processing, showing remarkable versatility across various +video tasks. However, existing methods often fail to fully leverage their +representation capabilities, primarily due to inadequate alignment of +intermediate features during target frame decoding. This paper introduces a +universal boosting framework for current implicit video representation +approaches. Specifically, we utilize a conditional decoder with a +temporal-aware affine transform module, which uses the frame index as a prior +condition to effectively align intermediate features with target frames. +Besides, we introduce a sinusoidal NeRV-like block to generate diverse +intermediate features and achieve a more balanced parameter distribution, +thereby enhancing the model's capacity. With a high-frequency +information-preserving reconstruction loss, our approach successfully boosts +multiple baseline INRs in the reconstruction quality and convergence speed for +video regression, and exhibits superior inpainting and interpolation results. +Further, we integrate a consistent entropy minimization technique and develop +video codecs based on these boosted INRs. Experiments on the UVG dataset +confirm that our enhanced codecs significantly outperform baseline INRs and +offer competitive rate-distortion performance compared to traditional and +learning-based codecs. Code is available at +https://github.com/Xinjie-Q/Boosting-NeRV. + +
+
+ comment: Accept by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Private, fair and accurate: Training large-scale, privacy-preserving AI + models in medical imaging + + +
+ Artificial intelligence (AI) models are increasingly used in the medical +domain. However, as medical data is highly sensitive, special precautions to +ensure its protection are required. The gold standard for privacy preservation +is the introduction of differential privacy (DP) to model training. Prior work +indicates that DP has negative implications on model accuracy and fairness, +which are unacceptable in medicine and represent a main barrier to the +widespread use of privacy-preserving techniques. In this work, we evaluated the +effect of privacy-preserving training of AI models regarding accuracy and +fairness compared to non-private training. For this, we used two datasets: (1) +A large dataset (N=193,311) of high quality clinical chest radiographs, and (2) +a dataset (N=1,625) of 3D abdominal computed tomography (CT) images, with the +task of classifying the presence of pancreatic ductal adenocarcinoma (PDAC). +Both were retrospectively collected and manually labeled by experienced +radiologists. We then compared non-private deep convolutional neural networks +(CNNs) and privacy-preserving (DP) models with respect to privacy-utility +trade-offs measured as area under the receiver-operator-characteristic curve +(AUROC), and privacy-fairness trade-offs, measured as Pearson's r or +Statistical Parity Difference. We found that, while the privacy-preserving +trainings yielded lower accuracy, they did largely not amplify discrimination +against age, sex or co-morbidity. Our study shows that -- under the challenging +realistic circumstances of a real-life clinical dataset -- the +privacy-preserving training of diagnostic deep learning models is possible with +excellent diagnostic accuracy and fairness. + +
+
+ comment: Published in Communications Medicine. Nature Portfolio +
+
+
+
+
+ + ♻ ☆ Adversarial Examples are Misaligned in Diffusion Model Manifolds IJCNN + + +
+ In recent years, diffusion models (DMs) have drawn significant attention for +their success in approximating data distributions, yielding state-of-the-art +generative results. Nevertheless, the versatility of these models extends +beyond their generative capabilities to encompass various vision applications, +such as image inpainting, segmentation, adversarial robustness, among others. +This study is dedicated to the investigation of adversarial attacks through the +lens of diffusion models. However, our objective does not involve enhancing the +adversarial robustness of image classifiers. Instead, our focus lies in +utilizing the diffusion model to detect and analyze the anomalies introduced by +these attacks on images. To that end, we systematically examine the alignment +of the distributions of adversarial examples when subjected to the process of +transformation using diffusion models. The efficacy of this approach is +assessed across CIFAR-10 and ImageNet datasets, including varying image sizes +in the latter. The results demonstrate a notable capacity to discriminate +effectively between benign and attacked images, providing compelling evidence +that adversarial instances do not align with the learned manifold of the DMs. + +
+
+ comment: accepted at IJCNN +
+
+
+
+
+ + ♻ ☆ Confidence-Aware RGB-D Face Recognition via Virtual Depth Synthesis + + +
+ 2D face recognition encounters challenges in unconstrained environments due +to varying illumination, occlusion, and pose. Recent studies focus on RGB-D +face recognition to improve robustness by incorporating depth information. +However, collecting sufficient paired RGB-D training data is expensive and +time-consuming, hindering wide deployment. In this work, we first construct a +diverse depth dataset generated by 3D Morphable Models for depth model +pre-training. Then, we propose a domain-independent pre-training framework that +utilizes readily available pre-trained RGB and depth models to separately +perform face recognition without needing additional paired data for retraining. +To seamlessly integrate the two distinct networks and harness the complementary +benefits of RGB and depth information for improved accuracy, we propose an +innovative Adaptive Confidence Weighting (ACW). This mechanism is designed to +learn confidence estimates for each modality to achieve modality fusion at the +score level. Our method is simple and lightweight, only requiring ACW training +beyond the backbone models. Experiments on multiple public RGB-D face +recognition benchmarks demonstrate state-of-the-art performance surpassing +previous methods based on depth estimation and feature fusion, validating the +efficacy of our approach. + +
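A minimal sketch of score-level fusion with learned per-modality confidences, in the spirit of the Adaptive Confidence Weighting described above; the layer sizes and the softmax normalization are illustrative assumptions, not the paper's exact design.

import torch
import torch.nn as nn

class ConfidenceWeightedFusion(nn.Module):
    """Each modality gets a small head that predicts a confidence from its own
    features; identity scores are combined with softmax-normalized confidences."""
    def __init__(self, rgb_dim: int, depth_dim: int):
        super().__init__()
        self.rgb_conf = nn.Sequential(nn.Linear(rgb_dim, 64), nn.ReLU(), nn.Linear(64, 1))
        self.depth_conf = nn.Sequential(nn.Linear(depth_dim, 64), nn.ReLU(), nn.Linear(64, 1))

    def forward(self, rgb_feat, depth_feat, rgb_scores, depth_scores):
        # rgb_scores / depth_scores: (B, num_identities) similarity scores from frozen backbones
        conf = torch.cat([self.rgb_conf(rgb_feat), self.depth_conf(depth_feat)], dim=1)
        w = torch.softmax(conf, dim=1)                      # (B, 2) per-modality weights
        return w[:, :1] * rgb_scores + w[:, 1:] * depth_scores

Only this small fusion head needs training, which matches the lightweight, backbone-frozen setting the abstract describes.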
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Progressive3D: Progressively Local Editing for Text-to-3D Content + Creation with Complex Semantic Prompts ICLR2024 + + +
+ Recent text-to-3D generation methods achieve impressive 3D content creation capacity thanks to the advances in image diffusion models and optimizing strategies. However, current methods struggle to generate correct 3D content for a semantically complex prompt, i.e., a prompt describing multiple interacting objects bound to different attributes. In this work, we propose a general framework named Progressive3D, which decomposes the entire generation into a series of locally progressive editing steps to create precise 3D content for complex prompts, and we constrain the content change to only occur in regions determined by user-defined region prompts in each editing step. Furthermore, we propose an overlapped semantic component suppression technique to encourage the optimization process to focus more on the semantic differences between prompts. Extensive experiments demonstrate that the proposed Progressive3D framework generates precise 3D content for prompts with complex semantics and is general for various text-to-3D methods driven by different 3D representations. + +
+
+ comment: Accept by ICLR2024. Project Page: + https://cxh0519.github.io/projects/Progressive3D/ +
+
+
+
+
+ + ♻ ☆ Spatialyze: A Geospatial Video Analytics System with Spatial-Aware + Optimizations + + +
+ Videos that are shot using commodity hardware such as phones and surveillance +cameras record various metadata such as time and location. We encounter such +geospatial videos on a daily basis and such videos have been growing in volume +significantly. Yet, we do not have data management systems that allow users to +interact with such data effectively. + In this paper, we describe Spatialyze, a new framework for end-to-end +querying of geospatial videos. Spatialyze comes with a domain-specific language +where users can construct geospatial video analytic workflows using a 3-step, +declarative, build-filter-observe paradigm. Internally, Spatialyze leverages +the declarative nature of such workflows, the temporal-spatial metadata stored +with videos, and physical behavior of real-world objects to optimize the +execution of workflows. Our results using real-world videos and workflows show +that Spatialyze can reduce execution time by up to 5.3x, while maintaining up +to 97.1% accuracy compared to unoptimized execution. + +
+
+ comment: GitHub Repository: https://github.com/apperception-db/spatialyze +
+
+
+
+
+ + ♻ ☆ Diffusion Model-Based Image Editing: A Survey + + +
+ Denoising diffusion models have emerged as a powerful tool for various image +generation and editing tasks, facilitating the synthesis of visual content in +an unconditional or input-conditional manner. The core idea behind them is +learning to reverse the process of gradually adding noise to images, allowing +them to generate high-quality samples from a complex distribution. In this +survey, we provide an exhaustive overview of existing methods using diffusion +models for image editing, covering both theoretical and practical aspects in +the field. We delve into a thorough analysis and categorization of these works +from multiple perspectives, including learning strategies, user-input +conditions, and the array of specific editing tasks that can be accomplished. +In addition, we pay special attention to image inpainting and outpainting, and +explore both earlier traditional context-driven and current multimodal +conditional methods, offering a comprehensive analysis of their methodologies. +To further evaluate the performance of text-guided image editing algorithms, we +propose a systematic benchmark, EditEval, featuring an innovative metric, LMM +Score. Finally, we address current limitations and envision some potential +directions for future research. The accompanying repository is released at +https://github.com/SiatMMLab/Awesome-Diffusion-Model-Based-Image-Editing-Methods. + +
+
+
+
+
+ + ♻ ☆ Universal Debiased Editing on Foundation Models for Fair Medical Image + Classification + + +
+ In the era of Foundation Models' (FMs) rising prominence in AI, our study addresses the challenge of biases in medical images while using FM APIs, particularly spurious correlations between pixels and sensitive attributes. Traditional methods for bias mitigation face limitations due to the restricted access to web-hosted FMs and difficulties in addressing the underlying bias encoded within the FM API. We propose a U(niversal) D(ebiased) E(diting) strategy, termed UDE, which generates UDE noise to mask such spurious correlations. UDE is capable of mitigating bias both within the FM API embedding and the images themselves. Furthermore, UDE is suitable for both white-box and black-box FM APIs, where we introduce G(reedy) Z(eroth)-O(rder) (GeZO) optimization for the case when the gradient is inaccessible in black-box APIs. Our whole pipeline enables fairness-aware image editing that can be applied across various medical contexts without requiring direct model manipulation or significant computational resources. Our empirical results demonstrate the method's effectiveness in maintaining fairness and utility across different patient groups and diseases. In the era of AI-driven medicine, this work contributes to making healthcare diagnostics more equitable, showcasing a practical solution for bias mitigation in pre-trained image FMs. + +
+
+
+
+
+ + ♻ ☆ InstructCV: Instruction-Tuned Text-to-Image Diffusion Models as Vision + Generalists ICLR 2024 + + +
+ Recent advances in generative diffusion models have enabled text-controlled +synthesis of realistic and diverse images with impressive quality. Despite +these remarkable advances, the application of text-to-image generative models +in computer vision for standard visual recognition tasks remains limited. The +current de facto approach for these tasks is to design model architectures and +loss functions that are tailored to the task at hand. In this paper, we develop +a unified language interface for computer vision tasks that abstracts away +task-specific design choices and enables task execution by following natural +language instructions. Our approach involves casting multiple computer vision +tasks as text-to-image generation problems. Here, the text represents an +instruction describing the task, and the resulting image is a visually-encoded +task output. To train our model, we pool commonly-used computer vision datasets +covering a range of tasks, including segmentation, object detection, depth +estimation, and classification. We then use a large language model to +paraphrase prompt templates that convey the specific tasks to be conducted on +each image, and through this process, we create a multi-modal and multi-task +training dataset comprising input and output images along with annotated +instructions. Following the InstructPix2Pix architecture, we apply +instruction-tuning to a text-to-image diffusion model using our constructed +dataset, steering its functionality from a generative model to an +instruction-guided multi-task vision learner. Experiments demonstrate that our +model, dubbed InstructCV, performs competitively compared to other generalist +and task-specific vision models. Moreover, it exhibits compelling +generalization capabilities to unseen data, categories, and user instructions. + +
+
+ comment: ICLR 2024; Code is available at https://github.com/AlaaLab/InstructCV +
+
+
+
+
+ + ♻ ☆ Diffusion in Diffusion: Cyclic One-Way Diffusion for + Text-Vision-Conditioned Generation ICLR2024 + + +
+ Originating from the diffusion phenomenon in physics that describes particle +movement, the diffusion generative models inherit the characteristics of +stochastic random walk in the data space along the denoising trajectory. +However, the intrinsic mutual interference among image regions contradicts the +need for practical downstream application scenarios where the preservation of +low-level pixel information from given conditioning is desired (e.g., +customization tasks like personalized generation and inpainting based on a +user-provided single image). In this work, we investigate the diffusion +(physics) in diffusion (machine learning) properties and propose our Cyclic +One-Way Diffusion (COW) method to control the direction of diffusion phenomenon +given a pre-trained frozen diffusion model for versatile customization +application scenarios, where the low-level pixel information from the +conditioning needs to be preserved. Notably, unlike most current methods that +incorporate additional conditions by fine-tuning the base text-to-image +diffusion model or learning auxiliary networks, our method provides a novel +perspective to understand the task needs and is applicable to a wider range of +customization scenarios in a learning-free manner. Extensive experiment results +show that our proposed COW can achieve more flexible customization based on +strict visual conditions in different application settings. Project page: +https://wangruoyu02.github.io/cow.github.io/. + +
+
+ comment: Accepted by ICLR2024, 21 pages with 18 figures +
+
+
+
+
+ + ♻ ☆ IMPUS: Image Morphing with Perceptually-Uniform Sampling Using Diffusion + Models ICLR 2024 + + +
+ We present a diffusion-based image morphing approach with +perceptually-uniform sampling (IMPUS) that produces smooth, direct and +realistic interpolations given an image pair. The embeddings of two images may +lie on distinct conditioned distributions of a latent diffusion model, +especially when they have significant semantic difference. To bridge this gap, +we interpolate in the locally linear and continuous text embedding space and +Gaussian latent space. We first optimize the endpoint text embeddings and then +map the images to the latent space using a probability flow ODE. Unlike +existing work that takes an indirect morphing path, we show that the model +adaptation yields a direct path and suppresses ghosting artifacts in the +interpolated images. To achieve this, we propose a heuristic bottleneck +constraint based on a novel relative perceptual path diversity score that +automatically controls the bottleneck size and balances the diversity along the +path with its directness. We also propose a perceptually-uniform sampling +technique that enables visually smooth changes between the interpolated images. +Extensive experiments validate that our IMPUS can achieve smooth, direct, and +realistic image morphing and is adaptable to several other generative tasks. + +
+
+ comment: Published as a conference paper at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ TFS-ViT: Token-Level Feature Stylization for Domain Generalization + + +
+ Standard deep learning models such as convolutional neural networks (CNNs) +lack the ability of generalizing to domains which have not been seen during +training. This problem is mainly due to the common but often wrong assumption +of such models that the source and target data come from the same i.i.d. +distribution. Recently, Vision Transformers (ViTs) have shown outstanding +performance for a broad range of computer vision tasks. However, very few +studies have investigated their ability to generalize to new domains. This +paper presents a first Token-level Feature Stylization (TFS-ViT) approach for +domain generalization, which improves the performance of ViTs to unseen data by +synthesizing new domains. Our approach transforms token features by mixing the +normalization statistics of images from different domains. We further improve +this approach with a novel strategy for attention-aware stylization, which uses +the attention maps of class (CLS) tokens to compute and mix normalization +statistics of tokens corresponding to different image regions. The proposed +method is flexible to the choice of backbone model and can be easily applied to +any ViT-based architecture with a negligible increase in computational +complexity. Comprehensive experiments show that our approach is able to achieve +state-of-the-art performance on five challenging benchmarks for domain +generalization, and demonstrate its ability to deal with different types of +domain shifts. The implementation is available at: +https://github.com/Mehrdad-Noori/TFS-ViT_Token-level_Feature_Stylization. + +
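The core operation, mixing normalization statistics of token features across samples from different domains, can be sketched roughly as follows. This is a MixStyle-flavoured approximation rather than the official TFS-ViT code; the Beta parameter and the random pairing across the batch are assumptions.

import torch

def token_feature_stylization(tokens, perm=None, eps=1e-6, alpha=0.1):
    """tokens: (B, N, D) ViT token features. Replace each sample's token
    statistics with a convex mix of its own statistics and those of another
    randomly paired sample, synthesizing new 'styles'/domains."""
    B = tokens.size(0)
    mu = tokens.mean(dim=1, keepdim=True)                  # (B, 1, D)
    sig = tokens.std(dim=1, keepdim=True) + eps
    normed = (tokens - mu) / sig
    if perm is None:
        perm = torch.randperm(B, device=tokens.device)
    lam = torch.distributions.Beta(alpha, alpha).sample((B, 1, 1)).to(tokens.device)
    mu_mix = lam * mu + (1 - lam) * mu[perm]
    sig_mix = lam * sig + (1 - lam) * sig[perm]
    return normed * sig_mix + mu_mix

The attention-aware variant mentioned above would additionally weight the per-token statistics by CLS attention maps before mixing.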
+
+
+
+
+ + ♻ ☆ EdgeOL: Efficient in-situ Online Learning on Edge Devices + + +
+ Emerging applications, such as robot-assisted eldercare and object +recognition, generally employ deep learning neural networks (DNNs) and +naturally require: i) handling streaming-in inference requests and ii) adapting +to possible deployment scenario changes. Online model fine-tuning is widely +adopted to satisfy these needs. However, an inappropriate fine-tuning scheme +could involve significant energy consumption, making it challenging to deploy +on edge devices. In this paper, we propose EdgeOL, an edge online learning +framework that optimizes inference accuracy, fine-tuning execution time, and +energy efficiency through both inter-tuning and intra-tuning optimizations. +Experimental results show that, on average, EdgeOL reduces overall fine-tuning +execution time by 64%, energy consumption by 52%, and improves average +inference accuracy by 1.75% over the immediate online learning strategy. + +
+
+
+
+
+ + ♻ ☆ PeLK: Parameter-efficient Large Kernel ConvNets with Peripheral + Convolution CVPR 2024 + + +
+ Recently, some large kernel convnets strike back with appealing performance and efficiency. However, given the quadratic complexity of convolution, scaling up kernels can bring about an enormous number of parameters, and the proliferated parameters can induce severe optimization problems. Due to these issues, current CNNs compromise to scale up to 51x51 in the form of stripe convolution (i.e., 51x5 + 5x51) and start to saturate as the kernel size continues to grow. In this paper, we delve into addressing these vital issues and explore whether we can continue scaling up kernels for more performance gains. Inspired by human vision, we propose a human-like peripheral convolution that efficiently reduces over 90% of the parameter count of dense grid convolution through parameter sharing, and manage to scale up the kernel size to extremely large values. Our peripheral convolution behaves highly similarly to human peripheral vision, reducing the complexity of convolution from O(K^2) to O(logK) without hurting performance. Built on this, we propose Parameter-efficient Large Kernel Network (PeLK). Our PeLK outperforms modern vision Transformers and ConvNet architectures like Swin, ConvNeXt, RepLKNet and SLaK on various vision tasks including ImageNet classification, semantic segmentation on ADE20K and object detection on MS COCO. For the first time, we successfully scale up the kernel size of CNNs to an unprecedented 101x101 and demonstrate consistent improvements. + +
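The parameter-sharing idea can be illustrated with a toy depthwise convolution whose kernel has a dense centre and a periphery generated by nearest-neighbour upsampling of a small coarse grid, so many peripheral taps reuse one weight. This is only a rough sketch under assumed kernel/centre/coarse sizes; the paper's actual exponentially growing sharing pattern differs.

import torch
import torch.nn as nn
import torch.nn.functional as F

class PeripheralSharedKernel(nn.Module):
    """Toy depthwise conv: dense centre (one weight per tap), shared periphery
    (coarse grid upsampled to the full kernel size)."""
    def __init__(self, channels: int, kernel_size: int = 51, center: int = 5, coarse: int = 9):
        super().__init__()
        self.k, self.c, self.channels = kernel_size, center, channels
        self.center_w = nn.Parameter(torch.randn(channels, 1, center, center) * 0.02)
        self.coarse_w = nn.Parameter(torch.randn(channels, 1, coarse, coarse) * 0.02)

    def forward(self, x):
        k, c = self.k, self.c
        periph = F.interpolate(self.coarse_w, size=(k, k), mode="nearest")
        kernel = periph.clone()
        s = (k - c) // 2
        kernel[:, :, s:s + c, s:s + c] = self.center_w        # dense centre overrides sharing
        return F.conv2d(x, kernel, padding=k // 2, groups=self.channels)

Here a 51x51 depthwise kernel is parameterized by only 5*5 + 9*9 weights per channel, which is the flavour of saving the abstract describes.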
+
+ comment: CVPR 2024; Modification for Fig.1(b); Add Acknowledgements +
+
+
+
+
+ + ♻ ☆ Deep Unsupervised Learning Using Spike-Timing-Dependent Plasticity + + +
+ Spike-Timing-Dependent Plasticity (STDP) is an unsupervised learning +mechanism for Spiking Neural Networks (SNNs) that has received significant +attention from the neuromorphic hardware community. However, scaling such local +learning techniques to deeper networks and large-scale tasks has remained +elusive. In this work, we investigate a Deep-STDP framework where a rate-based +convolutional network, that can be deployed in a neuromorphic setting, is +trained in tandem with pseudo-labels generated by the STDP clustering process +on the network outputs. We achieve $24.56\%$ higher accuracy and $3.5\times$ +faster convergence speed at iso-accuracy on a 10-class subset of the Tiny +ImageNet dataset in contrast to a $k$-means clustering approach. + +
+
+
+
+
+ + ♻ ☆ MVDream: Multi-view Diffusion for 3D Generation + + +
+ We introduce MVDream, a diffusion model that is able to generate consistent +multi-view images from a given text prompt. Learning from both 2D and 3D data, +a multi-view diffusion model can achieve the generalizability of 2D diffusion +models and the consistency of 3D renderings. We demonstrate that such a +multi-view diffusion model is implicitly a generalizable 3D prior agnostic to +3D representations. It can be applied to 3D generation via Score Distillation +Sampling, significantly enhancing the consistency and stability of existing +2D-lifting methods. It can also learn new concepts from a few 2D examples, akin +to DreamBooth, but for 3D generation. + +
+
+ comment: Camera-ready version; Our project page is https://MV-Dream.github.io +
+
+
+
+
+ + ♻ ☆ Empirical Validation of Conformal Prediction for Trustworthy Skin + Lesions Classification + + +
+ Background and objective: Uncertainty quantification is a pivotal field that contributes to realizing reliable and robust systems. It becomes instrumental in fortifying safe decisions by providing complementary information, particularly within high-risk applications. Existing studies have explored various methods that often operate under specific assumptions or necessitate substantial modifications to the network architecture to effectively account for uncertainties. The objective of this paper is to study Conformal Prediction, an emerging distribution-free uncertainty quantification technique, and provide a comprehensive understanding of the advantages and limitations inherent in various methods within the medical imaging field. Methods: In this study, we developed Conformal Prediction, Monte Carlo Dropout, and Evidential Deep Learning approaches to assess uncertainty quantification in deep neural networks. The effectiveness of these methods is evaluated using three public medical imaging datasets focused on detecting pigmented skin lesions and blood cell types. Results: The experimental results demonstrate a significant enhancement in uncertainty quantification with the utilization of the Conformal Prediction method, surpassing the performance of the other two methods. Furthermore, the results present insights into the effectiveness of each uncertainty method in handling Out-of-Distribution samples from domain-shifted datasets. Our code is available at: Conclusions: Our conclusions highlight the robust and consistent performance of conformal prediction across diverse testing conditions. This positions it as the preferred choice for decision-making in safety-critical applications. + +
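For readers unfamiliar with the technique being evaluated, a generic split conformal prediction routine for classification looks roughly like the following. This is the standard textbook recipe, not the paper's specific implementation; the nonconformity score and coverage level are the usual default choices.

import numpy as np

def split_conformal_sets(cal_probs, cal_labels, test_probs, alpha=0.1):
    """cal_probs: (n, K) softmax outputs on a held-out calibration set,
    cal_labels: (n,) integer labels, test_probs: (m, K).
    Returns a boolean (m, K) matrix of prediction sets with ~(1 - alpha) marginal coverage."""
    n = len(cal_labels)
    scores = 1.0 - cal_probs[np.arange(n), cal_labels]      # nonconformity of the true class
    rank = min(int(np.ceil((n + 1) * (1.0 - alpha))), n)    # finite-sample corrected rank
    qhat = np.sort(scores)[rank - 1]
    return (1.0 - test_probs) <= qhat                       # include every class under the threshold

Monte Carlo Dropout and Evidential Deep Learning instead attach uncertainty to the predictive distribution itself; conformal prediction only needs held-out calibration scores, which is what makes it distribution-free.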
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 238 + +
+
+
+ + ☆ Leveraging Synthetic Data for Generalizable and Fair Facial Action Unit + Detection + + +
+ Facial action unit (AU) detection is a fundamental block for objective facial +expression analysis. Supervised learning approaches require a large amount of +manual labeling which is costly. The limited labeled data are also not diverse +in terms of gender which can affect model fairness. In this paper, we propose +to use synthetically generated data and multi-source domain adaptation (MSDA) +to address the problems of the scarcity of labeled data and the diversity of +subjects. Specifically, we propose to generate a diverse dataset through +synthetic facial expression re-targeting by transferring the expressions from +real faces to synthetic avatars. Then, we use MSDA to transfer the AU detection +knowledge from a real dataset and the synthetic dataset to a target dataset. +Instead of aligning the overall distributions of different domains, we propose +Paired Moment Matching (PM2) to align the features of the paired real and +synthetic data with the same facial expression. To further improve gender +fairness, PM2 matches the features of the real data with a female and a male +synthetic image. Our results indicate that synthetic data and the proposed +model improve both AU detection performance and fairness across genders, +demonstrating its potential to solve AU detection in-the-wild. + +
+
+ comment: The work was done in 2021 +
+
+
+
+
+ + ☆ Giving a Hand to Diffusion Models: a Two-Stage Approach to Improving + Conditional Human Image Generation + + +
+ Recent years have seen significant progress in human image generation, +particularly with the advancements in diffusion models. However, existing +diffusion methods encounter challenges when producing consistent hand anatomy +and the generated images often lack precise control over the hand pose. To +address this limitation, we introduce a novel approach to pose-conditioned +human image generation, dividing the process into two stages: hand generation +and subsequent body out-painting around the hands. We propose training the hand +generator in a multi-task setting to produce both hand images and their +corresponding segmentation masks, and employ the trained model in the first +stage of generation. An adapted ControlNet model is then used in the second +stage to outpaint the body around the generated hands, producing the final +result. A novel blending technique is introduced to preserve the hand details +during the second stage that combines the results of both stages in a coherent +way. This involves sequential expansion of the out-painted region while fusing +the latent representations, to ensure a seamless and cohesive synthesis of the +final image. Experimental evaluations demonstrate the superiority of our +proposed method over state-of-the-art techniques, in both pose accuracy and +image quality, as validated on the HaGRID dataset. Our approach not only +enhances the quality of the generated hands but also offers improved control +over hand pose, advancing the capabilities of pose-conditioned human image +generation. The source code of the proposed approach is available at +https://github.com/apelykh/hand-to-diffusion. + +
+
+
+
+
+ + ☆ Cannabis Seed Variant Detection using Faster R-CNN CCS 2024 + + +
+ Analyzing and detecting cannabis seed variants is crucial for the agriculture +industry. It enables precision breeding, allowing cultivators to selectively +enhance desirable traits. Accurate identification of seed variants also ensures +regulatory compliance, facilitating the cultivation of specific cannabis +strains with defined characteristics, ultimately improving agricultural +productivity and meeting diverse market demands. This paper presents a study on +cannabis seed variant detection by employing a state-of-the-art object +detection model Faster R-CNN. This study implemented the model on a locally +sourced cannabis seed dataset in Thailand, comprising 17 distinct classes. We +evaluate six Faster R-CNN models by comparing performance on various metrics +and achieving a mAP score of 94.08\% and an F1 score of 95.66\%. This paper +presents the first known application of deep neural network object detection +models to the novel task of visually identifying cannabis seed types. + +
+
+ comment: 6 pages, 2 figures, this has been submitted and accepted for + publication at IEEE - ICACCS 2024 +
+
+
+
+
+ + ☆ PyHySCO: GPU-Enabled Susceptibility Artifact Distortion Correction in + Seconds + + +
+ Over the past decade, reversed Gradient Polarity (RGP) methods have become a +popular approach for correcting susceptibility artifacts in Echo-Planar Imaging +(EPI). Although several post-processing tools for RGP are available, their +implementations do not fully leverage recent hardware, algorithmic, and +computational advances, leading to correction times of several minutes per +image volume. To enable 3D RGP correction in seconds, we introduce PyHySCO, a +user-friendly EPI distortion correction tool implemented in PyTorch that +enables multi-threading and efficient use of graphics processing units (GPUs). +PyHySCO uses a time-tested physical distortion model and mathematical +formulation and is, therefore, reliable without training. An algorithmic +improvement in PyHySCO is its novel initialization scheme that uses 1D optimal +transport. PyHySCO is published under the GNU public license and can be used +from the command line or its Python interface. Our extensive numerical +validation using 3T and 7T data from the Human Connectome Project suggests that +PyHySCO achieves accuracy comparable to that of leading RGP tools at a fraction +of the cost. We also validate the new initialization scheme, compare different +optimization algorithms, and test the algorithm on different hardware and +arithmetic precision. + +
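The 1D optimal transport used for initialization has a closed form via CDF matching; a generic numpy illustration is below. It is shown only to explain the idea, not PyHySCO's actual initialization code, and the histogram inputs and bin centres are assumptions.

import numpy as np

def ot_map_1d(source_hist, target_hist, bin_centers):
    """Generic 1D optimal transport map between two histograms via CDF matching:
    in 1D the optimal map is monotone and given by F_target^{-1} composed with F_source."""
    fs = np.cumsum(source_hist) / np.sum(source_hist)
    ft = np.cumsum(target_hist) / np.sum(target_hist)
    # For each source bin, find where its CDF level lands on the target CDF.
    return np.interp(fs, ft, bin_centers)

Because the 1D problem reduces to sorting/CDF inversion, such an initialization is cheap enough to run per image column, which fits the tool's emphasis on speed.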
+
+ comment: 20 pages, 9 figures +
+
+
+
+
+ + ☆ IMPRINT: Generative Object Compositing by Learning Identity-Preserving + Representation + + +
+ Generative object compositing emerges as a promising new avenue for +compositional image editing. However, the requirement of object identity +preservation poses a significant challenge, limiting practical usage of most +existing methods. In response, this paper introduces IMPRINT, a novel +diffusion-based generative model trained with a two-stage learning framework +that decouples learning of identity preservation from that of compositing. The +first stage is targeted for context-agnostic, identity-preserving pretraining +of the object encoder, enabling the encoder to learn an embedding that is both +view-invariant and conducive to enhanced detail preservation. The subsequent +stage leverages this representation to learn seamless harmonization of the +object composited to the background. In addition, IMPRINT incorporates a +shape-guidance mechanism offering user-directed control over the compositing +process. Extensive experiments demonstrate that IMPRINT significantly +outperforms existing methods and various baselines on identity preservation and +composition quality. + +
+
+
+
+
+ + ☆ Robust Influence-based Training Methods for Noisy Brain MRI + + +
+ Correctly classifying brain tumors is imperative to the prompt and accurate +treatment of a patient. While several classification algorithms based on +classical image processing or deep learning methods have been proposed to +rapidly classify tumors in MR images, most assume the unrealistic setting of +noise-free training data. In this work, we study a difficult but realistic +setting of training a deep learning model on noisy MR images to classify brain +tumors. We propose two training methods that are robust to noisy MRI training +data, Influence-based Sample Reweighing (ISR) and Influence-based Sample +Perturbation (ISP), which are based on influence functions from robust +statistics. Using the influence functions, in ISR, we adaptively reweigh +training examples according to how helpful/harmful they are to the training +process, while in ISP, we craft and inject helpful perturbation proportional to +the influence score. Both ISR and ISP harden the classification model against +noisy training data without significantly affecting the generalization ability +of the model on test data. We conduct empirical evaluations over a common brain +tumor dataset and compare ISR and ISP to three baselines. Our empirical results +show that ISR and ISP can efficiently train deep learning models robust against +noisy training data. + +
+
+
+
+
+ + ☆ On the low-shot transferability of [V]-Mamba + + +
+ The strength of modern large-scale neural networks lies in their ability to +efficiently adapt to new tasks with few examples. Although extensive research +has investigated the transferability of Vision Transformers (ViTs) to various +downstream tasks under diverse constraints, this study shifts focus to explore +the transfer learning potential of [V]-Mamba. We compare its performance with +ViTs across different few-shot data budgets and efficient transfer methods. Our +analysis yields three key insights into [V]-Mamba's few-shot transfer +performance: (a) [V]-Mamba demonstrates superior or equivalent few-shot +learning capabilities compared to ViTs when utilizing linear probing (LP) for +transfer, (b) Conversely, [V]-Mamba exhibits weaker or similar few-shot +learning performance compared to ViTs when employing visual prompting (VP) as +the transfer method, and (c) We observe a weak positive correlation between the +performance gap in transfer via LP and VP and the scale of the [V]-Mamba model. +This preliminary analysis lays the foundation for more comprehensive studies +aimed at furthering our understanding of the capabilities of [V]-Mamba variants +and their distinctions from ViTs. + +
+
+ comment: Preprint (Work in progress) +
+
+
+
+
+ + ☆ EAGLE: An Edge-Aware Gradient Localization Enhanced Loss for CT Image + Reconstruction + + +
+ Computed Tomography (CT) image reconstruction is crucial for accurate +diagnosis and deep learning approaches have demonstrated significant potential +in improving reconstruction quality. However, the choice of loss function +profoundly affects the reconstructed images. Traditional mean squared error +loss often produces blurry images lacking fine details, while alternatives +designed to improve may introduce structural artifacts or other undesirable +effects. To address these limitations, we propose Eagle-Loss, a novel loss +function designed to enhance the visual quality of CT image reconstructions. +Eagle-Loss applies spectral analysis of localized features within gradient +changes to enhance sharpness and well-defined edges. We evaluated Eagle-Loss on +two public datasets across low-dose CT reconstruction and CT field-of-view +extension tasks. Our results show that Eagle-Loss consistently improves the +visual quality of reconstructed images, surpassing state-of-the-art methods +across various network architectures. Code and data are available at +\url{https://github.com/sypsyp97/Eagle_Loss}. + +
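A hedged sketch of a loss in this spirit, comparing local magnitude spectra of gradient maps, is shown below; the Sobel filters, patch size, and L1 distance are illustrative choices, and the released Eagle-Loss code should be consulted for the actual formulation.

import torch
import torch.nn.functional as F

def spectral_gradient_loss(pred, target, patch=16):
    """pred, target: (B, 1, H, W) reconstructions. Penalize differences between
    the magnitude spectra of local gradient maps, which targets blurred edges
    more directly than plain MSE."""
    kx = torch.tensor([[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]]).view(1, 1, 3, 3)
    ky = kx.transpose(2, 3)
    kx, ky = kx.to(pred.device, pred.dtype), ky.to(pred.device, pred.dtype)

    def grad_mag(img):
        gx = F.conv2d(img, kx, padding=1)
        gy = F.conv2d(img, ky, padding=1)
        return torch.sqrt(gx ** 2 + gy ** 2 + 1e-8)

    def local_spectrum(img):
        patches = F.unfold(img, kernel_size=patch, stride=patch)   # (B, patch*patch, L)
        patches = patches.transpose(1, 2).reshape(-1, patch, patch)
        return torch.fft.rfft2(patches).abs()

    return F.l1_loss(local_spectrum(grad_mag(pred)), local_spectrum(grad_mag(target)))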
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Latent Object Characteristics Recognition with Visual to Haptic-Audio + Cross-modal Transfer Learning + + +
+ Recognising the characteristics of objects while a robot handles them is crucial for adjusting motions that ensure stable and efficient interactions with containers. Before realising stable and efficient robot motions for handling and transferring such containers, this work aims to recognise latent, unobservable object characteristics. While vision is commonly used for object recognition by robots, it is ineffective for detecting hidden objects. However, recognising objects indirectly using other sensors is a challenging task. To address this challenge, we propose a cross-modal transfer learning approach from vision to haptic-audio. We initially train the model with vision, directly observing the target object. Subsequently, we transfer the latent space learned from vision to a second module, trained only with haptic-audio and motor data. This transfer learning framework facilitates the representation of object characteristics using indirect sensor data, thereby improving recognition accuracy. For evaluating the recognition accuracy of our proposed learning framework, we selected shape, position, and orientation as the object characteristics. Finally, we demonstrate online recognition of both trained and untrained objects using the humanoid robot Nextage Open. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ GS-Pose: Cascaded Framework for Generalizable Segmentation-based 6D + Object Pose Estimation + + +
+ This paper introduces GS-Pose, an end-to-end framework for locating and +estimating the 6D pose of objects. GS-Pose begins with a set of posed RGB +images of a previously unseen object and builds three distinct representations +stored in a database. At inference, GS-Pose operates sequentially by locating +the object in the input image, estimating its initial 6D pose using a retrieval +approach, and refining the pose with a render-and-compare method. The key +insight is the application of the appropriate object representation at each +stage of the process. In particular, for the refinement step, we utilize 3D +Gaussian splatting, a novel differentiable rendering technique that offers high +rendering speed and relatively low optimization time. Off-the-shelf toolchains +and commodity hardware, such as mobile phones, can be used to capture new +objects to be added to the database. Extensive evaluations on the LINEMOD and +OnePose-LowTexture datasets demonstrate excellent performance, establishing the +new state-of-the-art. Project page: https://dingdingcai.github.io/gs-pose. + +
+
+ comment: Project Page: https://dingdingcai.github.io/gs-pose +
+
+
+
+
+ + ☆ Spiking Neural Networks for Fast-Moving Object Detection on Neuromorphic + Hardware Devices Using an Event-Based Camera + + +
+ Table tennis is a fast-paced and exhilarating sport that demands agility, +precision, and fast reflexes. In recent years, robotic table tennis has become +a popular research challenge for robot perception algorithms. Fast and accurate +ball detection is crucial for enabling a robotic arm to rally the ball back +successfully. Previous approaches have employed conventional frame-based +cameras with Convolutional Neural Networks (CNNs) or traditional computer +vision methods. In this paper, we propose a novel solution that combines an +event-based camera with Spiking Neural Networks (SNNs) for ball detection. We +use multiple state-of-the-art SNN frameworks and develop a SNN architecture for +each of them, complying with their corresponding constraints. Additionally, we +implement the SNN solution across multiple neuromorphic edge devices, +conducting comparisons of their accuracies and run-times. This furnishes +robotics researchers with a benchmark illustrating the capabilities achievable +with each SNN framework and a corresponding neuromorphic edge device. Next to +this comparison of SNN solutions for robots, we also show that an SNN on a +neuromorphic edge device is able to run in real-time in a closed loop robotic +system, a table tennis robot in our use case. + +
+
+
+
+
+ + ☆ D-Net: Dynamic Large Kernel with Dynamic Feature Fusion for Volumetric + Medical Image Segmentation + + +
+ Hierarchical transformers have achieved significant success in medical image +segmentation due to their large receptive field and capabilities of effectively +leveraging global long-range contextual information. Convolutional neural +networks (CNNs) can also deliver a large receptive field by using large +kernels, enabling them to achieve competitive performance with fewer model +parameters. However, CNNs incorporated with large convolutional kernels remain +constrained in adaptively capturing multi-scale features from organs with large +variations in shape and size due to the employment of fixed-sized kernels. +Additionally, they are unable to utilize global contextual information +efficiently. To address these limitations, we propose Dynamic Large Kernel +(DLK) and Dynamic Feature Fusion (DFF) modules. The DLK module employs multiple +large kernels with varying kernel sizes and dilation rates to capture +multi-scale features. Subsequently, a dynamic selection mechanism is utilized +to adaptively highlight the most important spatial features based on global +information. Additionally, the DFF module is proposed to adaptively fuse +multi-scale local feature maps based on their global information. We integrate +DLK and DFF in a hierarchical transformer architecture to develop a novel +architecture, termed D-Net. D-Net is able to effectively utilize a multi-scale +large receptive field and adaptively harness global contextual information. +Extensive experimental results demonstrate that D-Net outperforms other +state-of-the-art models in the two volumetric segmentation tasks, including +abdominal multi-organ segmentation and multi-modality brain tumor segmentation. +Our code is available at https://github.com/sotiraslab/DLK. + +
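The dynamic-selection idea can be sketched with a 2D toy module that runs several depthwise convolutions of different kernel size and dilation in parallel and re-weights them with a gate driven by globally pooled context. The branch settings and gating head are assumptions, and the real (volumetric) DLK module may be structured differently.

import torch
import torch.nn as nn

class DynamicLargeKernel(nn.Module):
    """Parallel depthwise branches with varying kernel size/dilation, fused by a
    global-context gate (a 2D stand-in for the volumetric module described above)."""
    def __init__(self, channels, branches=((5, 1), (7, 2), (9, 3))):
        super().__init__()
        self.convs = nn.ModuleList([
            nn.Conv2d(channels, channels, k, padding=d * (k - 1) // 2,
                      dilation=d, groups=channels)
            for k, d in branches
        ])
        self.gate = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channels, len(branches), kernel_size=1),
        )

    def forward(self, x):
        feats = torch.stack([conv(x) for conv in self.convs], dim=1)   # (B, M, C, H, W)
        w = torch.softmax(self.gate(x), dim=1)                         # (B, M, 1, 1)
        return (feats * w.unsqueeze(2)).sum(dim=1)

Swapping nn.Conv2d/nn.AdaptiveAvgPool2d for their 3D counterparts gives the volumetric variant the abstract targets.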
+
+ comment: 12 pages, 4 figures, 2 tables +
+
+
+
+
+ + ☆ Not Just Change the Labels, Learn the Features: Watermarking Deep Neural + Networks with Multi-View Data + + +
+ With the increasing prevalence of Machine Learning as a Service (MLaaS) +platforms, there is a growing focus on deep neural network (DNN) watermarking +techniques. These methods are used to facilitate the verification of ownership +for a target DNN model to protect intellectual property. One of the most widely +employed watermarking techniques involves embedding a trigger set into the +source model. Unfortunately, existing methodologies based on trigger sets are +still susceptible to functionality-stealing attacks, potentially enabling +adversaries to steal the functionality of the source model without a reliable +means of verifying ownership. In this paper, we first introduce a novel +perspective on trigger set-based watermarking methods from a feature learning +perspective. Specifically, we demonstrate that by selecting data exhibiting +multiple features, also referred to as $\textit{multi-view data}$, it becomes +feasible to effectively defend functionality stealing attacks. Based on this +perspective, we introduce a novel watermarking technique based on Multi-view +dATa, called MAT, for efficiently embedding watermarks within DNNs. This +approach involves constructing a trigger set with multi-view data and +incorporating a simple feature-based regularization method for training the +source model. We validate our method across various benchmarks and demonstrate +its efficacy in defending against model extraction attacks, surpassing relevant +baselines by a significant margin. + +
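Whatever the embedding strategy, ownership verification with a trigger set typically reduces to the check sketched below. This is the generic procedure rather than anything MAT-specific, and the 0.8 threshold is a placeholder that would normally be calibrated against independent, non-watermarked models.

import torch

@torch.no_grad()
def verify_watermark(suspect_model, trigger_images, trigger_labels, threshold=0.8):
    """Query the suspect model on the watermark trigger set and flag it if the
    agreement with the embedded labels is unusually high."""
    suspect_model.eval()
    preds = suspect_model(trigger_images).argmax(dim=1)      # assumes the model returns logits
    agreement = (preds == trigger_labels).float().mean().item()
    return agreement, agreement >= threshold

The claim in the abstract is that choosing multi-view trigger data makes this agreement survive functionality-stealing (extraction) attacks, whereas label-only triggers tend not to transfer to the stolen model.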
+
+
+
+
+ + ☆ SwinMTL: A Shared Architecture for Simultaneous Depth Estimation and + Semantic Segmentation from Monocular Camera Images + + +
+ This research paper presents an innovative multi-task learning framework that +allows concurrent depth estimation and semantic segmentation using a single +camera. The proposed approach is based on a shared encoder-decoder +architecture, which integrates various techniques to improve the accuracy of +the depth estimation and semantic segmentation task without compromising +computational efficiency. Additionally, the paper incorporates an adversarial +training component, employing a Wasserstein GAN framework with a critic +network, to refine model's predictions. The framework is thoroughly evaluated +on two datasets - the outdoor Cityscapes dataset and the indoor NYU Depth V2 +dataset - and it outperforms existing state-of-the-art methods in both +segmentation and depth estimation tasks. We also conducted ablation studies to +analyze the contributions of different components, including pre-training +strategies, the inclusion of critics, the use of logarithmic depth scaling, and +advanced image augmentations, to provide a better understanding of the proposed +framework. The accompanying source code is accessible at +\url{https://github.com/PardisTaghavi/SwinMTL}. + +
+
+
+
+
+ + ☆ InterLUDE: Interactions between Labeled and Unlabeled Data to Enhance + Semi-Supervised Learning + + +
+ Semi-supervised learning (SSL) seeks to enhance task performance by training +on both labeled and unlabeled data. Mainstream SSL image classification methods +mostly optimize a loss that additively combines a supervised classification +objective with a regularization term derived solely from unlabeled data. This +formulation neglects the potential for interaction between labeled and +unlabeled images. In this paper, we introduce InterLUDE, a new approach to +enhance SSL made of two parts that each benefit from labeled-unlabeled +interaction. The first part, embedding fusion, interpolates between labeled and +unlabeled embeddings to improve representation learning. The second part is a +new loss, grounded in the principle of consistency regularization, that aims to +minimize discrepancies in the model's predictions between labeled versus +unlabeled inputs. Experiments on standard closed-set SSL benchmarks and a +medical SSL task with an uncurated unlabeled set show clear benefits to our +approach. On the STL-10 dataset with only 40 labels, InterLUDE achieves 3.2% +error rate, while the best previous method reports 14.9%. + +
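A loose sketch of the two ingredients, embedding fusion and a labeled-unlabeled consistency term, might look like this in PyTorch; the fixed mixing coefficient, the reuse of labeled targets for fused embeddings, and the KL form of the consistency loss are simplifying assumptions rather than the paper's exact objective.

import torch
import torch.nn.functional as F

def interlude_style_step(encoder, head, x_lab, y_lab, x_unlab, lam=0.5, w_cons=1.0):
    """Assumes equal labeled/unlabeled batch sizes for simplicity.
    (1) interpolate labeled and unlabeled embeddings before the classifier head;
    (2) add a consistency term tying labeled and unlabeled predictions together."""
    z_lab, z_unlab = encoder(x_lab), encoder(x_unlab)
    z_mix = lam * z_lab + (1 - lam) * z_unlab              # embedding fusion
    sup = F.cross_entropy(head(z_lab), y_lab)
    mix_sup = F.cross_entropy(head(z_mix), y_lab)          # fused embeddings reuse labeled targets (a simplification)
    p_lab = F.softmax(head(z_lab).detach(), dim=1)
    log_p_unlab = F.log_softmax(head(z_unlab), dim=1)
    cons = F.kl_div(log_p_unlab, p_lab, reduction="batchmean")
    return sup + lam * mix_sup + w_cons * cons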
+
+ comment: Semi-supervised Learning; Vision Transformers +
+
+
+
+
+ + ☆ PALM: Pushing Adaptive Learning Rate Mechanisms for Continual Test-Time + Adaptation + + +
+ Real-world vision models in dynamic environments face rapid shifts in domain +distributions, leading to decreased recognition performance. Continual +test-time adaptation (CTTA) directly adjusts a pre-trained source +discriminative model to these changing domains using test data. A highly +effective CTTA method involves applying layer-wise adaptive learning rates, and +selectively adapting pre-trained layers. However, it suffers from the poor +estimation of domain shift and the inaccuracies arising from the pseudo-labels. +In this work, we aim to overcome these limitations by identifying layers +through the quantification of model prediction uncertainty without relying on +pseudo-labels. We utilize the magnitude of gradients as a metric, calculated by +backpropagating the KL divergence between the softmax output and a uniform +distribution, to select layers for further adaptation. Subsequently, for the +parameters exclusively belonging to these selected layers, with the remaining +ones frozen, we evaluate their sensitivity in order to approximate the domain +shift, followed by adjusting their learning rates accordingly. Overall, this +approach leads to a more robust and stable optimization than prior approaches. +We conduct extensive image classification experiments on CIFAR-10C, CIFAR-100C, +and ImageNet-C and demonstrate the efficacy of our method against standard +benchmarks and prior methods. + +
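The layer-selection signal described above can be sketched as follows: backpropagate the divergence between the softmax output and a uniform distribution and rank parameter groups by gradient magnitude. The KL direction and the per-parameter grouping here are assumptions, and mapping the ranking to per-layer learning rates is method-specific.

import torch
import torch.nn.functional as F

def rank_layers_by_uncertainty_gradient(model, x):
    """Return parameters sorted by the norm of the gradient of
    KL(uniform || softmax(logits)) on the current test batch."""
    model.zero_grad()
    logits = model(x)
    log_p = F.log_softmax(logits, dim=1)
    uniform = torch.full_like(log_p, 1.0 / logits.size(1))
    kl = F.kl_div(log_p, uniform, reduction="batchmean")   # divergence to uniform (direction is a convention)
    kl.backward()
    norms = {}
    for name, p in model.named_parameters():
        if p.grad is not None:
            norms[name] = p.grad.norm().item()
    return sorted(norms.items(), key=lambda kv: kv[1], reverse=True)

Parameters with large gradients under this pseudo-label-free signal are the natural candidates for further adaptation; the rest can stay frozen.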
+
+
+
+
+ + ☆ MeDSLIP: Medical Dual-Stream Language-Image Pre-training for + Fine-grained Alignment + + +
+ Vision-language pre-training (VLP) models have shown significant advancements in the medical domain. Yet, most VLP models align raw reports to images at a very coarse level, without modeling fine-grained relationships between anatomical and pathological concepts outlined in reports and the corresponding semantic counterparts in images. To address this problem, we propose a Medical Dual-Stream Language-Image Pre-training (MeDSLIP) framework. Specifically, MeDSLIP establishes vision-language fine-grained alignments via disentangling visual and textual representations into anatomy-relevant and pathology-relevant streams. Moreover, a novel vision-language Prototypical Contrastive Learning (ProtoCL) method is adopted in MeDSLIP to enhance the alignment within the anatomical and pathological streams. MeDSLIP further employs cross-stream Intra-image Contrastive Learning (ICL) to ensure the consistent coexistence of paired anatomical and pathological concepts within the same image. Such a cross-stream regularization encourages the model to exploit the synchrony between two streams for a more comprehensive representation learning. MeDSLIP is evaluated under zero-shot and supervised fine-tuning settings on three public datasets: NIH CXR14, RSNA Pneumonia, and SIIM-ACR Pneumothorax. Under these settings, MeDSLIP outperforms six leading CNN-based models on classification, grounding, and segmentation tasks. + +
+
+
+
+
+ + ☆ Leveraging CLIP for Inferring Sensitive Information and Improving Model + Fairness + + +
+ Performance disparities across sub-populations are known to exist in deep learning-based vision recognition models, but previous work has largely addressed such fairness concerns assuming knowledge of sensitive attribute labels. To overcome this reliance, previous strategies have involved separate learning structures to expose and adjust for disparities. In this work, we explore a new paradigm that does not require sensitive attribute labels, and evades the need for extra training by leveraging the vision-language model, CLIP, as a rich knowledge source to infer sensitive information. We present sample clustering based on similarity derived from image and attribute-specified language embeddings and assess its correspondence to the true attribute distribution. We train a target model by re-sampling and augmenting under-performing clusters. Extensive experiments on multiple benchmark bias datasets show clear fairness gains of the model over existing baselines, which indicates that CLIP can extract discriminative sensitive information prompted by language, which in turn can be used to promote model fairness. + +
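A rough sketch of inferring sensitive groupings without labels, using the public OpenAI CLIP package and k-means on image-to-prompt similarities; the prompt wording, clustering algorithm, and number of clusters are assumptions for illustration, not the paper's pipeline.

import torch
import clip  # OpenAI CLIP package (pip install git+https://github.com/openai/CLIP.git)
from sklearn.cluster import KMeans

@torch.no_grad()
def cluster_by_clip_attribute(image_tensors, attribute_prompts, n_clusters=2, device="cpu"):
    """image_tensors: already CLIP-preprocessed images, (N, 3, 224, 224).
    attribute_prompts: e.g. hypothetical prompts describing the sensitive attribute.
    Returns a cluster id per image, usable for re-sampling/augmentation."""
    model, _ = clip.load("ViT-B/32", device=device)
    model.eval()
    txt = model.encode_text(clip.tokenize(attribute_prompts).to(device))
    img = model.encode_image(image_tensors.to(device))
    txt = txt / txt.norm(dim=-1, keepdim=True)
    img = img / img.norm(dim=-1, keepdim=True)
    sims = (img @ txt.T).cpu().numpy()                 # (N, num_prompts) similarity profile
    return KMeans(n_clusters=n_clusters, n_init=10).fit_predict(sims)

The resulting pseudo-groups stand in for the missing sensitive labels when re-sampling or augmenting under-performing clusters.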
+
+
+
+
+ + ☆ NeuralOCT: Airway OCT Analysis via Neural Fields + + +
+ Optical coherence tomography (OCT) is a popular modality in ophthalmology and is also used intravascularly. Our interest in this work is OCT in the context of airway abnormalities in infants and children, where the high resolution of OCT and the fact that it is radiation-free are important. The goal of airway OCT is to provide accurate estimates of airway geometry (in 2D and 3D) to assess airway abnormalities such as subglottic stenosis. We propose $\texttt{NeuralOCT}$, a learning-based approach to process airway OCT images. Specifically, $\texttt{NeuralOCT}$ extracts 3D geometries from OCT scans by robustly bridging two steps: point cloud extraction via 2D segmentation and 3D reconstruction from point clouds via neural fields. Our experiments show that $\texttt{NeuralOCT}$ produces accurate and robust 3D airway reconstructions with an average A-line error smaller than 70 micrometers. Our code will be available on GitHub. + +
+
+
+
+
+ + ☆ LightIt: Illumination Modeling and Control for Diffusion Models + + +
+ We introduce LightIt, a method for explicit illumination control for image +generation. Recent generative methods lack lighting control, which is crucial +to numerous artistic aspects of image generation such as setting the overall +mood or cinematic appearance. To overcome these limitations, we propose to +condition the generation on shading and normal maps. We model the lighting with +single bounce shading, which includes cast shadows. We first train a shading +estimation module to generate a dataset of real-world images and shading pairs. +Then, we train a control network using the estimated shading and normals as +input. Our method demonstrates high-quality image generation and lighting +control in numerous scenes. Additionally, we use our generated dataset to train +an identity-preserving relighting model, conditioned on an image and a target +shading. Our method is the first that enables the generation of images with +controllable, consistent lighting and performs on par with specialized +relighting state-of-the-art methods. + +
+
+ comment: Project page: https://peter-kocsis.github.io/LightIt/ Video: + https://youtu.be/cCfSBD5aPLI +
+
+
+
+
+ + ☆ SurvRNC: Learning Ordered Representations for Survival Prediction using + Rank-N-Contrast + + +
+ Predicting the likelihood of survival is of paramount importance for
+individuals diagnosed with cancer as it provides invaluable information
+regarding prognosis at an early stage. This knowledge enables the formulation
+of effective treatment plans that lead to improved patient outcomes. In the
+past few years, deep learning models have provided a feasible solution for
+assessing medical images, electronic health records, and genomic data to
+estimate cancer risk scores. However, these models often fall short of their
+potential because they struggle to learn regression-aware feature
+representations. In this study, we propose the Survival Rank-N Contrast
+(SurvRNC) method, which introduces a loss function as a regularizer to obtain
+an ordered representation based on the survival times. This function can
+handle censored data and can be incorporated into any survival model to ensure
+that the learned representation is ordinal. The model was extensively
+evaluated on the HEad \& NeCK TumOR (HECKTOR) segmentation and
+outcome-prediction dataset. We demonstrate that using the SurvRNC method for
+training can achieve higher performance on different deep survival models.
+Additionally, it outperforms state-of-the-art methods by 3.6% on the
+concordance index. The code is publicly available at
+https://github.com/numanai/SurvRNC.
+
+
+
+
+
+
+ + ☆ P-MapNet: Far-seeing Map Generator Enhanced by both SDMap and HDMap + Priors + + +
+ Autonomous vehicles are gradually entering city roads today, with the help of
+high-definition maps (HDMaps). However, the reliance on HDMaps prevents
+autonomous vehicles from stepping into regions without this expensive digital
+infrastructure. This fact drives many researchers to study online HDMap
+generation algorithms, but the performance of these algorithms in far regions
+is still unsatisfactory. We present P-MapNet, in which the letter P highlights
+the fact that we focus on incorporating map priors to improve model
+performance. Specifically, we exploit priors in both SDMap and HDMap. On one
+hand, we extract weakly aligned SDMap from OpenStreetMap and encode it as an
+additional conditioning branch. Despite the misalignment challenge, our
+attention-based architecture adaptively attends to relevant SDMap skeletons and
+significantly improves performance. On the other hand, we exploit a masked
+autoencoder to capture the prior distribution of HDMap, which can serve as a
+refinement module to mitigate occlusions and artifacts. We benchmark on the
+nuScenes and Argoverse2 datasets. Through comprehensive experiments, we show
+that: (1) our SDMap prior can improve online map generation performance, using
+both rasterized (by up to $+18.73$ $\rm mIoU$) and vectorized (by up to $+8.50$
+$\rm mAP$) output representations; (2) our HDMap prior can improve map
+perceptual metrics by up to $6.34\%$; (3) P-MapNet can be switched into
+different inference modes that cover different regions of the
+accuracy-efficiency trade-off landscape; (4) P-MapNet is a far-seeing solution
+that brings larger improvements on longer ranges. Codes and models are publicly
+available at https://jike5.github.io/P-MapNet.
+
+
+
+
+
+
+ + ☆ Strong and Controllable Blind Image Decomposition + + +
+ Blind image decomposition aims to decompose all components present in an
+image, typically used to restore a multi-degraded input image. While fully
+recovering the clean image is appealing, in some scenarios, users might want to
+retain certain degradations, such as watermarks, for copyright protection. To
+address this need, we add controllability to the blind image decomposition
+process, allowing users to specify which types of degradation to remove or
+retain. We design an architecture named controllable blind image decomposition
+network. Inserted in the middle of a U-Net structure, our method first
+decomposes the input feature maps and then recombines them according to user
+instructions. Advantageously, this functionality is implemented at minimal
+computational cost: decomposition and recombination are all parameter-free.
+Experimentally, our system excels in blind image decomposition tasks and can
+output partially or fully restored images that reflect user intentions well.
+Furthermore, we evaluate and configure different options for the network
+structure and loss functions. This, combined with the proposed
+decomposition-and-recombination method, yields an efficient and competitive
+system for blind image decomposition, compared with current state-of-the-art
+methods.
+
+
+
+ comment: Code: https://github.com/Zhangzeyu97/CBD.git +
+
+
+
+
+ + ☆ Frozen Feature Augmentation for Few-Shot Image Classification CVPR 2024 + + +
+ Training a linear classifier or lightweight model on top of pretrained vision +model outputs, so-called 'frozen features', leads to impressive performance on +a number of downstream few-shot tasks. Currently, frozen features are not +modified during training. On the other hand, when networks are trained directly +on images, data augmentation is a standard recipe that improves performance +with no substantial overhead. In this paper, we conduct an extensive pilot +study on few-shot image classification that explores applying data +augmentations in the frozen feature space, dubbed 'frozen feature augmentation +(FroFA)', covering twenty augmentations in total. Our study demonstrates that +adopting a deceptively simple pointwise FroFA, such as brightness, can improve +few-shot performance consistently across three network architectures, three +large pretraining datasets, and eight transfer datasets. + +
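To ground the idea of augmenting in frozen feature space, here is a minimal sketch that applies a brightness-style pointwise perturbation to cached features before fitting a small linear head. The offset range, the head, and the random stand-in features are assumptions; the paper's FroFA recipe is more careful than this simple additive shift.

```python
# Sketch: pointwise "brightness"-style augmentation applied to frozen features
# before training a lightweight linear head. Ranges and data are placeholders.
import torch
import torch.nn as nn

def brightness_frofa(features, max_delta=0.2):
    """Shift each sample's features by a random offset, mimicking a brightness change."""
    delta = (torch.rand(features.size(0), 1, device=features.device) * 2 - 1) * max_delta
    return features + delta

frozen_features = torch.randn(512, 768)          # stand-in for cached backbone outputs
labels = torch.randint(0, 10, (512,))
head = nn.Linear(768, 10)
optimizer = torch.optim.AdamW(head.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for _ in range(100):
    augmented = brightness_frofa(frozen_features)  # augmentation happens in feature space
    loss = criterion(head(augmented), labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
```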
+
+ comment: CVPR 2024 (18 pages, main paper + supplementary material) +
+
+
+
+
+ + ☆ Lodge: A Coarse to Fine Diffusion Network for Long Dance Generation + Guided by the Characteristic Dance Primitives + + +
+ We propose Lodge, a network capable of generating extremely long dance
+sequences conditioned on given music. We design Lodge as a two-stage
+coarse-to-fine diffusion architecture, and propose the characteristic dance
+primitives that possess significant expressiveness as intermediate
+representations between the two diffusion models. The first stage is global
+diffusion, which focuses on comprehending the coarse-level music-dance
+correlation and producing characteristic dance primitives. In contrast, the
+second stage is local diffusion, which generates detailed motion sequences in
+parallel under the guidance of the dance primitives and choreographic rules. In
+addition, we propose a Foot Refine Block to optimize the contact between the
+feet and the ground, enhancing the physical realism of the motion. Our approach
+can generate extremely long dance sequences in parallel, striking a balance
+between global choreographic patterns and local motion quality and
+expressiveness. Extensive experiments validate the efficacy of our method.
+
+
+
+
+
+
+ + ☆ VideoAgent: Long-form Video Understanding with Large Language Model as + Agent + + +
+ Long-form video understanding represents a significant challenge within +computer vision, demanding a model capable of reasoning over long multi-modal +sequences. Motivated by the human cognitive process for long-form video +understanding, we emphasize interactive reasoning and planning over the ability +to process lengthy visual inputs. We introduce a novel agent-based system, +VideoAgent, that employs a large language model as a central agent to +iteratively identify and compile crucial information to answer a question, with +vision-language foundation models serving as tools to translate and retrieve +visual information. Evaluated on the challenging EgoSchema and NExT-QA +benchmarks, VideoAgent achieves 54.1% and 71.3% zero-shot accuracy with only +8.4 and 8.2 frames used on average. These results demonstrate superior +effectiveness and efficiency of our method over the current state-of-the-art +methods, highlighting the potential of agent-based approaches in advancing +long-form video understanding. + +
+
+
+
+
+ + ☆ FeatUp: A Model-Agnostic Framework for Features at Any Resolution ICLR + + +
+ Deep features are a cornerstone of computer vision research, capturing image +semantics and enabling the community to solve downstream tasks even in the +zero- or few-shot regime. However, these features often lack the spatial +resolution to directly perform dense prediction tasks like segmentation and +depth prediction because models aggressively pool information over large areas. +In this work, we introduce FeatUp, a task- and model-agnostic framework to +restore lost spatial information in deep features. We introduce two variants of +FeatUp: one that guides features with high-resolution signal in a single +forward pass, and one that fits an implicit model to a single image to +reconstruct features at any resolution. Both approaches use a multi-view +consistency loss with deep analogies to NeRFs. Our features retain their +original semantics and can be swapped into existing applications to yield +resolution and performance gains even without re-training. We show that FeatUp +significantly outperforms other feature upsampling and image super-resolution +approaches in class activation map generation, transfer learning for +segmentation and depth prediction, and end-to-end training for semantic +segmentation. + +
+
+ comment: Accepted to the International Conference on Learning Representations + (ICLR) 2024 +
+
+
+
+
+ + ☆ A Novel Framework for Multi-Person Temporal Gaze Following and Social + Gaze Prediction + + +
+ Gaze following and social gaze prediction are fundamental tasks providing
+insights into human communication behaviors, intent, and social interactions.
+Most previous approaches addressed these tasks separately, either by designing
+highly specialized social gaze models that do not generalize to other social
+gaze tasks or by considering social gaze inference as an ad-hoc post-processing
+of the gaze following task. Furthermore, the vast majority of gaze following
+approaches have proposed static models that can handle only one person at a
+time, therefore failing to take advantage of social interactions and temporal
+dynamics. In this paper, we address these limitations and introduce a novel
+framework to jointly predict the gaze target and social gaze label for all
+people in the scene. The framework comprises: (i) a temporal,
+transformer-based architecture that, in addition to image tokens, handles
+person-specific tokens capturing the gaze information related to each
+individual; (ii) a new dataset, VSGaze, that unifies annotation types across
+multiple gaze following and social gaze datasets. We show that our model
+trained on VSGaze can address all tasks jointly, and achieves state-of-the-art
+results for multi-person gaze following and social gaze prediction.
+
+
+
+
+
+
+ + ☆ Benchmarking Zero-Shot Robustness of Multimodal Foundation Models: A + Pilot Study + + +
+ Pre-training image representations from the raw text about images enables
+zero-shot vision transfer to downstream tasks. Through pre-training on millions
+of samples collected from the internet, multimodal foundation models, such as
+CLIP, produce state-of-the-art zero-shot results that often reach
+competitiveness with fully supervised methods without the need for
+task-specific training. Besides the encouraging performance on classification
+accuracy, it is reported that these models close the robustness gap by matching
+the performance of supervised models trained on ImageNet under natural
+distribution shift. Because robustness is critical to real-world applications,
+especially safety-critical ones, in this paper, we present a comprehensive
+evaluation based on a large-scale robustness benchmark covering 7 natural and 3
+synthetic distribution shifts, as well as 11 adversarial attacks. We use CLIP
+as a pilot study. We show that CLIP leads to a significant robustness drop
+compared to supervised ImageNet models on our benchmark, especially under
+synthetic distribution shift and adversarial attacks. Furthermore, data overlap
+analysis suggests that the observed robustness under natural distribution
+shifts could be attributed, at least in part, to data overlap. In summary, our
+results show that a comprehensive evaluation of robustness is necessary and
+that there is a significant need to improve the robustness of zero-shot
+multimodal models.
+
+
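To illustrate the kind of evaluation described above, the sketch below measures CLIP's zero-shot accuracy on a distribution-shifted image folder. The dataset path and prompt template are placeholders; the actual benchmark additionally covers synthetic shifts and adversarial attacks, which are not reproduced here.

```python
# Sketch: zero-shot CLIP classification accuracy on a shifted test set.
# "path/to/shifted_imagenet_variant" is a hypothetical ImageFolder-style directory.
import torch
import clip
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

dataset = ImageFolder("path/to/shifted_imagenet_variant", transform=preprocess)
loader = DataLoader(dataset, batch_size=64, num_workers=4)

prompts = [f"a photo of a {name}" for name in dataset.classes]
with torch.no_grad():
    text_features = model.encode_text(clip.tokenize(prompts).to(device))
    text_features /= text_features.norm(dim=-1, keepdim=True)

correct = total = 0
with torch.no_grad():
    for images, targets in loader:
        image_features = model.encode_image(images.to(device))
        image_features /= image_features.norm(dim=-1, keepdim=True)
        preds = (image_features @ text_features.T).argmax(dim=-1).cpu()
        correct += (preds == targets).sum().item()
        total += targets.numel()
print(f"zero-shot accuracy under shift: {correct / total:.3f}")
```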
+
+
+
+
+ + ☆ A General Method to Incorporate Spatial Information into Loss Functions + for GAN-based Super-resolution Models + + +
+ Generative Adversarial Networks (GANs) have shown great performance on +super-resolution problems since they can generate more visually realistic +images and video frames. However, these models often introduce side effects +into the outputs, such as unexpected artifacts and noises. To reduce these +artifacts and enhance the perceptual quality of the results, in this paper, we +propose a general method that can be effectively used in most GAN-based +super-resolution (SR) models by introducing essential spatial information into +the training process. We extract spatial information from the input data and +incorporate it into the training loss, making the corresponding loss a +spatially adaptive (SA) one. After that, we utilize it to guide the training +process. We will show that the proposed approach is independent of the methods +used to extract the spatial information and independent of the SR tasks and +models. This method consistently guides the training process towards generating +visually pleasing SR images and video frames, substantially mitigating +artifacts and noise, ultimately leading to enhanced perceptual quality. + +
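As one concrete, hypothetical instantiation of the idea above, the sketch below weights a per-pixel L1 loss by a Sobel edge map of the ground-truth frame. The abstract states the approach is agnostic to how the spatial information is extracted, so the Sobel choice and the weighting scheme are purely illustrative.

```python
# Sketch: a spatially adaptive L1 loss for GAN-based SR, weighting pixels by an
# edge map of the ground truth. Sobel is one illustrative spatial-info extractor.
import torch
import torch.nn.functional as F

def sobel_edge_map(img):
    """Per-pixel gradient magnitude, used here as a spatial importance map."""
    gray = img.mean(dim=1, keepdim=True)
    kx = torch.tensor([[[[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]]]],
                      device=img.device)
    ky = kx.transpose(2, 3)
    gx = F.conv2d(gray, kx, padding=1)
    gy = F.conv2d(gray, ky, padding=1)
    return torch.sqrt(gx ** 2 + gy ** 2 + 1e-8)

def spatially_adaptive_l1(sr, hr, alpha=1.0):
    """L1 loss where edge-rich regions receive a larger weight."""
    weight = 1.0 + alpha * sobel_edge_map(hr)
    return (weight * (sr - hr).abs()).mean()
```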
+
+
+
+
+ + ☆ Mitigating Dialogue Hallucination for Large Multi-modal Models via + Adversarial Instruction Tuning + + +
+ Mitigating hallucinations of Large Multi-modal Models(LMMs) is crucial to +enhance their reliability for general-purpose assistants. This paper shows that +such hallucinations of LMMs can be significantly exacerbated by preceding +user-system dialogues. To precisely measure this, we first present an +evaluation benchmark by extending popular multi-modal benchmark datasets with +prepended hallucinatory dialogues generated by our novel Adversarial Question +Generator, which can automatically generate image-related yet adversarial +dialogues by adopting adversarial attacks on LMMs. On our benchmark, the +zero-shot performance of state-of-the-art LMMs dropped significantly for both +the VQA and Captioning tasks. Next, we further reveal this hallucination is +mainly due to the prediction bias toward preceding dialogues rather than visual +content. To reduce this bias, we propose Adversarial Instruction Tuning that +robustly fine-tunes LMMs on augmented multi-modal instruction-following +datasets with hallucinatory dialogues. Extensive experiments show that our +proposed approach successfully reduces dialogue hallucination while maintaining +or even improving performance. + +
+
+
+
+
+ + ☆ Joint Multimodal Transformer for Dimensional Emotional Recognition in + the Wild + + +
+ Audiovisual emotion recognition (ER) in videos has immense potential over +unimodal performance. It effectively leverages the inter- and intra-modal +dependencies between visual and auditory modalities. This work proposes a novel +audio-visual emotion recognition system utilizing a joint multimodal +transformer architecture with key-based cross-attention. This framework aims to +exploit the complementary nature of audio and visual cues (facial expressions +and vocal patterns) in videos, leading to superior performance compared to +solely relying on a single modality. The proposed model leverages separate +backbones for capturing intra-modal temporal dependencies within each modality +(audio and visual). Subsequently, a joint multimodal transformer architecture +integrates the individual modality embeddings, enabling the model to +effectively capture inter-modal (between audio and visual) and intra-modal +(within each modality) relationships. Extensive evaluations on the challenging +Affwild2 dataset demonstrate that the proposed model significantly outperforms +baseline and state-of-the-art methods in ER tasks. + +
+
+ comment: 5 pages, 1 figure +
+
+
+
+
+ + ☆ Approximate Nullspace Augmented Finetuning for Robust Vision + Transformers + + +
+ Enhancing the robustness of deep learning models, particularly in the realm
+of vision transformers (ViTs), is crucial for their real-world deployment. In
+this work, we provide a finetuning approach to enhance the robustness of vision
+transformers inspired by the concept of nullspace from linear algebra. Our
+investigation centers on whether a vision transformer can exhibit resilience to
+input variations akin to the nullspace property in linear mappings, implying
+that perturbations sampled from this nullspace do not influence the model's
+output when added to the input. Firstly, we show that for many pretrained ViTs,
+a non-trivial nullspace exists due to the presence of the patch embedding
+layer. Secondly, as nullspace is a concept associated with linear algebra, we
+demonstrate that it is possible to synthesize approximate nullspace elements
+for the non-linear blocks of ViTs employing an optimisation strategy. Finally,
+we propose a fine-tuning strategy for ViTs wherein we augment the training data
+with synthesized approximate nullspace noise. After finetuning, we find that
+the model demonstrates robustness to adversarial and natural image
+perturbations alike.
+
+
+
+ comment: 21 pages, 8 figures +
+
+
+
+
+ + ☆ Understanding the Double Descent Phenomenon in Deep Learning + + +
+ Combining empirical risk minimization with capacity control is a classical +strategy in machine learning when trying to control the generalization gap and +avoid overfitting, as the model class capacity gets larger. Yet, in modern deep +learning practice, very large over-parameterized models (e.g. neural networks) +are optimized to fit perfectly the training data and still obtain great +generalization performance. Past the interpolation point, increasing model +complexity seems to actually lower the test error. + In this tutorial, we explain the concept of double descent and its +mechanisms. The first section sets the classical statistical learning framework +and introduces the double descent phenomenon. By looking at a number of +examples, section 2 introduces inductive biases that appear to have a key role +in double descent by selecting, among the multiple interpolating solutions, a +smooth empirical risk minimizer. Finally, section 3 explores the double descent +with two linear models, and gives other points of view from recent related +works. + +
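Since this is a tutorial abstract, a small self-contained experiment may help: the classic random-ReLU-features setup, where minimum-norm least squares is fit with a growing number of features so the test error can be inspected on both sides of the interpolation threshold (around n_features = n_train). All data and feature choices below are illustrative.

```python
# Sketch: random-features experiment for observing double descent.
# np.linalg.lstsq returns the minimum-norm solution in the over-parameterized regime.
import numpy as np

rng = np.random.default_rng(0)
n_train, n_test, d = 100, 1000, 20
X_train, X_test = rng.normal(size=(n_train, d)), rng.normal(size=(n_test, d))
w_true = rng.normal(size=d)
y_train = X_train @ w_true + 0.5 * rng.normal(size=n_train)
y_test = X_test @ w_true

def relu_features(X, W):
    return np.maximum(X @ W, 0.0)

for n_features in [10, 50, 90, 100, 110, 200, 500, 2000]:
    W = rng.normal(size=(d, n_features)) / np.sqrt(d)
    Phi_tr, Phi_te = relu_features(X_train, W), relu_features(X_test, W)
    coef, *_ = np.linalg.lstsq(Phi_tr, y_train, rcond=None)   # min-norm least squares
    test_mse = np.mean((Phi_te @ coef - y_test) ** 2)
    print(f"{n_features:5d} features: test MSE {test_mse:.2f}")
```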
+
+
+
+
+ + ☆ Solving General Noisy Inverse Problem via Posterior Sampling: A Policy + Gradient Viewpoint AISTATS 2024 + + +
+ Solving image inverse problems (e.g., super-resolution and inpainting)
+requires generating a high fidelity image that matches the given input (the
+low-resolution image or the masked image). By using the input image as
+guidance, we can leverage a pretrained diffusion generative model to solve a
+wide range of image inverse tasks without task-specific model fine-tuning. To
+precisely estimate the guidance score function of the input image, we propose
+Diffusion Policy Gradient (DPG), a tractable computation method by viewing the
+intermediate noisy images as policies and the target image as the states
+selected by the policy. Experiments show that our method is robust to both
+Gaussian and Poisson noise degradation on multiple linear and non-linear
+inverse tasks, resulting in higher image restoration quality on the FFHQ,
+ImageNet and LSUN datasets.
+
+
+
+ comment: Accepted and to Appear, AISTATS 2024 +
+
+
+
+
+ + ☆ Robust Shape Fitting for 3D Scene Abstraction + + +
+ Humans perceive and construct the world as an arrangement of simple +parametric models. In particular, we can often describe man-made environments +using volumetric primitives such as cuboids or cylinders. Inferring these +primitives is important for attaining high-level, abstract scene descriptions. +Previous approaches for primitive-based abstraction estimate shape parameters +directly and are only able to reproduce simple objects. In contrast, we propose +a robust estimator for primitive fitting, which meaningfully abstracts complex +real-world environments using cuboids. A RANSAC estimator guided by a neural +network fits these primitives to a depth map. We condition the network on +previously detected parts of the scene, parsing it one-by-one. To obtain +cuboids from single RGB images, we additionally optimise a depth estimation CNN +end-to-end. Naively minimising point-to-primitive distances leads to large or +spurious cuboids occluding parts of the scene. We thus propose an improved +occlusion-aware distance metric correctly handling opaque scenes. Furthermore, +we present a neural network based cuboid solver which provides more +parsimonious scene abstractions while also reducing inference time. The +proposed algorithm does not require labour-intensive labels, such as cuboid +annotations, for training. Results on the NYU Depth v2 dataset demonstrate that +the proposed algorithm successfully abstracts cluttered real-world 3D scene +layouts. + +
+
+ comment: Accepted for publication in Transactions on Pattern Analysis and + Machine Intelligence (PAMI). arXiv admin note: substantial text overlap with + arXiv:2105.02047 +
+
+
+
+
+ + ☆ Using an LLM to Turn Sign Spottings into Spoken Language Sentences + + +
+ Sign Language Translation (SLT) is a challenging task that aims to generate +spoken language sentences from sign language videos. In this paper, we +introduce a hybrid SLT approach, Spotter+GPT, that utilizes a sign spotter and +a pretrained large language model to improve SLT performance. Our method builds +upon the strengths of both components. The videos are first processed by the +spotter, which is trained on a linguistic sign language dataset, to identify +individual signs. These spotted signs are then passed to the powerful language +model, which transforms them into coherent and contextually appropriate spoken +language sentences. + +
+
+
+
+
+ + ☆ SWAG: Splatting in the Wild images with Appearance-conditioned Gaussians + + +
+ Implicit neural representation methods have shown impressive advancements in
+learning 3D scenes from unstructured in-the-wild photo collections but are
+still limited by the large computational cost of volumetric rendering. More
+recently, 3D Gaussian Splatting emerged as a much faster alternative with
+superior rendering quality and training efficiency, especially for small-scale
+and object-centric scenarios. Nevertheless, this technique suffers from poor
+performance on unstructured in-the-wild data. To tackle this, we extend 3D
+Gaussian Splatting to handle unstructured image collections. We achieve this by
+modeling appearance to capture photometric variations in the rendered images.
+Additionally, we introduce a new mechanism to train transient Gaussians to
+handle the presence of scene occluders in an unsupervised manner. Experiments
+on diverse photo collection scenes and multi-pass acquisition of outdoor
+landmarks show the effectiveness of our method over prior works, achieving
+state-of-the-art results with improved efficiency.
+
+
+
+
+
+
+ + ☆ NeuFlow: Real-time, High-accuracy Optical Flow Estimation on Robots + Using Edge Devices + + +
+ Real-time high-accuracy optical flow estimation is a crucial component in +various applications, including localization and mapping in robotics, object +tracking, and activity recognition in computer vision. While recent +learning-based optical flow methods have achieved high accuracy, they often +come with heavy computation costs. In this paper, we propose a highly efficient +optical flow architecture, called NeuFlow, that addresses both high accuracy +and computational cost concerns. The architecture follows a global-to-local +scheme. Given the features of the input images extracted at different spatial +resolutions, global matching is employed to estimate an initial optical flow on +the 1/16 resolution, capturing large displacement, which is then refined on the +1/8 resolution with lightweight CNN layers for better accuracy. We evaluate our +approach on Jetson Orin Nano and RTX 2080 to demonstrate efficiency +improvements across different computing platforms. We achieve a notable 10x-80x +speedup compared to several state-of-the-art methods, while maintaining +comparable accuracy. Our approach achieves around 30 FPS on edge computing +platforms, which represents a significant breakthrough in deploying complex +computer vision tasks such as SLAM on small robots like drones. The full +training and evaluation code is available at +https://github.com/neufieldrobotics/NeuFlow. + +
+
+
+
+
+ + ☆ Real-Time Image Segmentation via Hybrid Convolutional-Transformer + Architecture Search IROS 2024 + + +
+ Image segmentation is one of the most fundamental problems in computer vision
+and has drawn a lot of attention due to its vast applications in image
+understanding and autonomous driving. However, designing effective and
+efficient segmentation neural architectures is a labor-intensive process that
+may require lots of trials by human experts. In this paper, we address the
+challenge of integrating multi-head self-attention into high resolution
+representation CNNs efficiently, by leveraging architecture search. Manually
+replacing convolution layers with multi-head self-attention is non-trivial due
+to the costly overhead in memory to maintain high resolution. By contrast, we
+develop a multi-target multi-branch supernet method, which not only fully
+utilizes the advantages of high-resolution features, but also finds the proper
+location for placing multi-head self-attention modules. Our search algorithm is
+optimized towards multiple objectives (e.g., latency and mIoU) and is capable
+of finding architectures on the Pareto frontier with an arbitrary number of
+branches in a single search. We further present a series of models via the
+Hybrid Convolutional-Transformer Architecture Search (HyCTAS) method that
+searches for the best hybrid combination of light-weight convolution layers and
+memory-efficient self-attention layers between branches from different
+resolutions and fuses them to high resolution for both efficiency and
+effectiveness. Extensive experiments demonstrate that HyCTAS outperforms
+previous methods on the semantic segmentation task. Code and models are
+available at \url{https://github.com/MarvinYu1995/HyCTAS}.
+
+
+
+ comment: 8 pages, 3 figures, submitted to IROS 2024 +
+
+
+
+
+ + ☆ A comparative study on machine learning approaches for rock mass + classification using drilling data + + +
+ Current rock engineering design in drill and blast tunnelling primarily
+relies on engineers' observational assessments. Measure While Drilling (MWD)
+data, a high-resolution sensor dataset collected during tunnel excavation, is
+underutilised, mainly serving for geological visualisation. This study aims to
+automate the translation of MWD data into actionable metrics for rock
+engineering. It seeks to link data to specific engineering actions, thus
+providing critical decision support for geological challenges ahead of the
+tunnel face. Leveraging a large and geologically diverse dataset of 500,000
+drillholes from 15 tunnels, the research introduces models for accurate rock
+mass quality classification in a real-world tunnelling context. Both
+conventional machine learning and image-based deep learning are explored to
+classify MWD data into Q-classes and Q-values, examples of metrics describing
+the stability of the rock mass, using both tabular and image data. The results
+indicate that the K-nearest neighbours algorithm, in an ensemble with
+tree-based models using tabular data, effectively classifies rock mass quality.
+It achieves a cross-validated balanced accuracy of 0.86 in classifying rock
+mass into the Q-classes A, B, C, D, E1, E2, and 0.95 for a binary
+classification of E versus the rest. Classification using a CNN with MWD-images
+for each blasting round resulted in a balanced accuracy of 0.82 for binary
+classification. Regressing the Q-value from tabular MWD-data achieved
+cross-validated R2 and MSE scores of 0.80 and 0.18 with an ensemble model
+similar to the one used for classification. High performance in regression and
+classification boosts confidence in automated rock mass assessment. Applying
+advanced modelling on a unique dataset demonstrates MWD data's value in
+improving rock mass classification accuracy and advancing data-driven rock
+engineering design, reducing manual intervention.
+
+
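As a rough illustration of the tabular pipeline described above, the sketch below builds a KNN plus tree-based soft-voting ensemble and scores it with cross-validated balanced accuracy. The synthetic features, Q-class labels, and hyperparameters are placeholders, not the study's configuration.

```python
# Sketch: KNN + tree-based ensemble for Q-class prediction from tabular MWD features,
# scored with cross-validated balanced accuracy. Data below is a random stand-in.
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
X = rng.random((1000, 8))                          # stand-in for per-drillhole MWD features
y = rng.choice(list("ABCDE"), size=1000)           # stand-in for Q-classes

ensemble = VotingClassifier(
    estimators=[
        ("knn", make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=15))),
        ("rf", RandomForestClassifier(n_estimators=300, random_state=0)),
    ],
    voting="soft",
)
scores = cross_val_score(ensemble, X, y, cv=5, scoring="balanced_accuracy")
print(f"cross-validated balanced accuracy: {scores.mean():.2f}")
```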
+
+
+
+
+ + ☆ Energy Correction Model in the Feature Space for Out-of-Distribution + Detection NeurIPS + + +
+ In this work, we study the out-of-distribution (OOD) detection problem
+through the use of the feature space of a pre-trained deep classifier. We show
+that learning the density of in-distribution (ID) features with an energy-based
+model (EBM) leads to competitive detection results. However, we found that the
+non-mixing of MCMC sampling during the EBM's training undermines its detection
+performance. To overcome this, we propose an energy-based correction of a
+mixture of class-conditional Gaussian distributions. We obtain favorable
+results when compared to a strong baseline like the KNN detector on the
+CIFAR-10/CIFAR-100 OOD detection benchmarks.
+
+
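For context, the snippet below sketches the class-conditional Gaussian scoring that such an energy-based correction would refine: one Gaussian per class with a tied covariance, scored by the negative maximum class log-likelihood. The tied-covariance choice, feature shapes, and random stand-in data are assumptions, and the energy-based correction itself is not reproduced.

```python
# Sketch: OOD scoring with a mixture of class-conditional Gaussians fitted to
# frozen in-distribution features. Higher score -> more likely out-of-distribution.
import numpy as np
from scipy.stats import multivariate_normal

def fit_class_gaussians(features, labels):
    """Fit one Gaussian per class with a shared (tied) covariance matrix."""
    classes = np.unique(labels)
    means = {c: features[labels == c].mean(axis=0) for c in classes}
    centered = np.concatenate([features[labels == c] - means[c] for c in classes])
    cov = np.cov(centered, rowvar=False) + 1e-4 * np.eye(features.shape[1])
    return means, cov

def ood_score(x, means, cov):
    """Negative max class log-likelihood under the fitted Gaussians."""
    logps = [multivariate_normal.logpdf(x, mean=m, cov=cov) for m in means.values()]
    return -np.max(logps, axis=0)

rng = np.random.default_rng(0)
feats = rng.normal(size=(500, 16))                 # stand-in for penultimate features
labels = rng.integers(0, 10, size=500)
means, cov = fit_class_gaussians(feats, labels)
print(ood_score(rng.normal(size=(5, 16)), means, cov))
```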
+
+ comment: NeurIPS ML Safety Workshop (2022) +
+
+
+
+
+ + ☆ Isotropic3D: Image-to-3D Generation Based on a Single CLIP Embedding + + +
+ Encouraged by the growing availability of pre-trained 2D diffusion models, +image-to-3D generation by leveraging Score Distillation Sampling (SDS) is +making remarkable progress. Most existing methods combine novel-view lifting +from 2D diffusion models which usually take the reference image as a condition +while applying hard L2 image supervision at the reference view. Yet heavily +adhering to the image is prone to corrupting the inductive knowledge of the 2D +diffusion model leading to flat or distorted 3D generation frequently. In this +work, we reexamine image-to-3D in a novel perspective and present Isotropic3D, +an image-to-3D generation pipeline that takes only an image CLIP embedding as +input. Isotropic3D allows the optimization to be isotropic w.r.t. the azimuth +angle by solely resting on the SDS loss. The core of our framework lies in a +two-stage diffusion model fine-tuning. Firstly, we fine-tune a text-to-3D +diffusion model by substituting its text encoder with an image encoder, by +which the model preliminarily acquires image-to-image capabilities. Secondly, +we perform fine-tuning using our Explicit Multi-view Attention (EMA) which +combines noisy multi-view images with the noise-free reference image as an +explicit condition. CLIP embedding is sent to the diffusion model throughout +the whole process while reference images are discarded once after fine-tuning. +As a result, with a single image CLIP embedding, Isotropic3D is capable of +generating multi-view mutually consistent images and also a 3D model with more +symmetrical and neat content, well-proportioned geometry, rich colored texture, +and less distortion compared with existing image-to-3D methods while still +preserving the similarity to the reference image to a large extent. The project +page is available at https://isotropic3d.github.io/. The code and models are +available at https://github.com/pkunliu/Isotropic3D. + +
+
+ comment: Project page: https://isotropic3d.github.io/ Source code: + https://github.com/pkunliu/Isotropic3D +
+
+
+
+
+ + ☆ CDMAD: Class-Distribution-Mismatch-Aware Debiasing for Class-Imbalanced + Semi-Supervised Learning CVPR 2024 + + +
+ Pseudo-label-based semi-supervised learning (SSL) algorithms trained on a +class-imbalanced set face two cascading challenges: 1) Classifiers tend to be +biased towards majority classes, and 2) Biased pseudo-labels are used for +training. It is difficult to appropriately re-balance the classifiers in SSL +because the class distribution of an unlabeled set is often unknown and could +be mismatched with that of a labeled set. We propose a novel class-imbalanced +SSL algorithm called class-distribution-mismatch-aware debiasing (CDMAD). For +each iteration of training, CDMAD first assesses the classifier's biased degree +towards each class by calculating the logits on an image without any patterns +(e.g., solid color image), which can be considered irrelevant to the training +set. CDMAD then refines biased pseudo-labels of the base SSL algorithm by +ensuring the classifier's neutrality. CDMAD uses these refined pseudo-labels +during the training of the base SSL algorithm to improve the quality of the +representations. In the test phase, CDMAD similarly refines biased class +predictions on test samples. CDMAD can be seen as an extension of post-hoc +logit adjustment to address a challenge of incorporating the unknown class +distribution of the unlabeled set for re-balancing the biased classifier under +class distribution mismatch. CDMAD ensures Fisher consistency for the balanced +error. Extensive experiments verify the effectiveness of CDMAD. + +
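A minimal sketch of the debiasing step described above: query the classifier with a pattern-free, solid-color image and subtract its logits from the pseudo-label logits. The solid-gray value, the input size, and the surrounding base-SSL training loop are assumptions.

```python
# Sketch of CDMAD-style debiasing: measure classifier bias on a patternless input
# and remove it from pseudo-label logits before thresholding/assignment.
import torch
import torch.nn.functional as F

@torch.no_grad()
def debiased_pseudo_labels(model, unlabeled_images, image_size=(3, 32, 32)):
    solid = torch.full((1, *image_size), 0.5, device=unlabeled_images.device)
    bias_logits = model(solid)                 # classifier response to an image with no pattern
    logits = model(unlabeled_images)
    refined = logits - bias_logits             # subtract the estimated class bias
    return F.softmax(refined, dim=-1).argmax(dim=-1)
```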
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Evaluating Perceptual Distances by Fitting Binomial Distributions to + Two-Alternative Forced Choice Data + + +
+ The two-alternative forced choice (2AFC) experimental setup is popular in the +visual perception literature, where practitioners aim to understand how human +observers perceive distances within triplets that consist of a reference image +and two distorted versions of that image. In the past, this had been conducted +in controlled environments, with a tournament-style algorithm dictating which +images are shown to each participant to rank the distorted images. Recently, +crowd-sourced perceptual datasets have emerged, with no images shared between +triplets, making ranking impossible. Evaluating perceptual distances using this +data is non-trivial, relying on reducing the collection of judgements on a +triplet to a binary decision -- which is suboptimal and prone to misleading +conclusions. Instead, we statistically model the underlying decision-making +process during 2AFC experiments using a binomial distribution. We use maximum +likelihood estimation to fit a distribution to the perceptual judgements, +conditioned on the perceptual distance to test and impose consistency and +smoothness between our empirical estimates of the density. This way, we can +evaluate a different number of judgements per triplet, and can calculate +metrics such as likelihoods of judgements according to a set of distances -- +key ingredients that neural network counterparts lack. + +
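A toy sketch of the binomial modelling idea: per-triplet choice counts are fit by maximum likelihood, with the choice probability modelled as a function of the difference in perceptual distances. The logistic link and the example counts are assumptions, not the paper's exact parameterization.

```python
# Sketch: maximum-likelihood fit of a binomial choice model for 2AFC judgements.
# p(choose first distortion) is a logistic function of the distance difference.
import numpy as np
from scipy.optimize import minimize
from scipy.special import expit

# Per triplet: d(ref, x1) - d(ref, x0), k choices of x0 out of n judgements (toy values).
d_diff = np.array([-0.8, -0.2, 0.1, 0.5, 1.2])
k = np.array([2, 6, 9, 13, 18])
n = np.array([20, 20, 20, 20, 20])

def neg_log_likelihood(params):
    a, b = params
    p = np.clip(expit(a * d_diff + b), 1e-6, 1 - 1e-6)
    return -np.sum(k * np.log(p) + (n - k) * np.log(1 - p))

result = minimize(neg_log_likelihood, x0=[1.0, 0.0])
print("fitted slope/intercept:", result.x)
```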
+
+
+
+
+ + ☆ EXAMS-V: A Multi-Discipline Multilingual Multimodal Exam Benchmark for + Evaluating Vision Language Models + + +
+ We introduce EXAMS-V, a new challenging multi-discipline multimodal +multilingual exam benchmark for evaluating vision language models. It consists +of 20,932 multiple-choice questions across 20 school disciplines covering +natural science, social science, and other miscellaneous studies, e.g., +religion, fine arts, business, etc. EXAMS-V includes a variety of multimodal +features such as text, images, tables, figures, diagrams, maps, scientific +symbols, and equations. The questions come in 11 languages from 7 language +families. Unlike existing benchmarks, EXAMS-V is uniquely curated by gathering +school exam questions from various countries, with a variety of education +systems. This distinctive approach calls for intricate reasoning across diverse +languages and relies on region-specific knowledge. Solving the problems in the +dataset requires advanced perception and joint reasoning over the text and the +visual content of the image. Our evaluation results demonstrate that this is a +challenging dataset, which is difficult even for advanced vision-text models +such as GPT-4V and Gemini; this underscores the inherent complexity of the +dataset and its significance as a future benchmark. + +
+
+
+
+
+ + ☆ PASTA: Towards Flexible and Efficient HDR Imaging Via Progressively + Aggregated Spatio-Temporal Alignment + + +
+ Leveraging Transformer attention has led to great advancements in HDR
+deghosting. However, the intricate nature of self-attention introduces
+practical challenges, as existing state-of-the-art methods often demand
+high-end GPUs or exhibit slow inference speeds, especially for high-resolution
+images like 2K. Striking an optimal balance between performance and latency
+remains a critical concern. In response, this work presents PASTA, a novel
+Progressively Aggregated Spatio-Temporal Alignment framework for HDR
+deghosting. Our approach achieves effectiveness and efficiency by harnessing
+hierarchical representation during feature disentanglement. Through the
+utilization of diverse granularities within the hierarchical structure, our
+method substantially boosts computational speed and optimizes the HDR imaging
+workflow. In addition, we explore within-scale feature modeling with local and
+global attention, gradually merging and refining them in a coarse-to-fine
+fashion. Experimental results showcase PASTA's superiority over current SOTA
+methods in both visual quality and performance metrics, accompanied by a
+substantial 3-fold (x3) increase in inference speed.
+
+
+
+
+
+
+ + ☆ Overcoming Distribution Shifts in Plug-and-Play Methods with Test-Time + Training + + +
+ Plug-and-Play Priors (PnP) is a well-known class of methods for solving +inverse problems in computational imaging. PnP methods combine physical forward +models with learned prior models specified as image denoisers. A common issue +with the learned models is that of a performance drop when there is a +distribution shift between the training and testing data. Test-time training +(TTT) was recently proposed as a general strategy for improving the performance +of learned models when training and testing data come from different +distributions. In this paper, we propose PnP-TTT as a new method for overcoming +distribution shifts in PnP. PnP-TTT uses deep equilibrium learning (DEQ) for +optimizing a self-supervised loss at the fixed points of PnP iterations. +PnP-TTT can be directly applied on a single test sample to improve the +generalization of PnP. We show through simulations that given a sufficient +number of measurements, PnP-TTT enables the use of image priors trained on +natural images for image reconstruction in magnetic resonance imaging (MRI). + +
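A simplified sketch of the setup described above: a plug-and-play loop with a learned denoiser is run on a single test measurement, and the denoiser is updated with a self-supervised measurement-consistency loss. The paper differentiates through the PnP fixed point with deep equilibrium learning and targets MRI; here the loop is simply unrolled, the forward operator is a toy blur, the denoiser is a random stand-in, and the loss choice is an assumption.

```python
# Simplified sketch of test-time training for PnP on one measurement.
import torch
import torch.nn as nn
import torch.nn.functional as F

denoiser = nn.Sequential(                     # stand-in for a pretrained image denoiser
    nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(), nn.Conv2d(16, 1, 3, padding=1)
)

def forward_op(x):
    """Toy measurement operator: a fixed 5x5 box blur (symmetric, so A^T = A)."""
    kernel = torch.ones(1, 1, 5, 5) / 25.0
    return F.conv2d(x, kernel, padding=2)

def pnp_iterations(y, steps=10, gamma=1.0):
    x = y.clone()
    for _ in range(steps):
        grad = forward_op(forward_op(x) - y)  # gradient of 0.5 * ||A x - y||^2
        x = denoiser(x - gamma * grad)        # denoiser acts as the learned prior
    return x

y = forward_op(torch.rand(1, 1, 64, 64))      # single test measurement
opt = torch.optim.Adam(denoiser.parameters(), lr=1e-5)
for _ in range(5):                            # test-time training on this sample only
    x_hat = pnp_iterations(y)
    loss = F.mse_loss(forward_op(x_hat), y)   # self-supervised consistency loss
    opt.zero_grad()
    loss.backward()
    opt.step()
```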
+
+
+
+
+ + ☆ Open Stamped Parts Dataset + + +
+ We present the Open Stamped Parts Dataset (OSPD), featuring synthetic and
+real images of stamped metal sheets for auto manufacturing. The real part
+images, captured from 7 cameras, consist of 7,980 unlabeled images and 1,680
+labeled images. In addition, we have compiled a defect dataset by overlaying
+synthetically generated masks on 10% of the holes. The synthetic dataset
+replicates the real manufacturing environment in terms of lighting and part
+placement relative to the cameras. The synthetic data includes 7,980 training
+images, 1,680 validation images and 1,680 test images, each with bounding box
+and segmentation mask annotations around all holes. 10% of the holes in the
+synthetic data mimic defects generated in the real image dataset. We trained a
+hole-detection model on the synthetic-OSPD, achieving a modified recall score
+of 67.2% and a precision of 94.4%. We anticipate researchers in the auto
+manufacturing and broader machine learning and computer vision communities
+using OSPD to advance the state of the art in defect detection of stamped holes
+in the sheet metal stamping process. The dataset is available for download at:
+https://tinyurl.com/hm6xatd7
+
+
+
+ comment: 6 pages, 7 figures, 2 tables +
+
+
+
+
+ + ☆ Testing MediaPipe Holistic for Linguistic Analysis of Nonmanual Markers + in Sign Languages + + +
+ Advances in Deep Learning have made possible reliable landmark tracking of +human bodies and faces that can be used for a variety of tasks. We test a +recent Computer Vision solution, MediaPipe Holistic (MPH), to find out if its +tracking of the facial features is reliable enough for a linguistic analysis of +data from sign languages, and compare it to an older solution (OpenFace, OF). +We use an existing data set of sentences in Kazakh-Russian Sign Language and a +newly created small data set of videos with head tilts and eyebrow movements. +We find that MPH does not perform well enough for linguistic analysis of +eyebrow movement -- but in a different way from OF, which is also performing +poorly without correction. We reiterate a previous proposal to train additional +correction models to overcome these limitations. + +
+
+
+
+
+ + ☆ CPGA: Coding Priors-Guided Aggregation Network for Compressed Video + Quality Enhancement + + +
+ Recently, numerous approaches have achieved notable success in compressed +video quality enhancement (VQE). However, these methods usually ignore the +utilization of valuable coding priors inherently embedded in compressed videos, +such as motion vectors and residual frames, which carry abundant temporal and +spatial information. To remedy this problem, we propose the Coding +Priors-Guided Aggregation (CPGA) network to utilize temporal and spatial +information from coding priors. The CPGA mainly consists of an inter-frame +temporal aggregation (ITA) module and a multi-scale non-local aggregation (MNA) +module. Specifically, the ITA module aggregates temporal information from +consecutive frames and coding priors, while the MNA module globally captures +spatial information guided by residual frames. In addition, to facilitate +research in VQE task, we newly construct the Video Coding Priors (VCP) dataset, +comprising 300 videos with various coding priors extracted from corresponding +bitstreams. It remedies the shortage of previous datasets on the lack of coding +information. Experimental results demonstrate the superiority of our method +compared to existing state-of-the-art methods. The code and dataset will be +released at https://github.com/CPGA/CPGA.git. + +
+
+
+
+
+ + ☆ ANIM: Accurate Neural Implicit Model for Human Reconstruction from a + single RGB-D image + + +
+ Recent progress in human shape learning shows that neural implicit models
+are effective in generating 3D human surfaces from a limited number of views,
+and even from a single RGB image. However, existing monocular approaches still
+struggle to recover fine geometric details such as face, hands or cloth
+wrinkles. They are also easily prone to depth ambiguities that result in
+distorted geometries along the camera optical axis. In this paper, we explore
+the benefits of incorporating depth observations in the reconstruction process
+by introducing ANIM, a novel method that reconstructs arbitrary 3D human shapes
+from single-view RGB-D images with an unprecedented level of accuracy. Our
+model learns geometric details from both multi-resolution pixel-aligned and
+voxel-aligned features to leverage depth information and enable spatial
+relationships, mitigating depth ambiguities. We further enhance the quality of
+the reconstructed shape by introducing a depth-supervision strategy, which
+improves the accuracy of the signed distance field estimation of points that
+lie on the reconstructed surface. Experiments demonstrate that ANIM outperforms
+state-of-the-art works that use RGB, surface normals, point cloud or RGB-D data
+as input. In addition, we introduce ANIM-Real, a new multi-modal dataset
+comprising high-quality scans paired with consumer-grade RGB-D camera data, and
+our protocol to fine-tune ANIM, enabling high-quality reconstruction from
+real-world human capture.
+
+
+
+
+
+
+ + ☆ SimPB: A Single Model for 2D and 3D Object Detection from Multiple + Cameras + + +
+ The field of autonomous driving has attracted considerable interest in +approaches that directly infer 3D objects in the Bird's Eye View (BEV) from +multiple cameras. Some attempts have also explored utilizing 2D detectors from +single images to enhance the performance of 3D detection. However, these +approaches rely on a two-stage process with separate detectors, where the 2D +detection results are utilized only once for token selection or query +initialization. In this paper, we present a single model termed SimPB, which +simultaneously detects 2D objects in the perspective view and 3D objects in the +BEV space from multiple cameras. To achieve this, we introduce a hybrid decoder +consisting of several multi-view 2D decoder layers and several 3D decoder +layers, specifically designed for their respective detection tasks. A Dynamic +Query Allocation module and an Adaptive Query Aggregation module are proposed +to continuously update and refine the interaction between 2D and 3D results, in +a cyclic 3D-2D-3D manner. Additionally, Query-group Attention is utilized to +strengthen the interaction among 2D queries within each camera group. In the +experiments, we evaluate our method on the nuScenes dataset and demonstrate +promising results for both 2D and 3D detection tasks. Our code is available at: +https://github.com/nullmax-vision/SimPB. + +
+
+
+
+
+ + ☆ ParaPoint: Learning Global Free-Boundary Surface Parameterization of 3D + Point Clouds + + +
+ Surface parameterization is a fundamental geometry processing problem with +rich downstream applications. Traditional approaches are designed to operate on +well-behaved mesh models with high-quality triangulations that are laboriously +produced by specialized 3D modelers, and thus unable to meet the processing +demand for the current explosion of ordinary 3D data. In this paper, we seek to +perform UV unwrapping on unstructured 3D point clouds. Technically, we propose +ParaPoint, an unsupervised neural learning pipeline for achieving global +free-boundary surface parameterization by building point-wise mappings between +given 3D points and 2D UV coordinates with adaptively deformed boundaries. We +ingeniously construct several geometrically meaningful sub-networks with +specific functionalities, and assemble them into a bi-directional cycle mapping +framework. We also design effective loss functions and auxiliary differential +geometric constraints for the optimization of the neural mapping process. To +the best of our knowledge, this work makes the first attempt to investigate +neural point cloud parameterization that pursues both global mappings and free +boundaries. Experiments demonstrate the effectiveness and inspiring potential +of our proposed learning paradigm. The code will be publicly available. + +
+
+
+
+
+ + ☆ Denoising Task Difficulty-based Curriculum for Training Diffusion Models + + +
+ Diffusion-based generative models have emerged as powerful tools in the realm +of generative modeling. Despite extensive research on denoising across various +timesteps and noise levels, a conflict persists regarding the relative +difficulties of the denoising tasks. While various studies argue that lower +timesteps present more challenging tasks, others contend that higher timesteps +are more difficult. To address this conflict, our study undertakes a +comprehensive examination of task difficulties, focusing on convergence +behavior and changes in relative entropy between consecutive probability +distributions across timesteps. Our observational study reveals that denoising +at earlier timesteps poses challenges characterized by slower convergence and +higher relative entropy, indicating increased task difficulty at these lower +timesteps. Building on these observations, we introduce an easy-to-hard +learning scheme, drawing from curriculum learning, to enhance the training +process of diffusion models. By organizing timesteps or noise levels into +clusters and training models with descending orders of difficulty, we +facilitate an order-aware training regime, progressing from easier to harder +denoising tasks, thereby deviating from the conventional approach of training +diffusion models simultaneously across all timesteps. Our approach leads to +improved performance and faster convergence by leveraging the benefits of +curriculum learning, while maintaining orthogonality with existing improvements +in diffusion training techniques. We validate these advantages through +comprehensive experiments in image generation tasks, including unconditional, +class-conditional, and text-to-image generation. + +
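A minimal sketch of the curriculum idea: timesteps are grouped into clusters, and training starts from the higher-timestep (easier, per the observation above) clusters before progressively unlocking lower ones. The cluster count and pacing function are assumptions, and the surrounding diffusion training loop is only indicated in comments.

```python
# Sketch: an easy-to-hard timestep curriculum for diffusion training.
import torch

T = 1000
num_clusters = 10
clusters = torch.arange(T).chunk(num_clusters)     # cluster 0 = lowest (hardest) timesteps

def sample_timesteps(batch_size, progress):
    """progress in [0, 1]: start with the easiest (highest-t) cluster, then grow."""
    unlocked = max(1, int(progress * num_clusters))
    allowed = torch.cat(clusters[num_clusters - unlocked:])   # highest-t clusters first
    idx = torch.randint(0, allowed.numel(), (batch_size,))
    return allowed[idx]

# Inside a standard diffusion training loop (model, noising, loss omitted):
# t = sample_timesteps(x0.size(0), progress=step / total_steps)
# loss = mse(model(add_noise(x0, t), t), noise)
```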
+
+ comment: 22 pages, 8 figures, 5 tables +
+
+
+
+
+ + ☆ End-to-end Adaptive Dynamic Subsampling and Reconstruction for Cardiac + MRI + + +
+ Accelerating dynamic MRI is essential for enhancing clinical applications, +such as adaptive radiotherapy, and improving patient comfort. Traditional deep +learning (DL) approaches for accelerated dynamic MRI reconstruction typically +rely on predefined or random subsampling patterns, applied uniformly across all +temporal phases. This standard practice overlooks the potential benefits of +leveraging temporal correlations and lacks the adaptability required for +case-specific subsampling optimization, which holds the potential for +maximizing reconstruction quality. Addressing this gap, we present a novel +end-to-end framework for adaptive dynamic MRI subsampling and reconstruction. +Our pipeline integrates a DL-based adaptive sampler, generating case-specific +dynamic subsampling patterns, trained end-to-end with a state-of-the-art 2D +dynamic reconstruction network, namely vSHARP, which effectively reconstructs +the adaptive dynamic subsampled data into a moving image. Our method is +assessed using dynamic cine cardiac MRI data, comparing its performance against +vSHARP models that employ common subsampling trajectories, and pipelines +trained to optimize dataset-specific sampling schemes alongside vSHARP +reconstruction. Our results indicate superior reconstruction quality, +particularly at high accelerations. + +
+
+ comment: 14 pages, 5 figures, 8 tables +
+
+
+
+
+ + ☆ SCILLA: SurfaCe Implicit Learning for Large Urban Area, a volumetric + hybrid solution + + +
+ Neural implicit surface representation methods have recently shown impressive +3D reconstruction results. However, existing solutions struggle to reconstruct +urban outdoor scenes due to their large, unbounded, and highly detailed nature. +Hence, to achieve accurate reconstructions, additional supervision data such as +LiDAR, strong geometric priors, and long training times are required. To tackle +such issues, we present SCILLA, a new hybrid implicit surface learning method +to reconstruct large driving scenes from 2D images. SCILLA's hybrid +architecture models two separate implicit fields: one for the volumetric +density and another for the signed distance to the surface. To accurately +represent urban outdoor scenarios, we introduce a novel volume-rendering +strategy that relies on self-supervised probabilistic density estimation to +sample points near the surface and transition progressively from volumetric to +surface representation. Our solution permits a proper and fast initialization +of the signed distance field without relying on any geometric prior on the +scene, compared to concurrent methods. By conducting extensive experiments on +four outdoor driving datasets, we show that SCILLA can learn an accurate and +detailed 3D surface scene representation in various urban scenarios while being +two times faster to train compared to previous state-of-the-art solutions. + +
+
+
+
+
+ + ☆ Thermal-NeRF: Neural Radiance Fields from an Infrared Camera + + +
+ In recent years, Neural Radiance Fields (NeRFs) have demonstrated significant
+potential in encoding highly-detailed 3D geometry and environmental appearance,
+positioning themselves as a promising alternative to traditional explicit
+representation for 3D scene reconstruction. However, the predominant reliance
+on RGB imaging presupposes ideal lighting conditions: a premise frequently
+unmet in robotic applications plagued by poor lighting or visual obstructions.
+This limitation overlooks the capabilities of infrared (IR) cameras, which
+excel in low-light detection and present a robust alternative under such
+adverse scenarios. To tackle these issues, we introduce Thermal-NeRF, the first
+method that estimates a volumetric scene representation in the form of a NeRF
+solely from IR imaging. By leveraging a thermal mapping and structural thermal
+constraint derived from the thermal characteristics of IR imaging, our method
+showcases unparalleled proficiency in recovering NeRFs in visually degraded
+scenes where RGB-based methods fall short. We conduct extensive experiments to
+demonstrate that Thermal-NeRF can achieve superior quality compared to existing
+methods. Furthermore, we contribute a dataset for IR-based NeRF applications,
+paving the way for future research in IR NeRF reconstruction.
+
+
+
+
+
+
+ + ☆ How Powerful Potential of Attention on Image Restoration? + + +
+ Transformers have demonstrated their effectiveness in image restoration +tasks. Existing Transformer architectures typically comprise two essential +components: multi-head self-attention and feed-forward network (FFN). The +former captures long-range pixel dependencies, while the latter enables the +model to learn complex patterns and relationships in the data. Previous studies +have demonstrated that FFNs are key-value memories \cite{geva2020transformer}, +which are vital in modern Transformer architectures. In this paper, we conduct +an empirical study to explore the potential of attention mechanisms without +using FFN and provide novel structures to demonstrate that removing FFN is +flexible for image restoration. Specifically, we propose Continuous Scaling +Attention (\textbf{CSAttn}), a method that computes attention continuously in +three stages without using FFN. To achieve competitive performance, we propose +a series of key components within the attention. Our designs provide a closer +look at the attention mechanism and reveal that some simple operations can +significantly affect the model performance. We apply our \textbf{CSAttn} to +several image restoration tasks and show that our model can outperform +CNN-based and Transformer-based image restoration approaches. + +
+
+
+
+
+ + ☆ NECA: Neural Customizable Human Avatar CVPR 2024 + + +
+ Human avatar has become a novel type of 3D asset with various applications. +Ideally, a human avatar should be fully customizable to accommodate different +settings and environments. In this work, we introduce NECA, an approach capable +of learning versatile human representation from monocular or sparse-view +videos, enabling granular customization across aspects such as pose, shadow, +shape, lighting and texture. The core of our approach is to represent humans in +complementary dual spaces and predict disentangled neural fields of geometry, +albedo, shadow, as well as an external lighting, from which we are able to +derive realistic rendering with high-frequency details via volumetric +rendering. Extensive experiments demonstrate the advantage of our method over +the state-of-the-art methods in photorealistic rendering, as well as various +editing tasks such as novel pose synthesis and relighting. The code is +available at https://github.com/iSEE-Laboratory/NECA. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Uni-SMART: Universal Science Multimodal Analysis and Research + Transformer + + +
+ In scientific research and its application, scientific literature analysis is +crucial as it allows researchers to build on the work of others. However, the +fast growth of scientific knowledge has led to a massive increase in scholarly +articles, making in-depth literature analysis increasingly challenging and +time-consuming. The emergence of Large Language Models (LLMs) has offered a new +way to address this challenge. Known for their strong abilities in summarizing +texts, LLMs are seen as a potential tool to improve the analysis of scientific +literature. However, existing LLMs have their own limits. Scientific literature +often includes a wide range of multimodal elements, such as molecular +structure, tables, and charts, which are hard for text-focused LLMs to +understand and analyze. This issue points to the urgent need for new solutions +that can fully understand and analyze multimodal content in scientific +literature. To answer this demand, we present Uni-SMART (Universal Science +Multimodal Analysis and Research Transformer), an innovative model designed for +in-depth understanding of multimodal scientific literature. Through rigorous +quantitative evaluation across several domains, Uni-SMART demonstrates superior +performance over leading text-focused LLMs. Furthermore, our exploration +extends to practical applications, including patent infringement detection and +nuanced analysis of charts. These applications not only highlight Uni-SMART's +adaptability but also its potential to revolutionize how we interact with +scientific literature. + +
+
+
+
+
+ + ☆ Context-Semantic Quality Awareness Network for Fine-Grained Visual + Categorization + + +
+ Exploring and mining subtle yet distinctive features between sub-categories +with similar appearances is crucial for fine-grained visual categorization +(FGVC). However, less effort has been devoted to assessing the quality of +extracted visual representations. Intuitively, the network may struggle to +capture discriminative features from low-quality samples, which leads to a +significant decline in FGVC performance. To tackle this challenge, we propose a +weakly supervised Context-Semantic Quality Awareness Network (CSQA-Net) for +FGVC. In this network, to model the spatial contextual relationship between +rich part descriptors and global semantics for capturing more discriminative +details within the object, we design a novel multi-part and multi-scale +cross-attention (MPMSCA) module. Before feeding to the MPMSCA module, the part +navigator is developed to address the scale confusion problems and accurately +identify the local distinctive regions. Furthermore, we propose a generic +multi-level semantic quality evaluation module (MLSQE) to progressively +supervise and enhance hierarchical semantics from different levels of the +backbone network. Finally, context-aware features from MPMSCA and semantically +enhanced features from MLSQE are fed into the corresponding quality probing +classifiers to evaluate their quality in real-time, thus boosting the +discriminability of feature representations. Comprehensive experiments on four +popular and highly competitive FGVC datasets demonstrate the superiority of the +proposed CSQA-Net in comparison with the state-of-the-art methods. + +
+
+
+
+
+ + ☆ Leveraging Neural Radiance Field in Descriptor Synthesis for Keypoints + Scene Coordinate Regression + + +
+ Classical structure-based visual localization methods offer high accuracy
+but face trade-offs in terms of storage, speed, and privacy. A recent
+innovation, a keypoint scene coordinate regression (KSCR) method named D2S,
+addresses these issues by leveraging graph attention networks to enhance
+keypoint relationships and predict their 3D coordinates using a simple
+multilayer perceptron (MLP). Camera pose is then determined via PnP+RANSAC,
+using established 2D-3D correspondences. While KSCR achieves competitive
+results, rivaling state-of-the-art image-retrieval methods like HLoc across
+multiple benchmarks, its performance is hindered when data samples are limited
+due to the deep learning model's reliance on extensive data. This paper
+proposes a solution to this challenge by introducing a pipeline for keypoint
+descriptor synthesis using Neural Radiance Field (NeRF). By generating novel
+poses and feeding them into a trained NeRF model to create new views, our
+approach enhances the KSCR's generalization capabilities in data-scarce
+environments. The proposed system could significantly improve localization
+accuracy by up to 50\% while requiring only a fraction of the time for data
+synthesis. Furthermore, its modular design allows for the integration of
+multiple NeRFs, offering a versatile and efficient solution for visual
+localization. The implementation is publicly available at:
+https://github.com/ais-lab/DescriptorSynthesis4Feat2Map.
+
+
+
+
+
+
+ + ☆ Deep Learning for Multi-Level Detection and Localization of Myocardial + Scars Based on Regional Strain Validated on Virtual Patients + + +
+ How well the heart is functioning can be quantified through measurements of
+myocardial deformation via echocardiography. Clinical assessment of cardiac
+function is generally focused on global indices of relative shortening;
+however, territorial and segmental strain indices have been shown to be
+abnormal in regions of myocardial disease, such as scar. In this work, we
+propose a single framework to predict myocardial disease substrates at global,
+territorial, and segmental levels using regional myocardial strain traces as
+input to a convolutional neural network (CNN)-based classification algorithm.
+An anatomically meaningful mapping of the input data from the clinically
+standard bullseye representation to a multi-channel 2D image is proposed, to
+formulate the task as an image classification problem, thus enabling the use of
+state-of-the-art neural network configurations. A Fully Convolutional Network
+(FCN) is trained to detect and localize myocardial scar from regional left
+ventricular (LV) strain patterns. Simulated regional strain data from a
+controlled dataset of virtual patients with varying degrees and locations of
+myocardial scar is used for training and validation. The proposed method
+successfully detects and localizes the scars on 98% of the 5490 left ventricle
+(LV) segments of the 305 patients in the test set using strain traces only. Due
+to the sparse existence of scar, only 10% of the LV segments in the virtual
+patient cohort have scar. Taking the imbalance into account, the class balanced
+accuracy is calculated as 95%. The performance is reported on global,
+territorial, and segmental levels. The proposed method proves successful on the
+strain traces of the virtual cohort and offers the potential to solve the
+regional myocardial scar detection problem on the strain traces of the real
+patient cohorts.
+
+
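+ As a rough illustration of the input formatting described above, the sketch
+below arranges segmental strain traces into a multi-channel 2D tensor and feeds
+it to a tiny CNN classifier. The segment counts, channel layout, and network are
+assumptions for illustration, not the paper's FCN.
+
+```python
+import torch
+import torch.nn as nn
+
+# Hypothetical shapes: 6 segments per LV level, 3 levels (channels), 100 time samples.
+SEGMENTS_PER_LEVEL, LEVELS, TIME = 6, 3, 100
+
+def traces_to_image(strain):            # strain: (LEVELS, SEGMENTS_PER_LEVEL, TIME)
+    """Arrange regional strain traces as a multi-channel 2D 'image'
+    (one channel per LV level, rows = segments, columns = time)."""
+    return torch.as_tensor(strain, dtype=torch.float32)
+
+classifier = nn.Sequential(             # tiny CNN standing in for the paper's FCN
+    nn.Conv2d(LEVELS, 16, kernel_size=3, padding=1), nn.ReLU(),
+    nn.AdaptiveAvgPool2d(1), nn.Flatten(),
+    nn.Linear(16, 2),                   # scar vs. no scar (multi-level heads omitted)
+)
+
+x = traces_to_image(torch.randn(LEVELS, SEGMENTS_PER_LEVEL, TIME)).unsqueeze(0)
+print(classifier(x).shape)              # torch.Size([1, 2])
+```
+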
+
+ comment: 11 pages, 9 figures and 1 table. Preliminary results of the method + was presented as poster in IEEE conference International Ultrasonics + Symposium 2022 in Venice, Italy +
+
+
+
+
+ + ☆ Few-Shot Image Classification and Segmentation as Visual Question + Answering Using Vision-Language Models + + +
+ The task of few-shot image classification and segmentation (FS-CS) involves +classifying and segmenting target objects in a query image, given only a few +examples of the target classes. We introduce the Vision-Instructed Segmentation +and Evaluation (VISE) method that transforms the FS-CS problem into the Visual +Question Answering (VQA) problem, utilising Vision-Language Models (VLMs), and +addresses it in a training-free manner. By enabling a VLM to interact with +off-the-shelf vision models as tools, the proposed method is capable of +classifying and segmenting target objects using only image-level labels. +Specifically, chain-of-thought prompting and in-context learning guide the VLM +to answer multiple-choice questions like a human; vision models such as YOLO +and Segment Anything Model (SAM) assist the VLM in completing the task. The +modular framework of the proposed method makes it easily extendable. Our +approach achieves state-of-the-art performance on the Pascal-5i and COCO-20i +datasets. + +
+
+
+
+
+ + ☆ Local positional graphs and attentive local features for a data and + runtime-efficient hierarchical place recognition pipeline + + +
+ Large-scale applications of Visual Place Recognition (VPR) require +computationally efficient approaches. Further, a well-balanced combination of +data-based and training-free approaches can decrease the required amount of +training data and effort and can reduce the influence of distribution shifts +between the training and application phases. This paper proposes a runtime and +data-efficient hierarchical VPR pipeline that extends existing approaches and +presents novel ideas. There are three main contributions: First, we propose +Local Positional Graphs (LPG), a training-free and runtime-efficient approach +to encode spatial context information of local image features. LPG can be +combined with existing local feature detectors and descriptors and considerably +improves the image-matching quality compared to existing techniques in our +experiments. Second, we present Attentive Local SPED (ATLAS), an extension of +our previous local features approach with an attention module that improves the +feature quality while maintaining high data efficiency. The influence of the +proposed modifications is evaluated in an extensive ablation study. Third, we +present a hierarchical pipeline that exploits hyperdimensional computing to use +the same local features as holistic HDC-descriptors for fast candidate +selection and for candidate reranking. We combine all contributions in a +runtime and data-efficient VPR pipeline that shows benefits over the +state-of-the-art method Patch-NetVLAD on a large collection of standard place +recognition datasets with 15$\%$ better performance in VPR accuracy, 54$\times$ +faster feature comparison speed, and 55$\times$ less descriptor storage +occupancy, making our method promising for real-world high-performance +large-scale VPR in changing environments. Code will be made available with +publication of this paper. + +
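+ The idea of reusing local features as a single holistic hyperdimensional
+descriptor can be illustrated with a generic HDC bind-and-bundle sketch. The
+dimensionality, random projection, and position coding below are assumptions for
+illustration, not the paper's exact scheme.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+D = 4096                                   # hypervector dimensionality (assumption)
+
+def to_bipolar_hv(vec, proj):
+    """Project a real-valued local descriptor to a bipolar hypervector."""
+    return np.sign(proj @ vec)
+
+def holistic_descriptor(local_descs, proj, pos_hvs):
+    """Bind each local descriptor with a (quantised) position hypervector,
+    then bundle by summation and binarise -- a generic HDC aggregation."""
+    bound = [to_bipolar_hv(d, proj) * pos_hvs[i % len(pos_hvs)]
+             for i, d in enumerate(local_descs)]
+    return np.sign(np.sum(bound, axis=0))
+
+proj = rng.standard_normal((D, 256))       # random projection for 256-dim local features
+pos_hvs = rng.choice([-1.0, 1.0], size=(16, D))
+
+img_a = [rng.standard_normal(256) for _ in range(50)]
+img_b = [rng.standard_normal(256) for _ in range(50)]
+hv_a, hv_b = (holistic_descriptor(x, proj, pos_hvs) for x in (img_a, img_b))
+similarity = float(hv_a @ hv_b) / D        # cosine-like similarity in [-1, 1]
+print(similarity)
+```
+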
+
+ comment: IEEE Robotics and Automation Letters (RA-L) +
+
+
+
+
+ + ☆ Towards Generalizable Deepfake Video Detection with Thumbnail Layout and + Graph Reasoning + + +
+ The deepfake threats to society and cybersecurity have provoked significant
+public apprehension, driving intensified efforts within the realm of deepfake
+video detection. Current video-level methods are mostly based on 3D CNNs, which
+achieve good performance but result in high computational demands. This paper
+introduces an elegantly simple yet effective strategy named Thumbnail Layout
+(TALL), which transforms a video clip into a pre-defined layout to realize the
+preservation of spatial and temporal dependencies. This transformation process
+involves sequentially masking frames at the same positions within each frame.
+These frames are then resized into sub-frames and reorganized into the
+predetermined layout, forming thumbnails. TALL is model-agnostic and has
+remarkable simplicity, necessitating only minimal code modifications.
+Furthermore, we introduce a graph reasoning block (GRB) and semantic
+consistency (SC) loss to strengthen TALL, culminating in TALL++. GRB enhances
+interactions between different semantic regions to capture semantic-level
+inconsistency clues. The semantic consistency loss imposes consistency
+constraints on semantic features to improve model generalization ability.
+Extensive experiments on intra-dataset, cross-dataset, diffusion-generated
+image detection, and deepfake generation method recognition show that TALL++
+achieves results surpassing or comparable to the state-of-the-art methods,
+demonstrating the effectiveness of our approaches for various deepfake
+detection problems. The code is available at
+https://github.com/rainy-xu/TALL4Deepfake.
+
+
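+ The layout transformation itself is simple to picture: the sketch below resizes
+the frames of a short clip and tiles them into a single thumbnail image. The grid
+size and sub-frame resolution are assumptions, and the masking step is omitted.
+
+```python
+import torch
+
+def thumbnail_layout(clip, grid=(2, 2), sub_size=112):
+    """Resize T frames and tile them into a single grid image (thumbnail).
+    clip: (T, C, H, W) with T == grid[0] * grid[1]."""
+    t, c, _, _ = clip.shape
+    rows, cols = grid
+    assert t == rows * cols
+    small = torch.nn.functional.interpolate(clip, size=(sub_size, sub_size),
+                                            mode="bilinear", align_corners=False)
+    small = small.reshape(rows, cols, c, sub_size, sub_size)
+    # (rows, cols, C, h, w) -> (C, rows*h, cols*w)
+    return small.permute(2, 0, 3, 1, 4).reshape(c, rows * sub_size, cols * sub_size)
+
+thumb = thumbnail_layout(torch.rand(4, 3, 224, 224))
+print(thumb.shape)    # torch.Size([3, 224, 224])
+```
+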
+
+ comment: Accepted by IJCV +
+
+
+
+
+ + ☆ Arbitrary-Scale Image Generation and Upsampling using Latent Diffusion + Model and Implicit Neural Decoder CVPR 2024 + + +
+ Super-resolution (SR) and image generation are important tasks in computer
+vision and are widely adopted in real-world applications. Most existing
+methods, however, generate images only at fixed-scale magnification and suffer
+from over-smoothing and artifacts. Additionally, they do not offer enough
+diversity of output images nor image consistency at different scales. The most
+relevant prior work applied Implicit Neural Representation (INR) to the
+denoising diffusion model to obtain continuous-resolution yet diverse and
+high-quality SR results. Since this model operates in the image space, the
+larger the resolution of the produced image, the more memory and inference time
+are required, and it also does not maintain scale-specific consistency. We
+propose a novel pipeline that can super-resolve an input image or generate from
+random noise a novel image at arbitrary scales. The method consists of a
+pretrained auto-encoder, a latent diffusion model, and an implicit neural
+decoder, and their learning strategies. The proposed method adopts diffusion
+processes in a latent space, and is thus efficient while remaining aligned with
+the output image space decoded by MLPs at arbitrary scales. More specifically,
+our arbitrary-scale decoder is composed of the symmetric decoder without
+up-scaling from the pretrained auto-encoder and a Local Implicit Image Function
+(LIIF) in series. The latent diffusion process is learnt by the denoising and
+the alignment losses jointly. Errors in output images are backpropagated via
+the fixed decoder, improving the quality of output images. In extensive
+experiments using multiple public benchmarks on the two tasks, i.e., image
+super-resolution and novel image generation at arbitrary scales, the proposed
+method outperforms relevant methods in metrics of image quality, diversity and
+scale consistency. It is significantly better than the relevant prior art in
+inference speed and memory usage.
+
+
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Magic Tokens: Select Diverse Tokens for Multi-modal Object + Re-Identification CVPR2024 + + +
+ Single-modal object re-identification (ReID) faces great challenges in
+maintaining robustness within complex visual scenarios. In contrast,
+multi-modal object ReID utilizes complementary information from diverse
+modalities, showing great potential for practical applications. However,
+previous methods may be easily affected by irrelevant backgrounds and usually
+ignore the modality gaps. To address the above issues, we propose a novel
+learning framework named \textbf{EDITOR} to select diverse tokens from vision
+Transformers for multi-modal object ReID. We begin with a shared vision
+Transformer to extract tokenized features from different input modalities.
+Then, we introduce a Spatial-Frequency Token Selection (SFTS) module to
+adaptively select object-centric tokens with both spatial and frequency
+information. Afterwards, we employ a Hierarchical Masked Aggregation (HMA)
+module to facilitate feature interactions within and across modalities.
+Finally, to further reduce the effect of backgrounds, we propose a Background
+Consistency Constraint (BCC) and an Object-Centric Feature Refinement (OCFR).
+They are formulated as two new loss functions, which improve the feature
+discrimination with background suppression. As a result, our framework can
+generate more discriminative features for multi-modal object ReID. Extensive
+experiments on three multi-modal ReID benchmarks verify the effectiveness of
+our methods. The code is available at https://github.com/924973292/EDITOR.
+
+
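+ A token-selection step of this flavour can be sketched as follows: score each
+patch token by its activation strength plus a local high-frequency residual and
+keep the top-k. The scoring rule and k are assumptions for illustration, not the
+SFTS module itself.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def select_tokens(tokens, grid_hw, k=64):
+    """Keep the top-k tokens ranked by activation strength plus high-frequency content.
+    tokens: (B, N, C) patch tokens lying on an (H, W) grid with H * W == N."""
+    b, n, c = tokens.shape
+    h, w = grid_hw
+    spatial = tokens.norm(dim=-1)                                  # (B, N)
+    fmap = tokens.transpose(1, 2).reshape(b, c, h, w)
+    highpass = fmap - F.avg_pool2d(fmap, 3, stride=1, padding=1)   # local high-frequency residual
+    freq = highpass.norm(dim=1).reshape(b, n)                      # (B, N)
+    score = spatial / spatial.amax(1, keepdim=True) + freq / freq.amax(1, keepdim=True)
+    idx = score.topk(k, dim=1).indices                             # (B, k)
+    return torch.gather(tokens, 1, idx.unsqueeze(-1).expand(-1, -1, c))
+
+selected = select_tokens(torch.randn(2, 14 * 14, 768), grid_hw=(14, 14), k=64)
+print(selected.shape)                                              # torch.Size([2, 64, 768])
+```
+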
+
+ comment: This work is accepted by CVPR2024. More modifications may be + performed +
+
+
+
+
+ + ☆ Region-aware Distribution Contrast: A Novel Approach to Multi-Task + Partially Supervised Learning + + +
+ In this study, we address the intricate challenge of multi-task dense +prediction, encompassing tasks such as semantic segmentation, depth estimation, +and surface normal estimation, particularly when dealing with partially +annotated data (MTPSL). The complexity arises from the absence of complete task +labels for each training image. Given the inter-related nature of these +pixel-wise dense tasks, our focus is on mining and capturing cross-task +relationships. Existing solutions typically rely on learning global image +representations for global cross-task image matching, imposing constraints +that, unfortunately, sacrifice the finer structures within the images. +Attempting local matching as a remedy faces hurdles due to the lack of precise +region supervision, making local alignment a challenging endeavor. The +introduction of Segment Anything Model (SAM) sheds light on addressing local +alignment challenges by providing free and high-quality solutions for region +detection. Leveraging SAM-detected regions, the subsequent challenge lies in +aligning the representations within these regions. Diverging from conventional +methods that directly learn a monolithic image representation, our proposal +involves modeling region-wise representations using Gaussian Distributions. +Aligning these distributions between corresponding regions from different tasks +imparts higher flexibility and capacity to capture intra-region structures, +accommodating a broader range of tasks. This innovative approach significantly +enhances our ability to effectively capture cross-task relationships, resulting +in improved overall performance in partially supervised multi-task dense +prediction scenarios. Extensive experiments conducted on two widely used +benchmarks underscore the superior effectiveness of our proposed method, +showcasing state-of-the-art performance even when compared to fully supervised +methods. + +
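+ The region-wise Gaussian alignment idea can be made concrete with a small
+sketch: model the features inside a SAM-style region as a diagonal Gaussian per
+task and penalise their closed-form 2-Wasserstein distance. The shapes and the
+diagonal-covariance choice are illustrative assumptions, not the paper's exact
+formulation.
+
+```python
+import torch
+
+def region_gaussian(feats, mask):
+    """Mean and diagonal variance of pixel features inside a region mask.
+    feats: (C, H, W); mask: (H, W) boolean."""
+    region = feats[:, mask]                       # (C, N_pixels)
+    return region.mean(dim=1), region.var(dim=1) + 1e-6
+
+def w2_diag_gaussian(mu1, var1, mu2, var2):
+    """Closed-form squared 2-Wasserstein distance between diagonal Gaussians."""
+    return ((mu1 - mu2) ** 2).sum() + ((var1.sqrt() - var2.sqrt()) ** 2).sum()
+
+feat_seg = torch.randn(64, 32, 32)                # features from the segmentation head
+feat_depth = torch.randn(64, 32, 32)              # features from the depth head
+mask = torch.zeros(32, 32, dtype=torch.bool)
+mask[8:20, 8:20] = True                           # a SAM-style detected region
+loss = w2_diag_gaussian(*region_gaussian(feat_seg, mask),
+                        *region_gaussian(feat_depth, mask))
+print(loss.item())
+```
+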
+
+
+
+
+ + ☆ CoLeCLIP: Open-Domain Continual Learning via Joint Task Prompt and + Vocabulary Learning + + +
+ This paper explores the problem of continual learning (CL) of vision-language
+models (VLMs) in open domains, where the models need to perform continual
+updating and inference on a stream of datasets from diverse seen and unseen
+domains with novel classes. Such a capability is crucial for various
+applications in open environments, e.g., AI assistants, autonomous driving
+systems, and robotics. Current CL studies mostly focus on closed-set scenarios
+in a single domain with known classes. Large pre-trained VLMs like CLIP have
+demonstrated superior zero-shot recognition ability, and a number of recent
+studies leverage this ability to mitigate catastrophic forgetting in CL, but
+they focus on closed-set CL in a single domain dataset. Open-domain CL of large
+VLMs is significantly more challenging due to 1) large class correlations and
+domain gaps across the datasets and 2) the forgetting of zero-shot knowledge in
+the pre-trained VLMs in addition to the knowledge learned from the newly
+adapted datasets. In this work, we introduce a novel approach, termed CoLeCLIP,
+that learns an open-domain CL model based on CLIP. It addresses these
+challenges through the joint learning of a set of task prompts and a
+cross-domain class vocabulary. Extensive experiments on 11 domain datasets show
+that CoLeCLIP outperforms state-of-the-art methods for open-domain CL under
+both task- and class-incremental learning settings.
+
+
+
+
+
+
+ + ☆ FDGaussian: Fast Gaussian Splatting from Single Image via + Geometric-aware Diffusion Model + + +
+ Reconstructing detailed 3D objects from single-view images remains a
+challenging task due to the limited information available. In this paper, we
+introduce FDGaussian, a novel two-stage framework for single-image 3D
+reconstruction. Recent methods typically utilize pre-trained 2D diffusion
+models to generate plausible novel views from the input image, yet they
+encounter issues with either multi-view inconsistency or lack of geometric
+fidelity. To overcome these challenges, we propose an orthogonal plane
+decomposition mechanism to extract 3D geometric features from the 2D input,
+enabling the generation of consistent multi-view images. Moreover, we further
+accelerate the state-of-the-art Gaussian Splatting by incorporating epipolar
+attention to fuse images from different viewpoints. We demonstrate that
+FDGaussian generates images with high consistency across different views and
+reconstructs high-quality 3D objects, both qualitatively and quantitatively.
+More examples can be found at our website https://qjfeng.net/FDGaussian/.
+
+
+
+
+
+
+ + ☆ A Fixed-Point Approach to Unified Prompt-Based Counting AAAI 2024 + + +
+ Existing class-agnostic counting models typically rely on a single type of +prompt, e.g., box annotations. This paper aims to establish a comprehensive +prompt-based counting framework capable of generating density maps for +concerned objects indicated by various prompt types, such as box, point, and +text. To achieve this goal, we begin by converting prompts from different +modalities into prompt masks without requiring training. These masks are then +integrated into a class-agnostic counting methodology for predicting density +maps. Furthermore, we introduce a fixed-point inference along with an +associated loss function to improve counting accuracy, all without introducing +new parameters. The effectiveness of this method is substantiated both +theoretically and experimentally. Additionally, a contrastive training scheme +is implemented to mitigate dataset bias inherent in current class-agnostic +counting datasets, a strategy whose effectiveness is confirmed by our ablation +study. Our model excels in prominent class-agnostic datasets and exhibits +superior performance in cross-dataset adaptation tasks. + +
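+ Fixed-point inference of the kind mentioned above boils down to repeatedly
+applying a prediction operator until the density map stops changing. The
+self-contained sketch below uses a toy contractive operator in place of the
+counting model, so it only illustrates the iteration scheme, not the paper's
+loss or architecture.
+
+```python
+import numpy as np
+
+def fixed_point_iterate(f, x0, max_iter=100, tol=1e-6):
+    """Generic fixed-point inference: iterate x <- f(x) until convergence."""
+    x = x0
+    for _ in range(max_iter):
+        x_next = f(x)
+        if np.max(np.abs(x_next - x)) < tol:
+            return x_next
+        x = x_next
+    return x
+
+# Toy contractive map standing in for a density-map refinement operator.
+target = np.random.rand(8, 8)
+refine = lambda d: 0.5 * d + 0.5 * target        # contraction with fixed point = target
+density = fixed_point_iterate(refine, np.zeros((8, 8)))
+print(float(np.abs(density - target).max()))     # ~0: converged to the fixed point
+```
+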
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ HawkEye: Training Video-Text LLMs for Grounding Text in Videos + + +
+ Video-text Large Language Models (video-text LLMs) have shown remarkable +performance in answering questions and holding conversations on simple videos. +However, they perform almost the same as random on grounding text queries in +long and complicated videos, having little ability to understand and reason +about temporal information, which is the most fundamental difference between +videos and images. In this paper, we propose HawkEye, one of the first +video-text LLMs that can perform temporal video grounding in a fully +text-to-text manner. To collect training data that is applicable for temporal +video grounding, we construct InternVid-G, a large-scale video-text corpus with +segment-level captions and negative spans, with which we introduce two new +time-aware training objectives to video-text LLMs. We also propose a +coarse-grained method of representing segments in videos, which is more robust +and easier for LLMs to learn and follow than other alternatives. Extensive +experiments show that HawkEye is better at temporal video grounding and +comparable on other video-text tasks with existing video-text LLMs, which +verifies its superior video-text multi-modal understanding abilities. + +
+
+
+
+
+ + ☆ Exploring Optical Flow Inclusion into nnU-Net Framework for Surgical + Instrument Segmentation + + +
+ Surgical instrument segmentation in laparoscopy is essential for +computer-assisted surgical systems. Despite the Deep Learning progress in +recent years, the dynamic setting of laparoscopic surgery still presents +challenges for precise segmentation. The nnU-Net framework excelled in semantic +segmentation analyzing single frames without temporal information. The +framework's ease of use, including its ability to be automatically configured, +and its low expertise requirements, have made it a popular base framework for +comparisons. Optical flow (OF) is a tool commonly used in video tasks to +estimate motion and represent it in a single frame, containing temporal +information. This work seeks to employ OF maps as an additional input to the +nnU-Net architecture to improve its performance in the surgical instrument +segmentation task, taking advantage of the fact that instruments are the main +moving objects in the surgical field. With this new input, the temporal +component would be indirectly added without modifying the architecture. Using +CholecSeg8k dataset, three different representations of movement were estimated +and used as new inputs, comparing them with a baseline model. Results showed +that the use of OF maps improves the detection of classes with high movement, +even when these are scarce in the dataset. To further improve performance, +future work may focus on implementing other OF-preserving augmentations. + +
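+ Feeding optical flow as extra input channels is straightforward to prototype;
+the sketch below computes OpenCV's Farneback flow between consecutive frames and
+stacks it onto the current frame. The flow algorithm and channel layout are
+assumptions, not necessarily the exact movement representations evaluated in the
+paper.
+
+```python
+import cv2
+import numpy as np
+
+def add_flow_channels(prev_bgr, curr_bgr):
+    """Stack a dense optical-flow map onto the current frame as extra input channels."""
+    prev_gray = cv2.cvtColor(prev_bgr, cv2.COLOR_BGR2GRAY)
+    curr_gray = cv2.cvtColor(curr_bgr, cv2.COLOR_BGR2GRAY)
+    flow = cv2.calcOpticalFlowFarneback(prev_gray, curr_gray, None,
+                                        0.5, 3, 15, 3, 5, 1.2, 0)      # (H, W, 2)
+    return np.concatenate([curr_bgr.astype(np.float32), flow], axis=2)  # (H, W, 5)
+
+prev = np.random.randint(0, 255, (480, 854, 3), dtype=np.uint8)
+curr = np.random.randint(0, 255, (480, 854, 3), dtype=np.uint8)
+print(add_flow_channels(prev, curr).shape)   # (480, 854, 5)
+```
+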
+
+
+
+
+ + ☆ BlindDiff: Empowering Degradation Modelling in Diffusion Models for + Blind Image Super-Resolution + + +
+ Diffusion models (DM) have shown remarkable promise in image
+super-resolution (SR). However, most of them are tailored to solving non-blind
+inverse problems with fixed known degradation settings, limiting their
+adaptability to real-world applications that involve complex unknown
+degradations. In this work, we propose BlindDiff, a DM-based blind SR method to
+tackle the blind degradation settings in SISR. BlindDiff seamlessly integrates
+the MAP-based optimization into DMs, which constructs a joint distribution of
+the low-resolution (LR) observation, high-resolution (HR) data, and degradation
+kernels for the data and kernel priors, and solves the blind SR problem by
+unfolding the MAP approach along with the reverse process. Unlike most DMs,
+BlindDiff firstly presents a modulated conditional transformer (MCFormer) that
+is pre-trained with noise and kernel constraints, further serving as a
+posterior sampler to provide both priors simultaneously. Then, we plug a simple
+yet effective kernel-aware gradient term between adjacent sampling iterations
+that guides the diffusion model to learn degradation consistency knowledge.
+This also enables joint refinement of the degradation model and the HR image by
+observing the previous denoised sample. With the MAP-based reverse diffusion
+process, we show that BlindDiff advocates alternate optimization for blur
+kernel estimation and HR image restoration in a mutual reinforcing manner.
+Experiments on both synthetic and real-world datasets show that BlindDiff
+achieves the state-of-the-art performance with significant model complexity
+reduction compared to recent DM-based methods. Code will be available at
+\url{https://github.com/lifengcs/BlindDiff}
+
+
+
+
+
+
+ + ☆ A Data-Driven Approach for Mitigating Dark Current Noise and Bad Pixels + in Complementary Metal Oxide Semiconductor Cameras for Space-based Telescopes + + +
+ In recent years, there has been a gradual increase in the performance of +Complementary Metal Oxide Semiconductor (CMOS) cameras. These cameras have +gained popularity as a viable alternative to charge-coupled device (CCD) +cameras in a wide range of applications. One particular application is the CMOS +camera installed in small space telescopes. However, the limited power and +spatial resources available on satellites present challenges in maintaining +ideal observation conditions, including temperature and radiation environment. +Consequently, images captured by CMOS cameras are susceptible to issues such as +dark current noise and defective pixels. In this paper, we introduce a +data-driven framework for mitigating dark current noise and bad pixels for CMOS +cameras. Our approach involves two key steps: pixel clustering and function +fitting. During pixel clustering step, we identify and group pixels exhibiting +similar dark current noise properties. Subsequently, in the function fitting +step, we formulate functions that capture the relationship between dark current +and temperature, as dictated by the Arrhenius law. Our framework leverages +ground-based test data to establish distinct temperature-dark current relations +for pixels within different clusters. The cluster results could then be +utilized to estimate the dark current noise level and detect bad pixels from +real observational data. To assess the effectiveness of our approach, we have +conducted tests using real observation data obtained from the Yangwang-1 +satellite, equipped with a near-ultraviolet telescope and an optical telescope. +The results show a considerable improvement in the detection efficiency of +space-based telescopes. + +
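+ The two steps described above, pixel clustering followed by an Arrhenius-law
+fit, can be sketched compactly with synthetic data. The temperatures, pixel
+populations, and clustering choices below are assumptions for illustration
+only, not the paper's calibration procedure.
+
+```python
+import numpy as np
+from sklearn.cluster import KMeans
+
+KB = 8.617333262e-5                     # Boltzmann constant in eV/K
+
+temps = np.linspace(233.0, 293.0, 7)    # hypothetical calibration temperatures (K)
+rng = np.random.default_rng(1)
+true_ea = rng.choice([0.6, 1.1], size=1000)                       # two synthetic pixel populations
+curves = 1e9 * np.exp(-true_ea[:, None] / (KB * temps[None, :]))  # Arrhenius dark current
+
+# 1) cluster pixels with similar dark-current behaviour (log scale)
+labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(np.log(curves))
+
+# 2) per cluster, fit log D = log A - Ea / (kB * T)  (a straight line in 1/T)
+for k in range(2):
+    log_d = np.log(curves[labels == k]).mean(axis=0)
+    slope, intercept = np.polyfit(1.0 / temps, log_d, deg=1)
+    print(f"cluster {k}: Ea ~ {-slope * KB:.2f} eV, A ~ {np.exp(intercept):.1e}")
+```
+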
+
+ comment: Accepted by the AJ, comments are welcome. The complete code could be + downloaded from: DOI: 10.12149/101387 +
+
+
+
+
+ + ☆ Learning on JPEG-LDPC Compressed Images: Classifying with Syndromes + + +
+ In goal-oriented communications, the objective of the receiver is often to +apply a Deep-Learning model, rather than reconstructing the original data. In +this context, direct learning over compressed data, without any prior decoding, +holds promise for enhancing the time-efficient execution of inference models at +the receiver. However, conventional entropic-coding methods like Huffman and +Arithmetic break data structure, rendering them unsuitable for learning without +decoding. In this paper, we propose an alternative approach in which entropic +coding is realized with Low-Density Parity Check (LDPC) codes. We hypothesize +that Deep Learning models can more effectively exploit the internal code +structure of LDPC codes. At the receiver, we leverage a specific class of +Recurrent Neural Networks (RNNs), specifically Gated Recurrent Unit (GRU), +trained for image classification. Our numerical results indicate that +classification based on LDPC-coded bit-planes surpasses Huffman and Arithmetic +coding, while necessitating a significantly smaller learning model. This +demonstrates the efficiency of classification directly from LDPC-coded data, +eliminating the need for any form of decompression, even partial, prior to +applying the learning model. + +
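+ The classification-from-coded-bits setup can be sketched with a small GRU
+model that consumes chunks of the coded bit stream directly. The chunk size,
+hidden size, and the random bits below are placeholders; no actual LDPC encoder
+is included in this sketch.
+
+```python
+import torch
+import torch.nn as nn
+
+class BitplaneGRUClassifier(nn.Module):
+    """GRU classifier over a sequence of coded bits, chunked into fixed-size symbols."""
+    def __init__(self, chunk=64, hidden=128, num_classes=10):
+        super().__init__()
+        self.chunk = chunk
+        self.gru = nn.GRU(input_size=chunk, hidden_size=hidden, batch_first=True)
+        self.head = nn.Linear(hidden, num_classes)
+
+    def forward(self, bits):                      # bits: (B, L) in {0, 1}, L % chunk == 0
+        x = bits.float().view(bits.size(0), -1, self.chunk)
+        _, h = self.gru(x)                        # h: (1, B, hidden)
+        return self.head(h.squeeze(0))
+
+coded_bits = torch.randint(0, 2, (8, 4096))       # stand-in for LDPC-coded bit-planes
+print(BitplaneGRUClassifier()(coded_bits).shape)  # torch.Size([8, 10])
+```
+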
+
+ comment: 5 pages, 3 figures, conference paper, submitted to the EUSIPCO 2024 + Conference +
+
+
+
+
+ + ☆ Generative Region-Language Pretraining for Open-Ended Object Detection CVPR 2024 + + +
+ In recent research, significant attention has been devoted to the
+open-vocabulary object detection task, aiming to generalize beyond the limited
+number of classes labeled during training and detect objects described by
+arbitrary category names at inference. Compared with conventional object
+detection, open vocabulary object detection largely extends the object
+detection categories. However, it relies on calculating the similarity between
+image regions and a set of arbitrary category names with a pretrained
+vision-and-language model. This implies that, despite its open-set nature, the
+task still needs the predefined object categories during the inference stage.
+This raises the question: What if we do not have exact knowledge of object
+categories during inference? In this paper, we refer to this new setting as
+generative open-ended object detection, which is a more general and practical
+problem. To address it, we formulate object detection as a generative problem
+and propose a simple framework named GenerateU, which can detect dense objects
+and generate their names in a free-form way. Particularly, we employ Deformable
+DETR as a region proposal generator with a language model translating visual
+regions to object names. To assess the free-form object detection task, we
+introduce an evaluation method designed to quantitatively measure the
+performance of generative outcomes. Extensive experiments demonstrate strong
+zero-shot detection performance of our GenerateU. For example, on the LVIS
+dataset, our GenerateU achieves comparable results to the open-vocabulary
+object detection method GLIP, even though the category names are not seen by
+GenerateU during inference. Code is available at:
+https://github.com/FoundationVision/GenerateU.
+
+
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Perceptual Quality-based Model Training under Annotator Label + Uncertainty + + +
+ Annotators exhibit disagreement during data labeling, which can be termed as +annotator label uncertainty. Annotator label uncertainty manifests in +variations of labeling quality. Training with a single low-quality annotation +per sample induces model reliability degradations. In this work, we first +examine the effects of annotator label uncertainty in terms of the model's +generalizability and prediction uncertainty. We observe that the model's +generalizability and prediction uncertainty degrade with the presence of +low-quality noisy labels. Meanwhile, our evaluation of existing uncertainty +estimation algorithms indicates their incapability in response to annotator +label uncertainty. To mitigate performance degradation, prior methods show that +training models with labels collected from multiple independent annotators can +enhance generalizability. However, they require massive annotations. Hence, we +introduce a novel perceptual quality-based model training framework to +objectively generate multiple labels for model training to enhance reliability, +while avoiding massive annotations. Specifically, we first select a subset of +samples with low perceptual quality scores ranked by statistical regularities +of visual signals. We then assign de-aggregated labels to each sample in this +subset to obtain a training set with multiple labels. Our experiments and +analysis demonstrate that training with the proposed framework alleviates the +degradation of generalizability and prediction uncertainty caused by annotator +label uncertainty. + +
+
+
+
+
+ + ☆ Animate Your Motion: Turning Still Images into Dynamic Videos + + +
+ In recent years, diffusion models have made remarkable strides in +text-to-video generation, sparking a quest for enhanced control over video +outputs to more accurately reflect user intentions. Traditional efforts +predominantly focus on employing either semantic cues, like images or depth +maps, or motion-based conditions, like moving sketches or object bounding +boxes. Semantic inputs offer a rich scene context but lack detailed motion +specificity; conversely, motion inputs provide precise trajectory information +but miss the broader semantic narrative. For the first time, we integrate both +semantic and motion cues within a diffusion model for video generation, as +demonstrated in Fig 1. To this end, we introduce the Scene and Motion +Conditional Diffusion (SMCD), a novel methodology for managing multimodal +inputs. It incorporates a recognized motion conditioning module and +investigates various approaches to integrate scene conditions, promoting +synergy between different modalities. For model training, we separate the +conditions for the two modalities, introducing a two-stage training pipeline. +Experimental results demonstrate that our design significantly enhances video +quality, motion precision, and semantic coherence. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ A Hybrid SNN-ANN Network for Event-based Object Detection with Spatial + and Temporal Attention + + +
+ Event cameras offer high temporal resolution and dynamic range with minimal
+motion blur, making them promising for object detection tasks. While Spiking
+Neural Networks (SNNs) are a natural match for event-based sensory data and
+enable ultra-energy efficient and low latency inference on neuromorphic
+hardware, Artificial Neural Networks (ANNs) tend to display more stable
+training dynamics and faster convergence resulting in greater task performance.
+Hybrid SNN-ANN approaches are a promising alternative, making it possible to
+leverage the strengths of both SNN and ANN architectures. In this work, we
+introduce the first Hybrid Attention-based SNN-ANN backbone for object
+detection using event cameras. We propose a novel Attention-based SNN-ANN
+bridge module to capture sparse spatial and temporal relations from the SNN
+layer and convert them into dense feature maps for the ANN part of the
+backbone. Experimental results demonstrate that our proposed method surpasses
+baseline hybrid and SNN-based approaches by significant margins, with results
+comparable to existing ANN-based methods. Extensive ablation studies confirm
+the effectiveness of our proposed modules and architectural choices. These
+results pave the way toward a hybrid SNN-ANN architecture that achieves
+ANN-like performance at a drastically reduced parameter budget. We implemented
+the SNN blocks on digital neuromorphic hardware to investigate latency and
+power consumption and demonstrate the feasibility of our approach.
+
+
+
+
+
+
+ + ☆ AUTONODE: A Neuro-Graphic Self-Learnable Engine for Cognitive GUI + Automation + + +
+ In recent advancements within the domain of Large Language Models (LLMs), +there has been a notable emergence of agents capable of addressing Robotic +Process Automation (RPA) challenges through enhanced cognitive capabilities and +sophisticated reasoning. This development heralds a new era of scalability and +human-like adaptability in goal attainment. In this context, we introduce +AUTONODE (Autonomous User-interface Transformation through Online Neuro-graphic +Operations and Deep Exploration). AUTONODE employs advanced neuro-graphical +techniques to facilitate autonomous navigation and task execution on web +interfaces, thereby obviating the necessity for predefined scripts or manual +intervention. Our engine empowers agents to comprehend and implement complex +workflows, adapting to dynamic web environments with unparalleled efficiency. +Our methodology synergizes cognitive functionalities with robotic automation, +endowing AUTONODE with the ability to learn from experience. We have integrated +an exploratory module, DoRA (Discovery and mapping Operation for graph +Retrieval Agent), which is instrumental in constructing a knowledge graph that +the engine utilizes to optimize its actions and achieve objectives with minimal +supervision. The versatility and efficacy of AUTONODE are demonstrated through +a series of experiments, highlighting its proficiency in managing a diverse +array of web-based tasks, ranging from data extraction to transaction +processing. + +
+
+
+
+
+ + ☆ Computer User Interface Understanding. A New Dataset and a Learning + Framework + + +
+ User Interface (UI) understanding has been an increasingly popular topic over
+the last few years. So far, there has been a vast focus solely on web and
+mobile applications. In this paper, we introduce the harder task of computer UI
+understanding. With the goal of enabling research in this field, we have
+generated a dataset with a set of videos where a user is performing a sequence
+of actions and each image shows the desktop contents at that time point. We
+also present a framework that is composed of a synthetic sample generation
+pipeline to augment the dataset with relevant characteristics, and a
+contrastive learning method to classify images in the videos. We take advantage
+of the natural conditional, tree-like relationship of the images'
+characteristics to regularize the learning of the representations by dealing
+with multiple partial tasks simultaneously. Experimental results show that the
+proposed framework outperforms previously proposed hierarchical multi-label
+contrastive losses in fine-grained UI classification.
+
+
+
+ comment: 14 pages main paper, 6 pages appendix +
+
+
+
+
+ + ☆ SemanticHuman-HD: High-Resolution Semantic Disentangled 3D Human + Generation + + +
+ With the development of neural radiance fields and generative models,
+numerous methods have been proposed for learning 3D human generation from 2D
+images. These methods allow control over the pose of the generated 3D human and
+enable rendering from different viewpoints. However, none of these methods
+explore semantic disentanglement in human image synthesis, i.e., they cannot
+disentangle the generation of different semantic parts, such as the body, tops,
+and bottoms. Furthermore, existing methods are limited to synthesizing images
+at $512^2$ resolution due to the high computational cost of neural radiance
+fields. To address these limitations, we introduce SemanticHuman-HD, the first
+method to achieve semantic disentangled human image synthesis. Notably,
+SemanticHuman-HD is also the first method to achieve 3D-aware image synthesis
+at $1024^2$ resolution, benefiting from our proposed 3D-aware super-resolution
+module. By leveraging the depth maps and semantic masks as guidance for the
+3D-aware super-resolution, we significantly reduce the number of sampling
+points during volume rendering, thereby reducing the computational cost. Our
+comparative experiments demonstrate the superiority of our method. The
+effectiveness of each proposed component is also verified through ablation
+studies. Moreover, our method opens up exciting possibilities for various
+applications, including 3D garment generation, semantic-aware image synthesis,
+controllable image synthesis, and out-of-domain image synthesis.
+
+
+
+ comment: 26 pages, 14 figures +
+
+
+
+
+ + ☆ CoReEcho: Continuous Representation Learning for 2D+time + Echocardiography Analysis + + +
+ Deep learning (DL) models have been advancing automatic medical image +analysis on various modalities, including echocardiography, by offering a +comprehensive end-to-end training pipeline. This approach enables DL models to +regress ejection fraction (EF) directly from 2D+time echocardiograms, resulting +in superior performance. However, the end-to-end training pipeline makes the +learned representations less explainable. The representations may also fail to +capture the continuous relation among echocardiogram clips, indicating the +existence of spurious correlations, which can negatively affect the +generalization. To mitigate this issue, we propose CoReEcho, a novel training +framework emphasizing continuous representations tailored for direct EF +regression. Our extensive experiments demonstrate that CoReEcho: 1) outperforms +the current state-of-the-art (SOTA) on the largest echocardiography dataset +(EchoNet-Dynamic) with MAE of 3.90 & R2 of 82.44, and 2) provides robust and +generalizable features that transfer more effectively in related downstream +tasks. The code is publicly available at https://github.com/fadamsyah/CoReEcho. + +
+
+
+
+
+ + ☆ Cardiac valve event timing in echocardiography using deep learning and + triplane recordings + + +
+ Cardiac valve event timing plays a crucial role when conducting clinical +measurements using echocardiography. However, established automated approaches +are limited by the need of external electrocardiogram sensors, and manual +measurements often rely on timing from different cardiac cycles. Recent methods +have applied deep learning to cardiac timing, but they have mainly been +restricted to only detecting two key time points, namely end-diastole (ED) and +end-systole (ES). In this work, we propose a deep learning approach that +leverages triplane recordings to enhance detection of valve events in +echocardiography. Our method demonstrates improved performance detecting six +different events, including valve events conventionally associated with ED and +ES. Of all events, we achieve an average absolute frame difference (aFD) of +maximum 1.4 frames (29 ms) for start of diastasis, down to 0.6 frames (12 ms) +for mitral valve opening when performing a ten-fold cross-validation with test +splits on triplane data from 240 patients. On an external independent test +consisting of apical long-axis data from 180 other patients, the worst +performing event detection had an aFD of 1.8 (30 ms). The proposed approach has +the potential to significantly impact clinical practice by enabling more +accurate, rapid and comprehensive event detection, leading to improved clinical +measurements. + +
+
+ comment: To be published in IEEE Journal of Biomedical and Health Informatics. + 10 pages, 4 figures +
+
+
+
+
+ + ☆ Improving Medical Multi-modal Contrastive Learning with Expert + Annotations + + +
+ We introduce eCLIP, an enhanced version of the CLIP model that integrates +expert annotations in the form of radiologist eye-gaze heatmaps. It tackles key +challenges in contrastive multi-modal medical imaging analysis, notably data +scarcity and the "modality gap" -- a significant disparity between image and +text embeddings that diminishes the quality of representations and hampers +cross-modal interoperability. eCLIP integrates a heatmap processor and +leverages mixup augmentation to efficiently utilize the scarce expert +annotations, thus boosting the model's learning effectiveness. eCLIP is +designed to be generally applicable to any variant of CLIP without requiring +any modifications of the core architecture. Through detailed evaluations across +several tasks, including zero-shot inference, linear probing, cross-modal +retrieval, and Retrieval Augmented Generation (RAG) of radiology reports using +a frozen Large Language Model, eCLIP showcases consistent improvements in +embedding quality. The outcomes reveal enhanced alignment and uniformity, +affirming eCLIP's capability to harness high-quality annotations for enriched +multi-modal analysis in the medical imaging domain. + +
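+ Mixup over scarce expert annotations, as mentioned above, can be illustrated
+in a few lines: the same blending coefficient is shared across images, heatmaps,
+and label embeddings. This is a generic mixup sketch under assumed tensor
+shapes, not eCLIP's exact augmentation or heatmap processor.
+
+```python
+import torch
+
+def mixup(images, heatmaps, labels, alpha=0.4):
+    """Mix a batch with a shuffled copy of itself; images, expert heatmaps and
+    label embeddings are blended with the same Beta-sampled coefficient."""
+    lam = torch.distributions.Beta(alpha, alpha).sample()
+    perm = torch.randperm(images.size(0))
+    mix = lambda a, b: lam * a + (1 - lam) * b
+    return (mix(images, images[perm]),
+            mix(heatmaps, heatmaps[perm]),
+            mix(labels, labels[perm]))
+
+imgs = torch.rand(8, 3, 224, 224)
+maps = torch.rand(8, 1, 224, 224)      # hypothetical eye-gaze heatmaps
+labs = torch.rand(8, 512)              # hypothetical text/label embeddings
+m_imgs, m_maps, m_labs = mixup(imgs, maps, labs)
+print(m_imgs.shape, m_maps.shape, m_labs.shape)
+```
+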
+
+ comment: Under review at a conference +
+
+
+
+
+ + ☆ GGRt: Towards Generalizable 3D Gaussians without Pose Priors in + Real-Time + + +
+ This paper presents GGRt, a novel approach to generalizable novel view +synthesis that alleviates the need for real camera poses, complexity in +processing high-resolution images, and lengthy optimization processes, thus +facilitating stronger applicability of 3D Gaussian Splatting (3D-GS) in +real-world scenarios. Specifically, we design a novel joint learning framework +that consists of an Iterative Pose Optimization Network (IPO-Net) and a +Generalizable 3D-Gaussians (G-3DG) model. With the joint learning mechanism, +the proposed framework can inherently estimate robust relative pose information +from the image observations and thus primarily alleviate the requirement of +real camera poses. Moreover, we implement a deferred back-propagation mechanism +that enables high-resolution training and inference, overcoming the resolution +constraints of previous methods. To enhance the speed and efficiency, we +further introduce a progressive Gaussian cache module that dynamically adjusts +during training and inference. As the first pose-free generalizable 3D-GS +framework, GGRt achieves inference at $\ge$ 5 FPS and real-time rendering at +$\ge$ 100 FPS. Through extensive experimentation, we demonstrate that our +method outperforms existing NeRF-based pose-free techniques in terms of +inference speed and effectiveness. It can also approach the real pose-based +3D-GS methods. Our contributions provide a significant leap forward for the +integration of computer vision and computer graphics into practical +applications, offering state-of-the-art results on LLFF, KITTI, and Waymo Open +datasets and enabling real-time rendering for immersive experiences. + +
+
+
+
+
+ + ☆ RCooper: A Real-world Large-scale Dataset for Roadside Cooperative + Perception CVPR2024 + + +
+ The value of roadside perception, which could extend the boundaries of
+autonomous driving and traffic management, has gradually become more prominent
+and acknowledged in recent years. However, existing roadside perception
+approaches only focus on the single-infrastructure sensor system, which cannot
+realize a comprehensive understanding of a traffic area because of the limited
+sensing range and blind spots. Oriented towards high-quality roadside
+perception, we need Roadside Cooperative Perception (RCooper) to achieve
+practical area-coverage roadside perception for restricted traffic areas.
+RCooper has its own domain-specific challenges, but further exploration is
+hindered due to the lack of datasets. We hence release the first real-world,
+large-scale RCooper dataset to foster research on practical roadside
+cooperative perception, including detection and tracking. The manually
+annotated dataset comprises 50k images and 30k point clouds, including two
+representative traffic scenes (i.e., intersection and corridor). The
+constructed benchmarks prove the effectiveness of roadside cooperative
+perception and demonstrate the direction of further research. Codes and dataset
+can be accessed at: https://github.com/AIR-THU/DAIR-RCooper.
+
+
+
+ comment: Accepted by CVPR2024. 10 pages with 6 figures +
+
+
+
+
+ + ☆ E4C: Enhance Editability for Text-Based Image Editing by Harnessing + Efficient CLIP Guidance + + +
+ Diffusion-based image editing is a composite process of preserving the source
+image content and generating new content or applying modifications. While
+current editing approaches have made improvements under text guidance, most of
+them have only focused on preserving the information of the input image,
+disregarding the importance of editability and alignment to the target prompt.
+In this paper, we prioritize the editability by proposing a zero-shot image
+editing method, named \textbf{E}nhance \textbf{E}ditability for text-based
+image \textbf{E}diting via \textbf{E}fficient \textbf{C}LIP guidance
+(\textbf{E4C}), which only requires inference-stage optimization to explicitly
+enhance the editability and text alignment. Specifically, we develop a unified
+dual-branch feature-sharing pipeline that enables the preservation of the
+structure or texture of the source image while allowing the other to be adapted
+based on the editing task. We further integrate CLIP guidance into our pipeline
+by utilizing our novel random-gateway optimization mechanism to efficiently
+enhance the semantic alignment with the target prompt. Comprehensive
+quantitative and qualitative experiments demonstrate that our method
+effectively resolves the text alignment issues prevalent in existing methods
+while maintaining the fidelity to the source image, and performs well across a
+wide range of editing tasks.
+
+
+
+
+
+
+ + ☆ TransLandSeg: A Transfer Learning Approach for Landslide Semantic + Segmentation Based on Vision Foundation Model + + +
+ Landslides are one of the most destructive natural disasters in the world,
+posing a serious threat to human life and safety. The development of foundation
+models has provided a new research paradigm for large-scale landslide
+detection. The Segment Anything Model (SAM) has garnered widespread attention
+in the field of image segmentation. However, our experiment found that SAM
+performed poorly in the task of landslide segmentation. We propose
+TransLandSeg, which is a transfer learning approach for landslide semantic
+segmentation based on a vision foundation model (VFM). TransLandSeg outperforms
+traditional semantic segmentation models on both the Landslide4Sense dataset
+and the Bijie landslide dataset. Our proposed adaptive transfer learning (ATL)
+architecture enables the powerful segmentation capability of SAM to be
+transferred to landslide detection by training only 1.3% of SAM's parameters,
+which greatly improves the training efficiency of the model. Finally, we
+conducted ablation experiments on models with different ATL structures and
+concluded that the deployment location and residual connection of ATL play an
+important role in TransLandSeg accuracy improvement.
+
+
+
+
+
+
+ + ☆ Depth-induced Saliency Comparison Network for Diagnosis of Alzheimer's + Disease via Jointly Analysis of Visual Stimuli and Eye Movements + + +
+ Early diagnosis of Alzheimer's Disease (AD) is very important for following
+medical treatments, and eye movements under special visual stimuli may serve as
+a potential non-invasive biomarker for detecting cognitive abnormalities of AD
+patients. In this paper, we propose a Depth-induced Saliency Comparison Network
+(DISCN) for eye movement analysis, which may be used to diagnose Alzheimer's
+disease. In DISCN, a salient attention module fuses normal eye movements with
+RGB and depth maps of visual stimuli using hierarchical salient attention (SAA)
+to evaluate comprehensive saliency maps, which contain information from both
+visual stimuli and normal eye movement behaviors. In addition, we introduce a
+serial attention module (SEA) to emphasize the most abnormal eye movement
+behaviors to reduce personal bias for a more robust result. According to our
+experiments, the DISCN achieves consistent validity in classifying the eye
+movements between the AD patients and normal controls.
+
+
+
+
+
+
+ + ☆ URS-NeRF: Unordered Rolling Shutter Bundle Adjustment for Neural + Radiance Fields + + +
+ We propose a novel rolling shutter bundle adjustment method for neural
+radiance fields (NeRF), which utilizes the unordered rolling shutter (RS)
+images to obtain the implicit 3D representation. Existing NeRF methods suffer
+from low-quality images and inaccurate initial camera poses due to the RS
+effect in the image, whereas the previous method that incorporates the RS into
+NeRF requires strict sequential data input, limiting its widespread
+applicability. In contrast, our method recovers the physical formation of RS
+images by estimating camera poses and velocities, thereby removing the input
+constraints on sequential data. Moreover, we adopt a coarse-to-fine training
+strategy, in which the RS epipolar constraints of the pairwise frames in the
+scene graph are used to detect the camera poses that fall into local minima.
+The poses detected as outliers are corrected by the interpolation method with
+neighboring poses. The experimental results validate the effectiveness of our
+method over state-of-the-art works and demonstrate that the reconstruction of
+3D representations is not constrained by the requirement of video sequence
+input.
+
+
+
+
+
+
+ + ☆ Enhancing Human-Centered Dynamic Scene Understanding via Multiple LLMs + Collaborated Reasoning + + +
+ Human-centered dynamic scene understanding plays a pivotal role in enhancing
+the capability of robotic and autonomous systems, in which Video-based
+Human-Object Interaction (V-HOI) detection is a crucial task in semantic scene
+understanding, aimed at comprehensively understanding HOI relationships within
+a video to benefit the behavioral decisions of mobile robots and autonomous
+driving systems. Although previous V-HOI detection models have made significant
+strides in accurate detection on specific datasets, they still lack the general
+reasoning ability like human beings to effectively induce HOI relationships. In
+this study, we propose V-HOI Multi-LLMs Collaborated Reasoning (V-HOI MLCR), a
+novel framework consisting of a series of plug-and-play modules that could
+facilitate the performance of current V-HOI detection models by leveraging the
+strong reasoning ability of different off-the-shelf pre-trained large language
+models (LLMs). We design a two-stage collaboration system of different LLMs for
+the V-HOI task. Specifically, in the first stage, we design a Cross-Agents
+Reasoning scheme to let the LLMs conduct reasoning from different aspects. In
+the second stage, we perform Multi-LLMs Debate to get the final reasoning
+answer based on the different knowledge in different LLMs. Additionally, we
+devise an auxiliary training strategy that utilizes CLIP, a large
+vision-language model to enhance the base V-HOI models' discriminative ability
+to better cooperate with LLMs. We validate the superiority of our design by
+demonstrating its effectiveness in improving the prediction accuracy of the
+base V-HOI model via reasoning from multiple perspectives.
+
+
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ CSDNet: Detect Salient Object in Depth-Thermal via A Lightweight Cross + Shallow and Deep Perception Network + + +
+ While we enjoy the richness and informativeness of multimodal data, it also
+introduces interference and redundancy of information. To achieve optimal
+domain interpretation with limited resources, we propose CSDNet, a lightweight
+\textbf{C}ross \textbf{S}hallow and \textbf{D}eep Perception \textbf{Net}work
+designed to integrate two modalities with less coherence, thereby discarding
+redundant information or even modality. We implement our CSDNet for the Salient
+Object Detection (SOD) task in robotic perception. The proposed method
+capitalises on spatial information prescreening and implicit coherence
+navigation across shallow and deep layers of the depth-thermal (D-T) modality,
+prioritising integration over fusion to maximise the scene interpretation. To
+further refine the descriptive capabilities of the encoder for the less-known
+D-T modalities, we also propose SAMAEP to guide an effective feature mapping to
+the generalised feature space. Our approach is tested on the VDT-2048 dataset.
+Leveraging the D-T modality, it outperforms SOTA methods that use RGB-T or
+RGB-D modalities for the first time, and achieves performance comparable to the
+RGB-D-T triple-modality benchmark method while running 5.97 times faster and
+requiring only 0.0036 times the FLOPs. This demonstrates that the proposed
+CSDNet effectively integrates the information from the D-T modality. The code
+will be released upon acceptance.
+
+
+
+
+
+
+ + ☆ DyBluRF: Dynamic Neural Radiance Fields from Blurry Monocular Video + + +
+ Recent advancements in dynamic neural radiance field methods have yielded +remarkable outcomes. However, these approaches rely on the assumption of sharp +input images. When faced with motion blur, existing dynamic NeRF methods often +struggle to generate high-quality novel views. In this paper, we propose +DyBluRF, a dynamic radiance field approach that synthesizes sharp novel views +from a monocular video affected by motion blur. To account for motion blur in +input images, we simultaneously capture the camera trajectory and object +Discrete Cosine Transform (DCT) trajectories within the scene. Additionally, we +employ a global cross-time rendering approach to ensure consistent temporal +coherence across the entire scene. We curate a dataset comprising diverse +dynamic scenes that are specifically tailored for our task. Experimental +results on our dataset demonstrate that our method outperforms existing +approaches in generating sharp novel views from motion-blurred inputs while +maintaining spatial-temporal consistency of the scene. + +
+
+
+
+
+ + ☆ KP-RED: Exploiting Semantic Keypoints for Joint 3D Shape Retrieval and + Deformation CVPR 2024 + + +
+ In this paper, we present KP-RED, a unified KeyPoint-driven REtrieval and +Deformation framework that takes object scans as input and jointly retrieves +and deforms the most geometrically similar CAD models from a pre-processed +database to tightly match the target. Unlike existing dense matching based +methods that typically struggle with noisy partial scans, we propose to +leverage category-consistent sparse keypoints to naturally handle both full and +partial object scans. Specifically, we first employ a lightweight retrieval +module to establish a keypoint-based embedding space, measuring the similarity +among objects by dynamically aggregating deformation-aware local-global +features around extracted keypoints. Objects that are close in the embedding +space are considered similar in geometry. Then we introduce the neural +cage-based deformation module that estimates the influence vector of each +keypoint upon cage vertices inside its local support region to control the +deformation of the retrieved shape. Extensive experiments on the synthetic +dataset PartNet and the real-world dataset Scan2CAD demonstrate that KP-RED +surpasses existing state-of-the-art approaches by a large margin. Codes and +trained models will be released in https://github.com/lolrudy/KP-RED. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ DiffMAC: Diffusion Manifold Hallucination Correction for High + Generalization Blind Face Restoration + + +
+ Blind face restoration (BFR) is a highly challenging problem due to the +uncertainty of degradation patterns. Current methods have low generalization +across photorealistic and heterogeneous domains. In this paper, we propose a +Diffusion-Information-Diffusion (DID) framework to tackle diffusion manifold +hallucination correction (DiffMAC), which achieves high-generalization face +restoration in diverse degraded scenes and heterogeneous domains. Specifically, +the first diffusion stage aligns the restored face with spatial feature +embedding of the low-quality face based on AdaIN, which synthesizes +degradation-removal results but with uncontrollable artifacts for some hard +cases. Based on Stage I, Stage II considers information compression using +manifold information bottleneck (MIB) and finetunes the first diffusion model +to improve facial fidelity. DiffMAC effectively fights against blind +degradation patterns and synthesizes high-quality faces with attribute and +identity consistencies. Experimental results demonstrate the superiority of +DiffMAC over state-of-the-art methods, with a high degree of generalization in +real-world and heterogeneous settings. The source code and models will be +public. + +
+
+ comment: 15 pages, 12 figures +
+
+
+
+
+ + ☆ Adaptive Random Feature Regularization on Fine-tuning Deep Neural + Networks CVPR 2024 + + +
+ While fine-tuning is a de facto standard method for training deep neural
+networks, it still suffers from overfitting when using small target datasets.
+Previous methods improve fine-tuning performance by maintaining knowledge of
+the source datasets or introducing regularization terms such as contrastive
+loss. However, these methods require auxiliary source information (e.g., source
+labels or datasets) or heavy additional computations. In this paper, we propose
+a simple method called adaptive random feature regularization (AdaRand).
+AdaRand helps the feature extractors of training models to adaptively change
+the distribution of feature vectors for downstream classification tasks without
+auxiliary source information and with reasonable computation costs. To this
+end, AdaRand minimizes the gap between feature vectors and random reference
+vectors that are sampled from class conditional Gaussian distributions.
+Furthermore, AdaRand dynamically updates the conditional distribution to follow
+the currently updated feature extractors and balance the distance between
+classes in feature spaces. Our experiments show that AdaRand outperforms other
+fine-tuning regularization methods that require auxiliary source information
+and heavy computation costs. + +
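To make the described regularizer concrete, below is a minimal PyTorch sketch of the mechanism the abstract outlines: class-conditional Gaussian reference vectors pulled toward the current features, with class means tracked by an exponential moving average. The function and parameter names (adarand_step, sigma, lam, momentum) are illustrative assumptions, not the authors' implementation.

```python
# Hypothetical sketch of a random-feature-regularization step in the spirit of
# AdaRand; `backbone` returns feature vectors, `head` maps them to logits, and
# class means are tracked with a simple exponential moving average.
import torch
import torch.nn.functional as F

def adarand_step(backbone, head, x, y, class_means, sigma=1.0, lam=0.1, momentum=0.9):
    feats = backbone(x)                          # (B, D) feature vectors
    logits = head(feats)                         # (B, C) classification logits

    # Sample class-conditional Gaussian reference vectors for each example.
    mu = class_means[y]                          # (B, D) per-example class means
    refs = mu + sigma * torch.randn_like(feats)  # r ~ N(mu_c, sigma^2 I)

    # Task loss plus a penalty pulling features toward the sampled references.
    loss = F.cross_entropy(logits, y) + lam * F.mse_loss(feats, refs)

    # Dynamically update the conditional means to follow the current extractor.
    with torch.no_grad():
        for c in y.unique():
            batch_mean = feats[y == c].mean(dim=0)
            class_means[c] = momentum * class_means[c] + (1 - momentum) * batch_mean
    return loss
```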
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ RangeLDM: Fast Realistic LiDAR Point Cloud Generation + + +
+ Autonomous driving demands high-quality LiDAR data, yet the cost of physical +LiDAR sensors presents a significant scaling-up challenge. While recent efforts +have explored deep generative models to address this issue, they often consume +substantial computational resources with slow generation speeds while suffering +from a lack of realism. To address these limitations, we introduce RangeLDM, a +novel approach for rapidly generating high-quality range-view LiDAR point +clouds via latent diffusion models. We achieve this by correcting range-view +data distribution for accurate projection from point clouds to range images via +Hough voting, which has a critical impact on generative learning. We then +compress the range images into a latent space with a variational autoencoder, +and leverage a diffusion model to enhance expressivity. Additionally, we +instruct the model to preserve 3D structural fidelity by devising a +range-guided discriminator. Experimental results on KITTI-360 and nuScenes +datasets demonstrate both the robust expressiveness and fast speed of our LiDAR +point cloud generation. + +
+
+
+
+
+ + ☆ PQDynamicISP: Dynamically Controlled Image Signal Processor for Any + Image Sensors Pursuing Perceptual Quality SP + + +
+ Full DNN-based image signal processors (ISPs) have been actively studied and +have achieved superior image quality compared to conventional ISPs. In contrast +to this trend, we propose a lightweight ISP that consists of simple +conventional ISP functions but achieves high image quality by increasing +expressiveness. Specifically, instead of tuning the parameters of the ISP, we +propose to control them dynamically for each environment and even locally. As a +result, state-of-the-art accuracy is achieved on various datasets, including +other tasks like tone mapping and image enhancement, even though ours is +lighter than DNN-based ISPs. Additionally, our method can process different +image sensors with a single ISP through dynamic control, whereas conventional +methods require training for each sensor. + +
+
+ comment: Keywords: image signal processor, ISP, image enhancement, tone + mapping +
+
+
+
+
+ + ☆ Approximation and bounding techniques for the Fisher-Rao distances + + +
+ The Fisher-Rao distance between two probability distributions of a +statistical model is defined as the Riemannian geodesic distance induced by the +Fisher information metric. In order to calculate the Fisher-Rao distance in +closed-form, we need (1) to elicit a formula for the Fisher-Rao geodesics, and +(2) to integrate the Fisher length element along those geodesics. We consider +several numerically robust approximation and bounding techniques for the +Fisher-Rao distances: First, we report generic upper bounds on Fisher-Rao +distances based on closed-form 1D Fisher-Rao distances of submodels. Second, we +describe several generic approximation schemes depending on whether the +Fisher-Rao geodesics or pregeodesics are available in closed-form or not. In +particular, we obtain a generic method to guarantee an arbitrarily small +additive error on the approximation provided that Fisher-Rao pregeodesics and +tight lower and upper bounds are available. Third, we consider the case of +Fisher metrics being Hessian metrics, and report generic tight upper bounds on +the Fisher-Rao distances using techniques of information geometry. +Uniparametric and biparametric statistical models always have Fisher Hessian +metrics, and in general a simple test allows to check whether the Fisher +information matrix yields a Hessian metric or not. Fourth, we consider +elliptical distribution families and show how to apply the above techniques to +these models. We also propose two new distances based either on the Fisher-Rao +lengths of curves serving as proxies of Fisher-Rao geodesics, or based on the +Birkhoff/Hilbert projective cone distance. Last, we consider an alternative +group-theoretic approach for statistical transformation models based on the +notion of maximal invariant which yields insights on the structures of the +Fisher-Rao distance formula which may be used fruitfully in applications. + +
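For readers skimming the listing, the quantity being approximated is the standard Fisher-Rao geodesic distance; the definition below is textbook material implied by the abstract, not a contribution of the paper.

```latex
% Fisher-Rao distance: geodesic length under the Fisher information metric I(theta).
d_{\mathrm{FR}}(\theta_0,\theta_1)
  = \inf_{\substack{c:[0,1]\to\Theta\\ c(0)=\theta_0,\ c(1)=\theta_1}}
    \int_0^1 \sqrt{\dot c(t)^{\top} I\!\big(c(t)\big)\, \dot c(t)}\,\mathrm{d}t,
\qquad
I(\theta)_{ij} = \mathbb{E}_{x\sim p_\theta}\!\big[
  \partial_{\theta_i}\log p_\theta(x)\,\partial_{\theta_j}\log p_\theta(x)\big].
```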
+
+ comment: 38 pages +
+
+
+
+
+ + ☆ Monkeypox disease recognition model based on improved SE-InceptionV3 + + +
+ In the wake of the global spread of monkeypox, accurate disease recognition
+has become crucial. This study introduces an improved SE-InceptionV3 model,
+embedding the SENet module and incorporating L2 regularization into the
+InceptionV3 framework to enhance monkeypox disease detection. Utilizing the
+Kaggle monkeypox dataset, which includes images of monkeypox and similar skin
+conditions, our model demonstrates a noteworthy accuracy of 96.71% on the test
+set, outperforming conventional methods and deep learning models. The SENet
+module's channel attention mechanism significantly elevates feature
+representation, while L2 regularization ensures robust generalization.
+Extensive experiments validate the model's superiority in precision, recall,
+and F1 score, highlighting its effectiveness in differentiating monkeypox
+lesions in diverse and complex cases. The study not only provides insights into
+the application of advanced CNN architectures in medical diagnostics but also
+opens avenues for further research in model optimization and hyperparameter
+tuning for enhanced disease recognition. The code is available at
+https://github.com/jzc777/SE-inceptionV3-L2 + +
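For context, the SENet channel-attention block that the abstract embeds into InceptionV3 is usually implemented along the following lines; this is a generic PyTorch sketch (layer names and the reduction ratio are assumptions), and the L2 regularization mentioned would typically enter as weight decay in the optimizer rather than inside the module.

```python
# Generic squeeze-and-excitation (SE) channel-attention block, as commonly
# inserted into CNN backbones such as InceptionV3; not the paper's exact code.
import torch
import torch.nn as nn

class SEBlock(nn.Module):
    def __init__(self, channels, reduction=16):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels),
            nn.Sigmoid(),
        )

    def forward(self, x):                            # x: (B, C, H, W)
        w = x.mean(dim=(2, 3))                       # squeeze: global average pooling
        w = self.fc(w).unsqueeze(-1).unsqueeze(-1)   # excitation: per-channel weights
        return x * w                                 # reweight feature channels
```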
+
+
+
+
+ + ☆ VRHCF: Cross-Source Point Cloud Registration via Voxel Representation + and Hierarchical Correspondence Filtering ICME + + +
+ Addressing the challenges posed by the substantial gap in point cloud data +collected from diverse sensors, achieving robust cross-source point cloud +registration becomes a formidable task. In response, we present a novel +framework for point cloud registration with broad applicability, suitable for +both homologous and cross-source registration scenarios. To tackle the issues +arising from different densities and distributions in cross-source point cloud +data, we introduce a feature representation based on spherical voxels. +Furthermore, addressing the challenge of numerous outliers and mismatches in +cross-source registration, we propose a hierarchical correspondence filtering +approach. This method progressively filters out mismatches, yielding a set of +high-quality correspondences. Our method exhibits versatile applicability and +excels in both traditional homologous registration and challenging cross-source +registration scenarios. Specifically, in homologous registration using the +3DMatch dataset, we achieve the highest registration recall of 95.1% and an +inlier ratio of 87.8%. In cross-source point cloud registration, our method +attains the best RR on the 3DCSR dataset, demonstrating a 9.3 percentage points +improvement. The code is available at https://github.com/GuiyuZhao/VRHCF. + +
+
+ comment: Accepted by IEEE International Conference on Multimedia and Expo + (ICME), 2024 +
+
+
+
+
+ + ☆ CrossGLG: LLM Guides One-shot Skeleton-based 3D Action Recognition in a + Cross-level Manner + + +
+ Most existing one-shot skeleton-based action recognition methods focus on raw
+low-level information (e.g., joint location), and may suffer from local
+information loss and low generalization ability. To alleviate these, we propose
+to leverage text description generated from large language models (LLM) that
+contain high-level human knowledge, to guide feature learning, in a
+global-local-global way. Particularly, during training, we design $2$ prompts
+to gain global and local text descriptions of each action from an LLM. We first
+utilize the global text description to guide the skeleton encoder to focus on
+informative joints (i.e., global-to-local). Then we build non-local interaction
+between local text and joint features, to form the final global representation
+(i.e., local-to-global). To mitigate the asymmetry issue between the training
+and inference phases, we further design a dual-branch architecture that allows
+the model to perform novel class inference without any text input, also making
+the additional inference cost negligible compared with the base skeleton
+encoder. Extensive experiments on three different benchmarks show that CrossGLG
+consistently outperforms the existing SOTA methods with large margins, and the
+inference cost (model size) is only $2.8$\% of that of the previous SOTA.
+CrossGLG can also serve as a plug-and-play module that can substantially
+enhance the performance of different SOTA skeleton encoders with a negligible
+cost during inference. The source code will be released soon. + +
+
+
+
+
+ + ☆ Learning Physical Dynamics for Object-centric Visual Prediction + + +
+ The ability to model the underlying dynamics of visual scenes and reason
+about the future is central to human intelligence. Many attempts have been made
+to empower intelligent systems with such physical understanding and prediction
+abilities. However, most existing methods focus on pixel-to-pixel prediction,
+which suffers from heavy computational costs while lacking a deep understanding
+of the physical dynamics behind videos. Recently, object-centric prediction
+methods have emerged and attracted increasing interest. Inspired by this, this
+paper proposes an unsupervised object-centric prediction model that makes
+future predictions by learning visual dynamics between objects. Our model
+consists of two modules: a perceptual module and a dynamic module. The
+perceptual module is utilized to decompose images into several objects and
+synthesize images with a set of object-centric representations. The dynamic
+module fuses contextual information, takes environment-object and object-object
+interactions into account, and predicts the future trajectories of objects.
+Extensive experiments are conducted to validate the effectiveness of the
+proposed method. Both quantitative and qualitative experimental results
+demonstrate that our model generates higher visual quality and more physically
+reliable predictions compared to the state-of-the-art methods. + +
+
+ comment: 13 pages, 10 figures +
+
+
+
+
+ + ☆ Benchmarking Adversarial Robustness of Image Shadow Removal with + Shadow-adaptive Attacks ICASSP 2024 + + +
+ Shadow removal is a task aimed at erasing regional shadows present in images +and reinstating visually pleasing natural scenes with consistent illumination. +While recent deep learning techniques have demonstrated impressive performance +in image shadow removal, their robustness against adversarial attacks remains +largely unexplored. Furthermore, many existing attack frameworks typically +allocate a uniform budget for perturbations across the entire input image, +which may not be suitable for attacking shadow images. This is primarily due to +the unique characteristic of spatially varying illumination within shadow +images. In this paper, we propose a novel approach, called shadow-adaptive +adversarial attack. Different from standard adversarial attacks, our attack +budget is adjusted based on the pixel intensity in different regions of shadow +images. Consequently, the optimized adversarial noise in the shadowed regions +becomes visually less perceptible while permitting a greater tolerance for +perturbations in non-shadow regions. The proposed shadow-adaptive attacks +naturally align with the varying illumination distribution in shadow images, +resulting in perturbations that are less conspicuous. Building on this, we +conduct a comprehensive empirical evaluation of existing shadow removal +methods, subjecting them to various levels of attack on publicly available +datasets. + +
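A minimal sketch of how a shadow-adaptive budget could look, assuming the per-pixel epsilon is simply interpolated by luminance so that darker (shadowed) pixels receive a smaller, less perceptible perturbation; this is an illustrative FGSM-style step under that assumption, not the authors' exact attack.

```python
# Illustrative intensity-scaled adversarial budget: brighter (non-shadow) pixels
# tolerate larger perturbations, darker (shadowed) pixels receive smaller ones.
import torch

def shadow_adaptive_fgsm(model, loss_fn, image, target, eps_min=1/255, eps_max=8/255):
    # Per-pixel budget interpolated between eps_min and eps_max by luminance.
    luminance = image.mean(dim=1, keepdim=True)          # (B, 1, H, W), values in [0, 1]
    eps_map = eps_min + (eps_max - eps_min) * luminance  # brighter => larger budget

    image = image.clone().requires_grad_(True)
    loss = loss_fn(model(image), target)
    grad = torch.autograd.grad(loss, image)[0]

    adv = image + eps_map * grad.sign()                  # spatially varying step
    return adv.clamp(0.0, 1.0).detach()
```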
+
+ comment: Accepted to ICASSP 2024 +
+
+
+
+
+ + ☆ Revisiting Adversarial Training under Long-Tailed Distributions CVPR 2024 + + +
+ Deep neural networks are vulnerable to adversarial attacks, often leading to +erroneous outputs. Adversarial training has been recognized as one of the most +effective methods to counter such attacks. However, existing adversarial +training techniques have predominantly been tested on balanced datasets, +whereas real-world data often exhibit a long-tailed distribution, casting doubt +on the efficacy of these methods in practical scenarios. + In this paper, we delve into adversarial training under long-tailed +distributions. Through an analysis of the previous work "RoBal", we discover +that utilizing Balanced Softmax Loss alone can achieve performance comparable +to the complete RoBal approach while significantly reducing training overheads. +Additionally, we reveal that, similar to uniform distributions, adversarial +training under long-tailed distributions also suffers from robust overfitting. +To address this, we explore data augmentation as a solution and unexpectedly +discover that, unlike results obtained with balanced data, data augmentation +not only effectively alleviates robust overfitting but also significantly +improves robustness. We further investigate the reasons behind the improvement +of robustness through data augmentation and identify that it is attributable to +the increased diversity of examples. Extensive experiments further corroborate +that data augmentation alone can significantly improve robustness. Finally, +building on these findings, we demonstrate that compared to RoBal, the +combination of BSL and data augmentation leads to a +6.66% improvement in model +robustness under AutoAttack on CIFAR-10-LT. Our code is available at +https://github.com/NISPLab/AT-BSL . + +
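Balanced Softmax Loss, the component the analysis singles out, is commonly written as ordinary cross-entropy applied to logits shifted by the log class prior; the short sketch below shows that standard formulation as a generic reference, not this paper's code.

```python
# Standard Balanced Softmax Loss for long-tailed classification: fold the
# per-class sample counts into the logits as a log-prior before cross-entropy.
import torch
import torch.nn.functional as F

def balanced_softmax_loss(logits, targets, class_counts):
    # class_counts: (C,) tensor with the number of training samples per class.
    log_prior = torch.log(class_counts.float() / class_counts.sum())
    return F.cross_entropy(logits + log_prior, targets)
```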
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ Codebook Transfer with Part-of-Speech for Vector-Quantized Image + Modeling CVPR 2024 + + +
+ Vector-Quantized Image Modeling (VQIM) is a fundamental research problem in
+image synthesis, which aims to represent an image with a discrete token
+sequence. Existing studies effectively address this problem by learning a
+discrete codebook from scratch and in a code-independent manner to quantize
+continuous representations into discrete tokens. However, learning a codebook
+from scratch and in a code-independent manner is highly challenging, which may
+be a key cause of codebook collapse: because the relationships between codes
+and good codebook priors are ignored, some code vectors are rarely optimized
+and eventually die off. In this paper, inspired by pretrained language models,
+we find that these language models have actually pretrained a superior codebook
+on large text corpora, but such information is rarely exploited in VQIM. To
+this end, we propose a novel codebook transfer framework with part-of-speech,
+called VQCT, which aims to transfer a well-trained codebook from pretrained
+language models to VQIM for robust codebook learning. Specifically, we first
+introduce a pretrained codebook from language models and part-of-speech
+knowledge as priors. Then, we construct a vision-related codebook with these
+priors for achieving codebook transfer. Finally, a novel codebook transfer
+network is designed to exploit abundant semantic relationships between codes
+contained in pretrained codebooks for robust VQIM codebook learning.
+Experimental results on four datasets show that our VQCT method achieves
+superior VQIM performance over previous state-of-the-art methods. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ☆ Boundary Matters: A Bi-Level Active Finetuning Framework + + +
+ The pretraining-finetuning paradigm has gained widespread adoption in vision +tasks and other fields, yet it faces the significant challenge of high sample +annotation costs. To mitigate this, the concept of active finetuning has +emerged, aiming to select the most appropriate samples for model finetuning +within a limited budget. Traditional active learning methods often struggle in +this setting due to their inherent bias in batch selection. Furthermore, the +recent active finetuning approach has primarily concentrated on aligning the +distribution of selected subsets with the overall data pool, focusing solely on +diversity. In this paper, we propose a Bi-Level Active Finetuning framework to +select the samples for annotation in one shot, which includes two stages: core +sample selection for diversity, and boundary sample selection for uncertainty. +The process begins with the identification of pseudo-class centers, followed by +an innovative denoising method and an iterative strategy for boundary sample +selection in the high-dimensional feature space, all without relying on +ground-truth labels. Our comprehensive experiments provide both qualitative and +quantitative evidence of our method's efficacy, outperforming all the existing +baselines. + +
+
+
+
+
+ + ☆ What Makes Good Collaborative Views? Contrastive Mutual Information + Maximization for Multi-Agent Perception + + +
+ Multi-agent perception (MAP) allows autonomous systems to understand complex +environments by interpreting data from multiple sources. This paper +investigates intermediate collaboration for MAP with a specific focus on +exploring "good" properties of collaborative view (i.e., post-collaboration +feature) and its underlying relationship to individual views (i.e., +pre-collaboration features), which were treated as an opaque procedure by most +existing works. We propose a novel framework named CMiMC (Contrastive Mutual +Information Maximization for Collaborative Perception) for intermediate +collaboration. The core philosophy of CMiMC is to preserve discriminative +information of individual views in the collaborative view by maximizing mutual +information between pre- and post-collaboration features while enhancing the +efficacy of collaborative views by minimizing the loss function of downstream +tasks. In particular, we define multi-view mutual information (MVMI) for +intermediate collaboration that evaluates correlations between collaborative +views and individual views on both global and local scales. We establish +CMiMNet based on multi-view contrastive learning to realize estimation and +maximization of MVMI, which assists the training of a collaboration encoder for +voxel-level feature fusion. We evaluate CMiMC on V2X-Sim 1.0, and it improves +the SOTA average precision by 3.08% and 4.44% at 0.5 and 0.7 IoU +(Intersection-over-Union) thresholds, respectively. In addition, CMiMC can +reduce communication volume to 1/32 while achieving performance comparable to +SOTA. Code and Appendix are released at https://github.com/77SWF/CMiMC. + +
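Mutual-information maximization of this kind is typically realized with an InfoNCE-style contrastive bound between pre- and post-collaboration features; the sketch below shows only that generic estimator, not the paper's MVMI definition or its global/local variants.

```python
# Generic InfoNCE lower bound on mutual information between paired features:
# row i of pre_feats and row i of post_feats are treated as a positive pair.
import torch
import torch.nn.functional as F

def infonce(pre_feats, post_feats, temperature=0.1):
    pre = F.normalize(pre_feats, dim=1)            # (N, D)
    post = F.normalize(post_feats, dim=1)          # (N, D)
    logits = pre @ post.t() / temperature          # (N, N) similarity matrix
    labels = torch.arange(pre.size(0), device=pre.device)
    return F.cross_entropy(logits, labels)         # minimizing this tightens the MI bound
```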
+
+
+
+
+ + ☆ Hybrid Convolutional and Attention Network for Hyperspectral Image + Denoising + + +
+ Hyperspectral image (HSI) denoising is critical for the effective analysis +and interpretation of hyperspectral data. However, simultaneously modeling +global and local features is rarely explored to enhance HSI denoising. In this +letter, we propose a hybrid convolution and attention network (HCANet), which +leverages both the strengths of convolution neural networks (CNNs) and +Transformers. To enhance the modeling of both global and local features, we +have devised a convolution and attention fusion module aimed at capturing +long-range dependencies and neighborhood spectral correlations. Furthermore, to +improve multi-scale information aggregation, we design a multi-scale +feed-forward network to enhance denoising performance by extracting features at +different scales. Experimental results on mainstream HSI datasets demonstrate +the rationality and effectiveness of the proposed HCANet. The proposed model is +effective in removing various types of complex noise. Our codes are available +at \url{https://github.com/summitgao/HCANet}. + +
+
+ comment: IEEE GRSL 2024 +
+
+
+
+
+ + ☆ Contrastive Pre-Training with Multi-View Fusion for No-Reference Point + Cloud Quality Assessment + + +
+ No-reference point cloud quality assessment (NR-PCQA) aims to automatically +evaluate the perceptual quality of distorted point clouds without available +reference, which have achieved tremendous improvements due to the utilization +of deep neural networks. However, learning-based NR-PCQA methods suffer from +the scarcity of labeled data and usually perform suboptimally in terms of +generalization. To solve the problem, we propose a novel contrastive +pre-training framework tailored for PCQA (CoPA), which enables the pre-trained +model to learn quality-aware representations from unlabeled data. To obtain +anchors in the representation space, we project point clouds with different +distortions into images and randomly mix their local patches to form mixed +images with multiple distortions. Utilizing the generated anchors, we constrain +the pre-training process via a quality-aware contrastive loss following the +philosophy that perceptual quality is closely related to both content and +distortion. Furthermore, in the model fine-tuning stage, we propose a +semantic-guided multi-view fusion module to effectively integrate the features +of projected images from multiple perspectives. Extensive experiments show that +our method outperforms the state-of-the-art PCQA methods on popular benchmarks. +Further investigations demonstrate that CoPA can also benefit existing +learning-based PCQA models. + +
+
+
+
+
+ + ☆ Progressive Divide-and-Conquer via Subsampling Decomposition for + Accelerated MRI CVPR 2024 + + +
+ Deep unfolding networks (DUN) have emerged as a popular iterative framework +for accelerated magnetic resonance imaging (MRI) reconstruction. However, +conventional DUN aims to reconstruct all the missing information within the +entire null space in each iteration. Thus it could be challenging when dealing +with highly ill-posed degradation, usually leading to unsatisfactory +reconstruction. In this work, we propose a Progressive Divide-And-Conquer +(PDAC) strategy, aiming to break down the subsampling process in the actual +severe degradation and thus perform reconstruction sequentially. Starting from +decomposing the original maximum-a-posteriori problem of accelerated MRI, we +present a rigorous derivation of the proposed PDAC framework, which could be +further unfolded into an end-to-end trainable network. Specifically, each +iterative stage in PDAC focuses on recovering a distinct moderate degradation +according to the decomposition. Furthermore, as part of the PDAC iteration, +such decomposition is adaptively learned as an auxiliary task through a +degradation predictor which provides an estimation of the decomposed sampling +mask. Following this prediction, the sampling mask is further integrated via a +severity conditioning module to ensure awareness of the degradation severity at +each stage. Extensive experiments demonstrate that our proposed method achieves +superior performance on the publicly available fastMRI and Stanford2D FSE +datasets in both multi-coil and single-coil settings. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ☆ PAME: Self-Supervised Masked Autoencoder for No-Reference Point Cloud + Quality Assessment + + +
+ No-reference point cloud quality assessment (NR-PCQA) aims to automatically +predict the perceptual quality of point clouds without reference, which has +achieved remarkable performance due to the utilization of deep learning-based +models. However, these data-driven models suffer from the scarcity of labeled +data and perform unsatisfactorily in cross-dataset evaluations. To address this +problem, we propose a self-supervised pre-training framework using masked +autoencoders (PAME) to help the model learn useful representations without +labels. Specifically, after projecting point clouds into images, our PAME +employs dual-branch autoencoders, reconstructing masked patches from distorted +images into the original patches within reference and distorted images. In this +manner, the two branches can separately learn content-aware features and +distortion-aware features from the projected images. Furthermore, in the model +fine-tuning stage, the learned content-aware features serve as a guide to fuse +the point cloud quality features extracted from different perspectives. +Extensive experiments show that our method outperforms the state-of-the-art +NR-PCQA methods on popular benchmarks in terms of prediction accuracy and +generalizability. + +
+
+
+
+
+ + ☆ RID-TWIN: An end-to-end pipeline for automatic face de-identification in + videos ICIP 2024 + + +
+ Face de-identification in videos is a challenging task in the domain of +computer vision, primarily used in privacy-preserving applications. Despite the +considerable progress achieved through generative vision models, there remain +multiple challenges in the latest approaches. They lack a comprehensive +discussion and evaluation of aspects such as realism, temporal coherence, and +preservation of non-identifiable features. In our work, we propose RID-Twin: a +novel pipeline that leverages the state-of-the-art generative models, and +decouples identity from motion to perform automatic face de-identification in +videos. We investigate the task from a holistic point of view and discuss how +our approach addresses the pertinent existing challenges in this domain. We +evaluate the performance of our methodology on the widely employed VoxCeleb2 +dataset, and also a custom dataset designed to accommodate the limitations of +certain behavioral variations absent in the VoxCeleb2 dataset. We discuss the +implications and advantages of our work and suggest directions for future +research. + +
+
+ comment: This work has been submitted to IEEE ICIP 2024 +
+
+
+
+
+ + ☆ Group-Mix SAM: Lightweight Solution for Industrial Assembly Line + Applications + + +
+ Since the advent of the Segment Anything Model (SAM) approximately one year
+ago, it has engendered significant academic interest and has spawned a large
+number of investigations and publications from various perspectives. However,
+the deployment of SAM in practical assembly line scenarios has yet to
+materialize due to its large image encoder, which weighs in at an imposing
+632M. In this study, we have replaced the heavyweight image encoder with a
+lightweight one, thereby enabling the deployment of SAM in practical assembly
+line scenarios. Specifically, we have employed decoupled distillation to train
+the encoder of MobileSAM in a resource-limited setting. The entire knowledge
+distillation experiment can be completed in a single day on a single RTX 4090.
+The resulting lightweight SAM, referred to as Group-Mix SAM, has 37.63% (2.16M)
+fewer parameters and 42.5% (15614.7M) fewer floating-point operations compared
+to MobileSAM. However, on our constructed industrial dataset, MALSD, its mIoU
+was only marginally lower than that of MobileSAM, at 0.615. Finally, we
+conducted a comprehensive comparative experiment to demonstrate the superiority
+of Group-Mix SAM in the industrial domain. With its exceptional performance,
+our Group-Mix SAM is more suitable for practical assembly line applications. + +
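Decoupled distillation, in the MobileSAM sense referenced here, trains the lightweight image encoder to regress the frozen heavyweight encoder's embeddings with the mask decoder left out of the loop; the following is a schematic PyTorch sketch under that assumption, not the authors' training code.

```python
# Schematic decoupled distillation step: the student encoder regresses the
# frozen teacher's image embeddings directly, independent of the mask decoder.
import torch
import torch.nn.functional as F

def distill_step(student_encoder, teacher_encoder, images, optimizer):
    with torch.no_grad():
        target = teacher_encoder(images)        # frozen heavyweight SAM embeddings
    pred = student_encoder(images)              # lightweight encoder output
    loss = F.mse_loss(pred, target)             # embedding-level regression loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```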
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ T4P: Test-Time Training of Trajectory Prediction via Masked Autoencoder + and Actor-specific Token Memory CVPR 2024 + + +
+ Trajectory prediction is a challenging problem that requires considering +interactions among multiple actors and the surrounding environment. While +data-driven approaches have been used to address this complex problem, they +suffer from unreliable predictions under distribution shifts during test time. +Accordingly, several online learning methods have been proposed using +regression loss from the ground truth of observed data leveraging the +auto-labeling nature of trajectory prediction task. We mainly tackle the +following two issues. First, previous works underfit and overfit as they only +optimize the last layer of the motion decoder. To this end, we employ the +masked autoencoder (MAE) for representation learning to encourage complex +interaction modeling in shifted test distribution for updating deeper layers. +Second, utilizing the sequential nature of driving data, we propose an +actor-specific token memory that enables the test-time learning of actor-wise +motion characteristics. Our proposed method has been validated across various +challenging cross-dataset distribution shift scenarios including nuScenes, +Lyft, Waymo, and Interaction. Our method surpasses the performance of existing +state-of-the-art online learning methods in terms of both prediction accuracy +and computational efficiency. The code is available at +https://github.com/daeheepark/T4P. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ☆ Texture-GS: Disentangling the Geometry and Texture for 3D Gaussian + Splatting Editing + + +
+ 3D Gaussian splatting, emerging as a groundbreaking approach, has drawn +increasing attention for its capabilities of high-fidelity reconstruction and +real-time rendering. However, it couples the appearance and geometry of the +scene within the Gaussian attributes, which hinders the flexibility of editing +operations, such as texture swapping. To address this issue, we propose a novel +approach, namely Texture-GS, to disentangle the appearance from the geometry by +representing it as a 2D texture mapped onto the 3D surface, thereby +facilitating appearance editing. Technically, the disentanglement is achieved +by our proposed texture mapping module, which consists of a UV mapping MLP to +learn the UV coordinates for the 3D Gaussian centers, a local Taylor expansion +of the MLP to efficiently approximate the UV coordinates for the ray-Gaussian +intersections, and a learnable texture to capture the fine-grained appearance. +Extensive experiments on the DTU dataset demonstrate that our method not only +facilitates high-fidelity appearance editing but also achieves real-time +rendering on consumer-level devices, e.g. a single RTX 2080 Ti GPU. + +
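The "local Taylor expansion of the MLP" mentioned above amounts to a first-order approximation of the learned UV mapping around each Gaussian center; in standard notation (not taken from the paper):

```latex
% First-order approximation of the UV-mapping MLP around a Gaussian center mu;
% J_uv(mu) is the Jacobian of uv(.) at mu, and x is a ray-Gaussian intersection point.
uv(x) \approx uv(\mu) + J_{uv}(\mu)\,(x - \mu).
```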
+
+
+
+
+ + ☆ TextBlockV2: Towards Precise-Detection-Free Scene Text Spotting with + Pre-trained Language Model + + +
+ Existing scene text spotters are designed to locate and transcribe texts from +images. However, it is challenging for a spotter to achieve precise detection +and recognition of scene texts simultaneously. Inspired by the glimpse-focus +spotting pipeline of human beings and impressive performances of Pre-trained +Language Models (PLMs) on visual tasks, we ask: 1) "Can machines spot texts +without precise detection just like human beings?", and if yes, 2) "Is text +block another alternative for scene text spotting other than word or +character?" To this end, our proposed scene text spotter leverages advanced +PLMs to enhance performance without fine-grained detection. Specifically, we +first use a simple detector for block-level text detection to obtain rough +positional information. Then, we finetune a PLM using a large-scale OCR dataset +to achieve accurate recognition. Benefiting from the comprehensive language +knowledge gained during the pre-training phase, the PLM-based recognition +module effectively handles complex scenarios, including multi-line, reversed, +occluded, and incomplete-detection texts. Taking advantage of the fine-tuned +language model on scene recognition benchmarks and the paradigm of text block +detection, extensive experiments demonstrate the superior performance of our +scene text spotter across multiple public benchmarks. Additionally, we attempt +to spot texts directly from an entire scene image to demonstrate the potential +of PLMs, even Large Language Models (LLMs). + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ Towards Adversarially Robust Dataset Distillation by Curvature + Regularization + + +
+ Dataset distillation (DD) allows datasets to be distilled to fractions of +their original size while preserving the rich distributional information so +that models trained on the distilled datasets can achieve a comparable accuracy +while saving significant computational loads. Recent research in this area has +been focusing on improving the accuracy of models trained on distilled +datasets. In this paper, we aim to explore a new perspective of DD. We study +how to embed adversarial robustness in distilled datasets, so that models +trained on these datasets maintain the high accuracy and meanwhile acquire +better adversarial robustness. We propose a new method that achieves this goal +by incorporating curvature regularization into the distillation process with +much less computational overhead than standard adversarial training. Extensive +empirical experiments suggest that our method not only outperforms standard +adversarial training on both accuracy and robustness with less computation +overhead but is also capable of generating robust distilled datasets that can +withstand various adversarial attacks. + +
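Curvature regularization in the robustness literature is often approximated by penalizing how much the input gradient changes over a small step, which is also why it is cheaper than full adversarial training; the sketch below shows one such finite-difference form (a CURE-style approximation along a random direction), which may differ from the exact regularizer used in this paper.

```python
# Finite-difference curvature penalty: compare input gradients at x and at a
# small perturbation of x; large changes indicate high local curvature.
import torch

def curvature_penalty(model, loss_fn, x, y, h=1e-2):
    x = x.detach().clone().requires_grad_(True)
    g1 = torch.autograd.grad(loss_fn(model(x), y), x, create_graph=True)[0]

    d = torch.randn_like(x)                                   # random probe direction
    d = d / d.flatten(1).norm(dim=1).view(-1, *[1] * (x.dim() - 1))

    x2 = (x.detach() + h * d).requires_grad_(True)
    g2 = torch.autograd.grad(loss_fn(model(x2), y), x2, create_graph=True)[0]

    # || grad(x + h d) - grad(x) ||^2 approximates directional curvature.
    return ((g2 - g1).flatten(1).norm(dim=1) ** 2).mean()
```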
+
+ comment: 17 pages, 3 figures +
+
+
+
+
+ + ☆ SphereDiffusion: Spherical Geometry-Aware Distortion Resilient Diffusion + Model AAAI2024 + + +
+ Controllable spherical panoramic image generation holds substantial
+application potential across a variety of domains. However, it remains a
+challenging task due to the inherent spherical distortion and geometry
+characteristics, resulting in low-quality content generation. In this paper, we
+introduce a novel framework, SphereDiffusion, to address these unique
+challenges and better generate high-quality and precisely controllable
+spherical panoramic images. For the spherical distortion characteristic, we
+embed the semantics of the distorted object with text encoding, then explicitly
+construct the relationship with text-object correspondence to better use the
+pre-trained knowledge of the planar images. Meanwhile, we employ a deformable
+technique to mitigate the semantic deviation in latent space caused by
+spherical distortion. For the spherical geometry characteristic, in virtue of
+spherical rotation invariance, we improve the data diversity and optimization
+objectives in the training process, enabling the model to better learn the
+spherical geometry characteristic. Furthermore, we enhance the denoising
+process of the diffusion model, enabling it to effectively use the learned
+geometric characteristic to ensure the boundary continuity of the generated
+images. With these specific techniques, experiments on the Structured3D dataset
+show that SphereDiffusion significantly improves the quality of controllable
+spherical image generation and relatively reduces FID by around 35% on average. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ☆ Rethinking Low-quality Optical Flow in Unsupervised Surgical Instrument + Segmentation + + +
+ Video-based surgical instrument segmentation plays an important role in +robot-assisted surgeries. Unlike supervised settings, unsupervised segmentation +relies heavily on motion cues, which are challenging to discern due to the +typically lower quality of optical flow in surgical footage compared to natural +scenes. This presents a considerable burden for the advancement of unsupervised +segmentation techniques. In our work, we address the challenge of enhancing +model performance despite the inherent limitations of low-quality optical flow. +Our methodology employs a three-pronged approach: extracting boundaries +directly from the optical flow, selectively discarding frames with inferior +flow quality, and employing a fine-tuning process with variable frame rates. We +thoroughly evaluate our strategy on the EndoVis2017 VOS dataset and Endovis2017 +Challenge dataset, where our model demonstrates promising results, achieving a +mean Intersection-over-Union (mIoU) of 0.75 and 0.72, respectively. Our +findings suggest that our approach can greatly decrease the need for manual +annotations in clinical environments and may facilitate the annotation process +for new datasets. The code is available at +https://github.com/wpr1018001/Rethinking-Low-quality-Optical-Flow.git + +
+
+
+
+
+ + ☆ Knowledge Condensation and Reasoning for Knowledge-based VQA + + +
+ Knowledge-based visual question answering (KB-VQA) is a challenging task, +which requires the model to leverage external knowledge for comprehending and +answering questions grounded in visual content. Recent studies retrieve the +knowledge passages from external knowledge bases and then use them to answer +questions. However, these retrieved knowledge passages often contain irrelevant +or noisy information, which limits the performance of the model. To address the +challenge, we propose two synergistic models: Knowledge Condensation model and +Knowledge Reasoning model. We condense the retrieved knowledge passages from +two perspectives. First, we leverage the multimodal perception and reasoning +ability of the visual-language models to distill concise knowledge concepts +from retrieved lengthy passages, ensuring relevance to both the visual content +and the question. Second, we leverage the text comprehension ability of the +large language models to summarize and condense the passages into the knowledge +essence which helps answer the question. These two types of condensed knowledge +are then seamlessly integrated into our Knowledge Reasoning model, which +judiciously navigates through the amalgamated information to arrive at the +conclusive answer. Extensive experiments validate the superiority of the +proposed method. Compared to previous methods, our method achieves +state-of-the-art performance on knowledge-based VQA datasets (65.1% on OK-VQA +and 60.1% on A-OKVQA) without resorting to the knowledge produced by GPT-3 +(175B). + +
+
+
+
+
+ + ☆ SparseFusion: Efficient Sparse Multi-Modal Fusion Framework for + Long-Range 3D Perception + + +
+ Multi-modal 3D object detection has exhibited significant progress in recent +years. However, most existing methods can hardly scale to long-range scenarios +due to their reliance on dense 3D features, which substantially escalate +computational demands and memory usage. In this paper, we introduce +SparseFusion, a novel multi-modal fusion framework fully built upon sparse 3D +features to facilitate efficient long-range perception. The core of our method +is the Sparse View Transformer module, which selectively lifts regions of +interest in 2D image space into the unified 3D space. The proposed module +introduces sparsity from both semantic and geometric aspects which only fill +grids that foreground objects potentially reside in. Comprehensive experiments +have verified the efficiency and effectiveness of our framework in long-range +3D perception. Remarkably, on the long-range Argoverse2 dataset, SparseFusion +reduces memory footprint and accelerates the inference by about two times +compared to dense detectors. It also achieves state-of-the-art performance with +mAP of 41.2% and CDS of 32.1%. The versatility of SparseFusion is also +validated in the temporal object detection task and 3D lane detection task. +Codes will be released upon acceptance. + +
+
+
+
+
+ + ☆ Multi-criteria Token Fusion with One-step-ahead Attention for Efficient + Vision Transformers CVPR + + +
+ Vision Transformer (ViT) has emerged as a prominent backbone for computer +vision. For more efficient ViTs, recent works lessen the quadratic cost of the +self-attention layer by pruning or fusing the redundant tokens. However, these +works faced the speed-accuracy trade-off caused by the loss of information. +Here, we argue that token fusion needs to consider diverse relations between +tokens to minimize information loss. In this paper, we propose a Multi-criteria +Token Fusion (MCTF), that gradually fuses the tokens based on multi-criteria +(e.g., similarity, informativeness, and size of fused tokens). Further, we +utilize the one-step-ahead attention, which is the improved approach to capture +the informativeness of the tokens. By training the model equipped with MCTF +using a token reduction consistency, we achieve the best speed-accuracy +trade-off in the image classification (ImageNet1K). Experimental results prove +that MCTF consistently surpasses the previous reduction methods with and +without training. Specifically, DeiT-T and DeiT-S with MCTF reduce FLOPs by +about 44% while improving the performance (+0.5%, and +0.3%) over the base +model, respectively. We also demonstrate the applicability of MCTF in various +Vision Transformers (e.g., T2T-ViT, LV-ViT), achieving at least 31% speedup +without performance degradation. Code is available at +https://github.com/mlvlab/MCTF. + +
+
+ comment: Conference on Computer Vision and Pattern Recognition (CVPR), 2024 +
+
+
+
+
+ + ☆ Linear optimal transport subspaces for point set classification + + +
+ Learning from point sets is an essential component in many computer vision
+and machine learning applications. The native, unordered, and
+permutation-invariant structure of set data is challenging to model,
+particularly for point set classification under spatial deformations. Here we
+propose a framework for classifying point sets experiencing certain types of
+spatial deformations, with a particular emphasis on datasets featuring affine
+deformations. Our approach employs the Linear Optimal Transport (LOT) transform
+to obtain a linear embedding of set-structured data. Utilizing the mathematical
+properties of the LOT transform, we demonstrate its capacity to accommodate
+variations in point sets by constructing a convex data space, effectively
+simplifying point set classification problems. Our method, which employs a
+nearest-subspace algorithm in the LOT space, is label efficient, non-iterative,
+and requires no hyper-parameter tuning. It achieves competitive accuracies
+compared to state-of-the-art methods across various point set classification
+tasks. Furthermore, our approach exhibits robustness in out-of-distribution
+scenarios where training and test distributions vary in terms of deformation
+magnitudes. + +
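The nearest-subspace step in LOT space can be summarized in a few lines of NumPy: fit a low-rank affine subspace per class from training embeddings, then assign a test embedding to the class with the smallest projection residual. The LOT embedding itself is assumed precomputed, and the rank is an illustrative hyper-parameter, not a value taken from the paper.

```python
# Nearest-subspace classification on precomputed LOT embeddings (generic sketch).
import numpy as np

def fit_class_subspaces(embeddings, labels, rank=8):
    bases = {}
    for c in np.unique(labels):
        X = embeddings[labels == c]                   # (n_c, D) LOT embeddings of class c
        _, _, Vt = np.linalg.svd(X - X.mean(axis=0), full_matrices=False)
        bases[c] = (X.mean(axis=0), Vt[:rank])        # class mean + top singular directions
    return bases

def predict(bases, z):
    best_class, best_residual = None, np.inf
    for c, (mean, V) in bases.items():
        r = z - mean
        residual = np.linalg.norm(r - V.T @ (V @ r))  # distance to the affine subspace
        if residual < best_residual:
            best_class, best_residual = c, residual
    return best_class
```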
+
+
+
+
+ + ☆ Real-World Computational Aberration Correction via Quantized + Domain-Mixing Representation + + +
+ Relying on paired synthetic data, existing learning-based Computational
+Aberration Correction (CAC) methods are confronted with the intricate and
+multifaceted synthetic-to-real domain gap, which leads to suboptimal
+performance in real-world applications. In this paper, in contrast to improving
+the simulation pipeline, we deliver a novel insight into real-world CAC from
+the perspective of Unsupervised Domain Adaptation (UDA). By incorporating
+readily accessible unpaired real-world data into training, we formalize the
+Domain Adaptive CAC (DACAC) task, and then introduce a comprehensive Real-world
+aberrated images (Realab) dataset to benchmark it. The task presents a
+formidable challenge due to the intricacy of understanding the target
+aberration domain. To this end, we propose a novel Quantized Domain-Mixing
+Representation (QDMR) framework as a potent solution to the issue. QDMR adapts
+the CAC model to the target domain from three key aspects: (1) reconstructing
+aberrated images of both domains by a VQGAN to learn a Domain-Mixing Codebook
+(DMC) which characterizes the degradation-aware priors; (2) modulating the deep
+features in the CAC model with the DMC to transfer the target domain knowledge;
+and (3) leveraging the trained VQGAN to generate pseudo target aberrated images
+from the source ones for convincing target domain supervision. Extensive
+experiments on both synthetic and real-world benchmarks reveal that models with
+QDMR consistently surpass the competitive methods in mitigating the
+synthetic-to-real gap, producing visually pleasant real-world CAC results with
+fewer artifacts. Codes and datasets will be made publicly available. + +
+
+ comment: Codes and datasets will be made publicly available at + https://github.com/zju-jiangqi/QDMR +
+
+
+
+
+ + ☆ Cardiac Magnetic Resonance 2D+T Short- and Long-axis Segmentation via + Spatio-temporal SAM Adaptation + + +
+ Accurate 2D+T myocardium segmentation in cine cardiac magnetic resonance +(CMR) scans is essential to analyze LV motion throughout the cardiac cycle +comprehensively. The Segment Anything Model (SAM), known for its accurate +segmentation and zero-shot generalization, has not yet been tailored for CMR +2D+T segmentation. We therefore introduce CMR2D+T-SAM, a novel approach to +adapt SAM for CMR 2D+T segmentation using spatio-temporal adaption. This +approach also incorporates a U-Net framework for multi-scale feature +extraction, as well as text prompts for accurate segmentation on both +short-axis (SAX) and long-axis (LAX) views using a single model. CMR2D+T-SAM +outperforms existing deep learning methods on the STACOM2011 dataset, achieving +a myocardium Dice score of 0.885 and a Hausdorff distance (HD) of 2.900 pixels. +It also demonstrates superior zero-shot generalization on the ACDC dataset with +a Dice score of 0.840 and a HD of 4.076 pixels. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ ST-LDM: A Universal Framework for Text-Grounded Object Generation in + Real Images + + +
+ We present a novel image editing scenario termed Text-grounded Object +Generation (TOG), defined as generating a new object in the real image +spatially conditioned by textual descriptions. Existing diffusion models +exhibit limitations of spatial perception in complex real-world scenes, relying +on additional modalities to enforce constraints, and TOG imposes heightened +challenges on scene comprehension under the weak supervision of linguistic +information. We propose a universal framework ST-LDM based on Swin-Transformer, +which can be integrated into any latent diffusion model with training-free +backward guidance. ST-LDM encompasses a global-perceptual autoencoder with +adaptable compression scales and hierarchical visual features, parallel with +deformable multimodal transformer to generate region-wise guidance for the +subsequent denoising process. We transcend the limitation of traditional +attention mechanisms that only focus on existing visual features by introducing +deformable feature alignment to hierarchically refine spatial positioning fused +with multi-scale visual and linguistic information. Extensive Experiments +demonstrate that our model enhances the localization of attention mechanisms +while preserving the generative capabilities inherent to diffusion models. + +
+
+
+
+
+ + ☆ Visual Foundation Models Boost Cross-Modal Unsupervised Domain + Adaptation for 3D Semantic Segmentation + + +
+ Unsupervised domain adaptation (UDA) is vital for alleviating the workload of +labeling 3D point cloud data and mitigating the absence of labels when facing a +newly defined domain. Various methods of utilizing images to enhance the +performance of cross-domain 3D segmentation have recently emerged. However, the +pseudo labels, which are generated from models trained on the source domain and +provide additional supervised signals for the unseen domain, are inadequate +when utilized for 3D segmentation due to their inherent noisiness and +consequently restrict the accuracy of neural networks. With the advent of 2D +visual foundation models (VFMs) and their abundant knowledge prior, we propose +a novel pipeline VFMSeg to further enhance the cross-modal unsupervised domain +adaptation framework by leveraging these models. In this work, we study how to +harness the knowledge priors learned by VFMs to produce more accurate labels +for unlabeled target domains and improve overall performance. We first utilize +a multi-modal VFM, which is pre-trained on large scale image-text pairs, to +provide supervised labels (VFM-PL) for images and point clouds from the target +domain. Then, another VFM trained on fine-grained 2D masks is adopted to guide +the generation of semantically augmented images and point clouds to enhance the +performance of neural networks, which mix the data from source and target +domains like view frustums (FrustumMixing). Finally, we merge class-wise +prediction across modalities to produce more accurate annotations for unlabeled +target domains. Our method is evaluated on various autonomous driving datasets +and the results demonstrate a significant improvement for 3D segmentation task. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ FBPT: A Fully Binary Point Transformer ICRA 2024 + + +
+ This paper presents a novel Fully Binary Point Cloud Transformer (FBPT) model +which has the potential to be widely applied and expanded in the fields of +robotics and mobile devices. By compressing the weights and activations of a +32-bit full-precision network to 1-bit binary values, the proposed binary point +cloud Transformer network significantly reduces the storage footprint and +computational resource requirements of neural network models for point cloud +processing tasks, compared to full-precision point cloud networks. However, +achieving a fully binary point cloud Transformer network, where all parts +except the modules specific to the task are binary, poses challenges and +bottlenecks in quantizing the activations of Q, K, V and self-attention in the +attention module, as they do not adhere to simple probability distributions and +can vary with input data. Furthermore, in our network, the binary attention +module undergoes a degradation of the self-attention module due to the uniform +distribution that occurs after the softmax operation. The primary focus of this +paper is on addressing the performance degradation issue caused by the use of +binary point cloud Transformer modules. We propose a novel binarization +mechanism called dynamic-static hybridization. Specifically, our approach +combines static binarization of the overall network model with fine granularity +dynamic binarization of data-sensitive components. Furthermore, we make use of +a novel hierarchical training scheme to obtain the optimal model and +binarization parameters. These above improvements allow the proposed +binarization method to outperform binarization methods applied to convolution +neural networks when used in point cloud Transformer structures. To demonstrate +the superiority of our algorithm, we conducted experiments on two different +tasks: point cloud classification and place recognition. + +
+
+ comment: Accepted to ICRA 2024. arXiv admin note: substantial text overlap + with arXiv:2303.01166 +
+
+
+
+
+ + ☆ MEDPNet: Achieving High-Precision Adaptive Registration for Complex Die + Castings + + +
+ Due to their complex spatial structure and diverse geometric features, +achieving high-precision and robust point cloud registration for complex Die +Castings has been a significant challenge in the die-casting industry. Existing +point cloud registration methods primarily optimize network models using +well-established high-quality datasets, often neglecting practical application +in real scenarios. To address this gap, this paper proposes a high-precision +adaptive registration method called Multiscale Efficient Deep Closest Point +(MEDPNet) and introduces a die-casting point cloud dataset, DieCastCloud, +specifically designed to tackle the challenges of point cloud registration in +the die-casting industry. The MEDPNet method performs coarse die-casting point +cloud data registration using the Efficient-DCP method, followed by precision +registration using the Multiscale feature fusion dual-channel registration +(MDR) method. We enhance the modeling capability and computational efficiency +of the model by replacing the attention mechanism of the Transformer in DCP +with Efficient Attention and implementing a collaborative scale mechanism +through the combination of serial and parallel blocks. Additionally, we propose +the MDR method, which utilizes multilayer perceptrons (MLP), Normal +Distributions Transform (NDT), and Iterative Closest Point (ICP) to achieve +learnable adaptive fusion, enabling high-precision, scalable, and +noise-resistant global point cloud registration. Our proposed method +demonstrates excellent performance compared to state-of-the-art geometric and +learning-based registration methods when applied to complex die-casting point +cloud data. + +
+
+
+
+
+ + ☆ TRG-Net: An Interpretable and Controllable Rain Generator + + +
+ Exploring and modeling the rain generation mechanism is critical for
+augmenting paired data to ease the training of rainy image processing models.
+To this end, this study proposes a novel deep learning based rain generator,
+which fully takes the physical generation mechanism underlying rain into
+consideration and explicitly encodes the learning of the fundamental rain
+factors (i.e., shape, orientation, length, width and sparsity) into the deep
+network. Its significance lies in that the generator not only elaborately
+designs the essential elements of rain to simulate expected rain, like
+conventional artificial strategies, but also finely adapts to complicated and
+diverse practical rainy images, like deep learning methods. By rationally
+adopting the filter parameterization technique, we achieve, for the first time,
+a deep network that is finely controllable with respect to rain factors and
+able to learn the distribution of these factors purely from data. Our unpaired
+generation experiments demonstrate that the rain generated by the proposed
+generator is not only of higher quality but also more effective for deraining
+and downstream tasks than that of current state-of-the-art rain generation
+methods. In addition, paired data augmentation experiments, covering both
+in-distribution and out-of-distribution (OOD) settings, further validate the
+diversity of samples generated by our model for in-distribution deraining and
+OOD generalization tasks.
+
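To make the "fundamental rain factors" concrete, the toy sketch below synthesizes rain streaks from explicit length, width, orientation and sparsity parameters by convolving a sparse seed map with an oriented line kernel. It illustrates the factor-based view only, not the TRG-Net generator or its filter parameterization; all function names are hypothetical.

```python
# Toy illustration (not TRG-Net itself) of synthesizing rain streaks from
# explicit factors: a line kernel parameterized by length, width and
# orientation is convolved with a sparse noise map and added to the image.
import numpy as np
from scipy.ndimage import rotate
from scipy.signal import convolve2d

def rain_kernel(length=15, width=1, angle=70.0):
    k = np.zeros((length, length), dtype=np.float32)
    c = length // 2
    k[c - width // 2 : c + width // 2 + 1, :] = 1.0   # horizontal bar
    k = rotate(k, angle, reshape=False, order=1)      # set streak orientation
    return k / max(k.sum(), 1e-8)

def add_rain(img, sparsity=0.002, strength=0.8, **kernel_kwargs):
    """img: float image in [0, 1] with shape (H, W, 3)."""
    seeds = (np.random.rand(*img.shape[:2]) < sparsity).astype(np.float32)
    streaks = convolve2d(seeds, rain_kernel(**kernel_kwargs), mode="same")
    return np.clip(img + strength * streaks[..., None], 0.0, 1.0)
```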
+
+
+
+
+ + ☆ Controllable Text-to-3D Generation via Surface-Aligned Gaussian + Splatting + + +
+ While text-to-3D and image-to-3D generation tasks have received considerable +attention, one important but under-explored field between them is controllable +text-to-3D generation, which we mainly focus on in this work. To address this +task, 1) we introduce Multi-view ControlNet (MVControl), a novel neural network +architecture designed to enhance existing pre-trained multi-view diffusion +models by integrating additional input conditions, such as edge, depth, normal, +and scribble maps. Our innovation lies in the introduction of a conditioning +module that controls the base diffusion model using both local and global +embeddings, which are computed from the input condition images and camera +poses. Once trained, MVControl is able to offer 3D diffusion guidance for +optimization-based 3D generation. And, 2) we propose an efficient multi-stage +3D generation pipeline that leverages the benefits of recent large +reconstruction models and score distillation algorithm. Building upon our +MVControl architecture, we employ a unique hybrid diffusion guidance method to +direct the optimization process. In pursuit of efficiency, we adopt 3D +Gaussians as our representation instead of the commonly used implicit +representations. We also pioneer the use of SuGaR, a hybrid representation that +binds Gaussians to mesh triangle faces. This approach alleviates the issue of +poor geometry in 3D Gaussians and enables the direct sculpting of fine-grained +geometry on the mesh. Extensive experiments demonstrate that our method +achieves robust generalization and enables the controllable generation of +high-quality 3D content. + +
+
+ comment: Project page: https://lizhiqi49.github.io/MVControl/ +
+
+
+
+
+ + ☆ EfficientVMamba: Atrous Selective Scan for Light Weight Visual Mamba + + +
+ Prior efforts in light-weight model development have mainly centered on CNN-
+and Transformer-based designs, yet faced persistent challenges: CNNs, adept at
+local feature extraction, compromise resolution, while Transformers offer
+global reach but escalate computational demands to $\mathcal{O}(N^2)$. This
+ongoing trade-off between accuracy and efficiency remains a significant hurdle.
+Recently, state space models (SSMs), such as Mamba, have shown outstanding
+performance and competitiveness in various tasks such as language modeling and
+computer vision, while reducing the time complexity of global information
+extraction to $\mathcal{O}(N)$. Inspired by this, this work explores the
+potential of visual state space models in light-weight model design and
+introduces a novel efficient model variant dubbed EfficientVMamba. Concretely,
+our EfficientVMamba integrates an atrous-based selective scan approach via
+efficient skip sampling, constituting building blocks designed to harness both
+global and local representational features. Additionally, we investigate the
+integration between SSM blocks and convolutions, and introduce an efficient
+visual state space block combined with an additional convolution branch, which
+further elevates model performance. Experimental results show that
+EfficientVMamba scales down the computational complexity while yielding
+competitive results across a variety of vision tasks. For example, our
+EfficientVMamba-S with $1.3$G FLOPs improves upon Vim-Ti with $1.5$G FLOPs by a
+large margin of $5.6\%$ accuracy on ImageNet. Code is available at:
+\url{https://github.com/TerryPei/EfficientVMamba}.
+
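The skip-sampling idea can be pictured as splitting the token sequence into dilated (atrous) groups and scanning each group separately before interleaving the results back. The PyTorch sketch below encodes only that schematic, under my own assumptions; a GRU stands in for the selective SSM, and the class name `AtrousSelectiveScan` is hypothetical.

```python
# Minimal sketch of the skip-sampling idea: tokens are split into dilated
# (atrous) groups, each group is scanned by its own lightweight mixer, and
# the outputs are interleaved back. The SSM itself is abstracted as a GRU.
import torch
import torch.nn as nn

class AtrousSelectiveScan(nn.Module):
    def __init__(self, dim, rate=2):
        super().__init__()
        self.rate = rate
        self.mixers = nn.ModuleList(
            [nn.GRU(dim, dim, batch_first=True) for _ in range(rate)]
        )  # stand-in for a selective SSM

    def forward(self, x):                          # x: (B, N, C)
        out = torch.zeros_like(x)
        for i, mixer in enumerate(self.mixers):
            group = x[:, i::self.rate, :]          # every `rate`-th token
            scanned, _ = mixer(group)
            out[:, i::self.rate, :] = scanned
        return out

y = AtrousSelectiveScan(dim=32, rate=2)(torch.randn(4, 49, 32))
```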
+
+
+
+
+ + ☆ AD3: Implicit Action is the Key for World Models to Distinguish the + Diverse Visual Distractors + + +
+ Model-based methods have significantly contributed to distinguishing +task-irrelevant distractors for visual control. However, prior research has +primarily focused on heterogeneous distractors like noisy background videos, +leaving homogeneous distractors that closely resemble controllable agents +largely unexplored, which poses significant challenges to existing methods. To +tackle this problem, we propose Implicit Action Generator (IAG) to learn the +implicit actions of visual distractors, and present a new algorithm named +implicit Action-informed Diverse visual Distractors Distinguisher (AD3), that +leverages the action inferred by IAG to train separated world models. Implicit +actions effectively capture the behavior of background distractors, aiding in +distinguishing the task-irrelevant components, and the agent can optimize the +policy within the task-relevant state space. Our method achieves superior +performance on various visual control tasks featuring both heterogeneous and +homogeneous distractors. The indispensable role of implicit actions learned by +IAG is also empirically validated. + +
+
+
+
+
+ + ☆ Skeleton-Based Human Action Recognition with Noisy Labels + + +
+ Understanding human actions from body poses is critical for assistive robots +sharing space with humans in order to make informed and safe decisions about +the next interaction. However, precise temporal localization and annotation of +activity sequences is time-consuming and the resulting labels are often noisy. +If not effectively addressed, label noise negatively affects the model's +training, resulting in lower recognition quality. Despite its importance, +addressing label noise for skeleton-based action recognition has been +overlooked so far. In this study, we bridge this gap by implementing a +framework that augments well-established skeleton-based human action +recognition methods with label-denoising strategies from various research areas +to serve as the initial benchmark. Observations reveal that these baselines +yield only marginal performance when dealing with sparse skeleton data. +Consequently, we introduce a novel methodology, NoiseEraSAR, which integrates +global sample selection, co-teaching, and Cross-Modal Mixture-of-Experts +(CM-MOE) strategies, aimed at mitigating the adverse impacts of label noise. +Our proposed approach demonstrates better performance on the established +benchmark, setting new state-of-the-art standards. The source code for this +study will be made accessible at https://github.com/xuyizdby/NoiseEraSAR. + +
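One of the label-denoising ingredients named above, co-teaching, admits a compact sketch: two networks each keep the small-loss (likely clean) samples in a batch and train on the peer's selection. The PyTorch snippet below illustrates that generic recipe under assumed hyperparameters, not the NoiseEraSAR implementation.

```python
# Hedged sketch of the co-teaching ingredient: two networks exchange the
# small-loss (likely clean) samples within each mini-batch.
import torch
import torch.nn.functional as F

def coteach_step(net_a, net_b, opt_a, opt_b, x, y, keep_ratio=0.7):
    loss_a = F.cross_entropy(net_a(x), y, reduction="none")
    loss_b = F.cross_entropy(net_b(x), y, reduction="none")
    k = max(1, int(keep_ratio * len(y)))
    idx_a = torch.topk(loss_a, k, largest=False).indices   # clean picks of A
    idx_b = torch.topk(loss_b, k, largest=False).indices   # clean picks of B

    # Each network is updated on the peer's selection to limit confirmation bias.
    opt_a.zero_grad()
    F.cross_entropy(net_a(x[idx_b]), y[idx_b]).backward()
    opt_a.step()
    opt_b.zero_grad()
    F.cross_entropy(net_b(x[idx_a]), y[idx_a]).backward()
    opt_b.step()
```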
+
+ comment: The source code will be made accessible at + https://github.com/xuyizdby/NoiseEraSAR +
+
+
+
+
+ + ☆ GET: Unlocking the Multi-modal Potential of CLIP for Generalized + Category Discovery + + +
+ Given unlabelled datasets containing both old and new categories, generalized
+category discovery (GCD) aims to accurately discover new classes while
+correctly classifying old classes, leveraging the class concepts learned from
+labeled samples. Current GCD methods only use a single visual modality of
+information, resulting in poor classification of visually similar classes.
+Although certain classes are visually confusable, their text information might
+be distinct, motivating us to introduce text information into the GCD task.
+However, the lack of class names for unlabelled data makes it impractical to
+utilize text information directly. To tackle this challenging problem, in this
+paper, we propose a Text Embedding Synthesizer (TES) to generate pseudo text
+embeddings for unlabelled samples. Specifically, our TES leverages the property
+that CLIP can generate aligned vision-language features, converting visual
+embeddings into tokens of CLIP's text encoder to generate pseudo text
+embeddings. Besides, we employ a dual-branch framework; through the joint
+learning and instance consistency of the different modality branches, visual
+and semantic information mutually enhance each other, promoting the interaction
+and fusion of the visual and text embedding spaces. Our method unlocks the
+multi-modal potential of CLIP and outperforms the baseline methods by a large
+margin on all GCD benchmarks, achieving a new state of the art. The code will
+be released at \url{https://github.com/enguangW/GET}.
+
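A minimal reading of TES is that visual embeddings are projected into a short sequence of pseudo token embeddings and passed through a frozen CLIP-style text tower. The sketch below encodes only that reading; the module name, dimensions, and the dummy encoder are assumptions, and the real TES may form tokens differently.

```python
# Illustrative sketch (names are hypothetical) of the core TES idea: visual
# embeddings are mapped to pseudo token embeddings and pushed through a frozen
# text encoder to obtain pseudo text embeddings for unlabelled images.
import torch
import torch.nn as nn

class TextEmbeddingSynthesizer(nn.Module):
    def __init__(self, vis_dim, tok_dim, n_tokens, text_encoder):
        super().__init__()
        self.to_tokens = nn.Linear(vis_dim, n_tokens * tok_dim)
        self.n_tokens, self.tok_dim = n_tokens, tok_dim
        self.text_encoder = text_encoder          # frozen CLIP-style text tower
        for p in self.text_encoder.parameters():
            p.requires_grad_(False)

    def forward(self, vis_emb):                   # vis_emb: (B, vis_dim)
        tokens = self.to_tokens(vis_emb).view(-1, self.n_tokens, self.tok_dim)
        return self.text_encoder(tokens)          # pseudo text embeddings

# Dummy stand-in for a text encoder, just to make the sketch runnable:
dummy_text_encoder = nn.Sequential(nn.Flatten(1), nn.Linear(4 * 512, 512))
tes = TextEmbeddingSynthesizer(vis_dim=768, tok_dim=512, n_tokens=4,
                               text_encoder=dummy_text_encoder)
pseudo_text = tes(torch.randn(2, 768))            # (2, 512)
```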
+
+
+
+
+ + ☆ Den-SOFT: Dense Space-Oriented Light Field DataseT for 6-DOF Immersive + Experience + + +
+ We have built a custom mobile multi-camera large-space dense light field
+capture system, which provides a series of high-quality and sufficiently dense
+light field images for various scenarios. Our aim is to contribute to the
+development of popular 3D scene reconstruction algorithms such as IBRnet, NeRF,
+and 3D Gaussian splatting. More importantly, the collected dataset, which is
+much denser than existing datasets, may also inspire space-oriented light field
+reconstruction, which is potentially different from object-centric 3D
+reconstruction, for immersive VR/AR experiences. We utilized a total of 40
+GoPro 10 cameras, capturing images at 5K resolution. The number of photos
+captured for each scene is no less than 1000, and the average density (view
+number within a unit sphere) is 134.68. It is also worth noting that our system
+is capable of efficiently capturing large outdoor scenes. Addressing the
+current lack of large-space and dense light field datasets, we made efforts to
+include elements such as sky, reflections, lights and shadows that are of
+interest to researchers in the field of 3D reconstruction during the data
+capture process. Finally, we validated the effectiveness of our provided
+dataset on three popular algorithms and also integrated the reconstructed 3DGS
+results into the Unity engine, demonstrating the potential of utilizing our
+dataset to enhance the realism of virtual reality (VR) and create feasible
+interactive spaces. The dataset is available at our project website.
+
+
+
+
+
+ + ☆ Autoregressive Queries for Adaptive Tracking with Spatio-Temporal Transformers + + +
+ Rich spatio-temporal information is crucial for capturing the complicated
+target appearance variations in visual tracking. However, most top-performing
+tracking algorithms rely on many hand-crafted components for spatio-temporal
+information aggregation. Consequently, the spatio-temporal information is far
+from being fully explored. To alleviate this issue, we propose an adaptive
+tracker with spatio-temporal transformers (named AQATrack), which adopts simple
+autoregressive queries to effectively learn spatio-temporal information without
+many hand-designed components. Firstly, we introduce a set of learnable and
+autoregressive queries to capture the instantaneous target appearance changes
+in a sliding-window fashion. Then, we design a novel attention mechanism for
+the interaction of existing queries to generate a new query for the current
+frame. Finally, based on the initial target template and the learnt
+autoregressive queries, a spatio-temporal information fusion module (STM) is
+designed to aggregate spatio-temporal information and locate the target object.
+Benefiting from the STM, we can effectively combine the static appearance and
+instantaneous changes to guide robust tracking. Extensive experiments show that
+our method significantly improves the tracker's performance on six popular
+tracking benchmarks: LaSOT, LaSOText, TrackingNet, GOT-10k, TNL2K, and UAV123.
+
+
+
+
+
+ + ☆ Medical Unlearnable Examples: Securing Medical Data from Unauthorized Training via Sparsity-Aware Local Masking + + +
+ With the rapid growth of artificial intelligence (AI) in healthcare, there +has been a significant increase in the generation and storage of sensitive +medical data. This abundance of data, in turn, has propelled the advancement of +medical AI technologies. However, concerns about unauthorized data +exploitation, such as training commercial AI models, often deter researchers +from making their invaluable datasets publicly available. In response to the +need to protect this hard-to-collect data while still encouraging medical +institutions to share it, one promising solution is to introduce imperceptible +noise into the data. This method aims to safeguard the data against +unauthorized training by inducing degradation in model generalization. Although +existing methods have shown commendable data protection capabilities in general +domains, they tend to fall short when applied to biomedical data, mainly due to +their failure to account for the sparse nature of medical images. To address +this problem, we propose the Sparsity-Aware Local Masking (SALM) method, a +novel approach that selectively perturbs significant pixel regions rather than +the entire image as previous strategies have done. This simple-yet-effective +approach significantly reduces the perturbation search space by concentrating +on local regions, thereby improving both the efficiency and effectiveness of +data protection for biomedical datasets characterized by sparse features. +Besides, we have demonstrated that SALM maintains the essential characteristics +of the data, ensuring its clinical utility remains uncompromised. Our extensive +experiments across various datasets and model architectures demonstrate that +SALM effectively prevents unauthorized training of deep-learning models and +outperforms previous state-of-the-art data protection methods. + +
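The core of the described approach, perturbing only the most significant pixel regions rather than the whole image, can be illustrated in a few lines of PyTorch. The snippet below is a toy version under assumed choices (per-element magnitude as the significance score, random sign noise); SALM's actual scoring and optimization are more involved.

```python
# Toy sketch of sparsity-aware local masking: perturb only the top-k most
# significant elements instead of the whole image. The selection criterion
# and names here are illustrative, not the authors' SALM method.
import torch

def sparse_local_perturbation(img, ratio=0.05, eps=8 / 255):
    """img: (B, C, H, W) float images in [0, 1]."""
    flat = img.abs().flatten(1)                       # significance per element
    k = max(1, int(ratio * flat.shape[1]))
    thresh = flat.topk(k, dim=1).values[:, -1:]       # per-image top-k threshold
    mask = (flat >= thresh).float().view_as(img)      # 1 on significant regions
    noise = eps * torch.sign(torch.randn_like(img))   # bounded random noise
    return (img + mask * noise).clamp(0, 1)

protected = sparse_local_perturbation(torch.rand(2, 1, 64, 64))
```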
+
+
+
+
+ + ☆ Boundary Constraint-free Biomechanical Model-Based Surface Matching for + Intraoperative Liver Deformation Correction + + +
+ In image-guided liver surgery, 3D-3D non-rigid registration methods play a +crucial role in estimating the mapping between the preoperative model and the +intraoperative surface represented as point clouds, addressing the challenge of +tissue deformation. Typically, these methods incorporate a biomechanical model, +represented as a finite element model (FEM), used to regularize a surface +matching term. This paper introduces a novel 3D-3D non-rigid registration +method. In contrast to the preceding techniques, our method uniquely +incorporates the FEM within the surface matching term itself, ensuring that the +estimated deformation maintains geometric consistency throughout the +registration process. Additionally, we eliminate the need to determine +zero-boundary conditions and applied force locations in the FEM. We achieve +this by integrating soft springs into the stiffness matrix and allowing forces +to be distributed across the entire liver surface. To further improve +robustness, we introduce a regularization technique focused on the gradient of +the force magnitudes. This regularization imposes spatial smoothness and helps +prevent the overfitting of irregular noise in intraoperative data. Optimization +is achieved through an accelerated proximal gradient algorithm, further +enhanced by our proposed method for determining the optimal step size. Our +method is evaluated and compared to both a learning-based method and a +traditional method that features FEM regularization using data collected on our +custom-developed phantom, as well as two publicly available datasets. Our +method consistently outperforms or is comparable to the baseline techniques. +Both the code and dataset will be made publicly available. + +
+
+
+
+
+ + ☆ ViTCN: Vision Transformer Contrastive Network For Reasoning + + +
+ Machine learning models have achieved significant milestones in various +domains, for example, computer vision models have an exceptional result in +object recognition, and in natural language processing, where Large Language +Models (LLM) like GPT can start a conversation with human-like proficiency. +However, abstract reasoning remains a challenge for these models, Can AI really +thinking like a human? still be a question yet to be answered. Raven +Progressive Matrices (RPM) is a metric designed to assess human reasoning +capabilities. It presents a series of eight images as a problem set, where the +participant should try to discover the underlying rules among these images and +select the most appropriate image from eight possible options that best +completes the sequence. This task always be used to test human reasoning +abilities and IQ. Zhang et al proposed a dataset called RAVEN which can be used +to test Machine Learning model abstract reasoning ability. In this paper, we +purposed Vision Transformer Contrastive Network which build on previous work +with the Contrastive Perceptual Inference network (CoPiNet), which set a new +benchmark for permutationinvariant models Raven Progressive Matrices by +incorporating contrast effects from psychology, cognition, and education, and +extends this foundation by leveraging the cutting-edge Vision Transformer +architecture. This integration aims to further refine the machine ability to +process and reason about spatial-temporal information from pixel-level inputs +and global wise features on RAVEN dataset. + +
+
+ comment: 5 pages, 2 figures , in proceeding of 5th International Seminar on + Artificial Intelligence, Networking and Information Technology +
+
+
+
+
+ + ☆ RadCLIP: Enhancing Radiologic Image Analysis through Contrastive + Language-Image Pre-training + + +
+ The integration of artificial intelligence (AI) with radiology has marked a
+transformative era in medical diagnostics. Vision foundation models have been
+adopted to enhance radiologic imaging analysis. However, the distinct
+complexities of radiological imaging, including the interpretation of 2D and 3D
+radiological data, pose unique challenges that existing models, trained on
+general non-medical images, fail to address adequately. To bridge this gap and
+capitalize on the diagnostic precision required in medical imaging, we
+introduce RadCLIP: a pioneering cross-modal foundational model that harnesses
+Contrastive Language-Image Pre-training (CLIP) to refine radiologic image
+analysis. RadCLIP incorporates a novel 3D slice pooling mechanism tailored for
+volumetric image analysis and is trained using a comprehensive and diverse
+dataset of radiologic image-text pairs. Our evaluations demonstrate that
+RadCLIP effectively aligns radiological images with their corresponding textual
+annotations while offering a robust and promising vision backbone for
+radiologic imagery.
+
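The 3D slice pooling idea can be pictured as aggregating per-slice image features into a single volume-level embedding. The attention-pooling sketch below is an assumption-laden illustration of that concept, not RadCLIP's published mechanism; the class name is hypothetical.

```python
# Hedged sketch of a 3D slice pooling idea: per-slice image features are
# aggregated with learned attention weights into one volume-level embedding.
import torch
import torch.nn as nn

class AttentionSlicePooling(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.score = nn.Linear(dim, 1)

    def forward(self, slice_feats):                  # (B, n_slices, dim)
        w = torch.softmax(self.score(slice_feats), dim=1)
        return (w * slice_feats).sum(dim=1)          # (B, dim) volume embedding

vol_emb = AttentionSlicePooling(512)(torch.randn(2, 64, 512))
```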
+
+
+
+
+ + ☆ Shifting Focus: From Global Semantics to Local Prominent Features in + Swin-Transformer for Knee Osteoarthritis Severity Assessment + + +
+ Conventional imaging diagnostics frequently encounter bottlenecks due to +manual inspection, which can lead to delays and inconsistencies. Although deep +learning offers a pathway to automation and enhanced accuracy, foundational +models in computer vision often emphasize global context at the expense of +local details, which are vital for medical imaging diagnostics. To address +this, we harness the Swin Transformer's capacity to discern extended spatial +dependencies within images through the hierarchical framework. Our novel +contribution lies in refining local feature representations, orienting them +specifically toward the final distribution of the classifier. This method +ensures that local features are not only preserved but are also enriched with +task-specific information, enhancing their relevance and detail at every +hierarchical level. By implementing this strategy, our model demonstrates +significant robustness and precision, as evidenced by extensive validation of +two established benchmarks for Knee OsteoArthritis (KOA) grade classification. +These results highlight our approach's effectiveness and its promising +implications for the future of medical imaging diagnostics. Our implementation +is available on https://github.com/mtliba/KOA_NLCS2024 + +
+
+
+
+
+ + ☆ Attention-Enhanced Hybrid Feature Aggregation Network for 3D Brain Tumor + Segmentation MICCAI + + +
+ Glioblastoma is a highly aggressive and malignant brain tumor type that +requires early diagnosis and prompt intervention. Due to its heterogeneity in +appearance, developing automated detection approaches is challenging. To +address this challenge, Artificial Intelligence (AI)-driven approaches in +healthcare have generated interest in efficiently diagnosing and evaluating +brain tumors. The Brain Tumor Segmentation Challenge (BraTS) is a platform for +developing and assessing automated techniques for tumor analysis using +high-quality, clinically acquired MRI data. In our approach, we utilized a +multi-scale, attention-guided and hybrid U-Net-shaped model -- GLIMS -- to +perform 3D brain tumor segmentation in three regions: Enhancing Tumor (ET), +Tumor Core (TC), and Whole Tumor (WT). The multi-scale feature extraction +provides better contextual feature aggregation in high resolutions and the Swin +Transformer blocks improve the global feature extraction at deeper levels of +the model. The segmentation mask generation in the decoder branch is guided by +the attention-refined features gathered from the encoder branch to enhance the +important attributes. Moreover, hierarchical supervision is used to train the +model efficiently. Our model's performance on the validation set resulted in +92.19, 87.75, and 83.18 Dice Scores and 89.09, 84.67, and 82.15 Lesion-wise +Dice Scores in WT, TC, and ET, respectively. The code is publicly available at +https://github.com/yaziciz/GLIMS. + +
+
+ comment: Accepted at 9th BrainLes Workshop (BraTS 2023 Challenge) @ + International Conference on Medical Image Computing and Computer Assisted + Intervention (MICCAI) 2023 +
+
+
+
+
+ + ☆ Quantization Effects on Neural Networks Perception: How would + quantization change the perceptual field of vision models? + + +
+ Neural network quantization is an essential technique for deploying models on +resource-constrained devices. However, its impact on model perceptual fields, +particularly regarding class activation maps (CAMs), remains a significant area +of investigation. In this study, we explore how quantization alters the spatial +recognition ability of the perceptual field of vision models, shedding light on +the alignment between CAMs and visual saliency maps across various +architectures. Leveraging a dataset of 10,000 images from ImageNet, we +rigorously evaluate six diverse foundational CNNs: VGG16, ResNet50, +EfficientNet, MobileNet, SqueezeNet, and DenseNet. We uncover nuanced changes +in CAMs and their alignment with human visual saliency maps through systematic +quantization techniques applied to these models. Our findings reveal the +varying sensitivities of different architectures to quantization and underscore +its implications for real-world applications in terms of model performance and +interpretability. The primary contribution of this work revolves around +deepening our understanding of neural network quantization, providing insights +crucial for deploying efficient and interpretable models in practical settings. + +
+
+
+
+
+ + ♻ ☆ PU-Ray: Domain-Independent Point Cloud Upsampling via Ray Marching on + Neural Implicit Surface + + +
+ While recent advancements in deep-learning point cloud upsampling methods +have improved the input to intelligent transportation systems, they still +suffer from issues of domain dependency between synthetic and real-scanned +point clouds. This paper addresses the above issues by proposing a new +ray-based upsampling approach with an arbitrary rate, where a depth prediction +is made for each query ray and its corresponding patch. Our novel method +simulates the sphere-tracing ray marching algorithm on the neural implicit +surface defined with an unsigned distance function (UDF) to achieve more +precise and stable ray-depth predictions by training a point-transformer-based +network. The rule-based mid-point query sampling method generates more evenly +distributed points without requiring an end-to-end model trained using a +nearest-neighbor-based reconstruction loss function, which may be biased +towards the training dataset. Self-supervised learning becomes possible with +accurate ground truths within the input point cloud. The results demonstrate +the method's versatility across domains and training scenarios with limited +computational resources and training data. Comprehensive analyses of synthetic +and real-scanned applications provide empirical evidence for the significance +of the upsampling task across the computer vision and graphics domains to +real-world applications of ITS. + +
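Sphere tracing on an unsigned distance function, which the method simulates for ray-depth prediction, reduces to marching each ray forward by the current distance value until it falls below a tolerance. The sketch below shows that loop with a toy analytic UDF; the network-predicted UDF and patch handling of PU-Ray are not reproduced.

```python
# Minimal sphere-tracing loop on an unsigned distance function (UDF); `udf`
# is any callable returning the unsigned distance of query points to a surface.
import torch

def sphere_trace(origins, dirs, udf, max_steps=50, eps=1e-3):
    t = torch.zeros(origins.shape[0], 1)
    for _ in range(max_steps):
        p = origins + t * dirs
        d = udf(p)                       # unsigned distance at current points
        t = t + d                        # march forward by the safe distance
        if (d < eps).all():
            break
    return t                             # per-ray depth along `dirs`

# Example with the UDF of a unit sphere centred at the origin:
udf = lambda p: (p.norm(dim=-1, keepdim=True) - 1.0).abs()
origins = torch.zeros(8, 3) - torch.tensor([0.0, 0.0, 3.0])
dirs = torch.tensor([[0.0, 0.0, 1.0]]).repeat(8, 1)
depth = sphere_trace(origins, dirs, udf)   # converges to depth 2 for each ray
```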
+
+ comment: 17 pages (11 main + 6 supplement), 21 figures (8 main + 13 + supplement), 8 tables +
+
+
+
+
+ + ♻ ☆ Dataset Distillation via the Wasserstein Metric + + +
+ Dataset Distillation (DD) emerges as a powerful strategy to encapsulate the +expansive information of large datasets into significantly smaller, synthetic +equivalents, thereby preserving model performance with reduced computational +overhead. Pursuing this objective, we introduce the Wasserstein distance, a +metric grounded in optimal transport theory, to enhance distribution matching +in DD. Our approach employs the Wasserstein barycenter to provide a +geometrically meaningful method for quantifying distribution differences and +capturing the centroid of distribution sets efficiently. By embedding synthetic +data in the feature spaces of pretrained classification models, we facilitate +effective distribution matching that leverages prior knowledge inherent in +these models. Our method not only maintains the computational advantages of +distribution matching-based techniques but also achieves new state-of-the-art +performance across a range of high-resolution datasets. Extensive testing +demonstrates the effectiveness and adaptability of our method, underscoring the +untapped potential of Wasserstein metrics in dataset distillation. + +
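For intuition, a lightweight stand-in for the optimal-transport matching used here is the sliced Wasserstein distance, which compares sorted 1D projections of real and synthetic feature sets. The snippet below shows only that surrogate; it is not the paper's Wasserstein barycenter computation.

```python
# Illustrative sliced-Wasserstein distance between real and synthetic feature
# sets; a simple stand-in for the optimal-transport machinery the paper uses.
import torch

def sliced_wasserstein(x, y, n_proj=128):
    """x, y: (N, d) and (M, d) feature matrices with N == M."""
    d = x.shape[1]
    proj = torch.randn(d, n_proj)
    proj = proj / proj.norm(dim=0, keepdim=True)       # random unit directions
    x_p, _ = torch.sort(x @ proj, dim=0)               # sorted 1D projections
    y_p, _ = torch.sort(y @ proj, dim=0)
    return (x_p - y_p).abs().mean()                    # average 1D W1 distance

loss = sliced_wasserstein(torch.randn(256, 64), torch.randn(256, 64))
```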
+
+ comment: 21 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ MiniGPT-5: Interleaved Vision-and-Language Generation via Generative + Vokens + + +
+ The effectiveness of Multimodal Large Language Models (MLLMs) demonstrates a +profound capability in multimodal understanding. However, the simultaneous +generation of images with coherent texts is still underdeveloped. Addressing +this, we introduce a novel interleaved vision-and-language generation method, +centered around the concept of ``generative vokens". These vokens serve as +pivotal elements contributing to coherent image-text outputs. Our method is +marked by a unique two-stage training strategy for description-free multimodal +generation, which does not necessitate extensive descriptions of images. We +integrate classifier-free guidance to enhance the alignment of generated images +and texts, ensuring more seamless and contextually relevant multimodal +interactions. Our model, MiniGPT-5, exhibits substantial improvement over the +baseline models on multimodal generation datasets, including MMDialog and VIST. +The human evaluation shows MiniGPT-5 is better than the baseline model on more +than 56\% cases for multimodal generation, highlighting its efficacy across +diverse benchmarks. + +
+
+ comment: 23 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ FEDORA: Flying Event Dataset fOr Reactive behAvior + + +
+ The ability of resource-constrained biological systems such as fruitflies to +perform complex and high-speed maneuvers in cluttered environments has been one +of the prime sources of inspiration for developing vision-based autonomous +systems. To emulate this capability, the perception pipeline of such systems +must integrate information cues from tasks including optical flow and depth +estimation, object detection and tracking, and segmentation, among others. +However, the conventional approach of employing slow, synchronous inputs from +standard frame-based cameras constrains these perception capabilities, +particularly during high-speed maneuvers. Recently, event-based sensors have +emerged as low latency and low energy alternatives to standard frame-based +cameras for capturing high-speed motion, effectively speeding up perception and +hence navigation. For coherence, all the perception tasks must be trained on +the same input data. However, present-day datasets are curated mainly for a +single or a handful of tasks and are limited in the rate of the provided ground +truths. To address these limitations, we present Flying Event Dataset fOr +Reactive behAviour (FEDORA) - a fully synthetic dataset for perception tasks, +with raw data from frame-based cameras, event-based cameras, and Inertial +Measurement Units (IMU), along with ground truths for depth, pose, and optical +flow at a rate much higher than existing datasets. + +
+
+
+
+
+ + ♻ ☆ Uncovering the Hidden Cost of Model Compression + + +
+ In an age dominated by resource-intensive foundation models, the ability to +efficiently adapt to downstream tasks is crucial. Visual Prompting (VP), +drawing inspiration from the prompting techniques employed in Large Language +Models (LLMs), has emerged as a pivotal method for transfer learning in the +realm of computer vision. As the importance of efficiency continues to rise, +research into model compression has become indispensable in alleviating the +computational burdens associated with training and deploying over-parameterized +neural networks. A primary objective in model compression is to develop sparse +and/or quantized models capable of matching or even surpassing the performance +of their over-parameterized, full-precision counterparts. Although previous +studies have explored the effects of model compression on transfer learning, +its impact on visual prompting-based transfer remains unclear. This study aims +to bridge this gap, shedding light on the fact that model compression +detrimentally impacts the performance of visual prompting-based transfer, +particularly evident in scenarios with low data volume. Furthermore, our +findings underscore the adverse influence of sparsity on the calibration of +downstream visual-prompted models. However, intriguingly, we also illustrate +that such negative effects on calibration are not present when models are +compressed via quantization. This empirical investigation underscores the need +for a nuanced understanding beyond mere accuracy in sparse and quantized +settings, thereby paving the way for further exploration in Visual Prompting +techniques tailored for sparse and quantized models. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ milliFlow: Scene Flow Estimation on mmWave Radar Point Cloud for Human + Motion Sensing + + +
+ Human motion sensing plays a crucial role in smart systems for +decision-making, user interaction, and personalized services. Extensive +research that has been conducted is predominantly based on cameras, whose +intrusive nature limits their use in smart home applications. To address this, +mmWave radars have gained popularity due to their privacy-friendly features. In +this work, we propose milliFlow, a novel deep learning approach to estimate +scene flow as complementary motion information for mmWave point cloud, serving +as an intermediate level of features and directly benefiting downstream human +motion sensing tasks. Experimental results demonstrate the superior performance +of our method when compared with the competing approaches. Furthermore, by +incorporating scene flow information, we achieve remarkable improvements in +human activity recognition and human parsing and support human body part +tracking. To foster further research in this area, we will provide our codebase +and dataset for open access. + +
+
+ comment: 27 pages, 8 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ DreamLLM: Synergistic Multimodal Comprehension and Creation ICLR 2024 + + +
+ This paper presents DreamLLM, a learning framework that first achieves +versatile Multimodal Large Language Models (MLLMs) empowered with frequently +overlooked synergy between multimodal comprehension and creation. DreamLLM +operates on two fundamental principles. The first focuses on the generative +modeling of both language and image posteriors by direct sampling in the raw +multimodal space. This approach circumvents the limitations and information +loss inherent to external feature extractors like CLIP, and a more thorough +multimodal understanding is obtained. Second, DreamLLM fosters the generation +of raw, interleaved documents, modeling both text and image contents, along +with unstructured layouts. This allows DreamLLM to learn all conditional, +marginal, and joint multimodal distributions effectively. As a result, DreamLLM +is the first MLLM capable of generating free-form interleaved content. +Comprehensive experiments highlight DreamLLM's superior performance as a +zero-shot multimodal generalist, reaping from the enhanced learning synergy. +Project page: https://dreamllm.github.io. + +
+
+ comment: ICLR 2024 (Spotlight) +
+
+
+
+
+ + ♻ ☆ Perceptual Scales Predicted by Fisher Information Metrics + + +
+ Perception is often viewed as a process that transforms physical variables,
+external to an observer, into internal psychological variables. Such a process
+can be modeled by a function coined the perceptual scale. The perceptual scale
+can be deduced from psychophysical measurements that consist of comparing the
+relative differences between stimuli (i.e. difference scaling experiments).
+However, this approach is often overlooked by the modeling and experimentation
+communities. Here, we demonstrate the value of measuring the perceptual scale
+of classical (spatial frequency, orientation) and less classical physical
+variables (interpolation between textures) by embedding it in recent
+probabilistic modeling of perception. First, we show that the assumption that
+an observer has an internal representation of univariate parameters such as
+spatial frequency or orientation, while stimuli are high-dimensional, does not
+lead to contradictory predictions when following the theoretical framework.
+Second, we show that the measured perceptual scale corresponds to the
+transduction function hypothesized in this framework. In particular, we
+demonstrate that it is related to the Fisher information of the generative
+model that underlies perception, and we test the predictions given by the
+generative model of different stimuli in a set of difference scaling
+experiments. Our main conclusion is that the perceptual scale is mostly driven
+by the stimulus power spectrum. Finally, we propose that this measure of
+perceptual scale is a way to push further the notion of perceptual distances by
+estimating the perceptual geometry of images, i.e. the path between images
+instead of simply the distance between them.
+
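The link stated here between the perceptual scale and Fisher information can be summarized compactly. Under the usual assumptions of this kind of framework (a generative model p(x|θ) and internal noise), the transduction function is obtained by integrating the square root of the Fisher information; the display below is that standard relation, stated as a reading aid rather than a result specific to this paper's experiments.

```latex
% Standard relation between a perceptual scale \psi and Fisher information,
% stated under the usual internal-noise assumptions of this framework.
\[
  \psi'(\theta) \;\propto\; \sqrt{I(\theta)},
  \qquad
  I(\theta) \;=\; \mathbb{E}_{x \sim p(x \mid \theta)}
  \!\left[\bigl(\partial_\theta \log p(x \mid \theta)\bigr)^2\right],
  \qquad
  \psi(\theta) \;\propto\; \int_{\theta_0}^{\theta} \sqrt{I(t)}\, dt .
\]
```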
+
+ comment: 15 pages, 6 figures, 7 appendix +
+
+
+
+
+ + ♻ ☆ SOS-Match: Segmentation for Open-Set Robust Correspondence Search and + Robot Localization in Unstructured Environments + + +
+ We present SOS-Match, a novel framework for detecting and matching objects in +unstructured environments. Our system consists of 1) a front-end mapping +pipeline using a zero-shot segmentation model to extract object masks from +images and track them across frames and 2) a frame alignment pipeline that uses +the geometric consistency of object relationships to efficiently localize +across a variety of conditions. We evaluate SOS-Match on the Batvik seasonal +dataset which includes drone flights collected over a coastal plot of southern +Finland during different seasons and lighting conditions. Results show that our +approach is more robust to changes in lighting and appearance than classical +image feature-based approaches or global descriptor methods, and it provides +more viewpoint invariance than learning-based feature detection and description +approaches. SOS-Match localizes within a reference map up to 46x faster than +other feature-based approaches and has a map size less than 0.5% the size of +the most compact other maps. SOS-Match is a promising new approach for landmark +detection and correspondence search in unstructured environments that is robust +to changes in lighting and appearance and is more computationally efficient +than other approaches, suggesting that the geometric arrangement of segments is +a valuable localization cue in unstructured environments. We release our +datasets at https://acl.mit.edu/SOS-Match/. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Knolling Bot: Learning Robotic Object Arrangement from Tidy + Demonstrations + + +
+ Addressing the challenge of organizing scattered items in domestic spaces is +complicated by the diversity and subjective nature of tidiness. Just as the +complexity of human language allows for multiple expressions of the same idea, +household tidiness preferences and organizational patterns vary widely, so +presetting object locations would limit the adaptability to new objects and +environments. Inspired by advancements in natural language processing (NLP), +this paper introduces a self-supervised learning framework that allows robots +to understand and replicate the concept of tidiness from demonstrations of +well-organized layouts, akin to using conversational datasets to train Large +Language Models(LLM). We leverage a transformer neural network to predict the +placement of subsequent objects. We demonstrate a ``knolling'' system with a +robotic arm and an RGB camera to organize items of varying sizes and quantities +on a table. Our method not only trains a generalizable concept of tidiness, +enabling the model to provide diverse solutions and adapt to different numbers +of objects, but it can also incorporate human preferences to generate +customized tidy tables without explicit target positions for each object. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Motion Mamba: Efficient and Long Sequence Motion Generation with + Hierarchical and Bidirectional Selective SSM + + +
+ Human motion generation stands as a significant pursuit in generative
+computer vision, yet achieving long-sequence and efficient motion generation
+remains challenging. Recent advancements in state space models (SSMs), notably
+Mamba, have showcased considerable promise in long-sequence modeling with an
+efficient hardware-aware design, which appears to be a promising direction upon
+which to build a motion generation model. Nevertheless, adapting SSMs to motion
+generation faces hurdles due to the lack of a specialized architecture for
+modeling motion sequences. To address these challenges, we propose Motion
+Mamba, a simple and efficient approach that presents a pioneering motion
+generation model utilizing SSMs. Specifically, we design a Hierarchical
+Temporal Mamba (HTM) block to process temporal data by ensembling varying
+numbers of isolated SSM modules across a symmetric U-Net architecture, aiming
+to preserve motion consistency between frames. We also design a Bidirectional
+Spatial Mamba (BSM) block to process latent poses bidirectionally, enhancing
+accurate motion generation within a temporal frame. Our proposed method
+achieves up to a 50% FID improvement and runs up to 4 times faster on the
+HumanML3D and KIT-ML datasets compared to the previous best diffusion-based
+method, which demonstrates strong capabilities for high-quality long-sequence
+motion modeling and real-time human motion generation. See the project website:
+https://steve-zeyu-zhang.github.io/MotionMamba/
+
+
+
+
+
+ + ♻ ☆ Evaluating how interactive visualizations can assist in finding samples + where and how computer vision models make mistakes + + +
+ Creating Computer Vision (CV) models remains a complex practice, despite +their ubiquity. Access to data, the requirement for ML expertise, and model +opacity are just a few points of complexity that limit the ability of end-users +to build, inspect, and improve these models. Interactive ML perspectives have +helped address some of these issues by considering a teacher in the loop where +planning, teaching, and evaluating tasks take place. We present and evaluate +two interactive visualizations in the context of Sprite, a system for creating +CV classification and detection models for images originating from videos. We +study how these visualizations help Sprite's users identify (evaluate) and +select (plan) images where a model is struggling and can lead to improved +performance, compared to a baseline condition where users used a query +language. We found that users who had used the visualizations found more images +across a wider set of potential types of model errors. + +
+
+ comment: Hayeong Song, Gonzalo Ramos, and Peter Bodik. "Evaluating how + interactive visualizations can assist in finding samples where and how + computer vision models make mistakes" 2024 IEEE Pacific Visualization + Symposium (PacificVis). Ieee, 2024 +
+
+
+
+
+ + ♻ ☆ Generalization in diffusion models arises from geometry-adaptive + harmonic representations ICLR + + +
+ Deep neural networks (DNNs) trained for image denoising are able to generate +high-quality samples with score-based reverse diffusion algorithms. These +impressive capabilities seem to imply an escape from the curse of +dimensionality, but recent reports of memorization of the training set raise +the question of whether these networks are learning the "true" continuous +density of the data. Here, we show that two DNNs trained on non-overlapping +subsets of a dataset learn nearly the same score function, and thus the same +density, when the number of training images is large enough. In this regime of +strong generalization, diffusion-generated images are distinct from the +training set, and are of high visual quality, suggesting that the inductive +biases of the DNNs are well-aligned with the data density. We analyze the +learned denoising functions and show that the inductive biases give rise to a +shrinkage operation in a basis adapted to the underlying image. Examination of +these bases reveals oscillating harmonic structures along contours and in +homogeneous regions. We demonstrate that trained denoisers are inductively +biased towards these geometry-adaptive harmonic bases since they arise not only +when the network is trained on photographic images, but also when it is trained +on image classes supported on low-dimensional manifolds for which the harmonic +basis is suboptimal. Finally, we show that when trained on regular image +classes for which the optimal basis is known to be geometry-adaptive and +harmonic, the denoising performance of the networks is near-optimal. + +
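The "shrinkage operation in a basis adapted to the underlying image" can be written schematically as below. This is a generic statement of such a shrinkage estimator, included only as a reading aid, with data-dependent basis vectors e_k(y) and shrinkage factors λ_k(y) in [0, 1]; the paper's precise analysis may differ in detail.

```latex
% Schematic shrinkage form: the denoiser rescales the coefficients of the
% noisy input y in a basis {e_k(y)} adapted to the image, with data-dependent
% factors 0 <= \lambda_k(y) <= 1.
\[
  \hat{x}(y) \;=\; \sum_k \lambda_k(y)\, \langle y, e_k(y) \rangle\, e_k(y).
\]
```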
+
+ comment: Accepted for oral presentation at ICLR, Vienna, May 2024 +
+
+
+
+
+ + ♻ ☆ Design and Flight Demonstration of a Quadrotor for Urban Mapping and + Target Tracking Research + + +
+ This paper describes the hardware design and flight demonstration of a small +quadrotor with imaging sensors for urban mapping, hazard avoidance, and target +tracking research. The vehicle is equipped with five cameras, including two +pairs of fisheye stereo cameras that enable a nearly omnidirectional view and a +two-axis gimbaled camera. An onboard NVIDIA Jetson Orin Nano computer running +the Robot Operating System software is used for data collection. An autonomous +tracking behavior was implemented to coordinate the motion of the quadrotor and +gimbaled camera to track a moving GPS coordinate. The data collection system +was demonstrated through a flight test that tracked a moving GPS-tagged vehicle +through a series of roads and parking lots. A map of the environment was +reconstructed from the collected images using the Direct Sparse Odometry (DSO) +algorithm. The performance of the quadrotor was also characterized by acoustic +noise, communication range, battery voltage in hover, and maximum speed tests. + +
+
+ comment: 7 pages, 10 figures, To be presented at IEEE SoutheastCon 2024 +
+
+
+
+
+ + ♻ ☆ Mipha: A Comprehensive Overhaul of Multimodal Assistant with Small + Language Models + + +
+ Multimodal Large Language Models (MLLMs) have showcased impressive skills in +tasks related to visual understanding and reasoning. Yet, their widespread +application faces obstacles due to the high computational demands during both +the training and inference phases, restricting their use to a limited audience +within the research and user communities. In this paper, we investigate the +design aspects of Multimodal Small Language Models (MSLMs) and propose an +efficient multimodal assistant named Mipha, which is designed to create synergy +among various aspects: visual representation, language models, and optimization +strategies. We show that without increasing the volume of training data, our +Mipha-3B outperforms the state-of-the-art large MLLMs, especially +LLaVA-1.5-13B, on multiple benchmarks. Through detailed discussion, we provide +insights and guidelines for developing strong MSLMs that rival the capabilities +of MLLMs. Our code is available at https://github.com/zhuyiche/Mipha. + +
+
+
+
+
+ + ♻ ☆ HOI-Diff: Text-Driven Synthesis of 3D Human-Object Interactions using + Diffusion Models + + +
+ We address the problem of generating realistic 3D human-object interactions +(HOIs) driven by textual prompts. To this end, we take a modular design and +decompose the complex task into simpler sub-tasks. We first develop a +dual-branch diffusion model (HOI-DM) to generate both human and object motions +conditioned on the input text, and encourage coherent motions by a +cross-attention communication module between the human and object motion +generation branches. We also develop an affordance prediction diffusion model +(APDM) to predict the contacting area between the human and object during the +interactions driven by the textual prompt. The APDM is independent of the +results by the HOI-DM and thus can correct potential errors by the latter. +Moreover, it stochastically generates the contacting points to diversify the +generated motions. Finally, we incorporate the estimated contacting points into +the classifier-guidance to achieve accurate and close contact between humans +and objects. To train and evaluate our approach, we annotate BEHAVE dataset +with text descriptions. Experimental results on BEHAVE and OMOMO demonstrate +that our approach produces realistic HOIs with various interactions and +different types of objects. + +
+
+ comment: Project Page: https://neu-vi.github.io/HOI-Diff/ +
+
+
+
+
+ + ♻ ☆ PRAGO: Differentiable Multi-View Pose Optimization From Objectness + Detections + + +
+ Robustly estimating camera poses from a set of images is a fundamental task +which remains challenging for differentiable methods, especially in the case of +small and sparse camera pose graphs. To overcome this challenge, we propose +Pose-refined Rotation Averaging Graph Optimization (PRAGO). From a set of +objectness detections on unordered images, our method reconstructs the +rotational pose, and in turn, the absolute pose, in a differentiable manner +benefiting from the optimization of a sequence of geometrical tasks. We show +how our objectness pose-refinement module in PRAGO is able to refine the +inherent ambiguities in pairwise relative pose estimation without removing +edges and avoiding making early decisions on the viability of graph edges. +PRAGO then refines the absolute rotations through iterative graph construction, +reweighting the graph edges to compute the final rotational pose, which can be +converted into absolute poses using translation averaging. We show that PRAGO +is able to outperform non-differentiable solvers on small and sparse scenes +extracted from 7-Scenes achieving a relative improvement of 21% for rotations +while achieving similar translation estimates. + +
+
+
+
+
+ + ♻ ☆ Geometry of the Visual Cortex with Applications to Image Inpainting and + Enhancement + + +
+ Equipping the rototranslation group $SE(2)$ with a sub-Riemannian structure +inspired by the visual cortex V1, we propose algorithms for image inpainting +and enhancement based on hypoelliptic diffusion. We innovate on previous +implementations of the methods by Citti, Sarti, and Boscain et al., by +proposing an alternative that prevents fading and is capable of producing +sharper results in a procedure that we call WaxOn-WaxOff. We also exploit the +sub-Riemannian structure to define a completely new unsharp filter using +$SE(2)$, analogous to the classical unsharp filter for 2D image processing. We +demonstrate our method on blood vessels enhancement in retinal scans. + +
+
+ comment: Associated python package available at + https://github.com/ballerin/v1diffusion +
+
+
+
+
+ + ♻ ☆ MRC-Net: 6-DoF Pose Estimation with MultiScale Residual Correlation CVPR 2024 + + +
+ We propose a single-shot approach to determining 6-DoF pose of an object with +available 3D computer-aided design (CAD) model from a single RGB image. Our +method, dubbed MRC-Net, comprises two stages. The first performs pose +classification and renders the 3D object in the classified pose. The second +stage performs regression to predict fine-grained residual pose within class. +Connecting the two stages is a novel multi-scale residual correlation (MRC) +layer that captures high-and-low level correspondences between the input image +and rendering from first stage. MRC-Net employs a Siamese network with shared +weights between both stages to learn embeddings for input and rendered images. +To mitigate ambiguity when predicting discrete pose class labels on symmetric +objects, we use soft probabilistic labels to define pose class in the first +stage. We demonstrate state-of-the-art accuracy, outperforming all competing +RGB-based methods on four challenging BOP benchmark datasets: T-LESS, LM-O, +YCB-V, and ITODD. Our method is non-iterative and requires no complex +post-processing. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ DeepRepViz: Identifying Confounders in Deep Learning Model Predictions + + +
+ Deep Learning (DL) models have gained popularity in neuroimaging studies for +predicting psychological behaviors, cognitive traits, and brain pathologies. +However, these models can be biased by confounders such as age, sex, or imaging +artifacts from the acquisition process. To address this, we introduce +'DeepRepViz', a two-part framework designed to identify confounders in DL model +predictions. The first component is a visualization tool that can be used to +qualitatively examine the final latent representation of the DL model. The +second component is a metric called 'Con-score' that quantifies the confounder +risk associated with a variable, using the final latent representation of the +DL model. We demonstrate the effectiveness of the Con-score using a simple +simulated setup by iteratively altering the strength of a simulated confounder +and observing the corresponding change in the Con-score. Next, we validate the +DeepRepViz framework on a large-scale neuroimaging dataset (n=12000) by +performing three MRI-phenotype prediction tasks that include (a) predicting +chronic alcohol users, (b) classifying participant sex, and (c) predicting +performance speed on a cognitive task called 'trail making'. DeepRepViz +identifies sex as a significant confounder in the DL model predicting chronic +alcohol users (Con-score=0.35) and age as a confounder in the model predicting +cognitive task performance (Con-score=0.3). In conclusion, the DeepRepViz +framework provides a systematic approach to test for potential confounders such +as age, sex, and imaging artifacts and improves the transparency of DL models +for neuroimaging studies. + +
+
+
+
+
+ + ♻ ☆ SimPLR: A Simple and Plain Transformer for Scaling-Efficient Object + Detection and Segmentation + + +
+ The ability to detect objects in images at varying scales has played a +pivotal role in the design of modern object detectors. Despite considerable +progress in removing hand-crafted components and simplifying the architecture +with transformers, multi-scale feature maps and/or pyramid design remain a key +factor for their empirical success. In this paper, we show that this reliance +on either feature pyramids or an hierarchical backbone is unnecessary and a +transformer-based detector with scale-aware attention enables the plain +detector `SimPLR' whose backbone and detection head are both non-hierarchical +and operate on single-scale features. We find through our experiments that +SimPLR with scale-aware attention is plain and simple, yet competitive with +multi-scale vision transformer alternatives. Compared to the multi-scale and +single-scale state-of-the-art, our model scales much better with bigger +capacity (self-supervised) models and more pre-training data, allowing us to +report a consistently better accuracy and faster runtime for object detection, +instance segmentation as well as panoptic segmentation. Code will be released. + +
+
+
+
+
+ + ♻ ☆ ChatGPT as a mapping assistant: A novel method to enrich maps with + generative AI and content derived from street-level photographs + + +
+ This paper explores the concept of leveraging generative AI as a mapping +assistant for enhancing the efficiency of collaborative mapping. We present +results of an experiment that combines multiple sources of volunteered +geographic information (VGI) and large language models (LLMs). Three analysts +described the content of crowdsourced Mapillary street-level photographs taken +along roads in a small test area in Miami, Florida. GPT-3.5-turbo was +instructed to suggest the most appropriate tagging for each road in +OpenStreetMap (OSM). The study also explores the utilization of BLIP-2, a +state-of-the-art multimodal pre-training method as an artificial analyst of +street-level photographs in addition to human analysts. Results demonstrate two +ways to effectively increase the accuracy of mapping suggestions without +modifying the underlying AI models: by (1) providing a more detailed +description of source photographs, and (2) combining prompt engineering with +additional context (e.g. location and objects detected along a road). The first +approach increases the suggestion accuracy by up to 29%, and the second one by +up to 20%. + +
+
+ comment: Submitted to The Fourth Spatial Data Science Symposium +
+
+
+
+
+ + ♻ ☆ Insect-Foundation: A Foundation Model and Large-scale 1M Dataset for + Visual Insect Understanding + + +
+ In precision agriculture, the detection and recognition of insects play an +essential role in the ability of crops to grow healthy and produce a +high-quality yield. The current machine vision model requires a large volume of +data to achieve high performance. However, there are approximately 5.5 million +different insect species in the world. None of the existing insect datasets can +cover even a fraction of them due to varying geographic locations and +acquisition costs. In this paper, we introduce a novel "Insect-1M" dataset, a +game-changing resource poised to revolutionize insect-related foundation model +training. Covering a vast spectrum of insect species, our dataset, including 1 +million images with dense identification labels of taxonomy hierarchy and +insect descriptions, offers a panoramic view of entomology, enabling foundation +models to comprehend visual and semantic information about insects like never +before. Then, to efficiently establish an Insect Foundation Model, we develop a +micro-feature self-supervised learning method with a Patch-wise Relevant +Attention mechanism capable of discerning the subtle differences among insect +images. In addition, we introduce Description Consistency loss to improve +micro-feature modeling via insect descriptions. Through our experiments, we +illustrate the effectiveness of our proposed approach in insect modeling and +achieve State-of-the-Art performance on standard benchmarks of insect-related +tasks. Our Insect Foundation Model and Dataset promise to empower the next +generation of insect-related vision models, bringing them closer to the +ultimate goal of precision agriculture. + +
+
+
+
+
+ + ♻ ☆ HUNTER: Unsupervised Human-centric 3D Detection via Transferring + Knowledge from Synthetic Instances to Real Scenes CVPR 2024 + + +
+ Human-centric 3D scene understanding has recently drawn increasing attention, +driven by its critical impact on robotics. However, human-centric real-life +scenarios are extremely diverse and complicated, and humans have intricate +motions and interactions. With limited labeled data, supervised methods are +difficult to generalize to general scenarios, hindering real-life applications. +Mimicking human intelligence, we propose an unsupervised 3D detection method +for human-centric scenarios by transferring the knowledge from synthetic human +instances to real scenes. To bridge the gap between the distinct data +representations and feature distributions of synthetic models and real point +clouds, we introduce novel modules for effective instance-to-scene +representation transfer and synthetic-to-real feature alignment. Remarkably, +our method exhibits superior performance compared to current state-of-the-art +techniques, achieving 87.8% improvement in mAP and closely approaching the +performance of fully supervised methods (62.15 mAP vs. 69.02 mAP) on HuCenLife +Dataset. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Fast and Simple Explainability for Point Cloud Networks + + +
+ We propose a fast and simple explainable AI (XAI) method for point cloud +data. It computes pointwise importance with respect to a trained network +downstream task. This allows better understanding of the network properties, +which is imperative for safety-critical applications. In addition to debugging +and visualization, our low computational complexity facilitates online feedback +to the network at inference. This can be used to reduce uncertainty and to +increase robustness. In this work, we introduce \emph{Feature Based +Interpretability} (FBI), where we compute the features' norm, per point, before +the bottleneck. We analyze the use of gradients and post- and pre-bottleneck +strategies, showing pre-bottleneck is preferred, in terms of smoothness and +ranking. We obtain at least three orders of magnitude speedup, compared to +current XAI methods, thus, scalable for big point clouds or large-scale +architectures. Our approach achieves SOTA results, in terms of classification +explainability. We demonstrate how the proposed measure is helpful in analyzing +and characterizing various aspects of 3D learning, such as rotation invariance, +robustness to out-of-distribution (OOD) outliers or domain shift and dataset +bias. + +
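A minimal sketch of the core idea described above, under stated assumptions (this is not the authors' released code): pointwise importance taken as the norm of the per-point features before the global pooling bottleneck, with no gradients required.

```python
import torch

def fbi_importance(point_features: torch.Tensor) -> torch.Tensor:
    """Feature-Based Interpretability (FBI) sketch: `point_features` is assumed to
    be the (N_points, C) per-point feature tensor taken just before the network's
    global pooling bottleneck; importance is simply the per-point L2 norm."""
    return point_features.norm(dim=-1)  # (N_points,) importance scores
```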
+
+
+
+
+ + ♻ ☆ GigaPose: Fast and Robust Novel Object Pose Estimation via One + Correspondence CVPR 2024 + + +
+ We present GigaPose, a fast, robust, and accurate method for CAD-based novel +object pose estimation in RGB images. GigaPose first leverages discriminative +"templates", rendered images of the CAD models, to recover the out-of-plane +rotation and then uses patch correspondences to estimate the four remaining +parameters. Our approach samples templates in only a two-degrees-of-freedom +space instead of the usual three and matches the input image to the templates +using fast nearest-neighbor search in feature space, resulting in a speedup +factor of 35x compared to the state of the art. Moreover, GigaPose is +significantly more robust to segmentation errors. Our extensive evaluation on +the seven core datasets of the BOP challenge demonstrates that it achieves +state-of-the-art accuracy and can be seamlessly integrated with existing +refinement methods. Additionally, we show the potential of GigaPose with 3D +models predicted by recent work on 3D reconstruction from a single image, +relaxing the need for CAD models and making 6D object pose estimation much more +convenient. Our source code and trained models are publicly available at +https://github.com/nv-nguyen/gigaPose + 
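An illustrative sketch of the fast template-matching step mentioned above, not GigaPose's actual pipeline: match a detected object crop against pre-rendered template features by cosine similarity, i.e. a nearest-neighbour search in feature space. Tensor shapes are assumptions.

```python
import torch
import torch.nn.functional as F

def best_template(query_feat: torch.Tensor, template_feats: torch.Tensor) -> int:
    """Return the index of the template whose feature is most similar to the
    query crop feature. query_feat: (D,); template_feats: (T, D)."""
    q = F.normalize(query_feat, dim=-1)
    t = F.normalize(template_feats, dim=-1)
    return int(torch.argmax(t @ q))  # best-matching template index
```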
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Voting-based Multimodal Automatic Deception Detection + + +
+ Automatic deception detection has long been a hot research topic; using machine learning and deep learning to detect deception automatically brings new light to this old field. In this paper, we propose a voting-based method for automatic deception detection from videos using audio, visual, and lexical features. Experiments were conducted on two datasets: the Real-life Trial dataset from the University of Michigan and the Miami University deception detection dataset. Video samples were split into image frames, audio, and transcripts. Our proposed voting-based multimodal solution consists of three models: a CNN that detects deception from images, a Support Vector Machine (SVM) on Mel spectrograms that detects deception from audio, and an SVM on Word2Vec features that detects deception from transcripts. Our proposed solution outperforms the state of the art. The best results achieved on images, audio, and text were 97%, 96%, and 92%, respectively, on the Real-life Trial dataset, and 97%, 82%, and 73% on video, audio, and text, respectively, on the Miami University deception detection dataset. + 
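A minimal sketch of the voting step, assuming each unimodal model emits a binary label (1 = deceptive, 0 = truthful) and the final decision is the majority over the three modalities; the helper name is hypothetical.

```python
from collections import Counter

def majority_vote(image_pred: int, audio_pred: int, text_pred: int) -> int:
    """Majority vote over the three unimodal deception predictions."""
    return Counter([image_pred, audio_pred, text_pred]).most_common(1)[0][0]

# Example: CNN says deceptive, audio SVM says truthful, text SVM says deceptive.
# majority_vote(1, 0, 1) -> 1
```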
+
+
+
+
+ + ♻ ☆ ScoreCL: Augmentation-Adaptive Contrastive Learning via Score-Matching + Function + + +
+ Self-supervised contrastive learning (CL) has achieved state-of-the-art +performance in representation learning by minimizing the distance between +positive pairs while maximizing that of negative ones. Recently, it has been +verified that the model learns better representation with diversely augmented +positive pairs because they enable the model to be more view-invariant. +However, only a few studies on CL have considered the difference between +augmented views, and have not gone beyond the hand-crafted findings. In this +paper, we first observe that the score-matching function can measure how much +data has changed from the original through augmentation. With the observed +property, every pair in CL can be weighted adaptively by the difference of +score values, resulting in boosting the performance of the existing CL method. +We show the generality of our method, referred to as ScoreCL, by consistently +improving various CL methods, SimCLR, SimSiam, W-MSE, and VICReg, up to 3%p in +k-NN evaluation on CIFAR-10, CIFAR-100, and ImageNet-100. Moreover, we have +conducted exhaustive experiments and ablations, including results on diverse +downstream tasks, comparison with possible baselines, and improvement when used +with other proposed augmentation methods. We hope our exploration will inspire +more research in exploiting the score matching for CL. + +
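A hedged sketch of the augmentation-adaptive idea: a standard SimCLR (NT-Xent) loss whose per-pair terms are reweighted by how differently the two views were augmented, measured by the gap between per-view score statistics. The particular weighting rule and the origin of `s1`, `s2` are assumptions, not the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

def scorecl_nt_xent(z1, z2, s1, s2, tau=0.5, lam=1.0):
    """z1, z2: (B, D) projections of two augmented views; s1, s2: (B,) scalar
    score-matching statistics per view (assumed to be provided externally)."""
    z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
    B = z1.size(0)
    z = torch.cat([z1, z2], dim=0)                        # (2B, D)
    sim = z @ z.t() / tau                                 # (2B, 2B) similarities
    sim.fill_diagonal_(float('-inf'))                     # exclude self-pairs
    targets = torch.cat([torch.arange(B, 2 * B), torch.arange(B)])
    per_sample = F.cross_entropy(sim, targets, reduction='none')  # (2B,)
    w = 1.0 + lam * (s1 - s2).abs()                       # larger view gap -> larger weight
    return (torch.cat([w, w]) * per_sample).mean()
```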
+
+
+
+
+ + ♻ ☆ SelfPromer: Self-Prompt Dehazing Transformers with Depth-Consistency AAAI24 + + +
+ This work presents an effective depth-consistency self-prompt Transformer for +image dehazing. It is motivated by an observation that the estimated depths of +an image with haze residuals and its clear counterpart vary. Enforcing the +depth consistency of dehazed images with clear ones, therefore, is essential +for dehazing. For this purpose, we develop a prompt based on the features of +depth differences between the hazy input images and corresponding clear +counterparts that can guide dehazing models for better restoration. +Specifically, we first apply deep features extracted from the input images to +the depth difference features for generating the prompt that contains the haze +residual information in the input. Then we propose a prompt embedding module +that is designed to perceive the haze residuals, by linearly adding the prompt +to the deep features. Further, we develop an effective prompt attention module +to pay more attention to haze residuals for better removal. By incorporating +the prompt, prompt embedding, and prompt attention into an encoder-decoder +network based on VQGAN, we can achieve better perception quality. As the depths +of clear images are not available at inference, and the dehazed images with +one-time feed-forward execution may still contain a portion of haze residuals, +we propose a new continuous self-prompt inference that can iteratively correct +the dehazing model towards better haze-free image generation. Extensive +experiments show that our method performs favorably against the +state-of-the-art approaches on both synthetic and real-world datasets in terms +of perception metrics including NIQE, PI, and PIQE. + +
+
+ comment: Accepted by AAAI24. Source codes will be made available at: + https://github.com/supersupercong/SelfPromer +
+
+
+
+
+ + ♻ ☆ Robust Identity Perceptual Watermark Against Deepfake Face Swapping + + +
+ Notwithstanding offering convenience and entertainment to society, Deepfake +face swapping has caused critical privacy issues with the rapid development of +deep generative models. Due to imperceptible artifacts in high-quality +synthetic images, passive detection models against face swapping in recent +years usually suffer performance degradation due to the generalizability issue. +Therefore, several studies have attempted to proactively protect the +original images against malicious manipulations by inserting invisible signals +in advance. However, the existing proactive defense approaches demonstrate +unsatisfactory results with respect to visual quality, detection accuracy, and +source tracing ability. In this study, to fill the research gap, we propose +the first robust identity perceptual watermarking framework that concurrently +performs detection and source tracing against Deepfake face swapping +proactively. We assign identity semantics regarding the image contents to the +watermarks and devise an unpredictable and nonreversible chaotic encryption +system to ensure watermark confidentiality. The watermarks are encoded and +recovered by jointly training an encoder-decoder framework along with +adversarial image manipulations. Falsification and source tracing are +accomplished by verifying the consistency between the content-matched identity +perceptual watermark and the recovered robust watermark from the image. +Extensive experiments demonstrate state-of-the-art detection performance on +Deepfake face swapping under both cross-dataset and cross-manipulation +settings. + 
+
+ comment: In peer review +
+
+
+
+
+ + ♻ ☆ Perceptual Quality Assessment of Virtual Reality Videos in the Wild + + +
+ Investigating how people perceive virtual reality (VR) videos in the wild +(i.e., those captured by everyday users) is a crucial and challenging task in +VR-related applications due to complex authentic distortions localized in space +and time. Existing panoramic video databases only consider synthetic +distortions, assume fixed viewing conditions, and are limited in size. To +overcome these shortcomings, we construct the VR Video Quality in the Wild +(VRVQW) database, containing $502$ user-generated videos with diverse content +and distortion characteristics. Based on VRVQW, we conduct a formal +psychophysical experiment to record the scanpaths and perceived quality scores +from $139$ participants under two different viewing conditions. We provide a +thorough statistical analysis of the recorded data, observing significant +impact of viewing conditions on both human scanpaths and perceived quality. +Moreover, we develop an objective quality assessment model for VR videos based +on pseudocylindrical representation and convolution. Results on the proposed +VRVQW show that our method is superior to existing video quality assessment +models. We have made the database and code available at +https://github.com/limuhit/VR-Video-Quality-in-the-Wild. + +
+
+ comment: Accepted by IEEE Transactions on Circuits and Systems for Video + Technology +
+
+
+
+
+ + ♻ ☆ Seeking Flat Minima with Mean Teacher on Semi- and Weakly-Supervised + Domain Generalization for Object Detection + + +
+ Object detectors do not work well when domains largely differ between +training and testing data. To overcome this domain gap in object detection +without requiring expensive annotations, we consider two problem settings: +semi-supervised domain generalizable object detection (SS-DGOD) and +weakly-supervised DGOD (WS-DGOD). In contrast to the conventional domain +generalization for object detection that requires labeled data from multiple +domains, SS-DGOD and WS-DGOD require labeled data only from one domain and +unlabeled or weakly-labeled data from multiple domains for training. In this +paper, we show that object detectors can be effectively trained on the two +settings with the same Mean Teacher learning framework, where a student network +is trained with pseudo-labels output from a teacher on the unlabeled or +weakly-labeled data. We provide novel interpretations of why the Mean Teacher +learning framework works well on the two settings in terms of the relationships +between the generalization gap and flat minima in parameter space. On the basis +of the interpretations, we also propose incorporating a simple regularization +method into the Mean Teacher learning framework to find flatter minima. The +experimental results demonstrate that the regularization leads to flatter +minima and boosts the performance of the detectors trained with the Mean +Teacher learning framework on the two settings. They also indicate that those +detectors significantly outperform the state-of-the-art methods. + +
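A minimal Mean Teacher sketch under stated assumptions (momentum value and update cadence are not from the paper): the teacher's weights are an exponential moving average of the student's, and the teacher's detections on unlabeled or weakly-labeled images serve as pseudo-labels for the student.

```python
import torch

@torch.no_grad()
def ema_update(teacher: torch.nn.Module, student: torch.nn.Module, m: float = 0.999):
    """Update the teacher as an exponential moving average of the student."""
    for p_t, p_s in zip(teacher.parameters(), student.parameters()):
        p_t.mul_(m).add_(p_s, alpha=1.0 - m)

# Typical usage (hypothetical): teacher = copy.deepcopy(student); after each
# student optimizer step, call ema_update(teacher, student).
```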
+
+
+
+
+ + ♻ ☆ DyST: Towards Dynamic Neural Scene Representations on Real-World Videos ICLR 2024 + + +
+ Visual understanding of the world goes beyond the semantics and flat +structure of individual images. In this work, we aim to capture both the 3D +structure and dynamics of real-world scenes from monocular real-world videos. +Our Dynamic Scene Transformer (DyST) model leverages recent work in neural +scene representation to learn a latent decomposition of monocular real-world +videos into scene content, per-view scene dynamics, and camera pose. This +separation is achieved through a novel co-training scheme on monocular videos +and our new synthetic dataset DySO. DyST learns tangible latent representations +for dynamic scenes that enable view generation with separate control over the +camera and the content of the scene. + +
+
+ comment: ICLR 2024 spotlight. Project website: https://dyst-paper.github.io/ +
+
+
+
+
+ + ♻ ☆ Advancements in 3D Lane Detection Using LiDAR Point Clouds: From Data + Collection to Model Development ICRA2024 + + +
+ Advanced Driver-Assistance Systems (ADAS) have successfully integrated +learning-based techniques into vehicle perception and decision-making. However, +their application in 3D lane detection for effective driving environment +perception is hindered by the lack of comprehensive LiDAR datasets. The sparse +nature of LiDAR point cloud data prevents an efficient manual annotation +process. To solve this problem, we present LiSV-3DLane, a large-scale 3D lane +dataset that comprises 20k frames of surround-view LiDAR point clouds with +enriched semantic annotation. Unlike existing datasets confined to a frontal +perspective, LiSV-3DLane provides a full 360-degree spatial panorama around the +ego vehicle, capturing complex lane patterns in both urban and highway +environments. We leverage the geometric traits of lane lines and the intrinsic +spatial attributes of LiDAR data to design a simple yet effective automatic +annotation pipeline for generating finer lane labels. To propel future +research, we propose a novel LiDAR-based 3D lane detection model, LiLaDet, +incorporating the spatial geometry learning of the LiDAR point cloud into +Bird's Eye View (BEV) based lane identification. Experimental results indicate +that LiLaDet outperforms existing camera- and LiDAR-based approaches in the 3D +lane detection task on the K-Lane dataset and our LiSV-3DLane. + +
+
+ comment: Accepted by ICRA2024 +
+
+
+
+
+ + ♻ ☆ Detecting Brain Tumors through Multimodal Neural Networks ICPR + + +
+ Tumors can manifest in various forms and in different areas of the human +body. Brain tumors are specifically hard to diagnose and treat because of the +complexity of the organ in which they develop. Detecting them in time can lower +the chances of death and facilitate the therapy process for patients. The use +of Artificial Intelligence (AI) and, more specifically, deep learning, has the +potential to significantly reduce costs in terms of time and resources for the +discovery and identification of tumors from images obtained through imaging +techniques. This research work aims to assess the performance of a multimodal +model for the classification of Magnetic Resonance Imaging (MRI) scans +processed as grayscale images. The results are promising, and in line with +similar works, as the model reaches an accuracy of around 98\%. We also +highlight the need for explainability and transparency to ensure human control +and safety. + +
+
+ comment: Presented at NeroPRAI 2024 (co-located with ICPRAM 2024). This + version did not undergo peer review: refer to the open access version of + record (see DOI) +
+
+
+
+
+ + ♻ ☆ High-fidelity Person-centric Subject-to-Image Synthesis CVPR2024 + + +
+ Current subject-driven image generation methods encounter significant +challenges in person-centric image generation. The reason is that they learn +the semantic scene and person generation by fine-tuning a common pre-trained +diffusion model, which involves an irreconcilable training imbalance. Precisely, to +generate realistic persons, they need to sufficiently tune the pre-trained +model, which inevitably causes the model to forget the rich semantic scene +prior and makes scene generation over-fit to the training data. Moreover, even +with sufficient fine-tuning, these methods still cannot generate high-fidelity +persons, since joint learning of scene and person generation also leads to a +quality compromise. In this paper, we propose Face-diffuser, an effective +collaborative generation pipeline to eliminate the above training imbalance and +quality compromise. Specifically, we first develop two specialized pre-trained +diffusion models, i.e., Text-driven Diffusion Model (TDM) and Subject-augmented +Diffusion Model (SDM), for scene and person generation, respectively. The +sampling process is divided into three sequential stages, i.e., semantic scene +construction, subject-scene fusion, and subject enhancement. The first and last +stages are performed by TDM and SDM respectively. The subject-scene fusion +stage is a collaboration between the two models, achieved through a novel and highly effective +mechanism, Saliency-adaptive Noise Fusion (SNF). Specifically, it is based on +our key observation that there exists a robust link between classifier-free +guidance responses and the saliency of generated images. In each time step, SNF +leverages the unique strengths of each model and allows for the spatial +blending of predicted noises from both models automatically in a saliency-aware +manner. Extensive experiments confirm the impressive effectiveness and +robustness of the Face-diffuser. + 
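A hedged sketch of saliency-aware noise blending at a single sampling step: take the subject model's (SDM) noise prediction in high-saliency regions and the scene model's (TDM) prediction elsewhere. The hard-threshold mask used here is an illustrative assumption, not the exact SNF rule.

```python
import torch

def saliency_noise_fusion(eps_scene, eps_subject, saliency, thresh=0.5):
    """Blend two noise predictions by a saliency mask.
    eps_scene, eps_subject: (B, C, H, W); saliency: (B, 1, H, W) in [0, 1]."""
    mask = (saliency > thresh).float()
    return mask * eps_subject + (1.0 - mask) * eps_scene
```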
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Automatic Report Generation for Histopathology images using pre-trained + Vision Transformers and BERT + + +
+ Deep learning for histopathology has been successfully used for disease +classification, image segmentation and more. However, combining image and text +modalities using current state-of-the-art (SOTA) methods has been a challenge +due to the high resolution of histopathology images. Automatic report +generation for histopathology images is one such challenge. In this work, we +show that using an existing pre-trained Vision Transformer (ViT) to encode +4096x4096 sized patches of the Whole Slide Image (WSI) and a pre-trained +Bidirectional Encoder Representations from Transformers (BERT) model as a +language-modeling-based decoder for report generation, we can build a +performant and portable report generation mechanism that takes into account the +whole high resolution image. Our method not only allows us to generate and +evaluate captions that describe the image, but also helps us classify the image +by tissue type and patient gender. Our best performing +model achieves an 89.52% accuracy in Tissue Type classification with a BLEU-4 +score of 0.12 in our caption generation task. + 
+
+ comment: Accepted at IEEE ISBI 2024. arXiv admin note: substantial text + overlap with arXiv:2311.06176 +
+
+
+
+
+ + ♻ ☆ IRAD: Implicit Representation-driven Image Resampling against + Adversarial Attacks + + +
+ We introduce a novel approach to counter adversarial attacks, namely, image +resampling. Image resampling transforms a discrete image into a new one, +simulating the process of scene recapturing or rerendering as specified by a +geometrical transformation. The underlying rationale behind our idea is that +image resampling can alleviate the influence of adversarial perturbations while +preserving essential semantic information, thereby conferring an inherent +advantage in defending against adversarial attacks. To validate this concept, +we present a comprehensive study on leveraging image resampling to defend +against adversarial attacks. We have developed basic resampling methods that +employ interpolation strategies and coordinate shifting magnitudes. Our +analysis reveals that these basic methods can partially mitigate adversarial +attacks. However, they come with apparent limitations: the accuracy of clean +images noticeably decreases, while the improvement in accuracy on adversarial +examples is not substantial. We propose implicit representation-driven image +resampling (IRAD) to overcome these limitations. First, we construct an +implicit continuous representation that enables us to represent any input image +within a continuous coordinate space. Second, we introduce SampleNet, which +automatically generates pixel-wise shifts for resampling in response to +different inputs. Furthermore, we can extend our approach to the +state-of-the-art diffusion-based method, accelerating it with fewer time steps +while preserving its defense capability. Extensive experiments demonstrate that +our method significantly enhances the adversarial robustness of diverse deep +models against various attacks while maintaining high accuracy on clean images. + +
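A sketch of the basic resampling defense discussed above (not IRAD's learned SampleNet): re-sample the image at slightly shifted coordinates with bilinear interpolation. The shift source and magnitude are assumptions.

```python
import torch
import torch.nn.functional as F

def resample(img: torch.Tensor, shifts: torch.Tensor) -> torch.Tensor:
    """Bilinear resampling of an image at shifted coordinates.
    img: (B, C, H, W); shifts: (B, H, W, 2) in normalized [-1, 1] units,
    e.g. small random values for the basic (non-learned) variant."""
    B, _, H, W = img.shape
    ys = torch.linspace(-1.0, 1.0, H, device=img.device)
    xs = torch.linspace(-1.0, 1.0, W, device=img.device)
    gy, gx = torch.meshgrid(ys, xs, indexing="ij")
    grid = torch.stack([gx, gy], dim=-1).unsqueeze(0).expand(B, -1, -1, -1)
    return F.grid_sample(img, grid + shifts, mode="bilinear", align_corners=True)
```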
+
+
+
+
+ + ♻ ☆ Adversarial Training on Purification (AToP): Advancing Both Robustness + and Generalization + + +
+ Deep neural networks are known to be vulnerable to well-designed +adversarial attacks. The most successful defense technique based on adversarial +training (AT) can achieve optimal robustness against particular attacks but +cannot generalize well to unseen attacks. Another effective defense technique +based on adversarial purification (AP) can enhance generalization but cannot +achieve optimal robustness. Meanwhile, both methods share one common limitation: +degraded standard accuracy. To mitigate these issues, we propose a novel +pipeline to acquire the robust purifier model, named Adversarial Training on +Purification (AToP), which comprises two components: perturbation destruction +by random transforms (RT) and a purifier model fine-tuned (FT) with an adversarial +loss. RT is essential to avoid overlearning to known attacks, resulting in +robustness generalization to unseen attacks, and FT is essential for the +improvement of robustness. To evaluate our method in an efficient and scalable +way, we conduct extensive experiments on CIFAR-10, CIFAR-100, and ImageNette to +demonstrate that our method achieves optimal robustness and exhibits +generalization ability against unseen attacks. + 
+
+
+
+
+ + ♻ ☆ Intriguing Properties of Data Attribution on Diffusion Models ICLR 2024 + + +
+ Data attribution seeks to trace model outputs back to training data. With the +recent development of diffusion models, data attribution has become a desired +module to properly assign valuations for high-quality or copyrighted training +samples, ensuring that data contributors are fairly compensated or credited. +Several theoretically motivated methods have been proposed to implement data +attribution, in an effort to improve the trade-off between computational +scalability and effectiveness. In this work, we conduct extensive experiments +and ablation studies on attributing diffusion models, specifically focusing on +DDPMs trained on CIFAR-10 and CelebA, as well as a Stable Diffusion model +LoRA-finetuned on ArtBench. Intriguingly, we report counter-intuitive +observations that theoretically unjustified design choices for attribution +empirically outperform previous baselines by a large margin, in terms of both +linear datamodeling score and counterfactual evaluation. Our work presents a +significantly more efficient approach for attributing diffusion models, while +the unexpected findings suggest that at least in non-convex settings, +constructions guided by theoretical assumptions may lead to inferior +attribution performance. The code is available at +https://github.com/sail-sg/D-TRAK. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ ObjectCompose: Evaluating Resilience of Vision-Based Models on + Object-to-Background Compositional Changes + + +
+ Given the large-scale multi-modal training of recent vision-based models and +their generalization capabilities, understanding the extent of their robustness +is critical for their real-world deployment. In this work, we evaluate the +resilience of current vision-based models against diverse object-to-background +context variations. The majority of robustness evaluation methods have +introduced synthetic datasets to induce changes to object characteristics +(viewpoints, scale, color) or utilized image transformation techniques +(adversarial changes, common corruptions) on real images to simulate shifts in +distributions. Recent works have explored leveraging large language models and +diffusion models to generate changes in the background. However, these methods +either offer limited control over the changes to be made or distort the +object semantics, making them unsuitable for the task. Our method, on the other +hand, can induce diverse object-to-background changes while preserving the +original semantics and appearance of the object. To achieve this goal, we +harness the generative capabilities of text-to-image, image-to-text, and +image-to-segment models to automatically generate a broad spectrum of +object-to-background changes. We induce both natural and adversarial background +changes by either modifying the textual prompts or optimizing the latents and +textual embedding of text-to-image models. This allows us to quantify the role +of background context in understanding the robustness and generalization of +deep neural networks. We produce various versions of standard vision datasets +(ImageNet, COCO), incorporating either diverse and realistic backgrounds into +the images or introducing color, texture, and adversarial changes in the +background. We conduct extensive experiments to analyze the robustness of +vision-based models against object-to-background context variations across +diverse tasks. + 
+
+
+
+
+ + ♻ ☆ CODIS: Benchmarking Context-Dependent Visual Comprehension for + Multimodal Large Language Models + + +
+ Multimodal large language models (MLLMs) have demonstrated promising results +in a variety of tasks that combine vision and language. As these models become +more integral to research and applications, conducting comprehensive +evaluations of their capabilities has grown increasingly important. However, +most existing benchmarks fail to consider that, in certain situations, images +need to be interpreted within a broader context. In this work, we introduce a +new benchmark, named as CODIS, designed to assess the ability of models to use +context provided in free-form text to enhance visual comprehension. Our +findings indicate that MLLMs consistently fall short of human performance on +this benchmark. Further analysis confirms that these models struggle to +effectively extract and utilize contextual information to improve their +understanding of images. This underscores the pressing need to enhance the +ability of MLLMs to comprehend visuals in a context-dependent manner. View our +project website at https://thunlp-mt.github.io/CODIS. + +
+
+
+
+
+ + ♻ ☆ Generative deep learning-enabled ultra-large field-of-view lens-free + imaging + + +
+ Advancements in high-throughput biomedical applications necessitate +real-time, large field-of-view (FOV) imaging capabilities. Conventional +lens-free imaging (LFI) systems, while addressing the limitations of physical +lenses, have been constrained by dynamic, hard-to-model optical fields, +resulting in a limited one-shot FOV of approximately 20 $mm^2$. This +restriction has been a major bottleneck in applications like live-cell imaging +and automation of microfluidic systems for biomedical research. Here, we +present a deep-learning (DL)-based imaging framework - GenLFI - leveraging +generative artificial intelligence (AI) for holographic image reconstruction. +We demonstrate that GenLFI can achieve a real-time FOV over 550 $mm^2$, +surpassing current LFI systems by more than 20-fold and exceeding the FOV of +the world's largest confocal microscope by a factor of 1.76. The resolution is at the +sub-pixel level of 5.52 $\mu m$, without the need for a shifting light source. +The unsupervised learning-based reconstruction does not require optical field +modeling, making imaging dynamic 3D samples (e.g., droplet-based microfluidics +and 3D cell models) in complex optical fields possible. This GenLFI framework +unlocks the potential of LFI systems, offering a robust tool to tackle new +frontiers in high-throughput biomedical applications such as drug discovery. + 
+
+
+
+
+ + ♻ ☆ Distilling Knowledge for Short-to-Long Term Trajectory Prediction + + +
+ Long-term trajectory forecasting is an important and challenging problem in +the fields of computer vision, machine learning, and robotics. One fundamental +difficulty lies in the evolution of the trajectory, which becomes more and more +uncertain and unpredictable as the time horizon grows, subsequently increasing +the complexity of the problem. To overcome this issue, in this paper, we +propose Di-Long, a new method that employs the distillation of a short-term +trajectory forecaster that guides a student network for long-term +trajectory prediction during the training process. Given a total sequence +length that comprises the allowed observation for the student network and the +complementary target sequence, we let the student and the teacher solve two +different related tasks defined over the same full trajectory: the student +observes a short sequence and predicts a long trajectory, whereas the teacher +observes a longer sequence and predicts the remaining short target trajectory. +The teacher's task is less uncertain, and we use its accurate predictions to +guide the student through our knowledge distillation framework, reducing +long-term future uncertainty. Our experiments show that our proposed Di-Long +method is effective for long-term forecasting and achieves state-of-the-art +performance on the Intersection Drone Dataset (inD) and the Stanford Drone +Dataset (SDD). + 
+
+
+
+
+ + ♻ ☆ Training-Free Pretrained Model Merging CVPR2024 + + +
+ Recently, model merging techniques have surfaced as a solution to combine +multiple single-talent models into a single multi-talent model. However, +previous endeavors in this field have either necessitated additional training +or fine-tuning processes, or require that the models possess the same +pre-trained initialization. In this work, we identify a common drawback in +prior works w.r.t. the inconsistency of unit similarity in the weight space and +the activation space. To address this inconsistency, we propose an innovative +model merging framework, coined as merging under dual-space constraints +(MuDSC). Specifically, instead of solely maximizing the objective of a single +space, we advocate for the exploration of permutation matrices situated in a +region with a unified high similarity in the dual space, achieved through the +linear combination of activation and weight similarity matrices. In order to +enhance usability, we have also incorporated adaptations for group structure, +including Multi-Head Attention and Group Normalization. Comprehensive +experimental comparisons demonstrate that MuDSC can significantly boost the +performance of merged models with various task combinations and architectures. +Furthermore, the visualization of the merged model within the multi-task loss +landscape reveals that MuDSC enables the merged model to reside in the +overlapping segment, featuring a unified lower loss for each task. Our code is +publicly available at https://github.com/zju-vipa/training_free_model_merging. + +
+
+ comment: CVPR2024 accepted +
+
+
+
+
+ + ♻ ☆ Adaptive Multi-Modal Cross-Entropy Loss for Stereo Matching + + +
+ Despite the great success of deep learning in stereo matching, recovering +accurate disparity maps is still challenging. Currently, L1 and cross-entropy +are the two most widely used losses for stereo network training. Compared with +the former, the latter usually performs better thanks to its probability +modeling and direct supervision to the cost volume. However, how to accurately +model the stereo ground-truth for cross-entropy loss remains largely +under-explored. Existing works simply assume that the ground-truth +distributions are uni-modal, which ignores the fact that most of the edge +pixels can be multi-modal. In this paper, a novel adaptive multi-modal +cross-entropy loss (ADL) is proposed to guide the networks to learn different +distribution patterns for each pixel. Moreover, we optimize the disparity +estimator to further alleviate the bleeding or misalignment artifacts in +inference. Extensive experimental results show that our method is generic and +can help classic stereo networks regain state-of-the-art performance. In +particular, GANet with our method ranks $1^{st}$ on both the KITTI 2015 and +2012 benchmarks among the published methods. Meanwhile, excellent +synthetic-to-realistic generalization performance can be achieved by simply +replacing the traditional loss with ours. + +
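A sketch of cross-entropy supervision on a stereo cost volume, as contrasted with L1 above. How ADL builds its adaptive multi-modal target distribution is not reproduced here; the target is assumed to be given.

```python
import torch
import torch.nn.functional as F

def disparity_cross_entropy(cost_logits: torch.Tensor, target_dist: torch.Tensor) -> torch.Tensor:
    """Per-pixel cross-entropy over D disparity bins.
    cost_logits: (B, D, H, W) network scores; target_dist: (B, D, H, W)
    ground-truth distribution (uni-modal in prior work, multi-modal at edges in ADL)."""
    log_p = F.log_softmax(cost_logits, dim=1)
    return -(target_dist * log_p).sum(dim=1).mean()
```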
+
+
+
+
+ + ♻ ☆ P-Mamba: Marrying Perona Malik Diffusion with Mamba for Efficient + Pediatric Echocardiographic Left Ventricular Segmentation + + +
+ In pediatric cardiology, the accurate and immediate assessment of cardiac +function through echocardiography is important since it can determine whether +urgent intervention is required in many emergencies. However, echocardiography +is characterized by ambiguity and heavy background noise interference, bringing +more difficulty to accurate segmentation. Present methods lack efficiency and +are also prone to mistakenly segmenting some background noise areas as the left +ventricular area due to noise disturbance. To relieve the two issues, we +introduce P-Mamba for efficient pediatric echocardiographic left ventricular +segmentation. Specifically, we turn to the recently proposed vision mamba +layers in our vision mamba encoder branch to improve the computing and memory +efficiency of our model while modeling global dependencies. In the other +DWT-based PMD encoder branch, we devise DWT-based Perona-Malik Diffusion (PMD) +Blocks that utilize PMD for noise suppression, while simultaneously preserving +the local shape cues of the left ventricle. Leveraging the strengths of both +the two encoder branches, P-Mamba achieves superior accuracy and efficiency to +established models, such as vision transformers with quadratic and linear +computational complexity. This innovative approach promises significant +advancements in pediatric cardiac imaging and beyond. + +
+
+
+
+
+ + ♻ ☆ Exploring the Capability of Text-to-Image Diffusion Models with + Structural Edge Guidance for Multi-Spectral Satellite Image Inpainting + + +
+ This letter investigates the utility of text-to-image inpainting models for +satellite image data. Two technical challenges of injecting structural guiding +signals into the generative process as well as translating the inpainted RGB +pixels to a wider set of MSI bands are addressed by introducing a novel +inpainting framework based on StableDiffusion and ControlNet as well as a novel +method for RGB-to-MSI translation. The results on a wider set of data suggest +that the inpainting synthesized via StableDiffusion suffers from undesired +artifacts and that a simple alternative of self-supervised internal inpainting +achieves a higher quality of synthesis. + 
+
+
+
+
+ + ♻ ☆ ViT-CoMer: Vision Transformer with Convolutional Multi-scale Feature + Interaction for Dense Predictions CVPR2024 + + +
+ Although Vision Transformer (ViT) has achieved significant success in +computer vision, it does not perform well in dense prediction tasks due to the +lack of inner-patch information interaction and the limited diversity of +feature scale. Most existing studies are devoted to designing vision-specific +transformers to solve the above problems, which introduce additional +pre-training costs. Therefore, we present a plain, pre-training-free, and +feature-enhanced ViT backbone with Convolutional Multi-scale feature +interaction, named ViT-CoMer, which facilitates bidirectional interaction +between CNN and transformer. Compared to the state-of-the-art, ViT-CoMer has +the following advantages: (1) We inject spatial pyramid multi-receptive field +convolutional features into the ViT architecture, which effectively alleviates +the problems of limited local information interaction and single-feature +representation in ViT. (2) We propose a simple and efficient CNN-Transformer +bidirectional fusion interaction module that performs multi-scale fusion across +hierarchical features, which is beneficial for handling dense prediction tasks. +(3) We evaluate the performance of ViT-CoMer across various dense prediction +tasks, different frameworks, and multiple advanced pre-training. Notably, our +ViT-CoMer-L achieves 64.3% AP on COCO val2017 without extra training data, and +62.1% mIoU on ADE20K val, both of which are comparable to state-of-the-art +methods. We hope ViT-CoMer can serve as a new backbone for dense prediction +tasks to facilitate future research. The code will be released at +https://github.com/Traffic-X/ViT-CoMer. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ Follow-Up Differential Descriptions: Language Models Resolve Ambiguities + for Image Classification ICLR 2024 + + +
+ A promising approach for improving the performance of vision-language models +like CLIP for image classification is to extend the class descriptions (i.e., +prompts) with related attributes, e.g., using brown sparrow instead of sparrow. +However, current zero-shot methods select a subset of attributes regardless of +commonalities between the target classes, potentially providing no useful +information that would have helped to distinguish between them. For instance, +they may use color instead of bill shape to distinguish between sparrows and +wrens, which are both brown. We propose Follow-up Differential Descriptions +(FuDD), a zero-shot approach that tailors the class descriptions to each +dataset and leads to additional attributes that better differentiate the target +classes. FuDD first identifies the ambiguous classes for each image, and then +uses a Large Language Model (LLM) to generate new class descriptions that +differentiate between them. The new class descriptions resolve the initial +ambiguity and help predict the correct label. In our experiments, FuDD +consistently outperforms generic description ensembles and naive LLM-generated +descriptions on 12 datasets. We show that differential descriptions are an +effective tool to resolve class ambiguities, which otherwise significantly +degrade the performance. We also show that high quality natural language class +descriptions produced by FuDD result in comparable performance to few-shot +adaptation methods. + +
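A hedged sketch of the two-round FuDD loop with hypothetical helpers: `score_fn(image, prompts)` returns one similarity score per prompt (e.g. a CLIP wrapper) and `describe_fn(target, others)` asks an LLM for descriptions that differentiate `target` from the other ambiguous classes. Neither helper is part of the released code; they stand in for the paper's components.

```python
def fudd_classify(image, class_names, score_fn, describe_fn, k=3):
    """Classify with generic prompts, then re-rank the top-k ambiguous classes
    using LLM-generated differential descriptions."""
    # Round 1: generic prompts to find the ambiguous classes for this image.
    base = [f"a photo of a {c}" for c in class_names]
    scores = score_fn(image, base)
    ambiguous = sorted(range(len(class_names)), key=lambda i: -scores[i])[:k]
    # Round 2: differential descriptions for the ambiguous classes only.
    best_cls, best_score = None, float("-inf")
    for i in ambiguous:
        others = [class_names[j] for j in ambiguous if j != i]
        descs = describe_fn(class_names[i], others)   # list of description strings
        s = max(score_fn(image, descs))
        if s > best_score:
            best_cls, best_score = class_names[i], s
    return best_cls
```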
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ StyleTalker: One-shot Style-based Audio-driven Talking Head Video + Generation + + +
+ We propose StyleTalker, a novel audio-driven talking head generation model +that can synthesize a video of a talking person from a single reference image +with accurately audio-synced lip shapes, realistic head poses, and eye blinks. +Specifically, by leveraging a pretrained image generator and an image encoder, +we estimate the latent codes of the talking head video that faithfully reflects +the given audio. This is made possible with several newly devised components: +1) A contrastive lip-sync discriminator for accurate lip synchronization, 2) A +conditional sequential variational autoencoder that learns the latent motion +space disentangled from the lip movements, such that we can independently +manipulate the motions and lip movements while preserving the identity. 3) An +auto-regressive prior augmented with normalizing flow to learn a complex +audio-to-motion multi-modal latent space. Equipped with these components, +StyleTalker can generate talking head videos not only in a motion-controllable +way when another motion source video is given but also in a completely +audio-driven manner by inferring realistic motions from the input audio. +Through extensive experiments and user studies, we show that our model is able +to synthesize talking head videos with impressive perceptual quality which are +accurately lip-synced with the input audios, largely outperforming +state-of-the-art baselines. + +
+
+
+
+
+ + ♻ ☆ Fully Differentiable Correlation-driven 2D/3D Registration for X-ray to + CT Image Fusion + + +
+ Image-based rigid 2D/3D registration is a critical technique for fluoroscopic +guided surgical interventions. In recent years, some learning-based fully +differentiable methods have produced beneficial outcomes while the process of +feature extraction and gradient flow transmission still lacks controllability +and interpretability. To alleviate these problems, in this work, we propose a +novel fully differentiable correlation-driven network using a dual-branch +CNN-transformer encoder which enables the network to extract and separate +low-frequency global features from high-frequency local features. A +correlation-driven loss is further proposed for low-frequency feature and +high-frequency feature decomposition based on embedded information. Besides, a +training strategy that learns to approximate a convex-shape similarity function +is applied in our work. We test our approach on an in-house dataset and show that +it outperforms both existing fully differentiable learning-based registration +approaches and the conventional optimization-based baseline. + 
+
+ comment: ISBI 2024 +
+
+
+
+
+ + ♻ ☆ Finetuning Text-to-Image Diffusion Models for Fairness ICLR 2024 + + +
+ The rapid adoption of text-to-image diffusion models in society underscores +an urgent need to address their biases. Without interventions, these biases +could propagate a skewed worldview and restrict opportunities for minority +groups. In this work, we frame fairness as a distributional alignment problem. +Our solution consists of two main technical contributions: (1) a distributional +alignment loss that steers specific characteristics of the generated images +towards a user-defined target distribution, and (2) adjusted direct finetuning +of diffusion model's sampling process (adjusted DFT), which leverages an +adjusted gradient to directly optimize losses defined on the generated images. +Empirically, our method markedly reduces gender, racial, and their +intersectional biases for occupational prompts. Gender bias is significantly +reduced even when finetuning just five soft tokens. Crucially, our method +supports diverse perspectives of fairness beyond absolute equality, which is +demonstrated by controlling age to a $75\%$ young and $25\%$ old distribution +while simultaneously debiasing gender and race. Finally, our method is +scalable: it can debias multiple concepts at once by simply including these +prompts in the finetuning data. We share code and various fair diffusion model +adaptors at https://sail-sg.github.io/finetune-fair-diffusion/. + +
+
+ comment: ICLR 2024 oral presentation +
+
+
+
+
+ + ♻ ☆ Accurate Segmentation of Optic Disc And Cup from Multiple Pseudo-labels + by Noise-aware Learning SC + + +
+ Optic disc and cup segmentation plays a crucial role in automating the +screening and diagnosis of optic glaucoma. While data-driven convolutional +neural networks (CNNs) show promise in this area, the inherent ambiguity of +segmenting objects and background boundaries in the task of optic disc and cup +segmentation leads to noisy annotations that impact model performance. To +address this, we propose an innovative label-denoising method of Multiple +Pseudo-labels Noise-aware Network (MPNN) for accurate optic disc and cup +segmentation. Specifically, the Multiple Pseudo-labels Generation and Guided +Denoising (MPGGD) module generates pseudo-labels by multiple different +initialization networks trained on true labels, and the pixel-level consensus +information extracted from these pseudo-labels guides to differentiate clean +pixels from noisy pixels. The training framework of the MPNN is constructed by +a teacher-student architecture to learn segmentation from clean pixels and +noisy pixels. Particularly, such a framework adeptly leverages (i) reliable and +fundamental insight from clean pixels and (ii) the supplementary knowledge +within noisy pixels via multiple perturbation-based unsupervised consistency. +Compared to other label-denoising methods, comprehensive experimental results +on the RIGA dataset demonstrate our method's excellent performance. The code is +available at https://github.com/wwwtttjjj/MPNN + +
+
+ comment: CSCWD 2024 +
+
+
+
+
+ + ♻ ☆ Animatable Gaussians: Learning Pose-dependent Gaussian Maps for + High-fidelity Human Avatar Modeling CVPR 2024 + + +
+ Modeling animatable human avatars from RGB videos is a long-standing and +challenging problem. Recent works usually adopt MLP-based neural radiance +fields (NeRF) to represent 3D humans, but it remains difficult for pure MLPs to +regress pose-dependent garment details. To this end, we introduce Animatable +Gaussians, a new avatar representation that leverages powerful 2D CNNs and 3D +Gaussian splatting to create high-fidelity avatars. To associate 3D Gaussians +with the animatable avatar, we learn a parametric template from the input +videos, and then parameterize the template on two front \& back canonical +Gaussian maps where each pixel represents a 3D Gaussian. The learned template +is adaptive to the wearing garments for modeling looser clothes like dresses. +Such template-guided 2D parameterization enables us to employ a powerful +StyleGAN-based CNN to learn the pose-dependent Gaussian maps for modeling +detailed dynamic appearances. Furthermore, we introduce a pose projection +strategy for better generalization given novel poses. Overall, our method can +create lifelike avatars with dynamic, realistic and generalized appearances. +Experiments show that our method outperforms other state-of-the-art approaches. +Code: https://github.com/lizhe00/AnimatableGaussians + +
+
+ comment: Accepted by CVPR 2024, Projectpage: + https://animatable-gaussians.github.io/, Code: + https://github.com/lizhe00/AnimatableGaussians +
+
+
+
+
+ + ♻ ☆ Bridging Implicit and Explicit Geometric Transformation for Single-Image + View Synthesis + + +
+ Creating novel views from a single image has made tremendous strides with +advanced autoregressive models, as unseen regions have to be inferred from the +visible scene contents. Although recent methods generate high-quality novel +views, synthesizing with only one explicit or implicit 3D geometry has a +trade-off between two objectives that we call the "seesaw" problem: 1) +preserving reprojected contents and 2) completing realistic out-of-view +regions. Also, autoregressive models require a considerable computational cost. +In this paper, we propose a single-image view synthesis framework for +mitigating the seesaw problem while utilizing an efficient non-autoregressive +model. Motivated by the observation that explicit methods preserve reprojected +pixels well and implicit methods complete realistic out-of-view regions, +we introduce a loss function that makes the two renderers complement each other. Our loss function +encourages explicit features to improve the reprojected area of implicit +features and implicit features to improve the out-of-view area of explicit +features. With the proposed architecture and loss function, we can alleviate +the seesaw problem, outperforming autoregressive-based state-of-the-art methods +and generating an image $\approx$100 times faster. We validate the efficiency +and effectiveness of our method with experiments on RealEstate10K and ACID +datasets. + 
+
+ comment: TPAMI 2024 +
+
+
+
+
+ + ♻ ☆ Image Demoireing in RAW and sRGB Domains + + +
+ Moire patterns frequently appear when capturing screens with smartphones or +cameras, potentially compromising image quality. Previous studies suggest that +moire pattern elimination in the RAW domain offers greater effectiveness +compared to demoireing in the sRGB domain. Nevertheless, relying solely on RAW +data for image demoireing is insufficient in mitigating the color cast due to +the absence of essential information required for the color correction by the +image signal processor (ISP). In this paper, we propose to jointly utilize both +RAW and sRGB data for image demoireing (RRID), which are readily accessible in +modern smartphones and DSLR cameras. We develop Skip-Connection-based +Demoireing Module (SCDM) with Gated Feedback Module (GFM) and Frequency +Selection Module (FSM) embedded in skip-connections for the efficient and +effective demoireing of RAW and sRGB features, respectively. Subsequently, we +design a RGB Guided ISP (RGISP) to learn a device-dependent ISP, assisting the +process of color recovery. Extensive experiments demonstrate that our RRID +outperforms state-of-the-art approaches, in terms of the performance in moire +pattern removal and color cast correction by 0.62dB in PSNR and 0.003 in SSIM. + +
+
+
+
+
+ + ♻ ☆ UniHDA: A Unified and Versatile Framework for Multi-Modal Hybrid Domain + Adaptation + + +
+ Recently, generative domain adaptation has achieved remarkable progress, +enabling us to adapt a pre-trained generator to a new target domain. However, +existing methods simply adapt the generator to a single target domain and are +limited to a single modality, either text-driven or image-driven. Moreover, +they cannot maintain consistency with the source domain well, which impedes the +inheritance of its diversity. In this paper, we propose UniHDA, a +\textbf{unified} and \textbf{versatile} framework for generative hybrid domain +adaptation with multi-modal references from multiple domains. We use a CLIP +encoder to project multi-modal references into a unified embedding space and +then linearly interpolate the direction vectors from multiple target domains to +achieve hybrid domain adaptation. To ensure \textbf{consistency} with the +source domain, we propose a novel cross-domain spatial structure (CSS) loss +that maintains detailed spatial structure information between the source and target +generator. Experiments show that the adapted generator can synthesise realistic +images with various attribute compositions. Additionally, our framework is +generator-agnostic and applicable to multiple generators, e.g., StyleGAN, EG3D, +and Diffusion Models. + 
+
+
+
+
+ + ♻ ☆ DD-VNB: A Depth-based Dual-Loop Framework for Real-time Visually + Navigated Bronchoscopy + + +
+ Real-time 6 DOF localization of bronchoscopes is crucial for enhancing +intervention quality. However, current vision-based technologies struggle to +balance between generalization to unseen data and computational speed. In this +study, we propose a Depth-based Dual-Loop framework for real-time Visually +Navigated Bronchoscopy (DD-VNB) that can generalize across patient cases +without the need of re-training. The DD-VNB framework integrates two key +modules: depth estimation and dual-loop localization. To address the domain gap +among patients, we propose a knowledge-embedded depth estimation network that +maps endoscope frames to depth, ensuring generalization by eliminating +patient-specific textures. The network embeds view synthesis knowledge into a +cycle adversarial architecture for scale-constrained monocular depth +estimation. For real-time performance, our localization module embeds a fast +ego-motion estimation network into the loop of depth registration. The +ego-motion inference network estimates the pose change of the bronchoscope in +high frequency while depth registration against the pre-operative 3D model +provides absolute pose periodically. Specifically, the relative pose changes +are fed into the registration process as the initial guess to boost its +accuracy and speed. Experiments on phantom and in-vivo data from patients +demonstrate the effectiveness of our framework: 1) monocular depth estimation +outperforms SOTA, 2) localization achieves an accuracy of Absolute Tracking +Error (ATE) of 4.7 $\pm$ 3.17 mm in phantom and 6.49 $\pm$ 3.88 mm in patient +data, 3) with a frame-rate approaching video capture speed, 4) without the +necessity of case-wise network retraining. The framework's superior speed and +accuracy demonstrate its promising clinical potential for real-time +bronchoscopic navigation. + +
+
+
+
+
+ + ♻ ☆ CAMixerSR: Only Details Need More "Attention" CVPR 2024 + + +
+ To satisfy the rapidly increasing demands on the large image (2K-8K) +super-resolution (SR), prevailing methods follow two independent tracks: 1) +accelerate existing networks by content-aware routing, and 2) design better +super-resolution networks via token mixer refining. Despite directness, they +encounter unavoidable defects (e.g., inflexible route or non-discriminative +processing) limiting further improvements of quality-complexity trade-off. To +erase the drawbacks, we integrate these schemes by proposing a content-aware +mixer (CAMixer), which assigns convolution for simple contexts and additional +deformable window-attention for sparse textures. Specifically, the CAMixer uses +a learnable predictor to generate multiple bootstraps, including offsets for +windows warping, a mask for classifying windows, and convolutional attentions +for endowing convolution with the dynamic property, which modulates attention +to include more useful textures self-adaptively and improves the representation +capability of convolution. We further introduce a global classification loss to +improve the accuracy of predictors. By simply stacking CAMixers, we obtain +CAMixerSR which achieves superior performance on large-image SR, lightweight +SR, and omnidirectional-image SR. + +
+
+ comment: Accepted by CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Uncovering the Over-smoothing Challenge in Image Super-Resolution: + Entropy-based Quantification and Contrastive Optimization + + +
+ PSNR-oriented models are a critical class of super-resolution models with +applications across various fields. However, these models tend to generate +over-smoothed images, a problem that has been analyzed previously from the +perspectives of models or loss functions, but without taking into account the +impact of data properties. In this paper, we present a novel phenomenon that we +term the center-oriented optimization (COO) problem, where a model's output +converges towards the center point of similar high-resolution images, rather +than towards the ground truth. We demonstrate that the strength of this problem +is related to the uncertainty of data, which we quantify using entropy. We +prove that as the entropy of high-resolution images increases, their center +point will move further away from the clean image distribution, and the model +will generate over-smoothed images. Perceptual-driven approaches, such as perceptual loss, model structure +optimization, or GAN-based methods, can be viewed as implicitly optimizing the COO problem. We propose an explicit +solution to the COO problem, called Detail Enhanced Contrastive Loss (DECLoss). +DECLoss utilizes the clustering property of contrastive learning to directly +reduce the variance of the potential high-resolution distribution and thereby +decrease the entropy. We evaluate DECLoss on multiple super-resolution +benchmarks and demonstrate that it improves the perceptual quality of +PSNR-oriented models. Moreover, when applied to GAN-based methods, such as +RaGAN, DECLoss helps to achieve state-of-the-art performance, such as 0.093 +LPIPS with 24.51 PSNR on 4x downsampled Urban100, validating the effectiveness +and generalization of our approach. + 
+
+ comment: Accepted in IEEE Transactions on Pattern Analysis and Machine + Intelligence +
+
+
+
+
+ + ♻ ☆ Reliable Spatial-Temporal Voxels For Multi-Modal Test-Time Adaptation + + +
+ Multi-modal test-time adaptation (MM-TTA) is proposed to adapt models to an
+unlabeled target domain by leveraging complementary multi-modal inputs in an
+online manner. Previous MM-TTA methods rely on predictions of cross-modal
+information in each input frame, but they ignore the fact that predictions of
+geometric neighborhoods within consecutive frames are highly correlated,
+leading to unstable predictions across time. To fill this gap, we propose
+ReLiable Spatial-temporal Voxels (Latte), an MM-TTA method that leverages
+reliable cross-modal spatial-temporal correspondences for multi-modal 3D
+segmentation. Motivated by the fact that reliable predictions should be
+consistent with their spatial-temporal correspondences, Latte aggregates
+consecutive frames in a sliding-window manner and constructs spatial-temporal
+(ST) voxels to capture temporally local prediction consistency for each
+modality. After filtering out ST voxels with high ST entropy, Latte conducts
+cross-modal learning for each point and pixel by attending to those with
+reliable and consistent predictions within both spatial and temporal
+neighborhoods. Experimental results show that Latte achieves state-of-the-art
+performance on three different MM-TTA benchmarks compared to previous MM-TTA
+or TTA methods.
+
+
+
+
+
+
+ + ♻ ☆ TextMonkey: An OCR-Free Large Multimodal Model for Understanding + Document + + +
+ We present TextMonkey, a large multimodal model (LMM) tailored for
+text-centric tasks. Our approach introduces enhancements across several
+dimensions: by adopting Shifted Window Attention with zero-initialization, we
+achieve cross-window connectivity at higher input resolutions and stabilize
+early training; we hypothesize that images may contain redundant tokens, and by
+using similarity to filter them out while retaining the significant ones, we
+can not only shorten the token sequence but also enhance the model's
+performance. Moreover, by expanding our model's capabilities to encompass text
+spotting and grounding, and by incorporating positional information into
+responses, we enhance interpretability. The model also learns to perform
+screenshot tasks through finetuning. Evaluation on 12 benchmarks shows notable
+improvements: 5.2% in Scene Text-Centric tasks (including STVQA, TextVQA, and
+OCRVQA), 6.9% in Document-Oriented tasks (such as DocVQA, InfoVQA, ChartVQA,
+DeepForm, Kleister Charity, and WikiTableQuestions), and 2.8% in Key
+Information Extraction tasks (comprising FUNSD, SROIE, and POIE). It also
+achieves a 10.9% increase in scene text spotting and sets a new standard on
+OCRBench, a comprehensive benchmark consisting of 29 OCR-related assessments,
+with a score of 561, surpassing previous open-source large multimodal models
+for document understanding. Code will be released at
+https://github.com/Yuliang-Liu/Monkey.
+
+
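For readers curious what similarity-based token pruning can look like, here is a minimal greedy sketch in the spirit of the idea above (not TextMonkey's actual token resampler): a visual token is dropped when it is highly similar to one that has already been kept. The cosine threshold and the token dimensions are arbitrary assumptions.

```python
import torch
import torch.nn.functional as F

def prune_redundant_tokens(tokens: torch.Tensor, sim_threshold: float = 0.9):
    """Greedy redundancy filter over an (N, D) token sequence.

    A token is kept only if its cosine similarity to every previously kept
    token stays below `sim_threshold`.  Returns the kept tokens and their
    indices, so positional information can still be attached afterwards.
    """
    normed = F.normalize(tokens, dim=-1)
    kept_idx = [0]                                   # always keep the first token
    for i in range(1, tokens.shape[0]):
        sims = normed[i] @ normed[kept_idx].T        # similarity to the kept set
        if sims.max() < sim_threshold:
            kept_idx.append(i)
    kept_idx = torch.tensor(kept_idx, dtype=torch.long)
    return tokens[kept_idx], kept_idx

# Example: 256 visual tokens of width 64.
toks = torch.randn(256, 64)
kept, idx = prune_redundant_tokens(toks, sim_threshold=0.9)
print(kept.shape, idx[:10])
```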
+
+
+
+
+ + ♻ ☆ Energy-based Automated Model Evaluation ICLR2024 + + +
+ Conventional evaluation protocols for machine learning models rely heavily
+on a labeled, i.i.d.-assumed test dataset, which is often unavailable in
+real-world applications. Automated Model Evaluation (AutoEval) offers an
+alternative to this traditional workflow by forming a proximal prediction
+pipeline for test performance without the presence of ground-truth labels.
+Despite its recent successes, AutoEval frameworks still suffer from an
+overconfidence issue as well as substantial storage and computational costs. In
+that regard, we propose a novel measure -- Meta-Distribution Energy (MDE) --
+that allows the AutoEval framework to be both more efficient and more
+effective. The core of MDE is to establish a meta-distribution statistic over
+the information (energy) associated with individual samples, and then to offer
+a smoother representation enabled by energy-based learning. We further provide
+theoretical insights by connecting the MDE with the classification loss. We
+provide extensive experiments across modalities, datasets, and architectural
+backbones to validate MDE's effectiveness, together with its superiority over
+prior approaches. We also demonstrate MDE's versatility by showing its seamless
+integration with large-scale models and its easy adaptation to learning
+scenarios with noisy or imbalanced labels. Code and data are available:
+https://github.com/pengr/Energy_AutoEval
+
+
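For readers unfamiliar with energy scores, the sketch below computes the standard logit-based energy for each unlabeled sample and summarizes the resulting distribution, which is the flavor of statistic an AutoEval method like MDE aggregates into a performance predictor. The temperature, the summary statistics, and the loader interface are illustrative assumptions, not the paper's exact estimator.

```python
import torch

@torch.no_grad()
def energy_scores(logits: torch.Tensor, temperature: float = 1.0):
    """Standard energy score E(x) = -T * logsumexp(logits / T, dim=-1)."""
    return -temperature * torch.logsumexp(logits / temperature, dim=-1)

@torch.no_grad()
def meta_energy_statistic(model, unlabeled_loader, device="cpu"):
    """Summarize the energy distribution over an unlabeled test set.

    A regressor fit on (statistic, accuracy) pairs from meta-sets would then
    map this summary to a predicted accuracy.  The loader is assumed to yield
    plain image batches.
    """
    scores = []
    for x in unlabeled_loader:
        logits = model(x.to(device))
        scores.append(energy_scores(logits).cpu())
    scores = torch.cat(scores)
    return {"mean": scores.mean().item(), "std": scores.std().item()}
```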
+
+ comment: ICLR2024 poster paper +
+
+
+
+
+ + ♻ ☆ OccFiner: Offboard Occupancy Refinement with Hybrid Propagation + + +
+ Vision-based occupancy prediction, also known as 3D Semantic Scene Completion +(SSC), presents a significant challenge in computer vision. Previous methods, +confined to onboard processing, struggle with simultaneous geometric and +semantic estimation, continuity across varying viewpoints, and single-view +occlusion. Our paper introduces OccFiner, a novel offboard framework designed +to enhance the accuracy of vision-based occupancy predictions. OccFiner +operates in two hybrid phases: 1) a multi-to-multi local propagation network +that implicitly aligns and processes multiple local frames for correcting +onboard model errors and consistently enhancing occupancy accuracy across all +distances. 2) the region-centric global propagation, focuses on refining labels +using explicit multi-view geometry and integrating sensor bias, especially to +increase the accuracy of distant occupied voxels. Extensive experiments +demonstrate that OccFiner improves both geometric and semantic accuracy across +various types of coarse occupancy, setting a new state-of-the-art performance +on the SemanticKITTI dataset. Notably, OccFiner elevates vision-based SSC +models to a level even surpassing that of LiDAR-based onboard SSC models. + +
+
+
+
+
+ + ♻ ☆ Discrete approximations of Gaussian smoothing and Gaussian derivatives + + +
+ This paper develops an in-depth treatment of the problem of approximating
+the Gaussian smoothing and Gaussian derivative computations in scale-space
+theory for application to discrete data. With close connections to previous
+axiomatic treatments of continuous and discrete scale-space theory, we consider
+three main ways of discretizing these scale-space operations in terms of
+explicit discrete convolutions, based on either (i) sampling the Gaussian
+kernels and the Gaussian derivative kernels, (ii) locally integrating the
+Gaussian kernels and the Gaussian derivative kernels over each pixel support
+region, or (iii) basing the scale-space analysis on the discrete analogue of
+the Gaussian kernel, and then computing derivative approximations by applying
+small-support central difference operators to the spatially smoothed image
+data.
+ We study the properties of these three main discretization methods both
+theoretically and experimentally, and characterize their performance by
+quantitative measures, including the results they give rise to with respect to
+the task of scale selection, investigated for four different use cases, and
+with an emphasis on the behaviour at fine scales. The results show that the
+sampled Gaussian kernels and derivatives, as well as the integrated Gaussian
+kernels and derivatives, perform very poorly at very fine scales. At very fine
+scales, the discrete analogue of the Gaussian kernel with its corresponding
+discrete derivative approximations performs substantially better. On the other
+hand, the sampled Gaussian kernel and the sampled Gaussian derivatives lead to
+numerically very good approximations of the corresponding continuous results
+when the scale parameter is sufficiently large; in the experiments presented in
+the paper, this holds when the scale parameter is greater than about 1, in
+units of the grid spacing.
+
+
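Two of the discretizations above are easy to compare numerically. The sketch below builds (i) the sampled Gaussian kernel and (iii) the discrete analogue of the Gaussian kernel, T(n, t) = e^{-t} I_n(t) with I_n the modified Bessel function, which `scipy.special.ive` provides directly. At small scales the sampled kernel does not sum to one, while the discrete analogue does by construction (up to truncation). This is only a hedged illustration of the kernels, not the paper's full experimental protocol.

```python
import numpy as np
from scipy.special import ive   # exponentially scaled modified Bessel function I_n

def sampled_gaussian(sigma, radius):
    n = np.arange(-radius, radius + 1)
    return np.exp(-n**2 / (2.0 * sigma**2)) / (np.sqrt(2.0 * np.pi) * sigma)

def discrete_gaussian(sigma, radius):
    # Discrete analogue of the Gaussian kernel: T(n, t) = exp(-t) * I_n(t), t = sigma^2.
    t = sigma**2
    n = np.arange(-radius, radius + 1)
    return ive(n, t)            # ive(n, t) = exp(-t) * I_n(t) for t > 0

for sigma in (0.5, 1.0, 2.0):
    s, d = sampled_gaussian(sigma, 10), discrete_gaussian(sigma, 10)
    print(f"sigma={sigma}: sum(sampled)={s.sum():.4f}  sum(discrete)={d.sum():.4f}")
```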
+
+ comment: 40 pages, 21 figures +
+
+
+
+
+ + ♻ ☆ DragAnything: Motion Control for Anything using Entity Representation + + +
+ We introduce DragAnything, which utilizes an entity representation to achieve
+motion control for any object in controllable video generation. Compared to
+existing motion control methods, DragAnything offers several advantages.
+Firstly, trajectory-based interaction is more user-friendly, since acquiring
+other guidance signals (e.g., masks, depth maps) is labor-intensive; users only
+need to draw a line (trajectory) during interaction. Secondly, our entity
+representation serves as an open-domain embedding capable of representing any
+object, enabling motion control for diverse entities, including the background.
+Lastly, our entity representation allows simultaneous and distinct motion
+control for multiple objects. Extensive experiments demonstrate that
+DragAnything achieves state-of-the-art performance in terms of FVD, FID, and
+user study, particularly for object motion control, where our method surpasses
+previous methods (e.g., DragNUWA) by 26% in human voting.
+
+
+
+ comment: The project website is at: + https://weijiawu.github.io/draganything_page/ . The code is at: + https://github.com/showlab/DragAnything +
+
+
+
+
+ + ♻ ☆ Creating and Leveraging a Synthetic Dataset of Cloud Optical Thickness + Measures for Cloud Detection in MSI + + +
+ Cloud formations often obscure optical satellite-based monitoring of the +Earth's surface, thus limiting Earth observation (EO) activities such as land +cover mapping, ocean color analysis, and cropland monitoring. The integration +of machine learning (ML) methods within the remote sensing domain has +significantly improved performance on a wide range of EO tasks, including cloud +detection and filtering, but there is still much room for improvement. A key +bottleneck is that ML methods typically depend on large amounts of annotated +data for training, which is often difficult to come by in EO contexts. This is +especially true when it comes to cloud optical thickness (COT) estimation. A +reliable estimation of COT enables more fine-grained and application-dependent +control compared to using pre-specified cloud categories, as is commonly done +in practice. To alleviate the COT data scarcity problem, in this work we +propose a novel synthetic dataset for COT estimation, that we subsequently +leverage for obtaining reliable and versatile cloud masks on real data. In our +dataset, top-of-atmosphere radiances have been simulated for 12 of the spectral +bands of the Multispectral Imagery (MSI) sensor onboard Sentinel-2 platforms. +These data points have been simulated under consideration of different cloud +types, COTs, and ground surface and atmospheric profiles. Extensive +experimentation of training several ML models to predict COT from the measured +reflectivity of the spectral bands demonstrates the usefulness of our proposed +dataset. In particular, by thresholding COT estimates from our ML models, we +show on two satellite image datasets (one that is publicly available, and one +which we have collected and annotated) that reliable cloud masks can be +obtained. The synthetic data, the collected real dataset, code and models have +been made publicly available at +https://github.com/aleksispi/ml-cloud-opt-thick. + +
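The application-dependent control mentioned in the abstract above boils down to thresholding: once a model predicts per-pixel cloud optical thickness, any COT cut-off yields a binary cloud mask, and the cut-off can be tuned per application. The threshold value below is an arbitrary example, not one recommended by the paper.

```python
import numpy as np

def cot_to_cloud_mask(cot_map: np.ndarray, threshold: float = 1.0) -> np.ndarray:
    """Turn a per-pixel cloud-optical-thickness map into a binary cloud mask.

    Lower thresholds give more conservative (cloud-heavy) masks; higher
    thresholds only flag optically thick clouds.
    """
    return (cot_map >= threshold).astype(np.uint8)

# Example on a synthetic 4x4 COT map.
cot = np.array([[0.0, 0.2, 1.5, 8.0],
                [0.1, 0.9, 2.5, 0.0],
                [3.0, 0.4, 0.0, 0.3],
                [0.0, 5.5, 1.1, 0.2]])
print(cot_to_cloud_mask(cot, threshold=1.0))
```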
+
+ comment: Published in the journal Remote Sensing (2024). Code, data and models + available at https://github.com/aleksispi/ml-cloud-opt-thick +
+
+
+
+
+ + ♻ ☆ Diffusion Models Trained with Large Data Are Transferable Visual Models + + +
+ We show that simply initializing image understanding models with a
+pre-trained UNet (or transformer) from diffusion models makes it possible to
+achieve remarkable transfer performance on fundamental vision perception tasks
+using a moderate amount of target data (even synthetic data only), including
+monocular depth, surface normal estimation, image segmentation, matting, and
+human pose estimation, among many others. Previous works have adapted diffusion
+models for various perception tasks, often reformulating these tasks as
+generation processes to align with the diffusion process. In sharp contrast, we
+demonstrate that fine-tuning these models with minimal adjustments can be a
+more effective alternative, offering the advantages of being embarrassingly
+simple and significantly faster. As the backbone network of Stable Diffusion
+models is trained on giant datasets comprising billions of images, we observe
+very robust generalization capabilities of the diffusion backbone. Experimental
+results showcase the remarkable transferability of the backbone of diffusion
+models across diverse tasks and real-world datasets.
+
+
+
+
+
+
+ + ♻ ☆ Kosmos-G: Generating Images in Context with Multimodal Large Language + Models + + +
+ Recent advancements in subject-driven image generation have made significant +strides. However, current methods still fall short in diverse application +scenarios, as they require test-time tuning and cannot accept interleaved +multi-image and text input. These limitations keep them far from the ultimate +goal of "image as a foreign language in image generation." This paper presents +Kosmos-G, a model that leverages the advanced multimodal perception +capabilities of Multimodal Large Language Models (MLLMs) to tackle the +aforementioned challenge. Our approach aligns the output space of MLLM with +CLIP using the textual modality as an anchor and performs compositional +instruction tuning on curated data. Kosmos-G demonstrates an impressive +capability of zero-shot subject-driven generation with interleaved multi-image +and text input. Notably, the score distillation instruction tuning requires no +modifications to the image decoder. This allows for a seamless substitution of +CLIP and effortless integration with a myriad of U-Net techniques ranging from +fine-grained controls to personalized image decoder variants. We posit Kosmos-G +as an initial attempt towards the goal of "image as a foreign language in image +generation." The code can be found at https://aka.ms/Kosmos-G + +
+
+ comment: Code: https://aka.ms/Kosmos-G Project Page: + https://xichenpan.github.io/kosmosg +
+
+
+
+
+ + ♻ ☆ Retrieving Conditions from Reference Images for Diffusion Models + + +
+ Newly developed diffusion-based techniques have showcased phenomenal
+abilities in producing a wide range of high-quality images, sparking
+considerable interest in various applications. A prevalent scenario is to
+generate new images based on a subject from reference images. This subject
+could be face identity for styled avatars, body and clothing for virtual
+try-on, and so on. Satisfying this requirement is evolving into a field called
+Subject-Driven Generation. In this paper, we consider Subject-Driven Generation
+as a unified retrieval problem with diffusion models. We introduce a novel
+diffusion model architecture, named RetriNet, designed to address these
+problems by precisely retrieving subject attributes from reference images and
+filtering out irrelevant information. RetriNet demonstrates impressive
+performance when compared to existing state-of-the-art approaches in face
+generation. We further propose a research- and iteration-friendly dataset,
+RetriBooru, to study a more difficult problem, concept composition. Finally, to
+better evaluate the balance between similarity and diversity, and to measure
+aspects of diversity that have previously been unaccounted for, we introduce a
+novel class of metrics named Similarity Weighted Diversity (SWD).
+
+
+
+
+
+
+ + ♻ ☆ Application of attention-based Siamese composite neural network in + medical image recognition + + +
+ Medical image recognition often faces the problem of insufficient data in
+practical applications. Image recognition and processing under few-shot
+conditions tend to suffer from overfitting, low recognition accuracy, low
+reliability, and insufficient robustness. Moreover, the differences between
+characteristics are often subtle, and recognition is affected by perspective,
+background, occlusion, and other factors, which further increases its
+difficulty. Furthermore, in fine-grained images, the few-shot problem leads to
+insufficient useful feature information in the images. Considering the
+characteristics of few-shot and fine-grained image recognition, this study
+establishes a recognition model based on attention and a Siamese neural
+network. To address the problem of few-shot samples, a Siamese neural network
+suitable for classification is proposed. An attention-based neural network is
+used as the main network to improve the classification performance. COVID-19
+lung samples were selected for testing the model. The results show that the
+fewer the image samples, the more pronounced the advantage over an ordinary
+neural network.
+
+
+
+
+
+
+ + ♻ ☆ Occluded Cloth-Changing Person Re-Identification + + +
+ Cloth-changing person re-identification aims to retrieve and identify
+specific pedestrians by using cloth-unrelated features in person cloth-changing
+scenarios. However, pedestrian images captured by surveillance probes usually
+contain occlusions in real-world scenarios. The performance of existing
+cloth-changing person re-identification methods is significantly degraded due
+to the reduction of discriminative cloth-unrelated features caused by
+occlusion. We define cloth-changing person re-identification in occlusion
+scenarios as occluded cloth-changing person re-identification (Occ-CC-ReID),
+and to the best of our knowledge, we are the first to propose occluded
+cloth-changing person re-identification as a new task. We constructed two
+occluded cloth-changing person re-identification datasets: Occluded-PRCC and
+Occluded-LTCC. The datasets can be obtained from the following link:
+https://github.com/1024AILab/Occluded-Cloth-Changing-Person-Re-Identification.
+
+
+
+
+
+
+ + ♻ ☆ A Dual-domain Regularization Method for Ring Artifact Removal of X-ray + CT + + +
+ Ring artifacts in computed tomography images, arising from the undesirable
+responses of detector units, significantly degrade image quality and diagnostic
+reliability. To address this challenge, we propose a dual-domain regularization
+model to effectively remove ring artifacts while maintaining the integrity of
+the original CT image. The proposed model corrects the vertical stripe
+artifacts on the sinogram by innovatively updating the response inconsistency
+compensation coefficients of the detector units, which is achieved by employing
+a group sparse constraint and a projection-view direction sparse constraint on
+the stripe artifacts. Simultaneously, we apply a sparse constraint on the
+reconstructed image to further rectify ring artifacts in the image domain. The
+key advantage of the proposed method lies in considering the relationship
+between the response inconsistency compensation coefficients of the detector
+units and the projection views, which enables a more accurate correction of the
+detector unit responses. An alternating minimization method is designed to
+solve the model. Comparative experiments on real photon-counting detector data
+demonstrate that the proposed method not only surpasses existing methods in
+removing ring artifacts but also excels in preserving structural details and
+image fidelity.
+
+
+
+
+
+
+ + ♻ ☆ Facial Kinship Verification from remote photoplethysmography + + +
+ Facial Kinship Verification (FKV) aims at automatically determining whether
+two subjects have a kinship relation based on human faces. It has potential
+applications in finding missing children and in social media analysis.
+Traditional FKV faces challenges as it is vulnerable to spoofing attacks and
+raises privacy issues. In this paper, we explore for the first time FKV with
+vital bio-signals, focusing on remote Photoplethysmography (rPPG). rPPG signals
+are extracted from facial videos, resulting in a one-dimensional signal that
+measures heartbeat-induced changes in the visible light reflected from the
+skin. Specifically, we employ a straightforward one-dimensional Convolutional
+Neural Network (1DCNN) with a 1DCNN-Attention module and a kinship contrastive
+loss to learn kin similarity from rPPG signals. The network takes multiple rPPG
+signals extracted from various facial Regions of Interest (ROIs) as inputs.
+Additionally, the 1DCNN-Attention module is designed to learn and capture
+discriminative kin features from the feature embeddings. Finally, we
+demonstrate the feasibility of using rPPG to detect kinship through
+experimental evaluation on the UvANEMO Smile Database across different kin
+relations.
+
+
+
+
+
+
+ + ♻ ☆ Aligning Step-by-Step Instructional Diagrams to Video Demonstrations + + +
+ Multimodal alignment facilitates the retrieval of instances from one modality +when queried using another. In this paper, we consider a novel setting where +such an alignment is between (i) instruction steps that are depicted as +assembly diagrams (commonly seen in Ikea assembly manuals) and (ii) video +segments from in-the-wild videos; these videos comprising an enactment of the +assembly actions in the real world. To learn this alignment, we introduce a +novel supervised contrastive learning method that learns to align videos with +the subtle details in the assembly diagrams, guided by a set of novel losses. +To study this problem and demonstrate the effectiveness of our method, we +introduce a novel dataset: IAW for Ikea assembly in the wild consisting of 183 +hours of videos from diverse furniture assembly collections and nearly 8,300 +illustrations from their associated instruction manuals and annotated for their +ground truth alignments. We define two tasks on this dataset: First, nearest +neighbor retrieval between video segments and illustrations, and, second, +alignment of instruction steps and the segments for each video. Extensive +experiments on IAW demonstrate superior performances of our approach against +alternatives. + +
+
+ comment: Project website: + https://academic.davidz.cn/en/publication/zhang-cvpr-2023/ +
+
+
+
+
+ + ♻ ☆ CardioCaps: Attention-based Capsule Network for Class-Imbalanced + Echocardiogram Classification + + +
+ Capsule Neural Networks (CapsNets) are a novel architecture that utilizes
+vector-wise representations formed by multiple neurons. Specifically, Dynamic
+Routing CapsNets (DR-CapsNets) employ an affine matrix and a dynamic routing
+mechanism to train capsules and acquire translation-equivariance properties,
+enhancing their robustness compared to traditional Convolutional Neural
+Networks (CNNs). Echocardiograms, which capture moving images of the heart,
+present unique challenges for traditional image classification methods. In this
+paper, we explore the potential of DR-CapsNets and propose CardioCaps, a novel
+attention-based DR-CapsNet architecture for class-imbalanced echocardiogram
+classification. CardioCaps comprises two key components: a weighted margin loss
+incorporating a regression auxiliary loss, and an attention mechanism. First,
+the weighted margin loss prioritizes positive cases, supplemented by an
+auxiliary loss function based on the Ejection Fraction (EF) regression task, a
+crucial measure of cardiac function. This approach enhances the model's
+resilience in the face of class imbalance. Second, recognizing that the
+quadratic complexity of dynamic routing leads to training inefficiencies, we
+adopt the attention mechanism as a more computationally efficient alternative.
+Our results demonstrate that CardioCaps surpasses traditional machine learning
+baselines, including Logistic Regression, Random Forest, and XGBoost with
+sampling methods and a class weight matrix. Furthermore, CardioCaps outperforms
+other deep learning baselines such as CNNs, ResNets, U-Nets, and ViTs, as well
+as advanced CapsNet methods such as EM-CapsNets and Efficient-CapsNets.
+Notably, our model demonstrates robustness to class imbalance, achieving high
+precision even in datasets with a substantial proportion of negative cases.
+
+
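Here is a sketch of the kind of objective described above: the standard CapsNet margin loss with a heavier weight on positive cases, plus an MSE auxiliary term on ejection fraction. The margin values, weights, and lambda coefficients are illustrative assumptions, not CardioCaps' published hyperparameters.

```python
import torch

def weighted_margin_loss(caps_lengths, targets, pos_weight=2.0,
                         m_pos=0.9, m_neg=0.1, lam=0.5):
    """CapsNet-style margin loss with a heavier weight on positive cases.

    caps_lengths : (B, num_classes) capsule vector norms in [0, 1]
    targets      : (B, num_classes) one-hot labels
    """
    pos = targets * torch.clamp(m_pos - caps_lengths, min=0.0) ** 2
    neg = (1.0 - targets) * torch.clamp(caps_lengths - m_neg, min=0.0) ** 2
    return (pos_weight * pos + lam * neg).sum(dim=-1).mean()

def cardio_style_loss(caps_lengths, targets, ef_pred, ef_true, aux_weight=0.1):
    """Classification margin loss + auxiliary ejection-fraction regression."""
    cls = weighted_margin_loss(caps_lengths, targets)
    aux = torch.nn.functional.mse_loss(ef_pred, ef_true)
    return cls + aux_weight * aux
```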
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ HIG: Hierarchical Interlacement Graph Approach to Scene Graph Generation + in Video Understanding CVPR 2024 + + +
+ Visual interactivity understanding within visual scenes presents a +significant challenge in computer vision. Existing methods focus on complex +interactivities while leveraging a simple relationship model. These methods, +however, struggle with a diversity of appearance, situation, position, +interaction, and relation in videos. This limitation hinders the ability to +fully comprehend the interplay within the complex visual dynamics of subjects. +In this paper, we delve into interactivities understanding within visual +content by deriving scene graph representations from dense interactivities +among humans and objects. To achieve this goal, we first present a new dataset +containing Appearance-Situation-Position-Interaction-Relation predicates, named +ASPIRe, offering an extensive collection of videos marked by a wide range of +interactivities. Then, we propose a new approach named Hierarchical +Interlacement Graph (HIG), which leverages a unified layer and graph within a +hierarchical structure to provide deep insights into scene changes across five +distinct tasks. Our approach demonstrates superior performance to other methods +through extensive experiments conducted in various scenarios. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Open-world Machine Learning: A Review and New Outlooks + + +
+ Machine learning has achieved remarkable success in many applications. +However, existing studies are largely based on the closed-world assumption, +which assumes that the environment is stationary, and the model is fixed once +deployed. In many real-world applications, this fundamental and rather naive +assumption may not hold because an open environment is complex, dynamic, and +full of unknowns. In such cases, rejecting unknowns, discovering novelties, and +then incrementally learning them, could enable models to be safe and evolve +continually as biological systems do. This paper provides a holistic view of +open-world machine learning by investigating unknown rejection, novel class +discovery, and class-incremental learning in a unified paradigm. The +challenges, principles, and limitations of current methodologies are discussed +in detail. Finally, we discuss several potential directions for future +research. This paper aims to provide a comprehensive introduction to the +emerging open-world machine learning paradigm, to help researchers build more +powerful AI systems in their respective fields, and to promote the development +of artificial general intelligence. + +
+
+
+
+
+ + ♻ ☆ HyperHuman: Hyper-Realistic Human Generation with Latent Structural + Diffusion ICLR 2024 + + +
+ Despite significant advances in large-scale text-to-image models, achieving +hyper-realistic human image generation remains a desirable yet unsolved task. +Existing models like Stable Diffusion and DALL-E 2 tend to generate human +images with incoherent parts or unnatural poses. To tackle these challenges, +our key insight is that human image is inherently structural over multiple +granularities, from the coarse-level body skeleton to fine-grained spatial +geometry. Therefore, capturing such correlations between the explicit +appearance and latent structure in one model is essential to generate coherent +and natural human images. To this end, we propose a unified framework, +HyperHuman, that generates in-the-wild human images of high realism and diverse +layouts. Specifically, 1) we first build a large-scale human-centric dataset, +named HumanVerse, which consists of 340M images with comprehensive annotations +like human pose, depth, and surface normal. 2) Next, we propose a Latent +Structural Diffusion Model that simultaneously denoises the depth and surface +normal along with the synthesized RGB image. Our model enforces the joint +learning of image appearance, spatial relationship, and geometry in a unified +network, where each branch in the model complements to each other with both +structural awareness and textural richness. 3) Finally, to further boost the +visual quality, we propose a Structure-Guided Refiner to compose the predicted +conditions for more detailed generation of higher resolution. Extensive +experiments demonstrate that our framework yields the state-of-the-art +performance, generating hyper-realistic human images under diverse scenarios. +Project Page: https://snap-research.github.io/HyperHuman/ + +
+
+ comment: Accepted by ICLR 2024, camera-ready version. Project Page: + https://snap-research.github.io/HyperHuman/ +
+
+
+
+
+ + ♻ ☆ Rethinking Autoencoders for Medical Anomaly Detection from A Theoretical + Perspective + + +
+ Medical anomaly detection aims to identify abnormal findings using only +normal training data, playing a crucial role in health screening and +recognizing rare diseases. Reconstruction-based methods, particularly those +utilizing autoencoders (AEs), are dominant in this field. They work under the +assumption that AEs trained on only normal data cannot reconstruct unseen +abnormal regions well, thereby enabling the anomaly detection based on +reconstruction errors. However, this assumption does not always hold due to the +mismatch between the reconstruction training objective and the anomaly +detection task objective, rendering these methods theoretically unsound. This +study focuses on providing a theoretical foundation for AE-based reconstruction +methods in anomaly detection. By leveraging information theory, we elucidate +the principles of these methods and reveal that the key to improving AE in +anomaly detection lies in minimizing the information entropy of latent vectors. +Experiments on four datasets with two image modalities validate the +effectiveness of our theory. To the best of our knowledge, this is the first +effort to theoretically clarify the principles and design philosophy of AE for +anomaly detection. Code will be available upon acceptance. + +
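The reconstruction-error scoring that the abstract above examines is simple to state in code. The sketch below is a generic illustration, not the paper's method: an autoencoder trained on normal data only scores test images by per-pixel reconstruction error, which is exactly the assumption whose theoretical footing the paper analyzes.

```python
import torch
import torch.nn as nn

class TinyAE(nn.Module):
    """A deliberately small convolutional autoencoder for illustration."""
    def __init__(self, ch=1, latent=16):
        super().__init__()
        self.enc = nn.Sequential(
            nn.Conv2d(ch, 16, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(16, latent, 3, stride=2, padding=1), nn.ReLU())
        self.dec = nn.Sequential(
            nn.ConvTranspose2d(latent, 16, 4, stride=2, padding=1), nn.ReLU(),
            nn.ConvTranspose2d(16, ch, 4, stride=2, padding=1), nn.Sigmoid())

    def forward(self, x):
        return self.dec(self.enc(x))

@torch.no_grad()
def anomaly_score(model: TinyAE, x: torch.Tensor):
    """Image-level score = mean squared reconstruction error;
    the per-pixel error map localizes the suspected abnormal region."""
    recon = model(x)
    err_map = (x - recon) ** 2
    return err_map.mean(dim=(1, 2, 3)), err_map
```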
+
+
+
+
+ + ♻ ☆ NTIRE 2023 Image Shadow Removal Challenge Technical Report: Team IIM_TTI + + +
+ In this paper, we analyze and discuss ShadowFormer in preparation for the +NTIRE2023 Shadow Removal Challenge [1], implementing five key improvements: +image alignment, the introduction of a perceptual quality loss function, the +semi-automatic annotation for shadow detection, joint learning of shadow +detection and removal, and the introduction of new data augmentation technique +"CutShadow" for shadow removal. Our method achieved scores of 0.196 (3rd out of +19) in LPIPS and 7.44 (4th out of 19) in the Mean Opinion Score (MOS). + +
+
+ comment: This version is a brief technical report submitted to the organizers, + and there are still some points to be added; please wait for updates until + May 2024. The code can be found here + (https://github.com/Yuki-11/NTIRE2023_ShadowRemoval_IIM_TTI) +
+
+
+
+
+ + ♻ ☆ Robust COVID-19 Detection in CT Images with CLIP + + +
+ In the realm of medical imaging, particularly for COVID-19 detection, deep
+learning models face substantial challenges, such as the necessity for
+extensive computational resources, the paucity of well-annotated datasets, and
+a significant amount of unlabeled data. In this work, we introduce the first
+lightweight detector designed to overcome these obstacles, leveraging a frozen
+CLIP image encoder and a trainable multilayer perceptron (MLP). Enhanced with
+Conditional Value at Risk (CVaR) for robustness and a loss-landscape flattening
+strategy for improved generalization, our model is tailored for high efficacy
+in COVID-19 detection. Furthermore, we integrate a teacher-student framework to
+capitalize on the vast amounts of unlabeled data, enabling our model to achieve
+superior performance despite the inherent data limitations. Experimental
+results on the COV19-CT-DB dataset demonstrate the effectiveness of our
+approach, surpassing the baseline by up to 10.6% in `macro' F1 score in
+supervised learning. The code is available at
+https://github.com/Purdue-M2/COVID-19_Detection_M2_PURDUE.
+
+
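A hedged sketch of two ingredients named above: a frozen image encoder feeding a small trainable MLP, and a CVaR-style objective that averages only the worst alpha-fraction of per-sample losses in a batch. The encoder is an abstract callable, alpha is an assumed value, and this is not the authors' released COVID-19 detector.

```python
import torch
import torch.nn as nn

class FrozenEncoderClassifier(nn.Module):
    """Trainable MLP head on top of a frozen feature extractor
    (e.g., a CLIP image tower); only the head receives gradients."""
    def __init__(self, encoder: nn.Module, feat_dim: int, num_classes: int = 2):
        super().__init__()
        self.encoder = encoder.eval()
        for p in self.encoder.parameters():
            p.requires_grad_(False)
        self.head = nn.Sequential(nn.Linear(feat_dim, 256), nn.ReLU(),
                                  nn.Linear(256, num_classes))

    def forward(self, x):
        with torch.no_grad():
            feats = self.encoder(x)
        return self.head(feats)

def cvar_loss(logits, labels, alpha: float = 0.3):
    """Average the worst alpha-fraction of per-sample cross-entropy losses,
    a Conditional-Value-at-Risk surrogate that emphasizes hard examples."""
    per_sample = nn.functional.cross_entropy(logits, labels, reduction="none")
    k = max(1, int(alpha * per_sample.numel()))
    worst = per_sample.topk(k).values
    return worst.mean()
```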
+
+
+
+
+ + ♻ ☆ Generating Images with 3D Annotations Using Diffusion Models ICLR 2024 + + +
+ Diffusion models have emerged as a powerful generative method, capable of +producing stunning photo-realistic images from natural language descriptions. +However, these models lack explicit control over the 3D structure in the +generated images. Consequently, this hinders our ability to obtain detailed 3D +annotations for the generated images or to craft instances with specific poses +and distances. In this paper, we propose 3D Diffusion Style Transfer (3D-DST), +which incorporates 3D geometry control into diffusion models. Our method +exploits ControlNet, which extends diffusion models by using visual prompts in +addition to text prompts. We generate images of the 3D objects taken from 3D +shape repositories~(e.g., ShapeNet and Objaverse), render them from a variety +of poses and viewing directions, compute the edge maps of the rendered images, +and use these edge maps as visual prompts to generate realistic images. With +explicit 3D geometry control, we can easily change the 3D structures of the +objects in the generated images and obtain ground-truth 3D annotations +automatically. This allows us to improve a wide range of vision tasks, e.g., +classification and 3D pose estimation, in both in-distribution (ID) and +out-of-distribution (OOD) settings. We demonstrate the effectiveness of our +method through extensive experiments on ImageNet-100/200, ImageNet-R, +PASCAL3D+, ObjectNet3D, and OOD-CV. The results show that our method +significantly outperforms existing methods, e.g., 3.8 percentage points on +ImageNet-100 using DeiT-B. + +
+
+ comment: ICLR 2024 Spotlight. Code: https://ccvl.jhu.edu/3D-DST/ +
+
+
+
+
+ + ♻ ☆ Automated ensemble method for pediatric brain tumor segmentation MICCAI + + +
+ Brain tumors remain a critical global health challenge, necessitating +advancements in diagnostic techniques and treatment methodologies. A tumor or +its recurrence often needs to be identified in imaging studies and +differentiated from normal brain tissue. In response to the growing need for +age-specific segmentation models, particularly for pediatric patients, this +study explores the deployment of deep learning techniques using magnetic +resonance imaging (MRI) modalities. By introducing a novel ensemble approach +using ONet and modified versions of UNet, coupled with innovative loss +functions, this study achieves a precise segmentation model for the BraTS-PEDs +2023 Challenge. Data augmentation, including both single and composite +transformations, ensures model robustness and accuracy across different +scanning protocols. The ensemble strategy, integrating the ONet and UNet +models, shows greater effectiveness in capturing specific features and modeling +diverse aspects of the MRI images which result in lesion wise Dice scores of +0.52, 0.72 and 0.78 on unseen validation data and scores of 0.55, 0.70, 0.79 on +final testing data for the "enhancing tumor", "tumor core" and "whole tumor" +labels respectively. Visual comparisons further confirm the superiority of the +ensemble method in accurate tumor region coverage. The results indicate that +this advanced ensemble approach, building upon the unique strengths of +individual models, offers promising prospects for enhanced diagnostic accuracy +and effective treatment planning and monitoring for brain tumors in pediatric +brains. + +
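Since the numbers reported above are lesion-wise Dice scores, a short reminder of how a soft Dice score and loss are computed per label may help readers interpret them. This is a generic formulation, not the challenge's exact evaluation script.

```python
import torch

def soft_dice(pred: torch.Tensor, target: torch.Tensor, eps: float = 1e-6):
    """Soft Dice score per label.

    pred, target : (B, L, ...) predicted probabilities and binary masks for L labels
    returns      : (L,) Dice score aggregated over the batch and spatial dims
    """
    dims = (0,) + tuple(range(2, pred.ndim))          # sum over batch + spatial
    inter = (pred * target).sum(dim=dims)
    denom = pred.sum(dim=dims) + target.sum(dim=dims)
    return (2.0 * inter + eps) / (denom + eps)

def dice_loss(pred, target):
    """Loss used to train segmentation models: 1 - mean Dice over labels."""
    return 1.0 - soft_dice(pred, target).mean()
```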
+
+ comment: Accepted at MICCAI BrainLes Workshop 2023 +
+
+
+
+
+ + ♻ ☆ Exact Consistency Tests for Gaussian Mixture Filters using Normalized + Deviation Squared Statistics + + +
+ We consider the problem of evaluating dynamic consistency in discrete time +probabilistic filters that approximate stochastic system state densities with +Gaussian mixtures. Dynamic consistency means that the estimated probability +distributions correctly describe the actual uncertainties. As such, the problem +of consistency testing naturally arises in applications with regards to +estimator tuning and validation. However, due to the general complexity of the +density functions involved, straightforward approaches for consistency testing +of mixture-based estimators have remained challenging to define and implement. +This paper derives a new exact result for Gaussian mixture consistency testing +within the framework of normalized deviation squared (NDS) statistics. It is +shown that NDS test statistics for generic multivariate Gaussian mixture models +exactly follow mixtures of generalized chi-square distributions, for which +efficient computational tools are available. The accuracy and utility of the +resulting consistency tests are numerically demonstrated on static and dynamic +mixture estimation examples. + +
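For the special case of a single Gaussian estimate, the normalized deviation squared statistic reduces to the familiar NEES-style quadratic form, and consistency can be checked against a chi-square distribution; the paper's mixture result generalizes this to mixtures of generalized chi-square distributions. The snippet below illustrates only the single-Gaussian check, with an assumed covariance.

```python
import numpy as np
from scipy.stats import chi2

def nds_statistic(x_true, x_est, P_est):
    """Normalized deviation squared for one Gaussian estimate:
    (x - x_hat)^T P^{-1} (x - x_hat)."""
    e = np.asarray(x_true) - np.asarray(x_est)
    return float(e @ np.linalg.solve(P_est, e))

# Monte Carlo check: for a consistent estimator the NDS values should follow
# a chi-square distribution with dim(x) degrees of freedom.
rng = np.random.default_rng(0)
dim, n = 2, 5000
P = np.array([[2.0, 0.3], [0.3, 1.0]])
L = np.linalg.cholesky(P)
errors = rng.standard_normal((n, dim)) @ L.T          # errors ~ N(0, P)
nds = np.array([nds_statistic(e, np.zeros(dim), P) for e in errors])
print("empirical mean NDS:", nds.mean(), " expected:", dim)
print("95th percentile   :", np.quantile(nds, 0.95),
      " chi2 bound:", chi2.ppf(0.95, df=dim))
```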
+
+ comment: 8 pages, 4 figures; final manuscript to be published 2024 American + Control Conference (ACC 2024), corrected small typos and updated Fig. 1 for + clarity +
+
+
+
+
+ + ♻ ☆ Softmax-free Linear Transformers + + +
+ Vision transformers (ViTs) have pushed the state-of-the-art for visual
+perception tasks. The self-attention mechanism underpinning the strength of
+ViTs has quadratic complexity in both computation and memory usage. This
+motivates the development of approximations of self-attention with linear
+complexity. However, an in-depth analysis in this work reveals that existing
+methods are either theoretically flawed or empirically ineffective for visual
+recognition. We identify that their limitations are rooted in retaining
+softmax-based self-attention during the approximation, that is, normalizing the
+scaled dot-product between token feature vectors using the softmax function,
+since preserving the softmax operation challenges any subsequent linearization
+effort. Based on this insight, a family of Softmax-Free Transformers (SOFT) is
+proposed. Specifically, a Gaussian kernel function is adopted to replace the
+dot-product similarity, enabling a full self-attention matrix to be
+approximated under a low-rank matrix decomposition. For computational
+robustness, we estimate the Moore-Penrose inverse using an iterative
+Newton-Raphson method in the forward process only, while calculating its
+theoretical gradients only once in the backward process. To further expand
+applicability (e.g., to dense prediction tasks), an efficient symmetric
+normalization technique is introduced. Extensive experiments on ImageNet, COCO,
+and ADE20K show that our SOFT significantly improves the computational
+efficiency of existing ViT variants. With linear complexity, much longer token
+sequences are permitted by SOFT, resulting in a superior trade-off between
+accuracy and complexity. Code and models are available at
+https://github.com/fudan-zvg/SOFT.
+
+
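A rough sketch of the two ideas highlighted above, in simplified form: replace the softmax of scaled dot-products with a Gaussian kernel on token distances, and approximate the full attention matrix with a Nystrom-style low-rank factorization through a few landmark tokens. The random landmark selection, the use of an exact pseudo-inverse instead of the paper's Newton-Raphson iteration, and all sizes are simplifying assumptions, not the SOFT implementation.

```python
import torch

def gaussian_kernel(a, b, gamma=0.5):
    """exp(-gamma * ||a_i - b_j||^2) for all pairs (no softmax anywhere)."""
    return torch.exp(-gamma * torch.cdist(a, b) ** 2)

def softmax_free_attention(x, num_landmarks=16, gamma=0.5):
    """Low-rank Gaussian-kernel attention, O(n * m) instead of O(n^2).

    x : (n, d) token features; queries, keys, and values share x for brevity.
    """
    n, d = x.shape
    landmarks = x[torch.randperm(n)[:num_landmarks]]         # (m, d)
    K_nm = gaussian_kernel(x, landmarks, gamma)               # (n, m)
    K_mm = gaussian_kernel(landmarks, landmarks, gamma)       # (m, m)
    # Nystrom approximation of the full n x n kernel: K ~= K_nm K_mm^+ K_nm^T
    attn_lowrank = K_nm @ torch.linalg.pinv(K_mm)             # (n, m)
    # Apply to the values without ever forming the n x n matrix.
    return attn_lowrank @ (K_nm.T @ x)

tokens = torch.randn(1024, 64)
out = softmax_free_attention(tokens)
print(out.shape)   # torch.Size([1024, 64])
```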
+
+ comment: Accepted by IJCV. arXiv admin note: substantial text overlap with + arXiv:2110.11945 +
+
+
+
+
+ + ♻ ☆ RGM: A Robust Generalizable Matching Model + + +
+ Finding corresponding pixels within a pair of images is a fundamental +computer vision task with various applications. Due to the specific +requirements of different tasks like optical flow estimation and local feature +matching, previous works are primarily categorized into dense matching and +sparse feature matching focusing on specialized architectures along with +task-specific datasets, which may somewhat hinder the generalization +performance of specialized models. In this paper, we propose a deep model for +sparse and dense matching, termed RGM (Robust Generalist Matching). In +particular, we elaborately design a cascaded GRU module for refinement by +exploring the geometric similarity iteratively at multiple scales following an +additional uncertainty estimation module for sparsification. To narrow the gap +between synthetic training samples and real-world scenarios, we build a new, +large-scale dataset with sparse correspondence ground truth by generating +optical flow supervision with greater intervals. As such, we are able to mix up +various dense and sparse matching datasets, significantly improving the +training diversity. The generalization capacity of our proposed RGM is greatly +improved by learning the matching and uncertainty estimation in a two-stage +manner on the large, mixed data. Superior performance is achieved for zero-shot +matching and downstream geometry estimation across multiple datasets, +outperforming the previous methods by a large margin. + +
+
+ comment: Code is available at: https://github.com/aim-uofa/RGM +
+
+
+
+
+ + ♻ ☆ UNK-VQA: A Dataset and A Probe into Multi-modal Large Models' Abstention + Ability + + +
+ Teaching Visual Question Answering (VQA) models to refrain from answering
+unanswerable questions is necessary for building a trustworthy AI system.
+Existing studies, though they have explored various aspects of VQA, have
+somewhat ignored this particular attribute. This paper aims to bridge the
+research gap by contributing a comprehensive dataset, called UNK-VQA. The
+dataset is specifically designed to address the challenge of questions that
+models do not know. To this end, we first augment the existing data via
+deliberate perturbations on either the image or the question. Specifically, we
+carefully ensure that the question-image semantics remain close to the original
+unperturbed distribution. By this means, the identification of unanswerable
+questions becomes challenging, setting our dataset apart from others that
+involve mere image replacement. We then extensively evaluate the zero- and
+few-shot performance of several emerging multi-modal large models and discover
+their significant limitations when applied to our dataset. Additionally, we
+propose a straightforward method to tackle these unanswerable questions. This
+dataset, we believe, will serve as a valuable benchmark for enhancing the
+abstention capability of VQA models, thereby leading to increased
+trustworthiness of AI systems. We have made the dataset
+(https://github.com/guoyang9/UNK-VQA) available to facilitate further
+exploration in this area.
+
+
+
+
+
+
+
+
+ + + +
+
+ +
+
+
+
diff --git a/index.js b/index.js
new file mode 100644
index 0000000..69f5da7
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse all <details> sections with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+  if (e.keyCode === 9) {  // 9 = Tab (keyCode is legacy but widely supported)
+    expanded = !expanded;
+    document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+    return false;  // suppress the default focus change
+  }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+  if (e.target.checked) {
+    document.documentElement.setAttribute('data-theme', 'light');
+    document.getElementById("theme-icon").className = "ri-sun-line";
+    localStorage.setItem('theme', 'light');  // persist the chosen theme
+  } else {
+    document.documentElement.setAttribute('data-theme', 'dark');
+    document.getElementById("theme-icon").className = "ri-moon-line";
+    localStorage.setItem('theme', 'dark');  // persist the chosen theme
+  }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+  document.documentElement.setAttribute('data-theme', currentTheme);
+  if (currentTheme === 'light') {
+    toggleSwitch.checked = true;
+  }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`